diff --git a/Makefile.am b/Makefile.am
index 44ec7a852e..60746dbf06 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -26,8 +26,12 @@ AM_LIBAPU_LDFLAGS := $(shell ./libs/apr-util/apu-1-config --ldflags)
 AM_LIBAPU_LIBS := $(subst $(switch_builddir)/,,$(shell ./libs/apr-util/apu-1-config \--libs))
 endif
 
+
 AM_CFLAGS   = $(SWITCH_AM_CFLAGS) $(SWITCH_ANSI_CFLAGS) 
-AM_CPPFLAGS = $(SWITCH_AM_CXXFLAGS) -I$(switch_srcdir)/libs/sofia-sip/libsofia-sip-ua/sdp -I$(switch_srcdir)/libs/sofia-sip/libsofia-sip-ua/su -I$(switch_builddir)/libs/sofia-sip/libsofia-sip-ua/su
+AM_CPPFLAGS =
+AM_CPPFLAGS += -I$(switch_srcdir)/libs/libvpx
+AM_CPPFLAGS += $(SWITCH_AM_CXXFLAGS) -I$(switch_srcdir)/libs/sofia-sip/libsofia-sip-ua/sdp
+AM_CPPFLAGS += -I$(switch_srcdir)/libs/sofia-sip/libsofia-sip-ua/su -I$(switch_builddir)/libs/sofia-sip/libsofia-sip-ua/su
 AM_LDFLAGS  = $(SWITCH_AM_LDFLAGS) $(AM_LIBAPR_LDFLAGS) $(AM_LIBAPU_LDFLAGS)
 
 DEFAULT_SOUNDS=en-us-callie-8000
@@ -115,12 +119,23 @@ CORE_CFLAGS  = $(AM_LIBAPR_CFLAGS) $(AM_LIBAPR_CPPFLAGS)
 CORE_CFLAGS += $(AM_LIBAPU_CPPFLAGS)
 CORE_CFLAGS += -I$(switch_srcdir)/libs/libtpl-1.5/src
 CORE_CFLAGS += -I$(switch_srcdir)/libs/srtp/include
+if ENABLE_LIBYUV
+CORE_CFLAGS += -I$(switch_srcdir)/libs/libyuv/include
+CORE_CFLAGS += -DSWITCH_HAVE_YUV
+endif
 CORE_CFLAGS += -I$(switch_srcdir)/libs/srtp/crypto/include -Ilibs/srtp/crypto/include
 CORE_CFLAGS += -I$(switch_builddir)/libs/spandsp/src -I$(switch_srcdir)/libs/spandsp/src 
 CORE_CFLAGS += -I$(switch_builddir)/libs/tiff-4.0.2/libtiff -I$(switch_srcdir)/libs/tiff-4.0.2/libtiff
+if ENABLE_LIBVPX
+CORE_CFLAGS += -DSWITCH_HAVE_VPX
+endif
 
 APR_LIBS   = $(AM_LIBAPU_LIBS) $(AM_LIBAPR_LIBS)
 CORE_LIBS=
+
+if ENABLE_LIBVPX
+CORE_LIBS += libs/libvpx/libvpx.a
+endif
 if SYSTEM_APRUTIL
 CORE_LIBS += $(AM_LIBAPU_LINKLIBTOOL)
 else
@@ -149,14 +164,6 @@ if HAVE_PNG
 CORE_CFLAGS += -DSWITCH_HAVE_PNG $(LIBPNG_CFLAGS)
 endif
 
-if HAVE_VPX
-CORE_CFLAGS += -DSWITCH_HAVE_VPX $(VPX_CFLAGS)
-endif
-
-if HAVE_YUV
-CORE_CFLAGS += -DSWITCH_HAVE_YUV $(YUV_CFLAGS)
-endif
-
 if HAVE_FREETYPE
 CORE_CFLAGS += -DSWITCH_HAVE_FREETYPE $(LIBFREETYPE_CFLAGS)
 endif
@@ -165,13 +172,52 @@ endif
 ## libfreeswitch
 ##
 noinst_LTLIBRARIES        = libfreeswitch_spandsp.la
+if ENABLE_LIBYUV
+noinst_LTLIBRARIES += libfreeswitch_libyuv.la
+endif
 libfreeswitch_spandsp_la_SOURCES = libs/spandsp/src/plc.c libs/spandsp/src/alloc.c libs/spandsp/src/bit_operations.c
 libfreeswitch_spandsp_la_CFLAGS  = -Ilibs/spandsp/src $(CORE_CFLAGS) $(AM_CFLAGS)
 CORE_LIBS+=libfreeswitch_spandsp.la
+
+if ENABLE_LIBYUV
+libfreeswitch_libyuv_la_SOURCES = \
+    libs/libyuv/source/compare.cc           \
+    libs/libyuv/source/compare_common.cc    \
+    libs/libyuv/source/compare_gcc.cc       \
+    libs/libyuv/source/convert.cc           \
+    libs/libyuv/source/convert_argb.cc      \
+    libs/libyuv/source/convert_from.cc      \
+    libs/libyuv/source/convert_from_argb.cc \
+    libs/libyuv/source/convert_to_argb.cc   \
+    libs/libyuv/source/convert_to_i420.cc   \
+    libs/libyuv/source/cpu_id.cc            \
+    libs/libyuv/source/planar_functions.cc  \
+    libs/libyuv/source/rotate.cc            \
+    libs/libyuv/source/rotate_any.cc        \
+    libs/libyuv/source/rotate_argb.cc       \
+    libs/libyuv/source/rotate_common.cc     \
+    libs/libyuv/source/rotate_gcc.cc        \
+    libs/libyuv/source/rotate_mips.cc       \
+    libs/libyuv/source/row_any.cc           \
+    libs/libyuv/source/row_common.cc        \
+    libs/libyuv/source/row_mips.cc          \
+    libs/libyuv/source/row_gcc.cc           \
+    libs/libyuv/source/scale.cc             \
+    libs/libyuv/source/scale_any.cc         \
+    libs/libyuv/source/scale_argb.cc        \
+    libs/libyuv/source/scale_common.cc      \
+    libs/libyuv/source/scale_gcc.cc         \
+    libs/libyuv/source/scale_mips.cc        \
+    libs/libyuv/source/video_common.cc
+
+libfreeswitch_libyuv_la_CPPFLAGS = -O2 -fomit-frame-pointer -I$(switch_srcdir)/libs/libyuv/include
+CORE_LIBS+=libfreeswitch_libyuv.la
+endif
+
 lib_LTLIBRARIES	         = libfreeswitch.la
-libfreeswitch_la_CFLAGS  = $(CORE_CFLAGS) $(SQLITE_CFLAGS) $(FREETYPE_CFLAGS) $(CURL_CFLAGS) $(PCRE_CFLAGS) $(SPEEX_CFLAGS) $(LIBEDIT_CFLAGS) $(openssl_CFLAGS) $(VPX_CFLAGS) $(AM_CFLAGS)
+libfreeswitch_la_CFLAGS  = $(CORE_CFLAGS) $(SQLITE_CFLAGS) $(FREETYPE_CFLAGS) $(CURL_CFLAGS) $(PCRE_CFLAGS) $(SPEEX_CFLAGS) $(LIBEDIT_CFLAGS) $(openssl_CFLAGS)  $(AM_CFLAGS)
 libfreeswitch_la_LDFLAGS = -version-info 1:0:0 $(AM_LDFLAGS) $(PLATFORM_CORE_LDFLAGS) -no-undefined
-libfreeswitch_la_LIBADD  = $(CORE_LIBS) $(APR_LIBS) $(SQLITE_LIBS) $(FREETYPE_LIBS) $(CURL_LIBS) $(PCRE_LIBS) $(SPEEX_LIBS) $(LIBEDIT_LIBS) $(openssl_LIBS) $(VPX_LIBS) $(PLATFORM_CORE_LIBS)
+libfreeswitch_la_LIBADD  = $(CORE_LIBS) $(APR_LIBS) $(SQLITE_LIBS) $(FREETYPE_LIBS) $(CURL_LIBS) $(PCRE_LIBS) $(SPEEX_LIBS) $(LIBEDIT_LIBS) $(openssl_LIBS) $(PLATFORM_CORE_LIBS)
 libfreeswitch_la_DEPENDENCIES = $(BUILT_SOURCES)
 
 if HAVE_PNG
@@ -315,6 +361,7 @@ libfreeswitch_la_SOURCES = \
 	src/switch_curl.c \
 	src/switch_hashtable.c\
 	src/switch_utf8.c \
+	src/switch_vpx.c \
 	libs/libtpl-1.5/src/tpl.c \
 	libs/libteletone/src/libteletone_detect.c \
 	libs/libteletone/src/libteletone_generate.c \
@@ -485,6 +532,12 @@ libs/libedit/src/.libs/libedit.a:
 libs/libzrtp/libzrtp.a:
 	cd libs/libzrtp && $(MAKE)
 
+libs/libvpx/Makefile:
+	cd libs/libvpx && sh ./configure --enable-pic --disable-docs --disable-examples --disable-install-bins --disable-install-srcs --disable-unit-tests --extra-cflags="-fvisibility=hidden"
+
+libs/libvpx/libvpx.a: libs/libvpx/Makefile
+	cd libs/libvpx && $(MAKE)
+
 libs/sofia-sip/Makefile:
 	cd libs/sofia-sip && sh ./configure.gnu $(MY_DEFAULT_ARGS)
 
diff --git a/build/modules.conf.in b/build/modules.conf.in
index f87353c193..9c1a6d4ac0 100644
--- a/build/modules.conf.in
+++ b/build/modules.conf.in
@@ -75,7 +75,6 @@ codecs/mod_opus
 #codecs/mod_silk
 #codecs/mod_siren
 #codecs/mod_theora
-codecs/mod_vpx
 dialplans/mod_dialplan_asterisk
 #dialplans/mod_dialplan_directory
 dialplans/mod_dialplan_xml
diff --git a/build/modules.conf.most b/build/modules.conf.most
index 408e9196a5..3becf53b28 100644
--- a/build/modules.conf.most
+++ b/build/modules.conf.most
@@ -74,7 +74,6 @@ codecs/mod_sangoma_codec
 codecs/mod_silk
 codecs/mod_siren
 codecs/mod_theora
-codecs/mod_vpx
 dialplans/mod_dialplan_asterisk
 dialplans/mod_dialplan_directory
 dialplans/mod_dialplan_xml
@@ -103,6 +102,7 @@ event_handlers/mod_event_multicast
 event_handlers/mod_event_socket
 event_handlers/mod_format_cdr
 event_handlers/mod_json_cdr
+event_handlers/mod_kazoo
 #event_handlers/mod_radius_cdr
 event_handlers/mod_odbc_cdr
 event_handlers/mod_rayo
diff --git a/conf/insideout/autoload_configs/modules.conf.xml b/conf/insideout/autoload_configs/modules.conf.xml
index 22f35c6cf3..3e2b09cdcb 100644
--- a/conf/insideout/autoload_configs/modules.conf.xml
+++ b/conf/insideout/autoload_configs/modules.conf.xml
@@ -63,7 +63,6 @@
     <load module="mod_amr"/>
     <!--<load module="mod_ilbc"/>-->
     <load module="mod_h26x"/>
-    <load module="mod_vpx"/>
     <!--<load module="mod_siren"/>-->
 
     <!-- File Format Interfaces -->
diff --git a/conf/rayo/autoload_configs/modules.conf.xml b/conf/rayo/autoload_configs/modules.conf.xml
index 3a2ace10ad..f6c58680bc 100644
--- a/conf/rayo/autoload_configs/modules.conf.xml
+++ b/conf/rayo/autoload_configs/modules.conf.xml
@@ -30,7 +30,6 @@
     <load module="mod_opus"/>
     <load module="mod_ilbc"/>
     <load module="mod_h26x"/>
-    <load module="mod_vpx"/>
 
     <!-- File Format Interfaces -->
     <load module="mod_sndfile"/>
diff --git a/conf/sbc/autoload_configs/modules.conf.xml b/conf/sbc/autoload_configs/modules.conf.xml
index 940c82b7a9..579b1d33be 100644
--- a/conf/sbc/autoload_configs/modules.conf.xml
+++ b/conf/sbc/autoload_configs/modules.conf.xml
@@ -42,7 +42,6 @@
     <load module="mod_amr"/>
     <!--<load module="mod_ilbc"/>-->
     <load module="mod_h26x"/>
-    <load module="mod_vpx"/>
     <!--<load module="mod_siren"/>-->
 
     <!-- Timers -->
diff --git a/conf/testing/autoload_configs/modules.conf.xml b/conf/testing/autoload_configs/modules.conf.xml
index f0e099661f..9849ce2d94 100644
--- a/conf/testing/autoload_configs/modules.conf.xml
+++ b/conf/testing/autoload_configs/modules.conf.xml
@@ -23,7 +23,6 @@
     <load module="mod_g723_1"/>
     <load module="mod_g729"/>
     <load module="mod_amr"/>
-    <load module="mod_vpx"/>
     <load module="mod_opus"/>
     <load module="mod_sndfile"/>
     <load module="mod_native_file"/>
diff --git a/conf/vanilla/autoload_configs/abstraction.conf.xml b/conf/vanilla/autoload_configs/abstraction.conf.xml
index d4b1dfd274..7244681374 100644
--- a/conf/vanilla/autoload_configs/abstraction.conf.xml
+++ b/conf/vanilla/autoload_configs/abstraction.conf.xml
@@ -1,5 +1,5 @@
 <configuration name="abstraction.conf" description="Abstraction">
 <apis>
-	<api name="user_name" description="Return Name for extension" syntax="<exten>" parse="(.*)" destination="user_data" argument="$1@default var effective_caller_id_name"/>
+	<api name="user_name" description="Return Name for extension" syntax="&lt;exten&gt;" parse="(.*)" destination="user_data" argument="$1@default var effective_caller_id_name"/>
 </apis>
 </configuration>
diff --git a/conf/vanilla/autoload_configs/amqp.conf.xml b/conf/vanilla/autoload_configs/amqp.conf.xml
index d665d1104b..d6c24f4ff7 100644
--- a/conf/vanilla/autoload_configs/amqp.conf.xml
+++ b/conf/vanilla/autoload_configs/amqp.conf.xml
@@ -60,4 +60,24 @@
       </params>
     </profile>
   </commands>
+  <logging>
+    <profile name="default">
+      <connections>
+	<connection name="primary">
+	  <param name="hostname" value="localhost"/>
+	  <param name="virtualhost" value="/"/>
+	  <param name="username" value="guest"/>
+	  <param name="password" value="guest"/>
+	  <param name="port" value="5672"/>
+	  <param name="heartbeat" value="0"/>
+	</connection>
+      </connections>
+      <params>
+	<param name="exchange-name" value="TAP.Logging"/>
+	<param name="send_queue_size" value="5000"/>
+	<param name="reconnect_interval_ms" value="1000"/>
+	<param name="log-levels" value="debug,info,notice,warning,err,crit,alert"/>
+      </params>
+    </profile>
+  </logging>
 </configuration>
diff --git a/conf/vanilla/autoload_configs/cidlookup.conf.xml b/conf/vanilla/autoload_configs/cidlookup.conf.xml
index a30f9f5c10..fd28f9b0c0 100644
--- a/conf/vanilla/autoload_configs/cidlookup.conf.xml
+++ b/conf/vanilla/autoload_configs/cidlookup.conf.xml
@@ -1,7 +1,7 @@
 <configuration name="cidlookup.conf" description="cidlookup Configuration">
   <settings>
     <!-- comment out url to not setup a url based lookup -->
-    <param name="url" value="http://query.voipcnam.com/query.php?api_key=MYAPIKEY&number=${caller_id_number}"/>
+    <param name="url" value="http://query.voipcnam.com/query.php?api_key=MYAPIKEY&amp;number=${caller_id_number}"/>
 
     <!-- comment out whitepages-apikey to not use whitepages.com, you must
          get an API key from http://developer.whitepages.com/ -->
diff --git a/conf/vanilla/autoload_configs/modules.conf.xml b/conf/vanilla/autoload_configs/modules.conf.xml
index 5437153660..c850d7365f 100644
--- a/conf/vanilla/autoload_configs/modules.conf.xml
+++ b/conf/vanilla/autoload_configs/modules.conf.xml
@@ -94,7 +94,6 @@
     <load module="mod_amr"/>
     <!--<load module="mod_ilbc"/>-->
     <!--<load module="mod_h26x"/>-->
-    <load module="mod_vpx"/>
     <load module="mod_b64"/>
     <!--<load module="mod_siren"/>-->
     <!--<load module="mod_isac"/>-->
diff --git a/conf/vanilla/autoload_configs/timezones.conf.xml b/conf/vanilla/autoload_configs/timezones.conf.xml
index 757d16c19d..50bd7f3760 100644
--- a/conf/vanilla/autoload_configs/timezones.conf.xml
+++ b/conf/vanilla/autoload_configs/timezones.conf.xml
@@ -13,7 +13,7 @@
 	<zone name="Africa/Blantyre" value="CAT-2" />
 	<zone name="Africa/Brazzaville" value="WAT-1" />
 	<zone name="Africa/Bujumbura" value="CAT-2" />
-	<zone name="Africa/Cairo" value="EEST" />
+	<zone name="Africa/Cairo" value="EET-2" />
 	<zone name="Africa/Casablanca" value="WET0WEST,M3.5.0,M10.5.0/3" />
 	<zone name="Africa/Ceuta" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Africa/Conakry" value="GMT0" />
@@ -55,7 +55,7 @@
 	<zone name="Africa/Tunis" value="CET-1" />
 	<zone name="Africa/Windhoek" value="WAT-1WAST,M9.1.0,M4.1.0" />
 
-	<zone name="America/Adak" value="HAST10HADT,M3.2.0,M11.1.0" />
+	<zone name="America/Adak" value="HST10HDT,M3.2.0,M11.1.0" />
 	<zone name="America/Anchorage" value="AKST9AKDT,M3.2.0,M11.1.0" />
 	<zone name="America/Anguilla" value="AST4" />
 	<zone name="America/Antigua" value="AST4" />
@@ -76,7 +76,7 @@
 	<zone name="America/Aruba" value="AST4" />
 	<zone name="America/Asuncion" value="PYT4PYST,M10.1.0/0,M3.4.0/0" />
 	<zone name="America/Atikokan" value="EST5" />
-	<zone name="America/Atka" value="HAST10HADT,M3.2.0,M11.1.0" />
+	<zone name="America/Atka" value="HST10HDT,M3.2.0,M11.1.0" />
 	<zone name="America/Bahia" value="BRT3" />
 	<zone name="America/Bahia_Banderas" value="CST6CDT,M4.1.0,M10.5.0" />
 	<zone name="America/Barbados" value="AST4" />
@@ -89,11 +89,11 @@
 	<zone name="America/Buenos_Aires" value="ART3" />
 	<zone name="America/Cambridge_Bay" value="MST7MDT,M3.2.0,M11.1.0" />
 	<zone name="America/Campo_Grande" value="AMT4AMST,M10.3.0/0,M2.3.0/0" />
-	<zone name="America/Cancun" value="CST6CDT,M4.1.0,M10.5.0" />
+	<zone name="America/Cancun" value="EST5" />
 	<zone name="America/Caracas" value="VET4:30" />
 	<zone name="America/Catamarca" value="ART3" />
 	<zone name="America/Cayenne" value="GFT3" />
-	<zone name="America/Cayman" value="EST5" />
+	<zone name="America/Cayman" value="EST5EDT,M3.2.0,M11.1.0" />
 	<zone name="America/Chicago" value="CST6CDT,M3.2.0,M11.1.0" />
 	<zone name="America/Chihuahua" value="MST7MDT,M4.1.0,M10.5.0" />
 	<zone name="America/Coral_Harbour" value="EST5" />
@@ -112,12 +112,13 @@
 	<zone name="America/Eirunepe" value="ACT5" />
 	<zone name="America/El_Salvador" value="CST6" />
 	<zone name="America/Ensenada" value="PST8PDT,M3.2.0,M11.1.0" />
+	<zone name="America/Fort_Nelson" value="MST7" />
 	<zone name="America/Fort_Wayne" value="EST5EDT,M3.2.0,M11.1.0" />
 	<zone name="America/Fortaleza" value="BRT3" />
 	<zone name="America/Glace_Bay" value="AST4ADT,M3.2.0,M11.1.0" />
 	<zone name="America/Godthab" value="WGST" />
 	<zone name="America/Goose_Bay" value="AST4ADT,M3.2.0,M11.1.0" />
-	<zone name="America/Grand_Turk" value="EST5EDT,M3.2.0,M11.1.0" />
+	<zone name="America/Grand_Turk" value="AST4" />
 	<zone name="America/Grenada" value="AST4" />
 	<zone name="America/Guadeloupe" value="AST4" />
 	<zone name="America/Guatemala" value="CST6" />
@@ -159,12 +160,12 @@
 	<zone name="America/Mendoza" value="ART3" />
 	<zone name="America/Menominee" value="CST6CDT,M3.2.0,M11.1.0" />
 	<zone name="America/Merida" value="CST6CDT,M4.1.0,M10.5.0" />
-	<zone name="America/Metlakatla" value="MeST8" />
+	<zone name="America/Metlakatla" value="PST8" />
 	<zone name="America/Mexico_City" value="CST6CDT,M4.1.0,M10.5.0" />
 	<zone name="America/Miquelon" value="PMST3PMDT,M3.2.0,M11.1.0" />
 	<zone name="America/Moncton" value="AST4ADT,M3.2.0,M11.1.0" />
 	<zone name="America/Monterrey" value="CST6CDT,M4.1.0,M10.5.0" />
-	<zone name="America/Montevideo" value="UYT3UYST,M10.1.0,M3.2.0" />
+	<zone name="America/Montevideo" value="UYT3" />
 	<zone name="America/Montreal" value="EST5EDT,M3.2.0,M11.1.0" />
 	<zone name="America/Montserrat" value="AST4" />
 	<zone name="America/Nassau" value="EST5EDT,M3.2.0,M11.1.0" />
@@ -194,7 +195,7 @@
 	<zone name="America/Rosario" value="ART3" />
 	<zone name="America/Santa_Isabel" value="PST8PDT,M4.1.0,M10.5.0" />
 	<zone name="America/Santarem" value="BRT3" />
-	<zone name="America/Santiago" value="CLST" />
+	<zone name="America/Santiago" value="CLT3" />
 	<zone name="America/Santo_Domingo" value="AST4" />
 	<zone name="America/Sao_Paulo" value="BRT3BRST,M10.3.0/0,M2.3.0/0" />
 	<zone name="America/Scoresbysund" value="EGT1EGST,M3.5.0/0,M10.5.0/1" />
@@ -220,13 +221,13 @@
 	<zone name="America/Yakutat" value="AKST9AKDT,M3.2.0,M11.1.0" />
 	<zone name="America/Yellowknife" value="MST7MDT,M3.2.0,M11.1.0" />
 
-	<zone name="Antarctica/Casey" value="WST-8" />
+	<zone name="Antarctica/Casey" value="AWST-8" />
 	<zone name="Antarctica/Davis" value="DAVT-7" />
 	<zone name="Antarctica/DumontDUrville" value="DDUT-10" />
 	<zone name="Antarctica/Macquarie" value="MIST-11" />
 	<zone name="Antarctica/Mawson" value="MAWT-5" />
 	<zone name="Antarctica/McMurdo" value="NZST-12NZDT,M9.5.0,M4.1.0/3" />
-	<zone name="Antarctica/Palmer" value="CLST" />
+	<zone name="Antarctica/Palmer" value="CLT3" />
 	<zone name="Antarctica/Rothera" value="ROTT3" />
 	<zone name="Antarctica/South_Pole" value="NZST-12NZDT,M9.5.0,M4.1.0/3" />
 	<zone name="Antarctica/Syowa" value="SYOT-3" />
@@ -251,7 +252,8 @@
 	<zone name="Asia/Bishkek" value="KGT-6" />
 	<zone name="Asia/Brunei" value="BNT-8" />
 	<zone name="Asia/Calcutta" value="IST-5:30" />
-	<zone name="Asia/Choibalsan" value="CHOT-8" />
+	<zone name="Asia/Chita" value="IRKT-8" />
+	<zone name="Asia/Choibalsan" value="CHOT-8CHOST,M3.5.6,M9.5.6/0" />
 	<zone name="Asia/Chongqing" value="CST-8" />
 	<zone name="Asia/Chungking" value="CST-8" />
 	<zone name="Asia/Colombo" value="IST-5:30" />
@@ -266,8 +268,8 @@
 	<zone name="Asia/Hebron" value="EEST" />
 	<zone name="Asia/Ho_Chi_Minh" value="ICT-7" />
 	<zone name="Asia/Hong_Kong" value="HKT-8" />
-	<zone name="Asia/Hovd" value="HOVT-7" />
-	<zone name="Asia/Irkutsk" value="IRKT-9" />
+	<zone name="Asia/Hovd" value="HOVT-7HOVST,M3.5.6,M9.5.6/0" />
+	<zone name="Asia/Irkutsk" value="IRKT-8" />
 	<zone name="Asia/Istanbul" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="Asia/Jakarta" value="WIB-7" />
 	<zone name="Asia/Jayapura" value="WIT-9" />
@@ -275,39 +277,40 @@
 	<zone name="Asia/Kabul" value="AFT-4:30" />
 	<zone name="Asia/Kamchatka" value="PETT-12" />
 	<zone name="Asia/Karachi" value="PKT-5" />
-	<zone name="Asia/Kashgar" value="CST-8" />
+	<zone name="Asia/Kashgar" value="XJT-6" />
 	<zone name="Asia/Kathmandu" value="NPT-5:45" />
 	<zone name="Asia/Katmandu" value="NPT-5:45" />
-	<zone name="Asia/Khandyga" value="YAKT-10" />
+	<zone name="Asia/Khandyga" value="YAKT-9" />
 	<zone name="Asia/Kolkata" value="IST-5:30" />
-	<zone name="Asia/Krasnoyarsk" value="KRAT-8" />
+	<zone name="Asia/Krasnoyarsk" value="KRAT-7" />
 	<zone name="Asia/Kuala_Lumpur" value="MYT-8" />
 	<zone name="Asia/Kuching" value="MYT-8" />
 	<zone name="Asia/Kuwait" value="AST-3" />
 	<zone name="Asia/Macao" value="CST-8" />
 	<zone name="Asia/Macau" value="CST-8" />
-	<zone name="Asia/Magadan" value="MAGT-12" />
+	<zone name="Asia/Magadan" value="MAGT-10" />
 	<zone name="Asia/Makassar" value="WITA-8" />
 	<zone name="Asia/Manila" value="PHT-8" />
 	<zone name="Asia/Muscat" value="GST-4" />
 	<zone name="Asia/Nicosia" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
-	<zone name="Asia/Novokuznetsk" value="NOVT-7" />
-	<zone name="Asia/Novosibirsk" value="NOVT-7" />
-	<zone name="Asia/Omsk" value="OMST-7" />
+	<zone name="Asia/Novokuznetsk" value="KRAT-7" />
+	<zone name="Asia/Novosibirsk" value="NOVT-6" />
+	<zone name="Asia/Omsk" value="OMST-6" />
 	<zone name="Asia/Oral" value="ORAT-5" />
 	<zone name="Asia/Phnom_Penh" value="ICT-7" />
 	<zone name="Asia/Pontianak" value="WIB-7" />
-	<zone name="Asia/Pyongyang" value="KST-9" />
+	<zone name="Asia/Pyongyang" value="KST-8:30" />
 	<zone name="Asia/Qatar" value="AST-3" />
 	<zone name="Asia/Qyzylorda" value="QYZT-6" />
 	<zone name="Asia/Rangoon" value="MMT-6:30" />
 	<zone name="Asia/Riyadh" value="AST-3" />
 	<zone name="Asia/Saigon" value="ICT-7" />
-	<zone name="Asia/Sakhalin" value="SAKT-11" />
+	<zone name="Asia/Sakhalin" value="SAKT-10" />
 	<zone name="Asia/Samarkand" value="UZT-5" />
 	<zone name="Asia/Seoul" value="KST-9" />
 	<zone name="Asia/Shanghai" value="CST-8" />
 	<zone name="Asia/Singapore" value="SGT-8" />
+	<zone name="Asia/Srednekolymsk" value="SRET-11" />
 	<zone name="Asia/Taipei" value="CST-8" />
 	<zone name="Asia/Tashkent" value="UZT-5" />
 	<zone name="Asia/Tbilisi" value="GET-4" />
@@ -317,14 +320,14 @@
 	<zone name="Asia/Thimphu" value="BTT-6" />
 	<zone name="Asia/Tokyo" value="JST-9" />
 	<zone name="Asia/Ujung_Pandang" value="WITA-8" />
-	<zone name="Asia/Ulaanbaatar" value="ULAT-8" />
-	<zone name="Asia/Ulan_Bator" value="ULAT-8" />
-	<zone name="Asia/Urumqi" value="CST-8" />
-	<zone name="Asia/Ust-Nera" value="VLAT-11" />
+	<zone name="Asia/Ulaanbaatar" value="ULAT-8ULAST,M3.5.6,M9.5.6/0" />
+	<zone name="Asia/Ulan_Bator" value="ULAT-8ULAST,M3.5.6,M9.5.6/0" />
+	<zone name="Asia/Urumqi" value="XJT-6" />
+	<zone name="Asia/Ust-Nera" value="VLAT-10" />
 	<zone name="Asia/Vientiane" value="ICT-7" />
-	<zone name="Asia/Vladivostok" value="VLAT-11" />
-	<zone name="Asia/Yakutsk" value="YAKT-10" />
-	<zone name="Asia/Yekaterinburg" value="YEKT-6" />
+	<zone name="Asia/Vladivostok" value="VLAT-10" />
+	<zone name="Asia/Yakutsk" value="YAKT-9" />
+	<zone name="Asia/Yekaterinburg" value="YEKT-5" />
 	<zone name="Asia/Yerevan" value="AMT-4" />
 
 	<zone name="Atlantic/Azores" value="AZOT1AZOST,M3.5.0/0,M10.5.0/1" />
@@ -340,29 +343,29 @@
 	<zone name="Atlantic/St_Helena" value="GMT0" />
 	<zone name="Atlantic/Stanley" value="FKST3" />
 
-	<zone name="Australia/ACT" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="Australia/Adelaide" value="CST-9:30CST,M10.1.0,M4.1.0/3" />
-	<zone name="Australia/Brisbane" value="EST-10" />
-	<zone name="Australia/Broken_Hill" value="CST-9:30CST,M10.1.0,M4.1.0/3" />
-	<zone name="Australia/Canberra" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="Australia/Currie" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="Australia/Darwin" value="CST-9:30" />
-	<zone name="Australia/Eucla" value="CWST-8:45" />
-	<zone name="Australia/Hobart" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="Australia/LHI" value="LHST-10:30LHST-11,M10.1.0,M4.1.0" />
-	<zone name="Australia/Lindeman" value="EST-10" />
-	<zone name="Australia/Lord_Howe" value="LHST-10:30LHST-11,M10.1.0,M4.1.0" />
-	<zone name="Australia/Melbourne" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="Australia/NSW" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="Australia/North" value="CST-9:30" />
-	<zone name="Australia/Perth" value="WST-8" />
-	<zone name="Australia/Queensland" value="EST-10" />
-	<zone name="Australia/South" value="CST-9:30CST,M10.1.0,M4.1.0/3" />
-	<zone name="Australia/Sydney" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="Australia/Tasmania" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="Australia/Victoria" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="Australia/West" value="WST-8" />
-	<zone name="Australia/Yancowinna" value="CST-9:30CST,M10.1.0,M4.1.0/3" />
+	<zone name="Australia/ACT" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="Australia/Adelaide" value="ACST-9:30ACDT,M10.1.0,M4.1.0/3" />
+	<zone name="Australia/Brisbane" value="AEST-10" />
+	<zone name="Australia/Broken_Hill" value="ACST-9:30ACDT,M10.1.0,M4.1.0/3" />
+	<zone name="Australia/Canberra" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="Australia/Currie" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="Australia/Darwin" value="ACST-9:30" />
+	<zone name="Australia/Eucla" value="ACWST-8:45" />
+	<zone name="Australia/Hobart" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="Australia/LHI" value="LHST-10:30LHDT-11,M10.1.0,M4.1.0" />
+	<zone name="Australia/Lindeman" value="AEST-10" />
+	<zone name="Australia/Lord_Howe" value="LHST-10:30LHDT-11,M10.1.0,M4.1.0" />
+	<zone name="Australia/Melbourne" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="Australia/NSW" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="Australia/North" value="ACST-9:30" />
+	<zone name="Australia/Perth" value="AWST-8" />
+	<zone name="Australia/Queensland" value="AEST-10" />
+	<zone name="Australia/South" value="ACST-9:30ACDT,M10.1.0,M4.1.0/3" />
+	<zone name="Australia/Sydney" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="Australia/Tasmania" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="Australia/Victoria" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="Australia/West" value="AWST-8" />
+	<zone name="Australia/Yancowinna" value="ACST-9:30ACDT,M10.1.0,M4.1.0/3" />
 
 	<zone name="Brazil/Acre" value="ACT5" />
 	<zone name="Brazil/DeNoronha" value="FNT2" />
@@ -383,8 +386,8 @@
 	<zone name="Canada/Saskatchewan" value="CST6" />
 	<zone name="Canada/Yukon" value="PST8PDT,M3.2.0,M11.1.0" />
 
-	<zone name="Chile/Continental" value="CLST" />
-	<zone name="Chile/EasterIsland" value="EASST" />
+	<zone name="Chile/Continental" value="CLT3" />
+	<zone name="Chile/EasterIsland" value="EAST5" />
 
 	<zone name="Cuba" value="CST5CDT,M3.2.0/0,M11.1.0/1" />
 
@@ -394,39 +397,39 @@
 
 	<zone name="EST5EDT" value="EST5EDT,M3.2.0,M11.1.0" />
 
-	<zone name="Egypt" value="EEST" />
+	<zone name="Egypt" value="EET-2" />
 
 	<zone name="Eire" value="GMT0IST,M3.5.0/1,M10.5.0" />
 
 	<zone name="Etc/GMT" value="GMT0" />
 	<zone name="Etc/GMT+0" value="GMT0" />
-	<zone name="Etc/GMT+1" value="<GMT+1>1" />
-	<zone name="Etc/GMT+10" value="<GMT+10>10" />
-	<zone name="Etc/GMT+11" value="<GMT+11>11" />
-	<zone name="Etc/GMT+12" value="<GMT+12>12" />
-	<zone name="Etc/GMT+2" value="<GMT+2>2" />
-	<zone name="Etc/GMT+3" value="<GMT+3>3" />
-	<zone name="Etc/GMT+4" value="<GMT+4>4" />
-	<zone name="Etc/GMT+5" value="<GMT+5>5" />
-	<zone name="Etc/GMT+6" value="<GMT+6>6" />
-	<zone name="Etc/GMT+7" value="<GMT+7>7" />
-	<zone name="Etc/GMT+8" value="<GMT+8>8" />
-	<zone name="Etc/GMT+9" value="<GMT+9>9" />
+	<zone name="Etc/GMT+1" value="&lt;GMT+1&gt;1" />
+	<zone name="Etc/GMT+10" value="&lt;GMT+10&gt;10" />
+	<zone name="Etc/GMT+11" value="&lt;GMT+11&gt;11" />
+	<zone name="Etc/GMT+12" value="&lt;GMT+12&gt;12" />
+	<zone name="Etc/GMT+2" value="&lt;GMT+2&gt;2" />
+	<zone name="Etc/GMT+3" value="&lt;GMT+3&gt;3" />
+	<zone name="Etc/GMT+4" value="&lt;GMT+4&gt;4" />
+	<zone name="Etc/GMT+5" value="&lt;GMT+5&gt;5" />
+	<zone name="Etc/GMT+6" value="&lt;GMT+6&gt;6" />
+	<zone name="Etc/GMT+7" value="&lt;GMT+7&gt;7" />
+	<zone name="Etc/GMT+8" value="&lt;GMT+8&gt;8" />
+	<zone name="Etc/GMT+9" value="&lt;GMT+9&gt;9" />
 	<zone name="Etc/GMT-0" value="GMT0" />
-	<zone name="Etc/GMT-1" value="<GMT-1>-1" />
-	<zone name="Etc/GMT-10" value="<GMT-10>-10" />
-	<zone name="Etc/GMT-11" value="<GMT-11>-11" />
-	<zone name="Etc/GMT-12" value="<GMT-12>-12" />
-	<zone name="Etc/GMT-13" value="<GMT-13>-13" />
-	<zone name="Etc/GMT-14" value="<GMT-14>-14" />
-	<zone name="Etc/GMT-2" value="<GMT-2>-2" />
-	<zone name="Etc/GMT-3" value="<GMT-3>-3" />
-	<zone name="Etc/GMT-4" value="<GMT-4>-4" />
-	<zone name="Etc/GMT-5" value="<GMT-5>-5" />
-	<zone name="Etc/GMT-6" value="<GMT-6>-6" />
-	<zone name="Etc/GMT-7" value="<GMT-7>-7" />
-	<zone name="Etc/GMT-8" value="<GMT-8>-8" />
-	<zone name="Etc/GMT-9" value="<GMT-9>-9" />
+	<zone name="Etc/GMT-1" value="&lt;GMT-1&gt;-1" />
+	<zone name="Etc/GMT-10" value="&lt;GMT-10&gt;-10" />
+	<zone name="Etc/GMT-11" value="&lt;GMT-11&gt;-11" />
+	<zone name="Etc/GMT-12" value="&lt;GMT-12&gt;-12" />
+	<zone name="Etc/GMT-13" value="&lt;GMT-13&gt;-13" />
+	<zone name="Etc/GMT-14" value="&lt;GMT-14&gt;-14" />
+	<zone name="Etc/GMT-2" value="&lt;GMT-2&gt;-2" />
+	<zone name="Etc/GMT-3" value="&lt;GMT-3&gt;-3" />
+	<zone name="Etc/GMT-4" value="&lt;GMT-4&gt;-4" />
+	<zone name="Etc/GMT-5" value="&lt;GMT-5&gt;-5" />
+	<zone name="Etc/GMT-6" value="&lt;GMT-6&gt;-6" />
+	<zone name="Etc/GMT-7" value="&lt;GMT-7&gt;-7" />
+	<zone name="Etc/GMT-8" value="&lt;GMT-8&gt;-8" />
+	<zone name="Etc/GMT-9" value="&lt;GMT-9&gt;-9" />
 	<zone name="Etc/GMT0" value="GMT0" />
 	<zone name="Etc/Greenwich" value="GMT0" />
 	<zone name="Etc/UCT" value="UCT0" />
@@ -445,7 +448,7 @@
 	<zone name="Europe/Bucharest" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="Europe/Budapest" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Busingen" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="Europe/Chisinau" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
+	<zone name="Europe/Chisinau" value="EET-2EEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Copenhagen" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Dublin" value="GMT0IST,M3.5.0/1,M10.5.0" />
 	<zone name="Europe/Gibraltar" value="CET-1CEST,M3.5.0,M10.5.0/3" />
@@ -454,7 +457,7 @@
 	<zone name="Europe/Isle_of_Man" value="GMT0BST,M3.5.0/1,M10.5.0" />
 	<zone name="Europe/Istanbul" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="Europe/Jersey" value="GMT0BST,M3.5.0/1,M10.5.0" />
-	<zone name="Europe/Kaliningrad" value="FET-3" />
+	<zone name="Europe/Kaliningrad" value="EET-2" />
 	<zone name="Europe/Kiev" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="Europe/Lisbon" value="WET0WEST,M3.5.0/1,M10.5.0" />
 	<zone name="Europe/Ljubljana" value="CET-1CEST,M3.5.0,M10.5.0/3" />
@@ -463,9 +466,9 @@
 	<zone name="Europe/Madrid" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Malta" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Mariehamn" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
-	<zone name="Europe/Minsk" value="FET-3" />
+	<zone name="Europe/Minsk" value="MSK-3" />
 	<zone name="Europe/Monaco" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="Europe/Moscow" value="MSK-4" />
+	<zone name="Europe/Moscow" value="MSK-3" />
 	<zone name="Europe/Nicosia" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="Europe/Oslo" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Paris" value="CET-1CEST,M3.5.0,M10.5.0/3" />
@@ -476,26 +479,24 @@
 	<zone name="Europe/Samara" value="SAMT-4" />
 	<zone name="Europe/San_Marino" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Sarajevo" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="Europe/Simferopol" value="MSK-4" />
+	<zone name="Europe/Simferopol" value="MSK-3" />
 	<zone name="Europe/Skopje" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Sofia" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="Europe/Stockholm" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Tallinn" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="Europe/Tirane" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="Europe/Tiraspol" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
+	<zone name="Europe/Tiraspol" value="EET-2EEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Uzhgorod" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="Europe/Vaduz" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Vatican" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Vienna" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Vilnius" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
-	<zone name="Europe/Volgograd" value="VOLT-4" />
+	<zone name="Europe/Volgograd" value="MSK-3" />
 	<zone name="Europe/Warsaw" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Zagreb" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="Europe/Zaporozhye" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="Europe/Zurich" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 
-	<zone name="Factory" value="<Local time zone must be set--see zic manual page>0" />
-
 	<zone name="GB" value="GMT0BST,M3.5.0/1,M10.5.0" />
 
 	<zone name="GB-Eire" value="GMT0BST,M3.5.0/1,M10.5.0" />
@@ -560,15 +561,16 @@
 
 	<zone name="PST8PDT" value="PST8PDT,M3.2.0,M11.1.0" />
 
-	<zone name="Pacific/Apia" value="WST-13WSDT,M9.5.0/3,M4.1.0/4" />
+	<zone name="Pacific/Apia" value="WSST-13WSDT,M9.5.0/3,M4.1.0/4" />
 	<zone name="Pacific/Auckland" value="NZST-12NZDT,M9.5.0,M4.1.0/3" />
+	<zone name="Pacific/Bougainville" value="BST-11" />
 	<zone name="Pacific/Chatham" value="CHAST-12:45CHADT,M9.5.0/2:45,M4.1.0/3:45" />
 	<zone name="Pacific/Chuuk" value="CHUT-10" />
-	<zone name="Pacific/Easter" value="EASST" />
+	<zone name="Pacific/Easter" value="EAST5" />
 	<zone name="Pacific/Efate" value="VUT-11" />
 	<zone name="Pacific/Enderbury" value="PHOT-13" />
 	<zone name="Pacific/Fakaofo" value="TKT-13" />
-	<zone name="Pacific/Fiji" value="FJST" />
+	<zone name="Pacific/Fiji" value="FJT-12FJST,M11.1.0,M1.3.0/3" />
 	<zone name="Pacific/Funafuti" value="TVT-12" />
 	<zone name="Pacific/Galapagos" value="GALT6" />
 	<zone name="Pacific/Gambier" value="GAMT9" />
@@ -584,7 +586,7 @@
 	<zone name="Pacific/Midway" value="SST11" />
 	<zone name="Pacific/Nauru" value="NRT-12" />
 	<zone name="Pacific/Niue" value="NUT11" />
-	<zone name="Pacific/Norfolk" value="NFT-11:30" />
+	<zone name="Pacific/Norfolk" value="NFT-11" />
 	<zone name="Pacific/Noumea" value="NCT-11" />
 	<zone name="Pacific/Pago_Pago" value="SST11" />
 	<zone name="Pacific/Palau" value="PWT-9" />
@@ -613,26 +615,12 @@
 
 	<zone name="Singapore" value="SGT-8" />
 
-	<zone name="SystemV/AST4" value="AST4" />
-	<zone name="SystemV/AST4ADT" value="AST4ADT,M3.2.0,M11.1.0" />
-	<zone name="SystemV/CST6" value="CST6" />
-	<zone name="SystemV/CST6CDT" value="CST6CDT,M3.2.0,M11.1.0" />
-	<zone name="SystemV/EST5" value="EST5" />
-	<zone name="SystemV/EST5EDT" value="EST5EDT,M3.2.0,M11.1.0" />
-	<zone name="SystemV/HST10" value="HST10" />
-	<zone name="SystemV/MST7" value="MST7" />
-	<zone name="SystemV/MST7MDT" value="MST7MDT,M3.2.0,M11.1.0" />
-	<zone name="SystemV/PST8" value="PST8" />
-	<zone name="SystemV/PST8PDT" value="PST8PDT,M3.2.0,M11.1.0" />
-	<zone name="SystemV/YST9" value="GAMT9" />
-	<zone name="SystemV/YST9YDT" value="AKST9AKDT,M3.2.0,M11.1.0" />
-
 	<zone name="Turkey" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 
 	<zone name="UCT" value="UCT0" />
 
 	<zone name="US/Alaska" value="AKST9AKDT,M3.2.0,M11.1.0" />
-	<zone name="US/Aleutian" value="HAST10HADT,M3.2.0,M11.1.0" />
+	<zone name="US/Aleutian" value="HST10HDT,M3.2.0,M11.1.0" />
 	<zone name="US/Arizona" value="MST7" />
 	<zone name="US/Central" value="CST6CDT,M3.2.0,M11.1.0" />
 	<zone name="US/East-Indiana" value="EST5EDT,M3.2.0,M11.1.0" />
@@ -649,14 +637,12 @@
 
 	<zone name="Universal" value="UTC0" />
 
-	<zone name="W-SU" value="MSK-4" />
+	<zone name="W-SU" value="MSK-3" />
 
 	<zone name="WET" value="WET0WEST,M3.5.0/1,M10.5.0" />
 
 	<zone name="Zulu" value="UTC0" />
 
-	<zone name="localtime" value="CST6CDT,M3.2.0,M11.1.0" />
-
 	<zone name="posix/Africa/Abidjan" value="GMT0" />
 	<zone name="posix/Africa/Accra" value="GMT0" />
 	<zone name="posix/Africa/Addis_Ababa" value="EAT-3" />
@@ -670,7 +656,7 @@
 	<zone name="posix/Africa/Blantyre" value="CAT-2" />
 	<zone name="posix/Africa/Brazzaville" value="WAT-1" />
 	<zone name="posix/Africa/Bujumbura" value="CAT-2" />
-	<zone name="posix/Africa/Cairo" value="EEST" />
+	<zone name="posix/Africa/Cairo" value="EET-2" />
 	<zone name="posix/Africa/Casablanca" value="WET0WEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Africa/Ceuta" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Africa/Conakry" value="GMT0" />
@@ -711,7 +697,7 @@
 	<zone name="posix/Africa/Tripoli" value="EET-2" />
 	<zone name="posix/Africa/Tunis" value="CET-1" />
 	<zone name="posix/Africa/Windhoek" value="WAT-1WAST,M9.1.0,M4.1.0" />
-	<zone name="posix/America/Adak" value="HAST10HADT,M3.2.0,M11.1.0" />
+	<zone name="posix/America/Adak" value="HST10HDT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Anchorage" value="AKST9AKDT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Anguilla" value="AST4" />
 	<zone name="posix/America/Antigua" value="AST4" />
@@ -732,7 +718,7 @@
 	<zone name="posix/America/Aruba" value="AST4" />
 	<zone name="posix/America/Asuncion" value="PYT4PYST,M10.1.0/0,M3.4.0/0" />
 	<zone name="posix/America/Atikokan" value="EST5" />
-	<zone name="posix/America/Atka" value="HAST10HADT,M3.2.0,M11.1.0" />
+	<zone name="posix/America/Atka" value="HST10HDT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Bahia" value="BRT3" />
 	<zone name="posix/America/Bahia_Banderas" value="CST6CDT,M4.1.0,M10.5.0" />
 	<zone name="posix/America/Barbados" value="AST4" />
@@ -745,11 +731,11 @@
 	<zone name="posix/America/Buenos_Aires" value="ART3" />
 	<zone name="posix/America/Cambridge_Bay" value="MST7MDT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Campo_Grande" value="AMT4AMST,M10.3.0/0,M2.3.0/0" />
-	<zone name="posix/America/Cancun" value="CST6CDT,M4.1.0,M10.5.0" />
+	<zone name="posix/America/Cancun" value="EST5" />
 	<zone name="posix/America/Caracas" value="VET4:30" />
 	<zone name="posix/America/Catamarca" value="ART3" />
 	<zone name="posix/America/Cayenne" value="GFT3" />
-	<zone name="posix/America/Cayman" value="EST5" />
+	<zone name="posix/America/Cayman" value="EST5EDT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Chicago" value="CST6CDT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Chihuahua" value="MST7MDT,M4.1.0,M10.5.0" />
 	<zone name="posix/America/Coral_Harbour" value="EST5" />
@@ -768,12 +754,13 @@
 	<zone name="posix/America/Eirunepe" value="ACT5" />
 	<zone name="posix/America/El_Salvador" value="CST6" />
 	<zone name="posix/America/Ensenada" value="PST8PDT,M3.2.0,M11.1.0" />
+	<zone name="posix/America/Fort_Nelson" value="MST7" />
 	<zone name="posix/America/Fort_Wayne" value="EST5EDT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Fortaleza" value="BRT3" />
 	<zone name="posix/America/Glace_Bay" value="AST4ADT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Godthab" value="WGST" />
 	<zone name="posix/America/Goose_Bay" value="AST4ADT,M3.2.0,M11.1.0" />
-	<zone name="posix/America/Grand_Turk" value="EST5EDT,M3.2.0,M11.1.0" />
+	<zone name="posix/America/Grand_Turk" value="AST4" />
 	<zone name="posix/America/Grenada" value="AST4" />
 	<zone name="posix/America/Guadeloupe" value="AST4" />
 	<zone name="posix/America/Guatemala" value="CST6" />
@@ -815,12 +802,12 @@
 	<zone name="posix/America/Mendoza" value="ART3" />
 	<zone name="posix/America/Menominee" value="CST6CDT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Merida" value="CST6CDT,M4.1.0,M10.5.0" />
-	<zone name="posix/America/Metlakatla" value="MeST8" />
+	<zone name="posix/America/Metlakatla" value="PST8" />
 	<zone name="posix/America/Mexico_City" value="CST6CDT,M4.1.0,M10.5.0" />
 	<zone name="posix/America/Miquelon" value="PMST3PMDT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Moncton" value="AST4ADT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Monterrey" value="CST6CDT,M4.1.0,M10.5.0" />
-	<zone name="posix/America/Montevideo" value="UYT3UYST,M10.1.0,M3.2.0" />
+	<zone name="posix/America/Montevideo" value="UYT3" />
 	<zone name="posix/America/Montreal" value="EST5EDT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Montserrat" value="AST4" />
 	<zone name="posix/America/Nassau" value="EST5EDT,M3.2.0,M11.1.0" />
@@ -850,7 +837,7 @@
 	<zone name="posix/America/Rosario" value="ART3" />
 	<zone name="posix/America/Santa_Isabel" value="PST8PDT,M4.1.0,M10.5.0" />
 	<zone name="posix/America/Santarem" value="BRT3" />
-	<zone name="posix/America/Santiago" value="CLST" />
+	<zone name="posix/America/Santiago" value="CLT3" />
 	<zone name="posix/America/Santo_Domingo" value="AST4" />
 	<zone name="posix/America/Sao_Paulo" value="BRT3BRST,M10.3.0/0,M2.3.0/0" />
 	<zone name="posix/America/Scoresbysund" value="EGT1EGST,M3.5.0/0,M10.5.0/1" />
@@ -875,13 +862,13 @@
 	<zone name="posix/America/Winnipeg" value="CST6CDT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Yakutat" value="AKST9AKDT,M3.2.0,M11.1.0" />
 	<zone name="posix/America/Yellowknife" value="MST7MDT,M3.2.0,M11.1.0" />
-	<zone name="posix/Antarctica/Casey" value="WST-8" />
+	<zone name="posix/Antarctica/Casey" value="AWST-8" />
 	<zone name="posix/Antarctica/Davis" value="DAVT-7" />
 	<zone name="posix/Antarctica/DumontDUrville" value="DDUT-10" />
 	<zone name="posix/Antarctica/Macquarie" value="MIST-11" />
 	<zone name="posix/Antarctica/Mawson" value="MAWT-5" />
 	<zone name="posix/Antarctica/McMurdo" value="NZST-12NZDT,M9.5.0,M4.1.0/3" />
-	<zone name="posix/Antarctica/Palmer" value="CLST" />
+	<zone name="posix/Antarctica/Palmer" value="CLT3" />
 	<zone name="posix/Antarctica/Rothera" value="ROTT3" />
 	<zone name="posix/Antarctica/South_Pole" value="NZST-12NZDT,M9.5.0,M4.1.0/3" />
 	<zone name="posix/Antarctica/Syowa" value="SYOT-3" />
@@ -904,7 +891,8 @@
 	<zone name="posix/Asia/Bishkek" value="KGT-6" />
 	<zone name="posix/Asia/Brunei" value="BNT-8" />
 	<zone name="posix/Asia/Calcutta" value="IST-5:30" />
-	<zone name="posix/Asia/Choibalsan" value="CHOT-8" />
+	<zone name="posix/Asia/Chita" value="IRKT-8" />
+	<zone name="posix/Asia/Choibalsan" value="CHOT-8CHOST,M3.5.6,M9.5.6/0" />
 	<zone name="posix/Asia/Chongqing" value="CST-8" />
 	<zone name="posix/Asia/Chungking" value="CST-8" />
 	<zone name="posix/Asia/Colombo" value="IST-5:30" />
@@ -919,8 +907,8 @@
 	<zone name="posix/Asia/Hebron" value="EEST" />
 	<zone name="posix/Asia/Ho_Chi_Minh" value="ICT-7" />
 	<zone name="posix/Asia/Hong_Kong" value="HKT-8" />
-	<zone name="posix/Asia/Hovd" value="HOVT-7" />
-	<zone name="posix/Asia/Irkutsk" value="IRKT-9" />
+	<zone name="posix/Asia/Hovd" value="HOVT-7HOVST,M3.5.6,M9.5.6/0" />
+	<zone name="posix/Asia/Irkutsk" value="IRKT-8" />
 	<zone name="posix/Asia/Istanbul" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="posix/Asia/Jakarta" value="WIB-7" />
 	<zone name="posix/Asia/Jayapura" value="WIT-9" />
@@ -928,39 +916,40 @@
 	<zone name="posix/Asia/Kabul" value="AFT-4:30" />
 	<zone name="posix/Asia/Kamchatka" value="PETT-12" />
 	<zone name="posix/Asia/Karachi" value="PKT-5" />
-	<zone name="posix/Asia/Kashgar" value="CST-8" />
+	<zone name="posix/Asia/Kashgar" value="XJT-6" />
 	<zone name="posix/Asia/Kathmandu" value="NPT-5:45" />
 	<zone name="posix/Asia/Katmandu" value="NPT-5:45" />
-	<zone name="posix/Asia/Khandyga" value="YAKT-10" />
+	<zone name="posix/Asia/Khandyga" value="YAKT-9" />
 	<zone name="posix/Asia/Kolkata" value="IST-5:30" />
-	<zone name="posix/Asia/Krasnoyarsk" value="KRAT-8" />
+	<zone name="posix/Asia/Krasnoyarsk" value="KRAT-7" />
 	<zone name="posix/Asia/Kuala_Lumpur" value="MYT-8" />
 	<zone name="posix/Asia/Kuching" value="MYT-8" />
 	<zone name="posix/Asia/Kuwait" value="AST-3" />
 	<zone name="posix/Asia/Macao" value="CST-8" />
 	<zone name="posix/Asia/Macau" value="CST-8" />
-	<zone name="posix/Asia/Magadan" value="MAGT-12" />
+	<zone name="posix/Asia/Magadan" value="MAGT-10" />
 	<zone name="posix/Asia/Makassar" value="WITA-8" />
 	<zone name="posix/Asia/Manila" value="PHT-8" />
 	<zone name="posix/Asia/Muscat" value="GST-4" />
 	<zone name="posix/Asia/Nicosia" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
-	<zone name="posix/Asia/Novokuznetsk" value="NOVT-7" />
-	<zone name="posix/Asia/Novosibirsk" value="NOVT-7" />
-	<zone name="posix/Asia/Omsk" value="OMST-7" />
+	<zone name="posix/Asia/Novokuznetsk" value="KRAT-7" />
+	<zone name="posix/Asia/Novosibirsk" value="NOVT-6" />
+	<zone name="posix/Asia/Omsk" value="OMST-6" />
 	<zone name="posix/Asia/Oral" value="ORAT-5" />
 	<zone name="posix/Asia/Phnom_Penh" value="ICT-7" />
 	<zone name="posix/Asia/Pontianak" value="WIB-7" />
-	<zone name="posix/Asia/Pyongyang" value="KST-9" />
+	<zone name="posix/Asia/Pyongyang" value="KST-8:30" />
 	<zone name="posix/Asia/Qatar" value="AST-3" />
 	<zone name="posix/Asia/Qyzylorda" value="QYZT-6" />
 	<zone name="posix/Asia/Rangoon" value="MMT-6:30" />
 	<zone name="posix/Asia/Riyadh" value="AST-3" />
 	<zone name="posix/Asia/Saigon" value="ICT-7" />
-	<zone name="posix/Asia/Sakhalin" value="SAKT-11" />
+	<zone name="posix/Asia/Sakhalin" value="SAKT-10" />
 	<zone name="posix/Asia/Samarkand" value="UZT-5" />
 	<zone name="posix/Asia/Seoul" value="KST-9" />
 	<zone name="posix/Asia/Shanghai" value="CST-8" />
 	<zone name="posix/Asia/Singapore" value="SGT-8" />
+	<zone name="posix/Asia/Srednekolymsk" value="SRET-11" />
 	<zone name="posix/Asia/Taipei" value="CST-8" />
 	<zone name="posix/Asia/Tashkent" value="UZT-5" />
 	<zone name="posix/Asia/Tbilisi" value="GET-4" />
@@ -970,14 +959,14 @@
 	<zone name="posix/Asia/Thimphu" value="BTT-6" />
 	<zone name="posix/Asia/Tokyo" value="JST-9" />
 	<zone name="posix/Asia/Ujung_Pandang" value="WITA-8" />
-	<zone name="posix/Asia/Ulaanbaatar" value="ULAT-8" />
-	<zone name="posix/Asia/Ulan_Bator" value="ULAT-8" />
-	<zone name="posix/Asia/Urumqi" value="CST-8" />
-	<zone name="posix/Asia/Ust-Nera" value="VLAT-11" />
+	<zone name="posix/Asia/Ulaanbaatar" value="ULAT-8ULAST,M3.5.6,M9.5.6/0" />
+	<zone name="posix/Asia/Ulan_Bator" value="ULAT-8ULAST,M3.5.6,M9.5.6/0" />
+	<zone name="posix/Asia/Urumqi" value="XJT-6" />
+	<zone name="posix/Asia/Ust-Nera" value="VLAT-10" />
 	<zone name="posix/Asia/Vientiane" value="ICT-7" />
-	<zone name="posix/Asia/Vladivostok" value="VLAT-11" />
-	<zone name="posix/Asia/Yakutsk" value="YAKT-10" />
-	<zone name="posix/Asia/Yekaterinburg" value="YEKT-6" />
+	<zone name="posix/Asia/Vladivostok" value="VLAT-10" />
+	<zone name="posix/Asia/Yakutsk" value="YAKT-9" />
+	<zone name="posix/Asia/Yekaterinburg" value="YEKT-5" />
 	<zone name="posix/Asia/Yerevan" value="AMT-4" />
 	<zone name="posix/Atlantic/Azores" value="AZOT1AZOST,M3.5.0/0,M10.5.0/1" />
 	<zone name="posix/Atlantic/Bermuda" value="AST4ADT,M3.2.0,M11.1.0" />
@@ -991,29 +980,29 @@
 	<zone name="posix/Atlantic/South_Georgia" value="GST2" />
 	<zone name="posix/Atlantic/St_Helena" value="GMT0" />
 	<zone name="posix/Atlantic/Stanley" value="FKST3" />
-	<zone name="posix/Australia/ACT" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="posix/Australia/Adelaide" value="CST-9:30CST,M10.1.0,M4.1.0/3" />
-	<zone name="posix/Australia/Brisbane" value="EST-10" />
-	<zone name="posix/Australia/Broken_Hill" value="CST-9:30CST,M10.1.0,M4.1.0/3" />
-	<zone name="posix/Australia/Canberra" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="posix/Australia/Currie" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="posix/Australia/Darwin" value="CST-9:30" />
-	<zone name="posix/Australia/Eucla" value="CWST-8:45" />
-	<zone name="posix/Australia/Hobart" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="posix/Australia/LHI" value="LHST-10:30LHST-11,M10.1.0,M4.1.0" />
-	<zone name="posix/Australia/Lindeman" value="EST-10" />
-	<zone name="posix/Australia/Lord_Howe" value="LHST-10:30LHST-11,M10.1.0,M4.1.0" />
-	<zone name="posix/Australia/Melbourne" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="posix/Australia/NSW" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="posix/Australia/North" value="CST-9:30" />
-	<zone name="posix/Australia/Perth" value="WST-8" />
-	<zone name="posix/Australia/Queensland" value="EST-10" />
-	<zone name="posix/Australia/South" value="CST-9:30CST,M10.1.0,M4.1.0/3" />
-	<zone name="posix/Australia/Sydney" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="posix/Australia/Tasmania" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="posix/Australia/Victoria" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="posix/Australia/West" value="WST-8" />
-	<zone name="posix/Australia/Yancowinna" value="CST-9:30CST,M10.1.0,M4.1.0/3" />
+	<zone name="posix/Australia/ACT" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="posix/Australia/Adelaide" value="ACST-9:30ACDT,M10.1.0,M4.1.0/3" />
+	<zone name="posix/Australia/Brisbane" value="AEST-10" />
+	<zone name="posix/Australia/Broken_Hill" value="ACST-9:30ACDT,M10.1.0,M4.1.0/3" />
+	<zone name="posix/Australia/Canberra" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="posix/Australia/Currie" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="posix/Australia/Darwin" value="ACST-9:30" />
+	<zone name="posix/Australia/Eucla" value="ACWST-8:45" />
+	<zone name="posix/Australia/Hobart" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="posix/Australia/LHI" value="LHST-10:30LHDT-11,M10.1.0,M4.1.0" />
+	<zone name="posix/Australia/Lindeman" value="AEST-10" />
+	<zone name="posix/Australia/Lord_Howe" value="LHST-10:30LHDT-11,M10.1.0,M4.1.0" />
+	<zone name="posix/Australia/Melbourne" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="posix/Australia/NSW" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="posix/Australia/North" value="ACST-9:30" />
+	<zone name="posix/Australia/Perth" value="AWST-8" />
+	<zone name="posix/Australia/Queensland" value="AEST-10" />
+	<zone name="posix/Australia/South" value="ACST-9:30ACDT,M10.1.0,M4.1.0/3" />
+	<zone name="posix/Australia/Sydney" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="posix/Australia/Tasmania" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="posix/Australia/Victoria" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="posix/Australia/West" value="AWST-8" />
+	<zone name="posix/Australia/Yancowinna" value="ACST-9:30ACDT,M10.1.0,M4.1.0/3" />
 	<zone name="posix/Brazil/Acre" value="ACT5" />
 	<zone name="posix/Brazil/DeNoronha" value="FNT2" />
 	<zone name="posix/Brazil/East" value="BRT3BRST,M10.3.0/0,M2.3.0/0" />
@@ -1029,43 +1018,43 @@
 	<zone name="posix/Canada/Pacific" value="PST8PDT,M3.2.0,M11.1.0" />
 	<zone name="posix/Canada/Saskatchewan" value="CST6" />
 	<zone name="posix/Canada/Yukon" value="PST8PDT,M3.2.0,M11.1.0" />
-	<zone name="posix/Chile/Continental" value="CLST" />
-	<zone name="posix/Chile/EasterIsland" value="EASST" />
+	<zone name="posix/Chile/Continental" value="CLT3" />
+	<zone name="posix/Chile/EasterIsland" value="EAST5" />
 	<zone name="posix/Cuba" value="CST5CDT,M3.2.0/0,M11.1.0/1" />
 	<zone name="posix/EET" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="posix/EST" value="EST5" />
 	<zone name="posix/EST5EDT" value="EST5EDT,M3.2.0,M11.1.0" />
-	<zone name="posix/Egypt" value="EEST" />
+	<zone name="posix/Egypt" value="EET-2" />
 	<zone name="posix/Eire" value="GMT0IST,M3.5.0/1,M10.5.0" />
 	<zone name="posix/Etc/GMT" value="GMT0" />
 	<zone name="posix/Etc/GMT+0" value="GMT0" />
-	<zone name="posix/Etc/GMT+1" value="<GMT+1>1" />
-	<zone name="posix/Etc/GMT+10" value="<GMT+10>10" />
-	<zone name="posix/Etc/GMT+11" value="<GMT+11>11" />
-	<zone name="posix/Etc/GMT+12" value="<GMT+12>12" />
-	<zone name="posix/Etc/GMT+2" value="<GMT+2>2" />
-	<zone name="posix/Etc/GMT+3" value="<GMT+3>3" />
-	<zone name="posix/Etc/GMT+4" value="<GMT+4>4" />
-	<zone name="posix/Etc/GMT+5" value="<GMT+5>5" />
-	<zone name="posix/Etc/GMT+6" value="<GMT+6>6" />
-	<zone name="posix/Etc/GMT+7" value="<GMT+7>7" />
-	<zone name="posix/Etc/GMT+8" value="<GMT+8>8" />
-	<zone name="posix/Etc/GMT+9" value="<GMT+9>9" />
+	<zone name="posix/Etc/GMT+1" value="&lt;GMT+1&gt;1" />
+	<zone name="posix/Etc/GMT+10" value="&lt;GMT+10&gt;10" />
+	<zone name="posix/Etc/GMT+11" value="&lt;GMT+11&gt;11" />
+	<zone name="posix/Etc/GMT+12" value="&lt;GMT+12&gt;12" />
+	<zone name="posix/Etc/GMT+2" value="&lt;GMT+2&gt;2" />
+	<zone name="posix/Etc/GMT+3" value="&lt;GMT+3&gt;3" />
+	<zone name="posix/Etc/GMT+4" value="&lt;GMT+4&gt;4" />
+	<zone name="posix/Etc/GMT+5" value="&lt;GMT+5&gt;5" />
+	<zone name="posix/Etc/GMT+6" value="&lt;GMT+6&gt;6" />
+	<zone name="posix/Etc/GMT+7" value="&lt;GMT+7&gt;7" />
+	<zone name="posix/Etc/GMT+8" value="&lt;GMT+8&gt;8" />
+	<zone name="posix/Etc/GMT+9" value="&lt;GMT+9&gt;9" />
 	<zone name="posix/Etc/GMT-0" value="GMT0" />
-	<zone name="posix/Etc/GMT-1" value="<GMT-1>-1" />
-	<zone name="posix/Etc/GMT-10" value="<GMT-10>-10" />
-	<zone name="posix/Etc/GMT-11" value="<GMT-11>-11" />
-	<zone name="posix/Etc/GMT-12" value="<GMT-12>-12" />
-	<zone name="posix/Etc/GMT-13" value="<GMT-13>-13" />
-	<zone name="posix/Etc/GMT-14" value="<GMT-14>-14" />
-	<zone name="posix/Etc/GMT-2" value="<GMT-2>-2" />
-	<zone name="posix/Etc/GMT-3" value="<GMT-3>-3" />
-	<zone name="posix/Etc/GMT-4" value="<GMT-4>-4" />
-	<zone name="posix/Etc/GMT-5" value="<GMT-5>-5" />
-	<zone name="posix/Etc/GMT-6" value="<GMT-6>-6" />
-	<zone name="posix/Etc/GMT-7" value="<GMT-7>-7" />
-	<zone name="posix/Etc/GMT-8" value="<GMT-8>-8" />
-	<zone name="posix/Etc/GMT-9" value="<GMT-9>-9" />
+	<zone name="posix/Etc/GMT-1" value="&lt;GMT-1&gt;-1" />
+	<zone name="posix/Etc/GMT-10" value="&lt;GMT-10&gt;-10" />
+	<zone name="posix/Etc/GMT-11" value="&lt;GMT-11&gt;-11" />
+	<zone name="posix/Etc/GMT-12" value="&lt;GMT-12&gt;-12" />
+	<zone name="posix/Etc/GMT-13" value="&lt;GMT-13&gt;-13" />
+	<zone name="posix/Etc/GMT-14" value="&lt;GMT-14&gt;-14" />
+	<zone name="posix/Etc/GMT-2" value="&lt;GMT-2&gt;-2" />
+	<zone name="posix/Etc/GMT-3" value="&lt;GMT-3&gt;-3" />
+	<zone name="posix/Etc/GMT-4" value="&lt;GMT-4&gt;-4" />
+	<zone name="posix/Etc/GMT-5" value="&lt;GMT-5&gt;-5" />
+	<zone name="posix/Etc/GMT-6" value="&lt;GMT-6&gt;-6" />
+	<zone name="posix/Etc/GMT-7" value="&lt;GMT-7&gt;-7" />
+	<zone name="posix/Etc/GMT-8" value="&lt;GMT-8&gt;-8" />
+	<zone name="posix/Etc/GMT-9" value="&lt;GMT-9&gt;-9" />
 	<zone name="posix/Etc/GMT0" value="GMT0" />
 	<zone name="posix/Etc/Greenwich" value="GMT0" />
 	<zone name="posix/Etc/UCT" value="UCT0" />
@@ -1083,7 +1072,7 @@
 	<zone name="posix/Europe/Bucharest" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="posix/Europe/Budapest" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Busingen" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="posix/Europe/Chisinau" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
+	<zone name="posix/Europe/Chisinau" value="EET-2EEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Copenhagen" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Dublin" value="GMT0IST,M3.5.0/1,M10.5.0" />
 	<zone name="posix/Europe/Gibraltar" value="CET-1CEST,M3.5.0,M10.5.0/3" />
@@ -1092,7 +1081,7 @@
 	<zone name="posix/Europe/Isle_of_Man" value="GMT0BST,M3.5.0/1,M10.5.0" />
 	<zone name="posix/Europe/Istanbul" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="posix/Europe/Jersey" value="GMT0BST,M3.5.0/1,M10.5.0" />
-	<zone name="posix/Europe/Kaliningrad" value="FET-3" />
+	<zone name="posix/Europe/Kaliningrad" value="EET-2" />
 	<zone name="posix/Europe/Kiev" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="posix/Europe/Lisbon" value="WET0WEST,M3.5.0/1,M10.5.0" />
 	<zone name="posix/Europe/Ljubljana" value="CET-1CEST,M3.5.0,M10.5.0/3" />
@@ -1101,9 +1090,9 @@
 	<zone name="posix/Europe/Madrid" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Malta" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Mariehamn" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
-	<zone name="posix/Europe/Minsk" value="FET-3" />
+	<zone name="posix/Europe/Minsk" value="MSK-3" />
 	<zone name="posix/Europe/Monaco" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="posix/Europe/Moscow" value="MSK-4" />
+	<zone name="posix/Europe/Moscow" value="MSK-3" />
 	<zone name="posix/Europe/Nicosia" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="posix/Europe/Oslo" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Paris" value="CET-1CEST,M3.5.0,M10.5.0/3" />
@@ -1114,24 +1103,23 @@
 	<zone name="posix/Europe/Samara" value="SAMT-4" />
 	<zone name="posix/Europe/San_Marino" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Sarajevo" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="posix/Europe/Simferopol" value="MSK-4" />
+	<zone name="posix/Europe/Simferopol" value="MSK-3" />
 	<zone name="posix/Europe/Skopje" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Sofia" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="posix/Europe/Stockholm" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Tallinn" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="posix/Europe/Tirane" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="posix/Europe/Tiraspol" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
+	<zone name="posix/Europe/Tiraspol" value="EET-2EEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Uzhgorod" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="posix/Europe/Vaduz" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Vatican" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Vienna" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Vilnius" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
-	<zone name="posix/Europe/Volgograd" value="VOLT-4" />
+	<zone name="posix/Europe/Volgograd" value="MSK-3" />
 	<zone name="posix/Europe/Warsaw" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Zagreb" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="posix/Europe/Zaporozhye" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="posix/Europe/Zurich" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="posix/Factory" value="<Local time zone must be set--see zic manual page>0" />
 	<zone name="posix/GB" value="GMT0BST,M3.5.0/1,M10.5.0" />
 	<zone name="posix/GB-Eire" value="GMT0BST,M3.5.0/1,M10.5.0" />
 	<zone name="posix/GMT" value="GMT0" />
@@ -1170,15 +1158,16 @@
 	<zone name="posix/Navajo" value="MST7MDT,M3.2.0,M11.1.0" />
 	<zone name="posix/PRC" value="CST-8" />
 	<zone name="posix/PST8PDT" value="PST8PDT,M3.2.0,M11.1.0" />
-	<zone name="posix/Pacific/Apia" value="WST-13WSDT,M9.5.0/3,M4.1.0/4" />
+	<zone name="posix/Pacific/Apia" value="WSST-13WSDT,M9.5.0/3,M4.1.0/4" />
 	<zone name="posix/Pacific/Auckland" value="NZST-12NZDT,M9.5.0,M4.1.0/3" />
+	<zone name="posix/Pacific/Bougainville" value="BST-11" />
 	<zone name="posix/Pacific/Chatham" value="CHAST-12:45CHADT,M9.5.0/2:45,M4.1.0/3:45" />
 	<zone name="posix/Pacific/Chuuk" value="CHUT-10" />
-	<zone name="posix/Pacific/Easter" value="EASST" />
+	<zone name="posix/Pacific/Easter" value="EAST5" />
 	<zone name="posix/Pacific/Efate" value="VUT-11" />
 	<zone name="posix/Pacific/Enderbury" value="PHOT-13" />
 	<zone name="posix/Pacific/Fakaofo" value="TKT-13" />
-	<zone name="posix/Pacific/Fiji" value="FJST" />
+	<zone name="posix/Pacific/Fiji" value="FJT-12FJST,M11.1.0,M1.3.0/3" />
 	<zone name="posix/Pacific/Funafuti" value="TVT-12" />
 	<zone name="posix/Pacific/Galapagos" value="GALT6" />
 	<zone name="posix/Pacific/Gambier" value="GAMT9" />
@@ -1194,7 +1183,7 @@
 	<zone name="posix/Pacific/Midway" value="SST11" />
 	<zone name="posix/Pacific/Nauru" value="NRT-12" />
 	<zone name="posix/Pacific/Niue" value="NUT11" />
-	<zone name="posix/Pacific/Norfolk" value="NFT-11:30" />
+	<zone name="posix/Pacific/Norfolk" value="NFT-11" />
 	<zone name="posix/Pacific/Noumea" value="NCT-11" />
 	<zone name="posix/Pacific/Pago_Pago" value="SST11" />
 	<zone name="posix/Pacific/Palau" value="PWT-9" />
@@ -1217,23 +1206,10 @@
 	<zone name="posix/ROC" value="CST-8" />
 	<zone name="posix/ROK" value="KST-9" />
 	<zone name="posix/Singapore" value="SGT-8" />
-	<zone name="posix/SystemV/AST4" value="AST4" />
-	<zone name="posix/SystemV/AST4ADT" value="AST4ADT,M3.2.0,M11.1.0" />
-	<zone name="posix/SystemV/CST6" value="CST6" />
-	<zone name="posix/SystemV/CST6CDT" value="CST6CDT,M3.2.0,M11.1.0" />
-	<zone name="posix/SystemV/EST5" value="EST5" />
-	<zone name="posix/SystemV/EST5EDT" value="EST5EDT,M3.2.0,M11.1.0" />
-	<zone name="posix/SystemV/HST10" value="HST10" />
-	<zone name="posix/SystemV/MST7" value="MST7" />
-	<zone name="posix/SystemV/MST7MDT" value="MST7MDT,M3.2.0,M11.1.0" />
-	<zone name="posix/SystemV/PST8" value="PST8" />
-	<zone name="posix/SystemV/PST8PDT" value="PST8PDT,M3.2.0,M11.1.0" />
-	<zone name="posix/SystemV/YST9" value="GAMT9" />
-	<zone name="posix/SystemV/YST9YDT" value="AKST9AKDT,M3.2.0,M11.1.0" />
 	<zone name="posix/Turkey" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="posix/UCT" value="UCT0" />
 	<zone name="posix/US/Alaska" value="AKST9AKDT,M3.2.0,M11.1.0" />
-	<zone name="posix/US/Aleutian" value="HAST10HADT,M3.2.0,M11.1.0" />
+	<zone name="posix/US/Aleutian" value="HST10HDT,M3.2.0,M11.1.0" />
 	<zone name="posix/US/Arizona" value="MST7" />
 	<zone name="posix/US/Central" value="CST6CDT,M3.2.0,M11.1.0" />
 	<zone name="posix/US/East-Indiana" value="EST5EDT,M3.2.0,M11.1.0" />
@@ -1247,7 +1223,7 @@
 	<zone name="posix/US/Samoa" value="SST11" />
 	<zone name="posix/UTC" value="UTC0" />
 	<zone name="posix/Universal" value="UTC0" />
-	<zone name="posix/W-SU" value="MSK-4" />
+	<zone name="posix/W-SU" value="MSK-3" />
 	<zone name="posix/WET" value="WET0WEST,M3.5.0/1,M10.5.0" />
 	<zone name="posix/Zulu" value="UTC0" />
 
@@ -1266,7 +1242,7 @@
 	<zone name="right/Africa/Blantyre" value="CAT-2" />
 	<zone name="right/Africa/Brazzaville" value="WAT-1" />
 	<zone name="right/Africa/Bujumbura" value="CAT-2" />
-	<zone name="right/Africa/Cairo" value="EEST" />
+	<zone name="right/Africa/Cairo" value="EET-2" />
 	<zone name="right/Africa/Casablanca" value="WET0WEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Africa/Ceuta" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Africa/Conakry" value="GMT0" />
@@ -1307,7 +1283,7 @@
 	<zone name="right/Africa/Tripoli" value="EET-2" />
 	<zone name="right/Africa/Tunis" value="CET-1" />
 	<zone name="right/Africa/Windhoek" value="WAT-1WAST,M9.1.0,M4.1.0" />
-	<zone name="right/America/Adak" value="HAST10HADT,M3.2.0,M11.1.0" />
+	<zone name="right/America/Adak" value="HST10HDT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Anchorage" value="AKST9AKDT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Anguilla" value="AST4" />
 	<zone name="right/America/Antigua" value="AST4" />
@@ -1328,7 +1304,7 @@
 	<zone name="right/America/Aruba" value="AST4" />
 	<zone name="right/America/Asuncion" value="PYT4PYST,M10.1.0/0,M3.4.0/0" />
 	<zone name="right/America/Atikokan" value="EST5" />
-	<zone name="right/America/Atka" value="HAST10HADT,M3.2.0,M11.1.0" />
+	<zone name="right/America/Atka" value="HST10HDT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Bahia" value="BRT3" />
 	<zone name="right/America/Bahia_Banderas" value="CST6CDT,M4.1.0,M10.5.0" />
 	<zone name="right/America/Barbados" value="AST4" />
@@ -1341,11 +1317,11 @@
 	<zone name="right/America/Buenos_Aires" value="ART3" />
 	<zone name="right/America/Cambridge_Bay" value="MST7MDT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Campo_Grande" value="AMT4AMST,M10.3.0/0,M2.3.0/0" />
-	<zone name="right/America/Cancun" value="CST6CDT,M4.1.0,M10.5.0" />
+	<zone name="right/America/Cancun" value="EST5" />
 	<zone name="right/America/Caracas" value="VET4:30" />
 	<zone name="right/America/Catamarca" value="ART3" />
 	<zone name="right/America/Cayenne" value="GFT3" />
-	<zone name="right/America/Cayman" value="EST5" />
+	<zone name="right/America/Cayman" value="EST5EDT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Chicago" value="CST6CDT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Chihuahua" value="MST7MDT,M4.1.0,M10.5.0" />
 	<zone name="right/America/Coral_Harbour" value="EST5" />
@@ -1364,12 +1340,13 @@
 	<zone name="right/America/Eirunepe" value="ACT5" />
 	<zone name="right/America/El_Salvador" value="CST6" />
 	<zone name="right/America/Ensenada" value="PST8PDT,M3.2.0,M11.1.0" />
+	<zone name="right/America/Fort_Nelson" value="MST7" />
 	<zone name="right/America/Fort_Wayne" value="EST5EDT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Fortaleza" value="BRT3" />
 	<zone name="right/America/Glace_Bay" value="AST4ADT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Godthab" value="WGST" />
 	<zone name="right/America/Goose_Bay" value="AST4ADT,M3.2.0,M11.1.0" />
-	<zone name="right/America/Grand_Turk" value="EST5EDT,M3.2.0,M11.1.0" />
+	<zone name="right/America/Grand_Turk" value="AST4" />
 	<zone name="right/America/Grenada" value="AST4" />
 	<zone name="right/America/Guadeloupe" value="AST4" />
 	<zone name="right/America/Guatemala" value="CST6" />
@@ -1411,12 +1388,12 @@
 	<zone name="right/America/Mendoza" value="ART3" />
 	<zone name="right/America/Menominee" value="CST6CDT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Merida" value="CST6CDT,M4.1.0,M10.5.0" />
-	<zone name="right/America/Metlakatla" value="MeST8" />
+	<zone name="right/America/Metlakatla" value="PST8" />
 	<zone name="right/America/Mexico_City" value="CST6CDT,M4.1.0,M10.5.0" />
 	<zone name="right/America/Miquelon" value="PMST3PMDT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Moncton" value="AST4ADT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Monterrey" value="CST6CDT,M4.1.0,M10.5.0" />
-	<zone name="right/America/Montevideo" value="UYT3UYST,M10.1.0,M3.2.0" />
+	<zone name="right/America/Montevideo" value="UYT3" />
 	<zone name="right/America/Montreal" value="EST5EDT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Montserrat" value="AST4" />
 	<zone name="right/America/Nassau" value="EST5EDT,M3.2.0,M11.1.0" />
@@ -1446,7 +1423,7 @@
 	<zone name="right/America/Rosario" value="ART3" />
 	<zone name="right/America/Santa_Isabel" value="PST8PDT,M4.1.0,M10.5.0" />
 	<zone name="right/America/Santarem" value="BRT3" />
-	<zone name="right/America/Santiago" value="CLST" />
+	<zone name="right/America/Santiago" value="CLT3" />
 	<zone name="right/America/Santo_Domingo" value="AST4" />
 	<zone name="right/America/Sao_Paulo" value="BRT3BRST,M10.3.0/0,M2.3.0/0" />
 	<zone name="right/America/Scoresbysund" value="EGT1EGST,M3.5.0/0,M10.5.0/1" />
@@ -1471,13 +1448,13 @@
 	<zone name="right/America/Winnipeg" value="CST6CDT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Yakutat" value="AKST9AKDT,M3.2.0,M11.1.0" />
 	<zone name="right/America/Yellowknife" value="MST7MDT,M3.2.0,M11.1.0" />
-	<zone name="right/Antarctica/Casey" value="WST-8" />
+	<zone name="right/Antarctica/Casey" value="AWST-8" />
 	<zone name="right/Antarctica/Davis" value="DAVT-7" />
 	<zone name="right/Antarctica/DumontDUrville" value="DDUT-10" />
 	<zone name="right/Antarctica/Macquarie" value="MIST-11" />
 	<zone name="right/Antarctica/Mawson" value="MAWT-5" />
 	<zone name="right/Antarctica/McMurdo" value="NZST-12NZDT,M9.5.0,M4.1.0/3" />
-	<zone name="right/Antarctica/Palmer" value="CLST" />
+	<zone name="right/Antarctica/Palmer" value="CLT3" />
 	<zone name="right/Antarctica/Rothera" value="ROTT3" />
 	<zone name="right/Antarctica/South_Pole" value="NZST-12NZDT,M9.5.0,M4.1.0/3" />
 	<zone name="right/Antarctica/Syowa" value="SYOT-3" />
@@ -1500,7 +1477,8 @@
 	<zone name="right/Asia/Bishkek" value="KGT-6" />
 	<zone name="right/Asia/Brunei" value="BNT-8" />
 	<zone name="right/Asia/Calcutta" value="IST-5:30" />
-	<zone name="right/Asia/Choibalsan" value="CHOT-8" />
+	<zone name="right/Asia/Chita" value="IRKT-8" />
+	<zone name="right/Asia/Choibalsan" value="CHOT-8CHOST,M3.5.6,M9.5.6/0" />
 	<zone name="right/Asia/Chongqing" value="CST-8" />
 	<zone name="right/Asia/Chungking" value="CST-8" />
 	<zone name="right/Asia/Colombo" value="IST-5:30" />
@@ -1515,8 +1493,8 @@
 	<zone name="right/Asia/Hebron" value="EEST" />
 	<zone name="right/Asia/Ho_Chi_Minh" value="ICT-7" />
 	<zone name="right/Asia/Hong_Kong" value="HKT-8" />
-	<zone name="right/Asia/Hovd" value="HOVT-7" />
-	<zone name="right/Asia/Irkutsk" value="IRKT-9" />
+	<zone name="right/Asia/Hovd" value="HOVT-7HOVST,M3.5.6,M9.5.6/0" />
+	<zone name="right/Asia/Irkutsk" value="IRKT-8" />
 	<zone name="right/Asia/Istanbul" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="right/Asia/Jakarta" value="WIB-7" />
 	<zone name="right/Asia/Jayapura" value="WIT-9" />
@@ -1524,39 +1502,40 @@
 	<zone name="right/Asia/Kabul" value="AFT-4:30" />
 	<zone name="right/Asia/Kamchatka" value="PETT-12" />
 	<zone name="right/Asia/Karachi" value="PKT-5" />
-	<zone name="right/Asia/Kashgar" value="CST-8" />
+	<zone name="right/Asia/Kashgar" value="XJT-6" />
 	<zone name="right/Asia/Kathmandu" value="NPT-5:45" />
 	<zone name="right/Asia/Katmandu" value="NPT-5:45" />
-	<zone name="right/Asia/Khandyga" value="YAKT-10" />
+	<zone name="right/Asia/Khandyga" value="YAKT-9" />
 	<zone name="right/Asia/Kolkata" value="IST-5:30" />
-	<zone name="right/Asia/Krasnoyarsk" value="KRAT-8" />
+	<zone name="right/Asia/Krasnoyarsk" value="KRAT-7" />
 	<zone name="right/Asia/Kuala_Lumpur" value="MYT-8" />
 	<zone name="right/Asia/Kuching" value="MYT-8" />
 	<zone name="right/Asia/Kuwait" value="AST-3" />
 	<zone name="right/Asia/Macao" value="CST-8" />
 	<zone name="right/Asia/Macau" value="CST-8" />
-	<zone name="right/Asia/Magadan" value="MAGT-12" />
+	<zone name="right/Asia/Magadan" value="MAGT-10" />
 	<zone name="right/Asia/Makassar" value="WITA-8" />
 	<zone name="right/Asia/Manila" value="PHT-8" />
 	<zone name="right/Asia/Muscat" value="GST-4" />
 	<zone name="right/Asia/Nicosia" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
-	<zone name="right/Asia/Novokuznetsk" value="NOVT-7" />
-	<zone name="right/Asia/Novosibirsk" value="NOVT-7" />
-	<zone name="right/Asia/Omsk" value="OMST-7" />
+	<zone name="right/Asia/Novokuznetsk" value="KRAT-7" />
+	<zone name="right/Asia/Novosibirsk" value="NOVT-6" />
+	<zone name="right/Asia/Omsk" value="OMST-6" />
 	<zone name="right/Asia/Oral" value="ORAT-5" />
 	<zone name="right/Asia/Phnom_Penh" value="ICT-7" />
 	<zone name="right/Asia/Pontianak" value="WIB-7" />
-	<zone name="right/Asia/Pyongyang" value="KST-9" />
+	<zone name="right/Asia/Pyongyang" value="KST-8:30" />
 	<zone name="right/Asia/Qatar" value="AST-3" />
 	<zone name="right/Asia/Qyzylorda" value="QYZT-6" />
 	<zone name="right/Asia/Rangoon" value="MMT-6:30" />
 	<zone name="right/Asia/Riyadh" value="AST-3" />
 	<zone name="right/Asia/Saigon" value="ICT-7" />
-	<zone name="right/Asia/Sakhalin" value="SAKT-11" />
+	<zone name="right/Asia/Sakhalin" value="SAKT-10" />
 	<zone name="right/Asia/Samarkand" value="UZT-5" />
 	<zone name="right/Asia/Seoul" value="KST-9" />
 	<zone name="right/Asia/Shanghai" value="CST-8" />
 	<zone name="right/Asia/Singapore" value="SGT-8" />
+	<zone name="right/Asia/Srednekolymsk" value="SRET-11" />
 	<zone name="right/Asia/Taipei" value="CST-8" />
 	<zone name="right/Asia/Tashkent" value="UZT-5" />
 	<zone name="right/Asia/Tbilisi" value="GET-4" />
@@ -1566,14 +1545,14 @@
 	<zone name="right/Asia/Thimphu" value="BTT-6" />
 	<zone name="right/Asia/Tokyo" value="JST-9" />
 	<zone name="right/Asia/Ujung_Pandang" value="WITA-8" />
-	<zone name="right/Asia/Ulaanbaatar" value="ULAT-8" />
-	<zone name="right/Asia/Ulan_Bator" value="ULAT-8" />
-	<zone name="right/Asia/Urumqi" value="CST-8" />
-	<zone name="right/Asia/Ust-Nera" value="VLAT-11" />
+	<zone name="right/Asia/Ulaanbaatar" value="ULAT-8ULAST,M3.5.6,M9.5.6/0" />
+	<zone name="right/Asia/Ulan_Bator" value="ULAT-8ULAST,M3.5.6,M9.5.6/0" />
+	<zone name="right/Asia/Urumqi" value="XJT-6" />
+	<zone name="right/Asia/Ust-Nera" value="VLAT-10" />
 	<zone name="right/Asia/Vientiane" value="ICT-7" />
-	<zone name="right/Asia/Vladivostok" value="VLAT-11" />
-	<zone name="right/Asia/Yakutsk" value="YAKT-10" />
-	<zone name="right/Asia/Yekaterinburg" value="YEKT-6" />
+	<zone name="right/Asia/Vladivostok" value="VLAT-10" />
+	<zone name="right/Asia/Yakutsk" value="YAKT-9" />
+	<zone name="right/Asia/Yekaterinburg" value="YEKT-5" />
 	<zone name="right/Asia/Yerevan" value="AMT-4" />
 	<zone name="right/Atlantic/Azores" value="AZOT1AZOST,M3.5.0/0,M10.5.0/1" />
 	<zone name="right/Atlantic/Bermuda" value="AST4ADT,M3.2.0,M11.1.0" />
@@ -1587,29 +1566,29 @@
 	<zone name="right/Atlantic/South_Georgia" value="GST2" />
 	<zone name="right/Atlantic/St_Helena" value="GMT0" />
 	<zone name="right/Atlantic/Stanley" value="FKST3" />
-	<zone name="right/Australia/ACT" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="right/Australia/Adelaide" value="CST-9:30CST,M10.1.0,M4.1.0/3" />
-	<zone name="right/Australia/Brisbane" value="EST-10" />
-	<zone name="right/Australia/Broken_Hill" value="CST-9:30CST,M10.1.0,M4.1.0/3" />
-	<zone name="right/Australia/Canberra" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="right/Australia/Currie" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="right/Australia/Darwin" value="CST-9:30" />
-	<zone name="right/Australia/Eucla" value="CWST-8:45" />
-	<zone name="right/Australia/Hobart" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="right/Australia/LHI" value="LHST-10:30LHST-11,M10.1.0,M4.1.0" />
-	<zone name="right/Australia/Lindeman" value="EST-10" />
-	<zone name="right/Australia/Lord_Howe" value="LHST-10:30LHST-11,M10.1.0,M4.1.0" />
-	<zone name="right/Australia/Melbourne" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="right/Australia/NSW" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="right/Australia/North" value="CST-9:30" />
-	<zone name="right/Australia/Perth" value="WST-8" />
-	<zone name="right/Australia/Queensland" value="EST-10" />
-	<zone name="right/Australia/South" value="CST-9:30CST,M10.1.0,M4.1.0/3" />
-	<zone name="right/Australia/Sydney" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="right/Australia/Tasmania" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="right/Australia/Victoria" value="EST-10EST,M10.1.0,M4.1.0/3" />
-	<zone name="right/Australia/West" value="WST-8" />
-	<zone name="right/Australia/Yancowinna" value="CST-9:30CST,M10.1.0,M4.1.0/3" />
+	<zone name="right/Australia/ACT" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="right/Australia/Adelaide" value="ACST-9:30ACDT,M10.1.0,M4.1.0/3" />
+	<zone name="right/Australia/Brisbane" value="AEST-10" />
+	<zone name="right/Australia/Broken_Hill" value="ACST-9:30ACDT,M10.1.0,M4.1.0/3" />
+	<zone name="right/Australia/Canberra" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="right/Australia/Currie" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="right/Australia/Darwin" value="ACST-9:30" />
+	<zone name="right/Australia/Eucla" value="ACWST-8:45" />
+	<zone name="right/Australia/Hobart" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="right/Australia/LHI" value="LHST-10:30LHDT-11,M10.1.0,M4.1.0" />
+	<zone name="right/Australia/Lindeman" value="AEST-10" />
+	<zone name="right/Australia/Lord_Howe" value="LHST-10:30LHDT-11,M10.1.0,M4.1.0" />
+	<zone name="right/Australia/Melbourne" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="right/Australia/NSW" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="right/Australia/North" value="ACST-9:30" />
+	<zone name="right/Australia/Perth" value="AWST-8" />
+	<zone name="right/Australia/Queensland" value="AEST-10" />
+	<zone name="right/Australia/South" value="ACST-9:30ACDT,M10.1.0,M4.1.0/3" />
+	<zone name="right/Australia/Sydney" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="right/Australia/Tasmania" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="right/Australia/Victoria" value="AEST-10AEDT,M10.1.0,M4.1.0/3" />
+	<zone name="right/Australia/West" value="AWST-8" />
+	<zone name="right/Australia/Yancowinna" value="ACST-9:30ACDT,M10.1.0,M4.1.0/3" />
 	<zone name="right/Brazil/Acre" value="ACT5" />
 	<zone name="right/Brazil/DeNoronha" value="FNT2" />
 	<zone name="right/Brazil/East" value="BRT3BRST,M10.3.0/0,M2.3.0/0" />
@@ -1625,43 +1604,43 @@
 	<zone name="right/Canada/Pacific" value="PST8PDT,M3.2.0,M11.1.0" />
 	<zone name="right/Canada/Saskatchewan" value="CST6" />
 	<zone name="right/Canada/Yukon" value="PST8PDT,M3.2.0,M11.1.0" />
-	<zone name="right/Chile/Continental" value="CLST" />
-	<zone name="right/Chile/EasterIsland" value="EASST" />
+	<zone name="right/Chile/Continental" value="CLT3" />
+	<zone name="right/Chile/EasterIsland" value="EAST5" />
 	<zone name="right/Cuba" value="CST5CDT,M3.2.0/0,M11.1.0/1" />
 	<zone name="right/EET" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="right/EST" value="EST5" />
 	<zone name="right/EST5EDT" value="EST5EDT,M3.2.0,M11.1.0" />
-	<zone name="right/Egypt" value="EEST" />
+	<zone name="right/Egypt" value="EET-2" />
 	<zone name="right/Eire" value="GMT0IST,M3.5.0/1,M10.5.0" />
 	<zone name="right/Etc/GMT" value="GMT0" />
 	<zone name="right/Etc/GMT+0" value="GMT0" />
-	<zone name="right/Etc/GMT+1" value="<GMT+1>1" />
-	<zone name="right/Etc/GMT+10" value="<GMT+10>10" />
-	<zone name="right/Etc/GMT+11" value="<GMT+11>11" />
-	<zone name="right/Etc/GMT+12" value="<GMT+12>12" />
-	<zone name="right/Etc/GMT+2" value="<GMT+2>2" />
-	<zone name="right/Etc/GMT+3" value="<GMT+3>3" />
-	<zone name="right/Etc/GMT+4" value="<GMT+4>4" />
-	<zone name="right/Etc/GMT+5" value="<GMT+5>5" />
-	<zone name="right/Etc/GMT+6" value="<GMT+6>6" />
-	<zone name="right/Etc/GMT+7" value="<GMT+7>7" />
-	<zone name="right/Etc/GMT+8" value="<GMT+8>8" />
-	<zone name="right/Etc/GMT+9" value="<GMT+9>9" />
+	<zone name="right/Etc/GMT+1" value="&lt;GMT+1&gt;1" />
+	<zone name="right/Etc/GMT+10" value="&lt;GMT+10&gt;10" />
+	<zone name="right/Etc/GMT+11" value="&lt;GMT+11&gt;11" />
+	<zone name="right/Etc/GMT+12" value="&lt;GMT+12&gt;12" />
+	<zone name="right/Etc/GMT+2" value="&lt;GMT+2&gt;2" />
+	<zone name="right/Etc/GMT+3" value="&lt;GMT+3&gt;3" />
+	<zone name="right/Etc/GMT+4" value="&lt;GMT+4&gt;4" />
+	<zone name="right/Etc/GMT+5" value="&lt;GMT+5&gt;5" />
+	<zone name="right/Etc/GMT+6" value="&lt;GMT+6&gt;6" />
+	<zone name="right/Etc/GMT+7" value="&lt;GMT+7&gt;7" />
+	<zone name="right/Etc/GMT+8" value="&lt;GMT+8&gt;8" />
+	<zone name="right/Etc/GMT+9" value="&lt;GMT+9&gt;9" />
 	<zone name="right/Etc/GMT-0" value="GMT0" />
-	<zone name="right/Etc/GMT-1" value="<GMT-1>-1" />
-	<zone name="right/Etc/GMT-10" value="<GMT-10>-10" />
-	<zone name="right/Etc/GMT-11" value="<GMT-11>-11" />
-	<zone name="right/Etc/GMT-12" value="<GMT-12>-12" />
-	<zone name="right/Etc/GMT-13" value="<GMT-13>-13" />
-	<zone name="right/Etc/GMT-14" value="<GMT-14>-14" />
-	<zone name="right/Etc/GMT-2" value="<GMT-2>-2" />
-	<zone name="right/Etc/GMT-3" value="<GMT-3>-3" />
-	<zone name="right/Etc/GMT-4" value="<GMT-4>-4" />
-	<zone name="right/Etc/GMT-5" value="<GMT-5>-5" />
-	<zone name="right/Etc/GMT-6" value="<GMT-6>-6" />
-	<zone name="right/Etc/GMT-7" value="<GMT-7>-7" />
-	<zone name="right/Etc/GMT-8" value="<GMT-8>-8" />
-	<zone name="right/Etc/GMT-9" value="<GMT-9>-9" />
+	<zone name="right/Etc/GMT-1" value="&lt;GMT-1&gt;-1" />
+	<zone name="right/Etc/GMT-10" value="&lt;GMT-10&gt;-10" />
+	<zone name="right/Etc/GMT-11" value="&lt;GMT-11&gt;-11" />
+	<zone name="right/Etc/GMT-12" value="&lt;GMT-12&gt;-12" />
+	<zone name="right/Etc/GMT-13" value="&lt;GMT-13&gt;-13" />
+	<zone name="right/Etc/GMT-14" value="&lt;GMT-14&gt;-14" />
+	<zone name="right/Etc/GMT-2" value="&lt;GMT-2&gt;-2" />
+	<zone name="right/Etc/GMT-3" value="&lt;GMT-3&gt;-3" />
+	<zone name="right/Etc/GMT-4" value="&lt;GMT-4&gt;-4" />
+	<zone name="right/Etc/GMT-5" value="&lt;GMT-5&gt;-5" />
+	<zone name="right/Etc/GMT-6" value="&lt;GMT-6&gt;-6" />
+	<zone name="right/Etc/GMT-7" value="&lt;GMT-7&gt;-7" />
+	<zone name="right/Etc/GMT-8" value="&lt;GMT-8&gt;-8" />
+	<zone name="right/Etc/GMT-9" value="&lt;GMT-9&gt;-9" />
 	<zone name="right/Etc/GMT0" value="GMT0" />
 	<zone name="right/Etc/Greenwich" value="GMT0" />
 	<zone name="right/Etc/UCT" value="UCT0" />
@@ -1679,7 +1658,7 @@
 	<zone name="right/Europe/Bucharest" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="right/Europe/Budapest" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Busingen" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="right/Europe/Chisinau" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
+	<zone name="right/Europe/Chisinau" value="EET-2EEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Copenhagen" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Dublin" value="GMT0IST,M3.5.0/1,M10.5.0" />
 	<zone name="right/Europe/Gibraltar" value="CET-1CEST,M3.5.0,M10.5.0/3" />
@@ -1688,7 +1667,7 @@
 	<zone name="right/Europe/Isle_of_Man" value="GMT0BST,M3.5.0/1,M10.5.0" />
 	<zone name="right/Europe/Istanbul" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="right/Europe/Jersey" value="GMT0BST,M3.5.0/1,M10.5.0" />
-	<zone name="right/Europe/Kaliningrad" value="FET-3" />
+	<zone name="right/Europe/Kaliningrad" value="EET-2" />
 	<zone name="right/Europe/Kiev" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="right/Europe/Lisbon" value="WET0WEST,M3.5.0/1,M10.5.0" />
 	<zone name="right/Europe/Ljubljana" value="CET-1CEST,M3.5.0,M10.5.0/3" />
@@ -1697,9 +1676,9 @@
 	<zone name="right/Europe/Madrid" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Malta" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Mariehamn" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
-	<zone name="right/Europe/Minsk" value="FET-3" />
+	<zone name="right/Europe/Minsk" value="MSK-3" />
 	<zone name="right/Europe/Monaco" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="right/Europe/Moscow" value="MSK-4" />
+	<zone name="right/Europe/Moscow" value="MSK-3" />
 	<zone name="right/Europe/Nicosia" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="right/Europe/Oslo" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Paris" value="CET-1CEST,M3.5.0,M10.5.0/3" />
@@ -1710,24 +1689,23 @@
 	<zone name="right/Europe/Samara" value="SAMT-4" />
 	<zone name="right/Europe/San_Marino" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Sarajevo" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="right/Europe/Simferopol" value="MSK-4" />
+	<zone name="right/Europe/Simferopol" value="MSK-3" />
 	<zone name="right/Europe/Skopje" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Sofia" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="right/Europe/Stockholm" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Tallinn" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="right/Europe/Tirane" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="right/Europe/Tiraspol" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
+	<zone name="right/Europe/Tiraspol" value="EET-2EEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Uzhgorod" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="right/Europe/Vaduz" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Vatican" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Vienna" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Vilnius" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
-	<zone name="right/Europe/Volgograd" value="VOLT-4" />
+	<zone name="right/Europe/Volgograd" value="MSK-3" />
 	<zone name="right/Europe/Warsaw" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Zagreb" value="CET-1CEST,M3.5.0,M10.5.0/3" />
 	<zone name="right/Europe/Zaporozhye" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="right/Europe/Zurich" value="CET-1CEST,M3.5.0,M10.5.0/3" />
-	<zone name="right/Factory" value="<Local time zone must be set--see zic manual page>0" />
 	<zone name="right/GB" value="GMT0BST,M3.5.0/1,M10.5.0" />
 	<zone name="right/GB-Eire" value="GMT0BST,M3.5.0/1,M10.5.0" />
 	<zone name="right/GMT" value="GMT0" />
@@ -1766,15 +1744,16 @@
 	<zone name="right/Navajo" value="MST7MDT,M3.2.0,M11.1.0" />
 	<zone name="right/PRC" value="CST-8" />
 	<zone name="right/PST8PDT" value="PST8PDT,M3.2.0,M11.1.0" />
-	<zone name="right/Pacific/Apia" value="WST-13WSDT,M9.5.0/3,M4.1.0/4" />
+	<zone name="right/Pacific/Apia" value="WSST-13WSDT,M9.5.0/3,M4.1.0/4" />
 	<zone name="right/Pacific/Auckland" value="NZST-12NZDT,M9.5.0,M4.1.0/3" />
+	<zone name="right/Pacific/Bougainville" value="BST-11" />
 	<zone name="right/Pacific/Chatham" value="CHAST-12:45CHADT,M9.5.0/2:45,M4.1.0/3:45" />
 	<zone name="right/Pacific/Chuuk" value="CHUT-10" />
-	<zone name="right/Pacific/Easter" value="EASST" />
+	<zone name="right/Pacific/Easter" value="EAST5" />
 	<zone name="right/Pacific/Efate" value="VUT-11" />
 	<zone name="right/Pacific/Enderbury" value="PHOT-13" />
 	<zone name="right/Pacific/Fakaofo" value="TKT-13" />
-	<zone name="right/Pacific/Fiji" value="FJST" />
+	<zone name="right/Pacific/Fiji" value="FJT-12FJST,M11.1.0,M1.3.0/3" />
 	<zone name="right/Pacific/Funafuti" value="TVT-12" />
 	<zone name="right/Pacific/Galapagos" value="GALT6" />
 	<zone name="right/Pacific/Gambier" value="GAMT9" />
@@ -1790,7 +1769,7 @@
 	<zone name="right/Pacific/Midway" value="SST11" />
 	<zone name="right/Pacific/Nauru" value="NRT-12" />
 	<zone name="right/Pacific/Niue" value="NUT11" />
-	<zone name="right/Pacific/Norfolk" value="NFT-11:30" />
+	<zone name="right/Pacific/Norfolk" value="NFT-11" />
 	<zone name="right/Pacific/Noumea" value="NCT-11" />
 	<zone name="right/Pacific/Pago_Pago" value="SST11" />
 	<zone name="right/Pacific/Palau" value="PWT-9" />
@@ -1813,23 +1792,10 @@
 	<zone name="right/ROC" value="CST-8" />
 	<zone name="right/ROK" value="KST-9" />
 	<zone name="right/Singapore" value="SGT-8" />
-	<zone name="right/SystemV/AST4" value="AST4" />
-	<zone name="right/SystemV/AST4ADT" value="AST4ADT,M3.2.0,M11.1.0" />
-	<zone name="right/SystemV/CST6" value="CST6" />
-	<zone name="right/SystemV/CST6CDT" value="CST6CDT,M3.2.0,M11.1.0" />
-	<zone name="right/SystemV/EST5" value="EST5" />
-	<zone name="right/SystemV/EST5EDT" value="EST5EDT,M3.2.0,M11.1.0" />
-	<zone name="right/SystemV/HST10" value="HST10" />
-	<zone name="right/SystemV/MST7" value="MST7" />
-	<zone name="right/SystemV/MST7MDT" value="MST7MDT,M3.2.0,M11.1.0" />
-	<zone name="right/SystemV/PST8" value="PST8" />
-	<zone name="right/SystemV/PST8PDT" value="PST8PDT,M3.2.0,M11.1.0" />
-	<zone name="right/SystemV/YST9" value="GAMT9" />
-	<zone name="right/SystemV/YST9YDT" value="AKST9AKDT,M3.2.0,M11.1.0" />
 	<zone name="right/Turkey" value="EET-2EEST,M3.5.0/3,M10.5.0/4" />
 	<zone name="right/UCT" value="UCT0" />
 	<zone name="right/US/Alaska" value="AKST9AKDT,M3.2.0,M11.1.0" />
-	<zone name="right/US/Aleutian" value="HAST10HADT,M3.2.0,M11.1.0" />
+	<zone name="right/US/Aleutian" value="HST10HDT,M3.2.0,M11.1.0" />
 	<zone name="right/US/Arizona" value="MST7" />
 	<zone name="right/US/Central" value="CST6CDT,M3.2.0,M11.1.0" />
 	<zone name="right/US/East-Indiana" value="EST5EDT,M3.2.0,M11.1.0" />
@@ -1843,7 +1809,7 @@
 	<zone name="right/US/Samoa" value="SST11" />
 	<zone name="right/UTC" value="UTC0" />
 	<zone name="right/Universal" value="UTC0" />
-	<zone name="right/W-SU" value="MSK-4" />
+	<zone name="right/W-SU" value="MSK-3" />
 	<zone name="right/WET" value="WET0WEST,M3.5.0/1,M10.5.0" />
 	<zone name="right/Zulu" value="UTC0" />
     </timezones>
diff --git a/configure.ac b/configure.ac
index 0c371b4899..b0366cbc34 100644
--- a/configure.ac
+++ b/configure.ac
@@ -408,6 +408,7 @@ elif test "x${ax_cv_c_compiler_vendor}" = "xclang" ; then
 elif test "x${ax_cv_c_compiler_vendor}" = "xgnu" ; then
     APR_ADDTO(SWITCH_AM_CFLAGS, -fPIC)
     APR_ADDTO(SWITCH_AM_CXXFLAGS, -fPIC)
+    AC_SUBST([AM_MOD_AVMD_CXXFLAGS], [-std=gnu99])      # FS-8809, needed for MAP_POPULATE
     if test "$ac_cv_gcc_supports_w_no_unused_result" = yes; then
       APR_ADDTO(SWITCH_AM_CFLAGS, -Werror)
     fi
@@ -542,6 +543,16 @@ if test "${enable_debug}" = "yes"; then
 
 fi
 
+AC_ARG_ENABLE(libyuv,
+[AC_HELP_STRING([--disable-libyuv],[build without libyuv])],[enable_libyuv="$enableval"],[enable_libyuv="yes"])
+
+AM_CONDITIONAL([ENABLE_LIBYUV],[test "${enable_libyuv}" = "yes"])
+
+AC_ARG_ENABLE(libvpx,
+[AC_HELP_STRING([--disable-libvpx],[build without libvpx])],[enable_libvpx="$enableval"],[enable_libvpx="yes"])
+
+AM_CONDITIONAL([ENABLE_LIBVPX],[test "${enable_libvpx}" = "yes"])
+
 AC_ARG_ENABLE(cpp,
 [AC_HELP_STRING([--disable-cpp],[build without cpp code])],[enable_cpp="$enableval"],[enable_cpp="yes"])
 
@@ -784,12 +795,6 @@ if test "x$have_libz" = "xyes"  ; then
 APR_ADDTO([PLATFORM_CORE_LIBS], [-lz])
 fi
 
-PKG_CHECK_MODULES([YUV], [libyuv >= 0.0.1280],
-			 [AC_MSG_RESULT([yes]);AM_CONDITIONAL([HAVE_YUV],[true])],
-			 [AC_MSG_RESULT([no]);AM_CONDITIONAL([HAVE_YUV],[false])])
-
-APR_ADDTO([PLATFORM_CORE_LIBS], [${YUV_LIBS}])
-
 PKG_CHECK_MODULES([MPG123], [libmpg123 >= 1.20.1],[
   AM_CONDITIONAL([HAVE_MPG123],[true])],[
   AC_MSG_RESULT([no]); AM_CONDITIONAL([HAVE_MPG123],[false])])
@@ -900,8 +905,6 @@ CPPFLAGS="$save_CPPFLAGS"
 
 AX_HAVE_CPU_SET
 
-AC_CHECK_LIB(vpx, vpx_img_alloc, [AC_DEFINE(HAVE_VPX, 1, [Define if you have vpx()])])
-
 AC_CHECK_LIB(rt, clock_gettime, [AC_DEFINE(HAVE_CLOCK_GETTIME, 1, [Define if you have clock_gettime()])])
 AC_CHECK_LIB(rt, clock_getres, [AC_DEFINE(HAVE_CLOCK_GETRES, 1, [Define if you have clock_getres()])])
 AC_CHECK_LIB(rt, clock_nanosleep, [AC_DEFINE(HAVE_CLOCK_NANOSLEEP, 1, [Define if you have clock_nanosleep()])])
@@ -1273,16 +1276,6 @@ PKG_CHECK_MODULES([SNDFILE], [sndfile >= 1.0.20],[
   AM_CONDITIONAL([HAVE_SNDFILE],[true])],[
   AC_MSG_RESULT([no]); AM_CONDITIONAL([HAVE_SNDFILE],[false])])
 
-PKG_CHECK_MODULES([VPX], [vpx2 >= 1.4.0],[
-  AM_CONDITIONAL([HAVE_VPX],[true])],[
-    PKG_CHECK_MODULES([VPX], [vpx >= 1.4.0],[
-      AM_CONDITIONAL([HAVE_VPX],[true])],[
-      AC_MSG_RESULT([no]); AM_CONDITIONAL([HAVE_VPX],[false])])])
-
-SWITCH_AM_CFLAGS="$VPX_CFLAGS $SWITCH_AM_CFLAGS"
-SWITCH_AM_CXXFLAGS="$VPX_CFLAGS $SWITCH_AM_CXXFLAGS"
-SWITCH_AM_CPPFLAGS="$VPX_CFLAGS $SWITCH_AM_CPPFLAGS"
-
 PKG_CHECK_MODULES([MPG123], [libmpg123 >= 1.20.1],[
   AM_CONDITIONAL([HAVE_MPG123],[true])],[
   AC_MSG_RESULT([no]); AM_CONDITIONAL([HAVE_MPG123],[false])])
@@ -1291,9 +1284,13 @@ PKG_CHECK_MODULES([SHOUT], [shout >= 2.2.2],[
   AM_CONDITIONAL([HAVE_SHOUT],[true])],[
   AC_MSG_RESULT([no]); AM_CONDITIONAL([HAVE_SHOUT],[false])])
 
-PKG_CHECK_MODULES([MP3LAME], [mp3lame],[
-  AM_CONDITIONAL([HAVE_MP3LAME],[true])],[
-  AC_MSG_RESULT([no]); AM_CONDITIONAL([HAVE_MP3LAME],[false])])
+mp3lame=false
+AC_CHECK_LIB([mp3lame], [lame_init],[
+  AC_CHECK_HEADER([lame/lame.h],[
+    mp3lame=true
+    AC_SUBST([MP3LAME_LIBS], [-lmp3lame])
+    AC_SUBST([MP3LAME_CFLAGS], [$CPPFLAGS])])])
+AM_CONDITIONAL([HAVE_MP3LAME],[$mp3lame])
 
 PKG_CHECK_MODULES([AVCODEC], [libavcodec >= 53.35.0],[
   AM_CONDITIONAL([HAVE_AVCODEC],[true])],[
@@ -1409,6 +1406,7 @@ AS_IF([test "x$enable_core_libedit_support" != "xno"],[
   PKG_CHECK_MODULES([LIBEDIT], [libedit >= 2.11],,[
     AC_MSG_RESULT([no])
     AC_CHECK_LIB([edit], [el_line], [LIBEDIT_LIBS=-ledit])
+    AC_CHECK_LIB([edit], [el_cursor], [ac_cv_has_el_cursor=yes])
     AC_CHECK_HEADER([histedit.h], [], [unset LIBEDIT_LIBS])
     AS_IF([test "x$LIBEDIT_LIBS" = "x"], [
       AC_MSG_ERROR([You need to either install libedit-dev (>= 2.11) or configure with --disable-core-libedit-support])
@@ -1446,6 +1444,9 @@ AS_IF([test "x$enable_core_libedit_support" != "xno"], [
   # If making changes here, don't forget to run autoheader and
   # update libs/esl/src/include/esl_config_auto.h.in manually.
   AC_DEFINE([HAVE_LIBEDIT], [1], [Define to 1 if you have libedit is available])
+if test x$ac_cv_has_el_cursor = xyes; then
+  AC_DEFINE([HAVE_EL_CURSOR], [1], [Define to 1 if you have libedit el_cursor support])
+fi
   save_LIBS="${LIBS}"
   save_CPPFLAGS="${CPPFLAGS}"
   LIBS="${LIBEDIT_LIBS}"
@@ -1756,7 +1757,6 @@ AC_CONFIG_FILES([Makefile
 		src/mod/codecs/mod_siren/Makefile
 		src/mod/codecs/mod_skel_codec/Makefile
 		src/mod/codecs/mod_theora/Makefile
-		src/mod/codecs/mod_vpx/Makefile
 		src/mod/dialplans/mod_dialplan_asterisk/Makefile
 		src/mod/dialplans/mod_dialplan_directory/Makefile
 		src/mod/dialplans/mod_dialplan_xml/Makefile
diff --git a/debian/bootstrap.sh b/debian/bootstrap.sh
index f9c599acdf..2c1a873ef7 100755
--- a/debian/bootstrap.sh
+++ b/debian/bootstrap.sh
@@ -336,7 +336,7 @@ Build-Depends:
  libedit-dev (>= 2.11),
  libsqlite3-dev,
  wget, pkg-config,
- libyuv-dev, libvpx-dev (>= 1.4.0) | libvpx2-dev,
+ yasm,
 # core codecs
  libogg-dev, libspeex-dev, libspeexdsp-dev,
 # configure options
@@ -370,7 +370,7 @@ Conflicts: $(list_freeswitch_all_replaces)
 Depends: \${shlibs:Depends}, \${perl:Depends}, \${misc:Depends},
  freeswitch-music-default (>= 1.0.8),
  freeswitch-sounds-en-us-callie (>= 1.0.25) | freeswitch-sounds,
- libyuv, libvpx (>= 1.4.0) | libvpx2,
+ yasm,
  $(debian_wrap "${mod_depends}")
 Recommends:
  $(debian_wrap "${mod_recommends}")
@@ -395,7 +395,7 @@ Description: Cross-Platform Scalable Multi-Protocol Soft Switch
 Package: libfreeswitch1
 Architecture: any
 Depends: \${shlibs:Depends}, \${misc:Depends},
- libyuv, libvpx (>= 1.4.0) | libvpx2
+ yasm
 Recommends:
 Suggests: libfreeswitch1-dbg
 Description: Cross-Platform Scalable Multi-Protocol Soft Switch
diff --git a/debian/control-modules b/debian/control-modules
index 66c040f5d2..28d6aea2d0 100644
--- a/debian/control-modules
+++ b/debian/control-modules
@@ -377,10 +377,6 @@ Module: codecs/mod_theora
 Description: mod_theora
  Adds mod_theora.
 
-Module: codecs/mod_vpx
-Description: VP8/VP9 video codec
- This module adds the VP8 video codec, also known as WebM.
-
 Module: codecs/mod_yuv
 Description: Adds mod_yuv
  Adds mod_yuv.
diff --git a/debian/freeswitch-systemd.freeswitch.service b/debian/freeswitch-systemd.freeswitch.service
index 2a3a97089b..cc0cf1d582 100644
--- a/debian/freeswitch-systemd.freeswitch.service
+++ b/debian/freeswitch-systemd.freeswitch.service
@@ -19,7 +19,7 @@ Group=daemon
 LimitCORE=infinity
 LimitNOFILE=100000
 LimitNPROC=60000
-;LimitSTACK=240
+LimitSTACK=240K
 LimitRTPRIO=infinity
 LimitRTTIME=7000000
 IOSchedulingClass=realtime
diff --git a/debian/util.sh b/debian/util.sh
index 022daa2048..3c481a3aa3 100755
--- a/debian/util.sh
+++ b/debian/util.sh
@@ -276,7 +276,7 @@ build_debs () {
     local OPTIND OPTARG debug_hook=false hookdir="" cow_build_opts=""
     local keep_pbuilder_config=false keyring="" custom_keyring="/tmp/fs.asc"
     local use_custom_sources=true
-    local custom_sources_file="/tmp/fs.sources.list"
+    local custom_sources_file="/etc/apt/sources.list"
     while getopts 'BbdK:kT:t' o "$@"; do
       case "$o" in
         B) cow_build_opts="--debbuildopts '-B'";;
@@ -445,7 +445,7 @@ build_all () {
   [ -n "$distros" ] || distros="$(default_distros)"
   ! $depinst || aptitude install -y \
     rsync git less cowbuilder ccache \
-    devscripts equivs build-essential
+    devscripts equivs build-essential yasm
   [ -n "$orig" ] || orig="$(create_orig $orig_opts HEAD | tail -n1)"
   if [ -n "$modlist" ]; then
     local modtmp="$(mktemp /tmp/modules-XXXXXXXXXX.conf)"
diff --git a/freeswitch.spec b/freeswitch.spec
index 42f15063e6..bf8ebfab5b 100644
--- a/freeswitch.spec
+++ b/freeswitch.spec
@@ -1462,7 +1462,7 @@ ASR_TTS_MODULES="asr_tts/mod_flite asr_tts/mod_pocketsphinx asr_tts/mod_tts_comm
 ######################################################################################################################
 CODECS_MODULES="codecs/mod_amr codecs/mod_amrwb codecs/mod_bv codecs/mod_codec2 codecs/mod_g723_1 \
 		codecs/mod_g729 codecs/mod_h26x codecs/mod_ilbc codecs/mod_isac codecs/mod_mp4v codecs/mod_opus codecs/mod_silk \
-		codecs/mod_siren codecs/mod_theora codecs/mod_vpx"
+		codecs/mod_siren codecs/mod_theora"
 #
 %if %{build_sng_tc}
 CODECS_MODULES+="codecs/mod_sangoma_codec"
@@ -1716,7 +1716,7 @@ cd ../..
 %pre
 %ifos linux
 if ! /usr/bin/id freeswitch &>/dev/null; then
-       /usr/sbin/useradd -r -g daemon -s /bin/false -c "The FreeSWITCH Open Source Voice Platform" -d %{prefix} freeswitch || \
+       /usr/sbin/useradd -r -g daemon -s /bin/false -c "The FreeSWITCH Open Source Voice Platform" -d %{LOCALSTATEDIR} freeswitch || \
                 %logmsg "Unexpected error adding user \"freeswitch\". Aborting installation."
 fi
 %endif
@@ -2174,9 +2174,6 @@ fi
 %files codec-mp4v
 %{MODINSTDIR}/mod_mp4v.so*
 
-%files codec-vpx
-%{MODINSTDIR}/mod_vpx.so*
-
 %files codec-opus
 %{MODINSTDIR}/mod_opus.so*
 %config(noreplace) %attr(0640, freeswitch, daemon) %{sysconfdir}/autoload_configs/opus.conf.xml
diff --git a/html5/verto/js/src/jquery.FSRTC.js b/html5/verto/js/src/jquery.FSRTC.js
index 6025e08f7d..7bc2aa8613 100644
--- a/html5/verto/js/src/jquery.FSRTC.js
+++ b/html5/verto/js/src/jquery.FSRTC.js
@@ -342,8 +342,8 @@
             if(typeof self.options.localVideoStream.stop == 'function') {
 	        self.options.localVideoStream.stop();
             } else {
-		if (self.localVideoStream.active){
-                    var tracks = self.localVideoStream.getTracks();
+		if (self.options.localVideoStream.active){
+                    var tracks = self.options.localVideoStream.getTracks();
                     console.error(tracks);
 		    tracks.forEach(function(track, index){
 			console.log(track);
@@ -513,7 +513,7 @@
 	    audio = false;
 	} else {
 	    audio = {
-		mandatory: obj.options.audioParams,
+		mandatory: {},
 		optional: []
 	    };
 
@@ -521,6 +521,15 @@
 		audio.optional = [{sourceId: obj.options.useMic}]
 	    }
 
+	    if (obj.options.audioParams) {
+		for (var key in obj.options.audioParams) {
+		    var con = {};
+		    con[key] = obj.options.audioParams[key];
+		    audio.optional.push(con);
+		}
+	    }
+
+
 	}
 
 	if (obj.options.useVideo && obj.options.localVideo) {
diff --git a/html5/verto/verto_communicator/src/css/verto.css b/html5/verto/verto_communicator/src/css/verto.css
index aa9157436a..7b87ed070a 100644
--- a/html5/verto/verto_communicator/src/css/verto.css
+++ b/html5/verto/verto_communicator/src/css/verto.css
@@ -403,7 +403,6 @@ body .modal-body .btn-group .btn.active {
 }
 
 .call_direction {
-  position: absolute;
   margin-top: 6px;
 }
 
@@ -452,7 +451,6 @@ body .modal-body .btn-group .btn.active {
   overflow: hidden;
   overflow-wrap: break-word;
   margin-left: 26px !important;
-  position: absolute;
 }
 
 #dialpad .dialpad-number {
diff --git a/html5/verto/verto_communicator/src/partials/modal_settings.html b/html5/verto/verto_communicator/src/partials/modal_settings.html
index c7c4470d4a..5df1573292 100644
--- a/html5/verto/verto_communicator/src/partials/modal_settings.html
+++ b/html5/verto/verto_communicator/src/partials/modal_settings.html
@@ -35,6 +35,7 @@
     </select>
   </div>
 
+  <a class="btn btn-primary" href="#/preview" ng-click="ok()">Preview Settings</a>
   <a class="btn btn-primary" href="" ng-click="refreshDeviceList()">Refresh device list</a>
 
   <div class="form-group">
@@ -131,6 +132,7 @@
       <select name="video_quality" id="video-quality" class="form-control"
               ng-disabled="mydata.autoBand"
               ng-model="mydata.vidQual"
+              ng-change="checkVideoQuality(mydata.vidQual)"
               ng-options="item.id as item.label for item in verto.videoQuality"></select>
     </div>
 
diff --git a/html5/verto/verto_communicator/src/vertoControllers/controllers/InCallController.js b/html5/verto/verto_communicator/src/vertoControllers/controllers/InCallController.js
index f030b5d611..d5e5484fa6 100644
--- a/html5/verto/verto_communicator/src/vertoControllers/controllers/InCallController.js
+++ b/html5/verto/verto_communicator/src/vertoControllers/controllers/InCallController.js
@@ -105,7 +105,15 @@
             verto.screenshareHangup();
             return false;
           }
-          verto.screenshare(storage.data.called_number);
+          if (verto.data.conf) {
+            console.log('Screenshare inside conferece: ', verto.data.conf);
+            // Setting the destination of the screenshare call as the conference
+            // number we last joined
+            verto.screenshare(verto.data.conf.params.laData.laName);
+          }
+          else {
+            verto.screenshare(storage.data.called_number);
+          }
         };
 
         function buildCanvasesData() {
diff --git a/html5/verto/verto_communicator/src/vertoControllers/controllers/ModalSettingsController.js b/html5/verto/verto_communicator/src/vertoControllers/controllers/ModalSettingsController.js
index c1665eacca..a99fefe058 100644
--- a/html5/verto/verto_communicator/src/vertoControllers/controllers/ModalSettingsController.js
+++ b/html5/verto/verto_communicator/src/vertoControllers/controllers/ModalSettingsController.js
@@ -56,11 +56,23 @@
 
         $scope.checkAutoBand = function(option) {
           $scope.mydata.useDedenc = false;
+          // Default to the last videoQuality entry (presumably the highest resolution - confirm ordering).
+          var bestres = videoQuality[videoQuality.length-1];
+          $scope.mydata.vidQual = bestres.id;
+          storage.data.vidQual = bestres.id;
+          verto.data.instance.videoParams({
+            minWidth: bestres.width,
+            minHeight: bestres.height,
+            maxWidth: bestres.width,
+            maxHeight: bestres.height,
+            minFrameRate: 15,
+            vertoBestFrameRate: storage.data.bestFrameRate
+          });
           if (!option) {
             $scope.mydata.outgoingBandwidth = 'default';
             $scope.mydata.incomingBandwidth = 'default';
-            $scope.mydata.vidQual = 'hd';
             $scope.mydata.testSpeedJoin = false;
+
           } else {
             $scope.mydata.testSpeedJoin = true;
           }
@@ -73,6 +85,22 @@
             $scope.mydata.useDedenc = true;
           }
         };
+
+        $scope.checkVideoQuality = function(resolution) {
+          var w = videoResolution[resolution]['width'];
+          var h = videoResolution[resolution]['height'];
+          storage.data.vidQual = resolution;
+          verto.data.instance.videoParams({
+            minWidth: w,
+            minHeight: h,
+            maxWidth: w,
+            maxHeight: h,
+            minFrameRate: 15,
+            vertoBestFrameRate: storage.data.bestFrameRate
+          });
+
+        };
+
       }
     ]);
 
diff --git a/html5/verto/verto_communicator/src/vertoService/services/vertoService.js b/html5/verto/verto_communicator/src/vertoService/services/vertoService.js
index 930501cc48..824e5e9e91 100644
--- a/html5/verto/verto_communicator/src/vertoService/services/vertoService.js
+++ b/html5/verto/verto_communicator/src/vertoService/services/vertoService.js
@@ -326,7 +326,7 @@ vertoService.service('verto', ['$rootScope', '$cookieStore', '$location', 'stora
         if (!videoFlag) storage.data.selectedVideo = data.videoDevices[0].id;
         if (!shareFlag) storage.data.selectedShare = data.shareDevices[0].id;
         if (!audioFlag) storage.data.selectedAudio = data.audioDevices[0].id;
-        if (!speakerFlag) storage.data.selectedSpeaker = data.speakerDevices[0].id;
+        if (!speakerFlag && data.speakerDevices.length > 0) storage.data.selectedSpeaker = data.speakerDevices[0].id;
 
         // This means that we cannot use video!
         if (data.videoDevices.length === 0) {
diff --git a/html5/verto/video_demo/images/speed.gif b/html5/verto/video_demo/images/speed.gif
new file mode 100644
index 0000000000..96998f041f
Binary files /dev/null and b/html5/verto/video_demo/images/speed.gif differ
diff --git a/html5/verto/video_demo/js/verto-min.js b/html5/verto/video_demo/js/verto-min.js
index be202e2068..ce4dd194f3 100644
--- a/html5/verto/video_demo/js/verto-min.js
+++ b/html5/verto/video_demo/js/verto-min.js
@@ -33,7 +33,7 @@ $.FSRTC.prototype.stop=function(){var self=this;if(self.options.useVideo){self.o
 if(self.localStream){if(typeof self.localStream.stop=='function'){self.localStream.stop();}else{if(self.localStream.active){var tracks=self.localStream.getTracks();console.error(tracks);tracks.forEach(function(track,index){console.log(track);track.stop();})}}
 self.localStream=null;}
 if(self.options.localVideo){self.options.localVideo.style.display='none';if(moz){self.options.localVideo['mozSrcObject']=null;}else{self.options.localVideo['src']='';}}
-if(self.options.localVideoStream){if(typeof self.options.localVideoStream.stop=='function'){self.options.localVideoStream.stop();}else{if(self.localVideoStream.active){var tracks=self.localVideoStream.getTracks();console.error(tracks);tracks.forEach(function(track,index){console.log(track);track.stop();})}}}
+if(self.options.localVideoStream){if(typeof self.options.localVideoStream.stop=='function'){self.options.localVideoStream.stop();}else{if(self.options.localVideoStream.active){var tracks=self.options.localVideoStream.getTracks();console.error(tracks);tracks.forEach(function(track,index){console.log(track);track.stop();})}}}
 if(self.peer){console.log("stopping peer");self.peer.stop();}};$.FSRTC.prototype.getMute=function(){var self=this;return self.audioEnabled;}
 $.FSRTC.prototype.setMute=function(what){var self=this;var audioTracks=self.localStream.getAudioTracks();for(var i=0,len=audioTracks.length;i<len;i++){switch(what){case"on":audioTracks[i].enabled=true;break;case"off":audioTracks[i].enabled=false;break;case"toggle":audioTracks[i].enabled=!audioTracks[i].enabled;default:break;}
 self.audioEnabled=audioTracks[i].enabled;}
@@ -45,7 +45,8 @@ return!self.videoEnabled;}
 $.FSRTC.prototype.createAnswer=function(params){var self=this;self.type="answer";self.remoteSDP=params.sdp;console.debug("inbound sdp: ",params.sdp);function onSuccess(stream){self.localStream=stream;self.peer=RTCPeerConnection({type:self.type,attachStream:self.localStream,onICE:function(candidate){return onICE(self,candidate);},onICEComplete:function(){return onICEComplete(self);},onRemoteStream:function(stream){return onRemoteStream(self,stream);},onICESDP:function(sdp){return onICESDP(self,sdp);},onChannelError:function(e){return onChannelError(self,e);},constraints:self.constraints,iceServers:self.options.iceServers,offerSDP:{type:"offer",sdp:self.remoteSDP}});onStreamSuccess(self);}
 function onError(e){onStreamError(self,e);}
 var mediaParams=getMediaParams(self);console.log("Audio constraints",mediaParams.audio);console.log("Video constraints",mediaParams.video);if(self.options.useVideo&&self.options.localVideo){getUserMedia({constraints:{audio:false,video:{mandatory:self.options.videoParams,optional:[]},},localVideo:self.options.localVideo,onsuccess:function(e){self.options.localVideoStream=e;console.log("local video ready");},onerror:function(e){console.error("local video error!");}});}
-getUserMedia({constraints:{audio:mediaParams.audio,video:mediaParams.video},video:mediaParams.useVideo,onsuccess:onSuccess,onerror:onError});};function getMediaParams(obj){var audio;if(obj.options.useMic&&obj.options.useMic==="none"){console.log("Microphone Disabled");audio=false;}else if(obj.options.videoParams&&obj.options.screenShare){console.error("SCREEN SHARE");audio=false;}else{audio={mandatory:obj.options.audioParams,optional:[]};if(obj.options.useMic!=="any"){audio.optional=[{sourceId:obj.options.useMic}]}}
+getUserMedia({constraints:{audio:mediaParams.audio,video:mediaParams.video},video:mediaParams.useVideo,onsuccess:onSuccess,onerror:onError});};function getMediaParams(obj){var audio;if(obj.options.useMic&&obj.options.useMic==="none"){console.log("Microphone Disabled");audio=false;}else if(obj.options.videoParams&&obj.options.screenShare){console.error("SCREEN SHARE");audio=false;}else{audio={mandatory:{},optional:[]};if(obj.options.useMic!=="any"){audio.optional=[{sourceId:obj.options.useMic}]}
+if(obj.options.audioParams){for(var key in obj.options.audioParams){var con={};con[key]=obj.options.audioParams[key];audio.optional.push(con);}}}
 if(obj.options.useVideo&&obj.options.localVideo){getUserMedia({constraints:{audio:false,video:{mandatory:obj.options.videoParams,optional:[]},},localVideo:obj.options.localVideo,onsuccess:function(e){self.options.localVideoStream=e;console.log("local video ready");},onerror:function(e){console.error("local video error!");}});}
 var video={};var bestFrameRate=obj.options.videoParams.vertoBestFrameRate;delete obj.options.videoParams.vertoBestFrameRate;video={mandatory:obj.options.videoParams,optional:[]}
 var useVideo=obj.options.useVideo;if(useVideo&&obj.options.useCamera&&obj.options.useCamera!=="none"){if(!video.optional){video.optional=[];}
diff --git a/html5/verto/video_demo/verto.js b/html5/verto/video_demo/verto.js
index a4c71c36b3..2df2f0824d 100644
--- a/html5/verto/video_demo/verto.js
+++ b/html5/verto/video_demo/verto.js
@@ -1441,11 +1441,11 @@ function init() {
 
 	},
 
-//	audioParams: {
-//	    googAutoGainControl: false,
-//	    googNoiseSuppression: false,
-//	    googHighpassFilter: false
-//	},
+	audioParams: {
+	    googAutoGainControl: false,
+	    googNoiseSuppression: false,
+	    googHighpassFilter: false
+	},
 
 	iceServers: $("#use_stun").is(':checked')
     },callbacks);
diff --git a/libs/.gitignore b/libs/.gitignore
index 9da77e16e9..a567a23673 100644
--- a/libs/.gitignore
+++ b/libs/.gitignore
@@ -6,7 +6,6 @@ config.log
 config.nice
 config.status
 config.sub
-configure
 depcomp
 install-sh
 libtool
@@ -829,3 +828,15 @@ unimrcp/build/compile
 /ldns/
 /portaudio/
 portaudio.*.log
+apr-util/configure
+apr-util/xml/expat/configure
+apr/configure
+iksemel/configure
+libdingaling/configure
+libyuv/Makefile
+libyuv/convert
+sofia-sip/configure
+spandsp/configure
+srtp/configure
+tiff-4.0.2/configure
+unimrcp/configure
diff --git a/libs/esl/fs_cli.c b/libs/esl/fs_cli.c
index 7a74a042c4..237dc483c0 100644
--- a/libs/esl/fs_cli.c
+++ b/libs/esl/fs_cli.c
@@ -68,6 +68,7 @@ typedef struct {
 	char prompt_color[12];
 	char input_text_color[12];
 	char output_text_color[12];
+	char prompt_string[512];
 } cli_profile_t;
 
 static const int log_uuid_short_length = 8;
@@ -99,7 +100,9 @@ static History *myhistory;
 static HistEvent ev;
 #endif
 
-
+static char hostname[256] = "";
+static char switchname[256] = "";
+static char switch_hostname[256] = "";
 static esl_mutex_t *MUTEX = NULL;
 
 static void _sleep_ns(int secs, long nsecs) {
@@ -211,7 +214,9 @@ static unsigned char console_eofkey(EditLine *el, int ch)
 		return CC_EOF;
 	} else {
 		if (line->cursor != line->lastchar) {
-			line->cursor++;
+#ifdef HAVE_EL_CURSOR
+			el_cursor(el, 1);
+#endif
 			el_deletestr(el, 1);
 		}
 		return CC_REDISPLAY;
@@ -1300,6 +1305,8 @@ static void read_config(const char *dft_cfile, const char *cfile) {
 				profiles[pcount-1].use_history_file = !esl_true(val);
 			} else if(!strcasecmp(var, "prompt-color")) {
 				esl_set_string(profiles[pcount-1].prompt_color, match_color(val));
+			} else if(!strcasecmp(var, "prompt-string")) {
+				esl_set_string(profiles[pcount-1].prompt_string, val);
 			} else if(!strcasecmp(var, "input-text-color")) {
 				esl_set_string(profiles[pcount-1].input_text_color, match_color(val));
 			} else if(!strcasecmp(var, "output-text-color")) {
@@ -1340,6 +1347,58 @@ static void clear_el_buffer(void) {
 #endif
 }
 
+/* Expand prompt escapes in s (capacity len bytes) in place: %s switchname, %h local hostname,
+ * %H switch hostname, %p profile name, %o profile host, %P profile port, %% a literal '%'.
+ * Unknown escapes are dropped; output is truncated to fit and always NUL-terminated. */
+static void expand_prompt(char *s, size_t len, cli_profile_t *profile)
+{
+	char tmp[512] = "";
+	char ptmp[35] = "";
+	char *p, *q = tmp, *end;
+	const char *sub;
+
+	if (!s || !len) {
+		return;
+	}
+
+	/* Output limit: the smaller of tmp and the caller's buffer, minus one byte for the NUL. */
+	end = tmp + (len < sizeof(tmp) ? len : sizeof(tmp)) - 1;
+
+	for (p = s; *p && q < end; p++) {
+		if (*p != '%') {
+			*q++ = *p;
+			continue;
+		}
+
+		/* A lone '%' ends the string: stop instead of stepping past the terminator. */
+		if (!*++p) {
+			break;
+		}
+
+		switch(*p) {
+		case 's': sub = switchname; break;
+		case 'h': sub = hostname; break;
+		case 'H': sub = switch_hostname; break;
+		case 'p': sub = profile->name; break;
+		case 'o': sub = profile->host; break;
+		case '%': sub = "%"; break;
+		case 'P':
+			esl_snprintf(ptmp, sizeof(ptmp), "%d", profile->port);
+			sub = ptmp;
+			break;
+		default:
+			sub = "";
+			break;
+		}
+
+		while (*sub && q < end) {
+			*q++ = *sub++;
+		}
+	}
+	*q = '\0';
+	esl_copy_string(s, tmp, len);
+}
+
 int main(int argc, char *argv[])
 {
 	esl_handle_t handle = {{0}};
@@ -1403,6 +1462,7 @@ int main(int argc, char *argv[])
 	int loops = 2, reconnect = 0;
 	char *ccheck;
 
+	gethostname(hostname, sizeof(hostname) - 1); /* hostname is zero-initialized, so the last byte stays NUL even if the name is truncated */
 
 	esl_mutex_create(&MUTEX);
 			
@@ -1430,7 +1490,7 @@ int main(int argc, char *argv[])
 
 	strncpy(internal_profile.host, "127.0.0.1", sizeof(internal_profile.host));
 	strncpy(internal_profile.pass, "ClueCon", sizeof(internal_profile.pass));
-	strncpy(internal_profile.name, "internal", sizeof(internal_profile.name));
+	esl_set_string(internal_profile.name, hostname); /* bounded copy; strncpy would leave name unterminated if hostname fills it */
 	internal_profile.port = 8021;
 	set_fn_keys(&internal_profile);
 	esl_set_string(internal_profile.prompt_color, prompt_color);
@@ -1541,10 +1601,17 @@ int main(int argc, char *argv[])
 	}
 	if (!profile) {
 		if (get_profile("default", &profile)) {
-			esl_log(ESL_LOG_DEBUG, "profile default does not exist using builtin profile\n");
-			profile = &internal_profile;
+			if (!esl_strlen_zero(profiles[0].name)) {
+				profile = &profiles[0];
+			}
 		}
 	}
+	
+	if (!profile) {
+		esl_log(ESL_LOG_DEBUG, "no profiles found, using builtin profile\n");
+		profile = &internal_profile;
+	}
+
 	if (temp_log < 0 ) {
 		esl_global_set_default_logger(profile->debug);
 	}
@@ -1582,25 +1649,7 @@ int main(int argc, char *argv[])
 	esl_set_string(prompt_color, profile->prompt_color);
 	esl_set_string(input_text_color, profile->input_text_color);
 	esl_set_string(output_text_color, profile->output_text_color);
-	if (argv_host) {
-		if (argv_port && profile->port != 8021) {
-			snprintf(bare_prompt_str, sizeof(bare_prompt_str), "freeswitch@%s:%u@%s> ", profile->host, profile->port, profile->name);
-		} else {
-			snprintf(bare_prompt_str, sizeof(bare_prompt_str), "freeswitch@%s@%s> ", profile->host, profile->name);
-		}
-	} else {
-		snprintf(bare_prompt_str, sizeof(bare_prompt_str), "freeswitch@%s> ", profile->name);
-	}
-	bare_prompt_str_len = (int)strlen(bare_prompt_str);
-	if (feature_level) {
-#if HAVE_DECL_EL_PROMPT_ESC
-		snprintf(prompt_str, sizeof(prompt_str), "\1%s\1%s\1%s\1", prompt_color, bare_prompt_str, input_text_color);
-#else
-		snprintf(prompt_str, sizeof(prompt_str), "%s%s%s", prompt_color, bare_prompt_str, input_text_color);
-#endif
-	} else {
-		snprintf(prompt_str, sizeof(prompt_str), "%s", bare_prompt_str);
-	}
+
  connect:
 	connected = 0;
 	while (--loops > 0) {
@@ -1657,6 +1706,42 @@ int main(int argc, char *argv[])
 		return 0;
 	}
 
+	snprintf(cmd_str, sizeof(cmd_str), "api switchname\n\n");
+	esl_send_recv(global_handle, cmd_str);
+	if (global_handle->last_sr_event && global_handle->last_sr_event->body) {
+		esl_set_string(switchname, global_handle->last_sr_event->body);
+	} else {
+		esl_set_string(switchname, profile->name);
+	}
+
+
+	snprintf(cmd_str, sizeof(cmd_str), "api hostname\n\n");
+	esl_send_recv(global_handle, cmd_str);
+	if (global_handle->last_sr_event && global_handle->last_sr_event->body) {
+		esl_set_string(switch_hostname, global_handle->last_sr_event->body);
+	} else {
+		esl_set_string(switch_hostname, profile->name);
+	}
+
+	if (!esl_strlen_zero(profile->prompt_string)) {
+		expand_prompt(profile->prompt_string, sizeof(profile->prompt_string), profile);
+		snprintf(bare_prompt_str, sizeof(bare_prompt_str), "%s> ", profile->prompt_string);
+	} else {
+		snprintf(bare_prompt_str, sizeof(bare_prompt_str), "freeswitch@%s> ", switchname);
+	}
+
+	bare_prompt_str_len = (int)strlen(bare_prompt_str);
+	if (feature_level) {
+#if HAVE_DECL_EL_PROMPT_ESC
+		snprintf(prompt_str, sizeof(prompt_str), "\1%s\1%s\1%s\1", prompt_color, bare_prompt_str, input_text_color);
+#else
+		snprintf(prompt_str, sizeof(prompt_str), "%s%s%s", prompt_color, bare_prompt_str, input_text_color);
+#endif
+	} else {
+		snprintf(prompt_str, sizeof(prompt_str), "%s", bare_prompt_str);
+	}
+
+
 #ifdef HAVE_LIBEDIT
 	el = el_init(__FILE__, stdin, stdout, stderr);
 #if HAVE_DECL_EL_PROMPT_ESC
diff --git a/libs/libvpx/.gitignore b/libs/libvpx/.gitignore
new file mode 100644
index 0000000000..9fed8d5b67
--- /dev/null
+++ b/libs/libvpx/.gitignore
@@ -0,0 +1,14 @@
+*.d
+.bins
+.docs
+Makefile
+config.mk
+libs-*.mk
+vp8_rtcd.h
+vp9_rtcd.h
+vpx_config.asm
+vpx_config.c
+vpx_config.h
+vpx_dsp_rtcd.h
+vpx_scale_rtcd.h
+vpx_version.h
diff --git a/libs/libvpx/AUTHORS b/libs/libvpx/AUTHORS
new file mode 100644
index 0000000000..f89b6776a8
--- /dev/null
+++ b/libs/libvpx/AUTHORS
@@ -0,0 +1,134 @@
+# This file is automatically generated from the git commit history
+# by tools/gen_authors.sh.
+
+Aaron Watry <awatry@gmail.com>
+Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
+Adam Xu <adam@xuyaowu.com>
+Adrian Grange <agrange@google.com>
+Aℓex Converse <aconverse@google.com>
+Ahmad Sharif <asharif@google.com>
+Alexander Voronov <avoronov@graphics.cs.msu.ru>
+Alexis Ballier <aballier@gentoo.org>
+Alok Ahuja <waveletcoeff@gmail.com>
+Alpha Lam <hclam@google.com>
+A.Mahfoodh <ab.mahfoodh@gmail.com>
+Ami Fischman <fischman@chromium.org>
+Andoni Morales Alastruey <ylatuya@gmail.com>
+Andres Mejia <mcitadel@gmail.com>
+Andrew Russell <anrussell@google.com>
+Angie Chiang <angiebird@google.com>
+Aron Rosenberg <arosenberg@logitech.com>
+Attila Nagy <attilanagy@google.com>
+Brion Vibber <bvibber@wikimedia.org>
+changjun.yang <changjun.yang@intel.com>
+Charles 'Buck' Krasic <ckrasic@google.com>
+chm <chm@rock-chips.com>
+Christian Duvivier <cduvivier@google.com>
+Daniel Kang <ddkang@google.com>
+Deb Mukherjee <debargha@google.com>
+Dim Temp <dimtemp0@gmail.com>
+Dmitry Kovalev <dkovalev@google.com>
+Dragan Mrdjan <dmrdjan@mips.com>
+Ed Baker <edward.baker@intel.com>
+Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com>
+Fabio Pedretti <fabio.ped@libero.it>
+Frank Galligan <fgalligan@google.com>
+Fredrik Söderquist <fs@opera.com>
+Fritz Koenig <frkoenig@google.com>
+Gaute Strokkenes <gaute.strokkenes@broadcom.com>
+Geza Lore <gezalore@gmail.com>
+Ghislain MARY <ghislainmary2@gmail.com>
+Giuseppe Scrivano <gscrivano@gnu.org>
+Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
+Guillaume Martres <gmartres@google.com>
+Guillermo Ballester Valor <gbvalor@gmail.com>
+Hangyu Kuang <hkuang@google.com>
+Hanno Böck <hanno@hboeck.de>
+Henrik Lundin <hlundin@google.com>
+Hui Su <huisu@google.com>
+Ivan Maltz <ivanmaltz@google.com>
+Jacek Caban <cjacek@gmail.com>
+Jacky Chen <jackychen@google.com>
+James Berry <jamesberry@google.com>
+James Yu <james.yu@linaro.org>
+James Zern <jzern@google.com>
+Jan Gerber <j@mailb.org>
+Jan Kratochvil <jan.kratochvil@redhat.com>
+Janne Salonen <jsalonen@google.com>
+Jeff Faust <jfaust@google.com>
+Jeff Muizelaar <jmuizelaar@mozilla.com>
+Jeff Petkau <jpet@chromium.org>
+Jia Jia <jia.jia@linaro.org>
+Jim Bankoski <jimbankoski@google.com>
+Jingning Han <jingning@google.com>
+Joey Parrish <joeyparrish@google.com>
+Johann Koenig <johannkoenig@google.com>
+John Koleszar <jkoleszar@google.com>
+Johnny Klonaris <google@jawknee.com>
+John Stark <jhnstrk@gmail.com>
+Joshua Bleecher Snyder <josh@treelinelabs.com>
+Joshua Litt <joshualitt@google.com>
+Julia Robson <juliamrobson@gmail.com>
+Justin Clift <justin@salasaga.org>
+Justin Lebar <justin.lebar@gmail.com>
+KO Myung-Hun <komh@chollian.net>
+Lawrence Velázquez <larryv@macports.org>
+Lou Quillio <louquillio@google.com>
+Luca Barbato <lu_zero@gentoo.org>
+Makoto Kato <makoto.kt@gmail.com>
+Mans Rullgard <mans@mansr.com>
+Marco Paniconi <marpan@google.com>
+Mark Mentovai <mark@chromium.org>
+Martin Ettl <ettl.martin78@googlemail.com>
+Martin Storsjo <martin@martin.st>
+Matthew Heaney <matthewjheaney@chromium.org>
+Michael Kohler <michaelkohler@live.com>
+Mike Frysinger <vapier@chromium.org>
+Mike Hommey <mhommey@mozilla.com>
+Mikhal Shemer <mikhal@google.com>
+Minghai Shang <minghai@google.com>
+Morton Jonuschat <yabawock@gmail.com>
+Nico Weber <thakis@chromium.org>
+Parag Salasakar <img.mips1@gmail.com>
+Pascal Massimino <pascal.massimino@gmail.com>
+Patrik Westin <patrik.westin@gmail.com>
+Paul Wilkins <paulwilkins@google.com>
+Pavol Rusnak <stick@gk2.sk>
+Paweł Hajdan <phajdan@google.com>
+Pengchong Jin <pengchong@google.com>
+Peter de Rivaz <peter.derivaz@gmail.com>
+Philip Jägenstedt <philipj@opera.com>
+Priit Laes <plaes@plaes.org>
+Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
+Rafaël Carré <funman@videolan.org>
+Ralph Giles <giles@xiph.org>
+Rob Bradford <rob@linux.intel.com>
+Ronald S. Bultje <rsbultje@gmail.com>
+Rui Ueyama <ruiu@google.com>
+Sami Pietilä <samipietila@google.com>
+Scott Graham <scottmg@chromium.org>
+Scott LaVarnway <slavarnway@google.com>
+Sean McGovern <gseanmcg@gmail.com>
+Sergey Ulanov <sergeyu@chromium.org>
+Shimon Doodkin <helpmepro1@gmail.com>
+Shunyao Li <shunyaoli@google.com>
+Stefan Holmer <holmer@google.com>
+Suman Sunkara <sunkaras@google.com>
+Taekhyun Kim <takim@nvidia.com>
+Takanori MATSUURA <t.matsuu@gmail.com>
+Tamar Levy <tamar.levy@intel.com>
+Tao Bai <michaelbai@chromium.org>
+Tero Rintaluoma <teror@google.com>
+Thijs Vermeir <thijsvermeir@gmail.com>
+Tim Kopp <tkopp@google.com>
+Timothy B. Terriberry <tterribe@xiph.org>
+Tom Finegan <tomfinegan@google.com>
+Vignesh Venkatasubramanian <vigneshv@google.com>
+Yaowu Xu <yaowu@google.com>
+Yongzhe Wang <yongzhe@google.com>
+Yunqing Wang <yunqingwang@google.com>
+Zoe Liu <zoeliu@google.com>
+Google Inc.
+The Mozilla Foundation
+The Xiph.Org Foundation
diff --git a/libs/libvpx/CHANGELOG b/libs/libvpx/CHANGELOG
new file mode 100644
index 0000000000..7746cc6c4f
--- /dev/null
+++ b/libs/libvpx/CHANGELOG
@@ -0,0 +1,624 @@
+2015-11-09 v1.5.0 "Javan Whistling Duck"
+  This release improves upon the VP9 encoder and speeds up the encoding and
+  decoding processes.
+
+  - Upgrading:
+    This release is ABI incompatible with 1.4.0. It drops deprecated VP8
+    controls and adds a variety of VP9 controls for testing.
+
+    The vpxenc utility now prefers VP9 by default.
+
+  - Enhancements:
+    Faster VP9 encoding and decoding
+    Smaller library size by combining functions used by VP8 and VP9
+
+  - Bug Fixes:
+    A variety of fuzzing issues
+
+2015-04-03 v1.4.0 "Indian Runner Duck"
+  This release includes significant improvements to the VP9 codec.
+
+  - Upgrading:
+    This release is ABI incompatible with 1.3.0. It drops the compatibility
+    layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec
+    controls for VP9.
+
+  - Enhancements:
+    Faster VP9 encoding and decoding
+    Multithreaded VP9 decoding (tile and frame-based)
+    Multithreaded VP9 encoding - on by default
+    YUV 4:2:2 and 4:4:4 support in VP9
+    10 and 12bit support in VP9
+    64bit ARM support by replacing ARM assembly with intrinsics
+
+  - Bug Fixes:
+    Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0
+    files.
+
+  - Known Issues:
+    Frame Parallel decoding fails for segmented and non-420 files.
+
+2013-11-15 v1.3.0 "Forest"
+  This release introduces the VP9 codec in a backward-compatible way.
+  All existing users of VP8 can continue to use the library without
+  modification. However, some VP8 options do not map to VP9 in the same manner.
+
+  The VP9 encoder in this release is not feature complete. Users interested in
+  the encoder are advised to use the git master branch and discuss issues on
+  libvpx mailing lists.
+
+  - Upgrading:
+    This release is ABI and API compatible with Duclair (v1.0.0). Users
+    of older releases should refer to the Upgrading notes in this document
+    for that release.
+
+  - Enhancements:
+      Get rid of bashisms in the main build scripts
+      Added usage info on command line options
+      Add lossless compression mode
+      Dll build of libvpx
+      Add additional Mac OS X targets: 10.7, 10.8 and 10.9 (darwin11-13)
+      Add option to disable documentation
+      configure: add --enable-external-build support
+      make: support V=1 as short form of verbose=yes
+      configure: support mingw-w64
+      configure: support hardfloat armv7 CHOSTS
+      configure: add support for android x86
+      Add estimated completion time to vpxenc
+      Don't exit on decode errors in vpxenc
+      vpxenc: support scaling prior to encoding
+      vpxdec: support scaling output
+      vpxenc: improve progress indicators with --skip
+      msvs: Don't link to winmm.lib
+      Add a new script for producing vcxproj files
+      Produce Visual Studio 10 and 11 project files
+      Produce Windows Phone project files
+      msvs-build: use msbuild for vs >= 2005
+      configure: default configure log to config.log
+      Add encoding option --static-thresh
+
+  - Speed:
+      Miscellaneous speed optimizations for VP8 and VP9.
+
+  - Quality:
+      In general, quality is consistent with the Eider release.
+
+  - Bug Fixes:
+      This release represents approximately a year of engineering effort,
+      and contains multiple bug fixes. Please refer to git history for details.
+
+
+2012-12-21 v1.2.0
+  This release acts as a checkpoint for a large amount of internal refactoring
+  and testing. It also contains a number of small bugfixes, so all users are
+  encouraged to upgrade.
+
+  - Upgrading:
+    This release is ABI and API compatible with Duclair (v1.0.0). Users
+    of older releases should refer to the Upgrading notes in this
+    document for that release.
+
+  - Enhancements:
+      VP8 optimizations for MIPS dspr2
+      vpxenc: add -quiet option
+
+  - Speed:
+      Encoder and decoder speed is consistent with the Eider release.
+
+  - Quality:
+      In general, quality is consistent with the Eider release.
+
+      Minor tweaks to ARNR filtering
+      Minor improvements to real time encoding with multiple temporal layers
+
+  - Bug Fixes:
+      Fixes multithreaded encoder race condition in loopfilter
+      Fixes multi-resolution threaded encoding
+      Fix potential encoder dead-lock after picture resize
+
+
+2012-05-09 v1.1.0 "Eider"
+  This introduces a number of enhancements, mostly focused on real-time
+  encoding. In addition, it fixes a decoder bug (first introduced in
+  Duclair) so all users of that release are encouraged to upgrade.
+
+  - Upgrading:
+    This release is ABI and API compatible with Duclair (v1.0.0). Users
+    of older releases should refer to the Upgrading notes in this
+    document for that release.
+
+    This release introduces a new temporal denoiser, controlled by the
+    VP8E_SET_NOISE_SENSITIVITY control. The temporal denoiser does not
+    currently take a strength parameter, so the control is effectively
+    a boolean - zero (off) or non-zero (on). For compatibility with
+    existing applications, the values accepted are the same as those
+    for the spatial denoiser (0-6). The temporal denoiser is enabled
+    by default, and the older spatial denoiser may be restored by
+    configuring with --disable-temporal-denoising. The temporal denoiser
+    is more computationally intensive than the spatial one.
+
+    This release removes support for a legacy, decode only API that was
+    supported, but deprecated, at the initial release of libvpx
+    (v0.9.0). This is not expected to have any impact. If you are
+    impacted, you can apply a reversion to commit 2bf8fb58 locally.
+    Please update to the latest libvpx API if you are affected.
+
+  - Enhancements:
+      Adds a motion compensated temporal denoiser to the encoder, which
+      gives higher quality than the older spatial denoiser. (See above
+      for notes on upgrading).
+
+      In addition, support for new compilers and platforms was added,
+      including:
+        improved support for XCode
+        Android x86 NDK build
+        OS/2 support
+        SunCC support
+
+      Changing resolution with vpx_codec_enc_config_set() is now
+      supported. Previously, reinitializing the codec was required to
+      change the input resolution.
+
+      The vpxenc application has initial support for producing multiple
+      encodes from the same input in one call. Resizing is not yet
+      supported, but varying other codec parameters is. Use -- to
+      delineate output streams. Options persist from one stream to the
+      next.
+
+      Also, the vpxenc application will now use a keyframe interval of
+      5 seconds by default. Use the --kf-max-dist option to override.
+
+  - Speed:
+      Decoder performance improved 2.5% versus Duclair. Encoder speed is
+      consistent with Duclair for most material. Two pass encoding of
+      slideshow-like material will see significant improvements.
+
+      Large realtime encoding speed gains at a small quality expense are
+      possible by configuring the on-the-fly bitpacking experiment with
+      --enable-onthefly-bitpacking. Realtime encoder can be up to 13%
+      faster (ARM) depending on the number of threads and bitrate
+      settings. This technique sees constant gain over the 5-16 speed
+      range. For VC style input the loss seen is up to 0.2dB. See commit
+      52cf4dca for further details.
+
+  - Quality:
+      On the whole, quality is consistent with the Duclair release. Some
+      tweaks:
+
+        Reduced blockiness in easy sections by applying a penalty to
+        intra modes.
+
+        Improved quality of static sections (like slideshows) with
+        two pass encoding.
+
+        Improved keyframe sizing with multiple temporal layers
+
+  - Bug Fixes:
+      Corrected alt-ref contribution to frame rate for visible updates
+      to the alt-ref buffer. This affected applications making manual
+      usage of the frame reference flags, or temporal layers.
+
+      Additional constraints were added to disable multi-frame quality
+      enhancement (MFQE) in sections of the frame where there is motion.
+      (#392)
+
+      Fixed corruption issues when vpx_codec_enc_config_set() was called
+      with spatial resampling enabled.
+
+      Fixed a decoder error introduced in Duclair where the segmentation
+      map was not being reinitialized on keyframes (#378)
+
+
+2012-01-27 v1.0.0 "Duclair"
+  Our fourth named release, focused on performance and features related to
+  real-time encoding. It also fixes a decoder crash bug introduced in
+  v0.9.7, so all users of that release are encouraged to upgrade.
+
+  - Upgrading:
+      This release is ABI incompatible with prior releases of libvpx, so the
+      "major" version number has been bumped to 1. You must recompile your
+      applications against the latest version of the libvpx headers. The
+      API remains compatible, and this should not require code changes in most
+      applications.
+
+  - Enhancements:
+      This release introduces several substantial new features to the encoder,
+      of particular interest to real time streaming applications.
+
+      Temporal scalability allows the encoder to produce a stream that can
+      be decimated to different frame rates, with independent rate targeting
+      for each substream.
+
+      Multiframe quality enhancement postprocessing can make visual quality
+      more consistent in the presence of frames that are substantially
+      different quality than the surrounding frames, as in the temporal
+      scalability case and in some forced keyframe scenarios.
+
+      Multiple-resolution encoding support allows the encoding of the
+      same content at different resolutions faster than encoding them
+      separately.
+
+  - Speed:
+      Optimization targets for this release included the decoder and the real-
+      time modes of the encoder. Decoder speed on x86 has improved 10.5% with
+      this release. Encoder improvements followed a curve where speeds 1-3
+      improved 4.0%-1.5%, speeds 4-8 improved <1%, and speeds 9-16 improved
+      1.5% to 10.5%, respectively. "Best" mode speed is consistent with the
+      Cayuga release.
+
+  - Quality:
+      Encoder quality in the single stream case is consistent with the Cayuga
+      release.
+
+  - Bug Fixes:
+      This release fixes an OOB read decoder crash bug present in v0.9.7
+      related to the clamping of motion vectors in SPLITMV blocks. This
+      behavior could be triggered by corrupt input or by starting
+      decoding from a P-frame.
+
+
+2011-08-15 v0.9.7-p1 "Cayuga" patch 1
+  This is an incremental bugfix release against Cayuga. All users of that
+  release are strongly encouraged to upgrade.
+
+    - Fix potential OOB reads (cdae03a)
+
+          An unbounded out of bounds read was discovered when the
+          decoder was requested to perform error concealment (new in
+          Cayuga) given a frame with corrupt partition sizes.
+
+          A bounded out of bounds read was discovered affecting all
+          versions of libvpx. Given a multipartition input frame that
+          is truncated between the mode/mv partition and the first
+          residual partition (in the block of partition offsets), up
+          to 3 extra bytes could have been read from the source buffer.
+          The code will not take any action regardless of the contents
+          of these undefined bytes, as the truncated buffer is detected
+          immediately following the read based on the calculated
+          starting position of the coefficient partition.
+
+    - Fix potential error concealment crash when the very first frame
+      is missing or corrupt (a609be5)
+
+    - Fix significant artifacts in error concealment (a4c2211, 99d870a)
+
+    - Revert 1-pass CBR rate control changes (e961317)
+      Further testing showed this change produced undesirable visual
+      artifacts, rolling back for now.
+
+
+2011-08-02 v0.9.7 "Cayuga"
+  Our third named release, focused on a faster, higher quality, encoder.
+
+  - Upgrading:
+    This release is backwards compatible with Aylesbury (v0.9.5) and
+    Bali (v0.9.6). Users of older releases should refer to the Upgrading
+    notes in this document for that release.
+
+  - Enhancements:
+          Stereo 3D format support for vpxenc
+          Runtime detection of available processor cores.
+          Allow specifying --end-usage by enum name
+          vpxdec: test for frame corruption
+          vpxenc: add quantizer histogram display
+          vpxenc: add rate histogram display
+          Set VPX_FRAME_IS_DROPPABLE
+          update configure for ios sdk 4.3
+          Avoid text relocations in ARM vp8 decoder
+          Generate a vpx.pc file for pkg-config.
+          New ways of passing encoded data between encoder and decoder.
+
+  - Speed:
+      This release includes across-the-board speed improvements to the
+      encoder. On x86, these measure at approximately 11.5% in Best mode,
+      21.5% in Good mode (speed 0), and 22.5% in Realtime mode (speed 6).
+      On ARM Cortex A9 with Neon extensions, real-time encoding of video
+      telephony content is 35% faster than Bali on single core and 48%
+      faster on multi-core. On the NVidia Tegra2 platform, real time
+      encoding is 40% faster than Bali.
+
+      Decoder speed was not a priority for this release, but improved
+      approximately 8.4% on x86.
+
+          Reduce motion vector search on alt-ref frame.
+          Encoder loopfilter running in its own thread
+          Reworked loopfilter to precalculate more parameters
+          SSE2/SSSE3 optimizations for build_predictors_mbuv{,_s}().
+          Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3.
+          Removed redundant checks
+          Reduced structure sizes
+          utilize preload in ARMv6 MC/LPF/Copy routines
+          ARM optimized quantization, dfct, variance, subtract
+          Increase chroma row alignment to 16 bytes.
+          disable trellis optimization for first pass
+          Write SSSE3 sub-pixel filter function
+          Improve SSE2 half-pixel filter functions
+          Add vp8_sub_pixel_variance16x8_ssse3 function
+          Reduce unnecessary distortion computation
+          Use diamond search to replace full search
+          Preload reference area in sub-pixel motion search (real-time mode)
+
+  - Quality:
+      This release focused primarily on one-pass use cases, including
+      video conferencing. Low latency data rate control was significantly
+      improved, improving streamability over bandwidth constrained links.
+      Added support for error concealment, allowing frames to maintain
+      visual quality in the presence of substantial packet loss.
+
+          Add rc_max_intra_bitrate_pct control
+          Limit size of initial keyframe in one-pass.
+          Improve framerate adaptation
+          Improved 1-pass CBR rate control
+          Improved KF insertion after fades to still.
+          Improved key frame detection.
+          Improved activity masking (lower PSNR impact for same SSIM boost)
+          Improved interaction between GF and ARFs
+          Adding error-concealment to the decoder.
+          Adding support for independent partitions
+          Adjusted rate-distortion constants
+
+
+  - Bug Fixes:
+          Removed firstpass motion map
+          Fix parallel make install
+          Fix multithreaded encoding for 1 MB wide frame
+          Fixed iwalsh_neon build problems with RVDS4.1
+          Fix semaphore emulation, spin-wait intrinsics on Windows
+          Fix build with xcode4 and simplify GLOBAL.
+          Mark ARM asm objects as allowing a non-executable stack.
+          Fix vpxenc encoding incorrect webm file header on big endian
+
+
+2011-03-07 v0.9.6 "Bali"
+  Our second named release, focused on a faster, higher quality, encoder.
+
+  - Upgrading:
+    This release is backwards compatible with Aylesbury (v0.9.5). Users
+    of older releases should refer to the Upgrading notes in this
+    document for that release.
+
+  - Enhancements:
+      vpxenc --psnr shows a summary when encode completes
+      --tune=ssim option to enable activity masking
+      improved postproc visualizations for development
+      updated support for Apple iOS to SDK 4.2
+      query decoder to determine which reference frames were updated
+      implemented error tracking in the decoder
+      fix pipe support on windows
+
+  - Speed:
+      Primary focus was on good quality mode, speed 0. Average improvement
+      on x86 about 40%, up to 100% on user-generated content at that speed.
+      Best quality mode speed improved 35%, and realtime speed 10-20%. This
+      release also saw significant improvement in realtime encoding speed
+      on ARM platforms.
+
+        Improved encoder threading
+        Don't pick encoder filter level when loopfilter is disabled.
+        Avoid double copying of key frames into alt and golden buffer
+        FDCT optimizations.
+        x86 sse2 temporal filter
+        SSSE3 version of fast quantizer
+        vp8_rd_pick_best_mbsegmentation code restructure
+        Adjusted breakout RD for SPLITMV
+        Changed segmentation check order
+        Improved rd_pick_intra4x4block
+        Adds armv6 optimized variance calculation
+        ARMv6 optimized sad16x16
+        ARMv6 optimized half pixel variance calculations
+        Full search SAD function optimization in SSE4.1
+        Improve MV prediction accuracy to achieve performance gain
+        Improve MV prediction in vp8_pick_inter_mode() for speed>3
+
+  - Quality:
+      Best quality mode improved PSNR 6.3%, and SSIM 6.1%. This release
+      also includes support for "activity masking," which greatly improves
+      SSIM at the expense of PSNR. For now, this feature is available with
+      the --tune=ssim option. Further experimentation in this area
+      is ongoing. This release also introduces a new rate control mode
+      called "CQ," which changes the allocation of bits within a clip to
+      the sections where they will have the most visual impact.
+
+        Tuning for the more exact quantizer.
+        Relax rate control for last few frames
+        CQ Mode
+        Limit key frame quantizer for forced key frames.
+        KF/GF Pulsing
+        Add simple version of activity masking.
+        make rdmult adaptive for intra in quantizer RDO
+        cap the best quantizer for 2nd order DC
+        change the threshold of DC check for encode breakout
+
+  - Bug Fixes:
+      Fix crash on Sparc Solaris.
+      Fix counter of fixed keyframe distance
+      ARNR filter pointer update bug fix
+      Fixed use of motion percentage in KF/GF group calc
+      Changed condition for using RD in Intra Mode
+      Fix encoder real-time only configuration.
+      Fix ARM encoder crash with multiple token partitions
+      Fixed bug where the first cluster timecode of a webm file was wrong.
+      Fixed various encoder bugs with odd-sized images
+      vp8e_get_preview fixed when spatial resampling enabled
+      quantizer: fix assertion in fast quantizer path
+      Allocate source buffers to be multiples of 16
+      Fix for manual Golden frame frequency
+      Fix drastic undershoot in long form content
+
+
+2010-10-28 v0.9.5 "Aylesbury"
+  Our first named release, focused on a faster decoder, and a better encoder.
+
+  - Upgrading:
+    This release incorporates backwards-incompatible changes to the
+    ivfenc and ivfdec tools. These tools are now called vpxenc and vpxdec.
+
+    vpxdec
+      * the -q (quiet) option has been removed, and replaced with
+        -v (verbose). the output is quiet by default. Use -v to see
+        the version number of the binary.
+
+      * The default behavior is now to write output to a single file
+        instead of individual frames. The -y option has been removed.
+        Y4M output is the default.
+
+      * For raw I420/YV12 output instead of Y4M, the --i420 or --yv12
+        options must be specified.
+
+          $ ivfdec -o OUTPUT INPUT
+          $ vpxdec --i420 -o OUTPUT INPUT
+
+      * If an output file is not specified, the default is to write
+        Y4M to stdout. This makes piping more natural.
+
+          $ ivfdec -y -o - INPUT | ...
+          $ vpxdec INPUT | ...
+
+      * The output file has additional flexibility for formatting the
+        filename. It supports escape characters for constructing a
+        filename from the width, height, and sequence number. This
+        replaces the -p option. To get the equivalent:
+
+          $ ivfdec -p frame INPUT
+          $ vpxdec --i420 -o frame-%wx%h-%4.i420 INPUT
+
+    vpxenc
+      * The output file must be specified with -o, rather than as the
+        last argument.
+
+          $ ivfenc <options> INPUT OUTPUT
+          $ vpxenc <options> -o OUTPUT INPUT
+
+      * The output defaults to webm. To get IVF output, use the --ivf
+        option.
+
+          $ ivfenc <options> INPUT OUTPUT.ivf
+          $ vpxenc <options> -o OUTPUT.ivf --ivf INPUT
+
+
+  - Enhancements:
+      ivfenc and ivfdec have been renamed to vpxenc, vpxdec.
+      vpxdec supports .webm input
+      vpxdec writes .y4m by default
+      vpxenc writes .webm output by default
+      vpxenc --psnr now shows the average/overall PSNR at the end
+      ARM platforms now support runtime cpu detection
+      vpxdec visualizations added for motion vectors, block modes, references
+      vpxdec now silent by default
+      vpxdec --progress shows frame-by-frame timing information
+      vpxenc supports the distinction between --fps and --timebase
+      NASM is now a supported assembler
+      configure: enable PIC for shared libs by default
+      configure: add --enable-small
+      configure: support for ppc32-linux-gcc
+      configure: support for sparc-solaris-gcc
+
+  - Bugs:
+      Improve handling of invalid frames
+      Fix valgrind errors in the NEON loop filters.
+      Fix loopfilter delta zero transitions
+      Fix valgrind errors in vp8_sixtap_predict8x4_armv6().
+      Build fixes for darwin-icc
+
+  - Speed:
+      20-40% (average 28%) improvement in libvpx decoder speed,
+      including:
+        Rewrite vp8_short_walsh4x4_sse2()
+        Optimizations on the loopfilters.
+        Miscellaneous improvements for Atom
+        Add 4-tap version of 2nd-pass ARMv6 MC filter.
+        Improved multithread utilization
+        Better instruction choices on x86
+        reorder data to use wider instructions
+        Update NEON wide idcts
+        Make block access to frame buffer sequential
+        Improved subset block search
+        Bilinear subpixel optimizations for ssse3.
+        Decrease memory footprint
+
+      Encoder speed improvements (percentage gain not measured):
+        Skip unnecessary search of identical frames
+        Add SSE2 subtract functions
+        Improve bounds checking in vp8_diamond_search_sadx4()
+        Added vp8_fast_quantize_b_sse2
+
+  - Quality:
+      Over 7% overall PSNR improvement (6.3% SSIM) in "best" quality
+      encoding mode, and up to 60% improvement on very noisy, still
+      or slow moving source video
+
+        Motion compensated temporal filter for Alt-Ref Noise Reduction
+        Improved use of trellis quantization on 2nd order Y blocks
+        Tune effect of motion on KF/GF boost in two pass
+        Allow coefficient optimization for good quality speed 0.
+        Improved control of active min quantizer for two pass.
+        Enable ARFs for non-lagged compress
+
+2010-09-02 v0.9.2
+  - Enhancements:
+      Disable frame dropping by default
+      Improved multithreaded performance
+      Improved Force Key Frame Behaviour
+      Increased rate control buffer level precision
+      Fix bug in 1st pass motion compensation
+      ivfenc: correct fixed kf interval, --disable-kf
+  - Speed:
+      Changed above and left context data layout
+      Rework idct calling structure.
+      Removed unnecessary MB_MODE_INFO copies
+      x86: SSSE3 sixtap prediction
+      Reworked IDCT to include reconstruction (add) step
+      Swap alt/gold/new/last frame buffer ptrs instead of copying.
+      Improve SSE2 loopfilter functions
+      Change bitreader to use a larger window.
+      Avoid loopfilter reinitialization when possible
+  - Quality:
+      Normalize quantizer's zero bin and rounding factors
+      Add trellis quantization.
+      Make the quantizer exact.
+      Updates to ARNR filtering algorithm
+      Fix breakout thresh computation for golden & AltRef frames
+      Redo the forward 4x4 dct
+      Improve the accuracy of forward walsh-hadamard transform
+      Further adjustment of RD behaviour with Q and Zbin.
+  - Build System:
+      Allow linking of libs built with MinGW to MSVC
+      Fix target auto-detection on mingw32
+      Allow --cpu= to work for x86.
+      configure: pass original arguments through to make dist
+      Fix builds without runtime CPU detection
+      msvs: fix install of codec sources
+      msvs: Change devenv.com command line for better msys support
+      msvs: Add vs9 targets.
+      Add x86_64-linux-icc target
+  - Bugs:
+      Potential crashes on older MinGW builds
+      Fix two-pass framerate for Y4M input.
+      Fixed simple loop filter, other crashes on ARM v6
+      arm: fix missing dependency with --enable-shared
+      configure: support directories containing .o
+      Replace pinsrw (SSE) with MMX instructions
+      apple: include proper mach primitives
+      Fixed rate control bug with long key frame interval.
+      Fix DSO link errors on x86-64 when not using a version script
+      Fixed buffer selection for UV in AltRef filtering
+
+
+2010-06-17 v0.9.1
+  - Enhancements:
+      * ivfenc/ivfdec now support YUV4MPEG2 input and pipe I/O
+      * Speed optimizations
+  - Bugfixes:
+      * Rate control
+      * Prevent out-of-bounds accesses on invalid data
+  - Build system updates:
+      * Detect toolchain to be used automatically for native builds
+      * Support building shared libraries
+      * Better autotools emulation (--prefix, --libdir, DESTDIR)
+  - Updated LICENSE
+      * http://webmproject.blogspot.com/2010/06/changes-to-webm-open-source-license.html
+
+
+2010-05-18 v0.9.0
+  - Initial open source release. Welcome to WebM and VP8!
+
diff --git a/libs/libvpx/LICENSE b/libs/libvpx/LICENSE
new file mode 100644
index 0000000000..1ce44343c4
--- /dev/null
+++ b/libs/libvpx/LICENSE
@@ -0,0 +1,31 @@
+Copyright (c) 2010, The WebM Project authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+
+  * Neither the name of Google, nor the WebM Project, nor the names
+    of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written
+    permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/libs/libvpx/PATENTS b/libs/libvpx/PATENTS
new file mode 100644
index 0000000000..caedf607e9
--- /dev/null
+++ b/libs/libvpx/PATENTS
@@ -0,0 +1,23 @@
+Additional IP Rights Grant (Patents)
+------------------------------------
+
+"These implementations" means the copyrightable works that implement the WebM
+codecs distributed by Google as part of the WebM Project.
+
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
+royalty-free, irrevocable (except as stated in this section) patent license to
+make, have made, use, offer to sell, sell, import, transfer, and otherwise
+run, modify and propagate the contents of these implementations of WebM, where
+such license applies only to those patent claims, both currently owned by
+Google and acquired in the future, licensable by Google that are necessarily
+infringed by these implementations of WebM. This grant does not include claims
+that would be infringed only as a consequence of further modification of these
+implementations. If you or your agent or exclusive licensee institute or order
+or agree to the institution of patent litigation or any other patent
+enforcement activity against any entity (including a cross-claim or
+counterclaim in a lawsuit) alleging that any of these implementations of WebM
+or any code incorporated within any of these implementations of WebM
+constitute direct or contributory patent infringement, or inducement of
+patent infringement, then any patent rights granted to you under this License
+for these implementations of WebM shall terminate as of the date such
+litigation is filed.
diff --git a/libs/libvpx/README b/libs/libvpx/README
new file mode 100644
index 0000000000..979440eb70
--- /dev/null
+++ b/libs/libvpx/README
@@ -0,0 +1,139 @@
+README - 23 March 2015
+
+Welcome to the WebM VP8/VP9 Codec SDK!
+
+COMPILING THE APPLICATIONS/LIBRARIES:
+  The build system used is similar to autotools. Building generally consists of
+  "configuring" with your desired build options, then using GNU make to build
+  the application.
+
+  1. Prerequisites
+
+    * All x86 targets require the Yasm[1] assembler be installed.
+    * All Windows builds require that Cygwin[2] be installed.
+    * Building the documentation requires Doxygen[3]. If you do not
+      have this package, the install-docs option will be disabled.
+    * Downloading the data for the unit tests requires curl[4] and sha1sum.
+      sha1sum is provided via the GNU coreutils, installed by default on
+      many *nix platforms, as well as MinGW and Cygwin. If coreutils is not
+      available, a compatible version of sha1sum can be built from
+      source[5]. These requirements are optional if not running the unit
+      tests.
+
+    [1]: http://www.tortall.net/projects/yasm
+    [2]: http://www.cygwin.com
+    [3]: http://www.doxygen.org
+    [4]: http://curl.haxx.se
+    [5]: http://www.microbrew.org/tools/md5sha1sum/
+
+  2. Out-of-tree builds
+  Out of tree builds are a supported method of building the application. For
+  an out of tree build, the source tree is kept separate from the object
+  files produced during compilation. For instance:
+
+    $ mkdir build
+    $ cd build
+    $ ../libvpx/configure <options>
+    $ make
+
+  3. Configuration options
+  The 'configure' script supports a number of options. The --help option can be
+  used to get a list of supported options:
+    $ ../libvpx/configure --help
+
+  4. Cross development
+  For cross development, the most notable option is the --target option. The
+  most up-to-date list of supported targets can be found at the bottom of the
+  --help output of the configure script. As of this writing, the list of
+  available targets is:
+
+    armv6-darwin-gcc
+    armv6-linux-rvct
+    armv6-linux-gcc
+    armv6-none-rvct
+    arm64-darwin-gcc
+    armv7-android-gcc
+    armv7-darwin-gcc
+    armv7-linux-rvct
+    armv7-linux-gcc
+    armv7-none-rvct
+    armv7-win32-vs11
+    armv7-win32-vs12
+    armv7-win32-vs14
+    armv7s-darwin-gcc
+    mips32-linux-gcc
+    mips64-linux-gcc
+    sparc-solaris-gcc
+    x86-android-gcc
+    x86-darwin8-gcc
+    x86-darwin8-icc
+    x86-darwin9-gcc
+    x86-darwin9-icc
+    x86-darwin10-gcc
+    x86-darwin11-gcc
+    x86-darwin12-gcc
+    x86-darwin13-gcc
+    x86-darwin14-gcc
+    x86-iphonesimulator-gcc
+    x86-linux-gcc
+    x86-linux-icc
+    x86-os2-gcc
+    x86-solaris-gcc
+    x86-win32-gcc
+    x86-win32-vs7
+    x86-win32-vs8
+    x86-win32-vs9
+    x86-win32-vs10
+    x86-win32-vs11
+    x86-win32-vs12
+    x86-win32-vs14
+    x86_64-android-gcc
+    x86_64-darwin9-gcc
+    x86_64-darwin10-gcc
+    x86_64-darwin11-gcc
+    x86_64-darwin12-gcc
+    x86_64-darwin13-gcc
+    x86_64-darwin14-gcc
+    x86_64-iphonesimulator-gcc
+    x86_64-linux-gcc
+    x86_64-linux-icc
+    x86_64-solaris-gcc
+    x86_64-win64-gcc
+    x86_64-win64-vs8
+    x86_64-win64-vs9
+    x86_64-win64-vs10
+    x86_64-win64-vs11
+    x86_64-win64-vs12
+    x86_64-win64-vs14
+    generic-gnu
+
+  The generic-gnu target, in conjunction with the CROSS environment variable,
+  can be used to cross compile architectures that aren't explicitly listed, if
+  the toolchain is a cross GNU (gcc/binutils) toolchain. Other POSIX toolchains
+  will likely work as well. For instance, to build using the mipsel-linux-uclibc
+  toolchain, the following command could be used (note, POSIX SH syntax, adapt
+  to your shell as necessary):
+
+    $ CROSS=mipsel-linux-uclibc- ../libvpx/configure
+
+  In addition, the executables to be invoked can be overridden by specifying the
+  environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be
+  passed to these executables with CFLAGS, LDFLAGS, and ASFLAGS.
+
+  5. Configuration errors
+  If the configuration step fails, the first step is to look in the error log.
+  This defaults to config.log. This should give a good indication of what went
+  wrong. If not, contact us for support.
+
+VP8/VP9 TEST VECTORS:
+  The test vectors can be downloaded and verified using the build system after
+  running configure. To specify an alternate directory the
+  LIBVPX_TEST_DATA_PATH environment variable can be used.
+
+  $ ./configure --enable-unit-tests
+  $ LIBVPX_TEST_DATA_PATH=../libvpx-test-data make testdata
+
+SUPPORT
+  This library is an open source project supported by its community. Please
+  email webm-discuss@webmproject.org for help.
+
diff --git a/libs/libvpx/args.c b/libs/libvpx/args.c
new file mode 100644
index 0000000000..14b031040a
--- /dev/null
+++ b/libs/libvpx/args.c
@@ -0,0 +1,236 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "args.h"
+
+#include "vpx_ports/msvc.h"
+
+/* die() is defined outside this file and takes a printf-style format.
+ * On GCC-compatible compilers it is declared noreturn, which tells the
+ * compiler that control does not come back from the error paths below. */
+#if defined(__GNUC__) && __GNUC__
+extern void die(const char *fmt, ...) __attribute__((noreturn));
+#else
+extern void die(const char *fmt, ...);
+#endif
+
+
+/* Build an argument cursor positioned at argv.
+ *
+ * The returned cursor has a step of one argv entry and carries no matched
+ * option yet: name, val and def are all NULL.
+ */
+struct arg arg_init(char **argv) {
+  struct arg cursor;
+
+  cursor.def       = NULL;
+  cursor.val       = NULL;
+  cursor.name      = NULL;
+  cursor.argv_step = 1;
+  cursor.argv      = argv;
+
+  return cursor;
+}
+
+/* Test whether argv[0] is the option described by def.
+ *
+ * Two spellings are recognised:
+ *   -<short_name> [value]     the value, if any, is the next argv entry
+ *   --<long_name>[=value]     the value, if any, follows the '='
+ *
+ * On a match, *arg_ receives the option name, its value (or NULL), the
+ * number of argv entries consumed, and a pointer to def, and 1 is returned.
+ * If the token is not this option, 0 is returned and *arg_ is untouched.
+ * A matched option with a missing or unexpected value is reported via die().
+ */
+int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) {
+  struct arg   found;
+  const char  *token = argv[0];
+
+  /* Only tokens that begin with '-' can be options. */
+  if (token == NULL || token[0] != '-')
+    return 0;
+
+  found = arg_init(argv);
+
+  if (def->short_name
+      && strlen(token) == strlen(def->short_name) + 1
+      && strcmp(token + 1, def->short_name) == 0) {
+    /* Short form: skip the single leading dash. */
+    found.name = token + 1;
+
+    if (def->has_val) {
+      found.val = argv[1];
+      found.argv_step = 2;
+    } else {
+      found.val = NULL;
+      found.argv_step = 1;
+    }
+  } else if (def->long_name) {
+    const size_t name_len = strlen(def->long_name);
+
+    if (strlen(token) >= name_len + 2
+        && token[1] == '-'
+        && strncmp(token + 2, def->long_name, name_len) == 0) {
+      /* The character right after "--<long_name>" must end the token or
+       * introduce a value; anything else is a different, longer option. */
+      const char terminator = token[name_len + 2];
+
+      if (terminator == '=' || terminator == '\0') {
+        found.name = token + 2;
+        found.val = terminator == '=' ? token + name_len + 3 : NULL;
+        found.argv_step = 1;
+      }
+    }
+  }
+
+  if (found.name == NULL)
+    return 0;
+
+  if (found.val == NULL && def->has_val)
+    die("Error: option %s requires argument.\n", found.name);
+
+  if (found.val != NULL && !def->has_val)
+    die("Error: option %s requires no argument.\n", found.name);
+
+  if (found.val != NULL || !def->has_val) {
+    found.def = def;
+    *arg_ = found;
+    return 1;
+  }
+
+  return 0;
+}
+
+
+/* Advance the cursor by its argv_step and return the entry it lands on.
+ *
+ * When the cursor already sits on a NULL entry it is left where it is and
+ * NULL is returned.
+ */
+const char *arg_next(struct arg *arg) {
+  char **position = arg->argv;
+
+  if (*position != NULL) {
+    position += arg->argv_step;
+    arg->argv = position;
+  }
+
+  return *position;
+}
+
+
+/* Make a shallow, NULL-terminated copy of the first argc entries of argv.
+ *
+ * Only the pointer array is duplicated; the strings themselves are shared
+ * with the input. The returned array comes from malloc(), so the caller
+ * releases it with free(). Returns NULL if the allocation fails.
+ */
+char **argv_dup(int argc, const char **argv) {
+  char **new_argv = malloc((argc + 1) * sizeof(*argv));
+
+  /* Without this check a failed malloc() would be handed to memcpy(). */
+  if (!new_argv)
+    return NULL;
+
+  memcpy(new_argv, argv, argc * sizeof(*argv));
+  new_argv[argc] = NULL;
+  return new_argv;
+}
+
+
+/* Print one usage line per option definition to fp.
+ *
+ * defs is an array of definition pointers terminated by a NULL pointer.
+ * Each line shows the short and/or long spelling (with an <arg> marker
+ * when the option takes a value) followed by the description. Options
+ * that carry an enum list get a second line listing the accepted names.
+ */
+void arg_show_usage(FILE *fp, const struct arg_def *const *defs) {
+  char option_text[40] = {0};
+  const struct arg_def *const *cursor;
+
+  for (cursor = defs; *cursor; cursor++) {
+    const struct arg_def *def = *cursor;
+    const char *short_val = "";
+    const char *long_val = "";
+
+    if (def->has_val) {
+      short_val = " <arg>";
+      long_val = "=<arg>";
+    }
+
+    if (def->short_name && def->long_name) {
+      /* Pad the separator when there is no " <arg>" so the long names
+       * line up between options with and without values. */
+      const char *comma = def->has_val ? "," : ",      ";
+
+      snprintf(option_text, 37, "-%s%s%s --%s%6s",
+               def->short_name, short_val, comma,
+               def->long_name, long_val);
+    } else if (def->short_name) {
+      snprintf(option_text, 37, "-%s%s",
+               def->short_name, short_val);
+    } else if (def->long_name) {
+      snprintf(option_text, 37, "          --%s%s",
+               def->long_name, long_val);
+    }
+
+    fprintf(fp, "  %-37s\t%s\n", option_text, def->desc);
+
+    if (def->enums) {
+      const struct arg_enum_list *entry = def->enums;
+
+      fprintf(fp, "  %-37s\t  ", "");
+
+      /* Names are comma separated; the last one ends the line. */
+      while (entry->name) {
+        fprintf(fp, "%s%s", entry->name,
+                entry[1].name ? ", " : "\n");
+        entry++;
+      }
+    }
+  }
+}
+
+
+/* Parse the option's value as a base-10 unsigned int.
+ *
+ * The whole value must be consumed by strtol() and must be non-empty.
+ * Values outside [0, UINT_MAX] and values with trailing characters are
+ * reported through die().
+ */
+unsigned int arg_parse_uint(const struct arg *arg) {
+  char *end;
+  const long int parsed = strtol(arg->val, &end, 10);
+  const int fully_numeric = arg->val[0] != '\0' && end[0] == '\0';
+
+  if (fully_numeric && parsed >= 0 && parsed <= UINT_MAX)
+    return (unsigned int)parsed;
+
+  if (fully_numeric)
+    die("Option %s: Value %ld out of range for unsigned int\n",
+        arg->name, parsed);
+
+  die("Option %s: Invalid character '%c'\n", arg->name, *end);
+  return 0;
+}
+
+
+/* Parse the option's value as a base-10 signed int.
+ *
+ * The whole value must be consumed by strtol() and must be non-empty.
+ * Values outside [INT_MIN, INT_MAX] and values with trailing characters
+ * are reported through die().
+ */
+int arg_parse_int(const struct arg *arg) {
+  char *end;
+  const long int parsed = strtol(arg->val, &end, 10);
+  const int fully_numeric = arg->val[0] != '\0' && end[0] == '\0';
+
+  if (fully_numeric && parsed >= INT_MIN && parsed <= INT_MAX)
+    return (int)parsed;
+
+  if (fully_numeric)
+    die("Option %s: Value %ld out of range for signed int\n",
+        arg->name, parsed);
+
+  die("Option %s: Invalid character '%c'\n", arg->name, *end);
+  return 0;
+}
+
+
+struct vpx_rational {
+  int num; /**< fraction numerator */
+  int den; /**< fraction denominator */
+};
+
+/* Parse the option's value as a fraction written "<num>/<den>".
+ *
+ * Both parts are read as base-10 integers and must fit in an int. A value
+ * without a '/', a part that is out of range, trailing characters after
+ * the denominator, or an empty denominator (e.g. "30/") are all reported
+ * through die().
+ */
+struct vpx_rational arg_parse_rational(const struct arg *arg) {
+  long int             rawval;
+  char                *endptr;
+  const char          *den_str;
+  struct vpx_rational  rat;
+
+  /* parse numerator */
+  rawval = strtol(arg->val, &endptr, 10);
+
+  if (arg->val[0] != '\0' && endptr[0] == '/') {
+    if (rawval >= INT_MIN && rawval <= INT_MAX)
+      rat.num = rawval;
+    else die("Option %s: Value %ld out of range for signed int\n",
+               arg->name, rawval);
+  } else die("Option %s: Expected / at '%c'\n", arg->name, *endptr);
+
+  /* parse denominator */
+  den_str = endptr + 1;
+  rawval = strtol(den_str, &endptr, 10);
+
+  /* Test the denominator text itself for emptiness. Testing arg->val here
+   * is always true at this point and let "30/" through with den == 0. */
+  if (den_str[0] != '\0' && endptr[0] == '\0') {
+    if (rawval >= INT_MIN && rawval <= INT_MAX)
+      rat.den = rawval;
+    else die("Option %s: Value %ld out of range for signed int\n",
+               arg->name, rawval);
+  } else die("Option %s: Invalid character '%c'\n", arg->name, *endptr);
+
+  return rat;
+}
+
+
+/* Resolve the option's value against the enum list of its definition.
+ *
+ * A fully numeric value is accepted if it equals the val of some list
+ * entry. Otherwise the value is compared, as a string, with each entry's
+ * name. If neither lookup succeeds the error is reported through die().
+ */
+int arg_parse_enum(const struct arg *arg) {
+  const struct arg_enum_list *entry;
+  char                       *end;
+  const long int              numeric = strtol(arg->val, &end, 10);
+
+  /* Numeric spelling: only valid when it names an existing entry. */
+  if (arg->val[0] != '\0' && end[0] == '\0') {
+    entry = arg->def->enums;
+    while (entry->name) {
+      if (entry->val == numeric)
+        return (int)numeric;
+      entry++;
+    }
+  }
+
+  /* Symbolic spelling: exact, case-sensitive name match. */
+  entry = arg->def->enums;
+  while (entry->name) {
+    if (strcmp(arg->val, entry->name) == 0)
+      return entry->val;
+    entry++;
+  }
+
+  die("Option %s: Invalid value '%s'\n", arg->name, arg->val);
+  return 0;
+}
+
+
+/* Parse the value as an enum when the option's definition has an enum
+ * list, and as a plain signed int otherwise. */
+int arg_parse_enum_or_int(const struct arg *arg) {
+  return arg->def->enums ? arg_parse_enum(arg) : arg_parse_int(arg);
+}
diff --git a/libs/libvpx/args.h b/libs/libvpx/args.h
new file mode 100644
index 0000000000..1f37151a02
--- /dev/null
+++ b/libs/libvpx/args.h
@@ -0,0 +1,60 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef ARGS_H_
+#define ARGS_H_
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct arg {
+  char                 **argv;      /* assumed: argv position (confirm) */
+  const char            *name;      /* option name shown in error messages */
+  const char            *val;       /* raw value text read by arg_parse_*() */
+  unsigned int           argv_step; /* assumed: argv slots consumed (confirm) */
+  const struct arg_def  *def;       /* option definition; supplies ->enums */
+};
+
+struct arg_enum_list {
+  const char *name;  /* symbolic spelling; NULL marks the end of a list */
+  int         val;   /* integer value returned when this entry matches */
+};
+#define ARG_ENUM_LIST_END {0}  /* zero-filled sentinel entry (name == NULL) */
+
+typedef struct arg_def {
+  const char *short_name;  /* assumed: short option spelling (confirm) */
+  const char *long_name;   /* assumed: long option spelling (confirm) */
+  int         has_val;     /* assumed: nonzero if a value follows (confirm) */
+  const char *desc;        /* assumed: help text for usage output (confirm) */
+  const struct arg_enum_list *enums;  /* NULL, or list ended by a NULL name */
+} arg_def_t;
+#define ARG_DEF(s,l,v,d) {s,l,v,d, NULL}      /* definition without enums */
+#define ARG_DEF_ENUM(s,l,v,d,e) {s,l,v,d,e}   /* definition with enum list e */
+#define ARG_DEF_LIST_END {0}                  /* zero-filled entry */
+
+struct arg arg_init(char **argv);
+int arg_match(struct arg *arg_, const struct arg_def *def, char **argv);
+const char *arg_next(struct arg *arg);
+void arg_show_usage(FILE *fp, const struct arg_def *const *defs);
+char **argv_dup(int argc, const char **argv);
+
+unsigned int arg_parse_uint(const struct arg *arg);
+int arg_parse_int(const struct arg *arg);
+struct vpx_rational arg_parse_rational(const struct arg *arg); /* "num/den" */
+int arg_parse_enum(const struct arg *arg);         /* listed number or name */
+int arg_parse_enum_or_int(const struct arg *arg);  /* enum if def has enums */
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // ARGS_H_
diff --git a/libs/libvpx/build/.gitattributes b/libs/libvpx/build/.gitattributes
new file mode 100644
index 0000000000..03db79bc08
--- /dev/null
+++ b/libs/libvpx/build/.gitattributes
@@ -0,0 +1,2 @@
+*-vs8/*.rules -crlf
+*-msvs/*.rules -crlf
diff --git a/libs/libvpx/build/.gitignore b/libs/libvpx/build/.gitignore
new file mode 100644
index 0000000000..1350fcb5eb
--- /dev/null
+++ b/libs/libvpx/build/.gitignore
@@ -0,0 +1 @@
+x86*-win32-vs*
diff --git a/libs/libvpx/build/make/Android.mk b/libs/libvpx/build/make/Android.mk
new file mode 100644
index 0000000000..df01dece67
--- /dev/null
+++ b/libs/libvpx/build/make/Android.mk
@@ -0,0 +1,205 @@
+##
+##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+#
+# This file is to be used for compiling libvpx for Android using the NDK.
+# In an Android project place a libvpx checkout in the jni directory.
+# Run the configure script from the jni directory.  Base libvpx
+# encoder/decoder configuration will look similar to:
+# ./libvpx/configure --target=armv7-android-gcc --disable-examples \
+#                    --sdk-path=/opt/android-ndk-r6b/
+#
+# When targeting Android, realtime-only is enabled by default.  This can
+# be overridden by adding the command line flag:
+#  --disable-realtime-only
+#
+# This will create .mk files that contain variables that contain the
+# source files to compile.
+#
+# Place an Android.mk file in the jni directory that references the
+# Android.mk file in the libvpx directory:
+# LOCAL_PATH := $(call my-dir)
+# include $(CLEAR_VARS)
+# include jni/libvpx/build/make/Android.mk
+#
+# There are currently two TARGET_ARCH_ABI targets for ARM.
+# armeabi and armeabi-v7a.  armeabi-v7a is selected by creating an
+# Application.mk in the jni directory that contains:
+# APP_ABI := armeabi-v7a
+#
+# By default libvpx will detect at runtime the existence of the NEON extension.
+# For this we import the 'cpufeatures' module from the NDK sources.
+# libvpx can also be configured without this runtime detection method.
+# Configuring with --disable-runtime-cpu-detect will assume presence of NEON.
+# Configuring with --disable-runtime-cpu-detect --disable-neon \
+#     --disable-neon-asm
+# will remove any NEON dependency.
+
+# To change to building armeabi, run ./libvpx/configure again, but with
+# --target=armv6-android-gcc and modify the Application.mk file to
+# set APP_ABI := armeabi
+#
+# Running ndk-build will build libvpx and include it in your project.
+#
+
+CONFIG_DIR := $(LOCAL_PATH)/
+LIBVPX_PATH := $(LOCAL_PATH)/libvpx
+ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas
+ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL)
+
+# Use the makefiles generated by upstream configure to determine which files to
+# build. Also set any architecture-specific flags.
+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+  include $(CONFIG_DIR)libs-armv7-android-gcc.mk
+  LOCAL_ARM_MODE := arm
+else ifeq  ($(TARGET_ARCH_ABI),armeabi)
+  include $(CONFIG_DIR)libs-armv6-android-gcc.mk
+  LOCAL_ARM_MODE := arm
+else ifeq  ($(TARGET_ARCH_ABI),arm64-v8a)
+  include $(CONFIG_DIR)libs-armv8-android-gcc.mk
+  LOCAL_ARM_MODE := arm
+else ifeq ($(TARGET_ARCH_ABI),x86)
+  include $(CONFIG_DIR)libs-x86-android-gcc.mk
+else ifeq ($(TARGET_ARCH_ABI),x86_64)
+  include $(CONFIG_DIR)libs-x86_64-android-gcc.mk
+else ifeq ($(TARGET_ARCH_ABI),mips)
+  include $(CONFIG_DIR)libs-mips-android-gcc.mk
+else
+  $(error Not a supported TARGET_ARCH_ABI: $(TARGET_ARCH_ABI))
+endif
+
+# Rule that is normally in Makefile created by libvpx
+# configure.  Used to filter out source files based on configuration.
+enabled=$(filter-out $($(1)-no),$($(1)-yes))
+
+# Override the relative path that is defined by the libvpx
+# configure process
+SRC_PATH_BARE := $(LIBVPX_PATH)
+
+# Include the list of files to be built
+include $(LIBVPX_PATH)/libs.mk
+
+# Optimise the code. May want to revisit this setting in the future.
+LOCAL_CFLAGS := -O3
+
+# For x86, include the source code in the search path so it will find files
+# like x86inc.asm and x86_abi_support.asm
+LOCAL_ASMFLAGS := -I$(LIBVPX_PATH)
+
+.PRECIOUS: %.asm.s
+$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm
+	@mkdir -p $(dir $@)
+	@$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@
+
+# For building *_rtcd.h, which have rules in libs.mk
+TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN)))
+target := libs
+
+LOCAL_SRC_FILES += vpx_config.c
+
+# Remove duplicate entries
+CODEC_SRCS_UNIQUE = $(sort $(CODEC_SRCS))
+
+# Pull out C files.  vpx_config.c is in the immediate directory and
+# so it does not need libvpx/ prefixed like the rest of the source files.
+# The neon files with intrinsics need to have .neon appended so the proper
+# flags are applied.
+CODEC_SRCS_C = $(filter %.c, $(CODEC_SRCS_UNIQUE))
+LOCAL_NEON_SRCS_C = $(filter %_neon.c, $(CODEC_SRCS_C))
+LOCAL_CODEC_SRCS_C = $(filter-out vpx_config.c %_neon.c, $(CODEC_SRCS_C))
+
+LOCAL_SRC_FILES += $(foreach file, $(LOCAL_CODEC_SRCS_C), libvpx/$(file))
+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+  LOCAL_SRC_FILES += $(foreach file, $(LOCAL_NEON_SRCS_C), libvpx/$(file).neon)
+else # If there are neon sources then we are building for arm64 and do not need to specify .neon
+  LOCAL_SRC_FILES += $(foreach file, $(LOCAL_NEON_SRCS_C), libvpx/$(file))
+endif
+
+# Pull out assembly files, splitting NEON from the rest.  This is
+# done to specify that the NEON assembly files use NEON assembler flags.
+# x86 assembly matches %.asm, arm matches %.asm.s
+
+# x86:
+
+CODEC_SRCS_ASM_X86 = $(filter %.asm, $(CODEC_SRCS_UNIQUE))
+LOCAL_SRC_FILES += $(foreach file, $(CODEC_SRCS_ASM_X86), libvpx/$(file))
+
+# arm:
+CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE))
+CODEC_SRCS_ASM_ARM = $(foreach v, \
+                     $(CODEC_SRCS_ASM_ARM_ALL), \
+                     $(if $(findstring neon,$(v)),,$(v)))
+CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.s, \
+                         $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+                         $(CODEC_SRCS_ASM_ARM))
+LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS)
+
+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+  CODEC_SRCS_ASM_NEON = $(foreach v, \
+                        $(CODEC_SRCS_ASM_ARM_ALL),\
+                        $(if $(findstring neon,$(v)),$(v),))
+  CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.s, \
+                                $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+                                $(CODEC_SRCS_ASM_NEON))
+  LOCAL_SRC_FILES += $(patsubst %.s, \
+                     %.s.neon, \
+                     $(CODEC_SRCS_ASM_NEON_ADS2GAS))
+endif
+
+LOCAL_CFLAGS += \
+    -DHAVE_CONFIG_H=vpx_config.h \
+    -I$(LIBVPX_PATH) \
+    -I$(ASM_CNV_PATH)
+
+LOCAL_MODULE := libvpx
+
+ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
+  LOCAL_STATIC_LIBRARIES := cpufeatures
+endif
+
+# Add a dependency to force generation of the RTCD files.
+define rtcd_dep_template
+rtcd_dep_template_SRCS := $(addprefix $(LOCAL_PATH)/, $(LOCAL_SRC_FILES))
+rtcd_dep_template_SRCS := $$(rtcd_dep_template_SRCS:.neon=)
+ifeq ($(CONFIG_VP8), yes)
+$$(rtcd_dep_template_SRCS): vp8_rtcd.h
+endif
+ifeq ($(CONFIG_VP9), yes)
+$$(rtcd_dep_template_SRCS): vp9_rtcd.h
+endif
+ifeq ($(CONFIG_VP10), yes)
+$$(rtcd_dep_template_SRCS): vp10_rtcd.h
+endif
+$$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h
+$$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h
+
+ifneq ($(findstring $(TARGET_ARCH_ABI),x86 x86_64),)
+$$(rtcd_dep_template_SRCS): vpx_config.asm
+endif
+endef
+
+$(eval $(call rtcd_dep_template))
+
+.PHONY: clean
+clean:
+	@echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]"
+	@$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
+	@$(RM) -r $(ASM_CNV_PATH)
+	@$(RM) $(CLEAN-OBJS)
+
+ifeq ($(ENABLE_SHARED),1)
+  include $(BUILD_SHARED_LIBRARY)
+else
+  include $(BUILD_STATIC_LIBRARY)
+endif
+
+ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
+$(call import-module,cpufeatures)
+endif
diff --git a/libs/libvpx/build/make/Makefile b/libs/libvpx/build/make/Makefile
new file mode 100644
index 0000000000..3081a92680
--- /dev/null
+++ b/libs/libvpx/build/make/Makefile
@@ -0,0 +1,455 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+include config.mk
+quiet?=true
+ifeq ($(target),)
+# If a target wasn't specified, invoke for all enabled targets.
+.DEFAULT:
+	@for t in $(ALL_TARGETS); do \
+	     $(MAKE) --no-print-directory target=$$t $(MAKECMDGOALS) || exit $$?;\
+        done
+all: .DEFAULT
+clean:: .DEFAULT
+exampletest: .DEFAULT
+install:: .DEFAULT
+test:: .DEFAULT
+test-no-data-check:: .DEFAULT
+testdata:: .DEFAULT
+utiltest: .DEFAULT
+exampletest-no-data-check utiltest-no-data-check: .DEFAULT
+
+
+# Note: md5sum is not installed on OS X, but openssl is. Openssl may not be
+# installed on cygwin, so we need to autodetect here.
+md5sum := $(firstword $(wildcard \
+          $(foreach e,md5sum openssl,\
+          $(foreach p,$(subst :, ,$(PATH)),$(p)/$(e)*))\
+          ))
+md5sum := $(if $(filter %openssl,$(md5sum)),$(md5sum) dgst -md5,$(md5sum))
+
+TGT_CC:=$(word 3, $(subst -, ,$(TOOLCHAIN)))
+dist:
+	@for t in $(ALL_TARGETS); do \
+	     $(MAKE) --no-print-directory target=$$t $(MAKECMDGOALS) || exit $$?;\
+        done
+        # Run configure for the user with the current toolchain.
+	@if [ -d "$(DIST_DIR)/src" ]; then \
+            mkdir -p "$(DIST_DIR)/build"; \
+            cd "$(DIST_DIR)/build"; \
+            echo "Rerunning configure $(CONFIGURE_ARGS)"; \
+            ../src/configure $(CONFIGURE_ARGS); \
+            $(if $(filter vs%,$(TGT_CC)),make NO_LAUNCH_DEVENV=1;) \
+        fi
+	@if [ -d "$(DIST_DIR)" ]; then \
+            echo "    [MD5SUM] $(DIST_DIR)"; \
+	    cd $(DIST_DIR) && \
+	    $(md5sum) `find . -name md5sums.txt -prune -o -type f -print` \
+                | sed -e 's/MD5(\(.*\))= \([0-9a-f]\{32\}\)/\2  \1/' \
+                > md5sums.txt;\
+        fi
+endif
+
+# Since we invoke make recursively for multiple targets we need to include the
+# .mk file for the correct target, but only when $(target) is non-empty.
+ifneq ($(target),)
+include $(target)-$(TOOLCHAIN).mk
+endif
+BUILD_ROOT?=.
+VPATH=$(SRC_PATH_BARE)
+CFLAGS+=-I$(BUILD_PFX)$(BUILD_ROOT) -I$(SRC_PATH)
+CXXFLAGS+=-I$(BUILD_PFX)$(BUILD_ROOT) -I$(SRC_PATH)
+ASFLAGS+=-I$(BUILD_PFX)$(BUILD_ROOT)/ -I$(SRC_PATH)/
+DIST_DIR?=dist
+HOSTCC?=gcc
+TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN)))
+TGT_OS:=$(word 2, $(subst -, ,$(TOOLCHAIN)))
+TGT_CC:=$(word 3, $(subst -, ,$(TOOLCHAIN)))
+quiet:=$(if $(or $(verbose), $(V)),, yes)
+qexec=$(if $(quiet),@)
+
+# Cancel built-in implicit rules
+%: %.o
+%.asm:
+%.a:
+%: %.cc
+
+#
+# Common rules
+#
+.PHONY: all
+all:
+
+.PHONY: clean
+clean::
+	rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.s.o=.asm.s)
+	rm -f $(CLEAN-OBJS)
+
+# distclean is a command, not a file, so it must itself be declared phony.
+# Top-level run (no target): drop configure output; per-target run: its .mk.
+.PHONY: distclean
+distclean: clean
+	if [ -z "$(target)" ]; then \
+      rm -f Makefile config.log config.mk vpx_config.[hc] vpx_config.asm; \
+    else \
+      rm -f $(target)-$(TOOLCHAIN).mk; \
+    fi
+
+.PHONY: dist
+dist:
+.PHONY: exampletest
+exampletest:
+.PHONY: install
+install::
+.PHONY: test
+test::
+.PHONY: testdata
+testdata::
+.PHONY: utiltest
+utiltest:
+.PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check
+test-no-data-check::
+exampletest-no-data-check utiltest-no-data-check:
+
+# Add compiler flags for intrinsic files
+ifeq ($(TOOLCHAIN), x86-os2-gcc)
+STACKREALIGN=-mstackrealign
+else
+STACKREALIGN=
+endif
+
+$(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
+$(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
+$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 $(STACKREALIGN)
+$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 $(STACKREALIGN)
+$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 $(STACKREALIGN)
+$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 $(STACKREALIGN)
+$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 $(STACKREALIGN)
+$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 $(STACKREALIGN)
+$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 $(STACKREALIGN)
+$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 $(STACKREALIGN)
+$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(STACKREALIGN)
+$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(STACKREALIGN)
+$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(STACKREALIGN)
+$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(STACKREALIGN)
+$(BUILD_PFX)%vp9_reconintra.c.d: CFLAGS += $(STACKREALIGN)
+$(BUILD_PFX)%vp9_reconintra.c.o: CFLAGS += $(STACKREALIGN)
+
+$(BUILD_PFX)%.c.d: %.c
+	$(if $(quiet),@echo "    [DEP] $@")
+	$(qexec)mkdir -p $(dir $@)
+	$(qexec)$(CC) $(INTERNAL_CFLAGS) $(CFLAGS) -M $< | $(fmt_deps) > $@
+
+$(BUILD_PFX)%.c.o: %.c
+	$(if $(quiet),@echo "    [CC] $@")
+	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
+	$(qexec)$(CC) $(INTERNAL_CFLAGS) $(CFLAGS) -c -o $@ $<
+
+$(BUILD_PFX)%.cc.d: %.cc
+	$(if $(quiet),@echo "    [DEP] $@")
+	$(qexec)mkdir -p $(dir $@)
+	$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -M $< | $(fmt_deps) > $@
+
+$(BUILD_PFX)%.cc.o: %.cc
+	$(if $(quiet),@echo "    [CXX] $@")
+	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
+	$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -c -o $@ $<
+
+$(BUILD_PFX)%.cpp.d: %.cpp
+	$(if $(quiet),@echo "    [DEP] $@")
+	$(qexec)mkdir -p $(dir $@)
+	$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -M $< | $(fmt_deps) > $@
+
+$(BUILD_PFX)%.cpp.o: %.cpp
+	$(if $(quiet),@echo "    [CXX] $@")
+	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
+	$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -c -o $@ $<
+
+$(BUILD_PFX)%.asm.d: %.asm
+	$(if $(quiet),@echo "    [DEP] $@")
+	$(qexec)mkdir -p $(dir $@)
+	$(qexec)$(SRC_PATH_BARE)/build/make/gen_asm_deps.sh \
+            --build-pfx=$(BUILD_PFX) --depfile=$@ $(ASFLAGS) $< > $@
+
+$(BUILD_PFX)%.asm.o: %.asm
+	$(if $(quiet),@echo "    [AS] $@")
+	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
+	$(qexec)$(AS) $(ASFLAGS) -o $@ $<
+
+$(BUILD_PFX)%.s.d: %.s
+	$(if $(quiet),@echo "    [DEP] $@")
+	$(qexec)mkdir -p $(dir $@)
+	$(qexec)$(SRC_PATH_BARE)/build/make/gen_asm_deps.sh \
+            --build-pfx=$(BUILD_PFX) --depfile=$@ $(ASFLAGS) $< > $@
+
+$(BUILD_PFX)%.s.o: %.s
+	$(if $(quiet),@echo "    [AS] $@")
+	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
+	$(qexec)$(AS) $(ASFLAGS) -o $@ $<
+
+.PRECIOUS: %.c.S
+%.c.S: CFLAGS += -DINLINE_ASM
+$(BUILD_PFX)%.c.S: %.c
+	$(if $(quiet),@echo "    [GEN] $@")
+	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
+	$(qexec)$(CC) -S $(CFLAGS) -o $@ $<
+
+.PRECIOUS: %.asm.s
+$(BUILD_PFX)%.asm.s: %.asm
+	$(if $(quiet),@echo "    [ASM CONVERSION] $@")
+	$(qexec)mkdir -p $(dir $@)
+	$(qexec)$(ASM_CONVERSION) <$< >$@
+
+# If we're in debug mode, pretend we don't have GNU strip, to fall back to
+# the copy implementation
+HAVE_GNU_STRIP := $(if $(CONFIG_DEBUG),,$(HAVE_GNU_STRIP))
+ifeq ($(HAVE_GNU_STRIP),yes)
+# Older binutils strip global symbols not needed for relocation processing
+# when given --strip-unneeded. Using nm and awk to identify globals and
+# keep them caused command line length issues under mingw and segfaults in
+# test_libvpx were observed under OS/2: simply use --strip-debug.
+%.a: %_g.a
+	$(if $(quiet),@echo "    [STRIP] $@ < $<")
+	$(qexec)$(STRIP) --strip-debug \
+          -o $@ $<
+else
+%.a: %_g.a
+	$(if $(quiet),@echo "    [CP] $@ < $<")
+	$(qexec)cp $< $@
+endif
+
+#
+# Utility functions
+#
+pairmap=$(if $(strip $(2)),\
+    $(call $(1),$(word 1,$(2)),$(word 2,$(2)))\
+    $(call pairmap,$(1),$(wordlist 3,$(words $(2)),$(2)))\
+)
+
+enabled=$(filter-out $($(1)-no),$($(1)-yes))
+cond_enabled=$(if $(filter yes,$($(1))), $(call enabled,$(2)))
+
+find_file1=$(word 1,$(wildcard $(subst //,/,$(addsuffix /$(1),$(2)))))
+find_file=$(foreach f,$(1),$(call find_file1,$(strip $(f)),$(strip $(2))) )
+obj_pats=.c=.c.o $(AS_SFX)=$(AS_SFX).o .cc=.cc.o .cpp=.cpp.o
+objs=$(addprefix $(BUILD_PFX),$(foreach p,$(obj_pats),$(filter %.o,$(1:$(p))) ))
+
+install_map_templates=$(eval $(call install_map_template,$(1),$(2)))
+
+not=$(subst yes,no,$(1))
+
+ifeq ($(CONFIG_MSVS),yes)
+lib_file_name=$(1).lib
+else
+lib_file_name=lib$(1).a
+endif
+#
+# Rule Templates
+#
+define linker_template
+$(1): $(filter-out -%,$(2))
+$(1):
+	$(if $(quiet),@echo    "    [LD] $$@")
+	$(qexec)$$(LD) $$(strip $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -o $$@ $(2) $(3) $$(extralibs))
+endef
+define linkerxx_template
+$(1): $(filter-out -%,$(2))
+$(1):
+	$(if $(quiet),@echo    "    [LD] $$@")
+	$(qexec)$$(CXX) $$(strip $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -o $$@ $(2) $(3) $$(extralibs))
+endef
+# make-3.80 has a bug with expanding large input strings to the eval function,
+# which was triggered in some cases by the following component of
+# linker_template:
+#   $(1): $$(call find_file, $(patsubst -l%,lib%.a,$(filter -l%,$(2))),\
+#                           $$(patsubst -L%,%,$$(filter -L%,$$(LDFLAGS) $(2))))
+# This may be useful to revisit in the future (it tries to locate libraries
+# in a search path and add them as prerequisites).
+
+define install_map_template
+$(DIST_DIR)/$(1): $(2)
+	$(if $(quiet),@echo "    [INSTALL] $$@")
+	$(qexec)mkdir -p $$(dir $$@)
+	$(qexec)cp -p $$< $$@
+endef
+
+define archive_template
+# Not using a pattern rule here because we don't want to generate empty
+# archives when they are listed as a dependency in files not responsible
+# for creating them.
+$(1):
+	$(if $(quiet),@echo "    [AR] $$@")
+	$(qexec)$$(AR) $$(ARFLAGS) $$@ $$^
+endef
+
+define so_template
+# Not using a pattern rule here because we don't want to generate empty
+# archives when they are listed as a dependency in files not responsible
+# for creating them.
+#
+# This needs further abstraction for dealing with non-GNU linkers.
+$(1):
+	$(if $(quiet),@echo "    [LD] $$@")
+	$(qexec)$$(LD) -shared $$(LDFLAGS) \
+            -Wl,--no-undefined -Wl,-soname,$$(SONAME) \
+            -Wl,--version-script,$$(EXPORTS_FILE) -o $$@ \
+            $$(filter %.o,$$^) $$(extralibs)
+endef
+
+define dl_template
+# Not using a pattern rule here because we don't want to generate empty
+# archives when they are listed as a dependency in files not responsible
+# for creating them.
+$(1):
+	$(if $(quiet),@echo "    [LD] $$@")
+	$(qexec)$$(LD) -dynamiclib $$(LDFLAGS) \
+	    -exported_symbols_list $$(EXPORTS_FILE) \
+        -Wl,-headerpad_max_install_names,-compatibility_version,1.0,-current_version,$$(VERSION_MAJOR) \
+        -o $$@ \
+        $$(filter %.o,$$^) $$(extralibs)
+endef
+
+define dll_template
+# Not using a pattern rule here because we don't want to generate empty
+# archives when they are listed as a dependency in files not responsible
+# for creating them.
+$(1):
+	$(if $(quiet),@echo "    [LD] $$@")
+	$(qexec)$$(LD) -Zdll $$(LDFLAGS) \
+        -o $$@ \
+        $$(filter %.o,$$^) $$(extralibs) $$(EXPORTS_FILE)
+endef
+
+
+#
+# Get current configuration
+#
+ifneq ($(target),)
+include $(SRC_PATH_BARE)/$(target:-$(TOOLCHAIN)=).mk
+endif
+
+skip_deps := $(filter %clean,$(MAKECMDGOALS))
+skip_deps += $(findstring testdata,$(MAKECMDGOALS))
+ifeq ($(strip $(skip_deps)),)
+  ifeq ($(CONFIG_DEPENDENCY_TRACKING),yes)
+    # Older versions of make don't like -include directives with no arguments
+    ifneq ($(filter %.d,$(OBJS-yes:.o=.d)),)
+      -include $(filter %.d,$(OBJS-yes:.o=.d))
+    endif
+  endif
+endif
+
+#
+# Configuration dependent rules
+#
+$(call pairmap,install_map_templates,$(INSTALL_MAPS))
+
+DOCS=$(call cond_enabled,CONFIG_INSTALL_DOCS,DOCS)
+.docs: $(DOCS)
+	@touch $@
+
+INSTALL-DOCS=$(call cond_enabled,CONFIG_INSTALL_DOCS,INSTALL-DOCS)
+ifeq ($(MAKECMDGOALS),dist)
+INSTALL-DOCS+=$(call cond_enabled,CONFIG_INSTALL_DOCS,DIST-DOCS)
+endif
+.install-docs: .docs $(addprefix $(DIST_DIR)/,$(INSTALL-DOCS))
+	@touch $@
+
+clean::
+	rm -f .docs .install-docs $(DOCS)
+
+BINS=$(call enabled,BINS)
+.bins: $(BINS)
+	@touch $@
+
+INSTALL-BINS=$(call cond_enabled,CONFIG_INSTALL_BINS,INSTALL-BINS)
+ifeq ($(MAKECMDGOALS),dist)
+INSTALL-BINS+=$(call cond_enabled,CONFIG_INSTALL_BINS,DIST-BINS)
+endif
+.install-bins: .bins $(addprefix $(DIST_DIR)/,$(INSTALL-BINS))
+	@touch $@
+
+clean::
+	rm -f .bins .install-bins $(BINS)
+
+LIBS=$(call enabled,LIBS)
+.libs: $(LIBS)
+	@touch $@
+$(foreach lib,$(filter %_g.a,$(LIBS)),$(eval $(call archive_template,$(lib))))
+$(foreach lib,$(filter %so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
+$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
+$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dll,$(LIBS)),$(eval $(call dll_template,$(lib))))
+
+INSTALL-LIBS=$(call cond_enabled,CONFIG_INSTALL_LIBS,INSTALL-LIBS)
+ifeq ($(MAKECMDGOALS),dist)
+INSTALL-LIBS+=$(call cond_enabled,CONFIG_INSTALL_LIBS,DIST-LIBS)
+endif
+.install-libs: .libs $(addprefix $(DIST_DIR)/,$(INSTALL-LIBS))
+	@touch $@
+
+clean::
+	rm -f .libs .install-libs $(LIBS)
+
+ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
+PROJECTS=$(call enabled,PROJECTS)
+.projects: $(PROJECTS)
+	@touch $@
+
+INSTALL-PROJECTS=$(call cond_enabled,CONFIG_INSTALL_PROJECTS,INSTALL-PROJECTS)
+ifeq ($(MAKECMDGOALS),dist)
+INSTALL-PROJECTS+=$(call cond_enabled,CONFIG_INSTALL_PROJECTS,DIST-PROJECTS)
+endif
+.install-projects: .projects $(addprefix $(DIST_DIR)/,$(INSTALL-PROJECTS))
+	@touch $@
+
+clean::
+	rm -f .projects .install-projects $(PROJECTS)
+endif
+
+# If there are any source files to be distributed, then include the build
+# system too.
+ifneq ($(call enabled,DIST-SRCS),)
+    DIST-SRCS-yes            += configure
+    DIST-SRCS-yes            += build/make/configure.sh
+    DIST-SRCS-yes            += build/make/gen_asm_deps.sh
+    DIST-SRCS-yes            += build/make/Makefile
+    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_def.sh
+    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_proj.sh
+    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_sln.sh
+    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_vcxproj.sh
+    DIST-SRCS-$(CONFIG_MSVS)  += build/make/msvs_common.sh
+    DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
+    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas.pl
+    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas_apple.pl
+    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2armasm_ms.pl
+    DIST-SRCS-$(ARCH_ARM)    += build/make/thumb.pm
+    DIST-SRCS-yes            += $(target:-$(TOOLCHAIN)=).mk
+endif
+INSTALL-SRCS := $(call cond_enabled,CONFIG_INSTALL_SRCS,INSTALL-SRCS)
+ifeq ($(MAKECMDGOALS),dist)
+INSTALL-SRCS += $(call cond_enabled,CONFIG_INSTALL_SRCS,DIST-SRCS)
+endif
+.install-srcs: $(addprefix $(DIST_DIR)/src/,$(INSTALL-SRCS))
+	@touch $@
+
+clean::
+	rm -f .install-srcs
+
+ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
+    BUILD_TARGETS += .projects
+    INSTALL_TARGETS += .install-projects
+endif
+BUILD_TARGETS += .docs .libs .bins
+INSTALL_TARGETS += .install-docs .install-srcs .install-libs .install-bins
+all: $(BUILD_TARGETS)
+install:: $(INSTALL_TARGETS)
+dist: $(INSTALL_TARGETS)
+test::
diff --git a/libs/libvpx/build/make/ads2armasm_ms.pl b/libs/libvpx/build/make/ads2armasm_ms.pl
new file mode 100755
index 0000000000..2a2c470ff8
--- /dev/null
+++ b/libs/libvpx/build/make/ads2armasm_ms.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/env perl
+##
+##  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+use FindBin;
+use lib $FindBin::Bin;
+use thumb;
+
+print "; This file was created from a .asm file\n";
+print ";  using the ads2armasm_ms.pl script.\n";
+
+while (<STDIN>)
+{
+    undef $comment;                     # cleared per line; unused in this script
+    undef $line;                        # cleared per line; unused in this script
+    # Delete directives and rewrite spellings before the Thumb fix-ups.
+    s/REQUIRE8//;                       # delete the first REQUIRE8
+    s/PRESERVE8//;                      # delete the first PRESERVE8
+    s/^\s*ARM\s*$//;                    # empty a line holding only "ARM"
+    s/AREA\s+\|\|(.*)\|\|/AREA |$1|/;   # AREA ||name|| -> AREA |name|
+    s/qsubaddx/qsax/i;                  # mnemonic rename, case-insensitive
+    s/qaddsubx/qasx/i;                  # mnemonic rename, case-insensitive
+    # Helper from thumb.pm (not shown here); called with the line and flag 1.
+    thumb::FixThumbInstructions($_, 1);
+    # Applied after the helper: move the "ne" condition behind the size suffix.
+    s/ldrneb/ldrbne/i;
+    s/ldrneh/ldrhne/i;
+    s/^(\s*)ENDP.*/$&\n$1ALIGN 4/;      # follow ENDP with an indented ALIGN 4
+    # Emit the (possibly rewritten) line.
+    print;
+}
+
diff --git a/libs/libvpx/build/make/ads2gas.pl b/libs/libvpx/build/make/ads2gas.pl
new file mode 100755
index 0000000000..7272424af2
--- /dev/null
+++ b/libs/libvpx/build/make/ads2gas.pl
@@ -0,0 +1,236 @@
+#!/usr/bin/env perl
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+# ads2gas.pl
+# Author: Eric Fung (efung (at) acm.org)
+#
+# Convert ARM Developer Suite 1.0.1 syntax assembly source to GNU as format
+#
+# Usage: cat inputfile | perl ads2gas.pl > outputfile
+#
+
+use FindBin;
+use lib $FindBin::Bin;
+use thumb;
+
+# Command line: "-thumb" anywhere in the arguments selects Thumb output
+# (unified-syntax header plus thumb::FixThumbInstructions on each line);
+# every other argument is ignored.
+
+my $thumb = (grep { $_ eq "-thumb" } @ARGV) ? 1 : 0;
+
+print "@ This file was created from a .asm file\n";
+print "@  using the ads2gas.pl script.\n";
+print "\t.equ DO1STROUNDING, 0\n";
+if ($thumb) {
+    print "\t.syntax unified\n";
+    print "\t.thumb\n";
+}
+
+# Stack of procedure names.
+@proc_stack = ();
+
+while (<STDIN>)
+{
+    undef $comment;
+    undef $line;
+    $comment_char = ";";
+    $comment_sub = "@";
+
+    # Handle comments.
+    if (/$comment_char/)
+    {
+      $comment = "";
+      ($line, $comment) = /(.*?)$comment_char(.*)/;
+      $_ = $line;
+    }
+
+    # Load and store alignment
+    s/@/,:/g;
+
+    # Hexadecimal constants prefaced by 0x
+    s/#&/#0x/g;
+
+    # Convert :OR: to |
+    s/:OR:/ | /g;
+
+    # Convert :AND: to &
+    s/:AND:/ & /g;
+
+    # Convert :NOT: to ~
+    s/:NOT:/ ~ /g;
+
+    # Convert :SHL: to <<
+    s/:SHL:/ << /g;
+
+    # Convert :SHR: to >>
+    s/:SHR:/ >> /g;
+
+    # Convert ELSE to .else
+    s/\bELSE\b/.else/g;
+
+    # Convert ENDIF to .endif
+    s/\bENDIF\b/.endif/g;
+
+    # Convert ELSEIF to .elseif
+    s/\bELSEIF\b/.elseif/g;
+
+    # Convert LTORG to .ltorg
+    s/\bLTORG\b/.ltorg/g;
+
+    # Convert endfunc to nothing.
+    s/\bendfunc\b//ig;
+
+    # Convert FUNCTION to nothing.
+    s/\bFUNCTION\b//g;
+    s/\bfunction\b//g;
+
+    s/\bENTRY\b//g;
+    s/\bMSARMASM\b/0/g;
+    s/^\s+end\s+$//g;
+
+    # Convert IF :DEF: to .if
+    # gcc doesn't have the ability to do a conditional
+    # on whether a variable is defined, as IF :DEF: does on
+    # armasm, so convert it to a normal .if and then
+    # make sure to define a value elsewhere
+    if (s/\bIF :DEF:\b/.if /g)
+    {
+        s/=/==/g;
+    }
+
+    # Convert IF to .if
+    if (s/\bIF\b/.if/g)
+    {
+        s/=+/==/g;
+    }
+
+    # Convert INCLUDE file to .include "file"
+    s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/;
+
+    # Code directive (ARM vs Thumb)
+    s/CODE([0-9][0-9])/.code $1/;
+
+    # No AREA required
+    # But ALIGNs in AREA must be obeyed
+    s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
+    # If no ALIGN, strip the AREA and align to 4 bytes
+    s/^\s*AREA.*$/.text\n.p2align 2/;
+
+    # DCD to .long
+    # This one is for incoming symbols
+    s/DCD\s+\|(\w*)\|/.long $1/;
+
+    # DCW to .short
+    s/DCW\s+\|(\w*)\|/.short $1/;
+    s/DCW(.*)/.short $1/;
+
+    # Constants defined in scope
+    s/DCD(.*)/.long $1/;
+    s/DCB(.*)/.byte $1/;
+
+    # RN to .req
+    if (s/RN\s+([Rr]\d+|lr)/.req $1/)
+    {
+        print;
+        print "$comment_sub$comment\n" if defined $comment;
+        next;
+    }
+
+    # Make function visible to linker, and make additional symbol with
+    # prepended underscore
+    s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/;
+    s/IMPORT\s+\|([\$\w]*)\|/.global $1/;
+
+    s/EXPORT\s+([\$\w]*)/.global $1/;
+    s/export\s+([\$\w]*)/.global $1/;
+
+    # No vertical bars required; make additional symbol with prepended
+    # underscore
+    s/^\|(\$?\w+)\|/_$1\n\t$1:/g;
+
+    # Labels need trailing colon
+#   s/^(\w+)/$1:/ if !/EQU/;
+    # put the colon at the end of the line in the macro
+    s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
+
+    # ALIGN directive
+    s/\bALIGN\b/.balign/g;
+
+    if ($thumb) {
+        # ARM code - we force everything to thumb with the declaration in the header
+        s/\sARM//g;
+    } else {
+        # ARM code
+        s/\sARM/.arm/g;
+    }
+
+    # push/pop
+    s/(push\s+)(r\d+)/stmdb sp\!, \{$2\}/g;
+    s/(pop\s+)(r\d+)/ldmia sp\!, \{$2\}/g;
+
+    # NEON code
+    s/(vld1.\d+\s+)(q\d+)/$1\{$2\}/g;
+    s/(vtbl.\d+\s+[^,]+),([^,]+)/$1,\{$2\}/g;
+
+    if ($thumb) {
+        thumb::FixThumbInstructions($_, 0);
+    }
+
+    # eabi_attributes numerical equivalents can be found in the
+    # "ARM IHI 0045C" document.
+
+    # REQUIRE8 Stack is required to be 8-byte aligned
+    s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g;
+
+    # PRESERVE8 Stack 8-byte align is preserved
+    s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g;
+
+    # Use PROC and ENDP to give the symbols a .size directive.
+    # This makes them show up properly in debugging tools like gdb and valgrind.
+    if (/\bPROC\b/)
+    {
+        my $proc;
+        /^_([\.0-9A-Z_a-z]\w+)\b/;
+        $proc = $1;
+        push(@proc_stack, $proc) if ($proc);
+        s/\bPROC\b/@ $&/;
+    }
+    if (/\bENDP\b/)
+    {
+        my $proc;
+        s/\bENDP\b/@ $&/;
+        $proc = pop(@proc_stack);
+        $_ = "\t.size $proc, .-$proc".$_ if ($proc);
+    }
+
+    # EQU directive
+    s/(\S+\s+)EQU(\s+\S+)/.equ $1, $2/;
+
+    # Begin macro definition
+    if (/\bMACRO\b/) {
+        $_ = <STDIN>;
+        s/^/.macro/;
+        s/\$//g;                # remove formal param reference
+        s/;/@/g;                # change comment characters
+    }
+
+    # For macros, use \ to reference formal params
+    s/\$/\\/g;
+    s/\bMEND\b/.endm/;              # End macro definition
+    next if /^\s*END\s*$/;          # No need to tell it where to stop assembling
+    print;
+    print "$comment_sub$comment\n" if defined $comment;
+}
+
+# Mark that this object doesn't need an executable stack.
+printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n");
diff --git a/libs/libvpx/build/make/ads2gas_apple.pl b/libs/libvpx/build/make/ads2gas_apple.pl
new file mode 100755
index 0000000000..a82f3eba8e
--- /dev/null
+++ b/libs/libvpx/build/make/ads2gas_apple.pl
@@ -0,0 +1,235 @@
+#!/usr/bin/env perl
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+# ads2gas_apple.pl
+# Author: Eric Fung (efung (at) acm.org)
+#
+# Convert ARM Developer Suite 1.0.1 syntax assembly source to GNU as format
+#
+# Usage: cat inputfile | perl ads2gas_apple.pl > outputfile
+#
+
+my $chromium = 0;               # set to 1 by the -chromium command-line flag
+
+foreach my $arg (@ARGV) {
+    $chromium = 1 if ($arg eq "-chromium");
+}
+
+print "@ This file was created from a .asm file\n";
+print "@  using the ads2gas_apple.pl script.\n\n";
+print "\t.set WIDE_REFERENCE, 0\n";
+print "\t.set ARCHITECTURE, 5\n";
+print "\t.set DO1STROUNDING, 0\n";
+
+my %register_aliases;           # alias name => register, filled from RN lines
+my %macro_aliases;              # macro formal parameter => positional $0..$9
+
+my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8", "\$9");
+
+my @incoming_array;             # words of the macro prototype line being parsed
+
+my @imported_functions;         # names collected from IMPORT directives
+
+# Return a copy of the argument with leading and trailing whitespace removed.
+sub trim($)
+{
+    my ($text) = @_;
+    # Strip both ends in a single global substitution.
+    $text =~ s/^\s+|\s+$//g;
+    return $text;
+}
+
+while (<STDIN>)                 # translate one ADS source line per iteration
+{
+    # Load and store alignment
+    s/@/,:/g;
+
+    # Comment character
+    s/;/ @/g;
+
+    # Hexadecimal constants prefaced by 0x
+    s/#&/#0x/g;
+
+    # Convert :OR: to |
+    s/:OR:/ | /g;
+
+    # Convert :AND: to &
+    s/:AND:/ & /g;
+
+    # Convert :NOT: to ~
+    s/:NOT:/ ~ /g;
+
+    # Convert :SHL: to <<
+    s/:SHL:/ << /g;
+
+    # Convert :SHR: to >>
+    s/:SHR:/ >> /g;
+
+    # Convert ELSE to .else
+    s/\bELSE\b/.else/g;
+
+    # Convert ENDIF to .endif
+    s/\bENDIF\b/.endif/g;
+
+    # Convert ELSEIF to .elseif
+    s/\bELSEIF\b/.elseif/g;
+
+    # Convert LTORG to .ltorg
+    s/\bLTORG\b/.ltorg/g;
+
+    # Convert IF :DEF: to .if
+    # gcc doesn't have the ability to do a conditional
+    # if defined variable that is set by IF :DEF: on
+    # armasm, so convert it to a normal .if and then
+    # make sure to define a value elsewhere
+    if (s/\bIF :DEF:\b/.if /g)
+    {
+        s/=/==/g;
+    }
+
+    # Convert IF to .if
+    if (s/\bIF\b/.if/g)
+    {
+        s/=/==/g;
+    }
+
+    # Convert INCLUDE to .INCLUDE "file"
+    s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/;
+
+    # Code directive (ARM vs Thumb)
+    s/CODE([0-9][0-9])/.code $1/;
+
+    # No AREA required
+    # But ALIGNs in AREA must be obeyed
+    s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
+    # If no ALIGN, strip the AREA and align to 4 bytes
+    s/^\s*AREA.*$/.text\n.p2align 2/;
+
+    # DCD to .long
+    # This one is for incoming symbols
+    s/DCD\s+\|(\w*)\|/.long $1/;
+
+    # DCW to .short
+    s/DCW\s+\|(\w*)\|/.short $1/;
+    s/DCW(.*)/.short $1/;
+
+    # Constants defined in scope
+    s/DCD(.*)/.long $1/;
+    s/DCB(.*)/.byte $1/;
+
+    # Build a hash of all the register - alias pairs.
+    if (s/(.*)RN(.*)/$1 .req $2/g)
+    {
+        $register_aliases{trim($1)} = trim($2);
+        next;
+    }
+
+    while (($key, $value) = each(%register_aliases))
+    {
+        s/\b$key\b/$value/g;
+    }
+
+    # Make function visible to linker, and make additional symbol with
+    # prepended underscore
+    s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/;
+
+    # Prepend imported functions with _
+    if (s/IMPORT\s+\|([\$\w]*)\|/.globl $1/)
+    {
+        $function = trim($1);
+        push(@imported_functions, $function);
+    }
+
+    foreach $function (@imported_functions)
+    {
+        s/\Q$function\E/_$function/;   # \Q: names may contain '$', match literally
+    }
+
+    # No vertical bars required; make additional symbol with prepended
+    # underscore
+    s/^\|(\$?\w+)\|/_$1\n\t$1:/g;
+
+    # Labels need trailing colon
+#   s/^(\w+)/$1:/ if !/EQU/;
+    # put the colon at the end of the line in the macro
+    s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
+
+    # ALIGN directive
+    s/\bALIGN\b/.balign/g;
+
+    # Strip ARM
+    s/\sARM/@ ARM/g;
+
+    # Strip REQUIRE8
+    #s/\sREQUIRE8/@ REQUIRE8/g;
+    s/\sREQUIRE8/@ /g;
+
+    # Strip PRESERVE8
+    s/\sPRESERVE8/@ PRESERVE8/g;
+
+    # Strip PROC and ENDP
+    s/\bPROC\b/@/g;
+    s/\bENDP\b/@/g;
+
+    # EQU directive
+    s/(.*)EQU(.*)/.set $1, $2/;
+
+    # Begin macro definition
+    if (/\bMACRO\b/)
+    {
+        # Process next line down, which will be the macro definition
+        $_ = <STDIN>;
+
+        $trimmed = trim($_);
+
+        # remove commas that are separating list
+        $trimmed =~ s/,//g;
+
+        # string to array
+        @incoming_array = split(/\s+/, $trimmed);
+
+        print ".macro $incoming_array[0]\n";
+
+        # remove the first element, as that is the name of the macro
+        shift (@incoming_array);
+
+        @macro_aliases{@incoming_array} = @mapping_list;
+
+        next;
+    }
+
+    while (($key, $value) = each(%macro_aliases))
+    {
+        $key =~ s/\$/\\\$/;
+        s/$key\b/$value/g;
+    }
+
+    # For macros, use \ to reference formal params
+#   s/\$/\\/g;                  # End macro definition
+    s/\bMEND\b/.endm/;              # No need to tell it where to stop assembling
+    next if /^\s*END\s*$/;
+
+    # Clang used by Chromium differs slightly from clang in XCode in what it
+    # will accept in the assembly.
+    if ($chromium) {
+        s/qsubaddx/qsax/i;
+        s/qaddsubx/qasx/i;
+        s/ldrneb/ldrbne/i;
+        s/ldrneh/ldrhne/i;
+        s/(vqshrun\.s16 .*, \#)0$/${1}8/i;
+
+        # http://llvm.org/bugs/show_bug.cgi?id=16022
+        s/\.include/#include/;
+    }
+
+    print;
+}
diff --git a/libs/libvpx/build/make/armlink_adapter.sh b/libs/libvpx/build/make/armlink_adapter.sh
new file mode 100755
index 0000000000..75c342e97c
--- /dev/null
+++ b/libs/libvpx/build/make/armlink_adapter.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+verbose=0 on_of=0 args= outfile= infiles= libs= libpaths= libsearchpath=
+set -- $*                       # re-split every argument on whitespace
+for i; do                       # classify each gcc-style linker argument
+    if [ "$i" = "-o" ]; then
+        on_of=1                 # the next argument is the output file name
+    elif [ "$i" = "-v" ]; then
+        verbose=1
+    elif [ "$i" = "-g" ]; then
+        args="${args} --debug"
+    elif [ "$on_of" = "1" ]; then
+        outfile=$i
+        on_of=0
+    elif [ -f "$i" ]; then
+        infiles="$infiles $i"
+    elif [ "${i#-l}" != "$i" ]; then
+        libs="$libs ${i#-l}"
+    elif [ "${i#-L}" != "$i" ]; then
+        libpaths="${libpaths} ${i#-L}"
+    else
+        args="${args} ${i}"
+    fi
+    shift
+done
+
+# Absolutize library file names
+for f in $libs; do
+    found=0
+    for d in $libpaths; do
+        [ -f "$d/$f" ] && infiles="$infiles $d/$f" && found=1 && break
+        [ -f "$d/lib${f}.so" ] && infiles="$infiles $d/lib${f}.so" && found=1 && break
+        [ -f "$d/lib${f}.a" ] && infiles="$infiles $d/lib${f}.a" && found=1 && break
+    done
+    [ $found -eq 0 ] && infiles="$infiles $f"   # not found: pass the name through
+done
+for d in $libpaths; do          # join the -L directories with commas
+    [ -n "$libsearchpath" ] && libsearchpath="${libsearchpath},"
+    libsearchpath="${libsearchpath}$d"
+done
+
+cmd="armlink $args --userlibpath=$libsearchpath --output=$outfile $infiles"
+[ $verbose -eq 1 ] && echo $cmd
+$cmd
diff --git a/libs/libvpx/build/make/configure.sh b/libs/libvpx/build/make/configure.sh
new file mode 100644
index 0000000000..d888268dc8
--- /dev/null
+++ b/libs/libvpx/build/make/configure.sh
@@ -0,0 +1,1502 @@
+#!/bin/sh
+##
+##  configure.sh
+##
+##  This script is sourced by the main configure script and contains
+##  utility functions and other common bits that aren't strictly libvpx
+##  related.
+##
+##  This build system is based in part on the FFmpeg configure script.
+##
+
+
+#
+# Logging / Output Functions
+#
+die_unknown(){                  # report unrecognized option $1, clean up, exit 1
+  echo "Unknown option \"$1\"."
+  echo "See $0 --help for available options."
+  clean_temp_files
+  exit 1
+}
+
+die() {                         # print "$@" plus a hint about $logfile, clean up, exit 1
+  echo "$@"
+  echo
+  echo "Configuration failed. This could reflect a misconfiguration of your"
+  echo "toolchains, improper options selected, or another problem. If you"
+  echo "don't see any useful error messages above, the next step is to look"
+  echo "at the configure error log file ($logfile) to determine what"
+  echo "configure was trying to do when it died."
+  clean_temp_files
+  exit 1
+}
+
+log(){                          # append the arguments as one line to $logfile
+  echo "$@" >>"$logfile"
+}
+
+log_file(){                     # append file $1, line-numbered, between BEGIN/END markers
+  log BEGIN "$1"
+  cat -n "$1" >>"$logfile"
+  log END "$1"
+}
+
+log_echo() {                    # print the arguments to stdout and to the log
+  echo "$@"
+  log "$@"
+}
+
+fwrite () {                     # append the remaining arguments as a line to file $1
+  outfile=$1
+  shift
+  echo "$@" >> "${outfile}"
+}
+
+show_help_pre(){                # print usage text for the common build/install options
+  for opt in ${CMDLINE_SELECT}; do
+    opt2=`echo $opt | sed -e 's;_;-;g'`   # option name with '_' replaced by '-'
+    if enabled $opt; then       # show the switch that flips the current setting
+      eval "toggle_${opt}=\"--disable-${opt2}\""
+    else
+      eval "toggle_${opt}=\"--enable-${opt2} \""
+    fi
+  done
+
+  cat <<EOF
+Usage: configure [options]
+Options:
+
+Build options:
+  --help                      print this message
+  --log=yes|no|FILE           file configure log is written to [config.log]
+  --target=TARGET             target platform tuple [generic-gnu]
+  --cpu=CPU                   optimize for a specific cpu rather than a family
+  --extra-cflags=ECFLAGS      add ECFLAGS to CFLAGS [$CFLAGS]
+  --extra-cxxflags=ECXXFLAGS  add ECXXFLAGS to CXXFLAGS [$CXXFLAGS]
+  ${toggle_extra_warnings}    emit harmless warnings (always non-fatal)
+  ${toggle_werror}            treat warnings as errors, if possible
+                              (not available with all compilers)
+  ${toggle_optimizations}     turn on/off compiler optimization flags
+  ${toggle_pic}               turn on/off Position Independent Code
+  ${toggle_ccache}            turn on/off compiler cache
+  ${toggle_debug}             enable/disable debug mode
+  ${toggle_gprof}             enable/disable gprof profiling instrumentation
+  ${toggle_gcov}              enable/disable gcov coverage instrumentation
+  ${toggle_thumb}             enable/disable building arm assembly in thumb mode
+  ${toggle_dependency_tracking}
+                              disable to speed up one-time build
+
+Install options:
+  ${toggle_install_docs}      control whether docs are installed
+  ${toggle_install_bins}      control whether binaries are installed
+  ${toggle_install_libs}      control whether libraries are installed
+  ${toggle_install_srcs}      control whether sources are installed
+
+
+EOF
+}
+
+show_help_post(){               # print the notes and target list, then exit 1
+  cat <<EOF
+
+
+NOTES:
+    Object files are built at the place where configure is launched.
+
+    All boolean options can be negated. The default value is the opposite
+    of that shown above. If the option --disable-foo is listed, then
+    the default value for foo is enabled.
+
+Supported targets:
+EOF
+  show_targets ${all_platforms}
+  echo
+  exit 1
+}
+
+show_targets() {                # print the arguments in rows of up to three columns
+  while [ -n "$*" ]; do
+    if [ "${1%%-*}" = "${2%%-*}" ]; then     # same text before the first '-'
+      if [ "${2%%-*}" = "${3%%-*}" ]; then
+        printf "    %-24s %-24s %-24s\n" "$1" "$2" "$3"
+        shift; shift; shift
+      else
+        printf "    %-24s %-24s\n" "$1" "$2"
+        shift; shift
+      fi
+    else
+      printf "    %-24s\n" "$1"
+      shift
+    fi
+  done
+}
+
+show_help() {                   # full help; show_help_post ends with exit 1
+  show_help_pre
+  show_help_post
+}
+
+#
+# List Processing Functions
+#
+set_all(){                      # assign value $1 to every variable named after it
+  value=$1
+  shift
+  for var in $*; do
+    eval $var=$value
+  done
+}
+
+is_in(){                        # return 0 if $1 equals any later argument, else 1
+  value=$1
+  shift
+  for var in $*; do
+    [ "$var" = "$value" ] && return 0   # quoted: never parsed as test operators
+  done
+  return 1
+}
+
+add_cflags() {                  # append the arguments to both CFLAGS and CXXFLAGS
+  CFLAGS="${CFLAGS} $@"
+  CXXFLAGS="${CXXFLAGS} $@"
+}
+
+add_cflags_only() {             # append the arguments to CFLAGS only
+  CFLAGS="${CFLAGS} $@"
+}
+
+add_cxxflags_only() {           # append the arguments to CXXFLAGS only
+  CXXFLAGS="${CXXFLAGS} $@"
+}
+
+add_ldflags() {                 # append the arguments to LDFLAGS
+  LDFLAGS="${LDFLAGS} $@"
+}
+
+add_asflags() {                 # append the arguments to ASFLAGS
+  ASFLAGS="${ASFLAGS} $@"
+}
+
+add_extralibs() {               # append the arguments to extralibs
+  extralibs="${extralibs} $@"
+}
+
+#
+# Boolean Manipulation Functions
+#
+enable_feature(){               # set each named variable to "yes"
+  set_all yes $*
+}
+
+disable_feature(){              # set each named variable to "no"
+  set_all no $*
+}
+
+enabled(){                      # true only when variable $1 is exactly "yes"
+  eval test "x\$$1" = "xyes"
+}
+
+disabled(){                     # true only when variable $1 is exactly "no"
+  eval test "x\$$1" = "xno"
+}
+
+# Iterates through positional parameters, checks to confirm the parameter has
+# not been explicitly (force) disabled, and enables the setting controlled by
+# the parameter when the setting is not disabled.
+# Note: Does NOT alter RTCD generation options ($RTCD_OPTIONS).
+soft_enable() {
+  for var in $*; do
+    if ! disabled $var; then    # leave features that are set to "no" untouched
+      enabled $var || log_echo "  enabling $var"   # announce only on a change
+      enable_feature $var
+    fi
+  done
+}
+
+# Iterates through positional parameters, checks to confirm the parameter has
+# not been explicitly (force) enabled, and disables the setting controlled by
+# the parameter when the setting is not enabled.
+# Note: Does NOT alter RTCD generation options ($RTCD_OPTIONS).
+soft_disable() {
+  for var in $*; do
+    if ! enabled $var; then     # leave features that are set to "yes" untouched
+      disabled $var || log_echo "  disabling $var" # announce only on a change
+      disable_feature $var
+    fi
+  done
+}
+
+#
+# Text Processing Functions
+#
+toupper(){                      # echo the arguments with a-z mapped to A-Z
+  echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
+}
+
+tolower(){                      # echo the arguments with A-Z mapped to a-z
+  echo "$@" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz
+}
+
+#
+# Temporary File Functions
+#
+source_path=${0%/*}             # $0 with its last "/component" removed
+enable_feature source_path_used
+if [ -z "$source_path" ] || [ "$source_path" = "." ]; then
+  source_path="`pwd`"           # run from the source directory itself
+  disable_feature source_path_used
+fi
+
+if test ! -z "$TMPDIR" ; then   # temp directory: TMPDIR, then TEMPDIR, then /tmp
+  TMPDIRx="${TMPDIR}"
+elif test ! -z "$TEMPDIR" ; then
+  TMPDIRx="${TEMPDIR}"
+else
+  TMPDIRx="/tmp"
+fi
+RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}')  # NOTE(review): names built from PID + this value look guessable; confirm mktemp is not needed
+TMP_H="${TMPDIRx}/vpx-conf-$$-${RAND}.h"
+TMP_C="${TMPDIRx}/vpx-conf-$$-${RAND}.c"
+TMP_CC="${TMPDIRx}/vpx-conf-$$-${RAND}.cc"
+TMP_O="${TMPDIRx}/vpx-conf-$$-${RAND}.o"
+TMP_X="${TMPDIRx}/vpx-conf-$$-${RAND}.x"
+TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RAND}.asm"
+
+clean_temp_files() {            # remove every probe file created above
+  rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
+  enabled gcov && rm -f ${TMP_C%.c}.gcno ${TMP_CC%.cc}.gcno
+}
+
+#
+# Toolchain Check Functions
+#
+check_cmd() {                   # log the command line, then run it with output sent to the log
+  enabled external_build && return   # nothing is executed for external builds
+  log "$@"
+  "$@" >>${logfile} 2>&1
+}
+
+check_cc() {                    # compile the C program read from stdin with extra flags "$@"
+  log check_cc "$@"
+  cat >${TMP_C}
+  log_file ${TMP_C}
+  check_cmd ${CC} ${CFLAGS} "$@" -c -o ${TMP_O} ${TMP_C}
+}
+
+check_cxx() {                   # compile the C++ program read from stdin with extra flags "$@"
+  log check_cxx "$@"
+  cat >${TMP_CC}
+  log_file ${TMP_CC}
+  check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_CC}
+}
+
+check_cpp() {                   # run only the preprocessor (-E) on the program read from stdin
+  log check_cpp "$@"
+  cat > ${TMP_C}
+  log_file ${TMP_C}
+  check_cmd ${CC} ${CFLAGS} "$@" -E -o ${TMP_O} ${TMP_C}
+}
+
+check_ld() {                    # compile stdin with check_cc, then link the object with ${LD}
+  log check_ld "$@"
+  check_cc "$@" \
+    && check_cmd ${LD} ${LDFLAGS} "$@" -o ${TMP_X} ${TMP_O} ${extralibs}
+}
+
+check_header(){                 # enable a feature named after header $1 if it preprocesses
+  log check_header "$@"
+  header=$1
+  shift
+  var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`   # e.g. "a/b.h" becomes "a_b_h"
+  disable_feature $var          # assume absent until the probe succeeds
+  check_cpp "$@" <<EOF && enable_feature $var
+#include "$header"
+int x;
+EOF
+}
+
+check_cflags() {                # succeed if a trivial C file compiles with -Werror "$@"
+ log check_cflags "$@"
+ check_cc -Werror "$@" <<EOF
+int x;
+EOF
+}
+
+check_cxxflags() {              # succeed if a trivial C++ file compiles with -Werror "$@"
+  log check_cxxflags "$@"
+
+  # Catch CFLAGS that trigger CXX warnings
+  case "$CXX" in
+    *c++-analyzer|*clang++|*g++*)
+      check_cxx -Werror "$@" <<EOF
+int x;
+EOF
+      ;;
+    *)                          # NOTE(review): body is identical to the branch above
+      check_cxx -Werror "$@" <<EOF
+int x;
+EOF
+      ;;
+    esac
+}
+
+check_add_cflags() {            # add "$@" to CXXFLAGS and CFLAGS, each only if its probe passes
+  check_cxxflags "$@" && add_cxxflags_only "$@"
+  check_cflags "$@" && add_cflags_only "$@"
+}
+
+check_add_cxxflags() {          # add "$@" to CXXFLAGS only if the C++ probe passes
+  check_cxxflags "$@" && add_cxxflags_only "$@"
+}
+
+check_add_asflags() {           # logs and adds "$@" to ASFLAGS; no probe is run here
+  log add_asflags "$@"
+  add_asflags "$@"
+}
+
+check_add_ldflags() {           # logs and adds "$@" to LDFLAGS; no probe is run here
+  log add_ldflags "$@"
+  add_ldflags "$@"
+}
+
+check_asm_align() {             # die unless ${AS} honours "align 16" in .rodata
+  log check_asm_align "$@"
+  cat >${TMP_ASM} <<EOF
+section .rodata
+align 16
+EOF
+  log_file ${TMP_ASM}
+  check_cmd ${AS} ${ASFLAGS} -o ${TMP_O} ${TMP_ASM}
+  readelf -WS ${TMP_O} >${TMP_X}   # dump the section headers of the object
+  log_file ${TMP_X}
+  if ! grep -q '\.rodata .* 16$' ${TMP_X}; then   # .rodata line must end in 16
+    die "${AS} ${ASFLAGS} does not support section alignment (nasm <=2.08?)"
+  fi
+}
+
+# tests for -m$1 toggling the feature given in $2. If $2 is empty $1 is used.
+check_gcc_machine_option() {
+  opt="$1"
+  feature="$2"
+  [ -n "$feature" ] || feature="$opt"
+
+  if enabled gcc && ! disabled "$feature" && ! check_cflags "-m$opt"; then
+    RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature "   # gcc rejected -m$opt
+  else
+    soft_enable "$feature"      # also taken when gcc is off or the feature is disabled
+  fi
+}
+
+write_common_config_banner() {  # start config.mk: license, banner and TOOLCHAIN
+  print_webm_license config.mk "##" ""
+  echo '# This file automatically generated by configure. Do not edit!' >> config.mk
+  echo "TOOLCHAIN := ${toolchain}" >> config.mk
+
+  case ${toolchain} in
+    *-linux-rvct)               # only this toolchain family records ALT_LIBC
+      echo "ALT_LIBC := ${alt_libc}" >> config.mk
+      ;;
+  esac
+}
+
+write_common_config_targets() { # append an ALL_TARGETS line per enabled target
+  for t in ${all_targets}; do
+    if enabled ${t}; then
+      if enabled child; then    # child configures suffix the toolchain name
+        fwrite config.mk "ALL_TARGETS += ${t}-${toolchain}"
+      else
+        fwrite config.mk "ALL_TARGETS += ${t}"
+      fi
+    fi
+    true;
+  done
+  true
+}
+
+write_common_target_config_mk() {   # write the per-target makefile fragment to file $1
+  saved_CC="${CC}"              # restored at the end of the function
+  saved_CXX="${CXX}"
+  enabled ccache && CC="ccache ${CC}"     # the written CC/CXX carry the ccache prefix
+  enabled ccache && CXX="ccache ${CXX}"
+  print_webm_license $1 "##" ""
+
+  cat >> $1 << EOF
+# This file automatically generated by configure. Do not edit!
+SRC_PATH="$source_path"
+SRC_PATH_BARE=$source_path
+BUILD_PFX=${BUILD_PFX}
+TOOLCHAIN=${toolchain}
+ASM_CONVERSION=${asm_conversion_cmd:-${source_path}/build/make/ads2gas.pl}
+GEN_VCPROJ=${gen_vcproj_cmd}
+MSVS_ARCH_DIR=${msvs_arch_dir}
+
+CC=${CC}
+CXX=${CXX}
+AR=${AR}
+LD=${LD}
+AS=${AS}
+STRIP=${STRIP}
+NM=${NM}
+
+CFLAGS  = ${CFLAGS}
+CXXFLAGS  = ${CXXFLAGS}
+ARFLAGS = -crs\$(if \$(quiet),,v)
+LDFLAGS = ${LDFLAGS}
+ASFLAGS = ${ASFLAGS}
+extralibs = ${extralibs}
+AS_SFX    = ${AS_SFX:-.asm}
+EXE_SFX   = ${EXE_SFX}
+VCPROJ_SFX = ${VCPROJ_SFX}
+RTCD_OPTIONS = ${RTCD_OPTIONS}
+EOF
+
+  if enabled rvct; then cat >> $1 << EOF
+fmt_deps = sed -e 's;^__image.axf;\${@:.d=.o} \$@;' #hide
+EOF
+  else cat >> $1 << EOF
+fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\${@:.d=.o} \$@;'
+EOF
+  fi
+
+  print_config_mk ARCH   "${1}" ${ARCH_LIST}
+  print_config_mk HAVE   "${1}" ${HAVE_LIST}
+  print_config_mk CONFIG "${1}" ${CONFIG_LIST}
+  print_config_mk HAVE   "${1}" gnu_strip
+
+  enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}"
+
+  CC="${saved_CC}"              # undo the ccache prefix for the rest of configure
+  CXX="${saved_CXX}"
+}
+
+write_common_target_config_h() {    # generate the config header and install it as file $1
+  print_webm_license "${TMP_H}" "/*" " */"
+  cat >> "${TMP_H}" << EOF
+/* This file automatically generated by configure. Do not edit! */
+#ifndef VPX_CONFIG_H
+#define VPX_CONFIG_H
+#define RESTRICT    ${RESTRICT}
+#define INLINE      ${INLINE}
+EOF
+  print_config_h ARCH   "${TMP_H}" ${ARCH_LIST}
+  print_config_h HAVE   "${TMP_H}" ${HAVE_LIST}
+  print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST}
+  print_config_vars_h   "${TMP_H}" ${VAR_LIST}
+  echo "#endif /* VPX_CONFIG_H */" >> "${TMP_H}"
+  mkdir -p "$(dirname "$1")"    # quoted so a path containing spaces stays one word
+  cmp "$1" "${TMP_H}" >/dev/null 2>&1 || mv "${TMP_H}" "$1"   # replace only when changed
+}
+
+process_common_cmdline() {      # apply each common configure option given in "$@"
+  for opt in "$@"; do
+    optval="${opt#*=}"          # text after the first '='; the whole option if none
+    case "$opt" in
+      --child)
+        enable_feature child
+        ;;
+      --log*)
+        logging="$optval"
+        if ! disabled logging ; then
+          enabled logging || logfile="$logging"   # any value but yes/no is a file name
+        else
+          logfile=/dev/null
+        fi
+        ;;
+      --target=*)
+        toolchain="${toolchain:-${optval}}"       # an already-set toolchain wins
+        ;;
+      --force-target=*)
+        toolchain="${toolchain:-${optval}}"
+        enable_feature force_toolchain
+        ;;
+      --cpu=*)
+        tune_cpu="$optval"
+        ;;
+      --extra-cflags=*)
+        extra_cflags="${optval}"
+        ;;
+      --extra-cxxflags=*)
+        extra_cxxflags="${optval}"
+        ;;
+      --enable-?*|--disable-?*)
+        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`   # sets $action and $option
+        if echo "${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null; then
+          [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
+        elif [ $action = "disable" ] && ! disabled $option ; then
+          echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
+            die_unknown $opt
+          log_echo "  disabling $option"
+        elif [ $action = "enable" ] && ! enabled $option ; then
+          echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
+            die_unknown $opt
+          log_echo "  enabling $option"
+        fi
+        ${action}_feature $option
+        ;;
+      --require-?*)
+        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
+        if echo "${ARCH_EXT_LIST}" none | grep "^ *$option\$" >/dev/null; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}${opt} "
+        else
+            die_unknown $opt
+        fi
+        ;;
+      --force-enable-?*|--force-disable-?*)
+        eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'`
+        ${action}_feature $option   # no CMDLINE_SELECT check on the forced path
+        ;;
+      --libc=*)
+        [ -d "${optval}" ] || die "Not a directory: ${optval}"
+        disable_feature builtin_libc
+        alt_libc="${optval}"
+        ;;
+      --as=*)
+        [ "${optval}" = yasm ] || [ "${optval}" = nasm ] \
+          || [ "${optval}" = auto ] \
+          || die "Must be yasm, nasm or auto: ${optval}"
+        alt_as="${optval}"
+        ;;
+      --size-limit=*)
+        w="${optval%%x*}"       # WxH: text before the first 'x'
+        h="${optval##*x}"       # WxH: text after the last 'x'
+        VAR_LIST="DECODE_WIDTH_LIMIT ${w} DECODE_HEIGHT_LIMIT ${h}"
+        [ ${w} -gt 0 ] && [ ${h} -gt 0 ] || die "Invalid size-limit: too small."
+        [ ${w} -lt 65536 ] && [ ${h} -lt 65536 ] \
+            || die "Invalid size-limit: too big."
+        enable_feature size_limit
+        ;;
+      --prefix=*)
+        prefix="${optval}"
+        ;;
+      --libdir=*)
+        libdir="${optval}"
+        ;;
+      --sdk-path=*)
+        [ -d "${optval}" ] || die "Not a directory: ${optval}"
+        sdk_path="${optval}"
+        ;;
+      --libc|--as|--prefix|--libdir|--sdk-path)
+        die "Option ${opt} requires argument"
+        ;;
+      --help|-h)
+        show_help
+        ;;
+      *)
+        die_unknown $opt
+        ;;
+    esac
+  done
+}
+
+process_cmdline() {             # default handler: pass every option to the common parser
+  for opt do
+    optval="${opt#*=}"
+    case "$opt" in
+      *)
+        process_common_cmdline "$opt"   # quoted so values with spaces stay one option
+        ;;
+    esac
+  done
+}
+
+post_process_common_cmdline() { # default and validate prefix/libdir
+  prefix="${prefix:-/usr/local}"
+  prefix="${prefix%/}"          # drop one trailing slash
+  libdir="${libdir:-${prefix}/lib}"
+  libdir="${libdir%/}"
+  if [ "${libdir#${prefix}}" = "${libdir}" ]; then   # libdir does not start with prefix
+    die "Libdir ${libdir} must be a subdirectory of ${prefix}"
+  fi
+}
+
+post_process_cmdline() {        # no-op in this file
+  true;
+}
+
+setup_gnu_toolchain() {         # default each unset tool to its ${CROSS}-prefixed GNU name
+  CC=${CC:-${CROSS}gcc}
+  CXX=${CXX:-${CROSS}g++}
+  AR=${AR:-${CROSS}ar}
+  LD=${LD:-${CROSS}${link_with_cc:-ld}}   # link through $link_with_cc when it is set
+  AS=${AS:-${CROSS}as}
+  STRIP=${STRIP:-${CROSS}strip}
+  NM=${NM:-${CROSS}nm}
+  AS_SFX=.s
+  EXE_SFX=
+}
+
+# Reliably find the newest available Darwin SDKs. (Older versions of
+# xcrun don't support --show-sdk-path.)
+show_darwin_sdk_path() {        # print the path of SDK $1; xcodebuild is the fallback
+  xcrun --sdk $1 --show-sdk-path 2>/dev/null ||
+    xcodebuild -sdk $1 -version Path 2>/dev/null
+}
+
+# Print the major version number of the Darwin SDK specified by $1.
+show_darwin_sdk_major_version() {
+  xcrun --sdk $1 --show-sdk-version 2>/dev/null | cut -d. -f1   # keep text before the first '.'
+}
+
+process_common_toolchain() {
+  if [ -z "$toolchain" ]; then
+    gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
+
+    # detect tgt_isa
+    case "$gcctarget" in
+      armv6*)
+        tgt_isa=armv6
+        ;;
+      armv7*-hardfloat* | armv7*-gnueabihf | arm-*-gnueabihf)
+        tgt_isa=armv7
+        float_abi=hard
+        ;;
+      armv7*)
+        tgt_isa=armv7
+        float_abi=softfp
+        ;;
+      *x86_64*|*amd64*)
+        tgt_isa=x86_64
+        ;;
+      *i[3456]86*)
+        tgt_isa=x86
+        ;;
+      *sparc*)
+        tgt_isa=sparc
+        ;;
+    esac
+
+    # detect tgt_os
+    case "$gcctarget" in
+      *darwin10*)
+        tgt_isa=x86_64
+        tgt_os=darwin10
+        ;;
+      *darwin11*)
+        tgt_isa=x86_64
+        tgt_os=darwin11
+        ;;
+      *darwin12*)
+        tgt_isa=x86_64
+        tgt_os=darwin12
+        ;;
+      *darwin13*)
+        tgt_isa=x86_64
+        tgt_os=darwin13
+        ;;
+      *darwin14*)
+        tgt_isa=x86_64
+        tgt_os=darwin14
+        ;;
+      *darwin15*)
+        tgt_isa=x86_64
+        tgt_os=darwin15
+        ;;
+      x86_64*mingw32*)
+        tgt_os=win64
+        ;;
+      *mingw32*|*cygwin*)
+        [ -z "$tgt_isa" ] && tgt_isa=x86
+        tgt_os=win32
+        ;;
+      *linux*|*bsd*)
+        tgt_os=linux
+        ;;
+      *solaris2.10)
+        tgt_os=solaris
+        ;;
+      *os2*)
+        tgt_os=os2
+        ;;
+    esac
+
+    if [ -n "$tgt_isa" ] && [ -n "$tgt_os" ]; then
+      toolchain=${tgt_isa}-${tgt_os}-gcc
+    fi
+  fi
+
+  toolchain=${toolchain:-generic-gnu}
+
+  is_in ${toolchain} ${all_platforms} || enabled force_toolchain \
+    || die "Unrecognized toolchain '${toolchain}'"
+
+  enabled child || log_echo "Configuring for target '${toolchain}'"
+
+  #
+  # Set up toolchain variables
+  #
+  tgt_isa=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $1}')
+  tgt_os=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $2}')
+  tgt_cc=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $3}')
+
+  # Mark the specific ISA requested as enabled
+  soft_enable ${tgt_isa}
+  enable_feature ${tgt_os}
+  enable_feature ${tgt_cc}
+
+  # Enable the architecture family
+  case ${tgt_isa} in
+    arm*)
+      enable_feature arm
+      ;;
+    mips*)
+      enable_feature mips
+      ;;
+  esac
+
+  # PIC is probably what we want when building shared libs
+  enabled shared && soft_enable pic
+
+  # Minimum iOS version for all target platforms (darwin and iphonesimulator).
+  IOS_VERSION_MIN="6.0"
+
+  # Handle darwin variants. Newer SDKs allow targeting older
+  # platforms, so use the newest one available.
+  case ${toolchain} in
+    arm*-darwin*)
+      add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
+      iphoneos_sdk_dir="$(show_darwin_sdk_path iphoneos)"
+      if [ -d "${iphoneos_sdk_dir}" ]; then
+        add_cflags  "-isysroot ${iphoneos_sdk_dir}"
+        add_ldflags "-isysroot ${iphoneos_sdk_dir}"
+      fi
+      ;;
+    x86*-darwin*)
+      osx_sdk_dir="$(show_darwin_sdk_path macosx)"
+      if [ -d "${osx_sdk_dir}" ]; then
+        add_cflags  "-isysroot ${osx_sdk_dir}"
+        add_ldflags "-isysroot ${osx_sdk_dir}"
+      fi
+      ;;
+  esac
+
+  case ${toolchain} in
+    *-darwin8-*)
+      add_cflags  "-mmacosx-version-min=10.4"
+      add_ldflags "-mmacosx-version-min=10.4"
+      ;;
+    *-darwin9-*)
+      add_cflags  "-mmacosx-version-min=10.5"
+      add_ldflags "-mmacosx-version-min=10.5"
+      ;;
+    *-darwin10-*)
+      add_cflags  "-mmacosx-version-min=10.6"
+      add_ldflags "-mmacosx-version-min=10.6"
+      ;;
+    *-darwin11-*)
+      add_cflags  "-mmacosx-version-min=10.7"
+      add_ldflags "-mmacosx-version-min=10.7"
+      ;;
+    *-darwin12-*)
+      add_cflags  "-mmacosx-version-min=10.8"
+      add_ldflags "-mmacosx-version-min=10.8"
+      ;;
+    *-darwin13-*)
+      add_cflags  "-mmacosx-version-min=10.9"
+      add_ldflags "-mmacosx-version-min=10.9"
+      ;;
+    *-darwin14-*)
+      add_cflags  "-mmacosx-version-min=10.10"
+      add_ldflags "-mmacosx-version-min=10.10"
+      ;;
+    *-darwin15-*)
+      add_cflags  "-mmacosx-version-min=10.11"
+      add_ldflags "-mmacosx-version-min=10.11"
+      ;;
+    *-iphonesimulator-*)
+      add_cflags  "-miphoneos-version-min=${IOS_VERSION_MIN}"
+      add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
+      iossim_sdk_dir="$(show_darwin_sdk_path iphonesimulator)"
+      if [ -d "${iossim_sdk_dir}" ]; then
+        add_cflags  "-isysroot ${iossim_sdk_dir}"
+        add_ldflags "-isysroot ${iossim_sdk_dir}"
+      fi
+      ;;
+  esac
+
+  # Handle Solaris variants. Solaris 10 needs -lposix4
+  case ${toolchain} in
+    sparc-solaris-*)
+      add_extralibs -lposix4
+      ;;
+    *-solaris-*)
+      add_extralibs -lposix4
+      ;;
+  esac
+
+  # Process ARM architecture variants
+  case ${toolchain} in
+    arm*)
+      # on arm, isa versions are supersets
+      case ${tgt_isa} in
+        arm64|armv8)
+          soft_enable neon
+          ;;
+        armv7|armv7s)
+          soft_enable neon
+          # Only enable neon_asm when neon is also enabled.
+          enabled neon && soft_enable neon_asm
+          # If someone tries to force it through, die.
+          if disabled neon && enabled neon_asm; then
+            die "Disabling neon while keeping neon-asm is not supported"
+          fi
+          case ${toolchain} in
+            # Apple iOS SDKs no longer support armv6 as of the version 9
+            # release (coincides with release of Xcode 7). Only enable media
+            # when using earlier SDK releases.
+            *-darwin*)
+              if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then
+                soft_enable media
+              else
+                soft_disable media
+                RTCD_OPTIONS="${RTCD_OPTIONS}--disable-media "
+              fi
+              ;;
+            *)
+              soft_enable media
+              ;;
+          esac
+          ;;
+        armv6)
+          case ${toolchain} in
+            *-darwin*)
+              if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then
+                soft_enable media
+              else
+                die "Your iOS SDK does not support armv6."
+              fi
+              ;;
+            *)
+              soft_enable media
+              ;;
+          esac
+          ;;
+      esac
+
+      asm_conversion_cmd="cat"
+
+      case ${tgt_cc} in
+        gcc)
+          link_with_cc=gcc
+          setup_gnu_toolchain
+          arch_int=${tgt_isa##armv}
+          arch_int=${arch_int%%te}
+          check_add_asflags --defsym ARCHITECTURE=${arch_int}
+          tune_cflags="-mtune="
+          if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
+            if [ -z "${float_abi}" ]; then
+              check_cpp <<EOF && float_abi=hard || float_abi=softfp
+#ifndef __ARM_PCS_VFP
+#error "not hardfp"
+#endif
+EOF
+            fi
+            check_add_cflags  -march=armv7-a -mfloat-abi=${float_abi}
+            check_add_asflags -march=armv7-a -mfloat-abi=${float_abi}
+
+            if enabled neon || enabled neon_asm; then
+              check_add_cflags -mfpu=neon #-ftree-vectorize
+              check_add_asflags -mfpu=neon
+            fi
+          else
+            check_add_cflags -march=${tgt_isa}
+            check_add_asflags -march=${tgt_isa}
+          fi
+
+          enabled debug && add_asflags -g
+          asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
+          if enabled thumb; then
+            asm_conversion_cmd="$asm_conversion_cmd -thumb"
+            check_add_cflags -mthumb
+            check_add_asflags -mthumb -mimplicit-it=always
+          fi
+          ;;
+        vs*)
+          asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
+          AS_SFX=.s
+          msvs_arch_dir=arm-msvs
+          disable_feature multithread
+          disable_feature unit_tests
+          vs_version=${tgt_cc##vs}
+          if [ $vs_version -ge 12 ]; then
+            # MSVC 2013 doesn't allow doing plain .exe projects for ARM,
+            # only "AppContainerApplication" which requires an AppxManifest.
+            # Therefore disable the examples, just build the library.
+            disable_feature examples
+          fi
+          ;;
+        rvct)
+          CC=armcc
+          AR=armar
+          AS=armasm
+          LD="${source_path}/build/make/armlink_adapter.sh"
+          STRIP=arm-none-linux-gnueabi-strip
+          NM=arm-none-linux-gnueabi-nm
+          tune_cflags="--cpu="
+          tune_asflags="--cpu="
+          if [ -z "${tune_cpu}" ]; then
+            if [ ${tgt_isa} = "armv7" ]; then
+              if enabled neon || enabled neon_asm
+              then
+                check_add_cflags --fpu=softvfp+vfpv3
+                check_add_asflags --fpu=softvfp+vfpv3
+              fi
+              check_add_cflags --cpu=Cortex-A8
+              check_add_asflags --cpu=Cortex-A8
+            else
+              check_add_cflags --cpu=${tgt_isa##armv}
+              check_add_asflags --cpu=${tgt_isa##armv}
+            fi
+          fi
+          arch_int=${tgt_isa##armv}
+          arch_int=${arch_int%%te}
+          check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
+          enabled debug && add_asflags -g
+          add_cflags --gnu
+          add_cflags --enum_is_int
+          add_cflags --wchar32
+          ;;
+      esac
+
+      case ${tgt_os} in
+        none*)
+          disable_feature multithread
+          disable_feature os_support
+          ;;
+
+        android*)
+          SDK_PATH=${sdk_path}
+          COMPILER_LOCATION=`find "${SDK_PATH}" \
+                             -name "arm-linux-androideabi-gcc*" -print -quit`
+          TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi-
+          CC=${TOOLCHAIN_PATH}gcc
+          CXX=${TOOLCHAIN_PATH}g++
+          AR=${TOOLCHAIN_PATH}ar
+          LD=${TOOLCHAIN_PATH}gcc
+          AS=${TOOLCHAIN_PATH}as
+          STRIP=${TOOLCHAIN_PATH}strip
+          NM=${TOOLCHAIN_PATH}nm
+
+          if [ -z "${alt_libc}" ]; then
+            alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \
+              awk '{n = split($0,a,"/"); \
+                split(a[n-1],b,"-"); \
+                print $0 " " b[2]}' | \
+                sort -g -k 2 | \
+                awk '{ print $1 }' | tail -1`
+          fi
+
+          if [ -d "${alt_libc}" ]; then
+            add_cflags "--sysroot=${alt_libc}"
+            add_ldflags "--sysroot=${alt_libc}"
+          fi
+
+          # linker flag that routes around a CPU bug in some
+          # Cortex-A8 implementations (NDK Dev Guide)
+          add_ldflags "-Wl,--fix-cortex-a8"
+
+          enable_feature pic
+          soft_enable realtime_only
+          if [ ${tgt_isa} = "armv7" ]; then
+            soft_enable runtime_cpu_detect
+          fi
+          if enabled runtime_cpu_detect; then
+            add_cflags "-I${SDK_PATH}/sources/android/cpufeatures"
+          fi
+          ;;
+
+        darwin*)
+          XCRUN_FIND="xcrun --sdk iphoneos --find"
+          CXX="$(${XCRUN_FIND} clang++)"
+          CC="$(${XCRUN_FIND} clang)"
+          AR="$(${XCRUN_FIND} ar)"
+          AS="$(${XCRUN_FIND} as)"
+          STRIP="$(${XCRUN_FIND} strip)"
+          NM="$(${XCRUN_FIND} nm)"
+          RANLIB="$(${XCRUN_FIND} ranlib)"
+          AS_SFX=.s
+
+          # Special handling of ld for armv6 because libclang_rt.ios.a does
+          # not contain armv6 support in Apple's clang package:
+          #   Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn).
+          # TODO(tomfinegan): Remove this. Our minimum iOS version (6.0)
+          # renders support for armv6 unnecessary because the 3GS and up
+          # support neon.
+          if [ "${tgt_isa}" = "armv6" ]; then
+            LD="$(${XCRUN_FIND} ld)"
+          else
+            LD="${CXX:-$(${XCRUN_FIND} ld)}"
+          fi
+
+          # ASFLAGS is written here instead of using check_add_asflags
+          # because we need to overwrite all of ASFLAGS and purge the
+          # options that were put in above
+          ASFLAGS="-arch ${tgt_isa} -g"
+
+          add_cflags -arch ${tgt_isa}
+          add_ldflags -arch ${tgt_isa}
+
+          alt_libc="$(show_darwin_sdk_path iphoneos)"
+          if [ -d "${alt_libc}" ]; then
+            add_cflags -isysroot ${alt_libc}
+          fi
+
+          if [ "${LD}" = "${CXX}" ]; then
+            add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}"
+          else
+            add_ldflags -ios_version_min "${IOS_VERSION_MIN}"
+          fi
+
+          for d in lib usr/lib usr/lib/system; do
+            try_dir="${alt_libc}/${d}"
+            [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
+          done
+
+          asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
+
+          if [ "$(show_darwin_sdk_major_version iphoneos)" -gt 8 ]; then
+            check_add_cflags -fembed-bitcode
+            check_add_asflags -fembed-bitcode
+            check_add_ldflags -fembed-bitcode
+          fi
+          ;;
+
+        linux*)
+          enable_feature linux
+          if enabled rvct; then
+            # Check if we have CodeSourcery GCC in PATH. Needed for
+            # libraries
+            hash arm-none-linux-gnueabi-gcc 2>&- || \
+              die "Couldn't find CodeSourcery GCC from PATH"
+
+            # Use armcc as a linker to enable translation of
+            # some gcc specific options such as -lm and -lpthread.
+            LD="armcc --translate_gcc"
+
+            # create configuration file (uses path to CodeSourcery GCC)
+            armcc --arm_linux_configure --arm_linux_config_file=arm_linux.cfg
+
+            add_cflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
+            add_asflags --no_hide_all --apcs=/interwork
+            add_ldflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
+            enabled pic && add_cflags --apcs=/fpic
+            enabled pic && add_asflags --apcs=/fpic
+            enabled shared && add_cflags --shared
+          fi
+          ;;
+      esac
+      ;;
+    mips*)
+      link_with_cc=gcc
+      setup_gnu_toolchain
+      tune_cflags="-mtune="
+      if enabled dspr2; then
+        check_add_cflags -mips32r2 -mdspr2
+      fi
+
+      if enabled runtime_cpu_detect; then
+        disable_feature runtime_cpu_detect
+      fi
+
+      if [ -n "${tune_cpu}" ]; then
+        case ${tune_cpu} in
+          p5600)
+            check_add_cflags -mips32r5 -funroll-loops -mload-store-pairs
+            check_add_cflags -msched-weight -mhard-float -mfp64
+            check_add_asflags -mips32r5 -mhard-float -mfp64
+            check_add_ldflags -mfp64
+            ;;
+          i6400)
+            check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight 
+            check_add_cflags  -mload-store-pairs -mhard-float -mfp64
+            check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
+            check_add_ldflags -mips64r6 -mabi=64 -mfp64
+            ;;
+        esac
+
+        if enabled msa; then
+          add_cflags -mmsa
+          add_asflags -mmsa
+          add_ldflags -mmsa
+        fi
+      fi
+
+      check_add_cflags -march=${tgt_isa}
+      check_add_asflags -march=${tgt_isa}
+      check_add_asflags -KPIC
+      ;;
+    x86*)
+      case  ${tgt_os} in
+        win*)
+          enabled gcc && add_cflags -fno-common
+          ;;
+        solaris*)
+          CC=${CC:-${CROSS}gcc}
+          CXX=${CXX:-${CROSS}g++}
+          LD=${LD:-${CROSS}gcc}
+          CROSS=${CROSS-g}
+          ;;
+        os2)
+          disable_feature pic
+          AS=${AS:-nasm}
+          add_ldflags -Zhigh-mem
+          ;;
+      esac
+
+      AS="${alt_as:-${AS:-auto}}"
+      case  ${tgt_cc} in
+        icc*)
+          CC=${CC:-icc}
+          LD=${LD:-icc}
+          setup_gnu_toolchain
+          add_cflags -use-msasm  # remove -use-msasm too?
+          # add -no-intel-extensions to suppress warning #10237
+          # refer to http://software.intel.com/en-us/forums/topic/280199
+          add_ldflags -i-static -no-intel-extensions
+          enabled x86_64 && add_cflags -ipo -static -O3 -no-prec-div
+          enabled x86_64 && AR=xiar
+          case ${tune_cpu} in
+            atom*)
+              tune_cflags="-x"
+              tune_cpu="SSE3_ATOM"
+              ;;
+            *)
+              tune_cflags="-march="
+              ;;
+          esac
+          ;;
+        gcc*)
+          link_with_cc=gcc
+          tune_cflags="-march="
+          setup_gnu_toolchain
+          #for 32 bit x86 builds, -O3 did not turn on this flag
+          enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer
+          ;;
+        vs*)
+          # When building with Microsoft Visual Studio the assembler is
+          # invoked directly. Checking at configure time is unnecessary.
+          # Skip the check by setting AS arbitrarily
+          AS=msvs
+          msvs_arch_dir=x86-msvs
+          vc_version=${tgt_cc##vs}
+          case $vc_version in
+            7|8|9|10)
+              echo "${tgt_cc} does not support avx/avx2, disabling....."
+              RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 "
+              soft_disable avx
+              soft_disable avx2
+              ;;
+          esac
+          ;;
+      esac
+
+      bits=32
+      enabled x86_64 && bits=64
+      check_cpp <<EOF && bits=x32
+#if !defined(__ILP32__) || !defined(__x86_64__)
+#error "not x32"
+#endif
+EOF
+      case ${tgt_cc} in
+        gcc*)
+          add_cflags -m${bits}
+          add_ldflags -m${bits}
+          ;;
+      esac
+
+      soft_enable runtime_cpu_detect
+      # We can't use 'check_cflags' until the compiler is configured and CC is
+      # populated.
+      for ext in ${ARCH_EXT_LIST_X86}; do
+        # disable higher order extensions to simplify asm dependencies
+        if [ "$disable_exts" = "yes" ]; then
+          if ! disabled $ext; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+            disable_feature $ext
+          fi
+        elif disabled $ext; then
+          disable_exts="yes"
+        else
+          # use the shortened version for the flag: sse4_1 -> sse4
+          check_gcc_machine_option ${ext%_*} $ext
+        fi
+      done
+
+      if enabled external_build; then
+        log_echo "  skipping assembler detection"
+      else
+        case "${AS}" in
+          auto|"")
+            which nasm >/dev/null 2>&1 && AS=nasm
+            which yasm >/dev/null 2>&1 && AS=yasm
+            if [ "${AS}" = nasm ] ; then
+              # Apple ships version 0.98 of nasm through at least Xcode 6. Revisit
+              # this check if they start shipping a compatible version.
+              apple=`nasm -v | grep "Apple"`
+              [ -n "${apple}" ] \
+                && echo "Unsupported version of nasm: ${apple}" \
+                && AS=""
+            fi
+            [ "${AS}" = auto ] || [ -z "${AS}" ] \
+              && die "Neither yasm nor nasm have been found." \
+                     "See the prerequisites section in the README for more info."
+            ;;
+        esac
+        log_echo "  using $AS"
+      fi
+      [ "${AS##*/}" = nasm ] && add_asflags -Ox
+      AS_SFX=.asm
+      case  ${tgt_os} in
+        win32)
+          add_asflags -f win32
+          enabled debug && add_asflags -g cv8
+          EXE_SFX=.exe
+          ;;
+        win64)
+          add_asflags -f x64
+          enabled debug && add_asflags -g cv8
+          EXE_SFX=.exe
+          ;;
+        linux*|solaris*|android*)
+          add_asflags -f elf${bits}
+          enabled debug && [ "${AS}" = yasm ] && add_asflags -g dwarf2
+          enabled debug && [ "${AS}" = nasm ] && add_asflags -g
+          [ "${AS##*/}" = nasm ] && check_asm_align
+          ;;
+        darwin*)
+          add_asflags -f macho${bits}
+          enabled x86 && darwin_arch="-arch i386" || darwin_arch="-arch x86_64"
+          add_cflags  ${darwin_arch}
+          add_ldflags ${darwin_arch}
+          # -mdynamic-no-pic is still a bit of voodoo -- it was required at
+          # one time, but does not seem to be now, and it breaks some of the
+          # code that still relies on inline assembly.
+          # enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic
+          enabled icc && ! enabled pic && add_cflags -fno-pic
+          ;;
+        iphonesimulator)
+          add_asflags -f macho${bits}
+          enabled x86 && sim_arch="-arch i386" || sim_arch="-arch x86_64"
+          add_cflags  ${sim_arch}
+          add_ldflags ${sim_arch}
+
+          if [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then
+            # yasm v1.3.0 doesn't know what -fembed-bitcode means, so turning it
+            # on is pointless (unless building a C-only lib). Warn the user, but
+            # do nothing here.
+            log "Warning: Bitcode embed disabled for simulator targets."
+          fi
+          ;;
+        os2)
+          add_asflags -f aout
+          enabled debug && add_asflags -g
+          EXE_SFX=.exe
+          ;;
+        *)
+          log "Warning: Unknown os $tgt_os while setting up $AS flags"
+          ;;
+      esac
+      ;;
+    *-gcc|generic-gnu)
+      link_with_cc=gcc
+      enable_feature gcc
+      setup_gnu_toolchain
+      ;;
+  esac
+
+  # Try to enable CPU specific tuning
+  if [ -n "${tune_cpu}" ]; then
+    if [ -n "${tune_cflags}" ]; then
+      check_add_cflags ${tune_cflags}${tune_cpu} || \
+        die "Requested CPU '${tune_cpu}' not supported by compiler"
+    fi
+    if [ -n "${tune_asflags}" ]; then
+      check_add_asflags ${tune_asflags}${tune_cpu} || \
+        die "Requested CPU '${tune_cpu}' not supported by assembler"
+    fi
+    if [ -z "${tune_cflags}${tune_asflags}" ]; then
+      log_echo "Warning: CPU tuning not supported by this toolchain"
+    fi
+  fi
+
+  if enabled debug; then
+    check_add_cflags -g && check_add_ldflags -g
+  else
+    check_add_cflags -DNDEBUG
+  fi
+
+  enabled gprof && check_add_cflags -pg && check_add_ldflags -pg
+  enabled gcov &&
+    check_add_cflags -fprofile-arcs -ftest-coverage &&
+    check_add_ldflags -fprofile-arcs -ftest-coverage
+
+  if enabled optimizations; then
+    if enabled rvct; then
+      enabled small && check_add_cflags -Ospace || check_add_cflags -Otime
+    else
+      enabled small && check_add_cflags -O2 ||  check_add_cflags -O3
+    fi
+  fi
+
+  if [ "${tgt_isa}" = "x86_64" ] || [ "${tgt_isa}" = "x86" ]; then
+    soft_enable use_x86inc
+  fi
+
+  # Position Independent Code (PIC) support, for building relocatable
+  # shared objects
+  enabled gcc && enabled pic && check_add_cflags -fPIC
+
+  # Work around longjmp interception on glibc >= 2.11, to improve binary
+  # compatibility. See http://code.google.com/p/webm/issues/detail?id=166
+  enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0
+
+  # Check for strip utility variant
+  ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip
+
+  # Try to determine target endianness
+  check_cc <<EOF
+unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
+EOF
+    [ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' |
+        grep '4f *32 *42 *45' >/dev/null 2>&1 && enable_feature big_endian
+
+    # Try to find which inline keywords are supported
+    check_cc <<EOF && INLINE="inline"
+static inline function() {}
+EOF
+
+  # Almost every platform uses pthreads.
+  if enabled multithread; then
+    case ${toolchain} in
+      *-win*-vs*)
+        ;;
+      *-android-gcc)
+        ;;
+      *)
+        check_header pthread.h && add_extralibs -lpthread
+        ;;
+    esac
+  fi
+
+  # only for MIPS platforms
+  case ${toolchain} in
+    mips*)
+      if enabled big_endian; then
+        if enabled dspr2; then
+          echo "dspr2 optimizations are available only for little endian platforms"
+          disable_feature dspr2
+        fi
+        if enabled msa; then
+          echo "msa optimizations are available only for little endian platforms"
+          disable_feature msa
+        fi
+      fi
+      ;;
+  esac
+
+  # glibc needs these
+  if enabled linux; then
+    add_cflags -D_LARGEFILE_SOURCE
+    add_cflags -D_FILE_OFFSET_BITS=64
+  fi
+}
+
+# Toolchain-setup step invoked by process(). This definition simply
+# delegates to process_common_toolchain.
+process_toolchain() {
+  process_common_toolchain
+}
+
+print_config_mk() {
+  saved_prefix="${prefix}"
+  prefix=$1
+  makefile=$2
+  shift 2
+  for cfg; do
+    if enabled $cfg; then
+      upname="`toupper $cfg`"
+      echo "${prefix}_${upname}=yes" >> $makefile
+    fi
+  done
+  prefix="${saved_prefix}"
+}
+
+print_config_h() {
+  saved_prefix="${prefix}"
+  prefix=$1
+  header=$2
+  shift 2
+  for cfg; do
+    upname="`toupper $cfg`"
+    if enabled $cfg; then
+      echo "#define ${prefix}_${upname} 1" >> $header
+    else
+      echo "#define ${prefix}_${upname} 0" >> $header
+    fi
+  done
+  prefix="${saved_prefix}"
+}
+
+print_config_vars_h() {
+  header=$1
+  shift
+  while [ $# -gt 0 ]; do
+    upname="`toupper $1`"
+    echo "#define ${upname} $2" >> $header
+    shift 2
+  done
+}
+
+print_webm_license() {
+  saved_prefix="${prefix}"
+  destination=$1
+  prefix="$2"
+  suffix="$3"
+  shift 3
+  cat <<EOF > ${destination}
+${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
+${prefix} ${suffix}
+${prefix} Use of this source code is governed by a BSD-style license${suffix}
+${prefix} that can be found in the LICENSE file in the root of the source${suffix}
+${prefix} tree. An additional intellectual property rights grant can be found${suffix}
+${prefix} in the file PATENTS.  All contributing project authors may${suffix}
+${prefix} be found in the AUTHORS file in the root of the source tree.${suffix}
+EOF
+  prefix="${saved_prefix}"
+}
+
+# No-op definition of the target-processing step; process() calls it after
+# process_detect.
+# NOTE(review): presumably meant to be redefined by the script that sources
+# this file -- confirm.
+process_targets() {
+  true;
+}
+
+# No-op definition of the detection step; process() calls it right after
+# process_toolchain.
+# NOTE(review): presumably meant to be redefined by the script that sources
+# this file -- confirm.
+process_detect() {
+  true;
+}
+
+enable_feature logging
+logfile="config.log"
+self=$0
+process() {
+  cmdline_args="$@"
+  process_cmdline "$@"
+  if enabled child; then
+    echo "# ${self} $@" >> ${logfile}
+  else
+    echo "# ${self} $@" > ${logfile}
+  fi
+  post_process_common_cmdline
+  post_process_cmdline
+  process_toolchain
+  process_detect
+  process_targets
+
+  OOT_INSTALLS="${OOT_INSTALLS}"
+  if enabled source_path_used; then
+  # Prepare the PWD for building.
+  for f in ${OOT_INSTALLS}; do
+    install -D "${source_path}/$f" "$f"
+  done
+  fi
+  cp "${source_path}/build/make/Makefile" .
+
+  clean_temp_files
+  true
+}
diff --git a/libs/libvpx/build/make/gen_asm_deps.sh b/libs/libvpx/build/make/gen_asm_deps.sh
new file mode 100755
index 0000000000..6a7bff9ebc
--- /dev/null
+++ b/libs/libvpx/build/make/gen_asm_deps.sh
@@ -0,0 +1,64 @@
+#!/bin/sh
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+self=$0
+show_help() {
+    echo "usage: $self [options] <srcfile>"
+    echo
+    echo "Generate Makefile dependency information from assembly code source"
+    echo
+    exit 1
+}
+# die_unknown <option>
+# Report an unrecognized option on stdout and exit with status 1.
+# NOTE(review): the option loop in this script never calls this helper;
+# arguments it does not recognize are ignored there instead.
+die_unknown(){
+    echo "Unknown option \"$1\"."
+    echo "See $0 --help for available options."
+    exit 1
+}
+for opt do
+    optval="${opt#*=}"
+    case "$opt" in
+    --build-pfx=*) pfx="${optval}"
+    ;;
+    --depfile=*) out="${optval}"
+    ;;
+    -I*) raw_inc_paths="${raw_inc_paths} ${opt}"
+         inc_path="${inc_path} ${opt#-I}"
+    ;;
+    -h|--help) show_help
+    ;;
+    *) [ -f "$opt" ] && srcfile="$opt"
+    ;;
+    esac
+done
+
+[ -n "$srcfile" ] || show_help
+sfx=${sfx:-asm}
+includes=$(LC_ALL=C egrep -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile |
+           perl -p -e "s;.*?([a-z0-9_/]+.${sfx}).*;\1;")
+#" restore editor state
+for inc in ${includes}; do
+    found_inc_path=
+    for idir in ${inc_path}; do
+        [ -f "${idir}/${inc}" ] && found_inc_path="${idir}" && break
+    done
+    if [ -f `dirname $srcfile`/$inc ]; then
+        # Handle include files in the same directory as the source
+        $self --build-pfx=$pfx --depfile=$out ${raw_inc_paths} `dirname $srcfile`/$inc
+    elif [ -n "${found_inc_path}" ]; then
+        # Handle include files on the include path
+        $self --build-pfx=$pfx --depfile=$out ${raw_inc_paths} "${found_inc_path}/$inc"
+    else
+        # Handle generated includes in the build root (which may not exist yet)
+        echo ${out} ${out%d}o: "${pfx}${inc}"
+    fi
+done
+echo ${out} ${out%d}o: $srcfile
diff --git a/libs/libvpx/build/make/gen_msvs_def.sh b/libs/libvpx/build/make/gen_msvs_def.sh
new file mode 100755
index 0000000000..4defcc2e7c
--- /dev/null
+++ b/libs/libvpx/build/make/gen_msvs_def.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+self=$0
+self_basename=${self##*/}
+EOL=$'\n'
+
+# Print the usage text on stdout and exit with status 1.
+show_help() {
+    cat <<EOF
+Usage: ${self_basename} [options] file1 [file2 ...]
+
+This script generates a MSVC module definition file containing a list of symbols
+to export from a DLL. Source files are technically bash scripts (and thus may
+use #comment syntax) but in general, take the form of a list of symbols:
+
+  <kind> symbol1 [symbol2, symbol3, ...]
+
+where <kind> is either 'text' or 'data'
+
+
+Options:
+    --help                      Print this message
+    --out=filename              Write output to a file [stdout]
+    --name=project_name         Name of the library (required)
+EOF
+    exit 1
+}
+
+die() {
+    echo "${self_basename}: $@"
+    exit 1
+}
+
+die_unknown(){
+    echo "Unknown option \"$1\"."
+    echo "See ${self_basename} --help for available options."
+    exit 1
+}
+
+# text <symbol>...
+# Called from the sourced symbol files: appends each symbol to ${outfile}
+# as a plain export entry.
+text() {
+    for sym in "$@"; do
+        # Quoted so an output path containing whitespace is not rejected as
+        # an ambiguous redirect.
+        echo "  $sym" >> "${outfile}"
+    done
+}
+
+# data <symbol>...
+# Called from the sourced symbol files: appends each symbol to ${outfile},
+# left-aligned in a 40-column field and followed by the DATA keyword.
+data() {
+    for sym in "$@"; do
+        # Quoted so an output path containing whitespace is not rejected as
+        # an ambiguous redirect.
+        printf "  %-40s DATA\n" "$sym" >> "${outfile}"
+    done
+}
+
+# Process command line: options set the output file and library name, every
+# other argument is queued as a symbol file.
+for opt in "$@"; do
+    optval="${opt#*=}"
+    case "$opt" in
+    --help|-h) show_help
+    ;;
+    --out=*) outfile="$optval"
+    ;;
+    --name=*) name="${optval}"
+    ;;
+     -*) die_unknown "$opt"
+    ;;
+    *) file_list[${#file_list[@]}]="$opt"
+    ;;
+    esac
+done
+outfile=${outfile:-/dev/stdout}
+[ -n "$name" ] || die "Library name (--name) must be specified!"
+
+# Write the header, then source each symbol file; its text/data lines call
+# the functions above, which append the export entries. Paths are quoted so
+# names containing whitespace are handled as a single word.
+echo "LIBRARY ${name}" > "${outfile}"
+echo "EXPORTS" >> "${outfile}"
+for f in "${file_list[@]}"; do
+    . "$f"
+done
diff --git a/libs/libvpx/build/make/gen_msvs_proj.sh b/libs/libvpx/build/make/gen_msvs_proj.sh
new file mode 100755
index 0000000000..0cf335b3d2
--- /dev/null
+++ b/libs/libvpx/build/make/gen_msvs_proj.sh
@@ -0,0 +1,490 @@
+#!/bin/bash
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+self=$0
+self_basename=${self##*/}
+self_dirname=$(dirname "$0")
+
+. "$self_dirname/msvs_common.sh"|| exit 127
+
+# Print the usage text on stdout and exit with status 1.
+show_help() {
+    cat <<EOF
+Usage: ${self_basename} --name=projname [options] file1 [file2 ...]
+
+This script generates a Visual Studio project file from a list of source
+code files.
+
+Options:
+    --help                      Print this message
+    --exe                       Generate a project for building an Application
+    --lib                       Generate a project for creating a static library
+    --dll                       Generate a project for creating a dll
+    --static-crt                Use the static C runtime (/MT)
+    --target=isa-os-cc          Target specifier (required)
+    --out=filename              Write output to a file [stdout]
+    --name=project_name         Name of the project (required)
+    --proj-guid=GUID            GUID to use for the project
+    --module-def=filename       File containing export definitions (for DLLs)
+    --ver=version               Version (7,8,9) of visual studio to generate for
+    --src-path-bare=dir         Path to root of source tree
+    -Ipath/to/include           Additional include directories
+    -DFLAG[=value]              Preprocessor macros to define
+    -Lpath/to/lib               Additional library search paths
+    -llibname                   Library to link against
+EOF
+    exit 1
+}
+
+# generate_filter <var> <name> <pats>
+#
+# Emits one Filter element named <name> whose Filter attribute is <pats>, a
+# ';'-separated list of file extensions. Every entry of the global
+# file_list array whose extension equals one of those patterns is written
+# as a File child and removed from file_list; the first matching pattern
+# wins. For "asm" files (when asm_use_custom_step is true) and for
+# c/cc/cpp files, a per-platform, per-configuration Tool entry naming the
+# object file is added as well.
+#
+# <var> is stored in a local but not used further in this function.
+# NOTE(review): the locals file_list_sz and pack are never read here.
+generate_filter() {
+    local var=$1
+    local name=$2
+    local pats=$3
+    local file_list_sz
+    local i
+    local f
+    local saveIFS="$IFS"
+    local pack
+    echo "generating filter '$name' from ${#file_list[@]} files" >&2
+    # IFS is '*' for the rest of the function (restored at the end): the
+    # pattern list is split below by turning each ';' into the IFS character.
+    IFS=*
+
+    open_tag Filter \
+        Name=$name \
+        Filter=$pats \
+        UniqueIdentifier=`generate_uuid` \
+
+    file_list_sz=${#file_list[@]}
+    for i in ${!file_list[@]}; do
+        f=${file_list[i]}
+        for pat in ${pats//;/$IFS}; do
+            if [ "${f##*.}" == "$pat" ]; then
+                # Remove the file from file_list once it has matched.
+                unset file_list[i]
+
+                # Object file name: the path with src_path_bare and any
+                # leading '.' or '/' characters stripped, then ':', '/' and
+                # ' ' replaced by '_'.
+                objf=$(echo ${f%.*}.obj \
+                       | sed -e "s,$src_path_bare,," \
+                             -e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
+                open_tag File RelativePath="$f"
+
+                # asm_use_custom_step holds "true" or "false" and is run as
+                # a command to decide the branch.
+                if [ "$pat" == "asm" ] && $asm_use_custom_step; then
+                    # Avoid object file name collisions, i.e. vpx_config.c and
+                    # vpx_config.asm produce the same object file without
+                    # this additional suffix.
+                    objf=${objf%.obj}_asm.obj
+                    for plat in "${platforms[@]}"; do
+                        for cfg in Debug Release; do
+                            open_tag FileConfiguration \
+                                Name="${cfg}|${plat}" \
+
+                            tag Tool \
+                                Name="VCCustomBuildTool" \
+                                Description="Assembling \$(InputFileName)" \
+                                CommandLine="$(eval echo \$asm_${cfg}_cmdline) -o \$(IntDir)\\$objf" \
+                                Outputs="\$(IntDir)\\$objf" \
+
+                            close_tag FileConfiguration
+                        done
+                    done
+                fi
+                if [ "$pat" == "c" ] || \
+                   [ "$pat" == "cc" ] || [ "$pat" == "cpp" ]; then
+                    for plat in "${platforms[@]}"; do
+                        for cfg in Debug Release; do
+                            open_tag FileConfiguration \
+                                Name="${cfg}|${plat}" \
+
+                            tag Tool \
+                                Name="VCCLCompilerTool" \
+                                ObjectFile="\$(IntDir)\\$objf" \
+
+                            close_tag FileConfiguration
+                        done
+                    done
+                fi
+                close_tag File
+
+                # First matching pattern wins; move on to the next file.
+                break
+            fi
+        done
+    done
+
+    close_tag Filter
+    IFS="$saveIFS"
+}
+
+# Process command line. Options fill in the project settings; -I/-D/-L/-l
+# are accumulated into the ';'- or space-separated lists used later in the
+# project file (quotes are stored as the XML entity &quot;); every other
+# argument is appended to file_list as a source file.
+unset target
+for opt in "$@"; do
+    optval="${opt#*=}"
+    case "$opt" in
+        --help|-h) show_help
+        ;;
+        --target=*) target="${optval}"
+        ;;
+        --out=*) outfile="$optval"
+        ;;
+        --name=*) name="${optval}"
+        ;;
+        --proj-guid=*) guid="${optval}"
+        ;;
+        --module-def=*) link_opts="${link_opts} ModuleDefinitionFile=${optval}"
+        ;;
+        --exe) proj_kind="exe"
+        ;;
+        --dll) proj_kind="dll"
+        ;;
+        --lib) proj_kind="lib"
+        ;;
+        --src-path-bare=*)
+            # Normalize the source root and drop one trailing slash.
+            src_path_bare=$(fix_path "$optval")
+            src_path_bare=${src_path_bare%/}
+        ;;
+        --static-crt) use_static_runtime=true
+        ;;
+        --ver=*)
+            # Only versions 7, 8 and 9 are accepted.
+            vs_ver="$optval"
+            case "$optval" in
+                [789])
+                ;;
+                *) die Unrecognized Visual Studio Version in $opt
+                ;;
+            esac
+        ;;
+        -I*)
+            # Each include directory is added both to the compiler list
+            # (incs) and, as a -I option, to the yasm command line (yasmincs).
+            opt=${opt##-I}
+            opt=$(fix_path "$opt")
+            opt="${opt%/}"
+            incs="${incs}${incs:+;}&quot;${opt}&quot;"
+            yasmincs="${yasmincs} -I&quot;${opt}&quot;"
+        ;;
+        -D*) defines="${defines}${defines:+;}${opt##-D}"
+        ;;
+        -L*) # fudge . to $(OutDir)
+            if [ "${opt##-L}" == "." ]; then
+                libdirs="${libdirs}${libdirs:+;}&quot;\$(OutDir)&quot;"
+            else
+                 # Also try directories for this platform/configuration
+                 opt=${opt##-L}
+                 opt=$(fix_path "$opt")
+                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}&quot;"
+                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}/\$(PlatformName)/\$(ConfigurationName)&quot;"
+                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}/\$(PlatformName)&quot;"
+            fi
+        ;;
+        -l*) libs="${libs}${libs:+ }${opt##-l}.lib"
+        ;;
+        -*) die_unknown $opt
+        ;;
+        *)
+            # The paths in file_list are fixed outside of the loop.
+            file_list[${#file_list[@]}]="$opt"
+            # Remember whether any assembly source was listed.
+            case "$opt" in
+                 *.asm) uses_asm=true
+                 ;;
+            esac
+        ;;
+    esac
+done
+
+# Make one call to fix_path for file_list to improve performance.
+fix_file_list
+
+# Defaults: write to stdout, generate a GUID when none was given, and
+# assume no assembly sources unless a *.asm file was listed.
+outfile=${outfile:-/dev/stdout}
+guid=${guid:-`generate_uuid`}
+asm_use_custom_step=false
+uses_asm=${uses_asm:-false}
+# Per-version settings; version 8 is assumed when --ver was not given.
+# Every supported version enables the custom assembly build step exactly
+# when the project contains assembly sources.
+case "${vs_ver:-8}" in
+    7) vs_ver_id="7.10"
+       asm_use_custom_step=$uses_asm
+       warn_64bit='Detect64BitPortabilityProblems=true'
+    ;;
+    8) vs_ver_id="8.00"
+       asm_use_custom_step=$uses_asm
+       warn_64bit='Detect64BitPortabilityProblems=true'
+    ;;
+    9) vs_ver_id="9.00"
+       asm_use_custom_step=$uses_asm
+       warn_64bit='Detect64BitPortabilityProblems=false'
+    ;;
+esac
+
+# Both the project name and the target are mandatory.
+[ -n "$name" ] || die "Project name (--name) must be specified!"
+[ -n "$target" ] || die "Target (--target) must be specified!"
+
+# Pick the runtime-library codes and the library-name suffix for the static
+# (--static-crt) or the default runtime. use_static_runtime holds "true"
+# when set and is run as a command here.
+# NOTE(review): the numeric codes are presumably the project file's
+# RuntimeLibrary values -- confirm the mapping.
+if ${use_static_runtime:-false}; then
+    release_runtime=0
+    debug_runtime=1
+    lib_sfx=mt
+else
+    release_runtime=2
+    debug_runtime=3
+    lib_sfx=md
+fi
+
+# Build the Debug-configuration library list from ${libs}: a name ending in
+# ${lib_sfx}.lib becomes ${lib_sfx}d.lib, every other name is carried over
+# unchanged. A debug exe therefore cannot link against release libs, so
+# this may need to be refactored later.
+for lib in ${libs}; do
+    case "$lib" in
+        *"${lib_sfx}.lib") lib="${lib%.lib}d.lib" ;;
+    esac
+    debug_libs="${debug_libs:+${debug_libs} }${lib}"
+done
+
+
+# Per-target settings: the platform list and the yasm command lines used
+# for Debug and Release assembly steps. Only x86 and x86_64 targets are
+# supported; anything else is fatal.
+case "$target" in
+    x86_64*)
+        platforms[0]="x64"
+        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} &quot;\$(InputPath)&quot;"
+        asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} &quot;\$(InputPath)&quot;"
+    ;;
+    x86*)
+        platforms[0]="Win32"
+        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} &quot;\$(InputPath)&quot;"
+        asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;\$(InputPath)&quot;"
+    ;;
+    *) die "Unsupported target $target!"
+    ;;
+esac
+
+# Project Keyword shared by all supported targets.
+keyword="ManagedCProj"
+
+generate_vcproj() {  # Write the complete .vcproj XML document to stdout.
+    case "$proj_kind" in
+        exe) vs_ConfigurationType=1
+        ;;
+        dll) vs_ConfigurationType=2
+        ;;
+        *)   vs_ConfigurationType=4  # every other kind, i.e. a static library
+        ;;
+    esac
+    # XML prolog, then the root element carrying the project identity.
+    echo "<?xml version=\"1.0\" encoding=\"Windows-1252\"?>"
+    open_tag VisualStudioProject \
+        ProjectType="Visual C++" \
+        Version="${vs_ver_id}" \
+        Name="${name}" \
+        ProjectGUID="{${guid}}" \
+        RootNamespace="${name}" \
+        Keyword="${keyword}" \
+
+    open_tag Platforms
+    for plat in "${platforms[@]}"; do
+        tag Platform Name="$plat"
+    done
+    close_tag Platforms
+    # One Debug and one Release <Configuration> is emitted per platform.
+    open_tag Configurations
+    for plat in "${platforms[@]}"; do
+        plat_no_ws=`echo $plat | sed 's/[^A-Za-z0-9_]/_/g'`
+        open_tag Configuration \
+            Name="Debug|$plat" \
+            OutputDirectory="\$(SolutionDir)$plat_no_ws/\$(ConfigurationName)" \
+            IntermediateDirectory="$plat_no_ws/\$(ConfigurationName)/${name}" \
+            ConfigurationType="$vs_ConfigurationType" \
+            CharacterSet="1" \
+
+        case "$target" in
+            x86*)
+                case "$name" in  # both arms emit the same Debug attributes
+                    vpx)
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            Optimization="0" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+                            RuntimeLibrary="$debug_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            DebugInformationFormat="2" \
+                            $warn_64bit \
+
+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
+                    ;;
+                    *)
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            Optimization="0" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+                            RuntimeLibrary="$debug_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            DebugInformationFormat="2" \
+                            $warn_64bit \
+
+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
+                    ;;
+                esac
+            ;;
+        esac
+        # Debug linker / librarian tool, chosen by project kind.
+        case "$proj_kind" in
+            exe)
+                case "$target" in
+                    x86*)
+                        case "$name" in
+                            *)
+                                tag Tool \
+                                    Name="VCLinkerTool" \
+                                    AdditionalDependencies="$debug_libs \$(NoInherit)" \
+                                    AdditionalLibraryDirectories="$libdirs" \
+                                    GenerateDebugInformation="true" \
+                                    ProgramDatabaseFile="\$(OutDir)/${name}.pdb" \
+                            ;;
+                        esac
+                    ;;
+                 esac
+            ;;
+            lib)
+                case "$target" in
+                    x86*)
+                        tag Tool \
+                            Name="VCLibrarianTool" \
+                            OutputFile="\$(OutDir)/${name}${lib_sfx}d.lib" \
+
+                    ;;
+                esac
+            ;;
+            dll)
+                tag Tool \
+                    Name="VCLinkerTool" \
+                    AdditionalDependencies="\$(NoInherit)" \
+                    LinkIncremental="2" \
+                    GenerateDebugInformation="true" \
+                    AssemblyDebug="1" \
+                    TargetMachine="1" \
+                    $link_opts \
+
+            ;;
+        esac
+
+        close_tag Configuration
+        # Release: Optimization=2, no debug info format, release CRT and libs.
+        open_tag Configuration \
+            Name="Release|$plat" \
+            OutputDirectory="\$(SolutionDir)$plat_no_ws/\$(ConfigurationName)" \
+            IntermediateDirectory="$plat_no_ws/\$(ConfigurationName)/${name}" \
+            ConfigurationType="$vs_ConfigurationType" \
+            CharacterSet="1" \
+            WholeProgramOptimization="0" \
+
+        case "$target" in
+            x86*)
+                case "$name" in  # arms differ only in attribute order
+                    vpx)
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            Optimization="2" \
+                            FavorSizeorSpeed="1" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+                            RuntimeLibrary="$release_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            DebugInformationFormat="0" \
+                            $warn_64bit \
+
+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs"
+                    ;;
+                    *)
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            AdditionalIncludeDirectories="$incs" \
+                            Optimization="2" \
+                            FavorSizeorSpeed="1" \
+                            PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+                            RuntimeLibrary="$release_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            DebugInformationFormat="0" \
+                            $warn_64bit \
+
+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs"
+                    ;;
+                esac
+            ;;
+        esac
+        # Release linker / librarian tool, chosen by project kind.
+        case "$proj_kind" in
+            exe)
+                case "$target" in
+                    x86*)
+                        case "$name" in
+                            *)
+                                tag Tool \
+                                    Name="VCLinkerTool" \
+                                    AdditionalDependencies="$libs \$(NoInherit)" \
+                                    AdditionalLibraryDirectories="$libdirs" \
+
+                            ;;
+                        esac
+                    ;;
+                 esac
+            ;;
+            lib)
+                case "$target" in
+                    x86*)
+                        tag Tool \
+                            Name="VCLibrarianTool" \
+                            OutputFile="\$(OutDir)/${name}${lib_sfx}.lib" \
+
+                    ;;
+                esac
+            ;;
+            dll) # note differences to debug version: LinkIncremental, AssemblyDebug
+                tag Tool \
+                    Name="VCLinkerTool" \
+                    AdditionalDependencies="\$(NoInherit)" \
+                    LinkIncremental="1" \
+                    GenerateDebugInformation="true" \
+                    TargetMachine="1" \
+                    $link_opts \
+
+            ;;
+        esac
+
+        close_tag Configuration
+    done
+    close_tag Configurations
+    # File section: one generate_filter call per extension list.
+    open_tag Files
+    generate_filter srcs   "Source Files"   "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx"
+    generate_filter hdrs   "Header Files"   "h;hm;inl;inc;xsd"
+    generate_filter resrcs "Resource Files" "rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
+    generate_filter resrcs "Build Files"    "mk"
+    close_tag Files
+
+    tag       Globals
+    close_tag VisualStudioProject
+
+    # This must be done from within the {} subshell
+    echo "Ignored files list (${#file_list[@]} items) is:" >&2
+    for f in "${file_list[@]}"; do
+        echo "    $f" >&2
+    done
+}
+
+# On every line containing a double quote, rewrite "/" as "\" wherever the
+# slash follows a character other than a space or a double quote.
+generate_vcproj | sed  -e '/"/s;\([^ "]\)/;\1\\;g' > "${outfile}"
+exit
+<!--
+TODO: Add any files not captured by filters.
+                <File
+                        RelativePath=".\ReadMe.txt"
+                        >
+                </File>
+-->
diff --git a/libs/libvpx/build/make/gen_msvs_sln.sh b/libs/libvpx/build/make/gen_msvs_sln.sh
new file mode 100755
index 0000000000..664b404c91
--- /dev/null
+++ b/libs/libvpx/build/make/gen_msvs_sln.sh
@@ -0,0 +1,327 @@
+#!/bin/bash
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+self=$0
+self_basename=${self##*/}
+EOL=$'\n'
+EOLDOS=$'\r'
+
+show_help() {  # Print the usage text on stdout, then terminate the script.
+    cat <<EOF
+Usage: ${self_basename} [options] file1 [file2 ...]
+
+This script generates a Visual Studio solution file from a list of project
+files.
+
+Options:
+    --help                      Print this message
+    --out=outfile               Redirect output to a file
+    --ver=version               Version (7,8,9,10,11,12,14) of visual studio to generate for
+    --target=isa-os-cc          Target specifier
+EOF
+    exit 1  # NOTE(review): --help exits non-zero as well; confirm intended
+}
+
+die() {  # Print "<script>: <message>" on stderr, drop partial output, exit 1.
+    echo "${self_basename}: $@" >&2
+    [ -f "${outfile}" ] && rm -f "${outfile}"{,.mk}  # quoted: path may hold spaces
+    exit 1
+}
+
+die_unknown(){  # $1: the unrecognised option; report it, clean up, exit 1.
+    echo "Unknown option \"$1\"." >&2
+    echo "See ${self_basename} --help for available options." >&2
+    [ -f "${outfile}" ] && rm -f "${outfile}"{,.mk}  # quoted: path may hold spaces
+    exit 1
+}
+
+# Indentation state for the generated text: "indent" is the current prefix
+# and "indent1" is the unit (one tab) added or removed per nesting level.
+indent=""
+indent1=$'\t'
+# Go one nesting level deeper.
+indent_push() { indent=${indent}${indent1}; }
+# Return to the enclosing nesting level (no-op when already at level 0).
+indent_pop() { indent=${indent%"${indent1}"}; }
+
+parse_project() {  # $1: project file path; records its metadata in globals.
+    local file=$1
+    if [ "$sfx" = "vcproj" ]; then
+        local name=`grep Name "$file" | awk 'BEGIN {FS="\""}{if (NR==1) print $2}'`
+        local guid=`grep ProjectGUID "$file" | awk 'BEGIN {FS="\""}{if (NR==1) print $2}'`
+    else
+        local name=`grep RootNamespace "$file" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
+        local guid=`grep ProjectGuid "$file" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
+    fi
+    # Save file, name and GUID in variables named after the basename of the
+    # project file without its extension (<base>_file, <base>_name, ...).
+    # "\$x" defers expansion to eval so values are stored literally.
+    local var
+    var=${file##*/}
+    var=${var%%.${sfx}}
+    eval "${var}_file=\$file"
+    eval "${var}_name=\$name"
+    eval "${var}_guid=\$guid"
+    # Configurations ("Config|Platform") declared by this project file.
+    if [ "$sfx" = "vcproj" ]; then
+        cur_config_list=`grep -A1 '<Configuration' "$file" |
+            grep Name | cut -d\" -f2`
+    else
+        cur_config_list=`grep -B1 'Label="Configuration"' "$file" |
+            grep Condition | cut -d\' -f4`
+    fi
+    new_config_list=$(for i in $config_list $cur_config_list; do
+        echo $i
+    done | sort | uniq)
+    if [ "$config_list" != "" ] && [ "$config_list" != "$new_config_list" ]; then
+        mixed_platforms=1  # projects do not all share one configuration set
+    fi
+    config_list="$new_config_list"
+    eval "${var}_config_list=\$cur_config_list"
+    proj_list="${proj_list} ${var}"
+}
+
+process_project() {  # $1: variable prefix of a project recorded earlier.
+    eval "local file=\${$1_file}"
+    eval "local name=\${$1_name}"
+    eval "local guid=\${$1_guid}"
+    # Save the project GUID to a variable named after the basename of the
+    # project file without its extension. "\$guid" defers the expansion to
+    # eval, so the value is stored literally instead of being re-parsed.
+    local var
+    var=${file##*/}
+    var=${var%%.${sfx}}
+    eval "${var}_guid=\$guid"
+
+    echo "Project(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"$name\", \"$file\", \"$guid\""
+    indent_push
+
+    eval "local deps=\"\${${var}_deps}\""
+    if [ -n "$deps" ] && [ "$sfx" = "vcproj" ]; then
+        echo "${indent}ProjectSection(ProjectDependencies) = postProject"
+        indent_push
+        # One "GUID = GUID" line per dependency; an unknown name is fatal.
+        for dep in $deps; do
+            eval "local dep_guid=\${${dep}_guid}"
+            [ -z "${dep_guid}" ] && die "Unknown GUID for $dep (dependency of $var)"
+            echo "${indent}$dep_guid = $dep_guid"
+        done
+
+        indent_pop
+        echo "${indent}EndProjectSection"
+
+    fi
+
+    indent_pop
+    echo "EndProject"
+}
+
+process_global() {
+    # Everything between "Global" and "EndGlobal" is indented one level.
+    echo "Global"
+    indent_push
+    #
+    # Section 1: configuration|platform pairs of the solution itself
+    #
+    echo "${indent}GlobalSection(SolutionConfigurationPlatforms) = preSolution"
+    indent_push
+    IFS_bak=${IFS}
+    IFS=$'\r\n'
+    if [ -n "$mixed_platforms" ]; then
+        config_list="
+Release|Mixed Platforms
+Debug|Mixed Platforms"
+    fi
+    for config in ${config_list}; do
+        echo "${indent}${config} = ${config}"
+    done
+    IFS=${IFS_bak}
+    indent_pop
+    echo "${indent}EndGlobalSection"
+
+    #
+    # Section 2: how each project configuration maps onto the solution
+    #
+    echo "${indent}GlobalSection(ProjectConfigurationPlatforms) = postSolution"
+    indent_push
+    for proj in ${proj_list}; do
+        eval "local this_guid=\${${proj}_guid}"
+        eval "local this_configs=\${${proj}_config_list}"
+        IFS=$'\r\n'
+        for config in ${this_configs}; do
+            # Solution-side key: the project's own "Config|Platform", or
+            # "Config|Mixed Platforms" when mixed_platforms is set.
+            local sln_key="${config}"
+            if [ -n "$mixed_platforms" ]; then
+                sln_key="${config%%|*}|Mixed Platforms"
+            fi
+            echo "${indent}${this_guid}.${sln_key}.ActiveCfg = ${config}"
+            echo "${indent}${this_guid}.${sln_key}.Build.0 = ${config}"
+
+        done
+        IFS=${IFS_bak}
+    done
+    indent_pop
+    echo "${indent}EndGlobalSection"
+
+    #
+    # Section 3: fixed solution properties
+    #
+    echo "${indent}GlobalSection(SolutionProperties) = preSolution"
+    indent_push
+    echo "${indent}HideSolutionNode = FALSE"
+    indent_pop
+    echo "${indent}EndGlobalSection"
+
+    indent_pop
+    echo "EndGlobal"
+}
+
+process_makefile() {  # Print a makefile fragment that drives the solution build.
+    IFS_bak=${IFS}
+    IFS=$'\r'$'\n'  # split config_list on line ends only, entries may hold spaces
+    local TAB=$'\t'  # recipe lines in the emitted makefile must start with a tab
+    cat <<EOF
+ifeq (\$(CONFIG_VS_VERSION),7)
+MSBUILD_TOOL := devenv.com
+else
+MSBUILD_TOOL := msbuild.exe
+endif
+found_devenv := \$(shell which \$(MSBUILD_TOOL) >/dev/null 2>&1 && echo yes)
+.nodevenv.once:
+${TAB}@echo "  * \$(MSBUILD_TOOL) not found in path."
+${TAB}@echo "  * "
+${TAB}@echo "  * You will have to build all configurations manually using the"
+${TAB}@echo "  * Visual Studio IDE. To allow make to build them automatically,"
+${TAB}@echo "  * add the Common7/IDE directory of your Visual Studio"
+${TAB}@echo "  * installation to your path, eg:"
+${TAB}@echo "  *   C:\Program Files\Microsoft Visual Studio 8\Common7\IDE"
+${TAB}@echo "  * "
+${TAB}@touch \$@
+CLEAN-OBJS += \$(if \$(found_devenv),,.nodevenv.once)
+
+EOF
+    # One phony build target and one clean rule per solution configuration.
+    for sln_config in ${config_list}; do
+        local config=${sln_config%%|*}  # text before the first '|'
+        local platform=${sln_config##*|}  # text after the last '|'
+        local nows_sln_config=`echo $sln_config | sed -e 's/[^a-zA-Z0-9]/_/g'`
+        cat <<EOF
+BUILD_TARGETS += \$(if \$(NO_LAUNCH_DEVENV),,$nows_sln_config)
+clean::
+${TAB}rm -rf "$platform"/"$config"
+.PHONY: $nows_sln_config
+ifneq (\$(found_devenv),)
+  ifeq (\$(CONFIG_VS_VERSION),7)
+$nows_sln_config: $outfile
+${TAB}\$(MSBUILD_TOOL) $outfile -build "$config"
+
+  else
+$nows_sln_config: $outfile
+${TAB}\$(MSBUILD_TOOL) $outfile -m -t:Build \\
+${TAB}${TAB}-p:Configuration="$config" -p:Platform="$platform"
+
+  endif
+else
+$nows_sln_config: $outfile .nodevenv.once
+${TAB}@echo "  * Skipping build of $sln_config (\$(MSBUILD_TOOL) not in path)."
+${TAB}@echo "  * "
+endif
+
+EOF
+    done
+    IFS=${IFS_bak}
+}
+
+# Process command line
+#
+# Recognised arguments:
+#   --help, -h     print the usage text and exit
+#   --out=FILE     write the solution to FILE and the makefile to FILE.mk
+#   --dep=A:B      append project B to the dependency list of project A
+#   --ver=N        Visual Studio version; one of 7 8 9 10 11 12 14
+#   --target=T     target specifier
+#   -anything      any other dash-prefixed argument is rejected
+#   FILE           everything else is collected as a project file
+# case runs only the first matching arm, so each option has exactly one arm.
+# Messages passed to die / die_unknown are quoted to reach them verbatim.
+outfile=/dev/stdout
+for opt in "$@"; do
+    optval="${opt#*=}"
+    case "$opt" in
+    --help|-h) show_help
+    ;;
+    --out=*) outfile="${optval}"; mkoutfile="${optval}".mk
+    ;;
+    --dep=*) eval "${optval%%:*}_deps=\"\${${optval%%:*}_deps} ${optval##*:}\""
+    ;;
+    --ver=*) vs_ver="$optval"
+             case $optval in
+             [789]|10|11|12|14)
+             ;;
+             *) die "Unrecognized Visual Studio Version in $opt"
+             ;;
+             esac
+    ;;
+    --target=*) target="${optval}"
+    ;;
+    -*) die_unknown "$opt"
+    ;;
+    *) file_list[${#file_list[@]}]="$opt"
+    esac
+done
+mkoutfile=${mkoutfile:-/dev/stdout}
+outfile=${outfile:-/dev/stdout}
+# Per-version solution header values and project-file suffix (default: 8).
+case "${vs_ver:-8}" in
+    7)
+        sln_vers="8.00"; sln_vers_str="Visual Studio .NET 2003"
+        sfx=vcproj
+        ;;
+    8)
+        sln_vers="9.00"; sln_vers_str="Visual Studio 2005"
+        sfx=vcproj
+        ;;
+    9)
+        sln_vers="10.00"; sln_vers_str="Visual Studio 2008"
+        sfx=vcproj
+        ;;
+    10)
+        sln_vers="11.00"; sln_vers_str="Visual Studio 2010"
+        sfx=vcxproj
+        ;;
+    11)
+        sln_vers="12.00"; sln_vers_str="Visual Studio 2012"
+        sfx=vcxproj
+        ;;
+    12)
+        sln_vers="12.00"; sln_vers_str="Visual Studio 2013"
+        sfx=vcxproj
+        ;;
+    14)
+        sln_vers="14.00"; sln_vers_str="Visual Studio 2015"
+        sfx=vcxproj
+        ;;
+esac
+
+for f in "${file_list[@]}"; do
+    parse_project "$f"  # quoted: a project path may contain spaces
+done
+cat  >"${outfile}" <<EOF
+Microsoft Visual Studio Solution File, Format Version $sln_vers${EOLDOS}
+# $sln_vers_str${EOLDOS}
+EOF
+for proj in ${proj_list}; do
+    process_project "$proj" >>"${outfile}"
+done
+process_global >>"${outfile}"
+process_makefile >"${mkoutfile}"
diff --git a/libs/libvpx/build/make/gen_msvs_vcxproj.sh b/libs/libvpx/build/make/gen_msvs_vcxproj.sh
new file mode 100755
index 0000000000..182ea28fa7
--- /dev/null
+++ b/libs/libvpx/build/make/gen_msvs_vcxproj.sh
@@ -0,0 +1,490 @@
+#!/bin/bash
+##
+##  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+self=$0
+self_basename=${self##*/}
+self_dirname=$(dirname "$0")
+
+. "$self_dirname/msvs_common.sh"|| exit 127
+
+show_help() {  # Print the usage text on stdout, then terminate the script.
+    cat <<EOF
+Usage: ${self_basename} --name=projname [options] file1 [file2 ...]
+
+This script generates a Visual Studio project file from a list of source
+code files.
+
+Options:
+    --help                      Print this message
+    --exe                       Generate a project for building an Application
+    --lib                       Generate a project for creating a static library
+    --dll                       Generate a project for creating a dll
+    --static-crt                Use the static C runtime (/MT)
+    --enable-werror             Treat warnings as errors (/WX)
+    --target=isa-os-cc          Target specifier (required)
+    --out=filename              Write output to a file [stdout]
+    --name=project_name         Name of the project (required)
+    --proj-guid=GUID            GUID to use for the project
+    --module-def=filename       File containing export definitions (for DLLs)
+    --ver=version               Version (10,11,12,14) of visual studio to generate for
+    --src-path-bare=dir         Path to root of source tree
+    -Ipath/to/include           Additional include directories
+    -DFLAG[=value]              Preprocessor macros to define
+    -Lpath/to/lib               Additional library search paths
+    -llibname                   Library to link against
+EOF
+    exit 1  # NOTE(review): --help exits non-zero as well; confirm intended
+}
+
+# tag_content TAG CONTENT [ATTR...]: print <TAG>CONTENT</TAG> at the current
+# indent; attributes, if any, go through tag_attributes after indent_push.
+tag_content() {
+    local tag=$1 content=$2
+    shift; shift
+    if [ $# -eq 0 ]; then
+        echo "${indent}<${tag}>${content}</${tag}>"
+        return
+    fi
+    echo "${indent}<${tag}"
+    indent_push
+    tag_attributes "$@"
+    echo "${indent}>${content}</${tag}>"
+    indent_pop
+}
+
+generate_filter() {  # $1: label for the log line, $2: ';'-separated extensions
+    local name=$1
+    local pats=$2
+    local file_list_sz  # assigned below but never read
+    local i
+    local f
+    local saveIFS="$IFS"
+    local pack  # unused
+    echo "generating filter '$name' from ${#file_list[@]} files" >&2
+    IFS=*
+    # With IFS set to '*', ${pats//;/$IFS} below splits into one word per extension.
+    file_list_sz=${#file_list[@]}
+    for i in ${!file_list[@]}; do
+        f=${file_list[i]}
+        for pat in ${pats//;/$IFS}; do
+            if [ "${f##*.}" == "$pat" ]; then
+                unset file_list[i]
+                # Object name: drop src_path_bare and leading dots/slashes, then map ':', '/', ' ' to '_'.
+                objf=$(echo ${f%.*}.obj \
+                       | sed -e "s,$src_path_bare,," \
+                             -e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
+                # Emit the item element that matches the file extension.
+                if ([ "$pat" == "asm" ] || [ "$pat" == "s" ]) && $asm_use_custom_step; then
+                    # Avoid object file name collisions, i.e. vpx_config.c and
+                    # vpx_config.asm produce the same object file without
+                    # this additional suffix.
+                    objf=${objf%.obj}_asm.obj
+                    open_tag CustomBuild \
+                        Include="$f"
+                    for plat in "${platforms[@]}"; do
+                        for cfg in Debug Release; do
+                            tag_content Message "Assembling %(Filename)%(Extension)" \
+                                Condition="'\$(Configuration)|\$(Platform)'=='$cfg|$plat'"
+                            tag_content Command "$(eval echo \$asm_${cfg}_cmdline) -o \$(IntDir)$objf" \
+                                Condition="'\$(Configuration)|\$(Platform)'=='$cfg|$plat'"
+                            tag_content Outputs "\$(IntDir)$objf" \
+                                Condition="'\$(Configuration)|\$(Platform)'=='$cfg|$plat'"
+                        done
+                    done
+                    close_tag CustomBuild
+                elif [ "$pat" == "c" ] || \
+                     [ "$pat" == "cc" ] || [ "$pat" == "cpp" ]; then
+                    open_tag ClCompile \
+                        Include="$f"
+                    # Separate file names with Condition?
+                    tag_content ObjectFileName "\$(IntDir)$objf"
+                    # Check for AVX and turn it on to avoid warnings.
+                    if [[ $f =~ avx.?\.c$ ]]; then
+                        tag_content AdditionalOptions "/arch:AVX"
+                    fi
+                    close_tag ClCompile
+                elif [ "$pat" == "h" ] ; then
+                    tag ClInclude \
+                        Include="$f"
+                elif [ "$pat" == "vcxproj" ] ; then
+                    open_tag ProjectReference \
+                        Include="$f"
+                    depguid=`grep ProjectGuid "$f" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
+                    tag_content Project "$depguid"
+                    tag_content ReferenceOutputAssembly false
+                    close_tag ProjectReference
+                else
+                    tag None \
+                        Include="$f"
+                fi
+                # A file is claimed by the first extension that matches it.
+                break
+            fi
+        done
+    done
+
+    IFS="$saveIFS"
+}
+
+# Process command line; messages for die / die_unknown are passed quoted.
+unset target
+for opt in "$@"; do
+    optval="${opt#*=}"
+    case "$opt" in
+        --help|-h) show_help
+        ;;
+        --target=*) target="${optval}"
+        ;;
+        --out=*) outfile="$optval"
+        ;;
+        --name=*) name="${optval}"
+        ;;
+        --proj-guid=*) guid="${optval}"
+        ;;
+        --module-def=*) module_def="${optval}"
+        ;;
+        --exe) proj_kind="exe"
+        ;;
+        --dll) proj_kind="dll"
+        ;;
+        --lib) proj_kind="lib"
+        ;;
+        --src-path-bare=*)
+            src_path_bare=$(fix_path "$optval")
+            src_path_bare=${src_path_bare%/}
+        ;;
+        --static-crt) use_static_runtime=true
+        ;;
+        --enable-werror) werror=true
+        ;;
+        --ver=*)
+            vs_ver="$optval"
+            case "$optval" in
+                10|11|12|14)
+                ;;
+                *) die "Unrecognized Visual Studio Version in $opt"
+                ;;
+            esac
+        ;;
+        -I*)
+            opt=${opt##-I}
+            opt=$(fix_path "$opt")
+            opt="${opt%/}"
+            incs="${incs}${incs:+;}&quot;${opt}&quot;"
+            yasmincs="${yasmincs} -I&quot;${opt}&quot;"
+        ;;
+        -D*) defines="${defines}${defines:+;}${opt##-D}"
+        ;;
+        -L*) # fudge . to $(OutDir)
+            if [ "${opt##-L}" == "." ]; then
+                libdirs="${libdirs}${libdirs:+;}&quot;\$(OutDir)&quot;"
+            else
+                 # Also try directories for this platform/configuration
+                 opt=${opt##-L}
+                 opt=$(fix_path "$opt")
+                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}&quot;"
+                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}/\$(PlatformName)/\$(Configuration)&quot;"
+                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}/\$(PlatformName)&quot;"
+            fi
+        ;;
+        -l*) libs="${libs}${libs:+ }${opt##-l}.lib"
+        ;;
+        -*) die_unknown "$opt"
+        ;;
+        *)
+            # The paths in file_list are fixed outside of the loop.
+            file_list[${#file_list[@]}]="$opt"
+            case "$opt" in
+                 *.asm|*.s) uses_asm=true
+                 ;;
+            esac
+        ;;
+    esac
+done
+
+# Make one call to fix_path for file_list to improve performance.
+fix_file_list
+
+# Defaults for anything the command line left unset.
+uses_asm=${uses_asm:-false}
+guid=${guid:-$(generate_uuid)}
+outfile=${outfile:-/dev/stdout}
+# The custom assembly build step follows uses_asm for versions 10/11/12/14.
+asm_use_custom_step=false
+case "${vs_ver:-11}" in
+    10|11|12|14) asm_use_custom_step=$uses_asm ;;
+esac
+
+if [ -z "$name" ]; then die "Project name (--name) must be specified!"; fi
+if [ -z "$target" ]; then die "Target (--target) must be specified!"; fi
+# CRT selection: --static-crt picks the static runtime and the "mt" suffix.
+if ! ${use_static_runtime:-false}; then
+    lib_sfx=md
+    release_runtime=MultiThreadedDLL
+    debug_runtime=MultiThreadedDebugDLL
+else
+    lib_sfx=mt
+    release_runtime=MultiThreaded
+    debug_runtime=MultiThreadedDebug
+fi
+
+# Build the debug link list: a library named *${lib_sfx}.lib is replaced by
+# *${lib_sfx}d.lib, anything else is passed through unchanged. Both lists
+# are then converted from space-separated to ';'-separated form.
+for lib in ${libs}; do
+    case "$lib" in
+        *"${lib_sfx}.lib") lib="${lib%.lib}d.lib" ;;
+    esac
+    debug_libs="${debug_libs:+${debug_libs} }${lib}"
+done
+libs=${libs// /;}
+debug_libs=${debug_libs// /;}
+
+
+# Platform list and per-configuration assembler command lines for the target.
+# The x86_64* arm must stay ahead of x86*, which would also match it.
+case "$target" in
+    x86_64*)
+        asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
+        platforms[0]="x64"
+        ;;
+    x86*)
+        asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
+        platforms[0]="Win32"
+        ;;
+    arm*)
+        asm_Release_cmdline="armasm -nologo -oldit &quot;%(FullPath)&quot;"
+        asm_Debug_cmdline="armasm -nologo -oldit &quot;%(FullPath)&quot;"
+        platforms[0]="ARM"
+        ;;
+    *) die "Unsupported target $target!" ;;
+esac
+
+generate_vcxproj() {
+    echo "<?xml version=\"1.0\" encoding=\"utf-8\"?>"
+    open_tag Project \
+        DefaultTargets="Build" \
+        ToolsVersion="4.0" \
+        xmlns="http://schemas.microsoft.com/developer/msbuild/2003" \
+
+    open_tag ItemGroup \
+        Label="ProjectConfigurations"
+    for plat in "${platforms[@]}"; do
+        for config in Debug Release; do
+            open_tag ProjectConfiguration \
+                Include="$config|$plat"
+            tag_content Configuration $config
+            tag_content Platform $plat
+            close_tag ProjectConfiguration
+        done
+    done
+    close_tag ItemGroup
+
+    open_tag PropertyGroup \
+        Label="Globals"
+        tag_content ProjectGuid "{${guid}}"
+        tag_content RootNamespace ${name}
+        tag_content Keyword ManagedCProj
+        if [ $vs_ver -ge 12 ] && [ "${platforms[0]}" = "ARM" ]; then
+            tag_content AppContainerApplication true
+            # The application type can be one of "Windows Store",
+            # "Windows Phone" or "Windows Phone Silverlight". The
+            # actual value doesn't matter from the libvpx point of view,
+            # since a static library built for one works on the others.
+            # The PlatformToolset field needs to be set in sync with this;
+            # for Windows Store and Windows Phone Silverlight it should be
+            # v120 while it should be v120_wp81 if the type is Windows Phone.
+            tag_content ApplicationType "Windows Store"
+            tag_content ApplicationTypeRevision 8.1
+        fi
+    close_tag PropertyGroup
+
+    tag Import \
+        Project="\$(VCTargetsPath)\\Microsoft.Cpp.Default.props"
+
+    for plat in "${platforms[@]}"; do
+        for config in Release Debug; do
+            open_tag PropertyGroup \
+                Condition="'\$(Configuration)|\$(Platform)'=='$config|$plat'" \
+                Label="Configuration"
+            if [ "$proj_kind" = "exe" ]; then
+                tag_content ConfigurationType Application
+            elif [ "$proj_kind" = "dll" ]; then
+                tag_content ConfigurationType DynamicLibrary
+            else
+                tag_content ConfigurationType StaticLibrary
+            fi
+            if [ "$vs_ver" = "11" ]; then
+                if [ "$plat" = "ARM" ]; then
+                    # Setting the wp80 toolchain automatically sets the
+                    # WINAPI_FAMILY define, which is required for building
+                    # code for arm with the windows headers. Alternatively,
+                    # one could add AppContainerApplication=true in the Globals
+                    # section and add PrecompiledHeader=NotUsing and
+                    # CompileAsWinRT=false in ClCompile and SubSystem=Console
+                    # in Link.
+                    tag_content PlatformToolset v110_wp80
+                else
+                    tag_content PlatformToolset v110
+                fi
+            fi
+            if [ "$vs_ver" = "12" ]; then
+                # Setting a PlatformToolset indicating windows phone isn't
+                # enough to build code for arm with MSVC 2013, one strictly
+                # has to enable AppContainerApplication as well.
+                tag_content PlatformToolset v120
+            fi
+            if [ "$vs_ver" = "14" ]; then
+                tag_content PlatformToolset v140
+            fi
+            tag_content CharacterSet Unicode
+            if [ "$config" = "Release" ]; then
+                tag_content WholeProgramOptimization true
+            fi
+            close_tag PropertyGroup
+        done
+    done
+
+    tag Import \
+        Project="\$(VCTargetsPath)\\Microsoft.Cpp.props"
+
+    open_tag ImportGroup \
+        Label="PropertySheets"
+        tag Import \
+            Project="\$(UserRootDir)\\Microsoft.Cpp.\$(Platform).user.props" \
+            Condition="exists('\$(UserRootDir)\\Microsoft.Cpp.\$(Platform).user.props')" \
+            Label="LocalAppDataPlatform"
+    close_tag ImportGroup
+
+    tag PropertyGroup \
+        Label="UserMacros"
+
+    for plat in "${platforms[@]}"; do
+        plat_no_ws=`echo $plat | sed 's/[^A-Za-z0-9_]/_/g'`
+        for config in Debug Release; do
+            open_tag PropertyGroup \
+                Condition="'\$(Configuration)|\$(Platform)'=='$config|$plat'"
+            tag_content OutDir "\$(SolutionDir)$plat_no_ws\\\$(Configuration)\\"
+            tag_content IntDir "$plat_no_ws\\\$(Configuration)\\${name}\\"
+            if [ "$proj_kind" == "lib" ]; then
+              if [ "$config" == "Debug" ]; then
+                config_suffix=d
+              else
+                config_suffix=""
+              fi
+              tag_content TargetName "${name}${lib_sfx}${config_suffix}"
+            fi
+            close_tag PropertyGroup
+        done
+    done
+
+    for plat in "${platforms[@]}"; do
+        for config in Debug Release; do
+            open_tag ItemDefinitionGroup \
+                Condition="'\$(Configuration)|\$(Platform)'=='$config|$plat'"
+            if [ "$name" == "vpx" ]; then
+                hostplat=$plat
+                if [ "$hostplat" == "ARM" ]; then
+                    hostplat=Win32
+                fi
+            fi
+            open_tag ClCompile
+            if [ "$config" = "Debug" ]; then
+                opt=Disabled
+                runtime=$debug_runtime
+                curlibs=$debug_libs
+                debug=_DEBUG
+            else
+                opt=MaxSpeed
+                runtime=$release_runtime
+                curlibs=$libs
+                tag_content FavorSizeOrSpeed Speed
+                debug=NDEBUG
+            fi
+            extradefines=";$defines"
+            tag_content Optimization $opt
+            tag_content AdditionalIncludeDirectories "$incs;%(AdditionalIncludeDirectories)"
+            tag_content PreprocessorDefinitions "WIN32;$debug;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE$extradefines;%(PreprocessorDefinitions)"
+            tag_content RuntimeLibrary $runtime
+            tag_content WarningLevel Level3
+            if ${werror:-false}; then
+                tag_content TreatWarningAsError true
+            fi
+            if [ $vs_ver -ge 11 ]; then
+                # We need to override the defaults for these settings
+                # if AppContainerApplication is set.
+                tag_content CompileAsWinRT false
+                tag_content PrecompiledHeader NotUsing
+                tag_content SDLCheck false
+            fi
+            close_tag ClCompile
+            case "$proj_kind" in
+            exe)
+                open_tag Link
+                tag_content GenerateDebugInformation true
+                # Console is the default normally, but if
+                # AppContainerApplication is set, we need to override it.
+                tag_content SubSystem Console
+                close_tag Link
+                ;;
+            dll)
+                open_tag Link
+                tag_content GenerateDebugInformation true
+                tag_content ModuleDefinitionFile $module_def
+                close_tag Link
+                ;;
+            lib)
+                ;;
+            esac
+            close_tag ItemDefinitionGroup
+        done
+
+    done
+
+    open_tag ItemGroup
+    generate_filter "Source Files"   "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s"
+    close_tag ItemGroup
+    open_tag ItemGroup
+    generate_filter "Header Files"   "h;hm;inl;inc;xsd"
+    close_tag ItemGroup
+    open_tag ItemGroup
+    generate_filter "Build Files"    "mk"
+    close_tag ItemGroup
+    open_tag ItemGroup
+    generate_filter "References"     "vcxproj"
+    close_tag ItemGroup
+
+    tag Import \
+        Project="\$(VCTargetsPath)\\Microsoft.Cpp.targets"
+
+    open_tag ImportGroup \
+        Label="ExtensionTargets"
+    close_tag ImportGroup
+
+    close_tag Project
+
+    # This must be done from within the {} subshell
+    echo "Ignored files list (${#file_list[@]} items) is:" >&2
+    for f in "${file_list[@]}"; do
+        echo "    $f" >&2
+    done
+}
+
+# Convert '/' to '\' on every line that contains a double quote (most vcxproj
+# strings are <tag>path</tag> rather than attr="path", so few lines match; it
+# still seems to work), then turn '\' back into '/' on lines with xmlns.
+generate_vcxproj |
+    sed  -e '/"/s;\([^ "]\)/;\1\\;g' |
+    sed  -e '/xmlns/s;\\;/;g' > "${outfile}"
+
+exit
diff --git a/libs/libvpx/build/make/iosbuild.sh b/libs/libvpx/build/make/iosbuild.sh
new file mode 100755
index 0000000000..ae5ba182d5
--- /dev/null
+++ b/libs/libvpx/build/make/iosbuild.sh
@@ -0,0 +1,302 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##
+## This script generates 'VPX.framework'. An iOS app can encode and decode VPx
+## video by including 'VPX.framework'.
+##
+## Run iosbuild.sh to create 'VPX.framework' in the current directory.
+##
+set -e
+devnull='> /dev/null 2>&1'
+
+BUILD_ROOT="_iosbuild"
+CONFIGURE_ARGS="--disable-docs
+                --disable-examples
+                --disable-libyuv
+                --disable-unit-tests"
+DIST_DIR="_dist"
+FRAMEWORK_DIR="VPX.framework"
+HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
+SCRIPT_DIR=$(dirname "$0")
+LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
+LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
+ORIG_PWD="$(pwd)"
+ARM_TARGETS="arm64-darwin-gcc
+             armv7-darwin-gcc
+             armv7s-darwin-gcc"
+SIM_TARGETS="x86-iphonesimulator-gcc
+             x86_64-iphonesimulator-gcc"
+OSX_TARGETS="x86-darwin15-gcc
+             x86_64-darwin15-gcc"
+TARGETS="${ARM_TARGETS} ${SIM_TARGETS}"
+
+# Builds one libvpx target: $1 names the target, which is configured in a new
+# subdirectory of the same name and then built with "make dist" (DIST_DIR set).
+build_target() {
+  local tgt="$1"
+  local start_dir="$(pwd)"
+  local pic_flag=""
+
+  vlog "***Building target: ${tgt}***"
+
+  # Only targets whose name begins with "x86-" are configured with PIC.
+  if [ "${tgt#x86-}" != "${tgt}" ]; then
+    pic_flag="--enable-pic"
+    vlog "Enabled PIC for ${tgt}"
+  fi
+
+  # Configure and build inside a fresh directory named after the target.
+  mkdir "${tgt}"
+  cd "${tgt}"
+  eval "${LIBVPX_SOURCE_DIR}/configure" --target="${tgt}" \
+    ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${pic_flag} \
+    ${devnull}
+  export DIST_DIR
+  eval make dist ${devnull}
+  cd "${start_dir}"
+
+  vlog "***Done building target: ${tgt}***"
+}
+
+# Prints the preprocessor symbol for the target in $1; unknown targets return 1.
+target_to_preproc_symbol() {
+  local target="$1"  # local, so a caller's own ${target} is never overwritten
+  case "${target}" in
+    arm64-*)
+      echo "__aarch64__"
+      ;;
+    armv7-*)
+      echo "__ARM_ARCH_7A__"
+      ;;
+    armv7s-*)
+      echo "__ARM_ARCH_7S__"
+      ;;
+    x86-*)
+      echo "__i386__"
+      ;;
+    x86_64-*)
+      echo "__x86_64__"
+      ;;
+    *)
+      echo "#error ${target} unknown/unsupported"
+      return 1
+      ;;
+  esac
+}
+
+# Writes ${HEADER_DIR}/vpx_config.h, a shim that uses the CPU's preprocessor
+# symbol to include the matching per-target vpx_config.h; $1 lists the targets.
+# Names are passed to printf as arguments so '%' or '\' in them stay literal.
+create_vpx_framework_config_shim() {
+  local targets="$1"
+  local config_file="${HEADER_DIR}/vpx_config.h"
+  local preproc_symbol=""
+  local target=""
+  local include_guard="VPX_FRAMEWORK_HEADERS_VPX_VPX_CONFIG_H_"
+
+  local file_header="/*
+ *  Copyright (c) $(date +%Y) The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* GENERATED FILE: DO NOT EDIT! */
+
+#ifndef ${include_guard}
+#define ${include_guard}
+
+#if defined"
+
+  printf "%s" "${file_header}" > "${config_file}"
+  for target in ${targets}; do
+    preproc_symbol=$(target_to_preproc_symbol "${target}")
+    printf " %s\n" "${preproc_symbol}" >> "${config_file}"
+    printf "#define VPX_FRAMEWORK_TARGET \"%s\"\n" "${target}" >> "${config_file}"
+    printf "#include \"VPX/vpx/%s/vpx_config.h\"\n" "${target}" >> "${config_file}"
+    printf "#elif defined" >> "${config_file}"
+    mkdir "${HEADER_DIR}/${target}"
+    cp -p "${BUILD_ROOT}/${target}/vpx_config.h" "${HEADER_DIR}/${target}"
+  done
+
+  # Drop the dangling "#elif defined" left by the last loop iteration.
+  sed -i '' -e '$d' "${config_file}"
+
+  printf "#endif\n\n" >> "${config_file}"
+  printf "#endif  // %s" "${include_guard}" >> "${config_file}"
+}
+
+# Builds every target listed in $1 under ${BUILD_ROOT}, then assembles
+# VPX.framework (headers, fat library, config shim) in the starting directory.
+build_framework() {
+  local targets="$1"
+  local tgt=""
+  local dist_path=""
+  local fat_inputs=""
+
+  # Start from scratch: discard whatever an earlier run left behind.
+  rm -rf "${BUILD_ROOT}" "${FRAMEWORK_DIR}"
+
+  # Recreate the build tree and the framework's header directory.
+  mkdir -p "${BUILD_ROOT}"
+  mkdir -p "${HEADER_DIR}"
+
+  cd "${BUILD_ROOT}"
+
+  for tgt in ${targets}; do
+    build_target "${tgt}"
+    dist_path="${BUILD_ROOT}/${tgt}/${DIST_DIR}"
+    fat_inputs="${fat_inputs} ${dist_path}/lib/libvpx.a"
+  done
+
+  cd "${ORIG_PWD}"
+
+  # Every target installs the same public API headers, so the ones from the
+  # last target built are copied.
+  cp -p "${dist_path}"/include/vpx/* "${HEADER_DIR}"
+
+  # Merge the per-target static libraries into one fat library.
+  ${LIPO} -create ${fat_inputs} -output ${FRAMEWORK_DIR}/VPX
+
+  # Generate the vpx_config.h shim so that vpx_config.h can be used from
+  # inside VPX.framework.
+  create_vpx_framework_config_shim "${targets}"
+
+  # vpx_version.h is taken from the last target's build directory.
+  cp -p "${BUILD_ROOT}/${tgt}/vpx_version.h" "${HEADER_DIR}"
+
+  vlog "Created fat library ${FRAMEWORK_DIR}/VPX containing:"
+  for lib in ${fat_inputs}; do
+    vlog "  $(echo ${lib} | awk -F / '{print $2, $NF}')"
+  done
+
+  # TODO(tomfinegan): Verify that expected targets are included within
+  # VPX.framework/VPX via lipo -info.
+}
+
+# EXIT trap handler: reports a non-zero exit status through elog and removes
+# ${BUILD_ROOT} unless PRESERVE_BUILD_OUTPUT is "yes".
+cleanup() {
+  local res=$?  # first statement, so $? is still the status being exited with
+  cd "${ORIG_PWD}"
+
+  if [ "${res}" -ne 0 ]; then
+    elog "build exited with error (${res})"
+  fi
+  # BUILD_ROOT is relative, hence the cd back to the starting directory above.
+  if [ "${PRESERVE_BUILD_OUTPUT}" != "yes" ]; then
+    rm -rf "${BUILD_ROOT}"
+  fi
+}
+
+print_list() {  # prints each word after $1 on its own line, prefixed with $1
+  local indent="$1"
+  shift
+  local list="$*"  # "$*" stays one word; "$@" is split where local is a plain builtin
+  for entry in ${list}; do
+    echo "${indent}${entry}"
+  done
+}
+
+iosbuild_usage() {
+cat << EOF
+  Usage: ${0##*/} [arguments]
+    --help: Display this message and exit.
+    --extra-configure-args <args>: Extra args to pass when configuring libvpx.
+    --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86
+              and x86_64. Allows linking to framework when builds target MacOSX
+              instead of iOS.
+    --preserve-build-output: Do not delete the build directory.
+    --show-build-output: Show output from each library build.
+    --targets <targets>: Override default target list. Defaults:
+$(print_list "        " ${TARGETS})
+    --test-link: Confirms all targets can be linked. Functionally identical to
+                 passing --enable-examples via --extra-configure-args.
+    --verbose: Output information about the environment and each stage of the
+               build.
+EOF
+}
+
+elog() {
+  echo "${0##*/} failed because: $@" 1>&2
+}
+
+vlog() {
+  if [ "${VERBOSE}" = "yes" ]; then
+    echo "$@"
+  fi
+}
+
+trap cleanup EXIT
+
+# Parse the command line.
+while [ -n "$1" ]; do
+  case "$1" in
+    --extra-configure-args)
+      EXTRA_CONFIGURE_ARGS="$2"
+      shift
+      ;;
+    --help)
+      iosbuild_usage
+      exit
+      ;;
+    --preserve-build-output)
+      PRESERVE_BUILD_OUTPUT=yes
+      ;;
+    --show-build-output)
+      devnull=
+      ;;
+    --test-link)
+      EXTRA_CONFIGURE_ARGS="${EXTRA_CONFIGURE_ARGS} --enable-examples"
+      ;;
+    --targets)
+      TARGETS="$2"
+      shift
+      ;;
+    --macosx)
+      TARGETS="${ARM_TARGETS} ${OSX_TARGETS}"
+      ;;
+    --verbose)
+      VERBOSE=yes
+      ;;
+    *)
+      iosbuild_usage
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+if [ "${VERBOSE}" = "yes" ]; then
+cat << EOF
+  BUILD_ROOT=${BUILD_ROOT}
+  DIST_DIR=${DIST_DIR}
+  CONFIGURE_ARGS=${CONFIGURE_ARGS}
+  EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS}
+  FRAMEWORK_DIR=${FRAMEWORK_DIR}
+  HEADER_DIR=${HEADER_DIR}
+  LIBVPX_SOURCE_DIR=${LIBVPX_SOURCE_DIR}
+  LIPO=${LIPO}
+  MAKEFLAGS=${MAKEFLAGS}
+  ORIG_PWD=${ORIG_PWD}
+  PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
+  TARGETS="$(print_list "" ${TARGETS})"
+  OSX_TARGETS="${OSX_TARGETS}"
+  SIM_TARGETS="${SIM_TARGETS}"
+EOF
+fi
+
+build_framework "${TARGETS}"
+echo "Successfully built '${FRAMEWORK_DIR}' for:"
+print_list "" ${TARGETS}
diff --git a/libs/libvpx/build/make/msvs_common.sh b/libs/libvpx/build/make/msvs_common.sh
new file mode 100644
index 0000000000..90c14888c2
--- /dev/null
+++ b/libs/libvpx/build/make/msvs_common.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+if [ "$(uname -o 2>/dev/null)" = "Cygwin" ] \
+   && cygpath --help >/dev/null 2>&1; then
+    FIXPATH='cygpath -m'
+else
+    FIXPATH='echo_path'
+fi
+
+die() {
+    echo "${self_basename}: $@" >&2
+    exit 1
+}
+
+die_unknown(){
+    echo "Unknown option \"$1\"." >&2
+    echo "See ${self_basename} --help for available options." >&2
+    exit 1
+}
+
+echo_path() {
+    for path; do
+        echo "$path"
+    done
+}
+
+# Output one, possibly changed based on the system, path per line.
+fix_path() {
+    $FIXPATH "$@"
+}
+
+# Corrects the paths in file_list in one pass for efficiency.
+fix_file_list() {
+    # TODO(jzern): this could be more generic and take the array as a param.
+    files=$(fix_path "${file_list[@]}")
+    local IFS=$'\n'
+    file_list=($files)
+}
+
+generate_uuid() {
+    # Prints 32 random uppercase hex digits grouped 8-4-4-4-12, in the style of
+    # 93995380-89BD-4b04-88EB-625FBE52EBFB.
+    local hex="0123456789ABCDEF"
+    local i j uuid=""
+    for ((i=0; i<32; i++)); do
+        # Plain assignment: "(( j = ... ))" returns status 1 when the draw is 0.
+        j=$(( RANDOM % 16 ))
+        uuid="${uuid}${hex:$j:1}"
+    done
+    echo "${uuid:0:8}-${uuid:8:4}-${uuid:12:4}-${uuid:16:4}-${uuid:20:12}"
+}
+
+indent1="    "
+indent=""
+indent_push() {
+    indent="${indent}${indent1}"
+}
+indent_pop() {
+    indent="${indent%${indent1}}"
+}
+
+tag_attributes() {
+    local opt  # NAME=VALUE, printed as NAME="VALUE"; '=' plus a value is required
+    for opt in "$@"; do
+        [[ "${opt}" == *=?* ]] ||
+            die "Missing attribute value in '$opt' while generating $tag tag"
+        echo "${indent}${opt%%=*}=\"${opt#*=}\""
+    done
+}
+
+open_tag() {
+    local tag=$1
+    shift
+    if [ $# -ne 0 ]; then
+        echo "${indent}<${tag}"
+        indent_push
+        tag_attributes "$@"
+        echo "${indent}>"
+    else
+        echo "${indent}<${tag}>"
+        indent_push
+    fi
+}
+
+close_tag() {
+    local tag=$1
+    indent_pop
+    echo "${indent}</${tag}>"
+}
+
+tag() {
+    local tag=$1
+    shift
+    if [ $# -ne 0 ]; then
+        echo "${indent}<${tag}"
+        indent_push
+        tag_attributes "$@"
+        indent_pop
+        echo "${indent}/>"
+    else
+        echo "${indent}<${tag}/>"
+    fi
+}
+
diff --git a/libs/libvpx/build/make/rtcd.pl b/libs/libvpx/build/make/rtcd.pl
new file mode 100755
index 0000000000..991b6abe7d
--- /dev/null
+++ b/libs/libvpx/build/make/rtcd.pl
@@ -0,0 +1,426 @@
+#!/usr/bin/env perl
+
+no strict 'refs';
+use warnings;
+use Getopt::Long;
+Getopt::Long::Configure("auto_help") if $Getopt::Long::VERSION > 2.32;
+
+my %ALL_FUNCS = ();
+my @ALL_ARCHS;
+my @ALL_FORWARD_DECLS;
+my @REQUIRES;
+
+my %opts = ();
+my %disabled = ();
+my %required = ();
+
+my @argv;
+foreach (@ARGV) {
+  $disabled{$1} = 1, next if /--disable-(.*)/;
+  $required{$1} = 1, next if /--require-(.*)/;
+  push @argv, $_;
+}
+
+# NB: use GetOptions() instead of GetOptionsFromArray() for compatibility.
+@ARGV = @argv;
+GetOptions(
+  \%opts,
+  'arch=s',
+  'sym=s',
+  'config=s',
+);
+
+foreach my $opt (qw/arch config/) {
+  if (!defined($opts{$opt})) {
+    warn "--$opt is required!\n";
+    Getopt::Long::HelpMessage('-exit' => 1);
+  }
+}
+
+foreach my $defs_file (@ARGV) {
+  if (!-f $defs_file) {
+    warn "$defs_file: $!\n";
+    Getopt::Long::HelpMessage('-exit' => 1);
+  }
+}
+
+open CONFIG_FILE, '<', $opts{config} or
+  die "Error opening config file '$opts{config}': $!\n";
+# Record every CONFIG_*/HAVE_* line as name => value for vpx_config().
+my %config = ();
+while (<CONFIG_FILE>) {
+  next if !/^(?:CONFIG_|HAVE_)/;
+  chomp;
+  my @pair = split /=/, $_, 2;  # limit 2: a value may itself contain '='
+  $config{$pair[0]} = $pair[1];
+}
+close CONFIG_FILE;
+
+#
+# Routines for the RTCD DSL to call
+#
+sub vpx_config($) {
+  return (defined $config{$_[0]}) ? $config{$_[0]} : "";
+}
+
+sub specialize {
+  my $fn=$_[0];
+  shift;
+  foreach my $opt (@_) {
+    eval "\$${fn}_${opt}=${fn}_${opt}";
+  }
+}
+
+sub add_proto {
+  my $fn = splice(@_, -2, 1);
+  $ALL_FUNCS{$fn} = \@_;
+  specialize $fn, "c";
+}
+
+sub require {
+  foreach my $fn (keys %ALL_FUNCS) {
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+
+      # if we already have a default, then we can disable it, as we know
+      # we can do better.
+      my $best = eval "\$${fn}_default";
+      if ($best) {
+        my $best_ofn = eval "\$${best}";
+        if ($best_ofn && "$best_ofn" ne "$ofn") {
+          eval "\$${best}_link = 'false'";
+        }
+      }
+      eval "\$${fn}_default=${fn}_${opt}";
+      eval "\$${fn}_${opt}_link='true'";
+    }
+  }
+}
+
+sub forward_decls {
+  push @ALL_FORWARD_DECLS, @_;
+}
+
+#
+# Include the user's directives
+#
+foreach my $f (@ARGV) {
+  open FILE, "<", $f or die "cannot open $f: $!\n";
+  my $contents = join('', <FILE>);
+  close FILE;
+  eval $contents or warn "eval failed: $@\n";
+}
+
+#
+# Process the directives according to the command line
+#
+sub process_forward_decls() {
+  foreach (@ALL_FORWARD_DECLS) {
+    $_->();
+  }
+}
+
+sub determine_indirection {
+  vpx_config("CONFIG_RUNTIME_CPU_DETECT") eq "yes" or &require(@ALL_ARCHS);
+  foreach my $fn (keys %ALL_FUNCS) {
+    my $n = "";
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+      my $link = eval "\$${fn}_${opt}_link";
+      next if $link && $link eq "false";
+      $n .= "x";
+    }
+    if ($n eq "x") {
+      eval "\$${fn}_indirect = 'false'";
+    } else {
+      eval "\$${fn}_indirect = 'true'";
+    }
+  }
+}
+
+sub declare_function_pointers {
+  foreach my $fn (sort keys %ALL_FUNCS) {
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+      print "$rtyp ${ofn}($args);\n";
+    }
+    if (eval "\$${fn}_indirect" eq "false") {
+      print "#define ${fn} ${dfn}\n";
+    } else {
+      print "RTCD_EXTERN $rtyp (*${fn})($args);\n";
+    }
+    print "\n";
+  }
+}
+
+sub set_function_pointers {
+  foreach my $fn (sort keys %ALL_FUNCS) {
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    if (eval "\$${fn}_indirect" eq "true") {
+      print "    $fn = $dfn;\n";
+      foreach my $opt (@_) {
+        my $ofn = eval "\$${fn}_${opt}";
+        next if !$ofn;
+        next if "$ofn" eq "$dfn";
+        my $link = eval "\$${fn}_${opt}_link";
+        next if $link && $link eq "false";
+        my $cond = eval "\$have_${opt}";
+        print "    if (${cond}) $fn = $ofn;\n"
+      }
+    }
+  }
+}
+
+sub filter {
+  # Returns the arguments, in order, minus every extension that was switched
+  # off on the command line with --disable-EXT (recorded in %disabled).
+  return grep { !$disabled{$_} } @_;
+}
+
+#
+# Helper functions for generating the arch specific RTCD files
+#
+sub common_top() {
+  my $include_guard = uc($opts{sym})."_H_";
+  print <<EOF;
+#ifndef ${include_guard}
+#define ${include_guard}
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+EOF
+
+process_forward_decls();
+print <<EOF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+EOF
+declare_function_pointers("c", @ALL_ARCHS);
+
+print <<EOF;
+void $opts{sym}(void);
+
+EOF
+}
+
+sub common_bottom() {
+  print <<EOF;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
+EOF
+}
+
+sub x86() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <<EOF;
+#ifdef RTCD_C
+#include "vpx_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = x86_simd_caps();
+
+    (void)flags;
+
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
+sub arm() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    # Enable neon assembly based on HAVE_NEON logic instead of adding new
+    # HAVE_NEON_ASM logic
+    if ($opt eq 'neon_asm') { $opt_uc = 'NEON' }
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <<EOF;
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
+sub mips() {
+  determine_indirection("c", @ALL_ARCHS);
+  common_top;
+
+  print <<EOF;
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+EOF
+  common_bottom;
+}
+
+sub unoptimized() {
+  determine_indirection "c";
+  common_top;
+  print <<EOF;
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+EOF
+
+  set_function_pointers "c";
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
+#
+# Main Driver
+#
+
+&require("c");  # make the plain C variant the initial default of every function
+if ($opts{arch} eq 'x86') {
+  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+  x86;
+} elsif ($opts{arch} eq 'x86_64') {
+  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+  @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/);
+  &require(@REQUIRES);
+  x86;
+} elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') {
+  @ALL_ARCHS = filter("$opts{arch}");
+  open CONFIG_FILE, '<', $opts{config} or
+    die "Error opening config file '$opts{config}': $!\n";
+  while (<CONFIG_FILE>) {  # the first matching line in the file decides
+    if (/HAVE_DSPR2=yes/) {
+      @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
+      last;
+    }
+    if (/HAVE_MSA=yes/) {
+      @ALL_ARCHS = filter("$opts{arch}", qw/msa/);
+      last;
+    }
+  }
+  close CONFIG_FILE;
+  mips;
+} elsif ($opts{arch} eq 'armv6') {
+  @ALL_ARCHS = filter(qw/media/);
+  arm;
+} elsif ($opts{arch} =~ /armv7\w?/) {
+  @ALL_ARCHS = filter(qw/media neon_asm neon/);
+  @REQUIRES = filter(keys %required ? keys %required : qw/media/);
+  &require(@REQUIRES);
+  arm;
+} elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
+  @ALL_ARCHS = filter(qw/neon/);
+  arm;
+} else {
+  unoptimized;  # any other --arch value gets the C-only header
+}
+
+__END__
+
+=head1 NAME
+
+rtcd -
+
+=head1 SYNOPSIS
+
+Usage: rtcd.pl [options] FILE
+
+See 'perldoc rtcd.pl' for more details.
+
+=head1 DESCRIPTION
+
+Reads the Run Time CPU Detections definitions from FILE and generates a
+C header file on stdout.
+
+=head1 OPTIONS
+
+Options:
+  --arch=ARCH       Architecture to generate defs for (required)
+  --disable-EXT     Disable support for EXT extensions
+  --require-EXT     Require support for EXT extensions
+  --sym=SYMBOL      Unique symbol to use for RTCD initialization function
+  --config=FILE     File with CONFIG_FOO=yes lines to parse
diff --git a/libs/libvpx/build/make/thumb.pm b/libs/libvpx/build/make/thumb.pm
new file mode 100644
index 0000000000..483c2539c6
--- /dev/null
+++ b/libs/libvpx/build/make/thumb.pm
@@ -0,0 +1,70 @@
+#!/usr/bin/env perl
+##
+##  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+package thumb;
+
+sub FixThumbInstructions($$)
+{
+    my $short_branches = $_[1];  # only the second argument is read; the text edited is $_
+    my $branch_shift_offset = $short_branches ? 1 : 0;
+
+    # Write additions with shifts, such as "add r10, r11, lsl #8",
+    # in three operand form, "add r10, r10, r11, lsl #8".
+    s/(add\s+)(r\d+),\s*(r\d+),\s*(lsl #\d+)/$1$2, $2, $3, $4/g;
+
+    # Convert additions with a non-constant shift into a sequence
+    # with left shift, addition and a right shift (to restore the
+    # register to the original value). Currently the right shift
+    # isn't necessary in the code base since the values in these
+    # registers aren't used, but doing the shift for consistency.
+    # This converts instructions such as "add r12, r12, r5, lsl r4"
+    # into the sequence "lsl r5, r4", "add r12, r12, r5", "lsr r5, r4".
+    s/^(\s*)(add)(\s+)(r\d+),\s*(r\d+),\s*(r\d+),\s*lsl (r\d+)/$1lsl$3$6, $7\n$1$2$3$4, $5, $6\n$1lsr$3$6, $7/g;
+
+    # Convert loads with right shifts in the indexing into a
+    # sequence of an add, load and sub. This converts
+    # "ldrb r4, [r9, lr, asr #1]" into "add r9, r9, lr, asr #1",
+    # "ldrb r4, [r9]", "sub r9, r9, lr, asr #1".
+    s/^(\s*)(ldrb)(\s+)(r\d+),\s*\[(\w+),\s*(\w+),\s*(asr #\d+)\]/$1add $3$5, $5, $6, $7\n$1$2$3$4, [$5]\n$1sub $3$5, $5, $6, $7/g;
+
+    # Convert register indexing with writeback into a separate add
+    # instruction. This converts "ldrb r12, [r1, r2]!" into
+    # "ldrb r12, [r1, r2]", "add r1, r2" (two operand form).
+    s/^(\s*)(ldrb)(\s+)(r\d+),\s*\[(\w+),\s*(\w+)\]!/$1$2$3$4, [$5, $6]\n$1add $3$5, $6/g;
+
+    # Convert negative register indexing into separate sub/add instructions.
+    # This converts "ldrne r4, [src, -pstep, lsl #1]" into
+    # "subne src, src, pstep, lsl #1", "ldrne r4, [src]",
+    # "addne src, src, pstep, lsl #1". In a couple of cases where
+    # this is used, it's used for two subsequent load instructions,
+    # where a hand-written version of it could merge two subsequent
+    # add and sub instructions.
+    s/^(\s*)((ldr|str|pld)(ne)?)(\s+)(r\d+,\s*)?\[(\w+), -([^\]]+)\]/$1sub$4$5$7, $7, $8\n$1$2$5$6\[$7\]\n$1add$4$5$7, $7, $8/g;
+
+    # Convert register post indexing to a separate add instruction.
+    # This converts "ldrneb r9, [r0], r2" into "ldrneb r9, [r0]",
+    # "addne r0, r0, r2".
+    s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g;
+
+    # Convert a conditional addition to the pc register into a series of
+    # instructions. This converts "addlt pc, pc, r3, lsl #2" into
+    # "itttt lt", "movlt.n r12, pc", "addlt.w r12, #12",
+    # "addlt.w r12, r12, r3, lsl #2", "movlt.n pc, r12".
+    # This assumes that r12 is free at this point.
+    s/^(\s*)addlt(\s+)pc,\s*pc,\s*(\w+),\s*lsl\s*#(\d+)/$1itttt$2lt\n$1movlt.n$2r12, pc\n$1addlt.w$2r12, #12\n$1addlt.w$2r12, r12, $3, lsl #($4-$branch_shift_offset)\n$1movlt.n$2pc, r12/g;
+
+    # Convert "mov pc, lr" into "bx lr", since the former only works
+    # for switching from arm to thumb (and only in armv7), but not
+    # from thumb to arm.
+    s/mov(\s*)pc\s*,\s*lr/bx$1lr/g;
+}
+
+1;
diff --git a/libs/libvpx/build/make/version.sh b/libs/libvpx/build/make/version.sh
new file mode 100755
index 0000000000..b340142c93
--- /dev/null
+++ b/libs/libvpx/build/make/version.sh
@@ -0,0 +1,76 @@
+#!/bin/sh
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+
+for opt in "$@"; do
+    optval="${opt#*=}"
+    case "$opt" in
+    --bare) bare=true ;;
+    *) break ;;
+    esac
+    shift
+done
+source_path=${1:-.}
+out_file=${2}
+id=${3:-VERSION_STRING}
+
+git_version_id=""
+if [ -d "${source_path}/.git" ]; then
+    # Source Path is a git working copy. Check for local modifications.
+    export GIT_DIR="${source_path}/.git"
+    git_version_id=`git describe --match=v[0-9]* 2>/dev/null`
+fi
+
+changelog_version=""
+for p in "${source_path}" "${source_path}/.."; do
+    if [ -z "$git_version_id" -a -f "${p}/CHANGELOG" ]; then
+        changelog_version=`head -n1 "${p}/CHANGELOG" | awk '{print $2}'`
+        changelog_version="${changelog_version}"
+        break
+    fi
+done
+version_str="${changelog_version}${git_version_id}"
+bare_version=${version_str#v}
+major_version=${bare_version%%.*}
+bare_version=${bare_version#*.}
+minor_version=${bare_version%%.*}
+bare_version=${bare_version#*.}
+patch_version=${bare_version%%-*}
+bare_version=${bare_version#${patch_version}}
+extra_version=${bare_version##-}
+
+#since they'll be used as integers below make sure they are or force to 0
+for v in major_version minor_version patch_version; do
+    if eval echo \$$v |grep -E -q '[^[:digit:]]'; then
+        eval $v=0
+    fi
+done
+
+if [ -n "${bare}" ]; then  # --bare: emit only the version string
+    echo "${changelog_version}${git_version_id}" > $$.tmp
+else  # otherwise emit the block of C #define lines
+    cat<<EOF>$$.tmp
+#define VERSION_MAJOR  $major_version
+#define VERSION_MINOR  $minor_version
+#define VERSION_PATCH  $patch_version
+#define VERSION_EXTRA  "$extra_version"
+#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
+#define ${id}_NOSP "${version_str}"
+#define ${id}      " ${version_str}"
+EOF
+fi
+if [ -n "$out_file" ]; then  # rewrite out_file only when its content differs
+diff $$.tmp "${out_file}" >/dev/null 2>&1 || cat $$.tmp > "${out_file}"
+else
+cat $$.tmp
+fi
+rm $$.tmp
diff --git a/libs/libvpx/codereview.settings b/libs/libvpx/codereview.settings
new file mode 100644
index 0000000000..d7c8d395cb
--- /dev/null
+++ b/libs/libvpx/codereview.settings
@@ -0,0 +1,4 @@
+# This file is used by gcl to get repository specific information.
+GERRIT_HOST: chromium-review.googlesource.com
+GERRIT_PORT: 29418
+CODE_REVIEW_SERVER: chromium-review.googlesource.com
diff --git a/libs/libvpx/configure b/libs/libvpx/configure
new file mode 100755
index 0000000000..095cddf2db
--- /dev/null
+++ b/libs/libvpx/configure
@@ -0,0 +1,749 @@
+#!/bin/sh
+##
+##  configure
+##
+##  This script is the front-end to the build system. It provides a similar
+##  interface to standard configure scripts with some extra bits for dealing
+##  with toolchains that differ from the standard POSIX interface and
+##  for extracting subsets of the source tree. In theory, reusable parts
+##  of this script were intended to live in build/make/configure.sh,
+##  but in practice, the line is pretty blurry.
+##
+##  This build system is based in part on the FFmpeg configure script.
+##
+
+#source_path="`dirname \"$0\"`"
+source_path=${0%/*}
+. "${source_path}/build/make/configure.sh"
+
+show_help(){
+    show_help_pre
+    cat << EOF
+Advanced options:
+  ${toggle_libs}                  libraries
+  ${toggle_examples}              examples
+  ${toggle_docs}                  documentation
+  ${toggle_unit_tests}            unit tests
+  ${toggle_decode_perf_tests}     build decoder perf tests with unit tests
+  ${toggle_encode_perf_tests}     build encoder perf tests with unit tests
+  --cpu=CPU                       tune for the specified CPU (ARM: cortex-a8, X86: sse3)
+  --libc=PATH                     path to alternate libc
+  --size-limit=WxH                max size to allow in the decoder
+  --as={yasm|nasm|auto}           use specified assembler [auto, yasm preferred]
+  --sdk-path=PATH                 path to root of sdk (android builds only)
+  ${toggle_codec_srcs}            in/exclude codec library source code
+  ${toggle_debug_libs}            in/exclude debug version of libraries
+  ${toggle_static_msvcrt}         use static MSVCRT (VS builds only)
+  ${toggle_vp9_highbitdepth}      use VP9 high bit depth (10/12) profiles
+  ${toggle_better_hw_compatibility}
+                                  enable encoder to produce streams with better
+                                  hardware decoder compatibility
+  ${toggle_vp8}                   VP8 codec support
+  ${toggle_vp9}                   VP9 codec support
+  ${toggle_vp10}                  VP10 codec support
+  ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
+  ${toggle_postproc}              postprocessing
+  ${toggle_vp9_postproc}          vp9 specific postprocessing
+  ${toggle_multithread}           multithreaded encoding and decoding
+  ${toggle_spatial_resampling}    spatial sampling (scaling) support
+  ${toggle_realtime_only}         enable this option while building for real-time encoding
+  ${toggle_onthefly_bitpacking}   enable on-the-fly bitpacking in real-time encoding
+  ${toggle_error_concealment}     enable this option to get a decoder which is able to conceal losses
+  ${toggle_coefficient_range_checking}
+                                  enable decoder to check if intermediate
+                                  transform coefficients are in valid range
+  ${toggle_runtime_cpu_detect}    runtime cpu detection
+  ${toggle_shared}                shared library support
+  ${toggle_static}                static library support
+  ${toggle_small}                 favor smaller size over speed
+  ${toggle_postproc_visualizer}   macro block / block level visualizers
+  ${toggle_multi_res_encoding}    enable multiple-resolution encoding
+  ${toggle_temporal_denoising}    enable temporal denoising and disable the spatial denoiser
+  ${toggle_vp9_temporal_denoising}
+                                  enable vp9 temporal denoising
+  ${toggle_webm_io}               enable input from and output to WebM container
+  ${toggle_libyuv}                enable libyuv
+
+Codecs:
+  Codecs can be selectively enabled or disabled individually, or by family:
+      --disable-<codec>
+  is equivalent to:
+      --disable-<codec>-encoder
+      --disable-<codec>-decoder
+
+  Codecs available in this distribution:
+EOF
+#restore editor state '
+
+    family="";
+    last_family="";
+    c="";
+    str="";
+    for c in ${CODECS}; do
+        family=${c%_*}
+        if [ "${family}" != "${last_family}" ]; then
+            [ -z "${str}" ] || echo "${str}"
+            str="$(printf '    %10s:' ${family})"
+        fi
+        str="${str} $(printf '%10s' ${c#*_})"
+        last_family=${family}
+    done
+    echo "${str}"
+    show_help_post
+}
+
+##
+## BEGIN APPLICATION SPECIFIC CONFIGURATION
+##
+
+# all_platforms is a list of all supported target platforms. Maintain
+# alphabetically by architecture, generic-gnu last.
+all_platforms="${all_platforms} armv6-darwin-gcc"
+all_platforms="${all_platforms} armv6-linux-rvct"
+all_platforms="${all_platforms} armv6-linux-gcc"
+all_platforms="${all_platforms} armv6-none-rvct"
+all_platforms="${all_platforms} arm64-darwin-gcc"
+all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
+all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
+all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
+all_platforms="${all_platforms} armv7-linux-gcc"     #neon Cortex-A8
+all_platforms="${all_platforms} armv7-none-rvct"     #neon Cortex-A8
+all_platforms="${all_platforms} armv7-win32-vs11"
+all_platforms="${all_platforms} armv7-win32-vs12"
+all_platforms="${all_platforms} armv7-win32-vs14"
+all_platforms="${all_platforms} armv7s-darwin-gcc"
+all_platforms="${all_platforms} mips32-linux-gcc"
+all_platforms="${all_platforms} mips64-linux-gcc"
+all_platforms="${all_platforms} sparc-solaris-gcc"
+all_platforms="${all_platforms} x86-android-gcc"
+all_platforms="${all_platforms} x86-darwin8-gcc"
+all_platforms="${all_platforms} x86-darwin8-icc"
+all_platforms="${all_platforms} x86-darwin9-gcc"
+all_platforms="${all_platforms} x86-darwin9-icc"
+all_platforms="${all_platforms} x86-darwin10-gcc"
+all_platforms="${all_platforms} x86-darwin11-gcc"
+all_platforms="${all_platforms} x86-darwin12-gcc"
+all_platforms="${all_platforms} x86-darwin13-gcc"
+all_platforms="${all_platforms} x86-darwin14-gcc"
+all_platforms="${all_platforms} x86-darwin15-gcc"
+all_platforms="${all_platforms} x86-iphonesimulator-gcc"
+all_platforms="${all_platforms} x86-linux-gcc"
+all_platforms="${all_platforms} x86-linux-icc"
+all_platforms="${all_platforms} x86-os2-gcc"
+all_platforms="${all_platforms} x86-solaris-gcc"
+all_platforms="${all_platforms} x86-win32-gcc"
+all_platforms="${all_platforms} x86-win32-vs7"
+all_platforms="${all_platforms} x86-win32-vs8"
+all_platforms="${all_platforms} x86-win32-vs9"
+all_platforms="${all_platforms} x86-win32-vs10"
+all_platforms="${all_platforms} x86-win32-vs11"
+all_platforms="${all_platforms} x86-win32-vs12"
+all_platforms="${all_platforms} x86-win32-vs14"
+all_platforms="${all_platforms} x86_64-android-gcc"
+all_platforms="${all_platforms} x86_64-darwin9-gcc"
+all_platforms="${all_platforms} x86_64-darwin10-gcc"
+all_platforms="${all_platforms} x86_64-darwin11-gcc"
+all_platforms="${all_platforms} x86_64-darwin12-gcc"
+all_platforms="${all_platforms} x86_64-darwin13-gcc"
+all_platforms="${all_platforms} x86_64-darwin14-gcc"
+all_platforms="${all_platforms} x86_64-darwin15-gcc"
+all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
+all_platforms="${all_platforms} x86_64-linux-gcc"
+all_platforms="${all_platforms} x86_64-linux-icc"
+all_platforms="${all_platforms} x86_64-solaris-gcc"
+all_platforms="${all_platforms} x86_64-win64-gcc"
+all_platforms="${all_platforms} x86_64-win64-vs8"
+all_platforms="${all_platforms} x86_64-win64-vs9"
+all_platforms="${all_platforms} x86_64-win64-vs10"
+all_platforms="${all_platforms} x86_64-win64-vs11"
+all_platforms="${all_platforms} x86_64-win64-vs12"
+all_platforms="${all_platforms} x86_64-win64-vs14"
+all_platforms="${all_platforms} generic-gnu"
+
+# all_targets is a list of all targets that can be configured
+# note that these should be in dependency order for now.
+all_targets="libs examples docs"
+
+# all targets available are enabled, by default.
+for t in ${all_targets}; do
+    [ -f "${source_path}/${t}.mk" ] && enable_feature ${t}
+done
+
+if ! perl --version >/dev/null; then
+    die "Perl is required to build"
+fi
+
+
+if [ "`cd \"${source_path}\" && pwd`" != "`pwd`" ]; then
+  # test to see if source_path already configured
+  if [ -f "${source_path}/vpx_config.h" ]; then
+    die "source directory already configured; run 'make distclean' there first"
+  fi
+fi
+
+# Probe the installed doxygen; the doxygen feature needs 1.5.3 or newer.
+doxy_version=$(doxygen --version 2>/dev/null)
+doxy_major=${doxy_version%%.*}
+if test ${doxy_major:-0} -ge 1; then
+    doxy_version=${doxy_version#*.}
+    doxy_minor=${doxy_version%%.*}
+    doxy_patch=${doxy_version##*.}
+    if test $doxy_major -gt 1 || test $doxy_minor -gt 5 ||
+       { test $doxy_minor -eq 5 && test $doxy_patch -ge 3; }; then
+        enable_feature doxygen
+    fi
+fi
+
+# disable codecs when their source directory does not exist
+[ -d "${source_path}/vp8" ] || disable_feature vp8
+[ -d "${source_path}/vp9" ] || disable_feature vp9
+[ -d "${source_path}/vp10" ] || disable_feature vp10
+
+# disable vp10 codec by default
+disable_feature vp10
+
+# install everything except the sources, by default. sources will have
+# to be enabled when doing dist builds, since that's no longer a common
+# case.
+enabled doxygen && enable_feature install_docs
+enable_feature install_bins
+enable_feature install_libs
+
+enable_feature static
+enable_feature optimizations
+enable_feature dependency_tracking
+enable_feature spatial_resampling
+enable_feature multithread
+enable_feature os_support
+enable_feature temporal_denoising
+
+CODECS="
+    vp8_encoder
+    vp8_decoder
+    vp9_encoder
+    vp9_decoder
+    vp10_encoder
+    vp10_decoder
+"
+CODEC_FAMILIES="
+    vp8
+    vp9
+    vp10
+"
+
+ARCH_LIST="
+    arm
+    mips
+    x86
+    x86_64
+"
+ARCH_EXT_LIST_X86="
+    mmx
+    sse
+    sse2
+    sse3
+    ssse3
+    sse4_1
+    avx
+    avx2
+"
+ARCH_EXT_LIST="
+    edsp
+    media
+    neon
+    neon_asm
+
+    mips32
+    dspr2
+    msa
+    mips64
+
+    ${ARCH_EXT_LIST_X86}
+"
+HAVE_LIST="
+    ${ARCH_EXT_LIST}
+    vpx_ports
+    pthread_h
+    unistd_h
+"
+EXPERIMENT_LIST="
+    spatial_svc
+    fp_mb_stats
+    emulate_hardware
+    misc_fixes
+"
+CONFIG_LIST="
+    dependency_tracking
+    external_build
+    install_docs
+    install_bins
+    install_libs
+    install_srcs
+    use_x86inc
+    debug
+    gprof
+    gcov
+    rvct
+    gcc
+    msvs
+    pic
+    big_endian
+
+    codec_srcs
+    debug_libs
+
+    dequant_tokens
+    dc_recon
+    runtime_cpu_detect
+    postproc
+    vp9_postproc
+    multithread
+    internal_stats
+    ${CODECS}
+    ${CODEC_FAMILIES}
+    encoders
+    decoders
+    static_msvcrt
+    spatial_resampling
+    realtime_only
+    onthefly_bitpacking
+    error_concealment
+    shared
+    static
+    small
+    postproc_visualizer
+    os_support
+    unit_tests
+    webm_io
+    libyuv
+    decode_perf_tests
+    encode_perf_tests
+    multi_res_encoding
+    temporal_denoising
+    vp9_temporal_denoising
+    coefficient_range_checking
+    vp9_highbitdepth
+    better_hw_compatibility
+    experimental
+    size_limit
+    ${EXPERIMENT_LIST}
+"
+CMDLINE_SELECT="
+    dependency_tracking
+    external_build
+    extra_warnings
+    werror
+    install_docs
+    install_bins
+    install_libs
+    install_srcs
+    debug
+    gprof
+    gcov
+    pic
+    use_x86inc
+    optimizations
+    ccache
+    runtime_cpu_detect
+    thumb
+
+    libs
+    examples
+    docs
+    libc
+    as
+    size_limit
+    codec_srcs
+    debug_libs
+
+    dequant_tokens
+    dc_recon
+    postproc
+    vp9_postproc
+    multithread
+    internal_stats
+    ${CODECS}
+    ${CODEC_FAMILIES}
+    static_msvcrt
+    spatial_resampling
+    realtime_only
+    onthefly_bitpacking
+    error_concealment
+    shared
+    static
+    small
+    postproc_visualizer
+    unit_tests
+    webm_io
+    libyuv
+    decode_perf_tests
+    encode_perf_tests
+    multi_res_encoding
+    temporal_denoising
+    vp9_temporal_denoising
+    coefficient_range_checking
+    better_hw_compatibility
+    vp9_highbitdepth
+    experimental
+"
+# Parse configure arguments; options not handled here go to process_common_cmdline.
+process_cmdline() {
+    for opt do
+        optval="${opt#*=}"
+        case "$opt" in
+        --disable-codecs) for c in ${CODECS}; do disable_feature $c; done ;;
+        --enable-?*|--disable-?*)
+        # Turn e.g. --enable-foo-bar into: action=enable option=foo_bar
+        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
+        if echo "${EXPERIMENT_LIST}" | grep "^ *$option\$" >/dev/null; then
+            if enabled experimental; then
+                ${action}_feature $option
+            else
+                log_echo "Ignoring $opt -- not in experimental mode."
+            fi
+        else
+            process_common_cmdline "$opt"  # quoted, same as the fallback branch
+        fi
+        ;;
+        *) process_common_cmdline "$opt" ;;
+        esac
+    done
+}
+
+post_process_cmdline() {
+    c=""
+    # Reconcile family-level and per-codec switches; the loops below are order-dependent.
+    # If the codec family is disabled, disable all components of that family.
+    # If the codec family is enabled, enable all components of that family.
+    log_echo "Configuring selected codecs"
+    for c in ${CODECS}; do
+        disabled ${c%%_*} && disable_feature ${c}  # ${c%%_*}: family, e.g. vp8 from vp8_encoder
+        enabled ${c%%_*} && enable_feature ${c}
+    done
+
+    # Soft-enable every codec in CODECS (see soft_enable for how explicit disables are treated)
+    for c in ${CODECS}; do soft_enable $c; done
+
+    # Enable the codec family if any component of that family is enabled
+    for c in ${CODECS}; do
+        enabled $c && enable_feature ${c%_*}
+    done
+
+    # Set the {en,de}coders variable if any algorithm in that class is enabled
+    for c in ${CODECS}; do
+        enabled ${c} && enable_feature ${c##*_}s  # ${c##*_}s: "encoders" or "decoders"
+    done
+}
+
+# Emit configuration outputs: calls the common config writers and appends install/dist/version settings to config.mk.
+process_targets() {
+    enabled child || write_common_config_banner
+    write_common_target_config_h ${BUILD_PFX}vpx_config.h
+    write_common_config_targets
+
+    # Derive the default distribution directory name from the enabled features
+    cf=""
+    DIST_DIR=vpx
+    for cf in $CODEC_FAMILIES; do
+        if enabled ${cf}_encoder && enabled ${cf}_decoder; then
+            DIST_DIR="${DIST_DIR}-${cf}"
+        elif enabled ${cf}_encoder; then
+            DIST_DIR="${DIST_DIR}-${cf}cx"
+        elif enabled ${cf}_decoder; then
+            DIST_DIR="${DIST_DIR}-${cf}dx"
+        fi
+    done
+    enabled debug_libs && DIST_DIR="${DIST_DIR}-debug"
+    enabled codec_srcs && DIST_DIR="${DIST_DIR}-src"
+    ! enabled postproc && ! enabled vp9_postproc && DIST_DIR="${DIST_DIR}-nopost"
+    ! enabled multithread && DIST_DIR="${DIST_DIR}-nomt"
+    ! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs"
+    DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}"
+    case "${tgt_os}" in
+    win*) enabled static_msvcrt && DIST_DIR="${DIST_DIR}mt" || DIST_DIR="${DIST_DIR}md"
+          DIST_DIR="${DIST_DIR}-${tgt_cc}"
+          ;;
+    esac
+    if [ -f "${source_path}/build/make/version.sh" ]; then
+        ver=`"$source_path/build/make/version.sh" --bare "$source_path"`
+        DIST_DIR="${DIST_DIR}-${ver}"
+        VERSION_STRING=${ver}
+        ver=${ver%%-*}              # drop everything from the first '-'
+        VERSION_PATCH=${ver##*.}    # text after the last '.'
+        ver=${ver%.*}               # drop the patch component
+        VERSION_MINOR=${ver##*.}
+        ver=${ver#v}                # drop a leading "v"
+        VERSION_MAJOR=${ver%.*}
+    fi
+    enabled child || cat <<EOF >> config.mk
+
+PREFIX=${prefix}
+ifeq (\$(MAKECMDGOALS),dist)
+DIST_DIR?=${DIST_DIR}
+else
+DIST_DIR?=\$(DESTDIR)${prefix}
+endif
+LIBSUBDIR=${libdir##${prefix}/}
+
+VERSION_STRING=${VERSION_STRING}
+
+VERSION_MAJOR=${VERSION_MAJOR}
+VERSION_MINOR=${VERSION_MINOR}
+VERSION_PATCH=${VERSION_PATCH}
+
+CONFIGURE_ARGS=${CONFIGURE_ARGS}
+EOF
+    enabled child || echo "CONFIGURE_ARGS?=${CONFIGURE_ARGS}" >> config.mk
+
+    #
+    # Write makefiles for all enabled targets
+    #
+    for tgt in libs examples docs solution; do
+        tgt_fn="$tgt-$toolchain.mk"
+
+        if enabled $tgt; then
+            echo "Creating makefiles for ${toolchain} ${tgt}"
+            write_common_target_config_mk $tgt_fn ${BUILD_PFX}vpx_config.h
+            #write_${tgt}_config
+        fi
+    done
+
+}
+# Sanity-check the toolchain and probe headers; with no $CC (or external_build) stand-in checks replace the real ones.
+process_detect() {
+    if enabled shared; then
+        # Can only build shared libs on a subset of platforms. Doing this check
+        # here rather than at option parse time because the target auto-detect
+        # magic happens after the command line has been parsed.
+        if ! enabled linux && ! enabled os2; then
+            if enabled gnu; then
+                echo "--enable-shared is only supported on ELF; assuming this is OK"
+            else
+                die "--enable-shared only supported on ELF and OS/2 for now"
+            fi
+        fi
+    fi
+    if [ -z "$CC" ] || enabled external_build; then
+        echo "Bypassing toolchain for environment detection."
+        enable_feature external_build
+        check_header() {  # stand-in: looks for header files, runs no compiler
+            log fake_check_header "$@"
+            header=$1
+            shift
+            var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`  # e.g. pthread.h -> pthread_h
+            disable_feature $var
+            # stdio.h is taken as present; other headers must exist under a given -I dir
+            case $header in
+                stdio.h)
+                    true;
+                ;;
+                *)
+                    result=false
+                    for d in "$@"; do
+                        [ -f "${d##-I}/$header" ] && result=true && break  # ${d##-I}: strip leading -I
+                    done
+                    ${result:-true}
+            esac && enable_feature $var
+
+            # Specialize windows and POSIX environments.
+            case $toolchain in
+                *-win*-*)
+                    # Don't check for any headers in Windows builds.
+                    false
+                ;;
+                *)
+                    case $header in
+                        pthread.h) true;;
+                        unistd.h) true;;
+                        *) false;;
+                    esac && enable_feature $var
+            esac
+            enabled $var
+        }
+        check_ld() {  # stand-in: always reports success
+            true
+        }
+    fi
+    check_header stdio.h || die "Unable to invoke compiler: ${CC} ${CFLAGS}"
+    check_ld <<EOF || die "Toolchain is unable to link executables"
+int main(void) {return 0;}
+EOF
+    # check system headers
+    check_header pthread.h
+    check_header unistd.h # for sysconf(3) and friends.
+
+    check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports
+}
+
+process_toolchain() {
+    process_common_toolchain
+
+    # Enable some useful compiler flags
+    if enabled gcc; then
+        enabled werror && check_add_cflags -Werror
+        check_add_cflags -Wall
+        check_add_cflags -Wdeclaration-after-statement
+        check_add_cflags -Wdisabled-optimization
+        check_add_cflags -Wpointer-arith
+        check_add_cflags -Wtype-limits
+        check_add_cflags -Wcast-qual
+        check_add_cflags -Wvla
+        check_add_cflags -Wimplicit-function-declaration
+        check_add_cflags -Wuninitialized
+        check_add_cflags -Wunused-variable
+        case ${CC} in
+          *clang*)
+              # libvpx and/or clang have issues with aliasing:
+              # https://code.google.com/p/webm/issues/detail?id=603
+              # work around them until they are fixed
+              check_add_cflags -fno-strict-aliasing
+          ;;
+          *) check_add_cflags -Wunused-but-set-variable ;;
+        esac
+        if enabled mips || [ -z "${INLINE}" ]; then
+          enabled extra_warnings || check_add_cflags -Wno-unused-function
+        else
+          check_add_cflags -Wunused-function
+        fi
+    fi
+
+    if enabled icc; then
+        enabled werror && check_add_cflags -Werror
+        check_add_cflags -Wall
+        check_add_cflags -Wpointer-arith
+
+        # ICC has a number of floating point optimizations that we disable
+        # in favor of deterministic output WRT to other compilers
+        add_cflags -fp-model precise
+    fi
+
+    # Enable extra, harmless warnings. These might provide additional insight
+    # into what the compiler is doing and why, but in general they shouldn't
+    # be treated as fatal, even if we're treating warnings as errors.
+    GCC_EXTRA_WARNINGS="
+        -Wdisabled-optimization
+        -Winline
+    "
+    enabled gcc && EXTRA_WARNINGS="${GCC_EXTRA_WARNINGS}"
+    RVCT_EXTRA_WARNINGS="
+        --remarks
+    "
+    enabled rvct && EXTRA_WARNINGS="${RVCT_EXTRA_WARNINGS}"
+    if enabled extra_warnings; then
+        for w in ${EXTRA_WARNINGS}; do
+            check_add_cflags ${w}
+            enabled gcc && enabled werror && check_add_cflags -Wno-error=${w}
+        done
+    fi
+
+    # ccache only really works on gcc toolchains
+    enabled gcc || soft_disable ccache
+    if enabled mips; then
+        enable_feature dequant_tokens
+        enable_feature dc_recon
+    fi
+
+    if enabled internal_stats; then
+        enable_feature vp9_postproc
+    fi
+
+    # Enable the postbuild target if building for visual studio.
+    case "$tgt_cc" in
+        vs*) enable_feature msvs
+             enable_feature solution
+             vs_version=${tgt_cc##vs}
+             case $vs_version in
+             [789])
+                 VCPROJ_SFX=vcproj
+                 gen_vcproj_cmd=${source_path}/build/make/gen_msvs_proj.sh
+                 ;;
+             10|11|12|14)
+                 VCPROJ_SFX=vcxproj
+                 gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
+                 enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror"
+                 ;;
+             esac
+             all_targets="${all_targets} solution"
+             INLINE="__forceinline"
+        ;;
+    esac
+
+    # Other toolchain specific defaults
+    case $toolchain in x86*) soft_enable postproc;; esac
+
+    if enabled postproc_visualizer; then
+        enabled postproc || die "postproc_visualizer requires postproc to be enabled"
+    fi
+
+    # Enable unit tests by default if we have a working C++ compiler.
+    case "$toolchain" in
+        *-vs*)
+            soft_enable unit_tests
+            soft_enable webm_io
+            soft_enable libyuv
+        ;;
+        *-android-*)
+            soft_enable webm_io
+            soft_enable libyuv
+            # GTestLog must be modified to use Android logging utilities.
+        ;;
+        *-darwin-*)
+            # iOS/ARM builds do not work with gtest. This does not match
+            # x86 targets.
+        ;;
+        *-iphonesimulator-*)
+            soft_enable webm_io
+            soft_enable libyuv
+        ;;
+        *-win*)
+            # Some mingw toolchains don't have pthread available by default.
+            # Treat these more like visual studio where threading in gtest
+            # would be disabled for the same reason.
+            check_cxx "$@" <<EOF && soft_enable unit_tests
+int z;
+EOF
+            check_cxx "$@" <<EOF && soft_enable webm_io
+int z;
+EOF
+            check_cxx "$@" <<EOF && soft_enable libyuv
+int z;
+EOF
+        ;;
+        *)
+            enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests
+int z;
+EOF
+            check_cxx "$@" <<EOF && soft_enable webm_io
+int z;
+EOF
+            check_cxx "$@" <<EOF && soft_enable libyuv
+int z;
+EOF
+        ;;
+    esac
+    # libwebm needs to be linked with C++ standard library
+    enabled webm_io && LD=${CXX}
+
+    # append any user defined extra cflags
+    if [ -n "${extra_cflags}" ] ; then
+        check_add_cflags ${extra_cflags} || \
+        die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
+    fi
+    if [ -n "${extra_cxxflags}" ]; then
+        check_add_cxxflags ${extra_cxxflags} || \
+        die "Requested extra CXXFLAGS '${extra_cxxflags}' not supported by compiler"
+    fi
+}
+
+
+##
+## END APPLICATION SPECIFIC CONFIGURATION
+##
+CONFIGURE_ARGS="$@"  # keep the command line; it is embedded in vpx_config.c below
+process "$@"  # NOTE(review): process is not defined in this file; presumably from build/make/configure.sh
+print_webm_license ${BUILD_PFX}vpx_config.c "/*" " */"
+cat <<EOF >> ${BUILD_PFX}vpx_config.c
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "$CONFIGURE_ARGS";
+const char *vpx_codec_build_config(void) {return cfg;}
+EOF
diff --git a/libs/libvpx/docs.mk b/libs/libvpx/docs.mk
new file mode 100644
index 0000000000..889d18251f
--- /dev/null
+++ b/libs/libvpx/docs.mk
@@ -0,0 +1,48 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+INSTALL_MAPS += docs/%    docs/%
+INSTALL_MAPS += src/%     %
+INSTALL_MAPS += %         %
+
+# Static documentation authored in doxygen
+CODEC_DOX :=    mainpage.dox \
+		keywords.dox \
+		usage.dox \
+		usage_cx.dox \
+		usage_dx.dox \
+
+# Other doxy files sourced in Markdown
+TXT_DOX = $(call enabled,TXT_DOX)
+
+EXAMPLE_PATH += $(SRC_PATH_BARE) #for CHANGELOG, README, etc
+EXAMPLE_PATH += $(SRC_PATH_BARE)/examples
+# doxyfile: concatenate the prerequisite .doxy files, then append path and input settings.
+doxyfile: $(if $(findstring examples, $(ALL_TARGETS)),examples.doxy)
+doxyfile: libs.doxy_template libs.doxy
+	@echo "    [CREATE] $@"
+	@cat $^ > $@
+	@echo "STRIP_FROM_PATH += $(SRC_PATH_BARE) $(BUILD_ROOT)" >> $@
+	@echo "INPUT += $(addprefix $(SRC_PATH_BARE)/,$(CODEC_DOX))" >> $@;
+	@echo "INPUT += $(TXT_DOX)" >> $@;
+	@echo "EXAMPLE_PATH += $(EXAMPLE_PATH)" >> $@
+# Register generated files for cleaning; docs/html/index.html is built by running doxygen on doxyfile.
+CLEAN-OBJS += doxyfile $(wildcard docs/html/*)
+docs/html/index.html: doxyfile $(CODEC_DOX) $(TXT_DOX)
+	@echo "    [DOXYGEN] $<"
+	@doxygen $<
+DOCS-yes += docs/html/index.html
+# Documentation files collected in DIST-DOCS-* (these variables are consumed outside this file).
+DIST-DOCS-yes = $(wildcard docs/html/*)
+DIST-DOCS-$(CONFIG_CODEC_SRCS) += $(addprefix src/,$(CODEC_DOX))
+DIST-DOCS-$(CONFIG_CODEC_SRCS) += src/libs.doxy_template
+DIST-DOCS-yes                  += CHANGELOG
+DIST-DOCS-yes                  += README
diff --git a/libs/libvpx/examples.mk b/libs/libvpx/examples.mk
new file mode 100644
index 0000000000..f10bec68c3
--- /dev/null
+++ b/libs/libvpx/examples.mk
@@ -0,0 +1,382 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+LIBYUV_SRCS +=  third_party/libyuv/include/libyuv/basic_types.h  \
+                third_party/libyuv/include/libyuv/convert.h \
+                third_party/libyuv/include/libyuv/convert_argb.h \
+                third_party/libyuv/include/libyuv/convert_from.h \
+                third_party/libyuv/include/libyuv/cpu_id.h  \
+                third_party/libyuv/include/libyuv/planar_functions.h  \
+                third_party/libyuv/include/libyuv/rotate.h  \
+                third_party/libyuv/include/libyuv/row.h  \
+                third_party/libyuv/include/libyuv/scale.h  \
+                third_party/libyuv/include/libyuv/scale_row.h  \
+                third_party/libyuv/source/cpu_id.cc \
+                third_party/libyuv/source/planar_functions.cc \
+                third_party/libyuv/source/row_any.cc \
+                third_party/libyuv/source/row_common.cc \
+                third_party/libyuv/source/row_gcc.cc \
+                third_party/libyuv/source/row_mips.cc \
+                third_party/libyuv/source/row_neon.cc \
+                third_party/libyuv/source/row_neon64.cc \
+                third_party/libyuv/source/row_win.cc \
+                third_party/libyuv/source/scale.cc \
+                third_party/libyuv/source/scale_any.cc \
+                third_party/libyuv/source/scale_common.cc \
+                third_party/libyuv/source/scale_gcc.cc \
+                third_party/libyuv/source/scale_mips.cc \
+                third_party/libyuv/source/scale_neon.cc \
+                third_party/libyuv/source/scale_neon64.cc \
+                third_party/libyuv/source/scale_win.cc \
+
+LIBWEBM_COMMON_SRCS += third_party/libwebm/webmids.hpp
+
+LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
+                      third_party/libwebm/mkvmuxerutil.cpp \
+                      third_party/libwebm/mkvwriter.cpp \
+                      third_party/libwebm/mkvmuxer.hpp \
+                      third_party/libwebm/mkvmuxertypes.hpp \
+                      third_party/libwebm/mkvmuxerutil.hpp \
+                      third_party/libwebm/mkvparser.hpp \
+                      third_party/libwebm/mkvwriter.hpp
+
+LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser.cpp \
+                      third_party/libwebm/mkvreader.cpp \
+                      third_party/libwebm/mkvparser.hpp \
+                      third_party/libwebm/mkvreader.hpp
+
+# List of examples to build. UTILS are tools meant for distribution
+# while EXAMPLES demonstrate specific portions of the API.
+UTILS-$(CONFIG_DECODERS)    += vpxdec.c
+vpxdec.SRCS                 += md5_utils.c md5_utils.h
+vpxdec.SRCS                 += vpx_ports/mem_ops.h
+vpxdec.SRCS                 += vpx_ports/mem_ops_aligned.h
+vpxdec.SRCS                 += vpx_ports/msvc.h
+vpxdec.SRCS                 += vpx_ports/vpx_timer.h
+vpxdec.SRCS                 += vpx/vpx_integer.h
+vpxdec.SRCS                 += args.c args.h
+vpxdec.SRCS                 += ivfdec.c ivfdec.h
+vpxdec.SRCS                 += tools_common.c tools_common.h
+vpxdec.SRCS                 += y4menc.c y4menc.h
+ifeq ($(CONFIG_LIBYUV),yes)
+  vpxdec.SRCS                 += $(LIBYUV_SRCS)
+endif
+ifeq ($(CONFIG_WEBM_IO),yes)
+  vpxdec.SRCS                 += $(LIBWEBM_COMMON_SRCS)
+  vpxdec.SRCS                 += $(LIBWEBM_PARSER_SRCS)
+  vpxdec.SRCS                 += webmdec.cc webmdec.h
+endif
+vpxdec.GUID                  = BA5FE66F-38DD-E034-F542-B1578C5FB950
+vpxdec.DESCRIPTION           = Full featured decoder
+UTILS-$(CONFIG_ENCODERS)    += vpxenc.c
+vpxenc.SRCS                 += args.c args.h y4minput.c y4minput.h vpxenc.h
+vpxenc.SRCS                 += ivfdec.c ivfdec.h
+vpxenc.SRCS                 += ivfenc.c ivfenc.h
+vpxenc.SRCS                 += rate_hist.c rate_hist.h
+vpxenc.SRCS                 += tools_common.c tools_common.h
+vpxenc.SRCS                 += warnings.c warnings.h
+vpxenc.SRCS                 += vpx_ports/mem_ops.h
+vpxenc.SRCS                 += vpx_ports/mem_ops_aligned.h
+vpxenc.SRCS                 += vpx_ports/msvc.h
+vpxenc.SRCS                 += vpx_ports/vpx_timer.h
+vpxenc.SRCS                 += vpxstats.c vpxstats.h
+ifeq ($(CONFIG_LIBYUV),yes)
+  vpxenc.SRCS                 += $(LIBYUV_SRCS)
+endif
+ifeq ($(CONFIG_WEBM_IO),yes)
+  vpxenc.SRCS                 += $(LIBWEBM_COMMON_SRCS)
+  vpxenc.SRCS                 += $(LIBWEBM_MUXER_SRCS)
+  vpxenc.SRCS                 += webmenc.cc webmenc.h
+endif
+vpxenc.GUID                  = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
+vpxenc.DESCRIPTION           = Full featured encoder
+ifeq ($(CONFIG_SPATIAL_SVC),yes)
+  EXAMPLES-$(CONFIG_VP9_ENCODER)      += vp9_spatial_svc_encoder.c
+  vp9_spatial_svc_encoder.SRCS        += args.c args.h
+  vp9_spatial_svc_encoder.SRCS        += ivfenc.c ivfenc.h
+  vp9_spatial_svc_encoder.SRCS        += tools_common.c tools_common.h
+  vp9_spatial_svc_encoder.SRCS        += video_common.h
+  vp9_spatial_svc_encoder.SRCS        += video_writer.h video_writer.c
+  vp9_spatial_svc_encoder.SRCS        += vpx_ports/msvc.h
+  vp9_spatial_svc_encoder.SRCS        += vpxstats.c vpxstats.h
+  vp9_spatial_svc_encoder.GUID        = 4A38598D-627D-4505-9C7B-D4020C84100D
+  vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder
+endif
+
+ifneq ($(CONFIG_SHARED),yes)
+EXAMPLES-$(CONFIG_VP9_ENCODER)    += resize_util.c
+endif
+
+EXAMPLES-$(CONFIG_ENCODERS)          += vpx_temporal_svc_encoder.c
+vpx_temporal_svc_encoder.SRCS        += ivfenc.c ivfenc.h
+vpx_temporal_svc_encoder.SRCS        += tools_common.c tools_common.h
+vpx_temporal_svc_encoder.SRCS        += video_common.h
+vpx_temporal_svc_encoder.SRCS        += video_writer.h video_writer.c
+vpx_temporal_svc_encoder.SRCS        += vpx_ports/msvc.h
+vpx_temporal_svc_encoder.GUID        = B18C08F2-A439-4502-A78E-849BE3D60947
+vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder
+EXAMPLES-$(CONFIG_DECODERS)        += simple_decoder.c
+simple_decoder.GUID                 = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
+simple_decoder.SRCS                += ivfdec.h ivfdec.c
+simple_decoder.SRCS                += tools_common.h tools_common.c
+simple_decoder.SRCS                += video_common.h
+simple_decoder.SRCS                += video_reader.h video_reader.c
+simple_decoder.SRCS                += vpx_ports/mem_ops.h
+simple_decoder.SRCS                += vpx_ports/mem_ops_aligned.h
+simple_decoder.SRCS                += vpx_ports/msvc.h
+simple_decoder.DESCRIPTION          = Simplified decoder loop
+EXAMPLES-$(CONFIG_DECODERS)        += postproc.c
+postproc.SRCS                      += ivfdec.h ivfdec.c
+postproc.SRCS                      += tools_common.h tools_common.c
+postproc.SRCS                      += video_common.h
+postproc.SRCS                      += video_reader.h video_reader.c
+postproc.SRCS                      += vpx_ports/mem_ops.h
+postproc.SRCS                      += vpx_ports/mem_ops_aligned.h
+postproc.SRCS                      += vpx_ports/msvc.h
+postproc.GUID                       = 65E33355-F35E-4088-884D-3FD4905881D7
+postproc.DESCRIPTION                = Decoder postprocessor control
+EXAMPLES-$(CONFIG_DECODERS)        += decode_to_md5.c
+decode_to_md5.SRCS                 += md5_utils.h md5_utils.c
+decode_to_md5.SRCS                 += ivfdec.h ivfdec.c
+decode_to_md5.SRCS                 += tools_common.h tools_common.c
+decode_to_md5.SRCS                 += video_common.h
+decode_to_md5.SRCS                 += video_reader.h video_reader.c
+decode_to_md5.SRCS                 += vpx_ports/mem_ops.h
+decode_to_md5.SRCS                 += vpx_ports/mem_ops_aligned.h
+decode_to_md5.SRCS                 += vpx_ports/msvc.h
+decode_to_md5.GUID                  = 59120B9B-2735-4BFE-B022-146CA340FE42
+decode_to_md5.DESCRIPTION           = Frame by frame MD5 checksum
+EXAMPLES-$(CONFIG_ENCODERS)     += simple_encoder.c
+simple_encoder.SRCS             += ivfenc.h ivfenc.c
+simple_encoder.SRCS             += tools_common.h tools_common.c
+simple_encoder.SRCS             += video_common.h
+simple_encoder.SRCS             += video_writer.h video_writer.c
+simple_encoder.SRCS             += vpx_ports/msvc.h
+simple_encoder.GUID              = 4607D299-8A71-4D2C-9B1D-071899B6FBFD
+simple_encoder.DESCRIPTION       = Simplified encoder loop
+EXAMPLES-$(CONFIG_VP9_ENCODER)  += vp9_lossless_encoder.c
+vp9_lossless_encoder.SRCS       += ivfenc.h ivfenc.c
+vp9_lossless_encoder.SRCS       += tools_common.h tools_common.c
+vp9_lossless_encoder.SRCS       += video_common.h
+vp9_lossless_encoder.SRCS       += video_writer.h video_writer.c
+vp9_lossless_encoder.SRCS       += vpx_ports/msvc.h
+vp9_lossless_encoder.GUID        = B63C7C88-5348-46DC-A5A6-CC151EF93366
+vp9_lossless_encoder.DESCRIPTION = Simplified lossless VP9 encoder
+EXAMPLES-$(CONFIG_ENCODERS)     += twopass_encoder.c
+twopass_encoder.SRCS            += ivfenc.h ivfenc.c
+twopass_encoder.SRCS            += tools_common.h tools_common.c
+twopass_encoder.SRCS            += video_common.h
+twopass_encoder.SRCS            += video_writer.h video_writer.c
+twopass_encoder.SRCS            += vpx_ports/msvc.h
+twopass_encoder.GUID             = 73494FA6-4AF9-4763-8FBB-265C92402FD8
+twopass_encoder.DESCRIPTION      = Two-pass encoder loop
+EXAMPLES-$(CONFIG_DECODERS)     += decode_with_drops.c
+decode_with_drops.SRCS          += ivfdec.h ivfdec.c
+decode_with_drops.SRCS          += tools_common.h tools_common.c
+decode_with_drops.SRCS          += video_common.h
+decode_with_drops.SRCS          += video_reader.h video_reader.c
+decode_with_drops.SRCS          += vpx_ports/mem_ops.h
+decode_with_drops.SRCS          += vpx_ports/mem_ops_aligned.h
+decode_with_drops.SRCS          += vpx_ports/msvc.h
+decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
+decode_with_drops.DESCRIPTION    = Drops frames while decoding
+EXAMPLES-$(CONFIG_ENCODERS)        += set_maps.c
+set_maps.SRCS                      += ivfenc.h ivfenc.c
+set_maps.SRCS                      += tools_common.h tools_common.c
+set_maps.SRCS                      += video_common.h
+set_maps.SRCS                      += video_writer.h video_writer.c
+set_maps.SRCS                      += vpx_ports/msvc.h
+set_maps.GUID                       = ECB2D24D-98B8-4015-A465-A4AF3DCC145F
+set_maps.DESCRIPTION                = Set active and ROI maps
+EXAMPLES-$(CONFIG_VP8_ENCODER)     += vp8cx_set_ref.c
+vp8cx_set_ref.SRCS                 += ivfenc.h ivfenc.c
+vp8cx_set_ref.SRCS                 += tools_common.h tools_common.c
+vp8cx_set_ref.SRCS                 += video_common.h
+vp8cx_set_ref.SRCS                 += video_writer.h video_writer.c
+vp8cx_set_ref.SRCS                 += vpx_ports/msvc.h
+vp8cx_set_ref.GUID                  = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
+vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame
+
+
+ifeq ($(CONFIG_MULTI_RES_ENCODING),yes)
+ifeq ($(CONFIG_LIBYUV),yes)
+EXAMPLES-$(CONFIG_VP8_ENCODER)          += vp8_multi_resolution_encoder.c
+vp8_multi_resolution_encoder.SRCS       += ivfenc.h ivfenc.c
+vp8_multi_resolution_encoder.SRCS       += tools_common.h tools_common.c
+vp8_multi_resolution_encoder.SRCS       += video_writer.h video_writer.c
+vp8_multi_resolution_encoder.SRCS       += vpx_ports/msvc.h
+vp8_multi_resolution_encoder.SRCS       += $(LIBYUV_SRCS)
+vp8_multi_resolution_encoder.GUID        = 04f8738e-63c8-423b-90fa-7c2703a374de
+vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
+endif
+endif
+
+# Handle extra library flags depending on codec configuration
+#
+# Both codecs need the math library (libm). It must not be linked on RVCT
+# when building for bare-metal targets, so add it only when the target has
+# OS support or, failing that, when the toolchain is GCC.
+#
+# The two conditions add exactly the same libraries, so they are folded
+# into a single test: $(filter) yields a non-empty result as soon as either
+# CONFIG_OS_SUPPORT or CONFIG_GCC is "yes".
+ifneq ($(filter yes,$(CONFIG_OS_SUPPORT) $(CONFIG_GCC)),)
+CODEC_EXTRA_LIBS-$(CONFIG_VP8) += m
+CODEC_EXTRA_LIBS-$(CONFIG_VP9) += m
+endif
+#
+# End of specified files. The rest of the build rules should happen
+# automagically from here.
+#
+
+
+# Examples need different flags based on whether we're building from an
+# installed tree (HAVE_ALT_TREE_LAYOUT=yes: ../lib and ../include) or from a
+# version controlled tree (build prefix or ".", plus the enabled codec dirs).
+ifeq ($(HAVE_ALT_TREE_LAYOUT),yes)
+    LIB_PATH-yes := $(SRC_PATH_BARE)/../lib
+    INC_PATH-yes := $(SRC_PATH_BARE)/../include
+else
+    LIB_PATH-yes                     += $(if $(BUILD_PFX),$(BUILD_PFX),.)
+    INC_PATH-$(CONFIG_VP8_DECODER)   += $(SRC_PATH_BARE)/vp8
+    INC_PATH-$(CONFIG_VP8_ENCODER)   += $(SRC_PATH_BARE)/vp8
+    INC_PATH-$(CONFIG_VP9_DECODER)   += $(SRC_PATH_BARE)/vp9
+    INC_PATH-$(CONFIG_VP9_ENCODER)   += $(SRC_PATH_BARE)/vp9
+endif
+INC_PATH-$(CONFIG_LIBYUV) += $(SRC_PATH_BARE)/third_party/libyuv/include
+LIB_PATH := $(call enabled,LIB_PATH)
+INC_PATH := $(call enabled,INC_PATH)
+INTERNAL_CFLAGS = $(addprefix -I,$(INC_PATH))
+INTERNAL_LDFLAGS += $(addprefix -L,$(LIB_PATH))
+
+
+# Expand list of selected examples to build (as specified above); EXAMPLES entries gain an examples/ prefix.
+UTILS           = $(call enabled,UTILS)
+EXAMPLES        = $(addprefix examples/,$(call enabled,EXAMPLES))
+ALL_EXAMPLES    = $(UTILS) $(EXAMPLES)
+UTIL_SRCS       = $(foreach ex,$(UTILS),$($(ex:.c=).SRCS))
+ALL_SRCS        = $(foreach ex,$(ALL_EXAMPLES),$($(notdir $(ex:.c=)).SRCS))
+CODEC_EXTRA_LIBS=$(sort $(call enabled,CODEC_EXTRA_LIBS))
+
+# NOTE: UTIL_SRCS/ALL_SRCS use '=' (deferred), so they also see the .SRCS additions made by the eval below.
+# Expand all example sources into a variable containing all sources
+# for that example (not just the main one specified in UTILS/EXAMPLES)
+# and add this file to the list (for MSVS workspace generation)
+$(foreach ex,$(ALL_EXAMPLES),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) examples.mk))
+
+# NOT_MSVS expands to "yes" only when CONFIG_MSVS is empty; it selects the non-MSVS lists below.
+# Create build/install dependencies for all examples. The common case
+# is handled here. The MSVS case is handled below.
+NOT_MSVS = $(if $(CONFIG_MSVS),,yes)
+DIST-BINS-$(NOT_MSVS)      += $(addprefix bin/,$(ALL_EXAMPLES:.c=$(EXE_SFX)))
+INSTALL-BINS-$(NOT_MSVS)   += $(addprefix bin/,$(UTILS:.c=$(EXE_SFX)))
+DIST-SRCS-yes              += $(ALL_SRCS)
+INSTALL-SRCS-yes           += $(UTIL_SRCS)
+OBJS-$(NOT_MSVS)           += $(call objs,$(ALL_SRCS))
+BINS-$(NOT_MSVS)           += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=$(EXE_SFX)))
+
+# Instantiate linker template for all examples. Every binary depends on lib<CODEC_LIB><CODEC_LIB_SUF> in LIB_PATH:
+# CODEC_LIB is vpx_g with CONFIG_DEBUG_LIBS, else vpx; the suffix is .a unless CONFIG_SHARED picks one by TGT_OS.
+CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx)
+ifneq ($(filter darwin%,$(TGT_OS)),)
+SHARED_LIB_SUF=.dylib
+else
+ifneq ($(filter os2%,$(TGT_OS)),)
+SHARED_LIB_SUF=_dll.a
+else
+SHARED_LIB_SUF=.so
+endif
+endif
+CODEC_LIB_SUF=$(if $(CONFIG_SHARED),$(SHARED_LIB_SUF),.a)
+$(foreach bin,$(BINS-yes),\
+    $(eval $(bin):$(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF))\
+    $(eval $(call linker_template,$(bin),\
+        $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \
+        -l$(CODEC_LIB) $(addprefix -l,$(CODEC_EXTRA_LIBS))\
+        )))
+
+# The following pairs define a mapping of locations in the distribution
+# tree to locations in the source/build trees.
+INSTALL_MAPS += src/%.c   %.c
+INSTALL_MAPS += src/%     $(SRC_PATH_BARE)/%
+INSTALL_MAPS += bin/%     %
+INSTALL_MAPS += %         %
+
+# Under MSVS the library name encodes linkage/CRT: vpx (shared), vpxmt (static CRT) or vpxmd.
+# Set up additional MSVS environment
+ifeq ($(CONFIG_MSVS),yes)
+CODEC_LIB=$(if $(CONFIG_SHARED),vpx,$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd))
+# VS_PLATFORMS is the first path component of each */Release/<CODEC_LIB>.lib. Deferred expansion is
+# used intentionally, since the results of $(wildcard) may change during the course of the Make.
+VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d))))
+INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),bin/$(p)/%  $(p)/Release/%)
+endif
+# Build Visual Studio Projects. $(1) is the project file to generate; it
+# depends on the example's .SRCS and on the vpx project file. We use a
+# template to instantiate explicit rules rather than an implicit rule
+# because we want to leverage make's VPATH searching rather than specifying
+# the paths on each file in ALL_EXAMPLES. This has the unfortunate side
+# effect that touching the source files triggers a rebuild of the project
+# files even though there is no real dependency there (the dependency is on
+# the makefiles). We may want to revisit this.
+define vcproj_template
+$(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX)
+	$(if $(quiet),@echo "    [vcproj] $$@")
+	$(qexec)$$(GEN_VCPROJ)\
+            --exe\
+            --target=$$(TOOLCHAIN)\
+            --name=$$(@:.$(VCPROJ_SFX)=)\
+            --ver=$$(CONFIG_VS_VERSION)\
+            --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
+            --src-path-bare="$(SRC_PATH_BARE)" \
+            $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
+            --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
+            $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) $$^
+endef
+ALL_EXAMPLES_BASENAME := $(notdir $(ALL_EXAMPLES))
+PROJECTS-$(CONFIG_MSVS) += $(ALL_EXAMPLES_BASENAME:.c=.$(VCPROJ_SFX))
+INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\
+                               $(addprefix bin/$(p)/,$(ALL_EXAMPLES_BASENAME:.c=.exe)))
+$(foreach proj,$(call enabled,PROJECTS),\
+    $(eval $(call vcproj_template,$(proj))))
+
+# Documentation Rules
+# %.dox wraps one example source in a doxygen page, samples.dox is the index
+# page listing examples and utilities, and examples.doxy adds them as INPUT.
+%.dox: %.c
+	@echo "    [DOXY] $@"
+	@mkdir -p $(dir $@)
+	@echo "/*!\page example_$(@F:.dox=) $(@F:.dox=)" > $@
+	@echo "   \includelineno $(<F)" >> $@
+	@echo "*/" >> $@
+# Index page: one subpage entry (name plus DESCRIPTION) per example, then one per utility.
+samples.dox: examples.mk
+	@echo "    [DOXY] $@"
+	@echo "/*!\page samples Sample Code" > $@
+	@echo "    This SDK includes a number of sample applications."\
+	      "Each sample documents a feature of the SDK in both prose"\
+	      "and the associated C code."\
+	      "The following samples are included: ">>$@
+	@$(foreach ex,$(sort $(notdir $(EXAMPLES:.c=))),\
+	   echo "     - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;)
+	@echo >> $@
+	@echo "    In addition, the SDK contains a number of utilities."\
+              "Since these utilities are built upon the concepts described"\
+              "in the sample code listed above, they are not documented in"\
+              "pieces like the samples are. Their source is included here"\
+              "for reference. The following utilities are included:" >> $@
+	@$(foreach ex,$(sort $(UTILS:.c=)),\
+	   echo "     - \subpage example_$(ex) $($(ex).DESCRIPTION)" >> $@;)
+	@echo "*/" >> $@
+
+CLEAN-OBJS += examples.doxy samples.dox $(ALL_EXAMPLES:.c=.dox)
+DOCS-yes += examples.doxy samples.dox
+examples.doxy: samples.dox $(ALL_EXAMPLES:.c=.dox)
+	@echo "INPUT += $^" > $@
diff --git a/libs/libvpx/examples/decode_to_md5.c b/libs/libvpx/examples/decode_to_md5.c
new file mode 100644
index 0000000000..1ae7a4b57f
--- /dev/null
+++ b/libs/libvpx/examples/decode_to_md5.c
@@ -0,0 +1,137 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Frame-by-frame MD5 Checksum
+// ===========================
+//
+// This example builds upon the simple decoder loop to show how checksums
+// of the decoded output can be generated. These are used for validating
+// decoder implementations against the reference implementation, for example.
+//
+// MD5 algorithm
+// -------------
+// The Message-Digest 5 (MD5) is a well known hash function. We have provided
+// an implementation derived from the RSA Data Security, Inc. MD5 Message-Digest
+// Algorithm for your use. Our implementation only changes the interface of this
+// reference code. You must include the `md5_utils.h` header for access to these
+// functions.
+//
+// Processing The Decoded Data
+// ---------------------------
+// Each row of the image is passed to the MD5 accumulator. First the Y plane
+// is processed, then U, then V. It is important to honor the image's `stride`
+// values.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_decoder.h"
+
+#include "../md5_utils.h"
+#include "../tools_common.h"
+#include "../video_reader.h"
+#include "./vpx_config.h"
+
+static void get_image_md5(const vpx_image_t *img, unsigned char digest[16]) {
+  // Hash only the visible d_w x d_h area of Y, then U, then V, walking each
+  // plane row by row so that stride padding never enters the digest.
+  MD5Context ctx;
+  int p;
+
+  MD5Init(&ctx);
+  for (p = 0; p < 3; ++p) {
+    // Planes 1 and 2 are treated as half-size, rounded up.
+    const int shift = p ? 1 : 0;
+    const int row_bytes = (img->d_w + shift) >> shift;
+    int rows_left = (img->d_h + shift) >> shift;
+    const unsigned char *row = img->planes[p];
+
+    for (; rows_left > 0; --rows_left, row += img->stride[p])
+      MD5Update(&ctx, row, row_bytes);
+  }
+
+  MD5Final(digest, &ctx);
+}
+
+// Writes the 16 digest bytes to `stream` as 32 lowercase hex characters.
+static void print_md5(FILE *stream, unsigned char digest[16]) {
+  const unsigned char *byte = digest;
+  const unsigned char *const end = digest + 16;
+  while (byte != end) fprintf(stream, "%02x", *byte++);
+}
+
+static const char *exec_name;
+// Prints the usage line (with the program name) to stderr, then exits with EXIT_FAILURE.
+void usage_exit(void) {
+  fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
+  exit(EXIT_FAILURE);
+}
+// Decodes <infile> and writes one MD5 line per decoded image to <outfile>.
+int main(int argc, char **argv) {
+  int frame_cnt = 0;
+  FILE *outfile = NULL;
+  vpx_codec_ctx_t codec;
+  VpxVideoReader *reader = NULL;
+  const VpxVideoInfo *info = NULL;
+  const VpxInterface *decoder = NULL;
+
+  exec_name = argv[0];
+
+  if (argc != 3)
+    die("Invalid number of arguments.");
+
+  reader = vpx_video_reader_open(argv[1]);
+  if (!reader)
+    die("Failed to open %s for reading.", argv[1]);
+
+  if (!(outfile = fopen(argv[2], "wb")))
+    die("Failed to open %s for writing.", argv[2]);
+  // The reader's stream info carries the fourcc used to pick the decoder.
+  info = vpx_video_reader_get_info(reader);
+
+  decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc);
+  if (!decoder)
+    die("Unknown input codec.");
+
+  printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface()));
+
+  if (vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+    die_codec(&codec, "Failed to initialize decoder");
+  // Decode frame by frame; every image the decoder returns gets one digest line.
+  while (vpx_video_reader_read_frame(reader)) {
+    vpx_codec_iter_t iter = NULL;
+    vpx_image_t *img = NULL;
+    size_t frame_size = 0;
+    const unsigned char *frame = vpx_video_reader_get_frame(reader,
+                                                            &frame_size);
+    if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
+      die_codec(&codec, "Failed to decode frame");
+
+    while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) {
+      unsigned char digest[16];
+      // Line format: <md5 hex>  img-<width>x<height>-<counter>.i420
+      get_image_md5(img, digest);
+      print_md5(outfile, digest);
+      fprintf(outfile, "  img-%dx%d-%04d.i420\n",
+              img->d_w, img->d_h, ++frame_cnt);
+    }
+  }
+
+  printf("Processed %d frames.\n", frame_cnt);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  vpx_video_reader_close(reader);
+
+  fclose(outfile);
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libvpx/examples/decode_with_drops.c b/libs/libvpx/examples/decode_with_drops.c
new file mode 100644
index 0000000000..2233e473d3
--- /dev/null
+++ b/libs/libvpx/examples/decode_with_drops.c
@@ -0,0 +1,152 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Decode With Drops Example
+// =========================
+//
+// This is an example utility which drops a series of frames, as specified
+// on the command line. This is useful for observing the error recovery
+// features of the codec.
+//
+// Usage
+// -----
+// This example adds a single argument to the `simple_decoder` example,
+// which specifies the range or pattern of frames to drop. The parameter is
+// parsed as follows:
+//
+// Dropping A Range Of Frames
+// --------------------------
+// To drop a range of frames, specify the starting frame and the ending
+// frame to drop, separated by a dash. The following command will drop
+// frames 5 through 10 (base 1).
+//
+//  $ ./decode_with_drops in.ivf out.i420 5-10
+//
+//
+// Dropping A Pattern Of Frames
+// ----------------------------
+// To drop a pattern of frames, specify the number of frames to drop and
+// the number of frames after which to repeat the pattern, separated by
+// a forward-slash. The following command will drop 3 of 7 frames.
+// Specifically, it will decode 4 frames, then drop 3 frames, and then
+// repeat.
+//
+//  $ ./decode_with_drops in.ivf out.i420 3/7
+//
+//
+// Extra Variables
+// ---------------
+// This example maintains the pattern passed on the command line in the
+// `n`, `m`, and `is_range` variables.
+//
+//
+// Making The Drop Decision
+// ------------------------
+// The example decides whether to drop the frame based on the current
+// frame number, immediately before decoding the frame.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_decoder.h"
+
+#include "../tools_common.h"
+#include "../video_reader.h"
+#include "./vpx_config.h"
+
+static const char *exec_name;
+// Prints the usage line (with the program name) to stderr, then exits with EXIT_FAILURE.
+void usage_exit(void) {
+  fprintf(stderr, "Usage: %s <infile> <outfile> <N-M|N/M>\n", exec_name);
+  exit(EXIT_FAILURE);
+}
+
+// Decodes <infile> to <outfile>, dropping the frames selected by the third
+// argument: "N-M" drops frames N..M (1-based), "N/M" drops N of every M.
+int main(int argc, char **argv) {
+  int frame_cnt = 0;
+  FILE *outfile = NULL;
+  vpx_codec_ctx_t codec;
+  const VpxInterface *decoder = NULL;
+  VpxVideoReader *reader = NULL;
+  const VpxVideoInfo *info = NULL;
+  int n = 0;
+  int m = 0;
+  int is_range = 0;
+  char *nptr = NULL;
+
+  exec_name = argv[0];
+
+  if (argc != 4)
+    die("Invalid number of arguments.");
+
+  reader = vpx_video_reader_open(argv[1]);
+  if (!reader)
+    die("Failed to open %s for reading.", argv[1]);
+
+  if (!(outfile = fopen(argv[2], "wb")))
+    die("Failed to open %s for writing.", argv[2]);
+
+  // Only parse M once a valid separator has been seen; otherwise nptr may
+  // already point at the terminating NUL and nptr + 1 is past the string.
+  n = (int)strtol(argv[3], &nptr, 0);
+  is_range = (*nptr == '-');
+  if (is_range || *nptr == '/')
+    m = (int)strtol(nptr + 1, NULL, 0);
+  if (!n || !m)
+    die("Couldn't parse pattern %s.\n", argv[3]);
+
+  info = vpx_video_reader_get_info(reader);
+  decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc);
+  if (!decoder)
+    die("Unknown input codec.");
+
+  printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface()));
+  if (vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+    die_codec(&codec, "Failed to initialize decoder.");
+
+  while (vpx_video_reader_read_frame(reader)) {
+    vpx_codec_iter_t iter = NULL;
+    vpx_image_t *img = NULL;
+    size_t frame_size = 0;
+    int skip;
+    const unsigned char *frame = vpx_video_reader_get_frame(reader,
+                                                            &frame_size);
+    ++frame_cnt;
+
+    // Decide before decoding: a dropped frame must never reach the decoder.
+    skip = (is_range && frame_cnt >= n && frame_cnt <= m) ||
+           (!is_range && m - (frame_cnt - 1) % m <= n);
+
+    if (!skip) {
+      putc('.', stdout);
+      if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
+        die_codec(&codec, "Failed to decode frame.");
+      while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL)
+        vpx_img_write(img, outfile);
+    } else {
+      putc('X', stdout);
+    }
+    fflush(stdout);
+  }
+
+  printf("Processed %d frames.\n", frame_cnt);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n",
+         info->frame_width, info->frame_height, argv[2]);
+  vpx_video_reader_close(reader);
+  fclose(outfile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libvpx/examples/postproc.c b/libs/libvpx/examples/postproc.c
new file mode 100644
index 0000000000..a8ac208d9b
--- /dev/null
+++ b/libs/libvpx/examples/postproc.c
@@ -0,0 +1,138 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Postprocessing Decoder
+// ======================
+//
+// This example adds postprocessing to the simple decoder loop.
+//
+// Initializing Postprocessing
+// ---------------------------
+// You must inform the codec that you might request postprocessing at
+// initialization time. This is done by passing the VPX_CODEC_USE_POSTPROC
+// flag to `vpx_codec_dec_init`. If the codec does not support
+// postprocessing, this call will return VPX_CODEC_INCAPABLE. This example
+// treats that result as fatal: it reports the error and exits rather than
+// falling back to default initialization.
+//
+// Using Adaptive Postprocessing
+// -----------------------------
+// VP6 provides "adaptive postprocessing." It will automatically select the
+// best postprocessing filter on a frame by frame basis based on the amount
+// of time remaining before the user's specified deadline expires. The
+// special value 0 indicates that the codec should take as long as
+// necessary to provide the best quality frame. This example gives the
+// codec 15ms (15000us) to return a frame. Remember that this is a soft
+// deadline, and the codec may exceed it doing its regular processing. In
+// these cases, no additional postprocessing will be done.
+//
+// Codec Specific Postprocessing Controls
+// --------------------------------------
+// Some codecs provide fine grained controls over their built-in
+// postprocessors. VP8 is one example. The following sample code toggles
+// postprocessing on and off every 15 frames.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_decoder.h"
+
+#include "../tools_common.h"
+#include "../video_reader.h"
+#include "./vpx_config.h"
+
+static const char *exec_name;
+// Prints the usage line (with the program name) to stderr, then exits with EXIT_FAILURE.
+void usage_exit(void) {
+  fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
+  exit(EXIT_FAILURE);
+}
+
+// Decodes <infile> to raw frames in <outfile> with postprocessing enabled
+// at init time, toggling the VP8 postprocessor while the stream plays.
+int main(int argc, char **argv) {
+  int frame_cnt = 0;
+  FILE *outfile = NULL;
+  vpx_codec_ctx_t codec;
+  vpx_codec_err_t res;
+  VpxVideoReader *reader = NULL;
+  const VpxInterface *decoder = NULL;
+  const VpxVideoInfo *info = NULL;
+
+  exec_name = argv[0];
+
+  if (argc != 3)
+    die("Invalid number of arguments.");
+
+  reader = vpx_video_reader_open(argv[1]);
+  if (!reader)
+    die("Failed to open %s for reading.", argv[1]);
+
+  if (!(outfile = fopen(argv[2], "wb")))
+    die("Failed to open %s for writing", argv[2]);
+
+  info = vpx_video_reader_get_info(reader);
+
+  decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc);
+  if (!decoder)
+    die("Unknown input codec.");
+
+  printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface()));
+
+  res = vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL,
+                           VPX_CODEC_USE_POSTPROC);
+  if (res == VPX_CODEC_INCAPABLE)
+    die_codec(&codec, "Postproc not supported by this decoder.");
+  if (res)
+    die_codec(&codec, "Failed to initialize decoder.");
+
+  while (vpx_video_reader_read_frame(reader)) {
+    vpx_codec_iter_t iter = NULL;
+    vpx_image_t *img = NULL;
+    size_t frame_size = 0;
+    const unsigned char *frame = vpx_video_reader_get_frame(reader,
+                                                            &frame_size);
+    ++frame_cnt;
+
+    // Toggle every 15 frames: off at frames 1, 31, ...; on at 16, 46, ...
+    if (frame_cnt % 30 == 1) {
+      vp8_postproc_cfg_t pp = {0, 0, 0};
+
+      if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
+        die_codec(&codec, "Failed to turn off postproc.");
+    } else if (frame_cnt % 30 == 16) {
+      vp8_postproc_cfg_t pp = {VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE,
+                               4, 0};
+      if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
+        die_codec(&codec, "Failed to turn on postproc.");
+    }
+
+    // Decode the frame with 15ms deadline
+    if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 15000))
+      die_codec(&codec, "Failed to decode frame");
+
+    while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL) {
+      vpx_img_write(img, outfile);
+    }
+  }
+
+  printf("Processed %d frames.\n", frame_cnt);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec");
+
+  printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n",
+         info->frame_width, info->frame_height, argv[2]);
+
+  vpx_video_reader_close(reader);
+  fclose(outfile);
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libvpx/examples/resize_util.c b/libs/libvpx/examples/resize_util.c
new file mode 100644
index 0000000000..e6fdd5bb2a
--- /dev/null
+++ b/libs/libvpx/examples/resize_util.c
@@ -0,0 +1,130 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../tools_common.h"
+#include "../vp9/encoder/vp9_resize.h"
+
+static const char *exec_name = NULL;
+
+// Prints the command-line synopsis for this tool to stdout.
+static void usage() {
+  fputs("Usage:\n", stdout);
+  printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> "
+         "<output_yuv> [<frames>]\n", exec_name);
+}
+
+void usage_exit(void) {
+  usage();             // show the synopsis before bailing out
+  exit(EXIT_FAILURE);  // terminate with a failure status; never returns
+}
+
+// Parses "<W>x<H>" ('x' or 'X' separator) into *width and *height; returns
+// 1 when both parsed values are positive and 0 otherwise.
+static int parse_dim(char *v, int *width, int *height) {
+  char *sep = strchr(v, 'x');
+  if (!sep) {
+    sep = strchr(v, 'X');
+    if (!sep)
+      return 0;
+  }
+  *width = atoi(v);  // atoi stops at the first non-digit character
+  *height = atoi(sep + 1);
+  return (*width > 0 && *height > 0) ? 1 : 0;
+}
+
+// Resizes raw 4:2:0 frames from <input_yuv> to the target size and writes them
+// to <output_yuv>. Returns 0 on success, 1 on bad arguments or I/O failure.
+int main(int argc, char *argv[]) {
+  char *fin, *fout;
+  FILE *fpin, *fpout;
+  uint8_t *inbuf, *outbuf, *inbuf_u, *inbuf_v, *outbuf_u, *outbuf_v;
+  size_t in_size, out_size;
+  int f = 0, frames, status = 0;
+  int width, height, target_width, target_height;
+
+  exec_name = argv[0];
+  if (argc < 5) {
+    printf("Incorrect parameters:\n");
+    usage();
+    return 1;
+  }
+  fin = argv[1];
+  fout = argv[4];
+  if (!parse_dim(argv[2], &width, &height)) {
+    printf("Incorrect parameters: %s\n", argv[2]);
+    usage();
+    return 1;
+  }
+  if (!parse_dim(argv[3], &target_width, &target_height)) {
+    printf("Incorrect parameters: %s\n", argv[3]);
+    usage();
+    return 1;
+  }
+
+  fpin = fopen(fin, "rb");
+  if (fpin == NULL) {
+    printf("Can't open file %s to read\n", fin);
+    usage();
+    return 1;
+  }
+  fpout = fopen(fout, "wb");
+  if (fpout == NULL) {
+    printf("Can't open file %s to write\n", fout);
+    usage();
+    fclose(fpin);  // do not leak the input handle opened above
+    return 1;
+  }
+  frames = (argc >= 6) ? atoi(argv[5]) : INT_MAX;  // optional frame cap
+  printf("Input size:  %dx%d\n", width, height);
+  printf("Target size: %dx%d, Frames: ", target_width, target_height);
+  if (frames == INT_MAX)
+    printf("All\n");
+  else
+    printf("%d\n", frames);
+  in_size = (size_t)width * height * 3 / 2;  // Y + U + V; size_t avoids overflow
+  out_size = (size_t)target_width * target_height * 3 / 2;
+  inbuf = (uint8_t*)malloc(in_size);
+  outbuf = (uint8_t*)malloc(out_size);
+  if (inbuf == NULL || outbuf == NULL) {
+    printf("Failed to allocate frame buffers\n");
+    status = 1;
+    goto cleanup;
+  }
+  inbuf_u = inbuf + (size_t)width * height;
+  inbuf_v = inbuf_u + (size_t)width * height / 4;
+  outbuf_u = outbuf + (size_t)target_width * target_height;
+  outbuf_v = outbuf_u + (size_t)target_width * target_height / 4;
+  for (; f < frames; ++f) {
+    if (fread(inbuf, in_size, 1, fpin) != 1)
+      break;
+    vp9_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, height,
+                        width, outbuf, target_width, outbuf_u, outbuf_v,
+                        target_width / 2, target_height, target_width);
+    if (fwrite(outbuf, out_size, 1, fpout) != 1) {
+      printf("Failed to write frame %d to %s\n", f, fout);
+      status = 1;  // a short write must not be reported as success
+      break;
+    }
+  }
+  printf("%d frames processed\n", f);
+cleanup:
+  fclose(fpin);
+  fclose(fpout);
+  free(inbuf);
+  free(outbuf);
+  return status;
+}
diff --git a/libs/libvpx/examples/set_maps.c b/libs/libvpx/examples/set_maps.c
new file mode 100644
index 0000000000..1dc3ac0c98
--- /dev/null
+++ b/libs/libvpx/examples/set_maps.c
@@ -0,0 +1,255 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+// VP8 Set Active and ROI Maps
+// ===========================
+//
+// This is an example demonstrating how to control the VP8 encoder's
+// ROI and Active maps.
+//
+// ROI (Region of Interest) maps are a way for the application to assign
+// each macroblock in the image to a region, and then set quantizer and
+// filtering parameters on that image.
+//
+// Active maps are a way for the application to specify on a
+// macroblock-by-macroblock basis whether there is any activity in that
+// macroblock.
+//
+//
+// Configuration
+// -------------
+// An ROI map is set on frame 22. If the width of the image in macroblocks
+// is evenly divisible by 4, then the output will appear to have distinct
+// columns, where the quantizer, loopfilter, and static threshold differ
+// from column to column.
+//
+// An active map is set on frame 33. If the width of the image in macroblocks
+// is evenly divisible by 4, then the output will appear to have distinct
+// columns, where one column will have motion and the next will not.
+//
+// The active map is cleared on frame 44.
+//
+// Observing The Effects
+// ---------------------
+// Use the `simple_decoder` example to decode this sample, and observe
+// the change in the image at frames 22, 33, and 44.
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+
+#include "../tools_common.h"
+#include "../video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
+          exec_name);  // exec_name is argv[0], recorded by main()
+  exit(EXIT_FAILURE);  // never returns
+}
+
+// Splits the frame's grid of 16x16 blocks into four ROI segments (map index
+// modulo 4) and installs it via VP8E_SET_ROI_MAP; failures go to die()/die_codec().
+static void set_roi_map(const vpx_codec_enc_cfg_t *cfg,
+                        vpx_codec_ctx_t *codec) {
+  unsigned int i;
+  vpx_roi_map_t roi;
+  memset(&roi, 0, sizeof(roi));
+
+  roi.rows = (cfg->g_h + 15) / 16;  // round up to whole 16-pixel blocks
+  roi.cols = (cfg->g_w + 15) / 16;
+  roi.delta_q[0] = 0;
+  roi.delta_q[1] = -2;
+  roi.delta_q[2] = -4;
+  roi.delta_q[3] = -6;
+  roi.delta_lf[0] = 0;
+  roi.delta_lf[1] = 1;
+  roi.delta_lf[2] = 2;
+  roi.delta_lf[3] = 3;
+  roi.static_threshold[0] = 1500;
+  roi.static_threshold[1] = 1000;
+  roi.static_threshold[2] = 500;
+  roi.static_threshold[3] = 0;
+
+  roi.roi_map = (uint8_t *)malloc(roi.rows * roi.cols);
+  if (roi.roi_map == NULL)  // never write through a failed allocation
+    die("Failed to allocate ROI map");
+  for (i = 0; i < roi.rows * roi.cols; ++i)
+    roi.roi_map[i] = i % 4;
+
+  if (vpx_codec_control(codec, VP8E_SET_ROI_MAP, &roi))
+    die_codec(codec, "Failed to set ROI map");
+  free(roi.roi_map);
+}
+
+// Installs an active map whose entries alternate 0/1 by index; dies on failure.
+static void set_active_map(const vpx_codec_enc_cfg_t *cfg,
+                           vpx_codec_ctx_t *codec) {
+  unsigned int i;
+  vpx_active_map_t map = {0, 0, 0};
+
+  map.rows = (cfg->g_h + 15) / 16;  // round up to whole 16-pixel blocks
+  map.cols = (cfg->g_w + 15) / 16;
+  map.active_map = (uint8_t *)malloc(map.rows * map.cols);
+  if (map.active_map == NULL)  // never write through a failed allocation
+    die("Failed to allocate active map");
+  for (i = 0; i < map.rows * map.cols; ++i)
+    map.active_map[i] = i % 2;
+  if (vpx_codec_control(codec, VP8E_SET_ACTIVEMAP, &map))
+    die_codec(codec, "Failed to set active map");
+  free(map.active_map);
+}
+
+static void unset_active_map(const vpx_codec_enc_cfg_t *cfg,
+                             vpx_codec_ctx_t *codec) {  // sends a NULL map
+  vpx_active_map_t map = {0, 0, 0};
+
+  map.rows = (cfg->g_h + 15) / 16;  // cfg->g_h in units of 16, rounded up
+  map.cols = (cfg->g_w + 15) / 16;  // cfg->g_w in units of 16, rounded up
+  map.active_map = NULL;  // NOTE(review): presumably NULL clears the map - confirm
+
+  if (vpx_codec_control(codec, VP8E_SET_ACTIVEMAP, &map))
+    die_codec(codec, "Failed to set active map");
+}
+
+// Encodes one frame (img == NULL flushes the encoder) and hands every
+// compressed-frame packet to writer. Returns 1 if any packet came back, else 0.
+static int encode_frame(vpx_codec_ctx_t *codec,
+                        vpx_image_t *img,
+                        int frame_index,
+                        VpxVideoWriter *writer) {
+  const vpx_codec_cx_pkt_t *packet;
+  vpx_codec_iter_t cursor = NULL;
+  int produced = 0;
+
+  if (vpx_codec_encode(codec, img, frame_index, 1, 0,
+                       VPX_DL_GOOD_QUALITY) != VPX_CODEC_OK)
+    die_codec(codec, "Failed to encode frame");
+
+  for (packet = vpx_codec_get_cx_data(codec, &cursor); packet != NULL;
+       packet = vpx_codec_get_cx_data(codec, &cursor)) {
+    produced = 1;
+    if (packet->kind != VPX_CODEC_CX_FRAME_PKT)
+      continue;  // other packet kinds are counted but not written
+    if (!vpx_video_writer_write_frame(writer, packet->data.frame.buf,
+                                      packet->data.frame.sz,
+                                      packet->data.frame.pts))
+      die_codec(codec, "Failed to write compressed frame");
+
+    // Progress marker: 'K' for a keyframe, '.' for any other frame.
+    printf((packet->data.frame.flags & VPX_FRAME_IS_KEY) ? "K" : ".");
+    fflush(stdout);
+  }
+
+  return produced;
+}
+
+// Encodes raw I420 <infile> into IVF <outfile>, setting an ROI map at frame 22
+// (VP8 only), an active map at frame 33, and clearing the latter at frame 44.
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  vpx_codec_ctx_t codec;
+  vpx_codec_enc_cfg_t cfg;
+  int frame_count = 0;
+  vpx_image_t raw;
+  vpx_codec_err_t res;
+  VpxVideoInfo info;
+  VpxVideoWriter *writer = NULL;
+  const VpxInterface *encoder = NULL;
+  const int fps = 2;        // TODO(dkovalev) add command line argument
+  const double bits_per_pixel_per_frame = 0.067;
+
+  exec_name = argv[0];
+  if (argc != 6)
+    die("Invalid number of arguments");
+
+  memset(&info, 0, sizeof(info));
+
+  encoder = get_vpx_encoder_by_name(argv[1]);
+  if (encoder == NULL)
+    die("Unsupported codec.");
+  assert(encoder != NULL);
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = strtol(argv[2], NULL, 0);
+  info.frame_height = strtol(argv[3], NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 ||
+      info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 ||
+      (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
+                     info.frame_height, 1))
+    die("Failed to allocate image.");
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+  res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res)  // codec is still uninitialized here, so die_codec() is off limits
+    die("Failed to get default codec config: %s", vpx_codec_err_to_string(res));
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.rc_target_bitrate = (unsigned int)(bits_per_pixel_per_frame * cfg.g_w *
+                                         cfg.g_h * fps / 1000);
+  cfg.g_lag_in_frames = 0;
+
+  writer = vpx_video_writer_open(argv[5], kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing.", argv[5]);
+
+  if (!(infile = fopen(argv[4], "rb")))
+    die("Failed to open %s for reading.", argv[4]);
+
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  // Encode frames.
+  while (vpx_img_read(&raw, infile)) {
+    ++frame_count;
+
+    if (frame_count == 22 && encoder->fourcc == VP8_FOURCC) {
+      set_roi_map(&cfg, &codec);
+    } else if (frame_count == 33) {
+      set_active_map(&cfg, &codec);
+    } else if (frame_count == 44) {
+      unset_active_map(&cfg, &codec);
+    }
+
+    encode_frame(&codec, &raw, frame_count, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, writer)) {}
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_count);
+
+  vpx_img_free(&raw);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  vpx_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libvpx/examples/simple_decoder.c b/libs/libvpx/examples/simple_decoder.c
new file mode 100644
index 0000000000..8ccc81035e
--- /dev/null
+++ b/libs/libvpx/examples/simple_decoder.c
@@ -0,0 +1,154 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+// Simple Decoder
+// ==============
+//
+// This is an example of a simple decoder loop. It takes an input file
+// containing the compressed data (in IVF format), passes it through the
+// decoder, and writes the decompressed frames to disk. Other decoder
+// examples build upon this one.
+//
+// The details of the IVF format have been elided from this example for
+// simplicity of presentation, as IVF files will not generally be used by
+// your application. In general, an IVF file consists of a file header,
+// followed by a variable number of frames. Each frame consists of a frame
+// header followed by a variable length payload. The length of the payload
+// is specified in the first four bytes of the frame header. The payload is
+// the raw compressed data.
+//
+// Standard Includes
+// -----------------
+// For decoders, you only have to include `vpx_decoder.h` and then any
+// header files for the specific codecs you use. In this case, we're using
+// vp8.
+//
+// Initializing The Codec
+// ----------------------
+// The libvpx decoder is initialized by the call to vpx_codec_dec_init().
+// Determining the codec interface to use is handled by VpxVideoReader and the
+// functions prefixed with vpx_video_reader_. Discussion of those functions is
+// beyond the scope of this example, but the main gist is to open the input file
+// and parse just enough of it to determine if it's a VPx file and which VPx
+// codec is contained within the file.
+// Note the NULL pointer passed to vpx_codec_dec_init(). We do that in this
+// example because we want the algorithm to determine the stream configuration
+// (width/height) and allocate memory automatically.
+//
+// Decoding A Frame
+// ----------------
+// Once the frame has been read into memory, it is decoded using the
+// `vpx_codec_decode` function. The call takes a pointer to the data
+// (`frame`) and the length of the data (`frame_size`). No application data
+// is associated with the frame in this example, so the `user_priv`
+// parameter is NULL. The `deadline` parameter is left at zero for this
+// example. This parameter is generally only used when doing adaptive post
+// processing.
+//
+// Codecs may produce a variable number of output frames for every call to
+// `vpx_codec_decode`. These frames are retrieved by the
+// `vpx_codec_get_frame` iterator function. The iterator variable `iter` is
+// initialized to NULL each time `vpx_codec_decode` is called.
+// `vpx_codec_get_frame` is called in a loop, returning a pointer to a
+// decoded image or NULL to indicate the end of list.
+//
+// Processing The Decoded Data
+// ---------------------------
+// In this example, we simply write the decoded data to disk. It is
+// important to honor the image's `stride` values.
+//
+// Cleanup
+// -------
+// The `vpx_codec_destroy` call frees any memory allocated by the codec.
+//
+// Error Handling
+// --------------
+// This example does not special case any error return codes. If there was
+// an error, a descriptive message is printed and the program exits. With
+// few exceptions, vpx_codec functions return an enumerated error status,
+// with the value `0` indicating success.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vpx_decoder.h"
+
+#include "../tools_common.h"
+#include "../video_reader.h"
+#include "./vpx_config.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);  // argv[0]
+  exit(EXIT_FAILURE);  // never returns
+}
+
+// Decodes the file named by argv[1] and writes every decoded image to argv[2];
+// failures are reported through die() / die_codec().
+int main(int argc, char **argv) {
+  vpx_codec_ctx_t codec;
+  const VpxVideoInfo *info = NULL;
+  const VpxInterface *decoder = NULL;
+  VpxVideoReader *reader = NULL;
+  FILE *outfile = NULL;
+  int frame_cnt = 0;
+
+  exec_name = argv[0];
+  if (argc != 3)
+    die("Invalid number of arguments.");
+
+  reader = vpx_video_reader_open(argv[1]);
+  if (reader == NULL)
+    die("Failed to open %s for reading.", argv[1]);
+
+  outfile = fopen(argv[2], "wb");
+  if (outfile == NULL)
+    die("Failed to open %s for writing.", argv[2]);
+
+  info = vpx_video_reader_get_info(reader);
+  decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc);
+  if (decoder == NULL)
+    die("Unknown input codec.");
+
+  printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface()));
+  if (vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+    die_codec(&codec, "Failed to initialize decoder.");
+
+  while (vpx_video_reader_read_frame(reader)) {
+    size_t frame_size = 0;
+    const unsigned char *const frame =
+        vpx_video_reader_get_frame(reader, &frame_size);
+    vpx_codec_iter_t iter = NULL;
+    vpx_image_t *img;
+
+    if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
+      die_codec(&codec, "Failed to decode frame.");
+
+    // One compressed frame may yield several output images; drain them all.
+    for (img = vpx_codec_get_frame(&codec, &iter); img != NULL;
+         img = vpx_codec_get_frame(&codec, &iter)) {
+      vpx_img_write(img, outfile);
+      ++frame_cnt;
+    }
+  }
+
+  printf("Processed %d frames.\n", frame_cnt);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec");
+
+  printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n",
+         info->frame_width, info->frame_height, argv[2]);
+  vpx_video_reader_close(reader);
+  fclose(outfile);
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libvpx/examples/simple_encoder.c b/libs/libvpx/examples/simple_encoder.c
new file mode 100644
index 0000000000..a307729731
--- /dev/null
+++ b/libs/libvpx/examples/simple_encoder.c
@@ -0,0 +1,256 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Simple Encoder
+// ==============
+//
+// This is an example of a simple encoder loop. It takes an input file in
+// YV12 format, passes it through the encoder, and writes the compressed
+// frames to disk in IVF format. Other encoder examples build upon this
+// one.
+//
+// The details of the IVF format have been elided from this example for
+// simplicity of presentation, as IVF files will not generally be used by
+// your application. In general, an IVF file consists of a file header,
+// followed by a variable number of frames. Each frame consists of a frame
+// header followed by a variable length payload. The length of the payload
+// is specified in the first four bytes of the frame header. The payload is
+// the raw compressed data.
+//
+// Standard Includes
+// -----------------
+// For encoders, you only have to include `vpx_encoder.h` and then any
+// header files for the specific codecs you use. In this case, we're using
+// vp8.
+//
+// Getting The Default Configuration
+// ---------------------------------
+// Encoders have the notion of "usage profiles." For example, an encoder
+// may want to publish default configurations for both a video
+// conferencing application and a best quality offline encoder. These
+// obviously have very different default settings. Consult the
+// documentation for your codec to see if it provides any default
+// configurations. All codecs provide a default configuration, number 0,
+// which is valid for material in the vicinity of QCIF/QVGA.
+//
+// Updating The Configuration
+// ---------------------------------
+// Almost all applications will want to update the default configuration
+// with settings specific to their usage. Here we set the width and height
+// of the video file to that specified on the command line. We also scale
+// the default bitrate based on the ratio between the default resolution
+// and the resolution specified on the command line.
+//
+// Initializing The Codec
+// ----------------------
+// The encoder is initialized by the following code.
+//
+// Encoding A Frame
+// ----------------
+// The frame is read as a continuous block (size width * height * 3 / 2)
+// from the input file. If a frame was read (the input file has not hit
+// EOF) then the frame is passed to the encoder. Otherwise, a NULL
+// is passed, indicating the End-Of-Stream condition to the encoder. The
+// `frame_count` is reused as the presentation time stamp (PTS) and each
+// frame is shown for one frame-time in duration. The flags parameter is
+// used only to force keyframes (see below). The deadline passed to
+// `vpx_codec_encode` is VPX_DL_GOOD_QUALITY.
+
+// Forced Keyframes
+// ----------------
+// Keyframes can be forced by setting the VPX_EFLAG_FORCE_KF bit of the
+// flags passed to `vpx_codec_encode()`. In this example, we force a
+// keyframe every <keyframe-interval> frames. Note, the output stream can
+// contain additional keyframes beyond those that have been forced using the
+// VPX_EFLAG_FORCE_KF flag because of automatic keyframe placement by the
+// encoder.
+//
+// Processing The Encoded Data
+// ---------------------------
+// Each packet of type `VPX_CODEC_CX_FRAME_PKT` contains the encoded data
+// for this frame. We write a IVF frame header, followed by the raw data.
+//
+// Cleanup
+// -------
+// The `vpx_codec_destroy` call frees any memory allocated by the codec.
+//
+// Error Handling
+// --------------
+// This example does not special case any error return codes. If there was
+// an error, a descriptive message is printed and the program exits. With
+// few exceptions, vpx_codec functions return an enumerated error status,
+// with the value `0` indicating success.
+//
+// Error Resiliency Features
+// -------------------------
+// Error resiliency is controlled by the g_error_resilient member of the
+// configuration structure. Use the `decode_with_drops` example to decode with
+// frames 5-10 dropped. Compare the output for a file encoded with this example
+// versus one encoded with the `simple_encoder` example.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vpx_encoder.h"
+
+#include "../tools_common.h"
+#include "../video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s <codec> <width> <height> <infile> <outfile> "
+              "<keyframe-interval> [<error-resilient>]\nSee comments in "
+              "simple_encoder.c for more information.\n",
+          exec_name);  // exec_name is argv[0], recorded by main()
+  exit(EXIT_FAILURE);  // never returns
+}
+
+// Feeds img (or NULL to flush) to the encoder with the given flags and writes
+// each frame packet to writer. Returns 1 if any packet was produced, else 0.
+static int encode_frame(vpx_codec_ctx_t *codec,
+                        vpx_image_t *img,
+                        int frame_index,
+                        int flags,
+                        VpxVideoWriter *writer) {
+  const vpx_codec_cx_pkt_t *packet;
+  vpx_codec_iter_t cursor = NULL;
+  int produced = 0;
+
+  if (vpx_codec_encode(codec, img, frame_index, 1, flags,
+                       VPX_DL_GOOD_QUALITY) != VPX_CODEC_OK)
+    die_codec(codec, "Failed to encode frame");
+
+  for (packet = vpx_codec_get_cx_data(codec, &cursor); packet != NULL;
+       packet = vpx_codec_get_cx_data(codec, &cursor)) {
+    produced = 1;
+    if (packet->kind != VPX_CODEC_CX_FRAME_PKT)
+      continue;  // other packet kinds are counted but not written
+    if (!vpx_video_writer_write_frame(writer, packet->data.frame.buf,
+                                      packet->data.frame.sz,
+                                      packet->data.frame.pts))
+      die_codec(codec, "Failed to write compressed frame");
+    // Progress marker: 'K' for a keyframe, '.' for any other frame.
+    printf((packet->data.frame.flags & VPX_FRAME_IS_KEY) ? "K" : ".");
+    fflush(stdout);
+  }
+
+  return produced;
+}
+
+// Encodes raw I420 <infile> into IVF <outfile> with the named codec, forcing
+// a keyframe every <keyframe-interval> frames when that interval is positive.
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  vpx_codec_ctx_t codec;
+  vpx_codec_enc_cfg_t cfg;
+  int frame_count = 0;
+  vpx_image_t raw;
+  vpx_codec_err_t res;
+  VpxVideoInfo info = {0};
+  VpxVideoWriter *writer = NULL;
+  const VpxInterface *encoder = NULL;
+  const int fps = 30;        // TODO(dkovalev) add command line argument
+  const int bitrate = 200;   // kbit/s TODO(dkovalev) add command line argument
+  int keyframe_interval = 0;
+
+  // TODO(dkovalev): Add some simple command line parsing code to make the
+  // command line more flexible.
+  const char *codec_arg = NULL;
+  const char *width_arg = NULL;
+  const char *height_arg = NULL;
+  const char *infile_arg = NULL;
+  const char *outfile_arg = NULL;
+  const char *keyframe_interval_arg = NULL;
+
+  exec_name = argv[0];
+
+  if (argc < 7)
+    die("Invalid number of arguments");
+
+  codec_arg = argv[1];
+  width_arg = argv[2];
+  height_arg = argv[3];
+  infile_arg = argv[4];
+  outfile_arg = argv[5];
+  keyframe_interval_arg = argv[6];
+
+  encoder = get_vpx_encoder_by_name(codec_arg);
+  if (!encoder)
+    die("Unsupported codec.");
+
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = strtol(width_arg, NULL, 0);
+  info.frame_height = strtol(height_arg, NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 ||
+      info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 ||
+      (info.frame_height % 2) != 0)
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
+                     info.frame_height, 1))
+    die("Failed to allocate image.");
+
+  keyframe_interval = strtol(keyframe_interval_arg, NULL, 0);
+  if (keyframe_interval < 0)
+    die("Invalid keyframe interval value.");
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+  res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res)  // codec is still uninitialized here, so die_codec() is off limits
+    die("Failed to get default codec config: %s", vpx_codec_err_to_string(res));
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.rc_target_bitrate = bitrate;
+  cfg.g_error_resilient = argc > 7 ? strtol(argv[7], NULL, 0) : 0;
+
+  writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing.", outfile_arg);
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading.", infile_arg);
+
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  // Encode frames.
+  while (vpx_img_read(&raw, infile)) {
+    int flags = 0;
+    if (keyframe_interval > 0 && frame_count % keyframe_interval == 0)
+      flags |= VPX_EFLAG_FORCE_KF;
+    encode_frame(&codec, &raw, frame_count++, flags, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 0, writer)) {}
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_count);
+
+  vpx_img_free(&raw);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  vpx_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libvpx/examples/twopass_encoder.c b/libs/libvpx/examples/twopass_encoder.c
new file mode 100644
index 0000000000..aecc11d3f4
--- /dev/null
+++ b/libs/libvpx/examples/twopass_encoder.c
@@ -0,0 +1,265 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Two Pass Encoder
+// ================
+//
+// This is an example of a two pass encoder loop. It takes an input file in
+// I420 format, passes it through the encoder twice, and writes the compressed
+// frames to disk in IVF format. It builds upon the simple_encoder example.
+//
+// Twopass Variables
+// -----------------
+// Twopass mode needs to track the current pass number and the buffer of
+// statistics packets.
+//
+// Updating The Configuration
+// ---------------------------------
+// In two pass mode, the configuration has to be updated on each pass. The
+// statistics buffer is passed on the last pass.
+//
+// Encoding A Frame
+// ----------------
+// Encoding a frame in two pass mode is identical to the simple encoder
+// example. To increase the quality while sacrificing encoding speed,
+// VPX_DL_BEST_QUALITY can be used in place of VPX_DL_GOOD_QUALITY.
+//
+// Processing Statistics Packets
+// -----------------------------
+// Each packet of type `VPX_CODEC_STATS_PKT` contains first-pass statistics
+// for this frame. We append its data to the end of the statistics buffer.
+//
+//
+// Pass Progress Reporting
+// -----------------------------
+// It's sometimes helpful to see when each pass completes.
+//
+//
+// Clean-up
+// -----------------------------
+// Destruction of the encoder instance must be done on each pass. The
+// raw image should be destroyed at the end as usual.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vpx_encoder.h"
+
+#include "../tools_common.h"
+#include "../video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {  // Prints the expected command line to stderr.
+  fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
+          exec_name);
+  exit(EXIT_FAILURE);  // Does not return; ends the process with failure.
+}
+
+// Submits |img| (NULL flushes) to the first-pass encoder and appends each stats
+// packet to |stats|. Returns 1 if the encoder produced any packet, else 0.
+static int get_frame_stats(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
+                           vpx_codec_pts_t pts, unsigned int duration,
+                           vpx_enc_frame_flags_t flags, unsigned int deadline,
+                           vpx_fixed_buf_t *stats) {
+  int got_pkts = 0;
+  vpx_codec_iter_t iter = NULL;
+  const vpx_codec_cx_pkt_t *pkt = NULL;
+  const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags,
+                                               deadline);
+  if (res != VPX_CODEC_OK)
+    die_codec(ctx, "Failed to get frame stats.");
+
+  while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+    if (pkt->kind == VPX_CODEC_STATS_PKT && pkt->data.twopass_stats.sz > 0) {
+      const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
+      const size_t pkt_size = pkt->data.twopass_stats.sz;
+      // Use a temporary so a failed realloc() is caught before memcpy() runs.
+      void *const grown = realloc(stats->buf, stats->sz + pkt_size);
+      if (!grown) die("Failed to grow the first-pass stats buffer.");
+      stats->buf = grown;
+      memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size);
+      stats->sz += pkt_size;
+    }
+  }
+  return got_pkts;
+}
+
+// Encodes |img| (NULL drains the encoder) and writes every compressed frame
+// packet to |writer|, printing 'K' for key frames and '.' for all others.
+// Returns 1 when the encoder handed back at least one packet, otherwise 0.
+static int encode_frame(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
+                        vpx_codec_pts_t pts, unsigned int duration,
+                        vpx_enc_frame_flags_t flags, unsigned int deadline,
+                        VpxVideoWriter *writer) {
+  const vpx_codec_cx_pkt_t *packet;
+  vpx_codec_iter_t cursor = NULL;
+  int produced = 0;
+
+  if (vpx_codec_encode(ctx, img, pts, duration, flags, deadline) !=
+      VPX_CODEC_OK)
+    die_codec(ctx, "Failed to encode frame.");
+
+  for (packet = vpx_codec_get_cx_data(ctx, &cursor); packet != NULL;
+       packet = vpx_codec_get_cx_data(ctx, &cursor)) {
+    produced = 1;
+    if (packet->kind != VPX_CODEC_CX_FRAME_PKT)
+      continue;  // Only compressed frames go to the output file.
+
+    if (!vpx_video_writer_write_frame(writer, packet->data.frame.buf,
+                                      packet->data.frame.sz,
+                                      packet->data.frame.pts))
+      die_codec(ctx, "Failed to write compressed frame.");
+    printf((packet->data.frame.flags & VPX_FRAME_IS_KEY) ? "K" : ".");
+    fflush(stdout);
+  }
+
+  return produced;
+}
+
+// First pass: runs every frame of |infile| through a freshly created encoder
+// and returns the statistics buffer accumulated by get_frame_stats().
+static vpx_fixed_buf_t pass0(vpx_image_t *raw, FILE *infile,
+                             const VpxInterface *encoder,
+                             const vpx_codec_enc_cfg_t *cfg) {
+  vpx_fixed_buf_t collected = {NULL, 0};
+  vpx_codec_ctx_t enc;
+  int frames_seen;
+
+  if (vpx_codec_enc_init(&enc, encoder->codec_interface(), cfg, 0))
+    die_codec(&enc, "Failed to initialize encoder");
+
+  // Gather statistics; the pts handed to the encoder is the 1-based index.
+  for (frames_seen = 0; vpx_img_read(raw, infile);) {
+    ++frames_seen;
+    get_frame_stats(&enc, raw, frames_seen, 1, 0, VPX_DL_GOOD_QUALITY,
+                    &collected);
+  }
+
+  // Drain the encoder until it stops producing packets.
+  while (get_frame_stats(&enc, NULL, frames_seen, 1, 0,
+                         VPX_DL_GOOD_QUALITY, &collected)) {}
+
+  printf("Pass 0 complete. Processed %d frames.\n", frames_seen);
+  if (vpx_codec_destroy(&enc))
+    die_codec(&enc, "Failed to destroy codec.");
+  return collected;
+}
+
+// Final pass: encodes each frame read from |infile| with |cfg| and writes the
+// compressed frames to an IVF file named |outfile_name|.
+static void pass1(vpx_image_t *raw, FILE *infile, const char *outfile_name,
+                  const VpxInterface *encoder,
+                  const vpx_codec_enc_cfg_t *cfg) {
+  // Container header values come straight from the encoder configuration.
+  VpxVideoInfo info = {
+    encoder->fourcc,
+    cfg->g_w,
+    cfg->g_h,
+    {cfg->g_timebase.num, cfg->g_timebase.den}
+  };
+  VpxVideoWriter *const out =
+      vpx_video_writer_open(outfile_name, kContainerIVF, &info);
+  vpx_codec_ctx_t enc;
+  int frames_done;
+
+  if (out == NULL)
+    die("Failed to open %s for writing", outfile_name);
+
+  if (vpx_codec_enc_init(&enc, encoder->codec_interface(), cfg, 0))
+    die_codec(&enc, "Failed to initialize encoder");
+
+  // Encode every frame; the pts passed along is the 1-based frame index.
+  for (frames_done = 0; vpx_img_read(raw, infile);) {
+    ++frames_done;
+    encode_frame(&enc, raw, frames_done, 1, 0, VPX_DL_GOOD_QUALITY, out);
+  }
+
+  // Drain whatever the encoder still holds.
+  while (encode_frame(&enc, NULL, -1, 1, 0, VPX_DL_GOOD_QUALITY, out)) {}
+
+  printf("\n");
+  if (vpx_codec_destroy(&enc))
+    die_codec(&enc, "Failed to destroy codec.");
+
+  vpx_video_writer_close(out);
+
+  printf("Pass 1 complete. Processed %d frames.\n", frames_done);
+}
+
+// Usage: <codec> <width> <height> <infile> <outfile>. Runs the first pass over
+// the input to collect statistics, then the last pass to write the IVF file.
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  int w, h;
+  vpx_codec_enc_cfg_t cfg;
+  vpx_image_t raw;
+  vpx_codec_err_t res;
+  vpx_fixed_buf_t stats;
+
+  const VpxInterface *encoder = NULL;
+  const int fps = 30;        // TODO(dkovalev) add command line argument
+  const int bitrate = 200;   // kbit/s TODO(dkovalev) add command line argument
+  const char *codec_arg, *width_arg, *height_arg, *infile_arg, *outfile_arg;
+  exec_name = argv[0];
+  // Validate argc before reading argv[1..5]; they only exist when argc is 6.
+  if (argc != 6)
+    die("Invalid number of arguments.");
+  codec_arg = argv[1];
+  width_arg = argv[2];
+  height_arg = argv[3];
+  infile_arg = argv[4];
+  outfile_arg = argv[5];
+
+  encoder = get_vpx_encoder_by_name(codec_arg);
+  if (!encoder)
+    die("Unsupported codec.");
+
+  w = strtol(width_arg, NULL, 0);
+  h = strtol(height_arg, NULL, 0);
+
+  if (w  <= 0 || h <= 0 || (w % 2) != 0 || (h  % 2) != 0)
+    die("Invalid frame size: %dx%d", w, h);
+
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1))
+    die("Failed to allocate image %dx%d", w, h);
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+  // Configuration. No codec context exists yet, so report |res| via die().
+  res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res)
+    die("Failed to get default codec config: %s", vpx_codec_err_to_string(res));
+
+  cfg.g_w = w;
+  cfg.g_h = h;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = fps;
+  cfg.rc_target_bitrate = bitrate;
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading", infile_arg);
+
+  // Pass 0
+  cfg.g_pass = VPX_RC_FIRST_PASS;
+  stats = pass0(&raw, infile, encoder, &cfg);
+
+  // Pass 1
+  rewind(infile);
+  cfg.g_pass = VPX_RC_LAST_PASS;
+  cfg.rc_twopass_stats_in = stats;
+  pass1(&raw, infile, outfile_arg, encoder, &cfg);
+  free(stats.buf);
+  vpx_img_free(&raw);
+  fclose(infile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libvpx/examples/vp8_multi_resolution_encoder.c b/libs/libvpx/examples/vp8_multi_resolution_encoder.c
new file mode 100644
index 0000000000..0248edede0
--- /dev/null
+++ b/libs/libvpx/examples/vp8_multi_resolution_encoder.c
@@ -0,0 +1,729 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * This is an example demonstrating multi-resolution encoding in VP8.
+ * High-resolution input video is down-sampled to lower-resolutions. The
+ * encoder then encodes the video and outputs multiple bitstreams with
+ * different resolutions.
+ *
+ * This test also allows for settings temporal layers for each spatial layer.
+ * Different number of temporal layers per spatial stream may be used.
+ * Currently up to 3 temporal layers per spatial stream (encoder) are supported
+ * in this test.
+ */
+
+#include "./vpx_config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include <sys/time.h>
+#include "vpx_ports/vpx_timer.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx/vp8cx.h"
+#include "vpx_ports/mem_ops.h"
+#include "../tools_common.h"
+#define interface (vpx_codec_vp8_cx())
+#define fourcc    0x30385056
+
+void usage_exit(void) {  /* Exits with failure; prints no usage text itself. */
+  exit(EXIT_FAILURE);
+}
+
+/*
+ * The input video frame is downsampled several times to generate a multi-level
+ * hierarchical structure. NUM_ENCODERS is defined as the number of encoding
+ * levels required. For example, if the size of input video is 1280x720,
+ * NUM_ENCODERS is 3, and down-sampling factor is 2, the encoder outputs 3
+ * bitstreams with resolution of 1280x720(level 0), 640x360(level 1), and
+ * 320x180(level 2) respectively.
+ */
+
+/* Number of encoders (spatial resolutions) used in this test. */
+#define NUM_ENCODERS 3
+
+/* Maximum number of temporal layers allowed for this test. */
+#define MAX_NUM_TEMPORAL_LAYERS 3
+
+/* This example uses the scaler function in libyuv. */
+#include "third_party/libyuv/include/libyuv/basic_types.h"
+#include "third_party/libyuv/include/libyuv/scale.h"
+#include "third_party/libyuv/include/libyuv/cpu_id.h"
+/* Frame reader in use; main() points it at read_frame or read_frame_by_row. */
+int (*read_frame_p)(FILE *f, vpx_image_t *img);
+
+/* Reads one whole frame with a single fread() into the buffer at planes[0].
+ * Returns 1 on success, 0 on EOF or a short read (a short read also warns). */
+static int read_frame(FILE *f, vpx_image_t *img) {
+    const size_t frame_bytes = img->w*img->h*3/2;
+    const size_t got = fread(img->planes[0], 1, frame_bytes, f);
+
+    if (got == frame_bytes)
+        return 1;
+
+    if (got != 0)
+        printf("Warning: Read partial frame. Check your width & height!\n");
+    return 0;
+}
+
+/* Reads one frame row by row, advancing by each plane's stride after every
+ * row, so the rows in memory do not have to be contiguous.
+ * The first plane in the file goes to plane 0; the second and third go to
+ * U and V, or to V and U when the image format is VPX_IMG_FMT_YV12.
+ * Returns 1 when the whole frame was read, 0 on EOF or a short read.
+ */
+static int read_frame_by_row(FILE *f, vpx_image_t *img) {
+    int plane;
+
+    for (plane = 0; plane < 3; plane++)
+    {
+        /* Planes after the first are half size in each dimension, rounded up. */
+        const int row_bytes = plane ? (1 + img->d_w) / 2 : img->d_w;
+        const int rows      = plane ? (1 + img->d_h) / 2 : img->d_h;
+        const int is_yv12   = (img->fmt == VPX_IMG_FMT_YV12);
+        unsigned char *dst;
+        int row;
+
+        /* The loop always counts in Y,U,V order, but that may not be the
+         * order of the data on disk, so pick the destination plane here.
+         */
+        if (plane == 1)
+            dst = img->planes[is_yv12 ? VPX_PLANE_V : VPX_PLANE_U];
+        else if (plane == 2)
+            dst = img->planes[is_yv12 ? VPX_PLANE_U : VPX_PLANE_V];
+        else
+            dst = img->planes[plane];
+
+        /* One fread() per row; the stride step runs only after a full row. */
+        for (row = 0; row < rows; row++, dst += img->stride[plane])
+        {
+            const size_t want = row_bytes;
+            const size_t got  = fread(dst, 1, want, f);
+
+            if (got == want)
+                continue;
+
+            /* Short read: warn only if some bytes arrived, then give up on
+             * the frame. A read of zero bytes is reported silently.
+             */
+            if (got > 0)
+                printf("Warning: Read partial frame. Check your width & height!\n");
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+/* Writes the 32-byte IVF file header to |outfile| with |frame_cnt| as length.
+ * Does nothing unless |cfg| describes a one-pass or last-pass encode. */
+static void write_ivf_file_header(FILE *outfile,
+                                  const vpx_codec_enc_cfg_t *cfg,
+                                  int frame_cnt) {
+    char hdr[32];
+    const int emits_stream = cfg->g_pass == VPX_RC_ONE_PASS ||
+                             cfg->g_pass == VPX_RC_LAST_PASS;
+
+    if (!emits_stream)
+        return;
+    memcpy(hdr, "DKIF", 4);                      /* signature */
+    mem_put_le16(hdr + 4,  0);                   /* version */
+    mem_put_le16(hdr + 6,  32);                  /* header size */
+    mem_put_le32(hdr + 8,  fourcc);              /* codec fourcc */
+    mem_put_le16(hdr + 12, cfg->g_w);            /* width */
+    mem_put_le16(hdr + 14, cfg->g_h);            /* height */
+    mem_put_le32(hdr + 16, cfg->g_timebase.den); /* rate */
+    mem_put_le32(hdr + 20, cfg->g_timebase.num); /* scale */
+    mem_put_le32(hdr + 24, frame_cnt);           /* length */
+    mem_put_le32(hdr + 28, 0);                   /* unused */
+    (void) fwrite(hdr, 1, sizeof(hdr), outfile);
+}
+
+/* Writes the 12-byte IVF frame header: the payload size, then the pts split
+ * into its low and high 32-bit words. Non-frame packets write nothing. */
+static void write_ivf_frame_header(FILE *outfile,
+                                   const vpx_codec_cx_pkt_t *pkt)
+{
+    char frame_hdr[12];
+
+    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT)
+    {
+        const vpx_codec_pts_t stamp = pkt->data.frame.pts;
+        mem_put_le32(frame_hdr,     pkt->data.frame.sz);
+        mem_put_le32(frame_hdr + 4, stamp & 0xFFFFFFFF);
+        mem_put_le32(frame_hdr + 8, stamp >> 32);
+        (void) fwrite(frame_hdr, 1, sizeof(frame_hdr), outfile);
+    }
+}
+
+/* Temporal scaling parameters.
+ * Fills |cfg|'s ts_* fields and |layer_flags| (1 entry for one layer, 8 for two
+ * or three) for |num_temporal_layers|; any value other than 1 or 2 selects the
+ * 3-layer pattern. |bitrate| is split across layers by fixed example ratios.
+ */
+static void set_temporal_layer_pattern(int num_temporal_layers,
+                                       vpx_codec_enc_cfg_t *cfg,
+                                       int bitrate,
+                                       int *layer_flags)
+{
+    assert(num_temporal_layers <= MAX_NUM_TEMPORAL_LAYERS);
+    switch (num_temporal_layers)
+    {
+    case 1:
+    {
+        /* 1-layer */
+        cfg->ts_number_layers     = 1;
+        cfg->ts_periodicity       = 1;
+        cfg->ts_rate_decimator[0] = 1;
+        cfg->ts_layer_id[0] = 0;
+        cfg->ts_target_bitrate[0] = bitrate;
+
+        // Update L only.
+        layer_flags[0] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+        break;
+    }
+
+    case 2:
+    {
+        /* 2-layers, with sync point at first frame of layer 1. */
+        cfg->ts_number_layers     = 2;
+        cfg->ts_periodicity       = 2;
+        cfg->ts_rate_decimator[0] = 2;
+        cfg->ts_rate_decimator[1] = 1;
+        cfg->ts_layer_id[0] = 0;
+        cfg->ts_layer_id[1] = 1;
+        // Use 60/40 bit allocation as example.
+        cfg->ts_target_bitrate[0] = 0.6f * bitrate;
+        cfg->ts_target_bitrate[1] = bitrate;
+
+        /* 0=L, 1=GF */
+        // ARF is used as predictor for all frames, and is only updated on
+        // key frame. Sync point every 8 frames.
+
+        // Layer 0: predict from L and ARF, update L and G.
+        layer_flags[0] = VP8_EFLAG_NO_REF_GF |
+                         VP8_EFLAG_NO_UPD_ARF;
+
+        // Layer 1: sync point: predict from L and ARF, and update G.
+        layer_flags[1] = VP8_EFLAG_NO_REF_GF |
+                         VP8_EFLAG_NO_UPD_LAST |
+                         VP8_EFLAG_NO_UPD_ARF;
+
+        // Layer 0, predict from L and ARF, update L.
+        layer_flags[2] = VP8_EFLAG_NO_REF_GF  |
+                         VP8_EFLAG_NO_UPD_GF  |
+                         VP8_EFLAG_NO_UPD_ARF;
+
+        // Layer 1: predict from L, G and ARF, and update G.
+        layer_flags[3] = VP8_EFLAG_NO_UPD_ARF |
+                         VP8_EFLAG_NO_UPD_LAST |
+                         VP8_EFLAG_NO_UPD_ENTROPY;
+
+        // Layer 0
+        layer_flags[4] = layer_flags[2];
+
+        // Layer 1
+        layer_flags[5] = layer_flags[3];
+
+        // Layer 0
+        layer_flags[6] = layer_flags[4];
+
+        // Layer 1
+        layer_flags[7] = layer_flags[5];
+        break;
+    }
+
+    case 3:
+    default:
+    {
+        // 3-layers structure where ARF is used as predictor for all frames,
+        // and is only updated on key frame.
+        // Sync points for layer 1 and 2 every 8 frames.
+        cfg->ts_number_layers     = 3;
+        cfg->ts_periodicity       = 4;
+        cfg->ts_rate_decimator[0] = 4;
+        cfg->ts_rate_decimator[1] = 2;
+        cfg->ts_rate_decimator[2] = 1;
+        cfg->ts_layer_id[0] = 0;
+        cfg->ts_layer_id[1] = 2;
+        cfg->ts_layer_id[2] = 1;
+        cfg->ts_layer_id[3] = 2;
+        // Use 40/20/40 bit allocation as example.
+        cfg->ts_target_bitrate[0] = 0.4f * bitrate;
+        cfg->ts_target_bitrate[1] = 0.6f * bitrate;
+        cfg->ts_target_bitrate[2] = bitrate;
+
+        /* 0=L, 1=GF, 2=ARF */
+
+        // Layer 0: predict from L and ARF; update L and G.
+        layer_flags[0] =  VP8_EFLAG_NO_UPD_ARF |
+                          VP8_EFLAG_NO_REF_GF;
+
+        // Layer 2: sync point: predict from L and ARF; update none.
+        layer_flags[1] = VP8_EFLAG_NO_REF_GF |
+                         VP8_EFLAG_NO_UPD_GF |
+                         VP8_EFLAG_NO_UPD_ARF |
+                         VP8_EFLAG_NO_UPD_LAST |
+                         VP8_EFLAG_NO_UPD_ENTROPY;
+
+        // Layer 1: sync point: predict from L and ARF; update G.
+        layer_flags[2] = VP8_EFLAG_NO_REF_GF |
+                         VP8_EFLAG_NO_UPD_ARF |
+                         VP8_EFLAG_NO_UPD_LAST;
+
+        // Layer 2: predict from L, G, ARF; update none.
+        layer_flags[3] = VP8_EFLAG_NO_UPD_GF |
+                         VP8_EFLAG_NO_UPD_ARF |
+                         VP8_EFLAG_NO_UPD_LAST |
+                         VP8_EFLAG_NO_UPD_ENTROPY;
+
+        // Layer 0: predict from L and ARF; update L.
+        layer_flags[4] = VP8_EFLAG_NO_UPD_GF |
+                         VP8_EFLAG_NO_UPD_ARF |
+                         VP8_EFLAG_NO_REF_GF;
+
+        // Layer 2: predict from L, G, ARF; update none.
+        layer_flags[5] = layer_flags[3];
+
+        // Layer 1: predict from L, G, ARF; update G.
+        layer_flags[6] = VP8_EFLAG_NO_UPD_ARF |
+                         VP8_EFLAG_NO_UPD_LAST;
+
+        // Layer 2: predict from L, G, ARF; update none.
+        layer_flags[7] = layer_flags[3];
+        break;
+    }
+    }
+}
+
+/* Length of the layer_flags pattern, indexed by (num temporal layers - 1). */
+static int periodicity_to_num_layers[MAX_NUM_TEMPORAL_LAYERS] = {1, 8, 8};
+/* Multi-resolution VP8 encode of <infile>; see the usage string in the body. */
+int main(int argc, char **argv)
+{
+    FILE                 *infile, *outfile[NUM_ENCODERS];
+    FILE                 *downsampled_input[NUM_ENCODERS - 1];
+    char                 filename[50];
+    vpx_codec_ctx_t      codec[NUM_ENCODERS];
+    vpx_codec_enc_cfg_t  cfg[NUM_ENCODERS];
+    int                  frame_cnt = 0;
+    vpx_image_t          raw[NUM_ENCODERS];
+    vpx_codec_err_t      res[NUM_ENCODERS];
+
+    int                  i;
+    long                 width;
+    long                 height;
+    int                  length_frame;
+    int                  frame_avail;
+    int                  got_data;
+    int                  flags = 0;
+    int                  layer_id = 0;
+
+    int                  layer_flags[VPX_TS_MAX_PERIODICITY * NUM_ENCODERS]
+                                     = {0};
+    int                  flag_periodicity;
+
+    /*Currently, only realtime mode is supported in multi-resolution encoding.*/
+    int                  arg_deadline = VPX_DL_REALTIME;
+
+    /* Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you
+       don't need to know PSNR, which will skip PSNR calculation and save
+       encoding time. */
+    int                  show_psnr = 0;
+    int                  key_frame_insert = 0;
+    uint64_t             psnr_sse_total[NUM_ENCODERS] = {0};
+    uint64_t             psnr_samples_total[NUM_ENCODERS] = {0};
+    double               psnr_totals[NUM_ENCODERS][4] = {{0,0}};
+    int                  psnr_count[NUM_ENCODERS] = {0};
+
+    double               cx_time = 0;
+    struct  timeval      tv1, tv2, difftv;
+
+    /* Set the required target bitrates for each resolution level.
+     * If target bitrate for highest-resolution level is set to 0,
+     * (i.e. target_bitrate[0]=0), we skip encoding at that level.
+     */
+    unsigned int         target_bitrate[NUM_ENCODERS]={1000, 500, 100};
+
+    /* Enter the frame rate of the input video */
+    int                  framerate = 30;
+
+    /* Set down-sampling factor for each resolution level.
+       dsf[0] controls down sampling from level 0 to level 1;
+       dsf[1] controls down sampling from level 1 to level 2;
+       dsf[2] is not used. */
+    vpx_rational_t dsf[NUM_ENCODERS] = {{2, 1}, {2, 1}, {1, 1}};
+
+    /* Set the number of temporal layers for each encoder/resolution level,
+     * starting from highest resoln down to lowest resoln. */
+    unsigned int         num_temporal_layers[NUM_ENCODERS] = {3, 3, 3};
+
+    if(argc!= (7 + 3 * NUM_ENCODERS))
+        die("Usage: %s <width> <height> <frame_rate>  <infile> <outfile(s)> "
+            "<rate_encoder(s)> <temporal_layer(s)> <key_frame_insert> <output psnr?> \n",
+            argv[0]);
+
+    printf("Using %s\n",vpx_codec_iface_name(interface));
+
+    width = strtol(argv[1], NULL, 0);
+    height = strtol(argv[2], NULL, 0);
+    framerate = strtol(argv[3], NULL, 0);
+
+    if(width < 16 || width%2 || height <16 || height%2)
+        die("Invalid resolution: %ldx%ld", width, height);
+
+    /* Open input video file for encoding */
+    if(!(infile = fopen(argv[4], "rb")))
+        die("Failed to open %s for reading", argv[4]);
+
+    /* Open output file for each encoder to output bitstreams */
+    for (i=0; i< NUM_ENCODERS; i++)
+    {
+        if(!target_bitrate[i])
+        {
+            outfile[i] = NULL;
+            continue;
+        }
+
+        if(!(outfile[i] = fopen(argv[i+5], "wb")))
+            die("Failed to open %s for writing", argv[i+5]);
+    }
+
+    // Bitrates per spatial layer: overwrite default rates above.
+    for (i=0; i< NUM_ENCODERS; i++)
+    {
+        target_bitrate[i] = strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0);
+    }
+
+    // Temporal layers per spatial layers: overwrite default settings above.
+    for (i=0; i< NUM_ENCODERS; i++)
+    {
+        num_temporal_layers[i] = strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0);
+        if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3)
+          die("Invalid temporal layers: %u, Must be 1, 2, or 3. \n",
+              num_temporal_layers[i]);
+    }
+
+    /* Open file to write out each spatially downsampled input stream. */
+    for (i=0; i< NUM_ENCODERS - 1; i++)
+    {
+       // Highest resoln is encoder 0.
+        if (sprintf(filename,"ds%d.yuv",NUM_ENCODERS - i) < 0)
+            return EXIT_FAILURE;
+        /* fwrite() on a NULL stream is undefined, so fail here instead. */
+        if(!(downsampled_input[i] = fopen(filename,"wb")))
+            die("Failed to open %s for writing", filename);
+    }
+
+    key_frame_insert = strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0);
+
+    show_psnr = strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0);
+
+    /* Populate default encoder configuration */
+    for (i=0; i< NUM_ENCODERS; i++)
+    {
+        res[i] = vpx_codec_enc_config_default(interface, &cfg[i], 0);
+        if(res[i]) {
+            printf("Failed to get config: %s\n", vpx_codec_err_to_string(res[i]));
+            return EXIT_FAILURE;
+        }
+    }
+
+    /*
+     * Update the default configuration according to needs of the application.
+     */
+    /* Highest-resolution encoder settings */
+    cfg[0].g_w = width;
+    cfg[0].g_h = height;
+    cfg[0].rc_dropframe_thresh = 0;
+    cfg[0].rc_end_usage = VPX_CBR;
+    cfg[0].rc_resize_allowed = 0;
+    cfg[0].rc_min_quantizer = 2;
+    cfg[0].rc_max_quantizer = 56;
+    cfg[0].rc_undershoot_pct = 100;
+    cfg[0].rc_overshoot_pct = 15;
+    cfg[0].rc_buf_initial_sz = 500;
+    cfg[0].rc_buf_optimal_sz = 600;
+    cfg[0].rc_buf_sz = 1000;
+    cfg[0].g_error_resilient = 1;              /* Enable error resilient mode */
+    cfg[0].g_lag_in_frames   = 0;
+
+    /* Disable automatic keyframe placement */
+    /* Note: These 3 settings are copied to all levels. But, except the lowest
+     * resolution level, all other levels are set to VPX_KF_DISABLED internally.
+     */
+    cfg[0].kf_mode           = VPX_KF_AUTO;
+    cfg[0].kf_min_dist = 3000;
+    cfg[0].kf_max_dist = 3000;
+
+    cfg[0].rc_target_bitrate = target_bitrate[0];       /* Set target bitrate */
+    cfg[0].g_timebase.num = 1;                          /* Set fps */
+    cfg[0].g_timebase.den = framerate;
+
+    /* Other-resolution encoder settings */
+    for (i=1; i< NUM_ENCODERS; i++)
+    {
+        memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t));
+
+        cfg[i].rc_target_bitrate = target_bitrate[i];
+
+        /* Note: Width & height of other-resolution encoders are calculated
+         * from the highest-resolution encoder's size and the corresponding
+         * down_sampling_factor.
+         */
+        {
+            unsigned int iw = cfg[i-1].g_w*dsf[i-1].den + dsf[i-1].num - 1;
+            unsigned int ih = cfg[i-1].g_h*dsf[i-1].den + dsf[i-1].num - 1;
+            cfg[i].g_w = iw/dsf[i-1].num;
+            cfg[i].g_h = ih/dsf[i-1].num;
+        }
+
+        /* Make width & height to be multiplier of 2. */
+        // Should support odd size ???
+        if((cfg[i].g_w)%2)cfg[i].g_w++;
+        if((cfg[i].g_h)%2)cfg[i].g_h++;
+    }
+
+
+    // Set the number of threads per encode/spatial layer.
+    // (1, 1, 1) means no encoder threading.
+    cfg[0].g_threads = 2;
+    cfg[1].g_threads = 1;
+    cfg[2].g_threads = 1;
+
+    /* Allocate image for each encoder */
+    for (i=0; i< NUM_ENCODERS; i++)
+        if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
+            die("Failed to allocate image %ux%u", cfg[i].g_w, cfg[i].g_h);
+
+    if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w)
+        read_frame_p = read_frame;
+    else
+        read_frame_p = read_frame_by_row;
+
+    for (i=0; i< NUM_ENCODERS; i++)
+        if(outfile[i])
+            write_ivf_file_header(outfile[i], &cfg[i], 0);
+
+    /* Temporal layers settings */
+    for ( i=0; i<NUM_ENCODERS; i++)
+    {
+        set_temporal_layer_pattern(num_temporal_layers[i],
+                                   &cfg[i],
+                                   cfg[i].rc_target_bitrate,
+                                   &layer_flags[i * VPX_TS_MAX_PERIODICITY]);
+    }
+
+    /* Initialize multi-encoder */
+    if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS,
+                                (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0]))
+        die_codec(&codec[0], "Failed to initialize encoder");
+
+    /* The extra encoding configuration parameters can be set as follows. */
+    /* Set encoding speed */
+    for ( i=0; i<NUM_ENCODERS; i++)
+    {
+        int speed = -6;
+        /* Lower speed for the lowest resolution. */
+        if (i == NUM_ENCODERS - 1) speed = -4;
+        if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed))
+            die_codec(&codec[i], "Failed to set cpu_used");
+    }
+
+    /* Set static threshold = 1 for all encoders */
+    for ( i=0; i<NUM_ENCODERS; i++)
+    {
+        if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, 1))
+            die_codec(&codec[i], "Failed to set static threshold");
+    }
+
+    /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */
+    /* Enable denoising for the highest-resolution encoder. */
+    if(vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1))
+        die_codec(&codec[0], "Failed to set noise_sensitivity");
+    for ( i=1; i< NUM_ENCODERS; i++)
+    {
+        if(vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0))
+            die_codec(&codec[i], "Failed to set noise_sensitivity");
+    }
+
+    /* Set the number of token partitions */
+    for ( i=0; i<NUM_ENCODERS; i++)
+    {
+        if(vpx_codec_control(&codec[i], VP8E_SET_TOKEN_PARTITIONS, 1))
+            die_codec(&codec[i], "Failed to set token partitions");
+    }
+
+    /* Set the max intra target bitrate */
+    for ( i=0; i<NUM_ENCODERS; i++)
+    {
+        /* Same cap for every encoder, derived from encoder 0's buffer size. */
+        unsigned int max_intra_size_pct =
+            (int)(((double)cfg[0].rc_buf_optimal_sz * 0.5) * framerate / 10);
+        if(vpx_codec_control(&codec[i], VP8E_SET_MAX_INTRA_BITRATE_PCT,
+                             max_intra_size_pct))
+            die_codec(&codec[i], "Failed to set max intra bitrate pct");
+    }
+
+    frame_avail = 1;
+    got_data = 0;
+
+    while(frame_avail || got_data)
+    {
+        vpx_codec_iter_t iter[NUM_ENCODERS]={NULL};
+        const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];
+
+        flags = 0;
+        frame_avail = read_frame_p(infile, &raw[0]);
+
+        if(frame_avail)
+        {
+            for ( i=1; i<NUM_ENCODERS; i++)
+            {
+                /*Scale the image down a number of times by downsampling factor*/
+                /* FilterMode 1 or 2 give better psnr than FilterMode 0. */
+                I420Scale(raw[i-1].planes[VPX_PLANE_Y], raw[i-1].stride[VPX_PLANE_Y],
+                          raw[i-1].planes[VPX_PLANE_U], raw[i-1].stride[VPX_PLANE_U],
+                          raw[i-1].planes[VPX_PLANE_V], raw[i-1].stride[VPX_PLANE_V],
+                          raw[i-1].d_w, raw[i-1].d_h,
+                          raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
+                          raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
+                          raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
+                          raw[i].d_w, raw[i].d_h, 1);
+                /* Write out down-sampled input. */
+                length_frame = cfg[i].g_w *  cfg[i].g_h *3/2;
+                if (fwrite(raw[i].planes[0], 1, length_frame,
+                           downsampled_input[NUM_ENCODERS - i - 1]) !=
+                               length_frame)
+                {
+                    return EXIT_FAILURE;
+                }
+            }
+        }
+
+        /* Set the flags (reference and update) for all the encoders.*/
+        for ( i=0; i<NUM_ENCODERS; i++)
+        {
+            layer_id = cfg[i].ts_layer_id[frame_cnt % cfg[i].ts_periodicity];
+            flags = 0;
+            flag_periodicity = periodicity_to_num_layers
+                [num_temporal_layers[i] - 1];
+            flags = layer_flags[i * VPX_TS_MAX_PERIODICITY +
+                                frame_cnt % flag_periodicity];
+            // Key frame flag for first frame.
+            if (frame_cnt == 0)
+            {
+                flags |= VPX_EFLAG_FORCE_KF;
+            }
+            if (frame_cnt > 0 && frame_cnt == key_frame_insert)
+            {
+                flags = VPX_EFLAG_FORCE_KF;
+            }
+
+            vpx_codec_control(&codec[i], VP8E_SET_FRAME_FLAGS, flags);
+            vpx_codec_control(&codec[i], VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
+        }
+
+        gettimeofday(&tv1, NULL);
+        /* Encode each frame at multi-levels */
+        /* Note the flags must be set to 0 in the encode call if they are set
+           for each frame with the vpx_codec_control(), as done above. */
+        if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
+            frame_cnt, 1, 0, arg_deadline))
+        {
+            die_codec(&codec[0], "Failed to encode frame");
+        }
+        gettimeofday(&tv2, NULL);
+        timersub(&tv2, &tv1, &difftv);
+        cx_time += (double)(difftv.tv_sec * 1000000 + difftv.tv_usec);
+        for (i=NUM_ENCODERS-1; i>=0 ; i--)
+        {
+            got_data = 0;
+            while( (pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i])) )
+            {
+                got_data = 1;
+                switch(pkt[i]->kind) {
+                    case VPX_CODEC_CX_FRAME_PKT:
+                        write_ivf_frame_header(outfile[i], pkt[i]);
+                        (void) fwrite(pkt[i]->data.frame.buf, 1,
+                                      pkt[i]->data.frame.sz, outfile[i]);
+                    break;
+                    case VPX_CODEC_PSNR_PKT:
+                        if (show_psnr)
+                        {
+                            int j;
+
+                            psnr_sse_total[i] += pkt[i]->data.psnr.sse[0];
+                            psnr_samples_total[i] += pkt[i]->data.psnr.samples[0];
+                            for (j = 0; j < 4; j++)
+                            {
+                                psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j];
+                            }
+                            psnr_count[i]++;
+                        }
+
+                        break;
+                    default:
+                        break;
+                }
+                printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT
+                       && (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":"");
+                fflush(stdout);
+            }
+        }
+        frame_cnt++;
+    }
+    printf("\n");
+    printf("FPS for encoding %d %f %f \n", frame_cnt, (float)cx_time / 1000000,
+           1000000 * (double)frame_cnt / (double)cx_time);
+
+    fclose(infile);
+    for (i=0; i< NUM_ENCODERS - 1; i++)
+        fclose(downsampled_input[i]);
+
+    printf("Processed %ld frames.\n",(long int)frame_cnt-1);
+    for (i=0; i< NUM_ENCODERS; i++)
+    {
+        /* Calculate PSNR and print it out */
+        if ( (show_psnr) && (psnr_count[i]>0) )
+        {
+            int j;
+            double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0,
+                                        psnr_sse_total[i]);
+
+            fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
+
+            fprintf(stderr, " %.3lf", ovpsnr);
+            for (j = 0; j < 4; j++)
+            {
+                fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]);
+            }
+        }
+
+        if(vpx_codec_destroy(&codec[i]))
+            die_codec(&codec[i], "Failed to destroy codec");
+
+        vpx_img_free(&raw[i]);
+
+        if(!outfile[i])
+            continue;
+
+        /* Try to rewrite the file header with the actual frame count */
+        if(!fseek(outfile[i], 0, SEEK_SET))
+            write_ivf_file_header(outfile[i], &cfg[i], frame_cnt-1);
+        fclose(outfile[i]);
+    }
+    printf("\n");
+
+    return EXIT_SUCCESS;
+}
diff --git a/libs/libvpx/examples/vp8cx_set_ref.c b/libs/libvpx/examples/vp8cx_set_ref.c
new file mode 100644
index 0000000000..8b4cc303d3
--- /dev/null
+++ b/libs/libvpx/examples/vp8cx_set_ref.c
@@ -0,0 +1,194 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+// VP8 Set Reference Frame
+// =======================
+//
+// This is an example demonstrating how to overwrite the VP8 encoder's
+// internal reference frame. In the sample we set the last frame to the
+// current frame. If this is done at a cut scene it will avoid a keyframe.
+// This technique could be used to bounce between two cameras.
+//
+// Note that the decoder would also have to set the reference frame to the
+// same value on the same frame, or the video will become corrupt.
+//
+// Usage
+// -----
+// This example adds a single argument to the `simple_encoder` example,
+// which specifies the frame number to update the reference frame on.
+// The parameter is parsed with atoi(); a result of 0 is rejected as invalid.
+//
+//
+// Extra Variables
+// ---------------
+// This example maintains the frame number passed on the command line
+// in the `update_frame_num` variable.
+//
+//
+// Configuration
+// -------------
+//
+// The reference frame is updated on the frame specified on the command
+// line.
+//
+// Observing The Effects
+// ---------------------
+// Use the `simple_encoder` example to encode a sample with a cut scene.
+// Determine the frame number of the cut scene by looking for a generated
+// key-frame (indicated by a 'K'). Supply that frame number as an argument
+// to this example, and observe that no key-frame is generated.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+
+#include "../tools_common.h"
+#include "../video_writer.h"
+
+static const char *exec_name;  // Program name for the usage text; set from argv[0] in main().
+// Prints the expected command line to stderr and terminates with EXIT_FAILURE.
+void usage_exit(void) {
+  fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile> <frame>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+// Encodes img (main() passes NULL to flush), writes each frame packet to writer; returns 1 if any packet was produced, else 0.
+static int encode_frame(vpx_codec_ctx_t *codec,
+                        vpx_image_t *img,
+                        int frame_index,
+                        VpxVideoWriter *writer) {
+  int got_pkts = 0;
+  vpx_codec_iter_t iter = NULL;
+  const vpx_codec_cx_pkt_t *pkt = NULL;
+  const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, 0,
+                                               VPX_DL_GOOD_QUALITY);
+  if (res != VPX_CODEC_OK)
+    die_codec(codec, "Failed to encode frame");
+  // Drain every packet produced by this call; only compressed-frame packets are written out.
+  while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
+      if (!vpx_video_writer_write_frame(writer,
+                                        pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts)) {
+        die_codec(codec, "Failed to write compressed frame");
+      }
+      // Progress marker on stdout: 'K' for a key frame, '.' for any other frame.
+      printf(keyframe ? "K" : ".");
+      fflush(stdout);
+    }
+  }
+
+  return got_pkts;
+}
+// Encodes raw I420 <infile> to an IVF <outfile> with VP8, overwriting the LAST reference with the input at frame number <frame>.
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  vpx_codec_ctx_t codec = {0};
+  vpx_codec_enc_cfg_t cfg = {0};
+  int frame_count = 0;
+  vpx_image_t raw;
+  vpx_codec_err_t res;
+  VpxVideoInfo info = {0};
+  VpxVideoWriter *writer = NULL;
+  const VpxInterface *encoder = NULL;
+  int update_frame_num = 0;
+  const int fps = 30;        // TODO(dkovalev) add command line argument
+  const int bitrate = 200;   // kbit/s TODO(dkovalev) add command line argument
+
+  exec_name = argv[0];
+
+  if (argc != 6)
+    die("Invalid number of arguments");
+
+  // TODO(dkovalev): add vp9 support and rename the file accordingly
+  encoder = get_vpx_encoder_by_name("vp8");
+  if (!encoder)
+    die("Unsupported codec.");
+
+  update_frame_num = atoi(argv[5]);
+  if (!update_frame_num)
+    die("Couldn't parse frame number '%s'\n", argv[5]);
+
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = strtol(argv[1], NULL, 0);
+  info.frame_height = strtol(argv[2], NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+  // Both dimensions must be positive and even.
+  if (info.frame_width <= 0 ||
+      info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 ||
+      (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
+                                             info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+  res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res)  // codec is not initialized yet, so report res itself rather than the context
+    die("Failed to get default codec config: %s", vpx_codec_err_to_string(res));
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.rc_target_bitrate = bitrate;
+
+  writer = vpx_video_writer_open(argv[4], kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing.", argv[4]);
+
+  if (!(infile = fopen(argv[3], "rb")))
+    die("Failed to open %s for reading.", argv[3]);
+
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  // Encode frames.
+  while (vpx_img_read(&raw, infile)) {
+    if (frame_count + 1 == update_frame_num) {  // <frame> is 1-based; frame_count is 0-based here
+      vpx_ref_frame_t ref;
+      ref.frame_type = VP8_LAST_FRAME;
+      ref.img = raw;
+      if (vpx_codec_control(&codec, VP8_SET_REFERENCE, &ref))
+        die_codec(&codec, "Failed to set reference frame");
+    }
+
+    encode_frame(&codec, &raw, frame_count++, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, writer)) {}
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_count);
+
+  vpx_img_free(&raw);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  vpx_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libvpx/examples/vp9_lossless_encoder.c b/libs/libvpx/examples/vp9_lossless_encoder.c
new file mode 100644
index 0000000000..8272516830
--- /dev/null
+++ b/libs/libvpx/examples/vp9_lossless_encoder.c
@@ -0,0 +1,144 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vpx_encoder.h"
+#include "vpx/vp8cx.h"
+
+#include "../tools_common.h"
+#include "../video_writer.h"
+
+static const char *exec_name;  // Program name for the usage text; set from argv[0] in main().
+// Prints a one-line description and the expected command line to stderr, then exits with EXIT_FAILURE.
+void usage_exit(void) {
+  fprintf(stderr, "vp9_lossless_encoder: Example demonstrating VP9 lossless "
+                  "encoding feature. Supports raw input only.\n");
+  fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n", exec_name);
+  exit(EXIT_FAILURE);
+}
+// Encodes img (main() passes NULL to flush) with the given flags, writes each frame packet to writer; returns 1 if any packet was produced.
+static int encode_frame(vpx_codec_ctx_t *codec,
+                        vpx_image_t *img,
+                        int frame_index,
+                        int flags,
+                        VpxVideoWriter *writer) {
+  int got_pkts = 0;
+  vpx_codec_iter_t iter = NULL;
+  const vpx_codec_cx_pkt_t *pkt = NULL;
+  const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1,
+                                               flags, VPX_DL_GOOD_QUALITY);
+  if (res != VPX_CODEC_OK)
+    die_codec(codec, "Failed to encode frame");
+  // Drain every packet produced by this call; only compressed-frame packets are written out.
+  while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+    got_pkts = 1;
+    // Progress marker printed below: 'K' for a key frame, '.' for any other frame.
+    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
+      if (!vpx_video_writer_write_frame(writer,
+                                        pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts)) {
+        die_codec(codec, "Failed to write compressed frame");
+      }
+      printf(keyframe ? "K" : ".");
+      fflush(stdout);
+    }
+  }
+
+  return got_pkts;
+}
+// Encodes raw I420 <infile> to an IVF <outfile> with VP9 in lossless mode at a fixed 30 fps timebase.
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  vpx_codec_ctx_t codec;
+  vpx_codec_enc_cfg_t cfg;
+  int frame_count = 0;
+  vpx_image_t raw;
+  vpx_codec_err_t res;
+  VpxVideoInfo info = {0};
+  VpxVideoWriter *writer = NULL;
+  const VpxInterface *encoder = NULL;
+  const int fps = 30;
+
+  exec_name = argv[0];
+
+  if (argc < 5)
+    die("Invalid number of arguments");
+
+  encoder = get_vpx_encoder_by_name("vp9");
+  if (!encoder)
+    die("Unsupported codec.");
+
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = strtol(argv[1], NULL, 0);
+  info.frame_height = strtol(argv[2], NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+  // Both dimensions must be positive and even.
+  if (info.frame_width <= 0 ||
+      info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 ||
+      (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
+                                             info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+  res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res)  // codec is still uninitialized stack memory here; never hand it to die_codec()
+    die("Failed to get default codec config: %s", vpx_codec_err_to_string(res));
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+
+  writer = vpx_video_writer_open(argv[4], kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing.", argv[4]);
+
+  if (!(infile = fopen(argv[3], "rb")))
+    die("Failed to open %s for reading.", argv[3]);
+
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+  // Lossless mode is switched on through a codec control after initialization.
+  if (vpx_codec_control_(&codec, VP9E_SET_LOSSLESS, 1))
+    die_codec(&codec, "Failed to use lossless mode");
+
+  // Encode frames.
+  while (vpx_img_read(&raw, infile)) {
+    encode_frame(&codec, &raw, frame_count++, 0, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 0, writer)) {}
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_count);
+
+  vpx_img_free(&raw);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  vpx_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libvpx/examples/vp9_spatial_svc_encoder.c b/libs/libvpx/examples/vp9_spatial_svc_encoder.c
new file mode 100644
index 0000000000..271ab704b6
--- /dev/null
+++ b/libs/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -0,0 +1,919 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * This is an example demonstrating how to implement a multi-layer
+ * VP9 encoding scheme based on spatial scalability for video applications
+ * that benefit from a scalable bitstream.
+ */
+
+#include <math.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+
+#include "../args.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
+
+#include "../vpx_ports/vpx_timer.h"
+#include "vpx/svc_context.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+#include "../vpxstats.h"
+#include "vp9/encoder/vp9_encoder.h"
+#define OUTPUT_RC_STATS 1
+// Command-line options. NOTE(review): ARG_DEF arguments look like (short name, long name, has-value, help text) - confirm in ../args.h.
+static const arg_def_t skip_frames_arg =
+    ARG_DEF("s", "skip-frames", 1, "input frames to skip");
+static const arg_def_t frames_arg =
+    ARG_DEF("f", "frames", 1, "number of frames to encode");
+static const arg_def_t threads_arg =
+    ARG_DEF("th", "threads", 1, "number of threads to use");
+#if OUTPUT_RC_STATS
+static const arg_def_t output_rc_stats_arg =
+    ARG_DEF("rcstat", "output_rc_stats", 1, "output rc stats");
+#endif
+static const arg_def_t width_arg = ARG_DEF("w", "width", 1, "source width");
+static const arg_def_t height_arg = ARG_DEF("h", "height", 1, "source height");
+static const arg_def_t timebase_arg =
+    ARG_DEF("t", "timebase", 1, "timebase (num/den)");
+static const arg_def_t bitrate_arg = ARG_DEF(
+    "b", "target-bitrate", 1, "encoding bitrate, in kilobits per second");
+static const arg_def_t spatial_layers_arg =
+    ARG_DEF("sl", "spatial-layers", 1, "number of spatial SVC layers");
+static const arg_def_t temporal_layers_arg =
+    ARG_DEF("tl", "temporal-layers", 1, "number of temporal SVC layers");
+static const arg_def_t temporal_layering_mode_arg =
+    ARG_DEF("tlm", "temporal-layering-mode", 1, "temporal layering scheme. "
+        "VP9E_TEMPORAL_LAYERING_MODE");  // adjacent literals concatenate, hence the trailing space above
+static const arg_def_t kf_dist_arg =
+    ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes");
+static const arg_def_t scale_factors_arg =
+    ARG_DEF("r", "scale-factors", 1, "scale factors (lowest to highest layer)");
+static const arg_def_t passes_arg =
+    ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
+static const arg_def_t pass_arg =
+    ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
+static const arg_def_t fpf_name_arg =
+    ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
+static const arg_def_t min_q_arg =
+    ARG_DEF(NULL, "min-q", 1, "Minimum quantizer");
+static const arg_def_t max_q_arg =
+    ARG_DEF(NULL, "max-q", 1, "Maximum quantizer");
+static const arg_def_t min_bitrate_arg =
+    ARG_DEF(NULL, "min-bitrate", 1, "Minimum bitrate");
+static const arg_def_t max_bitrate_arg =
+    ARG_DEF(NULL, "max-bitrate", 1, "Maximum bitrate");
+static const arg_def_t lag_in_frame_arg =
+    ARG_DEF(NULL, "lag-in-frames", 1, "Number of frame to input before "
+        "generating any outputs");
+static const arg_def_t rc_end_usage_arg =
+    ARG_DEF(NULL, "rc-end-usage", 1, "0 - 3: VBR, CBR, CQ, Q");
+static const arg_def_t speed_arg =
+    ARG_DEF("sp", "speed", 1, "speed configuration");
+static const arg_def_t aqmode_arg =
+    ARG_DEF("aq", "aqmode", 1, "aq-mode off/on");
+// The bit-depth option exists only in high-bit-depth builds.
+#if CONFIG_VP9_HIGHBITDEPTH
+static const struct arg_enum_list bitdepth_enum[] = {
+  {"8",  VPX_BITS_8},
+  {"10", VPX_BITS_10},
+  {"12", VPX_BITS_12},
+  {NULL, 0}
+};
+
+static const arg_def_t bitdepth_arg =
+    ARG_DEF_ENUM("d", "bit-depth", 1, "Bit depth for codec 8, 10 or 12. ",
+                 bitdepth_enum);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// NULL-terminated table of every option; usage_exit() passes it to arg_show_usage().
+static const arg_def_t *svc_args[] = {
+  &frames_arg,        &width_arg,         &height_arg,
+  &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &spatial_layers_arg,
+  &kf_dist_arg,       &scale_factors_arg, &passes_arg,      &pass_arg,
+  &fpf_name_arg,      &min_q_arg,         &max_q_arg,       &min_bitrate_arg,
+  &max_bitrate_arg,   &temporal_layers_arg, &temporal_layering_mode_arg,
+  &lag_in_frame_arg,  &threads_arg,       &aqmode_arg,
+#if OUTPUT_RC_STATS
+  &output_rc_stats_arg,
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  &bitdepth_arg,
+#endif
+  &speed_arg,
+  &rc_end_usage_arg,  NULL
+};
+// Built-in defaults that parse_command_line() applies before it looks at any option.
+static const uint32_t default_frames_to_skip = 0;
+static const uint32_t default_frames_to_code = 60 * 60;  // 3600 frames
+static const uint32_t default_width = 1920;
+static const uint32_t default_height = 1080;
+static const uint32_t default_timebase_num = 1;
+static const uint32_t default_timebase_den = 60;
+static const uint32_t default_bitrate = 1000;  // presumably kbit/s, matching the target-bitrate help text
+static const uint32_t default_spatial_layers = 5;
+static const uint32_t default_temporal_layers = 1;
+static const uint32_t default_kf_dist = 100;  // used for both kf_min_dist and kf_max_dist
+static const uint32_t default_temporal_layering_mode = 0;
+static const uint32_t default_output_rc_stats = 0;
+static const int32_t default_speed = -1;  // -1 means use library default.
+static const uint32_t default_threads = 0;  // zero means use library default.
+// Application-level settings that parse_command_line() fills in next to the encoder configuration.
+typedef struct {
+  const char *input_filename;   // first positional argument
+  const char *output_filename;  // second positional argument
+  uint32_t frames_to_code;      // "frames" option
+  uint32_t frames_to_skip;      // "skip-frames" option
+  struct VpxInputContext input_ctx;  // not touched by parse_command_line(); presumably set up by the caller - confirm
+  stats_io_t rc_stats;  // first-pass statistics store; opened only on the two-pass path
+  int passes;  // assigned by parse_command_line() only on the two-pass path
+  int pass;    // assigned by parse_command_line() only on the two-pass path
+} AppInput;
+
+static const char *exec_name;  // Program name printed in the usage line.
+// Prints the usage line and every option in svc_args to stderr, then exits with EXIT_FAILURE.
+void usage_exit(void) {
+  fprintf(stderr, "Usage: %s <options> input_filename output_filename\n",
+          exec_name);
+  fprintf(stderr, "Options:\n");
+  arg_show_usage(stderr, svc_args);
+  exit(EXIT_FAILURE);
+}
+// Fills app_input, svc_ctx and enc_cfg from built-in defaults overridden by the options in argv_; dies on invalid input.
+static void parse_command_line(int argc, const char **argv_,
+                               AppInput *app_input, SvcContext *svc_ctx,
+                               vpx_codec_enc_cfg_t *enc_cfg) {
+  struct arg arg = {0};
+  char **argv = NULL;
+  char **argi = NULL;
+  char **argj = NULL;
+  vpx_codec_err_t res;
+  int passes = 0;
+  int pass = 0;
+  const char *fpf_file_name = NULL;
+  unsigned int min_bitrate = 0;
+  unsigned int max_bitrate = 0;
+  char string_options[1024] = {0};
+
+  // initialize SvcContext with parameters that will be passed to vpx_svc_init
+  svc_ctx->log_level = SVC_LOG_DEBUG;
+  svc_ctx->spatial_layers = default_spatial_layers;
+  svc_ctx->temporal_layers = default_temporal_layers;
+  svc_ctx->temporal_layering_mode = default_temporal_layering_mode;
+#if OUTPUT_RC_STATS
+  svc_ctx->output_rc_stat = default_output_rc_stats;
+#endif
+  svc_ctx->speed = default_speed;
+  svc_ctx->threads = default_threads;
+
+  // start with default encoder configuration
+  res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0);
+  if (res) {
+    die("Failed to get config: %s\n", vpx_codec_err_to_string(res));
+  }
+  // update enc_cfg with app default values
+  enc_cfg->g_w = default_width;
+  enc_cfg->g_h = default_height;
+  enc_cfg->g_timebase.num = default_timebase_num;
+  enc_cfg->g_timebase.den = default_timebase_den;
+  enc_cfg->rc_target_bitrate = default_bitrate;
+  enc_cfg->kf_min_dist = default_kf_dist;
+  enc_cfg->kf_max_dist = default_kf_dist;
+  enc_cfg->rc_end_usage = VPX_CQ;
+
+  // initialize AppInput with default values
+  app_input->frames_to_code = default_frames_to_code;
+  app_input->frames_to_skip = default_frames_to_skip;
+
+  // process command line options; arguments matching no option are compacted to the front of argv through argj
+  argv = argv_dup(argc - 1, argv_ + 1);
+  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+    arg.argv_step = 1;
+
+    if (arg_match(&arg, &frames_arg, argi)) {
+      app_input->frames_to_code = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &width_arg, argi)) {
+      enc_cfg->g_w = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &height_arg, argi)) {
+      enc_cfg->g_h = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &timebase_arg, argi)) {
+      enc_cfg->g_timebase = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &bitrate_arg, argi)) {
+      enc_cfg->rc_target_bitrate = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &skip_frames_arg, argi)) {
+      app_input->frames_to_skip = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &spatial_layers_arg, argi)) {
+      svc_ctx->spatial_layers = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &temporal_layers_arg, argi)) {
+      svc_ctx->temporal_layers = arg_parse_uint(&arg);
+#if OUTPUT_RC_STATS
+    } else if (arg_match(&arg, &output_rc_stats_arg, argi)) {
+      svc_ctx->output_rc_stat = arg_parse_uint(&arg);
+#endif
+    } else if (arg_match(&arg, &speed_arg, argi)) {
+      svc_ctx->speed = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &aqmode_arg, argi)) {
+      svc_ctx->aqmode = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &threads_arg, argi)) {
+      svc_ctx->threads = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &temporal_layering_mode_arg, argi)) {
+      svc_ctx->temporal_layering_mode =
+          enc_cfg->temporal_layering_mode = arg_parse_int(&arg);
+      if (svc_ctx->temporal_layering_mode) {
+        enc_cfg->g_error_resilient = 1;
+      }
+    } else if (arg_match(&arg, &kf_dist_arg, argi)) {
+      enc_cfg->kf_min_dist = arg_parse_uint(&arg);
+      enc_cfg->kf_max_dist = enc_cfg->kf_min_dist;
+    } else if (arg_match(&arg, &scale_factors_arg, argi)) {
+      const size_t len = strlen(string_options);  // append at the end; snprintf's dst must not also be a source
+      snprintf(string_options + len, sizeof(string_options) - len, " scale-factors=%s", arg.val);
+    } else if (arg_match(&arg, &passes_arg, argi)) {
+      passes = arg_parse_uint(&arg);
+      if (passes < 1 || passes > 2) {
+        die("Error: Invalid number of passes (%d)\n", passes);
+      }
+    } else if (arg_match(&arg, &pass_arg, argi)) {
+      pass = arg_parse_uint(&arg);
+      if (pass < 1 || pass > 2) {
+        die("Error: Invalid pass selected (%d)\n", pass);
+      }
+    } else if (arg_match(&arg, &fpf_name_arg, argi)) {
+      fpf_file_name = arg.val;
+    } else if (arg_match(&arg, &min_q_arg, argi)) {
+      const size_t len = strlen(string_options);  // append at the end (see scale-factors above)
+      snprintf(string_options + len, sizeof(string_options) - len, " min-quantizers=%s", arg.val);
+    } else if (arg_match(&arg, &max_q_arg, argi)) {
+      const size_t len = strlen(string_options);  // append at the end (see scale-factors above)
+      snprintf(string_options + len, sizeof(string_options) - len, " max-quantizers=%s", arg.val);
+    } else if (arg_match(&arg, &min_bitrate_arg, argi)) {
+      min_bitrate = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &max_bitrate_arg, argi)) {
+      max_bitrate = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &lag_in_frame_arg, argi)) {
+      enc_cfg->g_lag_in_frames = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &rc_end_usage_arg, argi)) {
+      enc_cfg->rc_end_usage = arg_parse_uint(&arg);
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else if (arg_match(&arg, &bitdepth_arg, argi)) {
+      enc_cfg->g_bit_depth = arg_parse_enum_or_int(&arg);
+      switch (enc_cfg->g_bit_depth) {
+        case VPX_BITS_8:
+          enc_cfg->g_input_bit_depth = 8;
+          enc_cfg->g_profile = 0;
+          break;
+        case VPX_BITS_10:
+          enc_cfg->g_input_bit_depth = 10;
+          enc_cfg->g_profile = 2;
+          break;
+        case VPX_BITS_12:
+          enc_cfg->g_input_bit_depth = 12;
+          enc_cfg->g_profile = 2;
+          break;
+        default:
+          die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth);
+          break;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      ++argj;
+    }
+  }
+
+  // There will be a space in front of the string options
+  if (strlen(string_options) > 0)
+    vpx_svc_set_options(svc_ctx, string_options + 1);
+
+  if (passes == 0 || passes == 1) {
+    if (pass) {
+      fprintf(stderr, "pass is ignored since there's only one pass\n");
+    }
+    enc_cfg->g_pass = VPX_RC_ONE_PASS;  // app_input->passes / ->pass are not assigned on this path
+  } else {
+    if (pass == 0) {
+      die("pass must be specified when passes is 2\n");
+    }
+
+    if (fpf_file_name == NULL) {
+      die("fpf must be specified when passes is 2\n");
+    }
+
+    if (pass == 1) {
+      enc_cfg->g_pass = VPX_RC_FIRST_PASS;
+      if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 0)) {
+        fatal("Failed to open statistics store");
+      }
+    } else {
+      enc_cfg->g_pass = VPX_RC_LAST_PASS;
+      if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 1)) {
+        fatal("Failed to open statistics store");
+      }
+      enc_cfg->rc_twopass_stats_in = stats_get(&app_input->rc_stats);
+    }
+    app_input->passes = passes;
+    app_input->pass = pass;
+  }
+  // min/max bitrate are stored as percentages of the target bitrate.
+  if (enc_cfg->rc_target_bitrate > 0) {
+    if (min_bitrate > 0) {
+      enc_cfg->rc_2pass_vbr_minsection_pct =
+          min_bitrate * 100 / enc_cfg->rc_target_bitrate;
+    }
+    if (max_bitrate > 0) {
+      enc_cfg->rc_2pass_vbr_maxsection_pct =
+          max_bitrate * 100 / enc_cfg->rc_target_bitrate;
+    }
+  }
+
+  // Check for unrecognized options
+  for (argi = argv; *argi; ++argi)
+    if (argi[0][0] == '-' && strlen(argi[0]) > 1)
+      die("Error: Unrecognized option %s\n", *argi);
+  // The two arguments left over are the input and output file names.
+  if (argv[0] == NULL || argv[1] == 0) {
+    usage_exit();
+  }
+  app_input->input_filename = argv[0];
+  app_input->output_filename = argv[1];
+  free(argv);
+
+  if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 ||
+      enc_cfg->g_h % 2)
+    die("Invalid resolution: %d x %d\n", enc_cfg->g_w, enc_cfg->g_h);
+
+  printf(
+      "Codec %s\nframes: %d, skip: %d\n"
+      "layers: %d\n"
+      "width %d, height: %d,\n"
+      "num: %d, den: %d, bitrate: %d,\n"
+      "gop size: %d\n",
+      vpx_codec_iface_name(vpx_codec_vp9_cx()), app_input->frames_to_code,
+      app_input->frames_to_skip,
+      svc_ctx->spatial_layers, enc_cfg->g_w, enc_cfg->g_h,
+      enc_cfg->g_timebase.num, enc_cfg->g_timebase.den,
+      enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist);
+}
+
+#if OUTPUT_RC_STATS
+// Accumulators for the rate control encoding stats; per-layer arrays are indexed by sl * ts_number_layers + tl.
+struct RateControlStats {
+  // Number of input frames per layer.
+  int layer_input_frames[VPX_MAX_LAYERS];
+  // Total (cumulative) number of encoded frames per layer.
+  int layer_tot_enc_frames[VPX_MAX_LAYERS];
+  // Number of encoded non-key frames per layer.
+  int layer_enc_frames[VPX_MAX_LAYERS];
+  // Framerate per layer (cumulative).
+  double layer_framerate[VPX_MAX_LAYERS];
+  // Target average frame size per layer (per-frame-bandwidth per layer).
+  double layer_pfb[VPX_MAX_LAYERS];
+  // Actual average frame size per layer.
+  double layer_avg_frame_size[VPX_MAX_LAYERS];
+  // Average rate mismatch per layer (|target - actual| / target).
+  double layer_avg_rate_mismatch[VPX_MAX_LAYERS];
+  // Actual encoding bitrate per layer (cumulative).
+  double layer_encoding_bitrate[VPX_MAX_LAYERS];
+  // Average of the short-time encoder actual bitrate.
+  // TODO(marpan): Should we add these short-time stats for each layer?
+  double avg_st_encoding_bitrate;
+  // Variance of the short-time encoder actual bitrate.
+  double variance_st_encoding_bitrate;
+  // Window (number of frames) for computing short-time encoding bitrate; set_rate_control_stats() sets it to 15.
+  int window_size;
+  // Number of window measurements.
+  int window_count;
+};
+// Resets *rc and derives every layer's framerate and target per-frame bandwidth from cfg.
+// Note: these rate control stats assume only 1 key frame in the
+// sequence (i.e., first frame only).
+static void set_rate_control_stats(struct RateControlStats *rc,
+                                   vpx_codec_enc_cfg_t *cfg) {
+  unsigned int sl, tl;
+  // Set the layer (cumulative) framerate and the target layer (non-cumulative)
+  // per-frame-bandwidth, for the rate control encoding stats below.
+  const double framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+  // The cast keeps the fraction for timebases such as 1001/30000 (plain int division gave 29).
+  for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
+    for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
+      const int layer = sl * cfg->ts_number_layers + tl;
+      const int tlayer0 = sl * cfg->ts_number_layers;
+      if (cfg->ts_number_layers == 1)
+        rc->layer_framerate[layer] = framerate;
+      else
+        rc->layer_framerate[layer] =
+          framerate / cfg->ts_rate_decimator[tl];
+      if (tl > 0) {
+        rc->layer_pfb[layer] = 1000.0 *
+            (cfg->layer_target_bitrate[layer] -
+                cfg->layer_target_bitrate[layer - 1]) /
+            (rc->layer_framerate[layer] -
+                rc->layer_framerate[layer - 1]);
+      } else {
+        rc->layer_pfb[tlayer0] = 1000.0 *
+            cfg->layer_target_bitrate[tlayer0] /
+            rc->layer_framerate[tlayer0];
+      }
+      rc->layer_input_frames[layer] = 0;
+      rc->layer_enc_frames[layer] = 0;
+      rc->layer_tot_enc_frames[layer] = 0;
+      rc->layer_encoding_bitrate[layer] = 0.0;
+      rc->layer_avg_frame_size[layer] = 0.0;
+      rc->layer_avg_rate_mismatch[layer] = 0.0;
+    }
+  }
+  rc->window_count = 0;
+  rc->window_size = 15;
+  rc->avg_st_encoding_bitrate = 0.0;
+  rc->variance_st_encoding_bitrate = 0.0;
+}
+// Prints per-layer and short-time rate control stats; overwrites the accumulators in *rc with averages and dies on a frame-count mismatch.
+static void printout_rate_control_summary(struct RateControlStats *rc,
+                                          vpx_codec_enc_cfg_t *cfg,
+                                          int frame_cnt) {
+  unsigned int sl, tl;
+  int tot_num_frames = 0;
+  double perc_fluctuation = 0.0;
+  printf("Total number of processed frames: %d\n\n", frame_cnt - 1);
+  printf("Rate control layer stats for sl%d tl%d layer(s):\n\n",
+      cfg->ss_number_layers, cfg->ts_number_layers);
+  for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
+    for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
+      const int layer = sl * cfg->ts_number_layers + tl;
+      const int num_dropped = (tl > 0) ?  // the extra -1 for tl == 0 presumably discounts the single key frame - confirm
+          (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer]) :
+          (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer] - 1);
+      if (!sl)  // input frames are totalled over the temporal layers of spatial layer 0 only
+        tot_num_frames += rc->layer_input_frames[layer];
+      rc->layer_encoding_bitrate[layer] = 0.001 * rc->layer_framerate[layer] *
+          rc->layer_encoding_bitrate[layer] / tot_num_frames;
+      rc->layer_avg_frame_size[layer] = rc->layer_avg_frame_size[layer] /
+          rc->layer_enc_frames[layer];
+      rc->layer_avg_rate_mismatch[layer] =
+          100.0 * rc->layer_avg_rate_mismatch[layer] /
+          rc->layer_enc_frames[layer];
+      printf("For layer#: sl%d tl%d \n", sl, tl);
+      printf("Bitrate (target vs actual): %d %f.0 kbps\n",  // NOTE(review): "%f.0" prints a literal ".0" after the value; "%.0f" may have been meant
+             cfg->layer_target_bitrate[layer],
+             rc->layer_encoding_bitrate[layer]);
+      printf("Average frame size (target vs actual): %f %f bits\n",
+             rc->layer_pfb[layer], rc->layer_avg_frame_size[layer]);
+      printf("Average rate_mismatch: %f\n",
+             rc->layer_avg_rate_mismatch[layer]);
+      printf("Number of input frames, encoded (non-key) frames, "
+          "and percent dropped frames: %d %d %f.0 \n",  // NOTE(review): same "%f.0" oddity as above
+          rc->layer_input_frames[layer], rc->layer_enc_frames[layer],
+          100.0 * num_dropped / rc->layer_input_frames[layer]);
+      printf("\n");
+    }
+  }
+  rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count;
+  rc->variance_st_encoding_bitrate =
+      rc->variance_st_encoding_bitrate / rc->window_count -
+      (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate);
+  perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) /
+      rc->avg_st_encoding_bitrate;
+  printf("Short-time stats, for window of %d frames: \n", rc->window_size);
+  printf("Average, rms-variance, and percent-fluct: %f %f %f \n",
+         rc->avg_st_encoding_bitrate,
+         sqrt(rc->variance_st_encoding_bitrate),
+         perc_fluctuation);
+  if (frame_cnt != tot_num_frames)
+    die("Error: Number of input frames not equal to output encoded frames != "
+        "%d tot_num_frames = %d\n", frame_cnt, tot_num_frames);
+}
+// Reads the superframe index, if any, at the end of data: *count gets the frame count (0 when absent), sizes[] the frame sizes.
+vpx_codec_err_t parse_superframe_index(const uint8_t *data,
+                                       size_t data_sz,
+                                       uint32_t sizes[8], int *count) {
+  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
+  // it is a super frame index. If the last byte of real video compression
+  // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
+  // not the associated matching marker byte at the front of the index we have
+  // an invalid bitstream and need to return an error.
+
+  uint8_t marker;
+
+  *count = 0;
+  if (data == NULL || data_sz == 0)  // empty chunk: there is no last byte to test
+    return VPX_CODEC_OK;
+  marker = data[data_sz - 1];
+  if ((marker & 0xe0) == 0xc0) {
+    const uint32_t frames = (marker & 0x7) + 1;
+    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+    const size_t index_sz = 2 + mag * frames;
+
+    // This chunk is marked as having a superframe index but doesn't have
+    // enough data for it, thus it's an invalid superframe index.
+    if (data_sz < index_sz)
+      return VPX_CODEC_CORRUPT_FRAME;
+
+    {
+      const uint8_t marker2 = *(data + data_sz - index_sz);
+
+      // This chunk is marked as having a superframe index but doesn't have
+      // the matching marker byte at the front of the index therefore it's an
+      // invalid chunk.
+      if (marker != marker2)
+        return VPX_CODEC_CORRUPT_FRAME;
+    }
+
+    {
+      // Found a valid superframe index.
+      uint32_t i, j;
+      const uint8_t *x = &data[data_sz - index_sz + 1];
+
+      for (i = 0; i < frames; ++i) {
+        uint32_t this_sz = 0;
+        // Each size is mag bytes, least significant first; widen before shifting so byte 3 cannot overflow int.
+        for (j = 0; j < mag; ++j)
+          this_sz |= (uint32_t)(*x++) << (j * 8);
+        sizes[i] = this_sz;
+      }
+      *count = frames;
+    }
+  }
+  return VPX_CODEC_OK;
+}
+#endif
+
+// Example pattern for spatial layers and 2 temporal layers used in the
+// bypass/flexible mode. The pattern corresponds to the pattern
+// VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
+// non-flexible mode.
+// For temporal layer |tl| (0 or 1) this fills the per-spatial-layer encode
+// flags and LAST/GOLDEN/ALTREF buffer indices of |ref_frame_config|. The
+// incoming value of |sl| is not used; it only serves as the loop counter.
+// Any other |tl| leaves |ref_frame_config| untouched.
+void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
+                                 int is_key_frame,
+                                 vpx_svc_ref_frame_config_t *ref_frame_config) {
+  for (sl = 0; sl < num_spatial_layers; ++sl) {
+    int flags;
+
+    switch (tl) {
+      case 0:
+        // Flags common to every spatial layer of temporal layer 0.
+        flags = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF |
+                VP8_EFLAG_NO_UPD_ARF;
+        if (sl == 0) {
+          // Lowest spatial layer: additionally no GOLDEN reference.
+          flags |= VP8_EFLAG_NO_REF_GF;
+        } else if (is_key_frame) {
+          // Upper spatial layers of a key frame: additionally no LAST ref.
+          flags |= VP8_EFLAG_NO_REF_LAST;
+        }
+        ref_frame_config->frame_flags[sl] = flags;
+
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        ref_frame_config->gld_fb_idx[sl] = (sl > 0) ? sl - 1 : 0;
+        ref_frame_config->alt_fb_idx[sl] = 0;
+        break;
+
+      case 1:
+        // Flags common to every spatial layer of temporal layer 1.
+        flags = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
+                VP8_EFLAG_NO_UPD_GF;
+        if (sl == 0)
+          flags |= VP8_EFLAG_NO_REF_GF;
+        ref_frame_config->frame_flags[sl] = flags;
+
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1;
+        ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
+        break;
+
+      default:
+        // Only two temporal layers are described by this example pattern.
+        break;
+    }
+  }
+}
+
+// Encodes the input with the VP9 spatial SVC wrapper, writes the IVF output
+// and, when rate control stats are requested, per-temporal-layer streams and
+// a rate control summary.
+int main(int argc, const char **argv) {
+  AppInput app_input = {0};
+  VpxVideoWriter *writer = NULL;
+  VpxVideoInfo info = {0};
+  vpx_codec_ctx_t codec;
+  vpx_codec_enc_cfg_t enc_cfg;
+  SvcContext svc_ctx;
+  uint32_t i;
+  uint32_t frame_cnt = 0;
+  vpx_image_t raw;
+  vpx_codec_err_t res;
+  int pts = 0;            /* PTS starts at 0 */
+  int frame_duration = 1; /* 1 timebase tick per frame */
+  FILE *infile = NULL;
+  int end_of_stream = 0;
+  int frames_received = 0;
+  vpx_svc_layer_id_t layer_id;
+  vpx_svc_ref_frame_config_t ref_frame_config;
+  int sl = 0;  /* initialized: passed by value to the bypass-mode helper */
+#if OUTPUT_RC_STATS
+  VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL};
+  struct RateControlStats rc;
+  int tl;
+  double sum_bitrate = 0.0;
+  double sum_bitrate2 = 0.0;
+  double framerate  = 30.0;
+#endif
+  struct vpx_usec_timer timer;
+  int64_t cx_time = 0;
+  memset(&svc_ctx, 0, sizeof(svc_ctx));
+  svc_ctx.log_print = 1;
+  exec_name = argv[0];
+  parse_command_line(argc, argv, &app_input, &svc_ctx, &enc_cfg);
+
+  // Allocate image buffer
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (!vpx_img_alloc(&raw, enc_cfg.g_input_bit_depth == 8 ?
+                         VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016,
+                     enc_cfg.g_w, enc_cfg.g_h, 32)) {
+    die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
+  }
+#else
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32)) {
+    die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  if (!(infile = fopen(app_input.input_filename, "rb")))
+    die("Failed to open %s for reading\n", app_input.input_filename);
+
+  // Initialize codec
+  if (vpx_svc_init(&svc_ctx, &codec, vpx_codec_vp9_cx(), &enc_cfg) !=
+      VPX_CODEC_OK)
+    die("Failed to initialize encoder\n");
+
+#if OUTPUT_RC_STATS
+  if (svc_ctx.output_rc_stat) {
+    set_rate_control_stats(&rc, &enc_cfg);
+    // Floating-point division keeps fractional rates such as 30000/1001.
+    framerate = (double)enc_cfg.g_timebase.den / enc_cfg.g_timebase.num;
+  }
+#endif
+
+  info.codec_fourcc = VP9_FOURCC;
+  info.time_base.numerator = enc_cfg.g_timebase.num;
+  info.time_base.denominator = enc_cfg.g_timebase.den;
+
+  if (!(app_input.passes == 2 && app_input.pass == 1)) {
+    // We don't save the bitstream for the 1st pass on two pass rate control
+    writer = vpx_video_writer_open(app_input.output_filename, kContainerIVF,
+                                   &info);
+    if (!writer)
+      die("Failed to open %s for writing\n", app_input.output_filename);
+  }
+#if OUTPUT_RC_STATS
+  // For now, just write temporal layer streams.
+  // TODO(wonkap): do spatial by re-writing superframe.
+  if (svc_ctx.output_rc_stat) {
+    for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) {
+      char file_name[PATH_MAX];
+
+      snprintf(file_name, sizeof(file_name), "%s_t%d.ivf",
+               app_input.output_filename, tl);
+      outfile[tl] = vpx_video_writer_open(file_name, kContainerIVF, &info);
+      if (!outfile[tl])
+        die("Failed to open %s for writing", file_name);
+    }
+  }
+#endif
+
+  // skip initial frames
+  for (i = 0; i < app_input.frames_to_skip; ++i)
+    vpx_img_read(&raw, infile);
+
+  if (svc_ctx.speed != -1)
+    vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed);
+  if (svc_ctx.threads)
+    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
+  if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1)
+    vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
+
+  // Encode frames
+  while (!end_of_stream) {
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *cx_pkt;
+    if (frame_cnt >= app_input.frames_to_code || !vpx_img_read(&raw, infile)) {
+      // We need one extra vpx_svc_encode call at end of stream to flush
+      // encoder and get remaining data
+      end_of_stream = 1;
+    }
+
+    // For BYPASS/FLEXIBLE mode, set the frame flags (reference and updates)
+    // and the buffer indices for each spatial layer of the current
+    // (super)frame to be encoded. The temporal layer_id for the current frame
+    // also needs to be set.
+    // TODO(marpan): Should rename the "VP9E_TEMPORAL_LAYERING_MODE_BYPASS"
+    // mode to "VP9E_LAYERING_MODE_BYPASS".
+    if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      layer_id.spatial_layer_id = 0;
+      // Example for 2 temporal layers.
+      if (frame_cnt % 2 == 0)
+        layer_id.temporal_layer_id = 0;
+      else
+        layer_id.temporal_layer_id = 1;
+      // Note that we only set the temporal layer_id, since we are calling
+      // the encode for the whole superframe. The encoder will internally loop
+      // over all the spatial layers for the current superframe.
+      vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
+      set_frame_flags_bypass_mode(sl, layer_id.temporal_layer_id,
+                                  svc_ctx.spatial_layers,
+                                  frame_cnt == 0,
+                                  &ref_frame_config);
+      vpx_codec_control(&codec, VP9E_SET_SVC_REF_FRAME_CONFIG,
+                        &ref_frame_config);
+#if OUTPUT_RC_STATS
+      // Keep track of input frames, to account for frame drops in rate control
+      // stats/metrics. |rc| is only initialized when stats were requested.
+      if (svc_ctx.output_rc_stat) {
+        for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+          ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
+                                  layer_id.temporal_layer_id];
+        }
+      }
+#endif
+    }
+
+    vpx_usec_timer_start(&timer);
+    res = vpx_svc_encode(&svc_ctx, &codec, (end_of_stream ? NULL : &raw),
+                         pts, frame_duration, svc_ctx.speed >= 5 ?
+                         VPX_DL_REALTIME : VPX_DL_GOOD_QUALITY);
+    vpx_usec_timer_mark(&timer);
+    cx_time += vpx_usec_timer_elapsed(&timer);
+
+    printf("%s", vpx_svc_get_message(&svc_ctx));
+    fflush(stdout);
+    if (res != VPX_CODEC_OK)
+      die_codec(&codec, "Failed to encode frame");
+
+    while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
+      switch (cx_pkt->kind) {
+        case VPX_CODEC_CX_FRAME_PKT: {
+          SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal;
+          if (cx_pkt->data.frame.sz > 0) {
+#if OUTPUT_RC_STATS
+            uint32_t sizes[8] = {0};
+            int count = 0;
+#endif
+            vpx_video_writer_write_frame(writer,
+                                         cx_pkt->data.frame.buf,
+                                         cx_pkt->data.frame.sz,
+                                         cx_pkt->data.frame.pts);
+#if OUTPUT_RC_STATS
+            // TODO(marpan/wonkap): Put this stats update in a separate function.
+            if (svc_ctx.output_rc_stat) {
+              vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id);
+              parse_superframe_index(cx_pkt->data.frame.buf,
+                                     cx_pkt->data.frame.sz, sizes, &count);
+              // Without an index the packet holds one frame: use its size.
+              if (count == 0) sizes[0] = (uint32_t)cx_pkt->data.frame.sz;
+              // Note computing input_layer_frames here won't account for frame
+              // drops in rate control stats.
+              // TODO(marpan): Fix this for non-bypass mode so we can get stats
+              // for dropped frames.
+              if (svc_ctx.temporal_layering_mode !=
+                  VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+                for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                  ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
+                                         layer_id.temporal_layer_id];
+                }
+              }
+              for (tl = layer_id.temporal_layer_id;
+                  tl < enc_cfg.ts_number_layers; ++tl) {
+                vpx_video_writer_write_frame(outfile[tl],
+                                             cx_pkt->data.frame.buf,
+                                             cx_pkt->data.frame.sz,
+                                             cx_pkt->data.frame.pts);
+              }
+              for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                for (tl = layer_id.temporal_layer_id;
+                    tl < enc_cfg.ts_number_layers; ++tl) {
+                  const int layer = sl * enc_cfg.ts_number_layers + tl;
+                  ++rc.layer_tot_enc_frames[layer];
+                  rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl];
+                  // Keep count of rate control stats per layer, for non-key
+                  // frames.
+                  if (tl == layer_id.temporal_layer_id &&
+                      !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
+                    rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl];
+                    rc.layer_avg_rate_mismatch[layer] +=
+                        fabs(8.0 * sizes[sl] - rc.layer_pfb[layer]) /
+                        rc.layer_pfb[layer];
+                    ++rc.layer_enc_frames[layer];
+                  }
+                }
+              }
+
+              // Update for short-time encoding bitrate states, for moving
+              // window of size rc->window, shifted by rc->window / 2.
+              // Ignore first window segment, due to key frame.
+              if (frame_cnt > rc.window_size) {
+                for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                  sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate;
+                }
+                if (frame_cnt % rc.window_size == 0) {
+                  rc.window_count += 1;
+                  rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size;
+                  rc.variance_st_encoding_bitrate +=
+                      (sum_bitrate / rc.window_size) *
+                      (sum_bitrate / rc.window_size);
+                  sum_bitrate = 0.0;
+                }
+              }
+
+              // Second shifted window.
+              if (frame_cnt > rc.window_size + rc.window_size / 2) {
+                for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                  sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate;
+                }
+                if (frame_cnt > 2 * rc.window_size &&
+                    frame_cnt % rc.window_size == 0) {
+                  rc.window_count += 1;
+                  rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
+                  rc.variance_st_encoding_bitrate +=
+                      (sum_bitrate2 / rc.window_size) *
+                      (sum_bitrate2 / rc.window_size);
+                  sum_bitrate2 = 0.0;
+                }
+              }
+            }
+#endif
+          }
+
+          printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
+                 !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
+                 (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
+          if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1)
+            si->bytes_sum[0] += (int)cx_pkt->data.frame.sz;
+          ++frames_received;
+          break;
+        }
+        case VPX_CODEC_STATS_PKT: {
+          stats_write(&app_input.rc_stats,
+                      cx_pkt->data.twopass_stats.buf,
+                      cx_pkt->data.twopass_stats.sz);
+          break;
+        }
+        default: break;
+      }
+    }
+
+    if (!end_of_stream) {
+      ++frame_cnt;
+      pts += frame_duration;
+    }
+  }
+
+  printf("Processed %d frames\n", frame_cnt);
+  fclose(infile);
+#if OUTPUT_RC_STATS
+  if (svc_ctx.output_rc_stat) {
+    // Compensate for the extra frame count for the bypass mode.
+    if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+        const int layer = sl * enc_cfg.ts_number_layers +
+            layer_id.temporal_layer_id;
+        --rc.layer_input_frames[layer];
+      }
+    }
+    printout_rate_control_summary(&rc, &enc_cfg, frame_cnt);
+    printf("\n");
+  }
+#endif
+  if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+  if (app_input.passes == 2) stats_close(&app_input.rc_stats, 1);
+  if (writer) vpx_video_writer_close(writer);
+#if OUTPUT_RC_STATS
+  if (svc_ctx.output_rc_stat) {
+    for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) {
+      vpx_video_writer_close(outfile[tl]);
+    }
+  }
+#endif
+  // Scale as double: frame_cnt * 1000000 wraps in 32 bits past 4294 frames.
+  printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n",
+         frame_cnt,
+         1000 * (float)cx_time / ((double)frame_cnt * 1000000),
+         1000000 * (double)frame_cnt / (double)cx_time);
+  vpx_img_free(&raw);
+  // display average size, psnr
+  printf("%s", vpx_svc_dump_statistics(&svc_ctx));
+  vpx_svc_release(&svc_ctx);
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libvpx/examples/vpx_temporal_svc_encoder.c b/libs/libvpx/examples/vpx_temporal_svc_encoder.c
new file mode 100644
index 0000000000..16abb9deb0
--- /dev/null
+++ b/libs/libvpx/examples/vpx_temporal_svc_encoder.c
@@ -0,0 +1,852 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+//  This is an example demonstrating how to implement a multi-layer VPx
+//  encoding scheme based on temporal scalability for video applications
+//  that benefit from a scalable bitstream.
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "../vpx_ports/vpx_timer.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+
+#include "../tools_common.h"
+#include "../video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  exit(EXIT_FAILURE);  // Terminates with a failure status; prints nothing.
+}
+
+// Denoiser states, for temporal denoising.
+enum denoiserState {
+  kDenoiserOff,
+  kDenoiserOnYOnly,
+  kDenoiserOnYUV,
+  kDenoiserOnYUVAggressive,
+  kDenoiserOnAdaptive
+};
+// Temporal layers per layering mode (cf. set_temporal_layer_pattern()).
+static int mode_to_num_layers[13] = {1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3};
+
+// For rate control encoding stats.
+struct RateControlMetrics {
+  // Number of input frames per layer.
+  int layer_input_frames[VPX_TS_MAX_LAYERS];
+  // Total (cumulative) number of encoded frames per layer.
+  int layer_tot_enc_frames[VPX_TS_MAX_LAYERS];
+  // Number of encoded non-key frames per layer.
+  int layer_enc_frames[VPX_TS_MAX_LAYERS];
+  // Framerate per layer (cumulative).
+  double layer_framerate[VPX_TS_MAX_LAYERS];
+  // Target average frame size per layer (per-frame-bandwidth per layer).
+  double layer_pfb[VPX_TS_MAX_LAYERS];
+  // Actual average frame size per layer.
+  double layer_avg_frame_size[VPX_TS_MAX_LAYERS];
+  // Average rate mismatch per layer (|target - actual| / target).
+  double layer_avg_rate_mismatch[VPX_TS_MAX_LAYERS];
+  // Actual encoding bitrate per layer (cumulative).
+  double layer_encoding_bitrate[VPX_TS_MAX_LAYERS];
+  // Average of the short-time encoder actual bitrate.
+  // TODO(marpan): Should we add these short-time stats for each layer?
+  double avg_st_encoding_bitrate;
+  // Variance of the short-time encoder actual bitrate.
+  double variance_st_encoding_bitrate;
+  // Window (number of frames) for computing short-time encoding bitrate.
+  int window_size;
+  // Number of window measurements.
+  int window_count;
+  int layer_target_bitrate[VPX_MAX_LAYERS];  // Target bitrate per layer.
+};
+
+// Resets the counters in |rc| and derives per-layer framerate and target
+// frame size from |cfg|; rc->layer_target_bitrate must already be set.
+// Note: these rate control metrics assume only 1 key frame in the sequence
+// (i.e., first frame only). So for temporal pattern# 7 (which has key frame
+// for every frame on base layer), the metrics computation will be off/wrong.
+// TODO(marpan): Update these metrics to account for multiple key frames.
+static void set_rate_control_metrics(struct RateControlMetrics *rc,
+                                     vpx_codec_enc_cfg_t *cfg) {
+  unsigned int i = 0;
+  // Cumulative layer framerate and non-cumulative target per-frame-bandwidth
+  // for the stats below. Divide as double so e.g. 30000/1001 is not truncated.
+  const double framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+  rc->layer_framerate[0] = framerate / cfg->ts_rate_decimator[0];
+  rc->layer_pfb[0] = 1000.0 * rc->layer_target_bitrate[0] /
+      rc->layer_framerate[0];
+  for (i = 0; i < cfg->ts_number_layers; ++i) {
+    if (i > 0) {
+      rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i];
+      rc->layer_pfb[i] = 1000.0 *
+          (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) /
+          (rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
+    }
+    rc->layer_input_frames[i] = 0;
+    rc->layer_enc_frames[i] = 0;
+    rc->layer_tot_enc_frames[i] = 0;
+    rc->layer_encoding_bitrate[i] = 0.0;
+    rc->layer_avg_frame_size[i] = 0.0;
+    rc->layer_avg_rate_mismatch[i] = 0.0;
+  }
+  rc->window_count = 0;
+  rc->window_size = 15;
+  rc->avg_st_encoding_bitrate = 0.0;
+  rc->variance_st_encoding_bitrate = 0.0;
+}
+
+static void printout_rate_control_summary(struct RateControlMetrics *rc,
+                                          vpx_codec_enc_cfg_t *cfg,
+                                          int frame_cnt) {
+  // Prints the rate control summary; sums in |rc| are averaged in place.
+  const int expected_frames = frame_cnt - 1;
+  int tot_num_frames = 0;
+  double rms;
+  unsigned int layer;
+  printf("Total number of processed frames: %d\n\n", expected_frames);
+  printf("Rate control layer stats for %d layer(s):\n\n",
+      cfg->ts_number_layers);
+  for (layer = 0; layer < cfg->ts_number_layers; ++layer) {
+    const int in_frames = rc->layer_input_frames[layer];
+    const int enc_frames = rc->layer_enc_frames[layer];
+    // Layer 0 subtracts one more, presumably for the single key frame.
+    const int num_dropped = in_frames - enc_frames - (layer == 0 ? 1 : 0);
+    tot_num_frames += in_frames;
+    rc->layer_encoding_bitrate[layer] = 0.001 * rc->layer_framerate[layer] *
+        rc->layer_encoding_bitrate[layer] / tot_num_frames;
+    rc->layer_avg_frame_size[layer] /= enc_frames;
+    rc->layer_avg_rate_mismatch[layer] =
+        100.0 * rc->layer_avg_rate_mismatch[layer] / enc_frames;
+    printf("For layer#: %d \n", layer);
+    printf("Bitrate (target vs actual): %d %f \n",
+           rc->layer_target_bitrate[layer], rc->layer_encoding_bitrate[layer]);
+    printf("Average frame size (target vs actual): %f %f \n",
+           rc->layer_pfb[layer], rc->layer_avg_frame_size[layer]);
+    printf("Average rate_mismatch: %f \n", rc->layer_avg_rate_mismatch[layer]);
+    printf("Number of input frames, encoded (non-key) frames, "
+        "and perc dropped frames: %d %d %f \n", in_frames, enc_frames,
+        100.0 * num_dropped / in_frames);
+    printf("\n");
+  }
+  // Turn the accumulated window sums into mean and variance (E[x^2] - E[x]^2).
+  rc->avg_st_encoding_bitrate /= rc->window_count;
+  rc->variance_st_encoding_bitrate =
+      rc->variance_st_encoding_bitrate / rc->window_count -
+      (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate);
+  rms = sqrt(rc->variance_st_encoding_bitrate);
+  printf("Short-time stats, for window of %d frames: \n",rc->window_size);
+  printf("Average, rms-variance, and percent-fluct: %f %f %f \n",
+         rc->avg_st_encoding_bitrate, rms,
+         100.0 * rms / rc->avg_st_encoding_bitrate);
+  if (expected_frames != tot_num_frames)
+    die("Error: Number of input frames not equal to output! \n");
+}
+
+// Temporal scaling parameters:
+// NOTE: The 3 prediction frames cannot be used interchangeably due to
+// differences in the way they are handled throughout the code. The
+// frames should be allocated to layers in the order LAST, GF, ARF.
+// Other combinations work, but may produce slightly inferior results.
+static void set_temporal_layer_pattern(int layering_mode,
+                                       vpx_codec_enc_cfg_t *cfg,
+                                       int *layer_flags,
+                                       int *flag_periodicity) {
+  switch (layering_mode) {
+    case 0: {
+      // 1-layer.
+      int ids[1] = {0};
+      cfg->ts_periodicity = 1;
+      *flag_periodicity = 1;
+      cfg->ts_number_layers = 1;
+      cfg->ts_rate_decimator[0] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+      // Update L only.
+      layer_flags[0] = VPX_EFLAG_FORCE_KF  | VP8_EFLAG_NO_UPD_GF |
+          VP8_EFLAG_NO_UPD_ARF;
+      break;
+    }
+    case 1: {
+      // 2-layers, 2-frame period.
+      int ids[2] = {0, 1};
+      cfg->ts_periodicity = 2;
+      *flag_periodicity = 2;
+      cfg->ts_number_layers = 2;
+      cfg->ts_rate_decimator[0] = 2;
+      cfg->ts_rate_decimator[1] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+#if 1
+      // 0=L, 1=GF, Intra-layer prediction enabled.
+      layer_flags[0] = VPX_EFLAG_FORCE_KF  | VP8_EFLAG_NO_UPD_GF |
+          VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      layer_flags[1] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+          VP8_EFLAG_NO_REF_ARF;
+#else
+       // 0=L, 1=GF, Intra-layer prediction disabled.
+      layer_flags[0] = VPX_EFLAG_FORCE_KF  | VP8_EFLAG_NO_UPD_GF |
+          VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      layer_flags[1] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+          VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_REF_LAST;
+#endif
+      break;
+    }
+    case 2: {
+      // 2-layers, 3-frame period.
+      int ids[3] = {0, 1, 1};
+      cfg->ts_periodicity = 3;
+      *flag_periodicity = 3;
+      cfg->ts_number_layers = 2;
+      cfg->ts_rate_decimator[0] = 3;
+      cfg->ts_rate_decimator[1] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+      // 0=L, 1=GF, Intra-layer prediction enabled.
+      layer_flags[0] = VPX_EFLAG_FORCE_KF  | VP8_EFLAG_NO_REF_GF |
+          VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[1] =
+      layer_flags[2] = VP8_EFLAG_NO_REF_GF  | VP8_EFLAG_NO_REF_ARF |
+          VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
+      break;
+    }
+    case 3: {
+      // 3-layers, 6-frame period.
+      int ids[6] = {0, 2, 2, 1, 2, 2};
+      cfg->ts_periodicity = 6;
+      *flag_periodicity = 6;
+      cfg->ts_number_layers = 3;
+      cfg->ts_rate_decimator[0] = 6;
+      cfg->ts_rate_decimator[1] = 3;
+      cfg->ts_rate_decimator[2] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+      // 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled.
+      layer_flags[0] = VPX_EFLAG_FORCE_KF  | VP8_EFLAG_NO_REF_GF |
+          VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[3] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_ARF |
+          VP8_EFLAG_NO_UPD_LAST;
+      layer_flags[1] =
+      layer_flags[2] =
+      layer_flags[4] =
+      layer_flags[5] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+      break;
+    }
+    case 4: {
+      // 3-layers, 4-frame period.
+      int ids[4] = {0, 2, 1, 2};
+      cfg->ts_periodicity = 4;
+      *flag_periodicity = 4;
+      cfg->ts_number_layers = 3;
+      cfg->ts_rate_decimator[0] = 4;
+      cfg->ts_rate_decimator[1] = 2;
+      cfg->ts_rate_decimator[2] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+      // 0=L, 1=GF, 2=ARF, Intra-layer prediction disabled.
+      layer_flags[0] = VPX_EFLAG_FORCE_KF  | VP8_EFLAG_NO_REF_GF |
+          VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[2] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
+          VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
+      layer_flags[1] =
+      layer_flags[3] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
+          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      break;
+    }
+    case 5: {
+      // 3-layers, 4-frame period.
+      int ids[4] = {0, 2, 1, 2};
+      cfg->ts_periodicity = 4;
+      *flag_periodicity = 4;
+      cfg->ts_number_layers     = 3;
+      cfg->ts_rate_decimator[0] = 4;
+      cfg->ts_rate_decimator[1] = 2;
+      cfg->ts_rate_decimator[2] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+      // 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled in layer 1, disabled
+      // in layer 2.
+      layer_flags[0] = VPX_EFLAG_FORCE_KF  | VP8_EFLAG_NO_REF_GF |
+          VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[2] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
+          VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[1] =
+      layer_flags[3] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
+          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      break;
+    }
+    case 6: {
+      // 3-layers, 4-frame period.
+      int ids[4] = {0, 2, 1, 2};
+      cfg->ts_periodicity = 4;
+      *flag_periodicity = 4;
+      cfg->ts_number_layers = 3;
+      cfg->ts_rate_decimator[0] = 4;
+      cfg->ts_rate_decimator[1] = 2;
+      cfg->ts_rate_decimator[2] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+      // 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled.
+      layer_flags[0] = VPX_EFLAG_FORCE_KF  | VP8_EFLAG_NO_REF_GF |
+          VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[2] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
+          VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[1] =
+      layer_flags[3] = VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
+      break;
+    }
+    case 7: {
+      // NOTE: Probably of academic interest only.
+      // 5-layers, 16-frame period.
+      int ids[16] = {0, 4, 3, 4, 2, 4, 3, 4, 1, 4, 3, 4, 2, 4, 3, 4};
+      cfg->ts_periodicity = 16;
+      *flag_periodicity = 16;
+      cfg->ts_number_layers = 5;
+      cfg->ts_rate_decimator[0] = 16;
+      cfg->ts_rate_decimator[1] = 8;
+      cfg->ts_rate_decimator[2] = 4;
+      cfg->ts_rate_decimator[3] = 2;
+      cfg->ts_rate_decimator[4] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+      layer_flags[0]  = VPX_EFLAG_FORCE_KF;
+      layer_flags[1]  =
+      layer_flags[3]  =
+      layer_flags[5]  =
+      layer_flags[7]  =
+      layer_flags[9]  =
+      layer_flags[11] =
+      layer_flags[13] =
+      layer_flags[15] = VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF |
+          VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[2]  =
+      layer_flags[6]  =
+      layer_flags[10] =
+      layer_flags[14] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_GF;
+      layer_flags[4] =
+      layer_flags[12] = VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[8]  = VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF;
+      break;
+    }
+    case 8: {
+      // 2-layers, with sync point at first frame of layer 1.
+      int ids[2] = {0, 1};
+      cfg->ts_periodicity = 2;
+      *flag_periodicity = 8;
+      cfg->ts_number_layers = 2;
+      cfg->ts_rate_decimator[0] = 2;
+      cfg->ts_rate_decimator[1] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+      // 0=L, 1=GF.
+      // ARF is used as predictor for all frames, and is only updated on
+      // key frame. Sync point every 8 frames.
+
+      // Layer 0: predict from L and ARF, update L and G.
+      layer_flags[0] = VPX_EFLAG_FORCE_KF  | VP8_EFLAG_NO_REF_GF |
+          VP8_EFLAG_NO_UPD_ARF;
+      // Layer 1: sync point: predict from L and ARF, and update G.
+      layer_flags[1] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_LAST |
+          VP8_EFLAG_NO_UPD_ARF;
+      // Layer 0, predict from L and ARF, update L.
+      layer_flags[2] = VP8_EFLAG_NO_REF_GF  | VP8_EFLAG_NO_UPD_GF |
+          VP8_EFLAG_NO_UPD_ARF;
+      // Layer 1: predict from L, G and ARF, and update G.
+      layer_flags[3] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+          VP8_EFLAG_NO_UPD_ENTROPY;
+      // Layer 0.
+      layer_flags[4] = layer_flags[2];
+      // Layer 1.
+      layer_flags[5] = layer_flags[3];
+      // Layer 0.
+      layer_flags[6] = layer_flags[4];
+      // Layer 1.
+      layer_flags[7] = layer_flags[5];
+     break;
+    }
+    case 9: {
+      // 3-layers: Sync points for layer 1 and 2 every 8 frames.
+      int ids[4] = {0, 2, 1, 2};
+      cfg->ts_periodicity = 4;
+      *flag_periodicity = 8;
+      cfg->ts_number_layers = 3;
+      cfg->ts_rate_decimator[0] = 4;
+      cfg->ts_rate_decimator[1] = 2;
+      cfg->ts_rate_decimator[2] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+      // 0=L, 1=GF, 2=ARF.
+      layer_flags[0] = VPX_EFLAG_FORCE_KF  | VP8_EFLAG_NO_REF_GF |
+          VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[1] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
+          VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
+      layer_flags[2] = VP8_EFLAG_NO_REF_GF   | VP8_EFLAG_NO_REF_ARF |
+          VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[3] =
+      layer_flags[5] = VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
+      layer_flags[4] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
+          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[6] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
+          VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[7] = VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF |
+          VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_ENTROPY;
+      break;
+    }
+    case 10: {
+      // 3-layers structure where ARF is used as predictor for all frames,
+      // and is only updated on key frame.
+      // Sync points for layer 1 and 2 every 8 frames.
+
+      int ids[4] = {0, 2, 1, 2};
+      cfg->ts_periodicity = 4;
+      *flag_periodicity = 8;
+      cfg->ts_number_layers = 3;
+      cfg->ts_rate_decimator[0] = 4;
+      cfg->ts_rate_decimator[1] = 2;
+      cfg->ts_rate_decimator[2] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+      // 0=L, 1=GF, 2=ARF.
+      // Layer 0: predict from L and ARF; update L and G.
+      layer_flags[0] = VPX_EFLAG_FORCE_KF | VP8_EFLAG_NO_UPD_ARF |
+          VP8_EFLAG_NO_REF_GF;
+      // Layer 2: sync point: predict from L and ARF; update none.
+      layer_flags[1] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF |
+          VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+          VP8_EFLAG_NO_UPD_ENTROPY;
+      // Layer 1: sync point: predict from L and ARF; update G.
+      layer_flags[2] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_ARF |
+          VP8_EFLAG_NO_UPD_LAST;
+      // Layer 2: predict from L, G, ARF; update none.
+      layer_flags[3] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+          VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ENTROPY;
+      // Layer 0: predict from L and ARF; update L.
+      layer_flags[4] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+          VP8_EFLAG_NO_REF_GF;
+      // Layer 2: predict from L, G, ARF; update none.
+      layer_flags[5] = layer_flags[3];
+      // Layer 1: predict from L, G, ARF; update G.
+      layer_flags[6] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
+      // Layer 2: predict from L, G, ARF; update none.
+      layer_flags[7] = layer_flags[3];
+      break;
+    }
+    case 11: {
+      // 3-layers structure with one reference frame.
+      // This works same as temporal_layering_mode 3.
+      // This was added to compare with vp9_spatial_svc_encoder.
+
+      // 3-layers, 4-frame period.
+      int ids[4] = {0, 2, 1, 2};
+      cfg->ts_periodicity = 4;
+      *flag_periodicity = 4;
+      cfg->ts_number_layers = 3;
+      cfg->ts_rate_decimator[0] = 4;
+      cfg->ts_rate_decimator[1] = 2;
+      cfg->ts_rate_decimator[2] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+      // 0=L, 1=GF, 2=ARF, Intra-layer prediction disabled.
+      layer_flags[0] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
+          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[2] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
+          VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
+      layer_flags[1] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
+          VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
+      layer_flags[3] = VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF |
+          VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
+      break;
+    }
+    case 12:
+    default: {
+      // 3-layers structure as in case 10, but no sync/refresh points for
+      // layer 1 and 2.
+      int ids[4] = {0, 2, 1, 2};
+      cfg->ts_periodicity = 4;
+      *flag_periodicity = 8;
+      cfg->ts_number_layers = 3;
+      cfg->ts_rate_decimator[0] = 4;
+      cfg->ts_rate_decimator[1] = 2;
+      cfg->ts_rate_decimator[2] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+      // 0=L, 1=GF, 2=ARF.
+      // Layer 0: predict from L and ARF; update L.
+      layer_flags[0] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+          VP8_EFLAG_NO_REF_GF;
+      layer_flags[4] = layer_flags[0];
+      // Layer 1: predict from L, G, ARF; update G.
+      layer_flags[2] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
+      layer_flags[6] = layer_flags[2];
+      // Layer 2: predict from L, G, ARF; update none.
+      layer_flags[1] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+          VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ENTROPY;
+      layer_flags[3] = layer_flags[1];
+      layer_flags[5] = layer_flags[1];
+      layer_flags[7] = layer_flags[1];
+      break;
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL};
+  vpx_codec_ctx_t codec;
+  vpx_codec_enc_cfg_t cfg;
+  int frame_cnt = 0;
+  vpx_image_t raw;
+  vpx_codec_err_t res;
+  unsigned int width;
+  unsigned int height;
+  int speed;
+  int frame_avail;
+  int got_data = 0;
+  int flags = 0;
+  unsigned int i;
+  int pts = 0;  // PTS starts at 0.
+  int frame_duration = 1;  // 1 timebase tick per frame.
+  int layering_mode = 0;
+  int layer_flags[VPX_TS_MAX_PERIODICITY] = {0};
+  int flag_periodicity = 1;
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
+  vpx_svc_layer_id_t layer_id = {0, 0};
+#else
+  vpx_svc_layer_id_t layer_id = {0};
+#endif
+  const VpxInterface *encoder = NULL;
+  FILE *infile = NULL;
+  struct RateControlMetrics rc;
+  int64_t cx_time = 0;
+  const int min_args_base = 11;
+#if CONFIG_VP9_HIGHBITDEPTH
+  vpx_bit_depth_t bit_depth = VPX_BITS_8;
+  int input_bit_depth = 8;
+  const int min_args = min_args_base + 1;
+#else
+  const int min_args = min_args_base;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  double sum_bitrate = 0.0;
+  double sum_bitrate2 = 0.0;
+  double framerate  = 30.0;
+  // Encode infile into one IVF per temporal layer (<outfile>_<layer>.ivf).
+  exec_name = argv[0];
+  // Check usage and arguments.
+  if (argc < min_args) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    die("Usage: %s <infile> <outfile> <codec_type(vp8/vp9)> <width> <height> "
+        "<rate_num> <rate_den> <speed> <frame_drop_threshold> <mode> "
+        "<Rate_0> ... <Rate_nlayers-1> <bit-depth> \n", argv[0]);
+#else
+    die("Usage: %s <infile> <outfile> <codec_type(vp8/vp9)> <width> <height> "
+        "<rate_num> <rate_den> <speed> <frame_drop_threshold> <mode> "
+        "<Rate_0> ... <Rate_nlayers-1> \n", argv[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+
+  encoder = get_vpx_encoder_by_name(argv[3]);
+  if (!encoder)
+    die("Unsupported codec.");
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+  width = strtol(argv[4], NULL, 0);
+  height = strtol(argv[5], NULL, 0);
+  if (width < 16 || width % 2 || height < 16 || height % 2) {
+    die("Invalid resolution: %d x %d", width, height);
+  }
+
+  layering_mode = strtol(argv[10], NULL, 0);
+  if (layering_mode < 0 || layering_mode > 12) {
+    die("Invalid layering mode (0..12) %s", argv[10]);
+  }
+
+  if (argc != min_args + mode_to_num_layers[layering_mode]) {
+    die("Invalid number of arguments");
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (strtol(argv[argc-1], NULL, 0)) {
+    case 8:
+      bit_depth = VPX_BITS_8;
+      input_bit_depth = 8;
+      break;
+    case 10:
+      bit_depth = VPX_BITS_10;
+      input_bit_depth = 10;
+      break;
+    case 12:
+      bit_depth = VPX_BITS_12;
+      input_bit_depth = 12;
+      break;
+    default:
+      die("Invalid bit depth (8, 10, 12) %s", argv[argc-1]);
+  }
+  if (!vpx_img_alloc(&raw,
+                     bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 :
+                                               VPX_IMG_FMT_I42016,
+                     width, height, 32)) {
+    die("Failed to allocate image (%u x %u)", width, height);
+  }
+#else
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) {
+    die("Failed to allocate image (%u x %u)", width, height);
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Populate encoder configuration.
+  res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res) {
+    printf("Failed to get config: %s\n", vpx_codec_err_to_string(res));
+    return EXIT_FAILURE;
+  }
+
+  // Update the default configuration with our settings.
+  cfg.g_w = width;
+  cfg.g_h = height;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (bit_depth != VPX_BITS_8) {
+    cfg.g_bit_depth = bit_depth;
+    cfg.g_input_bit_depth = input_bit_depth;
+    cfg.g_profile = 2;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Timebase format e.g. 30fps: numerator=1, denominator = 30.
+  cfg.g_timebase.num = strtol(argv[6], NULL, 0);
+  cfg.g_timebase.den = strtol(argv[7], NULL, 0);
+
+  speed = strtol(argv[8], NULL, 0);
+  if (speed < 0) {
+    die("Invalid speed setting: must be positive");
+  }
+
+  // Per-layer target bitrates follow the fixed command-line arguments.
+  for (i = 0; (int)i < mode_to_num_layers[layering_mode]; ++i) {
+    rc.layer_target_bitrate[i] = strtol(argv[min_args_base + i], NULL, 0);
+    // VP8 and VP9 read the layer targets from different config arrays.
+    if (strncmp(encoder->name, "vp8", 3) == 0)
+      cfg.ts_target_bitrate[i] = rc.layer_target_bitrate[i];
+    else if (strncmp(encoder->name, "vp9", 3) == 0)
+      cfg.layer_target_bitrate[i] = rc.layer_target_bitrate[i];
+  }
+
+  // Real time parameters.
+  cfg.rc_dropframe_thresh = strtol(argv[9], NULL, 0);
+  cfg.rc_end_usage = VPX_CBR;
+  cfg.rc_min_quantizer = 2;
+  cfg.rc_max_quantizer = 56;
+  if (strncmp(encoder->name, "vp9", 3) == 0)
+    cfg.rc_max_quantizer = 52;
+  cfg.rc_undershoot_pct = 50;
+  cfg.rc_overshoot_pct = 50;
+  cfg.rc_buf_initial_sz = 500;
+  cfg.rc_buf_optimal_sz = 600;
+  cfg.rc_buf_sz = 1000;
+
+  // Disable dynamic resizing by default.
+  cfg.rc_resize_allowed = 0;
+
+  // Use 1 thread as default.
+  cfg.g_threads = 1;
+
+  // Enable error resilient mode.
+  cfg.g_error_resilient = 1;
+  cfg.g_lag_in_frames   = 0;
+  cfg.kf_mode = VPX_KF_AUTO;
+
+  // Disable automatic keyframe placement.
+  cfg.kf_min_dist = cfg.kf_max_dist = 3000;
+
+  cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  set_temporal_layer_pattern(layering_mode,
+                             &cfg,
+                             layer_flags,
+                             &flag_periodicity);
+
+  set_rate_control_metrics(&rc, &cfg);
+
+  // Target bandwidth for the whole stream.
+  // Set to layer_target_bitrate for highest layer (total bitrate).
+  cfg.rc_target_bitrate = rc.layer_target_bitrate[cfg.ts_number_layers - 1];
+
+  // Open input file.
+  if (!(infile = fopen(argv[1], "rb"))) {
+    die("Failed to open %s for reading", argv[1]);
+  }
+
+  framerate = (double)cfg.g_timebase.den / cfg.g_timebase.num;
+  // Open an output file for each stream.
+  for (i = 0; i < cfg.ts_number_layers; ++i) {
+    char file_name[PATH_MAX];
+    VpxVideoInfo info;
+    info.codec_fourcc = encoder->fourcc;
+    info.frame_width = cfg.g_w;
+    info.frame_height = cfg.g_h;
+    info.time_base.numerator = cfg.g_timebase.num;
+    info.time_base.denominator = cfg.g_timebase.den;
+
+    snprintf(file_name, sizeof(file_name), "%s_%d.ivf", argv[2], i);
+    outfile[i] = vpx_video_writer_open(file_name, kContainerIVF, &info);
+    if (!outfile[i])
+      die("Failed to open %s for writing", file_name);
+
+    assert(outfile[i] != NULL);
+  }
+  // No spatial layers in this encoder.
+  cfg.ss_number_layers = 1;
+
+  // Initialize codec.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (vpx_codec_enc_init(
+          &codec, encoder->codec_interface(), &cfg,
+          bit_depth == VPX_BITS_8 ? 0 : VPX_CODEC_USE_HIGHBITDEPTH))
+#else
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    die_codec(&codec, "Failed to initialize encoder");
+
+  if (strncmp(encoder->name, "vp8", 3) == 0) {
+    vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
+    vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
+    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
+  } else if (strncmp(encoder->name, "vp9", 3) == 0) {
+    vpx_svc_extra_cfg_t svc_params = {0};  // Fields not set below stay zero.
+    vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
+    vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
+    vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
+    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);
+    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
+    vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
+    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
+    if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1: 0))
+      die_codec(&codec, "Failed to set SVC");
+    for (i = 0; i < cfg.ts_number_layers; ++i) {
+      svc_params.max_quantizers[i] = cfg.rc_max_quantizer;
+      svc_params.min_quantizers[i] = cfg.rc_min_quantizer;
+    }
+    svc_params.scaling_factor_num[0] = cfg.g_h;
+    svc_params.scaling_factor_den[0] = cfg.g_h;
+    vpx_codec_control(&codec, VP9E_SET_SVC_PARAMETERS, &svc_params);
+  }
+  if (strncmp(encoder->name, "vp8", 3) == 0) {
+    vpx_codec_control(&codec, VP8E_SET_SCREEN_CONTENT_MODE, 0);
+  }
+  vpx_codec_control(&codec, VP8E_SET_TOKEN_PARTITIONS, 1);
+  // This controls the maximum target size of the key frame.
+  // For generating smaller key frames, use a smaller max_intra_size_pct
+  // value, like 100 or 200.
+  {
+    const int max_intra_size_pct = 900;
+    vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+                      max_intra_size_pct);
+  }
+
+  frame_avail = 1;
+  while (frame_avail || got_data) {
+    struct vpx_usec_timer timer;
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *pkt;
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
+    // Update the temporal layer_id. No spatial layers in this test.
+    layer_id.spatial_layer_id = 0;
+#endif
+    layer_id.temporal_layer_id =
+        cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
+    if (strncmp(encoder->name, "vp9", 3) == 0) {
+      vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
+    } else if (strncmp(encoder->name, "vp8", 3) == 0) {
+      vpx_codec_control(&codec, VP8E_SET_TEMPORAL_LAYER_ID,
+                        layer_id.temporal_layer_id);
+    }
+    flags = layer_flags[frame_cnt % flag_periodicity];
+    if (layering_mode == 0)
+      flags = 0;
+    frame_avail = vpx_img_read(&raw, infile);
+    if (frame_avail)
+      ++rc.layer_input_frames[layer_id.temporal_layer_id];
+    vpx_usec_timer_start(&timer);
+    if (vpx_codec_encode(&codec, frame_avail? &raw : NULL, pts, 1, flags,
+        VPX_DL_REALTIME)) {
+      die_codec(&codec, "Failed to encode frame");
+    }
+    vpx_usec_timer_mark(&timer);
+    cx_time += vpx_usec_timer_elapsed(&timer);
+    // Reset KF flag.
+    if (layering_mode != 7) {
+      layer_flags[0] &= ~VPX_EFLAG_FORCE_KF;
+    }
+    got_data = 0;
+    while ( (pkt = vpx_codec_get_cx_data(&codec, &iter)) ) {
+      got_data = 1;
+      switch (pkt->kind) {
+        case VPX_CODEC_CX_FRAME_PKT:
+          for (i = cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
+              i < cfg.ts_number_layers; ++i) {
+            vpx_video_writer_write_frame(outfile[i], pkt->data.frame.buf,
+                                         pkt->data.frame.sz, pts);
+            ++rc.layer_tot_enc_frames[i];
+            rc.layer_encoding_bitrate[i] += 8.0 * pkt->data.frame.sz;
+            // Keep count of rate control stats per layer (for non-key frames).
+            if (i == cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity] &&
+                !(pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
+              rc.layer_avg_frame_size[i] += 8.0 * pkt->data.frame.sz;
+              rc.layer_avg_rate_mismatch[i] +=
+                  fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[i]) /
+                  rc.layer_pfb[i];
+              ++rc.layer_enc_frames[i];
+            }
+          }
+          // Update for short-time encoding bitrate states, for moving window
+          // of size rc->window, shifted by rc->window / 2.
+          // Ignore first window segment, due to key frame.
+          if (frame_cnt > rc.window_size) {
+            sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+            if (frame_cnt % rc.window_size == 0) {
+              rc.window_count += 1;
+              rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size;
+              rc.variance_st_encoding_bitrate +=
+                  (sum_bitrate / rc.window_size) *
+                  (sum_bitrate / rc.window_size);
+              sum_bitrate = 0.0;
+            }
+          }
+          // Second shifted window.
+          if (frame_cnt > rc.window_size + rc.window_size / 2) {
+            sum_bitrate2 += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+            if (frame_cnt > 2 * rc.window_size &&
+                frame_cnt % rc.window_size == 0) {
+              rc.window_count += 1;
+              rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
+              rc.variance_st_encoding_bitrate +=
+                  (sum_bitrate2 / rc.window_size) *
+                  (sum_bitrate2 / rc.window_size);
+              sum_bitrate2 = 0.0;
+            }
+          }
+          break;
+        default:
+          break;
+      }
+    }
+    ++frame_cnt;
+    pts += frame_duration;
+  }
+  fclose(infile);
+  printout_rate_control_summary(&rc, &cfg, frame_cnt);
+  printf("\n");
+  printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n",
+          frame_cnt,
+          1000 * (double)cx_time / ((double)frame_cnt * 1000000),
+          1000000 * (double)frame_cnt / (double)cx_time);
+
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec");
+
+  // Try to rewrite the output file headers with the actual frame count.
+  for (i = 0; i < cfg.ts_number_layers; ++i)
+    vpx_video_writer_close(outfile[i]);
+
+  vpx_img_free(&raw);
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libvpx/ivfdec.c b/libs/libvpx/ivfdec.c
new file mode 100644
index 0000000000..6dcd66f734
--- /dev/null
+++ b/libs/libvpx/ivfdec.c
@@ -0,0 +1,112 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx_ports/mem_ops.h"
+
+#include "./ivfdec.h"
+
+static const char *IVF_SIGNATURE = "DKIF";
+
+static void fix_framerate(int *num, int *den) {
+  // Some versions of vpxenc used 1/(2*fps) for the timebase, so
+  // we can guess the framerate using only the timebase in this
+  // case. Other files would require reading ahead to guess the
+  // timebase, like we do for webm.
+  if (*den > 0 && *den < 1000000000 && *num > 0 && *num < 1000) {
+    // Correct for the factor of 2 applied to the timebase in the encoder.
+    if (*num & 1)
+      *den *= 2;  // Cannot overflow: *den is below 1e9 here.
+    else
+      *num /= 2;
+  } else {
+    // Out-of-range or unknown values (they come straight from the file
+    // header), and no readahead code (yet?), so just default to 30fps.
+    *num = 30;
+    *den = 1;
+  }
+}
+
+int file_is_ivf(struct VpxInputContext *input_ctx) {
+  char raw_hdr[32];
+  int is_ivf = 0;
+  // Probe for a 32-byte header whose first 4 bytes match IVF_SIGNATURE.
+  if (fread(raw_hdr, 1, 32, input_ctx->file) == 32) {
+    if (memcmp(IVF_SIGNATURE, raw_hdr, 4) == 0) {
+      is_ivf = 1;
+      // A non-zero version field is reported but does not reject the file.
+      if (mem_get_le16(raw_hdr + 4) != 0) {
+        fprintf(stderr, "Error: Unrecognized IVF version! This file may not"
+                " decode properly.\n");
+      }
+      // Copy the stream parameters out of the fixed header offsets.
+      input_ctx->fourcc = mem_get_le32(raw_hdr + 8);
+      input_ctx->width = mem_get_le16(raw_hdr + 12);
+      input_ctx->height = mem_get_le16(raw_hdr + 14);
+      input_ctx->framerate.numerator = mem_get_le32(raw_hdr + 16);
+      input_ctx->framerate.denominator = mem_get_le32(raw_hdr + 20);
+      fix_framerate(&input_ctx->framerate.numerator,
+                    &input_ctx->framerate.denominator);
+    }
+  }
+  // Not IVF: rewind the file and reset detect.buf_read before returning 0.
+  if (!is_ivf) {
+    rewind(input_ctx->file);
+    input_ctx->detect.buf_read = 0;
+  } else {
+    input_ctx->detect.position = 4;
+  }
+  return is_ivf;
+}
+
+int ivf_read_frame(FILE *infile, uint8_t **buffer,
+                   size_t *bytes_read, size_t *buffer_size) {
+  // Reads the next IVF frame from infile into *buffer, reallocating the
+  // buffer when it is too small. Returns 0 with *bytes_read set on success
+  // and 1 at end of file or when the payload read comes up short.
+  char frame_hdr[IVF_FRAME_HDR_SZ] = {0};
+  size_t payload_sz = 0;
+
+  if (fread(frame_hdr, IVF_FRAME_HDR_SZ, 1, infile) == 1) {
+    payload_sz = mem_get_le32(frame_hdr);
+    // Sizes above 256 MiB are warned about and then read as empty frames.
+    if (payload_sz > 256 * 1024 * 1024) {
+      warn("Read invalid frame size (%u)\n", (unsigned int)payload_sz);
+      payload_sz = 0;
+    }
+
+    if (payload_sz > *buffer_size) {
+      // Request twice the frame size whenever the buffer has to grow.
+      const size_t grown_sz = 2 * payload_sz;
+      uint8_t *const grown = realloc(*buffer, grown_sz);
+      if (grown == NULL) {
+        warn("Failed to allocate compressed data buffer\n");
+        payload_sz = 0;
+      } else {
+        *buffer = grown;
+        *buffer_size = grown_sz;
+      }
+    }
+  } else if (!feof(infile)) {
+    warn("Failed to read frame size\n");
+  }
+
+  if (feof(infile))
+    return 1;
+  if (fread(*buffer, 1, payload_sz, infile) != payload_sz) {
+    warn("Failed to read full frame\n");
+    return 1;
+  }
+  *bytes_read = payload_sz;
+  return 0;
+}
diff --git a/libs/libvpx/ivfdec.h b/libs/libvpx/ivfdec.h
new file mode 100644
index 0000000000..dd29cc6174
--- /dev/null
+++ b/libs/libvpx/ivfdec.h
@@ -0,0 +1,28 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef IVFDEC_H_
+#define IVFDEC_H_
+
+#include "./tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int file_is_ivf(struct VpxInputContext *input);  // 1 if IVF header, else 0.
+// Reads the next frame into *buffer; returns 0 on success, 1 otherwise.
+int ivf_read_frame(FILE *infile, uint8_t **buffer,
+                   size_t *bytes_read, size_t *buffer_size);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  // IVFDEC_H_
diff --git a/libs/libvpx/ivfenc.c b/libs/libvpx/ivfenc.c
new file mode 100644
index 0000000000..4a97c42731
--- /dev/null
+++ b/libs/libvpx/ivfenc.c
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./ivfenc.h"
+
+#include "vpx/vpx_encoder.h"
+#include "vpx_ports/mem_ops.h"
+
+void ivf_write_file_header(FILE *outfile,
+                           const struct vpx_codec_enc_cfg *cfg,
+                           uint32_t fourcc,
+                           int frame_cnt) {
+  char header[32];
+  // Fixed 32-byte IVF file header; offsets below are byte positions in it.
+  header[0] = 'D';
+  header[1] = 'K';
+  header[2] = 'I';
+  header[3] = 'F';
+  mem_put_le16(header + 4, 0);                     // version
+  mem_put_le16(header + 6, 32);                    // header size
+  mem_put_le32(header + 8, fourcc);                // fourcc
+  mem_put_le16(header + 12, cfg->g_w);             // width
+  mem_put_le16(header + 14, cfg->g_h);             // height
+  mem_put_le32(header + 16, cfg->g_timebase.den);  // rate
+  mem_put_le32(header + 20, cfg->g_timebase.num);  // scale
+  mem_put_le32(header + 24, frame_cnt);            // length
+  mem_put_le32(header + 28, 0);                    // unused
+  // NOTE(review): the fwrite result is not checked; a short write is silent.
+  fwrite(header, 1, sizeof(header), outfile);
+}
+
+void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size) {
+  char hdr[12];  // [0,4): frame size, [4,12): pts, low 32 bits first
+  const int pts_lo = (int)(pts & 0xFFFFFFFF), pts_hi = (int)(pts >> 32);
+  mem_put_le32(hdr, (int)frame_size);
+  mem_put_le32(hdr + 4, pts_lo);
+  mem_put_le32(hdr + 8, pts_hi);
+  fwrite(hdr, 1, sizeof(hdr), outfile);
+}
+
+void ivf_write_frame_size(FILE *outfile, size_t frame_size) {
+  // Emits only the 4-byte size field, with no timestamp after it.
+  char size_field[4];
+  mem_put_le32(size_field, (int)frame_size);
+  fwrite(size_field, 1, sizeof(size_field), outfile);
+}
diff --git a/libs/libvpx/ivfenc.h b/libs/libvpx/ivfenc.h
new file mode 100644
index 0000000000..6623687e84
--- /dev/null
+++ b/libs/libvpx/ivfenc.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef IVFENC_H_
+#define IVFENC_H_
+
+#include "./tools_common.h"
+
+struct vpx_codec_enc_cfg;
+struct vpx_codec_cx_pkt;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ivf_write_file_header(FILE *outfile,
+                           const struct vpx_codec_enc_cfg *cfg,
+                           uint32_t fourcc,
+                           int frame_cnt);
+// Writes a 12-byte frame header: 4-byte size, then the pts as two words.
+void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size);
+// Writes only the 4-byte frame size field.
+void ivf_write_frame_size(FILE *outfile, size_t frame_size);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  // IVFENC_H_
diff --git a/libs/libvpx/keywords.dox b/libs/libvpx/keywords.dox
new file mode 100644
index 0000000000..56f5368900
--- /dev/null
+++ b/libs/libvpx/keywords.dox
@@ -0,0 +1,51 @@
+/*!\page rfc2119 RFC2119 Keywords
+
+      The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL
+      NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED",  "MAY", and
+      "OPTIONAL" in this document are to be interpreted as described in
+      <a href="http://www.ietf.org/rfc/rfc2119.txt">RFC 2119.</a>
+
+Specifically, the following definitions are used:
+
+\section MUST
+\anchor REQUIRED
+\anchor SHALL
+   This word, or the terms "REQUIRED" or "SHALL", mean that the
+   definition is an absolute requirement of the specification.
+
+\section MUSTNOT MUST NOT
+\anchor SHALLNOT
+   This phrase, or the phrase "SHALL NOT", mean that the
+   definition is an absolute prohibition of the specification.
+
+\section SHOULD
+\anchor RECOMMENDED
+   This word, or the adjective "RECOMMENDED", mean that there
+   may exist valid reasons in particular circumstances to ignore a
+   particular item, but the full implications must be understood and
+   carefully weighed before choosing a different course.
+
+\section SHOULDNOT SHOULD NOT
+\anchor NOTRECOMMENDED
+   This phrase, or the phrase "NOT RECOMMENDED" mean that
+   there may exist valid reasons in particular circumstances when the
+   particular behavior is acceptable or even useful, but the full
+   implications should be understood and the case carefully weighed
+   before implementing any behavior described with this label.
+
+\section MAY
+\anchor OPTIONAL
+   This word, or the adjective "OPTIONAL", mean that an item is
+   truly optional.  One vendor may choose to include the item because a
+   particular marketplace requires it or because the vendor feels that
+   it enhances the product while another vendor may omit the same item.
+   An implementation which does not include a particular option \ref MUST be
+   prepared to interoperate with another implementation which does
+   include the option, though perhaps with reduced functionality. In the
+   same vein an implementation which does include a particular option
+   \ref MUST be prepared to interoperate with another implementation which
+   does not include the option (except, of course, for the feature the
+   option provides.)
+
+
+*/
diff --git a/libs/libvpx/libs.doxy_template b/libs/libvpx/libs.doxy_template
new file mode 100644
index 0000000000..5a8f847280
--- /dev/null
+++ b/libs/libvpx/libs.doxy_template
@@ -0,0 +1,1296 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+# Doxyfile 1.5.4
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file that
+# follow. The default is UTF-8 which is also the encoding used for all text before
+# the first occurrence of this tag. Doxygen uses libiconv (or the iconv built into
+# libc) for the transcoding. See http://www.gnu.org/software/libiconv for the list of
+# possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME           = "WebM Codec SDK"
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = docs
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Finnish, French, German, Greek, Hungarian,
+# Italian, Japanese, Japanese-en (Japanese with English messages), Korean,
+# Korean-en, Lithuanian, Norwegian, Polish, Portuguese, Romanian, Russian,
+# Serbian, Slovak, Slovene, Spanish, Swedish, and Ukrainian.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 4
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for Java.
+# For instance, namespaces will be presented as packages, qualified scopes
+# will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to
+# include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct (or union) is
+# documented as struct with the name of the typedef. So
+# typedef struct type_s {} type_t, will appear in the documentation as a struct
+# with name type_t. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named type_s. This can typically
+# be useful for C code where the coding convention is that all structs are
+# typedef'ed and only the typedef is referenced never the struct's name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be extracted
+# and appear in the documentation as a namespace called 'anonymous_namespace{file}',
+# where file will be replaced with the base name of the file that contains the anonymous
+# namespace. By default anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = NO
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from the
+# version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = YES
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT =
+
+# This tag can be used to specify the character encoding of the source files that
+# doxygen parses. Internally doxygen uses the UTF-8 encoding, which is also the default
+# input encoding. Doxygen uses libiconv (or the iconv built into libc) for the transcoding.
+# See http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS          =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the output.
+# The symbol name can be a fully qualified name, a word, or if the wildcard * is used,
+# a substring. Examples: ANamespace, AClass, AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.  If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.  Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.  The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
+# is applied to all files.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO. If you have enabled CALL_GRAPH or CALLER_GRAPH
+# then you must also enable this option. If you don't then doxygen will produce
+# a warning and turn it on anyway
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES (the default)
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES (the default)
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = YES
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.  Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        =
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compressed HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# This tag can be used to set the number of enum values (range [1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be
+# generated containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+,
+# Netscape 6.0+, Internet Explorer 5.0+, or Konqueror). Windows users are
+# probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW      = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = YES
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = letter
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.  This is useful
+# if you want to understand what is going on.  On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  = *.h
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#   TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#   TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option is superseded by the HAVE_DOT option below. This is only a
+# fallback. It is recommended to install and use dot, since it yields more
+# powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to
+# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to
+# specify the directory where the mscgen tool resides. If left empty the tool is assumed to
+# be found in the default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will
+# generate a call dependency graph for every global function or class method.
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will
+# generate a caller dependency graph for every global function or class method.
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT       = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the number
+# of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, which results in a white background.
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+
+DOT_TRANSPARENT        = YES
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine
+#---------------------------------------------------------------------------
+
+# The SEARCHENGINE tag specifies whether or not a search engine should be
+# used. If set to NO the values of all tags below this one will be ignored.
+
+SEARCHENGINE           = NO
diff --git a/libs/libvpx/libs.mk b/libs/libvpx/libs.mk
new file mode 100644
index 0000000000..e6fb068bf7
--- /dev/null
+++ b/libs/libvpx/libs.mk
@@ -0,0 +1,632 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+# ARM assembly files are written in RVCT-style and filtered to allow GCC
+# compilation: on ARM, ASM is .asm.s if $(CONFIG_GCC)$(CONFIG_MSVS) is "yes".
+ifeq ($(ARCH_ARM),yes)
+  ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.s,.asm)
+else
+  ASM:=.asm
+endif
+
+# Rule template for runtime cpu detection headers: $(1) is the header base
+# name (also passed as --sym), $(2) the input under $(SRC_PATH_BARE). rtcd.pl
+# output is written to $(BUILD_PFX)$(1).h, which joins CLEAN-OBJS and RTCD.
+define rtcd_h_template
+$$(BUILD_PFX)$(1).h: $$(SRC_PATH_BARE)/$(2)
+	@echo "    [CREATE] $$@"
+	$$(qexec)$$(SRC_PATH_BARE)/build/make/rtcd.pl --arch=$$(TGT_ISA) \
+          --sym=$(1) \
+          --config=$$(CONFIG_DIR)$$(target)-$$(TOOLCHAIN).mk \
+          $$(RTCD_OPTIONS) $$^ > $$@
+CLEAN-OBJS += $$(BUILD_PFX)$(1).h
+RTCD += $$(BUILD_PFX)$(1).h
+endef
+
+CODEC_SRCS-yes += CHANGELOG
+CODEC_SRCS-yes += libs.mk
+
+include $(SRC_PATH_BARE)/vpx/vpx_codec.mk
+CODEC_SRCS-yes += $(addprefix vpx/,$(call enabled,API_SRCS))
+CODEC_DOC_SRCS += $(addprefix vpx/,$(call enabled,API_DOC_SRCS))
+
+include $(SRC_PATH_BARE)/vpx_mem/vpx_mem.mk
+CODEC_SRCS-yes += $(addprefix vpx_mem/,$(call enabled,MEM_SRCS))
+
+include $(SRC_PATH_BARE)/vpx_scale/vpx_scale.mk
+CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS))
+
+include $(SRC_PATH_BARE)/vpx_ports/vpx_ports.mk
+CODEC_SRCS-yes += $(addprefix vpx_ports/,$(call enabled,PORTS_SRCS))
+
+include $(SRC_PATH_BARE)/vpx_dsp/vpx_dsp.mk
+CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS))
+
+include $(SRC_PATH_BARE)/vpx_util/vpx_util.mk
+CODEC_SRCS-yes += $(addprefix vpx_util/,$(call enabled,UTIL_SRCS))
+
+ifeq ($(CONFIG_VP8),yes)
+  VP8_PREFIX=vp8/
+  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
+endif
+
+ifeq ($(CONFIG_VP8_ENCODER),yes)
+  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
+  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
+  CODEC_DOC_SECTIONS += vp8 vp8_encoder
+endif
+
+ifeq ($(CONFIG_VP8_DECODER),yes)
+  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
+  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
+  CODEC_DOC_SECTIONS += vp8 vp8_decoder
+endif
+
+ifeq ($(CONFIG_VP9),yes)
+  VP9_PREFIX=vp9/
+  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk
+endif
+
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+  VP9_PREFIX=vp9/
+  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
+  CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
+  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
+  INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/vpx/svc_context.h
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
+  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
+  CODEC_DOC_SECTIONS += vp9 vp9_encoder
+endif
+
+ifeq ($(CONFIG_VP9_DECODER),yes)
+  VP9_PREFIX=vp9/
+  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_DX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_DX_EXPORTS))
+  CODEC_SRCS-yes += $(VP9_PREFIX)vp9dx.mk vpx/vp8.h vpx/vp8dx.h
+  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
+  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
+  CODEC_DOC_SECTIONS += vp9 vp9_decoder
+endif
+
+VP9_PREFIX=vp9/
+$(BUILD_PFX)$(VP9_PREFIX)%.c.o: CFLAGS += -Wextra
+
+#  VP10 make file
+ifeq ($(CONFIG_VP10),yes)
+  VP10_PREFIX=vp10/
+  include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10_common.mk
+endif
+
+ifeq ($(CONFIG_VP10_ENCODER),yes)
+  VP10_PREFIX=vp10/
+  include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10cx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP10_PREFIX),$(call enabled,VP10_CX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP10_PREFIX),$(VP10_CX_EXPORTS))
+  CODEC_SRCS-yes += $(VP10_PREFIX)vp10cx.mk vpx/vp8.h vpx/vp8cx.h
+  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
+  INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/vpx/svc_context.h
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP10_PREFIX)/%
+  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
+  CODEC_DOC_SECTIONS += vp9 vp9_encoder
+endif
+
+ifeq ($(CONFIG_VP10_DECODER),yes)
+  VP10_PREFIX=vp10/
+  include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10dx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP10_PREFIX),$(call enabled,VP10_DX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP10_PREFIX),$(VP10_DX_EXPORTS))
+  CODEC_SRCS-yes += $(VP10_PREFIX)vp10dx.mk vpx/vp8.h vpx/vp8dx.h
+  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP10_PREFIX)/%
+  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
+  CODEC_DOC_SECTIONS += vp9 vp9_decoder
+endif
+
+VP10_PREFIX=vp10/
+$(BUILD_PFX)$(VP10_PREFIX)%.c.o: CFLAGS += -Wextra
+
+ifeq ($(CONFIG_ENCODERS),yes)
+  CODEC_DOC_SECTIONS += encoder
+endif
+ifeq ($(CONFIG_DECODERS),yes)
+  CODEC_DOC_SECTIONS += decoder
+endif
+
+
+ifeq ($(CONFIG_MSVS),yes)
+CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd)
+GTEST_LIB=$(if $(CONFIG_STATIC_MSVCRT),gtestmt,gtestmd)
+# VS_PLATFORMS uses deferred expansion intentionally: it lists the top-level
+# directory of each */Release/$(CODEC_LIB).lib, which may change during Make.
+VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d))))
+endif
+
+# The following pairs define a mapping of locations in the distribution
+# tree to locations in the source/build trees.
+INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/vpx/%
+INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/vpx_ports/%
+INSTALL_MAPS += $(LIBSUBDIR)/%     %
+INSTALL_MAPS += src/%     $(SRC_PATH_BARE)/%
+ifeq ($(CONFIG_MSVS),yes)
+INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/%  $(p)/Release/%)
+INSTALL_MAPS += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/%  $(p)/Debug/%)
+endif
+
+CODEC_SRCS-yes += build/make/version.sh
+CODEC_SRCS-yes += build/make/rtcd.pl
+CODEC_SRCS-yes += vpx_ports/emmintrin_compat.h
+CODEC_SRCS-yes += vpx_ports/mem_ops.h
+CODEC_SRCS-yes += vpx_ports/mem_ops_aligned.h
+CODEC_SRCS-yes += vpx_ports/vpx_once.h
+CODEC_SRCS-yes += $(BUILD_PFX)vpx_config.c
+INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
+ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
+endif
+CODEC_EXPORTS-yes += vpx/exports_com
+CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
+CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
+
+INSTALL-LIBS-yes += include/vpx/vpx_codec.h
+INSTALL-LIBS-yes += include/vpx/vpx_frame_buffer.h
+INSTALL-LIBS-yes += include/vpx/vpx_image.h
+INSTALL-LIBS-yes += include/vpx/vpx_integer.h
+INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h
+INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h
+ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
+ifeq ($(CONFIG_MSVS),yes)
+INSTALL-LIBS-yes                  += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib)
+INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB)d.lib)
+INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/vpx.dll)
+INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/vpx.exp)
+endif
+else
+INSTALL-LIBS-$(CONFIG_STATIC) += $(LIBSUBDIR)/libvpx.a
+INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(LIBSUBDIR)/libvpx_g.a
+endif
+
+CODEC_SRCS=$(call enabled,CODEC_SRCS)
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(CODEC_SRCS)
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS)
+
+
+# Generate a list of all enabled sources (one per line, sorted in the C locale,
+# de-duplicated), in particular for exporting to gyp based build systems.
+libvpx_srcs.txt:
+	@echo "    [CREATE] $@"
+	@echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
+CLEAN-OBJS += libvpx_srcs.txt
+
+
+ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
+ifeq ($(CONFIG_MSVS),yes)
+
+vpx.def: $(call enabled,CODEC_EXPORTS)
+	@echo "    [CREATE] $@"
+	$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\
+            --name=vpx\
+            --out=$@ $^
+CLEAN-OBJS += vpx.def
+
+# Assembly files that are included, but don't define symbols themselves.
+# Filtered out to avoid Visual Studio build warnings.
+ASM_INCLUDES := \
+    third_party/x86inc/x86inc.asm \
+    vpx_config.asm \
+    vpx_ports/x86_abi_support.asm \
+
+vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
+	@echo "    [CREATE] $@"
+	$(qexec)$(GEN_VCPROJ) \
+            $(if $(CONFIG_SHARED),--dll,--lib) \
+            --target=$(TOOLCHAIN) \
+            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
+            --name=vpx \
+            --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \
+            --module-def=vpx.def \
+            --ver=$(CONFIG_VS_VERSION) \
+            --src-path-bare="$(SRC_PATH_BARE)" \
+            --out=$@ $(CFLAGS) \
+            $(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) \
+            --src-path-bare="$(SRC_PATH_BARE)" \
+
+PROJECTS-yes += vpx.$(VCPROJ_SFX)
+
+vpx.$(VCPROJ_SFX): vpx_config.asm
+vpx.$(VCPROJ_SFX): $(RTCD)
+
+endif
+else
+LIBVPX_OBJS=$(call objs,$(CODEC_SRCS))
+OBJS-yes += $(LIBVPX_OBJS)
+LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
+$(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
+
+SO_VERSION_MAJOR := 3
+SO_VERSION_MINOR := 0
+SO_VERSION_PATCH := 0
+ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
+LIBVPX_SO               := libvpx.$(SO_VERSION_MAJOR).dylib
+SHARED_LIB_SUF          := .dylib
+EXPORT_FILE             := libvpx.syms
+LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
+                             libvpx.dylib  )
+else
+ifeq ($(filter os2%,$(TGT_OS)),$(TGT_OS))
+LIBVPX_SO               := libvpx$(SO_VERSION_MAJOR).dll
+SHARED_LIB_SUF          := _dll.a
+EXPORT_FILE             := libvpx.def
+LIBVPX_SO_SYMLINKS      :=
+LIBVPX_SO_IMPLIB        := libvpx_dll.a
+else
+LIBVPX_SO               := libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH)
+SHARED_LIB_SUF          := .so
+EXPORT_FILE             := libvpx.ver
+LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
+                             libvpx.so libvpx.so.$(SO_VERSION_MAJOR) \
+                             libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR))
+endif
+endif
+
+LIBS-$(CONFIG_SHARED) += $(BUILD_PFX)$(LIBVPX_SO)\
+                           $(notdir $(LIBVPX_SO_SYMLINKS)) \
+                           $(if $(LIBVPX_SO_IMPLIB), $(BUILD_PFX)$(LIBVPX_SO_IMPLIB))
+$(BUILD_PFX)$(LIBVPX_SO): $(LIBVPX_OBJS) $(EXPORT_FILE)
+$(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm
+$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR)
+$(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE)
+
+# Linker version script. Lists the symbols from ALL enabled export files
+# ($^, not $?) so a partial rebuild can never drop exports that did not change.
+libvpx.ver: $(call enabled,CODEC_EXPORTS)
+	@echo "    [CREATE] $@"
+	$(qexec)(echo "{ global:"; for f in $^; do awk '{print $$2";"}' < $$f; done; echo "local: *; };") > $@
+CLEAN-OBJS += libvpx.ver
+
+libvpx.syms: $(call enabled,CODEC_EXPORTS)
+	@echo "    [CREATE] $@"
+	$(qexec)awk '{print "_"$$2}' $^ >$@
+CLEAN-OBJS += libvpx.syms
+
+libvpx.def: $(call enabled,CODEC_EXPORTS)
+	@echo "    [CREATE] $@"
+	$(qexec)echo LIBRARY $(LIBVPX_SO:.dll=) INITINSTANCE TERMINSTANCE > $@
+	$(qexec)echo "DATA MULTIPLE NONSHARED" >> $@
+	$(qexec)echo "EXPORTS" >> $@
+	$(qexec)awk '!/vpx_svc_*/ {print "_"$$2}' $^ >>$@
+CLEAN-OBJS += libvpx.def
+
+libvpx_dll.a: $(LIBVPX_SO)
+	@echo "    [IMPLIB] $@"
+	$(qexec)emximp -o $@ $<
+CLEAN-OBJS += libvpx_dll.a
+
+define libvpx_symlink_template
+$(1): $(2)
+	@echo "    [LN]     $(2) $$@"
+	$(qexec)mkdir -p $$(dir $$@)
+	$(qexec)ln -sf $(2) $$@
+endef
+
+$(eval $(call libvpx_symlink_template,\
+    $(addprefix $(BUILD_PFX),$(notdir $(LIBVPX_SO_SYMLINKS))),\
+    $(BUILD_PFX)$(LIBVPX_SO)))
+$(eval $(call libvpx_symlink_template,\
+    $(addprefix $(DIST_DIR)/,$(LIBVPX_SO_SYMLINKS)),\
+    $(LIBVPX_SO)))
+
+
+INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBVPX_SO_SYMLINKS)
+INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBSUBDIR)/$(LIBVPX_SO)
+INSTALL-LIBS-$(CONFIG_SHARED) += $(if $(LIBVPX_SO_IMPLIB),$(LIBSUBDIR)/$(LIBVPX_SO_IMPLIB))
+
+
+LIBS-yes += vpx.pc
+vpx.pc: config.mk libs.mk
+	@echo "    [CREATE] $@"
+	$(qexec)echo '# pkg-config file from libvpx $(VERSION_STRING)' > $@
+	$(qexec)echo 'prefix=$(PREFIX)' >> $@
+	$(qexec)echo 'exec_prefix=$${prefix}' >> $@
+	$(qexec)echo 'libdir=$${prefix}/$(LIBSUBDIR)' >> $@
+	$(qexec)echo 'includedir=$${prefix}/include' >> $@
+	$(qexec)echo '' >> $@
+	$(qexec)echo 'Name: vpx' >> $@
+	$(qexec)echo 'Description: WebM Project VPx codec implementation' >> $@
+	$(qexec)echo 'Version: $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)' >> $@
+	$(qexec)echo 'Requires:' >> $@
+	$(qexec)echo 'Conflicts:' >> $@
+	$(qexec)echo 'Libs: -L$${libdir} -lvpx -lm' >> $@
+ifeq ($(HAVE_PTHREAD_H),yes)
+	$(qexec)echo 'Libs.private: -lm -lpthread' >> $@
+else
+	$(qexec)echo 'Libs.private: -lm' >> $@
+endif
+	$(qexec)echo 'Cflags: -I$${includedir}' >> $@
+INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc
+INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc
+CLEAN-OBJS += vpx.pc
+endif
+
+#
+# Rule to make assembler configuration file from C configuration file
+#
+ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
+# YASM syntax: one "NAME equ VALUE" line per 0/1 #define in vpx_config.h.
+$(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h
+	@echo "    [CREATE] $@"
+	@grep -E "#define [A-Z0-9_]+ [01]" $< \
+	    | awk '{print $$2 " equ " $$3}' > $@
+else
+ADS2GAS=$(if $(filter yes,$(CONFIG_GCC)),| $(ASM_CONVERSION))
+$(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h
+	@echo "    [CREATE] $@"
+	@grep -E "#define [A-Z0-9_]+ [01]" $< \
+	    | awk '{print $$2 " EQU " $$3}' $(ADS2GAS) > $@
+	@echo "        END" $(ADS2GAS) >> $@
+endif
+CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm
+
+#
+# Add assembler dependencies for configuration.
+#
+$(filter %.s.o,$(OBJS-yes)):     $(BUILD_PFX)vpx_config.asm
+$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
+
+
+$(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
+CLEAN-OBJS += $(BUILD_PFX)vpx_version.h
+
+
+##
+## libvpx test directives
+##
+ifeq ($(CONFIG_UNIT_TESTS),yes)
+LIBVPX_TEST_DATA_PATH ?= .
+
+include $(SRC_PATH_BARE)/test/test.mk
+LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS))
+LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX)
+LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\
+                     $(call enabled,LIBVPX_TEST_DATA))
+libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)
+
+TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX)
+TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS))
+TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS)))
+
+libvpx_test_srcs.txt:
+	@echo "    [CREATE] $@"
+	@echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
+CLEAN-OBJS += libvpx_test_srcs.txt
+
+$(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
+	@echo "    [DOWNLOAD] $@"
+	$(qexec)trap 'rm -f $@' INT TERM &&\
+            curl -L -o $@ $(call libvpx_test_data_url,$(@F))
+
+testdata:: $(LIBVPX_TEST_DATA)
+	$(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\
+          [ -x "$$(which shasum)" ] && sha1sum=shasum;\
+          [ -x "$$(which sha1)" ] && sha1sum=sha1;\
+          if [ -n "$${sha1sum}" ]; then\
+            set -e;\
+            echo "Checking test data:";\
+            for f in $(call enabled,LIBVPX_TEST_DATA); do\
+                grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
+                    (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
+            done; \
+        else\
+            echo "Skipping test data integrity check, sha1sum not found.";\
+        fi
+
+ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
+ifeq ($(CONFIG_MSVS),yes)
+
+gtest.$(VCPROJ_SFX): $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.cc
+	@echo "    [CREATE] $@"
+	$(qexec)$(GEN_VCPROJ) \
+            --lib \
+            --target=$(TOOLCHAIN) \
+            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
+            --name=gtest \
+            --proj-guid=EC00E1EC-AF68-4D92-A255-181690D1C9B1 \
+            --ver=$(CONFIG_VS_VERSION) \
+            --src-path-bare="$(SRC_PATH_BARE)" \
+            -D_VARIADIC_MAX=10 \
+            --out=gtest.$(VCPROJ_SFX) $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.cc \
+            -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" -I"$(SRC_PATH_BARE)/third_party/googletest/src"
+
+PROJECTS-$(CONFIG_MSVS) += gtest.$(VCPROJ_SFX)
+
+test_libvpx.$(VCPROJ_SFX): $(LIBVPX_TEST_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX)
+	@echo "    [CREATE] $@"
+	$(qexec)$(GEN_VCPROJ) \
+            --exe \
+            --target=$(TOOLCHAIN) \
+            --name=test_libvpx \
+            -D_VARIADIC_MAX=10 \
+            --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
+            --ver=$(CONFIG_VS_VERSION) \
+            --src-path-bare="$(SRC_PATH_BARE)" \
+            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
+            --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
+            -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
+            -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^
+
+PROJECTS-$(CONFIG_MSVS) += test_libvpx.$(VCPROJ_SFX)
+
+LIBVPX_TEST_BIN := $(addprefix $(TGT_OS:win64=x64)/Release/,$(notdir $(LIBVPX_TEST_BIN)))
+
+ifneq ($(strip $(TEST_INTRA_PRED_SPEED_OBJS)),)
+# Needs its own project GUID; CD837F5F-... is already used by test_libvpx.
+PROJECTS-$(CONFIG_MSVS) += test_intra_pred_speed.$(VCPROJ_SFX)
+test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX)
+	@echo "    [CREATE] $@"
+	$(qexec)$(GEN_VCPROJ) \
+            --exe --target=$(TOOLCHAIN) \
+            --name=test_intra_pred_speed \
+            -D_VARIADIC_MAX=10 \
+            --proj-guid=3B6F0C9E-7A41-4D2B-8E5F-1C9A2D64B7F0 \
+            --ver=$(CONFIG_VS_VERSION) \
+            --src-path-bare="$(SRC_PATH_BARE)" \
+            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
+            --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
+            -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
+            -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^
+endif  # TEST_INTRA_PRED_SPEED
+endif
+else
+
+include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk
+GTEST_SRCS := $(addprefix third_party/googletest/src/,$(call enabled,GTEST_SRCS))
+GTEST_OBJS=$(call objs,$(GTEST_SRCS))
+ifeq ($(filter win%,$(TGT_OS)),$(TGT_OS))
+# Disabling pthreads globally will cause issues on darwin and possibly elsewhere
+$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -DGTEST_HAS_PTHREAD=0
+endif
+GTEST_INCLUDES := -I$(SRC_PATH_BARE)/third_party/googletest/src
+GTEST_INCLUDES += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
+$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES)
+OBJS-yes += $(GTEST_OBJS)
+LIBS-yes += $(BUILD_PFX)libgtest.a $(BUILD_PFX)libgtest_g.a
+$(BUILD_PFX)libgtest_g.a: $(GTEST_OBJS)
+
+LIBVPX_TEST_OBJS=$(sort $(call objs,$(LIBVPX_TEST_SRCS)))
+$(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES)
+OBJS-yes += $(LIBVPX_TEST_OBJS)
+BINS-yes += $(LIBVPX_TEST_BIN)
+
+CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx)
+CODEC_LIB_SUF=$(if $(CONFIG_SHARED),$(SHARED_LIB_SUF),.a)
+TEST_LIBS := lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a
+$(LIBVPX_TEST_BIN): $(TEST_LIBS)
+$(eval $(call linkerxx_template,$(LIBVPX_TEST_BIN), \
+              $(LIBVPX_TEST_OBJS) \
+              -L. -lvpx -lgtest $(extralibs) -lm))
+
+ifneq ($(strip $(TEST_INTRA_PRED_SPEED_OBJS)),)
+$(TEST_INTRA_PRED_SPEED_OBJS) $(TEST_INTRA_PRED_SPEED_OBJS:.o=.d): CXXFLAGS += $(GTEST_INCLUDES)
+OBJS-yes += $(TEST_INTRA_PRED_SPEED_OBJS)
+BINS-yes += $(TEST_INTRA_PRED_SPEED_BIN)
+
+$(TEST_INTRA_PRED_SPEED_BIN): $(TEST_LIBS)
+$(eval $(call linkerxx_template,$(TEST_INTRA_PRED_SPEED_BIN), \
+              $(TEST_INTRA_PRED_SPEED_OBJS) \
+              -L. -lvpx -lgtest $(extralibs) -lm))
+endif  # TEST_INTRA_PRED_SPEED
+
+endif  # CONFIG_UNIT_TESTS
+
+# Install test sources only if codec source is included
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\
+    $(shell find $(SRC_PATH_BARE)/third_party/googletest -type f))
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(LIBVPX_TEST_SRCS)
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS)
+
+# $(1) = shard index, $(2) = total shard count. Both shard targets are phony.
+define test_shard_template
+test:: test_shard.$(1)
+test-no-data-check:: test_shard_ndc.$(1)
+test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN)
+	@set -e; \
+	 export GTEST_SHARD_INDEX=$(1) GTEST_TOTAL_SHARDS=$(2); \
+	 $(LIBVPX_TEST_BIN)
+test_shard.$(1): testdata
+.PHONY: test_shard.$(1) test_shard_ndc.$(1)
+endef
+
+NUM_SHARDS := 10
+SHARDS := 0 1 2 3 4 5 6 7 8 9
+$(foreach s,$(SHARDS),$(eval $(call test_shard_template,$(s),$(NUM_SHARDS))))
+
+endif
+
+##
+## documentation directives
+##
+CLEAN-OBJS += libs.doxy
+DOCS-yes += libs.doxy
+libs.doxy: $(CODEC_DOC_SRCS)
+	@echo "    [CREATE] $@"
+	@rm -f $@
+	@echo "INPUT += $^" >> $@
+	@echo "INCLUDE_PATH += ." >> $@;
+	@echo "ENABLED_SECTIONS += $(sort $(CODEC_DOC_SECTIONS))" >> $@
+
+## Generate rtcd.h for all objects
+ifeq ($(CONFIG_DEPENDENCY_TRACKING),yes)
+$(OBJS-yes:.o=.d): $(RTCD)
+else
+$(OBJS-yes): $(RTCD)
+endif
+
+## Update the global src list
+SRCS += $(CODEC_SRCS) $(LIBVPX_TEST_SRCS) $(GTEST_SRCS)
+
+##
+## vpxdec/vpxenc tests.
+##
+ifeq ($(CONFIG_UNIT_TESTS),yes)
+TEST_BIN_PATH = .
+ifeq ($(CONFIG_MSVS),yes)
+# MSVC will build both Debug and Release configurations of tools in a
+# sub directory named for the current target. Assume the user wants to
+# run the Release tools, and assign TEST_BIN_PATH accordingly.
+# TODO(tomfinegan): Is this adequate for ARM?
+# TODO(tomfinegan): Support running the debug versions of tools?
+TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH))
+endif
+utiltest utiltest-no-data-check:
+	$(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \
+		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--bin-path $(TEST_BIN_PATH)
+	$(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \
+		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--bin-path $(TEST_BIN_PATH)
+utiltest: testdata
+else
+utiltest utiltest-no-data-check:
+	@echo Unit tests must be enabled to make the utiltest target.
+endif
+
+##
+## Example tests.
+##
+ifeq ($(CONFIG_UNIT_TESTS),yes)
+# All non-MSVC targets output example targets in a sub dir named examples.
+EXAMPLES_BIN_PATH = examples
+ifeq ($(CONFIG_MSVS),yes)
+# MSVC will build both Debug and Release configurations of the examples in a
+# sub directory named for the current target. Assume the user wants to
+# run the Release tools, and assign EXAMPLES_BIN_PATH accordingly.
+# TODO(tomfinegan): Is this adequate for ARM?
+# TODO(tomfinegan): Support running the debug versions of tools?
+EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release
+endif
+exampletest exampletest-no-data-check: examples
+	$(qexec)$(SRC_PATH_BARE)/test/examples.sh \
+		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--bin-path $(EXAMPLES_BIN_PATH)
+exampletest: testdata
+else
+exampletest exampletest-no-data-check:
+	@echo Unit tests must be enabled to make the exampletest target.
+endif
diff --git a/libs/libvpx/mainpage.dox b/libs/libvpx/mainpage.dox
new file mode 100644
index 0000000000..ec202fa4fb
--- /dev/null
+++ b/libs/libvpx/mainpage.dox
@@ -0,0 +1,53 @@
+/*!\mainpage WebM Codec SDK
+
+  \section main_contents Page Contents
+  - \ref main_intro
+  - \ref main_startpoints
+  - \ref main_support
+
+  \section main_intro Introduction
+  Welcome to the WebM Codec SDK. This SDK allows you to integrate your
+  applications with the VP8 and VP9 video codecs, high quality, royalty free,
+  open source codecs deployed on billions of computers and devices worldwide.
+
+  This distribution of the WebM Codec SDK includes the following support:
+
+  \if vp8_encoder
+  - \ref vp8_encoder
+  \endif
+  \if vp8_decoder
+  - \ref vp8_decoder
+  \endif
+
+
+  \section main_startpoints Starting Points
+  - Consult the \ref changelog for a complete list of improvements in this
+    release.
+  - The \ref readme contains instructions on recompiling the sample applications.
+  - Read the \ref usage "usage" for a narrative on codec usage.
+  - Read the \ref samples "sample code" for examples of how to interact with the
+    codec.
+  - \ref codec reference
+  \if encoder
+  - \ref encoder reference
+  \endif
+  \if decoder
+  - \ref decoder reference
+  \endif
+
+  \section main_support Support Options & FAQ
+  The WebM project is an open source project supported by its community. For
+  questions about this SDK, please mail the apps-devel@webmproject.org list.
+  To contribute, see http://www.webmproject.org/code/contribute and mail
+  codec-devel@webmproject.org.
+*/
+
+/*!\page changelog CHANGELOG
+   \verbinclude CHANGELOG
+*/
+
+/*!\page readme README
+   \verbinclude README
+*/
+
+/*!\defgroup codecs Supported Codecs */
diff --git a/libs/libvpx/md5_utils.c b/libs/libvpx/md5_utils.c
new file mode 100644
index 0000000000..f4f893a2d6
--- /dev/null
+++ b/libs/libvpx/md5_utils.c
@@ -0,0 +1,241 @@
+/*
+ * This code implements the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest.  This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h' header
+ * definitions
+ *  - Ian Jackson <ian@chiark.greenend.org.uk>.
+ * Still in the public domain.
+ */
+
+#include <string.h>   /* for memcpy() */
+
+#include "md5_utils.h"
+
+/* Byte-swap `words` 32-bit words in place (little-endian <-> host order). */
+static void
+byteSwap(UWORD32 *buf, unsigned words) {
+  const int probe = 1;
+  md5byte *src = (md5byte *)buf;
+
+  /* A little-endian host stores the low byte first: nothing to swap. */
+  if (*(const char *)&probe == 1)
+    return;
+
+  /* do/while: the first word is converted before the count is tested. */
+  do {
+    const UWORD32 hi = (unsigned)src[3] << 8 | src[2];
+    const UWORD32 lo = (unsigned)src[1] << 8 | src[0];
+    *buf++ = hi << 16 | lo;
+    src += 4;
+  } while (--words);
+}
+
+/*
+ * Start MD5 accumulation.  Set bit count to 0 and buffer to mysterious
+ * initialization constants.
+ */
+void
+MD5Init(struct MD5Context *ctx) {
+  ctx->buf[0] = 0x67452301;
+  ctx->buf[1] = 0xefcdab89;
+  ctx->buf[2] = 0x98badcfe;
+  ctx->buf[3] = 0x10325476;
+
+  ctx->bytes[0] = 0;
+  ctx->bytes[1] = 0;
+}
+
+/*
+ * Update context to reflect the concatenation of another buffer full
+ * of bytes.
+ */
+void
+MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) {
+  UWORD32 t;
+
+  /* Update the byte count, kept as two 32-bit words: bytes[0] is the low
+   * word and bytes[1] receives the carry when the low word wraps. */
+  t = ctx->bytes[0];
+
+  if ((ctx->bytes[0] = t + len) < t)
+    ctx->bytes[1]++;  /* Carry from low to high */
+
+  t = 64 - (t & 0x3f);  /* Space available in ctx->in (at least 1) */
+
+  /* Input does not fill the pending block: buffer it and wait for more. */
+  if (t > len) {
+    memcpy((md5byte *)ctx->in + 64 - t, buf, len);
+    return;
+  }
+
+  /* First chunk is an odd size: it tops up the pending block to 64 bytes. */
+  memcpy((md5byte *)ctx->in + 64 - t, buf, t);
+  byteSwap(ctx->in, 16);
+  MD5Transform(ctx->buf, ctx->in);
+  buf += t;
+  len -= t;
+
+  /* Process data in 64-byte chunks */
+  while (len >= 64) {
+    memcpy(ctx->in, buf, 64);
+    byteSwap(ctx->in, 16);
+    MD5Transform(ctx->buf, ctx->in);
+    buf += 64;
+    len -= 64;
+  }
+
+  /* Handle any remaining bytes of data (fewer than 64; kept for later). */
+  memcpy(ctx->in, buf, len);
+}
+
+/*
+ * Final wrapup - pad to 64-byte boundary with the bit pattern
+ * 1 0* (64-bit count of bits processed, low-order word first)
+ */
+void
+MD5Final(md5byte digest[16], struct MD5Context *ctx) {
+  int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */
+  md5byte *p = (md5byte *)ctx->in + count;
+
+  /* Set the first char of padding to 0x80.  There is always room. */
+  *p++ = 0x80;
+
+  /* Zero bytes still needed to reach offset 56 of this block (-8..55) */
+  count = 56 - 1 - count;
+
+  if (count < 0) {  /* Padding forces an extra block */
+    memset(p, 0, count + 8);  /* count + 8 bytes remain up to offset 64 */
+    byteSwap(ctx->in, 16);
+    MD5Transform(ctx->buf, ctx->in);
+    p = (md5byte *)ctx->in;
+    count = 56;
+  }
+  /* Zero-fill up to offset 56; only those 14 words need swapping here. */
+  memset(p, 0, count);
+  byteSwap(ctx->in, 14);
+
+  /* Append length in bits (low word, then high word) and transform */
+  ctx->in[14] = ctx->bytes[0] << 3;
+  ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29;
+  MD5Transform(ctx->buf, ctx->in);
+  /* Copy the four state words out as the 16-byte digest. */
+  byteSwap(ctx->buf, 4);
+  memcpy(digest, ctx->buf, 16);
+  memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
+}
+
+#ifndef ASM_MD5
+
+/* The four core functions - F1 is optimized somewhat */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm. */
+#define MD5STEP(f,w,x,y,z,in,s) \
+  (w += f(x,y,z) + in, w = (w<<s | w>>(32-s)) + x)
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data.  MD5Update blocks
+ * the data and converts bytes into longwords for this routine.
+ */
+void
+MD5Transform(UWORD32 buf[4], UWORD32 const in[16]) {
+  register UWORD32 a, b, c, d;
+
+  a = buf[0];
+  b = buf[1];
+  c = buf[2];
+  d = buf[3];
+
+  MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+  MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+  MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+  MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+  MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+  MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+  MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+  MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+  MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+  MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+  MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+  MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+  MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+  MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+  MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+  MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+  MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+  MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+  MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+  MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+  MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+  MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+  MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+  MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+  MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+  MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+  MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+  MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+  MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+  MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+  MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+  MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+  MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+  MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+  MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+  MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+  MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+  MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+  MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+  MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+  MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+  MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+  MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+  MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+  MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+  MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+  MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+  MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+  MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+  MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+  MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+  MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+  MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+  MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+  MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+  MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+  MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+  MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+  MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+  MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+  MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+  MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+  MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+  MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+  buf[0] += a;
+  buf[1] += b;
+  buf[2] += c;
+  buf[3] += d;
+}
+
+#endif
diff --git a/libs/libvpx/md5_utils.h b/libs/libvpx/md5_utils.h
new file mode 100644
index 0000000000..bd4991b3ad
--- /dev/null
+++ b/libs/libvpx/md5_utils.h
@@ -0,0 +1,49 @@
+/*
+ * This is the header file for the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest.  This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h'
+ * header definitions
+ *  - Ian Jackson <ian@chiark.greenend.org.uk>.
+ * Still in the public domain.
+ */
+
+#ifndef MD5_UTILS_H_
+#define MD5_UTILS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define md5byte unsigned char
+#define UWORD32 unsigned int
+
+typedef struct MD5Context MD5Context;
+struct MD5Context {
+  UWORD32 buf[4];
+  UWORD32 bytes[2];
+  UWORD32 in[16];
+};
+
+void MD5Init(struct MD5Context *context);
+void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len);
+void MD5Final(unsigned char digest[16], struct MD5Context *context);
+void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // MD5_UTILS_H_
diff --git a/libs/libvpx/rate_hist.c b/libs/libvpx/rate_hist.c
new file mode 100644
index 0000000000..a77222b161
--- /dev/null
+++ b/libs/libvpx/rate_hist.c
@@ -0,0 +1,285 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "./rate_hist.h"
+
+#define RATE_BINS 100
+#define HIST_BAR_MAX 40
+
+struct hist_bucket {
+  int low;
+  int high;
+  int count;
+};
+
+struct rate_hist {
+  int64_t *pts;
+  int *sz;
+  int samples;
+  int frames;
+  struct hist_bucket bucket[RATE_BINS];
+  int total;
+};
+
+/* Returns a new histogram, or NULL on bad arguments / allocation failure. */
+struct rate_hist *init_rate_histogram(const vpx_codec_enc_cfg_t *cfg,
+                                      const vpx_rational_t *fps) {
+  int i;
+  struct rate_hist *hist;
+
+  if (cfg == NULL || fps == NULL || fps->den == 0) return NULL;
+  hist = malloc(sizeof(*hist));
+  if (hist == NULL) return NULL;
+
+  // Frames in rc_buf_sz milliseconds at the file's framerate, with a 5/4
+  // adjustment for alt-refs. Clamped to >= 1 because it is used as a divisor.
+  hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000;
+  if (hist->samples < 1) hist->samples = 1;
+  hist->frames = hist->total = 0;
+  for (i = 0; i < RATE_BINS; i++) {
+    hist->bucket[i].low = INT_MAX;
+    hist->bucket[i].high = 0;
+    hist->bucket[i].count = 0;
+  }
+  hist->pts = calloc(hist->samples, sizeof(*hist->pts));
+  hist->sz = calloc(hist->samples, sizeof(*hist->sz));
+  if (hist->pts != NULL && hist->sz != NULL) return hist;
+
+  destroy_rate_histogram(hist);  // frees whichever array was allocated
+  return NULL;
+}
+
+/* Releases a histogram and its sample arrays; a NULL hist is a no-op. */
+void destroy_rate_histogram(struct rate_hist *hist) {
+  if (hist == NULL) return;
+  free(hist->pts);
+  free(hist->sz);
+  free(hist);
+}
+
+/* Records one encoded frame and, once enough data exists, bins the bitrate
+ * averaged over the last rc_buf_sz ms. NULL arguments are ignored. */
+void update_rate_histogram(struct rate_hist *hist,
+                           const vpx_codec_enc_cfg_t *cfg,
+                           const vpx_codec_cx_pkt_t *pkt) {
+  int i, idx;
+  int64_t now, then, avg_bitrate;
+  int64_t sum_sz = 0;
+
+  if (hist == NULL || cfg == NULL || pkt == NULL) return;
+
+  /* Frame timestamp converted from timebase units to milliseconds. */
+  now = pkt->data.frame.pts * 1000 * (uint64_t)cfg->g_timebase.num /
+        (uint64_t)cfg->g_timebase.den;
+
+  /* pts[] and sz[] are ring buffers holding the last `samples` frames. */
+  idx = hist->frames++ % hist->samples;
+  hist->pts[idx] = now;
+  hist->sz[idx] = (int)pkt->data.frame.sz;
+
+  if (now < cfg->rc_buf_initial_sz) return;
+  if (!cfg->rc_target_bitrate) return;
+
+  /* Sum the size over the past rc_buf_sz ms */
+  then = now;
+  for (i = hist->frames; i > 0 && hist->frames - i < hist->samples; i--) {
+    const int i_idx = (i - 1) % hist->samples;
+
+    then = hist->pts[i_idx];
+    if (now - then > cfg->rc_buf_sz) break;
+    sum_sz += hist->sz[i_idx];
+  }
+
+  if (now == then) return;
+
+  /* A rate equal to the target lands in bin RATE_BINS / 2. The divisor is
+   * widened to 64 bits so rc_target_bitrate * 1000 cannot wrap. */
+  avg_bitrate = sum_sz * 8 * 1000 / (now - then);
+  idx = (int)(avg_bitrate * (RATE_BINS / 2) /
+              ((int64_t)cfg->rc_target_bitrate * 1000));
+  if (idx < 0) idx = 0;
+  if (idx > RATE_BINS - 1) idx = RATE_BINS - 1;
+  if (hist->bucket[idx].low > avg_bitrate)
+    hist->bucket[idx].low = (int)avg_bitrate;
+  if (hist->bucket[idx].high < avg_bitrate)
+    hist->bucket[idx].high = (int)avg_bitrate;
+  hist->bucket[idx].count++;
+  hist->total++;
+}
+
+/* Merges adjacent buckets in place until at most max_buckets remain; updates
+ * *num_buckets and returns the count of the fullest remaining bucket. */
+static int merge_hist_buckets(struct hist_bucket *bucket,
+                              int max_buckets, int *num_buckets) {
+  int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0;
+  int buckets = *num_buckets;
+  int i;
+
+  /* Find the extrema for this list of buckets */
+  big_bucket = small_bucket = 0;
+  for (i = 0; i < buckets; i++) {
+    if (bucket[i].count < bucket[small_bucket].count)
+      small_bucket = i;
+    if (bucket[i].count > bucket[big_bucket].count)
+      big_bucket = i;
+  }
+
+  /* While too many buckets remain, merge the smallest with a neighbour. */
+  while (buckets > max_buckets) {
+    int last_bucket = buckets - 1;
+
+    /* Pick the neighbour: the only one at an end, else the emptier one. */
+    if (small_bucket == 0)
+      merge_bucket = 1;
+    else if (small_bucket == last_bucket)
+      merge_bucket = last_bucket - 1;
+    else if (bucket[small_bucket - 1].count < bucket[small_bucket + 1].count)
+      merge_bucket = small_bucket - 1;
+    else
+      merge_bucket = small_bucket + 1;
+
+    assert(abs(merge_bucket - small_bucket) <= 1);
+    assert(small_bucket < buckets);
+    assert(big_bucket < buckets);
+    assert(merge_bucket < buckets);
+
+    if (merge_bucket < small_bucket) {
+      bucket[merge_bucket].high = bucket[small_bucket].high;
+      bucket[merge_bucket].count += bucket[small_bucket].count;
+    } else {
+      bucket[small_bucket].high = bucket[merge_bucket].high;
+      bucket[small_bucket].count += bucket[merge_bucket].count;
+      merge_bucket = small_bucket;
+    }
+    /* merge_bucket now indexes the surviving (widened) bucket. */
+    assert(bucket[merge_bucket].low != bucket[merge_bucket].high);
+
+    buckets--;
+
+    /* Close the gap left by the absorbed bucket (the slot right after
+     * merge_bucket) and find the new small and big buckets in the same pass.
+     */
+    big_bucket = small_bucket = 0;
+    for (i = 0; i < buckets; i++) {
+      if (i > merge_bucket)
+        bucket[i] = bucket[i + 1];
+
+      if (bucket[i].count < bucket[small_bucket].count)
+        small_bucket = i;
+      if (bucket[i].count > bucket[big_bucket].count)
+        big_bucket = i;
+    }
+  }
+
+  *num_buckets = buckets;
+  return bucket[big_bucket].count;
+}
+
+/* Prints one bar-chart row per bucket to stderr; bars are scaled by `scale`. */
+static void show_histogram(const struct hist_bucket *bucket,
+                           int buckets, int total, int scale) {
+  const char *pat1, *pat2;
+  int i, digits;
+
+  /* Nothing to print; also keeps bucket[buckets - 1] and "/ scale" valid. */
+  if (buckets <= 0 || scale <= 0) return;
+  /* Decimal digits of the last bucket's upper bound (log10 rejects 0). */
+  digits = bucket[buckets - 1].high > 0
+               ? (int)log10(bucket[buckets - 1].high) + 1 : 1;
+  switch (digits) {
+    case 1:
+    case 2:
+      pat1 = "%4d %2s: ";
+      pat2 = "%4d-%2d: ";
+      break;
+    case 3:
+      pat1 = "%5d %3s: ";
+      pat2 = "%5d-%3d: ";
+      break;
+    case 4:
+      pat1 = "%6d %4s: ";
+      pat2 = "%6d-%4d: ";
+      break;
+    case 5:
+      pat1 = "%7d %5s: ";
+      pat2 = "%7d-%5d: ";
+      break;
+    case 6:
+      pat1 = "%8d %6s: ";
+      pat2 = "%8d-%6d: ";
+      break;
+    case 7:
+      pat1 = "%9d %7s: ";
+      pat2 = "%9d-%7d: ";
+      break;
+    default:
+      pat1 = "%12d %10s: ";
+      pat2 = "%12d-%10d: ";
+      break;
+  }
+  for (i = 0; i < buckets; i++) {
+    int len, j;
+    float pct;
+    pct = (float)(100.0 * bucket[i].count / total);
+    len = HIST_BAR_MAX * bucket[i].count / scale;
+    if (len < 1) len = 1;
+    assert(len <= HIST_BAR_MAX);
+    if (bucket[i].low == bucket[i].high)
+      fprintf(stderr, pat1, bucket[i].low, "");
+    else
+      fprintf(stderr, pat2, bucket[i].low, bucket[i].high);
+    for (j = 0; j < HIST_BAR_MAX; j++)
+      fprintf(stderr, j < len ? "=" : " ");
+    fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct);
+  }
+}
+
+/* Prints a histogram of the 64 quantizer counts, merged to max_buckets rows. */
+void show_q_histogram(const int counts[64], int max_buckets) {
+  struct hist_bucket bucket[64];
+  int buckets = 0, total = 0;
+  int scale, i;
+
+  for (i = 0; i < 64; i++) {
+    if (counts[i]) {
+      bucket[buckets].low = bucket[buckets].high = i;
+      bucket[buckets].count = counts[i];
+      buckets++;
+      total += counts[i];
+    }
+  }
+
+  fprintf(stderr, "\nQuantizer Selection:\n");
+  if (buckets == 0) return;  /* bucket[] is uninitialized: nothing to merge */
+  scale = merge_hist_buckets(bucket, max_buckets, &buckets);
+  show_histogram(bucket, buckets, total, scale);
+}
+
+/* Prints the bitrate histogram; compacts and merges hist->bucket in place. */
+void show_rate_histogram(struct rate_hist *hist,
+                         const vpx_codec_enc_cfg_t *cfg, int max_buckets) {
+  int i, scale, buckets = 0;
+
+  if (hist == NULL || cfg == NULL) return;
+  for (i = 0; i < RATE_BINS; i++) {
+    if (hist->bucket[i].low == INT_MAX) continue;  /* bin never used */
+    hist->bucket[buckets++] = hist->bucket[i];
+  }
+  fprintf(stderr, "\nRate (over %dms window):\n", cfg->rc_buf_sz);
+  if (buckets == 0) return;  /* no samples were binned */
+  scale = merge_hist_buckets(hist->bucket, max_buckets, &buckets);
+  show_histogram(hist->bucket, buckets, hist->total, scale);
+}
diff --git a/libs/libvpx/rate_hist.h b/libs/libvpx/rate_hist.h
new file mode 100644
index 0000000000..00a1676a61
--- /dev/null
+++ b/libs/libvpx/rate_hist.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef RATE_HIST_H_
+#define RATE_HIST_H_
+
+#include "vpx/vpx_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rate_hist;
+
+struct rate_hist *init_rate_histogram(const vpx_codec_enc_cfg_t *cfg,
+                                      const vpx_rational_t *fps);
+
+void destroy_rate_histogram(struct rate_hist *hist);
+
+void update_rate_histogram(struct rate_hist *hist,
+                           const vpx_codec_enc_cfg_t *cfg,
+                           const vpx_codec_cx_pkt_t *pkt);
+
+void show_q_histogram(const int counts[64], int max_buckets);
+
+void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg,
+                         int max_buckets);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // RATE_HIST_H_
diff --git a/libs/libvpx/solution.mk b/libs/libvpx/solution.mk
new file mode 100644
index 0000000000..145adc0dda
--- /dev/null
+++ b/libs/libvpx/solution.mk
@@ -0,0 +1,31 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+# libvpx reverse dependencies: a --dep=<name>:vpx flag per prerequisite, except vpx and gtest
+VPX_NONDEPS=$(addsuffix .$(VCPROJ_SFX),vpx gtest)
+VPX_RDEPS=$(foreach vcp,\
+              $(filter-out $(VPX_NONDEPS),$^), --dep=$(vcp:.$(VCPROJ_SFX)=):vpx)
+# vpx.sln depends on every project file matched by the wildcard; VPX_RDEPS is only passed when vpx is one of them.
+vpx.sln: $(wildcard *.$(VCPROJ_SFX))
+	@echo "    [CREATE] $@"
+	$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
+            $(if $(filter vpx.$(VCPROJ_SFX),$^),$(VPX_RDEPS)) \
+            --dep=test_libvpx:gtest \
+            --ver=$(CONFIG_VS_VERSION)\
+            --out=$@ $^
+vpx.sln.mk: vpx.sln
+	@true
+
+PROJECTS-yes += vpx.sln vpx.sln.mk
+-include vpx.sln.mk
+
+# Always install this file, as it is an unconditional post-build rule.
+INSTALL_MAPS += src/%     $(SRC_PATH_BARE)/%
+INSTALL-SRCS-yes            += $(target).mk
diff --git a/libs/libvpx/test/acm_random.h b/libs/libvpx/test/acm_random.h
new file mode 100644
index 0000000000..ff5c93ea1d
--- /dev/null
+++ b/libs/libvpx/test/acm_random.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_ACM_RANDOM_H_
+#define TEST_ACM_RANDOM_H_
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "vpx/vpx_integer.h"
+
+namespace libvpx_test {
+
+class ACMRandom {
+ public:
+  // Default construction seeds with DeterministicSeed(), a fixed constant.
+  ACMRandom() : random_(DeterministicSeed()) {}
+  explicit ACMRandom(int seed) : random_(seed) {}
+  static int DeterministicSeed(void) { return 0xbaba; }
+
+  // Re-seeds the underlying generator with |seed|.
+  void Reset(int seed) { random_.Reseed(seed); }
+
+  // Shifts one full-range draw right by 15 and keeps the low 16 bits.
+  uint16_t Rand16(void) {
+    const uint32_t draw = FullRangeDraw();
+    return (draw >> 15) & 0xffff;
+  }
+
+  // Shifts one full-range draw right by 23 and keeps the low 8 bits; there
+  // is a bit more entropy in the upper bits of this implementation.
+  uint8_t Rand8(void) {
+    const uint32_t draw = FullRangeDraw();
+    return (draw >> 23) & 0xff;
+  }
+
+  // Meant to give values near 0 or near 255 to exercise saturation: Rand8()
+  // shifted left by 4 (truncated to 8 bits) if below 128, else right by 4.
+  uint8_t Rand8Extremes(void) {
+    const uint8_t base = Rand8();
+    if (base < 128) return base << 4;
+    return base >> 4;
+  }
+
+  // Forwards |range| to the generator and returns what it produces.
+  int PseudoUniform(int range) { return random_.Generate(range); }
+
+  // Call-operator form of PseudoUniform().
+  int operator()(int n) { return PseudoUniform(n); }
+
+ private:
+  // One Generate() call with the generator's kMaxRange as the range.
+  uint32_t FullRangeDraw() {
+    return random_.Generate(testing::internal::Random::kMaxRange);
+  }
+  testing::internal::Random random_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_ACM_RANDOM_H_
diff --git a/libs/libvpx/test/active_map_refresh_test.cc b/libs/libvpx/test/active_map_refresh_test.cc
new file mode 100644
index 0000000000..c94566143b
--- /dev/null
+++ b/libs/libvpx/test/active_map_refresh_test.cc
@@ -0,0 +1,127 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <algorithm>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+// Returns 1 if any pixel of macroblock (mb_r, mb_c) differs between the two
+// images in any of the 3 planes, else 0. The window is clipped to d_w x d_h.
+int CheckMb(const vpx_image_t &current, const vpx_image_t &previous,
+            int mb_r, int mb_c) {
+  for (int plane = 0; plane < 3; plane++) {
+    int row_begin = std::max(16 * mb_r, 0);
+    int col_begin = std::max(16 * mb_c, 0);
+    int row_end = std::min(16 * mb_r + 16, static_cast<int>(current.d_h));
+    int col_end = std::min(16 * mb_c + 16, static_cast<int>(current.d_w));
+    if (plane > 0 && current.x_chroma_shift) {  // halve the chroma columns
+      col_begin >>= 1;
+      col_end = (col_end + 1) >> 1;
+    }
+    if (plane > 0 && current.y_chroma_shift) {  // halve the chroma rows
+      row_begin >>= 1;
+      row_end = (row_end + 1) >> 1;
+    }
+    for (int r = row_begin; r < row_end; ++r) {
+      const int cur_off = current.stride[plane] * r;
+      const int prev_off = previous.stride[plane] * r;
+      for (int c = col_begin; c < col_end; ++c)
+        if (current.planes[plane][cur_off + c] !=
+            previous.planes[plane][prev_off + c])
+          return 1;
+    }
+  }
+  return 0;
+}
+
+// Writes CheckMb() for every macroblock into |map| in row-major order.
+void GenerateMap(int mb_rows, int mb_cols, const vpx_image_t &current,
+                 const vpx_image_t &previous, uint8_t *map) {
+  uint8_t *out = map;
+  for (int row = 0; row < mb_rows; ++row)
+    for (int col = 0; col < mb_cols; ++col)
+      *out++ = CheckMb(current, previous, row, col);
+}
+
+const int kAqModeCyclicRefresh = 3;
+
+class ActiveMapRefreshTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  ActiveMapRefreshTest()
+      : EncoderTest(GET_PARAM(0)), cpu_used_(0), y4m_holder_(NULL) {}
+  virtual ~ActiveMapRefreshTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    cpu_used_ = GET_PARAM(2);
+  }
+
+  // Frame 1: apply cpu_used_ and cyclic-refresh AQ. Frame 2 onwards: install
+  // an active map (one entry per 16x16 block) built by GenerateMap().
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    ::libvpx_test::Y4mVideoSource *y4m_video =
+        static_cast<libvpx_test::Y4mVideoSource *>(video);
+    ASSERT_TRUE(y4m_holder_ != NULL);  // the test body must install it
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP9E_SET_AQ_MODE, kAqModeCyclicRefresh);
+    } else if (video->frame() >= 2 && video->img()) {
+      vpx_image_t *current = video->img();
+      vpx_image_t *previous = y4m_holder_->img();
+      ASSERT_TRUE(previous != NULL);
+      const int mb_width = (static_cast<int>(current->d_w) + 15) / 16;
+      const int mb_height = (static_cast<int>(current->d_h) + 15) / 16;
+      uint8_t *active_map = new uint8_t[mb_width * mb_height];
+      GenerateMap(mb_height, mb_width, *current, *previous, active_map);
+      vpx_active_map_t map = vpx_active_map_t();
+      map.cols = mb_width;
+      map.rows = mb_height;
+      map.active_map = active_map;
+      encoder->Control(VP8E_SET_ACTIVEMAP, &map);
+      delete[] active_map;
+    }
+    if (video->img()) y4m_video->SwapBuffers(y4m_holder_);
+  }
+
+  int cpu_used_;                               // set from GET_PARAM(2)
+  ::libvpx_test::Y4mVideoSource *y4m_holder_;  // second source; see the hook
+};
+
+TEST_P(ActiveMapRefreshTest, Test) {
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_profile = 1;
+  cfg_.rc_target_bitrate = 600;
+  cfg_.rc_resize_allowed = 0;
+  cfg_.rc_min_quantizer = 8;
+  cfg_.rc_max_quantizer = 30;
+  cfg_.g_pass = VPX_RC_ONE_PASS;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.kf_max_dist = 90000;
+  // The same clip is opened twice; the second source becomes y4m_holder_.
+  ::libvpx_test::Y4mVideoSource video("desktop_credits.y4m", 0, 30);
+  ::libvpx_test::Y4mVideoSource video_holder("desktop_credits.y4m", 0, 30);
+  video_holder.Begin();
+  y4m_holder_ = &video_holder;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+VP9_INSTANTIATE_TEST_CASE(ActiveMapRefreshTest,
+                          ::testing::Values(::libvpx_test::kRealTime),
+                          ::testing::Range(5, 6));
+}  // namespace
diff --git a/libs/libvpx/test/active_map_test.cc b/libs/libvpx/test/active_map_test.cc
new file mode 100644
index 0000000000..0221995191
--- /dev/null
+++ b/libs/libvpx/test/active_map_test.cc
@@ -0,0 +1,89 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class ActiveMapTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  static const int kWidth = 208;   // 13 macroblock columns of 16 pixels
+  static const int kHeight = 144;  // 9 macroblock rows of 16 pixels
+
+  ActiveMapTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~ActiveMapTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    cpu_used_ = GET_PARAM(2);
+  }
+  // Frame 1: set cpu-used. Frame 3: fixed 13x9 map. Frame 15: NULL map.
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+    } else if (video->frame() == 3) {
+      vpx_active_map_t map = vpx_active_map_t();
+      uint8_t active_map[9 * 13] = {
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+        0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
+        0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
+        0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
+        0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
+      };
+      map.cols = (kWidth + 15) / 16;
+      map.rows = (kHeight + 15) / 16;
+      ASSERT_EQ(map.cols, 13u);
+      ASSERT_EQ(map.rows, 9u);
+      map.active_map = active_map;
+      encoder->Control(VP8E_SET_ACTIVEMAP, &map);
+    } else if (video->frame() == 15) {
+      vpx_active_map_t map = vpx_active_map_t();
+      map.cols = (kWidth + 15) / 16;
+      map.rows = (kHeight + 15) / 16;
+      map.active_map = NULL;
+      encoder->Control(VP8E_SET_ACTIVEMAP, &map);
+    }
+  }
+
+  int cpu_used_;
+};
+
+TEST_P(ActiveMapTest, Test) {
+  // Validate that this clip, whose width is not a multiple of 64, encodes.
+  cfg_.g_lag_in_frames = 0;
+  cfg_.rc_target_bitrate = 400;
+  cfg_.rc_resize_allowed = 0;
+  cfg_.g_pass = VPX_RC_ONE_PASS;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.kf_max_dist = 90000;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 30,
+                                       1, 0, 20);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+VP9_INSTANTIATE_TEST_CASE(ActiveMapTest,
+                          ::testing::Values(::libvpx_test::kRealTime),
+                          ::testing::Range(0, 6));
+}  // namespace
diff --git a/libs/libvpx/test/altref_test.cc b/libs/libvpx/test/altref_test.cc
new file mode 100644
index 0000000000..af25b72856
--- /dev/null
+++ b/libs/libvpx/test/altref_test.cc
@@ -0,0 +1,69 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+namespace {
+
+// lookahead range: [kLookAheadMin, kLookAheadMax).
+const int kLookAheadMin = 5;
+const int kLookAheadMax = 26;
+
+class AltRefTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+  AltRefTest() : EncoderTest(GET_PARAM(0)), altref_count_(0) {}
+  virtual ~AltRefTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(libvpx_test::kTwoPassGood);
+  }
+  // The invisible-packet count restarts from zero at the start of each pass.
+  virtual void BeginPassHook(unsigned int pass) {
+    altref_count_ = 0;
+  }
+  // On frame 1, turn on automatic alt-ref and set cpu-used to 3.
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_CPUUSED, 3);
+    }
+  }
+  // Counts every packet whose flags include VPX_FRAME_IS_INVISIBLE.
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) ++altref_count_;
+  }
+
+  int altref_count() const { return altref_count_; }
+
+ private:
+  int altref_count_;
+};
+
+TEST_P(AltRefTest, MonotonicTimestamps) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 1000;
+  cfg_.g_lag_in_frames = GET_PARAM(1);
+  // g_lag_in_frames takes each value in [kLookAheadMin, kLookAheadMax).
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 30);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(altref_count(), 1);  // at least one invisible packet was counted
+}
+
+
+VP8_INSTANTIATE_TEST_CASE(AltRefTest,
+                          ::testing::Range(kLookAheadMin, kLookAheadMax));
+}  // namespace
diff --git a/libs/libvpx/test/android/Android.mk b/libs/libvpx/test/android/Android.mk
new file mode 100644
index 0000000000..48872a2b65
--- /dev/null
+++ b/libs/libvpx/test/android/Android.mk
@@ -0,0 +1,56 @@
+# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+#
+# This make file builds the libvpx_test app (see LOCAL_MODULE below) for android.
+# The test app itself runs on the command line through adb shell.
+# The relative paths below look odd because the libvpx make file
+# expects to be made from a parent directory.
+CUR_WD := $(call my-dir)
+BINDINGS_DIR := $(CUR_WD)/../../..
+LOCAL_PATH := $(CUR_WD)/../../..
+
+# libwebm: pulled in through its own Android.mk; LOCAL_PATH is set again afterwards.
+include $(CLEAR_VARS)
+include $(BINDINGS_DIR)/libvpx/third_party/libwebm/Android.mk
+LOCAL_PATH := $(CUR_WD)/../../..
+
+# libvpx: pulled in through build/make/Android.mk with libwebm as a static library.
+include $(CLEAR_VARS)
+LOCAL_STATIC_LIBRARIES := libwebm
+include $(BINDINGS_DIR)/libvpx/build/make/Android.mk
+LOCAL_PATH := $(CUR_WD)/../..
+
+# libgtest: static library compiled from the bundled gtest-all.cc.
+include $(CLEAR_VARS)
+LOCAL_ARM_MODE := arm
+LOCAL_CPP_EXTENSION := .cc
+LOCAL_MODULE := gtest
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/
+LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc
+include $(BUILD_STATIC_LIBRARY)
+
+# libvpx_test: the test executable; vpx is linked shared when ENABLE_SHARED is 1.
+include $(CLEAR_VARS)
+LOCAL_ARM_MODE := arm
+LOCAL_MODULE := libvpx_test
+LOCAL_STATIC_LIBRARIES := gtest libwebm
+
+ifeq ($(ENABLE_SHARED),1)
+  LOCAL_SHARED_LIBRARIES := vpx
+else
+  LOCAL_STATIC_LIBRARIES += vpx
+endif
+
+include $(LOCAL_PATH)/test/test.mk
+LOCAL_C_INCLUDES := $(BINDINGS_DIR)
+FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes)))
+LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC))
+# Some test files depend on *_rtcd.h; ensure those headers are generated first.
+$(eval $(call rtcd_dep_template))
+include $(BUILD_EXECUTABLE)
diff --git a/libs/libvpx/test/android/README b/libs/libvpx/test/android/README
new file mode 100644
index 0000000000..4a1adcf7f4
--- /dev/null
+++ b/libs/libvpx/test/android/README
@@ -0,0 +1,32 @@
+Android.mk will build vpx unittests on android.
+1) Configure libvpx from the parent directory:
+./libvpx/configure --target=armv7-android-gcc --enable-external-build \
+  --enable-postproc --disable-install-srcs --enable-multi-res-encoding \
+  --enable-temporal-denoising --disable-unit-tests --disable-install-docs \
+  --disable-examples --disable-runtime-cpu-detect --sdk-path=$NDK
+
+2) From the parent directory, invoke ndk-build:
+NDK_PROJECT_PATH=. ndk-build APP_BUILD_SCRIPT=./libvpx/test/android/Android.mk \
+  APP_ABI=armeabi-v7a APP_PLATFORM=android-18 APP_OPTIM=release \
+  APP_STL=gnustl_static
+
+Note: Both adb and ndk-build are available prebuilt at:
+  https://chromium.googlesource.com/android_tools
+
+3) Run get_files.py to download the test files:
+python get_files.py -i /path/to/test-data.sha1 -o /path/to/put/files \
+  -u http://downloads.webmproject.org/test_data/libvpx
+
+4) Transfer the files to the device using adb. Ensure you have proper
+permissions for the target:
+
+adb push /path/to/test_files /data/local/tmp
+adb push /path/to/built_libs /data/local/tmp
+
+NOTE: Built_libs defaults to parent_dir/libs/armeabi-v7a
+
+5) Run tests:
+adb shell
+(on device)
+cd /data/local/tmp
+LD_LIBRARY_PATH=. ./libvpx_test
diff --git a/libs/libvpx/test/android/get_files.py b/libs/libvpx/test/android/get_files.py
new file mode 100644
index 0000000000..1c69740d2b
--- /dev/null
+++ b/libs/libvpx/test/android/get_files.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+#
+# This simple script pulls test files from the webm homepage
+# It is intelligent enough to only pull files if
+#   1) File / test_data folder does not exist
+#   2) SHA mismatch
+
+import pycurl
+import csv
+import hashlib
+import re
+import os.path
+import time
+import itertools
+import sys
+import getopt
+
+#globals
+url = ''
+file_list_path = ''
+local_resource_path = ''
+
+# Helper functions:
+# A simple function which returns the sha hash of a file in hex
+def get_file_sha(filename):
+  try:
+    sha_hash = hashlib.sha1()
+    with open(filename, 'rb') as file:
+      buf = file.read(HASH_CHUNK)
+      while len(buf) > 0:
+        sha_hash.update(buf)
+        buf = file.read(HASH_CHUNK)
+      return sha_hash.hexdigest()
+  except IOError:
+    print "Error reading " + filename
+
+# Downloads a file from a url, and then checks the sha against the passed
+# in sha
+def download_and_check_sha(url, filename, sha):
+  path = os.path.join(local_resource_path, filename)
+  fp = open(path, "wb")
+  curl = pycurl.Curl()
+  curl.setopt(pycurl.URL, url + "/" + filename)
+  curl.setopt(pycurl.WRITEDATA, fp)
+  curl.perform()
+  curl.close()
+  fp.close()
+  return get_file_sha(path) == sha
+
+#constants
+ftp_retries = 3
+
+SHA_COL = 0
+NAME_COL = 1
+EXPECTED_COL = 2
+HASH_CHUNK = 65536
+
+# Main script
+try:
+  opts, args = \
+      getopt.getopt(sys.argv[1:], \
+                    "u:i:o:", ["url=", "input_csv=", "output_dir="])
+except:
+  print 'get_files.py -u <url> -i <input_csv> -o <output_dir>'
+  sys.exit(2)
+
+for opt, arg in opts:
+  if opt == '-u':
+    url = arg
+  elif opt in ("-i", "--input_csv"):
+    file_list_path = os.path.join(arg)
+  elif opt in ("-o", "--output_dir"):
+    local_resource_path = os.path.join(arg)
+
+if len(sys.argv) != 7:
+  print "Expects two paths and a url!"
+  exit(1)
+
+if not os.path.isdir(local_resource_path):
+  os.makedirs(local_resource_path)
+
+file_list_csv = open(file_list_path, "rb")
+
+# Our 'csv' file uses multiple spaces as a delimiter, python's
+# csv class only uses single character delimiters, so we convert them below
+file_list_reader = csv.reader((re.sub(' +', ' ', line) \
+    for line in file_list_csv), delimiter = ' ')
+
+file_shas = []
+file_names = []
+
+for row in file_list_reader:
+  if len(row) != EXPECTED_COL:
+      continue
+  file_shas.append(row[SHA_COL])
+  file_names.append(row[NAME_COL])
+
+file_list_csv.close()
+
+# Download files, only if they don't already exist and have correct shas
+for filename, sha in itertools.izip(file_names, file_shas):
+  path = os.path.join(local_resource_path, filename)
+  if os.path.isfile(path) \
+      and get_file_sha(path) == sha:
+    print path + ' exists, skipping'
+    continue
+  for retry in range(0, ftp_retries):
+    print "Downloading " + path
+    if not download_and_check_sha(url, filename, sha):
+      print "Sha does not match, retrying..."
+    else:
+      break
diff --git a/libs/libvpx/test/android/scrape_gtest_log.py b/libs/libvpx/test/android/scrape_gtest_log.py
new file mode 100644
index 0000000000..487845c270
--- /dev/null
+++ b/libs/libvpx/test/android/scrape_gtest_log.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""Standalone script which parses a gtest log for json.
+
+Json is returned as an array.  This script is used by the libvpx
+waterfall to gather json results mixed in with gtest logs.  This is
+dubious software engineering.
+"""
+
+import getopt
+import json
+import os
+import re
+import sys
+
+
+def main():
+  if len(sys.argv) != 3:
+    print "Expects a file to write json to!"
+    exit(1)
+
+  try:
+    opts, _ = \
+        getopt.getopt(sys.argv[1:], \
+                      'o:', ['output-json='])
+  except getopt.GetOptError:
+    print 'scrape_gtest_log.py -o <output_json>'
+    sys.exit(2)
+
+  output_json = ''
+  for opt, arg in opts:
+    if opt in ('-o', '--output-json'):
+      output_json = os.path.join(arg)
+
+  blob = sys.stdin.read()
+  json_string = '[' + ','.join('{' + x + '}' for x in
+                               re.findall(r'{([^}]*.?)}', blob)) + ']'
+  print blob
+
+  output = json.dumps(json.loads(json_string), indent=4, sort_keys=True)
+  print output
+
+  path = os.path.dirname(output_json)
+  if path and not os.path.exists(path):
+    os.makedirs(path)
+
+  outfile = open(output_json, 'w')
+  outfile.write(output)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/libs/libvpx/test/aq_segment_test.cc b/libs/libvpx/test/aq_segment_test.cc
new file mode 100644
index 0000000000..1b9c943562
--- /dev/null
+++ b/libs/libvpx/test/aq_segment_test.cc
@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class AqSegmentTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  AqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~AqSegmentTest() {}
+  // aq_mode_ starts at 0 here; each test assigns its own value before running.
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+    aq_mode_ = 0;
+  }
+  // On frame 1, send cpu-used, the AQ mode and a max intra bitrate pct of 100.
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 100);
+    }
+  }
+
+  int set_cpu_used_;
+  int aq_mode_;
+};
+
+// Validate that this AQ segmentation mode (AQ=1, variance_aq)
+// encodes and decodes without a mismatch.
+TEST_P(AqSegmentTest, TestNoMisMatchAQ1) {
+  cfg_.rc_min_quantizer = 8;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_target_bitrate = 300;
+  // The fixture sends aq_mode_ to the encoder on frame 1.
+  aq_mode_ = 1;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                        30, 1, 0, 100);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// Validate that this AQ segmentation mode (AQ=2, complexity_aq)
+// encodes and decodes without a mismatch.
+TEST_P(AqSegmentTest, TestNoMisMatchAQ2) {
+  cfg_.rc_min_quantizer = 8;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_target_bitrate = 300;
+  // The fixture sends aq_mode_ to the encoder on frame 1.
+  aq_mode_ = 2;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                        30, 1, 0, 100);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// Validate that this AQ segmentation mode (AQ=3, cyclic_refresh_aq)
+// encodes and decodes without a mismatch.
+TEST_P(AqSegmentTest, TestNoMisMatchAQ3) {
+  cfg_.rc_min_quantizer = 8;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_target_bitrate = 300;
+  // The fixture sends aq_mode_ to the encoder on frame 1.
+  aq_mode_ = 3;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                        30, 1, 0, 100);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+VP9_INSTANTIATE_TEST_CASE(AqSegmentTest,
+                          ::testing::Values(::libvpx_test::kRealTime,
+                                            ::libvpx_test::kOnePassGood),
+                          ::testing::Range(3, 9));
+}  // namespace
diff --git a/libs/libvpx/test/avg_test.cc b/libs/libvpx/test/avg_test.cc
new file mode 100644
index 0000000000..44d8dd7db5
--- /dev/null
+++ b/libs/libvpx/test/avg_test.cc
@@ -0,0 +1,411 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx_mem/vpx_mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class AverageTestBase : public ::testing::Test {  // shared-buffer fixture
+ public:
+  AverageTestBase(int width, int height) : width_(width), height_(height) {}
+  // Allocates the aligned source buffer shared by all tests in the case.
+  static void SetUpTestCase() {
+    source_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBlockSize));
+  }
+  // Releases the shared source buffer.
+  static void TearDownTestCase() {
+    vpx_free(source_data_);
+    source_data_ = NULL;
+  }
+  // Runs libvpx_test::ClearSystemState() after each test.
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Handle blocks up to 4 blocks 64x64 with stride up to 128
+  static const int kDataAlignment = 16;
+  static const int kDataBlockSize = 64 * 128;
+  // Rounds the stride up to a multiple of 32 and reseeds the RNG.
+  virtual void SetUp() {
+    source_stride_ = (width_ + 31) & ~31;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  // Reference 8x8 average with rounding: (sum of 64 pixels + 32) >> 6.
+  unsigned int ReferenceAverage8x8(const uint8_t* source, int pitch) {
+    unsigned int average = 0;
+    for (int h = 0; h < 8; ++h)
+      for (int w = 0; w < 8; ++w)
+        average += source[h * pitch + w];
+    return ((average + 32) >> 6);
+  }
+  // Reference 4x4 average with rounding: (sum of 16 pixels + 8) >> 4.
+  unsigned int ReferenceAverage4x4(const uint8_t* source, int pitch) {
+    unsigned int average = 0;
+    for (int h = 0; h < 4; ++h)
+      for (int w = 0; w < 4; ++w)
+        average += source[h * pitch + w];
+    return ((average + 8) >> 4);
+  }
+  // Fills the first width_ * height_ bytes; source_stride_ is not applied.
+  void FillConstant(uint8_t fill_constant) {
+    for (int i = 0; i < width_ * height_; ++i) {
+        source_data_[i] = fill_constant;
+    }
+  }
+  // Same contiguous fill, one rnd_.Rand8() value per byte.
+  void FillRandom() {
+    for (int i = 0; i < width_ * height_; ++i) {
+        source_data_[i] = rnd_.Rand8();
+    }
+  }
+
+  int width_, height_;
+  static uint8_t* source_data_;
+  int source_stride_;
+
+  ACMRandom rnd_;
+};
+typedef unsigned int (*AverageFunction)(const uint8_t* s, int pitch);
+
+typedef std::tr1::tuple<int, int, int, int, AverageFunction> AvgFunc;
+
+class AverageTest
+    : public AverageTestBase,
+      public ::testing::WithParamInterface<AvgFunc> {
+ public:
+  AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  // Compares the function under test against the C reference average for
+  // the block that starts GET_PARAM(2) bytes into the shared buffer.
+  void CheckAverages() {
+    const uint8_t *const block = source_data_ + GET_PARAM(2);
+    const int block_size = GET_PARAM(3);
+    const AverageFunction avg_func = GET_PARAM(4);
+
+    unsigned int expected = 0;
+    if (block_size == 8) {
+      expected = ReferenceAverage8x8(block, source_stride_);
+    } else if (block_size == 4) {
+      expected = ReferenceAverage4x4(block, source_stride_);
+    }
+    ASM_REGISTER_STATE_CHECK(avg_func(block, source_stride_));
+    const unsigned int actual = avg_func(block, source_stride_);
+    EXPECT_EQ(expected, actual);
+  }
+};
+
+typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
+                              const int ref_stride, const int height);
+
+typedef std::tr1::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+
+class IntProRowTest
+    : public AverageTestBase,
+      public ::testing::WithParamInterface<IntProRowParam> {
+ public:
+  IntProRowTest()
+      : AverageTestBase(16, GET_PARAM(0)), asm_func_(GET_PARAM(1)),
+        c_func_(GET_PARAM(2)), hbuf_asm_(NULL), hbuf_c_(NULL) {}
+
+ protected:
+  // Chains to the base fixture, then allocates both 16-entry outputs.
+  virtual void SetUp() {
+    AverageTestBase::SetUp();
+    hbuf_asm_ = reinterpret_cast<int16_t*>(
+        vpx_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+    hbuf_c_ = reinterpret_cast<int16_t*>(
+        vpx_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+  }
+
+  virtual void TearDown() {
+    vpx_free(hbuf_c_);
+    hbuf_c_ = NULL;
+    vpx_free(hbuf_asm_);
+    hbuf_asm_ = NULL;
+    AverageTestBase::TearDown();  // clears system state
+  }
+
+  // Both implementations must produce byte-identical 16-entry outputs.
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
+    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+        << "Output mismatch";
+  }
+
+ private:
+  IntProRowFunc asm_func_;
+  IntProRowFunc c_func_;
+  int16_t *hbuf_asm_;
+  int16_t *hbuf_c_;
+};
+
+typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+
+typedef std::tr1::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+
+class IntProColTest
+    : public AverageTestBase,
+      public ::testing::WithParamInterface<IntProColParam> {
+ public:
+  IntProColTest()
+      : AverageTestBase(GET_PARAM(0), 1), asm_func_(GET_PARAM(1)),
+        c_func_(GET_PARAM(2)), sum_asm_(0), sum_c_(0) {}
+
+ protected:
+  // The optimized and C column sums over width_ pixels must agree.
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
+    ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
+    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+  }
+
+ private:
+  IntProColFunc asm_func_;
+  IntProColFunc c_func_;
+  int16_t sum_asm_;
+  int16_t sum_c_;
+};
+
+typedef int (*SatdFunc)(const int16_t *coeffs, int length);
+typedef std::tr1::tuple<int, SatdFunc> SatdTestParam;
+
+class SatdTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<SatdTestParam> {
+ protected:
+  virtual void SetUp() {
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    satd_func_ = GET_PARAM(1);
+    satd_size_ = GET_PARAM(0);
+    src_ = reinterpret_cast<int16_t*>(
+        vpx_memalign(16, satd_size_ * sizeof(*src_)));
+    ASSERT_TRUE(src_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+    vpx_free(src_);
+  }
+  // Sets every coefficient to |val|.
+  void FillConstant(const int16_t val) {
+    for (int16_t *p = src_; p < src_ + satd_size_; ++p) *p = val;
+  }
+  // Sets every coefficient to a fresh rnd_.Rand16() value.
+  void FillRandom() {
+    for (int16_t *p = src_; p < src_ + satd_size_; ++p) *p = rnd_.Rand16();
+  }
+  // Runs the function under test once and expects |expected| back.
+  void Check(const int expected) {
+    int actual;
+    ASM_REGISTER_STATE_CHECK(actual = satd_func_(src_, satd_size_));
+    EXPECT_EQ(expected, actual);
+  }
+
+  int satd_size_;
+
+ private:
+  int16_t *src_;
+  SatdFunc satd_func_;
+  ACMRandom rnd_;
+};
+
+uint8_t* AverageTestBase::source_data_ = NULL;
+
+TEST_P(AverageTest, MinValue) {
+  FillConstant(0);
+  CheckAverages();
+}
+
+TEST_P(AverageTest, MaxValue) {
+  FillConstant(255);
+  CheckAverages();
+}
+
+TEST_P(AverageTest, Random) {
+  // Repeat the comparison on 1000 freshly randomized source buffers; the
+  // RNG is reseeded with ACMRandom::DeterministicSeed() in SetUp().
+  for (int i = 0; i < 1000; i++) {
+    FillRandom();
+    CheckAverages();
+  }
+}
+
+TEST_P(IntProRowTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+
+TEST_P(SatdTest, MinValue) {
+  const int kMin = -32640;
+  const int expected = -kMin * satd_size_;
+  FillConstant(kMin);
+  Check(expected);
+}
+
+TEST_P(SatdTest, MaxValue) {
+  const int kMax = 32640;
+  const int expected = kMax * satd_size_;
+  FillConstant(kMax);
+  Check(expected);
+}
+
+TEST_P(SatdTest, Random) {
+  int expected;
+  switch (satd_size_) {
+    case 16: expected = 205298; break;
+    case 64: expected = 1113950; break;
+    case 256: expected = 4268415; break;
+    case 1024: expected = 16954082; break;
+    default:
+      FAIL() << "Invalid satd size (" << satd_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    C, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c),
+        make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    C, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vpx_satd_c),
+        make_tuple(64, &vpx_satd_c),
+        make_tuple(256, &vpx_satd_c),
+        make_tuple(1024, &vpx_satd_c)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 0, 8, &vpx_avg_8x8_sse2),
+        make_tuple(16, 16, 5, 8, &vpx_avg_8x8_sse2),
+        make_tuple(32, 32, 15, 8, &vpx_avg_8x8_sse2),
+        make_tuple(16, 16, 0, 4, &vpx_avg_4x4_sse2),
+        make_tuple(16, 16, 5, 4, &vpx_avg_4x4_sse2),
+        make_tuple(32, 32, 15, 4, &vpx_avg_4x4_sse2)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, IntProRowTest, ::testing::Values(
+        make_tuple(16, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
+        make_tuple(32, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
+        make_tuple(64, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, IntProColTest, ::testing::Values(
+        make_tuple(16, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
+        make_tuple(32, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
+        make_tuple(64, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vpx_satd_sse2),
+        make_tuple(64, &vpx_satd_sse2),
+        make_tuple(256, &vpx_satd_sse2),
+        make_tuple(1024, &vpx_satd_sse2)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 0, 8, &vpx_avg_8x8_neon),
+        make_tuple(16, 16, 5, 8, &vpx_avg_8x8_neon),
+        make_tuple(32, 32, 15, 8, &vpx_avg_8x8_neon),
+        make_tuple(16, 16, 0, 4, &vpx_avg_4x4_neon),
+        make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon),
+        make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, IntProRowTest, ::testing::Values(
+        make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+        make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+        make_tuple(64, &vpx_int_pro_row_neon, &vpx_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, IntProColTest, ::testing::Values(
+        make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+        make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+        make_tuple(64, &vpx_int_pro_col_neon, &vpx_int_pro_col_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vpx_satd_neon),
+        make_tuple(64, &vpx_satd_neon),
+        make_tuple(256, &vpx_satd_neon),
+        make_tuple(1024, &vpx_satd_neon)));
+#endif
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+    MSA, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 0, 8, &vpx_avg_8x8_msa),
+        make_tuple(16, 16, 5, 8, &vpx_avg_8x8_msa),
+        make_tuple(32, 32, 15, 8, &vpx_avg_8x8_msa),
+        make_tuple(16, 16, 0, 4, &vpx_avg_4x4_msa),
+        make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa),
+        make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa)));
+#endif
+
+}  // namespace
diff --git a/libs/libvpx/test/blockiness_test.cc b/libs/libvpx/test/blockiness_test.cc
new file mode 100644
index 0000000000..0c60baaa38
--- /dev/null
+++ b/libs/libvpx/test/blockiness_test.cc
@@ -0,0 +1,229 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#if CONFIG_VP9_ENCODER
+#include "./vp9_rtcd.h"
+#endif
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+
+extern "C"
+double vp9_get_blockiness(const unsigned char *img1, int img1_pitch,
+                          const unsigned char *img2, int img2_pitch,
+                          int width, int height);
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class BlockinessTestBase : public ::testing::Test {
+ public:
+  BlockinessTestBase(int width, int height) : width_(width), height_(height) {}
+  // Allocates the source and reference buffers shared by the test case.
+  static void SetUpTestCase() {
+    source_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    reference_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+  }
+  // Frees both shared buffers.
+  static void TearDownTestCase() {
+    vpx_free(source_data_);
+    source_data_ = NULL;
+    vpx_free(reference_data_);
+    reference_data_ = NULL;
+  }
+  // Runs libvpx_test::ClearSystemState() after each test.
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Handle frames up to 640x480
+  static const int kDataAlignment = 16;
+  static const int kDataBufferSize = 640*480;
+  // Source stride: width rounded up to 32; reference stride: 2 * width.
+  virtual void SetUp() {
+    source_stride_ = (width_ + 31) & ~31;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+  // Writes |fill_constant| into a width x height region of |data|.
+  void FillConstant(uint8_t *data, int stride, uint8_t fill_constant,
+                    int width, int height) {
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        data[h * stride + w] = fill_constant;
+      }
+    }
+  }
+  // Same constant fill, over the whole width_ x height_ frame.
+  void FillConstant(uint8_t *data, int stride, uint8_t fill_constant) {
+    FillConstant(data, stride, fill_constant, width_, height_);
+  }
+  // Writes rnd_.Rand8() values into a width x height region of |data|.
+  void FillRandom(uint8_t *data, int stride, int width, int height) {
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        data[h * stride + w] = rnd_.Rand8();
+      }
+    }
+  }
+  // Same random fill, over the whole width_ x height_ frame.
+  void FillRandom(uint8_t *data, int stride) {
+    FillRandom(data, stride, width_, height_);
+  }
+  // Random-fills 4x4 tiles; NOTE(review): every pixel gets its own value.
+  void FillRandomBlocky(uint8_t *data, int stride) {
+    for (int h = 0; h < height_; h += 4) {
+      for (int w = 0; w < width_; w += 4) {
+        FillRandom(data + h * stride + w, stride, 4, 4);
+      }
+    }
+  }
+  // Alternates 4x4 tiles of 0 and 255 across the frame.
+  void FillCheckerboard(uint8_t *data, int stride) {
+    for (int h = 0; h < height_; h += 4) {
+      for (int w = 0; w < width_; w += 4) {
+        if (((h/4) ^ (w/4)) & 1)
+          FillConstant(data + h * stride + w, stride, 255, 4, 4);
+        else
+          FillConstant(data + h * stride + w, stride, 0, 4, 4);
+      }
+    }
+  }
+  // In-place running-sum blur of |taps| taps: rows first, then columns.
+  void Blur(uint8_t *data, int stride, int taps) {
+    int sum = 0;  // NOTE(review): never reset between rows or columns.
+    int half_taps = taps / 2;
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < taps; ++w) {
+        sum += data[w + h * stride];
+      }
+      for (int w = taps; w < width_; ++w) {
+        sum += data[w + h * stride] - data[w - taps + h * stride];
+        data[w - half_taps + h * stride] = (sum + half_taps) / taps;
+      }
+    }
+    for (int w = 0; w < width_; ++w) {
+      for (int h = 0; h < taps; ++h) {
+        sum += data[h + w * stride];  // NOTE(review): transposed index?
+      }
+      for (int h = taps; h < height_; ++h) {
+        sum += data[w + h * stride] - data[(h - taps) * stride + w];
+        data[(h - half_taps) * stride + w] = (sum + half_taps) / taps;
+      }
+    }
+  }
+  int width_, height_;
+  static uint8_t* source_data_;
+  int source_stride_;
+  static uint8_t* reference_data_;
+  int reference_stride_;
+
+  ACMRandom rnd_;
+};
+
+#if CONFIG_VP9_ENCODER
+typedef std::tr1::tuple<int, int> BlockinessParam;
+class BlockinessVP9Test
+    : public BlockinessTestBase,
+      public ::testing::WithParamInterface<BlockinessParam> {
+ public:
+  BlockinessVP9Test() : BlockinessTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  int CheckBlockiness() {  // NOTE: the double score is truncated to int.
+    return vp9_get_blockiness(source_data_, source_stride_,
+                              reference_data_, reference_stride_,
+                              width_, height_);
+  }
+};
+#endif  // CONFIG_VP9_ENCODER
+
+uint8_t* BlockinessTestBase::source_data_ = NULL;
+uint8_t* BlockinessTestBase::reference_data_ = NULL;
+
+#if CONFIG_VP9_ENCODER
+TEST_P(BlockinessVP9Test, SourceBlockierThanReference) {
+  // Source is blockier than reference.
+  FillRandomBlocky(source_data_, source_stride_);
+  FillConstant(reference_data_, reference_stride_, 128);
+  int super_blocky = CheckBlockiness();
+
+  EXPECT_EQ(0, super_blocky) << "Blocky source should produce 0 blockiness.";
+}
+
+TEST_P(BlockinessVP9Test, ReferenceBlockierThanSource) {
+  // Reference is blockier than source: flat source, random-tiled reference.
+  FillConstant(source_data_, source_stride_, 128);
+  FillRandomBlocky(reference_data_, reference_stride_);
+  int super_blocky = CheckBlockiness();
+
+  EXPECT_GT(super_blocky, 0.0)
+      << "Blocky reference should score high for blockiness.";
+}
+
+TEST_P(BlockinessVP9Test, BlurringDecreasesBlockiness) {
+  // Reference is blockier than source; blurring it must lower the score.
+  FillConstant(source_data_, source_stride_, 128);
+  FillRandomBlocky(reference_data_, reference_stride_);
+  int super_blocky = CheckBlockiness();
+
+  Blur(reference_data_, reference_stride_, 4);
+  int less_blocky = CheckBlockiness();
+
+  EXPECT_GT(super_blocky, less_blocky)
+      << "A straight blur should decrease blockiness.";
+}
+
+TEST_P(BlockinessVP9Test, WorstCaseBlockiness) {
+  // Flat source against a 0/255 checkerboard reference, then blurred.
+  FillConstant(source_data_, source_stride_, 128);
+  FillCheckerboard(reference_data_, reference_stride_);
+
+  int super_blocky = CheckBlockiness();
+
+  Blur(reference_data_, reference_stride_, 4);
+  int less_blocky = CheckBlockiness();
+
+  EXPECT_GT(super_blocky, less_blocky)
+      << "A straight blur should decrease blockiness.";
+}
+#endif  // CONFIG_VP9_ENCODER
+
+
+using std::tr1::make_tuple;
+
+//------------------------------------------------------------------------------
+// C functions
+
+#if CONFIG_VP9_ENCODER
+const BlockinessParam c_vp9_tests[] = {
+  make_tuple(320, 240),
+  make_tuple(318, 242),
+  make_tuple(318, 238),
+};
+INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests));
+#endif
+
+}  // namespace
diff --git a/libs/libvpx/test/borders_test.cc b/libs/libvpx/test/borders_test.cc
new file mode 100644
index 0000000000..6592375f80
--- /dev/null
+++ b/libs/libvpx/test/borders_test.cc
@@ -0,0 +1,86 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class BordersTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  BordersTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~BordersTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+  // Applies the encoder controls only when the source reports frame 1.
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() != 1) return;
+    encoder->Control(VP8E_SET_CPUUSED, 1);
+    encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+    encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+    encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+    encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+  }
+
+  // Inspects the key-frame flag; no action is taken on it.
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
+    }
+  }
+};
+
+TEST_P(BordersTest, TestEncodeHighBitrate) {
+  // Validate that this non multiple of 64 wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q.  This pushes
+  // the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.rc_max_quantizer = 10;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+TEST_P(BordersTest, TestLowBitrate) {
+  // A very high minimum quantizer at a low target bitrate should push the
+  // encoder toward many small partitions; the encode/decode loop must
+  // still finish without a fatal failure (e.g. a mismatch).
+
+  cfg_.rc_min_quantizer = 40;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.g_lag_in_frames = 25;
+
+  ::libvpx_test::I420VideoSource clip("hantro_odd.yuv", 208, 144,
+                                      30, 1, 0, 40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&clip));
+}
+
+VP9_INSTANTIATE_TEST_CASE(BordersTest, ::testing::Values(
+    ::libvpx_test::kTwoPassGood));
+
+VP10_INSTANTIATE_TEST_CASE(BordersTest, ::testing::Values(
+    ::libvpx_test::kTwoPassGood));
+}  // namespace
diff --git a/libs/libvpx/test/byte_alignment_test.cc b/libs/libvpx/test/byte_alignment_test.cc
new file mode 100644
index 0000000000..3a808b0467
--- /dev/null
+++ b/libs/libvpx/test/byte_alignment_test.cc
@@ -0,0 +1,189 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+
+namespace {
+
+#if CONFIG_WEBM_IO
+
+const int kLegacyByteAlignment = 0;
+const int kLegacyYPlaneByteAlignment = 32;
+const int kNumPlanesToCheck = 3;
+const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";
+const char kVP9Md5File[] = "vp90-2-02-size-lf-1920x1080.webm.md5";
+
+struct ByteAlignmentTestParam {
+  int byte_alignment;
+  vpx_codec_err_t expected_value;
+  bool decode_remaining;
+};
+
+const ByteAlignmentTestParam kBaTestParams[] = {
+  {kLegacyByteAlignment, VPX_CODEC_OK, true},
+  {32, VPX_CODEC_OK, true},
+  {64, VPX_CODEC_OK, true},
+  {128, VPX_CODEC_OK, true},
+  {256, VPX_CODEC_OK, true},
+  {512, VPX_CODEC_OK, true},
+  {1024, VPX_CODEC_OK, true},
+  {1, VPX_CODEC_INVALID_PARAM, false},
+  {-2, VPX_CODEC_INVALID_PARAM, false},
+  {4, VPX_CODEC_INVALID_PARAM, false},
+  {16, VPX_CODEC_INVALID_PARAM, false},
+  {255, VPX_CODEC_INVALID_PARAM, false},
+  {2048, VPX_CODEC_INVALID_PARAM, false},
+};
+
+// Class for testing byte alignment of reference buffers.
+class ByteAlignmentTest
+    : public ::testing::TestWithParam<ByteAlignmentTestParam> {
+ protected:
+  ByteAlignmentTest()
+      : video_(NULL),
+        decoder_(NULL),
+        md5_file_(NULL) {}
+  // Opens the test clip, creates a VP9 decoder, and opens the MD5 list.
+  virtual void SetUp() {
+    video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
+    ASSERT_TRUE(video_ != NULL);
+    video_->Init();
+    video_->Begin();
+
+    const vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
+    ASSERT_TRUE(decoder_ != NULL);
+
+    OpenMd5File(kVP9Md5File);
+  }
+  // Closes the MD5 file (if open) and deletes the decoder and the source.
+  virtual void TearDown() {
+    if (md5_file_ != NULL)
+      fclose(md5_file_);
+
+    delete decoder_;
+    delete video_;
+  }
+  // Issues VP9_SET_BYTE_ALIGNMENT, passing |expected_value| to Control().
+  void SetByteAlignment(int byte_alignment, vpx_codec_err_t expected_value) {
+    decoder_->Control(VP9_SET_BYTE_ALIGNMENT, byte_alignment, expected_value);
+  }
+  // Decodes the current frame, checks its output, and advances on success.
+  vpx_codec_err_t DecodeOneFrame(int byte_alignment_to_check) {
+    const vpx_codec_err_t res =
+        decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+    CheckDecodedFrames(byte_alignment_to_check);
+    if (res == VPX_CODEC_OK)
+      video_->Next();
+    return res;
+  }
+  // Decodes and checks frames until the source ends or decoding fails.
+  vpx_codec_err_t DecodeRemainingFrames(int byte_alignment_to_check) {
+    for (; video_->cxdata() != NULL; video_->Next()) {
+      const vpx_codec_err_t res =
+          decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+      if (res != VPX_CODEC_OK)
+        return res;
+      CheckDecodedFrames(byte_alignment_to_check);
+    }
+    return VPX_CODEC_OK;
+  }
+
+ private:
+  // Check if |data| is aligned to |byte_alignment_to_check|.
+  // |byte_alignment_to_check| must be a power of 2.
+  void CheckByteAlignment(const uint8_t *data, int byte_alignment_to_check) {
+    ASSERT_EQ(0u, reinterpret_cast<size_t>(data) % byte_alignment_to_check);
+  }
+
+  // Iterate through the planes of the decoded frames and check for
+  // alignment based off |byte_alignment_to_check|.
+  void CheckDecodedFrames(int byte_alignment_to_check) {
+    libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
+    const vpx_image_t *img;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next()) != NULL) {
+      if (byte_alignment_to_check == kLegacyByteAlignment) {
+        CheckByteAlignment(img->planes[0], kLegacyYPlaneByteAlignment);
+      } else {
+        for (int i = 0; i < kNumPlanesToCheck; ++i) {
+          CheckByteAlignment(img->planes[i], byte_alignment_to_check);
+        }
+      }
+      CheckMd5(*img);
+    }
+  }
+
+  // TODO(fgalligan): Move the MD5 testing code into another class.
+  void OpenMd5File(const std::string &md5_file_name_) {
+    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
+    ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: "
+        << md5_file_name_;
+  }
+  // Compares |img|'s MD5 with the next entry read from the MD5 file.
+  void CheckMd5(const vpx_image_t &img) {
+    ASSERT_TRUE(md5_file_ != NULL);
+    char expected_md5[33];
+    char junk[128];
+
+    // Read the expected md5; field widths keep fscanf within the buffers.
+    const int res = fscanf(md5_file_, "%32s  %127s", expected_md5, junk);
+    ASSERT_NE(EOF, res) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    ::libvpx_test::MD5 md5_res;
+    md5_res.Add(&img);
+    const char *const actual_md5 = md5_res.Get();
+
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5) << "MD5 checksums don't match";
+  }
+
+  libvpx_test::WebMVideoSource *video_;
+  libvpx_test::VP9Decoder *decoder_;
+  FILE *md5_file_;
+};
+
+TEST_F(ByteAlignmentTest, SwitchByteAlignment) {
+  // Switch alignment before each early frame, then finish with entry 0.
+  const int kAlignments[] = { 0, 32, 64, 128, 256, 512, 1024,
+                              0, 1024, 32, 512, 64, 256, 128 };
+  const int kCount = sizeof(kAlignments) / sizeof(kAlignments[0]);
+  for (int idx = 0; idx < kCount; ++idx) {
+    SetByteAlignment(kAlignments[idx], VPX_CODEC_OK);
+    ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame(kAlignments[idx]));
+  }
+  SetByteAlignment(kAlignments[0], VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames(kAlignments[0]));
+}
+
+TEST_P(ByteAlignmentTest, TestAlignment) {
+  const ByteAlignmentTestParam param = GetParam();
+  SetByteAlignment(param.byte_alignment, param.expected_value);
+  if (!param.decode_remaining) return;
+  ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames(param.byte_alignment));
+}
+
+INSTANTIATE_TEST_CASE_P(Alignments, ByteAlignmentTest,
+                        ::testing::ValuesIn(kBaTestParams));
+
+#endif  // CONFIG_WEBM_IO
+
+}  // namespace
diff --git a/libs/libvpx/test/clear_system_state.h b/libs/libvpx/test/clear_system_state.h
new file mode 100644
index 0000000000..5e76797443
--- /dev/null
+++ b/libs/libvpx/test/clear_system_state.h
@@ -0,0 +1,29 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_CLEAR_SYSTEM_STATE_H_
+#define TEST_CLEAR_SYSTEM_STATE_H_
+
+#include "./vpx_config.h"
+#if ARCH_X86 || ARCH_X86_64
+# include "vpx_ports/x86.h"
+#endif
+
+namespace libvpx_test {
+
+// Reset system to a known state. This function should be used for all non-API
+// test cases.
+inline void ClearSystemState() {
+#if ARCH_X86 || ARCH_X86_64
+  vpx_reset_mmx_state();  // only compiled for x86 / x86-64 builds
+#endif  // ARCH_X86 || ARCH_X86_64
+}
+
+}  // namespace libvpx_test
+#endif  // TEST_CLEAR_SYSTEM_STATE_H_
diff --git a/libs/libvpx/test/codec_factory.h b/libs/libvpx/test/codec_factory.h
new file mode 100644
index 0000000000..09c9cf9842
--- /dev/null
+++ b/libs/libvpx/test/codec_factory.h
@@ -0,0 +1,348 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_CODEC_FACTORY_H_
+#define TEST_CODEC_FACTORY_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_decoder.h"
+#include "vpx/vpx_encoder.h"
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+#include "vpx/vp8cx.h"
+#endif
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
+#include "vpx/vp8dx.h"
+#endif
+
+#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+namespace libvpx_test {
+
+const int kCodecFactoryParam = 0;
+
+class CodecFactory {  // Abstract factory for per-codec test drivers.
+ public:
+  CodecFactory() {}
+
+  virtual ~CodecFactory() {}
+  // Creates a decoder for cfg/deadline without init flags.
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 unsigned long deadline) const = 0;
+  // Creates a decoder for cfg/deadline with the given init flags.
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline)  // NOLINT(runtime/int)
+                                 const = 0;
+  // Creates an encoder; stats is handed through to the Encoder constructor.
+  virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
+                                 unsigned long deadline,
+                                 const unsigned long init_flags,
+                                 TwopassStatsStore *stats) const = 0;
+  // Writes the codec's default encoder configuration for usage into *cfg.
+  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                               int usage) const = 0;
+};
+
+/* CodecTestWith<n>Params fixtures: each one is a ::testing::TestWithParam over
+ * a tuple whose first element is the CodecFactory pointer, followed by the
+ * test's own one, two or three parameters, so tests need not declare it.
+ */
+template <typename T1>
+class CodecTestWithParam
+    : public ::testing::TestWithParam<
+          std::tr1::tuple<const libvpx_test::CodecFactory *, T1> > {};
+
+template <typename T1, typename T2>
+class CodecTestWith2Params
+    : public ::testing::TestWithParam<
+          std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2> > {};
+
+template <typename T1, typename T2, typename T3>
+class CodecTestWith3Params
+    : public ::testing::TestWithParam<
+          std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2, T3> > {};
+
+/*
+ * VP8 Codec Definitions
+ */
+#if CONFIG_VP8
+class VP8Decoder : public Decoder {  // Decoder driver bound to the VP8 iface.
+ public:
+  VP8Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
+      : Decoder(cfg, deadline) {}
+
+  VP8Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+             unsigned long deadline)  // NOLINT
+      : Decoder(cfg, flag, deadline) {}
+
+ protected:
+  virtual vpx_codec_iface_t* CodecInterface() const {  // Iface or NULL.
+#if CONFIG_VP8_DECODER
+    return &vpx_codec_vp8_dx_algo;
+#else
+    return NULL;  // CONFIG_VP8_DECODER is off.
+#endif
+  }
+};
+
+class VP8Encoder : public Encoder {  // Encoder driver bound to the VP8 iface.
+ public:
+  VP8Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+             const unsigned long init_flags, TwopassStatsStore *stats)
+      : Encoder(cfg, deadline, init_flags, stats) {}
+
+ protected:
+  virtual vpx_codec_iface_t* CodecInterface() const {  // Iface or NULL.
+#if CONFIG_VP8_ENCODER
+    return &vpx_codec_vp8_cx_algo;
+#else
+    return NULL;  // CONFIG_VP8_ENCODER is off.
+#endif
+  }
+};
+
+class VP8CodecFactory : public CodecFactory {
+ public:
+  VP8CodecFactory() : CodecFactory() {}
+  // Flag-less overload: forwards to the three-argument form with flags = 0.
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 unsigned long deadline) const {
+    return CreateDecoder(cfg, 0, deadline);
+  }
+  // Returns a new VP8Decoder, or NULL when CONFIG_VP8_DECODER is off.
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline) const {  // NOLINT
+#if CONFIG_VP8_DECODER
+    return new VP8Decoder(cfg, flags, deadline);
+#else
+    return NULL;
+#endif
+  }
+  // Returns a new VP8Encoder, or NULL when CONFIG_VP8_ENCODER is off.
+  virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
+                                 unsigned long deadline,
+                                 const unsigned long init_flags,
+                                 TwopassStatsStore *stats) const {
+#if CONFIG_VP8_ENCODER
+    return new VP8Encoder(cfg, deadline, init_flags, stats);
+#else
+    return NULL;
+#endif
+  }
+  // Loads VP8 encoder defaults into *cfg; VPX_CODEC_INCAPABLE without encoder.
+  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                               int usage) const {
+#if CONFIG_VP8_ENCODER
+    return vpx_codec_enc_config_default(&vpx_codec_vp8_cx_algo, cfg, usage);
+#else
+    return VPX_CODEC_INCAPABLE;
+#endif
+  }
+};
+
+const libvpx_test::VP8CodecFactory kVP8;
+
+#define VP8_INSTANTIATE_TEST_CASE(test, ...)\
+  INSTANTIATE_TEST_CASE_P(VP8, test, \
+      ::testing::Combine( \
+          ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
+              &libvpx_test::kVP8)), \
+          __VA_ARGS__))
+#else
+#define VP8_INSTANTIATE_TEST_CASE(test, ...)
+#endif  // CONFIG_VP8
+
+
+/*
+ * VP9 Codec Definitions
+ */
+#if CONFIG_VP9
+class VP9Decoder : public Decoder {  // Decoder driver bound to the VP9 iface.
+ public:
+  VP9Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
+      : Decoder(cfg, deadline) {}
+
+  VP9Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+             unsigned long deadline)  // NOLINT
+      : Decoder(cfg, flag, deadline) {}
+
+ protected:
+  virtual vpx_codec_iface_t* CodecInterface() const {  // Iface or NULL.
+#if CONFIG_VP9_DECODER
+    return &vpx_codec_vp9_dx_algo;
+#else
+    return NULL;  // CONFIG_VP9_DECODER is off.
+#endif
+  }
+};
+
+class VP9Encoder : public Encoder {  // Encoder driver bound to the VP9 iface.
+ public:
+  VP9Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+             const unsigned long init_flags, TwopassStatsStore *stats)
+      : Encoder(cfg, deadline, init_flags, stats) {}
+
+ protected:
+  virtual vpx_codec_iface_t* CodecInterface() const {  // Iface or NULL.
+#if CONFIG_VP9_ENCODER
+    return &vpx_codec_vp9_cx_algo;
+#else
+    return NULL;  // CONFIG_VP9_ENCODER is off.
+#endif
+  }
+};
+
+class VP9CodecFactory : public CodecFactory {
+ public:
+  VP9CodecFactory() : CodecFactory() {}
+  // Flag-less overload: forwards to the three-argument form with flags = 0.
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 unsigned long deadline) const {
+    return CreateDecoder(cfg, 0, deadline);
+  }
+  // Returns a new VP9Decoder, or NULL when CONFIG_VP9_DECODER is off.
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline) const {  // NOLINT
+#if CONFIG_VP9_DECODER
+    return new VP9Decoder(cfg, flags, deadline);
+#else
+    return NULL;
+#endif
+  }
+  // Returns a new VP9Encoder, or NULL when CONFIG_VP9_ENCODER is off.
+  virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
+                                 unsigned long deadline,
+                                 const unsigned long init_flags,
+                                 TwopassStatsStore *stats) const {
+#if CONFIG_VP9_ENCODER
+    return new VP9Encoder(cfg, deadline, init_flags, stats);
+#else
+    return NULL;
+#endif
+  }
+  // Loads encoder defaults into *cfg; VPX_CODEC_INCAPABLE without any encoder.
+  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                               int usage) const {
+#if CONFIG_VP9_ENCODER
+    return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage);
+#elif CONFIG_VP10_ENCODER  // NOTE(review): VP10 defaults from the VP9 factory,
+    return vpx_codec_enc_config_default(&vpx_codec_vp10_cx_algo, cfg, usage);
+#else  // while CreateEncoder() above returns NULL here - confirm intended.
+    return VPX_CODEC_INCAPABLE;
+#endif
+  }
+};
+
+const libvpx_test::VP9CodecFactory kVP9;
+
+#define VP9_INSTANTIATE_TEST_CASE(test, ...)\
+  INSTANTIATE_TEST_CASE_P(VP9, test, \
+      ::testing::Combine( \
+          ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
+               &libvpx_test::kVP9)), \
+          __VA_ARGS__))
+#else
+#define VP9_INSTANTIATE_TEST_CASE(test, ...)
+#endif  // CONFIG_VP9
+
+/*
+ * VP10 Codec Definitions
+ */
+#if CONFIG_VP10
+class VP10Decoder : public Decoder {  // Decoder driver bound to the VP10 iface.
+ public:
+  VP10Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
+      : Decoder(cfg, deadline) {}
+
+  VP10Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+              unsigned long deadline)  // NOLINT
+      : Decoder(cfg, flag, deadline) {}
+
+ protected:
+  virtual vpx_codec_iface_t* CodecInterface() const {  // Iface or NULL.
+#if CONFIG_VP10_DECODER
+    return &vpx_codec_vp10_dx_algo;
+#else
+    return NULL;  // CONFIG_VP10_DECODER is off.
+#endif
+  }
+};
+
+class VP10Encoder : public Encoder {  // Encoder driver bound to the VP10 iface.
+ public:
+  VP10Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+              const unsigned long init_flags, TwopassStatsStore *stats)
+      : Encoder(cfg, deadline, init_flags, stats) {}
+
+ protected:
+  virtual vpx_codec_iface_t* CodecInterface() const {  // Iface or NULL.
+#if CONFIG_VP10_ENCODER
+    return &vpx_codec_vp10_cx_algo;
+#else
+    return NULL;  // CONFIG_VP10_ENCODER is off.
+#endif
+  }
+};
+
+class VP10CodecFactory : public CodecFactory {
+ public:
+  VP10CodecFactory() : CodecFactory() {}
+  // Flag-less overload: forwards to the three-argument form with flags = 0.
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 unsigned long deadline) const {
+    return CreateDecoder(cfg, 0, deadline);
+  }
+  // Returns a new VP10Decoder, or NULL when CONFIG_VP10_DECODER is off.
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline) const {  // NOLINT
+#if CONFIG_VP10_DECODER
+    return new VP10Decoder(cfg, flags, deadline);
+#else
+    return NULL;
+#endif
+  }
+  // Returns a new VP10Encoder, or NULL when CONFIG_VP10_ENCODER is off.
+  virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
+                                 unsigned long deadline,
+                                 const unsigned long init_flags,
+                                 TwopassStatsStore *stats) const {
+#if CONFIG_VP10_ENCODER
+    return new VP10Encoder(cfg, deadline, init_flags, stats);
+#else
+    return NULL;
+#endif
+  }
+  // Loads VP10 encoder defaults into *cfg; VPX_CODEC_INCAPABLE without encoder.
+  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                               int usage) const {
+#if CONFIG_VP10_ENCODER
+    return vpx_codec_enc_config_default(&vpx_codec_vp10_cx_algo, cfg, usage);
+#else
+    return VPX_CODEC_INCAPABLE;
+#endif
+  }
+};
+
+const libvpx_test::VP10CodecFactory kVP10;
+
+#define VP10_INSTANTIATE_TEST_CASE(test, ...)\
+  INSTANTIATE_TEST_CASE_P(VP10, test, \
+      ::testing::Combine( \
+          ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
+               &libvpx_test::kVP10)), \
+          __VA_ARGS__))
+#else
+#define VP10_INSTANTIATE_TEST_CASE(test, ...)
+#endif  // CONFIG_VP10
+
+}  // namespace libvpx_test
+#endif  // TEST_CODEC_FACTORY_H_
diff --git a/libs/libvpx/test/config_test.cc b/libs/libvpx/test/config_test.cc
new file mode 100644
index 0000000000..04931103d7
--- /dev/null
+++ b/libs/libvpx/test/config_test.cc
@@ -0,0 +1,60 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/video_source.h"
+
+namespace {
+
+class ConfigTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  ConfigTest() : EncoderTest(GET_PARAM(0)),
+                 frame_count_in_(0), frame_count_out_(0), frame_count_max_(0) {}
+  virtual ~ConfigTest() {}
+  // Calls InitializeConfig(), then SetMode() with test parameter 1.
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+  // Resets both frame counters whenever this hook runs.
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    frame_count_in_ = 0;
+    frame_count_out_ = 0;
+  }
+  // Counts an input frame and raises abort_ once frame_count_max_ is reached.
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource* /*video*/) {
+    ++frame_count_in_;
+    abort_ |= (frame_count_in_ >= frame_count_max_);
+  }
+  // Increments the output counter for each packet passed to this hook.
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {
+    ++frame_count_out_;
+  }
+
+  unsigned int frame_count_in_;   // Frames seen by PreEncodeFrameHook.
+  unsigned int frame_count_out_;  // Packets seen by FramePktHook.
+  unsigned int frame_count_max_;  // Input-frame count at which abort_ is set.
+};
+
+TEST_P(ConfigTest, LagIsDisabled) {
+  frame_count_max_ = 2;  // PreEncodeFrameHook sets abort_ at the second frame.
+  cfg_.g_lag_in_frames = 15;  // Ask for lag; the check below expects none.
+
+  libvpx_test::DummyVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  EXPECT_EQ(frame_count_in_, frame_count_out_);  // One packet per input frame.
+}
+
+VP8_INSTANTIATE_TEST_CASE(ConfigTest, ONE_PASS_TEST_MODES);
+}  // namespace
diff --git a/libs/libvpx/test/consistency_test.cc b/libs/libvpx/test/consistency_test.cc
new file mode 100644
index 0000000000..9c2fd55084
--- /dev/null
+++ b/libs/libvpx/test/consistency_test.cc
@@ -0,0 +1,224 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#if CONFIG_VP9_ENCODER
+#include "./vp9_rtcd.h"
+#endif
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_mem/vpx_mem.h"
+
+extern "C"
+double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch,
+                            uint8_t *img2, int img2_pitch,
+                            int width, int height,
+                            Ssimv *sv2, Metrics *m,
+                            int do_inconsistency);
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class ConsistencyTestBase : public ::testing::Test {
+ public:
+  ConsistencyTestBase(int width, int height) : width_(width), height_(height) {}
+  // Allocates both source/reference buffer pairs and the Ssimv array.
+  static void SetUpTestCase() {
+    source_data_[0] = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    reference_data_[0] = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    source_data_[1] = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    reference_data_[1] = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    ssim_array_ = new Ssimv[kDataBufferSize / 16];
+  }
+  // Zeroes the whole Ssimv array: memset() takes bytes, not an element count.
+  static void ClearSsim() {
+    memset(ssim_array_, 0, (kDataBufferSize / 16) * sizeof(*ssim_array_));
+  }
+  static void TearDownTestCase() {  // Frees everything SetUpTestCase made.
+    vpx_free(source_data_[0]);
+    source_data_[0] = NULL;
+    vpx_free(reference_data_[0]);
+    reference_data_[0] = NULL;
+    vpx_free(source_data_[1]);
+    source_data_[1] = NULL;
+    vpx_free(reference_data_[1]);
+    reference_data_[1] = NULL;
+    delete[] ssim_array_;
+    ssim_array_ = NULL;  // Match the buffers above: no dangling pointer.
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Handle frames up to 640x480
+  static const int kDataAlignment = 16;
+  static const int kDataBufferSize = 640*480;
+  // Source stride is width_ rounded up to 32; reference stride is 2 * width_.
+  virtual void SetUp() {
+    source_stride_ = (width_ + 31) & ~31;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+  // Fills a width x height region of data (row pitch stride) with Rand8().
+  void FillRandom(uint8_t *data, int stride, int width, int height) {
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        data[h * stride + w] = rnd_.Rand8();
+      }
+    }
+  }
+  // Same as above, using the fixture's width_ and height_.
+  void FillRandom(uint8_t *data, int stride) {
+    FillRandom(data, stride, width_, height_);
+  }
+  // Copies a whole kDataBufferSize buffer, ignoring strides.
+  void Copy(uint8_t *reference, uint8_t *source) {
+    memcpy(reference, source, kDataBufferSize);
+  }
+  // In-place running-sum blur over taps samples: rows first, then columns.
+  void Blur(uint8_t *data, int stride, int taps) {
+    int sum = 0;  // NOTE(review): never reset per row/column - confirm intent.
+    int half_taps = taps / 2;
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < taps; ++w) {
+        sum += data[w + h * stride];
+      }
+      for (int w = taps; w < width_; ++w) {
+        sum += data[w + h * stride] - data[w - taps + h * stride];
+        data[w - half_taps + h * stride] = (sum + half_taps) / taps;
+      }
+    }
+    for (int w = 0; w < width_; ++w) {
+      for (int h = 0; h < taps; ++h) {
+        sum += data[h + w * stride];  // NOTE(review): index looks transposed.
+      }
+      for (int h = taps; h < height_; ++h) {
+        sum += data[w + h * stride] - data[(h - taps) * stride + w];
+        data[(h - half_taps) * stride + w] = (sum + half_taps) / taps;
+      }
+    }
+  }
+  int width_, height_;
+  static uint8_t* source_data_[2];
+  int source_stride_;
+  static uint8_t* reference_data_[2];
+  int reference_stride_;
+  static Ssimv *ssim_array_;
+  Metrics metrics_;
+
+  ACMRandom rnd_;
+};
+
+#if CONFIG_VP9_ENCODER
+typedef std::tr1::tuple<int, int> ConsistencyParam;
+class ConsistencyVP9Test
+    : public ConsistencyTestBase,
+      public ::testing::WithParamInterface<ConsistencyParam> {
+ public:
+  ConsistencyVP9Test() : ConsistencyTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  double CheckConsistency(int frame) {  // frame: buffer pair index, 0 or 1.
+    EXPECT_LT(frame, 2) << "Frame to check has to be less than 2.";
+    uint8_t *const src = source_data_[frame];
+    uint8_t *const ref = reference_data_[frame];
+    return vpx_get_ssim_metrics(src, source_stride_, ref, reference_stride_,
+                                width_, height_, ssim_array_, &metrics_, 1);
+  }
+};
+#endif  // CONFIG_VP9_ENCODER
+
+uint8_t* ConsistencyTestBase::source_data_[2] = {NULL, NULL};
+uint8_t* ConsistencyTestBase::reference_data_[2] = {NULL, NULL};
+Ssimv* ConsistencyTestBase::ssim_array_ = NULL;
+
+#if CONFIG_VP9_ENCODER
+TEST_P(ConsistencyVP9Test, ConsistencyIsZero) {
+  FillRandom(source_data_[0], source_stride_);
+  Copy(source_data_[1], source_data_[0]);
+  Copy(reference_data_[0], source_data_[0]);
+  Blur(reference_data_[0], reference_stride_, 3);
+  Copy(reference_data_[1], source_data_[0]);
+  Blur(reference_data_[1], reference_stride_, 3);
+
+  double inconsistency = CheckConsistency(1);  // Overwritten on the next line.
+  inconsistency = CheckConsistency(0);
+  EXPECT_EQ(inconsistency, 0.0)
+      << "Should have 0 inconsistency if they are exactly the same.";
+
+  // If the sources are inconsistent, the reference frames' inconsistency
+  // should be less than when the sources are consistent.
+  FillRandom(source_data_[0], source_stride_);
+  FillRandom(source_data_[1], source_stride_);
+  FillRandom(reference_data_[0], reference_stride_);
+  FillRandom(reference_data_[1], reference_stride_);
+  CheckConsistency(0);
+  inconsistency = CheckConsistency(1);
+
+  Copy(source_data_[1], source_data_[0]);  // Now both sources are identical.
+  CheckConsistency(0);
+  double inconsistency2 = CheckConsistency(1);
+  EXPECT_LT(inconsistency, inconsistency2)
+      << "Should have less inconsistency if source itself is inconsistent.";
+
+  // Less of a blur should be less inconsistent than more blur coming off
+  // a frame with no blur.
+  ClearSsim();
+  FillRandom(source_data_[0], source_stride_);
+  Copy(source_data_[1], source_data_[0]);
+  Copy(reference_data_[0], source_data_[0]);
+  Copy(reference_data_[1], source_data_[0]);
+  Blur(reference_data_[1], reference_stride_, 4);
+  CheckConsistency(0);
+  inconsistency = CheckConsistency(1);
+  ClearSsim();
+  Copy(reference_data_[1], source_data_[0]);
+  Blur(reference_data_[1], reference_stride_, 8);  // Wider blur than the 4 above.
+  CheckConsistency(0);
+  inconsistency2 = CheckConsistency(1);
+
+  EXPECT_LT(inconsistency, inconsistency2)
+      << "Stronger Blur should produce more inconsistency.";
+}
+#endif  // CONFIG_VP9_ENCODER
+
+
+using std::tr1::make_tuple;
+
+//------------------------------------------------------------------------------
+// C functions
+
+#if CONFIG_VP9_ENCODER
+const ConsistencyParam c_vp9_tests[] = {
+  make_tuple(320, 240),
+  make_tuple(318, 242),
+  make_tuple(318, 238),
+};
+INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test,
+                        ::testing::ValuesIn(c_vp9_tests));
+#endif
+
+}  // namespace
diff --git a/libs/libvpx/test/convolve_test.cc b/libs/libvpx/test/convolve_test.cc
new file mode 100644
index 0000000000..12022be523
--- /dev/null
+++ b/libs/libvpx/test/convolve_test.cc
@@ -0,0 +1,1372 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+namespace {
+
+static const unsigned int kMaxDimension = 64;
+
+typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int filter_x_stride,
+                             const int16_t *filter_y, int filter_y_stride,
+                             int w, int h);
+
+struct ConvolveFunctions {  // One set of convolve kernels plus its bit depth.
+  ConvolveFunctions(ConvolveFunc copy, ConvolveFunc avg,
+                    ConvolveFunc h8, ConvolveFunc h8_avg,
+                    ConvolveFunc v8, ConvolveFunc v8_avg,
+                    ConvolveFunc hv8, ConvolveFunc hv8_avg,
+                    ConvolveFunc sh8, ConvolveFunc sh8_avg,
+                    ConvolveFunc sv8, ConvolveFunc sv8_avg,
+                    ConvolveFunc shv8, ConvolveFunc shv8_avg,
+                    int bd)
+      : copy_(copy), avg_(avg), h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg),
+        v8_avg_(v8_avg), hv8_avg_(hv8_avg), sh8_(sh8), sv8_(sv8), shv8_(shv8),
+        sh8_avg_(sh8_avg), sv8_avg_(sv8_avg), shv8_avg_(shv8_avg),
+        use_highbd_(bd) {}
+  // Arguments arrive as (plain, avg) pairs; members below are grouped instead.
+  ConvolveFunc copy_;
+  ConvolveFunc avg_;
+  ConvolveFunc h8_;         // horiz
+  ConvolveFunc v8_;         // vert
+  ConvolveFunc hv8_;        // horiz/vert
+  ConvolveFunc h8_avg_;     // avg horiz
+  ConvolveFunc v8_avg_;     // avg vert
+  ConvolveFunc hv8_avg_;    // avg horiz/vert
+  ConvolveFunc sh8_;        // scaled horiz
+  ConvolveFunc sv8_;        // scaled vert
+  ConvolveFunc shv8_;       // scaled horiz/vert
+  ConvolveFunc sh8_avg_;    // scaled avg horiz
+  ConvolveFunc sv8_avg_;    // scaled avg vert
+  ConvolveFunc shv8_avg_;   // scaled avg horiz/vert
+  int use_highbd_;  // 0 if high bitdepth not used, else the actual bit depth.
+};
+
+typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
+
+// Reference 8-tap subpixel filter, slightly modified to fit into this test.
+#define VP9_FILTER_WEIGHT 128
+#define VP9_FILTER_SHIFT 7
+uint8_t clip_pixel(int x) {  // Clamps x to the 8-bit pixel range [0, 255].
+  if (x < 0) return 0;
+  if (x > 255) return 255;
+  return x;
+}
+
+void filter_block2d_8_c(const uint8_t *src_ptr,
+                        const unsigned int src_stride,
+                        const int16_t *HFilter,
+                        const int16_t *VFilter,
+                        uint8_t *dst_ptr,
+                        unsigned int dst_stride,
+                        unsigned int output_width,
+                        unsigned int output_height) {
+  // Between passes, we use an intermediate buffer whose height is extended to
+  // have enough horizontally filtered values as input for the vertical pass.
+  // This buffer is allocated to be big enough for the largest block type we
+  // support.
+  const int kInterp_Extend = 4;
+  const unsigned int intermediate_height =
+      (kInterp_Extend - 1) + output_height + kInterp_Extend;
+  unsigned int i, j;
+
+  // Size of intermediate_buffer is max_intermediate_height * filter_max_width,
+  // where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
+  //                                 + kInterp_Extend
+  //                               = 3 + 64 + 4
+  //                               = 71
+  // and filter_max_width          = kMaxDimension (64)
+  // The buffer is written transposed: one column every intermediate_height.
+  uint8_t intermediate_buffer[71 * kMaxDimension];
+  const int intermediate_next_stride = 1 - intermediate_height * output_width;
+
+  // Horizontal pass (src -> transposed intermediate).
+  uint8_t *output_ptr = intermediate_buffer;
+  const int src_next_row_stride = src_stride - output_width;
+  src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+  for (i = 0; i < intermediate_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      // Apply filter...
+      const int temp = (src_ptr[0] * HFilter[0]) +
+          (src_ptr[1] * HFilter[1]) +
+          (src_ptr[2] * HFilter[2]) +
+          (src_ptr[3] * HFilter[3]) +
+          (src_ptr[4] * HFilter[4]) +
+          (src_ptr[5] * HFilter[5]) +
+          (src_ptr[6] * HFilter[6]) +
+          (src_ptr[7] * HFilter[7]) +
+          (VP9_FILTER_WEIGHT >> 1);  // Rounding
+
+      // Normalize back to 0-255...
+      *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
+      ++src_ptr;
+      output_ptr += intermediate_height;
+    }
+    src_ptr += src_next_row_stride;
+    output_ptr += intermediate_next_stride;  // Rewind to the next transposed row.
+  }
+
+  // Vertical pass (transposed intermediate -> dst).
+  src_ptr = intermediate_buffer;
+  const int dst_next_row_stride = dst_stride - output_width;
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      // Apply filter...
+      const int temp = (src_ptr[0] * VFilter[0]) +
+          (src_ptr[1] * VFilter[1]) +
+          (src_ptr[2] * VFilter[2]) +
+          (src_ptr[3] * VFilter[3]) +
+          (src_ptr[4] * VFilter[4]) +
+          (src_ptr[5] * VFilter[5]) +
+          (src_ptr[6] * VFilter[6]) +
+          (src_ptr[7] * VFilter[7]) +
+          (VP9_FILTER_WEIGHT >> 1);  // Rounding
+
+      // Normalize back to 0-255...
+      *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
+      src_ptr += intermediate_height;
+    }
+    src_ptr += intermediate_next_stride;
+    dst_ptr += dst_next_row_stride;
+  }
+}
+
+void block2d_average_c(uint8_t *src,
+                       unsigned int src_stride,
+                       uint8_t *output_ptr,
+                       unsigned int output_stride,
+                       unsigned int output_width,
+                       unsigned int output_height) {
+  for (unsigned int row = 0; row < output_height; ++row) {
+    const uint8_t *const src_row = src + row * src_stride;
+    for (unsigned int col = 0; col < output_width; ++col) {
+      output_ptr[col] = (output_ptr[col] + src_row[col] + 1) >> 1;  // Rounded.
+    }
+    output_ptr += output_stride;  // Next output row.
+  }
+}
+
+void filter_average_block2d_8_c(const uint8_t *src_ptr,
+                                const unsigned int src_stride,
+                                const int16_t *HFilter,
+                                const int16_t *VFilter,
+                                uint8_t *dst_ptr,
+                                unsigned int dst_stride,
+                                unsigned int output_width,
+                                unsigned int output_height) {
+  uint8_t tmp[kMaxDimension * kMaxDimension];
+  // tmp rows are kMaxDimension wide, so that constant is the scratch stride.
+  assert(output_width <= kMaxDimension);
+  assert(output_height <= kMaxDimension);
+  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp,
+                     kMaxDimension, output_width, output_height);
+  block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
+                    output_width, output_height);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
+                               const unsigned int src_stride,
+                               const int16_t *HFilter,
+                               const int16_t *VFilter,
+                               uint16_t *dst_ptr,
+                               unsigned int dst_stride,
+                               unsigned int output_width,
+                               unsigned int output_height,
+                               int bd) {
+  // Between passes, we use an intermediate buffer whose height is extended to
+  // have enough horizontally filtered values as input for the vertical pass.
+  // This buffer is allocated to be big enough for the largest block type we
+  // support.
+  const int kInterp_Extend = 4;
+  const unsigned int intermediate_height =
+      (kInterp_Extend - 1) + output_height + kInterp_Extend;
+
+  /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
+   * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
+   *                                 + kInterp_Extend
+   *                               = 3 + 64 + 4
+   *                               = 71
+   * and filter_max_width = kMaxDimension (64)
+   */
+  uint16_t intermediate_buffer[71 * kMaxDimension];
+  const int intermediate_next_stride = 1 - intermediate_height * output_width;
+
+  // Horizontal pass (src -> transposed intermediate).
+  {
+    uint16_t *output_ptr = intermediate_buffer;
+    const int src_next_row_stride = src_stride - output_width;
+    unsigned int i, j;
+    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+    for (i = 0; i < intermediate_height; ++i) {
+      for (j = 0; j < output_width; ++j) {
+        // Apply filter...
+        const int temp = (src_ptr[0] * HFilter[0]) +
+                         (src_ptr[1] * HFilter[1]) +
+                         (src_ptr[2] * HFilter[2]) +
+                         (src_ptr[3] * HFilter[3]) +
+                         (src_ptr[4] * HFilter[4]) +
+                         (src_ptr[5] * HFilter[5]) +
+                         (src_ptr[6] * HFilter[6]) +
+                         (src_ptr[7] * HFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
+
+        // Clamp through clip_pixel_highbd() for bit depth bd...
+        *output_ptr = clip_pixel_highbd(temp >> VP9_FILTER_SHIFT, bd);
+        ++src_ptr;
+        output_ptr += intermediate_height;
+      }
+      src_ptr += src_next_row_stride;
+      output_ptr += intermediate_next_stride;
+    }
+  }
+
+  // Vertical pass (transposed intermediate -> dst).
+  {
+    uint16_t *src_ptr = intermediate_buffer;  // Shadows the parameter here.
+    const int dst_next_row_stride = dst_stride - output_width;
+    unsigned int i, j;
+    for (i = 0; i < output_height; ++i) {
+      for (j = 0; j < output_width; ++j) {
+        // Apply filter...
+        const int temp = (src_ptr[0] * VFilter[0]) +
+                         (src_ptr[1] * VFilter[1]) +
+                         (src_ptr[2] * VFilter[2]) +
+                         (src_ptr[3] * VFilter[3]) +
+                         (src_ptr[4] * VFilter[4]) +
+                         (src_ptr[5] * VFilter[5]) +
+                         (src_ptr[6] * VFilter[6]) +
+                         (src_ptr[7] * VFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
+
+        // Clamp through clip_pixel_highbd() for bit depth bd...
+        *dst_ptr++ = clip_pixel_highbd(temp >> VP9_FILTER_SHIFT, bd);
+        src_ptr += intermediate_height;
+      }
+      src_ptr += intermediate_next_stride;
+      dst_ptr += dst_next_row_stride;
+    }
+  }
+}
+
+void highbd_block2d_average_c(uint16_t *src,
+                              unsigned int src_stride,
+                              uint16_t *output_ptr,
+                              unsigned int output_stride,
+                              unsigned int output_width,
+                              unsigned int output_height) {
+  for (unsigned int row = 0; row < output_height; ++row) {
+    const uint16_t *const src_row = src + row * src_stride;
+    for (unsigned int col = 0; col < output_width; ++col) {
+      output_ptr[col] = (output_ptr[col] + src_row[col] + 1) >> 1;  // Rounded.
+    }
+    output_ptr += output_stride;  // Next output row.
+  }
+}
+
+void highbd_filter_average_block2d_8_c(const uint16_t *src_ptr,
+                                       const unsigned int src_stride,
+                                       const int16_t *HFilter,
+                                       const int16_t *VFilter,
+                                       uint16_t *dst_ptr,
+                                       unsigned int dst_stride,
+                                       unsigned int output_width,
+                                       unsigned int output_height,
+                                       int bd) {
+  uint16_t tmp[kMaxDimension * kMaxDimension];
+  // tmp rows are kMaxDimension wide, so that constant is the scratch stride.
+  assert(output_width <= kMaxDimension);
+  assert(output_height <= kMaxDimension);
+  highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp,
+                            kMaxDimension, output_width, output_height, bd);
+  highbd_block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
+                           output_width, output_height);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
+ public:
+  static void SetUpTestCase() {
+    // Force input_ to be unaligned, output to be 16 byte aligned.
+    input_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + 1;
+    output_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kOutputBufferSize));
+    output_ref_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kOutputBufferSize));
+#if CONFIG_VP9_HIGHBITDEPTH
+    input16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment,
+                     (kInputBufferSize + 1) * sizeof(uint16_t))) + 1;
+    output16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+    output16_ref_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+#endif
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+  static void TearDownTestCase() {
+    vpx_free(input_ - 1);
+    input_ = NULL;
+    vpx_free(output_);
+    output_ = NULL;
+    vpx_free(output_ref_);
+    output_ref_ = NULL;
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_free(input16_ - 1);
+    input16_ = NULL;
+    vpx_free(output16_);
+    output16_ = NULL;
+    vpx_free(output16_ref_);
+    output16_ref_ = NULL;
+#endif
+  }
+
+ protected:
+  static const int kDataAlignment = 16;
+  static const int kOuterBlockSize = 256;
+  static const int kInputStride = kOuterBlockSize;
+  static const int kOutputStride = kOuterBlockSize;
+  static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
+  static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize;
+
+  int Width() const { return GET_PARAM(0); }
+  int Height() const { return GET_PARAM(1); }
+  int BorderLeft() const {
+    const int center = (kOuterBlockSize - Width()) / 2;
+    return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
+  }
+  int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
+
+  bool IsIndexInBorder(int i) {  // i: linear index, kOuterBlockSize per row.
+    return (i < BorderTop() * kOuterBlockSize ||  // rows above the inner block
+            i >= (BorderTop() + Height()) * kOuterBlockSize ||  // rows below
+            i % kOuterBlockSize < BorderLeft() ||  // columns to the left
+            i % kOuterBlockSize >= (BorderLeft() + Width()));  // to the right
+  }
+
+  virtual void SetUp() {
+    UUT_ = GET_PARAM(2);
+#if CONFIG_VP9_HIGHBITDEPTH
+    // Largest sample value at the tested bit depth; 255 for the 8-bit path.
+    mask_ = UUT_->use_highbd_ ? (1 << UUT_->use_highbd_) - 1 : 255;
+#endif
+    /* Set up guard blocks for an inner block centered in the outer block */
+    // output16_ is filled too so high-bitdepth tests never read unset memory.
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      const bool in_border = IsIndexInBorder(i);
+      output_[i] = in_border ? 255 : 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+      output16_[i] = in_border ? mask_ : 0;
+#endif
+    }
+    // Odd indices are pinned to the maximum; even indices get random samples.
+    ::libvpx_test::ACMRandom prng;
+    for (int i = 0; i < kInputBufferSize; ++i) {
+      if (i & 1) {
+        input_[i] = 255;
+#if CONFIG_VP9_HIGHBITDEPTH
+        input16_[i] = mask_;
+#endif
+      } else {
+        input_[i] = prng.Rand8Extremes();
+#if CONFIG_VP9_HIGHBITDEPTH
+        input16_[i] = prng.Rand16() & mask_;
+#endif
+      }
+    }
+  }
+
+  void SetConstantInput(int value) {
+    memset(input_, value, kInputBufferSize);
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_memset16(input16_, value, kInputBufferSize);
+#endif
+  }
+
+  void CopyOutputToRef() {  // Snapshot both output buffers into the refs.
+    memcpy(output_ref_, output_, kOutputBufferSize);
+#if CONFIG_VP9_HIGHBITDEPTH  // memcpy counts bytes; scale by element size.
+    memcpy(output16_ref_, output16_, kOutputBufferSize * sizeof(uint16_t));
+#endif
+  }
+
+  void CheckGuardBlocks() {
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i))
+        EXPECT_EQ(255, output_[i]);
+    }
+  }
+
+  uint8_t *input() const {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    } else {
+      return CONVERT_TO_BYTEPTR(input16_ + BorderTop() * kOuterBlockSize +
+                                BorderLeft());
+    }
+#else
+    return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+#endif
+  }
+
+  uint8_t *output() const {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    } else {
+      return CONVERT_TO_BYTEPTR(output16_ + BorderTop() * kOuterBlockSize +
+                                BorderLeft());
+    }
+#else
+    return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+#endif
+  }
+
+  uint8_t *output_ref() const {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    } else {
+      return CONVERT_TO_BYTEPTR(output16_ref_ + BorderTop() * kOuterBlockSize +
+                                BorderLeft());
+    }
+#else
+    return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+#endif
+  }
+
+  uint16_t lookup(uint8_t *list, int index) const {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      return list[index];
+    } else {
+      return CONVERT_TO_SHORTPTR(list)[index];
+    }
+#else
+    return list[index];
+#endif
+  }
+
+  void assign_val(uint8_t *list, int index, uint16_t val) const {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      list[index] = (uint8_t) val;
+    } else {
+      CONVERT_TO_SHORTPTR(list)[index] = val;
+    }
+#else
+    list[index] = (uint8_t) val;
+#endif
+  }
+
+  void wrapper_filter_average_block2d_8_c(const uint8_t *src_ptr,
+                                          const unsigned int src_stride,
+                                          const int16_t *HFilter,
+                                          const int16_t *VFilter,
+                                          uint8_t *dst_ptr,
+                                          unsigned int dst_stride,
+                                          unsigned int output_width,
+                                          unsigned int output_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
+                                 dst_ptr, dst_stride, output_width,
+                                 output_height);
+    } else {
+      highbd_filter_average_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr),
+                                        src_stride, HFilter, VFilter,
+                                        CONVERT_TO_SHORTPTR(dst_ptr),
+                                        dst_stride, output_width, output_height,
+                                        UUT_->use_highbd_);
+    }
+#else
+    filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
+                               dst_ptr, dst_stride, output_width,
+                               output_height);
+#endif
+  }
+
+  void wrapper_filter_block2d_8_c(const uint8_t *src_ptr,
+                                  const unsigned int src_stride,
+                                  const int16_t *HFilter,
+                                  const int16_t *VFilter,
+                                  uint8_t *dst_ptr,
+                                  unsigned int dst_stride,
+                                  unsigned int output_width,
+                                  unsigned int output_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
+                         dst_ptr, dst_stride, output_width, output_height);
+    } else {
+      highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+                                HFilter, VFilter,
+                                CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
+                                output_width, output_height, UUT_->use_highbd_);
+    }
+#else
+    filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
+                       dst_ptr, dst_stride, output_width, output_height);
+#endif
+  }
+
+  const ConvolveFunctions* UUT_;
+  static uint8_t* input_;
+  static uint8_t* output_;
+  static uint8_t* output_ref_;
+#if CONFIG_VP9_HIGHBITDEPTH
+  static uint16_t* input16_;
+  static uint16_t* output16_;
+  static uint16_t* output16_ref_;
+  int mask_;
+#endif
+};
+
+uint8_t* ConvolveTest::input_ = NULL;
+uint8_t* ConvolveTest::output_ = NULL;
+uint8_t* ConvolveTest::output_ref_ = NULL;
+#if CONFIG_VP9_HIGHBITDEPTH
+uint16_t* ConvolveTest::input16_ = NULL;
+uint16_t* ConvolveTest::output16_ = NULL;
+uint16_t* ConvolveTest::output16_ref_ = NULL;
+#endif
+
+TEST_P(ConvolveTest, GuardBlocks) {
+  CheckGuardBlocks();
+}
+
+TEST_P(ConvolveTest, Copy) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+
+  ASM_REGISTER_STATE_CHECK(
+      UUT_->copy_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+                  Width(), Height()));
+
+  CheckGuardBlocks();
+
+  for (int y = 0; y < Height(); ++y)
+    for (int x = 0; x < Width(); ++x)
+      ASSERT_EQ(lookup(out, y * kOutputStride + x),
+                lookup(in, y * kInputStride + x))
+          << "(" << x << "," << y << ")";
+}
+
+TEST_P(ConvolveTest, Avg) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+  uint8_t* const out_ref = output_ref();
+  CopyOutputToRef();
+
+  ASM_REGISTER_STATE_CHECK(
+      UUT_->avg_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+                Width(), Height()));
+
+  CheckGuardBlocks();
+
+  for (int y = 0; y < Height(); ++y)
+    for (int x = 0; x < Width(); ++x)
+      ASSERT_EQ(lookup(out, y * kOutputStride + x),
+                ROUND_POWER_OF_TWO(lookup(in, y * kInputStride + x) +
+                                   lookup(out_ref, y * kOutputStride + x), 1))
+          << "(" << x << "," << y << ")";
+}
+
+TEST_P(ConvolveTest, CopyHoriz) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+  DECLARE_ALIGNED(256, const int16_t, filter8[8]) = {0, 0, 0, 128, 0, 0, 0, 0};
+
+  ASM_REGISTER_STATE_CHECK(
+      UUT_->sh8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,
+                 Width(), Height()));
+
+  CheckGuardBlocks();
+
+  for (int y = 0; y < Height(); ++y)
+    for (int x = 0; x < Width(); ++x)
+      ASSERT_EQ(lookup(out, y * kOutputStride + x),
+                lookup(in, y * kInputStride + x))
+          << "(" << x << "," << y << ")";
+}
+
+TEST_P(ConvolveTest, CopyVert) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+  DECLARE_ALIGNED(256, const int16_t, filter8[8]) = {0, 0, 0, 128, 0, 0, 0, 0};
+
+  ASM_REGISTER_STATE_CHECK(
+      UUT_->sv8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,
+                 Width(), Height()));
+
+  CheckGuardBlocks();
+
+  for (int y = 0; y < Height(); ++y)
+    for (int x = 0; x < Width(); ++x)
+      ASSERT_EQ(lookup(out, y * kOutputStride + x),
+                lookup(in, y * kInputStride + x))
+          << "(" << x << "," << y << ")";
+}
+
+TEST_P(ConvolveTest, Copy2D) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+  DECLARE_ALIGNED(256, const int16_t, filter8[8]) = {0, 0, 0, 128, 0, 0, 0, 0};
+
+  ASM_REGISTER_STATE_CHECK(
+      UUT_->shv8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8,
+                  16, Width(), Height()));
+
+  CheckGuardBlocks();
+
+  for (int y = 0; y < Height(); ++y)
+    for (int x = 0; x < Width(); ++x)
+      ASSERT_EQ(lookup(out, y * kOutputStride + x),
+                lookup(in, y * kInputStride + x))
+          << "(" << x << "," << y << ")";
+}
+
+const int kNumFilterBanks = 4;
+const int kNumFilters = 16;
+
+TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
+  for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+    const InterpKernel *filters =
+        vp9_filter_kernels[static_cast<INTERP_FILTER>(filter_bank)];
+    for (int i = 0; i < kNumFilters; i++) {
+      const int p0 = filters[i][0] + filters[i][1];
+      const int p1 = filters[i][2] + filters[i][3];
+      const int p2 = filters[i][4] + filters[i][5];
+      const int p3 = filters[i][6] + filters[i][7];
+      EXPECT_LE(p0, 128);
+      EXPECT_LE(p1, 128);
+      EXPECT_LE(p2, 128);
+      EXPECT_LE(p3, 128);
+      EXPECT_LE(p0 + p3, 128);
+      EXPECT_LE(p0 + p3 + p1, 128);
+      EXPECT_LE(p0 + p3 + p1 + p2, 128);
+      EXPECT_EQ(p0 + p1 + p2 + p3, 128);
+    }
+  }
+}
+
+const int16_t kInvalidFilter[8] = { 0 };
+
+TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint8_t ref8[kOutputStride * kMaxDimension];
+  uint16_t ref16[kOutputStride * kMaxDimension];
+  uint8_t* ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8;
+  } else {
+    ref = CONVERT_TO_BYTEPTR(ref16);
+  }
+#else
+  uint8_t ref[kOutputStride * kMaxDimension];
+#endif
+
+  for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+    const InterpKernel *filters =
+        vp9_filter_kernels[static_cast<INTERP_FILTER>(filter_bank)];
+
+    for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+      for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+        wrapper_filter_block2d_8_c(in, kInputStride,
+                                   filters[filter_x], filters[filter_y],
+                                   ref, kOutputStride,
+                                   Width(), Height());
+
+        if (filter_x && filter_y)
+          ASM_REGISTER_STATE_CHECK(
+              UUT_->hv8_(in, kInputStride, out, kOutputStride,
+                         filters[filter_x], 16, filters[filter_y], 16,
+                         Width(), Height()));
+        else if (filter_y)
+          ASM_REGISTER_STATE_CHECK(
+              UUT_->v8_(in, kInputStride, out, kOutputStride,
+                        kInvalidFilter, 16, filters[filter_y], 16,
+                        Width(), Height()));
+        else if (filter_x)
+          ASM_REGISTER_STATE_CHECK(
+              UUT_->h8_(in, kInputStride, out, kOutputStride,
+                        filters[filter_x], 16, kInvalidFilter, 16,
+                        Width(), Height()));
+        else
+          ASM_REGISTER_STATE_CHECK(
+              UUT_->copy_(in, kInputStride, out, kOutputStride,
+                          kInvalidFilter, 0, kInvalidFilter, 0,
+                          Width(), Height()));
+
+        CheckGuardBlocks();
+
+        for (int y = 0; y < Height(); ++y)
+          for (int x = 0; x < Width(); ++x)
+            ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                      lookup(out, y * kOutputStride + x))
+                << "mismatch at (" << x << "," << y << "), "
+                << "filters (" << filter_bank << ","
+                << filter_x << "," << filter_y << ")";
+      }
+    }
+  }
+}
+
+TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint8_t ref8[kOutputStride * kMaxDimension];
+  uint16_t ref16[kOutputStride * kMaxDimension];
+  uint8_t* ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8;
+  } else {
+    ref = CONVERT_TO_BYTEPTR(ref16);
+  }
+#else
+  uint8_t ref[kOutputStride * kMaxDimension];
+#endif
+
+  // Populate ref and out with some random data
+  ::libvpx_test::ACMRandom prng;
+  for (int y = 0; y < Height(); ++y) {
+    for (int x = 0; x < Width(); ++x) {
+      uint16_t r;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+        r = prng.Rand8Extremes();
+      } else {
+        r = prng.Rand16() & mask_;
+      }
+#else
+      r = prng.Rand8Extremes();
+#endif
+
+      assign_val(out, y * kOutputStride + x, r);
+      assign_val(ref, y * kOutputStride + x, r);
+    }
+  }
+
+  for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+    const InterpKernel *filters =
+        vp9_filter_kernels[static_cast<INTERP_FILTER>(filter_bank)];
+
+    for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+      for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+        wrapper_filter_average_block2d_8_c(in, kInputStride,
+                                           filters[filter_x], filters[filter_y],
+                                           ref, kOutputStride,
+                                           Width(), Height());
+
+        if (filter_x && filter_y)
+          ASM_REGISTER_STATE_CHECK(
+              UUT_->hv8_avg_(in, kInputStride, out, kOutputStride,
+                             filters[filter_x], 16, filters[filter_y], 16,
+                             Width(), Height()));
+        else if (filter_y)
+          ASM_REGISTER_STATE_CHECK(
+              UUT_->v8_avg_(in, kInputStride, out, kOutputStride,
+                            kInvalidFilter, 16, filters[filter_y], 16,
+                            Width(), Height()));
+        else if (filter_x)
+          ASM_REGISTER_STATE_CHECK(
+              UUT_->h8_avg_(in, kInputStride, out, kOutputStride,
+                            filters[filter_x], 16, kInvalidFilter, 16,
+                            Width(), Height()));
+        else
+          ASM_REGISTER_STATE_CHECK(
+              UUT_->avg_(in, kInputStride, out, kOutputStride,
+                          kInvalidFilter, 0, kInvalidFilter, 0,
+                          Width(), Height()));
+
+        CheckGuardBlocks();
+
+        for (int y = 0; y < Height(); ++y)
+          for (int x = 0; x < Width(); ++x)
+            ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                      lookup(out, y * kOutputStride + x))
+                << "mismatch at (" << x << "," << y << "), "
+                << "filters (" << filter_bank << ","
+                << filter_x << "," << filter_y << ")";
+      }
+    }
+  }
+}
+
+TEST_P(ConvolveTest, FilterExtremes) {
+  uint8_t *const in = input();
+  uint8_t *const out = output();
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint8_t ref8[kOutputStride * kMaxDimension];
+  uint16_t ref16[kOutputStride * kMaxDimension];
+  uint8_t *ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8;
+  } else {
+    ref = CONVERT_TO_BYTEPTR(ref16);
+  }
+#else
+  uint8_t ref[kOutputStride * kMaxDimension];
+#endif
+
+  // Populate ref and out with some random data
+  ::libvpx_test::ACMRandom prng;
+  for (int y = 0; y < Height(); ++y) {
+    for (int x = 0; x < Width(); ++x) {
+      uint16_t r;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+        r = prng.Rand8Extremes();
+      } else {
+        r = prng.Rand16() & mask_;
+      }
+#else
+      r = prng.Rand8Extremes();
+#endif
+      assign_val(out, y * kOutputStride + x, r);
+      assign_val(ref, y * kOutputStride + x, r);
+    }
+  }
+
+  for (int axis = 0; axis < 2; axis++) {
+    int seed_val = 0;
+    while (seed_val < 256) {
+      for (int y = 0; y < 8; ++y) {
+        for (int x = 0; x < 8; ++x) {
+#if CONFIG_VP9_HIGHBITDEPTH
+            assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1,
+                       ((seed_val >> (axis ? y : x)) & 1) * mask_);
+#else
+            assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1,
+                       ((seed_val >> (axis ? y : x)) & 1) * 255);
+#endif
+          if (axis) seed_val++;
+        }
+        if (axis)
+          seed_val-= 8;
+        else
+          seed_val++;
+      }
+      if (axis) seed_val += 8;
+
+      for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+        const InterpKernel *filters =
+            vp9_filter_kernels[static_cast<INTERP_FILTER>(filter_bank)];
+        for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+          for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+            wrapper_filter_block2d_8_c(in, kInputStride,
+                                       filters[filter_x], filters[filter_y],
+                                       ref, kOutputStride,
+                                       Width(), Height());
+            if (filter_x && filter_y)
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->hv8_(in, kInputStride, out, kOutputStride,
+                             filters[filter_x], 16, filters[filter_y], 16,
+                             Width(), Height()));
+            else if (filter_y)
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->v8_(in, kInputStride, out, kOutputStride,
+                            kInvalidFilter, 16, filters[filter_y], 16,
+                            Width(), Height()));
+            else if (filter_x)
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->h8_(in, kInputStride, out, kOutputStride,
+                            filters[filter_x], 16, kInvalidFilter, 16,
+                            Width(), Height()));
+            else
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->copy_(in, kInputStride, out, kOutputStride,
+                              kInvalidFilter, 0, kInvalidFilter, 0,
+                              Width(), Height()));
+
+            for (int y = 0; y < Height(); ++y)
+              for (int x = 0; x < Width(); ++x)
+                ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                          lookup(out, y * kOutputStride + x))
+                    << "mismatch at (" << x << "," << y << "), "
+                    << "filters (" << filter_bank << ","
+                    << filter_x << "," << filter_y << ")";
+          }
+        }
+      }
+    }
+  }
+}
+
+/* This test exercises that enough rows and columns are filtered with every
+   possible initial fractional positions and scaling steps. */
+TEST_P(ConvolveTest, CheckScalingFiltering) {
+  uint8_t* const in = input();
+  uint8_t* const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP];
+
+  SetConstantInput(127);
+
+  for (int frac = 0; frac < 16; ++frac) {
+    for (int step = 1; step <= 32; ++step) {
+      /* Test the horizontal and vertical filters in combination. */
+      ASM_REGISTER_STATE_CHECK(UUT_->shv8_(in, kInputStride, out, kOutputStride,
+                                           eighttap[frac], step,
+                                           eighttap[frac], step,
+                                           Width(), Height()));
+
+      CheckGuardBlocks();
+
+      for (int y = 0; y < Height(); ++y) {
+        for (int x = 0; x < Width(); ++x) {
+          ASSERT_EQ(lookup(in, y * kInputStride + x),
+                    lookup(out, y * kOutputStride + x))
+              << "x == " << x << ", y == " << y
+              << ", frac == " << frac << ", step == " << step;
+        }
+      }
+    }
+  }
+}
+
+using std::tr1::make_tuple;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define WRAP(func, bd) \
+void wrap_ ## func ## _ ## bd(const uint8_t *src, ptrdiff_t src_stride, \
+                              uint8_t *dst, ptrdiff_t dst_stride, \
+                              const int16_t *filter_x, \
+                              int filter_x_stride, \
+                              const int16_t *filter_y, \
+                              int filter_y_stride, \
+                              int w, int h) { \
+  vpx_highbd_ ## func(src, src_stride, dst, dst_stride, filter_x, \
+                      filter_x_stride, filter_y, filter_y_stride, \
+                      w, h, bd); \
+}
+#if HAVE_SSE2 && ARCH_X86_64
+#if CONFIG_USE_X86INC
+WRAP(convolve_copy_sse2, 8)
+WRAP(convolve_avg_sse2, 8)
+WRAP(convolve_copy_sse2, 10)
+WRAP(convolve_avg_sse2, 10)
+WRAP(convolve_copy_sse2, 12)
+WRAP(convolve_avg_sse2, 12)
+#endif  // CONFIG_USE_X86INC
+WRAP(convolve8_horiz_sse2, 8)
+WRAP(convolve8_avg_horiz_sse2, 8)
+WRAP(convolve8_vert_sse2, 8)
+WRAP(convolve8_avg_vert_sse2, 8)
+WRAP(convolve8_sse2, 8)
+WRAP(convolve8_avg_sse2, 8)
+WRAP(convolve8_horiz_sse2, 10)
+WRAP(convolve8_avg_horiz_sse2, 10)
+WRAP(convolve8_vert_sse2, 10)
+WRAP(convolve8_avg_vert_sse2, 10)
+WRAP(convolve8_sse2, 10)
+WRAP(convolve8_avg_sse2, 10)
+WRAP(convolve8_horiz_sse2, 12)
+WRAP(convolve8_avg_horiz_sse2, 12)
+WRAP(convolve8_vert_sse2, 12)
+WRAP(convolve8_avg_vert_sse2, 12)
+WRAP(convolve8_sse2, 12)
+WRAP(convolve8_avg_sse2, 12)
+#endif  // HAVE_SSE2 && ARCH_X86_64
+
+WRAP(convolve_copy_c, 8)
+WRAP(convolve_avg_c, 8)
+WRAP(convolve8_horiz_c, 8)
+WRAP(convolve8_avg_horiz_c, 8)
+WRAP(convolve8_vert_c, 8)
+WRAP(convolve8_avg_vert_c, 8)
+WRAP(convolve8_c, 8)
+WRAP(convolve8_avg_c, 8)
+WRAP(convolve_copy_c, 10)
+WRAP(convolve_avg_c, 10)
+WRAP(convolve8_horiz_c, 10)
+WRAP(convolve8_avg_horiz_c, 10)
+WRAP(convolve8_vert_c, 10)
+WRAP(convolve8_avg_vert_c, 10)
+WRAP(convolve8_c, 10)
+WRAP(convolve8_avg_c, 10)
+WRAP(convolve_copy_c, 12)
+WRAP(convolve_avg_c, 12)
+WRAP(convolve8_horiz_c, 12)
+WRAP(convolve8_avg_horiz_c, 12)
+WRAP(convolve8_vert_c, 12)
+WRAP(convolve8_avg_vert_c, 12)
+WRAP(convolve8_c, 12)
+WRAP(convolve8_avg_c, 12)
+#undef WRAP
+
+const ConvolveFunctions convolve8_c(
+    wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
+    wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+    wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8,
+    wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
+    wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+    wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8,
+    wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
+INSTANTIATE_TEST_CASE_P(C_8, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_c),
+    make_tuple(8, 4, &convolve8_c),
+    make_tuple(4, 8, &convolve8_c),
+    make_tuple(8, 8, &convolve8_c),
+    make_tuple(16, 8, &convolve8_c),
+    make_tuple(8, 16, &convolve8_c),
+    make_tuple(16, 16, &convolve8_c),
+    make_tuple(32, 16, &convolve8_c),
+    make_tuple(16, 32, &convolve8_c),
+    make_tuple(32, 32, &convolve8_c),
+    make_tuple(64, 32, &convolve8_c),
+    make_tuple(32, 64, &convolve8_c),
+    make_tuple(64, 64, &convolve8_c)));
+const ConvolveFunctions convolve10_c(
+    wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
+    wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
+    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10,
+    wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+    wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
+    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10,
+    wrap_convolve8_c_10, wrap_convolve8_avg_c_10, 10);
+INSTANTIATE_TEST_CASE_P(C_10, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve10_c),
+    make_tuple(8, 4, &convolve10_c),
+    make_tuple(4, 8, &convolve10_c),
+    make_tuple(8, 8, &convolve10_c),
+    make_tuple(16, 8, &convolve10_c),
+    make_tuple(8, 16, &convolve10_c),
+    make_tuple(16, 16, &convolve10_c),
+    make_tuple(32, 16, &convolve10_c),
+    make_tuple(16, 32, &convolve10_c),
+    make_tuple(32, 32, &convolve10_c),
+    make_tuple(64, 32, &convolve10_c),
+    make_tuple(32, 64, &convolve10_c),
+    make_tuple(64, 64, &convolve10_c)));
+const ConvolveFunctions convolve12_c(
+    wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
+    wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
+    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12,
+    wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+    wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
+    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12,
+    wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12);
+INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve12_c),
+    make_tuple(8, 4, &convolve12_c),
+    make_tuple(4, 8, &convolve12_c),
+    make_tuple(8, 8, &convolve12_c),
+    make_tuple(16, 8, &convolve12_c),
+    make_tuple(8, 16, &convolve12_c),
+    make_tuple(16, 16, &convolve12_c),
+    make_tuple(32, 16, &convolve12_c),
+    make_tuple(16, 32, &convolve12_c),
+    make_tuple(32, 32, &convolve12_c),
+    make_tuple(64, 32, &convolve12_c),
+    make_tuple(32, 64, &convolve12_c),
+    make_tuple(64, 64, &convolve12_c)));
+
+#else
+
+const ConvolveFunctions convolve8_c(
+    vpx_convolve_copy_c, vpx_convolve_avg_c,
+    vpx_convolve8_horiz_c, vpx_convolve8_avg_horiz_c,
+    vpx_convolve8_vert_c, vpx_convolve8_avg_vert_c,
+    vpx_convolve8_c, vpx_convolve8_avg_c,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+
+INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_c),
+    make_tuple(8, 4, &convolve8_c),
+    make_tuple(4, 8, &convolve8_c),
+    make_tuple(8, 8, &convolve8_c),
+    make_tuple(16, 8, &convolve8_c),
+    make_tuple(8, 16, &convolve8_c),
+    make_tuple(16, 16, &convolve8_c),
+    make_tuple(32, 16, &convolve8_c),
+    make_tuple(16, 32, &convolve8_c),
+    make_tuple(32, 32, &convolve8_c),
+    make_tuple(64, 32, &convolve8_c),
+    make_tuple(32, 64, &convolve8_c),
+    make_tuple(64, 64, &convolve8_c)));
+#endif
+
+#if HAVE_SSE2 && ARCH_X86_64
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_sse2(
+#if CONFIG_USE_X86INC
+    wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8,
+#else
+    wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
+#endif  // CONFIG_USE_X86INC
+    wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
+    wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
+    wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8,
+    wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
+    wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
+    wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8);
+const ConvolveFunctions convolve10_sse2(
+#if CONFIG_USE_X86INC
+    wrap_convolve_copy_sse2_10, wrap_convolve_avg_sse2_10,
+#else
+    wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
+#endif  // CONFIG_USE_X86INC
+    wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
+    wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
+    wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10,
+    wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
+    wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
+    wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10);
+const ConvolveFunctions convolve12_sse2(
+#if CONFIG_USE_X86INC
+    wrap_convolve_copy_sse2_12, wrap_convolve_avg_sse2_12,
+#else
+    wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
+#endif  // CONFIG_USE_X86INC
+    wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
+    wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
+    wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12,
+    wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
+    wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
+    wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12);
+INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_sse2),
+    make_tuple(8, 4, &convolve8_sse2),
+    make_tuple(4, 8, &convolve8_sse2),
+    make_tuple(8, 8, &convolve8_sse2),
+    make_tuple(16, 8, &convolve8_sse2),
+    make_tuple(8, 16, &convolve8_sse2),
+    make_tuple(16, 16, &convolve8_sse2),
+    make_tuple(32, 16, &convolve8_sse2),
+    make_tuple(16, 32, &convolve8_sse2),
+    make_tuple(32, 32, &convolve8_sse2),
+    make_tuple(64, 32, &convolve8_sse2),
+    make_tuple(32, 64, &convolve8_sse2),
+    make_tuple(64, 64, &convolve8_sse2),
+    make_tuple(4, 4, &convolve10_sse2),
+    make_tuple(8, 4, &convolve10_sse2),
+    make_tuple(4, 8, &convolve10_sse2),
+    make_tuple(8, 8, &convolve10_sse2),
+    make_tuple(16, 8, &convolve10_sse2),
+    make_tuple(8, 16, &convolve10_sse2),
+    make_tuple(16, 16, &convolve10_sse2),
+    make_tuple(32, 16, &convolve10_sse2),
+    make_tuple(16, 32, &convolve10_sse2),
+    make_tuple(32, 32, &convolve10_sse2),
+    make_tuple(64, 32, &convolve10_sse2),
+    make_tuple(32, 64, &convolve10_sse2),
+    make_tuple(64, 64, &convolve10_sse2),
+    make_tuple(4, 4, &convolve12_sse2),
+    make_tuple(8, 4, &convolve12_sse2),
+    make_tuple(4, 8, &convolve12_sse2),
+    make_tuple(8, 8, &convolve12_sse2),
+    make_tuple(16, 8, &convolve12_sse2),
+    make_tuple(8, 16, &convolve12_sse2),
+    make_tuple(16, 16, &convolve12_sse2),
+    make_tuple(32, 16, &convolve12_sse2),
+    make_tuple(16, 32, &convolve12_sse2),
+    make_tuple(32, 32, &convolve12_sse2),
+    make_tuple(64, 32, &convolve12_sse2),
+    make_tuple(32, 64, &convolve12_sse2),
+    make_tuple(64, 64, &convolve12_sse2)));
+#else
+const ConvolveFunctions convolve8_sse2(
+#if CONFIG_USE_X86INC
+    vpx_convolve_copy_sse2, vpx_convolve_avg_sse2,
+#else
+    vpx_convolve_copy_c, vpx_convolve_avg_c,
+#endif  // CONFIG_USE_X86INC
+    vpx_convolve8_horiz_sse2, vpx_convolve8_avg_horiz_sse2,
+    vpx_convolve8_vert_sse2, vpx_convolve8_avg_vert_sse2,
+    vpx_convolve8_sse2, vpx_convolve8_avg_sse2,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+
+INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_sse2),
+    make_tuple(8, 4, &convolve8_sse2),
+    make_tuple(4, 8, &convolve8_sse2),
+    make_tuple(8, 8, &convolve8_sse2),
+    make_tuple(16, 8, &convolve8_sse2),
+    make_tuple(8, 16, &convolve8_sse2),
+    make_tuple(16, 16, &convolve8_sse2),
+    make_tuple(32, 16, &convolve8_sse2),
+    make_tuple(16, 32, &convolve8_sse2),
+    make_tuple(32, 32, &convolve8_sse2),
+    make_tuple(64, 32, &convolve8_sse2),
+    make_tuple(32, 64, &convolve8_sse2),
+    make_tuple(64, 64, &convolve8_sse2)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif
+
+#if HAVE_SSSE3
+const ConvolveFunctions convolve8_ssse3(
+    vpx_convolve_copy_c, vpx_convolve_avg_c,  // copy/avg are the C versions
+    vpx_convolve8_horiz_ssse3, vpx_convolve8_avg_horiz_ssse3,
+    vpx_convolve8_vert_ssse3, vpx_convolve8_avg_vert_ssse3,
+    vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,  // scaled entries are C too
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+
+INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_ssse3),
+    make_tuple(8, 4, &convolve8_ssse3),
+    make_tuple(4, 8, &convolve8_ssse3),
+    make_tuple(8, 8, &convolve8_ssse3),
+    make_tuple(16, 8, &convolve8_ssse3),
+    make_tuple(8, 16, &convolve8_ssse3),
+    make_tuple(16, 16, &convolve8_ssse3),
+    make_tuple(32, 16, &convolve8_ssse3),
+    make_tuple(16, 32, &convolve8_ssse3),
+    make_tuple(32, 32, &convolve8_ssse3),
+    make_tuple(64, 32, &convolve8_ssse3),
+    make_tuple(32, 64, &convolve8_ssse3),
+    make_tuple(64, 64, &convolve8_ssse3)));
+#endif  // HAVE_SSSE3
+
+#if HAVE_AVX2 && HAVE_SSSE3
+const ConvolveFunctions convolve8_avx2(  // the *_avg_* entries are SSSE3
+    vpx_convolve_copy_c, vpx_convolve_avg_c,  // copy/avg are the C versions
+    vpx_convolve8_horiz_avx2, vpx_convolve8_avg_horiz_ssse3,
+    vpx_convolve8_vert_avx2, vpx_convolve8_avg_vert_ssse3,
+    vpx_convolve8_avx2, vpx_convolve8_avg_ssse3,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,  // scaled entries are C too
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+
+INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_avx2),
+    make_tuple(8, 4, &convolve8_avx2),
+    make_tuple(4, 8, &convolve8_avx2),
+    make_tuple(8, 8, &convolve8_avx2),
+    make_tuple(8, 16, &convolve8_avx2),
+    make_tuple(16, 8, &convolve8_avx2),
+    make_tuple(16, 16, &convolve8_avx2),
+    make_tuple(32, 16, &convolve8_avx2),
+    make_tuple(16, 32, &convolve8_avx2),
+    make_tuple(32, 32, &convolve8_avx2),
+    make_tuple(64, 32, &convolve8_avx2),
+    make_tuple(32, 64, &convolve8_avx2),
+    make_tuple(64, 64, &convolve8_avx2)));
+#endif  // HAVE_AVX2 && HAVE_SSSE3
+
+#if HAVE_NEON
+#if HAVE_NEON_ASM
+const ConvolveFunctions convolve8_neon(
+    vpx_convolve_copy_neon, vpx_convolve_avg_neon,
+    vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon,
+    vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon,
+    vpx_convolve8_neon, vpx_convolve8_avg_neon,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+#else  // !HAVE_NEON_ASM (table is identical to the HAVE_NEON_ASM branch)
+const ConvolveFunctions convolve8_neon(
+    vpx_convolve_copy_neon, vpx_convolve_avg_neon,
+    vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon,
+    vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon,
+    vpx_convolve8_neon, vpx_convolve8_avg_neon,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+#endif  // HAVE_NEON_ASM
+
+INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_neon),
+    make_tuple(8, 4, &convolve8_neon),
+    make_tuple(4, 8, &convolve8_neon),
+    make_tuple(8, 8, &convolve8_neon),
+    make_tuple(16, 8, &convolve8_neon),
+    make_tuple(8, 16, &convolve8_neon),
+    make_tuple(16, 16, &convolve8_neon),
+    make_tuple(32, 16, &convolve8_neon),
+    make_tuple(16, 32, &convolve8_neon),
+    make_tuple(32, 32, &convolve8_neon),
+    make_tuple(64, 32, &convolve8_neon),
+    make_tuple(32, 64, &convolve8_neon),
+    make_tuple(64, 64, &convolve8_neon)));
+#endif  // HAVE_NEON
+
+#if HAVE_DSPR2
+const ConvolveFunctions convolve8_dspr2(
+    vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2,
+    vpx_convolve8_horiz_dspr2, vpx_convolve8_avg_horiz_dspr2,
+    vpx_convolve8_vert_dspr2, vpx_convolve8_avg_vert_dspr2,
+    vpx_convolve8_dspr2, vpx_convolve8_avg_dspr2,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,  // scaled entries are C
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+
+INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_dspr2),
+    make_tuple(8, 4, &convolve8_dspr2),
+    make_tuple(4, 8, &convolve8_dspr2),
+    make_tuple(8, 8, &convolve8_dspr2),
+    make_tuple(16, 8, &convolve8_dspr2),
+    make_tuple(8, 16, &convolve8_dspr2),
+    make_tuple(16, 16, &convolve8_dspr2),
+    make_tuple(32, 16, &convolve8_dspr2),
+    make_tuple(16, 32, &convolve8_dspr2),
+    make_tuple(32, 32, &convolve8_dspr2),
+    make_tuple(64, 32, &convolve8_dspr2),
+    make_tuple(32, 64, &convolve8_dspr2),
+    make_tuple(64, 64, &convolve8_dspr2)));
+#endif  // HAVE_DSPR2
+
+#if HAVE_MSA
+const ConvolveFunctions convolve8_msa(
+    vpx_convolve_copy_msa, vpx_convolve_avg_msa,
+    vpx_convolve8_horiz_msa, vpx_convolve8_avg_horiz_msa,
+    vpx_convolve8_vert_msa, vpx_convolve8_avg_vert_msa,
+    vpx_convolve8_msa, vpx_convolve8_avg_msa,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,  // scaled entries are C
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
+    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+
+INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_msa),
+    make_tuple(8, 4, &convolve8_msa),
+    make_tuple(4, 8, &convolve8_msa),
+    make_tuple(8, 8, &convolve8_msa),
+    make_tuple(16, 8, &convolve8_msa),
+    make_tuple(8, 16, &convolve8_msa),
+    make_tuple(16, 16, &convolve8_msa),
+    make_tuple(32, 16, &convolve8_msa),
+    make_tuple(16, 32, &convolve8_msa),
+    make_tuple(32, 32, &convolve8_msa),
+    make_tuple(64, 32, &convolve8_msa),
+    make_tuple(32, 64, &convolve8_msa),
+    make_tuple(64, 64, &convolve8_msa)));
+#endif  // HAVE_MSA
+}  // namespace
diff --git a/libs/libvpx/test/cpu_speed_test.cc b/libs/libvpx/test/cpu_speed_test.cc
new file mode 100644
index 0000000000..8baa2f9c89
--- /dev/null
+++ b/libs/libvpx/test/cpu_speed_test.cc
@@ -0,0 +1,148 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+const int kMaxPSNR = 100;
+
+class CpuSpeedTest  // Encoder fixture: (encoding mode, cpu-used) parameters.
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  CpuSpeedTest()
+      : EncoderTest(GET_PARAM(0)),
+        encoding_mode_(GET_PARAM(1)),
+        set_cpu_used_(GET_PARAM(2)),
+        min_psnr_(kMaxPSNR) {}
+  virtual ~CpuSpeedTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {  // non-realtime modes
+      cfg_.g_lag_in_frames = 25;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {  // realtime: no lagged frames, constant bitrate
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    min_psnr_ = kMaxPSNR;  // reset the running minimum
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {  // controls are applied once, at frame() == 1
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
+    }
+  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.psnr.psnr[0] < min_psnr_)  // keep the lowest psnr[0] seen
+      min_psnr_ = pkt->data.psnr.psnr[0];
+  }
+
+  ::libvpx_test::TestMode encoding_mode_;  // test parameter 1
+  int set_cpu_used_;  // test parameter 2; passed to VP8E_SET_CPUUSED
+  double min_psnr_;  // lowest psnr[0] reported since BeginPassHook
+};
+
+TEST_P(CpuSpeedTest, TestQ0) {
+  // Validate that this non multiple of 64 wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q.  This pushes
+  // the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;  // was a duplicate minsection write
+  cfg_.rc_target_bitrate = 400;
+  cfg_.rc_max_quantizer = 0;
+  cfg_.rc_min_quantizer = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       20);
+
+  init_flags_ = VPX_CODEC_USE_PSNR;  // needed so PSNR packets are produced
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(min_psnr_, kMaxPSNR);  // every reported psnr[0] must hit the cap
+}
+
+TEST_P(CpuSpeedTest, TestScreencastQ0) {  // q forced to 0 on screen content
+  ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25);
+  cfg_.g_timebase = video.timebase();
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;  // was a duplicate minsection write
+  cfg_.rc_target_bitrate = 400;
+  cfg_.rc_max_quantizer = 0;
+  cfg_.rc_min_quantizer = 0;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;  // needed so PSNR packets are produced
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(min_psnr_, kMaxPSNR);  // every reported psnr[0] must hit the cap
+}
+
+TEST_P(CpuSpeedTest, TestEncodeHighBitrate) {
+  // Validate that this non multiple of 64 wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q.  This pushes
+  // the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;  // was a duplicate minsection write
+  cfg_.rc_target_bitrate = 12000;
+  cfg_.rc_max_quantizer = 10;
+  cfg_.rc_min_quantizer = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       20);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));  // no PSNR check in this test
+}
+
+TEST_P(CpuSpeedTest, TestLowBitrate) {
+  // Validate that this clip encodes and decodes without a mismatch
+  // when passing in a very high min q.  This pushes the encoder to producing
+  // lots of small partitions which might test the other condition.
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;  // was a duplicate minsection write
+  cfg_.rc_target_bitrate = 200;
+  cfg_.rc_min_quantizer = 40;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       20);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));  // no PSNR check in this test
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    CpuSpeedTest,
+    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
+                      ::libvpx_test::kRealTime),
+    ::testing::Range(0, 9));  // half-open range: integer values 0..8
+
+VP10_INSTANTIATE_TEST_CASE(
+    CpuSpeedTest,
+    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
+    ::testing::Range(0, 3));  // half-open range: integer values 0..2
+}  // namespace
diff --git a/libs/libvpx/test/cq_test.cc b/libs/libvpx/test/cq_test.cc
new file mode 100644
index 0000000000..4e8019a87c
--- /dev/null
+++ b/libs/libvpx/test/cq_test.cc
@@ -0,0 +1,134 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <cmath>
+#include <map>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+// CQ level range: [kCQLevelMin, kCQLevelMax).
+const int kCQLevelMin = 4;
+const int kCQLevelMax = 63;
+const int kCQLevelStep = 8;
+const unsigned int kCQTargetBitrate = 2000;
+
+class CQTest : public ::libvpx_test::EncoderTest,  // param: CQ level (int)
+    public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  // maps the cqlevel to the bitrate produced.
+  typedef std::map<int, uint32_t> BitrateMap;
+
+  static void SetUpTestCase() {
+    bitrates_.clear();  // start with no recorded bitrates
+  }
+
+  static void TearDownTestCase() {
+    ASSERT_TRUE(!HasFailure())
+        << "skipping bitrate validation due to earlier failure.";
+    uint32_t prev_actual_bitrate = kCQTargetBitrate;  // bound for first entry
+    for (BitrateMap::const_iterator iter = bitrates_.begin();
+         iter != bitrates_.end(); ++iter) {  // std::map: ascending cq_level
+      const uint32_t cq_actual_bitrate = iter->second;
+      EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate)
+          << "cq_level: " << iter->first
+          << ", bitrate should decrease with increase in CQ level.";
+      prev_actual_bitrate = cq_actual_bitrate;
+    }
+  }
+
+ protected:
+  CQTest() : EncoderTest(GET_PARAM(0)), cq_level_(GET_PARAM(1)) {
+    init_flags_ = VPX_CODEC_USE_PSNR;
+  }
+
+  virtual ~CQTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(libvpx_test::kTwoPassGood);
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    file_size_ = 0;  // per-pass accumulators start from zero
+    psnr_ = 0.0;
+    n_frames_ = 0;
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {  // controls are applied once, at frame() == 1
+      if (cfg_.rc_end_usage == VPX_CQ) {  // CQ level only set in CQ mode
+        encoder->Control(VP8E_SET_CQ_LEVEL, cq_level_);
+      }
+      encoder->Control(VP8E_SET_CPUUSED, 3);
+    }
+  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    psnr_ += pow(10.0, pkt->data.psnr.psnr[0] / 10.0);  // sum in linear domain
+    n_frames_++;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    file_size_ += pkt->data.frame.sz;  // total size of all frame packets
+  }
+
+  double GetLinearPSNROverBitrate() const {
+    double avg_psnr = log10(psnr_ / n_frames_) * 10.0;  // mean, back in dB
+    return pow(10.0, avg_psnr / 10.0) / file_size_;  // linear mean / size
+  }
+
+  int cq_level() const { return cq_level_; }
+  size_t file_size() const { return file_size_; }
+  int n_frames() const { return n_frames_; }
+
+  static BitrateMap bitrates_;  // shared by all parameterized instances
+
+ private:
+  int cq_level_;  // test parameter; passed to VP8E_SET_CQ_LEVEL
+  size_t file_size_;  // sum of pkt->data.frame.sz since BeginPassHook
+  double psnr_;  // sum of 10^(psnr[0] / 10) since BeginPassHook
+  int n_frames_;  // number of PSNR packets since BeginPassHook
+};
+
+CQTest::BitrateMap CQTest::bitrates_;
+
+TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) {  // CQ run vs. VBR at same rate
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = kCQTargetBitrate;
+  cfg_.g_lag_in_frames = 25;
+
+  cfg_.rc_end_usage = VPX_CQ;
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 30);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double cq_psnr_lin = GetLinearPSNROverBitrate();
+  const unsigned int cq_actual_bitrate =  // size * 8 bits * 30 / frames / 1000
+      static_cast<unsigned int>(file_size()) * 8 * 30 / (n_frames() * 1000);
+  EXPECT_LE(cq_actual_bitrate, kCQTargetBitrate);
+  bitrates_[cq_level()] = cq_actual_bitrate;  // checked in TearDownTestCase
+
+  // try targeting the approximate same bitrate with VBR mode
+  cfg_.rc_end_usage = VPX_VBR;
+  cfg_.rc_target_bitrate = cq_actual_bitrate;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double vbr_psnr_lin = GetLinearPSNROverBitrate();
+  EXPECT_GE(cq_psnr_lin, vbr_psnr_lin);
+}
+
+VP8_INSTANTIATE_TEST_CASE(CQTest,  // CQ levels 4, 12, 20, ..., 60
+                          ::testing::Range(kCQLevelMin, kCQLevelMax,
+                                           kCQLevelStep));
+}  // namespace
diff --git a/libs/libvpx/test/datarate_test.cc b/libs/libvpx/test/datarate_test.cc
new file mode 100644
index 0000000000..9d5074e303
--- /dev/null
+++ b/libs/libvpx/test/datarate_test.cc
@@ -0,0 +1,971 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vpx/vpx_codec.h"
+
+namespace {
+
+class DatarateTestLarge : public ::libvpx_test::EncoderTest,  // buffer model
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ public:
+  DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {}
+
+  virtual ~DatarateTestLarge() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    ResetModel();
+  }
+
+  virtual void ResetModel() {  // note: denoiser_on_ is not reset here
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    first_drop_ = 0;
+    bits_total_ = 0;
+    duration_ = 0.0;
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0)
+      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically
+        denoiser_on_ ^= 1;
+      }
+      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+    }
+
+    const vpx_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // Time since last timestamp = duration.
+    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+
+    // TODO(jimbankoski): Remove these lines when the issue:
+    // http://code.google.com/p/webm/issues/detail?id=496 is fixed.
+    // For now the codec assumes buffer starts at starting buffer rate
+    // plus one frame's time.
+    if (last_pts_ == 0)
+      duration = 1;
+
+    // Add to the buffer the bits we'd expect from a constant bitrate server.
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+
+    /* Test the buffer model here before subtracting the frame. Do so because
+     * the way the leaky bucket model works in libvpx is to allow the buffer to
+     * empty - and then stop showing frames until we've got enough bits to
+     * show one. As noted in comment below (issue 495), this does not currently
+     * apply to key frames. For now exclude key frames in condition below. */
+    const bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY)
+                         ? true: false;
+    if (!key_frame) {
+      ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
+          << pkt->data.frame.pts;
+    }
+
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+
+    // Subtract from the buffer the bits associated with a played back frame.
+    bits_in_buffer_model_ -= frame_size_in_bits;
+
+    // Update the running total of bits for end of test datarate checks.
+    bits_total_ += frame_size_in_bits;
+
+    // If first drop not set and we have a drop set it to this time.
+    if (!first_drop_ && duration > 1)
+      first_drop_ = last_pts_ + 1;
+
+    // Update the most recent pts.
+    last_pts_ = pkt->data.frame.pts;
+
+    // We update this so that we can calculate the datarate minus the last
+    // frame encoded in the file.
+    bits_in_last_frame_ = frame_size_in_bits;
+
+    ++frame_number_;
+  }
+
+  virtual void EndPassHook(void) {
+    if (bits_total_) {  // datarates are left untouched when nothing was coded
+      const double file_size_in_kb = bits_total_ / 1000.;  // bits per kilobit
+
+      duration_ = (last_pts_ + 1) * timebase_;
+
+      // Effective file datarate includes the time spent prebuffering.
+      effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0
+          / (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
+
+      file_datarate_ = file_size_in_kb / duration_;
+    }
+  }
+
+  vpx_codec_pts_t last_pts_;  // pts of the most recent frame packet
+  int64_t bits_in_buffer_model_;  // modeled buffer level, in bits
+  double timebase_;  // tb.num / tb.den of the video source
+  int frame_number_;  // number of frame packets seen since ResetModel
+  vpx_codec_pts_t first_drop_;  // last_pts_ + 1 at the first pts gap; else 0
+  int64_t bits_total_;  // total bits over all frame packets
+  double duration_;  // (last_pts_ + 1) * timebase_, set in EndPassHook
+  double file_datarate_;  // bits_total_ / 1000 / duration_
+  double effective_datarate_;  // excludes last frame, includes prebuffer time
+  size_t bits_in_last_frame_;  // size in bits of the most recent frame packet
+  int denoiser_on_;  // value passed to VP8E_SET_NOISE_SENSITIVITY
+  int denoiser_offon_test_;  // nonzero: flip denoiser_on_ periodically
+  int denoiser_offon_period_;  // flip period in frames; must be > 0 when used
+};
+
+#if CONFIG_TEMPORAL_DENOISING
+// Check basic datarate targeting, for a single bitrate, but loop over the
+// various denoiser settings.
+TEST_P(DatarateTestLarge, DenoiserLevels) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+  for (int j = 1; j < 5; ++j) {
+    // Run over the denoiser levels.
+    // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j
+    // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV,
+    // denoiserOnAggressive, and denoiserOnAdaptive.
+    // For the spatial denoiser (if !CONFIG_TEMPORAL_DENOISING), the level j
+    // refers to the blur thresholds: 20, 40, 60, 80.
+    // The j = 0 case (denoiser off) is covered in the tests below.
+    denoiser_on_ = j;
+    cfg_.rc_target_bitrate = 300;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
+        << " The datarate for the file missed the target!";
+  }
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestLarge, DenoiserOffOn) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 299);
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();  // clears the off/on flags, so they are set after this call
+  // The denoiser is off by default.
+  denoiser_on_ = 0;
+  // Set the offon test flag.
+  denoiser_offon_test_ = 1;
+  denoiser_offon_period_ = 100;  // flip whenever (frame + 1) % 100 == 0
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
+      << " The datarate for the file missed the target!";
+}
+#endif  // CONFIG_TEMPORAL_DENOISING
+
+TEST_P(DatarateTestLarge, BasicBufferModel) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  // 2 pass cbr datarate control has a bug hidden by the small # of
+  // frames selected in this encode. The problem is that even if the buffer is
+  // negative we produce a keyframe on a cutscene, ignoring datarate
+  // constraints.
+  // TODO(jimbankoski): ( Fix when issue
+  // http://code.google.com/p/webm/issues/detail?id=495 is addressed. )
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+
+  // There is an issue for low bitrates in real-time mode, where the
+  // effective_datarate slightly overshoots the target bitrate.
+  // This is the same issue as noted above (#495).
+  // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100),
+  // when the issue is resolved.
+  for (int i = 100; i < 800; i += 200) {  // targets 100, 300, 500, 700
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
+        << " The datarate for the file missed the target!";
+  }
+}
+
+TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_max_quantizer = 36;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+
+  const int frame_count = 40;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, frame_count);
+
+  // Here we check that the first dropped frame gets earlier and earlier
+  // as the drop frame threshold is increased.
+
+  const int kDropFrameThreshTestStep = 30;
+  vpx_codec_pts_t last_drop = frame_count;  // upper bound for the first run
+  for (int i = 1; i < 91; i += kDropFrameThreshTestStep) {  // i = 1, 31, 61
+    cfg_.rc_dropframe_thresh = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_LE(first_drop_, last_drop)
+        << " The first dropped frame for drop_thresh " << i
+        << " > first dropped frame for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    last_drop = first_drop_;
+  }
+}
+
+class DatarateTestVP9Large : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ public:
+  DatarateTestVP9Large() : EncoderTest(GET_PARAM(0)) {}
+
+ protected:
+  virtual ~DatarateTestVP9Large() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+    ResetModel();
+  }
+
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    tot_frame_number_ = 0;
+    first_drop_ = 0;
+    num_drops_ = 0;
+    // Denoiser is off by default.
+    denoiser_on_ = 0;
+    // For testing up to 3 layers.
+    for (int i = 0; i < 3; ++i) {
+      bits_total_[i] = 0;
+    }
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
+  }
+
+  //
+  // Frame flags and layer id for temporal layers.
+  //
+
+  // For two layers, test pattern is:
+  //   1     3
+  // 0    2     .....
+  // For three layers, test pattern is:
+  //   1      3    5      7
+  //      2           6
+  // 0          4            ....
+  // LAST is always update on base/layer 0, GOLDEN is updated on layer 1.
+  // For this 3 layer example, the 2nd enhancement layer (layer 2) does not
+  // update any reference frames.
+  int SetFrameFlags(int frame_num, int num_temp_layers) {
+    int frame_flags = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        // Layer 0: predict from L and ARF, update L.
+        frame_flags = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF |
+                      VP8_EFLAG_NO_UPD_ARF;
+      } else {
+        // Layer 1: predict from L, G and ARF, and update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_UPD_ENTROPY;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        // Layer 0: predict from L and ARF; update L.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                      VP8_EFLAG_NO_REF_GF;
+      } else if ((frame_num - 2) % 4 == 0) {
+        // Layer 1: predict from L, G, ARF; update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
+      }  else if ((frame_num - 1) % 2 == 0) {
+        // Layer 2: predict from L, G, ARF; update none.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                      VP8_EFLAG_NO_UPD_LAST;
+      }
+    }
+    return frame_flags;
+  }
+
+  int SetLayerId(int frame_num, int num_temp_layers) {
+    int layer_id = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        layer_id = 0;
+      } else {
+        layer_id = 1;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        layer_id = 0;
+      } else if ((frame_num - 2) % 4 == 0) {
+        layer_id = 1;
+      } else if ((frame_num - 1) % 2 == 0) {
+        layer_id = 2;
+      }
+    }
+    return layer_id;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0)
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically
+        denoiser_on_ ^= 1;
+      }
+    }
+
+    encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+
+    if (cfg_.ts_number_layers > 1) {
+      if (video->frame() == 0) {
+        encoder->Control(VP9E_SET_SVC, 1);
+      }
+      vpx_svc_layer_id_t layer_id;
+      layer_id.spatial_layer_id = 0;
+      frame_flags_ = SetFrameFlags(video->frame(), cfg_.ts_number_layers);
+      layer_id.temporal_layer_id = SetLayerId(video->frame(),
+                                              cfg_.ts_number_layers);
+      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+    }
+    const vpx_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // PTS gap to the previously delivered frame; a gap above 1 is treated
+    // as (gap - 1) dropped frames.
+    const vpx_codec_pts_t pts_delta = pkt->data.frame.pts - last_pts_;
+    if (pts_delta > 1) {
+      const int dropped = static_cast<int>(pts_delta - 1);
+      // Remember where the very first drop happened.
+      if (!first_drop_)
+        first_drop_ = last_pts_ + 1;
+      num_drops_ += dropped;
+      // Dropped frames still count as encoder input, which is what the
+      // layer id lookup below is indexed by.
+      tot_frame_number_ += dropped;
+    }
+
+    const int lowest = SetLayerId(tot_frame_number_, cfg_.ts_number_layers);
+
+    // Credit the buffer model with the bits a constant bitrate server
+    // would have delivered over this interval.
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        pts_delta * timebase_ * cfg_.rc_target_bitrate * 1000);
+
+    // The buffer model must never go negative.
+    ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
+        << pkt->data.frame.pts;
+
+    // The frame's bits are added to the total of its own layer and of
+    // every layer above it.
+    const size_t frame_bits = pkt->data.frame.sz * 8;
+    const int layer_count = static_cast<int>(cfg_.ts_number_layers);
+    for (int l = lowest; l < layer_count; ++l)
+      bits_total_[l] += frame_bits;
+
+    // Advance the bookkeeping to this frame.
+    last_pts_ = pkt->data.frame.pts;
+    ++frame_number_;
+    ++tot_frame_number_;
+  }
+
+  virtual void EndPassHook(void) {
+    // Converts each layer's accumulated bit count into effective_datarate_
+    // (bits / 1000, divided by the pts-derived duration of the pass).
+    const int layer_count = static_cast<int>(cfg_.ts_number_layers);
+    for (int l = 0; l < layer_count; ++l) {
+      duration_ = (last_pts_ + 1) * timebase_;
+      if (bits_total_[l] == 0) continue;  // Nothing recorded: leave as is.
+      effective_datarate_[l] = (bits_total_[l] / 1000.0) / duration_;
+    }
+  }
+
+  vpx_codec_pts_t last_pts_;
+  double timebase_;
+  int frame_number_;      // Counter for number of non-dropped/encoded frames.
+  int tot_frame_number_;  // Counter for total number of input frames.
+  int64_t bits_total_[3];
+  double duration_;
+  double effective_datarate_[3];
+  int set_cpu_used_;
+  int64_t bits_in_buffer_model_;
+  vpx_codec_pts_t first_drop_;
+  int num_drops_;
+  int denoiser_on_;
+  int denoiser_offon_test_;
+  int denoiser_offon_period_;
+};
+
+// Check basic rate targeting (CBR) over several target bitrates.
+TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
+  cfg_.g_lag_in_frames = 0;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+  for (int target = 150; target < 800; target += 200) {
+    cfg_.rc_target_bitrate = target;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+        << " The datarate for the file is greater than target by too much!";
+  }
+}
+
+// Check basic rate targeting for the rush_hour_444.y4m clip (g_profile = 1).
+TEST_P(DatarateTestVP9Large, BasicRateTargeting444) {
+  ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+
+  cfg_.g_profile = 1;
+  cfg_.g_timebase = video.timebase();
+
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  // The target must stay within [0.85, 1.15] times the measured datarate.
+  for (int i = 250; i < 900; i += 200) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_[0] * 0.85)
+        << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_[0] * 1.15)
+        << " The datarate for the file missed the target!"
+        << cfg_.rc_target_bitrate << " "<< effective_datarate_[0];
+  }
+}
+
+// Check that (1) the first dropped frame gets earlier and earlier
+// as the drop frame threshold is increased, and (2) that the number of frame
+// drops does not fall below 85% of the previous run's as it is increased.
+// Use a lower qp-max to force some frame drops.
+TEST_P(DatarateTestVP9Large, ChangingDropFrameThresh) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_undershoot_pct = 20;
+  // NOTE(review): this was set twice; was rc_overshoot_pct intended?
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 50;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+
+  const int kDropFrameThreshTestStep = 30;
+  vpx_codec_pts_t last_drop = 140;
+  int last_num_drops = 0;
+  for (int i = 10; i < 100; i += kDropFrameThreshTestStep) {
+    cfg_.rc_dropframe_thresh = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+        << " The datarate for the file is greater than target by too much!";
+    ASSERT_LE(first_drop_, last_drop)
+        << " The first dropped frame for drop_thresh " << i
+        << " > first dropped frame for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    ASSERT_GE(num_drops_, last_num_drops * 0.85)
+        << " The number of dropped frames for drop_thresh " << i
+        << " < number of dropped frames for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    last_drop = first_drop_;
+    last_num_drops = num_drops_;
+  }
+}
+
+// Check basic rate targeting for 2 temporal layers.
+TEST_P(DatarateTestVP9Large, BasicRateTargeting2TemporalLayers) {
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+
+  // One spatial layer carrying 2 temporal layers, decimated by (2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 2;
+  cfg_.ts_rate_decimator[0] = 2;
+  cfg_.ts_rate_decimator[1] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  if (deadline_ == VPX_DL_REALTIME)
+    cfg_.g_error_resilient = 1;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  for (int target = 200; target <= 800; target += 200) {
+    cfg_.rc_target_bitrate = target;
+    ResetModel();
+    // Layer 0 is given 60% of the target; layer 1 is set to the full target.
+    cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+    cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int tl = 0; tl < static_cast<int>(cfg_.ts_number_layers); ++tl) {
+      ASSERT_GE(effective_datarate_[tl], cfg_.layer_target_bitrate[tl] * 0.85)
+          << " The datarate for the file is lower than target by too much, "
+              "for layer: " << tl;
+      ASSERT_LE(effective_datarate_[tl], cfg_.layer_target_bitrate[tl] * 1.15)
+          << " The datarate for the file is greater than target by too much, "
+              "for layer: " << tl;
+    }
+  }
+}
+
+// Check basic rate targeting for 3 temporal layers.
+TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayers) {
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+
+  // One spatial layer carrying 3 temporal layers, decimated by (4, 2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  for (int target = 200; target <= 800; target += 200) {
+    cfg_.rc_target_bitrate = target;
+    ResetModel();
+    // Layer targets are set to 40%, 60% and 100% of the overall target.
+    cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+    cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+    cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int tl = 0; tl < static_cast<int>(cfg_.ts_number_layers); ++tl) {
+      // TODO(yaowu): Work out more stable rc control strategy and
+      //              Adjust the thresholds to be tighter than .75.
+      ASSERT_GE(effective_datarate_[tl], cfg_.layer_target_bitrate[tl] * 0.75)
+          << " The datarate for the file is lower than target by too much, "
+              "for layer: " << tl;
+      // TODO(yaowu): Work out more stable rc control strategy and
+      //              Adjust the thresholds to be tighter than 1.25.
+      ASSERT_LE(effective_datarate_[tl], cfg_.layer_target_bitrate[tl] * 1.25)
+          << " The datarate for the file is greater than target by too much, "
+              "for layer: " << tl;
+    }
+  }
+}
+
+// Check basic rate targeting for 3 temporal layers, with frame dropping.
+// Only for one (low) bitrate with lower max_quantizer, and somewhat higher
+// frame drop threshold, to force frame dropping.
+TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  // Set frame drop threshold and rc_max_quantizer to force some frame drops.
+  cfg_.rc_dropframe_thresh = 20;
+  cfg_.rc_max_quantizer = 45;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  cfg_.rc_target_bitrate = 200;
+  ResetModel();
+  // Layer targets are set to 40%, 60% and 100% of the overall target.
+  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+    ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
+        << " The datarate for the file is lower than target by too much, "
+            "for layer: " << j;
+    ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15)
+        << " The datarate for the file is greater than target by too much, "
+            "for layer: " << j;
+  }
+  // The drop count does not depend on the layer, so it is checked once:
+  // for this 200 frame clip expect at least 10% and at most 65% drops.
+  ASSERT_GE(num_drops_, 20);
+  ASSERT_LE(num_drops_, 130);
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// Check basic datarate targeting, for a single bitrate, when denoiser is on.
+TEST_P(DatarateTestVP9Large, DenoiserLevels) {
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 140);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: denoiserYonly(which is 1),
+  // but may add more modes in the future.
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // Switch the denoiser on after the model reset, before the run.
+  denoiser_on_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic datarate targeting, for a single bitrate, when the denoiser
+// is switched off and on during the run.
+TEST_P(DatarateTestVP9Large, DenoiserOffOn) {
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 299);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: denoiserYonly(which is 1),
+  // but may add more modes in the future.
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // Start with the denoiser off.
+  denoiser_on_ = 0;
+  // Enable the off/on test with a period of 100 frames.
+  denoiser_offon_test_ = 1;
+  denoiser_offon_period_ = 100;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+#endif  // CONFIG_VP9_TEMPORAL_DENOISING
+
+// Test fixture for 1 pass CBR SVC datarate checks; params: test mode, speed.
+class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ public:
+  DatarateOnePassCbrSvc() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~DatarateOnePassCbrSvc() {}
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    speed_setting_ = GET_PARAM(2);
+    ResetModel();
+  }
+  // Clears per-run state, including the derived rates read by the tests.
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    first_drop_ = 0;
+    bits_total_ = 0;
+    bits_in_last_frame_ = 0;
+    duration_ = 0.0;
+    file_datarate_ = 0.0;
+    effective_datarate_ = 0.0;
+    mismatch_psnr_ = 0.0;
+    mismatch_nframes_ = 0;
+  }
+  virtual void BeginPassHook(unsigned int /*pass*/) {}
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      for (int i = 0; i < VPX_MAX_LAYERS; ++i) {
+        svc_params_.max_quantizers[i] = 63;
+        svc_params_.min_quantizers[i] = 0;
+      }
+      svc_params_.scaling_factor_num[0] = 144;
+      svc_params_.scaling_factor_den[0] = 288;
+      svc_params_.scaling_factor_num[1] = 288;
+      svc_params_.scaling_factor_den[1] = 288;
+      encoder->Control(VP9E_SET_SVC, 1);
+      encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
+      encoder->Control(VP8E_SET_CPUUSED, speed_setting_);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, 0);
+      encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1));
+    }
+    const vpx_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+  // Updates the buffer model and the bit totals for one output frame.
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+    if (last_pts_ == 0)
+      duration = 1;
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+    const bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
+    if (!key_frame) {
+      ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
+          << pkt->data.frame.pts;
+    }
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+    bits_in_buffer_model_ -= frame_size_in_bits;
+    bits_total_ += frame_size_in_bits;
+    if (!first_drop_ && duration > 1)
+      first_drop_ = last_pts_ + 1;
+    last_pts_ = pkt->data.frame.pts;
+    bits_in_last_frame_ = frame_size_in_bits;
+    ++frame_number_;
+  }
+  // Derives effective_datarate_ and file_datarate_ from the bit totals.
+  virtual void EndPassHook(void) {
+    if (bits_total_) {
+      duration_ = (last_pts_ + 1) * timebase_;
+      effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0
+          / (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
+      file_datarate_ = bits_total_ / 1000. / duration_;
+    }
+  }
+
+  // Accumulates the PSNR and the count of frames reported as mismatching.
+  virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) {
+    mismatch_psnr_ += compute_psnr(img1, img2);
+    ++mismatch_nframes_;
+  }
+
+  unsigned int GetMismatchFrames() { return mismatch_nframes_; }
+
+  vpx_codec_pts_t last_pts_;
+  int64_t bits_in_buffer_model_;
+  double timebase_;
+  int frame_number_;
+  vpx_codec_pts_t first_drop_;
+  int64_t bits_total_;
+  double duration_;
+  double file_datarate_;
+  double effective_datarate_;
+  size_t bits_in_last_frame_;
+  vpx_svc_extra_cfg_t svc_params_;
+  int speed_setting_;
+  double mismatch_psnr_;
+  int mismatch_nframes_;
+};
+// Splits enc_cfg->rc_target_bitrate across the spatial layers in proportion
+// to their scaling factors, then sets each layer's temporal targets to
+// (1/2, 3/4, 1) of that share for mode 3, or (2/3, 1) for mode 2.
+static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
+    const vpx_svc_extra_cfg_t *svc_params,
+    int spatial_layers,
+    int temporal_layers,
+    int temporal_layering_mode) {
+  float total = 0;
+  float alloc_ratio[VPX_MAX_LAYERS] = {0};
+  for (int sl = 0; sl < spatial_layers; ++sl) {
+    if (svc_params->scaling_factor_den[sl] > 0) {
+      alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] *
+          1.0 / svc_params->scaling_factor_den[sl]);
+      total += alloc_ratio[sl];
+    }
+  }
+  // Without a usable scaling factor the shares below would divide by zero.
+  if (total <= 0) return;
+  for (int sl = 0; sl < spatial_layers; ++sl) {
+    const int spatial_layer_target = (int)
+        (unsigned int)(enc_cfg->rc_target_bitrate * alloc_ratio[sl] / total);
+    enc_cfg->ss_target_bitrate[sl] = spatial_layer_target;
+    const int index = sl * temporal_layers;
+    if (temporal_layering_mode == 3) {
+      enc_cfg->layer_target_bitrate[index] = spatial_layer_target >> 1;
+      enc_cfg->layer_target_bitrate[index + 1] =
+          (spatial_layer_target >> 1) + (spatial_layer_target >> 2);
+      enc_cfg->layer_target_bitrate[index + 2] = spatial_layer_target;
+    } else if (temporal_layering_mode == 2) {
+      enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3;
+      enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target;
+    }
+  }
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
+// 3 temporal layers. Run CIF clip with 1 thread.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc) {
+  cfg_.g_threads = 1;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 144;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 288;
+  svc_params_.scaling_factor_den[1] = 288;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  // TODO(wonkap/marpan): Check that effective_datarate for each layer hits the
+  // layer target_bitrate. Also check if test can pass at lower bitrate (~200k).
+  for (int target = 400; target <= 800; target += 200) {
+    cfg_.rc_target_bitrate = target;
+    ResetModel();
+    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+        cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
+            << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
+        << " The datarate for the file is lower than the target by too much!";
+    EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+  }
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
+// 3 temporal layers. Run HD clip with 4 threads.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc4threads) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 4;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 144;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 288;
+  svc_params_.scaling_factor_den[1] = 288;
+  cfg_.rc_dropframe_thresh = 10;
+  // The clip is a Y4M file, so it is opened with the Y4M source, not as raw.
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+      cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
+          << " The datarate for the file exceeds the target by too much!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
+      << " The datarate for the file is lower than the target by too much!";
+  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+}
+
+VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES);
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
+                          ::testing::Values(::libvpx_test::kOnePassGood,
+                                            ::libvpx_test::kRealTime),
+                          ::testing::Range(2, 7));  // values 2..6
+VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvc,
+                          ::testing::Values(::libvpx_test::kRealTime),
+                          ::testing::Range(5, 8));  // speed_setting_: 5..7
+}  // namespace
diff --git a/libs/libvpx/test/dct16x16_test.cc b/libs/libvpx/test/dct16x16_test.cc
new file mode 100644
index 0000000000..d6cc5e443b
--- /dev/null
+++ b/libs/libvpx/test/dct16x16_test.cc
@@ -0,0 +1,916 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+#ifdef _MSC_VER
+static int round(double x) {
+  // Rounds to the nearest integer, with halves going away from zero;
+  // only compiled under the enclosing _MSC_VER guard.
+  const double nearest = (x < 0) ? ceil(x - 0.5) : floor(x + 0.5);
+  return static_cast<int>(nearest);
+}
+#endif
+
+const int kNumCoeffs = 256;  // 16 * 16 coefficients per block
+const double C1 = 0.995184726672197;  // cos(1 * pi / 32)
+const double C2 = 0.98078528040323;  // cos(2 * pi / 32)
+const double C3 = 0.956940335732209;  // cos(3 * pi / 32)
+const double C4 = 0.923879532511287;  // cos(4 * pi / 32)
+const double C5 = 0.881921264348355;  // cos(5 * pi / 32)
+const double C6 = 0.831469612302545;  // cos(6 * pi / 32)
+const double C7 = 0.773010453362737;  // cos(7 * pi / 32)
+const double C8 = 0.707106781186548;  // cos(8 * pi / 32)
+const double C9 = 0.634393284163646;  // cos(9 * pi / 32)
+const double C10 = 0.555570233019602;  // cos(10 * pi / 32)
+const double C11 = 0.471396736825998;  // cos(11 * pi / 32)
+const double C12 = 0.38268343236509;  // cos(12 * pi / 32)
+const double C13 = 0.290284677254462;  // cos(13 * pi / 32)
+const double C14 = 0.195090322016128;  // cos(14 * pi / 32)
+const double C15 = 0.098017140329561;  // cos(15 * pi / 32)
+
+void butterfly_16x16_dct_1d(double input[16], double output[16]) {
+  double step[16];
+  double intermediate[16];
+  double temp1, temp2;
+  // 1-D, 16-point butterfly DCT in doubles; output[] doubles as scratch.
+  // step 1: fold input[] into sums (step[0..7]) and differences (step[8..15])
+  step[ 0] = input[0] + input[15];
+  step[ 1] = input[1] + input[14];
+  step[ 2] = input[2] + input[13];
+  step[ 3] = input[3] + input[12];
+  step[ 4] = input[4] + input[11];
+  step[ 5] = input[5] + input[10];
+  step[ 6] = input[6] + input[ 9];
+  step[ 7] = input[7] + input[ 8];
+  step[ 8] = input[7] - input[ 8];
+  step[ 9] = input[6] - input[ 9];
+  step[10] = input[5] - input[10];
+  step[11] = input[4] - input[11];
+  step[12] = input[3] - input[12];
+  step[13] = input[2] - input[13];
+  step[14] = input[1] - input[14];
+  step[15] = input[0] - input[15];
+
+  // step 2: results are parked in output[0..15], used here as scratch
+  output[0] = step[0] + step[7];
+  output[1] = step[1] + step[6];
+  output[2] = step[2] + step[5];
+  output[3] = step[3] + step[4];
+  output[4] = step[3] - step[4];
+  output[5] = step[2] - step[5];
+  output[6] = step[1] - step[6];
+  output[7] = step[0] - step[7];
+
+  temp1 = step[ 8] * C7;
+  temp2 = step[15] * C9;
+  output[ 8] = temp1 + temp2;
+
+  temp1 = step[ 9] * C11;
+  temp2 = step[14] * C5;
+  output[ 9] = temp1 - temp2;
+
+  temp1 = step[10] * C3;
+  temp2 = step[13] * C13;
+  output[10] = temp1 + temp2;
+
+  temp1 = step[11] * C15;
+  temp2 = step[12] * C1;
+  output[11] = temp1 - temp2;
+
+  temp1 = step[11] * C1;
+  temp2 = step[12] * C15;
+  output[12] = temp2 + temp1;
+
+  temp1 = step[10] * C13;
+  temp2 = step[13] * C3;
+  output[13] = temp2 - temp1;
+
+  temp1 = step[ 9] * C5;
+  temp2 = step[14] * C11;
+  output[14] = temp2 + temp1;
+
+  temp1 = step[ 8] * C9;
+  temp2 = step[15] * C7;
+  output[15] = temp2 - temp1;
+
+  // step 3: combine the scratch values in output[] back into step[]
+  step[ 0] = output[0] + output[3];
+  step[ 1] = output[1] + output[2];
+  step[ 2] = output[1] - output[2];
+  step[ 3] = output[0] - output[3];
+
+  temp1 = output[4] * C14;
+  temp2 = output[7] * C2;
+  step[ 4] = temp1 + temp2;
+
+  temp1 = output[5] * C10;
+  temp2 = output[6] * C6;
+  step[ 5] = temp1 + temp2;
+
+  temp1 = output[5] * C6;
+  temp2 = output[6] * C10;
+  step[ 6] = temp2 - temp1;
+
+  temp1 = output[4] * C2;
+  temp2 = output[7] * C14;
+  step[ 7] = temp2 - temp1;
+
+  step[ 8] = output[ 8] + output[11];
+  step[ 9] = output[ 9] + output[10];
+  step[10] = output[ 9] - output[10];
+  step[11] = output[ 8] - output[11];
+
+  step[12] = output[12] + output[15];
+  step[13] = output[13] + output[14];
+  step[14] = output[13] - output[14];
+  step[15] = output[12] - output[15];
+
+  // step 4: write the final value of every output[0..15] entry
+  output[ 0] = (step[ 0] + step[ 1]);
+  output[ 8] = (step[ 0] - step[ 1]);
+
+  temp1 = step[2] * C12;
+  temp2 = step[3] * C4;
+  temp1 = temp1 + temp2;
+  output[ 4] = 2*(temp1 * C8);
+
+  temp1 = step[2] * C4;
+  temp2 = step[3] * C12;
+  temp1 = temp2 - temp1;
+  output[12] = 2 * (temp1 * C8);
+
+  output[ 2] = 2 * ((step[4] + step[ 5]) * C8);
+  output[14] = 2 * ((step[7] - step[ 6]) * C8);
+
+  temp1 = step[4] - step[5];
+  temp2 = step[6] + step[7];
+  output[ 6] = (temp1 + temp2);
+  output[10] = (temp1 - temp2);
+
+  intermediate[8] = step[8] + step[14];
+  intermediate[9] = step[9] + step[15];
+
+  temp1 = intermediate[8] * C12;
+  temp2 = intermediate[9] * C4;
+  temp1 = temp1 - temp2;
+  output[3] = 2 * (temp1 * C8);
+
+  temp1 = intermediate[8] * C4;
+  temp2 = intermediate[9] * C12;
+  temp1 = temp2 + temp1;
+  output[13] = 2 * (temp1 * C8);
+
+  output[ 9] = 2 * ((step[10] + step[11]) * C8);
+
+  intermediate[11] = step[10] - step[11];
+  intermediate[12] = step[12] + step[13];
+  intermediate[13] = step[12] - step[13];
+  intermediate[14] = step[ 8] - step[14];
+  intermediate[15] = step[ 9] - step[15];
+
+  output[15] = (intermediate[11] + intermediate[12]);
+  output[ 1] = -(intermediate[11] - intermediate[12]);
+
+  output[ 7] = 2 * (intermediate[13] * C8);
+
+  temp1 = intermediate[14] * C12;
+  temp2 = intermediate[15] * C4;
+  temp1 = temp1 - temp2;
+  output[11] = -2 * (temp1 * C8);
+
+  temp1 = intermediate[14] * C4;
+  temp2 = intermediate[15] * C12;
+  temp1 = temp2 + temp1;
+  output[ 5] = 2 * (temp1 * C8);
+}
+
+void reference_16x16_dct_2d(int16_t input[256], double output[256]) {
+  // Separable 2-D reference transform: a 1-D pass down every column of
+  // |input| into |output|, then a 1-D pass along every row of |output|.
+  double line_in[16], line_out[16];
+  for (int col = 0; col < 16; ++col) {
+    for (int row = 0; row < 16; ++row)
+      line_in[row] = input[row * 16 + col];
+    butterfly_16x16_dct_1d(line_in, line_out);
+    for (int row = 0; row < 16; ++row)
+      output[row * 16 + col] = line_out[row];
+  }
+  for (int row = 0; row < 16; ++row) {
+    double *const line = output + row * 16;
+    for (int col = 0; col < 16; ++col)
+      line_in[col] = line[col];
+    butterfly_16x16_dct_1d(line_in, line_out);
+    // Every row result is halved before it is stored.
+    for (int col = 0; col < 16; ++col)
+      line[col] = line_out[col] / 2;
+  }
+}
+
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
+                        int tx_type);
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type);
+
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
+typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>
+    Idct16x16Param;
+
+// Calls vpx_fdct16x16_c; the trailing int (tx_type) is accepted but unused.
+void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride, int) {
+  vpx_fdct16x16_c(in, out, stride);
+}
+
+// Calls vpx_idct16x16_256_add_c; the trailing int (tx_type) is unused.
+void idct16x16_ref(const tran_low_t *in, uint8_t *dest, int stride, int) {
+  vpx_idct16x16_256_add_c(in, dest, stride);
+}
+
+// Forwards all four arguments unchanged to vp9_fht16x16_c.
+void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride, int type) {
+  vp9_fht16x16_c(in, out, stride, type);
+}
+
+// Forwards all four arguments unchanged to vp9_iht16x16_256_add_c.
+void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride, int type) {
+  vp9_iht16x16_256_add_c(in, dest, stride, type);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_256_add_c(in, out, stride, 10);
+}
+// As idct16x16_10, but passes 12 as the final argument.
+void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_256_add_c(in, out, stride, 12);
+}
+// idct16x16_10 with an extra, ignored tx_type parameter (IhtFunc shape).
+void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
+                      int /*tx_type*/) {
+  idct16x16_10(in, out, stride);
+}
+// idct16x16_12 with an extra, ignored tx_type parameter (IhtFunc shape).
+void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
+                      int /*tx_type*/) {
+  idct16x16_12(in, out, stride);
+}
+// vp9_highbd_iht16x16_256_add_c with its final argument fixed to 10.
+void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10);
+}
+// vp9_highbd_iht16x16_256_add_c with its final argument fixed to 12.
+void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
+}
+
+#if HAVE_SSE2
+void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_10_add_c(in, out, stride, 10);
+}
+// vpx_highbd_idct16x16_10_add_c with its final argument fixed to 12.
+void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_10_add_c(in, out, stride, 12);
+}
+// vpx_highbd_idct16x16_256_add_sse2 with its final argument fixed to 10.
+void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
+}
+// vpx_highbd_idct16x16_256_add_sse2 with its final argument fixed to 12.
+void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 12);
+}
+// vpx_highbd_idct16x16_10_add_sse2 with its final argument fixed to 10.
+void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 10);
+}
+// vpx_highbd_idct16x16_10_add_sse2 with its final argument fixed to 12.
+void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 12);
+}
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+// Shared 16x16 transform checks; fixtures plug kernels in via the Run* hooks.
+class Trans16x16TestBase {
+ public:
+  virtual ~Trans16x16TestBase() {}
+
+ protected:
+  virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
+
+  virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;
+
+  void RunAccuracyCheck() {  // round-trip (fwd then inv) error check
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    uint32_t max_error = 0;
+    int64_t total_error = 0;
+    const int count_test_block = 10000;
+    for (int i = 0; i < count_test_block; ++i) {
+      DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
+      DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+#endif
+
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
+      }
+      // Forward-transform the residual, then run the inverse transform onto dst.
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
+                                          test_temp_block, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
+      // diff is unsigned: a negative difference wraps, but its square is right.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ?  dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        const uint32_t diff = dst[j] - src[j];
+#endif
+        const uint32_t error = diff * diff;
+        if (max_error < error)
+          max_error = error;
+        total_error += error;
+      }
+    }
+    // Per-sample and total squared-error limits, scaled by 4^(bit_depth_ - 8).
+    EXPECT_GE(1u  << 2 * (bit_depth_ - 8), max_error)
+        << "Error: 16x16 FHT/IHT has an individual round trip error > 1";
+
+    EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)
+        << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
+  }
+
+  void RunCoeffCheck() {  // forward output must equal fwd_txfm_ref's output
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+
+      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
+
+      // Every coefficient must match the reference transform exactly.
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_EQ(output_block[j], output_ref_block[j]);
+    }
+  }
+
+  void RunMemCheck() {  // extreme inputs: match reference, bounded magnitude
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Fill the block with +mask_ or -mask_, picked at random per coefficient.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
+      }
+      if (i == 0) {  // first block: every input is +mask_
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = mask_;
+      } else if (i == 1) {  // second block: every input is -mask_
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = -mask_;
+      }
+
+      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
+                                          output_block, pitch_));
+
+      // The minimum quant value is 4.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        EXPECT_EQ(output_block[j], output_ref_block[j]);
+        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
+            << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+      }
+    }
+  }
+  // Quantizes reference coefficients, then compares RunInvTxfm to inv_txfm_ref.
+  void RunQuantCheck(int dc_thred, int ac_thred) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 100000;
+    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+    // Reconstruction buffers: dst for the tested inverse, ref for inv_txfm_ref.
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Fill the block with +mask_ or -mask_, picked at random per coefficient.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
+      }
+      if (i == 0)  // first block: every input is +mask_
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = mask_;
+      if (i == 1)  // second block: every input is -mask_
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = -mask_;
+
+      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
+
+      // clear reconstructed pixel buffers
+      memset(dst, 0, kNumCoeffs * sizeof(uint8_t));
+      memset(ref, 0, kNumCoeffs * sizeof(uint8_t));
+#if CONFIG_VP9_HIGHBITDEPTH
+      memset(dst16, 0, kNumCoeffs * sizeof(uint16_t));
+      memset(ref16, 0, kNumCoeffs * sizeof(uint16_t));
+#endif
+
+      // quantization with maximum allowed step sizes
+      output_ref_block[0] = (output_ref_block[0] / dc_thred) * dc_thred;
+      for (int j = 1; j < kNumCoeffs; ++j)
+        output_ref_block[j] = (output_ref_block[j] / ac_thred) * ac_thred;
+      if (bit_depth_ == VPX_BITS_8) {
+        inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_,
+                     tx_type_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block,
+                                            CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
+      if (bit_depth_ == VPX_BITS_8) {  // both reconstructions must be identical
+        for (int j = 0; j < kNumCoeffs; ++j)
+          EXPECT_EQ(ref[j], dst[j]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        for (int j = 0; j < kNumCoeffs; ++j)
+          EXPECT_EQ(ref16[j], dst16[j]);
+#endif
+      }
+    }
+  }
+
+  void RunInvAccuracyCheck() {  // inverse kernel vs. a double-precision DCT
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int i = 0; i < count_test_block; ++i) {
+      double out_r[kNumCoeffs];
+
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          in[j] = src16[j] - dst16[j];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+      // Reference forward DCT in doubles, rounded to integer coefficients.
+      reference_16x16_dct_2d(in, out_r);
+      for (int j = 0; j < kNumCoeffs; ++j)
+        coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
+      // Use pitch_ (not a literal 16) so the stride tracks the fixture setup.
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            pitch_));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+      // Each reconstructed sample may differ from its source by at most 1.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        const uint32_t diff = dst[j] - src[j];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t error = diff * diff;
+        EXPECT_GE(1u, error)
+            << "Error: 16x16 IDCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+  // ref_txfm and RunInvTxfm must agree on sparse random coefficient blocks.
+  void CompareInvReference(IdctFunc ref_txfm, int thresh) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 10;  // only the first eob scan positions are non-zero
+    const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int i = 0; i < count_test_block; ++i) {
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (j < eob) {
+          // Random value below thresh; positive on even blocks, negative on odd.
+          coeff[scan[j]] = rnd(thresh) * (1 - 2 * (i % 2));
+        } else {
+          coeff[scan[j]] = 0;
+        }
+        if (bit_depth_ == VPX_BITS_8) {  // both outputs start from zero
+          dst[j] = 0;
+          ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          dst16[j] = 0;
+          ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        ref_txfm(coeff, ref, pitch_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+      } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                 pitch_));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+      // Any non-zero difference between the two outputs is a failure.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
+#else
+        const uint32_t diff = dst[j] - ref[j];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t error = diff * diff;
+        EXPECT_EQ(0u, error)
+            << "Error: 16x16 IDCT Comparison has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+
+  int pitch_;                  // stride handed to the transforms
+  int tx_type_;                // passed to fwd_txfm_ref / inv_txfm_ref
+  vpx_bit_depth_t bit_depth_;  // VPX_BITS_8 selects the uint8_t buffers
+  int mask_;                   // (1 << bit_depth_) - 1, set by each SetUp
+  FhtFunc fwd_txfm_ref;        // reference forward transform
+  IhtFunc inv_txfm_ref;        // reference inverse transform
+};
+
+class Trans16x16DCT
+    : public Trans16x16TestBase,
+      public ::testing::TestWithParam<Dct16x16Param> {
+ public:
+  virtual ~Trans16x16DCT() {}
+
+  // Unpacks the (forward fn, inverse fn, tx_type, bit depth) parameter tuple
+  // and picks the reference transforms that go with the bit depth.
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_ = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
+    pitch_ = 16;
+    // mask_ has the low bit_depth_ bits set.
+    mask_ = (1 << bit_depth_) - 1;
+    fwd_txfm_ref = fdct16x16_ref;
+    // The 8-bit reference is the default; high bit-depth builds swap in the
+    // 10/12-bit variant when the parameter asks for it.
+    inv_txfm_ref = idct16x16_ref;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (bit_depth_ == VPX_BITS_10) {
+      inv_txfm_ref = idct16x16_10_ref;
+    } else if (bit_depth_ == VPX_BITS_12) {
+      inv_txfm_ref = idct16x16_12_ref;
+    }
+#endif
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  // Hooks used by the base-class checks; DCT kernels take no tx_type argument.
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride);
+  }
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+
+  // Kernels under test, supplied by the test parameter.
+  FdctFunc fwd_txfm_;
+  IdctFunc inv_txfm_;
+};
+
+TEST_P(Trans16x16DCT, AccuracyCheck) {  // each case runs one shared base check
+  RunAccuracyCheck();
+}
+
+TEST_P(Trans16x16DCT, CoeffCheck) {
+  RunCoeffCheck();
+}
+
+TEST_P(Trans16x16DCT, MemCheck) {
+  RunMemCheck();
+}
+
+TEST_P(Trans16x16DCT, QuantCheck) {
+  // Use maximally allowed quantization step sizes for DC and AC
+  // coefficients respectively.
+  RunQuantCheck(1336, 1828);  // (dc_thred, ac_thred)
+}
+
+TEST_P(Trans16x16DCT, InvAccuracyCheck) {
+  RunInvAccuracyCheck();
+}
+
+class Trans16x16HT
+    : public Trans16x16TestBase,
+      public ::testing::TestWithParam<Ht16x16Param> {
+ public:
+  virtual ~Trans16x16HT() {}
+
+  // Unpacks the (forward fn, inverse fn, tx_type, bit depth) parameter tuple
+  // and picks the reference transforms that go with the bit depth.
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_ = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
+    pitch_ = 16;
+    // mask_ has the low bit_depth_ bits set.
+    mask_ = (1 << bit_depth_) - 1;
+    fwd_txfm_ref = fht16x16_ref;
+    // The 8-bit reference is the default; high bit-depth builds swap in the
+    // 10/12-bit variant when the parameter asks for it.
+    inv_txfm_ref = iht16x16_ref;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (bit_depth_ == VPX_BITS_10) {
+      inv_txfm_ref = iht16x16_10;
+    } else if (bit_depth_ == VPX_BITS_12) {
+      inv_txfm_ref = iht16x16_12;
+    }
+#endif
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  // Hooks used by the base-class checks; tx_type_ is appended to each call.
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);
+  }
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);
+  }
+
+  // Kernels under test, supplied by the test parameter.
+  FhtFunc fwd_txfm_;
+  IhtFunc inv_txfm_;
+};
+
+TEST_P(Trans16x16HT, AccuracyCheck) {  // each case runs one shared base check
+  RunAccuracyCheck();
+}
+
+TEST_P(Trans16x16HT, CoeffCheck) {
+  RunCoeffCheck();
+}
+
+TEST_P(Trans16x16HT, MemCheck) {
+  RunMemCheck();
+}
+
+TEST_P(Trans16x16HT, QuantCheck) {
+  // The encoder skips any non-DC intra prediction modes,
+  // when the quantization step size goes beyond 988.
+  RunQuantCheck(429, 729);  // (dc_thred, ac_thred)
+}
+
+class InvTrans16x16DCT
+    : public Trans16x16TestBase,
+      public ::testing::TestWithParam<Idct16x16Param> {
+ public:
+  virtual ~InvTrans16x16DCT() {}
+  // Leaves tx_type_ and the *_txfm_ref pointers unset; no test here reads them.
+  virtual void SetUp() {
+    pitch_ = 16;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
+    thresh_ = GET_PARAM(2);    // bound handed to CompareInvReference
+    inv_txfm_ = GET_PARAM(1);  // inverse transform under test
+    ref_txfm_ = GET_PARAM(0);  // inverse transform it is compared against
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+
+  IdctFunc ref_txfm_;
+  IdctFunc inv_txfm_;
+  int thresh_;
+};
+
+TEST_P(InvTrans16x16DCT, CompareReference) {  // inv_txfm_ must match ref_txfm_
+  CompareInvReference(ref_txfm_, thresh_);
+}
+
+using std::tr1::make_tuple;
+
+#if CONFIG_VP9_HIGHBITDEPTH  // tuples: (fwd, inv, tx_type, bit depth)
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 0, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH  // one entry per tx_type 0..3 at each bit depth
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    NEON, Trans16x16DCT,  // C forward transform paired with the NEON inverse
+    ::testing::Values(
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8)));
+#endif
+
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans16x16DCT,  // SSE2 forward and inverse together
+    ::testing::Values(
+        make_tuple(&vpx_fdct16x16_sse2,
+                   &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0,
+                   VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1,
+                   VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2,
+                   VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3,
+                   VPX_BITS_8)));
+#endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans16x16DCT,  // each entry pairs one SSE2 kernel with a C kernel
+    ::testing::Values(
+        make_tuple(&vpx_highbd_fdct16x16_sse2,
+                   &idct16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct16x16_c,
+                   &idct16x16_256_add_10_sse2, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct16x16_sse2,
+                   &idct16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vpx_highbd_fdct16x16_c,
+                   &idct16x16_256_add_12_sse2, 0, VPX_BITS_12),
+        make_tuple(&vpx_fdct16x16_sse2,
+                   &vpx_idct16x16_256_add_c, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans16x16HT,  // SSE2 forward transform with the C inverse
+    ::testing::Values(
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 3,
+                   VPX_BITS_8)));
+// Optimizations take effect at a threshold of 3155, so we use a value close to
+// that to test both branches.
+INSTANTIATE_TEST_CASE_P(
+    SSE2, InvTrans16x16DCT,  // tuples: (C reference, SSE2, thresh, bit depth)
+    ::testing::Values(
+        make_tuple(&idct16x16_10_add_10_c,
+                   &idct16x16_10_add_10_sse2, 3167, VPX_BITS_10),
+        make_tuple(&idct16x16_10,
+                   &idct16x16_256_add_10_sse2, 3167, VPX_BITS_10),
+        make_tuple(&idct16x16_10_add_12_c,
+                   &idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
+        make_tuple(&idct16x16_12,
+                   &idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
+#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    MSA, Trans16x16DCT,  // MSA forward and inverse together
+    ::testing::Values(
+        make_tuple(&vpx_fdct16x16_msa,
+                   &vpx_idct16x16_256_add_msa, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    MSA, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3,
+                   VPX_BITS_8)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+}  // namespace
diff --git a/libs/libvpx/test/dct32x32_test.cc b/libs/libvpx/test/dct32x32_test.cc
new file mode 100644
index 0000000000..2dac10bc1f
--- /dev/null
+++ b/libs/libvpx/test/dct32x32_test.cc
@@ -0,0 +1,391 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+#ifdef _MSC_VER
+// Local round() for MSVC builds: rounds half away from zero and returns an
+// int (the fractional .5 cases go up for positives, down for negatives).
+static int round(double x) {
+  const double nearest = (x < 0) ? ceil(x - 0.5) : floor(x + 0.5);
+  return static_cast<int>(nearest);
+}
+#endif
+
+const int kNumCoeffs = 1024;
+const double kPi = 3.141592653589793238462643383279502884;
+// Naive O(N^2) 32-point DCT-II; only the DC term is scaled (by 1/sqrt(2)).
+void reference_32x32_dct_1d(const double in[32], double out[32]) {
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int freq = 0; freq < 32; ++freq) {
+    out[freq] = 0.0;
+    for (int pos = 0; pos < 32; ++pos)
+      out[freq] += in[pos] * cos(kPi * (2 * pos + 1) * freq / 64.0);
+    if (freq == 0) out[freq] *= kInvSqrt2;
+  }
+}
+
+void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
+                            double output[kNumCoeffs]) {
+  double line_in[32], line_out[32];  // scratch for one column or row
+  // Pass 1: 1-D transform down every column of the integer input.
+  for (int col = 0; col < 32; ++col) {
+    for (int row = 0; row < 32; ++row)
+      line_in[row] = input[row * 32 + col];
+    reference_32x32_dct_1d(line_in, line_out);
+    for (int row = 0; row < 32; ++row)
+      output[row * 32 + col] = line_out[row];
+  }
+  // Pass 2: 1-D transform along every row of the pass-1 result, in place.
+  for (int row = 0; row < 32; ++row) {
+    double *const row_ptr = output + row * 32;
+    for (int col = 0; col < 32; ++col)
+      line_in[col] = row_ptr[col];
+    reference_32x32_dct_1d(line_in, line_out);
+    // Every final coefficient is divided by 4 as it is written back.
+    for (int col = 0; col < 32; ++col)
+      row_ptr[col] = line_out[col] / 4;
+  }
+}
+// Transform pointer types and the (fwd, inv, version, bit depth) test tuple.
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
+
+typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
+    Trans32x32Param;
+
+#if CONFIG_VP9_HIGHBITDEPTH  // fix the bit depth so these fit InvTxfmFunc
+void idct32x32_10(const tran_low_t *coeffs, uint8_t *dest, int stride) {
+  vpx_highbd_idct32x32_1024_add_c(coeffs, dest, stride, 10);
+}
+
+void idct32x32_12(const tran_low_t *coeffs, uint8_t *dest, int stride) {
+  vpx_highbd_idct32x32_1024_add_c(coeffs, dest, stride, 12);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
+ public:
+  virtual ~Trans32x32Test() {}
+  virtual void SetUp() {  // unpack (fwd, inv, version, bit depth)
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    version_ = GET_PARAM(2);  // 0: high precision, 1: low precision (rd loop)
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;  // low bit_depth_ bits set
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  // Kernels under test and the parameters they are exercised with.
+  FwdTxfmFunc fwd_txfm_;
+  InvTxfmFunc inv_txfm_;
+  int version_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
+};
+
+TEST_P(Trans32x32Test, AccuracyCheck) {  // round-trip (fwd then inv) error
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  uint32_t max_error = 0;
+  int64_t total_error = 0;
+  const int count_test_block = 10000;
+  DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+#endif
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Initialize a test block with input range [-mask_, mask_].
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (bit_depth_ == VPX_BITS_8) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        src16[j] = rnd.Rand16() & mask_;
+        dst16[j] = rnd.Rand16() & mask_;
+        test_input_block[j] = src16[j] - dst16[j];
+#endif
+      }
+    }
+    // Forward-transform the residual, then run the inverse transform onto dst.
+    ASM_REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32));
+    if (bit_depth_ == VPX_BITS_8) {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block,
+                                         CONVERT_TO_BYTEPTR(dst16), 32));
+#endif
+    }
+    // diff is unsigned: a negative difference wraps, but its square is right.
+    for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      const uint32_t diff =
+          bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+      const uint32_t diff = dst[j] - src[j];
+#endif
+      const uint32_t error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+  // Relax the bounds for the low precision (version_ == 1) forward transform.
+  if (version_ == 1) {
+    max_error /= 2;
+    total_error /= 45;
+  }
+  // Per-sample and total squared-error limits, scaled by 4^(bit_depth_ - 8).
+  EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error)
+      << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";
+
+  EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)
+      << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
+}
+
+TEST_P(Trans32x32Test, CoeffCheck) {
+  const int kStride = 32;
+  const int kBlocks = 1000;
+  DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  for (int block = 0; block < kBlocks; ++block) {
+    // Random residual: difference of two samples masked to the bit depth.
+    for (int j = 0; j < kNumCoeffs; ++j)
+      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+
+    vpx_fdct32x32_c(input_block, output_ref_block, kStride);
+    ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, kStride));
+    // version_ 0 must match exactly; any other version may be off by up to 6.
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (version_ == 0) {
+        EXPECT_EQ(output_block[j], output_ref_block[j])
+            << "Error: 32x32 FDCT versions have mismatched coefficients";
+      } else {
+        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
+            << "Error: 32x32 FDCT rd has mismatched coefficients";
+      }
+    }
+  }
+}
+
+TEST_P(Trans32x32Test, MemCheck) {  // extreme inputs: bounded, near reference
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 2000;
+
+  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Fill the block with +mask_ or -mask_, picked at random per coefficient.
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      input_extreme_block[j] = rnd.Rand8() & 1 ? mask_ : -mask_;
+    }
+    if (i == 0) {  // first block: every input is +mask_
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_extreme_block[j] = mask_;
+    } else if (i == 1) {  // second block: every input is -mask_
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_extreme_block[j] = -mask_;
+    }
+
+    const int stride = 32;
+    vpx_fdct32x32_c(input_extreme_block, output_ref_block, stride);
+    ASM_REGISTER_STATE_CHECK(
+        fwd_txfm_(input_extreme_block, output_block, stride));
+
+    // The minimum quant value is 4.
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (version_ == 0) {  // exact match required
+        EXPECT_EQ(output_block[j], output_ref_block[j])
+            << "Error: 32x32 FDCT versions have mismatched coefficients";
+      } else {  // other versions may differ from the reference by up to 6
+        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
+            << "Error: 32x32 FDCT rd has mismatched coefficients";
+      }
+      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_ref_block[j]))
+          << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
+      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
+          << "Error: 32x32 FDCT has coefficient larger than "
+          << "4*DCT_MAX_VALUE";
+    }
+  }
+}
+
+TEST_P(Trans32x32Test, InverseAccuracy) {  // inverse vs. double-precision DCT
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+#endif
+
+  for (int i = 0; i < count_test_block; ++i) {
+    double out_r[kNumCoeffs];
+
+    // Initialize a test block with input range [-mask_, mask_].
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (bit_depth_ == VPX_BITS_8) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        src16[j] = rnd.Rand16() & mask_;
+        dst16[j] = rnd.Rand16() & mask_;
+        in[j] = src16[j] - dst16[j];
+#endif
+      }
+    }
+    // Reference forward DCT in doubles, rounded to integer coefficients.
+    reference_32x32_dct_2d(in, out_r);
+    for (int j = 0; j < kNumCoeffs; ++j)
+      coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
+    if (bit_depth_ == VPX_BITS_8) {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32));
+#endif
+    }
+    for (int j = 0; j < kNumCoeffs; ++j) {  // |dst - src| must not exceed 1
+#if CONFIG_VP9_HIGHBITDEPTH
+      const int diff =
+          bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+      const int diff = dst[j] - src[j];
+#endif
+      const int error = diff * diff;
+      EXPECT_GE(1, error)
+          << "Error: 32x32 IDCT has error " << error
+          << " at index " << j;
+    }
+  }
+}
+
+using std::tr1::make_tuple;
+
+#if CONFIG_VP9_HIGHBITDEPTH  // tuples: (fwd, inv, version, bit depth)
+INSTANTIATE_TEST_CASE_P(
+    C, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_fdct32x32_c,
+                   &idct32x32_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct32x32_rd_c,
+                   &idct32x32_10, 1, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct32x32_c,
+                   &idct32x32_12, 0, VPX_BITS_12),
+        make_tuple(&vpx_highbd_fdct32x32_rd_c,
+                   &idct32x32_12, 1, VPX_BITS_12),
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_c,
+                   &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_c,
+                   &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    NEON, Trans32x32Test,  // C forward transforms with the NEON inverse
+    ::testing::Values(
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_c,
+                   &vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
+#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans32x32Test,  // SSE2 forward and inverse together
+    ::testing::Values(
+        make_tuple(&vpx_fdct32x32_sse2,
+                   &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_sse2,
+                   &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
+#endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans32x32Test,  // SSE2 forward transforms with C inverses
+    ::testing::Values(
+        make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct32x32_rd_sse2, &idct32x32_10, 1,
+                   VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_12, 0, VPX_BITS_12),
+        make_tuple(&vpx_highbd_fdct32x32_rd_sse2, &idct32x32_12, 1,
+                   VPX_BITS_12),
+        make_tuple(&vpx_fdct32x32_sse2, &vpx_idct32x32_1024_add_c, 0,
+                   VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_c, 1,
+                   VPX_BITS_8)));
+#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    AVX2, Trans32x32Test,  // AVX2 forward transforms with the SSE2 inverse
+    ::testing::Values(
+        make_tuple(&vpx_fdct32x32_avx2,
+                   &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_avx2,
+                   &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
+#endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    MSA, Trans32x32Test,  // MSA forward and inverse together
+    ::testing::Values(
+        make_tuple(&vpx_fdct32x32_msa,
+                   &vpx_idct32x32_1024_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vpx_fdct32x32_rd_msa,
+                   &vpx_idct32x32_1024_add_msa, 1, VPX_BITS_8)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+}  // namespace
diff --git a/libs/libvpx/test/decode_api_test.cc b/libs/libvpx/test/decode_api_test.cc
new file mode 100644
index 0000000000..318351b73d
--- /dev/null
+++ b/libs/libvpx/test/decode_api_test.cc
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "test/ivf_video_source.h"
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_decoder.h"
+
+namespace {
+// Element count of a fixed-size array, as an int so it compares with `int i`.
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof((x)[0]))
+
+TEST(DecodeAPI, InvalidParams) {
+  static const vpx_codec_iface_t *kCodecs[] = {
+#if CONFIG_VP8_DECODER
+    &vpx_codec_vp8_dx_algo,
+#endif
+#if CONFIG_VP9_DECODER
+    &vpx_codec_vp9_dx_algo,
+#endif
+#if CONFIG_VP10_DECODER
+    &vpx_codec_vp10_dx_algo,
+#endif
+  };
+  uint8_t buf[1] = {0};  // one zero byte; no codec below accepts it as a frame
+  vpx_codec_ctx_t dec;
+  // A NULL context or a NULL interface must always yield INVALID_PARAM.
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_dec_init(NULL, NULL, NULL, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_dec_init(&dec, NULL, NULL, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_decode(NULL, NULL, 0, NULL, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_decode(NULL, buf, 0, NULL, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_decode(NULL, buf, NELEMENTS(buf), NULL, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_decode(NULL, NULL, NELEMENTS(buf), NULL, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(NULL));
+  EXPECT_TRUE(vpx_codec_error(NULL) != NULL);
+  // Repeat per decoder that is compiled into this build.
+  for (int i = 0; i < NELEMENTS(kCodecs); ++i) {
+    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+              vpx_codec_dec_init(NULL, kCodecs[i], NULL, 0));
+    // Live context: junk data is UNSUP_BITSTREAM, bad args are INVALID_PARAM.
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, kCodecs[i], NULL, 0));
+    EXPECT_EQ(VPX_CODEC_UNSUP_BITSTREAM,
+              vpx_codec_decode(&dec, buf, NELEMENTS(buf), NULL, 0));
+    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+              vpx_codec_decode(&dec, NULL, NELEMENTS(buf), NULL, 0));
+    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+              vpx_codec_decode(&dec, buf, 0, NULL, 0));
+
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));
+  }
+}
+
+#if CONFIG_VP8_DECODER
+// Error concealment can be requested only if the decoder was built with it.
+TEST(DecodeAPI, OptionalParams) {
+  vpx_codec_ctx_t dec;
+  const vpx_codec_err_t res = vpx_codec_dec_init(
+      &dec, &vpx_codec_vp8_dx_algo, NULL, VPX_CODEC_USE_ERROR_CONCEALMENT);
+#if CONFIG_ERROR_CONCEALMENT
+  ASSERT_EQ(VPX_CODEC_OK, res);
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));  // don't leak the context
+#else
+  EXPECT_EQ(VPX_CODEC_INCAPABLE, res);
+#endif  // CONFIG_ERROR_CONCEALMENT
+}
+#endif  // CONFIG_VP8_DECODER
+
+#if CONFIG_VP9_DECODER
+// Exercises VP9 decoder controls on `dec` after a decode error to ensure the
+// code doesn't misbehave: each call must come back with the expected status.
+void TestVp9Controls(vpx_codec_ctx_t *dec) {
+  static const int kControls[] = {
+    VP8D_GET_LAST_REF_UPDATES,
+    VP8D_GET_FRAME_CORRUPTED,
+    VP9D_GET_DISPLAY_SIZE,
+    VP9D_GET_FRAME_SIZE
+  };
+  int val[2];  // scratch output buffer handed to every control
+  // Each control is tried with a real output buffer, then with NULL.
+  for (int i = 0; i < NELEMENTS(kControls); ++i) {
+    const vpx_codec_err_t res = vpx_codec_control_(dec, kControls[i], val);
+    switch (kControls[i]) {
+      case VP8D_GET_FRAME_CORRUPTED:  // the one control expected to fail here
+        EXPECT_EQ(VPX_CODEC_ERROR, res) << kControls[i];
+        break;
+      default:
+        EXPECT_EQ(VPX_CODEC_OK, res) << kControls[i];
+        break;
+    }
+    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+              vpx_codec_control_(dec, kControls[i], NULL));
+  }
+  // Reference controls: ERROR with real arguments, INVALID_PARAM with NULL.
+  vp9_ref_frame_t ref;
+  ref.idx = 0;
+  EXPECT_EQ(VPX_CODEC_ERROR, vpx_codec_control(dec, VP9_GET_REFERENCE, &ref));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_control(dec, VP9_GET_REFERENCE, NULL));
+
+  vpx_ref_frame_t ref_copy;
+  const int width = 352;
+  const int height = 288;
+  ASSERT_TRUE(
+      vpx_img_alloc(&ref_copy.img, VPX_IMG_FMT_I420, width, height, 1) != NULL);
+  ref_copy.frame_type = VP8_LAST_FRAME;
+  EXPECT_EQ(VPX_CODEC_ERROR,
+            vpx_codec_control(dec, VP8_COPY_REFERENCE, &ref_copy));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_control(dec, VP8_COPY_REFERENCE, NULL));
+  vpx_img_free(&ref_copy.img);
+}
+
+TEST(DecodeAPI, Vp9InvalidDecode) {
+  // Feed the first frame of an "invalid-" vector and expect a clean error.
+  const char kInvalidFile[] =
+      "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf";
+  libvpx_test::IVFVideoSource source(kInvalidFile);
+  source.Init();
+  source.Begin();
+  ASSERT_FALSE(HasFailure());
+
+  vpx_codec_ctx_t ctx;
+  EXPECT_EQ(VPX_CODEC_OK,
+            vpx_codec_dec_init(&ctx, &vpx_codec_vp9_dx_algo, NULL, 0));
+  const uint32_t data_sz = static_cast<uint32_t>(source.frame_size());
+#if CONFIG_VP9_HIGHBITDEPTH
+  const vpx_codec_err_t expected_err = VPX_CODEC_MEM_ERROR;
+#else
+  const vpx_codec_err_t expected_err = VPX_CODEC_UNSUP_BITSTREAM;
+#endif
+  EXPECT_EQ(expected_err,
+            vpx_codec_decode(&ctx, source.cxdata(), data_sz, NULL, 0));
+  vpx_codec_iter_t frame_iter = NULL;
+  EXPECT_EQ(NULL, vpx_codec_get_frame(&ctx, &frame_iter));
+  TestVp9Controls(&ctx);
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&ctx));
+}
+#endif  // CONFIG_VP9_DECODER
+
+}  // namespace
diff --git a/libs/libvpx/test/decode_perf_test.cc b/libs/libvpx/test/decode_perf_test.cc
new file mode 100644
index 0000000000..c24d517013
--- /dev/null
+++ b/libs/libvpx/test/decode_perf_test.cc
@@ -0,0 +1,273 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/webm_video_source.h"
+#include "vpx_ports/vpx_timer.h"
+#include "./ivfenc.h"
+#include "./vpx_version.h"
+
+using std::tr1::make_tuple;
+
+namespace {
+// Indices into the DecodePerfParam tuple, for use with GET_PARAM().
+#define VIDEO_NAME 0
+#define THREADS 1
+
+const int kMaxPsnr = 100;  // NOTE(review): appears unused in the code shown
+const double kUsecsInSec = 1000000.0;
+const char kNewEncodeOutputFile[] = "new_encode.ivf";  // encode-then-decode file
+
+/*
+ DecodePerfTest takes a tuple of filename + number of threads to decode with
+ */
+typedef std::tr1::tuple<const char *, unsigned> DecodePerfParam;
+
+const DecodePerfParam kVP9DecodePerfVectors[] = {  // {file name, threads}
+  make_tuple("vp90-2-bbb_426x240_tile_1x1_180kbps.webm", 1),
+  make_tuple("vp90-2-bbb_640x360_tile_1x2_337kbps.webm", 2),
+  make_tuple("vp90-2-bbb_854x480_tile_1x2_651kbps.webm", 2),
+  make_tuple("vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm", 4),
+  make_tuple("vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm", 1),
+  make_tuple("vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm", 4),
+  make_tuple("vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm", 4),
+  make_tuple("vp90-2-sintel_426x182_tile_1x1_171kbps.webm", 1),
+  make_tuple("vp90-2-sintel_640x272_tile_1x2_318kbps.webm", 2),
+  make_tuple("vp90-2-sintel_854x364_tile_1x2_621kbps.webm", 2),
+  make_tuple("vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm", 4),
+  make_tuple("vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm", 4),
+  make_tuple("vp90-2-tos_426x178_tile_1x1_181kbps.webm", 1),
+  make_tuple("vp90-2-tos_640x266_tile_1x2_336kbps.webm", 2),
+  make_tuple("vp90-2-tos_854x356_tile_1x2_656kbps.webm", 2),
+  make_tuple("vp90-2-tos_854x356_tile_1x2_fpm_546kbps.webm", 2),
+  make_tuple("vp90-2-tos_1280x534_tile_1x4_1306kbps.webm", 4),
+  make_tuple("vp90-2-tos_1280x534_tile_1x4_fpm_952kbps.webm", 4),
+  make_tuple("vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm", 4),
+};
+
+/*
+ In order to reflect real world performance as much as possible, Perf tests
+ *DO NOT* do any correctness checks. Please run them alongside correctness
+ tests to ensure proper codec integrity. Furthermore, in this test we
+ deliberately limit the amount of system calls we make to avoid OS
+ preemption.
+
+ TODO(joshualitt) create a more detailed perf measurement test to collect
+   power/temp/min max frame decode times/etc
+ */
+
+class DecodePerfTest : public ::testing::TestWithParam<DecodePerfParam> {
+};
+// Decodes one vector end to end and prints the timing as a JSON object.
+TEST_P(DecodePerfTest, PerfTest) {
+  const char *const video_name = GET_PARAM(VIDEO_NAME);
+  const unsigned threads = GET_PARAM(THREADS);
+
+  libvpx_test::WebMVideoSource video(video_name);
+  video.Init();
+
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+  cfg.threads = threads;
+  libvpx_test::VP9Decoder decoder(cfg, 0);
+
+  vpx_usec_timer t;
+  vpx_usec_timer_start(&t);
+  // Timed region. Decode status is ignored: perf tests skip correctness checks.
+  for (video.Begin(); video.cxdata() != NULL; video.Next()) {
+    decoder.DecodeFrame(video.cxdata(), video.frame_size());
+  }
+
+  vpx_usec_timer_mark(&t);
+  const double elapsed_secs =
+      static_cast<double>(vpx_usec_timer_elapsed(&t)) / kUsecsInSec;
+  const unsigned frames = video.frame_number();
+  const double fps = static_cast<double>(frames) / elapsed_secs;
+
+  printf("{\n");
+  printf("\t\"type\" : \"decode_perf_test\",\n");
+  printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+  printf("\t\"videoName\" : \"%s\",\n", video_name);
+  printf("\t\"threadCount\" : %u,\n", threads);
+  printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs);
+  printf("\t\"totalFrames\" : %u,\n", frames);
+  printf("\t\"framesPerSecond\" : %f\n", fps);
+  printf("}\n");
+}
+
+INSTANTIATE_TEST_CASE_P(VP9, DecodePerfTest,
+                        ::testing::ValuesIn(kVP9DecodePerfVectors));
+
+// Encoder fixture that saves each compressed frame to kNewEncodeOutputFile.
+class VP9NewEncodeDecodePerfTest :
+    public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  VP9NewEncodeDecodePerfTest()
+      : EncoderTest(GET_PARAM(0)),
+        encoding_mode_(GET_PARAM(1)),
+        speed_(0),
+        outfile_(0),
+        out_frames_(0) {}
+
+  virtual ~VP9NewEncodeDecodePerfTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    cfg_.g_lag_in_frames = 25;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_resize_allowed = 0;
+    cfg_.rc_end_usage = VPX_VBR;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, speed_);
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, 2);
+    }
+  }
+
+  // Opens the output file; an unset data path fails the test (no NULL string).
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    const char *const data_path = getenv("LIBVPX_TEST_DATA_PATH");
+    ASSERT_TRUE(data_path != NULL) << "LIBVPX_TEST_DATA_PATH is not set";
+    const std::string path_to_source =
+        std::string(data_path) + "/" + kNewEncodeOutputFile;
+    outfile_ = fopen(path_to_source.c_str(), "wb");
+    ASSERT_TRUE(outfile_ != NULL);
+  }
+
+  // Rewrites the file header with the final frame count, then closes the file.
+  virtual void EndPassHook() {
+    if (outfile_ != NULL) {
+      if (!fseek(outfile_, 0, SEEK_SET))
+        ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_);
+      fclose(outfile_);
+      outfile_ = NULL;
+    }
+  }
+
+  // Appends one compressed frame (frame header, then payload) to the file.
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    ++out_frames_;
+    // Write initial file header if first frame.
+    if (pkt->data.frame.pts == 0)
+      ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_);
+
+    // Write frame header and data.
+    ivf_write_frame_header(outfile_, out_frames_, pkt->data.frame.sz);
+    ASSERT_EQ(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_),
+              pkt->data.frame.sz);
+  }
+
+  virtual bool DoDecode() { return false; }
+  void set_speed(unsigned int speed) { speed_ = speed; }
+
+ private:
+  libvpx_test::TestMode encoding_mode_;
+  uint32_t speed_;
+  FILE *outfile_;
+  uint32_t out_frames_;
+};
+// One raw clip used as encoder input by the VP9NewEncodeDecodePerfTest test.
+struct EncodePerfTestVideo {
+  EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_,
+                      uint32_t bitrate_, int frames_)
+      : name(name_),
+        width(width_),
+        height(height_),
+        bitrate(bitrate_),
+        frames(frames_) {}
+  const char *name;  // file name handed to I420VideoSource
+  uint32_t width;
+  uint32_t height;
+  uint32_t bitrate;  // copied into cfg_.rc_target_bitrate
+  int frames;  // last I420VideoSource argument; presumably the frame limit
+};
+
+const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = {
+  EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470),
+};
+
+TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) {
+  SetUp();  // NOTE(review): gtest fixtures normally run SetUp() already; confirm
+
+  // TODO(JBB): Make this work by going through the set of given files.
+  const int i = 0;
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = kVP9EncodePerfTestVectors[i].bitrate;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  const char *video_name = kVP9EncodePerfTestVectors[i].name;
+  libvpx_test::I420VideoSource video(
+      video_name,
+      kVP9EncodePerfTestVectors[i].width,
+      kVP9EncodePerfTestVectors[i].height,
+      timebase.den, timebase.num, 0,
+      kVP9EncodePerfTestVectors[i].frames);
+  set_speed(2);
+  // Encode first; FramePktHook() stores the stream in kNewEncodeOutputFile.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Then decode the file that was just written and time that decode.
+  const uint32_t threads = 4;
+
+  libvpx_test::IVFVideoSource decode_video(kNewEncodeOutputFile);
+  decode_video.Init();
+
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+  cfg.threads = threads;
+  libvpx_test::VP9Decoder decoder(cfg, 0);
+
+  vpx_usec_timer t;
+  vpx_usec_timer_start(&t);
+
+  for (decode_video.Begin(); decode_video.cxdata() != NULL;
+       decode_video.Next()) {
+    decoder.DecodeFrame(decode_video.cxdata(), decode_video.frame_size());
+  }
+
+  vpx_usec_timer_mark(&t);
+  const double elapsed_secs =
+      static_cast<double>(vpx_usec_timer_elapsed(&t)) / kUsecsInSec;
+  const unsigned decode_frames = decode_video.frame_number();
+  const double fps = static_cast<double>(decode_frames) / elapsed_secs;
+
+  printf("{\n");
+  printf("\t\"type\" : \"decode_perf_test\",\n");
+  printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+  printf("\t\"videoName\" : \"%s\",\n", kNewEncodeOutputFile);
+  printf("\t\"threadCount\" : %u,\n", threads);
+  printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs);
+  printf("\t\"totalFrames\" : %u,\n", decode_frames);
+  printf("\t\"framesPerSecond\" : %f\n", fps);
+  printf("}\n");
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+  VP9NewEncodeDecodePerfTest, ::testing::Values(::libvpx_test::kTwoPassGood));
+}  // namespace
diff --git a/libs/libvpx/test/decode_test_driver.cc b/libs/libvpx/test/decode_test_driver.cc
new file mode 100644
index 0000000000..ad861c3157
--- /dev/null
+++ b/libs/libvpx/test/decode_test_driver.cc
@@ -0,0 +1,123 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/register_state_check.h"
+#include "test/video_source.h"
+
+namespace libvpx_test {
+
+const char kVP8Name[] = "WebM Project VP8";
+// Forwards to vpx_codec_peek_stream_info() for this decoder's codec interface.
+vpx_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
+                                    vpx_codec_stream_info_t *stream_info) {
+  const unsigned int data_sz = static_cast<unsigned int>(size);
+  return vpx_codec_peek_stream_info(CodecInterface(), cxdata, data_sz,
+                                    stream_info);
+}
+
+vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) {
+  return DecodeFrame(cxdata, size, NULL);  // no per-frame user data
+}
+
+vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size,
+                                     void *user_priv) {
+  // InitOnce() creates the codec context unless an earlier call already did.
+  InitOnce();
+  const unsigned int data_sz = static_cast<unsigned int>(size);
+  vpx_codec_err_t status;
+  API_REGISTER_STATE_CHECK(
+      status = vpx_codec_decode(&decoder_, cxdata, data_sz, user_priv, 0));
+  return status;
+}
+
+bool Decoder::IsVP8() const {
+  // Prefix comparison: sizeof - 1 drops kVP8Name's terminating NUL.
+  return strncmp(kVP8Name, GetDecoderName(), sizeof(kVP8Name) - 1) == 0;
+}
+
+// Default peek check: raises a fatal test failure when a frame that is
+// required to peek successfully is rejected.
+void DecoderTest::HandlePeekResult(Decoder *const decoder,
+                                   CompressedVideoSource *video,
+                                   const vpx_codec_err_t res_peek) {
+  /* Vp8's implementation of PeekStream returns an error if the frame you
+   * pass it is not a keyframe, so for Vp8 only the first frame, which must
+   * be a keyframe, is required to peek successfully; later frames are not
+   * checked at all. */
+  if (decoder->IsVP8() && video->frame_number() != 0)
+    return;
+
+  /* Every other case must succeed: the Vp9 implementation of PeekStream
+   * returns an error only if the data passed to it isn't a valid Vp9
+   * chunk. */
+  ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
+                                    << vpx_codec_err_to_string(res_peek);
+}
+
+// Decodes `video` frame by frame, invoking the hooks around each step.
+void DecoderTest::RunLoop(CompressedVideoSource *video,
+                          const vpx_codec_dec_cfg_t &dec_cfg) {
+  Decoder* const decoder = codec_->CreateDecoder(dec_cfg, flags_, 0);
+  ASSERT_TRUE(decoder != NULL);
+  // Scope guard: the ASSERT_* early returns below must not leak the decoder.
+  struct DecoderGuard { Decoder *ptr; ~DecoderGuard() { delete ptr; } };
+  const DecoderGuard decoder_guard = { decoder };
+  bool end_of_file = false;
+
+  // Decode frames.
+  for (video->Begin(); !::testing::Test::HasFailure() && !end_of_file;
+       video->Next()) {
+    PreDecodeFrameHook(*video, decoder);
+    vpx_codec_stream_info_t stream_info;
+    stream_info.sz = sizeof(stream_info);
+
+    if (video->cxdata() != NULL) {
+      const vpx_codec_err_t res_peek = decoder->PeekStream(video->cxdata(),
+                                                           video->frame_size(),
+                                                           &stream_info);
+      HandlePeekResult(decoder, video, res_peek);
+      ASSERT_FALSE(::testing::Test::HasFailure());
+      vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(),
+                                                     video->frame_size());
+      if (!HandleDecodeResult(res_dec, *video, decoder))
+        break;
+    } else {
+      // Signal end of the file to the decoder.
+      const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
+      ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+      end_of_file = true;
+    }
+
+    DxDataIterator dec_iter = decoder->GetDxData();
+    const vpx_image_t *img = NULL;
+    // Get decompressed data
+    while ((img = dec_iter.Next()))
+      DecompressedFrameHook(*img, video->frame_number());
+  }
+}
+
+void DecoderTest::RunLoop(CompressedVideoSource *video) {
+  // Convenience overload: decode with a value-initialized configuration.
+  RunLoop(video, vpx_codec_dec_cfg_t());
+}
+
+void DecoderTest::set_cfg(const vpx_codec_dec_cfg_t &dec_cfg) {
+  cfg_ = dec_cfg;  // plain struct copy
+}
+
+void DecoderTest::set_flags(const vpx_codec_flags_t flags) {
+  flags_ = flags;  // later passed to CreateDecoder() by RunLoop()
+}
+
+}  // namespace libvpx_test
diff --git a/libs/libvpx/test/decode_test_driver.h b/libs/libvpx/test/decode_test_driver.h
new file mode 100644
index 0000000000..f566c53c7d
--- /dev/null
+++ b/libs/libvpx/test/decode_test_driver.h
@@ -0,0 +1,181 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_DECODE_TEST_DRIVER_H_
+#define TEST_DECODE_TEST_DRIVER_H_
+#include <cstring>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_decoder.h"
+
+namespace libvpx_test {
+
+class CodecFactory;
+class CompressedVideoSource;
+
+// Walks the frames a decoder context has ready for output. Holds only a
+// pointer to the context; it never destroys it.
+class DxDataIterator {
+ public:
+  explicit DxDataIterator(vpx_codec_ctx_t *decoder)
+      : decoder_(decoder), iter_(NULL) {}
+
+  // Each call forwards to vpx_codec_get_frame() with the saved iterator state.
+  const vpx_image_t *Next() { return vpx_codec_get_frame(decoder_, &iter_); }
+
+ private:
+  vpx_codec_ctx_t *decoder_;
+  vpx_codec_iter_t iter_;
+};
+
+// Provides a simplified interface to manage one video decoding.
+// Similar to Encoder class, the exact services should be added
+// as more tests are added.
+class Decoder {
+ public:
+  Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
+      : cfg_(cfg), flags_(0), deadline_(deadline), init_done_(false) {
+    memset(&decoder_, 0, sizeof(decoder_));
+  }
+
+  Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+          unsigned long deadline)  // NOLINT
+      : cfg_(cfg), flags_(flag), deadline_(deadline), init_done_(false) {
+    memset(&decoder_, 0, sizeof(decoder_));
+  }
+
+  virtual ~Decoder() { vpx_codec_destroy(&decoder_); }
+
+  vpx_codec_err_t PeekStream(const uint8_t *cxdata, size_t size,
+                             vpx_codec_stream_info_t *stream_info);
+
+  vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size);
+
+  vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size,
+                              void *user_priv);
+
+  // Returns a DxDataIterator bound to this decoder's codec context.
+  DxDataIterator GetDxData() { return DxDataIterator(&decoder_); }
+
+  void set_deadline(unsigned long deadline) { deadline_ = deadline; }
+
+  // Applies an integer control and requires it to return VPX_CODEC_OK.
+  void Control(int ctrl_id, int arg) { Control(ctrl_id, arg, VPX_CODEC_OK); }
+
+  // Applies a pointer-valued control and requires it to return VPX_CODEC_OK.
+  void Control(int ctrl_id, const void *arg) {
+    InitOnce();
+    const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
+  }
+
+  // Applies an integer control and requires it to return `expected_value`.
+  void Control(int ctrl_id, int arg, vpx_codec_err_t expected_value) {
+    InitOnce();
+    const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg);
+    ASSERT_EQ(expected_value, res) << DecodeError();
+  }
+
+  // Returns the codec's error detail if it has one, else its generic error.
+  const char* DecodeError() {
+    const char *detail = vpx_codec_error_detail(&decoder_);
+    return detail ? detail : vpx_codec_error(&decoder_);
+  }
+
+  // Passes the external frame buffer information to libvpx.
+  vpx_codec_err_t SetFrameBufferFunctions(
+      vpx_get_frame_buffer_cb_fn_t cb_get,
+      vpx_release_frame_buffer_cb_fn_t cb_release, void *user_priv) {
+    InitOnce();
+    return vpx_codec_set_frame_buffer_functions(
+        &decoder_, cb_get, cb_release, user_priv);
+  }
+
+  const char* GetDecoderName() const {
+    return vpx_codec_iface_name(CodecInterface());
+  }
+
+  bool IsVP8() const;
+
+  vpx_codec_ctx_t *GetDecoder() { return &decoder_; }
+
+ protected:
+  // Pure virtual: each concrete decoder class names its codec interface.
+  virtual vpx_codec_iface_t* CodecInterface() const = 0;
+
+  // Creates the codec context on first use. A failed init is reported as a
+  // fatal gtest failure, but only this helper returns early; init_done_ stays
+  // false, so the next caller tries again.
+  void InitOnce() {
+    if (!init_done_) {
+      const vpx_codec_err_t res = vpx_codec_dec_init(&decoder_,
+                                                     CodecInterface(),
+                                                     &cfg_, flags_);
+      ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
+      init_done_ = true;
+    }
+  }
+
+  vpx_codec_ctx_t     decoder_;
+  vpx_codec_dec_cfg_t cfg_;
+  vpx_codec_flags_t   flags_;
+  // Same type as the constructor/setter argument so no bits are dropped.
+  unsigned long       deadline_;  // NOLINT
+  bool                init_done_;
+};
+
+// Common test functionality for all Decoder tests.
+class DecoderTest {
+ public:
+  // Main decoding loop; the one-argument form uses a default configuration.
+  virtual void RunLoop(CompressedVideoSource *video);
+  virtual void RunLoop(CompressedVideoSource *video,
+                       const vpx_codec_dec_cfg_t &dec_cfg);
+  // Setters for cfg_ and for flags_ (the flags RunLoop() creates decoders with).
+  virtual void set_cfg(const vpx_codec_dec_cfg_t &dec_cfg);
+  virtual void set_flags(const vpx_codec_flags_t flags);
+
+  // Hook to be called before decompressing every frame.
+  virtual void PreDecodeFrameHook(const CompressedVideoSource& /*video*/,
+                                  Decoder* /*decoder*/) {}
+
+  // Hook to handle a decode result; RunLoop() stops when this returns false.
+  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                                  const CompressedVideoSource& /*video*/,
+                                  Decoder *decoder) {
+    EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+    return VPX_CODEC_OK == res_dec;
+  }
+
+  // Hook to be called on every decompressed frame.
+  virtual void DecompressedFrameHook(const vpx_image_t& /*img*/,
+                                     const unsigned int /*frame_number*/) {}
+
+  // Hook to be called on peek result (runs before each frame is decoded).
+  virtual void HandlePeekResult(Decoder* const decoder,
+                                CompressedVideoSource *video,
+                                const vpx_codec_err_t res_peek);
+
+ protected:
+  explicit DecoderTest(const CodecFactory *codec)
+      : codec_(codec),
+        cfg_(),
+        flags_(0) {}
+
+  virtual ~DecoderTest() {}
+
+  const CodecFactory *codec_;  // creates the decoder inside RunLoop()
+  vpx_codec_dec_cfg_t cfg_;    // written by set_cfg()
+  vpx_codec_flags_t   flags_;  // written by set_flags()
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_DECODE_TEST_DRIVER_H_
diff --git a/libs/libvpx/test/decode_to_md5.sh b/libs/libvpx/test/decode_to_md5.sh
new file mode 100755
index 0000000000..854b74f84f
--- /dev/null
+++ b/libs/libvpx/test/decode_to_md5.sh
@@ -0,0 +1,73 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx decode_to_md5 example. To add new tests to this
+##  file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to decode_to_md5_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: succeeds only when both input files exist.
+#   $VP8_IVF_FILE and $VP9_IVF_FILE are required.
+decode_to_md5_verify_environment() {
+  # Both inputs present: the environment is usable.
+  [ -e "${VP8_IVF_FILE}" ] && [ -e "${VP9_IVF_FILE}" ] && return 0
+  echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+  return 1
+}
+
+# Runs decode_to_md5 on $1 and captures the md5 sum for the final frame. $2 is
+# interpreted as codec name and used solely to name the output file. $3 is the
+# expected md5 sum: It must match that of the final frame.
+decode_to_md5() {
+  local decoder="${LIBVPX_BIN_PATH}/decode_to_md5${VPX_TEST_EXE_SUFFIX}"
+  local input_file="$1"
+  local codec="$2"
+  local expected_md5="$3"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/decode_to_md5_${codec}"
+
+  if [ ! -x "${decoder}" ]; then
+    elog "${decoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
+      ${devnull}
+
+  [ -e "${output_file}" ] || return 1
+
+  # The first field of the last output line is the md5 of the final frame.
+  local actual_md5="$(tail -n1 "${output_file}" | awk '{print $1}')"
+  [ "${actual_md5}" = "${expected_md5}" ] || return 1
+}
+
+decode_to_md5_vp8() {
+  # Skip (successfully) unless vp8_decode_available reports yes.
+  [ "$(vp8_decode_available)" = "yes" ] || return 0
+
+  # expected MD5 sum for the last frame.
+  local expected_md5="56794d911b02190212bca92f88ad60c6"
+  decode_to_md5 "${VP8_IVF_FILE}" "vp8" "${expected_md5}"
+}
+
+decode_to_md5_vp9() {
+  # Skip (successfully) unless vp9_decode_available reports yes.
+  [ "$(vp9_decode_available)" = "yes" ] || return 0
+
+  # expected MD5 sum for the last frame.
+  local expected_md5="2952c0eae93f3dadd1aa84c50d3fd6d2"
+  decode_to_md5 "${VP9_IVF_FILE}" "vp9" "${expected_md5}"
+}
+
+decode_to_md5_tests="decode_to_md5_vp8
+                     decode_to_md5_vp9"
+
+run_tests decode_to_md5_verify_environment "${decode_to_md5_tests}"
diff --git a/libs/libvpx/test/decode_with_drops.sh b/libs/libvpx/test/decode_with_drops.sh
new file mode 100755
index 0000000000..9b2edb6429
--- /dev/null
+++ b/libs/libvpx/test/decode_with_drops.sh
@@ -0,0 +1,79 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx decode_with_drops example. To add new tests to
+##  this file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to decode_with_drops_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: succeeds only when both input files exist.
+#   $VP8_IVF_FILE and $VP9_IVF_FILE are required.
+decode_with_drops_verify_environment() {
+  # Both inputs present: the environment is usable.
+  [ -e "${VP8_IVF_FILE}" ] && [ -e "${VP9_IVF_FILE}" ] && return 0
+  echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+  return 1
+}
+
+# Runs decode_with_drops on $1. $2 is a codec name used only to build the
+# output file name; $3 is the drop mode, handed to decode_with_drops
+# unchanged.
+decode_with_drops() {
+  local input_file="$1"
+  local codec="$2"
+  local drop_mode="$3"
+  local decoder="${LIBVPX_BIN_PATH}/decode_with_drops${VPX_TEST_EXE_SUFFIX}"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/decode_with_drops_${codec}"
+
+  [ -x "${decoder}" ] || {
+    elog "${decoder} does not exist or is not executable."
+    return 1
+  }
+
+  eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
+      "${drop_mode}" ${devnull}
+
+  [ -e "${output_file}" ] || return 1
+}
+
+# Decodes $VP8_IVF_FILE while dropping frames, twice: once in sequence mode,
+# and once in pattern mode. Does nothing unless vp8_decode_available says yes.
+# Note: This test assumes that $VP8_IVF_FILE has exactly 29 frames, and could
+# break if the file is modified.
+decode_with_drops_vp8() {
+  [ "$(vp8_decode_available)" = "yes" ] || return 0
+
+  # Test sequence mode: Drop frames 2-28.
+  decode_with_drops "${VP8_IVF_FILE}" "vp8" "2-28"
+
+  # Test pattern mode: Drop 3 of every 4 frames.
+  decode_with_drops "${VP8_IVF_FILE}" "vp8" "3/4"
+}
+
+# Decodes $VP9_IVF_FILE while dropping frames, twice: once in sequence mode,
+# and once in pattern mode. Does nothing unless vp9_decode_available says yes.
+# Note: This test assumes that $VP9_IVF_FILE has exactly 20 frames, and could
+# break if the file is modified.
+decode_with_drops_vp9() {
+  [ "$(vp9_decode_available)" = "yes" ] || return 0
+
+  # Test sequence mode: Drop frames 2-19.
+  decode_with_drops "${VP9_IVF_FILE}" "vp9" "2-19"
+
+  # Test pattern mode: Drop 3 of every 4 frames.
+  decode_with_drops "${VP9_IVF_FILE}" "vp9" "3/4"
+}
+
+decode_with_drops_tests="decode_with_drops_vp8
+                         decode_with_drops_vp9"
+
+run_tests decode_with_drops_verify_environment "${decode_with_drops_tests}"
diff --git a/libs/libvpx/test/encode_api_test.cc b/libs/libvpx/test/encode_api_test.cc
new file mode 100644
index 0000000000..a7200e653a
--- /dev/null
+++ b/libs/libvpx/test/encode_api_test.cc
@@ -0,0 +1,68 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+
+namespace {
+
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
+TEST(EncodeAPI, InvalidParams) {
+  static const vpx_codec_iface_t *kCodecs[] = {
+#if CONFIG_VP8_ENCODER
+    &vpx_codec_vp8_cx_algo,
+#endif
+#if CONFIG_VP9_ENCODER
+    &vpx_codec_vp9_cx_algo,
+#endif
+#if CONFIG_VP10_ENCODER
+    &vpx_codec_vp10_cx_algo,
+#endif
+  };
+  uint8_t buf[1] = {0};
+  vpx_image_t img;
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+
+  EXPECT_EQ(&img, vpx_img_wrap(&img, VPX_IMG_FMT_I420, 1, 1, 1, buf));
+
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(NULL, NULL, NULL, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(&enc, NULL, NULL, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, NULL, 0, 0, 0, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, &img, 0, 0, 0, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(NULL));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_enc_config_default(NULL, NULL, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_enc_config_default(NULL, &cfg, 0));
+  EXPECT_TRUE(vpx_codec_error(NULL) != NULL);
+
+  for (int i = 0; i < NELEMENTS(kCodecs); ++i) {
+    SCOPED_TRACE(vpx_codec_iface_name(kCodecs[i]));
+    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+              vpx_codec_enc_init(NULL, kCodecs[i], NULL, 0));
+    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+              vpx_codec_enc_init(&enc, kCodecs[i], NULL, 0));
+    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+              vpx_codec_enc_config_default(kCodecs[i], &cfg, 1));
+
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(kCodecs[i], &cfg, 0));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, kCodecs[i], &cfg, 0));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, NULL, 0, 0, 0, 0));
+
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc));
+  }
+}
+
+}  // namespace
diff --git a/libs/libvpx/test/encode_perf_test.cc b/libs/libvpx/test/encode_perf_test.cc
new file mode 100644
index 0000000000..7e9f0d6c44
--- /dev/null
+++ b/libs/libvpx/test/encode_perf_test.cc
@@ -0,0 +1,202 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "./vpx_version.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vpx_ports/vpx_timer.h"
+
+namespace {
+
+const int kMaxPsnr = 100;
+const double kUsecsInSec = 1000000.0;
+
+struct EncodePerfTestVideo {
+  EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_,
+                      uint32_t bitrate_, int frames_)
+      : name(name_),
+        width(width_),
+        height(height_),
+        bitrate(bitrate_),
+        frames(frames_) {}
+  const char *name;
+  uint32_t width;
+  uint32_t height;
+  uint32_t bitrate;
+  int frames;
+};
+
+const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = {
+  EncodePerfTestVideo("desktop_640_360_30.yuv", 640, 360, 200, 2484),
+  EncodePerfTestVideo("kirland_640_480_30.yuv", 640, 480, 200, 300),
+  EncodePerfTestVideo("macmarcomoving_640_480_30.yuv", 640, 480, 200, 987),
+  EncodePerfTestVideo("macmarcostationary_640_480_30.yuv", 640, 480, 200, 718),
+  EncodePerfTestVideo("niklas_640_480_30.yuv", 640, 480, 200, 471),
+  EncodePerfTestVideo("tacomanarrows_640_480_30.yuv", 640, 480, 200, 300),
+  EncodePerfTestVideo("tacomasmallcameramovement_640_480_30.yuv",
+                      640, 480, 200, 300),
+  EncodePerfTestVideo("thaloundeskmtg_640_480_30.yuv", 640, 480, 200, 300),
+  EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470),
+};
+
+const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8 };
+const int kEncodePerfTestThreads[] = { 1, 2, 4 };
+
+#define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0]))
+
+class VP9EncodePerfTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  VP9EncodePerfTest()
+      : EncoderTest(GET_PARAM(0)),
+        min_psnr_(kMaxPsnr),
+        nframes_(0),
+        encoding_mode_(GET_PARAM(1)),
+        speed_(0),
+        threads_(1) {}
+
+  virtual ~VP9EncodePerfTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_resize_allowed = 0;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_error_resilient = 1;
+    cfg_.g_threads = threads_;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      const int log2_tile_columns = 3;
+      encoder->Control(VP8E_SET_CPUUSED, speed_);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, log2_tile_columns);
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0);
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    min_psnr_ = kMaxPsnr;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.psnr.psnr[0] < min_psnr_) {
+      min_psnr_= pkt->data.psnr.psnr[0];
+    }
+  }
+
+  // For performance reasons don't decode. Must be const to override the hook.
+  virtual bool DoDecode() const { return false; }
+
+  double min_psnr() const {
+    return min_psnr_;
+  }
+
+  void set_speed(unsigned int speed) {
+    speed_ = speed;
+  }
+
+  void set_threads(unsigned int threads) {
+    threads_ = threads;
+  }
+
+ private:
+  double min_psnr_;
+  unsigned int nframes_;
+  libvpx_test::TestMode encoding_mode_;
+  unsigned speed_;
+  unsigned int threads_;
+};
+
+TEST_P(VP9EncodePerfTest, PerfTest) {
+  for (size_t i = 0; i < NELEMENTS(kVP9EncodePerfTestVectors); ++i) {
+    for (size_t j = 0; j < NELEMENTS(kEncodePerfTestSpeeds); ++j) {
+      for (size_t k = 0; k < NELEMENTS(kEncodePerfTestThreads); ++k) {
+        if (kVP9EncodePerfTestVectors[i].width < 512 &&
+            kEncodePerfTestThreads[k] > 1)
+          continue;
+        else if (kVP9EncodePerfTestVectors[i].width < 1024 &&
+                 kEncodePerfTestThreads[k] > 2)
+          continue;
+
+        set_threads(kEncodePerfTestThreads[k]);
+        SetUp();
+
+        const vpx_rational timebase = { 33333333, 1000000000 };
+        cfg_.g_timebase = timebase;
+        cfg_.rc_target_bitrate = kVP9EncodePerfTestVectors[i].bitrate;
+
+        init_flags_ = VPX_CODEC_USE_PSNR;
+
+        const unsigned frames = kVP9EncodePerfTestVectors[i].frames;
+        const char *video_name = kVP9EncodePerfTestVectors[i].name;
+        libvpx_test::I420VideoSource video(
+            video_name,
+            kVP9EncodePerfTestVectors[i].width,
+            kVP9EncodePerfTestVectors[i].height,
+            timebase.den, timebase.num, 0,
+            kVP9EncodePerfTestVectors[i].frames);
+        set_speed(kEncodePerfTestSpeeds[j]);
+
+        vpx_usec_timer t;
+        vpx_usec_timer_start(&t);
+
+        ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+        vpx_usec_timer_mark(&t);
+        const double elapsed_secs = vpx_usec_timer_elapsed(&t) / kUsecsInSec;
+        const double fps = frames / elapsed_secs;
+        const double minimum_psnr = min_psnr();
+        std::string display_name(video_name);
+        if (kEncodePerfTestThreads[k] > 1) {
+          char thread_count[32];
+          snprintf(thread_count, sizeof(thread_count), "_t-%d",
+                   kEncodePerfTestThreads[k]);
+          display_name += thread_count;
+        }
+
+        printf("{\n");
+        printf("\t\"type\" : \"encode_perf_test\",\n");
+        printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+        printf("\t\"videoName\" : \"%s\",\n", display_name.c_str());
+        printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs);
+        printf("\t\"totalFrames\" : %u,\n", frames);
+        printf("\t\"framesPerSecond\" : %f,\n", fps);
+        printf("\t\"minPsnr\" : %f,\n", minimum_psnr);
+        printf("\t\"speed\" : %d,\n", kEncodePerfTestSpeeds[j]);
+        printf("\t\"threads\" : %d\n", kEncodePerfTestThreads[k]);
+        printf("}\n");
+      }
+    }
+  }
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    VP9EncodePerfTest, ::testing::Values(::libvpx_test::kRealTime));
+}  // namespace
diff --git a/libs/libvpx/test/encode_test_driver.cc b/libs/libvpx/test/encode_test_driver.cc
new file mode 100644
index 0000000000..128436ee91
--- /dev/null
+++ b/libs/libvpx/test/encode_test_driver.cc
@@ -0,0 +1,282 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+#include "test/register_state_check.h"
+#include "test/video_source.h"
+
+namespace libvpx_test {
+void Encoder::InitEncoder(VideoSource *video) {  // Sets up encoder_ from video.
+  vpx_codec_err_t res;
+  const vpx_image_t *img = video->img();
+
+  if (video->img() && !encoder_.priv) {  // needs a frame and no codec state yet
+    cfg_.g_w = img->d_w;  // frame size and timebase are taken from the source
+    cfg_.g_h = img->d_h;
+    cfg_.g_timebase = video->timebase();
+    cfg_.rc_twopass_stats_in = stats_->buf();  // stats collected so far
+
+    res = vpx_codec_enc_init(&encoder_, CodecInterface(), &cfg_,
+                             init_flags_);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+
+#if CONFIG_VP9_ENCODER
+    if (CodecInterface() == &vpx_codec_vp9_cx_algo) {
+      // Default to 1 tile column for VP9.
+      const int log2_tile_columns = 0;
+      res = vpx_codec_control_(&encoder_, VP9E_SET_TILE_COLUMNS,
+                               log2_tile_columns);
+      ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+    } else  // binds to the next compiled-in `if` or to the fallback block
+#endif
+#if CONFIG_VP10_ENCODER
+    if (CodecInterface() == &vpx_codec_vp10_cx_algo) {
+      // Default to 1 tile column for VP10 (uses the VP9 control id).
+      const int log2_tile_columns = 0;
+      res = vpx_codec_control_(&encoder_, VP9E_SET_TILE_COLUMNS,
+                               log2_tile_columns);
+      ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+    } else  // binds to the fallback block below
+#endif
+    {  // fallback: any other interface must be VP8 when VP8 is compiled in
+#if CONFIG_VP8_ENCODER
+      ASSERT_EQ(&vpx_codec_vp8_cx_algo, CodecInterface())
+          << "Unknown Codec Interface";
+#endif
+    }
+  }
+}
+
+void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) {
+  if (video->img())
+    EncodeFrameInternal(*video, frame_flags);
+  else
+    Flush();
+
+  // Handle twopass stats
+  CxDataIterator iter = GetCxData();
+
+  while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
+    if (pkt->kind != VPX_CODEC_STATS_PKT)
+      continue;
+
+    stats_->Append(*pkt);
+  }
+}
+
+void Encoder::EncodeFrameInternal(const VideoSource &video,
+                                  const unsigned long frame_flags) {
+  vpx_codec_err_t res;
+  const vpx_image_t *img = video.img();
+
+  // Handle frame resizing
+  if (cfg_.g_w != img->d_w || cfg_.g_h != img->d_h) {
+    cfg_.g_w = img->d_w;
+    cfg_.g_h = img->d_h;
+    res = vpx_codec_enc_config_set(&encoder_, &cfg_);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
+  // Encode the frame
+  API_REGISTER_STATE_CHECK(
+      res = vpx_codec_encode(&encoder_, img, video.pts(), video.duration(),
+                             frame_flags, deadline_));
+  ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+}
+
+void Encoder::Flush() {  // Encodes a NULL image to drain the encoder.
+  const vpx_codec_err_t res = vpx_codec_encode(&encoder_, NULL, 0, 0, 0,
+                                               deadline_);
+  if (!encoder_.priv)  // no codec state (presumably never initialized)
+    ASSERT_EQ(VPX_CODEC_ERROR, res) << EncoderError();  // so an error is expected
+  else
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+}
+
+void EncoderTest::InitializeConfig() {
+  const vpx_codec_err_t res = codec_->DefaultEncoderConfig(&cfg_, 0);
+  dec_cfg_ = vpx_codec_dec_cfg_t();
+  ASSERT_EQ(VPX_CODEC_OK, res);
+}
+
+void EncoderTest::SetMode(TestMode mode) {
+  switch (mode) {
+    case kRealTime:
+      deadline_ = VPX_DL_REALTIME;
+      break;
+
+    case kOnePassGood:
+    case kTwoPassGood:
+      deadline_ = VPX_DL_GOOD_QUALITY;
+      break;
+
+    case kOnePassBest:
+    case kTwoPassBest:
+      deadline_ = VPX_DL_BEST_QUALITY;
+      break;
+
+    default:
+      ASSERT_TRUE(false) << "Unexpected mode " << mode;
+  }
+
+  if (mode == kTwoPassGood || mode == kTwoPassBest)
+    passes_ = 2;
+  else
+    passes_ = 1;
+}
+// Returns true when both images agree on fmt, cs, d_w, d_h and on every Y, U
+// and V row. It should return "true" most of the time, so no early break-out.
+static bool compare_img(const vpx_image_t *img1,
+                        const vpx_image_t *img2) {
+  bool match = (img1->fmt == img2->fmt) &&
+               (img1->cs == img2->cs) &&
+               (img1->d_w == img2->d_w) &&
+               (img1->d_h == img2->d_h);  // header fields are checked first
+
+  const unsigned int width_y  = img1->d_w;  // bytes compared per luma row
+  const unsigned int height_y = img1->d_h;  // sizes come from img1 only
+  unsigned int i;
+  for (i = 0; i < height_y; ++i)  // row by row, honoring each image's stride
+    match = (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
+                    img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
+                    width_y) == 0) && match;
+  const unsigned int width_uv  = (img1->d_w + 1) >> 1;  // half size, rounded up
+  const unsigned int height_uv = (img1->d_h + 1) >> 1;  // NOTE(review): fmt is not consulted
+  for (i = 0; i <  height_uv; ++i)
+    match = (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
+                    img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
+                    width_uv) == 0) && match;
+  for (i = 0; i < height_uv; ++i)
+    match = (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
+                    img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
+                    width_uv) == 0) && match;
+  return match;
+}
+
+void EncoderTest::MismatchHook(const vpx_image_t* /*img1*/,
+                               const vpx_image_t* /*img2*/) {
+  ASSERT_TRUE(0) << "Encode/Decode mismatch found";
+}
+
+void EncoderTest::RunLoop(VideoSource *video) {  // Runs every encode pass.
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();  // local; not dec_cfg_
+
+  stats_.Reset();  // start with empty two-pass stats
+
+  ASSERT_TRUE(passes_ == 1 || passes_ == 2);
+  for (unsigned int pass = 0; pass < passes_; pass++) {
+    last_pts_ = 0;
+
+    if (passes_ == 1)
+      cfg_.g_pass = VPX_RC_ONE_PASS;
+    else if (pass == 0)
+      cfg_.g_pass = VPX_RC_FIRST_PASS;
+    else
+      cfg_.g_pass = VPX_RC_LAST_PASS;
+
+    BeginPassHook(pass);
+    Encoder* const encoder = codec_->CreateEncoder(cfg_, deadline_, init_flags_,
+                                                   &stats_);
+    ASSERT_TRUE(encoder != NULL);
+
+    video->Begin();
+    encoder->InitEncoder(video);
+    ASSERT_FALSE(::testing::Test::HasFatalFailure());  // NOTE(review): returns without deleting encoder
+
+    unsigned long dec_init_flags = 0;  // NOLINT
+    // Use fragment decoder if encoder outputs partitions.
+    // NOTE: fragment decoder and partition encoder are only supported by VP8.
+    if (init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION)
+      dec_init_flags |= VPX_CODEC_USE_INPUT_FRAGMENTS;
+    Decoder* const decoder = codec_->CreateDecoder(dec_cfg, dec_init_flags, 0);
+    bool again;
+    for (again = true; again; video->Next()) {
+      again = (video->img() != NULL);  // a NULL image still gets one iteration
+
+      PreEncodeFrameHook(video);
+      PreEncodeFrameHook(video, encoder);
+      encoder->EncodeFrame(video, frame_flags_);
+
+      CxDataIterator iter = encoder->GetCxData();
+
+      bool has_cxdata = false;
+      bool has_dxdata = false;
+      while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
+        pkt = MutateEncoderOutputHook(pkt);
+        again = true;  // keep looping while the encoder still emits packets
+        switch (pkt->kind) {
+          case VPX_CODEC_CX_FRAME_PKT:
+            has_cxdata = true;
+            if (decoder && DoDecode()) {
+              vpx_codec_err_t res_dec = decoder->DecodeFrame(
+                  (const uint8_t*)pkt->data.frame.buf, pkt->data.frame.sz);
+
+              if (!HandleDecodeResult(res_dec, *video, decoder))
+                break;  // leaves the switch only; later packets still run
+
+              has_dxdata = true;
+            }
+            ASSERT_GE(pkt->data.frame.pts, last_pts_);
+            last_pts_ = pkt->data.frame.pts;
+            FramePktHook(pkt);
+            break;
+
+          case VPX_CODEC_PSNR_PKT:
+            PSNRPktHook(pkt);
+            break;
+
+          default:
+            break;
+        }
+      }
+
+      // Flush the decoder when there are no more fragments.
+      if ((init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) {
+        const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
+        if (!HandleDecodeResult(res_dec, *video, decoder))
+          break;  // leaves the frame loop
+      }
+
+      if (has_dxdata && has_cxdata) {
+        const vpx_image_t *img_enc = encoder->GetPreviewFrame();
+        DxDataIterator dec_iter = decoder->GetDxData();
+        const vpx_image_t *img_dec = dec_iter.Next();
+        if (img_enc && img_dec) {  // compare only when both sides have a frame
+          const bool res = compare_img(img_enc, img_dec);
+          if (!res) {  // Mismatch
+            MismatchHook(img_enc, img_dec);
+          }
+        }
+        if (img_dec)
+          DecompressedFrameHook(*img_dec, video->pts());
+      }
+      if (!Continue())
+        break;  // leaves the frame loop; cleanup below still runs
+    }
+
+    EndPassHook();
+
+    if (decoder)
+      delete decoder;
+    delete encoder;
+
+    if (!Continue())
+      break;  // skip any remaining pass
+  }
+}
+
+}  // namespace libvpx_test
diff --git a/libs/libvpx/test/encode_test_driver.h b/libs/libvpx/test/encode_test_driver.h
new file mode 100644
index 0000000000..6d0a72f980
--- /dev/null
+++ b/libs/libvpx/test/encode_test_driver.h
@@ -0,0 +1,278 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_ENCODE_TEST_DRIVER_H_
+#define TEST_ENCODE_TEST_DRIVER_H_
+
+#include <string>
+#include <vector>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+#include "vpx/vp8cx.h"
+#endif
+#include "vpx/vpx_encoder.h"
+
+namespace libvpx_test {
+
+class CodecFactory;
+class VideoSource;
+
+enum TestMode {
+  kRealTime,
+  kOnePassGood,
+  kOnePassBest,
+  kTwoPassGood,
+  kTwoPassBest
+};
+#define ALL_TEST_MODES ::testing::Values(::libvpx_test::kRealTime, \
+                                         ::libvpx_test::kOnePassGood, \
+                                         ::libvpx_test::kOnePassBest, \
+                                         ::libvpx_test::kTwoPassGood, \
+                                         ::libvpx_test::kTwoPassBest)
+
+#define ONE_PASS_TEST_MODES ::testing::Values(::libvpx_test::kRealTime, \
+                                              ::libvpx_test::kOnePassGood, \
+                                              ::libvpx_test::kOnePassBest)
+
+#define TWO_PASS_TEST_MODES ::testing::Values(::libvpx_test::kTwoPassGood, \
+                                              ::libvpx_test::kTwoPassBest)
+
+
+// Provides an object to handle the libvpx get_cx_data() iteration pattern
+class CxDataIterator {
+ public:
+  explicit CxDataIterator(vpx_codec_ctx_t *encoder)
+      : encoder_(encoder), iter_(NULL) {}
+
+  const vpx_codec_cx_pkt_t *Next() {
+    return vpx_codec_get_cx_data(encoder_, &iter_);
+  }
+
+ private:
+  vpx_codec_ctx_t  *encoder_;
+  vpx_codec_iter_t  iter_;
+};
+
+// Implements an in-memory store for libvpx twopass statistics
+class TwopassStatsStore {
+ public:
+  void Append(const vpx_codec_cx_pkt_t &pkt) {
+    buffer_.append(reinterpret_cast<char *>(pkt.data.twopass_stats.buf),
+                   pkt.data.twopass_stats.sz);
+  }
+
+  vpx_fixed_buf_t buf() {
+    const vpx_fixed_buf_t buf = { &buffer_[0], buffer_.size() };
+    return buf;
+  }
+
+  void Reset() {
+    buffer_.clear();
+  }
+
+ protected:
+  std::string  buffer_;
+};
+
+
+// Provides a simplified interface to manage one video encoding pass, given
+// a configuration and video source.
+//
+// TODO(jkoleszar): The exact services it provides and the appropriate
+// level of abstraction will be fleshed out as more tests are written.
+class Encoder {
+ public:
+  Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+          const unsigned long init_flags, TwopassStatsStore *stats)
+      : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) {
+    memset(&encoder_, 0, sizeof(encoder_));
+  }
+
+  virtual ~Encoder() {
+    vpx_codec_destroy(&encoder_);
+  }
+
+  CxDataIterator GetCxData() {
+    return CxDataIterator(&encoder_);
+  }
+
+  void InitEncoder(VideoSource *video);
+
+  const vpx_image_t *GetPreviewFrame() {
+    return vpx_codec_get_preview_frame(&encoder_);
+  }
+  // This is a thin wrapper around vpx_codec_encode(), so refer to
+  // vpx_encoder.h for its semantics.
+  void EncodeFrame(VideoSource *video, const unsigned long frame_flags);
+
+  // Convenience wrapper for EncodeFrame()
+  void EncodeFrame(VideoSource *video) {
+    EncodeFrame(video, 0);
+  }
+
+  void Control(int ctrl_id, int arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
+  void Control(int ctrl_id, int *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
+  void Control(int ctrl_id, struct vpx_scaling_mode *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
+  void Control(int ctrl_id, struct vpx_svc_layer_id *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
+  void Control(int ctrl_id, struct vpx_svc_parameters *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+  void Control(int ctrl_id, vpx_active_map_t *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+#endif
+
+  void Config(const vpx_codec_enc_cfg_t *cfg) {
+    const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+    cfg_ = *cfg;
+  }
+
+  void set_deadline(unsigned long deadline) {
+    deadline_ = deadline;
+  }
+
+ protected:
+  virtual vpx_codec_iface_t* CodecInterface() const = 0;
+
+  const char *EncoderError() {
+    const char *detail = vpx_codec_error_detail(&encoder_);
+    return detail ? detail : vpx_codec_error(&encoder_);
+  }
+
+  // Encode an image
+  void EncodeFrameInternal(const VideoSource &video,
+                           const unsigned long frame_flags);
+
+  // Flush the encoder on EOS
+  void Flush();
+
+  vpx_codec_ctx_t      encoder_;
+  vpx_codec_enc_cfg_t  cfg_;
+  unsigned long        deadline_;
+  unsigned long        init_flags_;
+  TwopassStatsStore   *stats_;
+};
+
+// Common test functionality for all Encoder tests.
+//
+// This class is a mixin which provides the main loop common to all
+// encoder tests. It provides hooks which can be overridden by subclasses
+// to implement each test's specific behavior, while centralizing the bulk
+// of the boilerplate. Note that it doesn't inherit the gtest testing
+// classes directly, so that tests can be parameterized differently.
+class EncoderTest {
+ protected:
+  explicit EncoderTest(const CodecFactory *codec)
+      : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0),
+        last_pts_(0) {
+    // Default to 1 thread.
+    cfg_.g_threads = 1;
+  }
+
+  virtual ~EncoderTest() {}
+
+  // Initialize the cfg_ member with the default configuration.
+  void InitializeConfig();
+
+  // Map the TestMode enum to the deadline_ and passes_ variables.
+  void SetMode(TestMode mode);
+
+  // Set encoder flag.
+  void set_init_flags(unsigned long flag) {  // NOLINT(runtime/int)
+    init_flags_ = flag;
+  }
+
+  // Main loop
+  virtual void RunLoop(VideoSource *video);
+
+  // Hook to be called at the beginning of a pass.
+  virtual void BeginPassHook(unsigned int /*pass*/) {}
+
+  // Hook to be called at the end of a pass.
+  virtual void EndPassHook() {}
+
+  // Hook to be called before encoding a frame.
+  virtual void PreEncodeFrameHook(VideoSource* /*video*/) {}
+  virtual void PreEncodeFrameHook(VideoSource* /*video*/,
+                                  Encoder* /*encoder*/) {}
+
+  // Hook to be called on every compressed data packet.
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {}
+
+  // Hook to be called on every PSNR packet.
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {}
+
+  // Hook to determine whether the encode loop should continue.
+  virtual bool Continue() const {
+    return !(::testing::Test::HasFatalFailure() || abort_);
+  }
+
+  const CodecFactory   *codec_;
+  // Hook to determine whether to decode frame after encoding
+  virtual bool DoDecode() const { return 1; }
+
+  // Hook to handle encode/decode mismatch
+  virtual void MismatchHook(const vpx_image_t *img1,
+                            const vpx_image_t *img2);
+
+  // Hook to be called on every decompressed frame.
+  virtual void DecompressedFrameHook(const vpx_image_t& /*img*/,
+                                     vpx_codec_pts_t /*pts*/) {}
+
+  // Hook to be called to handle decode result. Return true to continue.
+  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                                  const VideoSource& /*video*/,
+                                  Decoder *decoder) {
+    EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+    return VPX_CODEC_OK == res_dec;
+  }
+
+  // Hook that can modify the encoder's output data
+  virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
+      const vpx_codec_cx_pkt_t *pkt) {
+    return pkt;
+  }
+
+  bool                 abort_;
+  vpx_codec_enc_cfg_t  cfg_;
+  vpx_codec_dec_cfg_t  dec_cfg_;
+  unsigned int         passes_;
+  unsigned long        deadline_;
+  TwopassStatsStore    stats_;
+  unsigned long        init_flags_;
+  unsigned long        frame_flags_;
+  vpx_codec_pts_t      last_pts_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_ENCODE_TEST_DRIVER_H_
diff --git a/libs/libvpx/test/error_resilience_test.cc b/libs/libvpx/test/error_resilience_test.cc
new file mode 100644
index 0000000000..cd0dca235a
--- /dev/null
+++ b/libs/libvpx/test/error_resilience_test.cc
@@ -0,0 +1,602 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kMaxErrorFrames = 12;
+const int kMaxDroppableFrames = 12;
+
+class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, bool> {
+ protected:
+  ErrorResilienceTestLarge()
+      : EncoderTest(GET_PARAM(0)),
+        svc_support_(GET_PARAM(2)),
+        psnr_(0.0),
+        nframes_(0),
+        mismatch_psnr_(0.0),
+        mismatch_nframes_(0),
+        encoding_mode_(GET_PARAM(1)) {
+    Reset();
+  }
+
+  virtual ~ErrorResilienceTestLarge() {}
+
+  void Reset() {
+    error_nframes_ = 0;
+    droppable_nframes_ = 0;
+    pattern_switch_ = 0;
+  }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+    mismatch_psnr_ = 0.0;
+    mismatch_nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  //
+  // Frame flags and layer id for temporal layers.
+  // For two layers, test pattern is:
+  //   1     3
+  // 0    2     .....
+  // LAST is updated on base/layer 0, GOLDEN  updated on layer 1.
+  // Non-zero pattern_switch parameter means pattern will switch to
+  // not using LAST for frame_num >= pattern_switch.
+  int SetFrameFlags(int frame_num,
+                    int num_temp_layers,
+                    int pattern_switch) {
+    int frame_flags = 0;
+    if (num_temp_layers == 2) {
+        if (frame_num % 2 == 0) {
+          if (frame_num < pattern_switch || pattern_switch == 0) {
+            // Layer 0: predict from LAST and ARF, update LAST.
+            frame_flags = VP8_EFLAG_NO_REF_GF |
+                          VP8_EFLAG_NO_UPD_GF |
+                          VP8_EFLAG_NO_UPD_ARF;
+          } else {
+            // Layer 0: predict from GF and ARF, update GF.
+            frame_flags = VP8_EFLAG_NO_REF_LAST |
+                          VP8_EFLAG_NO_UPD_LAST |
+                          VP8_EFLAG_NO_UPD_ARF;
+          }
+        } else {
+          if (frame_num < pattern_switch || pattern_switch == 0) {
+            // Layer 1: predict from L, GF, and ARF, update GF.
+            frame_flags = VP8_EFLAG_NO_UPD_ARF |
+                          VP8_EFLAG_NO_UPD_LAST;
+          } else {
+            // Layer 1: predict from GF and ARF, update GF.
+            frame_flags = VP8_EFLAG_NO_REF_LAST |
+                          VP8_EFLAG_NO_UPD_LAST |
+                          VP8_EFLAG_NO_UPD_ARF;
+          }
+        }
+    }
+    return frame_flags;
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder * /*encoder*/) {
+    frame_flags_ &= ~(VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_UPD_GF |
+                      VP8_EFLAG_NO_UPD_ARF);
+    // For temporal layer case.
+    if (cfg_.ts_number_layers > 1) {
+      frame_flags_ = SetFrameFlags(video->frame(),
+                                   cfg_.ts_number_layers,
+                                   pattern_switch_);
+      for (unsigned int i = 0; i < droppable_nframes_; ++i) {
+        if (droppable_frames_[i] == video->frame()) {
+          std::cout << "Encoding droppable frame: "
+                    << droppable_frames_[i] << "\n";
+        }
+      }
+    } else {
+       if (droppable_nframes_ > 0 &&
+         (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
+         for (unsigned int i = 0; i < droppable_nframes_; ++i) {
+           if (droppable_frames_[i] == video->frame()) {
+             std::cout << "Encoding droppable frame: "
+                       << droppable_frames_[i] << "\n";
+             frame_flags_ |= (VP8_EFLAG_NO_UPD_LAST |
+                              VP8_EFLAG_NO_UPD_GF |
+                              VP8_EFLAG_NO_UPD_ARF);
+             return;
+           }
+         }
+       }
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_)
+      return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  double GetAverageMismatchPsnr() const {
+    if (mismatch_nframes_)
+      return mismatch_psnr_ / mismatch_nframes_;
+    return 0.0;
+  }
+
+  virtual bool DoDecode() const {
+    if (error_nframes_ > 0 &&
+        (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < error_nframes_; ++i) {
+        if (error_frames_[i] == nframes_ - 1) {
+          std::cout << "             Skipping decoding frame: "
+                    << error_frames_[i] << "\n";
+          return 0;
+        }
+      }
+    }
+    return 1;
+  }
+
+  virtual void MismatchHook(const vpx_image_t *img1,
+                            const vpx_image_t *img2) {
+    double mismatch_psnr = compute_psnr(img1, img2);
+    mismatch_psnr_ += mismatch_psnr;
+    ++mismatch_nframes_;
+    // std::cout << "Mismatch frame psnr: " << mismatch_psnr << "\n";
+  }
+
+  void SetErrorFrames(int num, unsigned int *list) {
+    if (num > kMaxErrorFrames)
+      num = kMaxErrorFrames;
+    else if (num < 0)
+      num = 0;
+    error_nframes_ = num;
+    for (unsigned int i = 0; i < error_nframes_; ++i)
+      error_frames_[i] = list[i];
+  }
+
+  void SetDroppableFrames(int num, unsigned int *list) {
+    if (num > kMaxDroppableFrames)
+      num = kMaxDroppableFrames;
+    else if (num < 0)
+      num = 0;
+    droppable_nframes_ = num;
+    for (unsigned int i = 0; i < droppable_nframes_; ++i)
+      droppable_frames_[i] = list[i];
+  }
+
+  unsigned int GetMismatchFrames() {
+    return mismatch_nframes_;
+  }
+
+  void SetPatternSwitch(int frame_switch) {
+     pattern_switch_ = frame_switch;
+   }
+
+  bool svc_support_;
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  unsigned int error_nframes_;
+  unsigned int droppable_nframes_;
+  unsigned int pattern_switch_;
+  double mismatch_psnr_;
+  unsigned int mismatch_nframes_;
+  unsigned int error_frames_[kMaxErrorFrames];
+  unsigned int droppable_frames_[kMaxDroppableFrames];
+  libvpx_test::TestMode encoding_mode_;
+};
+
+TEST_P(ErrorResilienceTestLarge, OnVersusOff) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_lag_in_frames = 10;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 30);
+
+  // Error resilient mode OFF.
+  cfg_.g_error_resilient = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_resilience_off = GetAveragePsnr();
+  EXPECT_GT(psnr_resilience_off, 25.0);
+
+  // Error resilient mode ON.
+  cfg_.g_error_resilient = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_resilience_on = GetAveragePsnr();
+  EXPECT_GT(psnr_resilience_on, 25.0);
+
+  // Test that turning on error resilient mode hurts by 10% at most.
+  if (psnr_resilience_off > 0.0) {
+    const double psnr_ratio = psnr_resilience_on / psnr_resilience_off;
+    EXPECT_GE(psnr_ratio, 0.9);
+    EXPECT_LE(psnr_ratio, 1.1);
+  }
+}
+
+// Check for successful decoding and no encoder/decoder mismatch
+// if we lose (i.e., drop before decoding) a set of droppable
+// frames (i.e., frames that don't update any reference buffers).
+// Check both isolated and consecutive loss.
+TEST_P(ErrorResilienceTestLarge, DropFramesWithoutRecovery) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 500;
+  // FIXME(debargha): Fix this to work for any lag.
+  // Currently this test only works for lag = 0
+  cfg_.g_lag_in_frames = 0;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 40);
+
+  // Error resilient mode ON.
+  cfg_.g_error_resilient = 1;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+
+  // Set an arbitrary set of error frames same as droppable frames.
+  // In addition to isolated loss/drop, add a long consecutive series
+  // (of size 9) of dropped frames.
+  unsigned int num_droppable_frames = 11;
+  unsigned int droppable_frame_list[] = {5, 16, 22, 23, 24, 25, 26, 27, 28,
+                                         29, 30};
+  SetDroppableFrames(num_droppable_frames, droppable_frame_list);
+  SetErrorFrames(num_droppable_frames, droppable_frame_list);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Test that no mismatches have been found
+  std::cout << "             Mismatch frames: "
+            << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+
+  // Reset previously set of error/droppable frames.
+  Reset();
+
+#if 0
+  // TODO(jkoleszar): This test is disabled for the time being as too
+  // sensitive. It's not clear how to set a reasonable threshold for
+  // this behavior.
+
+  // Now set an arbitrary set of error frames that are non-droppable
+  unsigned int num_error_frames = 3;
+  unsigned int error_frame_list[] = {3, 10, 20};
+  SetErrorFrames(num_error_frames, error_frame_list);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Test that dropping an arbitrary set of inter frames does not hurt too much
+  // Note the Average Mismatch PSNR is the average of the PSNR between
+  // decoded frame and encoder's version of the same frame for all frames
+  // with mismatch.
+  const double psnr_resilience_mismatch = GetAverageMismatchPsnr();
+  std::cout << "             Mismatch PSNR: "
+            << psnr_resilience_mismatch << "\n";
+  EXPECT_GT(psnr_resilience_mismatch, 20.0);
+#endif
+}
+
+// Check for successful decoding and no encoder/decoder mismatch
+// if we lose (i.e., drop before decoding) the enhancement layer frames for a
+// two layer temporal pattern. The base layer does not predict from the top
+// layer, so successful decoding is expected.
+TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) {
+  // This test doesn't run if SVC is not supported.
+  if (!svc_support_)
+    return;
+
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_lag_in_frames = 0;
+
+  cfg_.rc_end_usage = VPX_CBR;
+  // 2 Temporal layers, no spatial layers, CBR mode.
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 2;
+  cfg_.ts_rate_decimator[0] = 2;
+  cfg_.ts_rate_decimator[1] = 1;
+  cfg_.ts_periodicity = 2;
+  cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 40);
+
+  // Error resilient mode ON.
+  cfg_.g_error_resilient = 1;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  SetPatternSwitch(0);
+
+  // The odd frames are the enhancement layer for 2 layer pattern, so set
+  // those frames as droppable. Drop the last 7 frames.
+  unsigned int num_droppable_frames = 7;
+  unsigned int droppable_frame_list[] = {27, 29, 31, 33, 35, 37, 39};
+  SetDroppableFrames(num_droppable_frames, droppable_frame_list);
+  SetErrorFrames(num_droppable_frames, droppable_frame_list);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Test that no mismatches have been found
+  std::cout << "             Mismatch frames: "
+            << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+
+  // Reset previously set of error/droppable frames.
+  Reset();
+}
+
+// Check for successful decoding and no encoder/decoder mismatch
+// for a two layer temporal pattern, where at some point in the
+// sequence, the LAST ref is not used anymore.
+TEST_P(ErrorResilienceTestLarge, 2LayersNoRefLast) {
+  // This test doesn't run if SVC is not supported.
+  if (!svc_support_)
+    return;
+
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_lag_in_frames = 0;
+
+  cfg_.rc_end_usage = VPX_CBR;
+  // 2 Temporal layers, no spatial layers, CBR mode.
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 2;
+  cfg_.ts_rate_decimator[0] = 2;
+  cfg_.ts_rate_decimator[1] = 1;
+  cfg_.ts_periodicity = 2;
+  cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 100);
+
+  // Error resilient mode ON.
+  cfg_.g_error_resilient = 1;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  SetPatternSwitch(60);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Test that no mismatches have been found
+  std::cout << "             Mismatch frames: "
+            << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+
+  // Reset previously set of error/droppable frames.
+  Reset();
+}
+
+class ErrorResilienceTestLargeCodecControls : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  ErrorResilienceTestLargeCodecControls()
+      : EncoderTest(GET_PARAM(0)),
+        encoding_mode_(GET_PARAM(1)) {
+    Reset();
+  }
+
+  virtual ~ErrorResilienceTestLargeCodecControls() {}
+
+  void Reset() {
+    last_pts_ = 0;
+    tot_frame_number_ = 0;
+    duration_ = 0.0;
+    for (int i = 0; i < 3; ++i) {  // For testing up to 3 layers.
+      bits_total_[i] = 0;
+      effective_datarate_[i] = 0.0;  // Never leave a stale rate to assert on.
+    }
+  }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+  }
+
+  //
+  // Frame flags and layer id for temporal layers.
+  //
+
+  // For two layers, test pattern is:
+  //   1     3
+  // 0    2     .....
+  // For three layers, test pattern is:
+  //   1      3    5      7
+  //      2           6
+  // 0          4            ....
+  // LAST is always updated on base/layer 0, GOLDEN is updated on layer 1,
+  // and ALTREF is updated on top layer for 3 layer pattern.
+  int SetFrameFlags(int frame_num, int num_temp_layers) {
+    int frame_flags = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        // Layer 0: predict from L and ARF, update L.
+        frame_flags = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF |
+                      VP8_EFLAG_NO_UPD_ARF;
+      } else {
+        // Layer 1: predict from L, G and ARF, update G; sets NO_UPD_ENTROPY.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_UPD_ENTROPY;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        // Layer 0: predict from L, update L.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                      VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      } else if ((frame_num - 2) % 4 == 0) {
+        // Layer 1: predict from L and G, update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_REF_ARF;
+      }  else if ((frame_num - 1) % 2 == 0) {
+        // Layer 2: predict from L, G, ARF; update ARF.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+      }
+    }
+    return frame_flags;
+  }
+
+  int SetLayerId(int frame_num, int num_temp_layers) {
+    int layer_id = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        layer_id = 0;
+      } else {
+         layer_id = 1;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        layer_id = 0;
+      } else if ((frame_num - 2) % 4 == 0) {
+        layer_id = 1;
+      } else if ((frame_num - 1) % 2 == 0) {
+        layer_id = 2;
+      }
+    }
+    return layer_id;
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (cfg_.ts_number_layers > 1) {
+        int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers);
+        int frame_flags = SetFrameFlags(video->frame(), cfg_.ts_number_layers);
+        if (video->frame() > 0) {
+          encoder->Control(VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
+          encoder->Control(VP8E_SET_FRAME_FLAGS, frame_flags);
+        }
+       const vpx_rational_t tb = video->timebase();
+       timebase_ = static_cast<double>(tb.num) / tb.den;
+       duration_ = 0;
+       return;
+    }
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // Time since last timestamp = duration.
+    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+    if (duration > 1) {
+      // Update counter for total number of frames (#frames input to encoder).
+      // Needed for setting the proper layer_id below.
+      tot_frame_number_ += static_cast<int>(duration - 1);
+    }
+    int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers);
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+    // Update the total encoded bits. For temporal layers, update the cumulative
+    // encoded bits per layer.
+    for (int i = layer; i < static_cast<int>(cfg_.ts_number_layers); ++i) {
+      bits_total_[i] += frame_size_in_bits;
+    }
+    // Update the most recent pts.
+    last_pts_ = pkt->data.frame.pts;
+    ++tot_frame_number_;
+  }
+
+  virtual void EndPassHook(void) {
+    duration_ = (last_pts_ + 1) * timebase_;
+    if (cfg_.ts_number_layers  > 1) {
+      for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
+          ++layer) {
+        if (bits_total_[layer]) {
+          // Effective file datarate:
+          effective_datarate_[layer] = (bits_total_[layer] / 1000.0) / duration_;
+        }
+      }
+    }
+  }
+
+  double effective_datarate_[3];
+   private:
+    libvpx_test::TestMode encoding_mode_;
+    vpx_codec_pts_t last_pts_;
+    double timebase_;
+    int64_t bits_total_[3];
+    double duration_;
+    int tot_frame_number_;
+  };
+
+// Check two codec controls used for:
+// (1) for setting temporal layer id, and (2) for settings encoder flags.
+// This test invokes those controls for each frame, and verifies encoder/decoder
+// mismatch and basic rate control response.
+// TODO(marpan): Maybe move this test to datarate_test.cc.
+TEST_P(ErrorResilienceTestLargeCodecControls, CodecControl3TemporalLayers) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  cfg_.g_error_resilient = 1;
+
+  // 3 Temporal layers. Framerate decimation (4, 2, 1).
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.ts_periodicity = 4;
+  cfg_.ts_layer_id[0] = 0;
+  cfg_.ts_layer_id[1] = 2;
+  cfg_.ts_layer_id[2] = 1;
+  cfg_.ts_layer_id[3] = 2;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  for (int i = 200; i <= 800; i += 200) {
+    cfg_.rc_target_bitrate = i;
+    Reset();
+    // 40-20-40 bitrate allocation for 3 temporal layers.
+    cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+    cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+    cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+      ASSERT_GE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 0.75)
+          << " The datarate for the file is lower than target by too much, "
+              "for layer: " << j;
+      ASSERT_LE(effective_datarate_[j], cfg_.ts_target_bitrate[j] * 1.25)
+          << " The datarate for the file is greater than target by too much, "
+              "for layer: " << j;
+    }
+  }
+}
+
+VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
+                          ::testing::Values(true));
+VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLargeCodecControls,
+                          ONE_PASS_TEST_MODES);
+VP9_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
+                          ::testing::Values(true));
+// SVC-related tests don't run for VP10 since SVC is not supported.
+VP10_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
+                           ::testing::Values(false));
+}  // namespace
diff --git a/libs/libvpx/test/examples.sh b/libs/libvpx/test/examples.sh
new file mode 100755
index 0000000000..39f7e392db
--- /dev/null
+++ b/libs/libvpx/test/examples.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file runs all of the tests for the libvpx examples.
+##
+. $(dirname $0)/tools_common.sh
+
+example_tests=$(ls $(dirname $0)/*.sh)
+
+# List of script names to exclude.
+exclude_list="examples tools_common"
+
+# Filter out the scripts in $exclude_list.
+for word in ${exclude_list}; do
+  example_tests=$(filter_strings "${example_tests}" "${word}" exclude)
+done
+
+for test in ${example_tests}; do
+  # Source each test script so that exporting variables can be avoided.
+  VPX_TEST_NAME="$(basename ${test%.*})"
+  . "${test}"
+done
diff --git a/libs/libvpx/test/external_frame_buffer_test.cc b/libs/libvpx/test/external_frame_buffer_test.cc
new file mode 100644
index 0000000000..2570f44eb8
--- /dev/null
+++ b/libs/libvpx/test/external_frame_buffer_test.cc
@@ -0,0 +1,493 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+
+namespace {
+
+const int kVideoNameParam = 1;
+
+struct ExternalFrameBuffer {
+  uint8_t *data;
+  size_t size;
+  int in_use;
+};
+
+// Class to manipulate a list of external frame buffers.
+class ExternalFrameBufferList {
+ public:
+  ExternalFrameBufferList()
+      : num_buffers_(0),
+        ext_fb_list_(NULL) {}
+
+  virtual ~ExternalFrameBufferList() {
+    for (int i = 0; i < num_buffers_; ++i) {
+      delete [] ext_fb_list_[i].data;
+    }
+    delete [] ext_fb_list_;
+  }
+
+  // Creates the list to hold the external buffers. Returns true on success.
+  bool CreateBufferList(int num_buffers) {
+    if (num_buffers < 0)
+      return false;
+
+    num_buffers_ = num_buffers;
+    ext_fb_list_ = new ExternalFrameBuffer[num_buffers_];
+    EXPECT_TRUE(ext_fb_list_ != NULL);
+    memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_);
+    return true;
+  }
+
+  // Searches the frame buffer list for a free frame buffer. Makes sure
+  // that the frame buffer is at least |min_size| in bytes. Marks that the
+  // frame buffer is in use by libvpx. Finally sets |fb| to point to the
+  // external frame buffer. Returns < 0 on an error.
+  int GetFreeFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) {
+    EXPECT_TRUE(fb != NULL);
+    const int idx = FindFreeBufferIndex();
+    if (idx == num_buffers_)
+      return -1;
+
+    if (ext_fb_list_[idx].size < min_size) {
+      delete [] ext_fb_list_[idx].data;
+      ext_fb_list_[idx].data = new uint8_t[min_size];
+      memset(ext_fb_list_[idx].data, 0, min_size);
+      ext_fb_list_[idx].size = min_size;
+    }
+
+    SetFrameBuffer(idx, fb);
+    return 0;
+  }
+
+  // Test function that will not allocate any data for the frame buffer.
+  // Returns < 0 on an error.
+  int GetZeroFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) {
+    EXPECT_TRUE(fb != NULL);
+    const int idx = FindFreeBufferIndex();
+    if (idx == num_buffers_)
+      return -1;
+
+    if (ext_fb_list_[idx].size < min_size) {
+      delete [] ext_fb_list_[idx].data;
+      ext_fb_list_[idx].data = NULL;
+      ext_fb_list_[idx].size = min_size;
+    }
+
+    SetFrameBuffer(idx, fb);
+    return 0;
+  }
+
+  // Marks the external frame buffer that |fb| is pointing to as free.
+  // Returns < 0 on an error.
+  int ReturnFrameBuffer(vpx_codec_frame_buffer_t *fb) {
+    if (fb == NULL) {
+      EXPECT_TRUE(fb != NULL);
+      return -1;
+    }
+    ExternalFrameBuffer *const ext_fb =
+        reinterpret_cast<ExternalFrameBuffer*>(fb->priv);
+    if (ext_fb == NULL) {
+      EXPECT_TRUE(ext_fb != NULL);
+      return -1;
+    }
+    EXPECT_EQ(1, ext_fb->in_use);
+    ext_fb->in_use = 0;
+    return 0;
+  }
+
+  // Checks that the ximage data is contained within the external frame buffer
+  // private data passed back in the ximage.
+  void CheckXImageFrameBuffer(const vpx_image_t *img) {
+    if (img->fb_priv != NULL) {
+      const struct ExternalFrameBuffer *const ext_fb =
+          reinterpret_cast<ExternalFrameBuffer*>(img->fb_priv);
+
+      ASSERT_TRUE(img->planes[0] >= ext_fb->data &&
+                  img->planes[0] < (ext_fb->data + ext_fb->size));
+    }
+  }
+
+ private:
+  // Returns the index of the first free frame buffer. Returns |num_buffers_|
+  // if there are no free frame buffers.
+  int FindFreeBufferIndex() {
+    int i;
+    // Find a free frame buffer.
+    for (i = 0; i < num_buffers_; ++i) {
+      if (!ext_fb_list_[i].in_use)
+        break;
+    }
+    return i;
+  }
+
+  // Sets |fb| to an external frame buffer. idx is the index into the frame
+  // buffer list.
+  void SetFrameBuffer(int idx, vpx_codec_frame_buffer_t *fb) {
+    ASSERT_TRUE(fb != NULL);
+    fb->data = ext_fb_list_[idx].data;
+    fb->size = ext_fb_list_[idx].size;
+    ASSERT_EQ(0, ext_fb_list_[idx].in_use);
+    ext_fb_list_[idx].in_use = 1;
+    fb->priv = &ext_fb_list_[idx];
+  }
+
+  int num_buffers_;
+  ExternalFrameBuffer *ext_fb_list_;
+};
+
+#if CONFIG_WEBM_IO
+
+// Callback used by libvpx to request the application to return a frame
+// buffer of at least |min_size| in bytes.
+int get_vp9_frame_buffer(void *user_priv, size_t min_size,
+                         vpx_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList*>(user_priv);
+  return fb_list->GetFreeFrameBuffer(min_size, fb);
+}
+
+// Callback used by libvpx to tell the application that |fb| is not needed
+// anymore.
+int release_vp9_frame_buffer(void *user_priv,
+                             vpx_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList*>(user_priv);
+  return fb_list->ReturnFrameBuffer(fb);
+}
+
+// Callback will not allocate data for frame buffer.
+int get_vp9_zero_frame_buffer(void *user_priv, size_t min_size,
+                              vpx_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList*>(user_priv);
+  return fb_list->GetZeroFrameBuffer(min_size, fb);
+}
+
+// Callback will allocate one less byte than |min_size|.
+int get_vp9_one_less_byte_frame_buffer(void *user_priv, size_t min_size,
+                                       vpx_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList*>(user_priv);
+  return fb_list->GetFreeFrameBuffer(min_size - 1, fb);
+}
+
+// Callback will not release the external frame buffer.
+int do_not_release_vp9_frame_buffer(void *user_priv,
+                                    vpx_codec_frame_buffer_t *fb) {
+  (void)user_priv;
+  (void)fb;
+  return 0;
+}
+
+#endif  // CONFIG_WEBM_IO
+
+// Class for testing passing in external frame buffers to libvpx.
+class ExternalFrameBufferMD5Test
+    : public ::libvpx_test::DecoderTest,
+      public ::libvpx_test::CodecTestWithParam<const char*> {
+ protected:
+  ExternalFrameBufferMD5Test()
+      : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)),
+        md5_file_(NULL),
+        num_buffers_(0) {}
+
+  virtual ~ExternalFrameBufferMD5Test() {
+    if (md5_file_ != NULL)
+      fclose(md5_file_);
+  }
+
+  virtual void PreDecodeFrameHook(
+      const libvpx_test::CompressedVideoSource &video,
+      libvpx_test::Decoder *decoder) {
+    if (num_buffers_ > 0 && video.frame_number() == 0) {
+      // Have libvpx use frame buffers we create.
+      ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+      ASSERT_EQ(VPX_CODEC_OK,
+                decoder->SetFrameBufferFunctions(
+                    GetVP9FrameBuffer, ReleaseVP9FrameBuffer, this));
+    }
+  }
+
+  void OpenMD5File(const std::string &md5_file_name_) {
+    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
+    ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: "
+        << md5_file_name_;
+  }
+
+  virtual void DecompressedFrameHook(const vpx_image_t &img,
+                                     const unsigned int frame_number) {
+    ASSERT_TRUE(md5_file_ != NULL);
+    char expected_md5[33];
+    char junk[128];
+
+    // Read expected md5; field widths keep fscanf inside both buffers.
+    const int res = fscanf(md5_file_, "%32s  %127s", expected_md5, junk);
+    ASSERT_NE(EOF, res) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    ::libvpx_test::MD5 md5_res;
+    md5_res.Add(&img);
+    const char *const actual_md5 = md5_res.Get();
+
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5)
+        << "Md5 checksums don't match: frame number = " << frame_number;
+  }
+
+  // Callback to get a free external frame buffer. Return value < 0 is an
+  // error.
+  static int GetVP9FrameBuffer(void *user_priv, size_t min_size,
+                               vpx_codec_frame_buffer_t *fb) {
+    ExternalFrameBufferMD5Test *const md5Test =
+        reinterpret_cast<ExternalFrameBufferMD5Test*>(user_priv);
+    return md5Test->fb_list_.GetFreeFrameBuffer(min_size, fb);
+  }
+
+  // Callback to release an external frame buffer. Return value < 0 is an
+  // error.
+  static int ReleaseVP9FrameBuffer(void *user_priv,
+                                   vpx_codec_frame_buffer_t *fb) {
+    ExternalFrameBufferMD5Test *const md5Test =
+        reinterpret_cast<ExternalFrameBufferMD5Test*>(user_priv);
+    return md5Test->fb_list_.ReturnFrameBuffer(fb);
+  }
+
+  void set_num_buffers(int num_buffers) { num_buffers_ = num_buffers; }
+  int num_buffers() const { return num_buffers_; }
+
+ private:
+  FILE *md5_file_;  // Reference checksums; assigned by OpenMD5File().
+  int num_buffers_;  // Count handed to fb_list_.CreateBufferList().
+  ExternalFrameBufferList fb_list_;  // Backs the two static callbacks.
+};
+
+#if CONFIG_WEBM_IO
+const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";  // Test clip.
+
+// Class for testing passing in external frame buffers to libvpx.
+class ExternalFrameBufferTest : public ::testing::Test {
+ protected:
+  ExternalFrameBufferTest()
+      : video_(NULL),
+        decoder_(NULL),
+        num_buffers_(0) {}
+
+  virtual void SetUp() {  // Opens kVP9TestFile and creates the VP9 decoder.
+    video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
+    ASSERT_TRUE(video_ != NULL);
+    video_->Init();
+    video_->Begin();
+
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
+    ASSERT_TRUE(decoder_ != NULL);
+  }
+
+  virtual void TearDown() {  // Frees the objects allocated in SetUp().
+    delete decoder_;
+    delete video_;
+  }
+
+  // Passes the external frame buffer callbacks and &fb_list_ to libvpx.
+  vpx_codec_err_t SetFrameBufferFunctions(
+      int num_buffers,
+      vpx_get_frame_buffer_cb_fn_t cb_get,
+      vpx_release_frame_buffer_cb_fn_t cb_release) {
+    if (num_buffers > 0) {  // Only then is the buffer list created.
+      num_buffers_ = num_buffers;
+      EXPECT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+    }
+
+    return decoder_->SetFrameBufferFunctions(cb_get, cb_release, &fb_list_);
+  }
+
+  vpx_codec_err_t DecodeOneFrame() {  // Advances |video_| only on success.
+    const vpx_codec_err_t res =
+        decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+    CheckDecodedFrames();  // Runs even if decoding failed.
+    if (res == VPX_CODEC_OK)
+      video_->Next();
+    return res;
+  }
+
+  vpx_codec_err_t DecodeRemainingFrames() {  // Decodes until cxdata() is NULL.
+    for (; video_->cxdata() != NULL; video_->Next()) {
+      const vpx_codec_err_t res =
+          decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+      if (res != VPX_CODEC_OK)
+        return res;  // First failure aborts the loop.
+      CheckDecodedFrames();
+    }
+    return VPX_CODEC_OK;
+  }
+
+ private:
+  void CheckDecodedFrames() {  // Passes every decoded image to fb_list_.
+    libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
+    const vpx_image_t *img = NULL;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next()) != NULL) {
+      fb_list_.CheckXImageFrameBuffer(img);
+    }
+  }
+
+  libvpx_test::WebMVideoSource *video_;  // Owned; created in SetUp().
+  libvpx_test::VP9Decoder *decoder_;  // Owned; created in SetUp().
+  int num_buffers_;  // Last positive count given to SetFrameBufferFunctions().
+  ExternalFrameBufferList fb_list_;  // Its address is passed to the decoder.
+};
+#endif  // CONFIG_WEBM_IO
+
+// This test runs through the set of test vectors, and decodes them.
+// Libvpx will call into the application to allocate a frame buffer when
+// needed. The md5 checksums are computed for each frame in the video file.
+// If md5 checksums match the correct md5 data, then the test is passed.
+// Otherwise, the test failed.
+TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) {
+  const std::string filename = GET_PARAM(kVideoNameParam);
+  libvpx_test::CompressedVideoSource *video = NULL;
+
+  // Number of buffers: VP9 ref buffers + work buffers + four jitter buffers.
+  const int jitter_buffers = 4;
+  const int num_buffers =
+      VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+  set_num_buffers(num_buffers);
+
+#if CONFIG_VP8_DECODER
+  // Tell compiler we are not using kVP8TestVectors.
+  (void)libvpx_test::kVP8TestVectors;
+#endif
+
+  // Open compressed video file; the length guard keeps substr() in range.
+  const size_t len = filename.length();
+  if (len >= 3 && filename.substr(len - 3, 3) == "ivf") {
+    video = new libvpx_test::IVFVideoSource(filename);
+  } else {
+#if CONFIG_WEBM_IO
+    video = new libvpx_test::WebMVideoSource(filename);
+#else
+    fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+            filename.c_str());
+    return;
+#endif
+  }
+  ASSERT_TRUE(video != NULL);
+  video->Init();
+
+  // Open the md5 file that sits next to the test vector.
+  OpenMD5File(filename + ".md5");
+
+  // Decode and md5-check every frame. The assertions inside RunLoop() record
+  // any failure, so free |video| unconditionally rather than returning early.
+  RunLoop(video);
+  delete video;
+}
+
+#if CONFIG_WEBM_IO
+TEST_F(ExternalFrameBufferTest, MinFrameBuffers) {
+  // Minimum number of external frame buffers for VP9 is
+  // #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS.
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames());  // Whole clip decodes.
+}
+
+TEST_F(ExternalFrameBufferTest, EightJitterBuffers) {
+  // Number of buffers equals #VP9_MAXIMUM_REF_BUFFERS +
+  // #VPX_MAXIMUM_WORK_BUFFERS + eight jitter buffers.
+  const int jitter_buffers = 8;
+  const int num_buffers =
+      VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames());  // Whole clip decodes.
+}
+
+TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) {
+  // Minimum number of external frame buffers for VP9 is
+  // #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS. Most files will
+  // only use 5 frame buffers at one time.
+  const int num_buffers = 2;  // Fewer than the minimum described above.
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame());  // First frame still decodes.
+  ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames());  // Then it fails.
+}
+
+TEST_F(ExternalFrameBufferTest, NoRelease) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer,
+                                    do_not_release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame());  // First frame still decodes.
+  ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames());  // Then it fails.
+}
+
+TEST_F(ExternalFrameBufferTest, NullRealloc) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(num_buffers, get_vp9_zero_frame_buffer,
+                                    release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeOneFrame());  // Fails on frame one.
+}
+
+TEST_F(ExternalFrameBufferTest, ReallocOneLessByte) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_one_less_byte_frame_buffer,
+                release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeOneFrame());  // Fails on frame one.
+}
+
+TEST_F(ExternalFrameBufferTest, NullGetFunction) {  // NULL get cb rejected.
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_INVALID_PARAM,
+            SetFrameBufferFunctions(num_buffers, NULL,
+                                    release_vp9_frame_buffer));
+}
+
+TEST_F(ExternalFrameBufferTest, NullReleaseFunction) {  // NULL cb rejected.
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_INVALID_PARAM,
+            SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, NULL));
+}
+
+TEST_F(ExternalFrameBufferTest, SetAfterDecode) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame());  // Decode before registering.
+  ASSERT_EQ(VPX_CODEC_ERROR,
+            SetFrameBufferFunctions(
+                num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer));
+}
+#endif  // CONFIG_WEBM_IO
+
+VP9_INSTANTIATE_TEST_CASE(ExternalFrameBufferMD5Test,  // One per VP9 vector.
+                          ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
+                                              libvpx_test::kVP9TestVectors +
+                                              libvpx_test::kNumVP9TestVectors));
+}  // namespace
diff --git a/libs/libvpx/test/fdct4x4_test.cc b/libs/libvpx/test/fdct4x4_test.cc
new file mode 100644
index 0000000000..0c91aee214
--- /dev/null
+++ b/libs/libvpx/test/fdct4x4_test.cc
@@ -0,0 +1,554 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+const int kNumCoeffs = 16;  // Coefficients in one 4x4 block.
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
+                        int tx_type);
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type);
+// Test parameters: (forward transform, inverse transform, tx_type, bit depth).
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param;
+
+void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                 int /*tx_type*/) {
+  vpx_fdct4x4_c(in, out, stride);  // FhtFunc-shaped adapter; tx_type unused.
+}
+
+void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+  vp9_fht4x4_c(in, out, stride, tx_type);  // C implementation as reference.
+}
+
+void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                 int /*tx_type*/) {
+  vp9_fwht4x4_c(in, out, stride);  // FhtFunc-shaped adapter; tx_type unused.
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct4x4_16_add_c(in, out, stride, 10);  // Used with VPX_BITS_10.
+}
+
+void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct4x4_16_add_c(in, out, stride, 12);  // Used with VPX_BITS_12.
+}
+
+void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12);
+}
+
+void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_iwht4x4_16_add_c(in, out, stride, 10);  // Used with VPX_BITS_10.
+}
+
+void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_iwht4x4_16_add_c(in, out, stride, 12);  // Used with VPX_BITS_12.
+}
+
+#if HAVE_SSE2
+void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 10);
+}
+
+void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 12);
+}
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+class Trans4x4TestBase {  // Shared checks for the 4x4 transform fixtures.
+ public:
+  virtual ~Trans4x4TestBase() {}
+
+ protected:
+  virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
+
+  virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
+
+  void RunAccuracyCheck(int limit) {  // Round-trip squared error <= limit.
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    uint32_t max_error = 0;
+    int64_t total_error = 0;
+    const int count_test_block = 10000;
+    for (int i = 0; i < count_test_block; ++i) {
+      DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
+      DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+      DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+#endif
+
+      // Fill the input with src - dst; the 8-bit range is [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
+      }
+
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
+                                          test_temp_block, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block,
+                                            CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        ASSERT_EQ(VPX_BITS_8, bit_depth_);
+        const uint32_t diff = dst[j] - src[j];
+#endif
+        const uint32_t error = diff * diff;  // Squared reconstruction error.
+        if (max_error < error)
+          max_error = error;
+        total_error += error;
+      }
+    }
+
+    EXPECT_GE(static_cast<uint32_t>(limit), max_error)
+        << "Error: 4x4 FHT/IHT has an individual round trip error > "
+        << limit;
+
+    EXPECT_GE(count_test_block * limit, total_error)
+        << "Error: 4x4 FHT/IHT has average round trip error > " << limit
+        << " per block";
+  }
+
+  void RunCoeffCheck() {  // Forward transform must equal fwd_txfm_ref.
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+    DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+
+      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
+
+      // Every coefficient must match the reference output exactly.
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_EQ(output_block[j], output_ref_block[j]);
+    }
+  }
+
+  void RunMemCheck() {  // Extreme inputs: reference match and range bound.
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with the extreme values -mask_ and +mask_.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
+      }
+      if (i == 0) {  // First two iterations: all-max, then all-min block.
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = mask_;
+      } else if (i == 1) {
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = -mask_;
+      }
+
+      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
+                                          output_block, pitch_));
+
+      // The minimum quant value is 4.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        EXPECT_EQ(output_block[j], output_ref_block[j]);
+        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
+            << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+      }
+    }
+  }
+
+  void RunInvAccuracyCheck(int limit) {  // Reference forward, tested inverse.
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+#endif
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          in[j] = src16[j] - dst16[j];
+#endif
+        }
+      }
+
+      fwd_txfm_ref(in, coeff, pitch_, tx_type_);
+
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            pitch_));
+#endif
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        const uint32_t diff = dst[j] - src[j];
+#endif
+        const uint32_t error = diff * diff;  // Squared reconstruction error.
+        EXPECT_GE(static_cast<uint32_t>(limit), error)
+            << "Error: 4x4 IDCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+
+  int pitch_;  // Stride handed to every transform call.
+  int tx_type_;  // Transform type handed to fwd_txfm_ref.
+  FhtFunc fwd_txfm_ref;  // Reference forward transform.
+  vpx_bit_depth_t bit_depth_;  // Selects the 8-bit or 16-bit pixel buffers.
+  int mask_;  // Bounds the magnitude of the generated samples.
+};
+
+class Trans4x4DCT
+    : public Trans4x4TestBase,
+      public ::testing::TestWithParam<Dct4x4Param> {
+ public:
+  virtual ~Trans4x4DCT() {}
+
+  virtual void SetUp() {  // Unpacks (fwd, inv, tx_type, bit depth).
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 4;
+    fwd_txfm_ref = fdct4x4_ref;  // This reference ignores tx_type.
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;  // Largest sample value at this depth.
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride);  // Forward transform under test.
+  }
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);  // Inverse transform under test.
+  }
+
+  FdctFunc fwd_txfm_;  // From test parameter 0.
+  IdctFunc inv_txfm_;  // From test parameter 1.
+};
+
+TEST_P(Trans4x4DCT, AccuracyCheck) {
+  RunAccuracyCheck(1);  // Squared round-trip error limit of 1.
+}
+
+TEST_P(Trans4x4DCT, CoeffCheck) {
+  RunCoeffCheck();  // Compared against fdct4x4_ref.
+}
+
+TEST_P(Trans4x4DCT, MemCheck) {
+  RunMemCheck();  // Extreme-value inputs.
+}
+
+TEST_P(Trans4x4DCT, InvAccuracyCheck) {
+  RunInvAccuracyCheck(1);  // Squared error limit of 1 per sample.
+}
+
+class Trans4x4HT
+    : public Trans4x4TestBase,
+      public ::testing::TestWithParam<Ht4x4Param> {
+ public:
+  virtual ~Trans4x4HT() {}
+
+  virtual void SetUp() {  // Unpacks (fwd, inv, tx_type, bit depth).
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 4;
+    fwd_txfm_ref = fht4x4_ref;  // This reference honours tx_type.
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;  // Largest sample value at this depth.
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);  // tx_type_ comes from parameter 2.
+  }
+
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);  // Same tx_type_ as the forward.
+  }
+
+  FhtFunc fwd_txfm_;  // From test parameter 0.
+  IhtFunc inv_txfm_;  // From test parameter 1.
+};
+
+TEST_P(Trans4x4HT, AccuracyCheck) {
+  RunAccuracyCheck(1);  // Squared round-trip error limit of 1.
+}
+
+TEST_P(Trans4x4HT, CoeffCheck) {
+  RunCoeffCheck();  // Compared against fht4x4_ref.
+}
+
+TEST_P(Trans4x4HT, MemCheck) {
+  RunMemCheck();  // Extreme-value inputs.
+}
+
+TEST_P(Trans4x4HT, InvAccuracyCheck) {
+  RunInvAccuracyCheck(1);  // Squared error limit of 1 per sample.
+}
+
+class Trans4x4WHT
+    : public Trans4x4TestBase,
+      public ::testing::TestWithParam<Dct4x4Param> {
+ public:
+  virtual ~Trans4x4WHT() {}
+
+  virtual void SetUp() {  // Unpacks (fwd, inv, tx_type, bit depth).
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 4;
+    fwd_txfm_ref = fwht4x4_ref;  // This reference ignores tx_type.
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;  // Largest sample value at this depth.
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride);  // Forward transform under test.
+  }
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);  // Inverse transform under test.
+  }
+
+  FdctFunc fwd_txfm_;  // From test parameter 0.
+  IdctFunc inv_txfm_;  // From test parameter 1.
+};
+
+TEST_P(Trans4x4WHT, AccuracyCheck) {
+  RunAccuracyCheck(0);  // Limit 0: the round trip must be exact.
+}
+
+TEST_P(Trans4x4WHT, CoeffCheck) {
+  RunCoeffCheck();  // Compared against fwht4x4_ref.
+}
+
+TEST_P(Trans4x4WHT, MemCheck) {
+  RunMemCheck();  // Extreme-value inputs.
+}
+
+TEST_P(Trans4x4WHT, InvAccuracyCheck) {
+  RunInvAccuracyCheck(0);  // Limit 0: reconstruction must be exact.
+}
+using std::tr1::make_tuple;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
+#else  // !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+#else  // !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+#else  // !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    NEON, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct4x4_c,  // C forward paired with the NEON inverse.
+                   &vpx_idct4x4_16_add_neon, 0, VPX_BITS_8)));
+#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(  // C forward paired with the NEON inverse.
+    NEON, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
+#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    MMX, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_fwht4x4_mmx, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+#endif  // MMX Trans4x4WHT
+
+#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
+#endif  // SSE2 Trans4x4WHT
+
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct4x4_sse2,
+                   &vpx_idct4x4_16_add_sse2, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
+#endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(  // 8-bit entry: SSE2 forward with the C inverse.
+    SSE2, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_fdct4x4_c,    &idct4x4_10_sse2, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct4x4_c,    &idct4x4_12_sse2, 0, VPX_BITS_12),
+        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12),
+        make_tuple(&vpx_fdct4x4_sse2,      &vpx_idct4x4_16_add_c, 0,
+                   VPX_BITS_8)));
+
+INSTANTIATE_TEST_CASE_P(  // SSE2 forward with the C inverse, 8-bit only.
+    SSE2, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(  // MSA forward and inverse, 8-bit only.
+    MSA, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct4x4_msa, &vpx_idct4x4_16_add_msa, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(  // One entry per tx_type value 0..3.
+    MSA, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+}  // namespace
diff --git a/libs/libvpx/test/fdct8x8_test.cc b/libs/libvpx/test/fdct8x8_test.cc
new file mode 100644
index 0000000000..edf4682169
--- /dev/null
+++ b/libs/libvpx/test/fdct8x8_test.cc
@@ -0,0 +1,791 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+const int kNumCoeffs = 64;
+const double kPi = 3.141592653589793238462643383279502884;
+
+const int kSignBiasMaxDiff255 = 1500;
+const int kSignBiasMaxDiff15 = 10000;
+
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
+                        int tx_type);
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type);
+
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
+typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
+
+// 1-D 8-point reference DCT-II: out[k] = sum_n in[n] * cos(pi*(2n+1)*k/16).
+void reference_8x8_dct_1d(const double in[8], double out[8]) {
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int freq = 0; freq < 8; ++freq) {
+    out[freq] = 0.0;
+    for (int pos = 0; pos < 8; ++pos)
+      out[freq] += in[pos] * cos(kPi * (2 * pos + 1) * freq / 16.0);
+    if (freq == 0) out[freq] *= kInvSqrt2;  // DC term scaled by 1/sqrt(2)
+  }
+}
+
+// 2-D reference forward DCT of an 8x8 block: a column pass followed by a row
+// pass whose results are doubled on write-back.
+void reference_8x8_dct_2d(const int16_t input[kNumCoeffs],
+                          double output[kNumCoeffs]) {
+  double line_in[8], line_out[8];
+
+  // Pass 1: transform every input column into the matching output column.
+  for (int col = 0; col < 8; ++col) {
+    for (int row = 0; row < 8; ++row) line_in[row] = input[row * 8 + col];
+    reference_8x8_dct_1d(line_in, line_out);
+    for (int row = 0; row < 8; ++row) output[row * 8 + col] = line_out[row];
+  }
+
+  // Pass 2: transform every row of the intermediate result in place,
+  // applying the final x2 scale as the coefficients are stored.
+  for (int row = 0; row < 8; ++row) {
+    double *const row_ptr = output + row * 8;
+    for (int col = 0; col < 8; ++col) line_in[col] = row_ptr[col];
+    reference_8x8_dct_1d(line_in, line_out);
+    for (int col = 0; col < 8; ++col) row_ptr[col] = line_out[col] * 2;
+  }
+}
+
+
+void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride,
+                 int /*tx_type*/) {
+  vpx_fdct8x8_c(in, out, stride);  // tx_type unused; kept to match FhtFunc
+}
+
+void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+  vp9_fht8x8_c(in, out, stride, tx_type);  // C version used as the reference
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH  // wrappers binding the trailing 10/12 argument
+void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_64_add_c(in, out, stride, 10);
+}
+
+void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_64_add_c(in, out, stride, 12);
+}
+
+void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
+}
+
+#if HAVE_SSE2
+// As above: each wrapper pins the trailing 10/12 argument to fit IdctFunc.
+void idct8x8_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_10_add_c(in, out, stride, 10);
+}
+
+void idct8x8_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_10_add_c(in, out, stride, 12);
+}
+
+void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_10_add_sse2(in, out, stride, 10);
+}
+
+void idct8x8_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_10_add_sse2(in, out, stride, 12);
+}
+
+void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 10);
+}
+
+void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 12);
+}
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+class FwdTrans8x8TestBase {
+ public:
+  virtual ~FwdTrans8x8TestBase() {}
+
+ protected:
+  virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
+  virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;
+
+  // Verifies that the forward transform has no systematic sign bias: over
+  // count_test_block random blocks, each of the 64 coefficient positions
+  // must come out negative about as often as positive.
+  //
+  // Two passes are run back to back on the same RNG stream:
+  //   pass 0 - full-range samples in [-mask_, mask_],
+  //            tolerance kSignBiasMaxDiff255;
+  //   pass 1 - small samples in [-(mask_ >> 4), mask_ >> 4],
+  //            tolerance kSignBiasMaxDiff15.
+  // Both tolerances are scaled by 2^(bit_depth_ - 8). The failure message
+  // reports the bound and the input range that were actually used, so it is
+  // also accurate for 10- and 12-bit instantiations.
+  void RunSignBiasCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    DECLARE_ALIGNED(16, int16_t, test_input_block[64]);
+    DECLARE_ALIGNED(16, tran_low_t, test_output_block[64]);
+    int count_sign_block[64][2];  // [position][0: negative, 1: positive]
+    const int count_test_block = 100000;
+    const int pass_max_diff[2] = { kSignBiasMaxDiff255, kSignBiasMaxDiff15 };
+
+    for (int pass = 0; pass < 2; ++pass) {
+      // Largest magnitude an input sample can take in this pass.
+      const int range = (pass == 0) ? mask_ : (mask_ >> 4);
+      // Bound enforced (and reported) for this pass at this bit depth.
+      const int max_diff = pass_max_diff[pass] << (bit_depth_ - 8);
+
+      memset(count_sign_block, 0, sizeof(count_sign_block));
+
+      for (int i = 0; i < count_test_block; ++i) {
+        // Each sample is the difference of two random draws.
+        for (int j = 0; j < 64; ++j) {
+          if (pass == 0) {
+            test_input_block[j] =
+                ((rnd.Rand16() >> (16 - bit_depth_)) & mask_) -
+                ((rnd.Rand16() >> (16 - bit_depth_)) & mask_);
+          } else {
+            test_input_block[j] = ((rnd.Rand16() & mask_) >> 4) -
+                                  ((rnd.Rand16() & mask_) >> 4);
+          }
+        }
+        ASM_REGISTER_STATE_CHECK(
+            RunFwdTxfm(test_input_block, test_output_block, pitch_));
+
+        // Tally signs per position; exact zeros count for neither side.
+        for (int j = 0; j < 64; ++j) {
+          if (test_output_block[j] < 0)
+            ++count_sign_block[j][0];
+          else if (test_output_block[j] > 0)
+            ++count_sign_block[j][1];
+        }
+      }
+
+      // Compare the negative and positive tallies for every position.
+      for (int j = 0; j < 64; ++j) {
+        const int diff =
+            abs(count_sign_block[j][0] - count_sign_block[j][1]);
+        EXPECT_LT(diff, max_diff)
+            << "Error: 8x8 FDCT/FHT has a sign bias > "
+            << 1. * max_diff / count_test_block * 100 << "%"
+            << " for input range [" << -range << ", " << range << "]"
+            << " at index " << j
+            << " count0: " << count_sign_block[j][0]
+            << " count1: " << count_sign_block[j][1]
+            << " diff: " << diff;
+      }
+    }
+  }
+
+  // Round-trip check: random residual -> forward transform -> coefficients
+  // snapped to multiples of 4 -> inverse transform on dst, which must then
+  // match src. Error bounds are scaled by 4^(bit_depth_ - 8).
+  void RunRoundTripErrorCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 100000;
+    const bool is_8bit = (bit_depth_ == VPX_BITS_8);
+    int max_error = 0;
+    int total_error = 0;
+    DECLARE_ALIGNED(16, int16_t, test_input_block[64]);
+    DECLARE_ALIGNED(16, tran_low_t, test_temp_block[64]);
+    DECLARE_ALIGNED(16, uint8_t, dst[64]);
+    DECLARE_ALIGNED(16, uint8_t, src[64]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[64]);
+    DECLARE_ALIGNED(16, uint16_t, src16[64]);
+#endif
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Random src and dst; the transform input is their difference.
+      for (int j = 0; j < 64; ++j) {
+        if (is_8bit) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
+      }
+
+      ASM_REGISTER_STATE_CHECK(
+          RunFwdTxfm(test_input_block, test_temp_block, pitch_));
+
+      // Snap every coefficient to a multiple of 4: bias by 2 away from zero,
+      // then let the integer division truncate (ties round away from zero).
+      for (int j = 0; j < 64; ++j) {
+        tran_low_t &coeff = test_temp_block[j];
+        coeff += (coeff > 0) ? 2 : -2;
+        coeff /= 4;
+        coeff *= 4;
+      }
+
+      if (is_8bit) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
+
+      for (int j = 0; j < 64; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const int diff = is_8bit ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        const int diff = dst[j] - src[j];
+#endif
+        const int error = diff * diff;
+        if (error > max_error) max_error = error;
+        total_error += error;
+      }
+    }
+
+    EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
+      << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual"
+      << " roundtrip error > 1";
+
+    EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8))/5, total_error)
+      << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
+      << "error > 1/5 per block";
+  }
+
+  // Extremal-input check: every src/dst pixel is 0 or the peak value (block 0
+  // is all +peak residual, block 1 all -peak, the rest random). Coefficients
+  // must equal fwd_txfm_ref exactly; totals are asserted after every block.
+  void RunExtremalCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    int max_error = 0;
+    int total_error = 0;
+    int total_coeff_error = 0;
+    const int count_test_block = 100000;
+    const bool is_8bit = (bit_depth_ == VPX_BITS_8);
+    DECLARE_ALIGNED(16, int16_t, test_input_block[64]);
+    DECLARE_ALIGNED(16, tran_low_t, test_temp_block[64]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_temp_block[64]);
+    DECLARE_ALIGNED(16, uint8_t, dst[64]);
+    DECLARE_ALIGNED(16, uint8_t, src[64]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[64]);
+    DECLARE_ALIGNED(16, uint16_t, src16[64]);
+#endif
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Fill src/dst with extremes; the transform input is src - dst.
+      for (int j = 0; j < 64; ++j) {
+        if (is_8bit) {
+          if (i == 0) {
+            src[j] = 255;
+            dst[j] = 0;
+          } else if (i == 1) {
+            src[j] = 0;
+            dst[j] = 255;
+          } else {
+            src[j] = rnd.Rand8() % 2 ? 255 : 0;
+            dst[j] = rnd.Rand8() % 2 ? 255 : 0;
+          }
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          if (i == 0) {
+            src16[j] = mask_;
+            dst16[j] = 0;
+          } else if (i == 1) {
+            src16[j] = 0;
+            dst16[j] = mask_;
+          } else {
+            src16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+            dst16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+          }
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
+      }
+
+      ASM_REGISTER_STATE_CHECK(
+          RunFwdTxfm(test_input_block, test_temp_block, pitch_));
+      ASM_REGISTER_STATE_CHECK(
+          fwd_txfm_ref(test_input_block, ref_temp_block, pitch_, tx_type_));
+      if (is_8bit) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
+
+      for (int j = 0; j < 64; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const int diff = is_8bit ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        const int diff = dst[j] - src[j];
+#endif
+        const int error = diff * diff;
+        if (max_error < error) max_error = error;
+        total_error += error;
+        const int coeff_diff = test_temp_block[j] - ref_temp_block[j];
+        total_coeff_error += abs(coeff_diff);
+      }
+
+      EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
+          << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has"
+          << " an individual roundtrip error > 1";
+
+      EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8))/5, total_error)
+          << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
+          << " roundtrip error > 1/5 per block";
+
+      EXPECT_EQ(0, total_coeff_error)
+          << "Error: Extremal 8x8 FDCT/FHT has"
+          << " overflow issues in the intermediate steps > 1";
+    }
+  }
+
+  // Inverse accuracy check: residual samples are +/-255 (8-bit) or +/-mask_;
+  // coefficients come from the rounded floating-point reference DCT, and each
+  // pixel rebuilt by the inverse transform must match src within the bound.
+  void RunInvAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+#endif
+    const bool is_8bit = (bit_depth_ == VPX_BITS_8);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      double out_r[kNumCoeffs];
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (is_8bit) {
+          src[j] = rnd.Rand8() % 2 ? 255 : 0;
+          dst[j] = src[j] > 0 ? 0 : 255;
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+          dst16[j] = src16[j] > 0 ? 0 : mask_;
+          in[j] = src16[j] - dst16[j];
+#endif
+        }
+      }
+      reference_8x8_dct_2d(in, out_r);
+      for (int j = 0; j < kNumCoeffs; ++j)
+        coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
+
+      if (is_8bit) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff = is_8bit ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        const uint32_t diff = dst[j] - src[j];
+#endif
+        const uint32_t error = diff * diff;
+        EXPECT_GE(1u << 2 * (bit_depth_ - 8), error)
+            << "Error: 8x8 IDCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+
+  // Forward-transform accuracy: every input sample is +mask_ or -mask_ and
+  // the output of the transform under test is compared with the rounded
+  // floating-point reference DCT. Squared error per coefficient is bounded by
+  // 9 * 4^(bit_depth_ - 8).
+  void RunFwdAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      double out_r[kNumCoeffs];
+      for (int j = 0; j < kNumCoeffs; ++j)
+        in[j] = rnd.Rand8() % 2 == 0 ? mask_ : -mask_;
+
+      RunFwdTxfm(in, coeff, pitch_);
+      reference_8x8_dct_2d(in, out_r);
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        const tran_low_t coeff_r = static_cast<tran_low_t>(round(out_r[j]));
+        const uint32_t diff = coeff[j] - coeff_r;
+        const uint32_t error = diff * diff;
+        EXPECT_GE(9u << 2 * (bit_depth_ - 8), error)
+            << "Error: 8x8 DCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+
+  // Cross-checks the inverse transform under test against ref_txfm on sparse
+  // input: only the first `eob` positions of the TX_8X8 default scan are
+  // non-zero, each drawn as rnd(thresh) and negated on odd blocks. Both
+  // outputs start from zero and must match exactly.
+  void CompareInvReference(IdctFunc ref_txfm, int thresh) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 12;
+    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif
+    const int16_t *scan = vp9_default_scan_orders[TX_8X8].scan;
+    const bool is_8bit = (bit_depth_ == VPX_BITS_8);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      const int sign = (i % 2) ? -1 : 1;  // odd blocks get negated values
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        coeff[scan[j]] = (j < eob) ? rnd(thresh) * sign : 0;
+        if (is_8bit) {
+          dst[j] = 0;
+          ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          dst16[j] = 0;
+          ref16[j] = 0;
+#endif
+        }
+      }
+      if (is_8bit) {
+        ref_txfm(coeff, ref, pitch_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff = is_8bit ? dst[j] - ref[j] : dst16[j] - ref16[j];
+#else
+        const uint32_t diff = dst[j] - ref[j];
+#endif
+        const uint32_t error = diff * diff;
+        EXPECT_EQ(0u, error)
+            << "Error: 8x8 IDCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+  int pitch_;                  // stride handed to the transforms under test
+  int tx_type_;                // transform type forwarded to fwd_txfm_ref
+  FhtFunc fwd_txfm_ref;        // reference forward transform (extremal check)
+  vpx_bit_depth_t bit_depth_;  // bit depth selected by the fixture's SetUp()
+  int mask_;                   // peak sample value used to generate inputs
+};
+
+// Fixture for (forward DCT, inverse DCT, tx_type, bit depth) parameter tuples.
+class FwdTrans8x8DCT : public FwdTrans8x8TestBase,
+                       public ::testing::TestWithParam<Dct8x8Param> {
+ public:
+  virtual ~FwdTrans8x8DCT() {}
+
+  virtual void SetUp() {
+    pitch_       = 8;
+    fwd_txfm_ref = fdct8x8_ref;
+    fwd_txfm_    = GET_PARAM(0);
+    inv_txfm_    = GET_PARAM(1);
+    tx_type_     = GET_PARAM(2);
+    bit_depth_   = GET_PARAM(3);
+    mask_        = (1 << bit_depth_) - 1;  // all-ones value at this bit depth
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride);
+  }
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+
+  FdctFunc fwd_txfm_;  // forward transform under test
+  IdctFunc inv_txfm_;  // inverse transform under test
+};
+
+TEST_P(FwdTrans8x8DCT, SignBiasCheck) {
+  RunSignBiasCheck();  // balance of negative vs. positive coefficients
+}
+
+TEST_P(FwdTrans8x8DCT, RoundTripErrorCheck) {
+  RunRoundTripErrorCheck();  // forward then inverse must rebuild the source
+}
+
+TEST_P(FwdTrans8x8DCT, ExtremalCheck) {
+  RunExtremalCheck();  // extreme inputs, compared with the reference transform
+}
+
+TEST_P(FwdTrans8x8DCT, FwdAccuracyCheck) {
+  RunFwdAccuracyCheck();  // forward output vs. floating-point reference DCT
+}
+
+TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) {
+  RunInvAccuracyCheck();  // inverse of reference coefficients vs. source
+}
+
+// Fixture for (forward HT, inverse HT, tx_type, bit depth) parameter tuples.
+class FwdTrans8x8HT : public FwdTrans8x8TestBase,
+                      public ::testing::TestWithParam<Ht8x8Param> {
+ public:
+  virtual ~FwdTrans8x8HT() {}
+
+  virtual void SetUp() {
+    pitch_       = 8;
+    fwd_txfm_ref = fht8x8_ref;
+    fwd_txfm_    = GET_PARAM(0);
+    inv_txfm_    = GET_PARAM(1);
+    tx_type_     = GET_PARAM(2);
+    bit_depth_   = GET_PARAM(3);
+    mask_        = (1 << bit_depth_) - 1;  // all-ones value at this bit depth
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);
+  }
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);
+  }
+
+  FhtFunc fwd_txfm_;  // forward transform under test
+  IhtFunc inv_txfm_;  // inverse transform under test
+};
+
+TEST_P(FwdTrans8x8HT, SignBiasCheck) {
+  RunSignBiasCheck();  // balance of negative vs. positive coefficients
+}
+
+TEST_P(FwdTrans8x8HT, RoundTripErrorCheck) {
+  RunRoundTripErrorCheck();  // forward then inverse must rebuild the source
+}
+
+TEST_P(FwdTrans8x8HT, ExtremalCheck) {
+  RunExtremalCheck();  // extreme inputs, compared with the reference transform
+}
+
+// Fixture for (reference IDCT, IDCT under test, threshold, bit depth) tuples.
+class InvTrans8x8DCT : public FwdTrans8x8TestBase,
+                       public ::testing::TestWithParam<Idct8x8Param> {
+ public:
+  virtual ~InvTrans8x8DCT() {}
+
+  virtual void SetUp() {
+    pitch_     = 8;
+    ref_txfm_  = GET_PARAM(0);
+    inv_txfm_  = GET_PARAM(1);
+    thresh_    = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
+    mask_      = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
+
+  IdctFunc ref_txfm_;  // reference inverse transform
+  IdctFunc inv_txfm_;  // inverse transform under test
+  int thresh_;         // argument passed to rnd() when drawing coefficients
+};
+
+TEST_P(InvTrans8x8DCT, CompareReference) {
+  CompareInvReference(ref_txfm_, thresh_);  // outputs must match exactly
+}
+
+using std::tr1::make_tuple;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    C, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 0, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    C, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    NEON, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 0,
+                   VPX_BITS_8)));
+#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    NEON, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
+#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 0,
+                   VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
+#endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vpx_highbd_fdct8x8_c,
+                   &idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct8x8_sse2,
+                   &idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct8x8_c,
+                   &idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
+        make_tuple(&vpx_highbd_fdct8x8_sse2,
+                   &idct8x8_64_add_12_sse2, 12, VPX_BITS_12)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
+
+// Optimizations take effect at a threshold of 6201, so we use a value close to
+// that to test both branches.
+INSTANTIATE_TEST_CASE_P(
+    SSE2, InvTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&idct8x8_10_add_10_c,
+                   &idct8x8_10_add_10_sse2, 6225, VPX_BITS_10),
+        make_tuple(&idct8x8_10,
+                   &idct8x8_64_add_10_sse2, 6225, VPX_BITS_10),
+        make_tuple(&idct8x8_10_add_12_c,
+                   &idct8x8_10_add_12_sse2, 6225, VPX_BITS_12),
+        make_tuple(&idct8x8_12,
+                   &idct8x8_64_add_12_sse2, 6225, VPX_BITS_12)));
+#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64 && \
+    !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct8x8_ssse3, &vpx_idct8x8_64_add_ssse3, 0,
+                   VPX_BITS_8)));
+#endif
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    MSA, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vpx_fdct8x8_msa, &vpx_idct8x8_64_add_msa, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    MSA, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 3, VPX_BITS_8)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+}  // namespace
diff --git a/libs/libvpx/test/frame_size_tests.cc b/libs/libvpx/test/frame_size_tests.cc
new file mode 100644
index 0000000000..d39c8f6ee9
--- /dev/null
+++ b/libs/libvpx/test/frame_size_tests.cc
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/video_source.h"
+
+namespace {
+
+// VP9 encoder fixture; each test sets expected_res_ before calling RunLoop().
+class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest,
+                               public ::testing::Test {
+ protected:
+  VP9FrameSizeTestsLarge() : EncoderTest(&::libvpx_test::kVP9),
+                             expected_res_(VPX_CODEC_OK) {}
+  virtual ~VP9FrameSizeTestsLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  // Compares res_dec with expected_res_; false once any failure is recorded.
+  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                                  const libvpx_test::VideoSource& /*video*/,
+                                  libvpx_test::Decoder *decoder) {
+    EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
+    return !::testing::Test::HasFailure();
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() != 1) return;  // controls are set once, on frame 1
+    encoder->Control(VP8E_SET_CPUUSED, 7);
+    encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+    encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+    encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+    encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+  }
+
+  int expected_res_;
+};
+
+TEST_F(VP9FrameSizeTestsLarge, TestInvalidSizes) {
+  ::libvpx_test::RandomVideoSource video;
+  // With CONFIG_SIZE_LIMIT only: over-limit frames must decode as corrupt.
+#if CONFIG_SIZE_LIMIT
+  video.SetSize(DECODE_WIDTH_LIMIT + 16, DECODE_HEIGHT_LIMIT + 16);
+  video.set_limit(2);
+  expected_res_ = VPX_CODEC_CORRUPT_FRAME;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#endif
+}
+
+TEST_F(VP9FrameSizeTestsLarge, ValidSizes) {
+  ::libvpx_test::RandomVideoSource video;
+
+#if CONFIG_SIZE_LIMIT
+  video.SetSize(DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
+  video.set_limit(2);
+  expected_res_ = VPX_CODEC_OK;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#else
+  // This test produces a pretty large single frame allocation (roughly
+  // 25 megabytes). The encoder allocates a good number of these frames:
+  // one for each lag in frames (for 2 pass), and then one for each possible
+  // reference buffer (8) - we can end up with up to 30 buffers of roughly this
+  // size, or almost 1 gig of memory.
+  // In total the allocations will exceed 2GiB, which may cause a failure with
+  // mingw + wine; use a smaller size in that case.
+#if defined(_WIN32) && !defined(_WIN64) || defined(__OS2__)
+  video.SetSize(4096, 3072);
+#else
+  video.SetSize(4096, 4096);
+#endif
+  video.set_limit(2);
+  expected_res_ = VPX_CODEC_OK;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#endif
+}
+
+TEST_F(VP9FrameSizeTestsLarge, OneByOneVideo) {
+  ::libvpx_test::RandomVideoSource video;
+  // A 1x1 source is expected to run through the loop with VPX_CODEC_OK.
+  video.SetSize(1, 1);
+  video.set_limit(2);
+  expected_res_ = VPX_CODEC_OK;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+}  // namespace
diff --git a/libs/libvpx/test/i420_video_source.h b/libs/libvpx/test/i420_video_source.h
new file mode 100644
index 0000000000..0a184805c2
--- /dev/null
+++ b/libs/libvpx/test/i420_video_source.h
@@ -0,0 +1,36 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_I420_VIDEO_SOURCE_H_
+#define TEST_I420_VIDEO_SOURCE_H_
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "test/yuv_video_source.h"
+
+namespace libvpx_test {
+
+// This class extends YUVVideoSource with the image format fixed to
+// VPX_IMG_FMT_I420, so that we can do actual file encodes from raw I420 input.
+class I420VideoSource : public YUVVideoSource {
+ public:
+  I420VideoSource(const std::string &file_name,
+                  unsigned int width, unsigned int height,
+                  int rate_numerator, int rate_denominator,
+                  unsigned int start, int limit)
+      : YUVVideoSource(file_name, VPX_IMG_FMT_I420,
+                       width, height,
+                       rate_numerator, rate_denominator,
+                       start, limit) {}
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_I420_VIDEO_SOURCE_H_
diff --git a/libs/libvpx/test/idct8x8_test.cc b/libs/libvpx/test/idct8x8_test.cc
new file mode 100644
index 0000000000..7f9d751d65
--- /dev/null
+++ b/libs/libvpx/test/idct8x8_test.cc
@@ -0,0 +1,101 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+#ifdef _MSC_VER
+// Compiled only when _MSC_VER is defined: rounds half away from zero and
+// returns the result as an int.
+static int round(double x) {
+  const double nearest = (x < 0) ? ceil(x - 0.5) : floor(x + 0.5);
+  return static_cast<int>(nearest);
+}
+#endif
+
+// Reference 8-point 1-D DCT-II; only the DC term is scaled by 1/sqrt(2).
+void reference_dct_1d(double input[8], double output[8]) {
+  const double kPi = 3.141592653589793238462643383279502884;
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int freq = 0; freq < 8; ++freq) {
+    output[freq] = 0.0;
+    for (int pos = 0; pos < 8; ++pos)
+      output[freq] += input[pos] * cos(kPi * (2 * pos + 1) * freq / 16.0);
+    if (freq == 0) output[freq] *= kInvSqrt2;
+  }
+}
+
+// 2-D reference DCT of a row-major 8x8 block: a 1-D pass down every column,
+// then a 1-D pass along every row, then every coefficient is doubled.
+void reference_dct_2d(int16_t input[64], double output[64]) {
+  double line_in[8], line_out[8];
+  // Column pass: reads |input|, writes |output|.
+  for (int col = 0; col < 8; ++col) {
+    for (int row = 0; row < 8; ++row)
+      line_in[row] = input[row * 8 + col];
+    reference_dct_1d(line_in, line_out);
+    for (int row = 0; row < 8; ++row)
+      output[row * 8 + col] = line_out[row];
+  }
+  // Row pass: transforms |output| in place.
+  for (int row = 0; row < 8; ++row) {
+    double *const row_ptr = output + row * 8;
+    for (int col = 0; col < 8; ++col)
+      line_in[col] = row_ptr[col];
+    reference_dct_1d(line_in, line_out);
+    for (int col = 0; col < 8; ++col)
+      row_ptr[col] = line_out[col];
+  }
+  for (int idx = 0; idx < 64; ++idx) output[idx] *= 2;  // final x2 scaling
+}
+
+TEST(VP9Idct8x8Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 10000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t input[64];
+    tran_low_t coeff[64];
+    double output_r[64];
+    uint8_t dst[64], src[64];
+    // Two independent random 8-bit blocks; |input| below is their difference.
+    for (int j = 0; j < 64; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 64; ++j)
+      input[j] = src[j] - dst[j];
+    // Floating-point reference forward DCT, rounded to integer coefficients.
+    reference_dct_2d(input, output_r);
+    for (int j = 0; j < 64; ++j)
+      coeff[j] = round(output_r[j]);
+    vpx_idct8x8_64_add_c(coeff, dst, 8);
+    for (int j = 0; j < 64; ++j) {
+      const int diff = dst[j] - src[j];
+      const int error = diff * diff;
+      EXPECT_GE(1, error)  // squared per-pixel error may be at most 1
+          << "Error: 8x8 FDCT/IDCT has error " << error
+          << " at index " << j;
+    }
+  }
+}
+
+}  // namespace
diff --git a/libs/libvpx/test/idct_test.cc b/libs/libvpx/test/idct_test.cc
new file mode 100644
index 0000000000..39db3e4c61
--- /dev/null
+++ b/libs/libvpx/test/idct_test.cc
@@ -0,0 +1,121 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "vpx/vpx_integer.h"
+
+typedef void (*IdctFunc)(int16_t *input, unsigned char *pred_ptr,
+                         int pred_stride, unsigned char *dst_ptr,
+                         int dst_stride);
+namespace {
+// Fixture: zeroed coefficients and a 16x16 output buffer with guard bytes.
+class IDCTTest : public ::testing::TestWithParam<IdctFunc> {
+ protected:
+  virtual void SetUp() {
+    UUT = GetParam();
+    memset(input, 0, sizeof(input));
+    // Guard bytes are 0xFF; only the top-left 4x4 (stride 16) starts at 0.
+    memset(output, 0xFF, sizeof(output));
+    for (int row = 0; row < 4; ++row) memset(output + row * 16, 0, 4);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+  IdctFunc UUT;
+  int16_t input[16];
+  unsigned char output[256];
+  unsigned char predict[256];
+};
+
+TEST_P(IDCTTest, TestGuardBlocks) {  // SetUp() alone: 0 in block, 255 outside
+  for (int i = 0; i < 256; ++i) {
+    if ((i & 0xF) >= 4 || i >= 64) {
+      EXPECT_EQ(255, output[i]);
+    } else {
+      EXPECT_EQ(0, output[i]) << i;
+    }
+  }
+}
+
+TEST_P(IDCTTest, TestAllZeros) {
+  // All-zero coefficients, predicted from |output| itself, must change nothing.
+  ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+  for (int i = 0; i < 256; ++i) {
+    if ((i & 0xF) >= 4 || i >= 64) {
+      EXPECT_EQ(255, output[i]) << "i==" << i;
+    } else {
+      EXPECT_EQ(0, output[i]) << "i==" << i;
+    }
+  }
+}
+
+TEST_P(IDCTTest, TestAllOnes) {
+  // A lone DC coefficient of 4 must turn every pixel of the 4x4 block into 1.
+  input[0] = 4;
+  ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+  for (int i = 0; i < 256; ++i) {
+    if ((i & 0xF) >= 4 || i >= 64) {
+      EXPECT_EQ(255, output[i]) << "i==" << i;
+    } else {
+      EXPECT_EQ(1, output[i]) << "i==" << i;
+    }
+  }
+}
+
+TEST_P(IDCTTest, TestAddOne) {
+  // The prediction holds i at index i; a DC of 4 must add 1 inside the block.
+  for (int i = 0; i < 256; ++i) predict[i] = i;
+  input[0] = 4;
+  ASM_REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
+  for (int i = 0; i < 256; ++i) {
+    if ((i & 0xF) >= 4 || i >= 64) {
+      EXPECT_EQ(255, output[i]) << "i==" << i;
+    } else {
+      EXPECT_EQ(i + 1, output[i]) << "i==" << i;
+    }
+  }
+}
+
+TEST_P(IDCTTest, TestWithData) {
+  // Coefficients 0..15, with |output| used as both prediction and result.
+  for (int i = 0; i < 16; ++i) input[i] = i;
+  ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+  for (int i = 0; i < 256; ++i) {
+    const bool in_block = (i & 0xF) < 4 && i < 64;
+    if (!in_block) {
+      EXPECT_EQ(255, output[i]) << "i==" << i;
+    } else if (i == 0) {
+      EXPECT_EQ(11, output[i]) << "i==" << i;
+    } else if (i == 34) {
+      EXPECT_EQ(1, output[i]) << "i==" << i;
+    } else if (i == 2 || i == 17 || i == 32) {
+      EXPECT_EQ(3, output[i]) << "i==" << i;
+    } else {
+      EXPECT_EQ(0, output[i]) << "i==" << i;
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c));
+#if HAVE_MMX
+INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_mmx));
+#endif  // HAVE_MMX
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_msa));
+#endif  // HAVE_MSA
+}  // namespace
diff --git a/libs/libvpx/test/invalid_file_test.cc b/libs/libvpx/test/invalid_file_test.cc
new file mode 100644
index 0000000000..f4241eb822
--- /dev/null
+++ b/libs/libvpx/test/invalid_file_test.cc
@@ -0,0 +1,182 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+struct DecodeParam {
+  int threads;           // copied into vpx_codec_dec_cfg_t::threads by RunTest()
+  const char *filename;  // compressed test vector handed to the video source
+};
+// Streams a DecodeParam as "threads: <n> file: <name>".
+std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) {
+  return os << "threads: " << dp.threads << " file: " << dp.filename;
+}
+
+// Checks per-frame decode return codes of invalid streams against .res files.
+class InvalidFileTest
+    : public ::libvpx_test::DecoderTest,
+      public ::libvpx_test::CodecTestWithParam<DecodeParam> {
+ protected:
+  InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(NULL) {}
+
+  virtual ~InvalidFileTest() {
+    if (res_file_ != NULL) fclose(res_file_);
+  }
+
+  void OpenResFile(const std::string &res_file_name_) {
+    res_file_ = libvpx_test::OpenTestDataFile(res_file_name_);
+    ASSERT_TRUE(res_file_ != NULL) << "Result file open failed. Filename: "
+        << res_file_name_;
+  }
+
+  virtual bool HandleDecodeResult(
+      const vpx_codec_err_t res_dec,
+      const libvpx_test::CompressedVideoSource &video,
+      libvpx_test::Decoder *decoder) {
+    EXPECT_TRUE(res_file_ != NULL);
+    if (res_file_ == NULL) return false;  // never hand NULL to fscanf()
+    // Read integer result; -1 is only a placeholder should the read fail.
+    int expected_res_dec = -1;
+    const int res = fscanf(res_file_, "%d", &expected_res_dec);
+    EXPECT_EQ(1, res) << "Read result data failed";
+
+    // Check results match.
+    const DecodeParam input = GET_PARAM(1);
+    if (input.threads > 1) {
+      // The serial decode check is too strict for tile-threaded decoding as
+      // there is no guarantee on the decode order nor which specific error
+      // will take precedence. Currently a tile-level error is not forwarded so
+      // the frame will simply be marked corrupt.
+      EXPECT_TRUE(res_dec == expected_res_dec ||
+                  res_dec == VPX_CODEC_CORRUPT_FRAME)
+          << "Results don't match: frame number = " << video.frame_number()
+          << ". (" << decoder->DecodeError() << "). Expected: "
+          << expected_res_dec << " or " << VPX_CODEC_CORRUPT_FRAME;
+    } else {
+      EXPECT_EQ(expected_res_dec, res_dec)
+          << "Results don't match: frame number = " << video.frame_number()
+          << ". (" << decoder->DecodeError() << ")";
+    }
+    return !HasFailure();
+  }
+
+  void RunTest() {
+    const DecodeParam input = GET_PARAM(1);
+    libvpx_test::CompressedVideoSource *video = NULL;
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    cfg.threads = input.threads;
+    const std::string filename = input.filename;
+
+    // Open compressed video file; the reader is chosen by file extension.
+    if (HasSuffix(filename, "ivf")) {
+      video = new libvpx_test::IVFVideoSource(filename);
+    } else if (HasSuffix(filename, "webm")) {
+#if CONFIG_WEBM_IO
+      video = new libvpx_test::WebMVideoSource(filename);
+#else
+      fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+              filename.c_str());
+      return;
+#endif
+    }
+    ASSERT_TRUE(video != NULL) << "Unsupported file type: " << filename;
+    video->Init();
+
+    // "<filename>.res" lists one expected integer result per decoded frame.
+    OpenResFile(filename + ".res");
+    if (res_file_ != NULL) { EXPECT_NO_FATAL_FAILURE(RunLoop(video, cfg)); }
+    delete video;  // reached on every path once |video| has been allocated
+  }
+
+ private:
+  // True when |s| ends with |tail|; safe for strings shorter than |tail|.
+  static bool HasSuffix(const std::string &s, const std::string &tail) {
+    return s.size() >= tail.size() &&
+           s.compare(s.size() - tail.size(), tail.size(), tail) == 0;
+  }
+  FILE *res_file_;
+};
+
+TEST_P(InvalidFileTest, ReturnCode) {
+  RunTest();
+}
+// VP9 vectors decoded with threads == 1 (the strict EXPECT_EQ comparison).
+const DecodeParam kVP9InvalidFileTests[] = {
+  {1, "invalid-vp90-02-v2.webm"},
+#if CONFIG_VP9_HIGHBITDEPTH
+  {1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"},
+#endif
+  {1, "invalid-vp90-03-v3.webm"},
+  {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf"},
+  {1, "invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf"},
+  {1, "invalid-vp91-2-mixedrefcsp-444to420.ivf"},
+  {1, "invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf"},
+};
+
+VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
+                          ::testing::ValuesIn(kVP9InvalidFileTests));
+
+// Variant for test vectors that are expected to fail peek: HandlePeekResult()
+// is overridden with an empty body, so the peek result is not checked.
+class InvalidFileInvalidPeekTest : public InvalidFileTest {
+ protected:
+  InvalidFileInvalidPeekTest() : InvalidFileTest() {}
+  virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/,
+                                libvpx_test::CompressedVideoSource* /*video*/,
+                                const vpx_codec_err_t /*res_peek*/) {}
+};
+
+TEST_P(InvalidFileInvalidPeekTest, ReturnCode) {
+  RunTest();
+}
+// Vectors run through the fixture that ignores the peek result.
+const DecodeParam kVP9InvalidFileInvalidPeekTests[] = {
+  {1, "invalid-vp90-01-v3.webm"},
+};
+
+VP9_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest,
+                          ::testing::ValuesIn(kVP9InvalidFileInvalidPeekTests));
+
+const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
+  {4, "invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm"},
+  {4, "invalid-"
+      "vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf"},
+  {4, "invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf"},
+  {2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf"},
+  {4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf"},
+};
+// threads > 1 selects the relaxed comparison in HandleDecodeResult().
+INSTANTIATE_TEST_CASE_P(
+    VP9MultiThreaded, InvalidFileTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory*>(&libvpx_test::kVP9)),
+        ::testing::ValuesIn(kMultiThreadedVP9InvalidFileTests)));
+}  // namespace
diff --git a/libs/libvpx/test/ivf_video_source.h b/libs/libvpx/test/ivf_video_source.h
new file mode 100644
index 0000000000..824a39d7e6
--- /dev/null
+++ b/libs/libvpx/test/ivf_video_source.h
@@ -0,0 +1,111 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_IVF_VIDEO_SOURCE_H_
+#define TEST_IVF_VIDEO_SOURCE_H_
+#include <cstdio>
+#include <cstdlib>
+#include <new>
+#include <string>
+#include "test/video_source.h"
+
+namespace libvpx_test {
+const unsigned int kCodeBufferSize = 256 * 1024;
+const unsigned int kIvfFileHdrSize = 32;
+const unsigned int kIvfFrameHdrSize = 12;
+static unsigned int MemGetLe32(const uint8_t *mem) {
+  // Widen the top byte first: as a promoted int, "mem[3] << 24" overflows
+  // (undefined behaviour) whenever mem[3] >= 0x80.
+  return (static_cast<unsigned int>(mem[3]) << 24) | (mem[2] << 16) |
+         (mem[1] << 8) | (mem[0]);
+}
+
+// This class extends CompressedVideoSource to allow parsing of ivf files,
+// so that we can do actual file decodes.
+class IVFVideoSource : public CompressedVideoSource {
+ public:
+  explicit IVFVideoSource(const std::string &file_name)
+      : file_name_(file_name),
+        input_file_(NULL),
+        compressed_frame_buf_(NULL),
+        frame_sz_(0),
+        frame_(0),
+        end_of_file_(false) {
+  }
+
+  virtual ~IVFVideoSource() {
+    delete[] compressed_frame_buf_;
+
+    if (input_file_)
+      fclose(input_file_);
+  }
+  // Allocates the frame buffer that FillFrame() reads into.
+  virtual void Init() {
+    // nothrow new returns NULL on failure, so the ASSERT below is reachable.
+    compressed_frame_buf_ = new (std::nothrow) uint8_t[kCodeBufferSize];
+    ASSERT_TRUE(compressed_frame_buf_ != NULL)
+        << "Allocate frame buffer failed";
+  }
+  // Opens the file, checks the "DKIF" signature and loads the first frame.
+  virtual void Begin() {
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
+        << file_name_;
+
+    // Read file header
+    uint8_t file_hdr[kIvfFileHdrSize];
+    ASSERT_EQ(kIvfFileHdrSize, fread(file_hdr, 1, kIvfFileHdrSize, input_file_))
+        << "File header read failed.";
+    // Check file header
+    ASSERT_TRUE(file_hdr[0] == 'D' && file_hdr[1] == 'K' && file_hdr[2] == 'I'
+                && file_hdr[3] == 'F') << "Input is not an IVF file.";
+
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+  // Reads one frame header plus payload; a short header read marks the end.
+  void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
+    uint8_t frame_hdr[kIvfFrameHdrSize];
+    // Check frame header and read a frame from input_file.
+    if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_)
+        != kIvfFrameHdrSize) {
+      end_of_file_ = true;
+    } else {
+      end_of_file_ = false;
+
+      frame_sz_ = MemGetLe32(frame_hdr);
+      ASSERT_LE(frame_sz_, kCodeBufferSize)
+          << "Frame is too big for allocated code buffer";
+      ASSERT_EQ(frame_sz_,
+                fread(compressed_frame_buf_, 1, frame_sz_, input_file_))
+          << "Failed to read complete frame";
+    }
+  }
+  // Returns NULL once the end of the file has been reached.
+  virtual const uint8_t *cxdata() const {
+    return end_of_file_ ? NULL : compressed_frame_buf_;
+  }
+  virtual size_t frame_size() const { return frame_sz_; }
+  virtual unsigned int frame_number() const { return frame_; }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  uint8_t *compressed_frame_buf_;
+  size_t frame_sz_;
+  unsigned int frame_;
+  bool end_of_file_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_IVF_VIDEO_SOURCE_H_
diff --git a/libs/libvpx/test/keyframe_test.cc b/libs/libvpx/test/keyframe_test.cc
new file mode 100644
index 0000000000..d8b21a14d2
--- /dev/null
+++ b/libs/libvpx/test/keyframe_test.cc
@@ -0,0 +1,145 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class KeyframeTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  KeyframeTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~KeyframeTest() {}
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    kf_do_force_kf_ = false;
+    set_cpu_used_ = 0;
+    kf_count_ = 0;
+    kf_count_max_ = INT_MAX;
+  }
+  // Optional per-frame tweaks: forced keyframes and a cpu-used override.
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (kf_do_force_kf_) {
+      frame_flags_ = (video->frame() % 3 == 0) ? VPX_EFLAG_FORCE_KF : 0;
+    }
+    if (set_cpu_used_ != 0 && video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+    }
+  }
+  // Records the pts of every keyframe packet and enforces kf_count_max_.
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (!(pkt->data.frame.flags & VPX_FRAME_IS_KEY)) return;
+    kf_pts_list_.push_back(pkt->data.frame.pts);
+    ++kf_count_;
+    abort_ |= kf_count_ > kf_count_max_;
+  }
+
+  bool kf_do_force_kf_;
+  int kf_count_;
+  int kf_count_max_;
+  std::vector<vpx_codec_pts_t> kf_pts_list_;
+  int set_cpu_used_;
+};
+
+TEST_P(KeyframeTest, TestRandomVideoSource) {
+  // Validate that encoding the RandomVideoSource produces multiple keyframes.
+  // This validates the results of the TestDisableKeyframes test.
+  kf_count_max_ = 2;  // early exit successful tests.
+
+  ::libvpx_test::RandomVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // In realtime mode auto-placed keyframes are exceedingly rare, so the count
+  // is only checked when GET_PARAM(1) > 0.
+  if (GET_PARAM(1) > 0)
+    EXPECT_GT(kf_count_, 1);
+}
+
+TEST_P(KeyframeTest, TestDisableKeyframes) {
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  kf_count_max_ = 1;  // early exit failed tests.
+
+  ::libvpx_test::RandomVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // With kf_mode set to VPX_KF_DISABLED exactly one keyframe is expected.
+  EXPECT_EQ(1, kf_count_);
+}
+
+TEST_P(KeyframeTest, TestForceKeyframe) {
+  cfg_.kf_mode = VPX_KF_DISABLED;
+  kf_do_force_kf_ = true;
+
+  ::libvpx_test::DummyVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Verify that every recorded keyframe pts is a multiple of 3.
+  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
+       iter != kf_pts_list_.end(); ++iter) {
+    ASSERT_EQ(0, *iter % 3) << "Unexpected keyframe at frame " << *iter;
+  }
+}
+
+TEST_P(KeyframeTest, TestKeyframeMaxDistance) {
+  cfg_.kf_max_dist = 25;
+
+  ::libvpx_test::DummyVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Verify that every recorded keyframe pts is a multiple of kf_max_dist (25).
+  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
+       iter != kf_pts_list_.end(); ++iter) {
+    ASSERT_EQ(0, *iter % 25) << "Unexpected keyframe at frame " << *iter;
+  }
+}
+
+TEST_P(KeyframeTest, TestAutoKeyframe) {
+  cfg_.kf_mode = VPX_KF_AUTO;
+  kf_do_force_kf_ = false;
+
+  // Force a deterministic speed step in Real Time mode, as the faster modes
+  // may not produce a keyframe like we expect. This is necessary when running
+  // on very slow environments (like Valgrind). The step -11 was determined
+  // experimentally as the fastest mode that still throws the keyframe.
+  if (deadline_ == VPX_DL_REALTIME)
+    set_cpu_used_ = -11;
+
+  // This clip has a cut scene every 30 frames -> Frame 0, 30, 60, 90, 120.
+  // Only the first 40 frames are encoded, to make sure there's a keyframe at
+  // frame 0 and 30.
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // In realtime mode auto-placed keyframes are exceedingly rare, so the count
+  // is only checked when GET_PARAM(1) > 0.
+  if (GET_PARAM(1) > 0)
+    EXPECT_EQ(2u, kf_pts_list_.size()) << " Not the right number of keyframes ";
+  // Verify that every keyframe pts is a multiple of 30; with the realtime
+  // deadline a keyframe after frame 0 must instead sit one frame past that.
+  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
+       iter != kf_pts_list_.end(); ++iter) {
+    if (deadline_ == VPX_DL_REALTIME && *iter > 0)
+      EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame "
+        << *iter;
+    else
+      EXPECT_EQ(0, *iter % 30) << "Unexpected keyframe at frame " << *iter;
+  }
+}
+
+VP8_INSTANTIATE_TEST_CASE(KeyframeTest, ALL_TEST_MODES);
+}  // namespace
diff --git a/libs/libvpx/test/lpf_8_test.cc b/libs/libvpx/test/lpf_8_test.cc
new file mode 100644
index 0000000000..0bf6b0c232
--- /dev/null
+++ b/libs/libvpx/test/lpf_8_test.cc
@@ -0,0 +1,718 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+// Horizontally and Vertically need 32x32: 8  Coeffs preceding filtered section
+//                                         16 Coeffs within filtered section
+//                                         8  Coeffs following filtered section
+const int kNumCoeffs = 1024;
+
+const int number_of_iterations = 10000;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*loop_op_t)(uint16_t *s, int p, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh,
+                          int count, int bd);
+typedef void (*dual_loop_op_t)(uint16_t *s, int p, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1, int bd);
+#else  // uint8_t samples and no trailing bit-depth argument
+typedef void (*loop_op_t)(uint8_t *s, int p, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh,
+                          int count);
+typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+// Test parameters: (filter under test, reference filter, bit depth[, count]).
+typedef std::tr1::tuple<loop_op_t, loop_op_t, int, int> loop8_param_t;
+typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH  // loop_op_t adapters; |count| is not forwarded
+void wrapper_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh,
+                              int count, int bd) {
+  vpx_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count, int bd) {
+  vpx_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count, int bd) {
+  vpx_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);
+}
+
+void wrapper_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh,
+                                int count, int bd) {
+  vpx_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);
+}
+#else  // 8-bit adapters with the same shape; |count| is not forwarded
+void wrapper_vertical_16_sse2(uint8_t *s, int p, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh,
+                              int count) {
+  vpx_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count) {
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_sse2(uint8_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count) {
+  vpx_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh,
+                                int count) {
+  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON_ASM
+#if CONFIG_VP9_HIGHBITDEPTH
+// No neon high bitdepth functions.
+#else  // 8-bit loop_op_t adapters; |count| is not forwarded
+void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh,
+                              int count) {
+  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count) {
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count) {
+  vpx_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh,
+                                int count) {
+  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON_ASM
+
+#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
+void wrapper_vertical_16_msa(uint8_t *s, int p, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh,
+                             int count) {
+  vpx_lpf_vertical_16_msa(s, p, blimit, limit, thresh);
+}
+// Reference counterpart with the same loop_op_t shape; |count| is not forwarded.
+void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int count) {
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+}
+#endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
+
+class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
+ public:
+  virtual ~Loop8Test6Param() {}
+  virtual void SetUp() {
+    loopfilter_op_ = GET_PARAM(0);
+    ref_loopfilter_op_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    count_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int bit_depth_;                // tuple element 2
+  int count_;                    // tuple element 3, passed as |count|
+  int mask_;                     // (1 << bit_depth_) - 1, applied to samples
+  loop_op_t loopfilter_op_;      // tuple element 0: filter under test
+  loop_op_t ref_loopfilter_op_;  // tuple element 1: reference filter
+};
+
+class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
+ public:
+  virtual ~Loop8Test9Param() {}
+  virtual void SetUp() {
+    loopfilter_op_ = GET_PARAM(0);
+    ref_loopfilter_op_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int bit_depth_;                     // tuple element 2
+  int mask_;                          // (1 << bit_depth_) - 1
+  dual_loop_op_t loopfilter_op_;      // tuple element 0: filter under test
+  dual_loop_op_t ref_loopfilter_op_;  // tuple element 1: reference filter
+};
+
+TEST_P(Loop8Test6Param, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int32_t bd = bit_depth_;
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);
+#else
+  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);
+  DECLARE_ALIGNED(8, uint8_t, ref_s[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;
+    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    DECLARE_ALIGNED(16, const uint8_t, limit[16])  = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = {
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs/32;
+    // Fill tmp_s with random values mixed with runs that step by (limit - 1).
+    uint16_t tmp_s[kNumCoeffs];
+    int j = 0;
+    while (j < kNumCoeffs) {
+      uint8_t val = rnd.Rand8();
+      if (val & 0x80) {  // 50% chance to choose a new value.
+        tmp_s[j] = rnd.Rand16();
+        j++;
+      } else {  // 50% chance to repeat previous value in row X times
+        int k = 0;
+        while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+          if (j < 1) {
+            tmp_s[j] = rnd.Rand16();
+          } else if (val & 0x20) {  // Increment by a value within the limit
+            tmp_s[j] = (tmp_s[j - 1] + (*limit - 1));
+          } else {  // Decrement by a value within the limit
+            tmp_s[j] = (tmp_s[j - 1] - (*limit - 1));
+          }
+          j++;
+        }
+      }
+    }
+    for (j = 0; j < kNumCoeffs; j++) {
+      if (i % 2) {
+        s[j] = tmp_s[j] & mask_;
+      } else {  // even iterations read tmp_s transposed
+        s[j] = tmp_s[p * (j % p) + j / p] & mask_;
+      }
+      ref_s[j] = s[j];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
+#else
+    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    // Count every position where the two outputs differ.
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test6Param, C output doesn't match SSE2 "
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test6Param, ValueCheck) {  // loopfilter_op_ vs ref_loopfilter_op_ on random pixels.
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32_t bd = bit_depth_;
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);  // Filtered by loopfilter_op_.
+  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);  // Filtered by ref_loopfilter_op_.
+#else
+  DECLARE_ALIGNED(8, uint8_t, s[kNumCoeffs]);  // Filtered by loopfilter_op_.
+  DECLARE_ALIGNED(8, uint8_t, ref_s[kNumCoeffs]);  // Filtered by ref_loopfilter_op_.
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;  // Differing elements summed over all iterations.
+  int first_failure = -1;  // First iteration that showed a mismatch; -1 if none.
+
+  // NOTE: The code in vp9_loopfilter.c:update_sharpness computes mblim as a
+  // function of sharpness_lvl and the loopfilter lvl as:
+  // block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
+  // ...
+  // memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
+  //        SIMD_WIDTH);
+  // This means that the largest value for mblim will occur when sharpness_lvl
+  // is equal to 0, and lvl is equal to its greatest value (MAX_LOOP_FILTER).
+  // In this case block_inside_limit will be equal to MAX_LOOP_FILTER and
+  // therefore mblim will be equal to (2 * (lvl + 2) + block_inside_limit) =
+  // 2 * (MAX_LOOP_FILTER + 2) + MAX_LOOP_FILTER = 3 * MAX_LOOP_FILTER + 4
+
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;  // Differing elements in this iteration only.
+    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));  // Argument is the largest mblim from the NOTE above.
+    DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {  // One value replicated 16 times.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    DECLARE_ALIGNED(16, const uint8_t, limit[16])  = {  // One value replicated 16 times.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = {  // One value replicated 16 times.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;  // Second argument to both filters; presumably the row stride - confirm.
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      s[j] = rnd.Rand16() & mask_;  // mask_ is set outside this test body.
+      ref_s[j] = s[j];  // Both filters start from identical input.
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
+#else
+    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int j = 0; j < kNumCoeffs; ++j) {  // Whole buffers are compared, not just the filtered area.
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {  // Record only the first failing iteration.
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test6Param, C output doesn't match SSE2 "
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test9Param, OperationCheck) {  // Dual-parameter filters compared on run-structured pixel data.
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32_t bd = bit_depth_;
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);  // Filtered by loopfilter_op_.
+  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);  // Filtered by ref_loopfilter_op_.
+#else
+  DECLARE_ALIGNED(8,  uint8_t,  s[kNumCoeffs]);  // Filtered by loopfilter_op_.
+  DECLARE_ALIGNED(8,  uint8_t,  ref_s[kNumCoeffs]);  // Filtered by ref_loopfilter_op_.
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;  // Differing elements summed over all iterations.
+  int first_failure = -1;  // First iteration that showed a mismatch; -1 if none.
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;  // Differing elements in this iteration only.
+    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {  // First parameter set: blimit.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    DECLARE_ALIGNED(16, const uint8_t, limit0[16])  = {  // First parameter set: limit.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = {  // First parameter set: thresh.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {  // Second parameter set: blimit.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    DECLARE_ALIGNED(16, const uint8_t, limit1[16])  = {  // Second parameter set: limit.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = {  // Second parameter set: thresh.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;  // Second argument to both filters; presumably the row stride - confirm.
+    uint16_t tmp_s[kNumCoeffs];  // Staging buffer for the generated sample sequence.
+    int j = 0;
+    const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1;  // Smaller of the two limits sets the step used below.
+    while (j < kNumCoeffs) {
+      uint8_t val = rnd.Rand8();  // Bit 7 picks the branch, bit 5 the direction, bits 0-4 the run length.
+      if (val & 0x80) {  // 50% chance to choose a new value.
+        tmp_s[j] = rnd.Rand16();
+        j++;
+      } else {  // 50% chance to repeat previous value in row X times.
+        int k = 0;
+        while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {  // Run of 1..32 entries, clipped at the buffer end.
+          if (j < 1) {  // No previous entry to step from yet.
+            tmp_s[j] = rnd.Rand16();
+          } else if (val & 0x20) {  // Increment by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] + (limit - 1));
+          } else {  // Decrement by a value within the limit.
+            tmp_s[j] = (tmp_s[j - 1] - (limit - 1));
+          }
+          j++;
+        }
+      }
+    }
+    for (j = 0; j < kNumCoeffs; j++) {
+      if (i % 2) {  // Odd iterations copy tmp_s in order.
+        s[j] = tmp_s[j] & mask_;
+      } else {  // Even iterations swap the roles of j % p and j / p (a transpose if the data is p x p).
+        s[j] = tmp_s[p * (j % p) + j / p] & mask_;
+      }
+      ref_s[j] = s[j];  // Both filters start from identical input.
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int j = 0; j < kNumCoeffs; ++j) {  // Whole buffers are compared, not just the filtered area.
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {  // Record only the first failing iteration.
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test9Param, C output doesn't match SSE2 "
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(Loop8Test9Param, ValueCheck) {  // Dual-parameter filters compared on fully random pixels.
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = number_of_iterations;
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]);  // Filtered by loopfilter_op_.
+  DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]);  // Filtered by ref_loopfilter_op_.
+#else
+  DECLARE_ALIGNED(8,  uint8_t, s[kNumCoeffs]);  // Filtered by loopfilter_op_.
+  DECLARE_ALIGNED(8,  uint8_t, ref_s[kNumCoeffs]);  // Filtered by ref_loopfilter_op_.
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int err_count_total = 0;  // Differing elements summed over all iterations.
+  int first_failure = -1;  // First iteration that showed a mismatch; -1 if none.
+  for (int i = 0; i < count_test_block; ++i) {
+    int err_count = 0;  // Differing elements in this iteration only.
+    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {  // First parameter set: blimit.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    DECLARE_ALIGNED(16, const uint8_t, limit0[16])  = {  // First parameter set: limit.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = {  // First parameter set: thresh.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {  // Second parameter set: blimit.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    DECLARE_ALIGNED(16, const uint8_t, limit1[16])  = {  // Second parameter set: limit.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    tmp = rnd.Rand8();
+    DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = {  // Second parameter set: thresh.
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
+        tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
+    };
+    int32_t p = kNumCoeffs / 32;  // TODO(pdlf) can we have non-square here?
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      s[j] = rnd.Rand16() & mask_;  // mask_ is set outside this test body.
+      ref_s[j] = s[j];  // Both filters start from identical input.
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    const int32_t bd = bit_depth_;
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1, bd);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0,
+                       thresh0, blimit1, limit1, thresh1, bd));
+#else
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1);
+    ASM_REGISTER_STATE_CHECK(
+        loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0, thresh0,
+                       blimit1, limit1, thresh1));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int j = 0; j < kNumCoeffs; ++j) {  // Whole buffers are compared, not just the filtered area.
+      err_count += ref_s[j] != s[j];
+    }
+    if (err_count && !err_count_total) {  // Record only the first failing iteration.
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Loop8Test9Param, C output doesn't match SSE2 "
+         "loopfilter output. "
+      << "First failed at test case " << first_failure;
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Loop8Test6Param,  // Tuples appear to be (tested fn, reference fn, bit depth, count); fixture not shown here - confirm.
+    ::testing::Values(
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_sse2,
+                   &wrapper_vertical_16_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 10, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 10, 1),
+        make_tuple(&wrapper_vertical_16_sse2,
+                   &wrapper_vertical_16_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 12, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 12, 1),
+        make_tuple(&wrapper_vertical_16_sse2,
+                   &wrapper_vertical_16_c, 12, 1),
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 10, 1),
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 12, 1)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Loop8Test6Param,  // Non-high-bitdepth build: every entry uses 8 as the third element.
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_dual_sse2,
+                   &wrapper_vertical_16_dual_c, 8, 1)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2 && (!CONFIG_VP9_HIGHBITDEPTH)
+INSTANTIATE_TEST_CASE_P(
+    AVX2, Loop8Test6Param,  // Only the horizontal_16 filter is listed for AVX2, with last element 1 and 2.
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8,
+                   2)));
+#endif  // HAVE_AVX2 && (!CONFIG_VP9_HIGHBITDEPTH)
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Loop8Test9Param,  // Tuples appear to be (tested fn, reference fn, bit depth); fixture not shown here - confirm.
+    ::testing::Values(
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Loop8Test9Param,  // Non-high-bitdepth build: every entry uses 8 as the third element.
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_dual_sse2,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_sse2,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_sse2,
+                   &vpx_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_sse2,
+                   &vpx_lpf_vertical_8_dual_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+// No neon high bitdepth functions.
+#else
+INSTANTIATE_TEST_CASE_P(
+    NEON, Loop8Test6Param,  // Tuples appear to be (tested fn, reference fn, bit depth, count); fixture not shown here - confirm.
+    ::testing::Values(
+#if HAVE_NEON_ASM
+// Using #if inside the macro is unsupported on MSVS but the tests are not
+// currently built for MSVS with ARM and NEON.
+        make_tuple(&vpx_lpf_horizontal_16_neon,
+                   &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_neon,
+                   &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&wrapper_vertical_16_neon,
+                   &wrapper_vertical_16_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_dual_neon,
+                   &wrapper_vertical_16_dual_c, 8, 1),
+#endif  // HAVE_NEON_ASM
+        make_tuple(&vpx_lpf_horizontal_8_neon,  // Entries from here on are listed regardless of HAVE_NEON_ASM.
+                   &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_vertical_8_neon,
+                   &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_4_neon,
+                   &vpx_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&vpx_lpf_vertical_4_neon,
+                   &vpx_lpf_vertical_4_c, 8, 1)));
+INSTANTIATE_TEST_CASE_P(
+    NEON, Loop8Test9Param,  // Tuples appear to be (tested fn, reference fn, bit depth) - confirm against the fixture.
+    ::testing::Values(
+#if HAVE_NEON_ASM
+        make_tuple(&vpx_lpf_horizontal_8_dual_neon,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_neon,
+                   &vpx_lpf_vertical_8_dual_c, 8),
+#endif  // HAVE_NEON_ASM
+        make_tuple(&vpx_lpf_horizontal_4_dual_neon,  // Entries from here on are listed regardless of HAVE_NEON_ASM.
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_neon,
+                   &vpx_lpf_vertical_4_dual_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON
+
+#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
+INSTANTIATE_TEST_CASE_P(
+    MSA, Loop8Test6Param,  // Tuples appear to be (tested fn, reference fn, bit depth, count); fixture not shown here - confirm.
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&wrapper_vertical_16_msa, &wrapper_vertical_16_c, 8, 1)));
+
+INSTANTIATE_TEST_CASE_P(
+    MSA, Loop8Test9Param,  // Tuples appear to be (tested fn, reference fn, bit depth) - confirm against the fixture.
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_dual_msa,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_msa,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_msa,
+                   &vpx_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_msa,
+                   &vpx_lpf_vertical_8_dual_c, 8)));
+#endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
+
+}  // namespace
diff --git a/libs/libvpx/test/md5_helper.h b/libs/libvpx/test/md5_helper.h
new file mode 100644
index 0000000000..742cf0b7b3
--- /dev/null
+++ b/libs/libvpx/test/md5_helper.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_MD5_HELPER_H_
+#define TEST_MD5_HELPER_H_
+
+#include "./md5_utils.h"
+#include "vpx/vpx_decoder.h"
+
+namespace libvpx_test {
+class MD5 {  // Accumulates an MD5 digest over image planes and/or raw byte buffers.
+ public:
+  MD5() {
+    MD5Init(&md5_);
+  }
+
+  void Add(const vpx_image_t *img) {  // Feeds the displayed rows of planes 0..2 of img into the digest.
+    for (int plane = 0; plane < 3; ++plane) {
+      const uint8_t *buf = img->planes[plane];
+      // Calculate the width and height to do the md5 check. For the chroma
+      // plane, we never want to round down and thus skip a pixel so if
+      // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
+      // This works only for chroma_shift of 0 and 1.
+      const int bytes_per_sample =
+          (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+      const int h = plane ? (img->d_h + img->y_chroma_shift) >>
+                    img->y_chroma_shift : img->d_h;
+      const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
+                     img->x_chroma_shift : img->d_w) * bytes_per_sample;  // Row length in bytes, not samples.
+
+      for (int y = 0; y < h; ++y) {
+        MD5Update(&md5_, buf, w);  // Hash only the first w bytes of the row.
+        buf += img->stride[plane];  // Step to the next row using the plane's stride.
+      }
+    }
+  }
+
+  void Add(const uint8_t *data, size_t size) {  // Feeds size bytes at data into the digest.
+    MD5Update(&md5_, data, static_cast<uint32_t>(size));  // size is narrowed to 32 bits here.
+  }
+
+  const char *Get(void) {  // Returns the digest so far as 32 lowercase hex characters.
+    static const char hex[16] = {
+      '0', '1', '2', '3', '4', '5', '6', '7',
+      '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+    };
+    uint8_t tmp[16];
+    MD5Context ctx_tmp = md5_;  // Finalize a copy; md5_ itself is left unchanged.
+
+    MD5Final(tmp, &ctx_tmp);
+    for (int i = 0; i < 16; i++) {
+      res_[i * 2 + 0]  = hex[tmp[i] >> 4];  // High nibble first.
+      res_[i * 2 + 1]  = hex[tmp[i] & 0xf];
+    }
+    res_[32] = 0;
+
+    return res_;  // Points into this object; rewritten by the next Get() call.
+  }
+
+ protected:
+  char res_[33];  // 32 hex digits plus the terminating NUL.
+  MD5Context md5_;  // Running digest state.
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_MD5_HELPER_H_
diff --git a/libs/libvpx/test/partial_idct_test.cc b/libs/libvpx/test/partial_idct_test.cc
new file mode 100644
index 0000000000..6c824128b8
--- /dev/null
+++ b/libs/libvpx/test/partial_idct_test.cc
@@ -0,0 +1,343 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_scan.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);  // Forward transform signature.
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);  // Inverse transform signature.
+typedef std::tr1::tuple<FwdTxfmFunc,  // Element 0: forward transform.
+                        InvTxfmFunc,  // Element 1: full inverse transform.
+                        InvTxfmFunc,  // Element 2: partial inverse transform.
+                        TX_SIZE, int> PartialInvTxfmParam;  // Elements 3 and 4: transform size, last_nonzero_.
+const int kMaxNumCoeffs = 1024;  // Size of every scratch buffer in the tests below.
+class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {  // Fixture holding one parameter tuple.
+ public:
+  virtual ~PartialIDctTest() {}
+  virtual void SetUp() {  // Unpacks the parameter tuple into the members below.
+    ftxfm_ = GET_PARAM(0);
+    full_itxfm_ = GET_PARAM(1);
+    partial_itxfm_ = GET_PARAM(2);
+    tx_size_  = GET_PARAM(3);
+    last_nonzero_ = GET_PARAM(4);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int last_nonzero_;  // Count of leading scan positions the tests may fill with coefficients.
+  TX_SIZE tx_size_;  // Selects the block size and the scan table.
+  FwdTxfmFunc ftxfm_;  // Forward transform; only RunQuantCheck calls it.
+  InvTxfmFunc full_itxfm_;  // Full inverse transform; its output is the comparison baseline.
+  InvTxfmFunc partial_itxfm_;  // Partial inverse transform under test.
+};
+
+TEST_P(PartialIDctTest, RunQuantCheck) {  // Full vs. partial inverse transform on quantized forward-transform output.
+  ACMRandom rnd(ACMRandom::DeterministicSeed());  // NOTE(review): never used; the inner declaration below shadows it.
+  int size;  // Block edge length derived from tx_size_.
+  switch (tx_size_) {
+    case TX_4X4:
+      size = 4;
+      break;
+    case TX_8X8:
+      size = 8;
+      break;
+    case TX_16X16:
+      size = 16;
+      break;
+    case TX_32X32:
+      size = 32;
+      break;
+    default:
+      FAIL() << "Wrong Size!";
+      break;
+  }
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);  // Input to both inverse transforms.
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);  // NOTE(review): cleared below but never read in this test.
+  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);  // Output of full_itxfm_.
+  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);  // Output of partial_itxfm_.
+
+  const int count_test_block = 1000;
+  const int block_size = size * size;
+
+  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
+
+  int max_error = 0;  // Largest squared per-pixel difference seen.
+  for (int i = 0; i < count_test_block; ++i) {
+    // Clear the destination and coefficient buffers.
+    memset(dst1, 0, sizeof(*dst1) * block_size);
+    memset(dst2, 0, sizeof(*dst2) * block_size);
+    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
+    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());  // NOTE(review): re-seeded each outer pass, so presumably every pass sees the same data.
+
+    for (int i = 0; i < count_test_block; ++i) {  // NOTE(review): shadows the outer i; each pass overwrites the same coefficients, so only the last pass is tested.
+      // Initialize a test block with input range [-255, 255].
+      if (i == 0) {
+        for (int j = 0; j < block_size; ++j)
+          input_extreme_block[j] = 255;
+      } else if (i == 1) {
+        for (int j = 0; j < block_size; ++j)
+          input_extreme_block[j] = -255;
+      } else {
+        for (int j = 0; j < block_size; ++j) {
+          input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+        }
+      }
+
+      ftxfm_(input_extreme_block, output_ref_block, size);
+
+      // quantization with maximum allowed step sizes
+      test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
+      for (int j = 1; j < last_nonzero_; ++j)
+        test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]]
+                         = (output_ref_block[j] / 1828) * 1828;  // NOTE(review): source is indexed by j, destination by scan[j] - confirm intended.
+    }
+
+    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
+    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));  // Same coefficient buffer as the full transform.
+
+    for (int j = 0; j < block_size; ++j) {
+      const int diff = dst1[j] - dst2[j];
+      const int error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+    }
+  }
+
+  EXPECT_EQ(0, max_error)
+      << "Error: partial inverse transform produces different results";
+}
+
+TEST_P(PartialIDctTest, ResultsMatch) {  // Full vs. partial inverse transform on random sparse coefficients.
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int size;  // Block edge length derived from tx_size_.
+  switch (tx_size_) {
+    case TX_4X4:
+      size = 4;
+      break;
+    case TX_8X8:
+      size = 8;
+      break;
+    case TX_16X16:
+      size = 16;
+      break;
+    case TX_32X32:
+      size = 32;
+      break;
+    default:
+      FAIL() << "Wrong Size!";
+      break;
+  }
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);  // Input to full_itxfm_.
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);  // Copy given to partial_itxfm_.
+  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);  // Output of full_itxfm_.
+  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);  // Output of partial_itxfm_.
+  const int count_test_block = 1000;
+  const int max_coeff = 32766 / 4;
+  const int block_size = size * size;
+  int max_error = 0;  // Largest squared per-pixel difference seen.
+  for (int i = 0; i < count_test_block; ++i) {
+    // Clear the destination and coefficient buffers.
+    memset(dst1, 0, sizeof(*dst1) * block_size);
+    memset(dst2, 0, sizeof(*dst2) * block_size);
+    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
+    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+    int max_energy_leftover = max_coeff * max_coeff;  // Remaining budget for the sum of squared coefficients.
+    for (int j = 0; j < last_nonzero_; ++j) {  // Only the first last_nonzero_ scan positions are filled.
+      int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
+                                          (rnd.Rand16() - 32768) / 65536);  // Signed fraction of the remaining budget's square root.
+      max_energy_leftover -= coef * coef;
+      if (max_energy_leftover < 0) {  // Budget overrun: drop this coefficient and zero the budget.
+        max_energy_leftover = 0;
+        coef = 0;
+      }
+      test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]] = coef;
+    }
+
+    memcpy(test_coef_block2, test_coef_block1,
+           sizeof(*test_coef_block2) * block_size);
+
+    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
+    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
+
+    for (int j = 0; j < block_size; ++j) {
+      const int diff = dst1[j] - dst2[j];
+      const int error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+    }
+  }
+
+  EXPECT_EQ(0, max_error)
+      << "Error: partial inverse transform produces different results";
+}
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    C, PartialIDctTest,  // Tuple order: forward transform, full inverse, partial inverse, size, last_nonzero_.
+    ::testing::Values(
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_34_add_c,
+                   TX_32X32, 34),
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_1_add_c,
+                   TX_32X32, 1),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_10_add_c,
+                   TX_16X16, 10),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_1_add_c,
+                   TX_16X16, 1),
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_12_add_c,
+                   TX_8X8, 12),
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_1_add_c,
+                   TX_8X8, 1),
+        make_tuple(&vpx_fdct4x4_c,
+                   &vpx_idct4x4_16_add_c,
+                   &vpx_idct4x4_1_add_c,
+                   TX_4X4, 1)));
+
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    NEON, PartialIDctTest,  // Tuple order: forward transform, full inverse, partial inverse, size, last_nonzero_.
+    ::testing::Values(
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_1_add_neon,
+                   TX_32X32, 1),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_10_add_neon,
+                   TX_16X16, 10),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_1_add_neon,
+                   TX_16X16, 1),
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_12_add_neon,
+                   TX_8X8, 12),
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_1_add_neon,
+                   TX_8X8, 1),
+        make_tuple(&vpx_fdct4x4_c,
+                   &vpx_idct4x4_16_add_c,
+                   &vpx_idct4x4_1_add_neon,
+                   TX_4X4, 1)));
+#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, PartialIDctTest,  // Tuple order: forward transform, full inverse, partial inverse, size, last_nonzero_.
+    ::testing::Values(
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_34_add_sse2,
+                   TX_32X32, 34),
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_1_add_sse2,
+                   TX_32X32, 1),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_10_add_sse2,
+                   TX_16X16, 10),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_1_add_sse2,
+                   TX_16X16, 1),
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_12_add_sse2,
+                   TX_8X8, 12),
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_1_add_sse2,
+                   TX_8X8, 1),
+        make_tuple(&vpx_fdct4x4_c,
+                   &vpx_idct4x4_16_add_c,
+                   &vpx_idct4x4_1_add_sse2,
+                   TX_4X4, 1)));
+#endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64 && \
+    !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSSE3_64, PartialIDctTest,  // Tuple order: forward transform, full inverse, partial inverse, size, last_nonzero_.
+    ::testing::Values(
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_12_add_ssse3,
+                   TX_8X8, 12)));
+#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64 && ...
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    MSA, PartialIDctTest,  // Tuple order: forward transform, full inverse, partial inverse, size, last_nonzero_.
+    ::testing::Values(
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_34_add_msa,
+                   TX_32X32, 34),
+        make_tuple(&vpx_fdct32x32_c,
+                   &vpx_idct32x32_1024_add_c,
+                   &vpx_idct32x32_1_add_msa,
+                   TX_32X32, 1),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_10_add_msa,
+                   TX_16X16, 10),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vpx_idct16x16_256_add_c,
+                   &vpx_idct16x16_1_add_msa,
+                   TX_16X16, 1),
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_12_add_msa,
+                   TX_8X8, 12),  // 12 matches the kernel name and the other 8x8_12 entries in this file.
+        make_tuple(&vpx_fdct8x8_c,
+                   &vpx_idct8x8_64_add_c,
+                   &vpx_idct8x8_1_add_msa,
+                   TX_8X8, 1),
+        make_tuple(&vpx_fdct4x4_c,
+                   &vpx_idct4x4_16_add_c,
+                   &vpx_idct4x4_1_add_msa,
+                   TX_4X4, 1)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+}  // namespace
diff --git a/libs/libvpx/test/postproc.sh b/libs/libvpx/test/postproc.sh
new file mode 100755
index 0000000000..939a3e7620
--- /dev/null
+++ b/libs/libvpx/test/postproc.sh
@@ -0,0 +1,63 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx postproc example code. To add new tests to this
+##  file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to postproc_tests (on a new line).
+##
+. "$(dirname "$0")/tools_common.sh"
+
+# Environment check: the decoder inputs must be available before any test
+# runs. Both $VP8_IVF_FILE and $VP9_IVF_FILE are required; prints a message
+# and returns 1 when either one is missing.
+postproc_verify_environment() {
+  [ -e "${VP8_IVF_FILE}" ] && [ -e "${VP9_IVF_FILE}" ] && return 0
+  echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+  return 1
+}
+
+# Runs postproc using $1 as input file. $2 is the codec name, and is used
+# solely to name the output file.
+postproc() {
+  local decoder="${LIBVPX_BIN_PATH}/postproc${VPX_TEST_EXE_SUFFIX}"
+  local input_file="$1"
+  local codec="$2"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/postproc_${codec}.raw"
+
+  if [ ! -x "${decoder}" ]; then
+    elog "${decoder} does not exist or is not executable."
+    return 1
+  fi
+  # ${devnull} is not assigned here; presumably tools_common.sh provides it.
+  eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
+      ${devnull}
+  # Fail unless the output file exists at this point.
+  [ -e "${output_file}" ] || return 1
+}
+
+postproc_vp8() {  # VP8 case; skipped unless vp8_decode_available prints "yes".
+  if [ "$(vp8_decode_available)" = "yes" ]; then
+    postproc "${VP8_IVF_FILE}" vp8 || return 1
+  fi
+}
+
+# VP9 case: runs only when CONFIG_VP9_POSTPROC and VP9 decode both report yes.
+postproc_vp9() {
+  if [ "$(vpx_config_option_enabled CONFIG_VP9_POSTPROC)" = "yes" ] && \
+     [ "$(vp9_decode_available)" = "yes" ]; then
+    postproc "${VP9_IVF_FILE}" vp9 || return 1
+  fi
+}
+
+postproc_tests="postproc_vp8
+                postproc_vp9"
+
+run_tests postproc_verify_environment "${postproc_tests}"
diff --git a/libs/libvpx/test/pp_filter_test.cc b/libs/libvpx/test/pp_filter_test.cc
new file mode 100644
index 0000000000..e4688dd8ce
--- /dev/null
+++ b/libs/libvpx/test/pp_filter_test.cc
@@ -0,0 +1,118 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+typedef void (*PostProcFunc)(unsigned char *src_ptr,
+                             unsigned char *dst_ptr,
+                             int src_pixels_per_line,
+                             int dst_pixels_per_line,
+                             int cols,
+                             unsigned char *flimit,
+                             int size);
+
+namespace {
+
+// Fixture parameterized on the post-processing function under test.
+class VP8PostProcessingFilterTest
+    : public ::testing::TestWithParam<PostProcFunc> {
+ public:
+  // Clears the system state once each test case has finished.
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+};
+
+// Test routine for the VP8 post-processing function
+// vp8_post_proc_down_and_across_mb_row_c.
+
+TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) {
+  // Size of the underlying data block that will be filtered.
+  const int block_width  = 16;
+  const int block_height = 16;
+
+  // 5-tap filter needs 2 padding rows above and below the block in the input.
+  const int input_width = block_width;
+  const int input_height = block_height + 4;
+  const int input_stride = input_width;
+  const int input_size = input_width * input_height;
+
+  // Filter extends output block by 8 samples at left and right edges.
+  const int output_width = block_width + 16;
+  const int output_height = block_height;
+  const int output_stride = output_width;
+  const int output_size = output_width * output_height;
+
+  uint8_t *const src_image =
+      reinterpret_cast<uint8_t*>(vpx_calloc(input_size, 1));
+  uint8_t *const dst_image =
+      reinterpret_cast<uint8_t*>(vpx_calloc(output_size, 1));
+  uint8_t *const flimits =
+      reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
+  // Bail out on allocation failure rather than writing through NULL below.
+  ASSERT_TRUE(src_image != NULL && dst_image != NULL && flimits != NULL);
+  (void)memset(flimits, 255, block_width);
+
+  // Pointers to top-left pixel of block in the input and output images.
+  uint8_t *const src_image_ptr = src_image + (input_stride << 1);
+  uint8_t *const dst_image_ptr = dst_image + 8;
+
+  // Initialize pixels in the input:
+  //   block pixels to value 1,
+  //   border pixels to value 10.
+  (void)memset(src_image, 10, input_size);
+  uint8_t *pixel_ptr = src_image_ptr;
+  for (int i = 0; i < block_height; ++i) {
+    (void)memset(pixel_ptr, 1, block_width);
+    pixel_ptr += input_stride;
+  }
+
+  // Initialize pixels in the output to 99.
+  (void)memset(dst_image, 99, output_size);
+
+  ASM_REGISTER_STATE_CHECK(
+      GetParam()(src_image_ptr, dst_image_ptr, input_stride,
+                 output_stride, block_width, flimits, 16));
+
+  static const uint8_t expected_data[block_height] = {
+    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
+  };
+
+  pixel_ptr = dst_image_ptr;
+  for (int i = 0; i < block_height; ++i) {
+    for (int j = 0; j < block_width; ++j) {
+      EXPECT_EQ(expected_data[i], pixel_ptr[j])
+          << "VP8PostProcessingFilterTest failed with invalid filter output";
+    }
+    pixel_ptr += output_stride;
+  }
+
+  vpx_free(src_image);
+  vpx_free(dst_image);
+  vpx_free(flimits);
+}
+
+INSTANTIATE_TEST_CASE_P(C, VP8PostProcessingFilterTest,
+    ::testing::Values(vp8_post_proc_down_and_across_mb_row_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, VP8PostProcessingFilterTest,
+    ::testing::Values(vp8_post_proc_down_and_across_mb_row_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, VP8PostProcessingFilterTest,
+    ::testing::Values(vp8_post_proc_down_and_across_mb_row_msa));
+#endif  // HAVE_MSA
+
+}  // namespace
diff --git a/libs/libvpx/test/quantize_test.cc b/libs/libvpx/test/quantize_test.cc
new file mode 100644
index 0000000000..69da8994ca
--- /dev/null
+++ b/libs/libvpx/test/quantize_test.cc
@@ -0,0 +1,203 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/onyx.h"
+#include "vp8/encoder/block.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/quantize.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+const int kNumBlocks = 25;
+const int kNumBlockEntries = 16;
+
+typedef void (*VP8Quantize)(BLOCK *b, BLOCKD *d);
+
+typedef std::tr1::tuple<VP8Quantize, VP8Quantize> VP8QuantizeParam;
+
+using libvpx_test::ACMRandom;
+using std::tr1::make_tuple;
+
+// Create and populate a VP8_COMP instance which has a complete set of
+// quantization inputs as well as a second MACROBLOCKD for output.
+class QuantizeTestBase {
+ public:
+  // Null the pointers so the destructor never reads indeterminate values.
+  QuantizeTestBase() : vp8_comp_(NULL), macroblockd_dst_(NULL) {}
+  virtual ~QuantizeTestBase() {
+    vp8_remove_compressor(&vp8_comp_);
+    vp8_comp_ = NULL;
+    vpx_free(macroblockd_dst_);
+    macroblockd_dst_ = NULL;
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void SetupCompressor() {
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    // The full configuration is necessary to generate the quantization tables.
+    VP8_CONFIG vp8_config;
+    memset(&vp8_config, 0, sizeof(vp8_config));
+    vp8_comp_ = vp8_create_compressor(&vp8_config);
+    ASSERT_TRUE(vp8_comp_ != NULL) << "vp8_create_compressor() failed";
+
+    // Set the tables based on a quantizer of 0.
+    vp8_set_quantizer(vp8_comp_, 0);
+    // Set up all the block/blockd pointers for the mb in vp8_comp_.
+    vp8cx_frame_init_quantizer(vp8_comp_);
+
+    // Copy macroblockd from the reference to get pre-set-up dequant values.
+    macroblockd_dst_ = reinterpret_cast<MACROBLOCKD *>(
+        vpx_memalign(32, sizeof(*macroblockd_dst_)));
+    memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_));
+    // Fix block pointers - currently they point to the blocks in the reference
+    // structure.
+    vp8_setup_block_dptrs(macroblockd_dst_);
+  }
+
+  void UpdateQuantizer(int q) {
+    vp8_set_quantizer(vp8_comp_, q);
+
+    memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_));
+    vp8_setup_block_dptrs(macroblockd_dst_);
+  }
+
+  void FillCoeffConstant(int16_t c) {
+    for (int i = 0; i < kNumBlocks * kNumBlockEntries; ++i) {
+      vp8_comp_->mb.coeff[i] = c;
+    }
+  }
+
+  void FillCoeffRandom() {
+    for (int i = 0; i < kNumBlocks * kNumBlockEntries; ++i) {
+      vp8_comp_->mb.coeff[i] = rnd_.Rand8();
+    }
+  }
+
+  void CheckOutput() {
+    EXPECT_EQ(0, memcmp(vp8_comp_->mb.e_mbd.qcoeff, macroblockd_dst_->qcoeff,
+                        sizeof(*macroblockd_dst_->qcoeff) * kNumBlocks *
+                            kNumBlockEntries))
+        << "qcoeff mismatch";
+    EXPECT_EQ(0, memcmp(vp8_comp_->mb.e_mbd.dqcoeff, macroblockd_dst_->dqcoeff,
+                        sizeof(*macroblockd_dst_->dqcoeff) * kNumBlocks *
+                            kNumBlockEntries))
+        << "dqcoeff mismatch";
+    EXPECT_EQ(0, memcmp(vp8_comp_->mb.e_mbd.eobs, macroblockd_dst_->eobs,
+                        sizeof(*macroblockd_dst_->eobs) * kNumBlocks))
+        << "eobs mismatch";
+  }
+
+  VP8_COMP *vp8_comp_;
+  MACROBLOCKD *macroblockd_dst_;
+
+ private:
+  ACMRandom rnd_;
+};
+
+class QuantizeTest : public QuantizeTestBase,
+                     public ::testing::TestWithParam<VP8QuantizeParam> {
+ protected:
+  virtual void SetUp() {
+    SetupCompressor();
+    asm_quant_ = GET_PARAM(0);  // first tuple element
+    c_quant_ = GET_PARAM(1);    // second tuple element
+  }
+  // Runs both quantizers on every block (into separate BLOCKDs) and compares.
+  void RunComparison() {
+    for (int i = 0; i < kNumBlocks; ++i) {
+      ASM_REGISTER_STATE_CHECK(
+          c_quant_(&vp8_comp_->mb.block[i], &vp8_comp_->mb.e_mbd.block[i]));
+      ASM_REGISTER_STATE_CHECK(
+          asm_quant_(&vp8_comp_->mb.block[i], &macroblockd_dst_->block[i]));
+    }
+
+    CheckOutput();
+  }
+
+ private:
+  VP8Quantize asm_quant_;
+  VP8Quantize c_quant_;
+};
+
+TEST_P(QuantizeTest, TestZeroInput) {
+  FillCoeffConstant(0);
+  RunComparison();
+}
+
+TEST_P(QuantizeTest, TestLargeNegativeInput) {
+  FillCoeffConstant(0);
+  // Generate a qcoeff which contains 512/-512 (0x0200/0xFE00) to catch issues
+  // like BUG=883 where the constant being compared was incorrectly initialized.
+  vp8_comp_->mb.coeff[0] = -8191;
+  RunComparison();
+}
+
+TEST_P(QuantizeTest, TestRandomInput) {
+  FillCoeffRandom();
+  RunComparison();
+}
+
+TEST_P(QuantizeTest, TestMultipleQ) {
+  for (int q = 0; q < QINDEX_RANGE; ++q) {
+    UpdateQuantizer(q);
+    FillCoeffRandom();
+    RunComparison();
+  }
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vp8_fast_quantize_b_sse2, &vp8_fast_quantize_b_c),
+        make_tuple(&vp8_regular_quantize_b_sse2, &vp8_regular_quantize_b_c)));
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, QuantizeTest,
+                        ::testing::Values(make_tuple(&vp8_fast_quantize_b_ssse3,
+                                                     &vp8_fast_quantize_b_c)));
+#endif  // HAVE_SSSE3
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, QuantizeTest,
+    ::testing::Values(make_tuple(&vp8_regular_quantize_b_sse4_1,
+                                 &vp8_regular_quantize_b_c)));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, QuantizeTest,
+                        ::testing::Values(make_tuple(&vp8_fast_quantize_b_neon,
+                                                     &vp8_fast_quantize_b_c)));
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+    MSA, QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c),
+        make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c)));
+#endif  // HAVE_MSA
+}  // namespace
diff --git a/libs/libvpx/test/register_state_check.h b/libs/libvpx/test/register_state_check.h
new file mode 100644
index 0000000000..489c419424
--- /dev/null
+++ b/libs/libvpx/test/register_state_check.h
@@ -0,0 +1,198 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_REGISTER_STATE_CHECK_H_
+#define TEST_REGISTER_STATE_CHECK_H_
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+// ASM_REGISTER_STATE_CHECK(asm_function)
+//   Minimally validates the environment pre & post function execution. This
+//   variant should be used with assembly functions which are not expected to
+//   fully restore the system state. See platform implementations of
+//   RegisterStateCheck for details.
+//
+// API_REGISTER_STATE_CHECK(api_function)
+//   Performs all the checks done by ASM_REGISTER_STATE_CHECK() and any
+//   additional checks to ensure the environment is in a consistent state pre &
+//   post function execution. This variant should be used with API functions.
+//   See platform implementations of RegisterStateCheckXXX for details.
+//
+
+#if defined(_WIN64)
+
+#undef NOMINMAX
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <winnt.h>
+
+namespace testing {
+namespace internal {
+
+inline bool operator==(const M128A& lhs, const M128A& rhs) {
+  return (lhs.Low == rhs.Low && lhs.High == rhs.High);
+}
+
+}  // namespace internal
+}  // namespace testing
+
+namespace libvpx_test {
+
+// Snapshots xmm[6-15] when constructed and, when destroyed, expects them to
+// hold the same values. These registers should be preserved by the callee on
+// Windows x64.
+class RegisterStateCheck {
+ public:
+  RegisterStateCheck() { initialized_ = StoreRegisters(&pre_context_); }
+  ~RegisterStateCheck() { EXPECT_TRUE(Check()); }
+
+ private:
+  // Reads the calling thread's floating-point context into |context|.
+  // Returns whether GetThreadContext() succeeded.
+  static bool StoreRegisters(CONTEXT* const context) {
+    const HANDLE this_thread = GetCurrentThread();
+    EXPECT_TRUE(this_thread != NULL);
+    context->ContextFlags = CONTEXT_FLOATING_POINT;
+    const bool context_saved = GetThreadContext(this_thread, context) == TRUE;
+    EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError();
+    return context_saved;
+  }
+
+  // Compares the register state. Returns true if the states match.
+  bool Check() const {
+    CONTEXT post_context;
+    if (!initialized_ || !StoreRegisters(&post_context)) return false;
+
+    // Step through the ten M128A slots that start at Xmm6 in both contexts.
+    const M128A* xmm_pre = &pre_context_.Xmm6;
+    const M128A* xmm_post = &post_context.Xmm6;
+    for (int i = 6; i <= 15; ++i, ++xmm_pre, ++xmm_post) {
+      EXPECT_EQ(*xmm_pre, *xmm_post) << "xmm" << i << " has been modified!";
+    }
+    return !testing::Test::HasNonfatalFailure();
+  }
+
+  bool initialized_;
+  CONTEXT pre_context_;
+};
+
+#define ASM_REGISTER_STATE_CHECK(statement) do {  \
+  libvpx_test::RegisterStateCheck reg_check;      \
+  statement;                                      \
+} while (false)
+
+}  // namespace libvpx_test
+
+#elif defined(CONFIG_SHARED) && defined(HAVE_NEON_ASM) && defined(CONFIG_VP9) \
+      && !CONFIG_SHARED && HAVE_NEON_ASM && CONFIG_VP9
+
+extern "C" {
+// Save the d8-d15 registers into store.
+void vpx_push_neon(int64_t *store);
+}
+
+namespace libvpx_test {
+
+// Compares the state of d8-d15 at construction with their state at
+// destruction. These registers should be preserved by the callee on
+// arm platform.
+class RegisterStateCheck {
+ public:
+  RegisterStateCheck() { initialized_ = StoreRegisters(pre_store_); }
+  ~RegisterStateCheck() { EXPECT_TRUE(Check()); }
+
+ private:
+  static bool StoreRegisters(int64_t store[8]) {
+    vpx_push_neon(store);
+    return true;
+  }
+
+  // Compares the register state. Returns true if the states match.
+  bool Check() const {
+    if (!initialized_) return false;
+    int64_t post_store[8];
+    vpx_push_neon(post_store);
+    for (int i = 0; i < 8; ++i) {
+      EXPECT_EQ(pre_store_[i], post_store[i]) << "d"
+          << i + 8 << " has been modified";
+    }
+    return !testing::Test::HasNonfatalFailure();
+  }
+
+  bool initialized_;
+  int64_t pre_store_[8];
+};
+
+#define ASM_REGISTER_STATE_CHECK(statement) do {  \
+  libvpx_test::RegisterStateCheck reg_check;      \
+  statement;                                      \
+} while (false)
+
+}  // namespace libvpx_test
+
+#else
+
+namespace libvpx_test {
+
+class RegisterStateCheck {};
+#define ASM_REGISTER_STATE_CHECK(statement) statement
+
+}  // namespace libvpx_test
+
+#endif  // _WIN64
+
+#if ARCH_X86 || ARCH_X86_64
+#if defined(__GNUC__)
+
+namespace libvpx_test {
+
+// Checks the FPU tag word pre/post execution to ensure emms has been called.
+class RegisterStateCheckMMX {
+ public:
+  RegisterStateCheckMMX() {
+    __asm__ volatile("fstenv %0" : "=rm"(pre_fpu_env_));
+  }
+  ~RegisterStateCheckMMX() { EXPECT_TRUE(Check()); }
+
+ private:
+  // Checks the FPU tag word pre/post execution, returning false if not cleared
+  // to 0xffff.
+  bool Check() const {
+    EXPECT_EQ(0xffff, pre_fpu_env_[4])
+        << "FPU was in an inconsistent state prior to call";
+    // Snapshot the environment again; Check() runs from the destructor.
+    uint16_t post_fpu_env[14];
+    __asm__ volatile("fstenv %0" : "=rm"(post_fpu_env));
+    EXPECT_EQ(0xffff, post_fpu_env[4])
+        << "FPU was left in an inconsistent state after call";
+    return !testing::Test::HasNonfatalFailure();
+  }
+
+  uint16_t pre_fpu_env_[14];  // fstenv image from construction; [4] = tag word
+};
+
+#define API_REGISTER_STATE_CHECK(statement) do {  \
+  libvpx_test::RegisterStateCheckMMX reg_check;   \
+  ASM_REGISTER_STATE_CHECK(statement);            \
+} while (false)
+
+}  // namespace libvpx_test
+
+#endif  // __GNUC__
+#endif  // ARCH_X86 || ARCH_X86_64
+
+#ifndef API_REGISTER_STATE_CHECK
+#define API_REGISTER_STATE_CHECK ASM_REGISTER_STATE_CHECK
+#endif
+
+#endif  // TEST_REGISTER_STATE_CHECK_H_
diff --git a/libs/libvpx/test/resize_test.cc b/libs/libvpx/test/resize_test.cc
new file mode 100644
index 0000000000..017730899b
--- /dev/null
+++ b/libs/libvpx/test/resize_test.cc
@@ -0,0 +1,703 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/video_source.h"
+#include "test/util.h"
+
+// Enable(1) or Disable(0) writing of the compressed bitstream.
+#define WRITE_COMPRESSED_STREAM 0
+
+namespace {
+
+#if WRITE_COMPRESSED_STREAM
+static void mem_put_le16(char *const mem, const unsigned int val) {
+  mem[0] = val;
+  mem[1] = val >> 8;
+}
+
+// Writes |val| to mem[0..3] as a 32-bit value, least-significant byte first.
+static void mem_put_le32(char *const mem, const unsigned int val) {
+  for (int byte = 0; byte < 4; ++byte) {
+    mem[byte] = val >> (8 * byte);
+  }
+}
+
+static void write_ivf_file_header(const vpx_codec_enc_cfg_t *const cfg,
+                                  int frame_cnt, FILE *const outfile) {
+  char header[32];
+
+  header[0] = 'D';
+  header[1] = 'K';
+  header[2] = 'I';
+  header[3] = 'F';
+  mem_put_le16(header + 4,  0);                   /* version */
+  mem_put_le16(header + 6,  32);                  /* headersize */
+  mem_put_le32(header + 8,  0x30395056);          /* fourcc (vp9) */
+  mem_put_le16(header + 12, cfg->g_w);            /* width */
+  mem_put_le16(header + 14, cfg->g_h);            /* height */
+  mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
+  mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */
+  mem_put_le32(header + 24, frame_cnt);           /* length */
+  mem_put_le32(header + 28, 0);                   /* unused */
+
+  (void)fwrite(header, 1, 32, outfile);
+}
+
+static void write_ivf_frame_size(FILE *const outfile, const size_t size) {
+  char header[4];
+  mem_put_le32(header, static_cast<unsigned int>(size));
+  (void)fwrite(header, 1, 4, outfile);
+}
+
+static void write_ivf_frame_header(const vpx_codec_cx_pkt_t *const pkt,
+                                   FILE *const outfile) {
+  char header[12];
+  vpx_codec_pts_t pts;
+
+  if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
+    return;
+
+  pts = pkt->data.frame.pts;
+  mem_put_le32(header, static_cast<unsigned int>(pkt->data.frame.sz));
+  mem_put_le32(header + 4, pts & 0xFFFFFFFF);
+  mem_put_le32(header + 8, pts >> 32);
+
+  (void)fwrite(header, 1, 12, outfile);
+}
+#endif  // WRITE_COMPRESSED_STREAM
+
+const unsigned int kInitialWidth = 320;
+const unsigned int kInitialHeight = 240;
+
+struct FrameInfo {
+  FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)
+      : pts(_pts), w(_w), h(_h) {}
+
+  vpx_codec_pts_t pts;
+  unsigned int w;
+  unsigned int h;
+};
+
+void ScaleForFrameNumber(unsigned int frame,      // frame number to size
+                         unsigned int initial_w,  // full-size width
+                         unsigned int initial_h,  // full-size height
+                         unsigned int *w,         // out: width for |frame|
+                         unsigned int *h,         // out: height for |frame|
+                         int flag_codec) {        // 1 enables VP9-only cases
+  if (frame < 10) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 20) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 30) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 40) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 50) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 60) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 70) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 80) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 90) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 100) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 110) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 120) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 130) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 140) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 150) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 160) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 170) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 180) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 190) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 200) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 210) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 220) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 230) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 240) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 250) {
+    *w = initial_w  / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 260) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  // Go down very low.
+  if (frame < 270) {
+    *w = initial_w / 4;
+    *h = initial_h / 4;
+    return;
+  }
+  if (flag_codec == 1) {
+    // Cases that only works for VP9.
+    // For VP9: Swap width and height of original.
+    if (frame < 320) {
+      *w = initial_h;
+      *h = initial_w;
+      return;
+    }
+  }
+  *w = initial_w;
+  *h = initial_h;
+}
+
+class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
+ public:
+  ResizingVideoSource() : flag_codec_(0) {  // default: no VP9-only sizes
+    SetSize(kInitialWidth, kInitialHeight);
+    limit_ = 350;
+  }
+  int flag_codec_;  // forwarded to ScaleForFrameNumber() by Next()
+  virtual ~ResizingVideoSource() {}
+
+ protected:
+  virtual void Next() {  // advance a frame and apply its scheduled size
+    ++frame_;
+    unsigned int width;
+    unsigned int height;
+    ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, &width, &height,
+                        flag_codec_);
+    SetSize(width, height);
+    FillFrame();
+  }
+};
+
+class ResizeTest : public ::libvpx_test::EncoderTest,
+  public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  ResizeTest() : EncoderTest(GET_PARAM(0)) {}
+
+  virtual ~ResizeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+
+  virtual void DecompressedFrameHook(const vpx_image_t &img,
+                                     vpx_codec_pts_t pts) {
+    frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
+  }
+
+  std::vector< FrameInfo > frame_info_list_;  // filled by DecompressedFrameHook
+};
+
+TEST_P(ResizeTest, TestExternalResizeWorks) {
+  ResizingVideoSource video;
+  video.flag_codec_ = 0;
+  cfg_.g_lag_in_frames = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Each recorded frame must have the size scheduled for its frame number.
+  for (size_t idx = 0; idx < frame_info_list_.size(); ++idx) {
+    const FrameInfo *const info = &frame_info_list_[idx];
+    const unsigned int frame = static_cast<unsigned>(info->pts);
+    unsigned int expected_w;
+    unsigned int expected_h;
+    ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight,
+                        &expected_w, &expected_h, 0);
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
+  }
+}
+
+const unsigned int kStepDownFrame = 3;
+const unsigned int kStepUpFrame = 6;
+
+class ResizeInternalTest : public ResizeTest {
+ protected:
+  // change_config_ starts false so PreEncodeFrameHook() never reads it unset.
+#if WRITE_COMPRESSED_STREAM
+  ResizeInternalTest()
+      : ResizeTest(), frame0_psnr_(0.0), change_config_(false),
+        outfile_(NULL), out_frames_(0) {}
+#else
+  ResizeInternalTest()
+      : ResizeTest(), frame0_psnr_(0.0), change_config_(false) {}
+#endif
+
+  virtual ~ResizeInternalTest() {}
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+#if WRITE_COMPRESSED_STREAM
+    outfile_ = fopen("vp90-2-05-resize.ivf", "wb");
+#endif
+  }
+
+  virtual void EndPassHook() {
+#if WRITE_COMPRESSED_STREAM
+    if (outfile_) {
+      if (!fseek(outfile_, 0, SEEK_SET))
+        write_ivf_file_header(&cfg_, out_frames_, outfile_);
+      fclose(outfile_);
+      outfile_ = NULL;
+    }
+#endif
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (change_config_) {
+      int new_q = 60;
+      if (video->frame() == 0) {
+        struct vpx_scaling_mode mode = {VP8E_ONETWO, VP8E_ONETWO};
+        encoder->Control(VP8E_SET_SCALEMODE, &mode);
+      }
+      if (video->frame() == 1) {
+        struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};
+        encoder->Control(VP8E_SET_SCALEMODE, &mode);
+        cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = new_q;
+        encoder->Config(&cfg_);
+      }
+    } else {
+      if (video->frame() == kStepDownFrame) {
+        struct vpx_scaling_mode mode = {VP8E_FOURFIVE, VP8E_THREEFIVE};
+        encoder->Control(VP8E_SET_SCALEMODE, &mode);
+      }
+      if (video->frame() == kStepUpFrame) {
+        struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};
+        encoder->Control(VP8E_SET_SCALEMODE, &mode);
+      }
+    }
+  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (!frame0_psnr_)
+      frame0_psnr_ = pkt->data.psnr.psnr[0];
+    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
+  }
+
+#if WRITE_COMPRESSED_STREAM
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    ++out_frames_;
+
+    // Write initial file header if first frame.
+    if (pkt->data.frame.pts == 0)
+      write_ivf_file_header(&cfg_, 0, outfile_);
+
+    // Write frame header and data.
+    write_ivf_frame_header(pkt, outfile_);
+    (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
+  }
+#endif
+
+  double frame0_psnr_;  // psnr[0] of the first PSNR packet; 0.0 until set
+  bool change_config_;  // picks the branch taken in PreEncodeFrameHook()
+#if WRITE_COMPRESSED_STREAM
+  FILE *outfile_;
+  unsigned int out_frames_;
+#endif
+};
+
+TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 10);
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  change_config_ = false;
+
+  // q picked such that initial keyframe on this clip is ~30dB PSNR
+  cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
+
+  // If the number of frames being encoded is smaller than g_lag_in_frames
+  // the encoded frame is unavailable using the current API. Comparing
+  // frames to detect mismatch would then not be possible. Set
+  // g_lag_in_frames = 0 to get around this.
+  cfg_.g_lag_in_frames = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    const vpx_codec_pts_t pts = info->pts;
+    if (pts >= kStepDownFrame && pts < kStepUpFrame) {
+      ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width";
+      ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height";
+    } else {
+      EXPECT_EQ(352U, info->w) << "Frame " << pts << " had unexpected width";
+      EXPECT_EQ(288U, info->h) << "Frame " << pts << " had unexpected height";
+    }
+  }
+}
+
+TEST_P(ResizeInternalTest, TestInternalResizeChangeConfig) {
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 10);
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+  change_config_ = true;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+class ResizeRealtimeTest : public ::libvpx_test::EncoderTest,
+  public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  ResizeRealtimeTest()
+      : EncoderTest(GET_PARAM(0)), set_cpu_used_(0), change_bitrate_(false) {}
+  virtual ~ResizeRealtimeTest() {}
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP9E_SET_AQ_MODE, 3);
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+    }
+    if (change_bitrate_ && video->frame() == 120) {
+      change_bitrate_ = false;
+      cfg_.rc_target_bitrate = 500;
+      encoder->Config(&cfg_);
+    }
+  }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+  }
+
+  virtual void DecompressedFrameHook(const vpx_image_t &img,
+                                     vpx_codec_pts_t pts) {
+    frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
+  }
+
+  void DefaultConfig() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.kf_mode = VPX_KF_AUTO;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
+    // Enable dropped frames.
+    cfg_.rc_dropframe_thresh = 1;
+    // Enable error_resilience mode.
+    cfg_.g_error_resilient  = 1;
+    // Enable dynamic resizing.
+    cfg_.rc_resize_allowed = 1;
+    // Run at low bitrate.
+    cfg_.rc_target_bitrate = 200;
+  }
+
+  std::vector< FrameInfo > frame_info_list_;
+  int set_cpu_used_;     // VP8E_SET_CPUUSED value, taken from GET_PARAM(2)
+  bool change_bitrate_;  // when true, retarget to 500 at frame 120 (one-shot)
+};
+
+TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
+  ResizingVideoSource video;
+  video.flag_codec_ = 1;
+  DefaultConfig();
+  // Disable internal resize for this test.
+  cfg_.rc_resize_allowed = 0;
+  change_bitrate_ = false;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    const unsigned int frame = static_cast<unsigned>(info->pts);
+    unsigned int expected_w;
+    unsigned int expected_h;
+    ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight,
+                        &expected_w, &expected_h, 1);
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
+  }
+}
+
+// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
+// Run at low bitrate, with resize_allowed = 1, and verify that we get
+// one resize down event.
+TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 299);
+  DefaultConfig();
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+  change_bitrate_ = false;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  unsigned int last_w = cfg_.g_w;
+  unsigned int last_h = cfg_.g_h;
+  int resize_count = 0;
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    if (info->w != last_w || info->h != last_h) {
+      // Verify that resize down occurs.
+      ASSERT_LT(info->w, last_w);
+      ASSERT_LT(info->h, last_h);
+      last_w = info->w;
+      last_h = info->h;
+      resize_count++;
+    }
+  }
+
+  // Verify that we get 1 resize down event in this test.
+  ASSERT_EQ(1, resize_count) << "Resizing should occur.";
+}
+
+// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
+// Start at a low target bitrate and raise it in the middle of the clip;
+// a scale-up should follow the earlier scale-down once the bitrate changes.
+TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 359);
+  DefaultConfig();
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+  change_bitrate_ = true;
+  // Disable dropped frames.
+  cfg_.rc_dropframe_thresh = 0;
+  // Starting bitrate low.
+  cfg_.rc_target_bitrate = 80;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  unsigned int last_w = cfg_.g_w;
+  unsigned int last_h = cfg_.g_h;
+  int resize_count = 0;
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    if (info->w != last_w || info->h != last_h) {
+      resize_count++;
+      if (resize_count == 1) {
+        // Verify that resize down occurs.
+        ASSERT_LT(info->w, last_w);
+        ASSERT_LT(info->h, last_h);
+      } else if (resize_count == 2) {
+        // Verify that resize up occurs.
+        ASSERT_GT(info->w, last_w);
+        ASSERT_GT(info->h, last_h);
+      }
+      last_w = info->w;
+      last_h = info->h;
+    }
+  }
+
+  // Expect exactly 2 resize events; the expected value goes first in ASSERT_EQ.
+  ASSERT_EQ(2, resize_count) << "Resizing should occur twice.";
+}
+
+// Selects the image format for a frame: I444 for frames in [10, 20),
+// I420 for every other frame number.
+vpx_img_fmt_t CspForFrameNumber(int frame) {
+  const bool in_444_window = (frame >= 10) && (frame < 20);
+  return in_444_window ? VPX_IMG_FMT_I444
+                       : VPX_IMG_FMT_I420;
+}
+
+// Fixture for colorspace-change tests: tracks profile switches and PSNR drift.
+class ResizeCspTest : public ResizeTest {
+ protected:
+#if WRITE_COMPRESSED_STREAM
+  ResizeCspTest()
+      : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {}
+#else
+  ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {}
+#endif
+
+  virtual ~ResizeCspTest() {}
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+#if WRITE_COMPRESSED_STREAM
+    outfile_ = fopen("vp91-2-05-cspchape.ivf", "wb");
+#endif
+  }
+
+  virtual void EndPassHook() {
+#if WRITE_COMPRESSED_STREAM
+    if (outfile_) {
+      if (!fseek(outfile_, 0, SEEK_SET))
+        write_ivf_file_header(&cfg_, out_frames_, outfile_);
+      fclose(outfile_);
+      outfile_ = NULL;
+    }
+#endif
+  }
+
+  // Keeps g_profile in sync with the frame's format: 1 for non-I420, else 0.
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (CspForFrameNumber(video->frame()) != VPX_IMG_FMT_I420 &&
+        cfg_.g_profile != 1) {
+      cfg_.g_profile = 1;
+      encoder->Config(&cfg_);
+    }
+    if (CspForFrameNumber(video->frame()) == VPX_IMG_FMT_I420 &&
+        cfg_.g_profile != 0) {
+      cfg_.g_profile = 0;
+      encoder->Config(&cfg_);
+    }
+  }
+
+  // Every PSNR packet must be within 2.0 of the first nonzero PSNR seen.
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (!frame0_psnr_)
+      frame0_psnr_ = pkt->data.psnr.psnr[0];
+    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
+  }
+
+#if WRITE_COMPRESSED_STREAM
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // fopen() in BeginPassHook may have failed; never write through NULL.
+    if (outfile_ == NULL) return;
+    ++out_frames_;
+
+    // Write initial file header if first frame.
+    if (pkt->data.frame.pts == 0) write_ivf_file_header(&cfg_, 0, outfile_);
+    // Write frame header and data.
+    write_ivf_frame_header(pkt, outfile_);
+    (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
+  }
+#endif
+
+  double frame0_psnr_;
+#if WRITE_COMPRESSED_STREAM
+  FILE *outfile_;
+  unsigned int out_frames_;
+#endif
+};
+
+class ResizingCspVideoSource : public ::libvpx_test::DummyVideoSource {
+ public:
+  ResizingCspVideoSource() {
+    SetSize(kInitialWidth, kInitialHeight);
+    limit_ = 30;
+  }
+
+  virtual ~ResizingCspVideoSource() {}
+
+ protected:
+  virtual void Next() {
+    ++frame_;
+    SetImageFormat(CspForFrameNumber(frame_));
+    FillFrame();
+  }
+};
+
+TEST_P(ResizeCspTest, TestResizeCspWorks) {
+  ResizingCspVideoSource video;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
+  cfg_.g_lag_in_frames = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+VP8_INSTANTIATE_TEST_CASE(ResizeTest, ONE_PASS_TEST_MODES);
+VP9_INSTANTIATE_TEST_CASE(ResizeTest,
+                          ::testing::Values(::libvpx_test::kRealTime));
+VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest,
+                          ::testing::Values(::libvpx_test::kOnePassBest));
+VP9_INSTANTIATE_TEST_CASE(ResizeRealtimeTest,
+                          ::testing::Values(::libvpx_test::kRealTime),
+                          ::testing::Range(5, 9));
+VP9_INSTANTIATE_TEST_CASE(ResizeCspTest,
+                          ::testing::Values(::libvpx_test::kRealTime));
+}  // namespace
diff --git a/libs/libvpx/test/resize_util.sh b/libs/libvpx/test/resize_util.sh
new file mode 100755
index 0000000000..5e472716da
--- /dev/null
+++ b/libs/libvpx/test/resize_util.sh
@@ -0,0 +1,69 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx resize_util example code. To add new tests to
+##  this file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to resize_util_tests (on a new line).
+##
+. "$(dirname "$0")/tools_common.sh"  # quoted: script path may contain spaces
+
+# Environment check: $YUV_RAW_INPUT is required.
+resize_util_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+}
+
+# Resizes $YUV_RAW_INPUT using the resize_util example. $1 is the output
+# dimensions that will be passed to resize_util.
+resize_util() {
+  local resizer="${LIBVPX_BIN_PATH}/resize_util${VPX_TEST_EXE_SUFFIX}"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/resize_util.raw"
+  local frames_to_resize="10"
+  local target_dimensions="$1"
+
+  # resize_util is available only when CONFIG_SHARED is disabled.
+  if [ -z "$(vpx_config_option_enabled CONFIG_SHARED)" ]; then
+    if [ ! -x "${resizer}" ]; then
+      elog "${resizer} does not exist or is not executable."
+      return 1
+    fi
+
+    eval "${VPX_TEST_PREFIX}" "${resizer}" "${YUV_RAW_INPUT}" \
+        "${YUV_RAW_INPUT_WIDTH}x${YUV_RAW_INPUT_HEIGHT}" \
+        "${target_dimensions}" "${output_file}" ${frames_to_resize} \
+        ${devnull}
+
+    [ -e "${output_file}" ] || return 1
+  fi
+}
+
+# Halves each dimension of $YUV_RAW_INPUT using resize_util().
+resize_down() {
+  local target_width=$((${YUV_RAW_INPUT_WIDTH} / 2))
+  local target_height=$((${YUV_RAW_INPUT_HEIGHT} / 2))
+
+  resize_util "${target_width}x${target_height}"
+}
+
+# Doubles each dimension of $YUV_RAW_INPUT using resize_util().
+resize_up() {
+  local target_width=$((${YUV_RAW_INPUT_WIDTH} * 2))
+  local target_height=$((${YUV_RAW_INPUT_HEIGHT} * 2))
+
+  resize_util "${target_width}x${target_height}"
+}
+
+resize_util_tests="resize_down
+                   resize_up"
+
+run_tests resize_util_verify_environment "${resize_util_tests}"
diff --git a/libs/libvpx/test/sad_test.cc b/libs/libvpx/test/sad_test.cc
new file mode 100644
index 0000000000..3f0f74cae6
--- /dev/null
+++ b/libs/libvpx/test/sad_test.cc
@@ -0,0 +1,962 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr,
+                                   int src_stride,
+                                   const uint8_t *ref_ptr,
+                                   int ref_stride);
+typedef std::tr1::tuple<int, int, SadMxNFunc, int> SadMxNParam;
+
+typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr,
+                                  int src_stride,
+                                  const uint8_t *ref_ptr,
+                                  int ref_stride,
+                                  const uint8_t *second_pred);
+typedef std::tr1::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
+
+typedef void (*SadMxNx4Func)(const uint8_t *src_ptr,
+                             int src_stride,
+                             const uint8_t *const ref_ptr[],
+                             int ref_stride,
+                             uint32_t *sad_array);
+typedef std::tr1::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class SADTestBase : public ::testing::Test {
+ public:
+  SADTestBase(int width, int height, int bit_depth) :
+      width_(width), height_(height), bd_(bit_depth) {}
+
+  static void SetUpTestCase() {
+    source_data8_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBlockSize));
+    reference_data8_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize));
+    second_pred8_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, 64*64));
+    source_data16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, kDataBlockSize*sizeof(uint16_t)));
+    reference_data16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, kDataBufferSize*sizeof(uint16_t)));
+    second_pred16_ = reinterpret_cast<uint16_t*>(
+        vpx_memalign(kDataAlignment, 64*64*sizeof(uint16_t)));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(source_data8_);
+    source_data8_ = NULL;
+    vpx_free(reference_data8_);
+    reference_data8_ = NULL;
+    vpx_free(second_pred8_);
+    second_pred8_ = NULL;
+    vpx_free(source_data16_);
+    source_data16_ = NULL;
+    vpx_free(reference_data16_);
+    reference_data16_ = NULL;
+    vpx_free(second_pred16_);
+    second_pred16_ = NULL;
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Handle blocks up to 4 blocks 64x64 with stride up to 128
+  static const int kDataAlignment = 16;
+  static const int kDataBlockSize = 64 * 128;
+  static const int kDataBufferSize = 4 * kDataBlockSize;
+
+  virtual void SetUp() {
+    if (bd_ == -1) {
+      use_high_bit_depth_ = false;
+      bit_depth_ = VPX_BITS_8;
+      source_data_ = source_data8_;
+      reference_data_ = reference_data8_;
+      second_pred_ = second_pred8_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      use_high_bit_depth_ = true;
+      bit_depth_ = static_cast<vpx_bit_depth_t>(bd_);
+      source_data_ = CONVERT_TO_BYTEPTR(source_data16_);
+      reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
+      second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    mask_ = (1 << bit_depth_) - 1;
+    source_stride_ = (width_ + 31) & ~31;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  virtual uint8_t *GetReference(int block_idx) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (use_high_bit_depth_)
+      return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
+                                block_idx * kDataBlockSize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    return reference_data_ + block_idx * kDataBlockSize;
+  }
+
+  // Reference implementation of the sum of absolute differences: walk both
+  // blocks in lockstep and accumulate |source - reference| for every pixel.
+  unsigned int ReferenceSAD(int block_idx) {
+    unsigned int total = 0;
+    const uint8_t *const ref8 = GetReference(block_idx);
+    const uint8_t *const src8 = source_data_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint16_t *const ref16 =
+        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+    const uint16_t *const src16 = CONVERT_TO_SHORTPTR(source_data_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int row = 0; row < height_; ++row) {
+      for (int col = 0; col < width_; ++col) {
+        if (!use_high_bit_depth_) {
+          total += abs(src8[row * source_stride_ + col] -
+                       ref8[row * reference_stride_ + col]);
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          total += abs(src16[row * source_stride_ + col] -
+                       ref16[row * reference_stride_ + col]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+    }
+    return total;
+  }
+
+  // Sum of Absolute Differences Average. Given two blocks, and a prediction
+  // calculate the absolute difference between one pixel and average of the
+  // corresponding and predicted pixels; accumulate.
+  unsigned int ReferenceSADavg(int block_idx) {
+    unsigned int sad = 0;
+    const uint8_t *const reference8 = GetReference(block_idx);
+    const uint8_t *const source8 = source_data_;
+    const uint8_t *const second_pred8 = second_pred_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+    const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+    const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        if (!use_high_bit_depth_) {
+          const int tmp = second_pred8[h * width_ + w] +
+              reference8[h * reference_stride_ + w];
+          const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+          sad += abs(source8[h * source_stride_ + w] - comp_pred);
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          const int tmp = second_pred16[h * width_ + w] +
+              reference16[h * reference_stride_ + w];
+          const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+          sad += abs(source16[h * source_stride_ + w] - comp_pred);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+    }
+    return sad;
+  }
+
+  void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) {
+    uint8_t *data8 = data;
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        if (!use_high_bit_depth_) {
+          data8[h * stride + w] = static_cast<uint8_t>(fill_constant);
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          data16[h * stride + w] = fill_constant;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+    }
+  }
+
+  void FillRandom(uint8_t *data, int stride) {
+    uint8_t *data8 = data;
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        if (!use_high_bit_depth_) {
+          data8[h * stride + w] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          data16[h * stride + w] = rnd_.Rand16() & mask_;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+    }
+  }
+
+  int width_, height_, mask_, bd_;
+  vpx_bit_depth_t bit_depth_;
+  static uint8_t *source_data_;
+  static uint8_t *reference_data_;
+  static uint8_t *second_pred_;
+  int source_stride_;
+  bool use_high_bit_depth_;
+  static uint8_t *source_data8_;
+  static uint8_t *reference_data8_;
+  static uint8_t *second_pred8_;
+  static uint16_t *source_data16_;
+  static uint16_t *reference_data16_;
+  static uint16_t *second_pred16_;
+  int reference_stride_;
+
+  ACMRandom rnd_;
+};
+
+class SADx4Test
+    : public SADTestBase,
+      public ::testing::WithParamInterface<SadMxNx4Param> {
+ public:
+  SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  void SADs(unsigned int *results) {
+    const uint8_t *references[] = {GetReference(0), GetReference(1),
+                                   GetReference(2), GetReference(3)};
+
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
+                                          references, reference_stride_,
+                                          results));
+  }
+
+  void CheckSADs() {
+    unsigned int reference_sad, exp_sad[4];
+
+    SADs(exp_sad);
+    for (int block = 0; block < 4; ++block) {
+      reference_sad = ReferenceSAD(block);
+
+      EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+    }
+  }
+};
+
+class SADTest
+    : public SADTestBase,
+      public ::testing::WithParamInterface<SadMxNParam> {
+ public:
+  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  unsigned int SAD(int block_idx) {
+    unsigned int ret;
+    const uint8_t *const reference = GetReference(block_idx);
+
+    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_));
+    return ret;
+  }
+
+  void CheckSAD() {
+    const unsigned int reference_sad = ReferenceSAD(0);
+    const unsigned int exp_sad = SAD(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+};
+
+class SADavgTest
+    : public SADTestBase,
+      public ::testing::WithParamInterface<SadMxNAvgParam> {
+ public:
+  SADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  unsigned int SAD_avg(int block_idx) {
+    unsigned int ret;
+    const uint8_t *const reference = GetReference(block_idx);
+
+    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_,
+                                                second_pred_));
+    return ret;
+  }
+
+  void CheckSAD() {
+    const unsigned int reference_sad = ReferenceSADavg(0);
+    const unsigned int exp_sad = SAD_avg(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+};
+
+uint8_t *SADTestBase::source_data_ = NULL;
+uint8_t *SADTestBase::reference_data_ = NULL;
+uint8_t *SADTestBase::second_pred_ = NULL;
+uint8_t *SADTestBase::source_data8_ = NULL;
+uint8_t *SADTestBase::reference_data8_ = NULL;
+uint8_t *SADTestBase::second_pred8_ = NULL;
+uint16_t *SADTestBase::source_data16_ = NULL;
+uint16_t *SADTestBase::reference_data16_ = NULL;
+uint16_t *SADTestBase::second_pred16_ = NULL;
+
+TEST_P(SADTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  CheckSAD();
+}
+
+TEST_P(SADTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSAD();
+}
+
+TEST_P(SADTest, ShortRef) {
+  const int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, ShortSrc) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADavgTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  FillConstant(second_pred_, width_, 0);
+  CheckSAD();
+}
+TEST_P(SADavgTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  FillConstant(second_pred_, width_, 0);
+  CheckSAD();
+}
+
+TEST_P(SADavgTest, ShortRef) {
+  const int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADavgTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADavgTest, ShortSrc) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSAD();
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADx4Test, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(GetReference(0), reference_stride_, mask_);
+  FillConstant(GetReference(1), reference_stride_, mask_);
+  FillConstant(GetReference(2), reference_stride_, mask_);
+  FillConstant(GetReference(3), reference_stride_, mask_);
+  CheckSADs();
+}
+
+TEST_P(SADx4Test, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(GetReference(0), reference_stride_, 0);
+  FillConstant(GetReference(1), reference_stride_, 0);
+  FillConstant(GetReference(2), reference_stride_, 0);
+  FillConstant(GetReference(3), reference_stride_, 0);
+  CheckSADs();
+}
+
+TEST_P(SADx4Test, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADx4Test, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADx4Test, ShortSrc) {
+  int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADx4Test, SrcAlignedByWidth) {
+  uint8_t * tmp_source_data = source_data_;
+  source_data_ += width_;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  source_data_ = tmp_source_data;
+}
+
+using std::tr1::make_tuple;
+
+//------------------------------------------------------------------------------
+// C functions
+const SadMxNParam c_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64_c, -1),
+  make_tuple(64, 32, &vpx_sad64x32_c, -1),
+  make_tuple(32, 64, &vpx_sad32x64_c, -1),
+  make_tuple(32, 32, &vpx_sad32x32_c, -1),
+  make_tuple(32, 16, &vpx_sad32x16_c, -1),
+  make_tuple(16, 32, &vpx_sad16x32_c, -1),
+  make_tuple(16, 16, &vpx_sad16x16_c, -1),
+  make_tuple(16, 8, &vpx_sad16x8_c, -1),
+  make_tuple(8, 16, &vpx_sad8x16_c, -1),
+  make_tuple(8, 8, &vpx_sad8x8_c, -1),
+  make_tuple(8, 4, &vpx_sad8x4_c, -1),
+  make_tuple(4, 8, &vpx_sad4x8_c, -1),
+  make_tuple(4, 4, &vpx_sad4x4_c, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, &vpx_highbd_sad64x64_c, 8),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_c, 8),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_c, 8),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_c, 8),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_c, 8),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_c, 8),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_c, 8),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_c, 8),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_c, 8),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_c, 8),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_c, 8),
+  make_tuple(4, 8, &vpx_highbd_sad4x8_c, 8),
+  make_tuple(4, 4, &vpx_highbd_sad4x4_c, 8),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_c, 10),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_c, 10),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_c, 10),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_c, 10),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_c, 10),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_c, 10),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_c, 10),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_c, 10),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_c, 10),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_c, 10),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_c, 10),
+  make_tuple(4, 8, &vpx_highbd_sad4x8_c, 10),
+  make_tuple(4, 4, &vpx_highbd_sad4x4_c, 10),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_c, 12),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_c, 12),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_c, 12),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_c, 12),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_c, 12),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_c, 12),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_c, 12),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_c, 12),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_c, 12),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_c, 12),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_c, 12),
+  make_tuple(4, 8, &vpx_highbd_sad4x8_c, 12),
+  make_tuple(4, 4, &vpx_highbd_sad4x4_c, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
+
+const SadMxNAvgParam avg_c_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64_avg_c, -1),
+  make_tuple(64, 32, &vpx_sad64x32_avg_c, -1),
+  make_tuple(32, 64, &vpx_sad32x64_avg_c, -1),
+  make_tuple(32, 32, &vpx_sad32x32_avg_c, -1),
+  make_tuple(32, 16, &vpx_sad32x16_avg_c, -1),
+  make_tuple(16, 32, &vpx_sad16x32_avg_c, -1),
+  make_tuple(16, 16, &vpx_sad16x16_avg_c, -1),
+  make_tuple(16, 8, &vpx_sad16x8_avg_c, -1),
+  make_tuple(8, 16, &vpx_sad8x16_avg_c, -1),
+  make_tuple(8, 8, &vpx_sad8x8_avg_c, -1),
+  make_tuple(8, 4, &vpx_sad8x4_avg_c, -1),
+  make_tuple(4, 8, &vpx_sad4x8_avg_c, -1),
+  make_tuple(4, 4, &vpx_sad4x4_avg_c, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 8),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 8),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 8),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_avg_c, 8),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_avg_c, 8),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_avg_c, 8),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_avg_c, 8),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_avg_c, 8),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_avg_c, 8),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_avg_c, 8),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_avg_c, 8),
+  make_tuple(4, 8, &vpx_highbd_sad4x8_avg_c, 8),
+  make_tuple(4, 4, &vpx_highbd_sad4x4_avg_c, 8),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 10),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 10),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 10),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_avg_c, 10),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_avg_c, 10),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_avg_c, 10),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_avg_c, 10),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_avg_c, 10),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_avg_c, 10),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_avg_c, 10),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_avg_c, 10),
+  make_tuple(4, 8, &vpx_highbd_sad4x8_avg_c, 10),
+  make_tuple(4, 4, &vpx_highbd_sad4x4_avg_c, 10),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 12),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 12),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 12),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_avg_c, 12),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_avg_c, 12),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_avg_c, 12),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_avg_c, 12),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_avg_c, 12),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_avg_c, 12),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_avg_c, 12),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_avg_c, 12),
+  make_tuple(4, 8, &vpx_highbd_sad4x8_avg_c, 12),
+  make_tuple(4, 4, &vpx_highbd_sad4x4_avg_c, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
+
+const SadMxNx4Param x4d_c_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64x4d_c, -1),
+  make_tuple(64, 32, &vpx_sad64x32x4d_c, -1),
+  make_tuple(32, 64, &vpx_sad32x64x4d_c, -1),
+  make_tuple(32, 32, &vpx_sad32x32x4d_c, -1),
+  make_tuple(32, 16, &vpx_sad32x16x4d_c, -1),
+  make_tuple(16, 32, &vpx_sad16x32x4d_c, -1),
+  make_tuple(16, 16, &vpx_sad16x16x4d_c, -1),
+  make_tuple(16, 8, &vpx_sad16x8x4d_c, -1),
+  make_tuple(8, 16, &vpx_sad8x16x4d_c, -1),
+  make_tuple(8, 8, &vpx_sad8x8x4d_c, -1),
+  make_tuple(8, 4, &vpx_sad8x4x4d_c, -1),
+  make_tuple(4, 8, &vpx_sad4x8x4d_c, -1),
+  make_tuple(4, 4, &vpx_sad4x4x4d_c, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 8),
+  make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 8),
+  make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 8),
+  make_tuple(32, 32, &vpx_highbd_sad32x32x4d_c, 8),
+  make_tuple(32, 16, &vpx_highbd_sad32x16x4d_c, 8),
+  make_tuple(16, 32, &vpx_highbd_sad16x32x4d_c, 8),
+  make_tuple(16, 16, &vpx_highbd_sad16x16x4d_c, 8),
+  make_tuple(16, 8, &vpx_highbd_sad16x8x4d_c, 8),
+  make_tuple(8, 16, &vpx_highbd_sad8x16x4d_c, 8),
+  make_tuple(8, 8, &vpx_highbd_sad8x8x4d_c, 8),
+  make_tuple(8, 4, &vpx_highbd_sad8x4x4d_c, 8),
+  make_tuple(4, 8, &vpx_highbd_sad4x8x4d_c, 8),
+  make_tuple(4, 4, &vpx_highbd_sad4x4x4d_c, 8),
+  make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 10),
+  make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 10),
+  make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 10),
+  make_tuple(32, 32, &vpx_highbd_sad32x32x4d_c, 10),
+  make_tuple(32, 16, &vpx_highbd_sad32x16x4d_c, 10),
+  make_tuple(16, 32, &vpx_highbd_sad16x32x4d_c, 10),
+  make_tuple(16, 16, &vpx_highbd_sad16x16x4d_c, 10),
+  make_tuple(16, 8, &vpx_highbd_sad16x8x4d_c, 10),
+  make_tuple(8, 16, &vpx_highbd_sad8x16x4d_c, 10),
+  make_tuple(8, 8, &vpx_highbd_sad8x8x4d_c, 10),
+  make_tuple(8, 4, &vpx_highbd_sad8x4x4d_c, 10),
+  make_tuple(4, 8, &vpx_highbd_sad4x8x4d_c, 10),
+  make_tuple(4, 4, &vpx_highbd_sad4x4x4d_c, 10),
+  make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 12),
+  make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 12),
+  make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 12),
+  make_tuple(32, 32, &vpx_highbd_sad32x32x4d_c, 12),
+  make_tuple(32, 16, &vpx_highbd_sad32x16x4d_c, 12),
+  make_tuple(16, 32, &vpx_highbd_sad16x32x4d_c, 12),
+  make_tuple(16, 16, &vpx_highbd_sad16x16x4d_c, 12),
+  make_tuple(16, 8, &vpx_highbd_sad16x8x4d_c, 12),
+  make_tuple(8, 16, &vpx_highbd_sad8x16x4d_c, 12),
+  make_tuple(8, 8, &vpx_highbd_sad8x8x4d_c, 12),
+  make_tuple(8, 4, &vpx_highbd_sad8x4x4d_c, 12),
+  make_tuple(4, 8, &vpx_highbd_sad4x8x4d_c, 12),
+  make_tuple(4, 4, &vpx_highbd_sad4x4x4d_c, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
+
+//------------------------------------------------------------------------------
+// ARM functions
+#if HAVE_MEDIA
+const SadMxNParam media_tests[] = {
+  make_tuple(16, 16, &vpx_sad16x16_media, -1),
+};
+INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::ValuesIn(media_tests));
+#endif  // HAVE_MEDIA
+
+#if HAVE_NEON
+const SadMxNParam neon_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64_neon, -1),
+  make_tuple(32, 32, &vpx_sad32x32_neon, -1),
+  make_tuple(16, 16, &vpx_sad16x16_neon, -1),
+  make_tuple(16, 8, &vpx_sad16x8_neon, -1),
+  make_tuple(8, 16, &vpx_sad8x16_neon, -1),
+  make_tuple(8, 8, &vpx_sad8x8_neon, -1),
+  make_tuple(4, 4, &vpx_sad4x4_neon, -1),
+};
+INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
+
+const SadMxNx4Param x4d_neon_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64x4d_neon, -1),
+  make_tuple(32, 32, &vpx_sad32x32x4d_neon, -1),
+  make_tuple(16, 16, &vpx_sad16x16x4d_neon, -1),
+};
+INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
+#endif  // HAVE_NEON
+
+//------------------------------------------------------------------------------
+// x86 functions
+#if HAVE_MMX
+const SadMxNParam mmx_tests[] = {
+  make_tuple(16, 16, &vpx_sad16x16_mmx, -1),
+  make_tuple(16, 8, &vpx_sad16x8_mmx, -1),
+  make_tuple(8, 16, &vpx_sad8x16_mmx, -1),
+  make_tuple(8, 8, &vpx_sad8x8_mmx, -1),
+  make_tuple(4, 4, &vpx_sad4x4_mmx, -1),
+};
+INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
+#endif  // HAVE_MMX
+
+#if HAVE_SSE2
+#if CONFIG_USE_X86INC
+const SadMxNParam sse2_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64_sse2, -1),
+  make_tuple(64, 32, &vpx_sad64x32_sse2, -1),
+  make_tuple(32, 64, &vpx_sad32x64_sse2, -1),
+  make_tuple(32, 32, &vpx_sad32x32_sse2, -1),
+  make_tuple(32, 16, &vpx_sad32x16_sse2, -1),
+  make_tuple(16, 32, &vpx_sad16x32_sse2, -1),
+  make_tuple(16, 16, &vpx_sad16x16_sse2, -1),
+  make_tuple(16, 8, &vpx_sad16x8_sse2, -1),
+  make_tuple(8, 16, &vpx_sad8x16_sse2, -1),
+  make_tuple(8, 8, &vpx_sad8x8_sse2, -1),
+  make_tuple(8, 4, &vpx_sad8x4_sse2, -1),
+  make_tuple(4, 8, &vpx_sad4x8_sse2, -1),
+  make_tuple(4, 4, &vpx_sad4x4_sse2, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, &vpx_highbd_sad64x64_sse2, 8),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_sse2, 8),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_sse2, 8),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_sse2, 8),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_sse2, 8),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_sse2, 8),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_sse2, 8),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_sse2, 8),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_sse2, 8),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_sse2, 8),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_sse2, 8),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_sse2, 10),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_sse2, 10),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_sse2, 10),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_sse2, 10),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_sse2, 10),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_sse2, 10),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_sse2, 10),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_sse2, 10),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_sse2, 10),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_sse2, 10),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_sse2, 10),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_sse2, 12),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_sse2, 12),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_sse2, 12),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_sse2, 12),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_sse2, 12),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_sse2, 12),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_sse2, 12),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_sse2, 12),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_sse2, 12),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_sse2, 12),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_sse2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
+
+const SadMxNAvgParam avg_sse2_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64_avg_sse2, -1),
+  make_tuple(64, 32, &vpx_sad64x32_avg_sse2, -1),
+  make_tuple(32, 64, &vpx_sad32x64_avg_sse2, -1),
+  make_tuple(32, 32, &vpx_sad32x32_avg_sse2, -1),
+  make_tuple(32, 16, &vpx_sad32x16_avg_sse2, -1),
+  make_tuple(16, 32, &vpx_sad16x32_avg_sse2, -1),
+  make_tuple(16, 16, &vpx_sad16x16_avg_sse2, -1),
+  make_tuple(16, 8, &vpx_sad16x8_avg_sse2, -1),
+  make_tuple(8, 16, &vpx_sad8x16_avg_sse2, -1),
+  make_tuple(8, 8, &vpx_sad8x8_avg_sse2, -1),
+  make_tuple(8, 4, &vpx_sad8x4_avg_sse2, -1),
+  make_tuple(4, 8, &vpx_sad4x8_avg_sse2, -1),
+  make_tuple(4, 4, &vpx_sad4x4_avg_sse2, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, &vpx_highbd_sad64x64_avg_sse2, 8),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_avg_sse2, 8),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_avg_sse2, 8),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_avg_sse2, 8),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_avg_sse2, 8),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_avg_sse2, 8),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_avg_sse2, 8),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_avg_sse2, 8),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_avg_sse2, 8),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_avg_sse2, 8),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_avg_sse2, 8),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_avg_sse2, 10),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_avg_sse2, 10),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_avg_sse2, 10),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_avg_sse2, 10),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_avg_sse2, 10),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_avg_sse2, 10),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_avg_sse2, 10),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_avg_sse2, 10),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_avg_sse2, 10),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_avg_sse2, 10),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_avg_sse2, 10),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_avg_sse2, 12),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_avg_sse2, 12),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_avg_sse2, 12),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_avg_sse2, 12),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_avg_sse2, 12),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_avg_sse2, 12),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_avg_sse2, 12),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_avg_sse2, 12),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_avg_sse2, 12),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_avg_sse2, 12),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_avg_sse2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
+
+const SadMxNx4Param x4d_sse2_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64x4d_sse2, -1),
+  make_tuple(64, 32, &vpx_sad64x32x4d_sse2, -1),
+  make_tuple(32, 64, &vpx_sad32x64x4d_sse2, -1),
+  make_tuple(32, 32, &vpx_sad32x32x4d_sse2, -1),
+  make_tuple(32, 16, &vpx_sad32x16x4d_sse2, -1),
+  make_tuple(16, 32, &vpx_sad16x32x4d_sse2, -1),
+  make_tuple(16, 16, &vpx_sad16x16x4d_sse2, -1),
+  make_tuple(16, 8, &vpx_sad16x8x4d_sse2, -1),
+  make_tuple(8, 16, &vpx_sad8x16x4d_sse2, -1),
+  make_tuple(8, 8, &vpx_sad8x8x4d_sse2, -1),
+  make_tuple(8, 4, &vpx_sad8x4x4d_sse2, -1),
+  make_tuple(4, 8, &vpx_sad4x8x4d_sse2, -1),
+  make_tuple(4, 4, &vpx_sad4x4x4d_sse2, -1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(64, 64, &vpx_highbd_sad64x64x4d_sse2, 8),
+  make_tuple(64, 32, &vpx_highbd_sad64x32x4d_sse2, 8),
+  make_tuple(32, 64, &vpx_highbd_sad32x64x4d_sse2, 8),
+  make_tuple(32, 32, &vpx_highbd_sad32x32x4d_sse2, 8),
+  make_tuple(32, 16, &vpx_highbd_sad32x16x4d_sse2, 8),
+  make_tuple(16, 32, &vpx_highbd_sad16x32x4d_sse2, 8),
+  make_tuple(16, 16, &vpx_highbd_sad16x16x4d_sse2, 8),
+  make_tuple(16, 8, &vpx_highbd_sad16x8x4d_sse2, 8),
+  make_tuple(8, 16, &vpx_highbd_sad8x16x4d_sse2, 8),
+  make_tuple(8, 8, &vpx_highbd_sad8x8x4d_sse2, 8),
+  make_tuple(8, 4, &vpx_highbd_sad8x4x4d_sse2, 8),
+  make_tuple(4, 8, &vpx_highbd_sad4x8x4d_sse2, 8),
+  make_tuple(4, 4, &vpx_highbd_sad4x4x4d_sse2, 8),
+  make_tuple(64, 64, &vpx_highbd_sad64x64x4d_sse2, 10),
+  make_tuple(64, 32, &vpx_highbd_sad64x32x4d_sse2, 10),
+  make_tuple(32, 64, &vpx_highbd_sad32x64x4d_sse2, 10),
+  make_tuple(32, 32, &vpx_highbd_sad32x32x4d_sse2, 10),
+  make_tuple(32, 16, &vpx_highbd_sad32x16x4d_sse2, 10),
+  make_tuple(16, 32, &vpx_highbd_sad16x32x4d_sse2, 10),
+  make_tuple(16, 16, &vpx_highbd_sad16x16x4d_sse2, 10),
+  make_tuple(16, 8, &vpx_highbd_sad16x8x4d_sse2, 10),
+  make_tuple(8, 16, &vpx_highbd_sad8x16x4d_sse2, 10),
+  make_tuple(8, 8, &vpx_highbd_sad8x8x4d_sse2, 10),
+  make_tuple(8, 4, &vpx_highbd_sad8x4x4d_sse2, 10),
+  make_tuple(4, 8, &vpx_highbd_sad4x8x4d_sse2, 10),
+  make_tuple(4, 4, &vpx_highbd_sad4x4x4d_sse2, 10),
+  make_tuple(64, 64, &vpx_highbd_sad64x64x4d_sse2, 12),
+  make_tuple(64, 32, &vpx_highbd_sad64x32x4d_sse2, 12),
+  make_tuple(32, 64, &vpx_highbd_sad32x64x4d_sse2, 12),
+  make_tuple(32, 32, &vpx_highbd_sad32x32x4d_sse2, 12),
+  make_tuple(32, 16, &vpx_highbd_sad32x16x4d_sse2, 12),
+  make_tuple(16, 32, &vpx_highbd_sad16x32x4d_sse2, 12),
+  make_tuple(16, 16, &vpx_highbd_sad16x16x4d_sse2, 12),
+  make_tuple(16, 8, &vpx_highbd_sad16x8x4d_sse2, 12),
+  make_tuple(8, 16, &vpx_highbd_sad8x16x4d_sse2, 12),
+  make_tuple(8, 8, &vpx_highbd_sad8x8x4d_sse2, 12),
+  make_tuple(8, 4, &vpx_highbd_sad8x4x4d_sse2, 12),
+  make_tuple(4, 8, &vpx_highbd_sad4x8x4d_sse2, 12),
+  make_tuple(4, 4, &vpx_highbd_sad4x4x4d_sse2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
+#endif  // CONFIG_USE_X86INC
+#endif  // HAVE_SSE2
+
+#if HAVE_SSE3
+// The only SSE3 functions are the x3 variants, which do not have tests.
+#endif  // HAVE_SSE3
+
+#if HAVE_SSSE3
+// The only SSSE3 functions are the x3 variants, which do not have tests.
+#endif  // HAVE_SSSE3
+
+#if HAVE_SSE4_1
+// The only SSE4.1 functions are the x8 variants, which do not have tests.
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+const SadMxNParam avx2_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64_avx2, -1),
+  make_tuple(64, 32, &vpx_sad64x32_avx2, -1),
+  make_tuple(32, 64, &vpx_sad32x64_avx2, -1),
+  make_tuple(32, 32, &vpx_sad32x32_avx2, -1),
+  make_tuple(32, 16, &vpx_sad32x16_avx2, -1),
+};
+INSTANTIATE_TEST_CASE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
+
+const SadMxNAvgParam avg_avx2_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64_avg_avx2, -1),
+  make_tuple(64, 32, &vpx_sad64x32_avg_avx2, -1),
+  make_tuple(32, 64, &vpx_sad32x64_avg_avx2, -1),
+  make_tuple(32, 32, &vpx_sad32x32_avg_avx2, -1),
+  make_tuple(32, 16, &vpx_sad32x16_avg_avx2, -1),
+};
+INSTANTIATE_TEST_CASE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
+
+const SadMxNx4Param x4d_avx2_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64x4d_avx2, -1),
+  make_tuple(32, 32, &vpx_sad32x32x4d_avx2, -1),
+};
+INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
+#endif  // HAVE_AVX2
+
+//------------------------------------------------------------------------------
+// MIPS functions
+#if HAVE_MSA
+const SadMxNParam msa_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64_msa, -1),
+  make_tuple(64, 32, &vpx_sad64x32_msa, -1),
+  make_tuple(32, 64, &vpx_sad32x64_msa, -1),
+  make_tuple(32, 32, &vpx_sad32x32_msa, -1),
+  make_tuple(32, 16, &vpx_sad32x16_msa, -1),
+  make_tuple(16, 32, &vpx_sad16x32_msa, -1),
+  make_tuple(16, 16, &vpx_sad16x16_msa, -1),
+  make_tuple(16, 8, &vpx_sad16x8_msa, -1),
+  make_tuple(8, 16, &vpx_sad8x16_msa, -1),
+  make_tuple(8, 8, &vpx_sad8x8_msa, -1),
+  make_tuple(8, 4, &vpx_sad8x4_msa, -1),
+  make_tuple(4, 8, &vpx_sad4x8_msa, -1),
+  make_tuple(4, 4, &vpx_sad4x4_msa, -1),
+};
+INSTANTIATE_TEST_CASE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests));
+
+const SadMxNAvgParam avg_msa_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64_avg_msa, -1),
+  make_tuple(64, 32, &vpx_sad64x32_avg_msa, -1),
+  make_tuple(32, 64, &vpx_sad32x64_avg_msa, -1),
+  make_tuple(32, 32, &vpx_sad32x32_avg_msa, -1),
+  make_tuple(32, 16, &vpx_sad32x16_avg_msa, -1),
+  make_tuple(16, 32, &vpx_sad16x32_avg_msa, -1),
+  make_tuple(16, 16, &vpx_sad16x16_avg_msa, -1),
+  make_tuple(16, 8, &vpx_sad16x8_avg_msa, -1),
+  make_tuple(8, 16, &vpx_sad8x16_avg_msa, -1),
+  make_tuple(8, 8, &vpx_sad8x8_avg_msa, -1),
+  make_tuple(8, 4, &vpx_sad8x4_avg_msa, -1),
+  make_tuple(4, 8, &vpx_sad4x8_avg_msa, -1),
+  make_tuple(4, 4, &vpx_sad4x4_avg_msa, -1),
+};
+INSTANTIATE_TEST_CASE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests));
+
+const SadMxNx4Param x4d_msa_tests[] = {
+  make_tuple(64, 64, &vpx_sad64x64x4d_msa, -1),
+  make_tuple(64, 32, &vpx_sad64x32x4d_msa, -1),
+  make_tuple(32, 64, &vpx_sad32x64x4d_msa, -1),
+  make_tuple(32, 32, &vpx_sad32x32x4d_msa, -1),
+  make_tuple(32, 16, &vpx_sad32x16x4d_msa, -1),
+  make_tuple(16, 32, &vpx_sad16x32x4d_msa, -1),
+  make_tuple(16, 16, &vpx_sad16x16x4d_msa, -1),
+  make_tuple(16, 8, &vpx_sad16x8x4d_msa, -1),
+  make_tuple(8, 16, &vpx_sad8x16x4d_msa, -1),
+  make_tuple(8, 8, &vpx_sad8x8x4d_msa, -1),
+  make_tuple(8, 4, &vpx_sad8x4x4d_msa, -1),
+  make_tuple(4, 8, &vpx_sad4x8x4d_msa, -1),
+  make_tuple(4, 4, &vpx_sad4x4x4d_msa, -1),
+};
+INSTANTIATE_TEST_CASE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
+#endif  // HAVE_MSA
+
+}  // namespace
diff --git a/libs/libvpx/test/set_maps.sh b/libs/libvpx/test/set_maps.sh
new file mode 100755
index 0000000000..e7c8d43fa8
--- /dev/null
+++ b/libs/libvpx/test/set_maps.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx set_maps example. To add new tests to this file,
+##  do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to set_maps_tests (on a new line).
+##
+. "$(dirname "$0")/tools_common.sh"  # quoted so a path with spaces still works
+
+# Environment check: $YUV_RAW_INPUT is required, and set_maps must exist in
+# $LIBVPX_BIN_PATH.
+set_maps_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ -z "$(vpx_tool_path set_maps)" ]; then
+    elog "set_maps not found. It must exist in LIBVPX_BIN_PATH or its parent."
+    return 1
+  fi
+}
+
+# Runs the set_maps example with codec $1; fails if no output file appears.
+set_maps() {
+  local tool="$(vpx_tool_path set_maps)"
+  local codec_name="$1"
+  local ivf_out="${VPX_TEST_OUTPUT_DIR}/set_maps_${codec_name}.ivf"
+
+  eval "${VPX_TEST_PREFIX}" "${tool}" "${codec_name}" \
+      "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" \
+      "${ivf_out}" ${devnull}
+
+  [ -e "${ivf_out}" ] || return 1
+}
+
+set_maps_vp8() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    set_maps vp8 || return 1
+  fi
+}
+
+set_maps_vp9() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    set_maps vp9 || return 1
+  fi
+}
+
+set_maps_tests="set_maps_vp8
+                set_maps_vp9"
+# Pass the environment check and the test list to run_tests.
+run_tests set_maps_verify_environment "${set_maps_tests}"
diff --git a/libs/libvpx/test/set_roi.cc b/libs/libvpx/test/set_roi.cc
new file mode 100644
index 0000000000..fea8cca7a1
--- /dev/null
+++ b/libs/libvpx/test/set_roi.cc
@@ -0,0 +1,184 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+// Checks vp8_set_roimap() argument validation and the state it stores in cpi.
+TEST(VP8RoiMapTest, ParameterCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int delta_q[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
+  int delta_lf[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
+  unsigned int threshold[MAX_MB_SEGMENTS] = { 0, 100, 200, 300 };
+
+  const int internalq_trans[] = {
+    0,   1,  2,  3,  4,  5,  7,  8,
+    9,  10, 12, 13, 15, 17, 18, 19,
+    20,  21, 23, 24, 25, 26, 27, 28,
+    29,  30, 31, 33, 35, 37, 39, 41,
+    43,  45, 47, 49, 51, 53, 55, 57,
+    59,  61, 64, 67, 70, 73, 76, 79,
+    82,  85, 88, 91, 94, 97, 100, 103,
+    106, 109, 112, 115, 118, 121, 124, 127,
+  };
+
+  // Initialize elements of cpi with valid defaults.
+  VP8_COMP cpi;
+  cpi.mb.e_mbd.mb_segement_abs_delta = SEGMENT_DELTADATA;
+  cpi.cyclic_refresh_mode_enabled = 0;
+  cpi.mb.e_mbd.segmentation_enabled = 0;
+  cpi.mb.e_mbd.update_mb_segmentation_map = 0;
+  cpi.mb.e_mbd.update_mb_segmentation_data = 0;
+  cpi.common.mb_rows = 240 >> 4;
+  cpi.common.mb_cols = 320 >> 4;
+  const int mbs = (cpi.common.mb_rows * cpi.common.mb_cols);
+  memset(cpi.segment_feature_data, 0, sizeof(cpi.segment_feature_data));
+
+  // Segment map
+  cpi.segmentation_map = reinterpret_cast<unsigned char *>(vpx_calloc(mbs, 1));
+  ASSERT_TRUE(cpi.segmentation_map != NULL);
+
+  // Allocate memory for the source memory map.
+  unsigned char *roi_map =
+    reinterpret_cast<unsigned char *>(vpx_calloc(mbs, 1));
+  ASSERT_TRUE(roi_map != NULL);
+  memset(&roi_map[mbs >> 2], 1, (mbs >> 2));
+  memset(&roi_map[mbs >> 1], 2, (mbs >> 2));
+  memset(&roi_map[mbs - (mbs >> 2)], 3, (mbs >> 2));
+
+  // Do a test call with valid parameters.
+  int roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                  cpi.common.mb_cols, delta_q, delta_lf,
+                                  threshold);
+  EXPECT_EQ(0, roi_retval)
+        << "vp8_set_roimap roi failed with default test parameters";
+
+  // Check that the values in the cpi structure get set as expected.
+  if (roi_retval == 0) {
+    // Check that the segment map got set.
+    const int mapcompare = memcmp(roi_map, cpi.segmentation_map, mbs);
+    EXPECT_EQ(0, mapcompare) << "segment map error";
+
+    // Check the q deltas (note the need to translate into
+    // the internal range of 0-127); only magnitudes are compared.
+    for (int i = 0; i < MAX_MB_SEGMENTS; ++i) {
+      const int transq = internalq_trans[abs(delta_q[i])];
+      if (abs(cpi.segment_feature_data[MB_LVL_ALT_Q][i]) != transq) {
+        EXPECT_EQ(transq, cpi.segment_feature_data[MB_LVL_ALT_Q][i])
+                  << "segment delta_q  error";
+        break;
+      }
+    }
+
+    // Check the loop filter deltas
+    for (int i = 0; i < MAX_MB_SEGMENTS; ++i) {
+      if (cpi.segment_feature_data[MB_LVL_ALT_LF][i] != delta_lf[i]) {
+        EXPECT_EQ(delta_lf[i], cpi.segment_feature_data[MB_LVL_ALT_LF][i])
+                  << "segment delta_lf error";
+        break;
+      }
+    }
+
+    // Check the breakout thresholds
+    for (int i = 0; i < MAX_MB_SEGMENTS; ++i) {
+      unsigned int breakout =
+        static_cast<unsigned int>(cpi.segment_encode_breakout[i]);
+
+      if (threshold[i] != breakout) {
+        EXPECT_EQ(threshold[i], breakout)
+                  << "breakout threshold error";
+        break;
+      }
+    }
+
+    // Segmentation, and segmentation update flags should be set.
+    EXPECT_EQ(1, cpi.mb.e_mbd.segmentation_enabled)
+              << "segmentation_enabled error";
+    EXPECT_EQ(1, cpi.mb.e_mbd.update_mb_segmentation_map)
+              << "update_mb_segmentation_map error";
+    EXPECT_EQ(1, cpi.mb.e_mbd.update_mb_segmentation_data)
+              << "update_mb_segmentation_data error";
+
+    // Try a range of delta q and lf parameters (some legal, some not)
+    for (int i = 0; i < 1000; ++i) {
+      int rand_deltas[4];
+      int deltas_valid;
+      rand_deltas[0] = rnd(160) - 80;
+      rand_deltas[1] = rnd(160) - 80;
+      rand_deltas[2] = rnd(160) - 80;
+      rand_deltas[3] = rnd(160) - 80;
+
+      deltas_valid = ((abs(rand_deltas[0]) <= 63) &&
+                      (abs(rand_deltas[1]) <= 63) &&
+                      (abs(rand_deltas[2]) <= 63) &&
+                      (abs(rand_deltas[3]) <= 63)) ? 0 : -1;
+
+      // Test with random delta q values.
+      roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                  cpi.common.mb_cols, rand_deltas,
+                                  delta_lf, threshold);
+      EXPECT_EQ(deltas_valid, roi_retval) << "dq range check error";
+
+      // One delta_q error shown at a time
+      if (deltas_valid != roi_retval)
+        break;
+
+      // Test with random loop filter values.
+      roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                  cpi.common.mb_cols, delta_q,
+                                  rand_deltas, threshold);
+      EXPECT_EQ(deltas_valid, roi_retval) << "dlf range check error";
+
+      // One delta loop filter error shown at a time
+      if (deltas_valid != roi_retval)
+        break;
+    }
+
+    // Test that we report an error if cyclic refresh is enabled.
+    cpi.cyclic_refresh_mode_enabled = 1;
+    roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                cpi.common.mb_cols, delta_q,
+                                delta_lf, threshold);
+    EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error";
+    cpi.cyclic_refresh_mode_enabled = 0;
+
+    // Test invalid number of rows or columns.
+    roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1,
+                                cpi.common.mb_cols, delta_q,
+                                delta_lf, threshold);
+    EXPECT_EQ(-1, roi_retval) << "MB rows bounds check error";
+
+    roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                cpi.common.mb_cols - 1, delta_q,
+                                delta_lf, threshold);
+    EXPECT_EQ(-1, roi_retval) << "MB cols bounds check error";
+  }
+
+  // Free allocated memory (both pointers were asserted non-NULL above).
+  vpx_free(cpi.segmentation_map);
+  vpx_free(roi_map);
+}
+
+}  // namespace
diff --git a/libs/libvpx/test/simple_decoder.sh b/libs/libvpx/test/simple_decoder.sh
new file mode 100755
index 0000000000..7eeaf71b1c
--- /dev/null
+++ b/libs/libvpx/test/simple_decoder.sh
@@ -0,0 +1,61 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx simple_decoder example code. To add new tests to
+##  this file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to simple_decoder_tests (on a new line).
+##
+. "$(dirname "$0")/tools_common.sh"  # quoted so a path with spaces still works
+
+# Environment check: Make sure input is available:
+#   $VP8_IVF_FILE and $VP9_IVF_FILE are required.
+simple_decoder_verify_environment() {
+  if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_IVF_FILE}" ]; then
+    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+}
+
+# Decodes the file named by $1 with the simple_decoder example. $2 is a codec
+# label that only affects the output file name.
+simple_decoder() {
+  local tool="${LIBVPX_BIN_PATH}/simple_decoder${VPX_TEST_EXE_SUFFIX}"
+  local ivf_in="$1"
+  local label="$2"
+  local raw_out="${VPX_TEST_OUTPUT_DIR}/simple_decoder_${label}.raw"
+
+  if ! [ -x "${tool}" ]; then
+    elog "${tool} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${VPX_TEST_PREFIX}" "${tool}" "${ivf_in}" \
+      "${raw_out}" ${devnull}
+
+  [ -e "${raw_out}" ] || return 1
+}
+
+simple_decoder_vp8() {
+  if [ "$(vp8_decode_available)" = "yes" ]; then
+    simple_decoder "${VP8_IVF_FILE}" vp8 || return 1
+  fi
+}
+
+simple_decoder_vp9() {
+  if [ "$(vp9_decode_available)" = "yes" ]; then
+    simple_decoder "${VP9_IVF_FILE}" vp9 || return 1
+  fi
+}
+
+simple_decoder_tests="simple_decoder_vp8
+                      simple_decoder_vp9"
+# Pass the environment check and the test list to run_tests.
+run_tests simple_decoder_verify_environment "${simple_decoder_tests}"
diff --git a/libs/libvpx/test/simple_encoder.sh b/libs/libvpx/test/simple_encoder.sh
new file mode 100755
index 0000000000..c4a6280303
--- /dev/null
+++ b/libs/libvpx/test/simple_encoder.sh
@@ -0,0 +1,62 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx simple_encoder example. To add new tests to this
+##  file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to simple_encoder_tests (on a new line).
+##
+. "$(dirname "$0")/tools_common.sh"  # quoted so a path with spaces still works
+
+# Environment check: $YUV_RAW_INPUT is required.
+simple_encoder_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+}
+
+# Runs the simple_encoder example with codec $1; fails if no output appears.
+simple_encoder() {
+  local tool="${LIBVPX_BIN_PATH}/simple_encoder${VPX_TEST_EXE_SUFFIX}"
+  local codec_name="$1"
+  local ivf_out="${VPX_TEST_OUTPUT_DIR}/simple_encoder_${codec_name}.ivf"
+
+  if ! [ -x "${tool}" ]; then
+    elog "${tool} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${VPX_TEST_PREFIX}" "${tool}" "${codec_name}" \
+      "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" \
+      "${ivf_out}" 9999 ${devnull}
+
+  [ -e "${ivf_out}" ] || return 1
+}
+
+simple_encoder_vp8() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    simple_encoder vp8 || return 1
+  fi
+}
+
+# TODO(tomfinegan): Add a frame limit param to simple_encoder and enable this
+# test. VP9 is just too slow right now: This test takes 4m30s+ on a fast
+# machine.
+# VP9 variant; a no-op success unless vp9_encode_available prints "yes".
+DISABLED_simple_encoder_vp9() {
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  simple_encoder vp9 || return 1
+}
+
+simple_encoder_tests="simple_encoder_vp8
+                      DISABLED_simple_encoder_vp9"
+# Pass the environment check and the test list to run_tests.
+run_tests simple_encoder_verify_environment "${simple_encoder_tests}"
diff --git a/libs/libvpx/test/sixtap_predict_test.cc b/libs/libvpx/test/sixtap_predict_test.cc
new file mode 100644
index 0000000000..304a1484af
--- /dev/null
+++ b/libs/libvpx/test/sixtap_predict_test.cc
@@ -0,0 +1,233 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+typedef void (*SixtapPredictFunc)(uint8_t *src_ptr,
+                                  int src_pixels_per_line,
+                                  int xoffset,
+                                  int yoffset,
+                                  uint8_t *dst_ptr,
+                                  int dst_pitch);
+
+typedef std::tr1::tuple<int, int, SixtapPredictFunc> SixtapPredictParam;
+
+class SixtapPredictTest  // Fixture parameterized by (width, height, function).
+    : public ::testing::TestWithParam<SixtapPredictParam> {
+ public:
+  static void SetUpTestCase() {  // Allocates the buffers shared by all tests.
+    src_ = reinterpret_cast<uint8_t*>(vpx_memalign(kDataAlignment, kSrcSize));
+    dst_ = reinterpret_cast<uint8_t*>(vpx_memalign(kDataAlignment, kDstSize));
+    dst_c_ = reinterpret_cast<uint8_t*>(vpx_memalign(kDataAlignment, kDstSize));
+  }
+
+  static void TearDownTestCase() {  // Frees the shared buffers.
+    vpx_free(src_);
+    src_ = NULL;
+    vpx_free(dst_);
+    dst_ = NULL;
+    vpx_free(dst_c_);
+    dst_c_ = NULL;
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Make test arrays big enough for 16x16 functions. Six-tap filters
+  // need 5 extra pixels outside of the macroblock.
+  static const int kSrcStride = 21;  // 16 + 5 extra pixels.
+  static const int kDstStride = 16;
+  static const int kDataAlignment = 16;  // Alignment passed to vpx_memalign().
+  static const int kSrcSize = kSrcStride * kSrcStride + 1;  // +1: see src_.
+  static const int kDstSize = kDstStride * kDstStride;
+
+  virtual void SetUp() {  // Reads the test parameters and zeroes all buffers.
+    width_ = GET_PARAM(0);
+    height_ = GET_PARAM(1);
+    sixtap_predict_ = GET_PARAM(2);
+    memset(src_, 0, kSrcSize);
+    memset(dst_, 0, kDstSize);
+    memset(dst_c_, 0, kDstSize);
+  }
+
+  int width_;   // Columns of dst_ that the tests compare.
+  int height_;  // Rows of dst_ that the tests compare.
+  SixtapPredictFunc sixtap_predict_;  // Function under test.
+  // src_ holds the macroblock to filter; it is 1 byte larger than needed so
+  // that unaligned access can be tested. Results are written to dst_ and to
+  // dst_c_ (the C reference result).
+  static uint8_t* src_;
+  static uint8_t* dst_;
+  static uint8_t* dst_c_;
+};
+
+uint8_t* SixtapPredictTest::src_ = NULL;
+uint8_t* SixtapPredictTest::dst_ = NULL;
+uint8_t* SixtapPredictTest::dst_c_ = NULL;
+
+TEST_P(SixtapPredictTest, TestWithPresetData) {
+  // Test input (array of kSrcSize bytes, read with stride kSrcStride).
+  static const uint8_t test_data[kSrcSize] = {
+    216, 184, 4, 191, 82, 92, 41, 0, 1, 226, 236, 172, 20, 182, 42, 226, 177,
+    79, 94, 77, 179, 203, 206, 198, 22, 192, 19, 75, 17, 192, 44, 233, 120,
+    48, 168, 203, 141, 210, 203, 143, 180, 184, 59, 201, 110, 102, 171, 32,
+    182, 10, 109, 105, 213, 60, 47, 236, 253, 67, 55, 14, 3, 99, 247, 124,
+    148, 159, 71, 34, 114, 19, 177, 38, 203, 237, 239, 58, 83, 155, 91, 10,
+    166, 201, 115, 124, 5, 163, 104, 2, 231, 160, 16, 234, 4, 8, 103, 153,
+    167, 174, 187, 26, 193, 109, 64, 141, 90, 48, 200, 174, 204, 36, 184,
+    114, 237, 43, 238, 242, 207, 86, 245, 182, 247, 6, 161, 251, 14, 8, 148,
+    182, 182, 79, 208, 120, 188, 17, 6, 23, 65, 206, 197, 13, 242, 126, 128,
+    224, 170, 110, 211, 121, 197, 200, 47, 188, 207, 208, 184, 221, 216, 76,
+    148, 143, 156, 100, 8, 89, 117, 14, 112, 183, 221, 54, 197, 208, 180, 69,
+    176, 94, 180, 131, 215, 121, 76, 7, 54, 28, 216, 238, 249, 176, 58, 142,
+    64, 215, 242, 72, 49, 104, 87, 161, 32, 52, 216, 230, 4, 141, 44, 181,
+    235, 224, 57, 195, 89, 134, 203, 144, 162, 163, 126, 156, 84, 185, 42,
+    148, 145, 29, 221, 194, 134, 52, 100, 166, 105, 60, 140, 110, 201, 184,
+    35, 181, 153, 93, 121, 243, 227, 68, 131, 134, 232, 2, 35, 60, 187, 77,
+    209, 76, 106, 174, 15, 241, 227, 115, 151, 77, 175, 36, 187, 121, 221,
+    223, 47, 118, 61, 168, 105, 32, 237, 236, 167, 213, 238, 202, 17, 170,
+    24, 226, 247, 131, 145, 6, 116, 117, 121, 11, 194, 41, 48, 126, 162, 13,
+    93, 209, 131, 154, 122, 237, 187, 103, 217, 99, 60, 200, 45, 78, 115, 69,
+    49, 106, 200, 194, 112, 60, 56, 234, 72, 251, 19, 120, 121, 182, 134, 215,
+    135, 10, 114, 2, 247, 46, 105, 209, 145, 165, 153, 191, 243, 12, 5, 36,
+    119, 206, 231, 231, 11, 32, 209, 83, 27, 229, 204, 149, 155, 83, 109, 35,
+    93, 223, 37, 84, 14, 142, 37, 160, 52, 191, 96, 40, 204, 101, 77, 67, 52,
+    53, 43, 63, 85, 253, 147, 113, 226, 96, 6, 125, 179, 115, 161, 17, 83,
+    198, 101, 98, 85, 139, 3, 137, 75, 99, 178, 23, 201, 255, 91, 253, 52,
+    134, 60, 138, 131, 208, 251, 101, 48, 2, 227, 228, 118, 132, 245, 202,
+    75, 91, 44, 160, 231, 47, 41, 50, 147, 220, 74, 92, 219, 165, 89, 16
+  };
+
+  // Expected result for offsets (2, 2), read back with stride kDstStride.
+  static const uint8_t expected_dst[kDstSize] = {
+    117, 102, 74, 135, 42, 98, 175, 206, 70, 73, 222, 197, 50, 24, 39, 49, 38,
+    105, 90, 47, 169, 40, 171, 215, 200, 73, 109, 141, 53, 85, 177, 164, 79,
+    208, 124, 89, 212, 18, 81, 145, 151, 164, 217, 153, 91, 154, 102, 102,
+    159, 75, 164, 152, 136, 51, 213, 219, 186, 116, 193, 224, 186, 36, 231,
+    208, 84, 211, 155, 167, 35, 59, 42, 76, 216, 149, 73, 201, 78, 149, 184,
+    100, 96, 196, 189, 198, 188, 235, 195, 117, 129, 120, 129, 49, 25, 133,
+    113, 69, 221, 114, 70, 143, 99, 157, 108, 189, 140, 78, 6, 55, 65, 240,
+    255, 245, 184, 72, 90, 100, 116, 131, 39, 60, 234, 167, 33, 160, 88, 185,
+    200, 157, 159, 176, 127, 151, 138, 102, 168, 106, 170, 86, 82, 219, 189,
+    76, 33, 115, 197, 106, 96, 198, 136, 97, 141, 237, 151, 98, 137, 191,
+    185, 2, 57, 95, 142, 91, 255, 185, 97, 137, 76, 162, 94, 173, 131, 193,
+    161, 81, 106, 72, 135, 222, 234, 137, 66, 137, 106, 243, 210, 147, 95,
+    15, 137, 110, 85, 66, 16, 96, 167, 147, 150, 173, 203, 140, 118, 196,
+    84, 147, 160, 19, 95, 101, 123, 74, 132, 202, 82, 166, 12, 131, 166,
+    189, 170, 159, 85, 79, 66, 57, 152, 132, 203, 194, 0, 1, 56, 146, 180,
+    224, 156, 28, 83, 181, 79, 76, 80, 46, 160, 175, 59, 106, 43, 87, 75,
+    136, 85, 189, 46, 71, 200, 90
+  };
+
+  uint8_t *src = const_cast<uint8_t*>(test_data);  // Callee takes uint8_t*.
+  // Start 2 rows and 2 columns into the source, plus 1 extra byte.
+  ASM_REGISTER_STATE_CHECK(
+      sixtap_predict_(&src[kSrcStride * 2 + 2 + 1], kSrcStride,
+                      2, 2, dst_, kDstStride));
+
+  for (int i = 0; i < height_; ++i)  // Only the width_ x height_ area is checked.
+    for (int j = 0; j < width_; ++j)
+      ASSERT_EQ(expected_dst[i * kDstStride + j], dst_[i * kDstStride + j])
+          << "i==" << (i * width_ + j);
+}
+
+using libvpx_test::ACMRandom;
+
+TEST_P(SixtapPredictTest, TestWithRandomData) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int i = 0; i < kSrcSize; ++i)  // Fill the whole source buffer once.
+    src_[i] = rnd.Rand8();
+
+  // Run tests for all possible offsets.
+  for (int xoffset = 0; xoffset < 8; ++xoffset) {
+    for (int yoffset = 0; yoffset < 8; ++yoffset) {
+      // Call the 16x16 C reference; only width_ x height_ is compared below.
+      // Move start point to next pixel to test if the function reads
+      // unaligned data correctly.
+      vp8_sixtap_predict16x16_c(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
+                                xoffset, yoffset, dst_c_, kDstStride);
+
+      // Run test.
+      ASM_REGISTER_STATE_CHECK(
+          sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
+                          xoffset, yoffset, dst_, kDstStride));
+
+      for (int i = 0; i < height_; ++i)  // Stops at the first mismatch.
+        for (int j = 0; j < width_; ++j)
+          ASSERT_EQ(dst_c_[i * kDstStride + j], dst_[i * kDstStride + j])
+              << "i==" << (i * width_ + j);
+    }
+  }
+}
+
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    C, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, &vp8_sixtap_predict16x16_c),
+        make_tuple(8, 8, &vp8_sixtap_predict8x8_c),
+        make_tuple(8, 4, &vp8_sixtap_predict8x4_c),
+        make_tuple(4, 4, &vp8_sixtap_predict4x4_c)));
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, &vp8_sixtap_predict16x16_neon),
+        make_tuple(8, 8, &vp8_sixtap_predict8x8_neon),
+        make_tuple(8, 4, &vp8_sixtap_predict8x4_neon)));
+#endif
+#if HAVE_MMX
+INSTANTIATE_TEST_CASE_P(
+    MMX, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, &vp8_sixtap_predict16x16_mmx),
+        make_tuple(8, 8, &vp8_sixtap_predict8x8_mmx),
+        make_tuple(8, 4, &vp8_sixtap_predict8x4_mmx),
+        make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx)));
+#endif
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, &vp8_sixtap_predict16x16_sse2),
+        make_tuple(8, 8, &vp8_sixtap_predict8x8_sse2),
+        make_tuple(8, 4, &vp8_sixtap_predict8x4_sse2)));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, &vp8_sixtap_predict16x16_ssse3),
+        make_tuple(8, 8, &vp8_sixtap_predict8x8_ssse3),
+        make_tuple(8, 4, &vp8_sixtap_predict8x4_ssse3),
+        make_tuple(4, 4, &vp8_sixtap_predict4x4_ssse3)));
+#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+    MSA, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, &vp8_sixtap_predict16x16_msa),
+        make_tuple(8, 8, &vp8_sixtap_predict8x8_msa),
+        make_tuple(8, 4, &vp8_sixtap_predict8x4_msa),
+        make_tuple(4, 4, &vp8_sixtap_predict4x4_msa)));
+#endif
+}  // namespace
diff --git a/libs/libvpx/test/superframe_test.cc b/libs/libvpx/test/superframe_test.cc
new file mode 100644
index 0000000000..90aa75b41e
--- /dev/null
+++ b/libs/libvpx/test/superframe_test.cc
@@ -0,0 +1,113 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kTestMode = 0;
+const int kSuperframeSyntax = 1;
+
+typedef std::tr1::tuple<libvpx_test::TestMode,int> SuperframeTestParam;
+
+// Encoder test fixture that strips the index from each superframe packet and
+// counts the superframes seen, so a test can assert on sf_count_.
+class SuperframeTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<SuperframeTestParam> {
+ protected:
+  SuperframeTest() : EncoderTest(GET_PARAM(0)), modified_buf_(NULL),
+      last_sf_pts_(0) {}
+  virtual ~SuperframeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    const SuperframeTestParam input = GET_PARAM(1);
+    const libvpx_test::TestMode mode = std::tr1::get<kTestMode>(input);
+    const int syntax = std::tr1::get<kSuperframeSyntax>(input);
+    SetMode(mode);
+    sf_count_ = 0;
+    sf_count_max_ = INT_MAX;  // Tests lower this to allow an early abort.
+    is_vp10_style_superframe_ = syntax;
+  }
+
+  virtual void TearDown() { delete[] modified_buf_; }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+    }
+  }
+
+  // Strips the index from superframe packets; other packets pass through.
+  virtual const vpx_codec_cx_pkt_t * MutateEncoderOutputHook(
+      const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->kind != VPX_CODEC_CX_FRAME_PKT || pkt->data.frame.sz == 0)
+      return pkt;  // Nothing to inspect: an empty frame has no marker byte.
+
+    const uint8_t *buffer = static_cast<const uint8_t*>(pkt->data.frame.buf);
+    const uint8_t marker = buffer[pkt->data.frame.sz - 1];
+    const int frames = (marker & 0x7) + 1;
+    const int mag = ((marker >> 3) & 3) + 1;
+    const unsigned int index_sz =
+        2 + mag * (frames - is_vp10_style_superframe_);
+    if ((marker & 0xe0) == 0xc0 &&
+        pkt->data.frame.sz >= index_sz &&
+        buffer[pkt->data.frame.sz - index_sz] == marker) {
+      // frame is a superframe. strip off the index.
+      delete[] modified_buf_;  // Safe on NULL; releases the previous copy.
+      modified_buf_ = new uint8_t[pkt->data.frame.sz - index_sz];
+      memcpy(modified_buf_, pkt->data.frame.buf,
+             pkt->data.frame.sz - index_sz);
+      modified_pkt_ = *pkt;
+      modified_pkt_.data.frame.buf = modified_buf_;
+      modified_pkt_.data.frame.sz -= index_sz;
+
+      sf_count_++;
+      last_sf_pts_ = pkt->data.frame.pts;
+      return &modified_pkt_;
+    }
+
+    // Make sure we do a few frames after the last SF
+    abort_ |= sf_count_ > sf_count_max_ &&
+              pkt->data.frame.pts - last_sf_pts_ >= 5;
+    return pkt;
+  }
+
+  int is_vp10_style_superframe_;  // When 1, the index has one fewer size entry.
+  int sf_count_;  // Superframes stripped so far.
+  int sf_count_max_;  // An abort is requested once sf_count_ exceeds this.
+  vpx_codec_cx_pkt_t modified_pkt_;  // Packet returned for stripped frames.
+  uint8_t *modified_buf_;  // Owned payload that modified_pkt_ points at.
+  vpx_codec_pts_t last_sf_pts_;  // pts of the most recent superframe.
+};
+
+TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
+  sf_count_max_ = 0;  // early exit on successful test.
+  cfg_.g_lag_in_frames = 25;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 40);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_EQ(sf_count_, 1);  // Exactly one superframe was seen and stripped.
+}
+
+VP9_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Combine(
+    ::testing::Values(::libvpx_test::kTwoPassGood),
+    ::testing::Values(0)));
+
+VP10_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Combine(
+    ::testing::Values(::libvpx_test::kTwoPassGood),
+    ::testing::Values(CONFIG_MISC_FIXES)));
+}  // namespace
diff --git a/libs/libvpx/test/svc_test.cc b/libs/libvpx/test/svc_test.cc
new file mode 100644
index 0000000000..b955cee659
--- /dev/null
+++ b/libs/libvpx/test/svc_test.cc
@@ -0,0 +1,797 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/i420_video_source.h"
+
+#include "vp9/decoder/vp9_decoder.h"
+
+#include "vpx/svc_context.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+
+namespace {
+
+using libvpx_test::CodecFactory;
+using libvpx_test::Decoder;
+using libvpx_test::DxDataIterator;
+using libvpx_test::VP9CodecFactory;
+
+class SvcTest : public ::testing::Test {
+ protected:
+  static const uint32_t kWidth = 352;
+  static const uint32_t kHeight = 288;
+
+  SvcTest()
+      : codec_iface_(0),
+        test_file_name_("hantro_collage_w352h288.yuv"),
+        codec_initialized_(false),
+        decoder_(0) {
+    memset(&svc_, 0, sizeof(svc_));
+    memset(&codec_, 0, sizeof(codec_));
+    memset(&codec_enc_, 0, sizeof(codec_enc_));
+  }
+
+  virtual ~SvcTest() {}
+
+  virtual void SetUp() {
+    svc_.log_level = SVC_LOG_DEBUG;
+    svc_.log_print = 0;
+
+    codec_iface_ = vpx_codec_vp9_cx();
+    const vpx_codec_err_t res =
+        vpx_codec_enc_config_default(codec_iface_, &codec_enc_, 0);
+    EXPECT_EQ(VPX_CODEC_OK, res);
+
+    codec_enc_.g_w = kWidth;
+    codec_enc_.g_h = kHeight;
+    codec_enc_.g_timebase.num = 1;
+    codec_enc_.g_timebase.den = 60;
+    codec_enc_.kf_min_dist = 100;
+    codec_enc_.kf_max_dist = 100;
+
+    vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
+    VP9CodecFactory codec_factory;
+    decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
+
+    tile_columns_ = 0;
+    tile_rows_ = 0;
+  }
+
+  virtual void TearDown() {
+    ReleaseEncoder();
+    delete(decoder_);
+  }
+
+  void InitializeEncoder() {
+    const vpx_codec_err_t res =
+        vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+    EXPECT_EQ(VPX_CODEC_OK, res);
+    vpx_codec_control(&codec_, VP8E_SET_CPUUSED, 4);  // Make the test faster
+    vpx_codec_control(&codec_, VP9E_SET_TILE_COLUMNS, tile_columns_);
+    vpx_codec_control(&codec_, VP9E_SET_TILE_ROWS, tile_rows_);
+    codec_initialized_ = true;
+  }
+
+  void ReleaseEncoder() {
+    vpx_svc_release(&svc_);
+    if (codec_initialized_) vpx_codec_destroy(&codec_);
+    codec_initialized_ = false;
+  }
+
+  void GetStatsData(std::string *const stats_buf) {
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *cx_pkt;
+
+    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
+      if (cx_pkt->kind == VPX_CODEC_STATS_PKT) {
+        EXPECT_GT(cx_pkt->data.twopass_stats.sz, 0U);
+        ASSERT_TRUE(cx_pkt->data.twopass_stats.buf != NULL);
+        stats_buf->append(static_cast<char*>(cx_pkt->data.twopass_stats.buf),
+                          cx_pkt->data.twopass_stats.sz);
+      }
+    }
+  }
+
+  void Pass1EncodeNFrames(const int n, const int layers,
+                          std::string *const stats_buf) {
+    vpx_codec_err_t res;
+
+    ASSERT_GT(n, 0);
+    ASSERT_GT(layers, 0);
+    svc_.spatial_layers = layers;
+    codec_enc_.g_pass = VPX_RC_FIRST_PASS;
+    InitializeEncoder();
+
+    libvpx_test::I420VideoSource video(test_file_name_,
+                                       codec_enc_.g_w, codec_enc_.g_h,
+                                       codec_enc_.g_timebase.den,
+                                       codec_enc_.g_timebase.num, 0, 30);
+    video.Begin();
+
+    for (int i = 0; i < n; ++i) {
+      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+                           video.duration(), VPX_DL_GOOD_QUALITY);
+      ASSERT_EQ(VPX_CODEC_OK, res);
+      GetStatsData(stats_buf);
+      video.Next();
+    }
+
+    // Flush encoder and test EOS packet.
+    res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
+                         video.duration(), VPX_DL_GOOD_QUALITY);
+    ASSERT_EQ(VPX_CODEC_OK, res);
+    GetStatsData(stats_buf);
+
+    ReleaseEncoder();
+  }
+
+  void StoreFrames(const size_t max_frame_received,  // Bound on stored frames.
+                   struct vpx_fixed_buf *const outputs,
+                   size_t *const frame_received) {  // In/out: frames stored.
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *cx_pkt;
+    // Drain the encoder; each frame packet is copied into a malloc'd buffer.
+    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
+      if (cx_pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+        const size_t frame_size = cx_pkt->data.frame.sz;
+
+        EXPECT_GT(frame_size, 0U);
+        ASSERT_TRUE(cx_pkt->data.frame.buf != NULL);
+        ASSERT_LT(*frame_received, max_frame_received);
+
+        if (*frame_received == 0)  // The first stored frame must be a keyframe.
+          EXPECT_EQ(1, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY));
+
+        // NOTE(review): 16 spare bytes are allocated; reason not stated here.
+        outputs[*frame_received].buf = malloc(frame_size + 16);
+        ASSERT_TRUE(outputs[*frame_received].buf != NULL);
+        memcpy(outputs[*frame_received].buf, cx_pkt->data.frame.buf,
+               frame_size);
+        outputs[*frame_received].sz = frame_size;  // Excludes the spare bytes.
+        ++(*frame_received);
+      }
+    }
+  }
+
+  void Pass2EncodeNFrames(std::string *const stats_buf,
+                          const int n, const int layers,
+                          struct vpx_fixed_buf *const outputs) {
+    vpx_codec_err_t res;
+    size_t frame_received = 0;
+    // Encode |n| frames in |layers| spatial layers; outputs get malloc'd data.
+    ASSERT_TRUE(outputs != NULL);
+    ASSERT_GT(n, 0);
+    ASSERT_GT(layers, 0);
+    svc_.spatial_layers = layers;
+    codec_enc_.rc_target_bitrate = 500;
+    if (codec_enc_.g_pass == VPX_RC_LAST_PASS) {  // Stats needed only here.
+      ASSERT_TRUE(stats_buf != NULL);
+      ASSERT_GT(stats_buf->size(), 0U);
+      codec_enc_.rc_twopass_stats_in.buf = &(*stats_buf)[0];
+      codec_enc_.rc_twopass_stats_in.sz = stats_buf->size();
+    }
+    InitializeEncoder();
+
+    libvpx_test::I420VideoSource video(test_file_name_,
+                                       codec_enc_.g_w, codec_enc_.g_h,
+                                       codec_enc_.g_timebase.den,
+                                       codec_enc_.g_timebase.num, 0, 30);
+    video.Begin();
+
+    for (int i = 0; i < n; ++i) {
+      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+                           video.duration(), VPX_DL_GOOD_QUALITY);
+      ASSERT_EQ(VPX_CODEC_OK, res);
+      StoreFrames(n, outputs, &frame_received);  // At most |n| frames stored.
+      video.Next();
+    }
+
+    // Flush encoder.
+    res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+                         video.duration(), VPX_DL_GOOD_QUALITY);
+    EXPECT_EQ(VPX_CODEC_OK, res);
+    StoreFrames(n, outputs, &frame_received);  // Collect frames from the flush.
+
+    EXPECT_EQ(frame_received, static_cast<size_t>(n));
+
+    ReleaseEncoder();
+  }
+
+  void DecodeNFrames(const struct vpx_fixed_buf *const inputs, const int n) {
+    int decoded_frames = 0;   // Inputs passed to the decoder without error.
+    int received_frames = 0;  // Output frames returned by the decoder.
+    // All inputs must decode, yielding |n| output frames in total.
+    ASSERT_TRUE(inputs != NULL);
+    ASSERT_GT(n, 0);
+
+    for (int i = 0; i < n; ++i) {
+      ASSERT_TRUE(inputs[i].buf != NULL);
+      ASSERT_GT(inputs[i].sz, 0U);
+      const vpx_codec_err_t res_dec =
+          decoder_->DecodeFrame(static_cast<const uint8_t *>(inputs[i].buf),
+                                inputs[i].sz);
+      ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+      ++decoded_frames;
+
+      DxDataIterator dec_iter = decoder_->GetDxData();
+      while (dec_iter.Next() != NULL) {  // Count every frame produced.
+        ++received_frames;
+      }
+    }
+    EXPECT_EQ(decoded_frames, n);
+    EXPECT_EQ(received_frames, n);
+  }
+
+  void DropEnhancementLayers(struct vpx_fixed_buf *const inputs,
+                             const int num_super_frames,
+                             const int remained_spatial_layers) {
+    ASSERT_TRUE(inputs != NULL);
+    ASSERT_GT(num_super_frames, 0);
+    ASSERT_GT(remained_spatial_layers, 0);
+    // In place: cut each input after its |remained_spatial_layers|-th visible frame.
+    for (int i = 0; i < num_super_frames; ++i) {
+      uint32_t frame_sizes[8] = {0};
+      int frame_count = 0;
+      int frames_found = 0;  // Visible frames seen so far in this input.
+      int frame;
+      ASSERT_TRUE(inputs[i].buf != NULL);
+      ASSERT_GT(inputs[i].sz, 0U);
+
+      vpx_codec_err_t res =
+          vp9_parse_superframe_index(static_cast<const uint8_t*>(inputs[i].buf),
+                                     inputs[i].sz, frame_sizes, &frame_count,
+                                     NULL, NULL);
+      ASSERT_EQ(VPX_CODEC_OK, res);
+
+      if (frame_count == 0) {
+        // There's no super frame but only a single frame.
+        ASSERT_EQ(1, remained_spatial_layers);
+      } else {
+        // Found a super frame.
+        uint8_t *frame_data = static_cast<uint8_t*>(inputs[i].buf);
+        uint8_t *frame_start = frame_data;
+        for (frame = 0; frame < frame_count; ++frame) {
+          // Looking for a visible frame.
+          if (frame_data[0] & 0x02) {
+            ++frames_found;
+            if (frames_found == remained_spatial_layers)
+              break;
+          }
+          frame_data += frame_sizes[frame];
+        }
+        ASSERT_LT(frame, frame_count) << "Couldn't find a visible frame. "
+            << "remained_spatial_layers: " << remained_spatial_layers
+            << "    super_frame: " << i;
+        if (frame == frame_count - 1)  // Last frame already: nothing to drop.
+          continue;
+
+        frame_data += frame_sizes[frame];  // Now just past the last kept frame.
+
+        // We need to add one more frame for multiple frame contexts.
+        uint8_t marker =
+            static_cast<const uint8_t*>(inputs[i].buf)[inputs[i].sz - 1];
+        const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+        const size_t index_sz = 2 + mag * frame_count;  // Old index length.
+        const size_t new_index_sz = 2 + mag * (frame + 1);  // frame + 1 kept.
+        marker &= 0x0f8;  // Clear the low 3 bits.
+        marker |= frame;  // Low 3 bits now hold (kept frame count - 1).
+
+        // Copy existing frame sizes.
+        memmove(frame_data + 1, frame_start + inputs[i].sz - index_sz + 1,
+                new_index_sz - 2);
+        // New marker.
+        frame_data[0] = marker;
+        frame_data += (mag * (frame + 1) + 1);  // Skip marker and kept sizes.
+
+        *frame_data++ = marker;  // The index ends with the marker as well.
+        inputs[i].sz = frame_data - frame_start;
+      }
+    }
+  }
+
+  void FreeBitstreamBuffers(struct vpx_fixed_buf *const inputs, const int n) {
+    ASSERT_TRUE(inputs != NULL);
+    ASSERT_GT(n, 0);
+
+    for (int i = 0; i < n; ++i) {
+      free(inputs[i].buf);
+      inputs[i].buf = NULL;
+      inputs[i].sz = 0;
+    }
+  }
+
+  SvcContext svc_;
+  vpx_codec_ctx_t codec_;
+  struct vpx_codec_enc_cfg codec_enc_;
+  vpx_codec_iface_t *codec_iface_;
+  std::string test_file_name_;
+  bool codec_initialized_;
+  Decoder *decoder_;
+  int tile_columns_;
+  int tile_rows_;
+};
+
+TEST_F(SvcTest, SvcInit) {
+  // test missing parameters
+  vpx_codec_err_t res = vpx_svc_init(NULL, &codec_, codec_iface_, &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+  res = vpx_svc_init(&svc_, NULL, codec_iface_, &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+  res = vpx_svc_init(&svc_, &codec_, NULL, &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_init(&svc_, &codec_, codec_iface_, NULL);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  svc_.spatial_layers = 6;  // too many layers
+  res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  svc_.spatial_layers = 0;  // use default layers
+  InitializeEncoder();
+  EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
+}
+
+TEST_F(SvcTest, InitTwoLayers) {
+  svc_.spatial_layers = 2;
+  InitializeEncoder();
+}
+
+TEST_F(SvcTest, InvalidOptions) {
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_, NULL);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "not-an-option=1");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+}
+
+TEST_F(SvcTest, SetLayersOption) {
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "spatial-layers=3");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  InitializeEncoder();
+  EXPECT_EQ(3, svc_.spatial_layers);
+}
+
+TEST_F(SvcTest, SetMultipleOptions) {
+  vpx_codec_err_t res =
+      vpx_svc_set_options(&svc_, "spatial-layers=2 scale-factors=1/3,2/3");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  InitializeEncoder();
+  EXPECT_EQ(2, svc_.spatial_layers);
+}
+
+TEST_F(SvcTest, SetScaleFactorsOption) {
+  svc_.spatial_layers = 2;
+  vpx_codec_err_t res =
+      vpx_svc_set_options(&svc_, "scale-factors=not-scale-factors");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "scale-factors=1/3, 3*3");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "scale-factors=1/3");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  InitializeEncoder();
+}
+
+TEST_F(SvcTest, SetQuantizersOption) {  // Bad quantizer lists must fail init.
+  svc_.spatial_layers = 2;
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "max-quantizers=nothing");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "min-quantizers=nothing");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "max-quantizers=40");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+  res = vpx_svc_set_options(&svc_, "min-quantizers=40");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "max-quantizers=30,30 min-quantizers=40,40");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "max-quantizers=40,40 min-quantizers=30,30");
+  EXPECT_EQ(VPX_CODEC_OK, res);  // Previously stored but never checked.
+  InitializeEncoder();  // Expects vpx_svc_init() to succeed this time.
+}
+
+TEST_F(SvcTest, SetAutoAltRefOption) {  // Bad auto-alt-refs must fail init.
+  svc_.spatial_layers = 5;
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "auto-alt-refs=none");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+  res = vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1,1,0");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
+  EXPECT_EQ(VPX_CODEC_OK, res);  // Previously the result was discarded.
+  InitializeEncoder();  // Expects vpx_svc_init() to succeed this time.
+}
+
+// Test that decoder can handle an SVC frame as the first frame in a sequence.
+TEST_F(SvcTest, OnePassEncodeOneFrame) {
+  codec_enc_.g_pass = VPX_RC_ONE_PASS;
+  vpx_fixed_buf output = {0};
+  Pass2EncodeNFrames(NULL, 1, 2, &output);
+  DecodeNFrames(&output, 1);
+  FreeBitstreamBuffers(&output, 1);
+}
+
+TEST_F(SvcTest, OnePassEncodeThreeFrames) {
+  codec_enc_.g_pass = VPX_RC_ONE_PASS;
+  codec_enc_.g_lag_in_frames = 0;
+  vpx_fixed_buf outputs[3];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(NULL, 3, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 3);
+  FreeBitstreamBuffers(&outputs[0], 3);
+}
+
+TEST_F(SvcTest, TwoPassEncode10Frames) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode20FramesWithAltRef) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(20, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
+  vpx_fixed_buf outputs[20];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 20);
+  FreeBitstreamBuffers(&outputs[0], 20);
+}
+
+TEST_F(SvcTest, TwoPassEncode2SpatialLayersDecodeBaseLayerOnly) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DropEnhancementLayers(&outputs[0], 10, 1);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode5SpatialLayersDecode54321Layers) {
+  // First pass; first_pass_stats is handed to both encode helpers.
+  std::string first_pass_stats;
+  Pass1EncodeNFrames(10, 5, &first_pass_stats);
+
+  // Second pass with a per-layer auto-alt-refs list of 0,1,1,1,0.
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
+  vpx_fixed_buf encoded[10];
+  memset(encoded, 0, sizeof(encoded));
+  Pass2EncodeNFrames(&first_pass_stats, 10, 5, encoded);
+
+  // Decode the buffers as encoded, then call DropEnhancementLayers with
+  // 4, 3, 2 and 1 in turn, decoding again after every call. The helper is
+  // defined outside this hunk; its last argument is presumably the number
+  // of spatial layers left in each buffer (TODO confirm).
+  DecodeNFrames(encoded, 10);
+  for (int layers = 4; layers >= 1; --layers) {
+    DropEnhancementLayers(encoded, 10, layers);
+    DecodeNFrames(encoded, 10);
+  }
+
+  FreeBitstreamBuffers(encoded, 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2SNRLayers) {
+  // First pass with two 1/1 scale factors.
+  std::string first_pass_stats;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
+  Pass1EncodeNFrames(20, 2, &first_pass_stats);
+
+  // Second pass: the scale factors are supplied again, together with
+  // auto-alt-refs=1,1, before the 20 buffers are encoded and decoded.
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1");
+  vpx_fixed_buf encoded[20];
+  memset(encoded, 0, sizeof(encoded));
+  Pass2EncodeNFrames(&first_pass_stats, 20, 2, encoded);
+  DecodeNFrames(encoded, 20);
+  FreeBitstreamBuffers(encoded, 20);
+}
+
+TEST_F(SvcTest, TwoPassEncode3SNRLayersDecode321Layers) {
+  // First pass with three 1/1 scale factors.
+  std::string first_pass_stats;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
+  Pass1EncodeNFrames(20, 3, &first_pass_stats);
+
+  // Second pass: same scale factors plus auto-alt-refs=1,1,1.
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1");
+  vpx_fixed_buf encoded[20];
+  memset(encoded, 0, sizeof(encoded));
+  Pass2EncodeNFrames(&first_pass_stats, 20, 3, encoded);
+  // Decode as encoded, then after DropEnhancementLayers with 2 and with 1.
+  DecodeNFrames(encoded, 20);
+  for (int layers = 2; layers >= 1; --layers) {
+    DropEnhancementLayers(encoded, 20, layers);
+    DecodeNFrames(encoded, 20);
+  }
+
+  FreeBitstreamBuffers(encoded, 20);
+}
+
+TEST_F(SvcTest, SetMultipleFrameContextsOption) {
+  svc_.spatial_layers = 5;  // vpx_svc_init() must reject this configuration
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+  // Retry with 2 spatial layers: the option string must be accepted again.
+  svc_.spatial_layers = 2;
+  res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  InitializeEncoder();
+}
+
+TEST_F(SvcTest, TwoPassEncode2SpatialLayersWithMultipleFrameContexts) {
+  // First pass; first_pass_stats is handed to both encode helpers.
+  std::string first_pass_stats;
+  Pass1EncodeNFrames(10, 2, &first_pass_stats);
+
+  // Second pass: g_error_resilient is cleared and multi-frame-contexts=1 set.
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
+  vpx_fixed_buf encoded[10];
+  memset(encoded, 0, sizeof(encoded));
+  Pass2EncodeNFrames(&first_pass_stats, 10, 2, encoded);
+  DecodeNFrames(encoded, 10);
+  FreeBitstreamBuffers(encoded, 10);
+}
+
+TEST_F(SvcTest,
+       TwoPassEncode2SpatialLayersWithMultipleFrameContextsDecodeBaselayer) {
+  // First pass; first_pass_stats is handed to both encode helpers.
+  std::string first_pass_stats;
+  Pass1EncodeNFrames(10, 2, &first_pass_stats);
+
+  // Second pass with multi-frame-contexts=1; layers are dropped before decode.
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
+  vpx_fixed_buf encoded[10];
+  memset(encoded, 0, sizeof(encoded));
+  Pass2EncodeNFrames(&first_pass_stats, 10, 2, encoded);
+  DropEnhancementLayers(encoded, 10, 1);
+  DecodeNFrames(encoded, 10);
+  FreeBitstreamBuffers(encoded, 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2SNRLayersWithMultipleFrameContexts) {
+  // First pass with two 1/1 scale factors.
+  std::string first_pass_stats;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
+  Pass1EncodeNFrames(10, 2, &first_pass_stats);
+
+  // Second pass: error resilience off, scale factors repeated, MFC enabled.
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf encoded[10];
+  memset(encoded, 0, sizeof(encoded));
+  Pass2EncodeNFrames(&first_pass_stats, 10, 2, encoded);
+  DecodeNFrames(encoded, 10);
+  FreeBitstreamBuffers(encoded, 10);
+}
+
+TEST_F(SvcTest,
+       TwoPassEncode3SNRLayersWithMultipleFrameContextsDecode321Layer) {
+  // First pass with three 1/1 scale factors.
+  std::string first_pass_stats;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
+  Pass1EncodeNFrames(10, 3, &first_pass_stats);
+
+  // Second pass: error resilience off, scale factors repeated, MFC enabled.
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf encoded[10];
+  memset(encoded, 0, sizeof(encoded));
+  Pass2EncodeNFrames(&first_pass_stats, 10, 3, encoded);
+
+  DecodeNFrames(encoded, 10);
+  for (int layers = 2; layers >= 1; --layers) {
+    DropEnhancementLayers(encoded, 10, layers);
+    DecodeNFrames(encoded, 10);
+  }
+
+  FreeBitstreamBuffers(encoded, 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayers) {
+  // First pass: a single 1/1 scale factor and temporal_layers = 2.
+  std::string first_pass_stats;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &first_pass_stats);
+
+  // Second pass: temporal_layers and the options are set again before encode.
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+  vpx_fixed_buf encoded[10];
+  memset(encoded, 0, sizeof(encoded));
+  Pass2EncodeNFrames(&first_pass_stats, 10, 1, encoded);
+  DecodeNFrames(encoded, 10);
+  FreeBitstreamBuffers(encoded, 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContexts) {
+  // First pass: a single 1/1 scale factor and temporal_layers = 2.
+  std::string first_pass_stats;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &first_pass_stats);
+
+  // Second pass: error resilience off and multi-frame-contexts=1 added.
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf encoded[10];
+  memset(encoded, 0, sizeof(encoded));
+  Pass2EncodeNFrames(&first_pass_stats, 10, 1, encoded);
+  DecodeNFrames(encoded, 10);
+  FreeBitstreamBuffers(encoded, 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayersDecodeBaseLayer) {
+  // First pass: a single 1/1 scale factor and temporal_layers = 2.
+  std::string first_pass_stats;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &first_pass_stats);
+
+  // Second pass: temporal_layers and the options are set again before encode.
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+  vpx_fixed_buf encoded[10];
+  memset(encoded, 0, sizeof(encoded));
+  Pass2EncodeNFrames(&first_pass_stats, 10, 1, encoded);
+  // Only the even-indexed buffers (0, 2, 4, 6, 8) are handed to the decoder.
+  vpx_fixed_buf even_frames[5];
+  for (int src = 0; src < 10; src += 2)
+    even_frames[src / 2] = encoded[src];
+
+  DecodeNFrames(even_frames, 5);
+  FreeBitstreamBuffers(encoded, 10);
+}
+
+TEST_F(SvcTest,
+       TwoPassEncode2TemporalLayersWithMultipleFrameContextsDecodeBaseLayer) {
+  // First pass: a single 1/1 scale factor and temporal_layers = 2.
+  std::string first_pass_stats;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &first_pass_stats);
+
+  // Second pass: error resilience off and multi-frame-contexts=1 added.
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  codec_enc_.g_error_resilient = 0;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf encoded[10];
+  memset(encoded, 0, sizeof(encoded));
+  Pass2EncodeNFrames(&first_pass_stats, 10, 1, encoded);
+  // Only the even-indexed buffers (0, 2, 4, 6, 8) are handed to the decoder.
+  vpx_fixed_buf even_frames[5];
+  for (int src = 0; src < 10; src += 2)
+    even_frames[src / 2] = encoded[src];
+
+  DecodeNFrames(even_frames, 5);
+  FreeBitstreamBuffers(encoded, 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithTiles) {
+  // First pass: a single 1/1 scale factor and temporal_layers = 2.
+  std::string first_pass_stats;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &first_pass_stats);
+
+  // Second pass at 704x144 with tile_columns_ and tile_rows_ both set to 1.
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+  codec_enc_.g_w = 704;
+  codec_enc_.g_h = 144;
+  tile_columns_ = 1;
+  tile_rows_ = 1;
+  vpx_fixed_buf encoded[10];
+  memset(encoded, 0, sizeof(encoded));
+  Pass2EncodeNFrames(&first_pass_stats, 10, 1, encoded);
+  DecodeNFrames(encoded, 10);
+  FreeBitstreamBuffers(encoded, 10);
+}
+
+TEST_F(SvcTest,
+       TwoPassEncode2TemporalLayersWithMultipleFrameContextsAndTiles) {
+  // First pass: a single 1/1 scale factor and temporal_layers = 2.
+  std::string first_pass_stats;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1");
+  svc_.temporal_layers = 2;
+  Pass1EncodeNFrames(10, 1, &first_pass_stats);
+
+  // Second pass at 704x144, tiles set to 1x1, error resilience off, MFC on.
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  svc_.temporal_layers = 2;
+  codec_enc_.g_error_resilient = 0;
+  codec_enc_.g_w = 704;
+  codec_enc_.g_h = 144;
+  tile_columns_ = 1;
+  tile_rows_ = 1;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+                      "multi-frame-contexts=1");
+  vpx_fixed_buf encoded[10];
+  memset(encoded, 0, sizeof(encoded));
+  Pass2EncodeNFrames(&first_pass_stats, 10, 1, encoded);
+  DecodeNFrames(encoded, 10);
+  FreeBitstreamBuffers(encoded, 10);
+}
+
+}  // namespace
diff --git a/libs/libvpx/test/test-data.mk b/libs/libvpx/test/test-data.mk
new file mode 100644
index 0000000000..05a0885ed2
--- /dev/null
+++ b/libs/libvpx/test/test-data.mk
@@ -0,0 +1,863 @@
+LIBVPX_TEST_SRCS-yes += test-data.mk
+# Each += below appends to the list whose name ends in the CONFIG_* value.
+# Encoder test sources (.yuv and .y4m input clips)
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
+
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_440.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_440.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420_a10-1.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_440.yuv
+
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_credits.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
+
+# Decoder test vectors: bitstreams (.ivf, .webm) and their .md5 files
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-130x132.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-130x132.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x130.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x130.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x132.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x132.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-178x180.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-178x180.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x178.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x178.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x180.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x180.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-352x288.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-352x288.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-352x287.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_1.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-13-largescaling.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-1-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-16-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-2-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-4-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-16.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-16.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-02.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-02.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv422.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv422.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv440.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv440.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-01.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-02.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-02.webm.md5
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)  # 10-/12-bit vectors below are listed only for high-bitdepth builds
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-10bit-yuv420.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-10bit-yuv420.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-12bit-yuv420.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp92-2-20-12bit-yuv420.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv422.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv422.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv422.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv422.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv440.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv440.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv440.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv440.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-10bit-yuv444.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5
+endif  # CONFIG_VP9_HIGHBITDEPTH
+
+# Malformed streams (most with a companion .res file) for the libvpx error-checking tests.
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm \
+    invalid-vp90-01-v3.webm.res \
+    invalid-vp90-02-v2.webm \
+    invalid-vp90-02-v2.webm.res \
+    invalid-vp90-03-v3.webm \
+    invalid-vp90-03-v3.webm.res \
+    invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf \
+    invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res \
+    invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf \
+    invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res \
+    invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf \
+    invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res \
+    invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf \
+    invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf.res \
+    invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf \
+    invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf.res \
+    invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf \
+    invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res \
+    invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf \
+    invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res \
+    invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf \
+    invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf.res \
+    invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm \
+    invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res \
+    invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf \
+    invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res \
+    invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf \
+    invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res \
+    invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf \
+    invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res \
+    invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf \
+    invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res \
+    invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf \
+    invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res \
+    invalid-vp91-2-mixedrefcsp-444to420.ivf \
+    invalid-vp91-2-mixedrefcsp-444to420.ivf.res \
+    invalid-vp90-2-07-frame_parallel-1.webm \
+    invalid-vp90-2-07-frame_parallel-2.webm \
+    invalid-vp90-2-07-frame_parallel-3.webm
+
+ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
+  # Source clip for the encode/decode test; keyed on CONFIG_VP9_ENCODER, not the decoder.
+  LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv
+  # BBB VP9 streams
+  LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-bbb_426x240_tile_1x1_180kbps.webm \
+      vp90-2-bbb_640x360_tile_1x2_337kbps.webm \
+      vp90-2-bbb_854x480_tile_1x2_651kbps.webm \
+      vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm \
+      vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm \
+      vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm \
+      vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm
+  # Sintel VP9 streams
+  LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-sintel_426x182_tile_1x1_171kbps.webm \
+      vp90-2-sintel_640x272_tile_1x2_318kbps.webm \
+      vp90-2-sintel_854x364_tile_1x2_621kbps.webm \
+      vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm \
+      vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm
+  # TOS VP9 streams
+  LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-tos_426x178_tile_1x1_181kbps.webm \
+      vp90-2-tos_640x266_tile_1x2_336kbps.webm \
+      vp90-2-tos_854x356_tile_1x2_656kbps.webm \
+      vp90-2-tos_854x356_tile_1x2_fpm_546kbps.webm \
+      vp90-2-tos_1280x534_tile_1x4_1306kbps.webm \
+      vp90-2-tos_1280x534_tile_1x4_fpm_952kbps.webm \
+      vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm
+endif  # CONFIG_DECODE_PERF_TESTS
+
+ifeq ($(CONFIG_ENCODE_PERF_TESTS),yes)
+  LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_640_360_30.yuv \
+      kirland_640_480_30.yuv \
+      macmarcomoving_640_480_30.yuv \
+      macmarcostationary_640_480_30.yuv \
+      niklas_1280_720_30.yuv \
+      niklas_640_480_30.yuv \
+      tacomanarrows_640_480_30.yuv \
+      tacomasmallcameramovement_640_480_30.yuv \
+      thaloundeskmtg_640_480_30.yuv
+endif  # CONFIG_ENCODE_PERF_TESTS
+
+# VP9 dynamic resizing test (decoder)
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5
+
+# Sort and remove duplicates. Must stay the last statement so every append above is covered.
+LIBVPX_TEST_DATA-yes := $(sort $(LIBVPX_TEST_DATA-yes))
diff --git a/libs/libvpx/test/test-data.sha1 b/libs/libvpx/test/test-data.sha1
new file mode 100644
index 0000000000..a4ed1742fc
--- /dev/null
+++ b/libs/libvpx/test/test-data.sha1
@@ -0,0 +1,836 @@
+d5dfb0151c9051f8c85999255645d7a23916d3c0 *hantro_collage_w352h288.yuv
+b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
+76024eb753cdac6a5e5703aaea189d35c3c30ac7 *invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf
+7448d8798a4380162d4b56f9b452e2f6f9e24e7a *invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res
+83f50908c8dc0ef8760595447a2ff7727489542e *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
+456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
+c123d1f9f02fb4143abb5e271916e3a3080de8f6 *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
+456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
+fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v3.webm
+5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-01-v3.webm.res
+d78e2fceba5ac942246503ec8366f879c4775ca5 *invalid-vp90-02-v2.webm
+8e2eff4af87d2b561cce2365713269e301457ef3 *invalid-vp90-02-v2.webm.res
+df1a1453feb3c00d7d89746c7003b4163523bff3 *invalid-vp90-03-v3.webm
+4935c62becc68c13642a03db1e6d3e2331c1c612 *invalid-vp90-03-v3.webm.res
+d637297561dd904eb2c97a9015deeb31c4a1e8d2 *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
+3a204bdbeaa3c6458b77bcebb8366d107267f55d *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
+a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m
+0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m
+ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m
+c934da6fb8cc54ee2a8c17c54cf6076dac37ead0 *park_joy_90p_10_440.yuv
+614c32ae1eca391e867c70d19974f0d62664dd99 *park_joy_90p_12_420.y4m
+c92825f1ea25c5c37855083a69faac6ac4641a9e *park_joy_90p_12_422.y4m
+b592189b885b6cc85db55cc98512a197d73d3b34 *park_joy_90p_12_444.y4m
+82c1bfcca368c2f22bad7d693d690d5499ecdd11 *park_joy_90p_12_440.yuv
+b9e1e90aece2be6e2c90d89e6ab2372d5f8c792d *park_joy_90p_8_420_a10-1.y4m
+4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c *park_joy_90p_8_420.y4m
+7a193ff7dfeb96ba5f82b2afd7afa9e1fe83d947 *park_joy_90p_8_422.y4m
+bdb7856e6bc93599bdda05c2e773a9f22b6c6d03 *park_joy_90p_8_444.y4m
+81e1f3843748438b8f2e71db484eb22daf72e939 *park_joy_90p_8_440.yuv
+b1f1c3ec79114b9a0651af24ce634afb44a9a419 *rush_hour_444.y4m
+5184c46ddca8b1fadd16742e8500115bc8f749da *vp80-00-comprehensive-001.ivf
+65bf1bbbced81b97bd030f376d1b7f61a224793f *vp80-00-comprehensive-002.ivf
+906b4c1e99eb734504c504b3f1ad8052137ce672 *vp80-00-comprehensive-003.ivf
+ec144b1af53af895db78355785650b96dd3f0ade *vp80-00-comprehensive-004.ivf
+afc7091785c62f1c121c4554a2830c30704587d9 *vp80-00-comprehensive-005.ivf
+42ea9d55c818145d06a9b633b8e85c6a6164fd3e *vp80-00-comprehensive-006.ivf
+e5b3a73ab79fe024c14309d653d6bed92902ee3b *vp80-00-comprehensive-007.ivf
+f3c50a58875930adfb84525c0ef59d7e4c08540c *vp80-00-comprehensive-008.ivf
+4b2841fdb83db51ae322096ae468bbb9dc2c8362 *vp80-00-comprehensive-009.ivf
+efbff736e3a91ab6a98c5bc2dce65d645944c7b1 *vp80-00-comprehensive-010.ivf
+6b315102cae008d22a3d2c231be92cb704a222f8 *vp80-00-comprehensive-011.ivf
+f3214a4fea14c2d5ec689936c1613f274c859ee8 *vp80-00-comprehensive-012.ivf
+e4094e96d308c8a35b74c480a43d853c5294cd34 *vp80-00-comprehensive-013.ivf
+5b0adfaf60a69e0aaf3ec021a39d0a68fc0e1b5a *vp80-00-comprehensive-014.ivf
+e8467688ddf26b5000664f904faf0d70506aa653 *vp80-00-comprehensive-015.ivf
+aab55582337dfd2a39ff54fb2576a91910d49337 *vp80-00-comprehensive-016.ivf
+1ba24724f80203c9bae4f1d0f99d534721980016 *vp80-00-comprehensive-017.ivf
+143a15512b46f436280ddb4d0e6411eb4af434f2 *vp80-00-comprehensive-018.ivf
+c5baeaf5714fdfb3a8bc960a8e33ac438e83b16b *vp80-01-intra-1400.ivf
+f383955229afe3408453e316d11553d923ca60d5 *vp80-01-intra-1411.ivf
+84e1f4343f174c9f3c83f834bac3196fb325bf2c *vp80-01-intra-1416.ivf
+fb6e712a47dd57a28a3727d2ae2c97a8b7c7ca51 *vp80-01-intra-1417.ivf
+71ea772d3e9d315b8cbecf41207b8a237c34853b *vp80-02-inter-1402.ivf
+d85dbc4271525dcd128c503f936fe69091d1f8d0 *vp80-02-inter-1412.ivf
+d4e5d3ad56511867d025f93724d090f92ba6ec3d *vp80-02-inter-1418.ivf
+91791cbcc37c60f35dbd8090bacb54e5ec6dd4fa *vp80-02-inter-1424.ivf
+17fbfe2fea70f6e2f3fa6ca4efaae6c0b03b5f02 *vp80-03-segmentation-01.ivf
+3c3600dbbcde08e20d54c66fe3b7eadd4f09bdbb *vp80-03-segmentation-02.ivf
+c156778d5340967d4b369c490848076e92f1f875 *vp80-03-segmentation-03.ivf
+d25dcff6c60e87a1af70945b8911b6b4998533b0 *vp80-03-segmentation-04.ivf
+362baba2ce454c9db21218f35e81c27a5ed0b730 *vp80-03-segmentation-1401.ivf
+d223ae7ee748ce07e74c4679bfd219e84aa9f4b0 *vp80-03-segmentation-1403.ivf
+033adf7f3a13836a3f1cffcb87c1972900f2b5c6 *vp80-03-segmentation-1407.ivf
+4d51dfbf9f3e2c590ec99d1d6f59dd731d04375f *vp80-03-segmentation-1408.ivf
+f37a62b197c2600d75e0ccfbb31b60efdedac251 *vp80-03-segmentation-1409.ivf
+eb25bd7bfba5b2f6935018a930f42d123b1e7fcd *vp80-03-segmentation-1410.ivf
+b9d5c436663a30c27cfff84b53a002e501258843 *vp80-03-segmentation-1413.ivf
+6da92b9d1a180cc3a8afe348ab12258f5a37be1a *vp80-03-segmentation-1414.ivf
+a4f5842602886bd669f115f93d8a35c035cb0948 *vp80-03-segmentation-1415.ivf
+f295dceb8ef278b77251b3f9df8aee22e161d547 *vp80-03-segmentation-1425.ivf
+198dbf9f36f733200e432664cc8c5752d59779de *vp80-03-segmentation-1426.ivf
+7704804e32f5de976803929934a7fafe101ac7b0 *vp80-03-segmentation-1427.ivf
+831ccd862ea95ca025d2f3bd8b88678752f5416d *vp80-03-segmentation-1432.ivf
+b3c11978529289f9109f2766fcaba3ebc40e11ef *vp80-03-segmentation-1435.ivf
+a835a731f5520ebfc1002c40121264d0020559ac *vp80-03-segmentation-1436.ivf
+1d1732942f773bb2a5775fcb9689b1579ce28eab *vp80-03-segmentation-1437.ivf
+db04799adfe089dfdf74dbd43cc05ede7161f99e *vp80-03-segmentation-1441.ivf
+7caf39b3f20cfd52b998210878062e52a5edf1e6 *vp80-03-segmentation-1442.ivf
+3607f6bb4ee106c38fa1ea370dc4ff8b8cde2261 *vp80-04-partitions-1404.ivf
+93cc323b6b6867f1b12dd48773424549c6960a6b *vp80-04-partitions-1405.ivf
+047eedb14b865bdac8a3538e63801054e0295e9c *vp80-04-partitions-1406.ivf
+0f1233bd2bc33f56ce5e495dbd455d122339f384 *vp80-05-sharpness-1428.ivf
+51767fc136488a9535c2a4c38067c542ee2048df *vp80-05-sharpness-1429.ivf
+9805aa107672de25d6fb8c35e20d06deca5efe18 *vp80-05-sharpness-1430.ivf
+61db6b965f9c27aebe71b85bf2d5877e58e4bbdf *vp80-05-sharpness-1431.ivf
+10420d266290d2923555f84af38eeb96edbd3ae8 *vp80-05-sharpness-1433.ivf
+3ed24f9a80cddfdf75824ba95cdb4ff9286cb443 *vp80-05-sharpness-1434.ivf
+c87599cbecd72d4cd4f7ace3313b7a6bc6eb8163 *vp80-05-sharpness-1438.ivf
+aff51d865c2621b60510459244ea83e958e4baed *vp80-05-sharpness-1439.ivf
+da386e72b19b5485a6af199c5eb60ef25e510dd1 *vp80-05-sharpness-1440.ivf
+6759a095203d96ccd267ce09b1b050b8cc4c2f1f *vp80-05-sharpness-1443.ivf
+b95d3cc1d0df991e63e150a801710a72f20d9ba0 *vp80-06-smallsize.ivf
+db55ec7fd02c864ba996ff060b25b1e08611330b *vp80-00-comprehensive-001.ivf.md5
+29db0ad011cba1e45f856d5623cd38dac3e3bf19 *vp80-00-comprehensive-002.ivf.md5
+e84f258f69e173e7d68f8f8c037a0a3766902182 *vp80-00-comprehensive-003.ivf.md5
+eb7912eaf69559a16fd82bc3f5fb1524cf4a4466 *vp80-00-comprehensive-004.ivf.md5
+4206f71c94894bd5b5b376f6c09b3817dbc65206 *vp80-00-comprehensive-005.ivf.md5
+4f89b356f6f2fecb928f330a10f804f00f5325f5 *vp80-00-comprehensive-006.ivf.md5
+2813236a32964dd8007e17648bcf035a20fcda6c *vp80-00-comprehensive-007.ivf.md5
+10746c72098f872803c900e17c5680e451f5f498 *vp80-00-comprehensive-008.ivf.md5
+39a23d0692ce64421a7bb7cdf6ccec5928d37fff *vp80-00-comprehensive-009.ivf.md5
+f6e3de8931a0cc659bda8fbc14050346955e72d4 *vp80-00-comprehensive-010.ivf.md5
+101683ec195b6e944f7cd1e468fc8921439363e6 *vp80-00-comprehensive-011.ivf.md5
+1f592751ce46d8688998fa0fa4fbdcda0fd4058c *vp80-00-comprehensive-012.ivf.md5
+6066176f90ca790251e795fca1a5797d59999841 *vp80-00-comprehensive-013.ivf.md5
+2656da94ba93691f23edc4d60b3a09e2be46c217 *vp80-00-comprehensive-014.ivf.md5
+c6e0d5f5d61460c8ac8edfa4e701f10312c03133 *vp80-00-comprehensive-015.ivf.md5
+ee60fee501d8493e34e8d6a1fe315b51ed09b24a *vp80-00-comprehensive-016.ivf.md5
+9f1914ceffcad4546c0a29de3ef591d8bea304dc *vp80-00-comprehensive-017.ivf.md5
+e0305178fe288a9fd8082b39e2d03181edb19054 *vp80-00-comprehensive-018.ivf.md5
+612494da2fa799cc9d76dcdd835ae6c7cb2e5c05 *vp80-01-intra-1400.ivf.md5
+48ea06097ac8269c5e8c2131d3d0639f431fcf0e *vp80-01-intra-1411.ivf.md5
+6e2ab4e7677ad0ba868083ca6bc387ee922b400c *vp80-01-intra-1416.ivf.md5
+eca0a90348959ce3854142f8d8641b13050e8349 *vp80-01-intra-1417.ivf.md5
+920feea203145d5c2258a91c4e6991934a79a99e *vp80-02-inter-1402.ivf.md5
+f71d97909fe2b3dd65be7e1f56c72237f0cef200 *vp80-02-inter-1412.ivf.md5
+e911254569a30bbb2a237ff8b79f69ed9da0672d *vp80-02-inter-1418.ivf.md5
+58c789c50c9bb9cc90580bed291164a0939d28ba *vp80-02-inter-1424.ivf.md5
+ff3e2f441327b9c20a0b37c524e0f5a48a36de7b *vp80-03-segmentation-01.ivf.md5
+0791f417f076a542ae66fbc3426ab4d94cbd6c75 *vp80-03-segmentation-02.ivf.md5
+722e50f1a6a91c34302d68681faffc1c26d1cc57 *vp80-03-segmentation-03.ivf.md5
+c701f1885bcfb27fb8e70cc65606b289172ef889 *vp80-03-segmentation-04.ivf.md5
+f79bc9ec189a2b4807632a3d0c5bf04a178b5300 *vp80-03-segmentation-1401.ivf.md5
+b9aa4c74c0219b639811c44760d0b24cd8bb436a *vp80-03-segmentation-1403.ivf.md5
+70d5a2207ca1891bcaebd5cf6dd88ce8d57b4334 *vp80-03-segmentation-1407.ivf.md5
+265f962ee781531f9a93b9309461316fd32b2a1d *vp80-03-segmentation-1408.ivf.md5
+0c4ecbbd6dc042d30e626d951b65f460dd6cd563 *vp80-03-segmentation-1409.ivf.md5
+cf779af36a937f06570a0fca9db64ba133451dee *vp80-03-segmentation-1410.ivf.md5
+0e6c5036d51ab078842f133934926c598a9cff02 *vp80-03-segmentation-1413.ivf.md5
+eb3930aaf229116c80d507516c34759c3f6cdf69 *vp80-03-segmentation-1414.ivf.md5
+123d6c0f72ee87911c4ae7538e87b7d163b22d6c *vp80-03-segmentation-1415.ivf.md5
+e70551d1a38920e097a5d8782390b79ecaeb7505 *vp80-03-segmentation-1425.ivf.md5
+44e8f4117e46dbb302b2cfd81171cc1a1846e431 *vp80-03-segmentation-1426.ivf.md5
+52636e54aee5f95bbace37021bd67de5db767e9a *vp80-03-segmentation-1427.ivf.md5
+b1ad3eff20215c28e295b15ef3636ed926d59cba *vp80-03-segmentation-1432.ivf.md5
+24c22a552fa28a90e5978f67f57181cc2d7546d7 *vp80-03-segmentation-1435.ivf.md5
+96c49c390abfced18a7a8c9b9ea10af778e10edb *vp80-03-segmentation-1436.ivf.md5
+f95eb6214571434f1f73ab7833b9ccdf47588020 *vp80-03-segmentation-1437.ivf.md5
+1c0700ca27c9b0090a7747a4b0b4dc21d1843181 *vp80-03-segmentation-1441.ivf.md5
+81d4f23ca32667ee958bae579c8f5e97ba72eb97 *vp80-03-segmentation-1442.ivf.md5
+272efcef07a3a30fbca51bfd566063d8258ec0be *vp80-04-partitions-1404.ivf.md5
+66ed219ab812ac801b256d35cf495d193d4cf478 *vp80-04-partitions-1405.ivf.md5
+36083f37f56f502bd60ec5e07502ee9e6b8699b0 *vp80-04-partitions-1406.ivf.md5
+6ca909bf168a64c09415626294665dc1be3d1973 *vp80-05-sharpness-1428.ivf.md5
+1667d2ee2334e5fdea8a8a866f4ccf3cf76f033a *vp80-05-sharpness-1429.ivf.md5
+71bcbe5357d36a19df5b07fbe3e27bffa8893f0a *vp80-05-sharpness-1430.ivf.md5
+89a09b1dffce2d55770a89e58d9925c70ef79bf8 *vp80-05-sharpness-1431.ivf.md5
+08444a18b4e6ba3450c0796dd728d48c399a2dc9 *vp80-05-sharpness-1433.ivf.md5
+6d6223719a90c13e848aa2a8a6642098cdb5977a *vp80-05-sharpness-1434.ivf.md5
+41d70bb5fa45bc88da1604a0af466930b8dd77b5 *vp80-05-sharpness-1438.ivf.md5
+086c56378df81b6cee264d7540a7b8f2b405c7a4 *vp80-05-sharpness-1439.ivf.md5
+d32dc2c4165eb266ea4c23c14a45459b363def32 *vp80-05-sharpness-1440.ivf.md5
+8c69dc3d8e563f56ffab5ad1e400d9e689dd23df *vp80-05-sharpness-1443.ivf.md5
+d6f246df012c241b5fa6c1345019a3703d85c419 *vp80-06-smallsize.ivf.md5
+ce881e567fe1d0fbcb2d3e9e6281a1a8d74d82e0 *vp90-2-00-quantizer-00.webm
+ac5eda33407d0521c7afca43a63fd305c0cd9d13 *vp90-2-00-quantizer-00.webm.md5
+2ca0463f2cfb93d25d7dded174db70b7cb87cb48 *vp90-2-00-quantizer-01.webm
+10d98884fc6d9a5f47a2057922b8e25dd48d7786 *vp90-2-00-quantizer-01.webm.md5
+d80a2920a5e0819d69dcba8fe260c01f820f8982 *vp90-2-00-quantizer-02.webm
+c964c8e5e04165fabbf1c6ee8ee5121d35921965 *vp90-2-00-quantizer-02.webm.md5
+fdef046777b5b75c962b715d809dbe2ea331afb9 *vp90-2-00-quantizer-03.webm
+f270bee0b0c7aa2bf4c5afe098556b4f3f890faf *vp90-2-00-quantizer-03.webm.md5
+66d98609e809394a6ac730787e6724e3badc075a *vp90-2-00-quantizer-04.webm
+427433bfe121c4aea1095ec3124fdc174d200e3a *vp90-2-00-quantizer-04.webm.md5
+e6e42626d8cadf0b5be16313f69212981b96fee5 *vp90-2-00-quantizer-05.webm
+c98f6a9a1af4cfd71416792827304266aad4bd46 *vp90-2-00-quantizer-05.webm.md5
+413ef09b721f5dcec1a96e937a97e5873c2e6db6 *vp90-2-00-quantizer-06.webm
+5080e940a23805c82e578e21b57fc2c511e76376 *vp90-2-00-quantizer-06.webm.md5
+4a50a5f4ac717c30dfaae8bb46702e3542e867de *vp90-2-00-quantizer-07.webm
+76c429a02b56762e10ee4db88729d8834b3a70f4 *vp90-2-00-quantizer-07.webm.md5
+d2f4e464780bf8b7e647efa18ac777a930e62bc0 *vp90-2-00-quantizer-08.webm
+ab94aabf9316111b52d7c531962ed4123313b6ba *vp90-2-00-quantizer-08.webm.md5
+174bc58433936dd79550398d744f1072ce7f5693 *vp90-2-00-quantizer-09.webm
+e1f7690cd83ccc56d045e17cce552544a5f03810 *vp90-2-00-quantizer-09.webm.md5
+52bc1dfd3a97b24d922eb8a31d07527891561f2a *vp90-2-00-quantizer-10.webm
+9b37bed893b5f6a4e12f2aa40f02dd40f944d0f8 *vp90-2-00-quantizer-10.webm.md5
+10031eecafde1e1d8e6323fe2b2a1d7e77a66869 *vp90-2-00-quantizer-11.webm
+fe4620a4bb0e4f5cb9bbfedc4039a22b81b0f5c0 *vp90-2-00-quantizer-11.webm.md5
+78e9f7bb77e8e348155bbdfa12790789d1d50c34 *vp90-2-00-quantizer-12.webm
+0961d060cc8dd469c6dac8d7d75f927c0bb971b8 *vp90-2-00-quantizer-12.webm.md5
+133b77a3bbcef652552d74ffc46afbfe3b8a1cba *vp90-2-00-quantizer-13.webm
+df29e5e0f95772af482f540d776f6b9dea4bfa29 *vp90-2-00-quantizer-13.webm.md5
+27323afdaf8987e025c27129c74c86502315a206 *vp90-2-00-quantizer-14.webm
+ce96a2cc312942f0427a463f15a392870dd69764 *vp90-2-00-quantizer-14.webm.md5
+ab58d0b41037829f6bc993910999f4af0212aafd *vp90-2-00-quantizer-15.webm
+40f700db606501aa7cb49049624cbdde6409b122 *vp90-2-00-quantizer-15.webm.md5
+cd948e66448aafb65998815ce37241f95d7c9ee7 *vp90-2-00-quantizer-16.webm
+039b742d149c945ed79c7b9a6384352852a1c116 *vp90-2-00-quantizer-16.webm.md5
+62f56e663e13c576764e491cf08f19bd46a71999 *vp90-2-00-quantizer-17.webm
+90c5a39bf76e6b3e0a1c0d3e9b68a9fd78be963e *vp90-2-00-quantizer-17.webm.md5
+f26ecad7263cd66a614e53ba5d7c00df181affeb *vp90-2-00-quantizer-18.webm
+cda0a1c0fca2ec2976ae55124a8a67305508bae6 *vp90-2-00-quantizer-18.webm.md5
+94bfc4c04fcfe139a63b98c569e8c14ba98c401f *vp90-2-00-quantizer-19.webm
+5b8ec169ccf67d8a0a8e46a62eb173f5a1dbaf4f *vp90-2-00-quantizer-19.webm.md5
+0ee88e9318985e1e245de78c2c4a665885ab76a7 *vp90-2-00-quantizer-20.webm
+4b26f7edb4fcd3a1b4cce9ba3cb8650e3ee6e063 *vp90-2-00-quantizer-20.webm.md5
+6a995cb2b1db33da8087321df1e646f95c3e32d1 *vp90-2-00-quantizer-21.webm
+e216b4a1eceac03efcc433759be54ab8ea87b24b *vp90-2-00-quantizer-21.webm.md5
+aa7722fc427e7180115f3c9cd96bb6b2768e7296 *vp90-2-00-quantizer-22.webm
+1aa813bd45ae831bf5e79ace4d73dfd25989a07d *vp90-2-00-quantizer-22.webm.md5
+7677e5b929ed6d142041f19b8a9cd5822ee1504a *vp90-2-00-quantizer-23.webm
+0de0af34abd843d5b37e58baf3ed96a6104b64c3 *vp90-2-00-quantizer-23.webm.md5
+b2995cbe1128b2d4926f1b28d01c501ecb6be8c8 *vp90-2-00-quantizer-24.webm
+db6033af2ba2f2bca62468fb4b8808e474f93923 *vp90-2-00-quantizer-24.webm.md5
+8135ba35587fd92cd4667be7896323d9b634401c *vp90-2-00-quantizer-25.webm
+3499e00c2cc15876f61f07e3d3cfca54ebcd98fd *vp90-2-00-quantizer-25.webm.md5
+af0fa2907746db82d345f6d831fcc1b2862a29fb *vp90-2-00-quantizer-26.webm
+cd6fe3d14dab48886ebf65be00e6ed9616ebe5a7 *vp90-2-00-quantizer-26.webm.md5
+bd0002e91323776beb5ff11e06edcf19fc08e9b9 *vp90-2-00-quantizer-27.webm
+fe72154ef196067d6c272521012dd79706496cac *vp90-2-00-quantizer-27.webm.md5
+fc15eb606f81455ff03df16bf3432296b002c43c *vp90-2-00-quantizer-28.webm
+40b2e24b542206a6bfd746ef199e49ccea07678a *vp90-2-00-quantizer-28.webm.md5
+3090bbf913cad0b2eddca7228f5ed51a58378b8d *vp90-2-00-quantizer-29.webm
+eb59745e0912d8ed6c928268bcf265237c9ba93f *vp90-2-00-quantizer-29.webm.md5
+c615abdca9c25e1cb110d908edbedfb3b7c92b91 *vp90-2-00-quantizer-30.webm
+ad0f4fe6733e4e7cdfe8ef8722bb341dcc7538c0 *vp90-2-00-quantizer-30.webm.md5
+037d9f242086cfb085518f6416259defa82d5fc2 *vp90-2-00-quantizer-31.webm
+4654b40792572f0a790874c6347ef9196d86c1a7 *vp90-2-00-quantizer-31.webm.md5
+505899f3f3515044c5c8b3213d9b9d16f614619d *vp90-2-00-quantizer-32.webm
+659a2e6dd02df323f62600626859006640b445df *vp90-2-00-quantizer-32.webm.md5
+8b32ec9c3b7e5ca8ddc6b8aea1c1cb7ca996bccc *vp90-2-00-quantizer-33.webm
+5b175ef1120ddeba4feae1247bf381bbc4e816ce *vp90-2-00-quantizer-33.webm.md5
+4d283755d17e287b1d099a80604398f60d7fb6ea *vp90-2-00-quantizer-34.webm
+22a739de95acfeb27524e3700b8f678a9ad744d8 *vp90-2-00-quantizer-34.webm.md5
+4296f56a892a412d3d4f64824718dd566c4e6459 *vp90-2-00-quantizer-35.webm
+c532c9c8dc7b3506fc6a51e5c20c17ef0ac039e7 *vp90-2-00-quantizer-35.webm.md5
+6f54e11da461e4410dd9075b015e2d9bc1d07dfb *vp90-2-00-quantizer-36.webm
+0b3573f5addea4e3eb11a0b85f068299d5bdad78 *vp90-2-00-quantizer-36.webm.md5
+210581682a26c2c4375efc785c36e07539888bc2 *vp90-2-00-quantizer-37.webm
+2b4fb6f8ba975237858e61cc8f560bcfc87cb38e *vp90-2-00-quantizer-37.webm.md5
+a15ef31283dfc4860f837fe200eb32a445f59629 *vp90-2-00-quantizer-38.webm
+fb76771f3a795054b9936f70da7505c3ac585284 *vp90-2-00-quantizer-38.webm.md5
+1df8433a441412831daae6726df89fa70d21b14d *vp90-2-00-quantizer-39.webm
+39e162c09a20e7e684868097766347014371fee6 *vp90-2-00-quantizer-39.webm.md5
+5330e4788ab9129dbb25a7a7d5411104521248b6 *vp90-2-00-quantizer-40.webm
+872cc0f2cc9dbf000f89eadb4d8f9940e48e00b1 *vp90-2-00-quantizer-40.webm.md5
+d88d03b982889e399a78d7a06eeb1cf30e6c2da2 *vp90-2-00-quantizer-41.webm
+5b4f7217e57fa2a221011d0b32f8d0409496b7b6 *vp90-2-00-quantizer-41.webm.md5
+9e16406e3e26955a6e17d455ef1ef64bbfa26e53 *vp90-2-00-quantizer-42.webm
+0219d090cf37daabe19256ba8e932ba4874b92e4 *vp90-2-00-quantizer-42.webm.md5
+a9b15843486fb05f8cd15437ef279782a42b75db *vp90-2-00-quantizer-43.webm
+3c9b0b4c607f9579a31726bfcf56729334ddc686 *vp90-2-00-quantizer-43.webm.md5
+1dbc931ac446c91eabe7213efff55b596cccf07c *vp90-2-00-quantizer-44.webm
+73bc8f675103abaef3d9f73a2742b3bffd726d23 *vp90-2-00-quantizer-44.webm.md5
+7c6c1be15beb9d6201204b018966c8c4f9777efc *vp90-2-00-quantizer-45.webm
+c907b29da821f790c6748de61f592689312e4e36 *vp90-2-00-quantizer-45.webm.md5
+07b434da1a467580f73b32177ee11b3e00f65a0d *vp90-2-00-quantizer-46.webm
+7b2b7ce60c50bc970bc0ada46d7a7ce440148da3 *vp90-2-00-quantizer-46.webm.md5
+233d0465fb1a6fa36e9f89bd2193ac79bd4d2809 *vp90-2-00-quantizer-47.webm
+527e0a9fb932efe915027ffe077f9e8d3a4fb139 *vp90-2-00-quantizer-47.webm.md5
+719613df7307e205c3fdb6acfb373849c5ab23c7 *vp90-2-00-quantizer-48.webm
+65ab6c9d1b682c183b201c7ff42b90343ce3e304 *vp90-2-00-quantizer-48.webm.md5
+3bf04a598325ed0eabae1598ec7f718f715ec672 *vp90-2-00-quantizer-49.webm
+ac68c4387ce11fcc998d8ba455ab9b2bb361d240 *vp90-2-00-quantizer-49.webm.md5
+d59238fb3a654931c9b65a11e7321b40d1f702e9 *vp90-2-00-quantizer-50.webm
+d0576bfede46fd55659f028f2fd28554ceb3e6cc *vp90-2-00-quantizer-50.webm.md5
+3f579785101d4209360dd96f8c2ffe9beddf3bee *vp90-2-00-quantizer-51.webm
+89fcfe04f4457a7f02ab4a2f94aacbb88aee5789 *vp90-2-00-quantizer-51.webm.md5
+28be5836e2fedefe4babf12fc9b79e460ab0a0f4 *vp90-2-00-quantizer-52.webm
+f3dd52b70c18345fee740220f35da9c4def2017a *vp90-2-00-quantizer-52.webm.md5
+488ad4058c17170665b6acd1021fade9a02771e4 *vp90-2-00-quantizer-53.webm
+1cdcb1d4f3a37cf83ad235eb27ec62ed2a01afc7 *vp90-2-00-quantizer-53.webm.md5
+682978289cb28cc8c9d39bc797300e45d6039de7 *vp90-2-00-quantizer-54.webm
+36c35353f2c03cb099bd710d9994de7d9ed88834 *vp90-2-00-quantizer-54.webm.md5
+c398ce49af762a48f10cc4da9fae0769aae5f226 *vp90-2-00-quantizer-55.webm
+2cf3570542d984f167ab087f59493c7fb47e0ed2 *vp90-2-00-quantizer-55.webm.md5
+3071f18b2fce261aa82d61f81a7ae4ca9a75d0e3 *vp90-2-00-quantizer-56.webm
+d3f93f8272b6de31cffb011a26f11abb514efb12 *vp90-2-00-quantizer-56.webm.md5
+f4e8e14b1f278801a7eb6f11734780a01b1668e9 *vp90-2-00-quantizer-57.webm
+6478fdf1d7faf6db5f19dffc5e1363af358699ee *vp90-2-00-quantizer-57.webm.md5
+307dc264f57cc618fff211fa44d7f52767ed9660 *vp90-2-00-quantizer-58.webm
+cf231d4a52d492fa692ea4194ec5eb7511fec54e *vp90-2-00-quantizer-58.webm.md5
+1fd7cd596170afce2de0b1441b7674bda5723440 *vp90-2-00-quantizer-59.webm
+4681f7ef96f63e085c41bb1a964b0df7e67e0b38 *vp90-2-00-quantizer-59.webm.md5
+34cdcc81c0ba7085aefbb22d7b4aa9bca3dd7c62 *vp90-2-00-quantizer-60.webm
+58691ef53b6b623810e2c57ded374c77535df935 *vp90-2-00-quantizer-60.webm.md5
+e6e812406aab81021bb16e772c1db03f75906cb6 *vp90-2-00-quantizer-61.webm
+76436eace62f08ff92b61a0845e66667a027db1b *vp90-2-00-quantizer-61.webm.md5
+84d811bceed70c950a6a08e572a6e274866e72b1 *vp90-2-00-quantizer-62.webm
+2d937cc011eeddd95222b960982da5cd18db580f *vp90-2-00-quantizer-62.webm.md5
+0912b295ba0ea09359315315ffd67d22d046f883 *vp90-2-00-quantizer-63.webm
+5a829031055d70565f57dbcd47a6ac33619952b3 *vp90-2-00-quantizer-63.webm.md5
+0cf9e5ebe0112bdb47b5887ee5d58eb9d4727c00 *vp90-2-01-sharpness-1.webm
+5a0476be4448bae8f8ca17ea236c98793a755948 *vp90-2-01-sharpness-1.webm.md5
+51e02d7911810cdf5be8b68ac40aedab479a3179 *vp90-2-01-sharpness-2.webm
+a0ca5bc87a5ed7c7051f59078daa0d03be1b45b6 *vp90-2-01-sharpness-2.webm.md5
+0603f8ad239c07a531d948187f4dafcaf51eda8d *vp90-2-01-sharpness-3.webm
+3af8000a69c72fe77881e3176f026c2affb78cc7 *vp90-2-01-sharpness-3.webm.md5
+4ca4839f48146252fb261ed88838d80211804841 *vp90-2-01-sharpness-4.webm
+08832a1494f84fa9edd40e080bcf2c0e80100c76 *vp90-2-01-sharpness-4.webm.md5
+95099dc8f9cbaf9b9a7dd65311923e441ff70731 *vp90-2-01-sharpness-5.webm
+93ceee30c140f0b406726c0d896b9db6031c4c7f *vp90-2-01-sharpness-5.webm.md5
+ceb4116fb7b078d266d153233b6d62a255a34e4c *vp90-2-01-sharpness-6.webm
+da83efe59e537ce538e8b03a6eac63cf25849c9a *vp90-2-01-sharpness-6.webm.md5
+b5f7cd19aece3880f9d616a778e5cc24c6b9b505 *vp90-2-01-sharpness-7.webm
+2957408d20deac8633941a2169f801bae6f086e1 *vp90-2-01-sharpness-7.webm.md5
+ffc096c2ce1050450ad462b5fabd2a5220846319 *vp90-2-02-size-08x08.webm
+e36d2ed6fa2746347710b750586aafa6a01ff3ae *vp90-2-02-size-08x08.webm.md5
+895b986f9fd55cd879472b31c6a06b82094418c8 *vp90-2-02-size-08x10.webm
+079157a19137ccaebba606f2871f45a397347150 *vp90-2-02-size-08x10.webm.md5
+1c5992203e62a2b83040ccbecd748b604e19f4c0 *vp90-2-02-size-08x16.webm
+9aa45ffdf2078f883bbed01450031b691819c144 *vp90-2-02-size-08x16.webm.md5
+d0a8953da1f85f484487408fee5da9e2a8391901 *vp90-2-02-size-08x18.webm
+59a5cc17d354c6a23e5e959d666b1456a5d49c56 *vp90-2-02-size-08x18.webm.md5
+1b13461a9fc65cb041bacfe4ea6f02d363397d61 *vp90-2-02-size-08x32.webm
+2bdddd6878f05d37d84cde056a3f5e7f926ba3d6 *vp90-2-02-size-08x32.webm.md5
+2861f0a0daadb62295b0504a1fbe5b50c79a8f59 *vp90-2-02-size-08x34.webm
+6b5812cfb8a82d378ea2913bf009e93668020147 *vp90-2-02-size-08x34.webm.md5
+02f948216d4246579dc53c47fe55d8fb264ba251 *vp90-2-02-size-08x64.webm
+84b55fdee6d9aa820c7a8c62822446184b191767 *vp90-2-02-size-08x64.webm.md5
+4b011242cbf42516efd2b197baebb61dd34562c9 *vp90-2-02-size-08x66.webm
+6b1fa0a885947b3cc0fe58f75f838e662bd9bb8b *vp90-2-02-size-08x66.webm.md5
+4057796be9dd12df48ab607f502ae6aa70eeeab6 *vp90-2-02-size-10x08.webm
+71c752c51aec9f48de286b93f4c20e9c11cad7d0 *vp90-2-02-size-10x08.webm.md5
+6583c853fa43fc53d51743eac5f3a43a359d45d0 *vp90-2-02-size-10x10.webm
+1da524d24af1944b671d4d3f2b398d6e336584c3 *vp90-2-02-size-10x10.webm.md5
+ba442fc03ccd3a705c64c83b36f5ada67d198874 *vp90-2-02-size-10x16.webm
+7cfd960f232c34c641a4a2a9411b6fd0efb2fc50 *vp90-2-02-size-10x16.webm.md5
+cc92ed40eef14f52e4d080cb2c57939dd8326374 *vp90-2-02-size-10x18.webm
+db5626275cc55ce970b91c995e74f6838d943aca *vp90-2-02-size-10x18.webm.md5
+3a93d501d22325e9fd4c9d8b82e2a432de33c351 *vp90-2-02-size-10x32.webm
+5cae51b0c71cfc131651f345f87583eb2903afaf *vp90-2-02-size-10x32.webm.md5
+50d2f2b15a9a5178153db44a9e03aaf32b227f67 *vp90-2-02-size-10x34.webm
+bb0efe058122641e7f73e94497dda2b9e6c21efd *vp90-2-02-size-10x34.webm.md5
+01624ec173e533e0b33fd9bdb91eb7360c7c9175 *vp90-2-02-size-10x64.webm
+b9c0e3b054463546356acf5157f9be92fd34732f *vp90-2-02-size-10x64.webm.md5
+2942879baf1c09e96b14d0fc84806abfe129c706 *vp90-2-02-size-10x66.webm
+bab5f539c2f91952e187456b4beafbb4c01e25ee *vp90-2-02-size-10x66.webm.md5
+88d2b63ca5e9ee163d8f20e8886f3df3ff301a66 *vp90-2-02-size-16x08.webm
+7f48a0fcf8c25963f3057d7f6669c5f2415834b8 *vp90-2-02-size-16x08.webm.md5
+59261eb34c15ea9b5ddd2d416215c1a8b9e6dc1f *vp90-2-02-size-16x10.webm
+73a7c209a46dd051c9f7339b6e02ccd5b3b9fc81 *vp90-2-02-size-16x10.webm.md5
+066834fef9cf5b9a72932cf4dea5f253e14a976d *vp90-2-02-size-16x16.webm
+faec542f52f37601cb9c480d887ae9355be99372 *vp90-2-02-size-16x16.webm.md5
+195307b4eb3192271ee4a935b0e48deef0c54cc2 *vp90-2-02-size-16x18.webm
+5a92e19e624c0376321d4d0e22c0c91995bc23e1 *vp90-2-02-size-16x18.webm.md5
+14f3f884216d7ae16ec521f024a2f2d31bbf9c1a *vp90-2-02-size-16x32.webm
+ea622d1c817dd174556f7ee7ccfe4942b34d4845 *vp90-2-02-size-16x32.webm.md5
+2e0501100578a5da9dd47e4beea160f945bdd1ba *vp90-2-02-size-16x34.webm
+1b8645ef64239334921c5f56b24ce815e6070b05 *vp90-2-02-size-16x34.webm.md5
+89a6797fbebebe93215f367229a9152277f5dcfe *vp90-2-02-size-16x64.webm
+a03d8c1179ca626a8856fb416d635dbf377979cd *vp90-2-02-size-16x64.webm.md5
+0f3a182e0750fcbae0b9eae80c7a53aabafdd18d *vp90-2-02-size-16x66.webm
+8cb6736dc2d897c1283919a32068af377d66c59c *vp90-2-02-size-16x66.webm.md5
+68fe70dc7914cc1d8d6dcd97388b79196ba3e7f1 *vp90-2-02-size-18x08.webm
+874c7fb505be9db3160c57cb405c4dbd5b990dc2 *vp90-2-02-size-18x08.webm.md5
+0546352dd78496d4dd86c3727ac2ff36c9e72032 *vp90-2-02-size-18x10.webm
+1d80eb36557ea5f25a386495a36f93da0f25316b *vp90-2-02-size-18x10.webm.md5
+60fe99e5f5cc99706efa3e0b894e45cbcf0d6330 *vp90-2-02-size-18x16.webm
+1ab6cdd89a53662995d103546e6611c84f9292ab *vp90-2-02-size-18x16.webm.md5
+f9a8f5fb749d69fd555db6ca093b7f77800c7b4f *vp90-2-02-size-18x18.webm
+ace8a66328f7802b15f9989c2720c029c6abd279 *vp90-2-02-size-18x18.webm.md5
+a197123a527ec25913a9bf52dc8c347749e00045 *vp90-2-02-size-18x32.webm
+34fbd7036752232d1663e70d7f7cdc93f7129202 *vp90-2-02-size-18x32.webm.md5
+f219655a639a774a2c9c0a9f45c28dc0b5e75e24 *vp90-2-02-size-18x34.webm
+2c4d622a9ea548791c1a07903d3702e9774388bb *vp90-2-02-size-18x34.webm.md5
+5308578da48c677d477a5404e19391d1303033c9 *vp90-2-02-size-18x64.webm
+e7fd4462527bac38559518ba80e41847db880f15 *vp90-2-02-size-18x64.webm.md5
+e109a7e013bd179f97e378542e1e81689ed06802 *vp90-2-02-size-18x66.webm
+45c04e422fb383c1f3be04beefaa4490e83bdb1a *vp90-2-02-size-18x66.webm.md5
+38844cae5d99caf445f7de33c3ae78494ce36c01 *vp90-2-02-size-32x08.webm
+ad018be39e493ca2405225034b1a5b7a42af6f3a *vp90-2-02-size-32x08.webm.md5
+7b57eaad55906f9de9903c8657a3fcb2aaf792ea *vp90-2-02-size-32x10.webm
+2294425d4e55d275af5e25a0beac9738a1b4ee73 *vp90-2-02-size-32x10.webm.md5
+f47ca2ced0d47f761bb0a5fdcd911d3f450fdcc1 *vp90-2-02-size-32x16.webm
+ae10981d93913f0ab1f28c1146255e01769aa8c0 *vp90-2-02-size-32x16.webm.md5
+08b23ad838b6cf1fbfe3ad7e7775d95573e815fc *vp90-2-02-size-32x18.webm
+1ba76f4c4a4ac7aabfa3ce195c1b473535eb7cc8 *vp90-2-02-size-32x18.webm.md5
+d5b88ae6c8c25c53dee74d9f1e6ca64244349a57 *vp90-2-02-size-32x32.webm
+e39c067a8ee2da52a51641eb1cb7f8eba935eb6b *vp90-2-02-size-32x32.webm.md5
+529429920dc36bd899059fa75a767f02c8c60874 *vp90-2-02-size-32x34.webm
+56888e7834f52b106e8911e3a7fc0f473b609995 *vp90-2-02-size-32x34.webm.md5
+38e848e160391c2b1a55040aadde613b9f4bf15e *vp90-2-02-size-32x64.webm
+8950485fb3f68b0e8be234db860e4ec5f5490fd0 *vp90-2-02-size-32x64.webm.md5
+5e8670f0b8ec9cefa8795b8959ffbe1a8e1aea94 *vp90-2-02-size-32x66.webm
+225df9d7d72ec711b0b60f4aeb65311c97db054a *vp90-2-02-size-32x66.webm.md5
+695f929e2ce6fb11a1f180322d46c5cb1c97fa61 *vp90-2-02-size-34x08.webm
+5bb4262030018dd01883965c6aa6070185924ef6 *vp90-2-02-size-34x08.webm.md5
+5adf74ec906d2ad3f7526e06bd29f5ad7d966a90 *vp90-2-02-size-34x10.webm
+71c100b437d3e8701632ae8d65c3555339b1c68f *vp90-2-02-size-34x10.webm.md5
+d0918923c987fba2d00193d83797b21289fe54aa *vp90-2-02-size-34x16.webm
+5d5a52f3535b4d2698dd3d87f4a13fdc9b57163d *vp90-2-02-size-34x16.webm.md5
+553ab0042cf87f5e668ec31b2e4b2a4b6ec196fd *vp90-2-02-size-34x18.webm
+a164c7f3c424987df2340496e6a8cf76e973f0f1 *vp90-2-02-size-34x18.webm.md5
+baf3e233634f150de81c18ba5d8848068e1c3c54 *vp90-2-02-size-34x32.webm
+22a79d3bd1c9b85dfe8c70bb2e19f08a92a8be03 *vp90-2-02-size-34x32.webm.md5
+6d50a533774a7167350e4a7ef43c94a5622179a2 *vp90-2-02-size-34x34.webm
+0c099638e79c273546523e06704553e42eb00b00 *vp90-2-02-size-34x34.webm.md5
+698cdd0a5e895cc202c488675e682a8c537ede4f *vp90-2-02-size-34x64.webm
+9317b63987cddab8389510a27b86f9f3d46e3fa5 *vp90-2-02-size-34x64.webm.md5
+4b5335ca06f082b6b69f584eb8e7886bdcafefd3 *vp90-2-02-size-34x66.webm
+e18d68b35428f46a84a947c646804a51ef1d7cec *vp90-2-02-size-34x66.webm.md5
+a54ae7b494906ec928a876e8290e5574f2f9f6a2 *vp90-2-02-size-64x08.webm
+87f9f7087b6489d45e9e4b38ede2c5aef4a4928f *vp90-2-02-size-64x08.webm.md5
+24522c70804a3c23d937df2d829ae63965b23f38 *vp90-2-02-size-64x10.webm
+447ce03938ab53bffcb4a841ee0bfaa90462dcb9 *vp90-2-02-size-64x10.webm.md5
+2a5035d035d214ae614af8051930690ef623989b *vp90-2-02-size-64x16.webm
+84e355761dd2e0361b904c84c52a0dd0384d89cf *vp90-2-02-size-64x16.webm.md5
+3a293ef4e270a19438e59b817fbe5f43eed4d36b *vp90-2-02-size-64x18.webm
+666824e5ba746779eb46079e0631853dcc86d48b *vp90-2-02-size-64x18.webm.md5
+ed32fae837095c9e8fc95d223ec68101812932c2 *vp90-2-02-size-64x32.webm
+97086eadedce1d0d9c072b585ba7b49aec69b1e7 *vp90-2-02-size-64x32.webm.md5
+696c7a7250bdfff594f4dfd88af34239092ecd00 *vp90-2-02-size-64x34.webm
+253a1d38d452e7826b086846c6f872f829c276bb *vp90-2-02-size-64x34.webm.md5
+fc508e0e3c2e6872c60919a60b812c5232e9c2b0 *vp90-2-02-size-64x64.webm
+2cd6ebeca0f82e9f505616825c07950371b905ab *vp90-2-02-size-64x64.webm.md5
+0f8a4fc1d6521187660425c283f08dff8c66e476 *vp90-2-02-size-64x66.webm
+5806be11a1d346be235f88d3683e69f73746166c *vp90-2-02-size-64x66.webm.md5
+273b0c36e3658685cde250408a478116d7ae92f1 *vp90-2-02-size-66x08.webm
+23c3cd0dca20a2f71f036e77ea92025ff4e7a298 *vp90-2-02-size-66x08.webm.md5
+4844c59c3306d1e671bb0568f00e344bf797e66e *vp90-2-02-size-66x10.webm
+e041eaf6841d775f8fde8bbb4949d2733fdaab7f *vp90-2-02-size-66x10.webm.md5
+bdf3f1582b234fcd2805ffec59f9d716a2345302 *vp90-2-02-size-66x16.webm
+2ec85ee18119e6798968571ea6e1b93ca386e3af *vp90-2-02-size-66x16.webm.md5
+0acce9af12b13b025d5274013da7ef6f568f075f *vp90-2-02-size-66x18.webm
+77c4d53e2a5c96b70af9d575fe6811e0f5ee627b *vp90-2-02-size-66x18.webm.md5
+682b36a25774bbdedcd603f504d18eb63f0167d4 *vp90-2-02-size-66x32.webm
+53728fae2a428f16d376a29f341a64ddca97996a *vp90-2-02-size-66x32.webm.md5
+e71b70e901e29eaa6672a6aa4f37f6f5faa02bd6 *vp90-2-02-size-66x34.webm
+f69a6a555e3f614b0a35f9bfc313d8ebb35bc725 *vp90-2-02-size-66x34.webm.md5
+4151b8c29452d5c2266397a7b9bf688899a2937b *vp90-2-02-size-66x64.webm
+69486e7fd9e380b6c97a03d3e167affc79f73840 *vp90-2-02-size-66x64.webm.md5
+68784a1ecac776fe2a3f230345af32f06f123536 *vp90-2-02-size-66x66.webm
+7f008c7f48d55e652fbd6bac405b51e0015c94f2 *vp90-2-02-size-66x66.webm.md5
+7e1bc449231ac1c5c2a11c9a6333b3e828763798 *vp90-2-03-size-196x196.webm
+6788a561466dace32d500194bf042e19cccc35e1 *vp90-2-03-size-196x196.webm.md5
+a170c9a88ec1dd854c7a471ff55fb2a97ac31870 *vp90-2-03-size-196x198.webm
+6bf9d6a8e2bdc5bf4f8a78071a3fed5ca02ad6f2 *vp90-2-03-size-196x198.webm.md5
+68f861d21c4c8b03d572c3d3fcd9f4fbf1f4503f *vp90-2-03-size-196x200.webm
+bbfc260b2bfd872cc6054272bb6b7f959a9e1c6e *vp90-2-03-size-196x200.webm.md5
+fc34889feeca2b7e5b27b4f1ce22d2e2b8e3e4b1 *vp90-2-03-size-196x202.webm
+158ee72af578f39aad0c3b8f4cbed2fc78b57e0f *vp90-2-03-size-196x202.webm.md5
+dd28fb7247af534bdf5e6795a3ac429610489a0b *vp90-2-03-size-196x208.webm
+7546be847efce2d1c0a23f807bfb03f91b764e1e *vp90-2-03-size-196x208.webm.md5
+41d5cf5ed65b722a1b6dc035e67f978ea8ffecf8 *vp90-2-03-size-196x210.webm
+9444fdf632d6a1b6143f4cb10fed8f63c1d67ec1 *vp90-2-03-size-196x210.webm.md5
+5007bc618143437c009d6dde5fc2e86f72d37dc2 *vp90-2-03-size-196x224.webm
+858361d8f79b44df5545feabbc9754ec9ede632f *vp90-2-03-size-196x224.webm.md5
+0bcbe357fbc776c3fa68e7117179574ed7564a44 *vp90-2-03-size-196x226.webm
+72006a5f42031a43d70a2cd9fc1958962a86628f *vp90-2-03-size-196x226.webm.md5
+000239f048cceaac055558e97ef07078ebf65502 *vp90-2-03-size-198x196.webm
+2d6841901b72000c5340f30be602853438c1b787 *vp90-2-03-size-198x196.webm.md5
+ae75b766306a6404c3b3b35a6b6d53633c14fbdb *vp90-2-03-size-198x198.webm
+3f2544b4f3b4b643a98f2c3b15ea5826fc702fa1 *vp90-2-03-size-198x198.webm.md5
+95ffd573fa84ccef1cd59e1583e6054f56a5c83d *vp90-2-03-size-198x200.webm
+5d537e3c9b9c54418c79677543454c4cda3de1af *vp90-2-03-size-198x200.webm.md5
+ecc845bf574375f469bc91bf5c75c79dc00073d6 *vp90-2-03-size-198x202.webm
+1b59f5e111265615a7a459eeda8cc9045178d228 *vp90-2-03-size-198x202.webm.md5
+432fb27144fe421b9f51cf44d2750a26133ed585 *vp90-2-03-size-198x208.webm
+a58a67f4fb357c73ca078aeecbc0f782975630b1 *vp90-2-03-size-198x208.webm.md5
+ff5058e7e6a47435046612afc8536f2040989e6f *vp90-2-03-size-198x210.webm
+18d3be7935e52217e2e9400b6f2c681a9e45dc89 *vp90-2-03-size-198x210.webm.md5
+a0d55263c1ed2c03817454dd4ec4090d36dbc864 *vp90-2-03-size-198x224.webm
+efa366a299817e2da51c00623b165aab9fbb8d91 *vp90-2-03-size-198x224.webm.md5
+ccd142fa2920fc85bb753f049160c1c353ad1574 *vp90-2-03-size-198x226.webm
+534524a0b2dbff852e0b92ef09939db072f83243 *vp90-2-03-size-198x226.webm.md5
+0d483b94ed40abc8ab6e49f960432ee54ad9c7f1 *vp90-2-03-size-200x196.webm
+41795f548181717906e7a504ba551f06c32102ae *vp90-2-03-size-200x196.webm.md5
+f6c2dc54e0989d50f01333fe40c91661fcbf849a *vp90-2-03-size-200x198.webm
+43df5d8c46a40089441392e6d096c588c1079a68 *vp90-2-03-size-200x198.webm.md5
+2f6e9df82e44fc145f0d9212dcccbed3de605e23 *vp90-2-03-size-200x200.webm
+757b2ef96b82093255725bab9690bbafe27f3caf *vp90-2-03-size-200x200.webm.md5
+40c5ea60415642a4a2e75c0d127b06309baadfab *vp90-2-03-size-200x202.webm
+3022c4a1c625b5dc04fdb1052d17d45b4171cfba *vp90-2-03-size-200x202.webm.md5
+6942ed5b27476bb8506d10e600d6ff60887780ca *vp90-2-03-size-200x208.webm
+c4ab8c66f3cf2dc8e8dd7abae9ac21f4d32cd6be *vp90-2-03-size-200x208.webm.md5
+71dbc99b83c49d1da45589b91eabb98e2f4a7b1e *vp90-2-03-size-200x210.webm
+3f0b40da7eef7974b9bc326562f251feb67d9c7c *vp90-2-03-size-200x210.webm.md5
+6b6b8489081cfefb377cc5f18eb754ec2383f655 *vp90-2-03-size-200x224.webm
+a259df2ac0e294492e3f9d4315baa34cab044f04 *vp90-2-03-size-200x224.webm.md5
+c9adc1c9bb07559349a0b054df4af56f7a6edbb9 *vp90-2-03-size-200x226.webm
+714cec61e3575581e4f1a0e3921f4dfdbbd316c5 *vp90-2-03-size-200x226.webm.md5
+f9bdc936bdf53f8be9ce78fecd41a21d31ff3943 *vp90-2-03-size-202x196.webm
+5b8e2e50fcea2c43b12fc067b8a9cc117af77bda *vp90-2-03-size-202x196.webm.md5
+c7b66ea3da87613deb47ff24a111247d3c384fec *vp90-2-03-size-202x198.webm
+517e91204b25586da943556f4adc5951c9be8bee *vp90-2-03-size-202x198.webm.md5
+935ef56b01cfdb4265a7e24696645209ccb20970 *vp90-2-03-size-202x200.webm
+55b8ec4a2513183144a8e27564596c06c7576fce *vp90-2-03-size-202x200.webm.md5
+849acf75e4f1d8d90046704e1103a18c64f30e35 *vp90-2-03-size-202x202.webm
+c79afc6660df2824e7df314e5bfd71f0d8acf76b *vp90-2-03-size-202x202.webm.md5
+17b3a4d55576b770626ccb856b9f1a6c8f6ae476 *vp90-2-03-size-202x208.webm
+0b887ff30409c58f2ccdc3bfacd6be7c69f8997a *vp90-2-03-size-202x208.webm.md5
+032d0ade4230fb2eef6d19915a7a1c9aa4a52617 *vp90-2-03-size-202x210.webm
+f78f8e79533c0c88dd2bfdcec9b1c07848568ece *vp90-2-03-size-202x210.webm.md5
+915a38c31fe425d5b93c837121cfa8082f5ea5bc *vp90-2-03-size-202x224.webm
+bf52a104074d0c5942aa7a5b31e11db47e43d48e *vp90-2-03-size-202x224.webm.md5
+be5cfde35666fa435e47d544d9258215beb1cf29 *vp90-2-03-size-202x226.webm
+2fa2f87502fda756b319389c8975204e130a2e3f *vp90-2-03-size-202x226.webm.md5
+15d908e97862b5b4bf295610df011fb9aa09909b *vp90-2-03-size-208x196.webm
+50c60792305d6a99be376dd596a6ff979325e6cc *vp90-2-03-size-208x196.webm.md5
+a367c7bc9fde56d6f4848cc573c7d4c1ce75e348 *vp90-2-03-size-208x198.webm
+be85fb2c8d435a75484231356f07d06ebddd13cd *vp90-2-03-size-208x198.webm.md5
+05fd46deb7288e7253742091f56e54a9a441a187 *vp90-2-03-size-208x200.webm
+74f8ec3b3a2fe81767ed1ab36a47bc0062d6223c *vp90-2-03-size-208x200.webm.md5
+d8985c4b386513a7385a4b3639bf91e469f1378b *vp90-2-03-size-208x202.webm
+0614a1e8d92048852adcf605a51333f5fabc7f03 *vp90-2-03-size-208x202.webm.md5
+28b002242238479165ba4fb87ee6b442c64b32e4 *vp90-2-03-size-208x208.webm
+37de5aca59bb900228400b0e115d3229edb9dcc0 *vp90-2-03-size-208x208.webm.md5
+c545be0050c2fad7c68427dbf86c62a739e94ab3 *vp90-2-03-size-208x210.webm
+d646eccb3cd578f94b54777e32b88898bef6e17a *vp90-2-03-size-208x210.webm.md5
+63a0cfe295b661026dd7b1bebb67acace1db766f *vp90-2-03-size-208x224.webm
+85c0361d93bf85a335248fef2767ff43eeef23db *vp90-2-03-size-208x224.webm.md5
+f911cc718d66e4fe8a865226088939c9eb1b7825 *vp90-2-03-size-208x226.webm
+a6d583a57876e7b7ec48625b2b2cdbcf70cab837 *vp90-2-03-size-208x226.webm.md5
+5bbb0f36da9a4683cf04e724124d8696332911bf *vp90-2-03-size-210x196.webm
+a3580fc7816d7fbcfb54fdba501cabbd06ba2f1d *vp90-2-03-size-210x196.webm.md5
+8db64d6f9ce36dd382013b42ae4e292deba697bc *vp90-2-03-size-210x198.webm
+eda20f8268c7f4147bead4059e9c4897e09140a9 *vp90-2-03-size-210x198.webm.md5
+ce391505eeaf1d12406563101cd6b2dbbbb44bfc *vp90-2-03-size-210x200.webm
+79d73b7f623082d2a00aa33e95c79d11c7d9c3a8 *vp90-2-03-size-210x200.webm.md5
+852db6fdc206e72391fc69b807f1954934679949 *vp90-2-03-size-210x202.webm
+f69414c5677ed2f2b8b37ae76429e509a92276a5 *vp90-2-03-size-210x202.webm.md5
+c424cc3edd2308da7d33f27acb36b54db5bf2595 *vp90-2-03-size-210x208.webm
+27b18562faa1b3184256f4eae8114b539b3e9d3e *vp90-2-03-size-210x208.webm.md5
+dd029eba719d50a2851592fa8b9b2efe88904930 *vp90-2-03-size-210x210.webm
+c853a1670465eaa04ca31b3511995f1b6ed4f58f *vp90-2-03-size-210x210.webm.md5
+d962e8ae676c54d0c3ea04ec7c04b37ae6a786e3 *vp90-2-03-size-210x224.webm
+93b793e79d987065b39ad8e2e71244368435fc25 *vp90-2-03-size-210x224.webm.md5
+3d0825fe83bcc125be1f78145ff43ca6d7588784 *vp90-2-03-size-210x226.webm
+5230f31a57ca3b5311698a12035d2644533b3ec4 *vp90-2-03-size-210x226.webm.md5
+6622f8bd9279e1ce45509a58a31a990052d45e14 *vp90-2-03-size-224x196.webm
+65411da07f60113f2be05c807879072b161d561e *vp90-2-03-size-224x196.webm.md5
+6744ff2ee2c41eb08c62ff30880833b6d77b585b *vp90-2-03-size-224x198.webm
+46ea3641d41acd4bff347b224646c060d5620385 *vp90-2-03-size-224x198.webm.md5
+8eb91f3416a1404705f370caecd74b2b458351b1 *vp90-2-03-size-224x200.webm
+196aefb854c8b95b9330263d6690b7ee15693ecf *vp90-2-03-size-224x200.webm.md5
+256a5a23ef4e6d5ef2871af5afb8cd13d28cec00 *vp90-2-03-size-224x202.webm
+840ad8455dcf2be378c14b007e66fa642fc8196d *vp90-2-03-size-224x202.webm.md5
+db4606480ab48b96c9a6ff5e639f1f1aea2a12e4 *vp90-2-03-size-224x208.webm
+40b9801d5620467499ac70fa6b7c40aaa5e1c331 *vp90-2-03-size-224x208.webm.md5
+e37159e687fe1cb24cffddfae059301adbaf4212 *vp90-2-03-size-224x210.webm
+1e4acd4b6334ae260c3eed08652d0ba8122073f2 *vp90-2-03-size-224x210.webm.md5
+0de1eb4bb6285ae621e4f2b613d2aa4a8c95a130 *vp90-2-03-size-224x224.webm
+37db449ad86fb286c2c02d94aa8fe0379c05044a *vp90-2-03-size-224x224.webm.md5
+32ebbf903a7d7881bcfe59639f1d472371f3bf27 *vp90-2-03-size-224x226.webm
+5cc3ac5dc9f6912491aa2ddac863f8187f34c569 *vp90-2-03-size-224x226.webm.md5
+9480ff5c2c32b1870ac760c87514912616e6cf01 *vp90-2-03-size-226x196.webm
+fe83655c0f1888f0af7b047785f01ba7ca9f1324 *vp90-2-03-size-226x196.webm.md5
+09cad4221996315cdddad4e502dbfabf53ca1d6a *vp90-2-03-size-226x198.webm
+e3ddfdc650acb95adb45abd9b634e1f09ea8ac96 *vp90-2-03-size-226x198.webm.md5
+c34f49d55fe39e3f0b607e3cc95e30244225cecb *vp90-2-03-size-226x200.webm
+abb83edc868a3523ccd4e5523fac2efbe7c3df1f *vp90-2-03-size-226x200.webm.md5
+d17bc08eedfc60c4c23d576a6c964a21bf854d1f *vp90-2-03-size-226x202.webm
+1d22d2d0f375251c2d5a1acb4714bc35d963865b *vp90-2-03-size-226x202.webm.md5
+9bd537c4f92a25596ccd29fedfe181feac948b92 *vp90-2-03-size-226x208.webm
+6feb0e7325386275719f3511ada9e248a2ae7df4 *vp90-2-03-size-226x208.webm.md5
+4487067f6cedd495b93696b44b37fe0a3e7eda14 *vp90-2-03-size-226x210.webm
+49a8fa87945f47208168d541c068e78d878075d5 *vp90-2-03-size-226x210.webm.md5
+559fea2f8da42b33c1aa1dbc34d1d6781009847a *vp90-2-03-size-226x224.webm
+83c6d8f2969b759e10e5c6542baca1265c874c29 *vp90-2-03-size-226x224.webm.md5
+fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce *vp90-2-03-size-226x226.webm
+94ad19b8b699cea105e2ff18f0df2afd7242bcf7 *vp90-2-03-size-226x226.webm.md5
+52bc1dfd3a97b24d922eb8a31d07527891561f2a *vp90-2-03-size-352x288.webm
+3084d6d0a1eec22e85a394422fbc8faae58930a5 *vp90-2-03-size-352x288.webm.md5
+b6524e4084d15b5d0caaa3d3d1368db30cbee69c *vp90-2-03-deltaq.webm
+65f45ec9a55537aac76104818278e0978f94a678 *vp90-2-03-deltaq.webm.md5
+4dbb87494c7f565ffc266c98d17d0d8c7a5c5aba *vp90-2-05-resize.ivf
+7f6d8879336239a43dbb6c9f13178cb11cf7ed09 *vp90-2-05-resize.ivf.md5
+bf61ddc1f716eba58d4c9837d4e91031d9ce4ffe *vp90-2-06-bilinear.webm
+f6235f937552e11d8eb331ec55da6b3aa596b9ac *vp90-2-06-bilinear.webm.md5
+0c83a1e414fde3bccd6dc451bbaee68e59974c76 *vp90-2-07-frame_parallel.webm
+e5c2c9fb383e5bf3b563480adaeba5b7e3475ecd *vp90-2-07-frame_parallel.webm.md5
+086c7edcffd699ae7d99d710fd7e53b18910ca5b *vp90-2-08-tile_1x2_frame_parallel.webm
+e981ecaabb29a80e0cbc1f4002384965ce8e95bb *vp90-2-08-tile_1x2_frame_parallel.webm.md5
+ed79be026a6f28646c5825da1c12d1fbc70f96a4 *vp90-2-08-tile_1x2.webm
+45b404e025841c9750895fc1a9f6bd384fe6a315 *vp90-2-08-tile_1x2.webm.md5
+cf8ea970c776797aae71dac8317ea926d9431cab *vp90-2-08-tile_1x4_frame_parallel.webm
+a481fbea465010b57af5a19ebf6d4a5cfe5b9278 *vp90-2-08-tile_1x4_frame_parallel.webm.md5
+0203ec456277a01aec401e7fb6c72c9a7e5e3f9d *vp90-2-08-tile_1x4.webm
+c9b237dfcc01c1b414fbcaa481d014a906ef7998 *vp90-2-08-tile_1x4.webm.md5
+20c75157e91ab41f82f70ffa73d5d01df8469287 *vp90-2-08-tile-4x4.webm
+ae7451810247fd13975cc257aa0301ff17102255 *vp90-2-08-tile-4x4.webm.md5
+2ec6e15422ac7a61af072dc5f27fcaf1942ce116 *vp90-2-08-tile-4x1.webm
+0094f5ee5e46345017c30e0aa4835b550212d853 *vp90-2-08-tile-4x1.webm.md5
+edea45dac4a3c2e5372339f8851d24c9bef803d6 *vp90-2-09-subpixel-00.ivf
+5428efc4bf92191faedf4a727fcd1d94966a7abc *vp90-2-09-subpixel-00.ivf.md5
+8cdd435d89029987ee196896e21520e5f879f04d *vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm
+091b373aa2ecb59aa5c647affd5bcafcc7547364 *vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm
+87ee28032b0963a44b73a850fcc816a6dc83efbb *vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm
+c6ce25c4bfd4bdfc2932b70428e3dfe11210ec4f *vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm
+2064bdb22aa71c2691e0469fb62e8087a43f08f8 *vp90-2-bbb_426x240_tile_1x1_180kbps.webm
+8080eda22694910162f0996e8a962612f381a57f *vp90-2-bbb_640x360_tile_1x2_337kbps.webm
+a484b335c27ea189c0f0d77babea4a510ce12d50 *vp90-2-bbb_854x480_tile_1x2_651kbps.webm
+3eacf1f006250be4cc5c92a7ef146e385ee62653 *vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm
+217f089a16447490823127b36ce0d945522accfd *vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm
+eedb3c641e60dacbe082491a16df529a5c9187df *vp90-2-sintel_426x182_tile_1x1_171kbps.webm
+cb7e4955af183dff33bcba0c837f0922ab066400 *vp90-2-sintel_640x272_tile_1x2_318kbps.webm
+48613f9380e2580002f8a09d6e412ea4e89a52b9 *vp90-2-sintel_854x364_tile_1x2_621kbps.webm
+990a91f24dd284562d21d714ae773dff5452cad8 *vp90-2-tos_1280x534_tile_1x4_1306kbps.webm
+aa402217577a659cfc670157735b4b8e9aa670fe *vp90-2-tos_1280x534_tile_1x4_fpm_952kbps.webm
+b6dd558c90bca466b4bcbd03b3371648186465a7 *vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm
+1a9c2914ba932a38f0a143efc1ad0e318e78888b *vp90-2-tos_426x178_tile_1x1_181kbps.webm
+a3d2b09f24debad4747a1b3066f572be4273bced *vp90-2-tos_640x266_tile_1x2_336kbps.webm
+c64b03b5c090e6888cb39685c31f00a6b79fa45c *vp90-2-tos_854x356_tile_1x2_656kbps.webm
+94b533dbcf94292001e27cc51fec87f9e8c90c0b *vp90-2-tos_854x356_tile_1x2_fpm_546kbps.webm
+0e7cd4135b231c9cea8d76c19f9e84b6fd77acec *vp90-2-08-tile_1x8_frame_parallel.webm
+c9b6850af28579b031791066457f4cb40df6e1c7 *vp90-2-08-tile_1x8_frame_parallel.webm.md5
+e448b6e83490bca0f8d58b4f4b1126a17baf4b0c *vp90-2-08-tile_1x8.webm
+5e524165f0397e6141d914f4f0a66267d7658376 *vp90-2-08-tile_1x8.webm.md5
+a34e14923d6d17b1144254d8187d7f85b700a63c *vp90-2-02-size-lf-1920x1080.webm
+e3b28ddcfaeb37fb4d132b93f92642a9ad17c22d *vp90-2-02-size-lf-1920x1080.webm.md5
+d48c5db1b0f8e60521a7c749696b8067886033a3 *vp90-2-09-aq2.webm
+84c1599298aac78f2fc05ae2274575d10569dfa0 *vp90-2-09-aq2.webm.md5
+55fc55ed73d578ed60fad05692579873f8bad758 *vp90-2-09-lf_deltas.webm
+54638c38009198c38c8f3b25c182b709b6c1fd2e *vp90-2-09-lf_deltas.webm.md5
+510d95f3beb3b51c572611fdaeeece12277dac30 *vp90-2-10-show-existing-frame.webm
+14d631096f4bfa2d71f7f739aec1448fb3c33bad *vp90-2-10-show-existing-frame.webm.md5
+d2feea7728e8d2c615981d0f47427a4a5a45d881 *vp90-2-10-show-existing-frame2.webm
+5f7c7811baa3e4f03be1dd78c33971b727846821 *vp90-2-10-show-existing-frame2.webm.md5
+b4318e75f73a6a08992c7326de2fb589c2a794c7 *vp90-2-11-size-351x287.webm
+b3c48382cf7d0454e83a02497c229d27720f9e20 *vp90-2-11-size-351x287.webm.md5
+8e0096475ea2535bac71d3e2fc09e0c451c444df *vp90-2-11-size-351x288.webm
+19e003804ec1dfc5464813b32339a15d5ba7b42f *vp90-2-11-size-351x288.webm.md5
+40cd1d6a188d7a88b21ebac1e573d3f270ab261e *vp90-2-11-size-352x287.webm
+68f515abe3858fc1eded46c8e6b2f727d43b5331 *vp90-2-11-size-352x287.webm.md5
+9a510769ff23db410880ec3029d433e87d17f7fc *vp90-2-12-droppable_1.ivf
+952eaac6eefa6f62179ed1db3e922fd42fecc624 *vp90-2-12-droppable_1.ivf.md5
+9a510769ff23db410880ec3029d433e87d17f7fc *vp90-2-12-droppable_2.ivf
+92a756469fa438220524e7fa6ac1d38c89514d17 *vp90-2-12-droppable_2.ivf.md5
+c21e97e4ba486520118d78b01a5cb6e6dc33e190 *vp90-2-12-droppable_3.ivf
+601abc9e4176c70f82ac0381365e9b151fdd24cd *vp90-2-12-droppable_3.ivf.md5
+61c640dad23cd4f7ad811b867e7b7e3521f4e3ba *vp90-2-13-largescaling.webm
+bca1b02eebdb088fa3f389fe0e7571e75a71f523 *vp90-2-13-largescaling.webm.md5
+c740708fa390806eebaf669909c1285ab464f886 *vp90-2-14-resize-fp-tiles-1-2.webm
+c7b85ffd8e11500f73f52e7dc5a47f57c393d47f *vp90-2-14-resize-fp-tiles-1-2.webm.md5
+ec8faa352a08f7033c60f29f80d505e2d7daa103 *vp90-2-14-resize-fp-tiles-1-4.webm
+6852c783fb421bda5ded3d4c5a3ffc46de03fbc1 *vp90-2-14-resize-fp-tiles-1-4.webm.md5
+8af61853ac0d07c4cb5bf7c2016661ba350b3497 *vp90-2-14-resize-fp-tiles-1-8.webm
+571353bac89fea60b5706073409aa3c0d42aefe9 *vp90-2-14-resize-fp-tiles-1-8.webm.md5
+b1c187ed69931496b82ec194017a79831bafceef *vp90-2-14-resize-fp-tiles-1-16.webm
+1c199a41afe42ce303944d70089eaaa2263b4a09 *vp90-2-14-resize-fp-tiles-1-16.webm.md5
+8eaae5a6f2dff934610b0c7a917d7f583ba74aa5 *vp90-2-14-resize-fp-tiles-2-1.webm
+db18fcf915f7ffaea6c39feab8bda6c1688af011 *vp90-2-14-resize-fp-tiles-2-1.webm.md5
+bc3046d138941e2a20e9ceec0ff6d25c25d12af3 *vp90-2-14-resize-fp-tiles-4-1.webm
+393211b808030d09a79927b17a4374b2f68a60ae *vp90-2-14-resize-fp-tiles-4-1.webm.md5
+6e8f8e31721a0f7f68a2964e36e0e698c2e276b1 *vp90-2-14-resize-fp-tiles-8-1.webm
+491fd3cd78fb0577bfe905bb64bbf64bd7d29140 *vp90-2-14-resize-fp-tiles-8-1.webm.md5
+cc5958da2a7edf739cd2cfeb18bd05e77903087e *vp90-2-14-resize-fp-tiles-16-1.webm
+0b58daf55aaf9063bf5b4fb33393d18b417dc428 *vp90-2-14-resize-fp-tiles-16-1.webm.md5
+821eeecc9d8c6a316134dd42d1ff057787d8047b *vp90-2-14-resize-fp-tiles-2-4.webm
+374c549f2839a3d0b732c4e3650700144037e76c *vp90-2-14-resize-fp-tiles-2-4.webm.md5
+dff8c8e49aacea9f4c7f22cb882da984e2a1b405 *vp90-2-14-resize-fp-tiles-2-8.webm
+e5b8820a7c823b21297d6e889e57ec401882c210 *vp90-2-14-resize-fp-tiles-2-8.webm.md5
+77629e4b23e32896aadf6e994c78bd4ffa1c7797 *vp90-2-14-resize-fp-tiles-2-16.webm
+1937f5df032664ac345d4613ad4417b4967b1230 *vp90-2-14-resize-fp-tiles-2-16.webm.md5
+380ba5702bb1ec7947697314ab0300b5c56a1665 *vp90-2-14-resize-fp-tiles-4-2.webm
+fde7b30d2aa64c1e851a4852f655d79fc542cf66 *vp90-2-14-resize-fp-tiles-4-2.webm.md5
+dc784b258ffa2abc2ae693d11792acf0bb9cb74f *vp90-2-14-resize-fp-tiles-8-2.webm
+edf26f0130aeee8342d49c2c8f0793ad008782d9 *vp90-2-14-resize-fp-tiles-8-2.webm.md5
+8e575789fd63ebf69e8eff1b9a4351a249a73bee *vp90-2-14-resize-fp-tiles-16-2.webm
+b6415318c1c589a1f64b9d569ce3cabbec2e0d52 *vp90-2-14-resize-fp-tiles-16-2.webm.md5
+e3adc944a11c4c5517e63664c84ebb0847b64d81 *vp90-2-14-resize-fp-tiles-4-8.webm
+03cba0532bc90a05b1990db830bf5701e24e7982 *vp90-2-14-resize-fp-tiles-4-8.webm.md5
+3b27a991eb6d78dce38efab35b7db682e8cbbee3 *vp90-2-14-resize-fp-tiles-4-16.webm
+5d16b7f82bf59f802724ddfd97abb487150b1c9d *vp90-2-14-resize-fp-tiles-4-16.webm.md5
+d5fed8c28c1d4c7e232ebbd25cf758757313ed96 *vp90-2-14-resize-fp-tiles-8-4.webm
+5a8ff8a52cbbde7bfab569beb6d971c5f8b904f7 *vp90-2-14-resize-fp-tiles-8-4.webm.md5
+17a5faa023d77ee9dad423a4e0d3145796bbc500 *vp90-2-14-resize-fp-tiles-16-4.webm
+2ef8daa3c3e750fd745130d0a76a39fe86f0448f *vp90-2-14-resize-fp-tiles-16-4.webm.md5
+9361e031f5cc990d8740863e310abb5167ae351e *vp90-2-14-resize-fp-tiles-8-16.webm
+57f13a2197486584f4e1a4f82ad969f3abc5a1a2 *vp90-2-14-resize-fp-tiles-8-16.webm.md5
+5803fc6fcbfb47b7661f3fcc6499158a32b56675 *vp90-2-14-resize-fp-tiles-16-8.webm
+be0fe64a1a4933696ff92d93f9bdecdbd886dc13 *vp90-2-14-resize-fp-tiles-16-8.webm.md5
+0ac0f6d20a0afed77f742a3b9acb59fd7b9cb093 *vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm
+1765315acccfe6cd12230e731369fcb15325ebfa *vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm.md5
+4a2b7a683576fe8e330c7d1c4f098ff4e70a43a8 *vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
+1ef480392112b3509cb190afbb96f9a38dd9fbac *vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm.md5
+e615575ded499ea1d992f3b38e3baa434509cdcd *vp90-2-15-segkey.webm
+e3ab35d4316c5e81325c50f5236ceca4bc0d35df *vp90-2-15-segkey.webm.md5
+9b7ca2cac09d34c4a5d296c1900f93b1e2f69d0d *vp90-2-15-segkey_adpq.webm
+8f46ba5f785d0c2170591a153e0d0d146a7c8090 *vp90-2-15-segkey_adpq.webm.md5
+698a6910a97486b833073ef0c0b18d75dce57ee8 *vp90-2-16-intra-only.webm
+5661b0168752969f055eec37b05fa9fa947dc7eb *vp90-2-16-intra-only.webm.md5
+c01bb7938f9a9f25e0c37afdec2f2fb73b6cc7fa *vp90-2-17-show-existing-frame.webm
+cc75f351818b9a619818f5cc77b9bc013d0c1e11 *vp90-2-17-show-existing-frame.webm.md5
+013708bd043f0821a3e56fb8404d82e7a0c7af6c *vp91-2-04-yuv422.webm
+1e58a7d23adad830a672f1733c9d2ae17890d59c *vp91-2-04-yuv422.webm.md5
+25d78f28948789d159a9453ebc13048b818251b1 *vp91-2-04-yuv440.webm
+81b3870b27a7f695ef6a43e87ab04bbdb5aee2f5 *vp91-2-04-yuv440.webm.md5
+0321d507ce62dedc8a51b4e9011f7a19aed9c3dc *vp91-2-04-yuv444.webm
+367e423dd41fdb49aa028574a2cfec5c2f325c5c *vp91-2-04-yuv444.webm.md5
+f77673b566f686853adefe0c578ad251b7241281 *vp92-2-20-10bit-yuv420.webm
+abdedfaddacbbe1a15ac7a54e86360f03629fb7a *vp92-2-20-10bit-yuv420.webm.md5
+0c2c355a1b17b28537c5a3b19997c8783b69f1af *vp92-2-20-12bit-yuv420.webm
+afb2c2798703e039189b0a15c8ac5685aa51d33f *vp92-2-20-12bit-yuv420.webm.md5
+0d661bc6e83da33238981481efd1b1802d323d88 *vp93-2-20-10bit-yuv422.webm
+10318907063db22eb02fad332556edbbecd443cc *vp93-2-20-10bit-yuv422.webm.md5
+ebc6be2f7511a0bdeac0b18c67f84ba7168839c7 *vp93-2-20-12bit-yuv422.webm
+235232267c6a1dc8a11e45d600f1c99d2f8b42d4 *vp93-2-20-12bit-yuv422.webm.md5
+f76b11b26d4beaceac7a7e7729dd5054d095164f *vp93-2-20-10bit-yuv440.webm
+757b33b5ac969c5999999488a731a3d1e6d9fb88 *vp93-2-20-10bit-yuv440.webm.md5
+df8807dbd29bec795c2db9c3c18e511fbb988101 *vp93-2-20-12bit-yuv440.webm
+ea4100930c3f59a1c23fbb33ab0ea01151cae159 *vp93-2-20-12bit-yuv440.webm.md5
+189c1b5f404ff41a50a7fc96341085ad541314a9 *vp93-2-20-10bit-yuv444.webm
+2dd0177c2f9d970b6e698892634c653630f91f40 *vp93-2-20-10bit-yuv444.webm.md5
+bd44cf6e1c27343e3639df9ac21346aedd5d6973 *vp93-2-20-12bit-yuv444.webm
+f36e5bdf5ec3213f32c0ddc82f95d82c5133bf27 *vp93-2-20-12bit-yuv444.webm.md5
+eb438c6540eb429f74404eedfa3228d409c57874 *desktop_640_360_30.yuv
+89e70ebd22c27d275fe14dc2f1a41841a6d8b9ab *kirland_640_480_30.yuv
+33c533192759e5bb4f07abfbac389dc259db4686 *macmarcomoving_640_480_30.yuv
+8bfaab121080821b8f03b23467911e59ec59b8fe *macmarcostationary_640_480_30.yuv
+70894878d916a599842d9ad0dcd24e10c13e5467 *niklas_640_480_30.yuv
+8784b6df2d8cc946195a90ac00540500d2e522e4 *tacomanarrows_640_480_30.yuv
+edd86a1f5e62fd9da9a9d46078247759c2638009 *tacomasmallcameramovement_640_480_30.yuv
+9a70e8b7d14fba9234d0e51dce876635413ce444 *thaloundeskmtg_640_480_30.yuv
+e7d315dbf4f3928779e0dc624311196d44491d32 *niklas_1280_720_30.yuv
+c77e4a26616add298a05dd5d12397be22c0e40c5 *vp90-2-18-resize.ivf
+c12918cf0a716417fba2de35c3fc5ab90e52dfce *vp90-2-18-resize.ivf.md5
+717da707afcaa1f692ff1946f291054eb75a4f06 *screendata.y4m
+b7c1296630cdf1a7ef493d15ff4f9eb2999202f6 *invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8 *invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
+359e138dfb66863828397b77000ea7a83c844d02 *invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf
+bbd33de01c17b165b4ce00308e8a19a942023ab8 *invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf.res
+fac89b5735be8a86b0dc05159f996a5c3208ae32 *invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8 *invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res
+4506dfdcdf8ee4250924b075a0dcf1f070f72e5a *invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf
+bcdedaf168ac225575468fda77502d2dc9fd5baa *invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res
+65e93f9653bcf65b022f7d225268d1a90a76e7bb *vp90-2-19-skip.webm
+368dccdde5288c13c25695d2eacdc7402cadf613 *vp90-2-19-skip.webm.md5
+ffe460282df2b0e7d4603c2158653ad96f574b02 *vp90-2-19-skip-01.webm
+bd21bc9eda4a4a36b221d71ede3a139fc3c7bd85 *vp90-2-19-skip-01.webm.md5
+178f5bd239e38cc1cc2657a7a5e1a9f52ad2d3fe *vp90-2-19-skip-02.webm
+9020d5e260bd7df08e2b3d4b86f8623cee3daea2 *vp90-2-19-skip-02.webm.md5
+b03c408cf23158638da18dbc3323b99a1635c68a *invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8 *invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
+5e67e24e7f53fd189e565513cef8519b1bd6c712 *invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
+741158f67c0d9d23726624d06bdc482ad368afc9 *invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
+8b1f7bf7e86c0976d277f60e8fcd9539e75a079a *invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf
+9c6bdf048fb2e66f07d4b4db5b32e6f303bd6109 *invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res
+552e372e9b78127389fb06b34545df2cec15ba6d *invalid-vp91-2-mixedrefcsp-444to420.ivf
+a61774cf03fc584bd9f0904fc145253bb8ea6c4c *invalid-vp91-2-mixedrefcsp-444to420.ivf.res
+812d05a64a0d83c1b504d0519927ddc5a2cdb273 *invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
+1e472baaf5f6113459f0399a38a5a5e68d17799d *invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
+f97088c7359fc8d3d5aa5eafe57bc7308b3ee124 *vp90-2-20-big_superframe-01.webm
+47d7d409785afa33b123376de0c907336e6c7bd7 *vp90-2-20-big_superframe-01.webm.md5
+65ade6d2786209582c50d34cfe22b3cdb033abaf *vp90-2-20-big_superframe-02.webm
+7c0ed8d04c4d06c5411dd2e5de2411d37f092db5 *vp90-2-20-big_superframe-02.webm.md5
+667ec8718c982aef6be07eb94f083c2efb9d2d16 *vp90-2-07-frame_parallel-1.webm
+bfc82bf848e9c05020d61e3ffc1e62f25df81d19 *vp90-2-07-frame_parallel-1.webm.md5
+efd5a51d175cfdacd169ed23477729dc558030dc *invalid-vp90-2-07-frame_parallel-1.webm
+9f912712ec418be69adb910e2ca886a63c4cec08 *invalid-vp90-2-07-frame_parallel-2.webm
+445f5a53ca9555341852997ccdd480a51540bd14 *invalid-vp90-2-07-frame_parallel-3.webm
+d18c90709a0d03c82beadf10898b27d88fff719c *invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf
+d06285d109ecbaef63b0cbcc44d70a129186f51c *invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf.res
+e60d859b0ef2b331b21740cf6cb83fabe469b079 *invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf
+0ae808dca4d3c1152a9576e14830b6faa39f1b4a *invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf.res
+9cfc855459e7549fd015c79e8eca512b2f2cb7e3 *niklas_1280_720_30.y4m
+5b5763b388b1b52a81bb82b39f7ec25c4bd3d0e1 *desktop_credits.y4m
+85771f6ab44e4a0226e206c0cde8351dd5918953 *vp90-2-02-size-130x132.webm
+512dad5eabbed37b4bbbc64ce153f1a5484427b8 *vp90-2-02-size-130x132.webm.md5
+01f7127d40360289db63b27f61cb9afcda350e95 *vp90-2-02-size-132x130.webm
+4a94275328ae076cf60f966c097a8721010fbf5a *vp90-2-02-size-132x130.webm.md5
+f41c0400b5716b4b70552c40dd03d44be131e1cc *vp90-2-02-size-132x132.webm
+1a69e989f697e424bfe3e3e8a77bb0c0992c8e47 *vp90-2-02-size-132x132.webm.md5
+94a5cbfacacba100e0c5f7861c72a1b417feca0f *vp90-2-02-size-178x180.webm
+dedfecf1d784bcf70629592fa5e6f01d5441ccc9 *vp90-2-02-size-178x180.webm.md5
+4828b62478c04014bba3095a83106911a71cf387 *vp90-2-02-size-180x178.webm
+423da2b861050c969d78ed8e8f8f14045d1d8199 *vp90-2-02-size-180x178.webm.md5
+338f7c9282f43e29940f5391118aadd17e4f9234 *vp90-2-02-size-180x180.webm
+6c2ef013392310778dca5dd5351160eca66b0a60 *vp90-2-02-size-180x180.webm.md5
+679fa7d6807e936ff937d7b282e7dbd8ac76447e *vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm
+fc7267ab8fc2bf5d6c234e34ee6c078a967b4888 *vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm.md5
+9d33a137c819792209c5ce4e4e1ee5da73d574fe *vp90-2-14-resize-10frames-fp-tiles-1-2.webm
+0c78a154956a8605d050bdd75e0dcc4d39c040a6 *vp90-2-14-resize-10frames-fp-tiles-1-2.webm.md5
+d6a8d8c57f66a91d23e8e7df480f9ae841e56c37 *vp90-2-14-resize-10frames-fp-tiles-1-4.webm
+e9b4e8c7b33b5fda745d340c3f47e6623ae40cf2 *vp90-2-14-resize-10frames-fp-tiles-1-4.webm.md5
+aa6fe043a0c4a42b49c87ebbe812d4afd9945bec *vp90-2-14-resize-10frames-fp-tiles-1-8.webm
+028520578994c2d013d4c0129033d4f2ff31bbe0 *vp90-2-14-resize-10frames-fp-tiles-1-8.webm.md5
+d1d5463c9ea7b5cc5f609ddedccddf656f348d1a *vp90-2-14-resize-10frames-fp-tiles-2-1.webm
+92d5872f5bdffbed721703b7e959b4f885e3d77a *vp90-2-14-resize-10frames-fp-tiles-2-1.webm.md5
+677cb29de1215d97346015af5807a9b1faad54cf *vp90-2-14-resize-10frames-fp-tiles-2-4.webm
+a5db19f977094ec3fd60b4f7671b3e6740225e12 *vp90-2-14-resize-10frames-fp-tiles-2-4.webm.md5
+cdd3c52ba21067efdbb2de917fe2a965bf27332e *vp90-2-14-resize-10frames-fp-tiles-2-8.webm
+db17ec5d894ea8b8d0b7f32206d0dd3d46dcfa6d *vp90-2-14-resize-10frames-fp-tiles-2-8.webm.md5
+0f6093c472125d05b764d7d1965c1d56771c0ea2 *vp90-2-14-resize-10frames-fp-tiles-4-1.webm
+bc7c79e1bee07926dd970462ce6f64fc30eec3e1 *vp90-2-14-resize-10frames-fp-tiles-4-1.webm.md5
+c5142e2bff4091338196c8ea8bc9266e64f548bc *vp90-2-14-resize-10frames-fp-tiles-4-2.webm
+22aa3dd430b69fd3d92f6561bac86deeed90486d *vp90-2-14-resize-10frames-fp-tiles-4-2.webm.md5
+ede8b1466d2f26e1b1bd9602addb9cd1017e1d8c *vp90-2-14-resize-10frames-fp-tiles-4-8.webm
+508d5ebb9c0eac2a4100281a3ee052ec2fc19217 *vp90-2-14-resize-10frames-fp-tiles-4-8.webm.md5
+2b292e3392854cd1d76ae597a6f53656cf741cfa *vp90-2-14-resize-10frames-fp-tiles-8-1.webm
+1c24e54fa19e94e1722f24676404444e941c3d31 *vp90-2-14-resize-10frames-fp-tiles-8-1.webm.md5
+61beda21064e09634564caa6697ab90bd53c9af7 *vp90-2-14-resize-10frames-fp-tiles-8-2.webm
+9c0657b4d9e1d0e4c9d28a90e5a8630a65519124 *vp90-2-14-resize-10frames-fp-tiles-8-2.webm.md5
+1758c50a11a7c92522749b4a251664705f1f0d4b *vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm
+4f454a06750614314ae15a44087b79016fe2db97 *vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm.md5
+3920c95ba94f1f048a731d9d9b416043b44aa4bd *vp90-2-14-resize-10frames-fp-tiles-8-4.webm
+4eb347a0456d2c49a1e1d8de5aa1c51acc39887e *vp90-2-14-resize-10frames-fp-tiles-8-4.webm.md5
+4b95a74c032a473b6683d7ad5754db1b0ec378e9 *vp90-2-21-resize_inter_1280x720_5_1-2.webm
+a7826dd386bedfe69d02736969bfb47fb6a40a5e *vp90-2-21-resize_inter_1280x720_5_1-2.webm.md5
+5cfff79e82c4d69964ccb8e75b4f0c53b9295167 *vp90-2-21-resize_inter_1280x720_5_3-4.webm
+a18f57db4a25e1f543a99f2ceb182e00db0ee22f *vp90-2-21-resize_inter_1280x720_5_3-4.webm.md5
+d26db0811bf30eb4131d928669713e2485f8e833 *vp90-2-21-resize_inter_1280x720_7_1-2.webm
+fd6f9f332cd5bea4c0f0d57be4297bea493cc5a1 *vp90-2-21-resize_inter_1280x720_7_1-2.webm.md5
+5c7d73d4d268e2ba9593b31cb091fd339505c7fd *vp90-2-21-resize_inter_1280x720_7_3-4.webm
+7bbb949cabc1e70dadcc74582739f63b833034e0 *vp90-2-21-resize_inter_1280x720_7_3-4.webm.md5
+f2d2a41a60eb894aff0c5854afca15931f1445a8 *vp90-2-21-resize_inter_1920x1080_5_1-2.webm
+66d7789992613ac9d678ff905ff1059daa1b89e4 *vp90-2-21-resize_inter_1920x1080_5_1-2.webm.md5
+764edb75fe7dd64e73a1b4f3b4b2b1bf237a4dea *vp90-2-21-resize_inter_1920x1080_5_3-4.webm
+f78bea1075983fd990e7f25d4f31438f9b5efa34 *vp90-2-21-resize_inter_1920x1080_5_3-4.webm.md5
+96496f2ade764a5de9f0c27917c7df1f120fb2ef *vp90-2-21-resize_inter_1920x1080_7_1-2.webm
+2632b635135ed5ecd67fd22dec7990d29c4f4cb5 *vp90-2-21-resize_inter_1920x1080_7_1-2.webm.md5
+74889ea42001bf41428cb742ca74e65129c886dc *vp90-2-21-resize_inter_1920x1080_7_3-4.webm
+d2cf3b25956415bb579d368e7098097e482dd73a *vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5
+4658986a8ce36ebfcc80a1903e446eaab3985336 *vp90-2-21-resize_inter_320x180_5_1-2.webm
+8a3d8cf325109ffa913cc9426c32eea8c202a09a *vp90-2-21-resize_inter_320x180_5_1-2.webm.md5
+16303aa45176520ee42c2c425247aadc1506b881 *vp90-2-21-resize_inter_320x180_5_3-4.webm
+41cab1ddf7715b680a4dbce42faa9bcd72af4e5c *vp90-2-21-resize_inter_320x180_5_3-4.webm.md5
+56648adcee66dd0e5cb6ac947f5ee1b9cc8ba129 *vp90-2-21-resize_inter_320x180_7_1-2.webm
+70047377787003cc03dda7b2394e6d7eaa666d9e *vp90-2-21-resize_inter_320x180_7_1-2.webm.md5
+d2ff99165488499cc55f75929f1ce5ca9c9e359b *vp90-2-21-resize_inter_320x180_7_3-4.webm
+e69019e378114a4643db283b66d1a7e304761a56 *vp90-2-21-resize_inter_320x180_7_3-4.webm.md5
+4834d129bed0f4289d3a88f2ae3a1736f77621b0 *vp90-2-21-resize_inter_320x240_5_1-2.webm
+a75653c53d22b623c1927fc0088da21dafef21f4 *vp90-2-21-resize_inter_320x240_5_1-2.webm.md5
+19818e1b7fd1c1e63d8873c31b0babe29dd33ba6 *vp90-2-21-resize_inter_320x240_5_3-4.webm
+8d89814ff469a186312111651b16601dfbce4336 *vp90-2-21-resize_inter_320x240_5_3-4.webm.md5
+ac8057bae52498f324ce92a074d5f8207cc4a4a7 *vp90-2-21-resize_inter_320x240_7_1-2.webm
+2643440898c83c08cc47bc744245af696b877c24 *vp90-2-21-resize_inter_320x240_7_1-2.webm.md5
+cf4a4cd38ac8b18c42d8c25a3daafdb39132256b *vp90-2-21-resize_inter_320x240_7_3-4.webm
+70ba8ec9120b26e9b0ffa2c79b432f16cbcb50ec *vp90-2-21-resize_inter_320x240_7_3-4.webm.md5
+669f10409fe1c4a054010162ca47773ea1fdbead *vp90-2-21-resize_inter_640x360_5_1-2.webm
+6355a04249004a35fb386dd1024214234f044383 *vp90-2-21-resize_inter_640x360_5_1-2.webm.md5
+c23763b950b8247c1775d1f8158d93716197676c *vp90-2-21-resize_inter_640x360_5_3-4.webm
+59e6fc381e3ec3b7bdaac586334e0bc944d18fb6 *vp90-2-21-resize_inter_640x360_5_3-4.webm.md5
+71b45cbfdd068baa1f679a69e5e6f421d256a85f *vp90-2-21-resize_inter_640x360_7_1-2.webm
+1416fc761b690c54a955c4cf017fa078520e8c18 *vp90-2-21-resize_inter_640x360_7_1-2.webm.md5
+6c409903279448a697e4db63bab1061784bcd8d2 *vp90-2-21-resize_inter_640x360_7_3-4.webm
+60de1299793433a630b71130cf76c9f5965758e2 *vp90-2-21-resize_inter_640x360_7_3-4.webm.md5
+852b597b8af096d90c80bf0ed6ed3b336b851f19 *vp90-2-21-resize_inter_640x480_5_1-2.webm
+f6856f19236ee46ed462bd0a2e7e72b9c3b9cea6 *vp90-2-21-resize_inter_640x480_5_1-2.webm.md5
+792a16c6f60043bd8dceb515f0b95b8891647858 *vp90-2-21-resize_inter_640x480_5_3-4.webm
+68ffe59877e9a7863805e1c0a3ce18ce037d7c9d *vp90-2-21-resize_inter_640x480_5_3-4.webm.md5
+61e044c4759972a35ea3db8c1478a988910a4ef4 *vp90-2-21-resize_inter_640x480_7_1-2.webm
+7739bfca167b1b43fea72f807f01e097b7cb98d8 *vp90-2-21-resize_inter_640x480_7_1-2.webm.md5
+7291af354b4418917eee00e3a7e366086a0b7a10 *vp90-2-21-resize_inter_640x480_7_3-4.webm
+4a18b09ccb36564193f0215f599d745d95bb558c *vp90-2-21-resize_inter_640x480_7_3-4.webm.md5
diff --git a/libs/libvpx/test/test.mk b/libs/libvpx/test/test.mk
new file mode 100644
index 0000000000..e8e830489b
--- /dev/null
+++ b/libs/libvpx/test/test.mk
@@ -0,0 +1,185 @@
+LIBVPX_TEST_SRCS-yes += acm_random.h
+LIBVPX_TEST_SRCS-yes += clear_system_state.h
+LIBVPX_TEST_SRCS-yes += codec_factory.h
+LIBVPX_TEST_SRCS-yes += md5_helper.h
+LIBVPX_TEST_SRCS-yes += register_state_check.h
+LIBVPX_TEST_SRCS-yes += test.mk
+LIBVPX_TEST_SRCS-yes += test_libvpx.cc
+LIBVPX_TEST_SRCS-yes += test_vectors.cc
+LIBVPX_TEST_SRCS-yes += test_vectors.h
+LIBVPX_TEST_SRCS-yes += util.h
+LIBVPX_TEST_SRCS-yes += video_source.h
+
+##
+## BLACK BOX TESTS
+##
+## Black box tests only use the public API.
+##
+LIBVPX_TEST_SRCS-yes                   += ../md5_utils.h ../md5_utils.c
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += aq_segment_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += datarate_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += encode_api_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += resize_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += yuv_video_source.h
+
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
+
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += invalid_file_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc
+
+LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
+LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += encode_test_driver.cc
+LIBVPX_TEST_SRCS-yes                   += encode_test_driver.h
+
+## IVF writing.
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../ivfenc.c ../ivfenc.h
+
+## Y4m parsing.
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_test.cc ../y4menc.c ../y4menc.h
+
+## WebM parsing.
+ifeq ($(CONFIG_WEBM_IO), yes)
+LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvparser.cpp
+LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvreader.cpp
+LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvparser.hpp
+LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvreader.hpp
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += $(LIBWEBM_PARSER_SRCS)
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../tools_common.h
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.cc
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.h
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += webm_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_skip_loopfilter_test.cc
+endif
+
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += decode_api_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += test_vector_test.cc
+
+# Currently we only support decoder perf tests for vp9. Also they read from WebM
+# files, so WebM IO is required.
+ifeq ($(CONFIG_DECODE_PERF_TESTS)$(CONFIG_VP9_DECODER)$(CONFIG_WEBM_IO), \
+      yesyesyes)
+LIBVPX_TEST_SRCS-yes                   += decode_perf_test.cc
+endif
+
+# encode perf tests are vp9 only
+ifeq ($(CONFIG_ENCODE_PERF_TESTS)$(CONFIG_VP9_ENCODER), yesyes)
+LIBVPX_TEST_SRCS-yes += encode_perf_test.cc
+endif
+
+##
+## WHITE BOX TESTS
+##
+## Whitebox tests invoke functions not exposed via the public API. Certain
+## shared library builds don't make these functions accessible.
+##
+ifeq ($(CONFIG_SHARED),)
+
+## VP8
+ifeq ($(CONFIG_VP8),yes)
+
+# These tests require both the encoder and decoder to be built.
+ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
+LIBVPX_TEST_SRCS-yes                   += vp8_boolcoder_test.cc
+LIBVPX_TEST_SRCS-yes                   += vp8_fragments_test.cc
+endif
+
+LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
+
+LIBVPX_TEST_SRCS-yes                   += idct_test.cc
+LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
+LIBVPX_TEST_SRCS-yes                   += vpx_scale_test.cc
+
+ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes)
+LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp8_denoiser_sse2_test.cc
+endif
+
+endif # VP8
+
+## VP9
+ifeq ($(CONFIG_VP9),yes)
+
+# These tests require both the encoder and decoder to be built.
+ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),yesyes)
+# IDCT test currently depends on FDCT function
+LIBVPX_TEST_SRCS-yes                   += idct8x8_test.cc
+LIBVPX_TEST_SRCS-yes                   += partial_idct_test.cc
+LIBVPX_TEST_SRCS-yes                   += superframe_test.cc
+LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc
+LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
+LIBVPX_TEST_SRCS-yes                   += vp9_encoder_parms_get_to_decoder.cc
+endif
+
+LIBVPX_TEST_SRCS-yes                   += convolve_test.cc
+LIBVPX_TEST_SRCS-yes                   += lpf_8_test.cc
+LIBVPX_TEST_SRCS-yes                   += vp9_intrapred_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_decrypt_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
+
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc
+endif
+
+ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
+LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp9_denoiser_sse2_test.cc
+endif
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc
+
+endif # VP9
+
+## VP10
+ifeq ($(CONFIG_VP10),yes)
+
+LIBVPX_TEST_SRCS-yes                    += vp10_inv_txfm_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
+
+endif # VP10
+
+## Multi-codec / unconditional whitebox tests.
+
+ifeq ($(findstring yes,$(CONFIG_VP9_ENCODER)$(CONFIG_VP10_ENCODER)),yes)
+LIBVPX_TEST_SRCS-yes += avg_test.cc
+endif
+
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
+
+TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
+TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
+
+endif # CONFIG_SHARED
+
+include $(SRC_PATH_BARE)/test/test-data.mk
diff --git a/libs/libvpx/test/test_intra_pred_speed.cc b/libs/libvpx/test/test_intra_pred_speed.cc
new file mode 100644
index 0000000000..3e65fecfb6
--- /dev/null
+++ b/libs/libvpx/test/test_intra_pred_speed.cc
@@ -0,0 +1,372 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+//  Test and time VPX intra-predictor functions
+
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/md5_helper.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
+
+// -----------------------------------------------------------------------------
+
+namespace {
+
+typedef void (*VpxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
+                            const uint8_t *above, const uint8_t *left);
+
+const int kNumVp9IntraPredFuncs = 13;
+const char *kVp9IntraPredNames[kNumVp9IntraPredFuncs] = {
+  "DC_PRED", "DC_LEFT_PRED", "DC_TOP_PRED", "DC_128_PRED", "V_PRED", "H_PRED",
+  "D45_PRED", "D135_PRED", "D117_PRED", "D153_PRED", "D207_PRED", "D63_PRED",
+  "TM_PRED"
+};
+
+void TestIntraPred(const char name[], VpxPredFunc const *pred_funcs,
+                   const char *const pred_func_names[], int num_funcs,
+                   const char *const signatures[], int block_size,
+                   int num_pixels_per_test) {
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  const int kBPS = 32;
+  const int kTotalPixels = 32 * kBPS;
+  DECLARE_ALIGNED(16, uint8_t, src[kTotalPixels]);
+  DECLARE_ALIGNED(16, uint8_t, ref_src[kTotalPixels]);
+  DECLARE_ALIGNED(16, uint8_t, left[kBPS]);
+  DECLARE_ALIGNED(16, uint8_t, above_mem[2 * kBPS + 16]);
+  uint8_t *const above = above_mem + 16;
+  for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand8();
+  for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand8();
+  for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand8();
+  const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
+
+  // Replicate the last valid above pixel to the right of the block: some
+  // predictors (e.g. the d45/d63 C code, unlike the assembly) read past it.
+  // TODO(jzern): this style of extension isn't strictly necessary.
+  ASSERT_LE(block_size, kBPS);
+  memset(above + block_size, above[block_size - 1], 2 * kBPS - block_size);
+
+  for (int k = 0; k < num_funcs; ++k) {
+    if (pred_funcs[k] == NULL) continue;
+    memcpy(src, ref_src, sizeof(src));
+    vpx_usec_timer timer;
+    vpx_usec_timer_start(&timer);
+    for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
+      pred_funcs[k](src, kBPS, above, left);
+    }
+    libvpx_test::ClearSystemState();
+    vpx_usec_timer_mark(&timer);
+    const int elapsed_time =
+        static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+    libvpx_test::MD5 md5;
+    md5.Add(src, sizeof(src));
+    printf("Mode %s[%12s]: %5d ms     MD5: %s\n", name, pred_func_names[k],
+           elapsed_time, md5.Get());
+    EXPECT_STREQ(signatures[k], md5.Get());
+  }
+}
+
+void TestIntraPred4(VpxPredFunc const *pred_funcs) {
+  const int kNumFuncs = kNumVp9IntraPredFuncs;  // one MD5 per named predictor
+  static const char *const kSignatures[kNumFuncs] = {
+    "4334156168b34ab599d9b5b30f522fe9",
+    "bc4649d5ba47c7ff178d92e475960fb0",
+    "8d316e5933326dcac24e1064794b5d12",
+    "a27270fed024eafd762c95de85f4da51",
+    "c33dff000d4256c2b8f3bf9e9bab14d2",
+    "44d8cddc2ad8f79b8ed3306051722b4f",
+    "eb54839b2bad6699d8946f01ec041cd0",
+    "ecb0d56ae5f677ea45127ce9d5c058e4",
+    "0b7936841f6813da818275944895b574",
+    "9117972ef64f91a58ff73e1731c81db2",
+    "c56d5e8c729e46825f46dd5d3b5d508a",
+    "c0889e2039bcf7bcb5d2f33cdca69adc",
+    "309a618577b27c648f9c5ee45252bc8f",
+  };
+  TestIntraPred("Intra4", pred_funcs, kVp9IntraPredNames, kNumFuncs,
+                kSignatures, 4, 4 * 4 * kNumFuncs);
+}
+
+void TestIntraPred8(VpxPredFunc const *pred_funcs) {
+  const int kNumFuncs = kNumVp9IntraPredFuncs;  // one MD5 per named predictor
+  static const char *const kSignatures[kNumFuncs] = {
+    "7694ddeeefed887faf9d339d18850928",
+    "7d726b1213591b99f736be6dec65065b",
+    "19c5711281357a485591aaf9c96c0a67",
+    "ba6b66877a089e71cd938e3b8c40caac",
+    "802440c93317e0f8ba93fab02ef74265",
+    "9e09a47a15deb0b9d8372824f9805080",
+    "b7c2d8c662268c0c427da412d7b0311d",
+    "78339c1c60bb1d67d248ab8c4da08b7f",
+    "5c97d70f7d47de1882a6cd86c165c8a9",
+    "8182bf60688b42205acd95e59e967157",
+    "08323400005a297f16d7e57e7fe1eaac",
+    "95f7bfc262329a5849eda66d8f7c68ce",
+    "815b75c8e0d91cc1ae766dc5d3e445a3",
+  };
+  TestIntraPred("Intra8", pred_funcs, kVp9IntraPredNames, kNumFuncs,
+                kSignatures, 8, 8 * 8 * kNumFuncs);
+}
+
+void TestIntraPred16(VpxPredFunc const *pred_funcs) {
+  const int kNumFuncs = kNumVp9IntraPredFuncs;  // one MD5 per named predictor
+  static const char *const kSignatures[kNumFuncs] = {
+    "b40dbb555d5d16a043dc361e6694fe53",
+    "fb08118cee3b6405d64c1fd68be878c6",
+    "6c190f341475c837cc38c2e566b64875",
+    "db5c34ccbe2c7f595d9b08b0dc2c698c",
+    "a62cbfd153a1f0b9fed13e62b8408a7a",
+    "143df5b4c89335e281103f610f5052e4",
+    "d87feb124107cdf2cfb147655aa0bb3c",
+    "7841fae7d4d47b519322e6a03eeed9dc",
+    "f6ebed3f71cbcf8d6d0516ce87e11093",
+    "3cc480297dbfeed01a1c2d78dd03d0c5",
+    "b9f69fa6532b372c545397dcb78ef311",
+    "a8fe1c70432f09d0c20c67bdb6432c4d",
+    "b8a41aa968ec108af447af4217cba91b",
+  };
+  TestIntraPred("Intra16", pred_funcs, kVp9IntraPredNames, kNumFuncs,
+                kSignatures, 16, 16 * 16 * kNumFuncs);
+}
+
+void TestIntraPred32(VpxPredFunc const *pred_funcs) {
+  const int kNumFuncs = kNumVp9IntraPredFuncs;  // one MD5 per named predictor
+  static const char *const kSignatures[kNumFuncs] = {
+    "558541656d84f9ae7896db655826febe",
+    "b3587a1f9a01495fa38c8cd3c8e2a1bf",
+    "4c6501e64f25aacc55a2a16c7e8f0255",
+    "b3b01379ba08916ef6b1b35f7d9ad51c",
+    "0f1eb38b6cbddb3d496199ef9f329071",
+    "911c06efb9ed1c3b4c104b232b55812f",
+    "9225beb0ddfa7a1d24eaa1be430a6654",
+    "0a6d584a44f8db9aa7ade2e2fdb9fc9e",
+    "b01c9076525216925f3456f034fb6eee",
+    "d267e20ad9e5cd2915d1a47254d3d149",
+    "ed012a4a5da71f36c2393023184a0e59",
+    "f162b51ed618d28b936974cff4391da5",
+    "9e1370c6d42e08d357d9612c93a71cfc",
+  };
+  TestIntraPred("Intra32", pred_funcs, kVp9IntraPredNames, kNumFuncs,
+                kSignatures, 32, 32 * 32 * kNumFuncs);
+}
+
+}  // namespace
+
+// Defines test 'arch.test_func' (e.g., C.TestIntraPred4), which hands the 13
+// predictors to |test_func| in kVp9IntraPredNames order (NULL = skipped).
+#define INTRA_PRED_TEST(arch, test_func, dc, dc_left, dc_top, dc_128, v, h, \
+                        d45, d135, d117, d153, d207, d63, tm)               \
+  TEST(arch, test_func) {                                                   \
+    static const VpxPredFunc vpx_intra_pred[] = {                           \
+        dc,   dc_left, dc_top, dc_128, v,   h, d45,                         \
+        d135, d117,    d153,   d207,   d63, tm};                            \
+    test_func(vpx_intra_pred);                                              \
+  }
+
+// -----------------------------------------------------------------------------
+// 4x4
+
+INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
+                vpx_dc_left_predictor_4x4_c, vpx_dc_top_predictor_4x4_c,
+                vpx_dc_128_predictor_4x4_c, vpx_v_predictor_4x4_c,
+                vpx_h_predictor_4x4_c, vpx_d45_predictor_4x4_c,
+                vpx_d135_predictor_4x4_c, vpx_d117_predictor_4x4_c,
+                vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c,
+                vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c)
+
+#if HAVE_SSE2 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
+                vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
+                vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
+                vpx_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_4x4_sse2)
+#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_d45_predictor_4x4_ssse3, NULL, NULL,
+                vpx_d153_predictor_4x4_ssse3, vpx_d207_predictor_4x4_ssse3,
+                vpx_d63_predictor_4x4_ssse3, NULL)
+#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+
+#if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL,
+                NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL,
+                NULL, NULL, vpx_tm_predictor_4x4_dspr2)
+#endif  // HAVE_DSPR2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
+                vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon,
+                vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon,
+                vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon,
+                vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_4x4_neon)
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa,
+                vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa,
+                vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa,
+                vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_4x4_msa)
+#endif  // HAVE_MSA
+
+// -----------------------------------------------------------------------------
+// 8x8
+
+INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
+                vpx_dc_left_predictor_8x8_c, vpx_dc_top_predictor_8x8_c,
+                vpx_dc_128_predictor_8x8_c, vpx_v_predictor_8x8_c,
+                vpx_h_predictor_8x8_c, vpx_d45_predictor_8x8_c,
+                vpx_d135_predictor_8x8_c, vpx_d117_predictor_8x8_c,
+                vpx_d153_predictor_8x8_c, vpx_d207_predictor_8x8_c,
+                vpx_d63_predictor_8x8_c, vpx_tm_predictor_8x8_c)
+
+#if HAVE_SSE2 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
+                vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
+                vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
+                vpx_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_8x8_sse2)
+#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_d45_predictor_8x8_ssse3, NULL, NULL,
+                vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3,
+                vpx_d63_predictor_8x8_ssse3, NULL)
+#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+
+#if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL,
+                NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL,
+                NULL, NULL, vpx_tm_predictor_8x8_dspr2)
+#endif  // HAVE_DSPR2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
+                vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
+                vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
+                vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, NULL,
+                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_neon)
+
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa,
+                vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa,
+                vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa,
+                vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_8x8_msa)
+#endif  // HAVE_MSA
+
+// -----------------------------------------------------------------------------
+// 16x16
+
+INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c,
+                vpx_dc_left_predictor_16x16_c, vpx_dc_top_predictor_16x16_c,
+                vpx_dc_128_predictor_16x16_c, vpx_v_predictor_16x16_c,
+                vpx_h_predictor_16x16_c, vpx_d45_predictor_16x16_c,
+                vpx_d135_predictor_16x16_c, vpx_d117_predictor_16x16_c,
+                vpx_d153_predictor_16x16_c, vpx_d207_predictor_16x16_c,
+                vpx_d63_predictor_16x16_c, vpx_tm_predictor_16x16_c)
+
+#if HAVE_SSE2 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2,
+                vpx_dc_left_predictor_16x16_sse2,
+                vpx_dc_top_predictor_16x16_sse2,
+                vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2,
+                vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_16x16_sse2)
+#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_d45_predictor_16x16_ssse3,
+                NULL, NULL, vpx_d153_predictor_16x16_ssse3,
+                vpx_d207_predictor_16x16_ssse3, vpx_d63_predictor_16x16_ssse3,
+                NULL)
+#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+
+#if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL,
+                NULL, NULL, NULL, vpx_h_predictor_16x16_dspr2, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL)
+#endif  // HAVE_DSPR2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon,
+                vpx_dc_left_predictor_16x16_neon,
+                vpx_dc_top_predictor_16x16_neon,
+                vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon,
+                vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, NULL,
+                NULL, NULL, NULL, NULL, vpx_tm_predictor_16x16_neon)
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa,
+                vpx_dc_left_predictor_16x16_msa, vpx_dc_top_predictor_16x16_msa,
+                vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa,
+                vpx_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_16x16_msa)
+#endif  // HAVE_MSA
+
+// -----------------------------------------------------------------------------
+// 32x32
+
+INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
+                vpx_dc_left_predictor_32x32_c, vpx_dc_top_predictor_32x32_c,
+                vpx_dc_128_predictor_32x32_c, vpx_v_predictor_32x32_c,
+                vpx_h_predictor_32x32_c, vpx_d45_predictor_32x32_c,
+                vpx_d135_predictor_32x32_c, vpx_d117_predictor_32x32_c,
+                vpx_d153_predictor_32x32_c, vpx_d207_predictor_32x32_c,
+                vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
+
+#if HAVE_SSE2 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
+                vpx_dc_left_predictor_32x32_sse2,
+                vpx_dc_top_predictor_32x32_sse2,
+                vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
+                vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_32x32_sse2)
+#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_d45_predictor_32x32_ssse3, NULL, NULL,
+                vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
+                vpx_d63_predictor_32x32_ssse3, NULL)
+#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
+                vpx_dc_left_predictor_32x32_neon,
+                vpx_dc_top_predictor_32x32_neon,
+                vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
+                vpx_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_32x32_neon)
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
+                vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa,
+                vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa,
+                vpx_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_32x32_msa)
+#endif  // HAVE_MSA
+
+#include "test/test_libvpx.cc"
diff --git a/libs/libvpx/test/test_libvpx.cc b/libs/libvpx/test/test_libvpx.cc
new file mode 100644
index 0000000000..005ea8d13d
--- /dev/null
+++ b/libs/libvpx/test/test_libvpx.cc
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#if ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#endif
+extern "C" {
+#if CONFIG_VP8
+extern void vp8_rtcd();
+#endif  // CONFIG_VP8
+#if CONFIG_VP9
+extern void vp9_rtcd();
+#endif  // CONFIG_VP9
+extern void vpx_dsp_rtcd();
+extern void vpx_scale_rtcd();
+}
+
+#if ARCH_X86 || ARCH_X86_64
+// Appends |str| to gtest's filter flag, opening the negative section if needed.
+static void append_negative_gtest_filter(const char *str) {
+  std::string updated(::testing::FLAGS_gtest_filter);
+  // A single '-' introduces the ':' separated list of negative patterns.
+  const bool has_negatives = updated.find('-') != std::string::npos;
+  ::testing::FLAGS_gtest_filter = updated + (has_negatives ? "" : "-") + str;
+}
+#endif  // ARCH_X86 || ARCH_X86_64
+
+int main(int argc, char **argv) {  // Entry point of the test binary.
+  ::testing::InitGoogleTest(&argc, argv);
+  // Exclude tests named after SIMD extensions missing from x86_simd_caps().
+#if ARCH_X86 || ARCH_X86_64
+  const int simd_caps = x86_simd_caps();
+  if (!(simd_caps & HAS_MMX))
+    append_negative_gtest_filter(":MMX.*:MMX/*");
+  if (!(simd_caps & HAS_SSE))
+    append_negative_gtest_filter(":SSE.*:SSE/*");
+  if (!(simd_caps & HAS_SSE2))
+    append_negative_gtest_filter(":SSE2.*:SSE2/*");
+  if (!(simd_caps & HAS_SSE3))
+    append_negative_gtest_filter(":SSE3.*:SSE3/*");
+  if (!(simd_caps & HAS_SSSE3))
+    append_negative_gtest_filter(":SSSE3.*:SSSE3/*");
+  if (!(simd_caps & HAS_SSE4_1))
+    append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*");
+  if (!(simd_caps & HAS_AVX))
+    append_negative_gtest_filter(":AVX.*:AVX/*");
+  if (!(simd_caps & HAS_AVX2))
+    append_negative_gtest_filter(":AVX2.*:AVX2/*");
+#endif  // ARCH_X86 || ARCH_X86_64
+
+#if !CONFIG_SHARED
+// Shared library builds don't support whitebox tests
+// that exercise internal symbols.
+// NOTE(review): *_rtcd() presumably initializes run-time CPU dispatch; confirm.
+#if CONFIG_VP8
+  vp8_rtcd();
+#endif  // CONFIG_VP8
+#if CONFIG_VP9
+  vp9_rtcd();
+#endif  // CONFIG_VP9
+  vpx_dsp_rtcd();
+  vpx_scale_rtcd();
+#endif  // !CONFIG_SHARED
+  // The value returned by RUN_ALL_TESTS() becomes the process exit status.
+  return RUN_ALL_TESTS();
+}
diff --git a/libs/libvpx/test/test_vector_test.cc b/libs/libvpx/test/test_vector_test.cc
new file mode 100644
index 0000000000..f1aa4d7f79
--- /dev/null
+++ b/libs/libvpx/test/test_vector_test.cc
@@ -0,0 +1,192 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <set>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "../tools_common.h"
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+enum DecodeMode {
+  kSerialMode,        // Decoder init flags are left at 0.
+  kFrameParallelMode  // Adds VPX_CODEC_USE_FRAME_THREADING to the init flags.
+};
+// Positions of the fields inside a DecodeParam tuple, for std::tr1::get<>.
+const int kDecodeMode = 0;
+const int kThreads = 1;
+const int kFileName = 2;
+// Test parameter: (decode mode, thread count, test vector file name).
+typedef std::tr1::tuple<int, int, const char*> DecodeParam;
+
+// Decoder test fixture that checks every decoded frame against an md5 file.
+class TestVectorTest : public ::libvpx_test::DecoderTest,
+    public ::libvpx_test::CodecTestWithParam<DecodeParam> {
+ protected:
+  TestVectorTest()
+      : DecoderTest(GET_PARAM(0)), md5_file_(NULL) {
+#if CONFIG_VP9_DECODER
+    resize_clips_.insert(
+      ::libvpx_test::kVP9TestVectorsResize,
+      ::libvpx_test::kVP9TestVectorsResize +
+          ::libvpx_test::kNumVP9TestVectorsResize);
+#endif
+  }
+
+  virtual ~TestVectorTest() {
+    if (md5_file_)
+      fclose(md5_file_);
+  }
+  // Opens the test data file that lists the expected md5 of each frame.
+  void OpenMD5File(const std::string& md5_file_name_) {
+    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
+    ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: "
+        << md5_file_name_;
+  }
+  // Compares the md5 of |img| with the next entry read from the md5 file.
+  virtual void DecompressedFrameHook(const vpx_image_t& img,
+                                     const unsigned int frame_number) {
+    ASSERT_TRUE(md5_file_ != NULL);
+    char expected_md5[33];
+    char junk[128];
+    // Read the next expected md5. The field widths keep fscanf() inside
+    // expected_md5[33] and junk[128]; each buffer needs room for the '\0'.
+    const int res = fscanf(md5_file_, "%32s  %127s", expected_md5, junk);
+    ASSERT_NE(res, EOF) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    ::libvpx_test::MD5 md5_res;
+    md5_res.Add(&img);
+    const char *actual_md5 = md5_res.Get();
+
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5)
+        << "Md5 checksums don't match: frame number = " << frame_number;
+  }
+
+#if CONFIG_VP9_DECODER
+  std::set<std::string> resize_clips_;  // Names of the VP9 resize vectors.
+#endif
+
+ private:
+  FILE *md5_file_;
+};
+
+// Decodes every test vector and computes the md5 checksum of each decoded
+// frame. The test passes only if every checksum matches the corresponding
+// entry in the vector's .md5 file.
+TEST_P(TestVectorTest, MD5Match) {
+  const DecodeParam input = GET_PARAM(1);
+  const std::string filename = std::tr1::get<kFileName>(input);
+  const int threads = std::tr1::get<kThreads>(input);
+  const int mode = std::tr1::get<kDecodeMode>(input);
+  const std::string::size_type len = filename.length();
+  libvpx_test::CompressedVideoSource *video = NULL;
+  vpx_codec_flags_t flags = 0;
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+  char str[256];
+
+  if (mode == kFrameParallelMode) {
+    flags |= VPX_CODEC_USE_FRAME_THREADING;
+#if CONFIG_VP9_DECODER
+    // TODO(hkuang): Fix frame parallel decode bug. See issue 1086.
+    if (resize_clips_.find(filename) != resize_clips_.end()) {
+      printf("Skipping the test file: %s, due to frame parallel decode bug.\n",
+             filename.c_str());
+      return;
+    }
+#endif
+  }
+
+  cfg.threads = threads;
+  snprintf(str, sizeof(str) / sizeof(str[0]) - 1,
+           "file: %s  mode: %s threads: %d",
+           filename.c_str(), mode == 0 ? "Serial" : "Parallel", threads);
+  SCOPED_TRACE(str);
+
+  // Open compressed video file; the length checks guard against short names.
+  if (len >= 3 && filename.compare(len - 3, 3, "ivf") == 0) {
+    video = new libvpx_test::IVFVideoSource(filename);
+  } else if (len >= 4 && filename.compare(len - 4, 4, "webm") == 0) {
+#if CONFIG_WEBM_IO
+    video = new libvpx_test::WebMVideoSource(filename);
+#else
+    fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+            filename.c_str());
+    return;
+#endif
+  }
+  // Neither branch matched: fail here instead of dereferencing NULL below.
+  ASSERT_TRUE(video != NULL) << "Unsupported test vector: " << filename;
+  video->Init();
+
+  // Construct md5 file name and open it.
+  OpenMD5File(filename + ".md5");
+
+  // Set decode config and flags.
+  set_cfg(cfg);
+  set_flags(flags);
+
+  // Decode and check md5s; EXPECT (not ASSERT) so |video| is always deleted.
+  EXPECT_NO_FATAL_FAILURE(RunLoop(video, cfg));
+  delete video;
+}
+
+// Test VP8 decode in serial mode with single thread.
+// NOTE: VP8 only supports serial mode.
+#if CONFIG_VP8_DECODER
+VP8_INSTANTIATE_TEST_CASE(
+    TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(0),  // Serial Mode.
+        ::testing::Values(1),  // Single thread.
+        ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
+                            libvpx_test::kVP8TestVectors +
+                                libvpx_test::kNumVP8TestVectors)));
+#endif  // CONFIG_VP8_DECODER
+
+// Test VP9 decode in serial mode with single thread.
+#if CONFIG_VP9_DECODER
+VP9_INSTANTIATE_TEST_CASE(
+    TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(0),  // Serial Mode.
+        ::testing::Values(1),  // Single thread.
+        ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
+                            libvpx_test::kVP9TestVectors +
+                                libvpx_test::kNumVP9TestVectors)));
+
+// Test VP9 decode in frame parallel mode with different number of threads.
+INSTANTIATE_TEST_CASE_P(
+    VP9MultiThreadedFrameParallel, TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
+        ::testing::Combine(
+            ::testing::Values(1),        // Frame Parallel mode.
+            ::testing::Range(2, 9),      // With 2 ~ 8 threads.
+            ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
+                                libvpx_test::kVP9TestVectors +
+                                    libvpx_test::kNumVP9TestVectors))));
+#endif
+}  // namespace
diff --git a/libs/libvpx/test/test_vectors.cc b/libs/libvpx/test/test_vectors.cc
new file mode 100644
index 0000000000..c822479663
--- /dev/null
+++ b/libs/libvpx/test/test_vectors.cc
@@ -0,0 +1,251 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/test_vectors.h"
+
+namespace libvpx_test {
+
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
+#if CONFIG_VP8_DECODER
+const char *const kVP8TestVectors[] = {
+  "vp80-00-comprehensive-001.ivf",
+  "vp80-00-comprehensive-002.ivf", "vp80-00-comprehensive-003.ivf",
+  "vp80-00-comprehensive-004.ivf", "vp80-00-comprehensive-005.ivf",
+  "vp80-00-comprehensive-006.ivf", "vp80-00-comprehensive-007.ivf",
+  "vp80-00-comprehensive-008.ivf", "vp80-00-comprehensive-009.ivf",
+  "vp80-00-comprehensive-010.ivf", "vp80-00-comprehensive-011.ivf",
+  "vp80-00-comprehensive-012.ivf", "vp80-00-comprehensive-013.ivf",
+  "vp80-00-comprehensive-014.ivf", "vp80-00-comprehensive-015.ivf",
+  "vp80-00-comprehensive-016.ivf", "vp80-00-comprehensive-017.ivf",
+  "vp80-00-comprehensive-018.ivf", "vp80-01-intra-1400.ivf",
+  "vp80-01-intra-1411.ivf", "vp80-01-intra-1416.ivf",
+  "vp80-01-intra-1417.ivf", "vp80-02-inter-1402.ivf",
+  "vp80-02-inter-1412.ivf", "vp80-02-inter-1418.ivf",
+  "vp80-02-inter-1424.ivf", "vp80-03-segmentation-01.ivf",
+  "vp80-03-segmentation-02.ivf", "vp80-03-segmentation-03.ivf",
+  "vp80-03-segmentation-04.ivf", "vp80-03-segmentation-1401.ivf",
+  "vp80-03-segmentation-1403.ivf", "vp80-03-segmentation-1407.ivf",
+  "vp80-03-segmentation-1408.ivf", "vp80-03-segmentation-1409.ivf",
+  "vp80-03-segmentation-1410.ivf", "vp80-03-segmentation-1413.ivf",
+  "vp80-03-segmentation-1414.ivf", "vp80-03-segmentation-1415.ivf",
+  "vp80-03-segmentation-1425.ivf", "vp80-03-segmentation-1426.ivf",
+  "vp80-03-segmentation-1427.ivf", "vp80-03-segmentation-1432.ivf",
+  "vp80-03-segmentation-1435.ivf", "vp80-03-segmentation-1436.ivf",
+  "vp80-03-segmentation-1437.ivf", "vp80-03-segmentation-1441.ivf",
+  "vp80-03-segmentation-1442.ivf", "vp80-04-partitions-1404.ivf",
+  "vp80-04-partitions-1405.ivf", "vp80-04-partitions-1406.ivf",
+  "vp80-05-sharpness-1428.ivf", "vp80-05-sharpness-1429.ivf",
+  "vp80-05-sharpness-1430.ivf", "vp80-05-sharpness-1431.ivf",
+  "vp80-05-sharpness-1433.ivf", "vp80-05-sharpness-1434.ivf",
+  "vp80-05-sharpness-1438.ivf", "vp80-05-sharpness-1439.ivf",
+  "vp80-05-sharpness-1440.ivf", "vp80-05-sharpness-1443.ivf",
+  "vp80-06-smallsize.ivf"
+};
+const int kNumVP8TestVectors = NELEMENTS(kVP8TestVectors);
+#endif  // CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
+#define RESIZE_TEST_VECTORS "vp90-2-21-resize_inter_320x180_5_1-2.webm", \
+  "vp90-2-21-resize_inter_320x180_5_3-4.webm", \
+  "vp90-2-21-resize_inter_320x180_7_1-2.webm", \
+  "vp90-2-21-resize_inter_320x180_7_3-4.webm", \
+  "vp90-2-21-resize_inter_320x240_5_1-2.webm", \
+  "vp90-2-21-resize_inter_320x240_5_3-4.webm", \
+  "vp90-2-21-resize_inter_320x240_7_1-2.webm", \
+  "vp90-2-21-resize_inter_320x240_7_3-4.webm", \
+  "vp90-2-21-resize_inter_640x360_5_1-2.webm", \
+  "vp90-2-21-resize_inter_640x360_5_3-4.webm", \
+  "vp90-2-21-resize_inter_640x360_7_1-2.webm", \
+  "vp90-2-21-resize_inter_640x360_7_3-4.webm", \
+  "vp90-2-21-resize_inter_640x480_5_1-2.webm", \
+  "vp90-2-21-resize_inter_640x480_5_3-4.webm", \
+  "vp90-2-21-resize_inter_640x480_7_1-2.webm", \
+  "vp90-2-21-resize_inter_640x480_7_3-4.webm", \
+  "vp90-2-21-resize_inter_1280x720_5_1-2.webm", \
+  "vp90-2-21-resize_inter_1280x720_5_3-4.webm", \
+  "vp90-2-21-resize_inter_1280x720_7_1-2.webm", \
+  "vp90-2-21-resize_inter_1280x720_7_3-4.webm", \
+  "vp90-2-21-resize_inter_1920x1080_5_1-2.webm", \
+  "vp90-2-21-resize_inter_1920x1080_5_3-4.webm", \
+  "vp90-2-21-resize_inter_1920x1080_7_1-2.webm", \
+  "vp90-2-21-resize_inter_1920x1080_7_3-4.webm",
+
+const char *const kVP9TestVectors[] = {
+  "vp90-2-00-quantizer-00.webm", "vp90-2-00-quantizer-01.webm",
+  "vp90-2-00-quantizer-02.webm", "vp90-2-00-quantizer-03.webm",
+  "vp90-2-00-quantizer-04.webm", "vp90-2-00-quantizer-05.webm",
+  "vp90-2-00-quantizer-06.webm", "vp90-2-00-quantizer-07.webm",
+  "vp90-2-00-quantizer-08.webm", "vp90-2-00-quantizer-09.webm",
+  "vp90-2-00-quantizer-10.webm", "vp90-2-00-quantizer-11.webm",
+  "vp90-2-00-quantizer-12.webm", "vp90-2-00-quantizer-13.webm",
+  "vp90-2-00-quantizer-14.webm", "vp90-2-00-quantizer-15.webm",
+  "vp90-2-00-quantizer-16.webm", "vp90-2-00-quantizer-17.webm",
+  "vp90-2-00-quantizer-18.webm", "vp90-2-00-quantizer-19.webm",
+  "vp90-2-00-quantizer-20.webm", "vp90-2-00-quantizer-21.webm",
+  "vp90-2-00-quantizer-22.webm", "vp90-2-00-quantizer-23.webm",
+  "vp90-2-00-quantizer-24.webm", "vp90-2-00-quantizer-25.webm",
+  "vp90-2-00-quantizer-26.webm", "vp90-2-00-quantizer-27.webm",
+  "vp90-2-00-quantizer-28.webm", "vp90-2-00-quantizer-29.webm",
+  "vp90-2-00-quantizer-30.webm", "vp90-2-00-quantizer-31.webm",
+  "vp90-2-00-quantizer-32.webm", "vp90-2-00-quantizer-33.webm",
+  "vp90-2-00-quantizer-34.webm", "vp90-2-00-quantizer-35.webm",
+  "vp90-2-00-quantizer-36.webm", "vp90-2-00-quantizer-37.webm",
+  "vp90-2-00-quantizer-38.webm", "vp90-2-00-quantizer-39.webm",
+  "vp90-2-00-quantizer-40.webm", "vp90-2-00-quantizer-41.webm",
+  "vp90-2-00-quantizer-42.webm", "vp90-2-00-quantizer-43.webm",
+  "vp90-2-00-quantizer-44.webm", "vp90-2-00-quantizer-45.webm",
+  "vp90-2-00-quantizer-46.webm", "vp90-2-00-quantizer-47.webm",
+  "vp90-2-00-quantizer-48.webm", "vp90-2-00-quantizer-49.webm",
+  "vp90-2-00-quantizer-50.webm", "vp90-2-00-quantizer-51.webm",
+  "vp90-2-00-quantizer-52.webm", "vp90-2-00-quantizer-53.webm",
+  "vp90-2-00-quantizer-54.webm", "vp90-2-00-quantizer-55.webm",
+  "vp90-2-00-quantizer-56.webm", "vp90-2-00-quantizer-57.webm",
+  "vp90-2-00-quantizer-58.webm", "vp90-2-00-quantizer-59.webm",
+  "vp90-2-00-quantizer-60.webm", "vp90-2-00-quantizer-61.webm",
+  "vp90-2-00-quantizer-62.webm", "vp90-2-00-quantizer-63.webm",
+  "vp90-2-01-sharpness-1.webm", "vp90-2-01-sharpness-2.webm",
+  "vp90-2-01-sharpness-3.webm", "vp90-2-01-sharpness-4.webm",
+  "vp90-2-01-sharpness-5.webm", "vp90-2-01-sharpness-6.webm",
+  "vp90-2-01-sharpness-7.webm", "vp90-2-02-size-08x08.webm",
+  "vp90-2-02-size-08x10.webm", "vp90-2-02-size-08x16.webm",
+  "vp90-2-02-size-08x18.webm", "vp90-2-02-size-08x32.webm",
+  "vp90-2-02-size-08x34.webm", "vp90-2-02-size-08x64.webm",
+  "vp90-2-02-size-08x66.webm", "vp90-2-02-size-10x08.webm",
+  "vp90-2-02-size-10x10.webm", "vp90-2-02-size-10x16.webm",
+  "vp90-2-02-size-10x18.webm", "vp90-2-02-size-10x32.webm",
+  "vp90-2-02-size-10x34.webm", "vp90-2-02-size-10x64.webm",
+  "vp90-2-02-size-10x66.webm", "vp90-2-02-size-16x08.webm",
+  "vp90-2-02-size-16x10.webm", "vp90-2-02-size-16x16.webm",
+  "vp90-2-02-size-16x18.webm", "vp90-2-02-size-16x32.webm",
+  "vp90-2-02-size-16x34.webm", "vp90-2-02-size-16x64.webm",
+  "vp90-2-02-size-16x66.webm", "vp90-2-02-size-18x08.webm",
+  "vp90-2-02-size-18x10.webm", "vp90-2-02-size-18x16.webm",
+  "vp90-2-02-size-18x18.webm", "vp90-2-02-size-18x32.webm",
+  "vp90-2-02-size-18x34.webm", "vp90-2-02-size-18x64.webm",
+  "vp90-2-02-size-18x66.webm", "vp90-2-02-size-32x08.webm",
+  "vp90-2-02-size-32x10.webm", "vp90-2-02-size-32x16.webm",
+  "vp90-2-02-size-32x18.webm", "vp90-2-02-size-32x32.webm",
+  "vp90-2-02-size-32x34.webm", "vp90-2-02-size-32x64.webm",
+  "vp90-2-02-size-32x66.webm", "vp90-2-02-size-34x08.webm",
+  "vp90-2-02-size-34x10.webm", "vp90-2-02-size-34x16.webm",
+  "vp90-2-02-size-34x18.webm", "vp90-2-02-size-34x32.webm",
+  "vp90-2-02-size-34x34.webm", "vp90-2-02-size-34x64.webm",
+  "vp90-2-02-size-34x66.webm", "vp90-2-02-size-64x08.webm",
+  "vp90-2-02-size-64x10.webm", "vp90-2-02-size-64x16.webm",
+  "vp90-2-02-size-64x18.webm", "vp90-2-02-size-64x32.webm",
+  "vp90-2-02-size-64x34.webm", "vp90-2-02-size-64x64.webm",
+  "vp90-2-02-size-64x66.webm", "vp90-2-02-size-66x08.webm",
+  "vp90-2-02-size-66x10.webm", "vp90-2-02-size-66x16.webm",
+  "vp90-2-02-size-66x18.webm", "vp90-2-02-size-66x32.webm",
+  "vp90-2-02-size-66x34.webm", "vp90-2-02-size-66x64.webm",
+  "vp90-2-02-size-66x66.webm", "vp90-2-02-size-130x132.webm",
+  "vp90-2-02-size-132x130.webm", "vp90-2-02-size-132x132.webm",
+  "vp90-2-02-size-178x180.webm", "vp90-2-02-size-180x178.webm",
+  "vp90-2-02-size-180x180.webm", "vp90-2-03-size-196x196.webm",
+  "vp90-2-03-size-196x198.webm", "vp90-2-03-size-196x200.webm",
+  "vp90-2-03-size-196x202.webm", "vp90-2-03-size-196x208.webm",
+  "vp90-2-03-size-196x210.webm", "vp90-2-03-size-196x224.webm",
+  "vp90-2-03-size-196x226.webm", "vp90-2-03-size-198x196.webm",
+  "vp90-2-03-size-198x198.webm", "vp90-2-03-size-198x200.webm",
+  "vp90-2-03-size-198x202.webm", "vp90-2-03-size-198x208.webm",
+  "vp90-2-03-size-198x210.webm", "vp90-2-03-size-198x224.webm",
+  "vp90-2-03-size-198x226.webm", "vp90-2-03-size-200x196.webm",
+  "vp90-2-03-size-200x198.webm", "vp90-2-03-size-200x200.webm",
+  "vp90-2-03-size-200x202.webm", "vp90-2-03-size-200x208.webm",
+  "vp90-2-03-size-200x210.webm", "vp90-2-03-size-200x224.webm",
+  "vp90-2-03-size-200x226.webm", "vp90-2-03-size-202x196.webm",
+  "vp90-2-03-size-202x198.webm", "vp90-2-03-size-202x200.webm",
+  "vp90-2-03-size-202x202.webm", "vp90-2-03-size-202x208.webm",
+  "vp90-2-03-size-202x210.webm", "vp90-2-03-size-202x224.webm",
+  "vp90-2-03-size-202x226.webm", "vp90-2-03-size-208x196.webm",
+  "vp90-2-03-size-208x198.webm", "vp90-2-03-size-208x200.webm",
+  "vp90-2-03-size-208x202.webm", "vp90-2-03-size-208x208.webm",
+  "vp90-2-03-size-208x210.webm", "vp90-2-03-size-208x224.webm",
+  "vp90-2-03-size-208x226.webm", "vp90-2-03-size-210x196.webm",
+  "vp90-2-03-size-210x198.webm", "vp90-2-03-size-210x200.webm",
+  "vp90-2-03-size-210x202.webm", "vp90-2-03-size-210x208.webm",
+  "vp90-2-03-size-210x210.webm", "vp90-2-03-size-210x224.webm",
+  "vp90-2-03-size-210x226.webm", "vp90-2-03-size-224x196.webm",
+  "vp90-2-03-size-224x198.webm", "vp90-2-03-size-224x200.webm",
+  "vp90-2-03-size-224x202.webm", "vp90-2-03-size-224x208.webm",
+  "vp90-2-03-size-224x210.webm", "vp90-2-03-size-224x224.webm",
+  "vp90-2-03-size-224x226.webm", "vp90-2-03-size-226x196.webm",
+  "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
+  "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
+  "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
+  "vp90-2-03-size-226x226.webm", "vp90-2-03-size-352x288.webm",
+  "vp90-2-03-deltaq.webm",
+  "vp90-2-05-resize.ivf", "vp90-2-06-bilinear.webm",
+  "vp90-2-07-frame_parallel.webm", "vp90-2-08-tile_1x2_frame_parallel.webm",
+  "vp90-2-08-tile_1x2.webm", "vp90-2-08-tile_1x4_frame_parallel.webm",
+  "vp90-2-08-tile_1x4.webm", "vp90-2-08-tile_1x8_frame_parallel.webm",
+  "vp90-2-08-tile_1x8.webm", "vp90-2-08-tile-4x4.webm",
+  "vp90-2-08-tile-4x1.webm", "vp90-2-09-subpixel-00.ivf",
+  "vp90-2-02-size-lf-1920x1080.webm", "vp90-2-09-aq2.webm",
+  "vp90-2-09-lf_deltas.webm", "vp90-2-10-show-existing-frame.webm",
+  "vp90-2-10-show-existing-frame2.webm",
+  "vp90-2-11-size-351x287.webm", "vp90-2-11-size-351x288.webm",
+  "vp90-2-11-size-352x287.webm", "vp90-2-12-droppable_1.ivf",
+  "vp90-2-12-droppable_2.ivf", "vp90-2-12-droppable_3.ivf",
+#if !CONFIG_SIZE_LIMIT || \
+    (DECODE_WIDTH_LIMIT >= 20400 && DECODE_HEIGHT_LIMIT >= 120)
+  "vp90-2-13-largescaling.webm",
+#endif
+  "vp90-2-14-resize-fp-tiles-1-16.webm",
+  "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
+  "vp90-2-14-resize-fp-tiles-1-2.webm", "vp90-2-14-resize-fp-tiles-1-4.webm",
+  "vp90-2-14-resize-fp-tiles-16-1.webm", "vp90-2-14-resize-fp-tiles-16-2.webm",
+  "vp90-2-14-resize-fp-tiles-16-4.webm",
+  "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm",
+  "vp90-2-14-resize-fp-tiles-16-8.webm", "vp90-2-14-resize-fp-tiles-1-8.webm",
+  "vp90-2-14-resize-fp-tiles-2-16.webm", "vp90-2-14-resize-fp-tiles-2-1.webm",
+  "vp90-2-14-resize-fp-tiles-2-4.webm", "vp90-2-14-resize-fp-tiles-2-8.webm",
+  "vp90-2-14-resize-fp-tiles-4-16.webm", "vp90-2-14-resize-fp-tiles-4-1.webm",
+  "vp90-2-14-resize-fp-tiles-4-2.webm", "vp90-2-14-resize-fp-tiles-4-8.webm",
+  "vp90-2-14-resize-fp-tiles-8-16.webm", "vp90-2-14-resize-fp-tiles-8-1.webm",
+  "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-1-2.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-1-4.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-1-8.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-2-1.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-2-4.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-2-8.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-4-1.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-4-2.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-4-8.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-8-1.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-8-2.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-8-4.webm",
+  "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
+  "vp90-2-16-intra-only.webm", "vp90-2-17-show-existing-frame.webm",
+  "vp90-2-18-resize.ivf", "vp90-2-19-skip.webm",
+  "vp90-2-19-skip-01.webm", "vp90-2-19-skip-02.webm",
+  "vp91-2-04-yuv444.webm",
+  "vp91-2-04-yuv422.webm", "vp91-2-04-yuv440.webm",
+#if CONFIG_VP9_HIGHBITDEPTH
+  "vp92-2-20-10bit-yuv420.webm", "vp92-2-20-12bit-yuv420.webm",
+  "vp93-2-20-10bit-yuv422.webm", "vp93-2-20-12bit-yuv422.webm",
+  "vp93-2-20-10bit-yuv440.webm", "vp93-2-20-12bit-yuv440.webm",
+  "vp93-2-20-10bit-yuv444.webm", "vp93-2-20-12bit-yuv444.webm",
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  "vp90-2-20-big_superframe-01.webm", "vp90-2-20-big_superframe-02.webm",
+  RESIZE_TEST_VECTORS
+};
+const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
+const char *const kVP9TestVectorsResize[] = {
+  RESIZE_TEST_VECTORS
+};
+const int kNumVP9TestVectorsResize = NELEMENTS(kVP9TestVectorsResize);
+#undef RESIZE_TEST_VECTORS
+#endif  // CONFIG_VP9_DECODER
+
+}  // namespace libvpx_test
diff --git a/libs/libvpx/test/test_vectors.h b/libs/libvpx/test/test_vectors.h
new file mode 100644
index 0000000000..2c6918abda
--- /dev/null
+++ b/libs/libvpx/test/test_vectors.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_TEST_VECTORS_H_
+#define TEST_TEST_VECTORS_H_
+
+#include "./vpx_config.h"
+
+namespace libvpx_test {
+
+#if CONFIG_VP8_DECODER
+extern const int kNumVP8TestVectors;
+extern const char *const kVP8TestVectors[];
+#endif
+
+#if CONFIG_VP9_DECODER
+extern const int kNumVP9TestVectors;
+extern const char *const kVP9TestVectors[];
+extern const int kNumVP9TestVectorsResize;
+extern const char *const kVP9TestVectorsResize[];
+#endif  // CONFIG_VP9_DECODER
+
+}  // namespace libvpx_test
+
+#endif  // TEST_TEST_VECTORS_H_
diff --git a/libs/libvpx/test/tile_independence_test.cc b/libs/libvpx/test/tile_independence_test.cc
new file mode 100644
index 0000000000..193bd45986
--- /dev/null
+++ b/libs/libvpx/test/tile_independence_test.cc
@@ -0,0 +1,108 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/md5_helper.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+class TileIndependenceTest : public ::libvpx_test::EncoderTest,
+                             public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+  TileIndependenceTest()  // Creates a forward and an inverted-order decoder.
+      : EncoderTest(GET_PARAM(0)),
+        md5_fw_order_(),
+        md5_inv_order_(),
+        n_tiles_(GET_PARAM(1)) {
+    init_flags_ = VPX_CODEC_USE_PSNR;
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    cfg.w = 704;
+    cfg.h = 144;
+    cfg.threads = 1;
+    fw_dec_ = codec_->CreateDecoder(cfg, 0);
+    inv_dec_ = codec_->CreateDecoder(cfg, 0);
+    inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1);
+  }
+  // Releases both decoders created by the constructor.
+  virtual ~TileIndependenceTest() {
+    delete fw_dec_;
+    delete inv_dec_;
+  }
+  // Initializes the encoder configuration and selects kTwoPassGood mode.
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(libvpx_test::kTwoPassGood);
+  }
+  // Passes the test parameter to VP9E_SET_TILE_COLUMNS when frame() == 1.
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_);
+    }
+  }
+  // Decodes |pkt| with |dec| and adds the next output image to |md5|.
+  void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt,
+                 ::libvpx_test::MD5 *md5) {
+    const vpx_codec_err_t res = dec->DecodeFrame(
+        reinterpret_cast<uint8_t*>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != VPX_CODEC_OK) {
+      abort_ = true;  // NOTE(review): presumably stops the encode loop.
+      ASSERT_EQ(VPX_CODEC_OK, res);
+    }
+    const vpx_image_t *img = dec->GetDxData().Next();
+    md5->Add(img);
+  }
+  // Feeds each compressed frame packet to both decoders.
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    UpdateMD5(fw_dec_, pkt, &md5_fw_order_);
+    UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
+  }
+  // Running md5 and decoder for the forward and the inverted tile order.
+  ::libvpx_test::MD5 md5_fw_order_, md5_inv_order_;
+  ::libvpx_test::Decoder *fw_dec_, *inv_dec_;
+
+ private:
+  int n_tiles_;
+};
+
+// Run an encode for each tile-column setting in the test parameter (0 or 1,
+// in log2 units), decoding in both normal and inverted tile order. The MD5 of
+// the output must be identical in both cases for the tiles to be independent.
+TEST_P(TileIndependenceTest, MD5Match) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_end_usage = VPX_VBR;
+  // 704x144 matches the cfg.w / cfg.h given to the decoders in the fixture.
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 144,
+                                     timebase.den, timebase.num, 0, 30);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  const char *md5_fw_str = md5_fw_order_.Get();
+  const char *md5_inv_str = md5_inv_order_.Get();
+
+  // ASSERT_TRUE(!memcmp(.., .., 16)) would also work here, but ASSERT_STREQ
+  // prints both digests when it fails, which is easier to read even though
+  // the strings are just MD5 values.
+  ASSERT_STREQ(md5_fw_str, md5_inv_str);
+}
+
+VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
+
+VP10_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
+}  // namespace
diff --git a/libs/libvpx/test/tools_common.sh b/libs/libvpx/test/tools_common.sh
new file mode 100755
index 0000000000..0bdcc08d78
--- /dev/null
+++ b/libs/libvpx/test/tools_common.sh
@@ -0,0 +1,438 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file contains shell code shared by test scripts for libvpx tools.
+
+# Use $VPX_TEST_TOOLS_COMMON_SH as a pseudo include guard.
+if [ -z "${VPX_TEST_TOOLS_COMMON_SH}" ]; then
+VPX_TEST_TOOLS_COMMON_SH=included
+
+set -e
+devnull='> /dev/null 2>&1'
+VPX_TEST_PREFIX=""
+
+elog() {  # Echoes all of its arguments to stderr.
+  echo "$@" >&2
+}
+
+vlog() {
+  # Echoes all arguments to stdout, but only when verbose output was requested
+  # (VPX_TEST_VERBOSE_OUTPUT=yes); otherwise does nothing and returns success.
+  [ "${VPX_TEST_VERBOSE_OUTPUT}" != "yes" ] || echo "$@"
+}
+
+# Records $1, the name of the test about to run, in $VPX_TOOL_TEST.
+test_begin() {
+  VPX_TOOL_TEST=$1
+}
+
+# Resets $VPX_TOOL_TEST to '<unset>' if $1 names the active test, else fails.
+test_end() {
+  if [ "${VPX_TOOL_TEST}" = "$1" ]; then
+    VPX_TOOL_TEST='<unset>'
+    return 0
+  fi
+  echo "FAIL completed test mismatch!."
+  echo "  completed test: ${1}"
+  echo "  active test: ${VPX_TOOL_TEST}."
+  return 1
+}
+
+# Echoes the target configuration (TOOLCHAIN in config.mk) being tested.
+test_configuration_target() {
+  vpx_config_mk="${LIBVPX_CONFIG_PATH}/config.mk"
+  # Find the TOOLCHAIN line, split it using ':=' as the field separator, and
+  # print the last field to get the value. Then pipe the value to tr, which
+  # deletes every space in it and echoes the result to stdout.
+  awk -F ':=' '/TOOLCHAIN/ { print $NF }' "${vpx_config_mk}" | tr -d ' '
+}
+
+# Trap function: prints "FAIL: <name>" when $VPX_TOOL_TEST still names an
+# active test, then removes the $VPX_TEST_OUTPUT_DIR directory if it exists.
+cleanup() {
+  case "${VPX_TOOL_TEST}" in
+    ''|'<unset>') ;;
+    *) echo "FAIL: $VPX_TOOL_TEST" ;;
+  esac
+  if [ -d "${VPX_TEST_OUTPUT_DIR}" ]; then
+    rm -rf "${VPX_TEST_OUTPUT_DIR}"
+  fi
+}
+
+# Echoes the git hash portion of the VERSION_STRING variable defined in
+# $LIBVPX_CONFIG_PATH/config.mk to stdout, or the version number string when
+# no git hash is contained in VERSION_STRING.
+config_hash() {
+  vpx_config_mk="${LIBVPX_CONFIG_PATH}/config.mk"
+  # Find the VERSION_STRING line, split it on "-g" and keep the last field,
+  # which is the git hash when the version string contains one.
+  vpx_version=$(awk -F -g '/VERSION_STRING/ {print $NF}' "${vpx_config_mk}")
+  # Handle two situations here:
+  # 1. The default case: $vpx_version is a git hash, so echo it unchanged.
+  # 2. When run in a non-dev tree, the -g portion is not present in the
+  #    version string: It's only the version number.
+  #    In this case $vpx_version is something like 'VERSION_STRING=v1.3.0', so
+  #    we echo only what is after the last '='.
+  echo "${vpx_version##*=}"
+}
+
+# Echoes the short form of the current git hash. Falls back to config_hash
+# when git is not installed or the script does not live in a git checkout.
+current_hash() {
+  # Git hashes are used only for warnings, so a git failure must neither print
+  # an error nor abort the script (it runs under 'set -e'): rev-parse's stderr
+  # is discarded, --verify makes it fail cleanly, and '||' routes any failure
+  # to config_hash.
+  { git --version > /dev/null 2>&1 && \
+    (cd "$(dirname "${0}")" && \
+     git rev-parse --verify --short HEAD 2> /dev/null); } || config_hash
+}
+
+# Echoes a warning to stdout when the hash recorded at configure time (as
+# reported by config_hash) differs from the hash reported by current_hash.
+check_git_hashes() {
+  hash_at_configure_time=$(config_hash)
+  hash_now=$(current_hash)
+
+  # Equal hashes: nothing to report.
+  [ "${hash_at_configure_time}" = "${hash_now}" ] || \
+    echo "Warning: git hash has changed since last configure."
+}
+
+# Fails (via elog, status 1) unless the variable named by $1 holds a directory.
+test_env_var_dir() {
+  local dir
+  eval "dir=\"\${$1}\""  # quoted indirect expansion: spaces/globs stay intact
+  if [ ! -d "${dir}" ]; then
+    elog "'${dir}': No such directory"
+    elog "The $1 environment variable must be set to a valid directory."
+    return 1
+  fi
+}
+
+# Confirms that LIBVPX_BIN_PATH, LIBVPX_CONFIG_PATH and LIBVPX_TEST_DATA_PATH
+# each name an existing directory; stops at the first one that does not.
+verify_vpx_test_environment() {
+  local var
+  for var in LIBVPX_BIN_PATH LIBVPX_CONFIG_PATH LIBVPX_TEST_DATA_PATH; do
+    test_env_var_dir "${var}" || return 1
+  done
+}
+
+# Greps vpx_config.h in LIBVPX_CONFIG_PATH for positional parameter one (a
+# LIBVPX preprocessor flag) and echoes yes when a matching line ends in 1.
+vpx_config_option_enabled() {
+  vpx_config_option="${1}"
+  vpx_config_file="${LIBVPX_CONFIG_PATH}/vpx_config.h"
+  config_line=$(grep "${vpx_config_option}" "${vpx_config_file}")
+  # Plain grep is enough for '1$'; egrep is obsolescent in current GNU grep.
+  if echo "${config_line}" | grep -q '1$'; then
+    echo yes
+  fi
+}
+
+# Echoes yes when the output of test_configuration_target() contains win32 or
+# win64.
+is_windows_target() {
+  case "$(test_configuration_target)" in
+    *win32*|*win64*) echo yes ;;
+  esac
+}
+
+# Echoes path to $1 when it's executable and exists in ${LIBVPX_BIN_PATH} or in
+# its parent directory, or an empty string. Caller is responsible for testing
+# the string once the function returns.
+vpx_tool_path() {
+  # Plain 'local': 'local readonly x' would declare a variable named readonly.
+  local tool_name="$1"
+  local tool_path="${LIBVPX_BIN_PATH}/${tool_name}${VPX_TEST_EXE_SUFFIX}"
+  if [ ! -x "${tool_path}" ]; then
+    # When run via examples.sh the tool may sit in the parent directory.
+    tool_path="${LIBVPX_BIN_PATH}/../${tool_name}${VPX_TEST_EXE_SUFFIX}"
+  fi
+
+  if [ ! -x "${tool_path}" ]; then
+    tool_path=""
+  fi
+  echo "${tool_path}"
+}
+
+# Echoes yes to stdout when ${LIBVPX_BIN_PATH} holds an executable entry named
+# by positional parameter one plus ${VPX_TEST_EXE_SUFFIX}. Prints nothing and
+# returns non-zero otherwise.
+vpx_tool_available() {
+  local candidate="${LIBVPX_BIN_PATH}/${1}${VPX_TEST_EXE_SUFFIX}"
+  test -x "${candidate}" && echo yes
+}
+
+# Echoes yes to stdout when CONFIG_VP8_DECODER is enabled according to
+# vpx_config_option_enabled(); otherwise prints nothing and returns non-zero.
+vp8_decode_available() {
+  test "$(vpx_config_option_enabled CONFIG_VP8_DECODER)" = "yes" && echo yes
+}
+
+# Echoes yes to stdout when CONFIG_VP8_ENCODER is enabled according to
+# vpx_config_option_enabled(); otherwise prints nothing and returns non-zero.
+vp8_encode_available() {
+  test "$(vpx_config_option_enabled CONFIG_VP8_ENCODER)" = "yes" && echo yes
+}
+
+# Echoes yes to stdout when CONFIG_VP9_DECODER is enabled according to
+# vpx_config_option_enabled(); otherwise prints nothing and returns non-zero.
+vp9_decode_available() {
+  test "$(vpx_config_option_enabled CONFIG_VP9_DECODER)" = "yes" && echo yes
+}
+
+# Echoes yes to stdout when CONFIG_VP9_ENCODER is enabled according to
+# vpx_config_option_enabled(); otherwise prints nothing and returns non-zero.
+vp9_encode_available() {
+  test "$(vpx_config_option_enabled CONFIG_VP9_ENCODER)" = "yes" && echo yes
+}
+
+# Echoes yes to stdout when CONFIG_WEBM_IO is enabled according to
+# vpx_config_option_enabled(); otherwise prints nothing and returns non-zero.
+webm_io_available() {
+  test "$(vpx_config_option_enabled CONFIG_WEBM_IO)" = "yes" && echo yes
+}
+
+# Filters the whitespace separated strings in $1 with the extended regular
+# expression $2 and echoes the result to stdout. When $3 is present (non-empty)
+# strings matching the filter are excluded; when $3 is omitted only matching
+# strings are included. An empty $2 returns $1 unfiltered.
+filter_strings() {
+  strings=${1}
+  filter=${2}
+  exclude=${3}
+  # Start clean: results of an earlier call must not leak into this one.
+  filtered_strings=""
+
+  if [ -n "${exclude}" ]; then
+    # The caller wants matches removed: grep inverts matching with -v.
+    exclude='-v'
+  else
+    unset exclude
+  fi
+
+  if [ -n "${filter}" ]; then
+    for s in ${strings}; do
+      if echo "${s}" | grep -E -q ${exclude} -e "${filter}" 2> /dev/null; then
+        filtered_strings="${filtered_strings} ${s}"
+      fi
+    done
+  else
+    filtered_strings="${strings}"
+  fi
+  echo "${filtered_strings}"
+}
+
+# Runs user test functions passed via positional parameters one and two.
+# Functions in positional parameter one are environment verification functions;
+# they run (after verify_vpx_test_environment) whenever at least one test from
+# positional parameter two survives the filters described in vpx_test_usage().
+run_tests() {
+  local env_tests="verify_vpx_test_environment $1"
+  local tests_to_filter="$2"
+  local test_name="${VPX_TEST_NAME}"
+
+  if [ -z "${test_name}" ]; then
+    test_name="$(basename "${0%.*}")"
+  fi
+
+  if [ "${VPX_TEST_RUN_DISABLED_TESTS}" != "yes" ]; then
+    # Filter out DISABLED tests.
+    tests_to_filter=$(filter_strings "${tests_to_filter}" ^DISABLED exclude)
+  fi
+
+  if [ -n "${VPX_TEST_FILTER}" ]; then
+    # Keep only tests matching the user's filter (quoted: must stay one word).
+    tests_to_filter=$(filter_strings "${tests_to_filter}" "${VPX_TEST_FILTER}")
+  fi
+
+  # User requested test listing: Dump test names and return.
+  if [ "${VPX_TEST_LIST_TESTS}" = "yes" ]; then
+    for test_name in $tests_to_filter; do
+      echo "${test_name}"
+    done
+    return
+  fi
+
+  # Don't bother with the environment tests if everything else was disabled.
+  [ -z "${tests_to_filter}" ] && return
+
+  # Combine environment and actual tests.
+  local tests_to_run="${env_tests} ${tests_to_filter}"
+
+  check_git_hashes
+
+  # Run tests. A failing test aborts the script here because of 'set -e'.
+  for test in ${tests_to_run}; do
+    test_begin "${test}"
+    vlog "  RUN  ${test}"
+    "${test}"
+    vlog "  PASS ${test}"
+    test_end "${test}"
+  done
+
+  local tested_config="$(test_configuration_target) @ $(current_hash)"
+  echo "${test_name}: Done, all tests pass for ${tested_config}."
+}
+
+vpx_test_usage() {  # Prints the command line help text to stdout.
+cat << EOF
+  Usage: ${0##*/} [arguments]
+    --bin-path <path to libvpx binaries directory>
+    --config-path <path to libvpx config directory>
+    --filter <filter>: User test filter. Only tests matching filter are run.
+    --run-disabled-tests: Run disabled tests.
+    --help: Display this message and exit.
+    --test-data-path <path to libvpx test data directory>
+    --show-program-output: Shows output from all programs being tested.
+    --prefix: Allows for a user specified prefix to be inserted before all test
+              programs. Grants the ability, for example, to run test programs
+              within valgrind.
+    --list-tests: List all test names and exit without actually running tests.
+    --verbose: Verbose output.
+
+    When the --bin-path option is not specified the script attempts to use
+    \$LIBVPX_BIN_PATH and then the current directory.
+
+    When the --config-path option is not specified the script attempts to use
+    \$LIBVPX_CONFIG_PATH and then the current directory.
+
+    When the --test-data-path option is not specified the script attempts to use
+    \$LIBVPX_TEST_DATA_PATH and then the current directory.
+EOF
+}
+
+# Succeeds only when LIBVPX_BIN_PATH, LIBVPX_CONFIG_PATH and
+# LIBVPX_TEST_DATA_PATH are all non-empty strings. Returns non-zero (failure)
+# as soon as one of them is empty. Only emptiness is tested here, not whether
+# the paths exist.
+vpx_test_check_environment() {
+  [ -n "${LIBVPX_BIN_PATH}" ] \
+    && [ -n "${LIBVPX_CONFIG_PATH}" ] \
+    && [ -n "${LIBVPX_TEST_DATA_PATH}" ]
+}
+
+# Parse the command line. Parsing stops at the first empty argument.
+while [ -n "$1" ]; do
+  case "$1" in
+    --bin-path)
+      LIBVPX_BIN_PATH="$2"
+      shift  # drops the option name; the shift after esac drops its value
+      ;;
+    --config-path)
+      LIBVPX_CONFIG_PATH="$2"
+      shift
+      ;;
+    --filter)
+      VPX_TEST_FILTER="$2"
+      shift
+      ;;
+    --run-disabled-tests)
+      VPX_TEST_RUN_DISABLED_TESTS=yes
+      ;;
+    --help)
+      vpx_test_usage
+      exit
+      ;;
+    --test-data-path)
+      LIBVPX_TEST_DATA_PATH="$2"
+      shift
+      ;;
+    --prefix)
+      VPX_TEST_PREFIX="$2"
+      shift
+      ;;
+    --verbose)
+      VPX_TEST_VERBOSE_OUTPUT=yes
+      ;;
+    --show-program-output)
+      devnull=  # empties the redirection string initialised at the top
+      ;;
+    --list-tests)
+      VPX_TEST_LIST_TESTS=yes
+      ;;
+    *)
+      vpx_test_usage  # unrecognised argument: print the help text and fail
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+# Default each path that is unset or empty to the current directory, so the
+# tests can run from a build directory without arguments on *nix/macosx.
+LIBVPX_BIN_PATH="${LIBVPX_BIN_PATH:-.}"
+LIBVPX_CONFIG_PATH="${LIBVPX_CONFIG_PATH:-.}"
+LIBVPX_TEST_DATA_PATH="${LIBVPX_TEST_DATA_PATH:-.}"
+
+# Create a per-process output directory under $TMPDIR, $TEMPDIR or /tmp.
+if [ -n "${TMPDIR}" ]; then
+  VPX_TEST_TEMP_ROOT="${TMPDIR}"
+elif [ -n "${TEMPDIR}" ]; then
+  VPX_TEST_TEMP_ROOT="${TEMPDIR}"
+else
+  VPX_TEST_TEMP_ROOT=/tmp
+fi
+
+VPX_TEST_OUTPUT_DIR="${VPX_TEST_TEMP_ROOT}/vpx_test_$$"  # $$: this shell's PID
+
+if ! mkdir -p "${VPX_TEST_OUTPUT_DIR}" || \
+   [ ! -d "${VPX_TEST_OUTPUT_DIR}" ]; then
+  echo "${0##*/}: Cannot create output directory, giving up."
+  echo "${0##*/}:   VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}"
+  exit 1
+fi
+
+if [ "$(is_windows_target)" = "yes" ]; then
+  VPX_TEST_EXE_SUFFIX=".exe"  # appended to tool names when building paths
+fi
+
+# Variables shared by tests.
+VP8_IVF_FILE="${LIBVPX_TEST_DATA_PATH}/vp80-00-comprehensive-001.ivf"
+VP9_IVF_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-09-subpixel-00.ivf"
+
+VP9_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-00-quantizer-00.webm"
+VP9_FPM_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-07-frame_parallel-1.webm"
+VP9_LT_50_FRAMES_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-02-size-32x08.webm"
+
+YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
+YUV_RAW_INPUT_WIDTH=352
+YUV_RAW_INPUT_HEIGHT=288
+
+Y4M_NOSQ_PAR_INPUT="${LIBVPX_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m"
+Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
+
+# Setup a trap function to clean up after tests complete.
+trap cleanup EXIT
+
+vlog "$(basename "${0%.*}") test configuration:
+  LIBVPX_BIN_PATH=${LIBVPX_BIN_PATH}
+  LIBVPX_CONFIG_PATH=${LIBVPX_CONFIG_PATH}
+  LIBVPX_TEST_DATA_PATH=${LIBVPX_TEST_DATA_PATH}
+  VP8_IVF_FILE=${VP8_IVF_FILE}
+  VP9_IVF_FILE=${VP9_IVF_FILE}
+  VP9_WEBM_FILE=${VP9_WEBM_FILE}
+  VPX_TEST_EXE_SUFFIX=${VPX_TEST_EXE_SUFFIX}
+  VPX_TEST_FILTER=${VPX_TEST_FILTER}
+  VPX_TEST_LIST_TESTS=${VPX_TEST_LIST_TESTS}
+  VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}
+  VPX_TEST_PREFIX=${VPX_TEST_PREFIX}
+  VPX_TEST_RUN_DISABLED_TESTS=${VPX_TEST_RUN_DISABLED_TESTS}
+  VPX_TEST_SHOW_PROGRAM_OUTPUT=${VPX_TEST_SHOW_PROGRAM_OUTPUT}
+  VPX_TEST_TEMP_ROOT=${VPX_TEST_TEMP_ROOT}
+  VPX_TEST_VERBOSE_OUTPUT=${VPX_TEST_VERBOSE_OUTPUT}
+  YUV_RAW_INPUT=${YUV_RAW_INPUT}
+  YUV_RAW_INPUT_WIDTH=${YUV_RAW_INPUT_WIDTH}
+  YUV_RAW_INPUT_HEIGHT=${YUV_RAW_INPUT_HEIGHT}
+  Y4M_NOSQ_PAR_INPUT=${Y4M_NOSQ_PAR_INPUT}"
+
+fi  # End $VPX_TEST_TOOLS_COMMON_SH pseudo include guard.
diff --git a/libs/libvpx/test/twopass_encoder.sh b/libs/libvpx/test/twopass_encoder.sh
new file mode 100755
index 0000000000..1189e5131c
--- /dev/null
+++ b/libs/libvpx/test/twopass_encoder.sh
@@ -0,0 +1,62 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx twopass_encoder example. To add new tests to this
+##  file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to twopass_encoder_tests (on a new line).
+##
+. "$(dirname "$0")/tools_common.sh"
+
+# Environment check: the raw YUV input named by $YUV_RAW_INPUT must exist,
+# otherwise an explanation is printed and 1 is returned.
+twopass_encoder_verify_environment() {
+  [ -e "${YUV_RAW_INPUT}" ] && return 0
+  echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+  return 1
+}
+
+# Runs twopass_encoder with codec $1; fails unless the output file appears.
+twopass_encoder() {
+  local encoder="${LIBVPX_BIN_PATH}/twopass_encoder${VPX_TEST_EXE_SUFFIX}"
+  local codec="$1"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/twopass_encoder_${codec}.ivf"
+
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi
+  # eval re-parses the line, so redirections stored in ${devnull} take effect.
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+      ${devnull}
+
+  [ -e "${output_file}" ] || return 1
+}
+
+twopass_encoder_vp8() {
+  # Encodes with VP8 when the VP8 encoder is configured in; a build without it
+  # passes trivially.
+  [ "$(vp8_encode_available)" != "yes" ] || twopass_encoder vp8 || return 1
+}
+
+# TODO(tomfinegan): Add a frame limit param to twopass_encoder and enable this
+# test. VP9 is just too slow right now: This test takes 31m16s+ on a fast
+# machine.
+DISABLED_twopass_encoder_vp9() {
+  # VP9 counterpart of the VP8 test: encodes only when the VP9 encoder is
+  # configured in, and passes trivially otherwise.
+  [ "$(vp9_encode_available)" != "yes" ] || twopass_encoder vp9 || return 1
+}
+
+twopass_encoder_tests="twopass_encoder_vp8
+                       DISABLED_twopass_encoder_vp9"
+# First argument: environment check function; second: the test functions.
+run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
diff --git a/libs/libvpx/test/user_priv_test.cc b/libs/libvpx/test/user_priv_test.cc
new file mode 100644
index 0000000000..8512d88cf4
--- /dev/null
+++ b/libs/libvpx/test/user_priv_test.cc
@@ -0,0 +1,100 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "test/acm_random.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vpx/vp8.h"
+
+namespace {
+
+using std::string;
+using libvpx_test::ACMRandom;
+
+#if CONFIG_WEBM_IO
+
+void CheckUserPrivateData(void *user_priv, int *target) {
+  // Only the pointer values are compared; neither pointer is dereferenced.
+  EXPECT_EQ(reinterpret_cast<void *>(target), user_priv) <<
+      "user_priv pointer value does not match.";
+}
+
+// Decodes |filename| with the VP9 decoder and returns the MD5 of all decoded
+// frames. The first frame is decoded with a NULL user_priv, every later frame
+// with the address of |frame_num|; each returned image must carry that same
+// pointer value, while images from VP9_GET_REFERENCE must carry NULL.
+string DecodeFile(const string &filename) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  libvpx_test::WebMVideoSource video(filename);
+  video.Init();
+
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+  libvpx_test::VP9Decoder decoder(cfg, 0);
+
+  libvpx_test::MD5 md5;
+  int frame_num = 0;
+  for (video.Begin(); !::testing::Test::HasFailure() && video.cxdata();
+       video.Next()) {
+    void *user_priv = reinterpret_cast<void *>(&frame_num);
+    const vpx_codec_err_t res =
+        decoder.DecodeFrame(video.cxdata(), video.frame_size(),
+                            (frame_num == 0) ? NULL : user_priv);
+    if (res != VPX_CODEC_OK) {
+      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+      break;
+    }
+    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
+    const vpx_image_t *img = NULL;
+
+    // Get decompressed data.
+    while ((img = dec_iter.Next())) {
+      if (frame_num == 0) {
+        CheckUserPrivateData(img->user_priv, NULL);
+      } else {
+        CheckUserPrivateData(img->user_priv, &frame_num);
+
+        // Also test ctrl_get_reference api.
+        struct vp9_ref_frame ref;
+        // Randomly fetch one of reference frames 0..2.
+        ref.idx = rnd.Rand8() % 3;
+        decoder.Control(VP9_GET_REFERENCE, &ref);
+
+        CheckUserPrivateData(ref.img.user_priv, NULL);
+      }
+      md5.Add(img);
+    }
+
+    frame_num++;
+  }
+  return string(md5.Get());
+}
+
+TEST(UserPrivTest, VideoDecode) {
+  // No tiles or frame parallel; decoding this clip exercises the user_priv
+  // checks in DecodeFile() and must yield the expected MD5.
+  EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc",
+               DecodeFile("vp90-2-03-size-226x226.webm").c_str());
+}
+
+#endif  // CONFIG_WEBM_IO
+
+}  // namespace
diff --git a/libs/libvpx/test/util.h b/libs/libvpx/test/util.h
new file mode 100644
index 0000000000..b27bffa94b
--- /dev/null
+++ b/libs/libvpx/test/util.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_UTIL_H_
+#define TEST_UTIL_H_
+
+#include <stdio.h>
+#include <math.h>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx/vpx_image.h"
+
+// Macros
+#define GET_PARAM(k) std::tr1::get< k >(GetParam())
+
+// Returns the PSNR in dB of the Y planes of two images that share format and
+// display size; 100.0 stands in when the mean squared error is not positive.
+inline double compute_psnr(const vpx_image_t *img1, const vpx_image_t *img2) {
+  assert((img1->fmt == img2->fmt) &&
+         (img1->d_w == img2->d_w) &&
+         (img1->d_h == img2->d_h));
+
+  const unsigned int rows = img1->d_h;
+  const unsigned int cols = img1->d_w;
+  int64_t sum_sq_err = 0;
+  for (unsigned int r = 0; r < rows; ++r) {
+    for (unsigned int c = 0; c < cols; ++c) {
+      const int64_t diff =
+          img1->planes[VPX_PLANE_Y][r * img1->stride[VPX_PLANE_Y] + c] -
+          img2->planes[VPX_PLANE_Y][r * img2->stride[VPX_PLANE_Y] + c];
+      sum_sq_err += diff * diff;
+    }
+  }
+
+  const double mse = static_cast<double>(sum_sq_err) / (cols * rows);
+  if (mse > 0.0) return 10 * log10(255.0 * 255.0 / mse);
+  return 100.0;
+}
+
+#endif  // TEST_UTIL_H_
diff --git a/libs/libvpx/test/variance_test.cc b/libs/libvpx/test/variance_test.cc
new file mode 100644
index 0000000000..6f50f78f2e
--- /dev/null
+++ b/libs/libvpx/test/variance_test.cc
@@ -0,0 +1,1353 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdlib>
+#include <new>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+namespace {
+
+typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride,
+                                        const uint8_t *b, int b_stride,
+                                        unsigned int *sse);
+typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride,
+                                         int xoffset, int yoffset,
+                                         const uint8_t *b, int b_stride,
+                                         unsigned int *sse);
+typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride,
+                                            int xoffset, int yoffset,
+                                            const uint8_t *b, int b_stride,
+                                            uint32_t *sse,
+                                            const uint8_t *second_pred);
+typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
+                                      const uint8_t *b, int b_stride);
+typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
+
+
+using ::std::tr1::get;
+using ::std::tr1::make_tuple;
+using ::std::tr1::tuple;
+using libvpx_test::ACMRandom;
+
+// Truncates high bit depth results by downshifting (with rounding) by:
+//   2 * (bit_depth - 8) bits for sse
+//   (bit_depth - 8) bits for se
+// Half of the divisor is added before each shift so that the shift rounds
+// rather than truncates. Both values are updated in place through the
+// pointers. VPX_BITS_8 and any depth not listed below leave both values
+// untouched.
+static void RoundHighBitDepth(int bit_depth, int64_t *se, uint64_t *sse) {
+  if (bit_depth == VPX_BITS_12) {
+    // 12 bit: sse >> 8, se >> 4.
+    *sse = (*sse + 128) >> 8;
+    *se = (*se + 8) >> 4;
+  } else if (bit_depth == VPX_BITS_10) {
+    // 10 bit: sse >> 4, se >> 2.
+    *sse = (*sse + 8) >> 4;
+    *se = (*se + 2) >> 2;
+  }
+}
+
+// Reference implementation: sum of the squares of the 256 samples at |src|,
+// accumulated in an unsigned int.
+static unsigned int mb_ss_ref(const int16_t *src) {
+  unsigned int total = 0;
+  for (const int16_t *p = src; p != src + 256; ++p) total += *p * *p;
+  return total;
+}
+
+static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref,
+                             int l2w, int l2h, int src_stride_coeff,
+                             int ref_stride_coeff, uint32_t *sse_ptr,
+                             bool use_high_bit_depth_,
+                             vpx_bit_depth_t bit_depth) {
+  // Reference variance of a (1 << l2w) x (1 << l2h) block; each row stride is
+  // the block width times the matching *_stride_coeff. Stores the (bit depth
+  // rounded) sum of squared errors in |*sse_ptr| and returns the variance.
+  const int w = 1 << l2w, h = 1 << l2h;
+  int64_t sum = 0;
+  uint64_t sum_sq = 0;
+  for (int row = 0; row < h; ++row) {
+    const int src_row = w * row * src_stride_coeff;
+    const int ref_row = w * row * ref_stride_coeff;
+    for (int col = 0; col < w; ++col) {
+      int diff = 0;
+      if (!use_high_bit_depth_) {
+        diff = ref[ref_row + col] - src[src_row + col];
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        diff = CONVERT_TO_SHORTPTR(ref)[ref_row + col] -
+               CONVERT_TO_SHORTPTR(src)[src_row + col];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+      sum += diff;
+      sum_sq += diff * diff;
+    }
+  }
+  RoundHighBitDepth(bit_depth, &sum, &sum_sq);
+  *sse_ptr = static_cast<uint32_t>(sum_sq);
+  const int64_t mean_term = (sum * sum) >> (l2w + l2h);
+  return static_cast<uint32_t>(sum_sq - mean_term);
+}
+
+/* The subpel reference functions differ from the codec version in one aspect:
+ * they calculate the bilinear factors directly instead of using a lookup table
+ * and therefore upshift xoff and yoff by 1. Only every other calculated value
+ * is used so the codec version shrinks the table to save space and maintain
+ * compatibility with vp8. |ref| is read with a row stride of w + 1 and |src|
+ * with a row stride of w; the rounded sse is stored in |*sse_ptr|.
+ */
+static uint32_t subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
+                                    int l2w, int l2h, int xoff, int yoff,
+                                    uint32_t *sse_ptr,
+                                    bool use_high_bit_depth_,
+                                    vpx_bit_depth_t bit_depth) {
+  int64_t se = 0;
+  uint64_t sse = 0;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+
+  xoff <<= 1;
+  yoff <<= 1;
+
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      // Bilinear interpolation at a 16th pel step.
+      if (!use_high_bit_depth_) {
+        const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff = r - src[w * y + x];
+        se += diff;
+        sse += diff * diff;
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+        uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+        const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+        const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+        const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+        const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+        const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+        const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+        const int r = a + (((b - a) * yoff + 8) >> 4);
+        const int diff = r - src16[w * y + x];
+        se += diff;
+        sse += diff * diff;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+  }
+  RoundHighBitDepth(bit_depth, &se, &sse);
+  *sse_ptr = static_cast<uint32_t>(sse);
+  return static_cast<uint32_t>(sse -
+                               ((static_cast<int64_t>(se) * se) >>
+                                (l2w + l2h)));
+}
+
+class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
+ public:  // Fixture parameterized on the sum-of-squares function under test.
+  SumOfSquaresTest() : func_(GetParam()) {}
+
+  virtual ~SumOfSquaresTest() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void ConstTest();  // constant-valued blocks
+  void RefTest();    // random blocks compared against mb_ss_ref
+
+  SumOfSquaresFunction func_;  // function under test, from GetParam()
+  ACMRandom rnd_;              // random source used by RefTest
+};
+
+void SumOfSquaresTest::ConstTest() {
+  // A block holding one constant v must sum to 256 * v * v, for each v in
+  // [0, 255].
+  int16_t mem[256];
+  for (int v = 0; v < 256; ++v) {
+    for (int16_t *p = mem; p != mem + 256; ++p) *p = v;
+    unsigned int res;
+    ASM_REGISTER_STATE_CHECK(res = func_(mem));
+    EXPECT_EQ(256u * (v * v), res);
+  }
+}
+
+void SumOfSquaresTest::RefTest() {  // random blocks vs. mb_ss_ref
+  int16_t mem[256];
+  for (int i = 0; i < 100; ++i) {  // 100 independent rounds
+    for (int j = 0; j < 256; ++j) {
+      mem[j] = rnd_.Rand8() - rnd_.Rand8();  // difference of two Rand8 draws
+    }
+
+    const unsigned int expected = mb_ss_ref(mem);
+    unsigned int res;
+    ASM_REGISTER_STATE_CHECK(res = func_(mem));
+    EXPECT_EQ(expected, res);
+  }
+}
+
+template<typename VarianceFunctionType>
+class VarianceTest
+    : public ::testing::TestWithParam<tuple<int, int,
+                                            VarianceFunctionType, int> > {
+ public:
+  virtual void SetUp() {  // unpacks the parameter tuple, allocates buffers
+    const tuple<int, int, VarianceFunctionType, int>& params = this->GetParam();
+    log2width_  = get<0>(params);
+    width_ = 1 << log2width_;
+    log2height_ = get<1>(params);
+    height_ = 1 << log2height_;
+    variance_ = get<2>(params);
+    if (get<3>(params)) {  // non-zero 4th parameter: high bit depth run
+      bit_depth_ = static_cast<vpx_bit_depth_t>(get<3>(params));
+      use_high_bit_depth_ = true;
+    } else {
+      bit_depth_ = VPX_BITS_8;
+      use_high_bit_depth_ = false;
+    }
+    mask_ = (1 << bit_depth_) - 1;  // low bit_depth_ bits set
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    block_size_ = width_ * height_;
+    if (!use_high_bit_depth_) {
+      // Both buffers hold two blocks' worth of samples.
+      src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2));
+      ref_ = new uint8_t[block_size_ * 2];
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+          vpx_memalign(16, block_size_ * 2 * sizeof(uint16_t))));
+      ref_ = CONVERT_TO_BYTEPTR(new uint16_t[block_size_ * 2]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    ASSERT_TRUE(src_ != NULL);
+    ASSERT_TRUE(ref_ != NULL);
+  }
+
+  virtual void TearDown() {  // frees each buffer the way SetUp allocated it
+    if (!use_high_bit_depth_) {
+      vpx_free(src_);
+      delete[] ref_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      vpx_free(CONVERT_TO_SHORTPTR(src_));
+      delete[] CONVERT_TO_SHORTPTR(ref_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void ZeroTest();
+  void RefTest();
+  void RefStrideTest();
+  void OneQuarterTest();
+
+  ACMRandom rnd_;
+  uint8_t *src_;  // vpx_memalign'd in SetUp
+  uint8_t *ref_;  // new[]'d in SetUp
+  int width_, log2width_;
+  int height_, log2height_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;  // (1 << bit_depth_) - 1
+  bool use_high_bit_depth_;
+  int block_size_;  // width_ * height_
+  VarianceFunctionType variance_;  // function under test
+};
+
+template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::ZeroTest() {  // constant blocks
+  for (int i = 0; i <= 255; ++i) {  // i: value filling the whole src block
+    if (!use_high_bit_depth_) {
+      memset(src_, i, block_size_);
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      vpx_memset16(CONVERT_TO_SHORTPTR(src_), i << (bit_depth_ - 8),
+                   block_size_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    for (int j = 0; j <= 255; ++j) {  // j: value filling the whole ref block
+      if (!use_high_bit_depth_) {
+        memset(ref_, j, block_size_);
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_), j  << (bit_depth_ - 8),
+                     block_size_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+      unsigned int sse;
+      unsigned int var;
+      ASM_REGISTER_STATE_CHECK(
+          var = variance_(src_, width_, ref_, width_, &sse));
+      EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j;
+    }
+  }
+}
+
+// Checks variance_ against variance_ref on ten randomly filled blocks.
+template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::RefTest() {
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < block_size_; j++) {
+      if (!use_high_bit_depth_) {
+        src_[j] = rnd_.Rand8();
+        ref_[j] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        // Bitwise AND with mask_; '&&' would collapse every sample to 0 or 1.
+        CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+        CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+    unsigned int sse1, sse2, var1;
+    const int stride_coeff = 1;
+    ASM_REGISTER_STATE_CHECK(
+        var1 = variance_(src_, width_, ref_, width_, &sse1));
+    const unsigned int var2 = variance_ref(
+        src_, ref_, log2width_, log2height_, stride_coeff, stride_coeff,
+        &sse2, use_high_bit_depth_, bit_depth_);
+    EXPECT_EQ(sse1, sse2);
+    EXPECT_EQ(var1, var2);
+  }
+}
+
+// Checks variance_ against variance_ref with each stride either 0 or width_.
+template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::RefStrideTest() {
+  for (int i = 0; i < 10; ++i) {
+    int ref_stride_coeff = i % 2;
+    int src_stride_coeff = (i >> 1) % 2;
+    for (int j = 0; j < block_size_; j++) {
+      int ref_ind = (j / width_) * ref_stride_coeff * width_ + j % width_;
+      int src_ind = (j / width_) * src_stride_coeff * width_ + j % width_;
+      if (!use_high_bit_depth_) {
+        src_[src_ind] = rnd_.Rand8();
+        ref_[ref_ind] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        // Bitwise AND with mask_; '&&' would collapse every sample to 0 or 1.
+        CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask_;
+        CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask_;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+    unsigned int sse1, sse2, var1;
+    ASM_REGISTER_STATE_CHECK(
+        var1 = variance_(src_, width_ * src_stride_coeff,
+                         ref_, width_ * ref_stride_coeff, &sse1));
+    const unsigned int var2 = variance_ref(src_, ref_, log2width_,
+                                           log2height_, src_stride_coeff,
+                                           ref_stride_coeff, &sse2,
+                                           use_high_bit_depth_, bit_depth_);
+    EXPECT_EQ(sse1, sse2);
+    EXPECT_EQ(var1, var2);
+  }
+}
+
+// src_ is flat at the peak level; ref_ is peak in its first half, zero after.
+template<typename VarianceFunctionType>
+void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
+  const int half_len = block_size_ / 2;
+  if (use_high_bit_depth_) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    const int peak = 255 << (bit_depth_ - 8);
+    vpx_memset16(CONVERT_TO_SHORTPTR(src_), peak, block_size_);
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_), peak, half_len);
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half_len, 0, half_len);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  } else {
+    memset(src_, 255, block_size_);
+    memset(ref_, 255, half_len);
+    memset(ref_ + half_len, 0, half_len);
+  }
+  unsigned int sse, var;
+  ASM_REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
+  const unsigned int expected = block_size_ * 255 * 255 / 4;
+  EXPECT_EQ(expected, var);
+}
+
+// Fixture for MSE/SSE kernels; parameter is (log2 width, log2 height, func).
+template<typename MseFunctionType>
+class MseTest
+    : public ::testing::TestWithParam<tuple<int, int, MseFunctionType> > {
+ public:
+  virtual void SetUp() {
+    const tuple<int, int, MseFunctionType>& params = this->GetParam();
+    log2width_  = get<0>(params);
+    width_ = 1 << log2width_;
+    log2height_ = get<1>(params);
+    height_ = 1 << log2height_;
+    mse_ = get<2>(params);
+    // Was rnd(seed), which calls operator() instead of reseeding via Reset().
+    rnd.Reset(ACMRandom::DeterministicSeed());
+    block_size_ = width_ * height_;
+    src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+    ref_ = new uint8_t[block_size_];
+    ASSERT_TRUE(src_ != NULL);
+    ASSERT_TRUE(ref_ != NULL);
+  }
+
+  virtual void TearDown() {
+    vpx_free(src_);  // src_ came from vpx_memalign
+    delete[] ref_;   // ref_ came from new[]
+    libvpx_test::ClearSystemState();
+  }
+ protected:
+  void RefTest_mse();  // random data, kernel writes SSE through a pointer
+  void RefTest_sse();  // random data, kernel returns the SSE
+  void MaxTest_mse();  // all-255 vs all-0, pointer-output kernel
+  void MaxTest_sse();  // all-255 vs all-0, value-returning kernel
+
+  ACMRandom rnd;
+  uint8_t* src_;
+  uint8_t* ref_;
+  int width_, log2width_;
+  int height_, log2height_;
+  int block_size_;  // width_ * height_
+  MseFunctionType mse_;
+};
+
+// Ten random blocks: the SSE written by mse_ must equal variance_ref's SSE.
+template<typename MseFunctionType>
+void MseTest<MseFunctionType>::RefTest_mse() {
+  for (int trial = 0; trial != 10; ++trial) {
+    for (int idx = 0; idx < block_size_; ++idx) {
+      src_[idx] = rnd.Rand8();
+      ref_[idx] = rnd.Rand8();
+    }
+    unsigned int sse1, sse2;
+    ASM_REGISTER_STATE_CHECK(mse_(src_, width_, ref_, width_, &sse1));
+    variance_ref(src_, ref_, log2width_, log2height_, 1, 1, &sse2, false,
+                 VPX_BITS_8);
+    EXPECT_EQ(sse1, sse2);
+  }
+}
+
+// Ten random blocks: the value returned by mse_ must equal variance_ref's SSE.
+template<typename MseFunctionType>
+void MseTest<MseFunctionType>::RefTest_sse() {
+  for (int trial = 0; trial != 10; ++trial) {
+    for (int idx = 0; idx < block_size_; ++idx) {
+      src_[idx] = rnd.Rand8();
+      ref_[idx] = rnd.Rand8();
+    }
+    unsigned int var1, sse2;
+    ASM_REGISTER_STATE_CHECK(var1 = mse_(src_, width_, ref_, width_));
+    // Reference SSE with both stride coefficients set to 1.
+    variance_ref(src_, ref_, log2width_, log2height_, 1, 1, &sse2, false,
+                 VPX_BITS_8);
+    EXPECT_EQ(var1, sse2);
+  }
+}
+
+template<typename MseFunctionType>
+void MseTest<MseFunctionType>::MaxTest_mse() {
+  const unsigned int expected = block_size_ * 255 * 255;  // 255^2 per pixel
+  memset(ref_, 0, block_size_);
+  memset(src_, 255, block_size_);  // every pixel differs from ref_ by 255
+  unsigned int sse;
+  ASM_REGISTER_STATE_CHECK(mse_(src_, width_, ref_, width_, &sse));
+  EXPECT_EQ(expected, sse);
+}
+
+template<typename MseFunctionType>
+void MseTest<MseFunctionType>::MaxTest_sse() {
+  const unsigned int expected = block_size_ * 255 * 255;  // 255^2 per pixel
+  memset(ref_, 0, block_size_);
+  memset(src_, 255, block_size_);  // every pixel differs from ref_ by 255
+  unsigned int var;
+  ASM_REGISTER_STATE_CHECK(var = mse_(src_, width_, ref_, width_));
+  EXPECT_EQ(expected, var);
+}
+
+// Reference: bilinear sub-pixel prediction averaged with second_pred vs. src.
+static uint32_t subpel_avg_variance_ref(const uint8_t *ref,
+                                        const uint8_t *src,
+                                        const uint8_t *second_pred,
+                                        int l2w, int l2h,
+                                        int xoff, int yoff,
+                                        uint32_t *sse_ptr,
+                                        bool use_high_bit_depth,
+                                        vpx_bit_depth_t bit_depth) {
+  int64_t se = 0;
+  uint64_t sse = 0;
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+  xoff <<= 1;  // doubled so the offsets act as weights out of 16 below
+  yoff <<= 1;
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      // Blend the 2x2 neighbourhood; "+ 8 >> 4" is a rounded divide by 16.
+      if (!use_high_bit_depth) {
+        const int tl = ref[(w + 1) * y + x];
+        const int tr = ref[(w + 1) * y + x + 1];
+        const int bl = ref[(w + 1) * (y + 1) + x];
+        const int br = ref[(w + 1) * (y + 1) + x + 1];
+        const int top = tl + (((tr - tl) * xoff + 8) >> 4);
+        const int bot = bl + (((br - bl) * xoff + 8) >> 4);
+        const int pred = top + (((bot - top) * yoff + 8) >> 4);
+        const int avg = (pred + second_pred[w * y + x] + 1) >> 1;
+        const int diff = avg - src[w * y + x];
+        se += diff;
+        sse += diff * diff;
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        const uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref);
+        const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src);
+        const uint16_t *const sec16 = CONVERT_TO_SHORTPTR(second_pred);
+        const int tl = ref16[(w + 1) * y + x];
+        const int tr = ref16[(w + 1) * y + x + 1];
+        const int bl = ref16[(w + 1) * (y + 1) + x];
+        const int br = ref16[(w + 1) * (y + 1) + x + 1];
+        const int top = tl + (((tr - tl) * xoff + 8) >> 4);
+        const int bot = bl + (((br - bl) * xoff + 8) >> 4);
+        const int pred = top + (((bot - top) * yoff + 8) >> 4);
+        const int avg = (pred + sec16[w * y + x] + 1) >> 1;
+        const int diff = avg - src16[w * y + x];
+        se += diff;
+        sse += diff * diff;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+  }
+  RoundHighBitDepth(bit_depth, &se, &sse);
+  *sse_ptr = static_cast<uint32_t>(sse);
+  const int64_t mean_sq = (static_cast<int64_t>(se) * se) >> (l2w + l2h);
+  return static_cast<uint32_t>(sse - mean_sq);
+}
+
+// Fixture for (averaged) sub-pixel variance kernels.  Parameter tuple is
+// (log2 width, log2 height, function, bit depth); depth 0 means 8-bit buffers.
+template<typename SubpelVarianceFunctionType>
+class SubpelVarianceTest
+    : public ::testing::TestWithParam<tuple<int, int,
+                                            SubpelVarianceFunctionType, int> > {
+ public:
+  virtual void SetUp() {
+    const tuple<int, int, SubpelVarianceFunctionType, int>& params =
+        this->GetParam();
+    subpel_variance_ = get<2>(params);
+    log2width_ = get<0>(params);
+    log2height_ = get<1>(params);
+    width_ = 1 << log2width_;
+    height_ = 1 << log2height_;
+    block_size_ = width_ * height_;
+    const int depth_param = get<3>(params);
+    use_high_bit_depth_ = depth_param != 0;
+    bit_depth_ = use_high_bit_depth_
+                     ? static_cast<vpx_bit_depth_t>(depth_param) : VPX_BITS_8;
+    mask_ = (1 << bit_depth_) - 1;
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    // ref_ holds (width_ + 1) * (height_ + 1) samples: one extra row/column.
+    const int ref_len = block_size_ + width_ + height_ + 1;
+    if (!use_high_bit_depth_) {
+      src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+      sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+      ref_ = new uint8_t[ref_len];
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      const size_t bytes = block_size_ * sizeof(uint16_t);
+      src_ = CONVERT_TO_BYTEPTR(
+          reinterpret_cast<uint16_t *>(vpx_memalign(16, bytes)));
+      sec_ = CONVERT_TO_BYTEPTR(
+          reinterpret_cast<uint16_t *>(vpx_memalign(16, bytes)));
+      ref_ = CONVERT_TO_BYTEPTR(new uint16_t[ref_len]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    ASSERT_TRUE(src_ != NULL);
+    ASSERT_TRUE(sec_ != NULL);
+    ASSERT_TRUE(ref_ != NULL);
+  }
+
+  // Frees each buffer with the deallocator that matches its allocation.
+  virtual void TearDown() {
+    if (!use_high_bit_depth_) {
+      vpx_free(src_);
+      vpx_free(sec_);
+      delete[] ref_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      vpx_free(CONVERT_TO_SHORTPTR(src_));
+      vpx_free(CONVERT_TO_SHORTPTR(sec_));
+      delete[] CONVERT_TO_SHORTPTR(ref_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RefTest();
+  void ExtremeRefTest();
+
+  ACMRandom rnd_;
+  uint8_t *src_;
+  uint8_t *ref_;
+  uint8_t *sec_;
+  bool use_high_bit_depth_;
+  vpx_bit_depth_t bit_depth_;
+  int width_, log2width_;
+  int height_, log2height_;
+  int block_size_,  mask_;
+  SubpelVarianceFunctionType subpel_variance_;
+};
+
+// For each of the 8x8 sub-pixel offsets, checks subpel_variance_ against
+// subpel_variance_ref on freshly randomized src_/ref_ buffers.
+template<typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
+  const int ref_len = block_size_ + width_ + height_ + 1;
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth_) {
+        for (int k = 0; k < block_size_; ++k) {
+          src_[k] = rnd_.Rand8();
+        }
+        for (int k = 0; k < ref_len; ++k) {
+          ref_[k] = rnd_.Rand8();
+        }
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        for (int k = 0; k < block_size_; ++k) {
+          CONVERT_TO_SHORTPTR(src_)[k] = rnd_.Rand16() & mask_;
+        }
+        for (int k = 0; k < ref_len; ++k) {
+          CONVERT_TO_SHORTPTR(ref_)[k] = rnd_.Rand16() & mask_;
+        }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+      unsigned int sse1, sse2, var1;
+      ASM_REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y,
+                                                       src_, width_, &sse1));
+      const unsigned int var2 = subpel_variance_ref(
+          ref_, src_, log2width_, log2height_, x, y, &sse2,
+          use_high_bit_depth_, bit_depth_);
+      EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+      EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+    }
+  }
+}
+
+template<typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
+  // Compare against the reference using half-zero / half-maximum buffers.
+  // 8-bit: src is 0 then 255; ref is 255 then 0 (through the padding).
+  // High bit depth: src is mask_ then 0; ref is 0 then mask_ (through the
+  // padding), i.e. the opposite arrangement to the 8-bit case.
+  const int half = block_size_ / 2;
+  const int ref_tail = half + width_ + height_ + 1;
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (use_high_bit_depth_) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_, ref_tail);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        memset(src_, 0, half);
+        memset(src_ + half, 255, half);
+        memset(ref_, 255, half);
+        memset(ref_ + half, 0, ref_tail);
+      }
+      unsigned int sse1, sse2, var1;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1));
+      const unsigned int var2 = subpel_variance_ref(
+          ref_, src_, log2width_, log2height_, x, y, &sse2,
+          use_high_bit_depth_, bit_depth_);
+      EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
+      EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
+    }
+  }
+}
+
+// Averaged-prediction variant: sec_ is randomized alongside src_ and the result
+// is checked against subpel_avg_variance_ref for each of the 8x8 offsets.
+template<>
+void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
+  const int ref_len = block_size_ + width_ + height_ + 1;
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth_) {
+        for (int k = 0; k < block_size_; ++k) {
+          src_[k] = rnd_.Rand8();
+          sec_[k] = rnd_.Rand8();
+        }
+        for (int k = 0; k < ref_len; ++k) {
+          ref_[k] = rnd_.Rand8();
+        }
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        for (int k = 0; k < block_size_; ++k) {
+          CONVERT_TO_SHORTPTR(src_)[k] = rnd_.Rand16() & mask_;
+          CONVERT_TO_SHORTPTR(sec_)[k] = rnd_.Rand16() & mask_;
+        }
+        for (int k = 0; k < ref_len; ++k) {
+          CONVERT_TO_SHORTPTR(ref_)[k] = rnd_.Rand16() & mask_;
+        }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+      unsigned int sse1, sse2, var1;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = subpel_variance_(ref_, width_ + 1, x, y,
+                                  src_, width_, &sse1, sec_));
+      const unsigned int var2 = subpel_avg_variance_ref(
+          ref_, src_, sec_, log2width_, log2height_, x, y, &sse2,
+          use_high_bit_depth_, bit_depth_);
+      EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+      EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+    }
+  }
+}
+
+typedef MseTest<Get4x4SseFunc> VpxSseTest;  // runs the *_sse tests (kernel returns SSE)
+typedef MseTest<VarianceMxNFunc> VpxMseTest;  // runs the *_mse tests (SSE via pointer)
+typedef VarianceTest<VarianceMxNFunc> VpxVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxSubpelAvgVarianceTest;  // uses the RefTest specialization
+
+TEST_P(VpxSseTest, Ref_sse) { RefTest_sse(); }
+TEST_P(VpxSseTest, Max_sse) { MaxTest_sse(); }
+TEST_P(VpxMseTest, Ref_mse) { RefTest_mse(); }
+TEST_P(VpxMseTest, Max_mse) { MaxTest_mse(); }
+TEST_P(VpxVarianceTest, Zero) { ZeroTest(); }
+TEST_P(VpxVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(SumOfSquaresTest, Const) { ConstTest(); }  // fixture defined outside this section
+TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
+TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); }  // no ExtremeRef case is registered for the avg fixture
+
+INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,  // parameter is the bare function
+                        ::testing::Values(vpx_get_mb_ss_c));
+
+INSTANTIATE_TEST_CASE_P(C, VpxSseTest,  // tuple: (log2 width, log2 height, fn)
+                        ::testing::Values(make_tuple(2, 2,
+                                                     &vpx_get4x4sse_cs_c)));
+
+INSTANTIATE_TEST_CASE_P(C, VpxMseTest,  // tuple: (log2 width, log2 height, fn)
+                        ::testing::Values(make_tuple(4, 4, &vpx_mse16x16_c),
+                                          make_tuple(4, 3, &vpx_mse16x8_c),
+                                          make_tuple(3, 4, &vpx_mse8x16_c),
+                                          make_tuple(3, 3, &vpx_mse8x8_c)));
+
+INSTANTIATE_TEST_CASE_P(  // C variance kernels, 64x64 down to 4x4
+    C, VpxVarianceTest,  // presumably (log2 w, log2 h, fn, bit depth) -- SetUp not shown here
+    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_c, 0),
+                      make_tuple(6, 5, &vpx_variance64x32_c, 0),
+                      make_tuple(5, 6, &vpx_variance32x64_c, 0),
+                      make_tuple(5, 5, &vpx_variance32x32_c, 0),
+                      make_tuple(5, 4, &vpx_variance32x16_c, 0),
+                      make_tuple(4, 5, &vpx_variance16x32_c, 0),
+                      make_tuple(4, 4, &vpx_variance16x16_c, 0),
+                      make_tuple(4, 3, &vpx_variance16x8_c, 0),
+                      make_tuple(3, 4, &vpx_variance8x16_c, 0),
+                      make_tuple(3, 3, &vpx_variance8x8_c, 0),
+                      make_tuple(3, 2, &vpx_variance8x4_c, 0),
+                      make_tuple(2, 3, &vpx_variance4x8_c, 0),
+                      make_tuple(2, 2, &vpx_variance4x4_c, 0)));
+
+INSTANTIATE_TEST_CASE_P(  // C sub-pixel variance kernels
+    C, VpxSubpelVarianceTest,  // tuple: (log2 w, log2 h, fn, bit depth); 0 = 8-bit buffers
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));
+
+INSTANTIATE_TEST_CASE_P(  // C averaged sub-pixel variance kernels
+    C, VpxSubpelAvgVarianceTest,  // tuple: (log2 w, log2 h, fn, bit depth); 0 = 8-bit buffers
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef MseTest<VarianceMxNFunc> VpxHBDMseTest;  // its instantiation below is commented out
+typedef VarianceTest<VarianceMxNFunc> VpxHBDVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxHBDSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc>
+    VpxHBDSubpelAvgVarianceTest;
+
+TEST_P(VpxHBDMseTest, Ref_mse) { RefTest_mse(); }
+TEST_P(VpxHBDMseTest, Max_mse) { MaxTest_mse(); }
+TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); }
+TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }  // uses the SubpixAvgVarMxNFunc specialization
+
+/* TODO(debargha): MseTest only fills 8-bit buffers, so highbd is unsupported.
+INSTANTIATE_TEST_CASE_P(
+    C, VpxHBDMseTest,
+    ::testing::Values(make_tuple(4, 4, &vpx_highbd_12_mse16x16_c),
+                      make_tuple(4, 4, &vpx_highbd_12_mse16x8_c),
+                      make_tuple(4, 4, &vpx_highbd_12_mse8x16_c),
+                      make_tuple(4, 4, &vpx_highbd_12_mse8x8_c),
+                      make_tuple(4, 4, &vpx_highbd_10_mse16x16_c),
+                      make_tuple(4, 4, &vpx_highbd_10_mse16x8_c),
+                      make_tuple(4, 4, &vpx_highbd_10_mse8x16_c),
+                      make_tuple(4, 4, &vpx_highbd_10_mse8x8_c),
+                      make_tuple(4, 4, &vpx_highbd_8_mse16x16_c),
+                      make_tuple(4, 4, &vpx_highbd_8_mse16x8_c),
+                      make_tuple(4, 4, &vpx_highbd_8_mse8x16_c),
+                      make_tuple(4, 4, &vpx_highbd_8_mse8x8_c)));
+*/
+
+INSTANTIATE_TEST_CASE_P(  // high-bit-depth C variance kernels at 12, 10 and 8 bits
+    C, VpxHBDVarianceTest,  // last tuple field is the bit depth matching the kernel name
+    ::testing::Values(make_tuple(6, 6, &vpx_highbd_12_variance64x64_c, 12),
+                      make_tuple(6, 5, &vpx_highbd_12_variance64x32_c, 12),
+                      make_tuple(5, 6, &vpx_highbd_12_variance32x64_c, 12),
+                      make_tuple(5, 5, &vpx_highbd_12_variance32x32_c, 12),
+                      make_tuple(5, 4, &vpx_highbd_12_variance32x16_c, 12),
+                      make_tuple(4, 5, &vpx_highbd_12_variance16x32_c, 12),
+                      make_tuple(4, 4, &vpx_highbd_12_variance16x16_c, 12),
+                      make_tuple(4, 3, &vpx_highbd_12_variance16x8_c, 12),
+                      make_tuple(3, 4, &vpx_highbd_12_variance8x16_c, 12),
+                      make_tuple(3, 3, &vpx_highbd_12_variance8x8_c, 12),
+                      make_tuple(3, 2, &vpx_highbd_12_variance8x4_c, 12),
+                      make_tuple(2, 3, &vpx_highbd_12_variance4x8_c, 12),
+                      make_tuple(2, 2, &vpx_highbd_12_variance4x4_c, 12),
+                      make_tuple(6, 6, &vpx_highbd_10_variance64x64_c, 10),
+                      make_tuple(6, 5, &vpx_highbd_10_variance64x32_c, 10),
+                      make_tuple(5, 6, &vpx_highbd_10_variance32x64_c, 10),
+                      make_tuple(5, 5, &vpx_highbd_10_variance32x32_c, 10),
+                      make_tuple(5, 4, &vpx_highbd_10_variance32x16_c, 10),
+                      make_tuple(4, 5, &vpx_highbd_10_variance16x32_c, 10),
+                      make_tuple(4, 4, &vpx_highbd_10_variance16x16_c, 10),
+                      make_tuple(4, 3, &vpx_highbd_10_variance16x8_c, 10),
+                      make_tuple(3, 4, &vpx_highbd_10_variance8x16_c, 10),
+                      make_tuple(3, 3, &vpx_highbd_10_variance8x8_c, 10),
+                      make_tuple(3, 2, &vpx_highbd_10_variance8x4_c, 10),
+                      make_tuple(2, 3, &vpx_highbd_10_variance4x8_c, 10),
+                      make_tuple(2, 2, &vpx_highbd_10_variance4x4_c, 10),
+                      make_tuple(6, 6, &vpx_highbd_8_variance64x64_c, 8),
+                      make_tuple(6, 5, &vpx_highbd_8_variance64x32_c, 8),
+                      make_tuple(5, 6, &vpx_highbd_8_variance32x64_c, 8),
+                      make_tuple(5, 5, &vpx_highbd_8_variance32x32_c, 8),
+                      make_tuple(5, 4, &vpx_highbd_8_variance32x16_c, 8),
+                      make_tuple(4, 5, &vpx_highbd_8_variance16x32_c, 8),
+                      make_tuple(4, 4, &vpx_highbd_8_variance16x16_c, 8),
+                      make_tuple(4, 3, &vpx_highbd_8_variance16x8_c, 8),
+                      make_tuple(3, 4, &vpx_highbd_8_variance8x16_c, 8),
+                      make_tuple(3, 3, &vpx_highbd_8_variance8x8_c, 8),
+                      make_tuple(3, 2, &vpx_highbd_8_variance8x4_c, 8),
+                      make_tuple(2, 3, &vpx_highbd_8_variance4x8_c, 8),
+                      make_tuple(2, 2, &vpx_highbd_8_variance4x4_c, 8)));
+
+INSTANTIATE_TEST_CASE_P(  // high-bit-depth C sub-pixel variance kernels at 8, 10 and 12 bits
+    C, VpxHBDSubpelVarianceTest,  // a non-zero last field makes SetUp allocate 16-bit buffers
+    ::testing::Values(
+        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
+        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
+        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
+        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8),
+        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8),
+        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8),
+        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8),
+        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8),
+        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8),
+        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8),
+        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
+        make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
+        make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
+        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10),
+        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10),
+        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10),
+        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, 10),
+        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, 10),
+        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, 10),
+        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, 10),
+        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10),
+        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10),
+        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10),
+        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
+        make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
+        make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
+        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12),
+        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12),
+        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12),
+        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, 12),
+        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, 12),
+        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, 12),
+        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, 12),
+        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12),
+        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12),
+        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12),
+        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12),
+        make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12),
+        make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12)));
+
+INSTANTIATE_TEST_CASE_P(  // high-bit-depth C averaged sub-pixel variance kernels
+    C, VpxHBDSubpelAvgVarianceTest,  // a non-zero last field makes SetUp allocate 16-bit buffers
+    ::testing::Values(
+        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
+        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
+        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
+        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8),
+        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8),
+        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8),
+        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8),
+        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8),
+        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8),
+        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, 8),
+        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8),
+        make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8),
+        make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8),
+        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10),
+        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10),
+        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10),
+        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_c, 10),
+        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_c, 10),
+        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_c, 10),
+        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_c, 10),
+        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_c, 10),
+        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_c, 10),
+        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10),
+        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
+        make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
+        make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
+        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12),
+        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12),
+        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12),
+        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_c, 12),
+        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_c, 12),
+        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_c, 12),
+        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_c, 12),
+        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_c, 12),
+        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_c, 12),
+        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12),
+        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12),
+        make_tuple(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12),
+        make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_MMX  // MMX kernels are registered only when the build enables them
+INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, &vpx_mse16x16_mmx)));
+
+INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest,
+                        ::testing::Values(vpx_get_mb_ss_mmx));
+
+INSTANTIATE_TEST_CASE_P(  // only five block sizes are listed for MMX
+    MMX, VpxVarianceTest,
+    ::testing::Values(make_tuple(4, 4, &vpx_variance16x16_mmx, 0),
+                      make_tuple(4, 3, &vpx_variance16x8_mmx, 0),
+                      make_tuple(3, 4, &vpx_variance8x16_mmx, 0),
+                      make_tuple(3, 3, &vpx_variance8x8_mmx, 0),
+                      make_tuple(2, 2, &vpx_variance4x4_mmx, 0)));
+
+INSTANTIATE_TEST_CASE_P(  // same five block sizes as the MMX variance table
+    MMX, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_mmx, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_mmx, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_mmx, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_mmx, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_mmx, 0)));
+#endif  // HAVE_MMX
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,  // parameter is the bare function
+                        ::testing::Values(vpx_get_mb_ss_sse2));
+
+INSTANTIATE_TEST_CASE_P(SSE2, VpxMseTest,  // tuple: (log2 width, log2 height, fn)
+                        ::testing::Values(make_tuple(4, 4, &vpx_mse16x16_sse2),
+                                          make_tuple(4, 3, &vpx_mse16x8_sse2),
+                                          make_tuple(3, 4, &vpx_mse8x16_sse2),
+                                          make_tuple(3, 3, &vpx_mse8x8_sse2)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxVarianceTest,
+    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_sse2, 0),
+                      make_tuple(6, 5, &vpx_variance64x32_sse2, 0),
+                      make_tuple(5, 6, &vpx_variance32x64_sse2, 0),
+                      make_tuple(5, 5, &vpx_variance32x32_sse2, 0),
+                      make_tuple(5, 4, &vpx_variance32x16_sse2, 0),
+                      make_tuple(4, 5, &vpx_variance16x32_sse2, 0),
+                      make_tuple(4, 4, &vpx_variance16x16_sse2, 0),
+                      make_tuple(4, 3, &vpx_variance16x8_sse2, 0),
+                      make_tuple(3, 4, &vpx_variance8x16_sse2, 0),
+                      make_tuple(3, 3, &vpx_variance8x8_sse2, 0),
+                      make_tuple(3, 2, &vpx_variance8x4_sse2, 0),
+                      make_tuple(2, 3, &vpx_variance4x8_sse2, 0),
+                      make_tuple(2, 2, &vpx_variance4x4_sse2, 0)));
+
+#if CONFIG_USE_X86INC  // NOTE: the 4x8/4x4 entries use _sse, not _sse2, kernels.
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxSubpelAvgVarianceTest,
+    ::testing::Values(
+        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0),
+        make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0),
+        make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0),
+        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0),
+        make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0),
+        make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0),
+        make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0),
+        make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0),
+        make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
+        make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
+        make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
+        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse, 0),
+        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse, 0)));
+#endif  // CONFIG_USE_X86INC
+
+#if CONFIG_VP9_HIGHBITDEPTH  // Last tuple field equals the 8/10/12 in the fn name.
+/* TODO(debargha): This test does not support the highbd version
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxHBDMseTest,
+    ::testing::Values(make_tuple(4, 4, &vpx_highbd_12_mse16x16_sse2),
+                      make_tuple(4, 3, &vpx_highbd_12_mse16x8_sse2),
+                      make_tuple(3, 4, &vpx_highbd_12_mse8x16_sse2),
+                      make_tuple(3, 3, &vpx_highbd_12_mse8x8_sse2),
+                      make_tuple(4, 4, &vpx_highbd_10_mse16x16_sse2),
+                      make_tuple(4, 3, &vpx_highbd_10_mse16x8_sse2),
+                      make_tuple(3, 4, &vpx_highbd_10_mse8x16_sse2),
+                      make_tuple(3, 3, &vpx_highbd_10_mse8x8_sse2),
+                      make_tuple(4, 4, &vpx_highbd_8_mse16x16_sse2),
+                      make_tuple(4, 3, &vpx_highbd_8_mse16x8_sse2),
+                      make_tuple(3, 4, &vpx_highbd_8_mse8x16_sse2),
+                      make_tuple(3, 3, &vpx_highbd_8_mse8x8_sse2)));
+*/
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxHBDVarianceTest,
+    ::testing::Values(make_tuple(6, 6, &vpx_highbd_12_variance64x64_sse2, 12),
+                      make_tuple(6, 5, &vpx_highbd_12_variance64x32_sse2, 12),
+                      make_tuple(5, 6, &vpx_highbd_12_variance32x64_sse2, 12),
+                      make_tuple(5, 5, &vpx_highbd_12_variance32x32_sse2, 12),
+                      make_tuple(5, 4, &vpx_highbd_12_variance32x16_sse2, 12),
+                      make_tuple(4, 5, &vpx_highbd_12_variance16x32_sse2, 12),
+                      make_tuple(4, 4, &vpx_highbd_12_variance16x16_sse2, 12),
+                      make_tuple(4, 3, &vpx_highbd_12_variance16x8_sse2, 12),
+                      make_tuple(3, 4, &vpx_highbd_12_variance8x16_sse2, 12),
+                      make_tuple(3, 3, &vpx_highbd_12_variance8x8_sse2, 12),
+                      make_tuple(6, 6, &vpx_highbd_10_variance64x64_sse2, 10),
+                      make_tuple(6, 5, &vpx_highbd_10_variance64x32_sse2, 10),
+                      make_tuple(5, 6, &vpx_highbd_10_variance32x64_sse2, 10),
+                      make_tuple(5, 5, &vpx_highbd_10_variance32x32_sse2, 10),
+                      make_tuple(5, 4, &vpx_highbd_10_variance32x16_sse2, 10),
+                      make_tuple(4, 5, &vpx_highbd_10_variance16x32_sse2, 10),
+                      make_tuple(4, 4, &vpx_highbd_10_variance16x16_sse2, 10),
+                      make_tuple(4, 3, &vpx_highbd_10_variance16x8_sse2, 10),
+                      make_tuple(3, 4, &vpx_highbd_10_variance8x16_sse2, 10),
+                      make_tuple(3, 3, &vpx_highbd_10_variance8x8_sse2, 10),
+                      make_tuple(6, 6, &vpx_highbd_8_variance64x64_sse2, 8),
+                      make_tuple(6, 5, &vpx_highbd_8_variance64x32_sse2, 8),
+                      make_tuple(5, 6, &vpx_highbd_8_variance32x64_sse2, 8),
+                      make_tuple(5, 5, &vpx_highbd_8_variance32x32_sse2, 8),
+                      make_tuple(5, 4, &vpx_highbd_8_variance32x16_sse2, 8),
+                      make_tuple(4, 5, &vpx_highbd_8_variance16x32_sse2, 8),
+                      make_tuple(4, 4, &vpx_highbd_8_variance16x16_sse2, 8),
+                      make_tuple(4, 3, &vpx_highbd_8_variance16x8_sse2, 8),
+                      make_tuple(3, 4, &vpx_highbd_8_variance8x16_sse2, 8),
+                      make_tuple(3, 3, &vpx_highbd_8_variance8x8_sse2, 8)));
+
+#if CONFIG_USE_X86INC  // Sizes 64x64 down to 8x4; no 4x8/4x4 entries are listed.
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxHBDSubpelVarianceTest,
+    ::testing::Values(
+        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, 12),
+        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, 12),
+        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, 12),
+        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, 12),
+        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, 12),
+        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, 12),
+        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, 12),
+        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, 12),
+        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, 12),
+        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, 12),
+        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, 12),
+        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, 10),
+        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, 10),
+        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, 10),
+        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, 10),
+        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, 10),
+        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, 10),
+        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, 10),
+        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, 10),
+        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, 10),
+        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, 10),
+        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, 10),
+        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, 8),
+        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, 8),
+        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, 8),
+        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, 8),
+        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, 8),
+        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, 8),
+        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, 8),
+        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, 8),
+        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, 8),
+        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8),
+        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, 8)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxHBDSubpelAvgVarianceTest,
+    ::testing::Values(
+        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, 12),
+        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, 12),
+        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, 12),
+        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, 12),
+        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, 12),
+        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, 12),
+        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, 12),
+        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, 12),
+        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, 12),
+        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, 12),
+        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, 12),
+        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, 10),
+        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, 10),
+        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, 10),
+        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, 10),
+        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, 10),
+        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, 10),
+        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, 10),
+        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, 10),
+        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, 10),
+        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, 10),
+        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, 10),
+        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, 8),
+        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, 8),
+        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, 8),
+        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, 8),
+        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, 8),
+        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, 8),
+        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, 8),
+        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, 8),
+        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, 8),
+        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, 8),
+        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, 8)));
+#endif  // CONFIG_USE_X86INC
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3  // Only the two sub-pixel variance fixtures are instantiated.
+#if CONFIG_USE_X86INC
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, VpxSubpelAvgVarianceTest,
+    ::testing::Values(
+        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, 0),
+        make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, 0),
+        make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, 0),
+        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, 0),
+        make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, 0),
+        make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, 0),
+        make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, 0),
+        make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0),
+        make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0),
+        make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0),
+        make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0),
+        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0),
+        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, 0)));
+#endif  // CONFIG_USE_X86INC
+#endif  // HAVE_SSSE3
+
+#if HAVE_AVX2  // Sub-pixel fixtures list only the 64x64 and 32x32 kernels.
+INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4,
+                                                     &vpx_mse16x16_avx2)));
+
+INSTANTIATE_TEST_CASE_P(
+    AVX2, VpxVarianceTest,
+    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_avx2, 0),
+                      make_tuple(6, 5, &vpx_variance64x32_avx2, 0),
+                      make_tuple(5, 5, &vpx_variance32x32_avx2, 0),
+                      make_tuple(5, 4, &vpx_variance32x16_avx2, 0),
+                      make_tuple(4, 4, &vpx_variance16x16_avx2, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+    AVX2, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+    AVX2, VpxSubpelAvgVarianceTest,
+    ::testing::Values(
+        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0),
+        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0)));
+#endif  // HAVE_AVX2
+
+#if HAVE_MEDIA  // Only 16x16 and 8x8 kernels are instantiated in this section.
+INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4,
+                                                     &vpx_mse16x16_media)));
+
+INSTANTIATE_TEST_CASE_P(
+    MEDIA, VpxVarianceTest,
+    ::testing::Values(make_tuple(4, 4, &vpx_variance16x16_media, 0),
+                      make_tuple(3, 3, &vpx_variance8x8_media, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+    MEDIA, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_media, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_media, 0)));
+#endif  // HAVE_MEDIA
+
+#if HAVE_NEON  // Variance list has no 32x16, 16x32, 8x4, 4x8 or 4x4 entries.
+INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest,
+                        ::testing::Values(make_tuple(2, 2,
+                                                     &vpx_get4x4sse_cs_neon)));
+
+INSTANTIATE_TEST_CASE_P(NEON, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4,
+                                                     &vpx_mse16x16_neon)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, VpxVarianceTest,
+    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_neon, 0),
+                      make_tuple(6, 5, &vpx_variance64x32_neon, 0),
+                      make_tuple(5, 6, &vpx_variance32x64_neon, 0),
+                      make_tuple(5, 5, &vpx_variance32x32_neon, 0),
+                      make_tuple(4, 4, &vpx_variance16x16_neon, 0),
+                      make_tuple(4, 3, &vpx_variance16x8_neon, 0),
+                      make_tuple(3, 4, &vpx_variance8x16_neon, 0),
+                      make_tuple(3, 3, &vpx_variance8x8_neon, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_neon, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_neon, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_neon, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0)));
+#endif  // HAVE_NEON
+
+#if HAVE_MSA  // Variance and both subpel fixtures list all 13 sizes, 4x4..64x64.
+INSTANTIATE_TEST_CASE_P(MSA, SumOfSquaresTest,
+                        ::testing::Values(vpx_get_mb_ss_msa));
+
+INSTANTIATE_TEST_CASE_P(MSA, VpxSseTest,
+                        ::testing::Values(make_tuple(2, 2,
+                                                     &vpx_get4x4sse_cs_msa)));
+
+INSTANTIATE_TEST_CASE_P(MSA, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, &vpx_mse16x16_msa),
+                                          make_tuple(4, 3, &vpx_mse16x8_msa),
+                                          make_tuple(3, 4, &vpx_mse8x16_msa),
+                                          make_tuple(3, 3, &vpx_mse8x8_msa)));
+
+INSTANTIATE_TEST_CASE_P(
+    MSA, VpxVarianceTest,
+    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_msa, 0),
+                      make_tuple(6, 5, &vpx_variance64x32_msa, 0),
+                      make_tuple(5, 6, &vpx_variance32x64_msa, 0),
+                      make_tuple(5, 5, &vpx_variance32x32_msa, 0),
+                      make_tuple(5, 4, &vpx_variance32x16_msa, 0),
+                      make_tuple(4, 5, &vpx_variance16x32_msa, 0),
+                      make_tuple(4, 4, &vpx_variance16x16_msa, 0),
+                      make_tuple(4, 3, &vpx_variance16x8_msa, 0),
+                      make_tuple(3, 4, &vpx_variance8x16_msa, 0),
+                      make_tuple(3, 3, &vpx_variance8x8_msa, 0),
+                      make_tuple(3, 2, &vpx_variance8x4_msa, 0),
+                      make_tuple(2, 3, &vpx_variance4x8_msa, 0),
+                      make_tuple(2, 2, &vpx_variance4x4_msa, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+    MSA, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_msa, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_msa, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_msa, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_msa, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_msa, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_msa, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_msa, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_msa, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_msa, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_msa, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_msa, 0),
+                      make_tuple(6, 6, &vpx_sub_pixel_variance64x64_msa, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+    MSA, VpxSubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0)));
+#endif  // HAVE_MSA
+}  // namespace
diff --git a/libs/libvpx/test/video_source.h b/libs/libvpx/test/video_source.h
new file mode 100644
index 0000000000..ade323e7c3
--- /dev/null
+++ b/libs/libvpx/test/video_source.h
@@ -0,0 +1,270 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_VIDEO_SOURCE_H_
+#define TEST_VIDEO_SOURCE_H_
+
+#if defined(_WIN32)
+#undef NOMINMAX
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "test/acm_random.h"
+#include "vpx/vpx_encoder.h"
+
+namespace libvpx_test {
+
+// Helper macros to ensure LIBVPX_TEST_DATA_PATH is a quoted string.
+// These are undefined right below GetDataPath
+// NOTE: LIBVPX_TEST_DATA_PATH MUST NOT be a quoted string before
+// Stringification or the GetDataPath will fail at runtime
+#define TO_STRING(S) #S
+#define STRINGIFY(S) TO_STRING(S)
+
+// A simple function to encapsulate cross platform retrieval of test data path
+static std::string GetDataPath() {
+  // A value supplied through the environment always takes precedence.
+  const char *const env_path = getenv("LIBVPX_TEST_DATA_PATH");
+  if (env_path != NULL) return env_path;
+#ifdef LIBVPX_TEST_DATA_PATH
+  // Some environments cannot set environment variables, so the build may
+  // provide the path as an unquoted preprocessor symbol (e.g. from a
+  // makefile); STRINGIFY turns it into a string literal here.
+  return STRINGIFY(LIBVPX_TEST_DATA_PATH);
+#else
+  // No override of either kind: fall back to the current directory.
+  return ".";
+#endif
+}
+
+// Undefining stringification macros because they are not used elsewhere
+#undef TO_STRING
+#undef STRINGIFY
+
+inline FILE *OpenTestDataFile(const std::string& file_name) {
+  // Opens <data path>/<file_name> for binary reading; NULL if fopen fails.
+  return fopen((GetDataPath() + "/" + file_name).c_str(), "rb");
+}
+
+static FILE *GetTempOutFile(std::string *file_name) {
+  // Returns a writable temporary stream, or NULL on failure. *file_name is
+  // left empty unless a named file is created (the _WIN32 branch).
+  file_name->clear();
+#if defined(_WIN32)
+  char temp_dir[MAX_PATH];
+  char temp_name[MAX_PATH];
+  if (!GetTempPathA(MAX_PATH, temp_dir)) return NULL;
+  // Assume for now that the filename generated is unique per process
+  if (!GetTempFileNameA(temp_dir, "lvx", 0, temp_name)) return NULL;
+  file_name->assign(temp_name);
+  return fopen(temp_name, "wb+");
+#else
+  // tmpfile() yields a stream without reporting a name.
+  return tmpfile();
+#endif
+}
+
+class TempOutFile {
+ public:
+  // Creates the temporary file. On platforms where GetTempOutFile() reports
+  // no name, file_name() stays empty.
+  TempOutFile() {
+    file_ = GetTempOutFile(&file_name_);
+  }
+  // Closes the stream and, when the file has a name, removes it from disk.
+  ~TempOutFile() {
+    CloseFile();
+    if (file_name_.empty()) return;
+    EXPECT_EQ(0, remove(file_name_.c_str()));
+  }
+  // Underlying stream; NULL if creation failed or CloseFile() has run.
+  FILE *file() { return file_; }
+  // Path of the temporary file, or an empty string when it is unnamed.
+  const std::string& file_name() { return file_name_; }
+
+ protected:
+  // Closes the stream if it is open; safe to call more than once.
+  void CloseFile() {
+    if (file_ == NULL) return;
+    fclose(file_);
+    file_ = NULL;
+  }
+  FILE *file_;
+  std::string file_name_;
+};
+
+// Abstract base class for test video sources, which provide a stream of
+// vpx_image_t images with associated timestamps and duration.
+class VideoSource {
+ public:
+  virtual ~VideoSource() {}
+
+  // Prepare the stream for reading, rewind/open as necessary.
+  virtual void Begin() = 0;
+
+  // Advance the cursor to the next frame.
+  virtual void Next() = 0;
+
+  // Get the current video frame, or NULL on End-Of-Stream.
+  virtual vpx_image_t *img() const = 0;
+
+  // Get the presentation timestamp of the current frame.
+  virtual vpx_codec_pts_t pts() const = 0;
+
+  // Get the current frame's duration.
+  virtual unsigned long duration() const = 0;
+
+  // Get the timebase for the stream.
+  virtual vpx_rational_t timebase() const = 0;
+
+  // Get the current frame counter, starting at 0.
+  virtual unsigned int frame() const = 0;
+
+  // Get the frame limit; DummyVideoSource::img() is NULL from that frame on.
+  virtual unsigned int limit() const = 0;
+};
+
+
+class DummyVideoSource : public VideoSource {
+ public:
+  // 100 frames of 80x64 I420; frame_ starts at 0 so img() is valid pre-Begin().
+  DummyVideoSource()
+      : img_(NULL), raw_sz_(0), limit_(100), frame_(0),
+        width_(80), height_(64), format_(VPX_IMG_FMT_I420) {
+    ReallocImage();
+  }
+
+  virtual ~DummyVideoSource() { vpx_img_free(img_); }
+
+  virtual void Begin() {
+    frame_ = 0;
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  // Current image; NULL at end of stream or if allocation failed.
+  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual vpx_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual vpx_rational_t timebase() const {
+    const vpx_rational_t t = {1, 30};
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  // Sets the number of frames served before img() reports end of stream.
+  void set_limit(unsigned int limit) { limit_ = limit; }
+
+  // Changes the frame dimensions; the image is reallocated only on change.
+  void SetSize(unsigned int width, unsigned int height) {
+    if (width != width_ || height != height_) {
+      width_ = width;
+      height_ = height;
+      ReallocImage();
+    }
+  }
+
+  // Changes the pixel format; the image is reallocated only on change.
+  void SetImageFormat(vpx_img_fmt_t format) {
+    if (format_ != format) {
+      format_ = format;
+      ReallocImage();
+    }
+  }
+
+ protected:
+  // Zero-fills the image; subclasses override this to supply other content.
+  virtual void FillFrame() { if (img_) memset(img_->img_data, 0, raw_sz_); }
+
+  // Reallocates img_; on failure img_ is NULL and raw_sz_ is 0 (no deref).
+  void ReallocImage() {
+    vpx_img_free(img_);
+    img_ = vpx_img_alloc(NULL, format_, width_, height_, 32);
+    raw_sz_ = img_ ? ((img_->w + 31) & ~31) * img_->h * img_->bps / 8 : 0;
+  }
+
+  vpx_image_t *img_;
+  size_t       raw_sz_;
+  unsigned int limit_;
+  unsigned int frame_;
+  unsigned int width_;
+  unsigned int height_;
+  vpx_img_fmt_t format_;
+};
+
+
+class RandomVideoSource : public DummyVideoSource {
+ public:
+  RandomVideoSource(int seed = ACMRandom::DeterministicSeed())
+      : rnd_(seed), seed_(seed) {}
+
+ protected:
+  // Rewinds and re-seeds the RNG so a second pass sees a matching stream.
+  virtual void Begin() {
+    frame_ = 0;
+    rnd_.Reset(seed_);
+    FillFrame();
+  }
+
+  // In every 30-frame cycle the first 15 frames are noise and the last 15
+  // are all zero. Zeroing, rather than holding the previous frame, is
+  // meant to encourage keyframes to be thrown.
+  virtual void FillFrame() {
+    if (!img_) return;
+    if (frame_ % 30 >= 15) {
+      memset(img_->img_data, 0, raw_sz_);
+      return;
+    }
+    for (size_t pos = 0; pos < raw_sz_; ++pos)
+      img_->img_data[pos] = rnd_.Rand8();
+  }
+
+  ACMRandom rnd_;
+  int seed_;
+};
+
+// Abstract base class for compressed test video sources, which hand the
+// decoder one frame of data at a time (see cxdata() and frame_size()).
+class CompressedVideoSource {
+ public:
+  virtual ~CompressedVideoSource() {}
+
+  virtual void Init() = 0;
+
+  // Prepare the stream for reading, rewind/open as necessary.
+  virtual void Begin() = 0;
+
+  // Advance the cursor to the next frame.
+  virtual void Next() = 0;
+
+  virtual const uint8_t *cxdata() const = 0;
+
+  virtual size_t frame_size() const = 0;
+
+  virtual unsigned int frame_number() const = 0;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_VIDEO_SOURCE_H_
diff --git a/libs/libvpx/test/vp10_dct_test.cc b/libs/libvpx/test/vp10_dct_test.cc
new file mode 100644
index 0000000000..b2c301ae39
--- /dev/null
+++ b/libs/libvpx/test/vp10_dct_test.cc
@@ -0,0 +1,111 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <new>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "./vpx_config.h"
+#include "vpx_ports/msvc.h"
+
+#undef CONFIG_COEFFICIENT_RANGE_CHECKING
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 1
+#include "vp10/encoder/dct.c"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+void reference_dct_1d(const double *in, double *out, int size) {
+  // out[k] = c(k) * sum_n in[n] * cos(PI * (2n + 1) * k / (2 * size)), with
+  // c(0) = 1/sqrt(2) and c(k) = 1 otherwise; no overall normalization.
+  const double PI = 3.141592653589793238462643383279502884;
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int freq = 0; freq < size; ++freq) {
+    out[freq] = 0;
+    for (int pos = 0; pos < size; ++pos)
+      out[freq] += in[pos] * cos(PI * (2 * pos + 1) * freq / (2 * size));
+    if (freq == 0) out[freq] *= kInvSqrt2;
+  }
+}
+
+typedef void (*FdctFuncRef)(const double *in, double *out, int size);
+typedef void (*IdctFuncRef)(const double *in, double *out, int size);
+typedef void (*FdctFunc)(const tran_low_t *in, tran_low_t *out);
+typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
+
+class TransTestBase {
+ public:
+  virtual ~TransTestBase() {}
+
+ protected:
+  // Runs 5000 deterministic pseudo-random blocks (each sample a difference
+  // of two Rand8() values) through fwd_txfm_ and expects every coefficient
+  // to be within max_error_ of the rounded fwd_txfm_ref_ output.
+  void RunFwdAccuracyCheck() {
+    const int kNumBlocks = 5000;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    tran_low_t *const src = new tran_low_t[txfm_size_];
+    tran_low_t *const output = new tran_low_t[txfm_size_];
+    double *const ref_src = new double[txfm_size_];
+    double *const ref_output = new double[txfm_size_];
+    for (int block = 0; block < kNumBlocks; ++block) {
+      for (int ni = 0; ni < txfm_size_; ++ni) {
+        src[ni] = rnd.Rand8() - rnd.Rand8();
+        ref_src[ni] = static_cast<double>(src[ni]);
+      }
+      fwd_txfm_(src, output);
+      fwd_txfm_ref_(ref_src, ref_output, txfm_size_);
+      for (int ni = 0; ni < txfm_size_; ++ni) {
+        EXPECT_LE(
+            abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
+            max_error_);
+      }
+    }
+
+    delete[] ref_output;
+    delete[] ref_src;
+    delete[] output;
+    delete[] src;
+  }
+
+  double max_error_;
+  int txfm_size_;
+  FdctFunc fwd_txfm_;
+  FdctFuncRef fwd_txfm_ref_;
+};
+
+// FdctParam: (transform under test, reference transform, size, max error).
+typedef std::tr1::tuple<FdctFunc, FdctFuncRef, int, int> FdctParam;
+class Vp10FwdTxfm : public TransTestBase,
+                    public ::testing::TestWithParam<FdctParam> {
+ public:
+  virtual void SetUp() {
+    max_error_ = GET_PARAM(3);
+    txfm_size_ = GET_PARAM(2);
+    fwd_txfm_ref_ = GET_PARAM(1);
+    fwd_txfm_ = GET_PARAM(0);
+  }
+  virtual void TearDown() {}
+};
+
+TEST_P(Vp10FwdTxfm, RunFwdAccuracyCheck) {
+  RunFwdAccuracyCheck();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    C, Vp10FwdTxfm,
+    ::testing::Values(
+        FdctParam(&fdct4, &reference_dct_1d, 4, 1),  // size 4, max error 1
+        FdctParam(&fdct8, &reference_dct_1d, 8, 1),  // size 8, max error 1
+        FdctParam(&fdct16, &reference_dct_1d, 16, 2)));  // size 16, max error 2
+}  // namespace
diff --git a/libs/libvpx/test/vp10_inv_txfm_test.cc b/libs/libvpx/test/vp10_inv_txfm_test.cc
new file mode 100644
index 0000000000..c49081ef85
--- /dev/null
+++ b/libs/libvpx/test/vp10_inv_txfm_test.cc
@@ -0,0 +1,321 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp10/common/blockd.h"
+#include "vp10/common/scan.h"
+#include "vpx/vpx_integer.h"
+#include "vp10/common/vp10_inv_txfm.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+const double PI = 3.141592653589793238462643383279502884;
+const double kInvSqrt2 = 0.707106781186547524400844362104;
+
+void reference_idct_1d(const double *in, double *out, int size) {
+  // out[n] = in[0]/sqrt(2) + sum_{k>0} in[k] * cos(pi*(2n+1)*k / (2*size))
+  for (int n = 0; n < size; ++n) {
+    out[n] = 0;
+    for (int k = 0; k < size; ++k) {
+      const double basis = cos(PI * (2 * n + 1) * k / (2 * size));
+      const double coeff = (k == 0) ? kInvSqrt2 * in[k] : in[k];
+      out[n] += coeff * basis;
+    }
+  }
+}
+
+typedef void (*IdctFuncRef)(const double *in, double *out, int size);
+typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
+
+class TransTestBase {
+ public:
+  virtual ~TransTestBase() {}
+
+ protected:
+  // Runs 5000 random blocks through inv_txfm_ and inv_txfm_ref_; each output
+  // must be within max_error_ of the rounded reference value.
+  void RunInvAccuracyCheck() {
+    tran_low_t *input  = new tran_low_t[txfm_size_];
+    tran_low_t *output = new tran_low_t[txfm_size_];
+    double *ref_input  = new double[txfm_size_];
+    double *ref_output = new double[txfm_size_];
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+    for (int ti =  0; ti < count_test_block; ++ti) {
+      for (int ni = 0; ni < txfm_size_; ++ni) {
+        input[ni] = rnd.Rand8() - rnd.Rand8();
+        ref_input[ni] = static_cast<double>(input[ni]);
+      }
+      inv_txfm_(input, output);
+      inv_txfm_ref_(ref_input, ref_output, txfm_size_);
+      for (int ni = 0; ni < txfm_size_; ++ni) {
+        EXPECT_LE(
+            abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
+            max_error_);
+      }
+    }
+
+    delete[] input;
+    delete[] output;
+    delete[] ref_input;
+    delete[] ref_output;
+  }
+
+  double max_error_;          // Largest accepted absolute difference.
+  int txfm_size_;             // Number of coefficients per block.
+  IdctFunc inv_txfm_;         // Inverse transform under test.
+  IdctFuncRef inv_txfm_ref_;  // Floating-point reference inverse transform.
+};
+
+typedef std::tr1::tuple<IdctFunc, IdctFuncRef, int, int> IdctParam;
+class Vp10InvTxfm
+    : public TransTestBase,
+      public ::testing::TestWithParam<IdctParam> {
+ public:
+  virtual void SetUp() {
+    inv_txfm_ = GET_PARAM(0);
+    inv_txfm_ref_ = GET_PARAM(1);
+    txfm_size_ = GET_PARAM(2);
+    max_error_ = GET_PARAM(3);
+  }
+  virtual void TearDown() {}
+};
+
+TEST_P(Vp10InvTxfm, RunInvAccuracyCheck) {
+  RunInvAccuracyCheck();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    C, Vp10InvTxfm,
+    ::testing::Values(
+        IdctParam(&vp10_idct4_c, &reference_idct_1d, 4, 1),
+        IdctParam(&vp10_idct8_c, &reference_idct_1d, 8, 2),
+        IdctParam(&vp10_idct16_c, &reference_idct_1d, 16, 4),
+        IdctParam(&vp10_idct32_c, &reference_idct_1d, 32, 6))
+);
+
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
+// (forward transform, full inverse, partial inverse, transform size,
+//  number of leading scan-order coefficients the tests populate)
+typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, InvTxfmFunc, TX_SIZE, int>
+    PartialInvTxfmParam;
+const int kMaxNumCoeffs = 1024;
+class Vp10PartialIDctTest
+    : public ::testing::TestWithParam<PartialInvTxfmParam> {
+ public:
+  virtual ~Vp10PartialIDctTest() {}
+  virtual void SetUp() {
+    last_nonzero_ = GET_PARAM(4);
+    tx_size_ = GET_PARAM(3);
+    partial_itxfm_ = GET_PARAM(2);
+    full_itxfm_ = GET_PARAM(1);
+    ftxfm_ = GET_PARAM(0);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int last_nonzero_;           // Count of coefficients the tests populate.
+  TX_SIZE tx_size_;            // Transform size enum value.
+  FwdTxfmFunc ftxfm_;          // Forward transform producing coefficients.
+  InvTxfmFunc full_itxfm_;     // Inverse transform used as the baseline.
+  InvTxfmFunc partial_itxfm_;  // Inverse transform compared against it.
+};
+
+// Partial and full inverse transforms must agree on coarsely quantized extremes.
+TEST_P(Vp10PartialIDctTest, RunQuantCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int size;
+  switch (tx_size_) {
+    case TX_4X4:
+      size = 4;
+      break;
+    case TX_8X8:
+      size = 8;
+      break;
+    case TX_16X16:
+      size = 16;
+      break;
+    case TX_32X32:
+      size = 32;
+      break;
+    default:
+      FAIL() << "Wrong Size!";
+      break;
+  }
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
+
+  const int count_test_block = 1000;
+  const int block_size = size * size;
+
+  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
+
+  int max_error = 0;
+  for (int i = 0; i < count_test_block; ++i) {
+    // clear out destination buffer
+    memset(dst1, 0, sizeof(*dst1) * block_size);
+    memset(dst2, 0, sizeof(*dst2) * block_size);
+    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
+    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+
+    // Initialize a test block with input range [-255, 255].
+    if (i == 0) {
+      for (int j = 0; j < block_size; ++j)
+        input_extreme_block[j] = 255;
+    } else if (i == 1) {
+      for (int j = 0; j < block_size; ++j)
+        input_extreme_block[j] = -255;
+    } else {
+      for (int j = 0; j < block_size; ++j) {
+        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+      }
+    }
+
+    ftxfm_(input_extreme_block, output_ref_block, size);
+    // quantization with maximum allowed step sizes
+    // NOTE(review): source index is j, not scan[j] -- confirm this is intended.
+    test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
+    for (int j = 1; j < last_nonzero_; ++j)
+      test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]]
+                       = (output_ref_block[j] / 1828) * 1828;
+    // Each transform gets its own copy of the coefficients.
+    memcpy(test_coef_block2, test_coef_block1,
+           sizeof(*test_coef_block2) * block_size);
+
+    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
+    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
+
+    for (int j = 0; j < block_size; ++j) {
+      const int diff = dst1[j] - dst2[j];
+      const int error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+    }
+  }
+
+  EXPECT_EQ(0, max_error)
+      << "Error: partial inverse transform produces different results";
+}
+
+// Random coefficients confined to the first last_nonzero_ scan positions must
+// reconstruct identically through the full and partial inverse transforms.
+TEST_P(Vp10PartialIDctTest, ResultsMatch) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int size;
+  switch (tx_size_) {
+    case TX_4X4: size = 4; break;
+    case TX_8X8: size = 8; break;
+    case TX_16X16: size = 16; break;
+    case TX_32X32: size = 32; break;
+    default:
+      FAIL() << "Wrong Size!";
+      break;
+  }
+
+  const int kNumTrials = 1000;
+  const int kMaxCoeff = 32766 / 4;
+  const int num_pixels = size * size;
+  DECLARE_ALIGNED(16, tran_low_t, coeffs_full[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, coeffs_partial[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, recon_full[kMaxNumCoeffs]);
+  DECLARE_ALIGNED(16, uint8_t, recon_partial[kMaxNumCoeffs]);
+
+  int worst_sq_diff = 0;
+  for (int trial = 0; trial < kNumTrials; ++trial) {
+    // Start every trial from zeroed coefficient and destination buffers.
+    memset(recon_full, 0, sizeof(*recon_full) * num_pixels);
+    memset(recon_partial, 0, sizeof(*recon_partial) * num_pixels);
+    memset(coeffs_full, 0, sizeof(*coeffs_full) * num_pixels);
+    memset(coeffs_partial, 0, sizeof(*coeffs_partial) * num_pixels);
+
+    // Draw coefficients while tracking a shrinking energy budget; a draw that
+    // would overspend the budget is replaced by zero.
+    int energy_left = kMaxCoeff * kMaxCoeff;
+    for (int n = 0; n < last_nonzero_; ++n) {
+      int16_t coef = static_cast<int16_t>(sqrt(1.0 * energy_left) *
+                                          (rnd.Rand16() - 32768) / 65536);
+      energy_left -= coef * coef;
+      if (energy_left < 0) {
+        energy_left = 0;
+        coef = 0;
+      }
+      coeffs_full[vp10_default_scan_orders[tx_size_].scan[n]] = coef;
+    }
+
+    memcpy(coeffs_partial, coeffs_full,
+           sizeof(*coeffs_partial) * num_pixels);
+
+    // Reconstruct with both transforms, using `size` as the output stride.
+    ASM_REGISTER_STATE_CHECK(full_itxfm_(coeffs_full, recon_full, size));
+    ASM_REGISTER_STATE_CHECK(
+        partial_itxfm_(coeffs_partial, recon_partial, size));
+
+    for (int p = 0; p < num_pixels; ++p) {
+      const int delta = recon_full[p] - recon_partial[p];
+      const int sq_diff = delta * delta;
+      if (worst_sq_diff < sq_diff) worst_sq_diff = sq_diff;
+    }
+  }
+
+  EXPECT_EQ(0, worst_sq_diff)
+      << "Error: partial inverse transform produces different results";
+}
+using std::tr1::make_tuple;
+// Fields: fwd txfm, full inverse, partial inverse, tx size, last_nonzero_.
+INSTANTIATE_TEST_CASE_P(
+    C, Vp10PartialIDctTest,
+    ::testing::Values(
+        make_tuple(&vpx_fdct32x32_c,
+                   &vp10_idct32x32_1024_add_c,
+                   &vp10_idct32x32_34_add_c,
+                   TX_32X32, 34),
+        make_tuple(&vpx_fdct32x32_c,
+                   &vp10_idct32x32_1024_add_c,
+                   &vp10_idct32x32_1_add_c,
+                   TX_32X32, 1),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vp10_idct16x16_256_add_c,
+                   &vp10_idct16x16_10_add_c,
+                   TX_16X16, 10),
+        make_tuple(&vpx_fdct16x16_c,
+                   &vp10_idct16x16_256_add_c,
+                   &vp10_idct16x16_1_add_c,
+                   TX_16X16, 1),
+        make_tuple(&vpx_fdct8x8_c,
+                   &vp10_idct8x8_64_add_c,
+                   &vp10_idct8x8_12_add_c,
+                   TX_8X8, 12),
+        make_tuple(&vpx_fdct8x8_c,
+                   &vp10_idct8x8_64_add_c,
+                   &vp10_idct8x8_1_add_c,
+                   TX_8X8, 1),
+        make_tuple(&vpx_fdct4x4_c,
+                   &vp10_idct4x4_16_add_c,
+                   &vp10_idct4x4_1_add_c,
+                   TX_4X4, 1)));
+}  // namespace
diff --git a/libs/libvpx/test/vp8_boolcoder_test.cc b/libs/libvpx/test/vp8_boolcoder_test.cc
new file mode 100644
index 0000000000..02d7162ac8
--- /dev/null
+++ b/libs/libvpx/test/vp8_boolcoder_test.cc
@@ -0,0 +1,116 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "vp8/decoder/dboolhuff.h"
+#include "vp8/encoder/boolhuff.h"
+#include "vpx/vpx_integer.h"
+
+namespace {
+const int num_tests = 10;
+
+// In a real use the 'decrypt_state' parameter will be a pointer to a struct
+// with whatever internal state the decryptor uses. For testing we'll just
+// xor with a constant key, and decrypt_state will point to the start of
+// the original buffer.
+const uint8_t secret_key[16] = {
+  0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x78,
+  0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0
+};
+
+void encrypt_buffer(uint8_t *buffer, size_t size) {
+  // XOR in place with the 16-byte key, repeating the key every 16 bytes.
+  for (size_t pos = 0; pos != size; ++pos)
+    buffer[pos] ^= secret_key[pos % 16];
+}
+
+void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
+                     uint8_t *output, int count) {
+  const uint8_t *const start = reinterpret_cast<uint8_t *>(decrypt_state);
+  const size_t phase = input - start;  // key offset that applies to input[0]
+  for (int k = 0; k < count; ++k)
+    output[k] = input[k] ^ secret_key[(phase + k) & 15];
+}
+
+}  // namespace
+
+using libvpx_test::ACMRandom;
+
+TEST(VP8, TestBitIO) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int n = 0; n < num_tests; ++n) {
+    for (int method = 0; method <= 7; ++method) {   // probability pattern
+      const int kBitsToTest = 1000;
+      uint8_t probas[kBitsToTest];
+      // Fill probas[] with the per-bit probabilities selected by `method`.
+      for (int i = 0; i < kBitsToTest; ++i) {
+        const int parity = i & 1;
+        probas[i] =
+            (method == 0) ? 0 : (method == 1) ? 255 :
+            (method == 2) ? 128 :
+            (method == 3) ? rnd.Rand8() :
+            (method == 4) ? (parity ? 0 : 255) :
+            // alternate between low and high proba:
+            (method == 5) ? (parity ? rnd(128) : 255 - rnd(128)) :
+            (method == 6) ?
+                (parity ? rnd(64) : 255 - rnd(64)) :
+                (parity ? rnd(32) : 255 - rnd(32));
+      }
+      for (int bit_method = 0; bit_method <= 3; ++bit_method) {
+        const int random_seed = 6432;
+        const int kBufferSize = 10000;
+        ACMRandom bit_rnd(random_seed);
+        BOOL_CODER bw;
+        uint8_t bw_buffer[kBufferSize];
+        vp8_start_encode(&bw, bw_buffer, bw_buffer + kBufferSize);
+        // bit_method: 0 = all zeros, 1 = all ones, 2 = alternating, 3 = bit_rnd.
+        int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
+        for (int i = 0; i < kBitsToTest; ++i) {
+          if (bit_method == 2) {
+            bit = (i & 1);
+          } else if (bit_method == 3) {
+            bit = bit_rnd(2);
+          }
+          vp8_encode_bool(&bw, bit, static_cast<int>(probas[i]));
+        }
+
+        vp8_stop_encode(&bw);
+        // XOR-encrypt the buffer, then decode it through test_decrypt_cb.
+        BOOL_DECODER br;
+        encrypt_buffer(bw_buffer, kBufferSize);
+        vp8dx_start_decode(&br, bw_buffer, kBufferSize,
+                           test_decrypt_cb,
+                           reinterpret_cast<void *>(bw_buffer));
+        bit_rnd.Reset(random_seed);  // replay the bit sequence used to encode
+        for (int i = 0; i < kBitsToTest; ++i) {
+          if (bit_method == 2) {
+            bit = (i & 1);
+          } else if (bit_method == 3) {
+            bit = bit_rnd(2);
+          }
+          GTEST_ASSERT_EQ(vp8dx_decode_bool(&br, probas[i]), bit)
+              << "pos: "<< i << " / " << kBitsToTest
+              << " bit_method: " << bit_method
+              << " method: " << method;
+        }
+      }
+    }
+  }
+}
diff --git a/libs/libvpx/test/vp8_decrypt_test.cc b/libs/libvpx/test/vp8_decrypt_test.cc
new file mode 100644
index 0000000000..972a1d9a3d
--- /dev/null
+++ b/libs/libvpx/test/vp8_decrypt_test.cc
@@ -0,0 +1,71 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/ivf_video_source.h"
+
+namespace {
+// In a real use the 'decrypt_state' parameter will be a pointer to a struct
+// with whatever internal state the decryptor uses. For testing we'll just
+// xor with a constant key, and decrypt_state will point to the start of
+// the original buffer.
+const uint8_t test_key[16] = {
+  0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x78,
+  0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0
+};
+
+void encrypt_buffer(const uint8_t *src, uint8_t *dst, size_t size,
+                    ptrdiff_t offset) {
+  // XOR-copy src into dst; byte n uses key byte (offset + n) mod 16.
+  for (size_t n = 0; n != size; ++n)
+    dst[n] = src[n] ^ test_key[(offset + n) % 16];
+}
+
+void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
+                     uint8_t *output, int count) {
+  const uint8_t *const origin = reinterpret_cast<uint8_t *>(decrypt_state);
+  encrypt_buffer(input, output, count, input - origin);  // XOR undoes itself
+}
+
+}  // namespace
+
+namespace libvpx_test {
+
+TEST(TestDecrypt, DecryptWorksVp8) {
+  libvpx_test::IVFVideoSource video("vp80-00-comprehensive-001.ivf");
+  video.Init();
+
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
+  VP8Decoder decoder(dec_cfg, 0);
+
+  video.Begin();
+
+  // First frame: decoded as-is, before any decryptor is installed.
+  const vpx_codec_err_t plain_status =
+      decoder.DecodeFrame(video.cxdata(), video.frame_size());
+  ASSERT_EQ(VPX_CODEC_OK, plain_status) << decoder.DecodeError();
+
+  // Second frame: decode an XOR-encrypted copy through test_decrypt_cb.
+  video.Next();
+  std::vector<uint8_t> cipher(video.frame_size());
+  encrypt_buffer(video.cxdata(), &cipher[0], video.frame_size(), 0);
+  vpx_decrypt_init decryptor = { test_decrypt_cb, &cipher[0] };
+  decoder.Control(VPXD_SET_DECRYPTOR, &decryptor);
+  const vpx_codec_err_t cipher_status =
+      decoder.DecodeFrame(&cipher[0], cipher.size());
+  ASSERT_EQ(VPX_CODEC_OK, cipher_status) << decoder.DecodeError();
+}
+
+}  // namespace libvpx_test
diff --git a/libs/libvpx/test/vp8_denoiser_sse2_test.cc b/libs/libvpx/test/vp8_denoiser_sse2_test.cc
new file mode 100644
index 0000000000..e8ca8d3986
--- /dev/null
+++ b/libs/libvpx/test/vp8_denoiser_sse2_test.cc
@@ -0,0 +1,116 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "vp8/encoder/denoising.h"
+#include "vp8/common/reconinter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+const int kNumPixels = 16 * 16;
+class VP8DenoiserTest : public ::testing::TestWithParam<int> {
+ public:
+  virtual ~VP8DenoiserTest() {}
+
+  // The test parameter is stored and later passed to the denoiser filters.
+  virtual void SetUp() { increase_denoising_ = GetParam(); }
+
+  // Calls libvpx_test::ClearSystemState() after every test.
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int increase_denoising_;
+};
+
+TEST_P(VP8DenoiserTest, BitexactCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 4000;
+  const int stride = 16;
+
+  // Allocate the space for input and output,
+  // where sig_block_c/_sse2 is the block to be denoised,
+  // mc_avg_block is the denoised reference block,
+  // avg_block_c is the denoised result from C code,
+  // avg_block_sse2 is the denoised result from SSE2 code.
+  DECLARE_ALIGNED(16, uint8_t, sig_block_c[kNumPixels]);
+  // Since in VP8 denoiser, the source signal will be changed,
+  // we need another copy of the source signal as the input of sse2 code.
+  DECLARE_ALIGNED(16, uint8_t, sig_block_sse2[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, mc_avg_block[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_c[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_sse2[kNumPixels]);
+  // Run count_test_block random blocks through both implementations.
+  for (int i = 0; i < count_test_block; ++i) {
+    // Generate random motion magnitude, 20% of which exceed the threshold.
+    const int motion_magnitude_ran =
+        rnd.Rand8() % static_cast<int>(MOTION_MAGNITUDE_THRESHOLD * 1.2);
+
+    // Initialize a test block with random number in range [0, 255].
+    for (int j = 0; j < kNumPixels; ++j) {
+      int temp = 0;
+      sig_block_sse2[j] = sig_block_c[j] = rnd.Rand8();
+      // The pixels in mc_avg_block are generated by adding a random
+      // number in range [-19, 19] to corresponding pixels in sig_block.
+      temp = sig_block_c[j] + (rnd.Rand8() % 2 == 0 ? -1 : 1) *
+             (rnd.Rand8() % 20);
+      // Clip.
+      mc_avg_block[j] = (temp < 0) ? 0 : ((temp > 255) ? 255 : temp);
+    }
+
+    // Test denoiser on Y component.
+    ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_c(
+        mc_avg_block, stride, avg_block_c, stride, sig_block_c, stride,
+        motion_magnitude_ran, increase_denoising_));
+
+    ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_sse2(
+        mc_avg_block, stride, avg_block_sse2, stride, sig_block_sse2, stride,
+        motion_magnitude_ran, increase_denoising_));
+
+    // Check bitexactness.
+    for (int h = 0; h < 16; ++h) {
+      for (int w = 0; w < 16; ++w) {
+        EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
+      }
+    }
+
+    // Test denoiser on UV component.
+    ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_uv_c(
+        mc_avg_block, stride, avg_block_c, stride, sig_block_c, stride,
+        motion_magnitude_ran, increase_denoising_));
+
+    ASM_REGISTER_STATE_CHECK(vp8_denoiser_filter_uv_sse2(
+        mc_avg_block, stride, avg_block_sse2, stride, sig_block_sse2, stride,
+        motion_magnitude_ran, increase_denoising_));
+
+    // Check bitexactness.
+    for (int h = 0; h < 16; ++h) {
+      for (int w = 0; w < 16; ++w) {
+        EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
+      }
+    }
+  }
+}
+
+// Run the test with increase_denoising_ set to 0 and to 1.
+INSTANTIATE_TEST_CASE_P(SSE2, VP8DenoiserTest, ::testing::Values(0, 1));
+}  // namespace
diff --git a/libs/libvpx/test/vp8_fdct4x4_test.cc b/libs/libvpx/test/vp8_fdct4x4_test.cc
new file mode 100644
index 0000000000..11a653decc
--- /dev/null
+++ b/libs/libvpx/test/vp8_fdct4x4_test.cc
@@ -0,0 +1,164 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp8_rtcd.h"
+#include "test/acm_random.h"
+#include "vpx/vpx_integer.h"
+
+namespace {
+
+const int cospi8sqrt2minus1 = 20091;  // (sqrt(2) * cos(pi / 8) - 1) * 2^16
+const int sinpi8sqrt2 = 35468;        // sqrt(2) * sin(pi / 8) * 2^16, rounded
+// Integer 4x4 inverse DCT used as the reference for the round-trip test.
+void reference_idct4x4(const int16_t *input, int16_t *output) {
+  const int16_t *ip = input;
+  int16_t *op = output;
+
+  for (int i = 0; i < 4; ++i) {  // first pass: one column per iteration
+    const int a1 = ip[0] + ip[8];
+    const int b1 = ip[0] - ip[8];
+    const int temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+    const int temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+    const int c1 = temp1 - temp2;
+    const int temp3 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+    const int temp4 = (ip[12] * sinpi8sqrt2) >> 16;
+    const int d1 = temp3 + temp4;
+    op[0] = a1 + d1;
+    op[12] = a1 - d1;
+    op[4] = b1 + c1;
+    op[8] = b1 - c1;
+    ++ip;
+    ++op;
+  }
+  ip = output;  // second pass reads and rewrites the first-pass result
+  op = output;
+  for (int i = 0; i < 4; ++i) {  // one row per iteration, with (x + 4) >> 3
+    const int a1 = ip[0] + ip[2];
+    const int b1 = ip[0] - ip[2];
+    const int temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+    const int temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+    const int c1 = temp1 - temp2;
+    const int temp3 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+    const int temp4 = (ip[3] * sinpi8sqrt2) >> 16;
+    const int d1 = temp3 + temp4;
+    op[0] = (a1 + d1 + 4) >> 3;
+    op[3] = (a1 - d1 + 4) >> 3;
+    op[1] = (b1 + c1 + 4) >> 3;
+    op[2] = (b1 - c1 + 4) >> 3;
+    ip += 4;
+    op += 4;
+  }
+}
+
+using libvpx_test::ACMRandom;
+
+// Counts output signs over random inputs; each coefficient must stay balanced.
+TEST(VP8FdctTest, SignBiasCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int16_t test_input_block[16];
+  int16_t test_output_block[16];
+  const int pitch = 8;  // presumably bytes per input row (4 int16_t) - confirm
+  int count_sign_block[16][2];  // per coefficient: [0] negative, [1] positive
+  const int count_test_block = 1000000;
+  memset(count_sign_block, 0, sizeof(count_sign_block));
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 16; ++j)
+      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+    vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch);
+
+    for (int j = 0; j < 16; ++j) {
+      if (test_output_block[j] < 0)
+        ++count_sign_block[j][0];
+      else if (test_output_block[j] > 0)
+        ++count_sign_block[j][1];
+    }
+  }
+
+  bool bias_acceptable = true;
+  for (int j = 0; j < 16; ++j)
+    bias_acceptable = bias_acceptable &&
+    (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 10000);
+
+  EXPECT_TRUE(bias_acceptable)
+    << "Error: 4x4 FDCT has a sign bias > 1% for input range [-255, 255]";
+
+  memset(count_sign_block, 0, sizeof(count_sign_block));
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Initialize a test block with input range [-15, 15].
+    for (int j = 0; j < 16; ++j)
+      test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
+
+    vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch);
+
+    for (int j = 0; j < 16; ++j) {
+      if (test_output_block[j] < 0)
+        ++count_sign_block[j][0];
+      else if (test_output_block[j] > 0)
+        ++count_sign_block[j][1];
+    }
+  }
+
+  bias_acceptable = true;
+  for (int j = 0; j < 16; ++j)
+    bias_acceptable = bias_acceptable &&
+    (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 100000);
+
+  EXPECT_TRUE(bias_acceptable)
+    << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]";
+}
+
+// Forward transform followed by the reference inverse must return the input.
+TEST(VP8FdctTest, RoundTripErrorCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int max_error = 0;       // largest squared per-sample difference seen
+  double total_error = 0;  // sum of squared differences over all blocks
+  const int count_test_block = 1000000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t test_input_block[16];
+    int16_t test_temp_block[16];
+    int16_t test_output_block[16];
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 16; ++j)
+      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+    const int pitch = 8;
+    vp8_short_fdct4x4_c(test_input_block, test_temp_block, pitch);
+    reference_idct4x4(test_temp_block, test_output_block);
+
+    for (int j = 0; j < 16; ++j) {
+      const int diff = test_input_block[j] - test_output_block[j];
+      const int error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+
+  EXPECT_GE(1, max_error)
+    << "Error: FDCT/IDCT has an individual roundtrip error > 1";
+
+  EXPECT_GE(count_test_block, total_error)
+    << "Error: FDCT/IDCT has average roundtrip error > 1 per block";
+}
+
+}  // namespace
diff --git a/libs/libvpx/test/vp8_fragments_test.cc b/libs/libvpx/test/vp8_fragments_test.cc
new file mode 100644
index 0000000000..cb0d1a155e
--- /dev/null
+++ b/libs/libvpx/test/vp8_fragments_test.cc
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/video_source.h"
+
+namespace {
+
+// Fixture: real-time VP8 encode with VPX_CODEC_USE_OUTPUT_PARTITION set.
+class VP8FramgmentsTest : public ::libvpx_test::EncoderTest,
+                          public ::testing::Test {
+ protected:
+  VP8FramgmentsTest() : EncoderTest(&::libvpx_test::kVP8) {}
+  virtual ~VP8FramgmentsTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    const unsigned long init_flags =  // NOLINT(runtime/int)
+        VPX_CODEC_USE_OUTPUT_PARTITION;
+    set_init_flags(init_flags);
+  }
+};
+
+TEST_F(VP8FramgmentsTest, TestFragmentsEncodeDecode) {
+  ::libvpx_test::RandomVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+}  // namespace
diff --git a/libs/libvpx/test/vp8_multi_resolution_encoder.sh b/libs/libvpx/test/vp8_multi_resolution_encoder.sh
new file mode 100755
index 0000000000..a8b7fe78ee
--- /dev/null
+++ b/libs/libvpx/test/vp8_multi_resolution_encoder.sh
@@ -0,0 +1,75 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx vp8_multi_resolution_encoder example. To add new
+##  tests to this file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to vp8_mre_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+vp8_multi_resolution_encoder_verify_environment() {
+  if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then
+    if [ ! -e "${YUV_RAW_INPUT}" ]; then
+      elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+      return 1
+    fi
+    local readonly app="vp8_multi_resolution_encoder"
+    if [ -z "$(vpx_tool_path "${app}")" ]; then
+      elog "${app} not found. It must exist in LIBVPX_BIN_PATH or its parent."
+      return 1
+    fi
+  fi
+}
+
+# Runs vp8_multi_resolution_encoder. Simply forwards all arguments to
+# vp8_multi_resolution_encoder after building path to the executable.
+vp8_mre() {
+  local readonly encoder="$(vpx_tool_path vp8_multi_resolution_encoder)"
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "$@" ${devnull}
+}
+
+# Runs vp8_mre with three output files; fails if any of them is missing.
+vp8_multi_resolution_encoder_three_formats() {
+  local output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf
+                               ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf
+                               ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf"
+  if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then
+    if [ "$(vp8_encode_available)" = "yes" ]; then
+      # Param order:
+      #  Input width
+      #  Input height
+      #  Input file path
+      #  Output file names
+      #  Output PSNR
+      vp8_mre "${YUV_RAW_INPUT_WIDTH}" \
+        "${YUV_RAW_INPUT_HEIGHT}" \
+        "${YUV_RAW_INPUT}" \
+        ${output_files} \
+        0
+
+      for output_file in ${output_files}; do
+        if [ ! -e "${output_file}" ]; then
+          elog "Missing output file: ${output_file}"
+          return 1
+        fi
+      done
+    fi
+  fi
+}
+
+vp8_mre_tests="vp8_multi_resolution_encoder_three_formats"
+run_tests vp8_multi_resolution_encoder_verify_environment "${vp8_mre_tests}"
diff --git a/libs/libvpx/test/vp8cx_set_ref.sh b/libs/libvpx/test/vp8cx_set_ref.sh
new file mode 100755
index 0000000000..5d760bcdec
--- /dev/null
+++ b/libs/libvpx/test/vp8cx_set_ref.sh
@@ -0,0 +1,57 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx vp8cx_set_ref example. To add new tests to this
+##  file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to vp8cx_set_ref_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT must exist, else log via elog and return 1.
+vp8cx_set_ref_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+}
+
+# Runs the vp8cx_set_ref example on $YUV_RAW_INPUT, asking it to update the
+# reference frame before encoding frame 90, and checks that the output file
+# was created. $1 is the codec name; vp8cx_set_ref does not take a codec
+# argument at present, so it is used only to name the output file.
+# TODO(tomfinegan): Pass the codec param once the example supports VP9.
+vpx_set_ref() {
+  local codec_name="$1"
+  local update_frame=90
+  local tool="${LIBVPX_BIN_PATH}/vp8cx_set_ref${VPX_TEST_EXE_SUFFIX}"
+  local ivf_out="${VPX_TEST_OUTPUT_DIR}/vp8cx_set_ref_${codec_name}.ivf"
+
+  if [ ! -x "${tool}" ]; then
+    elog "${tool} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${VPX_TEST_PREFIX}" "${tool}" \
+      "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
+      "${YUV_RAW_INPUT}" "${ivf_out}" "${update_frame}" ${devnull}
+
+  [ -e "${ivf_out}" ] || return 1
+}
+
+vp8cx_set_ref_vp8() {
+  # Skip (returning success) unless vp8_encode_available reports "yes".
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  vpx_set_ref vp8 || return 1
+}
+
+vp8cx_set_ref_tests="vp8cx_set_ref_vp8"
+
+run_tests vp8cx_set_ref_verify_environment "${vp8cx_set_ref_tests}"
diff --git a/libs/libvpx/test/vp9_arf_freq_test.cc b/libs/libvpx/test/vp9_arf_freq_test.cc
new file mode 100644
index 0000000000..89200d4086
--- /dev/null
+++ b/libs/libvpx/test/vp9_arf_freq_test.cc
@@ -0,0 +1,252 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+
+namespace {
+
+const unsigned int kFrames = 100;
+const int kBitrate = 500;
+
+#define ARF_NOT_SEEN               1000001
+#define ARF_SEEN_ONCE              1000000
+
+typedef struct {
+  const char *filename;
+  unsigned int width;
+  unsigned int height;
+  unsigned int framerate_num;
+  unsigned int framerate_den;
+  unsigned int input_bit_depth;
+  vpx_img_fmt fmt;
+  vpx_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+typedef struct {
+  libvpx_test::TestMode mode;
+  int cpu_used;
+} TestEncodeParam;
+
+const TestVideoParam kTestVectors[] = {
+  // artificially increase framerate to trigger default check
+  {"hantro_collage_w352h288.yuv", 352, 288, 5000, 1,
+    8, VPX_IMG_FMT_I420, VPX_BITS_8, 0},
+  {"hantro_collage_w352h288.yuv", 352, 288, 30, 1,
+    8, VPX_IMG_FMT_I420, VPX_BITS_8, 0},
+  {"rush_hour_444.y4m", 352, 288, 30, 1,
+    8, VPX_IMG_FMT_I444, VPX_BITS_8, 1},
+#if CONFIG_VP9_HIGHBITDEPTH
+  // Add list of profile 2/3 test videos here ...
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+
+const TestEncodeParam kEncodeVectors[] = {
+  {::libvpx_test::kOnePassGood, 2},
+  {::libvpx_test::kOnePassGood, 5},
+  {::libvpx_test::kTwoPassGood, 1},
+  {::libvpx_test::kTwoPassGood, 2},
+  {::libvpx_test::kTwoPassGood, 5},
+  {::libvpx_test::kRealTime, 5},
+};
+
+const int kMinArfVectors[] = {
+  // NOTE: 0 refers to the default built-in logic in:
+  //       vp9_rc_get_default_min_gf_interval(...)
+  0, 4, 8, 12, 15
+};
+
+int is_extension_y4m(const char *filename) {
+  // Returns 1 when the text from the last '.' onward is exactly ".y4m", and 0
+  // otherwise, including when there is no '.' or the only '.' leads the name.
+  const char *const ext = strrchr(filename, '.');
+  if (ext == filename || !ext) return 0;
+  return strcmp(ext, ".y4m") == 0;
+}
+
+// Encoder test fixture that tracks the shortest run of frame packets seen
+// between two-frame packets (see FramePktHook) during each encode pass.
+class ArfFreqTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith3Params<TestVideoParam,
+                                                 TestEncodeParam, int> {
+ protected:
+  ArfFreqTest()
+      : EncoderTest(GET_PARAM(0)),
+        test_video_param_(GET_PARAM(1)),
+        test_encode_param_(GET_PARAM(2)),
+        min_arf_requested_(GET_PARAM(3)) {
+  }
+
+  virtual ~ArfFreqTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(test_encode_param_.mode);
+    if (test_encode_param_.mode == ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    } else {
+      cfg_.g_lag_in_frames = 25;
+      cfg_.rc_end_usage = VPX_VBR;
+    }
+    dec_cfg_.threads = 4;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    run_of_visible_frames_ = 0;
+    min_run_ = ARF_NOT_SEEN;
+  }
+
+  int GetNumFramesInPkt(const vpx_codec_cx_pkt_t *pkt) {
+    const uint8_t *buffer = reinterpret_cast<uint8_t*>(pkt->data.frame.buf);
+    const uint8_t marker = buffer[pkt->data.frame.sz - 1];
+    const int mag = ((marker >> 3) & 3) + 1;
+    const int frames = (marker & 0x7) + 1;
+    const unsigned int index_sz = 2 + mag * frames;
+    // A superframe index is taken to be present when the last byte matches
+    // 110xxxxx, the packet is at least index_sz bytes long, and the byte
+    // index_sz from the end repeats the marker. A superframe is assumed to
+    // hold one visible frame, the rest being invisible. Without an index
+    // the packet holds a single frame.
+    const bool has_index = (marker & 0xe0) == 0xc0 &&
+                           pkt->data.frame.sz >= index_sz &&
+                           buffer[pkt->data.frame.sz - index_sz] == marker;
+    return has_index ? frames : 1;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
+      return;
+    const int frames = GetNumFramesInPkt(pkt);
+    if (frames == 1) {
+      ++run_of_visible_frames_;
+      return;
+    }
+    if (frames != 2) {
+      min_run_ = 0;
+    } else if (min_run_ == ARF_NOT_SEEN) {
+      min_run_ = ARF_SEEN_ONCE;
+    } else if (min_run_ == ARF_SEEN_ONCE ||
+               run_of_visible_frames_ < min_run_) {
+      min_run_ = run_of_visible_frames_;
+    }
+    // Every multi-frame packet restarts the run at 1.
+    run_of_visible_frames_ = 1;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    // Controls are set only when the source reports frame 0.
+    if (video->frame() != 0) return;
+    encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
+    encoder->Control(VP9E_SET_TILE_COLUMNS, 4);
+    encoder->Control(VP8E_SET_CPUUSED, test_encode_param_.cpu_used);
+    encoder->Control(VP9E_SET_MIN_GF_INTERVAL, min_arf_requested_);
+    if (test_encode_param_.mode == ::libvpx_test::kRealTime) return;
+    encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+    encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+    encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+    encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+  }
+
+  int GetMinVisibleRun() const { return min_run_; }
+
+  int GetMinArfDistanceRequested() const {
+    // A request of 0 falls back to vp9_rc_get_default_min_gf_interval().
+    if (min_arf_requested_ != 0) return min_arf_requested_;
+    const double framerate =
+        static_cast<double>(test_video_param_.framerate_num) /
+        test_video_param_.framerate_den;
+    return vp9_rc_get_default_min_gf_interval(
+        test_video_param_.width, test_video_param_.height, framerate);
+  }
+
+  TestVideoParam test_video_param_;
+  TestEncodeParam test_encode_param_;
+
+ private:
+  int min_arf_requested_;
+  // Shortest run recorded so far, or one of the ARF_* sentinel values.
+  int min_run_;
+  // Frame packets counted in the current run; set to 1 by a multi-frame one.
+  int run_of_visible_frames_;
+};
+
+TEST_P(ArfFreqTest, MinArfFreqTest) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_profile = test_video_param_.profile;
+  cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+  cfg_.g_bit_depth = test_video_param_.bit_depth;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  if (cfg_.g_bit_depth > 8)
+    init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+
+  // Owned by a scoped pointer so the source is released on every exit path,
+  // including the early return ASSERT_NO_FATAL_FAILURE takes on failure.
+  testing::internal::scoped_ptr<libvpx_test::VideoSource> video;
+  if (is_extension_y4m(test_video_param_.filename)) {
+    video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename,
+                                                0, kFrames));
+  } else {
+    video.reset(new libvpx_test::YUVVideoSource(
+        test_video_param_.filename, test_video_param_.fmt,
+        test_video_param_.width, test_video_param_.height,
+        test_video_param_.framerate_num, test_video_param_.framerate_den,
+        0, kFrames));
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  const int min_run = GetMinVisibleRun();
+  const int min_arf_dist_requested = GetMinArfDistanceRequested();
+  // Only check the spacing once a real run length has been recorded.
+  if (min_run != ARF_NOT_SEEN && min_run != ARF_SEEN_ONCE) {
+    const int min_arf_dist = min_run + 1;
+    EXPECT_GE(min_arf_dist, min_arf_dist_requested);
+  }
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    ArfFreqTest,
+    ::testing::ValuesIn(kTestVectors),
+    ::testing::ValuesIn(kEncodeVectors),
+    ::testing::ValuesIn(kMinArfVectors));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+# if CONFIG_VP10_ENCODER
+// TODO(angiebird): 25-29 fail in high bitdepth mode.
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_VP10, ArfFreqTest,
+    ::testing::Combine(
+        ::testing::Values(static_cast<const libvpx_test::CodecFactory *>(
+            &libvpx_test::kVP10)),
+        ::testing::ValuesIn(kTestVectors),
+        ::testing::ValuesIn(kEncodeVectors),
+        ::testing::ValuesIn(kMinArfVectors)));
+# endif  // CONFIG_VP10_ENCODER
+#else
+VP10_INSTANTIATE_TEST_CASE(
+    ArfFreqTest,
+    ::testing::ValuesIn(kTestVectors),
+    ::testing::ValuesIn(kEncodeVectors),
+    ::testing::ValuesIn(kMinArfVectors));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/libs/libvpx/test/vp9_boolcoder_test.cc b/libs/libvpx/test/vp9_boolcoder_test.cc
new file mode 100644
index 0000000000..c61bb4ab96
--- /dev/null
+++ b/libs/libvpx/test/vp9_boolcoder_test.cc
@@ -0,0 +1,88 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/bitwriter.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+const int num_tests = 10;
+}  // namespace
+
+TEST(VP9, TestBitIO) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int n = 0; n < num_tests; ++n) {
+    for (int method = 0; method <= 7; ++method) {   // 8 probability patterns
+      const int kBitsToTest = 1000;
+      uint8_t probas[kBitsToTest];
+
+      for (int i = 0; i < kBitsToTest; ++i) {
+        const int parity = i & 1;
+        probas[i] =
+          (method == 0) ? 0 : (method == 1) ? 255 :
+          (method == 2) ? 128 :
+          (method == 3) ? rnd.Rand8() :
+          (method == 4) ? (parity ? 0 : 255) :
+            // methods 5-7: alternate random low and high probabilities
+            (method == 5) ? (parity ? rnd(128) : 255 - rnd(128)) :
+            (method == 6) ?
+            (parity ? rnd(64) : 255 - rnd(64)) :
+            (parity ? rnd(32) : 255 - rnd(32));
+      }
+      for (int bit_method = 0; bit_method <= 3; ++bit_method) {
+        const int random_seed = 6432;
+        const int kBufferSize = 10000;
+        ACMRandom bit_rnd(random_seed);
+        vpx_writer bw;
+        uint8_t bw_buffer[kBufferSize];
+        vpx_start_encode(&bw, bw_buffer);
+
+        int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
+        for (int i = 0; i < kBitsToTest; ++i) {
+          if (bit_method == 2) {
+            bit = (i & 1);
+          } else if (bit_method == 3) {
+            bit = bit_rnd(2);
+          }
+          vpx_write(&bw, bit, static_cast<int>(probas[i]));
+        }
+
+        vpx_stop_encode(&bw);
+
+        // The top bit of the first encoded byte must be zero.
+        GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);
+
+        vpx_reader br;
+        vpx_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL);
+        bit_rnd.Reset(random_seed);
+        for (int i = 0; i < kBitsToTest; ++i) {
+          if (bit_method == 2) {
+            bit = (i & 1);
+          } else if (bit_method == 3) {
+            bit = bit_rnd(2);
+          }
+          GTEST_ASSERT_EQ(vpx_read(&br, probas[i]), bit)
+              << "pos: " << i << " / " << kBitsToTest
+              << " bit_method: " << bit_method
+              << " method: " << method;
+        }
+      }
+    }
+  }
+}
diff --git a/libs/libvpx/test/vp9_decrypt_test.cc b/libs/libvpx/test/vp9_decrypt_test.cc
new file mode 100644
index 0000000000..d988612070
--- /dev/null
+++ b/libs/libvpx/test/vp9_decrypt_test.cc
@@ -0,0 +1,71 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/ivf_video_source.h"
+
+namespace {
+// In a real use the 'decrypt_state' parameter will be a pointer to a struct
+// with whatever internal state the decryptor uses. For testing we'll just
+// xor with a constant key, and decrypt_state will point to the start of
+// the original buffer.
+const uint8_t test_key[16] = {
+  0x01, 0x12, 0x23, 0x34, 0x45, 0x56, 0x67, 0x78,
+  0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0
+};
+
+void encrypt_buffer(const uint8_t *src, uint8_t *dst, size_t size,
+                    ptrdiff_t offset) {
+  // XOR every byte with test_key, indexed by (offset + position) modulo 16.
+  for (size_t pos = 0; pos != size; ++pos)
+    dst[pos] = src[pos] ^ test_key[(offset + pos) & 15];
+}
+
+void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
+                     uint8_t *output, int count) {
+  const uint8_t *const start = reinterpret_cast<uint8_t *>(decrypt_state);
+  encrypt_buffer(input, output, count, input - start);
+}
+
+}  // namespace
+
+namespace libvpx_test {
+
+TEST(TestDecrypt, DecryptWorksVp9) {
+  libvpx_test::IVFVideoSource video("vp90-2-05-resize.ivf");
+  video.Init();
+
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
+  VP9Decoder decoder(dec_cfg, 0);
+
+  video.Begin();
+
+  // no decryption
+  vpx_codec_err_t res = decoder.DecodeFrame(video.cxdata(), video.frame_size());
+  ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+
+  // decrypt frame
+  video.Next();
+
+  std::vector<uint8_t> encrypted(video.frame_size());
+  encrypt_buffer(video.cxdata(), &encrypted[0], video.frame_size(), 0);
+  vpx_decrypt_init di = { test_decrypt_cb, &encrypted[0] };
+  decoder.Control(VPXD_SET_DECRYPTOR, &di);
+
+  res = decoder.DecodeFrame(&encrypted[0], encrypted.size());
+  ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+}
+
+}  // namespace libvpx_test
diff --git a/libs/libvpx/test/vp9_denoiser_sse2_test.cc b/libs/libvpx/test/vp9_denoiser_sse2_test.cc
new file mode 100644
index 0000000000..17c799dffb
--- /dev/null
+++ b/libs/libvpx/test/vp9_denoiser_sse2_test.cc
@@ -0,0 +1,101 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+const int kNumPixels = 64 * 64;
+// Fixture parameterized on the block size handed to the denoiser filters.
+class VP9DenoiserTest : public ::testing::TestWithParam<BLOCK_SIZE> {
+ public:
+  virtual ~VP9DenoiserTest() {}
+
+  virtual void SetUp() { bs_ = GetParam(); }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  // Block size under test, read from the test parameter in SetUp().
+  BLOCK_SIZE bs_;
+};
+
+TEST_P(VP9DenoiserTest, BitexactCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 4000;
+
+  // Allocate the space for input and output,
+  // where sig_block is the block to be denoised,
+  // mc_avg_block is the denoised reference block,
+  // avg_block_c is the denoised result from C code,
+  // avg_block_sse2 is the denoised result from SSE2 code.
+  DECLARE_ALIGNED(16, uint8_t, sig_block[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, mc_avg_block[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_c[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_sse2[kNumPixels]);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Generate random motion magnitude, 20% of which exceed the threshold.
+    const int motion_magnitude_random =
+        rnd.Rand8() % static_cast<int>(MOTION_MAGNITUDE_THRESHOLD * 1.2);
+
+    // Initialize a test block with random number in range [0, 255].
+    for (int j = 0; j < kNumPixels; ++j) {
+      int temp = 0;
+      sig_block[j] = rnd.Rand8();
+      // The pixels in mc_avg_block are generated by adding a random
+      // number in range [-19, 19] to corresponding pixels in sig_block.
+      temp = sig_block[j] + ((rnd.Rand8() % 2 == 0) ? -1 : 1) *
+             (rnd.Rand8() % 20);
+      // Clip.
+      mc_avg_block[j] = (temp < 0) ? 0 : ((temp > 255) ? 255 : temp);
+    }
+
+    ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_c(
+        sig_block, 64, mc_avg_block, 64, avg_block_c,
+        64, 0, bs_, motion_magnitude_random));
+
+    ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_sse2(
+        sig_block, 64, mc_avg_block, 64, avg_block_sse2,
+        64, 0, bs_, motion_magnitude_random));
+
+    // Test bitexactness.
+    for (int h = 0; h < (4 << b_height_log2_lookup[bs_]); ++h) {
+      for (int w = 0; w < (4 << b_width_log2_lookup[bs_]); ++w) {
+        EXPECT_EQ(avg_block_c[h * 64 + w], avg_block_sse2[h * 64 + w]);
+      }
+    }
+  }
+}
+
+// Test for all block size.
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9DenoiserTest,
+    ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+                      BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32,
+                      BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+                      BLOCK_64X64));
+}  // namespace
diff --git a/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc b/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
new file mode 100644
index 0000000000..bd84098791
--- /dev/null
+++ b/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
@@ -0,0 +1,153 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vp9/vp9_dx_iface.h"
+
+namespace {
+
+const int kCpuUsed = 2;
+
+struct EncodePerfTestVideo {
+  const char *name;
+  uint32_t width;
+  uint32_t height;
+  uint32_t bitrate;
+  int frames;
+};
+
+const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = {
+  {"niklas_1280_720_30.y4m", 1280, 720, 600, 10},
+};
+
+struct EncodeParameters {
+  int32_t tile_rows;
+  int32_t tile_cols;
+  int32_t lossless;
+  int32_t error_resilient;
+  int32_t frame_parallel;
+  vpx_color_range_t color_range;
+  vpx_color_space_t cs;
+  int render_size[2];
+  // TODO(JBB): quantizers / bitrate
+};
+
+const EncodeParameters kVP9EncodeParameterSet[] = {
+  {0, 0, 0, 1, 0, VPX_CR_STUDIO_RANGE, VPX_CS_BT_601, { 0, 0 }},
+  {0, 0, 0, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_709, { 0, 0 }},
+  {0, 0, 1, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_2020, { 0, 0 }},
+  {0, 2, 0, 0, 1, VPX_CR_STUDIO_RANGE, VPX_CS_UNKNOWN, { 640, 480 }},
+  // TODO(JBB): Test profiles (requires more work).
+};
+
+class VpxEncoderParmsGetToDecoder
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<EncodeParameters,
+                                                 EncodePerfTestVideo> {
+ protected:
+  VpxEncoderParmsGetToDecoder()
+      : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {}
+
+  virtual ~VpxEncoderParmsGetToDecoder() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kTwoPassGood);
+    cfg_.g_lag_in_frames = 25;
+    cfg_.g_error_resilient = encode_parms.error_resilient;
+    dec_cfg_.threads = 4;
+    test_video_ = GET_PARAM(2);
+    cfg_.rc_target_bitrate = test_video_.bitrate;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs);
+      encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range);
+      encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless);
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
+                       encode_parms.frame_parallel);
+      encoder->Control(VP9E_SET_TILE_ROWS, encode_parms.tile_rows);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, encode_parms.tile_cols);
+      encoder->Control(VP8E_SET_CPUUSED, kCpuUsed);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0)
+        encoder->Control(VP9E_SET_RENDER_SIZE, encode_parms.render_size);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                                  const libvpx_test::VideoSource & /*video*/,
+                                  libvpx_test::Decoder *decoder) {
+    vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder();
+    vpx_codec_alg_priv_t *const priv =
+        reinterpret_cast<vpx_codec_alg_priv_t *>(vp9_decoder->priv);
+    FrameWorkerData *const worker_data =
+        reinterpret_cast<FrameWorkerData *>(priv->frame_workers[0].data1);
+    VP9_COMMON *const common = &worker_data->pbi->common;
+
+    if (encode_parms.lossless) {
+      EXPECT_EQ(0, common->base_qindex);
+      EXPECT_EQ(0, common->y_dc_delta_q);
+      EXPECT_EQ(0, common->uv_dc_delta_q);
+      EXPECT_EQ(0, common->uv_ac_delta_q);
+      EXPECT_EQ(ONLY_4X4, common->tx_mode);
+    }
+    EXPECT_EQ(encode_parms.error_resilient, common->error_resilient_mode);
+    if (encode_parms.error_resilient) {
+      EXPECT_EQ(1, common->frame_parallel_decoding_mode);
+      EXPECT_EQ(0, common->use_prev_frame_mvs);
+    } else {
+      EXPECT_EQ(encode_parms.frame_parallel,
+                common->frame_parallel_decoding_mode);
+    }
+    EXPECT_EQ(encode_parms.color_range, common->color_range);
+    EXPECT_EQ(encode_parms.cs, common->color_space);
+    if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) {
+      EXPECT_EQ(encode_parms.render_size[0], common->render_width);
+      EXPECT_EQ(encode_parms.render_size[1], common->render_height);
+    }
+    EXPECT_EQ(encode_parms.tile_cols, common->log2_tile_cols);
+    EXPECT_EQ(encode_parms.tile_rows, common->log2_tile_rows);
+
+    EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+    return VPX_CODEC_OK == res_dec;
+  }
+
+  EncodePerfTestVideo test_video_;
+
+ private:
+  EncodeParameters encode_parms;
+};
+
+TEST_P(VpxEncoderParmsGetToDecoder, BitstreamParms) {
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  // A scoped pointer releases the source even when an ASSERT_* returns early.
+  testing::internal::scoped_ptr<libvpx_test::VideoSource> video(
+      new libvpx_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames));
+  ASSERT_TRUE(video.get() != NULL);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+VP9_INSTANTIATE_TEST_CASE(VpxEncoderParmsGetToDecoder,
+                          ::testing::ValuesIn(kVP9EncodeParameterSet),
+                          ::testing::ValuesIn(kVP9EncodePerfTestVectors));
+}  // namespace
diff --git a/libs/libvpx/test/vp9_end_to_end_test.cc b/libs/libvpx/test/vp9_end_to_end_test.cc
new file mode 100644
index 0000000000..be1fa68c0e
--- /dev/null
+++ b/libs/libvpx/test/vp9_end_to_end_test.cc
@@ -0,0 +1,209 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+const unsigned int kWidth  = 160;
+const unsigned int kHeight = 90;
+const unsigned int kFramerate = 50;
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+// List of psnr thresholds for speed settings 0-7 and 5 encoding modes
+const double kPsnrThreshold[][5] = {
+  { 36.0, 37.0, 37.0, 37.0, 37.0 },
+  { 35.0, 36.0, 36.0, 36.0, 36.0 },
+  { 34.0, 35.0, 35.0, 35.0, 35.0 },
+  { 33.0, 34.0, 34.0, 34.0, 34.0 },
+  { 32.0, 33.0, 33.0, 33.0, 33.0 },
+  { 31.0, 32.0, 32.0, 32.0, 32.0 },
+  { 30.0, 31.0, 31.0, 31.0, 31.0 },
+  { 29.0, 30.0, 30.0, 30.0, 30.0 },
+};
+
+typedef struct {
+  const char *filename;
+  unsigned int input_bit_depth;
+  vpx_img_fmt fmt;
+  vpx_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+const TestVideoParam kTestVectors[] = {
+  {"park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420, VPX_BITS_8, 0},
+  {"park_joy_90p_8_422.y4m", 8, VPX_IMG_FMT_I422, VPX_BITS_8, 1},
+  {"park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444, VPX_BITS_8, 1},
+  {"park_joy_90p_8_440.yuv", 8, VPX_IMG_FMT_I440, VPX_BITS_8, 1},
+#if CONFIG_VP9_HIGHBITDEPTH
+  {"park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016, VPX_BITS_10, 2},
+  {"park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216, VPX_BITS_10, 3},
+  {"park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416, VPX_BITS_10, 3},
+  {"park_joy_90p_10_440.yuv", 10, VPX_IMG_FMT_I44016, VPX_BITS_10, 3},
+  {"park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016, VPX_BITS_12, 2},
+  {"park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216, VPX_BITS_12, 3},
+  {"park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416, VPX_BITS_12, 3},
+  {"park_joy_90p_12_440.yuv", 12, VPX_IMG_FMT_I44016, VPX_BITS_12, 3},
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+
+// Encoding modes tested
+const libvpx_test::TestMode kEncodingModeVectors[] = {
+  ::libvpx_test::kTwoPassGood,
+  ::libvpx_test::kOnePassGood,
+  ::libvpx_test::kRealTime,
+};
+
+// Speed settings tested
+const int kCpuUsedVectors[] = {1, 2, 3, 5, 6};
+
+int is_extension_y4m(const char *filename) {
+  // Returns 1 when the text from the last '.' onward is exactly ".y4m", and 0
+  // otherwise, including when there is no '.' or the only '.' leads the name.
+  const char *const ext = strrchr(filename, '.');
+  if (ext == filename || !ext) return 0;
+  return strcmp(ext, ".y4m") == 0;
+}
+
+// Encoder test fixture that accumulates PSNR packets during an encode pass
+// so the average can be compared against an entry of kPsnrThreshold.
+class EndToEndTestLarge
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode,
+                                                 TestVideoParam, int> {
+ protected:
+  EndToEndTestLarge()
+      : EncoderTest(GET_PARAM(0)),
+        test_video_param_(GET_PARAM(2)),
+        cpu_used_(GET_PARAM(3)),
+        psnr_(0.0),
+        nframes_(0),
+        encoding_mode_(GET_PARAM(1)) {
+  }
+
+  virtual ~EndToEndTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ == ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    } else {
+      cfg_.g_lag_in_frames = 5;
+      cfg_.rc_end_usage = VPX_VBR;
+    }
+    dec_cfg_.threads = 4;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    nframes_ = 0;
+    psnr_ = 0.0;
+  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    ++nframes_;
+    psnr_ += pkt->data.psnr.psnr[0];
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    // Controls are set only when the source reports frame 1.
+    if (video->frame() != 1) return;
+    encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
+    encoder->Control(VP9E_SET_TILE_COLUMNS, 4);
+    encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+    if (encoding_mode_ == ::libvpx_test::kRealTime) return;
+    encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+    encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+    encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+    encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+  }
+
+  double GetAveragePsnr() const {
+    return nframes_ ? psnr_ / nframes_ : 0.0;
+  }
+
+  // Looks up kPsnrThreshold by speed setting (row) and encoding mode (column).
+  double GetPsnrThreshold() {
+    return kPsnrThreshold[cpu_used_][encoding_mode_];
+  }
+
+  TestVideoParam test_video_param_;
+  int cpu_used_;
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  libvpx_test::TestMode encoding_mode_;
+};
+
+TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_profile = test_video_param_.profile;
+  cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+  cfg_.g_bit_depth = test_video_param_.bit_depth;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  if (cfg_.g_bit_depth > 8)
+    init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+
+  // Owned by a scoped pointer so the source is released on every exit path,
+  // including the early return ASSERT_NO_FATAL_FAILURE takes on failure.
+  testing::internal::scoped_ptr<libvpx_test::VideoSource> video;
+  if (is_extension_y4m(test_video_param_.filename)) {
+    video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename,
+                                                0, kFrames));
+  } else {
+    video.reset(new libvpx_test::YUVVideoSource(
+        test_video_param_.filename, test_video_param_.fmt,
+        kWidth, kHeight, kFramerate, 1, 0, kFrames));
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  const double psnr = GetAveragePsnr();
+  EXPECT_GT(psnr, GetPsnrThreshold());
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    EndToEndTestLarge,
+    ::testing::ValuesIn(kEncodingModeVectors),
+    ::testing::ValuesIn(kTestVectors),
+    ::testing::ValuesIn(kCpuUsedVectors));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+# if CONFIG_VP10_ENCODER
+// TODO(angiebird): many fail in high bitdepth mode.
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_VP10, EndToEndTestLarge,
+    ::testing::Combine(
+        ::testing::Values(static_cast<const libvpx_test::CodecFactory *>(
+            &libvpx_test::kVP10)),
+        ::testing::ValuesIn(kEncodingModeVectors),
+        ::testing::ValuesIn(kTestVectors),
+        ::testing::ValuesIn(kCpuUsedVectors)));
+# endif  // CONFIG_VP10_ENCODER
+#else
+VP10_INSTANTIATE_TEST_CASE(
+    EndToEndTestLarge,
+    ::testing::ValuesIn(kEncodingModeVectors),
+    ::testing::ValuesIn(kTestVectors),
+    ::testing::ValuesIn(kCpuUsedVectors));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/libs/libvpx/test/vp9_error_block_test.cc b/libs/libvpx/test/vp9_error_block_test.cc
new file mode 100644
index 0000000000..23a249e2b0
--- /dev/null
+++ b/libs/libvpx/test/vp9_error_block_test.cc
@@ -0,0 +1,211 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+#if CONFIG_VP9_HIGHBITDEPTH
+const int kNumIterations = 1000;
+
+typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
+                                  const tran_low_t *dqcoeff,
+                                  intptr_t block_size,
+                                  int64_t *ssz, int bps);
+
+typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
+                        ErrorBlockParam;
+
+class ErrorBlockTest
+  : public ::testing::TestWithParam<ErrorBlockParam> {
+ public:
+  virtual ~ErrorBlockTest() {}
+  virtual void SetUp() {  // Unpack the (tested fn, reference fn, depth) tuple.
+    error_block_op_     = GET_PARAM(0);
+    ref_error_block_op_ = GET_PARAM(1);
+    bit_depth_  = GET_PARAM(2);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;          // Passed to both functions as |bps|.
+  ErrorBlockFunc error_block_op_;      // Function under test.
+  ErrorBlockFunc ref_error_block_op_;  // Function its results are checked against.
+};
+
+TEST_P(ErrorBlockTest, OperationCheck) {
+  // Feeds identical random same-sign coefficient pairs to the tested and the
+  // reference function and counts iterations where their results differ.
+  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int msb = bit_depth_ + 8 - 1;
+  int err_count_total = 0;
+  int first_failure = -1;
+
+  for (int iter = 0; iter < kNumIterations; ++iter) {
+    // All block sizes from 4x4, 8x4 ..64x64
+    const intptr_t block_size = 16 << (iter % 9);
+    for (int idx = 0; idx < block_size; ++idx) {
+      // coeff and dqcoeff always share a sign, which optimized code may rely
+      // on: draw the sign first, then one magnitude for each buffer.
+      const bool positive = rnd(2) != 0;
+      const int coeff_mag = rnd(1 << msb);
+      const int dqcoeff_mag = rnd(1 << msb);
+      coeff[idx]   = positive ? coeff_mag : -coeff_mag;
+      dqcoeff[idx] = positive ? dqcoeff_mag : -dqcoeff_mag;
+    }
+
+    int64_t ref_ssz;
+    int64_t ssz;
+    int64_t ret;
+    const int64_t ref_ret =
+        ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(
+        ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
+
+    const bool mismatch = (ref_ret != ret) || (ref_ssz != ssz);
+    if (mismatch) {
+      if (err_count_total == 0) first_failure = iter;
+      ++err_count_total;
+    }
+  }
+
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Error Block Test, C output doesn't match optimized output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(ErrorBlockTest, ExtremeValues) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+  int err_count_total = 0;
+  int first_failure = -1;  // Index of the first mismatching iteration, if any.
+  intptr_t block_size;
+  int64_t ssz;
+  int64_t ret;
+  int64_t ref_ssz;
+  int64_t ref_ret;
+  const int msb = bit_depth_ + 8 - 1;
+  int max_val = ((1 << msb) - 1);  // Magnitude used by the saturated patterns.
+  for (int i = 0; i < kNumIterations; ++i) {
+    int err_count = 0;
+    int k = (i / 9) % 9;  // Input pattern 0..8; advances every 9 iterations.
+    // Patterns 0..7 fill the buffers with 0 / +-max_val, pattern 8 is random.
+    // Change the maximum coeff value, to test different bit boundaries
+    if ( k == 8 && (i % 9) == 0 ) {  // True once per 81 iterations.
+      max_val >>= 1;
+    }
+    block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
+    for (int j = 0; j < block_size; j++) {
+      if (k < 4) {
+        // Test at positive maximum values
+        coeff[j]   = k % 2 ? max_val : 0;         // max_val when k is odd
+        dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;  // max_val when bit 1 of k is set
+      } else if (k < 8) {
+        // Test at negative maximum values
+        coeff[j]   = k % 2 ? -max_val : 0;
+        dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
+      } else {
+        if (rnd(2)) {
+          // Positive number
+          coeff[j]   = rnd(1 << 14);
+          dqcoeff[j] = rnd(1 << 14);
+        } else {
+          // Negative number
+          coeff[j]   = -rnd(1 << 14);
+          dqcoeff[j] = -rnd(1 << 14);
+        }
+      }
+    }
+    ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
+                                  bit_depth_);
+    ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
+                                                   &ssz, bit_depth_));
+    err_count += (ref_ret != ret) | (ref_ssz != ssz);  // 0 or 1 per iteration
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Error Block Test, C output doesn't match optimized output. "
+      << "First failed at test case " << first_failure;
+}
+
+using std::tr1::make_tuple;
+
+#if CONFIG_USE_X86INC
+int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
+                                           const tran_low_t *dqcoeff,
+                                           intptr_t block_size,
+                                           int64_t *ssz, int bps) {
+  EXPECT_EQ(8, bps);  // The wrapped function takes no bit-depth argument.
+  return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
+}
+
+#if HAVE_SSE2
+int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
+                                              const tran_low_t *dqcoeff,
+                                              intptr_t block_size,
+                                              int64_t *ssz, int bps) {
+  EXPECT_EQ(8, bps);  // The wrapped function takes no bit-depth argument.
+  return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, ErrorBlockTest,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_block_error_sse2,
+                   &vp9_highbd_block_error_c, VPX_BITS_10),
+        make_tuple(&vp9_highbd_block_error_sse2,
+                   &vp9_highbd_block_error_c, VPX_BITS_12),
+        make_tuple(&vp9_highbd_block_error_sse2,
+                   &vp9_highbd_block_error_c, VPX_BITS_8),
+        make_tuple(&wrap_vp9_highbd_block_error_8bit_sse2,
+                   &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX
+int64_t wrap_vp9_highbd_block_error_8bit_avx(const tran_low_t *coeff,
+                                              const tran_low_t *dqcoeff,
+                                              intptr_t block_size,
+                                              int64_t *ssz, int bps) {
+  EXPECT_EQ(8, bps);  // The wrapped function takes no bit-depth argument.
+  return vp9_highbd_block_error_8bit_avx(coeff, dqcoeff, block_size, ssz);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    AVX, ErrorBlockTest,
+    ::testing::Values(
+        make_tuple(&wrap_vp9_highbd_block_error_8bit_avx,
+                   &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
+#endif  // HAVE_AVX
+
+#endif  // CONFIG_USE_X86INC
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/libs/libvpx/test/vp9_ethread_test.cc b/libs/libvpx/test/vp9_ethread_test.cc
new file mode 100644
index 0000000000..1e270e039b
--- /dev/null
+++ b/libs/libvpx/test/vp9_ethread_test.cc
@@ -0,0 +1,142 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+class VPxEncoderThreadTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  VPxEncoderThreadTest()
+      : EncoderTest(GET_PARAM(0)),
+        encoder_initialized_(false),
+        tiles_(2),
+        encoding_mode_(GET_PARAM(1)),
+        set_cpu_used_(GET_PARAM(2)) {
+    init_flags_ = VPX_CODEC_USE_PSNR;
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    cfg.w = 1280;
+    cfg.h = 720;
+    decoder_ = codec_->CreateDecoder(cfg, 0);
+
+    md5_.clear();
+  }
+  virtual ~VPxEncoderThreadTest() {
+    delete decoder_;
+  }
+  // Non-real-time modes: lagged VBR. Real-time: CBR, no lag, error resilient.
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 3;
+      cfg_.rc_end_usage = VPX_VBR;
+      cfg_.rc_2pass_vbr_minsection_pct = 5;
+      cfg_.rc_2pass_vbr_maxsection_pct = 2000;  // max bound, not min again
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+      cfg_.g_error_resilient = 1;
+    }
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+  // Makes PreEncodeFrameHook() re-apply the controls at the start of a pass.
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    encoder_initialized_ = false;
+  }
+  // Applies the encoder controls once per pass, before the first frame.
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (!encoder_initialized_) {
+      // Encode 4 column tiles.
+      encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_);
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      } else {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0);
+        encoder->Control(VP9E_SET_AQ_MODE, 3);
+      }
+      encoder_initialized_ = true;
+    }
+  }
+  // Decodes each compressed packet and records the MD5 of the image returned.
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    const vpx_codec_err_t res = decoder_->DecodeFrame(
+        reinterpret_cast<uint8_t*>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != VPX_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(VPX_CODEC_OK, res);
+    }
+    const vpx_image_t *img = decoder_->GetDxData().Next();
+
+    if (img) {
+      ::libvpx_test::MD5 md5_res;
+      md5_res.Add(img);
+      md5_.push_back(md5_res.Get());
+    }
+  }
+
+  bool encoder_initialized_;  // False until the controls were set this pass.
+  int tiles_;                 // Value passed to VP9E_SET_TILE_COLUMNS.
+  ::libvpx_test::TestMode encoding_mode_;
+  int set_cpu_used_;          // Value passed to VP8E_SET_CPUUSED.
+  ::libvpx_test::Decoder *decoder_;  // Owned; deleted in the destructor.
+  std::vector<std::string> md5_;     // One digest per decoded frame.
+};
+
+TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 20);
+
+  cfg_.rc_target_bitrate = 1000;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  // Run 1: a single encoder thread. swap() moves the digests gathered in
+  // md5_ into the local vector and leaves md5_ empty for the next run.
+  std::vector<std::string> single_thr_md5;
+  cfg_.g_threads = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  single_thr_md5.swap(md5_);
+
+  // Run 2: four encoder threads over the same source.
+  std::vector<std::string> multi_thr_md5;
+  cfg_.g_threads = 4;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  multi_thr_md5.swap(md5_);
+
+  // The thread count must not change the decoded output: both runs have to
+  // yield the same sequence of per-frame digests.
+  ASSERT_EQ(single_thr_md5, multi_thr_md5);
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    VPxEncoderThreadTest,
+    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
+                      ::libvpx_test::kRealTime),
+    ::testing::Range(1, 9));
+
+VP10_INSTANTIATE_TEST_CASE(
+    VPxEncoderThreadTest,
+    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
+    ::testing::Range(1, 3));
+}  // namespace
diff --git a/libs/libvpx/test/vp9_frame_parallel_test.cc b/libs/libvpx/test/vp9_frame_parallel_test.cc
new file mode 100644
index 0000000000..f0df88afa9
--- /dev/null
+++ b/libs/libvpx/test/vp9_frame_parallel_test.cc
@@ -0,0 +1,220 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+using std::string;
+
+#if CONFIG_WEBM_IO
+
+struct PauseFileList {
+  const char *name;
+  // md5 sum for decoded frames which does not include skipped frames.
+  const char *expected_md5;
+  const int pause_frame_num;
+};
+
+// Decodes |filename| with |num_threads|. Pause at the specified frame_num,
+// seek to next key frame and then continue decoding until the end. Return
+// the md5 of the decoded frames which does not include skipped frames.
+string DecodeFileWithPause(const string &filename, int num_threads,
+                           int pause_num) {
+  libvpx_test::WebMVideoSource video(filename);
+  video.Init();
+  int in_frames = 0;   // Frames handed to the decoder.
+  int out_frames = 0;  // Frames returned by the decoder.
+
+  // Zero the whole config through value-initialization, then set the threads.
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+  cfg.threads = num_threads;
+  const vpx_codec_flags_t flags = VPX_CODEC_USE_FRAME_THREADING;
+  libvpx_test::VP9Decoder decoder(cfg, flags, 0);
+
+  libvpx_test::MD5 md5;
+  video.Begin();
+
+  do {
+    ++in_frames;
+    const vpx_codec_err_t res =
+        decoder.DecodeFrame(video.cxdata(), video.frame_size());
+    if (res != VPX_CODEC_OK) {
+      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+      break;
+    }
+
+    // Pause at specified frame number.
+    if (in_frames == pause_num) {
+      // Flush the decoder and then seek to next key frame.
+      decoder.DecodeFrame(NULL, 0);
+      video.SeekToNextKeyFrame();
+    } else {
+      video.Next();
+    }
+
+    // Flush the decoder at the end of the video.
+    if (!video.cxdata())
+      decoder.DecodeFrame(NULL, 0);
+
+    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
+    const vpx_image_t *img;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next())) {
+      ++out_frames;
+      md5.Add(img);
+    }
+  } while (video.cxdata() != NULL);
+
+  EXPECT_EQ(in_frames, out_frames) <<
+      "Input frame count does not match output frame count";
+
+  return string(md5.Get());
+}
+
+void DecodeFilesWithPause(const PauseFileList files[]) {
+  for (const PauseFileList *entry = files; entry->name != NULL; ++entry) {
+    SCOPED_TRACE(entry->name);
+    for (int threads = 2; threads < 9; ++threads) {  // 2..8 decoder threads
+      const string md5 =
+          DecodeFileWithPause(entry->name, threads, entry->pause_frame_num);
+      EXPECT_EQ(entry->expected_md5, md5) << "threads = " << threads;
+    }
+  }
+}
+
+TEST(VP9MultiThreadedFrameParallel, PauseSeekResume) {
+  // vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
+  // one key frame for every ten frames.
+  static const PauseFileList files[] = {
+    { "vp90-2-07-frame_parallel-1.webm",
+      "6ea7c3875d67252e7caf2bc6e75b36b1", 6 },
+    { "vp90-2-07-frame_parallel-1.webm",
+      "4bb634160c7356a8d7d4299b6dc83a45", 12 },
+    { "vp90-2-07-frame_parallel-1.webm",
+      "89772591e6ef461f9fa754f916c78ed8", 26 },
+    { NULL, NULL, 0 },
+  };
+  DecodeFilesWithPause(files);
+}
+
+struct FileList {
+  const char *name;
+  // md5 sum for decoded frames which does not include corrupted frames.
+  const char *expected_md5;
+  // Expected number of decoded frames which does not include corrupted frames.
+  const int expected_frame_count;
+};
+
+// Decodes |filename| with |num_threads|. Return the md5 of the decoded
+// frames which does not include corrupted frames.
+string DecodeFile(const string &filename, int num_threads,
+                  int expected_frame_count) {
+  libvpx_test::WebMVideoSource video(filename);
+  video.Init();
+
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+  cfg.threads = num_threads;
+  const vpx_codec_flags_t flags = VPX_CODEC_USE_FRAME_THREADING;
+  libvpx_test::VP9Decoder decoder(cfg, flags, 0);
+
+  libvpx_test::MD5 md5;
+  video.Begin();
+
+  int out_frames = 0;  // Frames actually returned by the decoder.
+  do {
+    const vpx_codec_err_t res =
+        decoder.DecodeFrame(video.cxdata(), video.frame_size());
+    // TODO(hkuang): frame parallel mode should return an error on corruption.
+    if (res != VPX_CODEC_OK) {
+      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+      break;
+    }
+
+    video.Next();
+
+    // Flush the decoder at the end of the video.
+    if (!video.cxdata())
+      decoder.DecodeFrame(NULL, 0);
+
+    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
+    const vpx_image_t *img;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next())) {
+      ++out_frames;
+      md5.Add(img);
+    }
+  } while (video.cxdata() != NULL);
+  // Only the output side is counted here, so the message names it.
+  EXPECT_EQ(expected_frame_count, out_frames) <<
+      "Output frame count does not match expected frame count";
+
+  return string(md5.Get());
+}
+
+void DecodeFiles(const FileList files[]) {
+  for (const FileList *entry = files; entry->name != NULL; ++entry) {
+    SCOPED_TRACE(entry->name);
+    for (int threads = 2; threads < 9; ++threads) {  // 2..8 decoder threads
+      const string md5 =
+          DecodeFile(entry->name, threads, entry->expected_frame_count);
+      EXPECT_EQ(entry->expected_md5, md5) << "threads = " << threads;
+    }
+  }
+}
+
+TEST(VP9MultiThreadedFrameParallel, InvalidFileTest) {
+  static const FileList files[] = {
+    // invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
+    // one key frame for every ten frames. The 11th frame has corrupted data.
+    { "invalid-vp90-2-07-frame_parallel-1.webm",
+      "0549d0f45f60deaef8eb708e6c0eb6cb", 30 },
+    // invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with
+    // one key frame for every ten frames. The 1st and 31st frames have
+    // corrupted data.
+    { "invalid-vp90-2-07-frame_parallel-2.webm",
+      "6a1f3cf6f9e7a364212fadb9580d525e", 20 },
+    // invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with
+    // one key frame for every ten frames. The 5th and 13th frames have
+    // corrupted data.
+    { "invalid-vp90-2-07-frame_parallel-3.webm",
+      "8256544308de926b0681e04685b98677", 27 },
+    { NULL, NULL, 0 },
+  };
+  DecodeFiles(files);
+}
+
+TEST(VP9MultiThreadedFrameParallel, ValidFileTest) {
+  static const FileList files[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+    { "vp92-2-20-10bit-yuv420.webm",
+      "a16b99df180c584e8db2ffeda987d293", 10 },
+#endif
+    { NULL, NULL, 0 },
+  };
+  DecodeFiles(files);
+}
+#endif  // CONFIG_WEBM_IO
+}  // namespace
diff --git a/libs/libvpx/test/vp9_intrapred_test.cc b/libs/libvpx/test/vp9_intrapred_test.cc
new file mode 100644
index 0000000000..416f3c322e
--- /dev/null
+++ b/libs/libvpx/test/vp9_intrapred_test.cc
@@ -0,0 +1,231 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+using libvpx_test::ACMRandom;
+
+const int count_test_block = 100000;
+
+// Base class for VP9 intra prediction tests.
+class VP9IntraPredBase {
+ public:
+  virtual ~VP9IntraPredBase() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  // Invoked by RunTest() once per test case, after the edges are filled.
+  virtual void Predict() = 0;
+
+  // Counts pixels where dst_ and ref_dst_ disagree into |*error_count|. The
+  // ASSERT_EQ fires (and leaves this function) on the first mismatch only.
+  void CheckPrediction(int test_case_number, int *error_count) const {
+    for (int row = 0; row < block_size_; ++row) {
+      for (int col = 0; col < block_size_; ++col) {
+        const ptrdiff_t pos = col + row * stride_;
+        *error_count += ref_dst_[pos] != dst_[pos];
+        if (*error_count == 1) {
+          ASSERT_EQ(ref_dst_[pos], dst_[pos])
+              << " Failed on Test Case Number "<< test_case_number;
+        }
+      }
+    }
+  }
+
+  // Binds the caller's buffers, then runs count_test_block cases: case 0
+  // uses saturated edges (every sample == mask_), later cases random ones.
+  void RunTest(uint16_t* left_col, uint16_t* above_data,
+               uint16_t* dst, uint16_t* ref_dst) {
+    left_col_ = left_col;
+    above_row_ = above_data + 16;  // leaves room for the above_row_[-1] write
+    dst_ = dst;
+    ref_dst_ = ref_dst;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    int error_count = 0;
+    for (int test_case = 0; test_case < count_test_block; ++test_case) {
+      const bool saturate = (test_case == 0);
+      // Above row first (indices -1 .. 2 * block_size_), then the left column;
+      // the fill order fixes the random sequence, so keep it.
+      for (int x = -1; x <= block_size_ * 2; ++x) {
+        above_row_[x] = saturate ? mask_ : (rnd.Rand16() & mask_);
+      }
+      for (int y = 0; y < block_size_; ++y) {
+        left_col_[y] = saturate ? mask_ : (rnd.Rand16() & mask_);
+      }
+      Predict();
+      CheckPrediction(test_case, &error_count);
+    }
+    ASSERT_EQ(0, error_count);
+  }
+
+  // Buffer pointers are bound by RunTest(); nothing here is initialized earlier.
+  int block_size_;
+  uint16_t *above_row_;
+  uint16_t *left_col_;
+  uint16_t *dst_;
+  uint16_t *ref_dst_;
+  ptrdiff_t stride_;
+  int mask_;
+};
+
+typedef void (*intra_pred_fn_t)(
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+      const uint16_t *left, int bps);
+typedef std::tr1::tuple<intra_pred_fn_t,
+                        intra_pred_fn_t, int, int> intra_pred_params_t;
+class VP9IntraPredTest
+    : public VP9IntraPredBase,
+      public ::testing::TestWithParam<intra_pred_params_t> {
+ private:
+  // Unpacks (tested fn, reference fn, block size, bit depth) and derives the
+  // row stride and the sample mask from them.
+  virtual void SetUp() {
+    pred_fn_    = GET_PARAM(0);
+    ref_fn_     = GET_PARAM(1);
+    block_size_ = GET_PARAM(2);
+    bit_depth_  = GET_PARAM(3);
+    mask_       = (1 << bit_depth_) - 1;
+    stride_     = 3 * block_size_;
+  }
+  // Reference output goes to ref_dst_, tested output to dst_.
+  virtual void Predict() {
+    ref_fn_(ref_dst_, stride_, above_row_, left_col_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(
+        pred_fn_(dst_, stride_, above_row_, left_col_, bit_depth_));
+  }
+  intra_pred_fn_t pred_fn_;
+  intra_pred_fn_t ref_fn_;
+  int bit_depth_;
+};
+
+TEST_P(VP9IntraPredTest, IntraPredTests) {
+  const int kMaxBlock = 32;  // max block size is 32
+  DECLARE_ALIGNED(16, uint16_t, left_col[2 * kMaxBlock]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[2 * kMaxBlock + 32]);
+  DECLARE_ALIGNED(16, uint16_t, dst[3 * kMaxBlock * kMaxBlock]);
+  DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * kMaxBlock * kMaxBlock]);
+  RunTest(left_col, above_data, dst, ref_dst);
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+#if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_USE_X86INC
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
+                                       &vpx_highbd_dc_predictor_32x32_c, 32, 8),
+                            make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
+                                       &vpx_highbd_tm_predictor_16x16_c, 16, 8),
+                            make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
+                                       &vpx_highbd_tm_predictor_32x32_c, 32, 8),
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
+                                       &vpx_highbd_dc_predictor_4x4_c, 4, 8),
+                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
+                                       &vpx_highbd_dc_predictor_8x8_c, 8, 8),
+                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
+                                       &vpx_highbd_dc_predictor_16x16_c, 16, 8),
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
+                                       &vpx_highbd_v_predictor_4x4_c, 4, 8),
+                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
+                                       &vpx_highbd_v_predictor_8x8_c, 8, 8),
+                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
+                                       &vpx_highbd_v_predictor_16x16_c, 16, 8),
+                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
+                                       &vpx_highbd_v_predictor_32x32_c, 32, 8),
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
+                                       &vpx_highbd_tm_predictor_4x4_c, 4, 8),
+                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
+                                       &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
+
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
+                                       &vpx_highbd_dc_predictor_32x32_c, 32,
+                                       10),
+                            make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
+                                       &vpx_highbd_tm_predictor_16x16_c, 16,
+                                       10),
+                            make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
+                                       &vpx_highbd_tm_predictor_32x32_c, 32,
+                                       10),
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
+                                       &vpx_highbd_dc_predictor_4x4_c, 4, 10),
+                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
+                                       &vpx_highbd_dc_predictor_8x8_c, 8, 10),
+                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
+                                       &vpx_highbd_dc_predictor_16x16_c, 16,
+                                       10),
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
+                                       &vpx_highbd_v_predictor_4x4_c, 4, 10),
+                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
+                                       &vpx_highbd_v_predictor_8x8_c, 8, 10),
+                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
+                                       &vpx_highbd_v_predictor_16x16_c, 16,
+                                       10),
+                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
+                                       &vpx_highbd_v_predictor_32x32_c, 32,
+                                       10),
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
+                                       &vpx_highbd_tm_predictor_4x4_c, 4, 10),
+                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
+                                       &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
+
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
+                        ::testing::Values(
+                            make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
+                                       &vpx_highbd_dc_predictor_32x32_c, 32,
+                                       12),
+                            make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
+                                       &vpx_highbd_tm_predictor_16x16_c, 16,
+                                       12),
+                            make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
+                                       &vpx_highbd_tm_predictor_32x32_c, 32,
+                                       12),
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
+                                       &vpx_highbd_dc_predictor_4x4_c, 4, 12),
+                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
+                                       &vpx_highbd_dc_predictor_8x8_c, 8, 12),
+                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
+                                       &vpx_highbd_dc_predictor_16x16_c, 16,
+                                       12),
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
+                                       &vpx_highbd_v_predictor_4x4_c, 4, 12),
+                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
+                                       &vpx_highbd_v_predictor_8x8_c, 8, 12),
+                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
+                                       &vpx_highbd_v_predictor_16x16_c, 16,
+                                       12),
+                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
+                                       &vpx_highbd_v_predictor_32x32_c, 32,
+                                       12),
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
+                                       &vpx_highbd_tm_predictor_4x4_c, 4, 12),
+                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
+                                       &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
+
+#endif  // CONFIG_USE_X86INC
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+}  // namespace
diff --git a/libs/libvpx/test/vp9_lossless_test.cc b/libs/libvpx/test/vp9_lossless_test.cc
new file mode 100644
index 0000000000..09c1070c6b
--- /dev/null
+++ b/libs/libvpx/test/vp9_lossless_test.cc
@@ -0,0 +1,134 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+const int kMaxPsnr = 100;
+
+// Fixture that records the lowest PSNR reported during an encode pass.
+class LosslessTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  LosslessTest()
+      : EncoderTest(GET_PARAM(0)),
+        psnr_(kMaxPsnr),
+        nframes_(0),
+        encoding_mode_(GET_PARAM(1)) {}
+
+  virtual ~LosslessTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    // The control is issued only when a quantizer bound is non-zero; with
+    // both bounds at zero the quantizer alone must activate lossless.
+    const bool quantizer_set =
+        cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0;
+    if (video->frame() == 1 && quantizer_set) {
+      encoder->Control(VP9E_SET_LOSSLESS, 1);
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    nframes_ = 0;
+    psnr_ = kMaxPsnr;
+  }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    const double frame_psnr = pkt->data.psnr.psnr[0];
+    if (frame_psnr < psnr_) psnr_ = frame_psnr;
+  }
+
+  // Lowest PSNR observed since the pass began.
+  double GetMinPsnr() const { return psnr_; }
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  libvpx_test::TestMode encoding_mode_;
+};
+
+// With both quantizer bounds at zero the minimum PSNR must reach kMaxPsnr.
+TEST_P(LosslessTest, TestLossLessEncoding) {
+  const vpx_rational kTimebase = { 33333333, 1000000000 };
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 0;
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_timebase = kTimebase;
+
+  // intentionally changed the dimension for better testing coverage
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     kTimebase.den, kTimebase.num, 0, 10);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(GetMinPsnr(), kMaxPsnr);
+}
+
+// Lossless check on the rush_hour_444.y4m clip, encoded with g_profile = 1.
+TEST_P(LosslessTest, TestLossLessEncoding444) {
+  libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 10);
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  cfg_.rc_max_quantizer = 0;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_timebase = video.timebase();
+  cfg_.g_profile = 1;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // No frame may report a PSNR below kMaxPsnr.
+  EXPECT_GE(GetMinPsnr(), kMaxPsnr);
+}
+
+// Lossless must also be reachable through the encoder control alone.
+TEST_P(LosslessTest, TestLossLessEncodingCtrl) {
+  const vpx_rational kTimebase = { 33333333, 1000000000 };
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  // Intentionally set Q > 0, to make sure control can be used to activate
+  // lossless
+  cfg_.rc_max_quantizer = 20;
+  cfg_.rc_min_quantizer = 10;
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_timebase = kTimebase;
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     kTimebase.den, kTimebase.num, 0, 10);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(GetMinPsnr(), kMaxPsnr);
+}
+
+VP9_INSTANTIATE_TEST_CASE(LosslessTest,
+                          ::testing::Values(::libvpx_test::kRealTime,
+                                            ::libvpx_test::kOnePassGood,
+                                            ::libvpx_test::kTwoPassGood));
+
+VP10_INSTANTIATE_TEST_CASE(LosslessTest,
+                           ::testing::Values(::libvpx_test::kOnePassGood,
+                                             ::libvpx_test::kTwoPassGood));
+}  // namespace
diff --git a/libs/libvpx/test/vp9_quantize_test.cc b/libs/libvpx/test/vp9_quantize_test.cc
new file mode 100644
index 0000000000..81d31fd1b2
--- /dev/null
+++ b/libs/libvpx/test/vp9_quantize_test.cc
@@ -0,0 +1,351 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+#if CONFIG_VP9_HIGHBITDEPTH
+const int number_of_iterations = 100;
+
+typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
+                             int skip_block, const int16_t *zbin,
+                             const int16_t *round, const int16_t *quant,
+                             const int16_t *quant_shift,
+                             tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                             const int16_t *dequant,
+                             uint16_t *eob, const int16_t *scan,
+                             const int16_t *iscan);
+typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
+    QuantizeParam;
+
+class VP9QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
+ public:
+  virtual ~VP9QuantizeTest() {}
+  virtual void SetUp() {
+    quantize_op_   = GET_PARAM(0);  // function under test
+    ref_quantize_op_ = GET_PARAM(1);  // reference its output is checked against
+    bit_depth_  = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;  // low bit_depth_ bits set
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;
+  int mask_;  // ANDed with random values when the tests build their inputs
+  QuantizeFunc quantize_op_;
+  QuantizeFunc ref_quantize_op_;
+};
+
+class VP9Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
+ public:
+  virtual ~VP9Quantize32Test() {}
+  virtual void SetUp() {
+    quantize_op_   = GET_PARAM(0);  // function under test
+    ref_quantize_op_ = GET_PARAM(1);  // reference its output is checked against
+    bit_depth_  = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;  // low bit_depth_ bits set
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;
+  int mask_;  // ANDed with random values when the tests build their inputs
+  QuantizeFunc quantize_op_;
+  QuantizeFunc ref_quantize_op_;
+};
+
+// Feeds identical random input to the function under test and the reference
+// for 4x4, 8x8 and 16x16 blocks; qcoeff, dqcoeff and eob must all agree.
+TEST_P(VP9QuantizeTest, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
+  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
+  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    const int skip_block = i == 0;
+    const TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8 TX_16X16
+    const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    const int count = (4 << sz) * (4 << sz);  // 16, 64, 256
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;
+    for (int j = 0; j < count; j++) coeff_ptr[j] = rnd.Rand16() & mask_;
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16() & mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+    // Compare the whole block. |sz| is a TX_SIZE enum value, not a length, so
+    // bounding this loop by it left almost every coefficient unchecked.
+    for (int j = 0; j < count; ++j) {
+      err_count += (ref_qcoeff_ptr[j]  != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) first_failure = i;
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+// Feeds identical random input to the function under test and the reference
+// for 32x32 blocks; qcoeff, dqcoeff and eob must all agree.
+TEST_P(VP9Quantize32Test, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
+  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
+  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    const int skip_block = i == 0;
+    const TX_SIZE sz = TX_32X32;
+    const TX_TYPE tx_type = (TX_TYPE)(i % 4);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    const int count = (4 << sz) * (4 << sz);  // 1024
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;
+    for (int j = 0; j < count; j++) coeff_ptr[j] = rnd.Rand16() & mask_;
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16() & mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+    // Compare the whole block. |sz| is a TX_SIZE enum value, not a length, so
+    // bounding this loop by it left almost every coefficient unchecked.
+    for (int j = 0; j < count; ++j) {
+      err_count += (ref_qcoeff_ptr[j]  != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) first_failure = i;
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+// Zeroed 4x4, 8x8 and 16x16 blocks with up to two random non-zero entries:
+// qcoeff, dqcoeff and eob must match the reference.
+TEST_P(VP9QuantizeTest, EOBCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
+  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
+  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
+  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    int skip_block = i == 0;
+    TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8 TX_16X16
+    TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    int count = (4 << sz) * (4 << sz);  // 16, 64, 256
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;
+    for (int j = 0; j < count; j++) coeff_ptr[j] = 0;
+    // Two random entries
+    coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
+    coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16() & mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+
+    // Compare the whole block. |sz| is a TX_SIZE enum value, not a length, so
+    // bounding this loop by it left almost every coefficient unchecked.
+    for (int j = 0; j < count; ++j) {
+      err_count += (ref_qcoeff_ptr[j]  != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) first_failure = i;
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+
+// Zeroed 32x32 blocks with up to two random non-zero entries: qcoeff,
+// dqcoeff and eob must match the reference.
+TEST_P(VP9Quantize32Test, EOBCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
+  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
+  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
+  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
+  int err_count_total = 0;
+  int first_failure = -1;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    int skip_block = i == 0;
+    TX_SIZE sz = TX_32X32;
+    TX_TYPE tx_type = (TX_TYPE)(i % 4);
+    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
+    int count = (4 << sz) * (4 << sz);  // 1024
+    int err_count = 0;
+    *eob_ptr = rnd.Rand16();
+    *ref_eob_ptr = *eob_ptr;
+    for (int j = 0; j < count; j++) coeff_ptr[j] = 0;
+    // Two random entries
+    coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
+    coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd.Rand16() & mask_;
+      round_ptr[j] = rnd.Rand16();
+      quant_ptr[j] = rnd.Rand16();
+      quant_shift_ptr[j] = rnd.Rand16();
+      dequant_ptr[j] = rnd.Rand16();
+    }
+
+    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
+                     ref_dqcoeff_ptr, dequant_ptr,
+                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
+    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
+                                          zbin_ptr, round_ptr, quant_ptr,
+                                          quant_shift_ptr, qcoeff_ptr,
+                                          dqcoeff_ptr, dequant_ptr, eob_ptr,
+                                          scan_order->scan, scan_order->iscan));
+
+    // Compare the whole block. |sz| is a TX_SIZE enum value, not a length, so
+    // bounding this loop by it left almost every coefficient unchecked.
+    for (int j = 0; j < count; ++j) {
+      err_count += (ref_qcoeff_ptr[j]  != qcoeff_ptr[j]) |
+          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    }
+    err_count += (*ref_eob_ptr != *eob_ptr);
+    if (err_count && !err_count_total) first_failure = i;
+    err_count_total += err_count;
+  }
+  EXPECT_EQ(0, err_count_total)
+      << "Error: Quantization Test, C output doesn't match SSE2 output. "
+      << "First failed at test case " << first_failure;
+}
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_quantize_b_sse2,
+                   &vpx_highbd_quantize_b_c, VPX_BITS_8),
+        make_tuple(&vpx_highbd_quantize_b_sse2,
+                   &vpx_highbd_quantize_b_c, VPX_BITS_10),
+        make_tuple(&vpx_highbd_quantize_b_sse2,
+                   &vpx_highbd_quantize_b_c, VPX_BITS_12)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9Quantize32Test,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8),
+        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10),
+        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12)));
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/libs/libvpx/test/vp9_skip_loopfilter_test.cc b/libs/libvpx/test/vp9_skip_loopfilter_test.cc
new file mode 100644
index 0000000000..b0cc7ba41c
--- /dev/null
+++ b/libs/libvpx/test/vp9_skip_loopfilter_test.cc
@@ -0,0 +1,180 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/webm_video_source.h"
+
+namespace {
+
+const char kVp9TestFile[] = "vp90-2-08-tile_1x8_frame_parallel.webm";
+const char kVp9Md5File[] = "vp90-2-08-tile_1x8_frame_parallel.webm.md5";
+
+// Class for testing shutting off the loop filter.
+class SkipLoopFilterTest {
+ public:
+  SkipLoopFilterTest() : video_(NULL), decoder_(NULL), md5_file_(NULL) {}
+
+  ~SkipLoopFilterTest() {
+    if (md5_file_ != NULL)
+      fclose(md5_file_);
+    delete decoder_;
+    delete video_;
+  }
+
+  // If |num_threads| > 0 then set the decoder with that number of threads.
+  void Init(int num_threads) {
+    expected_md5_[0] = '\0';
+    junk_[0] = '\0';
+    video_ = new libvpx_test::WebMVideoSource(kVp9TestFile);
+    ASSERT_TRUE(video_ != NULL);
+    video_->Init();
+    video_->Begin();
+
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    if (num_threads > 0)
+      cfg.threads = num_threads;
+    decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
+    ASSERT_TRUE(decoder_ != NULL);
+
+    OpenMd5File(kVp9Md5File);
+  }
+
+  // Set the VP9 skipLoopFilter control value.
+  void SetSkipLoopFilter(int value, vpx_codec_err_t expected_value) {
+    decoder_->Control(VP9_SET_SKIP_LOOP_FILTER, value, expected_value);
+  }
+
+  vpx_codec_err_t DecodeOneFrame() {
+    const vpx_codec_err_t res =
+        decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+    if (res == VPX_CODEC_OK) {
+      ReadMd5();
+      video_->Next();
+    }
+    return res;
+  }
+
+  vpx_codec_err_t DecodeRemainingFrames() {
+    for (; video_->cxdata() != NULL; video_->Next()) {
+      const vpx_codec_err_t res =
+          decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+      if (res != VPX_CODEC_OK)
+        return res;
+      ReadMd5();
+    }
+    return VPX_CODEC_OK;
+  }
+
+  // Checks if MD5 matches or doesn't.
+  void CheckMd5(bool matches) {
+    libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
+    const vpx_image_t *img = dec_iter.Next();
+    ASSERT_TRUE(img != NULL) << "No decoded frame to checksum";
+    CheckMd5Vpx(*img, matches);
+  }
+
+ private:
+  // TODO(fgalligan): Move the MD5 testing code into another class.
+  void OpenMd5File(const std::string &md5_file_name) {
+    md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name);
+    ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: "
+        << md5_file_name;
+  }
+
+  // Reads the next line of the MD5 file.
+  void ReadMd5() {
+    ASSERT_TRUE(md5_file_ != NULL);
+    // Field widths are one less than the buffer sizes so an over-long token
+    // cannot write past expected_md5_[33] or junk_[128].
+    const int res = fscanf(md5_file_, "%32s  %127s", expected_md5_, junk_);
+    ASSERT_NE(EOF, res) << "Read md5 data failed";
+    expected_md5_[32] = '\0';
+  }
+
+  // Checks if the last read MD5 matches |img| or doesn't.
+  void CheckMd5Vpx(const vpx_image_t &img, bool matches) {
+    ::libvpx_test::MD5 md5_res;
+    md5_res.Add(&img);
+    const char *const actual_md5 = md5_res.Get();
+
+    // Check MD5.
+    if (matches)
+      ASSERT_STREQ(expected_md5_, actual_md5) << "MD5 checksums don't match";
+    else
+      ASSERT_STRNE(expected_md5_, actual_md5) << "MD5 checksums match";
+  }
+
+  libvpx_test::WebMVideoSource *video_;
+  libvpx_test::VP9Decoder *decoder_;
+  FILE *md5_file_;
+  char expected_md5_[33];
+  char junk_[128];
+};
+
+// Skip control set, default thread count: MD5 must differ from the file's.
+TEST(SkipLoopFilterTest, ShutOffLoopFilter) {
+  const int kThreads = 0;
+  SkipLoopFilterTest helper;
+  helper.Init(kThreads);
+  helper.SetSkipLoopFilter(1, VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, helper.DecodeRemainingFrames());
+  helper.CheckMd5(false);
+}
+
+// Skip control set, one decoder thread: MD5 must differ from the file's.
+TEST(SkipLoopFilterTest, ShutOffLoopFilterSingleThread) {
+  const int kThreads = 1;
+  SkipLoopFilterTest helper;
+  helper.Init(kThreads);
+  helper.SetSkipLoopFilter(1, VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, helper.DecodeRemainingFrames());
+  helper.CheckMd5(false);
+}
+
+// Skip control set, eight decoder threads: MD5 must differ from the file's.
+TEST(SkipLoopFilterTest, ShutOffLoopFilter8Threads) {
+  const int kThreads = 8;
+  SkipLoopFilterTest helper;
+  helper.Init(kThreads);
+  helper.SetSkipLoopFilter(1, VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, helper.DecodeRemainingFrames());
+  helper.CheckMd5(false);
+}
+
+// Setting the skip control and then clearing it must give the file's MD5.
+TEST(SkipLoopFilterTest, WithLoopFilter) {
+  const int kThreads = 0;
+  SkipLoopFilterTest helper;
+  helper.Init(kThreads);
+  helper.SetSkipLoopFilter(1, VPX_CODEC_OK);
+  helper.SetSkipLoopFilter(0, VPX_CODEC_OK);
+  ASSERT_EQ(VPX_CODEC_OK, helper.DecodeRemainingFrames());
+  helper.CheckMd5(true);
+}
+
+// Alternates the skip control over ten frames, then decodes the remainder.
+TEST(SkipLoopFilterTest, ToggleLoopFilter) {
+  SkipLoopFilterTest helper;
+  helper.Init(0);
+
+  for (int frame = 0; frame < 10; ++frame) {
+    helper.SetSkipLoopFilter(frame % 2, VPX_CODEC_OK);
+    ASSERT_EQ(VPX_CODEC_OK, helper.DecodeOneFrame());
+  }
+  ASSERT_EQ(VPX_CODEC_OK, helper.DecodeRemainingFrames());
+  helper.CheckMd5(false);
+}
+
+}  // namespace
diff --git a/libs/libvpx/test/vp9_spatial_svc_encoder.sh b/libs/libvpx/test/vp9_spatial_svc_encoder.sh
new file mode 100755
index 0000000000..65031073f8
--- /dev/null
+++ b/libs/libvpx/test/vp9_spatial_svc_encoder.sh
@@ -0,0 +1,72 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx vp9_spatial_svc_encoder example. To add new
+##  tests to this file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to vp9_spatial_svc_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+vp9_spatial_svc_encoder_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+}
+
+# Runs vp9_spatial_svc_encoder. $1 is the test name; remaining arguments are
+# passed to the encoder. Fails if the encoder is missing or writes no output.
+vp9_spatial_svc_encoder() {
+  local \
+    encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder${VPX_TEST_EXE_SUFFIX}"
+  local test_name="$1"
+  local \
+    output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf"
+  local frames_to_encode=10 max_kf=9999
+
+  shift
+
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${VPX_TEST_PREFIX}" "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" \
+    -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \
+    "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull}
+
+  [ -e "${output_file}" ] || return 1
+}
+
+# Each test is run with layer count 1-$vp9_ssvc_test_layers.
+vp9_ssvc_test_layers=5
+
+vp9_spatial_svc() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    local readonly test_name="vp9_spatial_svc"
+    for layers in $(seq 1 ${vp9_ssvc_test_layers}); do
+      vp9_spatial_svc_encoder "${test_name}" -sl ${layers}
+    done
+  fi
+}
+
+readonly vp9_spatial_svc_tests="DISABLED_vp9_spatial_svc_mode_i
+                                DISABLED_vp9_spatial_svc_mode_altip
+                                DISABLED_vp9_spatial_svc_mode_ip
+                                DISABLED_vp9_spatial_svc_mode_gf
+                                vp9_spatial_svc"
+
+if [ "$(vpx_config_option_enabled CONFIG_SPATIAL_SVC)" = "yes" ]; then
+  run_tests \
+    vp9_spatial_svc_encoder_verify_environment \
+    "${vp9_spatial_svc_tests}"
+fi
diff --git a/libs/libvpx/test/vp9_subtract_test.cc b/libs/libvpx/test/vp9_subtract_test.cc
new file mode 100644
index 0000000000..3cad4d7e6d
--- /dev/null
+++ b/libs/libvpx/test/vp9_subtract_test.cc
@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+typedef void (*SubtractFunc)(int rows, int cols,
+                             int16_t *diff_ptr, ptrdiff_t diff_stride,
+                             const uint8_t *src_ptr, ptrdiff_t src_stride,
+                             const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+
+namespace vp9 {
+
+// Fixture parameterised on the subtract implementation to exercise.
+class VP9SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> {
+ public:
+  // Calls ClearSystemState() after each test.
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+};
+
+using libvpx_test::ACMRandom;
+
+// Checks the subtract function against a scalar src - pred for every block
+// size, once with a packed stride and once with a stride twice the width.
+TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const SubtractFunc subtract = GetParam();
+
+  // FIXME(rbultje) split in its own file
+  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
+       bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
+    const int width = 4 * num_4x4_blocks_wide_lookup[bsize];
+    const int height = 4 * num_4x4_blocks_high_lookup[bsize];
+    const int wide_stride = 2 * width;
+    const int buf_len = wide_stride * height;
+    int16_t *diff = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(*diff) * buf_len));
+    uint8_t *pred = reinterpret_cast<uint8_t *>(vpx_memalign(16, buf_len));
+    uint8_t *src = reinterpret_cast<uint8_t *>(vpx_memalign(16, buf_len));
+
+    for (int n = 0; n < 100; n++) {
+      // Randomise both inputs across the full double-width buffers.
+      for (int k = 0; k < buf_len; ++k) {
+        src[k] = rnd.Rand8();
+        pred[k] = rnd.Rand8();
+      }
+
+      // Pass 1: rows packed back to back.
+      subtract(height, width, diff, width, src, width, pred, width);
+      for (int r = 0; r < height; ++r) {
+        for (int c = 0; c < width; ++c) {
+          const int pos = r * width + c;
+          EXPECT_EQ(diff[pos], (src[pos] - pred[pos])) << "r = " << r
+                                                       << ", c = " << c
+                                                       << ", bs = " << bsize;
+        }
+      }
+
+      // Pass 2: same block, stride of twice the width.
+      subtract(height, width, diff, wide_stride, src, wide_stride, pred,
+               wide_stride);
+      for (int r = 0; r < height; ++r) {
+        for (int c = 0; c < width; ++c) {
+          const int pos = r * wide_stride + c;
+          EXPECT_EQ(diff[pos], (src[pos] - pred[pos])) << "r = " << r
+                                                       << ", c = " << c
+                                                       << ", bs = " << bsize;
+        }
+      }
+    }
+    // Release this block size's buffers.
+    vpx_free(diff);
+    vpx_free(pred);
+    vpx_free(src);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
+                        ::testing::Values(vpx_subtract_block_c));
+
+#if HAVE_SSE2 && CONFIG_USE_X86INC
+INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
+                        ::testing::Values(vpx_subtract_block_sse2));
+#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest,
+                        ::testing::Values(vpx_subtract_block_neon));
+#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest,
+                        ::testing::Values(vpx_subtract_block_msa));
+#endif
+
+}  // namespace vp9
diff --git a/libs/libvpx/test/vp9_thread_test.cc b/libs/libvpx/test/vp9_thread_test.cc
new file mode 100644
index 0000000000..92e4b9688b
--- /dev/null
+++ b/libs/libvpx/test/vp9_thread_test.cc
@@ -0,0 +1,326 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/md5_helper.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "vpx_util/vpx_thread.h"
+
+namespace {
+
+using std::string;
+
+// Fixture owning one VPxWorker; the bool param picks how Run() starts it.
+class VPxWorkerThreadTest : public ::testing::TestWithParam<bool> {
+ protected:
+  virtual ~VPxWorkerThreadTest() {}
+  // Initializes |worker_| before each test.
+  virtual void SetUp() { vpx_get_worker_interface()->init(&worker_); }
+
+  // Ends |worker_| after each test.
+  virtual void TearDown() { vpx_get_worker_interface()->end(&worker_); }
+
+  // Starts |worker| with execute() when the param is true, else launch().
+  void Run(VPxWorker* worker) {
+    const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+    if (!GetParam()) {
+      winterface->launch(worker);
+      return;
+    }
+    winterface->execute(worker);
+  }
+
+  VPxWorker worker_;
+};
+
+// Worker hook: stores 5 in the int at |data|, returns the int at |return_value|.
+int ThreadHook(void* data, void* return_value) {
+  *static_cast<int*>(data) = 5;
+  return *static_cast<int*>(return_value);
+}
+
+TEST_P(VPxWorkerThreadTest, HookSuccess) {
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  // Nothing has been started yet, so this sync() should be a no-op.
+  EXPECT_NE(winterface->sync(&worker_), 0);
+
+  for (int pass = 0; pass < 2; ++pass) {
+    EXPECT_NE(winterface->reset(&worker_), 0);
+
+    int written = 0;
+    int hook_result = 1;  // non-zero: the hook reports success
+    worker_.hook = ThreadHook;
+    worker_.data1 = &written;
+    worker_.data2 = &hook_result;
+
+    Run(&worker_);
+    EXPECT_NE(winterface->sync(&worker_), 0);
+    EXPECT_FALSE(worker_.had_error);
+    EXPECT_EQ(5, written);
+    // The job was already synced, so a second sync() should be a no-op.
+    EXPECT_NE(winterface->sync(&worker_), 0);
+  }
+}
+
+TEST_P(VPxWorkerThreadTest, HookFailure) {
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  EXPECT_NE(winterface->reset(&worker_), 0);
+  int written = 0;
+  int hook_result = 0;  // zero: the hook reports failure
+  worker_.hook = ThreadHook;
+  worker_.data1 = &written;
+  worker_.data2 = &hook_result;
+
+  Run(&worker_);
+  EXPECT_FALSE(winterface->sync(&worker_));
+  EXPECT_EQ(1, worker_.had_error);
+
+  // reset() must clear the error so the worker can be launched again.
+  hook_result = 1;
+  EXPECT_NE(winterface->reset(&worker_), 0);
+  EXPECT_FALSE(worker_.had_error);
+  winterface->launch(&worker_);
+  EXPECT_NE(winterface->sync(&worker_), 0);
+  EXPECT_FALSE(worker_.had_error);
+}
+
+TEST_P(VPxWorkerThreadTest, EndWithoutSync) {
+  // Use many workers to raise the odds of exposing a race: extra work inside
+  // the hook would not help, since any race happens after the hook returns,
+  // in the main thread loop driver.
+  static const int kNumWorkers = 64;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  VPxWorker workers[kNumWorkers];
+  int hook_data[kNumWorkers];
+  int return_value[kNumWorkers];
+
+  for (int w = 0; w < kNumWorkers; ++w) {
+    VPxWorker *const worker = &workers[w];
+    winterface->init(worker);
+    return_value[w] = 1;  // return successfully from the hook
+    worker->hook = ThreadHook;
+    worker->data1 = &hook_data[w];
+    worker->data2 = &return_value[w];
+  }
+
+  for (int pass = 0; pass < 2; ++pass) {
+    for (int w = 0; w < kNumWorkers; ++w) {
+      EXPECT_NE(winterface->reset(&workers[w]), 0);
+      hook_data[w] = 0;
+    }
+
+    for (int w = 0; w < kNumWorkers; ++w) Run(&workers[w]);
+    // End every worker, last to first, without calling sync() beforehand.
+    for (int w = kNumWorkers - 1; w >= 0; --w) {
+      winterface->end(&workers[w]);
+    }
+  }
+}
+
+// vpx_set_worker_interface() must return 0 for NULL and for any interface
+// that is missing one of its six callbacks.
+TEST(VPxWorkerThreadTest, TestInterfaceAPI) {
+  EXPECT_EQ(0, vpx_set_worker_interface(NULL));
+  EXPECT_TRUE(vpx_get_worker_interface() != NULL);
+  for (int missing = 0; missing < 6; ++missing) {
+    // Start from the current interface and clear exactly one callback.
+    VPxWorkerInterface broken = *vpx_get_worker_interface();
+    if (missing == 1) broken.reset = NULL;
+    else if (missing == 2) broken.sync = NULL;
+    else if (missing == 3) broken.launch = NULL;
+    else if (missing == 4) broken.execute = NULL;
+    else if (missing == 5) broken.end = NULL;
+    else broken.init = NULL;
+    EXPECT_EQ(0, vpx_set_worker_interface(&broken));
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Multi-threaded decode tests
+
+#if CONFIG_WEBM_IO
+struct FileList {
+  const char *name;  // file handed to DecodeFile(); NULL ends a list
+  const char *expected_md5;  // md5 DecodeFile() must return for |name|
+};
+
+// Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames.
+string DecodeFile(const string& filename, int num_threads) {
+  libvpx_test::WebMVideoSource video(filename);
+  video.Init();
+
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+  cfg.threads = num_threads;
+  libvpx_test::VP9Decoder decoder(cfg, 0);
+
+  libvpx_test::MD5 md5;
+  for (video.Begin(); video.cxdata(); video.Next()) {
+    const vpx_codec_err_t status =
+        decoder.DecodeFrame(video.cxdata(), video.frame_size());
+    if (status != VPX_CODEC_OK) {
+      // Report the decoder's error text and stop at the first failure.
+      EXPECT_EQ(VPX_CODEC_OK, status) << decoder.DecodeError();
+      break;
+    }
+
+    // Fold every image the decoder currently has available into the md5.
+    libvpx_test::DxDataIterator frames = decoder.GetDxData();
+    for (const vpx_image_t *img = frames.Next(); img != NULL;
+         img = frames.Next()) {
+      md5.Add(img);
+    }
+  }
+  return string(md5.Get());
+}
+
+void DecodeFiles(const FileList files[]) {
+  for (int f = 0; files[f].name != NULL; ++f) {  // a NULL name ends the list
+    SCOPED_TRACE(files[f].name);
+    for (int threads = 1; threads <= 8; ++threads) {  // same md5 every time
+      EXPECT_EQ(files[f].expected_md5, DecodeFile(files[f].name, threads))
+          << "threads = " << threads;
+    }
+  }
+}
+
+// Trivial serialized thread worker interface implementation.
+// Note any worker that requires synchronization between other workers will
+// hang.
+namespace impl {
+// Each callback runs inline on the calling thread; no thread is created.
+void Init(VPxWorker *const worker) { memset(worker, 0, sizeof(*worker)); }
+int Reset(VPxWorker *const /*worker*/) { return 1; }
+int Sync(VPxWorker *const worker) { return worker->had_error ? 0 : 1; }
+
+// Calls the hook immediately and records a zero return as an error.
+void Execute(VPxWorker *const worker) {
+  if (!worker->hook(worker->data1, worker->data2)) worker->had_error |= 1;
+}
+// Launch() is Execute(): the hook has finished by the time it returns.
+void Launch(VPxWorker *const worker) { Execute(worker); }
+void End(VPxWorker *const /*worker*/) {}
+}  // namespace impl
+
+TEST(VPxWorkerThreadTest, TestSerialInterface) {
+  static const VPxWorkerInterface serial_interface = {
+    impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End
+  };
+  // TODO(jzern): Avoid using a file that will use the row-based thread
+  // loopfilter, with the simple serialized implementation it will hang. This is
+  // due to its expectation that rows will be run in parallel as they wait on
+  // progress in the row above before proceeding.
+  static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc";
+  static const char filename[] = "vp90-2-03-size-226x226.webm";
+  VPxWorkerInterface default_interface = *vpx_get_worker_interface();
+  // Install the serial interface; decoding must still produce expected_md5.
+  EXPECT_NE(vpx_set_worker_interface(&serial_interface), 0);
+  EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
+
+  // Restore the interface saved above and check the md5 is unchanged.
+  EXPECT_NE(vpx_set_worker_interface(&default_interface), 0);
+  EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
+}
+
+TEST(VP9DecodeMultiThreadedTest, NoTilesNonFrameParallel) {
+  // No tiles and no frame parallel mode; this exercises loop filter threading.
+  EXPECT_EQ("b35a1b707b28e82be025d960aba039bc",
+            DecodeFile("vp90-2-03-size-226x226.webm", 2));
+}
+
+TEST(VP9DecodeMultiThreadedTest, FrameParallel) {
+  static const FileList files[] = {
+    { "vp90-2-08-tile_1x2_frame_parallel.webm",
+      "68ede6abd66bae0a2edf2eb9232241b6" },
+    { "vp90-2-08-tile_1x4_frame_parallel.webm",
+      "368ebc6ebf3a5e478d85b2c3149b2848" },
+    { "vp90-2-08-tile_1x8_frame_parallel.webm",
+      "17e439da2388aff3a0f69cb22579c6c1" },
+    { NULL, NULL }  // end-of-list marker checked by DecodeFiles()
+  };
+  // Each file must decode to its md5 for every thread count DecodeFiles() uses.
+  DecodeFiles(files);
+}
+
+TEST(VP9DecodeMultiThreadedTest, FrameParallelResize) {
+  static const FileList files[] = {
+    { "vp90-2-14-resize-fp-tiles-1-16.webm",
+      "0cd5e632c326297e975f38949c31ea94" },
+    { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
+      "5c78a96a42e7f4a4f6b2edcdb791e44c" },
+    { "vp90-2-14-resize-fp-tiles-1-2.webm",
+      "e030450ae85c3277be2a418769df98e2" },
+    { "vp90-2-14-resize-fp-tiles-1-4.webm",
+      "312eed4e2b64eb7a4e7f18916606a430" },
+    { "vp90-2-14-resize-fp-tiles-16-1.webm",
+      "1755c16d8af16a9cb3fe7338d90abe52" },
+    { "vp90-2-14-resize-fp-tiles-16-2.webm",
+      "500300592d3fcb6f12fab25e48aaf4df" },
+    { "vp90-2-14-resize-fp-tiles-16-4.webm",
+      "47c48379fa6331215d91c67648e1af6e" },
+    { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm",
+      "eecf17290739bc708506fa4827665989" },
+    { "vp90-2-14-resize-fp-tiles-16-8.webm",
+      "29b6bb54e4c26b5ca85d5de5fed94e76" },
+    { "vp90-2-14-resize-fp-tiles-1-8.webm",
+      "1b6f175e08cd82cf84bb800ac6d1caa3" },
+    { "vp90-2-14-resize-fp-tiles-2-16.webm",
+      "ca3b03e4197995d8d5444ede7a6c0804" },
+    { "vp90-2-14-resize-fp-tiles-2-1.webm",
+      "99aec065369d70bbb78ccdff65afed3f" },
+    { "vp90-2-14-resize-fp-tiles-2-4.webm",
+      "22d0ebdb49b87d2920a85aea32e1afd5" },
+    { "vp90-2-14-resize-fp-tiles-2-8.webm",
+      "c2115cf051c62e0f7db1d4a783831541" },
+    { "vp90-2-14-resize-fp-tiles-4-16.webm",
+      "c690d7e1719b31367564cac0af0939cb" },
+    { "vp90-2-14-resize-fp-tiles-4-1.webm",
+      "a926020b2cc3e15ad4cc271853a0ff26" },
+    { "vp90-2-14-resize-fp-tiles-4-2.webm",
+      "42699063d9e581f1993d0cf890c2be78" },
+    { "vp90-2-14-resize-fp-tiles-4-8.webm",
+      "7f76d96036382f45121e3d5aa6f8ec52" },
+    { "vp90-2-14-resize-fp-tiles-8-16.webm",
+      "76a43fcdd7e658542913ea43216ec55d" },
+    { "vp90-2-14-resize-fp-tiles-8-1.webm",
+      "8e3fbe89486ca60a59299dea9da91378" },
+    { "vp90-2-14-resize-fp-tiles-8-2.webm",
+      "ae96f21f21b6370cc0125621b441fc52" },
+    { "vp90-2-14-resize-fp-tiles-8-4.webm",
+      "3eb4f24f10640d42218f7fd7b9fd30d4" },
+    { NULL, NULL }  // end-of-list marker checked by DecodeFiles()
+  };
+  // Each file must decode to its md5 for every thread count DecodeFiles() uses.
+  DecodeFiles(files);
+}
+
+TEST(VP9DecodeMultiThreadedTest, NonFrameParallel) {
+  static const FileList files[] = {
+    { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" },
+    { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" },
+    { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" },
+    { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" },
+    { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" },
+    { NULL, NULL }  // end-of-list marker checked by DecodeFiles()
+  };
+  // Each file must decode to its md5 for every thread count DecodeFiles() uses.
+  DecodeFiles(files);
+}
+#endif  // CONFIG_WEBM_IO
+
+INSTANTIATE_TEST_CASE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool());
+
+}  // namespace
diff --git a/libs/libvpx/test/vpx_scale_test.cc b/libs/libvpx/test/vpx_scale_test.cc
new file mode 100644
index 0000000000..ef716fc80f
--- /dev/null
+++ b/libs/libvpx/test/vpx_scale_test.cc
@@ -0,0 +1,255 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12config.h"
+
+namespace {
+
+typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf);
+typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf,
+                              YV12_BUFFER_CONFIG *dst_ybf);
+
+class VpxScaleBase {
+ public:
+  virtual ~VpxScaleBase() {
+    libvpx_test::ClearSystemState();
+  }
+  // Allocates all three images at |width| x |height| and fills img_/ref_img_.
+  void ResetImage(int width, int height) {
+    width_ = width;
+    height_ = height;
+    memset(&img_, 0, sizeof(img_));
+    ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_,
+                                             VP8BORDERINPIXELS));
+    memset(img_.buffer_alloc, kBufFiller, img_.frame_size);
+    FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
+              img_.y_stride);
+    FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
+              img_.uv_stride);
+    FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
+              img_.uv_stride);
+    // ref_img_ and cpy_img_ start out holding nothing but kBufFiller bytes.
+    memset(&ref_img_, 0, sizeof(ref_img_));
+    ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_,
+                                             VP8BORDERINPIXELS));
+    memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size);
+
+    memset(&cpy_img_, 0, sizeof(cpy_img_));
+    ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_,
+                                             VP8BORDERINPIXELS));
+    memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size);
+    ReferenceCopyFrame();  // ref_img_ = img_ pixels plus extended borders
+  }
+  // Hands img_, ref_img_ and cpy_img_ to vp8_yv12_de_alloc_frame_buffer().
+  void DeallocImage() {
+    vp8_yv12_de_alloc_frame_buffer(&img_);
+    vp8_yv12_de_alloc_frame_buffer(&ref_img_);
+    vp8_yv12_de_alloc_frame_buffer(&cpy_img_);
+  }
+
+ protected:
+  static const int kBufFiller = 123;
+  static const int kBufMax = kBufFiller - 1;
+  // Writes the pattern (x + width * y) % kBufMax, so never kBufFiller itself.
+  static void FillPlane(uint8_t *buf, int width, int height, int stride) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        buf[x + (y * stride)] = (x + (width * y)) % kBufMax;
+      }
+    }
+  }
+  // Reference border extension of a single plane whose pixels start at |buf|.
+  static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height,
+                          int width, int height, int stride, int padding) {
+    // Copy the outermost visible pixel to a distance of at least 'padding.'
+    // The buffers are allocated such that there may be excess space outside the
+    // padding. As long as the minimum amount of padding is achieved it is not
+    // necessary to fill this space as well.
+    uint8_t *left = buf - padding;
+    uint8_t *right = buf + crop_width;
+    const int right_extend = padding + (width - crop_width);
+    const int bottom_extend = padding + (height - crop_height);
+
+    // Fill the border pixels from the nearest image pixel.
+    for (int y = 0; y < crop_height; ++y) {
+      memset(left, left[padding], padding);
+      memset(right, right[-1], right_extend);
+      left += stride;
+      right += stride;
+    }
+
+    left = buf - padding;
+    uint8_t *top = left - (stride * padding);
+    // The buffer does not always extend as far as the stride.
+    // Equivalent to padding + width + padding.
+    const int extend_width = padding + crop_width + right_extend;
+
+    // The first row was already extended to the left and right. Copy it up.
+    for (int y = 0; y < padding; ++y) {
+      memcpy(top, left, extend_width);
+      top += stride;
+    }
+    // Likewise copy the last (already extended) image row down.
+    uint8_t *bottom = left + (crop_height * stride);
+    for (int y = 0; y <  bottom_extend; ++y) {
+      memcpy(bottom, left + (crop_height - 1) * stride, extend_width);
+      bottom += stride;
+    }
+  }
+  // Extends all three planes of ref_img_; u and v use half the y border.
+  void ReferenceExtendBorder() {
+    ExtendPlane(ref_img_.y_buffer,
+                ref_img_.y_crop_width, ref_img_.y_crop_height,
+                ref_img_.y_width, ref_img_.y_height,
+                ref_img_.y_stride,
+                ref_img_.border);
+    ExtendPlane(ref_img_.u_buffer,
+                ref_img_.uv_crop_width, ref_img_.uv_crop_height,
+                ref_img_.uv_width, ref_img_.uv_height,
+                ref_img_.uv_stride,
+                ref_img_.border / 2);
+    ExtendPlane(ref_img_.v_buffer,
+                ref_img_.uv_crop_width, ref_img_.uv_crop_height,
+                ref_img_.uv_width, ref_img_.uv_height,
+                ref_img_.uv_stride,
+                ref_img_.border / 2);
+  }
+
+  void ReferenceCopyFrame() {
+    // Copy img_ to ref_img_ and extend frame borders. This will be used for
+    // verifying extend_fn_ as well as copy_frame_fn_.
+    EXPECT_EQ(ref_img_.frame_size, img_.frame_size);
+    for (int y = 0; y < img_.y_crop_height; ++y) {
+      for (int x = 0; x < img_.y_crop_width; ++x) {
+        ref_img_.y_buffer[x + y * ref_img_.y_stride] =
+            img_.y_buffer[x + y * img_.y_stride];
+      }
+    }
+
+    for (int y = 0; y < img_.uv_crop_height; ++y) {
+      for (int x = 0; x < img_.uv_crop_width; ++x) {
+        ref_img_.u_buffer[x + y * ref_img_.uv_stride] =
+            img_.u_buffer[x + y * img_.uv_stride];
+        ref_img_.v_buffer[x + y * ref_img_.uv_stride] =
+            img_.v_buffer[x + y * img_.uv_stride];
+      }
+    }
+
+    ReferenceExtendBorder();
+  }
+  // Expects |actual| to equal ref_img_ over all frame_size allocated bytes.
+  void CompareImages(const YV12_BUFFER_CONFIG actual) {
+    EXPECT_EQ(ref_img_.frame_size, actual.frame_size);
+    EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc,
+                        ref_img_.frame_size));
+  }
+
+  YV12_BUFFER_CONFIG img_;  // pattern image written by FillPlane()
+  YV12_BUFFER_CONFIG ref_img_;  // expected result built by the reference code
+  YV12_BUFFER_CONFIG cpy_img_;  // filled with kBufFiller only by ResetImage()
+  int width_;
+  int height_;
+};
+
+// Tests an ExtendFrameBorderFunc against VpxScaleBase's reference code.
+class ExtendBorderTest
+    : public VpxScaleBase,
+      public ::testing::TestWithParam<ExtendFrameBorderFunc> {
+ public:
+  virtual ~ExtendBorderTest() {}
+
+ protected:
+  virtual void SetUp() { extend_fn_ = GetParam(); }
+  // Runs the function under test on img_.
+  void ExtendBorder() { ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); }
+
+  // For every width/height pair, extends img_ and compares it with ref_img_.
+  void RunTest() {
+#if ARCH_ARM
+    // Some arm devices OOM when trying to allocate the largest buffers.
+    static const int kNumSizesToTest = 6;
+#else
+    static const int kNumSizesToTest = 7;
+#endif
+    static const int kSizesToTest[] = {1, 15, 33, 145, 512, 1025, 16383};
+    for (int h = 0; h < kNumSizesToTest; ++h) {
+      for (int w = 0; w < kNumSizesToTest; ++w) {
+        // ResetImage()'s ASSERTs only return from ResetImage(); stop here too
+        // rather than extend and compare buffers that were never allocated.
+        ASSERT_NO_FATAL_FAILURE(ResetImage(kSizesToTest[w], kSizesToTest[h]));
+        ExtendBorder();
+        ReferenceExtendBorder();
+        CompareImages(img_);
+        DeallocImage();
+      }
+    }
+  }
+
+  ExtendFrameBorderFunc extend_fn_;
+};
+
+TEST_P(ExtendBorderTest, ExtendBorder) {
+  ASSERT_NO_FATAL_FAILURE(RunTest());
+}
+
+INSTANTIATE_TEST_CASE_P(C, ExtendBorderTest,
+                        ::testing::Values(vp8_yv12_extend_frame_borders_c));
+
+// Tests a CopyFrameFunc: cpy_img_ must end up identical to ref_img_.
+class CopyFrameTest
+    : public VpxScaleBase,
+      public ::testing::TestWithParam<CopyFrameFunc> {
+ public:
+  virtual ~CopyFrameTest() {}
+
+ protected:
+  virtual void SetUp() { copy_frame_fn_ = GetParam(); }
+  // Runs the function under test, copying img_ into cpy_img_.
+  void CopyFrame() {
+    ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_));
+  }
+  // For every width/height pair, copies img_ and compares with ref_img_.
+  void RunTest() {
+#if ARCH_ARM
+    // Some arm devices OOM when trying to allocate the largest buffers.
+    static const int kNumSizesToTest = 6;
+#else
+    static const int kNumSizesToTest = 7;
+#endif
+    static const int kSizesToTest[] = {1, 15, 33, 145, 512, 1025, 16383};
+    for (int h = 0; h < kNumSizesToTest; ++h) {
+      for (int w = 0; w < kNumSizesToTest; ++w) {
+        // ResetImage()'s ASSERTs only return from ResetImage(); stop here too
+        // rather than copy and compare buffers that were never allocated.
+        ASSERT_NO_FATAL_FAILURE(ResetImage(kSizesToTest[w], kSizesToTest[h]));
+        ReferenceCopyFrame();
+        CopyFrame();
+        CompareImages(cpy_img_);
+        DeallocImage();
+      }
+    }
+  }
+  CopyFrameFunc copy_frame_fn_;
+};
+
+TEST_P(CopyFrameTest, CopyFrame) {
+  ASSERT_NO_FATAL_FAILURE(RunTest());
+}
+
+INSTANTIATE_TEST_CASE_P(C, CopyFrameTest,
+                        ::testing::Values(vp8_yv12_copy_frame_c));
+}  // namespace
diff --git a/libs/libvpx/test/vpx_temporal_svc_encoder.sh b/libs/libvpx/test/vpx_temporal_svc_encoder.sh
new file mode 100755
index 0000000000..fcc8cb4ff4
--- /dev/null
+++ b/libs/libvpx/test/vpx_temporal_svc_encoder.sh
@@ -0,0 +1,290 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx vpx_temporal_svc_encoder example. To add new
+##  tests to this file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to vpx_tsvc_encoder_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required; both messages go through elog.
+vpx_tsvc_encoder_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ "$(vpx_config_option_enabled CONFIG_TEMPORAL_DENOISING)" != "yes" ]; then
+    elog "Warning: Temporal denoising is disabled! Spatial denoising will be " \
+      "used instead, which is probably not what you want for this test."
+  fi
+}
+
+# Runs vpx_temporal_svc_encoder using the codec specified by $1 and output file
+# name by $2. Additional positional parameters are passed directly to
+# vpx_temporal_svc_encoder.
+vpx_tsvc_encoder() {
+  local encoder="${LIBVPX_BIN_PATH}/vpx_temporal_svc_encoder"
+  local codec="$1"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/$2"
+  # Fixed settings handed to the encoder after the frame dimensions.
+  local timebase_num="1"
+  local timebase_den="1000"
+  local speed="6"
+  local frame_drop_thresh="30"
+  encoder="${encoder}${VPX_TEST_EXE_SUFFIX}"
+  # Drop codec and output name; whatever is left is forwarded via "$@".
+  shift 2
+
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \
+      "${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
+      "${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \
+      "$@" \
+      ${devnull}
+}
+
+# Confirms that all expected output files exist. vpx_temporal_svc_encoder
+# appends _<stream number>.ivf to the name it is given; $1 is that name and $2
+# the expected number of files. Counting is done in the shell because seq is
+# not POSIX and some implementations count down for "seq 0 -1".
+files_exist() {
+  local file_name="${VPX_TEST_OUTPUT_DIR}/$1"
+  local stream_num=0
+  while [ "${stream_num}" -lt "$2" ]; do
+    [ -e "${file_name}_${stream_num}.ivf" ] || return 1
+    stream_num=$((stream_num + 1))
+  done
+}
+
+# Run vpx_temporal_svc_encoder in all supported modes for vp8 and vp9.
+
+vpx_tsvc_encoder_vp8_mode_0() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp8 "${FUNCNAME}" 0 200 || return 1
+    # Mode 0 produces 1 stream
+    files_exist "${FUNCNAME}" 1 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp8_mode_1() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp8 "${FUNCNAME}" 1 200 400 || return 1
+    # Mode 1 produces 2 streams
+    files_exist "${FUNCNAME}" 2 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp8_mode_2() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp8 "${FUNCNAME}" 2 200 400 || return 1
+    # Mode 2 produces 2 streams
+    files_exist "${FUNCNAME}" 2 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp8_mode_3() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp8 "${FUNCNAME}" 3 200 400 600 || return 1
+    # Mode 3 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp8_mode_4() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp8 "${FUNCNAME}" 4 200 400 600 || return 1
+    # Mode 4 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp8_mode_5() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp8 "${FUNCNAME}" 5 200 400 600 || return 1
+    # Mode 5 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp8_mode_6() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp8 "${FUNCNAME}" 6 200 400 600 || return 1
+    # Mode 6 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp8_mode_7() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp8 "${FUNCNAME}" 7 200 400 600 800 1000 || return 1
+    # Mode 7 produces 5 streams
+    files_exist "${FUNCNAME}" 5 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp8_mode_8() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp8 "${FUNCNAME}" 8 200 400 || return 1
+    # Mode 8 produces 2 streams
+    files_exist "${FUNCNAME}" 2 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp8_mode_9() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp8 "${FUNCNAME}" 9 200 400 600 || return 1
+    # Mode 9 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp8_mode_10() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp8 "${FUNCNAME}" 10 200 400 600 || return 1
+    # Mode 10 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp8_mode_11() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp8 "${FUNCNAME}" 11 200 400 600 || return 1
+    # Mode 11 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp9_mode_0() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp9 "${FUNCNAME}" 0 200 || return 1
+    # Mode 0 produces 1 stream
+    files_exist "${FUNCNAME}" 1 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp9_mode_1() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp9 "${FUNCNAME}" 1 200 400 || return 1
+    # Mode 1 produces 2 streams
+    files_exist "${FUNCNAME}" 2 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp9_mode_2() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp9 "${FUNCNAME}" 2 200 400 || return 1
+    # Mode 2 produces 2 streams
+    files_exist "${FUNCNAME}" 2 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp9_mode_3() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp9 "${FUNCNAME}" 3 200 400 600 || return 1
+    # Mode 3 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp9_mode_4() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp9 "${FUNCNAME}" 4 200 400 600 || return 1
+    # Mode 4 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp9_mode_5() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp9 "${FUNCNAME}" 5 200 400 600 || return 1
+    # Mode 5 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp9_mode_6() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp9 "${FUNCNAME}" 6 200 400 600 || return 1
+    # Mode 6 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp9_mode_7() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp9 "${FUNCNAME}" 7 200 400 600 800 1000 || return 1
+    # Mode 7 produces 5 streams
+    files_exist "${FUNCNAME}" 5 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp9_mode_8() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp9 "${FUNCNAME}" 8 200 400 || return 1
+    # Mode 8 produces 2 streams
+    files_exist "${FUNCNAME}" 2 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp9_mode_9() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp9 "${FUNCNAME}" 9 200 400 600 || return 1
+    # Mode 9 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp9_mode_10() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp9 "${FUNCNAME}" 10 200 400 600 || return 1
+    # Mode 10 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_vp9_mode_11() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_tsvc_encoder vp9 "${FUNCNAME}" 11 200 400 600 || return 1
+    # Mode 11 produces 3 streams
+    files_exist "${FUNCNAME}" 3 || return 1
+  fi
+}
+
+vpx_tsvc_encoder_tests="vpx_tsvc_encoder_vp8_mode_0
+                        vpx_tsvc_encoder_vp8_mode_1
+                        vpx_tsvc_encoder_vp8_mode_2
+                        vpx_tsvc_encoder_vp8_mode_3
+                        vpx_tsvc_encoder_vp8_mode_4
+                        vpx_tsvc_encoder_vp8_mode_5
+                        vpx_tsvc_encoder_vp8_mode_6
+                        vpx_tsvc_encoder_vp8_mode_7
+                        vpx_tsvc_encoder_vp8_mode_8
+                        vpx_tsvc_encoder_vp8_mode_9
+                        vpx_tsvc_encoder_vp8_mode_10
+                        vpx_tsvc_encoder_vp8_mode_11
+                        vpx_tsvc_encoder_vp9_mode_0
+                        vpx_tsvc_encoder_vp9_mode_1
+                        vpx_tsvc_encoder_vp9_mode_2
+                        vpx_tsvc_encoder_vp9_mode_3
+                        vpx_tsvc_encoder_vp9_mode_4
+                        vpx_tsvc_encoder_vp9_mode_5
+                        vpx_tsvc_encoder_vp9_mode_6
+                        vpx_tsvc_encoder_vp9_mode_7
+                        vpx_tsvc_encoder_vp9_mode_8
+                        vpx_tsvc_encoder_vp9_mode_9
+                        vpx_tsvc_encoder_vp9_mode_10
+                        vpx_tsvc_encoder_vp9_mode_11"
+
+run_tests vpx_tsvc_encoder_verify_environment "${vpx_tsvc_encoder_tests}"
diff --git a/libs/libvpx/test/vpxdec.sh b/libs/libvpx/test/vpxdec.sh
new file mode 100755
index 0000000000..de51c8004e
--- /dev/null
+++ b/libs/libvpx/test/vpxdec.sh
@@ -0,0 +1,116 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests vpxdec. To add new tests to this file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to vpxdec_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: Make sure input is available.
+vpxdec_verify_environment() {
+  if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ] || \
+    [ ! -e "${VP9_FPM_WEBM_FILE}" ] || \
+    [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] ; then
+    elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ -z "$(vpx_tool_path vpxdec)" ]; then
+    elog "vpxdec not found. It must exist in LIBVPX_BIN_PATH or its parent."
+    return 1
+  fi
+}
+
+# Wrapper function for running vpxdec with pipe input. Requires that
+# LIBVPX_BIN_PATH points to the directory containing vpxdec. $1 is used as the
+# input file path and shifted away. All remaining parameters are passed through
+# to vpxdec.
+vpxdec_pipe() {
+  local readonly decoder="$(vpx_tool_path vpxdec)"
+  local readonly input="$1"
+  shift
+  cat "${input}" | eval "${VPX_TEST_PREFIX}" "${decoder}" - "$@" ${devnull}
+}
+
+# Wrapper function for running vpxdec. Requires that LIBVPX_BIN_PATH points to
+# the directory containing vpxdec. $1 is used as the input file path and
+# shifted away. All remaining parameters are passed through to vpxdec.
+vpxdec() {
+  local readonly decoder="$(vpx_tool_path vpxdec)"
+  local readonly input="$1"
+  shift
+  eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
+}
+
+vpxdec_can_decode_vp8() {
+  if [ "$(vp8_decode_available)" = "yes" ]; then
+    echo yes
+  fi
+}
+
+vpxdec_can_decode_vp9() {
+  if [ "$(vp9_decode_available)" = "yes" ]; then
+    echo yes
+  fi
+}
+
+vpxdec_vp8_ivf() {
+  if [ "$(vpxdec_can_decode_vp8)" = "yes" ]; then
+    vpxdec "${VP8_IVF_FILE}" --summary --noblit
+  fi
+}
+
+vpxdec_vp8_ivf_pipe_input() {
+  if [ "$(vpxdec_can_decode_vp8)" = "yes" ]; then
+    vpxdec_pipe "${VP8_IVF_FILE}" --summary --noblit
+  fi
+}
+
+vpxdec_vp9_webm() {
+  if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    vpxdec "${VP9_WEBM_FILE}" --summary --noblit
+  fi
+}
+
+vpxdec_vp9_webm_frame_parallel() {
+  if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    for threads in 2 3 4 5 6 7 8; do  # one decode per thread count
+      vpxdec "${VP9_FPM_WEBM_FILE}" --summary --noblit --threads=$threads \
+        --frame-parallel || return 1  # fail fast; later runs must not mask it
+    done
+  fi
+}
+
+vpxdec_vp9_webm_less_than_50_frames() {
+  # ensure that reaching eof in webm_guess_framerate doesn't result in invalid
+  # frames in actual webm_read_frame calls.
+  if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly decoder="$(vpx_tool_path vpxdec)"
+    local readonly expected=10
+    local readonly num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \
+      "${VP9_LT_50_FRAMES_WEBM_FILE}" --summary --noblit 2>&1 \
+      | awk '/^[0-9]+ decoded frames/ { print $1 }')
+    if [ "$num_frames" != "$expected" ]; then  # string test: empty count fails
+      elog "Output frames ($num_frames) != expected ($expected)"
+      return 1
+    fi
+  fi
+}
+
+vpxdec_tests="vpxdec_vp8_ivf
+              vpxdec_vp8_ivf_pipe_input
+              vpxdec_vp9_webm
+              vpxdec_vp9_webm_frame_parallel
+              vpxdec_vp9_webm_less_than_50_frames"
+
+run_tests vpxdec_verify_environment "${vpxdec_tests}"
diff --git a/libs/libvpx/test/vpxenc.sh b/libs/libvpx/test/vpxenc.sh
new file mode 100755
index 0000000000..e8994992ae
--- /dev/null
+++ b/libs/libvpx/test/vpxenc.sh
@@ -0,0 +1,429 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests vpxenc using hantro_collage_w352h288.yuv as input. To add
+##  new tests to this file, do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to vpxenc_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+readonly TEST_FRAMES=10
+
+# Environment check: Make sure input is available.
+vpxenc_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
+    if [ ! -e "${Y4M_NOSQ_PAR_INPUT}" ]; then
+      elog "The file ${Y4M_NOSQ_PAR_INPUT##*/} must exist in"
+      elog "LIBVPX_TEST_DATA_PATH."
+      return 1
+    fi
+  fi
+  if [ -z "$(vpx_tool_path vpxenc)" ]; then
+    elog "vpxenc not found. It must exist in LIBVPX_BIN_PATH or its parent."
+    return 1
+  fi
+}
+
+vpxenc_can_encode_vp8() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    echo yes
+  fi
+}
+
+vpxenc_can_encode_vp9() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    echo yes
+  fi
+}
+
+# Echo vpxenc command line parameters allowing use of
+# hantro_collage_w352h288.yuv as input.
+yuv_input_hantro_collage() {
+  echo ""${YUV_RAW_INPUT}"
+       --width="${YUV_RAW_INPUT_WIDTH}"
+       --height="${YUV_RAW_INPUT_HEIGHT}""
+}
+
+y4m_input_non_square_par() {
+  echo ""${Y4M_NOSQ_PAR_INPUT}""
+}
+
+y4m_input_720p() {
+  echo ""${Y4M_720P_INPUT}""
+}
+
+# Echo default vpxenc real time encoding params. $1 is the codec, which defaults
+# to vp8 if unspecified.
+vpxenc_rt_params() {
+  local readonly codec="${1:-vp8}"
+  echo "--codec=${codec}
+    --buf-initial-sz=500
+    --buf-optimal-sz=600
+    --buf-sz=1000
+    --cpu-used=-6
+    --end-usage=cbr
+    --error-resilient=1
+    --kf-max-dist=90000
+    --lag-in-frames=0
+    --max-intra-rate=300
+    --max-q=56
+    --min-q=2
+    --noise-sensitivity=0
+    --overshoot-pct=50
+    --passes=1
+    --profile=0
+    --resize-allowed=0
+    --rt
+    --static-thresh=0
+    --undershoot-pct=50"
+}
+
+# Wrapper function for running vpxenc with pipe input. Requires that
+# LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the
+# input file path and shifted away. All remaining parameters are passed through
+# to vpxenc.
+vpxenc_pipe() {
+  local readonly encoder="$(vpx_tool_path vpxenc)"
+  local readonly input="$1"
+  shift
+  cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - \
+    --test-decode=fatal \
+    "$@" ${devnull}
+}
+
+# Wrapper function for running vpxenc. Requires that LIBVPX_BIN_PATH points to
+# the directory containing vpxenc. $1 is used as the input file path and
+# shifted away. All remaining parameters are passed through to vpxenc.
+vpxenc() {
+  local readonly encoder="$(vpx_tool_path vpxenc)"
+  local readonly input="$1"
+  shift
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "${input}" \
+    --test-decode=fatal \
+    "$@" ${devnull}
+}
+
+vpxenc_vp8_ivf() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
+      --limit="${TEST_FRAMES}" \
+      --ivf \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp8_webm() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
+      --limit="${TEST_FRAMES}" \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp8_webm_rt() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      $(vpxenc_rt_params vp8) \
+      --output="${output}"
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp8_webm_2pass() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_2pass.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
+      --limit="${TEST_FRAMES}" \
+      --output="${output}" \
+      --passes=2
+    # Own output name: vpxenc_vp8_webm already leaves a vp8.webm behind.
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp8_webm_lag10_frames20() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly lag_total_frames=20
+    local readonly lag_frames=10
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp8 \
+      --limit="${lag_total_frames}" \
+      --lag-in-frames="${lag_frames}" \
+      --output="${output}" \
+      --auto-alt-ref=1 \
+      --passes=2
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp8_ivf_piped_input() {
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf"
+    vpxenc_pipe $(yuv_input_hantro_collage) \
+      --codec=vp8 \
+      --limit="${TEST_FRAMES}" \
+      --ivf \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_ivf() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
+      --limit="${TEST_FRAMES}" \
+      --ivf \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_webm() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
+      --limit="${TEST_FRAMES}" \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_webm_rt() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      $(vpxenc_rt_params vp9) \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_webm_rt_multithread_tiled() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm"
+    local readonly tilethread_min=2
+    local readonly tilethread_max=4
+    local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})"
+    local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
+    # Encode every threads x tile-columns pair; stop at the first failure.
+    for threads in ${num_threads}; do
+      for tile_cols in ${num_tile_cols}; do
+        vpxenc $(y4m_input_720p) \
+          $(vpxenc_rt_params vp9) \
+          --threads=${threads} \
+          --tile-columns=${tile_cols} \
+          --output="${output}" || return 1
+      done
+    done
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+
+    rm "${output}"
+  fi
+}
+
+vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm"
+    local readonly tilethread_min=2
+    local readonly tilethread_max=4
+    local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})"
+    local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
+    # Encode every threads x tile-columns pair; stop at the first failure.
+    for threads in ${num_threads}; do
+      for tile_cols in ${num_tile_cols}; do
+        vpxenc $(y4m_input_720p) \
+          $(vpxenc_rt_params vp9) \
+          --threads=${threads} \
+          --tile-columns=${tile_cols} \
+          --frame-parallel=1 \
+          --output="${output}" || return 1
+      done
+    done
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+
+    rm "${output}"
+  fi
+}
+
+vpxenc_vp9_webm_2pass() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_2pass.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
+      --limit="${TEST_FRAMES}" \
+      --output="${output}" \
+      --passes=2
+    # Own output name: vpxenc_vp9_webm already leaves a vp9.webm behind.
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_ivf_lossless() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
+      --limit="${TEST_FRAMES}" \
+      --ivf \
+      --output="${output}" \
+      --lossless=1
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_ivf_minq0_maxq0() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
+      --limit="${TEST_FRAMES}" \
+      --ivf \
+      --output="${output}" \
+      --min-q=0 \
+      --max-q=0
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_vp9_webm_lag10_frames20() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly lag_total_frames=20
+    local readonly lag_frames=10
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm"
+    vpxenc $(yuv_input_hantro_collage) \
+      --codec=vp9 \
+      --limit="${lag_total_frames}" \
+      --lag-in-frames="${lag_frames}" \
+      --output="${output}" \
+      --passes=2 \
+      --auto-alt-ref=1
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+# TODO(fgalligan): Test that DisplayWidth is different than video width.
+vpxenc_vp9_webm_non_square_par() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm"
+    vpxenc $(y4m_input_non_square_par) \
+      --codec=vp9 \
+      --limit="${TEST_FRAMES}" \
+      --output="${output}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
+vpxenc_tests="vpxenc_vp8_ivf
+              vpxenc_vp8_webm
+              vpxenc_vp8_webm_rt
+              vpxenc_vp8_webm_2pass
+              vpxenc_vp8_webm_lag10_frames20
+              vpxenc_vp8_ivf_piped_input
+              vpxenc_vp9_ivf
+              vpxenc_vp9_webm
+              vpxenc_vp9_webm_rt
+              vpxenc_vp9_webm_rt_multithread_tiled
+              vpxenc_vp9_webm_rt_multithread_tiled_frameparallel
+              vpxenc_vp9_webm_2pass
+              vpxenc_vp9_ivf_lossless
+              vpxenc_vp9_ivf_minq0_maxq0
+              vpxenc_vp9_webm_lag10_frames20
+              vpxenc_vp9_webm_non_square_par"
+
+run_tests vpxenc_verify_environment "${vpxenc_tests}"
diff --git a/libs/libvpx/test/webm_video_source.h b/libs/libvpx/test/webm_video_source.h
new file mode 100644
index 0000000000..650bc52dce
--- /dev/null
+++ b/libs/libvpx/test/webm_video_source.h
@@ -0,0 +1,102 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_WEBM_VIDEO_SOURCE_H_
+#define TEST_WEBM_VIDEO_SOURCE_H_
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <new>
+#include <string>
+#include "../tools_common.h"
+#include "../webmdec.h"
+#include "test/video_source.h"
+
+namespace libvpx_test {
+
+// This class extends VideoSource to allow parsing of WebM files,
+// so that we can do actual file decodes.
+class WebMVideoSource : public CompressedVideoSource {
+ public:
+  explicit WebMVideoSource(const std::string &file_name)
+      : file_name_(file_name),
+        vpx_ctx_(new VpxInputContext()),
+        webm_ctx_(new WebmInputContext()),
+        buf_(NULL),
+        buf_sz_(0),
+        frame_(0),
+        end_of_file_(false) {
+  }
+
+  virtual ~WebMVideoSource() {
+    if (vpx_ctx_->file != NULL)
+      fclose(vpx_ctx_->file);
+    webm_free(webm_ctx_);
+    delete vpx_ctx_;
+    delete webm_ctx_;
+  }
+
+  virtual void Init() {
+  }
+
+  virtual void Begin() {
+    vpx_ctx_->file = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(vpx_ctx_->file != NULL) << "Input file open failed. Filename: "
+        << file_name_;
+
+    ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM";
+
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  void FillFrame() {
+    ASSERT_TRUE(vpx_ctx_->file != NULL);
+    const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_, &buf_sz_);
+    ASSERT_GE(status, 0) << "webm_read_frame failed";
+    if (status == 1) {
+      end_of_file_ = true;
+    }
+  }
+
+  void SeekToNextKeyFrame() {
+    ASSERT_TRUE(vpx_ctx_->file != NULL);
+    do {
+      const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_, &buf_sz_);
+      ASSERT_GE(status, 0) << "webm_read_frame failed";
+      ++frame_;
+      if (status == 1) {
+        end_of_file_ = true;
+      }
+    } while (!webm_ctx_->is_key_frame && !end_of_file_);
+  }
+
+  virtual const uint8_t *cxdata() const {
+    return end_of_file_ ? NULL : buf_;
+  }
+  virtual size_t frame_size() const { return buf_sz_; }
+  virtual unsigned int frame_number() const { return frame_; }
+
+ protected:
+  std::string file_name_;
+  VpxInputContext *vpx_ctx_;
+  WebmInputContext *webm_ctx_;
+  uint8_t *buf_;
+  size_t buf_sz_;
+  unsigned int frame_;
+  bool end_of_file_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_WEBM_VIDEO_SOURCE_H_
diff --git a/libs/libvpx/test/y4m_test.cc b/libs/libvpx/test/y4m_test.cc
new file mode 100644
index 0000000000..a5553292c0
--- /dev/null
+++ b/libs/libvpx/test/y4m_test.cc
@@ -0,0 +1,195 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./y4menc.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+using std::string;
+
+static const unsigned int kWidth  = 160;
+static const unsigned int kHeight = 90;
+static const unsigned int kFrames = 10;
+
+struct Y4mTestParam {
+  const char *filename;
+  unsigned int bit_depth;
+  vpx_img_fmt format;
+  const char *md5raw;
+};
+
+const Y4mTestParam kY4mTestVectors[] = {
+  {"park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420,
+    "e5406275b9fc6bb3436c31d4a05c1cab"},
+  {"park_joy_90p_8_422.y4m", 8, VPX_IMG_FMT_I422,
+    "284a47a47133b12884ec3a14e959a0b6"},
+  {"park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444,
+    "90517ff33843d85de712fd4fe60dbed0"},
+  {"park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016,
+    "63f21f9f717d8b8631bd2288ee87137b"},
+  {"park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216,
+    "48ab51fb540aed07f7ff5af130c9b605"},
+  {"park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416,
+    "067bfd75aa85ff9bae91fa3e0edd1e3e"},
+  {"park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016,
+    "9e6d8f6508c6e55625f6b697bc461cef"},
+  {"park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216,
+    "b239c6b301c0b835485be349ca83a7e3"},
+  {"park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416,
+    "5a6481a550821dab6d0192f5c63845e9"},
+};
+
+static void write_image_file(const vpx_image_t *img, FILE *file) {  // raw dump
+  int plane, y;
+  for (plane = 0; plane < 3; ++plane) {  // planes 0, 1, 2 written in order
+    const unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+    const int h = (plane ? (img->d_h + img->y_chroma_shift) >>
+                   img->y_chroma_shift : img->d_h);  // planes 1-2: shifted rows
+    const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
+                   img->x_chroma_shift : img->d_w);  // planes 1-2: shifted cols
+    for (y = 0; y < h; ++y) {
+      fwrite(buf, bytes_per_sample, w, file);  // w samples per row; result unchecked
+      buf += stride;  // step by the plane stride, not by the bytes written
+    }
+  }
+}
+
+class Y4mVideoSourceTest
+    : public ::testing::TestWithParam<Y4mTestParam>,
+      public ::libvpx_test::Y4mVideoSource {
+ protected:
+  Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {}
+
+  virtual ~Y4mVideoSourceTest() {
+    CloseSource();
+  }
+
+  virtual void Init(const std::string &file_name, int limit) {
+    file_name_ = file_name;
+    start_ = 0;
+    limit_ = limit;
+    frame_ = 0;
+    Begin();
+  }
+
+  // Checks y4m header information
+  void HeaderChecks(unsigned int bit_depth, vpx_img_fmt_t fmt) {
+    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_EQ(y4m_.pic_w, (int)kWidth);
+    ASSERT_EQ(y4m_.pic_h, (int)kHeight);
+    ASSERT_EQ(img()->d_w, kWidth);
+    ASSERT_EQ(img()->d_h, kHeight);
+    ASSERT_EQ(y4m_.bit_depth, bit_depth);
+    ASSERT_EQ(y4m_.vpx_fmt, fmt);
+    if (fmt == VPX_IMG_FMT_I420 || fmt == VPX_IMG_FMT_I42016) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3 / 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 1U);
+    }
+    if (fmt == VPX_IMG_FMT_I422 || fmt == VPX_IMG_FMT_I42216) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    }
+    if (fmt == VPX_IMG_FMT_I444 || fmt == VPX_IMG_FMT_I44416) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3);
+      ASSERT_EQ(img()->x_chroma_shift, 0U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    }
+  }
+
+  // Checks MD5 of the raw frame data
+  void Md5Check(const string &expected_md5) {
+    ASSERT_TRUE(input_file_ != NULL);
+    libvpx_test::MD5 md5;
+    for (unsigned int i = start_; i < limit_; i++) {
+      md5.Add(img());
+      Next();
+    }
+    ASSERT_EQ(string(md5.Get()), expected_md5);
+  }
+};
+
+TEST_P(Y4mVideoSourceTest, SourceTest) {
+  const Y4mTestParam t = GetParam();
+  Init(t.filename, kFrames);
+  HeaderChecks(t.bit_depth, t.format);
+  Md5Check(t.md5raw);
+}
+
+INSTANTIATE_TEST_CASE_P(C, Y4mVideoSourceTest,
+                        ::testing::ValuesIn(kY4mTestVectors));
+
+// Round-trip fixture: writes the source out as y4m, then reads it back.
+class Y4mVideoWriteTest : public Y4mVideoSourceTest {
+ protected:
+  // tmpfile_ starts NULL so the destructor's delete is safe if Init() aborts.
+  Y4mVideoWriteTest() : tmpfile_(NULL) {}
+  virtual ~Y4mVideoWriteTest() {
+    delete tmpfile_;
+    input_file_ = NULL;  // keeps the base CloseSource() from calling fclose on it
+  }
+
+  void ReplaceInputFile(FILE *input_file) {
+    CloseSource();
+    frame_ = 0;
+    input_file_ = input_file;
+    rewind(input_file_);
+    ReadSourceToStart();
+  }
+
+  // Writes out a y4m file and then reads it back
+  void WriteY4mAndReadBack() {
+    ASSERT_TRUE(input_file_ != NULL);
+    char buf[Y4M_BUFFER_SIZE] = {0};
+    const struct VpxRational framerate = {y4m_.fps_n, y4m_.fps_d};
+    tmpfile_ = new libvpx_test::TempOutFile;
+    ASSERT_TRUE(tmpfile_->file() != NULL);
+    y4m_write_file_header(buf, sizeof(buf),
+                          kWidth, kHeight,
+                          &framerate, y4m_.vpx_fmt,
+                          y4m_.bit_depth);
+    fputs(buf, tmpfile_->file());
+    for (unsigned int i = start_; i < limit_; i++) {
+      y4m_write_frame_header(buf, sizeof(buf));
+      fputs(buf, tmpfile_->file());
+      write_image_file(img(), tmpfile_->file());
+      Next();
+    }
+    ReplaceInputFile(tmpfile_->file());  // from here on, reads use the temp file
+  }
+
+  virtual void Init(const std::string &file_name, int limit) {
+    Y4mVideoSourceTest::Init(file_name, limit);
+    WriteY4mAndReadBack();
+  }
+  libvpx_test::TempOutFile *tmpfile_;
+};
+
+TEST_P(Y4mVideoWriteTest, WriteTest) {
+  const Y4mTestParam t = GetParam();
+  Init(t.filename, kFrames);
+  HeaderChecks(t.bit_depth, t.format);
+  Md5Check(t.md5raw);
+}
+
+INSTANTIATE_TEST_CASE_P(C, Y4mVideoWriteTest,
+                        ::testing::ValuesIn(kY4mTestVectors));
+}  // namespace
diff --git a/libs/libvpx/test/y4m_video_source.h b/libs/libvpx/test/y4m_video_source.h
new file mode 100644
index 0000000000..03d9388db8
--- /dev/null
+++ b/libs/libvpx/test/y4m_video_source.h
@@ -0,0 +1,130 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_Y4M_VIDEO_SOURCE_H_
+#define TEST_Y4M_VIDEO_SOURCE_H_
+#include <algorithm>
+#include <string>
+
+#include "test/video_source.h"
+#include "./y4minput.h"
+
+namespace libvpx_test {
+
+// This class extends VideoSource to allow parsing of y4m files
+// so that we can do actual file encodes.
+class Y4mVideoSource : public VideoSource {
+ public:
+  Y4mVideoSource(const std::string &file_name,
+                  unsigned int start, int limit)
+      : file_name_(file_name),
+        input_file_(NULL),
+        img_(new vpx_image_t()),
+        start_(start),
+        limit_(limit),
+        frame_(0),
+        framerate_numerator_(0),
+        framerate_denominator_(0),
+        y4m_() {
+  }
+
+  virtual ~Y4mVideoSource() {
+    vpx_img_free(img_.get());
+    CloseSource();
+  }
+
+  virtual void OpenSource() {
+    CloseSource();
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
+                                     << file_name_;
+  }
+
+  virtual void ReadSourceToStart() {
+    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0));
+    framerate_numerator_ = y4m_.fps_n;
+    framerate_denominator_ = y4m_.fps_d;
+    frame_ = 0;
+    for (unsigned int i = 0; i < start_; i++) {
+      Next();
+    }
+    FillFrame();
+  }
+
+  virtual void Begin() {
+    OpenSource();
+    ReadSourceToStart();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual vpx_image_t *img() const {
+    return (frame_ < limit_) ? img_.get() : NULL;
+  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual vpx_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual vpx_rational_t timebase() const {
+    const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  virtual void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
+    // Read a frame from input_file.
+    y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
+  }
+
+  // Swap buffers with another y4m source. This allows reading a new frame
+  // while keeping the old frame around. A whole Y4mVideoSource is required and
+  // not just a vpx_image_t because of how the y4m reader manipulates
+  // vpx_image_t internals.
+  void SwapBuffers(Y4mVideoSource *other) {
+    std::swap(other->y4m_.dst_buf, y4m_.dst_buf);
+    vpx_image_t *tmp;
+    tmp = other->img_.release();
+    other->img_.reset(img_.release());
+    img_.reset(tmp);
+  }
+
+ protected:
+  void CloseSource() {
+    y4m_input_close(&y4m_);
+    y4m_ = y4m_input();
+    if (input_file_ != NULL) {
+      fclose(input_file_);
+      input_file_ = NULL;
+    }
+  }
+
+  std::string file_name_;
+  FILE *input_file_;
+  testing::internal::scoped_ptr<vpx_image_t> img_;
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+  y4m_input y4m_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_Y4M_VIDEO_SOURCE_H_
diff --git a/libs/libvpx/test/yuv_video_source.h b/libs/libvpx/test/yuv_video_source.h
new file mode 100644
index 0000000000..3c852b2426
--- /dev/null
+++ b/libs/libvpx/test/yuv_video_source.h
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_YUV_VIDEO_SOURCE_H_
+#define TEST_YUV_VIDEO_SOURCE_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "test/video_source.h"
+#include "vpx/vpx_image.h"
+
+namespace libvpx_test {
+
+// This class extends VideoSource to allow parsing of raw YUV
+// formats of various color sampling and bit-depths so that we can
+// do actual file encodes.
+class YUVVideoSource : public VideoSource {
+ public:
+  YUVVideoSource(const std::string &file_name, vpx_img_fmt format,
+                 unsigned int width, unsigned int height,
+                 int rate_numerator, int rate_denominator,
+                 unsigned int start, int limit)
+      : file_name_(file_name),
+        input_file_(NULL),
+        img_(NULL),
+        raw_size_(0),  // Keeps FillFrame() harmless if SetSize() bails out.
+        start_(start),
+        limit_(limit),
+        frame_(0),
+        width_(0),
+        height_(0),
+        format_(VPX_IMG_FMT_NONE),
+        framerate_numerator_(rate_numerator),
+        framerate_denominator_(rate_denominator) {
+    // This initializes format_, raw_size_, width_, height_ and allocates img.
+    SetSize(width, height, format);
+  }
+
+  virtual ~YUVVideoSource() {  // Frees the image and closes the file if open.
+    vpx_img_free(img_);
+    if (input_file_)
+      fclose(input_file_);
+  }
+
+  virtual void Begin() {  // (Re)opens the file, seeks to start_, reads a frame.
+    if (input_file_)
+      fclose(input_file_);
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
+                                     << file_name_;
+    if (start_)
+      fseek(input_file_, static_cast<unsigned>(raw_size_) * start_, SEEK_SET);
+
+    frame_ = start_;
+    FillFrame();
+  }
+
+  virtual void Next() {  // Advances the frame counter, then loads that frame.
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL;  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual vpx_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual vpx_rational_t timebase() const {
+    const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  // (Re)allocates img_ and recomputes raw_size_ when size or format changes.
+  virtual void SetSize(unsigned int width, unsigned int height,
+                       vpx_img_fmt format) {
+    if (width != width_ || height != height_ || format != format_) {
+      vpx_img_free(img_);
+      // Drop the stale size so a failed assert below cannot leave it behind.
+      raw_size_ = 0;
+      img_ = vpx_img_alloc(NULL, format, width, height, 1);
+      ASSERT_TRUE(img_ != NULL);
+      width_ = width;
+      height_ = height;
+      format_ = format;
+      switch (format) {
+        case VPX_IMG_FMT_I420:
+          raw_size_ = width * height * 3 / 2;
+          break;
+        case VPX_IMG_FMT_I422:
+        case VPX_IMG_FMT_I440:
+          raw_size_ = width * height * 2;
+          break;
+        case VPX_IMG_FMT_I444:
+        case VPX_IMG_FMT_I42016:
+          raw_size_ = width * height * 3;
+          break;
+        case VPX_IMG_FMT_I42216:
+        case VPX_IMG_FMT_I44016:
+          raw_size_ = width * height * 4;
+          break;
+        case VPX_IMG_FMT_I44416:
+          raw_size_ = width * height * 6;
+          break;
+        default:
+          ASSERT_TRUE(0);
+      }
+    }
+  }
+
+  virtual void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
+    // img_ is NULL if SetSize() never allocated it; fail, do not dereference.
+    ASSERT_TRUE(img_ != NULL);
+    // Read a frame from input_file.
+    if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) {
+      limit_ = frame_;  // Nothing was read: end the stream at this frame.
+    }
+  }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  vpx_image_t *img_;
+  size_t raw_size_;
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  unsigned int width_;
+  unsigned int height_;
+  vpx_img_fmt format_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_YUV_VIDEO_SOURCE_H_
diff --git a/libs/libvpx/third_party/googletest/README.libvpx b/libs/libvpx/third_party/googletest/README.libvpx
new file mode 100644
index 0000000000..7201a67d3d
--- /dev/null
+++ b/libs/libvpx/third_party/googletest/README.libvpx
@@ -0,0 +1,15 @@
+URL: http://code.google.com/p/googletest/
+Version: 1.7.0
+License: BSD
+License File: COPYING
+
+Description:
+Google's framework for writing C++ tests on a variety of platforms
+(Linux, Mac OS X, Windows, Windows CE, Symbian, etc).  Based on the
+xUnit architecture.  Supports automatic test discovery, a rich set of
+assertions, user-defined assertions, death tests, fatal and non-fatal
+failures, various options for running the tests, and XML test report
+generation.
+
+Local Modifications:
+Removed unused declarations of kPathSeparatorString to have warning free build.
\ No newline at end of file
diff --git a/libs/libvpx/third_party/googletest/gtest.mk b/libs/libvpx/third_party/googletest/gtest.mk
new file mode 100644
index 0000000000..0de3113c7a
--- /dev/null
+++ b/libs/libvpx/third_party/googletest/gtest.mk
@@ -0,0 +1 @@
+GTEST_SRCS-yes += src/gtest-all.cc
diff --git a/libs/libvpx/third_party/googletest/src/CHANGES b/libs/libvpx/third_party/googletest/src/CHANGES
new file mode 100644
index 0000000000..0552132421
--- /dev/null
+++ b/libs/libvpx/third_party/googletest/src/CHANGES
@@ -0,0 +1,157 @@
+Changes for 1.7.0:
+
+* New feature: death tests are supported on OpenBSD and in iOS
+  simulator now.
+* New feature: Google Test now implements a protocol to allow
+  a test runner to detect that a test program has exited
+  prematurely and report it as a failure (before it would be
+  falsely reported as a success if the exit code is 0).
+* New feature: Test::RecordProperty() can now be used outside of the
+  lifespan of a test method, in which case it will be attributed to
+  the current test case or the test program in the XML report.
+* New feature (potentially breaking): --gtest_list_tests now prints
+  the type parameters and value parameters for each test.
+* Improvement: char pointers and char arrays are now escaped properly
+  in failure messages.
+* Improvement: failure summary in XML reports now includes file and
+  line information.
+* Improvement: the <testsuites> XML element now has a timestamp attribute.
+* Improvement: When --gtest_filter is specified, XML report now doesn't
+  contain information about tests that are filtered out.
+* Fixed the bug where long --gtest_filter flag values are truncated in
+  death tests.
+* Potentially breaking change: RUN_ALL_TESTS() is now implemented as a
+  function instead of a macro in order to work better with Clang.
+* Compatibility fixes with C++ 11 and various platforms.
+* Bug/warning fixes.
+
+Changes for 1.6.0:
+
+* New feature: ADD_FAILURE_AT() for reporting a test failure at the
+  given source location -- useful for writing testing utilities.
+* New feature: the universal value printer is moved from Google Mock
+  to Google Test.
+* New feature: type parameters and value parameters are reported in
+  the XML report now.
+* A gtest_disable_pthreads CMake option.
+* Colored output works in GNU Screen sessions now.
+* Parameters of value-parameterized tests are now printed in the
+  textual output.
+* Failures from ad hoc test assertions run before RUN_ALL_TESTS() are
+  now correctly reported.
+* Arguments of ASSERT_XY and EXPECT_XY no longer need to support << to
+  ostream.
+* More complete handling of exceptions.
+* GTEST_ASSERT_XY can be used instead of ASSERT_XY in case the latter
+  name is already used by another library.
+* --gtest_catch_exceptions is now true by default, allowing a test
+  program to continue after an exception is thrown.
+* Value-parameterized test fixtures can now derive from Test and
+  WithParamInterface<T> separately, easing conversion of legacy tests.
+* Death test messages are clearly marked to make them more
+  distinguishable from other messages.
+* Compatibility fixes for Android, Google Native Client, MinGW, HP UX,
+  PowerPC, Lucid autotools, libCStd, Sun C++, Borland C++ Builder (Code Gear),
+  IBM XL C++ (Visual Age C++), and C++0x.
+* Bug fixes and implementation clean-ups.
+* Potentially incompatible changes: disables the harmful 'make install'
+  command in autotools.
+
+Changes for 1.5.0:
+
+ * New feature: assertions can be safely called in multiple threads
+   where the pthreads library is available.
+ * New feature: predicates used inside EXPECT_TRUE() and friends
+   can now generate custom failure messages.
+ * New feature: Google Test can now be compiled as a DLL.
+ * New feature: fused source files are included.
+ * New feature: prints help when encountering unrecognized Google Test flags.
+ * Experimental feature: CMake build script (requires CMake 2.6.4+).
+ * Experimental feature: the Pump script for meta programming.
+ * double values streamed to an assertion are printed with enough precision
+   to differentiate any two different values.
+ * Google Test now works on Solaris and AIX.
+ * Build and test script improvements.
+ * Bug fixes and implementation clean-ups.
+
+ Potentially breaking changes:
+
+ * Stopped supporting VC++ 7.1 with exceptions disabled.
+ * Dropped support for 'make install'.
+
+Changes for 1.4.0:
+
+ * New feature: the event listener API
+ * New feature: test shuffling
+ * New feature: the XML report format is closer to junitreport and can
+   be parsed by Hudson now.
+ * New feature: when a test runs under Visual Studio, its failures are
+   integrated in the IDE.
+ * New feature: /MD(d) versions of VC++ projects.
+ * New feature: elapsed time for the tests is printed by default.
+ * New feature: comes with a TR1 tuple implementation such that Boost
+   is no longer needed for Combine().
+ * New feature: EXPECT_DEATH_IF_SUPPORTED macro and friends.
+ * New feature: the Xcode project can now produce static gtest
+   libraries in addition to a framework.
+ * Compatibility fixes for Solaris, Cygwin, minGW, Windows Mobile,
+   Symbian, gcc, and C++Builder.
+ * Bug fixes and implementation clean-ups.
+
+Changes for 1.3.0:
+
+ * New feature: death tests on Windows, Cygwin, and Mac.
+ * New feature: ability to use Google Test assertions in other testing
+   frameworks.
+ * New feature: ability to run disabled test via
+   --gtest_also_run_disabled_tests.
+ * New feature: the --help flag for printing the usage.
+ * New feature: access to Google Test flag values in user code.
+ * New feature: a script that packs Google Test into one .h and one
+   .cc file for easy deployment.
+ * New feature: support for distributing test functions to multiple
+   machines (requires support from the test runner).
+ * Bug fixes and implementation clean-ups.
+
+Changes for 1.2.1:
+
+ * Compatibility fixes for Linux IA-64 and IBM z/OS.
+ * Added support for using Boost and other TR1 implementations.
+ * Changes to the build scripts to support upcoming release of Google C++
+   Mocking Framework.
+ * Added Makefile to the distribution package.
+ * Improved build instructions in README.
+
+Changes for 1.2.0:
+
+ * New feature: value-parameterized tests.
+ * New feature: the ASSERT/EXPECT_(NON)FATAL_FAILURE(_ON_ALL_THREADS)
+   macros.
+ * Changed the XML report format to match JUnit/Ant's.
+ * Added tests to the Xcode project.
+ * Added scons/SConscript for building with SCons.
+ * Added src/gtest-all.cc for building Google Test from a single file.
+ * Fixed compatibility with Solaris and z/OS.
+ * Enabled running Python tests on systems with python 2.3 installed,
+   e.g. Mac OS X 10.4.
+ * Bug fixes.
+
+Changes for 1.1.0:
+
+ * New feature: type-parameterized tests.
+ * New feature: exception assertions.
+ * New feature: printing elapsed time of tests.
+ * Improved the robustness of death tests.
+ * Added an Xcode project and samples.
+ * Adjusted the output format on Windows to be understandable by Visual Studio.
+ * Minor bug fixes.
+
+Changes for 1.0.1:
+
+ * Added project files for Visual Studio 7.1.
+ * Fixed issues with compiling on Mac OS X.
+ * Fixed issues with compiling on Cygwin.
+
+Changes for 1.0.0:
+
+ * Initial Open Source release of Google Test
diff --git a/libs/libvpx/third_party/googletest/src/CONTRIBUTORS b/libs/libvpx/third_party/googletest/src/CONTRIBUTORS
new file mode 100644
index 0000000000..feae2fc044
--- /dev/null
+++ b/libs/libvpx/third_party/googletest/src/CONTRIBUTORS
@@ -0,0 +1,37 @@
+# This file contains a list of people who've made non-trivial
+# contribution to the Google C++ Testing Framework project.  People
+# who commit code to the project are encouraged to add their names
+# here.  Please keep the list sorted by first names.
+
+Ajay Joshi <jaj@google.com>
+Balázs Dán <balazs.dan@gmail.com>
+Bharat Mediratta <bharat@menalto.com>
+Chandler Carruth <chandlerc@google.com>
+Chris Prince <cprince@google.com>
+Chris Taylor <taylorc@google.com>
+Dan Egnor <egnor@google.com>
+Eric Roman <eroman@chromium.org>
+Hady Zalek <hady.zalek@gmail.com>
+Jeffrey Yasskin <jyasskin@google.com>
+Jói Sigurðsson <joi@google.com>
+Keir Mierle <mierle@gmail.com>
+Keith Ray <keith.ray@gmail.com>
+Kenton Varda <kenton@google.com>
+Manuel Klimek <klimek@google.com>
+Markus Heule <markus.heule@gmail.com>
+Mika Raento <mikie@iki.fi>
+Miklós Fazekas <mfazekas@szemafor.com>
+Pasi Valminen <pasi.valminen@gmail.com>
+Patrick Hanna <phanna@google.com>
+Patrick Riley <pfr@google.com>
+Peter Kaminski <piotrk@google.com>
+Preston Jackson <preston.a.jackson@gmail.com>
+Rainer Klaffenboeck <rainer.klaffenboeck@dynatrace.com>
+Russ Cox <rsc@google.com>
+Russ Rufer <russ@pentad.com>
+Sean Mcafee <eefacm@gmail.com>
+Sigurður Ásgeirsson <siggi@google.com>
+Tracy Bialik <tracy@pentad.com>
+Vadim Berman <vadimb@google.com>
+Vlad Losev <vladl@google.com>
+Zhanyong Wan <wan@google.com>
diff --git a/libs/libvpx/third_party/googletest/src/LICENSE b/libs/libvpx/third_party/googletest/src/LICENSE
new file mode 100644
index 0000000000..1941a11f8c
--- /dev/null
+++ b/libs/libvpx/third_party/googletest/src/LICENSE
@@ -0,0 +1,28 @@
+Copyright 2008, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/libs/libvpx/third_party/googletest/src/README b/libs/libvpx/third_party/googletest/src/README
new file mode 100644
index 0000000000..26f35a8479
--- /dev/null
+++ b/libs/libvpx/third_party/googletest/src/README
@@ -0,0 +1,435 @@
+Google C++ Testing Framework
+============================
+
+http://code.google.com/p/googletest/
+
+Overview
+--------
+
+Google's framework for writing C++ tests on a variety of platforms
+(Linux, Mac OS X, Windows, Windows CE, Symbian, etc).  Based on the
+xUnit architecture.  Supports automatic test discovery, a rich set of
+assertions, user-defined assertions, death tests, fatal and non-fatal
+failures, various options for running the tests, and XML test report
+generation.
+
+Please see the project page above for more information as well as the
+mailing list for questions, discussions, and development.  There is
+also an IRC channel on OFTC (irc.oftc.net) #gtest available.  Please
+join us!
+
+Requirements for End Users
+--------------------------
+
+Google Test is designed to have fairly minimal requirements to build
+and use with your projects, but there are some.  Currently, we support
+Linux, Windows, Mac OS X, and Cygwin.  We will also make our best
+effort to support other platforms (e.g. Solaris, AIX, and z/OS).
+However, since core members of the Google Test project have no access
+to these platforms, Google Test may have outstanding issues there.  If
+you notice any problems on your platform, please notify
+googletestframework@googlegroups.com.  Patches for fixing them are
+even more welcome!
+
+### Linux Requirements ###
+
+These are the base requirements to build and use Google Test from a source
+package (as described below):
+  * GNU-compatible Make or gmake
+  * POSIX-standard shell
+  * POSIX(-2) Regular Expressions (regex.h)
+  * A C++98-standard-compliant compiler
+
+### Windows Requirements ###
+
+  * Microsoft Visual C++ 7.1 or newer
+
+### Cygwin Requirements ###
+
+  * Cygwin 1.5.25-14 or newer
+
+### Mac OS X Requirements ###
+
+  * Mac OS X 10.4 Tiger or newer
+  * Developer Tools Installed
+
+Also, you'll need CMake 2.6.4 or higher if you want to build the
+samples using the provided CMake script, regardless of the platform.
+
+Requirements for Contributors
+-----------------------------
+
+We welcome patches.  If you plan to contribute a patch, you need to
+build Google Test and its own tests from an SVN checkout (described
+below), which has further requirements:
+
+  * Python version 2.3 or newer (for running some of the tests and
+    re-generating certain source files from templates)
+  * CMake 2.6.4 or newer
+
+Getting the Source
+------------------
+
+There are two primary ways of getting Google Test's source code: you
+can download a stable source release in your preferred archive format,
+or directly check out the source from our Subversion (SVN) repository.
+The SVN checkout requires a few extra steps and some extra software
+packages on your system, but lets you track the latest development and
+make patches much more easily, so we highly encourage it.
+
+### Source Package ###
+
+Google Test is released in versioned source packages which can be
+downloaded from the download page [1].  Several different archive
+formats are provided, but the only difference is the tools used to
+manipulate them, and the size of the resulting file.  Download
+whichever you are most comfortable with.
+
+  [1] http://code.google.com/p/googletest/downloads/list
+
+Once the package is downloaded, expand it using whichever tools you
+prefer for that type.  This will result in a new directory with the
+name "gtest-X.Y.Z" which contains all of the source code.  Here are
+some examples on Linux:
+
+  tar -xvzf gtest-X.Y.Z.tar.gz
+  tar -xvjf gtest-X.Y.Z.tar.bz2
+  unzip gtest-X.Y.Z.zip
+
+### SVN Checkout ###
+
+To check out the main branch (also known as the "trunk") of Google
+Test, run the following Subversion command:
+
+  svn checkout http://googletest.googlecode.com/svn/trunk/ gtest-svn
+
+Setting up the Build
+--------------------
+
+To build Google Test and your tests that use it, you need to tell your
+build system where to find its headers and source files.  The exact
+way to do it depends on which build system you use, and is usually
+straightforward.
+
+### Generic Build Instructions ###
+
+Suppose you put Google Test in directory ${GTEST_DIR}.  To build it,
+create a library build target (or a project as called by Visual Studio
+and Xcode) to compile
+
+  ${GTEST_DIR}/src/gtest-all.cc
+
+with ${GTEST_DIR}/include in the system header search path and ${GTEST_DIR}
+in the normal header search path.  Assuming a Linux-like system and gcc,
+something like the following will do:
+
+  g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \
+      -pthread -c ${GTEST_DIR}/src/gtest-all.cc
+  ar -rv libgtest.a gtest-all.o
+
+(We need -pthread as Google Test uses threads.)
+
+Next, you should compile your test source file with
+${GTEST_DIR}/include in the system header search path, and link it
+with gtest and any other necessary libraries:
+
+  g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \
+      -o your_test
+
+As an example, the make/ directory contains a Makefile that you can
+use to build Google Test on systems where GNU make is available
+(e.g. Linux, Mac OS X, and Cygwin).  It doesn't try to build Google
+Test's own tests.  Instead, it just builds the Google Test library and
+a sample test.  You can use it as a starting point for your own build
+script.
+
+If the default settings are correct for your environment, the
+following commands should succeed:
+
+  cd ${GTEST_DIR}/make
+  make
+  ./sample1_unittest
+
+If you see errors, try to tweak the contents of make/Makefile to make
+them go away.  There are instructions in make/Makefile on how to do
+it.
+
+### Using CMake ###
+
+Google Test comes with a CMake build script (CMakeLists.txt) that can
+be used on a wide range of platforms ("C" stands for cross-platform).
+If you don't have CMake installed already, you can download it for
+free from http://www.cmake.org/.
+
+CMake works by generating native makefiles or build projects that can
+be used in the compiler environment of your choice.  The typical
+workflow starts with:
+
+  mkdir mybuild       # Create a directory to hold the build output.
+  cd mybuild
+  cmake ${GTEST_DIR}  # Generate native build scripts.
+
+If you want to build Google Test's samples, you should replace the
+last command with
+
+  cmake -Dgtest_build_samples=ON ${GTEST_DIR}
+
+If you are on a *nix system, you should now see a Makefile in the
+current directory.  Just type 'make' to build gtest.
+
+If you use Windows and have Visual Studio installed, a gtest.sln file
+and several .vcproj files will be created.  You can then build them
+using Visual Studio.
+
+On Mac OS X with Xcode installed, a .xcodeproj file will be generated.
+
+### Legacy Build Scripts ###
+
+Before settling on CMake, we have been providing hand-maintained build
+projects/scripts for Visual Studio, Xcode, and Autotools.  While we
+continue to provide them for convenience, they are not actively
+maintained any more.  We highly recommend that you follow the
+instructions in the previous two sections to integrate Google Test
+with your existing build system.
+
+If you still need to use the legacy build scripts, here's how:
+
+The msvc\ folder contains two solutions with Visual C++ projects.
+Open the gtest.sln or gtest-md.sln file using Visual Studio, and you
+are ready to build Google Test the same way you build any Visual
+Studio project.  Files that have names ending with -md use DLL
+versions of Microsoft runtime libraries (the /MD or the /MDd compiler
+option).  Files without that suffix use static versions of the runtime
+libraries (the /MT or the /MTd option).  Please note that one must use
+the same option to compile both gtest and the test code.  If you use
+Visual Studio 2005 or above, we recommend the -md version as /MD is
+the default for new projects in these versions of Visual Studio.
+
+On Mac OS X, open the gtest.xcodeproj in the xcode/ folder using
+Xcode.  Build the "gtest" target.  The universal binary framework will
+end up in your selected build directory (selected in the Xcode
+"Preferences..." -> "Building" pane and defaults to xcode/build).
+Alternatively, at the command line, enter:
+
+  xcodebuild
+
+This will build the "Release" configuration of gtest.framework in your
+default build location.  See the "xcodebuild" man page for more
+information about building different configurations and building in
+different locations.
+
+If you wish to use the Google Test Xcode project with Xcode 4.x and
+above, you need to either:
+ * update the SDK configuration options in xcode/Config/General.xconfig.
+   Comment options SDKROOT, MACOS_DEPLOYMENT_TARGET, and GCC_VERSION. If
+   you choose this route you lose the ability to target earlier versions
+   of MacOS X.
+ * Install an SDK for an earlier version. This doesn't appear to be
+   supported by Apple, but has been reported to work
+   (http://stackoverflow.com/questions/5378518).
+
+Tweaking Google Test
+--------------------
+
+Google Test can be used in diverse environments.  The default
+configuration may not work (or may not work well) out of the box in
+some environments.  However, you can easily tweak Google Test by
+defining control macros on the compiler command line.  Generally,
+these macros are named like GTEST_XYZ and you define them to either 1
+or 0 to enable or disable a certain feature.
+
+We list the most frequently used macros below.  For a complete list,
+see file include/gtest/internal/gtest-port.h.
+
+### Choosing a TR1 Tuple Library ###
+
+Some Google Test features require the C++ Technical Report 1 (TR1)
+tuple library, which is not yet available with all compilers.  The
+good news is that Google Test implements a subset of TR1 tuple that's
+enough for its own need, and will automatically use this when the
+compiler doesn't provide TR1 tuple.
+
+Usually you don't need to care about which tuple library Google Test
+uses.  However, if your project already uses TR1 tuple, you need to
+tell Google Test to use the same TR1 tuple library the rest of your
+project uses, or the two tuple implementations will clash.  To do
+that, add
+
+  -DGTEST_USE_OWN_TR1_TUPLE=0
+
+to the compiler flags while compiling Google Test and your tests.  If
+you want to force Google Test to use its own tuple library, just add
+
+  -DGTEST_USE_OWN_TR1_TUPLE=1
+
+to the compiler flags instead.
+
+If you don't want Google Test to use tuple at all, add
+
+  -DGTEST_HAS_TR1_TUPLE=0
+
+and all features using tuple will be disabled.
+
+### Multi-threaded Tests ###
+
+Google Test is thread-safe where the pthread library is available.
+After #include "gtest/gtest.h", you can check the GTEST_IS_THREADSAFE
+macro to see whether this is the case (yes if the macro is #defined to
+1, no if it's undefined).
+
+If Google Test doesn't correctly detect whether pthread is available
+in your environment, you can force it with
+
+  -DGTEST_HAS_PTHREAD=1
+
+or
+
+  -DGTEST_HAS_PTHREAD=0
+
+When Google Test uses pthread, you may need to add flags to your
+compiler and/or linker to select the pthread library, or you'll get
+link errors.  If you use the CMake script or the deprecated Autotools
+script, this is taken care of for you.  If you use your own build
+script, you'll need to read your compiler and linker's manual to
+figure out what flags to add.
+
+### As a Shared Library (DLL) ###
+
+Google Test is compact, so most users can build and link it as a
+static library for the simplicity.  You can choose to use Google Test
+as a shared library (known as a DLL on Windows) if you prefer.
+
+To compile *gtest* as a shared library, add
+
+  -DGTEST_CREATE_SHARED_LIBRARY=1
+
+to the compiler flags.  You'll also need to tell the linker to produce
+a shared library instead - consult your linker's manual for how to do
+it.
+
+To compile your *tests* that use the gtest shared library, add
+
+  -DGTEST_LINKED_AS_SHARED_LIBRARY=1
+
+to the compiler flags.
+
+Note: while the above steps aren't technically necessary today when
+using some compilers (e.g. GCC), they may become necessary in the
+future, if we decide to improve the speed of loading the library (see
+http://gcc.gnu.org/wiki/Visibility for details).  Therefore you are
+recommended to always add the above flags when using Google Test as a
+shared library.  Otherwise a future release of Google Test may break
+your build script.
+
+### Avoiding Macro Name Clashes ###
+
+In C++, macros don't obey namespaces.  Therefore two libraries that
+both define a macro of the same name will clash if you #include both
+definitions.  In case a Google Test macro clashes with another
+library, you can force Google Test to rename its macro to avoid the
+conflict.
+
+Specifically, if both Google Test and some other code define macro
+FOO, you can add
+
+  -DGTEST_DONT_DEFINE_FOO=1
+
+to the compiler flags to tell Google Test to change the macro's name
+from FOO to GTEST_FOO.  Currently FOO can be FAIL, SUCCEED, or TEST.
+For example, with -DGTEST_DONT_DEFINE_TEST=1, you'll need to write
+
+  GTEST_TEST(SomeTest, DoesThis) { ... }
+
+instead of
+
+  TEST(SomeTest, DoesThis) { ... }
+
+in order to define a test.
+
+Upgrading from an Earlier Version
+---------------------------------
+
+We strive to keep Google Test releases backward compatible.
+Sometimes, though, we have to make some breaking changes for the
+users' long-term benefits.  This section describes what you'll need to
+do if you are upgrading from an earlier version of Google Test.
+
+### Upgrading from 1.3.0 or Earlier ###
+
+You may need to explicitly enable or disable Google Test's own TR1
+tuple library.  See the instructions in section "Choosing a TR1 Tuple
+Library".
+
+### Upgrading from 1.4.0 or Earlier ###
+
+The Autotools build script (configure + make) is no longer officially
+supported.  You are encouraged to migrate to your own build system or
+use CMake.  If you still need to use Autotools, you can find
+instructions in the README file from Google Test 1.4.0.
+
+On platforms where the pthread library is available, Google Test uses
+it in order to be thread-safe.  See the "Multi-threaded Tests" section
+for what this means to your build script.
+
+If you use Microsoft Visual C++ 7.1 with exceptions disabled, Google
+Test will no longer compile.  This should affect very few people, as a
+large portion of STL (including <string>) doesn't compile in this mode
+anyway.  We decided to stop supporting it in order to greatly simplify
+Google Test's implementation.
+
+Developing Google Test
+----------------------
+
+This section discusses how to make your own changes to Google Test.
+
+### Testing Google Test Itself ###
+
+To make sure your changes work as intended and don't break existing
+functionality, you'll want to compile and run Google Test's own tests.
+For that you can use CMake:
+
+  mkdir mybuild
+  cd mybuild
+  cmake -Dgtest_build_tests=ON ${GTEST_DIR}
+
+Make sure you have Python installed, as some of Google Test's tests
+are written in Python.  If the cmake command complains about not being
+able to find Python ("Could NOT find PythonInterp (missing:
+PYTHON_EXECUTABLE)"), try telling it explicitly where your Python
+executable can be found:
+
+  cmake -DPYTHON_EXECUTABLE=path/to/python -Dgtest_build_tests=ON ${GTEST_DIR}
+
+Next, you can build Google Test and all of its own tests.  On *nix,
+this is usually done by 'make'.  To run the tests, do
+
+  make test
+
+All tests should pass.
+
+### Regenerating Source Files ###
+
+Some of Google Test's source files are generated from templates (not
+in the C++ sense) using a script.  A template file is named FOO.pump,
+where FOO is the name of the file it will generate.  For example, the
+file include/gtest/internal/gtest-type-util.h.pump is used to generate
+gtest-type-util.h in the same directory.
+
+Normally you don't need to worry about regenerating the source files,
+unless you need to modify them.  In that case, you should modify the
+corresponding .pump files instead and run the pump.py Python script to
+regenerate them.  You can find pump.py in the scripts/ directory.
+Read the Pump manual [2] for how to use it.
+
+  [2] http://code.google.com/p/googletest/wiki/PumpManual
+
+### Contributing a Patch ###
+
+We welcome patches.  Please read the Google Test developer's guide [3]
+for how you can contribute.  In particular, make sure you have signed
+the Contributor License Agreement, or we won't be able to accept the
+patch.
+
+  [3] http://code.google.com/p/googletest/wiki/GoogleTestDevGuide
+
+Happy testing!
diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h
new file mode 100644
index 0000000000..4f3804f703
--- /dev/null
+++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h
@@ -0,0 +1,20061 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the public API for Google Test.  It should be
+// included by any test program that uses Google Test.
+//
+// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
+// program!
+//
+// Acknowledgment: Google Test borrowed the idea of automatic test
+// registration from Barthelemy Dagenais' (barthelemy@prologique.com)
+// easyUnit framework.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_H_
+
+#include <limits>
+#include <ostream>
+#include <vector>
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file declares functions and macros used internally by
+// Google Test.  They are subject to change without notice.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan)
+//
+// Low-level types and utilities for porting Google Test to various
+// platforms.  They are subject to change without notice.  DO NOT USE
+// THEM IN USER CODE.
+//
+// This file is fundamental to Google Test.  All other Google Test source
+// files are expected to #include this.  Therefore, it cannot #include
+// any other Google Test header.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+// The user can define the following macros in the build script to
+// control Google Test's behavior.  If the user doesn't define a macro
+// in this list, Google Test will define it.
+//
+//   GTEST_HAS_CLONE          - Define it to 1/0 to indicate that clone(2)
+//                              is/isn't available.
+//   GTEST_HAS_EXCEPTIONS     - Define it to 1/0 to indicate that exceptions
+//                              are enabled.
+//   GTEST_HAS_GLOBAL_STRING  - Define it to 1/0 to indicate that ::string
+//                              is/isn't available (some systems define
+//                              ::string, which is different to std::string).
+//   GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::wstring
+//                              is/isn't available (some systems define
+//                              ::wstring, which is different to std::wstring).
+//   GTEST_HAS_POSIX_RE       - Define it to 1/0 to indicate that POSIX regular
+//                              expressions are/aren't available.
+//   GTEST_HAS_PTHREAD        - Define it to 1/0 to indicate that <pthread.h>
+//                              is/isn't available.
+//   GTEST_HAS_RTTI           - Define it to 1/0 to indicate that RTTI is/isn't
+//                              enabled.
+//   GTEST_HAS_STD_WSTRING    - Define it to 1/0 to indicate that
+//                              std::wstring does/doesn't work (Google Test can
+//                              be used where std::wstring is unavailable).
+//   GTEST_HAS_TR1_TUPLE      - Define it to 1/0 to indicate tr1::tuple
+//                              is/isn't available.
+//   GTEST_HAS_SEH            - Define it to 1/0 to indicate whether the
+//                              compiler supports Microsoft's "Structured
+//                              Exception Handling".
+//   GTEST_HAS_STREAM_REDIRECTION
+//                            - Define it to 1/0 to indicate whether the
+//                              platform supports I/O stream redirection using
+//                              dup() and dup2().
+//   GTEST_USE_OWN_TR1_TUPLE  - Define it to 1/0 to indicate whether Google
+//                              Test's own tr1 tuple implementation should be
+//                              used.  Unused when the user sets
+//                              GTEST_HAS_TR1_TUPLE to 0.
+//   GTEST_LANG_CXX11         - Define it to 1/0 to indicate that Google Test
+//                              is building in C++11/C++98 mode.
+//   GTEST_LINKED_AS_SHARED_LIBRARY
+//                            - Define to 1 when compiling tests that use
+//                              Google Test as a shared library (known as
+//                              DLL on Windows).
+//   GTEST_CREATE_SHARED_LIBRARY
+//                            - Define to 1 when compiling Google Test itself
+//                              as a shared library.
+
+// This header defines the following utilities:
+//
+// Macros indicating the current platform (defined to 1 if compiled on
+// the given platform; otherwise undefined):
+//   GTEST_OS_AIX      - IBM AIX
+//   GTEST_OS_CYGWIN   - Cygwin
+//   GTEST_OS_HPUX     - HP-UX
+//   GTEST_OS_LINUX    - Linux
+//     GTEST_OS_LINUX_ANDROID - Google Android
+//   GTEST_OS_MAC      - Mac OS X
+//     GTEST_OS_IOS    - iOS
+//       GTEST_OS_IOS_SIMULATOR - iOS simulator
+//   GTEST_OS_NACL     - Google Native Client (NaCl)
+//   GTEST_OS_OPENBSD  - OpenBSD
+//   GTEST_OS_QNX      - QNX
+//   GTEST_OS_SOLARIS  - Sun Solaris
+//   GTEST_OS_SYMBIAN  - Symbian
+//   GTEST_OS_WINDOWS  - Windows (Desktop, MinGW, or Mobile)
+//     GTEST_OS_WINDOWS_DESKTOP  - Windows Desktop
+//     GTEST_OS_WINDOWS_MINGW    - MinGW
+//     GTEST_OS_WINDOWS_MOBILE   - Windows Mobile
+//   GTEST_OS_ZOS      - z/OS
+//
+// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the
+// most stable support.  Since core members of the Google Test project
+// don't have access to other platforms, support for them may be less
+// stable.  If you notice any problems on your platform, please notify
+// googletestframework@googlegroups.com (patches for fixing them are
+// even more welcome!).
+//
+// Note that it is possible that none of the GTEST_OS_* macros are defined.
+//
+// Macros indicating available Google Test features (defined to 1 if
+// the corresponding feature is supported; otherwise undefined):
+//   GTEST_HAS_COMBINE      - the Combine() function (for value-parameterized
+//                            tests)
+//   GTEST_HAS_DEATH_TEST   - death tests
+//   GTEST_HAS_PARAM_TEST   - value-parameterized tests
+//   GTEST_HAS_TYPED_TEST   - typed tests
+//   GTEST_HAS_TYPED_TEST_P - type-parameterized tests
+//   GTEST_USES_POSIX_RE    - enhanced POSIX regex is used. Do not confuse with
+//                            GTEST_HAS_POSIX_RE (see above) which users can
+//                            define themselves.
+//   GTEST_USES_SIMPLE_RE   - our own simple regex is used;
+//                            the above two are mutually exclusive.
+//   GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ().
+//
+// Macros for basic C++ coding:
+//   GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
+//   GTEST_ATTRIBUTE_UNUSED_  - declares that a class' instances or a
+//                              variable don't have to be used.
+//   GTEST_DISALLOW_ASSIGN_   - disables operator=.
+//   GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
+//   GTEST_MUST_USE_RESULT_   - declares that a function's result must be used.
+//
+// Synchronization:
+//   Mutex, MutexLock, ThreadLocal, GetThreadCount()
+//                  - synchronization primitives.
+//   GTEST_IS_THREADSAFE - defined to 1 to indicate that the above
+//                         synchronization primitives have real implementations
+//                         and Google Test is thread-safe; or 0 otherwise.
+//
+// Template meta programming:
+//   is_pointer     - as in TR1; needed on Symbian and IBM XL C/C++ only.
+//   IteratorTraits - partial implementation of std::iterator_traits, which
+//                    is not available in libCstd when compiled with Sun C++.
+//
+// Smart pointers:
+//   scoped_ptr     - as in TR2.
+//
+// Regular expressions:
+//   RE             - a simple regular expression class using the POSIX
+//                    Extended Regular Expression syntax on UNIX-like
+//                    platforms, or a reduced regular expression syntax on
+//                    other platforms, including Windows.
+//
+// Logging:
+//   GTEST_LOG_()   - logs messages at the specified severity level.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+//
+// Stdout and stderr capturing:
+//   CaptureStdout()     - starts capturing stdout.
+//   GetCapturedStdout() - stops capturing stdout and returns the captured
+//                         string.
+//   CaptureStderr()     - starts capturing stderr.
+//   GetCapturedStderr() - stops capturing stderr and returns the captured
+//                         string.
+//
+// Integer types:
+//   TypeWithSize   - maps an integer to an int type.
+//   Int32, UInt32, Int64, UInt64, TimeInMillis
+//                  - integers of known sizes.
+//   BiggestInt     - the biggest signed integer type.
+//
+// Command-line utilities:
+//   GTEST_FLAG()       - references a flag.
+//   GTEST_DECLARE_*()  - declares a flag.
+//   GTEST_DEFINE_*()   - defines a flag.
+//   GetInjectableArgvs() - returns the command line as a vector of strings.
+//
+// Environment variable utilities:
+//   GetEnv()             - gets the value of an environment variable.
+//   BoolFromGTestEnv()   - parses a bool environment variable.
+//   Int32FromGTestEnv()  - parses an Int32 environment variable.
+//   StringFromGTestEnv() - parses a string environment variable.
+
+#include <ctype.h>   // for isspace, etc
+#include <stddef.h>  // for ptrdiff_t
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#ifndef _WIN32_WCE
+# include <sys/types.h>
+# include <sys/stat.h>
+#endif  // !_WIN32_WCE
+
+#if defined __APPLE__
+# include <AvailabilityMacros.h>
+# include <TargetConditionals.h>
+#endif
+
+#include <iostream>  // NOLINT
+#include <sstream>  // NOLINT
+#include <string>  // NOLINT
+
+#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"  // NOTE(review): "@@" may be a deliberate escaped '@' - confirm before changing.
+#define GTEST_FLAG_PREFIX_ "gtest_"
+#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+#define GTEST_NAME_ "Google Test"
+#define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/"
+
+// Determines the version of gcc (or a compiler defining __GNUC__) in use.
+#ifdef __GNUC__
+// Encoded as major*10000 + minor*100 + patchlevel, so 40302 means 4.3.2.
+# define GTEST_GCC_VER_ \
+    (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+#endif  // __GNUC__; GTEST_GCC_VER_ is not defined here otherwise.
+
+// Picks the platform: first match wins, so at most one top-level GTEST_OS_*.
+#ifdef __CYGWIN__  // Tested before _WIN32, so Cygwin never gets GTEST_OS_WINDOWS.
+# define GTEST_OS_CYGWIN 1
+#elif defined __SYMBIAN32__
+# define GTEST_OS_SYMBIAN 1
+#elif defined _WIN32
+# define GTEST_OS_WINDOWS 1
+# ifdef _WIN32_WCE
+#  define GTEST_OS_WINDOWS_MOBILE 1
+# elif defined(__MINGW__) || defined(__MINGW32__)
+#  define GTEST_OS_WINDOWS_MINGW 1
+# else
+#  define GTEST_OS_WINDOWS_DESKTOP 1
+# endif  // _WIN32_WCE
+#elif defined __APPLE__
+# define GTEST_OS_MAC 1  // Stays set on iOS; GTEST_OS_IOS is defined in addition.
+# if TARGET_OS_IPHONE
+#  define GTEST_OS_IOS 1
+#  if TARGET_IPHONE_SIMULATOR
+#   define GTEST_OS_IOS_SIMULATOR 1
+#  endif  // TARGET_IPHONE_SIMULATOR
+# endif  // TARGET_OS_IPHONE
+#elif defined __linux__
+# define GTEST_OS_LINUX 1  // Stays set on Android as well.
+# if defined __ANDROID__
+#  define GTEST_OS_LINUX_ANDROID 1
+# endif  // __ANDROID__
+#elif defined __MVS__
+# define GTEST_OS_ZOS 1
+#elif defined(__sun) && defined(__SVR4)
+# define GTEST_OS_SOLARIS 1
+#elif defined(_AIX)
+# define GTEST_OS_AIX 1
+#elif defined(__hpux)
+# define GTEST_OS_HPUX 1
+#elif defined __native_client__
+# define GTEST_OS_NACL 1
+#elif defined __OpenBSD__
+# define GTEST_OS_OPENBSD 1
+#elif defined __QNX__
+# define GTEST_OS_QNX 1
+#endif  // __CYGWIN__
+
+#ifndef GTEST_LANG_CXX11
+// Not set by the user, so detect it.  gcc and clang define
+// __GXX_EXPERIMENTAL_CXX0X__ when -std={c,gnu}++{0x,11} is passed; the
+// C++11 standard also specifies a value for __cplusplus, which recent
+// clang, gcc, and probably other compilers set in C++11 mode.
+# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L
+// Compiling in at least C++11 mode.
+#  define GTEST_LANG_CXX11 1
+# else
+#  define GTEST_LANG_CXX11 0
+# endif
+#endif  // GTEST_LANG_CXX11
+
+// Brings in definitions for functions used in the testing::internal::posix
+// namespace (read, write, close, chdir, isatty, stat). We do not currently
+// use them on Windows Mobile.
+#if !GTEST_OS_WINDOWS
+// This assumes that non-Windows OSes provide unistd.h. For OSes where this
+// is not the case, we need to include headers that provide the functions
+// mentioned above.
+# include <unistd.h>
+# include <strings.h>
+#elif !GTEST_OS_WINDOWS_MOBILE  // Windows, but not Windows Mobile.
+# include <direct.h>
+# include <io.h>
+#endif  // !GTEST_OS_WINDOWS
+
+#if GTEST_OS_LINUX_ANDROID
+// Used to define __ANDROID_API__ matching the target NDK API level.
+#  include <android/api-level.h>  // NOLINT
+#endif
+
+// Defines this to true iff Google Test can use POSIX regular expressions.
+#ifndef GTEST_HAS_POSIX_RE
+# if GTEST_OS_LINUX_ANDROID
+// On Android, <regex.h> is only available from Gingerbread (API level 9) on.
+#  define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+# else  // Elsewhere, assume POSIX regex on every platform except Windows.
+#  define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
+# endif
+#endif  // GTEST_HAS_POSIX_RE
+
+#if GTEST_HAS_POSIX_RE
+
+// On some platforms, <regex.h> needs someone to define size_t, and
+// won't compile otherwise.  We can #include it here as we already
+// included <stdlib.h>, which is guaranteed to define size_t through
+// <stddef.h>.
+# include <regex.h>  // NOLINT
+
+# define GTEST_USES_POSIX_RE 1  // Each branch defines exactly one GTEST_USES_*_RE.
+
+#elif GTEST_OS_WINDOWS
+
+// <regex.h> is not available on Windows.  Use our own simple regex
+// implementation instead.
+# define GTEST_USES_SIMPLE_RE 1
+
+#else
+
+// <regex.h> may not be available on this platform.  Use our own
+// simple regex implementation instead (same choice as on Windows).
+# define GTEST_USES_SIMPLE_RE 1
+
+#endif  // GTEST_HAS_POSIX_RE
+
+#ifndef GTEST_HAS_EXCEPTIONS
+// The user didn't tell us whether exceptions are enabled, so we need
+// to figure it out.
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS
+// macro to enable exceptions, so we'll do the same.
+// Assumes that exceptions are enabled by default.
+#  ifndef _HAS_EXCEPTIONS
+#   define _HAS_EXCEPTIONS 1
+#  endif  // _HAS_EXCEPTIONS
+#  define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+# elif defined(__GNUC__) && __EXCEPTIONS
+// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__SUNPRO_CC)
+// Sun Pro CC supports exceptions.  However, there is no compile-time way of
+// detecting whether they are enabled or not.  Therefore, we assume that
+// they are enabled unless the user tells us otherwise.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__IBMCPP__) && __EXCEPTIONS
+// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__HP_aCC)
+// Exception handling is in effect by default in HP aCC compiler. It has to
+// be turned off by +noeh compiler option if desired.
+#  define GTEST_HAS_EXCEPTIONS 1
+# else
+// For other compilers, we assume exceptions are disabled to be
+// conservative.
+#  define GTEST_HAS_EXCEPTIONS 0
+# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#if !defined(GTEST_HAS_STD_STRING)
+// Even though we don't use this macro any longer, we keep it in case
+// some clients still depend on it.
+# define GTEST_HAS_STD_STRING 1
+#elif !GTEST_HAS_STD_STRING
+// The user defined GTEST_HAS_STD_STRING to 0: refuse to build.
+# error "Google Test cannot be used where ::std::string isn't available."
+#endif  // !defined(GTEST_HAS_STD_STRING)
+
+#ifndef GTEST_HAS_GLOBAL_STRING
+// The user didn't tell us whether ::string is available, so we
+// default to assuming it is not.
+
+# define GTEST_HAS_GLOBAL_STRING 0
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+#ifndef GTEST_HAS_STD_WSTRING
+// The user didn't tell us whether ::std::wstring is available, so we need
+// to figure it out.
+// TODO(wan@google.com): use autoconf to detect whether ::std::wstring
+//   is available.
+
+// Cygwin 1.7 and below doesn't support ::std::wstring.
+// Solaris' libc++ doesn't support it either.  Android has
+// no support for it at least as recent as Froyo (2.2).
+# define GTEST_HAS_STD_WSTRING \
+    (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS))
+
+#endif  // GTEST_HAS_STD_WSTRING
+
+#ifndef GTEST_HAS_GLOBAL_WSTRING
+// The user didn't tell us whether ::wstring is available, so assume it
+// is iff both ::std::wstring and the global ::string are available.
+# define GTEST_HAS_GLOBAL_WSTRING \
+    (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING)
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// Determines whether RTTI is available.
+#ifndef GTEST_HAS_RTTI
+// The user didn't tell us whether RTTI is enabled, so we need to
+// figure it out.
+
+# ifdef _MSC_VER
+
+#  ifdef _CPPRTTI  // MSVC defines this macro iff RTTI is enabled.
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif
+
+// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled.
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302)
+
+#  ifdef __GXX_RTTI
+// When building against STLport with the Android NDK and with
+// -frtti -fno-exceptions, the build fails at link time with undefined
+// references to __cxa_bad_typeid. Not sure if STL or toolchain bug,
+// so disable RTTI when detected.
+#   if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
+       !defined(__EXCEPTIONS)
+#    define GTEST_HAS_RTTI 0
+#   else
+#    define GTEST_HAS_RTTI 1
+#   endif  // GTEST_OS_LINUX_ANDROID && _STLPORT_MAJOR && !__EXCEPTIONS
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif  // __GXX_RTTI
+
+// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
+// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
+// first version with C++ support.
+# elif defined(__clang__)
+
+#  define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+
+// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
+// both the typeid and dynamic_cast features are present.
+# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+
+#  ifdef __RTTI_ALL__
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif
+
+# else
+
+// For all other compilers, we assume RTTI is enabled.
+#  define GTEST_HAS_RTTI 1
+
+# endif  // _MSC_VER
+
+#endif  // GTEST_HAS_RTTI
+
+// It's this header's responsibility to #include <typeinfo> when RTTI
+// is enabled.
+#if GTEST_HAS_RTTI
+# include <typeinfo>
+#endif
+
+// Determines whether Google Test can use the pthreads library.
+#ifndef GTEST_HAS_PTHREAD
+// The user didn't tell us explicitly, so we assume pthreads support is
+// available on Linux, Mac, HP-UX, and QNX (and nowhere else).
+//
+// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
+// to your compiler flags.
+# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \
+    || GTEST_OS_QNX)
+#endif  // GTEST_HAS_PTHREAD
+
+#if GTEST_HAS_PTHREAD
+// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
+// true.
+# include <pthread.h>  // NOLINT
+
+// For timespec and nanosleep, used below.
+# include <time.h>  // NOLINT
+#endif
+
+// Determines whether Google Test can use tr1/tuple.  You can define
+// this macro to 0 to prevent Google Test from using tuple (any
+// feature depending on tuple will be disabled in this mode).
+#ifndef GTEST_HAS_TR1_TUPLE
+# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR)
+// STLport, provided with the Android NDK, has neither <tr1/tuple> nor <tuple>.
+#  define GTEST_HAS_TR1_TUPLE 0
+# else
+// The user didn't tell us not to do it, so we assume it's OK.
+#  define GTEST_HAS_TR1_TUPLE 1
+# endif
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Determines whether Google Test's own tr1 tuple implementation
+// should be used.
+#ifndef GTEST_USE_OWN_TR1_TUPLE
+// The user didn't tell us, so we need to figure it out.
+
+// We use our own TR1 tuple if we aren't sure the user has an
+// implementation of it already.  At this time, libstdc++ 4.0.0+ and
+// MSVC 2010 are the only mainstream standard libraries that come
+// with a TR1 tuple implementation.  NVIDIA's CUDA NVCC compiler
+// pretends to be GCC by defining __GNUC__ and friends, but cannot
+// compile GCC's tuple implementation.  MSVC 2008 (9.0) provides TR1
+// tuple in a 323 MB Feature Pack download, which we cannot assume the
+// user has.  QNX's QCC compiler is a modified GCC but it doesn't
+// support TR1 tuple.  libc++ only provides std::tuple, in C++11 mode,
+// and it can be used with some compilers that define __GNUC__.
+# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \
+      && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600
+#  define GTEST_ENV_HAS_TR1_TUPLE_ 1
+# endif  // gcc-compatible with a usable TR1 tuple, or _MSC_VER >= 1600
+
+// C++11 specifies that <tuple> provides std::tuple. Use that if gtest is used
+// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6
+// can build with clang but need to use gcc4.2's libstdc++).
+# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325)
+#  define GTEST_ENV_HAS_STD_TUPLE_ 1
+# endif  // C++11 with a new-enough standard library
+
+# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_
+#  define GTEST_USE_OWN_TR1_TUPLE 0  // The environment supplies a tuple.
+# else
+#  define GTEST_USE_OWN_TR1_TUPLE 1  // Fall back to Google Test's own.
+# endif
+
+#endif  // GTEST_USE_OWN_TR1_TUPLE
+
+// To avoid conditional compilation everywhere, we make it
+// gtest-port.h's responsibility to #include the header implementing
+// tr1/tuple.
+#if GTEST_HAS_TR1_TUPLE
+
+# if GTEST_USE_OWN_TR1_TUPLE
+// This file was GENERATED by command:
+//     pump.py gtest-tuple.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2009 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Implements a subset of TR1 tuple needed by Google Test and Google Mock.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+
+#include <utility>  // For ::std::pair.
+
+// Normally expands to a friend declaration of every tuple instantiation
+// followed by "private:".  The compiler used in Symbian has a bug that
+// rejects that friend declaration (it complains that tuple is redefined),
+// so there the macro expands to "public:" and the members that should
+// otherwise be private stay public.  Sun Studio < 12 has the same bug.
+#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
+#else
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
+    template <GTEST_10_TYPENAMES_(U)> friend class tuple; \
+   private:
+#endif
+
+// GTEST_n_TUPLE_(T) is the n-tuple type tuple<T0, ..., T(n-1), void, ...>.
+#define GTEST_0_TUPLE_(T) tuple<>
+#define GTEST_1_TUPLE_(T) tuple<T##0, void, void, void, void, void, void, \
+    void, void, void>
+#define GTEST_2_TUPLE_(T) tuple<T##0, T##1, void, void, void, void, void, \
+    void, void, void>
+#define GTEST_3_TUPLE_(T) tuple<T##0, T##1, T##2, void, void, void, void, \
+    void, void, void>
+#define GTEST_4_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, void, void, void, \
+    void, void, void>
+#define GTEST_5_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, void, void, \
+    void, void, void>
+#define GTEST_6_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, void, \
+    void, void, void>
+#define GTEST_7_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    void, void, void>
+#define GTEST_8_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, void, void>
+#define GTEST_9_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8, void>
+#define GTEST_10_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8, T##9>
+
+// GTEST_n_TYPENAMES_(T) expands to "typename T0, ..., typename T(n-1)".
+#define GTEST_0_TYPENAMES_(T)
+#define GTEST_1_TYPENAMES_(T) typename T##0
+#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1
+#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2
+#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3
+#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4
+#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5
+#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6
+#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, typename T##7
+#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8
+#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8, typename T##9
+
+// In theory, defining stuff in the ::std namespace is undefined
+// behavior.  We can do this as we are playing the role of a standard
+// library vendor.
+namespace std {
+namespace tr1 {
+
+template <typename T0 = void, typename T1 = void, typename T2 = void,
+    typename T3 = void, typename T4 = void, typename T5 = void,
+    typename T6 = void, typename T7 = void, typename T8 = void,
+    typename T9 = void>
+class tuple;  // Forward declaration; unused type slots default to void.
+
+// Anything in namespace gtest_internal is Google Test's INTERNAL
+// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
+namespace gtest_internal {
+
+// ByRef<T>::type is T if T is a reference; otherwise it's const T&.
+template <typename T>
+struct ByRef { typedef const T& type; };  // NOLINT
+template <typename T>
+struct ByRef<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper for ByRef.
+#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type
+
+// AddRef<T>::type is T if T is a reference; otherwise it's T&.  This
+// is the same as tr1::add_reference<T>::type.
+template <typename T>
+struct AddRef { typedef T& type; };  // NOLINT
+template <typename T>
+struct AddRef<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper for AddRef.
+#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type
+
+// A helper for implementing get<k>().
+template <int k> class Get;
+
+// A helper for implementing tuple_element<k, T>.  kIndexValid is true
+// iff k < the number of fields in tuple type T.
+template <bool kIndexValid, int kIndex, class Tuple>
+struct TupleElement;
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 0, GTEST_10_TUPLE_(T) > {
+  typedef T0 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 1, GTEST_10_TUPLE_(T) > {
+  typedef T1 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 2, GTEST_10_TUPLE_(T) > {
+  typedef T2 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 3, GTEST_10_TUPLE_(T) > {
+  typedef T3 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 4, GTEST_10_TUPLE_(T) > {
+  typedef T4 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 5, GTEST_10_TUPLE_(T) > {
+  typedef T5 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 6, GTEST_10_TUPLE_(T) > {
+  typedef T6 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 7, GTEST_10_TUPLE_(T) > {
+  typedef T7 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 8, GTEST_10_TUPLE_(T) > {
+  typedef T8 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 9, GTEST_10_TUPLE_(T) > {
+  typedef T9 type;
+};
+
+}  // namespace gtest_internal
+
+template <>
+class tuple<> {  // The empty tuple: it stores no fields.
+ public:
+  tuple() {}
+  tuple(const tuple& /* t */)  {}  // Nothing to copy.
+  tuple& operator=(const tuple& /* t */) { return *this; }  // No-op.
+};
+
+template <GTEST_1_TYPENAMES_(T)>
+class GTEST_1_TUPLE_(T) {  // One-element tuple; larger arities follow suit.
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_() {}  // Value-initializes the element.
+
+  explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {}  // From the element value.
+
+  tuple(const tuple& t) : f0_(t.f0_) {}
+
+  template <GTEST_1_TYPENAMES_(U)>  // Converting copy from another 1-tuple.
+  tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_1_TYPENAMES_(U)>  // Converting assignment.
+  tuple& operator=(const GTEST_1_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_1_TYPENAMES_(U)>  // Shared by both operator= overloads.
+  tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    return *this;
+  }
+
+  T0 f0_;
+};
+
+template <GTEST_2_TYPENAMES_(T)>
+class GTEST_2_TUPLE_(T) {  // Two-element tuple; also converts from std::pair.
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0),
+      f1_(f1) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {}
+
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {}
+  template <typename U0, typename U1>  // p.first -> f0_, p.second -> f1_.
+  tuple(const ::std::pair<U0, U1>& p) : f0_(p.first), f1_(p.second) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_2_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+  template <typename U0, typename U1>  // Assigns p.first and p.second.
+  tuple& operator=(const ::std::pair<U0, U1>& p) {
+    f0_ = p.first;
+    f1_ = p.second;
+    return *this;
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+class GTEST_3_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_3_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+class GTEST_4_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {}
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_4_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+class GTEST_5_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3,
+      GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_) {}
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_5_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+class GTEST_6_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_) {}
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_6_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+class GTEST_7_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3), f4_(f4), f5_(f5), f6_(f6) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_7_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+class GTEST_8_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6,
+      GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5), f6_(f6), f7_(f7) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_8_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+class GTEST_9_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+      GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5), f6_(f6), f7_(f7), f8_(f8) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_9_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    f8_ = t.f8_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+  T8 f8_;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+class tuple {  // Primary template (the ten-element case).
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(),
+      f9_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+      GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {}
+
+  template <GTEST_10_TYPENAMES_(U)>  // Converting copy, field by field.
+  tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_),
+      f9_(t.f9_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_10_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_10_TYPENAMES_(U)>  // Shared by both operator= overloads.
+  tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    f8_ = t.f8_;
+    f9_ = t.f9_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+  T8 f8_;
+  T9 f9_;
+};
+
+// 6.1.3.2 Tuple creation functions.
+
+// Known limitations: we don't support passing an
+// std::tr1::reference_wrapper<T> to make_tuple().  And we don't
+// implement tie().
+
+inline tuple<> make_tuple() { return tuple<>(); }
+
+template <GTEST_1_TYPENAMES_(T)>
+inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) {
+  return GTEST_1_TUPLE_(T)(f0);
+}
+
+template <GTEST_2_TYPENAMES_(T)>
+inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) {
+  return GTEST_2_TUPLE_(T)(f0, f1);
+}
+
+template <GTEST_3_TYPENAMES_(T)>
+inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) {
+  return GTEST_3_TUPLE_(T)(f0, f1, f2);
+}
+
+template <GTEST_4_TYPENAMES_(T)>
+inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3) {
+  return GTEST_4_TUPLE_(T)(f0, f1, f2, f3);
+}
+
+template <GTEST_5_TYPENAMES_(T)>
+inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4) {
+  return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4);
+}
+
+template <GTEST_6_TYPENAMES_(T)>
+inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5) {
+  return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5);
+}
+
+template <GTEST_7_TYPENAMES_(T)>
+inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6) {
+  return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6);
+}
+
+template <GTEST_8_TYPENAMES_(T)>
+inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) {
+  return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7);
+}
+
+template <GTEST_9_TYPENAMES_(T)>
+inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+    const T8& f8) {
+  return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8);
+}
+
+template <GTEST_10_TYPENAMES_(T)>
+inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+    const T8& f8, const T9& f9) {
+  return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9);
+}
+
+// 6.1.3.3 Tuple helper classes.
+
+template <typename Tuple> struct tuple_size;
+
+template <GTEST_0_TYPENAMES_(T)>
+struct tuple_size<GTEST_0_TUPLE_(T) > {
+  static const int value = 0;
+};
+
+template <GTEST_1_TYPENAMES_(T)>
+struct tuple_size<GTEST_1_TUPLE_(T) > {
+  static const int value = 1;
+};
+
+template <GTEST_2_TYPENAMES_(T)>
+struct tuple_size<GTEST_2_TUPLE_(T) > {
+  static const int value = 2;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+struct tuple_size<GTEST_3_TUPLE_(T) > {
+  static const int value = 3;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+struct tuple_size<GTEST_4_TUPLE_(T) > {
+  static const int value = 4;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+struct tuple_size<GTEST_5_TUPLE_(T) > {
+  static const int value = 5;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+struct tuple_size<GTEST_6_TUPLE_(T) > {
+  static const int value = 6;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+struct tuple_size<GTEST_7_TUPLE_(T) > {
+  static const int value = 7;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+struct tuple_size<GTEST_8_TUPLE_(T) > {
+  static const int value = 8;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+struct tuple_size<GTEST_9_TUPLE_(T) > {
+  static const int value = 9;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct tuple_size<GTEST_10_TUPLE_(T) > {
+  static const int value = 10;
+};
+
+template <int k, class Tuple>
+struct tuple_element {  // ::type is the type of field k of Tuple.
+  typedef typename gtest_internal::TupleElement<
+      k < (tuple_size<Tuple>::value), k, Tuple>::type type;  // k in range?
+};
+
+#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element<k, Tuple >::type
+
+// 6.1.3.4 Element access.
+
+namespace gtest_internal {
+
+template <>
+class Get<0> {  // Accessor for f0_; Get<1>..Get<9> differ only in the field.
+ public:
+  template <class Tuple>  // Mutable access to t.f0_.
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+  Field(Tuple& t) { return t.f0_; }  // NOLINT
+
+  template <class Tuple>  // Read-only access to t.f0_.
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+  ConstField(const Tuple& t) { return t.f0_; }
+};
+
+template <>
+class Get<1> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+  Field(Tuple& t) { return t.f1_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+  ConstField(const Tuple& t) { return t.f1_; }
+};
+
+template <>
+class Get<2> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+  Field(Tuple& t) { return t.f2_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+  ConstField(const Tuple& t) { return t.f2_; }
+};
+
+template <>
+class Get<3> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+  Field(Tuple& t) { return t.f3_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+  ConstField(const Tuple& t) { return t.f3_; }
+};
+
+template <>
+class Get<4> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+  Field(Tuple& t) { return t.f4_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+  ConstField(const Tuple& t) { return t.f4_; }
+};
+
+template <>
+class Get<5> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+  Field(Tuple& t) { return t.f5_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+  ConstField(const Tuple& t) { return t.f5_; }
+};
+
+template <>
+class Get<6> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+  Field(Tuple& t) { return t.f6_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+  ConstField(const Tuple& t) { return t.f6_; }
+};
+
+template <>
+class Get<7> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+  Field(Tuple& t) { return t.f7_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+  ConstField(const Tuple& t) { return t.f7_; }
+};
+
+template <>
+class Get<8> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+  Field(Tuple& t) { return t.f8_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+  ConstField(const Tuple& t) { return t.f8_; }
+};
+
+template <>
+class Get<9> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+  Field(Tuple& t) { return t.f9_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+  ConstField(const Tuple& t) { return t.f9_; }
+};
+
+}  // namespace gtest_internal
+
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T)))
+get(GTEST_10_TUPLE_(T)& t) {
+  return gtest_internal::Get<k>::Field(t);
+}
+
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k,  GTEST_10_TUPLE_(T)))
+get(const GTEST_10_TUPLE_(T)& t) {
+  return gtest_internal::Get<k>::ConstField(t);
+}
+
+// 6.1.3.5 Relational operators
+
+// We only implement == and !=, as we don't have a need for the rest yet.
+
+namespace gtest_internal {
+
+// SameSizeTuplePrefixComparator<k, k>::Eq(t1, t2) returns true if the
+// first k fields of t1 equal the first k fields of t2.
+// SameSizeTuplePrefixComparator<k1, k2> would be a compiler error if
+// k1 != k2.
+template <int kSize1, int kSize2>
+struct SameSizeTuplePrefixComparator;  // Left undefined for unequal sizes.
+
+template <>
+struct SameSizeTuplePrefixComparator<0, 0> {  // Base case: empty prefix.
+  template <class Tuple1, class Tuple2>
+  static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) {
+    return true;
+  }
+};
+
+template <int k>
+struct SameSizeTuplePrefixComparator<k, k> {  // Recursive case.
+  template <class Tuple1, class Tuple2>
+  static bool Eq(const Tuple1& t1, const Tuple2& t2) {  // k-1 prefix, then k-1.
+    return SameSizeTuplePrefixComparator<k - 1, k - 1>::Eq(t1, t2) &&
+        ::std::tr1::get<k - 1>(t1) == ::std::tr1::get<k - 1>(t2);
+  }
+};
+
+}  // namespace gtest_internal
+
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator==(const GTEST_10_TUPLE_(T)& t,
+                       const GTEST_10_TUPLE_(U)& u) {  // Field-wise equality.
+  return gtest_internal::SameSizeTuplePrefixComparator<
+      tuple_size<GTEST_10_TUPLE_(T) >::value,
+      tuple_size<GTEST_10_TUPLE_(U) >::value>::Eq(t, u);
+}
+
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>  // Negates ==.
+inline bool operator!=(const GTEST_10_TUPLE_(T)& t,
+                       const GTEST_10_TUPLE_(U)& u) { return !(t == u); }
+
+// 6.1.4 Pairs.
+// Unimplemented.
+
+}  // namespace tr1
+}  // namespace std
+
+#undef GTEST_0_TUPLE_
+#undef GTEST_1_TUPLE_
+#undef GTEST_2_TUPLE_
+#undef GTEST_3_TUPLE_
+#undef GTEST_4_TUPLE_
+#undef GTEST_5_TUPLE_
+#undef GTEST_6_TUPLE_
+#undef GTEST_7_TUPLE_
+#undef GTEST_8_TUPLE_
+#undef GTEST_9_TUPLE_
+#undef GTEST_10_TUPLE_
+
+#undef GTEST_0_TYPENAMES_
+#undef GTEST_1_TYPENAMES_
+#undef GTEST_2_TYPENAMES_
+#undef GTEST_3_TYPENAMES_
+#undef GTEST_4_TYPENAMES_
+#undef GTEST_5_TYPENAMES_
+#undef GTEST_6_TYPENAMES_
+#undef GTEST_7_TYPENAMES_
+#undef GTEST_8_TYPENAMES_
+#undef GTEST_9_TYPENAMES_
+#undef GTEST_10_TYPENAMES_
+
+#undef GTEST_DECLARE_TUPLE_AS_FRIEND_
+#undef GTEST_BY_REF_
+#undef GTEST_ADD_REF_
+#undef GTEST_TUPLE_ELEMENT_
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+# elif GTEST_ENV_HAS_STD_TUPLE_
+#  include <tuple>
+// C++11 puts its tuple into the ::std namespace rather than
+// ::std::tr1.  gtest expects tuple to live in ::std::tr1, so put it there.
+// This causes undefined behavior, but supported compilers react in
+// the way we intend.
+namespace std {
+namespace tr1 {
+using ::std::get;
+using ::std::make_tuple;
+using ::std::tuple;
+using ::std::tuple_element;
+using ::std::tuple_size;
+}  // namespace tr1
+}  // namespace std
+
+# elif GTEST_OS_SYMBIAN
+
+// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to
+// use STLport's tuple implementation, which unfortunately doesn't
+// work as the copy of STLport distributed with Symbian is incomplete.
+// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to
+// use its own tuple implementation.
+#  ifdef BOOST_HAS_TR1_TUPLE
+#   undef BOOST_HAS_TR1_TUPLE
+#  endif  // BOOST_HAS_TR1_TUPLE
+
+// This prevents <boost/tr1/detail/config.hpp>, which defines
+// BOOST_HAS_TR1_TUPLE, from being #included by Boost's <tuple>.
+#  define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED
+#  include <tuple>
+
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000)
+// GCC 4.0+ implements tr1/tuple in the <tr1/tuple> header.  This does
+// not conform to the TR1 spec, which requires the header to be <tuple>.
+
+#  if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+// Until version 4.3.2, gcc has a bug that causes <tr1/functional>,
+// which is #included by <tr1/tuple>, to not compile when RTTI is
+// disabled.  _TR1_FUNCTIONAL is the header guard for
+// <tr1/functional>.  Hence the following #define is a hack to prevent
+// <tr1/functional> from being included.
+#   define _TR1_FUNCTIONAL 1
+#   include <tr1/tuple>
+#   undef _TR1_FUNCTIONAL  // Allows the user to #include
+                        // <tr1/functional> if they choose to.
+#  else
+#   include <tr1/tuple>  // NOLINT
+#  endif  // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+
+# else
+// If the compiler is not GCC 4.0+, we assume the user is using a
+// spec-conforming TR1 implementation.
+#  include <tuple>  // NOLINT
+# endif  // GTEST_USE_OWN_TR1_TUPLE
+
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Determines whether clone(2) is supported.
+// Usually it will only be available on Linux, excluding
+// Linux on the Itanium architecture.
+// Also see http://linux.die.net/man/2/clone.
+#ifndef GTEST_HAS_CLONE
+// The user didn't tell us, so we need to figure it out.
+
+# if GTEST_OS_LINUX && !defined(__ia64__)
+#  if GTEST_OS_LINUX_ANDROID
+// On Android, clone() is only available on ARM starting with Gingerbread.
+#    if defined(__arm__) && __ANDROID_API__ >= 9
+#     define GTEST_HAS_CLONE 1
+#    else
+#     define GTEST_HAS_CLONE 0
+#    endif  // defined(__arm__) && __ANDROID_API__ >= 9
+#  else
+#   define GTEST_HAS_CLONE 1
+#  endif  // GTEST_OS_LINUX_ANDROID
+# else
+#  define GTEST_HAS_CLONE 0
+# endif  // GTEST_OS_LINUX && !defined(__ia64__)
+
+#endif  // GTEST_HAS_CLONE
+
+// Determines whether to support stream redirection. This is used to test
+// output correctness and to implement death tests.
+#ifndef GTEST_HAS_STREAM_REDIRECTION
+// By default, we assume that stream redirection is supported on all
+// platforms except known mobile ones.
+# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN
+#  define GTEST_HAS_STREAM_REDIRECTION 0
+# else
+#  define GTEST_HAS_STREAM_REDIRECTION 1
+# endif  // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+// Determines whether to support death tests.
+// Google Test does not support death tests for VC 7.1 and earlier as
+// abort() in a VC 7.1 application compiled as GUI in debug config
+// pops up a dialog window that cannot be suppressed programmatically.
+#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
+     (GTEST_OS_MAC && !GTEST_OS_IOS) || GTEST_OS_IOS_SIMULATOR || \
+     (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \
+     GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \
+     GTEST_OS_OPENBSD || GTEST_OS_QNX)
+# define GTEST_HAS_DEATH_TEST 1
+# include <vector>  // NOLINT
+#endif
+
+// We don't support MSVC 7.1 with exceptions disabled now.  Therefore
+// all the compilers we care about are adequate for supporting
+// value-parameterized tests.
+#define GTEST_HAS_PARAM_TEST 1
+
+// Determines whether to support type-driven tests.
+
+// Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0,
+// Sun Pro CC, IBM Visual Age, and HP aCC support.
+#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \
+    defined(__IBMCPP__) || defined(__HP_aCC)
+# define GTEST_HAS_TYPED_TEST 1
+# define GTEST_HAS_TYPED_TEST_P 1
+#endif
+
+// Determines whether to support Combine(). This only makes sense when
+// value-parameterized tests are enabled.  The implementation doesn't
+// work on Sun Studio since it doesn't understand templated conversion
+// operators.
+#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC)
+# define GTEST_HAS_COMBINE 1
+#endif
+
+// Determines whether the system compiler uses UTF-16 for encoding wide strings.
+#define GTEST_WIDE_STRING_USES_UTF16_ \
+    (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX)
+
+// Determines whether test results can be streamed to a socket.
+#if GTEST_OS_LINUX
+# define GTEST_CAN_STREAM_RESULTS_ 1
+#endif
+
+// Defines some utility macros.
+
+// The GNU compiler emits a warning if nested "if" statements are followed by
+// an "else" statement and braces are not used to explicitly disambiguate the
+// "else" binding.  This leads to problems with code like:
+//
+//   if (gate)
+//     ASSERT_*(condition) << "Some message";
+//
+// The "switch (0) case 0:" idiom is used to suppress this.
+#ifdef __INTEL_COMPILER
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#else
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default:  // NOLINT
+#endif
+
+// Use this annotation at the end of a struct/class definition to
+// prevent the compiler from optimizing away instances that are never
+// used.  This is useful when all interesting logic happens inside the
+// c'tor and / or d'tor.  Example:
+//
+//   struct Foo {
+//     Foo() { ... }
+//   } GTEST_ATTRIBUTE_UNUSED_;
+//
+// Also use it after a variable or parameter declaration to tell the
+// compiler the variable/parameter does not have to be used.
+#if defined(__GNUC__) && !defined(COMPILER_ICC)
+# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+#else
+# define GTEST_ATTRIBUTE_UNUSED_
+#endif
+
+// A macro to disallow operator=
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_ASSIGN_(type)\
+  void operator=(type const &)
+
+// A macro to disallow copy constructor and operator=
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\
+  type(type const &);\
+  GTEST_DISALLOW_ASSIGN_(type)
+
+// Tell the compiler to warn about unused return values for functions declared
+// with this macro.  The macro should be used on function declarations
+// following the argument list:
+//
+//   Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
+#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC)
+# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
+#else
+# define GTEST_MUST_USE_RESULT_
+#endif  // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC
+
+// Determine whether the compiler supports Microsoft's Structured Exception
+// Handling.  This is supported by several Windows compilers but generally
+// does not exist on any other system.
+#ifndef GTEST_HAS_SEH
+// The user didn't tell us, so we need to figure it out.
+
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+// These two compilers are known to support SEH.
+#  define GTEST_HAS_SEH 1
+# else
+// Assume no SEH.
+#  define GTEST_HAS_SEH 0
+# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+
+#endif  // GTEST_HAS_SEH
+
+#ifdef _MSC_VER  // DLL import/export decoration is only applied under MSVC.
+
+# if GTEST_LINKED_AS_SHARED_LIBRARY  // Symbols are imported from the DLL.
+#  define GTEST_API_ __declspec(dllimport)
+# elif GTEST_CREATE_SHARED_LIBRARY  // Symbols are exported from the DLL.
+#  define GTEST_API_ __declspec(dllexport)
+# endif
+
+#endif  // _MSC_VER
+
+#ifndef GTEST_API_  // Fallback: GTEST_API_ expands to nothing.
+# define GTEST_API_
+#endif
+
+#ifdef __GNUC__
+// Ask the compiler to never inline a given function.
+# define GTEST_NO_INLINE_ __attribute__((noinline))
+#else
+# define GTEST_NO_INLINE_
+#endif  // __GNUC__
+
+// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
+#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION)
+# define GTEST_HAS_CXXABI_H_ 1
+#else
+# define GTEST_HAS_CXXABI_H_ 0
+#endif  // __GLIBCXX__ || _LIBCPP_VERSION
+
+namespace testing {
+
+class Message;
+
+namespace internal {
+
+// A secret type that Google Test users don't know about.  It has no
+// definition on purpose.  Therefore it's impossible to create a
+// Secret object, which is what we want.
+class Secret;
+
+// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time
+// expression is true. For example, you could use it to verify the
+// size of a static array:
+//
+//   GTEST_COMPILE_ASSERT_(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES,
+//                         content_type_names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+//   GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+
+template <bool>
+struct CompileAssert {  // Empty on purpose; see GTEST_COMPILE_ASSERT_.
+};
+
+#define GTEST_COMPILE_ASSERT_(expr, msg) \
+  typedef ::testing::internal::CompileAssert<(static_cast<bool>(expr))> \
+      msg[static_cast<bool>(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_
+
+// Implementation details of GTEST_COMPILE_ASSERT_:
+//
+// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1
+//   elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+//    #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+//   does not work, as gcc supports variable-length arrays whose sizes
+//   are determined at run-time (this is gcc's extension and not part
+//   of the C++ standard).  As a result, gcc fails to reject the
+//   following code with the simple definition:
+//
+//     int foo;
+//     GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is
+//                                      // not a compile-time constant.
+//
+// - By using the type CompileAssert<(bool(expr))>, we ensure that
+//   expr is a compile-time constant.  (Template arguments must be
+//   determined at compile-time.)
+//
+// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
+//   to work around a bug in gcc 3.4.4 and 4.0.1.  If we had written
+//
+//     CompileAssert<bool(expr)>
+//
+//   instead, these compilers will refuse to compile
+//
+//     GTEST_COMPILE_ASSERT_(5 > 0, some_message);
+//
+//   (They seem to think the ">" in "5 > 0" marks the end of the
+//   template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+//     ((expr) ? 1 : -1).
+//
+//   This is to avoid running into a bug in MS VC 7.1, which
+//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
+
+// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h.
+//
+// The primary template is declared, but intentionally left undefined.
+template <typename T1, typename T2>
+struct StaticAssertTypeEqHelper;
+
+template <typename T>
+struct StaticAssertTypeEqHelper<T, T> {};  // Defined only for equal types.
+
+#if GTEST_HAS_GLOBAL_STRING
+typedef ::string string;
+#else
+typedef ::std::string string;
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+typedef ::wstring wstring;
+#elif GTEST_HAS_STD_WSTRING
+typedef ::std::wstring wstring;
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// A helper for suppressing warnings on constant condition.  It just
+// returns 'condition'.
+GTEST_API_ bool IsTrue(bool condition);
+
+// Defines scoped_ptr.
+
+// This implementation of scoped_ptr is PARTIAL - it only contains
+// enough stuff to satisfy Google Test's need.
+template <typename T>
+class scoped_ptr {
+ public:
+  typedef T element_type;
+
+  explicit scoped_ptr(T* p = NULL) : ptr_(p) {}
+  ~scoped_ptr() { reset(); }  // Deletes the held object, if any.
+
+  T& operator*() const { return *ptr_; }
+  T* operator->() const { return ptr_; }
+  T* get() const { return ptr_; }
+
+  T* release() {  // Hands the pointer to the caller without deleting it.
+    T* const ptr = ptr_;
+    ptr_ = NULL;
+    return ptr;
+  }
+
+  void reset(T* p = NULL) {  // Deletes the old object, then stores p.
+    if (p != ptr_) {  // Resetting to the currently held pointer is a no-op.
+      if (IsTrue(sizeof(T) > 0)) {  // Makes sure T is a complete type.
+        delete ptr_;
+      }
+      ptr_ = p;
+    }
+  }
+
+ private:
+  T* ptr_;  // The owned object; NULL when nothing is held.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr);
+};
+
+// Defines RE.
+
+// A simple C++ wrapper for <regex.h>.  It uses the POSIX Extended
+// Regular Expression syntax.
+class GTEST_API_ RE {
+ public:
+  // A copy constructor is required by the Standard to initialize object
+  // references from r-values.  The copy is re-initialized from other's pattern.
+  RE(const RE& other) { Init(other.pattern()); }
+
+  // Constructs an RE from a string.
+  RE(const ::std::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+#if GTEST_HAS_GLOBAL_STRING
+
+  RE(const ::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+  RE(const char* regex) { Init(regex); }  // NOLINT
+  ~RE();
+
+  // Returns the string representation of the regex.
+  const char* pattern() const { return pattern_; }
+
+  // FullMatch(str, re) returns true iff regular expression re matches
+  // the entire str.
+  // PartialMatch(str, re) returns true iff regular expression re
+  // matches a substring of str (including str itself).
+  //
+  // TODO(wan@google.com): make FullMatch() and PartialMatch() work
+  // when str contains NUL characters.
+  static bool FullMatch(const ::std::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::std::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+#if GTEST_HAS_GLOBAL_STRING
+
+  static bool FullMatch(const ::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+  static bool FullMatch(const char* str, const RE& re);
+  static bool PartialMatch(const char* str, const RE& re);
+
+ private:
+  void Init(const char* regex);  // Shared by every constructor above.
+
+  // We use a const char* instead of an std::string, as Google Test used to be
+  // used where std::string is not available.  TODO(wan@google.com): change to
+  // std::string.
+  const char* pattern_;
+  bool is_valid_;
+
+#if GTEST_USES_POSIX_RE
+
+  regex_t full_regex_;     // For FullMatch().
+  regex_t partial_regex_;  // For PartialMatch().
+
+#else  // GTEST_USES_SIMPLE_RE
+
+  const char* full_pattern_;  // For FullMatch();
+
+#endif
+
+  GTEST_DISALLOW_ASSIGN_(RE);  // Copyable (see above) but not assignable.
+};
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+                                                               int line);
+
+// Defines logging utilities:
+//   GTEST_LOG_(severity) - logs messages at the specified severity level. The
+//                          message itself is streamed into the macro.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+
+enum GTestLogSeverity {
+  GTEST_INFO,  // GTEST_LOG_(severity) selects a value by pasting GTEST_##severity.
+  GTEST_WARNING,
+  GTEST_ERROR,
+  GTEST_FATAL  // ~GTestLog is documented to abort the program at this level.
+};
+
+// Formats log entry severity, provides a stream object for streaming the
+// log message, and terminates the message with a newline when going out of
+// scope.
+class GTEST_API_ GTestLog {
+ public:
+  GTestLog(GTestLogSeverity severity, const char* file, int line);
+
+  // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+  ~GTestLog();
+
+  ::std::ostream& GetStream() { return ::std::cerr; }  // Always stderr.
+
+ private:
+  const GTestLogSeverity severity_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+};
+
+#define GTEST_LOG_(severity) \
+    ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+                                  __FILE__, __LINE__).GetStream()
+
+inline void LogToStderr() {}  // No-op.
+inline void FlushInfoLog() { fflush(NULL); }  // NULL: flush all output streams.
+
+// INTERNAL IMPLEMENTATION - DO NOT USE.
+//
+// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
+// is not satisfied.
+//  Synopsis:
+//    GTEST_CHECK_(boolean_condition);
+//     or
+//    GTEST_CHECK_(boolean_condition) << "Additional message";
+//
+//    This checks the condition and if the condition is not satisfied
+//    it prints message about the condition violation, including the
+//    condition itself, plus additional message streamed into it, if any,
+//    and then it aborts the program. It aborts the program irrespective of
+//    whether it is built in the debug mode or not.
+#define GTEST_CHECK_(condition) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::IsTrue(condition)) \
+      ; \
+    else \
+      GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+
+// An all-mode assert to verify that the given POSIX-style function
+// call returns 0 (indicating success).  Known limitation: this
+// doesn't expand to a balanced 'if' statement, so enclose the macro
+// in {} if you need to use it as the only statement in an 'if'
+// branch.
+#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
+  if (const int gtest_error = (posix_call)) \
+    GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
+                      << gtest_error
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Use ImplicitCast_ as a safe version of static_cast for upcasting in
+// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
+// const Foo*).  When you use ImplicitCast_, the compiler checks that
+// the cast is safe.  Such explicit ImplicitCast_s are necessary in
+// surprisingly many situations where C++ demands an exact type match
+// instead of an argument type convertible to a target type.
+//
+// The syntax for using ImplicitCast_ is the same as for static_cast:
+//
+//   ImplicitCast_<ToType>(expr)
+//
+// ImplicitCast_ would have been part of the C++ standard library,
+// but the proposal was submitted too late.  It will probably make
+// its way into the language in the future.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., implicit_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template<typename To>
+inline To ImplicitCast_(To x) { return x; }
+
+// When you upcast (that is, cast a pointer from type Foo to type
+// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
+// always succeed.  When you downcast (that is, cast a pointer from
+// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
+// how do you know the pointer is really of type SubclassOfFoo?  It
+// could be a bare Foo, or of type DifferentSubclassOfFoo.  Thus,
+// when you downcast, you should use this macro.  In debug mode, we
+// use dynamic_cast<> to double-check the downcast is legal (we die
+// if it's not).  In normal mode, we do the efficient static_cast<>
+// instead.  Thus, it's important to test in debug mode to make sure
+// the cast is legal!
+//    This is the only place in the code we should use dynamic_cast<>.
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to
+// do RTTI (eg code like this:
+//    if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
+//    if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
+// You should design the code some other way not to need this.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., down_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template<typename To, typename From>  // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) {  // so we only accept pointers
+  // Ensures that To is a sub-type of From *.  This test is here only
+  // for compile-time type checking, and has no overhead in an
+  // optimized build at run-time, as it will be optimized away
+  // completely.
+  if (false) {
+    const To to = NULL;
+    ::testing::internal::ImplicitCast_<From*>(to);
+  }
+
+#if GTEST_HAS_RTTI
+  // Run-time check of the downcast; GTEST_CHECK_ aborts in every build mode.
+  GTEST_CHECK_(f == NULL || dynamic_cast<To>(f) != NULL);
+#endif
+  return static_cast<To>(f);
+}
+
+// Downcasts the pointer of type Base to Derived.
+// Derived must be a subclass of Base. The parameter MUST
+// point to a class of type Derived, not any subclass of it.
+// When RTTI is available, the function performs a runtime
+// check to enforce this.
+template <class Derived, class Base>
+Derived* CheckedDowncastToActualType(Base* base) {
+#if GTEST_HAS_RTTI
+  GTEST_CHECK_(typeid(*base) == typeid(Derived));  // Exact dynamic type only.
+  return dynamic_cast<Derived*>(base);  // NOLINT
+#else
+  return static_cast<Derived*>(base);  // Poor man's downcast.
+#endif
+}
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Defines the stderr capturer:
+//   CaptureStdout     - starts capturing stdout.
+//   GetCapturedStdout - stops capturing stdout and returns the captured string.
+//   CaptureStderr     - starts capturing stderr.
+//   GetCapturedStderr - stops capturing stderr and returns the captured string.
+//
+GTEST_API_ void CaptureStdout();
+GTEST_API_ std::string GetCapturedStdout();
+GTEST_API_ void CaptureStderr();
+GTEST_API_ std::string GetCapturedStderr();
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+
+#if GTEST_HAS_DEATH_TEST
+
+const ::std::vector<testing::internal::string>& GetInjectableArgvs();
+void SetInjectableArgvs(const ::std::vector<testing::internal::string>*
+                             new_argvs);
+
+// A copy of all command line arguments.  Set by InitGoogleTest().
+extern ::std::vector<testing::internal::string> g_argvs;
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Defines synchronization primitives.
+
+#if GTEST_HAS_PTHREAD
+
+// Sleeps for (roughly) n milli-seconds.  This function is only for
+// testing Google Test's own constructs.  Don't use it in user tests,
+// either directly or indirectly.
+inline void SleepMilliseconds(int n) {
+  const timespec time = {
+    n / 1000,                    // Whole seconds.
+    (n % 1000) * 1000L * 1000L,  // Remaining ms as ns; stays below 1e9.
+  };
+  nanosleep(&time, NULL);
+}
+
+// Allows a controller thread to pause execution of newly created
+// threads until notified.  Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
+class Notification {
+ public:
+  Notification() : notified_(false) {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+  }
+  ~Notification() {
+    pthread_mutex_destroy(&mutex_);
+  }
+
+  // Sets the notified flag under the mutex, releasing every waiter. Must
+  // be called from the controller thread.
+  void Notify() {
+    pthread_mutex_lock(&mutex_);
+    notified_ = true;
+    pthread_mutex_unlock(&mutex_);
+  }
+
+  // Polls the notified flag every 10 ms until it is set. Must be called
+  // from a test thread.
+  void WaitForNotification() {
+    bool done = false;
+    while (!done) {
+      pthread_mutex_lock(&mutex_);
+      done = notified_;
+      pthread_mutex_unlock(&mutex_);
+      if (!done)
+        SleepMilliseconds(10);
+    }
+  }
+
+ private:
+  pthread_mutex_t mutex_;
+  bool notified_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+};
+
+// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
+// Consequently, it cannot select a correct instantiation of ThreadWithParam
+// in order to call its Run(). Introducing ThreadWithParamBase as a
+// non-templated base class for ThreadWithParam allows us to bypass this
+// problem.
+class ThreadWithParamBase {
+ public:
+  virtual ~ThreadWithParamBase() {}
+  virtual void Run() = 0;  // Invoked by ThreadFuncWithCLinkage().
+};
+
+// pthread_create() accepts a pointer to a function type with the C linkage.
+// According to the Standard (7.5/1), function types with different linkages
+// are different even if they are otherwise identical.  Some compilers (for
+// example, SunStudio) treat them as different types.  Since class methods
+// cannot be defined with C-linkage we need to define a free C-function to
+// pass into pthread_create().
+extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
+  static_cast<ThreadWithParamBase*>(thread)->Run();  // Virtual dispatch.
+  return NULL;
+}
+
+// Helper class for testing Google Test's multi-threading constructs.
+// To use it, write:
+//
+//   void ThreadFunc(int param) { /* Do things with param */ }
+//   Notification thread_can_start;
+//   ...
+//   // The thread_can_start parameter is optional; you can supply NULL.
+//   ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);
+//   thread_can_start.Notify();
+//
+// These classes are only for testing Google Test's own constructs. Do
+// not use them in user tests, either directly or indirectly.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+  typedef void (*UserThreadFunc)(T);
+
+  ThreadWithParam(
+      UserThreadFunc func, T param, Notification* thread_can_start)
+      : func_(func),
+        param_(param),
+        thread_can_start_(thread_can_start),
+        finished_(false) {
+    ThreadWithParamBase* const base = this;
+    // The thread can be created only after all fields except thread_
+    // have been initialized.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base));
+  }
+  ~ThreadWithParam() { Join(); }  // Waits for the thread unless already joined.
+
+  void Join() {
+    if (!finished_) {
+      GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0));
+      finished_ = true;  // Makes any later Join() call a no-op.
+    }
+  }
+
+  virtual void Run() {
+    if (thread_can_start_ != NULL)
+      thread_can_start_->WaitForNotification();  // Blocks until notified.
+    func_(param_);
+  }
+
+ private:
+  const UserThreadFunc func_;  // User-supplied thread function.
+  const T param_;  // User-supplied parameter to the thread function.
+  // When non-NULL, used to block execution until the controller thread
+  // notifies.
+  Notification* const thread_can_start_;
+  bool finished_;  // true iff we know that the thread function has finished.
+  pthread_t thread_;  // The native thread object.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+};
+
+// MutexBase and Mutex implement mutex on pthreads-based platforms. They
+// are used in conjunction with class MutexLock:
+//
+//   Mutex mutex;
+//   ...
+//   MutexLock lock(&mutex);  // Acquires the mutex and releases it at the end
+//                            // of the current scope.
+//
+// MutexBase implements behavior for both statically and dynamically
+// allocated mutexes.  Do not use MutexBase directly.  Instead, write
+// the following to define a static mutex:
+//
+//   GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
+//
+// You can forward declare a static mutex like this:
+//
+//   GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex);
+//
+// To create a dynamic mutex, just define an object of type Mutex.
+class MutexBase {
+ public:
+  // Acquires this mutex.
+  void Lock() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_));
+    owner_ = pthread_self();  // Read back by AssertHeld().
+    has_owner_ = true;
+  }
+
+  // Releases this mutex.
+  void Unlock() {
+    // Since the lock is being released the owner_ field should no longer be
+    // considered valid. We don't protect writing to has_owner_ here, as it's
+    // the caller's responsibility to ensure that the current thread holds the
+    // mutex when this is called.
+    has_owner_ = false;
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_));
+  }
+
+  // Does nothing if the current thread holds the mutex. Otherwise, crashes
+  // with high probability.
+  void AssertHeld() const {
+    GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self()))
+        << "The current thread is not holding the mutex @" << this;
+  }
+
+  // A static mutex may be used before main() is entered.  It may even
+  // be used before the dynamic initialization stage.  Therefore we
+  // must be able to initialize a static mutex object at link time.
+  // This means MutexBase has to be a POD and its member variables
+  // have to be public.
+ public:
+  pthread_mutex_t mutex_;  // The underlying pthread mutex.
+  // has_owner_ indicates whether the owner_ field below contains a valid thread
+  // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All
+  // accesses to the owner_ field should be protected by a check of this field.
+  // An alternative might be to memset() owner_ to all zeros, but there's no
+  // guarantee that a zero'd pthread_t is necessarily invalid or even different
+  // from pthread_self().
+  bool has_owner_;
+  pthread_t owner_;  // The thread holding the mutex.
+};
+
+// Forward-declares a static mutex.
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+    extern ::testing::internal::MutexBase mutex
+
+// Defines and statically (i.e. at link time) initializes a static mutex.
+// The initialization list here does not explicitly initialize each field,
+// instead relying on default initialization for the unspecified fields. In
+// particular, the owner_ field (a pthread_t) is not explicitly initialized.
+// This allows initialization to work whether pthread_t is a scalar or struct.
+// The flag -Wmissing-field-initializers must not be specified for this to work.
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+    ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false }
+
+// The Mutex class can only be used for mutexes created at runtime. It
+// shares its API with MutexBase otherwise.
+class Mutex : public MutexBase {
+ public:
+  Mutex() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+    has_owner_ = false;  // A freshly initialized mutex has no owner.
+  }
+  ~Mutex() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+};
+
+// We cannot name this class MutexLock as the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms.  Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+  explicit GTestMutexLock(MutexBase* mutex)
+      : mutex_(mutex) { mutex_->Lock(); }  // Acquires on construction.
+
+  ~GTestMutexLock() { mutex_->Unlock(); }  // Releases on destruction.
+
+ private:
+  MutexBase* const mutex_;  // Not owned; must be non-NULL.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Helpers for ThreadLocal.
+
+// pthread_key_create() requires DeleteThreadLocalValue() to have
+// C-linkage.  Therefore it cannot be templatized to access
+// ThreadLocal<T>.  Hence the need for class
+// ThreadLocalValueHolderBase.
+class ThreadLocalValueHolderBase {
+ public:
+  virtual ~ThreadLocalValueHolderBase() {}
+};
+
+// Called by pthread to delete thread-local data stored by
+// pthread_setspecific().  Also called directly by ~ThreadLocal().
+extern "C" inline void DeleteThreadLocalValue(void* value_holder) {
+  delete static_cast<ThreadLocalValueHolderBase*>(value_holder);
+}
+
+// Implements thread-local storage on pthreads-based systems.
+//
+//   // Thread 1
+//   ThreadLocal<int> tl(100);  // 100 is the default value for each thread.
+//
+//   // Thread 2
+//   tl.set(150);  // Changes the value for thread 2 only.
+//   EXPECT_EQ(150, tl.get());
+//
+//   // Thread 1
+//   EXPECT_EQ(100, tl.get());  // In thread 1, tl has the original value.
+//   tl.set(200);
+//   EXPECT_EQ(200, tl.get());
+//
+// The template type argument T must have a public copy constructor.
+// In addition, the default ThreadLocal constructor requires T to have
+// a public default constructor.
+//
+// An object managed for a thread by a ThreadLocal instance is deleted
+// when the thread exits.  Or, if the ThreadLocal instance dies in
+// that thread, when the ThreadLocal dies.  It's the user's
+// responsibility to ensure that all other threads using a ThreadLocal
+// have exited when it dies, or the per-thread objects for those
+// threads will not be deleted.
+//
+// Google Test only uses global ThreadLocal objects.  That means they
+// will die after main() has returned.  Therefore, no per-thread
+// object managed by Google Test will be leaked as long as all threads
+// using Google Test have exited when main() returns.
+template <typename T>
+class ThreadLocal {
+ public:
+  ThreadLocal() : key_(CreateKey()),
+                  default_() {}
+  explicit ThreadLocal(const T& value) : key_(CreateKey()),
+                                         default_(value) {}
+
+  ~ThreadLocal() {
+    // Destroys the managed object for the current thread, if any.
+    DeleteThreadLocalValue(pthread_getspecific(key_));
+
+    // Releases resources associated with the key.  This will *not*
+    // delete managed objects for other threads.
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_));
+  }
+
+  T* pointer() { return GetOrCreateValue(); }  // Value is created lazily.
+  const T* pointer() const { return GetOrCreateValue(); }
+  const T& get() const { return *pointer(); }
+  void set(const T& value) { *pointer() = value; }  // Calling thread only.
+
+ private:
+  // Holds a value of type T.
+  class ValueHolder : public ThreadLocalValueHolderBase {
+   public:
+    explicit ValueHolder(const T& value) : value_(value) {}
+
+    T* pointer() { return &value_; }
+
+   private:
+    T value_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+  };
+
+  static pthread_key_t CreateKey() {
+    pthread_key_t key;
+    // When a thread exits, DeleteThreadLocalValue() will be called on
+    // the object managed for that thread.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_key_create(&key, &DeleteThreadLocalValue));
+    return key;
+  }
+
+  T* GetOrCreateValue() const {
+    ThreadLocalValueHolderBase* const holder =
+        static_cast<ThreadLocalValueHolderBase*>(pthread_getspecific(key_));
+    if (holder != NULL) {  // The calling thread already has a value.
+      return CheckedDowncastToActualType<ValueHolder>(holder)->pointer();
+    }
+    // First access from this thread: register a fresh copy of default_.
+    ValueHolder* const new_holder = new ValueHolder(default_);
+    ThreadLocalValueHolderBase* const holder_base = new_holder;
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base));
+    return new_holder->pointer();
+  }
+
+  // A key pthreads uses for looking up per-thread values.
+  const pthread_key_t key_;
+  const T default_;  // The default value for each thread.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+};
+
+# define GTEST_IS_THREADSAFE 1
+
+#else  // GTEST_HAS_PTHREAD
+
+// A dummy implementation of synchronization primitives (mutex, lock,
+// and thread-local variable).  Necessary for compiling Google Test where
+// mutex is not supported - using Google Test in multiple threads is not
+// supported on such platforms.
+
+class Mutex {
+ public:
+  Mutex() {}
+  void Lock() {}  // No-op.
+  void Unlock() {}  // No-op.
+  void AssertHeld() const {}  // No-op: never fails.
+};
+
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+  extern ::testing::internal::Mutex mutex
+
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+
+class GTestMutexLock {
+ public:
+  explicit GTestMutexLock(Mutex*) {}  // NOLINT
+};
+
+typedef GTestMutexLock MutexLock;
+
+template <typename T>
+class ThreadLocal {
+ public:
+  ThreadLocal() : value_() {}
+  explicit ThreadLocal(const T& value) : value_(value) {}
+  T* pointer() { return &value_; }
+  const T* pointer() const { return &value_; }
+  const T& get() const { return value_; }
+  void set(const T& value) { value_ = value; }
+ private:
+  T value_;  // One value per object; there is no per-thread storage here.
+};
+
+// The above synchronization primitives have dummy implementations.
+// Therefore Google Test is not thread-safe.
+# define GTEST_IS_THREADSAFE 0
+
+#endif  // GTEST_HAS_PTHREAD
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+GTEST_API_ size_t GetThreadCount();
+
+// Passing non-POD classes through ellipsis (...) crashes the ARM
+// compiler and generates a warning in Sun Studio.  The Nokia Symbian
+// and the IBM XL C/C++ compiler try to instantiate a copy constructor
+// for objects passed through ellipsis (...), failing for uncopyable
+// objects.  We define this to ensure that only POD is passed through
+// ellipsis on these systems.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC)
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_ELLIPSIS_NEEDS_POD_ 1
+#else
+# define GTEST_CAN_COMPARE_NULL 1
+#endif
+
+// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between
+// const T& and const T* in a function template.  These compilers
+// _can_ decide between class template specializations for T and T*,
+// so a tr1::type_traits-like is_pointer works.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__)
+# define GTEST_NEEDS_IS_POINTER_ 1
+#endif
+
+template <bool bool_value>
+struct bool_constant {
+  typedef bool_constant<bool_value> type;
+  static const bool value = bool_value;  // Defined right after the struct.
+};
+template <bool bool_value> const bool bool_constant<bool_value>::value;
+
+typedef bool_constant<false> false_type;
+typedef bool_constant<true> true_type;
+
+template <typename T>
+struct is_pointer : public false_type {};
+
+template <typename T>
+struct is_pointer<T*> : public true_type {};
+
+template <typename Iterator>
+struct IteratorTraits {
+  typedef typename Iterator::value_type value_type;
+};
+
+template <typename T>
+struct IteratorTraits<T*> {
+  typedef T value_type;
+};
+
+template <typename T>
+struct IteratorTraits<const T*> {
+  typedef T value_type;
+};
+
+#if GTEST_OS_WINDOWS
+# define GTEST_PATH_SEP_ "\\"
+# define GTEST_HAS_ALT_PATH_SEP_ 1
+// The biggest signed integer type the compiler supports.
+typedef __int64 BiggestInt;
+#else
+# define GTEST_PATH_SEP_ "/"
+# define GTEST_HAS_ALT_PATH_SEP_ 0
+typedef long long BiggestInt;  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+// Utilities for char.
+
+// isspace(int ch) and friends accept an unsigned char or EOF.  char
+// may be signed, depending on the compiler (or compiler flags).
+// Therefore we need to cast a char to unsigned char before calling
+// isspace(), etc.
+
+inline bool IsAlpha(char ch) {
+  return isalpha(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsAlNum(char ch) {
+  return isalnum(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsDigit(char ch) {
+  return isdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsLower(char ch) {
+  return islower(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsSpace(char ch) {
+  return isspace(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsUpper(char ch) {
+  return isupper(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(char ch) {
+  return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(wchar_t ch) {
+  const unsigned char narrowed = static_cast<unsigned char>(ch);
+  return (narrowed == ch) ? (isxdigit(narrowed) != 0) : false;
+}
+
+inline char ToLower(char ch) {
+  return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+}
+inline char ToUpper(char ch) {
+  return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
+}
+
+// The testing::internal::posix namespace holds wrappers for common
+// POSIX functions.  These wrappers hide the differences between
+// Windows/MSVC and POSIX systems.  Since some compilers define these
+// standard functions as macros, the wrapper cannot have the same name
+// as the wrapped function.
+
+namespace posix {
+
+// Functions with a different name on Windows.
+
+#if GTEST_OS_WINDOWS
+
+typedef struct _stat StatStruct;
+
+# ifdef __BORLANDC__
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+# else  // !__BORLANDC__
+#  if GTEST_OS_WINDOWS_MOBILE
+inline int IsATTY(int /* fd */) { return 0; }
+#  else
+inline int IsATTY(int fd) { return _isatty(fd); }
+#  endif  // GTEST_OS_WINDOWS_MOBILE
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return _stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return _strdup(src); }
+# endif  // __BORLANDC__
+
+# if GTEST_OS_WINDOWS_MOBILE
+inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
+// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
+// time and thus not defined there.
+# else
+inline int FileNo(FILE* file) { return _fileno(file); }
+inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
+inline int RmDir(const char* dir) { return _rmdir(dir); }
+inline bool IsDir(const StatStruct& st) {
+  return (_S_IFDIR & st.st_mode) != 0;
+}
+# endif  // GTEST_OS_WINDOWS_MOBILE
+
+#else
+
+typedef struct stat StatStruct;
+
+inline int FileNo(FILE* file) { return fileno(file); }
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return strcasecmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+inline int RmDir(const char* dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
+
+#endif  // GTEST_OS_WINDOWS
+
+// Functions that MSVC 8.0 reports as deprecated (warning 4996).
+
+#ifdef _MSC_VER
+// Temporarily disable warning 4996 (deprecated function).
+# pragma warning(push)
+# pragma warning(disable:4996)
+#endif
+// Thin wrapper over strncpy(); the result is returned as const char*.
+inline const char* StrNCpy(char* dest, const char* src, size_t n) {
+  return strncpy(dest, src, n);
+}
+
+// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and
+// StrError() aren't needed on Windows CE at this time and thus not
+// defined there.
+
+#if !GTEST_OS_WINDOWS_MOBILE
+inline int ChDir(const char* dir) { return chdir(dir); }
+#endif
+inline FILE* FOpen(const char* path, const char* mode) {
+  return fopen(path, mode);
+}
+#if !GTEST_OS_WINDOWS_MOBILE
+inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
+  return freopen(path, mode, stream);
+}
+inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
+#endif
+inline int FClose(FILE* fp) { return fclose(fp); }
+#if !GTEST_OS_WINDOWS_MOBILE
+inline int Read(int fd, void* buf, unsigned int count) {
+  return static_cast<int>(read(fd, buf, count));
+}
+inline int Write(int fd, const void* buf, unsigned int count) {
+  return static_cast<int>(write(fd, buf, count));
+}
+inline int Close(int fd) { return close(fd); }
+inline const char* StrError(int errnum) { return strerror(errnum); }
+#endif
+inline const char* GetEnv(const char* name) {
+#if GTEST_OS_WINDOWS_MOBILE
+  // We are on Windows CE, which has no environment variables.
+  return NULL;
+#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
+  // On these platforms a variable that we clear programmatically ends up set
+  // to the empty string rather than unset, so an empty value maps to NULL.
+  const char* const env = getenv(name);
+  return (env != NULL && env[0] != '\0') ? env : NULL;
+#else
+  return getenv(name);
+#endif
+}
+
+#ifdef _MSC_VER
+# pragma warning(pop)  // Restores the warning state.
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Windows CE has no C library. The abort() function is used in
+// several places in Google Test. This implementation provides a reasonable
+// imitation of standard behaviour.
+void Abort();
+#else
+inline void Abort() { abort(); }
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+}  // namespace posix
+
+// MSVC "deprecates" snprintf and issues warnings wherever it is used.  In
+// order to avoid these warnings, we need to use _snprintf or _snprintf_s on
+// MSVC-based platforms.  We map the GTEST_SNPRINTF_ macro to the appropriate
+// function in order to achieve that.  We use macro definition here because
+// snprintf is a variadic function.
+#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+// MSVC 2005 and above support variadic macros.
+# define GTEST_SNPRINTF_(buffer, size, format, ...) \
+     _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#elif defined(_MSC_VER)
+// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't
+// complain about _snprintf.
+# define GTEST_SNPRINTF_ _snprintf
+#else
+# define GTEST_SNPRINTF_ snprintf
+#endif
+
+// The maximum number a BiggestInt can represent.  This definition
+// works whether BiggestInt is represented in one's complement or
+// two's complement.
+//
+// We cannot rely on numeric_limits in STL, as __int64 and long long
+// are not part of standard C++ and numeric_limits doesn't need to be
+// defined for them.
+const BiggestInt kMaxBiggestInt =
+    ~(static_cast<BiggestInt>(1) << (8*sizeof(BiggestInt) - 1));
+
+// This template class serves as a compile-time function from size to
+// type.  It maps a size in bytes to a primitive type with that
+// size. e.g.
+//
+//   TypeWithSize<4>::UInt
+//
+// is typedef-ed to be unsigned int (unsigned integer made up of 4
+// bytes).
+//
+// Such functionality should belong to STL, but I cannot find it
+// there.
+//
+// Google Test uses this class in the implementation of floating-point
+// comparison.
+//
+// The primary template only defines UInt (as void); the specializations
+// for sizes 4 and 8 below provide both Int and UInt.  Other sizes can
+// be easily added in the future if the need arises.
+template <size_t size>
+class TypeWithSize {
+ public:
+  // This prevents the user from using TypeWithSize<N> with incorrect
+  // values of N.
+  typedef void UInt;
+};
+
+// The specialization for size 4.
+template <>
+class TypeWithSize<4> {
+ public:
+  // unsigned int has size 4 in both gcc and MSVC.
+  //
+  // As base/basictypes.h doesn't compile on Windows, we cannot use
+  // uint32, uint64, etc. here.
+  typedef int Int;
+  typedef unsigned int UInt;
+};
+
+// The specialization for size 8.
+template <>
+class TypeWithSize<8> {
+ public:
+#if GTEST_OS_WINDOWS
+  typedef __int64 Int;
+  typedef unsigned __int64 UInt;
+#else
+  typedef long long Int;  // NOLINT
+  typedef unsigned long long UInt;  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+};
+
+// Integer types of known sizes.
+typedef TypeWithSize<4>::Int Int32;
+typedef TypeWithSize<4>::UInt UInt32;
+typedef TypeWithSize<8>::Int Int64;
+typedef TypeWithSize<8>::UInt UInt64;
+typedef TypeWithSize<8>::Int TimeInMillis;  // Represents time in milliseconds.
+
+// Utilities for command line flags and environment variables.
+
+// Macro for referencing flags.
+#define GTEST_FLAG(name) FLAGS_gtest_##name
+
+// Macros for declaring flags.
+#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
+#define GTEST_DECLARE_int32_(name) \
+    GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name)
+#define GTEST_DECLARE_string_(name) \
+    GTEST_API_ extern ::std::string GTEST_FLAG(name)
+
+// Macros for defining flags.  The doc argument is not used in the expansion.
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+    GTEST_API_ bool GTEST_FLAG(name) = (default_val)
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+    GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val)
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+    GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
+
+// Thread annotations (defined here to expand to nothing).
+#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+#define GTEST_LOCK_EXCLUDED_(locks)
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
+// to *value and returns true; otherwise leaves *value unchanged and returns
+// false.
+// TODO(chandlerc): Find a better way to refactor flag and environment parsing
+// out of both gtest-port.cc and gtest.cc to avoid exporting this utility
+// function.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value);
+
+// Parses a bool/Int32/string from the environment variable
+// corresponding to the given Google Test flag.
+bool BoolFromGTestEnv(const char* flag, bool default_val);
+GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val);
+const char* StringFromGTestEnv(const char* flag, const char* default_val);
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+#if GTEST_OS_LINUX
+# include <stdlib.h>
+# include <sys/types.h>
+# include <sys/wait.h>
+# include <unistd.h>
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#include <ctype.h>
+#include <float.h>
+#include <string.h>
+#include <iomanip>
+#include <limits>
+#include <set>
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the Message class.
+//
+// IMPORTANT NOTE: Due to limitations of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
+// program!
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+
+#include <limits>
+
+
+// Ensures that there is at least one operator<< in the global namespace.
+// See Message& operator<<(...) below for why.
+void operator<<(const testing::internal::Secret&, int);
+
+namespace testing {
+
+// The Message class works like an ostream repeater.
+//
+// Typical usage:
+//
+//   1. You stream a bunch of values to a Message object.
+//      It will remember the text in a stringstream.
+//   2. Then you stream the Message object to an ostream.
+//      This causes the text in the Message to be streamed
+//      to the ostream.
+//
+// For example:
+//
+//   testing::Message foo;
+//   foo << 1 << " != " << 2;
+//   std::cout << foo;
+//
+// will print "1 != 2".
+//
+// Message is not intended to be inherited from.  In particular, its
+// destructor is not virtual.
+//
+// Note that stringstream behaves differently in gcc and in MSVC.  You
+// can stream a NULL char pointer to it in the former, but not in the
+// latter (it causes an access violation if you do).  The Message
+// class hides this difference by treating a NULL char pointer as
+// "(null)".
+class GTEST_API_ Message {
+ private:
+  // The type of basic IO manipulators (endl, ends, and flush) for
+  // narrow streams.
+  typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&);
+
+ public:
+  // Constructs an empty Message.
+  Message();
+
+  // Copy constructor.  Streams msg.GetString() into a fresh stringstream.
+  Message(const Message& msg) : ss_(new ::std::stringstream) {  // NOLINT
+    *ss_ << msg.GetString();
+  }
+
+  // Constructs a Message from a C-string.
+  explicit Message(const char* str) : ss_(new ::std::stringstream) {
+    *ss_ << str;
+  }
+
+#if GTEST_OS_SYMBIAN
+  // Streams a value (either a pointer or not) to this object.
+  template <typename T>
+  inline Message& operator <<(const T& value) {
+    StreamHelper(typename internal::is_pointer<T>::type(), value);
+    return *this;
+  }
+#else
+  // Streams a non-pointer value to this object.
+  template <typename T>
+  inline Message& operator <<(const T& val) {
+    // Some libraries overload << for STL containers.  These
+    // overloads are defined in the global namespace instead of ::std.
+    //
+    // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
+    // overloads are visible in either the std namespace or the global
+    // namespace, but not other namespaces, including the testing
+    // namespace which Google Test's Message class is in.
+    //
+    // To allow STL containers (and other types that have a << operator
+    // defined in the global namespace) to be used in Google Test
+    // assertions, testing::Message must access the custom << operator
+    // from the global namespace.  With this using declaration,
+    // overloads of << defined in the global namespace and those
+    // visible via Koenig lookup are both exposed in this function.
+    using ::operator <<;
+    *ss_ << val;
+    return *this;
+  }
+
+  // Streams a pointer value to this object.
+  //
+  // This function is an overload of the previous one.  When you
+  // stream a pointer to a Message, this definition will be used as it
+  // is more specialized.  (The C++ Standard, section
+  // [temp.func.order].)  If you stream a non-pointer, then the
+  // previous definition will be used.
+  //
+  // The reason for this overload is that streaming a NULL pointer to
+  // ostream is undefined behavior.  Depending on the compiler, you
+  // may get "0", "(nil)", "(null)", or an access violation.  To
+  // ensure consistent result across compilers, we always treat NULL
+  // as "(null)".
+  template <typename T>
+  inline Message& operator <<(T* const& pointer) {  // NOLINT
+    if (pointer == NULL) {
+      *ss_ << "(null)";
+    } else {
+      *ss_ << pointer;
+    }
+    return *this;
+  }
+#endif  // GTEST_OS_SYMBIAN
+
+  // Since the basic IO manipulators are overloaded for both narrow
+  // and wide streams, we have to provide this specialized definition
+  // of operator <<, even though its body is the same as the
+  // templatized version above.  Without this definition, streaming
+  // endl or other basic IO manipulators to Message will confuse the
+  // compiler.
+  Message& operator <<(BasicNarrowIoManip val) {
+    *ss_ << val;
+    return *this;
+  }
+
+  // Instead of 1/0, we want to see true/false for bool values.
+  Message& operator <<(bool b) {
+    return *this << (b ? "true" : "false");
+  }
+
+  // These two overloads allow streaming a wide C string to a Message
+  // using the UTF-8 encoding.
+  Message& operator <<(const wchar_t* wide_c_str);
+  Message& operator <<(wchar_t* wide_c_str);
+
+#if GTEST_HAS_STD_WSTRING
+  // Converts the given wide string to a narrow string using the UTF-8
+  // encoding, and streams the result to this Message object.
+  Message& operator <<(const ::std::wstring& wstr);
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+  // Converts the given wide string to a narrow string using the UTF-8
+  // encoding, and streams the result to this Message object.
+  Message& operator <<(const ::wstring& wstr);
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+  // Gets the text streamed to this object so far as an std::string.
+  // Each '\0' character in the buffer is replaced with "\\0".
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  std::string GetString() const;
+
+ private:
+
+#if GTEST_OS_SYMBIAN
+  // These are needed as the Nokia Symbian Compiler cannot decide between
+  // const T& and const T* in a function template. The Nokia compiler _can_
+  // decide between class template specializations for T and T*, so a
+  // tr1::type_traits-like is_pointer works, and we can overload on that.
+  template <typename T>
+  inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) {
+    if (pointer == NULL) {
+      *ss_ << "(null)";
+    } else {
+      *ss_ << pointer;
+    }
+  }
+  template <typename T>
+  inline void StreamHelper(internal::false_type /*is_pointer*/,
+                           const T& value) {
+    // See the comments in Message& operator <<(const T&) above for why
+    // we need this using statement.
+    using ::operator <<;
+    *ss_ << value;
+  }
+#endif  // GTEST_OS_SYMBIAN
+
+  // We'll hold the text streamed to this object here.
+  const internal::scoped_ptr< ::std::stringstream> ss_;
+
+  // We declare (but don't implement) this to prevent the compiler
+  // from implementing the assignment operator.
+  void operator=(const Message&);
+};
+
+// Streams the text held by a Message (its GetString()) to an ostream.
+inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
+  return os << sb.GetString();
+}
+
+namespace internal {
+
+// Converts a streamable value to an std::string by streaming it into
+// a temporary Message.  A NULL pointer is converted to "(null)".  When
+// the input value is a ::string, ::std::string, ::wstring, or
+// ::std::wstring object, each NUL character is replaced with "\\0".
+template <typename T>
+std::string StreamableToString(const T& streamable) {
+  return (Message() << streamable).GetString();
+}
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file declares the String class and functions used internally by
+// Google Test.  They are subject to change without notice. They should not be
+// used by code external to Google Test.
+//
+// This header file is #included by <gtest/internal/gtest-internal.h>.
+// It should not be #included by other files.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+
+#ifdef __BORLANDC__
+// string.h is not guaranteed to provide strcpy on C++ Builder.
+# include <mem.h>
+#endif
+
+#include <string.h>
+#include <string>
+
+
+namespace testing {
+namespace internal {
+
+// String - a non-instantiable class holding static string utilities.
+class GTEST_API_ String {
+ public:
+  // Static utility methods
+
+  // Clones a 0-terminated C string, allocating memory using new.  The
+  // caller is responsible for deleting the return value using
+  // delete[].  Returns the cloned string, or NULL if the input is
+  // NULL.
+  //
+  // This is different from strdup() in string.h, which allocates
+  // memory using malloc().
+  static const char* CloneCString(const char* c_str);
+
+#if GTEST_OS_WINDOWS_MOBILE
+  // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be
+  // able to pass strings to Win32 APIs on CE we need to convert them
+  // to 'Unicode', UTF-16.
+
+  // Creates a UTF-16 wide string from the given ANSI string, allocating
+  // memory using new. The caller is responsible for deleting the return
+  // value using delete[]. Returns the wide string, or NULL if the
+  // input is NULL.
+  //
+  // The wide string is created using the ANSI codepage (CP_ACP) to
+  // match the behaviour of the ANSI versions of Win32 calls and the
+  // C runtime.
+  static LPCWSTR AnsiToUtf16(const char* c_str);
+
+  // Creates an ANSI string from the given wide string, allocating
+  // memory using new. The caller is responsible for deleting the return
+  // value using delete[]. Returns the ANSI string, or NULL if the
+  // input is NULL.
+  //
+  // The returned string is created using the ANSI codepage (CP_ACP) to
+  // match the behaviour of the ANSI versions of Win32 calls and the
+  // C runtime.
+  static const char* Utf16ToAnsi(LPCWSTR utf16_str);
+#endif
+
+  // Compares two C strings.  Returns true iff they have the same content.
+  //
+  // Unlike strcmp(), this function can handle NULL argument(s).  A
+  // NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool CStringEquals(const char* lhs, const char* rhs);
+
+  // Converts a wide C string to an std::string using the UTF-8 encoding.
+  // NULL will be converted to "(null)".  If an error occurred during
+  // the conversion, "(failed to convert from wide string)" is
+  // returned.
+  static std::string ShowWideCString(const wchar_t* wide_c_str);
+
+  // Compares two wide C strings.  Returns true iff they have the same
+  // content.
+  //
+  // Unlike wcscmp(), this function can handle NULL argument(s).  A
+  // NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs);
+
+  // Compares two C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike strcasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool CaseInsensitiveCStringEquals(const char* lhs,
+                                           const char* rhs);
+
+  // Compares two wide C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike wcscasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL wide C string,
+  // including the empty string.
+  // NB: The implementations on different platforms slightly differ.
+  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
+  // environment variable. On GNU platform this method uses wcscasecmp
+  // which compares according to LC_CTYPE category of the current locale.
+  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+  // current locale.
+  static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                               const wchar_t* rhs);
+
+  // Returns true iff the given string ends with the given suffix, ignoring
+  // case. Any string is considered to end with an empty suffix.
+  static bool EndsWithCaseInsensitive(
+      const std::string& str, const std::string& suffix);
+
+  // Formats an int value as "%02d".
+  static std::string FormatIntWidth2(int value);  // "%02d" for width == 2
+
+  // Formats an int value as "%X".
+  static std::string FormatHexInt(int value);
+
+  // Formats a byte as "%02X".
+  static std::string FormatByte(unsigned char value);
+
+ private:
+  String();  // Not meant to be instantiated.
+};  // class String
+
+// Gets the content of the stringstream's buffer as an std::string.  Each '\0'
+// character in the buffer is replaced with "\\0".
+GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: keith.ray@gmail.com (Keith Ray)
+//
+// Google Test filepath utilities
+//
+// This header file declares classes and functions used internally by
+// Google Test.  They are subject to change without notice.
+//
+// This file is #included in <gtest/internal/gtest-internal.h>.
+// Do not include this header file separately!
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+
+
+namespace testing {
+namespace internal {
+
+// FilePath - a class for file and directory pathname manipulation which
+// handles platform-specific conventions (like the pathname separator).
+// Used for helper functions for naming files in a directory for xml output.
+// Except for Set methods, all methods are const or static, which provides an
+// "immutable value object" -- useful for peace of mind.
+// A FilePath with a value ending in a path separator ("like/this/") represents
+// a directory, otherwise it is assumed to represent a file. In either case,
+// it may or may not represent an actual file or directory in the file system.
+// Names are NOT checked for syntax correctness -- no checking for illegal
+// characters, malformed paths, etc.
+
+class GTEST_API_ FilePath {
+ public:
+  FilePath() : pathname_("") { }
+  FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { }
+
+  explicit FilePath(const std::string& pathname) : pathname_(pathname) {
+    Normalize();
+  }
+
+  FilePath& operator=(const FilePath& rhs) {
+    Set(rhs);
+    return *this;
+  }
+
+  void Set(const FilePath& rhs) {
+    pathname_ = rhs.pathname_;
+  }
+
+  const std::string& string() const { return pathname_; }
+  const char* c_str() const { return pathname_.c_str(); }
+
+  // Returns the current working directory, or "" if unsuccessful.
+  static FilePath GetCurrentDir();
+
+  // Given directory = "dir", base_name = "test", number = 0,
+  // extension = "xml", returns "dir/test.xml". If number is greater
+  // than zero (e.g., 12), returns "dir/test_12.xml".
+  // On Windows platform, uses \ as the separator rather than /.
+  static FilePath MakeFileName(const FilePath& directory,
+                               const FilePath& base_name,
+                               int number,
+                               const char* extension);
+
+  // Given directory = "dir", relative_path = "test.xml",
+  // returns "dir/test.xml".
+  // On Windows, uses \ as the separator rather than /.
+  static FilePath ConcatPaths(const FilePath& directory,
+                              const FilePath& relative_path);
+
+  // Returns a pathname for a file that does not currently exist. The pathname
+  // will be directory/base_name.extension or
+  // directory/base_name_<number>.extension if directory/base_name.extension
+  // already exists. The number will be incremented until a pathname is found
+  // that does not already exist.
+  // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+  // There could be a race condition if two or more processes are calling this
+  // function at the same time -- they could both pick the same filename.
+  static FilePath GenerateUniqueFileName(const FilePath& directory,
+                                         const FilePath& base_name,
+                                         const char* extension);
+
+  // Returns true iff the path is "".
+  bool IsEmpty() const { return pathname_.empty(); }
+
+  // If input name has a trailing separator character, removes it and returns
+  // the name, otherwise return the name string unmodified.
+  // On Windows platform, uses \ as the separator, other platforms use /.
+  FilePath RemoveTrailingPathSeparator() const;
+
+  // Returns a copy of the FilePath with the directory part removed.
+  // Example: FilePath("path/to/file").RemoveDirectoryName() returns
+  // FilePath("file"). If there is no directory part ("just_a_file"), it returns
+  // the FilePath unmodified. If there is no file part ("just_a_dir/") it
+  // returns an empty FilePath ("").
+  // On Windows platform, '\' is the path separator, otherwise it is '/'.
+  FilePath RemoveDirectoryName() const;
+
+  // RemoveFileName returns the directory path with the filename removed.
+  // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+  // If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+  // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
+  // not have a file, like "just/a/dir/", it returns the FilePath unmodified.
+  // On Windows platform, '\' is the path separator, otherwise it is '/'.
+  FilePath RemoveFileName() const;
+
+  // Returns a copy of the FilePath with the case-insensitive extension removed.
+  // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+  // FilePath("dir/file"). If a case-insensitive extension is not
+  // found, returns a copy of the original FilePath.
+  FilePath RemoveExtension(const char* extension) const;
+
+  // Creates directories so that path exists. Returns true if successful or if
+  // the directories already exist; returns false if unable to create
+  // directories for any reason. Will also return false if the FilePath does
+  // not represent a directory (that is, it doesn't end with a path separator).
+  bool CreateDirectoriesRecursively() const;
+
+  // Create the directory so that path exists. Returns true if successful or
+  // if the directory already exists; returns false if unable to create the
+  // directory for any reason, including if the parent directory does not
+  // exist. Not named "CreateDirectory" because that's a macro on Windows.
+  bool CreateFolder() const;
+
+  // Returns true if FilePath describes something in the file-system,
+  // either a file, directory, or whatever, and that something exists.
+  bool FileOrDirectoryExists() const;
+
+  // Returns true if pathname describes a directory in the file-system
+  // that exists.
+  bool DirectoryExists() const;
+
+  // Returns true if FilePath ends with a path separator, which indicates that
+  // it is intended to represent a directory. Returns false otherwise.
+  // This does NOT check that a directory (or file) actually exists.
+  bool IsDirectory() const;
+
+  // Returns true if pathname describes a root directory. (Windows has one
+  // root directory per disk drive.)
+  bool IsRootDirectory() const;
+
+  // Returns true if pathname describes an absolute path.
+  bool IsAbsolutePath() const;
+
+ private:
+  // Replaces multiple consecutive separators with a single separator.
+  // For example, "bar///foo" becomes "bar/foo". Does not eliminate other
+  // redundancies that might be in a pathname involving "." or "..".
+  //
+  // A pathname with multiple consecutive separators may occur either through
+  // user error or as a result of some scripts or APIs that generate a pathname
+  // with a trailing separator. On other platforms the same API or script
+  // may NOT generate a pathname with a trailing "/". Then elsewhere that
+  // pathname may have another "/" and pathname components added to it,
+  // without checking for the separator already being there.
+  // The script language and operating system may allow paths like "foo//bar"
+  // but some of the functions in FilePath will not handle that correctly. In
+  // particular, RemoveTrailingPathSeparator() only removes one separator, and
+  // it is called in CreateDirectoriesRecursively() assuming that it will change
+  // a pathname from directory syntax (trailing separator) to filename syntax.
+  //
+  // On Windows this method also replaces the alternate path separator '/' with
+  // the primary path separator '\\', so that for example "bar\\/\\foo" becomes
+  // "bar\\foo".
+
+  void Normalize();
+
+  // Returns a pointer to the last occurrence of a valid path separator in
+  // the FilePath. On Windows, for example, both '/' and '\' are valid path
+  // separators. Returns NULL if no path separator was found.
+  const char* FindLastPathSeparator() const;
+
+  std::string pathname_;
+};  // class FilePath
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+// This file was GENERATED by command:
+//     pump.py gtest-type-util.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Type utilities needed for implementing typed and type-parameterized
+// tests.  This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+// Currently we support at most 50 types in a list, and at most 50
+// type-parameterized tests in one type-parameterized test case.
+// Please contact googletestframework@googlegroups.com if you need
+// more.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+
+// #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
+// libstdc++ (which is where cxxabi.h comes from).
+# if GTEST_HAS_CXXABI_H_
+#  include <cxxabi.h>
+# elif defined(__HP_aCC)
+#  include <acxx_demangle.h>
+# endif  // GTEST_HAS_CXXABI_H_
+
+namespace testing {
+namespace internal {
+
+// GetTypeName<T>() returns a human-readable name of type T.
+// NB: This function is also used in Google Mock, so don't move it inside of
+// the typed-test-only section below.
+template <typename T>
+std::string GetTypeName() {
+# if GTEST_HAS_RTTI
+  // typeid(T) is only used when GTEST_HAS_RTTI is true.
+  const char* const name = typeid(T).name();
+#  if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+  int status = 0;
+  // gcc's implementation of typeid(T).name() mangles the type name, so we
+  // demangle it; if that fails (status != 0) the mangled name is used as is.
+#   if GTEST_HAS_CXXABI_H_
+  using abi::__cxa_demangle;
+#   endif  // GTEST_HAS_CXXABI_H_
+  char* const readable_name = __cxa_demangle(name, 0, 0, &status);
+  const std::string name_str(status == 0 ? readable_name : name);
+  free(readable_name);  // Releases the buffer returned by __cxa_demangle.
+  return name_str;
+#  else
+  return name;
+#  endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+
+# else
+  // GTEST_HAS_RTTI is false here, so typeid is not used; return a placeholder.
+  return "<type>";
+
+# endif  // GTEST_HAS_RTTI
+}
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// AssertTypeEq<T1, T2>::type is defined iff T1 and T2 are the same
+// type.  This can be used as a compile-time assertion to ensure that
+// two types are equal.
+// The primary template is only declared here, so it provides no ::type.
+template <typename T1, typename T2>
+struct AssertTypeEq;
+// Partial specialization for identical types; this one provides ::type.
+template <typename T>
+struct AssertTypeEq<T, T> {
+  typedef bool type;
+};
+
+// A unique type used as the default value for the arguments of class
+// template Types.  This allows us to simulate variadic templates
+// (e.g. Types<int>, Types<int, double>, etc.), which C++ doesn't
+// support directly.
+struct None {};
+
+// The following family of structs and struct templates is used to
+// represent type lists.  In particular, TypesN<T1, T2, ..., TN>
+// represents a type list with N types (T1, T2, ..., and TN) in it.
+// Except for Types0, every struct in the family has two member types:
+// Head for the first type in the list, and Tail for the rest of the
+// list.
+
+// The empty type list.
+struct Types0 {};
+
+// Type lists of length 1, 2, 3, and so on.
+// TypesN has Head = T1 and Tail = Types(N-1)<T2, ..., TN>.
+template <typename T1>
+struct Types1 {
+  typedef T1 Head;
+  typedef Types0 Tail;
+};
+template <typename T1, typename T2>
+struct Types2 {
+  typedef T1 Head;
+  typedef Types1<T2> Tail;
+};
+
+template <typename T1, typename T2, typename T3>
+struct Types3 {
+  typedef T1 Head;
+  typedef Types2<T2, T3> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+struct Types4 {
+  typedef T1 Head;
+  typedef Types3<T2, T3, T4> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+struct Types5 {
+  typedef T1 Head;
+  typedef Types4<T2, T3, T4, T5> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+struct Types6 {
+  typedef T1 Head;
+  typedef Types5<T2, T3, T4, T5, T6> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+struct Types7 {
+  typedef T1 Head;
+  typedef Types6<T2, T3, T4, T5, T6, T7> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+struct Types8 {
+  typedef T1 Head;
+  typedef Types7<T2, T3, T4, T5, T6, T7, T8> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+struct Types9 {
+  typedef T1 Head;
+  typedef Types8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+struct Types10 {
+  typedef T1 Head;
+  typedef Types9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+struct Types11 {
+  typedef T1 Head;
+  typedef Types10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+struct Types12 {
+  typedef T1 Head;
+  typedef Types11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+struct Types13 {
+  typedef T1 Head;
+  typedef Types12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+struct Types14 {
+  typedef T1 Head;
+  typedef Types13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+struct Types15 {
+  typedef T1 Head;
+  typedef Types14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+struct Types16 {
+  typedef T1 Head;
+  typedef Types15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+struct Types17 {
+  typedef T1 Head;
+  typedef Types16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+struct Types18 {
+  typedef T1 Head;
+  typedef Types17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+struct Types19 {
+  typedef T1 Head;
+  typedef Types18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+struct Types20 {
+  typedef T1 Head;
+  typedef Types19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+struct Types21 {
+  typedef T1 Head;
+  typedef Types20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+struct Types22 {
+  typedef T1 Head;
+  typedef Types21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+struct Types23 {
+  typedef T1 Head;
+  typedef Types22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+struct Types24 {
+  typedef T1 Head;
+  typedef Types23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+struct Types25 {
+  typedef T1 Head;
+  typedef Types24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+struct Types26 {
+  typedef T1 Head;
+  typedef Types25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+struct Types27 {
+  typedef T1 Head;
+  typedef Types26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+struct Types28 {
+  typedef T1 Head;
+  typedef Types27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+struct Types29 {
+  typedef T1 Head;
+  typedef Types28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+struct Types30 {
+  typedef T1 Head;
+  typedef Types29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+struct Types31 {
+  typedef T1 Head;
+  typedef Types30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+struct Types32 {
+  typedef T1 Head;
+  typedef Types31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+struct Types33 {
+  typedef T1 Head;
+  typedef Types32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+struct Types34 {
+  typedef T1 Head;
+  typedef Types33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+struct Types35 {
+  typedef T1 Head;
+  typedef Types34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+struct Types36 {
+  typedef T1 Head;
+  typedef Types35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+struct Types37 {
+  typedef T1 Head;
+  typedef Types36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+struct Types38 {
+  typedef T1 Head;
+  typedef Types37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+struct Types39 {
+  typedef T1 Head;
+  typedef Types38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+struct Types40 {
+  typedef T1 Head;
+  typedef Types39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+struct Types41 {
+  typedef T1 Head;
+  typedef Types40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+struct Types42 {
+  typedef T1 Head;
+  typedef Types41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+struct Types43 {
+  typedef T1 Head;
+  typedef Types42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+struct Types44 {
+  typedef T1 Head;
+  typedef Types43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+struct Types45 {
+  typedef T1 Head;
+  typedef Types44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+struct Types46 {  // Type list of 46 types: first element plus a 45-type list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46> Tail;  // Remaining types T2..T46.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+struct Types47 {  // Type list of 47 types: first element plus a 46-type list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47> Tail;  // Remaining types T2..T47.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+struct Types48 {  // Type list of 48 types: first element plus a 47-type list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48> Tail;  // Remaining types T2..T48.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+struct Types49 {  // Type list of 49 types: first element plus a 48-type list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48, T49> Tail;  // Remaining types T2..T49.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+struct Types50 {  // Type list of 50 types: first element plus a 49-type list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48, T49, T50> Tail;  // Remaining types T2..T50.
+};
+
+
+}  // namespace internal
+
+// We don't want to require the users to write TypesN<...> directly,
+// as that would require them to count the length.  Types<...> is much
+// easier to write, but generates horrible messages when there is a
+// compiler error, as gcc insists on printing out each template
+// argument, even if it has the default value (this means Types<int>
+// will appear as Types<int, None, None, ..., None> in the compiler
+// errors).
+//
+// Our solution is to combine the best part of the two approaches: a
+// user would write Types<T1, ..., TN>, and Google Test will translate
+// that to TypesN<T1, ..., TN> internally to make error messages
+// readable.  The translation is done by the 'type' member of the
+// Types template.
+template <typename T1 = internal::None, typename T2 = internal::None,
+    typename T3 = internal::None, typename T4 = internal::None,
+    typename T5 = internal::None, typename T6 = internal::None,
+    typename T7 = internal::None, typename T8 = internal::None,
+    typename T9 = internal::None, typename T10 = internal::None,
+    typename T11 = internal::None, typename T12 = internal::None,
+    typename T13 = internal::None, typename T14 = internal::None,
+    typename T15 = internal::None, typename T16 = internal::None,
+    typename T17 = internal::None, typename T18 = internal::None,
+    typename T19 = internal::None, typename T20 = internal::None,
+    typename T21 = internal::None, typename T22 = internal::None,
+    typename T23 = internal::None, typename T24 = internal::None,
+    typename T25 = internal::None, typename T26 = internal::None,
+    typename T27 = internal::None, typename T28 = internal::None,
+    typename T29 = internal::None, typename T30 = internal::None,
+    typename T31 = internal::None, typename T32 = internal::None,
+    typename T33 = internal::None, typename T34 = internal::None,
+    typename T35 = internal::None, typename T36 = internal::None,
+    typename T37 = internal::None, typename T38 = internal::None,
+    typename T39 = internal::None, typename T40 = internal::None,
+    typename T41 = internal::None, typename T42 = internal::None,
+    typename T43 = internal::None, typename T44 = internal::None,
+    typename T45 = internal::None, typename T46 = internal::None,
+    typename T47 = internal::None, typename T48 = internal::None,
+    typename T49 = internal::None, typename T50 = internal::None>
+struct Types {  // Primary template; every omitted argument defaults to internal::None.
+  typedef internal::Types50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50> type;  // Used when no specialization of Types matches.
+};
+
+template <>
+struct Types<internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Types<>: every argument is None.
+  typedef internal::Types0 type;  // Translated form for an empty argument list.
+};
+template <typename T1>
+struct Types<T1, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Types<T1>: 1 type given, rest None.
+  typedef internal::Types1<T1> type;  // Translated form: exactly 1 argument, no None padding.
+};
+template <typename T1, typename T2>
+struct Types<T1, T2, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Types<T1, T2>: 2 types given, rest None.
+  typedef internal::Types2<T1, T2> type;  // Translated form: exactly 2 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3>
+struct Types<T1, T2, T3, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {  // Types<T1..T3>: 3 types given, rest None.
+  typedef internal::Types3<T1, T2, T3> type;  // Translated form: exactly 3 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4>
+struct Types<T1, T2, T3, T4, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // Types<T1..T4>: 4 types given, rest None.
+  typedef internal::Types4<T1, T2, T3, T4> type;  // Translated form: exactly 4 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+struct Types<T1, T2, T3, T4, T5, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // Types<T1..T5>: 5 types given, rest None.
+  typedef internal::Types5<T1, T2, T3, T4, T5> type;  // Translated form: exactly 5 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+struct Types<T1, T2, T3, T4, T5, T6, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Types<T1..T6>: 6 types given, rest None.
+  typedef internal::Types6<T1, T2, T3, T4, T5, T6> type;  // Translated form: exactly 6 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+struct Types<T1, T2, T3, T4, T5, T6, T7, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Types<T1..T7>: 7 types given, rest None.
+  typedef internal::Types7<T1, T2, T3, T4, T5, T6, T7> type;  // Translated form: exactly 7 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {  // Types<T1..T8>: 8 types given, rest None.
+  typedef internal::Types8<T1, T2, T3, T4, T5, T6, T7, T8> type;  // Translated form: exactly 8 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {  // Types<T1..T9>: 9 types given, rest None.
+  typedef internal::Types9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;  // Translated form: exactly 9 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // Types<T1..T10>: 10 types given, rest None.
+  typedef internal::Types10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;  // Translated form: exactly 10 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Types<T1..T11>: 11 types given, rest None.
+  typedef internal::Types11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;  // Translated form: exactly 11 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Types<T1..T12>: 12 types given, rest None.
+  typedef internal::Types12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12> type;  // Translated form: exactly 12 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Types<T1..T13>: 13 types given, rest None.
+  typedef internal::Types13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13> type;  // Translated form: exactly 13 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {  // Types<T1..T14>: 14 types given, rest None.
+  typedef internal::Types14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14> type;  // Translated form: exactly 14 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // Types<T1..T15>: 15 types given, rest None.
+  typedef internal::Types15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15> type;  // Translated form: exactly 15 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Types<T1..T16>: 16 types given, rest None.
+  typedef internal::Types16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16> type;  // Translated form: exactly 16 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Types<T1..T17>: 17 types given, rest None.
+  typedef internal::Types17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17> type;  // Translated form: exactly 17 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Types<T1..T18>: 18 types given, rest None.
+  typedef internal::Types18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18> type;  // Translated form: exactly 18 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {  // Types<T1..T19>: 19 types given, rest None.
+  typedef internal::Types19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19> type;  // Translated form: exactly 19 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // Types<T1..T20>: 20 types given, rest None.
+  typedef internal::Types20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20> type;  // Translated form: exactly 20 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // Types<T1..T21>: 21 types given, rest None.
+  typedef internal::Types21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21> type;  // Translated form: exactly 21 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Types<T1..T22>: 22 types given, rest None.
+  typedef internal::Types22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22> type;  // Translated form: exactly 22 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Types<T1..T23>: 23 types given, rest None.
+  typedef internal::Types23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;  // Translated form: exactly 23 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Types<T1..T24>: 24 types given, rest None.
+  typedef internal::Types24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;  // Translated form: exactly 24 arguments, no None padding.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
+};  // 25 types + 25 trailing internal::None: type is internal::Types25
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26> type;
+};  // 26 types + 24 trailing internal::None: type is internal::Types26
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27> type;
+};  // 27 types + 23 trailing internal::None: type is internal::Types27
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28> type;
+};  // 28 types + 22 trailing internal::None: type is internal::Types28
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29> type;
+};  // 29 types + 21 trailing internal::None: type is internal::Types29
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30> type;
+};  // 30 types + 20 trailing internal::None: type is internal::Types30
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31> type;
+};  // 31 types + 19 trailing internal::None: type is internal::Types31
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32> type;
+};  // 32 types + 18 trailing internal::None: type is internal::Types32
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33> type;
+};  // 33 types + 17 trailing internal::None: type is internal::Types33
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34> type;
+};  // 34 types + 16 trailing internal::None: type is internal::Types34
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35> type;
+};  // 35 types + 15 trailing internal::None: type is internal::Types35
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
+};  // 36 types + 14 trailing internal::None: type is internal::Types36
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
+};  // 37 types + 13 trailing internal::None: type is internal::Types37
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
+};  // 38 types + 12 trailing internal::None: type is internal::Types38
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
+};  // 39 types + 11 trailing internal::None: type is internal::Types39
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40> type;
+};  // 40 types + 10 trailing internal::None: type is internal::Types40
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41> type;
+};  // 41 types + 9 trailing internal::None: type is internal::Types41
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42> type;
+};  // 42 types + 8 trailing internal::None: type is internal::Types42
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43> type;
+};  // 43 types + 7 trailing internal::None: type is internal::Types43
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44> type;
+};  // 44 types + 6 trailing internal::None: type is internal::Types44
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45> type;
+};  // 45 types + 5 trailing internal::None: type is internal::Types45
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46> type;
+};  // 46 types + 4 trailing internal::None: type is internal::Types46
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, internal::None, internal::None, internal::None> {
+  typedef internal::Types47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47> type;
+};  // 47 types + 3 trailing internal::None: type is internal::Types47
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, T48, internal::None, internal::None> {
+  typedef internal::Types48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48> type;
+};  // 48 types + 2 trailing internal::None: type is internal::Types48
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, T48, T49, internal::None> {
+  typedef internal::Types49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49> type;
+};  // 49 types + 1 trailing internal::None: type is internal::Types49
+
+namespace internal {
+
+# define GTEST_TEMPLATE_ template <typename T> class  // template template parameter (one type arg)
+
+// The template "selector" struct TemplateSel<Tmpl> is used to
+// represent Tmpl, which must be a class template with one type
+// parameter, as a type.  TemplateSel<Tmpl>::Bind<T>::type is defined
+// as the type Tmpl<T>.  This allows us to actually instantiate the
+// template "selected" by TemplateSel<Tmpl>.
+//
+// This trick is necessary for simulating typedef for class templates,
+// which C++ doesn't support directly.
+template <GTEST_TEMPLATE_ Tmpl>
+struct TemplateSel {  // a plain type standing in for the class template Tmpl
+  template <typename T>
+  struct Bind {
+    typedef Tmpl<T> type;  // Bind<T>::type is the instantiation Tmpl<T>
+  };
+};
+
+# define GTEST_BIND_(TmplSel, T) \
+  TmplSel::template Bind<T>::type  // i.e. Tmpl<T> when TmplSel is TemplateSel<Tmpl>
+
+// A unique struct template used as the default value for the
+// arguments of class template Templates.  This allows us to simulate
+// variadic templates (e.g. Templates<int>, Templates<int, double>,
+// etc.), which C++ doesn't support directly.
+template <typename T>
+struct NoneT {};
+
+// The following family of structs and struct templates is used to
+// represent template lists.  In particular, TemplatesN<T1, T2, ...,
+// TN> represents a list of N templates (T1, T2, ..., and TN).  Except
+// for Templates0, every struct in the family has two member types:
+// Head for the selector of the first template in the list, and Tail
+// for the rest of the list.
+
+// The empty template list.
+struct Templates0 {};  // no Head/Tail members; Templates1::Tail is this type
+
+// Template lists of length 1, 2, 3, and so on.
+
+template <GTEST_TEMPLATE_ T1>
+struct Templates1 {  // list of 1 template
+  typedef TemplateSel<T1> Head;  // selector for T1
+  typedef Templates0 Tail;  // nothing follows T1
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
+struct Templates2 {  // list of 2 templates
+  typedef TemplateSel<T1> Head;  // selector for T1
+  typedef Templates1<T2> Tail;  // the remaining 1 template
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
+struct Templates3 {  // 3-template list: Head selects T1, Tail holds T2..T3.
+  typedef TemplateSel<T1> Head;
+  typedef Templates2<T2, T3> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4>
+struct Templates4 {  // 4-template list: Head selects T1, Tail holds T2..T4.
+  typedef TemplateSel<T1> Head;
+  typedef Templates3<T2, T3, T4> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
+struct Templates5 {  // 5-template list: Head selects T1, Tail holds T2..T5.
+  typedef TemplateSel<T1> Head;
+  typedef Templates4<T2, T3, T4, T5> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
+struct Templates6 {  // 6-template list: Head selects T1, Tail holds T2..T6.
+  typedef TemplateSel<T1> Head;
+  typedef Templates5<T2, T3, T4, T5, T6> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7>
+struct Templates7 {  // 7-template list: Head selects T1, Tail holds T2..T7.
+  typedef TemplateSel<T1> Head;
+  typedef Templates6<T2, T3, T4, T5, T6, T7> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
+struct Templates8 {  // 8-template list: Head selects T1, Tail holds T2..T8.
+  typedef TemplateSel<T1> Head;
+  typedef Templates7<T2, T3, T4, T5, T6, T7, T8> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
+struct Templates9 {  // 9-template list: Head selects T1, Tail holds T2..T9.
+  typedef TemplateSel<T1> Head;
+  typedef Templates8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10>
+struct Templates10 {  // 10-template list: Head selects T1, Tail holds T2..T10.
+  typedef TemplateSel<T1> Head;
+  typedef Templates9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
+struct Templates11 {  // 11-template list: Head selects T1, Tail holds T2..T11.
+  typedef TemplateSel<T1> Head;
+  typedef Templates10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
+struct Templates12 {  // 12-template list: Head selects T1, Tail holds T2..T12.
+  typedef TemplateSel<T1> Head;
+  typedef Templates11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13>
+struct Templates13 {  // 13-template list: Head selects T1, Tail holds T2..T13.
+  typedef TemplateSel<T1> Head;
+  typedef Templates12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
+struct Templates14 {  // 14-template list: Head selects T1, Tail holds T2..T14.
+  typedef TemplateSel<T1> Head;
+  typedef Templates13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
+struct Templates15 {  // 15-template list: Head selects T1, Tail holds T2..T15.
+  typedef TemplateSel<T1> Head;
+  typedef Templates14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16>
+struct Templates16 {  // 16-template list: Head selects T1, Tail holds T2..T16.
+  typedef TemplateSel<T1> Head;
+  typedef Templates15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
+struct Templates17 {  // 17-template list: Head selects T1, Tail holds T2..T17.
+  typedef TemplateSel<T1> Head;
+  typedef Templates16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
+struct Templates18 {  // 18-template list: Head selects T1, Tail holds T2..T18.
+  typedef TemplateSel<T1> Head;
+  typedef Templates17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19>
+struct Templates19 {  // 19-template list: Head selects T1, Tail holds T2..T19.
+  typedef TemplateSel<T1> Head;
+  typedef Templates18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
+struct Templates20 {  // 20-template list: Head selects T1, Tail holds T2..T20.
+  typedef TemplateSel<T1> Head;
+  typedef Templates19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
+struct Templates21 {  // 21-template list: Head selects T1, Tail holds T2..T21.
+  typedef TemplateSel<T1> Head;
+  typedef Templates20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22>
+struct Templates22 {  // 22-template list: Head selects T1, Tail holds T2..T22.
+  typedef TemplateSel<T1> Head;
+  typedef Templates21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
+struct Templates23 {  // 23-template list: Head selects T1, Tail holds T2..T23.
+  typedef TemplateSel<T1> Head;
+  typedef Templates22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
+struct Templates24 {  // 24-template list: Head selects T1, Tail holds T2..T24.
+  typedef TemplateSel<T1> Head;
+  typedef Templates23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25>
+struct Templates25 {  // 25-template list: Head selects T1, Tail holds T2..T25.
+  typedef TemplateSel<T1> Head;
+  typedef Templates24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
+struct Templates26 {  // 26-template list: Head selects T1, Tail holds T2..T26.
+  typedef TemplateSel<T1> Head;
+  typedef Templates25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
+struct Templates27 {  // 27-template list: Head selects T1, Tail holds T2..T27.
+  typedef TemplateSel<T1> Head;
+  typedef Templates26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28>
+struct Templates28 {  // 28-template list: Head selects T1, Tail holds T2..T28.
+  typedef TemplateSel<T1> Head;
+  typedef Templates27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
+struct Templates29 {  // 29-template list: Head selects T1, Tail holds T2..T29.
+  typedef TemplateSel<T1> Head;
+  typedef Templates28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
+struct Templates30 {  // 30-template list: Head selects T1, Tail holds T2..T30.
+  typedef TemplateSel<T1> Head;
+  typedef Templates29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31>
+struct Templates31 {  // 31-template list: Head selects T1, Tail holds T2..T31.
+  typedef TemplateSel<T1> Head;
+  typedef Templates30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
+struct Templates32 {  // 32-template list: Head selects T1, Tail holds T2..T32.
+  typedef TemplateSel<T1> Head;
+  typedef Templates31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
+struct Templates33 {  // 33-template list: Head selects T1, Tail holds T2..T33.
+  typedef TemplateSel<T1> Head;
+  typedef Templates32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34>
+struct Templates34 {  // 34-template list: Head selects T1, Tail holds T2..T34.
+  typedef TemplateSel<T1> Head;
+  typedef Templates33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
+struct Templates35 {  // 35-template list: Head selects T1, Tail holds T2..T35.
+  typedef TemplateSel<T1> Head;
+  typedef Templates34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
+struct Templates36 {  // 36-template list: Head selects T1, Tail holds T2..T36.
+  typedef TemplateSel<T1> Head;
+  typedef Templates35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37>
+struct Templates37 {  // 37-template list: Head selects T1, Tail holds T2..T37.
+  typedef TemplateSel<T1> Head;
+  typedef Templates36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
+struct Templates38 {  // 38-template list: Head selects T1, Tail holds T2..T38.
+  typedef TemplateSel<T1> Head;
+  typedef Templates37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
+struct Templates39 {  // 39-template list: Head selects T1, Tail holds T2..T39.
+  typedef TemplateSel<T1> Head;
+  typedef Templates38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40>
+struct Templates40 {  // 40-template list: Head selects T1, Tail holds T2..T40.
+  typedef TemplateSel<T1> Head;
+  typedef Templates39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
+struct Templates41 {  // 41-template list: Head selects T1, Tail holds T2..T41.
+  typedef TemplateSel<T1> Head;
+  typedef Templates40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
+struct Templates42 {  // 42-template list: Head selects T1, Tail holds T2..T42.
+  typedef TemplateSel<T1> Head;
+  typedef Templates41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43>
+struct Templates43 {  // 43-template list: Head selects T1, Tail holds T2..T43.
+  typedef TemplateSel<T1> Head;
+  typedef Templates42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
+struct Templates44 {  // 44-template list: Head selects T1, Tail holds T2..T44.
+  typedef TemplateSel<T1> Head;
+  typedef Templates43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
+struct Templates45 {  // 45-template list: Head selects T1, Tail holds T2..T45.
+  typedef TemplateSel<T1> Head;
+  typedef Templates44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46>
+struct Templates46 {  // 46-entry list: Head is entry 1, Tail the other 45.
+  typedef TemplateSel<T1> Head;  // T1, wrapped in TemplateSel.
+  typedef Templates45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46> Tail;  // T2..T46 as a Templates45.
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
+struct Templates47 {  // 47-entry list: Head is entry 1, Tail the other 46.
+  typedef TemplateSel<T1> Head;  // T1, wrapped in TemplateSel.
+  typedef Templates46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47> Tail;  // T2..T47 as a Templates46.
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
+struct Templates48 {  // 48-entry list: Head is entry 1, Tail the other 47.
+  typedef TemplateSel<T1> Head;  // T1, wrapped in TemplateSel.
+  typedef Templates47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48> Tail;  // T2..T48 as a Templates47.
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49>
+struct Templates49 {  // 49-entry list: Head is entry 1, Tail the other 48.
+  typedef TemplateSel<T1> Head;  // T1, wrapped in TemplateSel.
+  typedef Templates48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48, T49> Tail;  // T2..T49 as a Templates48.
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49, GTEST_TEMPLATE_ T50>
+struct Templates50 {  // 50-entry list: Head is entry 1, Tail the other 49.
+  typedef TemplateSel<T1> Head;  // T1, wrapped in TemplateSel.
+  typedef Templates49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48, T49, T50> Tail;  // T2..T50 as a Templates49.
+};
+
+
+// We don't want to require the users to write TemplatesN<...> directly,
+// as that would require them to count the length.  Templates<...> is much
+// easier to write, but generates horrible messages when there is a
+// compiler error, as gcc insists on printing out each template
+// argument, even if it has the default value (this means Templates<list>
+// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler
+// errors).
+//
+// Our solution is to combine the best part of the two approaches: a
+// user would write Templates<T1, ..., TN>, and Google Test will translate
+// that to TemplatesN<T1, ..., TN> internally to make error messages
+// readable.  The translation is done by the 'type' member of the
+// Templates template.
+template <GTEST_TEMPLATE_ T1 = NoneT, GTEST_TEMPLATE_ T2 = NoneT,
+    GTEST_TEMPLATE_ T3 = NoneT, GTEST_TEMPLATE_ T4 = NoneT,
+    GTEST_TEMPLATE_ T5 = NoneT, GTEST_TEMPLATE_ T6 = NoneT,
+    GTEST_TEMPLATE_ T7 = NoneT, GTEST_TEMPLATE_ T8 = NoneT,
+    GTEST_TEMPLATE_ T9 = NoneT, GTEST_TEMPLATE_ T10 = NoneT,
+    GTEST_TEMPLATE_ T11 = NoneT, GTEST_TEMPLATE_ T12 = NoneT,
+    GTEST_TEMPLATE_ T13 = NoneT, GTEST_TEMPLATE_ T14 = NoneT,
+    GTEST_TEMPLATE_ T15 = NoneT, GTEST_TEMPLATE_ T16 = NoneT,
+    GTEST_TEMPLATE_ T17 = NoneT, GTEST_TEMPLATE_ T18 = NoneT,
+    GTEST_TEMPLATE_ T19 = NoneT, GTEST_TEMPLATE_ T20 = NoneT,
+    GTEST_TEMPLATE_ T21 = NoneT, GTEST_TEMPLATE_ T22 = NoneT,
+    GTEST_TEMPLATE_ T23 = NoneT, GTEST_TEMPLATE_ T24 = NoneT,
+    GTEST_TEMPLATE_ T25 = NoneT, GTEST_TEMPLATE_ T26 = NoneT,
+    GTEST_TEMPLATE_ T27 = NoneT, GTEST_TEMPLATE_ T28 = NoneT,
+    GTEST_TEMPLATE_ T29 = NoneT, GTEST_TEMPLATE_ T30 = NoneT,
+    GTEST_TEMPLATE_ T31 = NoneT, GTEST_TEMPLATE_ T32 = NoneT,
+    GTEST_TEMPLATE_ T33 = NoneT, GTEST_TEMPLATE_ T34 = NoneT,
+    GTEST_TEMPLATE_ T35 = NoneT, GTEST_TEMPLATE_ T36 = NoneT,
+    GTEST_TEMPLATE_ T37 = NoneT, GTEST_TEMPLATE_ T38 = NoneT,
+    GTEST_TEMPLATE_ T39 = NoneT, GTEST_TEMPLATE_ T40 = NoneT,
+    GTEST_TEMPLATE_ T41 = NoneT, GTEST_TEMPLATE_ T42 = NoneT,
+    GTEST_TEMPLATE_ T43 = NoneT, GTEST_TEMPLATE_ T44 = NoneT,
+    GTEST_TEMPLATE_ T45 = NoneT, GTEST_TEMPLATE_ T46 = NoneT,
+    GTEST_TEMPLATE_ T47 = NoneT, GTEST_TEMPLATE_ T48 = NoneT,
+    GTEST_TEMPLATE_ T49 = NoneT, GTEST_TEMPLATE_ T50 = NoneT>
+struct Templates {  // Primary template: every slot defaults to NoneT.
+  typedef Templates50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48, T49, T50> type;  // All 50 forwarded.
+};
+
+template <>  // Matches when all 50 slots are NoneT.
+struct Templates<NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates0 type;
+};
+template <GTEST_TEMPLATE_ T1>  // Matches when T2..T50 are NoneT.
+struct Templates<T1, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates1<T1> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>  // T3..T50 are NoneT.
+struct Templates<T1, T2, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates2<T1, T2> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>  // T4..T50 are NoneT.
+struct Templates<T1, T2, T3, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates3<T1, T2, T3> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4>  // Matches when T5..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates4<T1, T2, T3, T4> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>  // T6..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates5<T1, T2, T3, T4, T5> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>  // T7..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates6<T1, T2, T3, T4, T5, T6> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7>  // Matches when T8..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates7<T1, T2, T3, T4, T5, T6, T7> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>  // T9..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates8<T1, T2, T3, T4, T5, T6, T7, T8> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>  // T10..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10>  // Matches when T11..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>  // T12..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>  // T13..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13>  // Matches when T14..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>  // T15..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>  // T16..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16>  // Matches when T17..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>  // T18..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>  // T19..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19>  // Matches when T20..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>  // T21..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>  // T22..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22>  // Matches when T23..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>  // T24..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>  // T25..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25>  // Matches when T26..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>  // T27..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>  // T28..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28>  // Matches when T29..T50 are NoneT.
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29> type;
+};  // T1..T29 followed only by NoneT: type is Templates29<T1..T29>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30> type;
+};  // T1..T30 followed only by NoneT: type is Templates30<T1..T30>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31> type;
+};  // T1..T31 followed only by NoneT: type is Templates31<T1..T31>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32> type;
+};  // T1..T32 followed only by NoneT: type is Templates32<T1..T32>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33> type;
+};  // T1..T33 followed only by NoneT: type is Templates33<T1..T33>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34> type;
+};  // T1..T34 followed only by NoneT: type is Templates34<T1..T34>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35> type;
+};  // T1..T35 followed only by NoneT: type is Templates35<T1..T35>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
+};  // T1..T36 followed only by NoneT: type is Templates36<T1..T36>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
+};  // T1..T37 followed only by NoneT: type is Templates37<T1..T37>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
+};  // T1..T38 followed only by NoneT: type is Templates38<T1..T38>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
+};  // T1..T39 followed only by NoneT: type is Templates39<T1..T39>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> type;
+};  // T1..T40 followed only by NoneT: type is Templates40<T1..T40>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41> type;
+};  // T1..T41 followed only by NoneT: type is Templates41<T1..T41>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42> type;
+};  // T1..T42 followed only by NoneT: type is Templates42<T1..T42>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43> type;
+};  // T1..T43 followed only by NoneT: type is Templates43<T1..T43>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44> type;
+};  // T1..T44 followed only by NoneT: type is Templates44<T1..T44>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45> type;
+};  // T1..T45 followed only by NoneT: type is Templates45<T1..T45>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46> type;
+};  // T1..T46 followed only by NoneT: type is Templates46<T1..T46>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, NoneT, NoneT, NoneT> {
+  typedef Templates47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47> type;
+};  // T1..T47 followed only by NoneT: type is Templates47<T1..T47>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, T48, NoneT, NoneT> {
+  typedef Templates48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48> type;
+};  // T1..T48 followed only by NoneT: type is Templates48<T1..T48>
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, T48, T49, NoneT> {
+  typedef Templates49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48, T49> type;
+};  // T1..T49 followed only by NoneT: type is Templates49<T1..T49>
+
+// The TypeList template makes it possible to use either a single type
+// or a Types<...> list in TYPED_TEST_CASE() and
+// INSTANTIATE_TYPED_TEST_CASE_P().
+
+template <typename T>
+struct TypeList {  // primary template: T is treated as one single type
+  typedef Types1<T> type;  // wrap the lone type as Types1<T>
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+struct TypeList<Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48, T49, T50> > {
+  typedef typename Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>::type type;
+};  // for a Types<...> argument, type is that list's own nested ::type
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+// Due to C++ preprocessor weirdness, we need double indirection to
+// concatenate two tokens when one of them is __LINE__.  Writing
+//   foo ## __LINE__
+// yields the token foo__LINE__ rather than foo followed by the current
+// line number.  GTEST_CONCAT_TOKEN_ is the entry point: it only forwards
+// to GTEST_CONCAT_TOKEN_IMPL_, which performs the actual ## pasting, so
+// macro arguments are expanded before being joined.  For details, see
+// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
+#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
+
+class ProtocolMessage;
+namespace proto2 { class Message; }
+
+namespace testing {
+
+// Forward declarations.
+
+class AssertionResult;                 // Result of an assertion.
+class Message;                         // Represents a failure message.
+class Test;                            // Represents a test.
+class TestInfo;                        // Information about a test.
+class TestPartResult;                  // Result of a test part.
+class UnitTest;                        // A collection of test cases.
+
+template <typename T>
+::std::string PrintToString(const T& value);
+
+namespace internal {
+
+struct TraceInfo;                      // Information about a trace point.
+class ScopedTrace;                     // Implements scoped trace.
+class TestInfoImpl;                    // Opaque implementation of TestInfo
+class UnitTestImpl;                    // Opaque implementation of UnitTest
+
+// How many times InitGoogleTest() has been called.
+GTEST_API_ extern int g_init_gtest_count;
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+GTEST_API_ extern const char kStackTraceMarker[];
+
+// Two overloaded helpers for checking at compile time whether an
+// expression is a null pointer literal (i.e. NULL or any 0-valued
+// compile-time integral constant).  Their return values have
+// different sizes, so we can use sizeof() to test which version is
+// picked by the compiler.  These helpers have no implementations, as
+// we only need their signatures.
+//
+// Given IsNullLiteralHelper(x), the compiler will pick the first
+// version if x can be implicitly converted to Secret*, and pick the
+// second version otherwise.  Since Secret is a secret and incomplete
+// type, the only expression a user can write that has type Secret* is
+// a null pointer literal.  Therefore, we know that x is a null
+// pointer literal if and only if the first version is picked by the
+// compiler.
+char IsNullLiteralHelper(Secret* p);
+char (&IsNullLiteralHelper(...))[2];  // NOLINT
+
+// A compile-time bool constant that is true if and only if x is a
+// null pointer literal (i.e. NULL or any 0-valued compile-time
+// integral constant).
+#ifdef GTEST_ELLIPSIS_NEEDS_POD_
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_IS_NULL_LITERAL_(x) false
+#else
+# define GTEST_IS_NULL_LITERAL_(x) \
+    (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1)
+#endif  // GTEST_ELLIPSIS_NEEDS_POD_
+
+// Appends the user-supplied message to the Google-Test-generated message.
+GTEST_API_ std::string AppendUserMessage(
+    const std::string& gtest_msg, const Message& user_msg);
+
+#if GTEST_HAS_EXCEPTIONS
+
+// This exception is thrown by (and only by) a failed Google Test
+// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
+// are enabled).  We derive it from std::runtime_error, which is for
+// errors presumably detectable only at run time.  Since
+// std::runtime_error inherits from std::exception, many testing
+// frameworks know how to extract and print the message inside it.
+class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
+ public:
+  explicit GoogleTestFailureException(const TestPartResult& failure);
+};
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// A helper class for creating scoped traces in user programs.
+class GTEST_API_ ScopedTrace {
+ public:
+  // The c'tor pushes the given source file location and message onto
+  // a trace stack maintained by Google Test.
+  ScopedTrace(const char* file, int line, const Message& message);
+
+  // The d'tor pops the info pushed by the c'tor.
+  //
+  // Note that the d'tor is not virtual in order to be efficient.
+  // Don't inherit from ScopedTrace!
+  ~ScopedTrace();
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
+} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
+                            // c'tor and d'tor.  Therefore it doesn't
+                            // need to be used otherwise.
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   expected_expression: "foo"
+//   actual_expression:   "bar"
+//   expected_value:      "5"
+//   actual_value:        "6"
+//
+// The ignoring_case parameter is true iff the assertion is a
+// *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
+// be inserted into the message.
+GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
+                                     const char* actual_expression,
+                                     const std::string& expected_value,
+                                     const std::string& actual_value,
+                                     bool ignoring_case);
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+GTEST_API_ std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result,
+    const char* expression_text,
+    const char* actual_predicate_value,
+    const char* expected_predicate_value);
+
+// This template class represents an IEEE floating-point number
+// (either single-precision or double-precision, depending on the
+// template parameters).
+//
+// The purpose of this class is to do more sophisticated number
+// comparison.  (Due to round-off error, etc, it's very unlikely that
+// two floating-points will be equal exactly.  Hence a naive
+// comparison by the == operation often doesn't work.)
+//
+// Format of IEEE floating-point:
+//
+//   The most-significant bit being the leftmost, an IEEE
+//   floating-point looks like
+//
+//     sign_bit exponent_bits fraction_bits
+//
+//   Here, sign_bit is a single bit that designates the sign of the
+//   number.
+//
+//   For float, there are 8 exponent bits and 23 fraction bits.
+//
+//   For double, there are 11 exponent bits and 52 fraction bits.
+//
+//   More details can be found at
+//   http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+template <typename RawType>
+class FloatingPoint {
+ public:
+  // Defines the unsigned integer type that has the same size as the
+  // floating point number.
+  typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
+
+  // Constants.
+
+  // # of bits in a number.
+  static const size_t kBitCount = 8*sizeof(RawType);
+
+  // # of fraction bits in a number.
+  static const size_t kFractionBitCount =
+    std::numeric_limits<RawType>::digits - 1;
+
+  // # of exponent bits in a number.
+  static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
+
+  // The mask for the sign bit.
+  static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
+
+  // The mask for the fraction bits.
+  static const Bits kFractionBitMask =
+    ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
+
+  // The mask for the exponent bits.
+  static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
+
+  // How many ULP's (Units in the Last Place) we want to tolerate when
+  // comparing two numbers.  The larger the value, the more error we
+  // allow.  A 0 value means that two numbers must be exactly the same
+  // to be considered equal.
+  //
+  // The maximum error of a single floating-point operation is 0.5
+  // units in the last place.  On Intel CPU's, all floating-point
+  // calculations are done with 80-bit precision, while double has 64
+  // bits.  Therefore, 4 should be enough for ordinary use.
+  //
+  // See the following article for more details on ULP:
+  // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
+  static const size_t kMaxUlps = 4;
+
+  // Constructs a FloatingPoint from a raw floating-point number.
+  //
+  // On an Intel CPU, passing a non-normalized NAN (Not a Number)
+  // around may change its bits, although the new value is guaranteed
+  // to be also a NAN.  Therefore, don't expect this constructor to
+  // preserve the bits in x when x is a NAN.
+  explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
+
+  // Static methods
+
+  // Reinterprets a bit pattern as a floating-point number.
+  //
+  // This function is needed to test the AlmostEquals() method.
+  static RawType ReinterpretBits(const Bits bits) {
+    FloatingPoint fp(0);
+    fp.u_.bits_ = bits;
+    return fp.u_.value_;
+  }
+
+  // Returns the floating-point number that represents positive infinity.
+  static RawType Infinity() {
+    return ReinterpretBits(kExponentBitMask);
+  }
+
+  // Returns the maximum representable finite floating-point number.
+  static RawType Max();
+
+  // Non-static methods
+
+  // Returns the bits that represent this number.
+  const Bits &bits() const { return u_.bits_; }
+
+  // Returns the exponent bits of this number.
+  Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
+
+  // Returns the fraction bits of this number.
+  Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
+
+  // Returns the sign bit of this number.
+  Bits sign_bit() const { return kSignBitMask & u_.bits_; }
+
+  // Returns true iff this is NAN (not a number).
+  bool is_nan() const {
+    // It's a NAN if the exponent bits are all ones and the fraction
+    // bits are not entirely zeros.
+    return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
+  }
+
+  // Returns true iff this number is at most kMaxUlps ULP's away from
+  // rhs.  In particular, this function:
+  //
+  //   - returns false if either number is (or both are) NAN.
+  //   - treats really large numbers as almost equal to infinity.
+  //   - thinks +0.0 and -0.0 are 0 ULP's apart.
+  bool AlmostEquals(const FloatingPoint& rhs) const {
+    // The IEEE standard says that any comparison operation involving
+    // a NAN must return false.
+    if (is_nan() || rhs.is_nan()) return false;
+
+    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
+        <= kMaxUlps;
+  }
+
+ private:
+  // The data type used to store the actual floating-point number.
+  union FloatingPointUnion {
+    RawType value_;  // The raw floating-point number.
+    Bits bits_;      // The bits that represent the number.
+  };
+
+  // Converts an integer from the sign-and-magnitude representation to
+  // the biased representation.  More precisely, let N be 2 to the
+  // power of (kBitCount - 1), an integer x is represented by the
+  // unsigned number x + N.
+  //
+  // For instance,
+  //
+  //   -N + 1 (the most negative number representable using
+  //          sign-and-magnitude) is represented by 1;
+  //   0      is represented by N; and
+  //   N - 1  (the biggest number representable using
+  //          sign-and-magnitude) is represented by 2N - 1.
+  //
+  // Read http://en.wikipedia.org/wiki/Signed_number_representations
+  // for more details on signed number representations.
+  static Bits SignAndMagnitudeToBiased(const Bits &sam) {
+    if (kSignBitMask & sam) {
+      // sam represents a negative number.
+      return ~sam + 1;
+    } else {
+      // sam represents a positive number.
+      return kSignBitMask | sam;
+    }
+  }
+
+  // Given two numbers in the sign-and-magnitude representation,
+  // returns the distance between them as an unsigned number.
+  static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
+                                                     const Bits &sam2) {
+    const Bits biased1 = SignAndMagnitudeToBiased(sam1);
+    const Bits biased2 = SignAndMagnitudeToBiased(sam2);
+    return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
+  }
+
+  FloatingPointUnion u_;
+};
+
+// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
+// macro defined by <windows.h>.
+template <>
+inline float FloatingPoint<float>::Max() { return FLT_MAX; }
+template <>
+inline double FloatingPoint<double>::Max() { return DBL_MAX; }
+
+// Typedefs the instances of the FloatingPoint template class that we
+// care to use.
+typedef FloatingPoint<float> Float;
+typedef FloatingPoint<double> Double;
+
+// In order to catch the mistake of putting tests that use different
+// test fixture classes in the same test case, we need to assign
+// unique IDs to fixture classes and compare them.  The TypeId type is
+// used to hold such IDs.  The user should treat TypeId as an opaque
+// type: the only operation allowed on TypeId values is to compare
+// them for equality using the == operator.
+typedef const void* TypeId;
+
+template <typename T>
+class TypeIdHelper {
+ public:
+  // dummy_ must not have a const type.  Otherwise an overly eager
+  // compiler (e.g. MSVC 7.1 & 8.0) may try to merge
+  // TypeIdHelper<T>::dummy_ for different Ts as an "optimization".
+  static bool dummy_;
+};
+
+template <typename T>
+bool TypeIdHelper<T>::dummy_ = false;
+
+// GetTypeId<T>() returns the ID of type T.  Different values will be
+// returned for different types.  Calling the function twice with the
+// same type argument is guaranteed to return the same ID.
+template <typename T>
+TypeId GetTypeId() {
+  // The compiler is required to allocate a different
+  // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
+  // the template.  Therefore, the address of dummy_ is guaranteed to
+  // be unique.
+  return &(TypeIdHelper<T>::dummy_);
+}
+
+// Returns the type ID of ::testing::Test.  Always call this instead
+// of GetTypeId< ::testing::Test>() to get the type ID of
+// ::testing::Test, as the latter may give the wrong result due to a
+// suspected linker bug when compiling Google Test as a Mac OS X
+// framework.
+GTEST_API_ TypeId GetTestTypeId();
+
+// Defines the abstract factory interface that creates instances
+// of a Test object.
+class TestFactoryBase {
+ public:
+  virtual ~TestFactoryBase() {}
+
+  // Creates a test instance to run. The instance is both created and destroyed
+  // within TestInfoImpl::Run()
+  virtual Test* CreateTest() = 0;
+
+ protected:
+  TestFactoryBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
+};
+
+// This class provides an implementation of the TestFactoryBase interface.
+// It is used in TEST and TEST_F macros.
+template <class TestClass>
+class TestFactoryImpl : public TestFactoryBase {
+ public:
+  virtual Test* CreateTest() { return new TestClass; }
+};
+
+#if GTEST_OS_WINDOWS
+
+// Predicate-formatters for implementing the HRESULT checking macros
+// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
+// We pass a long instead of HRESULT to avoid causing an
+// include dependency for the HRESULT type.
+GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr,
+                                            long hr);  // NOLINT
+GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr,
+                                            long hr);  // NOLINT
+
+#endif  // GTEST_OS_WINDOWS
+
+// Types of SetUpTestCase() and TearDownTestCase() functions.
+typedef void (*SetUpTestCaseFunc)();
+typedef void (*TearDownTestCaseFunc)();
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_case_name:   name of the test case
+//   name:             name of the test
+//   type_param        the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param       text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test case
+//   tear_down_tc:     pointer to the function that tears down the test case
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
+    const char* test_case_name,
+    const char* name,
+    const char* type_param,
+    const char* value_param,
+    TypeId fixture_class_id,
+    SetUpTestCaseFunc set_up_tc,
+    TearDownTestCaseFunc tear_down_tc,
+    TestFactoryBase* factory);
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// State of the definition of a type-parameterized test case.
+class GTEST_API_ TypedTestCasePState {
+ public:
+  TypedTestCasePState() : registered_(false) {}
+
+  // Adds the given test name to defined_test_names_ and returns true
+  // if the test case hasn't been registered; otherwise aborts the
+  // program.
+  bool AddTestName(const char* file, int line, const char* case_name,
+                   const char* test_name) {
+    if (registered_) {
+      fprintf(stderr, "%s Test %s must be defined before "
+              "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n",
+              FormatFileLocation(file, line).c_str(), test_name, case_name);
+      fflush(stderr);
+      posix::Abort();
+    }
+    defined_test_names_.insert(test_name);
+    return true;
+  }
+
+  // Verifies that registered_tests match the test names in
+  // defined_test_names_; returns registered_tests if successful, or
+  // aborts the program otherwise.
+  const char* VerifyRegisteredTestNames(
+      const char* file, int line, const char* registered_tests);
+
+ private:
+  bool registered_;
+  ::std::set<const char*> defined_test_names_;
+};
+
+// Skips to the first non-space char after the first comma in 'str';
+// returns NULL if no comma is found in 'str'.
+inline const char* SkipComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  if (comma == NULL) {
+    return NULL;
+  }
+  while (IsSpace(*(++comma))) {}
+  return comma;
+}
+
+// Returns the prefix of 'str' before the first comma in it; returns
+// the entire string if it contains no comma.
+inline std::string GetPrefixUntilComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  return comma == NULL ? str : std::string(str, comma);
+}
+
+// TypeParameterizedTest<Fixture, TestSel, Types>::Register()
+// registers a list of type-parameterized tests with Google Test.  The
+// return value is insignificant - we just need to return something
+// such that we can call this function in a namespace scope.
+//
+// Implementation note: The GTEST_TEMPLATE_ macro declares a template
+// template parameter.  It's defined in gtest-type-util.h.
+template <GTEST_TEMPLATE_ Fixture, class TestSel, typename Types>
+class TypeParameterizedTest {
+ public:
+  // 'index' is the index of the test in the type list 'Types'
+  // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase,
+  // Types).  Valid values for 'index' are [0, N - 1] where N is the
+  // length of Types.
+  static bool Register(const char* prefix, const char* case_name,
+                       const char* test_names, int index) {
+    typedef typename Types::Head Type;
+    typedef Fixture<Type> FixtureClass;
+    typedef typename GTEST_BIND_(TestSel, Type) TestClass;
+
+    // First, registers the first type-parameterized test in the type
+    // list.
+    MakeAndRegisterTestInfo(
+        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/"
+         + StreamableToString(index)).c_str(),
+        GetPrefixUntilComma(test_names).c_str(),
+        GetTypeName<Type>().c_str(),
+        NULL,  // No value parameter.
+        GetTypeId<FixtureClass>(),
+        TestClass::SetUpTestCase,
+        TestClass::TearDownTestCase,
+        new TestFactoryImpl<TestClass>);
+
+    // Next, recurses (at compile time) with the tail of the type list.
+    return TypeParameterizedTest<Fixture, TestSel, typename Types::Tail>
+        ::Register(prefix, case_name, test_names, index + 1);
+  }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, class TestSel>
+class TypeParameterizedTest<Fixture, TestSel, Types0> {
+ public:
+  static bool Register(const char* /*prefix*/, const char* /*case_name*/,
+                       const char* /*test_names*/, int /*index*/) {
+    return true;
+  }
+};
+
+// TypeParameterizedTestCase<Fixture, Tests, Types>::Register()
+// registers *all combinations* of 'Tests' and 'Types' with Google
+// Test.  The return value is insignificant - we just need to return
+// something such that we can call this function in a namespace scope.
+template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
+class TypeParameterizedTestCase {
+ public:
+  static bool Register(const char* prefix, const char* case_name,
+                       const char* test_names) {
+    typedef typename Tests::Head Head;
+
+    // First, registers the first test in 'Tests' for each type in 'Types'.
+    TypeParameterizedTest<Fixture, Head, Types>::Register(
+        prefix, case_name, test_names, 0);
+
+    // Next, recurses (at compile time) with the tail of the test list.
+    return TypeParameterizedTestCase<Fixture, typename Tests::Tail, Types>
+        ::Register(prefix, case_name, SkipComma(test_names));
+  }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, typename Types>
+class TypeParameterizedTestCase<Fixture, Templates0, Types> {
+ public:
+  static bool Register(const char* /*prefix*/, const char* /*case_name*/,
+                       const char* /*test_names*/) {
+    return true;
+  }
+};
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
+    UnitTest* unit_test, int skip_count);
+
+// Helpers for suppressing warnings on unreachable code or constant
+// condition.
+
+// Always returns true.
+GTEST_API_ bool AlwaysTrue();
+
+// Always returns false.
+inline bool AlwaysFalse() { return !AlwaysTrue(); }
+
+// Helper for suppressing false warning from Clang on a const char*
+// variable declared in a conditional expression always being NULL in
+// the else branch.
+struct GTEST_API_ ConstCharPtr {
+  ConstCharPtr(const char* str) : value(str) {}
+  operator bool() const { return true; }
+  const char* value;
+};
+
+// A simple Linear Congruential Generator for generating random
+// numbers with a uniform distribution.  Unlike rand() and srand(), it
+// doesn't use global state (and therefore can't interfere with user
+// code).  Unlike rand_r(), it's portable.  An LCG isn't very random,
+// but it's good enough for our purposes.
+class GTEST_API_ Random {
+ public:
+  static const UInt32 kMaxRange = 1u << 31;
+
+  explicit Random(UInt32 seed) : state_(seed) {}
+
+  void Reseed(UInt32 seed) { state_ = seed; }
+
+  // Generates a random number from [0, range).  Crashes if 'range' is
+  // 0 or greater than kMaxRange.
+  UInt32 Generate(UInt32 range);
+
+ private:
+  UInt32 state_;
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
+};
+
+// Defining a variable of type CompileAssertTypesEqual<T1, T2> will cause a
+// compiler error iff T1 and T2 are different types.
+template <typename T1, typename T2>
+struct CompileAssertTypesEqual;
+
+template <typename T>
+struct CompileAssertTypesEqual<T, T> {
+};
+
+// Removes the reference from a type if it is a reference type,
+// otherwise leaves it unchanged.  This is the same as
+// tr1::remove_reference, which is not widely available yet.
+template <typename T>
+struct RemoveReference { typedef T type; };  // NOLINT
+template <typename T>
+struct RemoveReference<T&> { typedef T type; };  // NOLINT
+
+// A handy wrapper around RemoveReference that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_REFERENCE_(T) \
+    typename ::testing::internal::RemoveReference<T>::type
+
+// Removes const from a type if it is a const type, otherwise leaves
+// it unchanged.  This is the same as tr1::remove_const, which is not
+// widely available yet.
+template <typename T>
+struct RemoveConst { typedef T type; };  // NOLINT
+template <typename T>
+struct RemoveConst<const T> { typedef T type; };  // NOLINT
+
+// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above
+// definition to fail to remove the const in 'const int[3]' and 'const
+// char[3][4]'.  The following specialization works around the bug.
+template <typename T, size_t N>
+struct RemoveConst<const T[N]> {
+  typedef typename RemoveConst<T>::type type[N];
+};
+
+#if defined(_MSC_VER) && _MSC_VER < 1400
+// This is the only specialization that allows VC++ 7.1 to remove const in
+// 'const int[3]' and 'const int[3][4]'.  However, it causes trouble with GCC
+// and thus needs to be conditionally compiled.
+template <typename T, size_t N>
+struct RemoveConst<T[N]> {
+  typedef typename RemoveConst<T>::type type[N];
+};
+#endif
+
+// A handy wrapper around RemoveConst that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_CONST_(T) \
+    typename ::testing::internal::RemoveConst<T>::type
+
+// Turns const U&, U&, const U, and U all into U.
+#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
+    GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T))
+
+// Adds reference to a type if it is not a reference type,
+// otherwise leaves it unchanged.  This is the same as
+// tr1::add_reference, which is not widely available yet.
+template <typename T>
+struct AddReference { typedef T& type; };  // NOLINT
+template <typename T>
+struct AddReference<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper around AddReference that works when the argument T
+// depends on template parameters.
+#define GTEST_ADD_REFERENCE_(T) \
+    typename ::testing::internal::AddReference<T>::type
+
+// Adds a reference to const on top of T as necessary.  For example,
+// it transforms
+//
+//   char         ==> const char&
+//   const char   ==> const char&
+//   char&        ==> const char&
+//   const char&  ==> const char&
+//
+// The argument T must depend on some template parameters.
+#define GTEST_REFERENCE_TO_CONST_(T) \
+    GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T))
+
+// ImplicitlyConvertible<From, To>::value is a compile-time bool
+// constant that's true iff type From can be implicitly converted to
+// type To.
+template <typename From, typename To>
+class ImplicitlyConvertible {
+ private:
+  // We need the following helper functions only for their types.
+  // They have no implementations.
+
+  // MakeFrom() is an expression whose type is From.  We cannot simply
+  // use From(), as the type From may not have a public default
+  // constructor.
+  static From MakeFrom();
+
+  // These two functions are overloaded.  Given an expression
+  // Helper(x), the compiler will pick the first version if x can be
+  // implicitly converted to type To; otherwise it will pick the
+  // second version.
+  //
+  // The first version returns a value of size 1, and the second
+  // version returns a value of size 2.  Therefore, by checking the
+  // size of Helper(x), which can be done at compile time, we can tell
+  // which version of Helper() is used, and hence whether x can be
+  // implicitly converted to type To.
+  static char Helper(To);
+  static char (&Helper(...))[2];  // NOLINT
+
+  // We have to put the 'public' section after the 'private' section,
+  // or MSVC refuses to compile the code.
+ public:
+  // MSVC warns about implicitly converting from double to int for
+  // possible loss of data, so we need to temporarily disable the
+  // warning.
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4244)  // Temporarily disables warning 4244.
+
+  static const bool value =
+      sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1;
+# pragma warning(pop)           // Restores the warning state.
+#elif defined(__BORLANDC__)
+  // C++Builder cannot use member overload resolution during template
+  // instantiation.  The simplest workaround is to use its C++0x type traits
+  // functions (C++Builder 2009 and above only).
+  static const bool value = __is_convertible(From, To);
+#else
+  static const bool value =
+      sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1;
+#endif  // _MSC_VER
+};
+template <typename From, typename To>
+const bool ImplicitlyConvertible<From, To>::value;
+
+// IsAProtocolMessage<T>::value is a compile-time bool constant that's
+// true iff T is type ProtocolMessage, proto2::Message, or a subclass
+// of those.
+template <typename T>
+struct IsAProtocolMessage
+    : public bool_constant<
+  ImplicitlyConvertible<const T*, const ::ProtocolMessage*>::value ||
+  ImplicitlyConvertible<const T*, const ::proto2::Message*>::value> {
+};
+
+// When the compiler sees expression IsContainerTest<C>(0), if C is an
+// STL-style container class, the first overload of IsContainerTest
+// will be viable (since both C::iterator* and C::const_iterator* are
+// valid types and NULL can be implicitly converted to them).  It will
+// be picked over the second overload as 'int' is a perfect match for
+// the type of argument 0.  If C::iterator or C::const_iterator is not
+// a valid type, the first overload is not viable, and the second
+// overload will be picked.  Therefore, we can determine whether C is
+// a container class by checking the type of IsContainerTest<C>(0).
+// The value of the expression is insignificant.
+//
+// Note that we look for both C::iterator and C::const_iterator.  The
+// reason is that C++ injects the name of a class as a member of the
+// class itself (e.g. you can refer to class iterator as either
+// 'iterator' or 'iterator::iterator').  If we look for C::iterator
+// only, for example, we would mistakenly think that a class named
+// iterator is an STL container.
+//
+// Also note that the simpler approach of overloading
+// IsContainerTest(typename C::const_iterator*) and
+// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
+typedef int IsContainer;
+template <class C>
+IsContainer IsContainerTest(int /* dummy */,
+                            typename C::iterator* /* it */ = NULL,
+                            typename C::const_iterator* /* const_it */ = NULL) {
+  return 0;
+}
+
+typedef char IsNotContainer;
+template <class C>
+IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
+
+// EnableIf<condition>::type is void when 'condition' is true, and
+// undefined when 'condition' is false.  To use SFINAE to make a function
+// overload only apply when a particular expression is true, add
+// "typename EnableIf<expression>::type* = 0" as the last parameter.
+template<bool> struct EnableIf;
+template<> struct EnableIf<true> { typedef void type; };  // NOLINT
+
+// Utilities for native arrays.
+
+// ArrayEq() compares two k-dimensional native arrays using the
+// elements' operator==, where k can be any integer >= 0.  When k is
+// 0, ArrayEq() degenerates into comparing a single pair of values.
+
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs);  // Defined below.
+
+// This generic version is used when k is 0; it compares two values with ==.
+template <typename T, typename U>
+inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
+
+// This overload is used when k >= 1; both arrays must have N elements.
+template <typename T, typename U, size_t N>
+inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
+  return internal::ArrayEq(lhs, N, rhs);
+}
+
+// This helper reduces code bloat.  If we instead put its logic inside
+// the previous ArrayEq() function, arrays with different sizes would
+// lead to different copies of the template code.
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
+  for (size_t i = 0; i != size; i++) {
+    if (!internal::ArrayEq(lhs[i], rhs[i]))  // Elements may be arrays too.
+      return false;  // The first mismatch decides the result.
+  }
+  return true;
+}
+
+// Finds the first element in the iterator range [begin, end) that equals
+// elem (end if none does).  Element may be a native array type itself.
+template <typename Iter, typename Element>
+Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
+  for (Iter it = begin; it != end; ++it) {
+    if (internal::ArrayEq(*it, elem))
+      return it;
+  }
+  return end;  // No element compared equal to elem.
+}
+
+// CopyArray() copies a k-dimensional native array using the elements'
+// operator=, where k can be any integer >= 0.  When k is 0,
+// CopyArray() degenerates into copying a single value.
+
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to);  // Defined below.
+
+// This generic version is used when k is 0; it assigns a single value.
+template <typename T, typename U>
+inline void CopyArray(const T& from, U* to) { *to = from; }
+
+// This overload is used when k >= 1; 'to' points to an array of N elements.
+template <typename T, typename U, size_t N>
+inline void CopyArray(const T(&from)[N], U(*to)[N]) {
+  internal::CopyArray(from, N, *to);
+}
+
+// This helper reduces code bloat.  If we instead put its logic inside
+// the previous CopyArray() function, arrays with different sizes
+// would lead to different copies of the template code.
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to) {
+  for (size_t i = 0; i != size; i++) {
+    internal::CopyArray(from[i], to + i);  // Elements may be arrays too.
+  }
+}
+
+// The relation between a NativeArray object (see below) and the
+// native array it represents.
+enum RelationToSource {
+  kReference,  // The NativeArray references the native array.
+  kCopy        // The NativeArray makes a copy of the native array and
+               // owns the copy.
+};
+
+// Adapts a native array to a read-only STL-style container.  Instead
+// of the complete STL container concept, this adaptor only implements
+// members useful for Google Mock's container matchers.  New members
+// should be added as needed.  To simplify the implementation, we only
+// support Element being a raw type (i.e. having no top-level const or
+// reference modifier).  It's the client's responsibility to satisfy
+// this requirement.  Element can be an array type itself (hence
+// multi-dimensional arrays are supported).
+template <typename Element>
+class NativeArray {
+ public:
+  // STL-style container typedefs.
+  typedef Element value_type;
+  typedef Element* iterator;
+  typedef const Element* const_iterator;
+
+  // Constructs from a native array of 'count' elements.
+  NativeArray(const Element* array, size_t count, RelationToSource relation) {
+    Init(array, count, relation);
+  }
+
+  // Copy constructor; copies the elements iff rhs itself holds a copy.
+  NativeArray(const NativeArray& rhs) {
+    Init(rhs.array_, rhs.size_, rhs.relation_to_source_);
+  }
+
+  ~NativeArray() {
+    // Ensures that the user doesn't instantiate NativeArray with a
+    // const or reference type.
+    static_cast<void>(StaticAssertTypeEqHelper<Element,
+        GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>());
+    if (relation_to_source_ == kCopy)
+      delete[] array_;  // The copy was allocated by Init().
+  }
+
+  // STL-style container methods.
+  size_t size() const { return size_; }
+  const_iterator begin() const { return array_; }
+  const_iterator end() const { return array_ + size_; }
+  bool operator==(const NativeArray& rhs) const {  // Element-wise equality.
+    return size() == rhs.size() &&
+        ArrayEq(begin(), size(), rhs.begin());
+  }
+
+ private:
+  // Initializes this object; makes a copy of the input array if
+  // 'relation' is kCopy.
+  void Init(const Element* array, size_t a_size, RelationToSource relation) {
+    if (relation == kReference) {
+      array_ = array;
+    } else {
+      Element* const copy = new Element[a_size];
+      CopyArray(array, a_size, copy);
+      array_ = copy;
+    }
+    size_ = a_size;
+    relation_to_source_ = relation;
+  }
+
+  const Element* array_;  // Referenced or owned, per relation_to_source_.
+  size_t size_;  // Number of elements in array_.
+  RelationToSource relation_to_source_;
+
+  GTEST_DISALLOW_ASSIGN_(NativeArray);
+};
+
+}  // namespace internal
+}  // namespace testing
+
+#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
+  ::testing::internal::AssertHelper(result_type, file, line, message) \
+    = ::testing::Message()  // Callers may stream more text with <<.
+
+#define GTEST_MESSAGE_(message, result_type) \
+  GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)  // Use site.
+
+#define GTEST_FATAL_FAILURE_(message) \
+  return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure)
+
+#define GTEST_NONFATAL_FAILURE_(message) \
+  GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure)
+
+#define GTEST_SUCCESS_(message) \
+  GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess)
+
+// Suppresses MSVC warning 4702 (unreachable code) for the code following
+// statement if it returns or throws (or doesn't return or throw in some
+// situations).
+#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
+  if (::testing::internal::AlwaysTrue()) { statement; }
+
+#define GTEST_TEST_THROW_(statement, expected_exception, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::ConstCharPtr gtest_msg = "") { \
+    bool gtest_caught_expected = false; \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (expected_exception const&) { \
+      gtest_caught_expected = true; \
+    } \
+    catch (...) { \
+      gtest_msg.value = \
+          "Expected: " #statement " throws an exception of type " \
+          #expected_exception ".\n  Actual: it throws a different type."; \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+    } \
+    if (!gtest_caught_expected) { \
+      gtest_msg.value = \
+          "Expected: " #statement " throws an exception of type " \
+          #expected_exception ".\n  Actual: it throws nothing."; \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \
+      fail(gtest_msg.value)  // Failure path: target of the gotos above.
+
+#define GTEST_TEST_NO_THROW_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (...) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
+      fail("Expected: " #statement " doesn't throw an exception.\n" \
+           "  Actual: it throws.")  // Failure path: target of the goto.
+
+#define GTEST_TEST_ANY_THROW_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    bool gtest_caught_any = false; \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (...) { \
+      gtest_caught_any = true; \
+    } \
+    if (!gtest_caught_any) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
+      fail("Expected: " #statement " throws an exception.\n" \
+           "  Actual: it doesn't.")  // Failure path: target of the goto.
+
+
+// Implements Boolean test assertions such as EXPECT_TRUE. expression can be
+// either a boolean expression or an AssertionResult. text is a textual
+// representation of expression as it was passed into the EXPECT_TRUE.
+#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (const ::testing::AssertionResult gtest_ar_ = \
+      ::testing::AssertionResult(expression)) \
+    ; \
+  else \
+    fail(::testing::internal::GetBoolAssertionFailureMessage(\
+        gtest_ar_, text, #actual, #expected).c_str())
+
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
+      fail("Expected: " #statement " doesn't generate new fatal " \
+           "failures in the current thread.\n" \
+           "  Actual: it does.")  // Failure path: target of the goto.
+
+// Expands to the name of the class that implements the given test.
+#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+  test_case_name##_##test_name##_Test  // E.g. (Foo, Bar) -> Foo_Bar_Test.
+
+// Defines and registers a test class; the test body follows the macro.
+#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\
+class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\
+ public:\
+  GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\
+ private:\
+  virtual void TestBody();\
+  static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+      GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\
+};\
+\
+::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\
+  ::test_info_ =\
+    ::testing::internal::MakeAndRegisterTestInfo(\
+        #test_case_name, #test_name, NULL, NULL, \
+        (parent_id), \
+        parent_class::SetUpTestCase, \
+        parent_class::TearDownTestCase, \
+        new ::testing::internal::TestFactoryImpl<\
+            GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\
+void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the public API for death tests.  It is
+// #included by gtest.h so a user doesn't need to include this
+// directly.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines internal utilities needed for implementing
+// death tests.  They are subject to change without notice.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+
+#include <stdio.h>
+
+namespace testing {
+namespace internal {
+
+GTEST_DECLARE_string_(internal_run_death_test);
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kDeathTestStyleFlag[] = "death_test_style";
+const char kDeathTestUseFork[] = "death_test_use_fork";
+const char kInternalRunDeathTestFlag[] = "internal_run_death_test";
+
+#if GTEST_HAS_DEATH_TEST
+
+// DeathTest is a class that hides much of the complexity of the
+// GTEST_DEATH_TEST_ macro.  It is abstract; its static Create method
+// returns a concrete class that depends on the prevailing death test
+// style, as defined by the --gtest_death_test_style and/or
+// --gtest_internal_run_death_test flags.
+
+// In describing the results of death tests, these terms are used with
+// the corresponding definitions:
+//
+// exit status:  The integer exit information in the format specified
+//               by wait(2)
+// exit code:    The integer code passed to exit(3), _exit(2), or
+//               returned from main()
+class GTEST_API_ DeathTest {
+ public:
+  // Create returns false if there was an error determining the
+  // appropriate action to take for the current death test; for example,
+  // if the gtest_death_test_style flag is set to an invalid value.
+  // The LastMessage method will return a more detailed message in that
+  // case.  Otherwise, the DeathTest pointer pointed to by the "test"
+  // argument is set.  If the death test should be skipped, the pointer
+  // is set to NULL; otherwise, it is set to the address of a new concrete
+  // DeathTest object that controls the execution of the current test.
+  static bool Create(const char* statement, const RE* regex,
+                     const char* file, int line, DeathTest** test);
+  DeathTest();
+  virtual ~DeathTest() { }
+
+  // A helper class that aborts a death test when it's destroyed.
+  class ReturnSentinel {
+   public:
+    explicit ReturnSentinel(DeathTest* test) : test_(test) { }
+    ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+   private:
+    DeathTest* const test_;  // Not deleted by the sentinel.
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
+  } GTEST_ATTRIBUTE_UNUSED_;
+
+  // An enumeration of possible roles that may be taken when a death
+  // test is encountered.  EXECUTE_TEST means that the death test logic
+  // should be executed immediately.  OVERSEE_TEST means that the program
+  // should prepare the appropriate environment for a child process to
+  // execute the death test, then wait for it to complete.
+  enum TestRole { OVERSEE_TEST, EXECUTE_TEST };
+
+  // An enumeration of the three reasons that a test might be aborted.
+  enum AbortReason {
+    TEST_ENCOUNTERED_RETURN_STATEMENT,
+    TEST_THREW_EXCEPTION,
+    TEST_DID_NOT_DIE
+  };
+
+  // Assumes one of the above roles.
+  virtual TestRole AssumeRole() = 0;
+
+  // Waits for the death test to finish and returns its status.
+  virtual int Wait() = 0;
+
+  // Returns true if the death test passed; that is, the test process
+  // exited during the test, its exit status matches a user-supplied
+  // predicate, and its stderr output matches a user-supplied regular
+  // expression.
+  // The user-supplied predicate may be a macro expression rather
+  // than a function pointer or functor, or else Wait and Passed could
+  // be combined.
+  virtual bool Passed(bool exit_status_ok) = 0;
+
+  // Signals that the death test did not die as expected.
+  virtual void Abort(AbortReason reason) = 0;
+
+  // Returns a human-readable message describing the outcome of
+  // the last death test.
+  static const char* LastMessage();
+
+  static void set_last_death_test_message(const std::string& message);
+
+ private:
+  // A string containing a description of the outcome of the last death test.
+  static std::string last_death_test_message_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
+};
+
+// Factory interface for death tests.  May be mocked out for testing.
+class DeathTestFactory {
+ public:
+  virtual ~DeathTestFactory() { }  // Allows deletion through the interface.
+  virtual bool Create(const char* statement, const RE* regex,
+                      const char* file, int line, DeathTest** test) = 0;
+};
+
+// A concrete DeathTestFactory for normal use; Create() is defined elsewhere.
+class DefaultDeathTestFactory : public DeathTestFactory {
+ public:
+  virtual bool Create(const char* statement, const RE* regex,
+                      const char* file, int line, DeathTest** test);
+};
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
+
+// Traps C++ exceptions escaping statement and reports them as test failures
+// via Abort(TEST_THREW_EXCEPTION). Trapping SEH exceptions is not implemented.
+# if GTEST_HAS_EXCEPTIONS
+#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  try { \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+  } catch (const ::std::exception& gtest_exception) { \
+    fprintf(\
+        stderr, \
+        "\n%s: Caught std::exception-derived exception escaping the " \
+        "death test statement. Exception message: %s\n", \
+        ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
+        gtest_exception.what()); \
+    fflush(stderr); \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  } catch (...) { \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  }
+
+# else  // !GTEST_HAS_EXCEPTIONS
+#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+
+# endif  // GTEST_HAS_EXCEPTIONS
+
+// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
+// ASSERT_EXIT*, and EXPECT_EXIT*.
+# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    const ::testing::internal::RE& gtest_regex = (regex); \
+    ::testing::internal::DeathTest* gtest_dt; \
+    if (!::testing::internal::DeathTest::Create(#statement, &gtest_regex, \
+        __FILE__, __LINE__, &gtest_dt)) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+    } \
+    if (gtest_dt != NULL) { \
+      ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \
+          gtest_dt_ptr(gtest_dt); \
+      switch (gtest_dt->AssumeRole()) { \
+        case ::testing::internal::DeathTest::OVERSEE_TEST: \
+          if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \
+            goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+          } \
+          break; \
+        case ::testing::internal::DeathTest::EXECUTE_TEST: { \
+          ::testing::internal::DeathTest::ReturnSentinel \
+              gtest_sentinel(gtest_dt); \
+          GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \
+          gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \
+          break; \
+        } \
+        default: \
+          break; \
+      } \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \
+      fail(::testing::internal::DeathTest::LastMessage())
+// The symbol "fail" here expands to something into which a message
+// can be streamed; it is handed DeathTest::LastMessage() on failure.
+
+// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
+// NDEBUG mode. In this case we need the statements to be executed, the regex is
+// ignored, and the macro must accept a streamed message even though the message
+// is never printed.
+# define GTEST_EXECUTE_STATEMENT_(statement, regex) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+     GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+  } else \
+    ::testing::Message()  // Absorbs any message streamed by the caller.
+
+// A class representing the parsed contents of the
+// --gtest_internal_run_death_test flag, as it existed when
+// RUN_ALL_TESTS was called.
+class InternalRunDeathTestFlag {
+ public:
+  InternalRunDeathTestFlag(const std::string& a_file,
+                           int a_line,
+                           int an_index,
+                           int a_write_fd)
+      : file_(a_file), line_(a_line), index_(an_index),
+        write_fd_(a_write_fd) {}
+
+  ~InternalRunDeathTestFlag() {
+    if (write_fd_ >= 0)  // A negative value means there is nothing to close.
+      posix::Close(write_fd_);
+  }
+
+  const std::string& file() const { return file_; }
+  int line() const { return line_; }
+  int index() const { return index_; }
+  int write_fd() const { return write_fd_; }
+
+ private:
+  std::string file_;
+  int line_;
+  int index_;
+  int write_fd_;  // Closed by the destructor when non-negative.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag);
+};
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
+
+#else  // GTEST_HAS_DEATH_TEST
+
+// This macro is used for implementing macros such as
+// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
+// death tests are not supported. Those macros must compile on such systems
+// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on
+// systems that support death tests. This allows one to write such a macro
+// on a system that does not support death tests and be sure that it will
+// compile on a death-test supporting system.
+//
+// Parameters:
+//   statement -  A statement that a macro such as EXPECT_DEATH would test
+//                for program termination. This macro has to make sure this
+//                statement is compiled but not executed, to ensure that
+//                EXPECT_DEATH_IF_SUPPORTED compiles with a certain
+//                parameter iff EXPECT_DEATH compiles with it.
+//   regex     -  A regex that a macro such as EXPECT_DEATH would use to test
+//                the output of statement.  This parameter has to be
+//                compiled but not evaluated by this macro, to ensure that
+//                this macro only accepts expressions that a macro such as
+//                EXPECT_DEATH would accept.
+//   terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
+//                and a return statement for ASSERT_DEATH_IF_SUPPORTED.
+//                This ensures that ASSERT_DEATH_IF_SUPPORTED will not
+//                compile inside functions where ASSERT_DEATH doesn't
+//                compile.
+//
+//  The branch that has an always false condition is used to ensure that
+//  statement and regex are compiled (and thus syntactically correct) but
+//  never executed. The unreachable code macro protects the terminator
+//  statement from generating an 'unreachable code' warning in case
+//  statement unconditionally returns or throws. The Message constructor at
+//  the end allows the syntax of streaming additional messages into the
+//  macro, for compile-time compatibility with EXPECT_DEATH/ASSERT_DEATH.
+# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::AlwaysTrue()) { \
+      GTEST_LOG_(WARNING) \
+          << "Death tests are not supported on this platform.\n" \
+          << "Statement '" #statement "' cannot be verified."; \
+    } else if (::testing::internal::AlwaysFalse()) { \
+      ::testing::internal::RE::PartialMatch(".*", (regex)); \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+      terminator; \
+    } else \
+      ::testing::Message()
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+namespace testing {
+
+// This flag controls the style of death tests.  Valid values are "threadsafe",
+// meaning that the death test child process will re-execute the test binary
+// from the start, running only a single death test, or "fast",
+// meaning that the child process will execute the test logic immediately
+// after forking.
+GTEST_DECLARE_string_(death_test_style);
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process.  Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests.  IMPORTANT: This is an internal utility.  Using it may break the
+// implementation of death tests.  User code MUST NOT use it.
+GTEST_API_ bool InDeathTestChild();
+
+}  // namespace internal
+
+// The following macros are useful for writing death tests.
+
+// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
+// executed:
+//
+//   1. It generates a warning if there is more than one active
+//   thread.  This is because it's safe to fork() or clone() only
+//   when there is a single thread.
+//
+//   2. The parent process clone()s a sub-process and runs the death
+//   test in it; the sub-process exits with code 0 at the end of the
+//   death test, if it hasn't exited already.
+//
+//   3. The parent process waits for the sub-process to terminate.
+//
+//   4. The parent process checks the exit code and error message of
+//   the sub-process.
+//
+// Examples:
+//
+//   ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
+//   for (int i = 0; i < 5; i++) {
+//     EXPECT_DEATH(server.ProcessRequest(i),
+//                  "Invalid request .* in ProcessRequest()")
+//                  << "Failed to die on request " << i;
+//   }
+//
+//   ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
+//
+//   bool KilledBySIGHUP(int exit_code) {
+//     return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
+//   }
+//
+//   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
+//
+// On the regular expressions used in death tests:
+//
+//   On POSIX-compliant systems (*nix), we use the <regex.h> library,
+//   which uses the POSIX extended regex syntax.
+//
+//   On other platforms (e.g. Windows), we only support a simple regex
+//   syntax implemented as part of Google Test.  This limited
+//   implementation should be enough most of the time when writing
+//   death tests; though it lacks many features you can find in PCRE
+//   or POSIX extended regex syntax.  For example, we don't support
+//   union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
+//   repetition count ("x{5,7}"), among others.
+//
+//   Below is the syntax that we do support.  We chose it to be a
+//   subset of both PCRE and POSIX extended regex, so it's easy to
+//   learn wherever you come from.  In the following: 'A' denotes a
+//   literal character, period (.), or a single \\ escape sequence;
+//   'x' and 'y' denote regular expressions; 'm' and 'n' are for
+//   natural numbers.
+//
+//     c     matches any literal character c
+//     \\d   matches any decimal digit
+//     \\D   matches any character that's not a decimal digit
+//     \\f   matches \f
+//     \\n   matches \n
+//     \\r   matches \r
+//     \\s   matches any ASCII whitespace, including \n
+//     \\S   matches any character that's not a whitespace
+//     \\t   matches \t
+//     \\v   matches \v
+//     \\w   matches any letter, _, or decimal digit
+//     \\W   matches any character that \\w doesn't match
+//     \\c   matches any literal character c, which must be a punctuation
+//     .     matches any single character except \n
+//     A?    matches 0 or 1 occurrences of A
+//     A*    matches 0 or many occurrences of A
+//     A+    matches 1 or many occurrences of A
+//     ^     matches the beginning of a string (not that of each line)
+//     $     matches the end of a string (not that of each line)
+//     xy    matches x followed by y
+//
+//   If you accidentally use PCRE or POSIX extended regex features
+//   not implemented by us, you will get a run-time failure.  In that
+//   case, please try to rewrite your regular expression within the
+//   above syntax.
+//
+//   This implementation is *not* meant to be as highly tuned or robust
+//   as a compiled regex library, but should perform well enough for a
+//   death test, which already incurs significant overhead by launching
+//   a child process.
+//
+// Known caveats:
+//
+//   A "threadsafe" style death test obtains the path to the test
+//   program from argv[0] and re-executes it in the sub-process.  For
+//   simplicity, the current implementation doesn't search the PATH
+//   when launching the sub-process.  This means that the user must
+//   invoke the test program via a path that contains at least one
+//   path separator (e.g. path/to/foo_test and
+//   /absolute/path/to/bar_test are fine, but foo_test is not).  This
+//   is rarely a problem as people usually don't put the test binary
+//   directory in PATH.
+//
+// TODO(wan@google.com): make thread-safe death tests search the PATH.
+
+// Asserts that a given statement causes the program to exit, with an
+// integer exit status that satisfies predicate, and emitting error output
+// that matches regex.
+# define ASSERT_EXIT(statement, predicate, regex) \
+    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
+
+// Like ASSERT_EXIT, but a failure is non-fatal: the enclosing function
+// keeps running instead of returning.
+# define EXPECT_EXIT(statement, predicate, regex) \
+    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
+
+// Asserts that a given statement causes the program to exit, either by
+// explicitly exiting with a nonzero exit code or being killed by a
+// signal, and emitting error output that matches regex.
+# define ASSERT_DEATH(statement, regex) \
+    ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+
+// Like ASSERT_DEATH, but a failure is non-fatal: the enclosing function
+// keeps running instead of returning.
+# define EXPECT_DEATH(statement, regex) \
+    EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+
+// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
+
+// Tests that an exit status describes a normal exit with a given exit code.
+class GTEST_API_ ExitedWithCode {
+ public:
+  explicit ExitedWithCode(int exit_code);
+  bool operator()(int exit_status) const;
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ExitedWithCode& other);
+  // Exit code to compare against; const, so it is fixed at construction.
+  const int exit_code_;
+};
+
+# if !GTEST_OS_WINDOWS
+// Tests that an exit status describes an exit due to termination by a
+// given signal.
+class GTEST_API_ KilledBySignal {
+ public:
+  explicit KilledBySignal(int signum);
+  bool operator()(int exit_status) const;
+ private:
+  const int signum_;
+};
+# endif  // !GTEST_OS_WINDOWS
+
+// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
+// The death testing framework causes this to have interesting semantics,
+// since the side effects of the call are only visible in opt mode, and not
+// in debug mode.
+//
+// In practice, this can be used to test functions that utilize the
+// LOG(DFATAL) macro using the following style:
+//
+// int DieInDebugOr12(int* sideeffect) {
+//   if (sideeffect) {
+//     *sideeffect = 12;
+//   }
+//   LOG(DFATAL) << "death";
+//   return 12;
+// }
+//
+// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) {
+//   int sideeffect = 0;
+//   // Only asserts in dbg.
+//   EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death");
+//
+// #ifdef NDEBUG
+//   // opt-mode has sideeffect visible.
+//   EXPECT_EQ(12, sideeffect);
+// #else
+//   // dbg-mode no visible sideeffect.
+//   EXPECT_EQ(0, sideeffect);
+// #endif
+// }
+//
+// This will assert that DieInDebugOr12() crashes in debug
+// mode, usually due to a DCHECK or LOG(DFATAL), but returns the
+// appropriate fallback value (12 in this case) in opt mode. If you
+// need to test that a function has appropriate side-effects in opt
+// mode, include assertions against the side-effects.  A general
+// pattern for this is:
+//
+// EXPECT_DEBUG_DEATH({
+//   // Side-effects here will have an effect after this statement in
+//   // opt mode, but none in debug mode.
+//   EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
+// }, "death");
+//
+# ifdef NDEBUG
+// NDEBUG build: no death is expected; hand off to GTEST_EXECUTE_STATEMENT_.
+#  define EXPECT_DEBUG_DEATH(statement, regex) \
+  GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+#  define ASSERT_DEBUG_DEATH(statement, regex) \
+  GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+# else
+// Non-NDEBUG build: these are plain aliases for the regular death tests.
+#  define EXPECT_DEBUG_DEATH(statement, regex) \
+  EXPECT_DEATH(statement, regex)
+
+#  define ASSERT_DEBUG_DEATH(statement, regex) \
+  ASSERT_DEATH(statement, regex)
+
+# endif  // NDEBUG for EXPECT_DEBUG_DEATH
+#endif  // GTEST_HAS_DEATH_TEST
+
+// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
+// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
+// death tests are supported (GTEST_HAS_DEATH_TEST); otherwise they expand to
+// GTEST_UNSUPPORTED_DEATH_TEST_ and just issue a warning.  This is useful when
+// combining death test assertions with normal test assertions in one test.
+#if GTEST_HAS_DEATH_TEST
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+    EXPECT_DEATH(statement, regex)
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+    ASSERT_DEATH(statement, regex)
+#else
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+    GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, )
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+    GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return)
+#endif
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+// This file was GENERATED by command:
+//     pump.py gtest-param-test.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: vladl@google.com (Vlad Losev)
+//
+// Macros and functions for implementing parameterized tests
+// in Google C++ Testing Framework (Google Test)
+//
+// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+
+
+// Value-parameterized tests allow you to test your code with different
+// parameters without writing multiple copies of the same test.
+//
+// Here is how you use value-parameterized tests:
+
+#if 0
+
+// To write value-parameterized tests, first you should define a fixture
+// class. It is usually derived from testing::TestWithParam<T> (see below for
+// another inheritance scheme that's sometimes useful in more complicated
+// class hierarchies), where T is the type of your parameter values.
+// TestWithParam<T> is itself derived from testing::Test. T can be any
+// copyable type. If it's a raw pointer, you are responsible for managing the
+// lifespan of the pointed values.
+
+class FooTest : public ::testing::TestWithParam<const char*> {
+  // You can implement all the usual class fixture members here.
+};
+
+// Then, use the TEST_P macro to define as many parameterized tests
+// for this fixture as you want. The _P suffix is for "parameterized"
+// or "pattern", whichever you prefer to think.
+
+TEST_P(FooTest, DoesBlah) {
+  // Inside a test, access the test parameter with the GetParam() method
+  // of the TestWithParam<T> class:
+  EXPECT_TRUE(foo.Blah(GetParam()));
+  ...
+}
+
+TEST_P(FooTest, HasBlahBlah) {
+  ...
+}
+
+// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test
+// case with any set of parameters you want. Google Test defines a number
+// of functions for generating test parameters. They return what we call
+// (surprise!) parameter generators. Here is a  summary of them, which
+// are all in the testing namespace:
+//
+//
+//  Range(begin, end [, step]) - Yields values {begin, begin+step,
+//                               begin+step+step, ...}. The values do not
+//                               include end. step defaults to 1.
+//  Values(v1, v2, ..., vN)    - Yields values {v1, v2, ..., vN}.
+//  ValuesIn(container)        - Yields values from a C-style array, an STL
+//  ValuesIn(begin,end)          container, or an iterator range [begin, end).
+//  Bool()                     - Yields sequence {false, true}.
+//  Combine(g1, g2, ..., gN)   - Yields all combinations (the Cartesian product
+//                               for the math savvy) of the values generated
+//                               by the N generators.
+//
+// For more details, see comments at the definitions of these functions below
+// in this file.
+//
+// The following statement will instantiate tests from the FooTest test case
+// each with parameter values "meeny", "miny", and "moe".
+
+INSTANTIATE_TEST_CASE_P(InstantiationName,
+                        FooTest,
+                        Values("meeny", "miny", "moe"));
+
+// To distinguish different instances of the pattern, (yes, you
+// can instantiate it more than once) the first argument to the
+// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the
+// actual test case name. Remember to pick unique prefixes for different
+// instantiations. The tests from the instantiation above will have
+// these names:
+//
+//    * InstantiationName/FooTest.DoesBlah/0 for "meeny"
+//    * InstantiationName/FooTest.DoesBlah/1 for "miny"
+//    * InstantiationName/FooTest.DoesBlah/2 for "moe"
+//    * InstantiationName/FooTest.HasBlahBlah/0 for "meeny"
+//    * InstantiationName/FooTest.HasBlahBlah/1 for "miny"
+//    * InstantiationName/FooTest.HasBlahBlah/2 for "moe"
+//
+// You can use these names in --gtest_filter.
+//
+// This statement will instantiate all tests from FooTest again, each
+// with parameter values "cat" and "dog":
+
+const char* pets[] = {"cat", "dog"};
+INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
+
+// The tests from the instantiation above will have these names:
+//
+//    * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat"
+//    * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog"
+//    * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
+//    * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
+//
+// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests
+// in the given test case, whether their definitions come before or
+// AFTER the INSTANTIATE_TEST_CASE_P statement.
+//
+// Please also note that generator expressions (including parameters to the
+// generators) are evaluated in InitGoogleTest(), after main() has started.
+// This allows the user on one hand, to adjust generator parameters in order
+// to dynamically determine a set of tests to run and on the other hand,
+// give the user a chance to inspect the generated tests with Google Test
+// reflection API before RUN_ALL_TESTS() is executed.
+//
+// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc
+// for more examples.
+//
+// In the future, we plan to publish the API for defining new parameter
+// generators. But for now this interface remains part of the internal
+// implementation and is subject to change.
+//
+//
+// A parameterized test fixture must be derived from testing::Test and from
+// testing::WithParamInterface<T>, where T is the type of the parameter
+// values. Inheriting from TestWithParam<T> satisfies that requirement because
+// TestWithParam<T> inherits from both Test and WithParamInterface. In more
+// complicated hierarchies, however, it is occasionally useful to inherit
+// separately from Test and WithParamInterface. For example:
+
+class BaseTest : public ::testing::Test {
+  // You can inherit all the usual members for a non-parameterized test
+  // fixture here.
+};
+
+class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> {
+  // The usual test fixture members go here too.
+};
+
+TEST_F(BaseTest, HasFoo) {
+  // This is an ordinary non-parameterized test.
+}
+
+TEST_P(DerivedTest, DoesBlah) {
+  // GetParam works just the same here as if you inherit from TestWithParam.
+  EXPECT_TRUE(foo.Blah(GetParam()));
+}
+
+#endif  // 0
+
+
+#if !GTEST_OS_SYMBIAN
+# include <utility>
+#endif
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*.  Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vladl@google.com (Vlad Losev)
+
+// Type and function utilities for implementing parameterized tests.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+
+#include <iterator>
+#include <utility>
+#include <vector>
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*.  Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+// Copyright 2003 Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Dan Egnor (egnor@google.com)
+//
+// A "smart" pointer type with reference tracking.  Every pointer to a
+// particular object is kept on a circular linked list.  When the last pointer
+// to an object is destroyed or reassigned, the object is deleted.
+//
+// Used properly, this deletes the object when the last reference goes away.
+// There are several caveats:
+// - Like all reference counting schemes, cycles lead to leaks.
+// - Each smart pointer is actually two pointers (8 bytes instead of 4).
+// - Every time a pointer is assigned, the entire list of pointers to that
+//   object is traversed.  This class is therefore NOT SUITABLE when there
+//   will often be more than two or three pointers to a particular object.
+// - References are only tracked as long as linked_ptr<> objects are copied.
+//   If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS
+//   will happen (double deletion).
+//
+// A good use of this class is storing object references in STL containers.
+// You can safely put linked_ptr<> in a vector<>.
+// Other uses may not be as good.
+//
+// Note: If you use an incomplete type with linked_ptr<>, the class
+// *containing* linked_ptr<> must have a constructor and destructor (even
+// if they do nothing!).
+//
+// Bill Gibbons suggested we use something like this.
+//
+// Thread Safety:
+//   Unlike other linked_ptr implementations, in this implementation
+//   a linked_ptr object is thread-safe in the sense that:
+//     - it's safe to copy linked_ptr objects concurrently,
+//     - it's safe to copy *from* a linked_ptr and read its underlying
+//       raw pointer (e.g. via get()) concurrently, and
+//     - it's safe to write to two linked_ptrs that point to the same
+//       shared object concurrently.
+// TODO(wan@google.com): rename this to safe_linked_ptr to avoid
+// confusion with normal linked_ptr.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+
+#include <stdlib.h>
+#include <assert.h>
+
+
+namespace testing {
+namespace internal {
+
+// Protects the circular lists shared by all linked_ptr objects.
+GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// This is used internally by all instances of linked_ptr<>.  It needs to be
+// a non-template class because different types of linked_ptr<> can refer to
+// the same object (linked_ptr<Superclass>(obj) vs linked_ptr<Subclass>(obj)).
+// So, it needs to be possible for different types of linked_ptr to participate
+// in the same circular linked list, so we need a single class type here.
+//
+// DO NOT USE THIS CLASS DIRECTLY YOURSELF.  Use linked_ptr<T>.
+class linked_ptr_internal {
+ public:
+  // Starts a fresh circle whose only member is this object.
+  void join_new() {
+    next_ = this;
+  }
+
+  // join() and depart() rewrite the next_ field of *other* nodes in the
+  // circle, not just our own, so two such calls must never run
+  // concurrently.
+  //
+  // Nodes owned by different linked_ptr instantiations can share one
+  // circle (e.g. linked_ptr<Base>, linked_ptr<Derived1>, and
+  // linked_ptr<Derived2>), so a single mutex, g_linked_ptr_mutex,
+  // protects all linked_ptr objects.  That can create serious
+  // contention in production code, but is acceptable in a testing
+  // framework.
+
+  // Adds this node to the circle that ptr already belongs to.
+  void join(linked_ptr_internal const* ptr)
+      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+    MutexLock lock(&g_linked_ptr_mutex);
+    // Find the node that currently points at ptr and splice in after it.
+    linked_ptr_internal const* tail = ptr;
+    for (; tail->next_ != ptr; tail = tail->next_) {}
+    tail->next_ = this;
+    next_ = ptr;
+  }
+
+  // Removes this node from its circle.  Returns true if it was the only
+  // member left; afterwards the node may join() another circle.
+  bool depart()
+      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+    MutexLock lock(&g_linked_ptr_mutex);
+    // Pointing at ourselves means nobody else shares the object.
+    if (next_ == this) return true;
+    linked_ptr_internal const* prev = next_;
+    for (; prev->next_ != this; prev = prev->next_) {}
+    prev->next_ = next_;
+    return false;
+  }
+
+ private:
+  mutable linked_ptr_internal const* next_;
+};
+
+template <typename T>
+class linked_ptr {
+ public:
+  typedef T element_type;
+
+  // Assumes ownership of a raw pointer (NULL by default).  Wrap the object
+  // as soon as possible after creating it.
+  explicit linked_ptr(T* ptr = NULL) { capture(ptr); }
+  ~linked_ptr() { depart(); }
+
+  // Copying shares ownership: the new object joins the source's circle.
+  template <typename U> linked_ptr(linked_ptr<U> const& ptr) { copy(&ptr); }
+  linked_ptr(linked_ptr const& ptr) {  // NOLINT
+    assert(&ptr != this);
+    copy(&ptr);
+  }
+
+  // Assignment gives up the current value, then shares the new one.
+  template <typename U> linked_ptr& operator=(linked_ptr<U> const& ptr) {
+    depart();
+    copy(&ptr);
+    return *this;
+  }
+
+  linked_ptr& operator=(linked_ptr const& ptr) {
+    // Self-assignment is a no-op.
+    if (&ptr == this) return *this;
+    depart();
+    copy(&ptr);
+    return *this;
+  }
+
+  // Smart pointer members.
+  void reset(T* ptr = NULL) {
+    depart();
+    capture(ptr);
+  }
+  T* get() const { return value_; }
+  T* operator->() const { return get(); }
+  T& operator*() const { return *get(); }
+
+  bool operator==(T* p) const { return get() == p; }
+  bool operator!=(T* p) const { return get() != p; }
+  template <typename U>
+  bool operator==(linked_ptr<U> const& ptr) const {
+    return get() == ptr.get();
+  }
+  template <typename U>
+  bool operator!=(linked_ptr<U> const& ptr) const {
+    return get() != ptr.get();
+  }
+
+ private:
+  template <typename U>
+  friend class linked_ptr;
+
+  T* value_;
+  linked_ptr_internal link_;
+
+  // Leaves the current circle; the last owner to leave deletes the object.
+  void depart() { if (link_.depart()) delete value_; }
+
+  // Stores ptr and starts a brand-new circle for it.
+  void capture(T* ptr) {
+    value_ = ptr;
+    link_.join_new();
+  }
+
+  template <typename U> void copy(linked_ptr<U> const* ptr) {
+    value_ = ptr->get();
+    // A null pointer is not shared and gets a circle of its own; anything
+    // else joins the circle of the linked_ptr being copied from.
+    if (!value_) { link_.join_new(); return; }
+    link_.join(&ptr->link_);
+  }
+};
+
+// Allows `raw == linked` with the raw pointer on the left-hand side.
+template <typename T> inline bool operator==(T* ptr, const linked_ptr<T>& x) {
+  return x.get() == ptr;
+}
+
+// Allows `raw != linked` with the raw pointer on the left-hand side.
+template <typename T> inline bool operator!=(T* ptr, const linked_ptr<T>& x) {
+  return x.get() != ptr;
+}
+
+// Wraps a raw T* in a linked_ptr<T>, deducing T from the argument.
+// E.g. make_linked_ptr(new FooBarBaz<type>(arg)) is a shorter notation
+// for linked_ptr<FooBarBaz<type> >(new FooBarBaz<type>(arg)).
+template <typename T>
+linked_ptr<T> make_linked_ptr(T* ptr) {
+  return linked_ptr<T>(ptr);
+}
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Google Test - The Google C++ Testing Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// A user can teach this function how to print a class type T by
+// defining either operator<<() or PrintTo() in the namespace that
+// defines T.  More specifically, the FIRST defined function in the
+// following list will be used (assuming T is defined in namespace
+// foo):
+//
+//   1. foo::PrintTo(const T&, ostream*)
+//   2. operator<<(ostream&, const T&) defined in either foo or the
+//      global namespace.
+//
+// If none of the above is defined, it will print the debug string of
+// the value if it is a protocol buffer, or print the raw bytes in the
+// value otherwise.
+//
+// To aid debugging: when T is a reference type, the address of the
+// value is also printed; when T is a (const) char pointer, both the
+// pointer value and the NUL-terminated string it points to are
+// printed.
+//
+// We also provide some convenient wrappers:
+//
+//   // Prints a value to a string.  For a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   std::string ::testing::PrintToString(const T& value);
+//
+//   // Prints a value tersely: for a reference type, the referenced
+//   // value (but not the address) is printed; for a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   void ::testing::internal::UniversalTersePrint(const T& value, ostream*);
+//
+//   // Prints value using the type inferred by the compiler.  The difference
+//   // from UniversalTersePrint() is that this function prints both the
+//   // pointer and the NUL-terminated string for a (const or not) char pointer.
+//   void ::testing::internal::UniversalPrint(const T& value, ostream*);
+//
+//   // Prints the fields of a tuple tersely to a string vector, one
+//   // element for each field. Tuple support must be enabled in
+//   // gtest-port.h.
+//   std::vector<string> UniversalTersePrintTupleFieldsToStrings(
+//       const Tuple& value);
+//
+// Known limitation:
+//
+// The print primitives print the elements of an STL-style container
+// using the compiler-inferred type of *iter where iter is a
+// const_iterator of the container.  When const_iterator is an input
+// iterator but not a forward iterator, this inferred type may not
+// match value_type, and the print output may be incorrect.  In
+// practice, this is rarely a problem as for most containers
+// const_iterator is a forward iterator.  We'll fix this if there's an
+// actual need for it.  Note that this fix cannot rely on value_type
+// being defined as many user-defined container types don't have
+// value_type.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace testing {
+
+// Definitions in the 'internal' and 'internal2' name spaces are
+// subject to change without notice.  DO NOT USE THEM IN USER CODE!
+namespace internal2 {
+
+// Prints `count` bytes, starting at `obj_bytes`, to the stream `*os`.
+// Declaration only; the output format is decided by the definition.
+GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
+                                     size_t count,
+                                     ::std::ostream* os);
+
+// For selecting which printer to use when a given type has neither <<
+// nor PrintTo().  Each value picks one TypeWithoutFormatter variant.
+enum TypeKind {
+  kProtobuf,              // a protobuf type
+  kConvertibleToInteger,  // a type implicitly convertible to BiggestInt
+                          // (e.g. a named or unnamed enum type)
+  kOtherType              // anything else
+};
+
+// TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is what the
+// universal printer falls back on for a type T that has neither operator<<
+// nor PrintTo(); kTypeKind is the "kind" of T as defined by enum TypeKind.
+template <typename T, TypeKind kTypeKind>
+class TypeWithoutFormatter {
+ public:
+  // Primary template, used for kOtherType: dumps the object's raw bytes.
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    const unsigned char* const raw_bytes =
+        reinterpret_cast<const unsigned char*>(&value);
+    PrintBytesInObjectTo(raw_bytes, sizeof(value), os);
+  }
+};
+
+// Length limit for the one-line form: a protobuf whose ShortDebugString()
+// is no longer than this many characters is printed on a single line;
+// longer ones are printed with DebugString() for better readability.
+const size_t kProtobufOneLinerMaxLength = 50;
+
+template <typename T>
+class TypeWithoutFormatter<T, kProtobuf> {
+ public:
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    // Prefer the compact form; fall back to the multi-line one if too long.
+    ::testing::internal::string text = value.ShortDebugString();
+    if (text.length() > kProtobufOneLinerMaxLength)
+      text = "\n" + value.DebugString();
+    *os << ("<" + text + ">");
+  }
+};
+
+template <typename T>
+class TypeWithoutFormatter<T, kConvertibleToInteger> {
+ public:
+  // T has neither a << operator nor PrintTo(), but it converts
+  // implicitly to BiggestInt, so the value is printed as a BiggestInt.
+  //
+  // T is most likely an enum type (named or unnamed), for which an
+  // integer is exactly what we want to see.  For any other T an integer
+  // is still the best we can do, given that T has no user-defined
+  // printer.
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    const internal::BiggestInt as_integer = value;
+    *os << as_integer;
+  }
+};
+
+// Prints the given value to the given ostream.  If the value is a
+// protocol message, its debug string is printed; if it's an enum or
+// of a type implicitly convertible to BiggestInt, it's printed as an
+// integer; otherwise the bytes in the value are printed.  This is
+// what UniversalPrinter<T>::Print() does when it knows nothing about
+// type T and T has neither << operator nor PrintTo().
+//
+// A user can override this behavior for a class type Foo by defining
+// a << operator in the namespace where Foo is defined.
+//
+// We put this operator in namespace 'internal2' instead of 'internal'
+// to simplify the implementation, as much code in 'internal' needs to
+// use << in STL, which would conflict with our own << were it defined
+// in 'internal'.
+//
+// Note that this operator<< takes a generic std::basic_ostream<Char,
+// CharTraits> type instead of the more restricted std::ostream.  If
+// we define it to take an std::ostream instead, we'll get an
+// "ambiguous overloads" compiler error when trying to print a type
+// Foo that supports streaming to std::basic_ostream<Char,
+// CharTraits>, as the compiler cannot tell whether
+// operator<<(std::ostream&, const T&) or
+// operator<<(std::basic_ostream<Char, CharTraits>&, const Foo&) is more
+// specific.
+template <typename Char, typename CharTraits, typename T>
+::std::basic_ostream<Char, CharTraits>& operator<<(
+    ::std::basic_ostream<Char, CharTraits>& os, const T& x) {
+  TypeWithoutFormatter<T,
+      (internal::IsAProtocolMessage<T>::value ? kProtobuf :
+       internal::ImplicitlyConvertible<const T&, internal::BiggestInt>::value ?
+       kConvertibleToInteger : kOtherType)>::PrintValue(x, &os);
+  return os;
+}
+
+}  // namespace internal2
+}  // namespace testing
+
+// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up
+// magic needed for implementing UniversalPrinter won't work.
+namespace testing_internal {
+
+// Used to print a value that is not an STL-style container when the
+// user doesn't define PrintTo() for it.
+template <typename T>
+void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) {
+  // With the following using-directive, during unqualified name lookup,
+  // testing::internal2::operator<< appears as if it were declared in
+  // the nearest enclosing namespace that contains both
+  // ::testing_internal and ::testing::internal2, i.e. the global
+  // namespace.  For more details, refer to the C++ Standard section
+  // 7.3.4-1 [namespace.udir].  This allows us to fall back onto
+  // testing::internal2::operator<< in case T doesn't come with a <<
+  // operator.
+  //
+  // We cannot write 'using ::testing::internal2::operator<<;', which
+  // gcc 3.3 fails to compile due to a compiler bug.
+  using namespace ::testing::internal2;  // NOLINT
+
+  // Assuming T is defined in namespace foo, in the next statement,
+  // the compiler will consider all of:
+  //
+  //   1. foo::operator<< (thanks to Koenig look-up),
+  //   2. ::operator<< (as the current namespace is enclosed in ::),
+  //   3. testing::internal2::operator<< (thanks to the using-directive above).
+  //
+  // The operator<< whose type matches T best will be picked.
+  //
+  // We deliberately allow #2 to be a candidate, as sometimes it's
+  // impossible to define #1 (e.g. when foo is ::std, defining
+  // anything in it is undefined behavior unless you are a compiler
+  // vendor).
+  *os << value;
+}
+
+}  // namespace testing_internal
+
+namespace testing {
+namespace internal {
+
+// UniversalPrinter<T>::Print(value, ostream_ptr) prints the given
+// value to the given ostream.  The caller must ensure that
+// 'ostream_ptr' is not NULL, or the behavior is undefined.
+//
+// We define UniversalPrinter as a class template (as opposed to a
+// function template), as we need to partially specialize it for
+// reference types, which cannot be done with function templates.
+template <typename T>
+class UniversalPrinter;
+
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os);
+
+// Fallback printer for STL-style containers that have no user-defined
+// PrintTo().  Emits "{ e0, e1, ... }", stopping after kMaxCount elements.
+template <typename C>
+void DefaultPrintTo(IsContainer /* dummy */,
+                    false_type /* is not a pointer */,
+                    const C& container, ::std::ostream* os) {
+  const size_t kMaxCount = 32;  // The maximum number of elements to print.
+  *os << '{';
+  size_t printed = 0;  // Number of elements written so far.
+  typename C::const_iterator cur = container.begin();
+  while (cur != container.end()) {
+    if (printed != 0) {
+      *os << ',';
+      if (printed == kMaxCount) {  // Enough has been printed.
+        *os << " ...";
+        break;
+      }
+    }
+    *os << ' ';
+    // UniversalPrint() rather than PrintTo(): PrintTo() cannot handle
+    // an element that is a native array.
+    internal::UniversalPrint(*cur, os);
+    ++cur;
+    ++printed;
+  }
+
+  if (printed != 0) *os << ' ';  // Pads the closing brace unless empty.
+  *os << '}';
+}
+
+// Used to print a pointer that is neither a char pointer nor a member
+// pointer, when the user doesn't define PrintTo() for it.  (A member
+// variable pointer or member function pointer doesn't really point to
+// a location in the address space.  Their representation is
+// implementation-defined.  Therefore they will be printed as raw
+// bytes.)
+template <typename T>
+void DefaultPrintTo(IsNotContainer /* dummy */,
+                    true_type /* is a pointer */,
+                    T* p, ::std::ostream* os) {
+  if (p == NULL) {
+    *os << "NULL";
+    return;
+  }
+  // C++ doesn't allow casting from a function pointer to any object
+  // pointer, so the two kinds of pointer are handled separately.
+  //
+  // IsTrue() silences warnings: "Condition is always true",
+  // "unreachable code".
+  const bool is_object_pointer =
+      IsTrue(ImplicitlyConvertible<T*, const void*>::value);
+  if (!is_object_pointer) {
+    // T is a function type, so '*os << p' doesn't do what we want
+    // (it just prints p as bool).  We want to print p as a const
+    // void*.  However, we cannot cast it to const void* directly,
+    // even using reinterpret_cast, as earlier versions of gcc
+    // (e.g. 3.4.5) cannot compile the cast when p is a function
+    // pointer.  Casting to UInt64 first solves the problem.
+    const internal::UInt64 address = reinterpret_cast<internal::UInt64>(p);
+    *os << reinterpret_cast<const void*>(address);
+  } else {
+    // T is not a function type: stream p, letting ADL find a user <<.
+    *os << p;
+  }
+}
+
+// Used to print a non-container, non-pointer value when the user
+// doesn't define PrintTo() for it.  Forwards to the <<-based fallback.
+template <typename T>
+void DefaultPrintTo(IsNotContainer /* dummy */,
+                    false_type /* is not a pointer */,
+                    const T& value, ::std::ostream* os) {
+  ::testing_internal::DefaultPrintNonContainerTo(value, os);
+}
+
+// Prints the given value using the << operator if it has one;
+// otherwise prints the bytes in it.  This is what
+// UniversalPrinter<T>::Print() does when PrintTo() is not specialized
+// or overloaded for type T.
+//
+// A user can override this behavior for a class type Foo by defining
+// an overload of PrintTo() in the namespace where Foo is defined.  We
+// give the user this option as sometimes defining a << operator for
+// Foo is not desirable (e.g. the coding style may prevent doing it,
+// or there is already a << operator but it doesn't do what the user
+// wants).
+template <typename T>
+void PrintTo(const T& value, ::std::ostream* os) {
+  // DefaultPrintTo() is overloaded.  The types of its first two
+  // arguments determine which version will be picked.  If T is an
+  // STL-style container, the version for container will be called; if
+  // T is a pointer, the pointer version will be called; otherwise the
+  // generic version will be called.
+  //
+  // Note that we check for container types here, before we check
+  // for protocol message types in our operator<<.  The rationale is:
+  //
+  // For protocol messages, we want to give people a chance to
+  // override Google Mock's format by defining a PrintTo() or
+  // operator<<.  For STL containers, other formats can be
+  // incompatible with Google Mock's format for the container
+  // elements; therefore we check for container types here to ensure
+  // that our format is used.
+  //
+  // The second argument of DefaultPrintTo() is needed to bypass a bug
+  // in Symbian's C++ compiler that prevents it from picking the right
+  // overload between:
+  //
+  //   PrintTo(const T& x, ...);
+  //   PrintTo(T* x, ...);
+  DefaultPrintTo(IsContainerTest<T>(0), is_pointer<T>(), value, os);
+}
+
+// The following list of PrintTo() overloads tells
+// UniversalPrinter<T>::Print() how to print standard types (built-in
+// types, strings, plain arrays, and pointers).
+
+// Overloads for various char types.
+GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os);
+GTEST_API_ void PrintTo(signed char c, ::std::ostream* os);
+inline void PrintTo(char c, ::std::ostream* os) {
+  // A plain char is always routed through the unsigned char overload,
+  // so the output does not depend on whether char is signed.
+  const unsigned char as_unsigned = static_cast<unsigned char>(c);
+  PrintTo(as_unsigned, os);
+}
+
+// Overload for bool: streams the word "true" or "false".
+inline void PrintTo(bool x, ::std::ostream* os) {
+  *os << (!x ? "false" : "true");
+}
+
+// Overload for wchar_t type.
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its decimal code (except for L'\0').
+// The L'\0' char is printed as "L'\\0'". The decimal code is printed
+// as signed integer when wchar_t is implemented by the compiler
+// as a signed type and is printed as an unsigned integer when wchar_t
+// is implemented as an unsigned type.
+GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
+
+// Overloads for C strings.
+GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
+inline void PrintTo(char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char*>(s), os);  // Reuses the const char* overload.
+}
+
+// signed/unsigned char is often used for representing binary data, so
+// we print pointers to it as void* (not as C strings) to be safe.
+inline void PrintTo(const signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+
+// MSVC can be configured to define wchar_t as a typedef of unsigned
+// short.  It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native
+// type.  When wchar_t is a typedef, defining an overload for const
+// wchar_t* would cause unsigned short* be printed as a wide string,
+// possibly causing invalid memory accesses.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Overloads for wide C strings.
+GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os);
+inline void PrintTo(wchar_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const wchar_t*>(s), os);  // Reuses the const overload.
+}
+#endif
+
+// Overload for C arrays.  Multi-dimensional arrays are printed
+// properly.
+
+// Prints the given number of elements in an array, separated by ", ",
+// without printing the curly braces.  Prints nothing when count is 0.
+template <typename T>
+void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
+  // Looping from 0 avoids reading a[0] (and overrunning) when count is 0.
+  for (size_t i = 0; i < count; i++) {
+    if (i != 0) *os << ", ";
+    UniversalPrint(a[i], os);
+  }
+}
+
+// Overloads for ::string and ::std::string; each forwards to PrintStringTo().
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os);
+inline void PrintTo(const ::string& s, ::std::ostream* os) {
+  PrintStringTo(s, os);
+}
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
+inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
+  PrintStringTo(s, os);
+}
+
+// Overloads for ::wstring and ::std::wstring, via PrintWideStringTo().
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os);
+inline void PrintTo(const ::wstring& s, ::std::ostream* os) {
+  PrintWideStringTo(s, os);
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
+inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
+  PrintWideStringTo(s, os);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_TR1_TUPLE
+// Overload for ::std::tr1::tuple.  Needed for printing function arguments,
+// which are packed as tuples.
+
+// Helper function for printing a tuple.  T must be instantiated with
+// a tuple type.
+template <typename T>
+void PrintTupleTo(const T& t, ::std::ostream* os);
+
+// Overloaded PrintTo() for tuples of various arities.  We support
+// tuples of up to 10 fields.  The following implementation works
+// regardless of whether tr1::tuple is implemented using the
+// non-standard variadic template feature or not.
+
+inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1>
+void PrintTo(const ::std::tr1::tuple<T1>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2>
+void PrintTo(const ::std::tr1::tuple<T1, T2>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8, typename T9>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8, typename T9, typename T10>
+void PrintTo(
+    const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>& t,
+    ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Overload for std::pair (T1/T2 may be references, hence UniversalPrinter).
+template <typename T1, typename T2>
+void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) {
+  typedef UniversalPrinter<T1> FirstPrinter;
+  typedef UniversalPrinter<T2> SecondPrinter;
+  *os << '(';
+  FirstPrinter::Print(value.first, os);
+  *os << ", ";
+  SecondPrinter::Print(value.second, os);
+  *os << ')';
+}
+
+// Implements printing a non-reference type T by letting the compiler
+// pick the right overload of PrintTo() for T.
+template <typename T>
+class UniversalPrinter {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4180)  // Temporarily disables warning 4180.
+#endif  // _MSC_VER
+
+  // Note: we deliberately don't call this PrintTo(), as that name
+  // conflicts with ::testing::internal::PrintTo in the body of the
+  // function.
+  static void Print(const T& value, ::std::ostream* os) {
+    // By default, ::testing::internal::PrintTo() is used for printing
+    // the value.
+    //
+    // Thanks to Koenig look-up, if T is a class and has its own
+    // PrintTo() function defined in its namespace, that function will
+    // be visible here.  Since it is more specific than the generic ones
+    // in ::testing::internal, it will be picked by the compiler in the
+    // following statement - exactly what we want.
+    PrintTo(value, os);  // Deliberately unqualified so the look-up above applies.
+  }
+
+#ifdef _MSC_VER
+# pragma warning(pop)           // Restores the warning state.
+#endif  // _MSC_VER
+};
+
+// UniversalPrintArray(begin, len, os) prints an array of 'len'
+// elements, starting at address 'begin', enclosed in curly braces.
+template <typename T>
+void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
+  if (len == 0) {
+    *os << "{}";
+    return;
+  }
+  // An array with more than kThreshold elements is abbreviated to its
+  // first and last kChunkSize elements.
+  // TODO(wan@google.com): let the user control the threshold using a flag.
+  const size_t kThreshold = 18;
+  const size_t kChunkSize = 8;
+  *os << "{ ";
+  if (len > kThreshold) {
+    const T* const tail = begin + len - kChunkSize;
+    PrintRawArrayTo(begin, kChunkSize, os);
+    *os << ", ..., ";
+    PrintRawArrayTo(tail, kChunkSize, os);
+  } else {
+    PrintRawArrayTo(begin, len, os);
+  }
+  *os << " }";
+}
+// This overload prints a (const) char array compactly.
+GTEST_API_ void UniversalPrintArray(
+    const char* begin, size_t len, ::std::ostream* os);
+
+// This overload prints a (const) wchar_t array compactly.
+GTEST_API_ void UniversalPrintArray(
+    const wchar_t* begin, size_t len, ::std::ostream* os);
+
+// Implements printing an array type T[N] via UniversalPrintArray().
+template <typename T, size_t N>
+class UniversalPrinter<T[N]> {
+ public:
+  // Prints the given array, omitting some elements when there are too
+  // many.
+  static void Print(const T (&a)[N], ::std::ostream* os) {
+    UniversalPrintArray(a, N, os);  // 'a' decays to a pointer to element 0.
+  }
+};
+
+// Implements printing a reference type T&: "@<address> " then the value.
+template <typename T>
+class UniversalPrinter<T&> {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4180)  // Temporarily disables warning 4180.
+#endif  // _MSC_VER
+
+  static void Print(const T& value, ::std::ostream* os) {
+    // Prints the address of the value.  We use reinterpret_cast here
+    // as static_cast doesn't compile when T is a function type.
+    *os << "@" << reinterpret_cast<const void*>(&value) << " ";
+
+    // Then prints the value itself.
+    UniversalPrint(value, os);
+  }
+
+#ifdef _MSC_VER
+# pragma warning(pop)           // Restores the warning state.
+#endif  // _MSC_VER
+};
+
+// Prints a value tersely: for a reference type, the referenced value
+// (but not the address) is printed; for a (const) char pointer, the
+// NUL-terminated string (but not the pointer) is printed.
+
+template <typename T>
+class UniversalTersePrinter {  // Default: same output as UniversalPrint().
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T>
+class UniversalTersePrinter<T&> {  // Reference: the value only, no address.
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T, size_t N>
+class UniversalTersePrinter<T[N]> {  // Array: same as UniversalPrinter<T[N]>.
+ public:
+  static void Print(const T (&value)[N], ::std::ostream* os) {
+    UniversalPrinter<T[N]>::Print(value, os);
+  }
+};
+template <>
+class UniversalTersePrinter<const char*> {
+ public:
+  static void Print(const char* str, ::std::ostream* os) {
+    if (str != NULL) {
+      UniversalPrint(string(str), os);
+    } else {
+      *os << "NULL";
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char*> {
+ public:
+  static void Print(char* str, ::std::ostream* os) {
+    UniversalTersePrinter<const char*>::Print(str, os);  // Same as const char*.
+  }
+};
+
+#if GTEST_HAS_STD_WSTRING
+template <>
+class UniversalTersePrinter<const wchar_t*> {
+ public:
+  static void Print(const wchar_t* str, ::std::ostream* os) {
+    if (str != NULL) {
+      UniversalPrint(::std::wstring(str), os);
+    } else {
+      *os << "NULL";
+    }
+  }
+};
+#endif
+
+template <>
+class UniversalTersePrinter<wchar_t*> {  // Not under GTEST_HAS_STD_WSTRING.
+ public:
+  static void Print(wchar_t* str, ::std::ostream* os) {
+    UniversalTersePrinter<const wchar_t*>::Print(str, os);
+  }
+};
+
+template <typename T>
+void UniversalTersePrint(const T& value, ::std::ostream* os) {
+  UniversalTersePrinter<T>::Print(value, os);  // T is deduced from the argument.
+}
+
+// Prints a value using the type inferred by the compiler.  The
+// difference between this and UniversalTersePrint() is that for a
+// (const) char pointer, this prints both the pointer and the
+// NUL-terminated string.
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os) {
+  // A workaround for the bug in VC++ 7.1 that prevents us from instantiating
+  // UniversalPrinter with T directly.
+  typedef T T1;
+  UniversalPrinter<T1>::Print(value, os);
+}
+
+#if GTEST_HAS_TR1_TUPLE
+typedef ::std::vector<string> Strings;
+
+// This helper template allows PrintTo() for tuples and
+// UniversalTersePrintTupleFieldsToStrings() to be defined by
+// induction on the number of tuple fields.  The idea is that
+// TuplePrefixPrinter<N>::PrintPrefixTo(t, os) prints the first N
+// fields in tuple t, and can be defined in terms of
+// TuplePrefixPrinter<N - 1>.
+
+// The inductive case: N >= 2 (0 and 1 are specialized below).
+template <size_t N>
+struct TuplePrefixPrinter {
+  // Prints the first N fields of a tuple.
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) {
+    TuplePrefixPrinter<N - 1>::PrintPrefixTo(t, os);
+    *os << ", ";
+    UniversalPrinter<typename ::std::tr1::tuple_element<N - 1, Tuple>::type>
+        ::Print(::std::tr1::get<N - 1>(t), os);
+  }
+
+  // Tersely prints the first N fields of a tuple to a string vector,
+  // one element for each field.
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) {
+    TuplePrefixPrinter<N - 1>::TersePrintPrefixToStrings(t, strings);
+    ::std::stringstream ss;
+    UniversalTersePrint(::std::tr1::get<N - 1>(t), &ss);
+    strings->push_back(ss.str());
+  }
+};
+
+// Base cases.  <0> does nothing; <1> handles one field, with no separator.
+template <>
+struct TuplePrefixPrinter<0> {
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple&, ::std::ostream*) {}
+
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple&, Strings*) {}
+};
+// We have to specialize the entire TuplePrefixPrinter<> class
+// template here, even though the definition of
+// TersePrintPrefixToStrings() is the same as the generic version, as
+// Embarcadero (formerly CodeGear, formerly Borland) C++ doesn't
+// support specializing a method template of a class template.
+template <>
+struct TuplePrefixPrinter<1> {
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) {
+    UniversalPrinter<typename ::std::tr1::tuple_element<0, Tuple>::type>::
+        Print(::std::tr1::get<0>(t), os);
+  }
+
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) {
+    ::std::stringstream ss;
+    UniversalTersePrint(::std::tr1::get<0>(t), &ss);
+    strings->push_back(ss.str());
+  }
+};
+
+// Prints every field of a tuple, comma-separated, inside parentheses.
+// T must be instantiated with a tuple type.
+template <typename T>
+void PrintTupleTo(const T& t, ::std::ostream* os) {
+  typedef TuplePrefixPrinter< ::std::tr1::tuple_size<T>::value> FieldsPrinter;
+  *os << "(";
+  FieldsPrinter::PrintPrefixTo(t, os);
+  *os << ")";
+}
+
+// Returns the terse printout of each field of a tuple, one string per
+// field, in field order.  See the comment before UniversalTersePrint()
+// for how we define "tersely".
+template <typename Tuple>
+Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
+  typedef TuplePrefixPrinter< ::std::tr1::tuple_size<Tuple>::value> Printer;
+  Strings fields;
+  Printer::TersePrintPrefixToStrings(value, &fields);
+  return fields;
+}
+#endif  // GTEST_HAS_TR1_TUPLE
+
+}  // namespace internal
+
+template <typename T>
+::std::string PrintToString(const T& value) {
+  ::std::stringstream ss;  // Collects the terse printout of value.
+  internal::UniversalTersePrinter<T>::Print(value, &ss);
+  return ss.str();
+}
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+namespace internal {
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Outputs a message explaining invalid registration of different
+// fixture class for the same test case. This may happen when
+// TEST_P macro is used to define two tests with the same name
+// but in different namespaces.
+GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name,
+                                          const char* file, int line);
+
+template <typename> class ParamGeneratorInterface;
+template <typename> class ParamGenerator;
+
+// Interface for iterating over elements provided by an implementation
+// of ParamGeneratorInterface<T>.
+template <typename T>
+class ParamIteratorInterface {
+ public:
+  virtual ~ParamIteratorInterface() {}
+  // A pointer to the base generator instance.
+  // Used only for the purposes of iterator comparison
+  // to make sure that two iterators belong to the same generator.
+  virtual const ParamGeneratorInterface<T>* BaseGenerator() const = 0;
+  // Advances iterator to point to the next element
+  // provided by the generator. The caller is responsible
+  // for not calling Advance() on an iterator equal to
+  // BaseGenerator()->End().
+  virtual void Advance() = 0;
+  // Clones the iterator object. Used for implementing copy semantics
+  // of ParamIterator<T>.
+  virtual ParamIteratorInterface* Clone() const = 0;
+  // Dereferences the current iterator and provides (read-only) access
+  // to the pointed value. It is the caller's responsibility not to call
+  // Current() on an iterator equal to BaseGenerator()->End().
+  // Used for implementing ParamIterator<T>::operator*().
+  virtual const T* Current() const = 0;
+  // Determines whether the given iterator and other point to the same
+  // element in the sequence generated by the generator.
+  // Used for implementing ParamIterator<T>::operator==().
+  virtual bool Equals(const ParamIteratorInterface& other) const = 0;
+};
+
+// Class iterating over elements provided by an implementation of
+// ParamGeneratorInterface<T>. It wraps ParamIteratorInterface<T>
+// and implements the const forward iterator concept.
+template <typename T>
+class ParamIterator {
+ public:
+  typedef T value_type;
+  typedef const T& reference;
+  typedef ptrdiff_t difference_type;
+
+  // ParamIterator owns impl_; copying clones it so each copy owns its own.
+  ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
+  ParamIterator& operator=(const ParamIterator& other) {
+    if (this != &other)
+      impl_.reset(other.impl_->Clone());
+    return *this;
+  }
+
+  const T& operator*() const { return *impl_->Current(); }
+  const T* operator->() const { return impl_->Current(); }
+  // Prefix version of operator++.
+  ParamIterator& operator++() {
+    impl_->Advance();
+    return *this;
+  }
+  // Postfix version of operator++.
+  ParamIterator operator++(int /*unused*/) {
+    ParamIterator previous(*this);  // Freed automatically if Advance() throws.
+    impl_->Advance();
+    return previous;
+  }
+  bool operator==(const ParamIterator& other) const {
+    return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_);
+  }
+  bool operator!=(const ParamIterator& other) const {
+    return !(*this == other);
+  }
+
+ private:
+  friend class ParamGenerator<T>;
+  explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
+  scoped_ptr<ParamIteratorInterface<T> > impl_;
+};
+
+// ParamGeneratorInterface<T> is the binary interface to access generators
+// defined in other translation units.
+template <typename T>
+class ParamGeneratorInterface {
+ public:
+  typedef T ParamType;
+
+  virtual ~ParamGeneratorInterface() {}
+
+  // Generator interface: iterators to the first and past-the-last elements.
+  virtual ParamIteratorInterface<T>* Begin() const = 0;
+  virtual ParamIteratorInterface<T>* End() const = 0;
+};
+
+// Wraps ParamGeneratorInterface<T> and provides general generator syntax
+// compatible with the STL Container concept.
+// This class implements copy initialization semantics and the contained
+// ParamGeneratorInterface<T> instance is shared among all copies
+// of the original object. This is possible because that instance is immutable.
+template<typename T>
+class ParamGenerator {
+ public:
+  typedef ParamIterator<T> iterator;
+
+  explicit ParamGenerator(ParamGeneratorInterface<T>* impl) : impl_(impl) {}
+  ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {}
+
+  ParamGenerator& operator=(const ParamGenerator& other) {
+    impl_ = other.impl_;  // Shares, rather than clones, the implementation.
+    return *this;
+  }
+
+  iterator begin() const { return iterator(impl_->Begin()); }
+  iterator end() const { return iterator(impl_->End()); }
+
+ private:
+  linked_ptr<const ParamGeneratorInterface<T> > impl_;  // Shared by all copies.
+};
+
+// Generates values from a range of two comparable values. Can be used to
+// generate sequences of user-defined types that implement operator+() and
+// operator<().
+// This class is used in the Range() function.
+template <typename T, typename IncrementT>
+class RangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  RangeGenerator(T begin, T end, IncrementT step)
+      : begin_(begin), end_(end),
+        step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
+  virtual ~RangeGenerator() {}
+
+  virtual ParamIteratorInterface<T>* Begin() const {
+    return new Iterator(this, begin_, 0, step_);
+  }
+  virtual ParamIteratorInterface<T>* End() const {
+    return new Iterator(this, end_, end_index_, step_);
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base, T value, int index,
+             IncrementT step)
+        : base_(base), value_(value), index_(index), step_(step) {}
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+      return base_;
+    }
+    virtual void Advance() {
+      value_ = value_ + step_;
+      index_++;
+    }
+    virtual ParamIteratorInterface<T>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const T* Current() const { return &value_; }
+    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const int other_index =
+          CheckedDowncastToActualType<const Iterator>(&other)->index_;
+      return index_ == other_index;  // Positions are compared, not values.
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : ParamIteratorInterface<T>(),
+          base_(other.base_), value_(other.value_), index_(other.index_),
+          step_(other.step_) {}
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<T>* const base_;
+    T value_;
+    int index_;
+    const IncrementT step_;
+  };  // class RangeGenerator::Iterator
+
+  static int CalculateEndIndex(const T& begin,
+                               const T& end,
+                               const IncrementT& step) {
+    int end_index = 0;  // Number of values in [begin, end) at this step.
+    for (T i = begin; i < end; i = i + step)  // Ends only once !(i < end).
+      end_index++;
+    return end_index;
+  }
+
+  // No implementation - assignment is unsupported.
+  void operator=(const RangeGenerator& other);
+
+  const T begin_;
+  const T end_;
+  const IncrementT step_;
+  // The index for the end() iterator. All the elements in the generated
+  // sequence are indexed (0-based) to aid iterator comparison.
+  const int end_index_;
+};  // class RangeGenerator
+
+
+// Generates values from a pair of STL-style iterators. Used in the
+// ValuesIn() function. The elements are copied from the source range
+// since the source can be located on the stack, and the generator
+// is likely to persist beyond that stack frame.
+template <typename T>
+class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  template <typename ForwardIterator>
+  ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end)
+      : container_(begin, end) {}
+  virtual ~ValuesInIteratorRangeGenerator() {}
+
+  virtual ParamIteratorInterface<T>* Begin() const {
+    return new Iterator(this, container_.begin());
+  }
+  virtual ParamIteratorInterface<T>* End() const {
+    return new Iterator(this, container_.end());
+  }
+
+ private:
+  typedef typename ::std::vector<T> ContainerType;
+
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base,
+             typename ContainerType::const_iterator iterator)
+        : base_(base), iterator_(iterator) {}
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+      return base_;
+    }
+    virtual void Advance() {
+      ++iterator_;
+      value_.reset();
+    }
+    virtual ParamIteratorInterface<T>* Clone() const {
+      return new Iterator(*this);
+    }
+    // We return a cached copy of the value referenced by iterator_ because
+    // *iterator_ can return a temporary object (and of a type other than T),
+    // so just having "return &*iterator_;" doesn't work.
+    // value_ is updated here and not in Advance() because Advance()
+    // can advance iterator_ beyond the end of the range, and we cannot
+    // detect that fact. The client code, on the other hand, is
+    // responsible for not calling Current() on an out-of-range iterator.
+    virtual const T* Current() const {
+      if (value_.get() == NULL)
+        value_.reset(new T(*iterator_));
+      return value_.get();
+    }
+    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      return iterator_ ==
+          CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+    }
+
+   private:
+    Iterator(const Iterator& other)
+          // The explicit constructor call suppresses a false warning
+          // emitted by gcc when supplied with the -Wextra option.
+        : ParamIteratorInterface<T>(),
+          base_(other.base_),
+          iterator_(other.iterator_) {}
+
+    const ParamGeneratorInterface<T>* const base_;
+    typename ContainerType::const_iterator iterator_;
+    // A cached value of *iterator_. We keep it here to allow access by
+    // pointer in the wrapping iterator's operator->().
+    // value_ needs to be mutable to be accessed in Current().
+    // Use of scoped_ptr helps manage cached value's lifetime,
+    // which is bound by the lifespan of the iterator itself.
+    mutable scoped_ptr<const T> value_;
+  };  // class ValuesInIteratorRangeGenerator::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const ValuesInIteratorRangeGenerator& other);
+
+  const ContainerType container_;
+};  // class ValuesInIteratorRangeGenerator
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Stores a copy of a parameter value. CreateTest() hands its address to
+// TestClass::SetParam() and then returns a newly allocated TestClass.
+template <class TestClass>
+class ParameterizedTestFactory : public TestFactoryBase {
+ public:
+  typedef typename TestClass::ParamType ParamType;
+  explicit ParameterizedTestFactory(ParamType parameter) :
+      parameter_(parameter) {}
+  virtual Test* CreateTest() {
+    TestClass::SetParam(&parameter_);
+    return new TestClass();
+  }
+
+ private:
+  const ParamType parameter_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactoryBase is a base class for meta-factories that create
+// test factories to be passed into the MakeAndRegisterTestInfo function.
+template <class ParamType>
+class TestMetaFactoryBase {
+ public:
+  virtual ~TestMetaFactoryBase() {}
+
+  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactory creates test factories to be passed into the
+// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo takes
+// ownership of the test factory pointer, the same factory object cannot be
+// passed into that function twice. But ParameterizedTestCaseInfo is going to
+// call it for each Test/Parameter value combination, so it needs a
+// meta-factory that creates a fresh factory on every call.
+template <class TestCase>
+class TestMetaFactory
+    : public TestMetaFactoryBase<typename TestCase::ParamType> {
+ public:
+  typedef typename TestCase::ParamType ParamType;
+
+  TestMetaFactory() {}
+
+  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) {
+    return new ParameterizedTestFactory<TestCase>(parameter);
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseInfoBase is a generic interface
+// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase
+// accumulates test information provided by TEST_P macro invocations
+// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations
+// and uses that information to register all resulting test instances
+// in the RegisterTests method. The ParameterizedTestCaseRegistry class holds
+// a collection of pointers to the ParameterizedTestCaseInfo objects
+// and calls RegisterTests() on each of them when asked.
+class ParameterizedTestCaseInfoBase {
+ public:
+  virtual ~ParameterizedTestCaseInfoBase() {}
+
+  // Base part of test case name for display purposes.
+  virtual const string& GetTestCaseName() const = 0;
+  // Test case id to verify identity.
+  virtual TypeId GetTestCaseTypeId() const = 0;
+  // UnitTest class invokes this method to register tests in this
+  // test case right before running them in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestCaseInfoBase derived class.
+  virtual void RegisterTests() = 0;
+
+ protected:
+  ParameterizedTestCaseInfoBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P
+// macro invocations for a particular test case and generators
+// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that
+// test case. It registers tests with all values generated by all
+// generators when asked.
+template <class TestCase>
+class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase {
+ public:
+  // ParamType and GeneratorCreationFunc are private types but are required
+  // for declarations of public methods AddTestPattern() and
+  // AddTestCaseInstantiation().
+  typedef typename TestCase::ParamType ParamType;
+  // A function that returns an instance of appropriate generator type.
+  typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
+
+  explicit ParameterizedTestCaseInfo(const char* name)
+      : test_case_name_(name) {}
+
+  // Test case base name for display purposes.
+  virtual const string& GetTestCaseName() const { return test_case_name_; }
+  // Test case id to verify identity.
+  virtual TypeId GetTestCaseTypeId() const { return GetTypeId<TestCase>(); }
+  // TEST_P macro uses AddTestPattern() to record information
+  // about a single test in a TestInfo structure.
+  // test_case_name is the base name of the test case (without invocation
+  // prefix). test_base_name is the name of an individual test without
+  // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is
+  // test case base name and DoBar is test base name.
+  void AddTestPattern(const char* test_case_name,
+                      const char* test_base_name,
+                      TestMetaFactoryBase<ParamType>* meta_factory) {
+    tests_.push_back(linked_ptr<TestInfo>(new TestInfo(test_case_name,
+                                                       test_base_name,
+                                                       meta_factory)));
+  }
+  // INSTANTIATE_TEST_CASE_P macro uses AddTestCaseInstantiation() to record
+  // information about a generator.
+  int AddTestCaseInstantiation(const string& instantiation_name,
+                               GeneratorCreationFunc* func,
+                               const char* /* file */,
+                               int /* line */) {
+    instantiations_.push_back(::std::make_pair(instantiation_name, func));
+    return 0;  // Return value used only to run this method in namespace scope.
+  }
+  // UnitTest class invokes this method to register tests in this test case
+  // right before running tests in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestCaseInfoBase derived class.
+  // UnitTest has a guard to prevent calling this method more than once.
+  virtual void RegisterTests() {
+    for (typename TestInfoContainer::iterator test_it = tests_.begin();
+         test_it != tests_.end(); ++test_it) {
+      linked_ptr<TestInfo> test_info = *test_it;
+      for (typename InstantiationContainer::iterator gen_it =
+               instantiations_.begin(); gen_it != instantiations_.end();
+               ++gen_it) {
+        const string& instantiation_name = gen_it->first;
+        ParamGenerator<ParamType> generator((*gen_it->second)());
+
+        string test_case_name;
+        if ( !instantiation_name.empty() )
+          test_case_name = instantiation_name + "/";
+        test_case_name += test_info->test_case_base_name;
+
+        int i = 0;
+        for (typename ParamGenerator<ParamType>::iterator param_it =
+                 generator.begin();
+             param_it != generator.end(); ++param_it, ++i) {
+          Message test_name_stream;
+          test_name_stream << test_info->test_base_name << "/" << i;
+          MakeAndRegisterTestInfo(
+              test_case_name.c_str(),
+              test_name_stream.GetString().c_str(),
+              NULL,  // No type parameter.
+              PrintToString(*param_it).c_str(),
+              GetTestCaseTypeId(),
+              TestCase::SetUpTestCase,
+              TestCase::TearDownTestCase,
+              test_info->test_meta_factory->CreateTestFactory(*param_it));
+        }  // for param_it
+      }  // for gen_it
+    }  // for test_it
+  }  // RegisterTests
+
+ private:
+  // The TestInfo structure keeps information about a single test registered
+  // with TEST_P macro.
+  struct TestInfo {
+    TestInfo(const char* a_test_case_base_name,
+             const char* a_test_base_name,
+             TestMetaFactoryBase<ParamType>* a_test_meta_factory) :
+        test_case_base_name(a_test_case_base_name),
+        test_base_name(a_test_base_name),
+        test_meta_factory(a_test_meta_factory) {}
+
+    const string test_case_base_name;
+    const string test_base_name;
+    const scoped_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
+  };
+  typedef ::std::vector<linked_ptr<TestInfo> > TestInfoContainer;
+  // Keeps pairs of <Instantiation name, Sequence generator creation function>
+  // received from INSTANTIATE_TEST_CASE_P macros.
+  typedef ::std::vector<std::pair<string, GeneratorCreationFunc*> >
+      InstantiationContainer;
+
+  const string test_case_name_;
+  TestInfoContainer tests_;
+  InstantiationContainer instantiations_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo);
+};  // class ParameterizedTestCaseInfo
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseRegistry holds a collection of
+// ParameterizedTestCaseInfoBase objects looked up by test case name. TEST_P
+// and INSTANTIATE_TEST_CASE_P macros use it to locate their corresponding
+// ParameterizedTestCaseInfo descriptors.
+class ParameterizedTestCaseRegistry {
+ public:
+  ParameterizedTestCaseRegistry() {}
+  ~ParameterizedTestCaseRegistry() {
+    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+         it != test_case_infos_.end(); ++it) {
+      delete *it;
+    }
+  }
+
+  // Looks up or creates and returns a structure containing information about
+  // tests and instantiations of a particular test case.
+  template <class TestCase>
+  ParameterizedTestCaseInfo<TestCase>* GetTestCasePatternHolder(
+      const char* test_case_name,
+      const char* file,
+      int line) {
+    ParameterizedTestCaseInfo<TestCase>* typed_test_info = NULL;
+    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+         it != test_case_infos_.end(); ++it) {
+      if ((*it)->GetTestCaseName() == test_case_name) {
+        if ((*it)->GetTestCaseTypeId() != GetTypeId<TestCase>()) {
+          // Complain about incorrect usage of Google Test facilities
+          // and terminate the program since we cannot guarantee correct
+          // test case setup and tear-down in this case.
+          ReportInvalidTestCaseType(test_case_name,  file, line);
+          posix::Abort();
+        } else {
+          // At this point we are sure that the object we found is of the same
+          // type we are looking for, so we downcast it to that type
+          // without further checks.
+          typed_test_info = CheckedDowncastToActualType<
+              ParameterizedTestCaseInfo<TestCase> >(*it);
+        }
+        break;
+      }
+    }
+    if (typed_test_info == NULL) {
+      typed_test_info = new ParameterizedTestCaseInfo<TestCase>(test_case_name);
+      test_case_infos_.push_back(typed_test_info);
+    }
+    return typed_test_info;
+  }
+  void RegisterTests() {
+    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+         it != test_case_infos_.end(); ++it) {
+      (*it)->RegisterTests();
+    }
+  }
+
+ private:
+  typedef ::std::vector<ParameterizedTestCaseInfoBase*> TestCaseInfoContainer;
+
+  TestCaseInfoContainer test_case_infos_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry);
+};
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  //  GTEST_HAS_PARAM_TEST
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+// This file was GENERATED by command:
+//     pump.py gtest-param-util-generated.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vladl@google.com (Vlad Losev)
+
+// Type and function utilities for implementing parameterized tests.
+// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+// Currently Google Test supports at most 50 arguments in Values,
+// and at most 10 arguments in Combine. Please contact
+// googletestframework@googlegroups.com if you need more.
+// Please note that the number of arguments to Combine is limited
+// by the maximum arity of the implementation of tr1::tuple which is
+// currently set at 10.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*.  Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+
+// Forward declarations of ValuesIn(), which is implemented in
+// include/gtest/gtest-param-test.h.
+template <typename ForwardIterator>
+internal::ParamGenerator<
+  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end);
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]);
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container& container);
+
+namespace internal {
+
+// Used in the Values() function to provide polymorphic capabilities.
+template <typename T1>
+class ValueArray1 {
+ public:
+  explicit ValueArray1(T1 v1) : v1_(v1) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const { return ValuesIn(&v1_, &v1_ + 1); }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray1& other);
+
+  const T1 v1_;
+};
+
+template <typename T1, typename T2>
+class ValueArray2 {
+ public:
+  ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray2& other);
+
+  const T1 v1_;
+  const T2 v2_;
+};
+
+template <typename T1, typename T2, typename T3>
+class ValueArray3 {
+ public:
+  ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray3& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+class ValueArray4 {
+ public:
+  ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray4& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class ValueArray5 {
+ public:
+  ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray5& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+class ValueArray6 {
+ public:
+  ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray6& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+class ValueArray7 {
+ public:
+  ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray7& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+class ValueArray8 {
+ public:
+  ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+      T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray8& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+class ValueArray9 {
+ public:
+  ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray9& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+class ValueArray10 {
+ public:
+  ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray10& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+class ValueArray11 {
+ public:
+  ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray11& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+class ValueArray12 {
+ public:
+  ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray12& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+class ValueArray13 {
+ public:
+  ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray13& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+class ValueArray14 {
+ public:
+  ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray14& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+class ValueArray15 {
+ public:
+  ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray15& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+class ValueArray16 {
+ public:
+  ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray16& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+class ValueArray17 {
+ public:
+  ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Intentionally private and undefined - assignment is unsupported.
+  void operator=(const ValueArray17& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+class ValueArray18 {  // Stores 18 values of independent types T1..T18.
+ public:
+  ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray18& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+class ValueArray19 {  // Stores 19 values of independent types T1..T19.
+ public:
+  ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray19& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+class ValueArray20 {  // Stores 20 values of independent types T1..T20.
+ public:
+  ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray20& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+class ValueArray21 {  // Stores 21 values of independent types T1..T21.
+ public:
+  ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray21& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+class ValueArray22 {  // Stores 22 values of independent types T1..T22.
+ public:
+  ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray22& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+class ValueArray23 {  // Stores 23 values of independent types T1..T23.
+ public:
+  ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray23& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+class ValueArray24 {  // Stores 24 values of independent types T1..T24.
+ public:
+  ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray24& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+class ValueArray25 {  // Stores 25 values of independent types T1..T25.
+ public:
+  ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray25& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+class ValueArray26 {  // Stores 26 values of independent types T1..T26.
+ public:
+  ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray26& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+class ValueArray27 {  // Stores 27 values of independent types T1..T27.
+ public:
+  ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray27& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+class ValueArray28 {  // Stores 28 values of independent types T1..T28.
+ public:
+  ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray28& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+class ValueArray29 {  // Stores 29 values of independent types T1..T29.
+ public:
+  ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray29& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+class ValueArray30 {  // Stores 30 values of independent types T1..T30.
+ public:
+  ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray30& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+class ValueArray31 {  // Stores 31 values of independent types T1..T31.
+ public:
+  ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts every value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_)};
+    return ValuesIn(array);  // Passes the local array to ValuesIn().
+  }
+
+ private:
+  // Declared private with no implementation: assignment is unsupported.
+  void operator=(const ValueArray31& other);
+
+  const T1 v1_;  // All members are const; the constructor sets them.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+class ValueArray32 {  // Holds 32 values of independently chosen types T1..T32; convertible to ParamGenerator<T>.
+ public:  // The constructor copies each argument vK into the matching const member vK_.
+  ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {}
+
+  template <typename T>  // T: element type of the generator being requested.
+  operator ParamGenerator<T>() const {  // Implicit conversion; static_casts every stored value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_)};
+    return ValuesIn(array);  // Hands the local 32-element array to ValuesIn (not defined in this class).
+  }
+
+ private:
+  // Private and intentionally left without an implementation: assignment is unsupported (all members are const).
+  void operator=(const ValueArray32& other);
+
+  const T1 v1_;  // v1_..v32_: copies of the constructor arguments, in declaration order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+class ValueArray33 {  // Holds 33 values of independently chosen types T1..T33; convertible to ParamGenerator<T>.
+ public:  // The constructor copies each argument vK into the matching const member vK_.
+  ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33) {}
+
+  template <typename T>  // T: element type of the generator being requested.
+  operator ParamGenerator<T>() const {  // Implicit conversion; static_casts every stored value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_)};
+    return ValuesIn(array);  // Hands the local 33-element array to ValuesIn (not defined in this class).
+  }
+
+ private:
+  // Private and intentionally left without an implementation: assignment is unsupported (all members are const).
+  void operator=(const ValueArray33& other);
+
+  const T1 v1_;  // v1_..v33_: copies of the constructor arguments, in declaration order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+class ValueArray34 {  // Holds 34 values of independently chosen types T1..T34; convertible to ParamGenerator<T>.
+ public:  // The constructor copies each argument vK into the matching const member vK_.
+  ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34) {}
+
+  template <typename T>  // T: element type of the generator being requested.
+  operator ParamGenerator<T>() const {  // Implicit conversion; static_casts every stored value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_)};
+    return ValuesIn(array);  // Hands the local 34-element array to ValuesIn (not defined in this class).
+  }
+
+ private:
+  // Private and intentionally left without an implementation: assignment is unsupported (all members are const).
+  void operator=(const ValueArray34& other);
+
+  const T1 v1_;  // v1_..v34_: copies of the constructor arguments, in declaration order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+class ValueArray35 {  // Holds 35 values of independently chosen types T1..T35; convertible to ParamGenerator<T>.
+ public:  // The constructor copies each argument vK into the matching const member vK_.
+  ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
+      v32_(v32), v33_(v33), v34_(v34), v35_(v35) {}
+
+  template <typename T>  // T: element type of the generator being requested.
+  operator ParamGenerator<T>() const {  // Implicit conversion; static_casts every stored value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_)};
+    return ValuesIn(array);  // Hands the local 35-element array to ValuesIn (not defined in this class).
+  }
+
+ private:
+  // Private and intentionally left without an implementation: assignment is unsupported (all members are const).
+  void operator=(const ValueArray35& other);
+
+  const T1 v1_;  // v1_..v35_: copies of the constructor arguments, in declaration order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+class ValueArray36 {  // Holds 36 values of independently chosen types T1..T36; convertible to ParamGenerator<T>.
+ public:  // The constructor copies each argument vK into the matching const member vK_.
+  ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {}
+
+  template <typename T>  // T: element type of the generator being requested.
+  operator ParamGenerator<T>() const {  // Implicit conversion; static_casts every stored value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_)};
+    return ValuesIn(array);  // Hands the local 36-element array to ValuesIn (not defined in this class).
+  }
+
+ private:
+  // Private and intentionally left without an implementation: assignment is unsupported (all members are const).
+  void operator=(const ValueArray36& other);
+
+  const T1 v1_;  // v1_..v36_: copies of the constructor arguments, in declaration order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+class ValueArray37 {  // Holds 37 values of independently chosen types T1..T37; convertible to ParamGenerator<T>.
+ public:  // The constructor copies each argument vK into the matching const member vK_.
+  ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
+      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
+      v36_(v36), v37_(v37) {}
+
+  template <typename T>  // T: element type of the generator being requested.
+  operator ParamGenerator<T>() const {  // Implicit conversion; static_casts every stored value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_)};
+    return ValuesIn(array);  // Hands the local 37-element array to ValuesIn (not defined in this class).
+  }
+
+ private:
+  // Private and intentionally left without an implementation: assignment is unsupported (all members are const).
+  void operator=(const ValueArray37& other);
+
+  const T1 v1_;  // v1_..v37_: copies of the constructor arguments, in declaration order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+class ValueArray38 {  // Holds 38 values of independently chosen types T1..T38; convertible to ParamGenerator<T>.
+ public:  // The constructor copies each argument vK into the matching const member vK_.
+  ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38) {}
+
+  template <typename T>  // T: element type of the generator being requested.
+  operator ParamGenerator<T>() const {  // Implicit conversion; static_casts every stored value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_)};
+    return ValuesIn(array);  // Hands the local 38-element array to ValuesIn (not defined in this class).
+  }
+
+ private:
+  // Private and intentionally left without an implementation: assignment is unsupported (all members are const).
+  void operator=(const ValueArray38& other);
+
+  const T1 v1_;  // v1_..v38_: copies of the constructor arguments, in declaration order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+class ValueArray39 {  // Holds 39 values of independently chosen types T1..T39; convertible to ParamGenerator<T>.
+ public:  // The constructor copies each argument vK into the matching const member vK_.
+  ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {}
+
+  template <typename T>  // T: element type of the generator being requested.
+  operator ParamGenerator<T>() const {  // Implicit conversion; static_casts every stored value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_)};
+    return ValuesIn(array);  // Hands the local 39-element array to ValuesIn (not defined in this class).
+  }
+
+ private:
+  // Private and intentionally left without an implementation: assignment is unsupported (all members are const).
+  void operator=(const ValueArray39& other);
+
+  const T1 v1_;  // v1_..v39_: copies of the constructor arguments, in declaration order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+class ValueArray40 {  // Holds 40 values of independently chosen types T1..T40; convertible to ParamGenerator<T>.
+ public:  // The constructor copies each argument vK into the matching const member vK_.
+  ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
+      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
+      v40_(v40) {}
+
+  template <typename T>  // T: element type of the generator being requested.
+  operator ParamGenerator<T>() const {  // Implicit conversion; static_casts every stored value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_)};
+    return ValuesIn(array);  // Hands the local 40-element array to ValuesIn (not defined in this class).
+  }
+
+ private:
+  // Private and intentionally left without an implementation: assignment is unsupported (all members are const).
+  void operator=(const ValueArray40& other);
+
+  const T1 v1_;  // v1_..v40_: copies of the constructor arguments, in declaration order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+class ValueArray41 {  // Holds 41 values of independently chosen types T1..T41; convertible to ParamGenerator<T>.
+ public:  // The constructor copies each argument vK into the matching const member vK_.
+  ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41) {}
+
+  template <typename T>  // T: element type of the generator being requested.
+  operator ParamGenerator<T>() const {  // Implicit conversion; static_casts every stored value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_)};
+    return ValuesIn(array);  // Hands the local 41-element array to ValuesIn (not defined in this class).
+  }
+
+ private:
+  // Private and intentionally left without an implementation: assignment is unsupported (all members are const).
+  void operator=(const ValueArray41& other);
+
+  const T1 v1_;  // v1_..v41_: copies of the constructor arguments, in declaration order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+class ValueArray42 {  // Holds 42 values, each of its own type, convertible to a ParamGenerator<T>.
+ public:
+  ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42) {}  // Copies each argument vN into its member vN_.
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // Conversion to a generator whose element type is T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),  // Cast every stored value to T, in order v1_..v42_.
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_)};
+    return ValuesIn(array);  // ValuesIn is defined outside this block; presumably builds the generator from the array - confirm.
+  }
+
+ private:
+  // Private and with no implementation - assignment is unsupported.
+  void operator=(const ValueArray42& other);
+
+  const T1 v1_;  // Stored values: const, so they are set only by the constructor's initializer list.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+class ValueArray43 {  // Holds 43 values, each of its own type, convertible to a ParamGenerator<T>.
+ public:
+  ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
+      v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37),
+      v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {}  // Copies each argument vN into its member vN_.
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // Conversion to a generator whose element type is T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),  // Cast every stored value to T, in order v1_..v43_.
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_)};
+    return ValuesIn(array);  // ValuesIn is defined outside this block; presumably builds the generator from the array - confirm.
+  }
+
+ private:
+  // Private and with no implementation - assignment is unsupported.
+  void operator=(const ValueArray43& other);
+
+  const T1 v1_;  // Stored values: const, so they are set only by the constructor's initializer list.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+class ValueArray44 {  // Holds 44 values, each of its own type, convertible to a ParamGenerator<T>.
+ public:
+  ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+      v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+      v43_(v43), v44_(v44) {}  // Copies each argument vN into its member vN_.
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // Conversion to a generator whose element type is T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),  // Cast every stored value to T, in order v1_..v44_.
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_)};
+    return ValuesIn(array);  // ValuesIn is defined outside this block; presumably builds the generator from the array - confirm.
+  }
+
+ private:
+  // Private and with no implementation - assignment is unsupported.
+  void operator=(const ValueArray44& other);
+
+  const T1 v1_;  // Stored values: const, so they are set only by the constructor's initializer list.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+class ValueArray45 {  // Holds 45 values, each of its own type, convertible to a ParamGenerator<T>.
+ public:
+  ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
+      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
+      v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41),
+      v42_(v42), v43_(v43), v44_(v44), v45_(v45) {}  // Copies each argument vN into its member vN_.
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // Conversion to a generator whose element type is T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),  // Cast every stored value to T, in order v1_..v45_.
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_)};
+    return ValuesIn(array);  // ValuesIn is defined outside this block; presumably builds the generator from the array - confirm.
+  }
+
+ private:
+  // Private and with no implementation - assignment is unsupported.
+  void operator=(const ValueArray45& other);
+
+  const T1 v1_;  // Stored values: const, so they are set only by the constructor's initializer list.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+class ValueArray46 {  // Holds 46 values, each of its own type, convertible to a ParamGenerator<T>.
+ public:
+  ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
+      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {}  // Copies each argument vN into its member vN_.
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // Conversion to a generator whose element type is T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),  // Cast every stored value to T, in order v1_..v46_.
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_)};
+    return ValuesIn(array);  // ValuesIn is defined outside this block; presumably builds the generator from the array - confirm.
+  }
+
+ private:
+  // Private and with no implementation - assignment is unsupported.
+  void operator=(const ValueArray46& other);
+
+  const T1 v1_;  // Stored values: const, so they are set only by the constructor's initializer list.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+class ValueArray47 {  // Holds 47 values, each of its own type, convertible to a ParamGenerator<T>.
+ public:
+  ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
+      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46),
+      v47_(v47) {}  // Copies each argument vN into its member vN_.
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // Conversion to a generator whose element type is T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),  // Cast every stored value to T, in order v1_..v47_.
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_)};
+    return ValuesIn(array);  // ValuesIn is defined outside this block; presumably builds the generator from the array - confirm.
+  }
+
+ private:
+  // Private and with no implementation - assignment is unsupported.
+  void operator=(const ValueArray47& other);
+
+  const T1 v1_;  // Stored values: const, so they are set only by the constructor's initializer list.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+class ValueArray48 {  // Holds 48 values, each of its own type, convertible to a ParamGenerator<T>.
+ public:
+  ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
+      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
+      v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45),
+      v46_(v46), v47_(v47), v48_(v48) {}  // Copies each argument vN into its member vN_.
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // Conversion to a generator whose element type is T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),  // Cast every stored value to T, in order v1_..v48_.
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_)};
+    return ValuesIn(array);  // ValuesIn is defined outside this block; presumably builds the generator from the array - confirm.
+  }
+
+ private:
+  // Private and with no implementation - assignment is unsupported.
+  void operator=(const ValueArray48& other);
+
+  const T1 v1_;  // Stored values: const, so they are set only by the constructor's initializer list.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+class ValueArray49 {  // Holds 49 values, each of its own type, convertible to a ParamGenerator<T>.
+ public:
+  ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48,
+      T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
+      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {}  // Copies each argument vN into its member vN_.
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // Conversion to a generator whose element type is T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),  // Cast every stored value to T, in order v1_..v49_.
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_), static_cast<T>(v49_)};
+    return ValuesIn(array);  // ValuesIn is defined outside this block; presumably builds the generator from the array - confirm.
+  }
+
+ private:
+  // Private and with no implementation - assignment is unsupported.
+  void operator=(const ValueArray49& other);
+
+  const T1 v1_;  // Stored values: const, so they are set only by the constructor's initializer list.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+  const T49 v49_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+class ValueArray50 {  // Stores 50 values; convertible to ParamGenerator<T>.
+ public:
+  ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49,
+      T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
+      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // Casts every stored value to T.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_), static_cast<T>(v49_), static_cast<T>(v50_)};
+    return ValuesIn(array);  // Hand the converted array to ValuesIn().
+  }
+
+ private:
+  // Declared only, never defined - assignment is unsupported.
+  void operator=(const ValueArray50& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+  const T49 v49_;
+  const T50 v50_;
+};
+
+# if GTEST_HAS_COMBINE
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Generates values from the Cartesian product of values produced
+// by the argument generators.
+//
+template <typename T1, typename T2>
+class CartesianProductGenerator2  // Cartesian product of 2 generators.
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2> ParamType;
+
+  CartesianProductGenerator2(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2)
+      : g1_(g1), g2_(g2) {}
+  virtual ~CartesianProductGenerator2() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin());
+  }  // Begin(): new iterator with every component at begin().
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end());
+  }  // End(): new iterator with every component at end().
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2)    {
+      ComputeCurrentValue();  // Cache the starting tuple (no-op when at end).
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance must not be called on a past-the-end iterator, so none of
+    // the component iterators may be at the end of its range on entry.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current2_;  // Step the last component first.
+      if (current2_ == end2_) {  // Wrapped: reset it, carry into component 1.
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);  // Uses the private copy constructor.
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_) {
+      ComputeCurrentValue();  // Recomputed here, not copied from other.
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())  // Past the end the cached value is left untouched.
+        current_value_ = ParamType(*current1_, *current2_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when either
+      // of the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_;
+    }
+
+    // Declared only, never defined - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    ParamType current_value_;  // Tuple cached by ComputeCurrentValue().
+  };  // class CartesianProductGenerator2::Iterator
+
+  // Declared only, never defined - assignment is unsupported.
+  void operator=(const CartesianProductGenerator2& other);
+
+  const ParamGenerator<T1> g1_;  // Copies of the constructor arguments.
+  const ParamGenerator<T2> g2_;
+};  // class CartesianProductGenerator2
+
+
+template <typename T1, typename T2, typename T3>
+class CartesianProductGenerator3  // Cartesian product of 3 generators.
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3> ParamType;
+
+  CartesianProductGenerator3(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  virtual ~CartesianProductGenerator3() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin());
+  }  // Begin(): new iterator with every component at begin().
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end());
+  }  // End(): new iterator with every component at end().
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3)    {
+      ComputeCurrentValue();  // Cache the starting tuple (no-op when at end).
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance must not be called on a past-the-end iterator, so none of
+    // the component iterators may be at the end of its range on entry.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current3_;  // Step the last component first.
+      if (current3_ == end3_) {  // Wrapped: reset it, carry into component 2.
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {  // Carry into component 1.
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);  // Uses the private copy constructor.
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_) {
+      ComputeCurrentValue();  // Recomputed here, not copied from other.
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())  // Past the end the cached value is left untouched.
+        current_value_ = ParamType(*current1_, *current2_, *current3_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any
+      // of the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_;
+    }
+
+    // Declared only, never defined - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    ParamType current_value_;  // Tuple cached by ComputeCurrentValue().
+  };  // class CartesianProductGenerator3::Iterator
+
+  // Declared only, never defined - assignment is unsupported.
+  void operator=(const CartesianProductGenerator3& other);
+
+  const ParamGenerator<T1> g1_;  // Copies of the constructor arguments.
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+};  // class CartesianProductGenerator3
+
+
+template <typename T1, typename T2, typename T3, typename T4>
+class CartesianProductGenerator4  // Cartesian product of 4 generators.
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4> ParamType;
+
+  CartesianProductGenerator4(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
+  virtual ~CartesianProductGenerator4() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin());
+  }  // Begin(): new iterator with every component at begin().
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end());
+  }  // End(): new iterator with every component at end().
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4)    {
+      ComputeCurrentValue();  // Cache the starting tuple (no-op when at end).
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance must not be called on a past-the-end iterator, so none of
+    // the component iterators may be at the end of its range on entry.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current4_;  // Step the last component first.
+      if (current4_ == end4_) {  // Wrapped: reset it, carry into component 3.
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {  // Carry into component 2.
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {  // Carry into component 1.
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);  // Uses the private copy constructor.
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_) {
+      ComputeCurrentValue();  // Recomputed here, not copied from other.
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())  // Past the end the cached value is left untouched.
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any
+      // of the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_;
+    }
+
+    // Declared only, never defined - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    ParamType current_value_;  // Tuple cached by ComputeCurrentValue().
+  };  // class CartesianProductGenerator4::Iterator
+
+  // Declared only, never defined - assignment is unsupported.
+  void operator=(const CartesianProductGenerator4& other);
+
+  const ParamGenerator<T1> g1_;  // Copies of the constructor arguments.
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+};  // class CartesianProductGenerator4
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class CartesianProductGenerator5  // Cartesian product of 5 generators.
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5> ParamType;
+
+  CartesianProductGenerator5(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
+  virtual ~CartesianProductGenerator5() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin());
+  }  // Begin(): new iterator with every component at begin().
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end());
+  }  // End(): new iterator with every component at end().
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5)    {
+      ComputeCurrentValue();  // Cache the starting tuple (no-op when at end).
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance must not be called on a past-the-end iterator, so none of
+    // the component iterators may be at the end of its range on entry.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current5_;  // Step the last component first.
+      if (current5_ == end5_) {  // Wrapped: reset it, carry into component 4.
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {  // Carry into component 3.
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {  // Carry into component 2.
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {  // Carry into component 1.
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);  // Uses the private copy constructor.
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_) {
+      ComputeCurrentValue();  // Recomputed here, not copied from other.
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())  // Past the end the cached value is left untouched.
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any
+      // of the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_;
+    }
+
+    // Declared only, never defined - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    ParamType current_value_;  // Tuple cached by ComputeCurrentValue().
+  };  // class CartesianProductGenerator5::Iterator
+
+  // Declared only, never defined - assignment is unsupported.
+  void operator=(const CartesianProductGenerator5& other);
+
+  const ParamGenerator<T1> g1_;  // Copies of the constructor arguments.
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+};  // class CartesianProductGenerator5
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+class CartesianProductGenerator6  // Cartesian product of 6 generators.
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5,
+        T6> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> ParamType;
+
+  CartesianProductGenerator6(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
+  virtual ~CartesianProductGenerator6() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin());
+  }  // Begin(): new iterator with every component at begin().
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end());
+  }  // End(): new iterator with every component at end().
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6)    {
+      ComputeCurrentValue();  // Cache the starting tuple (no-op when at end).
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance must not be called on a past-the-end iterator, so none of
+    // the component iterators may be at the end of its range on entry.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current6_;  // Step the last component first.
+      if (current6_ == end6_) {  // Wrapped: reset it, carry into component 5.
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {  // Carry into component 4.
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {  // Carry into component 3.
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {  // Carry into component 2.
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {  // Carry into component 1.
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);  // Uses the private copy constructor.
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_) {
+      ComputeCurrentValue();  // Recomputed here, not copied from other.
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())  // Past the end the cached value is left untouched.
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any
+      // of the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_;
+    }
+
+    // Declared only, never defined - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    ParamType current_value_;  // Tuple cached by ComputeCurrentValue().
+  };  // class CartesianProductGenerator6::Iterator
+
+  // Declared only, never defined - assignment is unsupported.
+  void operator=(const CartesianProductGenerator6& other);
+
+  const ParamGenerator<T1> g1_;  // Copies of the constructor arguments.
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+};  // class CartesianProductGenerator6
+
+// Yields every combination (Cartesian product) of values from 7 generators.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+class CartesianProductGenerator7
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+        T7> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7> ParamType;
+  // Copies each component generator into the matching g[i]_ member.
+  CartesianProductGenerator7(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
+  virtual ~CartesianProductGenerator7() {}
+  // Begin()/End() allocate an Iterator with all components at begin()/end().
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+    // Returns the generator this iterator was created from.
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Steps to the next tuple. A component that reaches the end of its range
+    // wraps back to its begin and carries into the component before it.
+    virtual void Advance() {
+      assert(!AtEnd());  // must not be called on a past-the-end iterator
+      ++current7_;  // the last component varies fastest
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;  // never wrapped: reaching end1_ makes AtEnd() true
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_);
+    }
+
+   private:
+    Iterator(const Iterator& other)  // copy constructor, used by Clone()
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_) {
+      ComputeCurrentValue();
+    }
+    // Rebuilds current_value_ from the components unless past the end.
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_);
+    }
+    bool AtEnd() const {
+      // The iterator is past the end of the range as soon as any one of the
+      // component iterators has reached the end of its own range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    ParamType current_value_;  // tuple cached by ComputeCurrentValue()
+  };  // class CartesianProductGenerator7::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator7& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+};  // class CartesianProductGenerator7
+
+// Yields every combination (Cartesian product) of values from 8 generators.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+class CartesianProductGenerator8
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8> ParamType;
+  // Copies each component generator into the matching g[i]_ member.
+  CartesianProductGenerator8(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
+          g8_(g8) {}
+  virtual ~CartesianProductGenerator8() {}
+  // Begin()/End() allocate an Iterator with all components at begin()/end().
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+    // Returns the generator this iterator was created from.
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Steps to the next tuple. A component that reaches the end of its range
+    // wraps back to its begin and carries into the component before it.
+    virtual void Advance() {
+      assert(!AtEnd());  // must not be called on a past-the-end iterator
+      ++current8_;  // the last component varies fastest
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;  // never wrapped: reaching end1_ makes AtEnd() true
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_);
+    }
+
+   private:
+    Iterator(const Iterator& other)  // copy constructor, used by Clone()
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_) {
+      ComputeCurrentValue();
+    }
+    // Rebuilds current_value_ from the components unless past the end.
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_);
+    }
+    bool AtEnd() const {
+      // The iterator is past the end of the range as soon as any one of the
+      // component iterators has reached the end of its own range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    ParamType current_value_;  // tuple cached by ComputeCurrentValue()
+  };  // class CartesianProductGenerator8::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator8& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+};  // class CartesianProductGenerator8
+
+// Yields every combination (Cartesian product) of values from 9 generators.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+class CartesianProductGenerator9
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8, T9> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9> ParamType;
+  // Copies each component generator into the matching g[i]_ member.
+  CartesianProductGenerator9(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9) {}
+  virtual ~CartesianProductGenerator9() {}
+  // Begin()/End() allocate an Iterator with all components at begin()/end().
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end(), g9_, g9_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8,
+      const ParamGenerator<T9>& g9,
+      const typename ParamGenerator<T9>::iterator& current9)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
+          begin9_(g9.begin()), end9_(g9.end()), current9_(current9)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+    // Returns the generator this iterator was created from.
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Steps to the next tuple. A component that reaches the end of its range
+    // wraps back to its begin and carries into the component before it.
+    virtual void Advance() {
+      assert(!AtEnd());  // must not be called on a past-the-end iterator
+      ++current9_;  // the last component varies fastest
+      if (current9_ == end9_) {
+        current9_ = begin9_;
+        ++current8_;
+      }
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;  // never wrapped: reaching end1_ makes AtEnd() true
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_ &&
+          current9_ == typed_other->current9_);
+    }
+
+   private:
+    Iterator(const Iterator& other)  // copy constructor, used by Clone()
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_),
+        begin9_(other.begin9_),
+        end9_(other.end9_),
+        current9_(other.current9_) {
+      ComputeCurrentValue();
+    }
+    // Rebuilds current_value_ from the components unless past the end.
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_,
+            *current9_);
+    }
+    bool AtEnd() const {
+      // The iterator is past the end of the range as soon as any one of the
+      // component iterators has reached the end of its own range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_ ||
+          current9_ == end9_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    const typename ParamGenerator<T9>::iterator begin9_;
+    const typename ParamGenerator<T9>::iterator end9_;
+    typename ParamGenerator<T9>::iterator current9_;
+    ParamType current_value_;  // tuple cached by ComputeCurrentValue()
+  };  // class CartesianProductGenerator9::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator9& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+  const ParamGenerator<T9> g9_;
+};  // class CartesianProductGenerator9
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+class CartesianProductGenerator10
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8, T9, T10> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> ParamType;
+
+  CartesianProductGenerator10(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9,
+      const ParamGenerator<T10>& g10)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9), g10_(g10) {}
+  virtual ~CartesianProductGenerator10() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end(), g9_, g9_.end(), g10_, g10_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8,
+      const ParamGenerator<T9>& g9,
+      const typename ParamGenerator<T9>::iterator& current9,
+      const ParamGenerator<T10>& g10,
+      const typename ParamGenerator<T10>::iterator& current10)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
+          begin9_(g9.begin()), end9_(g9.end()), current9_(current9),
+          begin10_(g10.begin()), end10_(g10.end()), current10_(current10)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Moves to the next tuple. current10_ changes fastest; each component
+    // that reaches its end is reset and the step carries into the one before.
+    virtual void Advance() {
+      assert(!AtEnd());  // Advancing a past-the-end iterator is a caller bug.
+      ++current10_;
+      if (current10_ == end10_) {
+        current10_ = begin10_;
+        ++current9_;
+      }
+      if (current9_ == end9_) {
+        current9_ = begin9_;
+        ++current8_;
+      }
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;  // Never reset: current1_ == end1_ makes AtEnd() true.
+      }
+      ComputeCurrentValue();  // Refresh the cached tuple (no-op once AtEnd()).
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);  // Heap copy made by the private copy ctor.
+    }
+    virtual const ParamType* Current() const { return &this->current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Iterators that share a base generator have the same concrete type,
+      // which is what makes the downcast below legitimate.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* const rhs =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // Every past-the-end iterator equals every other one, no matter which
+      // component ran out first, so that case is settled through AtEnd()
+      // before any positions are compared.
+      if (AtEnd() && rhs->AtEnd()) return true;
+      // Otherwise all ten component positions have to match.
+      return current1_ == rhs->current1_ &&
+          current2_ == rhs->current2_ &&
+          current3_ == rhs->current3_ &&
+          current4_ == rhs->current4_ &&
+          current5_ == rhs->current5_ &&
+          current6_ == rhs->current6_ &&
+          current7_ == rhs->current7_ &&
+          current8_ == rhs->current8_ &&
+          current9_ == rhs->current9_ &&
+          current10_ == rhs->current10_;
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),  // Same generator pointer; nothing is cloned.
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_),
+        begin9_(other.begin9_),
+        end9_(other.end9_),
+        current9_(other.current9_),
+        begin10_(other.begin10_),
+        end10_(other.end10_),
+        current10_(other.current10_) {
+      ComputeCurrentValue();  // current_value_ is rebuilt rather than copied.
+    }
+
+    void ComputeCurrentValue() {
+      if (AtEnd()) return;  // Past the end: keep the previously cached tuple.
+      current_value_ = ParamType(
+          *current1_, *current2_, *current3_, *current4_, *current5_,
+          *current6_, *current7_, *current8_, *current9_, *current10_);
+    }
+    bool AtEnd() const {
+      // The product is exhausted as soon as any one component iterator sits
+      // at the end of its range; components are tested first to last.
+      if (current1_ == end1_) return true;
+      if (current2_ == end2_) return true;
+      if (current3_ == end3_) return true;
+      if (current4_ == end4_) return true;
+      if (current5_ == end5_) return true;
+      if (current6_ == end6_) return true;
+      if (current7_ == end7_) return true;
+      if (current8_ == end8_) return true;
+      if (current9_ == end9_) return true;
+      if (current10_ == end10_) return true;
+      return false;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    const typename ParamGenerator<T9>::iterator begin9_;
+    const typename ParamGenerator<T9>::iterator end9_;
+    typename ParamGenerator<T9>::iterator current9_;
+    const typename ParamGenerator<T10>::iterator begin10_;
+    const typename ParamGenerator<T10>::iterator end10_;
+    typename ParamGenerator<T10>::iterator current10_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator10::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator10& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+  const ParamGenerator<T9> g9_;
+  const ParamGenerator<T10> g10_;
+};  // class CartesianProductGenerator10
+
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Helper classes providing Combine() with polymorphic features. They allow
+// casting CartesianProductGeneratorN<T> to ParamGenerator<U> if T is
+// convertible to U.
+//
+template <class Generator1, class Generator2>
+class CartesianProductHolder2 {
+ public:
+  CartesianProductHolder2(const Generator1& g1, const Generator2& g2)
+      : g1_(g1), g2_(g2) {}
+  template <typename T1, typename T2>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2> >(
+        new CartesianProductGenerator2<T1, T2>(
+            static_cast<ParamGenerator<T1> >(g1_),
+            static_cast<ParamGenerator<T2> >(g2_)));
+  }
+
+ private:
+  // Assignment is deliberately declared without a definition.
+  void operator=(const CartesianProductHolder2&);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+};  // class CartesianProductHolder2
+
+template <class Generator1, class Generator2, class Generator3>
+class CartesianProductHolder3 {
+ public:
+  CartesianProductHolder3(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  template <typename T1, typename T2, typename T3>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3> >(
+        new CartesianProductGenerator3<T1, T2, T3>(
+            static_cast<ParamGenerator<T1> >(g1_),
+            static_cast<ParamGenerator<T2> >(g2_),
+            static_cast<ParamGenerator<T3> >(g3_)));
+  }
+
+ private:
+  // Assignment is deliberately declared without a definition.
+  void operator=(const CartesianProductHolder3&);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+};  // class CartesianProductHolder3
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4>
+class CartesianProductHolder4 {
+ public:
+  CartesianProductHolder4(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
+  template <typename T1, typename T2, typename T3, typename T4>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4> >(
+        new CartesianProductGenerator4<T1, T2, T3, T4>(
+            static_cast<ParamGenerator<T1> >(g1_),
+            static_cast<ParamGenerator<T2> >(g2_),
+            static_cast<ParamGenerator<T3> >(g3_),
+            static_cast<ParamGenerator<T4> >(g4_)));
+  }
+
+ private:
+  // Assignment is deliberately declared without a definition.
+  void operator=(const CartesianProductHolder4&);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+};  // class CartesianProductHolder4
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5>
+class CartesianProductHolder5 {
+ public:
+  CartesianProductHolder5(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4, const Generator5& g5)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5> >(
+        new CartesianProductGenerator5<T1, T2, T3, T4, T5>(
+            static_cast<ParamGenerator<T1> >(g1_),
+            static_cast<ParamGenerator<T2> >(g2_),
+            static_cast<ParamGenerator<T3> >(g3_),
+            static_cast<ParamGenerator<T4> >(g4_),
+            static_cast<ParamGenerator<T5> >(g5_)));
+  }
+
+ private:
+  // Assignment is deliberately declared without a definition.
+  void operator=(const CartesianProductHolder5&);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+};  // class CartesianProductHolder5
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6>
+class CartesianProductHolder6 {
+ public:
+  CartesianProductHolder6(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4, const Generator5& g5,
+      const Generator6& g6)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> >(
+        new CartesianProductGenerator6<T1, T2, T3, T4, T5, T6>(
+            static_cast<ParamGenerator<T1> >(g1_),
+            static_cast<ParamGenerator<T2> >(g2_),
+            static_cast<ParamGenerator<T3> >(g3_),
+            static_cast<ParamGenerator<T4> >(g4_),
+            static_cast<ParamGenerator<T5> >(g5_),
+            static_cast<ParamGenerator<T6> >(g6_)));
+  }
+
+ private:
+  // Assignment is deliberately declared without a definition.
+  void operator=(const CartesianProductHolder6&);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+};  // class CartesianProductHolder6
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7>
+class CartesianProductHolder7 {
+ public:
+  CartesianProductHolder7(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4, const Generator5& g5,
+      const Generator6& g6, const Generator7& g7)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+      T7> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7> >(
+        new CartesianProductGenerator7<T1, T2, T3, T4, T5, T6, T7>(
+            static_cast<ParamGenerator<T1> >(g1_),
+            static_cast<ParamGenerator<T2> >(g2_),
+            static_cast<ParamGenerator<T3> >(g3_),
+            static_cast<ParamGenerator<T4> >(g4_),
+            static_cast<ParamGenerator<T5> >(g5_),
+            static_cast<ParamGenerator<T6> >(g6_),
+            static_cast<ParamGenerator<T7> >(g7_)));
+  }
+
+ private:
+  // Assignment is deliberately declared without a definition.
+  void operator=(const CartesianProductHolder7&);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+};  // class CartesianProductHolder7
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8>
+class CartesianProductHolder8 {
+ public:
+  CartesianProductHolder8(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4, const Generator5& g5,
+      const Generator6& g6, const Generator7& g7, const Generator8& g8)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
+        g8_(g8) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7,
+      T8> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8> >(
+        new CartesianProductGenerator8<T1, T2, T3, T4, T5, T6, T7, T8>(
+            static_cast<ParamGenerator<T1> >(g1_),
+            static_cast<ParamGenerator<T2> >(g2_),
+            static_cast<ParamGenerator<T3> >(g3_),
+            static_cast<ParamGenerator<T4> >(g4_),
+            static_cast<ParamGenerator<T5> >(g5_),
+            static_cast<ParamGenerator<T6> >(g6_),
+            static_cast<ParamGenerator<T7> >(g7_),
+            static_cast<ParamGenerator<T8> >(g8_)));
+  }
+
+ private:
+  // Assignment is deliberately declared without a definition.
+  void operator=(const CartesianProductHolder8&);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+};  // class CartesianProductHolder8
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8, class Generator9>
+class CartesianProductHolder9 {
+ public:
+  CartesianProductHolder9(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4, const Generator5& g5,
+      const Generator6& g6, const Generator7& g7, const Generator8& g8,
+      const Generator9& g9)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+        g9_(g9) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8, typename T9>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+      T9> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+        T9> >(
+        new CartesianProductGenerator9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(
+            static_cast<ParamGenerator<T1> >(g1_),
+            static_cast<ParamGenerator<T2> >(g2_),
+            static_cast<ParamGenerator<T3> >(g3_),
+            static_cast<ParamGenerator<T4> >(g4_),
+            static_cast<ParamGenerator<T5> >(g5_),
+            static_cast<ParamGenerator<T6> >(g6_),
+            static_cast<ParamGenerator<T7> >(g7_),
+            static_cast<ParamGenerator<T8> >(g8_),
+            static_cast<ParamGenerator<T9> >(g9_)));
+  }
+
+ private:
+  // Assignment is deliberately declared without a definition.
+  void operator=(const CartesianProductHolder9&);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+  const Generator9 g9_;
+};  // class CartesianProductHolder9
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8, class Generator9, class Generator10>
+class CartesianProductHolder10 {
+ public:
+  CartesianProductHolder10(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4, const Generator5& g5,
+      const Generator6& g6, const Generator7& g7, const Generator8& g8,
+      const Generator9& g9, const Generator10& g10)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+        g9_(g9), g10_(g10) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8, typename T9, typename T10>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+      T9, T10> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+        T9, T10> >(
+        new CartesianProductGenerator10<T1, T2, T3, T4, T5, T6, T7, T8, T9,
+            T10>(
+            static_cast<ParamGenerator<T1> >(g1_),
+            static_cast<ParamGenerator<T2> >(g2_),
+            static_cast<ParamGenerator<T3> >(g3_),
+            static_cast<ParamGenerator<T4> >(g4_),
+            static_cast<ParamGenerator<T5> >(g5_),
+            static_cast<ParamGenerator<T6> >(g6_),
+            static_cast<ParamGenerator<T7> >(g7_),
+            static_cast<ParamGenerator<T8> >(g8_),
+            static_cast<ParamGenerator<T9> >(g9_),
+            static_cast<ParamGenerator<T10> >(g10_)));
+  }
+
+ private:
+  // Assignment is deliberately declared without a definition.
+  void operator=(const CartesianProductHolder10&);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+  const Generator9 g9_;
+  const Generator10 g10_;
+};  // class CartesianProductHolder10
+
+# endif  // GTEST_HAS_COMBINE
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  //  GTEST_HAS_PARAM_TEST
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+
+// Functions producing parameter generators.
+//
+// Google Test uses these generators to produce parameters for value-
+// parameterized tests. When a parameterized test case is instantiated
+// with a particular generator, Google Test creates and runs tests
+// for each element in the sequence produced by the generator.
+//
+// In the following sample, tests from test case FooTest are instantiated
+// each three times with parameter values 3, 5, and 8:
+//
+// class FooTest : public TestWithParam<int> { ... };
+//
+// TEST_P(FooTest, TestThis) {
+// }
+// TEST_P(FooTest, TestThat) {
+// }
+// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8));
+//
+
+// Range() returns generators providing sequences of values in a range.
+//
+// Synopsis:
+// Range(start, end)
+//   - returns a generator producing a sequence of values {start, start+1,
+//     start+2, ..., }.
+// Range(start, end, step)
+//   - returns a generator producing a sequence of values {start, start+step,
+//     start+step+step, ..., }.
+// Notes:
+//   * The generated sequences never include end. For example, Range(1, 5)
+//     returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2)
+//     returns a generator producing {1, 3, 5, 7}.
+//   * start and end must have the same type. That type may be any integral or
+//     floating-point type or a user defined type satisfying these conditions:
+//     * It must be assignable (have operator=() defined).
+//     * It must have operator+() (operator+(int-compatible type) for
+//       two-operand version).
+//     * It must have operator<() defined.
+//     Elements in the resulting sequences will also have that type.
+//   * Condition start < end must be satisfied in order for resulting sequences
+//     to contain any elements.
+//
+template <typename T, typename IncrementT>
+internal::ParamGenerator<T> Range(T start, T end, IncrementT step) {
+  typedef internal::RangeGenerator<T, IncrementT> Generator;
+  return internal::ParamGenerator<T>(new Generator(start, end, step));
+}
+
+template <typename T>
+internal::ParamGenerator<T> Range(T start, T end) {
+  return Range(start, end, 1);  // Forwards to the overload above, step = 1.
+}
+
+// ValuesIn() function allows generation of tests with parameters coming from
+// a container.
+//
+// Synopsis:
+// ValuesIn(const T (&array)[N])
+//   - returns a generator producing sequences with elements from
+//     a C-style array.
+// ValuesIn(const Container& container)
+//   - returns a generator producing sequences with elements from
+//     an STL-style container.
+// ValuesIn(Iterator begin, Iterator end)
+//   - returns a generator producing sequences with elements from
+//     a range [begin, end) defined by a pair of STL-style iterators. These
+//     iterators can also be plain C pointers.
+//
+// Please note that ValuesIn copies the values from the containers
+// passed in and keeps them to generate tests in RUN_ALL_TESTS().
+//
+// Examples:
+//
+// This instantiates tests from test case StringTest
+// each with C-string values of "foo", "bar", and "baz":
+//
+// const char* strings[] = {"foo", "bar", "baz"};
+// INSTANTIATE_TEST_CASE_P(StringSequence, StringTest, ValuesIn(strings));
+//
+// This instantiates tests from test case StlStringTest
+// each with STL strings with values "a" and "b":
+//
+// ::std::vector< ::std::string> GetParameterStrings() {
+//   ::std::vector< ::std::string> v;
+//   v.push_back("a");
+//   v.push_back("b");
+//   return v;
+// }
+//
+// INSTANTIATE_TEST_CASE_P(CharSequence,
+//                         StlStringTest,
+//                         ValuesIn(GetParameterStrings()));
+//
+//
+// This will also instantiate tests from CharTest
+// each with parameter values 'a' and 'b':
+//
+// ::std::list<char> GetParameterChars() {
+//   ::std::list<char> list;
+//   list.push_back('a');
+//   list.push_back('b');
+//   return list;
+// }
+// ::std::list<char> l = GetParameterChars();
+// INSTANTIATE_TEST_CASE_P(CharSequence2,
+//                         CharTest,
+//                         ValuesIn(l.begin(), l.end()));
+//
+template <typename ForwardIterator>
+internal::ParamGenerator<
+    typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end) {
+  typedef ::testing::internal::IteratorTraits<ForwardIterator> Traits;
+  typedef typename Traits::value_type ParamType;
+  typedef internal::ValuesInIteratorRangeGenerator<ParamType> Generator;
+  return internal::ParamGenerator<ParamType>(new Generator(begin, end));
+}
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
+  return ValuesIn(array, array + N);  // [array, array + N) as a pointer range.
+}
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type>
+ValuesIn(const Container& container) {
+  return ValuesIn(container.begin(), container.end());
+}
+
+// Values() allows generating tests from explicitly specified list of
+// parameters.
+//
+// Synopsis:
+// Values(T v1, T v2, ..., T vN)
+//   - returns a generator producing sequences with elements v1, v2, ..., vN.
+//
+// For example, this instantiates tests from test case BarTest each
+// with values "one", "two", and "three":
+//
+// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three"));
+//
+// This instantiates tests from test case BazTest each with values 1, 2, 3.5.
+// The exact type of values will depend on the type of parameter in BazTest.
+//
+// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
+//
+// Currently, Values() supports from 1 to 50 parameters.
+//
+template <class T1>
+internal::ValueArray1<T1> Values(T1 v1) {
+  return internal::ValueArray1<T1>(v1);
+}
+
+template <class T1, class T2>
+internal::ValueArray2<T1, T2> Values(T1 v1, T2 v2) {
+  return internal::ValueArray2<T1, T2>(v1, v2);
+}
+
+template <class T1, class T2, class T3>
+internal::ValueArray3<T1, T2, T3> Values(T1 v1, T2 v2, T3 v3) {
+  return internal::ValueArray3<T1, T2, T3>(v1, v2, v3);
+}
+
+template <class T1, class T2, class T3, class T4>
+internal::ValueArray4<T1, T2, T3, T4> Values(T1 v1, T2 v2, T3 v3, T4 v4) {
+  return internal::ValueArray4<T1, T2, T3, T4>(v1, v2, v3, v4);
+}
+
+template <class T1, class T2, class T3, class T4, class T5>
+internal::ValueArray5<T1, T2, T3, T4, T5> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5) {
+  return internal::ValueArray5<T1, T2, T3, T4, T5>(v1, v2, v3, v4, v5);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6>
+internal::ValueArray6<T1, T2, T3, T4, T5, T6> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6) {
+  return internal::ValueArray6<T1, T2, T3, T4, T5, T6>(v1, v2, v3, v4, v5, v6);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7>
+internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6, T7 v7) {
+  return internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7>(v1, v2, v3, v4, v5,
+      v6, v7);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8>
+internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) {
+  return internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8>(v1, v2, v3, v4,
+      v5, v6, v7, v8);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9>
+internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) {
+  return internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(v1, v2, v3,
+      v4, v5, v6, v7, v8, v9);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10>
+internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> Values(T1 v1,
+    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) {
+  return internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>(v1,
+      v2, v3, v4, v5, v6, v7, v8, v9, v10);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11>
+internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11) {
+  return internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12>
+internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+    T12> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12) {
+  return internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13>
+internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+    T13> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13) {
+  return internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14>
+internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) {
+  return internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+      v14);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14, class T15>
+internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) {
+  return internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+      v13, v14, v15);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14, class T15,
+    class T16>
+internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16) {
+  return internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+      v12, v13, v14, v15, v16);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14, class T15,
+    class T16, class T17>
+internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17) {
+  return internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14, class T15,
+    class T16, class T17, class T18>
+internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18) {
+  return internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+      v10, v11, v12, v13, v14, v15, v16, v17, v18);
+}
+
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14, class T15,
+    class T16, class T17, class T18, class T19>
+internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) {
+  return internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19>(v1, v2, v3, v4, v5, v6, v7, v8,
+      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19);
+}
+
+// Values() overload taking exactly 20 arguments: forwards v1..v20, in order,
+// to the internal::ValueArray20 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20>
+internal::ValueArray20<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20) {
+  typedef internal::ValueArray20<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20);
+}
+
+// Values() overload taking exactly 21 arguments: forwards v1..v21, in order,
+// to the internal::ValueArray21 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21>
+internal::ValueArray21<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21) {
+  typedef internal::ValueArray21<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21);
+}
+
+// Values() overload taking exactly 22 arguments: forwards v1..v22, in order,
+// to the internal::ValueArray22 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22>
+internal::ValueArray22<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22) {
+  typedef internal::ValueArray22<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22);
+}
+
+// Values() overload taking exactly 23 arguments: forwards v1..v23, in order,
+// to the internal::ValueArray23 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23>
+internal::ValueArray23<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) {
+  typedef internal::ValueArray23<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23);
+}
+
+// Values() overload taking exactly 24 arguments: forwards v1..v24, in order,
+// to the internal::ValueArray24 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24>
+internal::ValueArray24<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) {
+  typedef internal::ValueArray24<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24);
+}
+
+// Values() overload taking exactly 25 arguments: forwards v1..v25, in order,
+// to the internal::ValueArray25 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25>
+internal::ValueArray25<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) {
+  typedef internal::ValueArray25<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25);
+}
+
+// Values() overload taking exactly 26 arguments: forwards v1..v26, in order,
+// to the internal::ValueArray26 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26>
+internal::ValueArray26<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26) {
+  typedef internal::ValueArray26<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26);
+}
+
+// Values() overload taking exactly 27 arguments: forwards v1..v27, in order,
+// to the internal::ValueArray27 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27>
+internal::ValueArray27<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27) {
+  typedef internal::ValueArray27<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27);
+}
+
+// Values() overload taking exactly 28 arguments: forwards v1..v28, in order,
+// to the internal::ValueArray28 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28>
+internal::ValueArray28<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28) {
+  typedef internal::ValueArray28<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28);
+}
+
+// Values() overload taking exactly 29 arguments: forwards v1..v29, in order,
+// to the internal::ValueArray29 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29>
+internal::ValueArray29<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29) {
+  typedef internal::ValueArray29<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29>
+      ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29);
+}
+
+// Values() overload taking exactly 30 arguments: forwards v1..v30, in order,
+// to the internal::ValueArray30 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30>
+internal::ValueArray30<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30) {
+  typedef internal::ValueArray30<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30>
+      ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30);
+}
+
+// Values() overload taking exactly 31 arguments: forwards v1..v31, in order,
+// to the internal::ValueArray31 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31>
+internal::ValueArray31<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) {
+  typedef internal::ValueArray31<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31);
+}
+
+// Values() overload taking exactly 32 arguments: forwards v1..v32, in order,
+// to the internal::ValueArray32 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31, class T32>
+internal::ValueArray32<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) {
+  typedef internal::ValueArray32<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32);
+}
+
+// Values() overload taking exactly 33 arguments: forwards v1..v33, in order,
+// to the internal::ValueArray33 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31, class T32, class T33>
+internal::ValueArray33<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33) {
+  typedef internal::ValueArray33<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33);
+}
+
+// Values() overload taking exactly 34 arguments: forwards v1..v34, in order,
+// to the internal::ValueArray34 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31, class T32, class T33, class T34>
+internal::ValueArray34<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34) {
+  typedef internal::ValueArray34<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34);
+}
+
+// Values() overload taking exactly 35 arguments: forwards v1..v35, in order,
+// to the internal::ValueArray35 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31, class T32, class T33, class T34, class T35>
+internal::ValueArray35<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34,
+    T35 v35) {
+  typedef internal::ValueArray35<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35);
+}
+
+// Values() overload taking exactly 36 arguments: forwards v1..v36, in order,
+// to the internal::ValueArray36 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31, class T32, class T33, class T34, class T35,
+    class T36>
+internal::ValueArray36<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34,
+    T35 v35, T36 v36) {
+  typedef internal::ValueArray36<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36);
+}
+
+// Values() overload taking exactly 37 arguments: forwards v1..v37, in order,
+// to the internal::ValueArray37 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37>
+internal::ValueArray37<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34,
+    T35 v35, T36 v36, T37 v37) {
+  typedef internal::ValueArray37<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37);
+}
+
+// Values() overload taking exactly 38 arguments: forwards v1..v38, in order,
+// to the internal::ValueArray38 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38>
+internal::ValueArray38<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34,
+    T35 v35, T36 v36, T37 v37, T38 v38) {
+  typedef internal::ValueArray38<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38);
+}
+
+// Values() overload taking exactly 39 arguments: forwards v1..v39, in order,
+// to the internal::ValueArray39 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39>
+internal::ValueArray39<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34,
+    T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) {
+  typedef internal::ValueArray39<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39);
+}
+
+// Values() overload taking exactly 40 arguments: forwards v1..v40, in order,
+// to the internal::ValueArray40 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39, class T40>
+internal::ValueArray40<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34,
+    T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) {
+  typedef internal::ValueArray40<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40);
+}
+
+// Values() overload taking exactly 41 arguments: forwards v1..v41, in order,
+// to the internal::ValueArray41 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39, class T40, class T41>
+internal::ValueArray41<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34,
+    T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) {
+  typedef internal::ValueArray41<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41);
+}
+
+// Values() overload taking exactly 42 arguments: forwards v1..v42, in order,
+// to the internal::ValueArray42 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39, class T40, class T41, class T42>
+internal::ValueArray42<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34,
+    T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, T42 v42) {
+  typedef internal::ValueArray42<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42> ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42);
+}
+
+// Values() overload taking exactly 43 arguments: forwards v1..v43, in order,
+// to the internal::ValueArray43 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39, class T40, class T41, class T42,
+    class T43>
+internal::ValueArray43<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34,
+    T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, T42 v42,
+    T43 v43) {
+  typedef internal::ValueArray43<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43>
+      ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43);
+}
+
+// Values() overload taking exactly 44 arguments: forwards v1..v44, in order,
+// to the internal::ValueArray44 constructor and returns the result by value.
+template <class T1, class T2, class T3, class T4, class T5, class T6, class T7,
+    class T8, class T9, class T10, class T11, class T12, class T13, class T14,
+    class T15, class T16, class T17, class T18, class T19, class T20, class T21,
+    class T22, class T23, class T24, class T25, class T26, class T27, class T28,
+    class T29, class T30, class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39, class T40, class T41, class T42,
+    class T43, class T44>
+internal::ValueArray44<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+    T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18,
+    T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26,
+    T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34,
+    T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, T42 v42,
+    T43 v43, T44 v44) {
+  typedef internal::ValueArray44<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16,
+      T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44>
+      ResultArray;
+  return ResultArray(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16,
+      v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+    T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+    T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+    T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) {
+  return internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
+      v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38,
+      v39, v40, v41, v42, v43, v44, v45);
+}  // Values(): 45-value overload; forwards v1..v45 to a ValueArray45.
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) {
+  return internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+      v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
+      v38, v39, v40, v41, v42, v43, v44, v45, v46);
+}  // Values(): 46-value overload; forwards v1..v46 to a ValueArray46.
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) {
+  return internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47>(v1, v2, v3, v4, v5, v6, v7, v8,
+      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
+      v38, v39, v40, v41, v42, v43, v44, v45, v46, v47);
+}  // Values(): 47-value overload; forwards v1..v47 to a ValueArray47.
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47,
+    T48 v48) {
+  return internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47, T48>(v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
+      v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36,
+      v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48);
+}  // Values(): 48-value overload; forwards v1..v48 to a ValueArray48.
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48, T49> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
+    T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38,
+    T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46,
+    T47 v47, T48 v48, T49 v49) {
+  return internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47, T48, T49>(v1, v2, v3, v4, v5, v6,
+      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
+      v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35,
+      v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49);
+}  // Values(): 49-value overload; forwards v1..v49 to a ValueArray49.
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48, T49, T50> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37,
+    T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45,
+    T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) {
+  return internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>(v1, v2, v3, v4,
+      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+      v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
+      v48, v49, v50);
+}  // Values(): 50-value overload; forwards v1..v50 to a ValueArray50.
+
+// Bool() allows generating tests with parameters in a set of (false, true).
+//
+// Synopsis:
+// Bool()
+//   - returns a generator producing sequences with elements {false, true}.
+//
+// It is useful when testing code that depends on Boolean flags. Combinations
+// of multiple flags can be tested when several Bool()'s are combined using
+// Combine() function.
+//
+// In the following example all tests in the test case FlagDependentTest
+// will be instantiated twice with parameters false and true.
+//
+// class FlagDependentTest : public testing::TestWithParam<bool> {
+//   virtual void SetUp() {
+//     external_flag = GetParam();
+//   }
+// };
+// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool());
+//
+inline internal::ParamGenerator<bool> Bool() {
+  return Values(false, true);  // Two-element sequence: false, then true.
+}
+
+# if GTEST_HAS_COMBINE
+// Combine() allows the user to combine two or more sequences to produce
+// values of a Cartesian product of those sequences' elements.
+//
+// Synopsis:
+// Combine(gen1, gen2, ..., genN)
+//   - returns a generator producing sequences with elements coming from
+//     the Cartesian product of elements from the sequences generated by
+//     gen1, gen2, ..., genN. The sequence elements will have a type of
+//     tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
+//     of elements from sequences produced by gen1, gen2, ..., genN.
+//
+// Combine can have up to 10 arguments. This number is currently limited
+// by the maximum number of elements in the tuple implementation used by Google
+// Test.
+//
+// Example:
+//
+// This will instantiate tests in test case AnimalTest each one with
+// the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
+// tuple("dog", BLACK), and tuple("dog", WHITE):
+//
+// enum Color { BLACK, GRAY, WHITE };
+// class AnimalTest
+//     : public testing::TestWithParam<tuple<const char*, Color> > {...};
+//
+// TEST_P(AnimalTest, AnimalLooksNice) {...}
+//
+// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest,
+//                         Combine(Values("cat", "dog"),
+//                                 Values(BLACK, WHITE)));
+//
+// This will instantiate tests in FlagDependentTest with all variations of two
+// Boolean flags:
+//
+// class FlagDependentTest
+//     : public testing::TestWithParam<tuple<bool, bool> > {
+//   virtual void SetUp() {
+//     // Assigns external_flag_1 and external_flag_2 values from the tuple.
+//     tie(external_flag_1, external_flag_2) = GetParam();
+//   }
+// };
+//
+// TEST_P(FlagDependentTest, TestFeature1) {
+//   // Test your code using external_flag_1 and external_flag_2 here.
+// }
+// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest,
+//                         Combine(Bool(), Bool()));
+//
+template <typename Generator1, typename Generator2>
+internal::CartesianProductHolder2<Generator1, Generator2> Combine(
+    const Generator1& g1, const Generator2& g2) {
+  return internal::CartesianProductHolder2<Generator1, Generator2>(
+      g1, g2);
+}  // Combine(): packs 2 generators into a CartesianProductHolder2.
+
+template <typename Generator1, typename Generator2, typename Generator3>
+internal::CartesianProductHolder3<Generator1, Generator2, Generator3> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3) {
+  return internal::CartesianProductHolder3<Generator1, Generator2, Generator3>(
+      g1, g2, g3);
+}  // Combine(): packs 3 generators into a CartesianProductHolder3.
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4>
+internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
+    Generator4> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4) {
+  return internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
+      Generator4>(
+      g1, g2, g3, g4);
+}  // Combine(): packs 4 generators into a CartesianProductHolder4.
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5>
+internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
+    Generator4, Generator5> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5) {
+  return internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
+      Generator4, Generator5>(
+      g1, g2, g3, g4, g5);
+}  // Combine(): packs 5 generators into a CartesianProductHolder5.
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6>
+internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6) {
+  return internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6>(
+      g1, g2, g3, g4, g5, g6);
+}  // Combine(): packs 6 generators into a CartesianProductHolder6.
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7>
+internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7) {
+  return internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7>(
+      g1, g2, g3, g4, g5, g6, g7);
+}  // Combine(): packs 7 generators into a CartesianProductHolder7.
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8>
+internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7, const Generator8& g8) {
+  return internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8>(
+      g1, g2, g3, g4, g5, g6, g7, g8);
+}  // Combine(): packs 8 generators into a CartesianProductHolder8.
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8, typename Generator9>
+internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8,
+    Generator9> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7, const Generator8& g8, const Generator9& g9) {
+  return internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9>(
+      g1, g2, g3, g4, g5, g6, g7, g8, g9);
+}  // Combine(): packs 9 generators into a CartesianProductHolder9.
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8, typename Generator9,
+    typename Generator10>
+internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+    Generator10> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7, const Generator8& g8, const Generator9& g9,
+        const Generator10& g10) {
+  return internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+      Generator10>(
+      g1, g2, g3, g4, g5, g6, g7, g8, g9, g10);
+}  // Combine(): packs 10 generators into a CartesianProductHolder10.
+# endif  // GTEST_HAS_COMBINE
+
+// Defines a fixture-derived test class whose static dummy member registers the
+// test pattern via AddToRegistry(); the braces after the macro form TestBody().
+# define TEST_P(test_case_name, test_name) \
+  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+      : public test_case_name { \
+   public: \
+    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
+    virtual void TestBody(); \
+   private: \
+    static int AddToRegistry() { \
+      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+          GetTestCasePatternHolder<test_case_name>(\
+              #test_case_name, __FILE__, __LINE__)->AddTestPattern(\
+                  #test_case_name, \
+                  #test_name, \
+                  new ::testing::internal::TestMetaFactory< \
+                      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \
+      return 0; \
+    } \
+    static int gtest_registering_dummy_; \
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+        GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
+  }; \
+  int GTEST_TEST_CLASS_NAME_(test_case_name, \
+                             test_name)::gtest_registering_dummy_ = \
+      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
+  void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
+
+# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \
+  ::testing::internal::ParamGenerator<test_case_name::ParamType> \
+      gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
+  int gtest_##prefix##test_case_name##_dummy_ = /* runs the registration */ \
+      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+          GetTestCasePatternHolder<test_case_name>(\
+              #test_case_name, __FILE__, __LINE__)->AddTestCaseInstantiation(\
+                  #prefix, \
+                  &gtest_##prefix##test_case_name##_EvalGenerator_, \
+                  __FILE__, __LINE__)  // No trailing ';': the caller adds it.
+
+}  // namespace testing
+
+#endif  // GTEST_HAS_PARAM_TEST
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// Google C++ Testing Framework definitions useful in production code.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+
+// When you need to test the private or protected members of a class,
+// use the FRIEND_TEST macro to declare your tests as friends of the
+// class.  For example:
+//
+// class MyClass {
+//  private:
+//   void MyMethod();
+//   FRIEND_TEST(MyClassTest, MyMethod);
+// };
+//
+// class MyClassTest : public testing::Test {
+//   // ...
+// };
+//
+// TEST_F(MyClassTest, MyMethod) {
+//   // Can call MyClass::MyMethod() here.
+// }
+
+#define FRIEND_TEST(test_case_name, test_name)\
+friend class test_case_name##_##test_name##_Test  // No ';': added at use site.
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+
+#include <iosfwd>
+#include <vector>
+
+namespace testing {
+
+// A copyable object representing the result of a test part (i.e. an
+// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCEED()).
+//
+// Don't inherit from TestPartResult as its destructor is not virtual.
+class GTEST_API_ TestPartResult {
+ public:
+  // The possible outcomes of a test part (i.e. an assertion or an
+  // explicit SUCCEED(), FAIL(), or ADD_FAILURE()).
+  enum Type {
+    kSuccess,          // Succeeded.
+    kNonFatalFailure,  // Failed but the test can continue.
+    kFatalFailure      // Failed and the test should be terminated.
+  };
+
+  // C'tor.  TestPartResult does NOT have a default constructor; always
+  // create objects through this one.  A NULL a_file_name or a_message
+  // is stored as "" instead of being handed to std::string (undefined).
+  TestPartResult(Type a_type,
+                 const char* a_file_name,
+                 int a_line_number,
+                 const char* a_message)
+      : type_(a_type),
+        file_name_(a_file_name == NULL ? "" : a_file_name),
+        line_number_(a_line_number),
+        summary_(ExtractSummary(a_message == NULL ? "" : a_message)),
+        message_(a_message == NULL ? "" : a_message) {
+  }
+
+  // Gets the outcome of the test part.
+  Type type() const { return type_; }
+
+  // Gets the name of the source file where the test part took place, or
+  // NULL if it's unknown (i.e. the stored name is empty).
+  const char* file_name() const {
+    return file_name_.empty() ? NULL : file_name_.c_str();
+  }
+
+  // Gets the line in the source file where the test part took place,
+  // or -1 if it's unknown.
+  int line_number() const { return line_number_; }
+
+  // Gets the summary of the failure message.
+  const char* summary() const { return summary_.c_str(); }
+
+  // Gets the message associated with the test part.
+  const char* message() const { return message_.c_str(); }
+
+  // Returns true iff the test part passed.
+  bool passed() const { return type_ == kSuccess; }
+
+  // Returns true iff the test part failed.
+  bool failed() const { return type_ != kSuccess; }
+
+  // Returns true iff the test part non-fatally failed.
+  bool nonfatally_failed() const { return type_ == kNonFatalFailure; }
+
+  // Returns true iff the test part fatally failed.
+  bool fatally_failed() const { return type_ == kFatalFailure; }
+
+ private:
+  Type type_;  // Outcome of the test part; see enum Type.
+
+  // Gets the summary of the failure message by omitting the stack
+  // trace in it.
+  static std::string ExtractSummary(const char* message);
+
+  // The name of the source file where the test part took place, or
+  // "" if the source file is unknown.
+  std::string file_name_;
+  // The line in the source file where the test part took place, or -1
+  // if the line number is unknown.
+  int line_number_;
+  std::string summary_;  // The test failure summary.
+  std::string message_;  // The test failure message.
+};
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result);
+
+// An array of TestPartResult objects.
+//
+// Don't inherit from TestPartResultArray as its destructor is not
+// virtual.
+class GTEST_API_ TestPartResultArray {
+ public:
+  TestPartResultArray() {}  // Starts out empty (array_ is default-constructed).
+
+  // Appends the given TestPartResult to the array.
+  void Append(const TestPartResult& result);
+
+  // Returns the TestPartResult at the given index (0-based).
+  const TestPartResult& GetTestPartResult(int index) const;
+
+  // Returns the number of TestPartResult objects in the array.
+  int size() const;
+
+ private:
+  std::vector<TestPartResult> array_;  // Backing storage for the results.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray);
+};
+
+// This interface knows how to report a test part result.
+class TestPartResultReporterInterface {
+ public:
+  virtual ~TestPartResultReporterInterface() {}  // Safe polymorphic delete.
+  // Receives the test part result to report; implementers define how.
+  virtual void ReportTestPartResult(const TestPartResult& result) = 0;
+};
+
+namespace internal {
+
+// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a
+// statement generates new fatal failures. To do so it registers itself as the
+// current test part result reporter. Besides checking if fatal failures were
+// reported, it only delegates the reporting to the former result reporter.
+// The original result reporter is restored in the destructor.
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+class GTEST_API_ HasNewFatalFailureHelper
+    : public TestPartResultReporterInterface {
+ public:
+  HasNewFatalFailureHelper();  // Per the class comment: registers itself.
+  virtual ~HasNewFatalFailureHelper();  // Restores the original reporter.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+  bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
+ private:
+  bool has_new_fatal_failure_;  // Value returned by has_new_fatal_failure().
+  TestPartResultReporterInterface* original_reporter_;  // Presumably the prior one.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
+};
+
+}  // namespace internal
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+// This header implements typed tests and type-parameterized tests.
+
+// Typed (aka type-driven) tests repeat the same test for types in a
+// list.  You must know which types you want to test with when writing
+// typed tests. Here's how you do it:
+
+#if 0
+
+// First, define a fixture class template.  It should be parameterized
+// by a type.  Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+ public:
+  ...
+  typedef std::list<T> List;
+  static T shared_;
+  T value_;
+};
+
+// Next, associate a list of types with the test case, which will be
+// repeated for each type in the list.  The typedef is necessary for
+// the macro to parse correctly.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+TYPED_TEST_CASE(FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+//   TYPED_TEST_CASE(FooTest, int);
+
+// Then, use TYPED_TEST() instead of TEST_F() to define as many typed
+// tests for this test case as you want.
+TYPED_TEST(FooTest, DoesBlah) {
+  // Inside a test, refer to TypeParam to get the type parameter.
+  // Since we are inside a derived class template, C++ requires us to
+  // visit the members of FooTest via 'this'.
+  TypeParam n = this->value_;
+
+  // To visit static members of the fixture, add the TestFixture::
+  // prefix.
+  n += TestFixture::shared_;
+
+  // To refer to typedefs in the fixture, add the "typename
+  // TestFixture::" prefix.
+  typename TestFixture::List values;
+  values.push_back(n);
+  ...
+}
+
+TYPED_TEST(FooTest, HasPropertyA) { ... }
+
+#endif  // 0
+
+// Type-parameterized tests are abstract test patterns parameterized
+// by a type.  Compared with typed tests, type-parameterized tests
+// allow you to define the test pattern without knowing what the type
+// parameters are.  The defined pattern can be instantiated with
+// different types any number of times, in any number of translation
+// units.
+//
+// If you are designing an interface or concept, you can define a
+// suite of type-parameterized tests to verify properties that any
+// valid implementation of the interface/concept should have.  Then,
+// each implementation can easily instantiate the test suite to verify
+// that it conforms to the requirements, without having to write
+// similar tests repeatedly.  Here's an example:
+
+#if 0
+
+// First, define a fixture class template.  It should be parameterized
+// by a type.  Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+  ...
+};
+
+// Next, declare that you will define a type-parameterized test case
+// (the _P suffix is for "parameterized" or "pattern", whichever you
+// prefer):
+TYPED_TEST_CASE_P(FooTest);
+
+// Then, use TYPED_TEST_P() to define as many type-parameterized tests
+// for this type-parameterized test case as you want.
+TYPED_TEST_P(FooTest, DoesBlah) {
+  // Inside a test, refer to TypeParam to get the type parameter.
+  TypeParam n = 0;
+  ...
+}
+
+TYPED_TEST_P(FooTest, HasPropertyA) { ... }
+
+// Now the tricky part: you need to register all test patterns before
+// you can instantiate them.  The first argument of the macro is the
+// test case name; the rest are the names of the tests in this test
+// case.
+REGISTER_TYPED_TEST_CASE_P(FooTest,
+                           DoesBlah, HasPropertyA);
+
+// Finally, you are free to instantiate the pattern with the types you
+// want.  If you put the above code in a header file, you can #include
+// it in multiple C++ source files and instantiate it multiple times.
+//
+// To distinguish different instances of the pattern, the first
+// argument to the INSTANTIATE_* macro is a prefix that will be added
+// to the actual test case name.  Remember to pick unique prefixes for
+// different instances.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+//   INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int);
+
+#endif  // 0
+
+
+// Implements typed tests.
+
+#if GTEST_HAS_TYPED_TEST
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the typedef that holds the type parameters of
+// the given test case, i.e. gtest_type_params_<TestCaseName>_.
+# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_
+
+// Declares the type list of test case CaseName as the typedef named
+// GTEST_TYPE_PARAMS_(CaseName).  'Types' must have spaces around it below
+// since some compilers may choke on '>>' (e.g. with Types<int>).
+# define TYPED_TEST_CASE(CaseName, Types) \
+  typedef ::testing::internal::TypeList< Types >::type \
+      GTEST_TYPE_PARAMS_(CaseName)
+
+# define TYPED_TEST(CaseName, TestName) \
+  template <typename gtest_TypeParam_> \
+  class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \
+      : public CaseName<gtest_TypeParam_> { \
+   private: \
+    typedef CaseName<gtest_TypeParam_> TestFixture; /* the fixture type */ \
+    typedef gtest_TypeParam_ TypeParam; /* the current type parameter */ \
+    virtual void TestBody(); /* body: the block after the macro */ \
+  }; \
+  bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \
+      ::testing::internal::TypeParameterizedTest< \
+          CaseName, \
+          ::testing::internal::TemplateSel< \
+              GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \
+          GTEST_TYPE_PARAMS_(CaseName)>::Register(\
+              "", #CaseName, #TestName, 0); \
+  template <typename gtest_TypeParam_> \
+  void GTEST_TEST_CLASS_NAME_(CaseName, TestName)<gtest_TypeParam_>::TestBody()
+
+#endif  // GTEST_HAS_TYPED_TEST
+
+// Implements type-parameterized tests.
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the namespace in which the type-parameterized
+// tests of the given test case are defined (gtest_case_<TestCaseName>_).
+// The exact name of the namespace is subject to change without notice.
+# define GTEST_CASE_NAMESPACE_(TestCaseName) \
+  gtest_case_##TestCaseName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the variable that remembers the names of the
+// tests defined in the given test case (declared by TYPED_TEST_CASE_P).
+# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \
+  gtest_typed_test_case_p_state_##TestCaseName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY.
+//
+// Expands to the name of the variable that remembers the names of the
+// registered tests (defined by REGISTER_TYPED_TEST_CASE_P).
+# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \
+  gtest_registered_test_names_##TestCaseName##_
+
+// Declares the per-test-case TypedTestCasePState variable.  Variables in
+// these macros are static as the macros are typically used in a .h file
+// that can be #included in multiple translation units linked together.
+# define TYPED_TEST_CASE_P(CaseName) \
+  static ::testing::internal::TypedTestCasePState \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName)
+
+# define TYPED_TEST_P(CaseName, TestName) \
+  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+  template <typename gtest_TypeParam_> \
+  class TestName : public CaseName<gtest_TypeParam_> { \
+   private: \
+    typedef CaseName<gtest_TypeParam_> TestFixture; /* the fixture type */ \
+    typedef gtest_TypeParam_ TypeParam; /* the current type parameter */ \
+    virtual void TestBody(); /* body: the block after the macro */ \
+  }; \
+  static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\
+          __FILE__, __LINE__, #CaseName, #TestName); \
+  } /* closes the case namespace */ \
+  template <typename gtest_TypeParam_> \
+  void GTEST_CASE_NAMESPACE_(CaseName)::TestName<gtest_TypeParam_>::TestBody()
+
+# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \
+  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+  typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \
+  } /* closes the case namespace */ \
+  static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\
+          __FILE__, __LINE__, #__VA_ARGS__)  // Test names, stringized.
+
+// Instantiates test case CaseName with Types; Prefix names this instance.
+// 'Types' must have spaces around it below since some compilers may
+// choke on '>>' when passing a template instance (e.g. Types<int>).
+# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \
+  bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \
+      ::testing::internal::TypeParameterizedTestCase<CaseName, \
+          GTEST_CASE_NAMESPACE_(CaseName)::gtest_AllTests_, \
+          ::testing::internal::TypeList< Types >::type>::Register(\
+              #Prefix, #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName))
+
+#endif  // GTEST_HAS_TYPED_TEST_P
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+// Depending on the platform, different string classes are available.
+// On Linux, in addition to ::std::string, Google also makes use of
+// class ::string, which has the same interface as ::std::string, but
+// has a different implementation.
+//
+// The user can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that
+// ::string is available AND is a distinct type to ::std::string, or
+// define it to 0 to indicate otherwise.
+//
+// If the user's ::std::string and ::string are the same class due to
+// aliasing, he should define GTEST_HAS_GLOBAL_STRING to 0.
+//
+// If the user doesn't define GTEST_HAS_GLOBAL_STRING, it is defined
+// heuristically.
+
+namespace testing {
+
+// Declares the flags.
+
+// This flag temporarily enables the disabled tests.
+GTEST_DECLARE_bool_(also_run_disabled_tests);
+
+// This flag brings the debugger on an assertion failure.
+GTEST_DECLARE_bool_(break_on_failure);
+
+// This flag controls whether Google Test catches all test-thrown exceptions
+// and logs them as failures.
+GTEST_DECLARE_bool_(catch_exceptions);
+
+// This flag enables using colors in terminal output. Available values are
+// "yes" to enable colors, "no" (disable colors), or "auto" (the default)
+// to let Google Test decide.
+GTEST_DECLARE_string_(color);
+
+// This flag sets up the filter to select by name using a glob pattern
+// the tests to run. If the filter is not given all tests are executed.
+GTEST_DECLARE_string_(filter);
+
+// This flag causes Google Test to list tests. None of the tests listed
+// are actually run if the flag is provided.
+GTEST_DECLARE_bool_(list_tests);
+
+// This flag controls whether Google Test emits a detailed XML report to a file
+// in addition to its normal textual output.
+GTEST_DECLARE_string_(output);
+
+// This flag controls whether Google Test prints the elapsed time for each
+// test.
+GTEST_DECLARE_bool_(print_time);
+
+// This flag specifies the random number seed.
+GTEST_DECLARE_int32_(random_seed);
+
+// This flag sets how many times the tests are repeated. The default value
+// is 1. If the value is -1 the tests are repeated forever.
+GTEST_DECLARE_int32_(repeat);
+
+// This flag controls whether Google Test includes Google Test internal
+// stack frames in failure stack traces.
+GTEST_DECLARE_bool_(show_internal_stack_frames);
+
+// When this flag is specified, tests' order is randomized on every iteration.
+GTEST_DECLARE_bool_(shuffle);
+
+// This flag specifies the maximum number of stack frames to be
+// printed in a failure message.
+GTEST_DECLARE_int32_(stack_trace_depth);
+
+// When this flag is specified, a failed assertion will throw an
+// exception if exceptions are enabled, or exit the program with a
+// non-zero code otherwise.
+GTEST_DECLARE_bool_(throw_on_failure);
+
+// When this flag is set with a "host:port" string, on supported
+// platforms test results are streamed to the specified port on
+// the specified host machine.
+GTEST_DECLARE_string_(stream_result_to);
+
+// The upper limit for valid stack trace depths.
+const int kMaxStackTraceDepth = 100;
+
+namespace internal {  // Forward declarations only.
+
+class AssertHelper;
+class DefaultGlobalTestPartResultReporter;
+class ExecDeathTest;
+class NoExecDeathTest;
+class FinalSuccessChecker;
+class GTestFlagSaver;
+class StreamingListenerTest;
+class TestResultAccessor;
+class TestEventListenersAccessor;
+class TestEventRepeater;
+class UnitTestRecordPropertyTestHelper;
+class WindowsDeathTest;
+class UnitTestImpl* GetUnitTestImpl();  // Also forward-declares UnitTestImpl.
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string& message);
+
+}  // namespace internal
+
+// The friend relationship of some of these classes is cyclic.
+// If we don't forward-declare them, the compiler might confuse the classes
+// named in friend declarations with same-named classes visible in scope.
+class Test;
+class TestCase;
+class TestInfo;
+class UnitTest;
+
+// A class for indicating whether an assertion was successful.  When
+// the assertion wasn't successful, the AssertionResult object
+// remembers a non-empty message that describes how it failed.
+//
+// To create an instance of this class, use one of the factory functions
+// (AssertionSuccess() and AssertionFailure()).
+//
+// This class is useful for two purposes:
+//   1. Defining predicate functions to be used with Boolean test assertions
+//      EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
+//   2. Defining predicate-format functions to be
+//      used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
+//
+// For example, if you define IsEven predicate:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
+// will print the message
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false (5 is odd)
+//   Expected: true
+//
+// instead of a more opaque
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false
+//   Expected: true
+//
+// in case IsEven is a simple Boolean predicate.
+//
+// If you expect your predicate to be reused and want to support informative
+// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
+// about half as often as positive ones in our tests), supply messages for
+// both success and failure cases:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess() << n << " is even";
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
+//
+//   Value of: IsEven(Fib(6))
+//     Actual: true (8 is even)
+//   Expected: false
+//
+// NB: Predicates that support negative Boolean assertions have reduced
+// performance in positive ones so be careful not to use them in tests
+// that have lots (tens of thousands) of positive Boolean assertions.
+//
+// To use this class with EXPECT_PRED_FORMAT assertions such as:
+//
+//   // Verifies that Foo() returns an even number.
+//   EXPECT_PRED_FORMAT1(IsEven, Foo());
+//
+// you need to define:
+//
+//   testing::AssertionResult IsEven(const char* expr, int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure()
+//         << "Expected: " << expr << " is even\n  Actual: it's " << n;
+//   }
+//
+// If Foo() returns 5, you will see the following message:
+//
+//   Expected: Foo() is even
+//     Actual: it's 5
+//
+class GTEST_API_ AssertionResult {
+ public:
+  // Copy constructor.
+  // Used in EXPECT_TRUE/FALSE(assertion_result).
+  AssertionResult(const AssertionResult& other);
+  // Explicit, so no implicit conversion from bool. Used in
+  // EXPECT_TRUE/FALSE(bool_expression).
+
+  // Returns true iff the assertion succeeded.
+  operator bool() const { return success_; }  // NOLINT
+
+  // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+  AssertionResult operator!() const;
+
+  // Returns the text streamed into this AssertionResult. Test assertions
+  // use it when they fail (i.e., the predicate's outcome doesn't match the
+  // assertion's expectation). Returns "" when nothing has been streamed;
+  // otherwise the pointer refers to the string owned by this object.
+  const char* message() const {
+    return message_.get() != NULL ?  message_->c_str() : "";
+  }
+  // TODO(vladl@google.com): Remove this after making sure no clients use it.
+  // Deprecated; please use message() instead.
+  const char* failure_message() const { return message(); }
+
+  // Streams a custom failure message into this object and returns *this.
+  template <typename T> AssertionResult& operator<<(const T& value) {
+    AppendMessage(Message() << value);
+    return *this;
+  }
+
+  // Allows streaming basic output manipulators such as endl or flush into
+  // this object.
+  AssertionResult& operator<<(
+      ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
+    AppendMessage(Message() << basic_manipulator);
+    return *this;
+  }
+
+ private:
+  // Appends the contents of message to message_.
+  void AppendMessage(const Message& a_message) {
+    if (message_.get() == NULL)
+      message_.reset(new ::std::string);  // Allocated lazily, on first use.
+    message_->append(a_message.GetString().c_str());
+  }
+
+  // Stores result of the assertion predicate.
+  bool success_;
+  // Stores the message describing the condition in case the expectation
+  // construct is not satisfied with the predicate's outcome.
+  // Referenced via a pointer to avoid taking too much stack frame space
+  // with test assertions.
+  internal::scoped_ptr< ::std::string> message_;
+
+  GTEST_DISALLOW_ASSIGN_(AssertionResult);
+};
+
+// Makes a successful assertion result.
+GTEST_API_ AssertionResult AssertionSuccess();
+
+// Makes a failed assertion result (a message can be streamed in with <<).
+GTEST_API_ AssertionResult AssertionFailure();
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << msg instead.
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
+
+// The abstract class that all tests inherit from.
+//
+// In Google Test, a unit test program contains one or many TestCases, and
+// each TestCase contains one or many Tests.
+//
+// When you define a test using the TEST macro, you don't need to
+// explicitly derive from Test - the TEST macro automatically does
+// this for you.
+//
+// The only time you derive from Test is when defining a test fixture
+// to be used in a TEST_F.  For example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     virtual void SetUp() { ... }
+//     virtual void TearDown() { ... }
+//     ...
+//   };
+//
+//   TEST_F(FooTest, Bar) { ... }
+//   TEST_F(FooTest, Baz) { ... }
+//
+// Test is not copyable.
+class GTEST_API_ Test {
+ public:
+  friend class TestInfo;
+
+  // Defines types for pointers to functions that set up and tear down
+  // a test case.
+  typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc;
+  typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc;
+
+  // The d'tor is virtual as we intend to inherit from Test.
+  virtual ~Test();
+
+  // Sets up the stuff shared by all tests in this test case.
+  //
+  // Google Test will call Foo::SetUpTestCase() before running the first
+  // test in test case Foo.  Hence a sub-class can define its own
+  // SetUpTestCase() method to shadow the one defined in the super
+  // class.
+  static void SetUpTestCase() {}  // Default: no-op.
+
+  // Tears down the stuff shared by all tests in this test case.
+  //
+  // Google Test will call Foo::TearDownTestCase() after running the last
+  // test in test case Foo.  Hence a sub-class can define its own
+  // TearDownTestCase() method to shadow the one defined in the super
+  // class.
+  static void TearDownTestCase() {}  // Default: no-op.
+
+  // Returns true iff the current test has a fatal failure.
+  static bool HasFatalFailure();
+
+  // Returns true iff the current test has a non-fatal failure.
+  static bool HasNonfatalFailure();
+
+  // Returns true iff the current test has a (either fatal or
+  // non-fatal) failure.
+  static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); }
+
+  // Logs a property for the current test, test case, or for the entire
+  // invocation of the test program when used outside of the context of a
+  // test case.  Only the last value for a given key is remembered.  These
+  // are public static so they can be called from utility functions that are
+  // not members of the test fixture.  Calls to RecordProperty made during
+  // lifespan of the test (from the moment its constructor starts to the
+  // moment its destructor finishes) will be output in XML as attributes of
+  // the <testcase> element.  Properties recorded from fixture's
+  // SetUpTestCase or TearDownTestCase are logged as attributes of the
+  // corresponding <testsuite> element.  Calls to RecordProperty made in the
+  // global context (before or after invocation of RUN_ALL_TESTS and from
+  // SetUp/TearDown method of Environment objects registered with Google
+  // Test) will be output as attributes of the <testsuites> element.
+  static void RecordProperty(const std::string& key, const std::string& value);
+  static void RecordProperty(const std::string& key, int value);
+
+ protected:
+  // Creates a Test object.
+  Test();
+
+  // Sets up the test fixture.
+  virtual void SetUp();
+
+  // Tears down the test fixture.
+  virtual void TearDown();
+
+ private:
+  // Returns true iff the current test has the same fixture class as
+  // the first test in the current test case.
+  static bool HasSameFixtureClass();
+
+  // Runs the test after the test fixture has been set up.
+  //
+  // A sub-class must implement this to define the test logic.
+  //
+  // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM.
+  // Instead, use the TEST or TEST_F macro.
+  virtual void TestBody() = 0;
+
+  // Sets up, executes, and tears down the test.
+  void Run();
+
+  // Deletes self.  We deliberately pick an unusual name for this
+  // internal method to avoid clashing with names used in user TESTs.
+  void DeleteSelf_() { delete this; }
+
+  // Uses a GTestFlagSaver to save and restore all Google Test flags.
+  const internal::GTestFlagSaver* const gtest_flag_saver_;
+
+  // Often a user mis-spells SetUp() as Setup() and spends a long time
+  // wondering why it is never called by Google Test.  The declaration of
+  // the following method is solely for catching such an error at
+  // compile time:
+  //
+  //   - The return type is deliberately chosen to be not void, so it
+  //   will be a conflict if a user declares void Setup() in his test
+  //   fixture.
+  //
+  //   - This method is private, so it will be another compiler error
+  //   if a user calls it from his test fixture.
+  //
+  // DO NOT OVERRIDE THIS FUNCTION.
+  //
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+
+  // We disallow copying Tests.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
+};
+
+typedef internal::TimeInMillis TimeInMillis;  // Alias of the internal type.
+
+// A copyable object representing a user specified test property which can be
+// output as a key/value string pair.
+//
+// Don't inherit from TestProperty as its destructor is not virtual.
+class TestProperty {
+ public:
+  // C'tor.  TestProperty does NOT have a default constructor.
+  // Always use this constructor (with parameters) to create a
+  // TestProperty object.  Both strings are copied into the object.
+  TestProperty(const std::string& a_key, const std::string& a_value) :
+    key_(a_key), value_(a_value) {
+  }
+
+  // Gets the user supplied key (pointer into this object's own string).
+  const char* key() const {
+    return key_.c_str();
+  }
+
+  // Gets the user supplied value; the pointer is invalidated by SetValue().
+  const char* value() const {
+    return value_.c_str();
+  }
+
+  // Sets a new value, overriding the one supplied in the constructor.
+  void SetValue(const std::string& new_value) {
+    value_ = new_value;
+  }
+
+ private:
+  // The key supplied by the user.
+  std::string key_;
+  // The value supplied by the user.
+  std::string value_;
+};
+
+// The result of a single Test.  This includes a list of
+// TestPartResults, a list of TestProperties, a count of how many
+// death tests there are in the Test, and how much time it took to run
+// the Test.
+//
+// TestResult is not copyable.
+class GTEST_API_ TestResult {
+ public:
+  // Creates an empty TestResult.
+  TestResult();
+
+  // D'tor.  Do not inherit from TestResult.
+  ~TestResult();
+
+  // Gets the number of all test parts.  This is the sum of the number
+  // of successful test parts and the number of failed test parts.
+  int total_part_count() const;
+
+  // Returns the number of the test properties.
+  int test_property_count() const;
+
+  // Returns true iff the test passed (i.e. no test part failed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the test failed.
+  bool Failed() const;
+
+  // Returns true iff the test fatally failed.
+  bool HasFatalFailure() const;
+
+  // Returns true iff the test has a non-fatal failure.
+  bool HasNonfatalFailure() const;
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns the i-th test part result among all the results. i can range
+  // from 0 to total_part_count() - 1. If i is not in that range, aborts
+  // the program.
+  const TestPartResult& GetTestPartResult(int i) const;
+
+  // Returns the i-th test property. i can range from 0 to
+  // test_property_count() - 1. If i is not in that range, aborts the
+  // program.
+  const TestProperty& GetTestProperty(int i) const;
+
+ private:
+  friend class TestInfo;
+  friend class TestCase;
+  friend class UnitTest;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::ExecDeathTest;
+  friend class internal::TestResultAccessor;
+  friend class internal::UnitTestImpl;
+  friend class internal::WindowsDeathTest;
+
+  // Gets the vector of TestPartResults.
+  const std::vector<TestPartResult>& test_part_results() const {
+    return test_part_results_;
+  }
+
+  // Gets the vector of TestProperties.
+  const std::vector<TestProperty>& test_properties() const {
+    return test_properties_;
+  }
+
+  // Sets the elapsed time.
+  void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; }
+
+  // Adds a test property to the list. The property is validated and may add
+  // a non-fatal failure if invalid (e.g., if it conflicts with reserved
+  // key names). If a property is already recorded for the same key, the
+  // value will be updated, rather than storing multiple values for the same
+  // key.  xml_element specifies the element for which the property is being
+  // recorded and is used for validation.
+  void RecordProperty(const std::string& xml_element,
+                      const TestProperty& test_property);
+
+  // Adds a failure if the key is a reserved attribute of Google Test
+  // testcase tags.  Returns true if the property is valid.
+  // TODO(russr): Validate attribute names are legal and human readable.
+  static bool ValidateTestProperty(const std::string& xml_element,
+                                   const TestProperty& test_property);
+
+  // Adds a test part result to the list.
+  void AddTestPartResult(const TestPartResult& test_part_result);
+
+  // Returns the death test count.
+  int death_test_count() const { return death_test_count_; }
+
+  // Increments the death test count, returning the new count.
+  int increment_death_test_count() { return ++death_test_count_; }
+
+  // Clears the test part results.
+  void ClearTestPartResults();
+
+  // Clears the object.
+  void Clear();
+
+  // Protects mutable state of the property vector and of owned properties,
+  // whose values may be updated.  NOTE(review): the name is misspelled (sic).
+  internal::Mutex test_properites_mutex_;
+
+  // The vector of TestPartResults
+  std::vector<TestPartResult> test_part_results_;
+  // The vector of TestProperties
+  std::vector<TestProperty> test_properties_;
+  // Running count of death tests.
+  int death_test_count_;
+  // The elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+  // We disallow copying TestResult.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
+};  // class TestResult
+
+// A TestInfo object stores the following information about a test:
+//
+//   Test case name
+//   Test name
+//   Whether the test should be run
+//   A function pointer that creates the test object when invoked
+//   Test result
+//
+// The constructor of TestInfo registers itself with the UnitTest
+// singleton such that the RUN_ALL_TESTS() macro knows which tests to
+// run.
+class GTEST_API_ TestInfo {
+ public:
+  // Destroys the TestInfo.  The destructor is intentionally non-virtual,
+  // so TestInfo must not be used as a base class.
+  ~TestInfo();
+
+  // Name of the test case this test belongs to.
+  const char* test_case_name() const { return test_case_name_.c_str(); }
+
+  // Name of the test itself.
+  const char* name() const { return name_.c_str(); }
+
+  // Name of the parameter type; NULL when this is neither a typed nor a
+  // type-parameterized test.
+  const char* type_param() const {
+    const ::std::string* const param = type_param_.get();
+    if (param == NULL) return NULL;
+    return param->c_str();
+  }
+
+  // Text form of the value parameter; NULL when this is not a
+  // value-parameterized test.
+  const char* value_param() const {
+    const ::std::string* const param = value_param_.get();
+    if (param == NULL) return NULL;
+    return param->c_str();
+  }
+
+  // Tells whether this test is going to run: it must match the
+  // user-specified filter and either be enabled or be a disabled test
+  // while the also_run_disabled_tests flag is set.
+  //
+  // Filtering works on full test names.  Test Bar of test case Foo has
+  // the full name "Foo.Bar", and only tests whose full name matches the
+  // filter are run.
+  //
+  // The filter is a colon-separated list of glob (not regex) patterns.
+  // It may be followed by a '-' and a second colon-separated list of
+  // negative patterns naming tests to exclude.  A test runs when it
+  // matches at least one positive pattern and none of the negative
+  // ones.
+  //
+  // Example: the filter *A*:Foo.* matches every name that contains the
+  // character 'A' or starts with "Foo.".
+  bool should_run() const { return should_run_; }
+
+  // Tells whether this test shows up in the XML report.
+  bool is_reportable() const {
+    // Currently every test that matches the filter is reported.  Tests
+    // excluded only because of sharding may be trimmed from the report
+    // in the future.
+    return matches_filter_;
+  }
+
+  // Read-only access to the result of the test.
+  const TestResult* result() const { return &result_; }
+
+ private:
+#if GTEST_HAS_DEATH_TEST
+  friend class internal::DefaultDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+  friend class Test;
+  friend class TestCase;
+  friend class internal::UnitTestImpl;
+  friend class internal::StreamingListenerTest;
+  friend TestInfo* internal::MakeAndRegisterTestInfo(
+      const char* test_case_name,
+      const char* name,
+      const char* type_param,
+      const char* value_param,
+      internal::TypeId fixture_class_id,
+      Test::SetUpTestCaseFunc set_up_tc,
+      Test::TearDownTestCaseFunc tear_down_tc,
+      internal::TestFactoryBase* factory);
+
+  // Builds a TestInfo.  The new object takes ownership of the factory
+  // it is given.
+  TestInfo(const std::string& test_case_name,
+           const std::string& name,
+           const char* a_type_param,   // NULL if not a type-parameterized test
+           const char* a_value_param,  // NULL if not a value-parameterized test
+           internal::TypeId fixture_class_id,
+           internal::TestFactoryBase* factory);
+
+  // Bumps the count of death tests seen so far in this test (delegates
+  // to the stored TestResult) and passes its return value through.
+  int increment_death_test_count() {
+    return result_.increment_death_test_count();
+  }
+
+  // Instantiates the test object, runs it, stores its result and
+  // finally destroys the object.
+  void Run();
+
+  static void ClearTestResult(TestInfo* test_info) {
+    test_info->result_.Clear();
+  }
+
+  // Identity of the test; the const members below never change.
+  const std::string test_case_name_;     // Test case name
+  const std::string name_;               // Test name
+  // Parameter type name; NULL unless this is a typed or a
+  // type-parameterized test.
+  const internal::scoped_ptr<const ::std::string> type_param_;
+  // Value parameter rendered as text; NULL unless this is a
+  // value-parameterized test.
+  const internal::scoped_ptr<const ::std::string> value_param_;
+  const internal::TypeId fixture_class_id_;   // ID of the test fixture class
+  bool should_run_;                 // True iff this test should run
+  bool is_disabled_;                // True iff this test is disabled
+  bool matches_filter_;             // True if this test matches the
+                                    // user-specified filter.
+  internal::TestFactoryBase* const factory_;  // The factory that creates
+                                              // the test object
+
+  // Mutable state: has to be reset before the test is run a second
+  // time.
+  TestResult result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+};
+
+// A test case, which consists of a vector of TestInfos.
+//
+// TestCase is not copyable.
+class GTEST_API_ TestCase {
+ public:
+  // Creates a TestCase with the given name.
+  //
+  // TestCase does NOT have a default constructor.  Always use this
+  // constructor to create a TestCase object.
+  //
+  // Arguments:
+  //
+  //   name:         name of the test case
+  //   a_type_param: the name of the test's type parameter, or NULL if
+  //                 this is not a type-parameterized test.
+  //   set_up_tc:    pointer to the function that sets up the test case
+  //   tear_down_tc: pointer to the function that tears down the test case
+  TestCase(const char* name, const char* a_type_param,
+           Test::SetUpTestCaseFunc set_up_tc,
+           Test::TearDownTestCaseFunc tear_down_tc);
+
+  // Destructor of TestCase.
+  virtual ~TestCase();
+
+  // Gets the name of the TestCase.
+  const char* name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a
+  // type-parameterized test case.
+  const char* type_param() const {
+    if (type_param_.get() != NULL)
+      return type_param_->c_str();
+    return NULL;
+  }
+
+  // Returns true if any test in this test case should run.
+  bool should_run() const { return should_run_; }
+
+  // Gets the number of successful tests in this test case.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests in this test case.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests in this test case.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of tests in this test case that should run.
+  int test_to_run_count() const;
+
+  // Gets the number of all tests in this test case.
+  int total_test_count() const;
+
+  // Returns true iff the test case passed, i.e. it did not fail.
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the test case failed, i.e. at least one test failed.
+  bool Failed() const { return failed_test_count() > 0; }
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns the i-th test among all the tests. i can range from 0 to
+  // total_test_count() - 1. If i is not in that range, returns NULL.
+  const TestInfo* GetTestInfo(int i) const;
+
+  // Returns the TestResult that holds test properties recorded during
+  // execution of SetUpTestCase and TearDownTestCase.
+  const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; }
+
+ private:
+  friend class Test;
+  friend class internal::UnitTestImpl;
+
+  // Gets the (mutable) vector of TestInfos in this TestCase.
+  std::vector<TestInfo*>& test_info_list() { return test_info_list_; }
+
+  // Gets the (immutable) vector of TestInfos in this TestCase.
+  const std::vector<TestInfo*>& test_info_list() const {
+    return test_info_list_;
+  }
+
+  // Returns the i-th test among all the tests. i can range from 0 to
+  // total_test_count() - 1. If i is not in that range, returns NULL.
+  TestInfo* GetMutableTestInfo(int i);
+
+  // Sets the should_run member.
+  void set_should_run(bool should) { should_run_ = should; }
+
+  // Adds a TestInfo to this test case.  Will delete the TestInfo upon
+  // destruction of the TestCase object.
+  void AddTestInfo(TestInfo * test_info);
+
+  // Clears the results of all tests in this test case.
+  void ClearResult();
+
+  // Clears the results of all tests in the given test case.
+  static void ClearTestCaseResult(TestCase* test_case) {
+    test_case->ClearResult();
+  }
+
+  // Runs every test in this TestCase.
+  void Run();
+
+  // Runs SetUpTestCase() for this TestCase; a NULL set_up_tc_ is skipped.
+  // This wrapper is needed for catching exceptions from SetUpTestCase().
+  void RunSetUpTestCase() { if (set_up_tc_ != NULL) (*set_up_tc_)(); }
+
+  // Runs TearDownTestCase() for this TestCase; NULL tear_down_tc_ is skipped.
+  // This wrapper is needed for catching exceptions from TearDownTestCase().
+  void RunTearDownTestCase() { if (tear_down_tc_ != NULL) (*tear_down_tc_)(); }
+
+  // Returns true iff test passed.
+  static bool TestPassed(const TestInfo* test_info) {
+    return test_info->should_run() && test_info->result()->Passed();
+  }
+
+  // Returns true iff test failed.
+  static bool TestFailed(const TestInfo* test_info) {
+    return test_info->should_run() && test_info->result()->Failed();
+  }
+
+  // Returns true iff the test is disabled and will be reported in the XML
+  // report.
+  static bool TestReportableDisabled(const TestInfo* test_info) {
+    return test_info->is_reportable() && test_info->is_disabled_;
+  }
+
+  // Returns true iff test is disabled.
+  static bool TestDisabled(const TestInfo* test_info) {
+    return test_info->is_disabled_;
+  }
+
+  // Returns true iff this test will appear in the XML report.
+  static bool TestReportable(const TestInfo* test_info) {
+    return test_info->is_reportable();
+  }
+
+  // Returns true if the given test should run.
+  static bool ShouldRunTest(const TestInfo* test_info) {
+    return test_info->should_run();
+  }
+
+  // Shuffles the tests in this test case.
+  void ShuffleTests(internal::Random* random);
+
+  // Restores the test order to before the first shuffle.
+  void UnshuffleTests();
+
+  // Name of the test case.
+  std::string name_;
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const internal::scoped_ptr<const ::std::string> type_param_;
+  // The vector of TestInfos in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestInfo*> test_info_list_;
+  // Provides a level of indirection for the test list to allow easy
+  // shuffling and restoring the test order.  The i-th element in this
+  // vector is the index of the i-th test in the shuffled test list.
+  std::vector<int> test_indices_;
+  // Pointer to the function that sets up the test case.
+  Test::SetUpTestCaseFunc set_up_tc_;
+  // Pointer to the function that tears down the test case.
+  Test::TearDownTestCaseFunc tear_down_tc_;
+  // True iff any test in this test case should run.
+  bool should_run_;
+  // Elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+  // Holds test properties recorded during execution of SetUpTestCase and
+  // TearDownTestCase.
+  TestResult ad_hoc_test_result_;
+
+  // We disallow copying TestCases.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase);
+};
+
+// An Environment object is capable of setting up and tearing down an
+// environment.  Users should subclass this to define their own
+// environment(s).
+//
+// An Environment object does the set-up and tear-down in virtual
+// methods SetUp() and TearDown() instead of the constructor and the
+// destructor, as:
+//
+//   1. You cannot safely throw from a destructor.  This is a problem
+//      as in some cases Google Test is used where exceptions are enabled, and
+//      we may want to implement ASSERT_* using exceptions where they are
+//      available.
+//   2. You cannot use ASSERT_* directly in a constructor or
+//      destructor.
+class Environment {
+ public:
+  // Virtual destructor: Environment is designed to be subclassed.
+  virtual ~Environment() {}
+
+  // Override to define how to set up the environment; no-op by default.
+  virtual void SetUp() {}
+
+  // Override to define how to tear down the environment; no-op by default.
+  virtual void TearDown() {}
+ private:
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+};
+
+// Interface for observing test execution.  Every callback is pure
+// virtual; they are declared in the order in which the events fire.
+class TestEventListener {
+ public:
+  virtual ~TestEventListener() {}
+
+  // Fired before any test activity starts.
+  virtual void OnTestProgramStart(const UnitTest& unit_test) = 0;
+
+  // Fired before each iteration of tests starts.  There may be more than
+  // one iteration if GTEST_FLAG(repeat) is set.  The iteration argument is
+  // the index of the iteration, starting from 0.
+  virtual void OnTestIterationStart(const UnitTest& unit_test,
+                                    int iteration) = 0;
+
+  // Fired before environment set-up for each iteration of tests starts.
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0;
+
+  // Fired after environment set-up for each iteration of tests ends.
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0;
+
+  // Fired before the test case starts.
+  virtual void OnTestCaseStart(const TestCase& test_case) = 0;
+
+  // Fired before the test starts.
+  virtual void OnTestStart(const TestInfo& test_info) = 0;
+
+  // Fired after a failed assertion or a SUCCEED() invocation.
+  virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0;
+
+  // Fired after the test ends.
+  virtual void OnTestEnd(const TestInfo& test_info) = 0;
+
+  // Fired after the test case ends.
+  virtual void OnTestCaseEnd(const TestCase& test_case) = 0;
+
+  // Fired before environment tear-down for each iteration of tests starts.
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0;
+
+  // Fired after environment tear-down for each iteration of tests ends.
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
+
+  // Fired after each iteration of tests finishes.  The iteration argument
+  virtual void OnTestIterationEnd(const UnitTest& unit_test,
+                                  int iteration) = 0;
+
+  // Fired after all test activities have ended.
+  virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
+};
+
+// A convenience base class for listeners that only care about one or two
+// events: every callback is implemented as a no-op.  The trade-off is that
+// if a callback's signature ever changes, a stale override in user code
+// will not be caught during the build.  For comments about each method
+// please see the definition of TestEventListener above.
+class EmptyTestEventListener : public TestEventListener {
+ public:
+  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationStart(const UnitTest& /*unit_test*/,
+                                    int /*iteration*/) {}
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestCaseStart(const TestCase& /*test_case*/) {}
+  virtual void OnTestStart(const TestInfo& /*test_info*/) {}
+  virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {}
+  virtual void OnTestEnd(const TestInfo& /*test_info*/) {}
+  virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {}
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/,
+                                  int /*iteration*/) {}
+  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+};
+
+// TestEventListeners lets users add listeners to track events in Google Test.
+class GTEST_API_ TestEventListeners {
+ public:
+  TestEventListeners();
+  ~TestEventListeners();
+
+  // Appends an event listener to the end of the list.  Google Test takes
+  // ownership of the listener (i.e. it will delete the listener when the
+  // test program finishes).
+  void Append(TestEventListener* listener);
+
+  // Removes the given event listener from the list and returns it.  From
+  // then on the caller is responsible for deleting the listener.  Returns
+  // NULL if the listener is not found in the list.
+  TestEventListener* Release(TestEventListener* listener);
+
+  // Returns the standard listener responsible for the default console
+  // output.  Can be removed from the listeners list to shut down default
+  // console output.  Note that removing this object from the listener list
+  // with Release transfers its ownership to the caller and makes this
+  // function return NULL the next time.
+  TestEventListener* default_result_printer() const {
+    return default_result_printer_;
+  }
+
+  // Returns the standard listener responsible for the default XML output
+  // controlled by the --gtest_output=xml flag.  Can be removed from the
+  // listeners list by users who want to shut down the default XML output
+  // controlled by this flag and substitute it with a custom one.  Note that
+  // removing this object from the listener list with Release transfers its
+  // ownership to the caller and makes this function return NULL the next
+  // time.
+  TestEventListener* default_xml_generator() const {
+    return default_xml_generator_;
+  }
+
+ private:
+  friend class TestCase;
+  friend class TestInfo;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::NoExecDeathTest;
+  friend class internal::TestEventListenersAccessor;
+  friend class internal::UnitTestImpl;
+
+  // Returns the repeater that broadcasts the TestEventListener events to
+  // all subscribers.
+  TestEventListener* repeater();
+
+  // Sets the default_result_printer attribute to the provided listener.
+  // The listener is also added to the listener list and the previous
+  // default_result_printer is removed from it and deleted. The listener can
+  // also be NULL in which case it will not be added to the list. Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultResultPrinter(TestEventListener* listener);
+
+  // Sets the default_xml_generator attribute to the provided listener.  The
+  // listener is also added to the listener list and the previous
+  // default_xml_generator is removed from it and deleted. The listener can
+  // also be NULL in which case it will not be added to the list. Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultXmlGenerator(TestEventListener* listener);
+
+  // Controls whether events will be forwarded by the repeater to the
+  // listeners in the list.
+  bool EventForwardingEnabled() const;
+  void SuppressEventForwarding();
+
+  // The actual list of listeners.
+  internal::TestEventRepeater* repeater_;
+  // Listener responsible for the standard result output.
+  TestEventListener* default_result_printer_;
+  // Listener responsible for the creation of the XML output file.
+  TestEventListener* default_xml_generator_;
+
+  // We disallow copying TestEventListeners.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
+};
+
+// A UnitTest consists of a vector of TestCases.
+//
+// This is a singleton class.  The only instance of UnitTest is
+// created when UnitTest::GetInstance() is first called.  This
+// instance is never deleted.
+//
+// UnitTest is not copyable.
+//
+// This class is thread-safe as long as the methods are called
+// according to their specification.
+class GTEST_API_ UnitTest {
+ public:
+  // Gets the singleton UnitTest object.  The first time this method
+  // is called, a UnitTest object is constructed and returned.
+  // Subsequent calls will return the same object.
+  static UnitTest* GetInstance();
+
+  // Runs all tests in this UnitTest object and prints the result.
+  // Returns 0 if successful, or 1 otherwise.
+  //
+  // This method can only be called from the main thread.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  int Run() GTEST_MUST_USE_RESULT_;
+
+  // Returns the working directory when the first TEST() or TEST_F()
+  // was executed.  The UnitTest object owns the string.
+  const char* original_working_dir() const;
+
+  // Returns the TestCase object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestCase* current_test_case() const
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Returns the TestInfo object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestInfo* current_test_info() const
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Returns the random seed used at the start of the current test run.
+  int random_seed() const;
+
+#if GTEST_HAS_PARAM_TEST
+  // Returns the ParameterizedTestCaseRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  internal::ParameterizedTestCaseRegistry& parameterized_test_registry()
+      GTEST_LOCK_EXCLUDED_(mutex_);
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Gets the number of successful test cases.
+  int successful_test_case_count() const;
+
+  // Gets the number of failed test cases.
+  int failed_test_case_count() const;
+
+  // Gets the number of all test cases.
+  int total_test_case_count() const;
+
+  // Gets the number of all test cases that contain at least one test
+  // that should run.
+  int test_case_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const;
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const;
+
+  // Returns true iff the unit test passed (i.e. all test cases passed).
+  bool Passed() const;
+
+  // Returns true iff the unit test failed (i.e. some test case failed
+  // or something outside of all tests failed).
+  bool Failed() const;
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  const TestCase* GetTestCase(int i) const;
+
+  // Returns the TestResult containing information on test failures and
+  // properties logged outside of individual test cases.
+  const TestResult& ad_hoc_test_result() const;
+
+  // Returns the list of event listeners that can be used to track events
+  // inside Google Test.
+  TestEventListeners& listeners();
+
+ private:
+  // Registers and returns a global test environment.  When a test
+  // program is run, all global test environments will be set-up in
+  // the order they were registered.  After all tests in the program
+  // have finished, all global test environments will be torn-down in
+  // the *reverse* order they were registered.
+  //
+  // The UnitTest object takes ownership of the given environment.
+  //
+  // This method can only be called from the main thread.
+  Environment* AddEnvironment(Environment* env);
+
+  // Adds a TestPartResult to the current TestResult object.  All
+  // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc)
+  // eventually call this to report their results.  The user code
+  // should use the assertion macros instead of calling this directly.
+  void AddTestPartResult(TestPartResult::Type result_type,
+                         const char* file_name,
+                         int line_number,
+                         const std::string& message,
+                         const std::string& os_stack_trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Adds a TestProperty to the current TestResult object when invoked from
+  // inside a test, to current TestCase's ad_hoc_test_result_ when invoked
+  // from SetUpTestCase or TearDownTestCase, or to the global property set
+  // when invoked elsewhere.  If the result already contains a property with
+  // the same key, the value will be updated.
+  void RecordProperty(const std::string& key, const std::string& value);
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  TestCase* GetMutableTestCase(int i);
+
+  // Accessors for the implementation object.
+  internal::UnitTestImpl* impl() { return impl_; }
+  const internal::UnitTestImpl* impl() const { return impl_; }
+
+  // These classes and functions are friends as they need to access private
+  // members of UnitTest.
+  friend class Test;
+  friend class internal::AssertHelper;
+  friend class internal::ScopedTrace;
+  friend class internal::StreamingListenerTest;
+  friend class internal::UnitTestRecordPropertyTestHelper;
+  friend Environment* AddGlobalTestEnvironment(Environment* env);
+  friend internal::UnitTestImpl* internal::GetUnitTestImpl();
+  friend void internal::ReportFailureInUnknownLocation(
+      TestPartResult::Type result_type,
+      const std::string& message);
+
+  // Creates an empty UnitTest.  Private: use GetInstance() instead.
+  UnitTest();
+
+  // Destructor.
+  virtual ~UnitTest();
+
+  // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+  // Google Test trace stack.
+  void PushGTestTrace(const internal::TraceInfo& trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Pops a trace from the per-thread Google Test trace stack.
+  void PopGTestTrace()
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Protects mutable state in *impl_.  This is mutable as some const
+  // methods need to lock it too.
+  mutable internal::Mutex mutex_;
+
+  // Opaque implementation object.  This field is never changed once
+  // the object is constructed.  We don't mark it as const here, as
+  // doing so will cause a warning in the constructor of UnitTest.
+  // Mutable state in *impl_ is protected by mutex_.
+  internal::UnitTestImpl* impl_;
+
+  // We disallow copying UnitTest.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
+};
+
+// Convenience wrapper that registers an environment for the test program.
+//
+// Call this before RUN_ALL_TESTS(), most likely from main().  If you
+// use gtest_main, it has to be called before main() starts in order
+// to take effect.  For example, you can define a global variable like
+// this:
+//
+//   testing::Environment* const foo_env =
+//       testing::AddGlobalTestEnvironment(new FooEnvironment);
+//
+// Writing your own main() and calling AddGlobalTestEnvironment() from
+// there is strongly recommended instead: relying on the initialization
+// of global variables makes the code harder to read, and it may cause
+// problems when environments registered from different translation
+// units depend on each other (remember that the compiler doesn't
+// guarantee the order in which global variables from different
+// translation units are initialized).
+inline Environment* AddGlobalTestEnvironment(Environment* env) {
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  return unit_test->AddEnvironment(env);
+}
+
+// Initializes Google Test.  Call this before RUN_ALL_TESTS().  Among
+// other things it parses the command line for the flags that Google
+// Test recognizes; each recognized flag is removed from argv and *argc
+// is decremented accordingly.
+//
+// Nothing is returned: the outcome is stored in the Google Test flag
+// variables.
+//
+// Calling the function for the second time has no user-visible effect.
+GTEST_API_ void InitGoogleTest(int* argc, char** argv);
+
+// Wide-character overload, for Windows programs compiled in UNICODE
+// mode.
+GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv);
+
+namespace internal {
+
+// FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
+// value of type ToPrint that is an operand of a comparison assertion
+// (e.g. ASSERT_EQ).  OtherOperand is the type of the other operand in
+// the comparison, and is used to help determine the best way to
+// format the value.  In particular, when the value is a C string
+// (char pointer) and the other operand is an STL string object, we
+// want to format the C string as a string, since we know it is
+// compared by value with the string object.  If the value is a char
+// pointer but the other operand is not an STL string object, we don't
+// know whether the pointer is supposed to point to a NUL-terminated
+// string, and thus want to print it as a pointer to be safe.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// The default case: defers to ::testing::PrintToString().
+template <typename ToPrint, typename OtherOperand>
+class FormatForComparison {
+ public:
+  static ::std::string Format(const ToPrint& value) {
+    return ::testing::PrintToString(value);
+  }
+};
+
+// Array: formatted through the const ToPrint* specialization.
+template <typename ToPrint, size_t N, typename OtherOperand>
+class FormatForComparison<ToPrint[N], OtherOperand> {
+ public:
+  static ::std::string Format(const ToPrint* value) {
+    return FormatForComparison<const ToPrint*, OtherOperand>::Format(value);
+  }
+};
+
+// By default, C strings are printed as pointers to be safe, since we
+// cannot know whether they really point to a NUL-terminated string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                \
+  template <typename OtherOperand>                                      \
+  class FormatForComparison<CharType*, OtherOperand> {                  \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(static_cast<const void*>(value)); \
+    }                                                                   \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
+
+// When a C string is compared with an STL string object, it must be meant
+// as a NUL-terminated string, so it can safely be printed as a string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
+  template <>                                                           \
+  class FormatForComparison<CharType*, OtherStringType> {               \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(value);                           \
+    }                                                                   \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
+
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string);
+#endif
+
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring);
+#endif
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
+#endif
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_
+
+// Renders one operand of a comparison assertion (ASSERT_EQ, EXPECT_LT,
+// etc.) for use in a failure message.  Only the type of the other
+// operand, never its value, influences the output: for example a char*
+// is printed as a raw pointer when compared against another char* or a
+// void*, but as a C string when compared against an std::string.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename T1, typename T2>
+std::string FormatForComparisonFailureMessage(
+    const T1& value, const T2& /* other_operand */) {
+  typedef FormatForComparison<T1, T2> Formatter;
+  return Formatter::Format(value);
+}
+
+// The helper function for {ASSERT|EXPECT}_EQ.  Yields success when the
+// operands compare equal, and an EqFailure() describing both otherwise.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQ(const char* expected_expression,
+                            const char* actual_expression,
+                            const T1& expected,
+                            const T2& actual) {
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4389)  // Silences signed/unsigned mismatch
+                                // for the comparison below only.
+#endif
+  if (expected == actual) {
+    return AssertionSuccess();
+  }
+#ifdef _MSC_VER
+# pragma warning(pop)           // Restores the warning state.
+#endif
+
+  const std::string expected_text =
+      FormatForComparisonFailureMessage(expected, actual);
+  const std::string actual_text =
+      FormatForComparisonFailureMessage(actual, expected);
+  return EqFailure(expected_expression, actual_expression,
+                   expected_text, actual_text, false);
+}
+
+// Non-template overload, only declared here.  It allows anonymous enums
+// to be used in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as
+// anonymous enums can be implicitly converted to BiggestInt.
+GTEST_API_ AssertionResult CmpHelperEQ(const char* expected_expression,
+                                       const char* actual_expression,
+                                       BiggestInt expected,
+                                       BiggestInt actual);
+
+// The helper class for {ASSERT|EXPECT}_EQ.  The template argument
+// lhs_is_null_literal is true iff the first argument to ASSERT_EQ()
+// is a null pointer literal.  The following default implementation is
+// for lhs_is_null_literal being false.
+template <bool lhs_is_null_literal>
+class EqHelper {
+ public:
+  // This templatized version is for the general case; forwards to CmpHelperEQ.
+  template <typename T1, typename T2>
+  static AssertionResult Compare(const char* expected_expression,
+                                 const char* actual_expression,
+                                 const T1& expected,
+                                 const T2& actual) {
+    return CmpHelperEQ(expected_expression, actual_expression, expected,
+                       actual);
+  }
+
+  // With this overloaded version, we allow anonymous enums to be used
+  // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous
+  // enums can be implicitly converted to BiggestInt.
+  //
+  // Even though its body looks the same as the above version, we
+  // cannot merge the two, as that would stop anonymous enums from working.
+  static AssertionResult Compare(const char* expected_expression,
+                                 const char* actual_expression,
+                                 BiggestInt expected,
+                                 BiggestInt actual) {
+    return CmpHelperEQ(expected_expression, actual_expression, expected,
+                       actual);
+  }
+};
+
+// This specialization is used when the first argument to ASSERT_EQ()
+// is a null pointer literal, like NULL, false, or 0.
+template <>
+class EqHelper<true> {
+ public:
+  // We define two overloaded versions of Compare().  The first
+  // version will be picked when the second argument to ASSERT_EQ() is
+  // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or
+  // EXPECT_EQ(false, a_bool).
+  template <typename T1, typename T2>
+  static AssertionResult Compare(
+      const char* expected_expression,
+      const char* actual_expression,
+      const T1& expected,
+      const T2& actual,
+      // The following line prevents this overload from being considered if T2
+      // is a pointer type.  We need this because ASSERT_EQ(NULL, my_ptr)
+      // expands to Compare("", "", NULL, my_ptr), which requires a conversion
+      // to match the Secret* in the other overload, which would otherwise make
+      // this template match better.
+      typename EnableIf<!is_pointer<T2>::value>::type* = 0) {
+    return CmpHelperEQ(expected_expression, actual_expression, expected,
+                       actual);
+  }
+
+  // This version will be picked when the second argument to ASSERT_EQ() is a
+  // pointer, e.g. ASSERT_EQ(NULL, a_pointer).
+  template <typename T>
+  static AssertionResult Compare(
+      const char* expected_expression,
+      const char* actual_expression,
+      // We used to have a second template parameter instead of Secret*.  That
+      // template parameter would deduce to 'long', making this a better match
+      // than the first overload even without the first overload's EnableIf.
+      // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to
+      // non-pointer argument" (even a deduced integral argument), so the old
+      // implementation caused warnings in user code.
+      Secret* /* expected (NULL) */,
+      T* actual) {
+    // 'expected' is known to be a null pointer, so compare with a typed null.
+    return CmpHelperEQ(expected_expression, actual_expression,
+                       static_cast<T*>(NULL), actual);
+  }
+};
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_??.  It is here just to avoid copy-and-paste
+// of similar code.  The expansion ends without a semicolon, so each
+// use of the macro supplies one.
+// For each templatized helper function, we also declare an overloaded
+// version for BiggestInt in order to reduce code bloat and allow
+// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
+// with gcc 4.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+template <typename T1, typename T2>\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                   const T1& val1, const T2& val2) {\
+  if (val1 op val2) {\
+    return AssertionSuccess();\
+  } else {\
+    return AssertionFailure() \
+        << "Expected: (" << expr1 << ") " #op " (" << expr2\
+        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
+        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
+  }\
+}\
+GTEST_API_ AssertionResult CmpHelper##op_name(\
+    const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2)
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// Implements CmpHelperNE, the helper function for {ASSERT|EXPECT}_NE.
+GTEST_IMPL_CMP_HELPER_(NE, !=);
+// Implements CmpHelperLE, the helper function for {ASSERT|EXPECT}_LE.
+GTEST_IMPL_CMP_HELPER_(LE, <=);
+// Implements CmpHelperLT, the helper function for {ASSERT|EXPECT}_LT.
+GTEST_IMPL_CMP_HELPER_(LT, <);
+// Implements CmpHelperGE, the helper function for {ASSERT|EXPECT}_GE.
+GTEST_IMPL_CMP_HELPER_(GE, >=);
+// Implements CmpHelperGT, the helper function for {ASSERT|EXPECT}_GT.
+GTEST_IMPL_CMP_HELPER_(GT, >);
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ on const char* strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                                          const char* actual_expression,
+                                          const char* expected,
+                                          const char* actual);
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ on const char* strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression,
+                                              const char* actual_expression,
+                                              const char* expected,
+                                              const char* actual);
+
+// The helper function for {ASSERT|EXPECT}_STRNE on const char* strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const char* s1,
+                                          const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE on const char* strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                              const char* s2_expression,
+                                              const char* s1,
+                                              const char* s2);
+
+
+// Helper function for {ASSERT|EXPECT}_STREQ on wide (const wchar_t*) strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                                          const char* actual_expression,
+                                          const wchar_t* expected,
+                                          const wchar_t* actual);
+
+// Helper function for {ASSERT|EXPECT}_STRNE on wide (const wchar_t*) strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const wchar_t* s1,
+                                          const wchar_t* s2);
+
+}  // namespace internal
+
+// IsSubstring() and IsNotSubstring() are intended to be used as the
+// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by
+// themselves.  They check whether needle is a substring of haystack
+// (NULL is considered a substring of itself only), and return an
+// appropriate error message when they fail.  Overloads are declared for
+// char*, wchar_t*, ::std::string and (optionally) ::std::wstring.
+// The {needle,haystack}_expr arguments are the stringified
+// expressions that generated the two real arguments.
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+// Helper template function for comparing floating-points.  Succeeds iff
+// lhs.AlmostEquals(rhs) on the FloatingPoint<RawType> wrappers; otherwise
+// reports both values streamed with a precision of digits10 + 2.
+//
+// Template parameter RawType: the raw floating-point type (float or double).
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename RawType>
+AssertionResult CmpHelperFloatingPointEQ(const char* expected_expression,
+                                         const char* actual_expression,
+                                         RawType expected,
+                                         RawType actual) {
+  const FloatingPoint<RawType> lhs(expected), rhs(actual);
+
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  ::std::stringstream expected_ss;
+  expected_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+              << expected;
+
+  ::std::stringstream actual_ss;
+  actual_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+            << actual;
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   StringStreamToString(&expected_ss),
+                   StringStreamToString(&actual_ss),
+                   false);
+}
+
+// Helper function for implementing ASSERT_NEAR.  Only declared here.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
+                                                const char* expr2,
+                                                const char* abs_error_expr,
+                                                double val1,
+                                                double val2,
+                                                double abs_error);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+// A class that enables one to stream messages to assertion macros.
+class GTEST_API_ AssertHelper {
+ public:
+  // Constructor; takes the result type, file, line and message text.
+  AssertHelper(TestPartResult::Type type,
+               const char* file,
+               int line,
+               const char* message);
+  ~AssertHelper();
+
+  // Message assignment is a semantic trick to enable assertion
+  // streaming; see the GTEST_MESSAGE_ macro below.
+  void operator=(const Message& message) const;
+
+ private:
+  // We put our data in a struct so that the size of the AssertHelper class can
+  // be as small as possible.  This is important because gcc is incapable of
+  // re-using stack space even for temporary variables, so every EXPECT_EQ
+  // reserves stack space for another AssertHelper.
+  struct AssertHelperData {
+    AssertHelperData(TestPartResult::Type t,
+                     const char* srcfile,
+                     int line_num,
+                     const char* msg)
+        : type(t), file(srcfile), line(line_num), message(msg) { }
+
+    TestPartResult::Type const type;
+    const char* const file;  // The pointer is stored as-is, not copied.
+    int const line;
+    std::string const message;  // Holds its own copy of msg.
+
+   private:
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+  };
+
+  AssertHelperData* const data_;  // The only data member declared here.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+};
+
+}  // namespace internal
+
+#if GTEST_HAS_PARAM_TEST
+// The pure interface class that all value-parameterized tests inherit from.
+// A value-parameterized class must inherit from both ::testing::Test and
+// ::testing::WithParamInterface. In most cases that just means inheriting
+// from ::testing::TestWithParam, but more complicated test hierarchies
+// may need to inherit from Test and WithParamInterface at different levels.
+//
+// This interface has support for accessing the test parameter value via
+// the GetParam() method.
+//
+// Use it with one of the parameter generator defining functions, like Range(),
+// Values(), ValuesIn(), Bool(), and Combine().
+//
+// class FooTest : public ::testing::TestWithParam<int> {
+//  protected:
+//   FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   virtual ~FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   virtual void SetUp() {
+//     // Can use GetParam() here.
+//   }
+//   virtual void TearDown() {
+//     // Can use GetParam() here.
+//   }
+// };
+// TEST_P(FooTest, DoesBar) {
+//   // Can use GetParam() method here.
+//   Foo foo;
+//   ASSERT_TRUE(foo.DoesBar(GetParam()));
+// }
+// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
+
+template <typename T>
+class WithParamInterface {
+ public:
+  typedef T ParamType;
+  virtual ~WithParamInterface() {}
+
+  // The current parameter value. It is also available in the test fixture's
+  // constructor. This member function is non-static, even though it only
+  // references static data, to reduce the opportunity for incorrect uses
+  // like writing 'WithParamInterface<bool>::GetParam()' for a test that
+  // uses a fixture whose parameter type is int.
+  const ParamType& GetParam() const {
+    GTEST_CHECK_(parameter_ != NULL)
+        << "GetParam() can only be called inside a value-parameterized test "
+        << "-- did you intend to write TEST_P instead of TEST_F?";
+    return *parameter_;
+  }
+
+ private:
+  // Sets parameter value. The caller is responsible for making sure the value
+  // remains alive and unchanged throughout the current test.
+  static void SetParam(const ParamType* parameter) {
+    parameter_ = parameter;
+  }
+
+  // Static value used for accessing parameter during a test lifetime.
+  static const ParamType* parameter_;
+
+  // TestClass must be a subclass of WithParamInterface<T> and Test.
+  template <class TestClass> friend class internal::ParameterizedTestFactory;
+};
+
+template <typename T>
+const T* WithParamInterface<T>::parameter_ = NULL;  // NULL until SetParam().
+
+// Most value-parameterized classes can ignore WithParamInterface and just
+// inherit from ::testing::TestWithParam, which derives from both bases.
+
+template <typename T>
+class TestWithParam : public Test, public WithParamInterface<T> {
+};
+
+#endif  // GTEST_HAS_PARAM_TEST
+
+// Macros for indicating success/failure in test code.
+
+// ADD_FAILURE unconditionally adds a failure to the current test.
+// SUCCEED generates a success - it doesn't automatically make the
+// current test successful, as a test is only successful when it has
+// no failure.
+//
+// EXPECT_* verifies that a certain condition is satisfied.  If not,
+// it behaves like ADD_FAILURE.  In particular:
+//
+//   EXPECT_TRUE  verifies that a Boolean condition is true.
+//   EXPECT_FALSE verifies that a Boolean condition is false.
+//
+// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except
+// that they will also abort the current function on failure.  People
+// usually want the fail-fast behavior of FAIL and ASSERT_*, but those
+// writing data-driven tests often find themselves using ADD_FAILURE
+// and EXPECT_* more.
+
+// Generates a nonfatal failure with the generic message "Failed".
+#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed")
+
+// Generates a nonfatal failure at the given source file location with
+// the generic message "Failed".
+#define ADD_FAILURE_AT(file, line) \
+  GTEST_MESSAGE_AT_(file, line, "Failed", \
+                    ::testing::TestPartResult::kNonFatalFailure)
+
+// Generates a fatal failure with the generic message "Failed".
+#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed")
+
+// Define GTEST_DONT_DEFINE_FAIL to 1 to omit the definition of FAIL(),
+// which is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_FAIL
+# define FAIL() GTEST_FAIL()
+#endif  // !GTEST_DONT_DEFINE_FAIL
+
+// Generates a success with the generic message "Succeeded".
+#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded")
+
+// Define GTEST_DONT_DEFINE_SUCCEED to 1 to omit the definition of SUCCEED(),
+// which is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_SUCCEED
+# define SUCCEED() GTEST_SUCCEED()
+#endif  // !GTEST_DONT_DEFINE_SUCCEED
+
+// Macros for testing exceptions.
+// EXPECT_* pass GTEST_NONFATAL_FAILURE_, ASSERT_* pass GTEST_FATAL_FAILURE_.
+//    * {ASSERT|EXPECT}_THROW(statement, expected_exception):
+//         Tests that the statement throws the expected exception.
+//    * {ASSERT|EXPECT}_NO_THROW(statement):
+//         Tests that the statement doesn't throw any exception.
+//    * {ASSERT|EXPECT}_ANY_THROW(statement):
+//         Tests that the statement throws an exception of any type.
+
+#define EXPECT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_)
+#define ASSERT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_)
+#define ASSERT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_)
+
+// Boolean assertions. Condition can be either a Boolean expression or an
+// AssertionResult (see the comments on that class).  The *_FALSE forms
+// test !(condition) but still pass the original expression text.
+#define EXPECT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_NONFATAL_FAILURE_)
+#define EXPECT_FALSE(condition) \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_NONFATAL_FAILURE_)
+#define ASSERT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_FATAL_FAILURE_)
+#define ASSERT_FALSE(condition) \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_FATAL_FAILURE_)
+
+// Includes the auto-generated header that implements a family of
+// generic predicate assertion macros.
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command
+// 'gen_gtest_pred_impl.py 5'.  DO NOT EDIT BY HAND!
+//
+// Implements a family of generic predicate assertion macros.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+// Makes sure this header is not included before gtest.h.
+#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
+# error Do not include gtest_pred_impl.h directly.  Include gtest.h instead.
+#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
+
+// This header implements a family of generic predicate assertion
+// macros:
+//
+//   ASSERT_PRED_FORMAT1(pred_format, v1)
+//   ASSERT_PRED_FORMAT2(pred_format, v1, v2)
+//   ...
+//
+// where pred_format is a function or functor that takes n (in the
+// case of ASSERT_PRED_FORMATn) values and their source expression
+// text, and returns a testing::AssertionResult.  See the definition
+// of ASSERT_EQ in gtest.h for an example.
+//
+// If you don't care about formatting, you can use the more
+// restrictive version:
+//
+//   ASSERT_PRED1(pred, v1)
+//   ASSERT_PRED2(pred, v1, v2)
+//   ...
+//
+// where pred is an n-ary function or functor that returns bool,
+// and the values v1, v2, ..., must support the << operator for
+// streaming to std::ostream.
+//
+// We also define the EXPECT_* variations.
+//
+// For now we only support predicates whose arity is at most 5.
+// Please email googletestframework@googlegroups.com if you need
+// support for higher arities.
+
+// GTEST_ASSERT_ is the basic statement to which all of the assertions
+// in this file reduce: on_failure is handed failure_message() only when
+// expression yields a false AssertionResult.  Don't use this in your code.
+#define GTEST_ASSERT_(expression, on_failure) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (const ::testing::AssertionResult gtest_ar = (expression)) \
+    ; \
+  else \
+    on_failure(gtest_ar.failure_message())
+
+
+// Helper for {EXPECT|ASSERT}_PRED1: succeeds iff pred(v1) is true, else
+// reports the expression and its value.  Don't use this in your code.
+template <typename Pred,
+          typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text,
+                                  const char* e1,
+                                  Pred pred,
+                                  const T1& v1) {
+  if (pred(v1)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1: passes the
+// stringified argument, then its value.  Don't use this in your code.
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, v1), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED1 on top of
+// AssertPred1Helper.  Don't use this in your code.
+#define GTEST_PRED1_(pred, v1, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
+                                             #v1, \
+                                             pred, \
+                                             v1), on_failure)
+
+// Unary predicate assertion macros (EXPECT_* nonfatal, ASSERT_* fatal).
+#define EXPECT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper for {EXPECT|ASSERT}_PRED2: succeeds iff pred(v1, v2) is true, else
+// reports each expression and its value.  Don't use this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2) {
+  if (pred(v1, v2)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2: passes the
+// stringified arguments, then their values.  Don't use this in your code.
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED2 on top of
+// AssertPred2Helper.  Don't use this in your code.
+#define GTEST_PRED2_(pred, v1, v2, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             pred, \
+                                             v1, \
+                                             v2), on_failure)
+
+// Binary predicate assertion macros (EXPECT_* nonfatal, ASSERT_* fatal).
+#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper for {EXPECT|ASSERT}_PRED3: succeeds iff pred(v1, v2, v3) is true,
+// else reports each expression and its value.  Don't use this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3) {
+  if (pred(v1, v2, v3)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3: passes the
+// stringified arguments, then their values.  Don't use this in your code.
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED3 on top of
+// AssertPred3Helper.  Don't use this in your code.
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3), on_failure)
+
+// Ternary predicate assertion macros (EXPECT_* nonfatal, ASSERT_* fatal).
+#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper for {EXPECT|ASSERT}_PRED4: succeeds iff pred(v1, ..., v4) is true,
+// else reports each expression and its value.  Don't use this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3,
+          typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4) {
+  if (pred(v1, v2, v3, v4)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ", "
+                            << e4 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3
+                            << "\n" << e4 << " evaluates to " << v4;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4: passes the
+// stringified arguments, then their values.  Don't use this in your code.
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED4 on top of
+// AssertPred4Helper.  Don't use this in your code.
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4), on_failure)
+
+// 4-ary predicate assertion macros (EXPECT_* nonfatal, ASSERT_* fatal).
+#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper for {EXPECT|ASSERT}_PRED5: succeeds iff pred(v1, ..., v5) holds,
+// else reports every expression and its value.  Don't use this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5>
+AssertionResult AssertPred5Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  const char* e5,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4,
+                                  const T5& v5) {
+  if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ", "
+                            << e4 << ", "
+                            << e5 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3
+                            << "\n" << e4 << " evaluates to " << v4
+                            << "\n" << e5 << " evaluates to " << v5;
+}
+
+// Internal macro behind {EXPECT|ASSERT}_PRED_FORMAT5: hands the stringized
+// and the actual v1..v5 to pred_format.  Don't use this in your code.
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED5 on top of
+// ::testing::AssertPred5Helper.  Don't use this in your code.
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             #v5, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4, \
+                                             v5), on_failure)
+
+// 5-ary predicate assertion macros (EXPECT_* non-fatal, ASSERT_* fatal).
+#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+
+
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+// Macros for testing equalities and inequalities.
+//
+//    * {ASSERT|EXPECT}_EQ(expected, actual): Tests that expected == actual
+//    * {ASSERT|EXPECT}_NE(v1, v2):           Tests that v1 != v2
+//    * {ASSERT|EXPECT}_LT(v1, v2):           Tests that v1 < v2
+//    * {ASSERT|EXPECT}_LE(v1, v2):           Tests that v1 <= v2
+//    * {ASSERT|EXPECT}_GT(v1, v2):           Tests that v1 > v2
+//    * {ASSERT|EXPECT}_GE(v1, v2):           Tests that v1 >= v2
+//
+// When they are not, Google Test prints both the tested expressions and
+// their actual values.  The values must be compatible built-in types,
+// or you will get a compiler error.  By "compatible" we mean that the
+// values can be compared by the respective operator.
+//
+// Note:
+//
+//   1. It is possible to make a user-defined type work with
+//   {ASSERT|EXPECT}_??(), but that requires overloading the
+//   comparison operators and is thus discouraged by the Google C++
+//   Usage Guide.  Therefore, you are advised to use the
+//   {ASSERT|EXPECT}_TRUE() macro to assert that two objects are
+//   equal.
+//
+//   2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on
+//   pointers (in particular, C strings).  Therefore, if you use it
+//   with two C strings, you are testing how their locations in memory
+//   are related, not how their content is related.  To compare two C
+//   strings by content, use {ASSERT|EXPECT}_STR*().
+//
+//   3. {ASSERT|EXPECT}_EQ(expected, actual) is preferred to
+//   {ASSERT|EXPECT}_TRUE(expected == actual), as the former tells you
+//   what the actual value is when it fails, and similarly for the
+//   other comparisons.
+//
+//   4. Do not depend on the order in which {ASSERT|EXPECT}_??()
+//   evaluate their arguments, which is undefined.
+//
+//   5. These macros evaluate their arguments exactly once.
+//
+// Examples:
+//
+//   EXPECT_NE(5, Foo());
+//   EXPECT_EQ(NULL, a_pointer);
+//   ASSERT_LT(i, array_size);
+//   ASSERT_GT(records.size(), 0) << "There is no record left.";
+
+#define EXPECT_EQ(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal:: \
+                      EqHelper<GTEST_IS_NULL_LITERAL_(expected)>::Compare, \
+                      expected, actual)
+#define EXPECT_NE(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, expected, actual)
+#define EXPECT_LE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define EXPECT_LT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define EXPECT_GE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define EXPECT_GT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+#define GTEST_ASSERT_EQ(expected, actual) \
+  ASSERT_PRED_FORMAT2(::testing::internal:: \
+                      EqHelper<GTEST_IS_NULL_LITERAL_(expected)>::Compare, \
+                      expected, actual)
+#define GTEST_ASSERT_NE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define GTEST_ASSERT_LE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define GTEST_ASSERT_LT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define GTEST_ASSERT_GE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define GTEST_ASSERT_GT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+// Define macro GTEST_DONT_DEFINE_ASSERT_XY to 1 to omit ASSERT_XY(), which
+// clashes with some users' own code; GTEST_ASSERT_XY() stays defined.
+
+#if !GTEST_DONT_DEFINE_ASSERT_EQ
+# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_NE
+# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LE
+# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LT
+# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GE
+# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GT
+# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#endif
+
+// C-string comparisons, by content.  All tests treat NULL and any non-NULL
+// string as different.  Two NULLs are equal.
+//
+//    * {ASSERT|EXPECT}_STREQ(s1, s2):     Tests that s1 == s2
+//    * {ASSERT|EXPECT}_STRNE(s1, s2):     Tests that s1 != s2
+//    * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case
+//    * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case
+//
+// For wide or narrow string objects, you can use the
+// {ASSERT|EXPECT}_??() macros.
+//
+// Don't depend on the order in which the arguments are evaluated,
+// which is undefined.
+//
+// These macros evaluate their arguments exactly once.
+
+#define EXPECT_STREQ(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual)
+#define EXPECT_STRNE(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define EXPECT_STRCASEEQ(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual)
+#define EXPECT_STRCASENE(s1, s2)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+#define ASSERT_STREQ(expected, actual) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual)
+#define ASSERT_STRNE(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define ASSERT_STRCASEEQ(expected, actual) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual)
+#define ASSERT_STRCASENE(s1, s2)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+// Macros for comparing floating-point numbers.
+//
+//    * {ASSERT|EXPECT}_FLOAT_EQ(expected, actual):
+//         Tests that two float values are almost equal.
+//    * {ASSERT|EXPECT}_DOUBLE_EQ(expected, actual):
+//         Tests that two double values are almost equal.
+//    * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error):
+//         Tests that v1 and v2 are within abs_error of each other.
+//
+// Google Test uses ULP-based comparison to automatically pick a default
+// error bound that is appropriate for the operands.  See the
+// FloatingPoint template class in gtest-internal.h if you are
+// interested in the implementation details.
+
+#define EXPECT_FLOAT_EQ(expected, actual)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      expected, actual)
+
+#define EXPECT_DOUBLE_EQ(expected, actual)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      expected, actual)
+
+#define ASSERT_FLOAT_EQ(expected, actual)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      expected, actual)
+
+#define ASSERT_DOUBLE_EQ(expected, actual)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      expected, actual)
+
+#define EXPECT_NEAR(val1, val2, abs_error)\
+  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
+
+#define ASSERT_NEAR(val1, val2, abs_error)\
+  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
+
+// These predicate format functions work on floating-point values, and
+// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
+//
+//   EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0);
+
+// Each asserts that val1 is less than, or almost equal to, val2, and fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
+                                   float val1, float val2);
+GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
+                                    double val1, double val2);
+
+
+#if GTEST_OS_WINDOWS
+
+// Macros that test for HRESULT failure and success.  These are only useful
+// on Windows, and rely on Windows SDK macros and APIs to compile.
+//
+//    * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr)
+//
+// When expr unexpectedly fails or succeeds, Google Test prints the
+// expected result and the actual result with both a human-readable
+// string representation of the error, if available, as well as the
+// hex result code.
+# define EXPECT_HRESULT_SUCCEEDED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define ASSERT_HRESULT_SUCCEEDED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define EXPECT_HRESULT_FAILED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+# define ASSERT_HRESULT_FAILED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+#endif  // GTEST_OS_WINDOWS
+
+// Macros that execute a statement and check that it doesn't generate new
+// fatal failures in the current thread.
+//
+//   * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement);
+//
+// Examples:
+//
+//   EXPECT_NO_FATAL_FAILURE(Process());
+//   ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
+//
+#define ASSERT_NO_FATAL_FAILURE(statement) \
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+#define EXPECT_NO_FATAL_FAILURE(statement) \
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+
+// Causes a trace (including the source file path, the current line
+// number, and the given message) to be included in every test failure
+// message generated by code in the current scope.  The effect is
+// undone when control leaves the current scope.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// In the implementation, we include the current line number as part
+// of the ScopedTrace variable's name, thus allowing multiple SCOPED_TRACE()s
+// to appear in the same block - as long as they are on different
+// lines.
+#define SCOPED_TRACE(message) \
+  ::testing::internal::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
+    __FILE__, __LINE__, ::testing::Message() << (message))
+
+// Compile-time assertion for type equality.
+// StaticAssertTypeEq<type1, type2>() compiles iff type1 and type2 are
+// the same type.  It always returns true; that value is not interesting.
+//
+// Instead of making StaticAssertTypeEq a class template, we make it a
+// function template that invokes a helper class template.  This
+// prevents a user from misusing StaticAssertTypeEq<T1, T2> by
+// defining objects of that type.
+//
+// CAVEAT:
+//
+// When used inside a method of a class template,
+// StaticAssertTypeEq<T1, T2>() is effective ONLY IF the method is
+// instantiated.  For example, given:
+//
+//   template <typename T> class Foo {
+//    public:
+//     void Bar() { testing::StaticAssertTypeEq<int, T>(); }
+//   };
+//
+// the code:
+//
+//   void Test1() { Foo<bool> foo; }
+//
+// will NOT generate a compiler error, as Foo<bool>::Bar() is never
+// actually instantiated.  Instead, you need:
+//
+//   void Test2() { Foo<bool> foo; foo.Bar(); }
+//
+// to cause a compiler error.
+template <typename T1, typename T2>
+bool StaticAssertTypeEq() {
+  (void)internal::StaticAssertTypeEqHelper<T1, T2>();
+  return true;
+}
+
+// Defines a test.
+//
+// The first parameter is the name of the test case, and the second
+// parameter is the name of the test within the test case.
+//
+// The convention is to end the test case name with "Test".  For
+// example, a test case for the Foo class can be named FooTest.
+//
+// The user should put the test code between braces after using this
+// macro.  Example:
+//
+//   TEST(FooTest, InitializesCorrectly) {
+//     Foo foo;
+//     EXPECT_TRUE(foo.StatusIsOK());
+//   }
+
+// Note that we call GetTestTypeId() instead of GetTypeId<
+// ::testing::Test>() here to get the type ID of testing::Test.  This
+// is to work around a suspected linker bug when using Google Test as
+// a framework on Mac OS X.  The bug causes GetTypeId<
+// ::testing::Test>() to return different values depending on whether
+// the call is from the Google Test framework itself or from user test
+// code.  GetTestTypeId() is guaranteed to always return the same
+// value, as it always calls GetTypeId<>() from the Google Test
+// framework.
+#define GTEST_TEST(test_case_name, test_name)\
+  GTEST_TEST_(test_case_name, test_name, \
+              ::testing::Test, ::testing::internal::GetTestTypeId())
+
+// Define GTEST_DONT_DEFINE_TEST to 1 to omit the definition of TEST(),
+// which is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_TEST
+# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name)
+#endif
+
+// Defines a test that uses a test fixture.
+//
+// The first parameter is the name of the test fixture class, which
+// also doubles as the test case name.  The second parameter is the
+// name of the test within the test case.
+//
+// A test fixture class must be declared earlier.  The user should put
+// the test code between braces after using this macro.  Example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     virtual void SetUp() { b_.AddElement(3); }
+//
+//     Foo a_;
+//     Foo b_;
+//   };
+//
+//   TEST_F(FooTest, InitializesCorrectly) {
+//     EXPECT_TRUE(a_.StatusIsOK());
+//   }
+//
+//   TEST_F(FooTest, ReturnsElementCountCorrectly) {
+//     EXPECT_EQ(0, a_.size());
+//     EXPECT_EQ(1, b_.size());
+//   }
+
+#define TEST_F(test_fixture, test_name)\
+  GTEST_TEST_(test_fixture, test_name, test_fixture, \
+              ::testing::internal::GetTypeId<test_fixture>())
+
+}  // namespace testing
+
+// Use this function in main() to run all tests.  It returns 0 if all
+// tests are successful, or 1 otherwise.
+//
+// RUN_ALL_TESTS() should be invoked after the command line has been
+// parsed by InitGoogleTest().
+//
+// This function was formerly a macro; thus, it is in the global
+// namespace and has an all-caps name.  It forwards to UnitTest::Run().
+int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
+
+inline int RUN_ALL_TESTS() {
+  return ::testing::UnitTest::GetInstance()->Run();
+}
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-all.cc b/libs/libvpx/third_party/googletest/src/src/gtest-all.cc
new file mode 100644
index 0000000000..8d906279ab
--- /dev/null
+++ b/libs/libvpx/third_party/googletest/src/src/gtest-all.cc
@@ -0,0 +1,9590 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+// Google C++ Testing Framework (Google Test)
+//
+// Sometimes it's desirable to build Google Test by compiling a single file.
+// This file serves this purpose.
+
+// This line ensures that gtest.h can be compiled on its own, even
+// when it's fused.
+#include "gtest/gtest.h"
+
+// The following lines pull in the real gtest *.cc files.
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// Utilities for testing Google Test itself and code that uses Google Test
+// (e.g. frameworks built on top of Google Test).
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+
+namespace testing {
+
+// This helper class can be used to mock out Google Test failure reporting
+// so that we can test Google Test or code that builds on Google Test.
+//
+// An object of this class appends a TestPartResult object to the
+// TestPartResultArray object given in the constructor whenever a Google Test
+// failure is reported. It can either intercept only failures that are
+// generated in the same thread that created this object or it can intercept
+// all generated failures. The scope of this mock object can be controlled with
+// the first argument (intercept_mode) of the two-argument constructor.
+class GTEST_API_ ScopedFakeTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  // The two possible mocking modes of this object.
+  enum InterceptMode {
+    INTERCEPT_ONLY_CURRENT_THREAD,  // Intercepts only thread local failures.
+    INTERCEPT_ALL_THREADS           // Intercepts all failures.
+  };
+
+  // The c'tor sets this object as the test part result reporter used
+  // by Google Test.  The 'result' parameter specifies where to report the
+  // results. This reporter will only catch failures generated in the current
+  // thread. DEPRECATED
+  explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result);
+
+  // Same as above, but you can choose the interception scope of this object.
+  ScopedFakeTestPartResultReporter(InterceptMode intercept_mode,
+                                   TestPartResultArray* result);
+
+  // The d'tor restores the previous test part result reporter.
+  virtual ~ScopedFakeTestPartResultReporter();
+
+  // Appends the TestPartResult object to the TestPartResultArray
+  // received in the constructor.
+  //
+  // This method is from the TestPartResultReporterInterface
+  // interface.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+ private:
+  void Init();
+
+  const InterceptMode intercept_mode_;
+  TestPartResultReporterInterface* old_reporter_;
+  TestPartResultArray* const result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
+};
+
+namespace internal {
+
+// A helper class for implementing EXPECT_FATAL_FAILURE() and
+// EXPECT_NONFATAL_FAILURE().  Its destructor verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring.  If that's not the case, a
+// non-fatal failure will be generated.
+class GTEST_API_ SingleFailureChecker {
+ public:
+  // The constructor only remembers its arguments; the d'tor does the check.
+  SingleFailureChecker(const TestPartResultArray* results,
+                       TestPartResult::Type type,
+                       const string& substr);
+  ~SingleFailureChecker();
+ private:
+  const TestPartResultArray* const results_;
+  const TestPartResult::Type type_;
+  const string substr_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
+};
+
+}  // namespace internal
+
+}  // namespace testing
+
+// A set of macros for testing Google Test assertions or code that's expected
+// to generate Google Test fatal failures.  It verifies that the given
+// statement will cause exactly one fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_FATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - 'statement' cannot reference local non-static variables or non-static
+//     members of the current object (it runs in a local class's static method).
+//   - 'statement' cannot return a value.
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
+// gtest_unittest.cc will fail to compile if we do that.
+#define EXPECT_FATAL_FAILURE(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ALL_THREADS, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+// Macros for testing Google Test assertions or code that's expected to
+// generate Google Test non-fatal failures.  Each asserts that the given
+// statement will cause exactly one non-fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// 'statement' is allowed to reference local variables and members of
+// the current object.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  If we do that, the code won't compile when the user gives
+// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that
+// expands to code containing an unprotected comma.  The
+// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc
+// catches that.
+//
+// For the same reason, we have to write
+//   if (::testing::internal::AlwaysTrue()) { statement; }
+// instead of
+//   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+// to avoid an MSVC warning on unreachable code.
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+#include <ctype.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#include <algorithm>
+#include <iomanip>
+#include <limits>
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <vector>
+
+#if GTEST_OS_LINUX
+
+// TODO(kenton@google.com): Use autoconf to detect availability of
+// gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+# include <fcntl.h>  // NOLINT
+# include <limits.h>  // NOLINT
+# include <sched.h>  // NOLINT
+// Declares vsnprintf().  This header is not available on Windows.
+# include <strings.h>  // NOLINT
+# include <sys/mman.h>  // NOLINT
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+# include <string>
+
+#elif GTEST_OS_SYMBIAN
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+#elif GTEST_OS_ZOS
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+// On z/OS we additionally need strings.h for strcasecmp.
+# include <strings.h>  // NOLINT
+
+#elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
+
+# include <windows.h>  // NOLINT
+
+#elif GTEST_OS_WINDOWS  // We are on Windows proper.
+
+# include <io.h>  // NOLINT
+# include <sys/timeb.h>  // NOLINT
+# include <sys/types.h>  // NOLINT
+# include <sys/stat.h>  // NOLINT
+
+# if GTEST_OS_WINDOWS_MINGW
+// MinGW has gettimeofday() but not _ftime64().
+// TODO(kenton@google.com): Use autoconf to detect availability of
+//   gettimeofday().
+// TODO(kenton@google.com): There are other ways to get the time on
+//   Windows, like GetTickCount() or GetSystemTimeAsFileTime().  MinGW
+//   supports these.  consider using them instead.
+#  define GTEST_HAS_GETTIMEOFDAY_ 1
+#  include <sys/time.h>  // NOLINT
+# endif  // GTEST_OS_WINDOWS_MINGW
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <windows.h>  // NOLINT
+
+#else
+
+// Assume other platforms have gettimeofday().
+// TODO(kenton@google.com): Use autoconf to detect availability of
+//   gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+#endif
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utility functions and classes used by the Google C++ testing framework.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// This file contains purely Google Test's internal implementation.  Please
+// DO NOT #INCLUDE IT IN A USER PROGRAM.
+
+#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
+#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+
+// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is
+// part of Google Test's implementation; otherwise it's undefined.
+#if !GTEST_IMPLEMENTATION_
+// A user is trying to include this from his code - just say no.
+# error "gtest-internal-inl.h is part of Google Test's internal implementation."
+# error "It must not be included except by Google Test itself."
+#endif  // GTEST_IMPLEMENTATION_
+
+#ifndef _WIN32_WCE
+# include <errno.h>
+#endif  // !_WIN32_WCE
+#include <stddef.h>
+#include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
+#include <string.h>  // For memmove.
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+#endif
+
+#if GTEST_OS_WINDOWS
+# include <windows.h>  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+
+namespace testing {
+
+// Declares the flags.
+//
+// We don't want the users to modify this flag in the code, but want
+// Google Test's own unit tests to be able to access it. Therefore we
+// declare it here as opposed to in gtest.h.
+GTEST_DECLARE_bool_(death_test_use_fork);
+
+namespace internal {
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
+
+// Names of the flags (needed for parsing Google Test flags).
+// Each name is stored bare: no leading dashes and no prefix.  The list is
+// kept in alphabetical order.
+const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
+const char kBreakOnFailureFlag[] = "break_on_failure";
+const char kCatchExceptionsFlag[] = "catch_exceptions";
+const char kColorFlag[] = "color";
+const char kFilterFlag[] = "filter";
+const char kListTestsFlag[] = "list_tests";
+const char kOutputFlag[] = "output";
+const char kPrintTimeFlag[] = "print_time";
+const char kRandomSeedFlag[] = "random_seed";
+const char kRepeatFlag[] = "repeat";
+const char kShuffleFlag[] = "shuffle";
+const char kStackTraceDepthFlag[] = "stack_trace_depth";
+const char kStreamResultToFlag[] = "stream_result_to";
+const char kThrowOnFailureFlag[] = "throw_on_failure";
+
+// A valid random seed must be in [1, kMaxRandomSeed].
+// GetRandomSeedFromFlag() and GetNextRandomSeed() both rely on this bound.
+const int kMaxRandomSeed = 99999;
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+GTEST_API_ extern bool g_help_flag;
+
+// Returns the current time in milliseconds.
+GTEST_API_ TimeInMillis GetTimeInMillis();
+
+// Returns true iff Google Test should use colors in the output.
+GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
+
+// Formats the given time in milliseconds as seconds.
+GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
+
+// Converts the given time in milliseconds to a date string in the ISO 8601
+// format, without the timezone information.  N.B.: due to the use of the
+// non-reentrant localtime() function, this function is not thread safe.  Do
+// not use it in any code that can be called from multiple threads.
+GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
+
+// Parses a string for an Int32 flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+GTEST_API_ bool ParseInt32Flag(
+    const char* str, const char* flag, Int32* value);
+
+// Maps the value of the --gtest_random_seed flag to a seed in the range
+// [1, kMaxRandomSeed].  A flag value of 0 means "derive the seed from the
+// current time"; any other value is used as the raw seed directly.
+inline int GetRandomSeedFromFlag(Int32 random_seed_flag) {
+  unsigned int seed_source;
+  if (random_seed_flag == 0) {
+    seed_source = static_cast<unsigned int>(GetTimeInMillis());
+  } else {
+    seed_source = static_cast<unsigned int>(random_seed_flag);
+  }
+
+  // Fold the raw value into [1, kMaxRandomSeed] so that the resulting seed
+  // is short enough to type.  The subtraction is done in unsigned
+  // arithmetic, so a raw value of 0 wraps around instead of going negative.
+  const unsigned int modulus = static_cast<unsigned int>(kMaxRandomSeed);
+  return static_cast<int>((seed_source - 1U) % modulus) + 1;
+}
+
+// Returns the valid random seed that follows 'seed', wrapping from
+// kMaxRandomSeed back to 1.  'seed' must itself be valid, i.e. lie in
+// [1, kMaxRandomSeed]; this is enforced with GTEST_CHECK_.
+inline int GetNextRandomSeed(int seed) {
+  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
+      << "Invalid random seed " << seed << " - must be in [1, "
+      << kMaxRandomSeed << "].";
+  if (seed >= kMaxRandomSeed) {
+    return 1;
+  }
+  return seed + 1;
+}
+
+// Scoped snapshot of all Google Test flags: the constructor records the
+// current value of every flag and the destructor writes those values back.
+class GTestFlagSaver {
+ public:
+  // Captures the current value of each flag.  The initializers are listed
+  // in member declaration order.
+  GTestFlagSaver()
+      : also_run_disabled_tests_(GTEST_FLAG(also_run_disabled_tests)),
+        break_on_failure_(GTEST_FLAG(break_on_failure)),
+        catch_exceptions_(GTEST_FLAG(catch_exceptions)),
+        color_(GTEST_FLAG(color)),
+        death_test_style_(GTEST_FLAG(death_test_style)),
+        death_test_use_fork_(GTEST_FLAG(death_test_use_fork)),
+        filter_(GTEST_FLAG(filter)),
+        internal_run_death_test_(GTEST_FLAG(internal_run_death_test)),
+        list_tests_(GTEST_FLAG(list_tests)),
+        output_(GTEST_FLAG(output)),
+        print_time_(GTEST_FLAG(print_time)),
+        random_seed_(GTEST_FLAG(random_seed)),
+        repeat_(GTEST_FLAG(repeat)),
+        shuffle_(GTEST_FLAG(shuffle)),
+        stack_trace_depth_(GTEST_FLAG(stack_trace_depth)),
+        stream_result_to_(GTEST_FLAG(stream_result_to)),
+        throw_on_failure_(GTEST_FLAG(throw_on_failure)) {}
+
+  // Restores every flag to the value captured at construction time.
+  // The destructor is intentionally non-virtual: do not derive from this
+  // class.
+  ~GTestFlagSaver() {
+    // Boolean flags.
+    GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
+    GTEST_FLAG(break_on_failure) = break_on_failure_;
+    GTEST_FLAG(catch_exceptions) = catch_exceptions_;
+    GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
+    GTEST_FLAG(list_tests) = list_tests_;
+    GTEST_FLAG(print_time) = print_time_;
+    GTEST_FLAG(shuffle) = shuffle_;
+    GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+
+    // String flags.
+    GTEST_FLAG(color) = color_;
+    GTEST_FLAG(death_test_style) = death_test_style_;
+    GTEST_FLAG(filter) = filter_;
+    GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
+    GTEST_FLAG(output) = output_;
+    GTEST_FLAG(stream_result_to) = stream_result_to_;
+
+    // Integer flags.
+    GTEST_FLAG(random_seed) = random_seed_;
+    GTEST_FLAG(repeat) = repeat_;
+    GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
+  }
+
+ private:
+  // One field per flag, holding the value seen by the constructor.
+  bool also_run_disabled_tests_;
+  bool break_on_failure_;
+  bool catch_exceptions_;
+  std::string color_;
+  std::string death_test_style_;
+  bool death_test_use_fork_;
+  std::string filter_;
+  std::string internal_run_death_test_;
+  bool list_tests_;
+  std::string output_;
+  bool print_time_;
+  internal::Int32 random_seed_;
+  internal::Int32 repeat_;
+  bool shuffle_;
+  internal::Int32 stack_trace_depth_;
+  std::string stream_result_to_;
+  bool throw_on_failure_;
+} GTEST_ATTRIBUTE_UNUSED_;
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+GTEST_API_ std::string CodePointToUtf8(UInt32 code_point);
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from the Basic
+// Multilingual Plane.
+GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+                            const char* shard_index_str,
+                            bool in_subprocess_for_death_test);
+
+// Parses the environment variable env_var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error
+// and aborts.
+GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(
+    int total_shards, int shard_index, int test_id);
+
+// STL container utilities.
+
+// Counts the elements of container 'c' for which 'predicate' returns true.
+template <class Container, typename Predicate>
+inline int CountIf(const Container& c, Predicate predicate) {
+  // A hand-written loop is used on purpose: std::count_if() in libCstd on
+  // Solaris has a non-standard signature.
+  int matches = 0;
+  typename Container::const_iterator cur = c.begin();
+  while (cur != c.end()) {
+    if (predicate(*cur)) {
+      matches++;
+    }
+    ++cur;
+  }
+  return matches;
+}
+
+// Invokes 'functor' on every element of container 'c', in iteration order.
+// Any value returned by std::for_each() is discarded.
+template <class Container, typename Functor>
+void ForEach(const Container& c, Functor functor) {
+  typename Container::const_iterator first = c.begin();
+  typename Container::const_iterator last = c.end();
+  std::for_each(first, last, functor);
+}
+
+// Bounds-checked element access: yields v[i] when i lies in
+// [0, v.size()), and default_value for any other i (including negative
+// values).
+template <typename E>
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
+  const bool in_range = i >= 0 && i < static_cast<int>(v.size());
+  return in_range ? v[i] : default_value;
+}
+
+// Shuffles, in place, the elements of *v whose indices lie in the
+// STL-style half-open range [begin, end).  Passing 'end' == v->size()
+// shuffles up to the end of the vector.  Both bounds are validated with
+// GTEST_CHECK_.
+template <typename E>
+void ShuffleRange(internal::Random* random, int begin, int end,
+                  std::vector<E>* v) {
+  const int size = static_cast<int>(v->size());
+  GTEST_CHECK_(0 <= begin && begin <= size)
+      << "Invalid shuffle range start " << begin << ": must be in range [0, "
+      << size << "].";
+  GTEST_CHECK_(begin <= end && end <= size)
+      << "Invalid shuffle range finish " << end << ": must be in range ["
+      << begin << ", " << size << "].";
+
+  // Fisher-Yates shuffle (http://en.wikipedia.org/wiki/Fisher-Yates_shuffle):
+  // walk the last slot of the still-unshuffled prefix down towards 'begin',
+  // each time swapping it with a randomly chosen slot of that prefix.
+  for (int last = end - 1; last > begin; --last) {
+    const int candidates = last - begin + 1;
+    const int pick = begin + random->Generate(candidates);
+    std::swap((*v)[pick], (*v)[last]);
+  }
+}
+
+// Shuffles all elements of *v in place by delegating to ShuffleRange()
+// with the full index range [0, v->size()).
+template <typename E>
+inline void Shuffle(internal::Random* random, std::vector<E>* v) {
+  const int element_count = static_cast<int>(v->size());
+  ShuffleRange(random, 0, element_count, v);
+}
+
+// A function for deleting an object.  Handy for being used as a
+// functor.
+//
+// Applies the scalar 'delete' expression to x, so x must not point to an
+// array allocated with new[].
+template <typename T>
+static void Delete(T* x) {
+  delete x;
+}
+
+// Unary predicate that tells whether a TestProperty carries a given key.
+//
+// Instances are copyable; there is no default constructor.
+class TestPropertyKeyIs {
+ public:
+  // Remembers 'key' as the key to compare against.
+  explicit TestPropertyKeyIs(const std::string& key) : key_(key) {}
+
+  // Returns true iff the key of test_property equals the stored key.
+  bool operator()(const TestProperty& test_property) const {
+    return key_ == test_property.key();
+  }
+
+ private:
+  std::string key_;
+};
+
+// Class UnitTestOptions.
+//
+// This class contains functions for processing options the user
+// specifies when running the tests.  It has only static members.
+//
+// In most cases, the user can specify an option using either an
+// environment variable or a command line flag.  E.g. you can set the
+// test filter using either GTEST_FILTER or --gtest_filter.  If both
+// the variable and the flag are present, the latter overrides the
+// former.
+class GTEST_API_ UnitTestOptions {
+ public:
+  // Functions for processing the gtest_output flag.
+
+  // Returns the output format, or "" for normal printed output.
+  static std::string GetOutputFormat();
+
+  // Returns the absolute path of the requested output file, or the
+  // default (test_detail.xml in the original working directory) if
+  // none was explicitly specified.
+  static std::string GetAbsolutePathToOutputFile();
+
+  // Functions for processing the gtest_filter flag.
+
+  // Returns true iff the wildcard pattern matches the string.  The
+  // first ':' or '\0' character in pattern marks the end of it.
+  //
+  // This recursive algorithm isn't very efficient, but is clear and
+  // works well enough for matching test names, which are short.
+  static bool PatternMatchesString(const char *pattern, const char *str);
+
+  // Returns true iff the user-specified filter matches the test case
+  // name and the test name.
+  static bool FilterMatchesTest(const std::string &test_case_name,
+                                const std::string &test_name);
+
+#if GTEST_OS_WINDOWS
+  // Function for supporting the gtest_catch_exception flag.
+  // Only declared when building for Windows (GTEST_OS_WINDOWS).
+
+  // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+  // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+  // This function is useful as an __except condition.
+  static int GTestShouldProcessSEH(DWORD exception_code);
+#endif  // GTEST_OS_WINDOWS
+
+  // Returns true if "name" matches the ':' separated list of glob-style
+  // filters in "filter".
+  static bool MatchesFilter(const std::string& name, const char* filter);
+};
+
+// Returns the current application's name, removing directory path if that
+// is present.  Used by UnitTestOptions::GetOutputFile.
+GTEST_API_ FilePath GetCurrentExecutableName();
+
+// The role interface for getting the OS stack trace as a string.
+//
+// Both operations are pure virtual, so this class can only be used through
+// a concrete subclass.
+class OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetterInterface() {}
+  // Virtual so that implementations can be destroyed through a pointer to
+  // this interface.
+  virtual ~OsStackTraceGetterInterface() {}
+
+  // Returns the current OS stack trace as an std::string.  Parameters:
+  //
+  //   max_depth  - the maximum number of stack frames to be included
+  //                in the trace.
+  //   skip_count - the number of top frames to be skipped; doesn't count
+  //                against max_depth.
+  virtual string CurrentStackTrace(int max_depth, int skip_count) = 0;
+
+  // UponLeavingGTest() should be called immediately before Google Test calls
+  // user code. It saves some information about the current stack that
+  // CurrentStackTrace() will use to find and hide Google Test stack frames.
+  virtual void UponLeavingGTest() = 0;
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+};
+
+// A working implementation of the OsStackTraceGetterInterface interface.
+class OsStackTraceGetter : public OsStackTraceGetterInterface {
+ public:
+  // Starts with no saved caller frame.
+  OsStackTraceGetter() : caller_frame_(NULL) {}
+
+  // See OsStackTraceGetterInterface::CurrentStackTrace().  Annotated as
+  // requiring that mutex_ is not already held by the caller.
+  virtual string CurrentStackTrace(int max_depth, int skip_count)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // See OsStackTraceGetterInterface::UponLeavingGTest().  Annotated as
+  // requiring that mutex_ is not already held by the caller.
+  virtual void UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // This string is inserted in place of stack frames that are part of
+  // Google Test's implementation.
+  static const char* const kElidedFramesMarker;
+
+ private:
+  Mutex mutex_;  // protects all internal state
+
+  // We save the stack frame below the frame that calls user code.
+  // We do this because the address of the frame immediately below
+  // the user code changes between the call to UponLeavingGTest()
+  // and any calls to CurrentStackTrace() from within the user code.
+  void* caller_frame_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+};
+
+// Information about a Google Test trace point.
+// Plain aggregate with no constructors; elements of this type are stored
+// in the per-thread trace stack exposed by UnitTestImpl::gtest_trace_stack().
+struct TraceInfo {
+  // NOTE(review): presumably the source file and line where the trace
+  // point was created - confirm against the code that fills this in.
+  const char* file;
+  int line;
+  // Text associated with the trace point.
+  std::string message;
+};
+
+// This is the default global test part result reporter used in UnitTestImpl.
+// This class should only be used by UnitTestImpl.
+class DefaultGlobalTestPartResultReporter
+  : public TestPartResultReporterInterface {
+ public:
+  // unit_test is the UnitTestImpl this reporter works on behalf of; the
+  // pointer is stored, not owned (this class declares no destructor).
+  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. Reports the test part
+  // result in the current test.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  // Fixed at construction; the pointer itself cannot be reseated.
+  UnitTestImpl* const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+};
+
+// This is the default per thread test part result reporter used in
+// UnitTestImpl. This class should only be used by UnitTestImpl.
+class DefaultPerThreadTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  // unit_test is the UnitTestImpl this reporter works on behalf of; the
+  // pointer is stored, not owned (this class declares no destructor).
+  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. The implementation just
+  // delegates to the current global test part result reporter of *unit_test_.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  // Fixed at construction; the pointer itself cannot be reseated.
+  UnitTestImpl* const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+};
+
+// The private implementation of the UnitTest class.  We don't protect
+// the methods under a mutex, as this class is not accessible by a
+// user and the UnitTest class that delegates work to this class does
+// proper locking.
+class GTEST_API_ UnitTestImpl {
+ public:
+  explicit UnitTestImpl(UnitTest* parent);
+  virtual ~UnitTestImpl();
+
+  // There are two different ways to register your own TestPartResultReporter.
+  // You can register your own reporter to listen either only for test results
+  // from the current thread or for results from all threads.
+  // By default, each per-thread test result reporter just passes a new
+  // TestPartResult to the global test result reporter, which registers the
+  // test part result for the currently running test.
+
+  // Returns the global test part result reporter.
+  TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
+
+  // Sets the global test part result reporter.
+  void SetGlobalTestPartResultReporter(
+      TestPartResultReporterInterface* reporter);
+
+  // Returns the test part result reporter for the current thread.
+  TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
+
+  // Sets the test part result reporter for the current thread.
+  void SetTestPartResultReporterForCurrentThread(
+      TestPartResultReporterInterface* reporter);
+
+  // Gets the number of successful test cases.
+  int successful_test_case_count() const;
+
+  // Gets the number of failed test cases.
+  int failed_test_case_count() const;
+
+  // Gets the number of all test cases.
+  int total_test_case_count() const;
+
+  // Gets the number of all test cases that contain at least one test
+  // that should run.
+  int test_case_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns true iff the unit test passed (i.e. all test cases passed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the unit test failed (i.e. some test case failed
+  // or something outside of all tests failed).
+  bool Failed() const {
+    // A failed test case is enough on its own; the ad hoc result is only
+    // consulted when no test case has failed.
+    if (failed_test_case_count() > 0) {
+      return true;
+    }
+    return ad_hoc_test_result()->Failed();
+  }
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  const TestCase* GetTestCase(int i) const {
+    // test_case_indices_ maps position i to an index into test_cases_;
+    // GetElementOr() yields -1 when i is out of range, in which case NULL
+    // is returned.  The element must be fetched through the mapped index
+    // (exactly as GetMutableTestCase() does), not through the raw position
+    // i, otherwise the two accessors disagree whenever the index table is
+    // not the identity mapping.
+    const int index = GetElementOr(test_case_indices_, i, -1);
+    return index < 0 ? NULL : test_cases_[index];
+  }
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  TestCase* GetMutableTestCase(int i) {
+    // Translate position i through the index table; -1 marks "out of range".
+    const int index = GetElementOr(test_case_indices_, i, -1);
+    if (index < 0) {
+      return NULL;
+    }
+    return test_cases_[index];
+  }
+
+  // Provides access to the event listener list.
+  TestEventListeners* listeners() { return &listeners_; }
+
+  // Returns the TestResult for the test that's currently running, or
+  // the TestResult for the ad hoc test if no test is running.
+  TestResult* current_test_result();
+
+  // Returns the TestResult for the ad hoc test.
+  const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+
+  // Sets the OS stack trace getter.
+  //
+  // Does nothing if the input and the current OS stack trace getter
+  // are the same; otherwise, deletes the old getter and makes the
+  // input the current getter.
+  void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
+
+  // Returns the current OS stack trace getter if it is not NULL;
+  // otherwise, creates an OsStackTraceGetter, makes it the current
+  // getter, and returns it.
+  OsStackTraceGetterInterface* os_stack_trace_getter();
+
+  // Returns the current OS stack trace as an std::string.
+  //
+  // The maximum number of stack frames to be included is specified by
+  // the gtest_stack_trace_depth flag.  The skip_count parameter
+  // specifies the number of top frames to be skipped, which doesn't
+  // count against the number of frames to be included.
+  //
+  // For example, if Foo() calls Bar(), which in turn calls
+  // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+  // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+  std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
+
+  // Finds and returns a TestCase with the given name.  If one doesn't
+  // exist, creates one and returns it.
+  //
+  // Arguments:
+  //
+  //   test_case_name: name of the test case
+  //   type_param:     the name of the test's type parameter, or NULL if
+  //                   this is not a typed or a type-parameterized test.
+  //   set_up_tc:      pointer to the function that sets up the test case
+  //   tear_down_tc:   pointer to the function that tears down the test case
+  TestCase* GetTestCase(const char* test_case_name,
+                        const char* type_param,
+                        Test::SetUpTestCaseFunc set_up_tc,
+                        Test::TearDownTestCaseFunc tear_down_tc);
+
+  // Adds a TestInfo to the unit test.
+  //
+  // Arguments:
+  //
+  //   set_up_tc:    pointer to the function that sets up the test case
+  //   tear_down_tc: pointer to the function that tears down the test case
+  //   test_info:    the TestInfo object
+  void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc,
+                   Test::TearDownTestCaseFunc tear_down_tc,
+                   TestInfo* test_info) {
+    // Thread-safe death tests need the working directory the test program
+    // was started in.  RUN_ALL_TESTS() is too late to record it, because
+    // the user may have changed directory before calling it.  AddTestInfo()
+    // runs when a TEST or TEST_F is registered, i.e. before main() is
+    // reached, so the directory is captured here, once, on the first call.
+    if (original_working_dir_.IsEmpty()) {
+      original_working_dir_.Set(FilePath::GetCurrentDir());
+      GTEST_CHECK_(!original_working_dir_.IsEmpty())
+          << "Failed to get the current working directory.";
+    }
+
+    // Look up (or create) the owning test case, then hand the test to it.
+    TestCase* const owning_test_case =
+        GetTestCase(test_info->test_case_name(),
+                    test_info->type_param(),
+                    set_up_tc,
+                    tear_down_tc);
+    owning_test_case->AddTestInfo(test_info);
+  }
+
+#if GTEST_HAS_PARAM_TEST
+  // Returns ParameterizedTestCaseRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  internal::ParameterizedTestCaseRegistry& parameterized_test_registry() {
+    return parameterized_test_registry_;
+  }
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Sets the TestCase object for the test that's currently running.
+  void set_current_test_case(TestCase* a_current_test_case) {
+    current_test_case_ = a_current_test_case;
+  }
+
+  // Sets the TestInfo object for the test that's currently running.  If
+  // current_test_info is NULL, the assertion results will be stored in
+  // ad_hoc_test_result_.
+  void set_current_test_info(TestInfo* a_current_test_info) {
+    current_test_info_ = a_current_test_info;
+  }
+
+  // Registers all parameterized tests defined using TEST_P and
+  // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter
+  // combination. This method can be called more than once; it has guards
+  // protecting from registering the tests more than once.  If
+  // value-parameterized tests are disabled, RegisterParameterizedTests is
+  // present but does nothing.
+  void RegisterParameterizedTests();
+
+  // Runs all tests in this UnitTest object, prints the result, and
+  // returns true if all tests are successful.  If any exception is
+  // thrown during a test, this test is considered to be failed, but
+  // the rest of the tests will still be run.
+  bool RunAllTests();
+
+  // Clears the results of all tests, except the ad hoc tests.
+  void ClearNonAdHocTestResult() {
+    ForEach(test_cases_, TestCase::ClearTestCaseResult);
+  }
+
+  // Clears ad_hoc_test_result_, which holds ad-hoc test assertion results.
+  void ClearAdHocTestResult() {
+    ad_hoc_test_result_.Clear();
+  }
+
+  // Adds a TestProperty to the current TestResult object when invoked in a
+  // context of a test or a test case, or to the global property set. If the
+  // result already contains a property with the same key, the value will be
+  // updated.
+  void RecordProperty(const TestProperty& test_property);
+
+  enum ReactionToSharding {
+    HONOR_SHARDING_PROTOCOL,  // FilterTests also filters by shard env vars.
+    IGNORE_SHARDING_PROTOCOL  // NOTE(review): presumably no sharding; confirm.
+  };
+
+  // Matches the full name of each test against the user-specified
+  // filter to decide whether the test should run, then records the
+  // result in each TestCase and TestInfo object.
+  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
+  // based on sharding variables in the environment.
+  // Returns the number of tests that should run.
+  int FilterTests(ReactionToSharding shard_tests);
+
+  // Prints the names of the tests matching the user-specified filter flag.
+  void ListTestsMatchingFilter();
+
+  const TestCase* current_test_case() const { return current_test_case_; }
+  TestInfo* current_test_info() { return current_test_info_; }
+  const TestInfo* current_test_info() const { return current_test_info_; }
+
+  // Returns a mutable reference to environments_, the vector of environments
+  // that need to be set-up/torn-down before/after the tests are run.
+  std::vector<Environment*>& environments() { return environments_; }
+
+  // Getters for the per-thread Google Test trace stack (gtest_trace_stack_).
+  std::vector<TraceInfo>& gtest_trace_stack() {
+    return *(gtest_trace_stack_.pointer());  // Mutable access via pointer().
+  }
+  const std::vector<TraceInfo>& gtest_trace_stack() const {
+    return gtest_trace_stack_.get();  // Read-only access via get().
+  }
+
+#if GTEST_HAS_DEATH_TEST
+  void InitDeathTestSubprocessControlInfo() {  // Stores the parsed flag.
+    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
+  }
+  // Returns a pointer to the parsed --gtest_internal_run_death_test
+  // flag, or NULL if that flag was not specified.
+  // This information is useful only in a death test child process.
+  // Must not be called before a call to InitGoogleTest.
+  const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
+    return internal_run_death_test_flag_.get();
+  }
+
+  // Returns a pointer to the current death test factory.
+  internal::DeathTestFactory* death_test_factory() {
+    return death_test_factory_.get();
+  }
+
+  void SuppressTestEventsIfInSubprocess();
+
+  friend class ReplaceDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // Initializes the event listener performing XML output as specified by
+  // UnitTestOptions. Must not be called before InitGoogleTest.
+  void ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+  // Initializes the event listener for streaming test results to a socket.
+  // Must not be called before InitGoogleTest.
+  void ConfigureStreamingOutput();
+#endif
+
+  // Performs initialization dependent upon flag values obtained in
+  // ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+  // ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+  // this function is also called from RunAllTests.  Since this function can be
+  // called more than once, it has to be idempotent.
+  void PostFlagParsingInit();
+
+  // Gets the random seed used at the start of the current test iteration.
+  int random_seed() const { return random_seed_; }
+
+  // Gets the random number generator.
+  internal::Random* random() { return &random_; }
+
+  // Shuffles all test cases, and the tests within each test case,
+  // making sure that death tests are still run first.
+  void ShuffleTests();
+
+  // Restores the test cases and tests to their order before the first shuffle.
+  void UnshuffleTests();
+
+  // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
+  // UnitTest::Run() starts.
+  bool catch_exceptions() const { return catch_exceptions_; }
+
+ private:
+  friend class ::testing::UnitTest;
+
+  // Used by UnitTest::Run() to capture the state of
+  // GTEST_FLAG(catch_exceptions) at the moment it starts.
+  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
+
+  // The UnitTest object that owns this implementation object.
+  UnitTest* const parent_;
+
+  // The working directory when the first TEST() or TEST_F() was
+  // executed.
+  internal::FilePath original_working_dir_;
+
+  // The default test part result reporters.
+  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
+  DefaultPerThreadTestPartResultReporter
+      default_per_thread_test_part_result_reporter_;
+
+  // Points to (but doesn't own) the global test part result reporter.
+  TestPartResultReporterInterface* global_test_part_result_repoter_;
+
+  // Protects read and write access to global_test_part_result_repoter_ (sic).
+  internal::Mutex global_test_part_result_reporter_mutex_;
+
+  // Points to (but doesn't own) the per-thread test part result reporter.
+  internal::ThreadLocal<TestPartResultReporterInterface*>
+      per_thread_test_part_result_reporter_;
+
+  // The vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*> environments_;
+
+  // The vector of TestCases in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestCase*> test_cases_;
+
+  // Provides a level of indirection for the test case list to allow
+  // easy shuffling and restoring the test case order.  The i-th
+  // element of this vector is the index of the i-th test case in the
+  // shuffled order.
+  std::vector<int> test_case_indices_;
+
+#if GTEST_HAS_PARAM_TEST
+  // ParameterizedTestCaseRegistry object used to register value-parameterized
+  // tests.
+  internal::ParameterizedTestCaseRegistry parameterized_test_registry_;
+
+  // Indicates whether RegisterParameterizedTests() has been called already.
+  bool parameterized_tests_registered_;
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Index of the last death test case registered.  Initially -1.
+  int last_death_test_case_;
+
+  // This points to the TestCase for the currently running test.  It
+  // changes as Google Test goes through one test case after another.
+  // When no test is running, this is set to NULL and Google Test
+  // stores assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestCase* current_test_case_;
+
+  // This points to the TestInfo for the currently running test.  It
+  // changes as Google Test goes through one test after another.  When
+  // no test is running, this is set to NULL and Google Test stores
+  // assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestInfo* current_test_info_;
+
+  // Normally, a user only writes assertions inside a TEST or TEST_F,
+  // or inside a function called by a TEST or TEST_F.  Since Google
+  // Test keeps track of which test is currently running, it can
+  // associate such an assertion with the test it belongs to.
+  //
+  // If an assertion is encountered when no TEST or TEST_F is running,
+  // Google Test attributes the assertion result to an imaginary "ad hoc"
+  // test, and records the result in ad_hoc_test_result_.
+  TestResult ad_hoc_test_result_;
+
+  // The list of event listeners that can be used to track events inside
+  // Google Test.
+  TestEventListeners listeners_;
+
+  // The OS stack trace getter.  Will be deleted when the UnitTest
+  // object is destructed.  By default, an OsStackTraceGetter is used,
+  // but the user can set this field to use a custom getter if that is
+  // desired.
+  OsStackTraceGetterInterface* os_stack_trace_getter_;
+
+  // True iff PostFlagParsingInit() has been called.
+  bool post_flag_parse_init_performed_;
+
+  // The random number seed used at the beginning of the test run.
+  int random_seed_;
+
+  // Our random number generator.
+  internal::Random random_;
+
+  // The time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp_;
+
+  // How long the test took to run, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+#if GTEST_HAS_DEATH_TEST
+  // The decomposed components of the gtest_internal_run_death_test flag,
+  // parsed when RUN_ALL_TESTS is called.
+  internal::scoped_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+  internal::scoped_ptr<internal::DeathTestFactory> death_test_factory_;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
+  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
+
+  // The value of GTEST_FLAG(catch_exceptions) at the moment UnitTest::Run()
+  // starts.
+  bool catch_exceptions_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+};  // class UnitTestImpl
+
+// Shorthand for reaching the global UnitTest's implementation object.
+inline UnitTestImpl* GetUnitTestImpl() {
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  return unit_test->impl();
+}
+
+#if GTEST_USES_SIMPLE_RE
+
+// Internal helper functions for implementing the simple regular
+// expression matcher.
+GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+
+#endif  // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ std::string GetLastErrnoDescription();
+
+# if GTEST_OS_WINDOWS
+// Owns a Windows kernel handle and closes it when replaced or destroyed.
+class AutoHandle {
+ public:
+  AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
+  explicit AutoHandle(HANDLE handle) : handle_(handle) {}
+
+  ~AutoHandle() { Reset(); }
+
+  HANDLE Get() const { return handle_; }
+  void Reset() { Reset(INVALID_HANDLE_VALUE); }
+  // Takes over 'handle', closing the previously held one if it was valid.
+  void Reset(HANDLE handle) {
+    if (handle == handle_) return;  // Already holding it; nothing to do.
+    const bool owns_open_handle = handle_ != INVALID_HANDLE_VALUE;
+    if (owns_open_handle) ::CloseHandle(handle_);
+    handle_ = handle;
+  }
+
+ private:
+  HANDLE handle_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+};
+# endif  // GTEST_OS_WINDOWS
+
+// Attempts to parse a string into a non-negative integer stored through the
+// number parameter.  Returns true if that is possible; on failure *number is
+// left untouched.  GTEST_HAS_DEATH_TEST implies that we have ::std::string,
+// so we can use it here.
+template <typename Integer>
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+  // Insist on a leading digit up front: strtoXXX would otherwise accept
+  // optional leading whitespace and a plus or minus sign, which is
+  // undesirable here.
+  const bool starts_with_digit = !str.empty() && IsDigit(str[0]);
+  if (!starts_with_digit) return false;
+
+  // BiggestConvertible is the largest integer type that system-provided
+  // string-to-number conversion routines can return.
+  char* tail;
+  errno = 0;
+
+# if GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  // MSVC and C++ Builder define __int64 instead of the standard long long.
+  typedef unsigned __int64 BiggestConvertible;
+  const BiggestConvertible raw_value = _strtoui64(str.c_str(), &tail, 10);
+
+# else
+
+  typedef unsigned long long BiggestConvertible;  // NOLINT
+  const BiggestConvertible raw_value = strtoull(str.c_str(), &tail, 10);
+
+# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  const bool fully_parsed = *tail == '\0' && errno == 0;
+
+  // TODO(vladl@google.com): Convert this to compile time assertion when it is
+  // available.
+  GTEST_CHECK_(sizeof(Integer) <= sizeof(raw_value));
+
+  // Reject values that do not survive the round trip through Integer.
+  const Integer narrowed = static_cast<Integer>(raw_value);
+  const bool fits = static_cast<BiggestConvertible>(narrowed) == raw_value;
+  if (!fully_parsed || !fits) return false;
+
+  *number = narrowed;
+  return true;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// Google Test users but are required for testing. This class allows our
+// tests to access them.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs. Do not use it in user tests, either directly or indirectly.
+class TestResultAccessor {
+ public:
+  static void RecordProperty(TestResult* test_result,
+                             const std::string& xml_element,
+                             const TestProperty& property) {
+    test_result->RecordProperty(xml_element, property);
+  }
+
+  static void ClearTestPartResults(TestResult* test_result) {
+    test_result->ClearTestPartResults();
+  }
+
+  static const std::vector<testing::TestPartResult>& test_part_results(
+      const TestResult& test_result) {
+    return test_result.test_part_results();
+  }
+};
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Streams test results to the given port on the given host machine.
+class StreamingListener : public EmptyTestEventListener {
+ public:
+  // Abstract base class for writing strings to a socket.
+  class AbstractSocketWriter {
+   public:
+    virtual ~AbstractSocketWriter() {}
+
+    // Sends a string to the socket.
+    virtual void Send(const string& message) = 0;
+
+    // Closes the socket.  The default implementation does nothing.
+    virtual void CloseConnection() {}
+
+    // Sends a string and a newline to the socket.
+    void SendLn(const string& message) {
+      Send(message + "\n");
+    }
+  };
+
+  // Concrete class for actually writing strings to a socket.
+  class SocketWriter : public AbstractSocketWriter {
+   public:
+    SocketWriter(const string& host, const string& port)
+        : sockfd_(-1), host_name_(host), port_num_(port) {
+      MakeConnection();
+    }
+
+    virtual ~SocketWriter() {
+      if (sockfd_ != -1)
+        CloseConnection();
+    }
+
+    // Sends a string to the socket; a short or failed write is only logged.
+    virtual void Send(const string& message) {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "Send() can be called only when there is a connection.";
+
+      const int len = static_cast<int>(message.length());
+      if (write(sockfd_, message.c_str(), len) != len) {
+        GTEST_LOG_(WARNING)
+            << "stream_result_to: failed to stream to "
+            << host_name_ << ":" << port_num_;
+      }
+    }
+
+   private:
+    // Creates a client socket and connects to the server.
+    void MakeConnection();
+
+    // Closes the socket and resets sockfd_ to -1 to mark it disconnected.
+    void CloseConnection() {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "CloseConnection() can be called only when there is a connection.";
+
+      close(sockfd_);
+      sockfd_ = -1;
+    }
+
+    int sockfd_;  // socket file descriptor
+    const string host_name_;
+    const string port_num_;
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
+  };  // class SocketWriter
+
+  // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
+  static string UrlEncode(const char* str);
+
+  StreamingListener(const string& host, const string& port)
+      : socket_writer_(new SocketWriter(host, port)) { Start(); }
+
+  explicit StreamingListener(AbstractSocketWriter* socket_writer)
+      : socket_writer_(socket_writer) { Start(); }
+
+  void OnTestProgramStart(const UnitTest& /* unit_test */) {
+    SendLn("event=TestProgramStart");
+  }
+
+  void OnTestProgramEnd(const UnitTest& unit_test) {
+    // Note that Google Test currently only reports elapsed time for each
+    // test iteration, not for the entire test program.
+    SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
+
+    // Notify the streaming server to stop.
+    socket_writer_->CloseConnection();
+  }
+
+  void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) {
+    SendLn("event=TestIterationStart&iteration=" +
+           StreamableToString(iteration));
+  }
+
+  void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) {
+    SendLn("event=TestIterationEnd&passed=" +
+           FormatBool(unit_test.Passed()) + "&elapsed_time=" +
+           StreamableToString(unit_test.elapsed_time()) + "ms");
+  }
+
+  void OnTestCaseStart(const TestCase& test_case) {
+    SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
+  }
+
+  void OnTestCaseEnd(const TestCase& test_case) {
+    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed())
+           + "&elapsed_time=" + StreamableToString(test_case.elapsed_time())
+           + "ms");
+  }
+
+  void OnTestStart(const TestInfo& test_info) {
+    SendLn(std::string("event=TestStart&name=") + test_info.name());
+  }
+
+  void OnTestEnd(const TestInfo& test_info) {
+    SendLn("event=TestEnd&passed=" +
+           FormatBool((test_info.result())->Passed()) +
+           "&elapsed_time=" +
+           StreamableToString((test_info.result())->elapsed_time()) + "ms");
+  }
+
+  void OnTestPartResult(const TestPartResult& test_part_result) {
+    const char* file_name = test_part_result.file_name();
+    if (file_name == NULL)
+      file_name = "";
+    SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
+           "&line=" + StreamableToString(test_part_result.line_number()) +
+           "&message=" + UrlEncode(test_part_result.message()));
+  }
+
+ private:
+  // Sends the given message and a newline to the socket.
+  void SendLn(const string& message) { socket_writer_->SendLn(message); }
+
+  // Called at the start of streaming to notify the receiver what
+  // protocol we are using.
+  void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
+
+  string FormatBool(bool value) { return value ? "1" : "0"; }
+
+  const scoped_ptr<AbstractSocketWriter> socket_writer_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+};  // class StreamingListener
+
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
+#undef GTEST_IMPLEMENTATION_
+
+#if GTEST_OS_WINDOWS
+# define vsnprintf _vsnprintf
+#endif  // GTEST_OS_WINDOWS
+
+namespace testing {
+
+using internal::CountIf;
+using internal::ForEach;
+using internal::GetElementOr;
+using internal::Shuffle;
+
+// Constants.
+
+// A test whose test case name or test name matches this filter is
+// disabled and not run.
+static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
+
+// A test case whose name matches this filter is considered a death
+// test case and will be run before test cases whose name doesn't
+// match this filter.
+static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*";
+
+// A test filter that matches everything.
+static const char kUniversalFilter[] = "*";
+
+// The default output file for XML output.
+static const char kDefaultOutputFile[] = "test_detail.xml";
+
+// The environment variable name for the test shard index.
+static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
+// The environment variable name for the total number of test shards.
+static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS";
+// The environment variable name for the test shard status file.
+static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE";
+
+namespace internal {
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+const char kStackTraceMarker[] = "\nStack trace:\n";
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+bool g_help_flag = false;
+
+}  // namespace internal
+
+static const char* GetDefaultFilter() {  // Fallback for the 'filter' flag.
+  return kUniversalFilter;
+}
+
+GTEST_DEFINE_bool_(
+    also_run_disabled_tests,
+    internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+    "Run disabled tests too, in addition to the tests normally being run.");
+
+GTEST_DEFINE_bool_(
+    break_on_failure,
+    internal::BoolFromGTestEnv("break_on_failure", false),
+    "True iff a failed assertion should be a debugger break-point.");
+
+GTEST_DEFINE_bool_(
+    catch_exceptions,
+    internal::BoolFromGTestEnv("catch_exceptions", true),
+    "True iff " GTEST_NAME_
+    " should catch exceptions and treat them as test failures.");
+
+GTEST_DEFINE_string_(
+    color,
+    internal::StringFromGTestEnv("color", "auto"),
+    "Whether to use colors in the output.  Valid values: yes, no, "
+    "and auto.  'auto' means to use colors if the output is "
+    "being sent to a terminal and the TERM environment variable "
+    "is set to a terminal type that supports colors.");
+
+GTEST_DEFINE_string_(
+    filter,
+    internal::StringFromGTestEnv("filter", GetDefaultFilter()),
+    "A colon-separated list of glob (not regex) patterns "
+    "for filtering the tests to run, optionally followed by a "
+    "'-' and a : separated list of negative patterns (tests to "
+    "exclude).  A test is run if it matches one of the positive "
+    "patterns and does not match any of the negative patterns.");
+
+GTEST_DEFINE_bool_(list_tests, false,  // Literal default; no env lookup.
+                   "List all tests without running them.");
+
+GTEST_DEFINE_string_(
+    output,
+    internal::StringFromGTestEnv("output", ""),
+    "A format (currently must be \"xml\"), optionally followed "
+    "by a colon and an output file name or directory. A directory "
+    "is indicated by a trailing pathname separator. "
+    "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
+    "If a directory is specified, output files will be created "
+    "within that directory, with file-names based on the test "
+    "executable's name and, if necessary, made unique by adding "
+    "digits.");
+
+GTEST_DEFINE_bool_(
+    print_time,
+    internal::BoolFromGTestEnv("print_time", true),
+    "True iff " GTEST_NAME_
+    " should display elapsed time in text output.");
+
+GTEST_DEFINE_int32_(
+    random_seed,
+    internal::Int32FromGTestEnv("random_seed", 0),
+    "Random number seed to use when shuffling test orders.  Must be in range "
+    "[1, 99999], or 0 to use a seed based on the current time.");
+
+GTEST_DEFINE_int32_(
+    repeat,
+    internal::Int32FromGTestEnv("repeat", 1),
+    "How many times to repeat each test.  Specify a negative number "
+    "for repeating forever.  Useful for shaking out flaky tests.");
+
+GTEST_DEFINE_bool_(
+    show_internal_stack_frames, false,  // Literal default; no env lookup.
+    "True iff " GTEST_NAME_ " should include internal stack frames when "
+    "printing test failure stack traces.");
+
+GTEST_DEFINE_bool_(
+    shuffle,
+    internal::BoolFromGTestEnv("shuffle", false),
+    "True iff " GTEST_NAME_
+    " should randomize tests' order on every run.");
+
+GTEST_DEFINE_int32_(
+    stack_trace_depth,
+    internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
+    "The maximum number of stack frames to print when an "
+    "assertion fails.  The valid range is 0 through 100, inclusive.");
+
+GTEST_DEFINE_string_(
+    stream_result_to,
+    internal::StringFromGTestEnv("stream_result_to", ""),
+    "This flag specifies the host name and the port number on which to stream "
+    "test results. Example: \"localhost:555\". The flag is effective only on "
+    "Linux.");
+
+GTEST_DEFINE_bool_(
+    throw_on_failure,
+    internal::BoolFromGTestEnv("throw_on_failure", false),
+    "When this flag is specified, a failed assertion will throw an exception "
+    "if exceptions are enabled or exit the program with a non-zero code "
+    "otherwise.");
+
+namespace internal {
+
+// Generates a random number from [0, range), using a Linear
+// Congruential Generator (LCG); every call advances state_.  Crashes
+// (via GTEST_CHECK_) if 'range' is 0 or greater than kMaxRange.
+UInt32 Random::Generate(UInt32 range) {
+  // Advances the state; the constants are the ones used in glibc's rand(3).
+  state_ = (1103515245U*state_ + 12345U) % kMaxRange;
+
+  GTEST_CHECK_(range > 0)
+      << "Cannot generate a number in the range [0, 0).";
+  GTEST_CHECK_(range <= kMaxRange)
+      << "Generation of a number in [0, " << range << ") was requested, "
+      << "but this can only generate numbers in [0, " << kMaxRange << ").";
+
+  // Converting via modulus introduces a bit of downward bias, but
+  // it's simple, and a linear congruential generator isn't too good
+  // to begin with.
+  return state_ % range;
+}
+
+// GTestIsInitialized() returns true iff the user has initialized
+// Google Test.  Useful for catching the user mistake of not initializing
+// Google Test before calling RUN_ALL_TESTS().
+//
+// A user must call testing::InitGoogleTest() to initialize Google
+// Test.  g_init_gtest_count is set to the number of times
+// InitGoogleTest() has been called.  We don't protect this variable
+// under a mutex as it is only accessed in the main thread.
+GTEST_API_ int g_init_gtest_count = 0;
+static bool GTestIsInitialized() { return g_init_gtest_count != 0; }
+
+// Invokes the given int-returning const member function on every TestCase
+// in case_list and returns the total of the results.
+static int SumOverTestCaseList(const std::vector<TestCase*>& case_list,
+                               int (TestCase::*method)() const) {
+  typedef std::vector<TestCase*>::const_iterator CaseIterator;
+  int total = 0;
+  for (CaseIterator it = case_list.begin(); it != case_list.end(); ++it) {
+    total += ((*it)->*method)();
+  }
+  return total;
+}
+
+// Returns true iff test_case->should_run() holds and the test case passed.
+static bool TestCasePassed(const TestCase* test_case) {
+  return test_case->should_run() && test_case->Passed();
+}
+
+// Returns true iff test_case->should_run() holds and the test case failed.
+static bool TestCaseFailed(const TestCase* test_case) {
+  return test_case->should_run() && test_case->Failed();
+}
+
+// Returns true iff test_case contains at least one test that should
+// run.
+static bool ShouldRunTestCase(const TestCase* test_case) {
+  return test_case->should_run();
+}
+
+// AssertHelper constructor; heap-allocates the AssertHelperData in data_.
+AssertHelper::AssertHelper(TestPartResult::Type type,
+                           const char* file,
+                           int line,
+                           const char* message)
+    : data_(new AssertHelperData(type, file, line, message)) {
+}
+
+AssertHelper::~AssertHelper() {
+  delete data_;  // Allocated with new in the constructor.
+}
+
+// Message assignment, for assertion streaming support: hands the stored
+// result data, with the streamed user message appended, to the UnitTest.
+void AssertHelper::operator=(const Message& message) const {
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  unit_test->AddTestPartResult(
+      data_->type, data_->file, data_->line,
+      AppendUserMessage(data_->message, message),
+      // Passing 1 skips the stack frame for this function itself.
+      unit_test->impl()->CurrentOsStackTraceExceptTop(1));  // NOLINT
+}
+
+// Mutex for linked pointers.
+GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// Application pathname gotten in InitGoogleTest.
+std::string g_executable_path;
+
+// Returns the name of the running executable taken from g_executable_path,
+// without its directory part (and, on Windows, without the "exe" extension).
+FilePath GetCurrentExecutableName() {
+  FilePath full_path(g_executable_path);
+  FilePath result;
+#if GTEST_OS_WINDOWS
+  result.Set(full_path.RemoveExtension("exe"));
+#else
+  result.Set(full_path);
+#endif  // GTEST_OS_WINDOWS
+
+  return result.RemoveDirectoryName();
+}
+
+// Functions for processing the gtest_output flag.
+
+// Returns the output format, i.e. the part of the output flag before the
+// first ':' (the whole flag if there is none); "" means normal printed output.
+std::string UnitTestOptions::GetOutputFormat() {
+  const char* const flag_text = GTEST_FLAG(output).c_str();
+  if (flag_text == NULL) return std::string("");
+
+  const char* const separator = strchr(flag_text, ':');
+  if (separator == NULL) return std::string(flag_text);
+  return std::string(flag_text, separator - flag_text);
+}
+
+// Returns the name of the requested output file, or the default if none
+// was explicitly specified.  Relative names are resolved against the
+// original working directory.
+std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
+  const char* const flag_text = GTEST_FLAG(output).c_str();
+  if (flag_text == NULL) return "";
+
+  const char* const separator = strchr(flag_text, ':');
+  if (separator == NULL) {
+    return internal::FilePath::ConcatPaths(
+        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(kDefaultOutputFile)).string();
+  }
+
+  const char* const requested = separator + 1;
+  internal::FilePath output_name(requested);
+  if (!output_name.IsAbsolutePath()) {
+    // TODO(wan@google.com): on Windows \some\path is not an absolute
+    // path (as its meaning depends on the current drive), yet the
+    // following logic for turning it into an absolute path is wrong.
+    // Fix it.
+    output_name = internal::FilePath::ConcatPaths(
+        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(requested));
+  }
+
+  if (!output_name.IsDirectory()) return output_name.string();
+
+  return internal::FilePath::GenerateUniqueFileName(
+      output_name, internal::GetCurrentExecutableName(),
+      GetOutputFormat().c_str()).string();
+}
+
+// Returns true iff the wildcard pattern matches the string.  The
+// first ':' or '\0' character in pattern marks the end of it.
+// '?' stands for any single character and '*' for any (possibly
+// empty) run of characters; every other character matches itself.
+//
+// This recursive algorithm isn't very efficient, but is clear and
+// works well enough for matching test names, which are short.
+bool UnitTestOptions::PatternMatchesString(const char *pattern,
+                                           const char *str) {
+  const char head = *pattern;
+  const bool str_is_empty = *str == '\0';
+
+  if (head == '\0' || head == ':')  // End of the pattern.
+    return str_is_empty;
+  if (head == '?')
+    return !str_is_empty && PatternMatchesString(pattern + 1, str + 1);
+  if (head == '*')  // Either absorb one more character or stop here.
+    return (!str_is_empty && PatternMatchesString(pattern, str + 1)) ||
+        PatternMatchesString(pattern + 1, str);
+  return head == *str && PatternMatchesString(pattern + 1, str + 1);
+}
+
+bool UnitTestOptions::MatchesFilter(
+    const std::string& name, const char* filter) {
+  // 'filter' is a ':'-separated list of patterns; try them one by one
+  // until one of them matches 'name'.
+  const char* const subject = name.c_str();
+  const char* candidate = filter;
+  while (!PatternMatchesString(candidate, subject)) {
+    // Finds the next pattern in the filter.
+    const char* const separator = strchr(candidate, ':');
+
+    // Returns if no more pattern can be found.
+    if (separator == NULL) {
+      return false;
+    }
+
+    // Skips the pattern separator (the ':' character).
+    candidate = separator + 1;
+  }
+  return true;
+}
+
+// Returns true iff the user-specified filter matches the test case
+// name and the test name.
+bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name,
+                                        const std::string &test_name) {
+  const std::string& full_name = test_case_name + "." + test_name.c_str();
+
+  // Split --gtest_filter at the first '-', if there is one, into the
+  // positive patterns (before it) and the negative patterns (after it).
+  const char* const filter_text = GTEST_FLAG(filter).c_str();
+  const char* const dash = strchr(filter_text, '-');
+  std::string positive;
+  std::string negative;
+  if (dash != NULL) {
+    positive.assign(filter_text, dash);  // Everything up to the dash
+    negative.assign(dash + 1);           // Everything after the dash
+    if (positive.empty()) {
+      // Treat '-test1' as the same as '*-test1'
+      positive = kUniversalFilter;
+    }
+  } else {
+    positive = filter_text;  // Whole string is a positive filter
+    negative.clear();
+  }
+
+  // A filter is a colon-separated list of patterns.  It matches a
+  // test if any pattern in it matches the test.
+  if (!MatchesFilter(full_name, positive.c_str())) return false;
+  return !MatchesFilter(full_name, negative.c_str());
+}
+
+#if GTEST_HAS_SEH
+// Decides whether Google Test should handle the given SEH exception:
+// returns EXCEPTION_EXECUTE_HANDLER if so, EXCEPTION_CONTINUE_SEARCH
+// otherwise.  This function is useful as an __except condition.
+int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
+  // Google Test should handle a SEH exception if:
+  //   1. the user wants it to, AND
+  //   2. this is not a breakpoint exception, AND
+  //   3. this is not a C++ exception (VC++ implements them via SEH,
+  //      apparently).
+  //
+  // SEH exception code for C++ exceptions.
+  // (see http://support.microsoft.com/kb/185294 for more information).
+  const DWORD kCxxExceptionCode = 0xe06d7363;
+
+  // Conditions 1-3 above, named in the order in which they are listed.
+  const bool user_wants_handling = GTEST_FLAG(catch_exceptions);
+  const bool is_breakpoint = exception_code == EXCEPTION_BREAKPOINT;
+  const bool is_cxx_exception = exception_code == kCxxExceptionCode;
+
+  if (user_wants_handling && !is_breakpoint && !is_cxx_exception) {
+    return EXCEPTION_EXECUTE_HANDLER;
+  }
+
+  return EXCEPTION_CONTINUE_SEARCH;
+}
+#endif  // GTEST_HAS_SEH
+
+}  // namespace internal
+
+// The c'tor installs this object, via Init(), as the test part result
+// reporter used by Google Test.  'result' specifies where to report the
+// results.  The intercept mode is fixed to INTERCEPT_ONLY_CURRENT_THREAD.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    TestPartResultArray* result)
+    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
+      result_(result) {
+  Init();
+}
+
+// The c'tor installs this object, via Init(), as the test part result
+// reporter used by Google Test.  'intercept_mode' selects which reporter
+// is replaced; 'result' specifies where to report the results.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    InterceptMode intercept_mode, TestPartResultArray* result)
+    : intercept_mode_(intercept_mode),
+      result_(result) {
+  Init();
+}
+
+void ScopedFakeTestPartResultReporter::Init() {
+  internal::UnitTestImpl* const unit_test = internal::GetUnitTestImpl();
+  if (intercept_mode_ != INTERCEPT_ALL_THREADS) {  // per-thread reporter
+    old_reporter_ = unit_test->GetTestPartResultReporterForCurrentThread();
+    unit_test->SetTestPartResultReporterForCurrentThread(this);
+  } else {  // all threads: saves and replaces the global reporter
+    old_reporter_ = unit_test->GetGlobalTestPartResultReporter();
+    unit_test->SetGlobalTestPartResultReporter(this);
+  }
+}
+
+// The d'tor puts back the reporter saved in old_reporter_, into the
+// global or the per-thread slot depending on intercept_mode_.
+ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
+  internal::UnitTestImpl* const unit_test = internal::GetUnitTestImpl();
+  if (intercept_mode_ != INTERCEPT_ALL_THREADS) {
+    unit_test->SetTestPartResultReporterForCurrentThread(old_reporter_);
+  } else {
+    unit_test->SetGlobalTestPartResultReporter(old_reporter_);
+  }
+}
+
+// Appends 'result' to the TestPartResultArray that result_ points to.
+// This method is from the TestPartResultReporterInterface interface.
+void ScopedFakeTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  result_->Append(result);
+}
+
+namespace internal {
+
+// Returns the type ID of ::testing::Test.  We should always call this
+// instead of GetTypeId< ::testing::Test>() to get the type ID of
+// testing::Test.  This is to work around a suspected linker bug when
+// using Google Test as a framework on Mac OS X.  The bug causes
+// GetTypeId< ::testing::Test>() to return different values depending
+// on whether the call is from the Google Test framework itself or
+// from user test code.  GetTestTypeId() is guaranteed to always
+// return the same value, as it always calls GetTypeId<>() from
+// gtest.cc, which is within the Google Test framework.
+TypeId GetTestTypeId() {
+  return GetTypeId<Test>();
+}
+
+// The value GetTestTypeId() returns when called from within the Google
+// Test library itself.  This is solely for testing GetTestTypeId().
+extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
+
+// Predicate-formatter that succeeds iff 'results' contains exactly one
+// test part result, that result has the given type, and its message
+// contains the given substring.
+AssertionResult HasOneFailure(const char* /* results_expr */,
+                              const char* /* type_expr */,
+                              const char* /* substr_expr */,
+                              const TestPartResultArray& results,
+                              TestPartResult::Type type,
+                              const string& substr) {
+  const bool want_fatal = (type == TestPartResult::kFatalFailure);
+  const std::string expected(want_fatal ? "1 fatal failure"
+                                        : "1 non-fatal failure");
+  Message msg;
+  if (results.size() != 1) {
+    // Wrong count: reports it, followed by every result that was seen.
+    msg << "Expected: " << expected << "\n"
+        << "  Actual: " << results.size() << " failures";
+    for (int index = 0; index < results.size(); ++index) {
+      msg << "\n" << results.GetTestPartResult(index);
+    }
+    return AssertionFailure() << msg;
+  }
+
+  const TestPartResult& only_result = results.GetTestPartResult(0);
+  if (only_result.type() != type) {
+    return AssertionFailure() << "Expected: " << expected << "\n"
+                              << "  Actual:\n" << only_result;
+  }
+
+  if (strstr(only_result.message(), substr.c_str()) == NULL) {
+    return AssertionFailure() << "Expected: " << expected << " containing \""
+                              << substr << "\"\n"
+                              << "  Actual:\n"
+                              << only_result;
+  }
+
+  return AssertionSuccess();
+}
+
+// The constructor of SingleFailureChecker performs no checking.  It
+// only remembers where to look up test part results, what type of
+// failure we expect, and what substring the failure message should
+// contain; these members are read back when the destructor runs the
+// actual verification.
+SingleFailureChecker::SingleFailureChecker(
+    const TestPartResultArray* results,
+    TestPartResult::Type type,
+    const string& substr)
+    : results_(results), type_(type), substr_(substr) {}
+
+// The destructor of SingleFailureChecker verifies, via the HasOneFailure
+// predicate-formatter, that the given TestPartResultArray contains exactly
+// one failure that has the given type and contains the given substring.
+// If that's not the case, a non-fatal failure will be generated.
+SingleFailureChecker::~SingleFailureChecker() {
+  EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_);
+}
+
+DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
+    UnitTestImpl* unit_test) : unit_test_(unit_test) {}  // Only stores it.
+
+void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  unit_test_->current_test_result()->AddTestPartResult(result);  // Record.
+  unit_test_->listeners()->repeater()->OnTestPartResult(result);  // Notify.
+}
+
+DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
+    UnitTestImpl* unit_test) : unit_test_(unit_test) {}  // Only stores it.
+
+void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {  // Delegates to the global reporter.
+  unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result);
+}
+
+// Returns the global test part result reporter (read under its mutex).
+TestPartResultReporterInterface*
+UnitTestImpl::GetGlobalTestPartResultReporter() {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  return global_test_part_result_repoter_;
+}
+
+// Sets the global test part result reporter (written under its mutex).
+void UnitTestImpl::SetGlobalTestPartResultReporter(
+    TestPartResultReporterInterface* reporter) {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  global_test_part_result_repoter_ = reporter;
+}
+
+// Returns the reporter held in the per-thread slot for the current thread.
+TestPartResultReporterInterface*
+UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
+  return per_thread_test_part_result_reporter_.get();
+}
+
+// Stores 'reporter' in the per-thread slot for the current thread.
+void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
+    TestPartResultReporterInterface* reporter) {
+  per_thread_test_part_result_reporter_.set(reporter);
+}
+
+// Counts the test cases in test_cases_ that satisfy TestCasePassed.
+int UnitTestImpl::successful_test_case_count() const {
+  return CountIf(test_cases_, TestCasePassed);
+}
+
+// Counts the test cases in test_cases_ that satisfy TestCaseFailed.
+int UnitTestImpl::failed_test_case_count() const {
+  return CountIf(test_cases_, TestCaseFailed);
+}
+
+// Gets the number of all test cases, i.e. test_cases_.size() as an int.
+int UnitTestImpl::total_test_case_count() const {
+  return static_cast<int>(test_cases_.size());
+}
+
+// Gets the number of all test cases that contain at least one test
+// that should run, counted with the ShouldRunTestCase predicate.
+int UnitTestImpl::test_case_to_run_count() const {
+  return CountIf(test_cases_, ShouldRunTestCase);
+}
+
+// Gets the number of successful tests, summed over all test cases.
+int UnitTestImpl::successful_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count);
+}
+
+// Gets the number of failed tests, summed over all test cases.
+int UnitTestImpl::failed_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count);
+}
+
+// Sums, over all test cases, the disabled tests reported in the XML report.
+int UnitTestImpl::reportable_disabled_test_count() const {
+  return SumOverTestCaseList(test_cases_,
+                             &TestCase::reportable_disabled_test_count);
+}
+
+// Gets the number of disabled tests, summed over all test cases.
+int UnitTestImpl::disabled_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count);
+}
+
+// Sums, over all test cases, the tests to be printed in the XML report.
+int UnitTestImpl::reportable_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count);
+}
+
+// Gets the number of all tests, summed over all test cases.
+int UnitTestImpl::total_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::total_test_count);
+}
+
+// Gets the number of tests that should run, summed over all test cases.
+int UnitTestImpl::test_to_run_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count);
+}
+
+// Returns the current OS stack trace as an std::string.  NOTE: this
+// implementation is a stub that ignores skip_count and always returns
+// the empty string; the rest of this comment describes the intended
+// contract.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  skip_count is the number of top
+// frames to skip, which doesn't count against that maximum.  E.g. if
+// Foo() calls Bar(), which calls CurrentOsStackTraceExceptTop(1), Foo()
+// is in the trace but Bar() and CurrentOsStackTraceExceptTop() aren't.
+std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
+  (void)skip_count;
+  return "";
+}
+
+// Returns the current time in milliseconds; the time source is per-platform.
+TimeInMillis GetTimeInMillis() {
+#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
+  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
+  // http://analogous.blogspot.com/2005/04/epoch.html
+  const TimeInMillis kJavaEpochToWinFileTimeDelta =
+    static_cast<TimeInMillis>(116444736UL) * 100000UL;
+  const DWORD kTenthMicrosInMilliSecond = 10000;  // 0.1us units per ms.
+
+  SYSTEMTIME now_systime;
+  FILETIME now_filetime;
+  ULARGE_INTEGER now_int64;
+  // TODO(kenton@google.com): Shouldn't this just use
+  //   GetSystemTimeAsFileTime()?
+  GetSystemTime(&now_systime);
+  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
+    now_int64.LowPart = now_filetime.dwLowDateTime;
+    now_int64.HighPart = now_filetime.dwHighDateTime;
+    now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
+      kJavaEpochToWinFileTimeDelta;
+    return now_int64.QuadPart;
+  }
+  return 0;  // SystemTimeToFileTime() failed.
+#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
+  __timeb64 now;
+
+# ifdef _MSC_VER
+
+  // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
+  // (deprecated function) there.
+  // TODO(kenton@google.com): Use GetTickCount()?  Or use
+  //   SystemTimeToFileTime()
+#  pragma warning(push)          // Saves the current warning state.
+#  pragma warning(disable:4996)  // Temporarily disables warning 4996.
+  _ftime64(&now);
+#  pragma warning(pop)           // Restores the warning state.
+# else
+
+  _ftime64(&now);
+
+# endif  // _MSC_VER
+
+  return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
+#elif GTEST_HAS_GETTIMEOFDAY_
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
+#else
+# error "Don't know how to get the current time on your system."
+#endif
+}
+
+// Utilities
+
+// class String.
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Creates a UTF-16 wide string from the given ANSI string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the wide string, or NULL if the
+// input is NULL.
+LPCWSTR String::AnsiToUtf16(const char* ansi) {
+  if (!ansi) return NULL;
+  // strlen() yields a size_t; the int conversion is made explicit here.
+  const int length = static_cast<int>(strlen(ansi));
+  // The first call passes no buffer (NULL, 0) and yields the length.
+  const int unicode_length =
+      MultiByteToWideChar(CP_ACP, 0, ansi, length, NULL, 0);
+  WCHAR* unicode = new WCHAR[unicode_length + 1];
+  MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length);
+  unicode[unicode_length] = 0;  // Terminates the string explicitly.
+  return unicode;
+}
+
+// Creates an ANSI string from the given wide string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the ANSI string, or NULL if the
+// input is NULL.
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
+  if (utf16_str == NULL) return NULL;
+  // The first call passes no buffer (NULL, 0) and yields the length.
+  const int byte_count =
+      WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, NULL, 0, NULL, NULL);
+  char* const narrow = new char[byte_count + 1];
+  WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
+                      narrow, byte_count, NULL, NULL);
+  narrow[byte_count] = 0;
+  return narrow;
+}
+
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Reports whether two C strings have the same content.
+//
+// NULL arguments are allowed, unlike with strcmp(): two NULLs compare
+// equal, while NULL differs from every non-NULL C string, including
+// the empty string.
+bool String::CStringEquals(const char * lhs, const char * rhs) {
+  if (lhs == NULL || rhs == NULL) {
+    // At least one side is NULL, so they are equal only if both are.
+    return lhs == rhs;
+  }
+  return strcmp(lhs, rhs) == 0;
+}
+
+#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
+// Converts an array of wide chars to a narrow string using the UTF-8
+// encoding, and streams the result to the given Message object.
+static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
+                                     Message* msg) {
+  size_t pos = 0;
+  while (pos != length) {
+    if (wstr[pos] == L'\0') {
+      *msg << '\0';  // An embedded NUL is streamed as a char of its own.
+      pos++;
+      continue;
+    }
+    *msg << WideStringToUtf8(wstr + pos, static_cast<int>(length - pos));
+    while (pos != length && wstr[pos] != L'\0') pos++;  // Skips that run.
+  }
+}
+
+#endif  // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
+}  // namespace internal
+
+// Constructs an empty Message.
+// We allocate the stringstream separately because otherwise each use of
+// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's
+// stack frame leading to huge stack frames in some cases; gcc does not reuse
+// the stack space.
+Message::Message() : ss_(new ::std::stringstream) {
+  // Sets the stream precision to digits10 + 2 for double, so that by
+  // default there is enough precision when printing a double to a Message.
+  *ss_ << std::setprecision(std::numeric_limits<double>::digits10 + 2);
+}
+
+// These two overloads allow streaming a wide C string to a Message using
+// the UTF-8 encoding; the conversion is done by String::ShowWideCString().
+Message& Message::operator <<(const wchar_t* wide_c_str) {
+  return *this << internal::String::ShowWideCString(wide_c_str);
+}
+Message& Message::operator <<(wchar_t* wide_c_str) {
+  return *this << internal::String::ShowWideCString(wide_c_str);
+}
+
+#if GTEST_HAS_STD_WSTRING
+// Streams the given wide string to this Message object as UTF-8, passing
+// its full length() so that embedded L'\0' characters are included.
+Message& Message::operator <<(const ::std::wstring& wstr) {
+  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+  return *this;
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+// Streams the given wide string to this Message object as UTF-8, passing
+// its full length() so that embedded L'\0' characters are included.
+Message& Message::operator <<(const ::wstring& wstr) {
+  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+  return *this;
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// Gets the text streamed to this object so far as an std::string.  The
+// conversion (StringStreamToString) replaces each '\0' with "\\0".
+std::string Message::GetString() const {
+  return internal::StringStreamToString(ss_.get());
+}
+
+// Copy constructor: copies the success flag and makes a separate copy of
+// the message, if any.  Used in EXPECT_TRUE/FALSE(assertion_result).
+AssertionResult::AssertionResult(const AssertionResult& other)
+    : success_(other.success_),
+      message_(other.message_.get() != NULL ?
+               new ::std::string(*other.message_) :
+               static_cast< ::std::string*>(NULL)) {
+}
+
+// Returns the assertion's negation: the success flag is inverted and the
+// message, if there is one, is kept.  Used with EXPECT/ASSERT_FALSE.
+AssertionResult AssertionResult::operator!() const {
+  AssertionResult inverted(!success_);
+  if (message_.get() != NULL) inverted << *message_;
+  return inverted;
+}
+
+// Makes a successful assertion result, i.e. AssertionResult(true).
+AssertionResult AssertionSuccess() {
+  return AssertionResult(true);
+}
+
+// Makes a failed assertion result, i.e. AssertionResult(false).
+AssertionResult AssertionFailure() {
+  return AssertionResult(false);
+}
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; callers should write AssertionFailure() << message instead.
+AssertionResult AssertionFailure(const Message& message) {
+  return AssertionFailure() << message;
+}
+
+namespace internal {
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   expected_expression: "foo"
+//   actual_expression:   "bar"
+//   expected_value:      "5"
+//   actual_value:        "6"
+//
+// A value is left out of the message when it is identical to its
+// expression text.  The ignoring_case parameter is true iff the assertion
+// is a *_STRCASEEQ*; then " (ignoring case)" is inserted into the message.
+AssertionResult EqFailure(const char* expected_expression,
+                          const char* actual_expression,
+                          const std::string& expected_value,
+                          const std::string& actual_value,
+                          bool ignoring_case) {
+  // Printing a value that merely repeats its expression adds nothing.
+  const bool show_actual_value = (actual_value != actual_expression);
+  const bool show_expected_value = (expected_value != expected_expression);
+
+  Message failure_text;
+  failure_text << "Value of: " << actual_expression;
+  if (show_actual_value) failure_text << "\n  Actual: " << actual_value;
+
+  failure_text << "\nExpected: " << expected_expression;
+  if (ignoring_case) failure_text << " (ignoring case)";
+  if (show_expected_value) {
+    failure_text << "\nWhich is: " << expected_value;
+  }
+
+  return AssertionFailure() << failure_text;
+}
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+// The assertion result's own message, if non-empty, follows the actual value.
+std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result,
+    const char* expression_text,
+    const char* actual_predicate_value,
+    const char* expected_predicate_value) {
+  const char* const details = assertion_result.message();
+  Message text;
+  text << "Value of: " << expression_text
+       << "\n  Actual: " << actual_predicate_value;
+  if (details[0] != '\0') text << " (" << details << ")";
+  text << "\nExpected: " << expected_predicate_value;
+  return text.GetString();
+}
+
+// Helper for ASSERT_NEAR: succeeds iff fabs(val1 - val2) <= abs_error.
+AssertionResult DoubleNearPredFormat(const char* expr1,
+                                     const char* expr2,
+                                     const char* abs_error_expr,
+                                     double val1,
+                                     double val2,
+                                     double abs_error) {
+  const double diff = fabs(val1 - val2);
+  if (diff <= abs_error) return AssertionSuccess();
+
+  // TODO(wan): do not print the value of an expression if it's
+  // already a literal.
+  return AssertionFailure()
+      << "The difference between " << expr1 << " and " << expr2
+      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
+      << expr1 << " evaluates to " << val1 << ",\n"
+      << expr2 << " evaluates to " << val2 << ", and\n"
+      << abs_error_expr << " evaluates to " << abs_error << ".";
+}
+
+
+// Helper template for implementing FloatLE() and DoubleLE().  expr1 and
+// expr2 are the source texts of the two operands; val1 and val2 are
+// their values.  Succeeds iff val1 is less than or almost equal to val2.
+template <typename RawType>
+AssertionResult FloatingPointLE(const char* expr1,
+                                const char* expr2,
+                                RawType val1,
+                                RawType val2) {
+  // Returns success if val1 is less than val2, or if val1 is almost equal
+  // to val2 according to FloatingPoint<RawType>::AlmostEquals().
+  typedef FloatingPoint<RawType> Fp;
+  if (val1 < val2 || Fp(val1).AlmostEquals(Fp(val2))) {
+    return AssertionSuccess();
+  }
+
+  // Note that the above two checks will both fail if either val1 or
+  // val2 is NaN, as the IEEE floating-point standard requires that
+  // any predicate involving a NaN must return false.
+
+  // Both values are printed with digits10 + 2 significant digits.
+  const int precision = std::numeric_limits<RawType>::digits10 + 2;
+
+  ::std::stringstream val1_text;
+  val1_text << std::setprecision(precision) << val1;
+
+  ::std::stringstream val2_text;
+  val2_text << std::setprecision(precision) << val2;
+
+  // Reports the expectation followed by both formatted values.
+  return AssertionFailure()
+      << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+      << "  Actual: " << StringStreamToString(&val1_text) << " vs "
+      << StringStreamToString(&val2_text);
+}
+
+}  // namespace internal
+
+// Asserts that the float val1 is less than, or almost equal to, val2 (see
+// FloatingPointLE above).  Fails otherwise, e.g. if val1 or val2 is NaN.
+AssertionResult FloatLE(const char* expr1, const char* expr2,
+                        float val1, float val2) {
+  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
+}
+
+// Asserts that the double val1 is less than, or almost equal to, val2 (see
+// FloatingPointLE above).  Fails otherwise, e.g. if val1 or val2 is NaN.
+AssertionResult DoubleLE(const char* expr1, const char* expr2,
+                         double val1, double val2) {
+  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
+}
+
+namespace internal {
+
+// The helper function for {ASSERT|EXPECT}_EQ with int or enum
+// arguments, which are compared as BiggestInt values.
+AssertionResult CmpHelperEQ(const char* expected_expression,
+                            const char* actual_expression,
+                            BiggestInt expected,
+                            BiggestInt actual) {
+  if (expected != actual) {
+    // Each value is formatted with the other one as its comparison peer.
+    return EqFailure(expected_expression,
+                     actual_expression,
+                     FormatForComparisonFailureMessage(expected, actual),
+                     FormatForComparisonFailureMessage(actual, expected),
+                     false);
+  }
+  return AssertionSuccess();
+}
+
+// A macro that defines CmpHelper<op_name>(), the helper function needed to
+// implement ASSERT_?? and EXPECT_?? with integer or enum arguments, for the
+// comparison operator 'op'.  It avoids copy-and-paste of similar code.
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                   BiggestInt val1, BiggestInt val2) {\
+  if (val1 op val2) {\
+    return AssertionSuccess();\
+  } else {\
+    return AssertionFailure() \
+        << "Expected: (" << expr1 << ") " #op " (" << expr2\
+        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
+        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
+  }\
+}
+
+// Defines CmpHelperNE(), the helper function for {ASSERT|EXPECT}_NE with
+// int or enum arguments.
+GTEST_IMPL_CMP_HELPER_(NE, !=)
+// Defines CmpHelperLE(), the helper function for {ASSERT|EXPECT}_LE with
+// int or enum arguments.
+GTEST_IMPL_CMP_HELPER_(LE, <=)
+// Defines CmpHelperLT(), the helper function for {ASSERT|EXPECT}_LT with
+// int or enum arguments.
+GTEST_IMPL_CMP_HELPER_(LT, < )
+// Defines CmpHelperGE(), the helper function for {ASSERT|EXPECT}_GE with
+// int or enum arguments.
+GTEST_IMPL_CMP_HELPER_(GE, >=)
+// Defines CmpHelperGT(), the helper function for {ASSERT|EXPECT}_GT with
+// int or enum arguments.
+GTEST_IMPL_CMP_HELPER_(GT, > )
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.  The two C strings are
+// compared with String::CStringEquals().
+AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                               const char* actual_expression,
+                               const char* expected,
+                               const char* actual) {
+  if (!String::CStringEquals(expected, actual)) {
+    return EqFailure(expected_expression,
+                     actual_expression,
+                     PrintToString(expected),
+                     PrintToString(actual),
+                     false);
+  }
+  return AssertionSuccess();
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.  A mismatch is
+// reported through EqFailure() with its ignoring_case argument set.
+AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression,
+                                   const char* actual_expression,
+                                   const char* expected,
+                                   const char* actual) {
+  if (!String::CaseInsensitiveCStringEquals(expected, actual)) {
+    return EqFailure(expected_expression,
+                     actual_expression,
+                     PrintToString(expected),
+                     PrintToString(actual),
+                     true);
+  }
+  return AssertionSuccess();
+}
+
+// The helper function for {ASSERT|EXPECT}_STRNE.  Fails, quoting both
+// strings, iff String::CStringEquals() considers s1 and s2 equal.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const char* s1,
+                               const char* s2) {
+  if (String::CStringEquals(s1, s2)) {
+    return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+                              << s2_expression << "), actual: \""
+                              << s1 << "\" vs \"" << s2 << "\"";
+  }
+  return AssertionSuccess();
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.  Fails, quoting both
+// strings, iff they are equal when compared without regard to case.
+AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                   const char* s2_expression,
+                                   const char* s1,
+                                   const char* s2) {
+  if (String::CaseInsensitiveCStringEquals(s1, s2)) {
+    return AssertionFailure()
+        << "Expected: (" << s1_expression << ") != ("
+        << s2_expression << ") (ignoring case), actual: \""
+        << s1 << "\" vs \"" << s2 << "\"";
+  }
+  return AssertionSuccess();
+}
+
+}  // namespace internal
+
+namespace {
+
+// Helper functions for implementing IsSubstring() and IsNotSubstring().
+
+// This group of overloaded functions return true iff needle is a
+// substring of haystack.  NULL is considered a substring of itself
+// only.
+
+bool IsSubstringPred(const char* needle, const char* haystack) {
+  // With a NULL on either side, only NULL-in-NULL counts as a substring.
+  const bool any_null = (needle == NULL) || (haystack == NULL);
+  if (any_null) return needle == haystack;
+  return strstr(haystack, needle) != NULL;
+}
+
+bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
+  // With a NULL on either side, only NULL-in-NULL counts as a substring.
+  const bool any_null = (needle == NULL) || (haystack == NULL);
+  if (any_null) return needle == haystack;
+  return wcsstr(haystack, needle) != NULL;
+}
+
+// StringType is ::std::string or ::std::wstring; true iff find() succeeds.
+template <typename StringType>
+bool IsSubstringPred(const StringType& needle,
+                     const StringType& haystack) {
+  return haystack.find(needle) != StringType::npos;
+}
+
+// This function implements either IsSubstring() or IsNotSubstring(),
+// depending on the value of the expected_to_be_substring parameter.
+// StringType here can be const char*, const wchar_t*, ::std::string,
+// or ::std::wstring.
+template <typename StringType>
+AssertionResult IsSubstringImpl(
+    bool expected_to_be_substring,
+    const char* needle_expr, const char* haystack_expr,
+    const StringType& needle, const StringType& haystack) {
+  const bool is_substring = IsSubstringPred(needle, haystack);
+  if (is_substring == expected_to_be_substring) return AssertionSuccess();
+  // Strings whose elements are wider than one byte get an L"..." quote.
+  const char* const open_quote = (sizeof(needle[0]) > 1) ? "L\"" : "\"";
+  const char* const negation = expected_to_be_substring ? "" : "not ";
+  return AssertionFailure()
+      << "Value of: " << needle_expr << "\n"
+      << "  Actual: " << open_quote << needle << "\"\n"
+      << "Expected: " << negation
+      << "a substring of " << haystack_expr << "\n"
+      << "Which is: " << open_quote << haystack << "\"";
+}
+
+}  // namespace
+
+// IsSubstring() and IsNotSubstring() check whether needle is a
+// substring of haystack (NULL is considered a substring of itself
+// only), and return an appropriate error message when they fail.
+
+AssertionResult IsSubstring(  // Overload for narrow C strings.
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(  // Overload for wide C strings.
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(  // Overload for narrow C strings.
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(  // Overload for wide C strings.
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(  // Overload for ::std::string.
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(  // Overload for ::std::string.
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+#if GTEST_HAS_STD_WSTRING
+AssertionResult IsSubstring(  // Overload for ::std::wstring.
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(  // Overload for ::std::wstring.
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+
+namespace {
+
+// Builds the failure shared by the IsHRESULT{Success,Failure} predicates.
+AssertionResult HRESULTFailureHelper(const char* expr,
+                                     const char* expected,
+                                     long hr) {  // NOLINT
+# if GTEST_OS_WINDOWS_MOBILE
+
+  // Windows CE doesn't support FormatMessage.
+  const char error_text[] = "";
+
+# else
+
+  // Looks up the human-readable system message for the HRESULT code
+  // and since we're not passing any params to FormatMessage, we don't
+  // want inserts expanded.
+  const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
+                       FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD kBufSize = 4096;
+  // Gets the system's human readable message string for this HRESULT.
+  char error_text[kBufSize] = { '\0' };
+  DWORD message_length = ::FormatMessageA(kFlags,
+                                          0,  // no source, we're asking system
+                                          hr,  // the error
+                                          0,  // language id 0 (default lookup)
+                                          error_text,  // output buffer
+                                          kBufSize,  // buf size
+                                          NULL);  // no arguments for inserts
+  // Trims trailing white space (FormatMessage leaves a trailing CR-LF)
+  for (; message_length && IsSpace(error_text[message_length - 1]);
+          --message_length) {
+    error_text[message_length - 1] = '\0';
+  }
+
+# endif  // GTEST_OS_WINDOWS_MOBILE
+
+  const std::string error_hex("0x" + String::FormatHexInt(hr));
+  return ::testing::AssertionFailure()
+      << "Expected: " << expr << " " << expected << ".\n"
+      << "  Actual: " << error_hex << " " << error_text << "\n";
+}
+
+}  // namespace
+
+AssertionResult IsHRESULTSuccess(const char* expr, long hr) {  // NOLINT
+  // SUCCEEDED(hr) passes; any other code goes through the shared helper.
+  return SUCCEEDED(hr)
+      ? AssertionSuccess()
+      : HRESULTFailureHelper(expr, "succeeds", hr);
+}
+
+AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
+  // FAILED(hr) passes; any other code goes through the shared helper.
+  return FAILED(hr)
+      ? AssertionSuccess()
+      : HRESULTFailureHelper(expr, "fails", hr);
+}
+
+#endif  // GTEST_OS_WINDOWS
+
+// Utility functions for encoding Unicode text (wide strings) in
+// UTF-8.
+
+// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
+// like this:
+//
+// Code-point length   Encoding
+//   0 -  7 bits       0xxxxxxx
+//   8 - 11 bits       110xxxxx 10xxxxxx
+//  12 - 16 bits       1110xxxx 10xxxxxx 10xxxxxx
+//  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+// Largest code point a one-byte UTF-8 sequence can hold: 7 bits.
+const UInt32 kMaxCodePoint1 = static_cast<UInt32>(0x7F);
+
+// Largest code point a two-byte UTF-8 sequence can hold: 5 + 6 = 11 bits.
+const UInt32 kMaxCodePoint2 = static_cast<UInt32>(0x7FF);
+
+// Largest code point a three-byte UTF-8 sequence can hold: 4 + 2*6 = 16 bits.
+const UInt32 kMaxCodePoint3 = static_cast<UInt32>(0xFFFF);
+
+// Largest code point a four-byte UTF-8 sequence can hold: 3 + 3*6 = 21 bits.
+const UInt32 kMaxCodePoint4 = static_cast<UInt32>(0x1FFFFF);
+
+// Removes the n least-significant bits from *bits and returns them; *bits
+// is left holding the remaining high bits, shifted right by n positions.
+inline UInt32 ChopLowBits(UInt32* bits, int n) {
+  const UInt32 mask = (static_cast<UInt32>(1) << n) - 1;
+  const UInt32 chopped = *bits & mask;
+  *bits >>= n;
+  return chopped;
+}
+
+// Encodes one Unicode code point as a UTF-8 narrow string.
+// code_point is a UInt32 because wchar_t may be too narrow for a code point.
+// Values above kMaxCodePoint4 (more than 21 bits) cannot be encoded and
+// yield "(Invalid Unicode 0x...)" with the value in hexadecimal instead.
+std::string CodePointToUtf8(UInt32 code_point) {
+  if (code_point > kMaxCodePoint4) {
+    return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")";
+  }
+
+  // Choose the sequence length and the marker bits of the leading byte.
+  int num_bytes;
+  UInt32 lead_marker;
+  if (code_point <= kMaxCodePoint1) {
+    num_bytes = 1;
+    lead_marker = 0x00;  // 0xxxxxxx
+  } else if (code_point <= kMaxCodePoint2) {
+    num_bytes = 2;
+    lead_marker = 0xC0;  // 110xxxxx
+  } else if (code_point <= kMaxCodePoint3) {
+    num_bytes = 3;
+    lead_marker = 0xE0;  // 1110xxxx
+  } else {  // code_point <= kMaxCodePoint4
+    num_bytes = 4;
+    lead_marker = 0xF0;  // 11110xxx
+  }
+
+  char str[5];  // Four encoded bytes at most, plus the terminating NUL.
+  str[num_bytes] = '\0';
+  for (int i = num_bytes - 1; i > 0; --i)  // trailing bytes: 10xxxxxx
+    str[i] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));
+  str[0] = static_cast<char>(lead_marker | code_point);
+  return str;
+}
+
+// The following two functions only make sense if the system
+// uses UTF-16 for wide string encoding. All supported systems
+// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
+
+// Returns true iff wchar_t is 16 bits wide and (first, second) form a
+// high/low UTF-16 surrogate pair, i.e. they should be merged into one
+// code point with CreateCodePointFromUtf16SurrogatePair.
+inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
+  if (sizeof(wchar_t) != 2) return false;
+  return (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+}
+
+// Combines a UTF-16 surrogate pair into the code point it encodes.
+inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+                                                    wchar_t second) {
+  if (sizeof(wchar_t) != 2) {
+    // This function should not be called in this case; use a sane default.
+    return static_cast<UInt32>(first);
+  }
+  const UInt32 kLow10Bits = (1 << 10) - 1;
+  return (((first & kLow10Bits) << 10) | (second & kLow10Bits)) + 0x10000;
+}
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The input is interpreted according to the width of wchar_t:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameters:
+//   str:       the wide string to convert; conversion stops at the first
+//              L'\0' encountered.
+//   num_chars: upper bound on the number of wchar_t units consumed, or -1
+//              to process the whole null-terminated string.
+// Code points that CodePointToUtf8 cannot encode are emitted as
+// '(Invalid Unicode 0xXXXXXXXX)'.  With UTF-16 input, a surrogate that is
+// not part of a valid pair is encoded on its own, as if it were a
+// character from the Basic Multilingual Plane.
+std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
+  if (num_chars == -1)
+    num_chars = static_cast<int>(wcslen(str));
+
+  ::std::stringstream stream;
+  int pos = 0;
+  while (pos < num_chars && str[pos] != L'\0') {
+    UInt32 unicode_code_point;
+    // A pair is only looked for when a second unit exists within bounds.
+    if (pos + 1 < num_chars &&
+        IsUtf16SurrogatePair(str[pos], str[pos + 1])) {
+      unicode_code_point =
+          CreateCodePointFromUtf16SurrogatePair(str[pos], str[pos + 1]);
+      pos += 2;  // both halves of the pair are consumed
+    } else {
+      unicode_code_point = static_cast<UInt32>(str[pos]);
+      ++pos;
+    }
+    stream << CodePointToUtf8(unicode_code_point);
+  }
+  return StringStreamToString(&stream);
+}
+
+// Renders a wide C string as a UTF-8 std::string; a NULL pointer is
+// rendered as the literal text "(null)".
+std::string String::ShowWideCString(const wchar_t * wide_c_str) {
+  return wide_c_str == NULL
+      ? std::string("(null)")
+      : internal::WideStringToUtf8(wide_c_str, -1);
+}
+
+// Tells whether two wide C strings have identical content.
+//
+// NULL is accepted for either argument, unlike wcscmp(): two NULLs
+// compare equal, while NULL never equals a non-NULL string, not even
+// the empty one.
+bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
+  if (lhs == NULL || rhs == NULL) {
+    // With at least one NULL, only identical (both-NULL) pointers match.
+    return lhs == rhs;
+  }
+
+  return wcscmp(lhs, rhs) == 0;
+}
+
+// Implements *_STREQ for wide C strings.
+AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                               const char* actual_expression,
+                               const wchar_t* expected,
+                               const wchar_t* actual) {
+  if (!String::WideCStringEquals(expected, actual)) {
+    // Mismatch: hand both expressions and printed values to EqFailure.
+    return EqFailure(expected_expression,
+                     actual_expression,
+                     PrintToString(expected),
+                     PrintToString(actual),
+                     false);
+  }
+  return AssertionSuccess();
+}
+
+// Implements *_STRNE for wide C strings.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const wchar_t* s1,
+                               const wchar_t* s2) {
+  if (String::WideCStringEquals(s1, s2)) {
+    // The strings are equal, which is the failing case for *_STRNE.
+    return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+                              << s2_expression << "), actual: "
+                              << PrintToString(s1)
+                              << " vs " << PrintToString(s2);
+  }
+  return AssertionSuccess();
+}
+
+// Tells whether two C strings are equal when letter case is ignored.
+//
+// NULL is accepted for either argument, unlike strcasecmp(): two NULLs
+// compare equal, while NULL never equals a non-NULL string, not even
+// the empty one.
+bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
+  if (lhs == NULL || rhs == NULL) {
+    // With at least one NULL, only identical (both-NULL) pointers match.
+    return lhs == rhs;
+  }
+
+  return posix::StrCaseCmp(lhs, rhs) == 0;
+}
+
+  // Compares two wide C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike wcscasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL wide C string,
+  // including the empty string.
+  // NB: The implementations on different platforms slightly differ.
+  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
+  // environment variable. On GNU platform this method uses wcscasecmp
+  // which compares according to LC_CTYPE category of the current locale.
+  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+  // current locale.
+bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                              const wchar_t* rhs) {
+  // A NULL argument only matches another NULL.
+  if (lhs == NULL || rhs == NULL) return lhs == rhs;
+
+#if GTEST_OS_WINDOWS
+  return _wcsicmp(lhs, rhs) == 0;
+#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
+  return wcscasecmp(lhs, rhs) == 0;
+#else
+  // Android, Mac OS X and Cygwin don't define wcscasecmp, and other unknown
+  // OSes may not either, so fold each character with towlower by hand.
+  for (;; ++lhs, ++rhs) {
+    const wint_t left = towlower(*lhs);
+    const wint_t right = towlower(*rhs);
+    // Stop at the first difference or at the common terminator.
+    if (left != right) return false;
+    if (left == 0) return true;
+  }
+#endif  // OS selector
+}
+
+// Tells whether str ends with suffix when letter case is ignored.
+// An empty suffix matches any string.
+bool String::EndsWithCaseInsensitive(
+    const std::string& str, const std::string& suffix) {
+  if (suffix.length() > str.length()) return false;  // suffix cannot fit
+
+  // Compare the tail of str, which has the suffix's length, with suffix.
+  const char* const tail = str.c_str() + (str.length() - suffix.length());
+  return CaseInsensitiveCStringEquals(tail, suffix.c_str());
+}
+
+// Renders value in decimal, zero-padded to at least two characters ("%02d").
+std::string String::FormatIntWidth2(int value) {
+  std::stringstream padded;
+  padded << std::setw(2) << std::setfill('0') << value;
+  return padded.str();
+}
+
+// Renders value as upper-case hexadecimal without any prefix ("%X").
+std::string String::FormatHexInt(int value) {
+  std::stringstream hex_out;
+  hex_out << std::uppercase << std::hex << value;
+  return hex_out.str();
+}
+
+// Renders a byte as two upper-case hexadecimal digits ("%02X").
+std::string String::FormatByte(unsigned char value) {
+  std::stringstream byte_out;
+  byte_out << std::hex << std::uppercase << std::setfill('0') << std::setw(2)
+           << static_cast<unsigned int>(value);  // cast: digits, not a char
+  return byte_out.str();
+}
+
+// Copies the text accumulated in *ss into an std::string, replacing
+// every NUL byte with the two characters "\\0" along the way.
+std::string StringStreamToString(::std::stringstream* ss) {
+  const ::std::string contents = ss->str();
+
+  std::string result;
+  // Each input byte produces at most two output characters.
+  result.reserve(2 * contents.length());
+  for (::std::string::size_type i = 0; i < contents.length(); ++i) {
+    const char ch = contents[i];
+    if (ch != '\0') {
+      result += ch;
+    } else {
+      result += "\\0";  // Replaces NUL with "\\0";
+    }
+  }
+
+  return result;
+}
+
+// Appends the user's message, if non-empty, to gtest_msg on a new line.
+std::string AppendUserMessage(const std::string& gtest_msg,
+                              const Message& user_msg) {
+  const std::string user_msg_string = user_msg.GetString();
+  std::string combined = gtest_msg;
+  if (!user_msg_string.empty()) {
+    combined += "\n";
+    combined += user_msg_string;
+  }
+  return combined;
+}
+
+}  // namespace internal
+
+// class TestResult
+
+// Creates an empty TestResult: the death-test counter and the elapsed
+// time both start at zero.
+TestResult::TestResult()
+    : death_test_count_(0), elapsed_time_(0) {
+}
+
+// Destructor.  The body is empty: nothing is released explicitly here.
+TestResult::~TestResult() {
+}
+
+// Returns the i-th test part result.
+// Valid indices are 0 .. total_part_count() - 1; any other index aborts
+// the program.
+const TestPartResult& TestResult::GetTestPartResult(int i) const {
+  const bool in_range = 0 <= i && i < total_part_count();
+  if (!in_range) internal::posix::Abort();
+  return test_part_results_.at(i);
+}
+
+// Returns the i-th test property.
+// Valid indices are 0 .. test_property_count() - 1; any other index
+// aborts the program.
+const TestProperty& TestResult::GetTestProperty(int i) const {
+  const bool in_range = 0 <= i && i < test_property_count();
+  if (!in_range) internal::posix::Abort();
+  return test_properties_.at(i);
+}
+
+// Discards every recorded test part result; test properties are untouched.
+void TestResult::ClearTestPartResults() {
+  test_part_results_.clear();
+}
+
+// Appends a copy of test_part_result to the end of the result list.
+void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
+  test_part_results_.push_back(test_part_result);
+}
+
+// Records test_property for the XML element named xml_element.
+// A property that fails ValidateTestProperty is ignored.  If the key is
+// already present its value is overwritten, otherwise the property is
+// appended to the list.
+void TestResult::RecordProperty(const std::string& xml_element,
+                                const TestProperty& test_property) {
+  if (!ValidateTestProperty(xml_element, test_property)) return;
+
+  internal::MutexLock lock(&test_properites_mutex_);
+  const std::vector<TestProperty>::iterator existing =
+      std::find_if(test_properties_.begin(), test_properties_.end(),
+                   internal::TestPropertyKeyIs(test_property.key()));
+  if (existing != test_properties_.end()) {
+    existing->SetValue(test_property.value());
+  } else {
+    test_properties_.push_back(test_property);
+  }
+}
+
+// Attribute names reserved for the <testsuites> element of the XML
+// output; returned for "testsuites" by GetReservedAttributesForElement.
+static const char* const kReservedTestSuitesAttributes[] = {
+  "disabled",
+  "errors",
+  "failures",
+  "name",
+  "random_seed",
+  "tests",
+  "time",
+  "timestamp"
+};
+
+// Attribute names reserved for the <testsuite> element of the XML
+// output; returned for "testsuite" by GetReservedAttributesForElement.
+static const char* const kReservedTestSuiteAttributes[] = {
+  "disabled",
+  "errors",
+  "failures",
+  "name",
+  "tests",
+  "time"
+};
+
+// Attribute names reserved for the <testcase> element of the XML output.
+static const char* const kReservedTestCaseAttributes[] = {
+  "classname",
+  "name",
+  "status",
+  "time",
+  "type_param",
+  "value_param"
+};
+
+template <int kSize>  // kSize is deduced from the array argument
+std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
+  return std::vector<std::string>(&array[0], &array[0] + kSize);
+}
+
+// Maps an XML element name to the attribute names reserved for it.
+static std::vector<std::string> GetReservedAttributesForElement(
+    const std::string& xml_element) {
+  if (xml_element == "testsuites")
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  if (xml_element == "testsuite")
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  if (xml_element == "testcase")
+    return ArrayAsVector(kReservedTestCaseAttributes);
+  GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  // Not expected to be reached once the check above has failed, but some
+  // compilers may not realize that and want a return statement.
+  return std::vector<std::string>();
+}
+
+// Quotes each word and joins them as an English list:
+// "'a'", "'a' and 'b'", "'a', 'b', and 'c'".
+static std::string FormatWordList(const std::vector<std::string>& words) {
+  Message word_list;
+  for (size_t i = 0; i < words.size(); ++i) {
+    // Separator: a comma for lists of three or more, else a plain space.
+    if (i > 0) word_list << (words.size() > 2 ? ", " : " ");
+    // "and" introduces the final word, but only when others precede it.
+    if (i > 0 && i == words.size() - 1) word_list << "and ";
+    word_list << "'" << words[i] << "'";
+  }
+  return word_list.GetString();
+}
+
+bool ValidateTestPropertyName(const std::string& property_name,
+                              const std::vector<std::string>& reserved_names) {
+  if (std::find(reserved_names.begin(), reserved_names.end(),
+                property_name) == reserved_names.end()) return true;
+  ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
+                << " (" << FormatWordList(reserved_names)
+                << " are reserved by " << GTEST_NAME_ << ")";
+  // Reaching this point means property_name is one of reserved_names and
+  // the failure above has been generated.
+  return false;
+}
+
+// Returns true iff test_property's key is not reserved for xml_element.
+bool TestResult::ValidateTestProperty(const std::string& xml_element,
+                                      const TestProperty& test_property) {
+  const std::vector<std::string> reserved =
+      GetReservedAttributesForElement(xml_element);
+  return ValidateTestPropertyName(test_property.key(), reserved);
+}
+
+// Clears the object: zeroes both counters and empties both lists.
+void TestResult::Clear() {
+  elapsed_time_ = 0;
+  death_test_count_ = 0;
+  test_properties_.clear();
+  test_part_results_.clear();
+}
+
+// Returns true iff at least one recorded test part failed.
+bool TestResult::Failed() const {
+  const int part_count = total_part_count();
+  int index = 0;
+  // Advance past the parts that did not fail.
+  while (index < part_count && !GetTestPartResult(index).failed()) ++index;
+  return index < part_count;  // stopped early <=> a failed part was found
+}
+
+// Predicate passed to CountIf: true iff the test part fatally failed.
+static bool TestPartFatallyFailed(const TestPartResult& result) {
+  return result.fatally_failed();
+}
+
+// Returns true iff at least one recorded test part fatally failed.
+bool TestResult::HasFatalFailure() const {
+  return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
+}
+
+// Predicate passed to CountIf: true iff the test part non-fatally failed.
+static bool TestPartNonfatallyFailed(const TestPartResult& result) {
+  return result.nonfatally_failed();
+}
+
+// Returns true iff at least one recorded test part non-fatally failed.
+bool TestResult::HasNonfatalFailure() const {
+  return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0;
+}
+
+// Gets the number of all recorded test parts, i.e. the successful and the
+// failed ones together.  The container's size is narrowed to int.
+int TestResult::total_part_count() const {
+  return static_cast<int>(test_part_results_.size());
+}
+
+// Returns the number of recorded test properties, narrowed to int.
+int TestResult::test_property_count() const {
+  return static_cast<int>(test_properties_.size());
+}
+
+// class Test
+
+// Creates a Test object.
+
+// The c'tor saves all Google Test flag values in a new GTestFlagSaver.
+Test::Test()
+    : gtest_flag_saver_(new internal::GTestFlagSaver) {
+}
+
+// The d'tor deletes the flag saver, restoring all Google Test flags.
+Test::~Test() {
+  delete gtest_flag_saver_;
+}
+
+// Sets up the test fixture.  This default implementation does nothing.
+//
+// A sub-class may override this.
+void Test::SetUp() {
+}
+
+// Tears down the test fixture.  This default implementation does nothing.
+//
+// A sub-class may override this.
+void Test::TearDown() {
+}
+
+// Records a user supplied key/value pair for later output, via UnitTest.
+void Test::RecordProperty(const std::string& key, const std::string& value) {
+  UnitTest::GetInstance()->RecordProperty(key, value);
+}
+
+// Integer overload: the value is formatted through a Message and the
+// resulting text is recorded with the string overload.
+void Test::RecordProperty(const std::string& key, int value) {
+  const std::string value_text = (Message() << value).GetString();
+  RecordProperty(key, value_text.c_str());
+}
+
+namespace internal {
+
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string& message) {
+  // Nothing is known about where the failure happened.
+  const char* const kNoSourceFile = NULL;
+  const int kNoLineNumber = -1;
+
+  // This function is a friend of UnitTest and as such has access to
+  // AddTestPartResult.
+  UnitTest::GetInstance()->AddTestPartResult(
+      result_type, kNoSourceFile, kNoLineNumber, message, "");  // "": no trace
+}
+
+}  // namespace internal
+
+// Google Test requires every test in a test case to use the same test
+// fixture class.  This function compares the fixture class of the current
+// test with that of the first test in the current test case.  It returns
+// true when they match; otherwise it generates a Google Test failure
+// describing the mismatch and returns false.
+bool Test::HasSameFixtureClass() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+
+  // The first test of the current test case serves as the reference.
+  const TestInfo* const first_test_info =
+      impl->current_test_case()->test_info_list()[0];
+  const TestInfo* const this_test_info = impl->current_test_info();
+
+  const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
+  const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
+  if (this_fixture_id == first_fixture_id) {
+    return true;
+  }
+
+  // From here on the fixture classes differ; gather data for the report.
+  const char* const first_test_name = first_test_info->name();
+  const char* const this_test_name = this_test_info->name();
+
+  // A test defined using TEST has internal::GetTestTypeId() as its id.
+  const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId();
+  const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
+
+  if (first_is_TEST || this_is_TEST) {
+    // TEST and TEST_F were mixed in this test case.  The two flags cannot
+    // both be true because the fixture IDs differ, so exactly one of the
+    // two tests is the TEST and the other one is the TEST_F.
+    const char* const TEST_name =
+        first_is_TEST ? first_test_name : this_test_name;
+    const char* const TEST_F_name =
+        first_is_TEST ? this_test_name : first_test_name;
+
+    // Tell the user how to fix it.
+    ADD_FAILURE()
+        << "All tests in the same test case must use the same test fixture\n"
+        << "class, so mixing TEST_F and TEST in the same test case is\n"
+        << "illegal.  In test case " << this_test_info->test_case_name()
+        << ",\n"
+        << "test " << TEST_F_name << " is defined using TEST_F but\n"
+        << "test " << TEST_name << " is defined using TEST.  You probably\n"
+        << "want to change the TEST to TEST_F or move it to another test\n"
+        << "case.";
+  } else {
+    // Two different fixture classes share one name - tell the user how
+    // to fix it.
+    ADD_FAILURE()
+        << "All tests in the same test case must use the same test fixture\n"
+        << "class.  However, in test case "
+        << this_test_info->test_case_name() << ",\n"
+        << "you defined test " << first_test_name
+        << " and test " << this_test_name << "\n"
+        << "using two different test fixture classes.  This can happen if\n"
+        << "the two classes are from different namespaces or translation\n"
+        << "units and have the same name.  You should probably rename one\n"
+        << "of the classes to put the tests into different test cases.";
+  }
+
+  // Either way a failure has been generated above, so report the
+  // mismatch to the caller.
+  return false;
+}
+
+#if GTEST_HAS_SEH
+
+// Builds the text describing an SEH exception with the given code that
+// was thrown in `location`.  The result is allocated with new and must be
+// deleted by the caller: VC++ prohibits creation of objects with
+// destructors on stack in functions using __try (see error C2712).
+static std::string* FormatSehExceptionMessage(DWORD exception_code,
+                                              const char* location) {
+  Message message;
+  message << "SEH exception with code 0x";
+  message << std::setbase(16) << exception_code << std::setbase(10);
+  message << " thrown in " << location << ".";
+  return new std::string(message.GetString());
+}
+
+#endif  // GTEST_HAS_SEH
+
+namespace internal {
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Builds the text describing a C++ exception thrown in `location`.
+// description is the exception's explanatory text, or NULL when unknown.
+static std::string FormatCxxExceptionMessage(const char* description,
+                                             const char* location) {
+  Message message;
+  if (description == NULL) {
+    message << "Unknown C++ exception";
+  } else {
+    message << "C++ exception with description \"" << description << "\"";
+  }
+  message << " thrown in " << location << ".";
+  return message.GetString();
+}
+
+static std::string PrintTestPartResultToString(
+    const TestPartResult& test_part_result);
+
+GoogleTestFailureException::GoogleTestFailureException(
+    const TestPartResult& failure)  // what() text: the printed failure
+    : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// We put these helper functions in the internal namespace as IBM's xlC
+// compiler rejects the code if they were declared static.
+
+// Invokes (object->*method)() and returns its result.  When SEH is
+// supported, an SEH exception accepted by GTestShouldProcessSEH is reported
+// as a fatal failure mentioning `location`, and the 0-value for type Result
+// is returned.  (Microsoft compilers cannot handle SEH and C++ exceptions
+// in the same function, hence this separate wrapper.)
+template <class T, typename Result>
+Result HandleSehExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
+#if GTEST_HAS_SEH
+  __try {
+    return (object->*method)();
+  } __except (internal::UnitTestOptions::GTestShouldProcessSEH(  // NOLINT
+      GetExceptionCode())) {
+    // We create the exception message on the heap because VC++ prohibits
+    // creation of objects with destructors on stack in functions using __try
+    // (see error C2712).
+    std::string* exception_message = FormatSehExceptionMessage(
+        GetExceptionCode(), location);
+    internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
+                                             *exception_message);
+    delete exception_message;
+    return static_cast<Result>(0);
+  }
+#else
+  (void)location;
+  return (object->*method)();
+#endif  // GTEST_HAS_SEH
+}
+
+// Runs the given method.  If exception catching is enabled, C++ and/or
+// SEH-style exceptions (where supported) are reported as fatal failures
+// and the 0-value for type Result is returned; see the re-throw below.
+template <class T, typename Result>
+Result HandleExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
+  // NOTE: The user code can affect the way in which Google Test handles
+  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
+  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
+  // after the exception is caught and either report or re-throw the
+  // exception based on the flag's value:
+  //
+  // try {
+  //   // Perform the test method.
+  // } catch (...) {
+  //   if (GTEST_FLAG(catch_exceptions))
+  //     // Report the exception as failure.
+  //   else
+  //     throw;  // Re-throws the original exception.
+  // }
+  //
+  // However, the purpose of this flag is to allow the program to drop into
+  // the debugger when the exception is thrown. On most platforms, once the
+  // control enters the catch block, the exception origin information is
+  // lost and the debugger will stop the program at the point of the
+  // re-throw in this function -- instead of at the point of the original
+  // throw statement in the code under test.  For this reason, we perform
+  // the check early, sacrificing the ability to affect Google Test's
+  // exception handling in the method where the exception is thrown.
+  if (!internal::GetUnitTestImpl()->catch_exceptions()) {
+    return (object->*method)();
+  }
+
+#if GTEST_HAS_EXCEPTIONS
+  try {
+    return HandleSehExceptionsInMethodIfSupported(object, method, location);
+  } catch (const internal::GoogleTestFailureException&) {  // NOLINT
+    // This exception type can only be thrown by a failed Google
+    // Test assertion with the intention of letting another testing
+    // framework catch it.  Therefore we just re-throw it.
+    throw;
+  } catch (const std::exception& e) {  // NOLINT
+    internal::ReportFailureInUnknownLocation(
+        TestPartResult::kFatalFailure,
+        FormatCxxExceptionMessage(e.what(), location));
+  } catch (...) {  // NOLINT
+    internal::ReportFailureInUnknownLocation(
+        TestPartResult::kFatalFailure,
+        FormatCxxExceptionMessage(NULL, location));
+  }
+  return static_cast<Result>(0);
+#else
+  return HandleSehExceptionsInMethodIfSupported(object, method, location);
+#endif  // GTEST_HAS_EXCEPTIONS
+}
+
+}  // namespace internal
+
+// Runs SetUp(), the test body and TearDown(), updating the test result.
+void Test::Run() {
+  if (!HasSameFixtureClass()) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
+  // The test body only runs when SetUp() left no fatal failure behind.
+  const bool set_up_succeeded = !HasFatalFailure();
+  if (set_up_succeeded) {
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        this, &Test::TestBody, "the test body");
+  }
+
+  // TearDown() is always called, even after a failure in SetUp() or in
+  // the test body, so that as much as possible gets cleaned up.
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &Test::TearDown, "TearDown()");
+}
+
+// Returns true iff the current test's result holds a fatal failure.
+bool Test::HasFatalFailure() {
+  return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
+}
+
+// Returns true iff the current test's result holds a non-fatal failure.
+bool Test::HasNonfatalFailure() {
+  return internal::GetUnitTestImpl()->current_test_result()->
+      HasNonfatalFailure();
+}
+
+// class TestInfo
+
+// Constructs a TestInfo object.  It assumes ownership of the test factory
+// object; a NULL type or value parameter is stored as a NULL member.
+TestInfo::TestInfo(const std::string& a_test_case_name,
+                   const std::string& a_name,
+                   const char* a_type_param,
+                   const char* a_value_param,
+                   internal::TypeId fixture_class_id,
+                   internal::TestFactoryBase* factory)
+    : test_case_name_(a_test_case_name),
+      name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
+      value_param_(a_value_param ? new std::string(a_value_param) : NULL),
+      fixture_class_id_(fixture_class_id),
+      should_run_(false),  // this and the next two flags start out false
+      is_disabled_(false),
+      matches_filter_(false),
+      factory_(factory),  // deleted by ~TestInfo()
+      result_() {}
+
+// Destructs a TestInfo object, deleting the test factory it owns.
+TestInfo::~TestInfo() { delete factory_; }
+
+namespace internal {
+
+// Allocates a TestInfo describing one test, passes it to the UnitTestImpl
+// returned by GetUnitTestImpl() for registration, and returns it.
+//
+// Arguments:
+//
+//   test_case_name:   name of the test case
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test case
+//   tear_down_tc:     pointer to the function that tears down the test case
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+TestInfo* MakeAndRegisterTestInfo(
+    const char* test_case_name,
+    const char* name,
+    const char* type_param,
+    const char* value_param,
+    TypeId fixture_class_id,
+    SetUpTestCaseFunc set_up_tc,
+    TearDownTestCaseFunc tear_down_tc,
+    TestFactoryBase* factory) {
+  TestInfo* const registered = new TestInfo(
+      test_case_name, name, type_param, value_param, fixture_class_id, factory);
+  UnitTestImpl* const impl = GetUnitTestImpl();
+  impl->AddTestInfo(set_up_tc, tear_down_tc, registered);
+  return registered;
+}
+
+#if GTEST_HAS_PARAM_TEST
+void ReportInvalidTestCaseType(const char* test_case_name,
+                               const char* file, int line) {
+  // Builds the explanation, then prints it to stderr after the location.
+  Message report;
+  report << "Attempted redefinition of test case " << test_case_name << ".\n"
+      << "All tests in the same test case must use the same test fixture\n"
+      << "class.  However, in test case " << test_case_name << ", you tried\n"
+      << "to define a test using a fixture class different from the one\n"
+      << "used earlier. This can happen if the two fixture classes are\n"
+      << "from different namespaces and have the same name. You should\n"
+      << "probably rename one of the classes to put the tests into different\n"
+      << "test cases.";
+
+  const std::string where = FormatFileLocation(file, line);
+  fprintf(stderr, "%s %s", where.c_str(), report.GetString().c_str());
+}
+#endif  // GTEST_HAS_PARAM_TEST
+
+}  // namespace internal
+
+namespace {
+
+// A predicate functor that compares the test name of a TestInfo
+// against a name fixed at construction time.
+//
+// Only the implementation of the TestCase class is meant to use it,
+// so it lives in the anonymous namespace to avoid polluting the outer
+// namespace.
+//
+// TestNameIs is copyable.
+class TestNameIs {
+ public:
+  // Constructor.
+  //
+  // There is NO default constructor: a name must always be supplied.
+  explicit TestNameIs(const char* name) : name_(name) {}
+
+  // Returns true iff test_info is non-NULL and its test name is name_.
+  bool operator()(const TestInfo * test_info) const {
+    if (test_info == NULL) return false;
+    return name_ == test_info->name();
+  }
+
+ private:
+  // The test name every candidate is compared against.
+  std::string name_;
+};
+
+}  // namespace
+
+namespace internal {
+
+// Expands every parameterized test registered through the TEST_P and
+// INSTANTIATE_TEST_CASE_P macros into regular tests and registers them.
+// The expansion happens at most once during the program runtime.
+void UnitTestImpl::RegisterParameterizedTests() {
+#if GTEST_HAS_PARAM_TEST
+  // Later calls are no-ops once the flag below has been set.
+  if (parameterized_tests_registered_) return;
+  parameterized_test_registry_.RegisterTests();
+  parameterized_tests_registered_ = true;
+#endif
+}
+
+}  // namespace internal
+
+// Creates the test object, runs it, records its result, and then
+// deletes it.
+void TestInfo::Run() {
+  if (!should_run_) return;
+
+  // Tells UnitTest where to store test result.
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+
+  const TimeInMillis start = internal::GetTimeInMillis();
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+
+  // Creates the test object.
+  Test* const test = internal::HandleExceptionsInMethodIfSupported(
+      factory_, &internal::TestFactoryBase::CreateTest,
+      "the test fixture's constructor");
+
+  // Runs the test only if it was created without a fatal failure.
+  if ((test != NULL) && !Test::HasFatalFailure()) {
+    // Doesn't throw: all user code that can throw is wrapped in handlers.
+    test->Run();
+  }
+
+  // Deletes the test object, if one was created.  Invoking DeleteSelf_
+  // through a NULL pointer would be undefined behavior.
+  if (test != NULL) {
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        test, &Test::DeleteSelf_, "the test fixture's destructor");
+  }
+
+  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+
+  // Stops associating assertion results with this test.
+  impl->set_current_test_info(NULL);
+}
+
+// class TestCase
+
+// Counts the tests in this test case that satisfy TestPassed.
+int TestCase::successful_test_count() const {
+  return CountIf(test_info_list_, TestPassed);
+}
+
+// Counts the tests in this test case that satisfy TestFailed.
+int TestCase::failed_test_count() const {
+  return CountIf(test_info_list_, TestFailed);
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int TestCase::reportable_disabled_test_count() const {
+  return CountIf(test_info_list_, TestReportableDisabled);  // per-test predicate
+}
+
+// Counts the tests in this test case that satisfy TestDisabled.
+int TestCase::disabled_test_count() const {
+  return CountIf(test_info_list_, TestDisabled);
+}
+
+// Gets the number of tests to be printed in the XML report.
+int TestCase::reportable_test_count() const {
+  return CountIf(test_info_list_, TestReportable);  // per-test predicate
+}
+
+// Gets the number of tests in this test case that should run.
+int TestCase::test_to_run_count() const {
+  return CountIf(test_info_list_, ShouldRunTest);
+}
+
+// Gets the number of all tests, i.e. the size of test_info_list_.
+int TestCase::total_test_count() const {
+  return static_cast<int>(test_info_list_.size());
+}
+
+// Creates a TestCase with the given name.
+//
+// Arguments:
+//
+//   a_name:       name of the test case
+//   a_type_param: the name of the test case's type parameter, or NULL if
+//                 this is not a typed or a type-parameterized test case.
+//   set_up_tc:    pointer to the function that sets up the test case
+//   tear_down_tc: pointer to the function that tears down the test case
+TestCase::TestCase(const char* a_name, const char* a_type_param,
+                   Test::SetUpTestCaseFunc set_up_tc,
+                   Test::TearDownTestCaseFunc tear_down_tc)
+    : name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
+      set_up_tc_(set_up_tc),
+      tear_down_tc_(tear_down_tc),
+      should_run_(false),  // Starts out not selected to run.
+      elapsed_time_(0) {
+}
+
+// Destructor of TestCase.  Deletes every TestInfo in the collection.
+TestCase::~TestCase() {
+  for (size_t pos = 0; pos < test_info_list_.size(); ++pos)
+    delete test_info_list_[pos];
+}
+
+// Returns the i-th test (0 <= i < total_test_count()); NULL when out of range.
+const TestInfo* TestCase::GetTestInfo(int i) const {
+  const int index = GetElementOr(test_indices_, i, -1);
+  if (index < 0) return NULL;
+  return test_info_list_[index];
+}
+
+// Mutable i-th test (0 <= i < total_test_count()); NULL when out of range.
+TestInfo* TestCase::GetMutableTestInfo(int i) {
+  const int index = GetElementOr(test_indices_, i, -1);
+  if (index < 0) return NULL;
+  return test_info_list_[index];
+}
+
+// Appends a test to this test case; ~TestCase() deletes it later.
+void TestCase::AddTestInfo(TestInfo * test_info) {
+  const int next_index = static_cast<int>(test_indices_.size());
+  test_info_list_.push_back(test_info);
+  test_indices_.push_back(next_index);
+}
+
+// Runs every test in this TestCase, plus its set-up and tear-down hooks.
+void TestCase::Run() {
+  if (!should_run_) return;  // No-op unless this test case is marked to run.
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_case(this);  // Reset to NULL before returning.
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  repeater->OnTestCaseStart(*this);
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestCase::RunSetUpTestCase, "SetUpTestCase()");
+
+  const internal::TimeInMillis start = internal::GetTimeInMillis();
+  for (int i = 0; i < total_test_count(); i++) {
+    GetMutableTestInfo(i)->Run();
+  }
+  elapsed_time_ = internal::GetTimeInMillis() - start;  // Test loop only.
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestCase::RunTearDownTestCase, "TearDownTestCase()");
+
+  repeater->OnTestCaseEnd(*this);
+  impl->set_current_test_case(NULL);
+}
+
+// Clears the ad hoc test result and the results of all tests in this case.
+void TestCase::ClearResult() {
+  ad_hoc_test_result_.Clear();
+  ForEach(test_info_list_, TestInfo::ClearTestResult);
+}
+
+// Shuffles the tests in this test case by shuffling test_indices_.
+void TestCase::ShuffleTests(internal::Random* random) {
+  Shuffle(random, &test_indices_);
+}
+
+// Restores the test order by resetting test_indices_ to 0, 1, 2, ...
+void TestCase::UnshuffleTests() {
+  const size_t count = test_indices_.size();
+  for (size_t pos = 0; pos != count; ++pos)
+    test_indices_[pos] = static_cast<int>(pos);
+}
+
+// Formats a countable noun.  The singular form is used when the count
+// is exactly 1 and the plural form otherwise, e.g.
+//
+// FormatCountableNoun(1, "formula", "formulae") returns "1 formula".
+// FormatCountableNoun(5, "book", "books") returns "5 books".
+static std::string FormatCountableNoun(int count,
+                                       const char * singular_form,
+                                       const char * plural_form) {
+  const char* const noun = (count == 1) ? singular_form : plural_form;
+  return internal::StreamableToString(count) + " " + noun;
+}
+
+// Formats the count of tests as "<n> test" or "<n> tests".
+static std::string FormatTestCount(int test_count) {
+  return FormatCountableNoun(test_count, "test", "tests");
+}
+
+// Formats the count of test cases as "<n> test case" or "<n> test cases".
+static std::string FormatTestCaseCount(int test_case_count) {
+  return FormatCountableNoun(test_case_count, "test case", "test cases");
+}
+
+// Maps a TestPartResult::Type enum to a human-friendly string.  The two
+// failure kinds, kNonFatalFailure and kFatalFailure, share one string,
+// as the user usually doesn't care about the difference between them
+// when viewing the test result.  The string differs under _MSC_VER.
+static const char * TestPartResultTypeToString(TestPartResult::Type type) {
+  if (type == TestPartResult::kSuccess) {
+    return "Success";
+  }
+
+  if (type == TestPartResult::kNonFatalFailure ||
+      type == TestPartResult::kFatalFailure) {
+#ifdef _MSC_VER
+    return "error: ";
+#else
+    return "Failure\n";
+#endif
+  }
+
+  return "Unknown result type";
+}
+
+namespace internal {
+
+// Formats a TestPartResult as "<file location> <type string><message>".
+static std::string PrintTestPartResultToString(
+    const TestPartResult& test_part_result) {
+  return (Message()
+          << internal::FormatFileLocation(test_part_result.file_name(),
+                                          test_part_result.line_number())
+          << " " << TestPartResultTypeToString(test_part_result.type())
+          << test_part_result.message()).GetString();
+}
+
+// Prints a TestPartResult to stdout, followed by a newline, and flushes.
+static void PrintTestPartResult(const TestPartResult& test_part_result) {
+  const std::string& result =
+      PrintTestPartResultToString(test_part_result);
+  printf("%s\n", result.c_str());
+  fflush(stdout);
+  // If the test program runs in Visual Studio or a debugger, the
+  // following statements add the test part result message to the Output
+  // window such that the user can double-click on it to jump to the
+  // corresponding source code location; otherwise they do nothing.
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  // We don't call OutputDebugString*() on Windows Mobile, as printing
+  // to stdout is already done by OutputDebugString() there, and we don't
+  // want the same message printed twice.
+  ::OutputDebugStringA(result.c_str());
+  ::OutputDebugStringA("\n");
+#endif
+}
+
+// class PrettyUnitTestResultPrinter
+
+enum GTestColor {
+  COLOR_DEFAULT,  // ColoredPrintf prints text in this color without coloring.
+  COLOR_RED,
+  COLOR_GREEN,
+  COLOR_YELLOW
+};
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Returns the character attribute for the given color.
+WORD GetColorAttribute(GTestColor color) {
+  if (color == COLOR_RED) return FOREGROUND_RED;
+  if (color == COLOR_GREEN) return FOREGROUND_GREEN;
+  // Yellow is produced by combining the red and green attributes.
+  if (color == COLOR_YELLOW) return FOREGROUND_RED | FOREGROUND_GREEN;
+  // Any other color has no attribute bits.
+  return 0;
+}
+
+#else
+
+// Returns the ANSI color code for the given color, as a string holding
+// one digit.  COLOR_DEFAULT is an invalid input.
+const char* GetAnsiColorCode(GTestColor color) {
+  if (color == COLOR_RED) return "1";
+  if (color == COLOR_GREEN) return "2";
+  if (color == COLOR_YELLOW) return "3";
+
+  // No code is defined for anything else, COLOR_DEFAULT included.
+  return NULL;
+}
+
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Returns true iff Google Test should use colors in the output.  The
+// decision is driven by the color flag; only its "auto" setting takes
+// stdout_is_tty (and, off Windows, the TERM variable) into account.
+bool ShouldUseColor(bool stdout_is_tty) {
+  const char* const flag_value = GTEST_FLAG(color).c_str();
+
+  if (!String::CaseInsensitiveCStringEquals(flag_value, "auto")) {
+    // We take "yes", "true", "t", and "1" as meaning "yes".  Any other
+    // value that is not "auto" is treated as "no" to be conservative.
+    return String::CaseInsensitiveCStringEquals(flag_value, "yes") ||
+        String::CaseInsensitiveCStringEquals(flag_value, "true") ||
+        String::CaseInsensitiveCStringEquals(flag_value, "t") ||
+        String::CStringEquals(flag_value, "1");
+  }
+
+#if GTEST_OS_WINDOWS
+  // On Windows the TERM variable is usually not set, but the console
+  // there does support colors.
+  return stdout_is_tty;
+#else
+  // On non-Windows platforms, we rely on the TERM variable.
+  const char* const term = posix::GetEnv("TERM");
+  const bool known_color_term =
+      String::CStringEquals(term, "xterm") ||
+      String::CStringEquals(term, "xterm-color") ||
+      String::CStringEquals(term, "xterm-256color") ||
+      String::CStringEquals(term, "screen") ||
+      String::CStringEquals(term, "screen-256color") ||
+      String::CStringEquals(term, "linux") ||
+      String::CStringEquals(term, "cygwin");
+  return stdout_is_tty && known_color_term;
+#endif  // GTEST_OS_WINDOWS
+}
+
+// Prints a printf-style formatted string to stdout in the given color.
+// On Windows we cannot simply emit special characters and have the terminal
+// change colors, so this routine must actually emit the characters rather
+// than return a string that would be colored when printed, as on Linux.
+void ColoredPrintf(GTestColor color, const char* fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || GTEST_OS_IOS
+  const bool use_color = false;
+#else
+  static const bool in_color_mode =
+      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
+  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
+#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || GTEST_OS_IOS
+  // The '!= 0' comparison above is necessary to satisfy MSVC 7.1.
+
+  if (!use_color) {
+    vprintf(fmt, args);
+    va_end(args);
+    return;
+  }
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+  // Gets the current text color.
+  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+  const WORD old_color_attrs = buffer_info.wAttributes;
+
+  // We need to flush the stream buffers into the console before each
+  // SetConsoleTextAttribute call lest it affect the text that is already
+  // printed but has not yet reached the console.
+  fflush(stdout);
+  SetConsoleTextAttribute(stdout_handle,
+                          GetColorAttribute(color) | FOREGROUND_INTENSITY);
+  vprintf(fmt, args);
+
+  fflush(stdout);
+  // Restores the text color.
+  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+  printf("\033[0;3%sm", GetAnsiColorCode(color));
+  vprintf(fmt, args);
+  printf("\033[m");  // Resets the terminal to default.
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  va_end(args);
+}
+
+// Text printed in Google Test's text output and --gtest_list_tests
+// output to label the type parameter and value parameter for a test.
+static const char kTypeParamLabel[] = "TypeParam";
+static const char kValueParamLabel[] = "GetParam()";
+
+// Prints ", where TypeParam = ... and GetParam() = ..." for a test,
+// covering whichever of the two parameters is non-NULL.
+void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
+  const char* const type_param = test_info.type_param();
+  const char* const value_param = test_info.value_param();
+  if (type_param == NULL && value_param == NULL) return;
+
+  // At least one parameter is present from here on.
+  printf(", where ");
+  if (type_param != NULL) {
+    printf("%s = %s", kTypeParamLabel, type_param);
+    if (value_param != NULL) printf(" and ");
+  }
+  if (value_param != NULL)
+    printf("%s = %s", kValueParamLabel, value_param);
+}
+
+// This class implements the TestEventListener interface by printing
+// human-readable progress and results to stdout.
+// Class PrettyUnitTestResultPrinter is copyable.
+class PrettyUnitTestResultPrinter : public TestEventListener {
+ public:
+  PrettyUnitTestResultPrinter() {}
+  static void PrintTestName(const char * test_case, const char * test) {
+    printf("%s.%s", test_case, test);  // "<test case>.<test>", no newline.
+  }
+
+  // The following methods override what's in the TestEventListener class.
+  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+
+ private:
+  static void PrintFailedTests(const UnitTest& unit_test);
+};
+
+// Fired before each iteration of tests starts.
+void PrettyUnitTestResultPrinter::OnTestIterationStart(
+    const UnitTest& unit_test, int iteration) {
+  if (GTEST_FLAG(repeat) != 1)
+    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
+
+  const char* const filter = GTEST_FLAG(filter).c_str();
+
+  // Prints the filter if it's not the universal filter.  This reminds the
+  // user that some tests may be skipped.
+  if (!String::CStringEquals(filter, kUniversalFilter)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: %s filter = %s\n", GTEST_NAME_, filter);
+  }
+
+  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
+    const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: This is test shard %d of %s.\n",
+                  static_cast<int>(shard_index) + 1,
+                  internal::posix::GetEnv(kTestTotalShards));
+  }
+
+  if (GTEST_FLAG(shuffle)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: Randomizing tests' orders with a seed of %d .\n",
+                  unit_test.random_seed());
+  }
+
+  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  printf("Running %s from %s.\n",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN,  "[----------] ");  // Green separator prefix.
+  printf("Global test environment set-up.\n");
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
+  // Prints "<n> test(s) from <test case>", plus the type parameter if any.
+  const std::string tally = FormatTestCount(test_case.test_to_run_count());
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s", tally.c_str(), test_case.name());
+  if (test_case.type_param() != NULL) {
+    printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param());
+  } else {
+    printf("\n");
+  }
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
+  ColoredPrintf(COLOR_GREEN,  "[ RUN      ] ");  // Marks the start of a test.
+  PrintTestName(test_info.test_case_name(), test_info.name());
+  printf("\n");
+  fflush(stdout);
+}
+
+// Called after an assertion failure.
+void PrettyUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult& result) {
+  // A test part that succeeded needs no output at all.
+  if (result.type() != TestPartResult::kSuccess) {
+    // Prints the failure message from the assertion (e.g. expected
+    // this and got that).
+    PrintTestPartResult(result);
+    fflush(stdout);
+  }
+}
+
+void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+  const TestResult& outcome = *test_info.result();
+  if (outcome.Passed()) {
+    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
+  } else {
+    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+  }
+  PrintTestName(test_info.test_case_name(), test_info.name());
+  if (outcome.Failed()) PrintFullTestCommentIfPresent(test_info);
+
+  if (!GTEST_FLAG(print_time)) {
+    printf("\n");
+  } else {
+    printf(" (%s ms)\n",
+           internal::StreamableToString(outcome.elapsed_time()).c_str());
+  }
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
+  // The per-test-case summary is printed only when print_time is set.
+  if (GTEST_FLAG(print_time)) {
+    const std::string tally =
+        FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+    ColoredPrintf(COLOR_GREEN, "[----------] ");
+    printf("%s from %s (%s ms total)\n\n", tally.c_str(), test_case.name(),
+           internal::StreamableToString(test_case.elapsed_time()).c_str());
+    fflush(stdout);
+  }
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN,  "[----------] ");  // Green separator prefix.
+  printf("Global test environment tear-down\n");
+  fflush(stdout);
+}
+
+// Internal helper for printing the list of failed tests: one
+// "[  FAILED  ]" line, with the test's parameters if it has any, for
+// every test that ran and did not pass.
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
+  // Nothing to list when no test failed.
+  if (unit_test.failed_test_count() == 0) return;
+
+  for (int c = 0; c < unit_test.total_test_case_count(); ++c) {
+    const TestCase& failing_case = *unit_test.GetTestCase(c);
+    // Only test cases that ran and contain a failure are of interest.
+    if (failing_case.should_run() && failing_case.failed_test_count() != 0) {
+      for (int t = 0; t < failing_case.total_test_count(); ++t) {
+        const TestInfo& info = *failing_case.GetTestInfo(t);
+        // Within them, only tests that ran and did not pass are printed.
+        if (info.should_run() && !info.result()->Passed()) {
+          ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+          printf("%s.%s", failing_case.name(), info.name());
+          PrintFullTestCommentIfPresent(info);
+          printf("\n");
+        }
+      }
+    }
+  }
+}
+
+void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                     int /*iteration*/) {
+  // Summary of what ran, with the total time when print_time is set.
+  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  printf("%s from %s ran.",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms total)",
+           internal::StreamableToString(unit_test.elapsed_time()).c_str());
+  }
+  printf("\n");
+  ColoredPrintf(COLOR_GREEN,  "[  PASSED  ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  const int failure_total = unit_test.failed_test_count();
+  if (!unit_test.Passed()) {
+    // Failure banner, then each failed test, then the closing tally.
+    ColoredPrintf(COLOR_RED,  "[  FAILED  ] ");
+    printf("%s, listed below:\n", FormatTestCount(failure_total).c_str());
+    PrintFailedTests(unit_test);
+    printf("\n%2d FAILED %s\n", failure_total,
+           failure_total == 1 ? "TEST" : "TESTS");
+  }
+
+  // Mentions disabled tests unless also_run_disabled_tests is set.
+  const int disabled_total = unit_test.reportable_disabled_test_count();
+  if (disabled_total != 0 && !GTEST_FLAG(also_run_disabled_tests)) {
+    if (failure_total == 0) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    ColoredPrintf(COLOR_YELLOW, "  YOU HAVE %d DISABLED %s\n\n",
+                  disabled_total, disabled_total == 1 ? "TEST" : "TESTS");
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker output.
+  fflush(stdout);
+}
+
+// End PrettyUnitTestResultPrinter
+
+// class TestEventRepeater
+//
+// This class forwards each event it receives to the listeners in listeners_.
+class TestEventRepeater : public TestEventListener {
+ public:
+  TestEventRepeater() : forwarding_enabled_(true) {}
+  virtual ~TestEventRepeater();
+  void Append(TestEventListener *listener);
+  TestEventListener* Release(TestEventListener* listener);
+
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled() const { return forwarding_enabled_; }
+  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
+
+  virtual void OnTestProgramStart(const UnitTest& unit_test);
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test);
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test);
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& unit_test);
+
+ private:
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.  The constructor sets it to true.
+  bool forwarding_enabled_;
+  // The list of listeners that receive events, in the order appended.
+  std::vector<TestEventListener*> listeners_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+};
+
+TestEventRepeater::~TestEventRepeater() {
+  ForEach(listeners_, Delete<TestEventListener>);  // Deletes each listener.
+}
+
+void TestEventRepeater::Append(TestEventListener *listener) {
+  listeners_.push_back(listener);  // Added at the end of the list.
+}
+
+// TODO(vladl@google.com): Factor the search functionality into Vector::Find.
+TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+  for (std::vector<TestEventListener*>::iterator it = listeners_.begin();
+       it != listeners_.end(); ++it) {
+    if (*it == listener) {
+      listeners_.erase(it);  // Removed from the list, but not deleted.
+      return listener;
+    }
+  }
+  return NULL;  // The listener was not in the list.
+}
+
+// Since most methods are very similar, use macros to reduce boilerplate.
+// This defines a member that forwards the call to all listeners in order.
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (size_t i = 0; i < listeners_.size(); i++) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
+// This defines a member that forwards the call to all listeners in reverse
+// order, i.e. from the last listener in listeners_ to the first.
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
+
+GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase)
+GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
+GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
+
+#undef GTEST_REPEATER_METHOD_
+#undef GTEST_REVERSE_REPEATER_METHOD_
+
+void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
+                                             int iteration) {
+  if (!forwarding_enabled_) return;
+  // Listeners are notified from the first appended to the last.
+  for (size_t pos = 0; pos < listeners_.size(); ++pos) {
+    listeners_[pos]->OnTestIterationStart(unit_test, iteration);
+  }
+}
+
+void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
+                                           int iteration) {
+  if (!forwarding_enabled_) return;
+  // Listeners are notified from the last appended back to the first.
+  for (int pos = static_cast<int>(listeners_.size()) - 1; pos >= 0; --pos) {
+    listeners_[pos]->OnTestIterationEnd(unit_test, iteration);
+  }
+}
+
+// End TestEventRepeater
+
+// This class generates an XML output file.
+class XmlUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit XmlUnitTestResultPrinter(const char* output_file);
+
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+
+ private:
+  // Is c whitespace that XML normalizes to a space in an attribute value?
+  static bool IsNormalizableWhitespace(char c) {
+    return c == 0x9 || c == 0xA || c == 0xD;
+  }
+
+  // May c appear in a well-formed XML document?  c is compared as an
+  // unsigned char so bytes >= 0x80 are not rejected where char is signed.
+  static bool IsValidXmlCharacter(char c) {
+    return IsNormalizableWhitespace(c) || static_cast<unsigned char>(c) >= 0x20;
+  }
+
+  // Returns an XML-escaped copy of the input string str.  If
+  // is_attribute is true, the text is meant to appear as an attribute
+  // value, and normalizable whitespace is preserved by replacing it
+  // with character references.
+  static std::string EscapeXml(const std::string& str, bool is_attribute);
+
+  // Returns the given string with all characters invalid in XML removed.
+  static std::string RemoveInvalidXmlCharacters(const std::string& str);
+
+  // Convenience wrapper around EscapeXml when str is an attribute value.
+  static std::string EscapeXmlAttribute(const std::string& str) {
+    return EscapeXml(str, true);
+  }
+
+  // Convenience wrapper around EscapeXml when str is not an attribute value.
+  static std::string EscapeXmlText(const char* str) {
+    return EscapeXml(str, false);
+  }
+
+  // Verifies that the given attribute belongs to the given element and
+  // streams the attribute as XML.
+  static void OutputXmlAttribute(std::ostream* stream,
+                                 const std::string& element_name,
+                                 const std::string& name,
+                                 const std::string& value);
+
+  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
+
+  // Streams an XML representation of a TestInfo object.
+  static void OutputXmlTestInfo(::std::ostream* stream,
+                                const char* test_case_name,
+                                const TestInfo& test_info);
+
+  // Prints an XML representation of a TestCase object
+  static void PrintXmlTestCase(::std::ostream* stream,
+                               const TestCase& test_case);
+
+  // Prints an XML summary of unit_test to output stream out.
+  static void PrintXmlUnitTest(::std::ostream* stream,
+                               const UnitTest& unit_test);
+
+  // Produces a string representing the test properties in a result as space
+  // delimited XML attributes based on the property key="value" pairs.
+  // When the std::string is not empty, it includes a space at the beginning,
+  // to delimit this attribute from prior attributes.
+  static std::string TestPropertiesAsXmlAttributes(const TestResult& result);
+
+  // The output file.
+  const std::string output_file_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+};
+
+// Creates a new XmlUnitTestResultPrinter that writes its report to
+// output_file.
+//
+// A NULL or empty output_file is a fatal configuration error: a message is
+// printed to stderr and the process exits with EXIT_FAILURE.
+XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
+    // Constructing a std::string from a NULL pointer is undefined behavior,
+    // so NULL is mapped to "" here and rejected by the emptiness check below.
+    : output_file_(output_file != NULL ? output_file : "") {
+  if (output_file_.empty()) {
+    fprintf(stderr, "XML output file may not be null\n");
+    fflush(stderr);
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Called after the unit test ends.  Creates the directory that will hold
+// output_file_ if needed, opens the file for writing and stores the XML
+// report for unit_test in it.  Exits with EXIT_FAILURE when the file
+// cannot be opened.
+void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                  int /*iteration*/) {
+  FilePath report_path(output_file_);
+  FilePath report_dir(report_path.RemoveFileName());
+
+  // The file is only opened once its directory is known to exist.
+  FILE* const xmlout = report_dir.CreateDirectoriesRecursively()
+                           ? posix::FOpen(output_file_.c_str(), "w")
+                           : NULL;
+  if (xmlout == NULL) {
+    // TODO(wan): report the reason of the failure.
+    //
+    // We don't do it for now as:
+    //
+    //   1. There is no urgent need for it.
+    //   2. It's a bit involved to make the errno variable thread-safe on
+    //      all three operating systems (Linux, Windows, and Mac OS).
+    //   3. To interpret the meaning of errno in a thread-safe way,
+    //      we need the strerror_r() function, which is not available on
+    //      Windows.
+    fprintf(stderr, "Unable to open file \"%s\"\n", output_file_.c_str());
+    fflush(stderr);
+    exit(EXIT_FAILURE);
+  }
+
+  // Render the whole report in memory first, then write it in one call.
+  std::stringstream report;
+  PrintXmlUnitTest(&report, unit_test);
+  fprintf(xmlout, "%s", StringStreamToString(&report).c_str());
+  fclose(xmlout);
+}
+
+// Returns an XML-escaped copy of the input string str.
+//
+// '<', '>' and '&' are always replaced by their entities.  Single and
+// double quotes are replaced by entities only when is_attribute is true.
+// When is_attribute is true, normalizable whitespace is additionally
+// written as a character reference so that it is preserved.
+//
+// Invalid XML characters in str, if any, are stripped from the output.
+// It is expected that most, if not all, of the text processed by this
+// module will consist of ordinary English text.
+// If this module is ever modified to produce version 1.1 XML output,
+// most invalid characters can be retained using character references.
+// TODO(wan): It might be nice to have a minimally invasive, human-readable
+// escaping scheme for invalid characters, rather than dropping them.
+std::string XmlUnitTestResultPrinter::EscapeXml(
+    const std::string& str, bool is_attribute) {
+  Message escaped;
+
+  for (size_t pos = 0; pos < str.size(); ++pos) {
+    const char ch = str[pos];
+
+    if (ch == '<') {
+      escaped << "&lt;";
+    } else if (ch == '>') {
+      escaped << "&gt;";
+    } else if (ch == '&') {
+      escaped << "&amp;";
+    } else if (ch == '\'') {
+      // Quotes only need escaping inside attribute values.
+      if (is_attribute) {
+        escaped << "&apos;";
+      } else {
+        escaped << '\'';
+      }
+    } else if (ch == '"') {
+      if (is_attribute) {
+        escaped << "&quot;";
+      } else {
+        escaped << '"';
+      }
+    } else if (IsValidXmlCharacter(ch)) {
+      if (is_attribute && IsNormalizableWhitespace(ch)) {
+        escaped << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
+                << ";";
+      } else {
+        escaped << ch;
+      }
+    }
+    // Any other (invalid) character is dropped.
+  }
+
+  return escaped.GetString();
+}
+
+// Returns a copy of str that keeps only the characters accepted by
+// IsValidXmlCharacter(), in their original order.
+// Currently invalid characters are dropped from the string. An
+// alternative is to replace them with certain characters such as . or ?.
+std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
+    const std::string& str) {
+  std::string filtered;
+  filtered.reserve(str.size());
+  for (size_t pos = 0; pos < str.size(); ++pos) {
+    const char ch = str[pos];
+    if (IsValidXmlCharacter(ch)) {
+      filtered.push_back(ch);
+    }
+  }
+
+  return filtered;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
+//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
+//     <testcase name="test-name">     <-- corresponds to a TestInfo object
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//                                     <-- individual assertion failures
+//     </testcase>
+//   </testsuite>
+// </testsuites>
+
+// Formats the given time in milliseconds as seconds.  The value is divided
+// by 1000.0 and written with the stream's default floating-point format.
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+  ::std::stringstream formatted;
+  formatted << (ms / 1000.0);
+  return formatted.str();
+}
+
+// Converts the given epoch time in milliseconds to a date string in the ISO
+// 8601 format, without the timezone information.  The time is broken down
+// with localtime(); an empty string is returned when that call fails.
+std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
+  // Using non-reentrant version as localtime_r is not portable.
+  time_t seconds = static_cast<time_t>(ms / 1000);
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4996)  // Temporarily disables warning 4996
+                                // (function or variable may be unsafe).
+#endif
+  const struct tm* const time_struct = localtime(&seconds);  // NOLINT
+#ifdef _MSC_VER
+# pragma warning(pop)           // Restores the warning state again.
+#endif
+  if (time_struct == NULL)
+    return "";  // Invalid ms value
+
+  // YYYY-MM-DD
+  const std::string date =
+      StreamableToString(time_struct->tm_year + 1900) + "-" +
+      String::FormatIntWidth2(time_struct->tm_mon + 1) + "-" +
+      String::FormatIntWidth2(time_struct->tm_mday);
+  // hh:mm:ss
+  const std::string time_of_day =
+      String::FormatIntWidth2(time_struct->tm_hour) + ":" +
+      String::FormatIntWidth2(time_struct->tm_min) + ":" +
+      String::FormatIntWidth2(time_struct->tm_sec);
+
+  return date + "T" + time_of_day;
+}
+
+// Streams data as an XML CDATA section.  Every "]]>" found in data would
+// end the section early, so each occurrence is replaced by: a section end,
+// the escaped text "]]&gt;", and the start of a new section.
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
+                                                     const char* data) {
+  const char* cursor = data;
+  *stream << "<![CDATA[";
+
+  const char* terminator = strstr(cursor, "]]>");
+  while (terminator != NULL) {
+    // Copy the text in front of the terminator, then emit its replacement.
+    stream->write(cursor, static_cast<std::streamsize>(terminator - cursor));
+    *stream << "]]>]]&gt;<![CDATA[";
+    cursor = terminator + strlen("]]>");
+    terminator = strstr(cursor, "]]>");
+  }
+
+  // Whatever is left contains no terminator and can be streamed as is.
+  *stream << cursor;
+  *stream << "]]>";
+}
+
+// Streams one XML attribute in the form ` name="value"`: a leading space,
+// the name, and the value escaped with EscapeXmlAttribute().
+//
+// name must be one of the names GetReservedAttributesForElement() returns
+// for element_name; the GTEST_CHECK_ below fails otherwise.
+void XmlUnitTestResultPrinter::OutputXmlAttribute(
+    std::ostream* stream,
+    const std::string& element_name,
+    const std::string& name,
+    const std::string& value) {
+  const std::vector<std::string>& allowed_names =
+      GetReservedAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+                   allowed_names.end())
+      << "Attribute " << name << " is not allowed for element <" << element_name
+      << ">.";
+
+  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
+}
+
+// Prints an XML representation of a TestInfo object.
+//
+// Emits a <testcase> element whose attributes are, in order: name, the
+// optional value_param and type_param, status, time and classname, followed
+// by the recorded test properties.  Every failed test part is written as a
+// nested <failure> element; a test without failed parts is written as a
+// self-closing element.
+// TODO(wan): There is also value in printing properties with the plain printer.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
+                                                 const char* test_case_name,
+                                                 const TestInfo& test_info) {
+  const TestResult& result = *test_info.result();
+  const std::string kTestcase = "testcase";
+
+  *stream << "    <testcase";
+  OutputXmlAttribute(stream, kTestcase, "name", test_info.name());
+
+  // The parameter attributes are only written when the test has them.
+  if (test_info.value_param() != NULL) {
+    OutputXmlAttribute(stream, kTestcase, "value_param",
+                       test_info.value_param());
+  }
+  if (test_info.type_param() != NULL) {
+    OutputXmlAttribute(stream, kTestcase, "type_param", test_info.type_param());
+  }
+
+  OutputXmlAttribute(stream, kTestcase, "status",
+                     test_info.should_run() ? "run" : "notrun");
+  OutputXmlAttribute(stream, kTestcase, "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(stream, kTestcase, "classname", test_case_name);
+  *stream << TestPropertiesAsXmlAttributes(result);
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult& part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      // The start tag is still open; the first failure closes it.
+      if (++failures == 1) {
+        *stream << ">\n";
+      }
+      const string location = internal::FormatCompilerIndependentFileLocation(
+          part.file_name(), part.line_number());
+      // The message attribute carries the summary, the CDATA body carries
+      // the full message; both are prefixed with the failure location.
+      const string summary = location + "\n" + part.summary();
+      *stream << "      <failure message=\""
+              << EscapeXmlAttribute(summary.c_str())
+              << "\" type=\"\">";
+      const string detail = location + "\n" + part.message();
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</failure>\n";
+    }
+  }
+
+  // Without failures the start tag was never closed: make it self-closing.
+  if (failures == 0)
+    *stream << " />\n";
+  else
+    *stream << "    </testcase>\n";
+}
+
+// Prints an XML representation of a TestCase object: a <testsuite> element
+// with its summary attributes and ad hoc test properties, containing one
+// <testcase> child for every reportable test.
+void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
+                                                const TestCase& test_case) {
+  const std::string kTestsuite = "testsuite";
+
+  // Start tag and attributes.
+  *stream << "  <" << kTestsuite;
+  OutputXmlAttribute(stream, kTestsuite, "name", test_case.name());
+  OutputXmlAttribute(stream, kTestsuite, "tests",
+                     StreamableToString(test_case.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuite, "failures",
+                     StreamableToString(test_case.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuite, "disabled",
+      StreamableToString(test_case.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+  OutputXmlAttribute(stream, kTestsuite, "time",
+                     FormatTimeInMillisAsSeconds(test_case.elapsed_time()));
+  *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result());
+  *stream << ">\n";
+
+  // Children: only tests flagged as reportable are written.
+  const int test_count = test_case.total_test_count();
+  for (int index = 0; index < test_count; ++index) {
+    const TestInfo& info = *test_case.GetTestInfo(index);
+    if (info.is_reportable()) {
+      OutputXmlTestInfo(stream, test_case.name(), info);
+    }
+  }
+
+  *stream << "  </" << kTestsuite << ">\n";
+}
+
+// Prints an XML summary of unit_test to the given output stream.
+//
+// Writes the XML declaration followed by a <testsuites> element carrying
+// the tests, failures, disabled, errors, timestamp and time attributes, an
+// optional random_seed, the ad hoc test properties and finally the name.
+// One <testsuite> child is written for every test case that has at least
+// one reportable test.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
+                                                const UnitTest& unit_test) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(unit_test.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "failures",
+                     StreamableToString(unit_test.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "disabled",
+      StreamableToString(unit_test.reportable_disabled_test_count()));
+  // The errors attribute is always written as the constant "0".
+  OutputXmlAttribute(stream, kTestsuites, "errors", "0");
+  OutputXmlAttribute(
+      stream, kTestsuites, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
+  OutputXmlAttribute(stream, kTestsuites, "time",
+                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
+
+  // The seed is only reported when the shuffle flag is set.
+  if (GTEST_FLAG(shuffle)) {
+    OutputXmlAttribute(stream, kTestsuites, "random_seed",
+                       StreamableToString(unit_test.random_seed()));
+  }
+
+  *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
+
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  // Test cases without any reportable test are left out of the report.
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    if (unit_test.GetTestCase(i)->reportable_test_count() > 0)
+      PrintXmlTestCase(stream, *unit_test.GetTestCase(i));
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+// Produces a string representing the test properties in a result as space
+// delimited XML attributes based on the property key="value" pairs.
+// Each attribute is preceded by a space.  Values are escaped with
+// EscapeXmlAttribute(); keys are written unchanged.
+std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
+    const TestResult& result) {
+  Message attributes;
+  const int property_count = result.test_property_count();
+  for (int index = 0; index < property_count; ++index) {
+    const TestProperty& property = result.GetTestProperty(index);
+    attributes << " " << property.key() << "=\""
+               << EscapeXmlAttribute(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+// End XmlUnitTestResultPrinter
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Returns a copy of str in which every '=', '&', '%' and '\n' character is
+// replaced by "%xx", where xx is the character's hexadecimal value. For
+// example, replaces "=" with "%3D".  This algorithm is O(strlen(str))
+// in both time and space -- important as the input str may contain an
+// arbitrarily long test failure message and stack trace.
+string StreamingListener::UrlEncode(const char* str) {
+  string encoded;
+  encoded.reserve(strlen(str) + 1);
+  for (const char* cursor = str; *cursor != '\0'; ++cursor) {
+    const char ch = *cursor;
+    const bool needs_escape =
+        ch == '%' || ch == '=' || ch == '&' || ch == '\n';
+    if (needs_escape) {
+      encoded.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
+    } else {
+      encoded.push_back(ch);
+    }
+  }
+  return encoded;
+}
+
+// Resolves host_name_ / port_num_ and connects sockfd_ to the first resolved
+// address that accepts a connection.  May only be called while there is no
+// connection (sockfd_ == -1).  Resolution and connection failures are logged
+// as warnings and leave sockfd_ at -1.
+void StreamingListener::SocketWriter::MakeConnection() {
+  GTEST_CHECK_(sockfd_ == -1)
+      << "MakeConnection() can't be called when there is already a connection.";
+
+  addrinfo hints;
+  memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
+  hints.ai_socktype = SOCK_STREAM;
+  addrinfo* servinfo = NULL;
+
+  // Use the getaddrinfo() to get a linked list of IP addresses for
+  // the given host name.
+  const int error_num = getaddrinfo(
+      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  if (error_num != 0) {
+    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
+                        << gai_strerror(error_num);
+    // No result list is produced on failure; make sure nothing below walks
+    // or frees whatever the call may have left in servinfo.
+    servinfo = NULL;
+  }
+
+  // Loop through all the results and connect to the first we can.
+  for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL;
+       cur_addr = cur_addr->ai_next) {
+    sockfd_ = socket(
+        cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
+    if (sockfd_ != -1) {
+      // Connect the client socket to the server socket.
+      if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
+        // Do not leak the descriptor; the next address gets a fresh socket.
+        close(sockfd_);
+        sockfd_ = -1;
+      }
+    }
+  }
+
+  // freeaddrinfo() is only specified for lists returned by getaddrinfo();
+  // a NULL argument crashes on some C libraries, so it is skipped when the
+  // lookup failed.
+  if (servinfo != NULL) {
+    freeaddrinfo(servinfo);  // all done with this structure
+  }
+
+  if (sockfd_ == -1) {
+    GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
+                        << host_name_ << ":" << port_num_;
+  }
+}
+
+// End of class StreamingListener
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// Class ScopedTrace
+
+// Records the given source file location and message in a TraceInfo and
+// pushes it onto the trace stack of the UnitTest singleton.
+ScopedTrace::ScopedTrace(const char* file, int line, const Message& message)
+    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+  TraceInfo entry;
+  entry.message = message.GetString();
+  entry.line = line;
+  entry.file = file;
+
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  unit_test->PushGTestTrace(entry);
+}
+
+// Pops the info pushed by the c'tor.
+ScopedTrace::~ScopedTrace()
+    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  unit_test->PopGTestTrace();
+}
+
+
+// class OsStackTraceGetter
+
+// Returns the current OS stack trace as an std::string.  Parameters:
+//
+//   max_depth  - the maximum number of stack frames to be included
+//                in the trace.
+//   skip_count - the number of top frames to be skipped; doesn't count
+//                against max_depth.
+//
+// This implementation is a stub: both parameters are ignored and an empty
+// string is always returned.
+string OsStackTraceGetter::CurrentStackTrace(int /* max_depth */,
+                                             int /* skip_count */)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  return "";
+}
+
+// Intentionally empty in this implementation.
+void OsStackTraceGetter::UponLeavingGTest()
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+}
+
+// Marker text built from GTEST_NAME_ at compile time.
+// NOTE(review): presumably inserted into a trace where Google Test's own
+// frames were removed -- confirm against the users of this constant.
+const char* const
+OsStackTraceGetter::kElidedFramesMarker =
+    "... " GTEST_NAME_ " internal frames ...";
+
+// A helper class that creates the premature-exit file in its
+// constructor and deletes the file in its destructor.
+//
+// The path pointer is stored, not copied, so the string it points to must
+// stay valid for the lifetime of this object.  A NULL or empty path turns
+// both the creation and the removal into no-ops.
+class ScopedPrematureExitFile {
+ public:
+  explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
+      : premature_exit_filepath_(premature_exit_filepath) {
+    // If a path to the premature-exit file is specified...
+    if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') {
+      // create the file with a single "0" character in it.  I/O
+      // errors are ignored as there's nothing better we can do and we
+      // don't want to fail the test because of this.
+      FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
+      // FOpen() can yield NULL when the file cannot be created; handing
+      // NULL to fwrite()/fclose() would crash instead of ignoring the error.
+      if (pfile != NULL) {
+        fwrite("0", 1, 1, pfile);
+        fclose(pfile);
+      }
+    }
+  }
+
+  // Removes the file again; the result of remove() is deliberately ignored.
+  ~ScopedPrematureExitFile() {
+    if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') {
+      remove(premature_exit_filepath_);
+    }
+  }
+
+ private:
+  // Path given to the constructor; may be NULL or empty.
+  const char* const premature_exit_filepath_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
+};
+
+}  // namespace internal
+
+// class TestEventListeners
+
+// Creates the listener list with a newly allocated repeater and no default
+// result printer or XML generator.
+TestEventListeners::TestEventListeners()
+    : repeater_(new internal::TestEventRepeater()),
+      default_result_printer_(NULL),
+      default_xml_generator_(NULL) {
+}
+
+// Deletes the repeater allocated by the constructor.
+TestEventListeners::~TestEventListeners() { delete repeater_; }
+
+// Adds the given event listener to the list by forwarding it to the
+// repeater's Append().
+// NOTE(review): the Release() contract ("it then becomes the caller's
+// responsibility to delete the listener") suggests the list owns appended
+// listeners until they are released -- confirm against the repeater.
+void TestEventListeners::Append(TestEventListener* listener) {
+  repeater_->Append(listener);
+}
+
+// Removes the given event listener from the list and returns it.  It then
+// becomes the caller's responsibility to delete the listener. Returns
+// NULL if the listener is not found in the list.
+// If the listener is the current default result printer or default XML
+// generator, that default is reset to NULL as well.
+TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
+  if (default_result_printer_ == listener) {
+    default_result_printer_ = NULL;
+  } else if (default_xml_generator_ == listener) {
+    default_xml_generator_ = NULL;
+  }
+
+  TestEventListener* const released = repeater_->Release(listener);
+  return released;
+}
+
+// Returns repeater that broadcasts the TestEventListener events to all
+// subscribers.  The returned object remains owned by this class, whose
+// destructor deletes it.
+TestEventListener* TestEventListeners::repeater() { return repeater_; }
+
+// Sets the default_result_printer attribute to the provided listener.
+// The listener is also added to the listener list and previous
+// default_result_printer is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
+  if (default_result_printer_ == listener)
+    return;
+
+  // It is an error to pass this method a listener that is already in the
+  // list.
+  TestEventListener* const previous = Release(default_result_printer_);
+  delete previous;
+
+  default_result_printer_ = listener;
+  if (listener != NULL)
+    Append(listener);
+}
+
+// Sets the default_xml_generator attribute to the provided listener.  The
+// listener is also added to the listener list and previous
+// default_xml_generator is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
+  if (default_xml_generator_ == listener)
+    return;
+
+  // It is an error to pass this method a listener that is already in the
+  // list.
+  TestEventListener* const previous = Release(default_xml_generator_);
+  delete previous;
+
+  default_xml_generator_ = listener;
+  if (listener != NULL)
+    Append(listener);
+}
+
+// Reports whether events are currently forwarded by the repeater to the
+// listeners in the list.
+bool TestEventListeners::EventForwardingEnabled() const {
+  return repeater_->forwarding_enabled();
+}
+
+// Turns event forwarding off on the repeater.
+void TestEventListeners::SuppressEventForwarding() {
+  repeater_->set_forwarding_enabled(false);
+}
+
+// class UnitTest
+
+// Gets the singleton UnitTest object.  The first time this method is
+// called, a UnitTest object is constructed and returned.  Consecutive
+// calls will return the same object.
+//
+// We don't protect this under mutex_ as a user is not supposed to
+// call this before main() starts, from which point on the return
+// value will never change.
+UnitTest* UnitTest::GetInstance() {
+  // When compiled with MSVC 7.1 in optimized mode, destroying the
+  // UnitTest object upon exiting the program messes up the exit code,
+  // causing successful tests to appear failed.  We have to use a
+  // different implementation in this case to bypass the compiler bug.
+  // This implementation makes the compiler happy, at the cost of
+  // leaking the UnitTest object.
+
+  // CodeGear C++Builder insists on a public destructor for the
+  // default implementation.  Use this implementation to keep good OO
+  // design with private destructor.
+
+#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+  // Heap-allocated and never deleted, so no destructor runs at exit.
+  static UnitTest* const instance = new UnitTest;
+  return instance;
+#else
+  // Function-local static object, constructed on the first call.
+  static UnitTest instance;
+  return &instance;
+#endif  // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+}
+
+// The accessors below all forward to the function of the same name on the
+// implementation object returned by impl().
+
+// Gets the number of successful test cases.
+int UnitTest::successful_test_case_count() const {
+  return impl()->successful_test_case_count();
+}
+
+// Gets the number of failed test cases.
+int UnitTest::failed_test_case_count() const {
+  return impl()->failed_test_case_count();
+}
+
+// Gets the number of all test cases.
+int UnitTest::total_test_case_count() const {
+  return impl()->total_test_case_count();
+}
+
+// Gets the number of all test cases that contain at least one test
+// that should run.
+int UnitTest::test_case_to_run_count() const {
+  return impl()->test_case_to_run_count();
+}
+
+// Gets the number of successful tests.
+int UnitTest::successful_test_count() const {
+  return impl()->successful_test_count();
+}
+
+// Gets the number of failed tests.
+int UnitTest::failed_test_count() const { return impl()->failed_test_count(); }
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTest::reportable_disabled_test_count() const {
+  return impl()->reportable_disabled_test_count();
+}
+
+// Gets the number of disabled tests.
+int UnitTest::disabled_test_count() const {
+  return impl()->disabled_test_count();
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTest::reportable_test_count() const {
+  return impl()->reportable_test_count();
+}
+
+// Gets the number of all tests.
+int UnitTest::total_test_count() const { return impl()->total_test_count(); }
+
+// Gets the number of tests that should run.
+int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
+
+// Gets the time of the test program start, in ms from the start of the
+// UNIX epoch.
+internal::TimeInMillis UnitTest::start_timestamp() const {
+    return impl()->start_timestamp();
+}
+
+// Gets the elapsed time, in milliseconds.
+internal::TimeInMillis UnitTest::elapsed_time() const {
+  return impl()->elapsed_time();
+}
+
+// Returns true iff the unit test passed (i.e. all test cases passed).
+bool UnitTest::Passed() const { return impl()->Passed(); }
+
+// Returns true iff the unit test failed (i.e. some test case failed
+// or something outside of all tests failed).
+bool UnitTest::Failed() const { return impl()->Failed(); }
+
+// Gets the i-th test case among all the test cases. i can range from 0 to
+// total_test_case_count() - 1. If i is not in that range, returns NULL.
+// The returned test case is read-only.
+const TestCase* UnitTest::GetTestCase(int i) const {
+  return impl()->GetTestCase(i);
+}
+
+// Returns the TestResult containing information on test failures and
+// properties logged outside of individual test cases.
+const TestResult& UnitTest::ad_hoc_test_result() const {
+  return *impl()->ad_hoc_test_result();
+}
+
+// Gets the i-th test case among all the test cases as a mutable object.
+// i can range from 0 to total_test_case_count() - 1. If i is not in that
+// range, returns NULL.
+TestCase* UnitTest::GetMutableTestCase(int i) {
+  return impl()->GetMutableTestCase(i);
+}
+
+// Returns the list of event listeners that can be used to track events
+// inside Google Test.  The list belongs to the implementation object; the
+// pointer it hands out is dereferenced here.
+TestEventListeners& UnitTest::listeners() {
+  return *impl()->listeners();
+}
+
+// Registers and returns a global test environment.  When a test
+// program is run, all global test environments will be set-up in the
+// order they were registered.  After all tests in the program have
+// finished, all global test environments will be torn-down in the
+// *reverse* order they were registered.
+//
+// A NULL env is not registered and NULL is returned.
+//
+// The UnitTest object takes ownership of the given environment.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment* UnitTest::AddEnvironment(Environment* env) {
+  if (env != NULL) {
+    impl_->environments().push_back(env);
+  }
+
+  // env is NULL exactly when nothing was registered.
+  return env;
+}
+
+// Adds a TestPartResult to the current TestResult object.  All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results.  The user code should use the
+// assertion macros instead of calling this directly.
+//
+// The reported message is `message`, followed by the entries of the trace
+// stack (if any) and by os_stack_trace (if not empty).  For any result
+// other than kSuccess, the break_on_failure and throw_on_failure flags are
+// honored after the result has been reported.
+void UnitTest::AddTestPartResult(
+    TestPartResult::Type result_type,
+    const char* file_name,
+    int line_number,
+    const std::string& message,
+    const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
+  Message msg;
+  msg << message;
+
+  // mutex_ is held from here until the function returns or throws.
+  internal::MutexLock lock(&mutex_);
+  if (impl_->gtest_trace_stack().size() > 0) {
+    msg << "\n" << GTEST_NAME_ << " trace:";
+
+    // Walks the trace stack from its last element down to its first.
+    for (int i = static_cast<int>(impl_->gtest_trace_stack().size());
+         i > 0; --i) {
+      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
+          << " " << trace.message;
+    }
+  }
+
+  // c_str() never returns NULL, so only the emptiness test is effective.
+  if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) {
+    msg << internal::kStackTraceMarker << os_stack_trace;
+  }
+
+  const TestPartResult result =
+    TestPartResult(result_type, file_name, line_number,
+                   msg.GetString().c_str());
+  impl_->GetTestPartResultReporterForCurrentThread()->
+      ReportTestPartResult(result);
+
+  if (result_type != TestPartResult::kSuccess) {
+    // gtest_break_on_failure takes precedence over
+    // gtest_throw_on_failure.  This allows a user to set the latter
+    // in the code (perhaps in order to use Google Test assertions
+    // with another testing framework) and specify the former on the
+    // command line for debugging.
+    if (GTEST_FLAG(break_on_failure)) {
+#if GTEST_OS_WINDOWS
+      // Using DebugBreak on Windows allows gtest to still break into a debugger
+      // when a failure happens and both the --gtest_break_on_failure and
+      // the --gtest_catch_exceptions flags are specified.
+      DebugBreak();
+#else
+      // Dereference NULL through a volatile pointer to prevent the compiler
+      // from removing. We use this rather than abort() or __builtin_trap() for
+      // portability: Symbian doesn't implement abort() well, and some debuggers
+      // don't correctly trap abort().
+      *static_cast<volatile int*>(NULL) = 1;
+#endif  // GTEST_OS_WINDOWS
+    } else if (GTEST_FLAG(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+      throw internal::GoogleTestFailureException(result);
+#else
+      // We cannot call abort() as it generates a pop-up in debug mode
+      // that cannot be suppressed in VC 7.1 or below.
+      exit(1);
+#endif
+    }
+  }
+}
+
+// Adds a TestProperty to the current TestResult object when invoked from
+// inside a test, to current TestCase's ad_hoc_test_result_ when invoked
+// from SetUpTestCase or TearDownTestCase, or to the global property set
+// when invoked elsewhere.  If the result already contains a property with
+// the same key, the value will be updated.
+//
+// This function only wraps key and value in a TestProperty and hands it to
+// the implementation object, which does the recording.
+void UnitTest::RecordProperty(const std::string& key,
+                              const std::string& value) {
+  impl_->RecordProperty(TestProperty(key, value));
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+int UnitTest::Run() {
+  // A non-empty internal_run_death_test flag marks a death test child.
+  const bool in_death_test_child_process =
+      internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+
+  // Google Test implements this protocol for catching that a test
+  // program exits before returning control to Google Test:
+  //
+  //   1. Upon start, Google Test creates a file whose absolute path
+  //      is specified by the environment variable
+  //      TEST_PREMATURE_EXIT_FILE.
+  //   2. When Google Test has finished its work, it deletes the file.
+  //
+  // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before
+  // running a Google-Test-based test program and check the existence
+  // of the file at the end of the test execution to see if it has
+  // exited prematurely.
+
+  // If we are in the child process of a death test, don't
+  // create/delete the premature exit file, as doing so is unnecessary
+  // and will confuse the parent process.  Otherwise, create/delete
+  // the file upon entering/leaving this function.  If the program
+  // somehow exits before this function has a chance to return, the
+  // premature-exit file will be left undeleted, causing a test runner
+  // that understands the premature-exit-file protocol to report the
+  // test as having failed.
+  const internal::ScopedPrematureExitFile premature_exit_file(
+      in_death_test_child_process ?
+      NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
+
+  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
+  // used for the duration of the program.
+  impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+
+#if GTEST_HAS_SEH
+  // Either the user wants Google Test to catch exceptions thrown by the
+  // tests or this is executing in the context of death test child
+  // process. In either case the user does not want to see pop-up dialogs
+  // about crashes - they are expected.
+  if (impl()->catch_exceptions() || in_death_test_child_process) {
+# if !GTEST_OS_WINDOWS_MOBILE
+    // SetErrorMode doesn't exist on CE.
+    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
+                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
+# endif  // !GTEST_OS_WINDOWS_MOBILE
+
+# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+    // Death test children can be terminated with _abort().  On Windows,
+    // _abort() can show a dialog with a warning message.  This forces the
+    // abort message to go to stderr instead.
+    _set_error_mode(_OUT_TO_STDERR);
+# endif
+
+# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+    // In the debug version, Visual Studio pops up a separate dialog
+    // offering a choice to debug the aborted program. We need to suppress
+    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
+    // executed. Google Test will notify the user of any unexpected
+    // failure via stderr.
+    //
+    // VC++ doesn't define _set_abort_behavior() prior to the version 8.0.
+    // Users of prior VC versions shall suffer the agony and pain of
+    // clicking through the countless debug dialogs.
+    // TODO(vladl@google.com): find a way to suppress the abort dialog() in the
+    // debug mode when compiled with VC 7.1 or lower.
+    if (!GTEST_FLAG(break_on_failure))
+      _set_abort_behavior(
+          0x0,                                    // Clear the following flags:
+          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
+# endif
+  }
+#endif  // GTEST_HAS_SEH
+
+  // A true result from the RunAllTests call maps to exit code 0, a false
+  // result to 1.
+  return internal::HandleExceptionsInMethodIfSupported(
+      impl(),
+      &internal::UnitTestImpl::RunAllTests,
+      "auxiliary test code (environments or event listeners)") ? 0 : 1;
+}
+
+// Returns the working directory when the first TEST() or TEST_F() was
+// executed.  The result is c_str() of a string member held by impl_.
+const char* UnitTest::original_working_dir() const {
+  return impl_->original_working_dir_.c_str();
+}
+
+// Returns the TestCase object for the test that's currently running,
+// or NULL if no test is running.  impl_ is read while mutex_ is held.
+const TestCase* UnitTest::current_test_case() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_case();
+}
+
+// Returns the TestInfo object for the test that's currently running,
+// or NULL if no test is running.  impl_ is read while mutex_ is held.
+const TestInfo* UnitTest::current_test_info() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_info();
+}
+
+// Returns the random seed used at the start of the current test run (no lock).
+int UnitTest::random_seed() const { return impl_->random_seed(); }
+
+#if GTEST_HAS_PARAM_TEST
+// Returns the ParameterizedTestCaseRegistry object, held by impl_, that
+// keeps track of value-parameterized tests, instantiates and registers them.
+internal::ParameterizedTestCaseRegistry&
+    UnitTest::parameterized_test_registry()
+        GTEST_LOCK_EXCLUDED_(mutex_) {
+  return impl_->parameterized_test_registry();  // NOTE(review): no lock taken.
+}
+#endif  // GTEST_HAS_PARAM_TEST
+
+// Creates an empty UnitTest and allocates the UnitTestImpl it delegates to.
+UnitTest::UnitTest() {
+  impl_ = new internal::UnitTestImpl(this);  // Released in ~UnitTest().
+}
+
+// Destructor of UnitTest.  Frees the UnitTestImpl allocated by the constructor.
+UnitTest::~UnitTest() {
+  delete impl_;
+}
+
+// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+// Google Test trace stack.  The push happens while mutex_ is held.
+void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().push_back(trace);
+}
+
+// Pops the most recently pushed trace from the per-thread trace stack.
+void UnitTest::PopGTestTrace()
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().pop_back();  // Stack assumed non-empty.
+}
+
+namespace internal {
+
+UnitTestImpl::UnitTestImpl(UnitTest* parent)
+    : parent_(parent),
+#ifdef _MSC_VER
+# pragma warning(push)                    // Saves the current warning state.
+# pragma warning(disable:4355)            // Temporarily disables warning 4355
+                                         // (using this in initializer).
+      default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+# pragma warning(pop)                     // Restores the warning state again.
+#else  // Same two initializers, without the MSVC-only pragmas.
+      default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+#endif  // _MSC_VER
+      global_test_part_result_repoter_(
+          &default_global_test_part_result_reporter_),
+      per_thread_test_part_result_reporter_(
+          &default_per_thread_test_part_result_reporter_),
+#if GTEST_HAS_PARAM_TEST
+      parameterized_test_registry_(),
+      parameterized_tests_registered_(false),
+#endif  // GTEST_HAS_PARAM_TEST
+      last_death_test_case_(-1),  // No death test case registered yet.
+      current_test_case_(NULL),
+      current_test_info_(NULL),
+      ad_hoc_test_result_(),
+      os_stack_trace_getter_(NULL),  // Created lazily on first use.
+      post_flag_parse_init_performed_(false),
+      random_seed_(0),  // Will be overridden by the flag before first use.
+      random_(0),  // Will be reseeded before first use.
+      start_timestamp_(0),
+      elapsed_time_(0),
+#if GTEST_HAS_DEATH_TEST
+      death_test_factory_(new DefaultDeathTestFactory),
+#endif  // GTEST_HAS_DEATH_TEST
+      // Will be overridden by the flag before first use.
+      catch_exceptions_(false) {
+  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
+}
+
+UnitTestImpl::~UnitTestImpl() {
+  // Deletes every TestCase stored in test_cases_.
+  ForEach(test_cases_, internal::Delete<TestCase>);
+
+  // Deletes every Environment stored in environments_.
+  ForEach(environments_, internal::Delete<Environment>);
+
+  delete os_stack_trace_getter_;  // No-op if never created (still NULL).
+}
+
+// Records test_property on the TestResult of the innermost active scope:
+// the running test, else the current test case's ad_hoc_test_result_ (when
+// invoked from SetUpTestCase/TearDownTestCase), else the global ad hoc
+// result.  If the result already contains a property with the same key,
+// the value will be updated.
+void UnitTestImpl::RecordProperty(const TestProperty& test_property) {
+  const bool in_test = current_test_info_ != NULL;
+  const bool in_test_case = current_test_case_ != NULL;
+
+  // Name of the XML element that corresponds to the chosen scope.
+  const std::string xml_element =
+      in_test ? "testcase" : (in_test_case ? "testsuite" : "testsuites");
+
+  // TestResult appropriate for property recording.
+  TestResult* const test_result =
+      in_test ? &(current_test_info_->result_)
+              : (in_test_case ? &(current_test_case_->ad_hoc_test_result_)
+                              : &ad_hoc_test_result_);
+
+  test_result->RecordProperty(xml_element, test_property);
+}
+
+#if GTEST_HAS_DEATH_TEST
+// Turns off event forwarding when internal_run_death_test_flag_ is set,
+// i.e. inside a death test subprocess.  Call only after InitGoogleTest.
+void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
+  if (internal_run_death_test_flag_.get() == NULL) return;
+  listeners()->SuppressEventForwarding();
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Sets up XML output per UnitTestOptions.  Call only after InitGoogleTest.
+void UnitTestImpl::ConfigureXmlOutput() {
+  const std::string& format = UnitTestOptions::GetOutputFormat();
+  if (format.empty()) return;  // No output format requested.
+  if (format != "xml") {
+    printf("WARNING: unrecognized output format \"%s\" ignored.\n",
+           format.c_str());
+    fflush(stdout);
+    return;
+  }
+  listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
+      UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+}
+
+#if GTEST_CAN_STREAM_RESULTS_
+// Appends a StreamingListener built from the stream_result_to flag, if the
+// flag is set.  Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureStreamingOutput() {
+  const std::string& target = GTEST_FLAG(stream_result_to);
+  if (target.empty()) return;  // Streaming was not requested.
+  // The target is split at its first ':'; both halves go to the listener.
+  const size_t colon = target.find(':');
+  if (colon == std::string::npos) {
+    printf("WARNING: unrecognized streaming target \"%s\" ignored.\n",
+           target.c_str());
+    fflush(stdout);
+    return;
+  }
+  listeners()->Append(new StreamingListener(target.substr(0, colon),
+                                            target.substr(colon + 1)));
+}
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// Performs the initialization that depends on flag values obtained in
+// ParseGoogleTestFlagsOnly.  InitGoogleTest calls this right after
+// ParseGoogleTestFlagsOnly, and RunAllTests calls it again in case the
+// user neglected to call InitGoogleTest, so the work is guarded by
+// post_flag_parse_init_performed_ and happens at most once.
+void UnitTestImpl::PostFlagParsingInit() {
+  // Every call after the first one is a no-op.
+  if (post_flag_parse_init_performed_) return;
+  post_flag_parse_init_performed_ = true;
+
+#if GTEST_HAS_DEATH_TEST
+  InitDeathTestSubprocessControlInfo();
+  SuppressTestEventsIfInSubprocess();
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // Parameterized tests are registered here so that they are
+  // available to the UnitTest reflection API without running
+  // RUN_ALL_TESTS.
+  RegisterParameterizedTests();
+
+  // The XML listeners are set up here, which makes it possible for
+  // users to shut down the default XML output before invoking
+  // RUN_ALL_TESTS.
+  ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+  // Sets up listeners that stream test results to the specified server.
+  ConfigureStreamingOutput();
+#endif  // GTEST_CAN_STREAM_RESULTS_
+}
+
+// A predicate that checks the name of a TestCase against a known
+// value.
+//
+// This is used for implementation of the UnitTest class only.  It is
+// defined inside namespace internal (not an anonymous namespace) to
+// avoid polluting the outer namespace.
+//
+// TestCaseNameIs is copyable.
+class TestCaseNameIs {
+ public:
+  // Constructor.  Stores a copy of name for later comparisons.
+  explicit TestCaseNameIs(const std::string& name)
+      : name_(name) {}
+
+  // Returns true iff test_case is non-NULL and its name equals name_.
+  bool operator()(const TestCase* test_case) const {
+    return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0;
+  }
+
+ private:
+  std::string name_;
+};
+
+// Finds and returns a TestCase with the given name.  If one doesn't
+// exist, creates one and returns it.  It's the CALLER'S
+// RESPONSIBILITY to ensure that this function is only called WHEN THE
+// TESTS ARE NOT SHUFFLED.
+//
+// Arguments:
+//
+//   test_case_name: name of the test case
+//   type_param:     the name of the test case's type parameter, or NULL if
+//                   this is not a typed or a type-parameterized test case.
+//   set_up_tc:      pointer to the function that sets up the test case
+//   tear_down_tc:   pointer to the function that tears down the test case
+TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
+                                    const char* type_param,
+                                    Test::SetUpTestCaseFunc set_up_tc,
+                                    Test::TearDownTestCaseFunc tear_down_tc) {
+  // Can we find a TestCase with the given name?
+  const std::vector<TestCase*>::const_iterator test_case =
+      std::find_if(test_cases_.begin(), test_cases_.end(),
+                   TestCaseNameIs(test_case_name));
+
+  if (test_case != test_cases_.end())
+    return *test_case;
+
+  // No.  Let's create one; test_cases_ keeps it until ~UnitTestImpl().
+  TestCase* const new_test_case =
+      new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc);
+
+  // Is this a death test case (name matches kDeathTestCaseFilter)?
+  if (internal::UnitTestOptions::MatchesFilter(test_case_name,
+                                               kDeathTestCaseFilter)) {
+    // Yes.  Inserts the test case after the last death test case
+    // defined so far.  This only works when the test cases haven't
+    // been shuffled.  Otherwise we may end up running a death test
+    // after a non-death test.
+    ++last_death_test_case_;
+    test_cases_.insert(test_cases_.begin() + last_death_test_case_,
+                       new_test_case);
+  } else {
+    // No.  Appends to the end of the list.
+    test_cases_.push_back(new_test_case);
+  }
+  // Adds the next identity index (its value equals the old index count).
+  test_case_indices_.push_back(static_cast<int>(test_case_indices_.size()));
+  return new_test_case;
+}
+
+// Helpers for setting up / tearing down the given environment.  They are
+// passed to ForEach() and std::for_each() over environments_, respectively.
+static void SetUpEnvironment(Environment* env) { env->SetUp(); }
+static void TearDownEnvironment(Environment* env) { env->TearDown(); }
+
+// Runs all tests in this UnitTest object, prints the result, and
+// returns true if all tests are successful.  If any exception is
+// thrown during a test, the test is considered to be failed, but the
+// rest of the tests will still be run.  Also returns true, without
+// running anything, when help or test listing was requested.
+// When parameterized tests are enabled, it expands and registers
+// parameterized tests first in RegisterParameterizedTests().
+// All other functions called from RunAllTests() may safely assume that
+// parameterized tests are ready to be counted and run.
+bool UnitTestImpl::RunAllTests() {
+  // Makes sure InitGoogleTest() was called.
+  if (!GTestIsInitialized()) {
+    printf("%s",
+           "\nThis test program did NOT call ::testing::InitGoogleTest "
+           "before calling RUN_ALL_TESTS().  Please fix it.\n");
+    return false;
+  }
+
+  // Do not run any test if the --help flag was specified.
+  if (g_help_flag)
+    return true;
+
+  // Repeats the call to the post-flag parsing initialization in case the
+  // user didn't call InitGoogleTest.
+  PostFlagParsingInit();
+
+  // Even if sharding is not on, test runners may want to use the
+  // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
+  // protocol.
+  internal::WriteToShardStatusFileIfNeeded();
+
+  // True iff we are in a subprocess for running a thread-safe-style
+  // death test.
+  bool in_subprocess_for_death_test = false;
+
+#if GTEST_HAS_DEATH_TEST
+  in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL);
+#endif  // GTEST_HAS_DEATH_TEST
+
+  const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
+                                        in_subprocess_for_death_test);
+
+  // Compares the full test names with the filter to decide which
+  // tests to run.
+  const bool has_tests_to_run = FilterTests(should_shard
+                                              ? HONOR_SHARDING_PROTOCOL
+                                              : IGNORE_SHARDING_PROTOCOL) > 0;
+
+  // Lists the tests and exits if the --gtest_list_tests flag was specified.
+  if (GTEST_FLAG(list_tests)) {
+    // This must be called *after* FilterTests() has been called.
+    ListTestsMatchingFilter();
+    return true;
+  }
+
+  random_seed_ = GTEST_FLAG(shuffle) ?
+      GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+
+  // True iff at least one test has failed.
+  bool failed = false;
+
+  TestEventListener* repeater = listeners()->repeater();
+
+  start_timestamp_ = GetTimeInMillis();
+  repeater->OnTestProgramStart(*parent_);
+
+  // How many times to repeat the tests?  We don't want to repeat them
+  // when we are inside the subprocess of a death test.
+  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
+  // Repeats forever if the repeat count is negative.
+  const bool forever = repeat < 0;
+  for (int i = 0; forever || i != repeat; i++) {
+    // We want to preserve failures generated by ad-hoc test
+    // assertions executed before RUN_ALL_TESTS().
+    ClearNonAdHocTestResult();
+
+    const TimeInMillis start = GetTimeInMillis();
+
+    // Shuffles test cases and tests if requested.
+    if (has_tests_to_run && GTEST_FLAG(shuffle)) {
+      random()->Reseed(random_seed_);
+      // This should be done before calling OnTestIterationStart(),
+      // such that a test event listener can see the actual test order
+      // in the event.
+      ShuffleTests();
+    }
+
+    // Tells the unit test event listeners that the tests are about to start.
+    repeater->OnTestIterationStart(*parent_, i);
+
+    // Runs each test case if there is at least one test to run.
+    if (has_tests_to_run) {
+      // Sets up all environments beforehand.
+      repeater->OnEnvironmentsSetUpStart(*parent_);
+      ForEach(environments_, SetUpEnvironment);
+      repeater->OnEnvironmentsSetUpEnd(*parent_);
+
+      // Runs the tests only if there was no fatal failure during global
+      // set-up.
+      if (!Test::HasFatalFailure()) {
+        for (int test_index = 0; test_index < total_test_case_count();
+             test_index++) {
+          GetMutableTestCase(test_index)->Run();
+        }
+      }
+
+      // Tears down all environments in reverse order afterwards.
+      repeater->OnEnvironmentsTearDownStart(*parent_);
+      std::for_each(environments_.rbegin(), environments_.rend(),
+                    TearDownEnvironment);
+      repeater->OnEnvironmentsTearDownEnd(*parent_);
+    }
+
+    elapsed_time_ = GetTimeInMillis() - start;
+
+    // Tells the unit test event listener that the tests have just finished.
+    repeater->OnTestIterationEnd(*parent_, i);
+
+    // Remembers whether any iteration so far has failed.
+    if (!Passed()) {
+      failed = true;
+    }
+
+    // Restores the original test order after the iteration.  This
+    // allows the user to quickly repro a failure that happens in the
+    // N-th iteration without repeating the first (N - 1) iterations.
+    // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
+    // case the user somehow changes the value of the flag somewhere
+    // (it's always safe to unshuffle the tests).
+    UnshuffleTests();
+
+    if (GTEST_FLAG(shuffle)) {
+      // Picks a new random seed for each iteration.
+      random_seed_ = GetNextRandomSeed(random_seed_);
+    }
+  }
+
+  repeater->OnTestProgramEnd(*parent_);
+
+  return !failed;
+}
+
+// Looks up the environment variable named by kTestShardStatusFile and, if
+// it is set, opens the file it names in "w" mode and closes it again.  Does
+// nothing when the variable is unset.  If the file cannot be opened, prints
+// an error in red and terminates the process with exit(EXIT_FAILURE).
+void WriteToShardStatusFileIfNeeded() {
+  const char* const status_path = posix::GetEnv(kTestShardStatusFile);
+  if (status_path == NULL) return;  // Variable not set: nothing to do.
+  FILE* const status_file = posix::FOpen(status_path, "w");
+  if (status_file != NULL) {
+    fclose(status_file);
+    return;
+  }
+  ColoredPrintf(COLOR_RED,
+                "Could not write to the test shard status file \"%s\" "
+                "specified by the %s environment variable.\n",
+                status_path, kTestShardStatusFile);
+  fflush(stdout);
+  exit(EXIT_FAILURE);
+}
+
+// Checks whether sharding is enabled by examining the environment
+// variables named by total_shards_env and shard_index_env.  If only one
+// is set, or shard_index is outside [0, total_shards), prints an error and
+// exits.  If in_subprocess_for_death_test, sharding is disabled because it
+// must only be applied to the original test process; otherwise we could
+// filter out death tests we intended to execute.  True iff total_shards > 1.
+bool ShouldShard(const char* total_shards_env,
+                 const char* shard_index_env,
+                 bool in_subprocess_for_death_test) {
+  if (in_subprocess_for_death_test) {
+    return false;
+  }
+
+  const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1);
+  const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1);
+
+  if (total_shards == -1 && shard_index == -1) {
+    return false;
+  } else if (total_shards == -1 && shard_index != -1) {
+    const Message msg = Message()
+      << "Invalid environment variables: you have "
+      << kTestShardIndex << " = " << shard_index
+      << ", but have left " << kTestTotalShards << " unset.\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());  // Fixed format.
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (total_shards != -1 && shard_index == -1) {
+    const Message msg = Message()
+      << "Invalid environment variables: you have "
+      << kTestTotalShards << " = " << total_shards
+      << ", but have left " << kTestShardIndex << " unset.\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (shard_index < 0 || shard_index >= total_shards) {
+    const Message msg = Message()
+      << "Invalid environment variables: we require 0 <= "
+      << kTestShardIndex << " < " << kTestTotalShards
+      << ", but you have " << kTestShardIndex << "=" << shard_index
+      << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  }
+
+  return total_shards > 1;
+}
+
+// Reads the environment variable var as an Int32.  Returns default_val
+// when the variable is unset; terminates the process via
+// exit(EXIT_FAILURE) when ParseInt32() rejects its value.
+Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) {
+  const char* const env_text = posix::GetEnv(var);
+  if (env_text == NULL) {
+    return default_val;
+  }
+
+  Int32 parsed;
+  const bool ok = ParseInt32(
+      Message() << "The value of environment variable " << var, env_text,
+      &parsed);
+  if (!ok) exit(EXIT_FAILURE);
+  return parsed;
+}
+
+// Tells whether the test identified by test_id belongs to the shard
+// shard_index out of total_shards: tests are dealt out round-robin by
+// test_id modulo total_shards.  The test id is some arbitrary but unique
+// non-negative integer.  Assumes that 0 <= shard_index < total_shards.
+bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
+  return shard_index == (test_id % total_shards);
+}
+
+// Compares the name of each test with the user-specified filter to
+// decide whether the test should be run, then records the result in
+// each TestCase and TestInfo object.
+// If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests based on
+// sharding variables in the environment - see
+// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide.
+// Returns the number of tests that should run.
+int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
+  const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
+  const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
+
+  // num_runnable_tests are the number of tests that will
+  // run across all shards (i.e., match filter and are not disabled).
+  // num_selected_tests are the number of tests to be run on
+  // this shard.
+  int num_runnable_tests = 0;
+  int num_selected_tests = 0;
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    TestCase* const test_case = test_cases_[i];
+    const std::string &test_case_name = test_case->name();
+    test_case->set_should_run(false);  // Re-enabled below if a test is selected.
+
+    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
+      TestInfo* const test_info = test_case->test_info_list()[j];
+      const std::string test_name(test_info->name());
+      // A test is disabled if test case name or test name matches
+      // kDisableTestFilter.
+      const bool is_disabled =
+          internal::UnitTestOptions::MatchesFilter(test_case_name,
+                                                   kDisableTestFilter) ||
+          internal::UnitTestOptions::MatchesFilter(test_name,
+                                                   kDisableTestFilter);
+      test_info->is_disabled_ = is_disabled;
+
+      const bool matches_filter =
+          internal::UnitTestOptions::FilterMatchesTest(test_case_name,
+                                                       test_name);
+      test_info->matches_filter_ = matches_filter;
+
+      const bool is_runnable =
+          (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
+          matches_filter;
+
+      const bool is_selected = is_runnable &&
+          (shard_tests == IGNORE_SHARDING_PROTOCOL ||
+           ShouldRunTestOnShard(total_shards, shard_index,
+                                num_runnable_tests));  // Id = runnable index.
+
+      num_runnable_tests += is_runnable;  // bool adds 0 or 1.
+      num_selected_tests += is_selected;
+
+      test_info->should_run_ = is_selected;
+      test_case->set_should_run(test_case->should_run() || is_selected);
+    }
+  }
+  return num_selected_tests;
+}
+
+// Prints the given C-string to stdout on a single line, writing each
+// '\n' as the two characters "\\n".  Once max_length output characters
+// have been written and input remains, prints "..." and stops.  A NULL
+// str prints nothing.
+static void PrintOnOneLine(const char* str, int max_length) {
+  if (str == NULL) return;
+
+  int printed = 0;  // Output characters emitted so far.
+  for (const char* p = str; *p != '\0'; ++p) {
+    if (printed >= max_length) {
+      printf("...");
+      return;
+    }
+    const bool is_newline = (*p == '\n');
+    if (is_newline) {
+      printf("\\n");
+    } else {
+      printf("%c", *p);
+    }
+    printed += is_newline ? 2 : 1;
+  }
+}
+
+// Prints to stdout the names of the tests whose matches_filter_ is set.
+void UnitTestImpl::ListTestsMatchingFilter() {
+  // Print at most this many characters for each type/value parameter.
+  const int kMaxParamLength = 250;
+
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    const TestCase* const test_case = test_cases_[i];
+    bool printed_test_case_name = false;  // Header is printed at most once.
+
+    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
+      const TestInfo* const test_info =
+          test_case->test_info_list()[j];
+      if (test_info->matches_filter_) {
+        if (!printed_test_case_name) {
+          printed_test_case_name = true;
+          printf("%s.", test_case->name());
+          if (test_case->type_param() != NULL) {
+            printf("  # %s = ", kTypeParamLabel);
+            // We print the type parameter on a single line to make
+            // the output easy to parse by a program.
+            PrintOnOneLine(test_case->type_param(), kMaxParamLength);
+          }
+          printf("\n");
+        }
+        printf("  %s", test_info->name());  // Test names are indented.
+        if (test_info->value_param() != NULL) {
+          printf("  # %s = ", kValueParamLabel);
+          // We print the value parameter on a single line to make the
+          // output easy to parse by a program.
+          PrintOnOneLine(test_info->value_param(), kMaxParamLength);
+        }
+        printf("\n");
+      }
+    }
+  }
+  fflush(stdout);
+}
+
+// Installs getter as the OS stack trace getter.
+//
+// When getter already is the current getter this is a no-op; otherwise
+// the previous getter is deleted and getter takes its place (it is then
+// deleted in turn by a later replacement or by ~UnitTestImpl()).
+void UnitTestImpl::set_os_stack_trace_getter(
+    OsStackTraceGetterInterface* getter) {
+  if (os_stack_trace_getter_ == getter) return;  // Nothing to replace.
+
+  delete os_stack_trace_getter_;
+  os_stack_trace_getter_ = getter;
+}
+
+// Returns the current OS stack trace getter.  If none has been set yet,
+// an OsStackTraceGetter is created first, stored as the current getter,
+// and then returned.
+OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
+  // Lazily creates the default getter on first use.
+  if (!os_stack_trace_getter_)
+    os_stack_trace_getter_ = new OsStackTraceGetter;
+
+  return os_stack_trace_getter_;
+}
+
+// Returns the TestResult of the currently running test, or the global
+// ad hoc TestResult when current_test_info_ is NULL.
+TestResult* UnitTestImpl::current_test_result() {
+  if (current_test_info_ == NULL) return &ad_hoc_test_result_;
+  return &(current_test_info_->result_);
+}
+
+// Shuffles all test cases (by permuting test_case_indices_), and the tests
+// within each test case, making sure that death tests are still run first.
+void UnitTestImpl::ShuffleTests() {
+  // Shuffles the death test cases: range 0 .. last_death_test_case_ + 1.
+  ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_);
+
+  // Shuffles the non-death test cases: the rest, up to test_cases_.size().
+  ShuffleRange(random(), last_death_test_case_ + 1,
+               static_cast<int>(test_cases_.size()), &test_case_indices_);
+
+  // Shuffles the tests inside each test case.
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    test_cases_[i]->ShuffleTests(random());
+  }
+}
+
+// Puts the test cases and their tests back into their pre-shuffle order.
+void UnitTestImpl::UnshuffleTests() {
+  for (size_t index = 0; index < test_cases_.size(); ++index) {
+    // Puts this test case back at its original position.
+    test_case_indices_[index] = static_cast<int>(index);
+    // Restores the original order of the tests inside it.
+    test_cases_[index]->UnshuffleTests();
+  }
+}
+
+// Returns the current OS stack trace as an std::string.  The unit_test
+// argument is unused; the trace is obtained through GetUnitTestImpl().
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
+                                            int skip_count) {
+  // We pass skip_count + 1 to skip this wrapper function in addition
+  // to what the user really wants to skip.
+  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
+}
+
+// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
+// suppress unreachable code warnings.
+namespace {
+class ClassUniqueToAlwaysTrue {};  // Only named in AlwaysTrue()'s throw.
+}  // namespace
+
+bool IsTrue(bool condition) { return condition; }  // Returns its argument.
+
+bool AlwaysTrue() {
+#if GTEST_HAS_EXCEPTIONS
+  // IsTrue(false) is always false, so AlwaysTrue() never actually throws,
+  // but the throw statement makes the compiler think that it may.
+  if (IsTrue(false))
+    throw ClassUniqueToAlwaysTrue();
+#endif  // GTEST_HAS_EXCEPTIONS
+  return true;
+}
+
+// Tests whether *pstr begins with prefix.  If so, advances *pstr past
+// the prefix and returns true; if not, returns false and leaves *pstr
+// as it was.  None of pstr, *pstr, and prefix can be NULL.
+bool SkipPrefix(const char* prefix, const char** pstr) {
+  const size_t len = strlen(prefix);
+  const bool matched = (strncmp(*pstr, prefix, len) == 0);
+  if (matched) {
+    *pstr += len;  // Consume the prefix.
+  }
+  return matched;
+}
+
+// Parses str as the command line flag named flag.  The expected form is
+// "--" GTEST_FLAG_PREFIX_ flag "=value"; when def_optional is true the
+// "=value" part may be left out.
+//
+// Returns a pointer to the value inside str, or NULL if str does not match.
+const char* ParseFlagValue(const char* str,
+                           const char* flag,
+                           bool def_optional) {
+  // Neither argument may be NULL.
+  if (str == NULL || flag == NULL) return NULL;
+
+  // The full spelling is "--" followed by GTEST_FLAG_PREFIX_ and the name.
+  std::string expected("--");
+  expected += GTEST_FLAG_PREFIX_;
+  expected += flag;
+  if (strncmp(str, expected.c_str(), expected.length()) != 0) return NULL;
+
+  // First character after the flag name.
+  const char* const rest = str + expected.length();
+  switch (*rest) {
+    case '=':
+      // "--flag=value": the value starts right after the '='.
+      return rest + 1;
+    case '\0':
+      // Bare "--flag": acceptable only when the value is optional.  The
+      // returned pointer then refers to the empty string.
+      return def_optional ? rest : NULL;
+    default:
+      // Something else follows the name, so this is a different flag.
+      return NULL;
+  }
+}
+
+// Parses str as the bool flag named flag.  Both "--flag=value" and the
+// bare "--flag" spelling are accepted.
+//
+// With "--flag=value", the flag is false when the value starts with
+// '0', 'f', or 'F', and true for any other value.
+//
+// With the bare "--flag" spelling, the flag is true.
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+  // The "=value" part is optional for bool flags.
+  const char* const text = ParseFlagValue(str, flag, true);
+  // str does not spell this flag; *value is left untouched.
+  if (text == NULL) return false;
+
+  // Only a leading '0', 'f' or 'F' turns the flag off.
+  const char first = *text;
+  *value = first != '0' && first != 'f' && first != 'F';
+  return true;
+}
+
+// Parses str as the Int32 flag named flag, in the form "--flag=value".
+// The text after '=' is handed to ParseInt32() for conversion.
+//
+// Returns false when str is not this flag; otherwise returns whatever
+// ParseInt32() returns, with the parsed number delivered through value.
+bool ParseInt32Flag(const char* str, const char* flag, Int32* value) {
+  // An Int32 flag always needs an explicit "=value" part.
+  const char* const text = ParseFlagValue(str, flag, false);
+
+  // str is not this flag (or lacks a value): report failure.
+  if (text == NULL) return false;
+
+  // The message names the flag in any diagnostics from ParseInt32().
+  const Message src_text = Message() << "The value of flag --" << flag;
+  return ParseInt32(src_text, text, value);
+}
+
+// Parses str as the string flag named flag, in the form "--flag=value".
+// Everything after the '=' becomes the flag's value.
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
+  // A string flag always needs an explicit "=value" part.
+  const char* const text = ParseFlagValue(str, flag, false);
+
+  const bool parsed = (text != NULL);
+  if (parsed) {
+    // Copies everything after the '=' into *value.
+    value->assign(text);
+  }
+  return parsed;
+}
+
+// Tells whether str looks like a Google Test flag: a "--", "-" or "/"
+// lead-in followed by GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_.
+// An argument that carries this prefix but is not a recognized flag makes
+// Google Test print its help message.  Arguments whose name starts with
+// GTEST_FLAG_PREFIX_ "internal_" are Google Test internal flags and are
+// deliberately excluded so that they never trigger the help message.
+static bool HasGoogleTestFlagPrefix(const char* str) {
+  const bool has_lead_in =
+      SkipPrefix("--", &str) || SkipPrefix("-", &str) || SkipPrefix("/", &str);
+  if (!has_lead_in || SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str))
+    return false;
+  return SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
+         SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str);
+}
+
+// Prints str, interpreting the color escape sequences embedded in it.
+// The following two-character sequences are recognized:
+//
+//   @@    prints a single '@' character.
+//   @R    changes the color to red.
+//   @G    changes the color to green.
+//   @Y    changes the color to yellow.
+//   @D    changes to the default terminal text color.
+//
+// TODO(wan@google.com): Write tests for this once we add stdout
+// capturing to Google Test.
+static void PrintColorEncoded(const char* str) {
+  GTestColor active = COLOR_DEFAULT;  // Color applied to upcoming text.
+  // Walks the string one escape sequence at a time: prints the plain
+  // text in front of the next '@', applies the escape, and resumes
+  // right after it.  Whatever follows the last '@' is printed at the end.
+  const char* at = strchr(str, '@');
+  while (at != NULL) {
+    const std::string plain(str, at);
+    ColoredPrintf(active, "%s", plain.c_str());
+    str = at + 2;  // By default, resume after both escape characters.
+    switch (at[1]) {
+      case '@':
+        ColoredPrintf(active, "@");
+        break;
+      case 'D':
+        active = COLOR_DEFAULT;
+        break;
+      case 'R':
+        active = COLOR_RED;
+        break;
+      case 'G':
+        active = COLOR_GREEN;
+        break;
+      case 'Y':
+        active = COLOR_YELLOW;
+        break;
+      default:
+        str = at + 1;  // Unknown escape: drop '@', keep the next char.
+    }
+    at = strchr(str, '@');
+  }
+  ColoredPrintf(active, "%s", str);
+}
+
+static const char kColorEncodedHelpMessage[] =  // Uses the @-escapes above.
+"This program contains tests written using " GTEST_NAME_ ". You can use the\n"
+"following command line flags to control its behavior:\n"
+"\n"
+"Test Selection:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n"
+"      List the names of all tests instead of running them. The name of\n"
+"      TEST(Foo, Bar) is \"Foo.Bar\".\n"
+"  @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSITIVE_PATTERNS"
+    "[@G-@YNEGATIVE_PATTERNS]@D\n"
+"      Run only the tests whose name matches one of the positive patterns but\n"
+"      none of the negative patterns. '?' matches any single character; '*'\n"
+"      matches any substring; ':' separates two patterns.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n"
+"      Run all disabled tests too.\n"
+"\n"
+"Test Execution:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n"
+"      Run the tests repeatedly; use a negative count to repeat forever.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n"
+"      Randomize tests' orders on every iteration.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n"
+"      Random number seed to use for shuffling test orders (between 1 and\n"
+"      99999, or 0 to use a seed based on the current time).\n"
+"\n"
+"Test Output:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
+"      Enable/disable colored output. The default is @Gauto@D.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "print_time=0@D\n"
+"      Don't print the elapsed time of each test.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G"
+    GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
+"      Generate an XML report in the given directory or with the given file\n"
+"      name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n"
+#if GTEST_CAN_STREAM_RESULTS_
+"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
+"      Stream test results to the given server.\n"
+#endif  // GTEST_CAN_STREAM_RESULTS_
+"\n"
+"Assertion Behavior:\n"
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+"      Set the default death test style.\n"
+#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
+"      Turn assertion failures into debugger break-points.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
+"      Turn assertion failures into C++ exceptions.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
+"      Do not report exceptions as test failures. Instead, allow them\n"
+"      to crash the program or throw a pop-up (on Windows).\n"
+"\n"
+"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
+    "the corresponding\n"
+"environment variable of a flag (all letters in upper-case). For example, to\n"
+"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
+    "color=no@D or set\n"
+"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
+"\n"
+"For more information, please read the " GTEST_NAME_ " documentation at\n"
+"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
+"(not one in your own code or tests), please report it to\n"
+"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.  Recognized flags are removed from argv and
+// *argc is adjusted.  CharType can be instantiated to char or wchar_t.
+template <typename CharType>
+void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+  for (int i = 1; i < *argc; i++) {  // argv[0] is never examined.
+    const std::string arg_string = StreamableToString(argv[i]);
+    const char* const arg = arg_string.c_str();
+
+    using internal::ParseBoolFlag;
+    using internal::ParseInt32Flag;
+    using internal::ParseStringFlag;
+
+    // Do we see a Google Test flag?  The first parser that accepts arg wins.
+    if (ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
+                      &GTEST_FLAG(also_run_disabled_tests)) ||
+        ParseBoolFlag(arg, kBreakOnFailureFlag,
+                      &GTEST_FLAG(break_on_failure)) ||
+        ParseBoolFlag(arg, kCatchExceptionsFlag,
+                      &GTEST_FLAG(catch_exceptions)) ||
+        ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
+        ParseStringFlag(arg, kDeathTestStyleFlag,
+                        &GTEST_FLAG(death_test_style)) ||
+        ParseBoolFlag(arg, kDeathTestUseFork,
+                      &GTEST_FLAG(death_test_use_fork)) ||
+        ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
+        ParseStringFlag(arg, kInternalRunDeathTestFlag,
+                        &GTEST_FLAG(internal_run_death_test)) ||
+        ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
+        ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
+        ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
+        ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
+        ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
+        ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
+        ParseInt32Flag(arg, kStackTraceDepthFlag,
+                       &GTEST_FLAG(stack_trace_depth)) ||
+        ParseStringFlag(arg, kStreamResultToFlag,
+                        &GTEST_FLAG(stream_result_to)) ||
+        ParseBoolFlag(arg, kThrowOnFailureFlag,
+                      &GTEST_FLAG(throw_on_failure))
+        ) {
+      // Yes.  Shift the remainder of the argv list left by one.  Note
+      // that argv has (*argc + 1) elements, the last one always being
+      // NULL.  The following loop moves the trailing NULL element as
+      // well.
+      for (int j = i; j != *argc; j++) {
+        argv[j] = argv[j + 1];
+      }
+
+      // Decrements the argument count.
+      (*argc)--;
+
+      // We also need to decrement the iterator as we just removed
+      // an element, so that the argument shifted into slot i is examined.
+      i--;
+    } else if (arg_string == "--help" || arg_string == "-h" ||
+               arg_string == "-?" || arg_string == "/?" ||
+               HasGoogleTestFlagPrefix(arg)) {
+      // Both help flag and unrecognized Google Test flags (excluding
+      // internal ones) trigger help display.  Such arguments stay in argv.
+      g_help_flag = true;
+    }
+  }
+
+  if (g_help_flag) {
+    // We print the help here instead of in RUN_ALL_TESTS(), as the
+    // latter may not be called at all if the user is using Google
+    // Test with another testing framework.
+    PrintColorEncoded(kColorEncodedHelpMessage);
+  }
+}
+
+// Non-template entry points for flag parsing; each one forwards to the
+// shared implementation for its character type (narrow, then wide).
+void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+  ParseGoogleTestFlagsOnlyImpl<char>(argc, argv);
+}
+void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
+  ParseGoogleTestFlagsOnlyImpl<wchar_t>(argc, argv);
+}
+
+// Shared implementation behind both InitGoogleTest() overloads.
+//
+// CharType is the character type of the argument strings, either char
+// or wchar_t.
+template <typename CharType>
+void InitGoogleTestImpl(int* argc, CharType** argv) {
+  // Counts every call, but only the very first one goes on to do the
+  // initialization work.
+  if (++g_init_gtest_count != 1) return;
+
+  // Without at least argv[0] there is nothing to record or parse.
+  const int arg_count = *argc;
+  if (arg_count <= 0) return;
+
+  internal::g_executable_path = internal::StreamableToString(argv[0]);
+
+#if GTEST_HAS_DEATH_TEST
+  // Saves a copy of the complete command line before flags are removed.
+  g_argvs.clear();
+  for (CharType** arg = argv; arg != argv + arg_count; ++arg) {
+    g_argvs.push_back(StreamableToString(*arg));
+  }
+#endif  // GTEST_HAS_DEATH_TEST
+
+  ParseGoogleTestFlagsOnly(argc, argv);
+  GetUnitTestImpl()->PostFlagParsingInit();
+}
+
+}  // namespace internal
+
+// Initializes Google Test; call it before RUN_ALL_TESTS().  It parses
+// the command line for the flags Google Test recognizes, removes every
+// such flag from argv and decrements *argc accordingly.
+//
+// Nothing is returned: the outcome is reflected in the Google Test flag
+// variables.
+//
+// Only the first call has a user-visible effect; later calls are
+// ignored.
+void InitGoogleTest(int* argc, char** argv) {
+  internal::InitGoogleTestImpl<char>(argc, argv);
+}
+
+// Wide-character overload, usable in Windows programs compiled in
+// UNICODE mode.
+void InitGoogleTest(int* argc, wchar_t** argv) {
+  internal::InitGoogleTestImpl<wchar_t>(argc, argv);
+}
+
+}  // namespace testing
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev)
+//
+// This file implements death tests.
+
+
+#if GTEST_HAS_DEATH_TEST
+
+# if GTEST_OS_MAC
+#  include <crt_externs.h>
+# endif  // GTEST_OS_MAC
+
+# include <errno.h>
+# include <fcntl.h>
+# include <limits.h>
+
+# if GTEST_OS_LINUX
+#  include <signal.h>
+# endif  // GTEST_OS_LINUX
+
+# include <stdarg.h>
+
+# if GTEST_OS_WINDOWS
+#  include <windows.h>
+# else
+#  include <sys/mman.h>
+#  include <sys/wait.h>
+# endif  // GTEST_OS_WINDOWS
+
+# if GTEST_OS_QNX
+#  include <spawn.h>
+# endif  // GTEST_OS_QNX
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
+// Constants.
+
+// Fallback value passed to the death_test_style flag definition below.
+static const char kDefaultDeathTestStyle[] = "fast";
+
+GTEST_DEFINE_string_(
+    death_test_style,
+    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+    "Indicates how to run a death test in a forked child process: "
+    "\"threadsafe\" (child process re-executes the test binary "
+    "from the beginning, running only the specific death test) or "
+    "\"fast\" (child process runs the death test immediately "
+    "after forking).");
+
+GTEST_DEFINE_bool_(
+    death_test_use_fork,
+    internal::BoolFromGTestEnv("death_test_use_fork", false),
+    "Instructs to use fork()/_exit() instead of clone() in death tests. "
+    "Ignored and always uses fork() on POSIX systems where clone() is not "
+    "implemented. Useful when running under valgrind or similar tools if "
+    "those do not support clone(). Valgrind 3.3.1 will just fail if "
+    "it sees an unsupported combination of clone() flags. "
+    "It is not recommended to use this flag w/o valgrind though it will "
+    "work in 99% of the cases. Once valgrind is fixed, this flag will "
+    "most likely be removed.");
+
+namespace internal {  // The flag below has a literal "" default, no env lookup.
+GTEST_DEFINE_string_(
+    internal_run_death_test, "",
+    "Indicates the file, line number, temporal index of "
+    "the single death test to run, and a file descriptor to "
+    "which a success code may be sent, all separated by "
+    "the '|' characters.  This flag is specified if and only if the current "
+    "process is a sub-process launched for running a thread-safe "
+    "death test.  FOR INTERNAL USE ONLY.");
+}  // namespace internal
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Valid only for fast death tests. Indicates the code is running in the
+// child process of a fast style death test.
+static bool g_in_fast_death_test_child = false;
+
+// Tells whether the caller is currently running inside a death test child
+// process.  Tools such as Valgrind heap checkers may need this to modify
+// their behavior in death tests.  IMPORTANT: This is an internal utility.
+// Using it may break the implementation of death tests.  User code MUST
+// NOT use it.
+bool InDeathTestChild() {
+  // A non-empty internal_run_death_test flag marks a re-executed child.
+  const bool reexecuted_child = !GTEST_FLAG(internal_run_death_test).empty();
+
+# if GTEST_OS_WINDOWS
+  // On Windows, death tests are thread-safe regardless of the value of the
+  // death_test_style flag, so only the flag checked above matters.
+  return reexecuted_child;
+# else
+
+  return (GTEST_FLAG(death_test_style) == "threadsafe")
+             ? reexecuted_child
+             : g_in_fast_death_test_child;
+# endif  // GTEST_OS_WINDOWS
+}
+
+}  // namespace internal
+
+// Constructs a predicate that accepts exactly the exit code exit_code.
+ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {}
+
+// Returns true iff exit_status denotes a normal exit with exit_code_.
+bool ExitedWithCode::operator()(int exit_status) const {
+# if GTEST_OS_WINDOWS
+
+  return exit_status == exit_code_;
+
+# else
+
+  if (!WIFEXITED(exit_status)) return false;  // Not a normal exit.
+  return WEXITSTATUS(exit_status) == exit_code_;
+
+# endif  // GTEST_OS_WINDOWS
+}
+
+# if !GTEST_OS_WINDOWS
+// Constructs a predicate that accepts termination by the signal signum.
+KilledBySignal::KilledBySignal(int signum) : signum_(signum) {}
+
+// Returns true iff exit_status reports termination by the signal signum_.
+bool KilledBySignal::operator()(int exit_status) const {
+  if (!WIFSIGNALED(exit_status)) return false;  // Not killed by a signal.
+  return WTERMSIG(exit_status) == signum_;
+}
+# endif  // !GTEST_OS_WINDOWS
+
+namespace internal {
+
+// Utilities needed for death tests.
+
+// Produces a human-readable description of exit_code, which is in the
+// format specified by wait(2) (on Windows, a plain exit status).
+static std::string ExitSummary(int exit_code) {
+  Message summary;
+
+# if GTEST_OS_WINDOWS
+
+  summary << "Exited with exit status " << exit_code;
+
+# else
+
+  const bool exited = WIFEXITED(exit_code);
+  if (exited) summary << "Exited with exit status " << WEXITSTATUS(exit_code);
+  if (!exited && WIFSIGNALED(exit_code)) {
+    summary << "Terminated by signal " << WTERMSIG(exit_code);
+  }
+#  ifdef WCOREDUMP
+  if (WCOREDUMP(exit_code)) {
+    summary << " (core dumped)";
+  }
+#  endif
+# endif  // GTEST_OS_WINDOWS
+
+  return summary.GetString();
+}
+
+// True iff exit_status is anything but a normal exit with status 0.
+bool ExitedUnsuccessfully(int exit_status) {
+  const ExitedWithCode exited_cleanly(0);
+  return !exited_cleanly(exit_status);
+}
+
+# if !GTEST_OS_WINDOWS
+// Builds the warning emitted when a death test starts while more than
+// one thread is running, or when the thread count cannot be determined.
+// A thread_count of 0 selects the "couldn't detect" wording; callers are
+// responsible for never passing a thread_count of 1.
+static std::string DeathTestThreadWarning(size_t thread_count) {
+  Message warning;
+  warning << "Death tests use fork(), which is unsafe particularly"
+          << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
+  if (thread_count != 0)
+    warning << "detected " << thread_count << " threads.";
+  else
+    warning << "couldn't detect the number of threads.";
+  return warning.GetString();
+}
+# endif  // !GTEST_OS_WINDOWS
+
+// Flag characters for reporting a death test that did not die.
+static const char kDeathTestLived = 'L';  // Maps to outcome LIVED.
+static const char kDeathTestReturned = 'R';  // Maps to outcome RETURNED.
+static const char kDeathTestThrew = 'T';  // Maps to outcome THREW.
+static const char kDeathTestInternalError = 'I';  // Sent by DeathTestAbort().
+
+// An enumeration describing all of the possible ways that a death test can
+// conclude.  DIED means that the process died while executing the test
+// code; LIVED means that process lived beyond the end of the test code;
+// RETURNED means that the test statement attempted to execute a return
+// statement, which is not allowed; THREW means that the test statement
+// returned control by throwing an exception.  IN_PROGRESS means the test
+// has not yet concluded.
+// TODO(vladl@google.com): Unify names and possibly values for
+// AbortReason, DeathTestOutcome, and flag characters above.
+enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };
+
+// Aborts the program with the given message; safe to call from an
+// exec-style death test child process.  Such a child propagates the
+// message to its parent over the status pipe and _exit()s with status 1;
+// any other process prints it to stderr and calls posix::Abort().
+void DeathTestAbort(const std::string& message) {
+  // This may run in a threadsafe-style death test child process, which
+  // operates on a very small stack: use the heap for anything sizable.
+  const InternalRunDeathTestFlag* const flag =
+      GetUnitTestImpl()->internal_run_death_test_flag();
+  if (flag != NULL) {
+    // If FDOpen() fails (NULL), fall back to stderr, not a null stream.
+    FILE* const parent = posix::FDOpen(flag->write_fd(), "w");
+    if (parent != NULL) fputc(kDeathTestInternalError, parent);
+    FILE* const out = (parent != NULL) ? parent : stderr;
+    fprintf(out, "%s", message.c_str());
+    fflush(out);
+    _exit(1);
+  } else {
+    fprintf(stderr, "%s", message.c_str());
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+
+// A replacement for CHECK: evaluates expression once and, if it is false,
+// calls DeathTestAbort with the file, line and text of the expression.
+# define GTEST_DEATH_TEST_CHECK_(expression) \
+  do { \
+    if (!::testing::internal::IsTrue(expression)) { \
+      DeathTestAbort( \
+          ::std::string("CHECK failed: File ") + __FILE__ +  ", line " \
+          + ::testing::internal::StreamableToString(__LINE__) + ": " \
+          + #expression); \
+    } \
+  } while (::testing::internal::AlwaysFalse())
+
+// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
+// evaluating any system call that fulfills two conditions: it must return
+// -1 on failure, and set errno to EINTR when it is interrupted and
+// should be tried again.  The macro expands to a loop that repeatedly
+// evaluates the expression as long as it evaluates to -1 and sets
+// errno to EINTR.  If the expression evaluates to -1 but errno is
+// something other than EINTR, DeathTestAbort is called.
+# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
+  do { \
+    int gtest_retval; \
+    do { \
+      gtest_retval = (expression); \
+    } while (gtest_retval == -1 && errno == EINTR); \
+    if (gtest_retval == -1) { \
+      DeathTestAbort( \
+          ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
+          + ::testing::internal::StreamableToString(__LINE__) + ": " \
+          + #expression + " != -1"); \
+    } \
+  } while (::testing::internal::AlwaysFalse())
+
+// Returns the description of the error currently in errno, or "" if none.
+std::string GetLastErrnoDescription() {
+  return (errno != 0) ? posix::StrError(errno) : "";
+}
+
+// This is called from a death test parent process to read a failure
+// message from the death test child process and log it with the FATAL
+// severity. On Windows, the message is read from a pipe handle. On other
+// platforms, it is read from a file descriptor.
+static void FailFromInternalError(int fd) {
+  Message error;
+  char buffer[256];  // Up to 255 bytes per read plus the terminating '\0'.
+  int num_read;
+
+  do {
+    while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
+      buffer[num_read] = '\0';
+      error << buffer;
+    }
+  } while (num_read == -1 && errno == EINTR);  // Retry interrupted reads.
+
+  if (num_read == 0) {  // End of stream: log the accumulated message.
+    GTEST_LOG_(FATAL) << error.GetString();
+  } else {
+    const int last_error = errno;
+    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
+                      << GetLastErrnoDescription() << " [" << last_error << "]";
+  }
+}
+
+// Death test constructor.  A death test may only be created while a test
+// is running; otherwise the program is aborted with an explanation.
+DeathTest::DeathTest() {
+  const bool inside_test = GetUnitTestImpl()->current_test_info() != NULL;
+  if (!inside_test) {
+    DeathTestAbort("Cannot run a death test outside of a TEST or "
+                   "TEST_F construct");
+  }
+}
+
+// Creates a death test by dispatching to the death test factory that
+// GetUnitTestImpl() currently holds, and returns that factory's result.
+bool DeathTest::Create(const char* statement, const RE* regex,
+                       const char* file, int line, DeathTest** test) {
+  return GetUnitTestImpl()->death_test_factory()->Create(
+      statement, regex, file, line, test);
+}
+
+const char* DeathTest::LastMessage() {  // Points into the static string.
+  return last_death_test_message_.c_str();
+}
+
+void DeathTest::set_last_death_test_message(const std::string& message) {
+  last_death_test_message_ = message;
+}
+
+std::string DeathTest::last_death_test_message_;  // Static member definition.
+
+// State and logic shared by the platform-specific death test classes.
+class DeathTestImpl : public DeathTest {
+ protected:
+  DeathTestImpl(const char* a_statement, const RE* a_regex)
+      : statement_(a_statement),
+        regex_(a_regex),
+        spawned_(false),
+        status_(-1),
+        outcome_(IN_PROGRESS),
+        read_fd_(-1),
+        write_fd_(-1) {}
+
+  // read_fd_ is expected to be closed and cleared by a derived class.
+  ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
+
+  void Abort(AbortReason reason);  // Child side: report and _exit(1).
+  virtual bool Passed(bool status_ok);  // Parent side: assess the outcome.
+
+  const char* statement() const { return statement_; }
+  const RE* regex() const { return regex_; }
+  bool spawned() const { return spawned_; }
+  void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
+  int status() const { return status_; }
+  void set_status(int a_status) { status_ = a_status; }
+  DeathTestOutcome outcome() const { return outcome_; }
+  void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
+  int read_fd() const { return read_fd_; }
+  void set_read_fd(int fd) { read_fd_ = fd; }
+  int write_fd() const { return write_fd_; }
+  void set_write_fd(int fd) { write_fd_ = fd; }
+
+  // Called in the parent process only. Reads the result code of the death
+  // test child process via a pipe, interprets it to set the outcome_
+  // member, and closes read_fd_.  Outputs diagnostics and terminates in
+  // case of unexpected codes.
+  void ReadAndInterpretStatusByte();
+
+ private:
+  // The textual content of the code this object is testing.  This class
+  // doesn't own this string and should not attempt to delete it.
+  const char* const statement_;
+  // The regular expression which test output must match.  DeathTestImpl
+  // doesn't own this object and should not attempt to delete it.
+  const RE* const regex_;
+  // True if the death test child process has been successfully spawned.
+  bool spawned_;
+  // The exit status of the child process; -1 until set_status() is called.
+  int status_;
+  // How the death test concluded; IN_PROGRESS until an outcome is set.
+  DeathTestOutcome outcome_;
+  // Descriptor to the read end of the pipe to the child process.  It is
+  // always -1 in the child process.  The child keeps its write end of the
+  // pipe in write_fd_.
+  int read_fd_;
+  // Descriptor to the child's write end of the pipe to the parent process.
+  // It is always -1 in the parent process.  The parent keeps its end of the
+  // pipe in read_fd_.
+  int write_fd_;
+};
+
+// Called in the parent process only. Reads the result code of the death
+// test child process via a pipe, interprets it to set the outcome_
+// member, and closes read_fd_.  Outputs diagnostics and terminates in
+// case of unexpected codes.
+void DeathTestImpl::ReadAndInterpretStatusByte() {
+  char flag;
+  int bytes_read;
+
+  // The read() here blocks until data is available (signifying the
+  // failure of the death test) or until the pipe is closed (signifying
+  // its success), so it's okay to call this in the parent before
+  // the child process has exited.
+  do {
+    bytes_read = posix::Read(read_fd(), &flag, 1);
+  } while (bytes_read == -1 && errno == EINTR);  // Retry interrupted reads.
+
+  if (bytes_read == 0) {  // Pipe closed without any data.
+    set_outcome(DIED);
+  } else if (bytes_read == 1) {  // The child sent one of the flag characters.
+    switch (flag) {
+      case kDeathTestReturned:
+        set_outcome(RETURNED);
+        break;
+      case kDeathTestThrew:
+        set_outcome(THREW);
+        break;
+      case kDeathTestLived:
+        set_outcome(LIVED);
+        break;
+      case kDeathTestInternalError:
+        FailFromInternalError(read_fd());  // Does not return.
+        break;
+      default:
+        GTEST_LOG_(FATAL) << "Death test child process reported "
+                          << "unexpected status byte ("
+                          << static_cast<unsigned int>(flag) << ")";
+    }
+  } else {  // The read failed with an error other than EINTR.
+    GTEST_LOG_(FATAL) << "Read from death test child process failed: "
+                      << GetLastErrnoDescription();
+  }
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
+  set_read_fd(-1);  // Marks the descriptor as closed for the destructor check.
+}
+
+// Reports to the parent that the death test statement failed to die as
+// expected.  Must be called only in a death test child process: it
+// writes one status byte to the child's write descriptor and then
+// calls _exit(1).
+void DeathTestImpl::Abort(AbortReason reason) {
+  // Any data in the pipe tells the parent that the death test failed,
+  // so a single flag byte naming the reason is all that is written.
+  char flag_for_reason = kDeathTestReturned;
+  if (reason == TEST_DID_NOT_DIE) flag_for_reason = kDeathTestLived;
+  if (reason == TEST_THREW_EXCEPTION) flag_for_reason = kDeathTestThrew;
+  const char status_ch = flag_for_reason;
+
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
+  // The descriptor is intentionally left open.  On some platforms (e.g.,
+  // when built as a Windows DLL) destructors of global objects still run
+  // after _exit(); there the destructor of UnitTestImpl indirectly
+  // closes write_fd_, so closing it here as well would be a double
+  // close, which may assert in debug configurations.  Nothing is
+  // buffered in-process, so where destructors do not run it is enough
+  // to let the OS close the descriptor once the process has
+  // terminated.
+  _exit(1);  // Exits w/o any normal exit hooks (we were supposed to crash)
+}
+
+// Returns a copy of a death test's stderr output in which every line is
+// prefixed, so that it stands out from regular log lines.
+static ::std::string FormatDeathTestOutput(const ::std::string& output) {
+  const char* const kLinePrefix = "[  DEATH   ] ";
+  ::std::string indented;
+  size_t line_start = 0;
+  size_t newline = output.find('\n', line_start);
+  while (newline != ::std::string::npos) {
+    indented += kLinePrefix;
+    indented.append(output, line_start, newline + 1 - line_start);
+    line_start = newline + 1;
+    newline = output.find('\n', line_start);
+  }
+  indented += kLinePrefix;  // The (possibly empty) tail is prefixed too.
+  indented.append(output, line_start, ::std::string::npos);
+  return indented;
+}
+
+// Assesses the success or failure of a death test, using both private
+// members which have previously been set, and one argument:
+//
+// Private data members:
+//   outcome:  An enumeration describing how the death test
+//             concluded: DIED, LIVED, THREW, or RETURNED.  The death test
+//             fails in the latter three cases.
+//   status:   The exit status of the child process. On *nix, it is in
+//             the format specified by wait(2). On Windows, this is the
+//             value supplied to the ExitProcess() API or a numeric code
+//             of the exception that terminated the program.
+//   regex:    A regular expression object to be applied to
+//             the test's captured standard error output; the death test
+//             fails if it does not match.
+//
+// Argument:
+//   status_ok: true if exit_status is acceptable in the context of
+//              this particular death test, which fails if it is false
+//
+// Returns true iff all of the above conditions are met.  Otherwise, the
+// first failing condition, in the order given above, is the one that is
+// reported. Also sets the last death test message string.
+bool DeathTestImpl::Passed(bool status_ok) {
+  if (!spawned())
+    return false;  // Returns before stderr is read or any message is set.
+
+  const std::string error_message = GetCapturedStderr();
+
+  bool success = false;
+  Message buffer;
+
+  buffer << "Death test: " << statement() << "\n";
+  switch (outcome()) {
+    case LIVED:
+      buffer << "    Result: failed to die.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case THREW:
+      buffer << "    Result: threw an exception.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case RETURNED:
+      buffer << "    Result: illegal return in test statement.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case DIED:
+      if (status_ok) {
+        const bool matched = RE::PartialMatch(error_message.c_str(), *regex());
+        if (matched) {
+          success = true;  // The only path on which the death test passes.
+        } else {
+          buffer << "    Result: died but not with expected error.\n"
+                 << "  Expected: " << regex()->pattern() << "\n"
+                 << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+        }
+      } else {
+        buffer << "    Result: died but not with expected exit code:\n"
+               << "            " << ExitSummary(status()) << "\n"
+               << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+      }
+      break;
+    case IN_PROGRESS:
+    default:
+      GTEST_LOG_(FATAL)
+          << "DeathTest::Passed somehow called before conclusion of test";
+  }
+
+  DeathTest::set_last_death_test_message(buffer.GetString());
+  return success;
+}
+
+# if GTEST_OS_WINDOWS
+// WindowsDeathTest implements death tests on Windows. Due to the
+// specifics of starting new processes on Windows, death tests there are
+// always threadsafe, and Google Test considers the
+// --gtest_death_test_style=fast setting to be equivalent to
+// --gtest_death_test_style=threadsafe there.
+//
+// A few implementation notes:  Like the Linux version, the Windows
+// implementation uses pipes for child-to-parent communication. But due to
+// the specifics of pipes on Windows, some extra steps are required:
+//
+// 1. The parent creates a communication pipe and stores handles to both
+//    ends of it.
+// 2. The parent starts the child and provides it with the information
+//    necessary to acquire the handle to the write end of the pipe.
+// 3. The child acquires the write end of the pipe and signals the parent
+//    using a Windows event.
+// 4. Now the parent can release the write end of the pipe on its side. If
+//    this is done before step 3, the object's reference count goes down to
+//    0 and it is destroyed, preventing the child from acquiring it. The
+//    parent now has to release it, or read operations on the read end of
+//    the pipe will not return when the child terminates.
+// 5. The parent reads the child's output (outcome code and any possible
+//    error messages) from the pipe, as well as its stderr, and then
+//    determines whether to fail the test.
+//
+// Note: to distinguish Win32 API calls from the local method and function
+// calls, the former are explicitly resolved in the global namespace.
+//
+class WindowsDeathTest : public DeathTestImpl {
+ public:
+  WindowsDeathTest(const char* a_statement,
+                   const RE* a_regex,
+                   const char* file,
+                   int line)
+      : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {}
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+  virtual TestRole AssumeRole();
+
+ private:
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+  // Parent's handle to the write end of the pipe; released in Wait().
+  AutoHandle write_handle_;
+  // Child process handle; set by AssumeRole() and released by Wait().
+  AutoHandle child_handle_;
+  // Event the child process uses to signal the parent that it has
+  // acquired the handle to the write end of the pipe. After seeing this
+  // event the parent can release its own handles to make sure its
+  // ReadFile() calls return when the child terminates.
+  AutoHandle event_handle_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process was spawned.  As a side effect, sets
+// the outcome data member and releases all handles held for the child.
+int WindowsDeathTest::Wait() {
+  if (!spawned())
+    return 0;
+
+  // Wait until the child either signals that it has acquired the write end
+  // of the pipe or it dies.
+  const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
+  switch (::WaitForMultipleObjects(2,
+                                   wait_handles,
+                                   FALSE,  // Waits for any of the handles.
+                                   INFINITE)) {
+    case WAIT_OBJECT_0:      // wait_handles[0]: the child process exited.
+    case WAIT_OBJECT_0 + 1:  // wait_handles[1]: the child signaled the event.
+      break;
+    default:
+      GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
+  }
+
+  // The child has acquired the write end of the pipe or exited.
+  // We release the handle on our side and continue.
+  write_handle_.Reset();
+  event_handle_.Reset();
+
+  ReadAndInterpretStatusByte();
+
+  // Waits for the child process to exit if it hasn't already. This
+  // returns immediately if the child has already exited, regardless of
+  // whether previous calls to WaitForMultipleObjects synchronized on this
+  // handle or not.
+  GTEST_DEATH_TEST_CHECK_(
+      WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
+                                             INFINITE));
+  DWORD status_code;
+  GTEST_DEATH_TEST_CHECK_(
+      ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
+  child_handle_.Reset();
+  set_status(static_cast<int>(status_code));
+  return status();
+}
+
+// The AssumeRole process for a Windows death test.  It creates a child
+// process with the same executable as the current process to run the
+// death test.  The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.  Returns EXECUTE_TEST in the child, OVERSEE_TEST
+DeathTest::TestRole WindowsDeathTest::AssumeRole() {  // in the parent.
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != NULL) {
+    // ParseInternalRunDeathTestFlag() has performed all the necessary
+    // processing.
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  // WindowsDeathTest uses an anonymous pipe to communicate results of
+  // a death test.
+  SECURITY_ATTRIBUTES handles_are_inheritable = {
+    sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
+  HANDLE read_handle, write_handle;
+  GTEST_DEATH_TEST_CHECK_(
+      ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
+                   0)  // Default buffer size.
+      != FALSE);
+  set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
+                                O_RDONLY));
+  write_handle_.Reset(write_handle);
+  event_handle_.Reset(::CreateEvent(
+      &handles_are_inheritable,
+      TRUE,    // Manual-reset: the event stays signaled until explicitly reset.
+      FALSE,   // The initial state is non-signalled.
+      NULL));  // The event is unnamed.
+  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL);
+  const std::string filter_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" +
+      info->test_case_name() + "." + info->name();
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
+      "=" + file_ + "|" + StreamableToString(line_) + "|" +
+      StreamableToString(death_test_index) + "|" +
+      StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
+      // size_t has the same width as pointers on both 32-bit and 64-bit
+      // Windows platforms.
+      // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
+      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
+      "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+
+  char executable_path[_MAX_PATH + 1];  // NOLINT
+  // A result of 0 means failure; _MAX_PATH means the path was truncated.
+  const DWORD path_length =
+      ::GetModuleFileNameA(NULL, executable_path, _MAX_PATH);
+  GTEST_DEATH_TEST_CHECK_(path_length != 0 && path_length < _MAX_PATH);
+
+  std::string command_line =
+      std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
+      internal_flag + "\"";
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // Flush the log buffers since the log streams are shared with the child.
+  FlushInfoLog();
+
+  // The child process will share the standard handles with the parent.
+  STARTUPINFOA startup_info;
+  memset(&startup_info, 0, sizeof(startup_info));
+  startup_info.dwFlags = STARTF_USESTDHANDLES;
+  startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
+  startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
+  startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
+
+  PROCESS_INFORMATION process_info;
+  GTEST_DEATH_TEST_CHECK_(::CreateProcessA(
+      executable_path,
+      const_cast<char*>(command_line.c_str()),
+      NULL,   // Returned process handle is not inheritable.
+      NULL,   // Returned thread handle is not inheritable.
+      TRUE,   // Child inherits all inheritable handles (for write_handle_).
+      0x0,    // Default creation flags.
+      NULL,   // Inherit the parent's environment.
+      UnitTest::GetInstance()->original_working_dir(),
+      &startup_info,
+      &process_info) != FALSE);
+  child_handle_.Reset(process_info.hProcess);
+  ::CloseHandle(process_info.hThread);  // Only the process handle is needed.
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+# else  // We are not on Windows.
+
+// ForkingDeathTest provides implementations for most of the abstract
+// methods of the DeathTest interface.  Only the AssumeRole method is
+// left undefined.
+class ForkingDeathTest : public DeathTestImpl {
+ public:
+  ForkingDeathTest(const char* statement, const RE* regex);
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+
+ protected:
+  void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
+
+ private:
+  // PID of the forked child (0 inside a fork()ed child); -1 until it is set.
+  pid_t child_pid_;
+};
+
+// Constructs a ForkingDeathTest that has no child yet (child_pid_ is -1).
+ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex)
+    : DeathTestImpl(a_statement, a_regex),
+      child_pid_(-1) {}
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process was spawned.  As a side effect, sets
+// the outcome data member and records the status via set_status().
+int ForkingDeathTest::Wait() {
+  if (!spawned())
+    return 0;
+
+  ReadAndInterpretStatusByte();
+  // Reap the child; waitpid() fills status_value in the wait(2) format.
+  int status_value;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
+  set_status(status_value);
+  return status_value;
+}
+
+// A concrete death test class that forks, then immediately runs the test
+// in the child process (no exec of a fresh program image is involved).
+class NoExecDeathTest : public ForkingDeathTest {
+ public:
+  NoExecDeathTest(const char* a_statement, const RE* a_regex) :
+      ForkingDeathTest(a_statement, a_regex) { }
+  virtual TestRole AssumeRole();  // fork()s and reports which side we are on.
+};
+
+// The AssumeRole process for a fork-and-run death test.  It implements a
+// straightforward fork, with a simple pipe to transmit the status byte.
+DeathTest::TestRole NoExecDeathTest::AssumeRole() {
+  const size_t thread_count = GetThreadCount();
+  if (thread_count != 1) {  // Only warns; the fork below still happens.
+    GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+
+  DeathTest::set_last_death_test_message("");
+  CaptureStderr();
+  // When we fork the process below, the log file buffers are copied, but the
+  // file descriptors are shared.  We flush all log files here so that closing
+  // the file descriptors in the child process doesn't throw off the
+  // synchronization between descriptors and buffers in the parent process.
+  // This is as close to the fork as possible to avoid a race condition in case
+  // there are multiple threads running before the death test, and another
+  // thread writes to the log file.
+  FlushInfoLog();
+
+  const pid_t child_pid = fork();
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  set_child_pid(child_pid);  // 0 in the child, the child's PID in the parent.
+  if (child_pid == 0) {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));
+    set_write_fd(pipe_fd[1]);
+    // Redirects all logging to stderr in the child process to prevent
+    // concurrent writes to the log files.  We capture stderr in the parent
+    // process and append the child process' output to a log.
+    LogToStderr();
+    // Event forwarding to the listeners of event listener API must be shut
+    // down in death test subprocesses.
+    GetUnitTestImpl()->listeners()->SuppressEventForwarding();
+    g_in_fast_death_test_child = true;
+    return EXECUTE_TEST;
+  } else {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+    set_read_fd(pipe_fd[0]);
+    set_spawned(true);
+    return OVERSEE_TEST;
+  }
+}
+
+// A concrete death test class that forks and re-executes the main
+// program from the beginning, with command-line flags set that cause
+// only this specific death test to be run.
+class ExecDeathTest : public ForkingDeathTest {
+ public:
+  ExecDeathTest(const char* a_statement, const RE* a_regex,
+                const char* file, int line) :
+      ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { }
+  virtual TestRole AssumeRole();
+ private:
+  static ::std::vector<testing::internal::string>
+  GetArgvsForDeathTestChildProcess() {
+    ::std::vector<testing::internal::string> args = GetInjectableArgvs();
+    return args;  // Returned as obtained: nothing is added or removed here.
+  }
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+};
+
+// Utility class for accumulating command-line arguments.
+class Arguments {
+ public:
+  // The vector always ends with a NULL entry, as exec-style argv requires.
+  Arguments() { args_.push_back(NULL); }
+
+  // Frees each duplicated string; free(NULL) on the terminator is a no-op.
+  ~Arguments() {
+    typedef std::vector<char*>::iterator Iter;
+    for (Iter arg = args_.begin(); arg != args_.end(); ++arg) free(*arg);
+  }
+
+  // Inserts a posix::StrDup() copy of |argument| just before the terminator.
+  void AddArgument(const char* argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  // Adds a copy of every element of |arguments|, preserving their order.
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str>& arguments) {
+    typedef typename ::std::vector<Str>::const_iterator ConstIter;
+    for (ConstIter arg = arguments.begin(); arg != arguments.end(); ++arg) {
+      AddArgument(arg->c_str());
+    }
+  }
+
+  // Exposes the NULL-terminated array; adding arguments may invalidate it.
+  char* const* Argv() { return &args_[0]; }
+
+ private:
+  std::vector<char*> args_;
+};
+
+// Bundles what the child side of a threadsafe-style death test needs:
+// the argv to pass to exec and the descriptor it must close beforehand.
+struct ExecDeathTestArgs {
+  char* const* argv;  // Command-line arguments for the child's call to exec
+  int close_fd;       // File descriptor to close; the read end of a pipe
+};
+
+#  if GTEST_OS_MAC
+inline char** GetEnviron() {
+  // When Google Test is built as a framework on MacOS X, the environ variable
+  // is unavailable. Apple's documentation (man environ) recommends using
+  // _NSGetEnviron() instead.
+  return *_NSGetEnviron();  // Dereferenced to yield the environment array.
+}
+#  else
+// Some POSIX platforms expect you to declare environ. extern "C" makes
+// it reside in the global namespace.
+extern "C" char** environ;
+inline char** GetEnviron() { return environ; }  // Read at call time.
+#  endif  // GTEST_OS_MAC
+
+#  if !GTEST_OS_QNX
+// The main function for a threadsafe-style death test child process.
+// This function is called in a clone()-ed process and thus must avoid
+// any potentially unsafe operations like malloc or libc functions.
+static int ExecDeathTestChildMain(void* child_arg) {
+  ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
+
+  // We need to execute the test program in the same environment where
+  // it was originally invoked.  Therefore we change to the original
+  // working directory first.
+  const char* const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+                   GetLastErrnoDescription());
+    return EXIT_FAILURE;  // Only reached if DeathTestAbort() returns.
+  }
+
+  // We can safely call execve() as it's a direct system call.  We
+  // cannot use execvp() as it's a libc function and thus potentially
+  // unsafe.  Since execve() doesn't search the PATH, the user must
+  // invoke the test program via a valid path that contains at least
+  // one path separator.
+  execve(args->argv[0], args->argv, GetEnviron());  // Returns only on error.
+  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
+                 original_dir + " failed: " +
+                 GetLastErrnoDescription());
+  return EXIT_FAILURE;  // Only reached if DeathTestAbort() returns.
+}
+#  endif  // !GTEST_OS_QNX
+
+// Two utility routines that together determine the direction the stack
+// grows.
+// This could be accomplished more elegantly by a single recursive
+// function, but we want to guard against the unlikely possibility of
+// a smart compiler optimizing the recursion away.
+//
+// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
+// StackLowerThanAddress into StackGrowsDown, which then doesn't give
+// correct answer.
+void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_;
+void StackLowerThanAddress(const void* ptr, bool* result) {
+  int dummy;  // A local of this function, compared against the caller's ptr.
+  *result = (&dummy < ptr);  // True if our local sits at a lower address.
+}
+
+bool StackGrowsDown() {
+  int dummy;    // Its address stands for the caller's position on the stack.
+  bool result;  // Filled in by StackLowerThanAddress().
+  StackLowerThanAddress(&dummy, &result);
+  return result;  // True if the callee's local was below our own local.
+}
+
+// Spawns a child process with the same executable as the current process in
+// a thread-safe manner and instructs it to run the death test.  The
+// implementation uses fork(2) + exec.  On systems where clone(2) is
+// available, it is used instead, being slightly more thread-safe.  On QNX,
+// fork supports only single-threaded environments, so this function uses
+// spawn(2) there instead.  The function dies with an error message if
+// anything goes wrong.
+static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
+  ExecDeathTestArgs args = { argv, close_fd };
+  pid_t child_pid = -1;  // Stays -1 unless one of the spawn paths succeeds.
+
+#  if GTEST_OS_QNX
+  // Obtains the current directory and sets it to be closed in the child
+  // process.
+  const int cwd_fd = open(".", O_RDONLY);
+  GTEST_DEATH_TEST_CHECK_(cwd_fd != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC));
+  // We need to execute the test program in the same environment where
+  // it was originally invoked.  Therefore we change to the original
+  // working directory first.
+  const char* const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+                   GetLastErrnoDescription());
+    return EXIT_FAILURE;  // Only reached if DeathTestAbort() returns.
+  }
+
+  int fd_flags;
+  // Set close_fd to be closed after spawn.
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
+                                        fd_flags | FD_CLOEXEC));
+  struct inheritance inherit = {0};
+  // spawn is a system call.
+  child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron());
+  // Restores the current working directory.
+  GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
+
+#  else   // GTEST_OS_QNX
+#   if GTEST_OS_LINUX
+  // When a SIGPROF signal is received while fork() or clone() are executing,
+  // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
+  // it after the call to fork()/clone() is complete.
+  struct sigaction saved_sigprof_action;
+  struct sigaction ignore_sigprof_action;
+  memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
+  sigemptyset(&ignore_sigprof_action.sa_mask);
+  ignore_sigprof_action.sa_handler = SIG_IGN;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
+      SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#   endif  // GTEST_OS_LINUX
+
+#   if GTEST_HAS_CLONE
+  const bool use_fork = GTEST_FLAG(death_test_use_fork);
+
+  if (!use_fork) {
+    static const bool stack_grows_down = StackGrowsDown();
+    const size_t stack_size = getpagesize();  // The child gets a one-page stack.
+    // MAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
+    void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+                             MAP_ANON | MAP_PRIVATE, -1, 0);
+    GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
+
+    // Maximum stack alignment in bytes:  For a downward-growing stack, this
+    // amount is subtracted from size of the stack space to get an address
+    // that is within the stack space and is aligned on all systems we care
+    // about.  As far as I know there is no ABI with stack alignment greater
+    // than 64.  We assume stack and stack_size already have alignment of
+    // kMaxStackAlignment.
+    const size_t kMaxStackAlignment = 64;
+    void* const stack_top =
+        static_cast<char*>(stack) +
+            (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+    GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment &&
+        reinterpret_cast<intptr_t>(stack_top) % kMaxStackAlignment == 0);
+
+    child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
+    // No CLONE_VM was requested, so the child has its own copy of this stack.
+    GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
+  }
+#   else
+  const bool use_fork = true;
+#   endif  // GTEST_HAS_CLONE
+
+  if (use_fork && (child_pid = fork()) == 0) {
+      ExecDeathTestChildMain(&args);
+      _exit(0);  // Only reached if ExecDeathTestChildMain() returns.
+  }
+#  endif  // GTEST_OS_QNX
+#  if GTEST_OS_LINUX
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      sigaction(SIGPROF, &saved_sigprof_action, NULL));
+#  endif  // GTEST_OS_LINUX
+
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  return child_pid;
+}
+
+// The AssumeRole process for a fork-and-exec death test.  It re-executes the
+// main program from the beginning, setting the --gtest_filter
+// and --gtest_internal_run_death_test flags to cause only the current
+// death test to be re-run.
+DeathTest::TestRole ExecDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != NULL) {  // The internal flag is set: run the test statement.
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+  // Clear the close-on-exec flag on the write end of the pipe, lest
+  // it be closed when the child process does an exec:
+  GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
+
+  const std::string filter_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "="
+      + info->test_case_name() + "." + info->name();
+  const std::string internal_flag =  // Fields: file|line|index|write fd.
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
+      + file_ + "|" + StreamableToString(line_) + "|"
+      + StreamableToString(death_test_index) + "|"
+      + StreamableToString(pipe_fd[1]);
+  Arguments args;
+  args.AddArguments(GetArgvsForDeathTestChildProcess());
+  args.AddArgument(filter_flag.c_str());
+  args.AddArgument(internal_flag.c_str());
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // See the comment in NoExecDeathTest::AssumeRole for why the next line
+  // is necessary.
+  FlushInfoLog();
+
+  const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));  // Parent only reads.
+  set_child_pid(child_pid);
+  set_read_fd(pipe_fd[0]);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+# endif  // !GTEST_OS_WINDOWS
+
+// Creates a concrete DeathTest-derived class that depends on the
+// --gtest_death_test_style flag, and sets the pointer pointed to
+// by the "test" argument to its address.  If the test should be
+// skipped, sets that pointer to NULL.  Returns true, unless the
+// flag is set to an invalid value.
+bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex,
+                                     const char* file, int line,
+                                     DeathTest** test) {
+  UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const int death_test_index = impl->current_test_info()
+      ->increment_death_test_count();
+
+  if (flag != NULL) {  // The internal run-death-test flag was parsed.
+    if (death_test_index > flag->index()) {
+      DeathTest::set_last_death_test_message(
+          "Death test count (" + StreamableToString(death_test_index)
+          + ") somehow exceeded expected maximum ("
+          + StreamableToString(flag->index()) + ")");
+      return false;
+    }
+
+    if (!(flag->file() == file && flag->line() == line &&
+          flag->index() == death_test_index)) {
+      *test = NULL;  // Not the death test named by the flag: skip it.
+      return true;
+    }
+  }
+
+# if GTEST_OS_WINDOWS
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
+      GTEST_FLAG(death_test_style) == "fast") {
+    *test = new WindowsDeathTest(statement, regex, file, line);
+  }
+
+# else
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe") {
+    *test = new ExecDeathTest(statement, regex, file, line);
+  } else if (GTEST_FLAG(death_test_style) == "fast") {
+    *test = new NoExecDeathTest(statement, regex);
+  }
+
+# endif  // GTEST_OS_WINDOWS
+  // The else below completes whichever if-chain the preprocessor kept above.
+  else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
+    DeathTest::set_last_death_test_message(
+        "Unknown death test style \"" + GTEST_FLAG(death_test_style)
+        + "\" encountered");
+    return false;
+  }
+
+  return true;
+}
+
+// Splits a given string on a given delimiter, populating a given
+// vector with the fields.  GTEST_HAS_DEATH_TEST implies that we have
+// ::std::string, so we can use it here.
+static void SplitString(const ::std::string& str, char delimiter,
+                        ::std::vector< ::std::string>* dest) {
+  // Fields are collected locally and swapped into *dest at the end, so
+  // whatever *dest held before is replaced rather than appended to.
+  ::std::vector< ::std::string> fields;
+  ::std::string::size_type start = 0;
+  ::std::string::size_type end = str.find(delimiter, start);
+  while (end != ::std::string::npos) {
+    fields.push_back(str.substr(start, end - start));
+    start = end + 1;
+    end = str.find(delimiter, start);
+  }
+  // Whatever follows the last delimiter (possibly empty) is the final field.
+  fields.push_back(str.substr(start));
+  dest->swap(fields);
+}
+
+# if GTEST_OS_WINDOWS
+// Recreates the pipe and event handles from the provided parameters,
+// signals the event, and returns a file descriptor wrapped around the pipe
+// handle. This function is called in the child process only.
+int GetStatusFileDescriptor(unsigned int parent_process_id,
+                            size_t write_handle_as_size_t,
+                            size_t event_handle_as_size_t) {
+  AutoHandle parent_process_handle(::OpenProcess(
+      PROCESS_DUP_HANDLE, FALSE /* non-inheritable */, parent_process_id));
+  if (parent_process_handle.Get() == NULL ||  // OpenProcess() fails with NULL.
+      parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
+    DeathTestAbort("Unable to open parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  // TODO(vladl@google.com): Replace the following check with a
+  // compile-time assertion when available.
+  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
+
+  const HANDLE write_handle =
+      reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  HANDLE dup_write_handle;
+
+  // The newly initialized handle is accessible only in the parent
+  // process. To obtain one accessible within the child, we need to use
+  // DuplicateHandle.
+  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
+                         ::GetCurrentProcess(), &dup_write_handle,
+                         0x0,    // Requested privileges ignored since
+                                 // DUPLICATE_SAME_ACCESS is used.
+                         FALSE,  // Request non-inheritable handle.
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const HANDLE event_handle = reinterpret_cast<HANDLE>(event_handle_as_size_t);
+  HANDLE dup_event_handle;
+
+  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
+                         ::GetCurrentProcess(), &dup_event_handle,
+                         0x0,    // Ignored, as for the pipe handle above.
+                         FALSE,  // Request non-inheritable handle.
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the event handle " +
+                   StreamableToString(event_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const int write_fd =
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
+  if (write_fd == -1) {
+    DeathTestAbort("Unable to convert pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " to a file descriptor");
+  }
+
+  // Signals the parent that the write end of the pipe has been acquired
+  // so the parent can release its own write end.
+  ::SetEvent(dup_event_handle);
+
+  return write_fd;
+}
+# endif  // GTEST_OS_WINDOWS
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
+  if (GTEST_FLAG(internal_run_death_test) == "") return NULL;
+
+  // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
+  // can use it here.
+  int line = -1;
+  int index = -1;
+  ::std::vector< ::std::string> fields;
+  SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
+  int write_fd = -1;  // Descriptor for the child's status pipe; set below.
+
+# if GTEST_OS_WINDOWS
+
+  unsigned int parent_process_id = 0;
+  size_t write_handle_as_size_t = 0;
+  size_t event_handle_as_size_t = 0;
+
+  if (fields.size() != 6  // file|line|index|parent pid|pipe handle|event handle
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &parent_process_id)
+      || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
+      || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG(internal_run_death_test));
+  }
+  write_fd = GetStatusFileDescriptor(parent_process_id,
+                                     write_handle_as_size_t,
+                                     event_handle_as_size_t);
+# else
+
+  if (fields.size() != 4  // file|line|index|write fd
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &write_fd)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
+        + GTEST_FLAG(internal_run_death_test));
+  }
+
+# endif  // GTEST_OS_WINDOWS
+  // The caller owns the returned object; fields[0] is the source file name.
+  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
+}
+
+}  // namespace internal
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: keith.ray@gmail.com (Keith Ray)
+
+
+#include <stdlib.h>
+
+#if GTEST_OS_WINDOWS_MOBILE
+# include <windows.h>
+#elif GTEST_OS_WINDOWS
+# include <direct.h>
+# include <io.h>
+#elif GTEST_OS_SYMBIAN
+// Symbian OpenC has PATH_MAX in sys/syslimits.h
+# include <sys/syslimits.h>
+#else
+# include <limits.h>
+# include <climits>  // Some Linux distributions define PATH_MAX here.
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+#if GTEST_OS_WINDOWS  // Pick the first path-length limit that is available.
+# define GTEST_PATH_MAX_ _MAX_PATH
+#elif defined(PATH_MAX)
+# define GTEST_PATH_MAX_ PATH_MAX
+#elif defined(_XOPEN_PATH_MAX)
+# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#else  // Last resort when none of the limits above is defined.
+# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#endif  // GTEST_OS_WINDOWS
+
+
+namespace testing {
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+// On Windows, '\\' is the standard path separator, but many tools and the
+// Windows API also accept '/' as an alternate path separator. Unless otherwise
+// noted, a file path can contain either kind of path separator, or a mixture
+// of them.
+const char kPathSeparator = '\\';  // Separator used when building paths.
+const char kAlternatePathSeparator = '/';  // Also accepted on input.
+const char kAlternatePathSeparatorString[] = "/";
+# if GTEST_OS_WINDOWS_MOBILE
+// Windows CE doesn't have a current directory. You should not use
+// the current directory in tests on Windows CE, but this at least
+// provides a reasonable fallback.
+const char kCurrentDirectoryString[] = "\\";
+// Windows CE doesn't define INVALID_FILE_ATTRIBUTES, so it is spelled out.
+const DWORD kInvalidFileAttributes = 0xffffffff;
+# else
+const char kCurrentDirectoryString[] = ".\\";
+# endif  // GTEST_OS_WINDOWS_MOBILE
+#else
+const char kPathSeparator = '/';  // The only separator on this branch.
+const char kCurrentDirectoryString[] = "./";
+#endif  // GTEST_OS_WINDOWS
+
+// Tells whether `c` is accepted as a path separator: kPathSeparator always,
+// and additionally kAlternatePathSeparator when GTEST_HAS_ALT_PATH_SEP_ is
+// enabled.
+static bool IsPathSeparator(char c) {
+  if (c == kPathSeparator) {
+    return true;
+  }
+#if GTEST_HAS_ALT_PATH_SEP_
+  if (c == kAlternatePathSeparator) {
+    return true;
+  }
+#endif
+  return false;
+}
+
+// Returns the current working directory as a FilePath. The result is the
+// empty path ("") when the directory cannot be obtained.
+FilePath FilePath::GetCurrentDir() {
+#if GTEST_OS_WINDOWS_MOBILE
+  // Windows CE has no current directory, so hand back the fixed fallback
+  // string instead.
+  return FilePath(kCurrentDirectoryString);
+#else
+  // Zero-filled buffer with room for the terminating '\0'.
+  char buffer[GTEST_PATH_MAX_ + 1] = { '\0' };
+# if GTEST_OS_WINDOWS
+  const char* const result = _getcwd(buffer, sizeof(buffer));
+# else
+  const char* const result = getcwd(buffer, sizeof(buffer));
+# endif  // GTEST_OS_WINDOWS
+  if (result == NULL) {
+    return FilePath("");
+  }
+  return FilePath(buffer);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Produces a copy of this FilePath with a trailing ".<extension>" stripped.
+// The extension is compared without regard to case, e.g.
+// FilePath("dir/file.exe").RemoveExtension("EXE") yields FilePath("dir/file").
+// When the path does not end with that extension, an unchanged copy is
+// returned.
+FilePath FilePath::RemoveExtension(const char* extension) const {
+  std::string suffix(".");
+  suffix += extension;
+  if (!String::EndsWithCaseInsensitive(pathname_, suffix)) {
+    return *this;
+  }
+  const size_t kept_length = pathname_.length() - suffix.length();
+  return FilePath(pathname_.substr(0, kept_length));
+}
+
+// Locates the last path separator in the FilePath and returns a pointer to
+// it, or NULL when the path contains no separator at all. When
+// GTEST_HAS_ALT_PATH_SEP_ is enabled (on Windows, for example, both '/' and
+// '\' are valid), whichever kind of separator occurs later wins.
+const char* FilePath::FindLastPathSeparator() const {
+  const char* result = strrchr(c_str(), kPathSeparator);
+#if GTEST_HAS_ALT_PATH_SEP_
+  const char* const alt = strrchr(c_str(), kAlternatePathSeparator);
+  // The ordering comparison is only reached when both pointers are non-NULL;
+  // comparing two pointers of which only one is NULL is undefined.
+  if (alt != NULL) {
+    if (result == NULL || alt > result) {
+      result = alt;
+    }
+  }
+#endif
+  return result;
+}
+
+// Strips the directory part and returns what follows the last path
+// separator. Example: FilePath("path/to/file").RemoveDirectoryName() returns
+// FilePath("file"). A path without any separator ("just_a_file") is returned
+// unmodified; a path that ends in a separator ("just_a_dir/") yields an
+// empty FilePath ("").
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveDirectoryName() const {
+  const char* const separator = FindLastPathSeparator();
+  if (separator == NULL) {
+    return *this;
+  }
+  // Everything after the last separator is the file part.
+  return FilePath(separator + 1);
+}
+
+// RemoveFileName keeps everything up to and including the last path
+// separator and drops the rest.
+// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+// A path with no separator at all, like "a_file", yields
+// kCurrentDirectoryString, i.e. FilePath("./") or, on Windows,
+// FilePath(".\\"). For "/a_file" the kept prefix is just "/". A path that
+// already ends in a separator, like "just/a/dir/", comes back unmodified.
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveFileName() const {
+  const char* const separator = FindLastPathSeparator();
+  if (separator == NULL) {
+    return FilePath(std::string(kCurrentDirectoryString));
+  }
+  const char* const start = c_str();
+  // Length of the prefix that ends with the separator itself.
+  return FilePath(std::string(start, separator + 1 - start));
+}
+
+// Helper functions for naming files in a directory for xml output.
+
+// Builds "<directory>/<base_name>.<extension>" when number is zero and
+// "<directory>/<base_name>_<number>.<extension>" for any other number.
+// Example: directory = "dir", base_name = "test", number = 0,
+// extension = "xml" gives "dir/test.xml"; number = 12 gives
+// "dir/test_12.xml".
+// On Windows platform, uses \ as the separator rather than /.
+FilePath FilePath::MakeFileName(const FilePath& directory,
+                                const FilePath& base_name,
+                                int number,
+                                const char* extension) {
+  std::string leaf = base_name.string();
+  if (number != 0) {
+    leaf += "_" + StreamableToString(number);
+  }
+  leaf += ".";
+  leaf += extension;
+  return ConcatPaths(directory, FilePath(leaf));
+}
+
+// Joins directory and relative_path with exactly one kPathSeparator between
+// them: directory = "dir", relative_path = "test.xml" gives "dir/test.xml".
+// An empty directory returns relative_path as is.
+// On Windows, uses \ as the separator rather than /.
+FilePath FilePath::ConcatPaths(const FilePath& directory,
+                               const FilePath& relative_path) {
+  if (!directory.IsEmpty()) {
+    // Drop one trailing separator, if present, before appending our own.
+    std::string joined = directory.RemoveTrailingPathSeparator().string();
+    joined += kPathSeparator;
+    joined += relative_path.string();
+    return FilePath(joined);
+  }
+  return relative_path;
+}
+
+// Reports whether the pathname names anything that can be found in the
+// file-system, be it a file, a directory, or whatever.
+bool FilePath::FileOrDirectoryExists() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR wide_path = String::AnsiToUtf16(pathname_.c_str());
+  const bool found = GetFileAttributes(wide_path) != kInvalidFileAttributes;
+  delete [] wide_path;
+  return found;
+#else
+  posix::StatStruct info;
+  const int status = posix::Stat(pathname_.c_str(), &info);
+  return status == 0;
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Reports whether the pathname names a directory that exists in the
+// file-system.
+bool FilePath::DirectoryExists() const {
+#if GTEST_OS_WINDOWS
+  // A root directory on Windows (like "C:\\") must keep its trailing
+  // separator; any other path is queried without it.
+  const FilePath& target(IsRootDirectory() ? *this :
+                                             RemoveTrailingPathSeparator());
+#else
+  const FilePath& target(*this);
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR wide_path = String::AnsiToUtf16(target.c_str());
+  const DWORD attrs = GetFileAttributes(wide_path);
+  delete [] wide_path;
+  if (attrs == kInvalidFileAttributes) {
+    return false;
+  }
+  return (attrs & FILE_ATTRIBUTE_DIRECTORY) != 0;
+#else
+  posix::StatStruct info;
+  if (posix::Stat(target.c_str(), &info) != 0) {
+    return false;
+  }
+  return posix::IsDir(info);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Reports whether the pathname is a root directory. (Windows has one root
+// directory per disk drive.) On Windows that means a three-character
+// absolute path; elsewhere, a single path separator.
+bool FilePath::IsRootDirectory() const {
+#if GTEST_OS_WINDOWS
+  // TODO(wan@google.com): on Windows a network share like
+  // \\server\share can be a root directory, although it cannot be the
+  // current directory.  Handle this properly.
+  if (pathname_.length() != 3) {
+    return false;
+  }
+  return IsAbsolutePath();
+#else
+  if (pathname_.length() != 1) {
+    return false;
+  }
+  return IsPathSeparator(pathname_.c_str()[0]);
+#endif
+}
+
+// Reports whether the pathname is absolute. On Windows this requires an
+// ASCII drive letter, a ':' and a path separator at the front; elsewhere the
+// first character must be a path separator.
+bool FilePath::IsAbsolutePath() const {
+  const char* const name = pathname_.c_str();
+#if GTEST_OS_WINDOWS
+  if (pathname_.length() < 3) {
+    return false;
+  }
+  const char drive = name[0];
+  const bool drive_is_letter =
+      ('a' <= drive && drive <= 'z') || ('A' <= drive && drive <= 'Z');
+  return drive_is_letter && name[1] == ':' && IsPathSeparator(name[2]);
+#else
+  return IsPathSeparator(name[0]);
+#endif
+}
+
+// Finds a pathname that does not exist yet. The first candidate is
+// directory/base_name.extension; if that exists, the candidates
+// directory/base_name_<number>.extension are tried with number = 1, 2, ...
+// until one is found that does not exist.
+// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+// There could be a race condition if two or more processes are calling this
+// function at the same time -- they could both pick the same filename.
+FilePath FilePath::GenerateUniqueFileName(const FilePath& directory,
+                                          const FilePath& base_name,
+                                          const char* extension) {
+  FilePath candidate;
+  for (int number = 0; ; ++number) {
+    candidate.Set(MakeFileName(directory, base_name, number, extension));
+    if (!candidate.FileOrDirectoryExists()) {
+      break;
+    }
+  }
+  return candidate;
+}
+
+// Reports whether the FilePath ends with a path separator, which marks it as
+// intended to represent a directory. An empty path is not a directory.
+// This does NOT check that a directory (or file) actually exists.
+bool FilePath::IsDirectory() const {
+  if (pathname_.empty()) {
+    return false;
+  }
+  const char last = pathname_.c_str()[pathname_.length() - 1];
+  return IsPathSeparator(last);
+}
+
+// Creates this directory together with any missing parents. Returns true if
+// successful or if the directories already exist; returns false if the path
+// does not end with a separator or a directory cannot be created for any
+// reason.
+bool FilePath::CreateDirectoriesRecursively() const {
+  if (!IsDirectory()) {
+    return false;
+  }
+
+  if (pathname_.length() == 0 || DirectoryExists()) {
+    return true;
+  }
+
+  // Make sure the parent exists first, then add this last component.
+  const FilePath parent(RemoveTrailingPathSeparator().RemoveFileName());
+  if (!parent.CreateDirectoriesRecursively()) {
+    return false;
+  }
+  return CreateFolder();
+}
+
+// Creates this single directory. Returns true if successful or if the
+// directory already exists; returns false if the directory cannot be
+// created for any reason, including if the parent directory does not
+// exist. Not named "CreateDirectory" because that's a macro on Windows.
+bool FilePath::CreateFolder() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  const FilePath trimmed(RemoveTrailingPathSeparator());
+  LPCWSTR wide_path = String::AnsiToUtf16(trimmed.c_str());
+  const int status = CreateDirectory(wide_path, NULL) ? 0 : -1;
+  delete [] wide_path;
+#elif GTEST_OS_WINDOWS
+  const int status = _mkdir(pathname_.c_str());
+#else
+  const int status = mkdir(pathname_.c_str(), 0777);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  // An error is OK as long as the directory exists afterwards.
+  return status != -1 || DirectoryExists();
+}
+
+// Returns a copy of the path with its last character removed when that
+// character is a path separator; otherwise returns the path unmodified.
+// At most one separator is removed.
+FilePath FilePath::RemoveTrailingPathSeparator() const {
+  if (!IsDirectory()) {
+    return *this;
+  }
+  return FilePath(pathname_.substr(0, pathname_.length() - 1));
+}
+
+// Removes any redundant separators that might be in the pathname: every run
+// of one or more path separators is replaced by a single kPathSeparator.
+// For example, "bar///foo" becomes "bar/foo". Where an alternate separator
+// is accepted it is rewritten to kPathSeparator as well. Does not eliminate
+// other redundancies that might be in a pathname involving "." or "..".
+// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share).
+void FilePath::Normalize() {
+  // The result is never longer than the input, so one reservation suffices.
+  // Building into a std::string keeps the scratch storage owned by an object
+  // instead of a raw new[]/delete[] pair.
+  std::string normalized;
+  normalized.reserve(pathname_.length());
+
+  // Scan the C string view of the path; scanning stops at the first '\0'.
+  const char* src = pathname_.c_str();
+  while (*src != '\0') {
+    if (IsPathSeparator(*src)) {
+      // Emit one canonical separator for the whole run, then skip the run.
+      normalized += kPathSeparator;
+      do {
+        ++src;
+      } while (IsPathSeparator(*src));
+    } else {
+      normalized += *src;
+      ++src;
+    }
+  }
+  pathname_ = normalized;
+}
+
+}  // namespace internal
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#if GTEST_OS_WINDOWS_MOBILE
+# include <windows.h>  // For TerminateProcess()
+#elif GTEST_OS_WINDOWS
+# include <io.h>
+# include <sys/stat.h>
+#else
+# include <unistd.h>
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+#if GTEST_OS_MAC
+# include <mach/mach_init.h>
+# include <mach/task.h>
+# include <mach/vm_map.h>
+#endif  // GTEST_OS_MAC
+
+#if GTEST_OS_QNX
+# include <devctl.h>
+# include <sys/procfs.h>
+#endif  // GTEST_OS_QNX
+
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+namespace internal {
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not define STDERR_FILENO; literal values are used.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;  // Taken from the platform headers.
+const int kStdErrFileno = STDERR_FILENO;
+#endif  // _MSC_VER
+
+#if GTEST_OS_MAC
+
+// Returns the number of threads running in the process as reported by
+// task_threads(), or 0 to indicate that the count could not be obtained.
+size_t GetThreadCount() {
+  const task_t self = mach_task_self();
+  thread_act_array_t threads;
+  mach_msg_type_number_t count;
+  if (task_threads(self, &threads, &count) != KERN_SUCCESS) {
+    return 0;
+  }
+  // task_threads allocates resources in the thread list and we need to free
+  // them to avoid leaks.
+  vm_deallocate(self,
+                reinterpret_cast<vm_address_t>(threads),
+                sizeof(thread_t) * count);
+  return static_cast<size_t>(count);
+}
+
+#elif GTEST_OS_QNX
+
+// Returns the number of threads running in the process, obtained through a
+// DCMD_PROC_INFO devctl on /proc/self/as, or 0 to indicate that the count
+// could not be obtained.
+size_t GetThreadCount() {
+  const int fd = open("/proc/self/as", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+  procfs_info info;
+  const int status =
+      devctl(fd, DCMD_PROC_INFO, &info, sizeof(info), NULL);
+  // The descriptor is no longer needed whatever the outcome of the query.
+  close(fd);
+  if (status != EOK) {
+    return 0;
+  }
+  return static_cast<size_t>(info.num_threads);
+}
+
+#else
+
+size_t GetThreadCount() {
+  // There's no portable way to detect the number of threads, so this
+  // fallback always reports 0, which means "cannot detect".
+  const size_t kUnknownThreadCount = 0;
+  return kUnknownThreadCount;
+}
+
+#endif  // GTEST_OS_MAC
+
+#if GTEST_USES_POSIX_RE
+
+// Implements RE.  Currently only needed for death tests.
+
+// Releases the copied pattern and, for a valid RE, both compiled regexes.
+RE::~RE() {
+  free(const_cast<char*>(pattern_));
+  if (!is_valid_) {
+    // regfree'ing an invalid regex might crash because the content
+    // of the regex is undefined, so nothing more is released here.
+    return;
+  }
+  // Since the regex's are essentially the same, one cannot be valid
+  // (or invalid) without the other being so too.
+  regfree(&partial_regex_);
+  regfree(&full_regex_);
+}
+
+// Returns true iff regular expression re is valid and matches the entire
+// str.
+bool RE::FullMatch(const char* str, const RE& re) {
+  if (re.is_valid_) {
+    regmatch_t match;
+    const int status = regexec(&re.full_regex_, str, 1, &match, 0);
+    return status == 0;
+  }
+  return false;
+}
+
+// Returns true iff regular expression re is valid and matches a substring
+// of str (including str itself).
+bool RE::PartialMatch(const char* str, const RE& re) {
+  if (re.is_valid_) {
+    regmatch_t match;
+    const int status = regexec(&re.partial_regex_, str, 1, &match, 0);
+    return status == 0;
+  }
+  return false;
+}
+
+// Initializes an RE from its string representation: stores a copy of the
+// pattern and compiles two POSIX Extended regexes from it, "^(regex)$" for
+// full matches and the regex itself for partial matches. is_valid_ ends up
+// true only when both compile; otherwise a non-fatal failure is reported
+// and no compiled regex is left behind.
+void RE::Init(const char* regex) {
+  pattern_ = posix::StrDup(regex);
+
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match.
+  const size_t full_regex_len = strlen(regex) + 10;
+  char* const full_pattern = new char[full_regex_len];
+
+  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
+  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
+  // The scratch buffer is not referenced again once regcomp() has returned,
+  // so it is released right away, before anything below can bail out.
+  delete[] full_pattern;
+
+  // partial_regex_ is only compiled when full_regex_ compiled; the
+  // destructor relies on is_valid_ to know whether the regexes may be
+  // regfree'd.
+  //
+  // Some implementation of POSIX regex (e.g. on at least some
+  // versions of Cygwin) doesn't accept the empty string as a valid
+  // regex.  We change it to an equivalent form "()" to be safe.
+  if (is_valid_) {
+    const char* const partial_regex = (*regex == '\0') ? "()" : regex;
+    is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
+    if (!is_valid_) {
+      // full_regex_ did compile (e.g. for ")(", whose full form "^()()$" is
+      // valid), but the destructor skips regfree() for an invalid RE, so it
+      // has to be released here or it leaks.
+      regfree(&full_regex_);
+    }
+  }
+  EXPECT_TRUE(is_valid_)
+      << "Regular expression \"" << regex
+      << "\" is not a valid POSIX Extended regular expression.";
+}
+
+#elif GTEST_USES_SIMPLE_RE
+
+// Returns true iff ch is one of the characters of str. The terminating
+// '\0' of str never counts, so ch == '\0' always gives false.
+bool IsInSet(char ch, const char* str) {
+  if (ch == '\0') {
+    return false;
+  }
+  return strchr(str, ch) != NULL;
+}
+
+// Character classification helpers. Each returns true iff ch belongs to
+// the named class. Unlike similar functions in <ctype.h>, these aren't
+// affected by the current locale.
+bool IsAsciiDigit(char ch) { return ch >= '0' && ch <= '9'; }
+bool IsAsciiPunct(char ch) {
+  const char* const kPunctuation = "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~";
+  return IsInSet(ch, kPunctuation);
+}
+bool IsRepeat(char ch) {
+  const char* const kRepeaters = "?*+";
+  return IsInSet(ch, kRepeaters);
+}
+bool IsAsciiWhiteSpace(char ch) {
+  const char* const kWhiteSpace = " \f\n\r\t\v";
+  return IsInSet(ch, kWhiteSpace);
+}
+bool IsAsciiWordChar(char ch) {
+  if (ch >= 'a' && ch <= 'z') return true;
+  if (ch >= 'A' && ch <= 'Z') return true;
+  if (ch >= '0' && ch <= '9') return true;
+  return ch == '_';
+}
+
+// Returns true iff "\\c" is a supported escape sequence, i.e. c is an ASCII
+// punctuation character or one of the letters in "dDfnrsStvwW".
+bool IsValidEscape(char c) {
+  if (IsAsciiPunct(c)) {
+    return true;
+  }
+  return IsInSet(c, "dDfnrsStvwW");
+}
+
+// Returns true iff the given atom matches ch. The atom is pattern_char
+// itself when escaped is false, and the escape sequence "\\pattern_char"
+// otherwise. The result is undefined if the atom is invalid.
+bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
+  if (!escaped) {
+    // An unescaped '.' matches anything but a newline; every other
+    // character matches only itself.
+    if (pattern_char == '.') {
+      return ch != '\n';
+    }
+    return pattern_char == ch;
+  }
+
+  // "\\p" where p is pattern_char.
+  switch (pattern_char) {
+    case 'd': return IsAsciiDigit(ch);
+    case 'D': return !IsAsciiDigit(ch);
+    case 's': return IsAsciiWhiteSpace(ch);
+    case 'S': return !IsAsciiWhiteSpace(ch);
+    case 'w': return IsAsciiWordChar(ch);
+    case 'W': return !IsAsciiWordChar(ch);
+    case 'f': return ch == '\f';
+    case 'n': return ch == '\n';
+    case 'r': return ch == '\r';
+    case 't': return ch == '\t';
+    case 'v': return ch == '\v';
+    default: break;
+  }
+  // Any other escape is an escaped punctuation character standing for
+  // itself.
+  return IsAsciiPunct(pattern_char) && pattern_char == ch;
+}
+
+// Helper function used by ValidateRegex() to format error messages.
+std::string FormatRegexSyntaxError(const char* regex, int index) {
+  return (Message() << "Syntax error at index " << index
+          << " in simple regular expression \"" << regex << "\": ").GetString();
+}
+
+// Checks that regex is a valid simple regular expression. Generates a
+// non-fatal failure for each problem found and returns false if there was
+// any; otherwise returns true. A '\\' at the very end stops the scan
+// immediately.
+bool ValidateRegex(const char* regex) {
+  if (regex == NULL) {
+    // TODO(wan@google.com): fix the source file location in the
+    // assertion failures to match where the regex is used in user
+    // code.
+    ADD_FAILURE() << "NULL is not a valid simple regular expression.";
+    return false;
+  }
+
+  bool valid = true;
+
+  // True iff ?, *, or + can follow the atom that was just scanned.
+  bool can_repeat_prev = false;
+  for (int pos = 0; regex[pos] != '\0'; ++pos) {
+    if (regex[pos] == '\\') {  // An escape sequence
+      const int backslash_pos = pos;
+      ++pos;  // Step onto the escaped character.
+      const char escaped = regex[pos];
+      if (escaped == '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, backslash_pos)
+                      << "'\\' cannot appear at the end.";
+        return false;
+      }
+
+      if (!IsValidEscape(escaped)) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, backslash_pos)
+                      << "invalid escape sequence \"\\" << escaped << "\".";
+        valid = false;
+      }
+      can_repeat_prev = true;
+      continue;
+    }
+
+    // Not an escape sequence.
+    const char ch = regex[pos];
+    if (ch == '^' && pos > 0) {
+      ADD_FAILURE() << FormatRegexSyntaxError(regex, pos)
+                    << "'^' can only appear at the beginning.";
+      valid = false;
+    } else if (ch == '$' && regex[pos + 1] != '\0') {
+      ADD_FAILURE() << FormatRegexSyntaxError(regex, pos)
+                    << "'$' can only appear at the end.";
+      valid = false;
+    } else if (IsInSet(ch, "()[]{}|")) {
+      ADD_FAILURE() << FormatRegexSyntaxError(regex, pos)
+                    << "'" << ch << "' is unsupported.";
+      valid = false;
+    } else if (IsRepeat(ch) && !can_repeat_prev) {
+      ADD_FAILURE() << FormatRegexSyntaxError(regex, pos)
+                    << "'" << ch << "' can only follow a repeatable token.";
+      valid = false;
+    }
+
+    // Anchors and repetition characters cannot themselves be repeated.
+    can_repeat_prev = !IsInSet(ch, "^$?*+");
+  }
+
+  return valid;
+}
+
+// Matches a repeated regex atom followed by a valid simple regular
+// expression at the head of str. The atom is c if escaped is false, or \c
+// otherwise; repeat is the repetition meta character (?, *, or +); regex is
+// what follows the repetition. The behavior is undefined if str contains
+// too many characters to be indexable by size_t, in which case the test
+// will probably time out anyway. We are fine with this limitation as
+// std::string has it too.
+bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char c, char repeat, const char* regex,
+    const char* str) {
+  size_t min_count = 0;
+  if (repeat == '+') {
+    min_count = 1;
+  }
+  // We cannot call numeric_limits::max() as it conflicts with the
+  // max() macro on Windows.
+  size_t max_count = static_cast<size_t>(-1) - 1;
+  if (repeat == '?') {
+    max_count = 1;
+  }
+
+  size_t consumed = 0;
+  while (consumed <= max_count) {
+    // Here the atom is known to match each of the first `consumed`
+    // characters in str.
+    if (consumed >= min_count && MatchRegexAtHead(regex, str + consumed)) {
+      // Enough repetitions at the head, and the tail matches too. Only
+      // *whether* the pattern matches is of interest (as opposed to *how*),
+      // so there is no need to look for a greedy match.
+      return true;
+    }
+    const char next = str[consumed];
+    if (next == '\0' || !AtomMatchesChar(escaped, c, next)) {
+      return false;
+    }
+    ++consumed;
+  }
+  return false;
+}
+
+// Returns true iff regex matches a prefix of str. regex must be a valid
+// simple regular expression and not start with "^", or the result is
+// undefined.
+bool MatchRegexAtHead(const char* regex, const char* str) {
+  // An empty regex matches a prefix of anything.
+  if (*regex == '\0') {
+    return true;
+  }
+
+  // "$" only matches the end of a string.  Note that regex being
+  // valid guarantees that there's nothing after "$" in it.
+  if (*regex == '$') {
+    return *str == '\0';
+  }
+
+  // When the first thing in regex is an escape sequence, the atom character
+  // is the one after the backslash.
+  const bool escaped = *regex == '\\';
+  const char* const atom = escaped ? regex + 1 : regex;
+
+  if (IsRepeat(atom[1])) {
+    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
+    // here's an indirect recursion.  It terminates as the regex gets
+    // shorter in each recursion.
+    return MatchRepetitionAndRegexAtHead(
+        escaped, atom[0], atom[1], atom + 2, str);
+  }
+
+  // regex isn't empty, isn't "$", and doesn't start with a repetition:
+  // match its first atom against the first character of str and recurse
+  // on the rest.
+  if (*str == '\0') {
+    return false;
+  }
+  if (!AtomMatchesChar(escaped, *atom, *str)) {
+    return false;
+  }
+  return MatchRegexAtHead(atom + 1, str + 1);
+}
+
+// Returns true iff regex matches any substring of str. Returns false when
+// either argument is NULL. regex must be a valid simple regular
+// expression, or the result is undefined.
+//
+// The algorithm is recursive, but the recursion depth doesn't exceed
+// the regex length, so we won't need to worry about running out of
+// stack space normally.  In rare cases the time complexity can be
+// exponential with respect to the regex length + the string length,
+// but usually it's much faster (often close to linear).
+bool MatchRegexAnywhere(const char* regex, const char* str) {
+  if (regex == NULL || str == NULL) {
+    return false;
+  }
+
+  // An anchored regex can only match at the very beginning of str.
+  if (*regex == '^') {
+    return MatchRegexAtHead(regex + 1, str);
+  }
+
+  // A successful match can start anywhere in str, including at the
+  // position of its terminating '\0'.
+  for (const char* start = str; ; ++start) {
+    if (MatchRegexAtHead(regex, start)) {
+      return true;
+    }
+    if (*start == '\0') {
+      return false;
+    }
+  }
+}
+
+// Implements the RE class.
+
+// Releases both pattern strings held by the RE.
+RE::~RE() {
+  char* const full = const_cast<char*>(full_pattern_);
+  char* const plain = const_cast<char*>(pattern_);
+  free(plain);
+  free(full);
+}
+
+// Returns true iff regular expression re is valid and matches the entire
+// str.
+bool RE::FullMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) {
+    return false;
+  }
+  return MatchRegexAnywhere(re.full_pattern_, str);
+}
+
+// Returns true iff regular expression re is valid and matches a substring
+// of str (including str itself).
+bool RE::PartialMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) {
+    return false;
+  }
+  return MatchRegexAnywhere(re.pattern_, str);
+}
+
+// Initializes an RE from its string representation: keeps a copy of regex
+// in pattern_, validates it, and for a valid regex builds full_pattern_,
+// the same expression guaranteed to start with '^' and end with '$'.
+void RE::Init(const char* regex) {
+  full_pattern_ = NULL;
+  if (regex == NULL) {
+    pattern_ = NULL;
+  } else {
+    pattern_ = posix::StrDup(regex);
+  }
+
+  is_valid_ = ValidateRegex(regex);
+  if (is_valid_) {
+    // An invalid regex never gets here, so no full pattern is computed
+    // for it.
+    const size_t len = strlen(regex);
+    // Reserves enough bytes to hold the regular expression used for a
+    // full match: we need space to prepend a '^', append a '$', and
+    // terminate the string with '\0'.
+    char* const storage = static_cast<char*>(malloc(len + 3));
+    full_pattern_ = storage;
+
+    size_t out = 0;
+    if (*regex != '^') {
+      storage[out++] = '^';  // Makes sure full_pattern_ starts with '^'.
+    }
+
+    // We don't use snprintf or strncpy, as they trigger a warning when
+    // compiled with VC++ 8.0.
+    memcpy(storage + out, regex, len);
+    out += len;
+
+    if (len == 0 || regex[len - 1] != '$') {
+      storage[out++] = '$';  // Makes sure full_pattern_ ends with '$'.
+    }
+
+    storage[out] = '\0';
+  }
+}
+
+#endif  // GTEST_USES_POSIX_RE
+
+const char kUnknownFile[] = "unknown file";  // Stand-in for a NULL file name.
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code:
+// "file(line):" under _MSC_VER, "file:line:" otherwise. A NULL file is
+// shown as kUnknownFile, and a negative line is left out ("file:").
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
+  std::string location(file != NULL ? file : kUnknownFile);
+
+  if (line >= 0) {
+#ifdef _MSC_VER
+    location += "(" + StreamableToString(line) + ")";
+#else
+    location += ":" + StreamableToString(line);
+#endif  // _MSC_VER
+  }
+  // Every form ends with a colon.
+  location += ":";
+  return location;
+}
+
+// Formats a file location for compiler-independent XML output, as
+// "file:line", or just "file" when line is negative. A NULL file is shown
+// as kUnknownFile.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+// Note that FormatCompilerIndependentFileLocation() does NOT append colon
+// to the file location it produces, unlike FormatFileLocation().
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
+    const char* file, int line) {
+  std::string location(file != NULL ? file : kUnknownFile);
+
+  if (line >= 0) {
+    location += ":";
+    location += StreamableToString(line);
+  }
+  return location;
+}
+
+
+// Starts a log entry: writes a newline, the marker for the severity, and
+// the formatted file location to the log stream.
+GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
+    : severity_(severity) {
+  const char* marker;
+  if (severity == GTEST_INFO) {
+    marker = "[  INFO ]";
+  } else if (severity == GTEST_WARNING) {
+    marker = "[WARNING]";
+  } else if (severity == GTEST_ERROR) {
+    marker = "[ ERROR ]";
+  } else {
+    marker = "[ FATAL ]";
+  }
+  const ::std::string location = FormatFileLocation(file, line);
+  GetStream() << ::std::endl << marker << " "
+              << location.c_str() << ": ";
+}
+
+// Ends the log entry with a newline and, if severity is GTEST_FATAL,
+// flushes stderr and aborts the program.
+GTestLog::~GTestLog() {
+  GetStream() << ::std::endl;
+  if (severity_ != GTEST_FATAL) {
+    return;
+  }
+  fflush(stderr);
+  posix::Abort();
+}
+// Disable Microsoft deprecation warnings for POSIX functions called from
+// this class (creat, dup, dup2, and close)
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable: 4996)
+#endif  // _MSC_VER
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Object that captures an output stream (stdout/stderr).
+class CapturedStream {
+ public:
+  // The ctor redirects the stream to a temporary file.  Failing to create
+  // that file is fatal, because nothing could be captured without it.
+  explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
+# if GTEST_OS_WINDOWS
+    char temp_dir_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+    char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+
+    ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
+    const UINT success = ::GetTempFileNameA(temp_dir_path,
+                                            "gtest_redir",
+                                            0,  // Generate unique file name.
+                                            temp_file_path);
+    GTEST_CHECK_(success != 0)
+        << "Unable to create a temporary file in " << temp_dir_path;
+    const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
+    GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
+                                    << temp_file_path;
+    filename_ = temp_file_path;
+# else
+    // There's no guarantee that a test has write access to the current
+    // directory, so the temporary file is created in /tmp on most systems
+    // and in /sdcard on Android, which doesn't have /tmp.
+#  if GTEST_OS_LINUX_ANDROID
+    // Note: Android applications are expected to get the SD Card directory
+    // from Context.getExternalStorageDirectory() through JNI.  That needs
+    // a Context handle, which cannot be retrieved globally from native
+    // code and rules out standalone executables that don't run in a
+    // Dalvik process (e.g. when run through 'adb shell').  /sdcard is
+    // directly accessible from native code and is the only location
+    // (unofficially) supported by the Android team.  It's generally a
+    // symlink to the real mount point (/mnt/sdcard, /mnt/sdcard0,
+    // /system/media/sdcard, or another OEM-customized location).  Never
+    // rely on these, and always use /sdcard.
+    char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX";
+#  else
+    char name_template[] = "/tmp/captured_stream.XXXXXX";
+#  endif  // GTEST_OS_LINUX_ANDROID
+    const int captured_fd = mkstemp(name_template);
+    GTEST_CHECK_(captured_fd != -1) << "Unable to create temporary file "
+                                    << name_template;
+    filename_ = name_template;
+# endif  // GTEST_OS_WINDOWS
+    fflush(NULL);
+    dup2(captured_fd, fd_);
+    close(captured_fd);
+  }
+
+  // Deletes the temporary file.
+  ~CapturedStream() { remove(filename_.c_str()); }
+
+  // Restores the original stream and returns what was captured.
+  std::string GetCapturedString() {
+    if (uncaptured_fd_ != -1) {
+      fflush(NULL);
+      dup2(uncaptured_fd_, fd_);
+      close(uncaptured_fd_);
+      uncaptured_fd_ = -1;
+    }
+
+    FILE* const file = posix::FOpen(filename_.c_str(), "r");
+    // ReadEntireFile() cannot cope with a NULL FILE*, so stop here.
+    GTEST_CHECK_(file != NULL) << "Unable to open temporary file "
+                               << filename_;
+    const std::string content = ReadEntireFile(file);
+    posix::FClose(file);
+    return content;
+  }
+
+ private:
+  // Reads the entire content of a file as an std::string.
+  static std::string ReadEntireFile(FILE* file);
+
+  // Returns the size (in bytes) of a file.
+  static size_t GetFileSize(FILE* file);
+
+  const int fd_;  // A stream to capture.
+  int uncaptured_fd_;  // Duplicate of the original fd_; -1 once restored.
+  // Name of the temporary file holding the captured output.
+  ::std::string filename_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
+};
+
+// Returns the size (in bytes) of a file; the position is left at its end.
+size_t CapturedStream::GetFileSize(FILE* file) {
+  fseek(file, 0, SEEK_END);
+  return static_cast<size_t>(ftell(file));  // ftell() failure isn't detected.
+}
+
+// Reads the entire content of a file as a string.  The size is measured
+// once up front; at most that many bytes are returned.
+std::string CapturedStream::ReadEntireFile(FILE* file) {
+  const size_t total = GetFileSize(file);
+  char* const data = new char[total];
+  size_t filled = 0;  // # of bytes copied into data so far
+
+  fseek(file, 0, SEEK_SET);
+
+  // One fread() is always attempted (even for an empty file); reading
+  // stops when a call yields nothing or the measured size is reached.
+  for (;;) {
+    const size_t got = fread(data + filled, 1, total - filled, file);
+    filled += got;
+    if (got == 0 || filled >= total)
+      break;
+  }
+
+  const std::string content(data, filled);
+  delete[] data;
+  return content;
+}
+
+# ifdef _MSC_VER
+#  pragma warning(pop)
+# endif  // _MSC_VER
+
+static CapturedStream* g_captured_stderr = NULL;
+static CapturedStream* g_captured_stdout = NULL;
+
+// Starts capturing an output stream (stdout/stderr).
+void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) {
+  if (*stream != NULL)  // A capturer is already registered in this slot.
+    GTEST_LOG_(FATAL) << "Only one " << stream_name
+                      << " capturer can exist at a time.";
+  // FATAL log records abort the process, so this is reached only when free.
+  *stream = new CapturedStream(fd);
+}
+
+// Stops capturing: collects the text, destroys the capturer and clears the
+// caller's slot so that a new capture can be started later.
+std::string GetCapturedStream(CapturedStream** captured_stream) {
+  CapturedStream* const capturer = *captured_stream;
+  const std::string captured_text = capturer->GetCapturedString();
+  *captured_stream = NULL;
+  delete capturer;
+  return captured_text;
+}
+
+// Starts capturing stdout (descriptor kStdOutFileno) in g_captured_stdout.
+void CaptureStdout() {
+  CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
+}
+
+// Starts capturing stderr (descriptor kStdErrFileno) in g_captured_stderr.
+void CaptureStderr() {
+  CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
+}
+
+// Stops capturing stdout, returns the captured string and frees the capturer.
+std::string GetCapturedStdout() {
+  return GetCapturedStream(&g_captured_stdout);
+}
+
+// Stops capturing stderr, returns the captured string and frees the capturer.
+std::string GetCapturedStderr() {
+  return GetCapturedStream(&g_captured_stderr);
+}
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+#if GTEST_HAS_DEATH_TEST
+
+// A copy of all command line arguments.  Set by InitGoogleTest().
+::std::vector<testing::internal::string> g_argvs;
+
+static const ::std::vector<testing::internal::string>* g_injected_test_argvs =
+                                        NULL;  // Owned.
+
+void SetInjectableArgvs(const ::std::vector<testing::internal::string>* argvs) {
+  if (g_injected_test_argvs == argvs) return;  // Already installed.
+  delete g_injected_test_argvs;  // The previous vector is owned here.
+  g_injected_test_argvs = argvs;
+}
+
+// Returns the vector installed through SetInjectableArgvs() when there is
+// one, and the g_argvs copy of the command line otherwise.
+const ::std::vector<testing::internal::string>& GetInjectableArgvs() {
+  const bool injected = (g_injected_test_argvs != NULL);
+  return injected ? *g_injected_test_argvs : g_argvs;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_WINDOWS_MOBILE
+namespace posix {
+void Abort() {  // Traps into the debugger, then ends this process (code 1).
+  DebugBreak();
+  TerminateProcess(GetCurrentProcess(), 1);
+}
+}  // namespace posix
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Maps a flag name to its environment variable name: the flag prefix is
+// prepended and every character is passed through ToUpper().  For example,
+// FlagToEnvVar("foo") will return "GTEST_FOO" in the open-source version.
+static std::string FlagToEnvVar(const char* flag) {
+  Message prefixed;
+  prefixed << GTEST_FLAG_PREFIX_ << flag;
+  const std::string name = prefixed.GetString();
+
+  Message upper;
+  for (std::string::const_iterator it = name.begin(); it != name.end(); ++it)
+    upper << ToUpper(*it);
+
+  return upper.GetString();
+}
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes
+// the result to *value and returns true; otherwise prints a warning that
+// names src_text to stdout, leaves *value unchanged and returns false.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value) {
+  // The text is read as a decimal integer.
+  char* parse_end = NULL;
+  const long parsed = strtol(str, &parse_end, 10);  // NOLINT
+
+  // Anything left over after the digits makes the whole text invalid.
+  const bool fully_consumed = (*parse_end == '\0');
+  if (!fully_consumed) {
+    Message warning;
+    warning << "WARNING: " << src_text
+            << " is expected to be a 32-bit integer, but actually"
+            << " has value \"" << str << "\".\n";
+    printf("%s", warning.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  // Is the parsed value in the range of an Int32?
+  // strtol() returns LONG_MAX or LONG_MIN when the input overflows a long,
+  // so both values are treated as overflow.
+  const bool overflows_long = (parsed == LONG_MAX || parsed == LONG_MIN);
+  // A value that changes when narrowed does not fit into an Int32.
+  const Int32 narrowed = static_cast<Int32>(parsed);
+  const bool overflows_int32 = (narrowed != parsed);
+  if (overflows_long || overflows_int32) {
+    Message warning;
+    warning << "WARNING: " << src_text
+            << " is expected to be a 32-bit integer, but actually"
+            << " has value " << str << ", which overflows.\n";
+    printf("%s", warning.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  *value = narrowed;
+  return true;
+}
+
+// Looks up the environment variable that corresponds to the given flag
+// and interprets it as a Boolean: an unset variable yields default_value,
+// the exact text "0" yields false, and any other text yields true.
+bool BoolFromGTestEnv(const char* flag, bool default_value) {
+  const std::string var_name = FlagToEnvVar(flag);
+  const char* const text = posix::GetEnv(var_name.c_str());
+  if (text == NULL)
+    return default_value;
+  return strcmp(text, "0") != 0;
+}
+
+// Reads a 32-bit integer from the environment variable corresponding to
+// the given flag.  Returns default_value if the variable isn't set or
+// doesn't hold a valid 32-bit integer (the latter is reported on stdout).
+Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
+  const std::string var_name = FlagToEnvVar(flag);
+  const char* const text = posix::GetEnv(var_name.c_str());
+  if (text == NULL)
+    return default_value;  // The environment variable is not set.
+
+  Int32 parsed = default_value;
+  const bool is_valid =
+      ParseInt32(Message() << "Environment variable " << var_name, text,
+                 &parsed);
+  if (is_valid)
+    return parsed;
+
+  // ParseInt32() has already printed what is wrong with the text.
+  const std::string shown_default = (Message() << default_value).GetString();
+  printf("The default value %s is used.\n", shown_default.c_str());
+  fflush(stdout);
+  return default_value;
+}
+
+// Returns the text of the environment variable corresponding to the given
+// flag, or default_value when that variable is not set.
+const char* StringFromGTestEnv(const char* flag, const char* default_value) {
+  const char* const text = posix::GetEnv(FlagToEnvVar(flag).c_str());
+  if (text != NULL) return text;
+  return default_value;
+}
+
+}  // namespace internal
+}  // namespace testing
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Google Test - The Google C++ Testing Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// It uses the << operator when possible, and prints the bytes in the
+// object otherwise.  A user can override its behavior for a class
+// type Foo by defining either operator<<(::std::ostream&, const Foo&)
+// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that
+// defines Foo.
+
+#include <ctype.h>
+#include <stdio.h>
+#include <ostream>  // NOLINT
+#include <string>
+
+namespace testing {
+
+namespace {
+
+using ::std::ostream;
+
+// Prints bytes [start, start + count) of the object as two-digit
+// upper-case hex numbers.  For easy parsing by human the bytes are
+// organized into groups of 2: a byte at an even offset in the object is
+// preceded by ' ' and one at an odd offset by '-'.
+void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start,
+                                size_t count, ostream* os) {
+  char hex[5] = "";
+  const size_t stop = start + count;
+
+  for (size_t pos = start; pos != stop; ++pos) {
+    // Nothing is printed in front of the first byte of the segment.
+    if (pos != start)
+      *os << (pos % 2 == 0 ? ' ' : '-');
+
+    GTEST_SNPRINTF_(hex, sizeof(hex), "%02X", obj_bytes[pos]);
+    *os << hex;
+  }
+}
+
+// Prints the bytes in the given value to the given ostream, in the form
+// "<count>-byte object <...>".
+void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
+                              ostream* os) {
+  const size_t kThreshold = 132;
+  const size_t kChunkSize = 64;
+
+  // Tells the user how big the object is.
+  *os << count << "-byte object <";
+  // An object of kThreshold bytes or more is abbreviated to its first
+  // kChunkSize bytes and (roughly) its last kChunkSize bytes.
+  // TODO(wan): let the user control the threshold using a flag.
+  const bool abbreviate = count >= kThreshold;
+  const size_t head_size = abbreviate ? kChunkSize : count;
+  PrintByteSegmentInObjectTo(obj_bytes, 0, head_size, os);
+  if (abbreviate) {
+    *os << " ... ";
+    // The tail starts on a 2-byte boundary (rounded up).
+    const size_t tail_start = (count - kChunkSize + 1)/2*2;
+    PrintByteSegmentInObjectTo(obj_bytes, tail_start, count - tail_start, os);
+  }
+  *os << ">";
+}
+
+}  // namespace
+
+namespace internal2 {
+
+// Prints the count bytes of the object at obj_bytes to *os by delegating
+// to PrintBytesInObjectToImpl().  The delegation simplifies the
+// implementation, which uses the << operator and thus is easier done
+// outside of the ::testing::internal namespace, which contains a <<
+// operator that sometimes conflicts with the one in STL.
+void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
+                          ostream* os) {
+  PrintBytesInObjectToImpl(obj_bytes, count, os);
+}
+
+}  // namespace internal2
+
+namespace internal {
+
+// Depending on the value of a char (or wchar_t), we print it in one
+// of three formats:
+//   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
+//   - as a hexadecimal escape sequence (e.g. '\x7F'), or
+//   - as a special escape sequence (e.g. '\r', '\n').
+enum CharFormat {
+  kAsIs,
+  kHexEscape,
+  kSpecialEscape
+};
+
+// Returns true if c is a printable ASCII character, i.e. lies in
+// [0x20, 0x7E].  We test the value of c directly instead of calling
+// isprint(), which is buggy on Windows Mobile.
+inline bool IsPrintableAscii(wchar_t c) {
+  return !(c < 0x20 || c > 0x7E);
+}
+
+// Prints a wide or narrow char c as a character literal without the
+// quotes, escaping it when necessary; returns how c was formatted.
+// The template argument UnsignedChar is the unsigned version of Char,
+// which is the type of c.
+template <typename UnsignedChar, typename Char>
+static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
+  // Escape sequence dedicated to c, or NULL when c has none.
+  const char* special = NULL;
+  switch (static_cast<wchar_t>(c)) {
+    case L'\0':
+      special = "\\0"; break;
+    case L'\'':
+      special = "\\'"; break;
+    case L'\\':
+      special = "\\\\"; break;
+    case L'\a':
+      special = "\\a"; break;
+    case L'\b':
+      special = "\\b"; break;
+    case L'\f':
+      special = "\\f"; break;
+    case L'\n':
+      special = "\\n"; break;
+    case L'\r':
+      special = "\\r"; break;
+    case L'\t':
+      special = "\\t"; break;
+    case L'\v':
+      special = "\\v"; break;
+    default:
+      break;
+  }
+
+  if (special != NULL) {
+    *os << special;
+    return kSpecialEscape;
+  }
+
+  // Printable ASCII is emitted unchanged.
+  if (IsPrintableAscii(c)) {
+    *os << static_cast<char>(c);
+    return kAsIs;
+  }
+
+  // Anything else: "\x" plus FormatHexInt() of c as an UnsignedChar.
+  *os << "\\x" + String::FormatHexInt(static_cast<UnsignedChar>(c));
+  return kHexEscape;
+}
+
+// Prints a wchar_t c as if it's part of a string literal and returns how
+// c was formatted.  A single quote is printed as is, a double quote is
+// escaped, and the rest is delegated to PrintAsCharLiteralTo().
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+  if (c == L'\'') {
+    *os << "'";
+    return kAsIs;
+  }
+  if (c == L'"') {
+    *os << "\\\"";
+    return kSpecialEscape;
+  }
+  return PrintAsCharLiteralTo<wchar_t>(c, os);
+}
+
+// Prints a char c as if it's part of a string literal; returns how c was
+// formatted.  c is widened via unsigned char, so it is never sign-extended.
+static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
+  const unsigned char byte = static_cast<unsigned char>(c);
+  return PrintAsStringLiteralTo(static_cast<wchar_t>(byte), os);
+}
+
+// Prints a wide or narrow character c and its code.  '\0' is printed
+// as "'\\0'", other unprintable characters are also properly escaped
+// using the standard C++ escape sequence.  The template argument
+// UnsignedChar is the unsigned version of Char, which is the type of c.
+template <typename UnsignedChar, typename Char>
+void PrintCharAndCodeTo(Char c, ostream* os) {
+  // The literal comes first, in the most readable form we can find; a
+  // multi-byte Char gets the wide-literal prefix.
+  const char* const opening_quote = (sizeof(c) > 1) ? "L'" : "'";
+  *os << opening_quote;
+  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
+  *os << "'";
+
+  // To aid user debugging, c's code follows in decimal, unless it's 0
+  // (in which case c was printed as '\\0', making the code obvious).
+  if (c != 0) {
+    *os << " (" << static_cast<int>(c);
+
+    // For more convenience, the code is repeated in hexadecimal, unless
+    // c was already printed in the form '\x##' or the code is in [1, 9]
+    // (where both notations look the same).
+    const bool hex_adds_nothing =
+        format == kHexEscape || (1 <= c && c <= 9);
+    if (!hex_adds_nothing)
+      *os << ", 0x" << String::FormatHexInt(static_cast<UnsignedChar>(c));
+    *os << ")";
+  }
+}
+
+void PrintTo(unsigned char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);  // Literal, then its code.
+}
+void PrintTo(signed char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);  // Hex shown as unsigned char.
+}
+
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
+void PrintTo(wchar_t wc, ostream* os) {
+  PrintCharAndCodeTo<wchar_t>(wc, os);
+}
+
+// Prints the given array of characters to the ostream as one or more
+// adjacent string literals.  CharType must be either char or wchar_t.
+// The array starts at begin, the length is len, it may include '\0'
+// characters and may not be NUL-terminated.
+template <typename CharType>
+static void PrintCharsAsStringTo(
+    const CharType* begin, size_t len, ostream* os) {
+  const char* const opening = sizeof(CharType) == 1 ? "\"" : "L\"";
+  *os << opening;
+
+  CharFormat last_format = kAsIs;  // How the previous character was printed.
+  for (size_t i = 0; i != len; ++i) {
+    const CharType cur = begin[i];
+    // A hex digit right after a '\x..' escape could be interpreted as
+    // another digit of that escape's number, so the literal is broken in
+    // two to disambiguate.
+    if (last_format == kHexEscape && IsXDigit(cur))
+      *os << "\" " << opening;
+    last_format = PrintAsStringLiteralTo(cur, os);
+  }
+  *os << "\"";
+}
+
+// Prints a (const) char/wchar_t array of 'len' elements, starting at address
+// 'begin', in string-literal form.  CharType must be either char or
+// wchar_t.
+template <typename CharType>
+static void UniversalPrintCharArray(
+    const CharType* begin, size_t len, ostream* os) {
+  // The code
+  //   const char kFoo[] = "foo";
+  // generates an array of 4, not 3, elements, with the last one being '\0'.
+  // Such a final '\0' is therefore not printed, so that the output matches
+  // the string literal as it's written in the source code.
+  const bool nul_terminated =
+      len > 0 && begin[len - 1] == '\0';
+
+  const size_t printed_len = nul_terminated ? len - 1 : len;
+  PrintCharsAsStringTo(begin, printed_len, os);
+
+  // If, however, the last element in the array is not '\0', e.g.
+  //    const char kFoo[] = { 'f', 'o', 'o' };
+  // the entire array has been printed, and a note is added to indicate
+  // that the array is not NUL-terminated.
+  if (!nul_terminated)
+    *os << " (no terminating NUL)";
+}
+
+// Prints a (const) char array of 'len' elements, starting at address 'begin'.
+void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);  // Omits one trailing '\0'.
+}
+
+// Prints a (const) wchar_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);  // Omits one trailing L'\0'.
+}
+
+// Prints the given C string to the ostream ("NULL" for a null pointer).
+void PrintTo(const char* s, ostream* os) {
+  if (s == NULL) {
+    *os << "NULL";
+    return;
+  }
+  *os << ImplicitCast_<const void*>(s) << " pointing to ";
+  PrintCharsAsStringTo(s, strlen(s), os);
+}
+
+// The MSVC compiler can be configured to define wchar_t as a typedef
+// of unsigned short. Defining an overload for const wchar_t* in that case
+// would cause pointers to unsigned shorts to be printed as wide strings,
+// possibly accessing more memory than intended and causing invalid
+// memory accesses. MSVC defines the _NATIVE_WCHAR_T_DEFINED symbol when
+// wchar_t is implemented as a native type.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Prints the given wide C string to the ostream ("NULL" for a null pointer).
+void PrintTo(const wchar_t* s, ostream* os) {
+  if (s == NULL) {
+    *os << "NULL";
+    return;
+  }
+  *os << ImplicitCast_<const void*>(s) << " pointing to ";
+  PrintCharsAsStringTo(s, wcslen(s), os);
+}
+#endif  // wchar_t is native
+
+// Prints a ::string or ::std::string object as a quoted string literal.
+#if GTEST_HAS_GLOBAL_STRING
+void PrintStringTo(const ::string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+void PrintStringTo(const ::std::string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);  // All size() chars printed.
+}
+
+// Prints a ::wstring or ::std::wstring object as a wide string literal.
+#if GTEST_HAS_GLOBAL_WSTRING
+void PrintWideStringTo(const ::wstring& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+#if GTEST_HAS_STD_WSTRING
+void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);  // All size() chars printed.
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+}  // namespace internal
+
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+// The Google C++ Testing Framework (Google Test)
+
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
+using internal::GetUnitTestImpl;
+
+// Returns the failure message with the stack trace omitted, i.e. the text
+// that precedes the first internal::kStackTraceMarker (all of it if none).
+std::string TestPartResult::ExtractSummary(const char* message) {
+  const char* const marker = strstr(message, internal::kStackTraceMarker);
+  if (marker == NULL) return message;
+  return std::string(message, marker);
+}
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
+  const bool ok = result.type() == TestPartResult::kSuccess;
+  const bool fatal = result.type() == TestPartResult::kFatalFailure;
+  const char* const kind =
+      ok ? "Success" : fatal ? "Fatal failure" : "Non-fatal failure";
+  return os << result.file_name() << ":" << result.line_number() << ": "
+            << kind << ":\n" << result.message() << std::endl;
+}
+
+// Appends a copy of the given TestPartResult to the end of the array.
+void TestPartResultArray::Append(const TestPartResult& result) {
+  array_.push_back(result);
+}
+
+// Returns the TestPartResult at the given index (0-based); aborts if invalid.
+const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
+  const bool in_range = 0 <= index && index < size();
+  if (!in_range) {
+    printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
+    internal::posix::Abort();
+  }
+  return array_[index];
+}
+
+// Returns the number of TestPartResult objects in the array, as an int.
+int TestPartResultArray::size() const {
+  return static_cast<int>(array_.size());
+}
+
+namespace internal {
+
+HasNewFatalFailureHelper::HasNewFatalFailureHelper()
+    : has_new_fatal_failure_(false),
+      original_reporter_(GetUnitTestImpl()->  // Kept for the destructor.
+                         GetTestPartResultReporterForCurrentThread()) {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
+}
+
+HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(
+      original_reporter_);  // Reinstalls the reporter saved by the ctor.
+}
+
+void HasNewFatalFailureHelper::ReportTestPartResult(
+    const TestPartResult& result) {
+  // The flag is sticky: once a fatal failure is seen it stays set.
+  has_new_fatal_failure_ |= result.fatally_failed();
+  original_reporter_->ReportTestPartResult(result);
+}
+
+}  // namespace internal
+
+}  // namespace testing
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+
+namespace testing {
+namespace internal {
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// Skips to the first non-space char in str. Returns an empty string if str
+// contains only whitespace characters.
+static const char* SkipSpaces(const char* str) {
+  const char* cursor = str;
+  for (; IsSpace(*cursor); ++cursor) {}
+  return cursor;
+}
+
+// Verifies that registered_tests match the test names in
+// defined_test_names_; returns registered_tests (past any leading
+// whitespace) if successful, or aborts the program otherwise.
+const char* TypedTestCasePState::VerifyRegisteredTestNames(
+    const char* file, int line, const char* registered_tests) {
+  typedef ::std::set<const char*>::const_iterator DefinedTestIter;
+  registered_ = true;
+
+  // Some preprocessors prefix stringized literals with whitespace, so
+  // whitespace at the start of registered_tests is skipped.
+  registered_tests = SkipSpaces(registered_tests);
+
+  Message errors;
+  ::std::set<std::string> accepted;  // Names that passed both checks below.
+  const char* rest = registered_tests;
+  while (rest != NULL) {
+    const std::string name = GetPrefixUntilComma(rest);
+    rest = SkipComma(rest);
+
+    if (accepted.count(name) != 0) {
+      errors << "Test " << name << " is listed more than once.\n";
+      continue;
+    }
+
+    // defined_test_names_ holds C strings, so it is searched linearly,
+    // comparing contents rather than pointers.
+    DefinedTestIter match = defined_test_names_.begin();
+    while (match != defined_test_names_.end() && !(name == *match))
+      ++match;
+
+    if (match == defined_test_names_.end()) {
+      errors << "No test named " << name
+             << " can be found in this test case.\n";
+    } else {
+      accepted.insert(name);
+    }
+  }
+
+  // Every defined test has to show up in the registered list.
+  for (DefinedTestIter defined = defined_test_names_.begin();
+       defined != defined_test_names_.end();
+       ++defined) {
+    if (accepted.count(*defined) != 0)
+      continue;
+    errors << "You forgot to list test " << *defined << ".\n";
+  }
+
+  // Any collected error is fatal: report them all at once, then abort.
+  const std::string report = errors.GetString();
+  if (!report.empty()) {
+    fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
+            report.c_str());
+    fflush(stderr);
+    posix::Abort();
+  }
+
+  return registered_tests;
+}
+
+#endif  // GTEST_HAS_TYPED_TEST_P
+
+}  // namespace internal
+}  // namespace testing
diff --git a/libs/libvpx/third_party/googletest/src/src/gtest_main.cc b/libs/libvpx/third_party/googletest/src/src/gtest_main.cc
new file mode 100644
index 0000000000..f302822552
--- /dev/null
+++ b/libs/libvpx/third_party/googletest/src/src/gtest_main.cc
@@ -0,0 +1,38 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <stdio.h>
+
+#include "gtest/gtest.h"
+
+GTEST_API_ int main(int argc, char **argv) {
+  printf("Running main() from gtest_main.cc\n");
+  testing::InitGoogleTest(&argc, argv);  // Receives argc by address.
+  return RUN_ALL_TESTS();  // Its result becomes the process exit status.
+}
diff --git a/libs/libvpx/third_party/libwebm/AUTHORS.TXT b/libs/libvpx/third_party/libwebm/AUTHORS.TXT
new file mode 100644
index 0000000000..9686ac13eb
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/AUTHORS.TXT
@@ -0,0 +1,4 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
diff --git a/libs/libvpx/third_party/libwebm/Android.mk b/libs/libvpx/third_party/libwebm/Android.mk
new file mode 100644
index 0000000000..be9d77deed
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/Android.mk
@@ -0,0 +1,10 @@
+# Build fragment describing the libwebm static library module.
+# LOCAL_PATH is taken from the my-dir macro.
+LOCAL_PATH:= $(call my-dir)
+
+# Pull in the file named by CLEAR_VARS before describing the module
+# (presumably resets the LOCAL_* variables -- confirm against the build
+# system documentation).
+include $(CLEAR_VARS)
+LOCAL_MODULE:= libwebm
+# Sources that make up the module: parser, reader, muxer, muxer utilities
+# and writer.
+LOCAL_SRC_FILES:= mkvparser.cpp \
+                  mkvreader.cpp \
+                  mkvmuxer.cpp \
+                  mkvmuxerutil.cpp \
+                  mkvwriter.cpp
+# Pull in the file named by BUILD_STATIC_LIBRARY to emit the module.
+include $(BUILD_STATIC_LIBRARY)
diff --git a/libs/libvpx/third_party/libwebm/LICENSE.TXT b/libs/libvpx/third_party/libwebm/LICENSE.TXT
new file mode 100644
index 0000000000..7a6f99547d
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/LICENSE.TXT
@@ -0,0 +1,30 @@
+Copyright (c) 2010, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+
+  * Neither the name of Google nor the names of its contributors may
+    be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/libs/libvpx/third_party/libwebm/PATENTS.TXT b/libs/libvpx/third_party/libwebm/PATENTS.TXT
new file mode 100644
index 0000000000..caedf607e9
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/PATENTS.TXT
@@ -0,0 +1,23 @@
+Additional IP Rights Grant (Patents)
+------------------------------------
+
+"These implementations" means the copyrightable works that implement the WebM
+codecs distributed by Google as part of the WebM Project.
+
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
+royalty-free, irrevocable (except as stated in this section) patent license to
+make, have made, use, offer to sell, sell, import, transfer, and otherwise
+run, modify and propagate the contents of these implementations of WebM, where
+such license applies only to those patent claims, both currently owned by
+Google and acquired in the future, licensable by Google that are necessarily
+infringed by these implementations of WebM. This grant does not include claims
+that would be infringed only as a consequence of further modification of these
+implementations. If you or your agent or exclusive licensee institute or order
+or agree to the institution of patent litigation or any other patent
+enforcement activity against any entity (including a cross-claim or
+counterclaim in a lawsuit) alleging that any of these implementations of WebM
+or any code incorporated within any of these implementations of WebM
+constitute direct or contributory patent infringement, or inducement of
+patent infringement, then any patent rights granted to you under this License
+for these implementations of WebM shall terminate as of the date such
+litigation is filed.
diff --git a/libs/libvpx/third_party/libwebm/README.libvpx b/libs/libvpx/third_party/libwebm/README.libvpx
new file mode 100644
index 0000000000..2989d3d89a
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/README.libvpx
@@ -0,0 +1,10 @@
+URL: https://chromium.googlesource.com/webm/libwebm
+Version: 476366249e1fda7710a389cd41c57db42305e0d4
+License: BSD
+License File: LICENSE.TXT
+
+Description:
+libwebm is used to handle WebM container I/O.
+
+Local Changes:
+* <none>
diff --git a/libs/libvpx/third_party/libwebm/RELEASE.TXT b/libs/libvpx/third_party/libwebm/RELEASE.TXT
new file mode 100644
index 0000000000..db1b77117c
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/RELEASE.TXT
@@ -0,0 +1,34 @@
+1.0.0.5
+ * Handled case when no duration
+ * Handled empty clusters
+ * Handled empty clusters when seeking
+ * Implemented a check of the lacing bits
+
+1.0.0.4
+ * Made Cues member variables mutable
+ * Defended against badly-formatted cue points
+ * Segment::GetCluster returns CuePoint too
+ * Separated cue-based searches
+
+1.0.0.3
+ * Added Block::GetOffset() to get a frame's offset in a block
+ * Changed cluster count type from size_t to long
+ * Parsed SeekHead to find cues
+ * Allowed seeking beyond end of cluster cache
+ * Stopped attempting to reparse the cues element
+ * Restructured Segment::LoadCluster
+ * Marked position of cues without parsing cues element
+ * Allowed cue points to be loaded incrementally
+ * Implemented lazy loading of cue points as they're searched
+ * Merged Cues::LoadCuePoint into Cues::Find
+ * Lazy init cues
+ * Loaded cue point during find
+
+1.0.0.2
+ * added support for Cues element
+ * seeking was improved
+
+1.0.0.1
+ * fixed item 141
+ * added item 142
+ * added this file, RELEASE.TXT, to repository
diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer.cpp b/libs/libvpx/third_party/libwebm/mkvmuxer.cpp
new file mode 100644
index 0000000000..9be3119a46
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/mkvmuxer.cpp
@@ -0,0 +1,3277 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxer.hpp"
+
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <new>
+
+#include "mkvmuxerutil.hpp"
+#include "mkvparser.hpp"
+#include "mkvwriter.hpp"
+#include "webmids.hpp"
+
+#ifdef _MSC_VER
+// Disable MSVC warnings that suggest making code non-portable.
+#pragma warning(disable : 4996)
+#endif
+
+namespace mkvmuxer {
+
+namespace {
+// Replaces the string held in |*dst_ptr| with a freshly allocated copy of
+// |src|.  Whatever |*dst_ptr| held before is released with delete[] first.
+// A NULL |src| leaves |*dst_ptr| NULL and counts as success.  The caller
+// keeps ownership of |src| and becomes the owner of the new copy (to be
+// released directly with delete[] or indirectly through another StrCpy
+// call).  Returns false when |dst_ptr| is NULL or when the allocation
+// fails; in the latter case |*dst_ptr| is left NULL.
+bool StrCpy(const char* src, char** dst_ptr) {
+  if (!dst_ptr)
+    return false;
+
+  // Drop the previous contents of the destination.
+  delete[] *dst_ptr;
+  *dst_ptr = NULL;
+
+  if (!src)
+    return true;
+
+  const size_t byte_count = strlen(src) + 1;  // Includes the terminator.
+
+  char* const copy = new (std::nothrow) char[byte_count];  // NOLINT
+  *dst_ptr = copy;
+  if (!copy)
+    return false;
+
+  memcpy(copy, src, byte_count);
+  return true;
+}
+}  // namespace
+
+///////////////////////////////////////////////////////////////
+//
+// IMkvWriter Class
+
+// Constructor: performs no work of its own.
+IMkvWriter::IMkvWriter() {}
+
+// Destructor: performs no work of its own.
+IMkvWriter::~IMkvWriter() {}
+
+// Writes the EBML header master element followed by its seven children:
+// EBMLVersion (1), EBMLReadVersion (1), EBMLMaxIDLength (4),
+// EBMLMaxSizeLength (8), DocType ("webm"), DocTypeVersion
+// (|doc_type_version|) and DocTypeReadVersion (2).  Writing stops at the
+// first element that fails.  Returns true only if every element was
+// written.
+bool WriteEbmlHeader(IMkvWriter* writer, uint64 doc_type_version) {
+  // Level 0: the master element's payload is the sum of its children.
+  uint64 header_payload = EbmlElementSize(kMkvEBMLVersion, 1ULL);
+  header_payload += EbmlElementSize(kMkvEBMLReadVersion, 1ULL);
+  header_payload += EbmlElementSize(kMkvEBMLMaxIDLength, 4ULL);
+  header_payload += EbmlElementSize(kMkvEBMLMaxSizeLength, 8ULL);
+  header_payload += EbmlElementSize(kMkvDocType, "webm");
+  header_payload += EbmlElementSize(kMkvDocTypeVersion, doc_type_version);
+  header_payload += EbmlElementSize(kMkvDocTypeReadVersion, 2ULL);
+
+  // Short-circuit evaluation keeps the writes in order and stops after the
+  // first failure.
+  return WriteEbmlMasterElement(writer, kMkvEBML, header_payload) &&
+         WriteEbmlElement(writer, kMkvEBMLVersion, 1ULL) &&
+         WriteEbmlElement(writer, kMkvEBMLReadVersion, 1ULL) &&
+         WriteEbmlElement(writer, kMkvEBMLMaxIDLength, 4ULL) &&
+         WriteEbmlElement(writer, kMkvEBMLMaxSizeLength, 8ULL) &&
+         WriteEbmlElement(writer, kMkvDocType, "webm") &&
+         WriteEbmlElement(writer, kMkvDocTypeVersion, doc_type_version) &&
+         WriteEbmlElement(writer, kMkvDocTypeReadVersion, 2ULL);
+}
+
+// Convenience overload: writes the EBML header with
+// Segment::kDefaultDocTypeVersion as the DocTypeVersion.
+bool WriteEbmlHeader(IMkvWriter* writer) {
+  const uint64 default_version = mkvmuxer::Segment::kDefaultDocTypeVersion;
+  return WriteEbmlHeader(writer, default_version);
+}
+
+// Copies |size| bytes, starting at offset |start| of |source|, to |dst| in
+// chunks of at most kBufSize bytes.  Returns false as soon as a read from
+// |source| reports a non-zero result, and true otherwise.  A |size| of
+// zero or less copies nothing and returns true.
+bool ChunkedCopy(mkvparser::IMkvReader* source, mkvmuxer::IMkvWriter* dst,
+                 mkvmuxer::int64 start, int64 size) {
+  // TODO(vigneshv): Check if this is a reasonable value.
+  const uint32 kBufSize = 2048;
+  // The staging buffer lives on the stack so that every exit path,
+  // including the early return on a failed read, releases it.
+  uint8 buf[kBufSize];
+  int64 offset = start;
+  while (size > 0) {
+    const int64 read_len = (size > kBufSize) ? kBufSize : size;
+    if (source->Read(offset, static_cast<long>(read_len), buf))
+      return false;
+    // NOTE(review): the result of Write() is not checked, so a failed
+    // write goes unreported -- confirm whether callers rely on that.
+    dst->Write(buf, static_cast<uint32>(read_len));
+    offset += read_len;
+    size -= read_len;
+  }
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Frame Class
+
+// Constructs an empty frame: no payload or additional data, every numeric
+// field zero, not a key frame, and no reference block timestamp set.
+Frame::Frame()
+    : add_id_(0),
+      additional_(NULL),
+      additional_length_(0),
+      duration_(0),
+      frame_(NULL),
+      is_key_(false),
+      length_(0),
+      track_number_(0),
+      timestamp_(0),
+      discard_padding_(0),
+      reference_block_timestamp_(0),
+      reference_block_timestamp_set_(false) {}
+
+// Releases the additional-data and payload buffers held by the frame.
+Frame::~Frame() {
+  delete[] additional_;
+  delete[] frame_;
+}
+
+// Makes this frame a deep copy of |frame|: payload, additional data (with
+// its add id), duration, key flag, track number, timestamp, discard
+// padding and the reference block timestamp state.  Returns false if
+// either buffer copy fails; the frame may then be only partially updated.
+// Copying a frame onto itself is a no-op that returns true.
+bool Frame::CopyFrom(const Frame& frame) {
+  // A self-copy must not release the buffers it is about to read from.
+  if (this == &frame)
+    return true;
+
+  delete[] frame_;
+  frame_ = NULL;
+  length_ = 0;
+  if (frame.length() > 0 && frame.frame() != NULL &&
+      !Init(frame.frame(), frame.length())) {
+    return false;
+  }
+  add_id_ = 0;
+  delete[] additional_;
+  additional_ = NULL;
+  additional_length_ = 0;
+  if (frame.additional_length() > 0 && frame.additional() != NULL &&
+      !AddAdditionalData(frame.additional(), frame.additional_length(),
+                         frame.add_id())) {
+    return false;
+  }
+  duration_ = frame.duration();
+  is_key_ = frame.is_key();
+  track_number_ = frame.track_number();
+  timestamp_ = frame.timestamp();
+  discard_padding_ = frame.discard_padding();
+  // IsValid() consults the reference block state, so the copy has to carry
+  // it over as well.
+  reference_block_timestamp_ = frame.reference_block_timestamp_;
+  reference_block_timestamp_set_ = frame.reference_block_timestamp_set_;
+  return true;
+}
+
+// Replaces the frame payload with a private copy of the |length| bytes at
+// |frame|.  Returns false, leaving the current payload untouched, if the
+// new buffer cannot be allocated.
+bool Frame::Init(const uint8* frame, uint64 length) {
+  const size_t byte_count = static_cast<size_t>(length);
+  uint8* const copy = new (std::nothrow) uint8[byte_count];  // NOLINT
+  if (copy == NULL)
+    return false;
+
+  // The old payload is released only once the new buffer exists.
+  delete[] frame_;
+  length_ = length;
+  frame_ = copy;
+
+  memcpy(copy, frame, byte_count);
+  return true;
+}
+
+// Replaces the frame's additional data with a private copy of the |length|
+// bytes at |additional| and records |add_id| alongside it.  Returns false,
+// leaving the current additional data untouched, if the new buffer cannot
+// be allocated.
+bool Frame::AddAdditionalData(const uint8* additional, uint64 length,
+                              uint64 add_id) {
+  const size_t byte_count = static_cast<size_t>(length);
+  uint8* const copy = new (std::nothrow) uint8[byte_count];  // NOLINT
+  if (copy == NULL)
+    return false;
+
+  // The old buffer is released only once the new one exists.
+  delete[] additional_;
+  add_id_ = add_id;
+  additional_length_ = length;
+  additional_ = copy;
+
+  memcpy(copy, additional, byte_count);
+  return true;
+}
+
+// Reports whether the frame is in a writable state.  A valid frame has:
+//   * a non-empty payload,
+//   * additional data whose pointer and length are either both set or both
+//     unset,
+//   * a track number in the range [1, kMaxTrackNumber], and
+//   * if it cannot be a simple block and is not a key frame, a reference
+//     block timestamp.
+bool Frame::IsValid() const {
+  const bool has_payload = frame_ && length_ != 0;
+  if (!has_payload)
+    return false;
+
+  const bool has_additional_buffer = additional_ != NULL;
+  const bool has_additional_length = additional_length_ != 0;
+  if (has_additional_buffer != has_additional_length)
+    return false;
+
+  const bool track_in_range =
+      track_number_ != 0 && track_number_ <= kMaxTrackNumber;
+  if (!track_in_range)
+    return false;
+
+  return CanBeSimpleBlock() || is_key_ || reference_block_timestamp_set_;
+}
+
+// True when the frame has no additional data, no discard padding and no
+// duration.
+bool Frame::CanBeSimpleBlock() const {
+  const bool has_extras =
+      additional_ != NULL || discard_padding_ != 0 || duration_ != 0;
+  return !has_extras;
+}
+
+// Stores |reference_block_timestamp| and marks the reference block
+// timestamp as set.
+void Frame::set_reference_block_timestamp(int64 reference_block_timestamp) {
+  reference_block_timestamp_set_ = true;
+  reference_block_timestamp_ = reference_block_timestamp;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// CuePoint Class
+
+// Constructs a cue point with time, track and cluster position zeroed, a
+// block number of 1, and block-number output enabled.  Write() rejects the
+// cue point until track and cluster position are at least 1.
+CuePoint::CuePoint()
+    : time_(0),
+      track_(0),
+      cluster_pos_(0),
+      block_number_(1),
+      output_block_number_(true) {}
+
+// Destructor: the cue point releases nothing here.
+CuePoint::~CuePoint() {}
+
+// Writes this cue point as a CuePoint master element containing CueTime
+// and a CueTrackPositions master (CueTrack, CueClusterPosition and, when
+// block-number output is enabled and the block number exceeds 1,
+// CueBlockNumber).  Returns false if |writer| is NULL, if the track or
+// cluster position is below 1, if any write fails, if the writer reports a
+// negative position, or if the number of bytes written after the CuePoint
+// header differs from the computed payload size.
+bool CuePoint::Write(IMkvWriter* writer) const {
+  if (writer == NULL || track_ < 1 || cluster_pos_ < 1)
+    return false;
+
+  const bool emit_block_number = output_block_number_ && block_number_ > 1;
+
+  // Payload of the CueTrackPositions master.
+  uint64 positions_payload =
+      EbmlElementSize(kMkvCueClusterPosition, cluster_pos_);
+  positions_payload += EbmlElementSize(kMkvCueTrack, track_);
+  if (emit_block_number)
+    positions_payload += EbmlElementSize(kMkvCueBlockNumber, block_number_);
+
+  // CueTrackPositions including its own header, then the whole CuePoint
+  // payload.
+  const uint64 positions_total =
+      EbmlMasterElementSize(kMkvCueTrackPositions, positions_payload) +
+      positions_payload;
+  const uint64 expected_payload =
+      EbmlElementSize(kMkvCueTime, time_) + positions_total;
+
+  if (!WriteEbmlMasterElement(writer, kMkvCuePoint, expected_payload))
+    return false;
+
+  const int64 begin = writer->Position();
+  if (begin < 0)
+    return false;
+
+  const bool children_written =
+      WriteEbmlElement(writer, kMkvCueTime, time_) &&
+      WriteEbmlMasterElement(writer, kMkvCueTrackPositions,
+                             positions_payload) &&
+      WriteEbmlElement(writer, kMkvCueTrack, track_) &&
+      WriteEbmlElement(writer, kMkvCueClusterPosition, cluster_pos_);
+  if (!children_written)
+    return false;
+
+  if (emit_block_number &&
+      !WriteEbmlElement(writer, kMkvCueBlockNumber, block_number_)) {
+    return false;
+  }
+
+  const int64 end = writer->Position();
+  if (end < 0)
+    return false;
+
+  // The bytes actually written must match the size announced up front.
+  return end - begin == static_cast<int64>(expected_payload);
+}
+
+// Returns the size of the CuePoint payload: the CueTime element plus the
+// CueTrackPositions master (header and contents).  CueBlockNumber is
+// counted only when block-number output is enabled and the block number
+// exceeds 1.
+uint64 CuePoint::PayloadSize() const {
+  const bool emit_block_number = output_block_number_ && block_number_ > 1;
+
+  uint64 positions_payload =
+      EbmlElementSize(kMkvCueClusterPosition, cluster_pos_);
+  positions_payload += EbmlElementSize(kMkvCueTrack, track_);
+  if (emit_block_number)
+    positions_payload += EbmlElementSize(kMkvCueBlockNumber, block_number_);
+
+  const uint64 positions_total =
+      EbmlMasterElementSize(kMkvCueTrackPositions, positions_payload) +
+      positions_payload;
+
+  return EbmlElementSize(kMkvCueTime, time_) + positions_total;
+}
+
+// Returns the full size of the CuePoint element: master element header
+// plus payload.
+uint64 CuePoint::Size() const {
+  const uint64 payload = PayloadSize();
+  const uint64 header = EbmlMasterElementSize(kMkvCuePoint, payload);
+  return header + payload;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Cues Class
+
+// Constructs an empty cue list (no storage allocated) with block-number
+// output enabled for cue points added later.
+Cues::Cues()
+    : cue_entries_capacity_(0),
+      cue_entries_size_(0),
+      cue_entries_(NULL),
+      output_block_number_(true) {}
+
+// Deletes every stored cue point and then the array that held them.
+Cues::~Cues() {
+  if (cue_entries_ == NULL)
+    return;
+
+  int32 i = 0;
+  while (i < cue_entries_size_) {
+    delete cue_entries_[i];
+    ++i;
+  }
+  delete[] cue_entries_;
+}
+
+// Appends |cue| to the list; the destructor later deletes the stored cue
+// points.  The cue's block-number output flag is overwritten with this
+// list's setting.  Storage starts at two slots and doubles whenever it is
+// full.  Returns false if |cue| is NULL, if the doubled capacity is not
+// positive, or if the larger array cannot be allocated; |cue| is not
+// stored in those cases.
+bool Cues::AddCue(CuePoint* cue) {
+  if (cue == NULL)
+    return false;
+
+  const bool is_full = (cue_entries_size_ + 1) > cue_entries_capacity_;
+  if (is_full) {
+    // Grow the pointer array.
+    int32 grown_capacity = 2;
+    if (cue_entries_capacity_)
+      grown_capacity = cue_entries_capacity_ * 2;
+
+    if (grown_capacity < 1)
+      return false;
+
+    CuePoint** const grown =
+        new (std::nothrow) CuePoint*[grown_capacity];  // NOLINT
+    if (grown == NULL)
+      return false;
+
+    int32 i = 0;
+    while (i < cue_entries_size_) {
+      grown[i] = cue_entries_[i];
+      ++i;
+    }
+
+    delete[] cue_entries_;
+
+    cue_entries_capacity_ = grown_capacity;
+    cue_entries_ = grown;
+  }
+
+  cue->set_output_block_number(output_block_number_);
+  cue_entries_[cue_entries_size_] = cue;
+  ++cue_entries_size_;
+  return true;
+}
+
+// Returns the cue point stored at |index|, or NULL when the list has no
+// storage or |index| lies outside [0, cue_entries_size_).
+CuePoint* Cues::GetCueByIndex(int32 index) const {
+  if (cue_entries_ == NULL)
+    return NULL;
+
+  // |index| is signed, so the lower bound has to be checked too; a
+  // negative value would otherwise read before the start of the array.
+  if (index < 0 || index >= cue_entries_size_)
+    return NULL;
+
+  return cue_entries_[index];
+}
+
+// Returns the full size of the Cues element: the sizes of all stored cue
+// points plus the Cues master element header.
+uint64 Cues::Size() {
+  uint64 entries_size = 0;
+  int32 i = 0;
+  while (i < cue_entries_size_) {
+    entries_size += GetCueByIndex(i)->Size();
+    ++i;
+  }
+  return entries_size + EbmlMasterElementSize(kMkvCues, entries_size);
+}
+
+// Writes the Cues master element followed by every stored cue point.
+// Returns false if |writer| is NULL, if a cue point is missing, if any
+// write fails, if the writer reports a negative position, or if the number
+// of bytes written after the Cues header differs from the summed cue point
+// sizes.
+bool Cues::Write(IMkvWriter* writer) const {
+  if (writer == NULL)
+    return false;
+
+  // First pass: total size of all cue points.
+  uint64 entries_size = 0;
+  int32 i = 0;
+  while (i < cue_entries_size_) {
+    const CuePoint* const entry = GetCueByIndex(i);
+    if (entry == NULL)
+      return false;
+    entries_size += entry->Size();
+    ++i;
+  }
+
+  if (!WriteEbmlMasterElement(writer, kMkvCues, entries_size))
+    return false;
+
+  const int64 begin = writer->Position();
+  if (begin < 0)
+    return false;
+
+  // Second pass: emit each cue point.
+  for (int32 j = 0; j < cue_entries_size_; ++j) {
+    const CuePoint* const entry = GetCueByIndex(j);
+    if (!entry->Write(writer))
+      return false;
+  }
+
+  const int64 end = writer->Position();
+  if (end < 0)
+    return false;
+
+  return end - begin == static_cast<int64>(entries_size);
+}
+
+///////////////////////////////////////////////////////////////
+//
+// ContentEncAESSettings Class
+
+// Constructs the AES settings with the cipher mode set to kCTR.
+ContentEncAESSettings::ContentEncAESSettings() : cipher_mode_(kCTR) {}
+
+// Returns the full size of the ContentEncAESSettings element: master
+// element header plus payload.
+uint64 ContentEncAESSettings::Size() const {
+  const uint64 body = PayloadSize();
+  const uint64 header =
+      EbmlMasterElementSize(kMkvContentEncAESSettings, body);
+  return header + body;
+}
+
+// Writes the ContentEncAESSettings master element with its single child,
+// AESSettingsCipherMode.  Returns false if a write fails, if the writer
+// reports a negative position, or if the number of bytes written after the
+// header differs from PayloadSize().
+bool ContentEncAESSettings::Write(IMkvWriter* writer) const {
+  const uint64 expected_payload = PayloadSize();
+
+  if (!WriteEbmlMasterElement(writer, kMkvContentEncAESSettings,
+                              expected_payload)) {
+    return false;
+  }
+
+  const int64 begin = writer->Position();
+  if (begin < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, kMkvAESSettingsCipherMode, cipher_mode_))
+    return false;
+
+  const int64 end = writer->Position();
+  return end >= 0 && end - begin == static_cast<int64>(expected_payload);
+}
+
+// Returns the payload size: the size of the AESSettingsCipherMode element.
+uint64 ContentEncAESSettings::PayloadSize() const {
+  const uint64 cipher_mode_size =
+      EbmlElementSize(kMkvAESSettingsCipherMode, cipher_mode_);
+  return cipher_mode_size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// ContentEncoding Class
+
+// Constructs a content encoding with algorithm 5, order 0, scope 1 and
+// type 1, and with no encryption key id.
+// NOTE(review): the meaning of these numeric defaults comes from the
+// container specification and is not visible here -- confirm before
+// changing them.
+ContentEncoding::ContentEncoding()
+    : enc_algo_(5),
+      enc_key_id_(NULL),
+      encoding_order_(0),
+      encoding_scope_(1),
+      encoding_type_(1),
+      enc_key_id_length_(0) {}
+
+// Releases the encryption key id buffer.
+ContentEncoding::~ContentEncoding() { delete[] enc_key_id_; }
+
+// Replaces the encryption key id with a private copy of the |length| bytes
+// at |id|.  Returns false if |id| is NULL, if |length| is zero, or if the
+// copy cannot be allocated; the current key id and its length are left
+// unchanged in all of those cases.
+bool ContentEncoding::SetEncryptionID(const uint8* id, uint64 length) {
+  if (!id || length < 1)
+    return false;
+
+  // Build the replacement before touching the current key.  A failed
+  // allocation then cannot leave a NULL pointer paired with a stale
+  // length, and |id| may safely point into the current buffer.
+  const size_t byte_count = static_cast<size_t>(length);
+  uint8* const key_copy = new (std::nothrow) uint8[byte_count];  // NOLINT
+  if (!key_copy)
+    return false;
+
+  memcpy(key_copy, id, byte_count);
+
+  delete[] enc_key_id_;
+  enc_key_id_ = key_copy;
+  enc_key_id_length_ = length;
+
+  return true;
+}
+
+// Returns the full size of the ContentEncoding element: master element
+// header plus the encoding payload (which includes the encryption
+// section).
+uint64 ContentEncoding::Size() const {
+  const uint64 encryption_payload = EncryptionSize();
+  const uint64 encoding_payload = EncodingSize(0, encryption_payload);
+  const uint64 header =
+      EbmlMasterElementSize(kMkvContentEncoding, encoding_payload);
+
+  return header + encoding_payload;
+}
+
+// Writes the ContentEncoding master element: ContentEncodingOrder, Scope
+// and Type, then the ContentEncryption master with ContentEncAlgo,
+// ContentEncKeyID and the AES settings.  Returns false if any write fails,
+// if the writer reports a negative position, or if the total number of
+// bytes written (header included) differs from the computed element size.
+bool ContentEncoding::Write(IMkvWriter* writer) const {
+  const uint64 encryption_payload = EncryptionSize();
+  const uint64 encoding_payload = EncodingSize(0, encryption_payload);
+  const uint64 expected_bytes =
+      EbmlMasterElementSize(kMkvContentEncoding, encoding_payload) +
+      encoding_payload;
+
+  // Taken before the master element header, so the final check covers the
+  // header as well as the payload.
+  const int64 begin = writer->Position();
+  if (begin < 0)
+    return false;
+
+  const bool encoding_fields_written =
+      WriteEbmlMasterElement(writer, kMkvContentEncoding,
+                             encoding_payload) &&
+      WriteEbmlElement(writer, kMkvContentEncodingOrder, encoding_order_) &&
+      WriteEbmlElement(writer, kMkvContentEncodingScope, encoding_scope_) &&
+      WriteEbmlElement(writer, kMkvContentEncodingType, encoding_type_);
+  if (!encoding_fields_written)
+    return false;
+
+  const bool encryption_written =
+      WriteEbmlMasterElement(writer, kMkvContentEncryption,
+                             encryption_payload) &&
+      WriteEbmlElement(writer, kMkvContentEncAlgo, enc_algo_) &&
+      WriteEbmlElement(writer, kMkvContentEncKeyID, enc_key_id_,
+                       enc_key_id_length_) &&
+      enc_aes_settings_.Write(writer);
+  if (!encryption_written)
+    return false;
+
+  const int64 end = writer->Position();
+  return end >= 0 && end - begin == static_cast<int64>(expected_bytes);
+}
+
+// Returns the payload size of the ContentEncoding element: the order,
+// scope and type elements plus, when |encryption_size| is non-zero, the
+// ContentEncryption master (header and |encryption_size| bytes).
+// Compression is not supported: any non-zero |compresion_size| yields 0.
+uint64 ContentEncoding::EncodingSize(uint64 compresion_size,
+                                     uint64 encryption_size) const {
+  // TODO(fgalligan): Add support for compression settings.
+  if (compresion_size != 0)
+    return 0;
+
+  uint64 total = EbmlElementSize(kMkvContentEncodingOrder, encoding_order_);
+  total += EbmlElementSize(kMkvContentEncodingScope, encoding_scope_);
+  total += EbmlElementSize(kMkvContentEncodingType, encoding_type_);
+
+  if (encryption_size > 0) {
+    const uint64 encryption_header =
+        EbmlMasterElementSize(kMkvContentEncryption, encryption_size);
+    total += encryption_header + encryption_size;
+  }
+
+  return total;
+}
+
+// Returns the payload size of the ContentEncryption element: the key id
+// element, the algorithm element and the complete AES settings element.
+uint64 ContentEncoding::EncryptionSize() const {
+  const uint64 aes_settings_size = enc_aes_settings_.Size();
+  const uint64 key_id_size =
+      EbmlElementSize(kMkvContentEncKeyID, enc_key_id_, enc_key_id_length_);
+  const uint64 algo_size = EbmlElementSize(kMkvContentEncAlgo, enc_algo_);
+
+  return key_id_size + algo_size + aes_settings_size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Track Class
+
+// Constructs a track with no codec id, codec private data, language, name
+// or content encodings, and with all numeric fields zero except the UID,
+// which is obtained from MakeUID(seed).
+Track::Track(unsigned int* seed)
+    : codec_id_(NULL),
+      codec_private_(NULL),
+      language_(NULL),
+      max_block_additional_id_(0),
+      name_(NULL),
+      number_(0),
+      type_(0),
+      uid_(MakeUID(seed)),
+      codec_delay_(0),
+      seek_pre_roll_(0),
+      default_duration_(0),
+      codec_private_length_(0),
+      content_encoding_entries_(NULL),
+      content_encoding_entries_size_(0) {}
+
+// Releases the track's strings and codec private data, then deletes every
+// content encoding and the array that held them.
+Track::~Track() {
+  delete[] name_;
+  delete[] language_;
+  delete[] codec_private_;
+  delete[] codec_id_;
+
+  if (content_encoding_entries_ == NULL)
+    return;
+
+  uint32 i = 0;
+  while (i < content_encoding_entries_size_) {
+    delete content_encoding_entries_[i];
+    ++i;
+  }
+  delete[] content_encoding_entries_;
+}
+
+// Appends a default-constructed ContentEncoding to the track.  The pointer
+// array is reallocated with exactly one more slot each time.  Returns
+// false, leaving the track unchanged, if either allocation fails.
+bool Track::AddContentEncoding() {
+  const uint32 grown_count = content_encoding_entries_size_ + 1;
+
+  ContentEncoding** const grown =
+      new (std::nothrow) ContentEncoding*[grown_count];  // NOLINT
+  if (grown == NULL)
+    return false;
+
+  ContentEncoding* const added =
+      new (std::nothrow) ContentEncoding();  // NOLINT
+  if (added == NULL) {
+    delete[] grown;
+    return false;
+  }
+
+  // Carry the existing entries over, then place the new one last.
+  uint32 i = 0;
+  while (i < content_encoding_entries_size_) {
+    grown[i] = content_encoding_entries_[i];
+    ++i;
+  }
+  grown[content_encoding_entries_size_] = added;
+
+  delete[] content_encoding_entries_;
+
+  content_encoding_entries_ = grown;
+  content_encoding_entries_size_ = grown_count;
+  return true;
+}
+
+// Returns the content encoding stored at |index|, or NULL when the track
+// has none or |index| is past the last entry.
+ContentEncoding* Track::GetContentEncodingByIndex(uint32 index) const {
+  const bool in_range = content_encoding_entries_ != NULL &&
+                        index < content_encoding_entries_size_;
+  if (!in_range)
+    return NULL;
+
+  return content_encoding_entries_[index];
+}
+
+// Returns the payload size of the TrackEntry element as far as the base
+// class is concerned: number, UID and type always; codec id, codec private
+// data, language, name, max block addition id, codec delay, seek pre-roll
+// and default duration only when set (non-NULL / non-zero); and the
+// ContentEncodings master when the track has content encodings.
+uint64 Track::PayloadSize() const {
+  uint64 total = EbmlElementSize(kMkvTrackNumber, number_);
+  total += EbmlElementSize(kMkvTrackUID, uid_);
+  total += EbmlElementSize(kMkvTrackType, type_);
+
+  // Optional buffers and strings.
+  if (codec_id_)
+    total += EbmlElementSize(kMkvCodecID, codec_id_);
+  if (codec_private_) {
+    total += EbmlElementSize(kMkvCodecPrivate, codec_private_,
+                             codec_private_length_);
+  }
+  if (language_)
+    total += EbmlElementSize(kMkvLanguage, language_);
+  if (name_)
+    total += EbmlElementSize(kMkvName, name_);
+
+  // Optional numeric fields.
+  if (max_block_additional_id_)
+    total += EbmlElementSize(kMkvMaxBlockAdditionID, max_block_additional_id_);
+  if (codec_delay_)
+    total += EbmlElementSize(kMkvCodecDelay, codec_delay_);
+  if (seek_pre_roll_)
+    total += EbmlElementSize(kMkvSeekPreRoll, seek_pre_roll_);
+  if (default_duration_)
+    total += EbmlElementSize(kMkvDefaultDuration, default_duration_);
+
+  if (content_encoding_entries_size_ == 0)
+    return total;
+
+  uint64 encodings_payload = 0;
+  uint32 i = 0;
+  while (i < content_encoding_entries_size_) {
+    encodings_payload += content_encoding_entries_[i]->Size();
+    ++i;
+  }
+
+  total += EbmlMasterElementSize(kMkvContentEncodings, encodings_payload) +
+           encodings_payload;
+  return total;
+}
+
+// Returns the full size of the TrackEntry element: master element header
+// plus payload.
+uint64 Track::Size() const {
+  const uint64 payload = PayloadSize();
+  const uint64 header = EbmlMasterElementSize(kMkvTrackEntry, payload);
+  return payload + header;
+}
+
+// Writes the TrackEntry master element and the base-class track fields.
+// Order on the wire: number, UID, type, then the optional numeric fields
+// (max block addition id, codec delay, seek pre-roll, default duration),
+// then codec id, codec private data, language and name, and finally the
+// ContentEncodings master when the track has content encodings.
+// Returns false if |writer| is NULL, if the type or codec id is unset, if
+// any write fails, if the writer reports a negative position, or if the
+// bytes written for the fields before ContentEncodings differ from their
+// computed size.
+bool Track::Write(IMkvWriter* writer) const {
+  if (!writer)
+    return false;
+
+  // mandatory elements without a default value.
+  if (!type_ || !codec_id_)
+    return false;
+
+  // |payload_size| may be bigger than what is written out in this function
+  // because derived classes may write out more data in the Track element
+  // (VideoTrack::PayloadSize() adds the Video element, for example).
+  const uint64 payload_size = PayloadSize();
+
+  if (!WriteEbmlMasterElement(writer, kMkvTrackEntry, payload_size))
+    return false;
+
+  // |size| covers only the fields this function writes before the
+  // ContentEncodings element; it is used for the consistency check below.
+  uint64 size = EbmlElementSize(kMkvTrackNumber, number_);
+  size += EbmlElementSize(kMkvTrackUID, uid_);
+  size += EbmlElementSize(kMkvTrackType, type_);
+  if (codec_id_)
+    size += EbmlElementSize(kMkvCodecID, codec_id_);
+  if (codec_private_)
+    size += EbmlElementSize(kMkvCodecPrivate, codec_private_,
+                            codec_private_length_);
+  if (language_)
+    size += EbmlElementSize(kMkvLanguage, language_);
+  if (name_)
+    size += EbmlElementSize(kMkvName, name_);
+  if (max_block_additional_id_)
+    size += EbmlElementSize(kMkvMaxBlockAdditionID, max_block_additional_id_);
+  if (codec_delay_)
+    size += EbmlElementSize(kMkvCodecDelay, codec_delay_);
+  if (seek_pre_roll_)
+    size += EbmlElementSize(kMkvSeekPreRoll, seek_pre_roll_);
+  if (default_duration_)
+    size += EbmlElementSize(kMkvDefaultDuration, default_duration_);
+
+  // Position right after the TrackEntry header.
+  const int64 payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  // Mandatory fields.
+  if (!WriteEbmlElement(writer, kMkvTrackNumber, number_))
+    return false;
+  if (!WriteEbmlElement(writer, kMkvTrackUID, uid_))
+    return false;
+  if (!WriteEbmlElement(writer, kMkvTrackType, type_))
+    return false;
+  // Optional numeric fields are written only when non-zero.
+  if (max_block_additional_id_) {
+    if (!WriteEbmlElement(writer, kMkvMaxBlockAdditionID,
+                          max_block_additional_id_)) {
+      return false;
+    }
+  }
+  if (codec_delay_) {
+    if (!WriteEbmlElement(writer, kMkvCodecDelay, codec_delay_))
+      return false;
+  }
+  if (seek_pre_roll_) {
+    if (!WriteEbmlElement(writer, kMkvSeekPreRoll, seek_pre_roll_))
+      return false;
+  }
+  if (default_duration_) {
+    if (!WriteEbmlElement(writer, kMkvDefaultDuration, default_duration_))
+      return false;
+  }
+  // Optional strings and buffers are written only when non-NULL.
+  if (codec_id_) {
+    if (!WriteEbmlElement(writer, kMkvCodecID, codec_id_))
+      return false;
+  }
+  if (codec_private_) {
+    if (!WriteEbmlElement(writer, kMkvCodecPrivate, codec_private_,
+                          codec_private_length_))
+      return false;
+  }
+  if (language_) {
+    if (!WriteEbmlElement(writer, kMkvLanguage, language_))
+      return false;
+  }
+  if (name_) {
+    if (!WriteEbmlElement(writer, kMkvName, name_))
+      return false;
+  }
+
+  // The bytes written so far must match the size computed above.
+  int64 stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64>(size))
+    return false;
+
+  if (content_encoding_entries_size_ > 0) {
+    // Payload of the ContentEncodings master: the sum of all encodings.
+    uint64 content_encodings_size = 0;
+    for (uint32 i = 0; i < content_encoding_entries_size_; ++i) {
+      ContentEncoding* const encoding = content_encoding_entries_[i];
+      content_encodings_size += encoding->Size();
+    }
+
+    if (!WriteEbmlMasterElement(writer, kMkvContentEncodings,
+                                content_encodings_size))
+      return false;
+
+    for (uint32 i = 0; i < content_encoding_entries_size_; ++i) {
+      ContentEncoding* const encoding = content_encoding_entries_[i];
+      if (!encoding->Write(writer))
+        return false;
+    }
+  }
+
+  // Only the writer's position is validated here; the size of the
+  // ContentEncodings section is not re-checked.
+  stop_position = writer->Position();
+  if (stop_position < 0)
+    return false;
+  return true;
+}
+
+// Replaces the codec private data with a private copy of the |length|
+// bytes at |codec_private|.  Returns false if |codec_private| is NULL, if
+// |length| is zero, or if the copy cannot be allocated; the current data
+// and its length are left unchanged in all of those cases.
+bool Track::SetCodecPrivate(const uint8* codec_private, uint64 length) {
+  if (!codec_private || length < 1)
+    return false;
+
+  // Build the replacement before touching the current data.  A failed
+  // allocation then cannot leave a NULL pointer paired with a stale
+  // length, and |codec_private| may safely point into the current buffer.
+  const size_t byte_count = static_cast<size_t>(length);
+  uint8* const data_copy = new (std::nothrow) uint8[byte_count];  // NOLINT
+  if (!data_copy)
+    return false;
+
+  memcpy(data_copy, codec_private, byte_count);
+
+  delete[] codec_private_;
+  codec_private_ = data_copy;
+  codec_private_length_ = length;
+
+  return true;
+}
+
+// Replaces the track's codec id with a private copy of |codec_id|.  A NULL
+// |codec_id| is ignored.  If the copy cannot be allocated the codec id is
+// left NULL.
+void Track::set_codec_id(const char* codec_id) {
+  if (!codec_id)
+    return;
+
+  // Build the copy before releasing the current string, so that passing
+  // the track's own codec id back in does not read freed memory.
+  const size_t length = strlen(codec_id) + 1;  // Includes the terminator.
+  char* const copy = new (std::nothrow) char[length];  // NOLINT
+  if (copy)
+    memcpy(copy, codec_id, length);
+
+  delete[] codec_id_;
+  codec_id_ = copy;  // NULL when the allocation failed.
+}
+
+// TODO(fgalligan): Vet the language parameter.
+// Replaces the track's language with a private copy of |language|.  A NULL
+// |language| is ignored.  If the copy cannot be allocated the language is
+// left NULL.  The string is stored as given, without validation.
+void Track::set_language(const char* language) {
+  if (!language)
+    return;
+
+  // Build the copy before releasing the current string, so that passing
+  // the track's own language back in does not read freed memory.
+  const size_t length = strlen(language) + 1;  // Includes the terminator.
+  char* const copy = new (std::nothrow) char[length];  // NOLINT
+  if (copy)
+    memcpy(copy, language, length);
+
+  delete[] language_;
+  language_ = copy;  // NULL when the allocation failed.
+}
+
+// Replaces the track's name with a private copy of |name|.  A NULL |name|
+// is ignored.  If the copy cannot be allocated the name is left NULL.
+void Track::set_name(const char* name) {
+  if (!name)
+    return;
+
+  // Build the copy before releasing the current string, so that passing
+  // the track's own name back in does not read freed memory.
+  const size_t length = strlen(name) + 1;  // Includes the terminator.
+  char* const copy = new (std::nothrow) char[length];  // NOLINT
+  if (copy)
+    memcpy(copy, name, length);
+
+  delete[] name_;
+  name_ = copy;  // NULL when the allocation failed.
+}
+
+///////////////////////////////////////////////////////////////
+//
+// VideoTrack Class
+
+// Constructs a video track: forwards |seed| to the Track constructor and
+// zeroes every video-specific field (dimensions, display size, cropping,
+// frame rate, stereo mode and alpha mode).
+VideoTrack::VideoTrack(unsigned int* seed)
+    : Track(seed),
+      display_height_(0),
+      display_width_(0),
+      crop_left_(0),
+      crop_right_(0),
+      crop_top_(0),
+      crop_bottom_(0),
+      frame_rate_(0.0),
+      height_(0),
+      stereo_mode_(0),
+      alpha_mode_(0),
+      width_(0) {}
+
+// Destructor: the video track releases nothing of its own here.
+VideoTrack::~VideoTrack() {}
+
+// Sets the stereo mode.  Only kMono, kSideBySideLeftIsFirst,
+// kTopBottomRightIsFirst, kTopBottomLeftIsFirst and
+// kSideBySideRightIsFirst are accepted; any other value is rejected with
+// false and the current mode is kept.
+bool VideoTrack::SetStereoMode(uint64 stereo_mode) {
+  const bool recognized = stereo_mode == kMono ||
+                          stereo_mode == kSideBySideLeftIsFirst ||
+                          stereo_mode == kTopBottomRightIsFirst ||
+                          stereo_mode == kTopBottomLeftIsFirst ||
+                          stereo_mode == kSideBySideRightIsFirst;
+  if (!recognized)
+    return false;
+
+  stereo_mode_ = stereo_mode;
+  return true;
+}
+
+// Sets the alpha mode.  Only kNoAlpha and kAlpha are accepted; any other
+// value is rejected with false and the current mode is kept.
+bool VideoTrack::SetAlphaMode(uint64 alpha_mode) {
+  const bool recognized = alpha_mode == kNoAlpha || alpha_mode == kAlpha;
+  if (!recognized)
+    return false;
+
+  alpha_mode_ = alpha_mode;
+  return true;
+}
+
+// Returns the TrackEntry payload size for a video track: the base-class
+// payload plus the complete Video master element (header and contents).
+uint64 VideoTrack::PayloadSize() const {
+  const uint64 base_payload = Track::PayloadSize();
+
+  const uint64 video_payload = VideoPayloadSize();
+  const uint64 video_header = EbmlMasterElementSize(kMkvVideo, video_payload);
+
+  return base_payload + video_payload + video_header;
+}
+
+// Writes the track: first the base-class fields via Track::Write(), then
+// the Video master element.  Pixel width and height are always written;
+// display size, cropping, stereo mode, alpha mode and frame rate are
+// written only when set to a value above zero / kMono / kNoAlpha.
+// Returns false if the base-class write or any element write fails, if the
+// writer reports a negative position, or if the bytes written after the
+// Video header differ from VideoPayloadSize().
+bool VideoTrack::Write(IMkvWriter* writer) const {
+  if (!Track::Write(writer))
+    return false;
+
+  const uint64 size = VideoPayloadSize();
+
+  if (!WriteEbmlMasterElement(writer, kMkvVideo, size))
+    return false;
+
+  // Position right after the Video header.
+  const int64 payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  // Mandatory dimensions.
+  if (!WriteEbmlElement(writer, kMkvPixelWidth, width_))
+    return false;
+  if (!WriteEbmlElement(writer, kMkvPixelHeight, height_))
+    return false;
+  // Optional fields: skipped while they hold their zero default.
+  if (display_width_ > 0) {
+    if (!WriteEbmlElement(writer, kMkvDisplayWidth, display_width_))
+      return false;
+  }
+  if (display_height_ > 0) {
+    if (!WriteEbmlElement(writer, kMkvDisplayHeight, display_height_))
+      return false;
+  }
+  if (crop_left_ > 0) {
+    if (!WriteEbmlElement(writer, kMkvPixelCropLeft, crop_left_))
+      return false;
+  }
+  if (crop_right_ > 0) {
+    if (!WriteEbmlElement(writer, kMkvPixelCropRight, crop_right_))
+      return false;
+  }
+  if (crop_top_ > 0) {
+    if (!WriteEbmlElement(writer, kMkvPixelCropTop, crop_top_))
+      return false;
+  }
+  if (crop_bottom_ > 0) {
+    if (!WriteEbmlElement(writer, kMkvPixelCropBottom, crop_bottom_))
+      return false;
+  }
+  if (stereo_mode_ > kMono) {
+    if (!WriteEbmlElement(writer, kMkvStereoMode, stereo_mode_))
+      return false;
+  }
+  if (alpha_mode_ > kNoAlpha) {
+    if (!WriteEbmlElement(writer, kMkvAlphaMode, alpha_mode_))
+      return false;
+  }
+  if (frame_rate_ > 0.0) {
+    // The frame rate is narrowed to float before being written.
+    if (!WriteEbmlElement(writer, kMkvFrameRate,
+                          static_cast<float>(frame_rate_))) {
+      return false;
+    }
+  }
+
+  // The bytes written must match the size announced in the Video header.
+  const int64 stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64>(size)) {
+    return false;
+  }
+
+  return true;
+}
+
+// Byte size of the Video element's children; mirrors VideoTrack::Write().
+uint64 VideoTrack::VideoPayloadSize() const {
+  uint64 total = EbmlElementSize(kMkvPixelWidth, width_) +
+                 EbmlElementSize(kMkvPixelHeight, height_);
+  if (display_width_ > 0)
+    total += EbmlElementSize(kMkvDisplayWidth, display_width_);
+  if (display_height_ > 0)
+    total += EbmlElementSize(kMkvDisplayHeight, display_height_);
+  if (crop_left_ > 0)
+    total += EbmlElementSize(kMkvPixelCropLeft, crop_left_);
+  if (crop_right_ > 0)
+    total += EbmlElementSize(kMkvPixelCropRight, crop_right_);
+  if (crop_top_ > 0)
+    total += EbmlElementSize(kMkvPixelCropTop, crop_top_);
+  if (crop_bottom_ > 0)
+    total += EbmlElementSize(kMkvPixelCropBottom, crop_bottom_);
+  if (stereo_mode_ > kMono)
+    total += EbmlElementSize(kMkvStereoMode, stereo_mode_);
+  if (alpha_mode_ > kNoAlpha)
+    total += EbmlElementSize(kMkvAlphaMode, alpha_mode_);
+  if (frame_rate_ > 0.0)
+    total += EbmlElementSize(kMkvFrameRate, static_cast<float>(frame_rate_));
+  return total;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// AudioTrack Class
+
+AudioTrack::AudioTrack(unsigned int* seed)  // |seed| goes to the Track base.
+    : Track(seed), bit_depth_(0), channels_(1), sample_rate_(0.0) {}
+
+AudioTrack::~AudioTrack() {}  // Empty body: nothing is released here.
+
+// Track payload plus the Audio master element; BitDepth counts only if > 0.
+uint64 AudioTrack::PayloadSize() const {
+  const uint64 track_bytes = Track::PayloadSize();
+  const float rate = static_cast<float>(sample_rate_);
+  uint64 audio_bytes = EbmlElementSize(kMkvSamplingFrequency, rate);
+  audio_bytes += EbmlElementSize(kMkvChannels, channels_);
+  if (bit_depth_ > 0)
+    audio_bytes += EbmlElementSize(kMkvBitDepth, bit_depth_);
+  const uint64 header_bytes = EbmlMasterElementSize(kMkvAudio, audio_bytes);
+
+  return track_bytes + audio_bytes + header_bytes;
+}
+
+// Writes the Track entry, then the Audio master element holding the
+// sampling frequency, channel count and (only when > 0) the bit depth.
+// Fails if a write fails or the byte count differs from the computed size.
+bool AudioTrack::Write(IMkvWriter* writer) const {
+  if (!Track::Write(writer))
+    return false;
+
+  const float rate = static_cast<float>(sample_rate_);
+  const bool has_bit_depth = bit_depth_ > 0;
+
+  // Size of the Audio element's children.
+  uint64 expected_bytes = EbmlElementSize(kMkvSamplingFrequency, rate);
+  expected_bytes += EbmlElementSize(kMkvChannels, channels_);
+  if (has_bit_depth)
+    expected_bytes += EbmlElementSize(kMkvBitDepth, bit_depth_);
+
+  if (!WriteEbmlMasterElement(writer, kMkvAudio, expected_bytes))
+    return false;
+
+  const int64 begin = writer->Position();
+  if (begin < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, kMkvSamplingFrequency, rate) ||
+      !WriteEbmlElement(writer, kMkvChannels, channels_))
+    return false;
+  if (has_bit_depth && !WriteEbmlElement(writer, kMkvBitDepth, bit_depth_))
+    return false;
+
+  const int64 end = writer->Position();
+  if (end < 0)
+    return false;
+  return end - begin == static_cast<int64>(expected_bytes);
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Tracks Class
+
+const char Tracks::kOpusCodecId[] = "A_OPUS";  // Codec ID strings follow.
+const char Tracks::kVorbisCodecId[] = "A_VORBIS";
+const char Tracks::kVp8CodecId[] = "V_VP8";
+const char Tracks::kVp9CodecId[] = "V_VP9";
+const char Tracks::kVp10CodecId[] = "V_VP10";
+
+Tracks::Tracks() : track_entries_(NULL), track_entries_size_(0) {}  // Empty.
+
+// Deletes every Track held in track_entries_ and then the array itself.
+Tracks::~Tracks() {
+  if (!track_entries_)
+    return;
+
+  for (uint32 idx = 0; idx < track_entries_size_; ++idx)
+    delete track_entries_[idx];
+  delete[] track_entries_;
+}
+
+// Adds |track| to the list; the Tracks destructor deletes stored tracks.
+// |number| == 0 asks for an automatically chosen number, otherwise |number|
+// must be an unused value in [1, 126]. Returns false, leaving the list
+// untouched, for a NULL |track|, an invalid or duplicate number, or when
+// the array cannot be grown.
+bool Tracks::AddTrack(Track* track, int32 number) {
+  // This muxer only supports track numbers in the range [1, 126], in
+  // order to be able (to use Matroska integer representation) to
+  // serialize the block header (of which the track number is a part)
+  // for a frame using exactly 4 bytes.
+  const uint32 kMaxTrackNumber = 0x7E;
+
+  if (!track || number < 0 || number > static_cast<int32>(kMaxTrackNumber))
+    return false;
+
+  const uint32 count = track_entries_size_ + 1;
+  uint32 track_num = number;
+
+  if (track_num > 0) {
+    // Check to make sure a track does not already have |track_num|.
+    for (uint32 i = 0; i < track_entries_size_; ++i) {
+      if (track_entries_[i]->number() == track_num)
+        return false;
+    }
+  } else {
+    // Pick the first unused track number at or above |count|.
+    track_num = count;
+    bool in_use;
+    do {
+      in_use = false;
+      for (uint32 i = 0; i < track_entries_size_; ++i) {
+        if (track_entries_[i]->number() == track_num) {
+          ++track_num;
+          in_use = true;
+          break;
+        }
+      }
+    } while (in_use);
+    // An automatically chosen number must respect the same upper bound.
+    if (track_num > kMaxTrackNumber)
+      return false;
+  }
+
+  // Grow the array by one slot; nothing is modified if allocation fails.
+  Track** const track_entries = new (std::nothrow) Track*[count];  // NOLINT
+  if (!track_entries)
+    return false;
+
+  for (uint32 i = 0; i < track_entries_size_; ++i)
+    track_entries[i] = track_entries_[i];
+  delete[] track_entries_;
+
+  track->set_number(track_num);
+  track_entries[track_entries_size_] = track;
+  track_entries_ = track_entries;
+  track_entries_size_ = count;
+  return true;
+}
+
+// Returns the track stored at |index|, or NULL when there is no array or
+// |index| is past the last entry.
+const Track* Tracks::GetTrackByIndex(uint32 index) const {
+  const bool in_range = track_entries_ != NULL && index < track_entries_size_;
+  if (!in_range)
+    return NULL;
+
+  return track_entries_[index];
+}
+
+// Linear search by number(); returns NULL when no stored track matches.
+Track* Tracks::GetTrackByNumber(uint64 track_number) const {
+  int32 remaining = track_entries_size();
+  for (Track* const* it = track_entries_; remaining > 0; ++it, --remaining) {
+    if ((*it)->number() == track_number)
+      return *it;
+  }
+  return NULL;
+}
+
+// True only if a track numbered |track_number| exists and is an audio
+// track; an unknown number yields false instead of dereferencing NULL.
+bool Tracks::TrackIsAudio(uint64 track_number) const {
+  const Track* const track = GetTrackByNumber(track_number);
+  if (!track)
+    return false;
+  return track->type() == kAudio;
+}
+
+// True only if a track numbered |track_number| exists and is a video
+// track; an unknown number yields false instead of dereferencing NULL.
+bool Tracks::TrackIsVideo(uint64 track_number) const {
+  const Track* const track = GetTrackByNumber(track_number);
+  if (!track)
+    return false;
+  return track->type() == kVideo;
+}
+
+// Writes the Tracks master element followed by every stored track. Fails
+// if an entry is missing, a write fails, or the bytes written differ from
+// the summed Track::Size() values.
+bool Tracks::Write(IMkvWriter* writer) const {
+  const int32 total = track_entries_size();
+  // First pass: add up the serialized size of all tracks.
+  uint64 expected_bytes = 0;
+  for (int32 idx = 0; idx < total; ++idx) {
+    const Track* const entry = GetTrackByIndex(idx);
+    if (!entry)
+      return false;
+    expected_bytes += entry->Size();
+  }
+
+  if (!WriteEbmlMasterElement(writer, kMkvTracks, expected_bytes))
+    return false;
+
+  const int64 begin = writer->Position();
+  if (begin < 0)
+    return false;
+
+  // Second pass: emit each track.
+  for (int32 idx = 0; idx < total; ++idx) {
+    if (!GetTrackByIndex(idx)->Write(writer))
+      return false;
+  }
+
+  const int64 end = writer->Position();
+  if (end < 0)
+    return false;
+  return end - begin == static_cast<int64>(expected_bytes);
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Chapter Class
+
+bool Chapter::set_id(const char* id) { return StrCpy(id, &id_); }  // Via StrCpy.
+
+void Chapter::set_time(const Segment& segment, uint64 start_ns, uint64 end_ns) {
+  const uint64 scale = segment.GetSegmentInfo()->timecode_scale();
+  if (scale == 0) return;  // Avoid dividing by zero; timecodes stay as-is.
+  start_timecode_ = start_ns / scale;
+  end_timecode_ = end_ns / scale;
+}
+
+// Appends a Display and sets its title, language and country. The slot is
+// counted before the strings are set, so it stays in the array even when
+// one of the setters fails and false is returned.
+bool Chapter::add_string(const char* title, const char* language,
+                         const char* country) {
+  // Make room for one more Display.
+  if (!ExpandDisplaysArray())
+    return false;
+
+  Display& display = displays_[displays_count_];
+  ++displays_count_;
+  // Init() nulls the string pointers before the setters run.
+  display.Init();
+
+  // Short-circuiting keeps the order: title, then language, then country.
+  return display.set_title(title) &&
+         display.set_language(language) &&
+         display.set_country(country);
+}
+
+Chapter::Chapter() {
+  // This ctor only constructs the object.  Proper initialization is
+  // done in Init() (called in Chapters::AddChapter()).  The only
+  // reason we bother implementing this ctor is because we had to
+  // declare it as private (along with the dtor), in order to prevent
+  // clients from creating Chapter instances (a privilege we grant
+  // only to the Chapters class).  Doing no initialization here also
+  // means that creating arrays of chapter objects is more efficient,
+  // because we only initialize each new chapter object as it becomes
+  // active on the array.
+}
+
+Chapter::~Chapter() {}  // Empty body; Clear() is what releases the members.
+
+// Resets the chapter to an empty state and assigns a uid from MakeUID(seed).
+void Chapter::Init(unsigned int* seed) {
+  uid_ = MakeUID(seed);
+  // No id string and no displays yet.
+  id_ = NULL;
+  displays_ = NULL;
+  displays_size_ = displays_count_ = 0;
+  start_timecode_ = end_timecode_ = 0;
+}
+
+void Chapter::ShallowCopy(Chapter* dst) const {  // Pointers are shared.
+  dst->uid_ = uid_;
+  dst->id_ = id_;
+  dst->displays_ = displays_;
+  dst->displays_count_ = displays_count_;
+  dst->displays_size_ = displays_size_;
+  dst->end_timecode_ = end_timecode_;
+  dst->start_timecode_ = start_timecode_;
+}
+
+// Resets id_ through StrCpy(NULL, ...), clears every Display, then frees
+// the displays array and zeroes its size.
+void Chapter::Clear() {
+  StrCpy(NULL, &id_);
+
+  // Walk backwards so displays_count_ reaches zero as entries are cleared.
+  for (; displays_count_ > 0; --displays_count_)
+    displays_[displays_count_ - 1].Clear();
+
+  delete[] displays_;
+  displays_ = NULL;
+  displays_size_ = 0;
+}
+
+// Ensures displays_ has a free slot, doubling the capacity (or starting at
+// one) when it is full. Existing entries are copied shallowly. Returns
+// false only if the new array cannot be allocated.
+bool Chapter::ExpandDisplaysArray() {
+  if (displays_size_ > displays_count_)
+    return true;  // a free slot already exists
+
+  const int capacity = displays_size_ ? 2 * displays_size_ : 1;
+
+  Display* const grown = new (std::nothrow) Display[capacity];  // NOLINT
+  if (!grown)
+    return false;
+
+  for (int idx = 0; idx < displays_count_; ++idx)
+    grown[idx] = displays_[idx];  // shallow copy
+
+  delete[] displays_;
+  displays_ = grown;
+  displays_size_ = capacity;
+  return true;
+}
+
+// Serializes this chapter as a ChapterAtom element. With a NULL |writer|
+// nothing is written and only the atom's total size is returned.
+// Otherwise the atom is written and its size is returned, or 0 if any
+// write fails or the byte count does not match.
+uint64 Chapter::WriteAtom(IMkvWriter* writer) const {
+  // Size of the atom's children: the fixed fields plus every display.
+  uint64 children_size = EbmlElementSize(kMkvChapterStringUID, id_);
+  children_size += EbmlElementSize(kMkvChapterUID, uid_);
+  children_size += EbmlElementSize(kMkvChapterTimeStart, start_timecode_);
+  children_size += EbmlElementSize(kMkvChapterTimeEnd, end_timecode_);
+
+  for (int idx = 0; idx < displays_count_; ++idx) {
+    children_size += displays_[idx].WriteDisplay(NULL);
+  }
+
+  const uint64 atom_size =
+      children_size + EbmlMasterElementSize(kMkvChapterAtom, children_size);
+
+  if (writer == NULL)
+    return atom_size;
+
+  // Remember where the atom starts so the byte count can be verified.
+  const int64 begin = writer->Position();
+
+  if (!WriteEbmlMasterElement(writer, kMkvChapterAtom, children_size))
+    return 0;
+
+  // Fixed fields, in the same order used for the size computation.
+  if (!WriteEbmlElement(writer, kMkvChapterStringUID, id_) ||
+      !WriteEbmlElement(writer, kMkvChapterUID, uid_) ||
+      !WriteEbmlElement(writer, kMkvChapterTimeStart, start_timecode_) ||
+      !WriteEbmlElement(writer, kMkvChapterTimeEnd, end_timecode_)) {
+    return 0;
+  }
+
+  // One ChapterDisplay per entry.
+  for (int idx = 0; idx < displays_count_; ++idx) {
+    if (!displays_[idx].WriteDisplay(writer))
+      return 0;
+  }
+
+  // The size check only applies when the position did not move backwards.
+  const int64 end = writer->Position();
+  if (end >= begin && uint64(end - begin) != atom_size)
+    return 0;
+
+  return atom_size;
+}
+
+// Marks all three strings as unset.
+void Chapter::Display::Init() {
+  title_ = language_ = country_ = NULL;
+}
+
+// Hands NULL to StrCpy for each string member.
+void Chapter::Display::Clear() {
+  StrCpy(NULL, &title_);
+  StrCpy(NULL, &language_);
+  StrCpy(NULL, &country_);
+}
+
+bool Chapter::Display::set_title(const char* title) {
+  return StrCpy(title, &title_);  // add_string() treats false as failure.
+}
+
+bool Chapter::Display::set_language(const char* language) {
+  return StrCpy(language, &language_);  // Same contract as set_title().
+}
+
+bool Chapter::Display::set_country(const char* country) {
+  return StrCpy(country, &country_);  // Same contract as set_title().
+}
+
+// Serializes this display as a ChapterDisplay element. A NULL |writer|
+// only computes the size. Language and country are included only when
+// set. Returns the element size, or 0 if writing fails.
+uint64 Chapter::Display::WriteDisplay(IMkvWriter* writer) const {
+  const bool has_language = language_ != NULL;
+  const bool has_country = country_ != NULL;
+
+  uint64 children_size = EbmlElementSize(kMkvChapString, title_);
+  if (has_language)
+    children_size += EbmlElementSize(kMkvChapLanguage, language_);
+  if (has_country)
+    children_size += EbmlElementSize(kMkvChapCountry, country_);
+
+  const uint64 display_size =
+      children_size + EbmlMasterElementSize(kMkvChapterDisplay, children_size);
+
+  if (writer == NULL)
+    return display_size;
+
+  const int64 begin = writer->Position();
+
+  if (!WriteEbmlMasterElement(writer, kMkvChapterDisplay, children_size))
+    return 0;
+
+  if (!WriteEbmlElement(writer, kMkvChapString, title_))
+    return 0;
+
+  if (has_language && !WriteEbmlElement(writer, kMkvChapLanguage, language_))
+    return 0;
+
+  if (has_country && !WriteEbmlElement(writer, kMkvChapCountry, country_))
+    return 0;
+
+  // The size check only applies when the position did not move backwards.
+  const int64 end = writer->Position();
+  if (end >= begin && uint64(end - begin) != display_size)
+    return 0;
+
+  return display_size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Chapters Class
+
+Chapters::Chapters() : chapters_size_(0), chapters_count_(0), chapters_(NULL) {}
+
+// Clears every active chapter (last to first), then frees the array.
+Chapters::~Chapters() {
+  for (; chapters_count_ > 0; --chapters_count_)
+    chapters_[chapters_count_ - 1].Clear();
+
+  delete[] chapters_;
+  chapters_ = NULL;
+}
+
+// Number of chapters added so far.
+int Chapters::Count() const { return chapters_count_; }
+
+// Claims the next slot, Init()s it with |seed|; NULL if growth fails.
+Chapter* Chapters::AddChapter(unsigned int* seed) {
+  if (!ExpandChaptersArray())
+    return NULL;
+  Chapter* const slot = chapters_ + chapters_count_;
+  ++chapters_count_;
+  slot->Init(seed);
+  return slot;
+}
+
+// Writes the Chapters master element wrapping a single edition. Fails on
+// a NULL |writer|, a failed write, or a byte count that differs from the
+// precomputed edition size.
+bool Chapters::Write(IMkvWriter* writer) const {
+  if (writer == NULL)
+    return false;
+
+  const uint64 edition_bytes = WriteEdition(NULL);  // size-only pass
+  if (!WriteEbmlMasterElement(writer, kMkvChapters, edition_bytes))
+    return false;
+
+  const int64 begin = writer->Position();
+  if (WriteEdition(writer) == 0)  // error
+    return false;
+
+  const int64 end = writer->Position();
+
+  // Only a forward-moving position is compared against the expected size.
+  const bool mismatch = end >= begin && uint64(end - begin) != edition_bytes;
+  return !mismatch;
+}
+
+// Ensures chapters_ has a free slot, doubling capacity (or starting at one)
+// when full. Existing chapters are moved with ShallowCopy(). Returns false
+// only if the new array cannot be allocated.
+bool Chapters::ExpandChaptersArray() {
+  if (chapters_size_ > chapters_count_)
+    return true;  // a free slot already exists
+
+  const int capacity = chapters_size_ ? 2 * chapters_size_ : 1;
+
+  Chapter* const grown = new (std::nothrow) Chapter[capacity];  // NOLINT
+  if (!grown)
+    return false;
+
+  for (int idx = 0; idx < chapters_count_; ++idx)
+    chapters_[idx].ShallowCopy(&grown[idx]);
+
+  // ~Chapter() is empty, so freeing the old array keeps shared data alive.
+  delete[] chapters_;
+  chapters_ = grown;
+  chapters_size_ = capacity;
+
+  return true;
+}
+
+// Serializes all chapters inside one EditionEntry element. With a NULL
+// |writer| only the size is computed. Returns the edition size, or 0 if
+// a write fails or the byte count does not match.
+uint64 Chapters::WriteEdition(IMkvWriter* writer) const {
+  // Size-only pass over every chapter atom.
+  uint64 atoms_size = 0;
+  for (int idx = 0; idx < chapters_count_; ++idx) {
+    atoms_size += chapters_[idx].WriteAtom(NULL);
+  }
+
+  const uint64 edition_size =
+      atoms_size + EbmlMasterElementSize(kMkvEditionEntry, atoms_size);
+
+  if (writer == NULL)  // return size only
+    return edition_size;
+
+  const int64 begin = writer->Position();
+
+  if (!WriteEbmlMasterElement(writer, kMkvEditionEntry, atoms_size))
+    return 0;  // error
+
+  // Emit each chapter atom; a zero size signals failure.
+  for (int idx = 0; idx < chapters_count_; ++idx) {
+    if (chapters_[idx].WriteAtom(writer) == 0)  // error
+      return 0;
+  }
+
+  // Only a forward-moving position is compared against the expected size.
+  const int64 end = writer->Position();
+  if (end >= begin && uint64(end - begin) != edition_size)
+    return 0;
+
+  return edition_size;
+}
+
+// Tag Class
+
+// Appends a SimpleTag and sets its name and string. The slot is counted
+// before the setters run, so it remains even when false is returned.
+bool Tag::add_simple_tag(const char* tag_name, const char* tag_string) {
+  // Make room for one more SimpleTag.
+  if (!ExpandSimpleTagsArray())
+    return false;
+
+  SimpleTag& tag = simple_tags_[simple_tags_count_];
+  ++simple_tags_count_;
+  // Init() nulls both pointers before they are set.
+  tag.Init();
+
+  // Short-circuiting preserves the order: name first, then string.
+  return tag.set_tag_name(tag_name) && tag.set_tag_string(tag_string);
+}
+
+// Starts with no SimpleTag storage.
+Tag::Tag() {
+  simple_tags_ = NULL;
+  simple_tags_size_ = simple_tags_count_ = 0;
+}
+
+Tag::~Tag() {}  // Empty body; Clear() is what frees the simple tags.
+
+void Tag::ShallowCopy(Tag* dst) const {  // The array pointer is shared.
+  dst->simple_tags_count_ = simple_tags_count_;
+  dst->simple_tags_size_ = simple_tags_size_;
+  dst->simple_tags_ = simple_tags_;
+}
+
+// Clears each SimpleTag (last to first), frees the array, zeroes the size.
+void Tag::Clear() {
+  // Each entry's Clear() runs before the array itself is deleted.
+  for (; simple_tags_count_ > 0; --simple_tags_count_)
+    simple_tags_[simple_tags_count_ - 1].Clear();
+
+  delete[] simple_tags_;
+  simple_tags_ = NULL;
+
+  simple_tags_size_ = 0;
+}
+
+// Ensures simple_tags_ has a free slot, doubling the capacity (or starting
+// at one) when it is full. Existing entries are copied shallowly. Returns
+// false only if the new array cannot be allocated.
+bool Tag::ExpandSimpleTagsArray() {
+  if (simple_tags_size_ > simple_tags_count_)
+    return true;  // a free slot already exists
+
+  const int capacity = simple_tags_size_ ? 2 * simple_tags_size_ : 1;
+
+  SimpleTag* const grown = new (std::nothrow) SimpleTag[capacity];  // NOLINT
+  if (!grown)
+    return false;
+
+  for (int idx = 0; idx < simple_tags_count_; ++idx)
+    grown[idx] = simple_tags_[idx];  // shallow copy
+
+  delete[] simple_tags_;
+  simple_tags_ = grown;
+  simple_tags_size_ = capacity;
+  return true;
+}
+
+// Serializes this Tag and its SimpleTags. With a NULL |writer| only the
+// size is computed. Returns the tag size, or 0 if a write fails or the
+// byte count does not match.
+uint64 Tag::Write(IMkvWriter* writer) const {
+  // Size-only pass over every simple tag.
+  uint64 simple_tags_bytes = 0;
+  for (int idx = 0; idx < simple_tags_count_; ++idx) {
+    simple_tags_bytes += simple_tags_[idx].Write(NULL);
+  }
+
+  const uint64 tag_size =
+      simple_tags_bytes + EbmlMasterElementSize(kMkvTag, simple_tags_bytes);
+
+  if (writer == NULL)
+    return tag_size;
+
+  const int64 begin = writer->Position();
+
+  if (!WriteEbmlMasterElement(writer, kMkvTag, simple_tags_bytes))
+    return 0;
+
+  for (int idx = 0; idx < simple_tags_count_; ++idx) {
+    if (!simple_tags_[idx].Write(writer))
+      return 0;
+  }
+
+  // Only a forward-moving position is compared against the expected size.
+  const int64 end = writer->Position();
+  if (end >= begin && uint64(end - begin) != tag_size)
+    return 0;
+
+  return tag_size;
+}
+
+// Tag::SimpleTag
+
+// Marks both strings as unset.
+void Tag::SimpleTag::Init() {
+  tag_name_ = tag_string_ = NULL;
+}
+
+void Tag::SimpleTag::Clear() {  // Hands NULL to StrCpy for both strings.
+  StrCpy(NULL, &tag_name_);
+  StrCpy(NULL, &tag_string_);
+}
+
+bool Tag::SimpleTag::set_tag_name(const char* tag_name) {
+  return StrCpy(tag_name, &tag_name_);  // false is a failure to the caller.
+}
+
+bool Tag::SimpleTag::set_tag_string(const char* tag_string) {
+  return StrCpy(tag_string, &tag_string_);  // Same contract as the name.
+}
+
+// Serializes this SimpleTag (name, then string). With a NULL |writer|
+// only the size is computed. Returns the element size, or 0 if a write
+// fails or the byte count does not match.
+uint64 Tag::SimpleTag::Write(IMkvWriter* writer) const {
+  const uint64 children_size = EbmlElementSize(kMkvTagName, tag_name_) +
+                               EbmlElementSize(kMkvTagString, tag_string_);
+
+  const uint64 simple_tag_size =
+      children_size + EbmlMasterElementSize(kMkvSimpleTag, children_size);
+
+  if (writer == NULL)
+    return simple_tag_size;
+
+  // Remember the start so the byte count can be verified.
+  const int64 begin = writer->Position();
+
+  if (!WriteEbmlMasterElement(writer, kMkvSimpleTag, children_size) ||
+      !WriteEbmlElement(writer, kMkvTagName, tag_name_) ||
+      !WriteEbmlElement(writer, kMkvTagString, tag_string_)) {
+    return 0;
+  }
+
+  // Only a forward-moving position is compared against the expected size.
+  const int64 end = writer->Position();
+  if (end >= begin && uint64(end - begin) != simple_tag_size)
+    return 0;
+
+  return simple_tag_size;
+}
+
+// Tags Class
+
+Tags::Tags() : tags_size_(0), tags_count_(0), tags_(NULL) {}  // Empty list.
+
+// Clears every active tag (last to first), then frees the array.
+Tags::~Tags() {
+  for (; tags_count_ > 0; --tags_count_)
+    tags_[tags_count_ - 1].Clear();
+
+  delete[] tags_;
+  tags_ = NULL;
+}
+
+// Number of tags added so far.
+int Tags::Count() const { return tags_count_; }
+
+// Claims the next slot of tags_ and returns it; NULL if growth fails.
+Tag* Tags::AddTag() {
+  if (!ExpandTagsArray())
+    return NULL;
+  Tag* const slot = tags_ + tags_count_;
+  ++tags_count_;
+  return slot;
+}
+
+// Writes the Tags master element followed by every Tag. Fails on a NULL
+// |writer|, a failed write, or a byte count that differs from the summed
+// tag sizes.
+bool Tags::Write(IMkvWriter* writer) const {
+  if (writer == NULL)
+    return false;
+
+  // Size-only pass over every tag.
+  uint64 payload_size = 0;
+  for (int idx = 0; idx < tags_count_; ++idx)
+    payload_size += tags_[idx].Write(NULL);
+
+  if (!WriteEbmlMasterElement(writer, kMkvTags, payload_size))
+    return false;
+
+  const int64 start = writer->Position();
+
+  for (int idx = 0; idx < tags_count_; ++idx) {
+    // Tag::Write() reports failure as a zero size; this function returns
+    // bool, so answer with false rather than the integer 0.
+    if (tags_[idx].Write(writer) == 0)
+      return false;
+  }
+
+  const int64 stop = writer->Position();
+  // Only a forward-moving position is compared against the expected size.
+  if (stop >= start && uint64(stop - start) != payload_size)
+    return false;
+
+  return true;
+}
+
+// Ensures tags_ has a free slot, doubling capacity (or starting at one)
+// when full. Existing tags are moved with ShallowCopy(). Returns false
+// only if the new array cannot be allocated.
+bool Tags::ExpandTagsArray() {
+  if (tags_size_ > tags_count_)
+    return true;  // a free slot already exists
+
+  const int capacity = tags_size_ ? 2 * tags_size_ : 1;
+
+  Tag* const grown = new (std::nothrow) Tag[capacity];  // NOLINT
+  if (!grown)
+    return false;
+
+  for (int idx = 0; idx < tags_count_; ++idx)
+    tags_[idx].ShallowCopy(&grown[idx]);
+
+  // ~Tag() is empty, so freeing the old array keeps shared data alive.
+  delete[] tags_;
+  tags_ = grown;
+  tags_size_ = capacity;
+
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Cluster class
+
+Cluster::Cluster(uint64 timecode, int64 cues_pos, uint64 timecode_scale)
+    : blocks_added_(0),
+      finalized_(false),
+      header_written_(false),  // Set by WriteClusterHeader().
+      payload_size_(0),
+      position_for_cues_(cues_pos),
+      size_position_(-1),  // -1 until WriteClusterHeader() records it.
+      timecode_(timecode),
+      timecode_scale_(timecode_scale),
+      writer_(NULL) {}  // Supplied later through Init().
+
+Cluster::~Cluster() {}  // Empty body.
+
+// Stores |ptr_writer| for later writes; a NULL writer is rejected and the
+// current writer_ is left as it was.
+bool Cluster::Init(IMkvWriter* ptr_writer) {
+  if (ptr_writer)
+    writer_ = ptr_writer;
+  return ptr_writer != NULL;
+}
+
+bool Cluster::AddFrame(const Frame* const frame) { return DoWriteFrame(frame); }
+
+bool Cluster::AddFrame(const uint8* data, uint64 length, uint64 track_number,
+                       uint64 abs_timecode, bool is_key) {
+  Frame wrapper;  // Temporary; DoWriteFrame() receives a pointer to it.
+  if (!wrapper.Init(data, length))
+    return false;
+  wrapper.set_track_number(track_number);
+  wrapper.set_timestamp(abs_timecode);
+  wrapper.set_is_key(is_key);
+  return DoWriteFrame(&wrapper);
+}
+
+bool Cluster::AddFrameWithAdditional(const uint8* data, uint64 length,
+                                     const uint8* additional,
+                                     uint64 additional_length, uint64 add_id,
+                                     uint64 track_number, uint64 abs_timecode,
+                                     bool is_key) {
+  // Additional data is required here; reject a missing or empty buffer.
+  if (additional == NULL || additional_length == 0)
+    return false;
+  Frame wrapper;
+  if (!wrapper.Init(data, length))
+    return false;
+  if (!wrapper.AddAdditionalData(additional, additional_length, add_id))
+    return false;
+  wrapper.set_track_number(track_number);
+  wrapper.set_timestamp(abs_timecode);
+  wrapper.set_is_key(is_key);
+  return DoWriteFrame(&wrapper);
+}
+
+bool Cluster::AddFrameWithDiscardPadding(const uint8* data, uint64 length,
+                                         int64 discard_padding,
+                                         uint64 track_number,
+                                         uint64 abs_timecode, bool is_key) {
+  Frame wrapper;  // Temporary; DoWriteFrame() receives a pointer to it.
+  if (!wrapper.Init(data, length))
+    return false;
+  wrapper.set_discard_padding(discard_padding);
+  wrapper.set_track_number(track_number);
+  wrapper.set_timestamp(abs_timecode);
+  wrapper.set_is_key(is_key);
+  return DoWriteFrame(&wrapper);
+}
+
+bool Cluster::AddMetadata(const uint8* data, uint64 length, uint64 track_number,
+                          uint64 abs_timecode, uint64 duration_timecode) {
+  Frame wrapper;  // Temporary; DoWriteFrame() receives a pointer to it.
+  if (!wrapper.Init(data, length))
+    return false;
+  wrapper.set_track_number(track_number);
+  wrapper.set_timestamp(abs_timecode);
+  wrapper.set_duration(duration_timecode);
+  wrapper.set_is_key(true);  // All metadata blocks are keyframes.
+  return DoWriteFrame(&wrapper);
+}
+
+void Cluster::AddPayloadSize(uint64 size) { payload_size_ += size; }  // Adds.
+
+// Marks the cluster finalized. On a seekable writer the 8-byte size field
+// saved at size_position_ is first overwritten with the real payload size.
+// Fails without a writer, if already finalized, or if no header was written
+// (size_position_ still -1). Non-zero results from Position()/WriteUIntSize
+// are treated as errors.
+bool Cluster::Finalize() {
+  if (!writer_ || finalized_ || size_position_ == -1)
+    return false;
+
+  if (writer_->Seekable()) {
+    const int64 resume_at = writer_->Position();
+    if (writer_->Position(size_position_) ||
+        WriteUIntSize(writer_, payload_size(), 8) ||
+        writer_->Position(resume_at)) {
+      return false;
+    }
+  }
+
+  finalized_ = true;
+  return true;
+}
+
+// Master-element size for the largest 64-bit size value plus the payload.
+uint64 Cluster::Size() const {
+  return EbmlMasterElementSize(kMkvCluster, 0xFFFFFFFFFFFFFFFFULL) +
+         payload_size_;
+}
+
+// Called before a block is written: refuses finalized clusters and makes
+// sure the cluster header has been written.
+bool Cluster::PreWriteBlock() {
+  if (finalized_)
+    return false;
+
+  // header_written_ is set by WriteClusterHeader() on success.
+  if (header_written_)
+    return true;
+  return WriteClusterHeader();
+}
+
+void Cluster::PostWriteBlock(uint64 element_size) {  // Bookkeeping only.
+  ++blocks_added_;
+  payload_size_ += element_size;
+}
+
+// Offset of |abs_timecode| from the cluster timecode, or -1 when the
+// offset is negative or larger than kMaxBlockTimecode.
+int64 Cluster::GetRelativeTimecode(int64 abs_timecode) const {
+  // The qualified call always uses Cluster's own timecode().
+  const int64 base = this->Cluster::timecode();
+  const int64 offset = abs_timecode - base;
+
+  const bool representable = offset >= 0 && offset <= kMaxBlockTimecode;
+  return representable ? offset : -1;
+}
+
+// Validates |frame|, makes sure the cluster header exists, writes the
+// frame and records its size. Returns false at the first failing step.
+bool Cluster::DoWriteFrame(const Frame* const frame) {
+  if (!frame || !frame->IsValid() || !PreWriteBlock())
+    return false;
+
+  // WriteFrame() reports failure as a zero element size.
+  const uint64 written = WriteFrame(writer_, frame, this);
+  if (written == 0)
+    return false;
+
+  PostWriteBlock(written);
+  return true;
+}
+
+// Writes the cluster ID, an 8-byte "unknown size" placeholder (its position
+// is kept in size_position_) and the cluster timecode element, whose size
+// is added to the payload. WriteID()/SerializeInt() signal failure with a
+// non-zero result.
+bool Cluster::WriteClusterHeader() {
+  if (finalized_ || WriteID(writer_, kMkvCluster))
+    return false;
+
+  // Save for later.
+  size_position_ = writer_->Position();
+
+  // Write "unknown" (EBML coded -1) as cluster size value. We need to write 8
+  // bytes because we do not know how big our cluster will be.
+  if (SerializeInt(writer_, kEbmlUnknownValue, 8))
+    return false;
+
+  if (!WriteEbmlElement(writer_, kMkvTimecode, timecode()))
+    return false;
+  AddPayloadSize(EbmlElementSize(kMkvTimecode, timecode()));
+  header_written_ = true;
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// SeekHead Class
+
+// Every seek slot starts empty; AddSeekEntry() treats an id of 0 as free.
+SeekHead::SeekHead() : start_pos_(0ULL) {
+  for (int32 slot = 0; slot < kSeekEntryCount; ++slot)
+    seek_entry_pos_[slot] = seek_entry_id_[slot] = 0;
+}
+
+// Empty destructor body.
+SeekHead::~SeekHead() {}
+
+// On a seekable writer, overwrites the region reserved by Write() with the
+// real SeekHead (one Seek entry per slot whose id is non-zero), pads the
+// rest of the reservation with a Void element and restores the write
+// position. A non-seekable writer, or a SeekHead without entries, succeeds
+// without writing anything.
+bool SeekHead::Finalize(IMkvWriter* writer) const {
+  if (!writer->Seekable())
+    return true;
+
+  if (start_pos_ == -1)
+    return false;
+
+  // Size of each used entry, and of all used entries together.
+  uint64 entries_size = 0;
+  uint64 entry_size[kSeekEntryCount];
+
+  for (int32 i = 0; i < kSeekEntryCount; ++i) {
+    const uint64 id = seek_entry_id_[i];
+    if (id == 0)
+      continue;
+
+    entry_size[i] = EbmlElementSize(kMkvSeekID, id);
+    entry_size[i] += EbmlElementSize(kMkvSeekPosition, seek_entry_pos_[i]);
+    entries_size +=
+        EbmlMasterElementSize(kMkvSeek, entry_size[i]) + entry_size[i];
+  }
+
+  // No SeekHead elements
+  if (entries_size == 0)
+    return true;
+
+  const int64 resume_at = writer->Position();
+  if (writer->Position(start_pos_))
+    return false;
+
+  if (!WriteEbmlMasterElement(writer, kMkvSeekHead, entries_size))
+    return false;
+
+  for (int32 i = 0; i < kSeekEntryCount; ++i) {
+    const uint64 id = seek_entry_id_[i];
+    if (id == 0)
+      continue;
+
+    if (!WriteEbmlMasterElement(writer, kMkvSeek, entry_size[i]) ||
+        !WriteEbmlElement(writer, kMkvSeekID, id) ||
+        !WriteEbmlElement(writer, kMkvSeekPosition, seek_entry_pos_[i])) {
+      return false;
+    }
+  }
+
+  // Fill the unused tail of the reservation with a Void element.
+  const uint64 reserved_entries = kSeekEntryCount * MaxEntrySize();
+  const uint64 reserved_total =
+      EbmlMasterElementSize(kMkvSeekHead, reserved_entries) + reserved_entries;
+  const int64 size_left = reserved_total - (writer->Position() - start_pos_);
+  if (!WriteVoidElement(writer, size_left))
+    return false;
+
+  return writer->Position(resume_at) == 0;
+}
+
+// Reserves room for a maximally sized SeekHead by writing a Void element
+// at the current position, which is remembered in start_pos_ so that
+// Finalize() can overwrite the reservation later.
+bool SeekHead::Write(IMkvWriter* writer) {
+  const uint64 entries_bytes = kSeekEntryCount * MaxEntrySize();
+  const uint64 header_bytes =
+      EbmlMasterElementSize(kMkvSeekHead, entries_bytes);
+
+  start_pos_ = writer->Position();
+
+  return WriteVoidElement(writer, header_bytes + entries_bytes) != 0;
+}
+
+bool SeekHead::AddSeekEntry(uint32 id, uint64 pos) {
+  int32 slot = 0;  // An id of 0 marks a free slot; find the first one.
+  while (slot < kSeekEntryCount && seek_entry_id_[slot] != 0)
+    ++slot;
+  if (slot == kSeekEntryCount)
+    return false;  // Every slot is already in use.
+  seek_entry_id_[slot] = id;
+  seek_entry_pos_[slot] = pos;
+  return true;
+}
+
+// Id stored in slot |index|, or UINT_MAX when |index| is out of range.
+uint32 SeekHead::GetId(int index) const {
+  const bool valid = index >= 0 && index < kSeekEntryCount;
+  return valid ? seek_entry_id_[index] : UINT_MAX;
+}
+
+// Position stored in slot |index|, or ULLONG_MAX when out of range.
+uint64 SeekHead::GetPosition(int index) const {
+  const bool valid = index >= 0 && index < kSeekEntryCount;
+  return valid ? seek_entry_pos_[index] : ULLONG_MAX;
+}
+
+// Overwrites slot |index| with (|id|, |position|); false if out of range.
+bool SeekHead::SetSeekEntry(int index, uint32 id, uint64 position) {
+  const bool valid = index >= 0 && index < kSeekEntryCount;
+  if (valid) seek_entry_id_[index] = id;
+  if (valid) seek_entry_pos_[index] = position;
+  return valid;
+}
+
+// Largest possible Seek entry: a 32-bit id and a 64-bit position, both at
+// their maximum values, plus the Seek master-element size.
+uint64 SeekHead::MaxEntrySize() const {
+  const uint64 id_bytes = EbmlElementSize(kMkvSeekID, 0xffffffffULL);
+  const uint64 position_bytes =
+      EbmlElementSize(kMkvSeekPosition, 0xffffffffffffffffULL);
+  const uint64 children = id_bytes + position_bytes;
+
+  return children + EbmlMasterElementSize(kMkvSeek, children);
+}
+
+///////////////////////////////////////////////////////////////
+//
+// SegmentInfo Class
+
+SegmentInfo::SegmentInfo()
+    : duration_(-1.0),  // Write() emits a Duration only when this is > 0.
+      muxing_app_(NULL),
+      timecode_scale_(1000000ULL),
+      writing_app_(NULL),
+      date_utc_(LLONG_MIN),  // LLONG_MIN: Write() emits no DateUTC element.
+      duration_pos_(-1) {}  // -1 until Write() records the Duration position.
+
+SegmentInfo::~SegmentInfo() {  // Frees both app-name strings.
+  delete[] muxing_app_;
+  delete[] writing_app_;
+}
+
+// Builds "libwebm-<major>.<minor>.<build>.<revision>" from GetVersion()
+// and stores it as both the muxing app and the writing app. Returns false
+// if muxing_app_ cannot be allocated or writing_app_ ends up NULL.
+bool SegmentInfo::Init() {
+  int32 major, minor, build, revision;
+  GetVersion(&major, &minor, &build, &revision);
+
+  char version[256];
+  const size_t capacity = sizeof(version) / sizeof(version[0]);
+#ifdef _MSC_VER
+  sprintf_s(version, capacity, "libwebm-%d.%d.%d.%d", major, minor, build,
+            revision);
+#else
+  snprintf(version, capacity, "libwebm-%d.%d.%d.%d", major, minor, build,
+           revision);
+#endif
+
+  // muxing_app_ is replaced in place; a failed allocation leaves it NULL.
+  const size_t buffer_size = strlen(version) + 1;
+  delete[] muxing_app_;
+  muxing_app_ = new (std::nothrow) char[buffer_size];  // NOLINT
+  if (!muxing_app_)
+    return false;
+
+#ifdef _MSC_VER
+  strcpy_s(muxing_app_, buffer_size, version);
+#else
+  strcpy(muxing_app_, version);
+#endif
+
+  // set_writing_app() returns nothing, so check the member afterwards.
+  set_writing_app(version);
+  return writing_app_ != NULL;
+}
+
+// Rewrites the Duration element at duration_pos_ and restores the write
+// position. Nothing is written unless duration_ > 0 and the writer is
+// seekable. Fails on a NULL |writer|, when duration_pos_ is still -1, or
+// when a seek or write fails.
+bool SegmentInfo::Finalize(IMkvWriter* writer) const {
+  if (!writer)
+    return false;
+
+  // Short-circuit keeps Seekable() from being called when duration_ <= 0.
+  if (!(duration_ > 0.0) || !writer->Seekable())
+    return true;
+
+  if (duration_pos_ == -1)
+    return false;
+
+  const int64 resume_at = writer->Position();
+  if (writer->Position(duration_pos_))
+    return false;
+
+  const float duration = static_cast<float>(duration_);
+  if (!WriteEbmlElement(writer, kMkvDuration, duration))
+    return false;
+
+  return writer->Position(resume_at) == 0;
+}
+
+// Writes the Info master element: timecode scale, optional duration (its
+// position is kept in duration_pos_), optional UTC date and both app names.
+// Fails on a NULL writer or app string, a failed write or a size mismatch.
+bool SegmentInfo::Write(IMkvWriter* writer) {
+  if (!writer || !muxing_app_ || !writing_app_)
+    return false;
+
+  uint64 size = EbmlElementSize(kMkvTimecodeScale, timecode_scale_);
+  if (duration_ > 0.0)
+    size += EbmlElementSize(kMkvDuration, static_cast<float>(duration_));
+  if (date_utc_ != LLONG_MIN)
+    size += EbmlDateElementSize(kMkvDateUTC);
+  size += EbmlElementSize(kMkvMuxingApp, muxing_app_);
+  size += EbmlElementSize(kMkvWritingApp, writing_app_);
+
+  if (!WriteEbmlMasterElement(writer, kMkvInfo, size))
+    return false;
+
+  const int64 payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, kMkvTimecodeScale, timecode_scale_))
+    return false;
+
+  if (duration_ > 0.0) {
+    // Save for later
+    duration_pos_ = writer->Position();
+    if (!WriteEbmlElement(writer, kMkvDuration, static_cast<float>(duration_)))
+      return false;
+  }
+
+  // A failed date write is reported at once instead of being ignored.
+  if (date_utc_ != LLONG_MIN &&
+      !WriteEbmlDateElement(writer, kMkvDateUTC, date_utc_))
+    return false;
+
+  if (!WriteEbmlElement(writer, kMkvMuxingApp, muxing_app_) ||
+      !WriteEbmlElement(writer, kMkvWritingApp, writing_app_))
+    return false;
+
+  const int64 stop_position = writer->Position();
+  return stop_position >= 0 &&
+         stop_position - payload_position == static_cast<int64>(size);
+}
+
+// Replaces the muxing application string with a private copy of |app|.
+// A NULL |app| or a failed allocation leaves the current value untouched.
+void SegmentInfo::set_muxing_app(const char* app) {
+  if (app == NULL)
+    return;
+
+  const size_t buffer_len = strlen(app) + 1;  // include the terminator
+  char* const copy = new (std::nothrow) char[buffer_len];  // NOLINT
+  if (copy == NULL)
+    return;
+
+#ifdef _MSC_VER
+  strcpy_s(copy, buffer_len, app);
+#else
+  strcpy(copy, app);
+#endif
+
+  // Swap only after the copy succeeded so the old value survives failures.
+  delete[] muxing_app_;
+  muxing_app_ = copy;
+}
+
+// Replaces the writing application string with a private copy of |app|.
+// A NULL |app| or a failed allocation leaves the current value untouched.
+void SegmentInfo::set_writing_app(const char* app) {
+  if (app == NULL)
+    return;
+
+  const size_t buffer_len = strlen(app) + 1;  // include the terminator
+  char* const copy = new (std::nothrow) char[buffer_len];  // NOLINT
+  if (copy == NULL)
+    return;
+
+#ifdef _MSC_VER
+  strcpy_s(copy, buffer_len, app);
+#else
+  strcpy(copy, app);
+#endif
+
+  // Swap only after the copy succeeded so the old value survives failures.
+  delete[] writing_app_;
+  writing_app_ = copy;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Segment Class
+
+// Builds a Segment with no writers, clusters or queued frames. Defaults set
+// here: file mode (kFile), cues enabled and placed after the clusters
+// (kAfterClusters), no chunking, and the default max cluster duration and
+// doc type version.
+//
+// NOTE(review): cluster_end_offset_, ebml_header_size_ and
+// last_track_timestamp_ are used elsewhere in this file but do not appear in
+// this initializer list -- confirm they are initialized before first use.
+Segment::Segment()
+    : chunk_count_(0),
+      chunk_name_(NULL),
+      chunk_writer_cluster_(NULL),
+      chunk_writer_cues_(NULL),
+      chunk_writer_header_(NULL),
+      chunking_(false),
+      chunking_base_name_(NULL),
+      cluster_list_(NULL),
+      cluster_list_capacity_(0),
+      cluster_list_size_(0),
+      cues_position_(kAfterClusters),
+      cues_track_(0),
+      force_new_cluster_(false),
+      frames_(NULL),
+      frames_capacity_(0),
+      frames_size_(0),
+      has_video_(false),
+      header_written_(false),
+      last_block_duration_(0),
+      last_timestamp_(0),
+      max_cluster_duration_(kDefaultMaxClusterDuration),
+      max_cluster_size_(0),
+      mode_(kFile),
+      new_cuepoint_(false),
+      output_cues_(true),
+      payload_pos_(0),
+      size_position_(0),
+      doc_type_version_(kDefaultDocTypeVersion),
+      doc_type_version_written_(0),
+      writer_cluster_(NULL),
+      writer_cues_(NULL),
+      writer_header_(NULL) {
+  // The seed is derived from the current wall-clock time; its address is
+  // handed to tracks and chapters created through this segment.
+  const time_t curr_time = time(NULL);
+  seed_ = static_cast<unsigned int>(curr_time);
+#ifdef _WIN32
+  // Only on Windows is the C runtime generator seeded here.
+  srand(seed_);
+#endif
+}
+
+// Releases everything the segment owns: clusters, queued frames, chunk name
+// strings and the chunk writers (each closed before it is destroyed).
+Segment::~Segment() {
+  if (cluster_list_ != NULL) {
+    for (int32 idx = 0; idx < cluster_list_size_; ++idx)
+      delete cluster_list_[idx];
+    delete[] cluster_list_;
+  }
+
+  if (frames_ != NULL) {
+    for (int32 idx = 0; idx < frames_size_; ++idx)
+      delete frames_[idx];
+    delete[] frames_;
+  }
+
+  delete[] chunk_name_;
+  delete[] chunking_base_name_;
+
+  // Chunk writers exist only if chunking was ever enabled.
+  if (chunk_writer_cluster_ != NULL) {
+    chunk_writer_cluster_->Close();
+    delete chunk_writer_cluster_;
+  }
+  if (chunk_writer_cues_ != NULL) {
+    chunk_writer_cues_->Close();
+    delete chunk_writer_cues_;
+  }
+  if (chunk_writer_header_ != NULL) {
+    chunk_writer_header_->Close();
+    delete chunk_writer_header_;
+  }
+}
+
+// Shifts the cluster position of the cue at |index| by |diff| and propagates
+// any resulting growth of the Cues element to all cues.
+//
+// |diff|      - number of bytes to add to this cue's cluster position.
+// |index|     - index of the cue to adjust; a missing cue is ignored.
+// |cues_size| - in/out running sum of the sizes of all CuePoints.
+//
+// Changing the position can make the CuePoint itself larger, which in turn
+// can make the coded length of the Cues element larger. Whenever that growth
+// is non-zero, every cue is adjusted again (recursively) by the growth.
+void Segment::MoveCuesBeforeClustersHelper(uint64 diff, int32 index,
+                                           uint64* cues_size) {
+  CuePoint* const cue_point = cues_.GetCueByIndex(index);
+  if (cue_point == NULL)
+    return;
+  // Capture the size before the position changes so the growth can be
+  // measured afterwards.
+  const uint64 old_cue_point_size = cue_point->Size();
+  const uint64 cluster_pos = cue_point->cluster_pos() + diff;
+  cue_point->set_cluster_pos(cluster_pos);  // update the new cluster position
+  // New size of the cue is computed as follows
+  //    Let a = current sum of size of all CuePoints
+  //    Let b = Increase in Cue Point's size due to this iteration
+  //    Let c = Increase in size of Cues Element's length due to this iteration
+  //            (This is computed as CodedSize(a + b) - CodedSize(a))
+  //    Let d = b + c. Now d is the |diff| passed to the next recursive call.
+  //    Let e = a + b. Now e is the |cues_size| passed to the next recursive
+  //                   call.
+  const uint64 cue_point_size_diff = cue_point->Size() - old_cue_point_size;
+  const uint64 cue_size_diff =
+      GetCodedUIntSize(*cues_size + cue_point_size_diff) -
+      GetCodedUIntSize(*cues_size);
+  *cues_size += cue_point_size_diff;
+  diff = cue_size_diff + cue_point_size_diff;
+  // The recursion ends once an adjustment no longer changes any size.
+  if (diff > 0) {
+    for (int32 i = 0; i < cues_.cue_entries_size(); ++i) {
+      MoveCuesBeforeClustersHelper(diff, i, cues_size);
+    }
+  }
+}
+
+// Recomputes cue cluster positions and the Cues/Cluster seek entries for a
+// layout in which the Cues element precedes the clusters.
+void Segment::MoveCuesBeforeClusters() {
+  const uint64 cues_element_size = cues_.Size();
+
+  // Sum of all CuePoint sizes before any position is shifted.
+  uint64 cue_points_size = 0;
+  for (int32 idx = 0; idx < cues_.cue_entries_size(); ++idx) {
+    cue_points_size += cues_.GetCueByIndex(idx)->Size();
+  }
+
+  // Every cluster moves back by the size of the Cues element.
+  for (int32 idx = 0; idx < cues_.cue_entries_size(); ++idx) {
+    MoveCuesBeforeClustersHelper(cues_element_size, idx, &cue_points_size);
+  }
+
+  // Locate the seek entries for Cues and Cluster (the last match wins).
+  int32 cues_slot = 0;
+  int32 cluster_slot = 0;
+  for (int32 slot = 0; slot < SeekHead::kSeekEntryCount; ++slot) {
+    if (seek_head_.GetId(slot) == kMkvCluster) {
+      cluster_slot = slot;
+    }
+    if (seek_head_.GetId(slot) == kMkvCues) {
+      cues_slot = slot;
+    }
+  }
+
+  // Cues take over the old cluster position; the clusters then start right
+  // after the (possibly resized) Cues element.
+  seek_head_.SetSeekEntry(cues_slot, kMkvCues,
+                          seek_head_.GetPosition(cluster_slot));
+  seek_head_.SetSeekEntry(cluster_slot, kMkvCluster,
+                          cues_.Size() + seek_head_.GetPosition(cues_slot));
+}
+
+// Points the cluster, cues and header writers at |ptr_writer| and initializes
+// the segment info. Returns false for a NULL writer or if
+// SegmentInfo::Init() fails.
+bool Segment::Init(IMkvWriter* ptr_writer) {
+  if (ptr_writer == NULL)
+    return false;
+
+  // One writer backs all three roles.
+  writer_header_ = ptr_writer;
+  writer_cues_ = ptr_writer;
+  writer_cluster_ = ptr_writer;
+
+  return segment_info_.Init();
+}
+
+// Rewrites an already muxed file so that the Cues element precedes the
+// clusters. Data is read from |reader| and written to |writer|.
+//
+// Steps: copy everything before the first cluster, recompute cue positions
+// and seek entries, write the cues and seek head, copy the clusters, then
+// patch the Segment size.
+//
+// Returns false if either argument is NULL, |writer| is not seekable,
+// chunking is enabled, no cluster has been written, or any copy/write/seek
+// step fails.
+bool Segment::CopyAndMoveCuesBeforeClusters(mkvparser::IMkvReader* reader,
+                                            IMkvWriter* writer) {
+  if (!reader || !writer)
+    return false;
+  if (!writer->Seekable() || chunking_)
+    return false;
+
+  // The first cluster marks where the headers end; without one there is
+  // nothing to move the cues in front of.
+  if (!cluster_list_ || cluster_list_size_ < 1 || !cluster_list_[0])
+    return false;
+
+  const int64 cluster_offset =
+      cluster_list_[0]->size_position() - GetUIntSize(kMkvCluster);
+
+  // Copy the headers.
+  if (!ChunkedCopy(reader, writer, 0, cluster_offset))
+    return false;
+
+  // Recompute cue positions and seek entries.
+  MoveCuesBeforeClusters();
+
+  // Write cues and seek entries.
+  // TODO(vigneshv): As of now, it's safe to call seek_head_.Finalize() for the
+  // second time with a different writer object. But the name Finalize() doesn't
+  // indicate something we want to call more than once. So consider renaming it
+  // to write() or some such.
+  if (!cues_.Write(writer) || !seek_head_.Finalize(writer))
+    return false;
+
+  // Copy the Clusters.
+  if (!ChunkedCopy(reader, writer, cluster_offset,
+                   cluster_end_offset_ - cluster_offset))
+    return false;
+
+  // Update the Segment size in case the Cues size has changed.
+  const int64 pos = writer->Position();
+  const int64 segment_size = writer->Position() - payload_pos_;
+  if (writer->Position(size_position_) ||
+      WriteUIntSize(writer, segment_size, 8) || writer->Position(pos))
+    return false;
+  return true;
+}
+
+// Flushes queued frames and, in kFile mode, completes the file: finalizes the
+// last cluster, writes the final duration, the cues and the seek head, and
+// patches the Segment size (and the EBML header if the doc type version
+// changed). In other modes only the queued frames are flushed.
+// Returns false as soon as any step fails.
+bool Segment::Finalize() {
+  // Write out any frames still waiting in the queue.
+  if (WriteFramesAll() < 0)
+    return false;
+
+  if (mode_ == kFile) {
+    if (cluster_list_size_ > 0) {
+      // Update last cluster's size
+      Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1];
+
+      if (!old_cluster || !old_cluster->Finalize())
+        return false;
+    }
+
+    // With chunking, the last cluster chunk is complete at this point.
+    if (chunking_ && chunk_writer_cluster_) {
+      chunk_writer_cluster_->Close();
+      chunk_count_++;
+    }
+
+    // Duration = (last timestamp + last block duration), expressed in
+    // timecode-scale units.
+    const double duration =
+        (static_cast<double>(last_timestamp_) + last_block_duration_) /
+        segment_info_.timecode_scale();
+    segment_info_.set_duration(duration);
+    if (!segment_info_.Finalize(writer_header_))
+      return false;
+
+    // Register where the Cues element is about to be written.
+    if (output_cues_)
+      if (!seek_head_.AddSeekEntry(kMkvCues, MaxOffset()))
+        return false;
+
+    if (chunking_) {
+      if (!chunk_writer_cues_)
+        return false;
+
+      // Cues go into their own chunk file named via UpdateChunkName("cues").
+      char* name = NULL;
+      if (!UpdateChunkName("cues", &name))
+        return false;
+
+      const bool cues_open = chunk_writer_cues_->Open(name);
+      delete[] name;
+      if (!cues_open)
+        return false;
+    }
+
+    // Remember where the clusters end; CopyAndMoveCuesBeforeClusters() reads
+    // this value.
+    cluster_end_offset_ = writer_cluster_->Position();
+
+    // Write the seek headers and cues
+    if (output_cues_)
+      if (!cues_.Write(writer_cues_))
+        return false;
+
+    if (!seek_head_.Finalize(writer_header_))
+      return false;
+
+    if (writer_header_->Seekable()) {
+      if (size_position_ == -1)
+        return false;
+
+      const int64 segment_size = MaxOffset();
+      if (segment_size < 1)
+        return false;
+
+      const int64 pos = writer_header_->Position();
+      UpdateDocTypeVersion();
+      // Rewrite the EBML header only if the version changed since it was
+      // first written; the rewritten header must end at the same offset.
+      if (doc_type_version_ != doc_type_version_written_) {
+        if (writer_header_->Position(0))
+          return false;
+
+        if (!WriteEbmlHeader(writer_header_, doc_type_version_))
+          return false;
+        if (writer_header_->Position() != ebml_header_size_)
+          return false;
+
+        doc_type_version_written_ = doc_type_version_;
+      }
+
+      // Replace the size written by WriteSegmentHeader() with the real
+      // segment size, then return to the previous position.
+      if (writer_header_->Position(size_position_))
+        return false;
+
+      if (WriteUIntSize(writer_header_, segment_size, 8))
+        return false;
+
+      if (writer_header_->Position(pos))
+        return false;
+    }
+
+    if (chunking_) {
+      // Do not close any writers until the segment size has been written,
+      // otherwise the size may be off.
+      if (!chunk_writer_cues_ || !chunk_writer_header_)
+        return false;
+
+      chunk_writer_cues_->Close();
+      chunk_writer_header_->Close();
+    }
+  }
+
+  return true;
+}
+
+// Creates a generic Track and registers it under |number|. Returns the new
+// track, or NULL if it could not be allocated or registered.
+Track* Segment::AddTrack(int32 number) {
+  Track* const new_track = new (std::nothrow) Track(&seed_);  // NOLINT
+  if (new_track != NULL && tracks_.AddTrack(new_track, number))
+    return new_track;
+
+  // Allocation or registration failed; deleting NULL is harmless.
+  delete new_track;
+  return NULL;
+}
+
+// Creates a new chapter through chapters_, passing the segment's seed.
+Chapter* Segment::AddChapter() {
+  Chapter* const chapter = chapters_.AddChapter(&seed_);
+  return chapter;
+}
+
+// Creates a new tag through tags_ and returns it.
+Tag* Segment::AddTag() {
+  Tag* const tag = tags_.AddTag();
+  return tag;
+}
+
+// Creates a VP8 video track of |width| x |height| and registers it under
+// |number|. Returns the track number, or 0 if the track could not be
+// allocated or registered.
+uint64 Segment::AddVideoTrack(int32 width, int32 height, int32 number) {
+  VideoTrack* const track = new (std::nothrow) VideoTrack(&seed_);  // NOLINT
+  if (!track)
+    return 0;
+
+  track->set_type(Tracks::kVideo);
+  track->set_codec_id(Tracks::kVp8CodecId);
+  track->set_width(width);
+  track->set_height(height);
+
+  // Registration can fail; as in AddTrack(), the track is then still ours to
+  // free, and the segment must not be flagged as having video.
+  if (!tracks_.AddTrack(track, number)) {
+    delete track;
+    return 0;
+  }
+  has_video_ = true;
+
+  return track->number();
+}
+
+// Adds a cue point for |track| at |timestamp| that refers to the most recent
+// cluster. |timestamp| is divided by the timecode scale before it is stored.
+// Clears new_cuepoint_ on success. Returns false if there is no cluster, or
+// if the cue cannot be allocated or added.
+bool Segment::AddCuePoint(uint64 timestamp, uint64 track) {
+  if (cluster_list_size_ < 1)
+    return false;
+
+  const Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
+  if (!cluster)
+    return false;
+
+  CuePoint* const cue = new (std::nothrow) CuePoint();  // NOLINT
+  if (!cue)
+    return false;
+
+  cue->set_time(timestamp / segment_info_.timecode_scale());
+  cue->set_block_number(cluster->blocks_added());
+  cue->set_cluster_pos(cluster->position_for_cues());
+  cue->set_track(track);
+  if (!cues_.AddCue(cue)) {
+    // The cue was not added, so it has to be freed here.
+    delete cue;
+    return false;
+  }
+
+  new_cuepoint_ = false;
+  return true;
+}
+
+// Creates a Vorbis audio track with |sample_rate| and |channels| and
+// registers it under |number|. Returns the track number, or 0 if the track
+// could not be allocated or registered.
+uint64 Segment::AddAudioTrack(int32 sample_rate, int32 channels, int32 number) {
+  AudioTrack* const track = new (std::nothrow) AudioTrack(&seed_);  // NOLINT
+  if (!track)
+    return 0;
+
+  track->set_type(Tracks::kAudio);
+  track->set_codec_id(Tracks::kVorbisCodecId);
+  track->set_sample_rate(sample_rate);
+  track->set_channels(channels);
+
+  // Registration can fail; as in AddTrack(), the track is then still ours to
+  // free.
+  if (!tracks_.AddTrack(track, number)) {
+    delete track;
+    return 0;
+  }
+
+  return track->number();
+}
+
+// Wraps |data| in a Frame carrying the given track number, timestamp and key
+// flag, then hands it to AddGenericFrame(). Returns false for NULL |data| or
+// if the frame cannot be initialized or added.
+bool Segment::AddFrame(const uint8* data, uint64 length, uint64 track_number,
+                       uint64 timestamp, bool is_key) {
+  if (data == NULL)
+    return false;
+
+  Frame new_frame;
+  if (!new_frame.Init(data, length))
+    return false;
+
+  new_frame.set_is_key(is_key);
+  new_frame.set_timestamp(timestamp);
+  new_frame.set_track_number(track_number);
+
+  return AddGenericFrame(&new_frame);
+}
+
+// Same as AddFrame(), but also attaches |additional| data tagged with
+// |add_id| to the frame. Returns false if either buffer is NULL or if the
+// frame cannot be initialized or added.
+bool Segment::AddFrameWithAdditional(const uint8* data, uint64 length,
+                                     const uint8* additional,
+                                     uint64 additional_length, uint64 add_id,
+                                     uint64 track_number, uint64 timestamp,
+                                     bool is_key) {
+  if (data == NULL || additional == NULL)
+    return false;
+
+  Frame new_frame;
+  if (!new_frame.Init(data, length))
+    return false;
+  if (!new_frame.AddAdditionalData(additional, additional_length, add_id))
+    return false;
+
+  new_frame.set_is_key(is_key);
+  new_frame.set_timestamp(timestamp);
+  new_frame.set_track_number(track_number);
+
+  return AddGenericFrame(&new_frame);
+}
+
+// Same as AddFrame(), but also records |discard_padding| on the frame.
+// Returns false for NULL |data| or if the frame cannot be initialized or
+// added.
+bool Segment::AddFrameWithDiscardPadding(const uint8* data, uint64 length,
+                                         int64 discard_padding,
+                                         uint64 track_number, uint64 timestamp,
+                                         bool is_key) {
+  if (data == NULL)
+    return false;
+
+  Frame new_frame;
+  if (!new_frame.Init(data, length))
+    return false;
+
+  new_frame.set_is_key(is_key);
+  new_frame.set_timestamp(timestamp);
+  new_frame.set_track_number(track_number);
+  new_frame.set_discard_padding(discard_padding);
+
+  return AddGenericFrame(&new_frame);
+}
+
+// Adds a metadata block with an explicit duration. The frame is always marked
+// as a key frame. Returns false for NULL |data| or if the frame cannot be
+// initialized or added.
+bool Segment::AddMetadata(const uint8* data, uint64 length, uint64 track_number,
+                          uint64 timestamp_ns, uint64 duration_ns) {
+  if (data == NULL)
+    return false;
+
+  Frame metadata_frame;
+  if (!metadata_frame.Init(data, length))
+    return false;
+
+  metadata_frame.set_is_key(true);  // All metadata blocks are keyframes.
+  metadata_frame.set_duration(duration_ns);
+  metadata_frame.set_timestamp(timestamp_ns);
+  metadata_frame.set_track_number(track_number);
+
+  return AddGenericFrame(&metadata_frame);
+}
+
+// Adds |frame| to the segment. The caller keeps ownership of |frame|.
+//
+// The segment header is written first if it has not been written yet. Audio
+// frames in a segment that has video are copied and queued rather than
+// written immediately (unless a new cluster was forced). Every other frame
+// is written to the current cluster, creating a new one when required.
+//
+// Returns false if |frame| is NULL, its timestamp is lower than the last
+// written timestamp, its track number is unknown, or any allocation or write
+// step fails. Any internal copy of the frame is freed on every failure path.
+bool Segment::AddGenericFrame(const Frame* frame) {
+  if (!frame)
+    return false;
+
+  if (!CheckHeaderInfo())
+    return false;
+
+  // Check for non-monotonically increasing timestamps.
+  if (frame->timestamp() < last_timestamp_)
+    return false;
+
+  // Check if the track number is valid.
+  if (!tracks_.GetTrackByNumber(frame->track_number()))
+    return false;
+
+  if (frame->discard_padding() != 0)
+    doc_type_version_ = 4;
+
+  // If the segment has a video track hold onto audio frames to make sure the
+  // audio that is associated with the start time of a video key-frame is
+  // muxed into the same cluster.
+  if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) &&
+      !force_new_cluster_) {
+    Frame* const new_frame = new (std::nothrow) Frame();
+    if (!new_frame || !new_frame->CopyFrom(*frame)) {
+      delete new_frame;
+      return false;
+    }
+    // QueueFrame() stores the pointer only on success, so on failure the
+    // copy is still ours to free.
+    if (!QueueFrame(new_frame)) {
+      delete new_frame;
+      return false;
+    }
+    return true;
+  }
+
+  if (!DoNewClusterProcessing(frame->track_number(), frame->timestamp(),
+                              frame->is_key())) {
+    return false;
+  }
+
+  if (cluster_list_size_ < 1)
+    return false;
+
+  Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
+  if (!cluster)
+    return false;
+
+  // If the Frame is not a SimpleBlock, then set the reference_block_timestamp
+  // if it is not set already. That requires a mutable copy, which this
+  // function owns and must free on every exit path.
+  Frame* owned_copy = NULL;
+  if (!frame->CanBeSimpleBlock() && !frame->is_key() &&
+      !frame->reference_block_timestamp_set()) {
+    owned_copy = new (std::nothrow) Frame();
+    if (!owned_copy || !owned_copy->CopyFrom(*frame)) {
+      delete owned_copy;
+      return false;
+    }
+    owned_copy->set_reference_block_timestamp(
+        last_track_timestamp_[frame->track_number() - 1]);
+    frame = owned_copy;
+  }
+
+  if (!cluster->AddFrame(frame)) {
+    delete owned_copy;
+    return false;
+  }
+
+  if (new_cuepoint_ && cues_track_ == frame->track_number()) {
+    if (!AddCuePoint(frame->timestamp(), cues_track_)) {
+      delete owned_copy;
+      return false;
+    }
+  }
+
+  last_timestamp_ = frame->timestamp();
+  last_track_timestamp_[frame->track_number() - 1] = frame->timestamp();
+  last_block_duration_ = frame->duration();
+
+  // NULL when no copy was made; deleting NULL is a no-op.
+  delete owned_copy;
+
+  return true;
+}
+
+// Enables or disables writing of the Cues element.
+void Segment::OutputCues(bool output_cues) {
+  output_cues_ = output_cues;
+}
+
+// Turns chunked output on or off.
+//
+// |chunking| - requested chunking state.
+// |filename| - base name for the chunk files; required when |chunking| is
+//              true, unused otherwise.
+//
+// When enabling, this stores a copy of |filename|, builds the first chunk
+// name, creates the three chunk writers if needed, opens the first cluster
+// chunk and the "<filename>.hdr" header file, and redirects the segment's
+// writers to the chunk writers.
+//
+// Returns false if a chunk has already been written (chunk_count_ > 0), if
+// |filename| is NULL while enabling, or if an allocation or file open fails.
+bool Segment::SetChunking(bool chunking, const char* filename) {
+  if (chunk_count_ > 0)
+    return false;
+
+  if (chunking) {
+    if (!filename)
+      return false;
+
+    // Check if we are being set to what is already set.
+    if (chunking_ && !strcmp(filename, chunking_base_name_))
+      return true;
+
+    // Keep a private copy of the base name.
+    const size_t name_length = strlen(filename) + 1;
+    char* const temp = new (std::nothrow) char[name_length];  // NOLINT
+    if (!temp)
+      return false;
+
+#ifdef _MSC_VER
+    strcpy_s(temp, name_length, filename);
+#else
+    strcpy(temp, filename);
+#endif
+
+    delete[] chunking_base_name_;
+    chunking_base_name_ = temp;
+
+    // Name of the first cluster chunk.
+    if (!UpdateChunkName("chk", &chunk_name_))
+      return false;
+
+    // Writers are created lazily and reused across calls.
+    if (!chunk_writer_cluster_) {
+      chunk_writer_cluster_ = new (std::nothrow) MkvWriter();  // NOLINT
+      if (!chunk_writer_cluster_)
+        return false;
+    }
+
+    if (!chunk_writer_cues_) {
+      chunk_writer_cues_ = new (std::nothrow) MkvWriter();  // NOLINT
+      if (!chunk_writer_cues_)
+        return false;
+    }
+
+    if (!chunk_writer_header_) {
+      chunk_writer_header_ = new (std::nothrow) MkvWriter();  // NOLINT
+      if (!chunk_writer_header_)
+        return false;
+    }
+
+    if (!chunk_writer_cluster_->Open(chunk_name_))
+      return false;
+
+    // Build "<base name>.hdr" for the header chunk.
+    const size_t header_length = strlen(filename) + strlen(".hdr") + 1;
+    char* const header = new (std::nothrow) char[header_length];  // NOLINT
+    if (!header)
+      return false;
+
+#ifdef _MSC_VER
+    strcpy_s(header, header_length - strlen(".hdr"), chunking_base_name_);
+    strcat_s(header, header_length, ".hdr");
+#else
+    strcpy(header, chunking_base_name_);
+    strcat(header, ".hdr");
+#endif
+    if (!chunk_writer_header_->Open(header)) {
+      delete[] header;
+      return false;
+    }
+
+    // From here on all output goes through the chunk writers.
+    writer_cluster_ = chunk_writer_cluster_;
+    writer_cues_ = chunk_writer_cues_;
+    writer_header_ = chunk_writer_header_;
+
+    delete[] header;
+  }
+
+  chunking_ = chunking;
+
+  return true;
+}
+
+// Selects the track cue points are generated for. Returns false, leaving the
+// current selection unchanged, when |track_number| is not a known track.
+bool Segment::CuesTrack(uint64 track_number) {
+  if (GetTrackByNumber(track_number) == NULL)
+    return false;
+
+  cues_track_ = track_number;
+  return true;
+}
+
+// Requests that the next frame starts a new cluster; TestFrame() checks this
+// flag first and DoNewClusterProcessing() clears it.
+void Segment::ForceNewClusterOnNextFrame() {
+  force_new_cluster_ = true;
+}
+
+// Looks up a track by its number; the result is whatever
+// Tracks::GetTrackByNumber() returns for |track_number|.
+Track* Segment::GetTrackByNumber(uint64 track_number) const {
+  Track* const found = tracks_.GetTrackByNumber(track_number);
+  return found;
+}
+
+// Writes everything that precedes the first cluster: the EBML header, the
+// Segment ID with a placeholder size, the seek head (file mode on a seekable
+// writer only), segment info, tracks, and - when present - chapters and tags.
+// A seek entry is registered for each of those elements. Sets
+// header_written_ on success; returns false as soon as any write fails.
+bool Segment::WriteSegmentHeader() {
+  UpdateDocTypeVersion();
+
+  // TODO(fgalligan): Support more than one segment.
+  if (!WriteEbmlHeader(writer_header_, doc_type_version_))
+    return false;
+  // Remember which version and how many bytes were written so Finalize() can
+  // detect a changed version and verify a rewritten header.
+  doc_type_version_written_ = doc_type_version_;
+  ebml_header_size_ = static_cast<int32>(writer_header_->Position());
+
+  // Write the Segment element ID. Its size field follows below.
+  if (WriteID(writer_header_, kMkvSegment))
+    return false;
+
+  // Save for later.
+  size_position_ = writer_header_->Position();
+
+  // Write "unknown" (EBML coded -1) as segment size value. We need to write 8
+  // bytes because if we are going to overwrite the segment size later we do
+  // not know how big our segment will be.
+  if (SerializeInt(writer_header_, kEbmlUnknownValue, 8))
+    return false;
+
+  // Offsets passed to the seek head are measured from this position.
+  payload_pos_ = writer_header_->Position();
+
+  if (mode_ == kFile && writer_header_->Seekable()) {
+    // Set the duration > 0.0 so SegmentInfo will write out the duration. When
+    // the muxer is done writing we will set the correct duration and have
+    // SegmentInfo update it.
+    segment_info_.set_duration(1.0);
+
+    if (!seek_head_.Write(writer_header_))
+      return false;
+  }
+
+  if (!seek_head_.AddSeekEntry(kMkvInfo, MaxOffset()))
+    return false;
+  if (!segment_info_.Write(writer_header_))
+    return false;
+
+  if (!seek_head_.AddSeekEntry(kMkvTracks, MaxOffset()))
+    return false;
+  if (!tracks_.Write(writer_header_))
+    return false;
+
+  // Chapters and tags are optional and only written when present.
+  if (chapters_.Count() > 0) {
+    if (!seek_head_.AddSeekEntry(kMkvChapters, MaxOffset()))
+      return false;
+    if (!chapters_.Write(writer_header_))
+      return false;
+  }
+
+  if (tags_.Count() > 0) {
+    if (!seek_head_.AddSeekEntry(kMkvTags, MaxOffset()))
+      return false;
+    if (!tags_.Write(writer_header_))
+      return false;
+  }
+
+  // In live mode, or when the header cannot be sought, the header chunk is
+  // closed now; otherwise Finalize() closes it after patching the size.
+  if (chunking_ && (mode_ == kLive || !writer_header_->Seekable())) {
+    if (!chunk_writer_header_)
+      return false;
+
+    chunk_writer_header_->Close();
+  }
+
+  header_written_ = true;
+
+  return true;
+}
+
+// Here we are testing whether to create a new cluster, given a frame
+// having time frame_timestamp_ns.
+//
+// Return values:
+//   -1  error: the frame's timecode precedes the last cluster's timecode.
+//    0  write the frame to the existing last cluster.
+//    1  create a new cluster and write the frame into it.
+//    2  the frame is too far from the last cluster's timecode; the caller
+//       creates a new cluster and calls this function again.
+//
+int Segment::TestFrame(uint64 track_number, uint64 frame_timestamp_ns,
+                       bool is_key) const {
+  if (force_new_cluster_)
+    return 1;
+
+  // If no clusters have been created yet, then create a new cluster
+  // and write this frame immediately, in the new cluster.  This path
+  // should only be followed once, the first time we attempt to write
+  // a frame.
+
+  if (cluster_list_size_ <= 0)
+    return 1;
+
+  // There exists at least one cluster. We must compare the frame to
+  // the last cluster, in order to determine whether the frame is
+  // written to the existing cluster, or that a new cluster should be
+  // created.
+
+  const uint64 timecode_scale = segment_info_.timecode_scale();
+  const uint64 frame_timecode = frame_timestamp_ns / timecode_scale;
+
+  const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1];
+  const uint64 last_cluster_timecode = last_cluster->timecode();
+
+  // For completeness we test for the case when the frame's timecode
+  // is less than the cluster's timecode.  Although in principle that
+  // is allowed, this muxer doesn't actually write clusters like that,
+  // so this indicates a bug somewhere in our algorithm.
+
+  if (frame_timecode < last_cluster_timecode)  // should never happen
+    return -1;
+
+  // If the frame has a timestamp significantly larger than the last
+  // cluster (in Matroska, cluster-relative timestamps are serialized
+  // using a 16-bit signed integer), then we cannot write this frame
+  // to that cluster, and so we must create a new cluster.
+
+  const int64 delta_timecode = frame_timecode - last_cluster_timecode;
+
+  if (delta_timecode > kMaxBlockTimecode)
+    return 2;
+
+  // We decide to create a new cluster when we have a video keyframe.
+  // This will flush queued (audio) frames, and write the keyframe
+  // immediately, in the newly-created cluster.
+
+  if (is_key && tracks_.TrackIsVideo(track_number))
+    return 1;
+
+  // Create a new cluster if we have accumulated too many frames
+  // already, where "too many" is defined as "the total time of frames
+  // in the cluster exceeds a threshold".
+
+  const uint64 delta_ns = delta_timecode * timecode_scale;
+
+  // A max_cluster_duration_ of 0 disables this check.
+  if (max_cluster_duration_ > 0 && delta_ns >= max_cluster_duration_)
+    return 1;
+
+  // This is similar to the case above, with the difference that a new
+  // cluster is created when the size of the current cluster exceeds a
+  // threshold.
+
+  const uint64 cluster_size = last_cluster->payload_size();
+
+  // A max_cluster_size_ of 0 disables this check.
+  if (max_cluster_size_ > 0 && cluster_size >= max_cluster_size_)
+    return 1;
+
+  // There's no need to create a new cluster, so emit this frame now.
+
+  return 0;
+}
+
+// Starts a new cluster for a frame at |frame_timestamp_ns|.
+//
+// Grows the cluster table if needed, writes queued frames that belong to the
+// previous cluster, finalizes that cluster (file mode), rolls over to the
+// next chunk file (chunking), and then creates and initializes the new
+// cluster. The new cluster's timecode is the earlier of the frame's timecode
+// and the timecode of the first queued frame.
+//
+// Returns false on any allocation or write failure. cluster_list_size_ only
+// grows when the new cluster has been fully initialized.
+bool Segment::MakeNewCluster(uint64 frame_timestamp_ns) {
+  const int32 new_size = cluster_list_size_ + 1;
+
+  if (new_size > cluster_list_capacity_) {
+    // Add more clusters.
+    const int32 new_capacity =
+        (cluster_list_capacity_ <= 0) ? 1 : cluster_list_capacity_ * 2;
+    Cluster** const clusters =
+        new (std::nothrow) Cluster*[new_capacity];  // NOLINT
+    if (!clusters)
+      return false;
+
+    for (int32 i = 0; i < cluster_list_size_; ++i) {
+      clusters[i] = cluster_list_[i];
+    }
+
+    delete[] cluster_list_;
+
+    cluster_list_ = clusters;
+    cluster_list_capacity_ = new_capacity;
+  }
+
+  if (!WriteFramesLessThan(frame_timestamp_ns))
+    return false;
+
+  if (mode_ == kFile) {
+    if (cluster_list_size_ > 0) {
+      // Update old cluster's size
+      Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1];
+
+      if (!old_cluster || !old_cluster->Finalize())
+        return false;
+    }
+
+    // The first frame on the cues track in the new cluster gets a cue point.
+    if (output_cues_)
+      new_cuepoint_ = true;
+  }
+
+  // With chunking, each cluster after the first goes into its own file.
+  if (chunking_ && cluster_list_size_ > 0) {
+    chunk_writer_cluster_->Close();
+    chunk_count_++;
+
+    if (!UpdateChunkName("chk", &chunk_name_))
+      return false;
+    if (!chunk_writer_cluster_->Open(chunk_name_))
+      return false;
+  }
+
+  const uint64 timecode_scale = segment_info_.timecode_scale();
+  const uint64 frame_timecode = frame_timestamp_ns / timecode_scale;
+
+  uint64 cluster_timecode = frame_timecode;
+
+  if (frames_size_ > 0) {
+    const Frame* const f = frames_[0];  // earliest queued frame
+    const uint64 ns = f->timestamp();
+    const uint64 tc = ns / timecode_scale;
+
+    if (tc < cluster_timecode)
+      cluster_timecode = tc;
+  }
+
+  Cluster*& cluster = cluster_list_[cluster_list_size_];
+  const int64 offset = MaxOffset();
+  cluster = new (std::nothrow) Cluster(cluster_timecode,  // NOLINT
+                                       offset, segment_info_.timecode_scale());
+  if (!cluster)
+    return false;
+
+  if (!cluster->Init(writer_cluster_)) {
+    // This slot lies beyond cluster_list_size_, so the destructor would never
+    // reach it; release the cluster here instead of leaking it.
+    delete cluster;
+    cluster = NULL;
+    return false;
+  }
+
+  cluster_list_size_ = new_size;
+  return true;
+}
+
+// Decides where the frame goes: asks TestFrame() for a verdict, creates a new
+// cluster when asked to, and flushes the queued (audio) frames. The sequence
+// is repeated while TestFrame() reports 2 (frame too far from the cluster).
+// Returns false on any error.
+bool Segment::DoNewClusterProcessing(uint64 track_number,
+                                     uint64 frame_timestamp_ns, bool is_key) {
+  int verdict;
+  do {
+    verdict = TestFrame(track_number, frame_timestamp_ns, is_key);
+    if (verdict < 0)  // error
+      return false;
+
+    // The force flag is consumed by the TestFrame() call above.
+    force_new_cluster_ = false;
+
+    // Any positive verdict asks for a new cluster.
+    if (verdict > 0) {
+      if (!MakeNewCluster(frame_timestamp_ns))
+        return false;
+    }
+
+    // Write queued (audio) frames.
+    if (WriteFramesAll() < 0)  // error
+      return false;
+
+    // A verdict of 0 or 1 means the frame can be written now; 2 means the
+    // test must be repeated against the cluster that was just created.
+  } while (verdict > 1);
+
+  return true;
+}
+
+// Ensures the segment header has been written. On the first call it writes
+// the header, registers the Cluster seek entry and - when cues are enabled
+// and no cues track was chosen - picks the first video track, or else the
+// first track, as the cues track. Returns false if any of these steps fails.
+bool Segment::CheckHeaderInfo() {
+  if (header_written_)
+    return true;
+
+  if (!WriteSegmentHeader())
+    return false;
+
+  if (!seek_head_.AddSeekEntry(kMkvCluster, MaxOffset()))
+    return false;
+
+  // Nothing more to do when cues are off or a track was already selected.
+  if (!output_cues_ || cues_track_ != 0)
+    return true;
+
+  // Prefer the first video track.
+  for (uint32 idx = 0; idx < tracks_.track_entries_size(); ++idx) {
+    const Track* const candidate = tracks_.GetTrackByIndex(idx);
+    if (candidate == NULL)
+      return false;
+
+    if (tracks_.TrackIsVideo(candidate->number())) {
+      cues_track_ = candidate->number();
+      break;
+    }
+  }
+
+  if (cues_track_ != 0)
+    return true;
+
+  // No video track: fall back to the first track of any kind.
+  const Track* const first_track = tracks_.GetTrackByIndex(0);
+  if (first_track == NULL)
+    return false;
+
+  cues_track_ = first_track->number();
+  return true;
+}
+
+// Raises doc_type_version_ to 4 if it is lower and some track has a codec
+// delay or a seek pre-roll. Scanning stops at the first such track or at the
+// first missing track entry.
+void Segment::UpdateDocTypeVersion() {
+  const uint32 kRequiredVersion = 4;
+
+  for (uint32 idx = 0; idx < tracks_.track_entries_size(); ++idx) {
+    const Track* const entry = tracks_.GetTrackByIndex(idx);
+    if (entry == NULL)
+      return;
+
+    const bool uses_v4_feature = entry->codec_delay() || entry->seek_pre_roll();
+    if (uses_v4_feature && doc_type_version_ < kRequiredVersion) {
+      doc_type_version_ = kRequiredVersion;
+      return;
+    }
+  }
+}
+
+// Builds "<chunking_base_name_>_<chunk_count_ as 6 digits>.<ext>" in a newly
+// allocated string, frees the string |*name| currently points to, and stores
+// the new one there. Returns false if an argument is NULL or the allocation
+// fails; |*name| is left untouched in that case.
+bool Segment::UpdateChunkName(const char* ext, char** name) const {
+  if (ext == NULL || name == NULL)
+    return false;
+
+  // Suffix: underscore, zero-padded chunk counter, dot, extension.
+  char suffix[64];
+#ifdef _MSC_VER
+  sprintf_s(suffix, sizeof(suffix), "_%06d.%s", chunk_count_, ext);
+#else
+  snprintf(suffix, sizeof(suffix), "_%06d.%s", chunk_count_, ext);
+#endif
+
+  const size_t suffix_len = strlen(suffix);
+  const size_t total_len = strlen(chunking_base_name_) + suffix_len + 1;
+  char* const full_name = new (std::nothrow) char[total_len];  // NOLINT
+  if (full_name == NULL)
+    return false;
+
+#ifdef _MSC_VER
+  strcpy_s(full_name, total_len - suffix_len, chunking_base_name_);
+  strcat_s(full_name, total_len, suffix);
+#else
+  strcpy(full_name, chunking_base_name_);
+  strcat(full_name, suffix);
+#endif
+
+  // Replace the caller's string only after the new one is complete.
+  delete[] *name;
+  *name = full_name;
+
+  return true;
+}
+
+// Returns the current offset relative to the start of the segment payload,
+// or -1 when no header writer is set. With chunking, the sizes of all
+// clusters and the cues writer's position are added to the header writer's
+// offset.
+int64 Segment::MaxOffset() {
+  if (writer_header_ == NULL)
+    return -1;
+
+  int64 total = writer_header_->Position() - payload_pos_;
+  if (!chunking_)
+    return total;
+
+  for (int32 idx = 0; idx < cluster_list_size_; ++idx)
+    total += cluster_list_[idx]->Size();
+
+  if (writer_cues_ != NULL)
+    total += writer_cues_->Position();
+
+  return total;
+}
+
+// Appends |frame| to the pending-frame queue, doubling the queue's capacity
+// (starting at 2) when it is full. The pointer is stored only on success.
+// Returns false if the capacity cannot grow or the allocation fails.
+bool Segment::QueueFrame(Frame* frame) {
+  if (frames_size_ + 1 > frames_capacity_) {
+    // Add more frames.
+    int32 grown_capacity = 2;
+    if (frames_capacity_)
+      grown_capacity = frames_capacity_ * 2;
+
+    if (grown_capacity < 1)
+      return false;
+
+    Frame** const grown = new (std::nothrow) Frame*[grown_capacity];  // NOLINT
+    if (grown == NULL)
+      return false;
+
+    // Carry the existing entries over to the larger table.
+    for (int32 idx = 0; idx < frames_size_; ++idx)
+      grown[idx] = frames_[idx];
+
+    delete[] frames_;
+    frames_ = grown;
+    frames_capacity_ = grown_capacity;
+  }
+
+  frames_[frames_size_] = frame;
+  ++frames_size_;
+
+  return true;
+}
+
+// Writes every queued frame to the last cluster, deleting each frame after
+// it has been written, and empties the queue.
+// Returns the number of frames written, 0 if the queue was never allocated,
+// or -1 on error (no cluster available, or a frame or cue point could not be
+// added).
+int Segment::WriteFramesAll() {
+  if (frames_ == NULL)
+    return 0;
+
+  if (cluster_list_size_ < 1)
+    return -1;
+
+  Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
+
+  if (!cluster)
+    return -1;
+
+  for (int32 i = 0; i < frames_size_; ++i) {
+    // Reference into the queue so the slot can be cleared after deletion.
+    Frame*& frame = frames_[i];
+    // TODO(jzern/vigneshv): using Segment::AddGenericFrame here would limit the
+    // places where |doc_type_version_| needs to be updated.
+    if (frame->discard_padding() != 0)
+      doc_type_version_ = 4;
+    if (!cluster->AddFrame(frame))
+      return -1;
+
+    if (new_cuepoint_ && cues_track_ == frame->track_number()) {
+      if (!AddCuePoint(frame->timestamp(), cues_track_))
+        return -1;
+    }
+
+    // Timestamps are only moved forward, never back.
+    if (frame->timestamp() > last_timestamp_) {
+      last_timestamp_ = frame->timestamp();
+      last_track_timestamp_[frame->track_number() - 1] = frame->timestamp();
+    }
+
+    delete frame;
+    frame = NULL;
+  }
+
+  // The queue keeps its capacity; only the element count is reset.
+  const int result = frames_size_;
+  frames_size_ = 0;
+
+  return result;
+}
+
+// Writes queued frames to the last cluster, as long as the frame following
+// them in the queue has a timestamp not greater than |timestamp|. Written
+// frames are deleted and the remaining frames are shifted to the front of
+// the queue. Does nothing when the queue is empty or no cluster exists yet.
+// Returns false if the queue or cluster is missing or a write fails.
+//
+// NOTE(review): the early returns inside and after the loop happen after
+// earlier frames were already deleted but before the queue is compacted, so
+// the queue may still hold pointers to deleted frames - confirm callers never
+// touch the queue again after a failure.
+bool Segment::WriteFramesLessThan(uint64 timestamp) {
+  // Check |cluster_list_size_| to see if this is the first cluster. If it is
+  // the first cluster the audio frames that are less than the first video
+  // timestamp will be written in a later step.
+  if (frames_size_ > 0 && cluster_list_size_ > 0) {
+    if (!frames_)
+      return false;
+
+    Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
+    if (!cluster)
+      return false;
+
+    // Number of frames written (and deleted) from the front of the queue.
+    int32 shift_left = 0;
+
+    // TODO(fgalligan): Change this to use the durations of frames instead of
+    // the next frame's start time if the duration is accurate.
+    // The loop starts at 1: frame i - 1 is written only while frame i is not
+    // later than |timestamp|, so the last queued frame is never written here.
+    for (int32 i = 1; i < frames_size_; ++i) {
+      const Frame* const frame_curr = frames_[i];
+
+      if (frame_curr->timestamp() > timestamp)
+        break;
+
+      const Frame* const frame_prev = frames_[i - 1];
+      if (frame_prev->discard_padding() != 0)
+        doc_type_version_ = 4;
+      if (!cluster->AddFrame(frame_prev))
+        return false;
+
+      if (new_cuepoint_ && cues_track_ == frame_prev->track_number()) {
+        if (!AddCuePoint(frame_prev->timestamp(), cues_track_))
+          return false;
+      }
+
+      ++shift_left;
+      // Timestamps are only moved forward, never back.
+      if (frame_prev->timestamp() > last_timestamp_) {
+        last_timestamp_ = frame_prev->timestamp();
+        last_track_timestamp_[frame_prev->track_number() - 1] =
+            frame_prev->timestamp();
+      }
+
+      delete frame_prev;
+    }
+
+    if (shift_left > 0) {
+      if (shift_left >= frames_size_)
+        return false;
+
+      // Move the unwritten frames to the front of the queue.
+      const int32 new_frames_size = frames_size_ - shift_left;
+      for (int32 i = 0; i < new_frames_size; ++i) {
+        frames_[i] = frames_[i + shift_left];
+      }
+
+      frames_size_ = new_frames_size;
+    }
+  }
+
+  return true;
+}
+
+}  // namespace mkvmuxer
diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer.hpp b/libs/libvpx/third_party/libwebm/mkvmuxer.hpp
new file mode 100644
index 0000000000..03a002c93b
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/mkvmuxer.hpp
@@ -0,0 +1,1492 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXER_HPP
+#define MKVMUXER_HPP
+
+#include "mkvmuxertypes.hpp"
+
+// For a description of the WebM elements see
+// http://www.webmproject.org/code/specs/container/.
+
+namespace mkvparser {
+class IMkvReader;
+}  // namespace mkvparser
+
+namespace mkvmuxer {
+
+class MkvWriter;
+class Segment;
+
+const uint64 kMaxTrackNumber = 126;
+
+///////////////////////////////////////////////////////////////
+// Abstract interface used by the mkvmuxer to write out the Mkv data.
+class IMkvWriter {
+ public:
+  // Writes out |len| bytes of |buf|. Returns 0 on success.
+  virtual int32 Write(const void* buf, uint32 len) = 0;
+
+  // Returns the offset of the output position from the beginning of the
+  // output.
+  virtual int64 Position() const = 0;
+
+  // Sets the current output position to |position|. Returns 0 on success.
+  virtual int32 Position(int64 position) = 0;
+
+  // Returns true if the writer is seekable.
+  virtual bool Seekable() const = 0;
+
+  // Element start notification. Called whenever an element identifier is about
+  // to be written to the stream. |element_id| is the element identifier, and
+  // |position| is the location in the WebM stream where the first octet of the
+  // element identifier will be written.
+  // Note: the |MkvId| enumeration in webmids.hpp defines element values.
+  virtual void ElementStartNotify(uint64 element_id, int64 position) = 0;
+
+ protected:
+  IMkvWriter();
+  virtual ~IMkvWriter();
+
+ private:
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(IMkvWriter);
+};
+
+// Writes out the EBML header for a WebM file with |doc_type_version|. Must be
+// called before any other libwebm writing functions are called.
+bool WriteEbmlHeader(IMkvWriter* writer, uint64 doc_type_version);
+
+// Deprecated. Writes out EBML header with doc_type_version as
+// kDefaultDocTypeVersion. Exists for backward compatibility.
+bool WriteEbmlHeader(IMkvWriter* writer);
+
+// Copies the range given by |start| and |size| from |source| to |dst| in chunks.
+bool ChunkedCopy(mkvparser::IMkvReader* source, IMkvWriter* dst, int64 start,
+                 int64 size);
+
+///////////////////////////////////////////////////////////////
+// Class to hold data that will be written to a block.
+class Frame {
+ public:
+  Frame();
+  ~Frame();
+
+  // Sets this frame's contents based on |frame|. Returns true on success. On
+  // failure, this frame's existing contents may be lost.
+  bool CopyFrom(const Frame& frame);
+
+  // Copies |frame| data into |frame_|. Returns true on success.
+  bool Init(const uint8* frame, uint64 length);
+
+  // Copies |additional| data into |additional_|. Returns true on success.
+  bool AddAdditionalData(const uint8* additional, uint64 length, uint64 add_id);
+
+  // Returns true if the frame has valid parameters.
+  bool IsValid() const;
+
+  // Returns true if the frame can be written as a SimpleBlock based on current
+  // parameters.
+  bool CanBeSimpleBlock() const;
+
+  uint64 add_id() const { return add_id_; }
+  const uint8* additional() const { return additional_; }
+  uint64 additional_length() const { return additional_length_; }
+  void set_duration(uint64 duration) { duration_ = duration; }
+  uint64 duration() const { return duration_; }
+  const uint8* frame() const { return frame_; }
+  void set_is_key(bool key) { is_key_ = key; }
+  bool is_key() const { return is_key_; }
+  uint64 length() const { return length_; }
+  void set_track_number(uint64 track_number) { track_number_ = track_number; }
+  uint64 track_number() const { return track_number_; }
+  void set_timestamp(uint64 timestamp) { timestamp_ = timestamp; }
+  uint64 timestamp() const { return timestamp_; }
+  void set_discard_padding(int64 discard_padding) {
+    discard_padding_ = discard_padding;
+  }
+  int64 discard_padding() const { return discard_padding_; }
+  void set_reference_block_timestamp(int64 reference_block_timestamp);
+  int64 reference_block_timestamp() const { return reference_block_timestamp_; }
+  bool reference_block_timestamp_set() const {
+    return reference_block_timestamp_set_;
+  }
+
+ private:
+  // Id of the additional data.
+  uint64 add_id_;
+
+  // Pointer to additional data. Owned by this class.
+  uint8* additional_;
+
+  // Length of the additional data.
+  uint64 additional_length_;
+
+  // Duration of the frame in nanoseconds.
+  uint64 duration_;
+
+  // Pointer to the frame data. Owned by this class.
+  uint8* frame_;
+
+  // Flag telling if the data should set the key flag of a block.
+  bool is_key_;
+
+  // Length of the frame data.
+  uint64 length_;
+
+  // Mkv track number the data is associated with.
+  uint64 track_number_;
+
+  // Timestamp of the data in nanoseconds.
+  uint64 timestamp_;
+
+  // Discard padding for the frame.
+  int64 discard_padding_;
+
+  // Reference block timestamp.
+  int64 reference_block_timestamp_;
+
+  // Flag indicating if |reference_block_timestamp_| has been set.
+  bool reference_block_timestamp_set_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Frame);
+};
+
+///////////////////////////////////////////////////////////////
+// Class to hold one cue point in a Cues element.
+class CuePoint {
+ public:
+  CuePoint();
+  ~CuePoint();
+
+  // Returns the size in bytes for the entire CuePoint element.
+  uint64 Size() const;
+
+  // Outputs the CuePoint element to the writer. Returns true on success.
+  bool Write(IMkvWriter* writer) const;
+
+  void set_time(uint64 time) { time_ = time; }
+  uint64 time() const { return time_; }
+  void set_track(uint64 track) { track_ = track; }
+  uint64 track() const { return track_; }
+  void set_cluster_pos(uint64 cluster_pos) { cluster_pos_ = cluster_pos; }
+  uint64 cluster_pos() const { return cluster_pos_; }
+  void set_block_number(uint64 block_number) { block_number_ = block_number; }
+  uint64 block_number() const { return block_number_; }
+  void set_output_block_number(bool output_block_number) {
+    output_block_number_ = output_block_number;
+  }
+  bool output_block_number() const { return output_block_number_; }
+
+ private:
+  // Returns the size in bytes for the payload of the CuePoint element.
+  uint64 PayloadSize() const;
+
+  // Absolute timecode according to the segment time base.
+  uint64 time_;
+
+  // The Track element associated with the CuePoint.
+  uint64 track_;
+
+  // The position of the Cluster containing the Block.
+  uint64 cluster_pos_;
+
+  // Number of the Block within the Cluster, starting from 1.
+  uint64 block_number_;
+
+  // If true, the muxer writes out the block number for the cue when it differs
+  // from the default of 1. Default is set to true.
+  bool output_block_number_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(CuePoint);
+};
+
+///////////////////////////////////////////////////////////////
+// Cues element. Holds the list of CuePoint entries.
+class Cues {
+ public:
+  Cues();
+  ~Cues();
+
+  // Adds a cue point to the Cues element. Returns true on success.
+  bool AddCue(CuePoint* cue);
+
+  // Returns the cue point at |index|. Returns NULL if there is no cue point
+  // match.
+  CuePoint* GetCueByIndex(int32 index) const;
+
+  // Returns the total size of the Cues element.
+  uint64 Size();
+
+  // Outputs the Cues element to the writer. Returns true on success.
+  bool Write(IMkvWriter* writer) const;
+
+  int32 cue_entries_size() const { return cue_entries_size_; }
+  void set_output_block_number(bool output_block_number) {
+    output_block_number_ = output_block_number;
+  }
+  bool output_block_number() const { return output_block_number_; }
+
+ private:
+  // Number of allocated elements in |cue_entries_|.
+  int32 cue_entries_capacity_;
+
+  // Number of CuePoints in |cue_entries_|.
+  int32 cue_entries_size_;
+
+  // CuePoint list.
+  CuePoint** cue_entries_;
+
+  // If true, the muxer writes out the block number for the cue when it differs
+  // from the default of 1. Default is set to true.
+  bool output_block_number_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Cues);
+};
+
+///////////////////////////////////////////////////////////////
+// ContentEncAESSettings element. Exposes the AES cipher mode (read-only).
+class ContentEncAESSettings {
+ public:
+  enum { kCTR = 1 };
+
+  ContentEncAESSettings();
+  ~ContentEncAESSettings() {}
+
+  // Returns the size in bytes for the ContentEncAESSettings element.
+  uint64 Size() const;
+
+  // Writes out the ContentEncAESSettings element to |writer|. Returns true on
+  // success.
+  bool Write(IMkvWriter* writer) const;
+
+  uint64 cipher_mode() const { return cipher_mode_; }
+
+ private:
+  // Returns the size in bytes for the payload of the ContentEncAESSettings
+  // element.
+  uint64 PayloadSize() const;
+
+  // Sub-element values.
+  uint64 cipher_mode_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncAESSettings);
+};
+
+///////////////////////////////////////////////////////////////
+// ContentEncoding element
+// Elements used to describe if the track data has been encrypted or
+// compressed with zlib or header stripping.
+// Currently only whole frames can be encrypted with AES. This dictates that
+// ContentEncodingOrder will be 0, ContentEncodingScope will be 1,
+// ContentEncodingType will be 1, and ContentEncAlgo will be 5.
+class ContentEncoding {
+ public:
+  ContentEncoding();
+  ~ContentEncoding();
+
+  // Sets the content encryption id. Copies |length| bytes from |id| to
+  // |enc_key_id_|. Returns true on success.
+  bool SetEncryptionID(const uint8* id, uint64 length);
+
+  // Returns the size in bytes for the ContentEncoding element.
+  uint64 Size() const;
+
+  // Writes out the ContentEncoding element to |writer|. Returns true on
+  // success.
+  bool Write(IMkvWriter* writer) const;
+
+  uint64 enc_algo() const { return enc_algo_; }
+  uint64 encoding_order() const { return encoding_order_; }
+  uint64 encoding_scope() const { return encoding_scope_; }
+  uint64 encoding_type() const { return encoding_type_; }
+  ContentEncAESSettings* enc_aes_settings() { return &enc_aes_settings_; }
+
+ private:
+  // Returns the size in bytes for the encoding elements.
+  uint64 EncodingSize(uint64 compresion_size, uint64 encryption_size) const;
+
+  // Returns the size in bytes for the encryption elements.
+  uint64 EncryptionSize() const;
+
+  // ContentEncoding sub-element values.
+  uint64 enc_algo_;
+  uint8* enc_key_id_;
+  uint64 encoding_order_;
+  uint64 encoding_scope_;
+  uint64 encoding_type_;
+
+  // ContentEncAESSettings element.
+  ContentEncAESSettings enc_aes_settings_;
+
+  // Size of the ContentEncKeyID data in bytes.
+  uint64 enc_key_id_length_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding);
+};
+
+///////////////////////////////////////////////////////////////
+// Track element. Base class of VideoTrack and AudioTrack.
+class Track {
+ public:
+  // The |seed| parameter is used to synthesize a UID for the track.
+  explicit Track(unsigned int* seed);
+  virtual ~Track();
+
+  // Adds a ContentEncoding element to the Track. Returns true on success.
+  virtual bool AddContentEncoding();
+
+  // Returns the ContentEncoding at |index|. Returns NULL if there is no
+  // ContentEncoding match.
+  ContentEncoding* GetContentEncodingByIndex(uint32 index) const;
+
+  // Returns the size in bytes for the payload of the Track element.
+  virtual uint64 PayloadSize() const;
+
+  // Returns the size in bytes of the Track element.
+  virtual uint64 Size() const;
+
+  // Outputs the Track element to the writer. Returns true on success.
+  virtual bool Write(IMkvWriter* writer) const;
+
+  // Sets the CodecPrivate element of the Track element. Copies |length|
+  // bytes from |codec_private| to |codec_private_|. Returns true on success.
+  bool SetCodecPrivate(const uint8* codec_private, uint64 length);
+
+  void set_codec_id(const char* codec_id);
+  const char* codec_id() const { return codec_id_; }
+  const uint8* codec_private() const { return codec_private_; }
+  void set_language(const char* language);
+  const char* language() const { return language_; }
+  void set_max_block_additional_id(uint64 max_block_additional_id) {
+    max_block_additional_id_ = max_block_additional_id;
+  }
+  uint64 max_block_additional_id() const { return max_block_additional_id_; }
+  void set_name(const char* name);
+  const char* name() const { return name_; }
+  void set_number(uint64 number) { number_ = number; }
+  uint64 number() const { return number_; }
+  void set_type(uint64 type) { type_ = type; }
+  uint64 type() const { return type_; }
+  void set_uid(uint64 uid) { uid_ = uid; }
+  uint64 uid() const { return uid_; }
+  void set_codec_delay(uint64 codec_delay) { codec_delay_ = codec_delay; }
+  uint64 codec_delay() const { return codec_delay_; }
+  void set_seek_pre_roll(uint64 seek_pre_roll) {
+    seek_pre_roll_ = seek_pre_roll;
+  }
+  uint64 seek_pre_roll() const { return seek_pre_roll_; }
+  void set_default_duration(uint64 default_duration) {
+    default_duration_ = default_duration;
+  }
+  uint64 default_duration() const { return default_duration_; }
+
+  uint64 codec_private_length() const { return codec_private_length_; }
+  uint32 content_encoding_entries_size() const {
+    return content_encoding_entries_size_;
+  }
+
+ private:
+  // Track sub-element values.
+  char* codec_id_;
+  uint8* codec_private_;
+  char* language_;
+  uint64 max_block_additional_id_;
+  char* name_;
+  uint64 number_;
+  uint64 type_;
+  uint64 uid_;
+  uint64 codec_delay_;
+  uint64 seek_pre_roll_;
+  uint64 default_duration_;
+
+  // Size of the CodecPrivate data in bytes.
+  uint64 codec_private_length_;
+
+  // ContentEncoding element list.
+  ContentEncoding** content_encoding_entries_;
+
+  // Number of ContentEncoding elements added.
+  uint32 content_encoding_entries_size_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Track);
+};
+
+///////////////////////////////////////////////////////////////
+// Track that has video-specific elements.
+class VideoTrack : public Track {
+ public:
+  // Supported modes for stereo 3D.
+  enum StereoMode {
+    kMono = 0,
+    kSideBySideLeftIsFirst = 1,
+    kTopBottomRightIsFirst = 2,
+    kTopBottomLeftIsFirst = 3,
+    kSideBySideRightIsFirst = 11
+  };
+
+  enum AlphaMode { kNoAlpha = 0, kAlpha = 1 };
+
+  // The |seed| parameter is used to synthesize a UID for the track.
+  explicit VideoTrack(unsigned int* seed);
+  virtual ~VideoTrack();
+
+  // Returns the size in bytes for the payload of the Track element plus the
+  // video-specific elements.
+  virtual uint64 PayloadSize() const;
+
+  // Outputs the VideoTrack element to the writer. Returns true on success.
+  virtual bool Write(IMkvWriter* writer) const;
+
+  // Sets the video's stereo mode. Returns true on success.
+  bool SetStereoMode(uint64 stereo_mode);
+
+  // Sets the video's alpha mode. Returns true on success.
+  bool SetAlphaMode(uint64 alpha_mode);
+
+  void set_display_height(uint64 height) { display_height_ = height; }
+  uint64 display_height() const { return display_height_; }
+  void set_display_width(uint64 width) { display_width_ = width; }
+  uint64 display_width() const { return display_width_; }
+
+  void set_crop_left(uint64 crop_left) { crop_left_ = crop_left; }
+  uint64 crop_left() const { return crop_left_; }
+  void set_crop_right(uint64 crop_right) { crop_right_ = crop_right; }
+  uint64 crop_right() const { return crop_right_; }
+  void set_crop_top(uint64 crop_top) { crop_top_ = crop_top; }
+  uint64 crop_top() const { return crop_top_; }
+  void set_crop_bottom(uint64 crop_bottom) { crop_bottom_ = crop_bottom; }
+  uint64 crop_bottom() const { return crop_bottom_; }
+
+  void set_frame_rate(double frame_rate) { frame_rate_ = frame_rate; }
+  double frame_rate() const { return frame_rate_; }
+  void set_height(uint64 height) { height_ = height; }
+  uint64 height() const { return height_; }
+  uint64 stereo_mode() const { return stereo_mode_; }
+  uint64 alpha_mode() const { return alpha_mode_; }
+  void set_width(uint64 width) { width_ = width; }
+  uint64 width() const { return width_; }
+
+ private:
+  // Returns the size in bytes of the Video element.
+  uint64 VideoPayloadSize() const;
+
+  // Video track sub-element values.
+  uint64 display_height_;
+  uint64 display_width_;
+  uint64 crop_left_;
+  uint64 crop_right_;
+  uint64 crop_top_;
+  uint64 crop_bottom_;
+  double frame_rate_;
+  uint64 height_;
+  uint64 stereo_mode_;
+  uint64 alpha_mode_;
+  uint64 width_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(VideoTrack);
+};
+
+///////////////////////////////////////////////////////////////
+// Track that has audio-specific elements.
+class AudioTrack : public Track {
+ public:
+  // The |seed| parameter is used to synthesize a UID for the track.
+  explicit AudioTrack(unsigned int* seed);
+  virtual ~AudioTrack();
+
+  // Returns the size in bytes for the payload of the Track element plus the
+  // audio-specific elements.
+  virtual uint64 PayloadSize() const;
+
+  // Outputs the AudioTrack element to the writer. Returns true on success.
+  virtual bool Write(IMkvWriter* writer) const;
+
+  void set_bit_depth(uint64 bit_depth) { bit_depth_ = bit_depth; }
+  uint64 bit_depth() const { return bit_depth_; }
+  void set_channels(uint64 channels) { channels_ = channels; }
+  uint64 channels() const { return channels_; }
+  void set_sample_rate(double sample_rate) { sample_rate_ = sample_rate; }
+  double sample_rate() const { return sample_rate_; }
+
+ private:
+  // Audio track sub-element values.
+  uint64 bit_depth_;
+  uint64 channels_;
+  double sample_rate_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(AudioTrack);
+};
+
+///////////////////////////////////////////////////////////////
+// Tracks element. Holds the list of Track entries.
+class Tracks {
+ public:
+  // Audio and video track types defined by the Matroska specs.
+  enum { kVideo = 0x1, kAudio = 0x2 };
+
+  static const char kOpusCodecId[];
+  static const char kVorbisCodecId[];
+  static const char kVp8CodecId[];
+  static const char kVp9CodecId[];
+  static const char kVp10CodecId[];
+
+  Tracks();
+  ~Tracks();
+
+  // Adds a Track element to the Tracks object. |track| will be owned and
+  // deleted by the Tracks object. Returns true on success. |number| is the
+  // number to use for the track. |number| must be >= 0. If |number| == 0
+  // then the muxer will decide on the track number.
+  bool AddTrack(Track* track, int32 number);
+
+  // Returns the track at index |idx|. Returns NULL if there is no track match.
+  const Track* GetTrackByIndex(uint32 idx) const;
+
+  // Searches the Tracks and returns the track that matches |track_number|.
+  // Returns NULL if there is no track match.
+  Track* GetTrackByNumber(uint64 track_number) const;
+
+  // Returns true if the track number is an audio track.
+  bool TrackIsAudio(uint64 track_number) const;
+
+  // Returns true if the track number is a video track.
+  bool TrackIsVideo(uint64 track_number) const;
+
+  // Outputs the Tracks element to the writer. Returns true on success.
+  bool Write(IMkvWriter* writer) const;
+
+  uint32 track_entries_size() const { return track_entries_size_; }
+
+ private:
+  // Track element list.
+  Track** track_entries_;
+
+  // Number of Track elements added.
+  uint32 track_entries_size_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tracks);
+};
+
+///////////////////////////////////////////////////////////////
+// Chapter element
+// Describes one chapter: id, start/stop timecodes, uid and display titles.
+class Chapter {
+ public:
+  // Sets the identifier for this chapter.  (This corresponds to the
+  // Cue Identifier line in WebVTT.)
+  // TODO(matthewjheaney): the actual serialization of this item in
+  // MKV is pending.
+  bool set_id(const char* id);
+
+  // Converts the nanosecond start and stop times of this chapter to
+  // their corresponding timecode values, and stores them that way.
+  void set_time(const Segment& segment, uint64 start_time_ns,
+                uint64 end_time_ns);
+
+  // Sets the uid for this chapter. Primarily used to enable
+  // deterministic output from the muxer.
+  void set_uid(const uint64 uid) { uid_ = uid; }
+
+  // Adds a title string to this chapter, per the semantics described
+  // here:
+  //  http://www.matroska.org/technical/specs/index.html
+  //
+  // The title ("chapter string") is a UTF-8 string.
+  //
+  // The language has ISO 639-2 representation, described here:
+  //  http://www.loc.gov/standards/iso639-2/englangn.html
+  //  http://www.loc.gov/standards/iso639-2/php/English_list.php
+  // If you specify NULL as the language value, this implies
+  // English ("eng").
+  //
+  // The country value corresponds to the codes listed here:
+  //  http://www.iana.org/domains/root/db/
+  //
+  // The function returns false if the string could not be allocated.
+  bool add_string(const char* title, const char* language, const char* country);
+
+ private:
+  friend class Chapters;
+
+  // For storage of chapter titles that differ by language.
+  class Display {
+   public:
+    // Establishes the representation invariant for a new Display object.
+    void Init();
+
+    // Reclaims resources, in anticipation of destruction.
+    void Clear();
+
+    // Copies the title to the |title_| member.  Returns false on
+    // error.
+    bool set_title(const char* title);
+
+    // Copies the language to the |language_| member.  Returns false
+    // on error.
+    bool set_language(const char* language);
+
+    // Copies the country to the |country_| member.  Returns false on
+    // error.
+    bool set_country(const char* country);
+
+    // If |writer| is non-NULL, serializes the Display sub-element of
+    // the Atom into the stream.  Returns the Display element size on
+    // success, 0 if error.
+    uint64 WriteDisplay(IMkvWriter* writer) const;
+
+   private:
+    char* title_;
+    char* language_;
+    char* country_;
+  };
+
+  Chapter();
+  ~Chapter();
+
+  // Establishes the representation invariant for a newly-created
+  // Chapter object.  The |seed| parameter is used to create the UID
+  // for this chapter atom.
+  void Init(unsigned int* seed);
+
+  // Copies this Chapter object to a different one.  This is used when
+  // expanding a plain array of Chapter objects (see Chapters).
+  void ShallowCopy(Chapter* dst) const;
+
+  // Reclaims resources used by this Chapter object, pending its
+  // destruction.
+  void Clear();
+
+  // If there is no storage remaining on the |displays_| array for a
+  // new display object, creates a new, longer array and copies the
+  // existing Display objects to the new array.  Returns false if the
+  // array cannot be expanded.
+  bool ExpandDisplaysArray();
+
+  // If |writer| is non-NULL, serializes the Atom sub-element into the
+  // stream.  Returns the total size of the element on success, 0 if
+  // error.
+  uint64 WriteAtom(IMkvWriter* writer) const;
+
+  // The string identifier for this chapter (corresponds to WebVTT cue
+  // identifier).
+  char* id_;
+
+  // Start timecode of the chapter.
+  uint64 start_timecode_;
+
+  // Stop timecode of the chapter.
+  uint64 end_timecode_;
+
+  // The binary identifier for this chapter.
+  uint64 uid_;
+
+  // The Atom element can contain multiple Display sub-elements, as
+  // the same logical title can be rendered in different languages.
+  Display* displays_;
+
+  // The physical length (total size) of the |displays_| array.
+  int displays_size_;
+
+  // The logical length (number of active elements) on the |displays_|
+  // array.
+  int displays_count_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Chapter);
+};
+
+///////////////////////////////////////////////////////////////
+// Chapters element
+// Stores Chapter objects in a growable array (see ExpandChaptersArray).
+class Chapters {
+ public:
+  Chapters();
+  ~Chapters();
+
+  Chapter* AddChapter(unsigned int* seed);
+
+  // Returns the number of chapters that have been added.
+  int Count() const;
+
+  // Outputs the Chapters element to the writer. Returns true on success.
+  bool Write(IMkvWriter* writer) const;
+
+ private:
+  // Expands the chapters_ array if there is not enough space to contain
+  // another chapter object.  Returns true on success.
+  bool ExpandChaptersArray();
+
+  // If |writer| is non-NULL, serializes the Edition sub-element of the
+  // Chapters element into the stream.  Returns the Edition element
+  // size on success, 0 if error.
+  uint64 WriteEdition(IMkvWriter* writer) const;
+
+  // Total length of the chapters_ array.
+  int chapters_size_;
+
+  // Number of active chapters on the chapters_ array.
+  int chapters_count_;
+
+  // Array for storage of chapter objects.
+  Chapter* chapters_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Chapters);
+};
+
+///////////////////////////////////////////////////////////////
+// Tag element
+// Groups SimpleTag (name/string) sub-elements; see add_simple_tag().
+class Tag {
+ public:
+  bool add_simple_tag(const char* tag_name, const char* tag_string);
+
+ private:
+  // Tags calls Clear and the destructor of Tag
+  friend class Tags;
+
+  // For storage of simple tags
+  class SimpleTag {
+   public:
+    // Establishes the representation invariant for a new SimpleTag object.
+    void Init();
+
+    // Reclaims resources, in anticipation of destruction.
+    void Clear();
+
+    // Copies the tag name to the |tag_name_| member.  Returns false
+    // on error.
+    bool set_tag_name(const char* tag_name);
+
+    // Copies the tag string to the |tag_string_| member.  Returns
+    // false on error.
+    bool set_tag_string(const char* tag_string);
+
+    // If |writer| is non-NULL, serializes the SimpleTag sub-element of
+    // the Tag into the stream.  Returns the SimpleTag element size on
+    // success, 0 if error.
+    uint64 Write(IMkvWriter* writer) const;
+
+   private:
+    char* tag_name_;
+    char* tag_string_;
+  };
+
+  Tag();
+  ~Tag();
+
+  // Copies this Tag object to a different one.  This is used when
+  // expanding a plain array of Tag objects (see Tags).
+  void ShallowCopy(Tag* dst) const;
+
+  // Reclaims resources used by this Tag object, pending its
+  // destruction.
+  void Clear();
+
+  // If there is no storage remaining on the |simple_tags_| array for a
+  // new SimpleTag object, creates a new, longer array and copies the
+  // existing SimpleTag objects to the new array.  Returns false if the
+  // array cannot be expanded.
+  bool ExpandSimpleTagsArray();
+
+  // If |writer| is non-NULL, serializes the Tag sub-element into the
+  // stream.  Returns the total size of the element on success, 0 if
+  // error.
+  uint64 Write(IMkvWriter* writer) const;
+
+  // The Tag element can contain multiple SimpleTag sub-elements
+  SimpleTag* simple_tags_;
+
+  // The physical length (total size) of the |simple_tags_| array.
+  int simple_tags_size_;
+
+  // The logical length (number of active elements) on the |simple_tags_|
+  // array.
+  int simple_tags_count_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tag);
+};
+
+///////////////////////////////////////////////////////////////
+// Tags element
+// Stores Tag objects in a growable array (see ExpandTagsArray).
+class Tags {
+ public:
+  Tags();
+  ~Tags();
+
+  Tag* AddTag();
+
+  // Returns the number of tags that have been added.
+  int Count() const;
+
+  // Outputs the Tags element to the writer. Returns true on success.
+  bool Write(IMkvWriter* writer) const;
+
+ private:
+  // Expands the tags_ array if there is not enough space to contain
+  // another tag object.  Returns true on success.
+  bool ExpandTagsArray();
+
+  // Total length of the tags_ array.
+  int tags_size_;
+
+  // Number of active tags on the tags_ array.
+  int tags_count_;
+
+  // Array for storage of tag objects.
+  Tag* tags_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tags);
+};
+
+///////////////////////////////////////////////////////////////
+// Cluster element
+//
+// Notes:
+//  |Init| must be called before any other method in this class.
+class Cluster {
+ public:
+  // |timecode| is the absolute timecode of the cluster. |cues_pos| is the
+  // position for the cluster within the segment that should be written in
+  // the cues element. |timecode_scale| is the timecode scale of the segment.
+  Cluster(uint64 timecode, int64 cues_pos, uint64 timecode_scale);
+  ~Cluster();
+
+  bool Init(IMkvWriter* ptr_writer);
+
+  // Adds a frame to be output in the file. The frame is written out through
+  // |writer_| if successful. Returns true on success.
+  bool AddFrame(const Frame* frame);
+
+  // Adds a frame to be output in the file. The frame is written out through
+  // |writer_| if successful. Returns true on success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.  The range of allowed values is [1, 126].
+  //   timecode:     Absolute (not relative to cluster) timestamp of the
+  //                 frame, expressed in timecode units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrame(const uint8* data, uint64 length, uint64 track_number,
+                uint64 timecode,  // timecode units (absolute)
+                bool is_key);
+
+  // Adds a frame to be output in the file. The frame is written out through
+  // |writer_| if successful. Returns true on success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   additional: Pointer to the additional data
+  //   additional_length: Length of the additional data
+  //   add_id: Value of BlockAddID element
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.  The range of allowed values is [1, 126].
+  //   abs_timecode: Absolute (not relative to cluster) timestamp of the
+  //                 frame, expressed in timecode units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrameWithAdditional(const uint8* data, uint64 length,
+                              const uint8* additional, uint64 additional_length,
+                              uint64 add_id, uint64 track_number,
+                              uint64 abs_timecode, bool is_key);
+
+  // Adds a frame to be output in the file. The frame is written out through
+  // |writer_| if successful. Returns true on success.
+  // Inputs:
+  //   data: Pointer to the data.
+  //   length: Length of the data.
+  //   discard_padding: DiscardPadding element value.
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.  The range of allowed values is [1, 126].
+  //   abs_timecode: Absolute (not relative to cluster) timestamp of the
+  //                 frame, expressed in timecode units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrameWithDiscardPadding(const uint8* data, uint64 length,
+                                  int64 discard_padding, uint64 track_number,
+                                  uint64 abs_timecode, bool is_key);
+
+  // Writes a frame of metadata to the output medium; returns true on
+  // success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.  The range of allowed values is [1, 126].
+  //   timecode:     Absolute (not relative to cluster) timestamp of the
+  //                 metadata frame, expressed in timecode units.
+  //   duration:     Duration of metadata frame, in timecode units.
+  //
+  // The metadata frame is written as a block group, with a duration
+  // sub-element but no reference time sub-elements (indicating that
+  // it is considered a keyframe, per Matroska semantics).
+  bool AddMetadata(const uint8* data, uint64 length, uint64 track_number,
+                   uint64 timecode, uint64 duration);
+
+  // Increments the size of the cluster's data in bytes.
+  void AddPayloadSize(uint64 size);
+
+  // Closes the cluster so no more data can be written to it. Will update the
+  // cluster's size if |writer_| is seekable. Returns true on success.
+  bool Finalize();
+
+  // Returns the size in bytes for the entire Cluster element.
+  uint64 Size() const;
+
+  // Given |abs_timecode|, calculates timecode relative to most recent timecode.
+  // Returns -1 on failure, or a relative timecode.
+  int64 GetRelativeTimecode(int64 abs_timecode) const;
+
+  int64 size_position() const { return size_position_; }
+  int32 blocks_added() const { return blocks_added_; }
+  uint64 payload_size() const { return payload_size_; }
+  int64 position_for_cues() const { return position_for_cues_; }
+  uint64 timecode() const { return timecode_; }
+  uint64 timecode_scale() const { return timecode_scale_; }
+
+ private:
+  // Utility method that confirms that blocks can still be added, and that the
+  // cluster header has been written. Used by |DoWriteFrame*|. Returns true
+  // when successful.
+  bool PreWriteBlock();
+
+  // Utility method used by the |DoWriteFrame*| methods that handles the book
+  // keeping required after each block is written.
+  void PostWriteBlock(uint64 element_size);
+
+  // Does some verification and calls WriteFrame.
+  bool DoWriteFrame(const Frame* const frame);
+
+  // Outputs the Cluster header to |writer_|. Returns true on success.
+  bool WriteClusterHeader();
+
+  // Number of blocks added to the cluster.
+  int32 blocks_added_;
+
+  // Flag telling if the cluster has been closed.
+  bool finalized_;
+
+  // Flag telling if the cluster's header has been written.
+  bool header_written_;
+
+  // The size of the cluster elements in bytes.
+  uint64 payload_size_;
+
+  // The file position used for cue points.
+  const int64 position_for_cues_;
+
+  // The file position of the cluster's size element.
+  int64 size_position_;
+
+  // The absolute timecode of the cluster.
+  const uint64 timecode_;
+
+  // The timecode scale of the Segment containing the cluster.
+  const uint64 timecode_scale_;
+
+  // Pointer to the writer object. Not owned by this class.
+  IMkvWriter* writer_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Cluster);
+};
+
+///////////////////////////////////////////////////////////////
+// SeekHead element
+class SeekHead {
+ public:
+  SeekHead();
+  ~SeekHead();
+
+  // TODO(fgalligan): Change this to reserve a certain size. Then check how
+  // big the seek entry to be added is as not every seek entry will be the
+  // maximum size it could be.
+  // Adds a seek entry to be written out when the element is finalized. |id|
+  // must be the coded mkv element id. |pos| is the file position of the
+  // element. Returns true on success.
+  bool AddSeekEntry(uint32 id, uint64 pos);
+
+  // Writes out SeekHead and SeekEntry elements. Returns true on success.
+  bool Finalize(IMkvWriter* writer) const;
+
+  // Returns the id of the Seek Entry at |index|. Documented to return -1 if
+  // |index| is out of range; note that the return type is unsigned.
+  uint32 GetId(int index) const;
+
+  // Returns the position of the Seek Entry at |index|. Documented to return -1
+  // if |index| is out of range; note that the return type is unsigned.
+  uint64 GetPosition(int index) const;
+
+  // Sets the Seek Entry id and position at the given index.
+  // Returns true on success.
+  bool SetSeekEntry(int index, uint32 id, uint64 position);
+
+  // Reserves space by writing out a Void element which will be updated with
+  // a SeekHead element later. Returns true on success.
+  bool Write(IMkvWriter* writer);
+
+  // Cap on the number of Seek Entries; it sizes the two entry arrays below.
+  const static int32 kSeekEntryCount = 5;
+
+ private:
+  // Returns the maximum size in bytes of one seek entry.
+  uint64 MaxEntrySize() const;
+
+  // Seek entry id element list, holding up to kSeekEntryCount ids.
+  uint32 seek_entry_id_[kSeekEntryCount];
+
+  // Seek entry pos element list, holding up to kSeekEntryCount positions.
+  uint64 seek_entry_pos_[kSeekEntryCount];
+
+  // The file position of the SeekHead element.
+  int64 start_pos_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(SeekHead);
+};
+
+///////////////////////////////////////////////////////////////
+// Segment Information element
+class SegmentInfo {
+ public:
+  SegmentInfo();
+  ~SegmentInfo();
+
+  // Will update the duration if |duration_| is > 0.0. Returns true on success.
+  bool Finalize(IMkvWriter* writer) const;
+
+  // Sets |muxing_app_| and |writing_app_|.
+  bool Init();
+
+  // Outputs the Segment Information element to the writer. Returns true on
+  // success.
+  bool Write(IMkvWriter* writer);
+
+  void set_duration(double duration) { duration_ = duration; }
+  double duration() const { return duration_; }
+  void set_muxing_app(const char* app);
+  const char* muxing_app() const { return muxing_app_; }
+  void set_timecode_scale(uint64 scale) { timecode_scale_ = scale; }
+  uint64 timecode_scale() const { return timecode_scale_; }
+  void set_writing_app(const char* app);
+  const char* writing_app() const { return writing_app_; }
+  void set_date_utc(int64 date_utc) { date_utc_ = date_utc; }
+  int64 date_utc() const { return date_utc_; }
+
+ private:
+  // Segment Information element values.
+  // Duration: initially set to -1 to signify that a duration has not been set
+  // and should not be written out.
+  double duration_;
+  // Set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
+  char* muxing_app_;
+  uint64 timecode_scale_;
+  // Initially set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
+  char* writing_app_;
+  // LLONG_MIN when DateUTC is not set.
+  int64 date_utc_;
+
+  // The file position of the duration element.
+  int64 duration_pos_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(SegmentInfo);
+};
+
+///////////////////////////////////////////////////////////////
+// This class represents the main segment in a WebM file. Currently only
+// supports one Segment element.
+//
+// Notes:
+//  |Init| must be called before any other method in this class.
+class Segment {
+ public:
+  enum Mode { kLive = 0x1, kFile = 0x2 };
+
+  enum CuesPosition {
+    kAfterClusters = 0x0,  // Position Cues after Clusters - Default
+    kBeforeClusters = 0x1  // Position Cues before Clusters
+  };
+
+  const static uint32 kDefaultDocTypeVersion = 2;
+  const static uint64 kDefaultMaxClusterDuration = 30000000000ULL;
+
+  Segment();
+  ~Segment();
+
+  // Initializes |SegmentInfo| and returns result. Always returns false when
+  // |ptr_writer| is NULL.
+  bool Init(IMkvWriter* ptr_writer);
+
+  // Adds a generic track to the segment.  Returns the newly-allocated
+  // track object (which is owned by the segment) on success, NULL on
+  // error. |number| is the number to use for the track.  |number|
+  // must be >= 0. If |number| == 0 then the muxer will decide on the
+  // track number.
+  Track* AddTrack(int32 number);
+
+  // Adds a Vorbis audio track to the segment. Returns the number of the track
+  // on success, 0 on error. |number| is the number to use for the audio track.
+  // |number| must be >= 0. If |number| == 0 then the muxer will decide on
+  // the track number.
+  uint64 AddAudioTrack(int32 sample_rate, int32 channels, int32 number);
+
+  // Adds an empty chapter to the chapters of this segment.  Returns
+  // non-NULL on success.  After adding the chapter, the caller should
+  // populate its fields via the Chapter member functions.
+  Chapter* AddChapter();
+
+  // Adds an empty tag to the tags of this segment.  Returns
+  // non-NULL on success.  After adding the tag, the caller should
+  // populate its fields via the Tag member functions.
+  Tag* AddTag();
+
+  // Adds a cue point to the Cues element. |timestamp| is the time in
+  // nanoseconds of the cue's time. |track| is the Track of the Cue. This
+  // function must be called after AddFrame to calculate the correct
+  // BlockNumber for the CuePoint. Returns true on success.
+  bool AddCuePoint(uint64 timestamp, uint64 track);
+
+  // Adds a frame to be output in the file. Returns true on success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.
+  //   timestamp_ns: Timestamp of the frame in nanoseconds from 0.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrame(const uint8* data, uint64 length, uint64 track_number,
+                uint64 timestamp_ns, bool is_key);
+
+  // Writes a frame of metadata to the output medium; returns true on
+  // success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.
+  //   timestamp_ns: Absolute timestamp of the metadata frame, expressed
+  //                 in nanosecond units.
+  //   duration_ns:  Duration of metadata frame, in nanosecond units.
+  //
+  // The metadata frame is written as a block group, with a duration
+  // sub-element but no reference time sub-elements (indicating that
+  // it is considered a keyframe, per Matroska semantics).
+  bool AddMetadata(const uint8* data, uint64 length, uint64 track_number,
+                   uint64 timestamp_ns, uint64 duration_ns);
+
+  // Writes a frame with additional data to the output medium; returns true on
+  // success.
+  // Inputs:
+  //   data: Pointer to the data.
+  //   length: Length of the data.
+  //   additional: Pointer to additional data.
+  //   additional_length: Length of additional data.
+  //   add_id: Additional ID which identifies the type of additional data.
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.
+  //   timestamp:    Absolute timestamp of the frame, expressed in nanosecond
+  //                 units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrameWithAdditional(const uint8* data, uint64 length,
+                              const uint8* additional, uint64 additional_length,
+                              uint64 add_id, uint64 track_number,
+                              uint64 timestamp, bool is_key);
+
+  // Writes a frame with DiscardPadding to the output medium; returns true on
+  // success.
+  // Inputs:
+  //   data: Pointer to the data.
+  //   length: Length of the data.
+  //   discard_padding: DiscardPadding element value.
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.
+  //   timestamp:    Absolute timestamp of the frame, expressed in nanosecond
+  //                 units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrameWithDiscardPadding(const uint8* data, uint64 length,
+                                  int64 discard_padding, uint64 track_number,
+                                  uint64 timestamp, bool is_key);
+
+  // Writes a Frame to the output medium. Chooses the correct way of writing
+  // the frame (Block vs SimpleBlock) based on the parameters passed.
+  // Inputs:
+  //   frame: frame object
+  bool AddGenericFrame(const Frame* frame);
+
+  // Adds a VP8 video track to the segment. Returns the number of the track on
+  // success, 0 on error. |number| is the number to use for the video track.
+  // |number| must be >= 0. If |number| == 0 then the muxer will decide on
+  // the track number.
+  uint64 AddVideoTrack(int32 width, int32 height, int32 number);
+
+  // This function must be called after Finalize() if you need a copy of the
+  // output with Cues written before the Clusters. It will return false if the
+  // writer is not seekable or if chunking is set to true.
+  // Input parameters:
+  // reader - an IMkvReader object created with the same underlying file of the
+  //          current writer object. Make sure to close the existing writer
+  //          object before creating this so that all the data is properly
+  //          flushed and available for reading.
+  // writer - an IMkvWriter object pointing to a *different* file than the one
+  //          pointed by the current writer object. This file will contain the
+  //          Cues element before the Clusters.
+  bool CopyAndMoveCuesBeforeClusters(mkvparser::IMkvReader* reader,
+                                     IMkvWriter* writer);
+
+  // Sets which track to use for the Cues element. Must have added the track
+  // before calling this function. Returns true on success. |track_number| is
+  // returned by the Add track functions.
+  bool CuesTrack(uint64 track_number);
+
+  // This will force the muxer to create a new Cluster when the next frame is
+  // added.
+  void ForceNewClusterOnNextFrame();
+
+  // Writes out any frames that have not been written out. Finalizes the last
+  // cluster. May update the size and duration of the segment. May output the
+  // Cues element. May finalize the SeekHead element. Returns true on success.
+  bool Finalize();
+
+  // Returns the Cues object.
+  Cues* GetCues() { return &cues_; }
+
+  // Returns the Segment Information object.
+  const SegmentInfo* GetSegmentInfo() const { return &segment_info_; }
+  SegmentInfo* GetSegmentInfo() { return &segment_info_; }
+
+  // Search the Tracks and return the track that matches |track_number|.
+  // Returns NULL if there is no track match.
+  Track* GetTrackByNumber(uint64 track_number) const;
+
+  // Toggles whether to output a cues element.
+  void OutputCues(bool output_cues);
+
+  // Sets if the muxer will output files in chunks or not. |chunking| is a
+  // flag telling whether or not to turn on chunking. |filename| is the base
+  // filename for the chunk files. The header chunk file will be named
+  // |filename|.hdr and the data chunks will be named
+  // |filename|_XXXXXX.chk. Chunking implies that the muxer will be writing
+  // to files so the muxer will use the default MkvWriter class to control
+  // what data is written to what files. Returns true on success.
+  // TODO: Should we change the IMkvWriter Interface to add Open and Close?
+  // That will force the interface to be dependent on files.
+  bool SetChunking(bool chunking, const char* filename);
+
+  bool chunking() const { return chunking_; }
+  uint64 cues_track() const { return cues_track_; }
+  void set_max_cluster_duration(uint64 max_cluster_duration) {
+    max_cluster_duration_ = max_cluster_duration;
+  }
+  uint64 max_cluster_duration() const { return max_cluster_duration_; }
+  void set_max_cluster_size(uint64 max_cluster_size) {
+    max_cluster_size_ = max_cluster_size;
+  }
+  uint64 max_cluster_size() const { return max_cluster_size_; }
+  void set_mode(Mode mode) { mode_ = mode; }
+  Mode mode() const { return mode_; }
+  CuesPosition cues_position() const { return cues_position_; }
+  bool output_cues() const { return output_cues_; }
+  const SegmentInfo* segment_info() const { return &segment_info_; }
+
+ private:
+  // Checks if header information has been output and initialized. If not it
+  // will output the Segment element and initialize the SeekHead element and
+  // Cues elements.
+  bool CheckHeaderInfo();
+
+  // Sets |doc_type_version_| based on the current element requirements.
+  void UpdateDocTypeVersion();
+
+  // Sets |name| according to how many chunks have been written. |ext| is the
+  // file extension. |name| must be deleted by the calling app. Returns true
+  // on success.
+  bool UpdateChunkName(const char* ext, char** name) const;
+
+  // Returns the maximum offset within the segment's payload. When chunking
+  // this function is needed to determine offsets of elements within the
+  // chunked files. Returns -1 on error.
+  int64 MaxOffset();
+
+  // Adds the frame to our frame array.
+  bool QueueFrame(Frame* frame);
+
+  // Output all frames that are queued. Returns -1 on error, otherwise
+  // it returns the number of frames written.
+  int WriteFramesAll();
+
+  // Output all frames that are queued that have an end time that is less
+  // than |timestamp|. Returns true on success and if there are no frames
+  // queued.
+  bool WriteFramesLessThan(uint64 timestamp);
+
+  // Outputs the segment header, Segment Information element, SeekHead element,
+  // and Tracks element to |writer_|.
+  bool WriteSegmentHeader();
+
+  // Given a frame with the specified timestamp (nanosecond units) and
+  // keyframe status, determine whether a new cluster should be
+  // created, before writing enqueued frames and the frame itself. The
+  // function returns one of the following values:
+  //  -1 = error: an out-of-order frame was detected
+  //  0 = do not create a new cluster, and write frame to the existing cluster
+  //  1 = create a new cluster, and write frame to that new cluster
+  //  2 = create a new cluster, and re-run test
+  int TestFrame(uint64 track_num, uint64 timestamp_ns, bool key) const;
+
+  // Create a new cluster, using the earlier of the first enqueued
+  // frame, or the indicated time. Returns true on success.
+  bool MakeNewCluster(uint64 timestamp_ns);
+
+  // Checks whether a new cluster needs to be created, and if so
+  // creates a new cluster. Returns false if creation of a new cluster
+  // was necessary but creation was not successful.
+  bool DoNewClusterProcessing(uint64 track_num, uint64 timestamp_ns, bool key);
+
+  // Adjusts Cue Point values (to place Cues before Clusters) so that they
+  // reflect the correct offsets.
+  void MoveCuesBeforeClusters();
+
+  // This function recursively computes the correct cluster offsets (this is
+  // done to move the Cues before Clusters). It recursively updates the change
+  // in size (which indicates a change in cluster offset) until no sizes change.
+  // Parameters:
+  // diff - indicates the difference in size of the Cues element that needs to
+  //        be accounted for.
+  // index - index in the list of Cues which is currently being adjusted.
+  // cue_size - sum of size of all the CuePoint elements.
+  void MoveCuesBeforeClustersHelper(uint64 diff, int index, uint64* cue_size);
+
+  // Seeds the random number generator used to make UIDs.
+  unsigned int seed_;
+
+  // WebM elements
+  Cues cues_;
+  SeekHead seek_head_;
+  SegmentInfo segment_info_;
+  Tracks tracks_;
+  Chapters chapters_;
+  Tags tags_;
+
+  // Number of chunks written.
+  int chunk_count_;
+
+  // Current chunk filename.
+  char* chunk_name_;
+
+  // Default MkvWriter object created by this class used for writing clusters
+  // out in separate files.
+  MkvWriter* chunk_writer_cluster_;
+
+  // Default MkvWriter object created by this class used for writing Cues
+  // element out to a file.
+  MkvWriter* chunk_writer_cues_;
+
+  // Default MkvWriter object created by this class used for writing the
+  // Matroska header out to a file.
+  MkvWriter* chunk_writer_header_;
+
+  // Flag telling whether or not the muxer is chunking output to multiple
+  // files.
+  bool chunking_;
+
+  // Base filename for the chunked files.
+  char* chunking_base_name_;
+
+  // File position offset where the Clusters end.
+  int64 cluster_end_offset_;
+
+  // List of clusters.
+  Cluster** cluster_list_;
+
+  // Number of cluster pointers allocated in the cluster list.
+  int32 cluster_list_capacity_;
+
+  // Number of clusters in the cluster list.
+  int32 cluster_list_size_;
+
+  // Indicates whether Cues should be written before or after Clusters
+  CuesPosition cues_position_;
+
+  // Track number that is associated with the cues element for this segment.
+  uint64 cues_track_;
+
+  // Tells the muxer to force a new cluster on the next Block.
+  bool force_new_cluster_;
+
+  // List of stored audio frames. These variables are used to store frames so
+  // the muxer can follow the guideline "Audio blocks that contain the video
+  // key frame's timecode should be in the same cluster as the video key frame
+  // block."
+  Frame** frames_;
+
+  // Number of frame pointers allocated in the frame list.
+  int32 frames_capacity_;
+
+  // Number of frames in the frame list.
+  int32 frames_size_;
+
+  // Flag telling if a video track has been added to the segment.
+  bool has_video_;
+
+  // Flag telling if the segment's header has been written.
+  bool header_written_;
+
+  // Duration of the last block in nanoseconds.
+  uint64 last_block_duration_;
+
+  // Last timestamp in nanoseconds added to a cluster.
+  uint64 last_timestamp_;
+
+  // Last timestamp in nanoseconds by track number added to a cluster.
+  uint64 last_track_timestamp_[kMaxTrackNumber];
+
+  // Maximum time in nanoseconds for a cluster duration. This variable is a
+  // guideline and some clusters may have a longer duration. Default is 30
+  // seconds.
+  uint64 max_cluster_duration_;
+
+  // Maximum size in bytes for a cluster. This variable is a guideline and
+  // some clusters may have a larger size. Default is 0 which signifies that
+  // the muxer will decide the size.
+  uint64 max_cluster_size_;
+
+  // The mode that segment is in. If set to |kLive| the writer must not
+  // seek backwards.
+  Mode mode_;
+
+  // Flag telling the muxer that a new cue point should be added.
+  bool new_cuepoint_;
+
+  // TODO(fgalligan): Should we add support for more than one Cues element?
+  // Flag whether or not the muxer should output a Cues element.
+  bool output_cues_;
+
+  // The size of the EBML header, used to validate the header if
+  // WriteEbmlHeader() is called more than once.
+  int32 ebml_header_size_;
+
+  // The file position of the segment's payload.
+  int64 payload_pos_;
+
+  // The file position of the element's size.
+  int64 size_position_;
+
+  // Current DocTypeVersion (|doc_type_version_|) and that written in
+  // WriteSegmentHeader().
+  // WriteEbmlHeader() will be called from Finalize() if |doc_type_version_|
+  // differs from |doc_type_version_written_|.
+  uint32 doc_type_version_;
+  uint32 doc_type_version_written_;
+
+  // Pointer to the writer objects. Not owned by this class.
+  IMkvWriter* writer_cluster_;
+  IMkvWriter* writer_cues_;
+  IMkvWriter* writer_header_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Segment);
+};
+
+}  // end namespace mkvmuxer
+
+#endif  // MKVMUXER_HPP
diff --git a/libs/libvpx/third_party/libwebm/mkvmuxertypes.hpp b/libs/libvpx/third_party/libwebm/mkvmuxertypes.hpp
new file mode 100644
index 0000000000..d0fc9fec88
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/mkvmuxertypes.hpp
@@ -0,0 +1,30 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXERTYPES_HPP
+#define MKVMUXERTYPES_HPP
+
+// Copied from Chromium basictypes.h.
+// A macro to disallow the copy constructor and operator= functions by
+// declaring them; it should be used in the private: section of a class.
+#define LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&);                       \
+  void operator=(const TypeName&)
+
+namespace mkvmuxer {
+
+typedef unsigned char uint8;
+typedef short int16;                // assumes a 16-bit short - TODO confirm
+typedef int int32;                  // assumes a 32-bit int - TODO confirm
+typedef unsigned int uint32;        // assumes a 32-bit int - TODO confirm
+typedef long long int64;            // long long is at least 64 bits wide
+typedef unsigned long long uint64;  // unsigned counterpart of int64
+
+}  // end namespace mkvmuxer
+
+#endif  // MKVMUXERTYPES_HPP
diff --git a/libs/libvpx/third_party/libwebm/mkvmuxerutil.cpp b/libs/libvpx/third_party/libwebm/mkvmuxerutil.cpp
new file mode 100644
index 0000000000..27ab15d51f
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/mkvmuxerutil.cpp
@@ -0,0 +1,629 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxerutil.hpp"
+
+#ifdef __ANDROID__
+#include <fcntl.h>
+#endif
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <new>
+
+#include "mkvwriter.hpp"
+#include "webmids.hpp"
+
+#ifdef _MSC_VER
+// Disable MSVC warnings that suggest making code non-portable.
+#pragma warning(disable : 4996)
+#endif
+
+namespace mkvmuxer {
+
+namespace {
+
+// Date elements are always 8 octets in size.
+const int kDateElementSize = 8;
+
+// Writes |frame| to |writer| as a BlockGroup: a Block followed by the optional
+// BlockAdditions, DiscardPadding, ReferenceBlock and BlockDuration elements.
+// Returns the size in bytes of the whole BlockGroup element, or 0 on failure.
+uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode,
+                  uint64 timecode_scale) {
+  uint64 block_additional_elem_size = 0;
+  uint64 block_addid_elem_size = 0;
+  uint64 block_more_payload_size = 0;
+  uint64 block_more_elem_size = 0;
+  uint64 block_additions_payload_size = 0;
+  uint64 block_additions_elem_size = 0;
+  if (frame->additional()) {
+    block_additional_elem_size = EbmlElementSize(
+        kMkvBlockAdditional, frame->additional(), frame->additional_length());
+    block_addid_elem_size = EbmlElementSize(kMkvBlockAddID, frame->add_id());
+
+    block_more_payload_size =
+        block_addid_elem_size + block_additional_elem_size;
+    block_more_elem_size =
+        EbmlMasterElementSize(kMkvBlockMore, block_more_payload_size) +
+        block_more_payload_size;
+    block_additions_payload_size = block_more_elem_size;
+    block_additions_elem_size =
+        EbmlMasterElementSize(kMkvBlockAdditions,
+                              block_additions_payload_size) +
+        block_additions_payload_size;
+  }
+
+  uint64 discard_padding_elem_size = 0;
+  if (frame->discard_padding() != 0) {
+    discard_padding_elem_size =
+        EbmlElementSize(kMkvDiscardPadding, frame->discard_padding());
+  }
+
+  const uint64 reference_block_timestamp =
+      frame->reference_block_timestamp() / timecode_scale;
+  uint64 reference_block_elem_size = 0;
+  if (!frame->is_key()) {
+    reference_block_elem_size =
+        EbmlElementSize(kMkvReferenceBlock, reference_block_timestamp);
+  }
+
+  const uint64 duration = frame->duration() / timecode_scale;
+  uint64 block_duration_elem_size = 0;
+  if (duration > 0)
+    block_duration_elem_size = EbmlElementSize(kMkvBlockDuration, duration);
+
+  const uint64 block_payload_size = 4 + frame->length();
+  const uint64 block_elem_size =
+      EbmlMasterElementSize(kMkvBlock, block_payload_size) + block_payload_size;
+
+  const uint64 block_group_payload_size =
+      block_elem_size + block_additions_elem_size + block_duration_elem_size +
+      discard_padding_elem_size + reference_block_elem_size;
+
+  if (!WriteEbmlMasterElement(writer, kMkvBlockGroup,
+                              block_group_payload_size)) {
+    return 0;
+  }
+
+  if (!WriteEbmlMasterElement(writer, kMkvBlock, block_payload_size))
+    return 0;
+
+  // The writers below return 0 on success, so a non-zero result is a failure.
+  if (WriteUInt(writer, frame->track_number()))
+    return 0;
+  if (SerializeInt(writer, timecode, 2))
+    return 0;
+  // For a Block, flags is always 0.
+  if (SerializeInt(writer, 0, 1))
+    return 0;
+  if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
+    return 0;
+
+  if (frame->additional()) {
+    if (!WriteEbmlMasterElement(writer, kMkvBlockAdditions,
+                                block_additions_payload_size)) {
+      return 0;
+    }
+
+    if (!WriteEbmlMasterElement(writer, kMkvBlockMore, block_more_payload_size))
+      return 0;
+
+    if (!WriteEbmlElement(writer, kMkvBlockAddID, frame->add_id()))
+      return 0;
+
+    if (!WriteEbmlElement(writer, kMkvBlockAdditional, frame->additional(),
+                          frame->additional_length())) {
+      return 0;
+    }
+  }
+
+  if (frame->discard_padding() != 0 &&
+      !WriteEbmlElement(writer, kMkvDiscardPadding, frame->discard_padding())) {
+    return 0;
+  }
+
+  if (!frame->is_key() &&
+      !WriteEbmlElement(writer, kMkvReferenceBlock,
+                        reference_block_timestamp)) {
+    return 0;
+  }
+
+  if (duration > 0 && !WriteEbmlElement(writer, kMkvBlockDuration, duration))
+    return 0;
+  return EbmlMasterElementSize(kMkvBlockGroup, block_group_payload_size) +
+         block_group_payload_size;
+}
+
+// Emits |frame| as a SimpleBlock; yields the element size, or 0 on failure.
+uint64 WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame,
+                        int64 timecode) {
+  if (WriteID(writer, kMkvSimpleBlock) != 0)
+    return 0;
+
+  // Frame data plus 4 bytes for the fields written ahead of it.
+  const int32 size = static_cast<int32>(frame->length()) + 4;
+  if (WriteUInt(writer, size) != 0)
+    return 0;
+  if (WriteUInt(writer, static_cast<uint64>(frame->track_number())) != 0)
+    return 0;
+  if (SerializeInt(writer, timecode, 2) != 0)
+    return 0;
+
+  // Bit 0x80 of the single flags byte is set for key frames only.
+  const uint64 key_flag = frame->is_key() ? 0x80 : 0;
+  if (SerializeInt(writer, key_flag, 1) != 0)
+    return 0;
+
+  const uint32 payload_length = static_cast<uint32>(frame->length());
+  if (writer->Write(frame->frame(), payload_length) != 0)
+    return 0;
+
+  const int32 header_size =
+      GetUIntSize(kMkvSimpleBlock) + GetCodedUIntSize(size) + 4;
+  return header_size + frame->length();
+}
+
+}  // namespace
+
+// Returns the number of bytes (1 to 8) used to store |value| as a coded
+// unsigned integer. A width of n bytes is chosen for values below
+// 2^(7n) - 1, for n from 1 to 7; every larger value is reported as needing
+// 8 bytes.
+int32 GetCodedUIntSize(uint64 value) {
+  const int32 kMaxCodedSize = 8;
+
+  for (int32 width = 1; width < kMaxCodedSize; ++width) {
+    // Exclusive upper bound for a |width|-byte coded integer.
+    const uint64 limit = (1ULL << (7 * width)) - 1;
+
+    if (value < limit)
+      return width;
+  }
+
+  return kMaxCodedSize;
+}
+
+// Returns the minimum number of bytes (1 to 8) needed to hold |value| as a
+// plain unsigned integer.
+int32 GetUIntSize(uint64 value) {
+  const int32 kMaxSize = 8;
+  int32 byte_count = 1;
+
+  // Keep widening while bits remain above the low |byte_count| bytes. The
+  // bound on |byte_count| keeps the shift below 64 bits.
+  while (byte_count < kMaxSize) {
+    const uint64 high_bits = value >> (8 * byte_count);
+    if (high_bits == 0)
+      break;
+    ++byte_count;
+  }
+
+  return byte_count;
+}
+
+int32 GetIntSize(int64 value) {
+  // Negative inputs are bit-inverted first; the result is then doubled so
+  // that one extra bit is reserved for the sign when sizing the stored value.
+  const uint64 magnitude = (value >= 0) ? value : ~value;
+  return GetUIntSize(magnitude * 2);
+}
+
+// Size in bytes of a master element's header: the EBML ID of |type| plus the
+// coded integer that stores the payload size |value|.
+uint64 EbmlMasterElementSize(uint64 type, uint64 value) {
+  const int32 id_size = GetUIntSize(type);
+  const int32 size_field_size = GetCodedUIntSize(value);
+
+  // Summed as int32, then widened to the uint64 return type.
+  return id_size + size_field_size;
+}
+
+// Total size in bytes of a signed-integer element: EBML ID, a one-byte size
+// field, and the minimal number of bytes needed to store |value|.
+uint64 EbmlElementSize(uint64 type, int64 value) {
+  const int32 id_size = GetUIntSize(type);
+  const int32 payload_size = GetIntSize(value);
+
+  // The payload is at most 8 bytes, so a single byte is budgeted for the
+  // size field.
+  const int32 size_field_size = 1;
+
+  return id_size + payload_size + size_field_size;
+}
+
+// Total size in bytes of an unsigned-integer element: EBML ID, a one-byte
+// size field, and the fewest bytes that can represent |value| (1 to 8, as
+// reported by GetUIntSize()).
+uint64 EbmlElementSize(uint64 type, uint64 value) {
+  // Element layout: [ID][size byte][payload].
+  const int32 id_size = GetUIntSize(type);
+  const int32 size_field_size = 1;
+  const int32 payload_size = GetUIntSize(value);
+
+  // Summed as int32, then widened to the uint64 return type.
+  return id_size + size_field_size + payload_size;
+}
+
+// Total size in bytes of a float element: EBML ID, a one-byte size field and
+// sizeof(float) bytes of payload. The float's value does not affect the
+// result, which is why the parameter is unnamed.
+uint64 EbmlElementSize(uint64 type, float /* value */) {
+  const uint64 id_size = GetUIntSize(type);
+  const uint64 payload_size = sizeof(float);
+
+  // A single byte is budgeted for the payload length.
+  const uint64 size_field_size = 1;
+
+  return id_size + payload_size + size_field_size;
+}
+
+// Total size in bytes of a string element: EBML ID, a one-byte size field and
+// the characters of |value| without the terminating NUL. Returns 0 when
+// |value| is NULL.
+uint64 EbmlElementSize(uint64 type, const char* value) {
+  if (value == NULL)
+    return 0;
+
+  const uint64 id_size = GetUIntSize(type);
+  const uint64 payload_size = strlen(value);
+
+  // One byte is budgeted for the size field regardless of the string length.
+  const uint64 size_field_size = 1;
+
+  return id_size + payload_size + size_field_size;
+}
+
+// Total size in bytes of a binary element: EBML ID, the coded integer that
+// stores |size|, and |size| bytes of payload. Returns 0 when |value| is
+// NULL.
+uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size) {
+  if (value == NULL)
+    return 0;
+
+  const uint64 id_size = GetUIntSize(type);
+
+  // Unlike the fixed-width overloads, the size field here is sized from
+  // |size| itself.
+  const uint64 size_field_size = GetCodedUIntSize(size);
+
+  return id_size + size + size_field_size;
+}
+
+// Total size in bytes of a date element: EBML ID, a one-byte size field and
+// the fixed kDateElementSize bytes of payload. Only the ID contribution
+// depends on |type|.
+uint64 EbmlDateElementSize(uint64 type) {
+  const uint64 id_size = GetUIntSize(type);
+
+  // Dates have a constant payload width.
+  const uint64 payload_size = kDateElementSize;
+  const uint64 size_field_size = 1;
+
+  return id_size + payload_size + size_field_size;
+}
+
+// Writes the low |size| bytes of |value| to |writer|, most significant byte
+// first. Returns 0 on success, -1 on bad arguments, or the negative status
+// reported by the writer.
+int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size) {
+  const bool size_ok = (size >= 1) && (size <= 8);
+  if (writer == NULL || !size_ok)
+    return -1;
+
+  // Walk from the highest requested byte down to byte 0.
+  for (int32 shift = (size - 1) * 8; shift >= 0; shift -= 8) {
+    const uint8 octet = static_cast<uint8>(value >> shift);
+    const int32 result = writer->Write(&octet, 1);
+
+    if (result < 0)
+      return result;
+  }
+
+  return 0;
+}
+
+int32 SerializeFloat(IMkvWriter* writer, float f) {
+  // Without a writer there is nowhere to send the bytes.
+  if (!writer)
+    return -1;
+
+  assert(sizeof(uint32) == sizeof(float));
+  // Reinterpret the float's bits as a uint32 through a union; casting
+  // float& to uint32& instead would violate strict aliasing.
+  union FloatBits {
+    uint32 bits;
+    float real;
+  } pun;
+  pun.real = f;
+
+  // Emit the four bytes of the bit pattern, most significant byte first.
+  for (int32 shift = 24; shift >= 0; shift -= 8) {
+    const uint8 out = static_cast<uint8>(pun.bits >> shift);
+
+    const int32 status = writer->Write(&out, 1);
+
+    // Stop at the first failed write and hand its code to the caller.
+    if (status < 0)
+      return status;
+  }
+
+  return 0;
+}
+
+int32 WriteUInt(IMkvWriter* writer, uint64 value) {
+  if (!writer)
+    return -1;
+
+  // Use the width GetCodedUIntSize() picks for |value|, then let
+  // WriteUIntSize() add the length marker and emit the bytes.
+  return WriteUIntSize(writer, value, GetCodedUIntSize(value));
+}
+
+int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size) {
+  // |size| is the coded width in bytes; 0 asks for the smallest width.
+  if (!writer || size < 0 || size > 8)
+    return -1;
+
+  if (size > 0) {
+    // Length marker: the bit just above the 7 * |size| payload bits.
+    const uint64 bit = 1ULL << (size * 7);
+
+    if (value > (bit - 2))
+      return -1;
+
+    value |= bit;
+  } else {
+    uint64 bit = 0;
+
+    // Search for the smallest width that holds |value|. The search stops
+    // at 8 bytes so the shift below never reaches the type's bit width.
+    for (size = 1; size <= 8; ++size) {
+      bit = 1ULL << (size * 7);
+      if (value <= (bit - 2))
+        break;
+    }
+
+    // Fail with -1; the former "return false" was 0, which means success.
+    if (size > 8)
+      return -1;
+
+    value |= bit;
+  }
+
+  return SerializeInt(writer, value, size);
+}
+
+int32 WriteID(IMkvWriter* writer, uint64 type) {
+  if (!writer)
+    return -1;
+
+  // Report the element ID and current position to the writer before any
+  // of the ID's bytes are emitted.
+  writer->ElementStartNotify(type, writer->Position());
+
+  return SerializeInt(writer, type, GetUIntSize(type));
+}
+
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size) {
+  if (!writer)
+    return false;
+
+  // A master element header is the ID followed by the coded payload size.
+  // Both helpers return 0 on success; the size is attempted only if the ID
+  // went out.
+  const bool id_written = (WriteID(writer, type) == 0);
+  const bool header_written = id_written && (WriteUInt(writer, size) == 0);
+
+  return header_written;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value) {
+  if (!writer)
+    return false;
+
+  // Element layout: ID, coded payload size, then the payload bytes.
+  if (WriteID(writer, type) != 0)
+    return false;
+
+  // The payload width is whatever GetUIntSize() reports for |value|.
+  const uint64 payload_bytes = GetUIntSize(value);
+  if (WriteUInt(writer, payload_bytes) != 0)
+    return false;
+
+  const int32 width = static_cast<int32>(payload_bytes);
+  return SerializeInt(writer, value, width) == 0;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value) {
+  if (!writer)
+    return false;
+
+  // Element layout: ID, coded payload size, then the payload bytes.
+  if (WriteID(writer, type))
+    return false;
+
+  // The payload width is whatever GetIntSize() reports for |value|.
+  const uint64 size = GetIntSize(value);
+  if (WriteUInt(writer, size))
+    return false;
+
+  // SerializeInt() returns 0 on success.
+  return SerializeInt(writer, value, static_cast<int32>(size)) == 0;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value) {
+  if (!writer)
+    return false;
+
+  // Element layout: ID, coded payload size, then the payload bytes.
+  if (WriteID(writer, type) != 0)
+    return false;
+
+  // SerializeFloat() always emits four bytes, so the size is fixed.
+  const uint64 kFloatPayloadBytes = 4;
+  if (WriteUInt(writer, kFloatPayloadBytes) != 0)
+    return false;
+
+  return SerializeFloat(writer, value) == 0;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value) {
+  if (!writer || !value)
+    return false;
+
+  // Element layout: ID, coded payload size, then the payload bytes.
+  if (WriteID(writer, type) != 0)
+    return false;
+
+  // The payload is the string without its '\0' terminator.
+  const uint64 payload_bytes = strlen(value);
+  if (WriteUInt(writer, payload_bytes) != 0)
+    return false;
+
+  // Any non-zero status from the writer counts as failure.
+  return writer->Write(value, static_cast<uint32>(payload_bytes)) == 0;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
+                      uint64 size) {
+  // An empty or missing buffer is rejected rather than written.
+  if (!writer || !value || size < 1)
+    return false;
+
+  // Element layout: ID, coded payload size, then the payload bytes.
+  if (WriteID(writer, type) != 0)
+    return false;
+
+  if (WriteUInt(writer, size) != 0)
+    return false;
+
+  // Any non-zero status from the writer counts as failure.
+  return writer->Write(value, static_cast<uint32>(size)) == 0;
+}
+
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value) {
+  if (!writer)
+    return false;
+
+  // Element layout: ID, coded payload size, then the payload bytes.
+  if (WriteID(writer, type) != 0)
+    return false;
+
+  // Date payloads have the fixed width kDateElementSize.
+  if (WriteUInt(writer, kDateElementSize) != 0)
+    return false;
+
+  // SerializeInt() returns 0 on success.
+  return SerializeInt(writer, value, kDateElementSize) == 0;
+}
+
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+                  Cluster* cluster) {
+  if (!writer || !frame || !frame->IsValid() || !cluster ||
+      !cluster->timecode_scale())
+    return 0;
+
+  // Block timecodes are signed 16-bit values, so a block may technically
+  // precede its cluster. To keep things simple, only non-negative
+  // cluster-relative timecodes are accepted.
+  const int64 relative_timecode = cluster->GetRelativeTimecode(
+      frame->timestamp() / cluster->timecode_scale());
+  if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode)
+    return 0;
+
+  if (frame->CanBeSimpleBlock())
+    return WriteSimpleBlock(writer, frame, relative_timecode);
+
+  return WriteBlock(writer, frame, relative_timecode,
+                    cluster->timecode_scale());
+}
+
+uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
+  // Returns |size| on success, 0 on failure. Fewer than 2 bytes cannot hold
+  // an ID plus a size field and would wrap the subtraction below.
+  if (!writer || size < 2)
+    return 0;
+
+  // Subtract one for the void ID and the coded size.
+  uint64 void_entry_size = size - 1 - GetCodedUIntSize(size - 1);
+  uint64 void_size =
+      EbmlMasterElementSize(kMkvVoid, void_entry_size) + void_entry_size;
+
+  if (void_size != size)
+    return 0;
+
+  const int64 payload_position = writer->Position();
+  if (payload_position < 0)
+    return 0;
+
+  if (WriteID(writer, kMkvVoid) || WriteUInt(writer, void_entry_size))
+    return 0;
+
+  // Zero-fill the payload; a 64-bit counter avoids truncating the count.
+  const uint8 value = 0;
+  for (uint64 i = 0; i < void_entry_size; ++i) {
+    if (writer->Write(&value, 1))
+      return 0;
+  }
+
+  const int64 stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64>(void_size))
+    return 0;
+
+  return void_size;
+}
+
+void GetVersion(int32* major, int32* minor, int32* build, int32* revision) {
+  *major = 0;  // All four pointers are dereferenced without a NULL check.
+  *minor = 2;
+  *build = 1;
+  *revision = 0;  // Reported version: 0.2.1.0.
+}
+
+}  // namespace mkvmuxer
+
+mkvmuxer::uint64 mkvmuxer::MakeUID(unsigned int* seed) {
+  uint64 uid = 0;
+
+#ifdef __MINGW32__
+  srand(*seed);
+#endif
+  // Build the UID one byte per iteration, most significant byte first.
+  for (int i = 0; i < 7; ++i) {  // avoid problems with 8-byte values
+    uid <<= 8;
+
+// TODO(fgalligan): Move random number generation to platform specific code.
+#ifdef _MSC_VER
+    (void)seed;  // |seed| is not used in this branch.
+    const int32 nn = rand();
+#elif __ANDROID__
+    int32 temp_num = 1;  // value used when /dev/urandom cannot be opened
+    int fd = open("/dev/urandom", O_RDONLY);
+    if (fd != -1) {
+      read(fd, &temp_num, sizeof(int32));  // return value is not checked
+      close(fd);
+    }
+    const int32 nn = temp_num;
+#elif defined __MINGW32__
+    const int32 nn = rand();
+#else
+    const int32 nn = rand_r(seed);
+#endif
+    const int32 n = 0xFF & (nn >> 4);  // throw away low-order bits
+
+    uid |= n;
+  }
+
+  return uid;
+}
diff --git a/libs/libvpx/third_party/libwebm/mkvmuxerutil.hpp b/libs/libvpx/third_party/libwebm/mkvmuxerutil.hpp
new file mode 100644
index 0000000000..e318576942
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/mkvmuxerutil.hpp
@@ -0,0 +1,83 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXERUTIL_HPP
+#define MKVMUXERUTIL_HPP
+
+#include "mkvmuxer.hpp"
+#include "mkvmuxertypes.hpp"
+
+namespace mkvmuxer {
+
+class IMkvWriter;
+
+const uint64 kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL;
+const int64 kMaxBlockTimecode = 0x07FFFLL;
+
+// Writes out |value| in Big Endian order. Returns 0 on success.
+int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size);
+
+// Returns the size in bytes of the element.
+int32 GetUIntSize(uint64 value);
+int32 GetIntSize(int64 value);
+int32 GetCodedUIntSize(uint64 value);
+uint64 EbmlMasterElementSize(uint64 type, uint64 value);
+uint64 EbmlElementSize(uint64 type, int64 value);
+uint64 EbmlElementSize(uint64 type, uint64 value);
+uint64 EbmlElementSize(uint64 type, float value);
+uint64 EbmlElementSize(uint64 type, const char* value);
+uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size);
+uint64 EbmlDateElementSize(uint64 type);
+
+// Creates an EBML coded number from |value| and writes it out. The size of
+// the coded number is determined by the value of |value|. |value| must not
+// be in a coded form. Returns 0 on success.
+int32 WriteUInt(IMkvWriter* writer, uint64 value);
+
+// Creates an EBML coded number from |value| and writes it out. The size of
+// the coded number is determined by the value of |size|. |value| must not
+// be in a coded form. Returns 0 on success.
+int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size);
+
+// Writes a master element header: the ID |type|, then |size|. True on success.
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size);
+
+// Outputs an Mkv ID, calls |IMkvWriter::ElementStartNotify|, and passes the
+// ID to |SerializeInt|. Returns 0 on success.
+int32 WriteID(IMkvWriter* writer, uint64 type);
+
+// Output an Mkv non-master element. Returns true if the element was written.
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
+                      uint64 size);
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value);
+
+// Output a Mkv Frame. It decides the correct element to write (Block vs
+// SimpleBlock) based on the parameters of the Frame.
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+                  Cluster* cluster);
+
+// Output a void element. |size| must be the entire size in bytes that will be
+// void. The function will calculate the size of the void header and subtract
+// it from |size|.
+uint64 WriteVoidElement(IMkvWriter* writer, uint64 size);
+
+// Returns the version number of the muxer in |major|, |minor|, |build|,
+// and |revision|.
+void GetVersion(int32* major, int32* minor, int32* build, int32* revision);
+
+// Returns a random number to be used for UID, using |seed| to seed
+// the random-number generator (see POSIX rand_r() for semantics).
+uint64 MakeUID(unsigned int* seed);
+
+}  // end namespace mkvmuxer
+
+#endif  // MKVMUXERUTIL_HPP
diff --git a/libs/libvpx/third_party/libwebm/mkvparser.cpp b/libs/libvpx/third_party/libwebm/mkvparser.cpp
new file mode 100644
index 0000000000..f2855d5066
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/mkvparser.cpp
@@ -0,0 +1,7724 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvparser.hpp"
+
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#include <float.h>  // _isnan() / _finite()
+#define MSC_COMPAT
+#endif
+
+#include <cassert>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <new>
+
+#include "webmids.hpp"
+
+#ifdef _MSC_VER
+// Disable MSVC warnings that suggest making code non-portable.
+#pragma warning(disable : 4996)
+#endif
+
+namespace mkvparser {
+
+#ifdef MSC_COMPAT  // Defined above for MSVC with _MSC_VER < 1800.
+inline bool isnan(double val) { return !!_isnan(val); }
+inline bool isinf(double val) { return !_finite(val); }  // TODO confirm: _finite() is presumably false for NaN too
+#else  // Everyone else forwards to <cmath>.
+inline bool isnan(double val) { return std::isnan(val); }
+inline bool isinf(double val) { return std::isinf(val); }
+#endif  // MSC_COMPAT
+
+IMkvReader::~IMkvReader() {}
+
+template<typename Type> Type* SafeArrayAlloc(unsigned long long num_elements,
+                                             unsigned long long element_size) {
+  if (num_elements == 0 || element_size == 0)
+    return NULL;  // Nothing to allocate.
+
+  // Cap the request at 2GiB; dividing keeps the check free of overflow.
+  const size_t kMaxAllocSize = 0x80000000;  // 2GiB
+  if ((kMaxAllocSize / num_elements) < element_size)
+    return NULL;
+  // The array is sized by the byte total, which must also fit in size_t.
+  const unsigned long long total = num_elements * element_size;
+  if (total != static_cast<size_t>(total)) return NULL;
+  return new (std::nothrow) Type[static_cast<size_t>(total)];
+}
+
+void GetVersion(int& major, int& minor, int& build, int& revision) {
+  major = 1;  // Reports the parser version as 1.0.0.30.
+  minor = 0;
+  build = 0;
+  revision = 30;
+}
+
+long long ReadUInt(IMkvReader* pReader, long long pos, long& len) {
+  if (!pReader || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+  // On success |len| is the number of bytes the coded integer occupies.
+  len = 1;
+  unsigned char b;
+  int status = pReader->Read(pos, 1, &b);
+
+  if (status < 0)  // error or underflow
+    return status;
+
+  if (status > 0)  // interpreted as "underflow"
+    return E_BUFFER_NOT_FULL;
+
+  if (b == 0)  // we can't handle u-int values larger than 8 bytes
+    return E_FILE_FORMAT_INVALID;
+  // Each leading zero bit of the first byte adds one byte to |len|.
+  unsigned char m = 0x80;
+
+  while (!(b & m)) {
+    m >>= 1;
+    ++len;
+  }
+  // Drop the length-marker bit; the remaining bits start the value.
+  long long result = b & (~m);
+  ++pos;
+
+  for (int i = 1; i < len; ++i) {
+    status = pReader->Read(pos, 1, &b);
+
+    if (status < 0) {
+      len = 1;  // |len| is reset to 1 when a follow-up byte cannot be read
+      return status;
+    }
+
+    if (status > 0) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result <<= 8;
+    result |= b;
+
+    ++pos;
+  }
+
+  return result;
+}
+
+// Reads an EBML ID and returns it.
+// An ID must be at least 1 byte long, cannot exceed 4 bytes, and its value
+// must be greater than 0.
+// See known EBML values and EBMLMaxIDLength:
+// http://www.matroska.org/technical/specs/index.html
+// Returns the ID, or a value less than 0 to report an error while reading the
+// ID.
+long long ReadID(IMkvReader* pReader, long long pos, long& len) {
+  if (pReader == NULL || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  // Read the first byte. The length in bytes of the ID is determined by
+  // finding the first set bit in the first byte of the ID.
+  unsigned char temp_byte = 0;
+  int read_status = pReader->Read(pos, 1, &temp_byte);
+  // NOTE: any negative read status is reported as E_FILE_FORMAT_INVALID.
+  if (read_status < 0)
+    return E_FILE_FORMAT_INVALID;
+  else if (read_status > 0)  // No data to read.
+    return E_BUFFER_NOT_FULL;
+
+  if (temp_byte == 0)  // ID length > 8 bytes; invalid file.
+    return E_FILE_FORMAT_INVALID;
+
+  int bit_pos = 0;
+  const int kMaxIdLengthInBytes = 4;
+  const int kCheckByte = 0x80;
+  // Only the top four bits are examined, one per permitted ID length.
+  // Find the first bit that's set.
+  bool found_bit = false;
+  for (; bit_pos < kMaxIdLengthInBytes; ++bit_pos) {
+    if ((kCheckByte >> bit_pos) & temp_byte) {
+      found_bit = true;
+      break;
+    }
+  }
+
+  if (!found_bit) {
+    // The value is too large to be a valid ID.
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  // Read the remaining bytes of the ID (if any).
+  const int id_length = bit_pos + 1;
+  long long ebml_id = temp_byte;  // the marker bit stays part of the ID
+  for (int i = 1; i < id_length; ++i) {
+    ebml_id <<= 8;
+    read_status = pReader->Read(pos + i, 1, &temp_byte);
+
+    if (read_status < 0)
+      return E_FILE_FORMAT_INVALID;
+    else if (read_status > 0)
+      return E_BUFFER_NOT_FULL;
+
+    ebml_id |= temp_byte;
+  }
+  // |len| is written only on success.
+  len = id_length;
+  return ebml_id;
+}
+
+long long GetUIntLength(IMkvReader* pReader, long long pos, long& len) {
+  if (!pReader || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+  // Returns 0 and sets |len| on success; only the first byte is read.
+  long long total, available;
+
+  int status = pReader->Length(&total, &available);
+  if (status < 0 || (total >= 0 && available > total))
+    return E_FILE_FORMAT_INVALID;
+
+  len = 1;
+  // NOTE: with pos == 0 and available == 0 this returns 0, i.e. success.
+  if (pos >= available)
+    return pos;  // too few bytes available
+
+  unsigned char b;
+
+  status = pReader->Read(pos, 1, &b);
+
+  if (status != 0)
+    return status;
+
+  if (b == 0)  // we can't handle u-int values larger than 8 bytes
+    return E_FILE_FORMAT_INVALID;
+  // Each leading zero bit of the first byte adds one byte to |len|.
+  unsigned char m = 0x80;
+
+  while (!(b & m)) {
+    m >>= 1;
+    ++len;
+  }
+
+  return 0;  // success
+}
+
+// TODO(vigneshv): This function assumes that unsigned values never have their
+// high bit set.
+long long UnserializeUInt(IMkvReader* pReader, long long pos, long long size) {
+  // Only payloads of 1 to 8 bytes at a non-negative position are read.
+  const bool size_ok = (size > 0) && (size <= 8);
+  if (!pReader || pos < 0 || !size_ok)
+    return E_FILE_FORMAT_INVALID;
+
+  // Fold the bytes in big-endian order, one single-byte read at a time.
+  long long value = 0;
+  for (long long remaining = size; remaining > 0; --remaining, ++pos) {
+    unsigned char next_byte;
+    const long status = pReader->Read(pos, 1, &next_byte);
+
+    // Only negative statuses abort; they are returned unchanged.
+    if (status < 0)
+      return status;
+
+    value <<= 8;
+    value |= next_byte;
+  }
+
+  return value;
+}
+
+long UnserializeFloat(IMkvReader* pReader, long long pos, long long size_,
+                      double& result) {
+  // Only 4-byte (float) and 8-byte (double) payloads are accepted.
+  if (!pReader || pos < 0 || ((size_ != 4) && (size_ != 8)))
+    return E_FILE_FORMAT_INVALID;
+
+  const long size = static_cast<long>(size_);
+  unsigned char buf[8];
+  const int status = pReader->Read(pos, size, buf);
+  if (status < 0)  // error
+    return status;
+
+  if (size == 4) {
+    // The integer member has to be exactly as wide as float: a wider
+    // type only overlays the float's bytes on little-endian hosts.
+    union {
+      float f;
+      unsigned int ff;
+    };
+    assert(sizeof(f) == sizeof(ff));
+    ff = 0;
+
+    for (int i = 0;;) {
+      ff |= buf[i];
+
+      if (++i >= 4)
+        break;
+
+      ff <<= 8;
+    }
+
+    result = f;
+  } else {
+    union {
+      double d;
+      unsigned long long dd;
+    };
+    dd = 0;
+
+    for (int i = 0;;) {
+      dd |= buf[i];
+
+      if (++i >= 8)
+        break;
+
+      dd <<= 8;
+    }
+
+    result = d;
+  }
+
+  // Infinities and NaNs are treated as a malformed file.
+  if (mkvparser::isinf(result) || mkvparser::isnan(result))
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+long UnserializeInt(IMkvReader* pReader, long long pos, long long size,
+                    long long& result_ref) {
+  // Only payloads of 1 to 8 bytes at a non-negative position are read.
+  if (!pReader || pos < 0 || size < 1 || size > 8)
+    return E_FILE_FORMAT_INVALID;
+
+  // Reading the first byte as signed char sign-extends it when it is
+  // widened, which is what gives short negative payloads their sign.
+  signed char leading = 0;
+  const long first_status =
+      pReader->Read(pos, 1, (unsigned char*)&leading);
+
+  if (first_status < 0)
+    return first_status;
+  unsigned long long accumulator = leading;
+
+  // Shift in the remaining bytes, most significant first.
+  for (long long offset = 1; offset < size; ++offset) {
+    unsigned char next_byte;
+    const long status = pReader->Read(pos + offset, 1, &next_byte);
+
+    if (status < 0)
+      return status;
+
+    accumulator = (accumulator << 8) | next_byte;
+  }
+
+  // |result_ref| is written only after every byte was read.
+  result_ref = static_cast<long long>(accumulator);
+  return 0;
+}
+
+long UnserializeString(IMkvReader* pReader, long long pos, long long size,
+                       char*& str) {
+  // Release any previous string so |str| is NULL on every failure path.
+  delete[] str;
+  str = NULL;
+
+  // Same reader/position guard as the other Unserialize* helpers.
+  if (!pReader || pos < 0 || size >= LONG_MAX || size < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  // +1 for '\0' terminator
+  const long required_size = static_cast<long>(size) + 1;
+
+  str = SafeArrayAlloc<char>(1, required_size);
+  if (str == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  unsigned char* const buf = reinterpret_cast<unsigned char*>(str);
+  const long status = pReader->Read(pos, static_cast<long>(size), buf);
+
+  if (status) {
+    delete[] str;
+    str = NULL;
+    return status;
+  }
+
+  str[required_size - 1] = '\0';
+  return 0;
+}
+
+long ParseElementHeader(IMkvReader* pReader, long long& pos,
+                        long long stop, long long& id,
+                        long long& size) {
+  if (stop >= 0 && pos >= stop)
+    return E_FILE_FORMAT_INVALID;
+  // A negative |stop| means no upper bound is enforced.
+  long len;
+
+  id = ReadID(pReader, pos, len);
+
+  if (id < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume id
+
+  if (stop >= 0 && pos >= stop)
+    return E_FILE_FORMAT_INVALID;
+
+  size = ReadUInt(pReader, pos, len);
+
+  if (size < 0 || len < 1 || len > 8) {
+    // Invalid: Negative payload size, negative or 0 length integer, or integer
+    // larger than 64 bits (libwebm cannot handle them).
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  // Avoid rolling over pos when very close to LLONG_MAX.
+  const unsigned long long rollover_check =
+      static_cast<unsigned long long>(pos) + len;
+  if (rollover_check > LLONG_MAX)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume length of size
+
+  // pos now designates payload
+  // NOTE: a payload that would start exactly at |stop| is rejected here.
+  if (stop >= 0 && pos >= stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;  // success
+}
+
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+           long long& val) {
+  if (!pReader || pos < 0)
+    return false;
+  // NOTE: |pos| may already have been advanced when false is returned.
+  long long total = 0;
+  long long available = 0;
+
+  const long status = pReader->Length(&total, &available);
+  if (status < 0 || (total >= 0 && available > total))
+    return false;
+
+  long len = 0;
+  // NOTE(review): fails when MORE than |len| bytes remain; confirm intent.
+  const long long id = ReadID(pReader, pos, len);
+  if (id < 0 || (available - pos) > len)
+    return false;
+
+  if (static_cast<unsigned long>(id) != expected_id)
+    return false;
+
+  pos += len;  // consume id
+
+  const long long size = ReadUInt(pReader, pos, len);
+  if (size < 0 || size > 8 || len < 1 || len > 8 || (available - pos) > len)
+    return false;
+
+  pos += len;  // consume length of size of payload
+
+  val = UnserializeUInt(pReader, pos, size);
+  if (val < 0)
+    return false;
+
+  pos += size;  // consume size of payload
+
+  return true;
+}
+
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+           unsigned char*& buf, size_t& buflen) {
+  if (!pReader || pos < 0)
+    return false;
+
+  long long total = 0;
+  long long available = 0;
+
+  long status = pReader->Length(&total, &available);
+  if (status < 0 || (total >= 0 && available > total))
+    return false;
+  // NOTE(review): fails when MORE than |len| bytes remain; confirm intent.
+  long len = 0;
+  const long long id = ReadID(pReader, pos, len);
+  if (id < 0 || (available - pos) > len)
+    return false;
+
+  if (static_cast<unsigned long>(id) != expected_id)
+    return false;
+
+  pos += len;  // consume id
+
+  const long long size = ReadUInt(pReader, pos, len);
+  if (size < 0 || len <= 0 || len > 8 || (available - pos) > len)
+    return false;
+
+  unsigned long long rollover_check =
+      static_cast<unsigned long long>(pos) + len;
+  if (rollover_check > LLONG_MAX)
+    return false;
+
+  pos += len;  // consume length of size of payload
+
+  rollover_check = static_cast<unsigned long long>(pos) + size;
+  if (rollover_check > LLONG_MAX)
+    return false;
+
+  if ((pos + size) > available)
+    return false;
+
+  if (size >= LONG_MAX)
+    return false;
+
+  const long buflen_ = static_cast<long>(size);
+  // A zero |size| makes SafeArrayAlloc() return NULL, so it fails below.
+  buf = SafeArrayAlloc<unsigned char>(1, buflen_);
+  if (!buf)
+    return false;
+  // NOTE: |buf| keeps the new[] allocation if the read below fails.
+  status = pReader->Read(pos, buflen_, buf);
+  if (status != 0)
+    return false;
+
+  buflen = buflen_;
+
+  pos += size;  // consume size of payload
+  return true;
+}
+
+EBMLHeader::EBMLHeader() : m_docType(NULL) { Init(); }
+// m_docType is NULL-initialized above because Init() frees a non-NULL one.
+EBMLHeader::~EBMLHeader() { delete[] m_docType; }
+
+void EBMLHeader::Init() {
+  // Drop any DocType string left over from an earlier parse; delete[]
+  // on a NULL pointer is a no-op, so no test is needed first.
+  delete[] m_docType;
+  m_docType = NULL;
+
+  // Every numeric field goes back to its default.
+  m_version = 1;
+  m_readVersion = 1;
+  m_docTypeVersion = 1;
+  m_docTypeReadVersion = 1;
+  m_maxIdLength = 4;
+  m_maxSizeLength = 8;
+}
+
+long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
+  if (!pReader)
+    return E_FILE_FORMAT_INVALID;
+  // Returns 0 on success; a position (pos + n) means "try again later".
+  long long total, available;
+
+  long status = pReader->Length(&total, &available);
+
+  if (status < 0)  // error
+    return status;
+
+  pos = 0;
+  long long end = (available >= 1024) ? 1024 : available;  // reassigned below before its first use
+
+  // Scan until we find what looks like the first byte of the EBML header.
+  const long long kMaxScanBytes = (available >= 1024) ? 1024 : available;
+  const unsigned char kEbmlByte0 = 0x1A;
+  unsigned char scan_byte = 0;
+
+  while (pos < kMaxScanBytes) {
+    status = pReader->Read(pos, 1, &scan_byte);
+
+    if (status < 0)  // error
+      return status;
+    else if (status > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if (scan_byte == kEbmlByte0)
+      break;
+
+    ++pos;
+  }
+  // If the scan found nothing, the ID check below is what rejects the file.
+  long len = 0;
+  const long long ebml_id = ReadID(pReader, pos, len);
+
+  // TODO(tomfinegan): Move Matroska ID constants into a common namespace.
+  if (len != 4 || ebml_id != mkvmuxer::kMkvEBML)
+    return E_FILE_FORMAT_INVALID;
+
+  // Move read pos forward to the EBML header size field.
+  pos += 4;
+
+  // Read length of size field.
+  long long result = GetUIntLength(pReader, pos, len);
+
+  if (result < 0)  // error
+    return E_FILE_FORMAT_INVALID;
+  else if (result > 0)  // need more data
+    return E_BUFFER_NOT_FULL;
+
+  if (len < 1 || len > 8)
+    return E_FILE_FORMAT_INVALID;
+
+  if ((total >= 0) && ((total - pos) < len))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((available - pos) < len)
+    return pos + len;  // try again later
+
+  // Read the EBML header size.
+  result = ReadUInt(pReader, pos, len);
+
+  if (result < 0)  // error
+    return result;
+
+  pos += len;  // consume size field
+
+  // pos now designates start of payload
+
+  if ((total >= 0) && ((total - pos) < result))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((available - pos) < result)
+    return pos + result;
+
+  end = pos + result;
+
+  Init();
+
+  while (pos < end) {
+    long long id, size;
+
+    status = ParseElementHeader(pReader, pos, end, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (size == 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (id == mkvmuxer::kMkvEBMLVersion) {
+      m_version = UnserializeUInt(pReader, pos, size);
+
+      if (m_version <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvEBMLReadVersion) {
+      m_readVersion = UnserializeUInt(pReader, pos, size);
+
+      if (m_readVersion <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvEBMLMaxIDLength) {
+      m_maxIdLength = UnserializeUInt(pReader, pos, size);
+
+      if (m_maxIdLength <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvEBMLMaxSizeLength) {
+      m_maxSizeLength = UnserializeUInt(pReader, pos, size);
+
+      if (m_maxSizeLength <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvDocType) {
+      if (m_docType)
+        return E_FILE_FORMAT_INVALID;
+
+      status = UnserializeString(pReader, pos, size, m_docType);
+
+      if (status)  // error
+        return status;
+    } else if (id == mkvmuxer::kMkvDocTypeVersion) {
+      m_docTypeVersion = UnserializeUInt(pReader, pos, size);
+
+      if (m_docTypeVersion <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvDocTypeReadVersion) {
+      m_docTypeReadVersion = UnserializeUInt(pReader, pos, size);
+
+      if (m_docTypeReadVersion <= 0)
+        return E_FILE_FORMAT_INVALID;
+    }
+    // Elements with any other ID are skipped without being read.
+    pos += size;
+  }
+
+  if (pos != end)
+    return E_FILE_FORMAT_INVALID;
+
+  // Make sure DocType, DocTypeReadVersion, and DocTypeVersion are valid.
+  if (m_docType == NULL || m_docTypeReadVersion <= 0 || m_docTypeVersion <= 0)
+    return E_FILE_FORMAT_INVALID;
+
+  // Make sure EBMLMaxIDLength and EBMLMaxSizeLength are valid.
+  if (m_maxIdLength <= 0 || m_maxIdLength > 4 ||
+      m_maxSizeLength <= 0 || m_maxSizeLength > 8)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+Segment::Segment(IMkvReader* pReader, long long elem_start,
+                 // long long elem_size,
+                 long long start, long long size)
+    : m_pReader(pReader),
+      m_element_start(elem_start),
+      // m_element_size(elem_size),
+      m_start(start),
+      m_size(size),
+      m_pos(start),  // the parse position starts at |start|
+      m_pUnknownSize(0),
+      m_pSeekHead(NULL),  // no sub-objects or clusters exist yet
+      m_pInfo(NULL),
+      m_pTracks(NULL),
+      m_pCues(NULL),
+      m_pChapters(NULL),
+      m_pTags(NULL),
+      m_clusters(NULL),
+      m_clusterCount(0),
+      m_clusterPreloadCount(0),
+      m_clusterSize(0) {}
+
+Segment::~Segment() {
+  // m_clusters is treated as an array of m_clusterCount +
+  // m_clusterPreloadCount owned pointers; each entry is deleted, then the
+  // array itself.
+  const long count = m_clusterCount + m_clusterPreloadCount;
+  Cluster** const stop = m_clusters + count;
+
+  for (Cluster** it = m_clusters; it != stop; ++it)
+    delete *it;
+
+  delete[] m_clusters;
+
+  // Release the remaining owned sub-objects.
+  delete m_pTracks;
+  delete m_pInfo;
+  delete m_pCues;
+  delete m_pChapters;
+  delete m_pTags;
+  delete m_pSeekHead;
+}
+
+long long Segment::CreateInstance(IMkvReader* pReader, long long pos,
+                                  Segment*& pSegment) {
+  if (pReader == NULL || pos < 0)
+    return E_PARSE_FAILED;
+  // On success returns 0 and stores the new Segment in |pSegment|.
+  pSegment = NULL;
+
+  long long total, available;
+
+  const long status = pReader->Length(&total, &available);
+
+  if (status < 0)  // error
+    return status;
+
+  if (available < 0)
+    return -1;
+
+  if ((total >= 0) && (available > total))
+    return -1;
+
+  // I would assume that in practice this loop would execute
+  // exactly once, but we allow for other elements (e.g. Void)
+  // to immediately follow the EBML header.  This is fine for
+  // the source filter case (since the entire file is available),
+  // but in the splitter case over a network we should probably
+  // just give up early.  We could for example decide only to
+  // execute this loop a maximum of, say, 10 times.
+  // TODO:
+  // There is an implied "give up early" by only parsing up
+  // to the available limit.  We do do that, but only if the
+  // total file size is unknown.  We could decide to always
+  // use what's available as our limit (irrespective of whether
+  // we happen to know the total file length).  This would have
+  // as its sense "parse this much of the file before giving up",
+  // which is a slightly different sense from "try to parse up to
+  // 10 EBML elements before giving up".
+
+  for (;;) {
+    if ((total >= 0) && (pos >= total))
+      return E_FILE_FORMAT_INVALID;
+
+    // Read ID
+    long len;
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result)  // error, or too few available bytes
+      return result;
+
+    if ((total >= 0) && ((pos + len) > total))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > available)
+      return pos + len;
+
+    const long long idpos = pos;
+    const long long id = ReadID(pReader, pos, len);
+
+    if (id < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume ID
+
+    // Read Size
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result)  // error, or too few available bytes
+      return result;
+
+    if ((total >= 0) && ((pos + len) > total))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > available)
+      return pos + len;
+
+    long long size = ReadUInt(pReader, pos, len);
+
+    if (size < 0)  // error
+      return size;
+
+    pos += len;  // consume length of size of element
+
+    // Pos now points to start of payload
+
+    // Handle "unknown size" for live streaming of webm files.
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (id == mkvmuxer::kMkvSegment) {
+      if (size == unknown_size)
+        size = -1;
+
+      else if (total < 0)
+        size = -1;
+
+      else if ((pos + size) > total)
+        size = -1;
+      // A size of -1 is what the Segment receives for "unknown or too big".
+      pSegment = new (std::nothrow) Segment(pReader, idpos, pos, size);
+      if (pSegment == NULL)
+        return E_PARSE_FAILED;
+
+      return 0;  // success
+    }
+
+    if (size == unknown_size)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((total >= 0) && ((pos + size) > total))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + size) > available)
+      return pos + size;
+
+    pos += size;  // consume payload
+  }
+}
+
+// Parses the level-1 elements located between the current segment position
+// (m_pos) and the first Cluster, creating the SegmentInfo, Tracks, Cues,
+// SeekHead, Chapters and Tags objects as they are encountered.  Parsing stops
+// at the first Cluster ID, at the end of the segment, or at the end of the
+// file, whichever comes first.
+//
+// m_pos is only advanced after an element has been consumed in full, so the
+// call can be repeated after an underflow.
+//
+// Returns:
+//   0    success (both the Info and the Tracks element were found).
+//   < 0  error: a reader error, -1 on allocation failure, or
+//        E_FILE_FORMAT_INVALID for malformed input.
+//   > 0  underflow: the file position that must be available from the
+//        reader before the pending element can be parsed.
+long long Segment::ParseHeaders() {
+  // Outermost (level 0) segment object has been constructed,
+  // and pos designates start of payload.  We need to find the
+  // inner (level 1) elements.
+  long long total, available;
+
+  const int status = m_pReader->Length(&total, &available);
+
+  if (status < 0)  // error
+    return status;
+
+  if (total > 0 && available > total)
+    return E_FILE_FORMAT_INVALID;
+
+  // A negative m_size means the segment size is unknown; in that case there
+  // is no stop position to validate against.
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  if ((segment_stop >= 0 && total >= 0 && segment_stop > total) ||
+      (segment_stop >= 0 && m_pos > segment_stop)) {
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  for (;;) {
+    if ((total >= 0) && (m_pos >= total))
+      break;
+
+    if ((segment_stop >= 0) && (m_pos >= segment_stop))
+      break;
+
+    long long pos = m_pos;
+    const long long element_start = pos;
+
+    // Avoid rolling over pos when very close to LLONG_MAX.
+    unsigned long long rollover_check = pos + 1ULL;
+    if (rollover_check > LLONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + 1) > available)
+      return (pos + 1);
+
+    // Read ID
+    long len;
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return result;
+
+    if (result > 0) {
+      // MkvReader doesn't have enough data to satisfy this read attempt.
+      return (pos + 1);
+    }
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > available)
+      return pos + len;
+
+    const long long idpos = pos;
+    const long long id = ReadID(m_pReader, idpos, len);
+
+    if (id < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    // The first cluster ends the header area; m_pos is left pointing at it.
+    if (id == mkvmuxer::kMkvCluster)
+      break;
+
+    pos += len;  // consume ID
+
+    if ((pos + 1) > available)
+      return (pos + 1);
+
+    // Read Size
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return result;
+
+    if (result > 0) {
+      // MkvReader doesn't have enough data to satisfy this read attempt.
+      return (pos + 1);
+    }
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > available)
+      return pos + len;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error
+      return size;
+
+    if (len < 1 || len > 8) {
+      // TODO(tomfinegan): ReadUInt should return an error when len is < 1 or
+      // len > 8 is true instead of checking this _everywhere_.
+      //
+      // size is non-negative here, so it must not be returned as the status:
+      // callers would read it as success (0) or as an underflow position (>0).
+      return E_FILE_FORMAT_INVALID;
+    }
+
+    pos += len;  // consume length of size of element
+
+    // Avoid rolling over pos when very close to LLONG_MAX.
+    rollover_check = static_cast<unsigned long long>(pos) + size;
+    if (rollover_check > LLONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
+    // Size of the whole element: ID + size field + payload.
+    const long long element_size = size + pos - element_start;
+
+    // Pos now points to start of payload
+
+    if ((segment_stop >= 0) && ((pos + size) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // We read EBML elements either in total or nothing at all.
+
+    if ((pos + size) > available)
+      return pos + size;
+
+    if (id == mkvmuxer::kMkvInfo) {
+      // A second Info element is treated as a format error.
+      if (m_pInfo)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pInfo = new (std::nothrow)
+          SegmentInfo(this, pos, size, element_start, element_size);
+
+      if (m_pInfo == NULL)
+        return -1;
+
+      const long status = m_pInfo->Parse();
+
+      if (status)
+        return status;
+    } else if (id == mkvmuxer::kMkvTracks) {
+      // A second Tracks element is treated as a format error.
+      if (m_pTracks)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pTracks = new (std::nothrow)
+          Tracks(this, pos, size, element_start, element_size);
+
+      if (m_pTracks == NULL)
+        return -1;
+
+      const long status = m_pTracks->Parse();
+
+      if (status)
+        return status;
+    } else if (id == mkvmuxer::kMkvCues) {
+      // Only the first Cues element is kept; it is created here but not
+      // parsed by this function.
+      if (m_pCues == NULL) {
+        m_pCues = new (std::nothrow)
+            Cues(this, pos, size, element_start, element_size);
+
+        if (m_pCues == NULL)
+          return -1;
+      }
+    } else if (id == mkvmuxer::kMkvSeekHead) {
+      // Only the first SeekHead is kept; later ones are skipped.
+      if (m_pSeekHead == NULL) {
+        m_pSeekHead = new (std::nothrow)
+            SeekHead(this, pos, size, element_start, element_size);
+
+        if (m_pSeekHead == NULL)
+          return -1;
+
+        const long status = m_pSeekHead->Parse();
+
+        if (status)
+          return status;
+      }
+    } else if (id == mkvmuxer::kMkvChapters) {
+      // Only the first Chapters element is kept; later ones are skipped.
+      if (m_pChapters == NULL) {
+        m_pChapters = new (std::nothrow)
+            Chapters(this, pos, size, element_start, element_size);
+
+        if (m_pChapters == NULL)
+          return -1;
+
+        const long status = m_pChapters->Parse();
+
+        if (status)
+          return status;
+      }
+    } else if (id == mkvmuxer::kMkvTags) {
+      // Only the first Tags element is kept; later ones are skipped.
+      if (m_pTags == NULL) {
+        m_pTags = new (std::nothrow)
+            Tags(this, pos, size, element_start, element_size);
+
+        if (m_pTags == NULL)
+          return -1;
+
+        const long status = m_pTags->Parse();
+
+        if (status)
+          return status;
+      }
+    }
+
+    // Any other element ID is skipped without being interpreted.
+    m_pos = pos + size;  // consume payload
+  }
+
+  if (segment_stop >= 0 && m_pos > segment_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (m_pInfo == NULL)  // TODO: liberalize this behavior
+    return E_FILE_FORMAT_INVALID;
+
+  if (m_pTracks == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;  // success
+}
+
+// Keeps calling DoLoadCluster() for as long as it asks to be retried (a
+// status greater than 1) and hands the first status <= 1 back to the caller.
+long Segment::LoadCluster(long long& pos, long& len) {
+  long status;
+
+  do {
+    status = DoLoadCluster(pos, len);
+  } while (status > 1);
+
+  return status;
+}
+
+// Performs one step of incremental cluster loading, starting at the segment's
+// current parse position (m_pos).
+//
+// Level-1 elements are scanned until a Cluster is found.  A Cues element met
+// on the way creates m_pCues (if not already present); other non-cluster
+// elements are skipped.
+//
+// Returns:
+//   < 0  error or underflow (E_BUFFER_NOT_FULL); pos and len are set by the
+//        read that could not be satisfied.
+//   0    a cluster was loaded (or promoted from preloaded to loaded).
+//   1    no more clusters.
+//   2    the cluster found had no block entries; call again.
+//
+// A negative m_pos means a cluster of unknown size is still being parsed; the
+// work is then delegated to DoLoadClusterUnknownSize().
+long Segment::DoLoadCluster(long long& pos, long& len) {
+  if (m_pos < 0)
+    return DoLoadClusterUnknownSize(pos, len);
+
+  long long total, avail;
+
+  long status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  if (total >= 0 && avail > total)
+    return E_FILE_FORMAT_INVALID;
+
+  // -1 when the segment size is unknown (m_size < 0).
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  long long cluster_off = -1;  // offset relative to start of segment
+  long long cluster_size = -1;  // size of cluster payload
+
+  for (;;) {
+    if ((total >= 0) && (m_pos >= total))
+      return 1;  // no more clusters
+
+    if ((segment_stop >= 0) && (m_pos >= segment_stop))
+      return 1;  // no more clusters
+
+    pos = m_pos;
+
+    // Read ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long idpos = pos;
+    const long long id = ReadID(m_pReader, idpos, len);
+
+    if (id < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume ID
+
+    // Read Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume length of size of element
+
+    // pos now points to start of payload
+
+    if (size == 0) {
+      // Missing element payload: move on.
+      m_pos = pos;
+      continue;
+    }
+
+    // All value bits set in a size field of this length: the "unknown size"
+    // marker.
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if ((segment_stop >= 0) && (size != unknown_size) &&
+        ((pos + size) > segment_stop)) {
+      return E_FILE_FORMAT_INVALID;
+    }
+
+    if (id == mkvmuxer::kMkvCues) {
+      if (size == unknown_size) {
+        // Cues element of unknown size: Not supported.
+        return E_FILE_FORMAT_INVALID;
+      }
+
+      // Keep only the first Cues element; it is created but not parsed here.
+      if (m_pCues == NULL) {
+        const long long element_size = (pos - idpos) + size;
+
+        m_pCues = new (std::nothrow) Cues(this, pos, size, idpos, element_size);
+        if (m_pCues == NULL)
+          return -1;
+      }
+
+      m_pos = pos + size;  // consume payload
+      continue;
+    }
+
+    if (id != mkvmuxer::kMkvCluster) {
+      // Besides the Segment, Libwebm allows only cluster elements of unknown
+      // size. Fail the parse upon encountering a non-cluster element reporting
+      // unknown size.
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pos = pos + size;  // consume payload
+      continue;
+    }
+
+    // We have a cluster.
+
+    cluster_off = idpos - m_start;  // relative pos
+
+    // cluster_size stays -1 for a cluster of unknown size.
+    if (size != unknown_size)
+      cluster_size = size;
+
+    break;
+  }
+
+  if (cluster_off < 0) {
+    // No cluster, die.
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  // Separate out-params so that pos/len are only overwritten when
+  // HasBlockEntries reports an error or underflow.
+  long long pos_;
+  long len_;
+
+  status = Cluster::HasBlockEntries(this, cluster_off, pos_, len_);
+
+  if (status < 0) {  // error, or underflow
+    pos = pos_;
+    len = len_;
+
+    return status;
+  }
+
+  // status == 0 means "no block entries found"
+  // status > 0 means "found at least one block entry"
+
+  // TODO:
+  // The issue here is that the segment increments its own
+  // pos ptr past the most recent cluster parsed, and then
+  // starts from there to parse the next cluster.  If we
+  // don't know the size of the current cluster, then we
+  // must either parse its payload (as we do below), looking
+  // for the cluster (or cues) ID to terminate the parse.
+  // This isn't really what we want: rather, we really need
+  // a way to create the curr cluster object immediately.
+  // The pity is that cluster::parse can determine its own
+  // boundary, and we largely duplicate that same logic here.
+  //
+  // Maybe we need to get rid of our look-ahead preloading
+  // in source::parse???
+  //
+  // As we're parsing the blocks in the curr cluster
+  //(in cluster::parse), we should have some way to signal
+  // to the segment that we have determined the boundary,
+  // so it can adjust its own segment::m_pos member.
+  //
+  // The problem is that we're asserting in asyncreadinit,
+  // because we adjust the pos down to the curr seek pos,
+  // and the resulting adjusted len is > 2GB.  I'm suspicious
+  // that this is even correct, but even if it is, we can't
+  // be loading that much data in the cache anyway.
+
+  // Index the new cluster will occupy in m_clusters: the first slot after
+  // the already loaded clusters.
+  const long idx = m_clusterCount;
+
+  if (m_clusterPreloadCount > 0) {
+    if (idx >= m_clusterSize)
+      return E_FILE_FORMAT_INVALID;
+
+    // The first preloaded cluster sits right after the loaded ones; a
+    // preloaded cluster has a negative m_index.
+    Cluster* const pCluster = m_clusters[idx];
+    if (pCluster == NULL || pCluster->m_index >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    const long long off = pCluster->GetPosition();
+    if (off < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (off == cluster_off) {  // preloaded already
+      if (status == 0)  // no entries found
+        return E_FILE_FORMAT_INVALID;
+
+      if (cluster_size >= 0)
+        pos += cluster_size;
+      else {
+        const long long element_size = pCluster->GetElementSize();
+
+        if (element_size <= 0)
+          return E_FILE_FORMAT_INVALID;  // TODO: handle this case
+
+        pos = pCluster->m_element_start + element_size;
+      }
+
+      pCluster->m_index = idx;  // move from preloaded to loaded
+      ++m_clusterCount;
+      --m_clusterPreloadCount;
+
+      m_pos = pos;  // consume payload
+      if (segment_stop >= 0 && m_pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
+
+      return 0;  // success
+    }
+  }
+
+  if (status == 0) {  // no entries found
+    // Skip the empty cluster when its size is known.
+    if (cluster_size >= 0)
+      pos += cluster_size;
+
+    if ((total >= 0) && (pos >= total)) {
+      m_pos = total;
+      return 1;  // no more clusters
+    }
+
+    if ((segment_stop >= 0) && (pos >= segment_stop)) {
+      m_pos = segment_stop;
+      return 1;  // no more clusters
+    }
+
+    m_pos = pos;
+    return 2;  // try again
+  }
+
+  // status > 0 means we have an entry
+
+  Cluster* const pCluster = Cluster::Create(this, idx, cluster_off);
+  if (pCluster == NULL)
+    return -1;
+
+  if (!AppendCluster(pCluster)) {
+    delete pCluster;
+    return -1;
+  }
+
+  if (cluster_size >= 0) {
+    pos += cluster_size;
+
+    m_pos = pos;
+
+    // NOTE(review): this check uses "segment_stop > 0" whereas the
+    // equivalent check in the preloaded path above uses ">= 0" -- confirm
+    // whether the difference is intentional.
+    if (segment_stop > 0 && m_pos > segment_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    return 0;
+  }
+
+  // Cluster of unknown size: remember it and store the payload position
+  // negated, so that the next call is routed to DoLoadClusterUnknownSize().
+  m_pUnknownSize = pCluster;
+  m_pos = -pos;
+
+  return 0;  // partial success, since we have a new cluster
+
+  // status == 0 means "no block entries found"
+  // pos designates start of payload
+  // m_pos has NOT been adjusted yet (in case we need to come back here)
+}
+
+// Continues parsing the cluster of unknown size recorded in m_pUnknownSize.
+//
+// Returns a negative status on error or underflow, E_PARSE_FAILED when the
+// segment is not currently tracking an unknown-size cluster, and 2 ("continue
+// parsing") otherwise.  Once the cluster's Parse() reports a positive status,
+// m_pos is moved to the end of the cluster element and tracking stops.
+long Segment::DoLoadClusterUnknownSize(long long& pos, long& len) {
+  const bool tracking_unknown_size = (m_pos < 0) && (m_pUnknownSize != NULL);
+
+  if (!tracking_unknown_size)
+    return E_PARSE_FAILED;
+
+  const long parse_status = m_pUnknownSize->Parse(pos, len);
+
+  if (parse_status < 0) {
+    // error or underflow
+    return parse_status;
+  }
+
+  if (parse_status == 0) {
+    // parsed a block; the caller should continue parsing
+    return 2;
+  }
+
+  const long long element_start = m_pUnknownSize->m_element_start;
+  const long long element_size = m_pUnknownSize->GetElementSize();
+
+  if (element_size < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  // Resume segment-level parsing immediately after the cluster element.
+  m_pos = pos = element_start + element_size;
+  m_pUnknownSize = NULL;
+
+  return 2;  // continue parsing
+}
+
+// Inserts a loaded cluster (m_index >= 0) at slot m_clusterCount of
+// m_clusters, growing the array when it is full and moving any preloaded
+// clusters one slot towards the end to make room.
+//
+// Returns false when the cluster or the bookkeeping is inconsistent, or when
+// the array cannot be grown.
+bool Segment::AppendCluster(Cluster* pCluster) {
+  if (pCluster == NULL || pCluster->m_index < 0)
+    return false;
+
+  const long used = m_clusterCount + m_clusterPreloadCount;
+  const long idx = pCluster->m_index;
+
+  if (m_clusterSize < used || idx != m_clusterCount)
+    return false;
+
+  if (used >= m_clusterSize) {
+    // The array is full: start at 2048 slots, afterwards double.
+    const long grown_size = (m_clusterSize <= 0) ? 2048 : 2 * m_clusterSize;
+
+    Cluster** const grown = new (std::nothrow) Cluster*[grown_size];
+    if (grown == NULL)
+      return false;
+
+    for (long i = 0; i != used; ++i)
+      grown[i] = m_clusters[i];
+
+    delete[] m_clusters;
+
+    m_clusters = grown;
+    m_clusterSize = grown_size;
+  }
+
+  if (m_clusterPreloadCount > 0) {
+    // The slot we want is currently held by the first preloaded cluster.
+    const long first_preloaded = m_clusterCount;
+
+    Cluster* const head = m_clusters[first_preloaded];
+    if (head == NULL || head->m_index >= 0)
+      return false;
+
+    if ((first_preloaded + m_clusterPreloadCount) >= m_clusterSize)
+      return false;
+
+    // Move the preloaded clusters up by one slot, working from the back.
+    for (long dst = first_preloaded + m_clusterPreloadCount;
+         dst != first_preloaded; --dst) {
+      Cluster* const moved = m_clusters[dst - 1];
+
+      if (moved->m_index >= 0)
+        return false;
+
+      m_clusters[dst] = moved;
+    }
+  }
+
+  m_clusters[idx] = pCluster;
+  ++m_clusterCount;
+
+  return true;
+}
+
+// Inserts a preloaded cluster (m_index < 0) at slot idx of m_clusters,
+// growing the array when it is full and moving the preloaded clusters at or
+// after idx one slot towards the end.
+//
+// Returns false when the arguments or the bookkeeping are inconsistent, or
+// when the array cannot be grown.
+bool Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) {
+  if (pCluster == NULL || pCluster->m_index >= 0 || idx < m_clusterCount)
+    return false;
+
+  const long used = m_clusterCount + m_clusterPreloadCount;
+
+  if (m_clusterSize < used)
+    return false;
+
+  if (used >= m_clusterSize) {
+    // The array is full: start at 2048 slots, afterwards double.
+    const long grown_size = (m_clusterSize <= 0) ? 2048 : 2 * m_clusterSize;
+
+    Cluster** const grown = new (std::nothrow) Cluster*[grown_size];
+    if (grown == NULL)
+      return false;
+
+    for (long i = 0; i != used; ++i)
+      grown[i] = m_clusters[i];
+
+    delete[] m_clusters;
+
+    m_clusters = grown;
+    m_clusterSize = grown_size;
+  }
+
+  if (m_clusters == NULL)
+    return false;
+
+  // The insertion slot must not lie beyond the used range, and the array
+  // needs a free slot at the end to shift into.
+  if (used < idx || used >= m_clusterSize)
+    return false;
+
+  // Open a gap at idx by moving everything behind it up by one slot.
+  for (ptrdiff_t dst = used; dst > idx; --dst) {
+    Cluster* const moved = m_clusters[dst - 1];
+
+    if (moved->m_index >= 0)
+      return false;
+
+    m_clusters[dst] = moved;
+  }
+
+  m_clusters[idx] = pCluster;
+  ++m_clusterPreloadCount;
+
+  return true;
+}
+
+// Parses the segment headers and then loads clusters until LoadCluster()
+// reports that none are left.
+//
+// Returns 0 on success, E_PARSE_FAILED when cluster state already exists,
+// E_BUFFER_NOT_FULL when the headers could not be read in full, or another
+// negative status on error.
+long Segment::Load() {
+  const bool has_cluster_state =
+      (m_clusters != NULL) || (m_clusterSize != 0) || (m_clusterCount != 0);
+
+  if (has_cluster_state)
+    return E_PARSE_FAILED;
+
+  // Outermost (level 0) segment object has been constructed,
+  // and pos designates start of payload.  We need to find the
+  // inner (level 1) elements.
+
+  const long long header_status = ParseHeaders();
+
+  if (header_status < 0) {
+    // error
+    return static_cast<long>(header_status);
+  }
+
+  if (header_status > 0) {
+    // underflow
+    return E_BUFFER_NOT_FULL;
+  }
+
+  if (m_pInfo == NULL || m_pTracks == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  int status;
+
+  do {
+    status = LoadCluster();
+  } while (status == 0);
+
+  if (status < 0) {
+    // error
+    return status;
+  }
+
+  return 0;  // no more clusters
+}
+
+// Records where the SeekHead payload (start, size_) and the whole element
+// (element_start, element_size) live.  Both tables start out empty; they are
+// filled in by Parse().
+SeekHead::SeekHead(Segment* pSegment, long long start, long long size_,
+                   long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(start),
+      m_size(size_),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_entries(NULL),
+      m_entry_count(0),
+      m_void_elements(NULL),
+      m_void_element_count(0) {
+}
+
+// Frees the entry and void-element tables.
+SeekHead::~SeekHead() {
+  delete[] m_entries;
+
+  delete[] m_void_elements;
+}
+
+// Parses the SeekHead payload in two passes: the first counts the Seek and
+// Void children so the tables can be allocated, the second fills them in.
+// Seek children that ParseEntry() rejects are left out of the final count.
+//
+// Returns 0 on success, -1 on allocation failure, a negative status from
+// ParseElementHeader(), or E_FILE_FORMAT_INVALID when the children do not
+// exactly fill the payload.
+long SeekHead::Parse() {
+  IMkvReader* const reader = m_pSegment->m_pReader;
+
+  const long long payload_stop = m_start + m_size;
+
+  // Pass 1: count the seek head entries and void elements.
+
+  int seek_total = 0;
+  int void_total = 0;
+
+  long long cursor = m_start;
+
+  while (cursor < payload_stop) {
+    long long id, payload_size;
+
+    const long header_status =
+        ParseElementHeader(reader, cursor, payload_stop, id, payload_size);
+
+    if (header_status < 0)  // error
+      return header_status;
+
+    if (id == mkvmuxer::kMkvSeek)
+      ++seek_total;
+    else if (id == mkvmuxer::kMkvVoid)
+      ++void_total;
+
+    cursor += payload_size;  // consume payload
+
+    if (cursor > payload_stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (cursor != payload_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  m_entries = new (std::nothrow) Entry[seek_total];
+
+  if (m_entries == NULL)
+    return -1;
+
+  m_void_elements = new (std::nothrow) VoidElement[void_total];
+
+  if (m_void_elements == NULL)
+    return -1;
+
+  // Pass 2: parse the entries and void elements.
+
+  Entry* next_entry = m_entries;
+  VoidElement* next_void = m_void_elements;
+
+  cursor = m_start;
+
+  while (cursor < payload_stop) {
+    const long long element_start = cursor;
+
+    long long id, payload_size;
+
+    const long header_status =
+        ParseElementHeader(reader, cursor, payload_stop, id, payload_size);
+
+    if (header_status < 0)  // error
+      return header_status;
+
+    // cursor now designates the payload of the child element.
+    const long long element_size = (cursor + payload_size) - element_start;
+
+    if (id == mkvmuxer::kMkvSeek) {
+      // The slot is only claimed when the entry parses cleanly.
+      if (ParseEntry(reader, cursor, payload_size, next_entry)) {
+        next_entry->element_start = element_start;
+        next_entry->element_size = element_size;
+        ++next_entry;
+      }
+    } else if (id == mkvmuxer::kMkvVoid) {
+      next_void->element_start = element_start;
+      next_void->element_size = element_size;
+      ++next_void;
+    }
+
+    cursor += payload_size;  // consume payload
+
+    if (cursor > payload_stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (cursor != payload_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  const ptrdiff_t entries_filled = ptrdiff_t(next_entry - m_entries);
+  assert(entries_filled >= 0);
+  assert(entries_filled <= seek_total);
+
+  m_entry_count = static_cast<int>(entries_filled);
+
+  const ptrdiff_t voids_filled = ptrdiff_t(next_void - m_void_elements);
+  assert(voids_filled >= 0);
+  assert(voids_filled <= void_total);
+
+  m_void_element_count = static_cast<int>(voids_filled);
+
+  return 0;
+}
+
+// Number of seek entries stored by Parse().
+int SeekHead::GetCount() const {
+  return m_entry_count;
+}
+
+// Returns the seek entry at idx, or a null pointer when idx is outside
+// [0, m_entry_count).
+const SeekHead::Entry* SeekHead::GetEntry(int idx) const {
+  const bool in_range = (idx >= 0) && (idx < m_entry_count);
+
+  if (!in_range)
+    return 0;
+
+  return &m_entries[idx];
+}
+
+// Number of void elements stored by Parse().
+int SeekHead::GetVoidElementCount() const {
+  return m_void_element_count;
+}
+
+// Returns the void element at idx, or a null pointer when idx is outside
+// [0, m_void_element_count).
+const SeekHead::VoidElement* SeekHead::GetVoidElement(int idx) const {
+  const bool in_range = (idx >= 0) && (idx < m_void_element_count);
+
+  if (!in_range)
+    return 0;
+
+  return &m_void_elements[idx];
+}
+
+// Creates the Cues object for the Cues element located off bytes after the
+// start of the segment payload (m_start).  The element header is validated
+// here; the Cues payload itself is not parsed by this function.
+//
+// pos and len are out-parameters: when E_BUFFER_NOT_FULL is returned they
+// describe the read that could not be satisfied.
+//
+// Returns:
+//   0    success, or a Cues object already exists.
+//   1    nothing to do: total length unknown, the element lies (partly)
+//        beyond the end of the file, or the Cues element is empty.
+//   < 0  error: -1 for a negative offset or allocation failure, a reader
+//        error, E_BUFFER_NOT_FULL, or E_FILE_FORMAT_INVALID.
+long Segment::ParseCues(long long off, long long& pos, long& len) {
+  if (m_pCues)
+    return 0;  // success
+
+  if (off < 0)
+    return -1;
+
+  long long total, avail;
+
+  const int status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  // Report an inconsistent reader as an error instead of asserting: an
+  // assert aborts in debug builds and is compiled out under NDEBUG.
+  if (total >= 0 && avail > total)
+    return E_FILE_FORMAT_INVALID;
+
+  pos = m_start + off;
+
+  if ((total < 0) || (pos >= total))
+    return 1;  // don't bother parsing cues
+
+  const long long element_start = pos;
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  // Read ID
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long result = GetUIntLength(m_pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // underflow (weird)
+  {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long idpos = pos;
+
+  const long long id = ReadID(m_pReader, idpos, len);
+
+  // Anything other than a Cues ID at this offset (including a read error)
+  // is rejected.
+  if (id != mkvmuxer::kMkvCues)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume ID
+  // Already guaranteed by the segment_stop check above.
+  assert((segment_stop < 0) || (pos <= segment_stop));
+
+  // Read Size
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  result = GetUIntLength(m_pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // underflow (weird)
+  {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long size = ReadUInt(m_pReader, pos, len);
+
+  if (size < 0)  // error
+    return static_cast<long>(size);
+
+  if (size == 0)  // weird, although technically not illegal
+    return 1;  // done
+
+  pos += len;  // consume length of size of element
+  // Already guaranteed by the segment_stop check above.
+  assert((segment_stop < 0) || (pos <= segment_stop));
+
+  // Pos now points to start of payload
+
+  const long long element_stop = pos + size;
+
+  if ((segment_stop >= 0) && (element_stop > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((total >= 0) && (element_stop > total))
+    return 1;  // don't bother parsing anymore
+
+  // Tell the caller how many payload bytes are needed at pos.
+  len = static_cast<long>(size);
+
+  if (element_stop > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long element_size = element_stop - element_start;
+
+  m_pCues =
+      new (std::nothrow) Cues(this, pos, size, element_start, element_size);
+  if (m_pCues == NULL)
+    return -1;
+
+  return 0;  // success
+}
+
+// Parses one Seek element payload of size_ bytes starting at start.  The
+// payload must consist of exactly a SeekID child followed by a SeekPosition
+// child.  The decoded values are written to pEntry->id and pEntry->pos.
+//
+// Returns true when the payload is well formed.  On failure pEntry may have
+// been partially written.
+bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_,
+                          Entry* pEntry) {
+  if (size_ <= 0)
+    return false;
+
+  const long long entry_stop = start + size_;
+  long long cursor = start;
+
+  long field_len;
+
+  // --- SeekID child: the ID of the level-1 element being referenced ---
+
+  const long long seek_id_elem = ReadID(pReader, cursor, field_len);
+
+  if (seek_id_elem < 0 || seek_id_elem != mkvmuxer::kMkvSeekID)
+    return false;
+
+  if ((cursor + field_len) > entry_stop)
+    return false;
+
+  cursor += field_len;  // consume SeekID id
+
+  const long long seek_id_size = ReadUInt(pReader, cursor, field_len);
+
+  if (seek_id_size <= 0 || (cursor + field_len) > entry_stop)
+    return false;
+
+  cursor += field_len;  // consume size of field
+
+  if ((cursor + seek_id_size) > entry_stop)
+    return false;
+
+  // Note that the SeekId payload really is serialized
+  // as a "Matroska integer", not as a plain binary value.
+  // In fact, Matroska requires that ID values in the
+  // stream exactly match the binary representation as listed
+  // in the Matroska specification.
+  //
+  // This parser is more liberal, and permits IDs to have
+  // any width.  (This could make the representation in the stream
+  // different from what's in the spec, but it doesn't matter here,
+  // since we always normalize "Matroska integer" values.)
+
+  pEntry->id = ReadUInt(pReader, cursor, field_len);  // payload
+
+  if (pEntry->id <= 0)
+    return false;
+
+  // The encoded integer must occupy the whole SeekID payload.
+  if (field_len != seek_id_size)
+    return false;
+
+  cursor += seek_id_size;  // consume SeekID payload
+
+  // --- SeekPosition child: the position of the referenced element ---
+
+  const long long seek_pos_elem = ReadID(pReader, cursor, field_len);
+
+  if (seek_pos_elem != mkvmuxer::kMkvSeekPosition)
+    return false;
+
+  if ((cursor + field_len) > entry_stop)
+    return false;
+
+  cursor += field_len;  // consume id
+
+  const long long seek_pos_size = ReadUInt(pReader, cursor, field_len);
+
+  if (seek_pos_size <= 0 || (cursor + field_len) > entry_stop)
+    return false;
+
+  cursor += field_len;  // consume size
+
+  if ((cursor + seek_pos_size) > entry_stop)
+    return false;
+
+  pEntry->pos = UnserializeUInt(pReader, cursor, seek_pos_size);
+
+  if (pEntry->pos < 0)
+    return false;
+
+  cursor += seek_pos_size;  // consume payload
+
+  // Nothing may follow the SeekPosition child.
+  return cursor == entry_stop;
+}
+
+// Records where the Cues payload (start_, size_) and the whole element
+// (element_start, element_size) live.  No cue points exist yet and the parse
+// position starts at the beginning of the payload.
+Cues::Cues(Segment* pSegment, long long start_, long long size_,
+           long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(start_),
+      m_size(size_),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_cue_points(NULL),
+      m_count(0),
+      m_preload_count(0),
+      m_pos(start_) {
+}
+
+// Destroys every cue point, loaded and preloaded alike, then the table that
+// holds them.
+Cues::~Cues() {
+  const long total = m_count + m_preload_count;
+
+  for (long i = 0; i != total; ++i) {
+    CuePoint* const cue = m_cue_points[i];
+    assert(cue);
+
+    delete cue;
+  }
+
+  delete[] m_cue_points;
+}
+
+// Returns the number of loaded cue points, or -1 while the cue point table
+// has not been allocated.
+long Cues::GetCount() const {
+  // TODO: really ignore preload count?
+  return (m_cue_points != NULL) ? m_count : -1;
+}
+
+// True once the parse position has reached the end of the Cues payload.
+bool Cues::DoneParsing() const {
+  return m_pos >= (m_start + m_size);
+}
+
+// Walks the whole Cues payload once and registers a preloaded CuePoint for
+// every CuePoint child found.  Does nothing when the table already exists.
+//
+// Returns false when the counters are inconsistent, when a child element
+// does not fit inside the payload, or when a cue point cannot be registered.
+bool Cues::Init() const {
+  if (m_cue_points)
+    return true;
+
+  if (m_count != 0 || m_preload_count != 0)
+    return false;
+
+  IMkvReader* const reader = m_pSegment->m_pReader;
+
+  const long long payload_stop = m_start + m_size;
+
+  long capacity = 0;  // allocated size of the cue point table
+
+  for (long long cursor = m_start; cursor < payload_stop;) {
+    const long long element_start = cursor;
+
+    long field_len;
+
+    const long long id = ReadID(reader, cursor, field_len);
+    if (id < 0 || (cursor + field_len) > payload_stop)
+      return false;
+
+    cursor += field_len;  // consume ID
+
+    const long long payload_size = ReadUInt(reader, cursor, field_len);
+    if (payload_size < 0 || (cursor + field_len > payload_stop))
+      return false;
+
+    cursor += field_len;  // consume Size field
+
+    if (cursor + payload_size > payload_stop)
+      return false;
+
+    if (id == mkvmuxer::kMkvCuePoint &&
+        !PreloadCuePoint(capacity, element_start)) {
+      return false;
+    }
+
+    cursor += payload_size;  // skip payload
+  }
+
+  return true;
+}
+
+// Appends a new preloaded CuePoint for the element at file position pos,
+// growing the table (tracked through cue_points_size) when it is full.
+//
+// Returns false when cue points have already been loaded or when an
+// allocation fails.
+bool Cues::PreloadCuePoint(long& cue_points_size, long long pos) const {
+  if (m_count != 0)
+    return false;
+
+  if (m_preload_count >= cue_points_size) {
+    // Table is full: start at 2048 slots, afterwards double.
+    const long grown_size =
+        (cue_points_size <= 0) ? 2048 : 2 * cue_points_size;
+
+    CuePoint** const grown = new (std::nothrow) CuePoint*[grown_size];
+    if (grown == NULL)
+      return false;
+
+    for (long i = 0; i != m_preload_count; ++i)
+      grown[i] = m_cue_points[i];
+
+    delete[] m_cue_points;
+
+    m_cue_points = grown;
+    cue_points_size = grown_size;
+  }
+
+  CuePoint* const cue = new (std::nothrow) CuePoint(m_preload_count, pos);
+  if (cue == NULL)
+    return false;
+
+  m_cue_points[m_preload_count] = cue;
+  ++m_preload_count;
+
+  return true;
+}
+
+// Loads the next CuePoint found at or after the current parse position,
+// skipping other child elements on the way.
+//
+// Returns true when a cue point was loaded.  Returns false when the payload
+// is exhausted, when Init() fails, or when the data is malformed; a failed
+// Init() or CuePoint::Load() also moves the parse position to the end of the
+// payload.
+bool Cues::LoadCuePoint() const {
+  const long long payload_stop = m_start + m_size;
+
+  if (m_pos >= payload_stop)
+    return false;  // nothing else to do
+
+  if (!Init()) {
+    m_pos = payload_stop;
+    return false;
+  }
+
+  IMkvReader* const reader = m_pSegment->m_pReader;
+
+  while (m_pos < payload_stop) {
+    const long long element_start = m_pos;
+
+    long field_len;
+
+    const long long id = ReadID(reader, m_pos, field_len);
+    if (id < 0 || (m_pos + field_len) > payload_stop)
+      return false;
+
+    m_pos += field_len;  // consume ID
+
+    const long long payload_size = ReadUInt(reader, m_pos, field_len);
+    if (payload_size < 0 || (m_pos + field_len) > payload_stop)
+      return false;
+
+    m_pos += field_len;  // consume Size field
+
+    if ((m_pos + payload_size) > payload_stop)
+      return false;
+
+    if (id == mkvmuxer::kMkvCuePoint) {
+      if (m_preload_count < 1)
+        return false;
+
+      // The next cue point to load sits right after the loaded ones.
+      CuePoint* const cue = m_cue_points[m_count];
+      if (!cue)
+        return false;
+
+      // A negative time code must equal the negated position of this
+      // element.
+      if (cue->GetTimeCode() < 0 && (-cue->GetTimeCode() != element_start))
+        return false;
+
+      if (!cue->Load(reader)) {
+        m_pos = payload_stop;
+        return false;
+      }
+
+      ++m_count;
+      --m_preload_count;
+
+      m_pos += payload_size;  // consume payload
+      if (m_pos > payload_stop)
+        return false;
+
+      return true;  // yes, we loaded a cue point
+    }
+
+    // Not a cue point: skip it and keep looking.
+    m_pos += payload_size;  // consume payload
+    if (m_pos > payload_stop)
+      return false;
+  }
+
+  return false;  // no, we did not load a cue point
+}
+
+// Looks up the last loaded cue point whose time is <= time_ns (or the first
+// cue point when time_ns is not later than it) and then the position entry
+// for pTrack within that cue point.
+//
+// On success pCP and pTP designate the cue point and its track position and
+// true is returned.  Returns false when the arguments are invalid, no cue
+// points are loaded, the table is inconsistent, or the chosen cue point has
+// no entry for the track.
+bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP,
+                const CuePoint::TrackPosition*& pTP) const {
+  if (time_ns < 0 || pTrack == NULL || m_cue_points == NULL || m_count == 0)
+    return false;
+
+  pCP = m_cue_points[0];
+  if (pCP == NULL)
+    return false;
+
+  if (time_ns <= pCP->GetTime(m_pSegment)) {
+    pTP = pCP->Find(pTrack);
+    return (pTP != NULL);
+  }
+
+  // Binary search over the loaded cue points.
+  // INVARIANT:
+  //[0, lo)        <= time_ns
+  //[lo, hi)       ?
+  //[hi, m_count)  > time_ns
+  long lo = 0;
+  long hi = m_count;
+
+  while (lo < hi) {
+    const long mid = lo + (hi - lo) / 2;
+    if (mid >= m_count)
+      return false;
+
+    CuePoint* const probe = m_cue_points[mid];
+    if (probe == NULL)
+      return false;
+
+    if (probe->GetTime(m_pSegment) <= time_ns)
+      lo = mid + 1;
+    else
+      hi = mid;
+
+    if (lo > hi)
+      return false;
+  }
+
+  if (lo != hi || lo > m_count || lo <= 0)
+    return false;
+
+  // lo is one past the last cue point that is not later than time_ns.
+  pCP = m_cue_points[lo - 1];
+
+  if (pCP == NULL || pCP->GetTime(m_pSegment) > time_ns)
+    return false;
+
+  // TODO: here and elsewhere, it's probably not correct to search
+  // for the cue point with this time, and then search for a matching
+  // track.  In principle, the matching track could be on some earlier
+  // cue point, and with our current algorithm, we'd miss it.  To make
+  // this bullet-proof, we'd need to create a secondary structure,
+  // with a list of cue points that apply to a track, and then search
+  // that track-based structure for a matching cue point.
+
+  pTP = pCP->Find(pTrack);
+  return (pTP != NULL);
+}
+
+// Returns the first loaded cue point, or NULL when there is none or its time
+// code is negative.
+const CuePoint* Cues::GetFirst() const {
+  if (m_cue_points == NULL || m_count == 0)
+    return NULL;
+
+  CuePoint* const first = m_cue_points[0];
+
+  if (first == NULL)
+    return NULL;
+
+  if (first->GetTimeCode() < 0)
+    return NULL;
+
+  return first;
+}
+
+// Returns the last loaded cue point, or NULL when there is none or its time
+// code is negative.
+const CuePoint* Cues::GetLast() const {
+  if (m_cue_points == NULL || m_count <= 0)
+    return NULL;
+
+  CuePoint* const last = m_cue_points[m_count - 1];
+
+  if (last == NULL)
+    return NULL;
+
+  if (last->GetTimeCode() < 0)
+    return NULL;
+
+  return last;
+}
+
+// Returns the loaded cue point that follows pCurr in the table, or NULL when
+// pCurr is not a loaded entry of this table, is the last loaded one, or the
+// successor has a negative time code.
+const CuePoint* Cues::GetNext(const CuePoint* pCurr) const {
+  if (pCurr == NULL || pCurr->GetTimeCode() < 0)
+    return NULL;
+
+  if (m_cue_points == NULL || m_count < 1)
+    return NULL;
+
+  const long curr_index = pCurr->m_index;
+  if (curr_index >= m_count)
+    return NULL;
+
+  // pCurr must be the entry actually stored at its own index.
+  if (m_cue_points[curr_index] != pCurr)
+    return NULL;
+
+  const long next_index = curr_index + 1;
+  if (next_index >= m_count)
+    return NULL;
+
+  CuePoint* const next = m_cue_points[next_index];
+
+  if (next == NULL)
+    return NULL;
+
+  if (next->GetTimeCode() < 0)
+    return NULL;
+
+  return next;
+}
+
+// Forwards to Segment::GetBlock() for the given cue point and track
+// position; returns NULL when either pointer is NULL.
+const BlockEntry* Cues::GetBlock(const CuePoint* pCP,
+                                 const CuePoint::TrackPosition* pTP) const {
+  const bool have_both = (pCP != NULL) && (pTP != NULL);
+
+  if (!have_both)
+    return NULL;
+
+  return m_pSegment->GetBlock(*pCP, *pTP);
+}
+
+// Returns the block entry that cue point cp / track position tp refers to.
+//
+// The cluster table (loaded plus preloaded clusters) is binary-searched for
+// a cluster at position tp.m_pos.  When none is found, a new cluster is
+// created and inserted as a preloaded cluster at the position where the
+// search ended.  Returns NULL when that cluster cannot be created or
+// inserted.
+const BlockEntry* Segment::GetBlock(const CuePoint& cp,
+                                    const CuePoint::TrackPosition& tp) {
+  const long total = m_clusterCount + m_clusterPreloadCount;
+
+  // INVARIANT:
+  //[0, lo)       < tp.m_pos
+  //[lo, hi)      ?
+  //[hi, total)   > tp.m_pos
+  long lo = 0;
+  long hi = total;
+
+  while (lo < hi) {
+    const long mid = lo + (hi - lo) / 2;
+    assert(mid < total);
+
+    Cluster* const candidate = m_clusters[mid];
+    assert(candidate);
+
+    const long long candidate_pos = candidate->GetPosition();
+    assert(candidate_pos >= 0);
+
+    if (candidate_pos == tp.m_pos)
+      return candidate->GetEntry(cp, tp);
+
+    if (candidate_pos < tp.m_pos)
+      lo = mid + 1;
+    else
+      hi = mid;
+  }
+
+  assert(lo == hi);
+
+  // No cluster at that position yet: create one and preload it at the slot
+  // where the search ended.
+  Cluster* const pCluster = Cluster::Create(this, -1, tp.m_pos);
+  if (pCluster == NULL)
+    return NULL;
+
+  const ptrdiff_t idx = lo;
+
+  if (!PreloadCluster(pCluster, idx)) {
+    delete pCluster;
+    return NULL;
+  }
+
+  assert(m_clusters);
+  assert(m_clusterPreloadCount > 0);
+  assert(m_clusters[idx] == pCluster);
+
+  return pCluster->GetEntry(cp, tp);
+}
+
+// Returns the cluster whose GetPosition() equals |requested_pos|.
+//
+// The cluster array (loaded plus preloaded entries) is binary-searched. If no
+// match exists, a new cluster is created with index -1 and inserted via
+// PreloadCluster() at the position where the search ended. Returns NULL for a
+// negative |requested_pos| or if the cluster cannot be created or inserted.
+const Cluster* Segment::FindOrPreloadCluster(long long requested_pos) {
+  if (requested_pos < 0)
+    return NULL;
+
+  const long total = m_clusterCount + m_clusterPreloadCount;
+
+  // Search window is [lo, hi): entries before lo are below requested_pos,
+  // entries from hi onward are above it.
+  long lo = 0;
+  long hi = total;
+
+  while (lo < hi) {
+    const long mid = lo + (hi - lo) / 2;
+    assert(mid < total);
+
+    Cluster* const candidate = m_clusters[mid];
+    assert(candidate);
+
+    const long long candidate_pos = candidate->GetPosition();
+    assert(candidate_pos >= 0);
+
+    if (candidate_pos < requested_pos) {
+      lo = mid + 1;
+    } else if (candidate_pos > requested_pos) {
+      hi = mid;
+    } else {
+      return candidate;
+    }
+  }
+
+  assert(lo == hi);
+
+  Cluster* const created = Cluster::Create(this, -1, requested_pos);
+  if (created == NULL)
+    return NULL;
+
+  const ptrdiff_t idx = lo;
+
+  if (!PreloadCluster(created, idx)) {
+    delete created;
+    return NULL;
+  }
+  assert(m_clusters);
+  assert(m_clusterPreloadCount > 0);
+  assert(m_clusters[idx] == created);
+
+  return created;
+}
+
+// Creates a cue point that has not been parsed yet. |pos| must be positive;
+// it is stored negated in m_timecode, and CuePoint::Load() negates it again
+// to find the element's position before replacing it with the real timecode.
+CuePoint::CuePoint(long idx, long long pos)
+    : m_element_start(0),
+      m_element_size(0),
+      m_index(idx),
+      m_timecode(-pos),
+      m_track_positions(NULL),
+      m_track_positions_count(0) {
+  assert(pos > 0);
+}
+
+// Releases the track position array allocated by Load(), if any.
+CuePoint::~CuePoint() {
+  delete[] m_track_positions;
+}
+
+// Parses this cue point from |pReader| the first time it is needed.
+//
+// Until loaded, m_timecode holds the negated position of the CuePoint
+// element. Parsing takes two passes over the payload: the first reads the cue
+// time and counts the track-position children, the second parses each
+// track position into a freshly allocated array.
+//
+// Returns true if the cue point was already loaded or was loaded
+// successfully. Returns false on a read error, malformed data, a missing cue
+// time, no track positions, or allocation failure.
+//
+// All results are gathered in locals and committed to the members only on
+// success. A failed load therefore leaves the object untouched and still
+// marked as not loaded (negative m_timecode), rather than exposing a
+// non-negative timecode together with a NULL or partial m_track_positions.
+bool CuePoint::Load(IMkvReader* pReader) {
+  if (m_timecode >= 0)  // already loaded
+    return true;
+
+  assert(m_track_positions == NULL);
+  assert(m_track_positions_count == 0);
+
+  long long pos_ = -m_timecode;
+  const long long element_start = pos_;
+
+  long long stop;
+
+  {
+    long len;
+
+    const long long id = ReadID(pReader, pos_, len);
+    if (id != mkvmuxer::kMkvCuePoint)
+      return false;
+
+    pos_ += len;  // consume ID
+
+    const long long size = ReadUInt(pReader, pos_, len);
+    if (size < 0)  // read error; must also be caught in release builds
+      return false;
+
+    pos_ += len;  // consume Size field
+    // pos_ now points to start of payload
+
+    stop = pos_ + size;
+  }
+
+  const long long element_size = stop - element_start;
+
+  long long pos = pos_;
+
+  // First pass: read the cue time and count the track positions.
+
+  long long timecode = -1;
+  size_t track_count = 0;
+
+  while (pos < stop) {
+    long len;
+
+    const long long id = ReadID(pReader, pos, len);
+    if ((id < 0) || (pos + len > stop)) {
+      return false;
+    }
+
+    pos += len;  // consume ID
+
+    const long long size = ReadUInt(pReader, pos, len);
+    if ((size < 0) || (pos + len > stop)) {
+      return false;
+    }
+
+    pos += len;  // consume Size field
+    if ((pos + size) > stop) {
+      return false;
+    }
+
+    if (id == mkvmuxer::kMkvCueTime)
+      timecode = UnserializeUInt(pReader, pos, size);
+
+    else if (id == mkvmuxer::kMkvCueTrackPositions)
+      ++track_count;
+
+    pos += size;  // consume payload
+  }
+
+  if (timecode < 0 || track_count == 0) {
+    return false;
+  }
+
+  TrackPosition* const track_positions =
+      new (std::nothrow) TrackPosition[track_count];
+  if (track_positions == NULL)
+    return false;
+
+  // Second pass: parse the track positions.
+
+  TrackPosition* p = track_positions;
+  pos = pos_;
+
+  bool ok = true;
+
+  while (pos < stop) {
+    long len;
+
+    const long long id = ReadID(pReader, pos, len);
+    if (id < 0 || (pos + len) > stop) {
+      ok = false;
+      break;
+    }
+
+    pos += len;  // consume ID
+
+    const long long size = ReadUInt(pReader, pos, len);
+    if (size < 0 || (pos + len) > stop) {
+      ok = false;
+      break;
+    }
+
+    pos += len;  // consume Size field
+    if ((pos + size) > stop) {
+      ok = false;
+      break;
+    }
+
+    if (id == mkvmuxer::kMkvCueTrackPositions) {
+      // Never write past the array, even if the reader returns different
+      // data than it did during the counting pass.
+      if (size_t(p - track_positions) >= track_count ||
+          !p->Parse(pReader, pos, size)) {
+        ok = false;
+        break;
+      }
+      ++p;
+    }
+
+    pos += size;  // consume payload
+  }
+
+  if (!ok || size_t(p - track_positions) != track_count) {
+    delete[] track_positions;
+    return false;
+  }
+
+  // Everything parsed: publish the results.
+  m_timecode = timecode;
+  m_track_positions = track_positions;
+  m_track_positions_count = track_count;
+
+  m_element_start = element_start;
+  m_element_size = element_size;
+
+  return true;
+}
+
+// Parses the children of a track-position element whose payload occupies
+// [start_, start_ + size_).
+//
+// Fills m_track, m_pos and m_block from the CueTrack, CueClusterPosition and
+// CueBlockNumber children; other children are skipped. m_block defaults to 1
+// when absent. Returns false if a header cannot be read, if a child extends
+// past the payload, or if no valid track (> 0) and position (>= 0) were found.
+bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
+                                    long long size_) {
+  m_track = -1;
+  m_pos = -1;
+  m_block = 1;  // default
+
+  const long long stop = start_ + size_;
+
+  for (long long pos = start_; pos < stop;) {
+    long len;
+
+    const long long id = ReadID(pReader, pos, len);
+    if (id < 0 || pos + len > stop)
+      return false;
+
+    pos += len;  // skip the ID
+
+    const long long payload_size = ReadUInt(pReader, pos, len);
+    if (payload_size < 0 || pos + len > stop)
+      return false;
+
+    pos += len;  // skip the size field
+
+    if (pos + payload_size > stop)
+      return false;
+
+    if (id == mkvmuxer::kMkvCueTrack) {
+      m_track = UnserializeUInt(pReader, pos, payload_size);
+    } else if (id == mkvmuxer::kMkvCueClusterPosition) {
+      m_pos = UnserializeUInt(pReader, pos, payload_size);
+    } else if (id == mkvmuxer::kMkvCueBlockNumber) {
+      m_block = UnserializeUInt(pReader, pos, payload_size);
+    }
+
+    pos += payload_size;  // skip the payload
+  }
+
+  return (m_pos >= 0) && (m_track > 0);
+}
+
+// Returns the first track position whose m_track equals the number of
+// |pTrack|, or NULL if this cue point has none for that track.
+const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const {
+  assert(pTrack);
+
+  const long long wanted = pTrack->GetNumber();
+
+  for (size_t idx = 0; idx < m_track_positions_count; ++idx) {
+    const TrackPosition& candidate = m_track_positions[idx];
+
+    if (candidate.m_track == wanted)
+      return &candidate;
+  }
+
+  return NULL;  // no track position for this track
+}
+
+// Returns the raw m_timecode value. It is negative until Load() has parsed
+// this cue point.
+long long CuePoint::GetTimeCode() const {
+  return m_timecode;
+}
+
+// Returns this cue point's timecode multiplied by the timecode scale reported
+// by |pSegment|'s SegmentInfo. The cue point must already have a
+// non-negative timecode.
+long long CuePoint::GetTime(const Segment* pSegment) const {
+  assert(pSegment);
+  assert(m_timecode >= 0);
+
+  const SegmentInfo* const info = pSegment->GetInfo();
+  assert(info);
+
+  const long long timecode_scale = info->GetTimeCodeScale();
+  assert(timecode_scale >= 1);
+
+  return timecode_scale * m_timecode;
+}
+
+// Reports whether the parse position has reached the end of the segment.
+//
+// With a known segment size the end is m_start + m_size. Otherwise the
+// reader's total length is used: a reader error counts as done, and a
+// negative total (treated as a live stream) counts as not done.
+bool Segment::DoneParsing() const {
+  if (m_size >= 0) {
+    const long long segment_stop = m_start + m_size;
+    return (m_pos >= segment_stop);
+  }
+
+  long long total, avail;
+
+  const int status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // reader error: must assume done
+    return true;
+
+  if (total < 0)  // unknown length: assume live stream
+    return false;
+
+  return (m_pos >= total);
+}
+
+// Returns the first loaded cluster, or the end-of-stream cluster (&m_eos)
+// when no clusters have been loaded.
+const Cluster* Segment::GetFirst() const {
+  const bool have_clusters = (m_clusters != NULL) && (m_clusterCount > 0);
+
+  if (!have_clusters)
+    return &m_eos;
+
+  Cluster* const first = m_clusters[0];
+  assert(first);
+
+  return first;
+}
+
+// Returns the last loaded cluster (index m_clusterCount - 1), or the
+// end-of-stream cluster (&m_eos) when no clusters have been loaded.
+const Cluster* Segment::GetLast() const {
+  const bool have_clusters = (m_clusters != NULL) && (m_clusterCount > 0);
+
+  if (!have_clusters)
+    return &m_eos;
+
+  const long last_index = m_clusterCount - 1;
+
+  Cluster* const last = m_clusters[last_index];
+  assert(last);
+
+  return last;
+}
+
+// Returns m_clusterCount, the number of loaded clusters.
+unsigned long Segment::GetCount() const {
+  return m_clusterCount;
+}
+
+// Returns the cluster that follows |pCurr|.
+//
+// If |pCurr| is a loaded cluster (m_index >= 0), the next loaded cluster is
+// returned, or &m_eos when |pCurr| is the last loaded one.
+//
+// If |pCurr| is only preloaded (m_index < 0), the stream is scanned from the
+// end of |pCurr|'s element for the next Cluster element that
+// Cluster::HasBlockEntries() reports as non-empty. A preloaded cluster object
+// already at that offset is returned if one exists; otherwise a new one is
+// created and inserted with PreloadCluster().
+//
+// Returns NULL (0) on a read error, when |pCurr|'s element is not a Cluster,
+// when no further cluster with block entries is found, or when the new
+// cluster cannot be created or inserted. Many stream-validity conditions in
+// this function are checked with assert only (marked TODO).
+const Cluster* Segment::GetNext(const Cluster* pCurr) {
+  assert(pCurr);
+  assert(pCurr != &m_eos);
+  assert(m_clusters);
+
+  long idx = pCurr->m_index;
+
+  // Loaded cluster: the successor is simply the next array entry.
+  if (idx >= 0) {
+    assert(m_clusterCount > 0);
+    assert(idx < m_clusterCount);
+    assert(pCurr == m_clusters[idx]);
+
+    ++idx;
+
+    if (idx >= m_clusterCount)
+      return &m_eos;  // caller will LoadCluster as desired
+
+    Cluster* const pNext = m_clusters[idx];
+    assert(pNext);
+    assert(pNext->m_index >= 0);
+    assert(pNext->m_index == idx);
+
+    return pNext;
+  }
+
+  // Preloaded cluster: walk the stream to find what comes after it.
+  assert(m_clusterPreloadCount > 0);
+
+  long long pos = pCurr->m_element_start;
+
+  assert(m_size >= 0);  // TODO
+  const long long stop = m_start + m_size;  // end of segment
+
+  // Skip over the current cluster element (ID, size field and payload).
+  {
+    long len;
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+    assert(result == 0);
+    assert((pos + len) <= stop);  // TODO
+    if (result != 0)
+      return NULL;
+
+    const long long id = ReadID(m_pReader, pos, len);
+    if (id != mkvmuxer::kMkvCluster)
+      return NULL;
+
+    pos += len;  // consume ID
+
+    // Read Size
+    result = GetUIntLength(m_pReader, pos, len);
+    assert(result == 0);  // TODO
+    assert((pos + len) <= stop);  // TODO
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+    assert(size > 0);  // TODO
+    // assert((pCurr->m_size <= 0) || (pCurr->m_size == size));
+
+    pos += len;  // consume length of size of element
+    assert((pos + size) <= stop);  // TODO
+
+    // Pos now points to start of payload
+
+    pos += size;  // consume payload
+  }
+
+  // Segment-relative offset of the next cluster that has block entries;
+  // stays 0 if none is found.
+  long long off_next = 0;
+
+  while (pos < stop) {
+    long len;
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+    assert(result == 0);
+    assert((pos + len) <= stop);  // TODO
+    if (result != 0)
+      return NULL;
+
+    const long long idpos = pos;  // pos of next (potential) cluster
+
+    const long long id = ReadID(m_pReader, idpos, len);
+    if (id < 0)
+      return NULL;
+
+    pos += len;  // consume ID
+
+    // Read Size
+    result = GetUIntLength(m_pReader, pos, len);
+    assert(result == 0);  // TODO
+    assert((pos + len) <= stop);  // TODO
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+    assert(size >= 0);  // TODO
+
+    pos += len;  // consume length of size of element
+    assert((pos + size) <= stop);  // TODO
+
+    // Pos now points to start of payload
+
+    if (size == 0)  // weird
+      continue;
+
+    if (id == mkvmuxer::kMkvCluster) {
+      const long long off_next_ = idpos - m_start;
+
+      long long pos_;
+      long len_;
+
+      const long status = Cluster::HasBlockEntries(this, off_next_, pos_, len_);
+
+      assert(status >= 0);
+
+      // Only a cluster that actually contains block entries qualifies.
+      if (status > 0) {
+        off_next = off_next_;
+        break;
+      }
+    }
+
+    pos += size;  // consume payload
+  }
+
+  if (off_next <= 0)
+    return 0;
+
+  // Binary-search the preloaded part of the cluster array for off_next.
+  Cluster** const ii = m_clusters + m_clusterCount;
+  Cluster** i = ii;
+
+  Cluster** const jj = ii + m_clusterPreloadCount;
+  Cluster** j = jj;
+
+  while (i < j) {
+    // INVARIANT:
+    //[0, i) < pos_next
+    //[i, j) ?
+    //[j, jj)  > pos_next
+
+    Cluster** const k = i + (j - i) / 2;
+    assert(k < jj);
+
+    Cluster* const pNext = *k;
+    assert(pNext);
+    assert(pNext->m_index < 0);
+
+    // const long long pos_ = pNext->m_pos;
+    // assert(pos_);
+    // pos = pos_ * ((pos_ < 0) ? -1 : 1);
+
+    pos = pNext->GetPosition();
+
+    if (pos < off_next)
+      i = k + 1;
+    else if (pos > off_next)
+      j = k;
+    else
+      return pNext;
+  }
+
+  assert(i == j);
+
+  // Not preloaded yet: create the cluster and insert it where the search
+  // ended.
+  Cluster* const pNext = Cluster::Create(this, -1, off_next);
+  if (pNext == NULL)
+    return NULL;
+
+  const ptrdiff_t idx_next = i - m_clusters;  // insertion position
+
+  if (!PreloadCluster(pNext, idx_next)) {
+    delete pNext;
+    return NULL;
+  }
+  assert(m_clusters);
+  assert(idx_next < m_clusterSize);
+  assert(m_clusters[idx_next] == pNext);
+
+  return pNext;
+}
+
+// Finds the cluster that follows |pCurr|, parsing more of the stream if
+// necessary.
+//
+// Return values:
+//   0   success; |pResult| points at the next cluster.
+//   1   there are no more clusters; |pResult| is left as 0.
+//   <0  error or underflow. For E_BUFFER_NOT_FULL, |pos| and |len| describe
+//       the read that could not be satisfied.
+//
+// If |pCurr| is a loaded cluster, the next loaded cluster is returned, or
+// LoadCluster() is called when |pCurr| is the last loaded one. If |pCurr| is
+// only preloaded, parsing resumes just past |pCurr|'s element and
+// DoParseNext() is called until it returns a status <= 1.
+long Segment::ParseNext(const Cluster* pCurr, const Cluster*& pResult,
+                        long long& pos, long& len) {
+  assert(pCurr);
+  assert(!pCurr->EOS());
+  assert(m_clusters);
+
+  pResult = 0;
+
+  if (pCurr->m_index >= 0) {  // loaded (not merely preloaded)
+    assert(m_clusters[pCurr->m_index] == pCurr);
+
+    const long next_idx = pCurr->m_index + 1;
+
+    if (next_idx < m_clusterCount) {
+      pResult = m_clusters[next_idx];
+      return 0;  // success
+    }
+
+    // curr cluster is last among loaded
+
+    const long result = LoadCluster(pos, len);
+
+    if (result < 0)  // error or underflow
+      return result;
+
+    if (result > 0)  // no more clusters
+    {
+      // pResult = &m_eos;
+      return 1;
+    }
+
+    pResult = GetLast();
+    return 0;  // success
+  }
+
+  assert(m_pos > 0);
+
+  long long total, avail;
+
+  long status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  // -1 means the segment size is unknown.
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  // interrogate curr cluster
+
+  pos = pCurr->m_element_start;
+
+  if (pCurr->m_element_size >= 0)
+    pos += pCurr->m_element_size;
+  else {
+    // The element size has not been recorded: re-read the cluster header to
+    // find where the element ends.
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    // Element IDs must be read with ReadID: the mkvmuxer::kMkv* constants are
+    // compared against ReadID results everywhere else in this file, whereas
+    // ReadUInt strips the length-marker bits and so could never produce a
+    // value equal to kMkvCluster.
+    const long long id = ReadID(m_pReader, pos, len);
+
+    if (id != mkvmuxer::kMkvCluster)
+      return -1;
+
+    pos += len;  // consume ID
+
+    // Read Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume size field
+
+    // All value bits set in a size field of this length means "unknown size".
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size == unknown_size)  // TODO: should never happen
+      return E_FILE_FORMAT_INVALID;  // TODO: resolve this
+
+    // assert((pCurr->m_size <= 0) || (pCurr->m_size == size));
+
+    if ((segment_stop >= 0) && ((pos + size) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // Pos now points to start of payload
+
+    pos += size;  // consume payload (that is, the current cluster)
+    if (segment_stop >= 0 && pos > segment_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    // By consuming the payload, we are assuming that the curr
+    // cluster isn't interesting.  That is, we don't bother checking
+    // whether the payload of the curr cluster is less than what
+    // happens to be available (obtained via IMkvReader::Length).
+    // Presumably the caller has already dispensed with the current
+    // cluster, and really does want the next cluster.
+  }
+
+  // pos now points to just beyond the last fully-loaded cluster
+
+  // DoParseNext returns 2 when it skipped a cluster without block entries and
+  // should be called again; any status <= 1 is final.
+  for (;;) {
+    const long status = DoParseNext(pResult, pos, len);
+
+    if (status <= 1)
+      return status;
+  }
+}
+
+// Parses forward from |pos| looking for the next Cluster element.
+//
+// Return values:
+//   0   success; |pResult| points at a (pre)loaded cluster.
+//   1   end of file or end of segment reached.
+//   2   the cluster found has no block entries and was skipped; |pos| has
+//       been advanced past it and the caller should call again.
+//   <0  error or underflow. For E_BUFFER_NOT_FULL, |pos| and |len| describe
+//       the read that could not be satisfied.
+//
+// A Cues element met on the way is recorded in m_pCues if none is set yet;
+// other non-cluster elements are skipped.
+long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
+  long long total, avail;
+
+  long status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  // -1 means the segment size is unknown.
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  // Parse next cluster.  This is strictly a parsing activity.
+  // Creation of a new cluster object happens later, after the
+  // parsing is done.
+
+  long long off_next = 0;
+  long long cluster_size = -1;
+
+  for (;;) {
+    if ((total >= 0) && (pos >= total))
+      return 1;  // EOF
+
+    if ((segment_stop >= 0) && (pos >= segment_stop))
+      return 1;  // EOF
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long idpos = pos;  // absolute
+    const long long idoff = pos - m_start;  // relative
+
+    const long long id = ReadID(m_pReader, idpos, len);  // absolute
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    if (id == 0)  // weird
+      return -1;  // generic error
+
+    pos += len;  // consume ID
+
+    // Read Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume length of size of element
+
+    // Pos now points to start of payload
+
+    if (size == 0)  // weird
+      continue;
+
+    // All value bits set in a size field of this length means "unknown size".
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if ((segment_stop >= 0) && (size != unknown_size) &&
+        ((pos + size) > segment_stop)) {
+      return E_FILE_FORMAT_INVALID;
+    }
+
+    if (id == mkvmuxer::kMkvCues) {
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;
+
+      const long long element_stop = pos + size;
+
+      if ((segment_stop >= 0) && (element_stop > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      const long long element_start = idpos;
+      const long long element_size = element_stop - element_start;
+
+      if (m_pCues == NULL) {
+        m_pCues = new (std::nothrow)
+            Cues(this, pos, size, element_start, element_size);
+        // Allocation failure is an error. Returning false (0) here would be
+        // read by the caller as success with pResult still NULL.
+        if (m_pCues == NULL)
+          return -1;
+      }
+
+      pos += size;  // consume payload
+      if (segment_stop >= 0 && pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
+
+      continue;
+    }
+
+    if (id != mkvmuxer::kMkvCluster) {  // not a Cluster ID
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;
+
+      pos += size;  // consume payload
+      if (segment_stop >= 0 && pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
+
+      continue;
+    }
+
+    // We have a cluster.
+    off_next = idoff;
+
+    if (size != unknown_size)
+      cluster_size = size;
+
+    break;
+  }
+
+  assert(off_next > 0);  // have cluster
+
+  // We have parsed the next cluster.
+  // We have not created a cluster object yet.  What we need
+  // to do now is determine whether it has already been preloaded
+  // (in which case, an object for this cluster has already been
+  // created), and if not, create a new cluster object.
+
+  Cluster** const ii = m_clusters + m_clusterCount;
+  Cluster** i = ii;
+
+  Cluster** const jj = ii + m_clusterPreloadCount;
+  Cluster** j = jj;
+
+  while (i < j) {
+    // INVARIANT:
+    //[0, i) < pos_next
+    //[i, j) ?
+    //[j, jj)  > pos_next
+
+    Cluster** const k = i + (j - i) / 2;
+    assert(k < jj);
+
+    const Cluster* const pNext = *k;
+    assert(pNext);
+    assert(pNext->m_index < 0);
+
+    // NOTE(review): this overwrites the in/out parameter |pos| with a
+    // cluster position. If the loop runs and finds no match, the code below
+    // that uses |pos| as the payload position sees this value instead --
+    // confirm whether that is intended.
+    pos = pNext->GetPosition();
+    assert(pos >= 0);
+
+    if (pos < off_next)
+      i = k + 1;
+    else if (pos > off_next)
+      j = k;
+    else {
+      pResult = pNext;
+      return 0;  // success
+    }
+  }
+
+  assert(i == j);
+
+  long long pos_;
+  long len_;
+
+  status = Cluster::HasBlockEntries(this, off_next, pos_, len_);
+
+  if (status < 0) {  // error or underflow
+    pos = pos_;
+    len = len_;
+
+    return status;
+  }
+
+  if (status > 0) {  // means "found at least one block entry"
+    Cluster* const pNext = Cluster::Create(this,
+                                           -1,  // preloaded
+                                           off_next);
+    if (pNext == NULL)
+      return -1;
+
+    const ptrdiff_t idx_next = i - m_clusters;  // insertion position
+
+    if (!PreloadCluster(pNext, idx_next)) {
+      delete pNext;
+      return -1;
+    }
+    assert(m_clusters);
+    assert(idx_next < m_clusterSize);
+    assert(m_clusters[idx_next] == pNext);
+
+    pResult = pNext;
+    return 0;  // success
+  }
+
+  // status == 0 means "no block entries found"
+
+  if (cluster_size < 0) {  // unknown size
+    const long long payload_pos = pos;  // absolute pos of cluster payload
+
+    for (;;) {  // determine cluster size
+      if ((total >= 0) && (pos >= total))
+        break;
+
+      if ((segment_stop >= 0) && (pos >= segment_stop))
+        break;  // no more clusters
+
+      // Read ID
+
+      if ((pos + 1) > avail) {
+        len = 1;
+        return E_BUFFER_NOT_FULL;
+      }
+
+      long long result = GetUIntLength(m_pReader, pos, len);
+
+      if (result < 0)  // error
+        return static_cast<long>(result);
+
+      if (result > 0)  // weird
+        return E_BUFFER_NOT_FULL;
+
+      if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      if ((pos + len) > avail)
+        return E_BUFFER_NOT_FULL;
+
+      const long long idpos = pos;
+      const long long id = ReadID(m_pReader, idpos, len);
+
+      if (id < 0)  // error (or underflow)
+        return static_cast<long>(id);
+
+      // This is the distinguished set of ID's we use to determine
+      // that we have exhausted the sub-elements inside the cluster
+      // whose ID we parsed earlier.
+
+      if (id == mkvmuxer::kMkvCluster || id == mkvmuxer::kMkvCues)
+        break;
+
+      pos += len;  // consume ID (of sub-element)
+
+      // Read Size
+
+      if ((pos + 1) > avail) {
+        len = 1;
+        return E_BUFFER_NOT_FULL;
+      }
+
+      result = GetUIntLength(m_pReader, pos, len);
+
+      if (result < 0)  // error
+        return static_cast<long>(result);
+
+      if (result > 0)  // weird
+        return E_BUFFER_NOT_FULL;
+
+      if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      if ((pos + len) > avail)
+        return E_BUFFER_NOT_FULL;
+
+      const long long size = ReadUInt(m_pReader, pos, len);
+
+      if (size < 0)  // error
+        return static_cast<long>(size);
+
+      pos += len;  // consume size field of element
+
+      // pos now points to start of sub-element's payload
+
+      if (size == 0)  // weird
+        continue;
+
+      const long long unknown_size = (1LL << (7 * len)) - 1;
+
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;  // not allowed for sub-elements
+
+      if ((segment_stop >= 0) && ((pos + size) > segment_stop))  // weird
+        return E_FILE_FORMAT_INVALID;
+
+      pos += size;  // consume payload of sub-element
+      if (segment_stop >= 0 && pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
+    }  // determine cluster size
+
+    cluster_size = pos - payload_pos;
+    assert(cluster_size >= 0);  // TODO: handle cluster_size = 0
+
+    pos = payload_pos;  // reset and re-parse original cluster
+  }
+
+  pos += cluster_size;  // consume payload
+  if (segment_stop >= 0 && pos > segment_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 2;  // try to find a cluster that follows next
+}
+
+// Returns the last loaded cluster whose GetTime() is <= |time_ns|.
+//
+// Returns &m_eos when no clusters are loaded, and the first cluster when
+// |time_ns| is at or before the first cluster's time.
+const Cluster* Segment::FindCluster(long long time_ns) const {
+  if ((m_clusters == NULL) || (m_clusterCount <= 0))
+    return &m_eos;
+
+  Cluster* const first = m_clusters[0];
+  assert(first);
+  assert(first->m_index == 0);
+
+  if (time_ns <= first->GetTime())
+    return first;
+
+  // Binary search: on exit, lo is the index of the first cluster whose time
+  // is greater than time_ns (or m_clusterCount if there is none).
+
+  long lo = 0;
+  long hi = m_clusterCount;
+
+  while (lo < hi) {
+    const long mid = lo + (hi - lo) / 2;
+    assert(mid < m_clusterCount);
+
+    Cluster* const candidate = m_clusters[mid];
+    assert(candidate);
+    assert(candidate->m_index == mid);
+
+    const long long candidate_time = candidate->GetTime();
+
+    if (candidate_time <= time_ns)
+      lo = mid + 1;
+    else
+      hi = mid;
+
+    assert(lo <= hi);
+  }
+
+  assert(lo == hi);
+  assert(lo > 0);
+  assert(lo <= m_clusterCount);
+
+  const long found_index = lo - 1;
+
+  Cluster* const found = m_clusters[found_index];
+  assert(found);
+  assert(found->m_index == found_index);
+  assert(found->GetTime() <= time_ns);
+
+  return found;
+}
+
+// Plain accessors for the segment's top-level element objects. Each returns
+// the stored member pointer as-is.
+
+const Tracks* Segment::GetTracks() const {
+  return m_pTracks;
+}
+
+const SegmentInfo* Segment::GetInfo() const {
+  return m_pInfo;
+}
+
+const Cues* Segment::GetCues() const {
+  return m_pCues;
+}
+
+const Chapters* Segment::GetChapters() const {
+  return m_pChapters;
+}
+
+const Tags* Segment::GetTags() const {
+  return m_pTags;
+}
+
+const SeekHead* Segment::GetSeekHead() const {
+  return m_pSeekHead;
+}
+
+// Returns the duration reported by the segment's SegmentInfo, which must be
+// present.
+long long Segment::GetDuration() const {
+  assert(m_pInfo);
+
+  const long long duration = m_pInfo->GetDuration();
+  return duration;
+}
+
+// Records where the Chapters element lives in the stream and starts with an
+// empty editions array. Nothing is read here; Parse() does the reading.
+//
+// |payload_start| / |payload_size| delimit the element's payload (the range
+// Parse() walks); |element_start| / |element_size| are stored as given.
+Chapters::Chapters(Segment* pSegment, long long payload_start,
+                   long long payload_size, long long element_start,
+                   long long element_size)
+    : m_pSegment(pSegment),
+      m_start(payload_start),
+      m_size(payload_size),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_editions(NULL),
+      m_editions_size(0),
+      m_editions_count(0) {}
+
+// Clears every edition in use, last to first, then frees the editions array.
+Chapters::~Chapters() {
+  while (m_editions_count > 0) {
+    --m_editions_count;
+    m_editions[m_editions_count].Clear();
+  }
+
+  delete[] m_editions;
+}
+
+// Walks the Chapters payload and parses each EditionEntry child; other
+// children are skipped.
+//
+// Returns 0 on success, the negative status of the failing step on error, or
+// E_FILE_FORMAT_INVALID if the children do not end exactly at the payload
+// end.
+long Chapters::Parse() {
+  IMkvReader* const reader = m_pSegment->m_pReader;
+
+  const long long stop = m_start + m_size;  // payload stop
+  long long pos = m_start;  // payload start
+
+  while (pos < stop) {
+    long long id, size;
+
+    const long header_status = ParseElementHeader(reader, pos, stop, id, size);
+    if (header_status < 0)  // error
+      return header_status;
+
+    if (size == 0)  // empty payload, nothing to parse or skip
+      continue;
+
+    if (id == mkvmuxer::kMkvEditionEntry) {
+      const long edition_status = ParseEdition(pos, size);
+      if (edition_status < 0)  // error
+        return edition_status;
+    }
+
+    pos += size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+// Returns the number of editions in use.
+int Chapters::GetEditionCount() const {
+  return m_editions_count;
+}
+
+// Returns the edition at |idx|, or NULL if |idx| is outside
+// [0, m_editions_count).
+const Chapters::Edition* Chapters::GetEdition(int idx) const {
+  const bool in_range = (idx >= 0) && (idx < m_editions_count);
+
+  if (!in_range)
+    return NULL;
+
+  return m_editions + idx;
+}
+
+// Makes sure the editions array has room for one more entry.
+//
+// When the array is full it is replaced by one of double the capacity (or
+// capacity 1 if it was empty). Existing entries are moved with ShallowCopy(),
+// so the pointers they hold are carried over rather than duplicated. Returns
+// false if the new array cannot be allocated.
+bool Chapters::ExpandEditionsArray() {
+  if (m_editions_count < m_editions_size)
+    return true;  // a free slot already exists
+
+  int new_size = 1;
+  if (m_editions_size != 0)
+    new_size = 2 * m_editions_size;
+
+  Edition* const grown = new (std::nothrow) Edition[new_size];
+  if (grown == NULL)
+    return false;
+
+  for (int i = 0; i < m_editions_count; ++i)
+    m_editions[i].ShallowCopy(grown[i]);
+
+  delete[] m_editions;
+
+  m_editions = grown;
+  m_editions_size = new_size;
+
+  return true;
+}
+
+// Appends a new edition and parses it from the payload at [pos, pos + size).
+// Returns -1 if the editions array cannot grow, otherwise the result of
+// Edition::Parse(). The edition stays counted even if its parse fails.
+long Chapters::ParseEdition(long long pos, long long size) {
+  if (!ExpandEditionsArray())
+    return -1;
+
+  Edition& edition = m_editions[m_editions_count];
+  ++m_editions_count;
+
+  edition.Init();
+
+  IMkvReader* const reader = m_pSegment->m_pReader;
+  return edition.Parse(reader, pos, size);
+}
+
+// Intentionally empty: members are set up by Init(), not by the constructor.
+Chapters::Edition::Edition() {}
+
+// Intentionally empty: the atoms array is released by Clear(), not by the
+// destructor.
+Chapters::Edition::~Edition() {}
+
+// Returns the number of atoms in use.
+int Chapters::Edition::GetAtomCount() const {
+  return m_atoms_count;
+}
+
+// Returns the atom at |index|, or NULL if |index| is outside
+// [0, m_atoms_count).
+const Chapters::Atom* Chapters::Edition::GetAtom(int index) const {
+  const bool in_range = (index >= 0) && (index < m_atoms_count);
+
+  if (!in_range)
+    return NULL;
+
+  return m_atoms + index;
+}
+
+// Resets this edition to an empty state: no atoms array, zero capacity, zero
+// count. Nothing is freed here.
+void Chapters::Edition::Init() {
+  m_atoms_count = 0;
+  m_atoms_size = 0;
+  m_atoms = NULL;
+}
+
+// Copies this edition's atoms pointer, capacity and count into |rhs|. The
+// atoms array itself is not duplicated, so both objects refer to the same
+// array afterwards.
+void Chapters::Edition::ShallowCopy(Edition& rhs) const {
+  rhs.m_atoms_count = m_atoms_count;
+  rhs.m_atoms_size = m_atoms_size;
+  rhs.m_atoms = m_atoms;
+}
+
+// Clears every atom in use, last to first, frees the atoms array and leaves
+// the edition empty.
+void Chapters::Edition::Clear() {
+  while (m_atoms_count > 0) {
+    --m_atoms_count;
+    m_atoms[m_atoms_count].Clear();
+  }
+
+  delete[] m_atoms;
+
+  m_atoms = NULL;
+  m_atoms_size = 0;
+}
+
+// Walks an edition payload at [pos, pos + size) and parses each ChapterAtom
+// child; other children are skipped.
+//
+// Returns 0 on success, the negative status of the failing step on error, or
+// E_FILE_FORMAT_INVALID if the children do not end exactly at the payload
+// end.
+long Chapters::Edition::Parse(IMkvReader* pReader, long long pos,
+                              long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id, child_size;
+
+    const long header_status =
+        ParseElementHeader(pReader, pos, stop, id, child_size);
+    if (header_status < 0)  // error
+      return header_status;
+
+    if (child_size == 0)  // empty payload, nothing to parse or skip
+      continue;
+
+    if (id == mkvmuxer::kMkvChapterAtom) {
+      const long atom_status = ParseAtom(pReader, pos, child_size);
+      if (atom_status < 0)  // error
+        return atom_status;
+    }
+
+    pos += child_size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+// Appends a new atom and parses it from the payload at [pos, pos + size).
+// Returns -1 if the atoms array cannot grow, otherwise the result of
+// Atom::Parse(). The atom stays counted even if its parse fails.
+long Chapters::Edition::ParseAtom(IMkvReader* pReader, long long pos,
+                                  long long size) {
+  if (!ExpandAtomsArray())
+    return -1;
+
+  Atom& atom = m_atoms[m_atoms_count];
+  ++m_atoms_count;
+
+  atom.Init();
+
+  return atom.Parse(pReader, pos, size);
+}
+
+// Makes sure the atoms array has room for one more entry.
+//
+// When the array is full it is replaced by one of double the capacity (or
+// capacity 1 if it was empty). Existing entries are moved with ShallowCopy(),
+// so the pointers they hold are carried over rather than duplicated. Returns
+// false if the new array cannot be allocated.
+bool Chapters::Edition::ExpandAtomsArray() {
+  if (m_atoms_count < m_atoms_size)
+    return true;  // a free slot already exists
+
+  int new_size = 1;
+  if (m_atoms_size != 0)
+    new_size = 2 * m_atoms_size;
+
+  Atom* const grown = new (std::nothrow) Atom[new_size];
+  if (grown == NULL)
+    return false;
+
+  for (int i = 0; i < m_atoms_count; ++i)
+    m_atoms[i].ShallowCopy(grown[i]);
+
+  delete[] m_atoms;
+
+  m_atoms = grown;
+  m_atoms_size = new_size;
+
+  return true;
+}
+
+// Intentionally empty: members are set up by Init(), not by the constructor.
+Chapters::Atom::Atom() {}
+
+// Intentionally empty: owned memory is released by Clear(), not by the
+// destructor.
+Chapters::Atom::~Atom() {}
+
+// Returns the stored UID (0 after Init() until one is parsed).
+unsigned long long Chapters::Atom::GetUID() const {
+  return m_uid;
+}
+
+// Returns the stored string UID, or NULL if none was parsed.
+const char* Chapters::Atom::GetStringUID() const {
+  return m_string_uid;
+}
+
+// Returns the raw start timecode (-1 after Init() until one is parsed).
+long long Chapters::Atom::GetStartTimecode() const {
+  return m_start_timecode;
+}
+
+// Returns the raw stop timecode (-1 after Init() until one is parsed).
+long long Chapters::Atom::GetStopTimecode() const {
+  return m_stop_timecode;
+}
+
+// Converts the start timecode with GetTime(), using |pChapters| as context.
+long long Chapters::Atom::GetStartTime(const Chapters* pChapters) const {
+  const long long start_time = GetTime(pChapters, m_start_timecode);
+  return start_time;
+}
+
+// Converts the stop timecode with GetTime(), using |pChapters| as context.
+long long Chapters::Atom::GetStopTime(const Chapters* pChapters) const {
+  const long long stop_time = GetTime(pChapters, m_stop_timecode);
+  return stop_time;
+}
+
+// Returns the number of displays in use.
+int Chapters::Atom::GetDisplayCount() const {
+  return m_displays_count;
+}
+
+// Returns the display at |index|, or NULL if |index| is outside
+// [0, m_displays_count).
+const Chapters::Display* Chapters::Atom::GetDisplay(int index) const {
+  const bool in_range = (index >= 0) && (index < m_displays_count);
+
+  if (!in_range)
+    return NULL;
+
+  return m_displays + index;
+}
+
+// Resets this atom to an empty state: no string UID, UID 0, both timecodes
+// -1 (unset), and no displays. Nothing is freed here.
+void Chapters::Atom::Init() {
+  m_uid = 0;
+  m_string_uid = NULL;
+
+  m_start_timecode = -1;
+  m_stop_timecode = -1;
+
+  m_displays_count = 0;
+  m_displays_size = 0;
+  m_displays = NULL;
+}
+
+// Copies every member of this atom into |rhs|. The string UID and displays
+// array are copied as pointers only, so both objects refer to the same
+// memory afterwards.
+void Chapters::Atom::ShallowCopy(Atom& rhs) const {
+  rhs.m_uid = m_uid;
+  rhs.m_string_uid = m_string_uid;
+
+  rhs.m_start_timecode = m_start_timecode;
+  rhs.m_stop_timecode = m_stop_timecode;
+
+  rhs.m_displays_count = m_displays_count;
+  rhs.m_displays_size = m_displays_size;
+  rhs.m_displays = m_displays;
+}
+
+// Frees the string UID, clears every stored Display (last to first), then
+// frees the display array and resets its bookkeeping.
+void Chapters::Atom::Clear() {
+  delete[] m_string_uid;
+  m_string_uid = NULL;
+
+  // Walk backwards so the count always equals the number still to clear.
+  while (m_displays_count > 0)
+    m_displays[--m_displays_count].Clear();
+
+  delete[] m_displays;
+  m_displays_size = 0;
+  m_displays = NULL;
+}
+
+// Parses the child elements found in the payload range [pos, pos + size).
+// Recognised children fill in the display list, string UID, UID and the
+// start/stop timecodes; any other ID is skipped.  Returns 0 on success, the
+// negative status of whichever helper failed, or E_FILE_FORMAT_INVALID when
+// the children do not end exactly at the end of the payload.
+long Chapters::Atom::Parse(IMkvReader* pReader, long long pos, long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id, payload_size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (status < 0)  // error
+      return status;
+
+    // Empty payload: nothing to read, move on to the next header.
+    if (payload_size == 0)
+      continue;
+
+    if (id == mkvmuxer::kMkvChapterTimeStart ||
+        id == mkvmuxer::kMkvChapterTimeEnd) {
+      // Both timecodes are unsigned integers; a negative result is an error.
+      const long long timecode = UnserializeUInt(pReader, pos, payload_size);
+      if (timecode < 0)
+        return static_cast<long>(timecode);
+
+      if (id == mkvmuxer::kMkvChapterTimeStart)
+        m_start_timecode = timecode;
+      else
+        m_stop_timecode = timecode;
+    } else {
+      if (id == mkvmuxer::kMkvChapterDisplay) {
+        status = ParseDisplay(pReader, pos, payload_size);
+      } else if (id == mkvmuxer::kMkvChapterStringUID) {
+        status = UnserializeString(pReader, pos, payload_size, m_string_uid);
+      } else if (id == mkvmuxer::kMkvChapterUID) {
+        long long uid;
+        status = UnserializeInt(pReader, pos, payload_size, uid);
+
+        if (status >= 0)
+          m_uid = static_cast<unsigned long long>(uid);
+      }
+
+      // For unrecognised IDs |status| still holds the (non-negative) header
+      // status, so this only fires for a failed helper above.
+      if (status < 0)
+        return status;
+    }
+
+    pos += payload_size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+// Converts a raw chapter timecode into scaled time by multiplying it with
+// the segment's timecode scale.
+//
+// Returns -1 when |pChapters|, its segment or the segment info is missing,
+// when the timecode scale is below 1, when |timecode| is negative, or when
+// the product would not fit in a long long.
+long long Chapters::Atom::GetTime(const Chapters* pChapters,
+                                  long long timecode) {
+  if (pChapters == NULL)
+    return -1;
+
+  Segment* const pSegment = pChapters->m_pSegment;
+
+  if (pSegment == NULL)  // weird
+    return -1;
+
+  const SegmentInfo* const pInfo = pSegment->GetInfo();
+
+  if (pInfo == NULL)
+    return -1;
+
+  const long long timecode_scale = pInfo->GetTimeCodeScale();
+
+  if (timecode_scale < 1)  // weird
+    return -1;
+
+  if (timecode < 0)
+    return -1;
+
+  // Both operands are non-negative here and both come from the file, so
+  // reject values whose product would overflow (signed overflow is undefined
+  // behaviour) instead of multiplying blindly.
+  if (timecode > LLONG_MAX / timecode_scale)
+    return -1;
+
+  return timecode_scale * timecode;
+}
+
+// Appends a new Display to this atom and parses it from [pos, pos + size).
+// Returns -1 if the display array cannot grow, otherwise whatever
+// Display::Parse returns.  The new slot stays counted even if parsing fails.
+long Chapters::Atom::ParseDisplay(IMkvReader* pReader, long long pos,
+                                  long long size) {
+  const bool have_room = ExpandDisplaysArray();
+  if (!have_room)
+    return -1;
+
+  Display* const slot = m_displays + m_displays_count;
+  ++m_displays_count;
+
+  slot->Init();
+  return slot->Parse(pReader, pos, size);
+}
+
+// Makes sure there is room for one more Display.  A full array is replaced
+// by one of twice the capacity (or capacity one if it was empty), with the
+// existing entries shallow-copied across.  Returns false on allocation
+// failure, in which case nothing is modified.
+bool Chapters::Atom::ExpandDisplaysArray() {
+  if (m_displays_count < m_displays_size)
+    return true;  // a free slot already exists
+
+  const int capacity = (m_displays_size != 0) ? 2 * m_displays_size : 1;
+
+  Display* const grown = new (std::nothrow) Display[capacity];
+  if (!grown)
+    return false;
+
+  for (int i = 0; i != m_displays_count; ++i) {
+    const Display& old_entry = m_displays[i];
+    old_entry.ShallowCopy(grown[i]);
+  }
+
+  // ~Display frees nothing, so the copied pointers stay valid.
+  delete[] m_displays;
+
+  m_displays_size = capacity;
+  m_displays = grown;
+  return true;
+}
+
+// Construction and destruction are empty; Init() and Clear() manage state.
+Chapters::Display::Display() {}
+
+Chapters::Display::~Display() {}
+
+// Chapter title string, or NULL if none was stored.
+const char* Chapters::Display::GetString() const {
+  return m_string;
+}
+
+// Language string, or NULL if none was stored.
+const char* Chapters::Display::GetLanguage() const {
+  return m_language;
+}
+
+// Country string, or NULL if none was stored.
+const char* Chapters::Display::GetCountry() const {
+  return m_country;
+}
+
+// Resets all three string pointers to NULL without freeing anything.
+void Chapters::Display::Init() {
+  m_country = NULL;
+  m_language = NULL;
+  m_string = NULL;
+}
+
+// Copies the three string pointers into |rhs|; the strings themselves are
+// shared, not duplicated.
+void Chapters::Display::ShallowCopy(Display& rhs) const {
+  rhs.m_country = m_country;
+  rhs.m_language = m_language;
+  rhs.m_string = m_string;
+}
+
+// Frees the three strings and resets their pointers to NULL.
+void Chapters::Display::Clear() {
+  delete[] m_country;
+  delete[] m_language;
+  delete[] m_string;
+
+  m_country = NULL;
+  m_language = NULL;
+  m_string = NULL;
+}
+
+// Parses the child elements in [pos, pos + size), storing the chapter
+// string, language and country when present and skipping everything else.
+// Returns 0 on success, a header-parse error, any non-zero status from
+// UnserializeString, or E_FILE_FORMAT_INVALID when the children do not end
+// exactly at the end of the payload.
+long Chapters::Display::Parse(IMkvReader* pReader, long long pos,
+                              long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id, payload_size;
+
+    const long header_status =
+        ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (header_status < 0)  // error
+      return header_status;
+
+    if (payload_size == 0)  // No payload.
+      continue;
+
+    // Stays 0 for IDs we do not care about.
+    long string_status = 0;
+
+    if (id == mkvmuxer::kMkvChapString)
+      string_status = UnserializeString(pReader, pos, payload_size, m_string);
+    else if (id == mkvmuxer::kMkvChapLanguage)
+      string_status = UnserializeString(pReader, pos, payload_size, m_language);
+    else if (id == mkvmuxer::kMkvChapCountry)
+      string_status = UnserializeString(pReader, pos, payload_size, m_country);
+
+    if (string_status)
+      return string_status;
+
+    pos += payload_size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+// Records where the Tags element and its payload sit in the stream and
+// starts with an empty tag array; nothing is read until Parse() is called.
+Tags::Tags(Segment* pSegment, long long payload_start, long long payload_size,
+           long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(payload_start),
+      m_size(payload_size),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_tags(NULL),
+      m_tags_size(0),
+      m_tags_count(0) {}
+
+// Clears every stored Tag, last to first, then frees the array itself.
+Tags::~Tags() {
+  while (m_tags_count > 0)
+    m_tags[--m_tags_count].Clear();
+
+  delete[] m_tags;
+}
+
+// Walks the payload of the Tags element and parses every Tag child; other
+// children are skipped.  Returns 0 on success, a negative status from a
+// failing helper, or E_FILE_FORMAT_INVALID when the children do not end
+// exactly at the end of the payload.
+long Tags::Parse() {
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  const long long stop = m_start + m_size;  // payload stop
+  long long pos = m_start;  // payload start
+
+  while (pos < stop) {
+    long long id, payload_size;
+
+    const long header_status =
+        ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (header_status < 0)
+      return header_status;
+
+    if (payload_size == 0)  // 0 length tag, read another
+      continue;
+
+    if (id == mkvmuxer::kMkvTag) {
+      const long tag_status = ParseTag(pos, payload_size);
+      if (tag_status < 0)
+        return tag_status;
+    }
+
+    pos += payload_size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+// Number of Tag entries currently stored.
+int Tags::GetTagCount() const {
+  return m_tags_count;
+}
+
+// Returns the Tag at |idx|, or NULL when |idx| is out of range.
+const Tags::Tag* Tags::GetTag(int idx) const {
+  const bool in_range = (idx >= 0) && (idx < m_tags_count);
+  if (!in_range)
+    return NULL;
+
+  return &m_tags[idx];
+}
+
+// Makes sure there is room for one more Tag.  A full array is replaced by
+// one of twice the capacity (or capacity one if it was empty), with the
+// existing entries shallow-copied across.  Returns false on allocation
+// failure.
+bool Tags::ExpandTagsArray() {
+  const bool has_spare_slot = m_tags_count < m_tags_size;
+  if (has_spare_slot)
+    return true;
+
+  int new_capacity = 1;
+  if (m_tags_size != 0)
+    new_capacity = 2 * m_tags_size;
+
+  Tag* const grown = new (std::nothrow) Tag[new_capacity];
+  if (!grown)
+    return false;
+
+  int i = 0;
+  while (i < m_tags_count) {
+    m_tags[i].ShallowCopy(grown[i]);
+    ++i;
+  }
+
+  // ~Tag frees nothing, so the copied pointers stay valid.
+  delete[] m_tags;
+  m_tags = grown;
+  m_tags_size = new_capacity;
+
+  return true;
+}
+
+// Appends a new Tag and parses it from [pos, pos + size).  Returns -1 if
+// the tag array cannot grow, otherwise whatever Tag::Parse returns.  The
+// new slot stays counted even if parsing fails.
+long Tags::ParseTag(long long pos, long long size) {
+  const bool have_room = ExpandTagsArray();
+  if (!have_room)
+    return -1;
+
+  Tag* const slot = m_tags + m_tags_count;
+  ++m_tags_count;
+
+  slot->Init();
+  return slot->Parse(m_pSegment->m_pReader, pos, size);
+}
+
+// Construction and destruction are empty; Init() and Clear() manage state.
+Tags::Tag::Tag() {}
+
+Tags::Tag::~Tag() {}
+
+// Number of SimpleTag entries currently stored.
+int Tags::Tag::GetSimpleTagCount() const {
+  return m_simple_tags_count;
+}
+
+// Returns the SimpleTag at |index|, or NULL when |index| is out of range.
+const Tags::SimpleTag* Tags::Tag::GetSimpleTag(int index) const {
+  const bool in_range = (index >= 0) && (index < m_simple_tags_count);
+  if (!in_range)
+    return NULL;
+
+  return &m_simple_tags[index];
+}
+
+// Starts the tag with an empty SimpleTag array; nothing is freed here.
+void Tags::Tag::Init() {
+  m_simple_tags_count = 0;
+  m_simple_tags_size = 0;
+  m_simple_tags = NULL;
+}
+
+// Copies the array pointer and its bookkeeping into |rhs|; the SimpleTag
+// array itself is shared, not duplicated.
+void Tags::Tag::ShallowCopy(Tag& rhs) const {
+  rhs.m_simple_tags_count = m_simple_tags_count;
+  rhs.m_simple_tags_size = m_simple_tags_size;
+  rhs.m_simple_tags = m_simple_tags;
+}
+
+// Clears every stored SimpleTag (last to first), then frees the array and
+// resets its bookkeeping.
+void Tags::Tag::Clear() {
+  while (m_simple_tags_count > 0)
+    m_simple_tags[--m_simple_tags_count].Clear();
+
+  delete[] m_simple_tags;
+  m_simple_tags_size = 0;
+  m_simple_tags = NULL;
+}
+
+// Parses the child elements in [pos, pos + size), collecting every
+// SimpleTag child and skipping the rest.  Returns 0 on success, a negative
+// status from a failing helper, or E_FILE_FORMAT_INVALID when the children
+// do not end exactly at the end of the payload.
+long Tags::Tag::Parse(IMkvReader* pReader, long long pos, long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id, payload_size;
+
+    const long header_status =
+        ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (header_status < 0)
+      return header_status;
+
+    if (payload_size == 0)  // 0 length tag, read another
+      continue;
+
+    if (id == mkvmuxer::kMkvSimpleTag) {
+      const long simple_status = ParseSimpleTag(pReader, pos, payload_size);
+      if (simple_status < 0)
+        return simple_status;
+    }
+
+    pos += payload_size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+// Appends a new SimpleTag and parses it from [pos, pos + size).  Returns -1
+// if the array cannot grow, otherwise whatever SimpleTag::Parse returns.
+// The new slot stays counted even if parsing fails.
+long Tags::Tag::ParseSimpleTag(IMkvReader* pReader, long long pos,
+                               long long size) {
+  const bool have_room = ExpandSimpleTagsArray();
+  if (!have_room)
+    return -1;
+
+  SimpleTag* const slot = m_simple_tags + m_simple_tags_count;
+  ++m_simple_tags_count;
+
+  slot->Init();
+  return slot->Parse(pReader, pos, size);
+}
+
+// Makes sure there is room for one more SimpleTag.  A full array is
+// replaced by one of twice the capacity (or capacity one if it was empty),
+// with the existing entries shallow-copied across.  Returns false on
+// allocation failure.
+bool Tags::Tag::ExpandSimpleTagsArray() {
+  if (m_simple_tags_count < m_simple_tags_size)
+    return true;  // a free slot already exists
+
+  const int capacity =
+      (m_simple_tags_size != 0) ? 2 * m_simple_tags_size : 1;
+
+  SimpleTag* const grown = new (std::nothrow) SimpleTag[capacity];
+  if (!grown)
+    return false;
+
+  for (int i = 0; i != m_simple_tags_count; ++i) {
+    const SimpleTag& old_entry = m_simple_tags[i];
+    old_entry.ShallowCopy(grown[i]);
+  }
+
+  // ~SimpleTag frees nothing, so the copied pointers stay valid.
+  delete[] m_simple_tags;
+
+  m_simple_tags_size = capacity;
+  m_simple_tags = grown;
+  return true;
+}
+
+// Construction and destruction are empty; Init() and Clear() manage state.
+Tags::SimpleTag::SimpleTag() {}
+
+Tags::SimpleTag::~SimpleTag() {}
+
+// Tag name, or NULL if none was stored.
+const char* Tags::SimpleTag::GetTagName() const {
+  return m_tag_name;
+}
+
+// Tag value string, or NULL if none was stored.
+const char* Tags::SimpleTag::GetTagString() const {
+  return m_tag_string;
+}
+
+// Resets both string pointers to NULL without freeing anything.
+void Tags::SimpleTag::Init() {
+  m_tag_string = NULL;
+  m_tag_name = NULL;
+}
+
+// Copies both string pointers into |rhs|; the strings are shared, not
+// duplicated.
+void Tags::SimpleTag::ShallowCopy(SimpleTag& rhs) const {
+  rhs.m_tag_string = m_tag_string;
+  rhs.m_tag_name = m_tag_name;
+}
+
+// Frees both strings and resets their pointers to NULL.
+void Tags::SimpleTag::Clear() {
+  delete[] m_tag_string;
+  delete[] m_tag_name;
+
+  m_tag_string = NULL;
+  m_tag_name = NULL;
+}
+
+// Parses the child elements in [pos, pos + size), storing the tag name and
+// tag string when present and skipping everything else.  Returns 0 on
+// success, a header-parse error, any non-zero status from UnserializeString,
+// or E_FILE_FORMAT_INVALID when the children do not end exactly at the end
+// of the payload.
+long Tags::SimpleTag::Parse(IMkvReader* pReader, long long pos,
+                            long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id, payload_size;
+
+    const long header_status =
+        ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (header_status < 0)  // error
+      return header_status;
+
+    if (payload_size == 0)  // weird
+      continue;
+
+    // Stays 0 for IDs we do not care about.
+    long string_status = 0;
+
+    if (id == mkvmuxer::kMkvTagName)
+      string_status = UnserializeString(pReader, pos, payload_size, m_tag_name);
+    else if (id == mkvmuxer::kMkvTagString)
+      string_status =
+          UnserializeString(pReader, pos, payload_size, m_tag_string);
+
+    if (string_status)
+      return string_status;
+
+    pos += payload_size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+// Records where the element and its payload sit in the stream and starts
+// with no strings; the payload is not read until Parse() is called.
+SegmentInfo::SegmentInfo(Segment* pSegment, long long start, long long size_,
+                         long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(start),
+      m_size(size_),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_pMuxingAppAsUTF8(NULL),
+      m_pWritingAppAsUTF8(NULL),
+      m_pTitleAsUTF8(NULL) {}
+
+// Frees the three strings that Parse() may have allocated.
+SegmentInfo::~SegmentInfo() {
+  delete[] m_pTitleAsUTF8;
+  delete[] m_pWritingAppAsUTF8;
+  delete[] m_pMuxingAppAsUTF8;
+
+  m_pTitleAsUTF8 = NULL;
+  m_pWritingAppAsUTF8 = NULL;
+  m_pMuxingAppAsUTF8 = NULL;
+}
+
+// Parses the payload of the segment Info element.
+//
+// Defaults applied before reading: timecode scale 1000000, duration -1
+// (meaning "not present").  Recognised children overwrite the timecode
+// scale, duration, muxing app, writing app and title; other children are
+// skipped.
+//
+// Returns 0 on success, the failing helper's status, or
+// E_FILE_FORMAT_INVALID when the timecode scale is not positive, the
+// duration is negative, duration * scale cannot be represented as a
+// long long, or the children do not end exactly at the end of the payload.
+long SegmentInfo::Parse() {
+  assert(m_pMuxingAppAsUTF8 == NULL);
+  assert(m_pWritingAppAsUTF8 == NULL);
+  assert(m_pTitleAsUTF8 == NULL);
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = m_start;
+  const long long stop = m_start + m_size;
+
+  m_timecodeScale = 1000000;
+  m_duration = -1;
+
+  while (pos < stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == mkvmuxer::kMkvTimecodeScale) {
+      m_timecodeScale = UnserializeUInt(pReader, pos, size);
+
+      // Negative means the read failed; zero is not a usable scale.
+      if (m_timecodeScale <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvDuration) {
+      const long float_status =
+          UnserializeFloat(pReader, pos, size, m_duration);
+
+      if (float_status < 0)
+        return float_status;
+
+      if (m_duration < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvMuxingApp) {
+      const long string_status =
+          UnserializeString(pReader, pos, size, m_pMuxingAppAsUTF8);
+
+      if (string_status)
+        return string_status;
+    } else if (id == mkvmuxer::kMkvWritingApp) {
+      const long string_status =
+          UnserializeString(pReader, pos, size, m_pWritingAppAsUTF8);
+
+      if (string_status)
+        return string_status;
+    } else if (id == mkvmuxer::kMkvTitle) {
+      const long string_status =
+          UnserializeString(pReader, pos, size, m_pTitleAsUTF8);
+
+      if (string_status)
+        return string_status;
+    }
+
+    pos += size;
+
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  // GetDuration() casts duration * scale to long long.  LLONG_MAX rounds up
+  // to 2^63 when converted to double, so a product equal to that value is
+  // already out of range and must be rejected as well (hence >=, not >).
+  const double rollover_check = m_duration * m_timecodeScale;
+  if (rollover_check >= static_cast<double>(LLONG_MAX))
+    return E_FILE_FORMAT_INVALID;
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+// Timecode scale stored for this segment.
+long long SegmentInfo::GetTimeCodeScale() const {
+  return m_timecodeScale;
+}
+
+// Duration multiplied by the timecode scale, truncated to an integer.
+// Returns -1 when the stored duration is negative.
+long long SegmentInfo::GetDuration() const {
+  if (m_duration < 0)
+    return -1;
+
+  assert(m_timecodeScale >= 1);
+
+  const double scaled = double(m_duration) * double(m_timecodeScale);
+  return static_cast<long long>(scaled);
+}
+
+// Muxing application name, or NULL if none was stored.
+const char* SegmentInfo::GetMuxingAppAsUTF8() const {
+  const char* const muxing_app = m_pMuxingAppAsUTF8;
+  return muxing_app;
+}
+
+// Writing application name, or NULL if none was stored.
+const char* SegmentInfo::GetWritingAppAsUTF8() const {
+  const char* const writing_app = m_pWritingAppAsUTF8;
+  return writing_app;
+}
+
+// Segment title, or NULL if none was stored.
+const char* SegmentInfo::GetTitleAsUTF8() const {
+  return m_pTitleAsUTF8;
+}
+
+///////////////////////////////////////////////////////////////
+// ContentEncoding element
+// Starts with algorithm 0 and no settings buffer.
+ContentEncoding::ContentCompression::ContentCompression()
+    : algo(0), settings(NULL), settings_len(0) {}
+
+// Frees the settings buffer, if one was attached.
+ContentEncoding::ContentCompression::~ContentCompression() {
+  delete[] settings;
+}
+
+// Starts with all algorithm fields at 0 and no key ID, signature or
+// signature key ID buffers.
+ContentEncoding::ContentEncryption::ContentEncryption()
+    : algo(0),
+      key_id(NULL),
+      key_id_len(0),
+      signature(NULL),
+      signature_len(0),
+      sig_key_id(NULL),
+      sig_key_id_len(0),
+      sig_algo(0),
+      sig_hash_algo(0) {}
+
+// Frees the three binary buffers, if any were attached.
+ContentEncoding::ContentEncryption::~ContentEncryption() {
+  delete[] key_id;
+  delete[] signature;
+  delete[] sig_key_id;
+}
+
+// Starts with empty compression/encryption lists, encoding order 0,
+// encoding scope 1 and encoding type 0.
+ContentEncoding::ContentEncoding()
+    : compression_entries_(NULL),
+      compression_entries_end_(NULL),
+      encryption_entries_(NULL),
+      encryption_entries_end_(NULL),
+      encoding_order_(0),
+      encoding_scope_(1),
+      encoding_type_(0) {}
+
+// Deletes every owned compression and encryption entry, then the two
+// pointer arrays that held them.
+ContentEncoding::~ContentEncoding() {
+  for (ContentCompression** it = compression_entries_;
+       it != compression_entries_end_; ++it) {
+    delete *it;
+  }
+  delete[] compression_entries_;
+
+  for (ContentEncryption** it = encryption_entries_;
+       it != encryption_entries_end_; ++it) {
+    delete *it;
+  }
+  delete[] encryption_entries_;
+}
+
+// Returns the compression entry at |idx|, or NULL when |idx| is not below
+// the number of stored entries.
+const ContentEncoding::ContentCompression*
+    ContentEncoding::GetCompressionByIndex(unsigned long idx) const {
+  ContentCompression** const first = compression_entries_;
+  ContentCompression** const last = compression_entries_end_;
+
+  const ptrdiff_t stored = last - first;
+  assert(stored >= 0);
+
+  const bool in_range = idx < static_cast<unsigned long>(stored);
+  if (!in_range)
+    return NULL;
+
+  return *(first + idx);
+}
+
+// Number of compression entries currently stored.
+unsigned long ContentEncoding::GetCompressionCount() const {
+  ContentCompression** const first = compression_entries_;
+  ContentCompression** const last = compression_entries_end_;
+
+  const ptrdiff_t stored = last - first;
+  assert(stored >= 0);
+
+  return static_cast<unsigned long>(stored);
+}
+
+// Returns the encryption entry at |idx|, or NULL when |idx| is not below
+// the number of stored entries.
+const ContentEncoding::ContentEncryption* ContentEncoding::GetEncryptionByIndex(
+    unsigned long idx) const {
+  ContentEncryption** const first = encryption_entries_;
+  ContentEncryption** const last = encryption_entries_end_;
+
+  const ptrdiff_t stored = last - first;
+  assert(stored >= 0);
+
+  const bool in_range = idx < static_cast<unsigned long>(stored);
+  if (!in_range)
+    return NULL;
+
+  return *(first + idx);
+}
+
+// Number of encryption entries currently stored.
+unsigned long ContentEncoding::GetEncryptionCount() const {
+  ContentEncryption** const first = encryption_entries_;
+  ContentEncryption** const last = encryption_entries_end_;
+
+  const ptrdiff_t stored = last - first;
+  assert(stored >= 0);
+
+  return static_cast<unsigned long>(stored);
+}
+
+// Parses the children of an AES settings element in [start, start + size)
+// into |aes|.  Only the cipher-mode child is read, and its value must be 1;
+// any other value yields E_FILE_FORMAT_INVALID.  Other children are
+// skipped.  Returns 0 on success or a negative status on error.
+long ContentEncoding::ParseContentEncAESSettingsEntry(
+    long long start, long long size, IMkvReader* pReader,
+    ContentEncAESSettings* aes) {
+  assert(pReader);
+  assert(aes);
+
+  const long long stop = start + size;
+  long long pos = start;
+
+  while (pos < stop) {
+    long long id, payload_size;
+    const long header_status =
+        ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (header_status < 0)  // error
+      return header_status;
+
+    const bool is_cipher_mode = (id == mkvmuxer::kMkvAESSettingsCipherMode);
+    if (is_cipher_mode) {
+      aes->cipher_mode = UnserializeUInt(pReader, pos, payload_size);
+      if (aes->cipher_mode != 1)
+        return E_FILE_FORMAT_INVALID;
+    }
+
+    pos += payload_size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  return 0;
+}
+
+// Parses one ContentEncoding element whose payload is [start, start + size).
+//
+// The payload is scanned twice: the first pass counts the compression and
+// encryption children so the pointer arrays can be sized, the second pass
+// reads the scalar fields and parses each child into a newly allocated
+// entry that this object then owns.
+//
+// Returns 0 on success; -1 when no compression or encryption child exists,
+// when an allocation fails, or when the encoding scope is below 1; the
+// child parser's status when a child fails; E_FILE_FORMAT_INVALID on
+// malformed sizes or when the second pass finds more children than the
+// first pass counted.
+long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
+                                                IMkvReader* pReader) {
+  assert(pReader);
+
+  long long pos = start;
+  const long long stop = start + size;
+
+  // Count ContentCompression and ContentEncryption elements.
+  int compression_count = 0;
+  int encryption_count = 0;
+
+  while (pos < stop) {
+    long long id, size;
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
+      return status;
+
+    if (id == mkvmuxer::kMkvContentCompression)
+      ++compression_count;
+
+    if (id == mkvmuxer::kMkvContentEncryption)
+      ++encryption_count;
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (compression_count <= 0 && encryption_count <= 0)
+    return -1;
+
+  if (compression_count > 0) {
+    compression_entries_ =
+        new (std::nothrow) ContentCompression*[compression_count];
+    if (!compression_entries_)
+      return -1;
+    compression_entries_end_ = compression_entries_;
+  }
+
+  if (encryption_count > 0) {
+    encryption_entries_ =
+        new (std::nothrow) ContentEncryption*[encryption_count];
+    if (!encryption_entries_) {
+      // Reset the members after freeing: the destructor deletes
+      // compression_entries_ again, so a stale pointer here would be a
+      // double free.
+      delete[] compression_entries_;
+      compression_entries_ = NULL;
+      compression_entries_end_ = NULL;
+      return -1;
+    }
+    encryption_entries_end_ = encryption_entries_;
+  }
+
+  pos = start;
+  while (pos < stop) {
+    long long id, size;
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
+      return status;
+
+    if (id == mkvmuxer::kMkvContentEncodingOrder) {
+      encoding_order_ = UnserializeUInt(pReader, pos, size);
+    } else if (id == mkvmuxer::kMkvContentEncodingScope) {
+      encoding_scope_ = UnserializeUInt(pReader, pos, size);
+      if (encoding_scope_ < 1)
+        return -1;
+    } else if (id == mkvmuxer::kMkvContentEncodingType) {
+      encoding_type_ = UnserializeUInt(pReader, pos, size);
+    } else if (id == mkvmuxer::kMkvContentCompression) {
+      // Never write past the array sized by the first pass (the reader
+      // could hand back different bytes the second time around).
+      if (compression_entries_ == NULL ||
+          compression_entries_end_ - compression_entries_ >=
+              compression_count)
+        return E_FILE_FORMAT_INVALID;
+
+      ContentCompression* const compression =
+          new (std::nothrow) ContentCompression();
+      if (!compression)
+        return -1;
+
+      status = ParseCompressionEntry(pos, size, pReader, compression);
+      if (status) {
+        delete compression;
+        return status;
+      }
+      *compression_entries_end_++ = compression;
+    } else if (id == mkvmuxer::kMkvContentEncryption) {
+      // Same bounds guard as for the compression entries.
+      if (encryption_entries_ == NULL ||
+          encryption_entries_end_ - encryption_entries_ >= encryption_count)
+        return E_FILE_FORMAT_INVALID;
+
+      ContentEncryption* const encryption =
+          new (std::nothrow) ContentEncryption();
+      if (!encryption)
+        return -1;
+
+      status = ParseEncryptionEntry(pos, size, pReader, encryption);
+      if (status) {
+        delete encryption;
+        return status;
+      }
+      *encryption_entries_end_++ = encryption;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+  return 0;
+}
+
+// Parses the children of a ContentCompression element in
+// [start, start + size) into |compression|.
+//
+// The algorithm child is mandatory; the settings child, if present, is read
+// into a newly allocated buffer owned by |compression|.  Other children are
+// skipped.
+//
+// Returns 0 on success, -1 on allocation failure, the reader's non-zero
+// status when the settings bytes cannot be read, a header-parse error, or
+// E_FILE_FORMAT_INVALID for a missing or invalid algorithm, an empty
+// settings payload, or malformed sizes.
+long ContentEncoding::ParseCompressionEntry(long long start, long long size,
+                                            IMkvReader* pReader,
+                                            ContentCompression* compression) {
+  assert(pReader);
+  assert(compression);
+
+  long long pos = start;
+  const long long stop = start + size;
+
+  bool valid = false;
+
+  while (pos < stop) {
+    long long id, size;
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
+      return status;
+
+    if (id == mkvmuxer::kMkvContentCompAlgo) {
+      long long algo = UnserializeUInt(pReader, pos, size);
+      if (algo < 0)
+        return E_FILE_FORMAT_INVALID;
+      compression->algo = algo;
+      valid = true;
+    } else if (id == mkvmuxer::kMkvContentCompSettings) {
+      // Drop any buffer left by an earlier settings child so a repeated
+      // element cannot leak it.
+      delete[] compression->settings;
+      compression->settings = NULL;
+      compression->settings_len = 0;
+
+      if (size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      const size_t buflen = static_cast<size_t>(size);
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+      if (buf == NULL)
+        return -1;
+
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
+      if (read_status) {
+        delete[] buf;
+        // Report the read failure itself; |status| is the successful header
+        // status and would make the failure look like success.
+        return read_status;
+      }
+
+      compression->settings = buf;
+      compression->settings_len = buflen;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  // ContentCompAlgo is mandatory
+  if (!valid)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+// Parses the children of a ContentEncryption element in
+// [start, start + size) into |encryption|.
+//
+// The encryption algorithm, when present, must be 5.  The key ID, signature
+// and signature key ID are each read into a newly allocated buffer owned by
+// |encryption|, replacing any earlier one.  The signature algorithm, hash
+// algorithm and AES settings are read when present; other children are
+// skipped.
+//
+// Returns 0 on success, -1 on allocation failure, the reader's non-zero
+// status when a binary payload cannot be read, a header-parse or AES
+// settings error, or E_FILE_FORMAT_INVALID for an unsupported algorithm, an
+// empty binary payload, or malformed sizes.
+long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
+                                           IMkvReader* pReader,
+                                           ContentEncryption* encryption) {
+  assert(pReader);
+  assert(encryption);
+
+  long long pos = start;
+  const long long stop = start + size;
+
+  while (pos < stop) {
+    long long id, size;
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
+      return status;
+
+    if (id == mkvmuxer::kMkvContentEncAlgo) {
+      encryption->algo = UnserializeUInt(pReader, pos, size);
+      if (encryption->algo != 5)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvContentEncKeyID) {
+      delete[] encryption->key_id;
+      encryption->key_id = NULL;
+      encryption->key_id_len = 0;
+
+      if (size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      const size_t buflen = static_cast<size_t>(size);
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+      if (buf == NULL)
+        return -1;
+
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
+      if (read_status) {
+        delete[] buf;
+        // |status| is the successful header status; report the read failure.
+        return read_status;
+      }
+
+      encryption->key_id = buf;
+      encryption->key_id_len = buflen;
+    } else if (id == mkvmuxer::kMkvContentSignature) {
+      delete[] encryption->signature;
+      encryption->signature = NULL;
+      encryption->signature_len = 0;
+
+      if (size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      const size_t buflen = static_cast<size_t>(size);
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+      if (buf == NULL)
+        return -1;
+
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
+      if (read_status) {
+        delete[] buf;
+        return read_status;
+      }
+
+      encryption->signature = buf;
+      encryption->signature_len = buflen;
+    } else if (id == mkvmuxer::kMkvContentSigKeyID) {
+      delete[] encryption->sig_key_id;
+      encryption->sig_key_id = NULL;
+      encryption->sig_key_id_len = 0;
+
+      if (size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      const size_t buflen = static_cast<size_t>(size);
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+      if (buf == NULL)
+        return -1;
+
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
+      if (read_status) {
+        delete[] buf;
+        return read_status;
+      }
+
+      encryption->sig_key_id = buf;
+      encryption->sig_key_id_len = buflen;
+    } else if (id == mkvmuxer::kMkvContentSigAlgo) {
+      encryption->sig_algo = UnserializeUInt(pReader, pos, size);
+    } else if (id == mkvmuxer::kMkvContentSigHashAlgo) {
+      encryption->sig_hash_algo = UnserializeUInt(pReader, pos, size);
+    } else if (id == mkvmuxer::kMkvContentEncAESSettings) {
+      const long aes_status = ParseContentEncAESSettingsEntry(
+          pos, size, pReader, &encryption->aes_settings);
+      if (aes_status)
+        return aes_status;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  return 0;
+}
+
+// Remembers the owning segment and the element's position and size; the
+// content-encoding list starts empty.
+Track::Track(Segment* pSegment, long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      content_encoding_entries_(NULL),
+      content_encoding_entries_end_(NULL) {}
+
+// Releases the track info buffers, then every owned ContentEncoding and the
+// pointer array that held them.
+Track::~Track() {
+  const_cast<Info&>(m_info).Clear();
+
+  for (ContentEncoding** it = content_encoding_entries_;
+       it != content_encoding_entries_end_; ++it) {
+    delete *it;
+  }
+
+  delete[] content_encoding_entries_;
+}
+
+// Allocates a Track and deep-copies |info| into it.  |pResult| must be NULL
+// on entry and is only assigned on success.  Returns 0 on success, -1 if
+// |pResult| is already set or the allocation fails, or the non-zero status
+// from Info::Copy (in which case the new track is deleted).
+long Track::Create(Segment* pSegment, const Info& info, long long element_start,
+                   long long element_size, Track*& pResult) {
+  if (pResult)
+    return -1;
+
+  Track* const track =
+      new (std::nothrow) Track(pSegment, element_start, element_size);
+  if (!track)
+    return -1;  // generic error
+
+  const int copy_status = info.Copy(track->m_info);
+  if (copy_status == 0) {
+    pResult = track;
+    return 0;  // success
+  }
+
+  delete track;
+  return copy_status;
+}
+
+// Zeroes the numeric fields listed below and starts with no strings and no
+// codec-private data.
+Track::Info::Info()
+    : uid(0),
+      defaultDuration(0),
+      codecDelay(0),
+      seekPreRoll(0),
+      nameAsUTF8(NULL),
+      language(NULL),
+      codecId(NULL),
+      codecNameAsUTF8(NULL),
+      codecPrivate(NULL),
+      codecPrivateSize(0),
+      lacing(false) {}
+
+// Releases everything Clear() releases.
+Track::Info::~Info() {
+  Clear();
+}
+
+// Frees the four strings and the codec-private buffer, resetting each
+// pointer to NULL and the codec-private size to 0.
+void Track::Info::Clear() {
+  // Strings.
+  delete[] nameAsUTF8;
+  delete[] language;
+  delete[] codecId;
+  delete[] codecNameAsUTF8;
+
+  nameAsUTF8 = NULL;
+  language = NULL;
+  codecId = NULL;
+  codecNameAsUTF8 = NULL;
+
+  // Codec-private blob.
+  delete[] codecPrivate;
+  codecPrivate = NULL;
+  codecPrivateSize = 0;
+}
+
+// Duplicates the string member selected by |str| from this Info into
+// |dst_|.  The destination member must still be NULL.  Returns 0 on success
+// (including when the source string is NULL and nothing is copied), or -1
+// for a null member pointer, a non-NULL destination, or allocation failure.
+int Track::Info::CopyStr(char* Info::*str, Info& dst_) const {
+  if (str == static_cast<char * Info::*>(NULL))
+    return -1;
+
+  char*& target = dst_.*str;
+  if (target != NULL)  // should be NULL already
+    return -1;
+
+  const char* const source = this->*str;
+  if (!source)
+    return 0;
+
+  // Copy the characters plus the terminating NUL in one go.
+  const size_t bytes = strlen(source) + 1;
+
+  target = SafeArrayAlloc<char>(1, bytes);
+  if (!target)
+    return -1;
+
+  memcpy(target, source, bytes);
+  return 0;
+}
+
+// Deep-copies this Info into |dst|.  Scalars are assigned directly; the
+// four strings and the codec-private buffer are duplicated into fresh
+// allocations, which requires the matching |dst| members to be empty.
+// Returns 0 on success (or when |dst| is this object).  A non-zero return
+// means the copy stopped part-way and |dst| is only partially filled.
+int Track::Info::Copy(Info& dst) const {
+  if (this == &dst)
+    return 0;
+
+  dst.uid = uid;
+  dst.type = type;
+  dst.number = number;
+  dst.lacing = lacing;
+  dst.settings = settings;
+  dst.defaultDuration = defaultDuration;
+  dst.codecDelay = codecDelay;
+  dst.seekPreRoll = seekPreRoll;
+
+  // The string members need allocation, which can fail, so each copy is
+  // checked and the first failure is reported to the caller.
+  char* Info::* const kStringMembers[] = {
+      &Info::nameAsUTF8, &Info::language, &Info::codecId,
+      &Info::codecNameAsUTF8};
+  const size_t kStringMemberCount =
+      sizeof(kStringMembers) / sizeof(kStringMembers[0]);
+
+  for (size_t i = 0; i < kStringMemberCount; ++i) {
+    const int status = CopyStr(kStringMembers[i], dst);
+    if (status)
+      return status;
+  }
+
+  // Nothing more to do when there is no codec-private data.
+  if (!(codecPrivateSize > 0))
+    return 0;
+
+  if (codecPrivate == NULL)
+    return -1;
+
+  // The destination must not already hold codec-private data.
+  if (dst.codecPrivate || dst.codecPrivateSize != 0)
+    return -1;
+
+  dst.codecPrivate = SafeArrayAlloc<unsigned char>(1, codecPrivateSize);
+  if (dst.codecPrivate == NULL)
+    return -1;
+
+  memcpy(dst.codecPrivate, codecPrivate, codecPrivateSize);
+  dst.codecPrivateSize = codecPrivateSize;
+
+  return 0;
+}
+
+// Address of this track's end-of-stream sentinel entry.
+const BlockEntry* Track::GetEOS() const {
+  return &m_eos;
+}
+
+// Track type as stored in the track info.
+long Track::GetType() const {
+  return m_info.type;
+}
+
+// Track number as stored in the track info.
+long Track::GetNumber() const {
+  return m_info.number;
+}
+
+// Track UID as stored in the track info.
+unsigned long long Track::GetUid() const {
+  return m_info.uid;
+}
+
+// Track name, or NULL if none was stored.
+const char* Track::GetNameAsUTF8() const {
+  return m_info.nameAsUTF8;
+}
+
+// Track language, or NULL if none was stored.
+const char* Track::GetLanguage() const {
+  return m_info.language;
+}
+
+// Codec name, or NULL if none was stored.
+const char* Track::GetCodecNameAsUTF8() const {
+  return m_info.codecNameAsUTF8;
+}
+
+// Codec ID string, or NULL if none was stored.
+const char* Track::GetCodecId() const {
+  return m_info.codecId;
+}
+
+// Returns the codec-private buffer and writes its length to |size|.
+const unsigned char* Track::GetCodecPrivate(size_t& size) const {
+  const unsigned char* const data = m_info.codecPrivate;
+  size = m_info.codecPrivateSize;
+  return data;
+}
+
+// Lacing flag as stored in the track info.
+bool Track::GetLacing() const {
+  return m_info.lacing;
+}
+
+// Default duration as stored in the track info.
+unsigned long long Track::GetDefaultDuration() const {
+  return m_info.defaultDuration;
+}
+
+// Codec delay as stored in the track info.
+unsigned long long Track::GetCodecDelay() const {
+  return m_info.codecDelay;
+}
+
+// Seek pre-roll as stored in the track info.
+unsigned long long Track::GetSeekPreRoll() const {
+  return m_info.seekPreRoll;
+}
+
+// Finds the first block entry that belongs to this track and passes
+// VetEntry(), scanning clusters from the start of the segment.
+//
+// Returns 0 with |pBlockEntry| set to the match; 1 with |pBlockEntry| set to
+// the EOS entry when the clusters run out, parsing is done, or the search
+// limit is hit; E_BUFFER_NOT_FULL with |pBlockEntry| set to 0 when an EOS
+// cluster is reached before the segment is fully parsed; or a negative
+// status passed through from the cluster.
+long Track::GetFirst(const BlockEntry*& pBlockEntry) const {
+  const Cluster* pCluster = m_pSegment->GetFirst();
+
+  // |i| counts non-empty clusters that were fully searched without a match.
+  for (int i = 0;;) {
+    if (pCluster == NULL) {
+      pBlockEntry = GetEOS();
+      return 1;
+    }
+
+    if (pCluster->EOS()) {
+      if (m_pSegment->DoneParsing()) {
+        pBlockEntry = GetEOS();
+        return 1;
+      }
+
+      pBlockEntry = 0;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long status = pCluster->GetFirst(pBlockEntry);
+
+    if (status < 0)  // error
+      return status;
+
+    // Empty clusters are skipped without counting towards the limit below.
+    if (pBlockEntry == 0) {  // empty cluster
+      pCluster = m_pSegment->GetNext(pCluster);
+      continue;
+    }
+
+    // Walk the entries of this cluster until one matches or none are left.
+    for (;;) {
+      const Block* const pBlock = pBlockEntry->GetBlock();
+      assert(pBlock);
+
+      const long long tn = pBlock->GetTrackNumber();
+
+      if ((tn == m_info.number) && VetEntry(pBlockEntry))
+        return 0;
+
+      const BlockEntry* pNextEntry;
+
+      status = pCluster->GetNext(pBlockEntry, pNextEntry);
+
+      if (status < 0)  // error
+        return status;
+
+      if (pNextEntry == 0)
+        break;
+
+      pBlockEntry = pNextEntry;
+    }
+
+    ++i;
+
+    // Give up after 100 searched clusters.
+    if (i >= 100)
+      break;
+
+    pCluster = m_pSegment->GetNext(pCluster);
+  }
+
+  // NOTE: if we get here, it means that we didn't find a block with
+  // a matching track number within the search limit.  We report that
+  // the same way as end of stream (which might be too conservative).
+
+  pBlockEntry = GetEOS();  // so we can return a non-NULL value
+  return 1;
+}
+
+// Finds the next block entry of this track after |pCurrEntry|, first within
+// the current cluster and then in the following clusters.
+//
+// Returns 0 with |pNextEntry| set to the match; 1 with |pNextEntry| set to
+// the EOS entry when the clusters run out, parsing is done, or the search
+// limit is hit; E_BUFFER_NOT_FULL with |pNextEntry| set to NULL when an EOS
+// cluster is reached before the segment is fully parsed; -1 when
+// |pCurrEntry| has no block or its block belongs to another track; or a
+// negative status passed through from the cluster.
+long Track::GetNext(const BlockEntry* pCurrEntry,
+                    const BlockEntry*& pNextEntry) const {
+  assert(pCurrEntry);
+  assert(!pCurrEntry->EOS());  //?
+
+  const Block* const pCurrBlock = pCurrEntry->GetBlock();
+  assert(pCurrBlock && pCurrBlock->GetTrackNumber() == m_info.number);
+  // Same condition as the assert above, kept as a runtime check.
+  if (!pCurrBlock || pCurrBlock->GetTrackNumber() != m_info.number)
+    return -1;
+
+  const Cluster* pCluster = pCurrEntry->GetCluster();
+  assert(pCluster);
+  assert(!pCluster->EOS());
+
+  long status = pCluster->GetNext(pCurrEntry, pNextEntry);
+
+  if (status < 0)  // error
+    return status;
+
+  // |i| counts the non-empty clusters that were moved on to.
+  for (int i = 0;;) {
+    // Scan the remaining entries of the current cluster.
+    while (pNextEntry) {
+      const Block* const pNextBlock = pNextEntry->GetBlock();
+      assert(pNextBlock);
+
+      if (pNextBlock->GetTrackNumber() == m_info.number)
+        return 0;
+
+      pCurrEntry = pNextEntry;
+
+      status = pCluster->GetNext(pCurrEntry, pNextEntry);
+
+      if (status < 0)  // error
+        return status;
+    }
+
+    // Current cluster exhausted; move on to the next one.
+    pCluster = m_pSegment->GetNext(pCluster);
+
+    if (pCluster == NULL) {
+      pNextEntry = GetEOS();
+      return 1;
+    }
+
+    if (pCluster->EOS()) {
+      if (m_pSegment->DoneParsing()) {
+        pNextEntry = GetEOS();
+        return 1;
+      }
+
+      // TODO: there is a potential O(n^2) problem here: we tell the
+      // caller to (pre)load another cluster, which he does, but then he
+      // calls GetNext again, which repeats the same search.  This is
+      // a pathological case, since the only way it can happen is if
+      // there exists a long sequence of clusters none of which contain a
+      // block from this track.  One way around this problem is for the
+      // caller to be smarter when he loads another cluster: don't call
+      // us back until you have a cluster that contains a block from this
+      // track. (Of course, that's not cheap either, since our caller
+      // would have to scan each cluster as it's loaded, so that
+      // would just push back the problem.)
+
+      pNextEntry = NULL;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    status = pCluster->GetFirst(pNextEntry);
+
+    if (status < 0)  // error
+      return status;
+
+    // Empty clusters are skipped without counting towards the limit below.
+    if (pNextEntry == NULL)  // empty cluster
+      continue;
+
+    ++i;
+
+    // Give up after moving on to 100 non-empty clusters.
+    if (i >= 100)
+      break;
+  }
+
+  // NOTE: if we get here, it means that we didn't find a block with
+  // a matching track number after lots of searching, so we give
+  // up trying.
+
+  pNextEntry = GetEOS();  // so we can return a non-NULL value
+  return 1;
+}
+
+bool Track::VetEntry(const BlockEntry* pBlockEntry) const {
+  // Default seek-target filter: accepts any block that belongs to this
+  // track.  VideoTrack overrides it so that only keyframes qualify.
+  assert(pBlockEntry);
+
+  const Block* const block = pBlockEntry->GetBlock();
+  assert(block);
+  assert(block->GetTrackNumber() == m_info.number);
+
+  // The asserts vanish under NDEBUG, so repeat the check at run time:
+  // the entry is acceptable exactly when it carries a block whose
+  // track number matches ours.
+  const bool belongs_here =
+      (block != NULL) && (block->GetTrackNumber() == m_info.number);
+  return belongs_here;
+}
+
+long Track::Seek(long long time_ns, const BlockEntry*& pResult) const {
+  const long status = GetFirst(pResult);
+  // Picks an entry of this track for time_ns among the loaded clusters.
+  if (status < 0)  // buffer underflow, etc
+    return status;
+
+  assert(pResult);
+
+  if (pResult->EOS())
+    return 0;  // GetFirst produced the EOS entry; hand it back as is
+
+  const Cluster* pCluster = pResult->GetCluster();
+  assert(pCluster);
+  assert(pCluster->GetIndex() >= 0);
+  // A target at or before the first block resolves to that first block.
+  if (time_ns <= pResult->GetBlock()->GetTime(pCluster))
+    return 0;
+
+  Cluster** const clusters = m_pSegment->m_clusters;
+  assert(clusters);
+
+  const long count = m_pSegment->GetCount();  // loaded only, not preloaded
+  assert(count > 0);
+  // [i, j) spans the loaded clusters from the first block's cluster onward.
+  Cluster** const i = clusters + pCluster->GetIndex();
+  assert(i);
+  assert(*i == pCluster);
+  assert(pCluster->GetTime() <= time_ns);
+
+  Cluster** const j = clusters + count;
+  // Binary search for the first cluster whose time is greater than time_ns.
+  Cluster** lo = i;
+  Cluster** hi = j;
+
+  while (lo < hi) {
+    // INVARIANT:
+    //[i, lo) <= time_ns
+    //[lo, hi) ?
+    //[hi, j)  > time_ns
+
+    Cluster** const mid = lo + (hi - lo) / 2;
+    assert(mid < hi);
+
+    pCluster = *mid;
+    assert(pCluster);
+    assert(pCluster->GetIndex() >= 0);
+    assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters));
+
+    const long long t = pCluster->GetTime();
+
+    if (t <= time_ns)
+      lo = mid + 1;
+    else
+      hi = mid;
+
+    assert(lo <= hi);
+  }
+
+  assert(lo == hi);
+  assert(lo > i);
+  assert(lo <= j);
+  // Walk backwards from the last cluster whose time is <= time_ns.
+  while (lo > i) {
+    pCluster = *--lo;
+    assert(pCluster);
+    assert(pCluster->GetTime() <= time_ns);
+
+    pResult = pCluster->GetEntry(this);
+
+    if ((pResult != 0) && !pResult->EOS())
+      return 0;
+
+    // landed on empty cluster (no entries)
+  }
+
+  pResult = GetEOS();  // weird: no cluster down to i yielded an entry
+  return 0;
+}
+
+const ContentEncoding* Track::GetContentEncodingByIndex(
+    unsigned long idx) const {
+  // Entries occupy [content_encoding_entries_, content_encoding_entries_end_).
+  const ptrdiff_t total =
+      content_encoding_entries_end_ - content_encoding_entries_;
+  assert(total >= 0);
+
+  // Out-of-range indices yield NULL instead of reading past the array.
+  const bool in_range = idx < static_cast<unsigned long>(total);
+  return in_range ? content_encoding_entries_[idx] : NULL;
+}
+
+unsigned long Track::GetContentEncodingCount() const {
+  ContentEncoding** const first = content_encoding_entries_;
+  ContentEncoding** const last = content_encoding_entries_end_;
+  const ptrdiff_t total = last - first;  // entries stored in [first, last)
+  assert(total >= 0);
+  return static_cast<unsigned long>(total);
+}
+
+long Track::ParseContentEncodingsEntry(long long start, long long size) {
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+  assert(pReader);
+  // Two passes over [start, start + size): count the entries, then parse them.
+  long long pos = start;
+  const long long stop = start + size;
+
+  // Count ContentEncoding elements.
+  int count = 0;
+  while (pos < stop) {
+    long long id, size;  // NB: this size shadows the function parameter
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
+      return status;
+
+    // pos now designates start of element
+    if (id == mkvmuxer::kMkvContentEncoding)
+      ++count;
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (count <= 0)
+    return -1;  // no ContentEncoding child was found
+
+  content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count];
+  if (!content_encoding_entries_)
+    return -1;  // allocation failed
+  // The end pointer advances as entries are appended in the second pass.
+  content_encoding_entries_end_ = content_encoding_entries_;
+
+  pos = start;
+  while (pos < stop) {
+    long long id, size;
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
+      return status;
+
+    // pos now designates start of element
+    if (id == mkvmuxer::kMkvContentEncoding) {
+      ContentEncoding* const content_encoding =
+          new (std::nothrow) ContentEncoding();
+      if (!content_encoding)
+        return -1;  // allocation failed; entries stored so far stay in place
+
+      status = content_encoding->ParseContentEncodingEntry(pos, size, pReader);
+      if (status) {  // any non-zero status aborts the parse
+        delete content_encoding;
+        return status;
+      }
+
+      *content_encoding_entries_end_++ = content_encoding;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+Track::EOSBlock::EOSBlock() : BlockEntry(NULL, LONG_MIN) {}
+// Always reports kBlockEOS.
+BlockEntry::Kind Track::EOSBlock::GetKind() const { return kBlockEOS; }
+// Always returns NULL: this entry has no block attached.
+const Block* Track::EOSBlock::GetBlock() const { return NULL; }
+
+VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
+                       long long element_size)
+    : Track(pSegment, element_start, element_size) {}  // only forwards to Track
+
+long VideoTrack::Parse(Segment* pSegment, const Info& info,
+                       long long element_start, long long element_size,
+                       VideoTrack*& pResult) {
+  if (pResult)  // the output slot must start out NULL
+    return -1;
+  // Builds a VideoTrack from info; on success stores it in pResult, returns 0.
+  if (info.type != Track::kVideo)
+    return -1;
+  // Values used when the corresponding child element is absent.
+  long long width = 0;
+  long long height = 0;
+  long long display_width = 0;
+  long long display_height = 0;
+  long long display_unit = 0;
+  long long stereo_mode = 0;
+
+  double rate = 0.0;
+
+  IMkvReader* const pReader = pSegment->m_pReader;
+
+  const Settings& s = info.settings;
+  assert(s.start >= 0);
+  assert(s.size >= 0);
+
+  long long pos = s.start;
+  assert(pos >= 0);
+
+  const long long stop = pos + s.size;
+  // Walk the child elements found in [s.start, stop).
+  while (pos < stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == mkvmuxer::kMkvPixelWidth) {
+      width = UnserializeUInt(pReader, pos, size);
+
+      if (width <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvPixelHeight) {
+      height = UnserializeUInt(pReader, pos, size);
+
+      if (height <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvDisplayWidth) {
+      display_width = UnserializeUInt(pReader, pos, size);
+
+      if (display_width <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvDisplayHeight) {
+      display_height = UnserializeUInt(pReader, pos, size);
+
+      if (display_height <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvDisplayUnit) {
+      display_unit = UnserializeUInt(pReader, pos, size);
+
+      if (display_unit < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvStereoMode) {
+      stereo_mode = UnserializeUInt(pReader, pos, size);
+
+      if (stereo_mode < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvFrameRate) {
+      const long status = UnserializeFloat(pReader, pos, size, rate);
+
+      if (status < 0)
+        return status;
+
+      if (rate <= 0)
+        return E_FILE_FORMAT_INVALID;
+    }
+    // Ids not handled above are skipped along with their payload.
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+  // All children parsed; create the track and copy the info into it.
+  VideoTrack* const pTrack =
+      new (std::nothrow) VideoTrack(pSegment, element_start, element_size);
+
+  if (pTrack == NULL)
+    return -1;  // generic error
+
+  const int status = info.Copy(pTrack->m_info);
+
+  if (status) {  // error
+    delete pTrack;
+    return status;
+  }
+
+  pTrack->m_width = width;
+  pTrack->m_height = height;
+  pTrack->m_display_width = display_width;
+  pTrack->m_display_height = display_height;
+  pTrack->m_display_unit = display_unit;
+  pTrack->m_stereo_mode = stereo_mode;
+  pTrack->m_rate = rate;
+
+  pResult = pTrack;
+  return 0;  // success
+}
+
+bool VideoTrack::VetEntry(const BlockEntry* pBlockEntry) const {
+  return Track::VetEntry(pBlockEntry) && pBlockEntry->GetBlock()->IsKey();
+}  // true only when the base-class check passes and the block is a key frame
+
+long VideoTrack::Seek(long long time_ns, const BlockEntry*& pResult) const {
+  const long status = GetFirst(pResult);
+  // Picks an entry of this track for time_ns among the loaded clusters.
+  if (status < 0)  // buffer underflow, etc
+    return status;
+
+  assert(pResult);
+
+  if (pResult->EOS())
+    return 0;  // GetFirst produced the EOS entry; hand it back as is
+
+  const Cluster* pCluster = pResult->GetCluster();
+  assert(pCluster);
+  assert(pCluster->GetIndex() >= 0);
+  // A target at or before the first block resolves to that first block.
+  if (time_ns <= pResult->GetBlock()->GetTime(pCluster))
+    return 0;
+
+  Cluster** const clusters = m_pSegment->m_clusters;
+  assert(clusters);
+
+  const long count = m_pSegment->GetCount();  // loaded only, not pre-loaded
+  assert(count > 0);
+  // [i, j) spans the loaded clusters from the first block's cluster onward.
+  Cluster** const i = clusters + pCluster->GetIndex();
+  assert(i);
+  assert(*i == pCluster);
+  assert(pCluster->GetTime() <= time_ns);
+
+  Cluster** const j = clusters + count;
+  // Binary search for the first cluster whose time is greater than time_ns.
+  Cluster** lo = i;
+  Cluster** hi = j;
+
+  while (lo < hi) {
+    // INVARIANT:
+    //[i, lo) <= time_ns
+    //[lo, hi) ?
+    //[hi, j)  > time_ns
+
+    Cluster** const mid = lo + (hi - lo) / 2;
+    assert(mid < hi);
+
+    pCluster = *mid;
+    assert(pCluster);
+    assert(pCluster->GetIndex() >= 0);
+    assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters));
+
+    const long long t = pCluster->GetTime();
+
+    if (t <= time_ns)
+      lo = mid + 1;
+    else
+      hi = mid;
+
+    assert(lo <= hi);
+  }
+
+  assert(lo == hi);
+  assert(lo > i);
+  assert(lo <= j);
+  // First try the last cluster whose time is <= time_ns.
+  pCluster = *--lo;
+  assert(pCluster);
+  assert(pCluster->GetTime() <= time_ns);
+
+  pResult = pCluster->GetEntry(this, time_ns);
+
+  if ((pResult != 0) && !pResult->EOS())  // found a keyframe
+    return 0;
+  // Nothing usable there: keep stepping back through earlier clusters.
+  while (lo != i) {
+    pCluster = *--lo;
+    assert(pCluster);
+    assert(pCluster->GetTime() <= time_ns);
+
+    pResult = pCluster->GetEntry(this, time_ns);
+
+    if ((pResult != 0) && !pResult->EOS())
+      return 0;
+  }
+
+  // weird: we're on the first cluster, but no keyframe found
+  // should never happen but we must return something anyway
+
+  pResult = GetEOS();
+  return 0;
+}
+
+long long VideoTrack::GetWidth() const { return m_width; }
+
+long long VideoTrack::GetHeight() const { return m_height; }
+// Display width, or the pixel width when no positive display width is stored.
+long long VideoTrack::GetDisplayWidth() const {
+  return m_display_width > 0 ? m_display_width : GetWidth();
+}
+// Display height, or the pixel height when no positive display height is stored.
+long long VideoTrack::GetDisplayHeight() const {
+  return m_display_height > 0 ? m_display_height : GetHeight();
+}
+
+long long VideoTrack::GetDisplayUnit() const { return m_display_unit; }
+
+long long VideoTrack::GetStereoMode() const { return m_stereo_mode; }
+// Frame rate as stored by Parse(), which starts from 0.0 when none is present.
+double VideoTrack::GetFrameRate() const { return m_rate; }
+
+AudioTrack::AudioTrack(Segment* pSegment, long long element_start,
+                       long long element_size)
+    : Track(pSegment, element_start, element_size) {}  // only forwards to Track
+
+long AudioTrack::Parse(Segment* pSegment, const Info& info,
+                       long long element_start, long long element_size,
+                       AudioTrack*& pResult) {
+  if (pResult)  // the output slot must start out NULL
+    return -1;
+  // Builds an AudioTrack from info; on success stores it in pResult, returns 0.
+  if (info.type != Track::kAudio)
+    return -1;
+
+  IMkvReader* const pReader = pSegment->m_pReader;
+
+  const Settings& s = info.settings;
+  assert(s.start >= 0);
+  assert(s.size >= 0);
+
+  long long pos = s.start;
+  assert(pos >= 0);
+
+  const long long stop = pos + s.size;
+  // Values used when the corresponding child element is absent.
+  double rate = 8000.0;  // MKV default
+  long long channels = 1;
+  long long bit_depth = 0;
+
+  while (pos < stop) {
+    long long id, size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == mkvmuxer::kMkvSamplingFrequency) {
+      status = UnserializeFloat(pReader, pos, size, rate);
+
+      if (status < 0)
+        return status;
+
+      if (rate <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvChannels) {
+      channels = UnserializeUInt(pReader, pos, size);
+
+      if (channels <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvBitDepth) {
+      bit_depth = UnserializeUInt(pReader, pos, size);
+
+      if (bit_depth <= 0)
+        return E_FILE_FORMAT_INVALID;
+    }
+    // Ids not handled above are skipped along with their payload.
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+  // All children parsed; create the track and copy the info into it.
+  AudioTrack* const pTrack =
+      new (std::nothrow) AudioTrack(pSegment, element_start, element_size);
+
+  if (pTrack == NULL)
+    return -1;  // generic error
+
+  const int status = info.Copy(pTrack->m_info);
+
+  if (status) {  // Copy failed: release the half-built track
+    delete pTrack;
+    return status;
+  }
+
+  pTrack->m_rate = rate;
+  pTrack->m_channels = channels;
+  pTrack->m_bitDepth = bit_depth;
+
+  pResult = pTrack;
+  return 0;  // success
+}
+
+double AudioTrack::GetSamplingRate() const { return m_rate; }
+// Channel count as stored by Parse(), which defaults it to 1.
+long long AudioTrack::GetChannels() const { return m_channels; }
+// Bit depth as stored by Parse(); 0 when no BitDepth element was present.
+long long AudioTrack::GetBitDepth() const { return m_bitDepth; }
+
+Tracks::Tracks(Segment* pSegment, long long start, long long size_,
+               long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(start),  // Parse() scans [m_start, m_start + m_size)
+      m_size(size_),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_trackEntries(NULL),  // array is allocated later, by Parse()
+      m_trackEntriesEnd(NULL) {}  // Parse() advances this past each kept track
+
+long Tracks::Parse() {
+  assert(m_trackEntries == NULL);
+  assert(m_trackEntriesEnd == NULL);
+  // Pass 1 counts the TrackEntry children; pass 2 parses them into an array.
+  const long long stop = m_start + m_size;
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  int count = 0;
+  long long pos = m_start;
+
+  while (pos < stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (size == 0)  // weird
+      continue;
+
+    if (id == mkvmuxer::kMkvTrackEntry)
+      ++count;
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (count <= 0)
+    return 0;  // success: no track entries, the array pointers stay NULL
+
+  m_trackEntries = new (std::nothrow) Track*[count];
+
+  if (m_trackEntries == NULL)
+    return -1;  // allocation failed
+
+  m_trackEntriesEnd = m_trackEntries;
+
+  pos = m_start;
+  // Second pass: same walk, this time creating a Track per TrackEntry.
+  while (pos < stop) {
+    const long long element_start = pos;
+
+    long long id, payload_size;
+
+    const long status =
+        ParseElementHeader(pReader, pos, stop, id, payload_size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (payload_size == 0)  // weird
+      continue;
+
+    const long long payload_stop = pos + payload_size;
+    assert(payload_stop <= stop);  // checked in ParseElement
+    // element_size covers the header as well as the payload.
+    const long long element_size = payload_stop - element_start;
+
+    if (id == mkvmuxer::kMkvTrackEntry) {
+      Track*& pTrack = *m_trackEntriesEnd;
+      pTrack = NULL;
+
+      const long status = ParseTrackEntry(pos, payload_size, element_start,
+                                          element_size, pTrack);
+      if (status)
+        return status;
+
+      if (pTrack)  // keep the slot only when a track was produced
+        ++m_trackEntriesEnd;
+    }
+
+    pos = payload_stop;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;  // success
+}
+
+unsigned long Tracks::GetTracksCount() const {
+  // Tracks kept by Parse() live in [m_trackEntries, m_trackEntriesEnd).
+  const ptrdiff_t stored = m_trackEntriesEnd - m_trackEntries;
+  assert(stored >= 0);
+  return static_cast<unsigned long>(stored);
+}
+
+long Tracks::ParseTrackEntry(long long track_start, long long track_size,
+                             long long element_start, long long element_size,
+                             Track*& pResult) const {
+  if (pResult)  // the output slot must start out NULL
+    return -1;
+  // Parses one TrackEntry payload and creates the matching Track object.
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = track_start;
+  const long long track_stop = track_start + track_size;
+
+  Track::Info info;
+
+  info.type = 0;
+  info.number = 0;
+  info.uid = 0;
+  info.defaultDuration = 0;
+  // start == -1 marks "element not seen" for the three settings records.
+  Track::Settings v;
+  v.start = -1;
+  v.size = -1;
+
+  Track::Settings a;
+  a.start = -1;
+  a.size = -1;
+
+  Track::Settings e;  // content_encodings_settings;
+  e.start = -1;
+  e.size = -1;
+
+  long long lacing = 1;  // default is true
+
+  while (pos < track_stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, track_stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (size < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    const long long start = pos;
+    // Video, Audio and ContentEncodings are only located here, parsed later.
+    if (id == mkvmuxer::kMkvVideo) {
+      v.start = start;
+      v.size = size;
+    } else if (id == mkvmuxer::kMkvAudio) {
+      a.start = start;
+      a.size = size;
+    } else if (id == mkvmuxer::kMkvContentEncodings) {
+      e.start = start;
+      e.size = size;
+    } else if (id == mkvmuxer::kMkvTrackUID) {
+      if (size > 8)
+        return E_FILE_FORMAT_INVALID;
+
+      info.uid = 0;
+
+      long long pos_ = start;
+      const long long pos_end = start + size;
+
+      while (pos_ != pos_end) {
+        unsigned char b;
+
+        const int status = pReader->Read(pos_, 1, &b);
+
+        if (status)
+          return status;
+        // Append the byte as the new least-significant byte of the uid.
+        info.uid <<= 8;
+        info.uid |= b;
+
+        ++pos_;
+      }
+    } else if (id == mkvmuxer::kMkvTrackNumber) {
+      const long long num = UnserializeUInt(pReader, pos, size);
+
+      if ((num <= 0) || (num > 127))
+        return E_FILE_FORMAT_INVALID;
+
+      info.number = static_cast<long>(num);
+    } else if (id == mkvmuxer::kMkvTrackType) {
+      const long long type = UnserializeUInt(pReader, pos, size);
+
+      if ((type <= 0) || (type > 254))
+        return E_FILE_FORMAT_INVALID;
+
+      info.type = static_cast<long>(type);
+    } else if (id == mkvmuxer::kMkvName) {
+      const long status =
+          UnserializeString(pReader, pos, size, info.nameAsUTF8);
+
+      if (status)
+        return status;
+    } else if (id == mkvmuxer::kMkvLanguage) {
+      const long status = UnserializeString(pReader, pos, size, info.language);
+
+      if (status)
+        return status;
+    } else if (id == mkvmuxer::kMkvDefaultDuration) {
+      const long long duration = UnserializeUInt(pReader, pos, size);
+
+      if (duration < 0)
+        return E_FILE_FORMAT_INVALID;
+
+      info.defaultDuration = static_cast<unsigned long long>(duration);
+    } else if (id == mkvmuxer::kMkvCodecID) {
+      const long status = UnserializeString(pReader, pos, size, info.codecId);
+
+      if (status)
+        return status;
+    } else if (id == mkvmuxer::kMkvFlagLacing) {
+      lacing = UnserializeUInt(pReader, pos, size);
+
+      if ((lacing < 0) || (lacing > 1))
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvCodecPrivate) {
+      delete[] info.codecPrivate;  // drop data from an earlier element, if any
+      info.codecPrivate = NULL;
+      info.codecPrivateSize = 0;
+
+      const size_t buflen = static_cast<size_t>(size);
+
+      if (buflen) {
+        unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+
+        if (buf == NULL)
+          return -1;
+
+        const int status = pReader->Read(pos, static_cast<long>(buflen), buf);
+
+        if (status) {
+          delete[] buf;
+          return status;
+        }
+
+        info.codecPrivate = buf;
+        info.codecPrivateSize = buflen;
+      }
+    } else if (id == mkvmuxer::kMkvCodecName) {
+      const long status =
+          UnserializeString(pReader, pos, size, info.codecNameAsUTF8);
+
+      if (status)
+        return status;
+    } else if (id == mkvmuxer::kMkvCodecDelay) {
+      info.codecDelay = UnserializeUInt(pReader, pos, size);
+    } else if (id == mkvmuxer::kMkvSeekPreRoll) {
+      info.seekPreRoll = UnserializeUInt(pReader, pos, size);
+    }
+    // Ids not handled above are skipped along with their payload.
+    pos += size;  // consume payload
+    if (pos > track_stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != track_stop)
+    return E_FILE_FORMAT_INVALID;
+  // A track number and a track type are mandatory.
+  if (info.number <= 0)  // not specified
+    return E_FILE_FORMAT_INVALID;
+
+  if (GetTrackByNumber(info.number))  // number already used by another track
+    return E_FILE_FORMAT_INVALID;
+
+  if (info.type <= 0)  // not specified
+    return E_FILE_FORMAT_INVALID;
+
+  info.lacing = (lacing > 0) ? true : false;
+  // The settings element must agree with the declared track type.
+  if (info.type == Track::kVideo) {
+    if (v.start < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (a.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    info.settings = v;
+
+    VideoTrack* pTrack = NULL;
+
+    const long status = VideoTrack::Parse(m_pSegment, info, element_start,
+                                          element_size, pTrack);
+
+    if (status)
+      return status;
+
+    pResult = pTrack;
+    assert(pResult);
+    // NOTE(review): the status of ParseContentEncodingsEntry is discarded.
+    if (e.start >= 0)
+      pResult->ParseContentEncodingsEntry(e.start, e.size);
+  } else if (info.type == Track::kAudio) {
+    if (a.start < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (v.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    info.settings = a;
+
+    AudioTrack* pTrack = NULL;
+
+    const long status = AudioTrack::Parse(m_pSegment, info, element_start,
+                                          element_size, pTrack);
+
+    if (status)
+      return status;
+
+    pResult = pTrack;
+    assert(pResult);
+    // NOTE(review): the status of ParseContentEncodingsEntry is discarded.
+    if (e.start >= 0)
+      pResult->ParseContentEncodingsEntry(e.start, e.size);
+  } else {
+    // neither video nor audio - probably metadata or subtitles
+
+    if (a.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (v.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (info.type == Track::kMetadata && e.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    info.settings.start = -1;
+    info.settings.size = 0;
+
+    Track* pTrack = NULL;
+
+    const long status =
+        Track::Create(m_pSegment, info, element_start, element_size, pTrack);
+
+    if (status)
+      return status;
+
+    pResult = pTrack;
+    assert(pResult);
+  }
+
+  return 0;  // success
+}
+
+Tracks::~Tracks() {
+  // Destroy every parsed track, then the pointer array that held them.
+  // When Parse() never allocated, both pointers are NULL and the loop
+  // below does not execute; delete[] on NULL is a no-op.
+  Track** const last = m_trackEntriesEnd;
+  for (Track** it = m_trackEntries; it != last; ++it) {
+    delete *it;
+  }
+
+  delete[] m_trackEntries;
+}
+
+const Track* Tracks::GetTrackByNumber(long tn) const {
+  // Linear search of the parsed tracks for the one numbered tn.
+  // Negative numbers are rejected up front and NULL slots are skipped;
+  // NULL is also the answer when no track carries that number.
+  if (tn < 0)
+    return NULL;
+
+  Track** const last = m_trackEntriesEnd;
+
+  for (Track** it = m_trackEntries; it != last; ++it) {
+    Track* const candidate = *it;
+
+    if ((candidate != NULL) && (tn == candidate->GetNumber()))
+      return candidate;
+  }
+
+  // Every slot was examined without finding a match.
+  return NULL;
+}
+
+const Track* Tracks::GetTrackByIndex(unsigned long idx) const {
+  // Position-based lookup; an index at or past the stored count gives NULL.
+  const ptrdiff_t stored = m_trackEntriesEnd - m_trackEntries;
+  const unsigned long limit = static_cast<unsigned long>(stored);
+  if (idx < limit)
+    return m_trackEntries[idx];
+  return NULL;
+}
+
+long Cluster::Load(long long& pos, long& len) const {
+  if (m_pSegment == NULL)
+    return E_PARSE_FAILED;
+  // Reads the cluster header up to its first block to learn the timecode.
+  if (m_timecode >= 0)  // at least partially loaded
+    return 0;
+  // Not loaded yet, so parsing must start at the very beginning of the element.
+  if (m_pos != m_element_start || m_element_size >= 0)
+    return E_PARSE_FAILED;
+  // On E_BUFFER_NOT_FULL, pos and len hold the position/length being read.
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+  long long total, avail;
+  const int status = pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  if (total >= 0 && (avail > total || m_pos > total))
+    return E_FILE_FORMAT_INVALID;
+
+  pos = m_pos;
+
+  long long cluster_size = -1;
+  // Cluster ID: make sure its bytes are available, then verify the value.
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long result = GetUIntLength(pReader, pos, len);
+
+  if (result < 0)  // error or underflow
+    return static_cast<long>(result);
+
+  if (result > 0)
+    return E_BUFFER_NOT_FULL;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long id_ = ReadID(pReader, pos, len);
+
+  if (id_ < 0)  // error
+    return static_cast<long>(id_);
+
+  if (id_ != mkvmuxer::kMkvCluster)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume id
+
+  // read cluster size
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  result = GetUIntLength(pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)
+    return E_BUFFER_NOT_FULL;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long size = ReadUInt(pReader, pos, len);
+
+  if (size < 0)  // error: propagate ReadUInt's own code (was always -1)
+    return static_cast<long>(size);
+
+  if (size == 0)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume length of size of element
+  // An all-ones size field means the cluster size is unknown (stays -1).
+  const long long unknown_size = (1LL << (7 * len)) - 1;
+
+  if (size != unknown_size)
+    cluster_size = size;
+
+  // pos points to start of payload
+  long long timecode = -1;
+  long long new_pos = -1;
+  bool bBlock = false;
+
+  long long cluster_stop = (cluster_size < 0) ? -1 : pos + cluster_size;
+  // Scan children until the first block, or the end of the cluster.
+  for (;;) {
+    if ((cluster_stop >= 0) && (pos >= cluster_stop))
+      break;
+
+    // Parse ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadID(pReader, pos, len);
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    if (id == 0)
+      return E_FILE_FORMAT_INVALID;
+
+    // This is the distinguished set of ID's we use to determine
+    // that we have exhausted the sub-element's inside the cluster
+    // whose ID we parsed earlier.
+
+    if (id == mkvmuxer::kMkvCluster)
+      break;
+
+    if (id == mkvmuxer::kMkvCues)
+      break;
+
+    pos += len;  // consume ID field
+
+    // Parse Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size == unknown_size)  // children must have a known size
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume size field
+
+    if ((cluster_stop >= 0) && (pos > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // pos now points to start of payload
+
+    if (size == 0)
+      continue;
+
+    if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if (id == mkvmuxer::kMkvTimecode) {
+      len = static_cast<long>(size);
+
+      if ((pos + size) > avail)
+        return E_BUFFER_NOT_FULL;
+
+      timecode = UnserializeUInt(pReader, pos, size);
+
+      if (timecode < 0)  // error (or underflow)
+        return static_cast<long>(timecode);
+
+      new_pos = pos + size;
+
+      if (bBlock)
+        break;
+    } else if (id == mkvmuxer::kMkvBlockGroup) {
+      bBlock = true;
+      break;
+    } else if (id == mkvmuxer::kMkvSimpleBlock) {
+      bBlock = true;
+      break;
+    }
+
+    pos += size;  // consume payload
+    if (cluster_stop >= 0 && pos > cluster_stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (cluster_stop >= 0 && pos > cluster_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (timecode < 0)  // no timecode found
+    return E_FILE_FORMAT_INVALID;
+
+  if (!bBlock)  // a cluster without any block is rejected
+    return E_FILE_FORMAT_INVALID;
+
+  m_pos = new_pos;  // designates position just beyond timecode payload
+  m_timecode = timecode;  // m_timecode >= 0 means we're partially loaded
+
+  if (cluster_size >= 0)
+    m_element_size = cluster_stop - m_element_start;
+
+  return 0;
+}
+
+long Cluster::Parse(long long& pos, long& len) const {
+  long status = Load(pos, len);
+
+  if (status < 0)
+    return status;
+
+  if (m_pos < m_element_start || m_timecode < 0)
+    return E_PARSE_FAILED;
+
+  const long long cluster_stop =
+      (m_element_size < 0) ? -1 : m_element_start + m_element_size;
+
+  if ((cluster_stop >= 0) && (m_pos >= cluster_stop))
+    return 1;  // nothing else to do
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long total, avail;
+
+  status = pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  if (total >= 0 && avail > total)
+    return E_FILE_FORMAT_INVALID;
+
+  pos = m_pos;
+
+  for (;;) {
+    if ((cluster_stop >= 0) && (pos >= cluster_stop))
+      break;
+
+    if ((total >= 0) && (pos >= total)) {
+      if (m_element_size < 0)
+        m_element_size = pos - m_element_start;
+
+      break;
+    }
+
+    // Parse ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadID(pReader, pos, len);
+
+    if (id < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    // This is the distinguished set of ID's we use to determine
+    // that we have exhausted the sub-element's inside the cluster
+    // whose ID we parsed earlier.
+
+    if ((id == mkvmuxer::kMkvCluster) || (id == mkvmuxer::kMkvCues)) {
+      if (m_element_size < 0)
+        m_element_size = pos - m_element_start;
+
+      break;
+    }
+
+    pos += len;  // consume ID field
+
+    // Parse Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size == unknown_size)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume size field
+
+    if ((cluster_stop >= 0) && (pos > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // pos now points to start of payload
+
+    if (size == 0)
+      continue;
+
+    // const long long block_start = pos;
+    const long long block_stop = pos + size;
+
+    if (cluster_stop >= 0) {
+      if (block_stop > cluster_stop) {
+        if (id == mkvmuxer::kMkvBlockGroup ||
+            id == mkvmuxer::kMkvSimpleBlock) {
+          return E_FILE_FORMAT_INVALID;
+        }
+
+        pos = cluster_stop;
+        break;
+      }
+    } else if ((total >= 0) && (block_stop > total)) {
+      m_element_size = total - m_element_start;
+      pos = total;
+      break;
+    } else if (block_stop > avail) {
+      len = static_cast<long>(size);
+      return E_BUFFER_NOT_FULL;
+    }
+
+    Cluster* const this_ = const_cast<Cluster*>(this);
+
+    if (id == mkvmuxer::kMkvBlockGroup)
+      return this_->ParseBlockGroup(size, pos, len);
+
+    if (id == mkvmuxer::kMkvSimpleBlock)
+      return this_->ParseSimpleBlock(size, pos, len);
+
+    pos += size;  // consume payload
+    if (cluster_stop >= 0 && pos > cluster_stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (m_element_size < 1)
+    return E_FILE_FORMAT_INVALID;
+
+  m_pos = pos;
+  if (cluster_stop >= 0 && m_pos > cluster_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (m_entries_count > 0) {
+    const long idx = m_entries_count - 1;
+
+    const BlockEntry* const pLast = m_entries[idx];
+    if (pLast == NULL)
+      return E_PARSE_FAILED;
+
+    const Block* const pBlock = pLast->GetBlock();
+    if (pBlock == NULL)
+      return E_PARSE_FAILED;
+
+    const long long start = pBlock->m_start;
+
+    if ((total >= 0) && (start > total))
+      return E_PARSE_FAILED;  // defend against truncated stream
+
+    const long long size = pBlock->m_size;
+
+    const long long stop = start + size;
+    if (cluster_stop >= 0 && stop > cluster_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((total >= 0) && (stop > total))
+      return E_PARSE_FAILED;  // defend against truncated stream
+  }
+
+  return 1;  // no more entries
+}
+// Validates a SimpleBlock header, then records the block via CreateBlock().
+long Cluster::ParseSimpleBlock(long long block_size, long long& pos,
+                               long& len) {
+  const long long block_start = pos;
+  const long long block_stop = pos + block_size;
+  // Returns 0 on success; any other value is an error or underflow code.
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long total, avail;
+
+  long status = pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  // parse track number
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long result = GetUIntLength(pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // weird
+    return E_BUFFER_NOT_FULL;
+
+  if ((pos + len) > block_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long track = ReadUInt(pReader, pos, len);
+
+  if (track < 0)  // error
+    return static_cast<long>(track);
+
+  if (track == 0)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume track number
+
+  if ((pos + 2) > block_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + 2) > avail) {
+    len = 2;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  pos += 2;  // consume timecode
+
+  if ((pos + 1) > block_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  unsigned char flags;
+
+  status = pReader->Read(pos, 1, &flags);
+
+  if (status < 0) {  // error or underflow
+    len = 1;
+    return status;
+  }
+
+  ++pos;  // consume flags byte
+  assert(pos <= avail);
+
+  if (pos >= block_stop)
+    return E_FILE_FORMAT_INVALID;
+  // Bits 1-2 of the flags byte hold the lacing mode; 0 means no lacing.
+  const int lacing = int(flags & 0x06) >> 1;
+  // A laced block must be fully available before CreateBlock() is called.
+  if ((lacing != 0) && (block_stop > avail)) {
+    len = static_cast<long>(block_stop - pos);
+    return E_BUFFER_NOT_FULL;
+  }
+
+  status = CreateBlock(mkvmuxer::kMkvSimpleBlock,
+                       block_start, block_size,
+                       0);  // DiscardPadding
+
+  if (status != 0)
+    return status;
+  // The block is consumed: the cluster's parse position moves past it.
+  m_pos = block_stop;
+
+  return 0;  // success
+}
+// Validates every child of a BlockGroup, then records it via CreateBlock().
+long Cluster::ParseBlockGroup(long long payload_size, long long& pos,
+                              long& len) {
+  const long long payload_start = pos;
+  const long long payload_stop = pos + payload_size;
+  // Returns 0 on success; any other value is an error or underflow code.
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long total, avail;
+
+  long status = pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  if ((total >= 0) && (payload_stop > total))
+    return E_FILE_FORMAT_INVALID;
+
+  if (payload_stop > avail) {
+    len = static_cast<long>(payload_size);
+    return E_BUFFER_NOT_FULL;
+  }
+  // Stays 0 unless a DiscardPadding child is found in the loop below.
+  long long discard_padding = 0;
+
+  while (pos < payload_stop) {
+    // parse sub-block element ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((pos + len) > payload_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadID(pReader, pos, len);
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    if (id == 0)  // not a valid ID
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume ID field
+
+    // Parse Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((pos + len) > payload_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume size field
+
+    // pos now points to start of sub-block group payload
+
+    if (pos > payload_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    if (size == 0)  // weird
+      continue;
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size == unknown_size)
+      return E_FILE_FORMAT_INVALID;
+
+    if (id == mkvmuxer::kMkvDiscardPadding) {
+      status = UnserializeInt(pReader, pos, size, discard_padding);
+
+      if (status < 0)  // error
+        return status;
+    }
+    // Children other than Block are skipped; only a Block's header is checked.
+    if (id != mkvmuxer::kMkvBlock) {
+      pos += size;  // consume sub-part of block group
+
+      if (pos > payload_stop)
+        return E_FILE_FORMAT_INVALID;
+
+      continue;
+    }
+
+    const long long block_stop = pos + size;
+
+    if (block_stop > payload_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    // parse track number
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((pos + len) > block_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long track = ReadUInt(pReader, pos, len);
+
+    if (track < 0)  // error
+      return static_cast<long>(track);
+
+    if (track == 0)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume track number
+
+    if ((pos + 2) > block_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + 2) > avail) {
+      len = 2;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    pos += 2;  // consume timecode
+
+    if ((pos + 1) > block_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    unsigned char flags;
+
+    status = pReader->Read(pos, 1, &flags);
+
+    if (status < 0) {  // error or underflow
+      len = 1;
+      return status;
+    }
+
+    ++pos;  // consume flags byte
+    assert(pos <= avail);
+
+    if (pos >= block_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    const int lacing = int(flags & 0x06) >> 1;
+
+    if ((lacing != 0) && (block_stop > avail)) {
+      len = static_cast<long>(block_stop - pos);
+      return E_BUFFER_NOT_FULL;
+    }
+
+    pos = block_stop;  // consume block-part of block group
+    if (pos > payload_stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != payload_stop)
+    return E_FILE_FORMAT_INVALID;
+  // All children checked: hand the whole group payload to CreateBlock().
+  status = CreateBlock(mkvmuxer::kMkvBlockGroup,
+                       payload_start, payload_size, discard_padding);
+  if (status != 0)
+    return status;
+
+  m_pos = payload_stop;
+
+  return 0;  // success
+}
+// Fetches the already-parsed entry at |index|; never parses further itself.
+long Cluster::GetEntry(long index, const mkvparser::BlockEntry*& pEntry) const {
+  assert(m_pos >= m_element_start);
+  // Returns 1 if found, 0 if nothing is left to parse, else an error code.
+  pEntry = NULL;
+
+  if (index < 0)
+    return -1;  // generic error
+
+  if (m_entries_count < 0)
+    return E_BUFFER_NOT_FULL;
+
+  assert(m_entries);
+  assert(m_entries_size > 0);
+  assert(m_entries_count <= m_entries_size);
+
+  if (index < m_entries_count) {
+    pEntry = m_entries[index];
+    assert(pEntry);
+
+    return 1;  // found entry
+  }
+
+  if (m_element_size < 0)  // we don't know cluster end yet
+    return E_BUFFER_NOT_FULL;  // underflow
+
+  const long long element_stop = m_element_start + m_element_size;
+
+  if (m_pos >= element_stop)
+    return 0;  // nothing left to parse
+
+  return E_BUFFER_NOT_FULL;  // underflow, since more remains to be parsed
+}
+
+Cluster* Cluster::Create(Segment* pSegment, long idx, long long off) {
+  // Factory for a not-yet-parsed cluster located |off| bytes past the
+  // segment's m_start.  Yields NULL for a null segment, a negative offset,
+  // or when the nothrow allocation fails.
+  if (pSegment == NULL)
+    return NULL;
+  if (off < 0)
+    return NULL;
+
+  return new (std::nothrow) Cluster(pSegment, idx, pSegment->m_start + off);
+}
+// Default-constructs a cluster with no segment, i.e. one for which EOS() holds.
+Cluster::Cluster()
+    : m_pSegment(NULL),
+      m_element_start(0),
+      m_index(0),
+      m_pos(0),
+      m_element_size(0),
+      m_timecode(0),
+      m_entries(NULL),
+      m_entries_size(0),
+      m_entries_count(0)  // means "no entries"
+{}
+// Creates an unparsed cluster: size, timecode and entry count start at -1.
+Cluster::Cluster(Segment* pSegment, long idx, long long element_start
+                 /* long long element_size */)
+    : m_pSegment(pSegment),
+      m_element_start(element_start),
+      m_index(idx),
+      m_pos(element_start),
+      m_element_size(-1 /* element_size */),
+      m_timecode(-1),
+      m_entries(NULL),
+      m_entries_size(0),
+      m_entries_count(-1)  // means "has not been parsed yet"
+{}
+
+Cluster::~Cluster() {
+  // The pointer table is allocated before the first entry is counted, so it
+  // must be released even when no entry was ever stored successfully.
+  if (m_entries_count > 0) {
+    BlockEntry** i = m_entries;
+    BlockEntry** const j = m_entries + m_entries_count;
+
+    while (i != j) {
+      BlockEntry* p = *i++;
+      assert(p);
+      delete p;
+    }
+  }
+
+  delete[] m_entries;  // safe when m_entries is NULL
+}
+// True for the end-of-stream cluster, which has no owning segment.
+bool Cluster::EOS() const { return (m_pSegment == NULL); }
+// Index value stored when the cluster was constructed.
+long Cluster::GetIndex() const { return m_index; }
+// Cluster offset relative to the segment's m_start; dereferences m_pSegment.
+long long Cluster::GetPosition() const {
+  const long long pos = m_element_start - m_pSegment->m_start;
+  assert(pos >= 0);
+
+  return pos;
+}
+// Size of the cluster element; negative while the size is still unknown.
+long long Cluster::GetElementSize() const { return m_element_size; }
+// Scans the cluster at |off| to report whether it holds any block entry.
+long Cluster::HasBlockEntries(
+    const Segment* pSegment,
+    long long off,  // relative to start of segment payload
+    long long& pos, long& len) {
+  assert(pSegment);
+  assert(off >= 0);  // relative to segment
+  // Returns 1 if a block entry is found, 0 if none is, else an error code.
+  IMkvReader* const pReader = pSegment->m_pReader;
+
+  long long total, avail;
+
+  long status = pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  pos = pSegment->m_start + off;  // absolute
+
+  if ((total >= 0) && (pos >= total))
+    return 0;  // we don't even have a complete cluster
+
+  const long long segment_stop =
+      (pSegment->m_size < 0) ? -1 : pSegment->m_start + pSegment->m_size;
+
+  long long cluster_stop = -1;  // interpreted later to mean "unknown size"
+  // Step 1: read the Cluster ID and size to learn where the cluster ends.
+  {
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // need more data
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((total >= 0) && ((pos + len) > total))
+      return 0;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadID(pReader, pos, len);
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    if (id != mkvmuxer::kMkvCluster)
+      return E_PARSE_FAILED;
+
+    pos += len;  // consume Cluster ID field
+
+    // read size field
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((total >= 0) && ((pos + len) > total))
+      return 0;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    if (size == 0)
+      return 0;  // cluster does not have entries
+
+    pos += len;  // consume size field
+
+    // pos now points to start of payload
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size != unknown_size) {
+      cluster_stop = pos + size;
+      assert(cluster_stop >= 0);
+
+      if ((segment_stop >= 0) && (cluster_stop > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      if ((total >= 0) && (cluster_stop > total))
+        // return E_FILE_FORMAT_INVALID;  //too conservative
+        return 0;  // cluster does not have any entries
+    }
+  }
+  // Step 2: walk the children until a block entry or the cluster end is met.
+  for (;;) {
+    if ((cluster_stop >= 0) && (pos >= cluster_stop))
+      return 0;  // no entries detected
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // need more data
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadID(pReader, pos, len);
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    // This is the distinguished set of ID's we use to determine
+    // that we have exhausted the sub-element's inside the cluster
+    // whose ID we parsed earlier.
+
+    if (id == mkvmuxer::kMkvCluster)
+      return 0;  // no entries found
+
+    if (id == mkvmuxer::kMkvCues)
+      return 0;  // no entries found
+
+    pos += len;  // consume id field
+
+    if ((cluster_stop >= 0) && (pos >= cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // read size field
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // underflow
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume size field
+
+    // pos now points to start of payload
+
+    if ((cluster_stop >= 0) && (pos > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if (size == 0)  // weird
+      continue;
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size == unknown_size)
+      return E_FILE_FORMAT_INVALID;  // not supported inside cluster
+
+    if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if (id == mkvmuxer::kMkvBlockGroup)
+      return 1;  // have at least one entry
+
+    if (id == mkvmuxer::kMkvSimpleBlock)
+      return 1;  // have at least one entry
+
+    pos += size;  // consume payload
+    if (cluster_stop >= 0 && pos > cluster_stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+}
+
+long long Cluster::GetTimeCode() const {
+  // Runs Load() first and passes any negative status straight through;
+  // otherwise reports m_timecode.  Load()'s position/length outputs are
+  // not needed here.
+  long long unused_pos;
+  long unused_len;
+  const long load_status = Load(unused_pos, unused_len);
+
+  if (load_status < 0)
+    return load_status;
+  return m_timecode;
+}
+
+long long Cluster::GetTime() const {
+  // Scales the cluster timecode by the segment's timecode scale.  A
+  // negative GetTimeCode() result is treated as an error and returned
+  // unchanged.
+  const long long timecode = GetTimeCode();
+
+  if (timecode < 0)
+    return timecode;
+
+  const SegmentInfo* const info = m_pSegment->GetInfo();
+  assert(info);
+
+  const long long timecode_scale = info->GetTimeCodeScale();
+  assert(timecode_scale >= 1);
+  return m_timecode * timecode_scale;
+}
+
+long long Cluster::GetFirstTime() const {
+  // Time of the first block entry, or the cluster's own time when the
+  // cluster holds no entries.  Errors from GetFirst() are passed through.
+  const BlockEntry* first_entry;
+  const long result = GetFirst(first_entry);
+
+  if (result < 0)
+    return result;
+
+  if (first_entry == NULL)
+    return GetTime();
+
+  const Block* const block = first_entry->GetBlock();
+  assert(block);
+  return block->GetTime(this);
+}
+
+long long Cluster::GetLastTime() const {
+  // Time of the last block entry, or the cluster's own time when the
+  // cluster holds no entries.  Errors from GetLast() are passed through.
+  const BlockEntry* last_entry;
+  const long result = GetLast(last_entry);
+
+  if (result < 0)
+    return result;
+
+  if (last_entry == NULL)
+    return GetTime();
+
+  const Block* const block = last_entry->GetBlock();
+  assert(block);
+  return block->GetTime(this);
+}
+// Makes room in the entry table, then builds the entry for the given ID.
+long Cluster::CreateBlock(long long id,
+                          long long pos,  // absolute pos of payload
+                          long long size, long long discard_padding) {
+  if (id != mkvmuxer::kMkvBlockGroup && id != mkvmuxer::kMkvSimpleBlock)
+    return E_PARSE_FAILED;
+  // The table starts at 1024 slots and doubles whenever it is full.
+  if (m_entries_count < 0) {  // haven't parsed anything yet
+    assert(m_entries == NULL);
+    assert(m_entries_size == 0);
+
+    m_entries_size = 1024;
+    m_entries = new (std::nothrow) BlockEntry*[m_entries_size];
+    if (m_entries == NULL)
+      return -1;
+
+    m_entries_count = 0;
+  } else {
+    assert(m_entries);
+    assert(m_entries_size > 0);
+    assert(m_entries_count <= m_entries_size);
+
+    if (m_entries_count >= m_entries_size) {
+      const long entries_size = 2 * m_entries_size;
+
+      BlockEntry** const entries = new (std::nothrow) BlockEntry*[entries_size];
+      if (entries == NULL)
+        return -1;
+
+      BlockEntry** src = m_entries;
+      BlockEntry** const src_end = src + m_entries_count;
+
+      BlockEntry** dst = entries;
+
+      while (src != src_end)
+        *dst++ = *src++;
+
+      delete[] m_entries;
+
+      m_entries = entries;
+      m_entries_size = entries_size;
+    }
+  }
+
+  if (id == mkvmuxer::kMkvBlockGroup)
+    return CreateBlockGroup(pos, size, discard_padding);
+  else
+    return CreateSimpleBlock(pos, size);
+}
+// Scans a BlockGroup payload for its Block and timing children, then adds it.
+long Cluster::CreateBlockGroup(long long start_offset, long long size,
+                               long long discard_padding) {
+  assert(m_entries);
+  assert(m_entries_size > 0);
+  assert(m_entries_count >= 0);
+  assert(m_entries_count < m_entries_size);
+  // Returns 0 on success; any other value is an error and adds no entry.
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = start_offset;
+  const long long stop = start_offset + size;
+
+  // For WebM files, there is a bias towards previous reference times
+  // (in order to support alt-ref frames, which refer back to the previous
+  // keyframe).  Normally a 0 value is not possible, but here we tentatively
+  // allow 0 as the value of a reference frame, with the interpretation
+  // that this is a "previous" reference time.
+
+  long long prev = 1;  // nonce
+  long long next = 0;  // nonce
+  long long duration = -1;  // really, this is unsigned
+
+  long long bpos = -1;
+  long long bsize = -1;
+
+  while (pos < stop) {
+    long len;
+    const long long id = ReadID(pReader, pos, len);
+    if (id < 0 || (pos + len) > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume ID
+
+    const long long size = ReadUInt(pReader, pos, len);
+    if (size < 0 || (pos + len) > stop)  // was assert-only: reject bad input
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume size
+
+    if (id == mkvmuxer::kMkvBlock) {
+      if (bpos < 0) {  // Block ID
+        bpos = pos;
+        bsize = size;
+      }
+    } else if (id == mkvmuxer::kMkvBlockDuration) {
+      if (size > 8)
+        return E_FILE_FORMAT_INVALID;
+
+      duration = UnserializeUInt(pReader, pos, size);
+
+      if (duration < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == mkvmuxer::kMkvReferenceBlock) {
+      if (size > 8 || size <= 0)
+        return E_FILE_FORMAT_INVALID;
+      const long size_ = static_cast<long>(size);
+
+      long long time;
+
+      long status = UnserializeInt(pReader, pos, size_, time);
+      // A failed read is reported to the caller rather than asserted on.
+      if (status != 0)
+        return -1;
+
+      if (time <= 0)  // see note above
+        prev = time;
+      else
+        next = time;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+  if (bpos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+  assert(bsize >= 0);
+
+  const long idx = m_entries_count;
+
+  BlockEntry** const ppEntry = m_entries + idx;
+  BlockEntry*& pEntry = *ppEntry;
+
+  pEntry = new (std::nothrow)
+      BlockGroup(this, idx, bpos, bsize, prev, next, duration, discard_padding);
+
+  if (pEntry == NULL)
+    return -1;  // generic error
+
+  BlockGroup* const p = static_cast<BlockGroup*>(pEntry);
+
+  const long status = p->Parse();
+
+  if (status == 0) {  // success
+    ++m_entries_count;
+    return 0;
+  }
+
+  delete pEntry;
+  pEntry = 0;
+
+  return status;
+}
+// Allocates a SimpleBlock in the next free table slot and parses it.
+long Cluster::CreateSimpleBlock(long long st, long long sz) {
+  assert(m_entries);
+  assert(m_entries_size > 0);
+  assert(m_entries_count >= 0);
+  assert(m_entries_count < m_entries_size);
+
+  const long idx = m_entries_count;
+
+  BlockEntry** const ppEntry = m_entries + idx;
+  BlockEntry*& pEntry = *ppEntry;
+
+  pEntry = new (std::nothrow) SimpleBlock(this, idx, st, sz);
+
+  if (pEntry == NULL)
+    return -1;  // generic error
+
+  SimpleBlock* const p = static_cast<SimpleBlock*>(pEntry);
+
+  const long status = p->Parse();
+  // The entry only counts once it parses; otherwise it is destroyed below.
+  if (status == 0) {
+    ++m_entries_count;
+    return 0;
+  }
+
+  delete pEntry;
+  pEntry = 0;
+
+  return status;
+}
+// Sets pFirst to the first entry (NULL if none), calling Parse() once if needed.
+long Cluster::GetFirst(const BlockEntry*& pFirst) const {
+  if (m_entries_count <= 0) {
+    long long pos;
+    long len;
+
+    const long status = Parse(pos, len);
+
+    if (status < 0) {  // error
+      pFirst = NULL;
+      return status;
+    }
+    // Parse() did not fail, but it may still have produced no entry.
+    if (m_entries_count <= 0) {  // empty cluster
+      pFirst = NULL;
+      return 0;
+    }
+  }
+
+  assert(m_entries);
+
+  pFirst = m_entries[0];
+  assert(pFirst);
+
+  return 0;  // success
+}
+// Parses the whole cluster, then sets pLast to its last entry (NULL if none).
+long Cluster::GetLast(const BlockEntry*& pLast) const {
+  for (;;) {
+    long long pos;
+    long len;
+
+    const long status = Parse(pos, len);
+
+    if (status < 0) {  // error
+      pLast = NULL;
+      return status;
+    }
+
+    if (status > 0)  // no new block
+      break;
+  }
+
+  if (m_entries_count <= 0) {
+    pLast = NULL;
+    return 0;
+  }
+
+  assert(m_entries);
+
+  const long idx = m_entries_count - 1;
+
+  pLast = m_entries[idx];
+  assert(pLast);
+
+  return 0;
+}
+// Sets pNext to the entry after pCurr, parsing one more block if required.
+long Cluster::GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const {
+  assert(pCurr);
+  assert(m_entries);
+  assert(m_entries_count > 0);
+
+  size_t idx = pCurr->GetIndex();
+  assert(idx < size_t(m_entries_count));
+  assert(m_entries[idx] == pCurr);
+
+  ++idx;
+
+  if (idx >= size_t(m_entries_count)) {
+    long long pos;
+    long len;
+
+    const long status = Parse(pos, len);
+
+    if (status < 0) {  // error
+      pNext = NULL;
+      return status;
+    }
+    // A positive status means the cluster is exhausted: there is no next entry.
+    if (status > 0) {
+      pNext = NULL;
+      return 0;
+    }
+
+    assert(m_entries);
+    assert(m_entries_count > 0);
+    assert(idx < size_t(m_entries_count));
+  }
+
+  pNext = m_entries[idx];
+  assert(pNext);
+
+  return 0;
+}
+// Number of entries parsed so far; negative before any block has been parsed.
+long Cluster::GetEntryCount() const { return m_entries_count; }
+// Picks the entry on pTrack for time_ns; returns 0 if parsing the cluster fails.
+const BlockEntry* Cluster::GetEntry(const Track* pTrack,
+                                    long long time_ns) const {
+  assert(pTrack);
+
+  if (m_pSegment == NULL)  // this is the special EOS cluster
+    return pTrack->GetEOS();
+  // Best match so far: the track's EOS entry until a vetted block qualifies.
+  const BlockEntry* pResult = pTrack->GetEOS();
+
+  long index = 0;
+
+  for (;;) {
+    if (index >= m_entries_count) {
+      long long pos;
+      long len;
+
+      const long status = Parse(pos, len);
+
+      // Do not assert here: Parse() can fail on truncated/malformed input.
+      if (status < 0)
+        return 0;
+
+      if (status > 0)  // completely parsed, and no more entries
+        return pResult;
+
+      assert(m_entries);
+      assert(index < m_entries_count);
+    }
+
+    const BlockEntry* const pEntry = m_entries[index];
+    assert(pEntry);
+    assert(!pEntry->EOS());
+
+    const Block* const pBlock = pEntry->GetBlock();
+    assert(pBlock);
+
+    if (pBlock->GetTrackNumber() != pTrack->GetNumber()) {
+      ++index;
+      continue;
+    }
+
+    if (pTrack->VetEntry(pEntry)) {
+      if (time_ns < 0)  // just want first candidate block
+        return pEntry;
+
+      const long long ns = pBlock->GetTime(this);
+
+      if (ns > time_ns)
+        return pResult;
+
+      pResult = pEntry;  // have a candidate
+    } else if (time_ns >= 0) {
+      const long long ns = pBlock->GetTime(this);
+
+      if (ns > time_ns)
+        return pResult;
+    }
+
+    ++index;
+  }
+}
+// Resolves a cue point's track position to a block entry, or NULL if none fits.
+const BlockEntry* Cluster::GetEntry(const CuePoint& cp,
+                                    const CuePoint::TrackPosition& tp) const {
+  assert(m_pSegment);
+  const long long tc = cp.GetTimeCode();
+  // Fast path: tp.m_block is a 1-based entry number; check track and timecode.
+  if (tp.m_block > 0) {
+    const long block = static_cast<long>(tp.m_block);
+    const long index = block - 1;
+
+    while (index >= m_entries_count) {
+      long long pos;
+      long len;
+
+      const long status = Parse(pos, len);
+
+      if (status < 0)  // TODO: can this happen?
+        return NULL;
+
+      if (status > 0)  // nothing remains to be parsed
+        return NULL;
+    }
+
+    const BlockEntry* const pEntry = m_entries[index];
+    assert(pEntry);
+    assert(!pEntry->EOS());
+
+    const Block* const pBlock = pEntry->GetBlock();
+    assert(pBlock);
+
+    if ((pBlock->GetTrackNumber() == tp.m_track) &&
+        (pBlock->GetTimeCode(this) == tc)) {
+      return pEntry;
+    }
+  }
+  // Otherwise search linearly for the first block on the track at timecode tc.
+  long index = 0;
+
+  for (;;) {
+    if (index >= m_entries_count) {
+      long long pos;
+      long len;
+
+      const long status = Parse(pos, len);
+
+      if (status < 0)  // TODO: can this happen?
+        return NULL;
+
+      if (status > 0)  // nothing remains to be parsed
+        return NULL;
+
+      assert(m_entries);
+      assert(index < m_entries_count);
+    }
+
+    const BlockEntry* const pEntry = m_entries[index];
+    assert(pEntry);
+    assert(!pEntry->EOS());
+
+    const Block* const pBlock = pEntry->GetBlock();
+    assert(pBlock);
+
+    if (pBlock->GetTrackNumber() != tp.m_track) {
+      ++index;
+      continue;
+    }
+
+    const long long tc_ = pBlock->GetTimeCode(this);
+
+    if (tc_ < tc) {
+      ++index;
+      continue;
+    }
+
+    if (tc_ > tc)
+      return NULL;
+
+    const Tracks* const pTracks = m_pSegment->GetTracks();
+    assert(pTracks);
+
+    const long tn = static_cast<long>(tp.m_track);
+    const Track* const pTrack = pTracks->GetTrackByNumber(tn);
+
+    if (pTrack == NULL)
+      return NULL;
+
+    const long long type = pTrack->GetType();
+
+    if (type == 2)  // audio
+      return pEntry;
+
+    if (type != 1)  // not video
+      return NULL;
+
+    if (!pBlock->IsKey())
+      return NULL;
+
+    return pEntry;
+  }
+}
+// BlockEntry: base-class state (owning cluster and index) used by its subtypes.
+BlockEntry::BlockEntry(Cluster* p, long idx) : m_pCluster(p), m_index(idx) {}
+BlockEntry::~BlockEntry() {}
+bool BlockEntry::EOS() const { return (GetKind() == kBlockEOS); }
+const Cluster* BlockEntry::GetCluster() const { return m_pCluster; }
+long BlockEntry::GetIndex() const { return m_index; }
+// SimpleBlock: a BlockEntry wrapping one Block, built with discard padding 0.
+SimpleBlock::SimpleBlock(Cluster* pCluster, long idx, long long start,
+                         long long size)
+    : BlockEntry(pCluster, idx), m_block(start, size, 0) {}
+
+long SimpleBlock::Parse() { return m_block.Parse(m_pCluster); }
+BlockEntry::Kind SimpleBlock::GetKind() const { return kBlockSimple; }
+const Block* SimpleBlock::GetBlock() const { return &m_block; }
+// BlockGroup: a Block plus its prev/next reference timecodes and duration.
+BlockGroup::BlockGroup(Cluster* pCluster, long idx, long long block_start,
+                       long long block_size, long long prev, long long next,
+                       long long duration, long long discard_padding)
+    : BlockEntry(pCluster, idx),
+      m_block(block_start, block_size, discard_padding),
+      m_prev(prev),
+      m_next(next),
+      m_duration(duration) {}
+
+long BlockGroup::Parse() {
+  const long status = m_block.Parse(m_pCluster);
+
+  if (status)
+    return status;
+  // With CreateBlockGroup's nonces (prev=1, next=0) this means "no references".
+  m_block.SetKey((m_prev > 0) && (m_next <= 0));
+
+  return 0;
+}
+
+BlockEntry::Kind BlockGroup::GetKind() const { return kBlockGroup; }
+const Block* BlockGroup::GetBlock() const { return &m_block; }
+long long BlockGroup::GetPrevTimeCode() const { return m_prev; }
+long long BlockGroup::GetNextTimeCode() const { return m_next; }
+long long BlockGroup::GetDurationTimeCode() const { return m_duration; }
+// Block: starts unparsed (timecode -1, no frames, frame count -1).
+Block::Block(long long start, long long size_, long long discard_padding)
+    : m_start(start),
+      m_size(size_),
+      m_track(0),
+      m_timecode(-1),
+      m_flags(0),
+      m_frames(NULL),
+      m_frame_count(-1),
+      m_discard_padding(discard_padding) {}
+// Releases the frame array; m_frames is NULL until Parse() allocates it.
+Block::~Block() { delete[] m_frames; }
+
+// Parses the payload in [m_start, m_start + m_size): track number, 2-byte
+// timecode, flags byte, then the frame table for the lacing mode given by the
+// flags. Returns 0 on success and a negative value on failure.
+long Block::Parse(const Cluster* pCluster) {
+  if (pCluster == NULL || pCluster->m_pSegment == NULL)
+    return -1;
+
+  assert(m_start >= 0);
+  assert(m_size >= 0);
+  assert(m_track <= 0);
+  assert(m_frames == NULL);
+  assert(m_frame_count <= 0);
+
+  long long pos = m_start;
+  const long long stop = m_start + m_size;
+
+  long len;
+
+  IMkvReader* const pReader = pCluster->m_pSegment->m_pReader;
+
+  m_track = ReadUInt(pReader, pos, len);
+
+  if (m_track <= 0)
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > stop)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume track number
+
+  if ((stop - pos) < 2)
+    return E_FILE_FORMAT_INVALID;
+
+  long status;
+  long long value;
+
+  status = UnserializeInt(pReader, pos, 2, value);
+
+  if (status)
+    return E_FILE_FORMAT_INVALID;
+
+  if (value < SHRT_MIN)
+    return E_FILE_FORMAT_INVALID;
+
+  if (value > SHRT_MAX)
+    return E_FILE_FORMAT_INVALID;
+
+  m_timecode = static_cast<short>(value);
+
+  pos += 2;
+
+  if ((stop - pos) <= 0)
+    return E_FILE_FORMAT_INVALID;
+
+  status = pReader->Read(pos, 1, &m_flags);
+
+  if (status)
+    return E_FILE_FORMAT_INVALID;
+
+  const int lacing = int(m_flags & 0x06) >> 1;  // bits 1-2 select the lacing mode
+
+  ++pos;  // consume flags byte
+
+  if (lacing == 0) {  // no lacing
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    m_frame_count = 1;
+    m_frames = new (std::nothrow) Frame[m_frame_count];
+    if (m_frames == NULL)
+      return -1;
+
+    Frame& f = m_frames[0];
+    f.pos = pos;
+
+    const long long frame_size = stop - pos;
+
+    if (frame_size > LONG_MAX || frame_size <= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    f.len = static_cast<long>(frame_size);
+
+    return 0;  // success
+  }
+
+  if (pos >= stop)
+    return E_FILE_FORMAT_INVALID;
+
+  unsigned char biased_count;
+
+  status = pReader->Read(pos, 1, &biased_count);
+
+  if (status)
+    return E_FILE_FORMAT_INVALID;
+
+  ++pos;  // consume frame count
+  if (pos > stop)
+    return E_FILE_FORMAT_INVALID;
+
+  m_frame_count = int(biased_count) + 1;  // stored byte is (frame count - 1)
+
+  m_frames = new (std::nothrow) Frame[m_frame_count];
+  if (m_frames == NULL)
+    return -1;
+
+  // Every early return below leaves m_frames allocated and only partly filled
+  // in; ~Block() releases the array.
+
+  if (lacing == 1) {  // Xiph
+    Frame* pf = m_frames;
+    Frame* const pf_end = pf + m_frame_count;
+
+    long long size = 0;
+    int frame_count = m_frame_count;
+
+    while (frame_count > 1) {
+      long frame_size = 0;
+
+      for (;;) {
+        unsigned char val;
+
+        if (pos >= stop)
+          return E_FILE_FORMAT_INVALID;
+
+        status = pReader->Read(pos, 1, &val);
+        if (status)
+          return E_FILE_FORMAT_INVALID;
+
+        ++pos;  // consume xiph size byte
+        if (frame_size > LONG_MAX - val)  // adding val would overflow long
+          return E_FILE_FORMAT_INVALID;
+
+        frame_size += val;
+        if (val < 255)
+          break;
+      }
+
+      Frame& f = *pf++;
+      assert(pf < pf_end);
+      if (pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      f.pos = 0;  // patch later
+
+      if (frame_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      f.len = frame_size;
+      size += frame_size;  // contribution of this frame
+
+      --frame_count;
+    }
+
+    if (pf >= pf_end || pos > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    {
+      Frame& f = *pf++;
+
+      if (pf != pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      f.pos = 0;  // patch later
+
+      const long long total_size = stop - pos;
+
+      if (total_size < size)
+        return E_FILE_FORMAT_INVALID;
+
+      const long long frame_size = total_size - size;
+
+      if (frame_size > LONG_MAX || frame_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      f.len = static_cast<long>(frame_size);
+    }
+
+    pf = m_frames;
+    while (pf != pf_end) {
+      Frame& f = *pf++;
+      assert((pos + f.len) <= stop);
+
+      if ((pos + f.len) > stop)
+        return E_FILE_FORMAT_INVALID;
+
+      f.pos = pos;
+      pos += f.len;
+    }
+
+    assert(pos == stop);
+    if (pos != stop)
+      return E_FILE_FORMAT_INVALID;
+
+  } else if (lacing == 2) {  // fixed-size lacing
+    if (pos >= stop)
+      return E_FILE_FORMAT_INVALID;
+
+    const long long total_size = stop - pos;
+
+    if ((total_size % m_frame_count) != 0)
+      return E_FILE_FORMAT_INVALID;
+
+    const long long frame_size = total_size / m_frame_count;
+
+    if (frame_size > LONG_MAX || frame_size <= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    Frame* pf = m_frames;
+    Frame* const pf_end = pf + m_frame_count;
+
+    while (pf != pf_end) {
+      assert((pos + frame_size) <= stop);
+      if ((pos + frame_size) > stop)
+        return E_FILE_FORMAT_INVALID;
+
+      Frame& f = *pf++;
+
+      f.pos = pos;
+      f.len = static_cast<long>(frame_size);
+
+      pos += frame_size;
+    }
+
+    assert(pos == stop);
+    if (pos != stop)
+      return E_FILE_FORMAT_INVALID;
+
+  } else {
+    assert(lacing == 3);  // EBML lacing
+
+    if (pos >= stop)
+      return E_FILE_FORMAT_INVALID;
+
+    long long size = 0;
+    int frame_count = m_frame_count;
+
+    long long frame_size = ReadUInt(pReader, pos, len);
+
+    if (frame_size <= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (frame_size > LONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume length of size of first frame
+
+    if ((pos + frame_size) > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    Frame* pf = m_frames;
+    Frame* const pf_end = pf + m_frame_count;
+
+    {
+      Frame& curr = *pf;
+
+      curr.pos = 0;  // patch later
+
+      curr.len = static_cast<long>(frame_size);
+      size += curr.len;  // contribution of this frame
+    }
+
+    --frame_count;
+
+    while (frame_count > 1) {
+      if (pos >= stop)
+        return E_FILE_FORMAT_INVALID;
+
+      assert(pf < pf_end);
+      if (pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+
+      const Frame& prev = *pf++;
+      assert(prev.len == frame_size);
+      if (prev.len != frame_size)
+        return E_FILE_FORMAT_INVALID;
+
+      assert(pf < pf_end);
+      if (pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      Frame& curr = *pf;
+
+      curr.pos = 0;  // patch later
+
+      const long long delta_size_ = ReadUInt(pReader, pos, len);
+
+      if (delta_size_ < 0)
+        return E_FILE_FORMAT_INVALID;
+
+      if ((pos + len) > stop)
+        return E_FILE_FORMAT_INVALID;
+
+      pos += len;  // consume length of (delta) size
+      if (pos > stop)
+        return E_FILE_FORMAT_INVALID;
+
+      const int exp = 7 * len - 1;
+      const long long bias = (1LL << exp) - 1LL;  // deltas are stored biased
+      const long long delta_size = delta_size_ - bias;
+
+      frame_size += delta_size;
+
+      if (frame_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      if (frame_size > LONG_MAX)
+        return E_FILE_FORMAT_INVALID;
+
+      curr.len = static_cast<long>(frame_size);
+      size += curr.len;  // contribution of this frame
+
+      --frame_count;
+    }
+
+    // parse last frame
+    if (frame_count > 0) {
+      if (pos > stop || pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      const Frame& prev = *pf++;
+      assert(prev.len == frame_size);
+      if (prev.len != frame_size)
+        return E_FILE_FORMAT_INVALID;
+
+      if (pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      Frame& curr = *pf++;
+      if (pf != pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      curr.pos = 0;  // patch later
+
+      const long long total_size = stop - pos;
+
+      if (total_size < size)
+        return E_FILE_FORMAT_INVALID;
+
+      frame_size = total_size - size;
+
+      if (frame_size > LONG_MAX || frame_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      curr.len = static_cast<long>(frame_size);
+    }
+
+    pf = m_frames;
+    while (pf != pf_end) {
+      Frame& f = *pf++;
+      assert((pos + f.len) <= stop);
+      if ((pos + f.len) > stop)
+        return E_FILE_FORMAT_INVALID;
+
+      f.pos = pos;
+      pos += f.len;
+    }
+
+    if (pos != stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  return 0;  // success
+}
+
+// Returns the absolute, unscaled timecode: the cluster's timecode plus this
+// block's relative one. With no cluster, returns just the relative timecode.
+long long Block::GetTimeCode(const Cluster* pCluster) const {
+  const long long relative = m_timecode;
+  if (!pCluster)
+    return relative;
+
+  const long long cluster_tc = pCluster->GetTimeCode();
+  assert(cluster_tc >= 0);
+  return cluster_tc + relative;
+}
+
+// Returns the block time in ns: the absolute timecode multiplied by the
+// segment's timecode scale. pCluster must not be NULL (assert only).
+long long Block::GetTime(const Cluster* pCluster) const {
+  assert(pCluster);
+
+  const long long timecode = GetTimeCode(pCluster);
+
+  const Segment* const segment = pCluster->m_pSegment;
+  const SegmentInfo* const info = segment->GetInfo();
+  assert(info);
+
+  const long long timecode_scale = info->GetTimeCodeScale();
+  assert(timecode_scale >= 1);
+
+  return timecode * timecode_scale;
+}
+
+long long Block::GetTrackNumber() const { return m_track; }  // 0 until Parse() reads it
+
+bool Block::IsKey() const {
+  return (m_flags & 0x80) != 0;  // bit 7 of the flags byte is the key flag
+}
+
+// Sets or clears bit 7 of the flags byte; the other seven bits are kept.
+void Block::SetKey(bool bKey) {
+  const unsigned char kKeyBit = 0x80;
+  const unsigned char others = static_cast<unsigned char>(m_flags & 0x7F);
+  m_flags = bKey ? static_cast<unsigned char>(others | kKeyBit) : others;
+}
+
+bool Block::IsInvisible() const { return (m_flags & 0x08) != 0; }  // bit 3 of the flags byte
+
+Block::Lacing Block::GetLacing() const {
+  // Bits 1-2 of the flags byte hold the lacing mode (a value in 0..3).
+  return static_cast<Lacing>((m_flags & 0x06) >> 1);
+}
+
+int Block::GetFrameCount() const { return m_frame_count; }  // -1 until Parse() assigns it
+
+// Returns frame idx; the index range and frame fields are checked by assert only.
+const Block::Frame& Block::GetFrame(int idx) const {
+  assert(idx >= 0);
+  assert(idx < m_frame_count);
+
+  const Frame* const frame = m_frames + idx;
+  assert(frame->pos > 0);
+  assert(frame->len > 0);
+  return *frame;
+}
+
+// Asks pReader to read this frame (len bytes starting at pos) into buf and
+// returns the reader's status code unchanged.
+long Block::Frame::Read(IMkvReader* pReader, unsigned char* buf) const {
+  assert(pReader);
+  assert(buf);
+  return pReader->Read(pos, len, buf);
+}
+
+long long Block::GetDiscardPadding() const { return m_discard_padding; }  // constructor argument, as is
+
+}  // end namespace mkvparser
diff --git a/libs/libvpx/third_party/libwebm/mkvparser.hpp b/libs/libvpx/third_party/libwebm/mkvparser.hpp
new file mode 100644
index 0000000000..75ef69d76d
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/mkvparser.hpp
@@ -0,0 +1,1025 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVPARSER_HPP
+#define MKVPARSER_HPP
+
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+
+namespace mkvparser {
+
+const int E_PARSE_FAILED = -1;
+const int E_FILE_FORMAT_INVALID = -2;  // e.g. returned by Block::Parse() for a malformed block
+const int E_BUFFER_NOT_FULL = -3;
+
+class IMkvReader {  // abstract reader interface: both operations are pure virtual
+ public:
+  virtual int Read(long long pos, long len, unsigned char* buf) = 0;  // Block::Parse treats non-zero as failure
+  virtual int Length(long long* total, long long* available) = 0;
+  // Protected destructor: clients cannot delete a reader through this interface.
+ protected:
+  virtual ~IMkvReader();
+};
+
+template<typename Type> Type* SafeArrayAlloc(unsigned long long num_elements,
+                                             unsigned long long element_size);
+long long GetUIntLength(IMkvReader*, long long, long&);
+long long ReadUInt(IMkvReader*, long long, long&);  // long& gets the encoded length; Block::Parse rejects results < 0
+long long ReadID(IMkvReader* pReader, long long pos, long& len);
+long long UnserializeUInt(IMkvReader*, long long pos, long long size);
+
+long UnserializeFloat(IMkvReader*, long long pos, long long size, double&);
+long UnserializeInt(IMkvReader*, long long pos, long long size,
+                    long long& result);  // Block::Parse treats a non-zero return as failure
+
+long UnserializeString(IMkvReader*, long long pos, long long size, char*& str);
+
+long ParseElementHeader(IMkvReader* pReader,
+                        long long& pos,  // consume id and size fields
+                        long long stop,  // if you know size of element's parent
+                        long long& id, long long& size);
+
+bool Match(IMkvReader*, long long&, unsigned long, long long&);
+bool Match(IMkvReader*, long long&, unsigned long, unsigned char*&, size_t&);
+
+void GetVersion(int& major, int& minor, int& build, int& revision);
+
+struct EBMLHeader {  // NOTE(review): fields presumably mirror the EBML header elements -- confirm
+  EBMLHeader();
+  ~EBMLHeader();
+  long long m_version;
+  long long m_readVersion;
+  long long m_maxIdLength;
+  long long m_maxSizeLength;
+  char* m_docType;  // NOTE(review): raw pointer, presumably released by ~EBMLHeader() -- confirm
+  long long m_docTypeVersion;
+  long long m_docTypeReadVersion;
+
+  long long Parse(IMkvReader*, long long&);
+  void Init();
+};
+
+class Segment;
+class Track;
+class Cluster;
+
+class Block {  // one parsed block: track, relative timecode, flags, frame table
+  Block(const Block&);  // private: clients cannot copy a Block
+  Block& operator=(const Block&);
+
+ public:
+  const long long m_start;  // Parse() reads the payload from here ...
+  const long long m_size;  // ... up to m_start + m_size
+
+  Block(long long start, long long size, long long discard_padding);
+  ~Block();
+
+  long Parse(const Cluster*);  // 0 on success, negative on failure
+
+  long long GetTrackNumber() const;
+  long long GetTimeCode(const Cluster*) const;  // absolute, but not scaled
+  long long GetTime(const Cluster*) const;  // absolute, and scaled (ns)
+  bool IsKey() const;
+  void SetKey(bool);
+  bool IsInvisible() const;
+
+  enum Lacing { kLacingNone, kLacingXiph, kLacingFixed, kLacingEbml };
+  Lacing GetLacing() const;
+
+  int GetFrameCount() const;  // to index frames: [0, count)
+
+  struct Frame {
+    long long pos;  // absolute offset
+    long len;  // length handed to IMkvReader::Read() by Frame::Read()
+
+    long Read(IMkvReader*, unsigned char*) const;
+  };
+
+  const Frame& GetFrame(int frame_index) const;
+
+  long long GetDiscardPadding() const;
+
+ private:
+  long long m_track;  // Track::Number()
+  short m_timecode;  // relative to cluster
+  unsigned char m_flags;  // bit 7: key, bit 3: invisible, bits 1-2: lacing
+
+  Frame* m_frames;  // new[]-allocated in Parse(), delete[]d in ~Block()
+  int m_frame_count;  // -1 until Parse() assigns it
+
+ protected:
+  const long long m_discard_padding;
+};
+
+class BlockEntry {  // abstract base: GetBlock() and GetKind() are pure virtual
+  BlockEntry(const BlockEntry&);  // private: clients cannot copy an entry
+  BlockEntry& operator=(const BlockEntry&);
+
+ protected:
+  BlockEntry(Cluster*, long index);  // protected: constructed through subclasses
+
+ public:
+  virtual ~BlockEntry();
+
+  bool EOS() const;
+  const Cluster* GetCluster() const;
+  long GetIndex() const;
+  virtual const Block* GetBlock() const = 0;
+
+  enum Kind { kBlockEOS, kBlockSimple, kBlockGroup };  // BlockGroup::GetKind() returns kBlockGroup
+  virtual Kind GetKind() const = 0;
+
+ protected:
+  Cluster* const m_pCluster;
+  const long m_index;
+};
+
+class SimpleBlock : public BlockEntry {  // BlockEntry that embeds its Block by value
+  SimpleBlock(const SimpleBlock&);  // private: clients cannot copy
+  SimpleBlock& operator=(const SimpleBlock&);
+
+ public:
+  SimpleBlock(Cluster*, long index, long long start, long long size);  // no discard_padding, unlike BlockGroup
+  long Parse();
+
+  Kind GetKind() const;
+  const Block* GetBlock() const;
+
+ protected:
+  Block m_block;
+};
+
+class BlockGroup : public BlockEntry {  // BlockEntry with prev/next/duration timecodes
+  BlockGroup(const BlockGroup&);  // private: clients cannot copy
+  BlockGroup& operator=(const BlockGroup&);
+
+ public:
+  BlockGroup(Cluster*, long index,
+             long long block_start,  // absolute pos of block's payload
+             long long block_size,  // size of block's payload
+             long long prev, long long next, long long duration,
+             long long discard_padding);
+
+  long Parse();
+
+  Kind GetKind() const;  // always kBlockGroup
+  const Block* GetBlock() const;  // address of the embedded m_block
+
+  long long GetPrevTimeCode() const;  // relative to block's time
+  long long GetNextTimeCode() const;  // as above
+  long long GetDurationTimeCode() const;
+
+ private:
+  Block m_block;
+  const long long m_prev;
+  const long long m_next;
+  const long long m_duration;
+};
+
+///////////////////////////////////////////////////////////////
+// ContentEncoding element
+// Elements used to describe if the track data has been encrypted or
+// compressed with zlib or header stripping.
+class ContentEncoding {
+ public:
+  enum { kCTR = 1 };  // default cipher_mode of ContentEncAESSettings
+
+  ContentEncoding();
+  ~ContentEncoding();
+
+  // ContentCompression element names
+  struct ContentCompression {
+    ContentCompression();
+    ~ContentCompression();
+
+    unsigned long long algo;
+    unsigned char* settings;
+    long long settings_len;
+  };
+
+  // ContentEncAESSettings element names
+  struct ContentEncAESSettings {
+    ContentEncAESSettings() : cipher_mode(kCTR) {}
+    ~ContentEncAESSettings() {}
+
+    unsigned long long cipher_mode;
+  };
+
+  // ContentEncryption element names
+  struct ContentEncryption {
+    ContentEncryption();
+    ~ContentEncryption();
+
+    unsigned long long algo;
+    unsigned char* key_id;
+    long long key_id_len;
+    unsigned char* signature;
+    long long signature_len;
+    unsigned char* sig_key_id;
+    long long sig_key_id_len;
+    unsigned long long sig_algo;
+    unsigned long long sig_hash_algo;
+
+    ContentEncAESSettings aes_settings;
+  };
+
+  // Returns ContentCompression represented by |idx|. Returns NULL if |idx|
+  // is out of bounds.
+  const ContentCompression* GetCompressionByIndex(unsigned long idx) const;
+
+  // Returns number of ContentCompression elements in this ContentEncoding
+  // element.
+  unsigned long GetCompressionCount() const;
+
+  // Parses the ContentCompression element from |pReader|. |start| is the
+  // starting offset of the ContentCompression payload. |size| is the size in
+  // bytes of the ContentCompression payload. |compression| is where the parsed
+  // values will be stored.
+  long ParseCompressionEntry(long long start, long long size,
+                             IMkvReader* pReader,
+                             ContentCompression* compression);
+
+  // Returns ContentEncryption represented by |idx|. Returns NULL if |idx|
+  // is out of bounds.
+  const ContentEncryption* GetEncryptionByIndex(unsigned long idx) const;
+
+  // Returns number of ContentEncryption elements in this ContentEncoding
+  // element.
+  unsigned long GetEncryptionCount() const;
+
+  // Parses the ContentEncAESSettings element from |pReader|. |start| is the
+  // starting offset of the ContentEncAESSettings payload. |size| is the
+  // size in bytes of the ContentEncAESSettings payload. |aes| is
+  // where the parsed values will be stored.
+  long ParseContentEncAESSettingsEntry(long long start, long long size,
+                                       IMkvReader* pReader,
+                                       ContentEncAESSettings* aes);
+
+  // Parses the ContentEncoding element from |pReader|. |start| is the
+  // starting offset of the ContentEncoding payload. |size| is the size in
+  // bytes of the ContentEncoding payload. Returns a long status code, not a
+  // bool (NOTE(review): presumably 0 on success -- confirm in the .cpp).
+  long ParseContentEncodingEntry(long long start, long long size,
+                                 IMkvReader* pReader);
+
+  // Parses the ContentEncryption element from |pReader|. |start| is the
+  // starting offset of the ContentEncryption payload. |size| is the size in
+  // bytes of the ContentEncryption payload. |encryption| is where the parsed
+  // values will be stored.
+  long ParseEncryptionEntry(long long start, long long size,
+                            IMkvReader* pReader, ContentEncryption* encryption);
+
+  unsigned long long encoding_order() const { return encoding_order_; }
+  unsigned long long encoding_scope() const { return encoding_scope_; }
+  unsigned long long encoding_type() const { return encoding_type_; }
+
+ private:
+  // Member variables for list of ContentCompression elements.
+  ContentCompression** compression_entries_;
+  ContentCompression** compression_entries_end_;
+
+  // Member variables for list of ContentEncryption elements.
+  ContentEncryption** encryption_entries_;
+  ContentEncryption** encryption_entries_end_;
+
+  // ContentEncoding element names
+  unsigned long long encoding_order_;
+  unsigned long long encoding_scope_;
+  unsigned long long encoding_type_;
+
+  // LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding);
+  ContentEncoding(const ContentEncoding&);
+  ContentEncoding& operator=(const ContentEncoding&);
+};
+
+class Track {
+  Track(const Track&);  // private: clients cannot copy a Track
+  Track& operator=(const Track&);
+
+ public:
+  class Info;
+  static long Create(Segment*, const Info&, long long element_start,
+                     long long element_size, Track*&);  // result is handed back through the Track*&
+
+  enum Type { kVideo = 1, kAudio = 2, kSubtitle = 0x11, kMetadata = 0x21 };
+
+  Segment* const m_pSegment;
+  const long long m_element_start;
+  const long long m_element_size;
+  virtual ~Track();
+
+  long GetType() const;
+  long GetNumber() const;
+  unsigned long long GetUid() const;
+  const char* GetNameAsUTF8() const;
+  const char* GetLanguage() const;
+  const char* GetCodecNameAsUTF8() const;
+  const char* GetCodecId() const;
+  const unsigned char* GetCodecPrivate(size_t&) const;  // NOTE(review): size_t& presumably receives the size -- confirm
+  bool GetLacing() const;
+  unsigned long long GetDefaultDuration() const;
+  unsigned long long GetCodecDelay() const;
+  unsigned long long GetSeekPreRoll() const;
+
+  const BlockEntry* GetEOS() const;
+
+  struct Settings {
+    long long start;
+    long long size;
+  };
+
+  class Info {
+   public:
+    Info();
+    ~Info();
+    int Copy(Info&) const;  // explicit copy; the copy constructor below is private
+    void Clear();
+    long type;
+    long number;
+    unsigned long long uid;
+    unsigned long long defaultDuration;
+    unsigned long long codecDelay;
+    unsigned long long seekPreRoll;
+    char* nameAsUTF8;
+    char* language;
+    char* codecId;
+    char* codecNameAsUTF8;
+    unsigned char* codecPrivate;
+    size_t codecPrivateSize;
+    bool lacing;
+    Settings settings;
+
+   private:
+    Info(const Info&);
+    Info& operator=(const Info&);
+    int CopyStr(char* Info::*str, Info&) const;  // first argument is a pointer-to-member selecting a char* field
+  };
+
+  long GetFirst(const BlockEntry*&) const;
+  long GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const;
+  virtual bool VetEntry(const BlockEntry*) const;
+  virtual long Seek(long long time_ns, const BlockEntry*&) const;
+
+  const ContentEncoding* GetContentEncodingByIndex(unsigned long idx) const;
+  unsigned long GetContentEncodingCount() const;
+
+  long ParseContentEncodingsEntry(long long start, long long size);
+
+ protected:
+  Track(Segment*, long long element_start, long long element_size);
+
+  Info m_info;
+
+  class EOSBlock : public BlockEntry {  // BlockEntry subclass used for m_eos
+   public:
+    EOSBlock();
+
+    Kind GetKind() const;
+    const Block* GetBlock() const;
+  };
+
+  EOSBlock m_eos;
+
+ private:
+  ContentEncoding** content_encoding_entries_;
+  ContentEncoding** content_encoding_entries_end_;
+};
+
+class VideoTrack : public Track {
+  VideoTrack(const VideoTrack&);  // private: clients cannot copy
+  VideoTrack& operator=(const VideoTrack&);
+
+  VideoTrack(Segment*, long long element_start, long long element_size);  // private constructor
+
+ public:
+  static long Parse(Segment*, const Info&, long long element_start,
+                    long long element_size, VideoTrack*&);  // result is handed back through the VideoTrack*&
+
+  long long GetWidth() const;
+  long long GetHeight() const;
+  long long GetDisplayWidth() const;
+  long long GetDisplayHeight() const;
+  long long GetDisplayUnit() const;
+  long long GetStereoMode() const;
+  double GetFrameRate() const;
+
+  bool VetEntry(const BlockEntry*) const;  // overrides Track::VetEntry
+  long Seek(long long time_ns, const BlockEntry*&) const;  // overrides Track::Seek
+
+ private:
+  long long m_width;
+  long long m_height;
+  long long m_display_width;
+  long long m_display_height;
+  long long m_display_unit;
+  long long m_stereo_mode;
+
+  double m_rate;
+};
+
+class AudioTrack : public Track {
+  AudioTrack(const AudioTrack&);  // private: clients cannot copy
+  AudioTrack& operator=(const AudioTrack&);
+
+  AudioTrack(Segment*, long long element_start, long long element_size);  // private constructor
+
+ public:
+  static long Parse(Segment*, const Info&, long long element_start,
+                    long long element_size, AudioTrack*&);  // result is handed back through the AudioTrack*&
+
+  double GetSamplingRate() const;
+  long long GetChannels() const;
+  long long GetBitDepth() const;
+
+ private:
+  double m_rate;
+  long long m_channels;
+  long long m_bitDepth;
+};
+
+class Tracks {
+  Tracks(const Tracks&);  // private: clients cannot copy
+  Tracks& operator=(const Tracks&);
+
+ public:
+  Segment* const m_pSegment;
+  const long long m_start;
+  const long long m_size;
+  const long long m_element_start;
+  const long long m_element_size;
+
+  Tracks(Segment*, long long start, long long size, long long element_start,
+         long long element_size);
+
+  ~Tracks();
+
+  long Parse();
+
+  unsigned long GetTracksCount() const;
+
+  const Track* GetTrackByNumber(long tn) const;
+  const Track* GetTrackByIndex(unsigned long idx) const;
+
+ private:
+  Track** m_trackEntries;  // NOTE(review): presumably [m_trackEntries, m_trackEntriesEnd) -- confirm
+  Track** m_trackEntriesEnd;
+
+  long ParseTrackEntry(long long payload_start, long long payload_size,
+                       long long element_start, long long element_size,
+                       Track*&) const;
+};
+
+class Chapters {
+  Chapters(const Chapters&);  // private: clients cannot copy
+  Chapters& operator=(const Chapters&);
+
+ public:
+  Segment* const m_pSegment;
+  const long long m_start;
+  const long long m_size;
+  const long long m_element_start;
+  const long long m_element_size;
+
+  Chapters(Segment*, long long payload_start, long long payload_size,
+           long long element_start, long long element_size);
+
+  ~Chapters();
+
+  long Parse();
+
+  class Atom;
+  class Edition;
+
+  class Display {
+    friend class Atom;  // constructors/destructor are private: only Atom may use them
+    Display();
+    Display(const Display&);
+    ~Display();
+    Display& operator=(const Display&);
+
+   public:
+    const char* GetString() const;
+    const char* GetLanguage() const;
+    const char* GetCountry() const;
+
+   private:
+    void Init();
+    void ShallowCopy(Display&) const;
+    void Clear();
+    long Parse(IMkvReader*, long long pos, long long size);
+
+    char* m_string;
+    char* m_language;
+    char* m_country;
+  };
+
+  class Atom {
+    friend class Edition;  // constructors/destructor are private: only Edition may use them
+    Atom();
+    Atom(const Atom&);
+    ~Atom();
+    Atom& operator=(const Atom&);
+
+   public:
+    unsigned long long GetUID() const;
+    const char* GetStringUID() const;
+
+    long long GetStartTimecode() const;
+    long long GetStopTimecode() const;
+
+    long long GetStartTime(const Chapters*) const;
+    long long GetStopTime(const Chapters*) const;
+
+    int GetDisplayCount() const;
+    const Display* GetDisplay(int index) const;
+
+   private:
+    void Init();
+    void ShallowCopy(Atom&) const;
+    void Clear();
+    long Parse(IMkvReader*, long long pos, long long size);
+    static long long GetTime(const Chapters*, long long timecode);
+
+    long ParseDisplay(IMkvReader*, long long pos, long long size);
+    bool ExpandDisplaysArray();
+
+    char* m_string_uid;
+    unsigned long long m_uid;
+    long long m_start_timecode;
+    long long m_stop_timecode;
+
+    Display* m_displays;
+    int m_displays_size;  // NOTE(review): presumably capacity, with m_displays_count in use -- confirm
+    int m_displays_count;
+  };
+
+  class Edition {
+    friend class Chapters;  // constructors/destructor are private: only Chapters may use them
+    Edition();
+    Edition(const Edition&);
+    ~Edition();
+    Edition& operator=(const Edition&);
+
+   public:
+    int GetAtomCount() const;
+    const Atom* GetAtom(int index) const;
+
+   private:
+    void Init();
+    void ShallowCopy(Edition&) const;
+    void Clear();
+    long Parse(IMkvReader*, long long pos, long long size);
+
+    long ParseAtom(IMkvReader*, long long pos, long long size);
+    bool ExpandAtomsArray();
+
+    Atom* m_atoms;
+    int m_atoms_size;
+    int m_atoms_count;
+  };
+
+  int GetEditionCount() const;
+  const Edition* GetEdition(int index) const;
+
+ private:
+  long ParseEdition(long long pos, long long size);
+  bool ExpandEditionsArray();
+
+  Edition* m_editions;
+  int m_editions_size;
+  int m_editions_count;
+};
+
+class Tags {
+  Tags(const Tags&);  // private: clients cannot copy
+  Tags& operator=(const Tags&);
+
+ public:
+  Segment* const m_pSegment;
+  const long long m_start;
+  const long long m_size;
+  const long long m_element_start;
+  const long long m_element_size;
+
+  Tags(Segment*, long long payload_start, long long payload_size,
+       long long element_start, long long element_size);
+
+  ~Tags();
+
+  long Parse();
+
+  class Tag;
+  class SimpleTag;
+
+  class SimpleTag {
+    friend class Tag;  // constructors/destructor are private: only Tag may use them
+    SimpleTag();
+    SimpleTag(const SimpleTag&);
+    ~SimpleTag();
+    SimpleTag& operator=(const SimpleTag&);
+
+   public:
+    const char* GetTagName() const;
+    const char* GetTagString() const;
+
+   private:
+    void Init();
+    void ShallowCopy(SimpleTag&) const;
+    void Clear();
+    long Parse(IMkvReader*, long long pos, long long size);
+
+    char* m_tag_name;
+    char* m_tag_string;
+  };
+
+  class Tag {
+    friend class Tags;  // constructors/destructor are private: only Tags may use them
+    Tag();
+    Tag(const Tag&);
+    ~Tag();
+    Tag& operator=(const Tag&);
+
+   public:
+    int GetSimpleTagCount() const;
+    const SimpleTag* GetSimpleTag(int index) const;
+
+   private:
+    void Init();
+    void ShallowCopy(Tag&) const;
+    void Clear();
+    long Parse(IMkvReader*, long long pos, long long size);
+
+    long ParseSimpleTag(IMkvReader*, long long pos, long long size);
+    bool ExpandSimpleTagsArray();
+
+    SimpleTag* m_simple_tags;
+    int m_simple_tags_size;  // NOTE(review): presumably capacity, with m_simple_tags_count in use -- confirm
+    int m_simple_tags_count;
+  };
+
+  int GetTagCount() const;
+  const Tag* GetTag(int index) const;
+
+ private:
+  long ParseTag(long long pos, long long size);
+  bool ExpandTagsArray();
+
+  Tag* m_tags;
+  int m_tags_size;
+  int m_tags_count;
+};
+
+class SegmentInfo {
+  SegmentInfo(const SegmentInfo&);  // private: clients cannot copy
+  SegmentInfo& operator=(const SegmentInfo&);
+
+ public:
+  Segment* const m_pSegment;
+  const long long m_start;
+  const long long m_size;
+  const long long m_element_start;
+  const long long m_element_size;
+
+  SegmentInfo(Segment*, long long start, long long size,
+              long long element_start, long long element_size);
+
+  ~SegmentInfo();
+
+  long Parse();
+
+  long long GetTimeCodeScale() const;  // Block::GetTime() multiplies timecodes by this
+  long long GetDuration() const;  // scaled
+  const char* GetMuxingAppAsUTF8() const;
+  const char* GetWritingAppAsUTF8() const;
+  const char* GetTitleAsUTF8() const;
+
+ private:
+  long long m_timecodeScale;
+  double m_duration;  // held as double although GetDuration() returns long long
+  char* m_pMuxingAppAsUTF8;
+  char* m_pWritingAppAsUTF8;
+  char* m_pTitleAsUTF8;
+};
+
+class SeekHead {
+  SeekHead(const SeekHead&);  // private: clients cannot copy
+  SeekHead& operator=(const SeekHead&);
+
+ public:
+  Segment* const m_pSegment;
+  const long long m_start;
+  const long long m_size;
+  const long long m_element_start;
+  const long long m_element_size;
+
+  SeekHead(Segment*, long long start, long long size, long long element_start,
+           long long element_size);
+
+  ~SeekHead();
+
+  long Parse();
+
+  struct Entry {
+    // the SeekHead entry payload
+    long long id;
+    long long pos;
+
+    // absolute pos of SeekEntry ID
+    long long element_start;
+
+    // SeekEntry ID size + size size + payload
+    long long element_size;
+  };
+
+  int GetCount() const;
+  const Entry* GetEntry(int idx) const;
+
+  struct VoidElement {
+    // absolute pos of Void ID
+    long long element_start;
+
+    // ID size + size size + payload size
+    long long element_size;
+  };
+
+  int GetVoidElementCount() const;
+  const VoidElement* GetVoidElement(int idx) const;
+
+ private:
+  Entry* m_entries;
+  int m_entry_count;
+
+  VoidElement* m_void_elements;
+  int m_void_element_count;
+
+  static bool ParseEntry(IMkvReader*,
+                         long long pos,  // payload
+                         long long size, Entry*);  // bool result, unlike the long-returning Parse()
+};
+
+class Cues;
+class CuePoint {
+  friend class Cues;  // constructors/destructor are private: only Cues may use them
+
+  CuePoint(long, long long);
+  ~CuePoint();
+
+  CuePoint(const CuePoint&);
+  CuePoint& operator=(const CuePoint&);
+
+ public:
+  long long m_element_start;
+  long long m_element_size;
+
+  bool Load(IMkvReader*);
+
+  long long GetTimeCode() const;  // absolute but unscaled
+  long long GetTime(const Segment*) const;  // absolute and scaled (ns units)
+
+  struct TrackPosition {
+    long long m_track;
+    long long m_pos;  // of cluster
+    long long m_block;
+    // codec_state  //defaults to 0
+    // reference = clusters containing req'd referenced blocks
+    //  reftime = timecode of the referenced block
+
+    bool Parse(IMkvReader*, long long, long long);
+  };
+
+  const TrackPosition* Find(const Track*) const;
+
+ private:
+  const long m_index;
+  long long m_timecode;
+  TrackPosition* m_track_positions;
+  size_t m_track_positions_count;
+};
+
+class Cues {
+  friend class Segment;  // constructor/destructor are private: only Segment may use them
+
+  Cues(Segment*, long long start, long long size, long long element_start,
+       long long element_size);
+  ~Cues();
+
+  Cues(const Cues&);
+  Cues& operator=(const Cues&);
+
+ public:
+  Segment* const m_pSegment;
+  const long long m_start;
+  const long long m_size;
+  const long long m_element_start;
+  const long long m_element_size;
+
+  bool Find(  // lower bound of time_ns
+      long long time_ns, const Track*, const CuePoint*&,
+      const CuePoint::TrackPosition*&) const;
+
+  const CuePoint* GetFirst() const;
+  const CuePoint* GetLast() const;
+  const CuePoint* GetNext(const CuePoint*) const;
+
+  const BlockEntry* GetBlock(const CuePoint*,
+                             const CuePoint::TrackPosition*) const;
+
+  bool LoadCuePoint() const;
+  long GetCount() const;  // loaded only
+  // long GetTotal() const;  //loaded + preloaded
+  bool DoneParsing() const;
+
+ private:
+  bool Init() const;
+  bool PreloadCuePoint(long&, long long) const;
+  // NOTE(review): mutable presumably lets the const loading methods above update this state -- confirm.
+  mutable CuePoint** m_cue_points;
+  mutable long m_count;
+  mutable long m_preload_count;
+  mutable long long m_pos;
+};
+
+class Cluster {
+  friend class Segment;
+  // Copy operations are declared private, so users cannot copy a Cluster.
+  Cluster(const Cluster&);
+  Cluster& operator=(const Cluster&);
+
+ public:
+  Segment* const m_pSegment;
+
+ public:
+  static Cluster* Create(Segment*,
+                         long index,  // index in segment
+                         long long off);  // offset relative to segment
+  // long long element_size);
+
+  Cluster();  // EndOfStream
+  ~Cluster();
+
+  bool EOS() const;
+
+  long long GetTimeCode() const;  // absolute, but not scaled
+  long long GetTime() const;  // absolute, and scaled (nanosecond units)
+  long long GetFirstTime() const;  // time (ns) of first (earliest) block
+  long long GetLastTime() const;  // time (ns) of last (latest) block
+
+  long GetFirst(const BlockEntry*&) const;
+  long GetLast(const BlockEntry*&) const;
+  long GetNext(const BlockEntry* curr, const BlockEntry*& next) const;
+
+  const BlockEntry* GetEntry(const Track*, long long ns = -1) const;
+  const BlockEntry* GetEntry(const CuePoint&,
+                             const CuePoint::TrackPosition&) const;
+  // const BlockEntry* GetMaxKey(const VideoTrack*) const;
+
+  //    static bool HasBlockEntries(const Segment*, long long);
+
+  static long HasBlockEntries(const Segment*, long long idoff, long long& pos,
+                              long& size);
+
+  long GetEntryCount() const;
+
+  long Load(long long& pos, long& size) const;
+
+  long Parse(long long& pos, long& size) const;
+  long GetEntry(long index, const mkvparser::BlockEntry*&) const;
+
+ protected:
+  Cluster(Segment*, long index, long long element_start);
+  // long long element_size);
+
+ public:
+  const long long m_element_start;
+  long long GetPosition() const;  // offset relative to segment
+
+  long GetIndex() const;
+  long long GetElementSize() const;
+  // long long GetPayloadSize() const;
+
+  // long long Unparsed() const;
+
+ private:
+  long m_index;
+  mutable long long m_pos;
+  // mutable long long m_size;
+  mutable long long m_element_size;
+  mutable long long m_timecode;
+  mutable BlockEntry** m_entries;
+  mutable long m_entries_size;
+  mutable long m_entries_count;
+  // Private block helpers; declared non-const, unlike the const Load/Parse.
+  long ParseSimpleBlock(long long, long long&, long&);
+  long ParseBlockGroup(long long, long long&, long&);
+
+  long CreateBlock(long long id, long long pos, long long size,
+                   long long discard_padding);
+  long CreateBlockGroup(long long start_offset, long long size,
+                        long long discard_padding);
+  long CreateSimpleBlock(long long, long long);
+};
+
+class Segment {
+  friend class Cues;
+  friend class Track;
+  friend class VideoTrack;
+  // Copy operations are declared private, so users cannot copy a Segment.
+  Segment(const Segment&);
+  Segment& operator=(const Segment&);
+  // Private constructor; presumably reached via the static CreateInstance().
+ private:
+  Segment(IMkvReader*, long long elem_start,
+          // long long elem_size,
+          long long pos, long long size);
+
+ public:
+  IMkvReader* const m_pReader;
+  const long long m_element_start;
+  // const long long m_element_size;
+  const long long m_start;  // posn of segment payload
+  const long long m_size;  // size of segment payload
+  Cluster m_eos;  // TODO: make private?
+
+  static long long CreateInstance(IMkvReader*, long long, Segment*&);
+  ~Segment();
+
+  long Load();  // loads headers and all clusters
+
+  // for incremental loading
+  // long long Unparsed() const;
+  bool DoneParsing() const;
+  long long ParseHeaders();  // stops when first cluster is found
+  // long FindNextCluster(long long& pos, long& size) const;
+  long LoadCluster(long long& pos, long& size);  // load one cluster
+  long LoadCluster();
+
+  long ParseNext(const Cluster* pCurr, const Cluster*& pNext, long long& pos,
+                 long& size);
+
+  const SeekHead* GetSeekHead() const;
+  const Tracks* GetTracks() const;
+  const SegmentInfo* GetInfo() const;
+  const Cues* GetCues() const;
+  const Chapters* GetChapters() const;
+  const Tags* GetTags() const;
+
+  long long GetDuration() const;
+
+  unsigned long GetCount() const;
+  const Cluster* GetFirst() const;
+  const Cluster* GetLast() const;
+  const Cluster* GetNext(const Cluster*);
+
+  const Cluster* FindCluster(long long time_nanoseconds) const;
+  // const BlockEntry* Seek(long long time_nanoseconds, const Track*) const;
+
+  const Cluster* FindOrPreloadCluster(long long pos);
+
+  long ParseCues(long long cues_off,  // offset relative to start of segment
+                 long long& parse_pos, long& parse_len);
+
+ private:
+  long long m_pos;  // absolute file posn; what has been consumed so far
+  Cluster* m_pUnknownSize;
+
+  SeekHead* m_pSeekHead;
+  SegmentInfo* m_pInfo;
+  Tracks* m_pTracks;
+  Cues* m_pCues;
+  Chapters* m_pChapters;
+  Tags* m_pTags;
+  Cluster** m_clusters;
+  long m_clusterCount;  // number of entries for which m_index >= 0
+  long m_clusterPreloadCount;  // number of entries for which m_index < 0
+  long m_clusterSize;  // array size
+
+  long DoLoadCluster(long long&, long&);
+  long DoLoadClusterUnknownSize(long long&, long&);
+  long DoParseNext(const Cluster*&, long long&, long&);
+
+  bool AppendCluster(Cluster*);
+  bool PreloadCluster(Cluster*, ptrdiff_t);
+
+  // void ParseSeekHead(long long pos, long long size);
+  // void ParseSeekEntry(long long pos, long long size);
+  // void ParseCues(long long);
+
+  const BlockEntry* GetBlock(const CuePoint&, const CuePoint::TrackPosition&);
+};
+
+}  // end namespace mkvparser
+
+// Overload for callers that do not need LoadCluster's pos/size out-params.
+inline long mkvparser::Segment::LoadCluster() {
+  long long unused_pos;
+  long unused_size;
+  return LoadCluster(unused_pos, unused_size);
+}
+
+#endif  // MKVPARSER_HPP
diff --git a/libs/libvpx/third_party/libwebm/mkvreader.cpp b/libs/libvpx/third_party/libwebm/mkvreader.cpp
new file mode 100644
index 0000000000..eaf9e0a799
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/mkvreader.cpp
@@ -0,0 +1,132 @@
+// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvreader.hpp"
+
+#include <cassert>
+
+namespace mkvparser {
+
+// No file yet. m_length is zeroed so it is never read while indeterminate.
+MkvReader::MkvReader() : m_length(0), m_file(NULL), reader_owns_file_(true) {}
+// Borrows |fp| (never closed by the destructor) and caches its size.
+MkvReader::MkvReader(FILE* fp)
+    : m_length(0), m_file(fp), reader_owns_file_(false) { GetFileSize(); }
+
+// Closes the stream only when reader_owns_file_ is set; a borrowed FILE* is
+MkvReader::~MkvReader() {  // left open for its owner.
+  if (reader_owns_file_) Close();
+  m_file = NULL;  // drop the pointer either way
+}
+
+// Opens |fileName| for binary reading and caches its size. Returns 0 on
+// success, -1 on bad arguments or open failure, 1 if the size is unknown.
+int MkvReader::Open(const char* fileName) {
+  if (fileName == NULL || m_file != NULL)
+    return -1;  // missing name, or a file is already attached
+
+#ifdef _MSC_VER
+  if (fopen_s(&m_file, fileName, "rb"))
+    return -1;  // error
+#else
+  m_file = fopen(fileName, "rb");
+  if (m_file == NULL)
+    return -1;
+#endif
+  reader_owns_file_ = true;  // opened here, so the destructor must close it
+  if (GetFileSize())
+    return 0;
+  Close();  // do not keep a handle whose size could not be determined
+  return 1;
+}
+
+// Measures the file by seeking to its end and caches the size in m_length.
+// Returns false (never asserts) when no stream is attached or a seek fails.
+bool MkvReader::GetFileSize() {
+  if (m_file == NULL)
+    return false;
+#ifdef _MSC_VER
+  if (_fseeki64(m_file, 0L, SEEK_END))
+    return false;  // error
+
+  m_length = _ftelli64(m_file);
+#else
+  if (fseek(m_file, 0L, SEEK_END))
+    return false;  // error, e.g. a stream that cannot seek
+
+  m_length = ftell(m_file);
+#endif
+  // The tell functions report failure as a negative value.
+  if (m_length < 0)
+    return false;
+
+  // Rewind so the stream is left at the start of the file.
+#ifdef _MSC_VER
+  if (_fseeki64(m_file, 0L, SEEK_SET))
+    return false;  // error
+#else
+  if (fseek(m_file, 0L, SEEK_SET))
+    return false;  // error
+#endif
+  return true;
+}
+
+// Closes the attached stream, if any, and forgets it; a no-op otherwise.
+void MkvReader::Close() {
+  if (m_file == NULL) return;
+  fclose(m_file);
+  m_file = NULL;
+}
+
+// Copies the cached file size into |total| and |available| (either may be
+// NULL). Returns 0, or -1 when no file is attached.
+int MkvReader::Length(long long* total, long long* available) {
+  if (!m_file)
+    return -1;
+  const long long file_size = m_length;  // the whole file counts as available
+  if (total != NULL)
+    *total = file_size;
+  if (available != NULL)
+    *available = file_size;
+  return 0;
+}
+
+// Reads exactly |len| bytes at absolute file |offset| into |buffer|. Returns 0
+// on success (len == 0 included) and -1 otherwise: no file, bad arguments,
+// offset at/after the cached length, seek failure, or short read.
+int MkvReader::Read(long long offset, long len, unsigned char* buffer) {
+  if (m_file == NULL)
+    return -1;
+
+  if (offset < 0 || len < 0)
+    return -1;
+
+  if (len == 0)
+    return 0;
+
+  if (buffer == NULL || offset >= m_length)
+    return -1;
+
+#ifdef _MSC_VER
+  const int status = _fseeki64(m_file, offset, SEEK_SET);
+#else
+  // offset < m_length, and m_length came from ftell(), so it fits in a long.
+  const int status = fseek(m_file, static_cast<long>(offset), SEEK_SET);
+#endif
+
+  if (status)
+    return -1;  // never read from wherever a failed seek left the stream
+
+  const size_t size = fread(buffer, 1, len, m_file);
+
+  if (size < size_t(len))
+    return -1;  // error
+  return 0;  // success
+}
+
+}  // end namespace mkvparser
diff --git a/libs/libvpx/third_party/libwebm/mkvreader.hpp b/libs/libvpx/third_party/libwebm/mkvreader.hpp
new file mode 100644
index 0000000000..82ebad5444
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/mkvreader.hpp
@@ -0,0 +1,45 @@
+// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVREADER_HPP
+#define MKVREADER_HPP
+
+#include "mkvparser.hpp"
+#include <cstdio>
+
+namespace mkvparser {
+
+class MkvReader : public IMkvReader {
+ public:
+  MkvReader();  // no file attached yet; call Open()
+  explicit MkvReader(FILE* fp);  // wraps |fp|; the destructor leaves it open
+  virtual ~MkvReader();
+  // Opens a file for binary reading: 0 on success, non-zero on failure.
+  int Open(const char*);
+  void Close();
+  // Both return 0 on success and -1 on failure.
+  virtual int Read(long long position, long length, unsigned char* buffer);
+  virtual int Length(long long* total, long long* available);
+
+ private:
+  MkvReader(const MkvReader&);  // copying is not part of the public API
+  MkvReader& operator=(const MkvReader&);
+
+  // Determines the size of the file. This is called either by the constructor
+  // or by the Open function depending on file ownership. Returns true on
+  // success.
+  bool GetFileSize();
+
+  long long m_length;  // file size in bytes, cached by GetFileSize()
+  FILE* m_file;
+  bool reader_owns_file_;  // true when the destructor should Close() m_file
+};
+
+}  // end namespace mkvparser
+
+#endif  // MKVREADER_HPP
diff --git a/libs/libvpx/third_party/libwebm/mkvwriter.cpp b/libs/libvpx/third_party/libwebm/mkvwriter.cpp
new file mode 100644
index 0000000000..75d4350c70
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/mkvwriter.cpp
@@ -0,0 +1,90 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvwriter.hpp"
+
+#ifdef _MSC_VER
+#include <share.h>  // for _SH_DENYWR
+#endif
+
+#include <new>
+
+namespace mkvmuxer {
+
+MkvWriter::MkvWriter() : file_(NULL), writer_owns_file_(true) {}
+// Borrows |fp|: Close() forgets it without calling fclose().
+MkvWriter::MkvWriter(FILE* fp) : file_(fp), writer_owns_file_(false) {}
+// Destruction goes through Close(), which only fcloses an owned stream.
+MkvWriter::~MkvWriter() { Close(); }
+
+// Writes |length| bytes from |buffer|; returns 0 on success, -1 on failure.
+int32 MkvWriter::Write(const void* buffer, uint32 length) {
+  if (file_ == NULL)
+    return -1;
+  if (length == 0)
+    return 0;  // checked before |buffer|, so NULL is fine for empty writes
+  if (!buffer)
+    return -1;
+
+  const size_t written = fwrite(buffer, 1, length, file_);
+  if (written != length)
+    return -1;  // short write
+  return 0;
+}
+
+// Creates/truncates |filename| for binary writing; false if one is attached.
+bool MkvWriter::Open(const char* filename) {
+  if (filename == NULL || file_ != NULL)
+    return false;
+
+#ifdef _MSC_VER
+  file_ = _fsopen(filename, "wb", _SH_DENYWR);
+#else
+  file_ = fopen(filename, "wb");
+#endif
+  if (file_ == NULL)
+    return false;
+
+  writer_owns_file_ = true;  // we opened it, so Close() must fclose() it
+  return true;
+}
+
+// Detaches the stream; fclose() is only called on a stream this writer owns.
+void MkvWriter::Close() {
+  const bool must_close = (file_ != NULL) && writer_owns_file_;
+  if (must_close) fclose(file_);
+  file_ = NULL;
+}
+
+int64 MkvWriter::Position() const {
+  if (!file_)
+    return 0;  // no stream attached: reported as offset 0, not as an error
+  // Current offset in the stream, exactly as the tell function returns it.
+#ifdef _MSC_VER
+  return _ftelli64(file_);
+#else
+  return ftell(file_);
+#endif
+}
+
+int32 MkvWriter::Position(int64 position) {
+  if (!file_)
+    return -1;
+  // Absolute seek from the start of the file; returns the seek call's status.
+#ifdef _MSC_VER
+  return _fseeki64(file_, position, SEEK_SET);
+#else
+  return fseek(file_, position, SEEK_SET);  // NOTE(review): fseek takes long
+#endif
+}
+
+bool MkvWriter::Seekable() const { return true; }
+// Intentionally empty: this writer does nothing on element-start callbacks.
+void MkvWriter::ElementStartNotify(uint64, int64) {}
+
+}  // namespace mkvmuxer
diff --git a/libs/libvpx/third_party/libwebm/mkvwriter.hpp b/libs/libvpx/third_party/libwebm/mkvwriter.hpp
new file mode 100644
index 0000000000..684560c92d
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/mkvwriter.hpp
@@ -0,0 +1,51 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVWRITER_HPP
+#define MKVWRITER_HPP
+
+#include <stdio.h>
+
+#include "mkvmuxer.hpp"
+#include "mkvmuxertypes.hpp"
+
+namespace mkvmuxer {
+
+// Default stdio (FILE*) based implementation of the IMkvWriter interface.
+class MkvWriter : public IMkvWriter {
+ public:
+  MkvWriter();  // no file attached yet; call Open()
+  explicit MkvWriter(FILE* fp);  // writes to |fp| without taking ownership
+  virtual ~MkvWriter();
+
+  // IMkvWriter interface
+  virtual int64 Position() const;
+  virtual int32 Position(int64 position);
+  virtual bool Seekable() const;
+  virtual int32 Write(const void* buffer, uint32 length);
+  virtual void ElementStartNotify(uint64 element_id, int64 position);
+
+  // Creates and opens a file for writing. |filename| is the name of the file
+  // to open. This function will overwrite the contents of |filename|. Returns
+  // true on success.
+  bool Open(const char* filename);
+
+  // Detaches the file; it is fclose()d only when this writer owns it.
+  void Close();
+
+ private:
+  // File handle to output file.
+  FILE* file_;
+  bool writer_owns_file_;  // false for a FILE* handed to the constructor
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(MkvWriter);
+};
+
+}  // end namespace mkvmuxer
+
+#endif  // MKVWRITER_HPP
diff --git a/libs/libvpx/third_party/libwebm/webmids.hpp b/libs/libvpx/third_party/libwebm/webmids.hpp
new file mode 100644
index 0000000000..ad4ab57388
--- /dev/null
+++ b/libs/libvpx/third_party/libwebm/webmids.hpp
@@ -0,0 +1,155 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef WEBMIDS_HPP
+#define WEBMIDS_HPP
+
+namespace mkvmuxer {
+
+enum MkvId {
+  kMkvEBML = 0x1A45DFA3,
+  kMkvEBMLVersion = 0x4286,
+  kMkvEBMLReadVersion = 0x42F7,
+  kMkvEBMLMaxIDLength = 0x42F2,
+  kMkvEBMLMaxSizeLength = 0x42F3,
+  kMkvDocType = 0x4282,
+  kMkvDocTypeVersion = 0x4287,
+  kMkvDocTypeReadVersion = 0x4285,
+  kMkvVoid = 0xEC,
+  kMkvSignatureSlot = 0x1B538667,
+  kMkvSignatureAlgo = 0x7E8A,
+  kMkvSignatureHash = 0x7E9A,
+  kMkvSignaturePublicKey = 0x7EA5,
+  kMkvSignature = 0x7EB5,
+  kMkvSignatureElements = 0x7E5B,
+  kMkvSignatureElementList = 0x7E7B,
+  kMkvSignedElement = 0x6532,
+  // segment
+  kMkvSegment = 0x18538067,
+  // Meta Seek Information
+  kMkvSeekHead = 0x114D9B74,
+  kMkvSeek = 0x4DBB,
+  kMkvSeekID = 0x53AB,
+  kMkvSeekPosition = 0x53AC,
+  // Segment Information
+  kMkvInfo = 0x1549A966,
+  kMkvTimecodeScale = 0x2AD7B1,
+  kMkvDuration = 0x4489,
+  kMkvDateUTC = 0x4461,
+  kMkvTitle = 0x7BA9,
+  kMkvMuxingApp = 0x4D80,
+  kMkvWritingApp = 0x5741,
+  // Cluster
+  kMkvCluster = 0x1F43B675,
+  kMkvTimecode = 0xE7,
+  kMkvPrevSize = 0xAB,
+  kMkvBlockGroup = 0xA0,
+  kMkvBlock = 0xA1,
+  kMkvBlockDuration = 0x9B,
+  kMkvReferenceBlock = 0xFB,
+  kMkvLaceNumber = 0xCC,
+  kMkvSimpleBlock = 0xA3,
+  kMkvBlockAdditions = 0x75A1,
+  kMkvBlockMore = 0xA6,
+  kMkvBlockAddID = 0xEE,
+  kMkvBlockAdditional = 0xA5,
+  kMkvDiscardPadding = 0x75A2,
+  // Track
+  kMkvTracks = 0x1654AE6B,
+  kMkvTrackEntry = 0xAE,
+  kMkvTrackNumber = 0xD7,
+  kMkvTrackUID = 0x73C5,
+  kMkvTrackType = 0x83,
+  kMkvFlagEnabled = 0xB9,
+  kMkvFlagDefault = 0x88,
+  kMkvFlagForced = 0x55AA,
+  kMkvFlagLacing = 0x9C,
+  kMkvDefaultDuration = 0x23E383,
+  kMkvMaxBlockAdditionID = 0x55EE,
+  kMkvName = 0x536E,
+  kMkvLanguage = 0x22B59C,
+  kMkvCodecID = 0x86,
+  kMkvCodecPrivate = 0x63A2,
+  kMkvCodecName = 0x258688,
+  kMkvCodecDelay = 0x56AA,
+  kMkvSeekPreRoll = 0x56BB,
+  // video
+  kMkvVideo = 0xE0,
+  kMkvFlagInterlaced = 0x9A,
+  kMkvStereoMode = 0x53B8,
+  kMkvAlphaMode = 0x53C0,
+  kMkvPixelWidth = 0xB0,
+  kMkvPixelHeight = 0xBA,
+  kMkvPixelCropBottom = 0x54AA,
+  kMkvPixelCropTop = 0x54BB,
+  kMkvPixelCropLeft = 0x54CC,
+  kMkvPixelCropRight = 0x54DD,
+  kMkvDisplayWidth = 0x54B0,
+  kMkvDisplayHeight = 0x54BA,
+  kMkvDisplayUnit = 0x54B2,
+  kMkvAspectRatioType = 0x54B3,
+  kMkvFrameRate = 0x2383E3,
+  // end video
+  // audio
+  kMkvAudio = 0xE1,
+  kMkvSamplingFrequency = 0xB5,
+  kMkvOutputSamplingFrequency = 0x78B5,
+  kMkvChannels = 0x9F,
+  kMkvBitDepth = 0x6264,
+  // end audio
+  // ContentEncodings
+  kMkvContentEncodings = 0x6D80,
+  kMkvContentEncoding = 0x6240,
+  kMkvContentEncodingOrder = 0x5031,
+  kMkvContentEncodingScope = 0x5032,
+  kMkvContentEncodingType = 0x5033,
+  kMkvContentCompression = 0x5034,
+  kMkvContentCompAlgo = 0x4254,
+  kMkvContentCompSettings = 0x4255,
+  kMkvContentEncryption = 0x5035,
+  kMkvContentEncAlgo = 0x47E1,
+  kMkvContentEncKeyID = 0x47E2,
+  kMkvContentSignature = 0x47E3,
+  kMkvContentSigKeyID = 0x47E4,
+  kMkvContentSigAlgo = 0x47E5,
+  kMkvContentSigHashAlgo = 0x47E6,
+  kMkvContentEncAESSettings = 0x47E7,
+  kMkvAESSettingsCipherMode = 0x47E8,
+  kMkvAESSettingsCipherInitData = 0x47E9,
+  // end ContentEncodings
+  // Cueing Data
+  kMkvCues = 0x1C53BB6B,
+  kMkvCuePoint = 0xBB,
+  kMkvCueTime = 0xB3,
+  kMkvCueTrackPositions = 0xB7,
+  kMkvCueTrack = 0xF7,
+  kMkvCueClusterPosition = 0xF1,
+  kMkvCueBlockNumber = 0x5378,
+  // Chapters
+  kMkvChapters = 0x1043A770,
+  kMkvEditionEntry = 0x45B9,
+  kMkvChapterAtom = 0xB6,
+  kMkvChapterUID = 0x73C4,
+  kMkvChapterStringUID = 0x5654,
+  kMkvChapterTimeStart = 0x91,
+  kMkvChapterTimeEnd = 0x92,
+  kMkvChapterDisplay = 0x80,
+  kMkvChapString = 0x85,
+  kMkvChapLanguage = 0x437C,
+  kMkvChapCountry = 0x437E,
+  // Tags
+  kMkvTags = 0x1254C367,
+  kMkvTag = 0x7373,
+  kMkvSimpleTag = 0x67C8,
+  kMkvTagName = 0x45A3,
+  kMkvTagString = 0x4487
+};
+
+}  // end namespace mkvmuxer
+
+#endif  // WEBMIDS_HPP
diff --git a/libs/libvpx/third_party/libyuv/README.libvpx b/libs/libvpx/third_party/libyuv/README.libvpx
new file mode 100644
index 0000000000..09693c1f2c
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/README.libvpx
@@ -0,0 +1,15 @@
+Name: libyuv
+URL: http://code.google.com/p/libyuv/
+Version: 1456
+License: BSD
+License File: LICENSE
+
+Description:
+libyuv is an open source project that includes YUV conversion and scaling
+functionality.
+
+The optimized scaler in libyuv is used in the multiple-resolution encoder
+example, which down-samples the original input video (e.g. 1280x720) a number
+of times in order to encode multiple-resolution bit streams.
+
+Local Modifications:
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/basic_types.h b/libs/libvpx/third_party/libyuv/include/libyuv/basic_types.h
new file mode 100644
index 0000000000..beb750ba65
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/basic_types.h
@@ -0,0 +1,118 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_  // NOLINT
+#define INCLUDE_LIBYUV_BASIC_TYPES_H_
+
+#include <stddef.h>  // for NULL, size_t
+
+#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
+#include <sys/types.h>  // for uintptr_t on x86
+#else
+#include <stdint.h>  // for uintptr_t
+#endif
+
+#ifndef GG_LONGLONG
+#ifndef INT_TYPES_DEFINED
+#define INT_TYPES_DEFINED
+#ifdef COMPILER_MSVC
+typedef unsigned __int64 uint64;
+typedef __int64 int64;
+#ifndef INT64_C
+#define INT64_C(x) x ## I64
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UI64
+#endif
+#define INT64_F "I64"
+#else  // COMPILER_MSVC
+#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long uint64;  // NOLINT
+typedef long int64;  // NOLINT
+#ifndef INT64_C
+#define INT64_C(x) x ## L
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UL
+#endif
+#define INT64_F "l"
+#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long long uint64;  // NOLINT
+typedef long long int64;  // NOLINT
+#ifndef INT64_C
+#define INT64_C(x) x ## LL
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## ULL
+#endif
+#define INT64_F "ll"
+#endif  // __LP64__
+#endif  // COMPILER_MSVC
+typedef unsigned int uint32;
+typedef int int32;
+typedef unsigned short uint16;  // NOLINT
+typedef short int16;  // NOLINT
+typedef unsigned char uint8;
+typedef signed char int8;
+#endif  // INT_TYPES_DEFINED
+#endif  // GG_LONGLONG
+
+// Detect compiler is for x86 or x64.
+#if defined(__x86_64__) || defined(_M_X64) || \
+    defined(__i386__) || defined(_M_IX86)
+#define CPU_X86 1
+#endif
+// Detect compiler is for ARM.
+#if defined(__arm__) || defined(_M_ARM)
+#define CPU_ARM 1
+#endif
+
+#ifndef ALIGNP
+#ifdef __cplusplus
+#define ALIGNP(p, t) \
+    (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
+    ((t) - 1)) & ~((t) - 1))))
+#else
+#define ALIGNP(p, t) \
+    ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1))))  /* NOLINT */
+#endif
+#endif
+
+#if !defined(LIBYUV_API)
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllexport)
+#elif defined(LIBYUV_USING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllimport)
+#else
+#define LIBYUV_API
+#endif  // LIBYUV_BUILDING_SHARED_LIBRARY
+#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
+    (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
+    defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__ ((visibility ("default")))
+#else
+#define LIBYUV_API
+#endif  // __GNUC__
+#endif  // LIBYUV_API
+
+#define LIBYUV_BOOL int
+#define LIBYUV_FALSE 0
+#define LIBYUV_TRUE 1
+
+// Visual C x86 or GCC little endian.
+#if defined(__x86_64__) || defined(_M_X64) || \
+  defined(__i386__) || defined(_M_IX86) || \
+  defined(__arm__) || defined(_M_ARM) || \
+  (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LIBYUV_LITTLE_ENDIAN
+#endif
+
+#endif  // INCLUDE_LIBYUV_BASIC_TYPES_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/compare.h b/libs/libvpx/third_party/libyuv/include/libyuv/compare.h
new file mode 100644
index 0000000000..08b2bb2ecf
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/compare.h
@@ -0,0 +1,78 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_H_  // NOLINT
+#define INCLUDE_LIBYUV_COMPARE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Compute a hash for specified memory. Seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
+
+// Sum Square Error - used to compute Mean Square Error or PSNR.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a,
+                             const uint8* src_b, int count);
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+                                  const uint8* src_b, int stride_b,
+                                  int width, int height);
+
+static const int kMaxPsnr = 128;
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count);
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height);
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height);
+
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height);
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_COMPARE_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/convert.h b/libs/libvpx/third_party/libyuv/include/libyuv/convert.h
new file mode 100644
index 0000000000..a8d3fa07ac
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/convert.h
@@ -0,0 +1,245 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_H_  // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert I444 to I420.
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert I422 to I420.
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert I411 to I420.
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Copy I420 to I420.
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height);
+
+// Convert I400 (grey) to I420.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+#define J400ToJ420 I400ToI420
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert NV21 to I420.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
+int ARGBToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// BGRA little endian (argb in memory) to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// RGBA little endian (abgr in memory) to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// RGB little endian (bgr in memory) to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
+                uint8* dst_y, int dst_stride_y,
+                uint8* dst_u, int dst_stride_u,
+                uint8* dst_v, int dst_stride_v,
+                int width, int height);
+
+// RGB big endian (rgb in memory) to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_frame, int src_stride_frame,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
+                 uint8* dst_y, int dst_stride_y,
+                 uint8* dst_u, int dst_stride_u,
+                 uint8* dst_v, int dst_stride_v,
+                 int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height);
+
+// RGB12 (R444 fourcc) little endian to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture.
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToI420(const uint8* sample, size_t sample_size,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int src_width, int src_height,
+               int dst_width, int dst_height);
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+             int* width, int* height);
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_y" number of bytes in a row of the dst_y plane.
+//   Normally this would be the same as dst_width, with recommended alignment
+//   to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected. The caller should
+//   allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+//   Normally this would be the same as (dst_width + 1) / 2, with
+//   recommended alignment to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+//   To center, crop_x = (src_width - dst_width) / 2
+//              crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+//   "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+//    Must be less than or equal to src_width/src_height
+//    Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToI420(const uint8* src_frame, size_t src_size,
+                  uint8* dst_y, int dst_stride_y,
+                  uint8* dst_u, int dst_stride_u,
+                  uint8* dst_v, int dst_stride_v,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 format);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CONVERT_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/convert_argb.h b/libs/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
new file mode 100644
index 0000000000..360c6d3593
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
@@ -0,0 +1,231 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+// TODO(fbarchard): This set of functions should exactly match convert.h
+// TODO(fbarchard): Add tests. Create random content of right size and convert
+// with C vs Opt and or to I420 and compare.
+// TODO(fbarchard): Some of these functions lack parameter setting.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Alias.
+#define ARGBToARGB ARGBCopy
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int width, int height);
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert J400 (jpeg grey) to ARGB.
+LIBYUV_API
+int J400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Alias.
+#define YToARGB I400ToARGB
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// BGRA little endian (argb in memory) to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// ABGR little endian (rgba in memory) to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// RGBA little endian (abgr in memory) to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Deprecated function name.
+#define BG24ToARGB RGB24ToARGB
+
+// RGB little endian (bgr in memory) to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height);
+
+// RGB big endian (rgb in memory) to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_frame, int src_stride_frame,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height);
+
+// RGB12 (R444 fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+               uint8* dst_argb, int dst_stride_argb,
+               int src_width, int src_height,
+               int dst_width, int dst_height);
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
+//   Normally this would be the same as dst_width * 4 (ARGB is 4 bytes per
+//   pixel), with recommended alignment to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected. The caller should
+//   allocate the ARGB buffer according to rotation.
+// Unlike ConvertToI420, there are no "dst_stride_u" / "dst_stride_v"
+//   parameters: the destination is a single packed ARGB plane, so
+//   dst_stride_argb is the only destination stride and the chroma stride
+//   guidance for I420 does not apply.
+// "crop_x" and "crop_y" are starting position for cropping.
+//   To center, crop_x = (src_width - dst_width) / 2
+//              crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+//   "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+//    Must be less than or equal to src_width/src_height
+//    Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 on success, -1 for an invalid parameter; any non-zero is failure.
+LIBYUV_API
+int ConvertToARGB(const uint8* src_frame, size_t src_size,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 format);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CONVERT_ARGB_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/convert_from.h b/libs/libvpx/third_party/libyuv/include/libyuv/convert_from.h
new file mode 100644
index 0000000000..9fd8d4de5f
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/convert_from.h
@@ -0,0 +1,181 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_  // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// See Also convert.h for conversions from formats to I420.
+
+// For I420 to I420, see I420Copy in convert.h.
+
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+             uint8* dst_y, int dst_stride_y,
+             int width, int height);
+
+// TODO(fbarchard): I420ToM420
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height);
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height);
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+                const uint8* src_u, int src_stride_u,
+                const uint8* src_v, int src_stride_v,
+                uint8* dst_frame, int dst_stride_frame,
+                int width, int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              uint8* dst_frame, int dst_stride_frame,
+              int width, int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_u, int src_stride_u,
+                 const uint8* src_v, int src_stride_v,
+                 uint8* dst_frame, int dst_stride_frame,
+                 int width, int height);
+
+// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint8* dst_frame, int dst_stride_frame,
+                       const uint8* dither4x4, int width, int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_frame, int dst_stride_frame,
+                   int width, int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_frame, int dst_stride_frame,
+                   int width, int height);
+
+// Convert I420 to specified format.
+// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
+//    buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+                    const uint8* u, int u_stride,
+                    const uint8* v, int v_stride,
+                    uint8* dst_sample, int dst_sample_stride,
+                    int width, int height,
+                    uint32 format);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h b/libs/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
new file mode 100644
index 0000000000..1df53200dd
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -0,0 +1,190 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB to ARGB.
+#define ARGBToARGB ARGBCopy
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int width, int height);
+
+// Convert ARGB To BGRA.
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height);
+
+// Convert ARGB To ABGR.
+LIBYUV_API
+int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert ARGB To RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height);
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_rgb24, int dst_stride_rgb24,
+                int width, int height);
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_rgb, int dst_stride_rgb,
+              int width, int height);
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height);
+
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
+// const uint8(*dither)[4][4];
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height);
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb1555, int dst_stride_argb1555,
+                   int width, int height);
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb4444, int dst_stride_argb4444,
+                   int width, int height);
+
+// Convert ARGB To I444.
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I422.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I420. (also in convert.h)
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB to J422.
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I411.
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB to J400. (JPeg full range).
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               int width, int height);
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Convert ARGB to G. (Reverse of J400ToARGB, which replicates G back to ARGB)
+LIBYUV_API
+int ARGBToG(const uint8* src_argb, int src_stride_argb,
+            uint8* dst_g, int dst_stride_g,
+            int width, int height);
+
+// Convert ARGB To NV12.
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height);
+
+// Convert ARGB To NV21. (Redundant repeat of the declaration just above.)
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height);
+
+// Convert ARGB To YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height);
+
+// Convert ARGB To UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/cpu_id.h b/libs/libvpx/third_party/libyuv/include/libyuv/cpu_id.h
new file mode 100644
index 0000000000..dc858a814a
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/cpu_id.h
@@ -0,0 +1,81 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_  // NOLINT
+#define INCLUDE_LIBYUV_CPU_ID_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// TODO(fbarchard): Consider overlapping bits for different architectures.
+// Internal flag to indicate cpuid requires initialization.
+#define kCpuInit 0x1
+
+// These flags are only valid on ARM processors.
+static const int kCpuHasARM = 0x2;
+static const int kCpuHasNEON = 0x4;
+// 0x8 reserved for future ARM flag.
+
+// These flags are only valid on x86 processors.
+static const int kCpuHasX86 = 0x10;
+static const int kCpuHasSSE2 = 0x20;
+static const int kCpuHasSSSE3 = 0x40;
+static const int kCpuHasSSE41 = 0x80;
+static const int kCpuHasSSE42 = 0x100;
+static const int kCpuHasAVX = 0x200;
+static const int kCpuHasAVX2 = 0x400;
+static const int kCpuHasERMS = 0x800;
+static const int kCpuHasFMA3 = 0x1000;
+// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
+
+// These flags are only valid on MIPS processors.
+static const int kCpuHasMIPS = 0x10000;
+static const int kCpuHasMIPS_DSP = 0x20000;
+static const int kCpuHasMIPS_DSPR2 = 0x40000;
+
+// Internal function used to auto-init.
+LIBYUV_API
+int InitCpuFlags(void);
+
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name);
+
+// Tests whether the CPU supports an instruction set (SSE2 etc.), calling
+// InitCpuFlags() first if cpu_info_ is still kCpuInit. test_flag should be one
+// of the kCpuHas constants above. Returns non-zero if the set is detected.
+static __inline int TestCpuFlag(int test_flag) {
+  LIBYUV_API extern int cpu_info_;
+  return test_flag & (cpu_info_ != kCpuInit ? cpu_info_ : InitCpuFlags());
+}
+
+// For testing, allow CPU flags to be disabled.
+// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
+// MaskCpuFlags(-1) to enable all cpu specific optimizations.
+// MaskCpuFlags(0) to disable all cpu specific optimizations.
+LIBYUV_API
+void MaskCpuFlags(int enable_flags);
+
+// Low level cpuid for X86. Returns zeros on other CPUs.
+// eax is the info type that you want.
+// ecx is typically the cpu number, and should normally be zero.
+LIBYUV_API
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CPU_ID_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/libs/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h
new file mode 100644
index 0000000000..8423121d11
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -0,0 +1,192 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_  // NOLINT
+#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+// NOTE: For a simplified public API use convert.h MJPGToI420().
+
+struct jpeg_common_struct;
+struct jpeg_decompress_struct;
+struct jpeg_source_mgr;
+
+namespace libyuv {
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+static const uint32 kUnknownDataSize = 0xFFFFFFFF;
+
+enum JpegSubsamplingType {
+  kJpegYuv420,
+  kJpegYuv422,
+  kJpegYuv411,
+  kJpegYuv444,
+  kJpegYuv400,
+  kJpegUnknown
+};
+
+struct Buffer {
+  const uint8* data;
+  int len;
+};
+
+struct BufferVector {
+  Buffer* buffers;
+  int len;
+  int pos;
+};
+
+struct SetJmpErrorMgr;
+
+// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
+// simply independent JPEG images with a fixed Huffman table (which is omitted).
+// It is rarely used in video transmission, but is common as a camera capture
+// format, especially in Logitech devices. This class implements a decoder for
+// MJPEG frames.
+//
+// See http://tools.ietf.org/html/rfc2435
+class LIBYUV_API MJpegDecoder {
+ public:
+  typedef void (*CallbackFunction)(void* opaque,
+                                   const uint8* const* data,
+                                   const int* strides,
+                                   int rows);
+
+  static const int kColorSpaceUnknown;
+  static const int kColorSpaceGrayscale;
+  static const int kColorSpaceRgb;
+  static const int kColorSpaceYCbCr;
+  static const int kColorSpaceCMYK;
+  static const int kColorSpaceYCCK;
+
+  MJpegDecoder();
+  ~MJpegDecoder();
+
+  // Loads a new frame, reads its headers, and determines the uncompressed
+  // image format.
+  // Returns LIBYUV_TRUE if image looks valid and format is supported.
+  // If return value is LIBYUV_TRUE, then the values for all the following
+  // getters are populated.
+  // src_len is the size of the compressed mjpeg frame in bytes.
+  LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len);
+
+  // Returns width of the last loaded frame in pixels.
+  int GetWidth();
+
+  // Returns height of the last loaded frame in pixels.
+  int GetHeight();
+
+  // Returns format of the last loaded frame. The return value is one of the
+  // kColorSpace* constants.
+  int GetColorSpace();
+
+  // Number of color components in the color space.
+  int GetNumComponents();
+
+  // Sample factors of the n-th component.
+  int GetHorizSampFactor(int component);
+
+  int GetVertSampFactor(int component);
+
+  int GetHorizSubSampFactor(int component);
+
+  int GetVertSubSampFactor(int component);
+
+  // Public for testability.
+  int GetImageScanlinesPerImcuRow();
+
+  // Public for testability.
+  int GetComponentScanlinesPerImcuRow(int component);
+
+  // Width of a component in bytes.
+  int GetComponentWidth(int component);
+
+  // Height of a component.
+  int GetComponentHeight(int component);
+
+  // Width of a component in bytes with padding for DCTSIZE. Public for testing.
+  int GetComponentStride(int component);
+
+  // Size of a component in bytes.
+  int GetComponentSize(int component);
+
+  // Call this after LoadFrame() if you decide you don't want to decode it
+  // after all.
+  LIBYUV_BOOL UnloadFrame();
+
+  // Decodes the entire image into a one-buffer-per-color-component format.
+  // dst_width must match exactly. dst_height must be <= the image height; if
+  // less, the image is cropped. "planes" must have size equal to at least
+  // GetNumComponents() and they must point to non-overlapping buffers of size
+  // at least GetComponentSize(i). The pointers in planes are incremented
+  // to point to after the end of the written data.
+  // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+  LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+
+  // Decodes the entire image and passes the data via repeated calls to a
+  // callback function. Each call will get the data for a whole number of
+  // image scanlines.
+  // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+  LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
+                        int dst_width, int dst_height);
+
+  // Static helper that recognizes the JPEG sub-sampling type.
+  static JpegSubsamplingType JpegSubsamplingTypeHelper(
+     int* subsample_x, int* subsample_y, int number_of_components);
+
+ private:
+  void AllocOutputBuffers(int num_outbufs);
+  void DestroyOutputBuffers();
+
+  LIBYUV_BOOL StartDecode();
+  LIBYUV_BOOL FinishDecode();
+
+  void SetScanlinePointers(uint8** data);
+  LIBYUV_BOOL DecodeImcuRow();
+
+  int GetComponentScanlinePadding(int component);
+
+  // A buffer holding the input data for a frame.
+  Buffer buf_;
+  BufferVector buf_vec_;
+
+  jpeg_decompress_struct* decompress_struct_;
+  jpeg_source_mgr* source_mgr_;
+  SetJmpErrorMgr* error_mgr_;
+
+  // LIBYUV_TRUE iff at least one component has scanline padding. (i.e.,
+  // GetComponentScanlinePadding() != 0.)
+  LIBYUV_BOOL has_scanline_padding_;
+
+  // Temporaries used to point to scanline outputs.
+  int num_outbufs_;  // Outermost size of all arrays below.
+  uint8*** scanlines_;
+  int* scanlines_sizes_;
+  // Temporary buffer used for decoding when we can't decode directly to the
+  // output buffers. Large enough for just one iMCU row.
+  uint8** databuf_;
+  int* databuf_strides_;
+};
+
+}  // namespace libyuv
+
+#endif  //  __cplusplus
+#endif  // INCLUDE_LIBYUV_MJPEG_DECODER_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/planar_functions.h b/libs/libvpx/third_party/libyuv/include/libyuv/planar_functions.h
new file mode 100644
index 0000000000..ae994db899
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/planar_functions.h
@@ -0,0 +1,453 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  // NOLINT
+#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+
+#include "libyuv/basic_types.h"
+
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data.
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+                  uint16* dst_y, int dst_stride_y,
+                  int width, int height);
+
+// Set a plane of data to a 32 bit value.
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+              int width, int height,
+              uint32 value);
+
+// Copy I400.  Supports inverting.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+#define J400ToJ400 I400ToI400
+
+// Copy I422 to I422.
+#define I422ToI422 I422Copy
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height);
+
+// Copy I444 to I444.
+#define I444ToI444 I444Copy
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height);
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+// Convert I420 to I400. (calls CopyPlane ignoring u/v).
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Alias
+#define J420ToJ400 I420ToI400
+#define I420ToI420Mirror I420Mirror
+
+// I420 mirror.
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Alias
+#define I400ToI400Mirror I400Mirror
+
+// I400 mirror.  A single plane is mirrored horizontally.
+// Pass negative height to achieve 180 degree rotation.
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Alias
+#define ARGBToARGBMirror ARGBMirror
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_uv, int src_stride_uv,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height);
+
+// Convert NV21 to RGB565.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_uv, int src_stride_uv,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height);
+
+// I422ToARGB is in convert_argb.h
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height);
+
+// Draw a rectangle into I420.
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int x, int y, int width, int height,
+             int value_y, int value_u, int value_v);
+
+// Draw a rectangle into ARGB.
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+             int x, int y, int width, int height, uint32 value);
+
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+             int x, int y, int width, int height);
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+              int x, int y, int width, int height);
+
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 16 signed coefficients. -128 to 127 representing -2 to 2.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The next 4 coefficients apply to B, G, R, A and produce R of the output.
+// The last 4 coefficients apply to B, G, R, A and produce A of the output.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    const int8* matrix_argb,
+                    int width, int height);
+
+// Deprecated. Use ARGBColorMatrix instead.
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_rgb is 12 signed coefficients. -128 to 127 representing -1 to 1.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The last 4 coefficients apply to B, G, R, A and produce R of the output.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                   const int8* matrix_rgb,
+                   int x, int y, int width, int height);
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                   const uint8* table_argb,
+                   int x, int y, int width, int height);
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                  const uint8* table_argb,
+                  int x, int y, int width, int height);
+
+// Apply a luma/color table to each ARGB pixel but preserve destination alpha.
+// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from
+// RGB (YJ style) and C is an 8 bit color component (R, G or B).
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_argb, int dst_stride_argb,
+                       const uint8* luma_rgb_table,
+                       int width, int height);
+
+// Apply a 3 term polynomial to ARGB values.
+// poly points to a 4x4 matrix.  The first row is constants.  The 2nd row is
+// coefficients for b, g, r and a.  The 3rd row is coefficients for b squared,
+// g squared, r squared and a squared.  The 4th row is coefficients for b to
+// the 3, g to the 3, r to the 3 and a to the 3.  The values are summed and
+// result clamped to 0 to 255.
+// A polynomial approximation can be derived using software such as 'R'.
+
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   const float* poly,
+                   int width, int height);
+
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+                 int scale, int interval_size, int interval_offset,
+                 int x, int y, int width, int height);
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int width, int height);
+
+// Copy Alpha channel of ARGB to alpha of ARGB.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height);
+
+// Copy Y channel to Alpha of ARGB.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+                     uint8* dst_argb, int dst_stride_argb,
+                     int width, int height);
+
+typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
+                             uint8* dst_argb, int width);
+
+// Get the ARGBBlendRow function that Alpha Blends ARGB pixels to destination.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend(void);  /* (void): a real prototype in C as well. */
+
+// Alpha Blend ARGB images and store to destination.
+// Alpha of destination is set to 255.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+              const uint8* src_argb1, int src_stride_argb1,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);
+
+// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height);
+
+// Add ARGB image with ARGB image. Saturates to 255.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+            const uint8* src_argb1, int src_stride_argb1,
+            uint8* dst_argb, int dst_stride_argb,
+            int width, int height);
+
+// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height);
+
+// Convert I422 to YUY2.
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height);
+
+// Convert I422 to UYVY.
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height);
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height);
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height);
+
+// Convert MJPG to ARGB.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+               uint8* argb, int argb_stride,
+               int w, int h, int dw, int dh);
+
+// Internal function - do not call directly.
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+                             int32* dst_cumsum, int dst_stride32_cumsum,
+                             int width, int height);
+
+// Blur ARGB image.
+// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
+//   16 byte boundary.
+// dst_stride32_cumsum is number of ints in a row (width * 4).
+// radius is number of pixels around the center.  e.g. 1 = 3x3. 2=5x5.
+// Blur is optimized for radius of 5 (11x11) or less.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int32* dst_cumsum, int dst_stride32_cumsum,
+             int width, int height, int radius);
+
+// Multiply ARGB image by ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height, uint32 value);
+
+// Interpolate between two ARGB images using specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
+// and 255 means 1% src_argb0 and 99% src_argb1.
+// Internally uses ARGBScale bilinear filtering.
+// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+                    const uint8* src_argb1, int src_stride_argb1,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height, int interpolation);
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif  // PNaCl, CLR, or 32 bit x86 without SSE2.
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_ARGBAFFINEROW_SSE2
+#endif  // x86 target with LIBYUV_DISABLE_X86 unset.
+
+// Row function for copying pixels from a source with a slope to a row
+// of destination. Useful for scaling, rotation, mirror, texture mapping.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+                     uint8* dst_argb, const float* uv_dudv, int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width);
+
+// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
+// shuffler is 16 bytes and must be aligned.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+                uint8* dst_argb, int dst_stride_argb,
+                const uint8* shuffler, int width, int height);
+
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+                     uint8* dst_y, int dst_stride_y,
+                     int width, int height);
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);
+
+// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/rotate.h b/libs/libvpx/third_party/libyuv/include/libyuv/rotate.h
new file mode 100644
index 0000000000..8af60b8955
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/rotate.h
@@ -0,0 +1,117 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported rotation modes. Each enumerator's value is its angle in degrees.
+typedef enum RotationMode {
+  kRotate0 = 0,  // No rotation.
+  kRotate90 = 90,  // Rotate 90 degrees clockwise.
+  kRotate180 = 180,  // Rotate 180 degrees.
+  kRotate270 = 270,  // Rotate 270 degrees clockwise.
+
+  // Deprecated aliases; same values as kRotate0, kRotate90 and kRotate270.
+  kRotateNone = 0,
+  kRotateClockwise = 90,
+  kRotateCounterClockwise = 270,
+} RotationModeEnum;
+
+// Rotate I420 frame.
+LIBYUV_API
+int I420Rotate(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int src_width, int src_height, enum RotationMode mode);
+
+// Rotate NV12 input and store in I420.
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+                     const uint8* src_uv, int src_stride_uv,
+                     uint8* dst_y, int dst_stride_y,
+                     uint8* dst_u, int dst_stride_u,
+                     uint8* dst_v, int dst_stride_v,
+                     int src_width, int src_height, enum RotationMode mode);
+
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+                uint8* dst, int dst_stride,
+                int src_width, int src_height, enum RotationMode mode);
+
+// Rotate planes by 90, 180, 270. Deprecated.
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride,
+                   int width, int height);
+
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height);
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height);
+
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+                uint8* dst_a, int dst_stride_a,
+                uint8* dst_b, int dst_stride_b,
+                int width, int height);
+
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them. Deprecated.
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height);
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height);
+
+// The 90 and 270 functions are based on transposes.
+// Doing a transpose with reversing the read/write
+// order will result in a rotation by +- 90 degrees.
+// Deprecated.
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height);
+
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROTATE_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h b/libs/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h
new file mode 100644
index 0000000000..660ff5573e
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"  // For RotationMode.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Rotate ARGB frame
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int src_width, int src_height, enum RotationMode mode);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROTATE_ARGB_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/rotate_row.h b/libs/libvpx/third_party/libyuv/include/libyuv/rotate_row.h
new file mode 100644
index 0000000000..c41cf32735
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/rotate_row.h
@@ -0,0 +1,138 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif  // PNaCl, CLR, or 32 bit x86 without SSE2.
+
+// Visual C 2012 required for AVX2. Only set for 32 bit x86 (_M_IX86), not clang.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
+// TODO(fbarchard): switch to standard form of inline; fails on clangcl.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#if defined(__APPLE__) && defined(__i386__)
+#define DECLARE_FUNCTION(name)                                                 \
+    ".text                                     \n"                             \
+    ".private_extern _" #name "                \n"                             \
+    ".align 4,0x90                             \n"                             \
+"_" #name ":                                   \n"
+#elif (defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
+#define DECLARE_FUNCTION(name)                                                 \
+    ".text                                     \n"                             \
+    ".align 4,0x90                             \n"                             \
+"_" #name ":                                   \n"
+#else
+#define DECLARE_FUNCTION(name)                                                 \
+    ".text                                     \n"                             \
+    ".align 4,0x90                             \n"                             \
+#name ":                                       \n"
+#endif  // Label is "_name" on 32 bit Apple/MinGW/Cygwin, plain "name" otherwise.
+#endif  // x86 target with LIBYUV_DISABLE_X86 unset.
+
+// The following are available for 32 bit Visual C (_M_IX86), not clang:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    defined(_MSC_VER) && !defined(__clang__)
+#define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+// The following are available for 32 bit GCC, and 64 bit GCC but not NaCL:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+#define HAS_TRANSPOSEWX8_SSSE3
+#endif
+
+// The following are available for 32 bit GCC but not clang:
+#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)  && !defined(__clang__)
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+// The following are available for 64 bit GCC but not NaCL:
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+    defined(__x86_64__)
+#define HAS_TRANSPOSEWX8_FAST_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_TRANSPOSEWX8_NEON
+#define HAS_TRANSPOSEUVWX8_NEON
+#endif  // NEON requested or available, LIBYUV_DISABLE_NEON unset, not NaCL.
+
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+    defined(__mips__) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_TRANSPOSEWX8_MIPS_DSPR2
+#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2  // NOTE(review): lowercase 'x', unlike the other HAS_TRANSPOSEUVWX8_* names.
+#endif  // defined(__mips__); confirm users test the same spelling before renaming.
+
+void TransposeWxH_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width, int height);
+
+void TransposeWx8_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width);
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width);
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width);
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width);
+
+void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
+                           uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
+                            uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
+                                 uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride,
+                                 uint8* dst, int dst_stride, int width);
+
+void TransposeUVWxH_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b,
+                      int width, int height);
+
+void TransposeUVWx8_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+                               uint8* dst_a, int dst_stride_a,
+                               uint8* dst_b, int dst_stride_b, int width);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROTATE_ROW_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/row.h b/libs/libvpx/third_party/libyuv/include/libyuv/row.h
new file mode 100644
index 0000000000..ebae3e7195
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/row.h
@@ -0,0 +1,1856 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROW_H_
+
+#include <stdlib.h>  // For malloc.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))  // a: power of 2
+
+#ifdef __cplusplus  // var = var##_mem (raw malloc) rounded up to 64 bytes.
+#define align_buffer_64(var, size)                                             \
+  uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63));            \
+  uint8* var = reinterpret_cast<uint8*>                                        \
+      ((reinterpret_cast<uintptr_t>(var##_mem) + 63) & ~63)
+#else
+#define align_buffer_64(var, size)                                             \
+  uint8* var##_mem = (uint8*)(malloc((size) + 63));               /* NOLINT */ \
+  uint8* var = (uint8*)(((uintptr_t)(var##_mem) + 63) & ~63)      /* NOLINT */
+#endif  // uintptr_t: unsigned round-up, no signed-overflow corner case.
+
+#define free_aligned_buffer_64(var) \
+  free(var##_mem);  \
+  var = 0
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// Defined when the compiler is targeting SSSE3 as a baseline requirement.
+#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
+#define LIBYUV_SSSE3_ONLY
+#endif
+
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
+#define LIBYUV_DISABLE_NEON
+#endif  // clang >= 3.5
+#endif  // __clang__
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+// Conversions:
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_ARGB1555TOARGBROW_SSE2
+#define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBSETROW_X86
+#define HAS_ARGBSHUFFLEROW_SSE2
+#define HAS_ARGBSHUFFLEROW_SSSE3
+#define HAS_ARGBTOARGB1555ROW_SSE2
+#define HAS_ARGBTOARGB4444ROW_SSE2
+#define HAS_ARGBTORAWROW_SSSE3
+#define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORGB565ROW_SSE2
+#define HAS_ARGBTOUV422ROW_SSSE3
+#define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ARGBTOYJROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_COPYROW_ERMS
+#define HAS_COPYROW_SSE2
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I411TOARGBROW_SSSE3
+#define HAS_I422TOABGRROW_SSSE3
+#define HAS_I422TOARGB1555ROW_SSSE3
+#define HAS_I422TOARGB4444ROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
+#define HAS_I422TOBGRAROW_SSSE3
+#define HAS_I422TORAWROW_SSSE3
+#define HAS_I422TORGB24ROW_SSSE3
+#define HAS_I422TORGB565ROW_SSSE3
+#define HAS_I422TORGBAROW_SSSE3
+#define HAS_I422TOUYVYROW_SSE2
+#define HAS_I422TOYUY2ROW_SSE2
+#define HAS_I444TOARGBROW_SSSE3
+#define HAS_J400TOARGBROW_SSE2
+#define HAS_J422TOARGBROW_SSSE3
+#define HAS_MERGEUVROW_SSE2
+#define HAS_MIRRORROW_SSE2
+#define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORROW_UV_SSSE3
+#define HAS_MIRRORUVROW_SSSE3
+#define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV12TORGB565ROW_SSSE3
+#define HAS_NV21TOARGBROW_SSSE3
+#define HAS_NV21TORGB565ROW_SSSE3
+#define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RAWTOYROW_SSSE3
+#define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RGB565TOARGBROW_SSE2
+#define HAS_RGBATOUVROW_SSSE3
+#define HAS_RGBATOYROW_SSSE3
+#define HAS_SETROW_ERMS
+#define HAS_SETROW_X86
+#define HAS_SPLITUVROW_SSE2
+#define HAS_UYVYTOARGBROW_SSSE3
+#define HAS_UYVYTOUV422ROW_SSE2
+#define HAS_UYVYTOUVROW_SSE2
+#define HAS_UYVYTOYROW_SSE2
+#define HAS_YUY2TOARGBROW_SSSE3
+#define HAS_YUY2TOUV422ROW_SSE2
+#define HAS_YUY2TOUVROW_SSE2
+#define HAS_YUY2TOYROW_SSE2
+
+// Effects:
+#define HAS_ARGBADDROW_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYYTOALPHAROW_SSE2
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSE2
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBPOLYNOMIALROW_SSE2
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_RGBCOLORTABLEROW_X86
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELTOPLANEROW_SSE2
+#define HAS_SOBELXROW_SSE2
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSE2
+#endif
+
+// The following are available on x64 Visual C and clangcl.
+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
+    (!defined(__clang__) || defined(__SSSE3__))
+#define HAS_I422TOARGBROW_SSSE3
+#endif
+
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif  // GNUC >= 4.7
+#endif  // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
+// The following require VS2012 and have not yet been ported to GCC.
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_ARGB1555TOARGBROW_AVX2
+#define HAS_ARGB4444TOARGBROW_AVX2
+#define HAS_ARGBTOARGB1555ROW_AVX2
+#define HAS_ARGBTOARGB4444ROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_SSE2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_I411TOARGBROW_AVX2
+#define HAS_I422TOARGB1555ROW_AVX2
+#define HAS_I422TOARGB4444ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
+#define HAS_I444TOARGBROW_AVX2
+#define HAS_J400TOARGBROW_AVX2
+#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
+#define HAS_NV21TORGB565ROW_AVX2
+#define HAS_RGB565TOARGBROW_AVX2
+#endif
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
+    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_ARGBCOPYALPHAROW_AVX2
+#define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBPOLYNOMIALROW_AVX2
+#define HAS_ARGBSHUFFLEROW_AVX2
+#define HAS_ARGBTOUVROW_AVX2
+#define HAS_ARGBTOYJROW_AVX2
+#define HAS_ARGBTOYROW_AVX2
+#define HAS_COPYROW_AVX
+#define HAS_I400TOARGBROW_AVX2
+#define HAS_I422TOABGRROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TOBGRAROW_AVX2
+#define HAS_I422TORAWROW_AVX2
+#define HAS_I422TORGB24ROW_AVX2
+#define HAS_I422TORGBAROW_AVX2
+#define HAS_INTERPOLATEROW_AVX2
+#define HAS_J422TOARGBROW_AVX2
+#define HAS_MERGEUVROW_AVX2
+#define HAS_MIRRORROW_AVX2
+#define HAS_SPLITUVROW_AVX2
+#define HAS_UYVYTOARGBROW_AVX2
+#define HAS_UYVYTOUV422ROW_AVX2
+#define HAS_UYVYTOUVROW_AVX2
+#define HAS_UYVYTOYROW_AVX2
+#define HAS_YUY2TOARGBROW_AVX2
+#define HAS_YUY2TOUV422ROW_AVX2
+#define HAS_YUY2TOUVROW_AVX2
+#define HAS_YUY2TOYROW_AVX2
+
+// Effects:
+#define HAS_ARGBADDROW_AVX2
+#define HAS_ARGBATTENUATEROW_AVX2
+#define HAS_ARGBMULTIPLYROW_AVX2
+#define HAS_ARGBSUBTRACTROW_AVX2
+#define HAS_ARGBUNATTENUATEROW_AVX2
+#endif
+
+// The following are disabled when SSSE3 is available:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
+    !defined(LIBYUV_SSSE3_ONLY)
+#define HAS_ARGBATTENUATEROW_SSE2
+#define HAS_ARGBBLENDROW_SSE2
+#define HAS_MIRRORROW_SSE2
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_ABGRTOUVROW_NEON
+#define HAS_ABGRTOYROW_NEON
+#define HAS_ARGB1555TOARGBROW_NEON
+#define HAS_ARGB1555TOUVROW_NEON
+#define HAS_ARGB1555TOYROW_NEON
+#define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGB4444TOUVROW_NEON
+#define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBTOARGB1555ROW_NEON
+#define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGB565ROW_NEON
+#define HAS_ARGBTOUV411ROW_NEON
+#define HAS_ARGBTOUV422ROW_NEON
+#define HAS_ARGBTOUV444ROW_NEON
+#define HAS_ARGBTOUVJROW_NEON
+#define HAS_ARGBTOUVROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_BGRATOUVROW_NEON
+#define HAS_BGRATOYROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_J400TOARGBROW_NEON
+#define HAS_I411TOARGBROW_NEON
+#define HAS_I422TOABGRROW_NEON
+#define HAS_I422TOARGB1555ROW_NEON
+#define HAS_I422TOARGB4444ROW_NEON
+#define HAS_I422TOARGBROW_NEON
+#define HAS_I422TOBGRAROW_NEON
+#define HAS_I422TORAWROW_NEON
+#define HAS_I422TORGB24ROW_NEON
+#define HAS_I422TORGB565ROW_NEON
+#define HAS_I422TORGBAROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I444TOARGBROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV12TORGB565ROW_NEON
+#define HAS_NV21TOARGBROW_NEON
+#define HAS_NV21TORGB565ROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYROW_NEON
+#define HAS_RGB565TOARGBROW_NEON
+#define HAS_RGB565TOUVROW_NEON
+#define HAS_RGB565TOYROW_NEON
+#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_ARGBSETROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_UYVYTOARGBROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_I400TOARGBROW_NEON
+#define HAS_YUY2TOARGBROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_YUY2TOYROW_NEON
+#define HAS_ARGBTORGB565DITHERROW_NEON
+
+// Effects:
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
+#define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBGRAYROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBQUANTIZEROW_NEON
+#define HAS_ARGBSEPIAROW_NEON
+#define HAS_ARGBSHADEROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_INTERPOLATEROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELYROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+#define HAS_COPYROW_MIPS
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_I422TOABGRROW_MIPS_DSPR2
+#define HAS_I422TOARGBROW_MIPS_DSPR2
+#define HAS_I422TOBGRAROW_MIPS_DSPR2
+#define HAS_INTERPOLATEROW_MIPS_DSPR2
+#define HAS_MIRRORROW_MIPS_DSPR2
+#define HAS_MIRRORUVROW_MIPS_DSPR2
+#define HAS_SPLITUVROW_MIPS_DSPR2
+#endif
+#endif
+
+#if defined(_MSC_VER) && !defined(__CLR_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#define SIMD_ALIGNED32(var) __declspec(align(64)) var  // 64-byte, despite name
+typedef __declspec(align(16)) int16 vec16[8];
+typedef __declspec(align(16)) int32 vec32[4];
+typedef __declspec(align(16)) int8 vec8[16];
+typedef __declspec(align(16)) uint16 uvec16[8];
+typedef __declspec(align(16)) uint32 uvec32[4];
+typedef __declspec(align(16)) uint8 uvec8[16];
+typedef __declspec(align(32)) int16 lvec16[16];
+typedef __declspec(align(32)) int32 lvec32[8];
+typedef __declspec(align(32)) int8 lvec8[32];
+typedef __declspec(align(32)) uint16 ulvec16[16];
+typedef __declspec(align(32)) uint32 ulvec32[8];
+typedef __declspec(align(32)) uint8 ulvec8[32];
+#elif defined(__GNUC__)  // Uses vector_size types, not arrays.
+// Caveat: GCC 4.2 to 4.7 have a known issue using vectors with const.
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#define SIMD_ALIGNED32(var) var __attribute__((aligned(64)))  // 64, not 32
+typedef int16 __attribute__((vector_size(16))) vec16;
+typedef int32 __attribute__((vector_size(16))) vec32;
+typedef int8 __attribute__((vector_size(16))) vec8;
+typedef uint16 __attribute__((vector_size(16))) uvec16;
+typedef uint32 __attribute__((vector_size(16))) uvec32;
+typedef uint8 __attribute__((vector_size(16))) uvec8;
+typedef int16 __attribute__((vector_size(32))) lvec16;
+typedef int32 __attribute__((vector_size(32))) lvec32;
+typedef int8 __attribute__((vector_size(32))) lvec8;
+typedef uint16 __attribute__((vector_size(32))) ulvec16;
+typedef uint32 __attribute__((vector_size(32))) ulvec32;
+typedef uint8 __attribute__((vector_size(32))) ulvec8;
+#else  // Other compilers: no alignment attribute; plain array typedefs.
+#define SIMD_ALIGNED(var) var
+#define SIMD_ALIGNED32(var) var
+typedef int16 vec16[8];
+typedef int32 vec32[4];
+typedef int8 vec8[16];
+typedef uint16 uvec16[8];
+typedef uint32 uvec32[4];
+typedef uint8 uvec8[16];
+typedef int16 lvec16[16];
+typedef int32 lvec32[8];
+typedef int8 lvec8[32];
+typedef uint16 ulvec16[16];
+typedef uint32 ulvec32[8];
+typedef uint8 ulvec8[32];
+#endif
+
+#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
+#define OMITFP  // Expands to nothing on these targets.
+#else  // Expands to an optimize("omit-frame-pointer") attribute.
+#define OMITFP __attribute__((optimize("omit-frame-pointer")))
+#endif
+
+// NaCL macros for GCC x86 and x64.
+#if defined(__native_client__)
+#define LABELALIGN ".p2align 5\n"
+#else
+#define LABELALIGN
+#endif
+#if defined(__native_client__) && defined(__x86_64__)
+// r14 is used for MEMOP macros.
+#define NACL_R14 "r14",
+#define BUNDLELOCK ".bundle_lock\n"
+#define BUNDLEUNLOCK ".bundle_unlock\n"
+#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
+#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
+#define MEMLEA(offset, base) #offset "(%q" #base ")"
+#define MEMLEA3(offset, index, scale) \
+    #offset "(,%q" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+    #offset "(%q" #base ",%q" #index "," #scale ")"
+#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
+#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%%" #reg "\n" \
+    BUNDLEUNLOCK
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " %%" #reg ",(%%r15,%%r14)\n" \
+    BUNDLEUNLOCK
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%" #arg "\n" \
+    BUNDLEUNLOCK
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \
+    BUNDLEUNLOCK
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \
+    BUNDLEUNLOCK
+#else  // defined(__native_client__) && defined(__x86_64__)
+#define NACL_R14
+#define BUNDLEALIGN
+#define MEMACCESS(base) "(%" #base ")"
+#define MEMACCESS2(offset, base) #offset "(%" #base ")"
+#define MEMLEA(offset, base) #offset "(%" #base ")"
+#define MEMLEA3(offset, index, scale) \
+    #offset "(,%" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+    #offset "(%" #base ",%" #index "," #scale ")"
+#define MEMMOVESTRING(s, d)
+#define MEMSTORESTRING(reg, d)
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+    #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \
+    #reg2 "\n"
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+    #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+#endif  // defined(__native_client__) && defined(__x86_64__)
+
+#if defined(__arm__) || defined(__aarch64__)
+#undef MEMACCESS
+#if defined(__native_client__)
+#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
+#else
+#define MEMACCESS(base)
+#endif
+#endif
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I422ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I411ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I422ToBGRARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_bgra,
+                        int width);
+void I422ToABGRRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_abgr,
+                        int width);
+void I422ToRGBARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_rgba,
+                        int width);
+void I422ToRGB24Row_NEON(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         int width);
+void I422ToRAWRow_NEON(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_raw,
+                       int width);
+void I422ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          int width);
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            int width);
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            int width);
+void NV12ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        int width);
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_vu,
+                        uint8* dst_argb,
+                        int width);
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          int width);
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_vu,
+                          uint8* dst_rgb565,
+                          int width);
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        int width);
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        int width);
+
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix);
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix);
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix);
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int pix);
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int pix);
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int pix);
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix);
+void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
+
+void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+                       uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+                            uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+                           uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                             int pix);
+void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                             int pix);
+void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                             int pix);
+void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+                           uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                           uint8* dst_u, uint8* dst_v, int pix);
+void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                            uint8* dst_u, uint8* dst_v, int pix);
+void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
+                              int src_stride_argb1555,
+                              uint8* dst_u, uint8* dst_v, int pix);
+void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
+                              int src_stride_argb4444,
+                              uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
+                   uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
+                   uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
+                   uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
+                    uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
+                  uint8* dst_u, uint8* dst_v, int width);
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+                     uint8* dst_u, uint8* dst_v, int width);
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+                       uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_SSSE3(const uint8* src_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
+                              uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV422Row_SSSE3(const uint8* src_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
+                              uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV411Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJ422Row_C(const uint8* src_argb,
+                       uint8* dst_u, uint8* dst_v, int width);
+
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
+void MirrorRow_C(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+
+void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                       int width);
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width);
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                            int width);
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                   int width);
+
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                           int pix);
+void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                         int pix);
+void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                         int pix);
+void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                         int pix);
+void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                               int pix);
+
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width);
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width);
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width);
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width);
+void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                         int width);
+void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                         int width);
+void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                         int width);
+
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
+void CopyRow_NEON(const uint8* src, uint8* dst, int count);
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
+void CopyRow_C(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
+
+void CopyRow_16_C(const uint16* src, uint16* dst, int count);
+
+void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+
+void SetRow_C(uint8* dst, uint8 v8, int count);
+void SetRow_X86(uint8* dst, uint8 v8, int count);
+void SetRow_ERMS(uint8* dst, uint8 v8, int count);
+void SetRow_NEON(uint8* dst, uint8 v8, int count);
+void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
+void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
+
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
+
+// ARGBShuffleRow variants, used for BGRAToARGB etc.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+                      const uint8* shuffler, int pix);
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix);
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int pix);
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix);
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int pix);
+
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix);
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix);
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix);
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix);
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix);
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix);
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
+
+void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+                              int pix);
+void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                                int pix);
+void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+                                int pix);
+void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+                              int pix);
+void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                                int pix);
+void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                                int pix);
+
+void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
+                              int pix);
+void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                                int pix);
+void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                                int pix);
+
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+                             const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int pix);
+
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width);
+
+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     int width);
+void I422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     int width);
+void I411ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     int width);
+void NV12ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_uv,
+                     uint8* dst_argb,
+                     int width);
+void NV21ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_vu,
+                       uint8* dst_argb,
+                       int width);
+void NV12ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_uv,
+                       uint8* dst_argb,
+                       int width);
+void NV21ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_vu,
+                     uint8* dst_argb,
+                     int width);
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+                     uint8* dst_argb,
+                     int width);
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+                     uint8* dst_argb,
+                     int width);
+void J422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     int width);
+void I422ToBGRARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_bgra,
+                     int width);
+void I422ToABGRRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_abgr,
+                     int width);
+void I422ToRGBARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_rgba,
+                     int width);
+void I422ToRGB24Row_C(const uint8* src_y,
+                      const uint8* src_u,
+                      const uint8* src_v,
+                      uint8* dst_rgb24,
+                      int width);
+void I422ToRAWRow_C(const uint8* src_y,
+                    const uint8* src_u,
+                    const uint8* src_v,
+                    uint8* dst_raw,
+                    int width);
+void I422ToARGB4444Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb4444,
+                         int width);
+void I422ToARGB1555Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb1555,
+                         int width);
+void I422ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_rgb565,
+                       int width);
+void I422ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I422ToBGRARow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I422ToRGBARow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I422ToABGRRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I444ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         int width);
+void I444ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I422ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         int width);
+void I411ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         int width);
+void I411ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void NV12ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_uv,
+                         uint8* dst_argb,
+                         int width);
+void NV21ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_vu,
+                         uint8* dst_argb,
+                         int width);
+void NV12ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        int width);
+void NV21ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_vu,
+                        uint8* dst_argb,
+                        int width);
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_uv,
+                           uint8* dst_argb,
+                           int width);
+void NV21ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_vu,
+                           uint8* dst_argb,
+                           int width);
+void NV12ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_argb,
+                          int width);
+void NV21ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_vu,
+                          uint8* dst_argb,
+                          int width);
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+                         uint8* dst_argb,
+                         int width);
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+                         uint8* dst_argb,
+                         int width);
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        int width);
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        int width);
+void J422ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         int width);
+void J422ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I422ToBGRARow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_bgra,
+                         int width);
+void I422ToABGRRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_abgr,
+                         int width);
+void I422ToRGBARow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgba,
+                         int width);
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb,
+                           int width);
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_argb,
+                          int width);
+void I422ToRGB24Row_SSSE3(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb24,
+                          int width);
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         int width);
+void I422ToRAWRow_SSSE3(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_raw,
+                        int width);
+void I422ToRAWRow_AVX2(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_raw,
+                       int width);
+void I422ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToBGRARow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToRGBARow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToABGRRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void I444ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void I411ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_uv,
+                             uint8* dst_argb,
+                             int width);
+void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_vu,
+                             uint8* dst_argb,
+                             int width);
+void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_uv,
+                            uint8* dst_argb,
+                            int width);
+void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_vu,
+                            uint8* dst_argb,
+                            int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
+                               const uint8* src_uv,
+                               uint8* dst_argb,
+                               int width);
+void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y,
+                               const uint8* src_vu,
+                               uint8* dst_argb,
+                               int width);
+void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
+                              const uint8* src_uv,
+                              uint8* dst_argb,
+                              int width);
+void NV21ToRGB565Row_Any_AVX2(const uint8* src_y,
+                              const uint8* src_vu,
+                              uint8* dst_argb,
+                              int width);
+void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
+                             uint8* dst_argb,
+                             int width);
+void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
+                             uint8* dst_argb,
+                             int width);
+void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
+                            uint8* dst_argb,
+                            int width);
+void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
+                            uint8* dst_argb,
+                            int width);
+void J422ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void J422ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_bgra,
+                             int width);
+void I422ToABGRRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_abgr,
+                             int width);
+void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_rgba,
+                             int width);
+void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
+                                 const uint8* src_u,
+                                 const uint8* src_v,
+                                 uint8* dst_argb4444,
+                                 int width);
+void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_argb4444,
+                                int width);
+void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
+                                 const uint8* src_u,
+                                 const uint8* src_v,
+                                 uint8* dst_argb1555,
+                                 int width);
+void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_argb1555,
+                                int width);
+void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
+                               const uint8* src_u,
+                               const uint8* src_v,
+                               uint8* dst_rgb565,
+                               int width);
+void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_rgb565,
+                              int width);
+void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              int width);
+void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToRAWRow_Any_AVX2(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb,
+                           int width);
+
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+
+// ARGB preattenuated alpha blend.
+void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
+                        uint8* dst_argb, int width);
+void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
+                    uint8* dst_argb, int width);
+
+// ARGB multiply images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+
+// ARGB add images.
+void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
+                  uint8* dst_argb, int width);
+void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+
+// ARGB subtract images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+
+void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int pix);
+
+void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int width);
+
+void I444ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I411ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToBGRARow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToABGRRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToRGBARow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToRGB24Row_Any_NEON(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void I422ToRAWRow_Any_NEON(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb,
+                           int width);
+void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_argb,
+                                int width);
+void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_argb,
+                                int width);
+void I422ToRGB565Row_Any_NEON(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              int width);
+void NV12ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_uv,
+                            uint8* dst_argb,
+                            int width);
+void NV21ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_uv,
+                            uint8* dst_argb,
+                            int width);
+void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
+                              const uint8* src_uv,
+                              uint8* dst_argb,
+                              int width);
+void NV21ToRGB565Row_Any_NEON(const uint8* src_y,
+                              const uint8* src_uv,
+                              uint8* dst_argb,
+                              int width);
+void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
+                            uint8* dst_argb,
+                            int width);
+void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
+                            uint8* dst_argb,
+                            int width);
+void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              int width);
+void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              int width);
+void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              int width);
+void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,  // Redundant redeclaration.
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              int width);
+void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,  // Redundant redeclaration.
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              int width);
+void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,  // Redundant redeclaration.
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              int width);
+
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
+                   uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);  // Redeclared.
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,  // Redeclared.
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,  // Redeclared.
+                         uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
+
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
+                   uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
+
+void I422ToYUY2Row_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_yuy2, int width);
+void I422ToUYVYRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_uyvy, int width);
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width);
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_uyvy, int width);
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width);
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_uyvy, int width);
+
+// Effects related row functions.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                int width);
+void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+
+// Inverse table for unattenuate, shared by C and SSE2.
+extern const uint32 fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                                 int width);
+void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                                 int width);
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBSepiaRow_C(uint8* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
+
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+                          const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width);
+
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+                       int interval_offset, int width);
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width);
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width);
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                    uint32 value);
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value);
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value);
+
+// Used for blur.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+                                    int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+                                  const int32* previous_cumsum, int width);
+
+void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
+                                 int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+                               const int32* previous_cumsum, int width);
+
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+                     uint8* dst_argb, const float* uv_dudv, int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width);
+
+// Used for I420Scale, ARGBScale, and ARGBInterpolate.
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+                      ptrdiff_t src_stride_ptr,
+                      int width, int source_y_fraction);
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride_ptr, int width,
+                         int source_y_fraction);
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride_ptr, int width,
+                          int source_y_fraction);
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride_ptr, int width,
+                         int source_y_fraction);
+void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride_ptr, int width,
+                         int source_y_fraction);
+void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                               ptrdiff_t src_stride_ptr, int width,
+                               int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                             ptrdiff_t src_stride_ptr, int width,
+                             int source_y_fraction);
+void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                             ptrdiff_t src_stride_ptr, int width,
+                             int source_y_fraction);
+void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                              ptrdiff_t src_stride_ptr, int width,
+                              int source_y_fraction);
+void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                             ptrdiff_t src_stride_ptr, int width,
+                             int source_y_fraction);
+void InterpolateRow_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                                   ptrdiff_t src_stride_ptr, int width,
+                                   int source_y_fraction);
+
+void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                         ptrdiff_t src_stride_ptr,
+                         int width, int source_y_fraction);
+
+// Sobel images.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+                 uint8* dst_sobelx, int width);
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+                 uint8* dst_sobely, int width);
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width);
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width);
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                uint8* dst_argb, int width);
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width);
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width);
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_y, int width);
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width);
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width);
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                  uint8* dst_argb, int width);
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width);
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width);
+void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width);
+void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width);
+void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                              uint8* dst_y, int width);
+void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                              uint8* dst_y, int width);
+void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                         uint8* dst_argb, int width);
+void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                         uint8* dst_argb, int width);
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+                         uint8* dst_argb, const float* poly,
+                         int width);
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width);
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width);
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                             const uint8* luma, uint32 lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                 int width,
+                                 const uint8* luma, uint32 lumacoeff);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROW_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/scale.h b/libs/libvpx/third_party/libyuv/include/libyuv/scale.h
new file mode 100644
index 0000000000..102158d1ab
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/scale.h
@@ -0,0 +1,103 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_H_  // NOLINT
+#define INCLUDE_LIBYUV_SCALE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported filtering modes; passed as the 'filtering' argument below.
+typedef enum FilterMode {
+  kFilterNone = 0,  // Point sample; Fastest.
+  kFilterLinear = 1,  // Filter horizontally only.
+  kFilterBilinear = 2,  // Faster than box, but lower quality scaling down.
+  kFilterBox = 3  // Highest quality.
+} FilterModeEnum;
+
+// Scale a YUV plane. ScalePlane_16 takes uint16 samples instead of uint8.
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+                int src_width, int src_height,
+                uint8* dst, int dst_stride,
+                int dst_width, int dst_height,
+                enum FilterMode filtering);
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+                   int src_width, int src_height,
+                   uint16* dst, int dst_stride,
+                   int dst_width, int dst_height,
+                   enum FilterMode filtering);
+
+// Scales a YUV 4:2:0 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterLinear, the image is filtered horizontally only.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+LIBYUV_API
+int I420Scale(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              int src_width, int src_height,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int dst_width, int dst_height,
+              enum FilterMode filtering);
+
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+                 const uint16* src_u, int src_stride_u,
+                 const uint16* src_v, int src_stride_v,
+                 int src_width, int src_height,
+                 uint16* dst_y, int dst_stride_y,
+                 uint16* dst_u, int dst_stride_u,
+                 uint16* dst_v, int dst_stride_v,
+                 int dst_width, int dst_height,
+                 enum FilterMode filtering);
+
+#ifdef __cplusplus  // The declarations in this guard are hidden from C callers.
+// Legacy API.  Deprecated.
+LIBYUV_API
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+          int src_stride_y, int src_stride_u, int src_stride_v,
+          int src_width, int src_height,
+          uint8* dst_y, uint8* dst_u, uint8* dst_v,
+          int dst_stride_y, int dst_stride_u, int dst_stride_v,
+          int dst_width, int dst_height,
+          LIBYUV_BOOL interpolate);
+
+// Legacy API.  Deprecated.
+LIBYUV_API
+int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
+                uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
+                LIBYUV_BOOL interpolate);
+
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
+void SetUseReferenceImpl(LIBYUV_BOOL use);
+#endif  // __cplusplus
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/scale_argb.h b/libs/libvpx/third_party/libyuv/include/libyuv/scale_argb.h
new file mode 100644
index 0000000000..0c9b362575
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/scale_argb.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"  // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+              int src_width, int src_height,
+              uint8* dst_argb, int dst_stride_argb,
+              int dst_width, int dst_height,
+              enum FilterMode filtering);
+
+// Clipped scale takes destination rectangle coordinates for clip values.
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+                  int src_width, int src_height,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int dst_width, int dst_height,
+                  int clip_x, int clip_y, int clip_width, int clip_height,
+                  enum FilterMode filtering);
+
+// TODO(fbarchard): Implement this.
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint32 src_fourcc,
+                       int src_width, int src_height,
+                       uint8* dst_argb, int dst_stride_argb,
+                       uint32 dst_fourcc,
+                       int dst_width, int dst_height,
+                       int clip_x, int clip_y, int clip_width, int clip_height,
+                       enum FilterMode filtering);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/scale_row.h b/libs/libvpx/third_party/libyuv/include/libyuv/scale_row.h
new file mode 100644
index 0000000000..94ad9cf86b
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/scale_row.h
@@ -0,0 +1,478 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ROW_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_FIXEDDIV1_X86
+#define HAS_FIXEDDIV_X86
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
+#define HAS_SCALEARGBROWDOWN2_SSE2
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALEROWDOWN2_SSE2
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEROWDOWN4_SSE2
+#endif
+
+// The following are available on VS2012:
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_SCALEADDROW_AVX2
+#define HAS_SCALEROWDOWN2_AVX2
+#define HAS_SCALEROWDOWN4_AVX2
+#endif
+
+// The following are available on Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
+#define HAS_SCALEADDROW_SSE2
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEFILTERCOLS_NEON
+#define HAS_SCALEROWDOWN2_NEON
+#define HAS_SCALEROWDOWN34_NEON
+#define HAS_SCALEROWDOWN38_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+    defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_SCALEROWDOWN2_MIPS_DSPR2
+#define HAS_SCALEROWDOWN4_MIPS_DSPR2
+#define HAS_SCALEROWDOWN34_MIPS_DSPR2
+#define HAS_SCALEROWDOWN38_MIPS_DSPR2
+#endif
+
+// Scale ARGB vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+                        int dst_width, int dst_height,
+                        int src_stride, int dst_stride,
+                        const uint8* src_argb, uint8* dst_argb,
+                        int x, int y, int dy,
+                        int bpp, enum FilterMode filtering);
+
+void ScalePlaneVertical_16(int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint16* src_argb, uint16* dst_argb,
+                           int x, int y, int dy,
+                           int wpp, enum FilterMode filtering);
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  enum FilterMode filtering);
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div);
+int FixedDiv_X86(int num, int div);
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div);
+int FixedDiv1_X86(int num, int div);
+#ifdef HAS_FIXEDDIV_X86
+#define FixedDiv FixedDiv_X86
+#define FixedDiv1 FixedDiv1_X86
+#else
+#define FixedDiv FixedDiv_C
+#define FixedDiv1 FixedDiv1_C
+#endif
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width, int src_height,
+                int dst_width, int dst_height,
+                enum FilterMode filtering,
+                int* x, int* y, int* dx, int* dy);
+
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width);
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width);
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                              uint16* dst, int dst_width);
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width);
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width);
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width);
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width);
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width);
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width);
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                 int dst_width, int x, int dx);
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                    int dst_width, int x, int dx);
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+                    int dst_width, int, int);
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int, int);
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx);
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                          int dst_width, int x, int dx);
+void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
+                         int dst_width, int x, int dx);
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                            int dst_width, int x, int dx);
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width);
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width);
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width);
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+                         ptrdiff_t src_stride,
+                         uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            int src_stepx,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+                     int dst_width, int x, int dx);
+void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
+                       int dst_width, int x, int dx);
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int, int);
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx);
+void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
+                             int dst_width, int x, int dx);
+
+// Specialized scalers for x86.
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx);
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx);
+
+
+// ARGB Column functions
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx);
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx);
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx);
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx);
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                                  int dst_width, int x, int dx);
+void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                            int dst_width, int x, int dx);
+
+// ARGB Row functions
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst, int dst_width);
+
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+
+// ScaleRowDown2Box also used by planar functions
+// NEON downscalers with interpolation.
+
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width);
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+
+// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst, int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+// 32 -> 12
+void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx);
+
+void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                              int dst_width, int x, int dx);
+
+
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                 uint8* dst, int dst_width);
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                 uint8* dst, int dst_width);
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* d, int dst_width);
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_ROW_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/version.h b/libs/libvpx/third_party/libyuv/include/libyuv/version.h
new file mode 100644
index 0000000000..9d1d746c22
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/version.h
@@ -0,0 +1,16 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
+#define INCLUDE_LIBYUV_VERSION_H_
+
+#define LIBYUV_VERSION 1456
+
+#endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/video_common.h b/libs/libvpx/third_party/libyuv/include/libyuv/video_common.h
new file mode 100644
index 0000000000..cb6582f24d
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/video_common.h
@@ -0,0 +1,182 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Common definitions for video, including fourcc and VideoFormat.
+
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_  // NOLINT
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+
+// Convert four characters to a FourCC code.
+// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#ifdef __cplusplus
+#define FOURCC(a, b, c, d) ( \
+    (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
+    (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#else
+#define FOURCC(a, b, c, d) ( \
+    ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
+    ((uint32)(c) << 16) | ((uint32)(d) << 24))  /* NOLINT */
+#endif
+
+// Some pages discussing FourCC codes:
+//   http://www.fourcc.org/yuv.php
+//   http://v4l2spec.bytesex.org/spec/book1.htm
+//   http://developer.apple.com/quicktime/icefloe/dispatch020.html
+//   http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+//   http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+
+// FourCC codes grouped according to implementation efficiency.
+// Primary formats should convert in 1 efficient step.
+// Secondary formats are converted in 2 steps.
+// Auxiliary formats call primary converters.
+enum FourCC {
+  // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+  FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+  FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+  FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+  FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+  FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+  FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+  FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+  FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+  // 2 Secondary YUV formats: row biplanar.
+  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
+
+  // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+  FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+  FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+  FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+  FOURCC_RAW  = FOURCC('r', 'a', 'w', ' '),
+  FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+  FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
+  FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
+  FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
+
+  // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
+  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+  // 1 Primary Compressed YUV format.
+  FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+  // 6 Auxiliary YUV variations: 3 with U/V planes swapped, 1 alias, 2 J4xx.
+  FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+  FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+  FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+  FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
+  FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+  FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+
+  // 17 Auxiliary aliases.  CanonicalFourCC() maps these to canonical fourcc.
+  FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
+  FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'),  // Alias for I422.
+  FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'),  // Alias for I444.
+  FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2.
+  FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac.
+  FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY.
+  FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY on Mac.
+  FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG.
+  FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'),  // Alias for MJPG on Mac.
+  FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR.
+  FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW.
+  FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG.
+  FOURCC_CM32 = FOURCC(0, 0, 0, 32),  // Alias for BGRA kCMPixelFormat_32ARGB
+  FOURCC_CM24 = FOURCC(0, 0, 0, 24),  // Alias for RAW kCMPixelFormat_24RGB
+  FOURCC_L555 = FOURCC('L', '5', '5', '5'),  // Alias for RGBO.
+  FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
+  FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
+
+  // 1 Auxiliary compressed YUV format set aside for capturer.
+  FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+  // Match any fourcc.
+  FOURCC_ANY = -1,
+};
+
+enum FourCCBpp {
+  // Canonical fourcc codes used in our code.
+  FOURCC_BPP_I420 = 12,
+  FOURCC_BPP_I422 = 16,
+  FOURCC_BPP_I444 = 24,
+  FOURCC_BPP_I411 = 12,
+  FOURCC_BPP_I400 = 8,
+  FOURCC_BPP_NV21 = 12,
+  FOURCC_BPP_NV12 = 12,
+  FOURCC_BPP_YUY2 = 16,
+  FOURCC_BPP_UYVY = 16,
+  FOURCC_BPP_M420 = 12,
+  FOURCC_BPP_Q420 = 12,
+  FOURCC_BPP_ARGB = 32,
+  FOURCC_BPP_BGRA = 32,
+  FOURCC_BPP_ABGR = 32,
+  FOURCC_BPP_RGBA = 32,
+  FOURCC_BPP_24BG = 24,
+  FOURCC_BPP_RAW  = 24,
+  FOURCC_BPP_RGBP = 16,
+  FOURCC_BPP_RGBO = 16,
+  FOURCC_BPP_R444 = 16,
+  FOURCC_BPP_RGGB = 8,
+  FOURCC_BPP_BGGR = 8,
+  FOURCC_BPP_GRBG = 8,
+  FOURCC_BPP_GBRG = 8,
+  FOURCC_BPP_YV12 = 12,
+  FOURCC_BPP_YV16 = 16,
+  FOURCC_BPP_YV24 = 24,
+  FOURCC_BPP_YU12 = 12,
+  FOURCC_BPP_J420 = 12,
+  FOURCC_BPP_J400 = 8,
+  FOURCC_BPP_MJPG = 0,  // 0 means unknown.
+  FOURCC_BPP_H264 = 0,
+  FOURCC_BPP_IYUV = 12,
+  FOURCC_BPP_YU16 = 16,
+  FOURCC_BPP_YU24 = 24,
+  FOURCC_BPP_YUYV = 16,
+  FOURCC_BPP_YUVS = 16,
+  FOURCC_BPP_HDYC = 16,
+  FOURCC_BPP_2VUY = 16,
+  FOURCC_BPP_JPEG = 1,
+  FOURCC_BPP_DMB1 = 1,
+  FOURCC_BPP_BA81 = 8,
+  FOURCC_BPP_RGB3 = 24,
+  FOURCC_BPP_BGR3 = 24,
+  FOURCC_BPP_CM32 = 32,
+  FOURCC_BPP_CM24 = 24,
+
+  // Match any fourcc.
+  FOURCC_BPP_ANY  = 0,  // 0 means unknown.
+};
+
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_  NOLINT
diff --git a/libs/libvpx/third_party/libyuv/source/compare.cc b/libs/libvpx/third_party/libyuv/source/compare.cc
new file mode 100644
index 0000000000..46aa8473d2
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/compare.cc
@@ -0,0 +1,373 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/compare.h"
+
+#include <float.h>
+#include <math.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
+
+// SIMD hash variants: Visual C x86, or gcc-style x86_64 / non-PIC i386.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
+#define HAS_HASHDJB2_SSE41
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+
+#ifdef VISUALC_HAS_AVX2
+#define HAS_HASHDJB2_AVX2
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+#endif
+
+#endif  // HAS_HASHDJB2_SSE41
+
+// Hashes count bytes of src with djb2; hash seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+  const int kBlockSize = 1 << 15;  // 32768;
+  int remainder;
+  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+#if defined(HAS_HASHDJB2_SSE41)
+  if (TestCpuFlag(kCpuHasSSE41)) {
+    HashDjb2_SSE = HashDjb2_SSE41;
+  }
+#endif
+#if defined(HAS_HASHDJB2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    HashDjb2_SSE = HashDjb2_AVX2;
+  }
+#endif
+  // Feed fixed 32768 byte blocks: the callee takes int, count is uint64.
+  while (count >= (uint64)(kBlockSize)) {
+    seed = HashDjb2_SSE(src, kBlockSize, seed);
+    src += kBlockSize;
+    count -= kBlockSize;
+  }
+  remainder = (int)(count) & ~15;  // Largest multiple of 16 still left.
+  if (remainder) {
+    seed = HashDjb2_SSE(src, remainder, seed);
+    src += remainder;
+    count -= remainder;
+  }
+  remainder = (int)(count) & 15;  // Final 0..15 bytes use the C version.
+  if (remainder) {
+    seed = HashDjb2_C(src, remainder, seed);
+  }
+  return seed;
+}
+
+static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
+      return FOURCC_BGRA;
+    }
+    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
+      return FOURCC_ARGB;
+    }
+    if (argb[4] != 255) {  // Second pixel first byte is not Alpha of 255.
+      return FOURCC_BGRA;
+    }
+    if (argb[7] != 255) {  // Second pixel 4th byte is not Alpha of 255.
+      return FOURCC_ARGB;
+    }
+    argb += 8;
+  }
+  if (width & 1) {
+    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
+      return FOURCC_BGRA;
+    }
+    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
+      return FOURCC_ARGB;
+    }
+  }
+  return 0;
+}
+
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
+  uint32 fourcc = 0;
+  int h;
+
+  // Coalesce rows.
+  if (stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    stride_argb = 0;
+  }
+  for (h = 0; h < height && fourcc == 0; ++h) {
+    fourcc = ARGBDetectRow_C(argb, width);
+    argb += stride_argb;
+  }
+  return fourcc;
+}
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SUMSQUAREERROR_NEON
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_SUMSQUAREERROR_SSE2
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
+#endif
+
+#ifdef VISUALC_HAS_AVX2
+#define HAS_SUMSQUAREERROR_AVX2
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
+#endif
+
+// TODO(fbarchard): Refactor into row function.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
+                             int count) {
+  // SumSquareError returns values 0 to 65535 for each squared difference.
+  // Up to 65536 of those can be summed and remain within a uint32.
+  // After each block of 65536 pixels, accumulate into a uint64.
+  const int kBlockSize = 65536;
+  int remainder = count & (kBlockSize - 1) & ~31;
+  uint64 sse = 0;
+  int i;
+  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+      SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SumSquareError = SumSquareError_NEON;
+  }
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    // Note only used for multiples of 16 so count is not checked.
+    SumSquareError = SumSquareError_SSE2;
+  }
+#endif
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    // Note only used for multiples of 32 so count is not checked.
+    SumSquareError = SumSquareError_AVX2;
+  }
+#endif
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+  }
+  src_a += count & ~(kBlockSize - 1);
+  src_b += count & ~(kBlockSize - 1);
+  if (remainder) {
+    sse += SumSquareError(src_a, src_b, remainder);
+    src_a += remainder;
+    src_b += remainder;
+  }
+  remainder = count & 31;
+  if (remainder) {
+    sse += SumSquareError_C(src_a, src_b, remainder);
+  }
+  return sse;
+}
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+                                  const uint8* src_b, int stride_b,
+                                  int width, int height) {
+  uint64 sse = 0;
+  int h;
+  // Coalesce rows.
+  if (stride_a == width &&
+      stride_b == width) {
+    width *= height;
+    height = 1;
+    stride_a = stride_b = 0;
+  }
+  for (h = 0; h < height; ++h) {
+    sse += ComputeSumSquareError(src_a, src_b, width);
+    src_a += stride_a;
+    src_b += stride_b;
+  }
+  return sse;
+}
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+  // PSNR = 10 * log10(255^2 / MSE); count / sse is 1 / MSE.
+  double ratio;
+  double result;
+  if (sse == 0) {
+    return kMaxPsnr;  // No error at all: avoid dividing by zero.
+  }
+  ratio = (double)(count) / (double)(sse);
+  result = 10.0 * log10(255.0 * 255.0 * ratio);
+  if (result > kMaxPsnr) {
+    return kMaxPsnr;  // Clamp to the maximum reported PSNR.
+  }
+  return result;
+}
+
+// PSNR of a single plane; samples is widened so width * height cannot wrap.
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height) {
+  const uint64 samples = (uint64)width * (uint64)height;
+  const uint64 sse = ComputeSumSquareErrorPlane(
+      src_a, stride_a, src_b, stride_b, width, height);
+  return SumSquareErrorToPsnr(sse, samples);
+}
+
+// Combined PSNR over the Y, U and V planes of two I420 frames.
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height) {
+  const uint64 sse_y = ComputeSumSquareErrorPlane(
+      src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
+  const int width_uv = (width + 1) >> 1;
+  const int height_uv = (height + 1) >> 1;
+  const uint64 sse_u = ComputeSumSquareErrorPlane(
+      src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv);
+  const uint64 sse_v = ComputeSumSquareErrorPlane(
+      src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv);
+  // Widen before multiplying so large frames cannot overflow int.
+  const uint64 samples = (uint64)width * (uint64)height +
+                         2 * ((uint64)width_uv * (uint64)height_uv);
+  const uint64 sse = sse_y + sse_u + sse_v;
+  return SumSquareErrorToPsnr(sse, samples);
+}
+
+static const int64 cc1 =  26634;  // (64^2*(.01*255)^2
+static const int64 cc2 = 239708;  // (64^2*(.03*255)^2
+
+static double Ssim8x8_C(const uint8* src_a, int stride_a,
+                        const uint8* src_b, int stride_b) {
+  int64 sum_a = 0;
+  int64 sum_b = 0;
+  int64 sum_sq_a = 0;
+  int64 sum_sq_b = 0;
+  int64 sum_axb = 0;
+
+  int i;
+  for (i = 0; i < 8; ++i) {
+    int j;
+    for (j = 0; j < 8; ++j) {
+      sum_a += src_a[j];
+      sum_b += src_b[j];
+      sum_sq_a += src_a[j] * src_a[j];
+      sum_sq_b += src_b[j] * src_b[j];
+      sum_axb += src_a[j] * src_b[j];
+    }
+
+    src_a += stride_a;
+    src_b += stride_b;
+  }
+
+  {
+    const int64 count = 64;
+    // scale the constants by number of pixels
+    const int64 c1 = (cc1 * count * count) >> 12;
+    const int64 c2 = (cc2 * count * count) >> 12;
+
+    const int64 sum_a_x_sum_b = sum_a * sum_b;
+
+    const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
+                         (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+
+    const int64 sum_a_sq = sum_a*sum_a;
+    const int64 sum_b_sq = sum_b*sum_b;
+
+    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
+                         (count * sum_sq_a - sum_a_sq +
+                          count * sum_sq_b - sum_b_sq + c2);
+
+    if (ssim_d == 0.0) {
+      return DBL_MAX;
+    }
+    return ssim_n * 1.0 / ssim_d;
+  }
+}
+
+// We use an 8x8 moving window whose starting location lies on the 4x4
+// pixel grid. This arrangement lets the windows overlap block boundaries,
+// which penalizes blocking artifacts. Returns the mean of the window scores.
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height) {
+  int samples = 0;
+  double ssim_total = 0;
+  double (*Ssim8x8)(const uint8* src_a, int stride_a,
+                    const uint8* src_b, int stride_b) = Ssim8x8_C;
+
+  // Sample points start at each 4x4 grid location.
+  int i;
+  for (i = 0; i < height - 8; i += 4) {
+    int j;
+    for (j = 0; j < width - 8; j += 4) {
+      ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
+      samples++;
+    }
+
+    src_a += stride_a * 4;
+    src_b += stride_b * 4;
+  }
+  // NOTE(review): samples is 0 if width <= 8 or height <= 8, so this is 0/0.
+  ssim_total /= samples;
+  return ssim_total;
+}
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height) {
+  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
+                                      src_y_b, stride_y_b, width, height);
+  const int width_uv = (width + 1) >> 1;
+  const int height_uv = (height + 1) >> 1;
+  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
+                                      src_u_b, stride_u_b,
+                                      width_uv, height_uv);
+  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
+                                      src_v_b, stride_v_b,
+                                      width_uv, height_uv);
+  return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/compare_common.cc b/libs/libvpx/third_party/libyuv/source/compare_common.cc
new file mode 100644
index 0000000000..c546b51829
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/compare_common.cc
@@ -0,0 +1,42 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Portable C fallback: sums the squared differences of count byte pairs.
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 total = 0u;
+  for (; count > 0; --count) {
+    const int delta = *src_a++ - *src_b++;
+    total += (uint32)(delta * delta);
+  }
+  return total;
+}
+
+// Portable C djb2 hash over count bytes; hash seed of 5381 recommended.
+// Internal version with an int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+  const uint8* cursor = src;
+  for (; count > 0; --count) {
+    // hash * 33 + byte, written as shift-and-add.
+    seed = (seed << 5) + seed + *cursor++;
+  }
+  return seed;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/compare_gcc.cc b/libs/libvpx/third_party/libyuv/source/compare_gcc.cc
new file mode 100644
index 0000000000..247cb33bba
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/compare_gcc.cc
@@ -0,0 +1,152 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 sse;
+  asm volatile (  // NOLINT
+    "pxor      %%xmm0,%%xmm0                   \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10, 0) ",%0          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10, 1) ",%1          \n"
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psubusb   %%xmm2,%%xmm1                   \n"
+    "psubusb   %%xmm3,%%xmm2                   \n"
+    "por       %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpckhbw %%xmm5,%%xmm2                   \n"
+    "pmaddwd   %%xmm1,%%xmm1                   \n"
+    "pmaddwd   %%xmm2,%%xmm2                   \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "paddd     %%xmm2,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+
+    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "movd      %%xmm0,%3                       \n"
+
+  : "+r"(src_a),      // %0
+    "+r"(src_b),      // %1
+    "+r"(count),      // %2
+    "=g"(sse)         // %3
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );  // NOLINT
+  return sse;
+}
+
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+#define HAS_HASHDJB2_SSE41
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
+static uvec32 kHashMul0 = {
+  0x0c3525e1,  // 33 ^ 15
+  0xa3476dc1,  // 33 ^ 14
+  0x3b4039a1,  // 33 ^ 13
+  0x4f5f0981,  // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+  0x30f35d61,  // 33 ^ 11
+  0x855cb541,  // 33 ^ 10
+  0x040a9121,  // 33 ^ 9
+  0x747c7101,  // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+  0xec41d4e1,  // 33 ^ 7
+  0x4cfa3cc1,  // 33 ^ 6
+  0x025528a1,  // 33 ^ 5
+  0x00121881,  // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+  0x00008c61,  // 33 ^ 3
+  0x00000441,  // 33 ^ 2
+  0x00000021,  // 33 ^ 1
+  0x00000001,  // 33 ^ 0
+};
+
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+  uint32 hash;
+  asm volatile (  // NOLINT
+    "movd      %2,%%xmm0                       \n"
+    "pxor      %%xmm7,%%xmm7                   \n"
+    "movdqa    %4,%%xmm6                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10, 0) ",%0          \n"
+    "pmulld    %%xmm6,%%xmm0                   \n"
+    "movdqa    %5,%%xmm5                       \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklwd %%xmm7,%%xmm3                   \n"
+    "pmulld    %%xmm5,%%xmm3                   \n"
+    "movdqa    %6,%%xmm5                       \n"
+    "movdqa    %%xmm2,%%xmm4                   \n"
+    "punpckhwd %%xmm7,%%xmm4                   \n"
+    "pmulld    %%xmm5,%%xmm4                   \n"
+    "movdqa    %7,%%xmm5                       \n"
+    "punpckhbw %%xmm7,%%xmm1                   \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklwd %%xmm7,%%xmm2                   \n"
+    "pmulld    %%xmm5,%%xmm2                   \n"
+    "movdqa    %8,%%xmm5                       \n"
+    "punpckhwd %%xmm7,%%xmm1                   \n"
+    "pmulld    %%xmm5,%%xmm1                   \n"
+    "paddd     %%xmm4,%%xmm3                   \n"
+    "paddd     %%xmm2,%%xmm1                   \n"
+    "paddd     %%xmm3,%%xmm1                   \n"
+    "pshufd    $0xe,%%xmm1,%%xmm2              \n"
+    "paddd     %%xmm2,%%xmm1                   \n"
+    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
+    "paddd     %%xmm2,%%xmm1                   \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%1                        \n"
+    "jg        1b                              \n"
+    "movd      %%xmm0,%3                       \n"
+  : "+r"(src),        // %0
+    "+r"(count),      // %1
+    "+rm"(seed),      // %2
+    "=g"(hash)        // %3
+  : "m"(kHash16x33),  // %4
+    "m"(kHashMul0),   // %5
+    "m"(kHashMul1),   // %6
+    "m"(kHashMul2),   // %7
+    "m"(kHashMul3)    // %8
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );  // NOLINT
+  return hash;
+}
+#endif  // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/libs/libvpx/third_party/libyuv/source/compare_neon.cc b/libs/libvpx/third_party/libyuv/source/compare_neon.cc
new file mode 100644
index 0000000000..ef006ec41c
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/compare_neon.cc
@@ -0,0 +1,65 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "vmov.u8    q8, #0                         \n"
+    "vmov.u8    q10, #0                        \n"
+    "vmov.u8    q9, #0                         \n"
+    "vmov.u8    q11, #0                        \n"
+
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"
+    "subs       %2, %2, #16                    \n"
+    "vsubl.u8   q2, d0, d2                     \n"
+    "vsubl.u8   q3, d1, d3                     \n"
+    "vmlal.s16  q8, d4, d4                     \n"
+    "vmlal.s16  q9, d6, d6                     \n"
+    "vmlal.s16  q10, d5, d5                    \n"
+    "vmlal.s16  q11, d7, d7                    \n"
+    "bgt        1b                             \n"
+
+    "vadd.u32   q8, q8, q9                     \n"
+    "vadd.u32   q10, q10, q11                  \n"
+    "vadd.u32   q11, q8, q10                   \n"
+    "vpaddl.u32 q1, q11                        \n"
+    "vadd.u64   d0, d2, d3                     \n"
+    "vmov.32    %3, d0[0]                      \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+  return sse;
+}
+
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/compare_neon64.cc b/libs/libvpx/third_party/libyuv/source/compare_neon64.cc
new file mode 100644
index 0000000000..6d1e5e1bc9
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/compare_neon64.cc
@@ -0,0 +1,63 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "eor        v16.16b, v16.16b, v16.16b      \n"
+    "eor        v18.16b, v18.16b, v18.16b      \n"
+    "eor        v17.16b, v17.16b, v17.16b      \n"
+    "eor        v19.16b, v19.16b, v19.16b      \n"
+    // Loop: 16 bytes per pass; squared differences accumulate in v16..v19.
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"
+    "subs       %w2, %w2, #16                  \n"
+    "usubl      v2.8h, v0.8b, v1.8b            \n"
+    "usubl2     v3.8h, v0.16b, v1.16b          \n"
+    "smlal      v16.4s, v2.4h, v2.4h           \n"
+    "smlal      v17.4s, v3.4h, v3.4h           \n"
+    "smlal2     v18.4s, v2.8h, v2.8h           \n"
+    "smlal2     v19.4s, v3.8h, v3.8h           \n"
+    "b.gt       1b                             \n"
+    // Fold the four accumulators into one 32 bit sum written to sse.
+    "add        v16.4s, v16.4s, v17.4s         \n"
+    "add        v18.4s, v18.4s, v19.4s         \n"
+    "add        v19.4s, v16.4s, v18.4s         \n"
+    "addv       s0, v19.4s                     \n"
+    "fmov       %w3, s0                        \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+  return sse;
+}
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/compare_win.cc b/libs/libvpx/third_party/libyuv/source/compare_win.cc
new file mode 100644
index 0000000000..19806f2750
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/compare_win.cc
@@ -0,0 +1,229 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    defined(_MSC_VER) && !defined(__clang__)
+
+__declspec(naked)
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+  __asm {
+    mov        eax, [esp + 4]    // src_a
+    mov        edx, [esp + 8]    // src_b
+    mov        ecx, [esp + 12]   // count
+    pxor       xmm0, xmm0
+    pxor       xmm5, xmm5
+
+  wloop:
+    movdqu     xmm1, [eax]
+    lea        eax,  [eax + 16]
+    movdqu     xmm2, [edx]
+    lea        edx,  [edx + 16]
+    movdqa     xmm3, xmm1  // abs trick
+    psubusb    xmm1, xmm2
+    psubusb    xmm2, xmm3
+    por        xmm1, xmm2
+    movdqa     xmm2, xmm1
+    punpcklbw  xmm1, xmm5
+    punpckhbw  xmm2, xmm5
+    pmaddwd    xmm1, xmm1
+    pmaddwd    xmm2, xmm2
+    paddd      xmm0, xmm1
+    paddd      xmm0, xmm2
+    sub        ecx, 16
+    jg         wloop
+
+    pshufd     xmm1, xmm0, 0xee
+    paddd      xmm0, xmm1
+    pshufd     xmm1, xmm0, 0x01
+    paddd      xmm0, xmm1
+    movd       eax, xmm0
+    ret
+  }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
+#pragma warning(disable: 4752)
+__declspec(naked)
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+  __asm {
+    mov        eax, [esp + 4]    // src_a
+    mov        edx, [esp + 8]    // src_b
+    mov        ecx, [esp + 12]   // count
+    vpxor      ymm0, ymm0, ymm0  // sum
+    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
+    sub        edx, eax
+
+  wloop:
+    vmovdqu    ymm1, [eax]
+    vmovdqu    ymm2, [eax + edx]
+    lea        eax,  [eax + 32]
+    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
+    vpsubusb   ymm2, ymm2, ymm1
+    vpor       ymm1, ymm2, ymm3
+    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
+    vpunpckhbw ymm1, ymm1, ymm5
+    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
+    vpmaddwd   ymm1, ymm1, ymm1
+    vpaddd     ymm0, ymm0, ymm1
+    vpaddd     ymm0, ymm0, ymm2
+    sub        ecx, 32
+    jg         wloop
+
+    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
+    vpaddd     ymm0, ymm0, ymm1
+    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
+    vpaddd     ymm0, ymm0, ymm1
+    vpermq     ymm1, ymm0, 0x02  // high + low lane.
+    vpaddd     ymm0, ymm0, ymm1
+    vmovd      eax, xmm0
+    vzeroupper
+    ret
+  }
+}
+#endif  // _MSC_VER >= 1700
+
+#define HAS_HASHDJB2_SSE41
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
+static uvec32 kHashMul0 = {
+  0x0c3525e1,  // 33 ^ 15
+  0xa3476dc1,  // 33 ^ 14
+  0x3b4039a1,  // 33 ^ 13
+  0x4f5f0981,  // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+  0x30f35d61,  // 33 ^ 11
+  0x855cb541,  // 33 ^ 10
+  0x040a9121,  // 33 ^ 9
+  0x747c7101,  // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+  0xec41d4e1,  // 33 ^ 7
+  0x4cfa3cc1,  // 33 ^ 6
+  0x025528a1,  // 33 ^ 5
+  0x00121881,  // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+  0x00008c61,  // 33 ^ 3
+  0x00000441,  // 33 ^ 2
+  0x00000021,  // 33 ^ 1
+  0x00000001,  // 33 ^ 0
+};
+
+// 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
+// 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
+// 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
+// 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
+// 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
+#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
+    _asm _emit 0x40 _asm _emit reg
+
+__declspec(naked)
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+  __asm {
+    mov        eax, [esp + 4]    // src
+    mov        ecx, [esp + 8]    // count
+    movd       xmm0, [esp + 12]  // seed
+
+    pxor       xmm7, xmm7        // constant 0 for unpck
+    movdqa     xmm6, kHash16x33
+
+  wloop:
+    movdqu     xmm1, [eax]       // src[0-15]
+    lea        eax, [eax + 16]
+    pmulld(0xc6)                 // pmulld      xmm0,xmm6  hash *= 33 ^ 16
+    movdqa     xmm5, kHashMul0
+    movdqa     xmm2, xmm1
+    punpcklbw  xmm2, xmm7        // src[0-7]
+    movdqa     xmm3, xmm2
+    punpcklwd  xmm3, xmm7        // src[0-3]
+    pmulld(0xdd)                 // pmulld     xmm3, xmm5
+    movdqa     xmm5, kHashMul1
+    movdqa     xmm4, xmm2
+    punpckhwd  xmm4, xmm7        // src[4-7]
+    pmulld(0xe5)                 // pmulld     xmm4, xmm5
+    movdqa     xmm5, kHashMul2
+    punpckhbw  xmm1, xmm7        // src[8-15]
+    movdqa     xmm2, xmm1
+    punpcklwd  xmm2, xmm7        // src[8-11]
+    pmulld(0xd5)                 // pmulld     xmm2, xmm5
+    movdqa     xmm5, kHashMul3
+    punpckhwd  xmm1, xmm7        // src[12-15]
+    pmulld(0xcd)                 // pmulld     xmm1, xmm5
+    paddd      xmm3, xmm4        // add 16 results
+    paddd      xmm1, xmm2
+    paddd      xmm1, xmm3
+
+    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
+    paddd      xmm1, xmm2
+    pshufd     xmm2, xmm1, 0x01
+    paddd      xmm1, xmm2
+    paddd      xmm0, xmm1
+    sub        ecx, 16
+    jg         wloop
+
+    movd       eax, xmm0         // return hash
+    ret
+  }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+__declspec(naked)
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+  __asm {
+    mov        eax, [esp + 4]    // src
+    mov        ecx, [esp + 8]    // count
+    movd       xmm0, [esp + 12]  // seed
+    movdqa     xmm6, kHash16x33
+
+  wloop:
+    vpmovzxbd  xmm3, dword ptr [eax]  // src[0-3]
+    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
+    vpmovzxbd  xmm4, dword ptr [eax + 4]  // src[4-7]
+    pmulld     xmm3, kHashMul0
+    vpmovzxbd  xmm2, dword ptr [eax + 8]  // src[8-11]
+    pmulld     xmm4, kHashMul1
+    vpmovzxbd  xmm1, dword ptr [eax + 12]  // src[12-15]
+    pmulld     xmm2, kHashMul2
+    lea        eax, [eax + 16]
+    pmulld     xmm1, kHashMul3
+    paddd      xmm3, xmm4        // add 16 results
+    paddd      xmm1, xmm2
+    paddd      xmm1, xmm3
+    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
+    paddd      xmm1, xmm2
+    pshufd     xmm2, xmm1, 0x01
+    paddd      xmm1, xmm2
+    paddd      xmm0, xmm1
+    sub        ecx, 16
+    jg         wloop
+
+    movd       eax, xmm0         // return hash
+    ret
+  }
+}
+#endif  // _MSC_VER >= 1700
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/convert.cc b/libs/libvpx/third_party/libyuv/source/convert.cc
new file mode 100644
index 0000000000..3ad6bd7a4b
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/convert.cc
@@ -0,0 +1,1389 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+// (v + a) >> s applied to |v| with v's sign restored; fully parenthesized.
+#define SUBSAMPLE(v, a, s) (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))
+static __inline int Abs(int v) {
+  return (v < 0) ? -v : v;  // magnitude of v
+}
+
+// Any I4xx To I420 format with mirroring.
+static int I4xxToI420(const uint8* src_y, int src_stride_y,
+                      const uint8* src_u, int src_stride_u,
+                      const uint8* src_v, int src_stride_v,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int src_y_width, int src_y_height,
+                      int src_uv_width, int src_uv_height) {
+  // Output sizes are magnitudes; output chroma is half of luma, rounded up.
+  const int out_width = Abs(src_y_width);
+  const int out_height = Abs(src_y_height);
+  const int out_uv_width = SUBSAMPLE(out_width, 1, 1);
+  const int out_uv_height = SUBSAMPLE(out_height, 1, 1);
+  if (!src_y_width || !src_y_height || !src_uv_width || !src_uv_height) {
+    return -1;
+  }
+  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+             dst_y, dst_stride_y, out_width, out_height,
+             kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+             dst_u, dst_stride_u, out_uv_width, out_uv_height,
+             kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+             dst_v, dst_stride_v, out_uv_width, out_uv_height,
+             kFilterBilinear);
+  return 0;
+}
+
+// Copies an I420 image plane by plane; a negative height flips it vertically.
+// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
+// it does row coalescing.
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  int uv_width = (width + 1) >> 1;
+  int uv_height = (height + 1) >> 1;
+  if (width <= 0 || height == 0 ||
+      !src_y || !src_u || !src_v ||
+      !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start at the last source row of each plane and walk upwards.
+    height = -height;
+    uv_height = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_u += (uv_height - 1) * src_stride_u;
+    src_v += (uv_height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  // A null dst_y was already rejected above, so this test always passes.
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, uv_width, uv_height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, uv_width, uv_height);
+  return 0;
+}
+
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Source chroma planes are half the luma width (rounded away from zero)
+  // and the full luma height; the shared I4xx helper rescales every plane.
+  const int chroma_width = SUBSAMPLE(width, 1, 1);
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,          // luma dimensions
+                    chroma_width, height);  // source chroma dimensions
+}
+
+// 444 chroma is 1x width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Source chroma planes have the same dimensions as the luma plane, so
+  // width and height are passed for both luma and chroma.
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,   // luma dimensions
+                    width, height);  // source chroma dimensions
+}
+
+// 411 chroma is 1/4 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Source chroma planes are a quarter of the luma width (rounded away from
+  // zero) and the full luma height; the shared I4xx helper rescales them.
+  const int chroma_width = SUBSAMPLE(width, 3, 2);
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,          // luma dimensions
+                    chroma_width, height);  // source chroma dimensions
+}
+
+// I400 is greyscale typically used in MJPG
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int uv_width = (width + 1) >> 1;
+  int uv_height = (height + 1) >> 1;
+  if (width <= 0 || height == 0 ||
+      !src_y || !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height: read the source bottom-up to invert the image.
+    height = -height;
+    uv_height = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  SetPlane(dst_u, dst_stride_u, uv_width, uv_height, 128);  // constant U
+  SetPlane(dst_v, dst_stride_v, uv_width, uv_height, 128);  // constant V
+  return 0;
+}
+
+static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
+                       uint8* dst, int dst_stride,
+                       int width, int height) {
+  int y;  // row index
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+
+  // Copy rows in pairs; the last CPU-flag match above decides CopyRow.
+  for (y = 0; y < height - 1; y += 2) {
+    CopyRow(src, dst, width);
+    CopyRow(src + src_stride_0, dst + dst_stride, width);  // second row of pair
+    src += src_stride_0 + src_stride_1;  // step over both source rows
+    dst += dst_stride * 2;
+  }
+  if (height & 1) {  // odd height: one unpaired row remains
+    CopyRow(src, dst, width);
+  }
+}
+
+// Support converting from NV12, NV21 and FOURCC_M420 (shared code path).
+// M420 is useful for bandwidth constrained transports like USB 1.0 and 2.0
+// and for easy conversion to I420.
+// M420 format description:
+// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
+// Chroma is half width / half height. (420)
+// src_stride_y0 steps from an even Y row to the odd row after it, and
+//   src_stride_y1 from that odd row to the next even row; for M420 these
+//   are (stride, stride * 2) and the UV rows are stride * 3 apart.
+static int X420ToI420(const uint8* src_y,
+                      int src_stride_y0, int src_stride_y1,
+                      const uint8* src_uv, int src_stride_uv,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+      SplitUVRow_C;
+  if (!src_y || !src_uv ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;  // null plane or empty image
+  }
+  // Negative height means invert the image (done by writing dst bottom-up).
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+  // Coalesce rows: a tightly packed Y plane is handled as one long row.
+  if (src_stride_y0 == width &&
+      src_stride_y1 == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
+  }
+  // Coalesce rows: likewise treat tightly packed UV data as a single row.
+  if (src_stride_uv == halfwidth * 2 &&
+      dst_stride_u == halfwidth &&
+      dst_stride_v == halfwidth) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_uv = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
+      IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
+      IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
+    SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      SplitUVRow = SplitUVRow_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  if (dst_y) {  // always true here: a null dst_y was rejected above
+    if (src_stride_y0 == src_stride_y1) {
+      CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
+    } else {
+      CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
+                 width, height);  // alternating row steps (the M420 layout)
+    }
+  }
+
+  for (y = 0; y < halfheight; ++y) {
+    // Split one interleaved chroma row into the two destination planes.
+    SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+    src_uv += src_stride_uv;
+  }
+  return 0;
+}
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Both Y stride arguments of the shared X420 converter receive
+  // src_stride_y.
+  return X420ToI420(src_y, src_stride_y, src_stride_y,
+                    src_uv, src_stride_uv,
+                    dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height);
+}
+
+// Convert NV21 to I420.  Same as NV12 but u and v pointers swapped.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // The source is interleaved V,U, so dst_v is handed to the shared
+  // converter in the "u" position and dst_u in the "v" position.
+  return X420ToI420(src_y, src_stride_y, src_stride_y,
+                    src_vu, src_stride_vu,
+                    dst_y, dst_stride_y, dst_v, dst_stride_v,
+                    dst_u, dst_stride_u, width, height);
+}
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Y row steps alternate stride, stride * 2; the first UV row sits two
+  // strides into the buffer and UV rows are three strides apart.
+  return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
+                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
+                    dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height);
+}
+
+// Convert YUY2 to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
+      uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C;
+  void (*YUY2ToYRow)(const uint8* src_yuy2,
+      uint8* dst_y, int pix) = YUY2ToYRow_C;
+  if (!src_yuy2 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) { return -1; }  // same checks as ARGBToI420
+  if (height < 0) {  // Negative height means invert the image.
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToUVRow = YUY2ToUVRow_SSE2;
+      YUY2ToYRow = YUY2ToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUVRow = YUY2ToUVRow_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+      YUY2ToUVRow = YUY2ToUVRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height - 1; y += 2) {  // two source rows per pass
+    YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+    src_yuy2 += src_stride_yuy2 * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {  // odd height: the last row has no partner row
+    YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);  // stride 0, no next row
+    YUY2ToYRow(src_yuy2, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert UYVY to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
+      uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C;
+  void (*UYVYToYRow)(const uint8* src_uyvy,
+      uint8* dst_y, int pix) = UYVYToYRow_C;
+  if (!src_uyvy || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) { return -1; }  // same checks as ARGBToI420
+  if (height < 0) {  // Negative height means invert the image.
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+#if defined(HAS_UYVYTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+    UYVYToYRow = UYVYToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToUVRow = UYVYToUVRow_SSE2;
+      UYVYToYRow = UYVYToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+    UYVYToYRow = UYVYToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToUVRow = UYVYToUVRow_AVX2;
+      UYVYToYRow = UYVYToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    UYVYToUVRow = UYVYToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_NEON;
+      UYVYToUVRow = UYVYToUVRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height - 1; y += 2) {  // two source rows per pass
+    UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+    UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
+    src_uyvy += src_stride_uyvy * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {  // odd height: the last row has no partner row
+    UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);  // stride 0, no next row
+    UYVYToYRow(src_uyvy, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert ARGB to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  if (!src_argb ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;  // null plane or empty image
+  }
+  // Negative height means invert the image (the source is read bottom-up).
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {  // multiple of 16: use the non-Any variants
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // checked later, so it overrides SSSE3
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {  // two source rows per pass
+    ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+    src_argb += src_stride_argb * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;  // one chroma row written per pair of rows
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {  // odd height: the last row has no partner row
+    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);  // stride 0, no next row
+    ARGBToYRow(src_argb, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert BGRA to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
+      uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
+  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =
+      BGRAToYRow_C;
+  if (!src_bgra ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;  // null plane or empty image
+  }
+  // Negative height means invert the image (the source is read bottom-up).
+  if (height < 0) {
+    height = -height;
+    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
+  }
+#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+    BGRAToYRow = BGRAToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {  // multiple of 16: use the non-Any variants
+      BGRAToUVRow = BGRAToUVRow_SSSE3;
+      BGRAToYRow = BGRAToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    BGRAToYRow = BGRAToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      BGRAToYRow = BGRAToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_NEON)
+    if (TestCpuFlag(kCpuHasNEON)) {
+      BGRAToUVRow = BGRAToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        BGRAToUVRow = BGRAToUVRow_NEON;
+      }
+    }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {  // two source rows per pass
+    BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
+    BGRAToYRow(src_bgra, dst_y, width);
+    BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
+    src_bgra += src_stride_bgra * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;  // one chroma row written per pair of rows
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {  // odd height: the last row has no partner row
+    BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);  // stride 0, no next row
+    BGRAToYRow(src_bgra, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert ABGR to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
+      uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =
+      ABGRToYRow_C;
+  if (!src_abgr ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;  // null plane or empty image
+  }
+  // Negative height means invert the image (the source is read bottom-up).
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+    ABGRToYRow = ABGRToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {  // multiple of 16: use the non-Any variants
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+      ABGRToYRow = ABGRToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToYRow = ABGRToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToYRow = ABGRToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {  // two source rows per pass
+    ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
+    ABGRToYRow(src_abgr, dst_y, width);
+    ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+    src_abgr += src_stride_abgr * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;  // one chroma row written per pair of rows
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {  // odd height: the last row has no partner row
+    ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);  // stride 0, no next row
+    ABGRToYRow(src_abgr, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert RGBA to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
+      uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
+  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =
+      RGBAToYRow_C;
+  if (!src_rgba ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;  // null plane or empty image
+  }
+  // Negative height means invert the image (the source is read bottom-up).
+  if (height < 0) {
+    height = -height;
+    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+    src_stride_rgba = -src_stride_rgba;
+  }
+#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+    RGBAToYRow = RGBAToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {  // multiple of 16: use the non-Any variants
+      RGBAToUVRow = RGBAToUVRow_SSSE3;
+      RGBAToYRow = RGBAToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGBAToYRow = RGBAToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGBAToYRow = RGBAToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGBAToUVRow = RGBAToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {  // two source rows per pass
+    RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+    RGBAToYRow(src_rgba, dst_y, width);
+    RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+    src_rgba += src_stride_rgba * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;  // one chroma row written per pair of rows
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {  // odd height: the last row has no partner row
+    RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);  // stride 0, no next row
+    RGBAToYRow(src_rgba, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert RGB24 to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
+                uint8* dst_y, int dst_stride_y,
+                uint8* dst_u, int dst_stride_u,
+                uint8* dst_v, int dst_stride_v,
+                int width, int height) {
+  int y;
+#if defined(HAS_RGB24TOYROW_NEON)
+  void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
+      uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
+  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =
+      RGB24ToYRow_C;
+#else
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB24ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#endif
+  if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;  // null plane or empty image
+  }
+  // Negative height means invert the image (the source is read bottom-up).
+  if (height < 0) {
+    height = -height;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+    RGB24ToYRow = RGB24ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToYRow = RGB24ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGB24ToUVRow = RGB24ToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+  {
+    // Allocate 2 rows of ARGB; each row size is rounded up to 32 bytes.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {  // two source rows per pass
+#if defined(HAS_RGB24TOYROW_NEON)
+      RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+      RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+      RGB24ToARGBRow(src_rgb24, row, width);  // first row -> scratch row 0
+      RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_rgb24 += src_stride_rgb24 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {  // odd height: the last row has no partner row
+#if defined(HAS_RGB24TOYROW_NEON)
+      RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+      RGB24ToARGBRow(src_rgb24, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);  // stride 0, no second row
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_RGB24TOYROW_NEON)
+    free_aligned_buffer_64(row);
+  }  // closes the scratch-buffer scope opened in the non-NEON branch above
+#endif
+  return 0;
+}
+
+// Convert RAW to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int RAWToI420(const uint8* src_raw, int src_stride_raw,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height) {
+  int y;
+#if defined(HAS_RAWTOYROW_NEON)
+  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
+      uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
+  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =
+      RAWToYRow_C;
+#else
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RAWToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#endif
+  if (!src_raw || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;  // null plane or empty image
+  }
+  // Negative height means invert the image (the source is read bottom-up).
+  if (height < 0) {
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+
+// Neon version does direct RAW to YUV.
+#if defined(HAS_RAWTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToUVRow = RAWToUVRow_Any_NEON;
+    RAWToYRow = RAWToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToYRow = RAWToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RAWToUVRow = RAWToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+  {
+    // Allocate 2 rows of ARGB; each row size is rounded up to 32 bytes.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {  // two source rows per pass
+#if defined(HAS_RAWTOYROW_NEON)
+      RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+      RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+      RAWToARGBRow(src_raw, row, width);  // first row -> scratch row 0
+      RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_raw += src_stride_raw * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {  // odd height: the last row has no partner row
+#if defined(HAS_RAWTOYROW_NEON)
+      RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+#else
+      RAWToARGBRow(src_raw, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);  // stride 0, no second row
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_RAWTOYROW_NEON)
+    free_aligned_buffer_64(row);
+  }  // closes the scratch-buffer scope opened in the non-NEON branch above
+#endif
+  return 0;
+}
+
+// Convert RGB565 to I420. Returns 0, or -1 for a NULL plane or bad size.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
+                 uint8* dst_y, int dst_stride_y,
+                 uint8* dst_u, int dst_stride_u,
+                 uint8* dst_v, int dst_stride_v,
+                 int width, int height) {
+  int y;
+#if defined(HAS_RGB565TOYROW_NEON)
+  void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
+      uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
+  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =
+      RGB565ToYRow_C;
+#else
+  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB565ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#endif
+  if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: read source rows bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+    src_stride_rgb565 = -src_stride_rgb565;
+  }
+
+// Neon version does direct RGB565 to YUV.
+#if defined(HAS_RGB565TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+    RGB565ToYRow = RGB565ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToYRow = RGB565ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGB565ToUVRow = RGB565ToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from RGB565 to ARGB.
+#else
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+  {
+    // Allocate 2 rows of ARGB, each rounded up to a multiple of 32 bytes.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {  // Two source rows per U/V row.
+#if defined(HAS_RGB565TOYROW_NEON)
+      RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
+      RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+      RGB565ToARGBRow(src_rgb565, row, width);
+      RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_rgb565 += src_stride_rgb565 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {  // Odd height: last row is converted on its own.
+#if defined(HAS_RGB565TOYROW_NEON)
+      RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+      RGB565ToARGBRow(src_rgb565, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_RGB565TOYROW_NEON)
+    free_aligned_buffer_64(row);
+  }
+#endif
+  return 0;
+}
+
+// Convert ARGB1555 to I420. Returns 0, or -1 for a NULL plane or bad size.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height) {
+  int y;
+#if defined(HAS_ARGB1555TOYROW_NEON)
+  void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
+      uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
+  void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) =
+      ARGB1555ToYRow_C;
+#else
+  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      ARGB1555ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#endif
+  if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: read source rows bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+    src_stride_argb1555 = -src_stride_argb1555;
+  }
+
+// Neon version does direct ARGB1555 to YUV.
+#if defined(HAS_ARGB1555TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+    ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToYRow = ARGB1555ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from ARGB1555 to ARGB.
+#else
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+  {
+    // Allocate 2 rows of ARGB, each rounded up to a multiple of 32 bytes.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {  // Two source rows per U/V row.
+#if defined(HAS_ARGB1555TOYROW_NEON)
+      ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
+      ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+                     width);
+#else
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_argb1555 += src_stride_argb1555 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {  // Odd height: last row is converted on its own.
+#if defined(HAS_ARGB1555TOYROW_NEON)
+      ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_ARGB1555TOYROW_NEON)
+    free_aligned_buffer_64(row);
+  }
+#endif
+  return 0;
+}
+
+// Convert ARGB4444 to I420. Returns 0, or -1 for a NULL plane or bad size.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height) {
+  int y;
+#if defined(HAS_ARGB4444TOYROW_NEON)
+  void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
+      uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
+  void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) =
+      ARGB4444ToYRow_C;
+#else
+  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      ARGB4444ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#endif
+  if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: read source rows bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+    src_stride_argb4444 = -src_stride_argb4444;
+  }
+
+// Neon version does direct ARGB4444 to YUV.
+#if defined(HAS_ARGB4444TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+    ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToYRow = ARGB4444ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from ARGB4444 to ARGB.
+#else
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+  {
+    // Allocate 2 rows of ARGB, each rounded up to a multiple of 32 bytes.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {  // Two source rows per U/V row.
+#if defined(HAS_ARGB4444TOYROW_NEON)
+      ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
+      ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+                     width);
+#else
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_argb4444 += src_stride_argb4444 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {  // Odd height: last row is converted on its own.
+#if defined(HAS_ARGB4444TOYROW_NEON)
+      ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+    free_aligned_buffer_64(row);
+  }
+#endif
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/convert_argb.cc b/libs/libvpx/third_party/libyuv/source/convert_argb.cc
new file mode 100644
index 0000000000..44756bc41c
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/convert_argb.cc
@@ -0,0 +1,1155 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB, flipping vertically if height < 0. Returns 0, or -1 on bad args.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int width, int height) {
+  if (!src_argb || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: read source rows bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+
+  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+            width * 4, height);
+  return 0;
+}
+
+// Convert I444 to ARGB. Returns 0, or -1 for a NULL plane or bad size.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*I444ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I444ToARGBRow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write output rows bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows: when all strides are tightly packed, convert as one row.
+  if (src_stride_y == width &&
+      src_stride_u == width &&
+      src_stride_v == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I444ToARGBRow = I444ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I444ToARGBRow = I444ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I422 to ARGB. Returns 0, or -1 for a NULL plane or bad size.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write output rows bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows: when all strides are tightly packed, convert as one row.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I411 to ARGB. Returns 0, or -1 for a NULL plane or bad size.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*I411ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I411ToARGBRow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write output rows bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows: when all strides are tightly packed, convert as one row.
+  if (src_stride_y == width &&
+      src_stride_u * 4 == width &&
+      src_stride_v * 4 == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_I411TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I411ToARGBRow = I411ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I411TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I411ToARGBRow = I411ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I411ToARGBRow = I411ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I411TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I411ToARGBRow = I411ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I411ToARGBRow = I411ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I400 to ARGB. Returns 0, or -1 for a NULL plane or bad size.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*I400ToARGBRow)(const uint8* y_buf,
+                     uint8* rgb_buf,
+                     int width) = I400ToARGBRow_C;
+  if (!src_y || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write output rows bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows: when both strides are tightly packed, convert as one row.
+  if (src_stride_y == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
+  }
+#if defined(HAS_I400TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I400ToARGBRow = I400ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      I400ToARGBRow = I400ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I400ToARGBRow = I400ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I400ToARGBRow = I400ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I400ToARGBRow = I400ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I400ToARGBRow = I400ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I400ToARGBRow(src_y, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+  }
+  return 0;
+}
+
+// Convert J400 to ARGB. Returns 0, or -1 for a NULL plane or bad size.
+LIBYUV_API
+int J400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
+      J400ToARGBRow_C;
+  if (!src_y || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: read source rows bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows: when both strides are tightly packed, convert as one row.
+  if (src_stride_y == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
+  }
+#if defined(HAS_J400TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    J400ToARGBRow = J400ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      J400ToARGBRow = J400ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    J400ToARGBRow = J400ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      J400ToARGBRow = J400ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    J400ToARGBRow = J400ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      J400ToARGBRow = J400ToARGBRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    J400ToARGBRow(src_y, dst_argb, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Shuffle table for converting BGRA to ARGB: indices reverse each group of 4.
+static uvec8 kShuffleMaskBGRAToARGB = {
+  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Shuffle table for converting ABGR to ARGB: indices 0 and 2 of each 4 swap.
+static uvec8 kShuffleMaskABGRToARGB = {
+  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting RGBA to ARGB: indices rotate by one per 4.
+static uvec8 kShuffleMaskRGBAToARGB = {
+  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
+
+// Convert BGRA to ARGB; forwards to ARGBShuffle and returns its result.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB),
+                     width, height);
+}
+
+// Convert ARGB to BGRA (same mask and parameter names as BGRAToARGB).
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB),
+                     width, height);
+}
+
+// Convert ABGR to ARGB; forwards to ARGBShuffle and returns its result.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
+
+// Convert ARGB to ABGR (same mask and parameter names as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
+
+// Convert RGBA to ARGB; forwards to ARGBShuffle and returns its result.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_rgba, src_stride_rgba,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskRGBAToARGB),
+                     width, height);
+}
+
+// Convert RGB24 to ARGB. Returns 0, or -1 for a NULL plane or bad size.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height) {
+  int y;
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB24ToARGBRow_C;
+  if (!src_rgb24 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: read source rows bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+  // Coalesce rows: when both strides are tightly packed, convert as one row.
+  if (src_stride_rgb24 == width * 3 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_rgb24 = dst_stride_argb = 0;
+  }
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    RGB24ToARGBRow(src_rgb24, dst_argb, width);
+    src_rgb24 += src_stride_rgb24;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert RAW to ARGB. Returns 0, or -1 for a NULL plane or bad size.
+LIBYUV_API
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  int y;
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RAWToARGBRow_C;
+  if (!src_raw || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: read source rows bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  // Coalesce rows: when both strides are tightly packed, convert as one row.
+  if (src_stride_raw == width * 3 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_raw = dst_stride_argb = 0;
+  }
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToARGBRow = RAWToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToARGBRow = RAWToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    RAWToARGBRow(src_raw, dst_argb, width);
+    src_raw += src_stride_raw;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert RGB565 to ARGB. Returns 0, or -1 for a NULL plane or bad size.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height) {
+  int y;
+  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
+      RGB565ToARGBRow_C;
+  if (!src_rgb565 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: read source rows bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+    src_stride_rgb565 = -src_stride_rgb565;
+  }
+  // Coalesce rows: when both strides are tightly packed, convert as one row.
+  if (src_stride_rgb565 == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_rgb565 = dst_stride_argb = 0;
+  }
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    RGB565ToARGBRow(src_rgb565, dst_argb, width);
+    src_rgb565 += src_stride_rgb565;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert ARGB1555 to ARGB. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height) {
+  int y;
+  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+      int pix) = ARGB1555ToARGBRow_C;  // Default; may be replaced by a CPU-specific variant below.
+  if (!src_argb1555 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;  // Start at the last source row...
+    src_stride_argb1555 = -src_stride_argb1555;  // ...and step backwards through the source.
+  }
+  // Coalesce rows: when both strides equal the packed row size, treat the image as one long row.
+  if (src_stride_argb1555 == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb1555 = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;  // Kept unless width is a multiple of 8.
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Tested after SSE2, so this choice overrides it.
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // Convert one row per iteration.
+    ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
+    src_argb1555 += src_stride_argb1555;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert ARGB4444 to ARGB. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height) {
+  int y;
+  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+      int pix) = ARGB4444ToARGBRow_C;  // Default; may be replaced by a CPU-specific variant below.
+  if (!src_argb4444 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;  // Start at the last source row...
+    src_stride_argb4444 = -src_stride_argb4444;  // ...and step backwards through the source.
+  }
+  // Coalesce rows: when both strides equal the packed row size, treat the image as one long row.
+  if (src_stride_argb4444 == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb4444 = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;  // Kept unless width is a multiple of 8.
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Tested after SSE2, so this choice overrides it.
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // Convert one row per iteration.
+    ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
+    src_argb4444 += src_stride_argb4444;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert NV12 to ARGB. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*NV12ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) = NV12ToARGBRow_C;  // Default; may be replaced below.
+  if (!src_y || !src_uv || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;  // Start at the last destination row...
+    dst_stride_argb = -dst_stride_argb;  // ...and write the output bottom-up.
+  }
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;  // Kept unless width is a multiple of 8.
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Tested after SSSE3, so this choice overrides it.
+    NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToARGBRow = NV12ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV12ToARGBRow(src_y, src_uv, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {  // Each UV row serves two Y rows: advance it only after odd rows.
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// Convert NV21 to ARGB. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*NV21ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) = NV21ToARGBRow_C;  // Default; may be replaced below.
+  if (!src_y || !src_uv || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;  // Start at the last destination row...
+    dst_stride_argb = -dst_stride_argb;  // ...and write the output bottom-up.
+  }
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;  // Kept unless width is a multiple of 8.
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Tested after SSSE3, so this choice overrides it.
+    NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToARGBRow = NV21ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV21ToARGBRow(src_y, src_uv, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {  // Each chroma row serves two Y rows: advance it only after odd rows.
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// Convert M420 to ARGB using the NV12 row functions. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*NV12ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width) = NV12ToARGBRow_C;  // Default; may be replaced below.
+  if (!src_m420 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;  // Start at the last destination row...
+    dst_stride_argb = -dst_stride_argb;  // ...and write the output bottom-up.
+  }
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;  // Kept unless width is a multiple of 8.
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Tested after SSSE3, so this choice overrides it.
+    NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToARGBRow = NV12ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {  // Two output rows per iteration.
+    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);  // First Y row, UV row read two strides further on.
+    NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,  // Second Y row reuses the same UV row.
+                  dst_argb + dst_stride_argb, width);
+    dst_argb += dst_stride_argb * 2;
+    src_m420 += src_stride_m420 * 3;  // Step over the two Y rows and the UV row.
+  }
+  if (height & 1) {  // Odd height: convert the one remaining row.
+    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+  }
+  return 0;
+}
+
+// Convert YUY2 to ARGB. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
+      YUY2ToARGBRow_C;  // Default; may be replaced by a CPU-specific variant below.
+  if (!src_yuy2 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;  // Start at the last source row...
+    src_stride_yuy2 = -src_stride_yuy2;  // ...and step backwards through the source.
+  }
+  // Coalesce rows: when both strides equal the packed row size, treat the image as one long row.
+  if (src_stride_yuy2 == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_argb = 0;
+  }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;  // Kept unless width is a multiple of 16.
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Tested after SSSE3, so this choice overrides it.
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      YUY2ToARGBRow = YUY2ToARGBRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {  // Convert one row per iteration.
+    YUY2ToARGBRow(src_yuy2, dst_argb, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert UYVY to ARGB. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
+      UYVYToARGBRow_C;  // Default; may be replaced by a CPU-specific variant below.
+  if (!src_uyvy || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;  // Start at the last source row...
+    src_stride_uyvy = -src_stride_uyvy;  // ...and step backwards through the source.
+  }
+  // Coalesce rows: when both strides equal the packed row size, treat the image as one long row.
+  if (src_stride_uyvy == width * 2 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_uyvy = dst_stride_argb = 0;
+  }
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;  // Kept unless width is a multiple of 16.
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToARGBRow = UYVYToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Tested after SSSE3, so this choice overrides it.
+    UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToARGBRow = UYVYToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      UYVYToARGBRow = UYVYToARGBRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {  // Convert one row per iteration.
+    UYVYToARGBRow(src_uyvy, dst_argb, width);
+    src_uyvy += src_stride_uyvy;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert J420 to ARGB using the J422 row functions. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*J422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = J422ToARGBRow_C;  // Default; may be replaced below.
+  if (!src_y || !src_u || !src_v || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;  // Start at the last destination row...
+    dst_stride_argb = -dst_stride_argb;  // ...and write the output bottom-up.
+  }
+#if defined(HAS_J422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    J422ToARGBRow = J422ToARGBRow_Any_SSSE3;  // Kept unless width is a multiple of 8.
+    if (IS_ALIGNED(width, 8)) {
+      J422ToARGBRow = J422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_J422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Tested after SSSE3, so this choice overrides it.
+    J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      J422ToARGBRow = J422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_J422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    J422ToARGBRow = J422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      J422ToARGBRow = J422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&  // Selected only if width, pointers and strides are all aligned.
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {  // Each U/V row serves two Y rows: advance them only after odd rows.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert J422 to ARGB. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*J422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = J422ToARGBRow_C;  // Default; may be replaced below.
+  if (!src_y || !src_u || !src_v ||
+      !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;  // Start at the last destination row...
+    dst_stride_argb = -dst_stride_argb;  // ...and write the output bottom-up.
+  }
+  // Coalesce rows: when every stride equals its packed row size, treat the image as one long row.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_J422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    J422ToARGBRow = J422ToARGBRow_Any_SSSE3;  // Kept unless width is a multiple of 8.
+    if (IS_ALIGNED(width, 8)) {
+      J422ToARGBRow = J422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_J422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Tested after SSSE3, so this choice overrides it.
+    J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      J422ToARGBRow = J422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_J422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    J422ToARGBRow = J422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      J422ToARGBRow = J422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&  // Selected only if width, pointers and strides are all aligned.
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // Every plane advances on every row.
+    J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/convert_from.cc b/libs/libvpx/third_party/libyuv/source/convert_from.cc
new file mode 100644
index 0000000000..31f1ac992a
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/convert_from.cc
@@ -0,0 +1,1348 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h"  // For I420Copy
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define SUBSAMPLE(v, a, s) (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))  // (|v| + a) >> s with v's sign restored; fully parenthesized so expression arguments and enclosing expressions are safe.
+static __inline int Abs(int v) {  // Absolute value of v.
+  return v >= 0 ? v : -v;
+}
+
+// I420 To any I4xx YUV format with mirroring. Scales each plane with ScalePlane; returns 0, or -1 for zero source or non-positive chroma destination dimensions.
+static int I420ToI4xx(const uint8* src_y, int src_stride_y,
+                      const uint8* src_u, int src_stride_u,
+                      const uint8* src_v, int src_stride_v,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int src_y_width, int src_y_height,
+                      int dst_uv_width, int dst_uv_height) {
+  const int dst_y_width = Abs(src_y_width);  // Destination luma size is the magnitude of the source size.
+  const int dst_y_height = Abs(src_y_height);
+  const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);  // Half the luma size, rounded up in magnitude, sign kept.
+  const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
+  if (src_y_width == 0 || src_y_height == 0 ||
+      dst_uv_width <= 0 || dst_uv_height <= 0) {
+    return -1;
+  }
+  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,  // Signed source dimensions are passed through unchanged.
+             dst_y, dst_stride_y, dst_y_width, dst_y_height,
+             kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  return 0;
+}
+
+// Convert I420 to I422. 420 chroma is 1/2 width, 1/2 height
+// 422 chroma is 1/2 width, 1x height. Returns the result of I420ToI4xx.
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int dst_uv_width = (Abs(width) + 1) >> 1;  // Half width, rounded up.
+  const int dst_uv_height = Abs(height);  // Full height.
+  return I420ToI4xx(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,  // Passed with their original sign.
+                    dst_uv_width, dst_uv_height);
+}
+
+// Convert I420 to I444. 420 chroma is 1/2 width, 1/2 height
+// 444 chroma is 1x width, 1x height. Returns the result of I420ToI4xx.
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int dst_uv_width = Abs(width);  // Full width.
+  const int dst_uv_height = Abs(height);  // Full height.
+  return I420ToI4xx(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,  // Passed with their original sign.
+                    dst_uv_width, dst_uv_height);
+}
+
+// Convert I420 to I411. 420 chroma is 1/2 width, 1/2 height
+// 411 chroma is 1/4 width, 1x height. Returns the result of I420ToI4xx.
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int dst_uv_width = (Abs(width) + 3) >> 2;  // Quarter width, rounded up.
+  const int dst_uv_height = Abs(height);  // Full height.
+  return I420ToI4xx(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,  // Passed with their original sign.
+                    dst_uv_width, dst_uv_height);
+}
+
+// Copy to I400. Source can be I420,422,444,400,NV12,NV21. Only the Y plane is copied; returns 0, or -1 on bad arguments.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+             uint8* dst_y, int dst_stride_y,
+             int width, int height) {
+  if (!src_y || !dst_y ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;  // Start at the last source row...
+    src_stride_y = -src_stride_y;  // ...and step backwards through the source.
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+LIBYUV_API  // Convert I422 to YUY2. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height) {
+  int y;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
+      I422ToYUY2Row_C;  // Default; may be replaced by a CPU-specific variant below.
+  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;  // Start at the last destination row...
+    dst_stride_yuy2 = -dst_stride_yuy2;  // ...and write the output bottom-up.
+  }
+  // Coalesce rows: when every stride equals its packed row size, treat the image as one long row.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_yuy2 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
+  }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;  // Kept unless width is a multiple of 16.
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // Every plane advances on every row.
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_yuy2 += dst_stride_yuy2;
+  }
+  return 0;
+}
+
+LIBYUV_API  // Convert I420 to YUY2 using the I422 row functions. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height) {
+  int y;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
+      I422ToYUY2Row_C;  // Default; may be replaced by a CPU-specific variant below.
+  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;  // Start at the last destination row...
+    dst_stride_yuy2 = -dst_stride_yuy2;  // ...and write the output bottom-up.
+  }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;  // Kept unless width is a multiple of 16.
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {  // Two output rows per iteration.
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+    I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,  // Second Y row reuses the same U/V row.
+                  dst_yuy2 + dst_stride_yuy2, width);
+    src_y += src_stride_y * 2;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_yuy2 += dst_stride_yuy2 * 2;
+  }
+  if (height & 1) {  // Odd height: convert the one remaining row.
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+  }
+  return 0;
+}
+
+LIBYUV_API  // Convert I422 to UYVY. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height) {
+  int y;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
+      I422ToUYVYRow_C;  // Default; may be replaced by a CPU-specific variant below.
+  if (!src_y || !src_u || !src_v || !dst_uyvy ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;  // Start at the last destination row...
+    dst_stride_uyvy = -dst_stride_uyvy;  // ...and write the output bottom-up.
+  }
+  // Coalesce rows: when every stride equals its packed row size, treat the image as one long row.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_uyvy == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
+  }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;  // Kept unless width is a multiple of 16.
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // Every plane advances on every row.
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uyvy += dst_stride_uyvy;
+  }
+  return 0;
+}
+
+LIBYUV_API  // Convert I420 to UYVY using the I422 row functions. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height) {
+  int y;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
+      I422ToUYVYRow_C;  // Default; may be replaced by a CPU-specific variant below.
+  if (!src_y || !src_u || !src_v || !dst_uyvy ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;  // Start at the last destination row...
+    dst_stride_uyvy = -dst_stride_uyvy;  // ...and write the output bottom-up.
+  }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;  // Kept unless width is a multiple of 16.
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {  // Two output rows per iteration.
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+    I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,  // Second Y row reuses the same U/V row.
+                  dst_uyvy + dst_stride_uyvy, width);
+    src_y += src_stride_y * 2;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uyvy += dst_stride_uyvy * 2;
+  }
+  if (height & 1) {  // Odd height: convert the one remaining row.
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+  }
+  return 0;
+}
+
+LIBYUV_API  // Convert I420 to NV12: copy the Y plane and merge the U and V planes into one UV plane. Returns 0, or -1 on bad arguments.
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+      int width) = MergeUVRow_C;  // Default; may be replaced by a CPU-specific variant below.
+  // Chroma plane dimensions: half the luma dimensions, rounded up.
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;  // Recompute: the value above came from a negative height.
+    dst_y = dst_y + (height - 1) * dst_stride_y;  // Start at the last row of each destination plane...
+    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+    dst_stride_y = -dst_stride_y;  // ...and write both planes bottom-up.
+    dst_stride_uv = -dst_stride_uv;
+  }
+  if (src_stride_y == width &&  // Coalesce rows of the Y plane.
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Coalesce rows of the U/V planes.
+  if (src_stride_u == halfwidth &&
+      src_stride_v == halfwidth &&
+      dst_stride_uv == halfwidth * 2) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_u = src_stride_v = dst_stride_uv = 0;
+  }
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;  // Kept unless halfwidth is a multiple of 16.
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Tested after SSE2, so this choice overrides it.
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);  // The Y plane is copied as is.
+  for (y = 0; y < halfheight; ++y) {
+    // Merge a row of U and V into a row of UV.
+    MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += dst_stride_uv;
+  }
+  return 0;
+}
+
+LIBYUV_API  // Convert I420 to NV21 by calling I420ToNV12 with the U and V source planes swapped.
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height) {
+  return I420ToNV12(src_y, src_stride_y,  // Returns I420ToNV12's status unchanged.
+                    src_v, src_stride_v,  // V is passed where I420ToNV12 expects U...
+                    src_u, src_stride_u,  // ...and U where it expects V.
+                    dst_y, dst_stride_y,  // Must be the destination Y stride, not src_stride_y.
+                    dst_vu, dst_stride_vu,
+                    width, height);
+}
+
+// Convert I420 to ARGB using the I422 row functions. Returns 0, or -1 for a NULL buffer, width <= 0 or height == 0.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;  // Default; may be replaced below.
+  if (!src_y || !src_u || !src_v || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;  // Start at the last destination row...
+    dst_stride_argb = -dst_stride_argb;  // ...and write the output bottom-up.
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;  // Kept unless width is a multiple of 8.
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Tested after SSSE3, so this choice overrides it.
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&  // Selected only if width, pointers and strides are all aligned.
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {  // Each U/V row serves two Y rows: advance them only after odd rows.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to BGRA. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height) {
+  int y;
+  void (*I422ToBGRARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToBGRARow_C;
+  if (!src_y || !src_u || !src_v || !dst_bgra ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: rows are written bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+    dst_stride_bgra = -dst_stride_bgra;
+  }
+#if defined(HAS_I422TOBGRAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToBGRARow = I422ToBGRARow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOBGRAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToBGRARow = I422ToBGRARow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToBGRARow = I422ToBGRARow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOBGRAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToBGRARow = I422ToBGRARow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToBGRARow = I422ToBGRARow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
+    I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+    dst_bgra += dst_stride_bgra;
+    src_y += src_stride_y;
+    if (y & 1) {  // U/V advance only after odd rows: 2 luma rows per chroma row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to ABGR. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  int y;
+  void (*I422ToABGRRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToABGRRow_C;
+  if (!src_y || !src_u || !src_v || !dst_abgr ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: rows are written bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+    dst_stride_abgr = -dst_stride_abgr;
+  }
+#if defined(HAS_I422TOABGRROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToABGRRow = I422ToABGRRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOABGRROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToABGRRow = I422ToABGRRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToABGRRow = I422ToABGRRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOABGRROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToABGRRow = I422ToABGRRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToABGRRow = I422ToABGRRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+    dst_abgr += dst_stride_abgr;
+    src_y += src_stride_y;
+    if (y & 1) {  // U/V advance only after odd rows: 2 luma rows per chroma row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGBA. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height) {
+  int y;
+  void (*I422ToRGBARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: rows are written bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+    dst_stride_rgba = -dst_stride_rgba;
+  }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGBARow = I422ToRGBARow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGBARow = I422ToRGBARow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+    dst_rgba += dst_stride_rgba;
+    src_y += src_stride_y;
+    if (y & 1) {  // U/V advance only after odd rows: 2 luma rows per chroma row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB24. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+                const uint8* src_u, int src_stride_u,
+                const uint8* src_v, int src_stride_v,
+                uint8* dst_rgb24, int dst_stride_rgb24,
+                int width, int height) {
+  int y;
+  void (*I422ToRGB24Row)(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int width) = I422ToRGB24Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb24 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: rows are written bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
+  }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB24Row = I422ToRGB24Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB24Row = I422ToRGB24Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    if (y & 1) {  // U/V advance only after odd rows: 2 luma rows per chroma row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RAW. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+                const uint8* src_u, int src_stride_u,
+                const uint8* src_v, int src_stride_v,
+                uint8* dst_raw, int dst_stride_raw,
+                int width, int height) {
+  int y;
+  void (*I422ToRAWRow)(const uint8* y_buf,
+                       const uint8* u_buf,
+                       const uint8* v_buf,
+                       uint8* rgb_buf,
+                       int width) = I422ToRAWRow_C;
+  if (!src_y || !src_u || !src_v || !dst_raw ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: rows are written bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_raw = dst_raw + (height - 1) * dst_stride_raw;
+    dst_stride_raw = -dst_stride_raw;
+  }
+#if defined(HAS_I422TORAWROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRAWRow = I422ToRAWRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORAWROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRAWRow = I422ToRAWRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRAWRow = I422ToRAWRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORAWROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRAWRow = I422ToRAWRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRAWRow = I422ToRAWRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
+    dst_raw += dst_stride_raw;
+    src_y += src_stride_y;
+    if (y & 1) {  // U/V advance only after odd rows: 2 luma rows per chroma row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to ARGB1555. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_argb1555, int dst_stride_argb1555,
+                   int width, int height) {
+  int y;
+  void (*I422ToARGB1555Row)(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width) = I422ToARGB1555Row_C;
+  if (!src_y || !src_u || !src_v || !dst_argb1555 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: rows are written bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+    dst_stride_argb1555 = -dst_stride_argb1555;
+  }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width);
+    dst_argb1555 += dst_stride_argb1555;
+    src_y += src_stride_y;
+    if (y & 1) {  // U/V advance only after odd rows: 2 luma rows per chroma row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+
+// Convert I420 to ARGB4444. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_argb4444, int dst_stride_argb4444,
+                   int width, int height) {
+  int y;
+  void (*I422ToARGB4444Row)(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            int width) = I422ToARGB4444Row_C;
+  if (!src_y || !src_u || !src_v || !dst_argb4444 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: rows are written bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+    dst_stride_argb4444 = -dst_stride_argb4444;
+  }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
+    dst_argb4444 += dst_stride_argb4444;
+    src_y += src_stride_y;
+    if (y & 1) {  // U/V advance only after odd rows: 2 luma rows per chroma row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB565. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_u, int src_stride_u,
+                 const uint8* src_v, int src_stride_v,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int y;
+  void (*I422ToRGB565Row)(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width) = I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: rows are written bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB565Row = I422ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {  // U/V advance only after odd rows: 2 luma rows per chroma row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Ordered 4x4 dither for 888 to 565, one row of 4 per line.  Values 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+  0, 4, 1, 5,
+  6, 2, 7, 3,
+  1, 5, 0, 4,
+  7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering. Returns 0, or -1 on bad arguments.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+      const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: rows are written bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+  if (!dither4x4) {  // A NULL table selects the built-in kDither565_4x4.
+    dither4x4 = kDither565_4x4;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
+    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+    }
+  }
+#endif
+  {
+    // Allocate one intermediate ARGB row (width * 4 bytes), reused per row.
+    align_buffer_64(row_argb, width * 4);
+    for (y = 0; y < height; ++y) {
+      I422ToARGBRow(src_y, src_u, src_v, row_argb, width);
+      // Passes the 4 dither bytes of table row (y & 3) as one uint32.
+      // NOTE(review): the cast drops const and assumes the table may be read
+      // as a uint32 (alignment) - confirm for caller-supplied tables.
+      ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+                            *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+      dst_rgb565 += dst_stride_rgb565;
+      src_y += src_stride_y;
+      if (y & 1) {  // U/V advance only after odd rows.
+        src_u += src_stride_u;
+        src_v += src_stride_v;
+      }
+    }
+    free_aligned_buffer_64(row_argb);
+  }
+  return 0;
+}
+
+// Convert I420 to the format given by fourcc; -1 on bad args/unknown fourcc.
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+                    const uint8* u, int u_stride,
+                    const uint8* v, int v_stride,
+                    uint8* dst_sample, int dst_sample_stride,
+                    int width, int height,
+                    uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int r = 0;
+  if (!y || !u|| !v || !dst_sample ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  switch (format) {
+    // Single plane formats. A dst_sample_stride of 0 selects a packed stride.
+    case FOURCC_YUY2:
+      r = I420ToYUY2(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2,
+                     width, height);
+      break;
+    case FOURCC_UYVY:
+      r = I420ToUYVY(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2,
+                     width, height);
+      break;
+    case FOURCC_RGBP:
+      r = I420ToRGB565(y, y_stride,
+                       u, u_stride,
+                       v, v_stride,
+                       dst_sample,
+                       dst_sample_stride ? dst_sample_stride : width * 2,
+                       width, height);
+      break;
+    case FOURCC_RGBO:
+      r = I420ToARGB1555(y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         dst_sample,
+                         dst_sample_stride ? dst_sample_stride : width * 2,
+                         width, height);
+      break;
+    case FOURCC_R444:
+      r = I420ToARGB4444(y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         dst_sample,
+                         dst_sample_stride ? dst_sample_stride : width * 2,
+                         width, height);
+      break;
+    case FOURCC_24BG:
+      r = I420ToRGB24(y, y_stride,
+                      u, u_stride,
+                      v, v_stride,
+                      dst_sample,
+                      dst_sample_stride ? dst_sample_stride : width * 3,
+                      width, height);
+      break;
+    case FOURCC_RAW:
+      r = I420ToRAW(y, y_stride,
+                    u, u_stride,
+                    v, v_stride,
+                    dst_sample,
+                    dst_sample_stride ? dst_sample_stride : width * 3,
+                    width, height);
+      break;
+    case FOURCC_ARGB:
+      r = I420ToARGB(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
+      break;
+    case FOURCC_BGRA:
+      r = I420ToBGRA(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
+      break;
+    case FOURCC_ABGR:
+      r = I420ToABGR(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
+      break;
+    case FOURCC_RGBA:
+      r = I420ToRGBA(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
+      break;
+    case FOURCC_I400:
+      r = I400Copy(y, y_stride,
+                   dst_sample,
+                   dst_sample_stride ? dst_sample_stride : width,
+                   width, height);
+      break;
+    case FOURCC_NV12: {
+      uint8* dst_uv = dst_sample + width * height;  // ignores dst_sample_stride
+      r = I420ToNV12(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     dst_uv,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     width, height);
+      break;
+    }
+    case FOURCC_NV21: {
+      uint8* dst_vu = dst_sample + width * height;  // ignores dst_sample_stride
+      r = I420ToNV21(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     dst_vu,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     width, height);
+      break;
+    }
+    // TODO(fbarchard): Add M420.
+    // Triplanar formats: planes packed back to back; dst_sample_stride unused.
+    // TODO(fbarchard): halfstride instead of halfwidth
+    case FOURCC_I420:
+    case FOURCC_YU12:
+    case FOURCC_YV12: {
+      int halfwidth = (width + 1) / 2;
+      int halfheight = (height + 1) / 2;
+      uint8* dst_u;
+      uint8* dst_v;
+      if (format == FOURCC_YV12) {
+        dst_v = dst_sample + width * height;
+        dst_u = dst_v + halfwidth * halfheight;
+      } else {
+        dst_u = dst_sample + width * height;
+        dst_v = dst_u + halfwidth * halfheight;
+      }
+      r = I420Copy(y, y_stride,
+                   u, u_stride,
+                   v, v_stride,
+                   dst_sample, width,
+                   dst_u, halfwidth,
+                   dst_v, halfwidth,
+                   width, height);
+      break;
+    }
+    case FOURCC_I422:
+    case FOURCC_YV16: {
+      int halfwidth = (width + 1) / 2;
+      uint8* dst_u;
+      uint8* dst_v;
+      if (format == FOURCC_YV16) {
+        dst_v = dst_sample + width * height;
+        dst_u = dst_v + halfwidth * height;
+      } else {
+        dst_u = dst_sample + width * height;
+        dst_v = dst_u + halfwidth * height;
+      }
+      r = I420ToI422(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample, width,
+                     dst_u, halfwidth,
+                     dst_v, halfwidth,
+                     width, height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      uint8* dst_u;
+      uint8* dst_v;
+      if (format == FOURCC_YV24) {
+        dst_v = dst_sample + width * height;
+        dst_u = dst_v + width * height;
+      } else {
+        dst_u = dst_sample + width * height;
+        dst_v = dst_u + width * height;
+      }
+      r = I420ToI444(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample, width,
+                     dst_u, width,
+                     dst_v, width,
+                     width, height);
+      break;
+    }
+    case FOURCC_I411: {
+      int quarterwidth = (width + 3) / 4;
+      uint8* dst_u = dst_sample + width * height;
+      uint8* dst_v = dst_u + quarterwidth * height;
+      r = I420ToI411(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample, width,
+                     dst_u, quarterwidth,
+                     dst_v, quarterwidth,
+                     width, height);
+      break;
+    }
+
+    // Formats not supported - MJPG, biplanar, some rgb formats.
+    default:
+      return -1;  // unknown fourcc - return failure code.
+  }
+  return r;  // Status reported by the selected converter.
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/convert_from_argb.cc b/libs/libvpx/third_party/libyuv/source/convert_from_argb.cc
new file mode 100644
index 0000000000..8d1e97aec2
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/convert_from_argb.cc
@@ -0,0 +1,1301 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from_argb.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGB little endian (bgra in memory) to I444.
+// Each source row is converted with ARGBToYRow and ARGBToUV444Row.
+//   src_argb, src_stride_argb: source pixels and the byte step between rows.
+//   dst_y, dst_u, dst_v (+ strides): the three destination planes.
+//   width, height: image size in pixels. A negative height makes the source
+//     be read from its last row upward, flipping the image vertically.
+// Returns 0 on success, or -1 if a pointer is null, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int row;
+  // Defaults are the _C row functions; CPU-specific versions may replace them.
+  void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV444Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  if (width <= 0 || height == 0 || !src_argb || !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  // Negative height: begin at the last source row and step backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When no plane has padding between rows, treat the image as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_y == width &&
+      dst_stride_u == width && dst_stride_v == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+  // Choose row functions by CPU feature. Within each feature the plain
+  // function is used only when width is a multiple of the listed alignment;
+  // otherwise its _Any_ counterpart is used. Later blocks override earlier.
+#if defined(HAS_ARGBTOUV444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUV444Row = IS_ALIGNED(width, 16)
+        ? ARGBToUV444Row_SSSE3 : ARGBToUV444Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV444Row = IS_ALIGNED(width, 8)
+        ? ARGBToUV444Row_NEON : ARGBToUV444Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = IS_ALIGNED(width, 16)
+        ? ARGBToYRow_SSSE3 : ARGBToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = IS_ALIGNED(width, 32)
+        ? ARGBToYRow_AVX2 : ARGBToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = IS_ALIGNED(width, 8)
+        ? ARGBToYRow_NEON : ARGBToYRow_Any_NEON;
+  }
+#endif
+
+  // One source row yields one row in each of the three planes.
+  for (row = 0; row < height; ++row) {
+    ARGBToUV444Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// ARGB little endian (bgra in memory) to I422.
+// Each source row is converted with ARGBToYRow and ARGBToUV422Row.
+//   src_argb, src_stride_argb: source pixels and the byte step between rows.
+//   dst_y, dst_u, dst_v (+ strides): the three destination planes.
+//   width, height: image size in pixels. A negative height makes the source
+//     be read from its last row upward, flipping the image vertically.
+// Returns 0 on success, or -1 if a pointer is null, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int row;
+  // Defaults are the _C row functions; CPU-specific versions may replace them.
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV422Row_C;
+  if (width <= 0 || height == 0 || !src_argb || !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  // Negative height: begin at the last source row and step backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // If rows are contiguous (U/V stride * 2 == width), use one long row.
+  if (src_stride_argb == width * 4 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+  // Choose row functions by CPU feature. Within each feature the plain
+  // function is used only when width is a multiple of the listed alignment;
+  // otherwise its _Any_ counterpart is used. Later blocks override earlier.
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUV422Row = IS_ALIGNED(width, 16)
+        ? ARGBToUV422Row_SSSE3 : ARGBToUV422Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV422Row = IS_ALIGNED(width, 16)
+        ? ARGBToUV422Row_NEON : ARGBToUV422Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = IS_ALIGNED(width, 16)
+        ? ARGBToYRow_SSSE3 : ARGBToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = IS_ALIGNED(width, 32)
+        ? ARGBToYRow_AVX2 : ARGBToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = IS_ALIGNED(width, 8)
+        ? ARGBToYRow_NEON : ARGBToYRow_Any_NEON;
+  }
+#endif
+
+  // One source row yields one row in each of the three planes.
+  for (row = 0; row < height; ++row) {
+    ARGBToUV422Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// ARGB little endian (bgra in memory) to I411.
+// Each source row is converted with ARGBToYRow and ARGBToUV411Row.
+//   src_argb, src_stride_argb: source pixels and the byte step between rows.
+//   dst_y, dst_u, dst_v (+ strides): the three destination planes.
+//   width, height: image size in pixels. A negative height makes the source
+//     be read from its last row upward, flipping the image vertically.
+// Returns 0 on success, or -1 if a pointer is null, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int row;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV411Row_C;
+  if (width <= 0 || height == 0 || !src_argb || !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  // Negative height: begin at the last source row and step backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // If rows are contiguous (U/V stride * 4 == width), use one long row.
+  if (src_stride_argb == width * 4 && dst_stride_y == width &&
+      dst_stride_u * 4 == width && dst_stride_v * 4 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+  // Choose row functions by CPU feature. Within each feature the plain
+  // function is used only when width is a multiple of the listed alignment;
+  // otherwise its _Any_ counterpart is used. Later blocks override earlier.
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = IS_ALIGNED(width, 16)
+        ? ARGBToYRow_SSSE3 : ARGBToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = IS_ALIGNED(width, 32)
+        ? ARGBToYRow_AVX2 : ARGBToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = IS_ALIGNED(width, 8)
+        ? ARGBToYRow_NEON : ARGBToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOUV411ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV411Row = IS_ALIGNED(width, 32)
+        ? ARGBToUV411Row_NEON : ARGBToUV411Row_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ARGBToUV411Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Convert ARGB to NV12 (a Y plane plus a single plane holding both U and V).
+//   src_argb, src_stride_argb: source pixels and the byte step between rows.
+//   dst_y, dst_stride_y: destination Y plane, one row per source row.
+//   dst_uv, dst_stride_uv: destination UV plane, one row per two source rows.
+//   width, height: image size in pixels. A negative height makes the source
+//     be read from its last row upward, flipping the image vertically.
+// Returns 0 on success, -1 if a pointer is null, width <= 0 or height == 0,
+// and 1 if the scratch rows for U and V could not be allocated.
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  // Defaults are the _C row functions; CPU-specific versions may replace them.
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Choose row functions by CPU feature. Within each feature the plain
+  // function is used only when width (halfwidth for the merge) is a multiple
+  // of the listed alignment, else its _Any_ counterpart. Later blocks override.
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ARGBToUVRow = aligned ? ARGBToUVRow_SSSE3 : ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = aligned ? ARGBToYRow_SSSE3 : ARGBToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    const int aligned = IS_ALIGNED(width, 32);
+    ARGBToUVRow = aligned ? ARGBToUVRow_AVX2 : ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = aligned ? ARGBToYRow_AVX2 : ARGBToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = IS_ALIGNED(width, 8)
+        ? ARGBToYRow_NEON : ARGBToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = IS_ALIGNED(width, 16)
+        ? ARGBToUVRow_NEON : ARGBToUVRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = IS_ALIGNED(halfwidth, 16)
+        ? MergeUVRow_SSE2 : MergeUVRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = IS_ALIGNED(halfwidth, 32)
+        ? MergeUVRow_AVX2 : MergeUVRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = IS_ALIGNED(halfwidth, 16)
+        ? MergeUVRow_NEON : MergeUVRow_Any_NEON;
+  }
+#endif
+  {
+    // Allocate a row each of U and V, each padded to a multiple of 32 bytes.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+    if (!row_u) {  // Allocation failed (NOTE(review): confirm macro yields 0).
+      return 1;
+    }
+    // Two source rows per pass: ARGBToUVRow is handed the row pair through
+    // src_stride_argb and yields one merged UV row; Y is converted per row.
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    // Odd height: the final row has no partner, so 0 is passed as the row
+    // stride instead of src_stride_argb.
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Same as NV12 but U and V swapped.
+// Convert ARGB to NV21 (a Y plane plus a single plane holding both V and U).
+//   src_argb, src_stride_argb: source pixels and the byte step between rows.
+//   dst_y, dst_stride_y: destination Y plane, one row per source row.
+//   dst_uv, dst_stride_uv: destination VU plane, one row per two source rows.
+//   width, height: image size in pixels. A negative height makes the source
+//     be read from its last row upward, flipping the image vertically.
+// Returns 0 on success, -1 if a pointer is null, width <= 0 or height == 0,
+// and 1 if the scratch rows for U and V could not be allocated.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  // Defaults are the _C row functions; CPU-specific versions may replace them.
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Choose row functions by CPU feature. Within each feature the plain
+  // function is used only when width (halfwidth for the merge) is a multiple
+  // of the listed alignment, else its _Any_ counterpart. Later blocks override.
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ARGBToUVRow = aligned ? ARGBToUVRow_SSSE3 : ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = aligned ? ARGBToYRow_SSSE3 : ARGBToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    const int aligned = IS_ALIGNED(width, 32);
+    ARGBToUVRow = aligned ? ARGBToUVRow_AVX2 : ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = aligned ? ARGBToYRow_AVX2 : ARGBToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = IS_ALIGNED(width, 8)
+        ? ARGBToYRow_NEON : ARGBToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = IS_ALIGNED(width, 16)
+        ? ARGBToUVRow_NEON : ARGBToUVRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = IS_ALIGNED(halfwidth, 16)
+        ? MergeUVRow_SSE2 : MergeUVRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = IS_ALIGNED(halfwidth, 32)
+        ? MergeUVRow_AVX2 : MergeUVRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = IS_ALIGNED(halfwidth, 16)
+        ? MergeUVRow_NEON : MergeUVRow_Any_NEON;
+  }
+#endif
+  {
+    // Allocate a row each of U and V, each padded to a multiple of 32 bytes.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+    if (!row_u) {  // Allocation failed (NOTE(review): confirm macro yields 0).
+      return 1;
+    }
+    // Two source rows per pass: ARGBToUVRow is handed the row pair through
+    // src_stride_argb; the merge is called with V first, then U.
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    // Odd height: the final row has no partner, so 0 is passed as the row
+    // stride instead of src_stride_argb.
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Convert ARGB to YUY2.
+// Each row is converted to scratch Y, U and V rows, then packed to YUY2.
+//   src_argb, src_stride_argb: source pixels and the byte step between rows.
+//   dst_yuy2, dst_stride_yuy2: packed destination and its byte row step.
+//   width, height: image size in pixels. A negative height makes the
+//     destination be written from its last row upward (vertical flip).
+// Returns 0 on success, -1 if a pointer is null, width <= 0 or height == 0,
+// and 1 if the scratch rows could not be allocated.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height) {
+  int y;
+  // Defaults are the _C row functions; CPU-specific versions may replace them.
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV422Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+      const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
+
+  if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+  // When neither image has padding between rows, treat it as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yuy2 = 0;
+  }
+  // Choose row functions by CPU feature. Within each feature the plain
+  // function is used only when width is a multiple of the listed alignment;
+  // otherwise its _Any_ counterpart is used. Later blocks override earlier.
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUV422Row = IS_ALIGNED(width, 16)
+        ? ARGBToUV422Row_SSSE3 : ARGBToUV422Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV422Row = IS_ALIGNED(width, 16)
+        ? ARGBToUV422Row_NEON : ARGBToUV422Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = IS_ALIGNED(width, 16)
+        ? ARGBToYRow_SSSE3 : ARGBToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = IS_ALIGNED(width, 32)
+        ? ARGBToYRow_AVX2 : ARGBToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = IS_ALIGNED(width, 8)
+        ? ARGBToYRow_NEON : ARGBToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToYUY2Row = IS_ALIGNED(width, 16)
+        ? I422ToYUY2Row_SSE2 : I422ToYUY2Row_Any_SSE2;
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToYUY2Row = IS_ALIGNED(width, 16)
+        ? I422ToYUY2Row_NEON : I422ToYUY2Row_Any_NEON;
+  }
+#endif
+
+  {
+    // Allocate a row each of Y, U and V in one buffer: Y takes the width
+    // rounded up to a multiple of 64 bytes, U and V take half of that each.
+    // If rows were coalesced above, width is the pixel count of the image.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+    if (!row_y) {  // Allocation failed (NOTE(review): confirm macro yields 0).
+      return 1;
+    }
+    // Per row: split ARGB into planar Y/U/V scratch rows, then pack them
+    // into the destination with I422ToYUY2Row.
+    for (y = 0; y < height; ++y) {
+      ARGBToUV422Row(src_argb, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+      src_argb += src_stride_argb;
+      dst_yuy2 += dst_stride_yuy2;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to UYVY.
+// Each row is converted to scratch Y, U and V rows, then packed to UYVY.
+//   src_argb, src_stride_argb: source pixels and the byte step between rows.
+//   dst_uyvy, dst_stride_uyvy: packed destination and its byte row step.
+//   width, height: image size in pixels. A negative height makes the
+//     destination be written from its last row upward (vertical flip).
+// Returns 0 on success, -1 if a pointer is null, width <= 0 or height == 0,
+// and 1 if the scratch rows could not be allocated.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height) {
+  int y;
+  // Defaults are the _C row functions; CPU-specific versions may replace them.
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUV422Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+      const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+
+  if (!src_argb || !dst_uyvy || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+  // When neither image has padding between rows, treat it as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_uyvy = 0;
+  }
+  // Choose row functions by CPU feature. Within each feature the plain
+  // function is used only when width is a multiple of the listed alignment;
+  // otherwise its _Any_ counterpart is used. Later blocks override earlier.
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUV422Row = IS_ALIGNED(width, 16)
+        ? ARGBToUV422Row_SSSE3 : ARGBToUV422Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV422Row = IS_ALIGNED(width, 16)
+        ? ARGBToUV422Row_NEON : ARGBToUV422Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = IS_ALIGNED(width, 16)
+        ? ARGBToYRow_SSSE3 : ARGBToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = IS_ALIGNED(width, 32)
+        ? ARGBToYRow_AVX2 : ARGBToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = IS_ALIGNED(width, 8)
+        ? ARGBToYRow_NEON : ARGBToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToUYVYRow = IS_ALIGNED(width, 16)
+        ? I422ToUYVYRow_SSE2 : I422ToUYVYRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToUYVYRow = IS_ALIGNED(width, 16)
+        ? I422ToUYVYRow_NEON : I422ToUYVYRow_Any_NEON;
+  }
+#endif
+
+  {
+    // Allocate a row each of Y, U and V in one buffer: Y takes the width
+    // rounded up to a multiple of 64 bytes, U and V take half of that each.
+    // If rows were coalesced above, width is the pixel count of the image.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+    if (!row_y) {  // Allocation failed (NOTE(review): confirm macro yields 0).
+      return 1;
+    }
+    // Per row: split ARGB into planar Y/U/V scratch rows, then pack them
+    // into the destination with I422ToUYVYRow.
+    for (y = 0; y < height; ++y) {
+      ARGBToUV422Row(src_argb, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+      src_argb += src_stride_argb;
+      dst_uyvy += dst_stride_uyvy;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to I400.
+// Only a Y plane is produced: each source row goes through ARGBToYRow.
+// Strides are byte steps between rows. A negative height makes the source be
+// read from its last row upward, flipping the image vertically.
+// Returns 0 on success, or -1 if a pointer is null, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int row;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  if (width <= 0 || height == 0 || !src_argb || !dst_y) {
+    return -1;
+  }
+  // Negative height: begin at the last source row and step backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When neither image has padding between rows, treat it as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = 0;
+  }
+  // Choose a row function by CPU feature: the plain one when width meets the
+  // listed alignment, else its _Any_ counterpart. Later blocks override.
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = IS_ALIGNED(width, 16)
+        ? ARGBToYRow_SSSE3 : ARGBToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = IS_ALIGNED(width, 32)
+        ? ARGBToYRow_AVX2 : ARGBToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = IS_ALIGNED(width, 8)
+        ? ARGBToYRow_NEON : ARGBToYRow_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+// Shuffle table for converting ARGB to RGBA.
+static uvec8 kShuffleMaskARGBToRGBA = {  // 16 byte indices: 4 pixels x 4 bytes.
+  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+};  // Each pixel's group is (3, 0, 1, 2) plus that pixel's byte offset.
+
+// Convert ARGB to RGBA by shuffling bytes with kShuffleMaskARGBToRGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height) {
+  // ARGBShuffle is handed the shuffle table as a plain byte pointer.
+  const uint8* shuffle_table = (const uint8*)(&kShuffleMaskARGBToRGBA);
+  return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba,
+                     shuffle_table, width, height);
+}
+
+// Convert ARGB To RGB24.
+// Source rows are 4 bytes per pixel, destination rows 3 bytes per pixel.
+// A negative height reads the source from its last row upward (vertical flip).
+// Returns 0 on success, or -1 if a pointer is null, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_rgb24, int dst_stride_rgb24,
+                int width, int height) {
+  int row;
+  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToRGB24Row_C;
+  if (width <= 0 || height == 0 || !src_argb || !dst_rgb24) {
+    return -1;
+  }
+  // Negative height: begin at the last source row and step backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When neither image has padding between rows, treat it as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_rgb24 = 0;
+  }
+  // Per CPU feature: plain function if width is aligned, else the _Any_ one.
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToRGB24Row = IS_ALIGNED(width, 16)
+        ? ARGBToRGB24Row_SSSE3 : ARGBToRGB24Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB24Row = IS_ALIGNED(width, 8)
+        ? ARGBToRGB24Row_NEON : ARGBToRGB24Row_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ARGBToRGB24Row(src_argb, dst_rgb24, width);
+    src_argb += src_stride_argb;
+    dst_rgb24 += dst_stride_rgb24;
+  }
+  return 0;
+}
+
+// Convert ARGB To RAW.
+// Source rows are 4 bytes per pixel, destination rows 3 bytes per pixel.
+// A negative height reads the source from its last row upward (vertical flip).
+// Returns 0 on success, or -1 if a pointer is null, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_raw, int dst_stride_raw,
+              int width, int height) {
+  int row;
+  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToRAWRow_C;
+  if (width <= 0 || height == 0 || !src_argb || !dst_raw) {
+    return -1;
+  }
+  // Negative height: begin at the last source row and step backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When neither image has padding between rows, treat it as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_raw = 0;
+  }
+  // Per CPU feature: plain function if width is aligned, else the _Any_ one.
+#if defined(HAS_ARGBTORAWROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToRAWRow = IS_ALIGNED(width, 16)
+        ? ARGBToRAWRow_SSSE3 : ARGBToRAWRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRAWRow = IS_ALIGNED(width, 8)
+        ? ARGBToRAWRow_NEON : ARGBToRAWRow_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ARGBToRAWRow(src_argb, dst_raw, width);
+    src_argb += src_stride_argb;
+    dst_raw += dst_stride_raw;
+  }
+  return 0;
+}
+
+// Ordered 4x4 dither for 888 to 565.  Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {  // 4 rows of 4 bytes, row-major.
+  0, 4, 1, 5,
+  6, 2, 7, 3,
+  1, 5, 0, 4,
+  7, 3, 6, 2,
+};
+
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+// dither4x4 may be null, in which case kDither565_4x4 is used. Row y uses the
+// four bytes at dither4x4 + 4 * (y & 3). A negative height reads the source
+// from its last row upward. Returns 0 on success, or -1 if src_argb or
+// dst_rgb565 is null, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height) {
+  int y;
+  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+      const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
+  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (!dither4x4) {
+    dither4x4 = kDither565_4x4;
+  }
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565DitherRow = IS_ALIGNED(width, 4)
+        ? ARGBToRGB565DitherRow_SSE2 : ARGBToRGB565DitherRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565DitherRow = IS_ALIGNED(width, 8)
+        ? ARGBToRGB565DitherRow_AVX2 : ARGBToRGB565DitherRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB565DitherRow = IS_ALIGNED(width, 8)
+        ? ARGBToRGB565DitherRow_NEON : ARGBToRGB565DitherRow_Any_NEON;
+  }
+#endif
+  // Each row passes 4 dither bytes as one uint32, read via a const pointer.
+  // NOTE(review): a table that is not 4-byte aligned makes this load unaligned.
+  for (y = 0; y < height; ++y) {
+    ARGBToRGB565DitherRow(src_argb, dst_rgb565,
+                          *(const uint32*)(dither4x4 + ((y & 3) << 2)), width);
+    src_argb += src_stride_argb;
+    dst_rgb565 += dst_stride_rgb565;
+  }
+  return 0;
+}
+
+// Convert ARGB To RGB565.
+// TODO(fbarchard): Consider using dither function low level with zeros.
+// Source rows are 4 bytes per pixel, destination rows 2 bytes per pixel.
+// Strides are byte steps between rows. A negative height reads the source
+// from its last row upward, flipping the image vertically.
+// Returns 0 on success, or -1 if a pointer is null, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int row;
+  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToRGB565Row_C;
+  if (width <= 0 || height == 0 || !src_argb || !dst_rgb565) {
+    return -1;
+  }
+  // Negative height: begin at the last source row and step backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When neither image has padding between rows, treat it as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_rgb565 = 0;
+  }
+  // Choose a row function by CPU feature: the plain one when width meets the
+  // listed alignment, else its _Any_ counterpart. Later blocks override.
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565Row = IS_ALIGNED(width, 4)
+        ? ARGBToRGB565Row_SSE2 : ARGBToRGB565Row_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565Row = IS_ALIGNED(width, 8)
+        ? ARGBToRGB565Row_AVX2 : ARGBToRGB565Row_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB565Row = IS_ALIGNED(width, 8)
+        ? ARGBToRGB565Row_NEON : ARGBToRGB565Row_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ARGBToRGB565Row(src_argb, dst_rgb565, width);
+    src_argb += src_stride_argb;
+    dst_rgb565 += dst_stride_rgb565;
+  }
+  return 0;
+}
+
+// Convert ARGB To ARGB1555.
+// Source rows are 4 bytes per pixel, destination rows 2 bytes per pixel.
+// Strides are byte steps between rows. A negative height reads the source
+// from its last row upward, flipping the image vertically.
+// Returns 0 on success, or -1 if a pointer is null, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb1555, int dst_stride_argb1555,
+                   int width, int height) {
+  int row;
+  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToARGB1555Row_C;
+  if (width <= 0 || height == 0 || !src_argb || !dst_argb1555) {
+    return -1;
+  }
+  // Negative height: begin at the last source row and step backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When neither image has padding between rows, treat it as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb1555 = 0;
+  }
+  // Choose a row function by CPU feature: the plain one when width meets the
+  // listed alignment, else its _Any_ counterpart. Later blocks override.
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToARGB1555Row = IS_ALIGNED(width, 4)
+        ? ARGBToARGB1555Row_SSE2 : ARGBToARGB1555Row_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToARGB1555Row = IS_ALIGNED(width, 8)
+        ? ARGBToARGB1555Row_AVX2 : ARGBToARGB1555Row_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToARGB1555Row = IS_ALIGNED(width, 8)
+        ? ARGBToARGB1555Row_NEON : ARGBToARGB1555Row_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ARGBToARGB1555Row(src_argb, dst_argb1555, width);
+    src_argb += src_stride_argb;
+    dst_argb1555 += dst_stride_argb1555;
+  }
+  return 0;
+}
+
+// Convert ARGB To ARGB4444.
+// Source rows are 4 bytes per pixel, destination rows 2 bytes per pixel.
+// Strides are byte steps between rows. A negative height reads the source
+// from its last row upward, flipping the image vertically.
+// Returns 0 on success, or -1 if a pointer is null, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb4444, int dst_stride_argb4444,
+                   int width, int height) {
+  int row;
+  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToARGB4444Row_C;
+  if (width <= 0 || height == 0 || !src_argb || !dst_argb4444) {
+    return -1;
+  }
+  // Negative height: begin at the last source row and step backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When neither image has padding between rows, treat it as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb4444 = 0;
+  }
+  // Choose a row function by CPU feature: the plain one when width meets the
+  // listed alignment, else its _Any_ counterpart. Later blocks override.
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToARGB4444Row = IS_ALIGNED(width, 4)
+        ? ARGBToARGB4444Row_SSE2 : ARGBToARGB4444Row_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToARGB4444Row = IS_ALIGNED(width, 8)
+        ? ARGBToARGB4444Row_AVX2 : ARGBToARGB4444Row_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToARGB4444Row = IS_ALIGNED(width, 8)
+        ? ARGBToARGB4444Row_NEON : ARGBToARGB4444Row_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ARGBToARGB4444Row(src_argb, dst_argb4444, width);
+    src_argb += src_stride_argb;
+    dst_argb4444 += dst_stride_argb4444;
+  }
+  return 0;
+}
+
+// Convert ARGB to J420 (JPeg full range I420). Returns 0, or -1 on bad args.
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+      ARGBToYJRow_C;  // Both row functions default to C; CPU-specific ones are picked below.
+  if (!src_argb ||
+      !dst_yj || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image (source is read from its last row up).
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {  // Non-"Any" rows only when width is aligned to 16.
+      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Only the Y row has an AVX2 path here.
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {  // Two source rows per pass.
+    ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);  // One U/V row per row pair.
+    ARGBToYJRow(src_argb, dst_yj, width);
+    ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
+    src_argb += src_stride_argb * 2;
+    dst_yj += dst_stride_yj * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {  // Odd height: last row handled alone, stride 0 passed to the UV row.
+    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+    ARGBToYJRow(src_argb, dst_yj, width);
+  }
+  return 0;
+}
+
+// ARGB little endian (bgra in memory) to J422. Returns 0, or -1 on bad args.
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int pix) = ARGBToUVJ422Row_C;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYJRow_C;  // Both row functions default to C; CPU-specific ones are picked below.
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {  // Negative height: start at the last source row, step backwards.
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows: with no padding in any plane, convert as one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOUVJ422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {  // Non-"Any" row only when width is aligned to 16.
+      ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJ422ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJ422Row = ARGBToUVJ422Row_NEON;
+    }
+  }
+#endif
+
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // Each source row yields one U, one V and one Y row.
+    ARGBToUVJ422Row(src_argb, dst_u, dst_v, width);
+    ARGBToYJRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Convert ARGB to J400 (Y plane only). Returns 0, or -1 on bad args.
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               int width, int height) {
+  int y;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+      ARGBToYJRow_C;  // Default; may be replaced by a CPU-specific row below.
+  if (!src_argb || !dst_yj || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {  // Negative height: start at the last source row, step backwards.
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows: with no padding in either buffer, convert as one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_yj == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yj = 0;
+  }
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {  // Non-"Any" row only when width is aligned to 16.
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {  // Non-"Any" row only when width is aligned to 32.
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {  // Non-"Any" row only when width is aligned to 8.
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // A single pass when rows were coalesced above.
+    ARGBToYJRow(src_argb, dst_yj, width);
+    src_argb += src_stride_argb;
+    dst_yj += dst_stride_yj;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/convert_jpeg.cc b/libs/libvpx/third_party/libyuv/source/convert_jpeg.cc
new file mode 100644
index 0000000000..bcb980f7f1
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/convert_jpeg.cc
@@ -0,0 +1,392 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAVE_JPEG
+struct I420Buffers {  // Destination state shared with the Jpeg*ToI420 callbacks.
+  uint8* y;  // Current write position in the Y plane; advanced by each callback.
+  int y_stride;
+  uint8* u;  // Current write position in the U plane.
+  int u_stride;
+  uint8* v;  // Current write position in the V plane.
+  int v_stride;
+  int w;  // Width passed to the per-format converter.
+  int h;  // Decremented by the row count of each callback invocation.
+};
+
+static void JpegCopyI420(void* opaque,  // Callback: copy decoded I420 rows to the I420Buffers in opaque.
+                         const uint8* const* data,
+                         const int* strides,
+                         int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I420Copy(data[0], strides[0],  // data[0..2] / strides[0..2] are passed as the Y, U, V sources.
+           data[1], strides[1],
+           data[2], strides[2],
+           dest->y, dest->y_stride,
+           dest->u, dest->u_stride,
+           dest->v, dest->v_stride,
+           dest->w, rows);
+  dest->y += rows * dest->y_stride;  // Advance Y past the rows just written.
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;  // Chroma advances by half the rows, rounded up.
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI422ToI420(void* opaque,  // Callback: convert decoded I422 rows into the I420Buffers in opaque.
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I422ToI420(data[0], strides[0],  // data[0..2] / strides[0..2] are passed as the Y, U, V sources.
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;  // Advance Y past the rows just written.
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;  // Chroma advances by half the rows, rounded up.
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI444ToI420(void* opaque,  // Callback: convert decoded I444 rows into the I420Buffers in opaque.
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I444ToI420(data[0], strides[0],  // data[0..2] / strides[0..2] are passed as the Y, U, V sources.
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;  // Advance Y past the rows just written.
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;  // Chroma advances by half the rows, rounded up.
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI411ToI420(void* opaque,  // Callback: convert decoded I411 rows into the I420Buffers in opaque.
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I411ToI420(data[0], strides[0],  // data[0..2] / strides[0..2] are passed as the Y, U, V sources.
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;  // Advance Y past the rows just written.
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;  // Chroma advances by half the rows, rounded up.
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI400ToI420(void* opaque,  // Callback: convert decoded I400 rows into the I420Buffers in opaque.
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I400ToI420(data[0], strides[0],  // Only plane 0 of data/strides is read here.
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;  // Advance Y past the rows just written.
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;  // Chroma advances by half the rows, rounded up.
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+// Query size of MJPG in pixels. Returns 0 and fills *width/*height, or -1 if the frame fails to load.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+             int* width, int* height) {
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  if (ret) {  // width and height are written only on success; they are not NULL-checked.
+    *width = mjpeg_decoder.GetWidth();
+    *height = mjpeg_decoder.GetHeight();
+  }
+  mjpeg_decoder.UnloadFrame();  // Called whether or not LoadFrame succeeded.
+  return ret ? 0 : -1;  // -1 for runtime failure.
+}
+
+// MJPG (Motion JPeg) to I420. Returns 0 on success, -1 if sample_size is unknown, 1 on any other failure.
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToI420(const uint8* sample,
+               size_t sample_size,
+               uint8* y, int y_stride,
+               uint8* u, int u_stride,
+               uint8* v, int v_stride,
+               int w, int h,
+               int dw, int dh) {
+  if (sample_size == kUnknownDataSize) {
+    // ERROR: MJPEG frame size unknown
+    return -1;
+  }
+
+  // TODO(fbarchard): Port MJpeg to C.
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  if (ret && (mjpeg_decoder.GetWidth() != w ||
+              mjpeg_decoder.GetHeight() != h)) {
+    // ERROR: MJPEG frame has unexpected dimensions
+    mjpeg_decoder.UnloadFrame();
+    return 1;  // runtime failure
+  }
+  if (ret) {
+    I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };  // w/h fields come from dw/dh.
+    // YUV420: 3 components, luma sampled 2x2 relative to chroma.
+    if (mjpeg_decoder.GetColorSpace() ==
+            MJpegDecoder::kColorSpaceYCbCr &&
+        mjpeg_decoder.GetNumComponents() == 3 &&
+        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
+    // YUV422: luma sampled 2x horizontally, 1x vertically.
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
+    // YUV444: all sampling factors are 1.
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
+    // YUV411: luma sampled 4x horizontally, 1x vertically.
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
+    // YUV400: single grayscale component.
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceGrayscale &&
+               mjpeg_decoder.GetNumComponents() == 1 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+    } else {
+      // TODO(fbarchard): Implement conversion for any other colorspace/sample
+      // factors that occur in practice. 411 is supported by libjpeg
+      // ERROR: Unable to convert MJPEG frame because format is not supported
+      mjpeg_decoder.UnloadFrame();
+      return 1;
+    }
+  }
+  return ret ? 0 : 1;  // 1 covers both a failed LoadFrame and a failed decode.
+}
+
+#ifdef HAVE_JPEG
+struct ARGBBuffers {  // Destination state shared with the Jpeg*ToARGB callbacks.
+  uint8* argb;  // Current write position; advanced by each callback.
+  int argb_stride;
+  int w;  // Width passed to the per-format converter.
+  int h;  // Decremented by the row count of each callback invocation.
+};
+
+static void JpegI420ToARGB(void* opaque,  // Callback: convert decoded I420 rows into the ARGBBuffers in opaque.
+                         const uint8* const* data,
+                         const int* strides,
+                         int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I420ToARGB(data[0], strides[0],  // data[0..2] / strides[0..2] are passed as the Y, U, V sources.
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;  // Advance past the rows just written.
+  dest->h -= rows;
+}
+
+static void JpegI422ToARGB(void* opaque,  // Callback: convert decoded I422 rows into the ARGBBuffers in opaque.
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I422ToARGB(data[0], strides[0],  // data[0..2] / strides[0..2] are passed as the Y, U, V sources.
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;  // Advance past the rows just written.
+  dest->h -= rows;
+}
+
+static void JpegI444ToARGB(void* opaque,  // Callback: convert decoded I444 rows into the ARGBBuffers in opaque.
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I444ToARGB(data[0], strides[0],  // data[0..2] / strides[0..2] are passed as the Y, U, V sources.
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;  // Advance past the rows just written.
+  dest->h -= rows;
+}
+
+static void JpegI411ToARGB(void* opaque,  // Callback: convert decoded I411 rows into the ARGBBuffers in opaque.
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I411ToARGB(data[0], strides[0],  // data[0..2] / strides[0..2] are passed as the Y, U, V sources.
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;  // Advance past the rows just written.
+  dest->h -= rows;
+}
+
+static void JpegI400ToARGB(void* opaque,  // Callback: convert decoded I400 rows into the ARGBBuffers in opaque.
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I400ToARGB(data[0], strides[0],  // Only plane 0 of data/strides is read here.
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;  // Advance past the rows just written.
+  dest->h -= rows;
+}
+
+// MJPG (Motion JPeg) to ARGB. Returns 0 on success, -1 if sample_size is unknown, 1 on any other failure.
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample,
+               size_t sample_size,
+               uint8* argb, int argb_stride,
+               int w, int h,
+               int dw, int dh) {
+  if (sample_size == kUnknownDataSize) {
+    // ERROR: MJPEG frame size unknown
+    return -1;
+  }
+
+  // TODO(fbarchard): Port MJpeg to C.
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  if (ret && (mjpeg_decoder.GetWidth() != w ||
+              mjpeg_decoder.GetHeight() != h)) {
+    // ERROR: MJPEG frame has unexpected dimensions
+    mjpeg_decoder.UnloadFrame();
+    return 1;  // runtime failure
+  }
+  if (ret) {
+    ARGBBuffers bufs = { argb, argb_stride, dw, dh };  // w/h fields come from dw/dh.
+    // YUV420: 3 components, luma sampled 2x2 relative to chroma.
+    if (mjpeg_decoder.GetColorSpace() ==
+            MJpegDecoder::kColorSpaceYCbCr &&
+        mjpeg_decoder.GetNumComponents() == 3 &&
+        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
+    // YUV422: luma sampled 2x horizontally, 1x vertically.
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
+    // YUV444: all sampling factors are 1.
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
+    // YUV411: luma sampled 4x horizontally, 1x vertically.
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
+    // YUV400: single grayscale component.
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceGrayscale &&
+               mjpeg_decoder.GetNumComponents() == 1 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+    } else {
+      // TODO(fbarchard): Implement conversion for any other colorspace/sample
+      // factors that occur in practice. 411 is supported by libjpeg
+      // ERROR: Unable to convert MJPEG frame because format is not supported
+      mjpeg_decoder.UnloadFrame();
+      return 1;
+    }
+  }
+  return ret ? 0 : 1;  // 1 covers both a failed LoadFrame and a failed decode.
+}
+#endif
+
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/convert_to_argb.cc b/libs/libvpx/third_party/libyuv/source/convert_to_argb.cc
new file mode 100644
index 0000000000..af829fbd32
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/convert_to_argb.cc
@@ -0,0 +1,306 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is the size of the frame in bytes (compressed size for MJPEG).
+// Returns 0 on success, -1 on bad arguments or unknown fourcc, 1 if malloc fails.
+LIBYUV_API
+int ConvertToARGB(const uint8* sample, size_t sample_size,
+                  uint8* crop_argb, int argb_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+
+  // The per-format converters below do not rotate. When rotation is requested
+  // for a non-ARGB source, convert (with optional vertical flipping) into a
+  // temporary ARGB buffer and then rotate that into the final destination.
+  // For in-place conversion, if destination crop_argb is same as source sample,
+  // also enable temporary buffer.
+  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
+      crop_argb == sample;  // NOTE(review): rotation looks unapplied for FOURCC_ARGB input - confirm.
+  uint8* tmp_argb = crop_argb;  // Caller's destination, kept for the final ARGBRotate.
+  int tmp_argb_stride = argb_stride;
+  uint8* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (crop_argb == NULL || sample == NULL ||
+      src_width <= 0 || crop_width <= 0 ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  if (src_height < 0) {  // Negative source height requests a vertically flipped result.
+    inv_crop_height = -inv_crop_height;
+  }
+
+  if (need_buf) {
+    int argb_size = crop_width * abs_crop_height * 4;
+    rotate_buffer = (uint8*)malloc(argb_size);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+    }
+    crop_argb = rotate_buffer;
+    argb_stride = crop_width * 4;  // Stride is in bytes; ARGB is 4 bytes per pixel.
+  }
+
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = YUY2ToARGB(src, aligned_src_width * 2,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_UYVY:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = UYVYToARGB(src, aligned_src_width * 2,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_24BG:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RGB24ToARGB(src, src_width * 3,
+                      crop_argb, argb_stride,
+                      crop_width, inv_crop_height);
+      break;
+    case FOURCC_RAW:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RAWToARGB(src, src_width * 3,
+                    crop_argb, argb_stride,
+                    crop_width, inv_crop_height);
+      break;
+    case FOURCC_ARGB:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ARGBToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_BGRA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = BGRAToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ABGRToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = RGBAToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBP:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = RGB565ToARGB(src, src_width * 2,
+                       crop_argb, argb_stride,
+                       crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBO:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB1555ToARGB(src, src_width * 2,
+                         crop_argb, argb_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_R444:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB4444ToARGB(src, src_width * 2,
+                         crop_argb, argb_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_I400:
+      src = sample + src_width * crop_y + crop_x;
+      r = I400ToARGB(src, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+
+    // Biplanar formats (UV plane offset uses abs_src_height so inverted input stays in bounds)
+    case FOURCC_NV12:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
+      r = NV12ToARGB(src, src_width,
+                     src_uv, aligned_src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_NV21:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
+      // Same plane pointers as NV12; the NV21 converter is used instead.
+      r = NV21ToARGB(src, src_width,
+                     src_uv, aligned_src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_M420:
+      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+      r = M420ToARGB(src, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    // Triplanar formats
+    case FOURCC_I420:
+    case FOURCC_YU12:
+    case FOURCC_YV12: {
+      const uint8* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      if (format == FOURCC_YV12) {  // YV12: the V plane precedes the U plane.
+        src_v = sample + src_width * abs_src_height +
+            (halfwidth * crop_y + crop_x) / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            (halfwidth * crop_y + crop_x) / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      }
+      r = I420ToARGB(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_J420: {
+      const uint8* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      src_u = sample + src_width * abs_src_height +
+          (halfwidth * crop_y + crop_x) / 2;
+      src_v = sample + src_width * abs_src_height +
+          halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = J420ToARGB(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_I422:
+    case FOURCC_YV16: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      if (format == FOURCC_YV16) {  // YV16: the V plane precedes the U plane.
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      }
+      r = I422ToARGB(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      if (format == FOURCC_YV24) {  // YV24: the V plane precedes the U plane.
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      } else {
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      }
+      r = I444ToARGB(src_y, src_width,
+                     src_u, src_width,
+                     src_v, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I411: {
+      int quarterwidth = (src_width + 3) / 4;
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u = sample + src_width * abs_src_height +
+          quarterwidth * crop_y + crop_x / 4;
+      const uint8* src_v = sample + src_width * abs_src_height +
+          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+      r = I411ToARGB(src_y, src_width,
+                     src_u, quarterwidth,
+                     src_v, quarterwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToARGB(sample, sample_size,
+                     crop_argb, argb_stride,
+                     src_width, abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {  // Rotate (or plain-copy for in-place use) from the temp buffer to the caller's buffer.
+      r = ARGBRotate(crop_argb, argb_stride,
+                     tmp_argb, tmp_argb_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);  // Released on both the success and the failure path.
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/convert_to_i420.cc b/libs/libvpx/third_party/libyuv/source/convert_to_i420.cc
new file mode 100644
index 0000000000..5e75369b55
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/convert_to_i420.cc
@@ -0,0 +1,339 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/convert.h"
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation; src_height locates the
+// planes and, when negative, indicates inversion. sample_size is the frame
+// size in bytes (the compressed size for MJPEG). Returns -1 for bad arguments
+// or an unknown fourcc, 1 if malloc fails, else the converter's result.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+                  size_t sample_size,
+                  uint8* y, int y_stride,
+                  uint8* u, int u_stride,
+                  uint8* v, int v_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;  // Rounded up to even.
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
+      format != FOURCC_NV12 && format != FOURCC_NV21 &&
+      format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
+  uint8* tmp_y = y;  // Caller's destination, used by the final I420Rotate.
+  uint8* tmp_u = u;
+  uint8* tmp_v = v;
+  int tmp_y_stride = y_stride;
+  int tmp_u_stride = u_stride;
+  int tmp_v_stride = v_stride;
+  uint8* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (!y || !u || !v || !sample ||
+      src_width <= 0 || crop_width <= 0  ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  if (src_height < 0) {
+    inv_crop_height = -inv_crop_height;  // Negative height is passed on to the converters.
+  }
+
+  // One pass rotation is available for some formats. For the rest, convert
+  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+  // and then rotate the I420 to the final destination buffer.
+  // For in-place conversion, if destination y is same as source sample,
+  // also enable temporary buffer.
+  if (need_buf) {
+    int y_size = crop_width * abs_crop_height;
+    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+    }
+    y = rotate_buffer;
+    u = y + y_size;
+    v = u + uv_size;
+    y_stride = crop_width;
+    u_stride = v_stride = ((crop_width + 1) / 2);
+  }
+
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = YUY2ToI420(src, aligned_src_width * 2,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_UYVY:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = UYVYToI420(src, aligned_src_width * 2,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBP:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = RGB565ToI420(src, src_width * 2,
+                       y, y_stride,
+                       u, u_stride,
+                       v, v_stride,
+                       crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBO:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB1555ToI420(src, src_width * 2,
+                         y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_R444:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB4444ToI420(src, src_width * 2,
+                         y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_24BG:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RGB24ToI420(src, src_width * 3,
+                      y, y_stride,
+                      u, u_stride,
+                      v, v_stride,
+                      crop_width, inv_crop_height);
+      break;
+    case FOURCC_RAW:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RAWToI420(src, src_width * 3,
+                    y, y_stride,
+                    u, u_stride,
+                    v, v_stride,
+                    crop_width, inv_crop_height);
+      break;
+    case FOURCC_ARGB:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ARGBToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_BGRA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = BGRAToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ABGRToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = RGBAToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_I400:
+      src = sample + src_width * crop_y + crop_x;
+      r = I400ToI420(src, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    // Biplanar formats: the UV plane starts after |src_height| rows of Y.
+    case FOURCC_NV12:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * abs_src_height) +  // src_height may be < 0.
+        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      r = NV12ToI420Rotate(src, src_width,
+                           src_uv, aligned_src_width,
+                           y, y_stride,
+                           u, u_stride,
+                           v, v_stride,
+                           crop_width, inv_crop_height, rotation);
+      break;
+    case FOURCC_NV21:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * abs_src_height) +  // src_height may be < 0.
+        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      // Call NV12 but with u and v parameters swapped.
+      r = NV12ToI420Rotate(src, src_width,
+                           src_uv, aligned_src_width,
+                           y, y_stride,
+                           v, v_stride,
+                           u, u_stride,
+                           crop_width, inv_crop_height, rotation);
+      break;
+    case FOURCC_M420:
+      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+      r = M420ToI420(src, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    // Triplanar formats
+    case FOURCC_I420:
+    case FOURCC_YU12:
+    case FOURCC_YV12: {
+      const uint8* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      if (format == FOURCC_YV12) {  // V plane is read before the U plane.
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (crop_y / 2) + crop_x / 2;  // Whole chroma rows only.
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (crop_y / 2) + crop_x / 2;  // Whole chroma rows only.
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      }
+      r = I420Rotate(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height, rotation);
+      break;
+    }
+    case FOURCC_I422:
+    case FOURCC_YV16: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      if (format == FOURCC_YV16) {
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      }
+      r = I422ToI420(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      if (format == FOURCC_YV24) {
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      } else {
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      }
+      r = I444ToI420(src_y, src_width,
+                     src_u, src_width,
+                     src_v, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I411: {
+      int quarterwidth = (src_width + 3) / 4;
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u = sample + src_width * abs_src_height +
+          quarterwidth * crop_y + crop_x / 4;
+      const uint8* src_v = sample + src_width * abs_src_height +
+          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+      r = I411ToI420(src_y, src_width,
+                     src_u, quarterwidth,
+                     src_v, quarterwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToI420(sample, sample_size,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     src_width, abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = I420Rotate(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     tmp_y, tmp_y_stride,
+                     tmp_u, tmp_u_stride,
+                     tmp_v, tmp_v_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);  // Released whether or not the conversion succeeded.
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/cpu_id.cc b/libs/libvpx/third_party/libyuv/source/cpu_id.cc
new file mode 100644
index 0000000000..8a10b00835
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/cpu_id.cc
@@ -0,0 +1,307 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/cpu_id.h"
+
+#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+#include <intrin.h>  // For __cpuidex()
+#endif
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
+    defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h>  // For _xgetbv()
+#endif
+
+#if !defined(__native_client__)
+#include <stdlib.h>  // For getenv()
+#endif
+
+// For ArmCpuCaps() but unittested on all platforms
+#include <stdio.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h"  // For CPU_X86
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid the additional check (MSVC-only attribute).
+#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else  // Other compilers, or older MSVC: SAFEBUFFERS expands to nothing.
+#define SAFEBUFFERS
+#endif
+
+// Low level cpuid for X86: executes CPUID with eax = info_eax, ecx = info_ecx.
+#if (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER)
+LIBYUV_API
+void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {  // cpu_info: 4 entries.
+#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+// Visual C version uses intrinsic or inline x86 assembly.
+#if (_MSC_FULL_VER >= 160040219)
+  __cpuidex((int*)(cpu_info), info_eax, info_ecx);
+#elif defined(_M_IX86)
+  __asm {
+    mov        eax, info_eax
+    mov        ecx, info_ecx
+    mov        edi, cpu_info
+    cpuid
+    mov        [edi], eax
+    mov        [edi + 4], ebx
+    mov        [edi + 8], ecx
+    mov        [edi + 12], edx
+  }
+#else  // Older MSVC, not _M_IX86: __cpuid is given no ecx value, so only info_ecx == 0 is served.
+  if (info_ecx == 0) {
+    __cpuid((int*)(cpu_info), info_eax);
+  } else {
+    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;  // Report all zeros.
+  }
+#endif
+// GCC version uses inline x86 assembly.
+#else  // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+  uint32 info_ebx, info_edx;
+  asm volatile (  // NOLINT
+#if defined( __i386__) && defined(__PIC__)
+    // Preserve ebx for fpic 32 bit.
+    "mov %%ebx, %%edi                          \n"
+    "cpuid                                     \n"
+    "xchg %%edi, %%ebx                         \n"
+    : "=D" (info_ebx),
+#else
+    "cpuid                                     \n"
+    : "=b" (info_ebx),
+#endif  //  defined( __i386__) && defined(__PIC__)
+      "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+  cpu_info[0] = info_eax;  // Results are stored in eax, ebx, ecx, edx order.
+  cpu_info[1] = info_ebx;
+  cpu_info[2] = info_ecx;
+  cpu_info[3] = info_edx;
+#endif  // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+}
+#else  // (defined(_M_IX86) || defined(_M_X64) ...
+LIBYUV_API
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {  // Stub: eax and ecx are ignored.
+  cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
+}
+#endif
+
+// TODO(fbarchard): Enable xgetbv when validator supports it.
+#if (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
+#define HAS_XGETBV
+// X86 CPUs have xgetbv to detect whether the OS saves high parts of ymm registers.
+int TestOsSaveYmm() {  // Returns 1 when bits 1 and 2 of xcr0 are both set, else 0.
+  uint32 xcr0 = 0u;  // Stays 0 if none of the branches below is compiled in.
+#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
+  xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
+#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
+  __asm {
+    xor        ecx, ecx    // xcr 0
+    _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
+    mov        xcr0, eax
+  }
+#elif defined(__i386__) || defined(__x86_64__)
+  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");  // Same bytes as the _emit path.
+#endif  // defined(__i386__) || defined(__x86_64__)
+  return((xcr0 & 6) == 6);  // Is ymm saved?
+}
+#endif  // defined(_M_IX86) || defined(_M_X64) ..
+
+// based on libvpx arm_cpudetect.c
+// For Arm, but public to allow testing on any CPU
+LIBYUV_API SAFEBUFFERS
+int ArmCpuCaps(const char* cpuinfo_name) {  // Returns kCpuHasNEON or 0.
+  char line[512];
+  int caps = 0;
+  FILE* fp = fopen(cpuinfo_name, "r");
+  if (fp == NULL) {
+    // Assume Neon if the file is unavailable (e.g. a sandboxed process).
+    return kCpuHasNEON;
+  }
+  while (!caps && fgets(line, sizeof(line) - 1, fp)) {
+    const char* hit;
+    if (memcmp(line, "Features", 8) != 0) {
+      continue;  // Only lines starting with "Features" are inspected.
+    }
+    hit = strstr(line, " neon");
+    if (hit && (hit[5] == ' ' || hit[5] == '\n')) {
+      caps = kCpuHasNEON;
+      continue;
+    }
+    hit = strstr(line, " asimd");  // aarch64 uses asimd for Neon.
+    if (hit && (hit[6] == ' ' || hit[6] == '\n')) {
+      caps = kCpuHasNEON;
+    }
+  }
+  fclose(fp);
+  return caps;
+}
+
+#if defined(__mips__) && defined(__linux__)
+// Returns kCpuHasMIPS_DSP if any line of /proc/cpuinfo contains
+// search_string (or if the file cannot be opened), otherwise 0.
+static int MipsCpuCaps(const char* search_string) {
+  char line[512];
+  int caps = 0;
+  FILE* fp = fopen("/proc/cpuinfo", "r");
+  if (fp == NULL) {
+    // Assume DSP when cpuinfo is unavailable (e.g. a sandboxed process).
+    return kCpuHasMIPS_DSP;
+  }
+  while (!caps && fgets(line, sizeof(line) - 1, fp) != NULL) {
+    if (strstr(line, search_string) != NULL) {
+      caps = kCpuHasMIPS_DSP;
+    }
+  }
+  fclose(fp);
+  return caps;
+}
+#endif
+
+// Cached CPU feature flags for SIMD instruction sets (a variable, not a function).
+LIBYUV_API
+int cpu_info_ = kCpuInit;  // Holds kCpuInit until InitCpuFlags() or MaskCpuFlags() assigns it.
+
+// Test environment variable for disabling CPU features. Returns LIBYUV_TRUE
+// when |name| is set and its first character is not '0'; a leading '0' is
+// ignored to make it easy to set the variable on/off.
+#if !defined(__native_client__) && !defined(_M_ARM)
+
+static LIBYUV_BOOL TestEnv(const char* name) {
+  const char* value = getenv(name);
+  // Unset, or a leading '0', means "leave the feature enabled".
+  if (value == NULL || value[0] == '0') {
+    return LIBYUV_FALSE;
+  }
+  return LIBYUV_TRUE;
+}
+#else  // nacl does not support getenv().
+static LIBYUV_BOOL TestEnv(const char*) {
+  return LIBYUV_FALSE;
+}
+#endif
+
+LIBYUV_API SAFEBUFFERS
+int InitCpuFlags(void) {  // Detects CPU features, stores them in cpu_info_ and returns them.
+#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
+  // Query CPUID with eax = 0, 1 and (when eax = 0 reports >= 7) 7.
+  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
+  uint32 cpu_info1[4] = { 0, 0, 0, 0 };
+  uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+  CpuId(0, 0, cpu_info0);
+  CpuId(1, 0, cpu_info1);
+  if (cpu_info0[0] >= 7) {
+    CpuId(7, 0, cpu_info7);
+  }
+  cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+              ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+              ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+              ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+              ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
+              ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+              kCpuHasX86;
+
+#ifdef HAS_XGETBV
+  if ((cpu_info1[2] & 0x18000000) == 0x18000000 &&  // AVX and OSSave
+      TestOsSaveYmm()) {  // Saves YMM.
+    cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
+                 kCpuHasAVX;
+  }
+#endif
+  // Environment variable overrides for testing.
+  if (TestEnv("LIBYUV_DISABLE_X86")) {
+    cpu_info_ &= ~kCpuHasX86;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSE2")) {
+    cpu_info_ &= ~kCpuHasSSE2;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
+    cpu_info_ &= ~kCpuHasSSSE3;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSE41")) {
+    cpu_info_ &= ~kCpuHasSSE41;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSE42")) {
+    cpu_info_ &= ~kCpuHasSSE42;
+  }
+  if (TestEnv("LIBYUV_DISABLE_AVX")) {
+    cpu_info_ &= ~kCpuHasAVX;
+  }
+  if (TestEnv("LIBYUV_DISABLE_AVX2")) {
+    cpu_info_ &= ~kCpuHasAVX2;
+  }
+  if (TestEnv("LIBYUV_DISABLE_ERMS")) {
+    cpu_info_ &= ~kCpuHasERMS;
+  }
+  if (TestEnv("LIBYUV_DISABLE_FMA3")) {
+    cpu_info_ &= ~kCpuHasFMA3;
+  }
+#endif
+#if defined(__mips__) && defined(__linux__)
+  // Linux mips parse text file for dsp detect.
+  cpu_info_ = MipsCpuCaps("dsp");  // set kCpuHasMIPS_DSP.
+#if defined(__mips_dspr2)
+  cpu_info_ |= kCpuHasMIPS_DSPR2;
+#endif
+  cpu_info_ |= kCpuHasMIPS;
+
+  if (TestEnv("LIBYUV_DISABLE_MIPS")) {  // TestEnv, so a value of 0 is ignored.
+    cpu_info_ &= ~kCpuHasMIPS;
+  }
+  if (TestEnv("LIBYUV_DISABLE_MIPS_DSP")) {
+    cpu_info_ &= ~kCpuHasMIPS_DSP;
+  }
+  if (TestEnv("LIBYUV_DISABLE_MIPS_DSPR2")) {
+    cpu_info_ &= ~kCpuHasMIPS_DSPR2;
+  }
+#endif
+#if defined(__arm__) || defined(__aarch64__)
+// gcc -mfpu=neon defines __ARM_NEON__
+// __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
+// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
+#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
+  cpu_info_ = kCpuHasNEON;
+// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
+// flag in it.
+// So for aarch64, neon enabling is hard coded here.
+#elif defined(__aarch64__)
+  cpu_info_ = kCpuHasNEON;
+#else
+  // Linux arm parse text file for neon detect.
+  // Only reached when Neon was not already implied at compile time.
+  cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
+#endif
+  cpu_info_ |= kCpuHasARM;
+  if (TestEnv("LIBYUV_DISABLE_NEON")) {
+    cpu_info_ &= ~kCpuHasNEON;
+  }
+#endif  // __arm__
+  if (TestEnv("LIBYUV_DISABLE_ASM")) {
+    cpu_info_ = 0;  // Clears every flag, including the architecture bit.
+  }
+  return cpu_info_;
+}
+
+// Re-runs CPU detection, then keeps in cpu_info_ only the bits set in enable_flags.
+LIBYUV_API void MaskCpuFlags(int enable_flags) {
+  cpu_info_ = enable_flags & InitCpuFlags();
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/mjpeg_decoder.cc b/libs/libvpx/third_party/libyuv/source/mjpeg_decoder.cc
new file mode 100644
index 0000000000..75f8a610e3
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/mjpeg_decoder.cc
@@ -0,0 +1,572 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include <assert.h>
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
+#include <setjmp.h>
+#define HAVE_SETJMP
+
+#if defined(_MSC_VER)
+// disable warning 4324: structure was padded due to __declspec(align())
+#pragma warning(disable:4324)
+#endif
+
+#endif
+struct FILE;  // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jpeglib.h>
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h"  // For CopyPlane().
+
+namespace libyuv {
+
+#ifdef HAVE_SETJMP
+struct SetJmpErrorMgr {  // jpeglib error manager plus a jump buffer for error recovery.
+  jpeg_error_mgr base;  // Must be at the top. NOTE(review): presumably so a jpeg_error_mgr* can be cast back; confirm in ErrorHandler.
+  jmp_buf setjmp_buffer;  // Armed with setjmp() by the decoder methods before they call into jpeglib.
+};
+#endif
+
+const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;  // Each kColorSpace* mirrors a jpeglib JCS_* value.
+const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
+const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
+const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
+const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
+const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+
+// Methods that are passed to jpeglib (installed by the MJpegDecoder constructor).
+boolean fill_input_buffer(jpeg_decompress_struct* cinfo);  // source_mgr_->fill_input_buffer
+void init_source(jpeg_decompress_struct* cinfo);  // source_mgr_->init_source
+void skip_input_data(jpeg_decompress_struct* cinfo,
+                     long num_bytes);  // NOLINT
+void term_source(jpeg_decompress_struct* cinfo);  // source_mgr_->term_source
+void ErrorHandler(jpeg_common_struct* cinfo);  // Replaces error_exit when HAVE_SETJMP is defined.
+
+MJpegDecoder::MJpegDecoder()  // Allocates the jpeglib state and installs the source callbacks.
+    : has_scanline_padding_(LIBYUV_FALSE),
+      num_outbufs_(0),
+      scanlines_(NULL),
+      scanlines_sizes_(NULL),
+      databuf_(NULL),
+      databuf_strides_(NULL) {
+  decompress_struct_ = new jpeg_decompress_struct;
+  source_mgr_ = new jpeg_source_mgr;
+#ifdef HAVE_SETJMP
+  error_mgr_ = new SetJmpErrorMgr;
+  decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
+  // Override standard exit()-based error handler.
+  error_mgr_->base.error_exit = &ErrorHandler;
+#endif  // NOTE(review): without HAVE_SETJMP nothing here assigns decompress_struct_->err; confirm that build is supported.
+  decompress_struct_->client_data = NULL;  // Pointed at buf_vec_ by LoadFrame().
+  source_mgr_->init_source = &init_source;
+  source_mgr_->fill_input_buffer = &fill_input_buffer;
+  source_mgr_->skip_input_data = &skip_input_data;
+  source_mgr_->resync_to_restart = &jpeg_resync_to_restart;  // jpeglib's own routine, not a local callback.
+  source_mgr_->term_source = &term_source;
+  jpeg_create_decompress(decompress_struct_);
+  decompress_struct_->src = source_mgr_;  // Assigned after creation. NOTE(review): presumably creation resets the struct; confirm.
+  buf_vec_.buffers = &buf_;  // The buffer vector wraps the single buf_ member.
+  buf_vec_.len = 1;
+}
+
+MJpegDecoder::~MJpegDecoder() {  // Releases what the constructor allocated, then the output buffers.
+  jpeg_destroy_decompress(decompress_struct_);  // Called before the struct itself is deleted.
+  delete decompress_struct_;
+  delete source_mgr_;
+#ifdef HAVE_SETJMP
+  delete error_mgr_;  // Only allocated when HAVE_SETJMP is defined.
+#endif
+  DestroyOutputBuffers();
+}
+
+// Validates src and parses its JPEG header, then (re)sizes the per-component
+// scanline tables and row buffers. Returns LIBYUV_FALSE if validation or
+// header parsing fails. Only the src pointer is stored (buf_.data), not a copy.
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+  if (!ValidateJpeg(src, src_len)) {
+    return LIBYUV_FALSE;
+  }
+
+  buf_.data = src;
+  buf_.len = static_cast<int>(src_len);
+  buf_vec_.pos = 0;
+  decompress_struct_->client_data = &buf_vec_;
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called jpeg_read_header, it experienced an error, and we called
+    // longjmp() and rewound the stack to here. Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
+    // ERROR: Bad MJPEG header
+    return LIBYUV_FALSE;
+  }
+  AllocOutputBuffers(GetNumComponents());
+  for (int i = 0; i < num_outbufs_; ++i) {
+    int scanlines_size = GetComponentScanlinesPerImcuRow(i);
+    const bool rows_changed = scanlines_sizes_[i] != scanlines_size;
+    if (rows_changed) {
+      delete[] scanlines_[i];  // Array form: allocated with new[] below.
+      scanlines_[i] = new uint8* [scanlines_size];
+      scanlines_sizes_[i] = scanlines_size;
+    }
+
+    // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
+    // to avoid memory errors, since jpeglib only reads full MCUs blocks. For
+    // the preceding scanlines, the padding is not needed/wanted because the
+    // following addresses will already be valid (they are the initial bytes of
+    // the next scanline) and will be overwritten when jpeglib writes out that
+    // next scanline.
+    int databuf_stride = GetComponentStride(i);
+    int databuf_size = scanlines_size * databuf_stride;  // Depends on both values.
+    if (rows_changed || databuf_strides_[i] != databuf_stride) {
+      delete[] databuf_[i];  // Array form; also resized when only the row count changed.
+      databuf_[i] = new uint8[databuf_size];
+      databuf_strides_[i] = databuf_stride;
+    }
+
+    if (GetComponentStride(i) != GetComponentWidth(i)) {
+      has_scanline_padding_ = LIBYUV_TRUE;
+    }
+  }
+  return LIBYUV_TRUE;
+}
+
+static int DivideAndRoundUp(int numerator, int denominator) {  // Adds denominator - 1 before dividing.
+  return (numerator + denominator - 1) / denominator;
+}
+
+static int DivideAndRoundDown(int numerator, int denominator) {  // Plain integer division.
+  return numerator / denominator;
+}
+
+// Returns width of the last loaded frame (jpeglib's image_width field).
+int MJpegDecoder::GetWidth() {
+  return decompress_struct_->image_width;
+}
+
+// Returns height of the last loaded frame (jpeglib's image_height field).
+int MJpegDecoder::GetHeight() {
+  return decompress_struct_->image_height;
+}
+
+// Returns format of the last loaded frame. The return value is one of the
+// kColorSpace* constants (jpeglib's jpeg_color_space field).
+int MJpegDecoder::GetColorSpace() {
+  return decompress_struct_->jpeg_color_space;
+}
+
+// Number of color components in the color space (jpeglib's num_components).
+int MJpegDecoder::GetNumComponents() {
+  return decompress_struct_->num_components;
+}
+
+// Horizontal sample factor of the n-th component (comp_info[n].h_samp_factor).
+int MJpegDecoder::GetHorizSampFactor(int component) {
+  return decompress_struct_->comp_info[component].h_samp_factor;
+}
+
+int MJpegDecoder::GetVertSampFactor(int component) {  // Vertical counterpart of GetHorizSampFactor().
+  return decompress_struct_->comp_info[component].v_samp_factor;
+}
+
+int MJpegDecoder::GetHorizSubSampFactor(int component) {
+  const int max_factor = decompress_struct_->max_h_samp_factor;
+  return max_factor / GetHorizSampFactor(component);  // Largest factor over this component's.
+}
+
+int MJpegDecoder::GetVertSubSampFactor(int component) {
+  const int max_factor = decompress_struct_->max_v_samp_factor;
+  return max_factor / GetVertSampFactor(component);  // Largest factor over this component's.
+}
+
+int MJpegDecoder::GetImageScanlinesPerImcuRow() {  // max_v_samp_factor rows of DCTSIZE scanlines each.
+  return decompress_struct_->max_v_samp_factor * DCTSIZE;
+}
+
+int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
+  const int image_rows = GetImageScanlinesPerImcuRow();
+  return DivideAndRoundUp(image_rows, GetVertSubSampFactor(component));
+}
+
+int MJpegDecoder::GetComponentWidth(int component) {
+  const int frame_width = GetWidth();
+  return DivideAndRoundUp(frame_width, GetHorizSubSampFactor(component));
+}
+
+int MJpegDecoder::GetComponentHeight(int component) {
+  const int frame_height = GetHeight();
+  return DivideAndRoundUp(frame_height, GetVertSubSampFactor(component));
+}
+
+// Get the component width in bytes, rounded up to a multiple of DCTSIZE.
+int MJpegDecoder::GetComponentStride(int component) {
+  return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
+}
+
+int MJpegDecoder::GetComponentSize(int component) {  // Width times height; uses the width, not the padded stride.
+  return GetComponentWidth(component) * GetComponentHeight(component);
+}
+
+LIBYUV_BOOL MJpegDecoder::UnloadFrame() {  // Aborts the current decompress operation.
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called jpeg_abort_decompress, it experienced an error, and we called
+    // longjmp() and rewound the stack to here. Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  jpeg_abort_decompress(decompress_struct_);  // The decompress struct is kept, not destroyed.
+  return LIBYUV_TRUE;
+}
+
+// Decodes the loaded frame into the caller-supplied |planes|, one pointer per
+// output buffer (num_outbufs_ entries are accessed).
+//
+// dst_width must equal GetWidth(). dst_height may be smaller than GetHeight(),
+// in which case (GetHeight() - dst_height) / 2 lines are dropped from the top
+// and decoding stops once dst_height lines have been produced.
+//
+// Rows are written tightly packed: the destination stride of plane i is
+// GetComponentWidth(i). NOTE: the pointers stored in |planes| are advanced as
+// rows are written, so the caller's array is modified.
+//
+// Returns LIBYUV_FALSE on bad dimensions, if decoding cannot be started, if an
+// iMCU row fails to decode, or (with HAVE_SETJMP) if jpeglib reports an error.
+// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
+    uint8** planes, int dst_width, int dst_height) {
+  if (dst_width != GetWidth() ||
+      dst_height > GetHeight()) {
+    // ERROR: Bad dimensions
+    return LIBYUV_FALSE;
+  }
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called into jpeglib, it experienced an error sometime during this
+    // function call, and we called longjmp() and rewound the stack to here.
+    // Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  if (!StartDecode()) {
+    return LIBYUV_FALSE;
+  }
+  // Point the per-row scanline table at our intermediate buffers so that
+  // DecodeImcuRow() writes into databuf_.
+  SetScanlinePointers(databuf_);
+  int lines_left = dst_height;
+  // Compute amount of lines to skip to implement vertical crop.
+  // TODO(fbarchard): Ensure skip is a multiple of maximum component
+  // subsample. ie 2
+  int skip = (GetHeight() - dst_height) / 2;
+  if (skip > 0) {
+    // There is no API to skip lines in the output data, so we read them
+    // into the temp buffer.
+    while (skip >= GetImageScanlinesPerImcuRow()) {
+      if (!DecodeImcuRow()) {
+        FinishDecode();
+        return LIBYUV_FALSE;
+      }
+      skip -= GetImageScanlinesPerImcuRow();
+    }
+    if (skip > 0) {
+      // Have a partial iMCU row left over to skip. Must read it and then
+      // copy the parts we want into the destination.
+      if (!DecodeImcuRow()) {
+        FinishDecode();
+        return LIBYUV_FALSE;
+      }
+      for (int i = 0; i < num_outbufs_; ++i) {
+        // TODO(fbarchard): Compute skip to avoid this
+        assert(skip % GetVertSubSampFactor(i) == 0);
+        // Convert the image-line skip count into rows of this component.
+        int rows_to_skip =
+            DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
+                                rows_to_skip;
+        int data_to_skip = rows_to_skip * GetComponentStride(i);
+        // Source rows are stride-padded; destination rows are packed.
+        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
+                  planes[i], GetComponentWidth(i),
+                  GetComponentWidth(i), scanlines_to_copy);
+        planes[i] += scanlines_to_copy * GetComponentWidth(i);
+      }
+      lines_left -= (GetImageScanlinesPerImcuRow() - skip);
+    }
+  }
+
+  // Read full MCUs but cropped horizontally
+  // The strict '>' leaves a final, exactly-full iMCU row to the "partial"
+  // branch below, which then copies all of its rows.
+  for (; lines_left > GetImageScanlinesPerImcuRow();
+         lines_left -= GetImageScanlinesPerImcuRow()) {
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    for (int i = 0; i < num_outbufs_; ++i) {
+      int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
+      CopyPlane(databuf_[i], GetComponentStride(i),
+                planes[i], GetComponentWidth(i),
+                GetComponentWidth(i), scanlines_to_copy);
+      planes[i] += scanlines_to_copy * GetComponentWidth(i);
+    }
+  }
+
+  if (lines_left > 0) {
+    // Have a partial iMCU row left over to decode.
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    for (int i = 0; i < num_outbufs_; ++i) {
+      // Round up so a subsampled component still gets a row for a trailing
+      // odd image line.
+      int scanlines_to_copy =
+          DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
+      CopyPlane(databuf_[i], GetComponentStride(i),
+                planes[i], GetComponentWidth(i),
+                GetComponentWidth(i), scanlines_to_copy);
+      planes[i] += scanlines_to_copy * GetComponentWidth(i);
+    }
+  }
+  return FinishDecode();
+}
+
+// Decodes the loaded frame one iMCU row at a time and hands each batch of rows
+// to |fn| as (*fn)(opaque, databuf_, databuf_strides_, rows), where |rows| is
+// the number of image lines being delivered.
+//
+// dst_width must equal GetWidth(). dst_height may be smaller than GetHeight(),
+// in which case (GetHeight() - dst_height) / 2 lines are dropped from the top
+// and decoding stops once dst_height lines have been delivered.
+//
+// Returns LIBYUV_FALSE on bad dimensions, if decoding cannot be started, if an
+// iMCU row fails to decode, or (with HAVE_SETJMP) if jpeglib reports an error.
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
+    int dst_width, int dst_height) {
+  if (dst_width != GetWidth() ||
+      dst_height > GetHeight()) {
+    // ERROR: Bad dimensions
+    return LIBYUV_FALSE;
+  }
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called into jpeglib, it experienced an error sometime during this
+    // function call, and we called longjmp() and rewound the stack to here.
+    // Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  if (!StartDecode()) {
+    return LIBYUV_FALSE;
+  }
+  // Point the per-row scanline table at our intermediate buffers so that
+  // DecodeImcuRow() writes into databuf_.
+  SetScanlinePointers(databuf_);
+  int lines_left = dst_height;
+  // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop
+  int skip = (GetHeight() - dst_height) / 2;
+  if (skip > 0) {
+    // Decode and discard whole iMCU rows that lie entirely in the skipped
+    // region.
+    while (skip >= GetImageScanlinesPerImcuRow()) {
+      if (!DecodeImcuRow()) {
+        FinishDecode();
+        return LIBYUV_FALSE;
+      }
+      skip -= GetImageScanlinesPerImcuRow();
+    }
+    if (skip > 0) {
+      // Have a partial iMCU row left over to skip.
+      if (!DecodeImcuRow()) {
+        FinishDecode();
+        return LIBYUV_FALSE;
+      }
+      for (int i = 0; i < num_outbufs_; ++i) {
+        // TODO(fbarchard): Compute skip to avoid this
+        assert(skip % GetVertSubSampFactor(i) == 0);
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int data_to_skip = rows_to_skip * GetComponentStride(i);
+        // Change our own data buffer pointers so we can pass them to the
+        // callback.
+        databuf_[i] += data_to_skip;
+      }
+      int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
+      (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
+      // Now change them back.
+      // The offsets are recomputed with the same expressions used above so
+      // the pointers return to their original values.
+      for (int i = 0; i < num_outbufs_; ++i) {
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int data_to_skip = rows_to_skip * GetComponentStride(i);
+        databuf_[i] -= data_to_skip;
+      }
+      lines_left -= scanlines_to_copy;
+    }
+  }
+  // Read full MCUs until we get to the crop point.
+  for (; lines_left >= GetImageScanlinesPerImcuRow();
+         lines_left -= GetImageScanlinesPerImcuRow()) {
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
+  }
+  if (lines_left > 0) {
+    // Have a partial iMCU row left over to decode.
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    // Only the first lines_left lines of this iMCU row are reported.
+    (*fn)(opaque, databuf_, databuf_strides_, lines_left);
+  }
+  return FinishDecode();
+}
+
+// Source-manager hook: primes the decompressor by loading the first input
+// buffer. The result of fill_input_buffer() is intentionally not inspected.
+void init_source(j_decompress_ptr cinfo) {
+  (void)fill_input_buffer(cinfo);
+}
+
+// Source-manager hook: hands the next buffer of the BufferVector stored in
+// cinfo->client_data to the decompressor and advances the vector's position.
+// Returns FALSE (after asserting) once every buffer has been consumed,
+// otherwise TRUE.
+boolean fill_input_buffer(j_decompress_ptr cinfo) {
+  BufferVector* input = reinterpret_cast<BufferVector*>(cinfo->client_data);
+  const bool exhausted = input->pos >= input->len;
+  if (exhausted) {
+    assert(0 && "No more data");
+    // ERROR: No more data
+    return FALSE;
+  }
+  cinfo->src->next_input_byte = input->buffers[input->pos].data;
+  cinfo->src->bytes_in_buffer = input->buffers[input->pos].len;
+  input->pos += 1;
+  return TRUE;
+}
+
+// Source-manager hook: skips |num_bytes| bytes of input.
+//
+// Both next_input_byte and bytes_in_buffer are updated; advancing only the
+// pointer would leave the decompressor believing more bytes are available
+// than really are and let it read past the end of the buffer.
+// - A zero or negative count is a no-op, as jpeglib's source-manager contract
+//   requires.
+// - A count larger than the current buffer is clamped to the buffer end and
+//   leaves bytes_in_buffer at 0, so the decompressor asks for more data via
+//   fill_input_buffer() instead of reading out of bounds.
+void skip_input_data(j_decompress_ptr cinfo,
+                     long num_bytes) {  // NOLINT
+  if (num_bytes <= 0) {
+    return;
+  }
+  size_t bytes_to_skip = static_cast<size_t>(num_bytes);
+  if (bytes_to_skip > cinfo->src->bytes_in_buffer) {
+    bytes_to_skip = cinfo->src->bytes_in_buffer;
+  }
+  cinfo->src->next_input_byte += bytes_to_skip;
+  cinfo->src->bytes_in_buffer -= bytes_to_skip;
+}
+
+// Source-manager hook invoked when decompression is finished. The input
+// buffers are not managed here, so there is nothing to release.
+void term_source(j_decompress_ptr cinfo) {
+  (void)cinfo;  // Unused.
+}
+
+#ifdef HAVE_SETJMP
+// Error callback invoked when a jpeglib call fails.
+//
+// jpeglib expects its error handler not to return (by default the program
+// would terminate). To recover instead, we follow the jpeglib example and
+// longjmp() back to the setjmp() recorded in the SetJmpErrorMgr that
+// cinfo->err points to, unwinding out of the failing jpeglib call.
+void ErrorHandler(j_common_ptr cinfo) {
+  // A formatted message can be output, but is unsafe for release.
+#ifdef DEBUG
+  char message[JMSG_LENGTH_MAX];
+  cinfo->err->format_message(cinfo, message);
+  // ERROR: Error in jpeglib: message
+#endif
+
+  SetJmpErrorMgr* jump_mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
+  // Rewinds the call stack to the matching setjmp(), which then returns a
+  // second time, with value 1.
+  longjmp(jump_mgr->setjmp_buffer, 1);
+}
+#endif
+
+// Ensures the per-output-buffer bookkeeping arrays have |num_outbufs| entries.
+// When the count changes, the existing buffers are destroyed and fresh arrays
+// are created with every entry cleared (NULL pointers, zero sizes/strides).
+// Nothing happens when the count is unchanged.
+void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
+  if (num_outbufs == num_outbufs_) {
+    return;
+  }
+  // We could perhaps resize the existing buffers instead of deleting and
+  // recreating each one, but it's not worth it.
+  DestroyOutputBuffers();
+
+  scanlines_ = new uint8** [num_outbufs];
+  scanlines_sizes_ = new int[num_outbufs];
+  databuf_ = new uint8* [num_outbufs];
+  databuf_strides_ = new int[num_outbufs];
+
+  int index = 0;
+  while (index < num_outbufs) {
+    scanlines_[index] = NULL;
+    scanlines_sizes_[index] = 0;
+    databuf_[index] = NULL;
+    databuf_strides_[index] = 0;
+    ++index;
+  }
+
+  num_outbufs_ = num_outbufs;
+}
+
+// Frees every per-buffer allocation and the bookkeeping arrays themselves,
+// then resets all members to the "no buffers" state.
+void MJpegDecoder::DestroyOutputBuffers() {
+  // Release the per-buffer allocations first; they hang off the arrays below.
+  for (int index = 0; index < num_outbufs_; ++index) {
+    delete [] scanlines_[index];
+    delete [] databuf_[index];
+  }
+
+  delete [] scanlines_;
+  scanlines_ = NULL;
+
+  delete [] databuf_;
+  databuf_ = NULL;
+
+  delete [] scanlines_sizes_;
+  scanlines_sizes_ = NULL;
+
+  delete [] databuf_strides_;
+  databuf_strides_ = NULL;
+
+  num_outbufs_ = 0;
+}
+
+// Configures the decompressor for raw data output and starts decompression.
+// Using JDCT_IFAST and turning block smoothing off improve performance
+// substantially.
+// Returns LIBYUV_FALSE if jpeg_start_decompress() fails, else LIBYUV_TRUE.
+LIBYUV_BOOL MJpegDecoder::StartDecode() {
+  j_decompress_ptr cinfo = decompress_struct_;
+
+  cinfo->raw_data_out = TRUE;
+  cinfo->dct_method = JDCT_IFAST;  // JDCT_ISLOW is default
+  cinfo->dither_mode = JDITHER_NONE;
+  // Not applicable to 'raw':
+  cinfo->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
+  // Only for buffered mode:
+  cinfo->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
+  // Blocky but fast:
+  cinfo->do_block_smoothing = (boolean)(LIBYUV_FALSE);
+
+  if (jpeg_start_decompress(cinfo)) {
+    return LIBYUV_TRUE;
+  }
+  // ERROR: Couldn't start JPEG decompressor
+  return LIBYUV_FALSE;
+}
+
+// Ends the current decode. Always returns LIBYUV_TRUE.
+LIBYUV_BOOL MJpegDecoder::FinishDecode() {
+  // We may stop before the whole image has been decoded, which jpeglib treats
+  // as an error for "finish"; "abort" is used instead.
+  j_decompress_ptr cinfo = decompress_struct_;
+  jpeg_abort_decompress(cinfo);
+  return LIBYUV_TRUE;
+}
+
+// Fills the scanline table of every output buffer so that entry j of buffer i
+// points j component strides into data[i].
+void MJpegDecoder::SetScanlinePointers(uint8** data) {
+  for (int plane = 0; plane < num_outbufs_; ++plane) {
+    uint8* row_start = data[plane];
+    int row = 0;
+    while (row < scanlines_sizes_[plane]) {
+      scanlines_[plane][row] = row_start;
+      row_start += GetComponentStride(plane);
+      ++row;
+    }
+  }
+}
+
+// Decodes one iMCU row of raw data into the buffers referenced by scanlines_.
+// Returns true only if jpeglib reports having read exactly one iMCU row's
+// worth of scanlines.
+inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
+  const unsigned int expected_lines =
+      (unsigned int)(GetImageScanlinesPerImcuRow());
+  return expected_lines == jpeg_read_raw_data(decompress_struct_,
+                                              scanlines_,
+                                              GetImageScanlinesPerImcuRow());
+}
+
+// Classifies the JPEG chroma sub-sampling layout from the per-component
+// horizontal (subsample_x) and vertical (subsample_y) factors.
+// Returns kJpegYuv400 for a single full-resolution component, one of
+// kJpegYuv420 / kJpegYuv422 / kJpegYuv444 for three components whose first
+// component is full resolution, and kJpegUnknown for everything else.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+    int* subsample_x, int* subsample_y, int number_of_components) {
+  // Grey-scale images.
+  if (number_of_components == 1) {
+    const bool full_res = subsample_x[0] == 1 && subsample_y[0] == 1;
+    return full_res ? kJpegYuv400 : kJpegUnknown;
+  }
+  // Everything below handles color images only.
+  if (number_of_components != 3) {
+    return kJpegUnknown;
+  }
+  // The first component must not be sub-sampled.
+  if (subsample_x[0] != 1 || subsample_y[0] != 1) {
+    return kJpegUnknown;
+  }
+  // Both remaining components must share the same factors.
+  if (subsample_x[1] != subsample_x[2] || subsample_y[1] != subsample_y[2]) {
+    return kJpegUnknown;
+  }
+  const int chroma_x = subsample_x[1];
+  const int chroma_y = subsample_y[1];
+  if (chroma_x == 2 && chroma_y == 2) {
+    return kJpegYuv420;
+  }
+  if (chroma_x == 2 && chroma_y == 1) {
+    return kJpegYuv422;
+  }
+  if (chroma_x == 1 && chroma_y == 1) {
+    return kJpegYuv444;
+  }
+  return kJpegUnknown;
+}
+
+}  // namespace libyuv
+#endif  // HAVE_JPEG
+
diff --git a/libs/libvpx/third_party/libyuv/source/mjpeg_validate.cc b/libs/libvpx/third_party/libyuv/source/mjpeg_validate.cc
new file mode 100644
index 0000000000..8edfbe1e74
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/mjpeg_validate.cc
@@ -0,0 +1,101 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#include <string.h>  // For memchr.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Enable this to try scasb implementation.
+// #define ENABLE_SCASB 1
+
+#ifdef ENABLE_SCASB
+
+// Multiple of 1.
+// Scans up to |count| bytes starting at |src| for the byte held in the low
+// 8 bits of |val| using "repne scasb". Returns a pointer to the first matching
+// byte, or 0 when no match is found.
+// Only compiled when ENABLE_SCASB is defined; uses MSVC-style inline assembly
+// and reads its arguments directly from the stack.
+// NOTE(review): with count == 0 "repne scasb" executes no comparison, so the
+// following "jne" appears to test stale flags - confirm before enabling.
+__declspec(naked)
+const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
+  __asm {
+    mov        edx, edi         // save edi; it is restored before each ret
+    mov        edi, [esp + 4]   // src
+    mov        eax, [esp + 8]   // val
+    mov        ecx, [esp + 12]  // count
+    repne scasb                 // compare al with [edi], advancing edi
+    jne        sr99             // no match found
+    mov        eax, edi
+    sub        eax, 1           // edi is one past the matching byte
+    mov        edi, edx
+    ret
+
+  sr99:
+    mov        eax, 0           // not found: return 0
+    mov        edi, edx
+    ret
+  }
+}
+#endif
+
+// Helper function to scan for EOI marker.
+// Searches [sample, sample + sample_size) for the two-byte End Of Image
+// marker (0xff 0xd9).
+// Returns LIBYUV_TRUE if the marker is found, LIBYUV_FALSE otherwise
+// (including when the buffer is NULL or shorter than one marker).
+static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+  // A marker needs two bytes. With fewer, 'end' below would point before
+  // 'sample' and the length handed to memchr() would wrap to a huge value.
+  if (sample == NULL || sample_size < 2) {
+    // ERROR: Invalid jpeg end code not found. Size sample_size
+    return LIBYUV_FALSE;
+  }
+  // The 0xff byte may start no later than the second-to-last byte, so the
+  // search stops one byte early; it[1] is then always inside the buffer.
+  const uint8* end = sample + sample_size - 1;
+  const uint8* it = sample;
+  for (;;) {
+#ifdef ENABLE_SCASB
+    it = ScanRow_ERMS(it, 0xff, end - it);
+#else
+    it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
+#endif
+    if (it == NULL) {
+      break;
+    }
+    if (it[1] == 0xd9) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
+    }
+    ++it;  // Skip over current 0xff.
+  }
+  // ERROR: Invalid jpeg end code not found. Size sample_size
+  return LIBYUV_FALSE;
+}
+
+// Helper function to validate the jpeg appears intact.
+// Checks that the buffer is non-NULL, at least 64 bytes long, starts with the
+// Start Of Image marker (0xff 0xd8) and contains an End Of Image marker.
+// The last kilobyte is searched first because that is where the EOI marker is
+// normally found; the rest of the buffer is searched only if that fails.
+// Returns LIBYUV_TRUE if the jpeg looks intact, LIBYUV_FALSE otherwise.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+  const size_t kBackSearchSize = 1024;
+  if (sample == NULL) {
+    // ERROR: Invalid jpeg buffer
+    return LIBYUV_FALSE;
+  }
+  if (sample_size < 64) {
+    // ERROR: Invalid jpeg size: sample_size
+    return LIBYUV_FALSE;
+  }
+  if (sample[0] != 0xff || sample[1] != 0xd8) {  // Start Of Image
+    // ERROR: Invalid jpeg initial start code
+    return LIBYUV_FALSE;
+  }
+  // Step over SOI marker.
+  sample += 2;
+  sample_size -= 2;
+
+  // Look for the End Of Image (EOI) marker in the end kilobyte of the buffer.
+  if (sample_size > kBackSearchSize) {
+    if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
+    }
+    // Reduce search size for forward search.
+    // The extra byte keeps a marker that straddles the boundary between the
+    // two search regions detectable.
+    sample_size = sample_size - kBackSearchSize + 1;
+  }
+  return ScanEOI(sample, sample_size);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/libs/libvpx/third_party/libyuv/source/planar_functions.cc b/libs/libvpx/third_party/libyuv/source/planar_functions.cc
new file mode 100644
index 0000000000..b96bd50206
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/planar_functions.cc
@@ -0,0 +1,2555 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/planar_functions.h"
+
+#include <string.h>  // for memset()
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copies a width x height plane of bytes from src_y to dst_y, stepping each
+// pointer by its own stride after every row. The row copier is chosen at run
+// time from the CPU features that were compiled in.
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int row;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  // Both planes are contiguous: treat the whole plane as one long row.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = 0;
+    dst_stride_y = 0;
+  }
+  // Same memory with the same layout: copying would change nothing.
+  if (src_y == dst_y && src_stride_y == dst_stride_y) {
+    return;
+  }
+  // Later matches below override earlier ones.
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = CopyRow_Any_SSE2;
+    if (IS_ALIGNED(width, 32)) {
+      CopyRow = CopyRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = CopyRow_Any_AVX;
+    if (IS_ALIGNED(width, 64)) {
+      CopyRow = CopyRow_AVX;
+    }
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = CopyRow_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      CopyRow = CopyRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+
+  // Copy the plane one row at a time.
+  for (row = 0; row < height; ++row) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Copies a width x height plane of 16-bit samples from src_y to dst_y,
+// stepping each pointer by its own stride (in samples) after every row.
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+                  uint16* dst_y, int dst_stride_y,
+                  int width, int height) {
+  int row;
+  void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
+  // Both planes are contiguous: treat the whole plane as one long row.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = 0;
+    dst_stride_y = 0;
+  }
+  // Later matches below override earlier ones.
+#if defined(HAS_COPYROW_16_SSE2)
+  if (IS_ALIGNED(width, 32) && TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = CopyRow_16_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_16_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_16_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_16_NEON)
+  if (IS_ALIGNED(width, 32) && TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = CopyRow_16_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_16_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_16_MIPS;
+  }
+#endif
+
+  // Copy the plane one row at a time.
+  for (row = 0; row < height; ++row) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Copies an I422 image: a full-size Y plane plus U and V planes that are half
+// width (rounded up) and full height.
+// A negative height copies the image vertically flipped.
+// Returns 0 on success, -1 on a NULL plane, non-positive width or zero height.
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  const int chroma_width = (width + 1) >> 1;
+  if (!src_y || !src_u || !src_v) {
+    return -1;
+  }
+  if (!dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last source row of
+  // each plane and walk backwards.
+  if (height < 0) {
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+    src_u += (height - 1) * src_stride_u;
+    src_stride_u = -src_stride_u;
+    src_v += (height - 1) * src_stride_v;
+    src_stride_v = -src_stride_v;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, chroma_width, height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, chroma_width, height);
+  return 0;
+}
+
+// Copies an I444 image: Y, U and V planes that are all width x height.
+// A negative height copies the image vertically flipped.
+// Returns 0 on success, -1 on a NULL plane, non-positive width or zero height.
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  if (!src_y || !src_u || !src_v) {
+    return -1;
+  }
+  if (!dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last source row of
+  // each plane and walk backwards.
+  if (height < 0) {
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+    src_u += (height - 1) * src_stride_u;
+    src_stride_u = -src_stride_u;
+    src_v += (height - 1) * src_stride_v;
+    src_stride_v = -src_stride_v;
+  }
+
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+  return 0;
+}
+
+// Copies an I400 (single Y plane) image.
+// A negative height copies the image vertically flipped.
+// Returns 0 on success, -1 on a NULL plane, non-positive width or zero height.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (!src_y || !dst_y) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last source row and
+  // walk backwards.
+  if (height < 0) {
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Converts I420 to I400 by copying only the Y plane; the U and V arguments
+// are accepted but never read.
+// A negative height copies the image vertically flipped.
+// Returns 0 on success, -1 on a NULL Y plane, non-positive width or zero
+// height.
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (!src_y || !dst_y) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last source row and
+  // walk backwards.
+  if (height < 0) {
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Mirrors a width x height plane of bytes by running MirrorRow on every row.
+// A negative height additionally flips the image vertically.
+// The row function is chosen at run time from the CPU features that were
+// compiled in; later matches below override earlier ones.
+void MirrorPlane(const uint8* src_y, int src_stride_y,
+                 uint8* dst_y, int dst_stride_y,
+                 int width, int height) {
+  int row;
+  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+  // Negative height means invert the image: start at the last source row and
+  // walk backwards.
+  if (height < 0) {
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+#if defined(HAS_MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorRow = IS_ALIGNED(width, 16) ? MirrorRow_NEON : MirrorRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MirrorRow = IS_ALIGNED(width, 16) ? MirrorRow_SSE2 : MirrorRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorRow = IS_ALIGNED(width, 16) ? MirrorRow_SSSE3 : MirrorRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorRow = IS_ALIGNED(width, 32) ? MirrorRow_AVX2 : MirrorRow_Any_AVX2;
+  }
+#endif
+// TODO(fbarchard): Mirror on mips handle unaligned memory.
+#if defined(HAS_MIRRORROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
+    MirrorRow = MirrorRow_MIPS_DSPR2;
+  }
+#endif
+
+  // Mirror the plane one row at a time.
+  for (row = 0; row < height; ++row) {
+    MirrorRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Convert YUY2 to I422.
+// Splits packed YUY2 into separate Y, U and V planes.
+// A negative height converts the image vertically flipped.
+// Returns 0 on success, -1 on a NULL plane, non-positive width or zero height
+// (the same validation the other converters in this file perform).
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix) =
+      YUY2ToUV422Row_C;
+  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
+      YUY2ToYRow_C;
+  // Reject arguments the row functions cannot handle safely.
+  if (!src_yuy2 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  // Coalesce rows.
+  // Only possible when every plane is contiguous; the whole image is then
+  // processed as one long row.
+  if (src_stride_yuy2 == width * 2 &&
+      dst_stride_y == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+  // Pick the fastest available row functions; later matches override earlier
+  // ones.
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+      YUY2ToYRow = YUY2ToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (width >= 16) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+      YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Convert UYVY to I422.
+// Splits packed UYVY into separate Y, U and V planes.
+// A negative height converts the image vertically flipped.
+// Returns 0 on success, -1 on a NULL plane, non-positive width or zero height
+// (the same validation the other converters in this file perform).
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*UYVYToUV422Row)(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix) =
+      UYVYToUV422Row_C;
+  void (*UYVYToYRow)(const uint8* src_uyvy,
+                     uint8* dst_y, int pix) = UYVYToYRow_C;
+  // Reject arguments the row functions cannot handle safely.
+  if (!src_uyvy || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+  // Coalesce rows.
+  // Only possible when every plane is contiguous; the whole image is then
+  // processed as one long row.
+  if (src_stride_uyvy == width * 2 &&
+      dst_stride_y == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+  // Pick the fastest available row functions; later matches override earlier
+  // ones.
+#if defined(HAS_UYVYTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+    UYVYToYRow = UYVYToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToUV422Row = UYVYToUV422Row_SSE2;
+      UYVYToYRow = UYVYToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
+    UYVYToYRow = UYVYToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToUV422Row = UYVYToUV422Row_AVX2;
+      UYVYToYRow = UYVYToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    if (width >= 16) {
+      UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_NEON;
+      UYVYToUV422Row = UYVYToUV422Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+    src_uyvy += src_stride_uyvy;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Mirrors an I400 (single Y plane) image via MirrorPlane.
+// A negative height additionally flips the image vertically.
+// Returns 0 on success, -1 on a NULL plane, non-positive width or zero height.
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (!src_y || !dst_y) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last source row and
+  // walk backwards.
+  if (height < 0) {
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+
+  MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Mirrors an I420 image: a full-size Y plane plus U and V planes of half
+// width and half height (both rounded up).
+// A negative height additionally flips the image vertically.
+// Returns 0 on success, -1 on a NULL plane, non-positive width or zero height.
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int chroma_width = (width + 1) >> 1;
+  int chroma_height;
+  if (!src_y || !src_u || !src_v) {
+    return -1;
+  }
+  if (!dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last source row of
+  // each plane and walk backwards.
+  if (height < 0) {
+    height = -height;
+    chroma_height = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+    src_u += (chroma_height - 1) * src_stride_u;
+    src_stride_u = -src_stride_u;
+    src_v += (chroma_height - 1) * src_stride_v;
+    src_stride_v = -src_stride_v;
+  } else {
+    chroma_height = (height + 1) >> 1;
+  }
+
+  // dst_y is known to be non-NULL here (checked above).
+  MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u,
+              chroma_width, chroma_height);
+  MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v,
+              chroma_width, chroma_height);
+  return 0;
+}
+
+// Mirrors an ARGB image by running ARGBMirrorRow on every row.
+// A negative height additionally flips the image vertically.
+// Returns 0 on success, -1 on a NULL pointer, non-positive width or zero
+// height.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int row;
+  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+      ARGBMirrorRow_C;
+  if (!src_argb || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last source row and
+  // walk backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Pick the fastest available row function; later matches override earlier
+  // ones.
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMirrorRow =
+        IS_ALIGNED(width, 4) ? ARGBMirrorRow_NEON : ARGBMirrorRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMirrorRow =
+        IS_ALIGNED(width, 4) ? ARGBMirrorRow_SSE2 : ARGBMirrorRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBMirrorRow =
+        IS_ALIGNED(width, 8) ? ARGBMirrorRow_AVX2 : ARGBMirrorRow_Any_AVX2;
+  }
+#endif
+
+  // Mirror the image one row at a time.
+  for (row = 0; row < height; ++row) {
+    ARGBMirrorRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Returns the ARGB blend row function best suited to the current CPU.
+// As there are several blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
+// SSSE3, when available, is returned immediately; otherwise SSE2 and then
+// NEON may replace the C fallback.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend() {
+  void (*blend_row)(const uint8* src_argb, const uint8* src_argb1,
+                    uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    return ARGBBlendRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    blend_row = ARGBBlendRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    blend_row = ARGBBlendRow_NEON;
+  }
+#endif
+  return blend_row;
+}
+
+// Alpha blends two ARGB images row by row and stores the result in dst_argb,
+// using the row function returned by GetARGBBlend().
+// A negative height writes the destination vertically flipped.
+// Returns 0 on success, -1 on a NULL pointer, non-positive width or zero
+// height.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+              const uint8* src_argb1, int src_stride_argb1,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  int row;
+  void (*blend_row)(const uint8* src_argb, const uint8* src_argb1,
+                    uint8* dst_argb, int width) = GetARGBBlend();
+  if (!src_argb0 || !src_argb1 || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: here the destination (not the
+  // source) is walked from its last row backwards.
+  if (height < 0) {
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // All three images are contiguous: process everything as one long row.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = 0;
+    src_stride_argb1 = 0;
+    dst_stride_argb = 0;
+  }
+
+  for (row = 0; row < height; ++row) {
+    blend_row(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Multiplies two ARGB images row by row and stores the result in dst_argb.
+// A negative height writes the destination vertically flipped.
+// Returns 0 on success, -1 on a NULL pointer, non-positive width or zero
+// height.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height) {
+  int row;
+  void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
+                          int width) = ARGBMultiplyRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: here the destination (not the
+  // source) is walked from its last row backwards.
+  if (height < 0) {
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // All three images are contiguous: process everything as one long row.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = 0;
+    src_stride_argb1 = 0;
+    dst_stride_argb = 0;
+  }
+  // Pick the fastest available row function; later matches override earlier
+  // ones.
+#if defined(HAS_ARGBMULTIPLYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMultiplyRow = IS_ALIGNED(width, 4) ? ARGBMultiplyRow_SSE2
+                                           : ARGBMultiplyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBMultiplyRow = IS_ALIGNED(width, 8) ? ARGBMultiplyRow_AVX2
+                                           : ARGBMultiplyRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMultiplyRow = IS_ALIGNED(width, 8) ? ARGBMultiplyRow_NEON
+                                           : ARGBMultiplyRow_Any_NEON;
+  }
+#endif
+
+  // Multiply the images one row at a time.
+  for (row = 0; row < height; ++row) {
+    ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Adds two ARGB images and writes the result to dst_argb.
+// A negative height writes the destination rows in reverse order.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+            const uint8* src_argb1, int src_stride_argb1,
+            uint8* dst_argb, int dst_stride_argb,
+            int width, int height) {
+  int row;
+  void (*AddRow)(const uint8* in0, const uint8* in1, uint8* out,
+                 int count) = ARGBAddRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted output: start at the last destination row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  if (dst_stride_argb == width * 4 &&
+      src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4) {
+    // Every buffer is tightly packed, so handle the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+  }
+  // Row function selection. Later checks override earlier ones.
+#if defined(HAS_ARGBADDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+#if defined(_MSC_VER) && !defined(__clang__)
+    // Microsoft compiler (not clang): the SSE2 row is used for any width.
+    AddRow = ARGBAddRow_SSE2;
+#else
+    AddRow = IS_ALIGNED(width, 4) ? ARGBAddRow_SSE2 : ARGBAddRow_Any_SSE2;
+#endif
+  }
+#endif
+#if defined(HAS_ARGBADDROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    AddRow = IS_ALIGNED(width, 8) ? ARGBAddRow_AVX2 : ARGBAddRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBADDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    AddRow = IS_ALIGNED(width, 8) ? ARGBAddRow_NEON : ARGBAddRow_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    AddRow(src_argb0, src_argb1, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+  }
+  return 0;
+}
+
+// Subtracts two ARGB images and writes the result to dst_argb.
+// A negative height writes the destination rows in reverse order.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height) {
+  int row;
+  void (*SubtractRow)(const uint8* in0, const uint8* in1, uint8* out,
+                      int count) = ARGBSubtractRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted output: start at the last destination row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  if (dst_stride_argb == width * 4 &&
+      src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4) {
+    // Every buffer is tightly packed, so handle the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+  }
+  // Row function selection. Later checks override earlier ones; within each
+  // check the non-_Any_ variant is chosen only when width is a multiple of
+  // the given block size.
+#if defined(HAS_ARGBSUBTRACTROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SubtractRow = IS_ALIGNED(width, 4) ? ARGBSubtractRow_SSE2
+                                       : ARGBSubtractRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SubtractRow = IS_ALIGNED(width, 8) ? ARGBSubtractRow_AVX2
+                                       : ARGBSubtractRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SubtractRow = IS_ALIGNED(width, 8) ? ARGBSubtractRow_NEON
+                                       : ARGBSubtractRow_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    SubtractRow(src_argb0, src_argb1, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+  }
+  return 0;
+}
+
+// Converts an I422 image (separate Y, U and V planes) to BGRA.
+// A negative height writes the destination rows in reverse order.
+// Returns 0 on success, or -1 if a plane pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height) {
+  int row;
+  void (*ConvertRow)(const uint8* y_ptr,
+                     const uint8* u_ptr,
+                     const uint8* v_ptr,
+                     uint8* out_ptr,
+                     int count) = I422ToBGRARow_C;
+  if (!src_y || !src_u || !src_v || !dst_bgra) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted output: start at the last destination row and step backwards.
+    height = -height;
+    dst_bgra += (height - 1) * dst_stride_bgra;
+    dst_stride_bgra = -dst_stride_bgra;
+  }
+  if (dst_stride_bgra == width * 4 &&
+      src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width) {
+    // All planes are tightly packed, so handle the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;
+  }
+  // Row function selection. Later checks override earlier ones; the
+  // non-_Any_ variant is chosen only when width is a multiple of the given
+  // block size.
+#if defined(HAS_I422TOBGRAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? I422ToBGRARow_SSSE3
+                                      : I422ToBGRARow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I422TOBGRAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow = IS_ALIGNED(width, 16) ? I422ToBGRARow_AVX2
+                                       : I422ToBGRARow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I422TOBGRAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? I422ToBGRARow_NEON
+                                      : I422ToBGRARow_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+  // The MIPS DSPR2 row is used only when width, every pointer and every
+  // stride meet the alignment checked below.
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
+    ConvertRow = I422ToBGRARow_MIPS_DSPR2;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ConvertRow(src_y, src_u, src_v, dst_bgra, width);
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_bgra += dst_stride_bgra;
+  }
+  return 0;
+}
+
+// Converts an I422 image (separate Y, U and V planes) to ABGR.
+// A negative height writes the destination rows in reverse order.
+// Returns 0 on success, or -1 if a plane pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  int row;
+  void (*ConvertRow)(const uint8* y_ptr,
+                     const uint8* u_ptr,
+                     const uint8* v_ptr,
+                     uint8* out_ptr,
+                     int count) = I422ToABGRRow_C;
+  if (!src_y || !src_u || !src_v || !dst_abgr) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted output: start at the last destination row and step backwards.
+    height = -height;
+    dst_abgr += (height - 1) * dst_stride_abgr;
+    dst_stride_abgr = -dst_stride_abgr;
+  }
+  if (dst_stride_abgr == width * 4 &&
+      src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width) {
+    // All planes are tightly packed, so handle the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
+  }
+  // Row function selection. Later checks override earlier ones; the
+  // non-_Any_ variant is chosen only when width is a multiple of the given
+  // block size. The NEON rows are considered only for width >= 8.
+#if defined(HAS_I422TOABGRROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ConvertRow = IS_ALIGNED(width, 8) ? I422ToABGRRow_NEON
+                                      : I422ToABGRRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TOABGRROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? I422ToABGRRow_SSSE3
+                                      : I422ToABGRRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I422TOABGRROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow = IS_ALIGNED(width, 16) ? I422ToABGRRow_AVX2
+                                       : I422ToABGRRow_Any_AVX2;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ConvertRow(src_y, src_u, src_v, dst_abgr, width);
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_abgr += dst_stride_abgr;
+  }
+  return 0;
+}
+
+// Converts an I422 image (separate Y, U and V planes) to RGBA.
+// A negative height writes the destination rows in reverse order.
+// Returns 0 on success, or -1 if a plane pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height) {
+  int row;
+  void (*ConvertRow)(const uint8* y_ptr,
+                     const uint8* u_ptr,
+                     const uint8* v_ptr,
+                     uint8* out_ptr,
+                     int count) = I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted output: start at the last destination row and step backwards.
+    height = -height;
+    dst_rgba += (height - 1) * dst_stride_rgba;
+    dst_stride_rgba = -dst_stride_rgba;
+  }
+  if (dst_stride_rgba == width * 4 &&
+      src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width) {
+    // All planes are tightly packed, so handle the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;
+  }
+  // Row function selection. Later checks override earlier ones; the
+  // non-_Any_ variant is chosen only when width is a multiple of the given
+  // block size. The NEON rows are considered only for width >= 8.
+#if defined(HAS_I422TORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ConvertRow = IS_ALIGNED(width, 8) ? I422ToRGBARow_NEON
+                                      : I422ToRGBARow_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? I422ToRGBARow_SSSE3
+                                      : I422ToRGBARow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow = IS_ALIGNED(width, 16) ? I422ToRGBARow_AVX2
+                                       : I422ToRGBARow_Any_AVX2;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ConvertRow(src_y, src_u, src_v, dst_rgba, width);
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_rgba += dst_stride_rgba;
+  }
+  return 0;
+}
+
+// Converts an NV12 image (a Y plane plus one UV plane) to RGB565.
+// A negative height writes the destination rows in reverse order.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_uv, int src_stride_uv,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int row;
+  void (*ConvertRow)(const uint8* y_ptr,
+                     const uint8* uv_ptr,
+                     uint8* out_ptr,
+                     int count) = NV12ToRGB565Row_C;
+  if (!src_y || !src_uv || !dst_rgb565) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted output: start at the last destination row and step backwards.
+    height = -height;
+    dst_rgb565 += (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+  // Row function selection. Later checks override earlier ones; the
+  // non-_Any_ variant is chosen only when width is a multiple of the given
+  // block size.
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? NV12ToRGB565Row_SSSE3
+                                      : NV12ToRGB565Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow = IS_ALIGNED(width, 16) ? NV12ToRGB565Row_AVX2
+                                       : NV12ToRGB565Row_Any_AVX2;
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? NV12ToRGB565Row_NEON
+                                      : NV12ToRGB565Row_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ConvertRow(src_y, src_uv, dst_rgb565, width);
+    src_y += src_stride_y;
+    dst_rgb565 += dst_stride_rgb565;
+    // Two consecutive Y rows use the same UV row, so the UV pointer advances
+    // only after each odd-numbered row.
+    if (row % 2 != 0) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// Converts an NV21 image (a Y plane plus one VU plane) to RGB565.
+// A negative height writes the destination rows in reverse order.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_vu, int src_stride_vu,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int row;
+  void (*ConvertRow)(const uint8* y_ptr,
+                     const uint8* vu_ptr,
+                     uint8* out_ptr,
+                     int count) = NV21ToRGB565Row_C;
+  if (!src_y || !src_vu || !dst_rgb565) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted output: start at the last destination row and step backwards.
+    height = -height;
+    dst_rgb565 += (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+  // Row function selection. Later checks override earlier ones; the
+  // non-_Any_ variant is chosen only when width is a multiple of the given
+  // block size.
+#if defined(HAS_NV21TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? NV21ToRGB565Row_SSSE3
+                                      : NV21ToRGB565Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_NV21TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow = IS_ALIGNED(width, 16) ? NV21ToRGB565Row_AVX2
+                                       : NV21ToRGB565Row_Any_AVX2;
+  }
+#endif
+#if defined(HAS_NV21TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? NV21ToRGB565Row_NEON
+                                      : NV21ToRGB565Row_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ConvertRow(src_y, src_vu, dst_rgb565, width);
+    src_y += src_stride_y;
+    dst_rgb565 += dst_stride_rgb565;
+    // Two consecutive Y rows use the same VU row, so the VU pointer advances
+    // only after each odd-numbered row.
+    if (row % 2 != 0) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
+// Fills a width x height region of a plane with a byte value.
+// value is handed to a row function that takes a uint8, so only its low
+// 8 bits are used. A negative height fills the rows in reverse order.
+// Nothing is written when dst_y is NULL, width <= 0 or height == 0.
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+              int width, int height,
+              uint32 value) {
+  int y;
+  void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C;
+  // Reject empty or invalid regions up front. Without this guard a NULL
+  // pointer or a non-positive width would be passed straight to the row
+  // functions; NOTE(review): the optimized rows are not known to tolerate a
+  // non-positive count.
+  if (!dst_y || width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height: start at the last row and step backwards.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows: a tightly packed plane is filled as one long row.
+  if (dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    dst_stride_y = 0;
+  }
+  // Row function selection. Later checks override earlier ones.
+#if defined(HAS_SETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SetRow = SetRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SetRow = SetRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SETROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    SetRow = SetRow_Any_X86;
+    if (IS_ALIGNED(width, 4)) {
+      SetRow = SetRow_X86;
+    }
+  }
+#endif
+#if defined(HAS_SETROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    SetRow = SetRow_ERMS;
+  }
+#endif
+
+  // Set plane
+  for (y = 0; y < height; ++y) {
+    SetRow(dst_y, value, width);
+    dst_y += dst_stride_y;
+  }
+}
+
+// Draws a filled rectangle into an I420 image.
+// The rectangle starts at luma position (x, y) and covers width x |height|
+// luma pixels; the U and V planes are filled starting at (x / 2, y / 2) with
+// half the width and height, rounded up.
+// value_y, value_u and value_v must each be in the range 0..255.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0, height == 0,
+// x or y is negative, or a value is out of range.
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int x, int y,
+             int width, int height,
+             int value_y, int value_u, int value_v) {
+  int halfwidth;
+  int halfheight;
+  uint8* start_y;
+  uint8* start_u;
+  uint8* start_v;
+  if (!dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0 ||
+      x < 0 || y < 0 ||
+      value_y < 0 || value_y > 255 ||
+      value_u < 0 || value_u > 255 ||
+      value_v < 0 || value_v > 255) {
+    return -1;
+  }
+  // A negative height only reverses the order in which rows are written, and
+  // every row gets the same value, so fill top-down using the magnitude.
+  // Rounding the half height from a negative value gave too few chroma rows
+  // for odd heights (none at all for height == -1).
+  if (height < 0) {
+    height = -height;
+  }
+  halfwidth = (width + 1) >> 1;
+  halfheight = (height + 1) >> 1;
+  // The start pointers are computed only after validation so no pointer
+  // arithmetic is done on a NULL base or with negative offsets.
+  start_y = dst_y + y * dst_stride_y + x;
+  start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+  start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+
+  SetPlane(start_y, dst_stride_y, width, height, value_y);
+  SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
+  SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
+  return 0;
+}
+
+// Draws a filled rectangle into an ARGB image.
+// The rectangle starts at pixel (dst_x, dst_y) and covers width x |height|
+// pixels, each set to the 32 bit value.
+// Returns 0 on success, or -1 if dst_argb is NULL, width <= 0, height == 0,
+// or dst_x / dst_y is negative.
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+             int dst_x, int dst_y,
+             int width, int height,
+             uint32 value) {
+  int y;
+  void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C;
+  if (!dst_argb ||
+      width <= 0 || height == 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // A negative height only reverses the order in which rows are written, and
+  // every row gets the same value, so fill top-down using the magnitude.
+  // Flipping the pointer first and then applying dst_y with a negated stride
+  // moved the start above the rectangle (and before the buffer) whenever
+  // dst_y > 0; for dst_y == 0 the filled rows are unchanged.
+  if (height < 0) {
+    height = -height;
+  }
+  dst_argb += dst_y * dst_stride_argb + dst_x * 4;
+  // Coalesce rows: a tightly packed region is filled as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+
+  // Row function selection. Later checks override earlier ones.
+#if defined(HAS_ARGBSETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBSetRow = ARGBSetRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSETROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBSetRow = ARGBSetRow_X86;
+  }
+#endif
+
+  // Set plane
+  for (y = 0; y < height; ++y) {
+    ARGBSetRow(dst_argb, value, width);
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+// An unattenuated ARGB alpha blend uses the formula
+// p = a * f + (1 - a) * b
+// where
+//   p is output pixel
+//   f is foreground pixel
+//   b is background pixel
+//   a is alpha value from foreground pixel
+// A preattenuated ARGB alpha blend uses the formula
+// p = f + (1 - a) * b
+// where
+//   f is foreground pixel premultiplied by alpha
+
+// Attenuates an ARGB image row by row into dst_argb.
+// A negative height reads the source rows in reverse order.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height) {
+  int row;
+  void (*AttenuateRow)(const uint8* in, uint8* out,
+                       int count) = ARGBAttenuateRow_C;
+  if (!src_argb || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted input: start at the last source row and step backwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (dst_stride_argb == width * 4 &&
+      src_stride_argb == width * 4) {
+    // Both buffers are tightly packed, so handle the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+  // Row function selection. Later checks override earlier ones; the
+  // non-_Any_ variant is chosen only when width is a multiple of the given
+  // block size.
+#if defined(HAS_ARGBATTENUATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    AttenuateRow = IS_ALIGNED(width, 4) ? ARGBAttenuateRow_SSE2
+                                        : ARGBAttenuateRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    AttenuateRow = IS_ALIGNED(width, 4) ? ARGBAttenuateRow_SSSE3
+                                        : ARGBAttenuateRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    AttenuateRow = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_AVX2
+                                        : ARGBAttenuateRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    AttenuateRow = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_NEON
+                                        : ARGBAttenuateRow_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    AttenuateRow(src_argb, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_argb += src_stride_argb;
+  }
+  return 0;
+}
+
+// Converts preattenuated ARGB to unattenuated ARGB.
+// A negative height reads the source rows in reverse order.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height) {
+  int row;
+  void (*UnattenuateRow)(const uint8* in, uint8* out,
+                         int count) = ARGBUnattenuateRow_C;
+  if (!src_argb || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted input: start at the last source row and step backwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (dst_stride_argb == width * 4 &&
+      src_stride_argb == width * 4) {
+    // Both buffers are tightly packed, so handle the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+  // Row function selection. Later checks override earlier ones; the
+  // non-_Any_ variant is chosen only when width is a multiple of the given
+  // block size.
+#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    UnattenuateRow = IS_ALIGNED(width, 4) ? ARGBUnattenuateRow_SSE2
+                                          : ARGBUnattenuateRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    UnattenuateRow = IS_ALIGNED(width, 8) ? ARGBUnattenuateRow_AVX2
+                                          : ARGBUnattenuateRow_Any_AVX2;
+  }
+#endif
+// TODO(fbarchard): Neon version.
+
+  for (row = 0; row < height; ++row) {
+    UnattenuateRow(src_argb, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_argb += src_stride_argb;
+  }
+  return 0;
+}
+
+// Converts an ARGB image to a grayed ARGB image in dst_argb.
+// A negative height reads the source rows in reverse order.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int row;
+  void (*GrayRow)(const uint8* in, uint8* out,
+                  int count) = ARGBGrayRow_C;
+  if (!src_argb || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted input: start at the last source row and step backwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (dst_stride_argb == width * 4 &&
+      src_stride_argb == width * 4) {
+    // Both buffers are tightly packed, so handle the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+  // The SIMD rows are used only when width is a multiple of 8; otherwise the
+  // C row stays selected.
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    if (IS_ALIGNED(width, 8)) {
+      GrayRow = ARGBGrayRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (IS_ALIGNED(width, 8)) {
+      GrayRow = ARGBGrayRow_NEON;
+    }
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    GrayRow(src_argb, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_argb += src_stride_argb;
+  }
+  return 0;
+}
+
+// Converts a rectangle of an ARGB image to gray scale in place.
+// The rectangle starts at pixel (dst_x, dst_y) and covers width x height
+// pixels.
+// Returns 0 on success, or -1 if dst_argb is NULL, width or height is not
+// positive, or dst_x / dst_y is negative.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+             int dst_x, int dst_y,
+             int width, int height) {
+  int y;
+  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+                      int width) = ARGBGrayRow_C;
+  uint8* dst;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // The rectangle origin is computed only after validation so no pointer
+  // arithmetic is done on a NULL base or with negative offsets.
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  // Coalesce rows: a tightly packed region is processed as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+  // The SIMD rows are used only when width is a multiple of 8.
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    // Source and destination are the same row: the conversion is in place.
+    ARGBGrayRow(dst, dst, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Converts a rectangle of an ARGB image to sepia tone in place.
+// The rectangle starts at pixel (dst_x, dst_y) and covers width x height
+// pixels.
+// Returns 0 on success, or -1 if dst_argb is NULL, width or height is not
+// positive, or dst_x / dst_y is negative.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+              int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+  uint8* dst;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // The rectangle origin is computed only after validation so no pointer
+  // arithmetic is done on a NULL base or with negative offsets.
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  // Coalesce rows: a tightly packed region is processed as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+  // The SIMD rows are used only when width is a multiple of 8.
+#if defined(HAS_ARGBSEPIAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBSEPIAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBSepiaRow(dst, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Applies a 4x4 matrix (matrix_argb) to every ARGB pixel of the source and
+// writes the result to dst_argb.
+// Note: Normally for shading, but can be used to swizzle or invert.
+// A negative height reads the source rows in reverse order.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    const int8* matrix_argb,
+                    int width, int height) {
+  int row;
+  void (*MatrixRow)(const uint8* in, uint8* out,
+      const int8* matrix, int count) = ARGBColorMatrixRow_C;
+  if (!src_argb || !dst_argb || !matrix_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted input: start at the last source row and step backwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (dst_stride_argb == width * 4 &&
+      src_stride_argb == width * 4) {
+    // Both buffers are tightly packed, so handle the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+  // The SIMD rows are used only when width is a multiple of 8; otherwise the
+  // C row stays selected.
+#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    if (IS_ALIGNED(width, 8)) {
+      MatrixRow = ARGBColorMatrixRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (IS_ALIGNED(width, 8)) {
+      MatrixRow = ARGBColorMatrixRow_NEON;
+    }
+  }
+#endif
+  for (row = 0; row < height; ++row) {
+    MatrixRow(src_argb, dst_argb, matrix_argb, width);
+    dst_argb += dst_stride_argb;
+    src_argb += src_stride_argb;
+  }
+  return 0;
+}
+
+// Applies a 4x3 matrix to each ARGB pixel of a rectangle, in place.
+// Deprecated.
+// The 12 entries of matrix_rgb are halved and extended with a fourth row of
+// {0, 0, 0, 64} before being passed to ARGBColorMatrix.
+// The rectangle starts at pixel (dst_x, dst_y) and covers width x height
+// pixels.
+// Returns -1 if dst_argb or matrix_rgb is NULL, width or height is not
+// positive, or dst_x / dst_y is negative; otherwise the result of
+// ARGBColorMatrix.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                   const int8* matrix_rgb,
+                   int dst_x, int dst_y, int width, int height) {
+  SIMD_ALIGNED(int8 matrix_argb[16]);
+  uint8* dst;
+  int i;
+  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // The rectangle origin is computed only after validation so no pointer
+  // arithmetic is done on a NULL base or with negative offsets.
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+
+  // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
+  for (i = 0; i < 12; ++i) {
+    matrix_argb[i] = matrix_rgb[i] / 2;
+  }
+  matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
+  matrix_argb[15] = 64;  // 1.0
+
+  // Source and destination are the same rectangle: the transform is in place.
+  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
+                         dst, dst_stride_argb,
+                         &matrix_argb[0], width, height);
+}
+
+// Applies a color table to each ARGB pixel of a rectangle, in place.
+// Table contains 256 ARGB values.
+// The rectangle starts at pixel (dst_x, dst_y) and covers width x height
+// pixels.
+// Returns 0 on success, or -1 if dst_argb or table_argb is NULL, width or
+// height is not positive, or dst_x / dst_y is negative.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                   const uint8* table_argb,
+                   int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+                            int width) = ARGBColorTableRow_C;
+  uint8* dst;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // The rectangle origin is computed only after validation so no pointer
+  // arithmetic is done on a NULL base or with negative offsets.
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  // Coalesce rows: a tightly packed region is processed as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBColorTableRow = ARGBColorTableRow_X86;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Applies a color table to each ARGB pixel of a rectangle, in place, but
+// preserves destination alpha.
+// Table contains 256 ARGB values.
+// The rectangle starts at pixel (dst_x, dst_y) and covers width x height
+// pixels.
+// Returns 0 on success, or -1 if dst_argb or table_argb is NULL, width or
+// height is not positive, or dst_x / dst_y is negative.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                  const uint8* table_argb,
+                  int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+                           int width) = RGBColorTableRow_C;
+  uint8* dst;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // The rectangle origin is computed only after validation so no pointer
+  // arithmetic is done on a NULL base or with negative offsets.
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  // Coalesce rows: a tightly packed region is processed as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_RGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    RGBColorTableRow = RGBColorTableRow_X86;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    RGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// ARGBQuantize is used to posterize art.
+// e.g. rgb / qvalue * qvalue + qvalue / 2
+// But the low levels implement efficiently with 3 parameters, and could be
+// used for other high level operations.
+// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+// where scale is 1 / interval_size as a fixed point value.
+// The divide is replaced with a multiply by a reciprocal fixed point value.
+// Caveat - although SSE2 saturates, the C function does not and should be used
+// with care if doing anything but quantization.
+// The rectangle starts at pixel (dst_x, dst_y) and covers width x height
+// pixels; it is modified in place.
+// Returns 0 on success, or -1 if dst_argb is NULL, width or height is not
+// positive, dst_x / dst_y is negative, or interval_size is outside 1..255.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+                 int scale, int interval_size, int interval_offset,
+                 int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) = ARGBQuantizeRow_C;
+  uint8* dst;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
+      interval_size < 1 || interval_size > 255) {
+    return -1;
+  }
+  // The rectangle origin is computed only after validation so no pointer
+  // arithmetic is done on a NULL base or with negative offsets.
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  // Coalesce rows: a tightly packed region is processed as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+  // The SIMD rows are used only when width is a multiple of the given block
+  // size; otherwise the C row stays selected.
+#if defined(HAS_ARGBQUANTIZEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBQUANTIZEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Builds a cumulative sum table for an image, where each entry is the sum of
+// all values above and to the left of it. Used by ARGBBlur.
+// dst_stride32_cumsum is the distance between table rows in int32 elements.
+// Returns 0 on success, or -1 if a pointer is NULL or width / height is not
+// positive.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+                             int32* dst_cumsum, int dst_stride32_cumsum,
+                             int width, int height) {
+  int row;
+  void (*CumSumRow)(const uint8* pixels, int32* sums,
+      const int32* prior_sums, int count) = ComputeCumulativeSumRow_C;
+  int32* prior_row = dst_cumsum;
+  if (!dst_cumsum || !src_argb) {
+    return -1;
+  }
+  if (width <= 0 || height <= 0) {
+    return -1;
+  }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CumSumRow = ComputeCumulativeSumRow_SSE2;
+  }
+#endif
+  // Zero the first table row (4 int per pixel). The first iteration passes
+  // this same row as its "previous" row.
+  memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4);
+  for (row = 0; row < height; ++row) {
+    CumSumRow(src_argb, dst_cumsum, prior_row, width);
+    // The row just written becomes the previous row for the next iteration.
+    prior_row = dst_cumsum;
+    src_argb += src_stride_argb;
+    dst_cumsum += dst_stride32_cumsum;
+  }
+  return 0;
+}
+
+// Blur ARGB image by averaging over a window derived from a cumulative-sum
+// table.
+// Caller should allocate CumulativeSum table of width * height * 16 bytes
+// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
+// as the buffer is treated as circular.
+// A negative height reads the source image bottom-up.
+// radius is clamped to height and to width / 2 - 1.
+// Returns 0 on success, or -1 when src_argb, dst_argb or dst_cumsum is null,
+// width <= 0, height == 0, or the clamped radius is not positive.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int32* dst_cumsum, int dst_stride32_cumsum,
+             int width, int height, int radius) {
+  int y;
+  void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
+      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+  void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
+      int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
+  int32* cumsum_bot_row;
+  int32* max_cumsum_bot_row;
+  int32* cumsum_top_row;
+
+  // dst_cumsum is validated here as well: the table is dereferenced by the
+  // row functions below, and the status of ARGBComputeCumulativeSum is not
+  // inspected.
+  if (!src_argb || !dst_argb || !dst_cumsum || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Clamp the radius so the window fits the image.
+  if (radius > height) {
+    radius = height;
+  }
+  if (radius > (width / 2 - 1)) {
+    radius = width / 2 - 1;
+  }
+  if (radius <= 0) {
+    return -1;
+  }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+    CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
+  }
+#endif
+  // Compute enough CumulativeSum for first row to be blurred. After this
+  // one row of CumulativeSum is updated at a time.
+  ARGBComputeCumulativeSum(src_argb, src_stride_argb,
+                           dst_cumsum, dst_stride32_cumsum,
+                           width, radius);
+
+  // The first 'radius' source rows are already in the table.
+  src_argb = src_argb + radius * src_stride_argb;
+  cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
+
+  // One past the last usable table row; pointers wrap back to dst_cumsum
+  // when they reach it.
+  max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
+  cumsum_top_row = &dst_cumsum[0];
+
+  for (y = 0; y < height; ++y) {
+    // Vertical extent of the window, clipped to the image.
+    int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
+    int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
+    int area = radius * (bot_y - top_y);
+    // Window width in table entries (4 int32 per pixel).
+    int boxwidth = radius * 4;
+    int x;
+    int n;
+
+    // Increment cumsum_top_row pointer with circular buffer wrap around.
+    if (top_y) {
+      cumsum_top_row += dst_stride32_cumsum;
+      if (cumsum_top_row >= max_cumsum_bot_row) {
+        cumsum_top_row = dst_cumsum;
+      }
+    }
+    // Increment cumsum_bot_row pointer with circular buffer wrap around and
+    // then fill in a row of CumulativeSum.
+    if ((y + radius) < height) {
+      const int32* prev_cumsum_bot_row = cumsum_bot_row;
+      cumsum_bot_row += dst_stride32_cumsum;
+      if (cumsum_bot_row >= max_cumsum_bot_row) {
+        cumsum_bot_row = dst_cumsum;
+      }
+      ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row,
+                              width);
+      src_argb += src_stride_argb;
+    }
+
+    // Left clipped: one pixel at a time while the window grows.
+    for (x = 0; x < radius + 1; ++x) {
+      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+                                boxwidth, area, &dst_argb[x * 4], 1);
+      area += (bot_y - top_y);
+      boxwidth += 4;
+    }
+
+    // Middle unclipped: n pixels share the same window size.
+    n = (width - 1) - radius - x + 1;
+    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+                              boxwidth, area, &dst_argb[x * 4], n);
+
+    // Right clipped: the window shrinks again towards the right edge.
+    for (x += n; x <= width - 1; ++x) {
+      area -= (bot_y - top_y);
+      boxwidth -= 4;
+      CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
+                                cumsum_bot_row + (x - radius - 1) * 4,
+                                boxwidth, area, &dst_argb[x * 4], 1);
+    }
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Multiplies every pixel of an ARGB image by the ARGB value 'value'.
+// A negative height reads the source bottom-up. Returns 0 on success, -1 on
+// a null pointer, non-positive width, zero height or a zero 'value'.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height, uint32 value) {
+  void (*ShadeRowFn)(const uint8* src_argb, uint8* dst_argb,
+                     int width, uint32 value) = ARGBShadeRow_C;
+  int rows_left;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Contiguous source and destination rows collapse into one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    src_stride_argb = dst_stride_argb = 0;
+    width *= height;
+    height = 1;
+  }
+#if defined(HAS_ARGBSHADEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    if (IS_ALIGNED(width, 4)) {
+      ShadeRowFn = ARGBShadeRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHADEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (IS_ALIGNED(width, 8)) {
+      ShadeRowFn = ARGBShadeRow_NEON;
+    }
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ShadeRowFn(src_argb, dst_argb, width, value);
+    dst_argb += dst_stride_argb;
+    src_argb += src_stride_argb;
+  }
+  return 0;
+}
+
+// Blends two ARGB images into dst_argb using 'interpolation' (0 to 255).
+// A negative height writes the destination bottom-up. Returns 0 on success,
+// -1 on a null pointer, non-positive width or zero height.
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+                    const uint8* src_argb1, int src_stride_argb1,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height, int interpolation) {
+  void (*BlendRowFn)(uint8* dst_ptr, const uint8* src_ptr,
+                     ptrdiff_t src_stride, int dst_width,
+                     int source_y_fraction) = InterpolateRow_C;
+  int rows_left;
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // All three images contiguous: treat them as a single long row.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+    width *= height;
+    height = 1;
+  }
+  // Later CPU checks override earlier ones; the "Any" variants are chosen
+  // when the width is not a multiple of the kernel's step.
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    BlendRowFn = IS_ALIGNED(width, 4) ? InterpolateRow_SSE2
+                                      : InterpolateRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    BlendRowFn = IS_ALIGNED(width, 4) ? InterpolateRow_SSSE3
+                                      : InterpolateRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    BlendRowFn = IS_ALIGNED(width, 8) ? InterpolateRow_AVX2
+                                      : InterpolateRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    BlendRowFn = IS_ALIGNED(width, 4) ? InterpolateRow_NEON
+                                      : InterpolateRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
+      IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    BlendRowFn = InterpolateRow_MIPS_DSPR2;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    // The second image is passed as a byte offset from the first; the row
+    // length is given in bytes (4 per pixel).
+    BlendRowFn(dst_argb, src_argb0, src_argb1 - src_argb0,
+               width * 4, interpolation);
+    dst_argb += dst_stride_argb;
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+  }
+  return 0;
+}
+
+// Reorders the channels of each pixel according to 'shuffler',
+// e.g. BGRA to ARGB. A negative height reads the source bottom-up.
+// Returns 0 on success, -1 on a null image pointer, non-positive width or
+// zero height.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+                uint8* dst_argb, int dst_stride_argb,
+                const uint8* shuffler, int width, int height) {
+  void (*ShuffleRowFn)(const uint8* src_bgra, uint8* dst_argb,
+                       const uint8* shuffler, int pix) = ARGBShuffleRow_C;
+  int rows_left;
+  if (!src_bgra || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
+  }
+  // Contiguous source and destination rows collapse into one long row.
+  if (src_stride_bgra == width * 4 &&
+      dst_stride_argb == width * 4) {
+    src_stride_bgra = dst_stride_argb = 0;
+    width *= height;
+    height = 1;
+  }
+  // Later CPU checks override earlier ones.
+#if defined(HAS_ARGBSHUFFLEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ShuffleRowFn = IS_ALIGNED(width, 4) ? ARGBShuffleRow_SSE2
+                                        : ARGBShuffleRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ShuffleRowFn = IS_ALIGNED(width, 8) ? ARGBShuffleRow_SSSE3
+                                        : ARGBShuffleRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ShuffleRowFn = IS_ALIGNED(width, 16) ? ARGBShuffleRow_AVX2
+                                         : ARGBShuffleRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ShuffleRowFn = IS_ALIGNED(width, 4) ? ARGBShuffleRow_NEON
+                                        : ARGBShuffleRow_Any_NEON;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ShuffleRowFn(src_bgra, dst_argb, shuffler, width);
+    dst_argb += dst_stride_argb;
+    src_bgra += src_stride_bgra;
+  }
+  return 0;
+}
+
+// Shared driver for the Sobel effects. Each source row is converted to a
+// single-channel row, the X and Y Sobel row functions are run over three
+// consecutive rows, and the caller-supplied SobelRow combines both results
+// into the destination. A negative height reads the source bottom-up.
+// Returns 0 on success, -1 on a null pointer, non-positive width or zero
+// height.
+static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
+                        uint8* dst_argb, int dst_stride_argb,
+                        int width, int height,
+                        void (*SobelRow)(const uint8* src_sobelx,
+                                         const uint8* src_sobely,
+                                         uint8* dst, int width)) {
+  int row;
+  void (*ToYJRowFn)(const uint8* src_argb, uint8* dst_g, int pix) =
+      ARGBToYJRow_C;
+  void (*SobelYFn)(const uint8* src_y0, const uint8* src_y1,
+                   uint8* dst_sobely, int width) = SobelYRow_C;
+  void (*SobelXFn)(const uint8* src_y0, const uint8* src_y1,
+                   const uint8* src_y2, uint8* dst_sobely, int width) =
+      SobelXRow_C;
+  const int kEdge = 16;  // Extra pixels at start of row for extrude/align.
+  if (!src_argb  || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+
+  // Pick the ARGB-to-YJ row converter; later checks override earlier ones.
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ToYJRowFn = IS_ALIGNED(width, 16) ? ARGBToYJRow_SSSE3
+                                      : ARGBToYJRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ToYJRowFn = IS_ALIGNED(width, 32) ? ARGBToYJRow_AVX2
+                                      : ARGBToYJRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ToYJRowFn = IS_ALIGNED(width, 8) ? ARGBToYJRow_NEON
+                                     : ARGBToYJRow_Any_NEON;
+  }
+#endif
+
+  // Pick the Sobel Y and Sobel X row functions.
+#if defined(HAS_SOBELYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelYFn = SobelYRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelYFn = SobelYRow_NEON;
+  }
+#endif
+#if defined(HAS_SOBELXROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelXFn = SobelXRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelXFn = SobelXRow_NEON;
+  }
+#endif
+  {
+    // Scratch layout: one Sobel X row, one Sobel Y row, then 3 converted
+    // rows with kEdge bytes of padding before and after.
+    const int kRowSize = (width + kEdge + 31) & ~31;
+    align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
+    uint8* sobelx_buf = rows;
+    uint8* sobely_buf = rows + kRowSize;
+    uint8* yj_base = rows + kRowSize * 2;
+
+    // The three converted rows: previous, current and next.
+    uint8* yj_prev = yj_base + kEdge;
+    uint8* yj_cur = yj_prev + kRowSize;
+    uint8* yj_next = yj_cur + kRowSize;
+    // The first source row seeds both the previous and the current row.
+    ToYJRowFn(src_argb, yj_prev, width);
+    yj_prev[-1] = yj_prev[0];
+    memset(yj_prev + width, yj_prev[width - 1], 16);  // Extrude 16 for valgrind.
+    ToYJRowFn(src_argb, yj_cur, width);
+    yj_cur[-1] = yj_cur[0];
+    memset(yj_cur + width, yj_cur[width - 1], 16);
+    memset(yj_next + width, 0, 16);
+
+    for (row = 0; row < height; ++row) {
+      // Advance to the next source row, except on the last row where the
+      // current one is reused.
+      if (row + 1 < height) {
+        src_argb += src_stride_argb;
+      }
+      ToYJRowFn(src_argb, yj_next, width);
+      // Replicate the first and last samples into the edges.
+      yj_next[-1] = yj_next[0];
+      yj_next[width] = yj_next[width - 1];
+
+      SobelXFn(yj_prev - 1, yj_cur - 1, yj_next - 1, sobelx_buf, width);
+      SobelYFn(yj_prev - 1, yj_next - 1, sobely_buf, width);
+      SobelRow(sobelx_buf, sobely_buf, dst_argb, width);
+
+      // Rotate the three row buffers: the oldest becomes the next target.
+      {
+        uint8* recycled = yj_prev;
+        yj_prev = yj_cur;
+        yj_cur = yj_next;
+        yj_next = recycled;
+      }
+
+      dst_argb += dst_stride_argb;
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
+// Sobel ARGB effect: picks a SobelRow implementation for this CPU and width,
+// then delegates to ARGBSobelize and returns its status.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  void (*CombineRowFn)(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width) = SobelRow_C;
+#if defined(HAS_SOBELROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CombineRowFn = IS_ALIGNED(width, 16) ? SobelRow_SSE2 : SobelRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CombineRowFn = IS_ALIGNED(width, 8) ? SobelRow_NEON : SobelRow_Any_NEON;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                      width, height, CombineRowFn);
+}
+
+// Sobel ARGB effect with planar output: picks a SobelToPlaneRow
+// implementation for this CPU and width, then delegates to ARGBSobelize with
+// dst_y as the destination and returns its status.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+                     uint8* dst_y, int dst_stride_y,
+                     int width, int height) {
+  void (*PlaneRowFn)(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_, int width) = SobelToPlaneRow_C;
+#if defined(HAS_SOBELTOPLANEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    PlaneRowFn = IS_ALIGNED(width, 16) ? SobelToPlaneRow_SSE2
+                                       : SobelToPlaneRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    PlaneRowFn = IS_ALIGNED(width, 16) ? SobelToPlaneRow_NEON
+                                       : SobelToPlaneRow_Any_NEON;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
+                      width, height, PlaneRowFn);
+}
+
+// SobelXY ARGB effect.
+// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B.  G = Sobel.
+// Picks a SobelXYRow implementation for this CPU and width, then delegates
+// to ARGBSobelize and returns its status.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height) {
+  void (*XYRowFn)(const uint8* src_sobelx, const uint8* src_sobely,
+                  uint8* dst_argb, int width) = SobelXYRow_C;
+#if defined(HAS_SOBELXYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    XYRowFn = IS_ALIGNED(width, 16) ? SobelXYRow_SSE2 : SobelXYRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    XYRowFn = IS_ALIGNED(width, 8) ? SobelXYRow_NEON : SobelXYRow_Any_NEON;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                      width, height, XYRowFn);
+}
+
+// Applies the 4x4 polynomial 'poly' to each ARGB pixel.
+// A negative height reads the source bottom-up. Returns 0 on success, -1 on
+// a null pointer, non-positive width or zero height.
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   const float* poly,
+                   int width, int height) {
+  void (*PolyRowFn)(const uint8* src_argb,
+                    uint8* dst_argb, const float* poly,
+                    int width) = ARGBPolynomialRow_C;
+  int rows_left;
+  if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Contiguous source and destination rows collapse into one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    src_stride_argb = dst_stride_argb = 0;
+    width *= height;
+    height = 1;
+  }
+#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      PolyRowFn = ARGBPolynomialRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
+  // The AVX2 row function is only selected when FMA3 is available as well.
+  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3)) {
+    if (IS_ALIGNED(width, 2)) {
+      PolyRowFn = ARGBPolynomialRow_AVX2;
+    }
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    PolyRowFn(src_argb, dst_argb, poly, width);
+    dst_argb += dst_stride_argb;
+    src_argb += src_stride_argb;
+  }
+  return 0;
+}
+
+// Applies the luma color table 'luma' to each ARGB pixel.
+// A negative height reads the source bottom-up. Returns 0 on success, -1 on
+// a null pointer, non-positive width or zero height.
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_argb, int dst_stride_argb,
+                       const uint8* luma,
+                       int width, int height) {
+  void (*LumaTableRowFn)(const uint8* src_argb, uint8* dst_argb,
+      int width, const uint8* luma, const uint32 lumacoeff) =
+      ARGBLumaColorTableRow_C;
+  int rows_left;
+  if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Contiguous source and destination rows collapse into one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    src_stride_argb = dst_stride_argb = 0;
+    width *= height;
+    height = 1;
+  }
+#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    if (IS_ALIGNED(width, 4)) {
+      LumaTableRowFn = ARGBLumaColorTableRow_SSSE3;
+    }
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    // 0x00264b0f is the fixed lumacoeff argument handed to the row function.
+    LumaTableRowFn(src_argb, dst_argb, width, luma, 0x00264b0f);
+    dst_argb += dst_stride_argb;
+    src_argb += src_stride_argb;
+  }
+  return 0;
+}
+
+// Copies the alpha channel of one ARGB image into another ARGB image.
+// A negative height reads the source bottom-up. Returns 0 on success, -1 on
+// a null pointer, non-positive width or zero height.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height) {
+  void (*CopyAlphaRowFn)(const uint8* src_argb, uint8* dst_argb, int width) =
+      ARGBCopyAlphaRow_C;
+  int rows_left;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Contiguous source and destination rows collapse into one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    src_stride_argb = dst_stride_argb = 0;
+    width *= height;
+    height = 1;
+  }
+#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    if (IS_ALIGNED(width, 8)) {
+      CopyAlphaRowFn = ARGBCopyAlphaRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    if (IS_ALIGNED(width, 16)) {
+      CopyAlphaRowFn = ARGBCopyAlphaRow_AVX2;
+    }
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    CopyAlphaRowFn(src_argb, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_argb += src_stride_argb;
+  }
+  return 0;
+}
+
+// Copies a planar Y channel into the alpha channel of a destination ARGB
+// image. A negative height reads the source bottom-up. Returns 0 on success,
+// -1 on a null pointer, non-positive width or zero height.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+                     uint8* dst_argb, int dst_stride_argb,
+                     int width, int height) {
+  void (*YToAlphaRowFn)(const uint8* src_y, uint8* dst_argb, int width) =
+      ARGBCopyYToAlphaRow_C;
+  int rows_left;
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Contiguous rows (1 byte per Y sample, 4 per ARGB pixel) collapse into
+  // one long row.
+  if (src_stride_y == width &&
+      dst_stride_argb == width * 4) {
+    src_stride_y = dst_stride_argb = 0;
+    width *= height;
+    height = 1;
+  }
+#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    if (IS_ALIGNED(width, 8)) {
+      YToAlphaRowFn = ARGBCopyYToAlphaRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    if (IS_ALIGNED(width, 16)) {
+      YToAlphaRowFn = ARGBCopyYToAlphaRow_AVX2;
+    }
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    YToAlphaRowFn(src_y, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+  }
+  return 0;
+}
+
+// Converts a packed YUY2 image to NV12 (a Y plane plus an interleaved UV
+// plane at half vertical resolution). Each pair of source rows yields two Y
+// rows and one UV row blended from the two rows' chroma.
+// A negative height reads the source bottom-up. Returns 0 on success, -1 on
+// a null pointer, non-positive width or zero height.
+// Exactly 'width' bytes are written to each Y row and (width + 1) / 2 * 2
+// bytes to each UV row, including for odd widths and odd heights.
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+      SplitUVRow_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  if (!src_yuy2 ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+
+  {
+    // Width rounded up to an even number of pixels.
+    int awidth = halfwidth * 2;
+    // Scratch: 1 row of Y followed by 2 rows of UV. Y is staged here so that
+    // only 'width' bytes reach dst_y when width is odd.
+    align_buffer_64(rows, awidth * 3);
+
+    for (y = 0; y < height - 1; y += 2) {
+      // Split Y from UV for both source rows (YUY2: even bytes Y, odd UV).
+      SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
+      memcpy(dst_y, rows, width);
+      SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);
+      memcpy(dst_y + dst_stride_y, rows, width);
+      // Blend the two UV rows with fraction 128 into one output UV row.
+      InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
+      src_yuy2 += src_stride_yuy2 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      // Last unpaired row: split Y from UV and emit the full UV row.
+      SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
+      memcpy(dst_y, rows, width);
+      memcpy(dst_uv, rows + awidth, awidth);
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
+// Converts a packed UYVY image to NV12 (a Y plane plus an interleaved UV
+// plane at half vertical resolution). Each pair of source rows yields two Y
+// rows and one UV row blended from the two rows' chroma.
+// A negative height reads the source bottom-up. Returns 0 on success, -1 on
+// a null pointer, non-positive width or zero height.
+// Exactly 'width' bytes are written to each Y row and (width + 1) / 2 * 2
+// bytes to each UV row, including for odd widths and odd heights.
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+      SplitUVRow_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  if (!src_uyvy ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+
+  {
+    // Width rounded up to an even number of pixels.
+    int awidth = halfwidth * 2;
+    // Scratch: 1 row of Y followed by 2 rows of UV. Y is staged here so that
+    // only 'width' bytes reach dst_y when width is odd.
+    align_buffer_64(rows, awidth * 3);
+
+    for (y = 0; y < height - 1; y += 2) {
+      // Split UV from Y for both source rows. In UYVY the even bytes are
+      // chroma and the odd bytes are luma, so the UV buffer goes first.
+      SplitUVRow(src_uyvy, rows + awidth, rows, awidth);
+      memcpy(dst_y, rows, width);
+      SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth);
+      memcpy(dst_y + dst_stride_y, rows, width);
+      // Blend the two UV rows with fraction 128 into one output UV row.
+      InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
+      src_uyvy += src_stride_uyvy * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      // Last unpaired row: same byte order as above (UV first, then Y).
+      SplitUVRow(src_uyvy, rows + awidth, rows, awidth);
+      memcpy(dst_y, rows, width);
+      memcpy(dst_uv, rows + awidth, awidth);
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/rotate.cc b/libs/libvpx/third_party/libyuv/source/rotate.cc
new file mode 100644
index 0000000000..be3d589207
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/rotate.cc
@@ -0,0 +1,496 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Transposes a width x height byte plane: source column c -> dest row c.
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  int rows_left;
+  // Kernel that transposes a band of 8 source rows; portable C by default.
+  void (*TransposeWx8)(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width) = TransposeWx8_C;
+  // Each enabled section below overrides the previous choice, so the last
+  // section whose CPU flag is set decides which kernel runs.
+#if defined(HAS_TRANSPOSEWX8_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    TransposeWx8 = TransposeWx8_NEON;
+  }
+#endif
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    TransposeWx8 = IS_ALIGNED(width, 8) ? TransposeWx8_SSSE3
+                                        : TransposeWx8_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    TransposeWx8 = IS_ALIGNED(width, 16) ? TransposeWx8_Fast_SSSE3
+                                         : TransposeWx8_Fast_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+    // The fast variant is only chosen for 4-aligned width, src and stride.
+    TransposeWx8 = (IS_ALIGNED(width, 4) &&
+                    IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4))
+                       ? TransposeWx8_Fast_MIPS_DSPR2
+                       : TransposeWx8_MIPS_DSPR2;
+  }
+#endif
+
+  // Consume the source eight rows at a time; every band fills the next
+  // eight columns of the destination.
+  for (rows_left = height; rows_left >= 8; rows_left -= 8) {
+    TransposeWx8(src, src_stride, dst, dst_stride, width);
+    src += 8 * src_stride;
+    dst += 8;
+  }
+
+  // Fewer than eight rows remain: finish with the generic C transpose.
+  if (rows_left > 0) {
+    TransposeWxH_C(src, src_stride, dst, dst_stride, width, rows_left);
+  }
+}
+
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride,
+                   int width, int height) {
+  // A 90 degree rotation is a transpose that reads the source rows
+  // bottom-up, so begin at the last row and step with a negated stride.
+  const uint8* const last_row = src + src_stride * (height - 1);
+  const int upward_stride = -src_stride;
+
+  TransposePlane(last_row, upward_stride, dst, dst_stride, width, height);
+}
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  // A 270 degree rotation is a transpose that writes the destination rows
+  // bottom-up, so begin at its last row and step with a negated stride.
+  uint8* const last_row = dst + dst_stride * (width - 1);
+  const int upward_stride = -dst_stride;
+
+  TransposePlane(src, src_stride, last_row, upward_stride, width, height);
+}
+
+// Rotates a byte plane by 180 degrees: reversed row order, mirrored rows.
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  // Scratch row: keeps a mirrored upper row until its slot in the lower half
+  // of the destination is written.
+  align_buffer_64(row, width);
+  const uint8* src_tail = src + src_stride * (height - 1);
+  uint8* dst_tail = dst + dst_stride * (height - 1);
+  int row_pairs = (height + 1) >> 1;
+  int pair;
+  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorRow = IS_ALIGNED(width, 16) ? MirrorRow_NEON : MirrorRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MirrorRow = IS_ALIGNED(width, 16) ? MirrorRow_SSE2 : MirrorRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorRow = IS_ALIGNED(width, 16) ? MirrorRow_SSSE3 : MirrorRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorRow = IS_ALIGNED(width, 32) ? MirrorRow_AVX2 : MirrorRow_Any_AVX2;
+  }
+#endif
+// TODO(fbarchard): Mirror on mips handle unaligned memory.
+#if defined(HAS_MIRRORROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
+    MirrorRow = MirrorRow_MIPS_DSPR2;
+  }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = CopyRow_Any_SSE2;
+    if (IS_ALIGNED(width, 32)) {
+      CopyRow = CopyRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = CopyRow_Any_AVX;
+    if (IS_ALIGNED(width, 64)) {
+      CopyRow = CopyRow_AVX;
+    }
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = CopyRow_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      CopyRow = CopyRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+
+  // Walk inwards from both ends. With an odd height the middle row is its
+  // own partner and is harmlessly mirrored twice.
+  for (pair = 0; pair < row_pairs; ++pair) {
+    MirrorRow(src, row, width);       // Upper row -> scratch, mirrored.
+    MirrorRow(src_tail, dst, width);  // Lower row -> upper dst, mirrored.
+    CopyRow(row, dst_tail, width);    // Scratch -> lower dst.
+    src += src_stride;
+    dst += dst_stride;
+    src_tail -= src_stride;
+    dst_tail -= dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+// Transposes a plane of interleaved byte pairs (`width` pairs per row): the
+// first byte of every pair goes to dst_a and the second byte to dst_b.
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  int rows_left;
+  void (*TransposeUVWx8)(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width) = TransposeUVWx8_C;
+  // Later sections override earlier ones when their CPU flag is set.
+#if defined(HAS_TRANSPOSEUVWX8_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    TransposeUVWx8 = TransposeUVWx8_NEON;
+  }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
+    TransposeUVWx8 = TransposeUVWx8_SSE2;
+  }
+#endif
+#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
+      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+    TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
+  }
+#endif
+
+  // Consume the source eight rows at a time; every band fills the next
+  // eight columns of both destinations.
+  for (rows_left = height; rows_left >= 8; rows_left -= 8) {
+    TransposeUVWx8(src, src_stride, dst_a, dst_stride_a,
+                   dst_b, dst_stride_b, width);
+    src += 8 * src_stride;
+    dst_a += 8;
+    dst_b += 8;
+  }
+
+  // Fewer than eight rows remain: finish with the generic C transpose.
+  if (rows_left > 0) {
+    TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a,
+                     dst_b, dst_stride_b, width, rows_left);
+  }
+}
+
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+                uint8* dst_a, int dst_stride_a,
+                uint8* dst_b, int dst_stride_b,
+                int width, int height) {
+  // Rotating by 90 is a transpose that consumes the source rows bottom-up:
+  // begin at the last row and negate the stride.
+  const uint8* const last_row = src + src_stride * (height - 1);
+
+  TransposeUV(last_row, -src_stride,
+              dst_a, dst_stride_a, dst_b, dst_stride_b,
+              width, height);
+}
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  // Rotating by 270 is a transpose that fills both destinations bottom-up:
+  // begin at their last rows (the transpose has `width` rows) and negate
+  // the strides.
+  uint8* const last_a = dst_a + dst_stride_a * (width - 1);
+  uint8* const last_b = dst_b + dst_stride_b * (width - 1);
+
+  TransposeUV(src, src_stride,
+              last_a, -dst_stride_a, last_b, -dst_stride_b,
+              width, height);
+}
+
+// A 180 degree rotation flips the plane both horizontally and vertically.
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  int rows_left;
+  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
+      MirrorUVRow_C;
+#if defined(HAS_MIRRORUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    MirrorRowUV = MirrorUVRow_NEON;
+  }
+#endif
+#if defined(HAS_MIRRORROW_UV_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+    MirrorRowUV = MirrorUVRow_SSSE3;
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+    MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
+  }
+#endif
+
+  // Source rows are read top-down and stored bottom-up through MirrorRowUV.
+  dst_a += dst_stride_a * (height - 1);
+  dst_b += dst_stride_b * (height - 1);
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    MirrorRowUV(src, dst_a, dst_b, width);
+    src += src_stride;
+    dst_a -= dst_stride_a;
+    dst_b -= dst_stride_b;
+  }
+}
+
+// Rotates a single byte plane by `mode`.
+//   width, height: size of the source plane; a negative height reads the
+//     source bottom-up.
+// Returns 0 on success, or -1 when a buffer is NULL, width is not positive,
+// height is zero or the mode is not handled.
+LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+                uint8* dst, int dst_stride,
+                int width, int height,
+                enum RotationMode mode) {
+  // Reject missing buffers and empty dimensions up front.
+  if (!src || !dst || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // Negative height means invert the image: start at the last source row
+  // and walk upwards.
+  if (height < 0) {
+    height = -height;
+    src += (height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+
+  if (mode == kRotate0) {
+    // No rotation: plain copy of the plane.
+    CopyPlane(src, src_stride,
+              dst, dst_stride, width, height);
+  } else if (mode == kRotate90) {
+    RotatePlane90(src, src_stride,
+                  dst, dst_stride, width, height);
+  } else if (mode == kRotate270) {
+    RotatePlane270(src, src_stride,
+                   dst, dst_stride, width, height);
+  } else if (mode == kRotate180) {
+    RotatePlane180(src, src_stride,
+                   dst, dst_stride, width, height);
+  } else {
+    // Unhandled rotation mode.
+    return -1;
+  }
+
+  return 0;
+}
+
+// Rotates an I420 image by `mode`.
+//   src_y/src_u/src_v, dst_y/dst_u/dst_v: plane pointers with their strides.
+//   width, height: size of the Y plane; the U and V planes are processed at
+//     ((width + 1) / 2) x ((height + 1) / 2).
+//   A negative height reads the source planes bottom-up.
+// Returns -1 if any plane pointer is NULL, width <= 0, height == 0 or the
+// mode is not handled; kRotate0 returns the result of I420Copy; every other
+// handled mode returns 0.
+LIBYUV_API
+int I420Rotate(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height,
+               enum RotationMode mode) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  // Single-plane routine selected from `mode`; applied to Y, U and V below.
+  void (*RotateOne)(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height);
+
+  // Reject missing planes and empty dimensions up front.
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // Negative height means invert the image: point every source plane at its
+  // last row and walk upwards.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_u += (halfheight - 1) * src_stride_u;
+    src_v += (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  switch (mode) {
+    case kRotate0:
+      // No rotation: copy the three planes.
+      return I420Copy(src_y, src_stride_y,
+                      src_u, src_stride_u,
+                      src_v, src_stride_v,
+                      dst_y, dst_stride_y,
+                      dst_u, dst_stride_u,
+                      dst_v, dst_stride_v,
+                      width, height);
+    case kRotate90:
+      RotateOne = RotatePlane90;
+      break;
+    case kRotate270:
+      RotateOne = RotatePlane270;
+      break;
+    case kRotate180:
+      RotateOne = RotatePlane180;
+      break;
+    default:
+      // Unhandled rotation mode.
+      return -1;
+  }
+
+  // Y plane at full size, U and V planes at the rounded-up half size.
+  RotateOne(src_y, src_stride_y,
+            dst_y, dst_stride_y, width, height);
+  RotateOne(src_u, src_stride_u,
+            dst_u, dst_stride_u, halfwidth, halfheight);
+  RotateOne(src_v, src_stride_v,
+            dst_v, dst_stride_v, halfwidth, halfheight);
+  return 0;
+}
+
+// Converts an NV12 image (Y plane plus one interleaved UV plane) to I420
+// (separate Y, U and V planes) while rotating it by `mode`.
+//   width, height: size of the Y plane; the chroma is processed at
+//     ((width + 1) / 2) x ((height + 1) / 2).
+//   A negative height reads the source planes bottom-up.
+// Returns -1 if a plane pointer is NULL, width <= 0, height == 0 or the mode
+// is not handled; kRotate0 returns the result of NV12ToI420; otherwise 0.
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+                     const uint8* src_uv, int src_stride_uv,
+                     uint8* dst_y, int dst_stride_y,
+                     uint8* dst_u, int dst_stride_u,
+                     uint8* dst_v, int dst_stride_v,
+                     int width, int height,
+                     enum RotationMode mode) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+
+  // Reject missing planes and empty dimensions up front.
+  if (!src_y || !src_uv || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // Negative height means invert the image: point both source planes at
+  // their last row and walk upwards.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_uv += (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+
+  if (mode == kRotate0) {
+    // No rotation: delegate to the plain NV12 to I420 conversion.
+    return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv,
+                      dst_y, dst_stride_y, dst_u, dst_stride_u,
+                      dst_v, dst_stride_v, width, height);
+  }
+
+  if (mode == kRotate90) {
+    RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+    RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u,
+               dst_v, dst_stride_v, halfwidth, halfheight);
+    return 0;
+  }
+
+  if (mode == kRotate270) {
+    RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+    RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u,
+                dst_v, dst_stride_v, halfwidth, halfheight);
+    return 0;
+  }
+
+  if (mode == kRotate180) {
+    RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+    RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u,
+                dst_v, dst_stride_v, halfwidth, halfheight);
+    return 0;
+  }
+
+  // Unhandled rotation mode.
+  return -1;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/rotate_any.cc b/libs/libvpx/third_party/libyuv/source/rotate_any.cc
new file mode 100644
index 0000000000..4d6eb34e18
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/rotate_any.cc
@@ -0,0 +1,55 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// TANY wraps an 8-row SIMD transpose so any width works: C handles the tail.
+#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK)                                 \
+    void NAMEANY(const uint8* src, int src_stride,                             \
+                 uint8* dst, int dst_stride, int width) {                      \
+      const int rem = width & MASK;                                            \
+      const int vec = width - rem;                                             \
+      if (vec > 0) {                                                           \
+        TPOS_SIMD(src, src_stride, dst, dst_stride, vec);                      \
+      }                                                                        \
+      TPOS_C(src + vec, src_stride, dst + vec * dst_stride, dst_stride, rem);  \
+    }
+
+#ifdef HAS_TRANSPOSEWX8_NEON
+TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_SSSE3
+TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
+TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15)
+#endif
+#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2
+TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7)
+#endif
+#undef TANY
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+
+
+
+
diff --git a/libs/libvpx/third_party/libyuv/source/rotate_argb.cc b/libs/libvpx/third_party/libyuv/source/rotate_argb.cc
new file mode 100644
index 0000000000..787c0ad1be
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/rotate_argb.cc
@@ -0,0 +1,205 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGBScale has a function to copy pixels to a row, striding each source
+// pixel by a constant.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || \
+    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+                               int src_stepx, uint8* dst_ptr, int dst_width);
+#endif
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
+                               int src_stepx, uint8* dst_ptr, int dst_width);
+#endif
+
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
+                            int src_stepx, uint8* dst_ptr, int dst_width);
+
+// Transposes ARGB pixels: each source column becomes one destination row.
+static void ARGBTranspose(const uint8* src, int src_stride,
+                          uint8* dst, int dst_stride, int width, int height) {
+  int column;
+  int src_pixel_step = src_stride >> 2;  // Row stride in 4-byte pixels.
+  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) {  // Width of dest.
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) {  // Width of dest.
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+  }
+#endif
+  for (column = 0; column < width; ++column) {  // One dst row per column.
+    ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
+    src += 4;
+    dst += dst_stride;
+  }
+}
+
+void ARGBRotate90(const uint8* src, int src_stride,
+                  uint8* dst, int dst_stride, int width, int height) {
+  // A 90 degree rotation is an ARGBTranspose that reads the source rows
+  // bottom-up, so begin at the last row and step with a negated stride.
+  const uint8* const last_row = src + src_stride * (height - 1);
+  const int upward_stride = -src_stride;
+
+  ARGBTranspose(last_row, upward_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate270(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride, int width, int height) {
+  // A 270 degree rotation is an ARGBTranspose that writes the destination
+  // rows bottom-up, so begin at its last row and step with a negated stride.
+  uint8* const last_row = dst + dst_stride * (width - 1);
+  const int upward_stride = -dst_stride;
+
+  ARGBTranspose(src, src_stride, last_row, upward_stride, width, height);
+}
+
+// Rotates an ARGB image by 180 degrees: rows are emitted in reverse order
+// and every row is passed through ARGBMirrorRow.
+void ARGBRotate180(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride, int width, int height) {
+  // Scratch row of width * 4 bytes: keeps a mirrored upper row until its slot
+  // in the lower half of the destination is written.
+  align_buffer_64(row, width * 4);
+  const uint8* src_tail = src + src_stride * (height - 1);
+  uint8* dst_tail = dst + dst_stride * (height - 1);
+  int row_pairs = (height + 1) >> 1;
+  int pair;
+  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+      ARGBMirrorRow_C;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  // ARGBMirrorRow is called with pixels, so its checks use width.
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMirrorRow = IS_ALIGNED(width, 4) ? ARGBMirrorRow_NEON
+                                         : ARGBMirrorRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMirrorRow = IS_ALIGNED(width, 4) ? ARGBMirrorRow_SSE2
+                                         : ARGBMirrorRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBMirrorRow = IS_ALIGNED(width, 8) ? ARGBMirrorRow_AVX2
+                                         : ARGBMirrorRow_Any_AVX2;
+  }
+#endif
+  // CopyRow is called with bytes, so its checks use width * 4.
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+
+  // Walk inwards from both ends. With an odd height the middle row is its
+  // own partner and is harmlessly mirrored twice.
+  for (pair = row_pairs; pair > 0; --pair) {
+    ARGBMirrorRow(src, row, width);       // Upper row -> scratch, mirrored.
+    ARGBMirrorRow(src_tail, dst, width);  // Lower row -> upper dst, mirrored.
+    CopyRow(row, dst_tail, width * 4);    // Scratch -> lower dst.
+    src += src_stride;
+    dst += dst_stride;
+    src_tail -= src_stride;
+    dst_tail -= dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+// Rotates an ARGB image by `mode`.  A negative height reads the source
+// bottom-up.  Returns -1 if a buffer is NULL, width <= 0, height == 0 or the
+// mode is not handled; kRotate0 returns the result of ARGBCopy; otherwise 0.
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb, int width, int height,
+               enum RotationMode mode) {
+  // Reject missing buffers and empty dimensions up front.
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // Negative height means invert the image: start at the last source row
+  // and walk upwards.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+
+  if (mode == kRotate0) {
+    // No rotation: plain copy of the image.
+    return ARGBCopy(src_argb, src_stride_argb,
+                    dst_argb, dst_stride_argb, width, height);
+  }
+
+  if (mode == kRotate90) {
+    ARGBRotate90(src_argb, src_stride_argb,
+                 dst_argb, dst_stride_argb, width, height);
+  } else if (mode == kRotate270) {
+    ARGBRotate270(src_argb, src_stride_argb,
+                  dst_argb, dst_stride_argb, width, height);
+  } else if (mode == kRotate180) {
+    ARGBRotate180(src_argb, src_stride_argb,
+                  dst_argb, dst_stride_argb, width, height);
+  } else {
+    return -1;  // Unhandled rotation mode.
+  }
+
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/rotate_common.cc b/libs/libvpx/third_party/libyuv/source/rotate_common.cc
new file mode 100644
index 0000000000..b33a9a0c6e
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/rotate_common.cc
@@ -0,0 +1,92 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Portable transpose of an 8-row band: for each of the `width` source
+// columns, the bytes at row offsets 0..7 are stored as one row of dst.
+// Rows of dst are dst_stride bytes apart.
+void TransposeWx8_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width) {
+  int column;
+  for (column = 0; column < width; ++column) {
+    int row;
+    // Gather this column from the eight source rows.
+    for (row = 0; row < 8; ++row) {
+      dst[row] = src[row * src_stride];
+    }
+    ++src;
+    dst += dst_stride;
+  }
+}
+
+// Portable transpose of an 8-row band of interleaved byte pairs.
+//   src, src_stride: first of the eight source rows and the row distance.
+//   dst_a: receives the first byte of every pair.
+//   dst_b: receives the second byte of every pair.
+//   dst_stride_a, dst_stride_b: distance between rows of dst_a and dst_b.
+//   width: number of byte pairs per source row.
+// Pair p of the band becomes row p (eight bytes) of dst_a and of dst_b.
+void TransposeUVWx8_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b, int width) {
+  int pair;
+  for (pair = 0; pair < width; ++pair) {
+    int row;
+    // Gather the current pair from each of the eight source rows.
+    for (row = 0; row < 8; ++row) {
+      const uint8* in = src + row * src_stride;
+      dst_a[row] = in[0];
+      dst_b[row] = in[1];
+    }
+
+    // Step to the next source pair and the next row of each destination.
+    src += 2;
+    dst_a += dst_stride_a;
+    dst_b += dst_stride_b;
+  }
+}
+
+void TransposeWxH_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  int x, y;
+  // The source byte at column x, row y lands at column y, row x of dst.
+  for (x = 0; x < width; ++x) {
+    for (y = 0; y < height; ++y) {
+      dst[x * dst_stride + y] = src[y * src_stride + x];
+    }
+  }
+}
+
+void TransposeUVWxH_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b,
+                      int width, int height) {
+  int pair, row;
+  // Pair `pair` of source row `row` lands at byte `row` of dst row `pair`.
+  for (pair = 0; pair < width; ++pair) {
+    for (row = 0; row < height; ++row) {
+      dst_a[pair * dst_stride_a + row] = src[row * src_stride + pair * 2];
+      dst_b[pair * dst_stride_b + row] = src[row * src_stride + pair * 2 + 1];
+    }
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/rotate_gcc.cc b/libs/libvpx/third_party/libyuv/source/rotate_gcc.cc
new file mode 100644
index 0000000000..fd385bcd30
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/rotate_gcc.cc
@@ -0,0 +1,493 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
+  asm volatile (
+    // Transposes an 8-row source strip into dst, 8 source columns per pass.
+    // Round 1: load 8 bytes from each of 8 rows; interleave row pairs bytewise.
+    ".p2align  2                                 \n"
+  "1:                                            \n"
+    "movq       (%0),%%xmm0                      \n"
+    "movq       (%0,%3),%%xmm1                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm1,%%xmm0                    \n"
+    "movq       (%0),%%xmm2                      \n"
+    "movdqa     %%xmm0,%%xmm1                    \n"
+    "palignr    $0x8,%%xmm1,%%xmm1               \n"  // rotate: upper 8 bytes -> low half
+    "movq       (%0,%3),%%xmm3                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm3,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm3                    \n"
+    "movq       (%0),%%xmm4                      \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "movq       (%0,%3),%%xmm5                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm5,%%xmm4                    \n"
+    "movdqa     %%xmm4,%%xmm5                    \n"
+    "movq       (%0),%%xmm6                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       (%0,%3),%%xmm7                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm7,%%xmm6                    \n"
+    "neg        %3                               \n"  // temporarily negate src_stride
+    "movdqa     %%xmm6,%%xmm7                    \n"
+    "lea        0x8(%0,%3,8),%0                  \n"  // rewind 8 rows, step 8 bytes right
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "neg        %3                               \n"  // restore src_stride (input operand)
+     // Round 2: interleave the row-pair results as 16-bit units.
+    "punpcklwd  %%xmm2,%%xmm0                    \n"
+    "punpcklwd  %%xmm3,%%xmm1                    \n"
+    "movdqa     %%xmm0,%%xmm2                    \n"
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "palignr    $0x8,%%xmm2,%%xmm2               \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "punpcklwd  %%xmm6,%%xmm4                    \n"
+    "punpcklwd  %%xmm7,%%xmm5                    \n"
+    "movdqa     %%xmm4,%%xmm6                    \n"
+    "movdqa     %%xmm5,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    // Round 3: interleave as 32-bit units; each register then holds two
+    // finished 8-byte output rows, stored with movq as dst advances 8 rows.
+    "punpckldq  %%xmm4,%%xmm0                    \n"
+    "movq       %%xmm0,(%1)                      \n"
+    "movdqa     %%xmm0,%%xmm4                    \n"
+    "palignr    $0x8,%%xmm4,%%xmm4               \n"
+    "movq       %%xmm4,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm6,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm6                    \n"
+    "movq       %%xmm2,(%1)                      \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "punpckldq  %%xmm5,%%xmm1                    \n"
+    "movq       %%xmm6,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "movdqa     %%xmm1,%%xmm5                    \n"
+    "movq       %%xmm1,(%1)                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       %%xmm5,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm7,%%xmm3                    \n"
+    "movq       %%xmm3,(%1)                      \n"
+    "movdqa     %%xmm3,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "sub        $0x8,%2                          \n"  // 8 columns consumed; sets flags for jg
+    "movq       %%xmm7,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "jg         1b                               \n"  // tested after the body: assumes width > 0 on entry
+    : "+r"(src),    // %0
+      "+r"(dst),    // %1
+      "+r"(width)   // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "r"((intptr_t)(dst_stride))   // %4
+    : "memory", "cc",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)  && !defined(__clang__)
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width);
+  asm (  // body supplied as file-scope asm with its own prologue and epilogue
+    DECLARE_FUNCTION(TransposeUVWx8_SSE2)
+    "push   %ebx                               \n"  // save the four registers overwritten below
+    "push   %esi                               \n"
+    "push   %edi                               \n"
+    "push   %ebp                               \n"
+    "mov    0x14(%esp),%eax                    \n"  // src (args assumed stack-passed in declaration order)
+    "mov    0x18(%esp),%edi                    \n"  // src_stride
+    "mov    0x1c(%esp),%edx                    \n"  // dst_a
+    "mov    0x20(%esp),%esi                    \n"  // dst_stride_a
+    "mov    0x24(%esp),%ebx                    \n"  // dst_b
+    "mov    0x28(%esp),%ebp                    \n"  // dst_stride_b
+    "mov    %esp,%ecx                          \n"  // remember esp before aligning it
+    "sub    $0x14,%esp                         \n"
+    "and    $0xfffffff0,%esp                   \n"  // 16-byte aligned scratch slot at (%esp)
+    "mov    %ecx,0x10(%esp)                    \n"  // old esp stored just above the scratch slot
+    "mov    0x2c(%ecx),%ecx                    \n"  // width
+    // Each pass transposes 8 rows x 16 bytes; (%esp) spills one xmm at a time.
+"1:                                            \n"
+    "movdqu (%eax),%xmm0                       \n"
+    "movdqu (%eax,%edi,1),%xmm1                \n"
+    "lea    (%eax,%edi,2),%eax                 \n"
+    "movdqa %xmm0,%xmm7                        \n"
+    "punpcklbw %xmm1,%xmm0                     \n"
+    "punpckhbw %xmm1,%xmm7                     \n"
+    "movdqa %xmm7,%xmm1                        \n"
+    "movdqu (%eax),%xmm2                       \n"
+    "movdqu (%eax,%edi,1),%xmm3                \n"
+    "lea    (%eax,%edi,2),%eax                 \n"
+    "movdqa %xmm2,%xmm7                        \n"
+    "punpcklbw %xmm3,%xmm2                     \n"
+    "punpckhbw %xmm3,%xmm7                     \n"
+    "movdqa %xmm7,%xmm3                        \n"
+    "movdqu (%eax),%xmm4                       \n"
+    "movdqu (%eax,%edi,1),%xmm5                \n"
+    "lea    (%eax,%edi,2),%eax                 \n"
+    "movdqa %xmm4,%xmm7                        \n"
+    "punpcklbw %xmm5,%xmm4                     \n"
+    "punpckhbw %xmm5,%xmm7                     \n"
+    "movdqa %xmm7,%xmm5                        \n"
+    "movdqu (%eax),%xmm6                       \n"
+    "movdqu (%eax,%edi,1),%xmm7                \n"
+    "lea    (%eax,%edi,2),%eax                 \n"
+    "movdqu %xmm5,(%esp)                       \n"  // spill xmm5 so it can serve as a temporary
+    "neg    %edi                               \n"  // temporarily negate src_stride
+    "movdqa %xmm6,%xmm5                        \n"
+    "punpcklbw %xmm7,%xmm6                     \n"
+    "punpckhbw %xmm7,%xmm5                     \n"
+    "movdqa %xmm5,%xmm7                        \n"
+    "lea    0x10(%eax,%edi,8),%eax             \n"  // rewind 8 rows, step 16 bytes right
+    "neg    %edi                               \n"  // restore src_stride
+    "movdqa %xmm0,%xmm5                        \n"
+    "punpcklwd %xmm2,%xmm0                     \n"
+    "punpckhwd %xmm2,%xmm5                     \n"
+    "movdqa %xmm5,%xmm2                        \n"
+    "movdqa %xmm1,%xmm5                        \n"
+    "punpcklwd %xmm3,%xmm1                     \n"
+    "punpckhwd %xmm3,%xmm5                     \n"
+    "movdqa %xmm5,%xmm3                        \n"
+    "movdqa %xmm4,%xmm5                        \n"
+    "punpcklwd %xmm6,%xmm4                     \n"
+    "punpckhwd %xmm6,%xmm5                     \n"
+    "movdqa %xmm5,%xmm6                        \n"
+    "movdqu (%esp),%xmm5                       \n"  // reload the value spilled above
+    "movdqu %xmm6,(%esp)                       \n"  // spill xmm6 in its place
+    "movdqa %xmm5,%xmm6                        \n"
+    "punpcklwd %xmm7,%xmm5                     \n"
+    "punpckhwd %xmm7,%xmm6                     \n"
+    "movdqa %xmm6,%xmm7                        \n"
+    "movdqa %xmm0,%xmm6                        \n"
+    "punpckldq %xmm4,%xmm0                     \n"
+    "punpckhdq %xmm4,%xmm6                     \n"
+    "movdqa %xmm6,%xmm4                        \n"
+    "movdqu (%esp),%xmm6                       \n"  // reload the spilled xmm6 value
+    "movlpd %xmm0,(%edx)                       \n"  // low 8 bytes -> dst_a
+    "movhpd %xmm0,(%ebx)                       \n"  // high 8 bytes -> dst_b
+    "movlpd %xmm4,(%edx,%esi,1)                \n"
+    "lea    (%edx,%esi,2),%edx                 \n"
+    "movhpd %xmm4,(%ebx,%ebp,1)                \n"
+    "lea    (%ebx,%ebp,2),%ebx                 \n"
+    "movdqa %xmm2,%xmm0                        \n"
+    "punpckldq %xmm6,%xmm2                     \n"
+    "movlpd %xmm2,(%edx)                       \n"
+    "movhpd %xmm2,(%ebx)                       \n"
+    "punpckhdq %xmm6,%xmm0                     \n"
+    "movlpd %xmm0,(%edx,%esi,1)                \n"
+    "lea    (%edx,%esi,2),%edx                 \n"
+    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
+    "lea    (%ebx,%ebp,2),%ebx                 \n"
+    "movdqa %xmm1,%xmm0                        \n"
+    "punpckldq %xmm5,%xmm1                     \n"
+    "movlpd %xmm1,(%edx)                       \n"
+    "movhpd %xmm1,(%ebx)                       \n"
+    "punpckhdq %xmm5,%xmm0                     \n"
+    "movlpd %xmm0,(%edx,%esi,1)                \n"
+    "lea    (%edx,%esi,2),%edx                 \n"
+    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
+    "lea    (%ebx,%ebp,2),%ebx                 \n"
+    "movdqa %xmm3,%xmm0                        \n"
+    "punpckldq %xmm7,%xmm3                     \n"
+    "movlpd %xmm3,(%edx)                       \n"
+    "movhpd %xmm3,(%ebx)                       \n"
+    "punpckhdq %xmm7,%xmm0                     \n"
+    "sub    $0x8,%ecx                          \n"  // width -= 8; sets flags for jg
+    "movlpd %xmm0,(%edx,%esi,1)                \n"
+    "lea    (%edx,%esi,2),%edx                 \n"
+    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
+    "lea    (%ebx,%ebp,2),%ebx                 \n"
+    "jg     1b                                 \n"  // tested after the body: assumes width > 0 on entry
+    "mov    0x10(%esp),%esp                    \n"  // restore the pre-alignment esp
+    "pop    %ebp                               \n"  // restore saved registers in reverse order
+    "pop    %edi                               \n"
+    "pop    %esi                               \n"
+    "pop    %ebx                               \n"
+#if defined(__native_client__)
+    "pop    %ecx                               \n"  // return address
+    "and    $0xffffffe0,%ecx                   \n"  // clear its low five bits before jumping
+    "jmp    *%ecx                              \n"
+#else
+    "ret                                       \n"
+#endif
+);
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+    defined(__x86_64__)
+// x86-64 only: with xmm0-xmm15 available, transposes 16x8 to 8x16 per pass.
+void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width) {
+  asm volatile (
+  // Round 1: load 16 bytes from each of 8 rows and interleave row pairs
+  // bytewise; xmm0-7 take source columns 0-7 and xmm8-15 columns 8-15.
+  ".p2align  2                                 \n"
+"1:                                            \n"
+  "movdqu     (%0),%%xmm0                      \n"
+  "movdqu     (%0,%3),%%xmm1                   \n"
+  "lea        (%0,%3,2),%0                     \n"
+  "movdqa     %%xmm0,%%xmm8                    \n"
+  "punpcklbw  %%xmm1,%%xmm0                    \n"
+  "punpckhbw  %%xmm1,%%xmm8                    \n"
+  "movdqu     (%0),%%xmm2                      \n"
+  "movdqa     %%xmm0,%%xmm1                    \n"
+  "movdqa     %%xmm8,%%xmm9                    \n"
+  "palignr    $0x8,%%xmm1,%%xmm1               \n"  // rotate: upper 8 bytes -> low half
+  "palignr    $0x8,%%xmm9,%%xmm9               \n"
+  "movdqu     (%0,%3),%%xmm3                   \n"
+  "lea        (%0,%3,2),%0                     \n"
+  "movdqa     %%xmm2,%%xmm10                   \n"
+  "punpcklbw  %%xmm3,%%xmm2                    \n"
+  "punpckhbw  %%xmm3,%%xmm10                   \n"
+  "movdqa     %%xmm2,%%xmm3                    \n"
+  "movdqa     %%xmm10,%%xmm11                  \n"
+  "movdqu     (%0),%%xmm4                      \n"
+  "palignr    $0x8,%%xmm3,%%xmm3               \n"
+  "palignr    $0x8,%%xmm11,%%xmm11             \n"
+  "movdqu     (%0,%3),%%xmm5                   \n"
+  "lea        (%0,%3,2),%0                     \n"
+  "movdqa     %%xmm4,%%xmm12                   \n"
+  "punpcklbw  %%xmm5,%%xmm4                    \n"
+  "punpckhbw  %%xmm5,%%xmm12                   \n"
+  "movdqa     %%xmm4,%%xmm5                    \n"
+  "movdqa     %%xmm12,%%xmm13                  \n"
+  "movdqu     (%0),%%xmm6                      \n"
+  "palignr    $0x8,%%xmm5,%%xmm5               \n"
+  "palignr    $0x8,%%xmm13,%%xmm13             \n"
+  "movdqu     (%0,%3),%%xmm7                   \n"
+  "lea        (%0,%3,2),%0                     \n"
+  "movdqa     %%xmm6,%%xmm14                   \n"
+  "punpcklbw  %%xmm7,%%xmm6                    \n"
+  "punpckhbw  %%xmm7,%%xmm14                   \n"
+  "neg        %3                               \n"  // temporarily negate src_stride
+  "movdqa     %%xmm6,%%xmm7                    \n"
+  "movdqa     %%xmm14,%%xmm15                  \n"
+  "lea        0x10(%0,%3,8),%0                 \n"  // rewind 8 rows, step 16 bytes right
+  "palignr    $0x8,%%xmm7,%%xmm7               \n"
+  "palignr    $0x8,%%xmm15,%%xmm15             \n"
+  "neg        %3                               \n"  // restore src_stride (input operand)
+   // Round 2: interleave the row-pair results as 16-bit units.
+  "punpcklwd  %%xmm2,%%xmm0                    \n"
+  "punpcklwd  %%xmm3,%%xmm1                    \n"
+  "movdqa     %%xmm0,%%xmm2                    \n"
+  "movdqa     %%xmm1,%%xmm3                    \n"
+  "palignr    $0x8,%%xmm2,%%xmm2               \n"
+  "palignr    $0x8,%%xmm3,%%xmm3               \n"
+  "punpcklwd  %%xmm6,%%xmm4                    \n"
+  "punpcklwd  %%xmm7,%%xmm5                    \n"
+  "movdqa     %%xmm4,%%xmm6                    \n"
+  "movdqa     %%xmm5,%%xmm7                    \n"
+  "palignr    $0x8,%%xmm6,%%xmm6               \n"
+  "palignr    $0x8,%%xmm7,%%xmm7               \n"
+  "punpcklwd  %%xmm10,%%xmm8                   \n"
+  "punpcklwd  %%xmm11,%%xmm9                   \n"
+  "movdqa     %%xmm8,%%xmm10                   \n"
+  "movdqa     %%xmm9,%%xmm11                   \n"
+  "palignr    $0x8,%%xmm10,%%xmm10             \n"
+  "palignr    $0x8,%%xmm11,%%xmm11             \n"
+  "punpcklwd  %%xmm14,%%xmm12                  \n"
+  "punpcklwd  %%xmm15,%%xmm13                  \n"
+  "movdqa     %%xmm12,%%xmm14                  \n"
+  "movdqa     %%xmm13,%%xmm15                  \n"
+  "palignr    $0x8,%%xmm14,%%xmm14             \n"
+  "palignr    $0x8,%%xmm15,%%xmm15             \n"
+  // Round 3: interleave as 32-bit units and store sixteen 8-byte dst rows;
+  // the first eight come from xmm0-7, the last eight from xmm8-15.
+  "punpckldq  %%xmm4,%%xmm0                    \n"
+  "movq       %%xmm0,(%1)                      \n"
+  "movdqa     %%xmm0,%%xmm4                    \n"
+  "palignr    $0x8,%%xmm4,%%xmm4               \n"
+  "movq       %%xmm4,(%1,%4)                   \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm6,%%xmm2                    \n"
+  "movdqa     %%xmm2,%%xmm6                    \n"
+  "movq       %%xmm2,(%1)                      \n"
+  "palignr    $0x8,%%xmm6,%%xmm6               \n"
+  "punpckldq  %%xmm5,%%xmm1                    \n"
+  "movq       %%xmm6,(%1,%4)                   \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "movdqa     %%xmm1,%%xmm5                    \n"
+  "movq       %%xmm1,(%1)                      \n"
+  "palignr    $0x8,%%xmm5,%%xmm5               \n"
+  "movq       %%xmm5,(%1,%4)                   \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm7,%%xmm3                    \n"
+  "movq       %%xmm3,(%1)                      \n"
+  "movdqa     %%xmm3,%%xmm7                    \n"
+  "palignr    $0x8,%%xmm7,%%xmm7               \n"
+  "movq       %%xmm7,(%1,%4)                   \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm12,%%xmm8                   \n"
+  "movq       %%xmm8,(%1)                      \n"
+  "movdqa     %%xmm8,%%xmm12                   \n"
+  "palignr    $0x8,%%xmm12,%%xmm12             \n"
+  "movq       %%xmm12,(%1,%4)                  \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm14,%%xmm10                  \n"
+  "movdqa     %%xmm10,%%xmm14                  \n"
+  "movq       %%xmm10,(%1)                     \n"
+  "palignr    $0x8,%%xmm14,%%xmm14             \n"
+  "punpckldq  %%xmm13,%%xmm9                   \n"
+  "movq       %%xmm14,(%1,%4)                  \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "movdqa     %%xmm9,%%xmm13                   \n"
+  "movq       %%xmm9,(%1)                      \n"
+  "palignr    $0x8,%%xmm13,%%xmm13             \n"
+  "movq       %%xmm13,(%1,%4)                  \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "punpckldq  %%xmm15,%%xmm11                  \n"
+  "movq       %%xmm11,(%1)                     \n"
+  "movdqa     %%xmm11,%%xmm15                  \n"
+  "palignr    $0x8,%%xmm15,%%xmm15             \n"
+  "sub        $0x10,%2                         \n"  // 16 columns consumed; sets flags for jg
+  "movq       %%xmm15,(%1,%4)                  \n"
+  "lea        (%1,%4,2),%1                     \n"
+  "jg         1b                               \n"  // tested after the body: assumes width > 0 on entry
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "r"((intptr_t)(src_stride)),  // %3
+    "r"((intptr_t)(dst_stride))   // %4
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
+);
+}
+
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width) {
+  asm volatile (
+  // Transposes 8 rows of interleaved UV bytes, splitting U and V on output.
+  // Round 1: load 16 bytes per row for 8 rows; interleave row pairs bytewise.
+  ".p2align  2                                 \n"
+"1:                                            \n"
+  "movdqu     (%0),%%xmm0                      \n"
+  "movdqu     (%0,%4),%%xmm1                   \n"
+  "lea        (%0,%4,2),%0                     \n"
+  "movdqa     %%xmm0,%%xmm8                    \n"  // xmm8 is the scratch copy throughout
+  "punpcklbw  %%xmm1,%%xmm0                    \n"
+  "punpckhbw  %%xmm1,%%xmm8                    \n"
+  "movdqa     %%xmm8,%%xmm1                    \n"
+  "movdqu     (%0),%%xmm2                      \n"
+  "movdqu     (%0,%4),%%xmm3                   \n"
+  "lea        (%0,%4,2),%0                     \n"
+  "movdqa     %%xmm2,%%xmm8                    \n"
+  "punpcklbw  %%xmm3,%%xmm2                    \n"
+  "punpckhbw  %%xmm3,%%xmm8                    \n"
+  "movdqa     %%xmm8,%%xmm3                    \n"
+  "movdqu     (%0),%%xmm4                      \n"
+  "movdqu     (%0,%4),%%xmm5                   \n"
+  "lea        (%0,%4,2),%0                     \n"
+  "movdqa     %%xmm4,%%xmm8                    \n"
+  "punpcklbw  %%xmm5,%%xmm4                    \n"
+  "punpckhbw  %%xmm5,%%xmm8                    \n"
+  "movdqa     %%xmm8,%%xmm5                    \n"
+  "movdqu     (%0),%%xmm6                      \n"
+  "movdqu     (%0,%4),%%xmm7                   \n"
+  "lea        (%0,%4,2),%0                     \n"
+  "movdqa     %%xmm6,%%xmm8                    \n"
+  "punpcklbw  %%xmm7,%%xmm6                    \n"
+  "neg        %4                               \n"  // temporarily negate src_stride
+  "lea        0x10(%0,%4,8),%0                 \n"  // rewind 8 rows, step 16 bytes right
+  "punpckhbw  %%xmm7,%%xmm8                    \n"
+  "movdqa     %%xmm8,%%xmm7                    \n"
+  "neg        %4                               \n"  // restore src_stride (input operand)
+   // Round 2: interleave the row-pair results as 16-bit units.
+  "movdqa     %%xmm0,%%xmm8                    \n"
+  "movdqa     %%xmm1,%%xmm9                    \n"
+  "punpckhwd  %%xmm2,%%xmm8                    \n"
+  "punpckhwd  %%xmm3,%%xmm9                    \n"
+  "punpcklwd  %%xmm2,%%xmm0                    \n"
+  "punpcklwd  %%xmm3,%%xmm1                    \n"
+  "movdqa     %%xmm8,%%xmm2                    \n"
+  "movdqa     %%xmm9,%%xmm3                    \n"
+  "movdqa     %%xmm4,%%xmm8                    \n"
+  "movdqa     %%xmm5,%%xmm9                    \n"
+  "punpckhwd  %%xmm6,%%xmm8                    \n"
+  "punpckhwd  %%xmm7,%%xmm9                    \n"
+  "punpcklwd  %%xmm6,%%xmm4                    \n"
+  "punpcklwd  %%xmm7,%%xmm5                    \n"
+  "movdqa     %%xmm8,%%xmm6                    \n"
+  "movdqa     %%xmm9,%%xmm7                    \n"
+  // Round 3: interleave as 32-bit units; the low 8 bytes of each result go
+  // to dst_a (movlpd) and the high 8 bytes to dst_b (movhpd).
+  "movdqa     %%xmm0,%%xmm8                    \n"
+  "punpckldq  %%xmm4,%%xmm0                    \n"
+  "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
+  "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
+  "punpckhdq  %%xmm4,%%xmm8                    \n"
+  "movlpd     %%xmm8,(%1,%5)                   \n"
+  "lea        (%1,%5,2),%1                     \n"
+  "movhpd     %%xmm8,(%2,%6)                   \n"
+  "lea        (%2,%6,2),%2                     \n"
+  "movdqa     %%xmm2,%%xmm8                    \n"
+  "punpckldq  %%xmm6,%%xmm2                    \n"
+  "movlpd     %%xmm2,(%1)                      \n"
+  "movhpd     %%xmm2,(%2)                      \n"
+  "punpckhdq  %%xmm6,%%xmm8                    \n"
+  "movlpd     %%xmm8,(%1,%5)                   \n"
+  "lea        (%1,%5,2),%1                     \n"
+  "movhpd     %%xmm8,(%2,%6)                   \n"
+  "lea        (%2,%6,2),%2                     \n"
+  "movdqa     %%xmm1,%%xmm8                    \n"
+  "punpckldq  %%xmm5,%%xmm1                    \n"
+  "movlpd     %%xmm1,(%1)                      \n"
+  "movhpd     %%xmm1,(%2)                      \n"
+  "punpckhdq  %%xmm5,%%xmm8                    \n"
+  "movlpd     %%xmm8,(%1,%5)                   \n"
+  "lea        (%1,%5,2),%1                     \n"
+  "movhpd     %%xmm8,(%2,%6)                   \n"
+  "lea        (%2,%6,2),%2                     \n"
+  "movdqa     %%xmm3,%%xmm8                    \n"
+  "punpckldq  %%xmm7,%%xmm3                    \n"
+  "movlpd     %%xmm3,(%1)                      \n"
+  "movhpd     %%xmm3,(%2)                      \n"
+  "punpckhdq  %%xmm7,%%xmm8                    \n"
+  "sub        $0x8,%3                          \n"  // width -= 8; sets flags for jg
+  "movlpd     %%xmm8,(%1,%5)                   \n"
+  "lea        (%1,%5,2),%1                     \n"
+  "movhpd     %%xmm8,(%2,%6)                   \n"
+  "lea        (%2,%6,2),%2                     \n"
+  "jg         1b                               \n"  // tested after the body: assumes width > 0 on entry
+  : "+r"(src),    // %0
+    "+r"(dst_a),  // %1
+    "+r"(dst_b),  // %2
+    "+r"(width)   // %3
+  : "r"((intptr_t)(src_stride)),    // %4
+    "r"((intptr_t)(dst_stride_a)),  // %5
+    "r"((intptr_t)(dst_stride_b))   // %6
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+    "xmm8", "xmm9"
+);
+}
+#endif
+#endif
+
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/rotate_mips.cc b/libs/libvpx/third_party/libyuv/source/rotate_mips.cc
new file mode 100644
index 0000000000..efe6bd909e
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/rotate_mips.cc
@@ -0,0 +1,484 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,  // Transposes an 8-row strip of src, one column per pass:
+                             uint8* dst, int dst_stride, int width) {  // the 8 bytes of column i are stored at dst + i * dst_stride.
+   __asm__ __volatile__ (  // NOTE: width is not tested up front and the loops end only at exactly 0, so width must be > 0.
+      ".set push                                         \n"
+      ".set noreorder                                    \n" // branch delay slots are filled by hand below
+      "sll              $t2, %[src_stride], 0x1          \n" // src_stride x 2
+      "sll              $t4, %[src_stride], 0x2          \n" // src_stride x 4
+      "sll              $t9, %[src_stride], 0x3          \n" // src_stride x 8
+      "addu             $t3, $t2, %[src_stride]          \n" // src_stride x 3
+      "addu             $t5, $t4, %[src_stride]          \n" // src_stride x 5
+      "addu             $t6, $t2, $t4                    \n" // src_stride x 6
+      "andi             $t0, %[dst], 0x3                 \n"
+      "andi             $t1, %[dst_stride], 0x3          \n"
+      "or               $t0, $t0, $t1                    \n" // non-zero if dst or dst_stride is not a multiple of 4
+      "bnez             $t0, 11f                         \n"
+      " subu            $t7, $t9, %[src_stride]          \n" // delay slot: src_stride x 7
+// dst and dst_stride word aligned: results are written with plain sw
+    "1:                                                  \n"
+      "lbu              $t0, 0(%[src])                   \n" // row 0
+      "lbux             $t1, %[src_stride](%[src])       \n" // row 1
+      "lbux             $t8, $t2(%[src])                 \n" // row 2
+      "lbux             $t9, $t3(%[src])                 \n" // row 3
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s0, $t8, $t0                    \n" // s0 = bytes of rows 0..3 packed into one word
+      "lbux             $t0, $t4(%[src])                 \n" // row 4
+      "lbux             $t1, $t5(%[src])                 \n" // row 5
+      "lbux             $t8, $t6(%[src])                 \n" // row 6
+      "lbux             $t9, $t7(%[src])                 \n" // row 7
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s1, $t8, $t0                    \n" // s1 = bytes of rows 4..7 packed into one word
+      "sw               $s0, 0(%[dst])                   \n"
+      "addiu            %[width], -1                     \n"
+      "addiu            %[src], 1                        \n" // next source column
+      "sw               $s1, 4(%[dst])                   \n"
+      "bnez             %[width], 1b                     \n"
+      " addu            %[dst], %[dst], %[dst_stride]    \n" // delay slot: next dst row
+      "b                2f                               \n"
+      " nop                                              \n" // delay slot: keeps the first load of the 11: loop from running past the end of src
+   "11:                                                  \n" // dst or dst_stride unaligned: each word is written with a swr/swl pair
+      "lbu              $t0, 0(%[src])                   \n" // row 0
+      "lbux             $t1, %[src_stride](%[src])       \n" // row 1
+      "lbux             $t8, $t2(%[src])                 \n" // row 2
+      "lbux             $t9, $t3(%[src])                 \n" // row 3
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s0, $t8, $t0                    \n" // s0 = bytes of rows 0..3 packed into one word
+      "lbux             $t0, $t4(%[src])                 \n" // row 4
+      "lbux             $t1, $t5(%[src])                 \n" // row 5
+      "lbux             $t8, $t6(%[src])                 \n" // row 6
+      "lbux             $t9, $t7(%[src])                 \n" // row 7
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s1, $t8, $t0                    \n" // s1 = bytes of rows 4..7 packed into one word
+      "swr              $s0, 0(%[dst])                   \n"
+      "swl              $s0, 3(%[dst])                   \n"
+      "addiu            %[width], -1                     \n"
+      "addiu            %[src], 1                        \n" // next source column
+      "swr              $s1, 4(%[dst])                   \n"
+      "swl              $s1, 7(%[dst])                   \n"
+      "bnez             %[width], 11b                    \n"
+      " addu            %[dst], %[dst], %[dst_stride]    \n" // delay slot: next dst row
+    "2:                                                  \n"
+      ".set pop                                          \n"
+      :[src] "+r" (src),
+       [dst] "+r" (dst),
+       [width] "+r" (width)
+      :[src_stride] "r" (src_stride),
+       [dst_stride] "r" (dst_stride)
+      : "t0", "t1",  "t2", "t3", "t4", "t5",
+        "t6", "t7", "t8", "t9",
+        "s0", "s1", "memory"  // "memory": the asm writes through dst, which is not declared as a memory operand
+  );
+}
+
+void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride,  // Transposes an 8-row strip of src four columns (one word per row) per pass.
+                                  uint8* dst, int dst_stride, int width) {  // Runs width / 4 passes; each pass writes four 8-byte dst rows.
+  __asm__ __volatile__ (  // NOTE(review): a non-zero width below 4 starts the pass counter at 0 and it wraps; presumably callers pass multiples of 4 - confirm.
+      ".set noat                                         \n"  // $AT is used as the pass counter below
+      ".set push                                         \n"
+      ".set noreorder                                    \n"  // branch delay slots are filled by hand below
+      "beqz             %[width], 2f                     \n"  // nothing to do when width == 0
+      " sll             $t2, %[src_stride], 0x1          \n"  // src_stride x 2
+      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
+      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
+      "addu             $t3, $t2, %[src_stride]          \n"  // src_stride x 3
+      "addu             $t5, $t4, %[src_stride]          \n"  // src_stride x 5
+      "addu             $t6, $t2, $t4                    \n"  // src_stride x 6
+
+      "srl              $AT, %[width], 0x2               \n"  // pass counter = width / 4
+      "andi             $t0, %[dst], 0x3                 \n"
+      "andi             $t1, %[dst_stride], 0x3          \n"
+      "or               $t0, $t0, $t1                    \n"  // non-zero if dst or dst_stride is not a multiple of 4
+      "bnez             $t0, 11f                         \n"
+      " subu            $t7, $t9, %[src_stride]          \n"  // delay slot: src_stride x 7
+// dst and dst_stride word aligned: results are written with plain sw
+      "1:                                                \n"
+      "lw               $t0, 0(%[src])                   \n"
+      "lwx              $t1, %[src_stride](%[src])       \n"
+      "lwx              $t8, $t2(%[src])                 \n"
+      "lwx              $t9, $t3(%[src])                 \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 21 | 01 | 20 | 00 |
+  // s1 = | 23 | 03 | 22 | 02 |
+  // s2 = | 31 | 11 | 30 | 10 |
+  // s3 = | 33 | 13 | 32 | 12 |
+
+      "precr.qb.ph     $s4, $s1, $s0                     \n"
+      "precrq.qb.ph    $s5, $s1, $s0                     \n"
+      "precr.qb.ph     $s6, $s3, $s2                     \n"
+      "precrq.qb.ph    $s7, $s3, $s2                     \n"
+
+  // s4 = | 03 | 02 | 01 | 00 |
+  // s5 = | 23 | 22 | 21 | 20 |
+  // s6 = | 13 | 12 | 11 | 10 |
+  // s7 = | 33 | 32 | 31 | 30 |
+
+      "lwx              $t0, $t4(%[src])                 \n"
+      "lwx              $t1, $t5(%[src])                 \n"
+      "lwx              $t8, $t6(%[src])                 \n"
+      "lwx              $t9, $t7(%[src])                 \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 25 | 05 | 24 | 04 |
+  // s1 = | 27 | 07 | 26 | 06 |
+  // s2 = | 35 | 15 | 34 | 14 |
+  // s3 = | 37 | 17 | 36 | 16 |
+
+      "precr.qb.ph     $t0, $s1, $s0                     \n"
+      "precrq.qb.ph    $t1, $s1, $s0                     \n"
+      "precr.qb.ph     $t8, $s3, $s2                     \n"
+      "precrq.qb.ph    $t9, $s3, $s2                     \n"
+
+  // t0 = | 07 | 06 | 05 | 04 |
+  // t1 = | 27 | 26 | 25 | 24 |
+  // t8 = | 17 | 16 | 15 | 14 |
+  // t9 = | 37 | 36 | 35 | 34 |
+
+      "addu            $s0, %[dst], %[dst_stride]        \n"  // dst row 1
+      "addu            $s1, $s0, %[dst_stride]           \n"  // dst row 2
+      "addu            $s2, $s1, %[dst_stride]           \n"  // dst row 3
+
+      "sw              $s4, 0(%[dst])                    \n"
+      "sw              $t0, 4(%[dst])                    \n"
+      "sw              $s6, 0($s0)                       \n"
+      "sw              $t8, 4($s0)                       \n"
+      "sw              $s5, 0($s1)                       \n"
+      "sw              $t1, 4($s1)                       \n"
+      "sw              $s7, 0($s2)                       \n"
+      "sw              $t9, 4($s2)                       \n"
+
+      "addiu            $AT, -1                          \n"
+      "addiu            %[src], 4                        \n"  // next four source columns
+
+      "bnez             $AT, 1b                          \n"
+      " addu            %[dst], $s2, %[dst_stride]       \n"  // delay slot: dst advances four rows
+      "b                2f                               \n"
+      " nop                                              \n"  // delay slot: keeps the first lw of the 11: loop from running past the end of src
+      "11:                                               \n"  // dst or dst_stride unaligned: each word is written with a swr/swl pair
+      "lw               $t0, 0(%[src])                   \n"
+      "lwx              $t1, %[src_stride](%[src])       \n"
+      "lwx              $t8, $t2(%[src])                 \n"
+      "lwx              $t9, $t3(%[src])                 \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 21 | 01 | 20 | 00 |
+  // s1 = | 23 | 03 | 22 | 02 |
+  // s2 = | 31 | 11 | 30 | 10 |
+  // s3 = | 33 | 13 | 32 | 12 |
+
+      "precr.qb.ph     $s4, $s1, $s0                     \n"
+      "precrq.qb.ph    $s5, $s1, $s0                     \n"
+      "precr.qb.ph     $s6, $s3, $s2                     \n"
+      "precrq.qb.ph    $s7, $s3, $s2                     \n"
+
+  // s4 = | 03 | 02 | 01 | 00 |
+  // s5 = | 23 | 22 | 21 | 20 |
+  // s6 = | 13 | 12 | 11 | 10 |
+  // s7 = | 33 | 32 | 31 | 30 |
+
+      "lwx              $t0, $t4(%[src])                 \n"
+      "lwx              $t1, $t5(%[src])                 \n"
+      "lwx              $t8, $t6(%[src])                 \n"
+      "lwx              $t9, $t7(%[src])                 \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 25 | 05 | 24 | 04 |
+  // s1 = | 27 | 07 | 26 | 06 |
+  // s2 = | 35 | 15 | 34 | 14 |
+  // s3 = | 37 | 17 | 36 | 16 |
+
+      "precr.qb.ph     $t0, $s1, $s0                     \n"
+      "precrq.qb.ph    $t1, $s1, $s0                     \n"
+      "precr.qb.ph     $t8, $s3, $s2                     \n"
+      "precrq.qb.ph    $t9, $s3, $s2                     \n"
+
+  // t0 = | 07 | 06 | 05 | 04 |
+  // t1 = | 27 | 26 | 25 | 24 |
+  // t8 = | 17 | 16 | 15 | 14 |
+  // t9 = | 37 | 36 | 35 | 34 |
+
+      "addu            $s0, %[dst], %[dst_stride]        \n"  // dst row 1
+      "addu            $s1, $s0, %[dst_stride]           \n"  // dst row 2
+      "addu            $s2, $s1, %[dst_stride]           \n"  // dst row 3
+
+      "swr              $s4, 0(%[dst])                   \n"
+      "swl              $s4, 3(%[dst])                   \n"
+      "swr              $t0, 4(%[dst])                   \n"
+      "swl              $t0, 7(%[dst])                   \n"
+      "swr              $s6, 0($s0)                      \n"
+      "swl              $s6, 3($s0)                      \n"
+      "swr              $t8, 4($s0)                      \n"
+      "swl              $t8, 7($s0)                      \n"
+      "swr              $s5, 0($s1)                      \n"
+      "swl              $s5, 3($s1)                      \n"
+      "swr              $t1, 4($s1)                      \n"
+      "swl              $t1, 7($s1)                      \n"
+      "swr              $s7, 0($s2)                      \n"
+      "swl              $s7, 3($s2)                      \n"
+      "swr              $t9, 4($s2)                      \n"
+      "swl              $t9, 7($s2)                      \n"
+
+      "addiu            $AT, -1                          \n"
+      "addiu            %[src], 4                        \n"  // next four source columns
+
+      "bnez             $AT, 11b                         \n"
+      " addu            %[dst], $s2, %[dst_stride]       \n"  // delay slot: dst advances four rows
+      "2:                                                \n"
+      ".set pop                                          \n"
+      ".set at                                           \n"
+      :[src] "+r" (src),
+       [dst] "+r" (dst),
+       [width] "+r" (width)
+      :[src_stride] "r" (src_stride),
+       [dst_stride] "r" (dst_stride)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
+        "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "memory"  // "memory": the asm writes through dst, which is not a memory operand
+  );
+}
+
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,  // Transposes an 8-row strip of interleaved byte pairs and splits the pairs:
+                               uint8* dst_a, int dst_stride_a,    // the a/A bytes of the register diagrams below go to dst_a,
+                               uint8* dst_b, int dst_stride_b,    // the b/B bytes go to dst_b.
+                               int width) {  // Runs width / 2 passes; each pass reads one word per row (two pairs) and writes two rows per plane.
+  __asm__ __volatile__ (  // NOTE(review): width == 1 starts the pass counter at 0 and it wraps; presumably callers pass even widths - confirm.
+      ".set push                                         \n"
+      ".set noreorder                                    \n" // branch delay slots are filled by hand below
+      "beqz            %[width], 2f                      \n" // nothing to do when width == 0
+      " sll            $t2, %[src_stride], 0x1           \n" // src_stride x 2
+      "sll             $t4, %[src_stride], 0x2           \n" // src_stride x 4
+      "sll             $t9, %[src_stride], 0x3           \n" // src_stride x 8
+      "addu            $t3, $t2, %[src_stride]           \n" // src_stride x 3
+      "addu            $t5, $t4, %[src_stride]           \n" // src_stride x 5
+      "addu            $t6, $t2, $t4                     \n" // src_stride x 6
+      "subu            $t7, $t9, %[src_stride]           \n" // src_stride x 7
+      "srl             $t1, %[width], 1                  \n" // pass counter = width / 2
+
+// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
+      "andi            $t0, %[dst_a], 0x3                \n"
+      "andi            $t8, %[dst_b], 0x3                \n"
+      "or              $t0, $t0, $t8                     \n"
+      "andi            $t8, %[dst_stride_a], 0x3         \n"
+      "andi            $s5, %[dst_stride_b], 0x3         \n"
+      "or              $t8, $t8, $s5                     \n"
+      "or              $t0, $t0, $t8                     \n" // non-zero if any of the four is not a multiple of 4
+      "bnez            $t0, 11f                          \n"
+      " nop                                              \n"
+// dst + dst_stride word aligned (both, a & b dst addresses)
+    "1:                                                  \n"
+      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
+      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
+      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
+      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
+      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
+      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
+
+      "sw              $s3, 0($s5)                       \n"
+      "sw              $s4, 0($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
+
+      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
+      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
+      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
+      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
+      "sw              $s3, 0(%[dst_a])                  \n"
+      "sw              $s4, 0(%[dst_b])                  \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B7|A7|B6|A6|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
+      "sw              $s3, 4($s5)                       \n"
+      "sw              $s4, 4($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
+
+      "addiu           %[src], 4                         \n" // next two source pairs
+      "addiu           $t1, -1                           \n"
+      "sll             $t0, %[dst_stride_a], 1           \n"
+      "sll             $t8, %[dst_stride_b], 1           \n"
+      "sw              $s3, 4(%[dst_a])                  \n"
+      "sw              $s4, 4(%[dst_b])                  \n"
+      "addu            %[dst_a], %[dst_a], $t0           \n" // dst_a advances two rows
+      "bnez            $t1, 1b                           \n"
+      " addu           %[dst_b], %[dst_b], $t8           \n" // delay slot: dst_b advances two rows
+      "b               2f                                \n"
+      " nop                                              \n"
+
+// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
+   "11:                                                  \n"
+      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
+      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
+      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
+      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
+      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
+      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
+
+      "swr             $s3, 0($s5)                       \n"
+      "swl             $s3, 3($s5)                       \n"
+      "swr             $s4, 0($s6)                       \n"
+      "swl             $s4, 3($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
+
+      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
+      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
+      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
+      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
+      "swr             $s3, 0(%[dst_a])                  \n"
+      "swl             $s3, 3(%[dst_a])                  \n"
+      "swr             $s4, 0(%[dst_b])                  \n"
+      "swl             $s4, 3(%[dst_b])                  \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B7|A7|B6|A6|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
+
+      "swr             $s3, 4($s5)                       \n"
+      "swl             $s3, 7($s5)                       \n"
+      "swr             $s4, 4($s6)                       \n"
+      "swl             $s4, 7($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
+
+      "addiu           %[src], 4                         \n" // next two source pairs
+      "addiu           $t1, -1                           \n"
+      "sll             $t0, %[dst_stride_a], 1           \n"
+      "sll             $t8, %[dst_stride_b], 1           \n"
+      "swr             $s3, 4(%[dst_a])                  \n"
+      "swl             $s3, 7(%[dst_a])                  \n"
+      "swr             $s4, 4(%[dst_b])                  \n"
+      "swl             $s4, 7(%[dst_b])                  \n"
+      "addu            %[dst_a], %[dst_a], $t0           \n" // dst_a advances two rows
+      "bnez            $t1, 11b                          \n"
+      " addu           %[dst_b], %[dst_b], $t8           \n" // delay slot: dst_b advances two rows
+
+      "2:                                                \n"
+      ".set pop                                          \n"
+      : [src] "+r" (src),
+        [dst_a] "+r" (dst_a),
+        [dst_b] "+r" (dst_b),
+        [width] "+r" (width),            // only read by the asm
+        [src_stride] "+r" (src_stride)   // only read by the asm
+      : [dst_stride_a] "r" (dst_stride_a),
+        [dst_stride_b] "r" (dst_stride_b)
+      : "t0", "t1",  "t2", "t3",  "t4", "t5",
+        "t6", "t7", "t8", "t9",
+        "s0", "s1", "s2", "s3",
+        "s4", "s5", "s6", "memory"  // "memory": the asm writes through dst_a/dst_b, which are not memory operands
+  );
+}
+
+#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/rotate_neon.cc b/libs/libvpx/third_party/libyuv/source/rotate_neon.cc
new file mode 100644
index 0000000000..76043b3b3c
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/rotate_neon.cc
@@ -0,0 +1,535 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
+
+static uvec8 kVTbl4x4Transpose =  // byte-index table loaded into q3 and used by the vtbl.8 shuffles of the 4x8 residual path
+  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
+// Transposes an 8-row strip of src: 8x8 byte blocks in the main loop, then 4-, 2- and 1-column residual blocks.
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride,
+                       int width) {
+  const uint8* src_temp = NULL;  // scratch pointer (%0) that walks the rows of the current block
+  asm volatile (
+    // The main loop handles 8 columns per pass and exits once the
+    // counter drops below 0. Starting the counter at width - 8
+    // makes the "bge" at the bottom of the loop test for that.
+    "sub         %5, #8                        \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    ".p2align  2                               \n"
+    "1:                                        \n"
+      "mov         %0, %1                      \n"
+      // load 8 rows of 8 bytes, stepping by src_stride
+      MEMACCESS(0)
+      "vld1.8      {d0}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d1}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d2}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d3}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d4}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d5}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d6}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d7}, [%0]                  \n"
+      // transpose in three stages: byte, halfword, then word lanes
+      "vtrn.8      d1, d0                      \n"
+      "vtrn.8      d3, d2                      \n"
+      "vtrn.8      d5, d4                      \n"
+      "vtrn.8      d7, d6                      \n"
+
+      "vtrn.16     d1, d3                      \n"
+      "vtrn.16     d0, d2                      \n"
+      "vtrn.16     d5, d7                      \n"
+      "vtrn.16     d4, d6                      \n"
+
+      "vtrn.32     d1, d5                      \n"
+      "vtrn.32     d0, d4                      \n"
+      "vtrn.32     d3, d7                      \n"
+      "vtrn.32     d2, d6                      \n"
+      // swap the two bytes inside every 16-bit lane
+      "vrev16.8    q0, q0                      \n"
+      "vrev16.8    q1, q1                      \n"
+      "vrev16.8    q2, q2                      \n"
+      "vrev16.8    q3, q3                      \n"
+
+      "mov         %0, %3                      \n"
+      // store 8 rows of 8 bytes, stepping by dst_stride
+    MEMACCESS(0)
+      "vst1.8      {d1}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d0}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d3}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d2}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d5}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d4}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d7}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d6}, [%0]                  \n"
+
+      "add         %1, #8                      \n"  // src += 8
+      "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
+      "subs        %5,  #8                     \n"  // w   -= 8
+      "bge         1b                          \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds        %5, #8                        \n"
+    "beq         4f                            \n"
+
+    // some residual, so between 1 and 7 columns left to transpose
+    "cmp         %5, #2                        \n"
+    "blt         3f                            \n"
+
+    "cmp         %5, #4                        \n"
+    "blt         2f                            \n"
+
+    // 4x8 block: 4 bytes from each of the 8 rows, two rows per d register
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.32     {d0[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d0[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d1[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d1[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d2[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d2[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d3[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d3[1]}, [%0]                 \n"
+
+    "mov         %0, %3                        \n"
+    // q3 (d6, d7) = kVTbl4x4Transpose shuffle indices
+    MEMACCESS(6)
+    "vld1.8      {q3}, [%6]                    \n"
+
+    "vtbl.8      d4, {d0, d1}, d6              \n"  // rows 0-3, shuffled
+    "vtbl.8      d5, {d0, d1}, d7              \n"
+    "vtbl.8      d0, {d2, d3}, d6              \n"  // rows 4-7, shuffled
+    "vtbl.8      d1, {d2, d3}, d7              \n"
+
+    // TODO(frkoenig): Rework shuffle above to
+    // write out with 4 instead of 8 writes.
+    MEMACCESS(0)
+    "vst1.32     {d4[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d4[1]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d5[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d5[1]}, [%0]                 \n"
+
+    "add         %0, %3, #4                    \n"  // second half of each 8-byte dst row
+    MEMACCESS(0)
+    "vst1.32     {d0[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d0[1]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d1[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d1[1]}, [%0]                 \n"
+
+    "add         %1, #4                        \n"  // src += 4
+    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
+    "subs        %5,  #4                       \n"  // w   -= 4
+    "beq         4f                            \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp         %5, #2                        \n"
+    "blt         3f                            \n"
+
+    // 2x8 block: 2 bytes per row; even rows land in d0, odd rows in d1
+    "2:                                        \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[2]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[2]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[3]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[3]}, [%0]                 \n"
+
+    "vtrn.8      d0, d1                        \n"  // d0 and d1 are then stored as the two 8-byte dst rows
+
+    "mov         %0, %3                        \n"
+
+    MEMACCESS(0)
+    "vst1.64     {d0}, [%0], %4                \n"
+    MEMACCESS(0)
+    "vst1.64     {d1}, [%0]                    \n"
+
+    "add         %1, #2                        \n"  // src += 2
+    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
+    "subs        %5,  #2                       \n"  // w   -= 2
+    "beq         4f                            \n"
+
+    // 1x8 block: gather one byte from each of the 8 rows into d0
+    "3:                                        \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[0]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[1]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[2]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[3]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[4]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[5]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[6]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[7]}, [%1]                 \n"
+
+    MEMACCESS(3)
+    "vst1.64     {d0}, [%3]                    \n"  // written as a single 8-byte dst row
+
+    "4:                                        \n"
+
+    : "+r"(src_temp),          // %0
+      "+r"(src),               // %1
+      "+r"(src_stride),        // %2
+      "+r"(dst),               // %3
+      "+r"(dst_stride),        // %4
+      "+r"(width)              // %5
+    : "r"(&kVTbl4x4Transpose)  // %6
+    : "memory", "cc", "q0", "q1", "q2", "q3"
+  );
+}
+
+static const uvec8 kVTbl4x4TransposeDi =  // read-only vtbl indices: interleave bytes of a 2-register table (0..7 with 8..15)
+  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width) {
+  const uint8* src_temp = NULL;  // scratch row pointer (%0), reloaded before each load/store run
+  asm volatile (
+    // Transposes 8 source rows of interleaved byte pairs: the first byte of
+    // every pair ends up in dst_a and the second in dst_b. `width` counts pairs
+    // and is consumed 8 at a time; starting at w-8 lets the loop exit once negative.
+    "sub         %7, #8                        \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    ".p2align  2                               \n"
+    "1:                                        \n"
+      "mov         %0, %1                      \n"
+
+      MEMACCESS(0)
+      "vld2.8      {d0,  d1},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d2,  d3},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d4,  d5},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d6,  d7},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d16, d17}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d18, d19}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d20, d21}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d22, d23}, [%0]            \n"
+      // transpose in three passes: 8-bit, 16-bit, then 32-bit lanes
+      "vtrn.8      q1, q0                      \n"
+      "vtrn.8      q3, q2                      \n"
+      "vtrn.8      q9, q8                      \n"
+      "vtrn.8      q11, q10                    \n"
+
+      "vtrn.16     q1, q3                      \n"
+      "vtrn.16     q0, q2                      \n"
+      "vtrn.16     q9, q11                     \n"
+      "vtrn.16     q8, q10                     \n"
+
+      "vtrn.32     q1, q9                      \n"
+      "vtrn.32     q0, q8                      \n"
+      "vtrn.32     q3, q11                     \n"
+      "vtrn.32     q2, q10                     \n"
+
+      "vrev16.8    q0, q0                      \n"
+      "vrev16.8    q1, q1                      \n"
+      "vrev16.8    q2, q2                      \n"
+      "vrev16.8    q3, q3                      \n"
+      "vrev16.8    q8, q8                      \n"
+      "vrev16.8    q9, q9                      \n"
+      "vrev16.8    q10, q10                    \n"
+      "vrev16.8    q11, q11                    \n"
+      // even-numbered d registers are written to dst_a, odd-numbered ones to dst_b
+      "mov         %0, %3                      \n"
+
+    MEMACCESS(0)
+      "vst1.8      {d2},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d0},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d6},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d4},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d18}, [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d16}, [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d22}, [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d20}, [%0]                 \n"
+
+      "mov         %0, %5                      \n"
+
+    MEMACCESS(0)
+      "vst1.8      {d3},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d1},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d7},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d5},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d19}, [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d17}, [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d23}, [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d21}, [%0]                 \n"
+
+      "add         %1, #8*2                    \n"  // src   += 8*2
+      "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
+      "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
+      "subs        %7,  #8                     \n"  // w     -= 8
+      "bge         1b                          \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds        %7, #8                        \n"
+    "beq         4f                            \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp         %7, #2                        \n"
+    "blt         3f                            \n"
+
+    "cmp         %7, #4                        \n"
+    "blt         2f                            \n"
+
+    // TODO(frkoenig): Clean this up
+    // 4x8 block
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.64     {d0}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d1}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d2}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d3}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d4}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d5}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d6}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d7}, [%0]                    \n"
+    // q15 (d30/d31) <- vtbl indices from kVTbl4x4TransposeDi; q15 is listed as a clobber below
+    MEMACCESS(8)
+    "vld1.8      {q15}, [%8]                   \n"
+
+    "vtrn.8      q0, q1                        \n"
+    "vtrn.8      q2, q3                        \n"
+
+    "vtbl.8      d16, {d0, d1}, d30            \n"
+    "vtbl.8      d17, {d0, d1}, d31            \n"
+    "vtbl.8      d18, {d2, d3}, d30            \n"
+    "vtbl.8      d19, {d2, d3}, d31            \n"
+    "vtbl.8      d20, {d4, d5}, d30            \n"
+    "vtbl.8      d21, {d4, d5}, d31            \n"
+    "vtbl.8      d22, {d6, d7}, d30            \n"
+    "vtbl.8      d23, {d6, d7}, d31            \n"
+
+    "mov         %0, %3                        \n"
+
+    MEMACCESS(0)
+    "vst1.32     {d16[0]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d16[1]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d17[0]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d17[1]},  [%0], %4           \n"
+
+    "add         %0, %3, #4                    \n"
+    MEMACCESS(0)
+    "vst1.32     {d20[0]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d20[1]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d21[0]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d21[1]}, [%0]                \n"
+
+    "mov         %0, %5                        \n"
+
+    MEMACCESS(0)
+    "vst1.32     {d18[0]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d18[1]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d19[0]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d19[1]}, [%0], %6            \n"
+
+    "add         %0, %5, #4                    \n"
+    MEMACCESS(0)
+    "vst1.32     {d22[0]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d22[1]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d23[0]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d23[1]},  [%0]               \n"
+
+    "add         %1, #4*2                      \n"  // src   += 4 * 2
+    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
+    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
+    "subs        %7,  #4                       \n"  // w     -= 4
+    "beq         4f                            \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp         %7, #2                        \n"
+    "blt         3f                            \n"
+
+    // 2x8 block
+    "2:                                        \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[3], d3[3]}, [%0]          \n"
+
+    "vtrn.8      d0, d1                        \n"
+    "vtrn.8      d2, d3                        \n"
+
+    "mov         %0, %3                        \n"
+
+    MEMACCESS(0)
+    "vst1.64     {d0}, [%0], %4                \n"
+    MEMACCESS(0)
+    "vst1.64     {d2}, [%0]                    \n"
+
+    "mov         %0, %5                        \n"
+
+    MEMACCESS(0)
+    "vst1.64     {d1}, [%0], %6                \n"
+    MEMACCESS(0)
+    "vst1.64     {d3}, [%0]                    \n"
+
+    "add         %1, #2*2                      \n"  // src   += 2 * 2
+    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
+    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
+    "subs        %7,  #2                       \n"  // w     -= 2
+    "beq         4f                            \n"
+
+    // 1x8 block
+    "3:                                        \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[7], d1[7]}, [%1]          \n"
+
+    MEMACCESS(3)
+    "vst1.64     {d0}, [%3]                    \n"
+    MEMACCESS(5)
+    "vst1.64     {d1}, [%5]                    \n"
+
+    "4:                                        \n"
+
+    : "+r"(src_temp),            // %0
+      "+r"(src),                 // %1
+      "+r"(src_stride),          // %2
+      "+r"(dst_a),               // %3
+      "+r"(dst_stride_a),        // %4
+      "+r"(dst_b),               // %5
+      "+r"(dst_stride_b),        // %6
+      "+r"(width)                // %7
+    : "r"(&kVTbl4x4TransposeDi)  // %8
+    : "memory", "cc",
+      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q15"  // q15: vtbl indices (4x8 path)
+  );
+}
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/rotate_neon64.cc b/libs/libvpx/third_party/libyuv/source/rotate_neon64.cc
new file mode 100644
index 0000000000..f52c082b3f
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/rotate_neon64.cc
@@ -0,0 +1,543 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+static const uvec8 kVTbl4x4Transpose =  // read-only tbl indices: gather every 4th byte (4x4 byte transpose of one vector)
+  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width) {
+  const uint8* src_temp = NULL;  // scratch row pointer (%0), reloaded before each load/store run
+  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  asm volatile (
+    // Transposes 8 source rows of `width` bytes into `width` destination rows
+    // of 8 bytes. Columns are consumed in blocks of 8; starting the counter at
+    // w-8 lets the loop stop as soon as it goes negative.
+    "sub         %3, %3, #8                      \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    "1:                                          \n"
+      "mov         %0, %1                        \n"
+
+      MEMACCESS(0)
+      "ld1        {v0.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v1.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v2.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v3.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v4.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v5.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v6.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v7.8b}, [%0]                  \n"
+      // transpose in three passes: byte, halfword, then word lanes
+      "trn2     v16.8b, v0.8b, v1.8b             \n"
+      "trn1     v17.8b, v0.8b, v1.8b             \n"
+      "trn2     v18.8b, v2.8b, v3.8b             \n"
+      "trn1     v19.8b, v2.8b, v3.8b             \n"
+      "trn2     v20.8b, v4.8b, v5.8b             \n"
+      "trn1     v21.8b, v4.8b, v5.8b             \n"
+      "trn2     v22.8b, v6.8b, v7.8b             \n"
+      "trn1     v23.8b, v6.8b, v7.8b             \n"
+
+      "trn2     v3.4h, v17.4h, v19.4h            \n"
+      "trn1     v1.4h, v17.4h, v19.4h            \n"
+      "trn2     v2.4h, v16.4h, v18.4h            \n"
+      "trn1     v0.4h, v16.4h, v18.4h            \n"
+      "trn2     v7.4h, v21.4h, v23.4h            \n"
+      "trn1     v5.4h, v21.4h, v23.4h            \n"
+      "trn2     v6.4h, v20.4h, v22.4h            \n"
+      "trn1     v4.4h, v20.4h, v22.4h            \n"
+
+      "trn2     v21.2s, v1.2s, v5.2s             \n"
+      "trn1     v17.2s, v1.2s, v5.2s             \n"
+      "trn2     v20.2s, v0.2s, v4.2s             \n"
+      "trn1     v16.2s, v0.2s, v4.2s             \n"
+      "trn2     v23.2s, v3.2s, v7.2s             \n"
+      "trn1     v19.2s, v3.2s, v7.2s             \n"
+      "trn2     v22.2s, v2.2s, v6.2s             \n"
+      "trn1     v18.2s, v2.2s, v6.2s             \n"
+
+      "mov         %0, %2                        \n"
+
+    MEMACCESS(0)
+      "st1      {v17.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v16.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v19.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v18.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v21.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v20.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v23.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v22.8b}, [%0]                   \n"
+
+      "add         %1, %1, #8                    \n"  // src += 8
+      "add         %2, %2, %6, lsl #3            \n"  // dst += 8 * dst_stride
+      "subs        %3, %3, #8                    \n"  // w   -= 8
+      "b.ge        1b                            \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds        %3, %3, #8                      \n"
+    "b.eq        4f                              \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp         %3, #2                          \n"
+    "b.lt        3f                              \n"
+
+    "cmp         %3, #4                          \n"
+    "b.lt        2f                              \n"
+
+    // 4x8 block: 4 bytes from each of the 8 rows are packed into v0 (rows 0-3) and v1 (rows 4-7)
+    "mov         %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[3], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[3], [%0]                     \n"
+
+    "mov         %0, %2                          \n"
+    // v2 <- byte shuffle indices from kVTbl4x4Transpose (%4)
+    MEMACCESS(4)
+    "ld1      {v2.16b}, [%4]                     \n"
+
+    "tbl      v3.16b, {v0.16b}, v2.16b           \n"
+    "tbl      v0.16b, {v1.16b}, v2.16b           \n"
+
+    // TODO(frkoenig): Rework shuffle above to
+    // write out with 4 instead of 8 writes.
+    MEMACCESS(0)
+    "st1 {v3.s}[0], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[1], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[2], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[3], [%0]                         \n"
+
+    "add         %0, %2, #4                      \n"  // second half of each output row
+    MEMACCESS(0)
+    "st1 {v0.s}[0], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[1], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[2], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[3], [%0]                         \n"
+
+    "add         %1, %1, #4                      \n"  // src += 4
+    "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
+    "subs        %3, %3, #4                      \n"  // w   -= 4
+    "b.eq        4f                              \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp         %3, #2                          \n"
+    "b.lt        3f                              \n"
+
+    // 2x8 block: 2 bytes per row; even rows land in v0, odd rows in v1
+    "2:                                          \n"
+    "mov         %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[3], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[3], [%0]                     \n"
+
+    "trn2    v2.8b, v0.8b, v1.8b                 \n"
+    "trn1    v3.8b, v0.8b, v1.8b                 \n"
+
+    "mov         %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1     {v3.8b}, [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1     {v2.8b}, [%0]                       \n"
+
+    "add         %1, %1, #2                      \n"  // src += 2
+    "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
+    "subs        %3, %3,  #2                     \n"  // w   -= 2
+    "b.eq        4f                              \n"
+
+    // 1x8 block: one byte from each of the 8 rows forms a single output row
+    "3:                                          \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[0], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[1], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[2], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[3], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[4], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[5], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[6], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[7], [%1]                 \n"
+
+    MEMACCESS(2)
+    "st1         {v0.8b}, [%2]                   \n"
+
+    "4:                                          \n"
+
+    : "+r"(src_temp),                             // %0
+      "+r"(src),                                  // %1
+      "+r"(dst),                                  // %2
+      "+r"(width64)                               // %3
+    : "r"(&kVTbl4x4Transpose),                    // %4
+      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
+    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+  );
+}
+
+static const uint8 kVTbl4x4TransposeDi[32] =  // read-only tbl indices into a 4-register (64-byte) table
+  { 0,  16, 32, 48,  2, 18, 34, 50,  4, 20, 36, 52,  6, 22, 38, 54,   // even bytes of each row
+    1,  17, 33, 49,  3, 19, 35, 51,  5, 21, 37, 53,  7, 23, 39, 55};  // odd bytes of each row
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width) {
+  const uint8* src_temp = NULL;  // scratch row pointer (%0), reloaded before each load/store run
+  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  asm volatile (
+    // Transposes 8 source rows of interleaved byte pairs: the first byte of
+    // every pair ends up in dst_a and the second in dst_b. `width` counts pairs
+    // and is consumed 8 at a time; starting at w-8 lets the loop exit once negative.
+    "sub       %4, %4, #8                      \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    "1:                                        \n"
+    "mov       %0, %1                          \n"
+
+    MEMACCESS(0)
+    "ld1       {v0.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v1.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v2.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v3.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v4.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v5.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v6.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v7.16b}, [%0]                  \n"
+    // transpose in three passes: byte, halfword, then word lanes
+    "trn1      v16.16b, v0.16b, v1.16b         \n"
+    "trn2      v17.16b, v0.16b, v1.16b         \n"
+    "trn1      v18.16b, v2.16b, v3.16b         \n"
+    "trn2      v19.16b, v2.16b, v3.16b         \n"
+    "trn1      v20.16b, v4.16b, v5.16b         \n"
+    "trn2      v21.16b, v4.16b, v5.16b         \n"
+    "trn1      v22.16b, v6.16b, v7.16b         \n"
+    "trn2      v23.16b, v6.16b, v7.16b         \n"
+
+    "trn1      v0.8h, v16.8h, v18.8h           \n"
+    "trn2      v1.8h, v16.8h, v18.8h           \n"
+    "trn1      v2.8h, v20.8h, v22.8h           \n"
+    "trn2      v3.8h, v20.8h, v22.8h           \n"
+    "trn1      v4.8h, v17.8h, v19.8h           \n"
+    "trn2      v5.8h, v17.8h, v19.8h           \n"
+    "trn1      v6.8h, v21.8h, v23.8h           \n"
+    "trn2      v7.8h, v21.8h, v23.8h           \n"
+
+    "trn1      v16.4s, v0.4s, v2.4s            \n"
+    "trn2      v17.4s, v0.4s, v2.4s            \n"
+    "trn1      v18.4s, v1.4s, v3.4s            \n"
+    "trn2      v19.4s, v1.4s, v3.4s            \n"
+    "trn1      v20.4s, v4.4s, v6.4s            \n"
+    "trn2      v21.4s, v4.4s, v6.4s            \n"
+    "trn1      v22.4s, v5.4s, v7.4s            \n"
+    "trn2      v23.4s, v5.4s, v7.4s            \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v16.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v17.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v19.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v16.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v17.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v19.d}[1], [%0]                \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v20.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v22.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v21.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v23.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v20.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v22.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v21.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v23.d}[1], [%0]                \n"
+
+    "add       %1, %1, #16                     \n"  // src   += 8*2
+    "add       %2, %2, %6, lsl #3              \n"  // dst_a += 8 * dst_stride_a
+    "add       %3, %3, %7, lsl #3              \n"  // dst_b += 8 * dst_stride_b
+    "subs      %4, %4,  #8                     \n"  // w     -= 8
+    "b.ge      1b                              \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds      %4, %4, #8                      \n"
+    "b.eq      4f                              \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp       %4, #2                          \n"
+    "b.lt      3f                              \n"
+
+    "cmp       %4, #4                          \n"
+    "b.lt      2f                              \n"
+
+    // TODO(frkoenig): Clean this up
+    // 4x8 block
+    "mov       %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1       {v0.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v1.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v2.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v3.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v4.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v5.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v6.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v7.8b}, [%0]                   \n"
+    // %8 is an input-only operand, so it must not be written back: fetch both
+    // 16-byte halves of the index table with a single two-register ld1.
+    MEMACCESS(8)
+    "ld1       {v30.16b, v31.16b}, [%8]        \n"
+
+    "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
+    "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
+    "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
+    "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v16.s}[0],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[1],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[2],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[3],  [%0], %6           \n"
+
+    "add       %0, %2, #4                      \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[2], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[3], [%0]                \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v17.s}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[2], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[3], [%0], %7            \n"
+
+    "add       %0, %3, #4                      \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[0],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[1],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[2],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[3],  [%0]               \n"
+
+    "add       %1, %1, #8                      \n"  // src   += 4 * 2
+    "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 * dst_stride_a
+    "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 * dst_stride_b
+    "subs      %4,  %4,  #4                    \n"  // w     -= 4
+    "b.eq      4f                              \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp       %4, #2                          \n"
+    "b.lt      3f                              \n"
+
+    // 2x8 block
+    "2:                                        \n"
+    "mov       %0, %1                          \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[3], [%0]           \n"
+
+    "trn1      v4.8b, v0.8b, v2.8b             \n"
+    "trn2      v5.8b, v0.8b, v2.8b             \n"
+    "trn1      v6.8b, v1.8b, v3.8b             \n"
+    "trn2      v7.8b, v1.8b, v3.8b             \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v4.d}[0], [%0], %6             \n"
+    MEMACCESS(0)
+    "st1       {v6.d}[0], [%0]                 \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v5.d}[0], [%0], %7             \n"
+    MEMACCESS(0)
+    "st1       {v7.d}[0], [%0]                 \n"
+
+    "add       %1, %1, #4                      \n"  // src   += 2 * 2
+    "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 * dst_stride_a
+    "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 * dst_stride_b
+    "subs      %4,  %4,  #2                    \n"  // w     -= 2
+    "b.eq      4f                              \n"
+
+    // 1x8 block
+    "3:                                        \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[7], [%1]           \n"
+
+    MEMACCESS(2)
+    "st1       {v0.d}[0], [%2]                 \n"
+    MEMACCESS(3)
+    "st1       {v1.d}[0], [%3]                 \n"
+
+    "4:                                        \n"
+
+    : "+r"(src_temp),                             // %0
+      "+r"(src),                                  // %1
+      "+r"(dst_a),                                // %2
+      "+r"(dst_b),                                // %3
+      "+r"(width64)                               // %4
+    : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
+      "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
+      "r"(&kVTbl4x4TransposeDi)                   // %8
+    : "memory", "cc",
+      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+      "v30", "v31"
+  );
+}
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/rotate_win.cc b/libs/libvpx/third_party/libyuv/source/rotate_win.cc
new file mode 100644
index 0000000000..2760066dfd
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/rotate_win.cc
@@ -0,0 +1,248 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    defined(_MSC_VER) && !defined(__clang__)
+
+__declspec(naked)  // Transposes an 8-row strip: per pass, 8 rows x 8 bytes in -> 8 rows x 8 bytes out.
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
+  __asm {
+    push      edi                   // naked function: registers are saved/restored by hand
+    push      esi
+    push      ebp
+    mov       eax, [esp + 12 + 4]   // src (12 = three pushes, 4 = return address)
+    mov       edi, [esp + 12 + 8]   // src_stride
+    mov       edx, [esp + 12 + 12]  // dst
+    mov       esi, [esp + 12 + 16]  // dst_stride
+    mov       ecx, [esp + 12 + 20]  // width
+
+    // Read 8 bytes from each of 8 consecutive source rows.
+    // First round of swaps: interleave the bytes of each pair of rows.
+    align      4
+ convertloop:
+    movq      xmm0, qword ptr [eax]        // row 0
+    lea       ebp, [eax + 8]               // ebp = row 0 of the next 8 columns
+    movq      xmm1, qword ptr [eax + edi]  // row 1
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm0, xmm1                   // rows 0/1 interleaved bytewise
+    movq      xmm2, qword ptr [eax]        // row 2
+    movdqa    xmm1, xmm0
+    palignr   xmm1, xmm1, 8                // xmm1 = xmm0 with its 8-byte halves swapped
+    movq      xmm3, qword ptr [eax + edi]  // row 3
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm2, xmm3                   // rows 2/3 interleaved bytewise
+    movdqa    xmm3, xmm2
+    movq      xmm4, qword ptr [eax]        // row 4
+    palignr   xmm3, xmm3, 8                // xmm3 = xmm2 with halves swapped
+    movq      xmm5, qword ptr [eax + edi]  // row 5
+    punpcklbw xmm4, xmm5                   // rows 4/5 interleaved bytewise
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm5, xmm4
+    movq      xmm6, qword ptr [eax]        // row 6
+    palignr   xmm5, xmm5, 8                // xmm5 = xmm4 with halves swapped
+    movq      xmm7, qword ptr [eax + edi]  // row 7
+    punpcklbw xmm6, xmm7                   // rows 6/7 interleaved bytewise
+    mov       eax, ebp                     // src = row 0, 8 bytes further right
+    movdqa    xmm7, xmm6
+    palignr   xmm7, xmm7, 8                // xmm7 = xmm6 with halves swapped
+    // Second round of swaps: interleave 16-bit units (rows 0-3 and rows 4-7).
+    punpcklwd xmm0, xmm2
+    punpcklwd xmm1, xmm3
+    movdqa    xmm2, xmm0
+    movdqa    xmm3, xmm1
+    palignr   xmm2, xmm2, 8
+    palignr   xmm3, xmm3, 8
+    punpcklwd xmm4, xmm6
+    punpcklwd xmm5, xmm7
+    movdqa    xmm6, xmm4
+    movdqa    xmm7, xmm5
+    palignr   xmm6, xmm6, 8
+    palignr   xmm7, xmm7, 8
+    // Third round of swaps: interleave 32-bit units (rows 0-3 with rows 4-7).
+    // Each movq below stores one finished 8-byte destination row.
+    punpckldq xmm0, xmm4
+    movq      qword ptr [edx], xmm0        // dst row 0 = source column 0
+    movdqa    xmm4, xmm0
+    palignr   xmm4, xmm4, 8
+    movq      qword ptr [edx + esi], xmm4  // dst row 1
+    lea       edx, [edx + 2 * esi]
+    punpckldq xmm2, xmm6
+    movdqa    xmm6, xmm2
+    palignr   xmm6, xmm6, 8
+    movq      qword ptr [edx], xmm2        // dst row 2
+    punpckldq xmm1, xmm5
+    movq      qword ptr [edx + esi], xmm6  // dst row 3
+    lea       edx, [edx + 2 * esi]
+    movdqa    xmm5, xmm1
+    movq      qword ptr [edx], xmm1        // dst row 4
+    palignr   xmm5, xmm5, 8
+    punpckldq xmm3, xmm7
+    movq      qword ptr [edx + esi], xmm5  // dst row 5
+    lea       edx, [edx + 2 * esi]
+    movq      qword ptr [edx], xmm3        // dst row 6
+    movdqa    xmm7, xmm3
+    palignr   xmm7, xmm7, 8
+    sub       ecx, 8                       // width -= 8; sets the flags tested by jg
+    movq      qword ptr [edx + esi], xmm7  // dst row 7 (movq does not touch the flags)
+    lea       edx, [edx + 2 * esi]         // lea does not touch the flags either
+    jg        convertloop                  // loop while width > 0
+
+    pop       ebp                          // restore in reverse push order
+    pop       esi
+    pop       edi
+    ret
+  }
+}
+
+__declspec(naked)  // Transposes 8 rows of byte pairs: first byte of each pair -> dst_a, second -> dst_b.
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int w) {
+  __asm {
+    push      ebx                   // naked function: registers are saved/restored by hand
+    push      esi
+    push      edi
+    push      ebp
+    mov       eax, [esp + 16 + 4]   // src (16 = four pushes, 4 = return address)
+    mov       edi, [esp + 16 + 8]   // src_stride
+    mov       edx, [esp + 16 + 12]  // dst_a
+    mov       esi, [esp + 16 + 16]  // dst_stride_a
+    mov       ebx, [esp + 16 + 20]  // dst_b
+    mov       ebp, [esp + 16 + 24]  // dst_stride_b
+    mov       ecx, esp              // remember esp before realigning it
+    sub       esp, 4 + 16           // 16-byte spill slot plus 4 bytes for the old esp
+    and       esp, ~15              // round esp down to a 16-byte boundary
+    mov       [esp + 16], ecx       // old esp is stored just above the spill slot
+    mov       ecx, [ecx + 16 + 28]  // w
+
+    align      4
+ convertloop:
+    // Read 16 bytes from each of 8 consecutive source rows.
+    // First round of swaps: interleave the bytes of each pair of rows.
+    movdqu    xmm0, [eax]        // row 0
+    movdqu    xmm1, [eax + edi]  // row 1
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm0  // use xmm7 as temp register.
+    punpcklbw xmm0, xmm1
+    punpckhbw xmm7, xmm1
+    movdqa    xmm1, xmm7
+    movdqu    xmm2, [eax]        // row 2
+    movdqu    xmm3, [eax + edi]  // row 3
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm2
+    punpcklbw xmm2, xmm3
+    punpckhbw xmm7, xmm3
+    movdqa    xmm3, xmm7
+    movdqu    xmm4, [eax]        // row 4
+    movdqu    xmm5, [eax + edi]  // row 5
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm4
+    punpcklbw xmm4, xmm5
+    punpckhbw xmm7, xmm5
+    movdqa    xmm5, xmm7
+    movdqu    xmm6, [eax]        // row 6
+    movdqu    xmm7, [eax + edi]  // row 7
+    lea       eax, [eax + 2 * edi]
+    movdqu    [esp], xmm5  // backup xmm5 in the stack spill slot
+    neg       edi          // edi = -src_stride for the rewind below
+    movdqa    xmm5, xmm6   // use xmm5 as temp register.
+    punpcklbw xmm6, xmm7
+    punpckhbw xmm5, xmm7
+    movdqa    xmm7, xmm5
+    lea       eax, [eax + 8 * edi + 16]  // back up 8 rows, advance 16 bytes
+    neg       edi                        // edi = src_stride again
+    // Second round of swaps: interleave 16-bit units.
+    movdqa    xmm5, xmm0
+    punpcklwd xmm0, xmm2
+    punpckhwd xmm5, xmm2
+    movdqa    xmm2, xmm5
+    movdqa    xmm5, xmm1
+    punpcklwd xmm1, xmm3
+    punpckhwd xmm5, xmm3
+    movdqa    xmm3, xmm5
+    movdqa    xmm5, xmm4
+    punpcklwd xmm4, xmm6
+    punpckhwd xmm5, xmm6
+    movdqa    xmm6, xmm5
+    movdqu    xmm5, [esp]  // restore xmm5
+    movdqu    [esp], xmm6  // backup xmm6
+    movdqa    xmm6, xmm5    // use xmm6 as temp register.
+    punpcklwd xmm5, xmm7
+    punpckhwd xmm6, xmm7
+    movdqa    xmm7, xmm6
+    // Third round of swaps: interleave 32-bit units.
+    // Each register's low 8 bytes go to a dst_a row, its high 8 bytes to a dst_b row.
+    movdqa    xmm6, xmm0
+    punpckldq xmm0, xmm4
+    punpckhdq xmm6, xmm4
+    movdqa    xmm4, xmm6
+    movdqu    xmm6, [esp]  // restore xmm6
+    movlpd    qword ptr [edx], xmm0        // dst_a row 0
+    movhpd    qword ptr [ebx], xmm0        // dst_b row 0
+    movlpd    qword ptr [edx + esi], xmm4  // dst_a row 1
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm4  // dst_b row 1
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
+    punpckldq xmm2, xmm6
+    movlpd    qword ptr [edx], xmm2        // dst_a row 2
+    movhpd    qword ptr [ebx], xmm2        // dst_b row 2
+    punpckhdq xmm0, xmm6
+    movlpd    qword ptr [edx + esi], xmm0  // dst_a row 3
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0  // dst_b row 3
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
+    punpckldq xmm1, xmm5
+    movlpd    qword ptr [edx], xmm1        // dst_a row 4
+    movhpd    qword ptr [ebx], xmm1        // dst_b row 4
+    punpckhdq xmm0, xmm5
+    movlpd    qword ptr [edx + esi], xmm0  // dst_a row 5
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0  // dst_b row 5
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    punpckldq xmm3, xmm7
+    movlpd    qword ptr [edx], xmm3        // dst_a row 6
+    movhpd    qword ptr [ebx], xmm3        // dst_b row 6
+    punpckhdq xmm0, xmm7
+    sub       ecx, 8                       // w -= 8; sets the flags tested by jg
+    movlpd    qword ptr [edx + esi], xmm0  // dst_a row 7 (stores and lea leave the flags alone)
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0  // dst_b row 7
+    lea       ebx, [ebx + 2 * ebp]
+    jg        convertloop                  // loop while w > 0
+
+    mov       esp, [esp + 16]  // reload the esp value saved before alignment
+    pop       ebp              // restore in reverse push order
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+  }
+}
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/row_any.cc b/libs/libvpx/third_party/libyuv/source/row_any.cc
new file mode 100644
index 0000000000..1cb1f6b930
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/row_any.cc
@@ -0,0 +1,680 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h>  // For memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Subsampled size, rounded up when width is not a multiple of (1 << shift).
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
+
+// Any 3 planes to 1: whole SIMD groups in place, the remainder via a buffer.
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                 \
+    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
+                 uint8* dst_ptr, int width) {                                  \
+      const int rem = width & MASK;    /* pixels past the last whole group */  \
+      const int full = width & ~MASK;  /* pixels in whole groups */            \
+      SIMD_ALIGNED(uint8 buf[64 * 4]);  /* 64 bytes each: Y, U, V, output */   \
+      memset(buf, 0, 64 * 3);  /* for YUY2 and msan */                         \
+      if (full > 0) {                                                          \
+        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, full);                          \
+      }                                                                        \
+      memcpy(buf, y_buf + full, rem);                                          \
+      memcpy(buf + 64, u_buf + (full >> UVSHIFT), SS(rem, UVSHIFT));           \
+      memcpy(buf + 128, v_buf + (full >> UVSHIFT), SS(rem, UVSHIFT));          \
+      ANY_SIMD(buf, buf + 64, buf + 128, buf + 192, MASK + 1);                 \
+      memcpy(dst_ptr + (full >> DUVSHIFT) * BPP, buf + 192,                    \
+             SS(rem, DUVSHIFT) * BPP);                                         \
+    }
+
+#ifdef HAS_I422TOARGBROW_SSSE3  // ANY31 args: wrapper, SIMD row fn, UVSHIFT, DUVSHIFT, BPP, MASK
+ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif  // HAS_I422TOARGBROW_SSSE3
+#ifdef HAS_I444TOARGBROW_SSSE3
+ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+ANY31(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
+ANY31(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+ANY31(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7)
+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
+#endif  // HAS_I444TOARGBROW_SSSE3 (this one guard covers all twelve wrappers above)
+#ifdef HAS_I422TORGB24ROW_AVX2
+ANY31(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
+#endif  // HAS_I422TORGB24ROW_AVX2
+#ifdef HAS_I422TORAWROW_AVX2
+ANY31(I422ToRAWRow_Any_AVX2, I422ToRAWRow_AVX2, 1, 0, 3, 15)
+#endif  // HAS_I422TORAWROW_AVX2
+#ifdef HAS_J422TOARGBROW_SSSE3
+ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif  // HAS_J422TOARGBROW_SSSE3
+#ifdef HAS_J422TOARGBROW_AVX2
+ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif  // HAS_J422TOARGBROW_AVX2
+#ifdef HAS_I422TOARGBROW_AVX2
+ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif  // HAS_I422TOARGBROW_AVX2
+#ifdef HAS_I422TOBGRAROW_AVX2
+ANY31(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, 1, 0, 4, 15)
+#endif  // HAS_I422TOBGRAROW_AVX2
+#ifdef HAS_I422TORGBAROW_AVX2
+ANY31(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
+#endif  // HAS_I422TORGBAROW_AVX2
+#ifdef HAS_I422TOABGRROW_AVX2
+ANY31(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15)
+#endif  // HAS_I422TOABGRROW_AVX2
+#ifdef HAS_I444TOARGBROW_AVX2
+ANY31(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
+#endif  // HAS_I444TOARGBROW_AVX2
+#ifdef HAS_I411TOARGBROW_AVX2
+ANY31(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
+#endif  // HAS_I411TOARGBROW_AVX2
+#ifdef HAS_I422TOARGB4444ROW_AVX2
+ANY31(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
+#endif  // HAS_I422TOARGB4444ROW_AVX2
+#ifdef HAS_I422TOARGB1555ROW_AVX2
+ANY31(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
+#endif  // HAS_I422TOARGB1555ROW_AVX2
+#ifdef HAS_I422TORGB565ROW_AVX2
+ANY31(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
+#endif  // HAS_I422TORGB565ROW_AVX2
+#ifdef HAS_I422TOARGBROW_NEON
+ANY31(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
+ANY31(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
+ANY31(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
+ANY31(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, 1, 0, 4, 7)
+ANY31(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, 1, 0, 4, 7)
+ANY31(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
+ANY31(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
+ANY31(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, 1, 0, 3, 7)
+ANY31(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
+ANY31(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
+ANY31(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
+#endif  // HAS_I422TOARGBROW_NEON (this one guard covers all eleven wrappers above)
+#ifdef HAS_I422TOYUY2ROW_NEON
+ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
+#endif  // HAS_I422TOYUY2ROW_NEON
+#ifdef HAS_I422TOUYVYROW_NEON
+ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
+#endif  // HAS_I422TOUYVYROW_NEON
+#undef ANY31
+
+// Any 2 planes to 1: whole SIMD groups in place, the remainder via a buffer.
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)              \
+    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
+                 uint8* dst_ptr, int width) {                                  \
+      const int rem = width & MASK;    /* pixels past the last whole group */  \
+      const int full = width & ~MASK;  /* pixels in whole groups */            \
+      SIMD_ALIGNED(uint8 buf[64 * 3]);  /* 64 bytes each: y, uv, output */     \
+      memset(buf, 0, 64 * 2);  /* for msan */                                  \
+      if (full > 0) {                                                          \
+        ANY_SIMD(y_buf, uv_buf, dst_ptr, full);                                \
+      }                                                                        \
+      memcpy(buf, y_buf + full * SBPP, rem * SBPP);                            \
+      memcpy(buf + 64, uv_buf + (full >> UVSHIFT) * SBPP2,                     \
+             SS(rem, UVSHIFT) * SBPP2);                                        \
+      ANY_SIMD(buf, buf + 64, buf + 128, MASK + 1);                            \
+      memcpy(dst_ptr + full * BPP, buf + 128, rem * BPP);                      \
+    }
+
+// Biplanar to RGB.  ANY21 args: wrapper, SIMD row fn, UVSHIFT, SBPP, SBPP2, BPP, MASK.
+#ifdef HAS_NV12TOARGBROW_SSSE3
+ANY21(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+ANY21(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif  // HAS_NV12TOARGBROW_SSSE3
+#ifdef HAS_NV12TOARGBROW_AVX2
+ANY21(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+ANY21(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif  // HAS_NV12TOARGBROW_AVX2
+#ifdef HAS_NV12TOARGBROW_NEON
+ANY21(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
+ANY21(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif  // HAS_NV12TOARGBROW_NEON
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+ANY21(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+ANY21(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+#endif  // HAS_NV12TORGB565ROW_SSSE3
+#ifdef HAS_NV12TORGB565ROW_AVX2
+ANY21(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+ANY21(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+#endif  // HAS_NV12TORGB565ROW_AVX2
+#ifdef HAS_NV12TORGB565ROW_NEON
+ANY21(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+ANY21(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+#endif  // HAS_NV12TORGB565ROW_NEON
+
+// Merge functions: two 1-byte-per-pixel inputs, one 2-byte-per-pixel output.
+#ifdef HAS_MERGEUVROW_SSE2
+ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
+#endif  // HAS_MERGEUVROW_SSE2
+#ifdef HAS_MERGEUVROW_AVX2
+ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
+#endif  // HAS_MERGEUVROW_AVX2
+#ifdef HAS_MERGEUVROW_NEON
+ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
+#endif  // HAS_MERGEUVROW_NEON
+
+// Math functions: both inputs and the output use the same pixel grid (UVSHIFT 0).
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
+#endif  // HAS_ARGBMULTIPLYROW_SSE2
+#ifdef HAS_ARGBADDROW_SSE2
+ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
+#endif  // HAS_ARGBADDROW_SSE2
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
+#endif  // HAS_ARGBSUBTRACTROW_SSE2
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
+#endif  // HAS_ARGBMULTIPLYROW_AVX2
+#ifdef HAS_ARGBADDROW_AVX2
+ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
+#endif  // HAS_ARGBADDROW_AVX2
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
+#endif  // HAS_ARGBSUBTRACTROW_AVX2
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
+#endif  // HAS_ARGBMULTIPLYROW_NEON
+#ifdef HAS_ARGBADDROW_NEON
+ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
+#endif  // HAS_ARGBADDROW_NEON
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
+#endif  // HAS_ARGBSUBTRACTROW_NEON
+#ifdef HAS_SOBELROW_SSE2
+ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
+#endif  // HAS_SOBELROW_SSE2
+#ifdef HAS_SOBELROW_NEON
+ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
+#endif  // HAS_SOBELROW_NEON
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
+#endif  // HAS_SOBELTOPLANEROW_SSE2
+#ifdef HAS_SOBELTOPLANEROW_NEON
+ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
+#endif  // HAS_SOBELTOPLANEROW_NEON
+#ifdef HAS_SOBELXYROW_SSE2
+ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
+#endif  // HAS_SOBELXYROW_SSE2
+#ifdef HAS_SOBELXYROW_NEON
+ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
+#endif  // HAS_SOBELXYROW_NEON
+#undef ANY21
+
+// Any 1 to 1: whole SIMD groups in place, the remainder via a buffer.
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                     \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
+      const int rem = width & MASK;    /* pixels past the last whole group */  \
+      const int full = width & ~MASK;  /* pixels in whole groups */            \
+      SIMD_ALIGNED(uint8 buf[128 * 2]);  /* 128 bytes in, 128 bytes out */     \
+      memset(buf, 0, 128);  /* for YUY2 and msan */                            \
+      if (full > 0) {                                                          \
+        ANY_SIMD(src_ptr, dst_ptr, full);                                      \
+      }                                                                        \
+      memcpy(buf, src_ptr + (full >> UVSHIFT) * SBPP, SS(rem, UVSHIFT) * SBPP);\
+      ANY_SIMD(buf, buf + 128, MASK + 1);                                      \
+      memcpy(dst_ptr + full * BPP, buf + 128, rem * BPP);                      \
+    }
+
+#ifdef HAS_COPYROW_AVX  // ANY11 args: wrapper, SIMD row fn, UVSHIFT, SBPP, BPP, MASK
+ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
+#endif  // HAS_COPYROW_AVX
+#ifdef HAS_COPYROW_SSE2
+ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
+#endif  // HAS_COPYROW_SSE2
+#ifdef HAS_COPYROW_NEON
+ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
+#endif  // HAS_COPYROW_NEON
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
+#endif  // HAS_ARGBTORGB24ROW_SSSE3 (also guards the three SSE2 wrappers above)
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
+#endif  // HAS_ARGBTOARGB4444ROW_AVX2 (also guards the RGB565 and ARGB1555 wrappers)
+#if defined(HAS_J400TOARGBROW_SSE2)
+ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif  // HAS_J400TOARGBROW_SSE2
+#if defined(HAS_J400TOARGBROW_AVX2)
+ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif  // HAS_J400TOARGBROW_AVX2
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif  // HAS_I400TOARGBROW_SSE2
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif  // HAS_I400TOARGBROW_AVX2
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ANY11(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
+#endif  // HAS_YUY2TOARGBROW_SSSE3 (this one guard covers all seven wrappers above)
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif  // HAS_RGB565TOARGBROW_AVX2
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif  // HAS_ARGB1555TOARGBROW_AVX2
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif  // HAS_ARGB4444TOARGBROW_AVX2
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ANY11(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
+ANY11(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
+#endif  // HAS_YUY2TOARGBROW_AVX2
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
+ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
+ANY11(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
+ANY11(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
+#endif  // HAS_ARGBTORGB24ROW_NEON (this one guard covers all nine wrappers above)
+#ifdef HAS_ARGBTOYROW_AVX2
+ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
+#endif  // HAS_ARGBTOYROW_AVX2
+#ifdef HAS_ARGBTOYJROW_AVX2
+ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
+#endif  // HAS_ARGBTOYJROW_AVX2
+#ifdef HAS_UYVYTOYROW_AVX2
+ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
+#endif  // HAS_UYVYTOYROW_AVX2
+#ifdef HAS_YUY2TOYROW_AVX2
+ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
+#endif  // HAS_YUY2TOYROW_AVX2
+#ifdef HAS_ARGBTOYROW_SSSE3
+ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
+#endif  // HAS_ARGBTOYROW_SSSE3
+#ifdef HAS_BGRATOYROW_SSSE3
+ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
+ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
+#endif  // HAS_BGRATOYROW_SSSE3 (also guards the YUY2/UYVY SSE2 wrappers above)
+#ifdef HAS_ARGBTOYJROW_SSSE3
+ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
+#endif  // HAS_ARGBTOYJROW_SSSE3
+#ifdef HAS_ARGBTOYROW_NEON
+ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
+#endif  // HAS_ARGBTOYROW_NEON
+#ifdef HAS_ARGBTOYJROW_NEON
+ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
+#endif  // HAS_ARGBTOYJROW_NEON
+#ifdef HAS_BGRATOYROW_NEON
+ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
+#endif  // HAS_BGRATOYROW_NEON
+#ifdef HAS_ABGRTOYROW_NEON
+ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
+#endif  // HAS_ABGRTOYROW_NEON
+#ifdef HAS_RGBATOYROW_NEON
+ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
+#endif  // HAS_RGBATOYROW_NEON
+#ifdef HAS_RGB24TOYROW_NEON
+ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
+#endif  // HAS_RGB24TOYROW_NEON
+#ifdef HAS_RAWTOYROW_NEON
+ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
+#endif  // HAS_RAWTOYROW_NEON
+#ifdef HAS_RGB565TOYROW_NEON
+ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
+#endif  // HAS_RGB565TOYROW_NEON
+#ifdef HAS_ARGB1555TOYROW_NEON
+ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
+#endif  // HAS_ARGB1555TOYROW_NEON
+#ifdef HAS_ARGB4444TOYROW_NEON
+ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
+#endif  // HAS_ARGB4444TOYROW_NEON
+#ifdef HAS_YUY2TOYROW_NEON
+ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
+#endif  // HAS_YUY2TOYROW_NEON
+#ifdef HAS_UYVYTOYROW_NEON
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
+#endif  // HAS_UYVYTOYROW_NEON; NOTE(review): (0, 2) here vs (1, 4) for YUY2ToYRow_Any_NEON
+#ifdef HAS_RGB24TOARGBROW_NEON
+ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
+#endif  // HAS_RGB24TOARGBROW_NEON
+#ifdef HAS_RAWTOARGBROW_NEON
+ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
+#endif  // HAS_RAWTOARGBROW_NEON
+#ifdef HAS_RGB565TOARGBROW_NEON
+ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
+#endif  // HAS_RGB565TOARGBROW_NEON
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
+#endif  // HAS_ARGB1555TOARGBROW_NEON
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
+#endif  // HAS_ARGB4444TOARGBROW_NEON
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
+#endif  // HAS_ARGBATTENUATEROW_SSSE3
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3)
+#endif  // HAS_ARGBATTENUATEROW_SSE2
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
+#endif  // HAS_ARGBUNATTENUATEROW_SSE2
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
+#endif  // HAS_ARGBATTENUATEROW_AVX2
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
+#endif  // HAS_ARGBUNATTENUATEROW_AVX2
+#ifdef HAS_ARGBATTENUATEROW_NEON
+ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
+#endif  // HAS_ARGBATTENUATEROW_NEON
+#undef ANY11
+
+// Any 1 to 1 with parameter: the extra argument is passed through unchanged.
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                          \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
+                 T shuffler, int width) {                                      \
+      const int rem = width & MASK;    /* pixels past the last whole group */  \
+      const int full = width & ~MASK;  /* pixels in whole groups */            \
+      SIMD_ALIGNED(uint8 buf[64 * 2]);  /* 64 bytes in, 64 bytes out */        \
+      memset(buf, 0, 64);  /* for msan */                                      \
+      if (full > 0) {                                                          \
+        ANY_SIMD(src_ptr, dst_ptr, shuffler, full);                            \
+      }                                                                        \
+      memcpy(buf, src_ptr + full * SBPP, rem * SBPP);                          \
+      ANY_SIMD(buf, buf + 64, shuffler, MASK + 1);                             \
+      memcpy(dst_ptr + full * BPP, buf + 64, rem * BPP);                       \
+    }
+
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)  // ANY11P args: wrapper, SIMD row fn, T, SBPP, BPP, MASK
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
+       const uint32, 4, 2, 3)
+#endif  // HAS_ARGBTORGB565DITHERROW_SSE2
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
+       const uint32, 4, 2, 7)
+#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
+       const uint32, 4, 2, 7)
+#endif  // HAS_ARGBTORGB565DITHERROW_NEON
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
+#endif  // HAS_ARGBSHUFFLEROW_SSE2
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
+#endif  // HAS_ARGBSHUFFLEROW_SSSE3
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
+#endif  // HAS_ARGBSHUFFLEROW_AVX2
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
+#endif  // HAS_ARGBSHUFFLEROW_NEON
+#undef ANY11P
+
+// Any 1 to 1 interpolate.  Reads two source rows, src_stride_ptr bytes apart.
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                             \
+    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
+                 ptrdiff_t src_stride_ptr, int width,                          \
+                 int source_y_fraction) {                                      \
+      const int rem = width & MASK;    /* pixels past the last whole group */  \
+      const int full = width & ~MASK;  /* pixels in whole groups */            \
+      SIMD_ALIGNED(uint8 buf[64 * 3]);  /* 64 bytes each: row 0, row 1, out */ \
+      memset(buf, 0, 64 * 2);  /* for msan */                                  \
+      if (full > 0) {                                                          \
+        ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, full, source_y_fraction);   \
+      }                                                                        \
+      memcpy(buf, src_ptr + full * SBPP, rem * SBPP);                          \
+      memcpy(buf + 64, src_ptr + src_stride_ptr + full * SBPP, rem * SBPP);    \
+      ANY_SIMD(buf + 128, buf, 64, MASK + 1, source_y_fraction);               \
+      memcpy(dst_ptr + full * BPP, buf + 128, rem * BPP);                      \
+    }
+
+#ifdef HAS_INTERPOLATEROW_AVX2  // ANY11T args: wrapper, SIMD row fn, SBPP, BPP, MASK
+ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
+#endif  // HAS_INTERPOLATEROW_AVX2
+#ifdef HAS_INTERPOLATEROW_SSSE3
+ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
+#endif  // HAS_INTERPOLATEROW_SSSE3
+#ifdef HAS_INTERPOLATEROW_SSE2
+ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15)
+#endif  // HAS_INTERPOLATEROW_SSE2
+#ifdef HAS_INTERPOLATEROW_NEON
+ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
+#endif  // HAS_INTERPOLATEROW_NEON
+#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
+ANY11T(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, 1, 1, 3)
+#endif  // HAS_INTERPOLATEROW_MIPS_DSPR2
+#undef ANY11T
+
+// Any 1 to 1 mirror: the first rem source pixels become the tail of dst.
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                                   \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
+      const int rem = width & MASK;    /* pixels past the last whole group */  \
+      const int full = width & ~MASK;  /* pixels in whole groups */            \
+      SIMD_ALIGNED(uint8 buf[64 * 2]);  /* 64 bytes in, 64 bytes out */        \
+      memset(buf, 0, 64);  /* for msan */                                      \
+      if (full > 0) ANY_SIMD(src_ptr + rem * BPP, dst_ptr, full);              \
+      memcpy(buf, src_ptr, rem * BPP);                                         \
+      ANY_SIMD(buf, buf + 64, MASK + 1);                                       \
+      /* Result sits in the last rem pixels of the output group. */            \
+      memcpy(dst_ptr + full * BPP, buf + 64 + (MASK + 1 - rem) * BPP,          \
+             rem * BPP);                                                       \
+    }
+
+#ifdef HAS_MIRRORROW_AVX2  // ANY11M args: wrapper, SIMD row fn, BPP, MASK
+ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
+#endif  // HAS_MIRRORROW_AVX2
+#ifdef HAS_MIRRORROW_SSSE3
+ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
+#endif  // HAS_MIRRORROW_SSSE3
+#ifdef HAS_MIRRORROW_SSE2
+ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15)
+#endif  // HAS_MIRRORROW_SSE2
+#ifdef HAS_MIRRORROW_NEON
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
+#endif  // HAS_MIRRORROW_NEON
+#ifdef HAS_ARGBMIRRORROW_AVX2
+ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
+#endif  // HAS_ARGBMIRRORROW_AVX2
+#ifdef HAS_ARGBMIRRORROW_SSE2
+ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
+#endif  // HAS_ARGBMIRRORROW_SSE2
+#ifdef HAS_ARGBMIRRORROW_NEON
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
+#endif  // HAS_ARGBMIRRORROW_NEON
+#undef ANY11M
+
+// Any 1 plane (memset-style fill): no source row, only a value to store.
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)                                  \
+    void NAMEANY(uint8* dst_ptr, T v32, int width) {                           \
+      const int rem = width & MASK;    /* pixels past the last whole group */  \
+      const int full = width & ~MASK;  /* pixels in whole groups */            \
+      SIMD_ALIGNED(uint8 buf[64]);  /* receives one filled group */            \
+      if (full > 0) {                                                          \
+        ANY_SIMD(dst_ptr, v32, full);                                          \
+      }                                                                        \
+      ANY_SIMD(buf, v32, MASK + 1);                                            \
+      memcpy(dst_ptr + full * BPP, buf, rem * BPP);                            \
+    }
+
+#ifdef HAS_SETROW_X86  // ANY1 args: wrapper, SIMD row fn, T (value type), BPP, MASK
+ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
+#endif  // HAS_SETROW_X86
+#ifdef HAS_SETROW_NEON
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
+#endif  // HAS_SETROW_NEON
+#ifdef HAS_ARGBSETROW_NEON
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
+#endif  // HAS_ARGBSETROW_NEON
+#undef ANY1
+
+// Any 1 to 2.  Outputs UV planes: one source row in, dst_u and dst_v rows out.
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)                 \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
+      const int rem = width & MASK;    /* pixels past the last whole group */  \
+      const int full = width & ~MASK;  /* pixels in whole groups */            \
+      SIMD_ALIGNED(uint8 buf[128 * 3]);  /* 128 bytes each: source, U, V */    \
+      memset(buf, 0, 128);  /* for msan */                                     \
+      if (full > 0) {                                                          \
+        ANY_SIMD(src_ptr, dst_u, dst_v, full);                                 \
+      }                                                                        \
+      memcpy(buf, src_ptr + (full >> UVSHIFT) * BPP, SS(rem, UVSHIFT) * BPP);  \
+      if ((width & 1) && BPP == 4) {  /* repeat last 4 bytes for subsampler */ \
+        memcpy(buf + SS(rem, UVSHIFT) * BPP,                                   \
+               buf + SS(rem, UVSHIFT) * BPP - BPP, 4);                         \
+      }                                                                        \
+      ANY_SIMD(buf, buf + 128, buf + 256, MASK + 1);                           \
+      memcpy(dst_u + (full >> DUVSHIFT), buf + 128, SS(rem, DUVSHIFT));        \
+      memcpy(dst_v + (full >> DUVSHIFT), buf + 256, SS(rem, DUVSHIFT));        \
+    }
+
+#ifdef HAS_SPLITUVROW_SSE2
+ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_MIPS_DSPR2
+ANY12(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOUV422ROW_SSSE3
+ANY12(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
+ANY12(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, 0, 4, 1, 15)
+ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
+ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
+#endif
+#undef ANY12
+
+// Any 1 to 2 with source stride (2 rows of source).  Outputs UV planes.
+// 128 byte row allows for 32 avx ARGB pixels.  temp = 2 src rows, then U, V.
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                          \
+    void NAMEANY(const uint8* src_ptr, int src_stride_ptr,                     \
+                 uint8* dst_u, uint8* dst_v, int width) {                      \
+      SIMD_ALIGNED(uint8 temp[128 * 4]);                                       \
+      memset(temp, 0, 128 * 2);  /* for msan */                                \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);                    \
+      }                                                                        \
+      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
+      memcpy(temp + 128, src_ptr  + src_stride_ptr + (n >> UVSHIFT) * BPP,     \
+             SS(r, UVSHIFT) * BPP);                                            \
+      if ((width & 1) && BPP == 4) {  /* repeat last 4 bytes for subsampler */ \
+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
+               temp + SS(r, UVSHIFT) * BPP - BPP, 4);                          \
+        memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
+               temp + 128 + SS(r, UVSHIFT) * BPP - BPP, 4);                    \
+      }                                                                        \
+      ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \
+      memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
+      memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
+    }
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
+ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
+ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_AVX2
+ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
+ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_SSE2
+ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
+ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_NEON
+ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_NEON
+ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_NEON
+ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_NEON
+ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_NEON
+ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_NEON
+ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_NEON
+ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_NEON
+ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_NEON
+ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB4444TOUVROW_NEON
+ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
+ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
+ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
+#endif
+#undef ANY12S
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/row_common.cc b/libs/libvpx/third_party/libyuv/source/row_common.cc
new file mode 100644
index 0000000000..49875894fe
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/row_common.cc
@@ -0,0 +1,2576 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h>  // For memcpy and memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// llvm x86 is poor at ternary operator, so use branchless min/max.
+
+#define USE_BRANCHLESS 1
+#if USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+  return ((-(v) >> 31) & (v));  // Arithmetic >>: mask is all ones iff v > 0.
+}
+
+static __inline int32 clamp255(int32 v) {
+  return (((255 - (v)) >> 31) | (v)) & 255;  // v > 255 sets all bits -> 255.
+}
+
+static __inline uint32 Clamp(int32 val) {
+  int v = clamp0(val);  // clamp255 alone would map negatives to v & 255.
+  return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+  int m = v >> 31;  // 0 for v >= 0, -1 for v < 0 (arithmetic shift).
+  return (v + m) ^ m;  // With m == -1 this is ~(v - 1), i.e. -v.
+}
+#else  // USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+  return (v < 0) ? 0 : v;  // Lower bound only.
+}
+
+static __inline int32 clamp255(int32 v) {
+  return (v > 255) ? 255 : v;  // Upper bound only.
+}
+
+static __inline uint32 Clamp(int32 val) {
+  int v = clamp0(val);  // Clamp to [0, 255] in two steps.
+  return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+  return (v < 0) ? -v : v;
+}
+#endif  // USE_BRANCHLESS
+
+#ifdef LIBYUV_LITTLE_ENDIAN
+#define WRITEWORD(p, v) *(uint32*)(p) = v  // Single native 32 bit store.
+#else
+static inline void WRITEWORD(uint8* p, uint32 v) {  // Stores low byte first.
+  p[0] = (uint8)(v & 255);
+  p[1] = (uint8)((v >> 8) & 255);
+  p[2] = (uint8)((v >> 16) & 255);
+  p[3] = (uint8)((v >> 24) & 255);
+}
+#endif
+
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {  // 3 source bytes -> 4 output bytes.
+    uint8 b = src_rgb24[0];  // RGB24 bytes are read as B, G, R.
+    uint8 g = src_rgb24[1];
+    uint8 r = src_rgb24[2];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = 255u;  // No alpha byte in the source; store 255.
+    dst_argb += 4;
+    src_rgb24 += 3;
+  }
+}
+
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {  // 3 source bytes -> 4 output bytes.
+    uint8 r = src_raw[0];  // RAW bytes are read as R, G, B.
+    uint8 g = src_raw[1];
+    uint8 b = src_raw[2];
+    dst_argb[0] = b;  // Output order is B, G, R, then 255.
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    src_raw += 3;
+  }
+}
+
+void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_rgb565[0] & 0x1f;  // 5 bits.
+    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);  // 6 bits.
+    uint8 r = src_rgb565[1] >> 3;  // 5 bits.
+    dst_argb[0] = (b << 3) | (b >> 2);  // Expand 5 -> 8 bits.
+    dst_argb[1] = (g << 2) | (g >> 4);  // Expand 6 -> 8 bits.
+    dst_argb[2] = (r << 3) | (r >> 2);
+    dst_argb[3] = 255u;  // No alpha in the source; store 255.
+    dst_argb += 4;
+    src_rgb565 += 2;
+  }
+}
+
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+                         int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb1555[0] & 0x1f;  // 5 bits.
+    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r = (src_argb1555[1] & 0x7c) >> 2;  // Bits 2..6 of byte 1.
+    uint8 a = src_argb1555[1] >> 7;  // Single bit: 0 or 1.
+    dst_argb[0] = (b << 3) | (b >> 2);  // Expand 5 -> 8 bits.
+    dst_argb[1] = (g << 3) | (g >> 2);
+    dst_argb[2] = (r << 3) | (r >> 2);
+    dst_argb[3] = -a;  // a is 0 or 1, so this stores 0 or 255.
+    dst_argb += 4;
+    src_argb1555 += 2;
+  }
+}
+
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+                         int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb4444[0] & 0x0f;  // Low nibble of byte 0.
+    uint8 g = src_argb4444[0] >> 4;  // High nibble of byte 0.
+    uint8 r = src_argb4444[1] & 0x0f;
+    uint8 a = src_argb4444[1] >> 4;
+    dst_argb[0] = (b << 4) | b;  // Duplicate the nibble: 4 -> 8 bits.
+    dst_argb[1] = (g << 4) | g;
+    dst_argb[2] = (r << 4) | r;
+    dst_argb[3] = (a << 4) | a;
+    dst_argb += 4;
+    src_argb4444 += 2;
+  }
+}
+
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {  // 4 source bytes -> 3 output bytes.
+    uint8 b = src_argb[0];
+    uint8 g = src_argb[1];
+    uint8 r = src_argb[2];
+    dst_rgb[0] = b;  // Output order is B, G, R.
+    dst_rgb[1] = g;
+    dst_rgb[2] = r;
+    dst_rgb += 3;
+    src_argb += 4;  // Source byte 3 is not copied.
+  }
+}
+
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {  // 4 source bytes -> 3 output bytes.
+    uint8 b = src_argb[0];
+    uint8 g = src_argb[1];
+    uint8 r = src_argb[2];
+    dst_rgb[0] = r;  // Output order is R, G, B.
+    dst_rgb[1] = g;
+    dst_rgb[2] = b;
+    dst_rgb += 3;
+    src_argb += 4;  // Source byte 3 is not copied.
+  }
+}
+
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {  // Two pixels per 32 bit store.
+    uint8 b0 = src_argb[0] >> 3;  // Keep top 5 bits.
+    uint8 g0 = src_argb[1] >> 2;  // Keep top 6 bits.
+    uint8 r0 = src_argb[2] >> 3;
+    uint8 b1 = src_argb[4] >> 3;
+    uint8 g1 = src_argb[5] >> 2;
+    uint8 r1 = src_argb[6] >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {  // Odd width: one trailing pixel, native 16 bit store.
+    uint8 b0 = src_argb[0] >> 3;
+    uint8 g0 = src_argb[1] >> 2;
+    uint8 r0 = src_argb[2] >> 3;
+    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
+
+// dither4 is a row of 4 values from 4x4 dither matrix.
+// The 4x4 matrix contains values to increase RGB.  When converting to
+// fewer bits (565) this provides an ordered dither.
+// The order in the 4x4 matrix in first byte is upper left.
+// The 4 values are passed as an int, then referenced as an array, so
+// endian will not affect order of the original matrix.  But the dither4
+// will contain the first pixel in the lower byte for little endian
+// or the upper byte for big endian.
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+                             const uint32 dither4, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {  // Pixel x uses dither byte x & 3.
+    int dither0 = ((const unsigned char*)(&dither4))[x & 3];
+    int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
+    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;  // Saturate at 255 first.
+    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
+    uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
+    uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
+    uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {  // Odd width: one trailing pixel, native 16 bit store.
+    int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
+    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
+    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
+    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
+
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {  // Two pixels per 32 bit store.
+    uint8 b0 = src_argb[0] >> 3;  // Keep top 5 bits.
+    uint8 g0 = src_argb[1] >> 3;
+    uint8 r0 = src_argb[2] >> 3;
+    uint8 a0 = src_argb[3] >> 7;  // Keep only the top bit.
+    uint8 b1 = src_argb[4] >> 3;
+    uint8 g1 = src_argb[5] >> 3;
+    uint8 r1 = src_argb[6] >> 3;
+    uint8 a1 = src_argb[7] >> 7;
+    *(uint32*)(dst_rgb) =  // Native store; low 16 bits hold pixel 0.
+        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+        (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {  // Odd width: one trailing pixel, 16 bit store.
+    uint8 b0 = src_argb[0] >> 3;
+    uint8 g0 = src_argb[1] >> 3;
+    uint8 r0 = src_argb[2] >> 3;
+    uint8 a0 = src_argb[3] >> 7;
+    *(uint16*)(dst_rgb) =
+        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+  }
+}
+
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {  // Two pixels per 32 bit store.
+    uint8 b0 = src_argb[0] >> 4;  // Keep the high nibble of each channel.
+    uint8 g0 = src_argb[1] >> 4;
+    uint8 r0 = src_argb[2] >> 4;
+    uint8 a0 = src_argb[3] >> 4;
+    uint8 b1 = src_argb[4] >> 4;
+    uint8 g1 = src_argb[5] >> 4;
+    uint8 r1 = src_argb[6] >> 4;
+    uint8 a1 = src_argb[7] >> 4;
+    *(uint32*)(dst_rgb) =  // Native store; low 16 bits hold pixel 0.
+        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
+        (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {  // Odd width: one trailing pixel, 16 bit store.
+    uint8 b0 = src_argb[0] >> 4;
+    uint8 g0 = src_argb[1] >> 4;
+    uint8 r0 = src_argb[2] >> 4;
+    uint8 a0 = src_argb[3] >> 4;
+    *(uint16*)(dst_rgb) =
+        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+  }
+}
+
+static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+  return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;  // 0x1080 = 16.5 * 256.
+}
+
+static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;  // 0x8080 = 128.5 * 256.
+}
+static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;  // Same bias as RGBToU.
+}
+
+#define MAKEROWY(NAME, R, G, B, BPP) \
+void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
+  int x;                                                                       \
+  for (x = 0; x < width; ++x) {                                                \
+    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
+    src_argb0 += BPP;                                                          \
+    dst_y += 1;                                                                \
+  }                                                                            \
+}                                                                              \
+void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
+                       uint8* dst_u, uint8* dst_v, int width) {                \
+  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
+  int x;                                                                       \
+  for (x = 0; x < width - 1; x += 2) {  /* One U,V per 2x2 pixel block. */     \
+    uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
+               src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
+    uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
+               src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
+    uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
+               src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
+    dst_u[0] = RGBToU(ar, ag, ab);                                             \
+    dst_v[0] = RGBToV(ar, ag, ab);                                             \
+    src_rgb0 += BPP * 2;                                                       \
+    src_rgb1 += BPP * 2;                                                       \
+    dst_u += 1;                                                                \
+    dst_v += 1;                                                                \
+  }                                                                            \
+  if (width & 1) {  /* Odd width: average last column only. */                 \
+    uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
+    uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
+    uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
+    dst_u[0] = RGBToU(ar, ag, ab);                                             \
+    dst_v[0] = RGBToV(ar, ag, ab);                                             \
+  }                                                                            \
+}
+
+MAKEROWY(ARGB, 2, 1, 0, 4)
+MAKEROWY(BGRA, 1, 2, 3, 4)
+MAKEROWY(ABGR, 0, 1, 2, 4)
+MAKEROWY(RGBA, 3, 2, 1, 4)
+MAKEROWY(RGB24, 2, 1, 0, 3)
+MAKEROWY(RAW, 0, 1, 2, 3)
+#undef MAKEROWY
+
+// JPeg uses a variation on BT.601-1 full range
+// y =  0.29900 * r + 0.58700 * g + 0.11400 * b
+// u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
+// v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
+// BT.601 Mpeg range uses:
+// b 0.1016 * 255 = 25.908 = 25
+// g 0.5078 * 255 = 129.489 = 129
+// r 0.2578 * 255 = 65.739 = 66
+// JPeg 8 bit Y (not used):
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
+// JPeg 7 bit Y:
+// b 0.11400 * 128 = 14.592 = 15
+// g 0.58700 * 128 = 75.136 = 75
+// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit U:
+// b  0.50000 * 255 = 127.5 = 127
+// g -0.33126 * 255 = -84.4713 = -84
+// r -0.16874 * 255 = -43.0287 = -43
+// JPeg 8 bit V:
+// b -0.08131 * 255 = -20.73405 = -20
+// g -0.41869 * 255 = -106.76595 = -107
+// r  0.50000 * 255 = 127.5 = 127
+
+static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
+  return (38 * r + 75 * g +  15 * b + 64) >> 7;  // 7 bit weights; +64 rounds.
+}
+
+static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+  return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;  // 0x8080 = 128.5 * 256.
+}
+static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+  return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;  // Same bias as RGBToUJ.
+}
+
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)  // Average of two, rounding up.
+
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
+  int x;                                                                       \
+  for (x = 0; x < width; ++x) {                                                \
+    dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
+    src_argb0 += BPP;                                                          \
+    dst_y += 1;                                                                \
+  }                                                                            \
+}                                                                              \
+void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
+                        uint8* dst_u, uint8* dst_v, int width) {               \
+  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
+  int x;                                                                       \
+  for (x = 0; x < width - 1; x += 2) {  /* One U,V per 2x2 pixel block. */     \
+    uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
+                    AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
+    uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
+                    AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
+    uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
+                    AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
+    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
+    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
+    src_rgb0 += BPP * 2;                                                       \
+    src_rgb1 += BPP * 2;                                                       \
+    dst_u += 1;                                                                \
+    dst_v += 1;                                                                \
+  }                                                                            \
+  if (width & 1) {  /* Odd width: average last column only. */                 \
+    uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
+    uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
+    uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
+    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
+    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
+  }                                                                            \
+}
+
+MAKEROWYJ(ARGB, 2, 1, 0, 4)
+#undef MAKEROWYJ
+
+void ARGBToUVJ422Row_C(const uint8* src_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {  // One U,V pair per 2 pixels.
+    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;  // Truncating average.
+    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+    dst_u[0] = RGBToUJ(ar, ag, ab);
+    dst_v[0] = RGBToVJ(ar, ag, ab);
+    src_argb += 8;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {  // Odd width: last pixel is used as is.
+    uint8 ab = src_argb[0];
+    uint8 ag = src_argb[1];
+    uint8 ar = src_argb[2];
+    dst_u[0] = RGBToUJ(ar, ag, ab);
+    dst_v[0] = RGBToVJ(ar, ag, ab);
+  }
+}
+
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {  // One Y byte per 2 byte pixel.
+    uint8 b = src_rgb565[0] & 0x1f;
+    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r = src_rgb565[1] >> 3;
+    b = (b << 3) | (b >> 2);  // Expand 5 -> 8 bits.
+    g = (g << 2) | (g >> 4);  // Expand 6 -> 8 bits.
+    r = (r << 3) | (r >> 2);
+    dst_y[0] = RGBToY(r, g, b);
+    src_rgb565 += 2;
+    dst_y += 1;
+  }
+}
+
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {  // One Y byte per 2 byte pixel.
+    uint8 b = src_argb1555[0] & 0x1f;
+    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r = (src_argb1555[1] & 0x7c) >> 2;  // Bit 7 of byte 1 is not used.
+    b = (b << 3) | (b >> 2);  // Expand 5 -> 8 bits.
+    g = (g << 3) | (g >> 2);
+    r = (r << 3) | (r >> 2);
+    dst_y[0] = RGBToY(r, g, b);
+    src_argb1555 += 2;
+    dst_y += 1;
+  }
+}
+
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {  // One Y byte per 2 byte pixel.
+    uint8 b = src_argb4444[0] & 0x0f;
+    uint8 g = src_argb4444[0] >> 4;
+    uint8 r = src_argb4444[1] & 0x0f;  // High nibble of byte 1 is not used.
+    b = (b << 4) | b;  // Duplicate the nibble: 4 -> 8 bits.
+    g = (g << 4) | g;
+    r = (r << 4) | r;
+    dst_y[0] = RGBToY(r, g, b);
+    src_argb4444 += 2;
+    dst_y += 1;
+  }
+}
+
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+                     uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;  // Second row.
+  int x;
+  for (x = 0; x < width - 1; x += 2) {  // One U,V per 2x2 pixel block.
+    uint8 b0 = src_rgb565[0] & 0x1f;
+    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r0 = src_rgb565[1] >> 3;
+    uint8 b1 = src_rgb565[2] & 0x1f;
+    uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
+    uint8 r1 = src_rgb565[3] >> 3;
+    uint8 b2 = next_rgb565[0] & 0x1f;
+    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8 r2 = next_rgb565[1] >> 3;
+    uint8 b3 = next_rgb565[2] & 0x1f;
+    uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
+    uint8 r3 = next_rgb565[3] >> 3;
+    uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
+    uint8 g = (g0 + g1 + g2 + g3);  // Sum of four 6 bit values fits 8 bits.
+    uint8 r = (r0 + r1 + r2 + r3);
+    b = (b << 1) | (b >> 6);  // 787 -> 888.
+    r = (r << 1) | (r >> 6);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+    src_rgb565 += 4;
+    next_rgb565 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {  // Odd width: last column, two rows only.
+    uint8 b0 = src_rgb565[0] & 0x1f;
+    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r0 = src_rgb565[1] >> 3;
+    uint8 b2 = next_rgb565[0] & 0x1f;
+    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8 r2 = next_rgb565[1] >> 3;
+    uint8 b = (b0 + b2);  // 565 * 2 = 676.
+    uint8 g = (g0 + g2);
+    uint8 r = (r0 + r2);
+    b = (b << 2) | (b >> 4);  // 676 -> 888
+    g = (g << 1) | (g >> 6);
+    r = (r << 2) | (r >> 4);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+  }
+}
+
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;  // Row 2.
+  int x;
+  for (x = 0; x < width - 1; x += 2) {  // One U,V per 2x2 pixel block.
+    uint8 b0 = src_argb1555[0] & 0x1f;
+    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;  // Red is bits 2..6 of byte 1.
+    uint8 b1 = src_argb1555[2] & 0x1f;
+    uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
+    uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
+    uint8 b2 = next_argb1555[0] & 0x1f;
+    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
+    uint8 b3 = next_argb1555[2] & 0x1f;
+    uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
+    uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
+    uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
+    uint8 g = (g0 + g1 + g2 + g3);
+    uint8 r = (r0 + r1 + r2 + r3);
+    b = (b << 1) | (b >> 6);  // 777 -> 888.
+    g = (g << 1) | (g >> 6);
+    r = (r << 1) | (r >> 6);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+    src_argb1555 += 4;
+    next_argb1555 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {  // Odd width: last column, two rows only.
+    uint8 b0 = src_argb1555[0] & 0x1f;
+    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 b2 = next_argb1555[0] & 0x1f;
+    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;  // Mask out alpha bit 7.
+    uint8 b = (b0 + b2);  // 555 * 2 = 666.
+    uint8 g = (g0 + g2);
+    uint8 r = (r0 + r2);
+    b = (b << 2) | (b >> 4);  // 666 -> 888.
+    g = (g << 2) | (g >> 4);
+    r = (r << 2) | (r >> 4);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+  }
+}
+
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;  // Row 2.
+  int x;
+  for (x = 0; x < width - 1; x += 2) {  // One U,V per 2x2 pixel block.
+    uint8 b0 = src_argb4444[0] & 0x0f;
+    uint8 g0 = src_argb4444[0] >> 4;
+    uint8 r0 = src_argb4444[1] & 0x0f;  // High nibble of byte 1 is not used.
+    uint8 b1 = src_argb4444[2] & 0x0f;
+    uint8 g1 = src_argb4444[2] >> 4;
+    uint8 r1 = src_argb4444[3] & 0x0f;
+    uint8 b2 = next_argb4444[0] & 0x0f;
+    uint8 g2 = next_argb4444[0] >> 4;
+    uint8 r2 = next_argb4444[1] & 0x0f;
+    uint8 b3 = next_argb4444[2] & 0x0f;
+    uint8 g3 = next_argb4444[2] >> 4;
+    uint8 r3 = next_argb4444[3] & 0x0f;
+    uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
+    uint8 g = (g0 + g1 + g2 + g3);
+    uint8 r = (r0 + r1 + r2 + r3);
+    b = (b << 2) | (b >> 4);  // 666 -> 888.
+    g = (g << 2) | (g >> 4);
+    r = (r << 2) | (r >> 4);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+    src_argb4444 += 4;
+    next_argb4444 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {  // Odd width: last column, two rows only.
+    uint8 b0 = src_argb4444[0] & 0x0f;
+    uint8 g0 = src_argb4444[0] >> 4;
+    uint8 r0 = src_argb4444[1] & 0x0f;
+    uint8 b2 = next_argb4444[0] & 0x0f;
+    uint8 g2 = next_argb4444[0] >> 4;
+    uint8 r2 = next_argb4444[1] & 0x0f;
+    uint8 b = (b0 + b2);  // 444 * 2 = 555.
+    uint8 g = (g0 + g2);
+    uint8 r = (r0 + r2);
+    b = (b << 3) | (b >> 2);  // 555 -> 888.
+    g = (g << 3) | (g >> 2);
+    r = (r << 3) | (r >> 2);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+  }
+}
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {  // One U,V pair per pixel.
+    uint8 ab = src_argb[0];
+    uint8 ag = src_argb[1];
+    uint8 ar = src_argb[2];
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb += 4;  // Source byte 3 is not read.
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+void ARGBToUV422Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {  // One U,V pair per 2 pixels.
+    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;  // Truncating average.
+    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb += 8;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {  // Odd width: last pixel is used as is.
+    uint8 ab = src_argb[0];
+    uint8 ag = src_argb[1];
+    uint8 ar = src_argb[2];
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  }
+}
+
+void ARGBToUV411Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  for (x = 0; x < width - 3; x += 4) {  // One U,V pair per 4 pixels.
+    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
+    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
+    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb += 16;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if ((width & 3) == 3) {  // 1 to 3 leftover pixels: average them.
+    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
+    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
+    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  } else if ((width & 3) == 2) {
+    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  } else if ((width & 3) == 1) {
+    uint8 ab = src_argb[0];
+    uint8 ag = src_argb[1];
+    uint8 ar = src_argb[2];
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  }
+}
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);  // Args: r, g, b.
+    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;  // Same value in B, G and R.
+    dst_argb[3] = src_argb[3];  // Byte 3 is copied unchanged.
+    dst_argb += 4;
+    src_argb += 4;
+  }
+}
+
+// Applies a sepia tone to a row of ARGB pixels in place.
+// Each output channel is a weighted sum of the original blue, green and red
+// values in 7-bit fixed point (weights / 128).
+void ARGBSepiaRow_C(uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i, dst_argb += 4) {
+    const int blue = dst_argb[0];
+    const int green = dst_argb[1];
+    const int red = dst_argb[2];
+    const int sepia_b = (blue * 17 + green * 68 + red * 35) >> 7;
+    const int sepia_g = (blue * 22 + green * 88 + red * 45) >> 7;
+    const int sepia_r = (blue * 24 + green * 98 + red * 50) >> 7;
+    // The blue weights sum to 120 (< 128), so sepia_b cannot exceed 255 and
+    // is stored without a clamp. The alpha byte is left untouched.
+    dst_argb[0] = sepia_b;
+    dst_argb[1] = clamp255(sepia_g);
+    dst_argb[2] = clamp255(sepia_r);
+  }
+}
+
+// Applies a signed 4x4 color matrix to a row of ARGB pixels.
+// Output channel ch (0 = blue, 1 = green, 2 = red, 3 = alpha) is the dot
+// product of the source (b, g, r, a) with matrix_argb[ch * 4 .. ch * 4 + 3],
+// shifted right by 6 and passed through Clamp().
+// TODO(fbarchard): Consider adding rounding (+32).
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+                          const int8* matrix_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i, src_argb += 4, dst_argb += 4) {
+    // The whole source pixel is read into locals before any byte is written.
+    const int b = src_argb[0];
+    const int g = src_argb[1];
+    const int r = src_argb[2];
+    const int a = src_argb[3];
+    int sums[4];
+    int ch;
+    for (ch = 0; ch < 4; ++ch) {
+      const int8* coeff = matrix_argb + ch * 4;
+      sums[ch] = (b * coeff[0] + g * coeff[1] +
+                  r * coeff[2] + a * coeff[3]) >> 6;
+    }
+    for (ch = 0; ch < 4; ++ch) {
+      dst_argb[ch] = Clamp(sums[ch]);
+    }
+  }
+}
+
+// Remaps all four channels of each ARGB pixel in place through a lookup table.
+// The table is indexed as value * 4 + channel, i.e. entries for blue, green,
+// red and alpha are interleaved.
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i, dst_argb += 4) {
+    const int b_slot = dst_argb[0] * 4 + 0;
+    const int g_slot = dst_argb[1] * 4 + 1;
+    const int r_slot = dst_argb[2] * 4 + 2;
+    const int a_slot = dst_argb[3] * 4 + 3;
+    dst_argb[0] = table_argb[b_slot];
+    dst_argb[1] = table_argb[g_slot];
+    dst_argb[2] = table_argb[r_slot];
+    dst_argb[3] = table_argb[a_slot];
+  }
+}
+
+// Remaps the blue, green and red channels of each ARGB pixel in place through
+// a lookup table indexed as value * 4 + channel. The alpha byte is not touched.
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i, dst_argb += 4) {
+    const int b_slot = dst_argb[0] * 4 + 0;
+    const int g_slot = dst_argb[1] * 4 + 1;
+    const int r_slot = dst_argb[2] * 4 + 2;
+    dst_argb[0] = table_argb[b_slot];
+    dst_argb[1] = table_argb[g_slot];
+    dst_argb[2] = table_argb[r_slot];
+  }
+}
+
+// Quantizes the blue, green and red channels of each ARGB pixel in place:
+//   out = (in * scale >> 16) * interval_size + interval_offset
+// The alpha byte is not touched.
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+                       int interval_offset, int width) {
+  int i;
+  for (i = 0; i < width; ++i, dst_argb += 4) {
+    const int blue = dst_argb[0];
+    const int green = dst_argb[1];
+    const int red = dst_argb[2];
+    dst_argb[0] = (blue * scale >> 16) * interval_size + interval_offset;
+    dst_argb[1] = (green * scale >> 16) * interval_size + interval_offset;
+    dst_argb[2] = (red * scale >> 16) * interval_size + interval_offset;
+  }
+}
+
+// Copies an 8-bit value into both bytes of a 16-bit value.
+#define REPEAT8(v) ((v) | ((v) << 8))
+// Multiplies two replicated 16-bit values and keeps the top 8 of 32 bits.
+#define SHADE(f, v) (((v) * (f)) >> 24)
+
+// Scales each channel of a row of ARGB pixels by the matching byte of value:
+// bits 0-7 scale blue, 8-15 green, 16-23 red and 24-31 alpha.
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                    uint32 value) {
+  // Per-channel scale factors, computed once for the whole row.
+  const uint32 scale_b = REPEAT8(value & 0xff);
+  const uint32 scale_g = REPEAT8((value >> 8) & 0xff);
+  const uint32 scale_r = REPEAT8((value >> 16) & 0xff);
+  const uint32 scale_a = REPEAT8(value >> 24);
+
+  int x;
+  for (x = 0; x < width; ++x, src_argb += 4, dst_argb += 4) {
+    const uint32 blue = REPEAT8(src_argb[0]);
+    const uint32 green = REPEAT8(src_argb[1]);
+    const uint32 red = REPEAT8(src_argb[2]);
+    const uint32 alpha = REPEAT8(src_argb[3]);
+    dst_argb[0] = SHADE(blue, scale_b);
+    dst_argb[1] = SHADE(green, scale_g);
+    dst_argb[2] = SHADE(red, scale_r);
+    dst_argb[3] = SHADE(alpha, scale_a);
+  }
+}
+#undef REPEAT8
+#undef SHADE
+
+// Copies an 8-bit value into both bytes of a 16-bit value.
+#define REPEAT8(v) ((v) | ((v) << 8))
+// Multiplies a replicated 16-bit value by an 8-bit value, keeping bits 16-23.
+#define SHADE(f, v) (((v) * (f)) >> 16)
+
+// Multiplies two rows of ARGB pixels channel by channel and stores the
+// result in dst_argb.
+void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x, src_argb0 += 4, src_argb1 += 4, dst_argb += 4) {
+    // First source is widened to 16 bits; second source is used as is.
+    const uint32 blue = REPEAT8(src_argb0[0]);
+    const uint32 green = REPEAT8(src_argb0[1]);
+    const uint32 red = REPEAT8(src_argb0[2]);
+    const uint32 alpha = REPEAT8(src_argb0[3]);
+    const uint32 blue_factor = src_argb1[0];
+    const uint32 green_factor = src_argb1[1];
+    const uint32 red_factor = src_argb1[2];
+    const uint32 alpha_factor = src_argb1[3];
+    dst_argb[0] = SHADE(blue, blue_factor);
+    dst_argb[1] = SHADE(green, green_factor);
+    dst_argb[2] = SHADE(red, red_factor);
+    dst_argb[3] = SHADE(alpha, alpha_factor);
+  }
+}
+#undef REPEAT8
+#undef SHADE
+
+// Sum of two channel values, limited by clamp255().
+#define SHADE(f, v) clamp255((v) + (f))
+
+// Adds two rows of ARGB pixels channel by channel (alpha included) and
+// stores the clamped sums in dst_argb.
+void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                  uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x, src_argb0 += 4, src_argb1 += 4, dst_argb += 4) {
+    const int blue = src_argb0[0];
+    const int green = src_argb0[1];
+    const int red = src_argb0[2];
+    const int alpha = src_argb0[3];
+    const int blue_inc = src_argb1[0];
+    const int green_inc = src_argb1[1];
+    const int red_inc = src_argb1[2];
+    const int alpha_inc = src_argb1[3];
+    dst_argb[0] = SHADE(blue, blue_inc);
+    dst_argb[1] = SHADE(green, green_inc);
+    dst_argb[2] = SHADE(red, red_inc);
+    dst_argb[3] = SHADE(alpha, alpha_inc);
+  }
+}
+#undef SHADE
+
+// Difference of two channel values, limited by clamp0().
+#define SHADE(f, v) clamp0((f) - (v))
+
+// Subtracts src_argb1 from src_argb0 channel by channel (alpha included) and
+// stores the clamped differences in dst_argb.
+void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x, src_argb0 += 4, src_argb1 += 4, dst_argb += 4) {
+    const int blue = src_argb0[0];
+    const int green = src_argb0[1];
+    const int red = src_argb0[2];
+    const int alpha = src_argb0[3];
+    const int blue_dec = src_argb1[0];
+    const int green_dec = src_argb1[1];
+    const int red_dec = src_argb1[2];
+    const int alpha_dec = src_argb1[3];
+    dst_argb[0] = SHADE(blue, blue_dec);
+    dst_argb[1] = SHADE(green, green_dec);
+    dst_argb[2] = SHADE(red, red_dec);
+    dst_argb[3] = SHADE(alpha, alpha_dec);
+  }
+}
+#undef SHADE
+
+// Sobel functions which mimic SSSE3.
+// SobelXRow_C: for each output i, takes the column i minus column i + 2
+// difference on three rows, weights the middle row by 2, and stores the
+// absolute value of the sum limited by clamp255().
+// Reads indices 0 .. width + 1 of every source row.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+                 uint8* dst_sobelx, int width) {
+  int col;
+  for (col = 0; col < width; ++col) {
+    const int top_diff = src_y0[col] - src_y0[col + 2];
+    const int mid_diff = src_y1[col] - src_y1[col + 2];
+    const int bottom_diff = src_y2[col] - src_y2[col + 2];
+    const int magnitude = Abs(top_diff + mid_diff * 2 + bottom_diff);
+    dst_sobelx[col] = (uint8)(clamp255(magnitude));
+  }
+}
+
+// For each output i, takes the src_y0 minus src_y1 difference at columns
+// i, i + 1 and i + 2, weights the middle column by 2, and stores the
+// absolute value of the sum limited by clamp255().
+// Reads indices 0 .. width + 1 of both source rows.
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+                 uint8* dst_sobely, int width) {
+  int col;
+  for (col = 0; col < width; ++col) {
+    const int left_diff = src_y0[col + 0] - src_y1[col + 0];
+    const int mid_diff = src_y0[col + 1] - src_y1[col + 1];
+    const int right_diff = src_y0[col + 2] - src_y1[col + 2];
+    const int magnitude = Abs(left_diff + mid_diff * 2 + right_diff);
+    dst_sobely[col] = (uint8)(clamp255(magnitude));
+  }
+}
+
+// Combines the X and Y Sobel rows into opaque gray ARGB: the clamped sum of
+// the two inputs is written to blue, green and red, and alpha is set to 255.
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                uint8* dst_argb, int width) {
+  int col;
+  for (col = 0; col < width; ++col, dst_argb += 4) {
+    const int sobel_x = src_sobelx[col];
+    const int sobel_y = src_sobely[col];
+    const int gray = clamp255(sobel_x + sobel_y);
+    dst_argb[0] = (uint8)(gray);
+    dst_argb[1] = (uint8)(gray);
+    dst_argb[2] = (uint8)(gray);
+    dst_argb[3] = (uint8)(255u);
+  }
+}
+
+// Combines the X and Y Sobel rows into a single plane: each output byte is
+// the sum of the two inputs limited by clamp255().
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_y, int width) {
+  int col;
+  for (col = 0; col < width; ++col) {
+    const int sobel_x = src_sobelx[col];
+    const int sobel_y = src_sobely[col];
+    const int combined = clamp255(sobel_x + sobel_y);
+    dst_y[col] = (uint8)(combined);
+  }
+}
+
+// Packs the Sobel rows into opaque ARGB: blue = Sobel Y, red = Sobel X,
+// green = their sum limited by clamp255(), alpha = 255.
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                  uint8* dst_argb, int width) {
+  int col;
+  for (col = 0; col < width; ++col, dst_argb += 4) {
+    const int sobel_x = src_sobelx[col];
+    const int sobel_y = src_sobely[col];
+    const int combined = clamp255(sobel_x + sobel_y);
+    dst_argb[0] = (uint8)(sobel_y);
+    dst_argb[1] = (uint8)(combined);
+    dst_argb[2] = (uint8)(sobel_x);
+    dst_argb[3] = (uint8)(255u);
+  }
+}
+
+// Expands a row of Y bytes to opaque gray ARGB: each Y value is copied
+// unchanged into blue, green and red, and alpha is set to 255.
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i, ++src_y, dst_argb += 4) {
+    const uint8 luma = src_y[0];
+    dst_argb[0] = luma;
+    dst_argb[1] = luma;
+    dst_argb[2] = luma;
+    dst_argb[3] = 255u;
+  }
+}
+
+// BT.601 YUV to RGB reference
+//  R = (Y - 16) * 1.164              - V * -1.596
+//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
+//  B = (Y - 16) * 1.164 - U * -2.018
+// (U and V above are taken relative to 128; the bias constants below fold
+// that offset in.)
+
+// Y contribution to R,G,B.  Scale and bias.
+// All constants are in 6-bit fixed point (scaled by 64); results are shifted
+// right by 6 at the end.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+// UB is limited to -128 rather than the exact rounded coefficient.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+// Each bias is coefficient * 128 (the chroma offset) plus YGB (the luma
+// offset and rounding term).
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+// C reference code that mimics the YUV assembly.
+// Converts one (y, u, v) sample to 8-bit blue, green and red, written through
+// b, g and r. Each channel is luma term + chroma terms + bias, shifted right
+// by 6 and passed through Clamp().
+static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
+                              uint8* b, uint8* g, uint8* r) {
+  // y * 0x0101 replicates y into 16 bits; after the multiply by YG the top
+  // bits are the scaled luma term shared by all three channels.
+  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+  *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
+  *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6);
+  *r = Clamp((int32)(-(v * VR)+ y1 + BR) >> 6);
+}
+
+// C reference code that mimics the YUV assembly.
+// Luma-only variant of YuvPixel: no chroma terms, so blue, green and red all
+// receive the same value.
+static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
+  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+  *b = Clamp((int32)(y1 + YGB) >> 6);
+  *g = Clamp((int32)(y1 + YGB) >> 6);
+  *r = Clamp((int32)(y1 + YGB) >> 6);
+}
+
+// The constants are private to the two helpers above.
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// *  R = Y                - V * -1.40200
+// *  G = Y - U *  0.34414 - V *  0.71414
+// *  B = Y - U * -1.77200
+// (U and V above are taken relative to 128; the bias constants below fold
+// that offset in.)
+
+// Y contribution to R,G,B.  Scale and bias.
+// All constants are in 6-bit fixed point (scaled by 64); results are shifted
+// right by 6 at the end.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32  /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414  * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to subtract 128 from U and V. Unlike the BT.601 constants, no
+// 16 is subtracted from Y here: YGBJ is only the rounding term (64 / 2).
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
+
+// C reference code that mimics the YUV assembly.
+// Converts one (y, u, v) sample to 8-bit blue, green and red using the JPEG
+// constants above, written through b, g and r. Each channel is luma term +
+// chroma terms + bias, shifted right by 6 and passed through Clamp().
+static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
+                               uint8* b, uint8* g, uint8* r) {
+  // y * 0x0101 replicates y into 16 bits; after the multiply by YGJ the top
+  // bits are the scaled luma term shared by all three channels.
+  uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16;
+  *b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6);
+  *g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6);
+  *r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6);
+}
+
+// The constants are private to the helper above.
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
+// C mimic assembly.
+// TODO(fbarchard): Remove subsampling from Neon.
+// Converts a row of Y, U and V (one U and one V sample per pixel) to opaque
+// ARGB. In this variant the U and V samples of each horizontal pixel pair are
+// averaged (rounding up) and the average is used for both pixels of the pair.
+// A trailing odd pixel uses its own U and V sample.
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
+    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
+    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_u += 2;
+    src_v += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    // The trailing pixel must be opaque like every other pixel of the row;
+    // without this store its alpha byte is left with whatever was there.
+    rgb_buf[3] = 255;
+  }
+}
+#else
+// Converts a row of Y, U and V (one U and one V sample per pixel) to opaque
+// ARGB, one pixel at a time.
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    src_y += 1;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 4;  // Advance 1 pixel.
+  }
+}
+#endif
+
+// Converts a row of Y with one U/V sample per two pixels to opaque ARGB
+// (bytes: blue, green, red, alpha = 255).
+// Also used for 420.
+void I422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int i;
+  // Two pixels per iteration share the same chroma sample.
+  for (i = 0; i < width - 1; i += 2) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6]);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    rgb_buf += 8;  // 2 pixels written.
+  }
+  // Odd width: one last pixel with its own chroma sample.
+  if (width & 1) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Same layout as I422ToARGBRow_C but converts with YuvJPixel(): a row of Y
+// with one U/V sample per two pixels becomes opaque ARGB.
+void J422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int i;
+  // Two pixels per iteration share the same chroma sample.
+  for (i = 0; i < width - 1; i += 2) {
+    YuvJPixel(src_y[0], *src_u, *src_v,
+              &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+    YuvJPixel(src_y[1], *src_u, *src_v,
+              &rgb_buf[4], &rgb_buf[5], &rgb_buf[6]);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    rgb_buf += 8;  // 2 pixels written.
+  }
+  // Odd width: one last pixel.
+  if (width & 1) {
+    YuvJPixel(src_y[0], *src_u, *src_v,
+              &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of Y with one U/V sample per two pixels to 3-byte pixels
+// stored as blue, green, red (no alpha byte).
+void I422ToRGB24Row_C(const uint8* src_y,
+                      const uint8* src_u,
+                      const uint8* src_v,
+                      uint8* rgb_buf,
+                      int width) {
+  int i;
+  // Two pixels per iteration share the same chroma sample.
+  for (i = 0; i < width - 1; i += 2) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[3], &rgb_buf[4], &rgb_buf[5]);
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    rgb_buf += 6;  // 2 pixels written.
+  }
+  // Odd width: one last pixel.
+  if (width & 1) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+  }
+}
+
+// Converts a row of Y with one U/V sample per two pixels to 3-byte pixels
+// stored as red, green, blue (the reverse of I422ToRGB24Row_C's byte order).
+void I422ToRAWRow_C(const uint8* src_y,
+                    const uint8* src_u,
+                    const uint8* src_v,
+                    uint8* rgb_buf,
+                    int width) {
+  int i;
+  // Two pixels per iteration share the same chroma sample.
+  for (i = 0; i < width - 1; i += 2) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[2], &rgb_buf[1], &rgb_buf[0]);
+    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[5], &rgb_buf[4], &rgb_buf[3]);
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    rgb_buf += 6;  // 2 pixels written.
+  }
+  // Odd width: one last pixel.
+  if (width & 1) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[2], &rgb_buf[1], &rgb_buf[0]);
+  }
+}
+
+// Converts a row of Y with one U/V sample per two pixels to 16-bit ARGB4444:
+// blue in bits 0-3, green in bits 4-7, red in bits 8-11, alpha (0xf) in
+// bits 12-15. Pixels are stored through uint32/uint16 pointers, so the byte
+// layout in dst_argb4444 follows the host byte order.
+void I422ToARGB4444Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb4444,
+                         int width) {
+  uint8 blue;
+  uint8 green;
+  uint8 red;
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    uint32 first;
+    uint32 second;
+    YuvPixel(src_y[0], src_u[0], src_v[0], &blue, &green, &red);
+    first = (blue >> 4) | ((green >> 4) << 4) | ((red >> 4) << 8) | 0xf000;
+    YuvPixel(src_y[1], src_u[0], src_v[0], &blue, &green, &red);
+    second = (blue >> 4) | ((green >> 4) << 4) | ((red >> 4) << 8) | 0xf000;
+    // First pixel in the low half-word, second pixel in the high half-word.
+    *(uint32*)(dst_argb4444) = first | (second << 16);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_argb4444 += 4;  // 2 pixels written.
+  }
+  // Odd width: one last 16-bit pixel.
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &blue, &green, &red);
+    *(uint16*)(dst_argb4444) =
+        (blue >> 4) | ((green >> 4) << 4) | ((red >> 4) << 8) | 0xf000;
+  }
+}
+
+// Converts a row of Y with one U/V sample per two pixels to 16-bit ARGB1555:
+// blue in bits 0-4, green in bits 5-9, red in bits 10-14, alpha (1) in
+// bit 15. Pixels are stored through uint32/uint16 pointers, so the byte
+// layout in dst_argb1555 follows the host byte order.
+void I422ToARGB1555Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb1555,
+                         int width) {
+  uint8 blue;
+  uint8 green;
+  uint8 red;
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    uint32 first;
+    uint32 second;
+    YuvPixel(src_y[0], src_u[0], src_v[0], &blue, &green, &red);
+    first = (blue >> 3) | ((green >> 3) << 5) | ((red >> 3) << 10) | 0x8000;
+    YuvPixel(src_y[1], src_u[0], src_v[0], &blue, &green, &red);
+    second = (blue >> 3) | ((green >> 3) << 5) | ((red >> 3) << 10) | 0x8000;
+    // First pixel in the low half-word, second pixel in the high half-word.
+    *(uint32*)(dst_argb1555) = first | (second << 16);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_argb1555 += 4;  // 2 pixels written.
+  }
+  // Odd width: one last 16-bit pixel.
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &blue, &green, &red);
+    *(uint16*)(dst_argb1555) =
+        (blue >> 3) | ((green >> 3) << 5) | ((red >> 3) << 10) | 0x8000;
+  }
+}
+
+// Converts a row of Y with one U/V sample per two pixels to 16-bit RGB565:
+// blue in bits 0-4, green in bits 5-10, red in bits 11-15.
+// Pixels are stored through uint32/uint16 pointers, so the byte layout in
+// dst_rgb565 follows the host byte order.
+void I422ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_rgb565,
+                       int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint32 pixel0;
+    uint32 pixel1;
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+    // Pack in unsigned arithmetic. Shifting the int-promoted 5-bit red value
+    // directly to bit 27 (r1 << 27) would shift into the sign bit of int for
+    // r1 >= 16; building each 16-bit pixel as uint32 first avoids that.
+    pixel0 = (uint32)(b0 >> 3) | ((uint32)(g0 >> 2) << 5) |
+             ((uint32)(r0 >> 3) << 11);
+    pixel1 = (uint32)(b1 >> 3) | ((uint32)(g1 >> 2) << 5) |
+             ((uint32)(r1 >> 3) << 11);
+    // First pixel in the low half-word, second pixel in the high half-word.
+    *(uint32*)(dst_rgb565) = pixel0 | (pixel1 << 16);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    // A single pixel tops out at bit 15, so plain int arithmetic is safe here.
+    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
+
+// Converts a row of Y with one U/V sample per four pixels to opaque ARGB.
+// Leftover pixels (width not a multiple of 4) reuse the next U/V sample: two
+// pixels if bit 1 of width is set, then one more if bit 0 is set.
+void I411ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int i;
+  // Four pixels per iteration share the same chroma sample.
+  for (i = 0; i < width - 3; i += 4) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6]);
+    rgb_buf[7] = 255;
+    YuvPixel(src_y[2], *src_u, *src_v,
+             &rgb_buf[8], &rgb_buf[9], &rgb_buf[10]);
+    rgb_buf[11] = 255;
+    YuvPixel(src_y[3], *src_u, *src_v,
+             &rgb_buf[12], &rgb_buf[13], &rgb_buf[14]);
+    rgb_buf[15] = 255;
+    src_y += 4;
+    ++src_u;
+    ++src_v;
+    rgb_buf += 16;  // 4 pixels written.
+  }
+  if (width & 2) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6]);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    rgb_buf += 8;  // 2 pixels written; chroma pointers stay put.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of Y plus interleaved chroma (U at even bytes, V at odd
+// bytes of src_uv, one pair per two pixels) to opaque ARGB.
+void NV12ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_uv,
+                     uint8* rgb_buf,
+                     int width) {
+  int i;
+  // Two pixels per iteration share the same U,V pair.
+  for (i = 0; i < width - 1; i += 2) {
+    YuvPixel(src_y[0], src_uv[0], src_uv[1],
+             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], src_uv[0], src_uv[1],
+             &rgb_buf[4], &rgb_buf[5], &rgb_buf[6]);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_uv += 2;
+    rgb_buf += 8;  // 2 pixels written.
+  }
+  // Odd width: one last pixel.
+  if (width & 1) {
+    YuvPixel(src_y[0], src_uv[0], src_uv[1],
+             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of Y plus interleaved chroma (V at even bytes, U at odd
+// bytes of src_vu, one pair per two pixels) to opaque ARGB.
+void NV21ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_vu,
+                     uint8* rgb_buf,
+                     int width) {
+  int i;
+  // Two pixels per iteration share the same V,U pair.
+  for (i = 0; i < width - 1; i += 2) {
+    YuvPixel(src_y[0], src_vu[1], src_vu[0],
+             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], src_vu[1], src_vu[0],
+             &rgb_buf[4], &rgb_buf[5], &rgb_buf[6]);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_vu += 2;
+    rgb_buf += 8;  // 2 pixels written.
+  }
+  // Odd width: one last pixel.
+  if (width & 1) {
+    YuvPixel(src_y[0], src_vu[1], src_vu[0],
+             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of Y plus interleaved chroma (U at even bytes, V at odd
+// bytes of src_uv, one pair per two pixels) to 16-bit RGB565: blue in bits
+// 0-4, green in bits 5-10, red in bits 11-15.
+// Pixels are stored through uint32/uint16 pointers, so the byte layout in
+// dst_rgb565 follows the host byte order.
+void NV12ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_uv,
+                       uint8* dst_rgb565,
+                       int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint32 pixel0;
+    uint32 pixel1;
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
+    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1);
+    // Pack in unsigned arithmetic. Shifting the int-promoted 5-bit red value
+    // directly to bit 27 (r1 << 27) would shift into the sign bit of int for
+    // r1 >= 16; building each 16-bit pixel as uint32 first avoids that.
+    pixel0 = (uint32)(b0 >> 3) | ((uint32)(g0 >> 2) << 5) |
+             ((uint32)(r0 >> 3) << 11);
+    pixel1 = (uint32)(b1 >> 3) | ((uint32)(g1 >> 2) << 5) |
+             ((uint32)(r1 >> 3) << 11);
+    // First pixel in the low half-word, second pixel in the high half-word.
+    *(uint32*)(dst_rgb565) = pixel0 | (pixel1 << 16);
+    src_y += 2;
+    src_uv += 2;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    // A single pixel tops out at bit 15, so plain int arithmetic is safe here.
+    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
+
+// Converts a row of Y plus interleaved chroma (V at even bytes, U at odd
+// bytes of vsrc_u, one pair per two pixels) to 16-bit RGB565: blue in bits
+// 0-4, green in bits 5-10, red in bits 11-15.
+// Pixels are stored through uint32/uint16 pointers, so the byte layout in
+// dst_rgb565 follows the host byte order.
+void NV21ToRGB565Row_C(const uint8* src_y,
+                       const uint8* vsrc_u,
+                       uint8* dst_rgb565,
+                       int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint32 pixel0;
+    uint32 pixel1;
+    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+    YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
+    // Pack in unsigned arithmetic. Shifting the int-promoted 5-bit red value
+    // directly to bit 27 (r1 << 27) would shift into the sign bit of int for
+    // r1 >= 16; building each 16-bit pixel as uint32 first avoids that.
+    pixel0 = (uint32)(b0 >> 3) | ((uint32)(g0 >> 2) << 5) |
+             ((uint32)(r0 >> 3) << 11);
+    pixel1 = (uint32)(b1 >> 3) | ((uint32)(g1 >> 2) << 5) |
+             ((uint32)(r1 >> 3) << 11);
+    // First pixel in the low half-word, second pixel in the high half-word.
+    *(uint32*)(dst_rgb565) = pixel0 | (pixel1 << 16);
+    src_y += 2;
+    vsrc_u += 2;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    // A single pixel tops out at bit 15, so plain int arithmetic is safe here.
+    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
+
+// Converts a row of packed YUY2 (4 bytes per two pixels: Y0, U, Y1, V) to
+// opaque ARGB. For an odd width the last pixel still reads the U and V bytes
+// of its 4-byte group.
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+                     uint8* rgb_buf,
+                     int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2, src_yuy2 += 4, rgb_buf += 8) {
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
+             &rgb_buf[4], &rgb_buf[5], &rgb_buf[6]);
+    rgb_buf[7] = 255;
+  }
+  if (width & 1) {
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of packed UYVY (4 bytes per two pixels: U, Y0, V, Y1) to
+// opaque ARGB. For an odd width the last pixel still reads the U and V bytes
+// of its 4-byte group.
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+                     uint8* rgb_buf,
+                     int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2, src_uyvy += 4, rgb_buf += 8) {
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
+             &rgb_buf[4], &rgb_buf[5], &rgb_buf[6]);
+    rgb_buf[7] = 255;
+  }
+  if (width & 1) {
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of Y with one U/V sample per two pixels to 4-byte pixels
+// stored as alpha (255), red, green, blue.
+void I422ToBGRARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int i;
+  // Two pixels per iteration share the same chroma sample.
+  for (i = 0; i < width - 1; i += 2) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[3], &rgb_buf[2], &rgb_buf[1]);
+    rgb_buf[0] = 255;
+    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[7], &rgb_buf[6], &rgb_buf[5]);
+    rgb_buf[4] = 255;
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    rgb_buf += 8;  // 2 pixels written.
+  }
+  // Odd width: one last pixel.
+  if (width & 1) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[3], &rgb_buf[2], &rgb_buf[1]);
+    rgb_buf[0] = 255;
+  }
+}
+
+// Converts a row of Y with one U/V sample per two pixels to 4-byte pixels
+// stored as red, green, blue, alpha (255).
+void I422ToABGRRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int i;
+  // Two pixels per iteration share the same chroma sample.
+  for (i = 0; i < width - 1; i += 2) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[2], &rgb_buf[1], &rgb_buf[0]);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[6], &rgb_buf[5], &rgb_buf[4]);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    rgb_buf += 8;  // 2 pixels written.
+  }
+  // Odd width: one last pixel.
+  if (width & 1) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[2], &rgb_buf[1], &rgb_buf[0]);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of Y with one U/V sample per two pixels to 4-byte pixels
+// stored as alpha (255), blue, green, red.
+void I422ToRGBARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int i;
+  // Two pixels per iteration share the same chroma sample.
+  for (i = 0; i < width - 1; i += 2) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[1], &rgb_buf[2], &rgb_buf[3]);
+    rgb_buf[0] = 255;
+    YuvPixel(src_y[1], *src_u, *src_v, &rgb_buf[5], &rgb_buf[6], &rgb_buf[7]);
+    rgb_buf[4] = 255;
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    rgb_buf += 8;  // 2 pixels written.
+  }
+  // Odd width: one last pixel.
+  if (width & 1) {
+    YuvPixel(src_y[0], *src_u, *src_v, &rgb_buf[1], &rgb_buf[2], &rgb_buf[3]);
+    rgb_buf[0] = 255;
+  }
+}
+
+// Converts a row of Y bytes to opaque ARGB using YPixel(), which applies the
+// luma scale and bias and writes the same value to blue, green and red.
+void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2, src_y += 2, rgb_buf += 8) {
+    YPixel(src_y[0], &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+    YPixel(src_y[1], &rgb_buf[4], &rgb_buf[5], &rgb_buf[6]);
+    rgb_buf[7] = 255;
+  }
+  // Odd width: one last pixel.
+  if (width & 1) {
+    YPixel(src_y[0], &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Writes the width bytes of src to dst in reverse order.
+void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+  const uint8* last = src + (width - 1);  // Last source byte.
+  int i = 0;
+  // Two bytes per iteration, walking backwards from the end of src.
+  while (i < width - 1) {
+    dst[i] = last[-i];
+    dst[i + 1] = last[-(i + 1)];
+    i += 2;
+  }
+  // Odd width: the final output byte comes from the position reached above.
+  if (width & 1) {
+    dst[width - 1] = last[-i];
+  }
+}
+
+// Reverses a row of interleaved U,V pairs and splits it into separate planes:
+// dst_u receives the even source bytes and dst_v the odd source bytes, both
+// in reverse pair order. width counts pairs.
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* pair = src_uv + ((width - 1) << 1);  // Last U,V pair.
+  int i;
+  // Two pairs per iteration, walking backwards.
+  for (i = 0; i < width - 1; i += 2, pair -= 4) {
+    dst_u[i] = pair[0];
+    dst_u[i + 1] = pair[-2];
+    dst_v[i] = pair[1];
+    dst_v[i + 1] = pair[-2 + 1];
+  }
+  // Odd width: one remaining pair.
+  if (width & 1) {
+    dst_u[width - 1] = pair[0];
+    dst_v[width - 1] = pair[1];
+  }
+}
+
+// Reverses the order of the width 4-byte pixels in src and writes them to
+// dst. Pixels are moved as whole uint32 values, so the bytes inside each
+// pixel keep their order.
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
+  const uint32* in = (const uint32*)(src) + (width - 1);  // Last pixel.
+  uint32* out = (uint32*)(dst);
+  int i;
+  // Two pixels per iteration, walking backwards through the source.
+  for (i = 0; i < width - 1; i += 2, in -= 2) {
+    out[i] = in[0];
+    out[i + 1] = in[-1];
+  }
+  // Odd width: one remaining pixel.
+  if (width & 1) {
+    out[width - 1] = in[0];
+  }
+}
+
+// De-interleaves a row of U,V pairs: even source bytes go to dst_u and odd
+// source bytes go to dst_v. width counts pairs.
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  int i;
+  // Two pairs per iteration.
+  for (i = 0; i < width - 1; i += 2, src_uv += 4) {
+    dst_u[i] = src_uv[0];
+    dst_u[i + 1] = src_uv[2];
+    dst_v[i] = src_uv[1];
+    dst_v[i + 1] = src_uv[3];
+  }
+  // Odd width: one remaining pair.
+  if (width & 1) {
+    dst_u[width - 1] = src_uv[0];
+    dst_v[width - 1] = src_uv[1];
+  }
+}
+
+// Interleaves a row of U bytes and a row of V bytes into U,V pairs in
+// dst_uv. width counts pairs.
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) {
+  int i;
+  // Two pairs per iteration.
+  for (i = 0; i < width - 1; i += 2, dst_uv += 4) {
+    dst_uv[0] = src_u[i];
+    dst_uv[1] = src_v[i];
+    dst_uv[2] = src_u[i + 1];
+    dst_uv[3] = src_v[i + 1];
+  }
+  // Odd width: one remaining pair.
+  if (width & 1) {
+    dst_uv[0] = src_u[width - 1];
+    dst_uv[1] = src_v[width - 1];
+  }
+}
+
+// Copies count bytes from src to dst with memcpy().
+void CopyRow_C(const uint8* src, uint8* dst, int count) {
+  const int num_bytes = count;
+  memcpy(dst, src, num_bytes);
+}
+
+// Copies count 16-bit elements (count * 2 bytes) from src to dst with
+// memcpy().
+void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
+  const int num_bytes = count * 2;
+  memcpy(dst, src, num_bytes);
+}
+
+// Fills width bytes of dst with the value v8 using memset().
+void SetRow_C(uint8* dst, uint8 v8, int width) {
+  const int num_bytes = width;
+  memset(dst, v8, num_bytes);
+}
+
+// Fills width 4-byte pixels of dst_argb with the 32-bit value v32. The store
+// goes through a uint32 pointer, so the byte order in memory follows the host.
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
+  uint32* out = (uint32*)(dst_argb);
+  int i;
+  for (i = 0; i < width; ++i) {
+    *out++ = v32;
+  }
+}
+
+// Filters 2 rows of YUY2 UV's (422) into U and V (420).
+// Each output U (V) is the rounded average of the U (V) byte in this row and
+// the byte src_stride_yuy2 bytes further on. One U and one V are produced
+// for every two pixels; an odd width still produces a final sample.
+void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
+                   uint8* dst_u, uint8* dst_v, int width) {
+  int i;
+  for (i = 0; i < width; i += 2, src_yuy2 += 4) {
+    // YUY2 group layout: Y0, U, Y1, V.
+    *dst_u++ = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
+    *dst_v++ = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
+  }
+}
+
+// Copies a row of YUY2 UV's (422) into U and V (422).
+// One U and one V are produced for every two pixels; an odd width still
+// produces a final sample.
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int i;
+  for (i = 0; i < width; i += 2, src_yuy2 += 4) {
+    // YUY2 group layout: Y0, U, Y1, V.
+    *dst_u++ = src_yuy2[1];
+    *dst_v++ = src_yuy2[3];
+  }
+}
+
+// Copies a row of YUY2 Y's (422) into Y (420/422).
+// The Y bytes are at offsets 0 and 2 of every 4-byte group.
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
+  int i;
+  // Two Y values per 4-byte group.
+  for (i = 0; i < width - 1; i += 2, src_yuy2 += 4) {
+    dst_y[i] = src_yuy2[0];
+    dst_y[i + 1] = src_yuy2[2];
+  }
+  // Odd width: first Y of the last group.
+  if (width & 1) {
+    dst_y[width - 1] = src_yuy2[0];
+  }
+}
+
+// Filters 2 rows of UYVY UV's (422) into U and V (420).
+// Each output U (V) is the rounded average of the U (V) byte in this row and
+// the byte src_stride_uyvy bytes further on. One U and one V are produced
+// for every two pixels; an odd width still produces a final sample.
+void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
+                   uint8* dst_u, uint8* dst_v, int width) {
+  int i;
+  for (i = 0; i < width; i += 2, src_uyvy += 4) {
+    // UYVY group layout: U, Y0, V, Y1.
+    *dst_u++ = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
+    *dst_v++ = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
+  }
+}
+
+// Copies a row of UYVY UV's (422) into U and V (422).
+// One U and one V are produced for every two pixels; an odd width still
+// produces a final sample.
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int i;
+  for (i = 0; i < width; i += 2, src_uyvy += 4) {
+    // UYVY group layout: U, Y0, V, Y1.
+    *dst_u++ = src_uyvy[0];
+    *dst_v++ = src_uyvy[2];
+  }
+}
+
+// Copies a row of UYVY Y's (422) into Y (420/422).
+// The Y bytes are at offsets 1 and 3 of every 4-byte group.
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+  int i;
+  // Two Y values per 4-byte group.
+  for (i = 0; i < width - 1; i += 2, src_uyvy += 4) {
+    dst_y[i] = src_uyvy[1];
+    dst_y[i + 1] = src_uyvy[3];
+  }
+  // Odd width: first Y of the last group.
+  if (width & 1) {
+    dst_y[width - 1] = src_uyvy[1];
+  }
+}
+
+// Foreground value plus the background value scaled by (256 - alpha) / 256.
+#define BLEND(f, b, a) ((((256 - (a)) * (b)) >> 8) + (f))
+
+// Blends src_argb0 over src_argb1 and stores the result in dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+// The foreground alpha (byte 3 of src_argb0) weights the background; the
+// output alpha is always 255.
+// This code mimics the SSSE3 version for better testability.
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                    uint8* dst_argb, int width) {
+  int x;
+  // Two pixels per iteration; each pixel is fully read before it is written,
+  // which keeps in-place use working.
+  for (x = 0; x < width - 1; x += 2) {
+    int k;
+    for (k = 0; k < 8; k += 4) {
+      const uint32 fore_b = src_argb0[k + 0];
+      const uint32 fore_g = src_argb0[k + 1];
+      const uint32 fore_r = src_argb0[k + 2];
+      const uint32 alpha = src_argb0[k + 3];
+      const uint32 back_b = src_argb1[k + 0];
+      const uint32 back_g = src_argb1[k + 1];
+      const uint32 back_r = src_argb1[k + 2];
+      dst_argb[k + 0] = BLEND(fore_b, back_b, alpha);
+      dst_argb[k + 1] = BLEND(fore_g, back_g, alpha);
+      dst_argb[k + 2] = BLEND(fore_r, back_r, alpha);
+      dst_argb[k + 3] = 255u;
+    }
+    src_argb0 += 8;
+    src_argb1 += 8;
+    dst_argb += 8;
+  }
+
+  // Odd width: one remaining pixel.
+  if (width & 1) {
+    const uint32 fore_b = src_argb0[0];
+    const uint32 fore_g = src_argb0[1];
+    const uint32 fore_r = src_argb0[2];
+    const uint32 alpha = src_argb0[3];
+    const uint32 back_b = src_argb1[0];
+    const uint32 back_g = src_argb1[1];
+    const uint32 back_r = src_argb1[2];
+    dst_argb[0] = BLEND(fore_b, back_b, alpha);
+    dst_argb[1] = BLEND(fore_g, back_g, alpha);
+    dst_argb[2] = BLEND(fore_r, back_r, alpha);
+    dst_argb[3] = 255u;
+  }
+}
+#undef BLEND
+#define ATTENUATE(f, a) ((((a) | ((a) << 8)) * ((f) | ((f) << 8))) >> 24)
+
+// Scale B, G and R of each pixel by its alpha; alpha is stored unchanged.
+// Mirrors the SSSE3 version's arithmetic for better testability.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i + 1 < width; i += 2) {
+    uint32 blue = src_argb[0];
+    uint32 green = src_argb[1];
+    uint32 red = src_argb[2];
+    uint32 alpha = src_argb[3];
+    dst_argb[0] = ATTENUATE(blue, alpha);
+    dst_argb[1] = ATTENUATE(green, alpha);
+    dst_argb[2] = ATTENUATE(red, alpha);
+    dst_argb[3] = alpha;
+    blue = src_argb[4];
+    green = src_argb[5];
+    red = src_argb[6];
+    alpha = src_argb[7];
+    dst_argb[4] = ATTENUATE(blue, alpha);
+    dst_argb[5] = ATTENUATE(green, alpha);
+    dst_argb[6] = ATTENUATE(red, alpha);
+    dst_argb[7] = alpha;
+    src_argb += 8;
+    dst_argb += 8;
+  }
+
+  if ((width & 1) != 0) {  // odd width: one pixel left
+    const uint32 blue = src_argb[0];
+    const uint32 green = src_argb[1];
+    const uint32 red = src_argb[2];
+    const uint32 alpha = src_argb[3];
+    dst_argb[0] = ATTENUATE(blue, alpha);
+    dst_argb[1] = ATTENUATE(green, alpha);
+    dst_argb[2] = ATTENUATE(red, alpha);
+    dst_argb[3] = alpha;
+  }
+}
+#undef ATTENUATE
+
+// Divide source RGB by alpha and store to destination.
+// Exact per-channel form, c in {b, g, r}: c = (c * 255 + (a / 2)) / a;
+// The reciprocal-table method is off by 1 on some values, e.g. 125.
+// Table layout: upper 16 bits hold 1.0 (0x0100), lower 16 bits hold
+// 0x10000 / a (8.8 fixed point).  Entries 0, 1 and 255 are literals: T(0)
+// would divide by zero and T(1) would not fit in the lower 16 bits.
+#define T(a) 0x01000000 + (0x10000 / a)
+const uint32 fixed_invtbl8[256] = {
+  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
+  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
+  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
+  T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
+  T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
+  T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
+  T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
+  T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
+  T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
+  T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
+  T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
+  T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
+  T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
+  T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
+  T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
+  T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
+  T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
+  T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
+  T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
+  T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
+  T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
+  T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
+  T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
+  T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
+  T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
+  T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
+  T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
+  T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
+  T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
+  T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
+  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
+#undef T
+
+// Divide B, G and R by alpha using the fixed_invtbl8 reciprocal table.
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const uint32 alpha = src_argb[3];
+    // Only the low 16 bits of the table entry are used as the multiplier.
+    const uint32 inv_alpha = fixed_invtbl8[alpha] & 0xffff;
+    // Scale each colour channel, then drop the 8 fractional bits.
+    const uint32 blue = (src_argb[0] * inv_alpha) >> 8;
+    const uint32 green = (src_argb[1] * inv_alpha) >> 8;
+    const uint32 red = (src_argb[2] * inv_alpha) >> 8;
+    dst_argb[0] = clamp255(blue);
+    dst_argb[1] = clamp255(green);
+    dst_argb[2] = clamp255(red);
+    dst_argb[3] = alpha;  // alpha passes through unchanged
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+                               const int32* previous_cumsum, int width) {
+  int32 running[4] = {0, 0, 0, 0};  // per-channel sum of this row so far
+  int x, c;
+  for (x = 0; x < width; ++x) {
+    // Fold the next 4-byte pixel into the running sums ...
+    for (c = 0; c < 4; ++c) {
+      running[c] += row[x * 4 + c];
+    }
+    // ... then add the matching previous_cumsum entries.
+    for (c = 0; c < 4; ++c) {
+      cumsum[x * 4 + c] = running[c] + previous_cumsum[x * 4 + c];
+    }
+  }
+}
+
+// Turn differences of four cumulative-sum samples into per-channel averages:
+// each sum is scaled by 1 / area and converted to a byte.
+void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
+                                int w, int area, uint8* dst, int count) {
+  const float inv_area = 1.0f / area;
+  int i, c;
+  for (i = 0; i < count; ++i, dst += 4, tl += 4, bl += 4) {
+    for (c = 0; c < 4; ++c) {
+      // Combine the four samples for channel c.
+      const int32 box = bl[w + c] + tl[c] - bl[c] - tl[w + c];
+      dst[c] = (uint8)(box * inv_area);
+    }
+  }
+}
+
+// Sample one destination row from the source along a sloped path.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+                     uint8* dst_argb, const float* uv_dudv, int width) {
+  // uv_dudv[0..1] is the start coordinate, uv_dudv[2..3] the per-pixel step.
+  float uv[2];
+  int remaining;
+  uv[0] = uv_dudv[0];
+  uv[1] = uv_dudv[1];
+  for (remaining = width; remaining > 0; --remaining) {
+    // Truncate the current coordinate to an integer source pixel.
+    const int col = (int)(uv[0]);
+    const int line = (int)(uv[1]);
+    const uint8* texel = src_argb + line * src_argb_stride + col * 4;
+    *(uint32*)(dst_argb) = *(const uint32*)(texel);  // copy 4 bytes at once
+    dst_argb += 4;
+    uv[0] += uv_dudv[2];
+    uv[1] += uv_dudv[3];
+  }
+}
+
+// Rounded average of two rows that are src_uv_stride elements apart.
+static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+                      uint8* dst_uv, int pix) {
+  const uint8* lower = src_uv + src_uv_stride;  // second input row
+  for (; pix > 0; --pix, ++src_uv, ++lower, ++dst_uv) {
+    *dst_uv = (uint8)((*src_uv + *lower + 1) >> 1);
+  }
+}
+
+static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
+                         uint16* dst_uv, int pix) {
+  const uint16* lower = src_uv + src_uv_stride;  // second input row
+  for (; pix > 0; --pix, ++src_uv, ++lower, ++dst_uv) {
+    *dst_uv = (uint16)((*src_uv + *lower + 1) >> 1);  // rounded average
+  }
+}
+
+// Vertically blend two rows (src_stride apart) into one; C version.
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+                      ptrdiff_t src_stride,
+                      int width, int source_y_fraction) {
+  const int weight1 = source_y_fraction;  // weight of the second row, /256
+  const int weight0 = 256 - weight1;      // weight of the first row, /256
+  const uint8* row1 = src_ptr + src_stride;
+  int x;
+  switch (source_y_fraction) {
+    case 0:  // first row only: plain copy
+      memcpy(dst_ptr, src_ptr, width);
+      return;
+    case 128:  // equal weights: rounded average
+      HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
+      return;
+    default:
+      break;
+  }
+  for (x = 0; x + 1 < width; x += 2) {
+    dst_ptr[x] = (src_ptr[x] * weight0 + row1[x] * weight1) >> 8;
+    dst_ptr[x + 1] =
+        (src_ptr[x + 1] * weight0 + row1[x + 1] * weight1) >> 8;
+  }
+  if ((width & 1) != 0) {  // x now indexes the unpaired last element
+    dst_ptr[x] = (src_ptr[x] * weight0 + row1[x] * weight1) >> 8;
+  }
+}
+
+void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                         ptrdiff_t src_stride,
+                         int width, int source_y_fraction) {
+  const int weight1 = source_y_fraction;  // weight of the second row, /256
+  const int weight0 = 256 - weight1;      // weight of the first row, /256
+  const uint16* row1 = src_ptr + src_stride;
+  int x;
+  switch (source_y_fraction) {
+    case 0:  // first row only: plain copy of width 2-byte elements
+      memcpy(dst_ptr, src_ptr, width * 2);
+      return;
+    case 128:  // equal weights: rounded average
+      HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width);
+      return;
+    default:
+      break;
+  }
+  for (x = 0; x + 1 < width; x += 2) {
+    dst_ptr[x] = (src_ptr[x] * weight0 + row1[x] * weight1) >> 8;
+    dst_ptr[x + 1] =
+        (src_ptr[x + 1] * weight0 + row1[x + 1] * weight1) >> 8;
+  }
+  if ((width & 1) != 0) {  // x now indexes the unpaired last element
+    dst_ptr[x] = (src_ptr[x] * weight0 + row1[x] * weight1) >> 8;
+  }
+}
+
+// Reorder the bytes of each ARGB pixel using the first 4 shuffler entries.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+                      const uint8* shuffler, int pix) {
+  // Source byte offsets for destination bytes 0..3.
+  const int from0 = shuffler[0];
+  const int from1 = shuffler[1];
+  const int from2 = shuffler[2];
+  const int from3 = shuffler[3];
+  int remaining;
+  for (remaining = pix; remaining > 0; --remaining) {
+    // Load all four before storing any: supports in-place conversion.
+    const uint8 out0 = src_argb[from0];
+    const uint8 out1 = src_argb[from1];
+    const uint8 out2 = src_argb[from2];
+    const uint8 out3 = src_argb[from3];
+    dst_argb[0] = out0;
+    dst_argb[1] = out1;
+    dst_argb[2] = out2;
+    dst_argb[3] = out3;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+// Interleave planar Y, U and V rows into one packed row of 4-byte groups,
+// each holding two luma samples and one U/V pair.
+void I422ToYUY2Row_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_frame, int width) {
+  int x;
+  // Pack each pixel pair as Y0 U Y1 V.
+  for (x = 0; x + 1 < width; x += 2) {
+    *dst_frame++ = src_y[x];
+    *dst_frame++ = *src_u++;
+    *dst_frame++ = src_y[x + 1];
+    *dst_frame++ = *src_v++;
+  }
+  if ((width & 1) != 0) {
+    // Odd width: the missing second luma sample is written as 0.
+    dst_frame[0] = src_y[x];
+    dst_frame[1] = src_u[0];
+    dst_frame[2] = 0;
+    dst_frame[3] = src_v[0];
+  }
+}
+
+// Interleave planar Y, U and V rows into one packed row of 4-byte groups,
+// each holding one U/V pair and two luma samples.
+void I422ToUYVYRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_frame, int width) {
+  int x;
+  // Pack each pixel pair as U Y0 V Y1.
+  for (x = 0; x + 1 < width; x += 2) {
+    *dst_frame++ = *src_u++;
+    *dst_frame++ = src_y[x];
+    *dst_frame++ = *src_v++;
+    *dst_frame++ = src_y[x + 1];
+  }
+  if ((width & 1) != 0) {
+    // Odd width: the missing second luma sample is written as 0.
+    dst_frame[0] = src_u[0];
+    dst_frame[1] = src_y[x];
+    dst_frame[2] = src_v[0];
+    dst_frame[3] = 0;
+  }
+}
+
+// Upper bound, in pixels, on how much a wrapper converts per pass.
+#define MAXTWIDTH 2048  // also sizes the wrappers' temporary row buffers
+
+#if !(defined(_MSC_VER) && !defined(__clang__)) && \
+    defined(HAS_I422TORGB565ROW_SSSE3)
+// row_win.cc provides an asm version; other compilers use this 2 step wrapper.
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_rgb565,
+                           int width) {
+  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);  // 4 bytes per ARGB pixel
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, chunk);
+    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;  // half as many U/V samples as Y
+    src_v += chunk / 2;
+    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb1555,
+                             int width) {
+  // Intermediate ARGB pixels for one chunk (4 bytes each).
+  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, chunk);
+    ARGBToARGB1555Row_SSE2(argb_tmp, dst_argb1555, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;  // half as many U/V samples as Y
+    src_v += chunk / 2;
+    dst_argb1555 += chunk * 2;  // 2 output bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb4444,
+                             int width) {
+  // Intermediate ARGB pixels for one chunk (4 bytes each).
+  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, chunk);
+    ARGBToARGB4444Row_SSE2(argb_tmp, dst_argb4444, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;  // half as many U/V samples as Y
+    src_v += chunk / 2;
+    dst_argb4444 += chunk * 2;  // 2 output bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv,
+                           uint8* dst_rgb565, int width) {
+  // Intermediate ARGB pixels for one chunk (4 bytes each).
+  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    NV12ToARGBRow_SSSE3(src_y, src_uv, argb_tmp, chunk);
+    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
+    src_y += chunk;
+    src_uv += chunk;  // interleaved plane advances one byte per pixel
+    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_NV21TORGB565ROW_SSSE3)
+void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu,
+                           uint8* dst_rgb565, int width) {
+  // Intermediate ARGB pixels for one chunk (4 bytes each).
+  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    NV21ToARGBRow_SSSE3(src_y, src_vu, argb_tmp, chunk);
+    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
+    src_y += chunk;
+    src_vu += chunk;  // interleaved plane advances one byte per pixel
+    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) {
+  // Temporary planar rows: full-width Y, half-width U and V.
+  SIMD_ALIGNED(uint8 y_tmp[MAXTWIDTH]);
+  SIMD_ALIGNED(uint8 u_tmp[MAXTWIDTH / 2]);
+  SIMD_ALIGNED(uint8 v_tmp[MAXTWIDTH / 2]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    YUY2ToUV422Row_SSE2(src_yuy2, u_tmp, v_tmp, chunk);
+    YUY2ToYRow_SSE2(src_yuy2, y_tmp, chunk);
+    I422ToARGBRow_SSSE3(y_tmp, u_tmp, v_tmp, dst_argb, chunk);
+    src_yuy2 += chunk * 2;  // 2 source bytes per pixel
+    dst_argb += chunk * 4;  // 4 destination bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) {
+  // Temporary planar rows: full-width Y, half-width U and V.
+  SIMD_ALIGNED(uint8 y_tmp[MAXTWIDTH]);
+  SIMD_ALIGNED(uint8 u_tmp[MAXTWIDTH / 2]);
+  SIMD_ALIGNED(uint8 v_tmp[MAXTWIDTH / 2]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    UYVYToUV422Row_SSE2(src_uyvy, u_tmp, v_tmp, chunk);
+    UYVYToYRow_SSE2(src_uyvy, y_tmp, chunk);
+    I422ToARGBRow_SSSE3(y_tmp, u_tmp, v_tmp, dst_argb, chunk);
+    src_uyvy += chunk * 2;  // 2 source bytes per pixel
+    dst_argb += chunk * 4;  // 4 destination bytes per pixel
+  }
+}
+#endif  // defined(HAS_UYVYTOARGBROW_SSSE3)
+
+#if defined(HAS_I422TORGB565ROW_AVX2)
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          int width) {
+  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);  // 4 bytes per ARGB pixel
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, chunk);
+    ARGBToRGB565Row_AVX2(argb_tmp, dst_rgb565, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;  // half as many U/V samples as Y
+    src_v += chunk / 2;
+    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            int width) {
+  // Intermediate ARGB pixels for one chunk (4 bytes each).
+  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, chunk);
+    ARGBToARGB1555Row_AVX2(argb_tmp, dst_argb1555, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;  // half as many U/V samples as Y
+    src_v += chunk / 2;
+    dst_argb1555 += chunk * 2;  // 2 output bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            int width) {
+  // Intermediate ARGB pixels for one chunk (4 bytes each).
+  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, chunk);
+    ARGBToARGB4444Row_AVX2(argb_tmp, dst_argb4444, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;  // half as many U/V samples as Y
+    src_v += chunk / 2;
+    dst_argb4444 += chunk * 2;  // 2 output bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_I422TORGB24ROW_AVX2)
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         int width) {
+  // Intermediate ARGB pixels for one chunk (4 bytes each).
+  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, chunk);
+    // TODO(fbarchard): ARGBToRGB24Row_AVX2
+    ARGBToRGB24Row_SSSE3(argb_tmp, dst_rgb24, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;  // half as many U/V samples as Y
+    src_v += chunk / 2;
+    dst_rgb24 += chunk * 3;  // 3 output bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_I422TORAWROW_AVX2)
+void I422ToRAWRow_AVX2(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_raw,
+                       int width) {
+  // Intermediate ARGB pixels for one chunk (4 bytes each).
+  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, chunk);
+    // TODO(fbarchard): ARGBToRAWRow_AVX2
+    ARGBToRAWRow_SSSE3(argb_tmp, dst_raw, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;  // half as many U/V samples as Y
+    src_v += chunk / 2;
+    dst_raw += chunk * 3;  // 3 output bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,
+                          uint8* dst_rgb565, int width) {
+  // Intermediate ARGB pixels for one chunk (4 bytes each).
+  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    NV12ToARGBRow_AVX2(src_y, src_uv, argb_tmp, chunk);
+    ARGBToRGB565Row_AVX2(argb_tmp, dst_rgb565, chunk);
+    src_y += chunk;
+    src_uv += chunk;  // interleaved plane advances one byte per pixel
+    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_NV21TORGB565ROW_AVX2)
+void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu,
+                          uint8* dst_rgb565, int width) {
+  // Intermediate ARGB pixels for one chunk (4 bytes each).
+  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    NV21ToARGBRow_AVX2(src_y, src_vu, argb_tmp, chunk);
+    ARGBToRGB565Row_AVX2(argb_tmp, dst_rgb565, chunk);
+    src_y += chunk;
+    src_vu += chunk;  // interleaved plane advances one byte per pixel
+    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) {
+  // Temporary planar rows: full-width Y, half-width U and V.
+  SIMD_ALIGNED32(uint8 y_tmp[MAXTWIDTH]);
+  SIMD_ALIGNED32(uint8 u_tmp[MAXTWIDTH / 2]);
+  SIMD_ALIGNED32(uint8 v_tmp[MAXTWIDTH / 2]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    YUY2ToUV422Row_AVX2(src_yuy2, u_tmp, v_tmp, chunk);
+    YUY2ToYRow_AVX2(src_yuy2, y_tmp, chunk);
+    I422ToARGBRow_AVX2(y_tmp, u_tmp, v_tmp, dst_argb, chunk);
+    src_yuy2 += chunk * 2;  // 2 source bytes per pixel
+    dst_argb += chunk * 4;  // 4 destination bytes per pixel
+  }
+}
+#endif
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) {
+  // Temporary planar rows: full-width Y, half-width U and V.
+  SIMD_ALIGNED32(uint8 y_tmp[MAXTWIDTH]);
+  SIMD_ALIGNED32(uint8 u_tmp[MAXTWIDTH / 2]);
+  SIMD_ALIGNED32(uint8 v_tmp[MAXTWIDTH / 2]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = (width < MAXTWIDTH) ? width : MAXTWIDTH;
+    UYVYToUV422Row_AVX2(src_uyvy, u_tmp, v_tmp, chunk);
+    UYVYToYRow_AVX2(src_uyvy, y_tmp, chunk);
+    I422ToARGBRow_AVX2(y_tmp, u_tmp, v_tmp, dst_argb, chunk);
+    src_uyvy += chunk * 2;  // 2 source bytes per pixel
+    dst_argb += chunk * 4;  // 4 destination bytes per pixel
+  }
+}
+#endif  // defined(HAS_UYVYTOARGBROW_AVX2)
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+                         uint8* dst_argb, const float* poly,
+                         int width) {  // applies a cubic polynomial per channel
+  int i;
+  for (i = 0; i < width; ++i) {
+    float b = (float)(src_argb[0]);
+    float g = (float)(src_argb[1]);
+    float r = (float)(src_argb[2]);
+    float a = (float)(src_argb[3]);
+    float b2 = b * b;
+    float g2 = g * g;
+    float r2 = r * r;
+    float a2 = a * a;
+    float db = poly[0] + poly[4] * b;  // poly[0..3] constant, poly[4..7] linear
+    float dg = poly[1] + poly[5] * g;
+    float dr = poly[2] + poly[6] * r;
+    float da = poly[3] + poly[7] * a;
+    float b3 = b2 * b;
+    float g3 = g2 * g;
+    float r3 = r2 * r;
+    float a3 = a2 * a;
+    db += poly[8] * b2;  // poly[8..11]: quadratic coefficients
+    dg += poly[9] * g2;
+    dr += poly[10] * r2;
+    da += poly[11] * a2;
+    db += poly[12] * b3;  // poly[12..15]: cubic coefficients
+    dg += poly[13] * g3;
+    dr += poly[14] * r3;
+    da += poly[15] * a3;
+    // Convert each result to int32 and pass it through Clamp() before storing.
+    dst_argb[0] = Clamp((int32)(db));
+    dst_argb[1] = Clamp((int32)(dg));
+    dst_argb[2] = Clamp((int32)(dr));
+    dst_argb[3] = Clamp((int32)(da));
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                             const uint8* luma, uint32 lumacoeff) {
+  // lumacoeff carries the B, G and R weights in its three low bytes.
+  const uint32 wb = lumacoeff & 0xff;
+  const uint32 wg = (lumacoeff >> 8) & 0xff;
+  const uint32 wr = (lumacoeff >> 16) & 0xff;
+  int i;
+  for (i = 0; i + 1 < width; i += 2) {
+    // Weighted sum, masked to a multiple of 256, picks the table row.
+    const uint8* table0 = luma + ((src_argb[0] * wb + src_argb[1] * wg +
+                                   src_argb[2] * wr) & 0x7F00u);
+    const uint8* table1;
+    dst_argb[0] = table0[src_argb[0]];
+    dst_argb[1] = table0[src_argb[1]];
+    dst_argb[2] = table0[src_argb[2]];
+    dst_argb[3] = src_argb[3];  // alpha is copied, not looked up
+    table1 = luma + ((src_argb[4] * wb + src_argb[5] * wg +
+                      src_argb[6] * wr) & 0x7F00u);
+    dst_argb[4] = table1[src_argb[4]];
+    dst_argb[5] = table1[src_argb[5]];
+    dst_argb[6] = table1[src_argb[6]];
+    dst_argb[7] = src_argb[7];
+    src_argb += 8;
+    dst_argb += 8;
+  }
+  if ((width & 1) != 0) {
+    // Odd width: same lookup for the final pixel.
+    const uint8* table0 = luma + ((src_argb[0] * wb + src_argb[1] * wg +
+                                   src_argb[2] * wr) & 0x7F00u);
+    dst_argb[0] = table0[src_argb[0]];
+    dst_argb[1] = table0[src_argb[1]];
+    dst_argb[2] = table0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+  }
+}
+
+// Copy only the alpha byte (offset 3) of each 4-byte pixel from src to
+// dst; the other three bytes of every dst pixel are left untouched.
+void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  for (i = 0; i + 1 < width; i += 2, src += 8, dst += 8) {
+    dst[3] = src[3];
+    dst[7] = src[7];
+  }
+  if ((width & 1) != 0) {  // odd width: one pixel left
+    dst[3] = src[3];
+  }
+}
+
+// Store each byte of src into the alpha slot (offset 3) of one 4-byte
+// pixel of dst; the other three bytes of every pixel are left untouched.
+void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  for (i = 0; i + 1 < width; i += 2, src += 2, dst += 8) {
+    dst[3] = src[0];
+    dst[7] = src[1];
+  }
+  if ((width & 1) != 0) {  // odd width: one pixel left
+    dst[3] = src[0];
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/row_gcc.cc b/libs/libvpx/third_party/libyuv/source/row_gcc.cc
new file mode 100644
index 0000000000..820de0a1c6
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/row_gcc.cc
@@ -0,0 +1,5475 @@
+// VERSION 2
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// ARGB to Y weights: one 4-byte group repeated 4 times, last weight 0.
+static vec8 kARGBToY = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+// ARGB to Y weights for JPeg full range; same layout as kARGBToY.
+static vec8 kARGBToYJ = {
+  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+static vec8 kARGBToU = {  // signed weights, 4-byte group repeated 4 times
+  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static vec8 kARGBToUJ = {  // "J" variant; presumably JPeg full range - confirm
+  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+static vec8 kARGBToV = {  // signed weights, 4-byte group repeated 4 times
+  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static vec8 kARGBToVJ = {  // "J" variant; presumably JPeg full range - confirm
+  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// Constants for BGRA: the kARGBTo* weight groups in reversed byte order.
+static vec8 kBGRAToY = {
+  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+static vec8 kBGRAToU = {
+  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+static vec8 kBGRAToV = {
+  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR: kARGBTo* groups with the first and third weights swapped.
+static vec8 kABGRToY = {
+  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+static vec8 kABGRToU = {
+  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+static vec8 kABGRToV = {
+  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA: the kARGBTo* weight groups with the zero weight first.
+static vec8 kRGBAToY = {
+  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static vec8 kRGBAToU = {
+  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static vec8 kRGBAToV = {
+  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static uvec8 kAddY16 = {  // 16 in each of the 16 byte lanes
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+// 7 bit fixed point 0.5 (64 / 128) in each of the 8 16-bit lanes.
+static vec16 kAddYJ64 = {
+  64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static uvec8 kAddUV128 = {  // 128 in each of the 16 byte lanes
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+static uvec16 kAddUVJ128 = {  // 0x8080 per 16-bit lane: 0x80 (128) in each byte
+  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+
+// Shuffle table for converting RGB24 to ARGB (every 4th index is 12..15).
+static uvec8 kShuffleMaskRGB24ToARGB = {
+  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB (as above, 3-byte groups reversed).
+static uvec8 kShuffleMaskRAWToARGB = {
+  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting ARGB to RGB24 (128u presumably = zero byte).
+static uvec8 kShuffleMaskARGBToRGB24 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW (as above, 3-byte groups reversed).
+static uvec8 kShuffleMaskARGBToRAW = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
+static uvec8 kShuffleMaskARGBToRGB24_0 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// ARGB to RAW, same 8 + 4 layout as kShuffleMaskARGBToRGB24_0.
+static uvec8 kShuffleMaskARGBToRAW_0 = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
+};
+#endif  // HAS_RGB24TOARGBROW_SSSE3
+
+#if defined(TESTING) && defined(__x86_64__)
+void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {  // TESTING-only, x86-64-only: runs of mov/lea/add over every 32-bit general register, then a simple copy loop.
+  asm volatile (
+    ".p2align  5                               \n"  // align to 2^5 = 32 bytes
+    "mov       %%eax,%%eax                     \n"  // group 1: 32-bit register-to-itself moves, eax..esp
+    "mov       %%ebx,%%ebx                     \n"
+    "mov       %%ecx,%%ecx                     \n"
+    "mov       %%edx,%%edx                     \n"
+    "mov       %%esi,%%esi                     \n"
+    "mov       %%edi,%%edi                     \n"
+    "mov       %%ebp,%%ebp                     \n"
+    "mov       %%esp,%%esp                     \n"  // NOTE(review): writes esp/ebp (and other registers) that are not in the clobber list - confirm this is only assembled for inspection, not executed.
+    ".p2align  5                               \n"
+    "mov       %%r8d,%%r8d                     \n"  // group 2: same moves for r8d..r15d
+    "mov       %%r9d,%%r9d                     \n"
+    "mov       %%r10d,%%r10d                   \n"
+    "mov       %%r11d,%%r11d                   \n"
+    "mov       %%r12d,%%r12d                   \n"
+    "mov       %%r13d,%%r13d                   \n"
+    "mov       %%r14d,%%r14d                   \n"
+    "mov       %%r15d,%%r15d                   \n"
+    ".p2align  5                               \n"
+    "lea       (%%rax),%%eax                   \n"  // group 3: lea with a 64-bit base, no displacement, 32-bit destination
+    "lea       (%%rbx),%%ebx                   \n"
+    "lea       (%%rcx),%%ecx                   \n"
+    "lea       (%%rdx),%%edx                   \n"
+    "lea       (%%rsi),%%esi                   \n"
+    "lea       (%%rdi),%%edi                   \n"
+    "lea       (%%rbp),%%ebp                   \n"
+    "lea       (%%rsp),%%esp                   \n"
+    ".p2align  5                               \n"
+    "lea       (%%r8),%%r8d                    \n"
+    "lea       (%%r9),%%r9d                    \n"
+    "lea       (%%r10),%%r10d                  \n"
+    "lea       (%%r11),%%r11d                  \n"
+    "lea       (%%r12),%%r12d                  \n"
+    "lea       (%%r13),%%r13d                  \n"
+    "lea       (%%r14),%%r14d                  \n"
+    "lea       (%%r15),%%r15d                  \n"
+
+    ".p2align  5                               \n"
+    "lea       0x10(%%rax),%%eax               \n"  // group 4: same lea forms with a 0x10 displacement
+    "lea       0x10(%%rbx),%%ebx               \n"
+    "lea       0x10(%%rcx),%%ecx               \n"
+    "lea       0x10(%%rdx),%%edx               \n"
+    "lea       0x10(%%rsi),%%esi               \n"
+    "lea       0x10(%%rdi),%%edi               \n"
+    "lea       0x10(%%rbp),%%ebp               \n"
+    "lea       0x10(%%rsp),%%esp               \n"
+    ".p2align  5                               \n"
+    "lea       0x10(%%r8),%%r8d                \n"
+    "lea       0x10(%%r9),%%r9d                \n"
+    "lea       0x10(%%r10),%%r10d              \n"
+    "lea       0x10(%%r11),%%r11d              \n"
+    "lea       0x10(%%r12),%%r12d              \n"
+    "lea       0x10(%%r13),%%r13d              \n"
+    "lea       0x10(%%r14),%%r14d              \n"
+    "lea       0x10(%%r15),%%r15d              \n"
+
+    ".p2align  5                               \n"
+    "add       0x10,%%eax                      \n"  // group 5: NOTE(review): 0x10 has no '$' prefix, so in AT&T syntax it is a memory operand at absolute address 0x10, not an immediate - confirm intent.
+    "add       0x10,%%ebx                      \n"
+    "add       0x10,%%ecx                      \n"
+    "add       0x10,%%edx                      \n"
+    "add       0x10,%%esi                      \n"
+    "add       0x10,%%edi                      \n"
+    "add       0x10,%%ebp                      \n"
+    "add       0x10,%%esp                      \n"
+    ".p2align  5                               \n"
+    "add       0x10,%%r8d                      \n"
+    "add       0x10,%%r9d                      \n"
+    "add       0x10,%%r10d                     \n"
+    "add       0x10,%%r11d                     \n"
+    "add       0x10,%%r12d                     \n"
+    "add       0x10,%%r13d                     \n"
+    "add       0x10,%%r14d                     \n"
+    "add       0x10,%%r15d                     \n"
+
+    ".p2align  2                               \n"  // align loop head to 2^2 = 4 bytes
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 bytes via operand %0 (MEMACCESS is a macro defined elsewhere)
+    "lea       " MEMLEA(0x8,0) ",%0            \n"  // src_y += 8
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // the only store: 16 bytes of xmm0 via operand %1
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_argb += 32, so only the first 16 bytes of each 32-byte step are written
+    "sub       $0x8,%2                         \n"  // pix -= 8
+    "jg        1b                              \n"  // repeat while pix > 0
+  : "+r"(src_y),     // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "xmm0", "xmm1", "xmm5"  // xmm1 and xmm5 are listed but never referenced above
+  );
+}
+#endif  // TESTING
+
+#ifdef HAS_J400TOARGBROW_SSE2
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {  // Expands each source byte into 4 output bytes (value in bytes 0-2, byte 3 forced to 0xff); 8 pixels per pass.
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
+    "pslld     $0x18,%%xmm5                    \n"  // xmm5 = 0xff000000 in every dword
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 source bytes
+    "lea       " MEMLEA(0x8,0) ",%0            \n"  // src_y += 8
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // duplicate each byte into a 16-bit word
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm0,%%xmm0                   \n"  // pixels 0-3: byte replicated across a dword
+    "punpckhwd %%xmm1,%%xmm1                   \n"  // pixels 4-7: byte replicated across a dword
+    "por       %%xmm5,%%xmm0                   \n"  // force the top byte of each dword to 0xff
+    "por       %%xmm5,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store pixels 0-3 (unaligned store)
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"  // store pixels 4-7
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_argb += 32
+    "sub       $0x8,%2                         \n"  // pix -= 8
+    "jg        1b                              \n"  // repeat while pix > 0; the body always runs at least once
+  : "+r"(src_y),     // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_J400TOARGBROW_SSE2
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {  // Converts 3-byte pixels to 4-byte pixels, 16 pixels (48 bytes in, 64 bytes out) per pass.
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
+    "pslld     $0x18,%%xmm5                    \n"
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kShuffleMaskRGB24ToARGB (contents defined elsewhere); movdqa is the aligned-load form
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // source bytes 0..15
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // source bytes 16..31
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"  // source bytes 32..47
+    "lea       " MEMLEA(0x30,0) ",%0           \n"  // src_rgb24 += 48
+    "movdqa    %%xmm3,%%xmm2                   \n"
+    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // xmm2 = source bytes 24..39 (pixels 8-11 start at byte 24)
+    "pshufb    %%xmm4,%%xmm2                   \n"  // spread to 4 bytes per pixel as dictated by the mask
+    "por       %%xmm5,%%xmm2                   \n"  // set top byte of each dword to 0xff
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // xmm1 = source bytes 12..27 (pixels 4-7 start at byte 12)
+    "pshufb    %%xmm4,%%xmm0                   \n"  // pixels 0-3
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"  // store pixels 8-11
+    "por       %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm4,%%xmm1                   \n"  // pixels 4-7
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store pixels 0-3
+    "por       %%xmm5,%%xmm1                   \n"
+    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // rotate so source bytes 36..47 (pixels 12-15) come first
+    "pshufb    %%xmm4,%%xmm3                   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"  // store pixels 4-7
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"  // store pixels 12-15
+    "lea       " MEMLEA(0x40,1) ",%1           \n"  // dst_argb += 64
+    "sub       $0x10,%2                        \n"  // pix -= 16
+    "jg        1b                              \n"  // repeat while pix > 0
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "m"(kShuffleMaskRGB24ToARGB)  // %3
+  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {  // Same instruction sequence as RGB24ToARGBRow_SSSE3 but with kShuffleMaskRAWToARGB; 16 pixels (48 bytes in, 64 out) per pass.
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
+    "pslld     $0x18,%%xmm5                    \n"
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kShuffleMaskRAWToARGB (contents defined elsewhere)
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // source bytes 0..15
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // source bytes 16..31
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"  // source bytes 32..47
+    "lea       " MEMLEA(0x30,0) ",%0           \n"  // src_raw += 48
+    "movdqa    %%xmm3,%%xmm2                   \n"
+    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // xmm2 = source bytes 24..39 (pixels 8-11)
+    "pshufb    %%xmm4,%%xmm2                   \n"
+    "por       %%xmm5,%%xmm2                   \n"  // set top byte of each dword to 0xff
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // xmm1 = source bytes 12..27 (pixels 4-7)
+    "pshufb    %%xmm4,%%xmm0                   \n"  // pixels 0-3
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"  // store pixels 8-11
+    "por       %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm4,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store pixels 0-3
+    "por       %%xmm5,%%xmm1                   \n"
+    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // rotate so source bytes 36..47 (pixels 12-15) come first
+    "pshufb    %%xmm4,%%xmm3                   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"  // store pixels 4-7
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"  // store pixels 12-15
+    "lea       " MEMLEA(0x40,1) ",%1           \n"  // dst_argb += 64
+    "sub       $0x10,%2                        \n"  // pix -= 16
+    "jg        1b                              \n"  // repeat while pix > 0
+  : "+r"(src_raw),   // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "m"(kShuffleMaskRAWToARGB)  // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {  // Expands 16-bit 5/6/5 pixels to 4 bytes each with byte 3 = 0xff; 8 pixels (16 bytes in, 32 out) per pass.
+  asm volatile (
+    "mov       $0x1080108,%%eax                \n"  // multiplier 0x0108 per word: widens a 5-bit field held in the top bits to 8 bits
+    "movd      %%eax,%%xmm5                    \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // broadcast to all four dwords
+    "mov       $0x20802080,%%eax               \n"  // multiplier 0x2080 per word: widens the 6-bit field to 8 bits
+    "movd      %%eax,%%xmm6                    \n"
+    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "psllw     $0xb,%%xmm3                     \n"  // xmm3 = 0xf800 per word (top 5 bits)
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psllw     $0xa,%%xmm4                     \n"
+    "psrlw     $0x5,%%xmm4                     \n"  // xmm4 = 0x07e0 per word (middle 6 bits)
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psllw     $0x8,%%xmm7                     \n"  // xmm7 = 0xff00 per word
+    "sub       %0,%1                           \n"  // dst -= 2*src (two subtractions) so that (%1,%0,2) tracks the output
+    "sub       %0,%1                           \n"  //   position while only src is advanced in the loop
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 8 packed 16-bit pixels
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm3,%%xmm1                   \n"  // xmm1 = top 5-bit field
+    "psllw     $0xb,%%xmm2                     \n"  // xmm2 = low 5-bit field moved to the top bits
+    "pmulhuw   %%xmm5,%%xmm1                   \n"  // 5 -> 8 bits
+    "pmulhuw   %%xmm5,%%xmm2                   \n"  // 5 -> 8 bits
+    "psllw     $0x8,%%xmm1                     \n"  // top-field result into the high byte of each word
+    "por       %%xmm2,%%xmm1                   \n"  // xmm1 words = (top field << 8) | low field
+    "pand      %%xmm4,%%xmm0                   \n"  // xmm0 = middle 6-bit field
+    "pmulhuw   %%xmm6,%%xmm0                   \n"  // 6 -> 8 bits
+    "por       %%xmm7,%%xmm0                   \n"  // high byte of each word = 0xff
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave bytes: pixels 0-3 as 4-byte values
+    "punpckhbw %%xmm0,%%xmm2                   \n"  // pixels 4-7
+    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
+    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src += 16
+    "sub       $0x8,%2                         \n"  // pix -= 8
+    "jg        1b                              \n"  // repeat while pix > 0
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  :
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {  // Expands 16-bit 1/5/5/5 pixels to 4 bytes each; 8 pixels (16 bytes in, 32 out) per pass.
+  asm volatile (
+    "mov       $0x1080108,%%eax                \n"  // multiplier 0x0108 per word: widens a 5-bit field held in the top bits to 8 bits
+    "movd      %%eax,%%xmm5                    \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // broadcast to all four dwords
+    "mov       $0x42004200,%%eax               \n"  // multiplier 0x4200 per word: widens the 5-bit field at bits 5-9 to 8 bits
+    "movd      %%eax,%%xmm6                    \n"
+    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "psllw     $0xb,%%xmm3                     \n"  // xmm3 = 0xf800 per word
+    "movdqa    %%xmm3,%%xmm4                   \n"
+    "psrlw     $0x6,%%xmm4                     \n"  // xmm4 = 0x03e0 per word (bits 5-9)
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psllw     $0x8,%%xmm7                     \n"  // xmm7 = 0xff00 per word
+    "sub       %0,%1                           \n"  // dst -= 2*src (two subtractions) so that (%1,%0,2) tracks the output
+    "sub       %0,%1                           \n"  //   position while only src is advanced in the loop
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 8 packed 16-bit pixels
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psllw     $0x1,%%xmm1                     \n"  // move bits 10-14 up to bits 11-15
+    "psllw     $0xb,%%xmm2                     \n"  // move bits 0-4 up to bits 11-15
+    "pand      %%xmm3,%%xmm1                   \n"  // keep only the 5-bit field
+    "pmulhuw   %%xmm5,%%xmm2                   \n"  // 5 -> 8 bits
+    "pmulhuw   %%xmm5,%%xmm1                   \n"  // 5 -> 8 bits
+    "psllw     $0x8,%%xmm1                     \n"  // into the high byte of each word
+    "por       %%xmm2,%%xmm1                   \n"  // xmm1 words = (bits 10-14 field << 8) | bits 0-4 field
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm4,%%xmm0                   \n"  // xmm0 = bits 5-9 field
+    "psraw     $0x8,%%xmm2                     \n"  // arithmetic shift smears bit 15 across the high byte
+    "pmulhuw   %%xmm6,%%xmm0                   \n"  // 5 -> 8 bits
+    "pand      %%xmm7,%%xmm2                   \n"  // high byte = 0xff if bit 15 was set, else 0x00
+    "por       %%xmm2,%%xmm0                   \n"  // xmm0 words = (bit-15 byte << 8) | bits 5-9 field
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave bytes: pixels 0-3 as 4-byte values
+    "punpckhbw %%xmm0,%%xmm2                   \n"  // pixels 4-7
+    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
+    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src += 16
+    "sub       $0x8,%2                         \n"  // pix -= 8
+    "jg        1b                              \n"  // repeat while pix > 0
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  :
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {  // Expands every 4-bit nibble to a byte (nibble copied into both halves); 8 pixels (16 bytes in, 32 out) per pass.
+  asm volatile (
+    "mov       $0xf0f0f0f,%%eax                \n"
+    "movd      %%eax,%%xmm4                    \n"
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"  // xmm4 = 0x0f in every byte (low-nibble mask)
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "pslld     $0x4,%%xmm5                     \n"  // xmm5 = 0xf0 in every byte (high-nibble mask)
+    "sub       %0,%1                           \n"  // dst -= 2*src (two subtractions) so that (%1,%0,2) tracks the output
+    "sub       %0,%1                           \n"  //   position while only src is advanced in the loop
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 8 packed 16-bit pixels
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm4,%%xmm0                   \n"  // xmm0 = low nibble of each byte
+    "pand      %%xmm5,%%xmm2                   \n"  // xmm2 = high nibble of each byte
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "psllw     $0x4,%%xmm1                     \n"  // low nibble copied into the high half of its byte
+    "psrlw     $0x4,%%xmm3                     \n"  // high nibble copied into the low half of its byte
+    "por       %%xmm1,%%xmm0                   \n"  // byte = low nibble repeated twice
+    "por       %%xmm3,%%xmm2                   \n"  // byte = high nibble repeated twice
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm2,%%xmm0                   \n"  // interleave: pixels 0-3 as 4-byte values
+    "punpckhbw %%xmm2,%%xmm1                   \n"  // pixels 4-7
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
+    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src += 16
+    "sub       $0x8,%2                         \n"  // pix -= 8
+    "jg        1b                              \n"  // repeat while pix > 0
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  :
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {  // Packs 4-byte pixels down to 3 bytes each; 16 pixels (64 bytes in, 48 out) per pass.
+  asm volatile (
+    "movdqa    %3,%%xmm6                       \n"  // xmm6 = kShuffleMaskARGBToRGB24 (contents defined elsewhere)
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0-3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // pixels 4-7
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"  // pixels 8-11
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"  // pixels 12-15
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src += 64
+    "pshufb    %%xmm6,%%xmm0                   \n"  // presumably leaves 12 packed bytes in the low end with the top 4 zeroed - depends on the mask, confirm
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "pshufb    %%xmm6,%%xmm2                   \n"
+    "pshufb    %%xmm6,%%xmm3                   \n"
+    "movdqa    %%xmm1,%%xmm4                   \n"
+    "psrldq    $0x4,%%xmm1                     \n"  // drop the 4 bytes of xmm1 that go into output block 0
+    "pslldq    $0xc,%%xmm4                     \n"  // first 4 bytes of xmm1 moved to byte positions 12-15
+    "movdqa    %%xmm2,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm0                   \n"  // output block 0 = 12 bytes of xmm0 + 4 bytes of xmm1
+    "pslldq    $0x8,%%xmm5                     \n"  // first 8 bytes of xmm2 moved to byte positions 8-15
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store output bytes 0..15
+    "por       %%xmm5,%%xmm1                   \n"  // output block 1 = remaining 8 bytes of xmm1 + 8 bytes of xmm2
+    "psrldq    $0x8,%%xmm2                     \n"  // remaining bytes of xmm2 moved to the bottom
+    "pslldq    $0x4,%%xmm3                     \n"  // xmm3 bytes moved up to positions 4-15
+    "por       %%xmm3,%%xmm2                   \n"  // output block 2 = remaining 4 bytes of xmm2 + 12 bytes of xmm3
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"  // store output bytes 16..31
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"  // store output bytes 32..47
+    "lea       " MEMLEA(0x30,1) ",%1           \n"  // dst += 48
+    "sub       $0x10,%2                        \n"  // pix -= 16
+    "jg        1b                              \n"  // repeat while pix > 0
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  : "m"(kShuffleMaskARGBToRGB24)  // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {  // Same instruction sequence as ARGBToRGB24Row_SSSE3 but with kShuffleMaskARGBToRAW; 16 pixels (64 bytes in, 48 out) per pass.
+  asm volatile (
+    "movdqa    %3,%%xmm6                       \n"  // xmm6 = kShuffleMaskARGBToRAW (contents defined elsewhere)
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0-3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // pixels 4-7
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"  // pixels 8-11
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"  // pixels 12-15
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src += 64
+    "pshufb    %%xmm6,%%xmm0                   \n"  // presumably leaves 12 packed bytes in the low end with the top 4 zeroed - depends on the mask, confirm
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "pshufb    %%xmm6,%%xmm2                   \n"
+    "pshufb    %%xmm6,%%xmm3                   \n"
+    "movdqa    %%xmm1,%%xmm4                   \n"
+    "psrldq    $0x4,%%xmm1                     \n"  // drop the 4 bytes of xmm1 that go into output block 0
+    "pslldq    $0xc,%%xmm4                     \n"  // first 4 bytes of xmm1 moved to byte positions 12-15
+    "movdqa    %%xmm2,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm0                   \n"  // output block 0 = 12 bytes of xmm0 + 4 bytes of xmm1
+    "pslldq    $0x8,%%xmm5                     \n"  // first 8 bytes of xmm2 moved to byte positions 8-15
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store output bytes 0..15
+    "por       %%xmm5,%%xmm1                   \n"  // output block 1 = remaining 8 bytes of xmm1 + 8 bytes of xmm2
+    "psrldq    $0x8,%%xmm2                     \n"  // remaining bytes of xmm2 moved to the bottom
+    "pslldq    $0x4,%%xmm3                     \n"  // xmm3 bytes moved up to positions 4-15
+    "por       %%xmm3,%%xmm2                   \n"  // output block 2 = remaining 4 bytes of xmm2 + 12 bytes of xmm3
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"  // store output bytes 16..31
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"  // store output bytes 32..47
+    "lea       " MEMLEA(0x30,1) ",%1           \n"  // dst += 48
+    "sub       $0x10,%2                        \n"  // pix -= 16
+    "jg        1b                              \n"  // repeat while pix > 0
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  : "m"(kShuffleMaskARGBToRAW)  // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {  // Packs 4-byte pixels to 16-bit 5/6/5 by truncation; 4 pixels (16 bytes in, 8 out) per pass.
+  asm volatile (
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "psrld     $0x1b,%%xmm3                    \n"  // xmm3 = 0x0000001f per dword (output bits 0-4)
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrld     $0x1a,%%xmm4                    \n"
+    "pslld     $0x5,%%xmm4                     \n"  // xmm4 = 0x000007e0 per dword (output bits 5-10)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pslld     $0xb,%%xmm5                     \n"  // xmm5 = 0xfffff800 per dword (output bits 11-15 plus sign extension)
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 pixels
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pslld     $0x8,%%xmm0                     \n"  // discard byte 3 so byte 2 sits at the top of the dword
+    "psrld     $0x3,%%xmm1                     \n"  // top 5 bits of byte 0 -> bits 0-4
+    "psrld     $0x5,%%xmm2                     \n"  // top 6 bits of byte 1 -> bits 5-10
+    "psrad     $0x10,%%xmm0                    \n"  // byte 2 -> bits 8-15, sign-extended so packssdw below does not saturate
+    "pand      %%xmm3,%%xmm1                   \n"
+    "pand      %%xmm4,%%xmm2                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"  // keep top 5 bits of byte 2 at bits 11-15
+    "por       %%xmm2,%%xmm1                   \n"
+    "por       %%xmm1,%%xmm0                   \n"  // combine the three fields
+    "packssdw  %%xmm0,%%xmm0                   \n"  // dwords -> words; low 8 bytes hold the 4 packed pixels
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src += 16
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 8 bytes
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst += 8
+    "sub       $0x4,%2                         \n"  // pix -= 4
+    "jg        1b                              \n"  // repeat while pix > 0
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {  // Packs 4-byte pixels to 16-bit 1/5/5/5 by truncation; 4 pixels (16 bytes in, 8 out) per pass.
+  asm volatile (
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrld     $0x1b,%%xmm4                    \n"  // xmm4 = 0x0000001f per dword (output bits 0-4)
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "pslld     $0x5,%%xmm5                     \n"  // xmm5 = 0x000003e0 (output bits 5-9)
+    "movdqa    %%xmm4,%%xmm6                   \n"
+    "pslld     $0xa,%%xmm6                     \n"  // xmm6 = 0x00007c00 (output bits 10-14)
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "pslld     $0xf,%%xmm7                     \n"  // xmm7 = 0xffff8000 (output bit 15 plus sign extension)
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 pixels
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "psrad     $0x10,%%xmm0                    \n"  // bit 31 (top bit of byte 3) lands on bit 15, sign-extended above it
+    "psrld     $0x3,%%xmm1                     \n"  // top 5 bits of byte 0 -> bits 0-4
+    "psrld     $0x6,%%xmm2                     \n"  // top 5 bits of byte 1 -> bits 5-9
+    "psrld     $0x9,%%xmm3                     \n"  // top 5 bits of byte 2 -> bits 10-14
+    "pand      %%xmm7,%%xmm0                   \n"
+    "pand      %%xmm4,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm2                   \n"
+    "pand      %%xmm6,%%xmm3                   \n"
+    "por       %%xmm1,%%xmm0                   \n"
+    "por       %%xmm3,%%xmm2                   \n"
+    "por       %%xmm2,%%xmm0                   \n"  // combine the four fields
+    "packssdw  %%xmm0,%%xmm0                   \n"  // dwords -> words; the sign extension keeps the value within int16 range
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src += 16
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 8 bytes
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst += 8
+    "sub       $0x4,%2                         \n"  // pix -= 4
+    "jg        1b                              \n"  // repeat while pix > 0
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  :: "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {  // Keeps the high nibble of every byte, packing 4-byte pixels to 2 bytes; 4 pixels (16 bytes in, 8 out) per pass.
+  asm volatile (
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psllw     $0xc,%%xmm4                     \n"  // xmm4 = 0xf000 per word (high nibble of the odd byte)
+    "movdqa    %%xmm4,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm3                     \n"  // xmm3 = 0x00f0 per word (high nibble of the even byte)
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 pixels
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm3,%%xmm0                   \n"  // even-byte high nibbles
+    "pand      %%xmm4,%%xmm1                   \n"  // odd-byte high nibbles
+    "psrlq     $0x4,%%xmm0                     \n"  // 0x00f0 -> 0x000f within each word
+    "psrlq     $0x8,%%xmm1                     \n"  // 0xf000 -> 0x00f0 within each word
+    "por       %%xmm1,%%xmm0                   \n"  // each word now holds one output byte (<= 0xff)
+    "packuswb  %%xmm0,%%xmm0                   \n"  // words -> bytes; low 8 bytes hold the 4 packed pixels
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src += 16
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 8 bytes
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst += 8
+    "sub       $0x4,%2                         \n"  // pix -= 4
+    "jg        1b                              \n"  // repeat while pix > 0
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"  // xmm2 is listed but never referenced above
+  );
+}
+#endif  // HAS_RGB24TOARGBROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values per loop pass: Y = (weighted byte sum using kARGBToY >> 7) + kAddY16.
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToY coefficients (values defined elsewhere)
+    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16 (values defined elsewhere)
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0-3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // pixels 4-7
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"  // pixels 8-11
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"  // pixels 12-15
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // multiply pixel bytes by coefficients, summing adjacent pairs into words
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb += 64
+    "phaddw    %%xmm1,%%xmm0                   \n"  // add the two partial sums of each pixel: 8 words for pixels 0-7
+    "phaddw    %%xmm3,%%xmm2                   \n"  // 8 words for pixels 8-15
+    "psrlw     $0x7,%%xmm0                     \n"  // scale down by 128
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 bytes
+    "paddb     %%xmm5,%%xmm0                   \n"  // add the kAddY16 offset
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y bytes
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16
+    "sub       $0x10,%2                        \n"  // pix -= 16
+    "jg        1b                              \n"  // repeat while pix > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kARGBToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_ARGBTOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values per loop pass.
+// Same as ARGBToYRow but uses kARGBToYJ coefficients, adds kAddYJ64 before the >> 7 (rounding), and adds no offset afterwards.
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ coefficients (values defined elsewhere)
+    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64 (values defined elsewhere)
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0-3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // pixels 4-7
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"  // pixels 8-11
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"  // pixels 12-15
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // multiply pixel bytes by coefficients, summing adjacent pairs into words
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb += 64
+    "phaddw    %%xmm1,%%xmm0                   \n"  // add the two partial sums of each pixel: 8 words for pixels 0-7
+    "phaddw    %%xmm3,%%xmm2                   \n"  // 8 words for pixels 8-15
+    "paddw     %%xmm5,%%xmm0                   \n"  // add the rounding constant before shifting
+    "paddw     %%xmm5,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"  // scale down by 128
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 bytes
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 output bytes
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16
+    "sub       $0x10,%2                        \n"  // pix -= 16
+    "jg        1b                              \n"  // repeat while pix > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kARGBToYJ),  // %3
+    "m"(kAddYJ64)    // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBTOYJROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd dword indices that undo the lane interleaving left by vphaddw + vpackuswb (operand %5 of ARGBToYRow_AVX2 and ARGBToYJRow_AVX2).
+static const lvec32 kPermdARGBToY_AVX = {
+  0, 4, 1, 5, 2, 6, 3, 7  // output dword i takes source dword [i]: alternates between low-lane and high-lane dwords
+};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values per loop pass: Y = (weighted byte sum using kARGBToY >> 7) + kAddY16.
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm4                  \n"  // kARGBToY copied into both 128-bit lanes
+    "vbroadcastf128 %4,%%ymm5                  \n"  // kAddY16 copied into both 128-bit lanes
+    "vmovdqu    %5,%%ymm6                      \n"  // ymm6 = kPermdARGBToY_AVX dword indices
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // pixels 0-7
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"  // pixels 8-15
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"  // pixels 16-23
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"  // pixels 24-31
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // multiply pixel bytes by coefficients, summing adjacent pairs into words
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "lea       " MEMLEA(0x80,0) ",%0           \n"  // src_argb += 128
+    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates: works per 128-bit lane, so pixel order is interleaved across lanes.
+    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
+    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"  // scale down by 128
+    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
+    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates: also per-lane packing.
+    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate: reorder dwords with the ymm6 indices.
+    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 Y bytes
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_y += 32
+    "sub       $0x20,%2                        \n"  // pix -= 32
+    "jg        1b                              \n"  // repeat while pix > 0
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kARGBToY),   // %3
+    "m"(kAddY16),    // %4
+    "m"(kPermdARGBToY_AVX)  // %5
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"  // ymm0-ymm6 are used; listed under their xmm names
+  );
+}
+#endif  // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 YJ values per loop pass: kARGBToYJ coefficients, kAddYJ64 added before the >> 7, no offset afterwards.
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm4                  \n"  // kARGBToYJ copied into both 128-bit lanes
+    "vbroadcastf128 %4,%%ymm5                  \n"  // kAddYJ64 copied into both 128-bit lanes
+    "vmovdqu    %5,%%ymm6                      \n"  // ymm6 = kPermdARGBToY_AVX dword indices
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // pixels 0-7
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"  // pixels 8-15
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"  // pixels 16-23
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"  // pixels 24-31
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // multiply pixel bytes by coefficients, summing adjacent pairs into words
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "lea       " MEMLEA(0x80,0) ",%0           \n"  // src_argb += 128
+    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates: works per 128-bit lane, so pixel order is interleaved across lanes.
+    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
+    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
+    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
+    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"  // scale down by 128
+    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
+    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates: also per-lane packing.
+    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate: reorder dwords with the ymm6 indices.
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 output bytes
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_y += 32
+    "sub       $0x20,%2                        \n"  // pix -= 32
+    "jg        1b                              \n"  // repeat while pix > 0
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kARGBToYJ),   // %3
+    "m"(kAddYJ64),    // %4
+    "m"(kPermdARGBToY_AVX)  // %5
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"  // ymm0-ymm6 are used; listed under their xmm names
+  );
+}
+#endif  // HAS_ARGBTOYJROW_AVX2
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,  // Averages 2x2 pixel boxes from two rows (src_argb0 and src_argb0 + src_stride_argb) and
+                       uint8* dst_u, uint8* dst_v, int width) {  //   writes one U and one V byte per box; 16 source pixels -> 8 U + 8 V bytes per pass.
+  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"  // xmm3 = kARGBToV coefficients (values defined elsewhere)
+    "movdqa    %6,%%xmm4                       \n"  // xmm4 = kARGBToU coefficients
+    "movdqa    %7,%%xmm5                       \n"  // xmm5 = kAddUV128 offset
+    "sub       %1,%2                           \n"  // dst_v -= dst_u, so (%1,%2,1) addresses dst_v while only dst_u advances
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // row 0, pixels 0-3
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"  // vertical average with the row one stride away
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // pixels 4-7
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"  // pixels 8-11
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"  // pixels 12-15
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb0 += 64
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even-numbered pixels of 0-7
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd-numbered pixels of 0-7
+    "pavgb     %%xmm7,%%xmm0                   \n"  // horizontal average -> 4 box averages
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"  // even-numbered pixels of 8-15
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"  // odd-numbered pixels of 8-15
+    "pavgb     %%xmm7,%%xmm2                   \n"  // 4 more box averages
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U: multiply by kARGBToU, summing adjacent byte pairs
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V: multiply by kARGBToV, summing adjacent byte pairs
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"  // 8 U sums
+    "phaddw    %%xmm6,%%xmm1                   \n"  // 8 V sums
+    "psraw     $0x8,%%xmm0                     \n"  // signed scale down by 256
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 bytes = U, high 8 bytes = V
+    "paddb     %%xmm5,%%xmm0                   \n"  // add the kAddUV128 offset
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (dst_v follows through the offset in %2)
+    "sub       $0x10,%3                        \n"  // width -= 16
+    "jg        1b                              \n"  // repeat while width > 0
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kARGBToV),  // %5
+    "m"(kARGBToU),  // %6
+    "m"(kAddUV128)  // %7
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"  // xmm3-xmm5 are written by the constant loads above, so they must be declared clobbered too
+  );
+}
+#endif  // HAS_ARGBTOUVROW_SSSE3
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+// vpshufb control that interleaves the 16-bit words of the low and high 8 bytes of each 128-bit lane (reorders shorts after vphaddw + the byte pack).
+static const lvec8 kShufARGBToUV_AVX = {
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,  // lane 0: words 0,4,1,5,2,6,3,7
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15  // lane 1: same pattern
+};
+// Computes one row of U and one row of V from two rows of 4-byte pixels
+// using AVX2.
+//   src_argb0       - first source row; the second row is read at
+//                     src_argb0 + src_stride_argb.
+//   src_stride_argb - byte offset from the first row to the second row.
+//   dst_u, dst_v    - destination rows; 16 bytes are written to each per
+//                     loop iteration.
+//   width           - decremented by 32 per iteration; the loop runs while
+//                     the result is > 0 and always executes at least once.
+// Each iteration consumes 0x80 source bytes per row, averages the two rows,
+// then averages horizontally adjacent pixels before applying the kARGBToU /
+// kARGBToV coefficients and the kAddUV128 byte bias.
+// NOTE(review): the loop works in whole 32-pixel steps, so callers presumably
+// pass a width that is a multiple of 32 - confirm.
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    // Broadcast the 128-bit constants to both halves:
+    // ymm5 = kAddUV128, ymm6 = kARGBToV, ymm7 = kARGBToU.
+    "vbroadcastf128 %5,%%ymm5                  \n"
+    "vbroadcastf128 %6,%%ymm6                  \n"
+    "vbroadcastf128 %7,%%ymm7                  \n"
+    // Turn dst_v into an offset from dst_u so only %1 needs advancing.
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    // Load 128 bytes of the first row and average with the second row.
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
+    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
+    "lea       " MEMLEA(0x80,0) ",%0           \n"
+    // Split even (0x88) and odd (0xdd) dwords and average them, halving the
+    // horizontal resolution.
+    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
+    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
+    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
+    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
+    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
+    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
+
+    // ymm1/ymm3 take the kARGBToU products, ymm0/ymm2 the kARGBToV products.
+    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
+    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
+    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
+    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
+    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
+    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
+    // Reorder the packed bytes (kShufARGBToUV_AVX is operand %8), then add
+    // the kAddUV128 byte bias.
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vpshufb    %8,%%ymm0,%%ymm0               \n"
+    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"
+
+    // Low 128 bits go to dst_u, high 128 bits to dst_u + offset (dst_v).
+    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
+    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x20,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kAddUV128),  // %5
+    "m"(kARGBToV),   // %6
+    "m"(kARGBToU),   // %7
+    "m"(kShufARGBToUV_AVX)  // %8
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_SSSE3
+// Computes one row of U and one row of V from two rows of 4-byte pixels,
+// using the kARGBToUJ / kARGBToVJ coefficient tables and the kAddUVJ128
+// word bias.
+//   src_argb0       - first source row; the second row is read at
+//                     src_argb0 + src_stride_argb.
+//   src_stride_argb - byte offset from the first row to the second row.
+//   dst_u, dst_v    - destination rows; 8 bytes are written to each per
+//                     loop iteration.
+//   width           - decremented by 16 per iteration; the loop runs while
+//                     the result is > 0 and always executes at least once.
+// Each iteration consumes 0x40 source bytes per row, averages the two rows,
+// then averages horizontally adjacent pixels before the multiply-add.
+// Unlike the non-J variants, the bias is added to the 16-bit sums (paddw)
+// before the arithmetic shift rather than to the packed bytes.
+// NOTE(review): the loop works in whole 16-pixel steps, so callers presumably
+// pass a width that is a multiple of 16 - confirm.
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    // xmm3 = kARGBToVJ, xmm4 = kARGBToUJ, xmm5 = kAddUVJ128.
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
+    // Turn dst_v into an offset from dst_u so only %1 needs advancing.
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    // Load 64 bytes of the first row and average with the second row.
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    // Split even (0x88) and odd (0xdd) dwords and average them, halving the
+    // horizontal resolution.
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    // xmm0/xmm2 take the U products (xmm4), xmm1/xmm6 the V products (xmm3).
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "paddw     %%xmm5,%%xmm0                   \n"
+    "paddw     %%xmm5,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    // Low 8 bytes go to dst_u, high 8 bytes to dst_u + offset (dst_v).
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kARGBToVJ),  // %5
+    "m"(kARGBToUJ),  // %6
+    "m"(kAddUVJ128)  // %7
+  // xmm3-xmm5 are written by the initial movdqa loads and must be declared
+  // as clobbered along with the working registers.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTOUVJROW_SSSE3
+
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+// Computes one U byte and one V byte per source pixel (no subsampling) from
+// a single row of 4-byte pixels, using kARGBToU / kARGBToV and the kAddUV128
+// byte bias.
+//   src_argb     - source row; 0x40 bytes are consumed per loop iteration.
+//   dst_u, dst_v - destination rows; 16 bytes are written to each per
+//                  iteration.
+//   width        - decremented by 16 per iteration; the loop runs while the
+//                  result is > 0 and always executes at least once.
+// The same 64 source bytes are loaded twice: once for the U pass and once
+// for the V pass.
+// NOTE(review): the loop works in whole 16-pixel steps, so callers presumably
+// pass a width that is a multiple of 16 - confirm.
+void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                          int width) {
+  asm volatile (
+    // xmm3 = kARGBToV, xmm4 = kARGBToU, xmm5 = kAddUV128.
+    "movdqa    %4,%%xmm3                       \n"
+    "movdqa    %5,%%xmm4                       \n"
+    "movdqa    %6,%%xmm5                       \n"
+    // Turn dst_v into an offset from dst_u so only %1 needs advancing.
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    // U pass: multiply-add with xmm4, shift, pack, bias, store to dst_u.
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm6                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm2                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm2                     \n"
+    "packsswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    // V pass: reload the same pixels and repeat with xmm3.
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    "pmaddubsw %%xmm3,%%xmm0                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm2                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm2                     \n"
+    "packsswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),        // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "m"(kARGBToV),  // %4
+    "m"(kARGBToU),  // %5
+    "m"(kAddUV128)  // %6
+  // xmm3-xmm5 are written by the initial movdqa loads and must be declared
+  // as clobbered along with the working registers.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBTOUV444ROW_SSSE3
+
+#ifdef HAS_ARGBTOUV422ROW_SSSE3
+// Computes one row of U and one row of V from a single row of 4-byte
+// pixels, averaging horizontally adjacent pixels (no vertical averaging),
+// using kARGBToU / kARGBToV and the kAddUV128 byte bias.
+//   src_argb0    - source row; 0x40 bytes are consumed per loop iteration.
+//   dst_u, dst_v - destination rows; 8 bytes are written to each per
+//                  iteration.
+//   width        - decremented by 16 per iteration; the loop runs while the
+//                  result is > 0 and always executes at least once.
+// NOTE(review): the loop works in whole 16-pixel steps, so callers presumably
+// pass a width that is a multiple of 16 - confirm.
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    // xmm3 = kARGBToV, xmm4 = kARGBToU, xmm5 = kAddUV128.
+    "movdqa    %4,%%xmm3                       \n"
+    "movdqa    %5,%%xmm4                       \n"
+    "movdqa    %6,%%xmm5                       \n"
+    // Turn dst_v into an offset from dst_u so only %1 needs advancing.
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    // Split even (0x88) and odd (0xdd) dwords and average them, halving the
+    // horizontal resolution.
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    // xmm0/xmm2 take the U products (xmm4), xmm1/xmm6 the V products (xmm3).
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    // Low 8 bytes go to dst_u, high 8 bytes to dst_u + offset (dst_v).
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "m"(kARGBToV),  // %4
+    "m"(kARGBToU),  // %5
+    "m"(kAddUV128)  // %6
+  // xmm3-xmm5 are written by the initial movdqa loads and must be declared
+  // as clobbered along with the working registers.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTOUV422ROW_SSSE3
+
+// Computes one row of Y bytes from a row of 4-byte pixels using the
+// kBGRAToY coefficients and the kAddY16 byte bias.
+//   src_bgra - source row; 0x40 bytes are consumed per loop iteration.
+//   dst_y    - destination row; 16 bytes are written per iteration.
+//   pix      - decremented by 16 per iteration; the loop runs while the
+//              result is > 0 and always executes at least once.
+// NOTE(review): the loop works in whole 16-pixel steps, so callers presumably
+// pass a pix count that is a multiple of 16 - confirm.
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
+  asm volatile (
+    // xmm5 = kAddY16 (bias), xmm4 = kBGRAToY (coefficients).
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    // Multiply-add channel bytes by the coefficients, then sum each pixel's
+    // two partial products with phaddw.
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    // Scale down by 7 bits, pack to unsigned bytes and add the bias.
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kBGRAToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+// Computes one row of U and one row of V from two rows of 4-byte pixels,
+// using the kBGRAToU / kBGRAToV coefficient tables and the kAddUV128 byte
+// bias.
+//   src_bgra0       - first source row; the second row is read at
+//                     src_bgra0 + src_stride_bgra.
+//   src_stride_bgra - byte offset from the first row to the second row.
+//   dst_u, dst_v    - destination rows; 8 bytes are written to each per
+//                     loop iteration.
+//   width           - decremented by 16 per iteration; the loop runs while
+//                     the result is > 0 and always executes at least once.
+// Each iteration consumes 0x40 source bytes per row, averages the two rows,
+// then averages horizontally adjacent pixels before the multiply-add.
+// NOTE(review): the loop works in whole 16-pixel steps, so callers presumably
+// pass a width that is a multiple of 16 - confirm.
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    // xmm3 = kBGRAToV, xmm4 = kBGRAToU, xmm5 = kAddUV128.
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
+    // Turn dst_v into an offset from dst_u so only %1 needs advancing.
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    // Load 64 bytes of the first row and average with the second row.
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    // Split even (0x88) and odd (0xdd) dwords and average them, halving the
+    // horizontal resolution.
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    // xmm0/xmm2 take the U products (xmm4), xmm1/xmm6 the V products (xmm3).
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    // Low 8 bytes go to dst_u, high 8 bytes to dst_u + offset (dst_v).
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_bgra0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_bgra)), // %4
+    "m"(kBGRAToV),  // %5
+    "m"(kBGRAToU),  // %6
+    "m"(kAddUV128)  // %7
+  // xmm3-xmm5 are written by the initial movdqa loads and must be declared
+  // as clobbered along with the working registers.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+// Computes one row of Y bytes from a row of 4-byte pixels using the
+// kABGRToY coefficients and the kAddY16 byte bias.
+//   src_abgr - source row; 0x40 bytes are consumed per loop iteration.
+//   dst_y    - destination row; 16 bytes are written per iteration.
+//   pix      - decremented by 16 per iteration; the loop runs while the
+//              result is > 0 and always executes at least once.
+// NOTE(review): the loop works in whole 16-pixel steps, so callers presumably
+// pass a pix count that is a multiple of 16 - confirm.
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
+  asm volatile (
+    // xmm5 = kAddY16 (bias), xmm4 = kABGRToY (coefficients).
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    // Multiply-add channel bytes by the coefficients, then sum each pixel's
+    // two partial products with phaddw.
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    // Scale down by 7 bits, pack to unsigned bytes and add the bias.
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kABGRToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+// Computes one row of Y bytes from a row of 4-byte pixels using the
+// kRGBAToY coefficients and the kAddY16 byte bias.
+//   src_rgba - source row; 0x40 bytes are consumed per loop iteration.
+//   dst_y    - destination row; 16 bytes are written per iteration.
+//   pix      - decremented by 16 per iteration; the loop runs while the
+//              result is > 0 and always executes at least once.
+// NOTE(review): the loop works in whole 16-pixel steps, so callers presumably
+// pass a pix count that is a multiple of 16 - confirm.
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
+  asm volatile (
+    // xmm5 = kAddY16 (bias), xmm4 = kRGBAToY (coefficients).
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    // Multiply-add channel bytes by the coefficients, then sum each pixel's
+    // two partial products with phaddw.
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    // Scale down by 7 bits, pack to unsigned bytes and add the bias.
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kRGBAToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+// Computes one row of U and one row of V from two rows of 4-byte pixels,
+// using the kABGRToU / kABGRToV coefficient tables and the kAddUV128 byte
+// bias.
+//   src_abgr0       - first source row; the second row is read at
+//                     src_abgr0 + src_stride_abgr.
+//   src_stride_abgr - byte offset from the first row to the second row.
+//   dst_u, dst_v    - destination rows; 8 bytes are written to each per
+//                     loop iteration.
+//   width           - decremented by 16 per iteration; the loop runs while
+//                     the result is > 0 and always executes at least once.
+// Each iteration consumes 0x40 source bytes per row, averages the two rows,
+// then averages horizontally adjacent pixels before the multiply-add.
+// NOTE(review): the loop works in whole 16-pixel steps, so callers presumably
+// pass a width that is a multiple of 16 - confirm.
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    // xmm3 = kABGRToV, xmm4 = kABGRToU, xmm5 = kAddUV128.
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
+    // Turn dst_v into an offset from dst_u so only %1 needs advancing.
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    // Load 64 bytes of the first row and average with the second row.
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    // Split even (0x88) and odd (0xdd) dwords and average them, halving the
+    // horizontal resolution.
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    // xmm0/xmm2 take the U products (xmm4), xmm1/xmm6 the V products (xmm3).
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    // Low 8 bytes go to dst_u, high 8 bytes to dst_u + offset (dst_v).
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_abgr0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_abgr)), // %4
+    "m"(kABGRToV),  // %5
+    "m"(kABGRToU),  // %6
+    "m"(kAddUV128)  // %7
+  // xmm3-xmm5 are written by the initial movdqa loads and must be declared
+  // as clobbered along with the working registers.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+// Computes one row of U and one row of V from two rows of 4-byte pixels,
+// using the kRGBAToU / kRGBAToV coefficient tables and the kAddUV128 byte
+// bias.
+//   src_rgba0       - first source row; the second row is read at
+//                     src_rgba0 + src_stride_rgba.
+//   src_stride_rgba - byte offset from the first row to the second row.
+//   dst_u, dst_v    - destination rows; 8 bytes are written to each per
+//                     loop iteration.
+//   width           - decremented by 16 per iteration; the loop runs while
+//                     the result is > 0 and always executes at least once.
+// Each iteration consumes 0x40 source bytes per row, averages the two rows,
+// then averages horizontally adjacent pixels before the multiply-add.
+// NOTE(review): the loop works in whole 16-pixel steps, so callers presumably
+// pass a width that is a multiple of 16 - confirm.
+void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    // xmm3 = kRGBAToV, xmm4 = kRGBAToU, xmm5 = kAddUV128.
+    "movdqa    %5,%%xmm3                       \n"
+    "movdqa    %6,%%xmm4                       \n"
+    "movdqa    %7,%%xmm5                       \n"
+    // Turn dst_v into an offset from dst_u so only %1 needs advancing.
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    // Load 64 bytes of the first row and average with the second row.
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    // Split even (0x88) and odd (0xdd) dwords and average them, halving the
+    // horizontal resolution.
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    // xmm0/xmm2 take the U products (xmm4), xmm1/xmm6 the V products (xmm3).
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    // Low 8 bytes go to dst_u, high 8 bytes to dst_u + offset (dst_v).
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgba0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_rgba)), // %4
+    "m"(kRGBAToV),  // %5
+    "m"(kRGBAToU),  // %6
+    "m"(kAddUV128)  // %7
+  // xmm3-xmm5 are written by the initial movdqa loads and must be declared
+  // as clobbered along with the working registers.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
+
+// Constant table for YUV to RGB conversion. The YUVTORGB macro addresses the
+// members by fixed byte offset (0, 32, 64, 96, 128, 160, 192), so the member
+// order and sizes must not change; the trailing comment on each field is its
+// byte offset.
+struct YuvConstants {
+  lvec8 kUVToB;     // 0    UV byte coefficients for the B term
+  lvec8 kUVToG;     // 32   UV byte coefficients for the G term
+  lvec8 kUVToR;     // 64   UV byte coefficients for the R term
+  lvec16 kUVBiasB;  // 96   16-bit bias the B product is subtracted from
+  lvec16 kUVBiasG;  // 128  16-bit bias the G product is subtracted from
+  lvec16 kUVBiasR;  // 160  16-bit bias the R product is subtracted from
+  lvec16 kYToRgb;   // 192  16-bit Y multiplier (used with pmulhuw)
+};
+
+// BT.601 YUV to RGB reference
+//  R = (Y - 16) * 1.164              - V * -1.596
+//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
+//  B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B.  Scale and bias.
+// The coefficients below are in 6-bit fixed point (scaled by 64).
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+// UB is clamped to -128 so that it fits in a signed byte.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+// Each is the U/V coefficient(s) times 128 plus the Y bias YGB.
+#define BB (UB * 128            + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR            (VR * 128 + YGB)
+
+// BT601 constants for YUV to RGB.
+// The first three rows hold (U coefficient, V coefficient) byte pairs for
+// B, G and R; the next three hold the per-channel biases; the last holds the
+// Y multiplier. Row order matches the YuvConstants member order.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+// Same values as kYuvConstants, but each coefficient byte pair in the first
+// three rows is swapped to (V coefficient, U coefficient); the bias and
+// Y multiplier rows are unchanged.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+// The BT.601 helper macros are only needed to build the two tables above.
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// *  R = Y                - V * -1.40200
+// *  G = Y - U *  0.34414 - V *  0.71414
+// *  B = Y - U * -1.77200
+
+// Y contribution to R,G,B.  Scale and bias.
+// The coefficients below are in 6-bit fixed point (scaled by 64).
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32  /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414  * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to subtract 128 from U and V. Unlike the BT.601 set there is
+// no Y offset here: YGBJ contributes only the 64 / 2 rounding term.
+#define BBJ (UBJ * 128             + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ             (VRJ * 128 + YGBJ)
+
+// JPEG constants for YUV to RGB.
+// Row order matches the YuvConstants member order: three rows of
+// (U coefficient, V coefficient) byte pairs for B, G and R, three bias rows,
+// then the Y multiplier.
+// NOTE(review): unlike kYuvConstants and kYvuConstants this table is not
+// declared static - confirm whether external linkage is intended.
+YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
+    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
+  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
+  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
+    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
+  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
+    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
+  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
+    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
+  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
+    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
+  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
+    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
+};
+
+// The JPEG helper macros are only needed to build the table above.
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
+// Read 8 UV from 444
+// Loads 8 U bytes from [u_buf] and 8 V bytes from [u_buf] + [v_buf], then
+// interleaves them into 8 UV byte pairs in xmm0. Advances u_buf by 8 and
+// overwrites xmm1.
+// NOTE(review): V is addressed as base u_buf plus index v_buf, so v_buf
+// presumably holds the V plane's offset from u_buf - confirm in callers.
+#define READYUV444                                                             \
+    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"
+
+// Read 4 UV from 422, upsample to 8 UV
+// Loads 4 U bytes from [u_buf] and 4 V bytes from [u_buf] + [v_buf],
+// interleaves them into UV pairs, then duplicates each pair (punpcklwd) so
+// xmm0 holds 8 UV pairs. Advances u_buf by 4 and overwrites xmm1.
+// NOTE(review): V is addressed as base u_buf plus index v_buf, so v_buf
+// presumably holds the V plane's offset from u_buf - confirm in callers.
+#define READYUV422                                                             \
+    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"
+
+// Read 2 UV from 411, upsample to 8 UV
+// Interleaves U and V bytes into pairs, then duplicates the pairs twice
+// (punpcklwd, punpckldq) so each of the first 2 UV pairs appears 4 times in
+// xmm0. Advances u_buf by 2 and overwrites xmm1.
+// Note that each movd loads 4 bytes from its plane even though only the
+// first 2 contribute to the result.
+// NOTE(review): V is addressed as base u_buf plus index v_buf, so v_buf
+// presumably holds the V plane's offset from u_buf - confirm in callers.
+#define READYUV411                                                             \
+    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "punpckldq  %%xmm0,%%xmm0                                   \n"
+
+// Read 4 UV from NV12, upsample to 8 UV
+// Loads 8 bytes (4 already-interleaved byte pairs) from [uv_buf] and
+// duplicates each 16-bit pair (punpcklwd) so xmm0 holds 8 pairs.
+// Advances uv_buf by 8.
+#define READNV12                                                               \
+    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
+    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"
+
+// Convert 8 pixels: 8 UV and 8 Y
+// Input:  xmm0 = 8 UV byte pairs (as produced by the READ* macros above),
+//         [y_buf] = 8 Y bytes, [YuvConstants] = address of a YuvConstants
+//         table, read at byte offsets 0/32/64 (UV coefficients),
+//         96/128/160 (biases) and 192 (Y multiplier).
+// Output: xmm0 = B, xmm1 = G, xmm2 = R, each as unsigned bytes produced by
+//         packuswb of a register with itself (so both 8-byte halves hold the
+//         same 8 values).
+// Per channel: bias - pmaddubsw(UV, coefficients) + pmulhuw(Y * 0x0101,
+// kYToRgb), saturating add, then arithmetic shift right by 6.
+// Advances y_buf by 8 and overwrites xmm3.
+#define YUVTORGB(YuvConstants)                                                 \
+    "movdqa     %%xmm0,%%xmm1                                   \n"            \
+    "movdqa     %%xmm0,%%xmm2                                   \n"            \
+    "movdqa     %%xmm0,%%xmm3                                   \n"            \
+    "movdqa     " MEMACCESS2(96, [YuvConstants]) ",%%xmm0       \n"            \
+    "pmaddubsw  " MEMACCESS([YuvConstants]) ",%%xmm1            \n"            \
+    "psubw      %%xmm1,%%xmm0                                   \n"            \
+    "movdqa     " MEMACCESS2(128, [YuvConstants]) ",%%xmm1      \n"            \
+    "pmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%xmm2       \n"            \
+    "psubw      %%xmm2,%%xmm1                                   \n"            \
+    "movdqa     " MEMACCESS2(160, [YuvConstants]) ",%%xmm2      \n"            \
+    "pmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%xmm3       \n"            \
+    "psubw      %%xmm3,%%xmm2                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
+    "punpcklbw  %%xmm3,%%xmm3                                   \n"            \
+    "pmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%xmm3      \n"            \
+    "paddsw     %%xmm3,%%xmm0                                   \n"            \
+    "paddsw     %%xmm3,%%xmm1                                   \n"            \
+    "paddsw     %%xmm3,%%xmm2                                   \n"            \
+    "psraw      $0x6,%%xmm0                                     \n"            \
+    "psraw      $0x6,%%xmm1                                     \n"            \
+    "psraw      $0x6,%%xmm2                                     \n"            \
+    "packuswb   %%xmm0,%%xmm0                                   \n"            \
+    "packuswb   %%xmm1,%%xmm1                                   \n"            \
+    "packuswb   %%xmm2,%%xmm2                                   \n"
+
+// Store 8 ARGB values (32 bytes). Uses XMM5 as the alpha bytes: callers preset it to all ones (0xff), not zero.
+#define STOREARGB                                                              \
+    "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
+    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
+    "movdqa     %%xmm0,%%xmm1                                    \n"           \
+    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
+    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
+    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
+    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
+    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
+
+// Store 8 BGRA values (32 bytes). Sets XMM5 to all ones (alpha) itself on every use, then overwrites it while weaving.
+#define STOREBGRA                                                              \
+    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
+    "punpcklbw %%xmm0,%%xmm1                                     \n"           \
+    "punpcklbw %%xmm2,%%xmm5                                     \n"           \
+    "movdqa    %%xmm5,%%xmm0                                     \n"           \
+    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
+    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
+    "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "                  \n"           \
+    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_bgra]) "           \n"           \
+    "lea       " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra]         \n"
+
+// Store 8 ABGR values (32 bytes). Uses XMM5 as the alpha bytes: callers preset it to all ones (0xff), not zero.
+#define STOREABGR                                                              \
+    "punpcklbw %%xmm1,%%xmm2                                     \n"           \
+    "punpcklbw %%xmm5,%%xmm0                                     \n"           \
+    "movdqa    %%xmm2,%%xmm1                                     \n"           \
+    "punpcklwd %%xmm0,%%xmm2                                     \n"           \
+    "punpckhwd %%xmm0,%%xmm1                                     \n"           \
+    "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "                  \n"           \
+    "movdqu    %%xmm1," MEMACCESS2(0x10, [dst_abgr]) "           \n"           \
+    "lea       " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr]         \n"
+
+// Store 8 RGBA values (32 bytes). Sets XMM5 to all ones (alpha) itself on every use, then overwrites it while weaving.
+#define STORERGBA                                                              \
+    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
+    "punpcklbw %%xmm2,%%xmm1                                     \n"           \
+    "punpcklbw %%xmm0,%%xmm5                                     \n"           \
+    "movdqa    %%xmm5,%%xmm0                                     \n"           \
+    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
+    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
+    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
+    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
+    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,  // Y samples; 8 consumed per pass.
+                                const uint8* u_buf,  // U samples; presumably loaded by READYUV444.
+                                const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                                uint8* dst_argb,  // Output; 32 bytes written per pass.
+                                int width) {  // Pixel count; stepped down by 8.
+  asm volatile (  // Row converter: READYUV444 + YUVTORGB + STOREARGB per 8 pixels.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: alpha for STOREARGB.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV444
+    YUVTORGB(kYuvConstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]: table base; YUVTORGB indexes from it.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+// TODO(fbarchard): Consider putting masks into constants.
+void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,  // Y samples; 8 consumed per pass.
+                                 const uint8* u_buf,  // U samples; presumably loaded by READYUV422.
+                                 const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                                 uint8* dst_rgb24,  // Output; 24 bytes written per pass.
+                                 int width) {  // Pixel count; stepped down by 8.
+  asm volatile (
+    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"  // Shuffle mask applied to the first 4 pixels.
+    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"  // Shuffle mask applied to the next 4 pixels.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // Interleave xmm0/xmm1 bytes (B,G).
+    "punpcklbw %%xmm2,%%xmm2                   \n"  // Duplicate xmm2 bytes (R,R); no alpha is needed.
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"  // First 4 pixels, 4 bytes each.
+    "punpckhwd %%xmm2,%%xmm1                   \n"  // Next 4 pixels, 4 bytes each.
+    "pshufb    %%xmm5,%%xmm0                   \n"  // Presumably packs to 3 bytes per pixel; masks are defined elsewhere.
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // xmm1 = bytes 12..27 of the xmm1:xmm0 pair.
+    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"  // Output bytes 0..7.
+    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"  // Output bytes 8..23.
+    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"  // Advance by 24 bytes (8 pixels * 3).
+    "subl      $0x8,%[width]                   \n"  // Explicit 'l' size: width may be a memory operand.
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
+// TODO(fbarchard): Make width a register for 32 bit.
+#if defined(__i386__) && defined(__pic__)
+    [width]"+m"(width)     // %[width]: memory-only on i386 PIC builds (see TODO above).
+#else
+    [width]"+rm"(width)    // %[width]
+#endif
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB),  // Table base; YUVTORGB indexes from it.
+    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
+  );
+}
+
+void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,  // Y samples; 8 consumed per pass.
+                               const uint8* u_buf,  // U samples; presumably loaded by READYUV422.
+                               const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                               uint8* dst_raw,  // Output; 24 bytes written per pass.
+                               int width) {  // Pixel count; stepped down by 8.
+  asm volatile (  // Same sequence as I422ToRGB24Row_SSSE3, but with the RAW shuffle masks.
+    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"  // Shuffle mask applied to the first 4 pixels.
+    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"  // Shuffle mask applied to the next 4 pixels.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // Interleave xmm0/xmm1 bytes (B,G).
+    "punpcklbw %%xmm2,%%xmm2                   \n"  // Duplicate xmm2 bytes (R,R); no alpha is needed.
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"  // First 4 pixels, 4 bytes each.
+    "punpckhwd %%xmm2,%%xmm1                   \n"  // Next 4 pixels, 4 bytes each.
+    "pshufb    %%xmm5,%%xmm0                   \n"  // Presumably packs to 3 bytes per pixel; masks are defined elsewhere.
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // xmm1 = bytes 12..27 of the xmm1:xmm0 pair.
+    "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"  // Output bytes 0..7.
+    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"  // Output bytes 8..23.
+    "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"  // Advance by 24 bytes (8 pixels * 3).
+    "subl      $0x8,%[width]                   \n"  // Explicit 'l' size: width may be a memory operand.
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_raw]"+r"(dst_raw),  // %[dst_raw]
+// TODO(fbarchard): Make width a register for 32 bit.
+#if defined(__i386__) && defined(__pic__)
+    [width]"+m"(width)    // %[width]: memory-only on i386 PIC builds (see TODO above).
+#else
+    [width]"+rm"(width)    // %[width]
+#endif
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB),  // Table base; YUVTORGB indexes from it.
+    [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
+    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
+  );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,  // Y samples; 8 consumed per pass.
+                                const uint8* u_buf,  // U samples; presumably loaded by READYUV422.
+                                const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                                uint8* dst_argb,  // Output; 32 bytes written per pass.
+                                int width) {  // Pixel count; stepped down by 8.
+  asm volatile (  // Row converter: READYUV422 + YUVTORGB + STOREARGB per 8 pixels.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: alpha for STOREARGB.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]: table base; YUVTORGB indexes from it.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,  // Y samples; 8 consumed per pass.
+                                const uint8* u_buf,  // U samples; presumably loaded by READYUV422.
+                                const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                                uint8* dst_argb,  // Output; 32 bytes written per pass.
+                                int width) {  // Pixel count; stepped down by 8.
+  asm volatile (  // Same code as I422ToARGBRow_SSSE3; only the constant table differs.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: alpha for STOREARGB.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]: bound to the kYuvJConstants table here.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,  // Y samples; 8 consumed per pass.
+                                const uint8* u_buf,  // U samples; presumably loaded by READYUV411.
+                                const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                                uint8* dst_argb,  // Output; 32 bytes written per pass.
+                                int width) {  // Pixel count; stepped down by 8.
+  asm volatile (  // Row converter: READYUV411 + YUVTORGB + STOREARGB per 8 pixels.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: alpha for STOREARGB.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV411
+    YUVTORGB(kYuvConstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]: table base; YUVTORGB indexes from it.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,  // Y samples; 8 consumed per pass.
+                                const uint8* uv_buf,  // Chroma samples; presumably loaded by READNV12.
+                                uint8* dst_argb,  // Output; 32 bytes written per pass.
+                                int width) {  // Pixel count; stepped down by 8.
+  asm volatile (  // Row converter: READNV12 + YUVTORGB + STOREARGB per 8 pixels.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: alpha for STOREARGB.
+    LABELALIGN
+  "1:                                          \n"
+    READNV12
+    YUVTORGB(kYuvConstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]: table base; YUVTORGB indexes from it.
+  // Does not use r14.
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,  // Y samples; 8 consumed per pass.
+                                const uint8* uv_buf,  // Chroma samples; presumably loaded by READNV12.
+                                uint8* dst_argb,  // Output; 32 bytes written per pass.
+                                int width) {  // Pixel count; stepped down by 8.
+  asm volatile (  // Same code as NV12ToARGBRow_SSSE3; only the constant table differs.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: alpha for STOREARGB.
+    LABELALIGN
+  "1:                                          \n"
+    READNV12
+    YUVTORGB(kYuvConstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]: bound to kYvuConstants; presumably accounts for the VU byte order.
+  // Does not use r14.
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,  // Y samples; 8 consumed per pass.
+                                const uint8* u_buf,  // U samples; presumably loaded by READYUV422.
+                                const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                                uint8* dst_bgra,  // Output; 32 bytes written per pass.
+                                int width) {  // Pixel count; stepped down by 8.
+  asm volatile (  // Row converter: READYUV422 + YUVTORGB + STOREBGRA per 8 pixels.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // STOREBGRA sets xmm5 again itself on every pass.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STOREBGRA
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]: table base; YUVTORGB indexes from it.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,  // Y samples; 8 consumed per pass.
+                                const uint8* u_buf,  // U samples; presumably loaded by READYUV422.
+                                const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                                uint8* dst_abgr,  // Output; 32 bytes written per pass.
+                                int width) {  // Pixel count; stepped down by 8.
+  asm volatile (  // Row converter: READYUV422 + YUVTORGB + STOREABGR per 8 pixels.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: alpha for STOREABGR.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STOREABGR
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]: table base; YUVTORGB indexes from it.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,  // Y samples; 8 consumed per pass.
+                                const uint8* u_buf,  // U samples; presumably loaded by READYUV422.
+                                const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                                uint8* dst_rgba,  // Output; 32 bytes written per pass.
+                                int width) {  // Pixel count; stepped down by 8.
+  asm volatile (  // Row converter: READYUV422 + YUVTORGB + STORERGBA per 8 pixels.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // STORERGBA sets xmm5 again itself on every pass.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STORERGBA
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]: table base; YUVTORGB indexes from it.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+#endif  // HAS_I422TOARGBROW_SSSE3
+
+// Read 8 U and 8 V bytes from 422 (V addressed as u_buf + v_buf), upsample to 16 UV pairs in ymm0; advances u_buf by 8 and uses ymm1 as scratch.
+#define READYUV422_AVX2                                                        \
+    "vmovq       " MEMACCESS([u_buf]) ",%%xmm0                      \n"        \
+    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
+    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"
+
+// Convert 16 pixels: 16 UV (expected in ymm0) and 16 Y (read from y_buf, which advances by 16) into B, G, R bytes in ymm0, ymm1, ymm2; ymm3 is scratch.
+#define YUVTORGB_AVX2(YuvConstants)                                            \
+    "vpmaddubsw  " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2   \n"        \
+    "vpmaddubsw  " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1   \n"        \
+    "vpmaddubsw  " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0        \n"        \
+    "vmovdqu     " MEMACCESS2(160, [YuvConstants]) ",%%ymm3         \n"        \
+    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
+    "vmovdqu     " MEMACCESS2(128, [YuvConstants]) ",%%ymm3         \n"        \
+    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
+    "vmovdqu     " MEMACCESS2(96, [YuvConstants]) ",%%ymm3          \n"        \
+    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
+    "vmovdqu     " MEMACCESS([y_buf]) ",%%xmm3                      \n"        \
+    "lea         " MEMLEA(0x10, [y_buf]) ",%[y_buf]                 \n"        \
+    "vpermq      $0xd8,%%ymm3,%%ymm3                                \n"        \
+    "vpunpcklbw  %%ymm3,%%ymm3,%%ymm3                               \n"        \
+    "vpmulhuw    " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3  \n"        \
+    "vpaddsw     %%ymm3,%%ymm0,%%ymm0           \n"                            \
+    "vpaddsw     %%ymm3,%%ymm1,%%ymm1           \n"                            \
+    "vpaddsw     %%ymm3,%%ymm2,%%ymm2           \n"                            \
+    "vpsraw      $0x6,%%ymm0,%%ymm0             \n"                            \
+    "vpsraw      $0x6,%%ymm1,%%ymm1             \n"                            \
+    "vpsraw      $0x6,%%ymm2,%%ymm2             \n"                            \
+    "vpackuswb   %%ymm0,%%ymm0,%%ymm0           \n"                            \
+    "vpackuswb   %%ymm1,%%ymm1,%%ymm1           \n"                            \
+    "vpackuswb   %%ymm2,%%ymm2,%%ymm2           \n"
+
+#if defined(HAS_I422TOBGRAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
+void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,  // Y samples; 16 consumed per pass.
+                               const uint8* u_buf,  // U samples; 8 consumed per pass.
+                               const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                               uint8* dst_bgra,  // Output; 64 bytes written per pass.
+                               int width) {  // Pixel count; stepped down by 16.
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones: alpha bytes.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into BGRA
+    "vpunpcklbw %%ymm0,%%ymm1,%%ymm1           \n"  // GB
+    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"  // Reorder 64-bit quarters to 0,2,1,3.
+    "vpunpcklbw %%ymm2,%%ymm5,%%ymm2           \n"  // AR
+    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"  // Reorder 64-bit quarters to 0,2,1,3.
+    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"  // ARGB first 8 pixels
+    "vpunpckhwd %%ymm1,%%ymm2,%%ymm2           \n"  // ARGB next 8 pixels
+
+    "vmovdqu    %%ymm0," MEMACCESS([dst_bgra]) "\n"
+    "vmovdqu    %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
+    "lea       " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+    "vzeroupper                                \n"  // Zero the upper halves of the ymm registers on exit.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"  // ymm0-ymm3 and ymm5 are used; listed under their xmm names.
+  );
+}
+#endif  // HAS_I422TOBGRAROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,  // Y samples; 16 consumed per pass.
+                               const uint8* u_buf,  // U samples; 8 consumed per pass.
+                               const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                               uint8* dst_argb,  // Output; 64 bytes written per pass.
+                               int width) {  // Pixel count; stepped down by 16.
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones: alpha bytes.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into ARGB
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // Reorder 64-bit quarters to 0,2,1,3.
+    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA
+    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"  // Reorder 64-bit quarters to 0,2,1,3.
+    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels
+    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels
+
+    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"
+    "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+    "vzeroupper                                \n"  // Zero the upper halves of the ymm registers on exit.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"  // ymm0-ymm3 and ymm5 are used; listed under their xmm names.
+  );
+}
+#endif  // HAS_I422TOARGBROW_AVX2
+
+#if defined(HAS_J422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,  // Y samples; 16 consumed per pass.
+                               const uint8* u_buf,  // U samples; 8 consumed per pass.
+                               const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                               uint8* dst_argb,  // Output; 64 bytes written per pass.
+                               int width) {  // Pixel count; stepped down by 16.
+  asm volatile (  // Same code as I422ToARGBRow_AVX2; only the constant table differs.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones: alpha bytes.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into ARGB
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0           \n"  // BG
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // Reorder 64-bit quarters to 0,2,1,3.
+    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2           \n"  // RA
+    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"  // Reorder 64-bit quarters to 0,2,1,3.
+    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1           \n"  // BGRA first 8 pixels
+    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0           \n"  // BGRA next 8 pixels
+
+    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "\n"
+    "vmovdqu    %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+    "vzeroupper                                \n"  // Zero the upper halves of the ymm registers on exit.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB)  // %[kYuvConstants]: bound to the kYuvJConstants table here.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"  // ymm0-ymm3 and ymm5 are used; listed under their xmm names.
+  );
+}
+#endif  // HAS_J422TOARGBROW_AVX2
+
+#if defined(HAS_I422TOABGRROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
+void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,  // Y samples; 16 consumed per pass.
+                               const uint8* u_buf,  // U samples; 8 consumed per pass.
+                               const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                               uint8* dst_argb,  // Output; receives ABGR despite the name, 64 bytes per pass.
+                               int width) {  // Pixel count; stepped down by 16.
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones: alpha bytes.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into ABGR
+    "vpunpcklbw %%ymm1,%%ymm2,%%ymm1           \n"  // RG
+    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"  // Reorder 64-bit quarters to 0,2,1,3.
+    "vpunpcklbw %%ymm5,%%ymm0,%%ymm2           \n"  // BA
+    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"  // Reorder 64-bit quarters to 0,2,1,3.
+    "vpunpcklwd %%ymm2,%%ymm1,%%ymm0           \n"  // RGBA first 8 pixels
+    "vpunpckhwd %%ymm2,%%ymm1,%%ymm1           \n"  // RGBA next 8 pixels
+    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
+    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+    "vzeroupper                                \n"  // Zero the upper halves of the ymm registers on exit.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"  // ymm0-ymm3 and ymm5 are used; listed under their xmm names.
+  );
+}
+#endif  // HAS_I422TOABGRROW_AVX2
+
+#if defined(HAS_I422TORGBAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,  // Y samples; 16 consumed per pass.
+                               const uint8* u_buf,  // U samples; 8 consumed per pass.
+                               const uint8* v_buf,  // V samples; turned into an offset from u_buf.
+                               uint8* dst_argb,  // Output; receives RGBA despite the name, 64 bytes per pass.
+                               int width) {  // Pixel count; stepped down by 16.
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = V address minus U address.
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones: alpha bytes.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into RGBA
+    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"  // GR
+    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"  // Reorder 64-bit quarters to 0,2,1,3.
+    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"  // AB
+    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"  // Reorder 64-bit quarters to 0,2,1,3.
+    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"  // ABGR first 8 pixels
+    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"  // ABGR next 8 pixels
+    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
+    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+    "vzeroupper                                \n"  // Zero the upper halves of the ymm registers on exit.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"  // ymm0-ymm3 and ymm5 are used; listed under their xmm names.
+  );
+}
+#endif  // HAS_I422TORGBAROW_AVX2
+
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {  // 8 Y bytes in, 8 ARGB pixels (32 bytes) out per pass.
+  asm volatile (
+    "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
+    "movd      %%eax,%%xmm2                    \n"
+    "pshufd    $0x0,%%xmm2,%%xmm2              \n"  // Broadcast the multiplier to every 16-bit word of xmm2.
+    "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
+    "movd      %%eax,%%xmm3                    \n"
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"  // Broadcast the bias to every 16-bit word of xmm3.
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000 per pixel: alpha mask.
+    LABELALIGN
+  "1:                                          \n"
+    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // Duplicate each Y byte into both halves of a 16-bit word.
+    "pmulhuw   %%xmm2,%%xmm0                   \n"  // Keep the high 16 bits of the unsigned product.
+    "psubusw   %%xmm3,%%xmm0                   \n"  // Subtract the bias with unsigned saturation.
+    "psrlw     $6, %%xmm0                      \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+
+    // Step 2: Weave into ARGB
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // GG
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm0,%%xmm0                   \n"  // GGGG first 4 pixels
+    "punpckhwd %%xmm1,%%xmm1                   \n"  // GGGG next 4 pixels
+    "por       %%xmm4,%%xmm0                   \n"  // Force the alpha byte to 0xff.
+    "por       %%xmm4,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+  : "+r"(y_buf),     // %0
+    "+r"(dst_argb),  // %1
+    "+rm"(width)     // %2
+  :
+  : "memory", "cc", "eax"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+  );
+}
+#endif  // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+  asm volatile (
+    "mov        $0x4a354a35,%%eax              \n" // 4a35 = 18997 = 1.164
+    "vmovd      %%eax,%%xmm2                   \n"
+    "vbroadcastss %%xmm2,%%ymm2                \n"  // Broadcast the multiplier to every 16-bit word of ymm2.
+    "mov        $0x4880488,%%eax               \n" // 0488 = 1160 = 1.164 * 16
+    "vmovd      %%eax,%%xmm3                   \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"  // Broadcast the bias to every 16-bit word of ymm3.
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpslld     $0x18,%%ymm4,%%ymm4            \n"  // ymm4 = 0xff000000 per pixel: alpha mask.
+
+    LABELALIGN
+  "1:                                          \n"
+    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+    "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"  // Load 16 Y bytes.
+    "lea        " MEMLEA(0x10,0) ",%0          \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // Reorder 64-bit quarters to 0,2,1,3.
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"  // Duplicate each Y byte into both halves of a 16-bit word.
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"  // Keep the high 16 bits of the unsigned product.
+    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"  // Subtract the bias with unsigned saturation.
+    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
+    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"  // GG
+    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"  // Reorder 64-bit quarters to 0,2,1,3.
+    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"  // GGGG first 8 pixels
+    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"  // GGGG next 8 pixels
+    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"  // Force the alpha byte to 0xff.
+    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub        $0x10,%2                       \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while width > 0.
+    "vzeroupper                                \n"  // Zero the upper halves of the ymm registers on exit.
+  : "+r"(y_buf),     // %0
+    "+r"(dst_argb),  // %1
+    "+rm"(width)     // %2
+  :
+  : "memory", "cc", "eax"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+  );
+}
+#endif  // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static uvec8 kShuffleMirror = {
+  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {  // Writes src reversed into dst, 16 bytes per pass.
+  intptr_t temp_width = (intptr_t)(width);  // Pointer-sized copy: used as the index register in the load address.
+  asm volatile (
+    "movdqa    %3,%%xmm5                       \n"  // xmm5 = byte-reversal shuffle mask.
+    LABELALIGN
+  "1:                                          \n"
+    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0: last 16 bytes not yet mirrored.
+    "pshufb    %%xmm5,%%xmm0                   \n"  // Reverse the 16 bytes.
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst moves forward while the source index moves backward.
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"  // Do-while: body always runs once, repeats while the count > 0.
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  : "m"(kShuffleMirror) // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm5"
+  );
+}
+#endif  // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {  // Writes the bytes of src to dst in reverse order, 32 bytes per loop pass.
+  intptr_t temp_width = (intptr_t)(width);  // Pointer-sized copy: %2 is used as an index register in the load address.
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm5                  \n"  // Both 128-bit halves of ymm5 = kShuffleMirror.
+    LABELALIGN
+  "1:                                          \n"
+    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0 : the 32 bytes ending at src + remaining.
+    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"  // Reverse bytes within each 128-bit half.
+    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"  // Swap the two 128-bit halves to complete the 32-byte reversal.
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // Store at the current dst position.
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst += 32.
+    "sub       $0x20,%2                        \n"  // remaining -= 32; this also moves the next load 32 bytes earlier in src.
+    "jg        1b                              \n"  // Repeat while remaining > 0; the loop only works in whole 32-byte groups.
+    "vzeroupper                                \n"  // Clear the upper ymm halves before returning.
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2 (remaining byte count, doubles as load index)
+  : "m"(kShuffleMirror) // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm5"
+  );
+}
+#endif  // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSE2
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {  // Writes the bytes of src to dst in reverse order, 16 bytes per loop pass, without pshufb.
+  intptr_t temp_width = (intptr_t)(width);  // Pointer-sized copy: %2 is used as an index register in the load address.
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0 : the 16 bytes ending at src + remaining.
+    "movdqa    %%xmm0,%%xmm1                   \n"  // Copy so both byte positions of each word can be shifted.
+    "psllw     $0x8,%%xmm0                     \n"  // Low byte of each word -> high byte.
+    "psrlw     $0x8,%%xmm1                     \n"  // High byte of each word -> low byte.
+    "por       %%xmm1,%%xmm0                   \n"  // Bytes are now swapped within every 16-bit word.
+    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"  // Reverse the four low words.
+    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"  // Reverse the four high words.
+    "pshufd    $0x4e,%%xmm0,%%xmm0             \n"  // Swap the two 64-bit halves: full 16-byte reversal.
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // Store at the current dst position.
+    "lea       " MEMLEA(0x10,1)",%1            \n"  // dst += 16.
+    "sub       $0x10,%2                        \n"  // remaining -= 16; this also moves the next load 16 bytes earlier in src.
+    "jg        1b                              \n"  // Repeat while remaining > 0; the loop only works in whole 16-byte groups.
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2 (remaining byte count, doubles as load index)
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels: low 8 output bytes are the even source bytes in reverse order, high 8 are the odd source bytes in reverse order.
+static uvec8 kShuffleMirrorUV = {
+  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};  // Used as the pshufb control in MirrorUVRow_SSSE3.
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+                       int width) {  // Reads byte pairs from the end of src backwards; even bytes go to dst_u, odd bytes to dst_v, 8 pairs per loop pass.
+  intptr_t temp_width = (intptr_t)(width);  // Pointer-sized copy: %3 is used as an index register in the initial lea.
+  asm volatile (
+    "movdqa    %4,%%xmm1                       \n"  // xmm1 = kShuffleMirrorUV.
+    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"  // src = src + width * 2 - 16 (presumably expands to -0x10(%0,%3,2)): last 16 source bytes.
+    "sub       %1,%2                           \n"  // %2 becomes the offset dst_v - dst_u so one pointer increment serves both outputs.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // Load 16 bytes (8 pairs).
+    "lea       " MEMLEA(-0x10,0) ",%0          \n"  // src -= 16 (walk backwards).
+    "pshufb    %%xmm1,%%xmm0                   \n"  // Low 8 bytes = reversed even bytes, high 8 bytes = reversed odd bytes.
+    "movlpd    %%xmm0," MEMACCESS(1) "         \n"  // Store low 8 bytes to dst_u.
+    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2) : store high 8 bytes to dst_v.
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (dst_v follows via the offset).
+    "sub       $8,%3                           \n"  // 8 fewer pairs remain.
+    "jg        1b                              \n"  // Repeat while remaining > 0; the loop only works in whole groups of 8 pairs.
+  : "+r"(src),      // %0
+    "+r"(dst_u),    // %1
+    "+r"(dst_v),    // %2
+    "+r"(temp_width)  // %3
+  : "m"(kShuffleMirrorUV)  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {  // Reverses the order of 4-byte pixels from src into dst, 4 pixels per loop pass.
+  intptr_t temp_width = (intptr_t)(width);  // Pointer-sized copy: %2 is used as an index register in the initial lea.
+  asm volatile (
+    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"  // src = src + width * 4 - 16 (presumably expands to -0x10(%0,%2,4)): last 4 pixels.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // Load 4 pixels.
+    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"  // Reverse the four 32-bit elements; bytes inside each pixel keep their order.
+    "lea       " MEMLEA(-0x10,0) ",%0          \n"  // src -= 16 (walk backwards).
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // Store 4 pixels.
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst += 16.
+    "sub       $0x4,%2                         \n"  // 4 fewer pixels remain.
+    "jg        1b                              \n"  // Repeat while remaining > 0; the loop only works in whole groups of 4 pixels.
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0"
+  );
+}
+#endif  // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Permute table for vpermd in ARGBMirrorRow_AVX2: reverses the order of eight 32-bit elements (whole pixels), not individual bytes.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};  // ulvec32 is declared elsewhere; presumably a 32-byte vector of eight uint32 — confirm.
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {  // Reverses the order of 4-byte pixels from src into dst, 8 pixels per loop pass.
+  intptr_t temp_width = (intptr_t)(width);  // Pointer-sized copy: %2 is used as a scaled index register in the load address.
+  asm volatile (
+    "vmovdqu    %3,%%ymm5                      \n"  // ymm5 = element indices 7..0.
+    LABELALIGN
+  "1:                                          \n"
+    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 : load the 8 pixels ending at src + remaining * 4, reversed.
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // Store 8 pixels.
+    "lea        " MEMLEA(0x20,1) ",%1          \n"  // dst += 32.
+    "sub        $0x8,%2                        \n"  // 8 fewer pixels remain; this also moves the next load 32 bytes earlier in src.
+    "jg         1b                             \n"  // Repeat while remaining > 0; the loop only works in whole groups of 8 pixels.
+    "vzeroupper                                \n"  // Clear the upper ymm halves before returning.
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  : "m"(kARGBShuffleMirror_AVX2) // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm5"
+  );
+}
+#endif  // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_AVX2
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {  // De-interleaves src_uv: even bytes -> dst_u, odd bytes -> dst_v; 32 pairs per loop pass.
+  asm volatile (
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"  // ymm5 = all ones.
+    "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"  // ymm5 = 0x00ff in every 16-bit word (low-byte mask).
+    "sub        %1,%2                            \n"  // %2 becomes the offset dst_v - dst_u.
+    LABELALIGN
+  "1:                                            \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"  // Load 64 source bytes (32 pairs).
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
+    "lea        " MEMLEA(0x40,0) ",%0            \n"  // src_uv += 64.
+    "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"  // ymm2/ymm3 = odd bytes, one per word.
+    "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
+    "vpand      %%ymm5,%%ymm0,%%ymm0             \n"  // ymm0/ymm1 = even bytes, one per word.
+    "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"  // Pack words to bytes (operates per 128-bit lane).
+    "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0              \n"  // Reorder 64-bit quarters 0,2,1,3 to undo the per-lane packing order.
+    "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"  // Store 32 bytes to dst_u.
+    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2) : store 32 bytes to dst_v.
+    "lea        " MEMLEA(0x20,1) ",%1            \n"  // dst_u += 32 (dst_v follows via the offset).
+    "sub        $0x20,%3                         \n"  // pix -= 32.
+    "jg         1b                               \n"  // Repeat while pix > 0; the loop only works in whole groups of 32.
+    "vzeroupper                                  \n"  // Clear the upper ymm halves before returning.
+  : "+r"(src_uv),     // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+r"(pix)         // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {  // De-interleaves src_uv: even bytes -> dst_u, odd bytes -> dst_v; 16 pairs per loop pass.
+  asm volatile (
+    "pcmpeqb    %%xmm5,%%xmm5                    \n"  // xmm5 = all ones.
+    "psrlw      $0x8,%%xmm5                      \n"  // xmm5 = 0x00ff in every 16-bit word (low-byte mask).
+    "sub        %1,%2                            \n"  // %2 becomes the offset dst_v - dst_u.
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu     " MEMACCESS(0) ",%%xmm0          \n"  // Load 32 source bytes (16 pairs).
+    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
+    "lea        " MEMLEA(0x20,0) ",%0            \n"  // src_uv += 32.
+    "movdqa     %%xmm0,%%xmm2                    \n"  // Keep copies for extracting the odd bytes.
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "pand       %%xmm5,%%xmm0                    \n"  // xmm0/xmm1 = even bytes, one per word.
+    "pand       %%xmm5,%%xmm1                    \n"
+    "packuswb   %%xmm1,%%xmm0                    \n"  // xmm0 = 16 even bytes.
+    "psrlw      $0x8,%%xmm2                      \n"  // xmm2/xmm3 = odd bytes, one per word.
+    "psrlw      $0x8,%%xmm3                      \n"
+    "packuswb   %%xmm3,%%xmm2                    \n"  // xmm2 = 16 odd bytes.
+    "movdqu     %%xmm0," MEMACCESS(1) "          \n"  // Store 16 bytes to dst_u.
+    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2) : store 16 bytes to dst_v.
+    "lea        " MEMLEA(0x10,1) ",%1            \n"  // dst_u += 16 (dst_v follows via the offset).
+    "sub        $0x10,%3                         \n"  // pix -= 16.
+    "jg         1b                               \n"  // Repeat while pix > 0; the loop only works in whole groups of 16.
+  : "+r"(src_uv),     // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+r"(pix)         // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {  // Interleaves src_u and src_v into dst_uv (U byte first in each pair), 32 pairs per loop pass.
+  asm volatile (
+    "sub       %0,%1                             \n"  // %1 becomes the offset src_v - src_u.
+    LABELALIGN
+  "1:                                            \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"  // Load 32 bytes from src_u.
+    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1 : load 32 bytes from src_v.
+    "lea       " MEMLEA(0x20,0) ",%0             \n"  // src_u += 32 (src_v follows via the offset).
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"  // ymm2 = interleaved low 8 bytes of each 128-bit lane.
+    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"  // ymm0 = interleaved high 8 bytes of each 128-bit lane.
+    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"  // Four 16-byte stores put the lane halves in sequential order:
+    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"  //   low lane of ymm2, low lane of ymm0,
+    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"  //   high lane of ymm2, high lane of ymm0.
+    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
+    "lea       " MEMLEA(0x40,2) ",%2             \n"  // dst_uv += 64.
+    "sub       $0x20,%3                          \n"  // width -= 32.
+    "jg        1b                                \n"  // Repeat while width > 0; the loop only works in whole groups of 32.
+    "vzeroupper                                  \n"  // Clear the upper ymm halves before returning.
+  : "+r"(src_u),     // %0
+    "+r"(src_v),     // %1
+    "+r"(dst_uv),    // %2
+    "+r"(width)      // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {  // Interleaves src_u and src_v into dst_uv (U byte first in each pair), 16 pairs per loop pass.
+  asm volatile (
+    "sub       %0,%1                             \n"  // %1 becomes the offset src_v - src_u.
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"  // Load 16 bytes from src_u.
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1 : load 16 bytes from src_v.
+    "lea       " MEMLEA(0x10,0) ",%0             \n"  // src_u += 16 (src_v follows via the offset).
+    "movdqa    %%xmm0,%%xmm2                     \n"  // Copy U so both halves can be interleaved.
+    "punpcklbw %%xmm1,%%xmm0                     \n"  // xmm0 = first 8 interleaved pairs.
+    "punpckhbw %%xmm1,%%xmm2                     \n"  // xmm2 = last 8 interleaved pairs.
+    "movdqu    %%xmm0," MEMACCESS(2) "           \n"  // Store 32 output bytes.
+    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
+    "lea       " MEMLEA(0x20,2) ",%2             \n"  // dst_uv += 32.
+    "sub       $0x10,%3                          \n"  // width -= 16.
+    "jg        1b                                \n"  // Repeat while width > 0; the loop only works in whole groups of 16.
+  : "+r"(src_u),     // %0
+    "+r"(src_v),     // %1
+    "+r"(dst_uv),    // %2
+    "+r"(width)      // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {  // Copies bytes from src to dst with unaligned 128-bit moves, 32 bytes per loop pass.
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // Load 32 bytes.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src += 32.
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // Store 32 bytes.
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst += 32.
+    "sub       $0x20,%2                        \n"  // count -= 32.
+    "jg        1b                              \n"  // Repeat while count > 0; the loop only works in whole 32-byte groups.
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+// Copies bytes from src to dst with unaligned 256-bit moves, 64 bytes per
+// loop pass. The loop only works in whole 64-byte groups (no tail handling).
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // Load 64 bytes.
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src += 64.
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"  // Store 64 bytes.
+    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"  // dst += 64.
+    "sub       $0x40,%2                        \n"  // count -= 64.
+    "jg        1b                              \n"  // Repeat while count > 0.
+    // Clear the upper ymm halves before returning, as the other ymm-using
+    // routines in this file do; leaving them dirty can stall later SSE code.
+    "vzeroupper                                \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_COPYROW_AVX
+
+#ifdef HAS_COPYROW_ERMS
+// Multiple of 1: copies exactly width bytes from src to dst with a single rep movsb.
+void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
+  size_t width_tmp = (size_t)(width);  // Register-sized count for the "c" (count register) operand.
+  asm volatile (
+    "rep movsb " MEMMOVESTRING(0,1) "          \n"  // Byte-wise string copy; count, source and destination come from the constrained registers below.
+  : "+S"(src),  // %0 (source index register)
+    "+D"(dst),  // %1 (destination index register)
+    "+c"(width_tmp) // %2 (count register)
+  :
+  : "memory", "cc"
+  );
+}
+#endif  // HAS_COPYROW_ERMS
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels. Copies byte 3 (alpha) of each 4-byte src pixel into dst, keeping dst's other three bytes; 8 pixels per loop pass.
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm0,%%xmm0                   \n"  // xmm0 = all ones.
+    "pslld     $0x18,%%xmm0                    \n"  // xmm0 = 0xff000000 per pixel (top-byte mask).
+    "pcmpeqb   %%xmm1,%%xmm1                   \n"  // xmm1 = all ones.
+    "psrld     $0x8,%%xmm1                     \n"  // xmm1 = 0x00ffffff per pixel (low-three-bytes mask).
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // Load 8 src pixels.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src += 32.
+    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"  // Load the 8 existing dst pixels.
+    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
+    "pand      %%xmm0,%%xmm2                   \n"  // Keep only the top byte of src.
+    "pand      %%xmm0,%%xmm3                   \n"
+    "pand      %%xmm1,%%xmm4                   \n"  // Keep only the low three bytes of dst.
+    "pand      %%xmm1,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm2                   \n"  // Combine.
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"  // Write 8 merged pixels back to dst.
+    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst += 32.
+    "sub       $0x8,%2                         \n"  // width -= 8.
+    "jg        1b                              \n"  // Repeat while width > 0; the loop only works in whole groups of 8 pixels.
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels. Copies byte 3 (alpha) of each 4-byte src pixel into dst, keeping dst's other three bytes; 16 pixels per loop pass.
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"  // ymm0 = all ones.
+    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = 0x00ffffff per pixel: blend selector for the low three bytes.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"  // Load 16 src pixels.
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src += 64.
+    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"  // Where the selector byte is 0xff take the dst byte from memory, else keep the src byte.
+    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
+    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"  // Write 16 merged pixels back to dst.
+    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"  // dst += 64.
+    "sub       $0x10,%2                        \n"  // width -= 16.
+    "jg        1b                              \n"  // Repeat while width > 0; the loop only works in whole groups of 16 pixels.
+    "vzeroupper                                \n"  // Clear the upper ymm halves before returning.
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels. Writes each src byte into byte 3 (alpha) of the matching 4-byte dst pixel, keeping dst's other three bytes; 8 pixels per loop pass.
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm0,%%xmm0                   \n"  // xmm0 = all ones.
+    "pslld     $0x18,%%xmm0                    \n"  // xmm0 = 0xff000000 per pixel (top-byte mask).
+    "pcmpeqb   %%xmm1,%%xmm1                   \n"  // xmm1 = all ones.
+    "psrld     $0x8,%%xmm1                     \n"  // xmm1 = 0x00ffffff per pixel (low-three-bytes mask).
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm2         \n"  // Load 8 src bytes.
+    "lea       " MEMLEA(0x8,0) ",%0            \n"  // src += 8.
+    "punpcklbw %%xmm2,%%xmm2                   \n"  // Duplicate each byte into a 16-bit word.
+    "punpckhwd %%xmm2,%%xmm3                   \n"  // High word of each xmm3 dword = words 4..7; the low words hold stale xmm3 data that the mask below discards.
+    "punpcklwd %%xmm2,%%xmm2                   \n"  // xmm2 dwords = words 0..3 duplicated.
+    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"  // Load the 8 existing dst pixels.
+    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
+    "pand      %%xmm0,%%xmm2                   \n"  // Keep only the top byte (the src value).
+    "pand      %%xmm0,%%xmm3                   \n"
+    "pand      %%xmm1,%%xmm4                   \n"  // Keep only the low three bytes of dst.
+    "pand      %%xmm1,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm2                   \n"  // Combine.
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"  // Write 8 merged pixels back to dst.
+    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst += 32.
+    "sub       $0x8,%2                         \n"  // width -= 8.
+    "jg        1b                              \n"  // Repeat while width > 0; the loop only works in whole groups of 8 pixels.
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels. Writes each src byte into byte 3 (alpha) of the matching 4-byte dst pixel, keeping dst's other three bytes; 16 pixels per loop pass.
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"  // ymm0 = all ones.
+    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = 0x00ffffff per pixel: blend selector for the low three bytes.
+    LABELALIGN
+  "1:                                          \n"
+    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"  // Zero-extend 8 src bytes to 8 dwords.
+    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"  // Next 8 src bytes.
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src += 16.
+    "vpslld    $0x18,%%ymm1,%%ymm1             \n"  // Move each value into the top byte of its dword.
+    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
+    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"  // Low three bytes come from dst memory, top byte stays as the src value.
+    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
+    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"  // Write 16 merged pixels back to dst.
+    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"  // dst += 64.
+    "sub       $0x10,%2                        \n"  // width -= 16.
+    "jg        1b                              \n"  // Repeat while width > 0; the loop only works in whole groups of 16 pixels.
+    "vzeroupper                                \n"  // Clear the upper ymm halves before returning.
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+void SetRow_X86(uint8* dst, uint8 v8, int width) {  // Fills dst with the byte v8 using 32-bit stores; writes width / 4 dwords, so any width % 4 trailing bytes are not written.
+  size_t width_tmp = (size_t)(width >> 2);  // Number of 4-byte stores.
+  const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes. Unsigned constant: the signed multiply overflowed int for v8 >= 128.
+  asm volatile (
+    "rep stosl " MEMSTORESTRING(eax,0) "       \n"  // Store eax to dst width_tmp times.
+    : "+D"(dst),       // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v32)         // %2
+    : "memory", "cc");
+}
+
+void SetRow_ERMS(uint8* dst, uint8 v8, int width) {  // Fills width bytes at dst with v8 using a single rep stosb.
+  size_t width_tmp = (size_t)(width);  // Register-sized count for the "c" (count register) operand.
+  asm volatile (
+    "rep stosb " MEMSTORESTRING(al,0) "        \n"  // Store the low byte of the accumulator to dst width_tmp times.
+    : "+D"(dst),       // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v8)          // %2
+    : "memory", "cc");
+}
+
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {  // Fills dst_argb with width copies of the 32-bit value v32 using rep stosl.
+  size_t width_tmp = (size_t)(width);  // Count of 4-byte stores; here width is used unscaled, one store per unit.
+  asm volatile (
+    "rep stosl " MEMSTORESTRING(eax,0) "       \n"  // Store eax to dst_argb width_tmp times.
+    : "+D"(dst_argb),  // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v32)         // %2
+    : "memory", "cc");
+}
+#endif  // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {  // Extracts the even-indexed bytes of src_yuy2 into dst_y; 16 output bytes (32 source bytes) per loop pass.
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones.
+    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff in every 16-bit word (low-byte mask).
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // Load 32 source bytes.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_yuy2 += 32.
+    "pand      %%xmm5,%%xmm0                   \n"  // Keep the even byte of each word.
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // Pack to 16 bytes.
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // Store 16 bytes to dst_y.
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16.
+    "sub       $0x10,%2                        \n"  // pix -= 16.
+    "jg        1b                              \n"  // Repeat while pix > 0; the loop only works in whole groups of 16.
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {  // Averages two source rows (src and src + stride), then splits the odd-indexed bytes alternately into dst_u and dst_v; 8 bytes to each per loop pass.
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones.
+    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff in every 16-bit word (low-byte mask).
+    "sub       %1,%2                           \n"  // %2 becomes the offset dst_v - dst_u.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // Load 32 bytes of the first row.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2 : same 32 bytes of the row at +stride.
+    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_yuy2 += 32.
+    "pavgb     %%xmm2,%%xmm0                   \n"  // Rounded byte-wise average of the two rows.
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"  // Keep the odd byte of each word (the chroma bytes).
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // xmm0 = 16 chroma bytes, U and V alternating.
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"  // xmm0 = U (even chroma bytes).
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"  // xmm1 = V (odd chroma bytes).
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // Store 8 bytes to dst_u.
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2) : store 8 bytes to dst_v.
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (dst_v follows via the offset).
+    "sub       $0x10,%3                        \n"  // pix -= 16.
+    "jg        1b                              \n"  // Repeat while pix > 0; the loop only works in whole groups of 16.
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  : "r"((intptr_t)(stride_yuy2))  // %4 (widened to pointer size for use as an index register)
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix) {  // Splits the odd-indexed bytes of one source row alternately into dst_u and dst_v (no row averaging); 8 bytes to each per loop pass.
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones.
+    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff in every 16-bit word (low-byte mask).
+    "sub       %1,%2                           \n"  // %2 becomes the offset dst_v - dst_u.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // Load 32 source bytes.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_yuy2 += 32.
+    "psrlw     $0x8,%%xmm0                     \n"  // Keep the odd byte of each word (the chroma bytes).
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // xmm0 = 16 chroma bytes, U and V alternating.
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"  // xmm0 = U (even chroma bytes).
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"  // xmm1 = V (odd chroma bytes).
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // Store 8 bytes to dst_u.
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2) : store 8 bytes to dst_v.
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (dst_v follows via the offset).
+    "sub       $0x10,%3                        \n"  // pix -= 16.
+    "jg        1b                              \n"  // Repeat while pix > 0; the loop only works in whole groups of 16.
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {  // Extracts the odd-indexed bytes of src_uyvy into dst_y; 16 output bytes (32 source bytes) per loop pass.
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // Load 32 source bytes.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_uyvy += 32.
+    "psrlw     $0x8,%%xmm0                     \n"  // Keep the odd byte of each word.
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // Pack to 16 bytes.
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // Store 16 bytes to dst_y.
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16.
+    "sub       $0x10,%2                        \n"  // pix -= 16.
+    "jg        1b                              \n"  // Repeat while pix > 0; the loop only works in whole groups of 16.
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {  // Averages two source rows (src and src + stride), then splits the even-indexed bytes alternately into dst_u and dst_v; 8 bytes to each per loop pass.
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones.
+    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff in every 16-bit word (low-byte mask).
+    "sub       %1,%2                           \n"  // %2 becomes the offset dst_v - dst_u.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // Load 32 bytes of the first row.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2 : same 32 bytes of the row at +stride.
+    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_uyvy += 32.
+    "pavgb     %%xmm2,%%xmm0                   \n"  // Rounded byte-wise average of the two rows.
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"  // Keep the even byte of each word (the chroma bytes).
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // xmm0 = 16 chroma bytes, U and V alternating.
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"  // xmm0 = U (even chroma bytes).
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"  // xmm1 = V (odd chroma bytes).
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // Store 8 bytes to dst_u.
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2) : store 8 bytes to dst_v.
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (dst_v follows via the offset).
+    "sub       $0x10,%3                        \n"  // pix -= 16.
+    "jg        1b                              \n"  // Repeat while pix > 0; the loop only works in whole groups of 16.
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  : "r"((intptr_t)(stride_uyvy))  // %4 (widened to pointer size for use as an index register)
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix) {  // Splits the even-indexed bytes of one source row alternately into dst_u and dst_v (no row averaging); 8 bytes to each per loop pass.
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones.
+    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff in every 16-bit word (low-byte mask).
+    "sub       %1,%2                           \n"  // %2 becomes the offset dst_v - dst_u.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // Load 32 source bytes.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_uyvy += 32.
+    "pand      %%xmm5,%%xmm0                   \n"  // Keep the even byte of each word (the chroma bytes).
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // xmm0 = 16 chroma bytes, U and V alternating.
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"  // xmm0 = U (even chroma bytes).
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"  // xmm1 = V (odd chroma bytes).
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // Store 8 bytes to dst_u.
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2) : store 8 bytes to dst_v.
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (dst_v follows via the offset).
+    "sub       $0x10,%3                        \n"  // pix -= 16.
+    "jg        1b                              \n"  // Repeat while pix > 0; the loop only works in whole groups of 16.
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_YUY2TOYROW_AVX2
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {  // Extracts the even-indexed bytes of src_yuy2 into dst_y; 32 output bytes (64 source bytes) per loop pass.
+  asm volatile (
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all ones.
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff in every 16-bit word (low-byte mask).
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // Load 64 source bytes.
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_yuy2 += 64.
+    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"  // Keep the even byte of each word.
+    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // Pack words to bytes (operates per 128-bit lane).
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // Reorder 64-bit quarters 0,2,1,3 to undo the per-lane packing order.
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"  // Store 32 bytes to dst_y.
+    "lea      " MEMLEA(0x20,1) ",%1            \n"  // dst_y += 32.
+    "sub       $0x20,%2                        \n"  // pix -= 32.
+    "jg        1b                              \n"  // Repeat while pix > 0; the loop only works in whole groups of 32.
+    "vzeroupper                                \n"  // Clear the upper ymm halves before returning.
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {  // Averages two source rows (src and src + stride), then splits the odd-indexed bytes alternately into dst_u and dst_v; 16 bytes to each per loop pass.
+  asm volatile (
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all ones.
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff in every 16-bit word (low-byte mask).
+    "sub       %1,%2                           \n"  // %2 becomes the offset dst_v - dst_u.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // Load 64 bytes of the first row.
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0 : rounded average with the row at +stride.
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)     // Same for the second 32 bytes (offset 0x20).
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_yuy2 += 64.
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // Keep the odd byte of each word (the chroma bytes).
+    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // Pack to 32 chroma bytes (per-lane order).
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // Restore sequential order: U and V alternating.
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // ymm1 = U (even chroma bytes), one per word.
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = V (odd chroma bytes), one per word.
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"  // Pack each to bytes; results are duplicated within each lane.
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"  // Gather the 16 valid bytes into the low 128 bits.
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"  // Store 16 bytes to dst_u.
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) : store 16 bytes to dst_v.
+    "lea      " MEMLEA(0x10,1) ",%1            \n"  // dst_u += 16 (dst_v follows via the offset).
+    "sub       $0x20,%3                        \n"  // pix -= 32.
+    "jg        1b                              \n"  // Repeat while pix > 0; the loop only works in whole groups of 32.
+    "vzeroupper                                \n"  // Clear the upper ymm halves before returning.
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  : "r"((intptr_t)(stride_yuy2))  // %4 (widened to pointer size for use as an index register)
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix) {  // Splits the odd-indexed bytes of one source row alternately into dst_u and dst_v (no row averaging); 16 bytes to each per loop pass.
+  asm volatile (
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all ones.
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff in every 16-bit word (low-byte mask).
+    "sub       %1,%2                           \n"  // %2 becomes the offset dst_v - dst_u.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // Load 64 source bytes.
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_yuy2 += 64.
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // Keep the odd byte of each word (the chroma bytes).
+    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // Pack to 32 chroma bytes (per-lane order).
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // Restore sequential order: U and V alternating.
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // ymm1 = U (even chroma bytes), one per word.
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = V (odd chroma bytes), one per word.
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"  // Pack each to bytes; results are duplicated within each lane.
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"  // Gather the 16 valid bytes into the low 128 bits.
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"  // Store 16 bytes to dst_u.
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) : store 16 bytes to dst_v.
+    "lea      " MEMLEA(0x10,1) ",%1            \n"  // dst_u += 16 (dst_v follows via the offset).
+    "sub       $0x20,%3                        \n"  // pix -= 32.
+    "jg        1b                              \n"  // Repeat while pix > 0; the loop only works in whole groups of 32.
+    "vzeroupper                                \n"  // Clear the upper ymm halves before returning.
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {  // Extracts the odd-indexed bytes of src_uyvy into dst_y; 32 output bytes (64 source bytes) per loop pass.
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // Load 64 source bytes.
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_uyvy += 64.
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // Keep the odd byte of each word.
+    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // Pack words to bytes (operates per 128-bit lane).
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // Reorder 64-bit quarters 0,2,1,3 to undo the per-lane packing order.
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"  // Store 32 bytes to dst_y.
+    "lea      " MEMLEA(0x20,1) ",%1            \n"  // dst_y += 32.
+    "sub       $0x20,%2                        \n"  // pix -= 32.
+    "jg        1b                              \n"  // Repeat while pix > 0; the loop only works in whole groups of 32.
+    "vzeroupper                                \n"  // Clear the upper ymm halves before returning.
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"  // xmm5 is listed although the asm above never references ymm5.
+  );
+}
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {  // Averages two source rows (src and src + stride), then splits the even-indexed bytes alternately into dst_u and dst_v; 16 bytes to each per loop pass.
+  asm volatile (
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all ones.
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff in every 16-bit word (low-byte mask).
+    "sub       %1,%2                           \n"  // %2 becomes the offset dst_v - dst_u.
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // Load 64 bytes of the first row.
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0 : rounded average with the row at +stride.
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)     // Same for the second 32 bytes (offset 0x20).
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_uyvy += 64.
+    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"  // Keep the even byte of each word (the chroma bytes).
+    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // Pack to 32 chroma bytes (per-lane order).
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // Restore sequential order: U and V alternating.
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // ymm1 = U (even chroma bytes), one per word.
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = V (odd chroma bytes), one per word.
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"  // Pack each to bytes; results are duplicated within each lane.
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"  // Gather the 16 valid bytes into the low 128 bits.
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"  // Store 16 bytes to dst_u.
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) : store 16 bytes to dst_v.
+    "lea      " MEMLEA(0x10,1) ",%1            \n"  // dst_u += 16 (dst_v follows via the offset).
+    "sub       $0x20,%3                        \n"  // pix -= 32.
+    "jg        1b                              \n"  // Repeat while pix > 0; the loop only works in whole groups of 32.
+    "vzeroupper                                \n"  // Clear the upper ymm halves before returning.
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  : "r"((intptr_t)(stride_uyvy))  // %4 (widened to pointer size for use as an index register)
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,  // Single-row variant: splits the low byte of each 16-bit source pair alternately into dst_u and dst_v, with no row averaging.
+                         uint8* dst_u, uint8* dst_v, int pix) {  // Each pass reads 64 source bytes, writes 16 bytes to each output and subtracts 32 from pix; the body runs before pix is tested.
+  asm volatile (
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones
+    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"  // ymm5 = 0x00ff in every word (low-byte mask)
+    "sub       %1,%2                           \n"  // %2 becomes dst_v - dst_u so a single pointer (%1) needs advancing
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // first 32 source bytes
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"  // next 32 source bytes
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance the source pointer by 64 bytes
+    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"  // keep the low byte of each word
+    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // narrow to bytes (per 128-bit lane)
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // quadword order 0,2,1,3 restores linear order
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // ymm1 = even bytes of the packed data (stored to dst_u below)
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = odd bytes of the packed data (stored to dst_v below)
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"  // gather the 16 useful bytes into the low 128 bits
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"  // store the low 128 bits of ymm1 through %1 (dst_u)
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+    "lea      " MEMLEA(0x10,1) ",%1            \n"  // advance dst_u (and, via the offset in %2, dst_v) by 16 bytes
+    "sub       $0x20,%3                        \n"  // 32 fewer pixels to go
+    "jg        1b                              \n"  // loop while the remaining count is > 0
+    "vzeroupper                                \n"  // clear upper YMM halves before returning to non-VEX code
+  : "+r"(src_uyvy),    // %0  source pointer, advanced by the loop
+    "+r"(dst_u),       // %1  U destination pointer, advanced by the loop
+    "+r"(dst_v),       // %2  V destination; rewritten to the offset dst_v - dst_u
+    "+r"(pix)          // %3  remaining pixel count
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 4 pixels at a time, then any remaining pixels one at a time.
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,  // Per pixel: dst = saturate(src_argb0 with alpha byte forced to 0xff + ((src_argb1 * (256 - alpha0)) >> 8)).
+                       uint8* dst_argb, int width) {  // width is a pixel count; widths below 4 go straight to the 1-pixel loop and width <= 0 does nothing.
+  asm volatile (
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = all ones
+    "psrlw     $0xf,%%xmm7                     \n"  // xmm7 = 0x0001 in every word
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x8,%%xmm6                     \n"  // xmm6 = 0x00ff in every word (low-byte mask)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psllw     $0x8,%%xmm5                     \n"  // xmm5 = 0xff00 in every word (high-byte mask)
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000 in every dword (top byte of each pixel)
+    "sub       $0x4,%3                         \n"  // bias width by -4 so the sign tells whether 4 pixels remain
+    "jl        49f                             \n"  // fewer than 4 pixels: skip the vector loop
+
+    // 4 pixel loop.
+    LABELALIGN
+  "41:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"  // 4 pixels of src_argb0
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"  // xmm0 keeps the untouched src_argb0 pixels
+    "pxor      %%xmm4,%%xmm3                   \n"  // invert the top (alpha) byte: 255 - a
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"  // 4 pixels of src_argb1
+    "psrlw     $0x8,%%xmm3                     \n"  // high byte of each word; odd words now hold 255 - a
+    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"  // word select 1,1,3,3: copy 255 - a into both words of each pixel
+    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pand      %%xmm6,%%xmm2                   \n"  // low byte of each src_argb1 word (bytes 0 and 2 of each pixel)
+    "paddw     %%xmm7,%%xmm3                   \n"  // 256 - a
+    "pmullw    %%xmm3,%%xmm2                   \n"  // scale the low bytes by 256 - a
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"  // reload the same src_argb1 pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "psrlw     $0x8,%%xmm1                     \n"  // high byte of each src_argb1 word (bytes 1 and 3 of each pixel)
+    "por       %%xmm4,%%xmm0                   \n"  // force the src_argb0 alpha byte to 0xff
+    "pmullw    %%xmm3,%%xmm1                   \n"  // scale the high bytes by 256 - a
+    "psrlw     $0x8,%%xmm2                     \n"  // divide the low-byte products by 256
+    "paddusb   %%xmm2,%%xmm0                   \n"  // saturating add into bytes 0 and 2
+    "pand      %%xmm5,%%xmm1                   \n"  // keep the high byte of each product (already in position)
+    "paddusb   %%xmm1,%%xmm0                   \n"  // saturating add into bytes 1 and 3
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"  // store 4 blended pixels
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       41b                             \n"  // loop while at least 4 pixels remain
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"  // %3 = remaining pixels - 1
+    "jl        99f                             \n"  // nothing left
+
+    // 1 pixel loop.
+  "91:                                         \n"
+    "movd      " MEMACCESS(0) ",%%xmm3         \n"  // same arithmetic as the 4-pixel loop, on a single 4-byte pixel
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm2         \n"
+    "psrlw     $0x8,%%xmm3                     \n"
+    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x4,1) ",%1            \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "movd      %%xmm0," MEMACCESS(2) "         \n"  // store 1 blended pixel
+    "lea       " MEMLEA(0x4,2) ",%2            \n"
+    "sub       $0x1,%3                         \n"
+    "jge       91b                             \n"  // loop until the biased count goes negative
+  "99:                                         \n"
+  : "+r"(src_argb0),    // %0  first source pointer (its alpha drives the blend), advanced by the loops
+    "+r"(src_argb1),    // %1  second source pointer, advanced by the loops
+    "+r"(dst_argb),     // %2  destination pointer, advanced by the loops
+    "+r"(width)         // %3  pixel count, used as a biased loop counter
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBBLENDROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha: copies byte 3/7/11/15 into the low byte of both words of its pixel; 0x80 entries produce zero.
+static uvec8 kShuffleAlpha = {
+  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+
+// Blend 4 pixels at a time, then any remaining pixels one at a time.
+// kShuffleAlpha is the only shuffle table this routine uses.
+
+// Same as SSE2, but replaces
+//    psrlw      xmm3, 8          // alpha
+//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
+//    pshuflw    xmm3, xmm3,0F5h
+// with..
+//    pshufb     xmm3, kShuffleAlpha // alpha
+
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,  // Per pixel: dst = saturate(src_argb0 with alpha byte forced to 0xff + ((src_argb1 * (256 - alpha0)) >> 8)).
+                        uint8* dst_argb, int width) {  // width is a pixel count; widths below 4 go straight to the 1-pixel loop and width <= 0 does nothing.
+  asm volatile (
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = all ones
+    "psrlw     $0xf,%%xmm7                     \n"  // xmm7 = 0x0001 in every word
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x8,%%xmm6                     \n"  // xmm6 = 0x00ff in every word (low-byte mask)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psllw     $0x8,%%xmm5                     \n"  // xmm5 = 0xff00 in every word (high-byte mask)
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000 in every dword (top byte of each pixel)
+    "sub       $0x4,%3                         \n"  // bias width by -4 so the sign tells whether 4 pixels remain
+    "jl        49f                             \n"  // fewer than 4 pixels: skip the vector loop
+
+    // 4 pixel loop.
+    LABELALIGN
+  "40:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"  // 4 pixels of src_argb0
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"  // xmm0 keeps the untouched src_argb0 pixels
+    "pxor      %%xmm4,%%xmm3                   \n"  // invert the top (alpha) byte: 255 - a
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"  // 4 pixels of src_argb1
+    "pshufb    %4,%%xmm3                       \n"  // spread 255 - a into both words of each pixel via kShuffleAlpha
+    "pand      %%xmm6,%%xmm2                   \n"  // low byte of each src_argb1 word (bytes 0 and 2 of each pixel)
+    "paddw     %%xmm7,%%xmm3                   \n"  // 256 - a
+    "pmullw    %%xmm3,%%xmm2                   \n"  // scale the low bytes by 256 - a
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"  // reload the same src_argb1 pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "psrlw     $0x8,%%xmm1                     \n"  // high byte of each src_argb1 word (bytes 1 and 3 of each pixel)
+    "por       %%xmm4,%%xmm0                   \n"  // force the src_argb0 alpha byte to 0xff
+    "pmullw    %%xmm3,%%xmm1                   \n"  // scale the high bytes by 256 - a
+    "psrlw     $0x8,%%xmm2                     \n"  // divide the low-byte products by 256
+    "paddusb   %%xmm2,%%xmm0                   \n"  // saturating add into bytes 0 and 2
+    "pand      %%xmm5,%%xmm1                   \n"  // keep the high byte of each product (already in position)
+    "paddusb   %%xmm1,%%xmm0                   \n"  // saturating add into bytes 1 and 3
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"  // store 4 blended pixels
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       40b                             \n"  // loop while at least 4 pixels remain
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"  // %3 = remaining pixels - 1
+    "jl        99f                             \n"  // nothing left
+
+    // 1 pixel loop.
+  "91:                                         \n"
+    "movd      " MEMACCESS(0) ",%%xmm3         \n"  // same arithmetic as the 4-pixel loop, on a single 4-byte pixel
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm2         \n"
+    "pshufb    %4,%%xmm3                       \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x4,1) ",%1            \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "movd      %%xmm0," MEMACCESS(2) "         \n"  // store 1 blended pixel
+    "lea       " MEMLEA(0x4,2) ",%2            \n"
+    "sub       $0x1,%3                         \n"
+    "jge       91b                             \n"  // loop until the biased count goes negative
+  "99:                                         \n"
+  : "+r"(src_argb0),    // %0  first source pointer (its alpha drives the blend), advanced by the loops
+    "+r"(src_argb1),    // %1  second source pointer, advanced by the loops
+    "+r"(dst_argb),     // %2  destination pointer, advanced by the loops
+    "+r"(width)         // %3  pixel count, used as a biased loop counter
+  : "m"(kShuffleAlpha)  // %4  used as a pshufb memory operand, so it presumably must be 16-byte aligned - confirm against the uvec8 typedef
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+// Attenuate 4 pixels at a time: scale the three low bytes of each pixel by that pixel's top (alpha) byte; the alpha byte is copied unchanged.
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {  // The body runs once before width is tested and consumes 4 pixels per pass, so width is presumably a positive multiple of 4 - TODO confirm.
+  asm volatile (
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000 per dword (alpha mask)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrld     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ffffff per dword (colour mask)
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 pixels
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // pixels 0-1: duplicate each byte into a word (value * 0x101)
+    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"  // broadcast pixel 1's alpha word across its four words
+    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"  // broadcast pixel 0's alpha word across its four words
+    "pmulhuw   %%xmm2,%%xmm0                   \n"  // high 16 bits of channel word * alpha word
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"  // reload the same 4 pixels
+    "punpckhbw %%xmm1,%%xmm1                   \n"  // pixels 2-3: duplicate each byte into a word
+    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
+    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // reload once more to recover the original alpha bytes
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "psrlw     $0x8,%%xmm0                     \n"  // reduce the products back to 8 bits
+    "pand      %%xmm4,%%xmm2                   \n"  // xmm2 = original alpha bytes only
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // back to 4 packed pixels
+    "pand      %%xmm5,%%xmm0                   \n"  // drop the computed alpha byte
+    "por       %%xmm2,%%xmm0                   \n"  // put the original alpha byte back
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"  // loop while the remaining count is > 0
+  : "+r"(src_argb),    // %0  source pointer, advanced by the loop
+    "+r"(dst_argb),    // %1  destination pointer, advanced by the loop
+    "+r"(width)        // %2  remaining pixel count
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"  // xmm3 is listed although the code above never uses it
+  );
+}
+#endif  // HAS_ARGBATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle tables duplicating alpha: for each pixel, the alpha byte fills both bytes of the first three words and the fourth word is zeroed (index 128). kShuffleAlpha0 covers pixels 0-1.
+static uvec8 kShuffleAlpha0 = {
+  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
+};
+static uvec8 kShuffleAlpha1 = {  // same layout for pixels 2-3 (alpha bytes 11 and 15)
+  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
+};
+// Attenuate 4 pixels at a time: scale the three low bytes of each pixel by that pixel's top (alpha) byte; the alpha byte is copied unchanged.
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {  // The body runs once before width is tested and consumes 4 pixels per pass, so width is presumably a positive multiple of 4 - TODO confirm.
+  asm volatile (
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "pslld     $0x18,%%xmm3                    \n"  // xmm3 = 0xff000000 per dword (alpha mask)
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kShuffleAlpha0 (aligned load)
+    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kShuffleAlpha1 (aligned load)
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 pixels
+    "pshufb    %%xmm4,%%xmm0                   \n"  // alpha words for pixels 0-1 (zero in the alpha slot)
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "punpcklbw %%xmm1,%%xmm1                   \n"  // pixels 0-1: duplicate each byte into a word (value * 0x101)
+    "pmulhuw   %%xmm1,%%xmm0                   \n"  // high 16 bits of channel word * alpha word
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"  // alpha words for pixels 2-3
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "punpckhbw %%xmm2,%%xmm2                   \n"  // pixels 2-3: duplicate each byte into a word
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // reload to recover the original alpha bytes
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "pand      %%xmm3,%%xmm2                   \n"  // xmm2 = original alpha bytes only
+    "psrlw     $0x8,%%xmm0                     \n"  // reduce the products back to 8 bits
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // back to 4 packed pixels; alpha slots are zero from the shuffle
+    "por       %%xmm2,%%xmm0                   \n"  // put the original alpha byte back
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"  // loop while the remaining count is > 0
+  : "+r"(src_argb),    // %0  source pointer, advanced by the loop
+    "+r"(dst_argb),    // %1  destination pointer, advanced by the loop
+    "+r"(width)        // %2  remaining pixel count
+  : "m"(kShuffleAlpha0),  // %3
+    "m"(kShuffleAlpha1)  // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha: applied to byte-unpacked pixels, it copies each pixel's alpha word (bytes 6-7 / 14-15) into the first three words and zeroes the fourth.
+static const uvec8 kShuffleAlpha_AVX2 = {
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+};
+// Attenuate 8 pixels at a time: scale the three low bytes of each pixel by that pixel's top (alpha) byte; the alpha byte is copied unchanged.
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {  // The body runs once before width is tested and consumes 8 pixels per pass, so width is presumably a positive multiple of 8 - TODO confirm.
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm4                  \n"  // ymm4 = kShuffleAlpha_AVX2 in both 128-bit lanes
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    "vpslld     $0x18,%%ymm5,%%ymm5            \n"  // ymm5 = 0xff000000 per dword (alpha mask)
+    "sub        %0,%1                          \n"  // %1 becomes dst - src so only %0 needs advancing
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"  // load 8 pixels
+    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"  // duplicate each byte into a word (low half of each lane)
+    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"  // duplicate each byte into a word (high half of each lane)
+    "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"  // alpha words, zero in the alpha slot
+    "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"  // high 16 bits of channel word * alpha word
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpand      %%ymm5,%%ymm6,%%ymm6           \n"  // ymm6 = original alpha bytes only
+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"  // reduce the products back to 8 bits
+    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // per-lane pack undoes the per-lane unpack, so no vpermq is needed
+    "vpor       %%ymm6,%%ymm0,%%ymm0           \n"  // put the original alpha byte back
+    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // advance src (and, via the offset in %1, dst) by 32 bytes
+    "sub        $0x8,%2                        \n"
+    "jg        1b                              \n"  // loop while the remaining count is > 0
+    "vzeroupper                                \n"  // clear upper YMM halves before returning to non-VEX code
+  : "+r"(src_argb),    // %0  source pointer, advanced by the loop
+    "+r"(dst_argb),    // %1  destination; rewritten to the offset dst - src
+    "+r"(width)        // %2  remaining pixel count
+  : "m"(kShuffleAlpha_AVX2)  // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time: each pixel's alpha byte indexes fixed_invtbl8 and the 32-bit entry found there multiplies the pixel's channels.
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,  // fixed_invtbl8 is defined elsewhere; presumably a 256-entry reciprocal table - confirm at its definition.
+                             int width) {  // The body runs once before width is tested and consumes 4 pixels per pass.
+  uintptr_t alpha = 0;  // scratch register that receives each pixel's alpha byte for the table lookup
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 pixels
+    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"  // alpha of pixel 0
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // pixels 0-1: duplicate each byte into a word
+    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"  // alpha of pixel 1
+    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
+    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"  // word select 0,0,0,1: entry's low word for the three colour slots, its high word for the alpha slot
+    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+    "movlhps   %%xmm3,%%xmm2                   \n"  // xmm2 = multipliers for pixel 0 (low half) and pixel 1 (high half)
+    "pmulhuw   %%xmm2,%%xmm0                   \n"  // high 16 bits of channel word * multiplier
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"  // reload the same 4 pixels
+    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"  // alpha of pixel 2
+    "punpckhbw %%xmm1,%%xmm1                   \n"  // pixels 2-3: duplicate each byte into a word
+    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"  // alpha of pixel 3
+    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
+    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+    "movlhps   %%xmm3,%%xmm2                   \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // saturate the words back to bytes
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"  // loop while the remaining count is > 0
+  : "+r"(src_argb),    // %0  source pointer, advanced by the loop
+    "+r"(dst_argb),    // %1  destination pointer, advanced by the loop
+    "+r"(width),       // %2  remaining pixel count
+    "+r"(alpha)        // %3  table index scratch
+  : "r"(fixed_invtbl8)  // %4  table base address
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"  // xmm4 and xmm5 are listed although the code above never uses them
+  );
+}
+#endif  // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha: per 8-byte group it selects words 0,0,0,3 of the word-duplicated table entries built below.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// Unattenuate 8 pixels at a time: each pixel's alpha byte indexes fixed_invtbl8 and the 32-bit entry found there multiplies the pixel's channels.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,  // fixed_invtbl8 is defined elsewhere; presumably a 256-entry reciprocal table - confirm at its definition.
+                             int width) {  // The body runs once before width is tested and consumes 8 pixels per pass.
+  uintptr_t alpha = 0;  // scratch register that receives each pixel's alpha byte for the table lookup
+  asm volatile (
+    "sub        %0,%1                          \n"  // %1 becomes dst - src so only %0 needs advancing
+    "vbroadcastf128 %5,%%ymm5                  \n"  // ymm5 = kUnattenShuffleAlpha_AVX2 in both 128-bit lanes
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    // Scalar stand-in for VPGATHER: fetch the table entry for each of the 8 alpha bytes.
+    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"  // alpha of pixel 0
+    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
+    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"  // alpha of pixel 1
+    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
+    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"  // alpha of pixel 2
+    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"  // xmm6 = entries for pixels 0,1
+    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"  // alpha of pixel 3
+    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
+    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"  // alpha of pixel 4
+    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"  // xmm7 = entries for pixels 2,3
+    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
+    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"  // alpha of pixel 5
+    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
+    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"  // alpha of pixel 6
+    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"  // xmm0 = entries for pixels 4,5
+    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"  // alpha of pixel 7
+    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
+    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"  // xmm2 = entries for pixels 6,7
+    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"  // xmm3 = entries for pixels 0-3
+    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"  // xmm0 = entries for pixels 4-7
+    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"  // ymm3 = all 8 entries in pixel order
+    // End of the VPGATHER stand-in.
+
+    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"  // load 8 pixels
+    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"  // duplicate each byte into a word (low half of each lane)
+    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"  // duplicate each byte into a word (high half of each lane)
+    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"  // duplicate each entry word (low half of each lane)
+    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"  // duplicate each entry word (high half of each lane)
+    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"  // per pixel: entry's low word x3, then its high word for the alpha slot
+    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"  // high 16 bits of channel word * multiplier
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // saturate back to bytes; per-lane pack undoes the per-lane unpack
+    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // advance src (and, via the offset in %1, dst) by 32 bytes
+    "sub        $0x8,%2                        \n"
+    "jg        1b                              \n"  // loop while the remaining count is > 0
+    "vzeroupper                                \n"  // clear upper YMM halves before returning to non-VEX code
+  : "+r"(src_argb),    // %0  source pointer, advanced by the loop
+    "+r"(dst_argb),    // %1  destination; rewritten to the offset dst - src
+    "+r"(width),       // %2  remaining pixel count
+    "+r"(alpha)        // %3  table index scratch
+  : "r"(fixed_invtbl8),  // %4  table base address
+    "m"(kUnattenShuffleAlpha_AVX2)  // %5
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"  // xmm4 is listed although the code above never uses it
+  );
+}
+#endif  // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels: the three low bytes of each pixel become one weighted value, the top (alpha) byte is preserved.
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {  // The body runs once before width is tested and consumes 8 pixels per pass, so width is presumably a positive multiple of 8 - TODO confirm.
+  asm volatile (
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ weights (defined elsewhere in the file)
+    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64, added before the >> 7 (presumably the rounding term - confirm at its definition)
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0-3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // pixels 4-7
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // multiply bytes by weights, sum adjacent pairs into words
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"  // sum the two words of each pixel: one word per pixel
+    "paddw     %%xmm5,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"  // scale down by 128
+    "packuswb  %%xmm0,%%xmm0                   \n"  // 8 gray bytes
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // reload the same pixels to extract alpha
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrld     $0x18,%%xmm2                    \n"  // top byte of each pixel
+    "psrld     $0x18,%%xmm3                    \n"
+    "packuswb  %%xmm3,%%xmm2                   \n"
+    "packuswb  %%xmm2,%%xmm2                   \n"  // 8 alpha bytes
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // words of (gray, gray)
+    "punpcklbw %%xmm2,%%xmm3                   \n"  // words of (gray, alpha)
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm3,%%xmm0                   \n"  // pixels 0-3 as gray, gray, gray, alpha
+    "punpckhwd %%xmm3,%%xmm1                   \n"  // pixels 4-7 as gray, gray, gray, alpha
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"  // loop while the remaining count is > 0
+  : "+r"(src_argb),   // %0  source pointer, advanced by the loop
+    "+r"(dst_argb),   // %1  destination pointer, advanced by the loop
+    "+r"(width)       // %2  remaining pixel count
+  : "m"(kARGBToYJ),   // %3
+    "m"(kAddYJ64)     // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+// Constants for ARGB color to sepia tone; each table repeats one row of weights in memory order b, g, r, a (the alpha weight is 0).
+static vec8 kARGBToSepiaB = {
+  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static vec8 kARGBToSepiaG = {
+  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static vec8 kARGBToSepiaR = {
+  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place; the alpha byte is preserved.
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {  // The body runs once before width is tested and consumes 8 pixels per pass, so width is presumably a positive multiple of 8 - TODO confirm.
+  asm volatile (
+    "movdqa    %2,%%xmm2                       \n"  // xmm2 = weights for the new B
+    "movdqa    %3,%%xmm3                       \n"  // xmm3 = weights for the new G
+    "movdqa    %4,%%xmm4                       \n"  // xmm4 = weights for the new R
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0-3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"  // pixels 4-7
+    "pmaddubsw %%xmm2,%%xmm0                   \n"  // multiply bytes by B weights, sum adjacent pairs
+    "pmaddubsw %%xmm2,%%xmm6                   \n"
+    "phaddw    %%xmm6,%%xmm0                   \n"  // one word per pixel
+    "psrlw     $0x7,%%xmm0                     \n"  // >> 7 as in the formulas above
+    "packuswb  %%xmm0,%%xmm0                   \n"  // 8 new B bytes (saturated)
+    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"  // reload the pixels for the G pass
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm3,%%xmm5                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "phaddw    %%xmm1,%%xmm5                   \n"
+    "psrlw     $0x7,%%xmm5                     \n"
+    "packuswb  %%xmm5,%%xmm5                   \n"  // 8 new G bytes
+    "punpcklbw %%xmm5,%%xmm0                   \n"  // xmm0 = words of (B, G)
+    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"  // reload the pixels for the R pass
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm4,%%xmm5                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "phaddw    %%xmm1,%%xmm5                   \n"
+    "psrlw     $0x7,%%xmm5                     \n"
+    "packuswb  %%xmm5,%%xmm5                   \n"  // 8 new R bytes
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"  // reload the pixels to extract alpha
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "psrld     $0x18,%%xmm6                    \n"  // top byte of each pixel
+    "psrld     $0x18,%%xmm1                    \n"
+    "packuswb  %%xmm1,%%xmm6                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"  // 8 alpha bytes
+    "punpcklbw %%xmm6,%%xmm5                   \n"  // xmm5 = words of (R, A)
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm5,%%xmm0                   \n"  // pixels 0-3 as B, G, R, A
+    "punpckhwd %%xmm5,%%xmm1                   \n"  // pixels 4-7 as B, G, R, A
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"  // write back over the pixels just read
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub       $0x8,%1                         \n"
+    "jg        1b                              \n"  // loop while the remaining count is > 0
+  : "+r"(dst_argb),      // %0  pixel pointer (read and written), advanced by the loop
+    "+r"(width)          // %1  remaining pixel count
+  : "m"(kARGBToSepiaB),  // %2
+    "m"(kARGBToSepiaG),  // %3
+    "m"(kARGBToSepiaR)   // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided: 16 signed bytes, four weights per output byte, with each sum shifted right by 6.
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,  // Output byte k of each pixel = saturate((sum of the 4 input bytes * matrix_argb[4k..4k+3]) >> 6).
+                              const int8* matrix_argb, int width) {  // The body runs once before width is tested and consumes 8 pixels per pass.
+  asm volatile (
+    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"  // load all 16 matrix bytes
+    "pshufd    $0x00,%%xmm5,%%xmm2             \n"  // xmm2 = matrix bytes 0-3 repeated (weights for output byte 0)
+    "pshufd    $0x55,%%xmm5,%%xmm3             \n"  // xmm3 = matrix bytes 4-7 repeated (output byte 1)
+    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"  // xmm4 = matrix bytes 8-11 repeated (output byte 2)
+    "pshufd    $0xff,%%xmm5,%%xmm5             \n"  // xmm5 = matrix bytes 12-15 repeated (output byte 3)
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0-3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"  // pixels 4-7
+    "pmaddubsw %%xmm2,%%xmm0                   \n"  // unsigned pixel bytes * signed weights, adjacent pairs summed
+    "pmaddubsw %%xmm2,%%xmm7                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"  // reload the pixels for output byte 1
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "phaddsw   %%xmm7,%%xmm0                   \n"  // one saturated signed sum per pixel
+    "phaddsw   %%xmm1,%%xmm6                   \n"
+    "psraw     $0x6,%%xmm0                     \n"  // arithmetic >> 6 keeps the sign
+    "psraw     $0x6,%%xmm6                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"  // clamp to 0..255: 8 bytes for output byte 0
+    "packuswb  %%xmm6,%%xmm6                   \n"  // 8 bytes for output byte 1
+    "punpcklbw %%xmm6,%%xmm0                   \n"  // xmm0 = words of (byte 0, byte 1)
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"  // reload the pixels for output byte 2
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm7                   \n"
+    "phaddsw   %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"  // reload the pixels for output byte 3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "pmaddubsw %%xmm5,%%xmm6                   \n"
+    "pmaddubsw %%xmm5,%%xmm7                   \n"
+    "phaddsw   %%xmm7,%%xmm6                   \n"
+    "psraw     $0x6,%%xmm1                     \n"
+    "psraw     $0x6,%%xmm6                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "punpcklbw %%xmm6,%%xmm1                   \n"  // xmm1 = words of (byte 2, byte 3)
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "punpcklwd %%xmm1,%%xmm0                   \n"  // pixels 0-3 reassembled
+    "punpckhwd %%xmm1,%%xmm6                   \n"  // pixels 4-7 reassembled
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"  // loop while the remaining count is > 0
+  : "+r"(src_argb),      // %0  source pointer, advanced by the loop
+    "+r"(dst_argb),      // %1  destination pointer, advanced by the loop
+    "+r"(width)          // %2  remaining pixel count
+  : "r"(matrix_argb)     // %3  pointer to the 16 weight bytes (read once, unaligned)
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes) per pass, in place: each colour byte becomes ((v * scale) >> 16) * interval_size + interval_offset, saturated to 8 bits.
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,  // Only the low 16 bits of each int parameter reach the colour slots; the alpha slot gets bits 16-31 (see the pshuflw notes below).
+                          int interval_offset, int width) {  // The body runs once before width is tested and consumes 4 pixels per pass.
+  asm volatile (
+    "movd      %2,%%xmm2                       \n"  // scale
+    "movd      %3,%%xmm3                       \n"  // interval_size
+    "movd      %4,%%xmm4                       \n"  // interval_offset
+    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"  // word select 0,0,0,1: low word in three slots, high word in the fourth
+    "pshufd    $0x44,%%xmm2,%%xmm2             \n"  // repeat that 4-word pattern for the second pixel
+    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
+    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
+    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
+    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0, used to zero-extend bytes to words
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "pslld     $0x18,%%xmm6                    \n"  // xmm6 = 0xff000000 per dword (alpha mask)
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 pixels
+    "punpcklbw %%xmm5,%%xmm0                   \n"  // pixels 0-1 as zero-extended words
+    "pmulhuw   %%xmm2,%%xmm0                   \n"  // (v * scale) >> 16
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"  // pixels 2-3 as zero-extended words
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "pmullw    %%xmm3,%%xmm0                   \n"  // * interval_size
+    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"  // reload to recover the original alpha bytes
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "pand      %%xmm6,%%xmm7                   \n"  // xmm7 = original alpha bytes only
+    "paddw     %%xmm4,%%xmm0                   \n"  // + interval_offset
+    "paddw     %%xmm4,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // saturate back to bytes
+    "por       %%xmm7,%%xmm0                   \n"  // OR the original alpha in; the computed alpha byte is not masked first, so this relies on it being 0
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"  // write back over the pixels just read
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "sub       $0x4,%1                         \n"
+    "jg        1b                              \n"  // loop while the remaining count is > 0
+  : "+r"(dst_argb),       // %0  pixel pointer (read and written), advanced by the loop
+    "+r"(width)           // %1  remaining pixel count
+  : "r"(scale),           // %2
+    "r"(interval_size),   // %3
+    "r"(interval_offset)  // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time: each byte of a pixel is scaled by the matching byte of value.
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  asm volatile (
+    "movd      %3,%%xmm2                       \n"
+    "punpcklbw %%xmm2,%%xmm2                   \n"  // each value byte doubled into a word
+    "punpcklqdq %%xmm2,%%xmm2                  \n"  // same 4 words in both halves
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // first 2 pixels, bytes doubled into words
+    "punpckhbw %%xmm1,%%xmm1                   \n"  // last 2 pixels, bytes doubled into words
+    "pmulhuw   %%xmm2,%%xmm0                   \n"  // high 16 bits of pixel word * value word
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"  // keep the top byte of each product word
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"  // repeat while remaining width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(value)       // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together byte by byte, 4 pixels at a time.
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    "pxor      %%xmm5,%%xmm5                  \n"  // zero, used to widen src_argb1 bytes
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "movdqu    %%xmm0,%%xmm1                   \n"
+    "movdqu    %%xmm2,%%xmm3                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // src_argb0 bytes doubled into words
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"  // src_argb1 bytes zero-extended to words
+    "punpckhbw %%xmm5,%%xmm3                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"  // high 16 bits of each word product
+    "pmulhuw   %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // back to bytes with unsigned saturation
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        1b                              \n"  // repeat while remaining width > 0
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together byte by byte, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // zero, used to widen src_argb1 bytes
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"  // src_argb0 bytes doubled into words
+    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
+    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"  // src_argb1 bytes zero-extended to words
+    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"  // high 16 bits of each word product
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // back to bytes with unsigned saturation
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea       " MEMLEA(0x20,2) ",%2           \n"
+    "sub        $0x8,%3                        \n"
+    "jg        1b                              \n"  // repeat while remaining width > 0
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    // Declared unconditionally: the asm overwrites these registers whether or
+    // not __AVX2__ is defined, so the compiler must always be told about them.
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together with unsigned byte saturation, 4 pixels at a time.
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"  // src_argb0 + src_argb1, clamped at 255
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        1b                              \n"  // repeat while remaining width > 0
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together with unsigned byte saturation, 8 pixels at a time.
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop (32 bytes per iteration).
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"  // src_argb0 + src_argb1, clamped at 255
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea        " MEMLEA(0x20,2) ",%2          \n"
+    "sub        $0x8,%3                        \n"
+    "jg        1b                              \n"  // repeat while remaining width > 0
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0"
+  );
+}
+#endif  // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract src_argb1 from src_argb0 with unsigned byte saturation, 4 pixels at a time.
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "psubusb   %%xmm1,%%xmm0                   \n"  // src_argb0 - src_argb1, clamped at 0
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        1b                              \n"  // repeat while remaining width > 0
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract src_argb1 from src_argb0 with unsigned byte saturation, 8 pixels at a time.
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop (32 bytes per iteration).
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"  // src_argb0 - src_argb1, clamped at 0
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea        " MEMLEA(0x20,2) ",%2          \n"
+    "sub        $0x8,%3                        \n"
+    "jg        1b                              \n"  // repeat while remaining width > 0
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0"
+  );
+}
+#endif  // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is (the absolute value of the sum is stored, 8 pixels per loop)
+// -1  0  1
+// -2  0  2
+// -1  0  1
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"  // src_y1 becomes an offset from src_y0
+    "sub       %0,%2                           \n"  // src_y2 becomes an offset from src_y0
+    "sub       %0,%3                           \n"  // dst_sobelx becomes an offset from src_y0
+    "pxor      %%xmm5,%%xmm5                   \n"  // zero, used to widen bytes to words
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "psubw     %%xmm1,%%xmm0                   \n"  // row 0: y0[x] - y0[x + 2]
+    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
+    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "psubw     %%xmm2,%%xmm1                   \n"  // row 1: y1[x] - y1[x + 2]
+    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
+    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm3                   \n"
+    "psubw     %%xmm3,%%xmm2                   \n"  // row 2: y2[x] - y2[x + 2]
+    "paddw     %%xmm2,%%xmm0                   \n"  // row 0 + row 2
+    "paddw     %%xmm1,%%xmm0                   \n"  // + row 1
+    "paddw     %%xmm1,%%xmm0                   \n"  // + row 1 again (weight 2)
+    "pxor      %%xmm1,%%xmm1                   \n"
+    "psubw     %%xmm0,%%xmm1                   \n"  // negated sum
+    "pmaxsw    %%xmm1,%%xmm0                   \n"  // max(sum, -sum) = absolute value
+    "packuswb  %%xmm0,%%xmm0                   \n"  // to bytes with unsigned saturation
+    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
+    "lea       " MEMLEA(0x8,0) ",%0            \n"  // advancing %0 advances all four rows
+    "sub       $0x8,%4                         \n"
+    "jg        1b                              \n"  // repeat while remaining width > 0
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is (the absolute value of the sum is stored, 8 pixels per loop)
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"  // src_y1 becomes an offset from src_y0
+    "sub       %0,%2                           \n"  // dst_sobely becomes an offset from src_y0
+    "pxor      %%xmm5,%%xmm5                   \n"  // zero, used to widen bytes to words
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "psubw     %%xmm1,%%xmm0                   \n"  // column 0: y0[x] - y1[x]
+    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
+    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "psubw     %%xmm2,%%xmm1                   \n"  // column 1: y0[x + 1] - y1[x + 1]
+    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
+    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm3                   \n"
+    "psubw     %%xmm3,%%xmm2                   \n"  // column 2: y0[x + 2] - y1[x + 2]
+    "paddw     %%xmm2,%%xmm0                   \n"  // column 0 + column 2
+    "paddw     %%xmm1,%%xmm0                   \n"  // + column 1
+    "paddw     %%xmm1,%%xmm0                   \n"  // + column 1 again (weight 2)
+    "pxor      %%xmm1,%%xmm1                   \n"
+    "psubw     %%xmm0,%%xmm1                   \n"  // negated sum
+    "pmaxsw    %%xmm1,%%xmm0                   \n"  // max(sum, -sum) = absolute value
+    "packuswb  %%xmm0,%%xmm0                   \n"  // to bytes with unsigned saturation
+    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
+    "lea       " MEMLEA(0x8,0) ",%0            \n"  // advancing %0 advances both rows and dst
+    "sub       $0x8,%3                         \n"
+    "jg        1b                              \n"  // repeat while remaining width > 0
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into ARGB, 16 pixels per loop.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"  // src_sobely becomes an offset from src_sobelx
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pslld     $0x18,%%xmm5                    \n"  // 0xff000000: alpha byte of each pixel
+
+    // 16 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"  // sobel = sobelx + sobely, clamped at 255
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "punpcklbw %%xmm0,%%xmm2                   \n"  // first 8 values, each byte doubled
+    "punpckhbw %%xmm0,%%xmm0                   \n"  // last 8 values, each byte doubled
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm1                   \n"  // pixels 0-3: value repeated in all 4 bytes
+    "punpckhwd %%xmm2,%%xmm2                   \n"  // pixels 4-7
+    "por       %%xmm5,%%xmm1                   \n"  // force alpha byte to 0xff
+    "por       %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "punpcklwd %%xmm0,%%xmm3                   \n"  // pixels 8-11
+    "punpckhwd %%xmm0,%%xmm0                   \n"  // pixels 12-15
+    "por       %%xmm5,%%xmm3                   \n"
+    "por       %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
+    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
+    "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"  // repeat while remaining width > 0
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y (unsigned saturating) and stores Sobel into a
+// plane, 16 pixels per loop. src_sobely is turned into an offset from
+// src_sobelx so that one pointer increment advances both sources.
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"
+
+    // 16 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"  // sobelx + sobely, clamped at 255
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"  // repeat while remaining width > 0
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_y),       // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel (saturating X + Y) into ARGB, 16 pixels per loop.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"  // src_sobely becomes an offset from src_sobelx
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // all bytes 0xff, source of the alpha bytes
+
+    // 16 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "paddusb   %%xmm1,%%xmm2                   \n"  // sobel = sobelx + sobely, clamped at 255
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "punpcklbw %%xmm5,%%xmm3                   \n"  // (sobelx, 0xff) byte pairs, first 8 pixels
+    "punpckhbw %%xmm5,%%xmm0                   \n"  // (sobelx, 0xff) byte pairs, last 8 pixels
+    "movdqa    %%xmm1,%%xmm4                   \n"
+    "punpcklbw %%xmm2,%%xmm4                   \n"  // (sobely, sobel) byte pairs, first 8 pixels
+    "punpckhbw %%xmm2,%%xmm1                   \n"  // (sobely, sobel) byte pairs, last 8 pixels
+    "movdqa    %%xmm4,%%xmm6                   \n"
+    "punpcklwd %%xmm3,%%xmm6                   \n"  // pixels 0-3 as sobely, sobel, sobelx, 0xff
+    "punpckhwd %%xmm3,%%xmm4                   \n"  // pixels 4-7
+    "movdqa    %%xmm1,%%xmm7                   \n"
+    "punpcklwd %%xmm0,%%xmm7                   \n"  // pixels 8-11
+    "punpckhwd %%xmm0,%%xmm1                   \n"  // pixels 12-15
+    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
+    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
+    "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"  // repeat while remaining width > 0
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value (4 int32 sums per 4-byte pixel).
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+                                  const int32* previous_cumsum, int width) {
+  asm volatile (
+    "pxor      %%xmm0,%%xmm0                   \n"  // running sum for this row, 4 x int32
+    "pxor      %%xmm1,%%xmm1                   \n"  // zero, used to widen bytes
+    "sub       $0x4,%3                         \n"
+    "jl        49f                             \n"  // fewer than 4 pixels: 1 pixel loop only
+    "test      $0xf,%1                         \n"
+    "jne       49f                             \n"  // cumsum not 16-byte aligned: 1 pixel loop only
+
+  // 4 pixel loop.
+    LABELALIGN
+  "40:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm2,%%xmm4                   \n"
+    "punpcklbw %%xmm1,%%xmm2                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklwd %%xmm1,%%xmm2                   \n"  // pixel 0 as 4 x int32
+    "punpckhwd %%xmm1,%%xmm3                   \n"  // pixel 1 as 4 x int32
+    "punpckhbw %%xmm1,%%xmm4                   \n"
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "punpcklwd %%xmm1,%%xmm4                   \n"  // pixel 2 as 4 x int32
+    "punpckhwd %%xmm1,%%xmm5                   \n"  // pixel 3 as 4 x int32
+    "paddd     %%xmm2,%%xmm0                   \n"  // running sum += pixel 0
+    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
+    "paddd     %%xmm0,%%xmm2                   \n"  // previous_cumsum + running sum
+    "paddd     %%xmm3,%%xmm0                   \n"  // running sum += pixel 1
+    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
+    "paddd     %%xmm0,%%xmm3                   \n"
+    "paddd     %%xmm4,%%xmm0                   \n"  // running sum += pixel 2
+    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
+    "paddd     %%xmm0,%%xmm4                   \n"
+    "paddd     %%xmm5,%%xmm0                   \n"  // running sum += pixel 3
+    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
+    "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "paddd     %%xmm0,%%xmm5                   \n"
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
+    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"  // undo the -4 bias, leaving remaining - 1
+    "jl        19f                             \n"  // nothing left
+
+  // 1 pixel loop.
+    LABELALIGN
+  "10:                                         \n"
+    "movd      " MEMACCESS(0) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "punpcklbw %%xmm1,%%xmm2                   \n"
+    "punpcklwd %%xmm1,%%xmm2                   \n"  // pixel as 4 x int32
+    "paddd     %%xmm2,%%xmm0                   \n"  // running sum += pixel
+    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "paddd     %%xmm0,%%xmm2                   \n"  // previous_cumsum + running sum
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x1,%3                         \n"
+    "jge       10b                             \n"
+
+  "19:                                         \n"
+  : "+r"(row),  // %0
+    "+r"(cumsum),  // %1
+    "+r"(previous_cumsum),  // %2
+    "+r"(width)  // %3
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+                                    int width, int area, uint8* dst,
+                                    int count) {  // per int32 lane: (tl[i] - tl[i+width] - bl[i] + bl[i+width]) * ~(1/area) -> byte
+  asm volatile (
+    "movd      %5,%%xmm5                       \n"  // area
+    "cvtdq2ps  %%xmm5,%%xmm5                   \n"  // area as float
+    "rcpss     %%xmm5,%%xmm4                   \n"  // approximate reciprocal of area
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"  // broadcast to all 4 lanes
+    "sub       $0x4,%3                         \n"
+    "jl        49f                             \n"  // fewer than 4 pixels: 1 pixel loop only
+    "cmpl      $0x80,%5                        \n"
+    "ja        40f                             \n"  // area above 0x80 (unsigned): float loop
+
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // float area in all 4 lanes
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrld     $0x10,%%xmm6                    \n"  // 0x0000ffff per lane
+    "cvtdq2ps  %%xmm6,%%xmm6                   \n"  // 65535.0f
+    "addps     %%xmm6,%%xmm5                   \n"  // area + 65535
+    "mulps     %%xmm4,%%xmm5                   \n"  // (area + 65535) * reciprocal
+    "cvtps2dq  %%xmm5,%%xmm5                   \n"
+    "packssdw  %%xmm5,%%xmm5                   \n"  // 16-bit multiplier. NOTE(review): signed saturation caps this at 32767 — confirm for very small areas
+
+  // 4 pixel loop for small areas: 16-bit fixed-point multiply.
+    LABELALIGN
+  "4:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
+    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
+    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
+    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
+    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
+    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
+    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
+    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "packssdw  %%xmm1,%%xmm0                   \n"  // sums narrowed to 16 bits
+    "packssdw  %%xmm3,%%xmm2                   \n"
+    "pmulhuw   %%xmm5,%%xmm0                   \n"  // high 16 bits of sum * multiplier
+    "pmulhuw   %%xmm5,%%xmm2                   \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"  // to bytes with unsigned saturation
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       4b                              \n"
+    "jmp       49f                             \n"
+
+  // 4 pixel loop using float multiply by the reciprocal.
+    LABELALIGN
+  "40:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
+    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
+    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
+    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
+    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
+    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
+    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
+    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
+    "mulps     %%xmm4,%%xmm0                   \n"  // sum * reciprocal of area
+    "mulps     %%xmm4,%%xmm1                   \n"
+    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
+    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
+    "mulps     %%xmm4,%%xmm2                   \n"
+    "mulps     %%xmm4,%%xmm3                   \n"
+    "cvtps2dq  %%xmm0,%%xmm0                   \n"
+    "cvtps2dq  %%xmm1,%%xmm1                   \n"
+    "cvtps2dq  %%xmm2,%%xmm2                   \n"
+    "cvtps2dq  %%xmm3,%%xmm3                   \n"
+    "packssdw  %%xmm1,%%xmm0                   \n"
+    "packssdw  %%xmm3,%%xmm2                   \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"  // to bytes with unsigned saturation
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"  // undo the -4 bias, leaving remaining - 1
+    "jl        19f                             \n"  // nothing left
+
+  // 1 pixel loop (always the float path).
+    LABELALIGN
+  "10:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+    "mulps     %%xmm4,%%xmm0                   \n"
+    "cvtps2dq  %%xmm0,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0," MEMACCESS(2) "         \n"  // 4 output bytes per pixel
+    "lea       " MEMLEA(0x4,2) ",%2            \n"
+    "sub       $0x1,%3                         \n"
+    "jge       10b                             \n"
+  "19:                                         \n"
+  : "+r"(topleft),  // %0
+    "+r"(botleft),  // %1
+    "+r"(dst),      // %2
+    "+rm"(count)    // %3
+  : "r"((intptr_t)(width)),  // %4: used as an index scaled by 4 bytes
+    "rm"(area)     // %5
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination (src_dudv holds 4 floats: start x, y then step x, y).
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* src_dudv, int width) {
+  intptr_t src_argb_stride_temp = src_argb_stride;
+  intptr_t temp = 0;
+  asm volatile (
+    "movq      " MEMACCESS(3) ",%%xmm2         \n"  // first 2 floats: current x, y
+    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"  // next 2 floats: per-pixel step in x, y
+    "shl       $0x10,%1                        \n"
+    "add       $0x4,%1                         \n"  // word pair (4, stride) for pmaddwd
+    "movd      %1,%%xmm5                       \n"
+    "sub       $0x4,%4                         \n"
+    "jl        49f                             \n"  // fewer than 4 pixels: 1 pixel loop only
+
+    "pshufd    $0x44,%%xmm7,%%xmm7             \n"  // step in both halves
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // (4, stride) in every lane
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "addps     %%xmm7,%%xmm0                   \n"
+    "movlhps   %%xmm0,%%xmm2                   \n"  // xmm2 = coordinates of pixels 0 and 1
+    "movdqa    %%xmm7,%%xmm4                   \n"
+    "addps     %%xmm4,%%xmm4                   \n"  // 2 * step
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "addps     %%xmm4,%%xmm3                   \n"  // xmm3 = coordinates of pixels 2 and 3
+    "addps     %%xmm4,%%xmm4                   \n"  // 4 * step, the advance per iteration
+
+  // 4 pixel loop.
+    LABELALIGN
+  "40:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
+    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
+    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
+    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
+    "movd      %%xmm0,%k1                      \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+    "movd      %%xmm0,%k5                      \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
+    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
+    "punpckldq %%xmm6,%%xmm1                   \n"
+    "addps     %%xmm4,%%xmm2                   \n"
+    "movq      %%xmm1," MEMACCESS(2) "         \n"  // store pixels 0 and 1
+    "movd      %%xmm0,%k1                      \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+    "movd      %%xmm0,%k5                      \n"
+    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
+    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
+    "punpckldq %%xmm6,%%xmm0                   \n"
+    "addps     %%xmm4,%%xmm3                   \n"
+    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"  // store pixels 2 and 3
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%4                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%4                         \n"  // undo the -4 bias, leaving remaining - 1
+    "jl        19f                             \n"  // nothing left
+
+  // 1 pixel loop.
+    LABELALIGN
+  "10:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
+    "addps     %%xmm7,%%xmm2                   \n"  // advance by one step
+    "movd      %%xmm0,%k1                      \n"
+    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x04,2) ",%2           \n"
+    "sub       $0x1,%4                         \n"
+    "jge       10b                             \n"
+  "19:                                         \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_stride_temp),  // %1: stride on entry, then reused for pixel offsets
+    "+r"(dst_argb),  // %2
+    "+r"(src_dudv),  // %3
+    "+rm"(width),    // %4
+    "+r"(temp)   // %5: scratch register for a second pixel offset
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1: blends the row at src_ptr with the row src_stride bytes after it into dst_ptr.
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) {  // source_y_fraction weights the second row; it is halved before use
+  asm volatile (
+    "sub       %1,%0                           \n"  // %0 = dst - src, so (%1,%0,1) addresses the output
+    "shr       %3                              \n"  // f = source_y_fraction >> 1
+    "cmp       $0x0,%3                         \n"
+    "je        100f                            \n"  // f == 0: copy the first row unchanged
+    "cmp       $0x20,%3                        \n"
+    "je        75f                             \n"
+    "cmp       $0x40,%3                        \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
+
+    "movd      %3,%%xmm0                       \n"  // f
+    "neg       %3                              \n"
+    "add       $0x80,%3                        \n"  // 0x80 - f
+    "movd      %3,%%xmm5                       \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"  // byte pair (0x80 - f, f)
+    "punpcklwd %%xmm5,%%xmm5                   \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // replicate the weight pair across xmm5
+
+    // General purpose row blend: 16 bytes per iteration.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"  // first row
+    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2  (second row)
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm2,%%xmm0                   \n"  // interleave first/second row bytes
+    "punpckhbw %%xmm2,%%xmm1                   \n"
+    "pmaddubsw %%xmm5,%%xmm0                   \n"  // first * (0x80 - f) + second * f
+    "pmaddubsw %%xmm5,%%xmm1                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"  // divide by 128
+    "psrlw     $0x7,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"  // loop while remaining width is positive
+    "jmp       99f                             \n"
+
+    // Blend 25 / 75.
+    LABELALIGN
+  "25:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
+    "pavgb     %%xmm1,%%xmm0                   \n"  // avg(first, second)
+    "pavgb     %%xmm1,%%xmm0                   \n"  // averaged with second again
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        25b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    LABELALIGN
+  "50:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 75 / 25.
+    LABELALIGN
+  "75:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"  // first row goes to xmm1 here
+    MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
+    "pavgb     %%xmm1,%%xmm0                   \n"  // avg(second, first)
+    "pavgb     %%xmm1,%%xmm0                   \n"  // averaged with first again
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    LABELALIGN
+  "100:                                        \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        100b                            \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),    // %0 (holds dst - src after the first instruction)
+    "+r"(src_ptr),    // %1
+    "+r"(dst_width),  // %2 (counted down by 16 per iteration)
+    "+r"(source_y_fraction)  // %3
+  : "r"((intptr_t)(src_stride))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm5"
+  );
+}
+#endif  // HAS_INTERPOLATEROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1: blends the row at src_ptr with the row src_stride bytes after it into dst_ptr.
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) {  // source_y_fraction weights the second row; it is halved before use
+  asm volatile (
+    "shr       %3                              \n"  // f = source_y_fraction >> 1
+    "cmp       $0x0,%3                         \n"
+    "je        100f                            \n"  // f == 0: copy path, taken before dst is rebased
+    "sub       %1,%0                           \n"  // %0 = dst - src, so (%1,%0,1) addresses the output
+    "cmp       $0x20,%3                        \n"
+    "je        75f                             \n"
+    "cmp       $0x40,%3                        \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
+
+    "vmovd      %3,%%xmm0                      \n"  // f
+    "neg        %3                             \n"
+    "add        $0x80,%3                       \n"  // 0x80 - f
+    "vmovd      %3,%%xmm5                      \n"
+    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"  // byte pair (0x80 - f, f)
+    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
+    "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"  // all-zero permute indices
+    "vpermd     %%ymm5,%%ymm0,%%ymm5           \n"  // copy dword 0 of ymm5 to every dword
+
+    // General purpose row blend: 32 bytes per iteration.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"  // first row
+    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)          // second row, from (%1,%4,1)
+    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"  // interleave first/second row bytes
+    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm5,%%ymm0,%%ymm0           \n"  // first * (0x80 - f) + second * f
+    "vpmaddubsw %%ymm5,%%ymm1,%%ymm1           \n"
+    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"  // divide by 128
+    "vpsrlw     $0x7,%%ymm1,%%ymm1             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)          // store to (%1,%0,1)
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"  // loop while remaining width is positive
+    "jmp       99f                             \n"
+
+    // Blend 25 / 75.
+    LABELALIGN
+  "25:                                         \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
+    MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
+    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"  // avg(first, second)
+    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"  // averaged with second again
+    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        25b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    LABELALIGN
+  "50:                                         \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
+    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
+    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 75 / 25.
+    LABELALIGN
+  "75:                                         \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm1        \n"  // first row goes to ymm1 here
+    MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
+    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"  // avg(second, first)
+    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"  // averaged with first again
+    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    LABELALIGN
+  "100:                                        \n"
+    "rep movsb " MEMMOVESTRING(1,0) "          \n"  // string copy; uses the D/S/c register constraints below
+    "jmp       999f                            \n"  // no ymm register was touched, so skip vzeroupper
+
+  "99:                                         \n"
+    "vzeroupper                                \n"
+  "999:                                        \n"
+  : "+D"(dst_ptr),    // %0 (destination index register, needed by rep movsb)
+    "+S"(src_ptr),    // %1 (source index register, needed by rep movsb)
+    "+c"(dst_width),  // %2 (count register, needed by rep movsb)
+    "+r"(source_y_fraction)  // %3
+  : "r"((intptr_t)(src_stride))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm5"
+  );
+}
+#endif  // HAS_INTERPOLATEROW_AVX2
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1: blends the row at src_ptr with the row src_stride bytes after it into dst_ptr.
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) {  // source_y_fraction weights the second row; it is halved before use
+  asm volatile (
+    "sub       %1,%0                           \n"  // %0 = dst - src, so (%1,%0,1) addresses the output
+    "shr       %3                              \n"  // f = source_y_fraction >> 1
+    "cmp       $0x0,%3                         \n"
+    "je        100f                            \n"  // f == 0: copy the first row unchanged
+    "cmp       $0x20,%3                        \n"
+    "je        75f                             \n"
+    "cmp       $0x40,%3                        \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
+
+    "movd      %3,%%xmm0                       \n"  // f
+    "neg       %3                              \n"
+    "add       $0x80,%3                        \n"  // 0x80 - f
+    "movd      %3,%%xmm5                       \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"  // 16-bit word: low byte 0x80 - f, high byte f
+    "punpcklwd %%xmm5,%%xmm5                   \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // replicate that word across xmm5
+    "pxor      %%xmm4,%%xmm4                   \n"  // zero, used to widen bytes to words
+
+    // General purpose row blend: 16 bytes per iteration, done in 16-bit lanes.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"  // first row
+    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklbw %%xmm4,%%xmm2                   \n"  // second row, low 8 bytes -> words
+    "punpckhbw %%xmm4,%%xmm3                   \n"  // second row, high 8 bytes -> words
+    "punpcklbw %%xmm4,%%xmm0                   \n"  // first row, low 8 bytes -> words
+    "punpckhbw %%xmm4,%%xmm1                   \n"  // first row, high 8 bytes -> words
+    "psubw     %%xmm0,%%xmm2                   \n"  // second - first
+    "psubw     %%xmm1,%%xmm3                   \n"
+    "paddw     %%xmm2,%%xmm2                   \n"  // doubled before the high-half multiply
+    "paddw     %%xmm3,%%xmm3                   \n"
+    "pmulhw    %%xmm5,%%xmm2                   \n"  // keep the high 16 bits of difference * weight
+    "pmulhw    %%xmm5,%%xmm3                   \n"
+    "paddw     %%xmm2,%%xmm0                   \n"  // first + scaled difference
+    "paddw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"  // loop while remaining width is positive
+    "jmp       99f                             \n"
+
+    // Blend 25 / 75.
+    LABELALIGN
+  "25:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
+    "pavgb     %%xmm1,%%xmm0                   \n"  // avg(first, second)
+    "pavgb     %%xmm1,%%xmm0                   \n"  // averaged with second again
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        25b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    LABELALIGN
+  "50:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 75 / 25.
+    LABELALIGN
+  "75:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"  // first row goes to xmm1 here
+    MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
+    "pavgb     %%xmm1,%%xmm0                   \n"  // avg(second, first)
+    "pavgb     %%xmm1,%%xmm0                   \n"  // averaged with first again
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    LABELALIGN
+  "100:                                        \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        100b                            \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),    // %0 (holds dst - src after the first instruction)
+    "+r"(src_ptr),    // %1
+    "+r"(dst_width),  // %2 (counted down by 16 per iteration)
+    "+r"(source_y_fraction)  // %3
+  : "r"((intptr_t)(src_stride))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_INTERPOLATEROW_SSE2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. Reorders bytes with pshufb using a 16-byte mask read from shuffler.
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int pix) {  // pix is counted down by 8 per iteration
+  asm volatile (
+    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"  // shuffle mask, loaded once
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 32 source bytes per iteration
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"  // same mask applied to each 16-byte half
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"  // loop while remaining pixel count is positive
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(shuffler)    // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_SSSE3
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. Reorders bytes with vpshufb using a 16-byte mask read from shuffler.
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {  // pix is counted down by 16 per iteration
+  asm volatile (
+    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"  // same 16-byte mask in both 128-bit lanes
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // 64 source bytes per iteration
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
+    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"  // loop while remaining pixel count is positive
+    "vzeroupper                                \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(shuffler)    // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. Four known shuffler patterns get SSE2 fast paths; any other uses a byte loop.
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  uintptr_t pixel_temp = 0u;  // scratch register for the dispatch value and the byte loop
+  asm volatile (
+    "pxor      %%xmm5,%%xmm5                   \n"  // zero, used to widen bytes to words
+    "mov       " MEMACCESS(4) ",%k2            \n"  // first four shuffler bytes as one 32-bit value
+    "cmp       $0x3000102,%k2                  \n"  // shuffler bytes 2,1,0,3
+    "je        3012f                           \n"
+    "cmp       $0x10203,%k2                    \n"  // shuffler bytes 3,2,1,0
+    "je        123f                            \n"
+    "cmp       $0x30201,%k2                    \n"  // shuffler bytes 1,2,3,0
+    "je        321f                            \n"
+    "cmp       $0x2010003,%k2                  \n"  // shuffler bytes 3,0,1,2
+    "je        2103f                           \n"
+
+    LABELALIGN
+  "1:                                          \n"  // generic path: one pixel (4 bytes) per iteration
+    "movzb     " MEMACCESS(4) ",%2             \n"  // shuffler[0] is the source byte index
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS(1) "            \n"
+    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
+    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
+    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "lea       " MEMLEA(0x4,1) ",%1            \n"
+    "sub       $0x1,%3                         \n"
+    "jg        1b                              \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "123:                                        \n"  // fast path: 4 pixels (16 bytes) per iteration
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"  // widen bytes to words so word shuffles can move them
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"  // 0x1b selects words 3,2,1,0
+    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // narrow back to bytes
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        123b                            \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "321:                                        \n"  // fast path: 4 pixels (16 bytes) per iteration
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"  // 0x39 selects words 1,2,3,0
+    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        321b                            \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "2103:                                       \n"  // fast path: 4 pixels (16 bytes) per iteration
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"  // 0x93 selects words 3,0,1,2
+    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        2103b                           \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "3012:                                       \n"  // fast path: 4 pixels (16 bytes) per iteration
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"  // 0xc6 selects words 2,1,0,3
+    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        3012b                           \n"
+
+  "99:                                         \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+d"(pixel_temp),  // %2 (pinned to the d register; its low byte is stored via %b2)
+    "+r"(pix)         // %3
+  : "r"(shuffler)      // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_SSE2
+
+#ifdef HAS_I422TOYUY2ROW_SSE2
+void I422ToYUY2Row_SSE2(const uint8* src_y,  // Interleaves separate Y, U and V rows into Y,U,Y,V byte order in dst_frame.
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {  // width is counted down by 16 per iteration
+ asm volatile (
+    "sub       %1,%2                             \n"  // %2 = src_v - src_u, so (%1,%2,1) addresses V
+    LABELALIGN
+  "1:                                            \n"
+    "movq      " MEMACCESS(1) ",%%xmm2           \n"  // 8 U bytes
+    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
+    "lea       " MEMLEA(0x8,1) ",%1              \n"
+    "punpcklbw %%xmm3,%%xmm2                     \n"  // U,V interleaved
+    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"  // 16 Y bytes
+    "lea       " MEMLEA(0x10,0) ",%0             \n"
+    "movdqa    %%xmm0,%%xmm1                     \n"
+    "punpcklbw %%xmm2,%%xmm0                     \n"  // Y first, then the chroma byte
+    "punpckhbw %%xmm2,%%xmm1                     \n"
+    "movdqu    %%xmm0," MEMACCESS(3) "           \n"  // 32 output bytes per iteration
+    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
+    "lea       " MEMLEA(0x20,3) ",%3             \n"
+    "sub       $0x10,%4                          \n"
+    "jg         1b                               \n"  // loop while remaining width is positive
+    : "+r"(src_y),  // %0
+      "+r"(src_u),  // %1
+      "+r"(src_v),  // %2 (holds src_v - src_u after the first instruction)
+      "+r"(dst_frame),  // %3
+      "+rm"(width)  // %4
+    :
+    : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+#endif  // HAS_I422TOYUY2ROW_SSE2
+
+#ifdef HAS_I422TOUYVYROW_SSE2
+void I422ToUYVYRow_SSE2(const uint8* src_y,  // Interleaves separate Y, U and V rows into U,Y,V,Y byte order in dst_frame.
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {  // width is counted down by 16 per iteration
+ asm volatile (
+    "sub        %1,%2                            \n"  // %2 = src_v - src_u, so (%1,%2,1) addresses V
+    LABELALIGN
+  "1:                                            \n"
+    "movq      " MEMACCESS(1) ",%%xmm2           \n"  // 8 U bytes
+    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
+    "lea       " MEMLEA(0x8,1) ",%1              \n"
+    "punpcklbw %%xmm3,%%xmm2                     \n"  // U,V interleaved
+    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"  // 16 Y bytes
+    "movdqa    %%xmm2,%%xmm1                     \n"
+    "lea       " MEMLEA(0x10,0) ",%0             \n"
+    "punpcklbw %%xmm0,%%xmm1                     \n"  // chroma byte first, then Y
+    "punpckhbw %%xmm0,%%xmm2                     \n"
+    "movdqu    %%xmm1," MEMACCESS(3) "           \n"  // 32 output bytes per iteration
+    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
+    "lea       " MEMLEA(0x20,3) ",%3             \n"
+    "sub       $0x10,%4                          \n"
+    "jg         1b                               \n"  // loop while remaining width is positive
+    : "+r"(src_y),  // %0
+      "+r"(src_u),  // %1
+      "+r"(src_v),  // %2 (holds src_v - src_u after the first instruction)
+      "+r"(dst_frame),  // %3
+      "+rm"(width)  // %4
+    :
+    : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+#endif  // HAS_I422TOUYVYROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,  // Applies a cubic polynomial to every byte, using four coefficient vectors read from poly.
+                            uint8* dst_argb, const float* poly,  // NOTE(review): mulps/addps read poly as memory operands; presumably poly must be 16-byte aligned - confirm
+                            int width) {  // width is counted down by 2 per iteration
+  asm volatile (
+    "pxor      %%xmm3,%%xmm3                   \n"  // zero, used to widen bytes
+
+    // 2 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // 8 source bytes
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "punpcklbw %%xmm3,%%xmm0                   \n"  // bytes -> words
+    "movdqa    %%xmm0,%%xmm4                   \n"
+    "punpcklwd %%xmm3,%%xmm0                   \n"  // words -> dwords, first 4 bytes
+    "punpckhwd %%xmm3,%%xmm4                   \n"  // words -> dwords, last 4 bytes
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"  // X as floats
+    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"  // X * coefficients at byte offset 0x10
+    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
+    "addps     " MEMACCESS(3) ",%%xmm0         \n"  // + coefficients at byte offset 0
+    "addps     " MEMACCESS(3) ",%%xmm4         \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "movdqa    %%xmm5,%%xmm6                   \n"
+    "mulps     %%xmm1,%%xmm2                   \n"  // X * X
+    "mulps     %%xmm5,%%xmm6                   \n"
+    "mulps     %%xmm2,%%xmm1                   \n"  // X * X * X
+    "mulps     %%xmm6,%%xmm5                   \n"
+    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"  // X^2 * coefficients at byte offset 0x20
+    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
+    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"  // X^3 * coefficients at byte offset 0x30
+    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
+    "addps     %%xmm2,%%xmm0                   \n"  // sum the four terms
+    "addps     %%xmm6,%%xmm4                   \n"
+    "addps     %%xmm1,%%xmm0                   \n"
+    "addps     %%xmm5,%%xmm4                   \n"
+    "cvttps2dq %%xmm0,%%xmm0                   \n"  // truncate to integers
+    "cvttps2dq %%xmm4,%%xmm4                   \n"
+    "packuswb  %%xmm4,%%xmm0                   \n"  // two pack passes narrow the results back to bytes
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // 8 output bytes
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x2,%2                         \n"
+    "jg        1b                              \n"  // loop while remaining width is positive
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(poly)        // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,  // Applies a cubic polynomial to every byte, using four coefficient vectors read from poly.
+                            uint8* dst_argb, const float* poly,
+                            int width) {  // width is counted down by 2 per iteration
+  asm volatile (
+    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"  // C0: 16 bytes at poly, copied to both lanes
+    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"  // C1: byte offset 0x10
+    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"  // C2: byte offset 0x20
+    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"  // C3: byte offset 0x30
+
+    // 2 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
+    "lea         " MEMLEA(0x8,0) ",%0          \n"
+    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
+    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
+    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
+    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
+    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
+    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
+    "vcvttps2dq  %%ymm0,%%ymm0                 \n"  // truncate to integers
+    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"  // dwords -> words, per 128-bit lane
+    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // gather both lanes' words into the low 128 bits
+    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"  // words -> bytes
+    "vmovq       %%xmm0," MEMACCESS(1) "       \n"  // 8 output bytes
+    "lea         " MEMLEA(0x8,1) ",%1          \n"
+    "sub         $0x2,%2                       \n"
+    "jg          1b                            \n"  // loop while remaining width is positive
+    "vzeroupper                                \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(poly)        // %3
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table: each of the 4 bytes of a pixel is replaced in place by a lookup in table_argb.
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+                           int width) {  // width is counted down by 1 per pixel
+  uintptr_t pixel_temp = 0u;  // scratch register holding the byte value, then the looked-up byte
+  asm volatile (
+    // 1 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movzb     " MEMACCESS(0) ",%1             \n"  // byte 0 of the pixel
+    "lea       " MEMLEA(0x4,0) ",%0            \n"  // advance early; later accesses use negative offsets
+    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
+    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"  // byte 1
+    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
+    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"  // byte 2
+    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
+    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"  // byte 3
+    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
+    "dec       %2                              \n"
+    "jg        1b                              \n"  // loop while remaining width is positive
+  : "+r"(dst_argb),   // %0
+    "+d"(pixel_temp), // %1 (pinned to the d register; its low byte is stored via %b1)
+    "+r"(width)       // %2
+  : "r"(table_argb)   // %3
+  : "memory", "cc");
+}
+#endif  // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table: the first 3 bytes of each 4-byte pixel are replaced in place; the 4th is not touched.
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+  uintptr_t pixel_temp = 0u;  // scratch register holding the byte value, then the looked-up byte
+  asm volatile (
+    // 1 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movzb     " MEMACCESS(0) ",%1             \n"  // byte 0 of the pixel
+    "lea       " MEMLEA(0x4,0) ",%0            \n"  // advance early; later accesses use negative offsets
+    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
+    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"  // byte 1
+    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
+    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"  // byte 2
+    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
+    "dec       %2                              \n"
+    "jg        1b                              \n"  // loop while remaining width is positive
+  : "+r"(dst_argb),   // %0
+    "+d"(pixel_temp), // %1 (pinned to the d register; its low byte is stored via %b1)
+    "+r"(width)       // %2
+  : "r"(table_argb)   // %3
+  : "memory", "cc");
+}
+#endif  // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table.
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                 int width,
+                                 const uint8* luma, uint32 lumacoeff) {
+  uintptr_t pixel_temp = 0u;
+  uintptr_t table_temp = 0u;
+  asm volatile (
+    "movd      %6,%%xmm3                       \n"
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psllw     $0x8,%%xmm4                     \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
+    "pmaddubsw %%xmm3,%%xmm0                   \n"
+    "phaddw    %%xmm0,%%xmm0                   \n"
+    "pand      %%xmm4,%%xmm0                   \n"
+    "punpcklwd %%xmm5,%%xmm0                   \n"
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+
+    "movzb     " MEMACCESS(2) ",%0             \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS(3) "            \n"
+    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
+    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
+    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
+    "mov       %b0," MEMACCESS2(0x3,3) "       \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+
+    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
+    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
+    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
+    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
+    "mov       %b0," MEMACCESS2(0x7,3) "       \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+
+    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
+    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
+    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
+    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
+    "mov       %b0," MEMACCESS2(0xb,3) "       \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+
+    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
+    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
+    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
+    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
+    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "lea       " MEMLEA(0x10,3) ",%3           \n"
+    "sub       $0x4,%4                         \n"
+    "jg        1b                              \n"
+  : "+d"(pixel_temp),  // %0
+    "+a"(table_temp),  // %1
+    "+r"(src_argb),    // %2
+    "+r"(dst_argb),    // %3
+    "+rm"(width)       // %4
+  : "r"(luma),         // %5
+    "rm"(lumacoeff)    // %6
+  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/row_mips.cc b/libs/libvpx/third_party/libyuv/source/row_mips.cc
new file mode 100644
index 0000000000..cfc9ffe036
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/row_mips.cc
@@ -0,0 +1,911 @@
+/*
+ *  Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+#ifdef HAS_COPYROW_MIPS
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {  // Copy count bytes from src to dst.
+  __asm__ __volatile__ (
+    ".set      noreorder                         \n"
+    ".set      noat                              \n"
+    "slti      $at, %[count], 8                  \n"
+    "bne       $at ,$zero, $last8                \n"  // fewer than 8 bytes: byte loop
+    "xor       $t8, %[src], %[dst]               \n"
+    "andi      $t8, $t8, 0x3                     \n"
+
+    "bne       $t8, $zero, unaligned             \n"  // src/dst differ in low 2 bits
+    "negu      $a3, %[dst]                       \n"
+    // a3 = number of bytes needed to word-align dst (and src)
+    "andi      $a3, $a3, 0x3                     \n"
+    "beq       $a3, $zero, $chk16w               \n"
+    // delay slot: count becomes the remaining byte count after alignment
+    "subu     %[count], %[count], $a3            \n"
+
+    "lwr       $t8, 0(%[src])                    \n"
+    "addu      %[src], %[src], $a3               \n"
+    "swr       $t8, 0(%[dst])                    \n"
+    "addu      %[dst], %[dst], $a3               \n"
+
+    // Now the dst/src are mutually word-aligned with word-aligned addresses
+    "$chk16w:                                    \n"
+    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
+    // t8 is the byte count after 64-byte chunks
+    "beq       %[count], $t8, chk8w              \n"
+    // There will be at most 1 32-byte chunk after it
+    "subu      $a3, %[count], $t8                \n"  // bytes in 64-B chunks
+    // Here a3 counts bytes in 16w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // Now a3 is the final dst after 64-byte chunks
+    "addu      $t0, %[dst], %[count]             \n"
+    // t0 is the "past the end" address
+
+    // When the loop issues "pref 30,x(dst)", dst+x must not go past
+    // the "t0-32" address
+    // This means: for x=128 the last "safe" dst address is "t0-160"
+    // Alternatively, for x=64 the last "safe" dst address is "t0-96"
+    // we will use "pref 30,128(dst)", so "t0-160" is the limit
+    "subu      $t9, $t0, 160                     \n"
+    // t9 is the "last safe pref 30,128(dst)" address
+    "pref      0, 0(%[src])                      \n"  // first line of src
+    "pref      0, 32(%[src])                     \n"  // second line of src
+    "pref      0, 64(%[src])                     \n"
+    "pref      30, 32(%[dst])                    \n"
+    // In case dst > t9 don't use "pref 30" at all
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bgtz      $v1, $loop16w                     \n"
+    "nop                                         \n"
+    // otherwise, start with using pref30
+    "pref      30, 64(%[dst])                    \n"
+    "$loop16w:                                    \n"
+    "pref      0, 96(%[src])                     \n"
+    "lw        $t0, 0(%[src])                    \n"
+    "bgtz      $v1, $skip_pref30_96              \n"  // skip
+    "lw        $t1, 4(%[src])                    \n"
+    "pref      30, 96(%[dst])                    \n"  // continue
+    "$skip_pref30_96:                            \n"
+    "lw        $t2, 8(%[src])                    \n"
+    "lw        $t3, 12(%[src])                   \n"
+    "lw        $t4, 16(%[src])                   \n"
+    "lw        $t5, 20(%[src])                   \n"
+    "lw        $t6, 24(%[src])                   \n"
+    "lw        $t7, 28(%[src])                   \n"
+    "pref      0, 128(%[src])                    \n"
+    //  bring the next lines of src, addr 128
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "lw        $t0, 32(%[src])                   \n"
+    "bgtz      $v1, $skip_pref30_128             \n"  // skip pref 30,128(dst)
+    "lw        $t1, 36(%[src])                   \n"
+    "pref      30, 128(%[dst])                   \n"  // set dest, addr 128
+    "$skip_pref30_128:                           \n"
+    "lw        $t2, 40(%[src])                   \n"
+    "lw        $t3, 44(%[src])                   \n"
+    "lw        $t4, 48(%[src])                   \n"
+    "lw        $t5, 52(%[src])                   \n"
+    "lw        $t6, 56(%[src])                   \n"
+    "lw        $t7, 60(%[src])                   \n"
+    "pref      0, 160(%[src])                    \n"
+    // bring the next lines of src, addr 160
+    "sw        $t0, 32(%[dst])                   \n"
+    "sw        $t1, 36(%[dst])                   \n"
+    "sw        $t2, 40(%[dst])                   \n"
+    "sw        $t3, 44(%[dst])                   \n"
+    "sw        $t4, 48(%[dst])                   \n"
+    "sw        $t5, 52(%[dst])                   \n"
+    "sw        $t6, 56(%[dst])                   \n"
+    "sw        $t7, 60(%[dst])                   \n"
+
+    "addiu     %[dst], %[dst], 64                \n"  // adding 64 to dest
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bne       %[dst], $a3, $loop16w             \n"
+    " addiu    %[src], %[src], 64                \n"  // adding 64 to src
+    "move      %[count], $t8                     \n"
+
+    // Here we have src and dest word-aligned but less than 64-bytes to go
+
+    "chk8w:                                      \n"
+    "pref      0, 0x0(%[src])                    \n"
+    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
+    // t8 is the remainder count past 32-bytes
+    "beq       %[count], $t8, chk1w              \n"
+    // count=t8,no 32-byte chunk
+    " nop                                        \n"
+
+    "lw        $t0, 0(%[src])                    \n"
+    "lw        $t1, 4(%[src])                    \n"
+    "lw        $t2, 8(%[src])                    \n"
+    "lw        $t3, 12(%[src])                   \n"
+    "lw        $t4, 16(%[src])                   \n"
+    "lw        $t5, 20(%[src])                   \n"
+    "lw        $t6, 24(%[src])                   \n"
+    "lw        $t7, 28(%[src])                   \n"
+    "addiu     %[src], %[src], 32                \n"
+
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "addiu     %[dst], %[dst], 32                \n"
+
+    "chk1w:                                      \n"
+    "andi      %[count], $t8, 0x3                \n"
+    // now count is the remainder past 1w chunks
+    "beq       %[count], $t8, $last8             \n"
+    " subu     $a3, $t8, %[count]                \n"
+    // a3 is count of bytes in 1w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // now a3 is the dst address past the 1w chunks
+    // copying in words (4-byte chunks)
+    "$wordCopy_loop:                             \n"
+    "lw        $t3, 0(%[src])                    \n"
+    // the first t3 may be equal t0 ... optimize?
+    "addiu     %[src], %[src],4                  \n"
+    "addiu     %[dst], %[dst],4                  \n"
+    "bne       %[dst], $a3,$wordCopy_loop        \n"
+    " sw       $t3, -4(%[dst])                   \n"
+
+    // For the last (<8) bytes
+    "$last8:                                     \n"
+    "blez      %[count], leave                   \n"
+    " addu     $a3, %[dst], %[count]             \n"  // a3 -last dst address
+    "$last8loop:                                 \n"
+    "lb        $v1, 0(%[src])                    \n"
+    "addiu     %[src], %[src], 1                 \n"
+    "addiu     %[dst], %[dst], 1                 \n"
+    "bne       %[dst], $a3, $last8loop           \n"
+    " sb       $v1, -1(%[dst])                   \n"
+
+    "leave:                                      \n"
+    "  j       $ra                               \n"  // jumps to the return address from inside the asm
+    "  nop                                       \n"
+
+    //
+    // UNALIGNED case
+    //
+
+    "unaligned:                                  \n"
+    // got here with a3="negu dst"
+    "andi      $a3, $a3, 0x3                     \n"  // dst is word aligned?
+    "beqz      $a3, $ua_chk16w                   \n"
+    " subu     %[count], %[count], $a3           \n"
+    // bytes left after initial a3 bytes
+    "lwr       $v1, 0(%[src])                    \n"
+    "lwl       $v1, 3(%[src])                    \n"
+    "addu      %[src], %[src], $a3               \n"  // a3 may be 1, 2 or 3
+    "swr       $v1, 0(%[dst])                    \n"
+    "addu      %[dst], %[dst], $a3               \n"
+    // below the dst will be word aligned (NOTE1)
+    "$ua_chk16w:                                 \n"
+    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
+    // t8 is the byte count after 64-byte chunks
+    "beq       %[count], $t8, ua_chk8w           \n"
+    // if count==t8, no 64-byte chunks
+    // There will be at most 1 32-byte chunk after it
+    "subu      $a3, %[count], $t8                \n"  // bytes in 64-B chunks
+    // Here a3 counts bytes in 16w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // Now a3 is the final dst after 64-byte chunks
+    "addu      $t0, %[dst], %[count]             \n"  // t0 "past the end"
+    "subu      $t9, $t0, 160                     \n"
+    // t9 is the "last safe pref 30,128(dst)" address
+    "pref      0, 0(%[src])                      \n"  // first line of src
+    "pref      0, 32(%[src])                     \n"  // second line  addr 32
+    "pref      0, 64(%[src])                     \n"
+    "pref      30, 32(%[dst])                    \n"
+    // safe, as we have at least 64 bytes ahead
+    // In case dst > t9 don't use "pref 30" at all
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bgtz      $v1, $ua_loop16w                  \n"
+    // skip "pref 30,64(dst)" for too short arrays
+    " nop                                        \n"
+    // otherwise, start with using pref30
+    "pref      30, 64(%[dst])                    \n"
+    "$ua_loop16w:                                \n"
+    "pref      0, 96(%[src])                     \n"
+    "lwr       $t0, 0(%[src])                    \n"
+    "lwl       $t0, 3(%[src])                    \n"
+    "lwr       $t1, 4(%[src])                    \n"
+    "bgtz      $v1, $ua_skip_pref30_96           \n"
+    " lwl      $t1, 7(%[src])                    \n"
+    "pref      30, 96(%[dst])                    \n"
+    // continue setting up the dest, addr 96
+    "$ua_skip_pref30_96:                         \n"
+    "lwr       $t2, 8(%[src])                    \n"
+    "lwl       $t2, 11(%[src])                   \n"
+    "lwr       $t3, 12(%[src])                   \n"
+    "lwl       $t3, 15(%[src])                   \n"
+    "lwr       $t4, 16(%[src])                   \n"
+    "lwl       $t4, 19(%[src])                   \n"
+    "lwr       $t5, 20(%[src])                   \n"
+    "lwl       $t5, 23(%[src])                   \n"
+    "lwr       $t6, 24(%[src])                   \n"
+    "lwl       $t6, 27(%[src])                   \n"
+    "lwr       $t7, 28(%[src])                   \n"
+    "lwl       $t7, 31(%[src])                   \n"
+    "pref      0, 128(%[src])                    \n"
+    // bring the next lines of src, addr 128
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "lwr       $t0, 32(%[src])                   \n"
+    "lwl       $t0, 35(%[src])                   \n"
+    "lwr       $t1, 36(%[src])                   \n"
+    "bgtz      $v1, ua_skip_pref30_128           \n"
+    " lwl      $t1, 39(%[src])                   \n"
+    "pref      30, 128(%[dst])                   \n"
+    // continue setting up the dest, addr 128
+    "ua_skip_pref30_128:                         \n"
+
+    "lwr       $t2, 40(%[src])                   \n"
+    "lwl       $t2, 43(%[src])                   \n"
+    "lwr       $t3, 44(%[src])                   \n"
+    "lwl       $t3, 47(%[src])                   \n"
+    "lwr       $t4, 48(%[src])                   \n"
+    "lwl       $t4, 51(%[src])                   \n"
+    "lwr       $t5, 52(%[src])                   \n"
+    "lwl       $t5, 55(%[src])                   \n"
+    "lwr       $t6, 56(%[src])                   \n"
+    "lwl       $t6, 59(%[src])                   \n"
+    "lwr       $t7, 60(%[src])                   \n"
+    "lwl       $t7, 63(%[src])                   \n"
+    "pref      0, 160(%[src])                    \n"
+    // bring the next lines of src, addr 160
+    "sw        $t0, 32(%[dst])                   \n"
+    "sw        $t1, 36(%[dst])                   \n"
+    "sw        $t2, 40(%[dst])                   \n"
+    "sw        $t3, 44(%[dst])                   \n"
+    "sw        $t4, 48(%[dst])                   \n"
+    "sw        $t5, 52(%[dst])                   \n"
+    "sw        $t6, 56(%[dst])                   \n"
+    "sw        $t7, 60(%[dst])                   \n"
+
+    "addiu     %[dst],%[dst],64                  \n"  // adding 64 to dest
+    "sgtu      $v1,%[dst],$t9                    \n"
+    "bne       %[dst],$a3,$ua_loop16w            \n"
+    " addiu    %[src],%[src],64                  \n"  // adding 64 to src
+    "move      %[count],$t8                      \n"
+
+    // Here dest is word-aligned (src need not be) with less than 64 bytes to go
+
+    "ua_chk8w:                                   \n"
+    "pref      0, 0x0(%[src])                    \n"
+    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
+    // t8 is the remainder count
+    "beq       %[count], $t8, $ua_chk1w          \n"
+    // when count==t8, no 32-byte chunk (the first lwr below sits in the delay slot)
+
+    "lwr       $t0, 0(%[src])                    \n"
+    "lwl       $t0, 3(%[src])                    \n"
+    "lwr       $t1, 4(%[src])                    \n"
+    "lwl       $t1, 7(%[src])                    \n"
+    "lwr       $t2, 8(%[src])                    \n"
+    "lwl       $t2, 11(%[src])                   \n"
+    "lwr       $t3, 12(%[src])                   \n"
+    "lwl       $t3, 15(%[src])                   \n"
+    "lwr       $t4, 16(%[src])                   \n"
+    "lwl       $t4, 19(%[src])                   \n"
+    "lwr       $t5, 20(%[src])                   \n"
+    "lwl       $t5, 23(%[src])                   \n"
+    "lwr       $t6, 24(%[src])                   \n"
+    "lwl       $t6, 27(%[src])                   \n"
+    "lwr       $t7, 28(%[src])                   \n"
+    "lwl       $t7, 31(%[src])                   \n"
+    "addiu     %[src], %[src], 32                \n"
+
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "addiu     %[dst], %[dst], 32                \n"
+
+    "$ua_chk1w:                                  \n"
+    "andi      %[count], $t8, 0x3                \n"
+    // now count is the remainder past 1w chunks
+    "beq       %[count], $t8, ua_smallCopy       \n"
+    "subu      $a3, $t8, %[count]                \n"
+    // a3 is count of bytes in 1w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // now a3 is the dst address past the 1w chunks
+
+    // copying in words (4-byte chunks)
+    "$ua_wordCopy_loop:                          \n"
+    "lwr       $v1, 0(%[src])                    \n"
+    "lwl       $v1, 3(%[src])                    \n"
+    "addiu     %[src], %[src], 4                 \n"
+    "addiu     %[dst], %[dst], 4                 \n"
+    // note: dst is word aligned here, see NOTE1
+    "bne       %[dst], $a3, $ua_wordCopy_loop    \n"
+    " sw       $v1,-4(%[dst])                    \n"
+
+    // Now less than 4 bytes (value in count) left to copy
+    "ua_smallCopy:                               \n"
+    "beqz      %[count], leave                   \n"
+    " addu     $a3, %[dst], %[count]             \n" // a3 = last dst address
+    "$ua_smallCopy_loop:                         \n"
+    "lb        $v1, 0(%[src])                    \n"
+    "addiu     %[src], %[src], 1                 \n"
+    "addiu     %[dst], %[dst], 1                 \n"
+    "bne       %[dst],$a3,$ua_smallCopy_loop     \n"
+    " sb       $v1, -1(%[dst])                   \n"
+
+    "j         $ra                               \n"  // jumps to the return address from inside the asm
+    " nop                                        \n"
+    ".set      at                                \n"
+    ".set      reorder                           \n"
+       : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)  // count is overwritten, so it must be read-write
+       :  // no input-only operands
+       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
+       "t8", "t9", "a3", "v1", "at", "memory"  // "memory": the asm stores through dst
+  );
+}
+#endif  // HAS_COPYROW_MIPS
+
+// MIPS DSPR2 functions
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
+    (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,  // De-interleave UV pairs:
+                           int width) {  // even bytes go to dst_u, odd bytes to dst_v; width counts pairs.
+  __asm__ __volatile__ (
+    ".set push                                     \n"
+    ".set noreorder                                \n"
+    "srl             $t4, %[width], 4              \n"  // multiples of 16
+    "blez            $t4, 2f                       \n"  // no full block: go to the 1-pair loop
+    " andi           %[width], %[width], 0xf       \n"  // residual
+
+    ".p2align        2                             \n"
+  "1:                                              \n"
+    "addiu           $t4, $t4, -1                  \n"
+    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
+    "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
+    "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
+    "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
+    "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
+    "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 | U10
+    "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 | U12
+    "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 | U14
+    "addiu           %[src_uv], %[src_uv], 32      \n"
+    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
+    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
+    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
+    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
+    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
+    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
+    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
+    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
+    "sw              $t9, 0(%[dst_v])              \n"
+    "sw              $t0, 0(%[dst_u])              \n"
+    "sw              $t1, 4(%[dst_v])              \n"
+    "sw              $t2, 4(%[dst_u])              \n"
+    "sw              $t3, 8(%[dst_v])              \n"
+    "sw              $t5, 8(%[dst_u])              \n"
+    "sw              $t6, 12(%[dst_v])             \n"
+    "sw              $t7, 12(%[dst_u])             \n"
+    "addiu           %[dst_v], %[dst_v], 16        \n"
+    "bgtz            $t4, 1b                       \n"
+    " addiu          %[dst_u], %[dst_u], 16        \n"
+
+    "beqz            %[width], 3f                  \n"  // no residual pairs left
+    " nop                                          \n"
+
+  "2:                                              \n"
+    "lbu             $t0, 0(%[src_uv])             \n"
+    "lbu             $t1, 1(%[src_uv])             \n"
+    "addiu           %[src_uv], %[src_uv], 2       \n"
+    "addiu           %[width], %[width], -1        \n"
+    "sb              $t0, 0(%[dst_u])              \n"
+    "sb              $t1, 0(%[dst_v])              \n"
+    "addiu           %[dst_u], %[dst_u], 1         \n"
+    "bgtz            %[width], 2b                  \n"  // loop while residual pairs remain
+    " addiu          %[dst_v], %[dst_v], 1         \n"
+
+  "3:                                              \n"
+    ".set pop                                      \n"
+     : [src_uv] "+r" (src_uv),
+       [width] "+r" (width),
+       [dst_u] "+r" (dst_u),
+       [dst_v] "+r" (dst_v)
+     :
+     : "t0", "t1", "t2", "t3",
+     "t4", "t5", "t6", "t7", "t8", "t9", "memory"  // "memory": stores go through dst_u/dst_v
+  );
+}
+
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {  // dst = the width bytes of src, reversed.
+  __asm__ __volatile__ (
+    ".set push                             \n"
+    ".set noreorder                        \n"
+
+    "srl       $t4, %[width], 4            \n"  // multiples of 16
+    "andi      $t5, %[width], 0xf          \n"  // residual bytes
+    "blez      $t4, 2f                     \n"  // no full block: go to the byte loop
+    " addu     %[src], %[src], %[width]    \n"  // src += width
+
+    ".p2align  2                           \n"
+   "1:                                     \n"
+    "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|
+    "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|
+    "lw        $t2, -8(%[src])             \n"  // |11|10|9|8|
+    "lw        $t3, -4(%[src])             \n"  // |15|14|13|12|
+    "wsbh      $t0, $t0                    \n"  // |2|3|0|1|
+    "wsbh      $t1, $t1                    \n"  // |6|7|4|5|
+    "wsbh      $t2, $t2                    \n"  // |10|11|8|9|
+    "wsbh      $t3, $t3                    \n"  // |14|15|12|13|
+    "rotr      $t0, $t0, 16                \n"  // |0|1|2|3|
+    "rotr      $t1, $t1, 16                \n"  // |4|5|6|7|
+    "rotr      $t2, $t2, 16                \n"  // |8|9|10|11|
+    "rotr      $t3, $t3, 16                \n"  // |12|13|14|15|
+    "addiu     %[src], %[src], -16         \n"
+    "addiu     $t4, $t4, -1                \n"
+    "sw        $t3, 0(%[dst])              \n"  // |15|14|13|12|
+    "sw        $t2, 4(%[dst])              \n"  // |11|10|9|8|
+    "sw        $t1, 8(%[dst])              \n"  // |7|6|5|4|
+    "sw        $t0, 12(%[dst])             \n"  // |3|2|1|0|
+    "bgtz      $t4, 1b                     \n"
+    " addiu    %[dst], %[dst], 16          \n"
+    "beqz      $t5, 3f                     \n"  // no residual bytes left
+    " nop                                  \n"
+
+   "2:                                     \n"  // NOTE(review): reached with width == 0 this still copies one byte
+    "lbu       $t0, -1(%[src])             \n"
+    "addiu     $t5, $t5, -1                \n"
+    "addiu     %[src], %[src], -1          \n"
+    "sb        $t0, 0(%[dst])              \n"
+    "bgtz      $t5, 2b                     \n"  // stop at 0 left; bgez copied one byte too many
+    " addiu    %[dst], %[dst], 1           \n"
+
+   "3:                                     \n"
+    ".set pop                              \n"
+      : [src] "+r" (src), [dst] "+r" (dst)
+      : [width] "r" (width)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "memory"  // "memory": stores go through dst
+  );
+}
+
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                            int width) {
+  int x = 0;
+  int y = 0;
+  __asm__ __volatile__ (
+    ".set push                                    \n"
+    ".set noreorder                               \n"
+
+    "addu            $t4, %[width], %[width]      \n"
+    "srl             %[x], %[width], 4            \n"
+    "andi            %[y], %[width], 0xf          \n"
+    "blez            %[x], 2f                     \n"
+    " addu           %[src_uv], %[src_uv], $t4    \n"
+
+    ".p2align        2                            \n"
+   "1:                                            \n"
+    "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|
+    "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|
+    "lw              $t2, -24(%[src_uv])          \n"  // |11|10|9|8|
+    "lw              $t3, -20(%[src_uv])          \n"  // |15|14|13|12|
+    "lw              $t4, -16(%[src_uv])          \n"  // |19|18|17|16|
+    "lw              $t6, -12(%[src_uv])          \n"  // |23|22|21|20|
+    "lw              $t7, -8(%[src_uv])           \n"  // |27|26|25|24|
+    "lw              $t8, -4(%[src_uv])           \n"  // |31|30|29|28|
+
+    "rotr            $t0, $t0, 16                 \n"  // |1|0|3|2|
+    "rotr            $t1, $t1, 16                 \n"  // |5|4|7|6|
+    "rotr            $t2, $t2, 16                 \n"  // |9|8|11|10|
+    "rotr            $t3, $t3, 16                 \n"  // |13|12|15|14|
+    "rotr            $t4, $t4, 16                 \n"  // |17|16|19|18|
+    "rotr            $t6, $t6, 16                 \n"  // |21|20|23|22|
+    "rotr            $t7, $t7, 16                 \n"  // |25|24|27|26|
+    "rotr            $t8, $t8, 16                 \n"  // |29|28|31|30|
+    "precr.qb.ph     $t9, $t0, $t1                \n"  // |0|2|4|6|
+    "precrq.qb.ph    $t5, $t0, $t1                \n"  // |1|3|5|7|
+    "precr.qb.ph     $t0, $t2, $t3                \n"  // |8|10|12|14|
+    "precrq.qb.ph    $t1, $t2, $t3                \n"  // |9|11|13|15|
+    "precr.qb.ph     $t2, $t4, $t6                \n"  // |16|18|20|22|
+    "precrq.qb.ph    $t3, $t4, $t6                \n"  // |17|19|21|23|
+    "precr.qb.ph     $t4, $t7, $t8                \n"  // |24|26|28|30|
+    "precrq.qb.ph    $t6, $t7, $t8                \n"  // |25|27|29|31|
+    "addiu           %[src_uv], %[src_uv], -32    \n"
+    "addiu           %[x], %[x], -1               \n"
+    "swr             $t4, 0(%[dst_u])             \n"
+    "swl             $t4, 3(%[dst_u])             \n"  // |30|28|26|24|
+    "swr             $t6, 0(%[dst_v])             \n"
+    "swl             $t6, 3(%[dst_v])             \n"  // |31|29|27|25|
+    "swr             $t2, 4(%[dst_u])             \n"
+    "swl             $t2, 7(%[dst_u])             \n"  // |22|20|18|16|
+    "swr             $t3, 4(%[dst_v])             \n"
+    "swl             $t3, 7(%[dst_v])             \n"  // |23|21|19|17|
+    "swr             $t0, 8(%[dst_u])             \n"
+    "swl             $t0, 11(%[dst_u])            \n"  // |14|12|10|8|
+    "swr             $t1, 8(%[dst_v])             \n"
+    "swl             $t1, 11(%[dst_v])            \n"  // |15|13|11|9|
+    "swr             $t9, 12(%[dst_u])            \n"
+    "swl             $t9, 15(%[dst_u])            \n"  // |6|4|2|0|
+    "swr             $t5, 12(%[dst_v])            \n"
+    "swl             $t5, 15(%[dst_v])            \n"  // |7|5|3|1|
+    "addiu           %[dst_v], %[dst_v], 16       \n"
+    "bgtz            %[x], 1b                     \n"
+    " addiu          %[dst_u], %[dst_u], 16       \n"
+    "beqz            %[y], 3f                     \n"
+    " nop                                         \n"
+    "b               2f                           \n"
+    " nop                                         \n"
+
+   "2:                                            \n"
+    "lbu             $t0, -2(%[src_uv])           \n"
+    "lbu             $t1, -1(%[src_uv])           \n"
+    "addiu           %[src_uv], %[src_uv], -2     \n"
+    "addiu           %[y], %[y], -1               \n"
+    "sb              $t0, 0(%[dst_u])             \n"
+    "sb              $t1, 0(%[dst_v])             \n"
+    "addiu           %[dst_u], %[dst_u], 1        \n"
+    "bgtz            %[y], 2b                     \n"
+    " addiu          %[dst_v], %[dst_v], 1        \n"
+
+   "3:                                            \n"
+    ".set pop                                     \n"
+      : [src_uv] "+r" (src_uv),
+        [dst_u] "+r" (dst_u),
+        [dst_v] "+r" (dst_v),
+        [x] "=&r" (x),
+        [y] "+r" (y)
+      : [width] "r" (width)
+      : "t0", "t1", "t2", "t3", "t4",
+      "t5", "t7", "t8", "t9"
+  );
+}
+
+// Convert 4 Y, 2 U and 2 V bytes of I422 to clamped RGB halfword pairs.
+// Expects $s0=YG, $s1=UG, $s2=VG, $s3=VR, $s4=16, $s5=128 (set by the caller);
+// advances u_buf and v_buf by 2 but not y_buf; $t0, $t3, $t6 are scratch.
+// Results, two 16-bit lanes per register:
+// t5 = | 0 | B0 | 0 | b0 |    t4 = | 0 | B1 | 0 | b1 |
+// t9 = | 0 | G0 | 0 | g0 |    t8 = | 0 | G1 | 0 | g1 |
+// t2 = | 0 | R0 | 0 | r0 |    t1 = | 0 | R1 | 0 | r1 |
+#define I422ToTransientMipsRGB                                                 \
+      "lw                $t0, 0(%[y_buf])       \n"  /* 4 Y bytes          */  \
+      "lhu               $t1, 0(%[u_buf])       \n"  /* 2 U bytes          */  \
+      "lhu               $t2, 0(%[v_buf])       \n"  /* 2 V bytes          */  \
+      "preceu.ph.qbr     $t1, $t1               \n"                            \
+      "preceu.ph.qbr     $t2, $t2               \n"                            \
+      "preceu.ph.qbra    $t3, $t0               \n"  /* reg bytes 2 and 0  */  \
+      "preceu.ph.qbla    $t0, $t0               \n"  /* reg bytes 3 and 1  */  \
+      "subu.ph           $t1, $t1, $s5          \n"  /* u = U - 128        */  \
+      "subu.ph           $t2, $t2, $s5          \n"  /* v = V - 128        */  \
+      "subu.ph           $t3, $t3, $s4          \n"  /* y = Y - 16         */  \
+      "subu.ph           $t0, $t0, $s4          \n"  /* y = Y - 16         */  \
+      "mul.ph            $t3, $t3, $s0          \n"  /* y * YG             */  \
+      "mul.ph            $t0, $t0, $s0          \n"  /* y * YG             */  \
+      "shll.ph           $t4, $t1, 0x7          \n"  /* u * 128            */  \
+      "subu.ph           $t4, $t4, $t1          \n"  /* u * 127            */  \
+      "mul.ph            $t6, $t1, $s1          \n"  /* u * UG             */  \
+      "mul.ph            $t1, $t2, $s2          \n"  /* v * VG             */  \
+      "addq_s.ph         $t5, $t4, $t3          \n"  /* B: u*127 + y*YG    */  \
+      "addq_s.ph         $t4, $t4, $t0          \n"  /* B: u*127 + y*YG    */  \
+      "shra.ph           $t5, $t5, 6            \n"                            \
+      "shra.ph           $t4, $t4, 6            \n"                            \
+      "addiu             %[u_buf], 2            \n"                            \
+      "addiu             %[v_buf], 2            \n"                            \
+      "addu.ph           $t6, $t6, $t1          \n"  /* u*UG + v*VG        */  \
+      "mul.ph            $t1, $t2, $s3          \n"  /* v * VR             */  \
+      "addu.ph           $t9, $t6, $t3          \n"  /* G: uv sum + y*YG   */  \
+      "addu.ph           $t8, $t6, $t0          \n"  /* G: uv sum + y*YG   */  \
+      "shra.ph           $t9, $t9, 6            \n"                            \
+      "shra.ph           $t8, $t8, 6            \n"                            \
+      "addu.ph           $t2, $t1, $t3          \n"  /* R: v*VR + y*YG     */  \
+      "addu.ph           $t1, $t1, $t0          \n"  /* R: v*VR + y*YG     */  \
+      "shra.ph           $t2, $t2, 6            \n"                            \
+      "shra.ph           $t1, $t1, 6            \n"                            \
+      "subu.ph           $t5, $t5, $s5          \n"  /* clamp: x - 128     */  \
+      "subu.ph           $t4, $t4, $s5          \n"                            \
+      "subu.ph           $t9, $t9, $s5          \n"                            \
+      "subu.ph           $t8, $t8, $s5          \n"                            \
+      "subu.ph           $t2, $t2, $s5          \n"                            \
+      "subu.ph           $t1, $t1, $s5          \n"                            \
+      "shll_s.ph         $t5, $t5, 8            \n"  /* clamp: sat. << 8   */  \
+      "shll_s.ph         $t4, $t4, 8            \n"                            \
+      "shll_s.ph         $t9, $t9, 8            \n"                            \
+      "shll_s.ph         $t8, $t8, 8            \n"                            \
+      "shll_s.ph         $t2, $t2, 8            \n"                            \
+      "shll_s.ph         $t1, $t1, 8            \n"                            \
+      "shra.ph           $t5, $t5, 8            \n"  /* clamp: >> 8        */  \
+      "shra.ph           $t4, $t4, 8            \n"                            \
+      "shra.ph           $t9, $t9, 8            \n"                            \
+      "shra.ph           $t8, $t8, 8            \n"                            \
+      "shra.ph           $t2, $t2, 8            \n"                            \
+      "shra.ph           $t1, $t1, 8            \n"                            \
+      "addu.ph           $t5, $t5, $s5          \n"  /* clamp: x + 128     */  \
+      "addu.ph           $t4, $t4, $s5          \n"                            \
+      "addu.ph           $t9, $t9, $s5          \n"                            \
+      "addu.ph           $t8, $t8, $s5          \n"                            \
+      "addu.ph           $t2, $t2, $s5          \n"                            \
+      "addu.ph           $t1, $t1, $s5          \n"
+
+void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,  // 4 bytes read per loop
+                              const uint8* u_buf,  // 2 bytes read per loop
+                              const uint8* v_buf,  // 2 bytes read per loop
+                              uint8* rgb_buf,  // 16 bytes written per loop
+                              int width) {  // counted down by 4 per loop
+  __asm__ __volatile__ (
+    ".set push                                \n"
+    ".set noreorder                           \n"  // delay slots filled by hand
+    "beqz              %[width], 2f           \n"  // width == 0: nothing to do
+    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|
+    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
+    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
+    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
+    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
+    "repl.ph           $s5, 128               \n"  // |128|128| // clipping
+    "lui               $s6, 0xff00            \n"
+    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00| alpha mask
+
+    ".p2align          2                      \n"
+   "1:                                        \n"
+      I422ToTransientMipsRGB
+// Arranging into argb format
+    "precr.qb.ph       $t4, $t8, $t4          \n"  // |G1|g1|B1|b1|
+    "precr.qb.ph       $t5, $t9, $t5          \n"  // |G0|g0|B0|b0|
+    "addiu             %[width], -4           \n"  // 4 pixels per iteration
+    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |G1|B1|G0|B0|
+    "precr.qb.ph       $t9, $t4, $t5          \n"  // |g1|b1|g0|b0|
+    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|
+
+    "addiu             %[y_buf], 4            \n"  // the macro leaves y_buf alone
+    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|
+    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|
+    "or                $t1, $t1, $s6          \n"  // |ff|R1|ff|R0|
+    "or                $t2, $t2, $s6          \n"  // |ff|r1|ff|r0|
+    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|r1|g1|b1|
+    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|R1|G1|B1|
+    "sll               $t9, $t9, 16           \n"
+    "sll               $t8, $t8, 16           \n"
+    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|r0|g0|b0|
+    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|R0|G0|B0|
+// Store results.
+    "sw                $t2, 0(%[rgb_buf])     \n"
+    "sw                $t0, 4(%[rgb_buf])     \n"
+    "sw                $t1, 8(%[rgb_buf])     \n"
+    "sw                $t3, 12(%[rgb_buf])    \n"
+    "bnez              %[width], 1b           \n"  // exits only at width == 0
+    " addiu            %[rgb_buf], 16         \n"  // delay slot: next 4 pixels
+   "2:                                        \n"
+    ".set pop                                 \n"
+      :[y_buf] "+r" (y_buf),
+       [u_buf] "+r" (u_buf),
+       [v_buf] "+r" (v_buf),
+       [width] "+r" (width),
+       [rgb_buf] "+r" (rgb_buf)
+      :
+      : "t0", "t1",  "t2", "t3",  "t4", "t5",
+      "t6", "t7", "t8", "t9",
+      "s0", "s1", "s2", "s3",
+      "s4", "s5", "s6"
+  );
+}
+
+void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,  // 4 bytes read per loop
+                              const uint8* u_buf,  // 2 bytes read per loop
+                              const uint8* v_buf,  // 2 bytes read per loop
+                              uint8* rgb_buf,  // 16 bytes written per loop
+                              int width) {  // counted down by 4 per loop
+  __asm__ __volatile__ (
+    ".set push                                \n"
+    ".set noreorder                           \n"  // delay slots filled by hand
+    "beqz              %[width], 2f           \n"  // width == 0: nothing to do
+    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|
+    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
+    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
+    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
+    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
+    "repl.ph           $s5, 128               \n"  // |128|128|
+    "lui               $s6, 0xff00            \n"
+    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|
+
+    ".p2align          2                       \n"
+   "1:                                         \n"
+      I422ToTransientMipsRGB
+// Arranging into abgr format
+    "precr.qb.ph      $t0, $t8, $t1           \n"  // |G1|g1|R1|r1|
+    "precr.qb.ph      $t3, $t9, $t2           \n"  // |G0|g0|R0|r0|
+    "precrq.qb.ph     $t8, $t0, $t3           \n"  // |G1|R1|G0|R0|
+    "precr.qb.ph      $t9, $t0, $t3           \n"  // |g1|r1|g0|r0|
+
+    "precr.qb.ph       $t2, $t4, $t5          \n"  // |B1|b1|B0|b0|
+    "addiu             %[width], -4           \n"  // 4 pixels per iteration
+    "addiu             %[y_buf], 4            \n"  // the macro leaves y_buf alone
+    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |B1|0 |B0|
+    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |b1|0 |b0|
+    "or                $t1, $t1, $s6          \n"  // |ff|B1|ff|B0|
+    "or                $t2, $t2, $s6          \n"  // |ff|b1|ff|b0|
+    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|b1|g1|r1|
+    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|B1|G1|R1|
+    "sll               $t9, $t9, 16           \n"
+    "sll               $t8, $t8, 16           \n"
+    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|b0|g0|r0|
+    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|B0|G0|R0|
+// Store results.
+    "sw                $t2, 0(%[rgb_buf])     \n"
+    "sw                $t0, 4(%[rgb_buf])     \n"
+    "sw                $t1, 8(%[rgb_buf])     \n"
+    "sw                $t3, 12(%[rgb_buf])    \n"
+    "bnez              %[width], 1b           \n"  // exits only at width == 0
+    " addiu            %[rgb_buf], 16         \n"  // delay slot: next 4 pixels
+   "2:                                        \n"
+    ".set pop                                 \n"
+      :[y_buf] "+r" (y_buf),
+       [u_buf] "+r" (u_buf),
+       [v_buf] "+r" (v_buf),
+       [width] "+r" (width),
+       [rgb_buf] "+r" (rgb_buf)
+      :
+      : "t0", "t1",  "t2", "t3",  "t4", "t5",
+      "t6", "t7", "t8", "t9",
+      "s0", "s1", "s2", "s3",
+      "s4", "s5", "s6"
+  );
+}
+
+void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,  // 4 bytes read per loop
+                              const uint8* u_buf,  // 2 bytes read per loop
+                              const uint8* v_buf,  // 2 bytes read per loop
+                              uint8* rgb_buf,  // 16 bytes written per loop
+                              int width) {  // counted down by 4 per loop
+  __asm__ __volatile__ (
+    ".set push                                \n"
+    ".set noreorder                           \n"  // delay slots filled by hand
+    "beqz              %[width], 2f           \n"  // width == 0: nothing to do
+    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74 |74 |
+    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
+    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
+    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
+    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
+    "repl.ph           $s5, 128               \n"  // |128|128|
+    "lui               $s6, 0xff              \n"
+    "ori               $s6, 0xff              \n"  // |00|ff|00|ff|
+
+    ".p2align          2                      \n"
+   "1:                                        \n"
+      I422ToTransientMipsRGB
+      // Arranging into bgra format
+    "precr.qb.ph       $t4, $t4, $t8          \n"  // |B1|b1|G1|g1|
+    "precr.qb.ph       $t5, $t5, $t9          \n"  // |B0|b0|G0|g0|
+    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |B1|G1|B0|G0|
+    "precr.qb.ph       $t9, $t4, $t5          \n"  // |b1|g1|b0|g0|
+
+    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|
+    "addiu             %[width], -4           \n"  // 4 pixels per iteration
+    "addiu             %[y_buf], 4            \n"  // the macro leaves y_buf alone
+    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|
+    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|
+    "sll               $t1, $t1, 8            \n"  // |R1|0 |R0|0 |
+    "sll               $t2, $t2, 8            \n"  // |r1|0 |r0|0 |
+    "or                $t1, $t1, $s6          \n"  // |R1|ff|R0|ff|
+    "or                $t2, $t2, $s6          \n"  // |r1|ff|r0|ff|
+    "precrq.ph.w       $t0, $t9, $t2          \n"  // |b1|g1|r1|ff|
+    "precrq.ph.w       $t3, $t8, $t1          \n"  // |B1|G1|R1|ff|
+    "sll               $t1, $t1, 16           \n"
+    "sll               $t2, $t2, 16           \n"
+    "packrl.ph         $t2, $t9, $t2          \n"  // |b0|g0|r0|ff|
+    "packrl.ph         $t1, $t8, $t1          \n"  // |B0|G0|R0|ff|
+// Store results.
+    "sw                $t2, 0(%[rgb_buf])     \n"
+    "sw                $t0, 4(%[rgb_buf])     \n"
+    "sw                $t1, 8(%[rgb_buf])     \n"
+    "sw                $t3, 12(%[rgb_buf])    \n"
+    "bnez              %[width], 1b           \n"  // exits only at width == 0
+    " addiu            %[rgb_buf], 16         \n"  // delay slot: next 4 pixels
+   "2:                                        \n"
+    ".set pop                                 \n"
+      :[y_buf] "+r" (y_buf),
+       [u_buf] "+r" (u_buf),
+       [v_buf] "+r" (v_buf),
+       [width] "+r" (width),
+       [rgb_buf] "+r" (rgb_buf)
+      :
+      : "t0", "t1",  "t2", "t3",  "t4", "t5",
+      "t6", "t7", "t8", "t9",
+      "s0", "s1", "s2", "s3",
+      "s4", "s5", "s6"
+  );
+}
+
+// Bilinear filter 8x2 -> 8x1: blends two rows by source_y_fraction / 256.
+void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                               ptrdiff_t src_stride, int dst_width,
+                               int source_y_fraction) {
+    int y0_fraction = 256 - source_y_fraction;  // weight of the src_ptr row
+    const uint8* src_ptr1 = src_ptr + src_stride;  // the second source row
+
+  __asm__ __volatile__ (
+     ".set push                                           \n"
+     ".set noreorder                                      \n"
+
+     "replv.ph          $t0, %[y0_fraction]               \n"  // both lanes
+     "replv.ph          $t1, %[source_y_fraction]         \n"  // both lanes
+
+    ".p2align           2                                 \n"
+   "1:                                                    \n"
+     "lw                $t2, 0(%[src_ptr])                \n"  // row 0, 0..3
+     "lw                $t3, 0(%[src_ptr1])               \n"  // row 1, 0..3
+     "lw                $t4, 4(%[src_ptr])                \n"  // row 0, 4..7
+     "lw                $t5, 4(%[src_ptr1])               \n"  // row 1, 4..7
+     "muleu_s.ph.qbl    $t6, $t2, $t0                     \n"  // row 0 weighted
+     "muleu_s.ph.qbr    $t7, $t2, $t0                     \n"
+     "muleu_s.ph.qbl    $t8, $t3, $t1                     \n"  // row 1 weighted
+     "muleu_s.ph.qbr    $t9, $t3, $t1                     \n"
+     "muleu_s.ph.qbl    $t2, $t4, $t0                     \n"  // row 0 weighted
+     "muleu_s.ph.qbr    $t3, $t4, $t0                     \n"
+     "muleu_s.ph.qbl    $t4, $t5, $t1                     \n"  // row 1 weighted
+     "muleu_s.ph.qbr    $t5, $t5, $t1                     \n"
+     "addq.ph           $t6, $t6, $t8                     \n"  // row 0 + row 1
+     "addq.ph           $t7, $t7, $t9                     \n"
+     "addq.ph           $t2, $t2, $t4                     \n"
+     "addq.ph           $t3, $t3, $t5                     \n"
+     "shra.ph           $t6, $t6, 8                       \n"  // drop the 8
+     "shra.ph           $t7, $t7, 8                       \n"  // fraction bits
+     "shra.ph           $t2, $t2, 8                       \n"
+     "shra.ph           $t3, $t3, 8                       \n"
+     "precr.qb.ph       $t6, $t6, $t7                     \n"  // pack 4 bytes
+     "precr.qb.ph       $t2, $t2, $t3                     \n"  // pack 4 bytes
+     "addiu             %[src_ptr], %[src_ptr], 8         \n"
+     "addiu             %[src_ptr1], %[src_ptr1], 8       \n"
+     "addiu             %[dst_width], %[dst_width], -8    \n"  // 8 per loop
+     "sw                $t6, 0(%[dst_ptr])                \n"
+     "sw                $t2, 4(%[dst_ptr])                \n"
+     "bgtz              %[dst_width], 1b                  \n"
+     " addiu            %[dst_ptr], %[dst_ptr], 8         \n"  // delay slot
+
+     ".set pop                                            \n"
+  : [dst_ptr] "+r" (dst_ptr),
+    [src_ptr1] "+r" (src_ptr1),
+    [src_ptr] "+r" (src_ptr),
+    [dst_width] "+r" (dst_width)
+  : [source_y_fraction] "r" (source_y_fraction),
+    [y0_fraction] "r" (y0_fraction),
+    [src_stride] "r" (src_stride)  // not referenced by the asm text above
+  : "t0", "t1", "t2", "t3", "t4", "t5",
+    "t6", "t7", "t8", "t9"
+  );
+}
+#endif  // __mips_dsp_rev >= 2
+
+#endif  // defined(__mips__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/row_neon.cc b/libs/libvpx/third_party/libyuv/source/row_neon.cc
new file mode 100644
index 0000000000..1a72eb9039
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/row_neon.cc
@@ -0,0 +1,3084 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422 (Y into d0, U then V into d2)
+#define READYUV422                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
+    MEMACCESS(2)                                                               \
+    "vld1.32    {d2[1]}, [%2]!                 \n"
+
+// Read 8 Y, 2 U and 2 V from 411, then duplicate each U and V byte
+#define READYUV411                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
+    MEMACCESS(2)                                                               \
+    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
+    "vmov.u8    d3, d2                         \n"                             \
+    "vzip.u8    d2, d3                         \n"
+
+// Read 8 Y, 8 U and 8 V from 444, then average U/V pairs to 4 U and 4 V
+#define READYUV444                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.8     {d2}, [%1]!                    \n"                             \
+    MEMACCESS(2)                                                               \
+    "vld1.8     {d3}, [%2]!                    \n"                             \
+    "vpaddl.u8  q1, q1                         \n"                             \
+    "vrshrn.u16 d2, q1, #1                     \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "vmov.u8    d2, #128                       \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12                                                               \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.8     {d2}, [%1]!                    \n"                             \
+    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
+    "vuzp.u8    d2, d3                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21                                                               \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    MEMACCESS(1)                                                               \
+    "vld1.8     {d2}, [%1]!                    \n"                             \
+    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
+    "vuzp.u8    d3, d2                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+// Read 8 YUY2 pixels (16 bytes): even bytes go to d0, odd bytes to d2
+#define READYUY2                                                               \
+    MEMACCESS(0)                                                               \
+    "vld2.8     {d0, d2}, [%0]!                \n"                             \
+    "vmov.u8    d3, d2                         \n"                             \
+    "vuzp.u8    d2, d3                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+// Read 8 UYVY pixels (16 bytes): even bytes go to d2, odd bytes end up in d0
+#define READUYVY                                                               \
+    MEMACCESS(0)                                                               \
+    "vld2.8     {d2, d3}, [%0]!                \n"                             \
+    "vmov.u8    d0, d3                         \n"                             \
+    "vmov.u8    d3, d2                         \n"                             \
+    "vuzp.u8    d2, d3                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+#define YUV422TORGB_SETUP_REG  /* loads q4, q12-q15 with the constants */      \
+    MEMACCESS([kUVToRB])                                                       \
+    "vld1.8     {d24}, [%[kUVToRB]]            \n" /* d24 = U/V to B/R       */\
+    MEMACCESS([kUVToG])                                                        \
+    "vld1.8     {d25}, [%[kUVToG]]             \n" /* d25 = U/V to G         */\
+    MEMACCESS([kUVBiasBGR])                                                    \
+    "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n" /* q13 = B bias         */\
+    MEMACCESS([kUVBiasBGR])                                                    \
+    "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n" /* q4 = G bias          */\
+    MEMACCESS([kUVBiasBGR])                                                    \
+    "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n" /* q14 = R bias         */\
+    MEMACCESS([kYToRgb])                                                       \
+    "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"  /* q15 = Y scale */
+
+#define YUV422TORGB  /* in: d0=8 Y, d2=4 U|4 V; out: d20=B d21=G d22=R */       \
+    "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */\
+    "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */\
+    "vmovl.u8   q0, d0                         \n" /* Y                      */\
+    "vmovl.s16  q10, d1                        \n" /* Y 4..7 to 32 bit       */\
+    "vmovl.s16  q0, d0                         \n" /* Y 0..3 to 32 bit       */\
+    "vmul.s32   q10, q10, q15                  \n" /* Y * kYToRgb            */\
+    "vmul.s32   q0, q0, q15                    \n" /* Y * kYToRgb            */\
+    "vqshrun.s32 d0, q0, #16                   \n" /* >> 16, saturating      */\
+    "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */\
+    "vadd.s16   d18, d19                       \n" /* u*UG + v*VG            */\
+    "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */\
+    "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */\
+    "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/\
+    "vaddw.u16  q1, q1, d16                    \n" /* copy into low halfword */\
+    "vaddw.u16  q10, q10, d17                  \n"                             \
+    "vaddw.u16  q3, q3, d18                    \n"                             \
+    "vqadd.s16  q8, q0, q13                    \n" /* B */                     \
+    "vqadd.s16  q9, q0, q14                    \n" /* R */                     \
+    "vqadd.s16  q0, q0, q4                     \n" /* G */                     \
+    "vqadd.s16  q8, q8, q1                     \n" /* B */                     \
+    "vqadd.s16  q9, q9, q10                    \n" /* R */                     \
+    "vqsub.s16  q0, q0, q3                     \n" /* G */                     \
+    "vqshrun.s16 d20, q8, #6                   \n" /* B */                     \
+    "vqshrun.s16 d22, q9, #6                   \n" /* R */                     \
+    "vqshrun.s16 d21, q0, #6                   \n" /* G */
+
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128            - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR            (VR * 128 - YGB)
+
+static uvec8 kUVToRB  = { 128, 128, 128, 128, 102, 102, 102, 102,
+                          0, 0, 0, 0, 0, 0, 0, 0 };  // first 8 bytes -> d24
+static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52,
+                        0, 0, 0, 0, 0, 0, 0, 0 };  // first 8 bytes -> d25
+static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };  // -> q13, q4, q14
+static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };  // first word -> q15
+
+#undef YG  // the helper names are only needed to build the tables above
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// Converts I444 to ARGB, 8 pixels per loop (width is counted down by 8).
+void I444ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u, const uint8* src_v,
+                        uint8* dst_argb, int width) {
+  const void* uv_bias = &kUVBiasBGR;  // advanced by YUV422TORGB_SETUP_REG
+  asm volatile (
+    YUV422TORGB_SETUP_REG
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV444
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"  // opaque alpha
+    MEMACCESS(3)
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"  // interleave B, G, R, A
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width),     // %4
+      [kUVBiasBGR]"+r"(uv_bias)  // %5, read-write: the asm post-increments it
+    : [kUVToRB]"r"(&kUVToRB),   // %6
+      [kUVToG]"r"(&kUVToG),     // %7
+      [kYToRgb]"r"(&kYToRgb)    // %8
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// Converts I422 to ARGB, 8 pixels per loop (width is counted down by 8).
+void I422ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u, const uint8* src_v,
+                        uint8* dst_argb, int width) {
+  const void* uv_bias = &kUVBiasBGR;  // advanced by YUV422TORGB_SETUP_REG
+  asm volatile (
+    YUV422TORGB_SETUP_REG
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"  // opaque alpha
+    MEMACCESS(3)
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"  // interleave B, G, R, A
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width),     // %4
+      [kUVBiasBGR]"+r"(uv_bias)  // %5, read-write: the asm post-increments it
+    : [kUVToRB]"r"(&kUVToRB),   // %6
+      [kUVToG]"r"(&kUVToG),     // %7
+      [kYToRgb]"r"(&kYToRgb)    // %8
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// Converts I411 to ARGB, 8 pixels per loop (width is counted down by 8).
+void I411ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u, const uint8* src_v,
+                        uint8* dst_argb, int width) {
+  const void* uv_bias = &kUVBiasBGR;  // advanced by YUV422TORGB_SETUP_REG
+  asm volatile (
+    YUV422TORGB_SETUP_REG
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV411
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"  // opaque alpha
+    MEMACCESS(3)
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"  // interleave B, G, R, A
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width),     // %4
+      [kUVBiasBGR]"+r"(uv_bias)  // %5, read-write: the asm post-increments it
+    : [kUVToRB]"r"(&kUVToRB),   // %6
+      [kUVToG]"r"(&kUVToG),     // %7
+      [kYToRgb]"r"(&kYToRgb)    // %8
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToBGRARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_bgra,
+                        int width) {
+  asm volatile (  // Converts a Y/U/V row to BGRA, 8 pixels per loop pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422   // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %4, %4, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d20, d22                       \n"  // swap d20 and d22.
+    "vmov.u8    d19, #255                      \n"  // Alpha goes first.
+    MEMACCESS(3)
+    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"  // store 8 pixels of BGRA.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_bgra),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %7
+      [kYToRgb]"r"(&kYToRgb)         // %8
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToABGRRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_abgr,
+                        int width) {
+  asm volatile (  // Converts a Y/U/V row to ABGR, 8 pixels per loop pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422   // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %4, %4, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d20, d22                       \n"  // swap d20 and d22.
+    "vmov.u8    d23, #255                      \n"  // Alpha goes last.
+    MEMACCESS(3)
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"  // store 8 pixels of ABGR.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_abgr),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %7
+      [kYToRgb]"r"(&kYToRgb)         // %8
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToRGBARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_rgba,
+                        int width) {
+  asm volatile (  // Converts a Y/U/V row to RGBA, 8 pixels per loop pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422   // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %4, %4, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d19, #255                      \n"  // Alpha goes first.
+    MEMACCESS(3)
+    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"  // store 8 pixels of RGBA.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_rgba),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %7
+      [kYToRgb]"r"(&kYToRgb)         // %8
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToRGB24Row_NEON(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         int width) {
+  asm volatile (  // Converts a Y/U/V row to RGB24, 8 pixels per loop pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422   // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %4, %4, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(3)
+    "vst3.8     {d20, d21, d22}, [%3]!         \n"  // store 8 pixels, 3 bytes each.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),      // %0
+      "+r"(src_u),      // %1
+      "+r"(src_v),      // %2
+      "+r"(dst_rgb24),  // %3
+      "+r"(width)       // %4
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %7
+      [kYToRgb]"r"(&kYToRgb)         // %8
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToRAWRow_NEON(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_raw,
+                       int width) {
+  asm volatile (  // Converts a Y/U/V row to RAW, 8 pixels per loop pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422   // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %4, %4, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d20, d22                       \n"  // swap d20 and d22.
+    MEMACCESS(3)
+    "vst3.8     {d20, d21, d22}, [%3]!         \n"  // store 8 pixels, 3 bytes each.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_raw),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %7
+      [kYToRgb]"r"(&kYToRgb)         // %8
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+#define ARGBTORGB565  /* in: d20 B, d21 G, d22 R; out: q0 = 8 RGB565 pixels */ \
+    "vshr.u8    d20, d20, #3                   \n"  /* B 000BBBBB           */ \
+    "vshr.u8    d21, d21, #2                   \n"  /* G 00GGGGGG           */ \
+    "vshr.u8    d22, d22, #3                   \n"  /* R 000RRRRR           */ \
+    "vmovl.u8   q8, d20                        \n"  /* B widen to 16 bit    */ \
+    "vmovl.u8   q9, d21                        \n"  /* G widen to 16 bit    */ \
+    "vmovl.u8   q10, d22                       \n"  /* R widen to 16 bit    */ \
+    "vshl.u16   q9, q9, #5                     \n"  /* G to bits 5..10      */ \
+    "vshl.u16   q10, q10, #11                  \n"  /* R to bits 11..15     */ \
+    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
+    "vorr       q0, q0, q10                    \n"  /* BGR                  */
+
+void I422ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (  // Converts a Y/U/V row to RGB565, 8 pixels per loop pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422   // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %4, %4, #8                     \n"  // 8 processed per loop.
+    ARGBTORGB565  // pack d20/d21/d22 into 16 bit pixels in q0.
+    MEMACCESS(3)
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_rgb565),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %7
+      [kYToRgb]"r"(&kYToRgb)         // %8
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+#define ARGBTOARGB1555  /* in: d20 B, d21 G, d22 R, d23 A; out: q0 = 8 pixels */ \
+    "vshr.u8    q10, q10, #3                   \n"  /* B,G (q10 = d20:d21)  */ \
+    "vshr.u8    d22, d22, #3                   \n"  /* R 000RRRRR           */ \
+    "vshr.u8    d23, d23, #7                   \n"  /* A 0000000A           */ \
+    "vmovl.u8   q8, d20                        \n"  /* B widen to 16 bit    */ \
+    "vmovl.u8   q9, d21                        \n"  /* G widen to 16 bit    */ \
+    "vmovl.u8   q10, d22                       \n"  /* R widen to 16 bit    */ \
+    "vmovl.u8   q11, d23                       \n"  /* A widen to 16 bit    */ \
+    "vshl.u16   q9, q9, #5                     \n"  /* G to bits 5..9       */ \
+    "vshl.u16   q10, q10, #10                  \n"  /* R to bits 10..14     */ \
+    "vshl.u16   q11, q11, #15                  \n"  /* A to bit 15          */ \
+    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
+    "vorr       q1, q10, q11                   \n"  /* RA                   */ \
+    "vorr       q0, q0, q1                     \n"  /* BGRA                 */
+
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            int width) {
+  asm volatile (  // Converts a Y/U/V row to ARGB1555, 8 pixels per loop pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422   // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %4, %4, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d23, #255                      \n"  // Alpha input for the pack.
+    ARGBTOARGB1555  // pack d20..d23 into 16 bit pixels in q0.
+    MEMACCESS(3)
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb1555),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %7
+      [kYToRgb]"r"(&kYToRgb)         // %8
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+#define ARGBTOARGB4444  /* in: d20..d23 = B,G,R,A; d4 = mask; out: d0, d1 */   \
+    "vshr.u8    d20, d20, #4                   \n"  /* B 0000BBBB           */ \
+    "vbic.32    d21, d21, d4                   \n"  /* G clear bits set in d4 */ \
+    "vshr.u8    d22, d22, #4                   \n"  /* R 0000RRRR           */ \
+    "vbic.32    d23, d23, d4                   \n"  /* A clear bits set in d4 */ \
+    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
+    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
+    "vzip.u8    d0, d1                         \n"  /* BGRA, interleaved    */
+
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            int width) {
+  asm volatile (  // Converts a Y/U/V row to ARGB4444, 8 pixels per loop pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422   // NOTE(review): d4 must survive these two macros - confirm.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %4, %4, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d23, #255                      \n"  // Alpha input for the pack.
+    ARGBTOARGB4444  // pack d20..d23 into 16 bit pixels in q0 (d0, d1).
+    MEMACCESS(3)
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb4444),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&kUVToRB),   // %5
+      [kUVToG]"r"(&kUVToG),     // %6
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %7
+      [kYToRgb]"r"(&kYToRgb)         // %8
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // Converts a Y-only row to ARGB, 8 pixels per loop pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV400   // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d23, #255                      \n"  // Alpha
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&kUVToRB),   // %3
+      [kUVToG]"r"(&kUVToG),     // %4
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %5
+      [kYToRgb]"r"(&kYToRgb)         // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void J400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // Copies each Y byte into 3 channels and adds alpha 255.
+    "vmov.u8    d23, #255                      \n"  // Alpha, set once.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d20}, [%0]!                   \n"  // load 8 Y values.
+    "vmov       d21, d20                       \n"  // replicate Y.
+    "vmov       d22, d20                       \n"  // replicate Y.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    :
+    : "cc", "memory", "d20", "d21", "d22", "d23"  // Clobber List
+  );
+}
+
+void NV12ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // Converts a Y row plus a UV row to ARGB, 8 pixels per pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READNV12     // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d23, #255                      \n"  // Alpha
+    MEMACCESS(2)
+    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&kUVToRB),   // %4
+      [kUVToG]"r"(&kUVToG),     // %5
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %6
+      [kYToRgb]"r"(&kYToRgb)         // %7
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // Converts a Y row plus a UV row to ARGB, 8 pixels per pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READNV21     // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d23, #255                      \n"  // Alpha
+    MEMACCESS(2)
+    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&kUVToRB),   // %4
+      [kUVToG]"r"(&kUVToG),     // %5
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %6
+      [kYToRgb]"r"(&kYToRgb)         // %7
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (  // Converts a Y row plus a UV row to RGB565, 8 pixels per pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READNV12     // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    ARGBTORGB565  // pack d20/d21/d22 into 16 bit pixels in q0.
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&kUVToRB),   // %4
+      [kUVToG]"r"(&kUVToG),     // %5
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %6
+      [kYToRgb]"r"(&kYToRgb)         // %7
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (  // Converts a Y row plus a UV row to RGB565, 8 pixels per pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READNV21     // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    ARGBTORGB565  // pack d20/d21/d22 into 16 bit pixels in q0.
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&kUVToRB),   // %4
+      [kUVToG]"r"(&kUVToG),     // %5
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %6
+      [kYToRgb]"r"(&kYToRgb)         // %7
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // Converts a packed YUY2 row to ARGB, 8 pixels per loop pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUY2     // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d23, #255                      \n"  // Alpha
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_yuy2),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&kUVToRB),   // %3
+      [kUVToG]"r"(&kUVToG),     // %4
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %5
+      [kYToRgb]"r"(&kYToRgb)         // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // Converts a packed UYVY row to ARGB, 8 pixels per loop pass.
+    YUV422TORGB_SETUP_REG  // setup macro, defined outside this block.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READUYVY     // load macro, defined outside this block.
+    YUV422TORGB  // conversion macro, defined outside this block.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d23, #255                      \n"  // Alpha
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while width > 0.
+    : "+r"(src_uyvy),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&kUVToRB),   // %3
+      [kUVToG]"r"(&kUVToG),     // %4
+      [kUVBiasBGR]"r"(&kUVBiasBGR),  // %5
+      [kYToRgb]"r"(&kYToRgb)         // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",  // Clobber List
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  asm volatile (  // width counts UV pairs; 16 pairs (32 bytes) per loop pass.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store U
+    MEMACCESS(2)
+    "vst1.8     {q1}, [%2]!                    \n"  // store V
+    "bgt        1b                             \n"  // loop while width > 0
+    : "+r"(src_uv),  // %0
+      "+r"(dst_u),   // %1
+      "+r"(dst_v),   // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  asm volatile (  // width counts UV pairs; 16 pairs (32 bytes) per loop pass.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load U
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"  // load V
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(2)
+    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
+    "bgt        1b                             \n"  // loop while width > 0
+    :
+      "+r"(src_u),   // %0
+      "+r"(src_v),   // %1
+      "+r"(dst_uv),  // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// Copy multiple of 32.  vld1.8 allows unaligned and is fastest on a15.
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+  asm volatile (  // count is in bytes; 32 bytes are copied per loop pass.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
+    "subs       %2, %2, #32                    \n"  // 32 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
+    "bgt        1b                             \n"  // loop while count > 0
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2  // Output registers
+  :                     // Input registers
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+  asm volatile (  // Stores 16 bytes per loop pass; body runs before the test.
+    "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
+  "1:                                          \n"
+    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
+    MEMACCESS(0)
+    "vst1.8    {q0}, [%0]!                     \n"  // store
+    "bgt       1b                              \n"  // loop while count > 0
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v8)      // %2
+  : "cc", "memory", "q0"  // Clobber List
+  );
+}
+
+// ARGBSetRow writes 'count' pixels using a 32 bit value repeated.
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+  asm volatile (  // Stores 4 pixels (16 bytes) per pass; body runs before the test.
+    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
+  "1:                                          \n"
+    "subs      %1, %1, #4                      \n"  // 4 pixels per loop
+    MEMACCESS(0)
+    "vst1.8    {q0}, [%0]!                     \n"  // store
+    "bgt       1b                              \n"  // loop while count > 0
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v32)     // %2
+  : "cc", "memory", "q0"  // Clobber List
+  );
+}
+
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Writes src bytes to dst in reverse order, 16 per loop pass.
+    // Start at end of source row.
+    "mov        r3, #-16                       \n"  // r3 = load stride (scratch)
+    "add        %0, %0, %2                     \n"  // src += width
+    "sub        %0, #16                        \n"  // back up to last 16 bytes
+
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+    "subs       %2, #16                        \n"  // 16 pixels per loop.
+    "vrev64.8   q0, q0                         \n"  // reverse bytes in d0 and d1
+    MEMACCESS(1)
+    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // d1 then d0 swaps the halves
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "cc", "memory", "r3", "q0"  // Clobber List (r3 is used by name above)
+  );
+}
+
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width) {
+  asm volatile (  // Splits interleaved UV into reversed U and V rows, 8 per pass.
+    // Start at end of source row.
+    "mov        r12, #-16                      \n"  // r12 = load stride (scratch)
+    "add        %0, %0, %3, lsl #1             \n"  // src += width * 2
+    "sub        %0, #16                        \n"  // back up to last 8 pairs
+
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
+    "subs       %3, #8                         \n"  // 8 pixels per loop.
+    "vrev64.8   q0, q0                         \n"  // reverse bytes in d0 and d1
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // dst_v += 8
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_uv),  // %0
+    "+r"(dst_u),   // %1
+    "+r"(dst_v),   // %2
+    "+r"(width)    // %3
+  :
+  : "cc", "memory", "r12", "q0"  // Clobber List (r12 is used by name above)
+  );
+}
+
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Writes 4-byte pixels to dst in reverse order, 4 per loop pass.
+    // Start at end of source row.
+    "mov        r3, #-16                       \n"  // r3 = load stride (scratch)
+    "add        %0, %0, %2, lsl #2             \n"  // src += width * 4
+    "sub        %0, #16                        \n"  // back up to last 4 pixels
+
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+    "subs       %2, #4                         \n"  // 4 pixels per loop.
+    "vrev64.32  q0, q0                         \n"  // swap the 2 pixels in each d
+    MEMACCESS(1)
+    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // d1 then d0 swaps the halves
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "cc", "memory", "r3", "q0"  // Clobber List (r3 is used by name above)
+  );
+}
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+  asm volatile (  // Expands 3-byte pixels to 4 bytes by appending alpha 255.
+    "vmov.u8    d4, #255                       \n"  // Alpha
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(1)
+    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_argb),   // %1
+    "+r"(pix)         // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+  asm volatile (  // Swaps 1st/3rd channel of 3-byte pixels, appends alpha 255.
+    "vmov.u8    d4, #255                       \n"  // Alpha
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
+    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_raw),   // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+#define RGB565TOARGB  /* in: q0 = 8 RGB565; out: d0 B, d1 G, d2 R; d3 kept */  \
+    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
+    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
+    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
+    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
+    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
+    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
+    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
+    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
+    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
+  asm volatile (  // Expands 16 bit RGB565 pixels to ARGB with alpha 255.
+    "vmov.u8    d3, #255                       \n"  // Alpha
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    RGB565TOARGB  // unpack q0 into d0, d1, d2; d3 is left untouched.
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+#define ARGB1555TOARGB  /* in: q0 = 8 ARGB1555; out: d0 B, d1 G, d2 R, d3 A */ \
+    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
+    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
+    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
+    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
+    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
+    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
+    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
+    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
+    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
+    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
+    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */
+
+// RGB555TOARGB yields the same B, G, R as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB  /* in: q0 = 8 pixels; out: d0 B, d1 G, d2 R; d3 kept */   \
+    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
+    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
+    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
+    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
+    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
+    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
+    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
+    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
+    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix) {
+  asm volatile (  // Expands 16 bit ARGB1555 pixels to 4-byte ARGB, 8 per pass.
+    "vmov.u8    d3, #255                       \n"  // Alpha (macro rewrites q1)
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB1555TOARGB  // unpack q0 into d0, d1, d2, d3 (alpha from the source bit).
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+#define ARGB4444TOARGB  /* in: q0 = 8 ARGB4444; out: d0 B, d1 G, d2 R, d3 A */ \
+    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
+    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
+    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
+    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
+    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
+    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
+    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
+    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
+
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix) {
+  asm volatile (  // ARGB4444 row -> 32 bit ARGB row, 8 pixels per iteration.
+    "vmov.u8    d3, #255                       \n"  // Alpha preset; ARGB4444TOARGB rewrites q1 (d2, d3).
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels (16 bytes).
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop; sets flags for bgt.
+    ARGB4444TOARGB
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB (32 bytes).
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  );
+}
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+  asm volatile (  // ARGB row -> RGB24 row, 8 pixels per iteration.
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB; d4 = 4th channel.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(1)
+    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24; d4 is not stored.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),   // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(pix)         // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
+  asm volatile (  // ARGB row -> RAW row (1st and 3rd channel swapped), 8 pixels per iteration.
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB; d4 = 4th channel.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
+    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW; d4 is not stored.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_raw),   // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+  asm volatile (  // Extract Y from a YUY2 row, 16 pixels per iteration.
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2: q0 even bytes, q1 odd bytes.
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y (the even bytes).
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+  asm volatile (  // Extract Y from a UYVY row, 16 pixels per iteration.
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY: q0 even bytes, q1 odd bytes.
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y (the odd bytes).
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (  // Split U and V out of a YUY2 row, 16 pixels per iteration.
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2: d1 = U, d3 = V.
+    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
+    MEMACCESS(2)
+    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (  // Split U and V out of a UYVY row, 16 pixels per iteration.
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY: d0 = U, d2 = V.
+    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
+    MEMACCESS(2)
+    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (  // U and V averaged over two YUY2 rows, 16 pixels per iteration.
+    "add        %1, %0, %1                     \n"  // %1 = src_yuy2 + stride: address of the next row.
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2: d1 = U, d3 = V.
+    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2: d5 = U, d7 = V.
+    "vrhadd.u8  d1, d1, d5                     \n"  // rounding average of the two rows of U
+    "vrhadd.u8  d3, d3, d7                     \n"  // rounding average of the two rows of V
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
+    MEMACCESS(3)
+    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_yuy2),     // %0
+    "+r"(stride_yuy2),  // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(pix)           // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  );
+}
+
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (  // U and V averaged over two UYVY rows, 16 pixels per iteration.
+    "add        %1, %0, %1                     \n"  // %1 = src_uyvy + stride: address of the next row.
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY: d0 = U, d2 = V.
+    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY: d4 = U, d6 = V.
+    "vrhadd.u8  d0, d0, d4                     \n"  // rounding average of the two rows of U
+    "vrhadd.u8  d2, d2, d6                     \n"  // rounding average of the two rows of V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
+    MEMACCESS(3)
+    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_uyvy),     // %0
+    "+r"(stride_uyvy),  // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(pix)           // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.  Reads 16 index bytes from shuffler.
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  asm volatile (  // Byte table lookup within each 16 byte group, 4 pixels per iteration.
+    MEMACCESS(3)
+    "vld1.8     {q2}, [%3]                     \n"  // shuffler, loaded once (no post-increment).
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
+    "subs       %2, %2, #4                     \n"  // 4 processed per loop
+    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels (indices in d4)
+    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels (indices in d5)
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(shuffler)    // %3
+  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  );
+}
+
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width) {
+  asm volatile (  // Interleave Y, U, V rows into a YUY2 row, 16 pixels per iteration.
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys: d0 even, d2 odd
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
+    MEMACCESS(2)
+    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
+    "subs       %4, %4, #16                    \n"  // 16 pixels
+    MEMACCESS(3)
+    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels as Y, U, Y, V bytes.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_yuy2),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width) {
+  asm volatile (  // Interleave Y, U, V rows into a UYVY row, 16 pixels per iteration.
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys: d1 even, d3 odd
+    MEMACCESS(1)
+    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
+    MEMACCESS(2)
+    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
+    "subs       %4, %4, #16                    \n"  // 16 pixels
+    MEMACCESS(3)
+    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels as U, Y, V, Y bytes.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_uyvy),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
+  asm volatile (  // ARGB row -> RGB565 row, 8 pixels per iteration.
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTORGB565
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565 (16 bytes).
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_rgb565),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"  // Clobber List
+  );
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width) {
+  asm volatile (  // ARGB row -> dithered RGB565 row, 8 pixels per iteration.
+    ".p2align   2                              \n"
+    "vdup.32    d2, %3                         \n"  // dither4 replicated into both halves of d2.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d20, d20, d2                   \n"  // 1st channel += dither, saturating.
+    "vqadd.u8   d21, d21, d2                   \n"  // 2nd channel += dither, saturating.
+    "vqadd.u8   d22, d22, d2                   \n"  // 3rd channel += dither, saturating.
+    ARGBTORGB565
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_argb),  // %0  post-incremented by the load, so read-write.
+    "+r"(dst_rgb),   // %1
+    "+r"(width)      // %2  decremented by subs, so read-write.
+  : "r"(dither4)     // %3  only read.
+  : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"  // Clobber List
+  );
+}
+
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+                            int pix) {
+  asm volatile (  // ARGB row -> ARGB1555 row, 8 pixels per iteration.
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTOARGB1555
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555 (16 bytes).
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb1555),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"  // Clobber List
+  );
+}
+
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+                            int pix) {
+  asm volatile (  // ARGB row -> ARGB4444 row, 8 pixels per iteration.
+    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTOARGB4444
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444 (16 bytes).
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),      // %0
+    "+r"(dst_argb4444),  // %1
+    "+r"(pix)            // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"  // NOTE(review): d4 (q2) is written above but not listed - confirm.
+  );
+}
+
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (  // ARGB row -> Y row (weighted sum + 16), 8 pixels per iteration.
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient (13 / 128)
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient (65 / 128)
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient (33 / 128)
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B: q2  = d0 * 13
+    "vmlal.u8   q2, d1, d25                    \n"  // G: q2 += d1 * 65
+    "vmlal.u8   q2, d2, d26                    \n"  // R: q2 += d2 * 33
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y: rounding >> 7, saturated.
+    "vqadd.u8   d0, d27                        \n"  // Y += 16, saturating.
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"  // Clobber List
+  );
+}
+
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (  // ARGB row -> Y row (weighted sum, no offset added), 8 pixels per iteration.
+    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient (15 / 128)
+    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient (75 / 128)
+    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient (38 / 128)
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B: q2  = d0 * 15
+    "vmlal.u8   q2, d1, d25                    \n"  // G: q2 += d1 * 75
+    "vmlal.u8   q2, d2, d26                    \n"  // R: q2 += d2 * 38
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y: rounding >> 7, saturated.
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"  // Clobber List
+  );
+}
+
+// 8x1 pixels.  One U and one V byte are written per ARGB pixel, 8 pixels per iteration.
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
+    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
+    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
+    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
+    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 (0x80 integer part, 0x80 fraction)
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B: U  = d0 * 112
+    "vmlsl.u8   q2, d1, d25                    \n"  // G: U -= d1 * 74
+    "vmlsl.u8   q2, d2, d26                    \n"  // R: U -= d2 * 38
+    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
+
+    "vmull.u8   q3, d2, d24                    \n"  // R: V  = d2 * 112
+    "vmlsl.u8   q3, d1, d28                    \n"  // G: V -= d1 * 94
+    "vmlsl.u8   q3, d0, d27                    \n"  // B: V -= d0 * 18
+    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
+
+    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
+
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (  // Coefficients are halved; each lane below holds a 2 pixel sum.
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 (0x80 integer part, 0x80 fraction)
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels into the low halves of q0-q3.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels into the high halves.
+
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q0, q10                    \n"  // B: U  = B * q10
+    "vmls.s16   q8, q1, q11                    \n"  // G: U -= G * q11
+    "vmls.s16   q8, q2, q12                    \n"  // R: U -= R * q12
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+
+    "vmul.s16   q9, q2, q10                    \n"  // R: V  = R * q10
+    "vmls.s16   q9, q1, q14                    \n"  // G: V -= G * q14
+    "vmls.s16   q9, q0, q13                    \n"  // B: V -= B * q13
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (  // Coefficients are halved; lanes hold 4 pixel sums shifted right by 1.
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 (0x80 integer part, 0x80 fraction)
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(0)
+    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
+    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
+    "vpadd.u16  d1, d8, d9                     \n"  // B
+    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
+    "vpadd.u16  d3, d10, d11                   \n"  // G
+    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
+    "vpadd.u16  d5, d12, d13                   \n"  // R
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average (4 pixel sum, rounding >> 1)
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
+    "vmul.s16   q8, q0, q10                    \n"  // B: U  = B * q10
+    "vmls.s16   q8, q1, q11                    \n"  // G: U -= G * q11
+    "vmls.s16   q8, q2, q12                    \n"  // R: U -= R * q12
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q2, q10                    \n"  // R: V  = R * q10
+    "vmls.s16   q9, q1, q14                    \n"  // G: V -= G * q14
+    "vmls.s16   q9, q0, q13                    \n"  // B: V -= B * q13
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+    "vmul.s16   q8, " #QB ", q10               \n"  /* U = B * q10          */ \
+    "vmls.s16   q8, " #QG ", q11               \n"  /* U -= G * q11         */ \
+    "vmls.s16   q8, " #QR ", q12               \n"  /* U -= R * q12         */ \
+    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
+    "vmul.s16   q9, " #QR ", q10               \n"  /* V = R * q10          */ \
+    "vmls.s16   q9, " #QG ", q14               \n"  /* V -= G * q14         */ \
+    "vmls.s16   q9, " #QB ", q13               \n"  /* V -= B * q13         */ \
+    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
+    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
+    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (  // 2x2 subsampled U and V from two ARGB rows, 16 pixels wide per iteration.
+    "add        %1, %0, %1                     \n"  // %1 = src_argb + src_stride: address of the next row.
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 (0x80 integer part, 0x80 fraction)
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 ARGB pixels of the next row.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load next 8 ARGB pixels of the next row.
+    "vpadal.u8  q0, q4                         \n"  // B += next row, 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G += next row, 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // R += next row, 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average (4 pixel sum, rounding >> 1)
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 wide x 2 rows = 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(src_stride_argb),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (  // Same loop as ARGBToUVRow_NEON with a different coefficient set.
+    "add        %1, %0, %1                     \n"  // %1 = src_argb + src_stride: address of the next row.
+    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
+    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
+    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
+    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
+    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 (0x80 integer part, 0x80 fraction)
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 ARGB pixels of the next row.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load next 8 ARGB pixels of the next row.
+    "vpadal.u8  q0, q4                         \n"  // B += next row, 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G += next row, 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // R += next row, 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average (4 pixel sum, rounding >> 1)
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 wide x 2 rows = 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(src_stride_argb),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (  // 2x2 subsampled U and V from two BGRA rows, 16 pixels wide per iteration.
+    "add        %1, %0, %1                     \n"  // %1 = src_bgra + src_stride: address of the next row.
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 (0x80 integer part, 0x80 fraction)
+    ".p2align   2                              \n"  // 4 byte align the loop head.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels; q1 = R, q2 = G, q3 = B below.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
+    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 BGRA pixels of the next row.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load next 8 BGRA pixels of the next row.
+    "vpadal.u8  q3, q7                         \n"  // B += next row, 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // G += next row, 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // R += next row, 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q1, q1, #1                     \n"  // 2x average (4 pixel sum, rounding >> 1)
+    "vrshr.u16  q2, q2, #1                     \n"
+    "vrshr.u16  q3, q3, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 wide x 2 rows = 32 processed per loop.
+    RGBTOUV(q3, q2, q1)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_bgra),  // %0
+    "+r"(src_stride_bgra),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+// 16x2 ABGR pixels -> 8x1 U and 8x1 V per loop.  pix is pixels per row.
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // %1 = src_abgr + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 in 8.8 fixed point
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
+    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 ABGR pixels of row 2.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load next 8 of row 2.
+    "vpadal.u8  q2, q6                         \n"  // B += row 2 pairwise sums.
+    "vpadal.u8  q1, q5                         \n"  // G += row 2 pairwise sums.
+    "vpadal.u8  q0, q4                         \n"  // R += row 2 pairwise sums.
+    // q0/q1/q2 now hold the R/G/B sums of each 2x2 block of pixels.
+    "vrshr.u16  q0, q0, #1                     \n"  // sum of 4 -> 2x average
+    "vrshr.u16  q1, q1, #1                     \n"  // (coefficients above are
+    "vrshr.u16  q2, q2, #1                     \n"  // halved to compensate).
+    // RGBTOUV is defined outside this chunk; presumably U -> d0, V -> d1.
+    "subs       %4, %4, #16                    \n"  // 16 pixels x 2 rows per loop.
+    RGBTOUV(q2, q1, q0)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_abgr),  // %0
+    "+r"(src_stride_abgr),  // %1 (overwritten with the row 2 pointer)
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+// 16x2 RGBA pixels -> 8x1 U and 8x1 V per loop.  pix is pixels per row.
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // %1 = src_rgba + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 in 8.8 fixed point
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
+    "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts (into q0).
+    "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts (into q1).
+    "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts (into q2).
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 RGBA pixels of row 2.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load next 8 of row 2.
+    "vpadal.u8  q0, q5                         \n"  // B += row 2 pairwise sums.
+    "vpadal.u8  q1, q6                         \n"  // G += row 2 pairwise sums.
+    "vpadal.u8  q2, q7                         \n"  // R += row 2 pairwise sums.
+    // q0/q1/q2 now hold the B/G/R sums of each 2x2 block of pixels.
+    "vrshr.u16  q0, q0, #1                     \n"  // sum of 4 -> 2x average
+    "vrshr.u16  q1, q1, #1                     \n"  // (coefficients above are
+    "vrshr.u16  q2, q2, #1                     \n"  // halved to compensate).
+    // RGBTOUV is defined outside this chunk; presumably U -> d0, V -> d1.
+    "subs       %4, %4, #16                    \n"  // 16 pixels x 2 rows per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_rgba),  // %0
+    "+r"(src_stride_rgba),  // %1 (overwritten with the row 2 pointer)
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+// 16x2 RGB24 pixels -> 8x1 U and 8x1 V per loop.  pix is pixels per row.
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // %1 = src_rgb24 + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 in 8.8 fixed point
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
+    MEMACCESS(0)
+    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 RGB24 pixels of row 2.
+    MEMACCESS(1)
+    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load next 8 of row 2.
+    "vpadal.u8  q0, q4                         \n"  // B += row 2 pairwise sums.
+    "vpadal.u8  q1, q5                         \n"  // G += row 2 pairwise sums.
+    "vpadal.u8  q2, q6                         \n"  // R += row 2 pairwise sums.
+    // q0/q1/q2 now hold the B/G/R sums of each 2x2 block of pixels.
+    "vrshr.u16  q0, q0, #1                     \n"  // sum of 4 -> 2x average
+    "vrshr.u16  q1, q1, #1                     \n"  // (coefficients above are
+    "vrshr.u16  q2, q2, #1                     \n"  // halved to compensate).
+    // RGBTOUV is defined outside this chunk; presumably U -> d0, V -> d1.
+    "subs       %4, %4, #16                    \n"  // 16 pixels x 2 rows per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_rgb24),  // %0
+    "+r"(src_stride_rgb24),  // %1 (overwritten with the row 2 pointer)
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+// 16x2 RAW pixels -> 8x1 U and 8x1 V per loop.  pix is pixels per row.
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // %1 = src_raw + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 in 8.8 fixed point
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
+    MEMACCESS(0)
+    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
+    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 RAW pixels of row 2.
+    MEMACCESS(1)
+    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load next 8 of row 2.
+    "vpadal.u8  q2, q6                         \n"  // B += row 2 pairwise sums.
+    "vpadal.u8  q1, q5                         \n"  // G += row 2 pairwise sums.
+    "vpadal.u8  q0, q4                         \n"  // R += row 2 pairwise sums.
+    // q0/q1/q2 now hold the R/G/B sums of each 2x2 block of pixels.
+    "vrshr.u16  q0, q0, #1                     \n"  // sum of 4 -> 2x average
+    "vrshr.u16  q1, q1, #1                     \n"  // (coefficients above are
+    "vrshr.u16  q2, q2, #1                     \n"  // halved to compensate).
+    // RGBTOUV is defined outside this chunk; presumably U -> d0, V -> d1.
+    "subs       %4, %4, #16                    \n"  // 16 pixels x 2 rows per loop.
+    RGBTOUV(q2, q1, q0)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_raw),  // %0
+    "+r"(src_stride_raw),  // %1 (overwritten with the row 2 pointer)
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 RGB565 pixels -> 8x1 U and 8x1 V per loop.  pix is pixels per row.
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // %1 = src_rgb565 + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 in 8.8 fixed point
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB  // macro defined outside this chunk; B=d0 G=d1 R=d2 after it.
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+    // Row 2: accumulate its pairwise sums onto the row 1 sums in q4/q5/q6.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels of row 2.
+    RGB565TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B += row 2 pairwise sums.
+    "vpadal.u8  d10, d1                        \n"  // G += row 2 pairwise sums.
+    "vpadal.u8  d12, d2                        \n"  // R += row 2 pairwise sums.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels of row 2.
+    RGB565TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B += row 2 pairwise sums.
+    "vpadal.u8  d11, d1                        \n"  // G += row 2 pairwise sums.
+    "vpadal.u8  d13, d2                        \n"  // R += row 2 pairwise sums.
+    // q4/q5/q6 now hold the B/G/R sums of each 2x2 block of pixels.
+    "vrshr.u16  q4, q4, #1                     \n"  // sum of 4 -> 2x average
+    "vrshr.u16  q5, q5, #1                     \n"  // (coefficients above are
+    "vrshr.u16  q6, q6, #1                     \n"  // halved to compensate).
+    // U = (56*B - 37*G - 19*R + 0x8080) >> 8, V = (56*R - 47*G - 9*B + 0x8080) >> 8.
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_rgb565),  // %0
+    "+r"(src_stride_rgb565),  // %1 (overwritten with the row 2 pointer)
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 ARGB1555 pixels -> 8x1 U and 8x1 V per loop.  pix is pixels per row.
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+                        uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // %1 = src_argb1555 + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 in 8.8 fixed point
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    RGB555TOARGB  // macro defined outside this chunk; B=d0 G=d1 R=d2 after it.
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+    // Row 2: accumulate its pairwise sums onto the row 1 sums in q4/q5/q6.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels of row 2.
+    RGB555TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B += row 2 pairwise sums.
+    "vpadal.u8  d10, d1                        \n"  // G += row 2 pairwise sums.
+    "vpadal.u8  d12, d2                        \n"  // R += row 2 pairwise sums.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels of row 2.
+    RGB555TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B += row 2 pairwise sums.
+    "vpadal.u8  d11, d1                        \n"  // G += row 2 pairwise sums.
+    "vpadal.u8  d13, d2                        \n"  // R += row 2 pairwise sums.
+    // q4/q5/q6 now hold the B/G/R sums of each 2x2 block of pixels.
+    "vrshr.u16  q4, q4, #1                     \n"  // sum of 4 -> 2x average
+    "vrshr.u16  q5, q5, #1                     \n"  // (coefficients above are
+    "vrshr.u16  q6, q6, #1                     \n"  // halved to compensate).
+    // U = (56*B - 37*G - 19*R + 0x8080) >> 8, V = (56*R - 47*G - 9*B + 0x8080) >> 8.
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb1555),  // %0
+    "+r"(src_stride_argb1555),  // %1 (overwritten with the row 2 pointer)
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 ARGB4444 pixels -> 8x1 U and 8x1 V per loop.  pix is pixels per row.
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+                          uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // %1 = src_argb4444 + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 in 8.8 fixed point
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    ARGB4444TOARGB  // macro defined outside this chunk; B=d0 G=d1 R=d2 after it.
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+    // Row 2: accumulate its pairwise sums onto the row 1 sums in q4/q5/q6.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels of row 2.
+    ARGB4444TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B += row 2 pairwise sums.
+    "vpadal.u8  d10, d1                        \n"  // G += row 2 pairwise sums.
+    "vpadal.u8  d12, d2                        \n"  // R += row 2 pairwise sums.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels of row 2.
+    ARGB4444TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B += row 2 pairwise sums.
+    "vpadal.u8  d11, d1                        \n"  // G += row 2 pairwise sums.
+    "vpadal.u8  d13, d2                        \n"  // R += row 2 pairwise sums.
+    // q4/q5/q6 now hold the B/G/R sums of each 2x2 block of pixels.
+    "vrshr.u16  q4, q4, #1                     \n"  // sum of 4 -> 2x average
+    "vrshr.u16  q5, q5, #1                     \n"  // (coefficients above are
+    "vrshr.u16  q6, q6, #1                     \n"  // halved to compensate).
+    // U = (56*B - 37*G - 19*R + 0x8080) >> 8, V = (56*R - 47*G - 9*B + 0x8080) >> 8.
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb4444),  // %0
+    "+r"(src_stride_argb4444),  // %1 (overwritten with the row 2 pointer)
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+// 8 RGB565 pixels -> 8 Y per loop: Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16.
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    RGB565TOARGB  // macro defined outside this chunk; B=d0 G=d1 R=d2 after it.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y (rounded)
+    "vqadd.u8   d0, d27                        \n"  // + 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_y),       // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+// 8 ARGB1555 pixels -> 8 Y per loop: Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16.
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB1555TOARGB  // macro defined outside this chunk; B=d0 G=d1 R=d2 after it.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y (rounded)
+    "vqadd.u8   d0, d27                        \n"  // + 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(pix)            // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+// 8 ARGB4444 pixels -> 8 Y per loop: Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16.
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB4444TOARGB  // macro defined outside this chunk; B=d0 G=d1 R=d2 after it.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y (rounded)
+    "vqadd.u8   d0, d27                        \n"  // + 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(pix)            // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+// 8 BGRA pixels -> 8 Y per loop: Y = ((33*R + 65*G + 13*B + 64) >> 7) + 16.
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA; d0 is unused.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d1, d4                     \n"  // R
+    "vmlal.u8   q8, d2, d5                     \n"  // G
+    "vmlal.u8   q8, d3, d6                     \n"  // B
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y (rounded)
+    "vqadd.u8   d0, d7                         \n"  // + 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+// 8 ABGR pixels -> 8 Y per loop: Y = ((33*R + 65*G + 13*B + 64) >> 7) + 16.
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR; d3 is unused.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // R
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // B
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y (rounded)
+    "vqadd.u8   d0, d7                         \n"  // + 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+// 8 RGBA pixels -> 8 Y per loop: Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16.
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA; d0 is unused.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d1, d4                     \n"  // B
+    "vmlal.u8   q8, d2, d5                     \n"  // G
+    "vmlal.u8   q8, d3, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y (rounded)
+    "vqadd.u8   d0, d7                         \n"  // + 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+// 8 RGB24 pixels -> 8 Y per loop: Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16.
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // B
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y (rounded)
+    "vqadd.u8   d0, d7                         \n"  // + 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+// 8 RAW pixels -> 8 Y per loop: Y = ((33*R + 65*G + 13*B + 64) >> 7) + 16.
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // R (d4 is the R coefficient)
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // B (d6 is the B coefficient)
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y (rounded)
+    "vqadd.u8   d0, d7                         \n"  // + 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_raw),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+// Bilinear filter 16x2 -> 16x1: blends the row at src_ptr with the row at
+void InterpolateRow_NEON(uint8* dst_ptr,  // src_ptr + src_stride, 16 px/loop.
+                         const uint8* src_ptr, ptrdiff_t src_stride,
+                         int dst_width, int source_y_fraction) {
+  asm volatile (
+    "cmp        %4, #0                         \n"  // fraction 0:
+    "beq        100f                           \n"  //   copy first row only.
+    "add        %2, %1                         \n"  // %2 = second row pointer.
+    "cmp        %4, #64                        \n"
+    "beq        75f                            \n"  // 64: 75% row0, 25% row1.
+    "cmp        %4, #128                       \n"
+    "beq        50f                            \n"  // 128: average of rows.
+    "cmp        %4, #192                       \n"
+    "beq        25f                            \n"  // 192: 25% row0, 75% row1.
+
+    "vdup.8     d5, %4                         \n"  // d5 = fraction (row1 weight)
+    "rsb        %4, #256                       \n"  // %4 = 256 - fraction
+    "vdup.8     d4, %4                         \n"  // d4 = row0 weight
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // 16 bytes of row0.
+    MEMACCESS(2)
+    "vld1.8     {q1}, [%2]!                    \n"  // 16 bytes of row1.
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+    "vmull.u8   q13, d0, d4                    \n"  // row0 * (256 - fraction)
+    "vmull.u8   q14, d1, d4                    \n"
+    "vmlal.u8   q13, d2, d5                    \n"  // + row1 * fraction
+    "vmlal.u8   q14, d3, d5                    \n"
+    "vrshrn.u16 d0, q13, #8                    \n"  // rounded >> 8 back to 8 bit
+    "vrshrn.u16 d1, q14, #8                    \n"
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        1b                             \n"
+    "b          99f                            \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // q0 = row0
+    MEMACCESS(2)
+    "vld1.8     {q1}, [%2]!                    \n"  // q1 = row1
+    "subs       %3, %3, #16                    \n"
+    "vrhadd.u8  q0, q1                         \n"  // rounded average of rows
+    "vrhadd.u8  q0, q1                         \n"  // averaged with row1 again
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        25b                            \n"
+    "b          99f                            \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"
+    MEMACCESS(2)
+    "vld1.8     {q1}, [%2]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    "vrhadd.u8  q0, q1                         \n"  // rounded average of rows
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        50b                            \n"
+    "b          99f                            \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"  // q1 = row0 (swapped vs 25:)
+    MEMACCESS(2)
+    "vld1.8     {q0}, [%2]!                    \n"  // q0 = row1
+    "subs       %3, %3, #16                    \n"
+    "vrhadd.u8  q0, q1                         \n"  // rounded average of rows
+    "vrhadd.u8  q0, q1                         \n"  // averaged with row0 again
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        75b                            \n"
+    "b          99f                            \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        100b                           \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2  rewritten to the second row's address
+    "+r"(dst_width),        // %3  consumed 16 at a time
+    "+r"(source_y_fraction) // %4  rewritten to 256 - fraction on general path
+  :
+  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
+  );
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  asm volatile (
+    "subs       %3, #8                         \n"  // fewer than 8 pixels:
+    "blt        89f                            \n"  //   skip the 8 pixel loop.
+    // Blend 8 pixels.
+  "8:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q10, d4, d3                    \n"  // db * a
+    "vmull.u8   q11, d5, d3                    \n"  // dg * a
+    "vmull.u8   q12, d6, d3                    \n"  // dr * a
+    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
+    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
+    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
+    "vqadd.u8   d2, d2, d6                     \n"  // + sr
+    "vmov.u8    d3, #255                       \n"  // a = 255
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
+    "bge        8b                             \n"  // loop while >= 8 remain.
+
+  "89:                                         \n"
+    "adds       %3, #8-1                       \n"  // %3 = remaining pixels - 1
+    "blt        99f                            \n"  // nothing left: done.
+
+    // Blend 1 pixel.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
+    MEMACCESS(1)
+    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
+    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
+    "vmull.u8   q10, d4, d3                    \n"  // db * a
+    "vmull.u8   q11, d5, d3                    \n"  // dg * a
+    "vmull.u8   q12, d6, d3                    \n"  // dr * a
+    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
+    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
+    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
+    "vqadd.u8   d2, d2, d6                     \n"  // + sr
+    "vmov.u8    d3, #255                       \n"  // a = 255
+    MEMACCESS(2)
+    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
+    "bge        1b                             \n"  // loop while pixels remain.
+
+  "99:                                         \n"
+
+  : "+r"(src_argb0),    // %0  source whose alpha (d3) drives the blend
+    "+r"(src_argb1),    // %1  pixels scaled by (256 - alpha)
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3  any count; remainder handled 1 pixel at a time
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
+  );
+}
+
+// Attenuate 8 pixels at a time: b, g, r = (x * a + 128) >> 8; a is unchanged.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // Attenuate 8 pixels.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q10, d0, d3                    \n"  // b * a
+    "vmull.u8   q11, d1, d3                    \n"  // g * a
+    "vmull.u8   q12, d2, d3                    \n"  // r * a
+    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8 (rounded)
+    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8 (rounded)
+    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8 (rounded)
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while pixels remain.
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2  presumably a positive multiple of 8 - TODO confirm
+  :
+  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+  );
+}
+
+// Quantize 8 ARGB pixels (32 bytes) in place; alpha is stored back unchanged.
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  asm volatile (
+    "vdup.u16   q8, %2                         \n"  // scale in every 16 bit lane
+    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
+    "vdup.u16   q9, %3                         \n"  // interval multiply.
+    "vdup.u16   q10, %4                        \n"  // interval add
+
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
+    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
+    "vmovl.u8   q1, d2                         \n"  // g widened to 16 bit
+    "vmovl.u8   q2, d4                         \n"  // r widened to 16 bit
+    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
+    "vqdmulh.s16 q1, q1, q8                    \n"  // g
+    "vqdmulh.s16 q2, q2, q8                    \n"  // r
+    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
+    "vmul.u16   q1, q1, q9                     \n"  // g
+    "vmul.u16   q2, q2, q9                     \n"  // r
+    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
+    "vadd.u16   q1, q1, q10                    \n"  // g
+    "vadd.u16   q2, q2, q10                    \n"  // r
+    "vqmovn.u16 d0, q0                         \n"  // saturate b to 8 bit
+    "vqmovn.u16 d2, q1                         \n"  // saturate g to 8 bit
+    "vqmovn.u16 d4, q2                         \n"  // saturate r to 8 bit
+    MEMACCESS(0)
+    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while pixels remain.
+  : "+r"(dst_argb),       // %0  read without writeback, advanced by the store
+    "+r"(width)           // %1
+  : "r"(scale),           // %2
+    "r"(interval_size),   // %3
+    "r"(interval_offset)  // %4
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+  );
+}
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from d0 to d7.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  asm volatile (
+    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
+    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
+    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
+
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
+    "vmovl.u8   q11, d22                       \n"  // g widened to 16 bit
+    "vmovl.u8   q12, d24                       \n"  // r widened to 16 bit
+    "vmovl.u8   q13, d26                       \n"  // a widened to 16 bit
+    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
+    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
+    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
+    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
+    "vqmovn.u16 d20, q10                       \n"  // saturate b to 8 bit
+    "vqmovn.u16 d22, q11                       \n"  // saturate g to 8 bit
+    "vqmovn.u16 d24, q12                       \n"  // saturate r to 8 bit
+    "vqmovn.u16 d26, q13                       \n"  // saturate a to 8 bit
+    MEMACCESS(1)
+    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while pixels remain.
+  : "+r"(src_argb),       // %0
+    "+r"(dst_argb),       // %1
+    "+r"(width)           // %2
+  : "r"(value)            // %3  one scale byte per channel; low byte scales b
+  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+  );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
+    "vmov       d1, d0                         \n"  // G
+    "vmov       d2, d0                         \n"  // R
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while pixels remain.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"  // alpha (d3) passes through
+  );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place.
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d20, #17                       \n"  // BB coefficient
+    "vmov.u8    d21, #68                       \n"  // BG coefficient
+    "vmov.u8    d22, #35                       \n"  // BR coefficient
+    "vmov.u8    d24, #22                       \n"  // GB coefficient
+    "vmov.u8    d25, #88                       \n"  // GG coefficient
+    "vmov.u8    d26, #45                       \n"  // GR coefficient
+    "vmov.u8    d28, #24                       \n"  // RB coefficient
+    "vmov.u8    d29, #98                       \n"  // RG coefficient
+    "vmov.u8    d30, #50                       \n"  // RR coefficient
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
+    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
+    "vmlal.u8   q2, d1, d21                    \n"  // G
+    "vmlal.u8   q2, d2, d22                    \n"  // R
+    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
+    "vmlal.u8   q3, d1, d25                    \n"  // G
+    "vmlal.u8   q3, d2, d26                    \n"  // R
+    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
+    "vmlal.u8   q8, d1, d29                    \n"  // G
+    "vmlal.u8   q8, d2, d30                    \n"  // R
+    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
+    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
+    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
+    MEMACCESS(0)
+    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while pixels remain.
+  : "+r"(dst_argb),  // %0  read without writeback, advanced by the store
+    "+r"(width)      // %1
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q8",  // q8: Sepia R accumulator.
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
+// needs to saturate.  Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {q2}, [%3]                     \n"  // load 16 s8 coefficients.
+    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
+    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
+
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
+    "vmovl.u8   q9, d18                        \n"  // g
+    "vmovl.u8   q10, d20                       \n"  // r
+    "vmovl.u8   q11, d22                       \n"  // a
+    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
+    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
+    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
+    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
+    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
+    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
+    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
+    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
+    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
+    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
+    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
+    "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
+    "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
+    "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
+    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
+    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
+    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
+    MEMACCESS(1)
+    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while pixels remain.
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  : "r"(matrix_argb)  // %3  4 rows (B,G,R,A outputs) of 4 s8; sums are >> 6
+  : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (  // Each channel: dst = (c0 * c1 + 128) >> 8.
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q0, d0, d1                     \n"  // multiply B
+    "vmull.u8   q1, d2, d3                     \n"  // multiply G
+    "vmull.u8   q2, d4, d5                     \n"  // multiply R
+    "vmull.u8   q3, d6, d7                     \n"  // multiply A
+    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
+    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
+    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
+    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while pixels remain.
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_NEON
+
+// Add 2 rows of ARGB pixels together with unsigned saturation, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
+    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while pixels remain.
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+
+// Subtract 2 rows of ARGB pixels (argb0 - argb1, floor 0), 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
+    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while pixels remain.
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // alpha
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d0, d0, d1                     \n"  // add
+    "vmov.u8    d1, d0                         \n"  // copy sum to G
+    "vmov.u8    d2, d0                         \n"  // copy sum to R
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while pixels remain.
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
+
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  asm volatile (
+    // 16 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+    "vqadd.u8   q0, q0, q1                     \n"  // add
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
+    "bgt        1b                             \n"  // loop while pixels remain.
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_y),       // %2
+    "+r"(width)        // %3  presumably a positive multiple of 16 - TODO confirm
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
+
+// Mixes Sobel X, Sobel Y and Sobel (their saturating sum) into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // alpha
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
+    MEMACCESS(1)
+    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d1, d0, d2                     \n"  // add
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while pixels remain.
+  : "+r"(src_sobelx),  // %0  lands in d2, the third stored byte (R)
+    "+r"(src_sobely),  // %1  lands in d0, the first stored byte (B)
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
+
+// SobelX as a matrix is (the absolute value of the sum is stored, clamped to 255)
+// -1  0  1
+// -2  0  2
+// -1  0  1
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0],%5                  \n"  // top
+    MEMACCESS(0)
+    "vld1.8     {d1}, [%0],%6                  \n"  // top, 2 columns right
+    "vsubl.u8   q0, d0, d1                     \n"  // 16 bit column difference
+    MEMACCESS(1)
+    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1],%6                  \n"  // center, 2 columns right
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"  // center difference added
+    "vadd.s16   q0, q0, q1                     \n"  //   twice for the weight 2
+    MEMACCESS(2)
+    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
+    MEMACCESS(2)
+    "vld1.8     {d3}, [%2],%6                  \n"  // bottom, 2 columns right
+    "subs       %4, %4, #8                     \n"  // 8 pixels
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vabs.s16   q0, q0                         \n"  // magnitude; sign is dropped
+    "vqmovn.u16 d0, q0                         \n"  // saturate to 8 bit
+    MEMACCESS(3)
+    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
+    "bgt        1b                             \n"  // loop while pixels remain.
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  : "r"(2),            // %5  step between the two loads of a row
+    "r"(6)             // %6  remaining step so each row advances 8 per loop
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// SobelY as a matrix is (the absolute value of the sum is stored, clamped to 255)
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0],%4                  \n"  // left
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1],%4                  \n"  // left, other row
+    "vsubl.u8   q0, d0, d1                     \n"  // 16 bit row difference
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1],%4                  \n"  // center, other row
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"  // center difference added
+    "vadd.s16   q0, q0, q1                     \n"  //   twice for the weight 2
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0],%5                  \n"  // right
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1],%5                  \n"  // right, other row
+    "subs       %3, %3, #8                     \n"  // 8 pixels
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vabs.s16   q0, q0                         \n"  // magnitude; sign is dropped
+    "vqmovn.u16 d0, q0                         \n"  // saturate to 8 bit
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
+    "bgt        1b                             \n"  // loop while pixels remain.
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  : "r"(1),            // %4  step between the left, center and right loads
+    "r"(6)             // %5  remaining step so each row advances 8 per loop
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/row_neon64.cc b/libs/libvpx/third_party/libyuv/source/row_neon64.cc
new file mode 100644
index 0000000000..5d015454b0
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/row_neon64.cc
@@ -0,0 +1,3087 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 8 Y (to v0), 4 U and 4 V (to the low and high halves of v1) from 422
+#define READYUV422                                                             \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
+    MEMACCESS(1)                                                               \
+    "ld1        {v1.s}[0], [%1], #4            \n"                             \
+    MEMACCESS(2)                                                               \
+    "ld1        {v1.s}[1], [%2], #4            \n"
+
+// Read 8 Y, 2 U and 2 V from 411; zip1 doubles each U and V into v1
+#define READYUV411                                                             \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
+    MEMACCESS(1)                                                               \
+    "ld1        {v2.h}[0], [%1], #2            \n"                             \
+    MEMACCESS(2)                                                               \
+    "ld1        {v2.h}[1], [%2], #2            \n"                             \
+    "zip1       v1.8b, v2.8b, v2.8b            \n"
+
+// Read 8 Y, 8 U and 8 V from 444; adjacent U and V pairs are averaged to 4 + 4
+#define READYUV444                                                             \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
+    MEMACCESS(1)                                                               \
+    "ld1        {v1.d}[0], [%1], #8            \n"                             \
+    MEMACCESS(2)                                                               \
+    "ld1        {v1.d}[1], [%2], #8            \n"                             \
+    "uaddlp     v1.8h, v1.16b                  \n"                             \
+    "rshrn      v1.8b, v1.8h, #1               \n"
+
+// Read 8 Y (to v0), and set 4 U and 4 V to 128 (all 8 bytes of v1)
+#define READYUV400                                                             \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
+    "movi       v1.8b , #128                   \n"
+
+// Read 8 Y and 4 UV from NV12; even bytes go low, odd bytes high in v1
+#define READNV12                                                               \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
+    MEMACCESS(1)                                                               \
+    "ld1        {v2.8b}, [%1], #8              \n"                             \
+    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
+    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
+    "ins        v1.s[1], v3.s[0]               \n"
+
+// Read 8 Y and 4 VU from NV21; odd bytes go low, even bytes high in v1
+#define READNV21                                                               \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n"                             \
+    MEMACCESS(1)                                                               \
+    "ld1        {v2.8b}, [%1], #8              \n"                             \
+    "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
+    "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \
+    "ins        v1.s[1], v3.s[0]               \n"
+
+// Read 8 YUY2 pixels (16 bytes): even bytes (Y) to v0, chroma split into v1
+#define READYUY2                                                               \
+    MEMACCESS(0)                                                               \
+    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \
+    "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \
+    "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \
+    "ins        v1.s[1], v3.s[0]               \n"
+
+// Read 8 UYVY pixels (16 bytes): odd bytes (Y) to v0, chroma split into v1
+#define READUYVY                                                               \
+    MEMACCESS(0)                                                               \
+    "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \
+    "orr        v0.8b, v3.8b, v3.8b            \n"                             \
+    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
+    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
+    "ins        v1.s[1], v3.s[0]               \n"
+
+#define YUV422TORGB_SETUP_REG                                                  \
+    /* ld3r fills v24/v25/v26 with BB/BG/BR and, unlike post-indexed ld1r,  */ \
+    /* never writes back to the pointer, which is an input-only operand.    */ \
+    "ld3r       {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n"                  \
+    "ld1r       {v31.4s}, [%[kYToRgb]]         \n" /* Y scale */               \
+    "movi       v27.8h, #128                   \n" /* U to B */                \
+    "movi       v28.8h, #102                   \n" /* V to R */                \
+    "movi       v29.8h, #25                    \n" /* U to G */                \
+    "movi       v30.8h, #52                    \n" /* V to G */
+
+#define YUV422TORGB(vR, vG, vB) /* in: v0 = 8 Y, v1 = 4 U + 4 V */            \
+    "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \
+    "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \
+    "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \
+    "ushll      v0.4s, v0.4h, #0               \n" /* Y, low 4 lanes */        \
+    "mul        v3.4s, v3.4s, v31.4s           \n" /* Y * kYToRgb */           \
+    "mul        v0.4s, v0.4s, v31.4s           \n" /* Y * kYToRgb */           \
+    "sqshrun    v0.4h, v0.4s, #16              \n" /* >> 16, saturate */       \
+    "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \
+    "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \
+    "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \
+    "uxtl       v2.8h, v2.8b                   \n" /* V to 16 bit */           \
+    "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \
+    "mul        v3.8h, v1.8h, v27.8h           \n" /* U * v27 for B */         \
+    "mul        v5.8h, v1.8h, v29.8h           \n" /* U * v29 for G */         \
+    "mul        v6.8h, v2.8h, v30.8h           \n" /* V * v30 for G */         \
+    "mul        v7.8h, v2.8h, v28.8h           \n" /* V * v28 for R */         \
+    "sqadd      v6.8h, v6.8h, v5.8h            \n" /* G: U + V parts */        \
+    "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \
+    "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \
+    "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \
+    "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \
+    "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \
+    "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \
+    "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \
+    "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \
+    "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \
+
+// YUV to RGB conversion constants, scaled by 64 (6 bit fixed point).
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B; YUV422TORGB_SETUP_REG hard-codes magnitudes.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128            - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR            (VR * 128 - YGB)
+
+static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };  // first 3 lanes used
+static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };  // lane 0 replicated to v31
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+#define RGBTOUV_SETUP_REG /* loads constants into v20-v25 */                   \
+    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
+    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
+    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
+    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
+    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
+    "movi       v25.16b, #0x80       \n"  /* 0x8080 per 16-bit lane (128.5 * 256) */
+
+
+#ifdef HAS_I444TOARGBROW_NEON
+void I444ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // I444 to ARGB, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READYUV444  // v0 = 8 Y, v1 = 4 U + 4 V (adjacent pairs averaged)
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w4, %w4, #8                 \n"  // 8 processed per loop
+    "movi       v23.8b, #255                   \n" /* A */
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"  // store B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I444TOARGBROW_NEON
+
+#ifdef HAS_I422TOARGBROW_NEON
+void I422ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // I422 to ARGB, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READYUV422  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w4, %w4, #8                   \n"  // 8 processed per loop
+    "movi       v23.8b, #255                   \n" /* A */
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"  // B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TOARGBROW_NEON
+
+#ifdef HAS_I411TOARGBROW_NEON
+void I411ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // I411 to ARGB, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READYUV411  // v0 = 8 Y, v1 = 2 U and 2 V, each doubled
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w4, %w4, #8                   \n"  // 8 processed per loop
+    "movi       v23.8b, #255                   \n" /* A */
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"  // B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I411TOARGBROW_NEON
+
+#ifdef HAS_I422TOBGRAROW_NEON
+void I422ToBGRARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_bgra,
+                        int width) {
+  asm volatile (  // I422 to BGRA, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READYUV422  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v21, v22, v23)  // R = v21, G = v22, B = v23
+    "subs       %w4, %w4, #8                   \n"  // 8 processed per loop
+    "movi       v20.8b, #255                   \n" /* A */
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"  // A,R,G,B
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_bgra),  // %3
+      "+r"(width)      // %4
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TOBGRAROW_NEON
+
+#ifdef HAS_I422TOABGRROW_NEON
+void I422ToABGRRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_abgr,
+                        int width) {
+  asm volatile (  // I422 to ABGR, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READYUV422  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v20, v21, v22)  // R = v20, G = v21, B = v22
+    "subs       %w4, %w4, #8                   \n"  // 8 processed per loop
+    "movi       v23.8b, #255                   \n" /* A */
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"  // R,G,B,A
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_abgr),  // %3
+      "+r"(width)      // %4
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TOABGRROW_NEON
+
+#ifdef HAS_I422TORGBAROW_NEON
+void I422ToRGBARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_rgba,
+                        int width) {
+  asm volatile (  // I422 to RGBA, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READYUV422  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v23, v22, v21)  // R = v23, G = v22, B = v21
+    "subs       %w4, %w4, #8                   \n"  // 8 processed per loop
+    "movi       v20.8b, #255                   \n" /* A */
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"  // A,B,G,R
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_rgba),  // %3
+      "+r"(width)      // %4
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TORGBAROW_NEON
+
+#ifdef HAS_I422TORGB24ROW_NEON
+void I422ToRGB24Row_NEON(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         int width) {
+  asm volatile (  // I422 to RGB24, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READYUV422  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w4, %w4, #8                   \n"  // 8 processed per loop
+    MEMACCESS(3)
+    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"  // store B,G,R
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_rgb24), // %3
+      "+r"(width)      // %4
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TORGB24ROW_NEON
+
+#ifdef HAS_I422TORAWROW_NEON
+void I422ToRAWRow_NEON(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_raw,
+                       int width) {
+  asm volatile (  // I422 to RAW, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READYUV422  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v20, v21, v22)  // R = v20, G = v21, B = v22
+    "subs       %w4, %w4, #8                   \n"  // 8 processed per loop
+    MEMACCESS(3)
+    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"  // store R,G,B
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_raw),   // %3
+      "+r"(width)      // %4
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TORAWROW_NEON
+
+#define ARGBTORGB565 /* in: v20 = B, v21 = G, v22 = R; out: v0 = 8 RGB565 */   \
+    "shll       v0.8h,  v22.8b, #8             \n"  /* R in bits 15..8      */ \
+    "shll       v20.8h, v20.8b, #8             \n"  /* B in bits 15..8      */ \
+    "shll       v21.8h, v21.8b, #8             \n"  /* G in bits 15..8      */ \
+    "sri        v0.8h,  v21.8h, #5             \n"  /* RG: G from bit 10    */ \
+    "sri        v0.8h,  v20.8h, #11            \n"  /* RGB: B from bit 4    */
+
+#ifdef HAS_I422TORGB565ROW_NEON
+void I422ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (  // I422 to RGB565, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READYUV422  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w4, %w4, #8                   \n"  // 8 processed per loop
+    ARGBTORGB565  // pack v20-v22 into v0
+    MEMACCESS(3)
+    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_rgb565),  // %3
+      "+r"(width)     // %4
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TORGB565ROW_NEON
+
+#define ARGBTOARGB1555 /* in: v20-v23 = B, G, R, A; out: v0 = 8 ARGB1555 */    \
+    "shll       v0.8h,  v23.8b, #8             \n"  /* A in bits 15..8      */ \
+    "shll       v22.8h, v22.8b, #8             \n"  /* R in bits 15..8      */ \
+    "shll       v20.8h, v20.8b, #8             \n"  /* B in bits 15..8      */ \
+    "shll       v21.8h, v21.8b, #8             \n"  /* G in bits 15..8      */ \
+    "sri        v0.8h,  v22.8h, #1             \n"  /* AR: R from bit 14    */ \
+    "sri        v0.8h,  v21.8h, #6             \n"  /* ARG: G from bit 9    */ \
+    "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB: B from bit 4   */
+
+#ifdef HAS_I422TOARGB1555ROW_NEON
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            int width) {
+  asm volatile (  // I422 to ARGB1555, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READYUV422  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w4, %w4, #8                   \n"  // 8 processed per loop
+    "movi       v23.8b, #255                   \n"  // A
+    ARGBTOARGB1555  // pack v20-v23 into v0
+    MEMACCESS(3)
+    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB1555.
+    "b.gt       1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb1555),  // %3
+      "+r"(width)     // %4
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TOARGB1555ROW_NEON
+
+#define ARGBTOARGB4444 /* out: v0 = 8 ARGB4444 pixels; overwrites v1 */        \
+    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
+    "ushr       v20.8b, v20.8b, #4             \n"  /* B to low nibble      */ \
+    "bic        v21.8b, v21.8b, v4.8b          \n"  /* G keeps high nibble  */ \
+    "ushr       v22.8b, v22.8b, #4             \n"  /* R to low nibble      */ \
+    "bic        v23.8b, v23.8b, v4.8b          \n"  /* A keeps high nibble  */ \
+    "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \
+    "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \
+    "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */
+
+#ifdef HAS_I422TOARGB4444ROW_NEON
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            int width) {
+  asm volatile (  // I422 to ARGB4444, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+    "movi       v4.16b, #0x0f                  \n"  // bits to clear with bic.
+  "1:                                          \n"
+    READYUV422  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w4, %w4, #8                   \n"  // 8 processed per loop
+    "movi       v23.8b, #255                   \n"  // A
+    ARGBTOARGB4444  // pack v20-v23 into v0
+    MEMACCESS(3)
+    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
+    "b.gt       1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb4444),  // %3
+      "+r"(width)     // %4
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TOARGB4444ROW_NEON
+
+#ifdef HAS_I400TOARGBROW_NEON
+void I400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  int64 width64 = (int64)(width);  // loop counter; only %w2 (low 32 bits) used
+  asm volatile (  // I400 to ARGB, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READYUV400  // v0 = 8 Y, v1 = 128 for every U and V
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    "movi       v23.8b, #255                   \n"  // A
+    MEMACCESS(1)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"  // B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width64)    // %2
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I400TOARGBROW_NEON
+
+#ifdef HAS_J400TOARGBROW_NEON
+void J400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // Expands 8 Y per loop to 4-byte pixels; no scaling applied.
+    "movi       v23.8b, #255                   \n"  // 4th byte of every pixel
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v20.8b}, [%0], #8             \n"  // load 8 Y
+    "orr        v21.8b, v20.8b, v20.8b         \n"  // copy Y
+    "orr        v22.8b, v20.8b, v20.8b         \n"  // copy Y
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    MEMACCESS(1)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"  // Y,Y,Y,255
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    :
+    : "cc", "memory", "v20", "v21", "v22", "v23"
+  );
+}
+#endif  // HAS_J400TOARGBROW_NEON
+
+#ifdef HAS_NV12TOARGBROW_NEON
+void NV12ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // NV12 to ARGB, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READNV12  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop
+    "movi       v23.8b, #255                   \n"  // A
+    MEMACCESS(2)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"  // B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_NV12TOARGBROW_NEON
+
+#ifdef HAS_NV21TOARGBROW_NEON
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // NV21 to ARGB, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READNV21  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop
+    "movi       v23.8b, #255                   \n"  // A
+    MEMACCESS(2)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"  // B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_NV21TOARGBROW_NEON
+
+#ifdef HAS_NV12TORGB565ROW_NEON
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (  // NV12 to RGB565, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READNV12  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop
+    ARGBTORGB565  // pack v20-v22 into v0
+    MEMACCESS(2)
+    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_NV12TORGB565ROW_NEON
+
+#ifdef HAS_NV21TORGB565ROW_NEON
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (  // NV21 to RGB565, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READNV21  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop
+    ARGBTORGB565  // pack v20-v22 into v0
+    MEMACCESS(2)
+    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_NV21TORGB565ROW_NEON
+
+#ifdef HAS_YUY2TOARGBROW_NEON
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        int width) {
+  int64 width64 = (int64)(width);  // loop counter; only %w2 (low 32 bits) used
+  asm volatile (  // YUY2 to ARGB, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READYUY2  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    "movi       v23.8b, #255                   \n"  // A
+    MEMACCESS(1)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"  // B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_yuy2),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width64)    // %2
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_YUY2TOARGBROW_NEON
+
+#ifdef HAS_UYVYTOARGBROW_NEON
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        int width) {
+  int64 width64 = (int64)(width);  // loop counter; only %w2 (low 32 bits) used
+  asm volatile (  // UYVY to ARGB, 8 pixels per loop; loops while width > 0.
+    YUV422TORGB_SETUP_REG  // constants into v24-v31
+  "1:                                          \n"
+    READUYVY  // v0 = 8 Y, v1 = 4 U + 4 V
+    YUV422TORGB(v22, v21, v20)  // R = v22, G = v21, B = v20
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    "movi       v23.8b, #255                   \n"  // A
+    MEMACCESS(1)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"  // B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_uyvy),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width64)    // %2
+    : [kUVBiasBGR]"r"(&kUVBiasBGR),
+      [kYToRgb]"r"(&kYToRgb)  // replicated into v31, hence the v31 clobber
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_UYVYTOARGBROW_NEON
+
+// Reads 16 pairs of UV per loop; writes even bytes to dst_u, odd bytes to dst_v.
+#ifdef HAS_SPLITUVROW_NEON
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  asm volatile (  // Loops while width > 0 after each step of 16.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store U
+    MEMACCESS(2)
+    "st1        {v1.16b}, [%2], #16            \n"  // store V
+    "b.gt       1b                             \n"
+    : "+r"(src_uv),  // %0
+      "+r"(dst_u),   // %1
+      "+r"(dst_v),   // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+#endif  // HAS_SPLITUVROW_NEON
+
+// Reads 16 U's and 16 V's per loop and writes out 16 interleaved pairs of UV.
+#ifdef HAS_MERGEUVROW_NEON
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  asm volatile (  // Loops while width > 0 after each step of 16.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load U
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"  // load V
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+    MEMACCESS(2)
+    "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
+    "b.gt       1b                             \n"
+    :
+      "+r"(src_u),   // %0
+      "+r"(src_v),   // %1
+      "+r"(dst_uv),  // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+#endif  // HAS_MERGEUVROW_NEON
+
+// Copies 32 bytes per loop with ld1/st1; count is presumably a multiple of 32.
+#ifdef HAS_COPYROW_NEON
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+  asm volatile (
+  "1:                                          \n"  // body runs before count is tested
+    MEMACCESS(0)
+    "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
+    "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
+    MEMACCESS(1)
+    "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
+    "b.gt       1b                             \n"  // repeat while count > 0
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2  // Output registers
+  :                     // Input registers
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#endif  // HAS_COPYROW_NEON
+
+// SetRow writes 'count' bytes using an 8 bit value repeated, 16 bytes per loop.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+  asm volatile (
+    "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
+  "1:                                          \n"  // body runs before count is tested
+    "subs      %w1, %w1, #16                   \n"  // 16 bytes per loop
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"  // store
+    "b.gt      1b                              \n"  // repeat while count > 0
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v8)      // %2
+  : "cc", "memory", "v0"
+  );
+}
+
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+  asm volatile (  // Fills dst with the 32-bit value v32; count is in 32-bit units.
+    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
+  "1:                                          \n"  // body runs before count is tested
+    "subs      %w1, %w1, #4                    \n"  // 4 ints per loop
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"  // store
+    "b.gt      1b                              \n"  // repeat while count > 0
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v32)     // %2
+  : "cc", "memory", "v0"
+  );
+}
+
+#ifdef HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  int64 width64 = (int64) width;  // 64-bit copy: %2 is added to the src pointer
+  asm volatile (  // Writes the bytes of src to dst in reverse order, 16 per loop.
+    // Start at end of source row.
+    "add        %0, %0, %2                     \n"  // src += width
+    "sub        %0, %0, #16                    \n"  // back up to the last 16 bytes
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %2, %2, #16                   \n"  // 16 pixels per loop.
+    "rev64      v0.16b, v0.16b                 \n"  // reverse bytes in each 8-byte half
+    MEMACCESS(1)
+    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
+    MEMACCESS(1)
+    "st1        {v0.D}[0], [%1], #8            \n"  // high half was stored first
+    "b.gt       1b                             \n"  // repeat while width64 > 0
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width64)  // %2
+  : "r"((ptrdiff_t)-16)    // %3
+  : "cc", "memory", "v0"
+  );
+}
+#endif  // HAS_MIRRORROW_NEON
+
+#ifdef HAS_MIRRORUVROW_NEON
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width) {
+  int64 width64 = (int64) width;  // 64-bit copy: %3 is added to the src pointer
+  asm volatile (  // Splits UV pairs into dst_u/dst_v in reverse order, 8 per loop.
+    // Start at end of source row.
+    "add        %0, %0, %3, lsl #1             \n"  // src += width * 2
+    "sub        %0, %0, #16                    \n"  // back up to the last 16 bytes
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
+    "subs       %3, %3, #8                     \n"  // 8 pixels per loop.
+    "rev64      v0.8b, v0.8b                   \n"  // reverse the 8 even bytes
+    "rev64      v1.8b, v1.8b                   \n"  // reverse the 8 odd bytes
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
+    MEMACCESS(2)
+    "st1        {v1.8b}, [%2], #8              \n"  // odd bytes go to dst_v
+    "b.gt       1b                             \n"  // repeat while width64 > 0
+  : "+r"(src_uv),  // %0
+    "+r"(dst_u),   // %1
+    "+r"(dst_v),   // %2
+    "+r"(width64)    // %3
+  : "r"((ptrdiff_t)-16)      // %4
+  : "cc", "memory", "v0", "v1"
+  );
+}
+#endif  // HAS_MIRRORUVROW_NEON
+
+#ifdef HAS_ARGBMIRRORROW_NEON
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  int64 width64 = (int64) width;  // 64-bit copy: %2 is added to the src pointer
+  asm volatile (  // Writes the 4-byte pixels of src to dst in reverse order.
+    // Start at end of source row.
+    "add        %0, %0, %2, lsl #2             \n"  // src += width * 4
+    "sub        %0, %0, #16                    \n"  // back up to the last 16 bytes
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    "rev64      v0.4s, v0.4s                   \n"  // swap the 2 pixels in each half
+    MEMACCESS(1)
+    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
+    MEMACCESS(1)
+    "st1        {v0.D}[0], [%1], #8            \n"  // high half was stored first
+    "b.gt       1b                             \n"  // repeat while width64 > 0
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width64)  // %2
+  : "r"((ptrdiff_t)-16)    // %3
+  : "cc", "memory", "v0"
+  );
+}
+#endif  // HAS_ARGBMIRRORROW_NEON
+
+#ifdef HAS_RGB24TOARGBROW_NEON
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+  asm volatile (  // Expands 3-byte pixels to 4 bytes by appending a 255 byte.
+    "movi       v4.8b, #255                    \n"  // Alpha
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    MEMACCESS(1)
+    "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_argb),   // %1
+    "+r"(pix)         // %2
+  :
+  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+#endif  // HAS_RGB24TOARGBROW_NEON
+
+#ifdef HAS_RAWTOARGBROW_NEON
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+  asm volatile (  // Swaps bytes 0 and 2 of each 3-byte pixel and appends 255.
+    "movi       v5.8b, #255                    \n"  // Alpha
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
+    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
+    MEMACCESS(1)
+    "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_raw),   // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
+  );
+}
+#endif  // HAS_RAWTOARGBROW_NEON
+
+#define RGB565TOARGB  /* in: v0.8h RGB565; out: v0=B v1=G v2=R; tmp v4,v6 */    \
+    "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \
+    "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \
+    "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \
+    "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \
+    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
+    "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \
+    "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \
+    "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \
+    "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \
+    "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B (B low, R high)  */ \
+    "dup        v2.2D, v0.D[1]                 \n"  /* R copied to v2       */
+
+#ifdef HAS_RGB565TOARGBROW_NEON
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
+  asm volatile (  // Expands 16-bit RGB565 pixels to 4-byte pixels with alpha 255.
+    "movi       v3.8b, #255                    \n"  // Alpha
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    RGB565TOARGB  // leaves B, G, R in v0, v1, v2; v3 keeps the alpha set above
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
+  );
+}
+#endif  // HAS_RGB565TOARGBROW_NEON
+
+#define ARGB1555TOARGB  /* in: v0.8h; out (low 8b): v0=B v1=G v2=R v3=A */      \
+    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
+    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
+    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \
+                                                                               \
+    "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA           */ \
+    "xtn2       v3.16b, v2.8h                  \n"  /* A 0x00/0xFF in upper */ \
+                                                                               \
+    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
+    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
+                                                                               \
+    "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \
+    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
+    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
+                                                                               \
+    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
+    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \
+    "dup        v1.2D, v0.D[1]                 \n"  /* G copied to v1       */ \
+    "dup        v3.2D, v2.D[1]                 \n"  /* A copied to v3       */
+
+// Like ARGB1555TOARGB but ignores alpha. In: v0.8h; out: v0=B, v1=G, v2=R.
+#define RGB555TOARGB  /* v3 is used as scratch and does not hold alpha */       \
+    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
+    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
+    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \
+                                                                               \
+    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
+    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
+                                                                               \
+    "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \
+    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
+    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
+                                                                               \
+    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
+    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \
+    "dup        v1.2D, v0.D[1]                 \n"  /* G; last line, no '\' */
+
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix) {
+  asm volatile (  // Expands 16-bit ARGB1555 pixels to 4-byte pixels, 8 per loop.
+    "movi       v3.8b, #255                    \n"  // Alpha (macro rewrites v3)
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGB1555TOARGB  // leaves B, G, R, A in v0, v1, v2, v3
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#endif  // HAS_ARGB1555TOARGBROW_NEON
+
+#define ARGB4444TOARGB  /* in: v0.8h; out (low 8b): v0=B v1=G v2=R v3=A */      \
+    "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \
+    "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \
+    "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \
+    "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \
+    "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \
+    "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \
+    "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \
+    "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \
+    "dup        v0.2D, v2.D[1]                 \n"  /* B copied to v0       */ \
+    "dup        v1.2D, v3.D[1]                 \n"  /* G copied to v1       */
+
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix) {
+  asm volatile (  // Expands 16-bit ARGB4444 pixels to 4-byte pixels, 8 per loop.
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGB4444TOARGB  // leaves B, G, R, A in v0, v1, v2, v3
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+#endif  // HAS_ARGB4444TOARGBROW_NEON
+
+#ifdef HAS_ARGBTORGB24ROW_NEON
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+  asm volatile (  // Drops the 4th byte of each 4-byte pixel, 8 pixels per loop.
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    MEMACCESS(1)
+    "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_argb),   // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(pix)         // %2
+  :
+  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTORGB24ROW_NEON
+
+#ifdef HAS_ARGBTORAWROW_NEON
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
+  asm volatile (  // Drops alpha and swaps bytes 0 and 2, 8 pixels per loop.
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g (overwrites alpha)
+    "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
+    MEMACCESS(1)
+    "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_raw),   // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTORAWROW_NEON
+
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+  asm volatile (  // Copies the even-indexed source bytes to dst_y, 16 per loop.
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+#endif  // HAS_YUY2TOYROW_NEON
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+  asm volatile (  // Copies the odd-indexed source bytes to dst_y, 16 per loop.
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
+    MEMACCESS(1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+#endif  // HAS_UYVYTOYROW_NEON
+
+#ifdef HAS_YUY2TOUV422ROW_NEON
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (  // Extracts bytes 1 (U) and 3 (V) of every 4-byte group.
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
+    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
+    MEMACCESS(2)
+    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#endif  // HAS_YUY2TOUV422ROW_NEON
+
+#ifdef HAS_UYVYTOUV422ROW_NEON
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (  // Extracts bytes 0 (U) and 2 (V) of every 4-byte group.
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
+    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
+    MEMACCESS(2)
+    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#endif  // HAS_UYVYTOUV422ROW_NEON
+
+#ifdef HAS_YUY2TOUVROW_NEON
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_yuy2b = src_yuy2 + stride_yuy2;  // second row to average with
+  asm volatile (  // Averages U and V of two rows; 16 pixels in, 8 U + 8 V out.
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
+    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
+    MEMACCESS(2)
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
+    MEMACCESS(3)
+    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_yuy2),     // %0
+    "+r"(src_yuy2b),    // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(pix)           // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+    "v5", "v6", "v7"  // Clobber List
+  );
+}
+#endif  // HAS_YUY2TOUVROW_NEON
+
+#ifdef HAS_UYVYTOUVROW_NEON
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_uyvyb = src_uyvy + stride_uyvy;  // second row to average with
+  asm volatile (  // Averages U and V of two rows; 16 pixels in, 8 U + 8 V out.
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
+    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
+    MEMACCESS(3)
+    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_uyvy),     // %0
+    "+r"(src_uyvyb),    // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(pix)           // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+    "v5", "v6", "v7"  // Clobber List
+  );
+}
+#endif  // HAS_UYVYTOUVROW_NEON
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA: 16-byte table lookup.
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  asm volatile (
+    MEMACCESS(3)
+    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
+    "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
+    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
+    MEMACCESS(1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(shuffler)    // %3
+  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_NEON
+
+#ifdef HAS_I422TOYUY2ROW_NEON
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width) {
+  asm volatile (  // Interleaves planar Y, U, V into Y U Y V groups, 16 pixels/loop.
+  "1:                                          \n"  // body runs before width is tested
+    MEMACCESS(0)
+    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
+    "orr        v2.8b, v1.8b, v1.8b            \n"  // odd Ys to v2; v1 is reused for U
+    MEMACCESS(1)
+    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
+    MEMACCESS(2)
+    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels
+    MEMACCESS(3)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_yuy2),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"
+  );
+}
+#endif  // HAS_I422TOYUY2ROW_NEON
+
+#ifdef HAS_I422TOUYVYROW_NEON
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width) {
+  asm volatile (  // Interleaves planar Y, U, V into U Y V Y groups, 16 pixels/loop.
+  "1:                                          \n"  // body runs before width is tested
+    MEMACCESS(0)
+    "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
+    "orr        v3.8b, v2.8b, v2.8b            \n"  // odd Ys to v3; v2 is reused for V
+    MEMACCESS(1)
+    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
+    MEMACCESS(2)
+    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels
+    MEMACCESS(3)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_uyvy),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"
+  );
+}
+#endif  // HAS_I422TOUYVYROW_NEON
+
+#ifdef HAS_ARGBTORGB565ROW_NEON
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
+  asm volatile (  // Packs 4-byte pixels into 16-bit RGB565, 8 pixels per loop.
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGBTORGB565  // defined elsewhere; presumably packs v20-v22 into v0 (confirm)
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_rgb565),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
+  );
+}
+#endif  // HAS_ARGBTORGB565ROW_NEON
+
+#ifdef HAS_ARGBTORGB565DITHERROW_NEON
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width) {
+  asm volatile (  // Adds a dither byte per pixel, then packs to RGB565; 8 per loop.
+    "dup        v1.4s, %w3                     \n"  // dither4
+  "1:                                          \n"  // body runs before width is tested
+    MEMACCESS(1)
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uqadd      v20.8b, v20.8b, v1.8b          \n"  // saturating add of dither bytes
+    "uqadd      v21.8b, v21.8b, v1.8b          \n"  // (v23 is left undithered)
+    "uqadd      v22.8b, v22.8b, v1.8b          \n"
+    ARGBTORGB565  // defined elsewhere; presumably packs v20-v22 into v0 (confirm)
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(dst_rgb),   // %0
+    "+r"(src_argb),  // %1  read-write: post-incremented by the ld4
+    "+r"(width)      // %2  read-write: decremented by the subs
+  : "r"(dither4)     // %3
+  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
+  );
+}
+#endif  // HAS_ARGBTORGB565DITHERROW_NEON
+
+#ifdef HAS_ARGBTOARGB1555ROW_NEON
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+                            int pix) {
+  asm volatile (  // Packs 4-byte pixels into 16-bit ARGB1555, 8 pixels per loop.
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGBTOARGB1555  // defined elsewhere; presumably packs v20-v23 into v0 (confirm)
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb1555),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
+  );
+}
+#endif  // HAS_ARGBTOARGB1555ROW_NEON
+
+#ifdef HAS_ARGBTOARGB4444ROW_NEON
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+                            int pix) {
+  asm volatile (  // Packs 4-byte pixels into 16-bit ARGB4444, 8 pixels per loop.
+    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGBTOARGB4444  // defined elsewhere; presumably packs v20-v23 into v0 (confirm)
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_argb),      // %0
+    "+r"(dst_argb4444),  // %1
+    "+r"(pix)            // %2
+  :
+  : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
+  );
+}
+#endif  // HAS_ARGBTOARGB4444ROW_NEON
+
+#ifdef HAS_ARGBTOYROW_NEON
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (  // Y = ((13*B + 65*G + 33*R) >> 7, rounded) + 16; 8 pixels/loop.
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B (v3 alpha is overwritten)
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // saturating add of 16
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+#endif  // HAS_ARGBTOYROW_NEON
+
+#ifdef HAS_ARGBTOYJROW_NEON
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (  // Y = (15*B + 75*G + 38*R) >> 7, rounded, no offset; 8 per loop.
+    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
+    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
+    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B (v3 alpha is overwritten)
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+  );
+}
+#endif  // HAS_ARGBTOYJROW_NEON
+
+// 8x1 pixels: one U and one V byte is produced for every ARGB pixel.
+#ifdef HAS_ARGBTOUV444ROW_NEON
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
+    "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
+    "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
+    "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
+    "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
+    "movi       v29.16b,#0x80                  \n"  // 128.5 (0x8080 per 16-bit lane)
+  "1:                                          \n"  // body runs before pix is tested
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
+    "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
+    "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
+    "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
+
+    "umull      v3.8h, v2.8b, v24.8b           \n"  // R (v3 alpha is overwritten)
+    "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
+    "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
+    "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
+
+    "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // repeat while pix > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+    "v24", "v25", "v26", "v27", "v28", "v29"
+  );
+}
+#endif  // HAS_ARGBTOUV444ROW_NEON
+
+// 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGBTOUV422ROW_NEON
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    RGBTOUV_SETUP_REG  // defined elsewhere; presumably loads v20-v25 used below - confirm.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+
+    "uaddlp     v0.8h, v0.16b                  \n"  // B: sum adjacent pairs, 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G: sum adjacent pairs, 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R: sum adjacent pairs, 16 bytes -> 8 shorts.
+
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
+    "mul        v3.8h, v0.8h, v20.8h           \n"  // B  (U accumulates in v3)
+    "mls        v3.8h, v1.8h, v21.8h           \n"  // G
+    "mls        v3.8h, v2.8h, v22.8h           \n"  // R
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+
+    "mul        v4.8h, v2.8h, v20.8h           \n"  // R  (V accumulates in v4)
+    "mls        v4.8h, v1.8h, v24.8h           \n"  // G
+    "mls        v4.8h, v0.8h, v23.8h           \n"  // B
+    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
+
+    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
+
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",  // v5-v7 are not referenced in the asm visible here.
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_ARGBTOUV422ROW_NEON
+
+// 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.
+#ifdef HAS_ARGBTOUV411ROW_NEON
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    RGBTOUV_SETUP_REG  // defined elsewhere; presumably loads v20-v25 used below - confirm.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(0)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
+    "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
+
+    "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts (each lane = sum of 4 pixels).
+    "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts.
+    "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // sum of 4 halved with rounding = 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
+
+    "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
+    "mul        v3.8h, v0.8h, v20.8h           \n"  // B  (U accumulates in v3)
+    "mls        v3.8h, v1.8h, v21.8h           \n"  // G
+    "mls        v3.8h, v2.8h, v22.8h           \n"  // R
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+    "mul        v4.8h, v2.8h, v20.8h           \n"  // R  (V accumulates in v4)
+    "mls        v4.8h, v1.8h, v24.8h           \n"  // G
+    "mls        v4.8h, v0.8h, v23.8h           \n"  // B
+    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_ARGBTOUV411ROW_NEON
+
+// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) /* in: 16-bit B,G,R regs; out: U in v0.8b, V in v1.8b; temps v3,v4; reads v20-v25 */ \
+    "mul        v3.8h, " #QB ",v20.8h          \n"  /* U  = B * v20         */ \
+    "mul        v4.8h, " #QR ",v20.8h          \n"  /* V  = R * v20         */ \
+    "mls        v3.8h, " #QG ",v21.8h          \n"  /* U -= G * v21         */ \
+    "mls        v4.8h, " #QG ",v24.8h          \n"  /* V -= G * v24         */ \
+    "mls        v3.8h, " #QR ",v22.8h          \n"  /* U -= R * v22         */ \
+    "mls        v4.8h, " #QB ",v23.8h          \n"  /* V -= B * v23         */ \
+    "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
+    "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
+    "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \
+    "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
+
+#ifdef HAS_ARGBTOUVROW_NEON  // 16x2 ARGB pixels -> 8 U + 8 V bytes per loop.
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_argb_1 = src_argb + src_stride_argb;  // second source row.
+  asm volatile (
+    RGBTOUV_SETUP_REG  // defined elsewhere; presumably loads v20-v25 read by RGBTOUV - confirm.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 pixels of the second row.
+    "uadalp     v0.8h, v4.16b                  \n"  // B: add second-row pairs into the sums.
+    "uadalp     v1.8h, v5.16b                  \n"  // G: add second-row pairs into the sums.
+    "uadalp     v2.8h, v6.16b                  \n"  // R: add second-row pairs into the sums.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x2 sum halved with rounding = 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (32 in total) processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_ARGBTOUVROW_NEON
+
+// TODO(fbarchard): Subsample match C code.
+#ifdef HAS_ARGBTOUVJROW_NEON  // 16x2 ARGB pixels -> 8 U + 8 V bytes per loop, using the coefficients below.
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_argb_1 = src_argb + src_stride_argb;  // second source row.
+  asm volatile (
+    "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
+    "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
+    "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
+    "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
+    "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
+    "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load 16 pixels of the second row.
+    "uadalp     v0.8h, v4.16b                  \n"  // B: add second-row pairs into the sums.
+    "uadalp     v1.8h, v5.16b                  \n"  // G: add second-row pairs into the sums.
+    "uadalp     v2.8h, v6.16b                  \n"  // R: add second-row pairs into the sums.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x2 sum halved with rounding = 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (32 in total) processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_ARGBTOUVJROW_NEON
+
+#ifdef HAS_BGRATOUVROW_NEON  // 16x2 BGRA pixels -> 8 U + 8 V bytes per loop.
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_bgra_1 = src_bgra + src_stride_bgra;  // second source row.
+  asm volatile (
+    RGBTOUV_SETUP_REG  // defined elsewhere; presumably loads v20-v25 read by RGBTOUV - confirm.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 pixels of the second row.
+    "uadalp     v0.8h, v7.16b                  \n"  // B: add second-row pairs into the sums.
+    "uadalp     v3.8h, v6.16b                  \n"  // G: add second-row pairs into the sums.
+    "uadalp     v2.8h, v5.16b                  \n"  // R: add second-row pairs into the sums.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x2 sum halved with rounding = 2x average
+    "urshr      v1.8h, v3.8h, #1               \n"  // G moves from v3 to v1; RGBTOUV uses v3 as a temporary.
+    "urshr      v2.8h, v2.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (32 in total) processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_bgra),  // %0
+    "+r"(src_bgra_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_BGRATOUVROW_NEON
+
+#ifdef HAS_ABGRTOUVROW_NEON  // 16x2 ABGR pixels -> 8 U + 8 V bytes per loop.
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;  // second source row.
+  asm volatile (
+    RGBTOUV_SETUP_REG  // defined elsewhere; presumably loads v20-v25 read by RGBTOUV - confirm.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 pixels of the second row.
+    "uadalp     v3.8h, v6.16b                  \n"  // B: add second-row pairs into the sums.
+    "uadalp     v2.8h, v5.16b                  \n"  // G: add second-row pairs into the sums.
+    "uadalp     v1.8h, v4.16b                  \n"  // R: add second-row pairs into the sums.
+
+    "urshr      v0.8h, v3.8h, #1               \n"  // 2x average; B moves from v3 to v0 (RGBTOUV uses v3 as a temporary).
+    "urshr      v2.8h, v2.8h, #1               \n"
+    "urshr      v1.8h, v1.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (32 in total) processed per loop.
+    RGBTOUV(v0.8h, v2.8h, v1.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_abgr),  // %0
+    "+r"(src_abgr_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_ABGRTOUVROW_NEON
+
+#ifdef HAS_RGBATOUVROW_NEON  // 16x2 RGBA pixels -> 8 U + 8 V bytes per loop.
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;  // second source row.
+  asm volatile (
+    RGBTOUV_SETUP_REG  // defined elsewhere; presumably loads v20-v25 read by RGBTOUV - confirm.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 pixels of the second row.
+    "uadalp     v0.8h, v5.16b                  \n"  // B: add second-row pairs into the sums.
+    "uadalp     v1.8h, v6.16b                  \n"  // G: add second-row pairs into the sums.
+    "uadalp     v2.8h, v7.16b                  \n"  // R: add second-row pairs into the sums.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x2 sum halved with rounding = 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (32 in total) processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_rgba),  // %0
+    "+r"(src_rgba_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_RGBATOUVROW_NEON
+
+#ifdef HAS_RGB24TOUVROW_NEON  // 16x2 RGB24 pixels -> 8 U + 8 V bytes per loop.
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;  // second source row.
+  asm volatile (
+    RGBTOUV_SETUP_REG  // defined elsewhere; presumably loads v20-v25 read by RGBTOUV - confirm.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 pixels of the second row.
+    "uadalp     v0.8h, v4.16b                  \n"  // B: add second-row pairs into the sums.
+    "uadalp     v1.8h, v5.16b                  \n"  // G: add second-row pairs into the sums.
+    "uadalp     v2.8h, v6.16b                  \n"  // R: add second-row pairs into the sums.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // 2x2 sum halved with rounding = 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v2.8h, v2.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (32 in total) processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_rgb24),  // %0
+    "+r"(src_rgb24_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_RGB24TOUVROW_NEON
+
+#ifdef HAS_RAWTOUVROW_NEON  // 16x2 RAW pixels -> 8 U + 8 V bytes per loop.
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_raw_1 = src_raw + src_stride_raw;  // second source row.
+  asm volatile (
+    RGBTOUV_SETUP_REG  // defined elsewhere; presumably loads v20-v25 read by RGBTOUV - confirm.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels (48 bytes).
+    "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 RAW pixels of the second row.
+    "uadalp     v2.8h, v6.16b                  \n"  // B: add second-row pairs into the sums.
+    "uadalp     v1.8h, v5.16b                  \n"  // G: add second-row pairs into the sums.
+    "uadalp     v0.8h, v4.16b                  \n"  // R: add second-row pairs into the sums.
+
+    "urshr      v2.8h, v2.8h, #1               \n"  // 2x2 sum halved with rounding = 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"
+    "urshr      v0.8h, v0.8h, #1               \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (32 in total) processed per loop.
+    RGBTOUV(v2.8h, v1.8h, v0.8h)
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_raw),  // %0
+    "+r"(src_raw_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_RAWTOUVROW_NEON
+
+// 16x2 pixels -> 8x1.  pix is number of RGB565 pixels per row. e.g. 16.
+#ifdef HAS_RGB565TOUVROW_NEON
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;  // second source row.
+  asm volatile (
+    "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
+    "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
+    "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
+    "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
+    "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
+    "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB  // defined elsewhere; presumably leaves B,G,R bytes in v0,v1,v2 (as read below) - confirm.
+    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels of the second row.
+    RGB565TOARGB
+    "uadalp     v16.4h, v0.8b                  \n"  // B: add second-row pairs into the sums.
+    "uadalp     v18.4h, v1.8b                  \n"  // G: add second-row pairs into the sums.
+    "uadalp     v20.4h, v2.8b                  \n"  // R: add second-row pairs into the sums.
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels of the second row.
+    RGB565TOARGB
+    "uadalp     v17.4h, v0.8b                  \n"  // B: add second-row pairs into the sums.
+    "uadalp     v19.4h, v1.8b                  \n"  // G: add second-row pairs into the sums.
+    "uadalp     v21.4h, v2.8b                  \n"  // R: add second-row pairs into the sums.
+
+    "ins        v16.D[1], v17.D[0]             \n"  // B: merge the two 4-lane halves into 8 shorts.
+    "ins        v18.D[1], v19.D[0]             \n"  // G: merge the two 4-lane halves into 8 shorts.
+    "ins        v20.D[1], v21.D[0]             \n"  // R: merge the two 4-lane halves into 8 shorts.
+
+    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+    "urshr      v5.8h, v18.8h, #1              \n"
+    "urshr      v6.8h, v20.8h, #1              \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    "mul        v16.8h, v4.8h, v22.8h          \n"  // B  (U accumulates in v16)
+    "mls        v16.8h, v5.8h, v23.8h          \n"  // G
+    "mls        v16.8h, v6.8h, v24.8h          \n"  // R
+    "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
+    "mul        v17.8h, v6.8h, v22.8h          \n"  // R  (V accumulates in v17)
+    "mls        v17.8h, v5.8h, v26.8h          \n"  // G
+    "mls        v17.8h, v4.8h, v25.8h          \n"  // B
+    "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_rgb565),  // %0
+    "+r"(src_rgb565_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+    "v25", "v26", "v27"
+  );
+}
+#endif  // HAS_RGB565TOUVROW_NEON
+
+// 16x2 pixels -> 8x1.  pix is number of ARGB1555 pixels per row. e.g. 16.
+#ifdef HAS_ARGB1555TOUVROW_NEON
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+                        uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;  // second source row.
+  asm volatile (
+    RGBTOUV_SETUP_REG  // defined elsewhere; presumably loads v20-v25 used below - confirm.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+    RGB555TOARGB  // defined elsewhere; presumably leaves B,G,R bytes in v0,v1,v2 (as read below) - confirm.
+    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels of the second row.
+    RGB555TOARGB
+    "uadalp     v16.4h, v0.8b                  \n"  // B: add second-row pairs into the sums.
+    "uadalp     v17.4h, v1.8b                  \n"  // G: add second-row pairs into the sums.
+    "uadalp     v18.4h, v2.8b                  \n"  // R: add second-row pairs into the sums.
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels of the second row.
+    RGB555TOARGB
+    "uadalp     v26.4h, v0.8b                  \n"  // B: add second-row pairs into the sums.
+    "uadalp     v27.4h, v1.8b                  \n"  // G: add second-row pairs into the sums.
+    "uadalp     v28.4h, v2.8b                  \n"  // R: add second-row pairs into the sums.
+
+    "ins        v16.D[1], v26.D[0]             \n"  // B: merge the two 4-lane halves into 8 shorts.
+    "ins        v17.D[1], v27.D[0]             \n"  // G: merge the two 4-lane halves into 8 shorts.
+    "ins        v18.D[1], v28.D[0]             \n"  // R: merge the two 4-lane halves into 8 shorts.
+
+    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+    "urshr      v5.8h, v17.8h, #1              \n"
+    "urshr      v6.8h, v18.8h, #1              \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    "mul        v2.8h, v4.8h, v20.8h           \n"  // B  (U accumulates in v2)
+    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
+    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
+    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
+    "mul        v3.8h, v6.8h, v20.8h           \n"  // R  (V accumulates in v3)
+    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
+    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_argb1555),  // %0
+    "+r"(src_argb1555_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+    "v26", "v27", "v28"
+  );
+}
+#endif  // HAS_ARGB1555TOUVROW_NEON
+
+// 16x2 pixels -> 8x1.  pix is number of ARGB4444 pixels per row. e.g. 16.
+#ifdef HAS_ARGB4444TOUVROW_NEON
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+                          uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;  // second source row.
+  asm volatile (
+    RGBTOUV_SETUP_REG  // defined elsewhere; presumably loads v20-v25 used below - confirm.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+    ARGB4444TOARGB  // defined elsewhere; presumably leaves B,G,R bytes in v0,v1,v2 (as read below) - confirm.
+    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels of the second row.
+    ARGB4444TOARGB
+    "uadalp     v16.4h, v0.8b                  \n"  // B: add second-row pairs into the sums.
+    "uadalp     v17.4h, v1.8b                  \n"  // G: add second-row pairs into the sums.
+    "uadalp     v18.4h, v2.8b                  \n"  // R: add second-row pairs into the sums.
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels of the second row.
+    ARGB4444TOARGB
+    "uadalp     v26.4h, v0.8b                  \n"  // B: add second-row pairs into the sums.
+    "uadalp     v27.4h, v1.8b                  \n"  // G: add second-row pairs into the sums.
+    "uadalp     v28.4h, v2.8b                  \n"  // R: add second-row pairs into the sums.
+
+    "ins        v16.D[1], v26.D[0]             \n"  // B: merge the two 4-lane halves into 8 shorts.
+    "ins        v17.D[1], v27.D[0]             \n"  // G: merge the two 4-lane halves into 8 shorts.
+    "ins        v18.D[1], v28.D[0]             \n"  // R: merge the two 4-lane halves into 8 shorts.
+
+    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+    "urshr      v5.8h, v17.8h, #1              \n"
+    "urshr      v6.8h, v18.8h, #1              \n"
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    "mul        v2.8h, v4.8h, v20.8h           \n"  // B  (U accumulates in v2)
+    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
+    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
+    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
+    "mul        v3.8h, v6.8h, v20.8h           \n"  // R  (V accumulates in v3)
+    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
+    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_argb4444),  // %0
+    "+r"(src_argb4444_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+    "v26", "v27", "v28"
+
+  );
+}
+#endif  // HAS_ARGB4444TOUVROW_NEON
+
+#ifdef HAS_RGB565TOYROW_NEON  // 8 RGB565 pixels -> 8 Y bytes per loop.
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
+  asm volatile (
+    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
+    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
+    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
+    "movi       v27.8b, #16                    \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    RGB565TOARGB  // defined elsewhere; presumably leaves B,G,R bytes in v0,v1,v2 (as read below) - confirm.
+    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
+    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_y),       // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",  // v4, v6: presumably RGB565TOARGB temporaries - confirm.
+    "v24", "v25", "v26", "v27"
+  );
+}
+#endif  // HAS_RGB565TOYROW_NEON
+
+#ifdef HAS_ARGB1555TOYROW_NEON  // 8 ARGB1555 pixels -> 8 Y bytes per loop.
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
+  asm volatile (
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGB1555TOARGB  // NOTE(review): defined elsewhere; must not touch v4-v7, which hold the constants - confirm.
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // loop while the remaining count is > 0.
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(pix)            // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+#endif  // HAS_ARGB1555TOYROW_NEON
+
+#ifdef HAS_ARGB4444TOYROW_NEON  // ARGB4444 row -> 8-bit luma (Y) row.
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
+  asm volatile (
+    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
+    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
+    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
+    "movi       v27.8b, #16                    \n"  // Add 16 constant
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGB4444TOARGB  // macro defined elsewhere; presumably leaves B,G,R in v0,v1,v2 - confirm.
+    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
+    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v27.8b           \n"  // Y += 16, saturating.
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while count > 0.
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(pix)            // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
+  );
+}
+#endif  // HAS_ARGB4444TOYROW_NEON
+
+#ifdef HAS_BGRATOYROW_NEON  // BGRA row (4 bytes per pixel) -> 8-bit luma (Y) row.
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
+  asm volatile (
+    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v1.8b, v4.8b           \n"  // R (byte 1 of each pixel)
+    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G (byte 2)
+    "umlal      v16.8h, v3.8b, v6.8b           \n"  // B (byte 3); byte 0 (v0) is unused.
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // Y += 16, saturating.
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while count > 0.
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+#endif  // HAS_BGRATOYROW_NEON
+
+#ifdef HAS_ABGRTOYROW_NEON  // ABGR row (4 bytes per pixel) -> 8-bit luma (Y) row.
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
+  asm volatile (
+    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v0.8b, v4.8b           \n"  // R (byte 0 of each pixel)
+    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G (byte 1)
+    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B (byte 2); byte 3 (v3) is unused.
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // Y += 16, saturating.
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while count > 0.
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+#endif  // HAS_ABGRTOYROW_NEON
+
+#ifdef HAS_RGBATOYROW_NEON  // RGBA row (4 bytes per pixel) -> 8-bit luma (Y) row.
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
+  asm volatile (
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v1.8b, v4.8b           \n"  // B (byte 1 of each pixel)
+    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G (byte 2)
+    "umlal      v16.8h, v3.8b, v6.8b           \n"  // R (byte 3); byte 0 (v0) is unused.
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // Y += 16, saturating.
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while count > 0.
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+#endif  // HAS_RGBATOYROW_NEON
+
+#ifdef HAS_RGB24TOYROW_NEON  // RGB24 row (3 bytes per pixel) -> 8-bit luma (Y) row.
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
+  asm volatile (
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v0.8b, v4.8b           \n"  // B (byte 0 of each pixel)
+    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G (byte 1)
+    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R (byte 2)
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // Y += 16, saturating.
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while count > 0.
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_y),      // %1
+    "+r"(pix)         // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+#endif  // HAS_RGB24TOYROW_NEON
+
+#ifdef HAS_RAWTOYROW_NEON  // RAW row (3 bytes per pixel) -> 8-bit luma (Y) row.
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
+  asm volatile (
+    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v0.8b, v4.8b           \n"  // R (byte 0 times the R coefficient)
+    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G (byte 1)
+    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B (byte 2 times the B coefficient)
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // Y += 16, saturating.
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while count > 0.
+  : "+r"(src_raw),  // %0
+    "+r"(dst_y),    // %1
+    "+r"(pix)       // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+#endif  // HAS_RAWTOYROW_NEON
+
+// Bilinear filter 16x2 -> 16x1
+#ifdef HAS_INTERPOLATEROW_NEON  // Blend src row and the row src_stride below it into dst.
+void InterpolateRow_NEON(uint8* dst_ptr,
+                         const uint8* src_ptr, ptrdiff_t src_stride,
+                         int dst_width, int source_y_fraction) {
+  int y1_fraction = source_y_fraction;  // weight (out of 256) of the second row.
+  int y0_fraction = 256 - y1_fraction;  // weight (out of 256) of the first row.
+  const uint8* src_ptr1 = src_ptr + src_stride;  // second source row.
+  asm volatile (
+    "cmp        %w4, #0                        \n"  // fraction 0: plain copy.
+    "b.eq       100f                           \n"
+    "cmp        %w4, #64                       \n"  // fraction 64: 75% row 0.
+    "b.eq       75f                            \n"
+    "cmp        %w4, #128                      \n"  // fraction 128: 50 / 50.
+    "b.eq       50f                            \n"
+    "cmp        %w4, #192                      \n"  // fraction 192: 25% row 0.
+    "b.eq       25f                            \n"
+
+    "dup        v5.16b, %w4                    \n"  // low byte of y1_fraction in every lane.
+    "dup        v4.16b, %w5                    \n"  // low byte of y0_fraction in every lane.
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // 16 bytes of row 0.
+    MEMACCESS(2)
+    "ld1        {v1.16b}, [%2], #16            \n"  // 16 bytes of row 1.
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
+    "umull      v2.8h, v0.8b,  v4.8b           \n"  // row0 * y0 (low half)
+    "umull2     v3.8h, v0.16b, v4.16b          \n"  // row0 * y0 (high half)
+    "umlal      v2.8h, v1.8b,  v5.8b           \n"  // += row1 * y1 (low half)
+    "umlal2     v3.8h, v1.16b, v5.16b          \n"  // += row1 * y1 (high half)
+    "rshrn      v0.8b,  v2.8h, #8              \n"  // rounded >> 8, narrow to 8 bit.
+    "rshrn2     v0.16b, v3.8h, #8              \n"
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       1b                             \n"
+    "b          99f                            \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // row 0.
+    MEMACCESS(2)
+    "ld1        {v1.16b}, [%2], #16            \n"  // row 1.
+    "subs       %w3, %w3, #16                  \n"
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // average of both rows.
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // average again with row 1.
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       25b                            \n"
+    "b          99f                            \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"
+    MEMACCESS(2)
+    "ld1        {v1.16b}, [%2], #16            \n"
+    "subs       %w3, %w3, #16                  \n"
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // rounding average.
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       50b                            \n"
+    "b          99f                            \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"  // row 0 (note swapped registers).
+    MEMACCESS(2)
+    "ld1        {v0.16b}, [%2], #16            \n"  // row 1.
+    "subs       %w3, %w3, #16                  \n"
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // average of both rows.
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // average again with row 0.
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       75b                            \n"
+    "b          99f                            \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"
+    "subs       %w3, %w3, #16                  \n"
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       100b                           \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_ptr1),         // %2
+    "+r"(dst_width),        // %3
+    "+r"(y1_fraction),      // %4
+    "+r"(y0_fraction)       // %5
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // v2 is written by umull above.
+  );
+}
+#endif  // HAS_INTERPOLATEROW_NEON
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+#ifdef HAS_ARGBBLENDROW_NEON  // Blend src_argb0 over src_argb1 using src_argb0's alpha.
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  asm volatile (
+    "subs       %w3, %w3, #8                   \n"  // fewer than 8 pixels?
+    "b.lt       89f                            \n"  // then skip the 8 pixel loop.
+    // Blend 8 pixels.
+  "8:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
+    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
+    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
+    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
+    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
+    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
+    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
+    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
+    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
+    "movi       v3.8b, #255                    \n"  // a = 255
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.ge       8b                             \n"  // loop while at least 8 remain.
+
+  "89:                                         \n"
+    "adds       %w3, %w3, #8-1                 \n"  // remaining pixels minus 1.
+    "b.lt       99f                            \n"  // none left: done.
+
+    // Blend 1 pixels.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
+    MEMACCESS(1)
+    "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
+    "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
+    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
+    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
+    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
+    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
+    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
+    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
+    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
+    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
+    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
+    "movi       v3.8b, #255                    \n"  // a = 255
+    MEMACCESS(2)
+    "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
+    "b.ge       1b                             \n"  // loop while pixels remain.
+
+  "99:                                         \n"
+
+  : "+r"(src_argb0),    // %0
+    "+r"(src_argb1),    // %1
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v16", "v17", "v18"
+  );
+}
+#endif  // HAS_ARGBBLENDROW_NEON
+
+// Attenuate 8 pixels at a time.
+#ifdef HAS_ARGBATTENUATEROW_NEON  // Scale B, G, R of each pixel by that pixel's alpha.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // Attenuate 8 pixels.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
+    "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
+    "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
+    "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8 (rounded, saturated)
+    "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
+    "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels; alpha (v3) unchanged.
+    "b.gt       1b                             \n"  // repeat while count > 0.
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+  );
+}
+#endif  // HAS_ARGBATTENUATEROW_NEON
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+#ifdef HAS_ARGBQUANTIZEROW_NEON  // In-place quantize of B, G, R; alpha is stored back unchanged.
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  asm volatile (
+    "dup        v4.8h, %w2                     \n"  // low 16 bits of scale in every lane.
+    "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
+    "dup        v5.8h, %w3                     \n"  // interval multiply.
+    "dup        v6.8h, %w4                     \n"  // interval add
+
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
+    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
+    "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
+    "uxtl       v1.8h, v1.8b                   \n"  // g widened to 16 bit
+    "uxtl       v2.8h, v2.8b                   \n"  // r widened to 16 bit
+    "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
+    "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
+    "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
+    "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
+    "mul        v1.8h, v1.8h, v5.8h            \n"  // g
+    "mul        v2.8h, v2.8h, v5.8h            \n"  // r
+    "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
+    "add        v1.8h, v1.8h, v6.8h            \n"  // g
+    "add        v2.8h, v2.8h, v6.8h            \n"  // r
+    "uqxtn      v0.8b, v0.8h                   \n"  // saturating narrow back to 8 bit b
+    "uqxtn      v1.8b, v1.8h                   \n"  // g
+    "uqxtn      v2.8b, v2.8h                   \n"  // r
+    MEMACCESS(0)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while count > 0.
+  : "+r"(dst_argb),       // %0
+    "+r"(width)           // %1
+  : "r"(scale),           // %2
+    "r"(interval_size),   // %3
+    "r"(interval_offset)  // %4
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+  );
+}
+#endif  // HAS_ARGBQUANTIZEROW_NEON
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+#ifdef HAS_ARGBSHADEROW_NEON  // Scale each channel by the matching byte of value.
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  asm volatile (
+    "dup        v0.4s, %w3                     \n"  // duplicate scale value.
+    "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
+    "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
+
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
+    "uxtl       v5.8h, v5.8b                   \n"  // g widened to 16 bit
+    "uxtl       v6.8h, v6.8b                   \n"  // r widened to 16 bit
+    "uxtl       v7.8h, v7.8b                   \n"  // a widened to 16 bit
+    "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
+    "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
+    "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
+    "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
+    "uqxtn      v4.8b, v4.8h                   \n"  // saturating narrow back to 8 bit b
+    "uqxtn      v5.8b, v5.8h                   \n"  // g
+    "uqxtn      v6.8b, v6.8h                   \n"  // r
+    "uqxtn      v7.8b, v7.8h                   \n"  // a
+    MEMACCESS(1)
+    "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while count > 0.
+  : "+r"(src_argb),       // %0
+    "+r"(dst_argb),       // %1
+    "+r"(width)           // %2
+  : "r"(value)            // %3
+  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
+  );
+}
+#endif  // HAS_ARGBSHADEROW_NEON
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+#ifdef HAS_ARGBGRAYROW_NEON  // Replace B, G, R with their weighted gray value; keep alpha.
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
+    "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
+    "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
+    "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
+    "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
+    "orr        v1.8b, v0.8b, v0.8b            \n"  // G
+    "orr        v2.8b, v0.8b, v0.8b            \n"  // R
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
+    "b.gt       1b                             \n"  // repeat while count > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
+  );
+}
+#endif  // HAS_ARGBGRAYROW_NEON
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+
+#ifdef HAS_ARGBSEPIAROW_NEON  // In-place sepia of B, G, R; alpha is stored back unchanged.
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+  asm volatile (
+    "movi       v20.8b, #17                    \n"  // BB coefficient
+    "movi       v21.8b, #68                    \n"  // BG coefficient
+    "movi       v22.8b, #35                    \n"  // BR coefficient
+    "movi       v24.8b, #22                    \n"  // GB coefficient
+    "movi       v25.8b, #88                    \n"  // GG coefficient
+    "movi       v26.8b, #45                    \n"  // GR coefficient
+    "movi       v28.8b, #24                    \n"  // RB coefficient
+    "movi       v29.8b, #98                    \n"  // RG coefficient
+    "movi       v30.8b, #50                    \n"  // RR coefficient
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
+    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
+    "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
+    "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
+    "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
+    "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
+    "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
+    "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
+    "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
+    "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
+    "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
+    "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
+    MEMACCESS(0)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
+    "b.gt       1b                             \n"  // repeat while count > 0.
+  : "+r"(dst_argb),  // %0
+    "+r"(width)      // %1
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
+  );
+}
+#endif  // HAS_ARGBSEPIAROW_NEON
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
+// needs to saturate.  Consider doing a non-saturating version.
+#ifdef HAS_ARGBCOLORMATRIXROW_NEON  // Each output channel = saturated sum of B,G,R,A times 4 coefficients.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "ld1        {v2.16b}, [%3]                 \n"  // load 16 int8 matrix coefficients.
+    "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
+    "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
+
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
+    "uxtl       v17.8h, v17.8b                 \n"  // g
+    "uxtl       v18.8h, v18.8b                 \n"  // r
+    "uxtl       v19.8h, v19.8b                 \n"  // a
+    "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
+    "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
+    "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
+    "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
+    "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
+    "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
+    "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
+    "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
+    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+    "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
+    "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
+    "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
+    "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
+    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+    "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
+    "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
+    "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
+    "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
+    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+    "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
+    "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
+    "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
+    "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
+    MEMACCESS(1)
+    "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
+    "b.gt       1b                             \n"  // repeat while count > 0.
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  : "r"(matrix_argb)  // %3
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+    "v18", "v19", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_ARGBCOLORMATRIXROW_NEON
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBMULTIPLYROW_NEON  // dst = (src0 * src1) >> 8 per channel, with rounding.
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
+    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
+    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
+    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
+    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
+    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
+    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
+    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while count > 0.
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_NEON
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBADDROW_NEON  // dst = src0 + src1 per channel, saturating at 255.
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // first byte of each pixel
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // second byte
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // third byte
+    "uqadd      v3.8b, v3.8b, v7.8b            \n"  // fourth byte
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while count > 0.
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+#endif  // HAS_ARGBADDROW_NEON
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+#ifdef HAS_ARGBSUBTRACTROW_NEON  // dst = src0 - src1 per channel, saturating at 0.
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqsub      v0.8b, v0.8b, v4.8b            \n"  // first byte of each pixel
+    "uqsub      v1.8b, v1.8b, v5.8b            \n"  // second byte
+    "uqsub      v2.8b, v2.8b, v6.8b            \n"  // third byte
+    "uqsub      v3.8b, v3.8b, v7.8b            \n"  // fourth byte
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while count > 0.
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+#endif  // HAS_ARGBSUBTRACTROW_NEON
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+#ifdef HAS_SOBELROW_NEON
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "movi       v3.8b, #255                    \n"  // alpha = 255, set once before the loop
+    // 8 pixel loop: 8 bytes from each input, 32 bytes written.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
+    MEMACCESS(1)
+    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // sobel = sobelx + sobely, saturated
+    "orr        v1.8b, v0.8b, v0.8b            \n"  // copy sobel into second channel
+    "orr        v2.8b, v0.8b, v0.8b            \n"  // copy sobel into third channel
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs)
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"
+  );
+}
+#endif  // HAS_SOBELROW_NEON
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+#ifdef HAS_SOBELTOPLANEROW_NEON
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  asm volatile (
+    // 16 pixel loop: one byte in from each input and one byte out per pixel.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
+    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // sobelx + sobely, saturated
+    MEMACCESS(2)
+    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs)
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_y),       // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1"
+  );
+}
+#endif  // HAS_SOBELTOPLANEROW_NEON
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+#ifdef HAS_SOBELXYROW_NEON
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "movi       v3.8b, #255                    \n"  // alpha = 255, set once before the loop
+    // 8 pixel loop: 8 bytes from each input, 32 bytes written.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx into third channel.
+    MEMACCESS(1)
+    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely into first channel.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // second channel = sobelx + sobely, saturated
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs)
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"
+  );
+}
+#endif  // HAS_SOBELXYROW_NEON
+
+// SobelX as a matrix is
+// -1  0  1
+// -2  0  2
+// -1  0  1
+#ifdef HAS_SOBELXROW_NEON
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.8b}, [%0],%5               \n"  // top: 8 bytes at x, then advance 2
+    MEMACCESS(0)
+    "ld1        {v1.8b}, [%0],%6               \n"  // top: 8 bytes at x + 2, then advance 6 (8 total)
+    "usubl      v0.8h, v0.8b, v1.8b            \n"  // top[x] - top[x + 2], widened to 16 bits
+    MEMACCESS(1)
+    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
+    MEMACCESS(1)
+    "ld1        {v3.8b}, [%1],%6               \n"  // center: 8 bytes at x + 2
+    "usubl      v1.8h, v2.8b, v3.8b            \n"  // center[x] - center[x + 2]
+    "add        v0.8h, v0.8h, v1.8h            \n"  // center difference is added twice
+    "add        v0.8h, v0.8h, v1.8h            \n"  // to give it a weight of 2
+    MEMACCESS(2)
+    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
+    MEMACCESS(2)
+    "ld1        {v3.8b}, [%2],%6               \n"  // bottom: 8 bytes at x + 2
+    "subs       %w4, %w4, #8                   \n"  // 8 pixels
+    "usubl      v1.8h, v2.8b, v3.8b            \n"  // bottom[x] - bottom[x + 2]
+    "add        v0.8h, v0.8h, v1.8h            \n"  // sum of the three row differences
+    "abs        v0.8h, v0.8h                   \n"  // magnitude
+    "uqxtn      v0.8b, v0.8h                   \n"  // narrow to bytes with unsigned saturation
+    MEMACCESS(3)
+    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs)
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  : "r"(2LL),          // %5  first post-increment: step to column x + 2
+    "r"(6LL)           // %6  second post-increment: completes the 8 byte advance
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#endif  // HAS_SOBELXROW_NEON
+
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+#ifdef HAS_SOBELYROW_NEON
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.8b}, [%0],%4               \n"  // left: src_y0 at x, then advance 1
+    MEMACCESS(1)
+    "ld1        {v1.8b}, [%1],%4               \n"  // left: src_y1 at x, then advance 1
+    "usubl      v0.8h, v0.8b, v1.8b            \n"  // y0[x] - y1[x], widened to 16 bits
+    MEMACCESS(0)
+    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
+    MEMACCESS(1)
+    "ld1        {v3.8b}, [%1],%4               \n"  // center: src_y1 at x + 1
+    "usubl      v1.8h, v2.8b, v3.8b            \n"  // y0[x + 1] - y1[x + 1]
+    "add        v0.8h, v0.8h, v1.8h            \n"  // center difference is added twice
+    "add        v0.8h, v0.8h, v1.8h            \n"  // to give it a weight of 2
+    MEMACCESS(0)
+    "ld1        {v2.8b}, [%0],%5               \n"  // right: src_y0 at x + 2, then advance 6 (8 total)
+    MEMACCESS(1)
+    "ld1        {v3.8b}, [%1],%5               \n"  // right: src_y1 at x + 2, then advance 6 (8 total)
+    "subs       %w3, %w3, #8                   \n"  // 8 pixels
+    "usubl      v1.8h, v2.8b, v3.8b            \n"  // y0[x + 2] - y1[x + 2]
+    "add        v0.8h, v0.8h, v1.8h            \n"  // sum of the three column differences
+    "abs        v0.8h, v0.8h                   \n"  // magnitude
+    "uqxtn      v0.8b, v0.8h                   \n"  // narrow to bytes with unsigned saturation
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs)
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  : "r"(1LL),          // %4  post-increment between neighbouring columns
+    "r"(6LL)           // %5  final post-increment: completes the 8 byte advance
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#endif  // HAS_SOBELYROW_NEON
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/row_win.cc b/libs/libvpx/third_party/libyuv/source/row_win.cc
new file mode 100644
index 0000000000..71be268b47
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/row_win.cc
@@ -0,0 +1,6331 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
+    defined(_MSC_VER) && !defined(__clang__)
+#include <emmintrin.h>
+#include <tmmintrin.h>  // For _mm_maddubs_epi16
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
+    defined(_MSC_VER) && !defined(__clang__)
+
+struct YuvConstants {
+  lvec8 kUVToB;     // 0    U,V byte weights for B
+  lvec8 kUVToG;     // 32   U,V byte weights for G
+  lvec8 kUVToR;     // 64   U,V byte weights for R
+  lvec16 kUVBiasB;  // 96   16 bit bias for B
+  lvec16 kUVBiasG;  // 128  16 bit bias for G
+  lvec16 kUVBiasR;  // 160  16 bit bias for R
+  lvec16 kYToRgb;   // 192  16 bit Y scale shared by R, G and B
+};
+
+// BT.601 YUV to RGB reference
+//  R = (Y - 16) * 1.164              - V * -1.596
+//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
+//  B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B.  Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128            + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR            (VR * 128 + YGB)
+
+// BT601 constants for YUV to RGB.  Chroma weights are stored as (U, V) pairs.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },  // kUVToB: V weight is 0
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },  // kUVToG
+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },  // kUVToR: U weight is 0
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },  // kUVBiasB
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },  // kUVBiasG
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },  // kUVBiasR
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }  // kYToRgb
+};
+
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },  // kUVToB as (V, U) pairs
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },  // kUVToG as (V, U) pairs
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },  // kUVToR as (V, U) pairs
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },  // kUVBiasB
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },  // kUVBiasG
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },  // kUVBiasR
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }  // kYToRgb
+};
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// *  R = Y                - V * -1.40200
+// *  G = Y - U *  0.34414 - V *  0.71414
+// *  B = Y - U * -1.77200
+
+// Y contribution to R,G,B.  Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32  /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414  * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BBJ (UBJ * 128             + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ             (VRJ * 128 + YGBJ)
+
+// JPEG constants for YUV to RGB.  Chroma weights are stored as (U, V) pairs.
+static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
+    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },  // kUVToB
+  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },  // kUVToG
+  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
+    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },  // kUVToR
+  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
+    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },  // kUVBiasB
+  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
+    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },  // kUVBiasG
+  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
+    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },  // kUVBiasR
+  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
+    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }  // kYToRgb
+};
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
+// 64 bit
+#if defined(_M_X64)
+#if defined(HAS_I422TOARGBROW_SSSE3)
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+  // Converts 8 pixels per pass; V is addressed relative to the U pointer.
+  const ptrdiff_t v_from_u = (uint8*)v_buf - (uint8*)u_buf;
+  const __m128i opaque = _mm_set1_epi8(-1);  // 0xff in every alpha byte.
+  for (; width > 0; width -= 8) {
+    // Interleave 4 U with 4 V bytes, then repeat each UV pair twice.
+    __m128i blue = _mm_unpacklo_epi8(
+        _mm_cvtsi32_si128(*(uint32*)u_buf),
+        _mm_cvtsi32_si128(*(uint32*)(u_buf + v_from_u)));
+    blue = _mm_unpacklo_epi16(blue, blue);
+    __m128i green = blue;
+    __m128i red = blue;
+
+    // Chroma term per channel: bias - dot(UV, coefficients).
+    blue = _mm_maddubs_epi16(blue, *(__m128i*)kYuvConstants.kUVToB);
+    green = _mm_maddubs_epi16(green, *(__m128i*)kYuvConstants.kUVToG);
+    red = _mm_maddubs_epi16(red, *(__m128i*)kYuvConstants.kUVToR);
+    blue = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, blue);
+    green = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, green);
+    red = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, red);
+
+    // Luma term: duplicate 8 Y bytes into words and scale by kYToRgb.
+    __m128i luma = _mm_loadl_epi64((__m128i*)y_buf);
+    luma = _mm_unpacklo_epi8(luma, luma);
+    luma = _mm_mulhi_epu16(luma, *(__m128i*)kYuvConstants.kYToRgb);
+
+    // Saturating add, drop 6 fraction bits, clamp to unsigned bytes.
+    blue = _mm_srai_epi16(_mm_adds_epi16(blue, luma), 6);
+    green = _mm_srai_epi16(_mm_adds_epi16(green, luma), 6);
+    red = _mm_srai_epi16(_mm_adds_epi16(red, luma), 6);
+    blue = _mm_packus_epi16(blue, blue);
+    green = _mm_packus_epi16(green, green);
+    red = _mm_packus_epi16(red, red);
+
+    // Weave B,G and R,A byte pairs, then pair them into 32 bit pixels.
+    const __m128i bg = _mm_unpacklo_epi8(blue, green);
+    const __m128i ra = _mm_unpacklo_epi8(red, opaque);
+    _mm_storeu_si128((__m128i *)dst_argb, _mm_unpacklo_epi16(bg, ra));
+    _mm_storeu_si128((__m128i *)(dst_argb + 16), _mm_unpackhi_epi16(bg, ra));
+
+    y_buf += 8;
+    u_buf += 4;
+    dst_argb += 32;
+  }
+}
+#endif
+// 32 bit
+#else  // defined(_M_X64)
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Constants for ARGB: 4 byte weights per pixel, repeated for 4 pixels.
+static const vec8 kARGBToY = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+// JPEG full range variant of the ARGB to Y weights.
+static const vec8 kARGBToYJ = {
+  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+
+static const vec8 kARGBToU = {
+  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static const vec8 kARGBToUJ = {
+  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+static const vec8 kARGBToV = {
+  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static const vec8 kARGBToVJ = {
+  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+
+// Constants for BGRA: same weights as ARGB with the byte order reversed.
+static const vec8 kBGRAToY = {
+  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+static const vec8 kBGRAToU = {
+  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+static const vec8 kBGRAToV = {
+  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR: ARGB weights with the first and third bytes swapped.
+static const vec8 kABGRToY = {
+  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+static const vec8 kABGRToU = {
+  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+static const vec8 kABGRToV = {
+  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA: ARGB weights shifted up one byte, zero weight first.
+static const vec8 kRGBAToY = {
+  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static const vec8 kRGBAToU = {
+  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static const vec8 kRGBAToV = {
+  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static const uvec8 kAddY16 = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+// 0.5 in 7 bit fixed point (64 / 128).
+static const vec16 kAddYJ64 = {
+  64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static const uvec8 kAddUV128 = {
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+static const uvec16 kAddUVJ128 = {
+  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+
+// Shuffle table for converting RGB24 to ARGB.
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB (3 color bytes reversed per pixel).
+static const uvec8 kShuffleMaskRAWToARGB = {
+  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting ARGB to RGB24.  128u zeroes the output byte.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.  128u zeroes the output byte.
+static const uvec8 kShuffleMaskARGBToRAW = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// Shuffle table for converting ARGB to RAW, split layout.  First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRAW_0 = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
+};
+
+// Duplicates gray value 3 times and fills in alpha opaque.  8 pixels per loop.
+__declspec(naked)
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]        // src_y
+    mov        edx, [esp + 8]        // dst_argb
+    mov        ecx, [esp + 12]       // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
+    pslld      xmm5, 24
+
+  convertloop:
+    movq       xmm0, qword ptr [eax]  // load 8 gray bytes
+    lea        eax,  [eax + 8]
+    punpcklbw  xmm0, xmm0             // each gray byte doubled
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm0             // gray x 4 for pixels 0..3
+    punpckhwd  xmm1, xmm1             // gray x 4 for pixels 4..7
+    por        xmm0, xmm5             // set the alpha byte to 0xff
+    por        xmm1, xmm5
+    movdqu     [edx], xmm0            // store 4 pixels of ARGB
+    movdqu     [edx + 16], xmm1       // store next 4 pixels of ARGB
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop            // repeat while pix > 0
+    ret
+  }
+}
+
+#ifdef HAS_J400TOARGBROW_AVX2
+// Duplicates gray value 3 times and fills in alpha opaque.  16 pixels per loop.
+__declspec(naked)
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
+  __asm {
+    mov         eax, [esp + 4]        // src_y
+    mov         edx, [esp + 8]        // dst_argb
+    mov         ecx, [esp + 12]       // pix
+    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
+    vpslld      ymm5, ymm5, 24
+
+  convertloop:
+    vmovdqu     xmm0, [eax]           // load 16 gray bytes
+    lea         eax,  [eax + 16]
+    vpermq      ymm0, ymm0, 0xd8      // mutate for unpack
+    vpunpcklbw  ymm0, ymm0, ymm0      // each gray byte doubled
+    vpermq      ymm0, ymm0, 0xd8      // mutate for unpack
+    vpunpckhwd  ymm1, ymm0, ymm0      // gray x 4 for pixels 8..15
+    vpunpcklwd  ymm0, ymm0, ymm0      // gray x 4 for pixels 0..7
+    vpor        ymm0, ymm0, ymm5      // set the alpha byte to 0xff
+    vpor        ymm1, ymm1, ymm5
+    vmovdqu     [edx], ymm0           // store 8 pixels of ARGB
+    vmovdqu     [edx + 32], ymm1      // store next 8 pixels of ARGB
+    lea         edx, [edx + 64]
+    sub         ecx, 16
+    jg          convertloop           // repeat while pix > 0
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_J400TOARGBROW_AVX2
+
+__declspec(naked)
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+  __asm {  // 16 pixels per loop: 48 bytes read, 64 bytes written.
+    mov       eax, [esp + 4]   // src_rgb24
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pslld     xmm5, 24
+    movdqa    xmm4, kShuffleMaskRGB24ToARGB
+
+ convertloop:
+    movdqu    xmm0, [eax]      // source bytes 0..15
+    movdqu    xmm1, [eax + 16] // source bytes 16..31
+    movdqu    xmm3, [eax + 32] // source bytes 32..47
+    lea       eax, [eax + 48]
+    movdqa    xmm2, xmm3
+    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    pshufb    xmm2, xmm4       // spread 12 color bytes over 4 pixels
+    por       xmm2, xmm5       // set alpha to 0xff
+    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
+    pshufb    xmm0, xmm4
+    movdqu    [edx + 32], xmm2 // pixels 8..11
+    por       xmm0, xmm5
+    pshufb    xmm1, xmm4
+    movdqu    [edx], xmm0      // pixels 0..3
+    por       xmm1, xmm5
+    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    pshufb    xmm3, xmm4
+    movdqu    [edx + 16], xmm1 // pixels 4..7
+    por       xmm3, xmm5
+    movdqu    [edx + 48], xmm3 // pixels 12..15
+    lea       edx, [edx + 64]
+    sub       ecx, 16
+    jg        convertloop      // repeat while pix > 0
+    ret
+  }
+}
+
+__declspec(naked)
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
+                        int pix) {
+  __asm {  // 16 pixels per loop: 48 bytes read, 64 bytes written.
+    mov       eax, [esp + 4]   // src_raw
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pslld     xmm5, 24
+    movdqa    xmm4, kShuffleMaskRAWToARGB
+
+ convertloop:
+    movdqu    xmm0, [eax]      // source bytes 0..15
+    movdqu    xmm1, [eax + 16] // source bytes 16..31
+    movdqu    xmm3, [eax + 32] // source bytes 32..47
+    lea       eax, [eax + 48]
+    movdqa    xmm2, xmm3
+    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    pshufb    xmm2, xmm4       // spread 12 color bytes over 4 pixels
+    por       xmm2, xmm5       // set alpha to 0xff
+    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
+    pshufb    xmm0, xmm4
+    movdqu    [edx + 32], xmm2 // pixels 8..11
+    por       xmm0, xmm5
+    pshufb    xmm1, xmm4
+    movdqu    [edx], xmm0      // pixels 0..3
+    por       xmm1, xmm5
+    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    pshufb    xmm3, xmm4
+    movdqu    [edx + 16], xmm1 // pixels 4..7
+    por       xmm3, xmm5
+    movdqu    [edx + 48], xmm3 // pixels 12..15
+    lea       edx, [edx + 64]
+    sub       ecx, 16
+    jg        convertloop      // repeat while pix > 0
+    ret
+  }
+}
+
+// pmul method to replicate bits.  8 pixels (16 bytes in, 32 out) per loop.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+// 20 instructions.
+__declspec(naked)
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+                          int pix) {
+  __asm {
+    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
+    movd      xmm5, eax
+    pshufd    xmm5, xmm5, 0
+    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
+    movd      xmm6, eax
+    pshufd    xmm6, xmm6, 0
+    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    psllw     xmm3, 11
+    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
+    psllw     xmm4, 10
+    psrlw     xmm4, 5
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    psllw     xmm7, 8
+
+    mov       eax, [esp + 4]   // src_rgb565
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    sub       edx, eax         // edx = dst - 2 * src, so that
+    sub       edx, eax         // [eax * 2 + edx] addresses the destination
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
+    movdqa    xmm1, xmm0
+    movdqa    xmm2, xmm0
+    pand      xmm1, xmm3    // R in upper 5 bits
+    psllw     xmm2, 11      // B in upper 5 bits
+    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    psllw     xmm1, 8
+    por       xmm1, xmm2    // RB
+    pand      xmm0, xmm4    // G in middle 6 bits
+    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
+    por       xmm0, xmm7    // AG
+    movdqa    xmm2, xmm1
+    punpcklbw xmm1, xmm0    // interleave RB with AG bytes, low half
+    punpckhbw xmm2, xmm0    // interleave RB with AG bytes, high half
+    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]
+    sub       ecx, 8
+    jg        convertloop   // repeat while pix > 0
+    ret
+  }
+}
+
+#ifdef HAS_RGB565TOARGBROW_AVX2
+// pmul method to replicate bits.  16 pixels (32 bytes in, 64 out) per loop.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+__declspec(naked)
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+                          int pix) {
+  __asm {
+    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
+    vmovd      xmm5, eax
+    vbroadcastss ymm5, xmm5
+    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
+    vmovd      xmm6, eax        // VEX form: no legacy SSE mixed into AVX code
+    vbroadcastss ymm6, xmm6
+    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
+    vpsllw     ymm3, ymm3, 11
+    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
+    vpsllw     ymm4, ymm4, 10
+    vpsrlw     ymm4, ymm4, 5
+    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
+    vpsllw     ymm7, ymm7, 8
+
+    mov        eax, [esp + 4]   // src_rgb565
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // pix
+    sub        edx, eax         // edx = dst - 2 * src, so that
+    sub        edx, eax         // [eax * 2 + edx] addresses the destination
+
+ convertloop:
+    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
+    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
+    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
+    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
+    vpsllw     ymm1, ymm1, 8
+    vpor       ymm1, ymm1, ymm2    // RB
+    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
+    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
+    vpor       ymm0, ymm0, ymm7    // AG
+    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpckhbw ymm2, ymm1, ymm0
+    vpunpcklbw ymm1, ymm1, ymm0
+    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
+    lea        eax, [eax + 32]
+    sub        ecx, 16
+    jg         convertloop      // repeat while pix > 0
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_RGB565TOARGBROW_AVX2
+
+#ifdef HAS_ARGB1555TOARGBROW_AVX2
+__declspec(naked)
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix) {
+  __asm {  // 16 pixels per loop: 32 bytes read, 64 bytes written.
+    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
+    vmovd      xmm5, eax
+    vbroadcastss ymm5, xmm5
+    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
+    vmovd      xmm6, eax        // VEX form: no legacy SSE mixed into AVX code
+    vbroadcastss ymm6, xmm6
+    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+    vpsllw     ymm3, ymm3, 11
+    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
+    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+    vpsllw     ymm7, ymm7, 8
+
+    mov        eax,  [esp + 4]   // src_argb1555
+    mov        edx,  [esp + 8]   // dst_argb
+    mov        ecx,  [esp + 12]  // pix
+    sub        edx,  eax         // edx = dst - 2 * src, so that
+    sub        edx,  eax         // [eax * 2 + edx] addresses the destination
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
+    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
+    vpand      ymm1, ymm1, ymm3
+    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
+    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
+    vpsllw     ymm1, ymm1, 8
+    vpor       ymm1, ymm1, ymm2    // RB
+    vpsraw     ymm2, ymm0, 8       // A: top bit replicated across the high byte
+    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
+    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
+    vpand      ymm2, ymm2, ymm7
+    vpor       ymm0, ymm0, ymm2    // AG
+    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpckhbw ymm2, ymm1, ymm0
+    vpunpcklbw ymm1, ymm1, ymm0
+    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
+    lea        eax, [eax + 32]
+    sub        ecx, 16
+    jg         convertloop       // repeat while pix > 0
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGB1555TOARGBROW_AVX2
+
+#ifdef HAS_ARGB4444TOARGBROW_AVX2
+__declspec(naked)
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix) {
+  __asm {  // 16 pixels per loop: 32 bytes read, 64 bytes written.
+    mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
+    vmovd     xmm4, eax
+    vbroadcastss ymm4, xmm4
+    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
+    mov       eax,  [esp + 4]   // src_argb4444
+    mov       edx,  [esp + 8]   // dst_argb
+    mov       ecx,  [esp + 12]  // pix
+    sub       edx,  eax         // edx = dst - 2 * src, so that
+    sub       edx,  eax         // [eax * 2 + edx] addresses the destination
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
+    vpand      ymm2, ymm0, ymm5    // mask high nibbles
+    vpand      ymm0, ymm0, ymm4    // mask low nibbles
+    vpsrlw     ymm3, ymm2, 4       // high nibbles moved down
+    vpsllw     ymm1, ymm0, 4       // low nibbles moved up
+    vpor       ymm2, ymm2, ymm3    // high nibble repeated in both halves
+    vpor       ymm0, ymm0, ymm1    // low nibble repeated in both halves
+    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm2, ymm2, 0xd8
+    vpunpckhbw ymm1, ymm0, ymm2
+    vpunpcklbw ymm0, ymm0, ymm2
+    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
+    lea       eax, [eax + 32]
+    sub       ecx, 16
+    jg        convertloop          // repeat while pix > 0
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGB4444TOARGBROW_AVX2
+
+// 24 instructions.  8 pixels (16 bytes in, 32 bytes out) per loop.
+__declspec(naked)
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix) {
+  __asm {
+    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
+    movd      xmm5, eax
+    pshufd    xmm5, xmm5, 0
+    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
+    movd      xmm6, eax
+    pshufd    xmm6, xmm6, 0
+    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    psllw     xmm3, 11
+    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
+    psrlw     xmm4, 6
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    psllw     xmm7, 8
+
+    mov       eax, [esp + 4]   // src_argb1555
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    sub       edx, eax         // edx = dst - 2 * src, so that
+    sub       edx, eax         // [eax * 2 + edx] addresses the destination
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
+    movdqa    xmm1, xmm0
+    movdqa    xmm2, xmm0
+    psllw     xmm1, 1       // R in upper 5 bits
+    psllw     xmm2, 11      // B in upper 5 bits
+    pand      xmm1, xmm3
+    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    psllw     xmm1, 8
+    por       xmm1, xmm2    // RB
+    movdqa    xmm2, xmm0
+    pand      xmm0, xmm4    // G in middle 5 bits
+    psraw     xmm2, 8       // A: top bit replicated across the high byte
+    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
+    pand      xmm2, xmm7
+    por       xmm0, xmm2    // AG
+    movdqa    xmm2, xmm1
+    punpcklbw xmm1, xmm0    // interleave RB with AG bytes, low half
+    punpckhbw xmm2, xmm0    // interleave RB with AG bytes, high half
+    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]
+    sub       ecx, 8
+    jg        convertloop   // repeat while pix > 0
+    ret
+  }
+}
+
+// 18 instructions.  8 pixels (16 bytes in, 32 bytes out) per loop.
+__declspec(naked)
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix) {
+  __asm {
+    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
+    movd      xmm4, eax
+    pshufd    xmm4, xmm4, 0
+    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
+    pslld     xmm5, 4
+    mov       eax, [esp + 4]   // src_argb4444
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    sub       edx, eax         // edx = dst - 2 * src, so that
+    sub       edx, eax         // [eax * 2 + edx] addresses the destination
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
+    movdqa    xmm2, xmm0
+    pand      xmm0, xmm4    // mask low nibbles
+    pand      xmm2, xmm5    // mask high nibbles
+    movdqa    xmm1, xmm0
+    movdqa    xmm3, xmm2
+    psllw     xmm1, 4       // low nibbles moved up
+    psrlw     xmm3, 4       // high nibbles moved down
+    por       xmm0, xmm1    // low nibble repeated in both halves
+    por       xmm2, xmm3    // high nibble repeated in both halves
+    movdqa    xmm1, xmm0
+    punpcklbw xmm0, xmm2    // interleave expanded bytes, low half
+    punpckhbw xmm1, xmm2    // interleave expanded bytes, high half
+    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
+    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]
+    sub       ecx, 8
+    jg        convertloop   // repeat while pix > 0
+    ret
+  }
+}
+
+__declspec(naked)  // Packs 16 ARGB pixels (64 bytes) into 48 bytes of RGB24 per loop.
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+    movdqa    xmm6, kShuffleMaskARGBToRGB24  // pshufb control, defined outside this function
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm1, [eax + 16]
+    movdqu    xmm2, [eax + 32]
+    movdqu    xmm3, [eax + 48]
+    lea       eax, [eax + 64]
+    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm1, xmm6
+    pshufb    xmm2, xmm6
+    pshufb    xmm3, xmm6
+    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
+    psrldq    xmm1, 4      // 8 bytes from 1
+    pslldq    xmm4, 12     // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
+    por       xmm0, xmm4   // 4 bytes from 1 for 0
+    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqu    [edx], xmm0  // store 0
+    por       xmm1, xmm5   // 8 bytes from 2 for 1
+    psrldq    xmm2, 8      // 4 bytes from 2
+    pslldq    xmm3, 4      // 12 bytes from 3 for 2
+    por       xmm2, xmm3   // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1   // store 1
+    movdqu    [edx + 32], xmm2   // store 2
+    lea       edx, [edx + 48]
+    sub       ecx, 16      // 16 pixels done per iteration
+    jg        convertloop
+    ret
+  }
+}
+
+__declspec(naked)  // Packs 16 ARGB pixels (64 bytes) into 48 bytes of RAW per loop.
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+    movdqa    xmm6, kShuffleMaskARGBToRAW  // pshufb control, defined outside this function
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm1, [eax + 16]
+    movdqu    xmm2, [eax + 32]
+    movdqu    xmm3, [eax + 48]
+    lea       eax, [eax + 64]
+    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RAW
+    pshufb    xmm1, xmm6
+    pshufb    xmm2, xmm6
+    pshufb    xmm3, xmm6
+    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
+    psrldq    xmm1, 4      // 8 bytes from 1
+    pslldq    xmm4, 12     // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
+    por       xmm0, xmm4   // 4 bytes from 1 for 0
+    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqu    [edx], xmm0  // store 0
+    por       xmm1, xmm5   // 8 bytes from 2 for 1
+    psrldq    xmm2, 8      // 4 bytes from 2
+    pslldq    xmm3, 4      // 12 bytes from 3 for 2
+    por       xmm2, xmm3   // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1   // store 1
+    movdqu    [edx + 32], xmm2   // store 2
+    lea       edx, [edx + 48]
+    sub       ecx, 16      // 16 pixels done per iteration
+    jg        convertloop
+    ret
+  }
+}
+
+// Converts 4 ARGB pixels (16 bytes) to 4 RGB565 pixels (8 bytes) per loop.
+__declspec(naked)
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    psrld     xmm3, 27
+    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    psrld     xmm4, 26
+    pslld     xmm4, 5
+    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pslld     xmm5, 11
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0    // B
+    movdqa    xmm2, xmm0    // G
+    pslld     xmm0, 8       // R: shift alpha out of the dword
+    psrld     xmm1, 3       // B: top 5 bits to bits 0..4
+    psrld     xmm2, 5       // G: top 6 bits to bits 5..10
+    psrad     xmm0, 16      // R: to bits 8..15, sign extended above
+    pand      xmm1, xmm3    // B
+    pand      xmm2, xmm4    // G
+    pand      xmm0, xmm5    // R: top 5 bits at 11..15, sign bits kept
+    por       xmm1, xmm2    // BG
+    por       xmm0, xmm1    // BGR
+    packssdw  xmm0, xmm0    // dwords are sign extended words, so no saturation
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    jg        convertloop
+    ret
+  }
+}
+
+// Adds dither4 to 4 ARGB pixels, then converts them to 4 RGB565 pixels, per loop.
+__declspec(naked)
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int pix) {
+  __asm {
+
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    movd      xmm6, [esp + 12] // dither4
+    mov       ecx, [esp + 16]  // pix
+    punpcklbw xmm6, xmm6       // make dither 16 bytes
+    movdqa    xmm7, xmm6       // NOTE: xmm7 is not read by the loop below
+    punpcklwd xmm6, xmm6       // each dither byte now repeated 4 times (1 per channel)
+    punpckhwd xmm7, xmm7
+    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    psrld     xmm3, 27
+    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    psrld     xmm4, 26
+    pslld     xmm4, 5
+    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pslld     xmm5, 11
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    paddusb   xmm0, xmm6    // add dither (unsigned saturating)
+    movdqa    xmm1, xmm0    // B
+    movdqa    xmm2, xmm0    // G
+    pslld     xmm0, 8       // R: shift alpha out of the dword
+    psrld     xmm1, 3       // B: top 5 bits to bits 0..4
+    psrld     xmm2, 5       // G: top 6 bits to bits 5..10
+    psrad     xmm0, 16      // R: to bits 8..15, sign extended above
+    pand      xmm1, xmm3    // B
+    pand      xmm2, xmm4    // G
+    pand      xmm0, xmm5    // R: top 5 bits at 11..15, sign bits kept
+    por       xmm1, xmm2    // BG
+    por       xmm0, xmm1    // BGR
+    packssdw  xmm0, xmm0    // dwords are sign extended words, so no saturation
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    jg        convertloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+__declspec(naked)  // Adds dither4 to 8 ARGB pixels and converts them to RGB565 per loop.
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int pix) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    vbroadcastss xmm6, [esp + 12]  // dither4
+    mov        ecx, [esp + 16]     // pix
+    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
+    vpermq     ymm6, ymm6, 0xd8    // put one doubled dither4 in each 128 bit lane
+    vpunpcklwd ymm6, ymm6, ymm6    // each dither byte now repeated 4 times
+    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    vpsrld     ymm3, ymm3, 27
+    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpsrld     ymm4, ymm4, 26
+    vpslld     ymm4, ymm4, 5
+    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpaddusb   ymm0, ymm0, ymm6    // add dither (unsigned saturating)
+    vpsrld     ymm2, ymm0, 5       // G
+    vpsrld     ymm1, ymm0, 3       // B
+    vpsrld     ymm0, ymm0, 8       // R
+    vpand      ymm2, ymm2, ymm4    // G
+    vpand      ymm1, ymm1, ymm3    // B
+    vpand      ymm0, ymm0, ymm5    // R
+    vpor       ymm1, ymm1, ymm2    // BG
+    vpor       ymm0, ymm0, ymm1    // BGR
+    vpackusdw  ymm0, ymm0, ymm0    // dwords to words within each lane
+    vpermq     ymm0, ymm0, 0xd8    // gather both lanes into the low 128 bits
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
+
+// TODO(fbarchard): Improve sign extension/packing.
+__declspec(naked)  // Converts 4 ARGB pixels (16 bytes) to 4 ARGB1555 pixels per loop.
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
+    psrld     xmm4, 27
+    movdqa    xmm5, xmm4       // generate mask 0x000003e0
+    pslld     xmm5, 5
+    movdqa    xmm6, xmm4       // generate mask 0x00007c00
+    pslld     xmm6, 10
+    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
+    pslld     xmm7, 15
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0    // B
+    movdqa    xmm2, xmm0    // G
+    movdqa    xmm3, xmm0    // R
+    psrad     xmm0, 16      // A: top bit to bit 15, sign extended above
+    psrld     xmm1, 3       // B: top 5 bits to bits 0..4
+    psrld     xmm2, 6       // G: top 5 bits to bits 5..9
+    psrld     xmm3, 9       // R: top 5 bits to bits 10..14
+    pand      xmm0, xmm7    // A
+    pand      xmm1, xmm4    // B
+    pand      xmm2, xmm5    // G
+    pand      xmm3, xmm6    // R
+    por       xmm0, xmm1    // BA
+    por       xmm2, xmm3    // GR
+    por       xmm0, xmm2    // BGRA
+    packssdw  xmm0, xmm0    // dwords are sign extended words, so no saturation
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    jg        convertloop
+    ret
+  }
+}
+
+__declspec(naked)  // Converts 4 ARGB pixels (16 bytes) to 4 ARGB4444 pixels per loop.
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
+    psllw     xmm4, 12
+    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
+    psrlw     xmm3, 8
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0
+    pand      xmm0, xmm3    // top 4 bits of bytes 0 and 2: output low nibbles
+    pand      xmm1, xmm4    // top 4 bits of bytes 1 and 3: output high nibbles
+    psrld     xmm0, 4       // move to bits 0..3 of each word
+    psrld     xmm1, 8       // move to bits 4..7 of each word
+    por       xmm0, xmm1    // each word now holds one output byte
+    packuswb  xmm0, xmm0    // words to bytes; values fit so no saturation
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    jg        convertloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTORGB565ROW_AVX2
+__declspec(naked)  // Converts 8 ARGB pixels (32 bytes) to 8 RGB565 pixels per loop.
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    mov        ecx, [esp + 12]     // pix
+    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    vpsrld     ymm3, ymm3, 27
+    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpsrld     ymm4, ymm4, 26
+    vpslld     ymm4, ymm4, 5
+    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpsrld     ymm2, ymm0, 5       // G
+    vpsrld     ymm1, ymm0, 3       // B
+    vpsrld     ymm0, ymm0, 8       // R
+    vpand      ymm2, ymm2, ymm4    // G
+    vpand      ymm1, ymm1, ymm3    // B
+    vpand      ymm0, ymm0, ymm5    // R
+    vpor       ymm1, ymm1, ymm2    // BG
+    vpor       ymm0, ymm0, ymm1    // BGR
+    vpackusdw  ymm0, ymm0, ymm0    // dwords to words within each lane
+    vpermq     ymm0, ymm0, 0xd8    // gather both lanes into the low 128 bits
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTORGB565ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB1555ROW_AVX2
+__declspec(naked)  // Converts 8 ARGB pixels (32 bytes) to 8 ARGB1555 pixels per loop.
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    mov        ecx, [esp + 12]     // pix
+    vpcmpeqb   ymm4, ymm4, ymm4    // all ones
+    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
+    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
+    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
+    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
+    vpslld     ymm7, ymm7, 15
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpsrld     ymm3, ymm0, 9       // R
+    vpsrld     ymm2, ymm0, 6       // G
+    vpsrld     ymm1, ymm0, 3       // B
+    vpsrad     ymm0, ymm0, 16      // A: sign extended so vpackssdw keeps the bits
+    vpand      ymm3, ymm3, ymm6    // R
+    vpand      ymm2, ymm2, ymm5    // G
+    vpand      ymm1, ymm1, ymm4    // B
+    vpand      ymm0, ymm0, ymm7    // A
+    vpor       ymm0, ymm0, ymm1    // BA
+    vpor       ymm2, ymm2, ymm3    // GR
+    vpor       ymm0, ymm0, ymm2    // BGRA
+    vpackssdw  ymm0, ymm0, ymm0    // dwords to words within each lane
+    vpermq     ymm0, ymm0, 0xd8    // gather both lanes into the low 128 bits
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTOARGB1555ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB4444ROW_AVX2
+__declspec(naked)  // Converts 8 ARGB pixels (32 bytes) to 8 ARGB4444 pixels per loop.
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_rgb
+    mov        ecx, [esp + 12]  // pix
+    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
+    vpsllw     ymm4, ymm4, 12
+    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpand      ymm1, ymm0, ymm4    // top 4 bits of bytes 1 and 3: output high nibbles
+    vpand      ymm0, ymm0, ymm3    // top 4 bits of bytes 0 and 2: output low nibbles
+    vpsrld     ymm1, ymm1, 8       // move to bits 4..7 of each word
+    vpsrld     ymm0, ymm0, 4       // move to bits 0..3 of each word
+    vpor       ymm0, ymm0, ymm1    // each word now holds one output byte
+    vpackuswb  ymm0, ymm0, ymm0    // words to bytes within each lane
+    vpermq     ymm0, ymm0, 0xd8    // gather both lanes into the low 128 bits
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTOARGB4444ROW_AVX2
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values per loop iteration.
+__declspec(naked)
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm4, kARGBToY   // per byte coefficients for pmaddubsw
+    movdqa     xmm5, kAddY16    // constant added to every output byte
+
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4       // multiply bytes by coefficients, sum adjacent pairs
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1       // sum the 2 pair sums: one word per pixel
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7          // divide sums by 128
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2       // 16 words to 16 bytes
+    paddb      xmm0, xmm5       // add kAddY16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+    ret
+  }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values per loop iteration.
+// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+__declspec(naked)
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm4, kARGBToYJ  // per byte coefficients for pmaddubsw
+    movdqa     xmm5, kAddYJ64   // rounding term added before the shift
+
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4       // multiply bytes by coefficients, sum adjacent pairs
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1       // sum the 2 pair sums: one word per pixel
+    phaddw     xmm2, xmm3
+    paddw      xmm0, xmm5  // Add .5 for rounding.
+    paddw      xmm2, xmm5
+    psrlw      xmm0, 7          // divide sums by 128
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2       // 16 words to 16 bytes
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd index table that undoes the per lane ordering of vphaddw + vpackuswb.
+static const lvec32 kPermdARGBToY_AVX = {
+  0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values per loop iteration.
+__declspec(naked)
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    vbroadcastf128 ymm4, kARGBToY  // same 16 byte coefficients in both lanes
+    vbroadcastf128 ymm5, kAddY16   // constant added to every output byte
+    vmovdqu    ymm6, kPermdARGBToY_AVX
+
+ convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4  // multiply bytes by coefficients, sum adjacent pairs
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1  // mutates.
+    vphaddw    ymm2, ymm2, ymm3
+    vpsrlw     ymm0, ymm0, 7     // divide sums by 128
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2  // mutates.
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  //  HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 YJ values per loop iteration.
+__declspec(naked)
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    vbroadcastf128 ymm4, kARGBToYJ  // same 16 byte coefficients in both lanes
+    vbroadcastf128 ymm5, kAddYJ64   // rounding term added before the shift
+    vmovdqu    ymm6, kPermdARGBToY_AVX  // vpermd indices, defined outside this function
+
+ convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4  // multiply bytes by coefficients, sum adjacent pairs
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1  // mutates.
+    vphaddw    ymm2, ymm2, ymm3
+    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
+    vpaddw     ymm2, ymm2, ymm5
+    vpsrlw     ymm0, ymm0, 7     // divide sums by 128
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2  // mutates.
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  //  HAS_ARGBTOYJROW_AVX2
+
+__declspec(naked)  // Converts 16 BGRA pixels (64 bytes) to 16 Y values per loop.
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm4, kBGRAToY   // per byte coefficients for pmaddubsw
+    movdqa     xmm5, kAddY16    // constant added to every output byte
+
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4       // multiply bytes by coefficients, sum adjacent pairs
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1       // sum the 2 pair sums: one word per pixel
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7          // divide sums by 128
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2       // 16 words to 16 bytes
+    paddb      xmm0, xmm5       // add kAddY16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked)  // Converts 16 ABGR pixels (64 bytes) to 16 Y values per loop.
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm4, kABGRToY   // per byte coefficients for pmaddubsw
+    movdqa     xmm5, kAddY16    // constant added to every output byte
+
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4       // multiply bytes by coefficients, sum adjacent pairs
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1       // sum the 2 pair sums: one word per pixel
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7          // divide sums by 128
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2       // 16 words to 16 bytes
+    paddb      xmm0, xmm5       // add kAddY16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked)  // Converts 16 RGBA pixels (64 bytes) to 16 Y values per loop.
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm4, kRGBAToY   // per byte coefficients for pmaddubsw
+    movdqa     xmm5, kAddY16    // constant added to every output byte
+
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4       // multiply bytes by coefficients, sum adjacent pairs
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1       // sum the 2 pair sums: one word per pixel
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7          // divide sums by 128
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2       // 16 words to 16 bytes
+    paddb      xmm0, xmm5       // add kAddY16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked)  // Averages 16x2 ARGB pixels into 8 U and 8 V values per loop.
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb0 (+ 8 skips the 2 pushes)
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, kAddUV128
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm7, kARGBToU
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]    // same columns, src_stride_argb bytes further
+    pavgb      xmm0, xmm4           // average the 2 rows
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88     // even numbered pixels
+    shufps     xmm4, xmm1, 0xdd     // odd numbered pixels
+    pavgb      xmm0, xmm4           // average horizontal neighbours
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8               // signed divide by 256
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1            // U in low 8 bytes, V in high 8 bytes
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // Averages 16x2 ARGB pixels into 8 UJ and 8 VJ values per loop.
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb0 (+ 8 skips the 2 pushes)
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, kAddUVJ128
+    movdqa     xmm6, kARGBToVJ
+    movdqa     xmm7, kARGBToUJ
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]    // same columns, src_stride_argb bytes further
+    pavgb      xmm0, xmm4           // average the 2 rows
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88     // even numbered pixels
+    shufps     xmm4, xmm1, 0xdd     // odd numbered pixels
+    pavgb      xmm0, xmm4           // average horizontal neighbours
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
+    paddw      xmm1, xmm5
+    psraw      xmm0, 8               // signed divide by 256
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1            // U in low 8 bytes, V in high 8 bytes
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked)  // Averages 32x2 ARGB pixels into 16 U and 16 V values per loop.
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb0 (+ 8 skips the 2 pushes)
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    vbroadcastf128 ymm5, kAddUV128
+    vbroadcastf128 ymm6, kARGBToV
+    vbroadcastf128 ymm7, kARGBToU
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpavgb     ymm0, ymm0, [eax + esi]  // average with row src_stride_argb bytes further
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vpavgb     ymm2, ymm2, [eax + esi + 64]
+    vpavgb     ymm3, ymm3, [eax + esi + 96]
+    lea        eax,  [eax + 128]
+    vshufps    ymm4, ymm0, ymm1, 0x88  // even numbered pixels
+    vshufps    ymm0, ymm0, ymm1, 0xdd  // odd numbered pixels
+    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
+    vshufps    ymm4, ymm2, ymm3, 0x88
+    vshufps    ymm2, ymm2, ymm3, 0xdd
+    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 32 different pixels, its 16 pixels of U and 16 of V
+    vpmaddubsw ymm1, ymm0, ymm7  // U
+    vpmaddubsw ymm3, ymm2, ymm7
+    vpmaddubsw ymm0, ymm0, ymm6  // V
+    vpmaddubsw ymm2, ymm2, ymm6
+    vphaddw    ymm1, ymm1, ymm3  // mutates
+    vphaddw    ymm0, ymm0, ymm2
+    vpsraw     ymm1, ymm1, 8     // signed divide by 256
+    vpsraw     ymm0, ymm0, 8
+    vpacksswb  ymm0, ymm1, ymm0  // mutates
+    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
+    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
+    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
+
+    // step 3 - store 16 U and 16 V values
+    vextractf128 [edx], ymm0, 0 // U
+    vextractf128 [edx + edi], ymm0, 1 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTOUVROW_AVX2
+
+__declspec(naked)  // Converts 16 ARGB pixels to 16 U and 16 V values per loop (no subsampling).
+void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]   // src_argb0 (+ 4 skips the push)
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    movdqa     xmm5, kAddUV128
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm7, kARGBToU
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* convert to U and V */
+    movdqu     xmm0, [eax]          // U
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm7
+    pmaddubsw  xmm1, xmm7
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm3, xmm7
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psraw      xmm0, 8              // signed divide by 256
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2           // 16 words to 16 signed bytes
+    paddb      xmm0, xmm5           // -> unsigned
+    movdqu     [edx], xmm0          // store 16 U values
+
+    movdqu     xmm0, [eax]          // V: reload the same 16 pixels
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm6
+    pmaddubsw  xmm1, xmm6
+    pmaddubsw  xmm2, xmm6
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psraw      xmm0, 8              // signed divide by 256
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2           // 16 words to 16 signed bytes
+    paddb      xmm0, xmm5           // -> unsigned
+    lea        eax,  [eax + 64]
+    movdqu     [edx + edi], xmm0    // store 16 V values
+    lea        edx,  [edx + 16]
+    sub        ecx,  16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked)  // Averages 16x1 ARGB pixels into 8 U and 8 V values per loop.
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]   // src_argb0 (+ 4 skips the push)
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    movdqa     xmm5, kAddUV128
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm7, kARGBToU
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* step 1 - subsample 16x1 argb pixels to 8x1 (single row, horizontal only) */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88     // even numbered pixels
+    shufps     xmm4, xmm1, 0xdd     // odd numbered pixels
+    pavgb      xmm0, xmm4           // average horizontal neighbours
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8               // signed divide by 256
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1            // U in low 8 bytes, V in high 8 bytes
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked)  // Averages 16x2 BGRA pixels into 8 U and 8 V values per loop.
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb0 (+ 8 skips the 2 pushes)
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, kAddUV128
+    movdqa     xmm6, kBGRAToV
+    movdqa     xmm7, kBGRAToU
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* step 1 - subsample 16x2 bgra pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]    // same columns, src_stride_argb bytes further
+    pavgb      xmm0, xmm4           // average the 2 rows
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88     // even numbered pixels
+    shufps     xmm4, xmm1, 0xdd     // odd numbered pixels
+    pavgb      xmm0, xmm4           // average horizontal neighbours
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8               // signed divide by 256
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1            // U in low 8 bytes, V in high 8 bytes
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // Averages two source rows (src_argb0 and src_argb0 + src_stride_argb) and writes 8 U and 8 V bytes per 16 source pixels using kABGRToU/kABGRToV.
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi                  // esi/edi are used below; restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb0 (the 8 skips the two pushes above)
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, kAddUV128      // added after packing ("-> unsigned" below)
+    movdqa     xmm6, kABGRToV       // multiplier for the V pmaddubsw
+    movdqa     xmm7, kABGRToU       // multiplier for the U pmaddubsw
+    sub        edi, edx             // edi = dst_v - dst_u, so V is stored at [edx + edi]
+
+ convertloop:
+    /* step 1 - subsample 16x2 pixels to 8x1; first average this row with the row esi bytes away */
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
+    lea        eax,  [eax + 64]     // 64 source bytes consumed per iteration
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88     // dwords 0,2 of each operand (even pixels)
+    shufps     xmm4, xmm1, 0xdd     // dwords 1,3 of each operand (odd pixels)
+    pavgb      xmm0, xmm4           // average horizontally adjacent pixels
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1            // low 8 bytes = U, high 8 bytes = V
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16               // 16 source pixels handled per iteration
+    jg         convertloop           // loops while ecx > 0; a final partial group still runs a full iteration
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // Averages two source rows (src_argb0 and src_argb0 + src_stride_argb) and writes 8 U and 8 V bytes per 16 source pixels using kRGBAToU/kRGBAToV.
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi                  // esi/edi are used below; restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb0 (the 8 skips the two pushes above)
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, kAddUV128      // added after packing ("-> unsigned" below)
+    movdqa     xmm6, kRGBAToV       // multiplier for the V pmaddubsw
+    movdqa     xmm7, kRGBAToU       // multiplier for the U pmaddubsw
+    sub        edi, edx             // edi = dst_v - dst_u, so V is stored at [edx + edi]
+
+ convertloop:
+    /* step 1 - subsample 16x2 pixels to 8x1; first average this row with the row esi bytes away */
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
+    lea        eax,  [eax + 64]     // 64 source bytes consumed per iteration
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88     // dwords 0,2 of each operand (even pixels)
+    shufps     xmm4, xmm1, 0xdd     // dwords 1,3 of each operand (odd pixels)
+    pavgb      xmm0, xmm4           // average horizontally adjacent pixels
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1            // low 8 bytes = U, high 8 bytes = V
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16               // 16 source pixels handled per iteration
+    jg         convertloop           // loops while ecx > 0; a final partial group still runs a full iteration
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBTOYROW_SSSE3
+
+// Read 16 UV from 444: loads 16 U bytes from [esi] and 16 V bytes from [esi + edi], advances esi by 16, and leaves the interleaved UV bytes in ymm0 (ymm1 is clobbered).
+#define READYUV444_AVX2 __asm {                                                \
+    __asm vmovdqu    xmm0, [esi]                  /* U */         /* NOLINT */ \
+    __asm vmovdqu    xmm1, [esi + edi]            /* V */         /* NOLINT */ \
+    __asm lea        esi,  [esi + 16]                                          \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpermq     ymm1, ymm1, 0xd8                                          \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+  }
+
+// Read 8 UV from 422, upsample to 16 UV: loads 8 U bytes from [esi] and 8 V bytes from [esi + edi], advances esi by 8, and leaves each UV pair duplicated in ymm0 (ymm1 is clobbered).
+#define READYUV422_AVX2 __asm {                                                \
+    __asm vmovq      xmm0, qword ptr [esi]        /* U */         /* NOLINT */ \
+    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */         /* NOLINT */ \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+  }
+
+// Read 4 UV from 411, upsample to 16 UV: loads 4 U bytes from [esi] and 4 V bytes from [esi + edi], advances esi by 4, and leaves each UV pair repeated four times in ymm0 (ymm1 is clobbered).
+#define READYUV411_AVX2 __asm {                                                \
+    __asm vmovd      xmm0, dword ptr [esi]        /* U */         /* NOLINT */ \
+    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */         /* NOLINT */ \
+    __asm lea        esi,  [esi + 4]                                           \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
+  }
+
+// Read 8 UV from NV12, upsample to 16 UV: loads 16 bytes of already-interleaved UV from [esi], advances esi by 16, and leaves each UV pair duplicated in ymm0 (edi and ymm1 are not used).
+#define READNV12_AVX2 __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
+    __asm lea        esi,  [esi + 16]                                          \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+  }
+
+// Convert 16 pixels: 16 UV and 16 Y. In: ymm0 = UV bytes, eax = Y pointer. Out: ymm0 = B, ymm1 = G, ymm2 = R as packed bytes; eax advances by 16; ymm3 is clobbered.
+#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
+    /* Step 1: Find 8 UV contributions to 16 R,G,B values */                   \
+    __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR        /* scale R UV */   \
+    __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG        /* scale G UV */   \
+    __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB        /* scale B UV */   \
+    __asm vmovdqu    ymm3, YuvConstants.kUVBiasR                               \
+    __asm vpsubw     ymm2, ymm3, ymm2                                          \
+    __asm vmovdqu    ymm3, YuvConstants.kUVBiasG                               \
+    __asm vpsubw     ymm1, ymm3, ymm1                                          \
+    __asm vmovdqu    ymm3, YuvConstants.kUVBiasB                               \
+    __asm vpsubw     ymm0, ymm3, ymm0                                          \
+    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
+    __asm vmovdqu    xmm3, [eax]                  /* NOLINT */                 \
+    __asm lea        eax, [eax + 16]                                           \
+    __asm vpermq     ymm3, ymm3, 0xd8                                          \
+    __asm vpunpcklbw ymm3, ymm3, ymm3                                          \
+    __asm vpmulhuw   ymm3, ymm3, YuvConstants.kYToRgb                          \
+    __asm vpaddsw    ymm0, ymm0, ymm3           /* B += Y */                   \
+    __asm vpaddsw    ymm1, ymm1, ymm3           /* G += Y */                   \
+    __asm vpaddsw    ymm2, ymm2, ymm3           /* R += Y */                   \
+    __asm vpsraw     ymm0, ymm0, 6                                             \
+    __asm vpsraw     ymm1, ymm1, 6                                             \
+    __asm vpsraw     ymm2, ymm2, 6                                             \
+    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
+    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
+    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
+  }
+
+// Store 16 ARGB values. In: ymm0 = B, ymm1 = G, ymm2 = R, ymm5 = alpha bytes. Writes 64 bytes at [edx] and advances edx by 64; ymm0, ymm1 and ymm2 are overwritten.
+#define STOREARGB_AVX2 __asm {                                                 \
+    /* Step 3: Weave into ARGB */                                              \
+    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
+    __asm vpermq     ymm0, ymm0, 0xd8                                          \
+    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
+    __asm vpermq     ymm2, ymm2, 0xd8                                          \
+    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
+    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
+    __asm vmovdqu    0[edx], ymm1                                              \
+    __asm vmovdqu    32[edx], ymm0                                             \
+    __asm lea        edx,  [edx + 64]                                          \
+  }
+
+#ifdef HAS_I422TOARGBROW_AVX2
+// 16 pixels per loop iteration.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I422ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi                  // esi/edi are used below; restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // y_buf (the 8 skips the two pushes above)
+    mov        esi, [esp + 8 + 8]   // u_buf
+    mov        edi, [esp + 8 + 12]  // v_buf
+    mov        edx, [esp + 8 + 16]  // dst_argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf; READYUV422_AVX2 reads V at [esi + edi]
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV422_AVX2                 // ymm0 = 16 UV (8 read, each duplicated)
+    YUVTORGB_AVX2(kYuvConstants)    // ymm0/ymm1/ymm2 = B/G/R
+    STOREARGB_AVX2                  // writes 64 bytes and advances edx
+
+    sub        ecx, 16              // 16 pixels done per iteration
+    jg         convertloop          // loops while ecx > 0
+
+    pop        edi
+    pop        esi
+    vzeroupper                      // zero the upper halves of the ymm registers before returning
+    ret
+  }
+}
+#endif  // HAS_I422TOARGBROW_AVX2
+
+#ifdef HAS_J422TOARGBROW_AVX2
+// 16 pixels per loop iteration; converts with kYuvJConstants.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void J422ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi                  // esi/edi are used below; restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // y_buf (the 8 skips the two pushes above)
+    mov        esi, [esp + 8 + 8]   // u_buf
+    mov        edi, [esp + 8 + 12]  // v_buf
+    mov        edx, [esp + 8 + 16]  // dst_argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf; READYUV422_AVX2 reads V at [esi + edi]
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV422_AVX2                 // ymm0 = 16 UV (8 read, each duplicated)
+    YUVTORGB_AVX2(kYuvJConstants)   // ymm0/ymm1/ymm2 = B/G/R
+    STOREARGB_AVX2                  // writes 64 bytes and advances edx
+
+    sub        ecx, 16              // 16 pixels done per iteration
+    jg         convertloop          // loops while ecx > 0
+
+    pop        edi
+    pop        esi
+    vzeroupper                      // zero the upper halves of the ymm registers before returning
+    ret
+  }
+}
+#endif  // HAS_J422TOARGBROW_AVX2
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels per loop iteration.
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I444ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi                  // esi/edi are used below; restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // y_buf (the 8 skips the two pushes above)
+    mov        esi, [esp + 8 + 8]   // u_buf
+    mov        edi, [esp + 8 + 12]  // v_buf
+    mov        edx, [esp + 8 + 16]  // dst_argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf; READYUV444_AVX2 reads V at [esi + edi]
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV444_AVX2                 // ymm0 = 16 UV (no upsampling)
+    YUVTORGB_AVX2(kYuvConstants)    // ymm0/ymm1/ymm2 = B/G/R
+    STOREARGB_AVX2                  // writes 64 bytes and advances edx
+
+    sub        ecx, 16              // 16 pixels done per iteration
+    jg         convertloop          // loops while ecx > 0
+
+    pop        edi
+    pop        esi
+    vzeroupper                      // zero the upper halves of the ymm registers before returning
+    ret
+  }
+}
+#endif  // HAS_I444TOARGBROW_AVX2
+
+#ifdef HAS_I411TOARGBROW_AVX2
+// 16 pixels per loop iteration.
+// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I411ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi                  // esi/edi are used below; restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // y_buf (the 8 skips the two pushes above)
+    mov        esi, [esp + 8 + 8]   // u_buf
+    mov        edi, [esp + 8 + 12]  // v_buf
+    mov        edx, [esp + 8 + 16]  // dst_argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf; READYUV411_AVX2 reads V at [esi + edi]
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV411_AVX2                 // ymm0 = 16 UV (4 read, each repeated four times)
+    YUVTORGB_AVX2(kYuvConstants)    // ymm0/ymm1/ymm2 = B/G/R
+    STOREARGB_AVX2                  // writes 64 bytes and advances edx
+
+    sub        ecx, 16              // 16 pixels done per iteration
+    jg         convertloop          // loops while ecx > 0
+
+    pop        edi
+    pop        esi
+    vzeroupper                      // zero the upper halves of the ymm registers before returning
+    ret
+  }
+}
+#endif  // HAS_I411TOARGBROW_AVX2
+
+#ifdef HAS_NV12TOARGBROW_AVX2
+// 16 pixels per loop iteration.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void NV12ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi                  // esi is used below; restored before ret
+    mov        eax, [esp + 4 + 4]   // y_buf (the 4 skips the push above)
+    mov        esi, [esp + 4 + 8]   // uv_buf
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READNV12_AVX2                   // ymm0 = 16 UV (8 interleaved pairs read, each duplicated)
+    YUVTORGB_AVX2(kYuvConstants)    // ymm0/ymm1/ymm2 = B/G/R
+    STOREARGB_AVX2                  // writes 64 bytes and advances edx
+
+    sub        ecx, 16              // 16 pixels done per iteration
+    jg         convertloop          // loops while ecx > 0
+
+    pop        esi
+    vzeroupper                      // zero the upper halves of the ymm registers before returning
+    ret
+  }
+}
+#endif  // HAS_NV12TOARGBROW_AVX2
+
+#ifdef HAS_NV21TOARGBROW_AVX2
+// 16 pixels per loop iteration.
+// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void NV21ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi                  // esi is used below; restored before ret
+    mov        eax, [esp + 4 + 4]   // y_buf (the 4 skips the push above)
+    mov        esi, [esp + 4 + 8]   // uv_buf
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READNV12_AVX2                   // same read macro as NV12ToARGBRow_AVX2: 8 interleaved pairs, each duplicated
+    YUVTORGB_AVX2(kYvuConstants)    // NOTE(review): presumably kYvuConstants accounts for the V-before-U byte order; its definition is not in view
+    STOREARGB_AVX2                  // writes 64 bytes and advances edx
+
+    sub        ecx, 16              // 16 pixels done per iteration
+    jg         convertloop          // loops while ecx > 0
+
+    pop        esi
+    vzeroupper                      // zero the upper halves of the ymm registers before returning
+    ret
+  }
+}
+#endif  // HAS_NV21TOARGBROW_AVX2
+
+#ifdef HAS_I422TOBGRAROW_AVX2
+// 16 pixels per loop iteration.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
+__declspec(naked)
+void I422ToBGRARow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi                  // esi/edi are used below; restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // y_buf (the 8 skips the two pushes above)
+    mov        esi, [esp + 8 + 8]   // u_buf
+    mov        edi, [esp + 8 + 12]  // v_buf
+    mov        edx, [esp + 8 + 16]  // dst_argb (receives the BGRA output)
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf; READYUV422_AVX2 reads V at [esi + edi]
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV422_AVX2                 // ymm0 = 16 UV (8 read, each duplicated)
+    YUVTORGB_AVX2(kYuvConstants)    // ymm0/ymm1/ymm2 = B/G/R
+
+    // Step 3: Weave into BGRA (bytes are stored in A, R, G, B order)
+    vpunpcklbw ymm1, ymm1, ymm0           // GB
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpcklbw ymm2, ymm5, ymm2           // AR
+    vpermq     ymm2, ymm2, 0xd8
+    vpunpcklwd ymm0, ymm2, ymm1           // ARGB first 8 pixels
+    vpunpckhwd ymm2, ymm2, ymm1           // ARGB next 8 pixels
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm2
+    lea        edx,  [edx + 64]           // 64 output bytes written per iteration
+    sub        ecx, 16                    // 16 pixels done per iteration
+    jg         convertloop                // loops while ecx > 0
+
+    pop        edi
+    pop        esi
+    vzeroupper                            // zero the upper halves of the ymm registers before returning
+    ret
+  }
+}
+#endif  // HAS_I422TOBGRAROW_AVX2
+
+#ifdef HAS_I422TORGBAROW_AVX2
+// 16 pixels per loop iteration.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
+__declspec(naked)
+void I422ToRGBARow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi                  // esi/edi are used below; restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // y_buf (the 8 skips the two pushes above)
+    mov        esi, [esp + 8 + 8]   // u_buf
+    mov        edi, [esp + 8 + 12]  // v_buf
+    mov        edx, [esp + 8 + 16]  // dst_argb (receives the RGBA output)
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf; READYUV422_AVX2 reads V at [esi + edi]
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV422_AVX2                 // ymm0 = 16 UV (8 read, each duplicated)
+    YUVTORGB_AVX2(kYuvConstants)    // ymm0/ymm1/ymm2 = B/G/R
+
+    // Step 3: Weave into RGBA (bytes are stored in A, B, G, R order)
+    vpunpcklbw ymm1, ymm1, ymm2           // GR
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpcklbw ymm2, ymm5, ymm0           // AB
+    vpermq     ymm2, ymm2, 0xd8
+    vpunpcklwd ymm0, ymm2, ymm1           // ABGR first 8 pixels
+    vpunpckhwd ymm1, ymm2, ymm1           // ABGR next 8 pixels
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx,  [edx + 64]           // 64 output bytes written per iteration
+    sub        ecx, 16                    // 16 pixels done per iteration
+    jg         convertloop                // loops while ecx > 0
+
+    pop        edi
+    pop        esi
+    vzeroupper                            // zero the upper halves of the ymm registers before returning
+    ret
+  }
+}
+#endif  // HAS_I422TORGBAROW_AVX2
+
+#ifdef HAS_I422TOABGRROW_AVX2
+// 16 pixels per loop iteration.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
+__declspec(naked)
+void I422ToABGRRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi                  // esi/edi are used below; restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // y_buf (the 8 skips the two pushes above)
+    mov        esi, [esp + 8 + 8]   // u_buf
+    mov        edi, [esp + 8 + 12]  // v_buf
+    mov        edx, [esp + 8 + 16]  // dst_argb (receives the ABGR output)
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf; READYUV422_AVX2 reads V at [esi + edi]
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV422_AVX2                 // ymm0 = 16 UV (8 read, each duplicated)
+    YUVTORGB_AVX2(kYuvConstants)    // ymm0/ymm1/ymm2 = B/G/R
+
+    // Step 3: Weave into ABGR (bytes are stored in R, G, B, A order)
+    vpunpcklbw ymm1, ymm2, ymm1           // RG
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpcklbw ymm2, ymm0, ymm5           // BA
+    vpermq     ymm2, ymm2, 0xd8
+    vpunpcklwd ymm0, ymm1, ymm2           // RGBA first 8 pixels
+    vpunpckhwd ymm1, ymm1, ymm2           // RGBA next 8 pixels
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx,  [edx + 64]           // 64 output bytes written per iteration
+    sub        ecx, 16                    // 16 pixels done per iteration
+    jg         convertloop                // loops while ecx > 0
+
+    pop        edi
+    pop        esi
+    vzeroupper                            // zero the upper halves of the ymm registers before returning
+    ret
+  }
+}
+#endif  // HAS_I422TOABGRROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_SSSE3)
+// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+
+// Read 8 UV from 444: loads 8 U bytes from [esi] and 8 V bytes from [esi + edi], advances esi by 8, and leaves the interleaved UV bytes in xmm0 (xmm1 is clobbered).
+#define READYUV444 __asm {                                                     \
+    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
+    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+  }
+
+// Read 4 UV from 422, upsample to 8 UV: loads 4 U bytes from [esi] and 4 V bytes from [esi + edi], advances esi by 4, and leaves each UV pair duplicated in xmm0 (xmm1 is clobbered).
+#define READYUV422 __asm {                                                     \
+    __asm movd       xmm0, [esi]          /* U */                              \
+    __asm movd       xmm1, [esi + edi]    /* V */                              \
+    __asm lea        esi,  [esi + 4]                                           \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+  }
+
+// Read 2 UV from 411, upsample to 8 UV: loads 2 U bytes from [esi] and 2 V bytes from [esi + edi] via ebx, advances esi by 2, and leaves each UV pair repeated four times in xmm0 (ebx and xmm1 are clobbered).
+#define READYUV411 __asm {                                                     \
+    __asm movzx      ebx, word ptr [esi]        /* U */           /* NOLINT */ \
+    __asm movd       xmm0, ebx                                                 \
+    __asm movzx      ebx, word ptr [esi + edi]  /* V */           /* NOLINT */ \
+    __asm movd       xmm1, ebx                                                 \
+    __asm lea        esi,  [esi + 2]                                           \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm punpckldq  xmm0, xmm0           /* UVUVUVUV (upsample) */            \
+  }
+
+// Read 4 UV from NV12, upsample to 8 UV: loads 8 bytes of already-interleaved UV from [esi], advances esi by 8, and leaves each UV pair duplicated in xmm0 (edi and xmm1 are not used).
+#define READNV12 __asm {                                                       \
+    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+  }
+
+// Convert 8 pixels: 8 UV and 8 Y. In: xmm0 = UV bytes, eax = Y pointer. Out: xmm0 = B, xmm1 = G, xmm2 = R as packed bytes; eax advances by 8; xmm3 is clobbered.
+#define YUVTORGB(YuvConstants) __asm {                                         \
+    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm movdqa     xmm2, xmm0                                                \
+    __asm movdqa     xmm3, xmm0                                                \
+    __asm movdqa     xmm0, YuvConstants.kUVBiasB /* unbias back to signed */   \
+    __asm pmaddubsw  xmm1, YuvConstants.kUVToB   /* scale B UV */              \
+    __asm psubw      xmm0, xmm1                                                \
+    __asm movdqa     xmm1, YuvConstants.kUVBiasG                               \
+    __asm pmaddubsw  xmm2, YuvConstants.kUVToG   /* scale G UV */              \
+    __asm psubw      xmm1, xmm2                                                \
+    __asm movdqa     xmm2, YuvConstants.kUVBiasR                               \
+    __asm pmaddubsw  xmm3, YuvConstants.kUVToR   /* scale R UV */              \
+    __asm psubw      xmm2, xmm3                                                \
+    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
+    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
+    __asm lea        eax, [eax + 8]                                            \
+    __asm punpcklbw  xmm3, xmm3                                                \
+    __asm pmulhuw    xmm3, YuvConstants.kYToRgb                                \
+    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
+    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
+    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
+    __asm psraw      xmm0, 6                                                   \
+    __asm psraw      xmm1, 6                                                   \
+    __asm psraw      xmm2, 6                                                   \
+    __asm packuswb   xmm0, xmm0           /* B */                              \
+    __asm packuswb   xmm1, xmm1           /* G */                              \
+    __asm packuswb   xmm2, xmm2           /* R */                              \
+  }
+
+// Store 8 ARGB values. In: xmm0 = B, xmm1 = G, xmm2 = R, xmm5 = alpha bytes. Writes 32 bytes at [edx] and advances edx by 32; xmm0, xmm1 and xmm2 are overwritten.
+#define STOREARGB __asm {                                                      \
+    /* Step 3: Weave into ARGB */                                              \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm0                                              \
+    __asm movdqu     16[edx], xmm1                                             \
+    __asm lea        edx,  [edx + 32]                                          \
+  }
+
+// Store 8 BGRA values. In: xmm0 = B, xmm1 = G, xmm2 = R; xmm5 is regenerated as all-ones alpha inside the macro. Writes 32 bytes at [edx] and advances edx by 32; xmm0, xmm1 and xmm5 are overwritten.
+#define STOREBGRA __asm {                                                      \
+    /* Step 3: Weave into BGRA */                                              \
+    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
+    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
+    __asm movdqa     xmm0, xmm5                                                \
+    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm5                                              \
+    __asm movdqu     16[edx], xmm0                                             \
+    __asm lea        edx,  [edx + 32]                                          \
+  }
+
+// Store 8 ABGR values. In: xmm0 = B, xmm1 = G, xmm2 = R, xmm5 = alpha bytes. Writes 32 bytes at [edx] and advances edx by 32; xmm0, xmm1 and xmm2 are overwritten.
+#define STOREABGR __asm {                                                      \
+    /* Step 3: Weave into ABGR */                                              \
+    __asm punpcklbw  xmm2, xmm1           /* RG */                             \
+    __asm punpcklbw  xmm0, xmm5           /* BA */                             \
+    __asm movdqa     xmm1, xmm2                                                \
+    __asm punpcklwd  xmm2, xmm0           /* RGBA first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm0           /* RGBA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm2                                              \
+    __asm movdqu     16[edx], xmm1                                             \
+    __asm lea        edx,  [edx + 32]                                          \
+  }
+
+// Store 8 RGBA values. In: xmm0 = B, xmm1 = G, xmm2 = R; xmm5 is regenerated as all-ones alpha inside the macro. Writes 32 bytes at [edx] and advances edx by 32; xmm0, xmm1 and xmm5 are overwritten.
+#define STORERGBA __asm {                                                      \
+    /* Step 3: Weave into RGBA */                                              \
+    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
+    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
+    __asm movdqa     xmm0, xmm5                                                \
+    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm5                                              \
+    __asm movdqu     16[edx], xmm0                                             \
+    __asm lea        edx,  [edx + 32]                                          \
+  }
+
+// Store 8 RGB24 values. In: xmm0 = B, xmm1 = G, xmm2 = R; xmm5 and xmm6 are used as pshufb masks (NOTE(review): presumably loaded by the calling function; not visible in this macro). Writes 24 bytes at [edx] and advances edx by 24.
+#define STORERGB24 __asm {                                                     \
+    /* Step 3: Weave into RRGB */                                              \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
+    /* Step 4: RRGB -> RGB24 */                                                \
+    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
+    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
+    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
+    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
+    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
+    __asm lea        edx,  [edx + 24]                                          \
+  }
+
+// Store 8 RAW values. In: xmm0 = B, xmm1 = G, xmm2 = R; xmm5 and xmm6 are used as pshufb masks (NOTE(review): presumably loaded by the calling function; not visible in this macro). Writes 24 bytes at [edx] and advances edx by 24.
+#define STORERAW __asm {                                                       \
+    /* Step 3: Weave into RRGB */                                              \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
+    /* Step 4: RRGB -> RAW */                                                  \
+    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
+    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
+    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
+    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
+    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
+    __asm lea        edx,  [edx + 24]                                          \
+  }
+
+// Store 8 RGB565 values: expects masks xmm5 = 0x0000001f (B), xmm6 = 0x000007e0 (G), xmm7 = 0xfffff800 (R); writes 16 bytes at [edx] and advances edx by 16.
+#define STORERGB565 __asm {                                                    \
+    /* Step 3: Weave into RRGB */                                              \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+    __asm movdqa     xmm1, xmm0           /* copy BG for the next 4 pixels */  \
+    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
+    /* Step 4: RRGB -> RGB565 */                                               \
+    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
+    __asm movdqa     xmm2, xmm0    /* G */                                     \
+    __asm pslld      xmm0, 8       /* R */                                     \
+    __asm psrld      xmm3, 3       /* B: top 5 bits of the byte -> bits 4:0 */ \
+    __asm psrld      xmm2, 5       /* G: top 6 bits of the byte -> bits 10:5 */ \
+    __asm psrad      xmm0, 16      /* R: arithmetic shift sign-extends, so packssdw below does not saturate */ \
+    __asm pand       xmm3, xmm5    /* B */                                     \
+    __asm pand       xmm2, xmm6    /* G */                                     \
+    __asm pand       xmm0, xmm7    /* R */                                     \
+    __asm por        xmm3, xmm2    /* BG */                                    \
+    __asm por        xmm0, xmm3    /* BGR */                                   \
+    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
+    __asm movdqa     xmm2, xmm1    /* G */                                     \
+    __asm pslld      xmm1, 8       /* R */                                     \
+    __asm psrld      xmm3, 3       /* B */                                     \
+    __asm psrld      xmm2, 5       /* G */                                     \
+    __asm psrad      xmm1, 16      /* R */                                     \
+    __asm pand       xmm3, xmm5    /* B */                                     \
+    __asm pand       xmm2, xmm6    /* G */                                     \
+    __asm pand       xmm1, xmm7    /* R */                                     \
+    __asm por        xmm3, xmm2    /* BG */                                    \
+    __asm por        xmm1, xmm3    /* BGR */                                   \
+    __asm packssdw   xmm0, xmm1    /* 8 dwords -> 8 16-bit pixels */           \
+    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
+    __asm lea        edx, [edx + 16]  /* 8 pixels * 2 bytes written */         \
+  }
+
+// 8 pixels per loop iteration.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void I444ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y (8 = two pushes above, 4 = return address)
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+
+ convertloop:
+    READYUV444
+    YUVTORGB(kYuvConstants)
+    STOREARGB
+
+    sub        ecx, 8               // 8 pixels handled per iteration
+    jg         convertloop          // loop while width remains; a partial tail still runs a full iteration
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels per loop iteration.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
+__declspec(naked)
+void I422ToRGB24Row_SSSE3(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* dst_rgb24,
+                          int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y (8 = two pushes above, 4 = return address)
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // rgb24
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
+    movdqa     xmm5, kShuffleMaskARGBToRGB24_0  // shuffle masks kept in xmm5/xmm6 for the whole loop
+    movdqa     xmm6, kShuffleMaskARGBToRGB24
+
+ convertloop:
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STORERGB24
+
+    sub        ecx, 8               // 8 pixels handled per iteration
+    jg         convertloop          // loop while width remains; a partial tail still runs a full iteration
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels per loop iteration.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
+__declspec(naked)
+void I422ToRAWRow_SSSE3(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_raw,
+                        int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y (8 = two pushes above, 4 = return address)
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // raw
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
+    movdqa     xmm5, kShuffleMaskARGBToRAW_0  // pshufb masks consumed by STORERAW
+    movdqa     xmm6, kShuffleMaskARGBToRAW
+
+ convertloop:
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STORERAW
+
+    sub        ecx, 8               // 8 pixels handled per iteration
+    jg         convertloop          // loop while width remains; a partial tail still runs a full iteration
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels per loop iteration.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
+__declspec(naked)
+void I422ToRGB565Row_SSSE3(const uint8* y_buf,
+                           const uint8* u_buf,
+                           const uint8* v_buf,
+                           uint8* rgb565_buf,
+                           int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y (8 = two pushes above, 4 = return address)
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // rgb565
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
+    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f (blue field, used by STORERGB565)
+    psrld      xmm5, 27
+    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0 (green field)
+    psrld      xmm6, 26
+    pslld      xmm6, 5
+    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800 (red field plus sign bits)
+    pslld      xmm7, 11
+
+ convertloop:
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STORERGB565
+
+    sub        ecx, 8               // 8 pixels handled per iteration
+    jg         convertloop          // loop while width remains; a partial tail still runs a full iteration
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels per loop iteration.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y (8 = two pushes above, 4 = return address)
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+
+ convertloop:
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STOREARGB
+
+    sub        ecx, 8               // 8 pixels handled per iteration
+    jg         convertloop          // loop while width remains; a partial tail still runs a full iteration
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels per loop iteration.
+// JPeg color space version of I422ToARGB: identical code except that kYuvJConstants is passed to YUVTORGB.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void J422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y (8 = two pushes above, 4 = return address)
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+
+ convertloop:
+    READYUV422
+    YUVTORGB(kYuvJConstants)
+    STOREARGB
+
+    sub        ecx, 8               // 8 pixels handled per iteration
+    jg         convertloop          // loop while width remains; a partial tail still runs a full iteration
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels per loop iteration.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicate UV once more.
+__declspec(naked)
+void I411ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+  __asm {
+    push       ebx                   // saved because READYUV411 uses it as scratch
+    push       esi
+    push       edi
+    mov        eax, [esp + 12 + 4]   // Y (12 = three pushes above, 4 = return address)
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ecx, [esp + 12 + 20]  // width
+    sub        edi, esi              // edi = v_buf - u_buf (V kept as an offset from U)
+    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
+
+ convertloop:
+    READYUV411  // modifies EBX
+    YUVTORGB(kYuvConstants)
+    STOREARGB
+
+    sub        ecx, 8                // 8 pixels handled per iteration
+    jg         convertloop           // loop while width remains; a partial tail still runs a full iteration
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+// 8 pixels per loop iteration.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void NV12ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* uv_buf,
+                         uint8* dst_argb,
+                         int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    mov        eax, [esp + 4 + 4]   // Y (4 = one push above, 4 = return address)
+    mov        esi, [esp + 4 + 8]   // UV
+    mov        edx, [esp + 4 + 12]  // argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+
+ convertloop:
+    READNV12
+    YUVTORGB(kYuvConstants)
+    STOREARGB
+
+    sub        ecx, 8               // 8 pixels handled per iteration
+    jg         convertloop          // loop while width remains; a partial tail still runs a full iteration
+
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels per loop iteration.
+// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* uv_buf,
+                         uint8* dst_argb,
+                         int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    mov        eax, [esp + 4 + 4]   // Y (4 = one push above, 4 = return address)
+    mov        esi, [esp + 4 + 8]   // VU (the parameter is named uv_buf)
+    mov        edx, [esp + 4 + 12]  // argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+
+ convertloop:
+    READNV12                        // same reader as NV12ToARGBRow_SSSE3
+    YUVTORGB(kYvuConstants)         // only difference from NV12: Yvu constants - presumably these account for the swapped V/U order; confirm
+    STOREARGB
+
+    sub        ecx, 8               // 8 pixels handled per iteration
+    jg         convertloop          // loop while width remains; a partial tail still runs a full iteration
+
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // 8 pixels per loop: Y, U, V planes in, BGRA out; same shape as I422ToARGBRow_SSSE3 but stores with STOREBGRA.
+void I422ToBGRARow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_bgra,
+                         int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y (8 = two pushes above, 4 = return address)
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // bgra
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
+
+ convertloop:
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STOREBGRA
+
+    sub        ecx, 8               // 8 pixels handled per iteration
+    jg         convertloop          // loop while width remains; a partial tail still runs a full iteration
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // 8 pixels per loop: Y, U, V planes in, ABGR out; same shape as I422ToARGBRow_SSSE3 but stores with STOREABGR.
+void I422ToABGRRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_abgr,
+                         int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y (8 = two pushes above, 4 = return address)
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // abgr
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+
+ convertloop:
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STOREABGR
+
+    sub        ecx, 8               // 8 pixels handled per iteration
+    jg         convertloop          // loop while width remains; a partial tail still runs a full iteration
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // 8 pixels per loop: Y, U, V planes in, RGBA out; same shape as I422ToARGBRow_SSSE3 but stores with STORERGBA.
+void I422ToRGBARow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_rgba,
+                         int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y (8 = two pushes above, 4 = return address)
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // rgba
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
+
+ convertloop:
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STORERGBA
+
+    sub        ecx, 8               // 8 pixels handled per iteration
+    jg         convertloop          // loop while width remains; a partial tail still runs a full iteration
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#endif  // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_I400TOARGBROW_SSE2
+// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes) per loop; the scaled Y is copied to B, G and R and alpha is forced to 0xff.
+__declspec(naked)
+void I400ToARGBRow_SSE2(const uint8* y_buf,
+                        uint8* rgb_buf,
+                        int width) {
+  __asm {
+    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
+    movd       xmm2, eax
+    pshufd     xmm2, xmm2,0         // broadcast the multiplier to all 8 words
+    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    movd       xmm3, eax
+    pshufd     xmm3, xmm3, 0        // broadcast the bias to all 8 words
+    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
+    pslld      xmm4, 24
+
+    mov        eax, [esp + 4]       // Y
+    mov        edx, [esp + 8]       // rgb
+    mov        ecx, [esp + 12]      // width
+
+ convertloop:
+    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+    movq       xmm0, qword ptr [eax]  // load 8 Y bytes
+    lea        eax, [eax + 8]
+    punpcklbw  xmm0, xmm0           // Y.Y
+    pmulhuw    xmm0, xmm2           // high 16 bits of the unsigned product
+    psubusw    xmm0, xmm3           // unsigned saturating subtract: clamps at 0
+    psrlw      xmm0, 6
+    packuswb   xmm0, xmm0           // G
+
+    // Step 2: Weave into ARGB
+    punpcklbw  xmm0, xmm0           // GG
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
+    por        xmm0, xmm4           // set alpha byte to 0xff
+    por        xmm1, xmm4
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8               // 8 pixels handled per iteration
+    jg         convertloop          // a partial tail still runs a full 8-pixel iteration
+    ret
+  }
+}
+#endif  // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes) per loop iteration.
+// note: vpunpcklbw mutates and vpackuswb unmutates ("mutate" here means the per-128-bit-lane reordering these instructions cause).
+__declspec(naked)
+void I400ToARGBRow_AVX2(const uint8* y_buf,
+                        uint8* rgb_buf,
+                        int width) {
+  __asm {
+    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
+    vmovd      xmm2, eax
+    vbroadcastss ymm2, xmm2         // broadcast the multiplier to all 16 words
+    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    vmovd      xmm3, eax
+    vbroadcastss ymm3, xmm3         // broadcast the bias to all 16 words
+    vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
+    vpslld     ymm4, ymm4, 24
+
+    mov        eax, [esp + 4]       // Y
+    mov        edx, [esp + 8]       // rgb
+    mov        ecx, [esp + 12]      // width
+
+ convertloop:
+    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+    vmovdqu    xmm0, [eax]                // load 16 Y bytes
+    lea        eax, [eax + 16]
+    vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
+    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
+    vpmulhuw   ymm0, ymm0, ymm2
+    vpsubusw   ymm0, ymm0, ymm3
+    vpsrlw     ymm0, ymm0, 6
+    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
+
+    // TODO(fbarchard): Weave alpha with unpack.
+    // Step 2: Weave into ARGB
+    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
+    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
+    vpor       ymm0, ymm0, ymm4           // set alpha byte to 0xff
+    vpor       ymm1, ymm1, ymm4
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx,  [edx + 64]
+    sub        ecx, 16                    // 16 pixels handled per iteration
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes of a 16-byte vector (pshufb mask).
+static const uvec8 kShuffleMirror = {
+  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+// Reverses a row of bytes, 16 per loop: reads src from its end backwards and writes dst forwards. TODO(fbarchard): Replace lea with -16 offset.
+__declspec(naked)
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    movdqa    xmm5, kShuffleMirror
+
+ convertloop:
+    movdqu    xmm0, [eax - 16 + ecx]  // last 16 source bytes not yet mirrored
+    pshufb    xmm0, xmm5              // reverse their byte order
+    movdqu    [edx], xmm0
+    lea       edx, [edx + 16]
+    sub       ecx, 16                 // shrinks the remaining width and moves the read window back
+    jg        convertloop
+    ret
+  }
+}
+#endif  // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+__declspec(naked)  // Reverses a row of bytes, 32 per loop: reads src from its end backwards and writes dst forwards.
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    vbroadcastf128 ymm5, kShuffleMirror  // same 16-byte reverse mask in both 128-bit lanes
+
+ convertloop:
+    vmovdqu   ymm0, [eax - 32 + ecx]  // last 32 source bytes not yet mirrored
+    vpshufb   ymm0, ymm0, ymm5        // reverse bytes within each 128-bit lane
+    vpermq    ymm0, ymm0, 0x4e  // swap high and low halfs
+    vmovdqu   [edx], ymm0
+    lea       edx, [edx + 32]
+    sub       ecx, 32                 // shrinks the remaining width and moves the read window back
+    jg        convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSE2
+__declspec(naked)  // Reverses a row of bytes, 16 per loop, using only SSE2 shifts and shuffles (no pshufb).
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+
+ convertloop:
+    movdqu    xmm0, [eax - 16 + ecx]  // last 16 source bytes not yet mirrored
+    movdqa    xmm1, xmm0        // swap bytes
+    psllw     xmm0, 8           // low byte of each word -> high byte
+    psrlw     xmm1, 8           // high byte of each word -> low byte
+    por       xmm0, xmm1
+    pshuflw   xmm0, xmm0, 0x1b  // swap words
+    pshufhw   xmm0, xmm0, 0x1b
+    pshufd    xmm0, xmm0, 0x4e  // swap qwords
+    movdqu    [edx], xmm0
+    lea       edx, [edx + 16]
+    sub       ecx, 16           // shrinks the remaining width and moves the read window back
+    jg        convertloop
+    ret
+  }
+}
+#endif  // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels: even source bytes reversed into the low 8 bytes, odd source bytes reversed into the high 8.
+static const uvec8 kShuffleMirrorUV = {
+  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+
+__declspec(naked)  // Mirrors an interleaved UV row into separate U and V rows, 8 UV pairs per loop; width counts pairs.
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+                       int width) {
+  __asm {
+    push      edi                  // saved here, restored before ret
+    mov       eax, [esp + 4 + 4]   // src
+    mov       edx, [esp + 4 + 8]   // dst_u
+    mov       edi, [esp + 4 + 12]  // dst_v
+    mov       ecx, [esp + 4 + 16]  // width
+    movdqa    xmm1, kShuffleMirrorUV
+    lea       eax, [eax + ecx * 2 - 16]  // last 16 bytes (8 UV pairs) of the source row
+    sub       edi, edx                   // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ convertloop:
+    movdqu    xmm0, [eax]
+    lea       eax, [eax - 16]            // step backwards through the source
+    pshufb    xmm0, xmm1                 // low 8 bytes = reversed U, high 8 bytes = reversed V
+    movlpd    qword ptr [edx], xmm0          // 8 U bytes
+    movhpd    qword ptr [edx + edi], xmm0    // 8 V bytes
+    lea       edx, [edx + 8]
+    sub       ecx, 8
+    jg        convertloop
+
+    pop       edi
+    ret
+  }
+}
+#endif  // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+__declspec(naked)  // Reverses the order of 4-byte pixels in a row, 4 pixels per loop; width is in pixels.
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
+
+ convertloop:
+    movdqu    xmm0, [eax]
+    lea       eax, [eax - 16]        // step backwards through the source
+    pshufd    xmm0, xmm0, 0x1b       // reverse the 4 dwords; bytes inside each pixel keep their order
+    movdqu    [edx], xmm0
+    lea       edx, [edx + 16]
+    sub       ecx, 4
+    jg        convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Permute table (vpermd indices) for reversing the order of the 8 dwords, i.e. 8 ARGB pixels.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked)  // Reverses the order of 4-byte pixels in a row, 8 pixels per loop; width is in pixels.
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    vmovdqu   ymm5, kARGBShuffleMirror_AVX2
+
+ convertloop:
+    vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
+    vmovdqu   [edx], ymm0
+    lea       edx, [edx + 32]
+    sub       ecx, 8           // shrinks the remaining width and moves the read window back
+    jg        convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+__declspec(naked)  // De-interleaves a UV row: even bytes go to dst_u, odd bytes to dst_v; 16 pairs (32 source bytes) per loop.
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi                   // saved here, restored before ret
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm0
+    movdqa     xmm3, xmm1
+    pand       xmm0, xmm5   // even bytes
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1   // 16 even bytes packed together
+    psrlw      xmm2, 8      // odd bytes
+    psrlw      xmm3, 8
+    packuswb   xmm2, xmm3   // 16 odd bytes packed together
+    movdqu     [edx], xmm0
+    movdqu     [edx + edi], xmm2
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+#endif  // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_SPLITUVROW_AVX2
+__declspec(naked)  // De-interleaves a UV row: even bytes go to dst_u, odd bytes to dst_v; 32 pairs (64 source bytes) per loop.
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi                   // saved here, restored before ret
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm2, ymm0, 8      // odd bytes
+    vpsrlw     ymm3, ymm1, 8
+    vpand      ymm0, ymm0, ymm5   // even bytes
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // packs per 128-bit lane, leaving qwords in 0,2,1,3 order
+    vpackuswb  ymm2, ymm2, ymm3
+    vpermq     ymm0, ymm0, 0xd8   // restore linear qword order
+    vpermq     ymm2, ymm2, 0xd8
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + edi], ymm2
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+__declspec(naked)  // Interleaves a U row and a V row into UVUV... pairs; 16 pairs (32 output bytes) per loop.
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  __asm {
+    push       edi                   // saved here, restored before ret
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax              // edx = src_v - src_u, so [eax + edx] addresses V
+
+  convertloop:
+    movdqu     xmm0, [eax]      // read 16 U's
+    movdqu     xmm1, [eax + edx]  // and 16 V's
+    lea        eax,  [eax + 16]
+    movdqa     xmm2, xmm0
+    punpcklbw  xmm0, xmm1       // first 8 UV pairs
+    punpckhbw  xmm2, xmm1       // next 8 UV pairs
+    movdqu     [edi], xmm0
+    movdqu     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+#endif  //  HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+__declspec(naked)  // Interleaves a U row and a V row into UVUV... pairs; 32 pairs (64 output bytes) per loop.
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  __asm {
+    push       edi                   // saved here, restored before ret
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax              // edx = src_v - src_u, so [eax + edx] addresses V
+
+  convertloop:
+    vmovdqu    ymm0, [eax]           // read 32 U's
+    vmovdqu    ymm1, [eax + edx]     // and 32 V's
+    lea        eax,  [eax + 32]
+    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
+    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
+    vextractf128 [edi], ymm2, 0       // bytes 0..15
+    vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
+    vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
+    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
+    lea        edi, [edi + 64]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+#endif  //  HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time; a partial tail still copies a full 32 bytes.
+__declspec(naked)
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+
+  convertloop:
+    movdqu     xmm0, [eax]       // unaligned loads, so src needs no particular alignment
+    movdqu     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    movdqu     [edx], xmm0       // unaligned stores, so dst needs no particular alignment
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+// CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time; a partial tail still copies a full 64 bytes.
+__declspec(naked)
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+
+  convertloop:
+    vmovdqu    ymm0, [eax]       // unaligned loads, so src needs no particular alignment
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax, [eax + 64]
+    vmovdqu    [edx], ymm0       // unaligned stores, so dst needs no particular alignment
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
+    sub        ecx, 64
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_COPYROW_AVX
+
+// Copies 'count' bytes with rep movsb; count may be any value (multiple of 1).
+__declspec(naked)
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov        eax, esi         // preserve esi in a scratch register instead of on the stack
+    mov        edx, edi         // preserve edi likewise
+    mov        esi, [esp + 4]   // src
+    mov        edi, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    rep movsb                   // copy ecx bytes from [esi] to [edi]
+    mov        edi, edx         // restore edi
+    mov        esi, eax         // restore esi
+    ret
+  }
+}
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels. Copies the top (alpha) byte of each 4-byte src pixel into dst, leaving the other 3 dst bytes unchanged; 8 pixels per loop.
+__declspec(naked)
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    pslld      xmm0, 24
+    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    psrld      xmm1, 8
+
+  convertloop:
+    movdqu     xmm2, [eax]       // 4 src pixels
+    movdqu     xmm3, [eax + 16]  // next 4 src pixels
+    lea        eax, [eax + 32]
+    movdqu     xmm4, [edx]       // current dst pixels (read-modify-write)
+    movdqu     xmm5, [edx + 16]
+    pand       xmm2, xmm0        // keep only src alpha
+    pand       xmm3, xmm0
+    pand       xmm4, xmm1        // keep only the low 3 dst bytes
+    pand       xmm5, xmm1
+    por        xmm2, xmm4
+    por        xmm3, xmm5
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels. Copies the top (alpha) byte of each 4-byte src pixel into dst, leaving the other 3 dst bytes unchanged; 16 pixels per loop.
+__declspec(naked)
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // width
+    vpcmpeqb   ymm0, ymm0, ymm0
+    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+
+  convertloop:
+    vmovdqu    ymm1, [eax]       // 8 src pixels
+    vmovdqu    ymm2, [eax + 32]  // next 8 src pixels
+    lea        eax, [eax + 64]
+    vpblendvb  ymm1, ymm1, [edx], ymm0       // masked bytes (low 3 of each pixel) come from dst; alpha stays from src
+    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
+    vmovdqu    [edx], ymm1
+    vmovdqu    [edx + 32], ymm2
+    lea        edx, [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels. Writes each src byte (Y) into the top (alpha) byte of the matching 4-byte dst pixel, leaving the other 3 dst bytes unchanged; 8 pixels per loop.
+__declspec(naked)
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    pslld      xmm0, 24
+    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    psrld      xmm1, 8
+
+  convertloop:
+    movq       xmm2, qword ptr [eax]  // 8 Y's
+    lea        eax, [eax + 8]
+    punpcklbw  xmm2, xmm2        // each Y duplicated into a word
+    punpckhwd  xmm3, xmm2        // Y words 4..7 land in the high word of each dword; stale low words from xmm3 are masked off below
+    punpcklwd  xmm2, xmm2        // Y words 0..3 widened to dwords
+    movdqu     xmm4, [edx]       // current dst pixels (read-modify-write)
+    movdqu     xmm5, [edx + 16]
+    pand       xmm2, xmm0        // keep only the top byte (Y as alpha)
+    pand       xmm3, xmm0
+    pand       xmm4, xmm1        // keep only the low 3 dst bytes
+    pand       xmm5, xmm1
+    por        xmm2, xmm4
+    por        xmm3, xmm5
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels. Writes each src byte (Y) into the top (alpha) byte of the matching 4-byte dst pixel, leaving the other 3 dst bytes unchanged; 16 pixels per loop.
+__declspec(naked)
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // width
+    vpcmpeqb   ymm0, ymm0, ymm0
+    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+
+  convertloop:
+    vpmovzxbd  ymm1, qword ptr [eax]      // 8 Y bytes zero-extended to 8 dwords
+    vpmovzxbd  ymm2, qword ptr [eax + 8]  // next 8 Y bytes
+    lea        eax, [eax + 16]
+    vpslld     ymm1, ymm1, 24             // move Y into the top byte of each dword
+    vpslld     ymm2, ymm2, 24
+    vpblendvb  ymm1, ymm1, [edx], ymm0       // masked bytes (low 3 of each pixel) come from dst; top byte stays Y
+    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
+    vmovdqu    [edx], ymm1
+    vmovdqu    [edx + 32], ymm2
+    lea        edx, [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+// Write 'count' bytes using an 8 bit value repeated, stored 4 bytes at a time with rep stosd.
+// Count should be multiple of 4; count is divided by 4, so any remaining 1-3 bytes are not written.
+__declspec(naked)
+void SetRow_X86(uint8* dst, uint8 v8, int count) {
+  __asm {
+    movzx      eax, byte ptr [esp + 8]    // v8
+    mov        edx, 0x01010101  // Duplicate byte to all bytes.
+    mul        edx              // overwrites edx with upper part of result.
+    mov        edx, edi         // preserve edi in a scratch register
+    mov        edi, [esp + 4]   // dst
+    mov        ecx, [esp + 12]  // count
+    shr        ecx, 2           // bytes -> dwords
+    rep stosd                   // store eax to [edi], ecx times
+    mov        edi, edx         // restore edi
+    ret
+  }
+}
+
+// Write 'count' bytes using an 8 bit value repeated, one byte at a time with rep stosb (any count).
+__declspec(naked)
+void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
+  __asm {
+    mov        edx, edi         // preserve edi in a scratch register
+    mov        edi, [esp + 4]   // dst
+    mov        eax, [esp + 8]   // v8 (only the low byte, al, is used by stosb)
+    mov        ecx, [esp + 12]  // count
+    rep stosb                   // store al to [edi], ecx times
+    mov        edi, edx         // restore edi
+    ret
+  }
+}
+
+// Write 'count' 32 bit values (v32 repeated) to dst_argb with rep stosd.
+__declspec(naked)
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+  __asm {
+    mov        edx, edi         // preserve edi in a scratch register
+    mov        edi, [esp + 4]   // dst
+    mov        eax, [esp + 8]   // v32
+    mov        ecx, [esp + 12]  // count
+    rep stosd                   // store eax to [edi], ecx times
+    mov        edi, edx         // restore edi
+    ret
+  }
+}
+#endif  // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_AVX2
+__declspec(naked)  // Extracts the Y bytes (the even bytes) from a YUY2 row; 32 pixels (64 source bytes) per loop.
+void YUY2ToYRow_AVX2(const uint8* src_yuy2,
+                     uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_yuy2
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpand      ymm0, ymm0, ymm5   // even bytes are Y
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8   // restore linear qword order after the per-lane pack
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked)  // Extracts U and V from two YUY2 rows (src and src + stride_yuy2), averaging them vertically; 32 pixels in, 16 U + 16 V bytes out per loop.
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       esi                   // saved here, restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_yuy2
+    mov        esi, [esp + 8 + 8]    // stride_yuy2
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vpavgb     ymm0, ymm0, [eax + esi]       // average with the row stride_yuy2 bytes further on
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8   // restore linear qword order after the per-lane pack
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked)  // No prologue/epilogue: args are read from [esp + N].
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi                   // saved here; one push: args shift by 4
+    mov        eax, [esp + 4 + 4]    // src_yuy2
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u; V is [edx + edi]
+
+  convertloop:
+    vmovdqu    ymm0, [eax]        // single source row: no vertical averaging
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8   // qword order 0,2,1,3 undoes the mutation
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]   // 16 U and 16 V bytes written per pass
+    sub        ecx, 32           // 32 pixels consumed
+    jg         convertloop       // tested after the body: runs at least once
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked)  // No prologue/epilogue: args are read from [esp + N].
+void UYVYToYRow_AVX2(const uint8* src_uyvy,
+                     uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_uyvy
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // pix
+
+  convertloop:
+    vmovdqu    ymm0, [eax]        // 64 source bytes per pass
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8   // qword order 0,2,1,3 undoes the mutation
+    vmovdqu    [edx], ymm0        // 32 Y bytes out
+    lea        edx, [edx + 32]
+    sub        ecx, 32            // 32 pixels consumed
+    jg         convertloop        // tested after the body: runs at least once
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked)  // No prologue/epilogue: args are read from [esp + N].
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       esi                   // saved here, restored before ret
+    push       edi                   // two pushes: args shift by 8
+    mov        eax, [esp + 8 + 4]    // src_uyvy
+    mov        esi, [esp + 8 + 8]    // stride_uyvy
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u; V is [edx + edi]
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vpavgb     ymm0, ymm0, [eax + esi]       // average with row at src + stride
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    lea        eax,  [eax + 64]
+    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8   // qword order 0,2,1,3 undoes the mutation
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]   // 16 U and 16 V bytes written per pass
+    sub        ecx, 32           // 32 pixels consumed
+    jg         convertloop       // tested after the body: runs at least once
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked)  // No prologue/epilogue: args are read from [esp + N].
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi                   // saved here; one push: args shift by 4
+    mov        eax, [esp + 4 + 4]    // src_uyvy
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u; V is [edx + edi]
+
+  convertloop:
+    vmovdqu    ymm0, [eax]        // single source row: no vertical averaging
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8   // qword order 0,2,1,3 undoes the mutation
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]   // 16 U and 16 V bytes written per pass
+    sub        ecx, 32           // 32 pixels consumed
+    jg         convertloop       // tested after the body: runs at least once
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_YUY2TOYROW_SSE2
+__declspec(naked)  // No prologue/epilogue: args are read from [esp + N].
+void YUY2ToYRow_SSE2(const uint8* src_yuy2,
+                     uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_yuy2
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // pix
+    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+  convertloop:
+    movdqu     xmm0, [eax]       // 32 source bytes per pass
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm5   // even bytes are Y
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0       // 16 Y bytes out
+    lea        edx, [edx + 16]
+    sub        ecx, 16           // 16 pixels consumed
+    jg         convertloop       // tested after the body: runs at least once
+    ret
+  }
+}
+
+__declspec(naked)  // No prologue/epilogue: args are read from [esp + N].
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       esi                   // saved here, restored before ret
+    push       edi                   // two pushes: args shift by 8
+    mov        eax, [esp + 8 + 4]    // src_yuy2
+    mov        esi, [esp + 8 + 8]    // stride_yuy2
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u; V is [edx + edi]
+
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]       // row at src + stride
+    movdqu     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2              // average the two rows
+    pavgb      xmm1, xmm3
+    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0        // 8 U bytes
+    movq       qword ptr [edx + edi], xmm1  // 8 V bytes
+    lea        edx, [edx + 8]
+    sub        ecx, 16     // 16 pixels consumed
+    jg         convertloop // tested after the body: runs at least once
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // No prologue/epilogue: args are read from [esp + N].
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi                   // saved here; one push: args shift by 4
+    mov        eax, [esp + 4 + 4]    // src_yuy2
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u; V is [edx + edi]
+
+  convertloop:
+    movdqu     xmm0, [eax]  // single source row: no vertical averaging
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0        // 8 U bytes
+    movq       qword ptr [edx + edi], xmm1  // 8 V bytes
+    lea        edx, [edx + 8]
+    sub        ecx, 16     // 16 pixels consumed
+    jg         convertloop // tested after the body: runs at least once
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked)  // No prologue/epilogue: args are read from [esp + N].
+void UYVYToYRow_SSE2(const uint8* src_uyvy,
+                     uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_uyvy
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // pix
+
+  convertloop:
+    movdqu     xmm0, [eax]       // 32 source bytes per pass
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8    // odd bytes are Y
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0       // 16 Y bytes out
+    lea        edx, [edx + 16]
+    sub        ecx, 16           // 16 pixels consumed
+    jg         convertloop       // tested after the body: runs at least once
+    ret
+  }
+}
+
+__declspec(naked)  // No prologue/epilogue: args are read from [esp + N].
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       esi                   // saved here, restored before ret
+    push       edi                   // two pushes: args shift by 8
+    mov        eax, [esp + 8 + 4]    // src_uyvy
+    mov        esi, [esp + 8 + 8]    // stride_uyvy
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u; V is [edx + edi]
+
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]       // row at src + stride
+    movdqu     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2              // average the two rows
+    pavgb      xmm1, xmm3
+    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0        // 8 U bytes
+    movq       qword ptr [edx + edi], xmm1  // 8 V bytes
+    lea        edx, [edx + 8]
+    sub        ecx, 16     // 16 pixels consumed
+    jg         convertloop // tested after the body: runs at least once
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // No prologue/epilogue: args are read from [esp + N].
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi                   // saved here; one push: args shift by 4
+    mov        eax, [esp + 4 + 4]    // src_uyvy
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u; V is [edx + edi]
+
+  convertloop:
+    movdqu     xmm0, [eax]  // single source row: no vertical averaging
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0        // 8 U bytes
+    movq       qword ptr [edx + edi], xmm1  // 8 V bytes
+    lea        edx, [edx + 8]
+    sub        ecx, 16     // 16 pixels consumed
+    jg         convertloop // tested after the body: runs at least once
+
+    pop        edi
+    ret
+  }
+}
+#endif  // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 4 pixels at a time, then 1 pixel at a time for any remainder.
+__declspec(naked)
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  __asm {
+    push       esi                  // saved here; one push: args shift by 4
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm7, xmm7       // generate constant 1
+    psrlw      xmm7, 15
+    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
+    psrlw      xmm6, 8
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    psllw      xmm5, 8
+    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pslld      xmm4, 24
+    sub        ecx, 4
+    jl         convertloop4b    // less than 4 pixels?
+
+    // 4 pixel loop.
+  convertloop4:
+    movdqu     xmm3, [eax]      // src argb
+    lea        eax, [eax + 16]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movdqu     xmm2, [esi]      // _r_b
+    psrlw      xmm3, 8          // alpha
+    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+    pshuflw    xmm3, xmm3, 0F5h
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
+    movdqu     xmm1, [esi]      // _a_g
+    lea        esi, [esi + 16]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jge        convertloop4
+
+  convertloop4b:
+    add        ecx, 4 - 1       // ecx = remaining pixels - 1
+    jl         convertloop1b    // nothing left?
+
+    // 1 pixel loop.
+  convertloop1:
+    movd       xmm3, [eax]      // src argb
+    lea        eax, [eax + 4]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movd       xmm2, [esi]      // _r_b
+    psrlw      xmm3, 8          // alpha
+    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+    pshuflw    xmm3, xmm3, 0F5h
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
+    movd       xmm1, [esi]      // _a_g
+    lea        esi, [esi + 4]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    sub        ecx, 1
+    jge        convertloop1
+
+  convertloop1b:
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBBLENDROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static const uvec8 kShuffleAlpha = {
+  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+// Same as SSE2, but replaces:
+//    psrlw      xmm3, 8          // alpha
+//    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+//    pshuflw    xmm3, xmm3, 0F5h
+// with..
+//    pshufb     xmm3, kShuffleAlpha // alpha
+// Blend 4 pixels at a time, then 1 pixel at a time for any remainder.
+
+__declspec(naked)
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width) {
+  __asm {
+    push       esi                  // saved here; one push: args shift by 4
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
+    psrlw      xmm7, 15
+    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
+    psrlw      xmm6, 8
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    psllw      xmm5, 8
+    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pslld      xmm4, 24
+    sub        ecx, 4
+    jl         convertloop4b    // less than 4 pixels?
+
+    // 4 pixel loop.
+  convertloop4:
+    movdqu     xmm3, [eax]      // src argb
+    lea        eax, [eax + 16]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movdqu     xmm2, [esi]      // _r_b
+    pshufb     xmm3, kShuffleAlpha // alpha
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
+    movdqu     xmm1, [esi]      // _a_g
+    lea        esi, [esi + 16]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jge        convertloop4
+
+  convertloop4b:
+    add        ecx, 4 - 1       // ecx = remaining pixels - 1
+    jl         convertloop1b    // nothing left?
+
+    // 1 pixel loop.
+  convertloop1:
+    movd       xmm3, [eax]      // src argb
+    lea        eax, [eax + 4]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movd       xmm2, [esi]      // _r_b
+    pshufb     xmm3, kShuffleAlpha // alpha
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
+    movd       xmm1, [esi]      // _a_g
+    lea        esi, [esi + 4]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    sub        ecx, 1
+    jge        convertloop1
+
+  convertloop1b:
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+// Attenuate 4 pixels at a time.
+__declspec(naked)
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pslld      xmm4, 24
+    pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
+    psrld      xmm5, 8
+
+ convertloop:
+    movdqu     xmm0, [eax]      // read 4 pixels
+    punpcklbw  xmm0, xmm0       // first 2
+    pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
+    pshuflw    xmm2, xmm2, 0FFh
+    pmulhuw    xmm0, xmm2       // rgb * a
+    movdqu     xmm1, [eax]      // read 4 pixels
+    punpckhbw  xmm1, xmm1       // next 2 pixels
+    pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
+    pshuflw    xmm2, xmm2, 0FFh
+    pmulhuw    xmm1, xmm2       // rgb * a
+    movdqu     xmm2, [eax]      // alphas
+    lea        eax, [eax + 16]
+    psrlw      xmm0, 8
+    pand       xmm2, xmm4       // isolate original alpha bytes
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    pand       xmm0, xmm5       // keep original alphas
+    por        xmm0, xmm2
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4           // 4 pixels consumed
+    jg         convertloop      // tested after the body: runs at least once
+
+    ret
+  }
+}
+#endif  // HAS_ARGBATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha0 = {
+  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+static const uvec8 kShuffleAlpha1 = {
+  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+__declspec(naked)  // Attenuates 4 pixels per pass; original alpha is kept.
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
+    pslld      xmm3, 24
+    movdqa     xmm4, kShuffleAlpha0
+    movdqa     xmm5, kShuffleAlpha1
+
+ convertloop:
+    movdqu     xmm0, [eax]      // read 4 pixels
+    pshufb     xmm0, xmm4       // isolate first 2 alphas
+    movdqu     xmm1, [eax]      // read 4 pixels
+    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
+    pmulhuw    xmm0, xmm1       // rgb * a
+    movdqu     xmm1, [eax]      // read 4 pixels
+    pshufb     xmm1, xmm5       // isolate next 2 alphas
+    movdqu     xmm2, [eax]      // read 4 pixels
+    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
+    pmulhuw    xmm1, xmm2       // rgb * a
+    movdqu     xmm2, [eax]      // mask original alpha
+    lea        eax, [eax + 16]
+    pand       xmm2, xmm3
+    psrlw      xmm0, 8
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    por        xmm0, xmm2       // copy original alpha
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4           // 4 pixels consumed
+    jg         convertloop      // tested after the body: runs at least once
+
+    ret
+  }
+}
+#endif  // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha_AVX2 = {
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+};
+__declspec(naked)  // Attenuates 8 pixels per pass; original alpha is kept.
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax         // edx = dst - src; dst is [eax + edx]
+    vbroadcastf128 ymm4,kShuffleAlpha_AVX2
+    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
+    vpslld     ymm5, ymm5, 24
+
+ convertloop:
+    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
+    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
+    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
+    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
+    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
+    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
+    vpand      ymm6, ymm6, ymm5  // isolate alpha
+    vpsrlw     ymm0, ymm0, 8
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vpor       ymm0, ymm0, ymm6  // copy original alpha
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    sub        ecx, 8            // 8 pixels consumed
+    jg         convertloop       // tested after the body: runs at least once
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+__declspec(naked)
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    push       edi                  // two pushes: args shift by 8
+    mov        eax, [esp + 8 + 4]   // src_argb0
+    mov        edx, [esp + 8 + 8]   // dst_argb
+    mov        ecx, [esp + 8 + 12]  // width
+
+ convertloop:
+    movdqu     xmm0, [eax]      // read 4 pixels
+    movzx      esi, byte ptr [eax + 3]  // first alpha
+    movzx      edi, byte ptr [eax + 7]  // second alpha
+    punpcklbw  xmm0, xmm0       // first 2
+    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]  // table entry for alpha
+    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
+    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
+    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    movlhps    xmm2, xmm3
+    pmulhuw    xmm0, xmm2       // rgb * a
+
+    movdqu     xmm1, [eax]      // read 4 pixels
+    movzx      esi, byte ptr [eax + 11]  // third alpha
+    movzx      edi, byte ptr [eax + 15]  // fourth alpha
+    punpckhbw  xmm1, xmm1       // next 2
+    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
+    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
+    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
+    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    movlhps    xmm2, xmm3
+    pmulhuw    xmm1, xmm2       // rgb * a
+    lea        eax, [eax + 16]
+
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4           // 4 pixels consumed
+    jg         convertloop      // tested after the body: runs at least once
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
+// USE_GATHER is not on by default, due to being a slow instruction.
+#ifdef USE_GATHER
+__declspec(naked)  // Gather variant: unattenuates 8 pixels per pass.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax         // edx = dst - src; dst is [eax + edx]
+    vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
+
+ convertloop:
+    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
+    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
+    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
+    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
+    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
+    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
+    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
+    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
+    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
+    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
+    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
+    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    sub        ecx, 8            // 8 pixels consumed
+    jg         convertloop       // tested after the body: runs at least once
+
+    vzeroupper
+    ret
+  }
+}
+#else  // USE_GATHER
+__declspec(naked)  // Non-gather variant: 8 scalar table loads per pass.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  __asm {
+
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax         // edx = dst - src; dst is [eax + edx]
+    vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
+
+    push       esi              // pushed after the arg loads: no offset fixup
+    push       edi
+
+ convertloop:
+    // replace VPGATHER
+    movzx      esi, byte ptr [eax + 3]                 // alpha0
+    movzx      edi, byte ptr [eax + 7]                 // alpha1
+    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
+    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
+    movzx      esi, byte ptr [eax + 11]                // alpha2
+    movzx      edi, byte ptr [eax + 15]                // alpha3
+    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
+    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
+    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
+    movzx      esi, byte ptr [eax + 19]                // alpha4
+    movzx      edi, byte ptr [eax + 23]                // alpha5
+    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
+    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
+    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
+    movzx      esi, byte ptr [eax + 27]                // alpha6
+    movzx      edi, byte ptr [eax + 31]                // alpha7
+    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
+    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
+    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
+    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
+    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
+    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
+    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+    // end of VPGATHER
+
+    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
+    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
+    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
+    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
+    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
+    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
+    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
+    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
+    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    sub        ecx, 8            // 8 pixels consumed
+    jg         convertloop       // tested after the body: runs at least once
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // USE_GATHER
+#endif  // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+__declspec(naked)
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_argb */
+    mov        ecx, [esp + 12]  /* width */
+    movdqa     xmm4, kARGBToYJ
+    movdqa     xmm5, kAddYJ64
+
+ convertloop:
+    movdqu     xmm0, [eax]  // G
+    movdqu     xmm1, [eax + 16]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    phaddw     xmm0, xmm1
+    paddw      xmm0, xmm5  // Add .5 for rounding.
+    psrlw      xmm0, 7
+    packuswb   xmm0, xmm0   // 8 G bytes
+    movdqu     xmm2, [eax]  // A
+    movdqu     xmm3, [eax + 16]
+    lea        eax, [eax + 32]
+    psrld      xmm2, 24
+    psrld      xmm3, 24
+    packuswb   xmm2, xmm3
+    packuswb   xmm2, xmm2   // 8 A bytes
+    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
+    punpcklbw  xmm0, xmm0   // 8 GG words
+    punpcklbw  xmm3, xmm2   // 8 GA words
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm3   // GGGA first 4
+    punpckhwd  xmm1, xmm3   // GGGA next 4
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8       // 8 pixels consumed
+    jg         convertloop  // tested after the body: runs at least once
+    ret
+  }
+}
+#endif  // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone.
+static const vec8 kARGBToSepiaB = {
+  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static const vec8 kARGBToSepiaG = {
+  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static const vec8 kARGBToSepiaR = {
+  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+__declspec(naked)
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* dst_argb */
+    mov        ecx, [esp + 8]   /* width */
+    movdqa     xmm2, kARGBToSepiaB
+    movdqa     xmm3, kARGBToSepiaG
+    movdqa     xmm4, kARGBToSepiaR
+
+ convertloop:
+    movdqu     xmm0, [eax]  // B
+    movdqu     xmm6, [eax + 16]
+    pmaddubsw  xmm0, xmm2
+    pmaddubsw  xmm6, xmm2
+    phaddw     xmm0, xmm6
+    psrlw      xmm0, 7
+    packuswb   xmm0, xmm0   // 8 B values
+    movdqu     xmm5, [eax]  // G
+    movdqu     xmm1, [eax + 16]
+    pmaddubsw  xmm5, xmm3
+    pmaddubsw  xmm1, xmm3
+    phaddw     xmm5, xmm1
+    psrlw      xmm5, 7
+    packuswb   xmm5, xmm5   // 8 G values
+    punpcklbw  xmm0, xmm5   // 8 BG values
+    movdqu     xmm5, [eax]  // R
+    movdqu     xmm1, [eax + 16]
+    pmaddubsw  xmm5, xmm4
+    pmaddubsw  xmm1, xmm4
+    phaddw     xmm5, xmm1
+    psrlw      xmm5, 7
+    packuswb   xmm5, xmm5   // 8 R values
+    movdqu     xmm6, [eax]  // A
+    movdqu     xmm1, [eax + 16]
+    psrld      xmm6, 24
+    psrld      xmm1, 24
+    packuswb   xmm6, xmm1
+    packuswb   xmm6, xmm6   // 8 A values
+    punpcklbw  xmm5, xmm6   // 8 RA values
+    movdqa     xmm1, xmm0   // Weave BG, RA together
+    punpcklwd  xmm0, xmm5   // BGRA first 4
+    punpckhwd  xmm1, xmm5   // BGRA next 4
+    movdqu     [eax], xmm0  // written back over the source pixels
+    movdqu     [eax + 16], xmm1
+    lea        eax, [eax + 32]
+    sub        ecx, 8       // 8 pixels consumed
+    jg         convertloop  // tested after the body: runs at least once
+    ret
+  }
+}
+#endif  // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) per iteration with a color matrix of 16 signed bytes (4 rows: B, G, R, A).
+// Same structure as Sepia except the matrix is provided, sums are shifted by 6 and alpha is computed too.
+// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
+// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
+__declspec(naked)
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_argb */
+    mov        ecx, [esp + 12]  /* matrix_argb */
+    movdqu     xmm5, [ecx]      // load all 16 coefficients
+    pshufd     xmm2, xmm5, 0x00  // broadcast row 0 (coefficients for B)
+    pshufd     xmm3, xmm5, 0x55  // broadcast row 1 (coefficients for G)
+    pshufd     xmm4, xmm5, 0xaa  // broadcast row 2 (coefficients for R)
+    pshufd     xmm5, xmm5, 0xff  // broadcast row 3 (coefficients for A)
+    mov        ecx, [esp + 16]  /* width */
+
+ convertloop:
+    movdqu     xmm0, [eax]  // B
+    movdqu     xmm7, [eax + 16]
+    pmaddubsw  xmm0, xmm2   // unsigned pixel byte * signed coefficient, pairs summed
+    pmaddubsw  xmm7, xmm2
+    movdqu     xmm6, [eax]  // G
+    movdqu     xmm1, [eax + 16]
+    pmaddubsw  xmm6, xmm3
+    pmaddubsw  xmm1, xmm3
+    phaddsw    xmm0, xmm7   // B
+    phaddsw    xmm6, xmm1   // G
+    psraw      xmm0, 6      // B
+    psraw      xmm6, 6      // G
+    packuswb   xmm0, xmm0   // 8 B values
+    packuswb   xmm6, xmm6   // 8 G values
+    punpcklbw  xmm0, xmm6   // 8 BG values
+    movdqu     xmm1, [eax]  // R
+    movdqu     xmm7, [eax + 16]
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm7, xmm4
+    phaddsw    xmm1, xmm7   // R
+    movdqu     xmm6, [eax]  // A
+    movdqu     xmm7, [eax + 16]
+    pmaddubsw  xmm6, xmm5
+    pmaddubsw  xmm7, xmm5
+    phaddsw    xmm6, xmm7   // A
+    psraw      xmm1, 6      // R
+    psraw      xmm6, 6      // A
+    packuswb   xmm1, xmm1   // 8 R values
+    packuswb   xmm6, xmm6   // 8 A values
+    punpcklbw  xmm1, xmm6   // 8 RA values
+    movdqa     xmm6, xmm0   // Weave BG, RA together
+    punpcklwd  xmm0, xmm1   // BGRA first 4
+    punpckhwd  xmm6, xmm1   // BGRA next 4
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm6
+    lea        eax, [eax + 32]
+    lea        edx, [edx + 32]
+    sub        ecx, 8       // 8 pixels done; loop while width remains > 0
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes) per iteration, in place: B, G, R = (v * scale >> 16) * interval_size + interval_offset.
+__declspec(naked)
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  __asm {
+    mov        eax, [esp + 4]    /* dst_argb */
+    movd       xmm2, [esp + 8]   /* scale */
+    movd       xmm3, [esp + 12]  /* interval_size */
+    movd       xmm4, [esp + 16]  /* interval_offset */
+    mov        ecx, [esp + 20]   /* width */
+    pshuflw    xmm2, xmm2, 040h  // low word -> B, G, R lanes; upper word -> A lane
+    pshufd     xmm2, xmm2, 044h  // repeat the low 4 words for the second pixel
+    pshuflw    xmm3, xmm3, 040h
+    pshufd     xmm3, xmm3, 044h
+    pshuflw    xmm4, xmm4, 040h
+    pshufd     xmm4, xmm4, 044h
+    pxor       xmm5, xmm5  // constant 0
+    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
+    pslld      xmm6, 24
+
+ convertloop:
+    movdqu     xmm0, [eax]  // read 4 pixels
+    punpcklbw  xmm0, xmm5   // first 2 pixels
+    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
+    movdqu     xmm1, [eax]  // read 4 pixels
+    punpckhbw  xmm1, xmm5   // next 2 pixels
+    pmulhuw    xmm1, xmm2
+    pmullw     xmm0, xmm3   // * interval_size
+    movdqu     xmm7, [eax]  // read 4 pixels
+    pmullw     xmm1, xmm3
+    pand       xmm7, xmm6   // mask alpha
+    paddw      xmm0, xmm4   // + interval_offset
+    paddw      xmm1, xmm4
+    packuswb   xmm0, xmm1   // back to bytes with unsigned saturation
+    por        xmm0, xmm7   // OR the original alpha back in
+    movdqu     [eax], xmm0  // overwrite the source pixels
+    lea        eax, [eax + 16]
+    sub        ecx, 4       // 4 pixels done; loop while width remains > 0
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value: each channel is scaled by the matching byte of value.
+__declspec(naked)
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    movd       xmm2, [esp + 16]  // value
+    punpcklbw  xmm2, xmm2       // duplicate each value byte into a 16 bit word
+    punpcklqdq xmm2, xmm2       // repeat the 4 words for the second pixel
+
+ convertloop:
+    movdqu     xmm0, [eax]      // read 4 pixels
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm0       // first 2
+    punpckhbw  xmm1, xmm1       // next 2
+    pmulhuw    xmm0, xmm2       // argb * value
+    pmulhuw    xmm1, xmm2       // argb * value
+    psrlw      xmm0, 8          // keep the top byte of each product word
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1       // back to 4 ARGB pixels
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4           // 4 pixels done; loop while width remains > 0
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.  Each channel is multiplied independently.
+__declspec(naked)
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi                  // esi is used below; restored before ret
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pxor       xmm5, xmm5  // constant 0
+
+ convertloop:
+    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm1, xmm0
+    movdqu     xmm3, xmm2
+    punpcklbw  xmm0, xmm0         // first 2
+    punpckhbw  xmm1, xmm1         // next 2
+    punpcklbw  xmm2, xmm5         // first 2
+    punpckhbw  xmm3, xmm5         // next 2
+    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
+    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
+    lea        eax, [eax + 16]
+    lea        esi, [esi + 16]
+    packuswb   xmm0, xmm1         // back to 4 ARGB pixels
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4             // 4 pixels done; loop while width remains > 0
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together with unsigned saturation, 4 pixels at a time, then 1 pixel at a time for the rest.
+// TODO(fbarchard): Port this to posix, neon and other math functions.
+__declspec(naked)
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi                  // esi is used below; restored before ret
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+    sub        ecx, 4             // fewer than 4 pixels: skip the 4 pixel loop
+    jl         convertloop49
+
+ convertloop4:
+    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    lea        eax, [eax + 16]
+    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    lea        esi, [esi + 16]
+    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jge        convertloop4
+
+ convertloop49:
+    add        ecx, 4 - 1         // ecx = remaining pixels - 1
+    jl         convertloop19      // nothing left
+
+ convertloop1:
+    movd       xmm0, [eax]        // read 1 pixels from src_argb0
+    lea        eax, [eax + 4]
+    movd       xmm1, [esi]        // read 1 pixels from src_argb1
+    lea        esi, [esi + 4]
+    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    sub        ecx, 1
+    jge        convertloop1
+
+ convertloop19:
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1) with unsigned saturation, 4 pixels at a time.
+__declspec(naked)
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi                  // esi is used below; restored before ret
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+ convertloop:
+    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    lea        eax, [eax + 16]
+    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    lea        esi, [esi + 16]
+    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4             // 4 pixels done; loop while width remains > 0
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.  Each channel is multiplied independently.
+__declspec(naked)
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi                  // esi is used below; restored before ret
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    vpxor      ymm5, ymm5, ymm5     // constant 0
+
+ convertloop:
+    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
+    lea        eax, [eax + 32]
+    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
+    lea        esi, [esi + 32]
+    vpunpcklbw ymm0, ymm1, ymm1   // low 4
+    vpunpckhbw ymm1, ymm1, ymm1   // high 4
+    vpunpcklbw ymm2, ymm3, ymm5   // low 4
+    vpunpckhbw ymm3, ymm3, ymm5   // high 4
+    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
+    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
+    vpackuswb  ymm0, ymm0, ymm1   // back to 8 ARGB pixels
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 8             // 8 pixels done; loop while width remains > 0
+    jg         convertloop
+
+    pop        esi
+    vzeroupper                    // clear upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together with unsigned saturation, 8 pixels at a time.
+__declspec(naked)
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi                  // esi is used below; restored before ret
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+ convertloop:
+    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    lea        eax, [eax + 32]
+    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
+    lea        esi, [esi + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 8                   // 8 pixels done; loop while width remains > 0
+    jg         convertloop
+
+    pop        esi
+    vzeroupper                          // clear upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1) with unsigned saturation, 8 pixels at a time.
+__declspec(naked)
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi                  // esi is used below; restored before ret
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+ convertloop:
+    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    lea        eax, [eax + 32]
+    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
+    lea        esi, [esi + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 8                   // 8 pixels done; loop while width remains > 0
+    jg         convertloop
+
+    pop        esi
+    vzeroupper                          // clear upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is (8 output bytes per iteration; the absolute value of the sum is stored, saturated to 8 bits)
+// -1  0  1
+// -2  0  2
+// -1  0  1
+__declspec(naked)
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_y0
+    mov        esi, [esp + 8 + 8]   // src_y1
+    mov        edi, [esp + 8 + 12]  // src_y2
+    mov        edx, [esp + 8 + 16]  // dst_sobelx
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        esi, eax             // make the other pointers offsets from eax,
+    sub        edi, eax             // so only eax is advanced in the loop
+    sub        edx, eax
+    pxor       xmm5, xmm5  // constant 0
+
+ convertloop:
+    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    punpcklbw  xmm0, xmm5                       // widen bytes to 16 bit words
+    punpcklbw  xmm1, xmm5
+    psubw      xmm0, xmm1                       // row 0: column 0 - column 2
+    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
+    punpcklbw  xmm1, xmm5
+    punpcklbw  xmm2, xmm5
+    psubw      xmm1, xmm2                       // row 1: column 0 - column 2
+    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
+    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
+    punpcklbw  xmm2, xmm5
+    punpcklbw  xmm3, xmm5
+    psubw      xmm2, xmm3                       // row 2: column 0 - column 2
+    paddw      xmm0, xmm2                       // row 0 + row 2
+    paddw      xmm0, xmm1                       // + row 1 twice (weight 2)
+    paddw      xmm0, xmm1
+    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    psubw      xmm1, xmm0
+    pmaxsw     xmm0, xmm1
+    packuswb   xmm0, xmm0   // saturate to 8 bytes
+    movq       qword ptr [eax + edx], xmm0      // store to dst_sobelx
+    lea        eax, [eax + 8]
+    sub        ecx, 8                           // 8 pixels done; loop while width remains > 0
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is (8 output bytes per iteration; the absolute value of the sum is stored, saturated to 8 bits)
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+__declspec(naked)
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_y0
+    mov        esi, [esp + 4 + 8]   // src_y1
+    mov        edx, [esp + 4 + 12]  // dst_sobely
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax             // make the other pointers offsets from eax,
+    sub        edx, eax             // so only eax is advanced in the loop
+    pxor       xmm5, xmm5  // constant 0
+
+ convertloop:
+    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    punpcklbw  xmm0, xmm5                       // widen bytes to 16 bit words
+    punpcklbw  xmm1, xmm5
+    psubw      xmm0, xmm1                       // column 0: src_y0 - src_y1
+    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
+    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
+    punpcklbw  xmm1, xmm5
+    punpcklbw  xmm2, xmm5
+    psubw      xmm1, xmm2                       // column 1: src_y0 - src_y1
+    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
+    punpcklbw  xmm2, xmm5
+    punpcklbw  xmm3, xmm5
+    psubw      xmm2, xmm3                       // column 2: src_y0 - src_y1
+    paddw      xmm0, xmm2                       // column 0 + column 2
+    paddw      xmm0, xmm1                       // + column 1 twice (weight 2)
+    paddw      xmm0, xmm1
+    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    psubw      xmm1, xmm0
+    pmaxsw     xmm0, xmm1
+    packuswb   xmm0, xmm0   // saturate to 8 bytes
+    movq       qword ptr [eax + edx], xmm0      // store to dst_sobely
+    lea        eax, [eax + 8]
+    sub        ecx, 8                           // 8 pixels done; loop while width remains > 0
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into ARGB, 16 pixels (64 output bytes) per iteration.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+__declspec(naked)
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_sobelx
+    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax             // esi = offset of src_sobely from src_sobelx
+    pcmpeqb    xmm5, xmm5           // alpha 255
+    pslld      xmm5, 24             // 0xff000000
+
+ convertloop:
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    lea        eax, [eax + 16]
+    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
+    movdqa     xmm2, xmm0             // GG
+    punpcklbw  xmm2, xmm0             // First 8
+    punpckhbw  xmm0, xmm0             // Next 8
+    movdqa     xmm1, xmm2             // GGGG
+    punpcklwd  xmm1, xmm2             // First 4
+    punpckhwd  xmm2, xmm2             // Next 4
+    por        xmm1, xmm5             // GGGA
+    por        xmm2, xmm5
+    movdqa     xmm3, xmm0             // GGGG
+    punpcklwd  xmm3, xmm0             // Next 4
+    punpckhwd  xmm0, xmm0             // Last 4
+    por        xmm3, xmm5             // GGGA
+    por        xmm0, xmm5
+    movdqu     [edx], xmm1            // pixels 0..3
+    movdqu     [edx + 16], xmm2       // pixels 4..7
+    movdqu     [edx + 32], xmm3       // pixels 8..11
+    movdqu     [edx + 48], xmm0       // pixels 12..15
+    lea        edx, [edx + 64]
+    sub        ecx, 16                // 16 pixels done; loop while width remains > 0
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into a plane, 16 bytes per iteration.
+__declspec(naked)
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_sobelx
+    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        edx, [esp + 4 + 12]  // dst_y
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax             // esi = offset of src_sobely from src_sobelx
+
+ convertloop:
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    lea        eax, [eax + 16]
+    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
+    movdqu     [edx], xmm0            // store 16 bytes to dst_y
+    lea        edx, [edx + 16]
+    sub        ecx, 16                // 16 pixels done; loop while width remains > 0
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel (saturating X + Y) into ARGB, 16 pixels (64 output bytes) per iteration.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+__declspec(naked)
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_sobelx
+    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax             // esi = offset of src_sobely from src_sobelx
+    pcmpeqb    xmm5, xmm5           // alpha 255
+
+ convertloop:
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    lea        eax, [eax + 16]
+    movdqa     xmm2, xmm0
+    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
+    movdqa     xmm3, xmm0             // XA
+    punpcklbw  xmm3, xmm5             // XA first 8
+    punpckhbw  xmm0, xmm5             // XA next 8
+    movdqa     xmm4, xmm1             // YS
+    punpcklbw  xmm4, xmm2             // YS first 8
+    punpckhbw  xmm1, xmm2             // YS next 8
+    movdqa     xmm6, xmm4             // YSXA
+    punpcklwd  xmm6, xmm3             // First 4
+    punpckhwd  xmm4, xmm3             // Next 4
+    movdqa     xmm7, xmm1             // YSXA
+    punpcklwd  xmm7, xmm0             // Next 4
+    punpckhwd  xmm1, xmm0             // Last 4
+    movdqu     [edx], xmm6            // pixels 0..3
+    movdqu     [edx + 16], xmm4       // pixels 4..7
+    movdqu     [edx + 32], xmm7       // pixels 8..11
+    movdqu     [edx + 48], xmm1       // pixels 12..15
+    lea        edx, [edx + 64]
+    sub        ecx, 16                // 16 pixels done; loop while width remains > 0
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+// Consider float CumulativeSum.
+// Consider calling CumulativeSum one row at time as needed.
+// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
+// Convert cumulative sum for an area to an average for 1 pixel.
+// topleft is pointer to top left of CumulativeSum buffer for area.
+// botleft is pointer to bottom left of CumulativeSum buffer.
+// width is offset from left to right of area in CumulativeSum buffer measured
+//   in number of ints.
+// area is the number of pixels in the area being averaged.
+// dst points to pixel to store result to.
+// count is number of averaged pixels to produce.
+// Does 4 pixels at a time, then 1 pixel at a time for the remainder.  Not naked: arguments are referenced by name.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+                                    int width, int area, uint8* dst,
+                                    int count) {
+  __asm {
+    mov        eax, topleft  // eax topleft
+    mov        esi, botleft  // esi botleft
+    mov        edx, width    // edx width, scaled by 4 bytes in the addressing below
+    movd       xmm5, area
+    mov        edi, dst
+    mov        ecx, count
+    cvtdq2ps   xmm5, xmm5  // area as float
+    rcpss      xmm4, xmm5  // 1.0f / area
+    pshufd     xmm4, xmm4, 0  // broadcast reciprocal to all 4 lanes
+    sub        ecx, 4
+    jl         l4b         // fewer than 4 pixels: go to the 1 pixel loop
+
+    cmp        area, 128  // 128 pixels will not overflow 15 bits.
+    ja         l4         // larger areas use the float path
+
+    pshufd     xmm5, xmm5, 0        // area
+    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
+    psrld      xmm6, 16
+    cvtdq2ps   xmm6, xmm6
+    addps      xmm5, xmm6           // (65536.0 + area - 1)
+    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
+    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
+    packssdw   xmm5, xmm5           // 16 bit shorts
+
+    // 4 pixel loop small blocks.
+  s4:
+    // top left
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+
+    // - top right
+    psubd      xmm0, [eax + edx * 4]
+    psubd      xmm1, [eax + edx * 4 + 16]
+    psubd      xmm2, [eax + edx * 4 + 32]
+    psubd      xmm3, [eax + edx * 4 + 48]
+    lea        eax, [eax + 64]
+
+    // - bottom left
+    psubd      xmm0, [esi]
+    psubd      xmm1, [esi + 16]
+    psubd      xmm2, [esi + 32]
+    psubd      xmm3, [esi + 48]
+
+    // + bottom right
+    paddd      xmm0, [esi + edx * 4]
+    paddd      xmm1, [esi + edx * 4 + 16]
+    paddd      xmm2, [esi + edx * 4 + 32]
+    paddd      xmm3, [esi + edx * 4 + 48]
+    lea        esi, [esi + 64]
+
+    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
+    packssdw   xmm2, xmm3
+
+    pmulhuw    xmm0, xmm5  // sum * fixed point reciprocal >> 16
+    pmulhuw    xmm2, xmm5
+
+    packuswb   xmm0, xmm2  // 4 ARGB pixels as bytes
+    movdqu     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 4
+    jge        s4
+
+    jmp        l4b         // handle any remaining pixels one at a time
+
+    // 4 pixel loop
+  l4:
+    // top left
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+
+    // - top right
+    psubd      xmm0, [eax + edx * 4]
+    psubd      xmm1, [eax + edx * 4 + 16]
+    psubd      xmm2, [eax + edx * 4 + 32]
+    psubd      xmm3, [eax + edx * 4 + 48]
+    lea        eax, [eax + 64]
+
+    // - bottom left
+    psubd      xmm0, [esi]
+    psubd      xmm1, [esi + 16]
+    psubd      xmm2, [esi + 32]
+    psubd      xmm3, [esi + 48]
+
+    // + bottom right
+    paddd      xmm0, [esi + edx * 4]
+    paddd      xmm1, [esi + edx * 4 + 16]
+    paddd      xmm2, [esi + edx * 4 + 32]
+    paddd      xmm3, [esi + edx * 4 + 48]
+    lea        esi, [esi + 64]
+
+    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
+    cvtdq2ps   xmm1, xmm1
+    mulps      xmm0, xmm4
+    mulps      xmm1, xmm4
+    cvtdq2ps   xmm2, xmm2
+    cvtdq2ps   xmm3, xmm3
+    mulps      xmm2, xmm4
+    mulps      xmm3, xmm4
+    cvtps2dq   xmm0, xmm0   // back to 32 bit integers
+    cvtps2dq   xmm1, xmm1
+    cvtps2dq   xmm2, xmm2
+    cvtps2dq   xmm3, xmm3
+    packssdw   xmm0, xmm1   // to 16 bit
+    packssdw   xmm2, xmm3
+    packuswb   xmm0, xmm2   // to 8 bit: 4 ARGB pixels
+    movdqu     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 4
+    jge        l4
+
+  l4b:
+    add        ecx, 4 - 1   // ecx = remaining pixels - 1
+    jl         l1b          // nothing left
+
+    // 1 pixel loop
+  l1:
+    movdqu     xmm0, [eax]            // top left
+    psubd      xmm0, [eax + edx * 4]  // - top right
+    lea        eax, [eax + 16]
+    psubd      xmm0, [esi]            // - bottom left
+    paddd      xmm0, [esi + edx * 4]  // + bottom right
+    lea        esi, [esi + 16]
+    cvtdq2ps   xmm0, xmm0
+    mulps      xmm0, xmm4             // Average = Sum * 1 / Area
+    cvtps2dq   xmm0, xmm0
+    packssdw   xmm0, xmm0
+    packuswb   xmm0, xmm0
+    movd       dword ptr [edi], xmm0  // store 1 ARGB pixel
+    lea        edi, [edi + 4]
+    sub        ecx, 1
+    jge        l1
+  l1b:
+  }
+}
+#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value.  Each 4 byte pixel of row yields 4 int32 sums in cumsum.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+                                  const int32* previous_cumsum, int width) {
+  __asm {
+    mov        eax, row
+    mov        edx, cumsum
+    mov        esi, previous_cumsum
+    mov        ecx, width
+    pxor       xmm0, xmm0   // running sum of this row, 4 x int32
+    pxor       xmm1, xmm1   // constant 0 for widening
+
+    sub        ecx, 4
+    jl         l4b          // fewer than 4 pixels: use the 1 pixel loop
+    test       edx, 15      // cumsum not 16 byte aligned: use the 1 pixel loop
+    jne        l4b
+
+    // 4 pixel loop
+  l4:
+    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
+    lea        eax, [eax + 16]
+    movdqa     xmm4, xmm2
+
+    punpcklbw  xmm2, xmm1   // pixels 0, 1 as words
+    movdqa     xmm3, xmm2
+    punpcklwd  xmm2, xmm1   // pixel 0 as 4 x int32
+    punpckhwd  xmm3, xmm1   // pixel 1 as 4 x int32
+
+    punpckhbw  xmm4, xmm1   // pixels 2, 3 as words
+    movdqa     xmm5, xmm4
+    punpcklwd  xmm4, xmm1   // pixel 2 as 4 x int32
+    punpckhwd  xmm5, xmm1   // pixel 3 as 4 x int32
+
+    paddd      xmm0, xmm2   // running sum += pixel 0
+    movdqu     xmm2, [esi]  // previous row above.
+    paddd      xmm2, xmm0
+
+    paddd      xmm0, xmm3   // running sum += pixel 1
+    movdqu     xmm3, [esi + 16]
+    paddd      xmm3, xmm0
+
+    paddd      xmm0, xmm4   // running sum += pixel 2
+    movdqu     xmm4, [esi + 32]
+    paddd      xmm4, xmm0
+
+    paddd      xmm0, xmm5   // running sum += pixel 3
+    movdqu     xmm5, [esi + 48]
+    lea        esi, [esi + 64]
+    paddd      xmm5, xmm0
+
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
+    movdqu     [edx + 32], xmm4
+    movdqu     [edx + 48], xmm5
+
+    lea        edx, [edx + 64]
+    sub        ecx, 4
+    jge        l4
+
+  l4b:
+    add        ecx, 4 - 1   // ecx = remaining pixels - 1
+    jl         l1b          // nothing left
+
+    // 1 pixel loop
+  l1:
+    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
+    lea        eax, [eax + 4]
+    punpcklbw  xmm2, xmm1             // widen bytes to words
+    punpcklwd  xmm2, xmm1             // widen words to int32
+    paddd      xmm0, xmm2             // running sum += pixel
+    movdqu     xmm2, [esi]            // previous row above
+    lea        esi, [esi + 16]
+    paddd      xmm2, xmm0
+    movdqu     [edx], xmm2
+    lea        edx, [edx + 16]
+    sub        ecx, 1
+    jge        l1
+
+ l1b:
+  }
+}
+#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination.  uv_dudv holds 4 floats: u, v, du, dv.
+__declspec(naked)
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 12]  // src_argb
+    mov        esi, [esp + 16]  // stride
+    mov        edx, [esp + 20]  // dst_argb
+    mov        ecx, [esp + 24]  // pointer to uv_dudv
+    movq       xmm2, qword ptr [ecx]  // uv
+    movq       xmm7, qword ptr [ecx + 8]  // dudv
+    mov        ecx, [esp + 28]  // width
+    shl        esi, 16          // 4, stride
+    add        esi, 4           // low word 4, high word stride: multipliers for pmaddwd
+    movd       xmm5, esi
+    sub        ecx, 4
+    jl         l4b              // fewer than 4 pixels: use the 1 pixel loop
+
+    // setup for 4 pixel loop
+    pshufd     xmm7, xmm7, 0x44  // dup dudv
+    pshufd     xmm5, xmm5, 0  // dup 4, stride
+    movdqa     xmm0, xmm2    // x0, y0, x1, y1
+    addps      xmm0, xmm7
+    movlhps    xmm2, xmm0
+    movdqa     xmm4, xmm7
+    addps      xmm4, xmm4    // dudv *= 2
+    movdqa     xmm3, xmm2    // x2, y2, x3, y3
+    addps      xmm3, xmm4
+    addps      xmm4, xmm4    // dudv *= 4
+
+    // 4 pixel loop
+  l4:
+    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
+    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
+    packssdw   xmm0, xmm1    // x, y as 8 shorts
+    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
+    movd       esi, xmm0     // offset of pixel 0
+    pshufd     xmm0, xmm0, 0x39  // shift right
+    movd       edi, xmm0     // offset of pixel 1
+    pshufd     xmm0, xmm0, 0x39  // shift right
+    movd       xmm1, [eax + esi]  // read pixel 0
+    movd       xmm6, [eax + edi]  // read pixel 1
+    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
+    addps      xmm2, xmm4    // x, y += dx, dy first 2
+    movq       qword ptr [edx], xmm1
+    movd       esi, xmm0     // offset of pixel 2
+    pshufd     xmm0, xmm0, 0x39  // shift right
+    movd       edi, xmm0     // offset of pixel 3
+    movd       xmm6, [eax + esi]  // read pixel 2
+    movd       xmm0, [eax + edi]  // read pixel 3
+    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
+    addps      xmm3, xmm4    // x, y += dx, dy next 2
+    movq       qword ptr 8[edx], xmm6
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jge        l4
+
+  l4b:
+    add        ecx, 4 - 1    // ecx = remaining pixels - 1
+    jl         l1b           // nothing left
+
+    // 1 pixel loop
+  l1:
+    cvttps2dq  xmm0, xmm2    // x, y float to int
+    packssdw   xmm0, xmm0    // x, y as shorts
+    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
+    addps      xmm2, xmm7    // x, y += dx, dy
+    movd       esi, xmm0
+    movd       xmm0, [eax + esi]  // copy a pixel
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    sub        ecx, 1
+    jge        l1
+  l1b:
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1: blends the row at src_ptr with the row at src_ptr + src_stride, 32 bytes per iteration.
+__declspec(naked)
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    shr        eax, 1               // halve to 0..127
+    // Dispatch to specialized filters if applicable.
+    cmp        eax, 0
+    je         xloop100  // 0 / 128.  Blend 100 / 0.
+    sub        edi, esi  // edi = dst - src offset; done after the copy dispatch, which needs the real dst
+    cmp        eax, 32
+    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
+    cmp        eax, 64
+    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
+    cmp        eax, 96
+    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
+
+    vmovd      xmm0, eax  // high fraction 0..127
+    neg        eax
+    add        eax, 128
+    vmovd      xmm5, eax  // low fraction 128..1
+    vpunpcklbw xmm5, xmm5, xmm0  // byte pair: (low fraction, high fraction)
+    vpunpcklwd xmm5, xmm5, xmm5  // pair repeated within the low dword
+    vpxor      ymm0, ymm0, ymm0  // all zero indices for vpermd
+    vpermd     ymm5, ymm0, ymm5  // broadcast dword 0 to all 8 dwords
+
+  xloop:
+    vmovdqu    ymm0, [esi]           // 32 bytes of the first row
+    vmovdqu    ymm2, [esi + edx]     // 32 bytes of the second row
+    vpunpckhbw ymm1, ymm0, ymm2  // mutates
+    vpunpcklbw ymm0, ymm0, ymm2  // mutates
+    vpmaddubsw ymm0, ymm0, ymm5  // row0 * low fraction + row1 * high fraction
+    vpmaddubsw ymm1, ymm1, ymm5
+    vpsrlw     ymm0, ymm0, 7     // / 128
+    vpsrlw     ymm1, ymm1, 7
+    vpackuswb  ymm0, ymm0, ymm1  // unmutates
+    vmovdqu    [esi + edi], ymm0 // store to dst
+    lea        esi, [esi + 32]
+    sub        ecx, 32
+    jg         xloop
+    jmp        xloop99
+
+   // Blend 25 / 75.
+ xloop25:
+   vmovdqu    ymm0, [esi]
+   vmovdqu    ymm1, [esi + edx]
+   vpavgb     ymm0, ymm0, ymm1  // average of both rows
+   vpavgb     ymm0, ymm0, ymm1  // averaged again with the second row
+   vmovdqu    [esi + edi], ymm0
+   lea        esi, [esi + 32]
+   sub        ecx, 32
+   jg         xloop25
+   jmp        xloop99
+
+   // Blend 50 / 50.
+ xloop50:
+   vmovdqu    ymm0, [esi]
+   vpavgb     ymm0, ymm0, [esi + edx]  // average of both rows
+   vmovdqu    [esi + edi], ymm0
+   lea        esi, [esi + 32]
+   sub        ecx, 32
+   jg         xloop50
+   jmp        xloop99
+
+   // Blend 75 / 25.
+ xloop75:
+   vmovdqu    ymm1, [esi]
+   vmovdqu    ymm0, [esi + edx]
+   vpavgb     ymm0, ymm0, ymm1  // average of both rows
+   vpavgb     ymm0, ymm0, ymm1  // averaged again with the first row
+   vmovdqu    [esi + edi], ymm0
+   lea        esi, [esi + 32]
+   sub        ecx, 32
+   jg         xloop75
+   jmp        xloop99
+
+   // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+   rep movsb                    // copy ecx (dst_width) bytes from esi to edi
+
+  xloop99:
+    pop        edi
+    pop        esi
+    vzeroupper                  // clear upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_INTERPOLATEROW_AVX2
+
+// Bilinear filter 16x2 -> 16x1: blends row src_ptr with row src_ptr + src_stride.
+__declspec(naked)
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi             // edi = dst - src, so [esi + edi] is dst
+    shr        eax, 1               // halve the fraction to 0..127
+    // Dispatch to specialized filters if applicable.
+    cmp        eax, 0
+    je         xloop100  // 0 / 128.  Blend 100 / 0.
+    cmp        eax, 32
+    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
+    cmp        eax, 64
+    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
+    cmp        eax, 96
+    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
+
+    movd       xmm0, eax  // high fraction 0..127
+    neg        eax
+    add        eax, 128
+    movd       xmm5, eax  // low fraction 128..1
+    punpcklbw  xmm5, xmm0     // byte pair (low fraction, high fraction)
+    punpcklwd  xmm5, xmm5
+    pshufd     xmm5, xmm5, 0  // broadcast the pair to every word
+
+  xloop:
+    movdqu     xmm0, [esi]        // row0
+    movdqu     xmm2, [esi + edx]  // row1
+    movdqu     xmm1, xmm0
+    punpcklbw  xmm0, xmm2         // interleave row0/row1 bytes, low 8
+    punpckhbw  xmm1, xmm2         // interleave row0/row1 bytes, high 8
+    pmaddubsw  xmm0, xmm5         // row0 * low + row1 * high
+    pmaddubsw  xmm1, xmm5
+    psrlw      xmm0, 7            // divide by 128
+    psrlw      xmm1, 7
+    packuswb   xmm0, xmm1         // back to 16 bytes
+    movdqu     [esi + edi], xmm0  // store to dst
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop
+    jmp        xloop99
+
+    // Blend 25 / 75.
+  xloop25:
+    movdqu     xmm0, [esi]        // row0
+    movdqu     xmm1, [esi + edx]  // row1
+    pavgb      xmm0, xmm1         // 50% row0, 50% row1
+    pavgb      xmm0, xmm1         // average with row1 again: 25 / 75
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop25
+    jmp        xloop99
+
+    // Blend 50 / 50.
+  xloop50:
+    movdqu     xmm0, [esi]        // row0
+    movdqu     xmm1, [esi + edx]  // row1
+    pavgb      xmm0, xmm1         // rounded average of the two rows
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+  xloop75:
+    movdqu     xmm1, [esi]        // row0
+    movdqu     xmm0, [esi + edx]  // row1
+    pavgb      xmm0, xmm1         // 50% row0, 50% row1
+    pavgb      xmm0, xmm1         // average with row0 again: 75 / 25
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+  xloop100:
+    movdqu     xmm0, [esi]
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop100
+
+  xloop99:
+    pop        edi  // restore callee-saved registers in reverse push order
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1: blends row src_ptr with row src_ptr + src_stride.
+__declspec(naked)
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi             // edi = dst - src, so [esi + edi] is dst
+    // Dispatch to specialized filters if applicable.
+    cmp        eax, 0
+    je         xloop100  // 0 / 256.  Blend 100 / 0.
+    cmp        eax, 64
+    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
+    cmp        eax, 128
+    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
+    cmp        eax, 192
+    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
+
+    movd       xmm5, eax            // xmm5 = y fraction
+    punpcklbw  xmm5, xmm5           // duplicate the fraction byte into a word
+    psrlw      xmm5, 1              // keep 15 bits so the signed word is positive
+    punpcklwd  xmm5, xmm5           // broadcast the word across the register
+    punpckldq  xmm5, xmm5
+    punpcklqdq xmm5, xmm5
+    pxor       xmm4, xmm4           // zero, used to widen bytes to words
+
+  xloop:
+    movdqu     xmm0, [esi]  // row0
+    movdqu     xmm2, [esi + edx]  // row1
+    movdqu     xmm1, xmm0
+    movdqu     xmm3, xmm2
+    punpcklbw  xmm2, xmm4  // row1 low 8 bytes -> words
+    punpckhbw  xmm3, xmm4  // row1 high 8 bytes -> words
+    punpcklbw  xmm0, xmm4  // row0 low 8 bytes -> words
+    punpckhbw  xmm1, xmm4  // row0 high 8 bytes -> words
+    psubw      xmm2, xmm0  // row1 - row0
+    psubw      xmm3, xmm1
+    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
+    paddw      xmm3, xmm3
+    pmulhw     xmm2, xmm5  // scale diff
+    pmulhw     xmm3, xmm5
+    paddw      xmm0, xmm2  // sum rows
+    paddw      xmm1, xmm3
+    packuswb   xmm0, xmm1  // back to 16 bytes
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop
+    jmp        xloop99
+
+    // Blend 25 / 75.
+  xloop25:
+    movdqu     xmm0, [esi]        // row0
+    movdqu     xmm1, [esi + edx]  // row1
+    pavgb      xmm0, xmm1         // 50% row0, 50% row1
+    pavgb      xmm0, xmm1         // average with row1 again: 25 / 75
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop25
+    jmp        xloop99
+
+    // Blend 50 / 50.
+  xloop50:
+    movdqu     xmm0, [esi]        // row0
+    movdqu     xmm1, [esi + edx]  // row1
+    pavgb      xmm0, xmm1         // rounded average of the two rows
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+  xloop75:
+    movdqu     xmm1, [esi]        // row0
+    movdqu     xmm0, [esi + edx]  // row1
+    pavgb      xmm0, xmm1         // 50% row0, 50% row1
+    pavgb      xmm0, xmm1         // average with row0 again: 75 / 25
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+  xloop100:
+    movdqu     xmm0, [esi]
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop100
+
+  xloop99:
+    pop        edi  // restore callee-saved registers in reverse push order
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_INTERPOLATEROW_SSE2
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+__declspec(naked)
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_argb
+    mov        ecx, [esp + 12]   // shuffler
+    movdqu     xmm5, [ecx]       // 16-byte pshufb control mask
+    mov        ecx, [esp + 16]   // pix
+
+  wloop:
+    movdqu     xmm0, [eax]       // 8 pixels (32 bytes) per iteration
+    movdqu     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    pshufb     xmm0, xmm5        // reorder bytes as the shuffler dictates
+    pshufb     xmm1, xmm5
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         wloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+__declspec(naked)  // AVX2 byte shuffle of ARGB pixels, 16 pixels per iteration.
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  __asm {
+    mov        eax, [esp + 4]     // src_argb
+    mov        edx, [esp + 8]     // dst_argb
+    mov        ecx, [esp + 12]    // shuffler
+    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
+    mov        ecx, [esp + 16]    // pix
+
+  wloop:
+    vmovdqu    ymm0, [eax]        // 16 pixels (64 bytes) per iteration
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax, [eax + 64]
+    vpshufb    ymm0, ymm0, ymm5   // reorder bytes within each 128-bit lane
+    vpshufb    ymm1, ymm1, ymm5
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
+    sub        ecx, 16
+    jg         wloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBSHUFFLEROW_AVX2
+
+__declspec(naked)  // SSE2 shuffle: 4 word-shuffle fast paths, else bytewise.
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  __asm {
+    push       ebx
+    push       esi
+    mov        eax, [esp + 8 + 4]    // src_argb
+    mov        edx, [esp + 8 + 8]    // dst_argb
+    mov        esi, [esp + 8 + 12]   // shuffler
+    mov        ecx, [esp + 8 + 16]   // pix
+    pxor       xmm5, xmm5            // zero, used to widen bytes to words
+
+    mov        ebx, [esi]   // first 4 shuffler bytes as one dword
+    cmp        ebx, 0x03000102
+    je         shuf_3012
+    cmp        ebx, 0x00010203
+    je         shuf_0123
+    cmp        ebx, 0x00030201
+    je         shuf_0321
+    cmp        ebx, 0x02010003
+    je         shuf_2103
+
+  // TODO(fbarchard): Use one source pointer and 3 offsets.
+  shuf_any1:
+    movzx      ebx, byte ptr [esi]        // source index for output byte 0
+    movzx      ebx, byte ptr [eax + ebx]  // fetch that source byte
+    mov        [edx], bl
+    movzx      ebx, byte ptr [esi + 1]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx + 1], bl
+    movzx      ebx, byte ptr [esi + 2]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx + 2], bl
+    movzx      ebx, byte ptr [esi + 3]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx + 3], bl
+    lea        eax, [eax + 4]
+    lea        edx, [edx + 4]
+    sub        ecx, 1                     // one pixel per iteration
+    jg         shuf_any1
+    jmp        shuf99
+
+  shuf_0123:
+    movdqu     xmm0, [eax]        // 4 pixels per iteration
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5         // widen bytes to words so pshuf*w can move them
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
+    pshuflw    xmm0, xmm0, 01Bh
+    pshufhw    xmm1, xmm1, 01Bh
+    pshuflw    xmm1, xmm1, 01Bh
+    packuswb   xmm0, xmm1         // narrow words back to bytes
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         shuf_0123
+    jmp        shuf99
+
+  shuf_0321:
+    movdqu     xmm0, [eax]        // 4 pixels per iteration
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5         // widen bytes to words
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
+    pshuflw    xmm0, xmm0, 039h
+    pshufhw    xmm1, xmm1, 039h
+    pshuflw    xmm1, xmm1, 039h
+    packuswb   xmm0, xmm1         // narrow words back to bytes
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         shuf_0321
+    jmp        shuf99
+
+  shuf_2103:
+    movdqu     xmm0, [eax]        // 4 pixels per iteration
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5         // widen bytes to words
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
+    pshuflw    xmm0, xmm0, 093h
+    pshufhw    xmm1, xmm1, 093h
+    pshuflw    xmm1, xmm1, 093h
+    packuswb   xmm0, xmm1         // narrow words back to bytes
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         shuf_2103
+    jmp        shuf99
+
+  shuf_3012:
+    movdqu     xmm0, [eax]        // 4 pixels per iteration
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5         // widen bytes to words
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
+    pshuflw    xmm0, xmm0, 0C6h
+    pshufhw    xmm1, xmm1, 0C6h
+    pshuflw    xmm1, xmm1, 0C6h
+    packuswb   xmm0, xmm1         // narrow words back to bytes
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         shuf_3012
+
+  shuf99:
+    pop        esi  // restore callee-saved registers in reverse push order
+    pop        ebx
+    ret
+  }
+}
+
+// YUY2 - Macro-pixel = 2 image pixels
+// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
+
+// UYVY - Macro-pixel = 2 image pixels
+// U0Y0V0Y1
+
+__declspec(naked)  // Packs planar Y, U, V rows into YUY2, 16 pixels per pass.
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_y
+    mov        esi, [esp + 8 + 8]    // src_u
+    mov        edx, [esp + 8 + 12]   // src_v
+    mov        edi, [esp + 8 + 16]   // dst_frame
+    mov        ecx, [esp + 8 + 20]   // width
+    sub        edx, esi              // edx = src_v - src_u; V is at [esi + edx]
+
+  convertloop:
+    movq       xmm2, qword ptr [esi] // U
+    movq       xmm3, qword ptr [esi + edx] // V
+    lea        esi, [esi + 8]
+    punpcklbw  xmm2, xmm3 // UV
+    movdqu     xmm0, [eax] // Y
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm2 // YUYV
+    punpckhbw  xmm1, xmm2 // YUYV for the upper 8 Y bytes
+    movdqu     [edi], xmm0
+    movdqu     [edi + 16], xmm1
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // Packs planar Y, U, V rows into UYVY, 16 pixels per pass.
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_y
+    mov        esi, [esp + 8 + 8]    // src_u
+    mov        edx, [esp + 8 + 12]   // src_v
+    mov        edi, [esp + 8 + 16]   // dst_frame
+    mov        ecx, [esp + 8 + 20]   // width
+    sub        edx, esi              // edx = src_v - src_u; V is at [esi + edx]
+
+  convertloop:
+    movq       xmm2, qword ptr [esi] // U
+    movq       xmm3, qword ptr [esi + edx] // V
+    lea        esi, [esi + 8]
+    punpcklbw  xmm2, xmm3 // UV
+    movdqu     xmm0, [eax] // Y
+    movdqa     xmm1, xmm2
+    lea        eax, [eax + 16]
+    punpcklbw  xmm1, xmm0 // UYVY
+    punpckhbw  xmm2, xmm0 // UYVY for the upper 8 Y bytes
+    movdqu     [edi], xmm1
+    movdqu     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+__declspec(naked)  // Per channel: C0 + C1*X + C2*X^2 + C3*X^3, 2 pixels per pass.
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   /* src_argb */
+    mov        edx, [esp + 4 + 8]   /* dst_argb */
+    mov        esi, [esp + 4 + 12]  /* poly */
+    mov        ecx, [esp + 4 + 16]  /* width */
+    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
+
+    // 2 pixel loop.
+ convertloop:
+//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
+//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
+    movq       xmm0, qword ptr [eax]  // BGRABGRA
+    lea        eax, [eax + 8]
+    punpcklbw  xmm0, xmm3  // bytes -> words
+    movdqa     xmm4, xmm0
+    punpcklwd  xmm0, xmm3  // pixel 0
+    punpckhwd  xmm4, xmm3  // pixel 1
+    cvtdq2ps   xmm0, xmm0  // 4 floats
+    cvtdq2ps   xmm4, xmm4
+    movdqa     xmm1, xmm0  // X
+    movdqa     xmm5, xmm4
+    mulps      xmm0, [esi + 16]  // C1 * X
+    mulps      xmm4, [esi + 16]
+    addps      xmm0, [esi]  // result = C0 + C1 * X
+    addps      xmm4, [esi]
+    movdqa     xmm2, xmm1
+    movdqa     xmm6, xmm5
+    mulps      xmm2, xmm1  // X * X
+    mulps      xmm6, xmm5
+    mulps      xmm1, xmm2  // X * X * X
+    mulps      xmm5, xmm6
+    mulps      xmm2, [esi + 32]  // C2 * X * X
+    mulps      xmm6, [esi + 32]
+    mulps      xmm1, [esi + 48]  // C3 * X * X * X
+    mulps      xmm5, [esi + 48]
+    addps      xmm0, xmm2  // result += C2 * X * X
+    addps      xmm4, xmm6
+    addps      xmm0, xmm1  // result += C3 * X * X * X
+    addps      xmm4, xmm5
+    cvttps2dq  xmm0, xmm0  // truncate floats to 32-bit ints
+    cvttps2dq  xmm4, xmm4
+    packuswb   xmm0, xmm4  // two saturating packs narrow each result to a byte
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0  // store 2 pixels
+    lea        edx, [edx + 8]
+    sub        ecx, 2
+    jg         convertloop
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+__declspec(naked)  // Per channel: C0 + C1*X + C2*X^2 + C3*X^3, 2 pixels per pass.
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_argb */
+    mov        ecx, [esp + 12]   /* poly */
+    vbroadcastf128 ymm4, [ecx]       // C0
+    vbroadcastf128 ymm5, [ecx + 16]  // C1
+    vbroadcastf128 ymm6, [ecx + 32]  // C2
+    vbroadcastf128 ymm7, [ecx + 48]  // C3
+    mov        ecx, [esp + 16]  /* width */
+
+    // 2 pixel loop.
+ convertloop:
+    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
+    lea         eax, [eax + 8]
+    vcvtdq2ps   ymm0, ymm0        // X 8 floats
+    vmulps      ymm2, ymm0, ymm0  // X * X
+    vmulps      ymm3, ymm0, ymm7  // C3 * X
+    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
+    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
+    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
+    vcvttps2dq  ymm0, ymm0        // truncate floats to 32-bit ints
+    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
+    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
+    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
+    vmovq       qword ptr [edx], xmm0  // store 2 pixels
+    lea         edx, [edx + 8]
+    sub         ecx, 2
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table, in place (dst_argb is read and written).
+__declspec(naked)
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+                           int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   /* dst_argb */
+    mov        esi, [esp + 4 + 8]   /* table_argb */
+    mov        ecx, [esp + 4 + 12]  /* width */
+
+    // 1 pixel loop.
+  convertloop:
+    movzx      edx, byte ptr [eax]                // channel 0 value
+    lea        eax, [eax + 4]                     // advance; pixel is now at eax - 4
+    movzx      edx, byte ptr [esi + edx * 4]      // table entry, 4 bytes per value
+    mov        byte ptr [eax - 4], dl
+    movzx      edx, byte ptr [eax - 4 + 1]        // channel 1
+    movzx      edx, byte ptr [esi + edx * 4 + 1]
+    mov        byte ptr [eax - 4 + 1], dl
+    movzx      edx, byte ptr [eax - 4 + 2]        // channel 2
+    movzx      edx, byte ptr [esi + edx * 4 + 2]
+    mov        byte ptr [eax - 4 + 2], dl
+    movzx      edx, byte ptr [eax - 4 + 3]        // channel 3
+    movzx      edx, byte ptr [esi + edx * 4 + 3]
+    mov        byte ptr [eax - 4 + 3], dl
+    dec        ecx
+    jg         convertloop
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table, in place; byte 3 of each pixel is untouched.
+__declspec(naked)
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   /* dst_argb */
+    mov        esi, [esp + 4 + 8]   /* table_argb */
+    mov        ecx, [esp + 4 + 12]  /* width */
+
+    // 1 pixel loop.
+  convertloop:
+    movzx      edx, byte ptr [eax]                // channel 0 value
+    lea        eax, [eax + 4]                     // advance; pixel is now at eax - 4
+    movzx      edx, byte ptr [esi + edx * 4]      // table entry, 4 bytes per value
+    mov        byte ptr [eax - 4], dl
+    movzx      edx, byte ptr [eax - 4 + 1]        // channel 1
+    movzx      edx, byte ptr [esi + edx * 4 + 1]
+    mov        byte ptr [eax - 4 + 1], dl
+    movzx      edx, byte ptr [eax - 4 + 2]        // channel 2
+    movzx      edx, byte ptr [esi + edx * 4 + 2]
+    mov        byte ptr [eax - 4 + 2], dl
+    dec        ecx
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with a luma-selected table; alpha is copied unchanged.
+__declspec(naked)
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                 int width,
+                                 const uint8* luma, uint32 lumacoeff) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   /* src_argb */
+    mov        edi, [esp + 8 + 8]   /* dst_argb */
+    mov        ecx, [esp + 8 + 12]  /* width */
+    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
+    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
+    pshufd     xmm2, xmm2, 0     // broadcast table base to all 4 dwords
+    pshufd     xmm3, xmm3, 0     // broadcast the 4 coefficient bytes
+    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
+    psllw      xmm4, 8
+    pxor       xmm5, xmm5
+
+    // 4 pixel loop.
+  convertloop:
+    movdqu     xmm0, [eax]       // generate luma ptr (full 16-byte load)
+    pmaddubsw  xmm0, xmm3  // weight channels by lumacoeff, summing byte pairs
+    phaddw     xmm0, xmm0  // one weighted sum per pixel
+    pand       xmm0, xmm4  // mask out low bits
+    punpcklwd  xmm0, xmm5  // widen the 4 sums to dwords
+    paddd      xmm0, xmm2  // add table base
+    movd       esi, xmm0   // table pointer for pixel 0
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi], dl
+    movzx      edx, byte ptr [eax + 1]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 1], dl
+    movzx      edx, byte ptr [eax + 2]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 2], dl
+    movzx      edx, byte ptr [eax + 3]  // copy alpha.
+    mov        byte ptr [edi + 3], dl
+
+    movd       esi, xmm0   // table pointer for pixel 1
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax + 4]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 4], dl
+    movzx      edx, byte ptr [eax + 5]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 5], dl
+    movzx      edx, byte ptr [eax + 6]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 6], dl
+    movzx      edx, byte ptr [eax + 7]  // copy alpha.
+    mov        byte ptr [edi + 7], dl
+
+    movd       esi, xmm0   // table pointer for pixel 2
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax + 8]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 8], dl
+    movzx      edx, byte ptr [eax + 9]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 9], dl
+    movzx      edx, byte ptr [eax + 10]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 10], dl
+    movzx      edx, byte ptr [eax + 11]  // copy alpha.
+    mov        byte ptr [edi + 11], dl
+
+    movd       esi, xmm0   // table pointer for pixel 3
+
+    movzx      edx, byte ptr [eax + 12]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 12], dl
+    movzx      edx, byte ptr [eax + 13]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 13], dl
+    movzx      edx, byte ptr [eax + 14]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 14], dl
+    movzx      edx, byte ptr [eax + 15]  // copy alpha.
+    mov        byte ptr [edi + 15], dl
+
+    lea        eax, [eax + 16]
+    lea        edi, [edi + 16]
+    sub        ecx, 4
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif  // defined(_M_X64)
+#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/row_x86.asm b/libs/libvpx/third_party/libyuv/source/row_x86.asm
new file mode 100644
index 0000000000..0cb326f8e5
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/row_x86.asm
@@ -0,0 +1,146 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%ifdef __YASM_VERSION_ID__
+%if __YASM_VERSION_ID__ < 01020000h
+%error AVX2 is supported only by yasm 1.2.0 or later.
+%endif
+%endif
+%include "x86inc.asm"
+
+SECTION .text
+
+; cglobal numeric constants are parameters, gpr regs, mm regs
+
+; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
+; %1 = source format (YUY2 or UYVY), %2 = mov suffix (a/u), %3 = name suffix.
+%macro YUY2TOYROW 2-3
+cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
+%ifidn %1,YUY2
+    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
+    psrlw      m2, m2, 8
+%endif
+
+    ALIGN      4
+.convertloop:
+    mov%2      m0, [src_yuy2q]
+    mov%2      m1, [src_yuy2q + mmsize]
+    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
+%ifidn %1,YUY2
+    pand       m0, m0, m2   ; YUY2 even bytes are Y
+    pand       m1, m1, m2
+%else
+    psrlw      m0, m0, 8    ; UYVY odd bytes are Y
+    psrlw      m1, m1, 8
+%endif
+    packuswb   m0, m0, m1   ; narrow the Y words back to bytes
+%if cpuflag(AVX2)
+    vpermq     m0, m0, 0xd8 ; reorder quadwords 0,2,1,3 after the per-lane pack
+%endif
+    sub        pixd, mmsize ; sets flags for jg; mov and lea below keep them
+    mov%2      [dst_yq], m0
+    lea        dst_yq, [dst_yq + mmsize]
+    jg         .convertloop
+    REP_RET
+%endmacro
+
+; TODO(fbarchard): Remove MMX.  Add SSSE3 pshufb version.
+INIT_MMX MMX
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_XMM SSE2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_YMM AVX2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW UYVY,a,
+
+; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
+; %1 = mov suffix (a = aligned, u = unaligned), %2 = optional name suffix.
+%macro SplitUVRow 1-2
+cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
+    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
+    psrlw      m4, m4, 8
+    sub        dst_vq, dst_uq    ; dst_vq = dst_v - dst_u, used as an offset
+
+    ALIGN      4
+.convertloop:
+    mov%1      m0, [src_uvq]
+    mov%1      m1, [src_uvq + mmsize]
+    lea        src_uvq, [src_uvq + mmsize * 2]
+    psrlw      m2, m0, 8         ; odd bytes
+    psrlw      m3, m1, 8
+    pand       m0, m0, m4        ; even bytes
+    pand       m1, m1, m4
+    packuswb   m0, m0, m1        ; even bytes narrowed back to bytes
+    packuswb   m2, m2, m3        ; odd bytes narrowed back to bytes
+%if cpuflag(AVX2)
+    vpermq     m0, m0, 0xd8      ; reorder quadwords 0,2,1,3 after per-lane pack
+    vpermq     m2, m2, 0xd8
+%endif
+    mov%1      [dst_uq], m0
+    mov%1      [dst_uq + dst_vq], m2   ; dst_v, addressed relative to dst_u
+    lea        dst_uq, [dst_uq + mmsize]
+    sub        pixd, mmsize
+    jg         .convertloop
+    REP_RET
+%endmacro
+
+INIT_MMX MMX
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_XMM SSE2
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_YMM AVX2
+SplitUVRow a,
+
+; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+;                      int width);
+; %1 = mov suffix (a = aligned, u = unaligned), %2 = optional name suffix.
+%macro MergeUVRow_ 1-2
+cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+    sub        src_vq, src_uq   ; src_vq = src_v - src_u, used as an offset
+
+    ALIGN      4
+.convertloop:
+    mov%1      m0, [src_uq]
+    mov%1      m1, [src_uq + src_vq]  ; V row, addressed relative to src_u
+    lea        src_uq, [src_uq + mmsize]
+    punpcklbw  m2, m0, m1       ; UV pairs from the low halves
+    punpckhbw  m0, m0, m1       ; UV pairs from the high halves
+%if cpuflag(AVX2)
+    vperm2i128 m1, m2, m0, 0x20  ; low 128 of ymm2 and low 128 of ymm0
+    vperm2i128 m2, m2, m0, 0x31  ; high 128 of ymm2 and high 128 of ymm0
+    mov%1      [dst_uvq], m1
+    mov%1      [dst_uvq + mmsize], m2
+%else
+    mov%1      [dst_uvq], m2
+    mov%1      [dst_uvq + mmsize], m0
+%endif
+    lea        dst_uvq, [dst_uvq + mmsize * 2]
+    sub        pixd, mmsize
+    jg         .convertloop
+    REP_RET
+%endmacro
+
+INIT_MMX MMX
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_XMM SSE2
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_YMM AVX2
+MergeUVRow_ a,
+
diff --git a/libs/libvpx/third_party/libyuv/source/scale.cc b/libs/libvpx/third_party/libyuv/source/scale.cc
new file mode 100644
index 0000000000..0a01304c41
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/scale.cc
@@ -0,0 +1,1689 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyPlane
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+  return v < 0 ? -v : v;  // Magnitude of v.
+}
+// Computes (v + a) >> s on the magnitude of v, restoring the sign for v < 0.
+#define SUBSAMPLE(v, a, s) (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))
+
+// Scale plane, 1/2
+// This is an optimized version for scaling down a plane to 1/2 of
+// its original size.
+
+static void ScalePlaneDown2(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            enum FilterMode filtering) {
+  int y;  // Destination row index.
+  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) =
+      filtering == kFilterNone ? ScaleRowDown2_C :
+      (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
+  int row_stride = src_stride << 1;  // Two source rows per destination row.
+  if (!filtering) {
+    src_ptr += src_stride;  // Point to odd rows.
+    src_stride = 0;  // Unfiltered: the row function is handed a zero stride.
+  }
+  // Each enabled block below overrides the row function chosen before it.
+#if defined(HAS_SCALEROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
+        ScaleRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
+          ScaleRowDown2Box_NEON);
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
+        ScaleRowDown2Box_Any_SSE2);
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
+          ScaleRowDown2Box_SSE2);
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
+        ScaleRowDown2Box_Any_AVX2);
+    if (IS_ALIGNED(dst_width, 32)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
+          ScaleRowDown2Box_AVX2);
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
+      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown2 = filtering ?
+        ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;  // Linear: the row function is handed a zero stride.
+  }
+  // TODO(fbarchard): Loop through source height to allow odd height.
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void ScalePlaneDown2_16(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr,
+                               enum FilterMode filtering) {
+  int y;  // Destination row index.
+  void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst_ptr, int dst_width) =
+    filtering == kFilterNone ? ScaleRowDown2_16_C :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
+        ScaleRowDown2Box_16_C);
+  int row_stride = src_stride << 1;  // Two source rows per destination row.
+  if (!filtering) {
+    src_ptr += src_stride;  // Point to odd rows.
+    src_stride = 0;  // Unfiltered: the row function is handed a zero stride.
+  }
+  // uint16 counterpart of ScalePlaneDown2; later blocks override earlier ones.
+#if defined(HAS_SCALEROWDOWN2_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
+    ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
+        ScaleRowDown2_16_NEON;
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
+        ScaleRowDown2Box_16_SSE2);
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
+      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown2 = filtering ?
+        ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2;
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;  // Linear: the row function is handed a zero stride.
+  }
+  // TODO(fbarchard): Loop through source height to allow odd height.
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+// Scale plane, 1/4
+// This is an optimized version for scaling down a plane to 1/4 of
+// its original size.
+
+static void ScalePlaneDown4(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            enum FilterMode filtering) {
+  int y;  // Destination row index.
+  void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
+  int row_stride = src_stride << 2;  // Four source rows per destination row.
+  if (!filtering) {
+    src_ptr += src_stride * 2;  // Point to row 2.
+    src_stride = 0;  // Unfiltered: the row function is handed a zero stride.
+  }
+#if defined(HAS_SCALEROWDOWN4_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN4_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN4_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
+  }
+#endif
+  // The last enabled block above wins; linear filtering passes a zero stride.
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void ScalePlaneDown4_16(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr,
+                               enum FilterMode filtering) {
+  int y;  // Destination row index.
+  void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
+  int row_stride = src_stride << 2;  // Four source rows per destination row.
+  if (!filtering) {
+    src_ptr += src_stride * 2;  // Point to row 2.
+    src_stride = 0;  // Unfiltered: the row function is handed a zero stride.
+  }
+#if defined(HAS_SCALEROWDOWN4_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
+        ScaleRowDown4_16_NEON;
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
+        ScaleRowDown4_16_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2;
+  }
+#endif
+  // uint16 counterpart of ScalePlaneDown4; linear passes a zero stride.
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+// Scale plane down, 3/4
+
+static void ScalePlaneDown34(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint8* src_ptr, uint8* dst_ptr,
+                             enum FilterMode filtering) {
+  int y;  // Destination row index, advanced three rows at a time.
+  void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  assert(dst_width % 3 == 0);
+  if (!filtering) {
+    ScaleRowDown34_0 = ScaleRowDown34_C;
+    ScaleRowDown34_1 = ScaleRowDown34_C;
+  } else {
+    ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
+    ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
+  }
+#if defined(HAS_SCALEROWDOWN34_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_Any_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_Any_NEON;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON;
+    }
+    if (dst_width % 24 == 0) {
+      if (!filtering) {
+        ScaleRowDown34_0 = ScaleRowDown34_NEON;
+        ScaleRowDown34_1 = ScaleRowDown34_NEON;
+      } else {
+        ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
+        ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN34_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3;
+    }
+    if (dst_width % 24 == 0) {
+      if (!filtering) {
+        ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+        ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+      } else {
+        ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
+        ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2;
+    }
+  }
+#endif
+  // Each pass writes 3 destination rows and advances 4 source rows.
+  for (y = 0; y < dst_height - 2; y += 3) {
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
+                     dst_ptr, dst_width);  // Next row, negated stride.
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride;
+  }
+
+  // Remainder 1 or 2 rows with last row vertically unfiltered
+  if ((dst_height % 3) == 2) {
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+  } else if ((dst_height % 3) == 1) {
+    ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
+  }
+}
+
+static void ScalePlaneDown34_16(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint16* src_ptr, uint16* dst_ptr,
+                                enum FilterMode filtering) {
+  int y;  // Destination row index, advanced three rows at a time.
+  void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst_ptr, int dst_width);
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  assert(dst_width % 3 == 0);
+  if (!filtering) {
+    ScaleRowDown34_0 = ScaleRowDown34_16_C;
+    ScaleRowDown34_1 = ScaleRowDown34_16_C;
+  } else {
+    ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C;
+    ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C;
+  }
+#if defined(HAS_SCALEROWDOWN34_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_16_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_16_NEON;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2;
+    }
+  }
+#endif
+  // Each pass writes 3 destination rows and advances 4 source rows.
+  for (y = 0; y < dst_height - 2; y += 3) {
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
+                     dst_ptr, dst_width);  // Next row, negated stride.
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride;
+  }
+
+  // Remainder 1 or 2 rows with last row vertically unfiltered
+  if ((dst_height % 3) == 2) {
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+  } else if ((dst_height % 3) == 1) {
+    ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
+  }
+}
+
+
+// Scale plane, 3/8
+// This is an optimized version for scaling down a plane to 3/8
+// of its original size.
+//
+// Uses box filter arranges like this
+// aaabbbcc -> abc
+// aaabbbcc    def
+// aaabbbcc    ghi
+// dddeeeff
+// dddeeeff
+// dddeeeff
+// ggghhhii
+// ggghhhii
+// Boxes are 3x3, 2x3, 3x2 and 2x2
+
+static void ScalePlaneDown38(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint8* src_ptr, uint8* dst_ptr,
+                             enum FilterMode filtering) {
+  int y;  // Destination row index, advanced three rows at a time.
+  void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  assert(dst_width % 3 == 0);
+  if (!filtering) {
+    ScaleRowDown38_3 = ScaleRowDown38_C;
+    ScaleRowDown38_2 = ScaleRowDown38_C;
+  } else {
+    ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
+    ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
+  }
+  // Each enabled block below overrides the row functions chosen before it.
+#if defined(HAS_SCALEROWDOWN38_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_Any_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_Any_NEON;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON;
+    }
+    if (dst_width % 12 == 0) {
+      if (!filtering) {
+        ScaleRowDown38_3 = ScaleRowDown38_NEON;
+        ScaleRowDown38_2 = ScaleRowDown38_NEON;
+      } else {
+        ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
+        ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN38_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3;
+    }
+    if (dst_width % 12 == 0 && !filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
+    }
+    if (dst_width % 6 == 0 && filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
+      ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2;
+    }
+  }
+#endif
+  // Each pass writes 3 destination rows and advances 3 + 3 + 2 source rows.
+  for (y = 0; y < dst_height - 2; y += 3) {
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride;
+  }
+
+  // Remainder 1 or 2 rows with last row vertically unfiltered
+  if ((dst_height % 3) == 2) {
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+  } else if ((dst_height % 3) == 1) {
+    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+  }
+}
+
+static void ScalePlaneDown38_16(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint16* src_ptr, uint16* dst_ptr,
+                                enum FilterMode filtering) {
+  int y;  // Destination row index, advanced three rows at a time.
+  void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst_ptr, int dst_width);
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  assert(dst_width % 3 == 0);
+  if (!filtering) {
+    ScaleRowDown38_3 = ScaleRowDown38_16_C;
+    ScaleRowDown38_2 = ScaleRowDown38_16_C;
+  } else {
+    ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C;
+    ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C;
+  }
+#if defined(HAS_SCALEROWDOWN38_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_16_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_16_NEON;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2;
+      ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2;
+    }
+  }
+#endif
+  // Each pass writes 3 destination rows and advances 3 + 3 + 2 source rows.
+  for (y = 0; y < dst_height - 2; y += 3) {
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride;
+  }
+
+  // Remainder 1 or 2 rows with last row vertically unfiltered
+  if ((dst_height % 3) == 2) {
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+  } else if ((dst_height % 3) == 1) {
+    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+  }
+}
+
+#define MIN1(x) ((x) < 1 ? 1 : (x))  // Yields x, or 1 when x is below 1.
+
+static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
+  uint32 total = 0u;          // Running sum of the box entries.
+  int remaining = iboxwidth;  // Entries still to be added.
+  assert(iboxwidth > 0);
+  while (remaining-- > 0) {
+    total += *src_ptr++;
+  }
+  return total;
+}
+
+static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
+  uint32 total = 0u;          // Running sum of the box entries.
+  int remaining = iboxwidth;  // Entries still to be added.
+  assert(iboxwidth > 0);
+  while (remaining-- > 0) {
+    total += *src_ptr++;
+  }
+  return total;
+}
+
+static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  int i;
+  int scaletbl[2];  // 65536 / box area for widths minboxwidth and +1.
+  int minboxwidth = dx >> 16;  // Integer part of the 16.16 step.
+  int* scaleptr = scaletbl - minboxwidth;  // So boxwidth indexes scaletbl.
+  int boxwidth;
+  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+  for (i = 0; i < dst_width; ++i) {
+    int ix = x >> 16;  // First source column of this box.
+    x += dx;
+    boxwidth = MIN1((x >> 16) - ix);  // Columns spanned, at least 1.
+    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+  }
+}
+
+static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
+                               const uint32* src_ptr, uint16* dst_ptr) {
+  int i;
+  int scaletbl[2];  // 65536 / box area for widths minboxwidth and +1.
+  int minboxwidth = dx >> 16;  // Integer part of the 16.16 step.
+  int* scaleptr = scaletbl - minboxwidth;  // So boxwidth indexes scaletbl.
+  int boxwidth;
+  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+  for (i = 0; i < dst_width; ++i) {
+    int ix = x >> 16;  // First source column of this box.
+    x += dx;
+    boxwidth = MIN1((x >> 16) - ix);  // Columns spanned, at least 1.
+    *dst_ptr++ =
+        SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+  }
+}
+
+static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  const uint16* column = src_ptr + (x >> 16);  // Integer part of 16.16 x.
+  const int reciprocal = 65536 / boxheight;    // Scale applied to each sum.
+  int remaining;
+  for (remaining = dst_width; remaining > 0; --remaining) {
+    *dst_ptr++ = *column++ * reciprocal >> 16;
+  }
+}
+
+static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  const int span = MIN1(dx >> 16);  // Whole source columns per output pixel.
+  const int reciprocal = 65536 / (span * boxheight);  // 65536 / box area.
+  int column = x >> 16;  // Integer part of the 16.16 start position.
+  int remaining;
+  for (remaining = dst_width; remaining > 0; --remaining) {
+    *dst_ptr++ = SumPixels(span, src_ptr + column) * reciprocal >> 16;
+    column += span;
+  }
+}
+
+static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
+                               const uint32* src_ptr, uint16* dst_ptr) {
+  int boxwidth = MIN1(dx >> 16);  // Whole source columns per output pixel.
+  int scaleval = 65536 / (boxwidth * boxheight);  // 65536 / box area.
+  int i;
+  x >>= 16;  // x arrives as 16.16 fixed point; keep only the column index.
+  for (i = 0; i < dst_width; ++i, x += boxwidth) {
+    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16;
+  }
+}
+
+// Scale plane down to any dimensions, with interpolation.
+// (boxfilter).
+//
+// Same method as SimpleScale, which is fixed point, outputting
+// one pixel of destination using fixed point (16.16) to step
+// through source, sampling a box of pixel with simple
+// averaging.
+static void ScalePlaneBox(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          int src_stride, int dst_stride,
+                          const uint8* src_ptr, uint8* dst_ptr) {
+  int j, k;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height << 16);  // Upper clamp for y, in 16.16.
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);  // Buffer sizes below need the magnitude.
+  {
+    // Allocate a row buffer of uint16.
+    align_buffer_64(row16, src_width * 2);
+    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+        const uint16* src_ptr, uint8* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_C:
+        ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
+    void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
+        ScaleAddRow_C;
+#if defined(HAS_SCALEADDROW_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2)) {
+      ScaleAddRow = ScaleAddRow_Any_SSE2;
+      if (IS_ALIGNED(src_width, 16)) {
+        ScaleAddRow = ScaleAddRow_SSE2;
+      }
+    }
+#endif
+#if defined(HAS_SCALEADDROW_AVX2)
+    if (TestCpuFlag(kCpuHasAVX2)) {
+      ScaleAddRow = ScaleAddRow_Any_AVX2;
+      if (IS_ALIGNED(src_width, 32)) {
+        ScaleAddRow = ScaleAddRow_AVX2;
+      }
+    }
+#endif
+#if defined(HAS_SCALEADDROW_NEON)
+    if (TestCpuFlag(kCpuHasNEON)) {
+      ScaleAddRow = ScaleAddRow_Any_NEON;
+      if (IS_ALIGNED(src_width, 16)) {
+        ScaleAddRow = ScaleAddRow_NEON;
+      }
+    }
+#endif
+    // Per output row: add boxheight source rows into row16, then scale columns.
+    for (j = 0; j < dst_height; ++j) {
+      int boxheight;
+      int iy = y >> 16;  // First source row of this box.
+      const uint8* src = src_ptr + iy * src_stride;
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
+      boxheight = MIN1((y >> 16) - iy);  // Rows spanned, at least 1.
+      memset(row16, 0, src_width * 2);
+      for (k = 0; k < boxheight; ++k) {
+        ScaleAddRow(src, (uint16 *)(row16), src_width);
+        src += src_stride;
+      }
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
+      dst_ptr += dst_stride;
+    }
+    free_aligned_buffer_64(row16);
+  }
+}
+
+static void ScalePlaneBox_16(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint16* src_ptr, uint16* dst_ptr) {
+  int j, k;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height << 16);  // Upper clamp for y, in 16.16.
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);  // Buffer sizes below need the magnitude.
+  {
+    // Allocate a row buffer of uint32.
+    align_buffer_64(row32, src_width * 4);
+    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+        const uint32* src_ptr, uint16* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
+    void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
+        ScaleAddRow_16_C;
+
+#if defined(HAS_SCALEADDROW_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
+      ScaleAddRow = ScaleAddRow_16_SSE2;
+    }
+#endif
+    // Per output row: add boxheight source rows into row32, then scale columns.
+    for (j = 0; j < dst_height; ++j) {
+      int boxheight;
+      int iy = y >> 16;  // First source row of this box.
+      const uint16* src = src_ptr + iy * src_stride;
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
+      boxheight = MIN1((y >> 16) - iy);  // Rows spanned, at least 1.
+      memset(row32, 0, src_width * 4);
+      for (k = 0; k < boxheight; ++k) {
+        ScaleAddRow(src, (uint32 *)(row32), src_width);
+        src += src_stride;
+      }
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
+      dst_ptr += dst_stride;
+    }
+    free_aligned_buffer_64(row32);
+  }
+}
+
+// Scale plane down with bilinear interpolation.
+void ScalePlaneBilinearDown(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row buffer. src_width is still signed here, so size it by Abs().
+  align_buffer_64(row, Abs(src_width));
+
+  const int max_y = (src_height - 1) << 16;  // Last source row, in 16.16.
+  int j;
+  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+      int dst_width, int x, int dx) =
+      (Abs(src_width) >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);  // Width checks below use the magnitude.
+
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+    if (IS_ALIGNED(src_width, 4)) {
+      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  // SIMD column filters are only selected for source widths below 32768.
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleFilterCols_NEON;
+    }
+  }
+#endif
+  if (y > max_y) {
+    y = max_y;  // Keep the starting row inside the source.
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    int yi = y >> 16;  // Integer source row.
+    const uint8* src = src_ptr + yi * src_stride;
+    if (filtering == kFilterLinear) {
+      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);  // Horizontal only.
+    } else {
+      int yf = (y >> 8) & 255;  // Top 8 bits of the 16.16 fraction.
+      InterpolateRow(row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
+    }
+    dst_ptr += dst_stride;
+    y += dy;
+    if (y > max_y) {
+      y = max_y;
+    }
+  }
+  free_aligned_buffer_64(row);
+}
+
+void ScalePlaneBilinearDown_16(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr,
+                               enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row buffer. src_width is not normalized yet, so size by Abs().
+  align_buffer_64(row, Abs(src_width) * 2);
+
+  const int max_y = (src_height - 1) << 16;  // Last source row, 16.16.
+  int j;
+  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+      int dst_width, int x, int dx) =
+      (Abs(src_width) >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
+  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);  // Only the magnitude is used from here on.
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+    if (IS_ALIGNED(src_width, 4)) {
+      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+    }
+  }
+#endif
+
+
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+  }
+#endif
+  if (y > max_y) {  // Clamp the starting row to the last source row.
+    y = max_y;
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    int yi = y >> 16;  // Integer part of y: source row index.
+    const uint16* src = src_ptr + yi * src_stride;
+    if (filtering == kFilterLinear) {
+      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);  // Row buffer unused.
+    } else {
+      int yf = (y >> 8) & 255;  // Top 8 bits of the 16 bit fraction of y.
+      InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
+    }
+    dst_ptr += dst_stride;
+    y += dy;
+    if (y > max_y) {
+      y = max_y;
+    }
+  }
+  free_aligned_buffer_64(row);
+}
+
+// Scale plane up with bilinear interpolation.
+void ScalePlaneBilinearUp(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          int src_stride, int dst_stride,
+                          const uint8* src_ptr, uint8* dst_ptr,
+                          enum FilterMode filtering) {
+  int j;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height - 1) << 16;  // Last source row, 16.16.
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleFilterCols_C : ScaleCols_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);  // Only the magnitude is used from here on.
+
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  if (filtering && src_width >= 32768) {
+    ScaleFilterCols = ScaleFilterCols64_C;
+  }
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleFilterCols_NEON;
+    }
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleFilterCols = ScaleColsUp2_C;  // Unfiltered exact 2x width.
+#if defined(HAS_SCALECOLS_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleColsUp2_SSE2;
+    }
+#endif
+  }
+
+  if (y > max_y) {  // Clamp the starting row to the last source row.
+    y = max_y;
+  }
+  {
+    int yi = y >> 16;  // Integer part of y: source row index.
+    const uint8* src = src_ptr + yi * src_stride;
+
+    // Allocate 2 row buffers, each kRowSize (dst_width rounded up to 32) bytes.
+    const int kRowSize = (dst_width + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+
+    uint8* rowptr = row;
+    int rowstride = kRowSize;
+    int lasty = yi;  // Source row currently held at rowptr.
+
+    ScaleFilterCols(rowptr, src, dst_width, x, dx);
+    if (src_height > 1) {  // With a single source row both buffers get row 0.
+      src += src_stride;
+    }
+    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+    src += src_stride;
+
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        if (y > max_y) {  // Past the last row: clamp and re-point src at it.
+          y = max_y;
+          yi = y >> 16;
+          src = src_ptr + yi * src_stride;
+        }
+        if (yi != lasty) {
+          ScaleFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;
+          rowstride = -rowstride;  // rowptr now alternates between buffers.
+          lasty = yi;
+          src += src_stride;
+        }
+      }
+      if (filtering == kFilterLinear) {
+        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);  // Zero y fraction.
+      } else {
+        int yf = (y >> 8) & 255;  // Top 8 bits of the 16 bit fraction of y.
+        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+      }
+      dst_ptr += dst_stride;
+      y += dy;
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+void ScalePlaneBilinearUp_16(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint16* src_ptr, uint16* dst_ptr,
+                             enum FilterMode filtering) {
+  int j;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height - 1) << 16;  // Last source row, 16.16.
+  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);  // Only the magnitude is used from here on.
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  if (filtering && src_width >= 32768) {
+    ScaleFilterCols = ScaleFilterCols64_16_C;
+  }
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleFilterCols = ScaleColsUp2_16_C;  // Unfiltered exact 2x width.
+#if defined(HAS_SCALECOLS_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleColsUp2_16_SSE2;
+    }
+#endif
+  }
+
+  if (y > max_y) {  // Clamp the starting row to the last source row.
+    y = max_y;
+  }
+  {
+    int yi = y >> 16;  // Integer part of y: source row index.
+    const uint16* src = src_ptr + yi * src_stride;
+
+    // Allocate 2 row buffers, each kRowSize uint16 elements apart.
+    const int kRowSize = (dst_width + 31) & ~31;
+    align_buffer_64(row, kRowSize * 4);
+
+    uint16* rowptr = (uint16*)row;
+    int rowstride = kRowSize;
+    int lasty = yi;  // Source row currently held at rowptr.
+
+    ScaleFilterCols(rowptr, src, dst_width, x, dx);
+    if (src_height > 1) {  // With a single source row both buffers get row 0.
+      src += src_stride;
+    }
+    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+    src += src_stride;
+
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        if (y > max_y) {  // Past the last row: clamp and re-point src at it.
+          y = max_y;
+          yi = y >> 16;
+          src = src_ptr + yi * src_stride;
+        }
+        if (yi != lasty) {
+          ScaleFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;
+          rowstride = -rowstride;  // rowptr now alternates between buffers.
+          lasty = yi;
+          src += src_stride;
+        }
+      }
+      if (filtering == kFilterLinear) {
+        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);  // Zero y fraction.
+      } else {
+        int yf = (y >> 8) & 255;  // Top 8 bits of the 16 bit fraction of y.
+        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+      }
+      dst_ptr += dst_stride;
+      y += dy;
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+// Scale Plane to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fractional part.
+
+static void ScalePlaneSimple(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint8* src_ptr, uint8* dst_ptr) {
+  int i;
+  void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
+      int dst_width, int x, int dx) = ScaleCols_C;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);  // Only the magnitude is used from here on.
+
+  if (src_width * 2 == dst_width && x < 0x8000) {  // Exact 2x width, x < 0.5.
+    ScaleCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleCols = ScaleColsUp2_SSE2;
+    }
+#endif
+  }
+
+  for (i = 0; i < dst_height; ++i) {
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+    dst_ptr += dst_stride;
+    y += dy;  // Advance the 16.16 source row position.
+  }
+}
+
+static void ScalePlaneSimple_16(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint16* src_ptr, uint16* dst_ptr) {
+  int i;
+  void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr,
+      int dst_width, int x, int dx) = ScaleCols_16_C;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);  // Only the magnitude is used from here on.
+
+  if (src_width * 2 == dst_width && x < 0x8000) {  // Exact 2x width, x < 0.5.
+    ScaleCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleCols = ScaleColsUp2_16_SSE2;
+    }
+#endif
+  }
+
+  for (i = 0; i < dst_height; ++i) {
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
+              dst_width, x, dx);
+    dst_ptr += dst_stride;
+    y += dy;  // Advance the 16.16 source row position.
+  }
+}
+
+// Scale a plane of 8 bit samples.
+// This function dispatches to a specialized scaler based on scale factor.
+
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+                int src_width, int src_height,
+                uint8* dst, int dst_stride,
+                int dst_width, int dst_height,
+                enum FilterMode filtering) {
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height, filtering);
+
+  // Negative height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;  // Start at the last row.
+    src_stride = -src_stride;  // Walk the source rows bottom to top.
+  }
+
+  // Use specialized scales to improve performance for common resolutions.
+  // For example, all the 1/2 scalings will use ScalePlaneDown2()
+  if (dst_width == src_width && dst_height == src_height) {
+    // Straight copy.
+    CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
+    return;
+  }
+  if (dst_width == src_width && filtering != kFilterBox) {
+    int dy = FixedDiv(src_height, dst_height);
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical(src_height,
+                       dst_width, dst_height,
+                       src_stride, dst_stride, src, dst,
+                       0, 0, dy, 1, filtering);
+    return;
+  }
+  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+    // Scale down.
+    if (4 * dst_width == 3 * src_width &&
+        4 * dst_height == 3 * src_height) {
+      // optimized, 3/4
+      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+      // optimized, 1/2
+      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    // 3/8 rounded up for odd sized chroma height.
+    if (8 * dst_width == 3 * src_width &&
+        dst_height == ((src_height * 3 + 7) / 8)) {
+      // optimized, 3/8
+      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+        (filtering == kFilterBox || filtering == kFilterNone)) {
+      // optimized, 1/4
+      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+  }
+  if (filtering == kFilterBox && dst_height * 2 < src_height) {
+    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+                  src_stride, dst_stride, src, dst);
+    return;
+  }
+  if (filtering && dst_height > src_height) {
+    ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  if (filtering) {
+    ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+                           src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+                   src_stride, dst_stride, src, dst);  // filtering is 0 here.
+}
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+                  int src_width, int src_height,
+                  uint16* dst, int dst_stride,
+                  int dst_width, int dst_height,
+                  enum FilterMode filtering) {
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height, filtering);
+
+  // Negative height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;  // Start at the last row.
+    src_stride = -src_stride;  // Walk the source rows bottom to top.
+  }
+
+  // Use specialized scales to improve performance for common resolutions.
+  // For example, all the 1/2 scalings will use ScalePlaneDown2_16()
+  if (dst_width == src_width && dst_height == src_height) {
+    // Straight copy.
+    CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
+    return;
+  }
+  if (dst_width == src_width && filtering != kFilterBox) {
+    int dy = FixedDiv(src_height, dst_height);
+    // Arbitrary scale vertically, but unscaled horizontally (non-box only).
+    ScalePlaneVertical_16(src_height,
+                          dst_width, dst_height,
+                          src_stride, dst_stride, src, dst,
+                          0, 0, dy, 1, filtering);
+    return;
+  }
+  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+    // Scale down.
+    if (4 * dst_width == 3 * src_width &&
+        4 * dst_height == 3 * src_height) {
+      // optimized, 3/4
+      ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
+                          src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+      // optimized, 1/2
+      ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    // 3/8 rounded up for odd sized chroma height.
+    if (8 * dst_width == 3 * src_width &&
+        dst_height == ((src_height * 3 + 7) / 8)) {
+      // optimized, 3/8
+      ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
+                          src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+        (filtering == kFilterBox || filtering == kFilterNone)) {
+      // optimized, 1/4 (same filter condition as the 8 bit ScalePlane)
+      ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+  }
+  if (filtering == kFilterBox && dst_height * 2 < src_height) {
+    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
+                     src_stride, dst_stride, src, dst);
+    return;
+  }
+  if (filtering && dst_height > src_height) {
+    ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
+                            src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  if (filtering) {
+    ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
+                              src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst);  // filtering is 0.
+}
+
+// Scale an I420 image. Returns 0 on success, -1 on invalid arguments.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int I420Scale(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              int src_width, int src_height,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int dst_width, int dst_height,
+              enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 ||
+      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+    return -1;  // Negative source dimensions are not rejected here.
+  }
+
+  ScalePlane(src_y, src_stride_y, src_width, src_height,
+             dst_y, dst_stride_y, dst_width, dst_height,
+             filtering);
+  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+             filtering);
+  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+             filtering);
+  return 0;
+}
+
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+                 const uint16* src_u, int src_stride_u,
+                 const uint16* src_v, int src_stride_v,
+                 int src_width, int src_height,
+                 uint16* dst_y, int dst_stride_y,
+                 uint16* dst_u, int dst_stride_u,
+                 uint16* dst_v, int dst_stride_v,
+                 int dst_width, int dst_height,
+                 enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 ||
+      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+    return -1;  // Invalid argument; negative source sizes are not rejected.
+  }
+
+  ScalePlane_16(src_y, src_stride_y, src_width, src_height,
+                dst_y, dst_stride_y, dst_width, dst_height,
+                filtering);
+  ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
+                dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+                filtering);
+  ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
+                dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+                filtering);
+  return 0;  // All three planes scaled.
+}
+
+// Deprecated api. Forwards to I420Scale(); interpolate selects kFilterBox.
+LIBYUV_API
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+          int src_stride_y, int src_stride_u, int src_stride_v,
+          int src_width, int src_height,
+          uint8* dst_y, uint8* dst_u, uint8* dst_v,
+          int dst_stride_y, int dst_stride_u, int dst_stride_v,
+          int dst_width, int dst_height,
+          LIBYUV_BOOL interpolate) {
+  return I420Scale(src_y, src_stride_y,
+                   src_u, src_stride_u,
+                   src_v, src_stride_v,
+                   src_width, src_height,
+                   dst_y, dst_stride_y,
+                   dst_u, dst_stride_u,
+                   dst_v, dst_stride_v,
+                   dst_width, dst_height,
+                   interpolate ? kFilterBox : kFilterNone);
+}
+
+// Deprecated api. Scales packed I420 (Y, U, V planes back to back in src/dst).
+LIBYUV_API
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+                LIBYUV_BOOL interpolate) {
+  // Chroma requires offset to multiple of 2.
+  int dst_yoffset_even = dst_yoffset & ~1;
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  int aheight = dst_height - dst_yoffset_even * 2;  // actual output height
+  const uint8* src_y = src;
+  const uint8* src_u = src + src_width * src_height;
+  const uint8* src_v = src + src_width * src_height +
+                             src_halfwidth * src_halfheight;
+  uint8* dst_y = dst + dst_yoffset_even * dst_width;  // Skip offset rows.
+  uint8* dst_u = dst + dst_width * dst_height +
+                 (dst_yoffset_even >> 1) * dst_halfwidth;
+  uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
+                 (dst_yoffset_even >> 1) * dst_halfwidth;
+  if (!src || src_width <= 0 || src_height <= 0 ||
+      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
+      dst_yoffset_even >= dst_height) {
+    return -1;  // Invalid argument.
+  }
+  return I420Scale(src_y, src_width,
+                   src_u, src_halfwidth,
+                   src_v, src_halfwidth,
+                   src_width, src_height,
+                   dst_y, dst_width,
+                   dst_u, dst_halfwidth,
+                   dst_v, dst_halfwidth,
+                   dst_width, aheight,
+                   interpolate ? kFilterBox : kFilterNone);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/scale_any.cc b/libs/libvpx/third_party/libyuv/source/scale_any.cc
new file mode 100644
index 0000000000..2f6a2c8baf
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/scale_any.cc
@@ -0,0 +1,200 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Any-width wrappers for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols: TERP_SIMD does the first dst_width & ~MASK pixels, TERP_C the remaining dst_width & MASK (dst advanced by n * BPP, x by n * dx).
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
+    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
+                 int dst_width, int x, int dx) {                               \
+      int n = dst_width & ~MASK;                                               \
+      if (n > 0) {                                                             \
+        TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                 \
+      }                                                                        \
+      TERP_C(dst_ptr + n * BPP, src_ptr,                                       \
+             dst_width & MASK, x + n * dx, dx);                                \
+    }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)  // BPP 1; SIMD part is a multiple of 8 pixels
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)  // BPP 4; SIMD part is a multiple of 8 pixels
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
+     ScaleARGBFilterCols_C, 4, 3)  // BPP 4; SIMD part is a multiple of 4 pixels
+#endif
+#undef CANY
+
+// Fixed scale down, any width: SCALEROWDOWN_SIMD writes the first n = dst_width - dst_width % (MASK + 1) pixels, SCALEROWDOWN_C the last r; the source is advanced by n * FACTOR pixels of BPP bytes.
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
+                 uint8* dst_ptr, int dst_width) {                              \
+      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
+      int n = dst_width - r;                                                   \
+      if (n > 0) {                                                             \
+        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
+      }                                                                        \
+      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
+                     dst_ptr + n * BPP, r);                                    \
+    }
+
+#ifdef HAS_SCALEROWDOWN2_SSE2
+SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
+      ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
+      2, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN2_AVX2
+SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
+      ScaleRowDown2Linear_C, 2, 1, 31)
+SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
+      2, 1, 31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_NEON
+SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
+      ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_C, 2, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_SSE2
+SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C,
+      4, 1, 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_AVX2
+SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
+      4, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_NEON
+SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
+      4, 1, 7)
+#endif
+#ifdef HAS_SCALEROWDOWN34_SSSE3
+SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
+      ScaleRowDown34_C, 4 / 3, 1, 23)  // FACTOR is unparenthesized: (n * 4 / 3) is exact because n is a multiple of 24
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
+      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
+      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_NEON
+SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
+      ScaleRowDown34_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
+      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
+      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif
+#ifdef HAS_SCALEROWDOWN38_SSSE3
+SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
+      ScaleRowDown38_C, 8 / 3, 1, 11)  // (n * 8 / 3) is exact because n is a multiple of 12
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
+      ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)  // n is a multiple of 6 here, still divisible by 3
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
+      ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
+#endif
+#ifdef HAS_SCALEROWDOWN38_NEON
+SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
+      ScaleRowDown38_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
+      ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
+      ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2_SSE2
+SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
+      ScaleARGBRowDown2_C, 2, 4, 3)  // ARGB rows: BPP is 4
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
+      ScaleARGBRowDown2Linear_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
+      ScaleARGBRowDown2Box_C, 2, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
+      ScaleARGBRowDown2_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
+      ScaleARGBRowDown2Linear_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
+      ScaleARGBRowDown2Box_C, 2, 4, 7)
+#endif
+#undef SDANY
+
+// Scale down by even scale factor, any width: SCALEROWDOWN_SIMD writes the first n = dst_width - dst_width % (MASK + 1) pixels, SCALEROWDOWN_C the last r; the source is advanced by n * src_stepx pixels of BPP bytes.
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)          \
+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx,    \
+                 uint8* dst_ptr, int dst_width) {                              \
+      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
+      int n = dst_width - r;                                                   \
+      if (n > 0) {                                                             \
+        SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);         \
+      }                                                                        \
+      SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride,              \
+                     src_stepx, dst_ptr + n * BPP, r);                         \
+    }
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
+SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
+       ScaleARGBRowDownEven_C, 4, 3)  // BPP 4; SIMD part is a multiple of 4 pixels
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
+       ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
+       ScaleARGBRowDownEven_C, 4, 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
+       ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif
+#undef SDAANY
+// Add rows box filter scale down, any width: SCALEADDROW_SIMD handles the first src_width & ~MASK elements, SCALEADDROW_C the remaining src_width & MASK (src is uint8, dst is uint16, both advanced by n elements).
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)                  \
+  void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) {         \
+      int n = src_width & ~MASK;                                               \
+      if (n > 0) {                                                             \
+        SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                                 \
+      }                                                                        \
+      SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);               \
+    }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)  // SIMD part is a multiple of 16
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)  // SIMD part is a multiple of 32
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)  // SIMD part is a multiple of 16
+#endif
+#undef SAANY
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+
+
+
+
diff --git a/libs/libvpx/third_party/libyuv/source/scale_argb.cc b/libs/libvpx/third_party/libyuv/source/scale_argb.cc
new file mode 100644
index 0000000000..40a2d1ab20
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/scale_argb.cc
@@ -0,0 +1,853 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {  // magnitude of v; no special case for INT_MIN
+  return v < 0 ? -v : v;
+}
+
+// ScaleARGB ARGB, 1/2
+// Optimized path for scaling ARGB down to 1/2 of its original width; x, dx, y
+// and dy are 16.16 fixed point. src_width and src_height are not used here.
+static void ScaleARGBDown2(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint8* src_argb, uint8* dst_argb,
+                           int x, int dx, int y, int dy,
+                           enum FilterMode filtering) {
+  int j;
+  int row_stride = src_stride * (dy >> 16);  // source bytes per output row
+  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) =
+    filtering == kFilterNone ? ScaleARGBRowDown2_C :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+        ScaleARGBRowDown2Box_C);  // any other filter mode uses the box row scaler
+  assert(dx == 65536 * 2);  // Test scale factor of 2.
+  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
+  // Advance to odd row, even column.
+  if (filtering == kFilterBilinear) {
+    src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  } else {
+    src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;  // NOTE(review): starts one pixel left of x; presumably compensates the caller's x bias - confirm
+  }
+
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
+        ScaleARGBRowDown2Box_Any_SSE2);
+    if (IS_ALIGNED(dst_width, 4)) {  // exact-width variant needs a multiple of 4
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+          ScaleARGBRowDown2Box_SSE2);
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
+        ScaleARGBRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 8)) {  // exact-width variant needs a multiple of 8
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+          ScaleARGBRowDown2Box_NEON);
+    }
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;  // row function gets stride 0; row_stride above keeps the real stride
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+}
+
+// ScaleARGB ARGB, 1/4
+// Optimized path for scaling ARGB down to 1/4 of its original width: every
+// output row is produced by three passes of a 1/2 box row scaler.
+static void ScaleARGBDown4Box(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_argb, uint8* dst_argb,
+                              int x, int dx, int y, int dy) {
+  int j;
+  // Allocate 2 rows of ARGB, each dst_width * 2 pixels rounded up to 32 bytes.
+  const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
+  align_buffer_64(row, kRowSize * 2);
+  int row_stride = src_stride * (dy >> 16);  // source bytes per output row
+  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+    uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+  // Advance to odd row, even column.
+  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  assert(dx == 65536 * 4);  // Test scale factor of 4.
+  assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+    }
+  }
+#endif
+
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);  // first half-width intermediate row
+    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
+                      row + kRowSize, dst_width * 2);  // second one, starting two source rows down
+    ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);  // halve the two intermediate rows into dst
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+// ScaleARGB ARGB Even
+// Optimized path for scaling ARGB down by an integer step: col_step (dx >> 16)
+// source pixels per output pixel and dy >> 16 source rows per output row.
+static void ScaleARGBDownEven(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_argb, uint8* dst_argb,
+                              int x, int dx, int y, int dy,
+                              enum FilterMode filtering) {
+  int j;
+  int col_step = dx >> 16;  // integer part of the 16.16 horizontal step
+  int row_stride = (dy >> 16) * src_stride;  // source bytes per output row
+  void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_step, uint8* dst_argb, int dst_width) =
+      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;  // any nonzero filter mode selects the box variant
+  assert(IS_ALIGNED(src_width, 2));
+  assert(IS_ALIGNED(src_height, 2));
+  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;  // start at the source pixel addressed by (x, y)
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
+        ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+          ScaleARGBRowDownEven_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
+        ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+          ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;  // row function gets stride 0; row_stride above keeps the real stride
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+}
+
+// Scale ARGB down with bilinear interpolation: clip the source columns that are sampled, blend two source rows into a temporary row, then filter that row horizontally into dst.
+static void ScaleARGBBilinearDown(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  int src_stride, int dst_stride,
+                                  const uint8* src_argb, uint8* dst_argb,
+                                  int x, int dx, int y, int dy,
+                                  enum FilterMode filtering) {
+  int j;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+  int64 xlast = x + (int64)(dst_width - 1) * dx;  // source x of the last output pixel
+  int64 xl = (dx >= 0) ? x : xlast;  // leftmost source x; dx may be negative
+  int64 xr = (dx >= 0) ? xlast : x;  // rightmost source x
+  int clip_src_width;
+  xl = (xl >> 16) & ~3;  // Left edge aligned.
+  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
+  xr = (xr + 1 + 3) & ~3;  // 1 beyond 4 pixel aligned right most pixel.
+  if (xr > src_width) {
+    xr = src_width;
+  }
+  clip_src_width = (int)(xr - xl) * 4;  // Clipped source width in bytes (4 per pixel).
+  src_argb += xl * 4;  // start reading at the clipped left edge
+  x -= (int)(xl << 16);  // rebase x onto the clipped row
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(clip_src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+    if (IS_ALIGNED(clip_src_width, 4)) {
+      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {  // wide sources keep the 64 bit C version chosen above
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {  // NOTE(review): no src_width < 32768 guard here, unlike the SSSE3 path - confirm intended
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row of ARGB. NOTE(review): clip_src_width is already in bytes, so "* 4" looks like an over-allocation - confirm.
+  {
+    align_buffer_64(row, clip_src_width * 4);
+
+    const int max_y = (src_height - 1) << 16;  // last source row in 16.16
+    if (y > max_y) {
+      y = max_y;
+    }
+    for (j = 0; j < dst_height; ++j) {
+      int yi = y >> 16;  // integer source row
+      const uint8* src = src_argb + yi * src_stride;
+      if (filtering == kFilterLinear) {
+        ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);  // horizontal filtering only, straight from the source row
+      } else {
+        int yf = (y >> 8) & 255;  // top 8 bits of the fractional row position
+        InterpolateRow(row, src, src_stride, clip_src_width, yf);
+        ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+      }
+      dst_argb += dst_stride;
+      y += dy;
+      if (y > max_y) {
+        y = max_y;  // clamp so yi never passes the last source row
+      }
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+// Scale ARGB up with bilinear interpolation: source rows are scaled horizontally into a two-row cache, and each output row is interpolated vertically from that cache.
+static void ScaleARGBBilinearUp(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint8* src_argb, uint8* dst_argb,
+                                int x, int dx, int y, int dy,
+                                enum FilterMode filtering) {
+  int j;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;  // unfiltered mode uses the point-sampling column scaler
+  const int max_y = (src_height - 1) << 16;  // last source row in 16.16
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_MIPS_DSPR2;
+  }
+#endif
+  if (src_width >= 32768) {  // wide sources use the 64 bit C column scalers
+    ScaleARGBFilterCols = filtering ?
+        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {  // exact 2x unfiltered upscale gets a dedicated scaler
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  {
+    int yi = y >> 16;  // integer source row
+    const uint8* src = src_argb + yi * src_stride;
+
+    // Allocate 2 rows of ARGB, each dst_width pixels rounded up to 32 bytes.
+    const int kRowSize = (dst_width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+
+    uint8* rowptr = row;  // row passed to InterpolateRow as its first source row
+    int rowstride = kRowSize;  // signed offset from rowptr to the other cached row
+    int lasty = yi;  // source row index the cache was last filled for
+
+    ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);  // prime the first cached row
+    if (src_height > 1) {
+      src += src_stride;  // a single-row source reuses the same row for both cache rows
+    }
+    ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);  // prime the second cached row
+    src += src_stride;
+
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        if (y > max_y) {
+          y = max_y;  // clamp to the last source row and re-derive src from it
+          yi = y >> 16;
+          src = src_argb + yi * src_stride;
+        }
+        if (yi != lasty) {
+          ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);  // overwrite the older cached row
+          rowptr += rowstride;  // swap the two cached rows...
+          rowstride = -rowstride;  // ...by flipping the sign of the offset
+          lasty = yi;
+          src += src_stride;
+        }
+      }
+      if (filtering == kFilterLinear) {
+        InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);  // stride 0 and fraction 0: no vertical blend requested
+      } else {
+        int yf = (y >> 8) & 255;  // top 8 bits of the fractional row position
+        InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+      }
+      dst_argb += dst_stride;
+      y += dy;
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+#ifdef YUVSCALEUP
+// Scale YUV to ARGB up with bilinear interpolation (only built when YUVSCALEUP is defined): source rows are converted with I422ToARGBRow, scaled horizontally into a two-row cache, then interpolated vertically.
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
+                                     int dst_width, int dst_height,
+                                     int src_stride_y,
+                                     int src_stride_u,
+                                     int src_stride_v,
+                                     int dst_stride_argb,
+                                     const uint8* src_y,
+                                     const uint8* src_u,
+                                     const uint8* src_v,
+                                     uint8* dst_argb,
+                                     int x, int dx, int y, int dy,
+                                     enum FilterMode filtering) {
+  int j;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(src_width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    InterpolateRow = InterpolateRow_MIPS_DSPR2;
+  }
+#endif
+
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+  if (src_width >= 32768) {  // wide sources use the 64 bit C column scalers
+    ScaleARGBFilterCols = filtering ?
+        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {  // exact 2x unfiltered upscale gets a dedicated scaler
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  const int max_y = (src_height - 1) << 16;  // last source row in 16.16
+  if (y > max_y) {
+    y = max_y;
+  }
+  const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
+  int yi = y >> 16;  // integer luma row
+  int uv_yi = yi >> kYShift;  // matching chroma row
+  const uint8* src_row_y = src_y + yi * src_stride_y;
+  const uint8* src_row_u = src_u + uv_yi * src_stride_u;
+  const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+
+  // Allocate 2 rows of ARGB, each dst_width pixels rounded up to 32 bytes.
+  const int kRowSize = (dst_width * 4 + 31) & ~31;
+  align_buffer_64(row, kRowSize * 2);
+
+  // Allocate 1 row of ARGB for source conversion.
+  align_buffer_64(argb_row, src_width * 4);
+
+  uint8* rowptr = row;  // row passed to InterpolateRow as its first source row
+  int rowstride = kRowSize;  // signed offset from rowptr to the other cached row
+  int lasty = yi;  // luma row index the cache was last filled for
+
+  // TODO(fbarchard): Convert first 2 rows of YUV to ARGB. NOTE(review): until then the cache is primed from raw Y rows.
+  ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
+  if (src_height > 1) {
+    src_row_y += src_stride_y;
+    if (yi & 1) {  // chroma pointers advance only after an odd luma row
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+  ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
+  if (src_height > 2) {
+    src_row_y += src_stride_y;
+    if (!(yi & 1)) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lasty) {
+      if (y > max_y) {
+        y = max_y;  // clamp to the last source row and re-derive all three plane pointers
+        yi = y >> 16;
+        uv_yi = yi >> kYShift;
+        src_row_y = src_y + yi * src_stride_y;
+        src_row_u = src_u + uv_yi * src_stride_u;
+        src_row_v = src_v + uv_yi * src_stride_v;
+      }
+      if (yi != lasty) {
+        // TODO(fbarchard): Convert the clipped region of row.
+        I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+        ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);  // overwrite the older cached row
+        rowptr += rowstride;  // swap the two cached rows...
+        rowstride = -rowstride;  // ...by flipping the sign of the offset
+        lasty = yi;
+        src_row_y += src_stride_y;
+        if (yi & 1) {
+          src_row_u += src_stride_u;
+          src_row_v += src_stride_v;
+        }
+      }
+    }
+    if (filtering == kFilterLinear) {
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);  // stride 0 and fraction 0: no vertical blend requested
+    } else {
+      int yf = (y >> 8) & 255;  // top 8 bits of the fractional row position
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  free_aligned_buffer_64(argb_row);  // must name the buffer allocated above (was the undeclared row_argb)
+}
+#endif
+
+// Scale ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: the upper 16 bits
+// of x, dx, y and dy are the integer part of the source position or step
+// and the lower 16 bits are the fractional part.
+
+static void ScaleARGBSimple(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_argb, uint8* dst_argb,
+                            int x, int dx, int y, int dy) {
+  int j;
+  void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;  // wide sources use the 64 bit C version
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  if (src_width * 2 == dst_width && x < 0x8000) {  // exact 2x upscale gets a dedicated scaler
+    ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,  // source row is the integer part of y
+                  dst_width, x, dx);
+    dst_argb += dst_stride;
+    y += dy;  // NOTE(review): y is not clamped to src_height here; presumably the caller keeps it in range - confirm
+  }
+}
+
+// Scale an ARGB image, writing only the clip rectangle of the destination.
+// Derives 16.16 fixed-point start/step values and then dispatches to the
+// scaling routine suited to those steps and the requested filtering.
+static void ScaleARGB(const uint8* src, int src_stride,
+                      int src_width, int src_height,
+                      uint8* dst, int dst_stride,
+                      int dst_width, int dst_height,
+                      int clip_x, int clip_y, int clip_width, int clip_height,
+                      enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // ARGB does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height,
+                                filtering);
+
+  // Negative src_height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;  // Start at the last source row...
+    src_stride = -src_stride;  // ...and step backwards through the rows.
+  }
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);  // Only the magnitude is used from here on.
+  if (clip_x) {
+    int64 clipf = (int64)(clip_x) * dx;  // Source offset of the clip origin, 16.16.
+    x += (clipf & 0xffff);  // Fractional part stays in x...
+    src += (clipf >> 16) * 4;  // ...whole pixels advance src (4 bytes per pixel).
+    dst += clip_x * 4;
+  }
+  if (clip_y) {
+    int64 clipf = (int64)(clip_y) * dy;  // Same split for the vertical offset.
+    y += (clipf & 0xffff);
+    src += (clipf >> 16) * src_stride;
+    dst += clip_y * dst_stride;
+  }
+
+  // Special case for integer step values.
+  if (((dx | dy) & 0xffff) == 0) {
+    if (!dx || !dy) {  // 1 pixel wide and/or tall.
+      filtering = kFilterNone;
+    } else {
+      // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+      if (!(dx & 0x10000) && !(dy & 0x10000)) {
+        if (dx == 0x20000) {
+          // Optimized 1/2 downsample.
+          ScaleARGBDown2(src_width, src_height,
+                         clip_width, clip_height,
+                         src_stride, dst_stride, src, dst,
+                         x, dx, y, dy, filtering);
+          return;
+        }
+        if (dx == 0x40000 && filtering == kFilterBox) {
+          // Optimized 1/4 box downsample.
+          ScaleARGBDown4Box(src_width, src_height,
+                            clip_width, clip_height,
+                            src_stride, dst_stride, src, dst,
+                            x, dx, y, dy);
+          return;
+        }
+        ScaleARGBDownEven(src_width, src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src, dst,
+                          x, dx, y, dy, filtering);
+        return;
+      }
+      // Optimized odd scale down. ie 3, 5, 7, 9x.
+      if ((dx & 0x10000) && (dy & 0x10000)) {
+        filtering = kFilterNone;
+        if (dx == 0x10000 && dy == 0x10000) {
+          // Straight copy.
+          ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
+                   dst, dst_stride, clip_width, clip_height);
+          return;
+        }
+      }
+    }
+  }
+  if (dx == 0x10000 && (x & 0xffff) == 0) {
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical(src_height,
+                       clip_width, clip_height,
+                       src_stride, dst_stride, src, dst,
+                       x, y, dy, 4, filtering);
+    return;
+  }
+  if (filtering && dy < 65536) {  // Vertical step below 1.0 (16.16).
+    ScaleARGBBilinearUp(src_width, src_height,
+                        clip_width, clip_height,
+                        src_stride, dst_stride, src, dst,
+                        x, dx, y, dy, filtering);
+    return;
+  }
+  if (filtering) {
+    ScaleARGBBilinearDown(src_width, src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src, dst,
+                          x, dx, y, dy, filtering);
+    return;
+  }
+  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,  // Unfiltered fallback.
+                  src_stride, dst_stride, src, dst,
+                  x, dx, y, dy);
+}
+
+LIBYUV_API  // Scale ARGB into a clip rectangle; returns 0, or -1 on bad arguments.
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+                  int src_width, int src_height,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int dst_width, int dst_height,
+                  int clip_x, int clip_y, int clip_width, int clip_height,
+                  enum FilterMode filtering) {
+  const int args_ok =
+      src_argb && src_width != 0 && src_height != 0 &&
+      dst_argb && dst_width > 0 && dst_height > 0 &&
+      clip_x >= 0 && clip_y >= 0 &&
+      clip_width <= 32768 && clip_height <= 32768 &&
+      (clip_x + clip_width) <= dst_width &&
+      (clip_y + clip_height) <= dst_height;
+  if (!args_ok) return -1;
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+            dst_argb, dst_stride_argb, dst_width, dst_height,
+            clip_x, clip_y, clip_width, clip_height, filtering);
+  return 0;
+}
+
+// Scale a whole ARGB image (the clip is the full destination); 0 or -1.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+              int src_width, int src_height,
+              uint8* dst_argb, int dst_stride_argb,
+              int dst_width, int dst_height,
+              enum FilterMode filtering) {
+  const int args_ok =
+      src_argb && src_width != 0 && src_height != 0 &&
+      src_width <= 32768 && src_height <= 32768 &&
+      dst_argb && dst_width > 0 && dst_height > 0;
+  if (!args_ok) return -1;
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+            dst_argb, dst_stride_argb, dst_width, dst_height,
+            0, 0, dst_width, dst_height, filtering);
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/scale_common.cc b/libs/libvpx/third_party/libyuv/source/scale_common.cc
new file mode 100644
index 0000000000..1711f3d54c
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/scale_common.cc
@@ -0,0 +1,1137 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {  // Absolute value of v.
+  return (v < 0) ? -v : v;
+}
+
+// CPU agnostic row functions
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width) {
+  // 2:1 point sampling: keep the odd-indexed pixel of each source pair.
+  // Two outputs are produced per pass; src_stride is unused.
+  int col = 0;
+  while (col < dst_width - 1) {
+    *dst++ = src_ptr[1];
+    *dst++ = src_ptr[3];
+    src_ptr += 4;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) *dst = src_ptr[1];  // Odd trailing pixel.
+}
+
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width) {
+  // 16-bit 2:1 point sampling: keep the odd-indexed sample of each pair.
+  // Two outputs are produced per pass; src_stride is unused.
+  int col = 0;
+  while (col < dst_width - 1) {
+    *dst++ = src_ptr[1];
+    *dst++ = src_ptr[3];
+    src_ptr += 4;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) *dst = src_ptr[1];  // Odd trailing sample.
+}
+
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  // 2:1 horizontal average with rounding; src_stride is unused.
+  int col = 0;
+  while (col < dst_width - 1) {
+    *dst++ = (src_ptr[0] + src_ptr[1] + 1) >> 1;
+    *dst++ = (src_ptr[2] + src_ptr[3] + 1) >> 1;
+    src_ptr += 4;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) {
+    *dst = (src_ptr[0] + src_ptr[1] + 1) >> 1;  // Odd trailing pixel.
+  }
+}
+
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                              uint16* dst, int dst_width) {
+  // 16-bit 2:1 horizontal average with rounding; src_stride is unused.
+  int col = 0;
+  while (col < dst_width - 1) {
+    *dst++ = (src_ptr[0] + src_ptr[1] + 1) >> 1;
+    *dst++ = (src_ptr[2] + src_ptr[3] + 1) >> 1;
+    src_ptr += 4;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) {
+    *dst = (src_ptr[0] + src_ptr[1] + 1) >> 1;  // Odd trailing sample.
+  }
+}
+
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  // 2:1 box filter: each output is the rounded mean of a 2x2 block taken
+  // from this row (top) and the row src_stride bytes further on (bot).
+  const uint8* top = src_ptr;
+  const uint8* bot = src_ptr + src_stride;
+  int col = 0;
+  while (col < dst_width - 1) {
+    *dst++ = (top[0] + top[1] + bot[0] + bot[1] + 2) >> 2;
+    *dst++ = (top[2] + top[3] + bot[2] + bot[3] + 2) >> 2;
+    top += 4;
+    bot += 4;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) *dst = (top[0] + top[1] + bot[0] + bot[1] + 2) >> 2;
+}
+
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width) {
+  // 16-bit 2:1 box filter: each output is the rounded mean of a 2x2 block
+  // from this row (top) and the row src_stride elements further on (bot).
+  const uint16* top = src_ptr;
+  const uint16* bot = src_ptr + src_stride;
+  int col = 0;
+  while (col < dst_width - 1) {
+    *dst++ = (top[0] + top[1] + bot[0] + bot[1] + 2) >> 2;
+    *dst++ = (top[2] + top[3] + bot[2] + bot[3] + 2) >> 2;
+    top += 4;
+    bot += 4;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) *dst = (top[0] + top[1] + bot[0] + bot[1] + 2) >> 2;
+}
+
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width) {
+  // 4:1 point sampling: keep pixel 2 of every group of 4 source pixels.
+  // Two outputs are produced per pass; src_stride is unused.
+  int col = 0;
+  while (col < dst_width - 1) {
+    *dst++ = src_ptr[2];
+    *dst++ = src_ptr[6];
+    src_ptr += 8;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) *dst = src_ptr[2];  // Odd trailing pixel.
+}
+
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width) {
+  // 16-bit 4:1 point sampling: keep sample 2 of every group of 4.
+  // Two outputs are produced per pass; src_stride is unused.
+  int col = 0;
+  while (col < dst_width - 1) {
+    *dst++ = src_ptr[2];
+    *dst++ = src_ptr[6];
+    src_ptr += 8;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) *dst = src_ptr[2];  // Odd trailing sample.
+}
+
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  // 4:1 box filter: every output pixel is the rounded mean of a 4x4 source
+  // block, out = (sum of 16 samples + 8) >> 4.  The main loop emits two
+  // outputs per pass (source columns 0..3, then 4..7); an odd trailing
+  // output is handled after the loop.
+  const intptr_t stride = src_stride;
+  int col;
+  int r;
+  int c;
+  for (col = 0; col < dst_width - 1; col += 2) {
+    int sum = 8;  // Rounding bias for the >> 4.
+    // Left 4x4 block.
+    for (r = 0; r < 4; ++r) {
+      for (c = 0; c < 4; ++c) sum += src_ptr[stride * r + c];
+    }
+    dst[0] = sum >> 4;
+    sum = 8;
+    // Right 4x4 block.
+    for (r = 0; r < 4; ++r) {
+      for (c = 4; c < 8; ++c) sum += src_ptr[stride * r + c];
+    }
+    dst[1] = sum >> 4;
+    dst += 2;
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {
+    int sum = 8;
+    // Odd trailing output: one more 4x4 block.
+    for (r = 0; r < 4; ++r) {
+      for (c = 0; c < 4; ++c) sum += src_ptr[stride * r + c];
+    }
+    dst[0] = sum >> 4;
+  }
+}
+
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width) {
+  // 16-bit 4:1 box filter: every output is the rounded mean of a 4x4 source
+  // block, out = (sum of 16 samples + 8) >> 4.  The main loop emits two
+  // outputs per pass (source columns 0..3, then 4..7); an odd trailing
+  // output is handled after the loop.
+  const intptr_t stride = src_stride;
+  int col;
+  int r;
+  int c;
+  for (col = 0; col < dst_width - 1; col += 2) {
+    int sum = 8;  // Rounding bias for the >> 4.
+    // Left 4x4 block.
+    for (r = 0; r < 4; ++r) {
+      for (c = 0; c < 4; ++c) sum += src_ptr[stride * r + c];
+    }
+    dst[0] = sum >> 4;
+    sum = 8;
+    // Right 4x4 block.
+    for (r = 0; r < 4; ++r) {
+      for (c = 4; c < 8; ++c) sum += src_ptr[stride * r + c];
+    }
+    dst[1] = sum >> 4;
+    dst += 2;
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {
+    int sum = 8;
+    // Odd trailing output: one more 4x4 block.
+    for (r = 0; r < 4; ++r) {
+      for (c = 0; c < 4; ++c) sum += src_ptr[stride * r + c];
+    }
+    dst[0] = sum >> 4;
+  }
+}
+
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width) {
+  // 4:3 point sampling: from every group of 4 source pixels keep pixels
+  // 0, 1 and 3.  src_stride is unused.
+  int remaining = dst_width;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (; remaining > 0; remaining -= 3, dst += 3, src_ptr += 4) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[1];
+    dst[2] = src_ptr[3];
+  }
+}
+
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width) {
+  // 16-bit 4:3 point sampling: from every group of 4 source samples keep
+  // samples 0, 1 and 3.  src_stride is unused.
+  int remaining = dst_width;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (; remaining > 0; remaining -= 3, dst += 3, src_ptr += 4) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[1];
+    dst[2] = src_ptr[3];
+  }
+}
+
+// 4:3 downscale that also blends two rows 3:1 (this row : next row).
+// Each group of 4 source pixels yields 3 output pixels.
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width) {
+  const uint8* row0 = src_ptr;
+  const uint8* row1 = src_ptr + src_stride;
+  int remaining = dst_width;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (; remaining > 0; remaining -= 3, d += 3, row0 += 4, row1 += 4) {
+    // Horizontal 4 -> 3 taps (3:1, 1:1, 1:3), rounded to 8 bits per row.
+    const uint8 top0 = (row0[0] * 3 + row0[1] * 1 + 2) >> 2;
+    const uint8 top1 = (row0[1] * 1 + row0[2] * 1 + 1) >> 1;
+    const uint8 top2 = (row0[2] * 1 + row0[3] * 3 + 2) >> 2;
+    const uint8 bot0 = (row1[0] * 3 + row1[1] * 1 + 2) >> 2;
+    const uint8 bot1 = (row1[1] * 1 + row1[2] * 1 + 1) >> 1;
+    const uint8 bot2 = (row1[2] * 1 + row1[3] * 3 + 2) >> 2;
+    // Vertical 3:1 blend of the two filtered rows.
+    d[0] = (top0 * 3 + bot0 + 2) >> 2;
+    d[1] = (top1 * 3 + bot1 + 2) >> 2;
+    d[2] = (top2 * 3 + bot2 + 2) >> 2;
+  }
+}
+
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width) {
+  // 16-bit 4:3 downscale blending two rows 3:1 (this row : next row).
+  const uint16* row0 = src_ptr;
+  const uint16* row1 = src_ptr + src_stride;
+  int remaining = dst_width;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (; remaining > 0; remaining -= 3, d += 3, row0 += 4, row1 += 4) {
+    // Horizontal 4 -> 3 taps (3:1, 1:1, 1:3), rounded to 16 bits per row.
+    const uint16 top0 = (row0[0] * 3 + row0[1] * 1 + 2) >> 2;
+    const uint16 top1 = (row0[1] * 1 + row0[2] * 1 + 1) >> 1;
+    const uint16 top2 = (row0[2] * 1 + row0[3] * 3 + 2) >> 2;
+    const uint16 bot0 = (row1[0] * 3 + row1[1] * 1 + 2) >> 2;
+    const uint16 bot1 = (row1[1] * 1 + row1[2] * 1 + 1) >> 1;
+    const uint16 bot2 = (row1[2] * 1 + row1[3] * 3 + 2) >> 2;
+    // Vertical 3:1 blend of the two filtered rows.
+    d[0] = (top0 * 3 + bot0 + 2) >> 2;
+    d[1] = (top1 * 3 + bot1 + 2) >> 2;
+    d[2] = (top2 * 3 + bot2 + 2) >> 2;
+  }
+}
+
+// 4:3 downscale that also blends two rows 1:1 (this row : next row).
+// Each group of 4 source pixels yields 3 output pixels.
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width) {
+  const uint8* row0 = src_ptr;
+  const uint8* row1 = src_ptr + src_stride;
+  int remaining = dst_width;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (; remaining > 0; remaining -= 3, d += 3, row0 += 4, row1 += 4) {
+    // Horizontal 4 -> 3 taps (3:1, 1:1, 1:3), rounded to 8 bits per row.
+    const uint8 top0 = (row0[0] * 3 + row0[1] * 1 + 2) >> 2;
+    const uint8 top1 = (row0[1] * 1 + row0[2] * 1 + 1) >> 1;
+    const uint8 top2 = (row0[2] * 1 + row0[3] * 3 + 2) >> 2;
+    const uint8 bot0 = (row1[0] * 3 + row1[1] * 1 + 2) >> 2;
+    const uint8 bot1 = (row1[1] * 1 + row1[2] * 1 + 1) >> 1;
+    const uint8 bot2 = (row1[2] * 1 + row1[3] * 3 + 2) >> 2;
+    // Vertical 1:1 average of the two filtered rows.
+    d[0] = (top0 + bot0 + 1) >> 1;
+    d[1] = (top1 + bot1 + 1) >> 1;
+    d[2] = (top2 + bot2 + 1) >> 1;
+  }
+}
+
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width) {
+  // 16-bit 4:3 downscale blending two rows 1:1 (this row : next row).
+  const uint16* row0 = src_ptr;
+  const uint16* row1 = src_ptr + src_stride;
+  int remaining = dst_width;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (; remaining > 0; remaining -= 3, d += 3, row0 += 4, row1 += 4) {
+    // Horizontal 4 -> 3 taps (3:1, 1:1, 1:3), rounded to 16 bits per row.
+    const uint16 top0 = (row0[0] * 3 + row0[1] * 1 + 2) >> 2;
+    const uint16 top1 = (row0[1] * 1 + row0[2] * 1 + 1) >> 1;
+    const uint16 top2 = (row0[2] * 1 + row0[3] * 3 + 2) >> 2;
+    const uint16 bot0 = (row1[0] * 3 + row1[1] * 1 + 2) >> 2;
+    const uint16 bot1 = (row1[1] * 1 + row1[2] * 1 + 1) >> 1;
+    const uint16 bot2 = (row1[2] * 1 + row1[3] * 3 + 2) >> 2;
+    // Vertical 1:1 average of the two filtered rows.
+    d[0] = (top0 + bot0 + 1) >> 1;
+    d[1] = (top1 + bot1 + 1) >> 1;
+    d[2] = (top2 + bot2 + 1) >> 1;
+  }
+}
+
+// Point-sample one row: x is the 16.16 source position, dx the per-pixel step.
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                 int dst_width, int x, int dx) {
+  int col = 0;
+  while (col < dst_width - 1) {
+    *dst_ptr++ = src_ptr[x >> 16];
+    x += dx;
+    *dst_ptr++ = src_ptr[x >> 16];
+    x += dx;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) {
+    *dst_ptr = src_ptr[x >> 16];  // Odd trailing pixel.
+  }
+}
+
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                    int dst_width, int x, int dx) {
+  int col = 0;  // 16-bit point sampling; x is 16.16, dx the per-sample step.
+  while (col < dst_width - 1) {
+    *dst_ptr++ = src_ptr[x >> 16];
+    x += dx;
+    *dst_ptr++ = src_ptr[x >> 16];
+    x += dx;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) {
+    *dst_ptr = src_ptr[x >> 16];  // Odd trailing sample.
+  }
+}
+
+// 2x horizontal upscale by pixel duplication (point sampling).
+// x and dx are accepted for signature compatibility but never read.
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+                    int dst_width, int x, int dx) {
+  int col;
+  for (col = 0; col < dst_width - 1; col += 2) {
+    const uint8 pixel = *src_ptr++;
+    dst_ptr[0] = pixel;
+    dst_ptr[1] = pixel;
+    dst_ptr += 2;
+  }
+  if ((dst_width & 1) != 0) *dst_ptr = *src_ptr;  // Odd trailing pixel.
+}
+
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int x, int dx) {
+  // 16-bit 2x upscale by sample duplication; x and dx are never read.
+  int col;
+  for (col = 0; col < dst_width - 1; col += 2) {
+    const uint16 sample = *src_ptr++;
+    dst_ptr[0] = sample;
+    dst_ptr[1] = sample;
+    dst_ptr += 2;
+  }
+  if ((dst_width & 1) != 0) *dst_ptr = *src_ptr;  // Odd trailing sample.
+}
+
+// Linear blend a + f * (b - a), equal to (1-f)a + fb; f is a 16-bit fraction.
+#define BLENDER(a, b, f) \
+    (uint8)((int)(a) + (((int)(f) * ((int)(b) - (int)(a))) >> 16))
+
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx) {
+  // Linear column filter: for each output pixel, blend the two source
+  // pixels around the 16.16 position x by its fractional bits, then step
+  // x by dx.
+  const uint8* p;
+  int col = 0;
+  while (col < dst_width - 1) {
+    p = src_ptr + (x >> 16);
+    *dst_ptr++ = BLENDER(p[0], p[1], x & 0xffff);
+    x += dx;
+    // Second pixel of the pair.
+    p = src_ptr + (x >> 16);
+    *dst_ptr++ = BLENDER(p[0], p[1], x & 0xffff);
+    x += dx;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) {
+    // Odd trailing pixel.
+    p = src_ptr + (x >> 16);
+    *dst_ptr = BLENDER(p[0], p[1], x & 0xffff);
+  }
+}
+
+void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
+                         int dst_width, int x32, int dx) {
+  // Linear column filter with a 64-bit position: x32 is widened to int64
+  // before stepping.  Each output blends the two source pixels around x
+  // by the fractional bits of x.
+  int64 x = (int64)(x32);
+  const uint8* p;
+  int col = 0;
+  while (col < dst_width - 1) {
+    p = src_ptr + (x >> 16);
+    *dst_ptr++ = BLENDER(p[0], p[1], x & 0xffff);
+    x += dx;
+    // Second pixel of the pair.
+    p = src_ptr + (x >> 16);
+    *dst_ptr++ = BLENDER(p[0], p[1], x & 0xffff);
+    x += dx;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) {
+    // Odd trailing pixel.
+    p = src_ptr + (x >> 16);
+    *dst_ptr = BLENDER(p[0], p[1], x & 0xffff);
+  }
+}
+#undef BLENDER
+
+#define BLENDER(a, b, f) (uint16)((int)(a) + /* a + f * (b - a), f in 0..0xffff */ \
+    (int)(((int64)(f) * ((int)(b) - (int)(a))) >> 16))  // 64-bit product: f * (b - a) can exceed INT_MAX for 16-bit samples.
+
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                          int dst_width, int x, int dx) {
+  // 16-bit linear column filter: for each output sample, blend the two
+  // source samples around the 16.16 position x by its fractional bits,
+  // then step x by dx.
+  const uint16* p;
+  int col = 0;
+  while (col < dst_width - 1) {
+    p = src_ptr + (x >> 16);
+    *dst_ptr++ = BLENDER(p[0], p[1], x & 0xffff);
+    x += dx;
+    // Second sample of the pair.
+    p = src_ptr + (x >> 16);
+    *dst_ptr++ = BLENDER(p[0], p[1], x & 0xffff);
+    x += dx;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) {
+    // Odd trailing sample.
+    p = src_ptr + (x >> 16);
+    *dst_ptr = BLENDER(p[0], p[1], x & 0xffff);
+  }
+}
+
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                            int dst_width, int x32, int dx) {
+  // 16-bit linear column filter with a 64-bit position: x32 is widened to
+  // int64 before stepping.  Each output blends the two source samples
+  // around x by the fractional bits of x.
+  int64 x = (int64)(x32);
+  const uint16* p;
+  int col = 0;
+  while (col < dst_width - 1) {
+    p = src_ptr + (x >> 16);
+    *dst_ptr++ = BLENDER(p[0], p[1], x & 0xffff);
+    x += dx;
+    // Second sample of the pair.
+    p = src_ptr + (x >> 16);
+    *dst_ptr++ = BLENDER(p[0], p[1], x & 0xffff);
+    x += dx;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) {
+    // Odd trailing sample.
+    p = src_ptr + (x >> 16);
+    *dst_ptr = BLENDER(p[0], p[1], x & 0xffff);
+  }
+}
+#undef BLENDER
+
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width) {
+  // 8:3 point sampling: from every group of 8 source pixels keep pixels
+  // 0, 3 and 6.  src_stride is unused.
+  int remaining = dst_width;
+  assert(dst_width % 3 == 0);
+  for (; remaining > 0; remaining -= 3, dst += 3, src_ptr += 8) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[3];
+    dst[2] = src_ptr[6];
+  }
+}
+
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width) {
+  // 16-bit 8:3 point sampling: from every group of 8 source samples keep
+  // samples 0, 3 and 6.  src_stride is unused.
+  int remaining = dst_width;
+  assert(dst_width % 3 == 0);
+  for (; remaining > 0; remaining -= 3, dst += 3, src_ptr += 8) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[3];
+    dst[2] = src_ptr[6];
+  }
+}
+
+// 8x3 -> 3x1 box filter.
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
+  // Every 8 source columns over 3 rows become 3 outputs: two 3x3 averages
+  // and one 2x3 average.  The divide by 9 (or 6) is done as a multiply by
+  // 65536 / n followed by >> 16.
+  // r0..r2 are the three source rows for the current group.
+  const intptr_t stride = src_stride;
+  int remaining = dst_width;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (; remaining > 0; remaining -= 3, src_ptr += 8, dst_ptr += 3) {
+    const uint8* r0 = src_ptr;
+    const uint8* r1 = src_ptr + stride;
+    const uint8* r2 = src_ptr + stride * 2;
+    dst_ptr[0] = (r0[0] + r0[1] + r0[2] +
+                  r1[0] + r1[1] + r1[2] +
+                  r2[0] + r2[1] + r2[2]) * (65536 / 9) >> 16;
+    dst_ptr[1] = (r0[3] + r0[4] + r0[5] +
+                  r1[3] + r1[4] + r1[5] +
+                  r2[3] + r2[4] + r2[5]) * (65536 / 9) >> 16;
+    dst_ptr[2] = (r0[6] + r0[7] +
+                  r1[6] + r1[7] +
+                  r2[6] + r2[7]) * (65536 / 6) >> 16;
+  }
+}
+
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width) {
+  // 16-bit 8x3 -> 3x1 box filter: two 3x3 averages and one 2x3 average per
+  // group of 8 source columns, dividing via multiply by 65536 / n and >> 16.
+  // The reciprocals are unsigned: 9 * 65535 * 7281 exceeds INT_MAX (signed
+  // overflow is undefined) but still fits in 32 unsigned bits.
+  const intptr_t stride = src_stride;
+  int remaining = dst_width;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (; remaining > 0; remaining -= 3, src_ptr += 8, dst_ptr += 3) {
+    const uint16* r0 = src_ptr;
+    const uint16* r1 = src_ptr + stride;
+    const uint16* r2 = src_ptr + stride * 2;
+    dst_ptr[0] = (r0[0] + r0[1] + r0[2] +
+                  r1[0] + r1[1] + r1[2] +
+                  r2[0] + r2[1] + r2[2]) * (65536u / 9u) >> 16;
+    dst_ptr[1] = (r0[3] + r0[4] + r0[5] +
+                  r1[3] + r1[4] + r1[5] +
+                  r2[3] + r2[4] + r2[5]) * (65536u / 9u) >> 16;
+    dst_ptr[2] = (r0[6] + r0[7] +
+                  r1[6] + r1[7] +
+                  r2[6] + r2[7]) * (65536u / 6u) >> 16;
+  }
+}
+
+// 8x2 -> 3x1 box filter.
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
+  // Every 8 source columns over 2 rows become 3 outputs: two 3x2 averages
+  // and one 2x2 average.  The divide by 6 (or 4) is done as a multiply by
+  // 65536 / n followed by >> 16.
+  const intptr_t stride = src_stride;
+  int remaining = dst_width;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (; remaining > 0; remaining -= 3, src_ptr += 8, dst_ptr += 3) {
+    const uint8* r0 = src_ptr;
+    const uint8* r1 = src_ptr + stride;
+    dst_ptr[0] = (r0[0] + r0[1] + r0[2] +
+                  r1[0] + r1[1] + r1[2]) * (65536 / 6) >> 16;
+    dst_ptr[1] = (r0[3] + r0[4] + r0[5] +
+                  r1[3] + r1[4] + r1[5]) * (65536 / 6) >> 16;
+    dst_ptr[2] = (r0[6] + r0[7] +
+                  r1[6] + r1[7]) * (65536 / 4) >> 16;
+  }
+}
+
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width) {
+  // 16-bit 8x2 -> 3x1 box filter: two 3x2 averages and one 2x2 average per
+  // group of 8 columns.  Reciprocals are unsigned because 6 * 65535 * 10922
+  // exceeds INT_MAX (undefined when signed) yet fits in 32 unsigned bits.
+  const intptr_t stride = src_stride;
+  int remaining = dst_width;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (; remaining > 0; remaining -= 3, src_ptr += 8, dst_ptr += 3) {
+    const uint16* r0 = src_ptr;
+    const uint16* r1 = src_ptr + stride;
+    dst_ptr[0] = (r0[0] + r0[1] + r0[2] +
+                  r1[0] + r1[1] + r1[2]) * (65536u / 6u) >> 16;
+    dst_ptr[1] = (r0[3] + r0[4] + r0[5] +
+                  r1[3] + r1[4] + r1[5]) * (65536u / 6u) >> 16;
+    dst_ptr[2] = (r0[6] + r0[7] +
+                  r1[6] + r1[7]) * (65536u / 4u) >> 16;
+  }
+}
+
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  // Add one source row, column by column, into the 16-bit sums in dst_ptr.
+  int col = 0;
+  assert(src_width > 0);
+  while (col < src_width - 1) {
+    *dst_ptr++ += *src_ptr++;
+    *dst_ptr++ += *src_ptr++;
+    col += 2;
+  }
+  if ((src_width & 1) != 0) {
+    *dst_ptr += *src_ptr;  // Odd trailing column.
+  }
+}
+
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
+  // Add one 16-bit source row, column by column, into the 32-bit sums.
+  int col = 0;
+  assert(src_width > 0);
+  while (col < src_width - 1) {
+    *dst_ptr++ += *src_ptr++;
+    *dst_ptr++ += *src_ptr++;
+    col += 2;
+  }
+  if ((src_width & 1) != 0) {
+    *dst_ptr += *src_ptr;  // Odd trailing column.
+  }
+}
+
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+                         ptrdiff_t src_stride,
+                         uint8* dst_argb, int dst_width) {
+  // 2:1 point sampling on 32-bit pixels: keep the second of each pair.
+  const uint32* in = (const uint32*)(src_argb);
+  uint32* out = (uint32*)(dst_argb);
+  int col = 0;
+  while (col < dst_width - 1) {
+    *out++ = in[1];
+    *out++ = in[3];
+    in += 4;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) {
+    *out = in[1];  // Odd trailing pixel.
+  }
+}
+
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width) {
+  // Average each horizontal pair of ARGB pixels channel by channel, with
+  // rounding.  src_stride is unused.
+  int px;
+  int ch;
+  for (px = 0; px < dst_width; ++px, src_argb += 8, dst_argb += 4) {
+    for (ch = 0; ch < 4; ++ch) {
+      dst_argb[ch] = (src_argb[ch] + src_argb[ch + 4] + 1) >> 1;
+    }
+  }
+}
+
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) {
+  // 2x2 box filter on ARGB.  Each output channel is the rounded mean of
+  // that channel in two adjacent pixels on this row and in the same two
+  // pixels on the row src_stride bytes further on.  Pixels are 4 bytes,
+  // so the horizontal neighbour of channel ch sits at ch + 4.
+  int px;
+  int ch;
+  for (px = 0; px < dst_width; ++px, src_argb += 8, dst_argb += 4) {
+    const uint8* below = src_argb + src_stride;
+    for (ch = 0; ch < 4; ++ch) {
+      dst_argb[ch] = (src_argb[ch] + src_argb[ch + 4] +
+                      below[ch] + below[ch + 4] + 2) >> 2;
+    }
+  }
+}
+
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            int src_stepx,
+                            uint8* dst_argb, int dst_width) {
+  // Point-sample every src_stepx-th 32-bit pixel; src_stride is unused.
+  const uint32* in = (const uint32*)(src_argb);
+  uint32* out = (uint32*)(dst_argb);
+  int col = 0;
+  while (col < dst_width - 1) {
+    *out++ = in[0];
+    *out++ = in[src_stepx];
+    in += src_stepx * 2;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) {
+    *out = in[0];  // Odd trailing pixel.
+  }
+}
+
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width) {
+  // 2x2 box filter sampled every src_stepx pixels.  Each output channel is
+  // the rounded mean of that channel in two adjacent pixels on this row
+  // and in the same two pixels on the row src_stride bytes further on.
+  // The source then advances by src_stepx pixels (4 bytes each).
+  int px;
+  int ch;
+  for (px = 0; px < dst_width; ++px, src_argb += src_stepx * 4, dst_argb += 4) {
+    const uint8* below = src_argb + src_stride;
+    for (ch = 0; ch < 4; ++ch) {
+      dst_argb[ch] = (src_argb[ch] + src_argb[ch + 4] +
+                      below[ch] + below[ch + 4] + 2) >> 2;
+    }
+  }
+}
+
+// Point-sample one row of 32-bit pixels: x is the 16.16 position, dx the step.
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+                     int dst_width, int x, int dx) {
+  const uint32* in = (const uint32*)(src_argb);
+  uint32* out = (uint32*)(dst_argb);
+  int col = 0;
+  while (col < dst_width - 1) {
+    *out++ = in[x >> 16];
+    x += dx;
+    *out++ = in[x >> 16];
+    x += dx;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) {
+    *out = in[x >> 16];  // Odd trailing pixel.
+  }
+}
+
+void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
+                       int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);  // Position is widened to 64 bits before stepping.
+  const uint32* in = (const uint32*)(src_argb);
+  uint32* out = (uint32*)(dst_argb);
+  int col = 0;
+  while (col < dst_width - 1) {
+    *out++ = in[x >> 16];
+    x += dx;
+    *out++ = in[x >> 16];
+    x += dx;
+    col += 2;
+  }
+  if ((dst_width & 1) != 0) {
+    *out = in[x >> 16];  // Odd trailing pixel.
+  }
+}
+
+// 2x horizontal upscale of 32-bit pixels by duplication (point sampling).
+// x and dx are accepted for signature compatibility but never read.
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  const uint32* in = (const uint32*)(src_argb);
+  uint32* out = (uint32*)(dst_argb);
+  int col;
+  for (col = 0; col < dst_width - 1; col += 2) {
+    const uint32 pixel = *in++;
+    out[0] = pixel;
+    out[1] = pixel;
+    out += 2;
+  }
+  if ((dst_width & 1) != 0) *out = *in;  // Odd trailing pixel.
+}
+
+// Mimics SSSE3 blender: per channel (a * (0x7f ^ f) + b * f) >> 7, f is 7 bits.
+#define BLENDER1(a, b, f) ((((a) * (0x7f ^ (f))) + ((b) * (f))) >> 7)
+#define BLENDERC(a, b, f, s) (uint32)( \
+    BLENDER1(((a) >> (s)) & 255, ((b) >> (s)) & 255, (f)) << (s))
+#define BLENDER(a, b, f) \
+    (BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
+     BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0))
+
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {
+  // Linear filter along x: blends source pixels (x >> 16) and (x >> 16) + 1,
+  // weighted by the top 7 fraction bits of the 16.16 position x.
+  // BLENDER does the per-channel blend.
+  const uint32* in = (const uint32*)(src_argb);
+  uint32* out = (uint32*)(dst_argb);
+  int i = 0;
+  while (i < dst_width - 1) {  // Two output pixels per pass.
+    int xf = (x >> 9) & 0x7f;
+    uint32 left = in[x >> 16];
+    uint32 right = in[(x >> 16) + 1];
+    *out++ = BLENDER(left, right, xf);
+    x += dx;
+    xf = (x >> 9) & 0x7f;
+    left = in[x >> 16];
+    right = in[(x >> 16) + 1];
+    *out++ = BLENDER(left, right, xf);
+    x += dx;
+    i += 2;
+  }
+  if (dst_width & 1) {  // Odd width: one pixel is still outstanding.
+    const int xf = (x >> 9) & 0x7f;
+    const uint32 left = in[x >> 16];
+    const uint32 right = in[(x >> 16) + 1];
+    *out = BLENDER(left, right, xf);
+  }
+}
+
+void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
+                             int dst_width, int x32, int dx) {
+  // Linear filter along x with a 64-bit position: blends source pixels
+  // (pos >> 16) and (pos >> 16) + 1, weighted by the top 7 fraction bits of
+  // the 16.16 position. BLENDER does the per-channel blend.
+  const uint32* in = (const uint32*)(src_argb);
+  uint32* out = (uint32*)(dst_argb);
+  int64 pos = (int64)(x32);
+  int i = 0;
+  while (i < dst_width - 1) {  // Two output pixels per pass.
+    int xf = (pos >> 9) & 0x7f;
+    uint32 left = in[pos >> 16];
+    uint32 right = in[(pos >> 16) + 1];
+    *out++ = BLENDER(left, right, xf);
+    pos += dx;
+    xf = (pos >> 9) & 0x7f;
+    left = in[pos >> 16];
+    right = in[(pos >> 16) + 1];
+    *out++ = BLENDER(left, right, xf);
+    pos += dx;
+    i += 2;
+  }
+  if (dst_width & 1) {  // Odd width: one pixel is still outstanding.
+    const int xf = (pos >> 9) & 0x7f;
+    const uint32 left = in[pos >> 16];
+    const uint32 right = in[(pos >> 16) + 1];
+    *out = BLENDER(left, right, xf);
+  }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
+// Scale plane vertically with bilinear interpolation; x, y, dy are 16.16 fixed.
+void ScalePlaneVertical(int src_height,
+                        int dst_width, int dst_height,
+                        int src_stride, int dst_stride,
+                        const uint8* src_argb, uint8* dst_argb,
+                        int x, int y, int dy,
+                        int bpp, enum FilterMode filtering) {
+  // TODO(fbarchard): Allow higher bpp.
+  int dst_width_bytes = dst_width * bpp;  // Row length passed to InterpolateRow.
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;  // Default; replaced below when a CPU flag matches.
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(bpp >= 1 && bpp <= 4);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * bpp;  // Start at the integer column of x.
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {  // Overrides the SSE2 choice when set.
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Overrides the earlier x86 choices.
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width_bytes, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+    if (IS_ALIGNED(dst_width_bytes, 4)) {
+      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {  // One InterpolateRow call per output row.
+    int yi;
+    int yf;
+    if (y > max_y) {  // Clamp the 16.16 row position to max_y.
+      y = max_y;
+    }
+    yi = y >> 16;  // Integer source row.
+    yf = filtering ? ((y >> 8) & 255) : 0;  // 8-bit fraction; 0 if filtering is 0.
+    InterpolateRow(dst_argb, src_argb + yi * src_stride,
+                   src_stride, dst_width_bytes, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+void ScalePlaneVertical_16(int src_height,  // 16-bit sample variant.
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint16* src_argb, uint16* dst_argb,
+                           int x, int y, int dy,
+                           int wpp, enum FilterMode filtering) {
+  // TODO(fbarchard): Allow higher wpp.
+  int dst_width_words = dst_width * wpp;  // Row length in uint16 words.
+  void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;  // Default; replaced below when a CPU flag matches.
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(wpp >= 1 && wpp <= 2);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * wpp;  // Start at the integer column of x (16.16).
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    // Alignment is tested on the word count, the same width that is passed to
+    // InterpolateRow; this is at least as strict as a byte-count test.
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width_words, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+    if (IS_ALIGNED(dst_width_words, 4)) {
+      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {  // One InterpolateRow call per output row.
+    int yi;
+    int yf;
+    if (y > max_y) {  // Clamp the 16.16 row position to max_y.
+      y = max_y;
+    }
+    yi = y >> 16;  // Integer source row.
+    yf = filtering ? ((y >> 8) & 255) : 0;  // 8-bit fraction; 0 if filtering is 0.
+    InterpolateRow(dst_argb, src_argb + yi * src_stride,
+                   src_stride, dst_width_words, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
+// Simplify the filtering based on scale factors.
+// Returns the (possibly reduced) filter mode. Reductions, in this order:
+//   Box      -> Bilinear if both axes scale to 0.5 or larger.
+//   Bilinear -> Linear   if the source is 1 row high, or the height is
+//                        unchanged or exactly 1/3 of the source.
+//   Bilinear -> None     if the source is 1 pixel wide.
+//   Linear   -> None     if the source is 1 pixel wide, or the width is
+//                        unchanged or exactly 1/3 of the source.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  enum FilterMode filtering) {
+  // Negative source dimensions are compared by magnitude.
+  const int abs_width = (src_width < 0) ? -src_width : src_width;
+  const int abs_height = (src_height < 0) ? -src_height : src_height;
+  // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
+  if (filtering == kFilterBox &&
+      dst_width * 2 >= abs_width && dst_height * 2 >= abs_height) {
+    filtering = kFilterBilinear;
+  }
+  if (filtering == kFilterBilinear) {
+    // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
+    if (abs_height == 1 || dst_height == abs_height ||
+        dst_height * 3 == abs_height) {
+      filtering = kFilterLinear;
+    }
+    // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to
+    // avoid reading 2 pixels horizontally that causes memory exception.
+    if (abs_width == 1) {
+      filtering = kFilterNone;
+    }
+  }
+  if (filtering == kFilterLinear) {
+    // TODO(fbarchard): Detect any odd scale factor and reduce to None.
+    if (abs_width == 1 || dst_width == abs_width ||
+        dst_width * 3 == abs_width) {
+      filtering = kFilterNone;
+    }
+  }
+  return filtering;
+}
+
+int FixedDiv_C(int num, int div) {  // Returns num / div as 16.16 fixed point.
+  const int64 scaled_num = (int64)(num) << 16;  // Widen before shifting.
+  return (int)(scaled_num / div);
+}
+
+// Returns ((num << 16) - 0x00010001) / (div - 1) as an int; div must not be 1.
+int FixedDiv1_C(int num, int div) {
+  const int64 scaled_num = ((int64)(num) << 16) - 0x00010001;
+  return (int)(scaled_num / (div - 1));
+}
+
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
+
+// Compute slope values for stepping: 16.16 starts (*x, *y) and steps (*dx, *dy).
+void ScaleSlope(int src_width, int src_height,
+                int dst_width, int dst_height,
+                enum FilterMode filtering,
+                int* x, int* y, int* dx, int* dy) {
+  assert(x != NULL);
+  assert(y != NULL);
+  assert(dx != NULL);
+  assert(dy != NULL);
+  assert(src_width != 0);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  // Check for 1 pixel and avoid FixedDiv overflow.
+  if (dst_width == 1 && src_width >= 32768) {
+    dst_width = src_width;  // Makes the horizontal ratio below exactly 1.
+  }
+  if (dst_height == 1 && src_height >= 32768) {
+    dst_height = src_height;  // Makes the vertical ratio below exactly 1.
+  }
+  if (filtering == kFilterBox) {
+    // Box filter: step is the plain src / dst ratio and sampling starts at 0.
+    *dx = FixedDiv(Abs(src_width), dst_width);
+    *dy = FixedDiv(src_height, dst_height);
+    *x = 0;
+    *y = 0;
+  } else if (filtering == kFilterBilinear) {
+    // Scale step for bilinear sampling renders last pixel once for upsample.
+    if (dst_width <= Abs(src_width)) {
+      *dx = FixedDiv(Abs(src_width), dst_width);
+      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
+    } else if (dst_width > 1) {  // Upsample; FixedDiv1 divides by dst_width - 1.
+      *dx = FixedDiv1(Abs(src_width), dst_width);
+      *x = 0;
+    }
+    if (dst_height <= src_height) {
+      *dy = FixedDiv(src_height,  dst_height);
+      *y = CENTERSTART(*dy, -32768);  // Subtract 0.5 (32768) to center filter.
+    } else if (dst_height > 1) {  // Upsample; FixedDiv1 divides by dst_height - 1.
+      *dy = FixedDiv1(src_height, dst_height);
+      *y = 0;
+    }
+  } else if (filtering == kFilterLinear) {
+    // Horizontal as in the bilinear case; vertical uses the plain ratio.
+    if (dst_width <= Abs(src_width)) {
+      *dx = FixedDiv(Abs(src_width), dst_width);
+      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
+    } else if (dst_width > 1) {
+      *dx = FixedDiv1(Abs(src_width), dst_width);
+      *x = 0;
+    }
+    *dy = FixedDiv(src_height, dst_height);
+    *y = *dy >> 1;  // Start half a step into the source.
+  } else {
+    // Scale step for point sampling duplicates all pixels equally.
+    *dx = FixedDiv(Abs(src_width), dst_width);
+    *dy = FixedDiv(src_height, dst_height);
+    *x = CENTERSTART(*dx, 0);  // Half a step in, sign-aware.
+    *y = CENTERSTART(*dy, 0);
+  }
+  // Negative src_width means horizontally mirror.
+  if (src_width < 0) {
+    *x += (dst_width - 1) * *dx;  // Move the start to the last output column.
+    *dx = -*dx;  // Step backwards from there.
+    // src_width = -src_width;   // Caller must do this.
+  }
+}
+#undef CENTERSTART
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/scale_gcc.cc b/libs/libvpx/third_party/libyuv/source/scale_gcc.cc
new file mode 100644
index 0000000000..8a6ac54592
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/scale_gcc.cc
@@ -0,0 +1,1089 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf2 =
+  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
+static uvec8 kShuf11 =
+  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf21 =
+  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Rounding constant (2 per 16-bit lane) added before the >> 2 in the 3/4 box filters.
+static vec16 kRound34 =
+  { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (  // Point-sampled 1/2 width; src_stride is not used here.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 32 source bytes per pass.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrlw     $0x8,%%xmm0                     \n"  // Keep high byte of each word.
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 words -> 16 bytes.
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // 16 output bytes per pass.
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"  // Loop while width remains.
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1"
+  );
+}
+
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  asm volatile (  // 1/2 width: rounded average of each horizontal byte pair.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word.
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 32 source bytes per pass.
+    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"  // High byte of each word.
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "pand      %%xmm5,%%xmm2                   \n"  // Low byte of each word.
+    "pand      %%xmm5,%%xmm3                   \n"
+    "pavgw     %%xmm2,%%xmm0                   \n"  // Rounded pair average.
+    "pavgw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // 16 output bytes per pass.
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  asm volatile (  // 1/2 width from two rows: row average, then pair average.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word.
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 32 bytes of the first row.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"  // Average with the next row.
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"  // High byte of each word.
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "pand      %%xmm5,%%xmm2                   \n"  // Low byte of each word.
+    "pand      %%xmm5,%%xmm3                   \n"
+    "pavgw     %%xmm2,%%xmm0                   \n"  // Rounded pair average.
+    "pavgw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // 16 output bytes per pass.
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (  // Point-sampled 1/4 width; src_stride is not used here.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrld     $0x18,%%xmm5                    \n"
+    "pslld     $0x10,%%xmm5                    \n"  // xmm5 = 0x00ff0000 per dword.
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 32 source bytes per pass.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pand      %%xmm5,%%xmm0                   \n"  // Keep byte 2 of each dword.
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // 8 output bytes per pass.
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  intptr_t stridex3 = 0;  // Filled in by the first lea below.
+  asm volatile (  // 1/4 width from four rows, built from repeated averages.
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psrlw     $0x8,%%xmm7                     \n"  // xmm7 = 0x00ff per word.
+    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"  // presumably %3 = 3 * %4.
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 32 bytes of the first row.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
+    "pavgb     %%xmm2,%%xmm0                   \n"  // Average of rows 0 and 1.
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
+    MEMOPREG(movdqu,0x00,0,3,1,xmm4)           //  movdqu  (%0,%3,1),%%xmm4
+    MEMOPREG(movdqu,0x10,0,3,1,xmm5)           //  movdqu  0x10(%0,%3,1),%%xmm5
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm4,%%xmm2                   \n"  // Average of rows 2 and 3.
+    "pavgb     %%xmm2,%%xmm0                   \n"  // Average of the two averages.
+    "pavgb     %%xmm5,%%xmm3                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "pand      %%xmm7,%%xmm2                   \n"
+    "pand      %%xmm7,%%xmm3                   \n"
+    "pavgw     %%xmm2,%%xmm0                   \n"  // First horizontal pair average.
+    "pavgw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "pand      %%xmm7,%%xmm2                   \n"
+    "pavgw     %%xmm2,%%xmm0                   \n"  // Second horizontal pair average.
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // 8 output bytes per pass.
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width),   // %2
+    "+r"(stridex3)     // %3
+  : "r"((intptr_t)(src_stride))    // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
+  );
+}
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  asm volatile (  // Loads the three shuffle tables used by the loop below.
+    "movdqa    %0,%%xmm3                       \n"
+    "movdqa    %1,%%xmm4                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :  // NOTE(review): xmm3-xmm5 are written but no clobbers are declared here.
+  : "m"(kShuf0),  // %0
+    "m"(kShuf1),  // %1
+    "m"(kShuf2)   // %2
+  );
+  asm volatile (  // Point-sampled 3/4 width; relies on xmm3-xmm5 set above.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 32 source bytes per pass.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "palignr   $0x8,%%xmm0,%%xmm1              \n"  // xmm1 = source bytes 8..23.
+    "pshufb    %%xmm3,%%xmm0                   \n"
+    "pshufb    %%xmm4,%%xmm1                   \n"
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // 3 x 8 = 24 output bytes.
+    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
+    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x18,1) ",%1           \n"
+    "sub       $0x18,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (  // Loads shuffle tables; the loop below relies on xmm2-xmm4.
+    "movdqa    %0,%%xmm2                       \n"  // kShuf01
+    "movdqa    %1,%%xmm3                       \n"  // kShuf11
+    "movdqa    %2,%%xmm4                       \n"  // kShuf21
+  :  // NOTE(review): no clobbers are declared for the registers written here.
+  : "m"(kShuf01),  // %0
+    "m"(kShuf11),  // %1
+    "m"(kShuf21)   // %2
+  );
+  asm volatile (  // Loads weights and rounding into xmm5, xmm0 and xmm1.
+    "movdqa    %0,%%xmm5                       \n"  // kMadd01
+    "movdqa    %1,%%xmm0                       \n"  // kMadd11
+    "movdqa    %2,%%xmm1                       \n"  // kRound34
+  :
+  : "m"(kMadd01),  // %0
+    "m"(kMadd11),  // %1
+    "m"(kRound34)  // %2
+  );
+  asm volatile (  // 3/4 width from the equal-weight average of two rows.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"  // Average the two rows.
+    "pshufb    %%xmm2,%%xmm6                   \n"  // Pair up neighbouring bytes.
+    "pmaddubsw %%xmm5,%%xmm6                   \n"  // Weighted sum of each pair.
+    "paddsw    %%xmm1,%%xmm6                   \n"  // Add rounding.
+    "psrlw     $0x2,%%xmm6                     \n"  // Divide by 4.
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS(1) "         \n"  // Output bytes 0..7.
+    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
+    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm3,%%xmm6                   \n"
+    "pmaddubsw %%xmm0,%%xmm6                   \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"  // Output bytes 8..15.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // 32 source bytes per pass.
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm4,%%xmm6                   \n"
+    "pmaddubsw %4,%%xmm6                       \n"  // kMadd21 read from memory.
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"  // Output bytes 16..23.
+    "lea       " MEMLEA(0x18,1) ",%1           \n"
+    "sub       $0x18,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  : "r"((intptr_t)(src_stride)),  // %3
+    "m"(kMadd21)     // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (  // Loads shuffle tables; the loop below relies on xmm2-xmm4.
+    "movdqa    %0,%%xmm2                       \n"  // kShuf01
+    "movdqa    %1,%%xmm3                       \n"  // kShuf11
+    "movdqa    %2,%%xmm4                       \n"  // kShuf21
+  :  // NOTE(review): no clobbers are declared for the registers written here.
+  : "m"(kShuf01),  // %0
+    "m"(kShuf11),  // %1
+    "m"(kShuf21)   // %2
+  );
+  asm volatile (  // Loads weights and rounding into xmm5, xmm0 and xmm1.
+    "movdqa    %0,%%xmm5                       \n"  // kMadd01
+    "movdqa    %1,%%xmm0                       \n"  // kMadd11
+    "movdqa    %2,%%xmm1                       \n"  // kRound34
+  :
+  : "m"(kMadd01),  // %0
+    "m"(kMadd11),  // %1
+    "m"(kRound34)  // %2
+  );
+
+  asm volatile (  // 3/4 width from two rows, weighting the first row more.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
+    "pavgb     %%xmm6,%%xmm7                   \n"  // xmm7 = avg(row0, row1).
+    "pavgb     %%xmm7,%%xmm6                   \n"  // xmm6 = avg(row0, xmm7).
+    "pshufb    %%xmm2,%%xmm6                   \n"  // Pair up neighbouring bytes.
+    "pmaddubsw %%xmm5,%%xmm6                   \n"  // Weighted sum of each pair.
+    "paddsw    %%xmm1,%%xmm6                   \n"  // Add rounding.
+    "psrlw     $0x2,%%xmm6                     \n"  // Divide by 4.
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS(1) "         \n"  // Output bytes 0..7.
+    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
+    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
+    "pavgb     %%xmm6,%%xmm7                   \n"
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm3,%%xmm6                   \n"
+    "pmaddubsw %%xmm0,%%xmm6                   \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"  // Output bytes 8..15.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // 32 source bytes per pass.
+    "pavgb     %%xmm6,%%xmm7                   \n"
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm4,%%xmm6                   \n"
+    "pmaddubsw %4,%%xmm6                       \n"  // kMadd21 read from memory.
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"  // Output bytes 16..23.
+    "lea       " MEMLEA(0x18,1) ",%1           \n"
+    "sub       $0x18,%2                        \n"
+    "jg        1b                              \n"
+    : "+r"(src_ptr),   // %0
+      "+r"(dst_ptr),   // %1
+      "+r"(dst_width)  // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "m"(kMadd21)     // %4
+    : "memory", "cc", NACL_R14
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  asm volatile (  // Point-sampled 3/8 width; src_stride is not used here.
+    "movdqa    %3,%%xmm4                       \n"  // kShuf38a
+    "movdqa    %4,%%xmm5                       \n"  // kShuf38b
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 32 source bytes per pass.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"  // 6 picks into bytes 0..5.
+    "pshufb    %%xmm5,%%xmm1                   \n"  // 6 picks into bytes 6..11.
+    "paddusb   %%xmm1,%%xmm0                   \n"  // Merge; other lanes are 0.
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // Output bytes 0..7.
+    "movhlps   %%xmm0,%%xmm1                   \n"
+    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"  // Output bytes 8..11.
+    "lea       " MEMLEA(0xc,1) ",%1            \n"
+    "sub       $0xc,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  : "m"(kShuf38a),   // %3
+    "m"(kShuf38b)    // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+  );
+}
+
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (  // Loads shuffles and scale; the loop relies on xmm2-xmm5.
+    "movdqa    %0,%%xmm2                       \n"  // kShufAb0
+    "movdqa    %1,%%xmm3                       \n"  // kShufAb1
+    "movdqa    %2,%%xmm4                       \n"  // kShufAb2
+    "movdqa    %3,%%xmm5                       \n"  // kScaleAb2
+  :  // NOTE(review): no clobbers are declared for the registers written here.
+  : "m"(kShufAb0),   // %0
+    "m"(kShufAb1),   // %1
+    "m"(kShufAb2),   // %2
+    "m"(kScaleAb2)   // %3
+  );
+  asm volatile (  // 3/8 width from two rows: sums groups of bytes, then scales.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // 16 source bytes per pass.
+    "pavgb     %%xmm1,%%xmm0                   \n"  // Average the two rows.
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pshufb    %%xmm2,%%xmm1                   \n"  // First byte of each group.
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "pshufb    %%xmm3,%%xmm6                   \n"  // Second byte of each group.
+    "paddusw   %%xmm6,%%xmm1                   \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"  // Third byte where present.
+    "paddusw   %%xmm0,%%xmm1                   \n"
+    "pmulhuw   %%xmm5,%%xmm1                   \n"  // Scale sums by kScaleAb2.
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movd      %%xmm1," MEMACCESS(1) "         \n"
+    "psrlq     $0x10,%%xmm1                    \n"
+    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
+    "lea       " MEMLEA(0x6,1) ",%1            \n"  // 6 output bytes per pass.
+    "sub       $0x6,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width)    // %2
+  : "r"((intptr_t)(src_stride))  // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (  // Preload the loop constants into xmm2-xmm5.
+    "movdqa    %0,%%xmm2                       \n"  // xmm2 = kShufAc
+    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kShufAc3
+    "movdqa    %2,%%xmm4                       \n"  // xmm4 = kScaleAc33
+    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0 for unpacking
+  :
+  : "m"(kShufAc),    // %0
+    "m"(kShufAc3),   // %1
+    "m"(kScaleAc33)  // %2
+  );  // NOTE(review): no clobber list, yet the loop below reads xmm2-xmm5.
+  asm volatile (  // Each pass: 16 bytes from each of 3 rows -> 6 dst bytes.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 bytes of first row
+    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
+    "movhlps   %%xmm0,%%xmm1                   \n"  // xmm1 low = row 0 high 8
+    "movhlps   %%xmm6,%%xmm7                   \n"  // xmm7 low = row 1 high 8
+    "punpcklbw %%xmm5,%%xmm0                   \n"  // widen bytes to words
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm6                   \n"
+    "punpcklbw %%xmm5,%%xmm7                   \n"
+    "paddusw   %%xmm6,%%xmm0                   \n"  // row 0 + row 1 (low half)
+    "paddusw   %%xmm7,%%xmm1                   \n"  // row 0 + row 1 (high half)
+    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
+    "movhlps   %%xmm6,%%xmm7                   \n"
+    "punpcklbw %%xmm5,%%xmm6                   \n"
+    "punpcklbw %%xmm5,%%xmm7                   \n"
+    "paddusw   %%xmm6,%%xmm0                   \n"  // add third row (low half)
+    "paddusw   %%xmm7,%%xmm1                   \n"  // add third row (high half)
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "psrldq    $0x2,%%xmm0                     \n"  // shift down by one word
+    "paddusw   %%xmm0,%%xmm6                   \n"
+    "psrldq    $0x2,%%xmm0                     \n"
+    "paddusw   %%xmm0,%%xmm6                   \n"  // sums of 3 adjacent words
+    "pshufb    %%xmm2,%%xmm6                   \n"  // shuffle by kShufAc
+    "movdqa    %%xmm1,%%xmm7                   \n"
+    "psrldq    $0x2,%%xmm1                     \n"
+    "paddusw   %%xmm1,%%xmm7                   \n"
+    "psrldq    $0x2,%%xmm1                     \n"
+    "paddusw   %%xmm1,%%xmm7                   \n"  // same for the high half
+    "pshufb    %%xmm3,%%xmm7                   \n"  // shuffle by kShufAc3
+    "paddusw   %%xmm7,%%xmm6                   \n"  // merge both halves
+    "pmulhuw   %%xmm4,%%xmm6                   \n"  // high word of sum*kScaleAc33
+    "packuswb  %%xmm6,%%xmm6                   \n"  // words -> saturated bytes
+    "movd      %%xmm6," MEMACCESS(1) "         \n"  // store 4 bytes at dst
+    "psrlq     $0x10,%%xmm6                    \n"  // shift each qword right 16
+    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"  // store 4 bytes at dst + 2
+    "lea       " MEMLEA(0x6,1) ",%1            \n"  // dst_ptr += 6
+    "sub       $0x6,%2                         \n"  // dst_width -= 6
+    "jg        1b                              \n"  // loop while dst_width > 0
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+// Adds src_height rows of 16 bytes into 16 uint16 sums per pass (saturating).
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                       uint16* dst_ptr, int src_width, int src_height) {
+  int tmp_height = 0;  // scratch: rows left in the current 16-byte column
+  intptr_t tmp_src = 0;  // scratch: pointer walking down the current column
+  asm volatile (
+    "mov       %0,%3                           \n"  // row pointer
+    "mov       %5,%2                           \n"  // height
+    "pxor      %%xmm0,%%xmm0                   \n"  // clear accumulators
+    "pxor      %%xmm1,%%xmm1                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for unpacking
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(3) ",%%xmm2         \n"  // 16 bytes of this row
+    "add       %6,%3                           \n"  // row pointer += src_stride
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklbw %%xmm4,%%xmm2                   \n"  // low 8 bytes -> words
+    "punpckhbw %%xmm4,%%xmm3                   \n"  // high 8 bytes -> words
+    "paddusw   %%xmm2,%%xmm0                   \n"  // saturating accumulate
+    "paddusw   %%xmm3,%%xmm1                   \n"
+    "sub       $0x1,%2                         \n"  // one row done
+    "jg        1b                              \n"  // next row of this column
+
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store sums 0-7
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"  // store sums 8-15
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_ptr += 32 bytes
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
+    "mov       %0,%3                           \n"  // row pointer
+    "mov       %5,%2                           \n"  // height
+    "pxor      %%xmm0,%%xmm0                   \n"  // clear accumulators
+    "pxor      %%xmm1,%%xmm1                   \n"
+    "sub       $0x10,%4                        \n"  // src_width -= 16
+    "jg        1b                              \n"  // next 16-byte column
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(tmp_height),  // %2
+    "+r"(tmp_src),     // %3
+    "+r"(src_width),   // %4
+    "+rm"(src_height)  // %5
+  : "rm"((intptr_t)(src_stride))  // %6
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+  );
+}
+
+// Bilinear column filtering. SSSE3 version.
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx) {
+  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;  // asm scratch registers
+  asm volatile (
+    "movd      %6,%%xmm2                       \n"  // xmm2 = x
+    "movd      %7,%%xmm3                       \n"  // xmm3 = dx
+    "movl      $0x04040000,%k2                 \n"  // pshufb control word
+    "movd      %k2,%%xmm5                      \n"  // xmm5 = shuffle control
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"  // all ones
+    "psrlw     $0x9,%%xmm6                     \n"  // every word = 0x007f
+    "pextrw    $0x1,%%xmm2,%k3                 \n"  // x0 = upper 16 bits of x
+    "subl      $0x2,%5                         \n"  // dst_width -= 2
+    "jl        29f                             \n"  // fewer than 2 left: tail
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "paddd     %%xmm3,%%xmm0                   \n"  // xmm0 = x + dx
+    "punpckldq %%xmm0,%%xmm2                   \n"  // xmm2 = {x, x + dx}
+    "punpckldq %%xmm3,%%xmm3                   \n"  // xmm3 = {dx, dx}
+    "paddd     %%xmm3,%%xmm3                   \n"  // xmm3 = {2*dx, 2*dx}
+    "pextrw    $0x3,%%xmm2,%k4                 \n"  // x1 = upper 16 of x + dx
+
+    LABELALIGN
+  "2:                                          \n"  // two output bytes per pass
+    "movdqa    %%xmm2,%%xmm1                   \n"  // keep current positions
+    "paddd     %%xmm3,%%xmm2                   \n"  // advance both by 2*dx
+    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
+    "movd      %k2,%%xmm0                      \n"  // 2 source bytes at x0
+    "psrlw     $0x9,%%xmm1                     \n"  // shift fractions to 7 bits
+    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
+    "movd      %k2,%%xmm4                      \n"  // 2 source bytes at x1
+    "pshufb    %%xmm5,%%xmm1                   \n"  // duplicate each fraction
+    "punpcklwd %%xmm4,%%xmm0                   \n"  // both byte pairs in xmm0
+    "pxor      %%xmm6,%%xmm1                   \n"  // low byte of pair ^= 0x7f
+    "pmaddubsw %%xmm1,%%xmm0                   \n"  // weighted sum of each pair
+    "pextrw    $0x1,%%xmm2,%k3                 \n"  // next x0
+    "pextrw    $0x3,%%xmm2,%k4                 \n"  // next x1
+    "psrlw     $0x7,%%xmm0                     \n"  // drop the 7 fraction bits
+    "packuswb  %%xmm0,%%xmm0                   \n"  // words -> saturated bytes
+    "movd      %%xmm0,%k2                      \n"
+    "mov       %w2," MEMACCESS(0) "            \n"  // store 2 output bytes
+    "lea       " MEMLEA(0x2,0) ",%0            \n"  // dst_ptr += 2
+    "sub       $0x2,%5                         \n"  // dst_width -= 2
+    "jge       2b                              \n"  // loop while >= 2 remained
+
+    LABELALIGN
+  "29:                                         \n"  // tail: at most 1 byte left
+    "addl      $0x1,%5                         \n"
+    "jl        99f                             \n"  // nothing left: done
+    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
+    "movd      %k2,%%xmm0                      \n"  // 2 source bytes at x0
+    "psrlw     $0x9,%%xmm2                     \n"
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "pxor      %%xmm6,%%xmm2                   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0,%k2                      \n"
+    "mov       %b2," MEMACCESS(0) "            \n"  // store 1 output byte
+  "99:                                         \n"
+  : "+r"(dst_ptr),     // %0
+    "+r"(src_ptr),     // %1
+    "+a"(temp_pixel),  // %2
+    "+r"(x0),          // %3
+    "+r"(x1),          // %4
+    "+rm"(dst_width)   // %5
+  : "rm"(x),           // %6
+    "rm"(dx)           // %7
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+// Reads 16 pixels, duplicates each one and writes 32 pixels per pass.
+// Loads and stores use movdqu; x and dx are not referenced in the body.
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"  // load 16 source bytes
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // src_ptr += 16
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // double the low 8 bytes
+    "punpckhbw %%xmm1,%%xmm1                   \n"  // double the high 8 bytes
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"  // store first 16 bytes
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"  // store next 16 bytes
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // dst_ptr += 32
+    "sub       $0x20,%2                         \n"  // dst_width -= 32
+    "jg        1b                              \n"  // loop while dst_width > 0
+
+  : "+r"(dst_ptr),     // %0
+    "+r"(src_ptr),     // %1
+    "+r"(dst_width)    // %2
+  :: "memory", "cc", "xmm0", "xmm1"
+  );
+}
+
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+                            ptrdiff_t src_stride,  // not referenced in body
+                            uint8* dst_argb, int dst_width) {
+  asm volatile (  // Each pass keeps the odd pixels: 8 source -> 4 dst.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // source pixels 0-3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // source pixels 4-7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_argb += 32 bytes
+    "shufps    $0xdd,%%xmm1,%%xmm0             \n"  // pick dwords 1,3 of each
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_argb += 16 bytes
+    "sub       $0x4,%2                         \n"  // dst_width -= 4
+    "jg        1b                              \n"  // loop while dst_width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(dst_width)  // %2
+  :: "memory", "cc", "xmm0", "xmm1"
+  );
+}
+
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,  // not referenced
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (  // Each pass averages pixel pairs: 8 source -> 4 dst.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // source pixels 0-3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // source pixels 4-7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_argb += 32 bytes
+    "movdqa    %%xmm0,%%xmm2                   \n"  // copy; xmm2 is written
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // xmm0 = even pixels
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // xmm2 = odd pixels
+    "pavgb     %%xmm2,%%xmm0                   \n"  // rounded byte average
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_argb += 16 bytes
+    "sub       $0x4,%2                         \n"  // dst_width -= 4
+    "jg        1b                              \n"  // loop while dst_width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(dst_width)  // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2"  // xmm2 is modified above
+  );
+}
+
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width) {
+  asm volatile (  // Each pass averages 2x2 blocks: 8 pixels x 2 rows -> 4 dst.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // first row, pixels 0-3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // first row, pixels 4-7
+    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu   (%0,%3,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu   0x10(%0,%3,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_argb += 32 bytes
+    "pavgb     %%xmm2,%%xmm0                   \n"  // average the two rows
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // xmm0 = even pixels
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // xmm2 = odd pixels
+    "pavgb     %%xmm2,%%xmm0                   \n"  // average the pixel pairs
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_argb += 16 bytes
+    "sub       $0x4,%2                         \n"  // dst_width -= 4
+    "jg        1b                              \n"  // loop while dst_width > 0
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+
+// Gathers 4 ARGB pixels spaced src_stepx pixels apart per pass.
+// dst_argb is written with movdqu; src_stride is not referenced here.
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width) {
+  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);  // scaled by 4 in the asm
+  intptr_t src_stepx_x12 = 0;  // set to 3 * src_stepx_x4 in the asm
+  asm volatile (
+    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"  // %1 = src_stepx * 4
+    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"  // %4 = %1 + %1 * 2
+    LABELALIGN
+  "1:                                          \n"
+    "movd      " MEMACCESS(0) ",%%xmm0         \n"  // pixel at step 0
+    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
+    "punpckldq %%xmm1,%%xmm0                   \n"  // xmm0 = pixels 0,1
+    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
+    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
+    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"  // src_argb += 4 steps
+    "punpckldq %%xmm3,%%xmm2                   \n"  // xmm2 = pixels 2,3
+    "punpcklqdq %%xmm2,%%xmm0                  \n"  // xmm0 = pixels 0-3
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,2) ",%2           \n"  // dst_argb += 16 bytes
+    "sub       $0x4,%3                         \n"  // dst_width -= 4
+    "jg        1b                              \n"  // loop while dst_width > 0
+  : "+r"(src_argb),      // %0
+    "+r"(src_stepx_x4),  // %1
+    "+r"(dst_argb),      // %2
+    "+r"(dst_width),     // %3
+    "+r"(src_stepx_x12)  // %4
+  :: "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+
+// Blends four 2x2 pixel blocks, src_stepx pixels apart, into 4 dst pixels.
+// dst_argb is written with movdqu, so the store tolerates any alignment.
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride, int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);  // scaled by 4 in the asm
+  intptr_t src_stepx_x12 = 0;  // set to 3 * src_stepx_x4 in the asm
+  intptr_t row1 = (intptr_t)(src_stride);  // becomes src_argb + src_stride
+  asm volatile (
+    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"  // %1 = src_stepx * 4
+    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"  // %4 = %1 + %1 * 2
+    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"  // %5 = second row pointer
+
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // row 0: 2 pixels, step 0
+    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
+    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
+    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
+    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"  // src_argb += 4 steps
+    "movq      " MEMACCESS(5) ",%%xmm2         \n"  // row 1: 2 pixels, step 0
+    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
+    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
+    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
+    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"  // row 1 pointer += 4 steps
+    "pavgb     %%xmm2,%%xmm0                   \n"  // average the two rows
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // xmm0 = even pixels
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // xmm2 = odd pixels
+    "pavgb     %%xmm2,%%xmm0                   \n"  // average the pixel pairs
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,2) ",%2           \n"  // dst_argb += 16 bytes
+    "sub       $0x4,%3                         \n"  // dst_width -= 4
+    "jg        1b                              \n"  // loop while dst_width > 0
+  : "+r"(src_argb),       // %0
+    "+r"(src_stepx_x4),   // %1
+    "+r"(dst_argb),       // %2
+    "+rm"(dst_width),     // %3
+    "+r"(src_stepx_x12),  // %4
+    "+r"(row1)            // %5
+  :: "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  intptr_t x0 = 0, x1 = 0;  // scratch source pixel indices (eax / edx)
+  asm volatile (
+    "movd      %5,%%xmm2                       \n"  // xmm2 = x
+    "movd      %6,%%xmm3                       \n"  // xmm3 = dx
+    "pshufd    $0x0,%%xmm2,%%xmm2              \n"  // xmm2 = {x, x, x, x}
+    "pshufd    $0x11,%%xmm3,%%xmm0             \n"  // xmm0 = {0, dx, 0, dx}
+    "paddd     %%xmm0,%%xmm2                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"  // lane 0 = 2 * dx
+    "pshufd    $0x5,%%xmm3,%%xmm0              \n"  // xmm0 = {0, 0, 2dx, 2dx}
+    "paddd     %%xmm0,%%xmm2                   \n"  // {x, x+dx, x+2dx, x+3dx}
+    "paddd     %%xmm3,%%xmm3                   \n"  // lane 0 = 4 * dx
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"  // xmm3 = 4 * dx, all lanes
+    "pextrw    $0x1,%%xmm2,%k0                 \n"  // x0 = upper 16 of lane 0
+    "pextrw    $0x3,%%xmm2,%k1                 \n"  // x1 = upper 16 of lane 1
+    "cmp       $0x0,%4                         \n"
+    "jl        99f                             \n"  // negative width: done
+    "sub       $0x4,%4                         \n"
+    "jl        49f                             \n"  // fewer than 4: go to tail
+
+    LABELALIGN
+  "40:                                         \n"  // four pixels per pass
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
+    "pextrw    $0x5,%%xmm2,%k0                 \n"  // x0 = upper 16 of lane 2
+    "pextrw    $0x7,%%xmm2,%k1                 \n"  // x1 = upper 16 of lane 3
+    "paddd     %%xmm3,%%xmm2                   \n"  // advance all lanes by 4dx
+    "punpckldq %%xmm1,%%xmm0                   \n"  // xmm0 = pixels 0,1
+    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
+    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
+    "pextrw    $0x1,%%xmm2,%k0                 \n"  // indices for next pass
+    "pextrw    $0x3,%%xmm2,%k1                 \n"
+    "punpckldq %%xmm4,%%xmm1                   \n"  // xmm1 = pixels 2,3
+    "punpcklqdq %%xmm1,%%xmm0                  \n"  // xmm0 = pixels 0-3
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,2) ",%2           \n"  // dst_argb += 16 bytes
+    "sub       $0x4,%4                         \n"  // dst_width -= 4
+    "jge       40b                             \n"  // loop while >= 4 remained
+
+  "49:                                         \n"  // tail: 0-3 pixels left
+    "test      $0x2,%4                         \n"
+    "je        29f                             \n"  // bit 1 clear: skip pair
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
+    "pextrw    $0x5,%%xmm2,%k0                 \n"  // x0 = upper 16 of lane 2
+    "punpckldq %%xmm1,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(2) "         \n"  // store 2 pixels
+    "lea       " MEMLEA(0x8,2) ",%2            \n"  // dst_argb += 8 bytes
+  "29:                                         \n"
+    "test      $0x1,%4                         \n"
+    "je        99f                             \n"  // bit 0 clear: done
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    "movd      %%xmm0," MEMACCESS(2) "         \n"  // store the last pixel
+  "99:                                         \n"
+  : "+a"(x0),          // %0
+    "+d"(x1),          // %1
+    "+r"(dst_argb),    // %2
+    "+r"(src_argb),    // %3
+    "+r"(dst_width)    // %4
+  : "rm"(x),           // %5
+    "rm"(dx)           // %6
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+  );
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Loads and stores use movdqu; x and dx are not referenced in the body.
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"  // load 4 source pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // src_argb += 16 bytes
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpckldq %%xmm0,%%xmm0                   \n"  // double pixels 0 and 1
+    "punpckhdq %%xmm1,%%xmm1                   \n"  // double pixels 2 and 3
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"  // store first 4 pixels
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"  // store next 4 pixels
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // dst_argb += 32 bytes
+    "sub       $0x8,%2                         \n"  // dst_width -= 8
+    "jg        1b                              \n"  // loop while dst_width > 0
+
+  : "+r"(dst_argb),    // %0
+    "+r"(src_argb),    // %1
+    "+r"(dst_width)    // %2
+  :: "memory", "cc", NACL_R14
+    "xmm0", "xmm1"
+  );
+}
+
+// pshufb table: interleaves matching channel bytes of 2 pixels for pmaddubsw
+static uvec8 kShuffleColARGB = {
+  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
+  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+};
+
+// pshufb table: copies byte 0 into bytes 0-7 and byte 4 into bytes 8-15
+static uvec8 kShuffleFractions = {
+  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+// Bilinear column filter: blends horizontally adjacent ARGB pixels. SSSE3.
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx) {
+  intptr_t x0 = 0, x1 = 0;  // scratch source pixel indices
+  asm volatile (  // Preload the shuffle tables into xmm4 and xmm5.
+    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kShuffleColARGB
+    "movdqa    %1,%%xmm5                       \n"  // xmm5 = kShuffleFractions
+  :
+  : "m"(kShuffleColARGB),  // %0
+    "m"(kShuffleFractions)  // %1
+  );  // NOTE(review): no clobber list, yet the asm below reads xmm4/xmm5.
+
+  asm volatile (
+    "movd      %5,%%xmm2                       \n"  // xmm2 = x
+    "movd      %6,%%xmm3                       \n"  // xmm3 = dx
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"  // all ones
+    "psrlw     $0x9,%%xmm6                     \n"  // every word = 0x007f
+    "pextrw    $0x1,%%xmm2,%k3                 \n"  // x0 = upper 16 bits of x
+    "sub       $0x2,%2                         \n"  // dst_width -= 2
+    "jl        29f                             \n"  // fewer than 2 left: tail
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "paddd     %%xmm3,%%xmm0                   \n"  // xmm0 = x + dx
+    "punpckldq %%xmm0,%%xmm2                   \n"  // xmm2 = {x, x + dx}
+    "punpckldq %%xmm3,%%xmm3                   \n"  // xmm3 = {dx, dx}
+    "paddd     %%xmm3,%%xmm3                   \n"  // xmm3 = {2*dx, 2*dx}
+    "pextrw    $0x3,%%xmm2,%k4                 \n"  // x1 = upper 16 of x + dx
+
+    LABELALIGN
+  "2:                                          \n"  // two output pixels per pass
+    "movdqa    %%xmm2,%%xmm1                   \n"  // keep current positions
+    "paddd     %%xmm3,%%xmm2                   \n"  // advance both by 2*dx
+    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
+    "psrlw     $0x9,%%xmm1                     \n"  // shift fractions to 7 bits
+    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
+    "pshufb    %%xmm5,%%xmm1                   \n"  // spread each fraction x8
+    "pshufb    %%xmm4,%%xmm0                   \n"  // pair up channel bytes
+    "pxor      %%xmm6,%%xmm1                   \n"  // low byte of pair ^= 0x7f
+    "pmaddubsw %%xmm1,%%xmm0                   \n"  // weighted sum per channel
+    "psrlw     $0x7,%%xmm0                     \n"  // drop the 7 fraction bits
+    "pextrw    $0x1,%%xmm2,%k3                 \n"  // next x0
+    "pextrw    $0x3,%%xmm2,%k4                 \n"  // next x1
+    "packuswb  %%xmm0,%%xmm0                   \n"  // words -> saturated bytes
+    "movq      %%xmm0," MEMACCESS(0) "         \n"  // store 2 pixels
+    "lea       " MEMLEA(0x8,0) ",%0            \n"  // dst_argb += 8 bytes
+    "sub       $0x2,%2                         \n"  // dst_width -= 2
+    "jge       2b                              \n"  // loop while >= 2 remained
+
+    LABELALIGN
+  "29:                                         \n"  // tail: at most 1 pixel
+    "add       $0x1,%2                         \n"
+    "jl        99f                             \n"  // nothing left: done
+    "psrlw     $0x9,%%xmm2                     \n"
+    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "pxor      %%xmm6,%%xmm2                   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0," MEMACCESS(0) "         \n"  // store the last pixel
+
+    LABELALIGN
+  "99:                                         \n"
+  : "+r"(dst_argb),    // %0
+    "+r"(src_argb),    // %1
+    "+rm"(dst_width),  // %2
+    "+r"(x0),          // %3
+    "+r"(x1)           // %4
+  : "rm"(x),           // %5
+    "rm"(dx)           // %6
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_X86(int num, int div) {
+  asm volatile (
+    "cdq                                       \n"  // sign-extend eax to edx
+    "shld      $0x10,%%eax,%%edx               \n"  // high half of edx:eax<<16
+    "shl       $0x10,%%eax                     \n"  // low half of edx:eax<<16
+    "idiv      %1                              \n"  // eax = edx:eax / div
+    "mov       %0, %%eax                       \n"  // no-op: %0 is bound to eax
+    : "+a"(num)  // %0
+    : "c"(div)   // %1
+    : "memory", "cc", "edx"
+  );
+  return num;
+}
+
+// Returns ((num << 16) - 0x10001) / (div - 1), i.e. 16.16 fixed point.
+int FixedDiv1_X86(int num, int div) {
+  asm volatile (
+    "cdq                                       \n"  // sign-extend eax to edx
+    "shld      $0x10,%%eax,%%edx               \n"  // high half of edx:eax<<16
+    "shl       $0x10,%%eax                     \n"  // low half of edx:eax<<16
+    "sub       $0x10001,%%eax                  \n"  // edx:eax -= 0x10001
+    "sbb       $0x0,%%edx                      \n"  // propagate the borrow
+    "sub       $0x1,%1                         \n"  // div -= 1 (writes %1)
+    "idiv      %1                              \n"  // eax = edx:eax / (div-1)
+    "mov       %0, %%eax                       \n"  // no-op: %0 is bound to eax
+    : "+a"(num),  // %0
+      "+c"(div)   // %1  read-write because the asm decrements it
+    :: "memory", "cc", "edx"
+  );
+  return num;
+}
+
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/scale_mips.cc b/libs/libvpx/third_party/libyuv/source/scale_mips.cc
new file mode 100644
index 0000000000..3eb4f27c45
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/scale_mips.cc
@@ -0,0 +1,654 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC MIPS DSPR2
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width) {  // Keeps 1 of every 2 source bytes of one row; src_stride is not referenced.
+  __asm__ __volatile__(
+    ".set push                                     \n"
+    ".set noreorder                                \n"
+
+    "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
+    "beqz           $t9, 2f                        \n"
+    " nop                                          \n"
+
+    ".p2align       2                              \n"
+  "1:                                              \n"  // main loop: 32 source bytes -> 16 output bytes
+    "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
+    "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
+    "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
+    "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
+    "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
+    "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
+    "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
+    "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
+    // TODO(fbarchard): Use odd pixels instead of even.
+    "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
+    "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
+    "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
+    "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
+    "addiu          %[src_ptr], %[src_ptr], 32     \n"
+    "addiu          $t9, $t9, -1                   \n"
+    "sw             $t8, 0(%[dst])                 \n"
+    "sw             $t0, 4(%[dst])                 \n"
+    "sw             $t1, 8(%[dst])                 \n"
+    "sw             $t2, 12(%[dst])                \n"
+    "bgtz           $t9, 1b                        \n"
+    " addiu         %[dst], %[dst], 16             \n"  // branch delay slot
+
+  "2:                                              \n"
+    "andi           $t9, %[dst_width], 0xf         \n"  // residue
+    "beqz           $t9, 3f                        \n"
+    " nop                                          \n"
+
+  "21:                                             \n"  // residue loop: one output byte per pass
+    "lbu            $t0, 0(%[src_ptr])             \n"
+    "addiu          %[src_ptr], %[src_ptr], 2      \n"
+    "addiu          $t9, $t9, -1                   \n"
+    "sb             $t0, 0(%[dst])                 \n"
+    "bgtz           $t9, 21b                       \n"
+    " addiu         %[dst], %[dst], 1              \n"  // branch delay slot
+
+  "3:                                              \n"
+    ".set pop                                      \n"
+  : [src_ptr] "+r" (src_ptr),
+    [dst] "+r" (dst)
+  : [dst_width] "r" (dst_width)
+  : "memory", "t0", "t1", "t2", "t3", "t4", "t5",  // "memory": the asm loads from src_ptr and stores to dst
+    "t6", "t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                 uint8* dst, int dst_width) {  // 2x2 box filter: each output byte is the rounded average of a 2x2 source block.
+  const uint8* t = src_ptr + src_stride;  // second source row
+
+  __asm__ __volatile__ (
+    ".set push                                    \n"
+    ".set noreorder                               \n"
+
+    "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
+    "beqz           $t9, 2f                       \n"  // skip the 8-pixel loop when dst_width < 8 (srl result is never negative)
+    " nop                                         \n"
+
+    ".p2align       2                             \n"
+  "1:                                             \n"
+    "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
+    "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
+    "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
+    "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
+    "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
+    "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
+    "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
+    "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
+    "addiu          $t9, $t9, -1                  \n"
+    "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
+    "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
+    "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
+    "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
+    "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
+    "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
+    "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
+    "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
+    "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
+    "ins            $t5, $t8, 0, 16               \n"  // |23|22|7|6|
+    "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
+    "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
+    "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
+    "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
+    "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
+    "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
+    "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
+    "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
+    "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
+    "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
+    "shra_r.w       $t6, $t6, 2                   \n"  // |t6+2|>>2
+    "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
+    "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
+    "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
+    "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
+    "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
+    "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
+    "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
+    "addiu          %[src_ptr], %[src_ptr], 16    \n"
+    "addiu          %[t], %[t], 16                \n"
+    "sb             $t0, 0(%[dst])                \n"
+    "sb             $t4, 1(%[dst])                \n"
+    "sb             $t1, 2(%[dst])                \n"
+    "sb             $t5, 3(%[dst])                \n"
+    "sb             $t2, 4(%[dst])                \n"
+    "sb             $t6, 5(%[dst])                \n"
+    "sb             $t3, 6(%[dst])                \n"
+    "sb             $t7, 7(%[dst])                \n"
+    "bgtz           $t9, 1b                       \n"
+    " addiu         %[dst], %[dst], 8             \n"  // branch delay slot
+
+  "2:                                             \n"
+    "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
+    "beqz           $t9, 3f                       \n"
+    " nop                                         \n"
+
+    "21:                                          \n"  // residue loop: two output bytes per pass
+    "lwr            $t1, 0(%[src_ptr])            \n"  // lwr/lwl pair: unaligned 4-byte load
+    "lwl            $t1, 3(%[src_ptr])            \n"
+    "lwr            $t2, 0(%[t])                  \n"
+    "lwl            $t2, 3(%[t])                  \n"
+    "srl            $t8, $t1, 16                  \n"
+    "ins            $t1, $t2, 16, 16              \n"
+    "ins            $t2, $t8, 0, 16               \n"
+    "raddu.w.qb     $t1, $t1                      \n"
+    "raddu.w.qb     $t2, $t2                      \n"
+    "shra_r.w       $t1, $t1, 2                   \n"
+    "shra_r.w       $t2, $t2, 2                   \n"
+    "sb             $t1, 0(%[dst])                \n"
+    "sb             $t2, 1(%[dst])                \n"
+    "addiu          %[src_ptr], %[src_ptr], 4     \n"
+    "addiu          $t9, $t9, -2                  \n"  // NOTE(review): pairs only; an odd residue stores one byte beyond dst_width
+    "addiu          %[t], %[t], 4                 \n"
+    "bgtz           $t9, 21b                      \n"
+    " addiu         %[dst], %[dst], 2             \n"  // branch delay slot
+
+  "3:                                             \n"
+    ".set pop                                     \n"
+
+  : [src_ptr] "+r" (src_ptr),
+    [dst] "+r" (dst), [t] "+r" (t)
+  : [dst_width] "r" (dst_width)
+  : "memory", "t0", "t1", "t2", "t3", "t4", "t5",  // "memory": the asm loads from both rows and stores to dst
+    "t6", "t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width) {  // Keeps 1 of every 4 source bytes of one row; src_stride is not referenced.
+  __asm__ __volatile__ (
+      ".set push                                    \n"
+      ".set noreorder                               \n"
+
+      "srl            $t9, %[dst_width], 3          \n"  // main-loop iterations, 8 output bytes each
+      "beqz           $t9, 2f                       \n"
+      " nop                                         \n"
+
+      ".p2align       2                             \n"
+     "1:                                            \n"  // main loop: 32 source bytes -> 8 output bytes
+      "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
+      "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
+      "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
+      "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
+      "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
+      "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
+      "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
+      "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
+      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
+      "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
+      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
+      "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
+      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
+      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
+      "addiu          %[src_ptr], %[src_ptr], 32    \n"
+      "addiu          $t9, $t9, -1                  \n"
+      "sw             $t1, 0(%[dst])                \n"
+      "sw             $t5, 4(%[dst])                \n"
+      "bgtz           $t9, 1b                       \n"
+      " addiu         %[dst], %[dst], 8             \n"  // branch delay slot
+
+    "2:                                             \n"
+      "andi           $t9, %[dst_width], 7          \n"  // residue
+      "beqz           $t9, 3f                       \n"
+      " nop                                         \n"
+
+    "21:                                            \n"  // residue loop: one output byte per pass
+      "lbu            $t1, 0(%[src_ptr])            \n"
+      "addiu          %[src_ptr], %[src_ptr], 4     \n"
+      "addiu          $t9, $t9, -1                  \n"
+      "sb             $t1, 0(%[dst])                \n"
+      "bgtz           $t9, 21b                      \n"
+      " addiu         %[dst], %[dst], 1             \n"  // branch delay slot
+
+    "3:                                             \n"
+      ".set pop                                     \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst)
+      : [dst_width] "r" (dst_width)
+      : "memory", "t1", "t2", "t3", "t4", "t5",  // "memory": the asm loads from src_ptr and stores to dst
+        "t6", "t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                 uint8* dst, int dst_width) {  // 4x4 box filter: each output byte is the rounded average of 16 source bytes.
+  intptr_t stride = src_stride;
+  const uint8* s1 = src_ptr + stride;  // rows 1..3 below src_ptr
+  const uint8* s2 = s1 + stride;
+  const uint8* s3 = s2 + stride;
+
+  __asm__ __volatile__ (
+      ".set push                                  \n"
+      ".set noreorder                             \n"
+
+      "srl           $t9, %[dst_width], 1         \n"  // pairs of output pixels
+      "andi          $t8, %[dst_width], 1         \n"  // 1 if a single trailing pixel remains
+
+      ".p2align      2                            \n"
+     "1:                                          \n"  // NOTE(review): no zero-iteration guard; the body runs once even when dst_width < 2
+      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
+      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
+      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
+      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
+      "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
+      "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
+      "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
+      "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
+      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
+      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
+      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
+      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
+      "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
+      "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
+      "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
+      "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
+      "add           $t0, $t0, $t1                \n"
+      "add           $t1, $t2, $t3                \n"
+      "add           $t0, $t0, $t1                \n"  // sum of the first 4x4 block
+      "add           $t4, $t4, $t5                \n"
+      "add           $t6, $t6, $t7                \n"
+      "add           $t4, $t4, $t6                \n"  // sum of the second 4x4 block
+      "shra_r.w      $t0, $t0, 4                  \n"  // rounded divide by 16
+      "shra_r.w      $t4, $t4, 4                  \n"
+      "sb            $t0, 0(%[dst])               \n"
+      "sb            $t4, 1(%[dst])               \n"
+      "addiu         %[src_ptr], %[src_ptr], 8    \n"
+      "addiu         %[s1], %[s1], 8              \n"
+      "addiu         %[s2], %[s2], 8              \n"
+      "addiu         %[s3], %[s3], 8              \n"
+      "addiu         $t9, $t9, -1                 \n"
+      "bgtz          $t9, 1b                      \n"
+      " addiu        %[dst], %[dst], 2            \n"  // branch delay slot
+      "beqz          $t8, 2f                      \n"  // done unless one odd pixel is left
+      " nop                                       \n"
+
+      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
+      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
+      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
+      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
+      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
+      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
+      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
+      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
+      "add           $t0, $t0, $t1                \n"
+      "add           $t1, $t2, $t3                \n"
+      "add           $t0, $t0, $t1                \n"
+      "shra_r.w      $t0, $t0, 4                  \n"
+      "sb            $t0, 0(%[dst])               \n"
+
+      "2:                                         \n"
+      ".set pop                                   \n"
+
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst),
+        [s1] "+r" (s1),
+        [s2] "+r" (s2),
+        [s3] "+r" (s3)
+      : [dst_width] "r" (dst_width)
+      : "memory", "t0", "t1", "t2", "t3", "t4", "t5",  // "memory": the asm loads from four rows and stores to dst
+        "t6","t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {  // 3/4 point sample: 32 source bytes -> 24 output bytes per pass; src_stride is not referenced.
+  __asm__ __volatile__ (
+      ".set push                                          \n"
+      ".set noreorder                                     \n"
+      ".p2align        2                                  \n"
+    "1:                                                   \n"  // do-while loop ended by bnez: dst_width must be a positive multiple of 24
+      "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
+      "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
+      "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
+      "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
+      "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
+      "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
+      "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
+      "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
+      "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
+      "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|29|
+      "addiu           %[dst_width], %[dst_width], -24    \n"
+      "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
+      "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
+      "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
+      "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
+      "addiu           %[src_ptr], %[src_ptr], 32         \n"
+      "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
+      "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
+      "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
+      "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
+      "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
+      "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
+      "sw              $t1, 0(%[dst])                     \n"
+      "sw              $t0, 4(%[dst])                     \n"
+      "sw              $t3, 8(%[dst])                     \n"
+      "sw              $t5, 12(%[dst])                    \n"
+      "sw              $t9, 16(%[dst])                    \n"
+      "sw              $t7, 20(%[dst])                    \n"
+      "bnez            %[dst_width], 1b                   \n"
+      " addiu          %[dst], %[dst], 24                 \n"  // branch delay slot
+      ".set pop                                           \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst),
+        [dst_width] "+r" (dst_width)
+      :
+      : "memory", "t0", "t1", "t2", "t3", "t4", "t5",  // "memory": the asm loads from src_ptr and stores to dst
+        "t6","t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* d, int dst_width) {  // Filters rows src_ptr and src_ptr + src_stride: 4 source bytes per row -> 3 output bytes per pass.
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "repl.ph           $t3, 3                          \n"  // 0x00030003
+
+     ".p2align           2                               \n"
+    "1:                                                  \n"  // do-while loop: the body always runs at least once
+      "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
+      "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
+      "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
+      "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
+      "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
+      "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
+      "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
+      "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
+      "raddu.w.qb        $t0, $t0                        \n"  // S2+S1
+      "raddu.w.qb        $t1, $t1                        \n"  // T2+T1
+      "shra_r.w          $t0, $t0, 1                     \n"  // rounded (S2+S1)/2
+      "shra_r.w          $t1, $t1, 1                     \n"  // rounded (T2+T1)/2
+      "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
+      "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
+      "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
+      "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
+      "addu.ph           $t2, $t2, $t4                   \n"
+      "addu.ph           $t6, $t6, $t5                   \n"
+      "sll               $t5, $t0, 1                     \n"
+      "add               $t0, $t5, $t0                   \n"  // t0 = 3 * (S row middle value)
+      "shra_r.ph         $t2, $t2, 2                     \n"
+      "shra_r.ph         $t6, $t6, 2                     \n"
+      "shll.ph           $t4, $t2, 1                     \n"
+      "addq.ph           $t4, $t4, $t2                   \n"  // t4 = 3 * t2 per halfword
+      "addu              $t0, $t0, $t1                   \n"
+      "addiu             %[src_ptr], %[src_ptr], 4       \n"
+      "shra_r.w          $t0, $t0, 2                     \n"
+      "addu.ph           $t6, $t6, $t4                   \n"
+      "shra_r.ph         $t6, $t6, 2                     \n"
+      "srl               $t1, $t6, 16                    \n"
+      "addiu             %[dst_width], %[dst_width], -3  \n"
+      "sb                $t1, 0(%[d])                    \n"
+      "sb                $t0, 1(%[d])                    \n"
+      "sb                $t6, 2(%[d])                    \n"
+      "bgtz              %[dst_width], 1b                \n"
+      " addiu            %[d], %[d], 3                   \n"  // branch delay slot
+    "3:                                                  \n"  // label is not referenced by any branch
+      ".set pop                                          \n"
+      : [src_ptr] "+r" (src_ptr),
+        [src_stride] "+r" (src_stride),
+        [d] "+r" (d),
+        [dst_width] "+r" (dst_width)
+      :
+      : "memory", "t0", "t1", "t2", "t3",  // "memory": the asm loads from both rows and stores to d
+        "t4", "t5", "t6"
+  );
+}
+
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* d, int dst_width) {  // Filters rows src_ptr and src_ptr + src_stride: 4 source bytes per row -> 3 output bytes per pass.
+  __asm__ __volatile__ (
+      ".set push                                           \n"
+      ".set noreorder                                      \n"
+      "repl.ph           $t2, 3                            \n"  // 0x00030003
+
+      ".p2align          2                                 \n"
+    "1:                                                    \n"  // do-while loop: the body always runs at least once
+      "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
+      "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
+      "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
+      "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
+      "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
+      "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
+      "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
+      "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
+      "raddu.w.qb        $t0, $t0                          \n"  // S2+S1
+      "raddu.w.qb        $t1, $t1                          \n"  // T2+T1
+      "shra_r.w          $t0, $t0, 1                       \n"  // rounded (S2+S1)/2
+      "shra_r.w          $t1, $t1, 1                       \n"  // rounded (T2+T1)/2
+      "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
+      "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
+      "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
+      "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
+      "addu.ph           $t4, $t4, $t3                     \n"
+      "addu.ph           $t6, $t6, $t5                     \n"
+      "shra_r.ph         $t6, $t6, 2                       \n"
+      "shra_r.ph         $t4, $t4, 2                       \n"
+      "addu.ph           $t6, $t6, $t4                     \n"  // add the S-row and T-row results
+      "addiu             %[src_ptr], %[src_ptr], 4         \n"
+      "shra_r.ph         $t6, $t6, 1                       \n"  // rounded halve: equal weight for both rows
+      "addu              $t0, $t0, $t1                     \n"
+      "addiu             %[dst_width], %[dst_width], -3    \n"
+      "shra_r.w          $t0, $t0, 1                       \n"
+      "srl               $t1, $t6, 16                      \n"
+      "sb                $t1, 0(%[d])                      \n"
+      "sb                $t0, 1(%[d])                      \n"
+      "sb                $t6, 2(%[d])                      \n"
+      "bgtz              %[dst_width], 1b                  \n"
+      " addiu            %[d], %[d], 3                     \n"  // branch delay slot
+    "3:                                                    \n"  // label is not referenced by any branch
+      ".set pop                                            \n"
+      : [src_ptr] "+r" (src_ptr),
+        [src_stride] "+r" (src_stride),
+        [d] "+r" (d),
+        [dst_width] "+r" (dst_width)
+      :
+      : "memory", "t0", "t1", "t2", "t3",  // "memory": the asm loads from both rows and stores to d
+        "t4", "t5", "t6"
+  );
+}
+
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {  // 3/8 point sample: 32 source bytes -> 12 output bytes per pass; src_stride is not referenced.
+  __asm__ __volatile__ (
+      ".set push                                     \n"
+      ".set noreorder                                \n"
+
+      ".p2align   2                                  \n"
+    "1:                                              \n"  // do-while loop: the body always runs at least once
+      "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
+      "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
+      "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
+      "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
+      "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
+      "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
+      "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
+      "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
+      "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
+      "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
+      "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
+      "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
+      "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
+      "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
+      "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
+      "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
+      "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
+      "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
+      "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
+      "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
+      "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
+      "addiu      %[src_ptr], %[src_ptr], 32         \n"
+      "addiu      %[dst_width], %[dst_width], -12    \n"
+      "addiu      $t8,%[dst_width], -12              \n"  // t8 >= 0 while another full 12-byte group remains
+      "sw         $t1, 0(%[dst])                     \n"
+      "sw         $t4, 4(%[dst])                     \n"
+      "sw         $t6, 8(%[dst])                     \n"
+      "bgez       $t8, 1b                            \n"
+      " addiu     %[dst], %[dst], 12                 \n"  // branch delay slot
+      ".set pop                                      \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst),
+        [dst_width] "+r" (dst_width)
+      :
+      : "memory", "t0", "t1", "t2", "t3", "t4",  // "memory": the asm loads from src_ptr and stores to dst
+        "t5", "t6", "t7", "t8"
+  );
+}
+
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width) {  // 3/8 box filter over 2 rows: 8 source bytes per row -> 3 output bytes per pass.
+  intptr_t stride = src_stride;
+  const uint8* t = src_ptr + stride;  // second source row
+  const int c = 0x2AAA;  // ~65536/6: (sum * c) >> 16 approximates sum / 6
+
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+
+      ".p2align        2                                 \n"
+    "1:                                                  \n"  // do-while loop: the body always runs at least once
+      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
+      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
+      "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
+      "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
+      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
+      "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
+      "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
+      "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
+      "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
+      "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
+      "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
+      "srl             $t4, $t4, 2                       \n"  // t4 / 4
+      "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
+      "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
+      "addu            $t6, $t5, $t6                     \n"
+      "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
+      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
+      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
+      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
+      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
+      "addu            $t0, $t0, $t2                     \n"
+      "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
+      "addiu           %[src_ptr], %[src_ptr], 8         \n"
+      "addiu           %[t], %[t], 8                     \n"
+      "addiu           %[dst_width], %[dst_width], -3    \n"
+      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"  // advanced early; the stores below use negative offsets
+      "srl             $t6, $t6, 16                      \n"
+      "srl             $t0, $t0, 16                      \n"
+      "sb              $t4, -1(%[dst_ptr])               \n"
+      "sb              $t6, -2(%[dst_ptr])               \n"
+      "bgtz            %[dst_width], 1b                  \n"
+      " sb             $t0, -3(%[dst_ptr])               \n"  // branch delay slot
+      ".set pop                                          \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst_ptr] "+r" (dst_ptr),
+        [t] "+r" (t),
+        [dst_width] "+r" (dst_width)
+      : [c] "r" (c)
+      : "memory", "hi", "lo", "t0", "t1", "t2", "t3", "t4", "t5", "t6"  // "memory": loads/stores; "hi"/"lo": mul leaves HI/LO unpredictable
+  );
+}
+
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width) {  // 3/8 box filter over 3 rows: 8 source bytes per row -> 3 output bytes per pass.
+  intptr_t stride = src_stride;
+  const uint8* s1 = src_ptr + stride;  // second source row
+  stride += stride;
+  const uint8* s2 = src_ptr + stride;  // third source row
+  const int c1 = 0x1C71;  // ~65536/9: (sum * c1) >> 16 approximates sum / 9
+  const int c2 = 0x2AAA;  // ~65536/6: (sum * c2) >> 16 approximates sum / 6
+
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+
+      ".p2align        2                                 \n"
+    "1:                                                  \n"  // do-while loop: the body always runs at least once
+      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
+      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
+      "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
+      "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
+      "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
+      "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
+      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
+      "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
+      "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
+      "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
+      "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
+      "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
+      "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
+      "addu            $t7, $t7, $t8                     \n"
+      "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
+      "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
+      "addu            $t6, $t6, $t8                     \n"
+      "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
+      "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
+      "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
+      "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
+      "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
+      "addu            $t7, $t7, $t8                     \n"
+      "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
+      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
+      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
+      "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
+      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0
+      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0
+      "raddu.w.qb      $t4, $t4                          \n"  // R2+R1+R0
+      "addu            $t0, $t0, $t2                     \n"
+      "addu            $t0, $t0, $t4                     \n"
+      "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
+      "addiu           %[src_ptr], %[src_ptr], 8         \n"
+      "addiu           %[s1], %[s1], 8                   \n"
+      "addiu           %[s2], %[s2], 8                   \n"
+      "addiu           %[dst_width], %[dst_width], -3    \n"
+      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"  // advanced early; the stores below use negative offsets
+      "srl             $t6, $t6, 16                      \n"
+      "srl             $t7, $t7, 16                      \n"
+      "srl             $t0, $t0, 16                      \n"
+      "sb              $t6, -1(%[dst_ptr])               \n"
+      "sb              $t7, -2(%[dst_ptr])               \n"
+      "bgtz            %[dst_width], 1b                  \n"
+      " sb             $t0, -3(%[dst_ptr])               \n"  // branch delay slot
+      ".set pop                                          \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst_ptr] "+r" (dst_ptr),
+        [s1] "+r" (s1),
+        [s2] "+r" (s2),
+        [dst_width] "+r" (dst_width)
+      : [c1] "r" (c1), [c2] "r" (c2)
+      : "memory", "hi", "lo", "t0", "t1", "t2", "t3", "t4",  // "memory": loads/stores; "hi"/"lo": mul leaves HI/LO unpredictable
+        "t5", "t6", "t7", "t8"
+  );
+}
+
+#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/libs/libvpx/third_party/libyuv/source/scale_neon.cc b/libs/libvpx/third_party/libyuv/source/scale_neon.cc
new file mode 100644
index 0000000000..7825878e98
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/scale_neon.cc
@@ -0,0 +1,1037 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
+
+// NEON downscalers with interpolation.
+// Provided by Fritz Koenig
+
+// Point samples 2:1. Reads 32x1, discards the even pixels, writes 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    // de-interleave: even pixels into q0, odd into q1
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "q0", "q1"  // asm stores through %1 and subs sets flags
+  );
+}
+
+// Reads 32x1, averages each horizontal pixel pair (rounded), writes 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // add adjacent -> 16 bit
+    "vpaddl.u8  q1, q1                         \n"
+    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #1                     \n"
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 16 averaged pixels
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "q0", "q1"  // asm stores through %1 and subs sets flags
+  );
+}
+
+// Reads 32x2, averages each 2x2 box (rounded), writes 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer: %1 = src_ptr + src_stride
+    "add        %1, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
+    MEMACCESS(1)
+    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
+    "vpaddl.u8  q1, q1                         \n"
+    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
+    "vpadal.u8  q1, q3                         \n"
+    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #2                     \n"
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"  // store 16 box averages
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3"  // stores via %2; subs sets flags
+  );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (  // Point samples 4:1, keeping byte 2 of every 4 source bytes.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // de-interleave 32 bytes
+    "subs       %2, %2, #8                     \n" // 8 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {d2}, [%1]!                    \n" // store bytes 2, 6, 10, ...
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1", "memory", "cc"  // Clobber List
+  );
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride;      // row 1
+  const uint8* src_ptr2 = src_ptr + src_stride * 2;  // row 2
+  const uint8* src_ptr3 = src_ptr + src_stride * 3;  // row 3
+asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
+    MEMACCESS(3)
+    "vld1.8     {q1}, [%3]!                    \n"
+    MEMACCESS(4)
+    "vld1.8     {q2}, [%4]!                    \n"
+    MEMACCESS(5)
+    "vld1.8     {q3}, [%5]!                    \n"
+    "subs       %2, %2, #4                     \n"   // 4 processed per loop
+    "vpaddl.u8  q0, q0                         \n"   // row 0 add adjacent
+    "vpadal.u8  q0, q1                         \n"   // + row 1 add adjacent
+    "vpadal.u8  q0, q2                         \n"   // + row 2 add adjacent
+    "vpadal.u8  q0, q3                         \n"   // + row 3 add adjacent
+    "vpaddl.u16 q0, q0                         \n"   // add pairs -> 4x4 sums
+    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
+    "vmovn.u16  d0, q0                         \n"   // narrow to 8 bit
+    MEMACCESS(1)
+    "vst1.32    {d0[0]}, [%1]!                 \n"   // store 4 pixels
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width), // %2
+    "+r"(src_ptr1),  // %3
+    "+r"(src_ptr2),  // %4
+    "+r"(src_ptr3)   // %5
+  :
+  : "q0", "q1", "q2", "q3", "memory", "cc"  // Clobber List
+  );
+}
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    "subs       %2, %2, #24                  \n" // 24 processed per loop
+    "vmov       d2, d3                       \n" // order d0, d1, d2
+    MEMACCESS(1)
+    "vst3.8     {d0, d1, d2}, [%1]!          \n" // interleave and store 24
+    "bgt        1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "d0", "d1", "d2", "d3", "memory", "cc"  // Clobber List
+  );
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (  // 32x2 -> 24x1, rows weighted 3:1 (line 0 : line 1).
+    "vmov.u8    d24, #3                        \n"  // filter weight 3
+    "add        %3, %0                         \n"  // %3 = row 1 pointer
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
+    "subs         %2, %2, #24                  \n" // 24 processed per loop
+
+    // filter src line 0 with src line 1
+    // expand chars to shorts to allow for room
+    // when adding lines together
+    "vmovl.u8     q8, d4                       \n"
+    "vmovl.u8     q9, d5                       \n"
+    "vmovl.u8     q10, d6                      \n"
+    "vmovl.u8     q11, d7                      \n"
+
+    // 3 * line_0 + line_1
+    "vmlal.u8     q8, d0, d24                  \n"
+    "vmlal.u8     q9, d1, d24                  \n"
+    "vmlal.u8     q10, d2, d24                 \n"
+    "vmlal.u8     q11, d3, d24                 \n"
+
+    // (3 * line_0 + line_1) >> 2, rounded and narrowed back to 8 bit
+    "vqrshrn.u16  d0, q8, #2                   \n"
+    "vqrshrn.u16  d1, q9, #2                   \n"
+    "vqrshrn.u16  d2, q10, #2                  \n"
+    "vqrshrn.u16  d3, q11, #2                  \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "vmovl.u8     q8, d1                       \n"
+    "vmlal.u8     q8, d0, d24                  \n"
+    "vqrshrn.u16  d0, q8, #2                   \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "vrhadd.u8    d1, d1, d2                   \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "vmovl.u8     q8, d2                       \n"
+    "vmlal.u8     q8, d3, d24                  \n"
+    "vqrshrn.u16  d2, q8, #2                   \n"
+
+    MEMACCESS(1)
+    "vst3.8       {d0, d1, d2}, [%1]!          \n" // interleave and store 24
+
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+  );
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (  // 32x2 -> 24x1, the two rows averaged 1:1.
+    "vmov.u8    d24, #3                        \n"  // filter weight 3
+    "add        %3, %0                         \n"  // %3 = row 1 pointer
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
+    "subs         %2, %2, #24                  \n" // 24 processed per loop
+    // average src line 0 with src line 1
+    "vrhadd.u8    q0, q0, q2                   \n"
+    "vrhadd.u8    q1, q1, q3                   \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "vmovl.u8     q3, d1                       \n"
+    "vmlal.u8     q3, d0, d24                  \n"
+    "vqrshrn.u16  d0, q3, #2                   \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "vrhadd.u8    d1, d1, d2                   \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "vmovl.u8     q3, d2                       \n"
+    "vmlal.u8     q3, d3, d24                  \n"
+    "vqrshrn.u16  d2, q3, #2                   \n"
+
+    MEMACCESS(1)
+    "vst3.8       {d0, d1, d2}, [%1]!          \n" // interleave and store 24
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "q0", "q1", "q2", "q3", "d24", "memory", "cc"  // no core regs are used
+  );
+}
+
+#define HAS_SCALEROWDOWN38_NEON
+static uvec8 kShuf38 =  // vtbl byte indices used by ScaleRowDown38_NEON
+  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =  // vtbl byte indices used by the 38 box filters
+  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =  // with vqrdmulh (doubling multiply): sum / 6
+  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =  // with vqrdmulh (doubling multiply): sum / 9
+  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12. Point samples using the kShuf38 table lookup.
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {q3}, [%3]                     \n"  // d6, d7 = shuffle table
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32 source bytes
+    "subs       %2, %2, #12                    \n"  // 12 processed per loop
+    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"  // pick output bytes 0..7
+    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"  // pick output bytes 8..11
+    MEMACCESS(1)
+    "vst1.8     {d4}, [%1]!                    \n"  // store 8 bytes
+    MEMACCESS(1)
+    "vst1.32    {d5[0]}, [%1]!                 \n"  // store remaining 4 bytes
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  : "r"(&kShuf38)           // %3
+  : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "memory", "cc"
+  );
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride * 2;  // third source row
+
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.16    {q13}, [%5]                    \n"  // q13 = kMult38_Div6
+    MEMACCESS(6)
+    "vld1.8     {q14}, [%6]                    \n"  // q14 = kShuf38_2
+    MEMACCESS(7)
+    "vld1.8     {q15}, [%7]                    \n"  // q15 = kMult38_Div9
+    "add        %3, %0                         \n"  // %3 = second source row
+    ".p2align   2                              \n"
+  "1:                                          \n"
+
+    // d0 = 00 40 01 41 02 42 03 43
+    // d1 = 10 50 11 51 12 52 13 53
+    // d2 = 20 60 21 61 22 62 23 63
+    // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+    MEMACCESS(4)
+    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
+    "subs         %2, %2, #12                  \n"  // 12 processed per loop
+
+    // Shuffle the input data around to align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // d0 = 00 10 01 11 02 12 03 13
+    // d1 = 40 50 41 51 42 52 43 53
+    "vtrn.u8      d0, d1                       \n"
+    "vtrn.u8      d4, d5                       \n"
+    "vtrn.u8      d16, d17                     \n"
+
+    // d2 = 20 30 21 31 22 32 23 33
+    // d3 = 60 70 61 71 62 72 63 73
+    "vtrn.u8      d2, d3                       \n"
+    "vtrn.u8      d6, d7                       \n"
+    "vtrn.u8      d18, d19                     \n"
+
+    // d0 = 00+10 01+11 02+12 03+13
+    // d1 = 40+50 41+51 42+52 43+53
+    "vpaddl.u8    q0, q0                       \n"
+    "vpaddl.u8    q2, q2                       \n"
+    "vpaddl.u8    q8, q8                       \n"
+
+    // d3 = 60+70 61+71 62+72 63+73
+    "vpaddl.u8    d3, d3                       \n"
+    "vpaddl.u8    d7, d7                       \n"
+    "vpaddl.u8    d19, d19                     \n"
+
+    // combine source lines
+    "vadd.u16     q0, q2                       \n"
+    "vadd.u16     q0, q8                       \n"
+    "vadd.u16     d4, d3, d7                   \n"
+    "vadd.u16     d4, d19                      \n"
+
+    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+    //             + s[6 + st * 1] + s[7 + st * 1]
+    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+    "vqrdmulh.s16 q2, q2, q13                  \n"
+    "vmovn.u16    d4, q2                       \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "vmovl.u8     q1, d2                       \n"
+    "vmovl.u8     q3, d6                       \n"
+    "vmovl.u8     q9, d18                      \n"
+
+    // combine source lines
+    "vadd.u16     q1, q3                       \n"
+    "vadd.u16     q1, q9                       \n"
+
+    // d2 = xx 20 xx 30 xx 22 xx 32
+    // d3 = xx 21 xx 31 xx 23 xx 33
+    "vtrn.u32     d2, d3                       \n"
+
+    // d2 = xx 20 xx 21 xx 22 xx 23
+    // d3 = xx 30 xx 31 xx 32 xx 33
+    "vtrn.u16     d2, d3                       \n"
+
+    // 0+1+2, 3+4+5
+    "vadd.u16     q0, q1                       \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "vqrdmulh.s16 q0, q0, q15                  \n"
+
+    // Align for table lookup, vtbl requires registers to
+    //  be adjacent
+    "vmov.u8      d2, d4                       \n"
+
+    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
+
+    MEMACCESS(1)
+    "vst1.8       {d3}, [%1]!                  \n"  // store 8 bytes
+    MEMACCESS(1)
+    "vst1.32      {d4[0]}, [%1]!               \n"  // store remaining 4 bytes
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride),       // %3
+    "+r"(src_ptr1)          // %4
+  : "r"(&kMult38_Div6),     // %5
+    "r"(&kShuf38_2),        // %6
+    "r"(&kMult38_Div9)      // %7
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
+  );
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    MEMACCESS(4)
+    "vld1.16    {q13}, [%4]                    \n"  // q13 = kMult38_Div6
+    MEMACCESS(5)
+    "vld1.8     {q14}, [%5]                    \n"  // q14 = kShuf38_2
+    "add        %3, %0                         \n"  // %3 = second source row
+    ".p2align   2                              \n"
+  "1:                                          \n"
+
+    // d0 = 00 40 01 41 02 42 03 43
+    // d1 = 10 50 11 51 12 52 13 53
+    // d2 = 20 60 21 61 22 62 23 63
+    // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+    "subs         %2, %2, #12                  \n"  // 12 processed per loop
+
+    // Shuffle the input data around to align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // d0 = 00 10 01 11 02 12 03 13
+    // d1 = 40 50 41 51 42 52 43 53
+    "vtrn.u8      d0, d1                       \n"
+    "vtrn.u8      d4, d5                       \n"
+
+    // d2 = 20 30 21 31 22 32 23 33
+    // d3 = 60 70 61 71 62 72 63 73
+    "vtrn.u8      d2, d3                       \n"
+    "vtrn.u8      d6, d7                       \n"
+
+    // d0 = 00+10 01+11 02+12 03+13
+    // d1 = 40+50 41+51 42+52 43+53
+    "vpaddl.u8    q0, q0                       \n"
+    "vpaddl.u8    q2, q2                       \n"
+
+    // d3 = 60+70 61+71 62+72 63+73
+    "vpaddl.u8    d3, d3                       \n"
+    "vpaddl.u8    d7, d7                       \n"
+
+    // combine source lines
+    "vadd.u16     q0, q2                       \n"
+    "vadd.u16     d4, d3, d7                   \n"
+
+    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+    "vqrshrn.u16  d4, q2, #2                   \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "vmovl.u8     q1, d2                       \n"
+    "vmovl.u8     q3, d6                       \n"
+
+    // combine source lines
+    "vadd.u16     q1, q3                       \n"
+
+    // d2 = xx 20 xx 30 xx 22 xx 32
+    // d3 = xx 21 xx 31 xx 23 xx 33
+    "vtrn.u32     d2, d3                       \n"
+
+    // d2 = xx 20 xx 21 xx 22 xx 23
+    // d3 = xx 30 xx 31 xx 32 xx 33
+    "vtrn.u16     d2, d3                       \n"
+
+    // 0+1+2, 3+4+5
+    "vadd.u16     q0, q1                       \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "vqrdmulh.s16 q0, q0, q13                  \n"
+
+    // Align for table lookup, vtbl requires registers to
+    //  be adjacent
+    "vmov.u8      d2, d4                       \n"
+
+    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
+
+    MEMACCESS(1)
+    "vst1.8       {d3}, [%1]!                  \n"  // store 8 bytes
+    MEMACCESS(1)
+    "vst1.32      {d4[0]}, [%1]!               \n"  // store remaining 4 bytes
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),       // %0
+    "+r"(dst_ptr),       // %1
+    "+r"(dst_width),     // %2
+    "+r"(src_stride)     // %3
+  : "r"(&kMult38_Div6),  // %4
+    "r"(&kShuf38_2)      // %5
+  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+  );
+}
+
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                    uint16* dst_ptr, int src_width, int src_height) {
+  const uint8* src_tmp = NULL;  // set inside the asm before first use
+  asm volatile (  // Sums src_height rows per column into 16 bit outputs.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "mov       %0, %1                          \n"  // restart at top row
+    "mov       r12, %5                         \n"  // r12 = rows remaining
+    "veor      q2, q2, q2                      \n"  // clear column sums
+    "veor      q3, q3, q3                      \n"
+  "2:                                          \n"
+    // load 16 pixels into q0, then step down one row (%3 = src_stride)
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], %3                 \n"
+    "vaddw.u8   q3, q3, d1                     \n"  // widen and accumulate
+    "vaddw.u8   q2, q2, d0                     \n"
+    "subs       r12, r12, #1                   \n"  // one row done
+    "bgt        2b                             \n"
+    MEMACCESS(2)
+    "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
+    "add        %1, %1, #16                    \n"  // next 16 columns
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(src_tmp),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_ptr),          // %2
+    "+r"(src_stride),       // %3
+    "+r"(src_width),        // %4
+    "+r"(src_height)        // %5
+  :
+  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+// Loads the 2 source bytes at src_ptr + (x >> 16) into lane n of d6/d7, then
+// steps x by dx. TODO(Yang Zhang): Investigate fewer loads for x/dx stepping.
+#define LOAD2_DATA8_LANE(n)                                    \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5                     \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};  // lane multipliers for dx
+  int* tmp = dx_offset;             // also reused as scratch by the macro
+  const uint8* src_tmp = src_ptr;   // scratch address register for the macro
+  asm volatile (
+    ".p2align   2                              \n"
+    "vdup.32    q0, %3                         \n"  // x
+    "vdup.32    q1, %4                         \n"  // dx
+    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
+    "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
+    "vmul.s32   q1, q1, q2                     \n"  // 0, dx, 2 * dx, 3 * dx
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "vadd.s32   q1, q1, q0                     \n"
+    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+    "vadd.s32   q2, q1, q3                     \n"
+    "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
+  "1:                                          \n"
+    LOAD2_DATA8_LANE(0)
+    LOAD2_DATA8_LANE(1)
+    LOAD2_DATA8_LANE(2)
+    LOAD2_DATA8_LANE(3)
+    LOAD2_DATA8_LANE(4)
+    LOAD2_DATA8_LANE(5)
+    LOAD2_DATA8_LANE(6)
+    LOAD2_DATA8_LANE(7)
+    "vmov       q10, q1                        \n"  // copy the 8 x positions
+    "vmov       q11, q2                        \n"
+    "vuzp.16    q10, q11                       \n"  // q10 = low 16 bits of x
+    "vmovl.u8   q8, d6                         \n"  // first byte of each pair
+    "vmovl.u8   q9, d7                         \n"  // second byte of each pair
+    "vsubl.s16  q11, d18, d16                  \n"  // second - first
+    "vsubl.s16  q12, d19, d17                  \n"
+    "vmovl.u16  q13, d20                       \n"  // widen the fractions
+    "vmovl.u16  q10, d21                       \n"
+    "vmul.s32   q11, q11, q13                  \n"  // difference * fraction
+    "vmul.s32   q12, q12, q10                  \n"
+    "vshrn.s32  d18, q11, #16                  \n"  // >> 16 and narrow
+    "vshrn.s32  d19, q12, #16                  \n"
+    "vadd.s16   q8, q8, q9                     \n"  // first + scaled difference
+    "vmovn.s16  d6, q8                         \n"  // narrow to 8 bit
+
+    MEMACCESS(0)
+    "vst1.8     {d6}, [%0]!                    \n"  // store pixels
+    "vadd.s32   q1, q1, q0                     \n"  // advance x by 8 * dx
+    "vadd.s32   q2, q2, q0                     \n"
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3",
+    "q8", "q9", "q10", "q11", "q12", "q13"
+  );
+}
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1. Blends two rows by source_y_fraction (weight of second row).
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+                          const uint8* src_ptr, ptrdiff_t src_stride,
+                          int dst_width, int source_y_fraction) {
+  asm volatile (
+    "cmp          %4, #0                       \n"  // fraction 0: copy row
+    "beq          100f                         \n"
+    "add          %2, %1                       \n"  // %2 = second row pointer
+    "cmp          %4, #64                      \n"  // fraction 64: 75 / 25
+    "beq          75f                          \n"
+    "cmp          %4, #128                     \n"  // fraction 128: 50 / 50
+    "beq          50f                          \n"
+    "cmp          %4, #192                     \n"  // fraction 192: 25 / 75
+    "beq          25f                          \n"
+
+    "vdup.8       d5, %4                       \n"  // d5 = second row weight
+    "rsb          %4, #256                     \n"  // 256 - fraction
+    "vdup.8       d4, %4                       \n"  // d4 = first row weight
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"  // 16 processed per loop
+    "vmull.u8     q13, d0, d4                  \n"  // row 0 * d4
+    "vmull.u8     q14, d1, d4                  \n"
+    "vmlal.u8     q13, d2, d5                  \n"  // + row 1 * d5
+    "vmlal.u8     q14, d3, d5                  \n"
+    "vrshrn.u16   d0, q13, #8                  \n"  // >> 8 with rounding
+    "vrshrn.u16   d1, q14, #8                  \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          1b                           \n"
+    "b            99f                          \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"  // average the two rows
+    "vrhadd.u8    q0, q1                       \n"  // average again with row 1
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          25b                          \n"
+    "b            99f                          \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"  // average the two rows
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          50b                          \n"
+    "b            99f                          \n"
+
+    // Blend 75 / 25. Rows are loaded swapped relative to the 25 / 75 case.
+  "75:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q1}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q0}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"  // average the two rows
+    "vrhadd.u8    q0, q1                       \n"  // average again with row 0
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          75b                          \n"
+    "b            99f                          \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          100b                         \n"
+
+  "99:                                         \n"  // all paths end here
+    MEMACCESS(0)
+    "vst1.8       {d1[7]}, [%0]                \n"  // repeat last byte once more
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(source_y_fraction) // %4
+  :
+  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+  );
+}
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  asm volatile (  // Point samples 2:1, keeping the odd 32 bit pixels.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    // load even pixels into q0, odd into q1
+    MEMACCESS(0)
+    "vld2.32    {q0, q1}, [%0]!                \n"
+    MEMACCESS(0)
+    "vld2.32    {q2, q3}, [%0]!                \n"  // next: even q2, odd q3
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    MEMACCESS(1)
+    "vst1.8     {q3}, [%1]!                    \n"  // store next odd pixels
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (  // Averages each horizontal ARGB pixel pair, per channel.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #1                     \n"
+    "vrshrn.u16 d2, q2, #1                     \n"
+    "vrshrn.u16 d3, q3, #1                     \n"
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // interleave and store 8
+    "bgt       1b                              \n"
+  : "+r"(src_argb),         // %0
+    "+r"(dst_argb),         // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3"     // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {
+  asm volatile (  // Averages each 2x2 box of ARGB pixels, per channel.
+    // change the stride to row 2 pointer
+    "add        %1, %1, %0                     \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
+    "vpadal.u8  q0, q8                         \n"  // B row 2 pairs + row 1.
+    "vpadal.u8  q1, q9                         \n"  // G row 2 pairs + row 1.
+    "vpadal.u8  q2, q10                        \n"  // R row 2 pairs + row 1.
+    "vpadal.u8  q3, q11                        \n"  // A row 2 pairs + row 1.
+    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #2                     \n"
+    "vrshrn.u16 d2, q2, #2                     \n"
+    "vrshrn.u16 d3, q3, #2                     \n"
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // interleave and store 8
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+  );
+}
+
+// Point samples every src_stepx-th ARGB pixel; writes 4 pixels per loop. src_stride is unused.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov        r12, %3, lsl #2                \n"  // r12 = src_stepx * 4 (step in bytes)
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.32    {d0[0]}, [%0], r12             \n"  // load 1 pixel, advance by r12
+    MEMACCESS(0)
+    "vld1.32    {d0[1]}, [%0], r12             \n"
+    MEMACCESS(0)
+    "vld1.32    {d1[0]}, [%0], r12             \n"
+    MEMACCESS(0)
+    "vld1.32    {d1[1]}, [%0], r12             \n"
+    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store the 4 gathered pixels
+    "bgt        1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(dst_width)    // %2
+  : "r"(src_stepx)     // %3
+  : "memory", "cc", "r12", "q0"
+  );
+}
+
+// Averages four 2x2 ARGB blocks, spaced src_stepx pixels apart, into 4 pixels per loop.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov        r12, %4, lsl #2                \n"  // r12 = src_stepx * 4 (step in bytes)
+    "add        %1, %1, %0                     \n"  // %1 = row 2 pointer
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d4}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d5}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d6}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d7}, [%1], r12                \n"
+    "vaddl.u8   q0, d0, d1                     \n"  // row 1 + row 2, widened to 16 bit
+    "vaddl.u8   q1, d2, d3                     \n"
+    "vaddl.u8   q2, d4, d5                     \n"
+    "vaddl.u8   q3, d6, d7                     \n"
+    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
+    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
+    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
+    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
+    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
+    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
+    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"  // store 4 averaged pixels
+    "bgt        1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(src_stride),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(dst_width)    // %3
+  : "r"(src_stepx)     // %4
+  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+  );
+}
+
+// Loads the 32 bit pixel at src_argb + (x >> 16) * 4 into lane n of dn, then adds dx to x.
+// TODO(Yang Zhang): Investigate using fewer load instructions for the x/dx stepping.
+#define LOAD1_DATA32_LANE(dn, n)                               \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5, lsl #2             \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "vld1.32    {"#dn"["#n"]}, [%6]            \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  int tmp = 0;  // scratch register: receives x >> 16 inside LOAD1_DATA32_LANE
+  const uint8* src_tmp = src_argb;  // scratch register: address of the pixel to load
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    LOAD1_DATA32_LANE(d0, 0)
+    LOAD1_DATA32_LANE(d0, 1)
+    LOAD1_DATA32_LANE(d1, 0)
+    LOAD1_DATA32_LANE(d1, 1)
+    LOAD1_DATA32_LANE(d2, 0)
+    LOAD1_DATA32_LANE(d2, 1)
+    LOAD1_DATA32_LANE(d3, 0)
+    LOAD1_DATA32_LANE(d3, 1)
+
+    MEMACCESS(0)
+    "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1"
+  );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// Loads the pixel at src_argb + (x >> 16) * 4 into dn1[n] and the next pixel into dn2[n]; x += dx.
+// TODO(Yang Zhang): Investigate using fewer load instructions for the x/dx stepping.
+#define LOAD2_DATA32_LANE(dn1, dn2, n)                         \
+    "lsr        %5, %3, #16                           \n"      \
+    "add        %6, %1, %5, lsl #2                    \n"      \
+    "add        %3, %3, %4                            \n"      \
+    MEMACCESS(6)                                               \
+    "vld2.32    {"#dn1"["#n"], "#dn2"["#n"]}, [%6]    \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};  // per-lane multipliers for dx
+  int* tmp = dx_offset;  // %5: table pointer first, then scratch for LOAD2_DATA32_LANE
+  const uint8* src_tmp = src_argb;  // %6: scratch, address of the pixel pair to load
+  asm volatile (
+    ".p2align   2                              \n"
+    "vdup.32    q0, %3                         \n"  // x
+    "vdup.32    q1, %4                         \n"  // dx
+    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
+    "vshl.i32   q9, q1, #2                     \n"  // 4 * dx
+    "vmul.s32   q1, q1, q2                     \n"  // 0, dx, 2 * dx, 3 * dx
+    "vmov.i8    q3, #0x7f                      \n"  // 0x7f in every byte
+    "vmov.i16   q15, #0x7f                     \n"  // 0x007f in every 16 bit lane
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "vadd.s32   q8, q1, q0                     \n"
+  "1:                                          \n"
+    // d0, d1: a (the 4 pixels at x >> 16)
+    // d2, d3: b (the pixel following each a)
+    LOAD2_DATA32_LANE(d0, d2, 0)
+    LOAD2_DATA32_LANE(d0, d2, 1)
+    LOAD2_DATA32_LANE(d1, d3, 0)
+    LOAD2_DATA32_LANE(d1, d3, 1)
+    "vshrn.i32   d22, q8, #9                   \n"  // x >> 9, narrowed to 16 bits
+    "vand.16     d22, d22, d30                 \n"  // f = (x >> 9) & 0x7f
+    "vdup.8      d24, d22[0]                   \n"  // splat f of pixel 0 to 8 bytes
+    "vdup.8      d25, d22[2]                   \n"  // splat f of pixel 1
+    "vdup.8      d26, d22[4]                   \n"  // splat f of pixel 2
+    "vdup.8      d27, d22[6]                   \n"  // splat f of pixel 3
+    "vext.8      d4, d24, d25, #4              \n"  // f0 x4, f1 x4
+    "vext.8      d5, d26, d27, #4              \n"  // f
+    "veor.8      q10, q2, q3                   \n"  // 0x7f ^ f
+    "vmull.u8    q11, d0, d20                  \n"  // a * (0x7f ^ f)
+    "vmull.u8    q12, d1, d21                  \n"
+    "vmull.u8    q13, d2, d4                   \n"  // b * f
+    "vmull.u8    q14, d3, d5                   \n"
+    "vadd.i16    q11, q11, q13                 \n"  // a * (0x7f ^ f) + b * f
+    "vadd.i16    q12, q12, q14                 \n"
+    "vshrn.i16   d0, q11, #7                   \n"  // >> 7 and narrow back to bytes
+    "vshrn.i16   d1, q12, #7                   \n"
+
+    MEMACCESS(0)
+    "vst1.32     {d0, d1}, [%0]!               \n"  // store pixels
+    "vadd.s32    q8, q8, q9                    \n"  // advance the 4 x values by 4 * dx
+    "subs        %2, %2, #4                    \n"  // 4 processed per loop
+    "bgt         1b                            \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/scale_neon64.cc b/libs/libvpx/third_party/libyuv/source/scale_neon64.cc
new file mode 100644
index 0000000000..1d55193579
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/scale_neon64.cc
@@ -0,0 +1,1042 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 32x1, throw away the even pixels, and write the 16 odd pixels. src_stride is unused.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    // load even pixels into v0, odd into v1
+    MEMACCESS(0)
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "v0", "v1"  // Clobbers: st1 writes memory, subs sets the flags
+  );
+}
+
+// Read 32x1, average each horizontal pixel pair (rounded), write 16x1. src_stride is unused.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+    "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
+    "uaddlp     v1.8h, v1.16b                  \n"
+    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
+    "rshrn2     v0.16b, v1.8h, #1              \n"
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store 16 averaged pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "v0", "v1"  // Clobbers: st1 writes memory, subs sets the flags
+  );
+}
+
+// Read 32x2, average each 2x2 block (rounded), and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+    // turn src_stride (%1) into the row 2 pointer: %1 = src_stride + src_ptr
+    "add        %1, %1, %0                     \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b,v1.16b}, [%0], #32    \n"  // load row 1 and post inc
+    MEMACCESS(1)
+    "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+    "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
+    "uaddlp     v1.8h, v1.16b                  \n"
+    "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1
+    "uadalp     v1.8h, v3.16b                  \n"
+    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
+    "rshrn2     v0.16b, v1.8h, #2              \n"
+    MEMACCESS(2)
+    "st1        {v0.16b}, [%2], #16            \n"  // store 16 averaged pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3"  // st1 writes memory, subs sets the flags
+  );
+}
+// Point samples 32 pixels to 8: keeps pixel 2 of every group of 4. src_stride is unused.
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32          \n"  // src line 0
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    MEMACCESS(1)
+    "st1     {v2.8b}, [%1], #8                 \n"  // store the third pixel of each group
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+// Read 16x4, average each 4x4 block (rounded), and write 4x1.
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride;      // row 1
+  const uint8* src_ptr2 = src_ptr + src_stride * 2;  // row 2
+  const uint8* src_ptr3 = src_ptr + src_stride * 3;  // row 3
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
+    MEMACCESS(2)
+    "ld1     {v1.16b}, [%2], #16               \n"
+    MEMACCESS(3)
+    "ld1     {v2.16b}, [%3], #16               \n"
+    MEMACCESS(4)
+    "ld1     {v3.16b}, [%4], #16               \n"
+    "subs    %w5, %w5, #4                      \n"   // 4 processed per loop
+    "uaddlp  v0.8h, v0.16b                     \n"   // row 0 add adjacent
+    "uadalp  v0.8h, v1.16b                     \n"   // accumulate rows 1..3
+    "uadalp  v0.8h, v2.16b                     \n"
+    "uadalp  v0.8h, v3.16b                     \n"
+    "addp    v0.8h, v0.8h, v0.8h               \n"   // add adjacent sums -> 4x4 totals
+    "rshrn   v0.8b, v0.8h, #4                  \n"   // divide by 16 w/rounding
+    MEMACCESS(1)
+    "st1    {v0.s}[0], [%1], #4                \n"   // store 4 pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(src_ptr1),  // %2
+    "+r"(src_ptr2),  // %3
+    "+r"(src_ptr3),  // %4
+    "+r"(dst_width)  // %5
+  :
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+// Down scale from 4 to 3 pixels. Uses the NEON multi-lane load to split every
+// group of 4 pixels across 4 registers, then stores pixels 0, 1 and 3 of each group.
+// Point samples 32 pixels to 24 pixels per loop. src_stride is unused.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    "subs      %w2, %w2, #24                           \n"  // 24 processed per loop
+    "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
+    MEMACCESS(1)
+    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"  // store 24 pixels
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+// 4 -> 3 filtered downscale of two rows blended 3:1 (line 0 : line 1); 32x2 in, 24x1 out.
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movi      v20.8b, #3                              \n"  // weight of 3 in every byte
+    "add       %3, %3, %0                              \n"  // %3 = pointer to src line 1
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    MEMACCESS(3)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
+    "subs         %w2, %w2, #24                        \n"  // 24 processed per loop
+
+    // filter src line 0 with src line 1
+    // expand chars to shorts to allow for room
+    // when adding lines together
+    "ushll     v16.8h, v4.8b, #0                       \n"
+    "ushll     v17.8h, v5.8b, #0                       \n"
+    "ushll     v18.8h, v6.8b, #0                       \n"
+    "ushll     v19.8h, v7.8b, #0                       \n"
+
+    // 3 * line_0 + line_1
+    "umlal     v16.8h, v0.8b, v20.8b                   \n"
+    "umlal     v17.8h, v1.8b, v20.8b                   \n"
+    "umlal     v18.8h, v2.8b, v20.8b                   \n"
+    "umlal     v19.8h, v3.8b, v20.8b                   \n"
+
+    // (3 * line_0 + line_1) >> 2, with rounding
+    "uqrshrn   v0.8b, v16.8h, #2                       \n"
+    "uqrshrn   v1.8b, v17.8h, #2                       \n"
+    "uqrshrn   v2.8b, v18.8h, #2                       \n"
+    "uqrshrn   v3.8b, v19.8h, #2                       \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "ushll     v16.8h, v1.8b, #0                       \n"
+    "umlal     v16.8h, v0.8b, v20.8b                   \n"
+    "uqrshrn   v0.8b, v16.8h, #2                       \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "ushll     v16.8h, v2.8b, #0                       \n"
+    "umlal     v16.8h, v3.8b, v20.8b                   \n"
+    "uqrshrn   v2.8b, v16.8h, #2                       \n"
+
+    MEMACCESS(1)
+    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"  // store a0, a1, a2
+
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
+    "v20", "memory", "cc"
+  );
+}
+// 4 -> 3 filtered downscale of two rows averaged equally; 32x2 in, 24x1 out per loop.
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movi      v20.8b, #3                              \n"  // weight of 3 in every byte
+    "add       %3, %3, %0                              \n"  // %3 = pointer to src line 1
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    MEMACCESS(3)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
+    "subs         %w2, %w2, #24                        \n"  // 24 processed per loop
+    // average src line 0 with src line 1 (rounded)
+    "urhadd    v0.8b, v0.8b, v4.8b                     \n"
+    "urhadd    v1.8b, v1.8b, v5.8b                     \n"
+    "urhadd    v2.8b, v2.8b, v6.8b                     \n"
+    "urhadd    v3.8b, v3.8b, v7.8b                     \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "ushll     v4.8h, v1.8b, #0                        \n"
+    "umlal     v4.8h, v0.8b, v20.8b                    \n"
+    "uqrshrn   v0.8b, v4.8h, #2                        \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "ushll     v4.8h, v2.8b, #0                        \n"
+    "umlal     v4.8h, v3.8b, v20.8b                    \n"
+    "uqrshrn   v2.8b, v4.8h, #2                        \n"
+
+    MEMACCESS(1)
+    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"  // store a0, a1, a2
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
+  );
+}
+
+static uvec8 kShuf38 =  // tbl indices: picks 12 of 32 source bytes in ScaleRowDown38_NEON
+  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =  // tbl indices into {v0, v1, v2} for the two 38 box filters
+  { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =  // 65536 / 12: sqrdmulh doubles the product, so this is x / 6
+  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =  // 65536 / 18: sqrdmulh doubles the product, so this is x / 9
+  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12: point samples via a table lookup with kShuf38. src_stride is unused.
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    MEMACCESS(3)
+    "ld1       {v3.16b}, [%3]                          \n"  // v3 = kShuf38 indices
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld1       {v0.16b,v1.16b}, [%0], #32             \n"  // load 32 pixels
+    "subs      %w2, %w2, #12                           \n"  // 12 processed per loop
+    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b        \n"  // pick the 12 kept pixels
+    MEMACCESS(1)
+    "st1       {v2.8b}, [%1], #8                       \n"  // store pixels 0..7
+    MEMACCESS(1)
+    "st1       {v2.s}[2], [%1], #4                     \n"  // store pixels 8..11
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  : "r"(&kShuf38)           // %3
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+// 32x3 -> 12x1: box filters three rows, dividing by 9 (3x3 boxes) or 6 (2x3 boxes).
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride * 2;  // row 2
+  ptrdiff_t tmp_src_stride = src_stride;  // becomes the row 1 pointer inside the asm
+
+  asm volatile (
+    MEMACCESS(5)
+    "ld1       {v29.8h}, [%5]                          \n"  // v29 = kMult38_Div6
+    MEMACCESS(6)
+    "ld1       {v30.16b}, [%6]                         \n"  // v30 = kShuf38_2
+    MEMACCESS(7)
+    "ld1       {v31.8h}, [%7]                          \n"  // v31 = kMult38_Div9
+    "add       %2, %2, %0                              \n"  // %2 = row 1 pointer
+  "1:                                                  \n"
+
+    // 00 40 01 41 02 42 03 43
+    // 10 50 11 51 12 52 13 53
+    // 20 60 21 61 22 62 23 63
+    // 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
+    MEMACCESS(2)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
+    MEMACCESS(3)
+    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n"
+    "subs      %w4, %w4, #12                           \n"  // 12 processed per loop
+
+    // Shuffle the input data around to align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // 00 10 01 11 02 12 03 13
+    // 40 50 41 51 42 52 43 53
+    "trn1      v20.8b, v0.8b, v1.8b                    \n"
+    "trn2      v21.8b, v0.8b, v1.8b                    \n"
+    "trn1      v22.8b, v4.8b, v5.8b                    \n"
+    "trn2      v23.8b, v4.8b, v5.8b                    \n"
+    "trn1      v24.8b, v16.8b, v17.8b                  \n"
+    "trn2      v25.8b, v16.8b, v17.8b                  \n"
+
+    // 20 30 21 31 22 32 23 33
+    // 60 70 61 71 62 72 63 73
+    "trn1      v0.8b, v2.8b, v3.8b                     \n"
+    "trn2      v1.8b, v2.8b, v3.8b                     \n"
+    "trn1      v4.8b, v6.8b, v7.8b                     \n"
+    "trn2      v5.8b, v6.8b, v7.8b                     \n"
+    "trn1      v16.8b, v18.8b, v19.8b                  \n"
+    "trn2      v17.8b, v18.8b, v19.8b                  \n"
+
+    // 00+10 01+11 02+12 03+13
+    // 40+50 41+51 42+52 43+53
+    "uaddlp    v20.4h, v20.8b                          \n"
+    "uaddlp    v21.4h, v21.8b                          \n"
+    "uaddlp    v22.4h, v22.8b                          \n"
+    "uaddlp    v23.4h, v23.8b                          \n"
+    "uaddlp    v24.4h, v24.8b                          \n"
+    "uaddlp    v25.4h, v25.8b                          \n"
+
+    // 60+70 61+71 62+72 63+73
+    "uaddlp    v1.4h, v1.8b                            \n"
+    "uaddlp    v5.4h, v5.8b                            \n"
+    "uaddlp    v17.4h, v17.8b                          \n"
+
+    // combine source lines
+    "add       v20.4h, v20.4h, v22.4h                  \n"
+    "add       v21.4h, v21.4h, v23.4h                  \n"
+    "add       v20.4h, v20.4h, v24.4h                  \n"
+    "add       v21.4h, v21.4h, v25.4h                  \n"
+    "add       v2.4h, v1.4h, v5.4h                     \n"
+    "add       v2.4h, v2.4h, v17.4h                    \n"
+
+    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+    //             + s[6 + st * 1] + s[7 + st * 1]
+    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+    "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
+    "xtn       v2.8b,  v2.8h                           \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "ushll     v16.8h, v16.8b, #0                      \n"
+    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
+
+    // combine source lines
+    "add       v0.8h, v0.8h, v16.8h                    \n"
+
+    // xx 20 xx 21 xx 22 xx 23
+    // xx 30 xx 31 xx 32 xx 33
+    "trn1      v1.8h, v0.8h, v0.8h                     \n"
+    "trn2      v4.8h, v0.8h, v0.8h                     \n"
+    "xtn       v0.4h, v1.4s                            \n"
+    "xtn       v4.4h, v4.4s                            \n"
+
+    // 0+1+2, 3+4+5
+    "add       v20.8h, v20.8h, v0.8h                   \n"
+    "add       v21.8h, v21.8h, v4.8h                   \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
+    "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"
+
+    // Align for table lookup, tbl requires registers to
+    //  be adjacent
+    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+
+    MEMACCESS(1)
+    "st1       {v3.8b}, [%1], #8                       \n"
+    MEMACCESS(1)
+    "st1       {v3.s}[2], [%1], #4                     \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(tmp_src_stride),   // %2
+    "+r"(src_ptr1),         // %3
+    "+r"(dst_width)         // %4
+  : "r"(&kMult38_Div6),     // %5
+    "r"(&kShuf38_2),        // %6
+    "r"(&kMult38_Div9)      // %7
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+    "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
+    "v30", "v31", "memory", "cc"
+  );
+}
+
+// 32x2 -> 12x1: box filters two rows, dividing by 6 (3x2 boxes) or 4 (2x2 boxes).
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  // TODO(fbarchard): use src_stride directly for clang 3.5+.
+  ptrdiff_t tmp_src_stride = src_stride;  // becomes the row 1 pointer inside the asm
+  asm volatile (
+    MEMACCESS(4)
+    "ld1       {v30.8h}, [%4]                          \n"  // v30 = kMult38_Div6
+    MEMACCESS(5)
+    "ld1       {v31.16b}, [%5]                         \n"  // v31 = kShuf38_2
+    "add       %2, %2, %0                              \n"  // %2 = row 1 pointer
+  "1:                                                  \n"
+
+    // 00 40 01 41 02 42 03 43
+    // 10 50 11 51 12 52 13 53
+    // 20 60 21 61 22 62 23 63
+    // 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
+    MEMACCESS(2)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
+    "subs      %w3, %w3, #12                           \n"  // 12 processed per loop
+
+    // Shuffle the input data around to align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // 00 10 01 11 02 12 03 13
+    // 40 50 41 51 42 52 43 53
+    "trn1      v16.8b, v0.8b, v1.8b                    \n"
+    "trn2      v17.8b, v0.8b, v1.8b                    \n"
+    "trn1      v18.8b, v4.8b, v5.8b                    \n"
+    "trn2      v19.8b, v4.8b, v5.8b                    \n"
+
+    // 20 30 21 31 22 32 23 33
+    // 60 70 61 71 62 72 63 73
+    "trn1      v0.8b, v2.8b, v3.8b                     \n"
+    "trn2      v1.8b, v2.8b, v3.8b                     \n"
+    "trn1      v4.8b, v6.8b, v7.8b                     \n"
+    "trn2      v5.8b, v6.8b, v7.8b                     \n"
+
+    // 00+10 01+11 02+12 03+13
+    // 40+50 41+51 42+52 43+53
+    "uaddlp    v16.4h, v16.8b                          \n"
+    "uaddlp    v17.4h, v17.8b                          \n"
+    "uaddlp    v18.4h, v18.8b                          \n"
+    "uaddlp    v19.4h, v19.8b                          \n"
+
+    // 60+70 61+71 62+72 63+73
+    "uaddlp    v1.4h, v1.8b                            \n"
+    "uaddlp    v5.4h, v5.8b                            \n"
+
+    // combine source lines
+    "add       v16.4h, v16.4h, v18.4h                  \n"
+    "add       v17.4h, v17.4h, v19.4h                  \n"
+    "add       v2.4h, v1.4h, v5.4h                     \n"
+
+    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+    "uqrshrn   v2.8b, v2.8h, #2                        \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+
+    // combine source lines
+    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
+
+    // xx 20 xx 21 xx 22 xx 23
+    // xx 30 xx 31 xx 32 xx 33
+    "trn1      v1.8h, v0.8h, v0.8h                     \n"
+    "trn2      v4.8h, v0.8h, v0.8h                     \n"
+    "xtn       v0.4h, v1.4s                            \n"
+    "xtn       v4.4h, v4.4s                            \n"
+
+    // 0+1+2, 3+4+5
+    "add       v16.8h, v16.8h, v0.8h                   \n"
+    "add       v17.8h, v17.8h, v4.8h                   \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
+    "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"
+
+    // Align for table lookup, tbl requires registers to
+    //  be adjacent
+
+    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+
+    MEMACCESS(1)
+    "st1       {v3.8b}, [%1], #8                       \n"
+    MEMACCESS(1)
+    "st1       {v3.s}[2], [%1], #4                     \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),         // %0
+    "+r"(dst_ptr),         // %1
+    "+r"(tmp_src_stride),  // %2
+    "+r"(dst_width)        // %3
+  : "r"(&kMult38_Div6),    // %4
+    "r"(&kShuf38_2)        // %5
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+    "v18", "v19", "v30", "v31", "memory", "cc"
+  );
+}
+// Sums src_height rows column-wise into 16 bit totals, 16 columns per outer loop pass.
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                    uint16* dst_ptr, int src_width, int src_height) {
+  const uint8* src_tmp = NULL;  // row cursor; set from src_ptr at the top of each pass
+  asm volatile (
+  "1:                                          \n"
+    "mov       %0, %1                          \n"  // src_tmp = top of this 16 column strip
+    "mov       w12, %w5                        \n"  // w12 = row counter (src_height)
+    "eor       v2.16b, v2.16b, v2.16b          \n"  // clear sums for columns 0..7
+    "eor       v3.16b, v3.16b, v3.16b          \n"  // clear sums for columns 8..15
+  "2:                                          \n"
+    // load 16 pixels into v0, then step to the next row (post-increment by src_stride)
+    MEMACCESS(0)
+    "ld1       {v0.16b}, [%0], %3              \n"
+    "uaddw2    v3.8h, v3.8h, v0.16b            \n"  // widen and add the upper 8 bytes
+    "uaddw     v2.8h, v2.8h, v0.8b             \n"  // widen and add the lower 8 bytes
+    "subs      w12, w12, #1                    \n"  // one row done
+    "b.gt      2b                              \n"
+    MEMACCESS(2)
+    "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
+    "add      %1, %1, #16                      \n"  // move to the next 16 columns
+    "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
+    "b.gt     1b                               \n"
+  : "+r"(src_tmp),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_ptr),          // %2
+    "+r"(src_stride),       // %3
+    "+r"(src_width),        // %4
+    "+r"(src_height)        // %5
+  :
+  : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+// Loads the two adjacent bytes at %1 + (%3 >> 16) into lane n of v4/v5, then adds %4 to %3.
+// %5 and %6 are scratch. TODO(Yang Zhang): Investigate less load instructions for the x/dx stepping
+#define LOAD2_DATA8_LANE(n)                                    \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5                    \n"              \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "ld2        {v4.b, v5.b}["#n"], [%6]      \n"
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx) {  // Horizontal linear filter: 8 output bytes per loop, source position x stepped by dx.
+  int dx_offset[4] = {0, 1, 2, 3};  // Lane multipliers used to build x + n * dx.
+  int* tmp = dx_offset;  // %5: read once by the ld1 below, then reused as scratch by LOAD2_DATA8_LANE.
+  const uint8* src_tmp = src_ptr;  // %6: overwritten with each sample address by LOAD2_DATA8_LANE.
+  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
+  int64 x64 = (int64) x;
+  int64 dx64 = (int64) dx;
+  asm volatile (
+    "dup        v0.4s, %w3                     \n"  // x
+    "dup        v1.4s, %w4                     \n"  // dx
+    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
+    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
+    "mul        v1.4s, v1.4s, v2.4s            \n"  // 0, dx, 2 * dx, 3 * dx
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "add        v1.4s, v1.4s, v0.4s            \n"
+    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+    "add        v2.4s, v1.4s, v3.4s            \n"
+    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
+  "1:                                          \n"
+    LOAD2_DATA8_LANE(0)
+    LOAD2_DATA8_LANE(1)
+    LOAD2_DATA8_LANE(2)
+    LOAD2_DATA8_LANE(3)
+    LOAD2_DATA8_LANE(4)
+    LOAD2_DATA8_LANE(5)
+    LOAD2_DATA8_LANE(6)
+    LOAD2_DATA8_LANE(7)
+    "mov       v6.16b, v1.16b                  \n"  // copy the 8 x values so v1/v2 survive
+    "mov       v7.16b, v2.16b                  \n"
+    "uzp1      v6.8h, v6.8h, v7.8h             \n"  // keep the low 16 bits of each x (the fraction)
+    "ushll     v4.8h, v4.8b, #0                \n"  // widen first sample (a) to 16 bits
+    "ushll     v5.8h, v5.8b, #0                \n"  // widen second sample (b) to 16 bits
+    "ssubl     v16.4s, v5.4h, v4.4h            \n"  // b - a, lanes 0..3
+    "ssubl2    v17.4s, v5.8h, v4.8h            \n"  // b - a, lanes 4..7
+    "ushll     v7.4s, v6.4h, #0                \n"  // fractions 0..3 as 32 bit
+    "ushll2    v6.4s, v6.8h, #0                \n"  // fractions 4..7 as 32 bit
+    "mul       v16.4s, v16.4s, v7.4s           \n"  // (b - a) * fraction
+    "mul       v17.4s, v17.4s, v6.4s           \n"
+    "shrn      v6.4h, v16.4s, #16              \n"  // >> 16 and narrow
+    "shrn2     v6.8h, v17.4s, #16              \n"
+    "add       v4.8h, v4.8h, v6.8h             \n"  // a + ((b - a) * fraction >> 16)
+    "xtn       v4.8b, v4.8h                    \n"  // narrow back to bytes
+
+    MEMACCESS(0)
+    "st1       {v4.8b}, [%0], #8               \n"  // store pixels
+    "add       v1.4s, v1.4s, v0.4s             \n"  // advance all 8 x values by 8 * dx
+    "add       v2.4s, v2.4s, v0.4s             \n"
+    "subs      %w2, %w2, #8                    \n"  // 8 processed per loop
+    "b.gt      1b                              \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_width64),      // %2
+    "+r"(x64),              // %3
+    "+r"(dx64),             // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3",
+    "v4", "v5", "v6", "v7", "v16", "v17"
+  );
+}
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1: blends the row at src_ptr with the row at src_ptr + src_stride, weighted by source_y_fraction / 256.
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+                          const uint8* src_ptr, ptrdiff_t src_stride,
+                          int dst_width, int source_y_fraction) {
+  int y_fraction = 256 - source_y_fraction;  // Weight of the first row; source_y_fraction weights the second.
+  asm volatile (
+    "cmp          %w4, #0                      \n"
+    "b.eq         100f                         \n"  // fraction 0: plain copy of the first row
+    "add          %2, %2, %1                   \n"  // %2 becomes the second row pointer
+    "cmp          %w4, #64                     \n"
+    "b.eq         75f                          \n"
+    "cmp          %w4, #128                    \n"
+    "b.eq         50f                          \n"
+    "cmp          %w4, #192                    \n"
+    "b.eq         25f                          \n"
+
+    "dup          v5.16b, %w4                  \n"  // all 16 lanes: umlal2 reads the upper half, which an .8b dup would zero
+    "dup          v4.16b, %w5                  \n"  // all 16 lanes: umull2 reads the upper half, which an .8b dup would zero
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v1.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "umull        v6.8h, v0.8b, v4.8b          \n"  // row0 * y_fraction, pixels 0..7
+    "umull2       v7.8h, v0.16b, v4.16b        \n"  // row0 * y_fraction, pixels 8..15
+    "umlal        v6.8h, v1.8b, v5.8b          \n"  // += row1 * source_y_fraction, pixels 0..7
+    "umlal2       v7.8h, v1.16b, v5.16b        \n"  // += row1 * source_y_fraction, pixels 8..15
+    "rshrn        v0.8b, v6.8h, #8             \n"  // rounding >> 8 and narrow
+    "rshrn2       v0.16b, v7.8h, #8            \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         1b                           \n"
+    "b            99f                          \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v1.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"  // avg(row0, row1)
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"  // avg again with row1 -> 25% row0, 75% row1
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         25b                          \n"
+    "b            99f                          \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v1.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         50b                          \n"
+    "b            99f                          \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    MEMACCESS(1)
+    "ld1          {v1.16b}, [%1], #16          \n"  // note: row0 goes to v1 here, row1 to v0
+    MEMACCESS(2)
+    "ld1          {v0.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"  // avg(row1, row0)
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"  // avg again with row0 -> 75% row0, 25% row1
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         75b                          \n"
+    "b            99f                          \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         100b                         \n"
+
+  "99:                                         \n"
+    MEMACCESS(0)
+    "st1          {v0.b}[15], [%0]             \n"  // repeat the last output byte one past the end of the row
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(source_y_fraction),// %4
+    "+r"(y_fraction)        // %5
+  :
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
+  );
+}
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {  // Point-samples 32-bit pixels 2:1, keeping the odd ones; src_stride is not used.
+  asm volatile (
+  "1:                                          \n"
+    // load even pixels into v0, odd into v1
+    MEMACCESS (0)
+    "ld2        {v0.4s, v1.4s}, [%0], #32      \n"
+    MEMACCESS (0)
+    "ld2        {v2.4s, v3.4s}, [%0], #32      \n"  // next 8 pixels: even into v2, odd into v3
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    MEMACCESS (1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
+    MEMACCESS (1)
+    "st1        {v3.16b}, [%1], #16            \n"
+    "b.gt       1b                             \n"
+  : "+r" (src_ptr),          // %0
+    "+r" (dst),              // %1
+    "+r" (dst_width)         // %2
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {  // Averages each horizontal pair of ARGB pixels; src_stride is not used.
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS (0)
+    // load 16 ARGB pixels (64 bytes), de-interleaved by channel.
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
+    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
+    "rshrn      v1.8b, v1.8h, #1               \n"
+    "rshrn      v2.8b, v2.8h, #1               \n"
+    "rshrn      v3.8b, v3.8h, #1               \n"
+    MEMACCESS (1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32     \n"  // re-interleave and store 8 pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),         // %0
+    "+r"(dst_argb),         // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3"    // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {  // Averages each 2x2 block of ARGB pixels from two rows into one output pixel.
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %1, %0                     \n"
+  "1:                                          \n"
+    MEMACCESS (0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"  // load 16 ARGB pixels from row 1.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
+    MEMACCESS (1)
+    "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 16 ARGB pixels from row 2.
+    "uadalp     v0.8h, v16.16b                 \n"  // B: add row 2 pair sums into the 8 shorts.
+    "uadalp     v1.8h, v17.16b                 \n"  // G: add row 2 pair sums into the 8 shorts.
+    "uadalp     v2.8h, v18.16b                 \n"  // R: add row 2 pair sums into the 8 shorts.
+    "uadalp     v3.8h, v19.16b                 \n"  // A: add row 2 pair sums into the 8 shorts.
+    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
+    "rshrn      v1.8b, v1.8h, #2               \n"
+    "rshrn      v2.8b, v2.8h, #2               \n"
+    "rshrn      v3.8b, v3.8h, #2               \n"
+    MEMACCESS (2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
+    "b.gt       1b                             \n"
+  : "+r" (src_ptr),          // %0
+    "+r" (src_stride),       // %1
+    "+r" (dst),              // %2
+    "+r" (dst_width)         // %3
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
+  );
+}
+
+// Reads 4 pixels at a time, one every src_stepx pixels (src_stepx * 4 bytes); src_stride is not used.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[0], [%0], %3            \n"  // one 32-bit pixel per lane, then skip ahead by %3 bytes
+    MEMACCESS(0)
+    "ld1        {v0.s}[1], [%0], %3            \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[2], [%0], %3            \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[3], [%0], %3            \n"
+    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store the 4 gathered pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(dst_width)    // %2
+  : "r"((int64)(src_stepx * 4)) // %3
+  : "memory", "cc", "v0"
+  );
+}
+
+// Produces 4 pixels per loop, each the rounded average of a 2x2 block; blocks are src_stepx pixels apart.
+// Alignment requirement: src_argb 4 byte aligned.
+// TODO(Yang Zhang): Might be worth another optimization pass in future.
+// It could be upgraded to 8 pixels at a time to start with.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "add        %1, %1, %0                     \n"  // %1 (src_stride) becomes the second row pointer
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
+    MEMACCESS(1)
+    "ld1        {v1.8b}, [%1], %4              \n"
+    MEMACCESS(0)
+    "ld1        {v2.8b}, [%0], %4              \n"
+    MEMACCESS(1)
+    "ld1        {v3.8b}, [%1], %4              \n"
+    MEMACCESS(0)
+    "ld1        {v4.8b}, [%0], %4              \n"
+    MEMACCESS(1)
+    "ld1        {v5.8b}, [%1], %4              \n"
+    MEMACCESS(0)
+    "ld1        {v6.8b}, [%0], %4              \n"
+    MEMACCESS(1)
+    "ld1        {v7.8b}, [%1], %4              \n"
+    "uaddl      v0.8h, v0.8b, v1.8b            \n"  // vertical sums, widened to 16 bits
+    "uaddl      v2.8h, v2.8b, v3.8b            \n"
+    "uaddl      v4.8h, v4.8b, v5.8b            \n"
+    "uaddl      v6.8h, v6.8b, v7.8b            \n"
+    "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
+    "mov        v0.d[1], v2.d[0]               \n"
+    "mov        v2.d[0], v16.d[1]              \n"
+    "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
+    "mov        v4.d[1], v6.d[0]               \n"
+    "mov        v6.d[0], v16.d[1]              \n"
+    "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
+    "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
+    "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
+    "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
+    "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
+    MEMACCESS(2)
+    "st1     {v0.16b}, [%2], #16               \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(src_stride),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(dst_width)    // %3
+  : "r"((int64)(src_stepx * 4)) // %4
+  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+
+// Loads the 32-bit element at %1 + ((%3 >> 16) << 2) into lane n of vn, then adds %4 to %3.
+// %5 and %6 are scratch. TODO(Yang Zhang): Investigate less load instructions for the x/dx stepping
+#define LOAD1_DATA32_LANE(vn, n)                               \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5, lsl #2             \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "ld1        {"#vn".s}["#n"], [%6]          \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {  // Point-samples 32-bit pixels at index x >> 16, stepping x by dx; 8 pixels per loop.
+  const uint8* src_tmp = src_argb;  // %6: overwritten with each pixel address by LOAD1_DATA32_LANE.
+  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
+  int64 x64 = (int64) x;
+  int64 dx64 = (int64) dx;
+  int64 tmp64 = 0;  // %5: scratch for the pixel index x >> 16.
+  asm volatile (
+  "1:                                          \n"
+    LOAD1_DATA32_LANE(v0, 0)
+    LOAD1_DATA32_LANE(v0, 1)
+    LOAD1_DATA32_LANE(v0, 2)
+    LOAD1_DATA32_LANE(v0, 3)
+    LOAD1_DATA32_LANE(v1, 0)
+    LOAD1_DATA32_LANE(v1, 1)
+    LOAD1_DATA32_LANE(v1, 2)
+    LOAD1_DATA32_LANE(v1, 3)
+
+    MEMACCESS(0)
+    "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    "b.gt        1b                            \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width64),      // %2
+    "+r"(x64),              // %3
+    "+r"(dx64),             // %4
+    "+r"(tmp64),            // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "v0", "v1"
+  );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// Loads the two adjacent 32-bit elements at %1 + ((%3 >> 16) << 2) into lane n of vn1/vn2, then adds %4 to %3.
+// %5 and %6 are scratch. TODO(Yang Zhang): Investigate less load instructions for the x/dx stepping
+#define LOAD2_DATA32_LANE(vn1, vn2, n)                         \
+    "lsr        %5, %3, #16                           \n"      \
+    "add        %6, %1, %5, lsl #2                    \n"      \
+    "add        %3, %3, %4                            \n"      \
+    MEMACCESS(6)                                               \
+    "ld2        {"#vn1".s, "#vn2".s}["#n"], [%6]      \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx) {  // Horizontal linear filter on ARGB: blends adjacent pixels a and b with a 7-bit fraction of x.
+  int dx_offset[4] = {0, 1, 2, 3};  // Lane multipliers used to build x + n * dx.
+  int* tmp = dx_offset;  // %5: read once by the ld1 below, then reused as scratch by LOAD2_DATA32_LANE.
+  const uint8* src_tmp = src_argb;  // %6: overwritten with each pixel address by LOAD2_DATA32_LANE.
+  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
+  int64 x64 = (int64) x;
+  int64 dx64 = (int64) dx;
+  asm volatile (
+    "dup        v0.4s, %w3                     \n"  // x
+    "dup        v1.4s, %w4                     \n"  // dx
+    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
+    "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
+    "mul        v1.4s, v1.4s, v2.4s            \n"  // 0, dx, 2 * dx, 3 * dx
+    "movi       v3.16b, #0x7f                  \n"  // 0x7F
+    "movi       v4.8h, #0x7f                   \n"  // 0x7F
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "add        v5.4s, v1.4s, v0.4s            \n"
+  "1:                                          \n"
+    // v0: a (pixel at index x >> 16)
+    // v1: b (the pixel after a)
+    LOAD2_DATA32_LANE(v0, v1, 0)
+    LOAD2_DATA32_LANE(v0, v1, 1)
+    LOAD2_DATA32_LANE(v0, v1, 2)
+    LOAD2_DATA32_LANE(v0, v1, 3)
+    "shrn       v2.4h, v5.4s, #9               \n"  // x >> 9, narrowed to 16 bits
+    "and        v2.8b, v2.8b, v4.8b            \n"  // keep 7 fraction bits in each even byte
+    "dup        v16.8b, v2.b[0]                \n"  // broadcast fraction of pixel 0
+    "dup        v17.8b, v2.b[2]                \n"  // broadcast fraction of pixel 1
+    "dup        v18.8b, v2.b[4]                \n"  // broadcast fraction of pixel 2
+    "dup        v19.8b, v2.b[6]                \n"  // broadcast fraction of pixel 3
+    "ext        v2.8b, v16.8b, v17.8b, #4      \n"  // 4 bytes of f0, 4 bytes of f1
+    "ext        v17.8b, v18.8b, v19.8b, #4     \n"  // 4 bytes of f2, 4 bytes of f3
+    "ins        v2.d[1], v17.d[0]              \n"  // f
+    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
+    "umull      v16.8h, v0.8b, v7.8b           \n"  // a * (0x7f ^ f)
+    "umull2     v17.8h, v0.16b, v7.16b         \n"
+    "umull      v18.8h, v1.8b, v2.8b           \n"  // b * f
+    "umull2     v19.8h, v1.16b, v2.16b         \n"
+    "add        v16.8h, v16.8h, v18.8h         \n"
+    "add        v17.8h, v17.8h, v19.8h         \n"
+    "shrn       v0.8b, v16.8h, #7              \n"  // >> 7 and narrow back to bytes
+    "shrn2      v0.16b, v17.8h, #7             \n"
+
+    MEMACCESS(0)
+    "st1     {v0.4s}, [%0], #16                \n"  // store pixels
+    "add     v5.4s, v5.4s, v6.4s               \n"  // advance the 4 x values by 4 * dx
+    "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
+    "b.gt    1b                                \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width64),      // %2
+    "+r"(x64),              // %3
+    "+r"(dx64),             // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+    "v6", "v7", "v16", "v17", "v18", "v19"
+  );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/scale_win.cc b/libs/libvpx/third_party/libyuv/source/scale_win.cc
new file mode 100644
index 0000000000..c3896ebad2
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/scale_win.cc
@@ -0,0 +1,1354 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    defined(_MSC_VER) && !defined(__clang__)
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf2 =
+  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
+static uvec8 kShuf11 =
+  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf21 =
+  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Eight 16-bit lanes of 2; NOTE(review): presumably the rounding term added before a >> 2 in the 3/4 filters - confirm at the use site.
+static vec16 kRound34 =
+  { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =  // Picks source bytes 0,3,6,8,11,14 into output bytes 0..5.
+  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =  // Picks source bytes 0,3,6,8,11,14 into output bytes 6..11.
+  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// Reads 32 pixels, throws half away (keeps the odd-indexed bytes) and writes 16 pixels.
+__declspec(naked)
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8               // isolate odd pixels.
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1            // pack the 16 kept words back to bytes
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16               // 16 output pixels per loop
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x1 rectangle to 16x1 by averaging each horizontal pair of pixels with rounding.
+__declspec(naked)
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8               // odd pixels as words
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm5            // even pixels as words
+    pand       xmm3, xmm5
+    pavgw      xmm0, xmm2            // rounded average of each odd/even pair
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16               // 16 output pixels per loop
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 16x1: averages the two rows first, then each horizontal pair.
+__declspec(naked)
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // esi is saved here and restored before ret; argument offsets shift by 4
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]     // second row
+    movdqu     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm5
+    pand       xmm3, xmm5
+    pavgw      xmm0, xmm2
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16               // 16 output pixels per loop
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+// Reads 64 pixels, throws half away (keeps the odd-indexed bytes) and writes 32 pixels.
+__declspec(naked)
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    vmovdqu     ymm0, [eax]
+    vmovdqu     ymm1, [eax + 32]
+    lea         eax,  [eax + 64]
+    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
+    vpsrlw      ymm1, ymm1, 8
+    vpackuswb   ymm0, ymm0, ymm1     // packs within each 128-bit lane, so the qwords come out interleaved
+    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
+    vmovdqu     [edx], ymm0
+    lea         edx, [edx + 32]
+    sub         ecx, 32              // 32 output pixels per loop
+    jg          wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x1 rectangle to 32x1 by averaging each horizontal pair of pixels with rounding.
+__declspec(naked)
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov         eax, [esp + 4]        // src_ptr
+                                      // src_stride ignored
+    mov         edx, [esp + 12]       // dst_ptr
+    mov         ecx, [esp + 16]       // dst_width
+
+    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpsrlw      ymm4, ymm4, 15        // each word = 1
+    vpackuswb   ymm4, ymm4, ymm4      // each byte = 1
+    vpxor       ymm5, ymm5, ymm5      // constant 0
+
+  wloop:
+    vmovdqu     ymm0, [eax]
+    vmovdqu     ymm1, [eax + 32]
+    lea         eax,  [eax + 64]
+
+    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw  ymm1, ymm1, ymm4
+    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm1, ymm1, ymm5
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+
+    vmovdqu     [edx], ymm0
+    lea         edx, [edx + 32]
+    sub         ecx, 32               // 32 output pixels per loop
+    jg          wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x2 rectangle to 32x1: averages the two rows first, then each horizontal pair.
+__declspec(naked)
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push        esi                   // esi is saved here and restored before ret; argument offsets shift by 4
+    mov         eax, [esp + 4 + 4]    // src_ptr
+    mov         esi, [esp + 4 + 8]    // src_stride
+    mov         edx, [esp + 4 + 12]   // dst_ptr
+    mov         ecx, [esp + 4 + 16]   // dst_width
+
+    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpsrlw      ymm4, ymm4, 15        // each word = 1
+    vpackuswb   ymm4, ymm4, ymm4      // each byte = 1
+    vpxor       ymm5, ymm5, ymm5      // constant 0
+
+  wloop:
+    vmovdqu     ymm0, [eax]           // average rows
+    vmovdqu     ymm1, [eax + 32]
+    vpavgb      ymm0, ymm0, [eax + esi]
+    vpavgb      ymm1, ymm1, [eax + esi + 32]
+    lea         eax,  [eax + 64]
+
+    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw  ymm1, ymm1, ymm4
+    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm1, ymm1, ymm5
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+
+    vmovdqu     [edx], ymm0
+    lea         edx, [edx + 32]
+    sub         ecx, 32               // 32 output pixels per loop
+    jg          wloop
+
+    pop         esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEROWDOWN2_AVX2
+
+// Point samples 32 pixels to 8 pixels, keeping byte 2 of every group of 4.
+__declspec(naked)
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
+    psrld      xmm5, 24
+    pslld      xmm5, 16
+
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm5            // keep byte 2 of each dword
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1            // kept byte is now the high byte of each word
+    psrlw      xmm0, 8               // move it to the low byte
+    packuswb   xmm0, xmm0            // 8 result bytes in the low qword
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    sub        ecx, 8                // 8 output pixels per loop
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x4 rectangle to 8x1 using cascaded rounded averages (rows pairwise, then columns twice).
+__declspec(naked)
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // esi and edi are saved here and restored before ret; argument offsets shift by 8
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_ptr
+    mov        esi, [esp + 8 + 8]    // src_stride
+    mov        edx, [esp + 8 + 12]   // dst_ptr
+    mov        ecx, [esp + 8 + 16]   // dst_width
+    lea        edi, [esi + esi * 2]  // src_stride * 3
+    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
+    psrlw      xmm7, 8
+
+  wloop:
+    movdqu     xmm0, [eax]           // average rows
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]
+    movdqu     xmm3, [eax + esi + 16]
+    pavgb      xmm0, xmm2            // rows 0 and 1
+    pavgb      xmm1, xmm3
+    movdqu     xmm2, [eax + esi * 2]
+    movdqu     xmm3, [eax + esi * 2 + 16]
+    movdqu     xmm4, [eax + edi]
+    movdqu     xmm5, [eax + edi + 16]
+    lea        eax, [eax + 32]
+    pavgb      xmm2, xmm4            // rows 2 and 3
+    pavgb      xmm3, xmm5
+    pavgb      xmm0, xmm2            // combine the two row averages
+    pavgb      xmm1, xmm3
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm7
+    pand       xmm3, xmm7
+    pavgw      xmm0, xmm2
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
+    psrlw      xmm0, 8
+    pand       xmm2, xmm7
+    pavgw      xmm0, xmm2
+    packuswb   xmm0, xmm0
+
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    sub        ecx, 8                // 8 output pixels per loop
+    jg         wloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+// Point samples 64 pixels to 16 pixels (keeps byte 2 of every 4 source bytes).
+__declspec(naked)
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov         eax, [esp + 4]        // src_ptr
+                                      // src_stride ignored
+    mov         edx, [esp + 12]       // dst_ptr
+    mov         ecx, [esp + 16]       // dst_width
+    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
+    vpsrld      ymm5, ymm5, 24        // each dword = 0x000000ff
+    vpslld      ymm5, ymm5, 16        // each dword = 0x00ff0000
+
+  wloop:
+    vmovdqu     ymm0, [eax]           // load 64 source pixels (unaligned)
+    vmovdqu     ymm1, [eax + 32]
+    lea         eax,  [eax + 64]      // src_ptr += 64
+    vpand       ymm0, ymm0, ymm5      // keep byte 2 of each 4 byte group
+    vpand       ymm1, ymm1, ymm5
+    vpackuswb   ymm0, ymm0, ymm1      // words to bytes, per 128 bit lane
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpsrlw      ymm0, ymm0, 8         // kept pixel to low byte of each word
+    vpackuswb   ymm0, ymm0, ymm0
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vmovdqu     [edx], xmm0           // store 16 pixels (low 128 bits)
+    lea         edx, [edx + 16]       // dst_ptr += 16
+    sub         ecx, 16               // dst_width -= 16
+    jg          wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x4 rectangle to 16x1.
+__declspec(naked)
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push        esi                   // esi/edi are restored before ret
+    push        edi
+    mov         eax, [esp + 8 + 4]    // src_ptr
+    mov         esi, [esp + 8 + 8]    // src_stride
+    mov         edx, [esp + 8 + 12]   // dst_ptr
+    mov         ecx, [esp + 8 + 16]   // dst_width
+    lea         edi, [esi + esi * 2]  // src_stride * 3
+    vpcmpeqb    ymm7, ymm7, ymm7      // generate mask 0x00ff00ff
+    vpsrlw      ymm7, ymm7, 8
+
+  wloop:
+    vmovdqu     ymm0, [eax]           // average rows
+    vmovdqu     ymm1, [eax + 32]
+    vpavgb      ymm0, ymm0, [eax + esi]  // avg(row 0, row 1)
+    vpavgb      ymm1, ymm1, [eax + esi + 32]
+    vmovdqu     ymm2, [eax + esi * 2]  // row 2
+    vmovdqu     ymm3, [eax + esi * 2 + 32]
+    vpavgb      ymm2, ymm2, [eax + edi]  // avg(row 2, row 3)
+    vpavgb      ymm3, ymm3, [eax + edi + 32]
+    lea         eax, [eax + 64]       // src_ptr += 64
+    vpavgb      ymm0, ymm0, ymm2      // average of the two row averages
+    vpavgb      ymm1, ymm1, ymm3
+
+    vpand       ymm2, ymm0, ymm7      // average columns (64 to 32 pixels)
+    vpand       ymm3, ymm1, ymm7
+    vpsrlw      ymm0, ymm0, 8         // odd bytes as words
+    vpsrlw      ymm1, ymm1, 8
+    vpavgw      ymm0, ymm0, ymm2      // average each odd/even pair
+    vpavgw      ymm1, ymm1, ymm3
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+
+    vpand       ymm2, ymm0, ymm7      // average columns (32 to 16 pixels)
+    vpsrlw      ymm0, ymm0, 8
+    vpavgw      ymm0, ymm0, ymm2
+    vpackuswb   ymm0, ymm0, ymm0
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+
+    vmovdqu     [edx], xmm0           // store 16 pixels (low 128 bits)
+    lea         edx, [edx + 16]       // dst_ptr += 16
+    sub         ecx, 16               // dst_width -= 16
+    jg          wloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEROWDOWN4_AVX2
+
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+__declspec(naked)
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    movdqa     xmm3, kShuf0          // shuffle applied to source bytes 0..15
+    movdqa     xmm4, kShuf1          // shuffle applied to source bytes 8..23
+    movdqa     xmm5, kShuf2          // shuffle applied to source bytes 16..31
+
+  wloop:
+    movdqu     xmm0, [eax]           // source bytes 0..15
+    movdqu     xmm1, [eax + 16]      // source bytes 16..31
+    lea        eax,  [eax + 32]      // src_ptr += 32
+    movdqa     xmm2, xmm1            // keep a copy of bytes 16..31
+    palignr    xmm1, xmm0, 8         // xmm1 = source bytes 8..23
+    pshufb     xmm0, xmm3
+    pshufb     xmm1, xmm4
+    pshufb     xmm2, xmm5
+    movq       qword ptr [edx], xmm0  // store 3 x 8 pixels
+    movq       qword ptr [edx + 8], xmm1
+    movq       qword ptr [edx + 16], xmm2
+    lea        edx, [edx + 24]       // dst_ptr += 24
+    sub        ecx, 24               // dst_width -= 24
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 24x1 (the two rows are averaged equally).
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+// Register usage:
+// xmm0 src_row 0
+// xmm1 src_row 1
+// xmm2 shuf 0
+// xmm3 shuf 1
+// xmm4 shuf 2
+// xmm5 madd 0
+// xmm6 madd 1
+// xmm7 kRound34
+
+// Note that movdqa+palign may be better than movdqu.
+__declspec(naked)
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // restored before ret
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, kShuf01
+    movdqa     xmm3, kShuf11
+    movdqa     xmm4, kShuf21
+    movdqa     xmm5, kMadd01
+    movdqa     xmm6, kMadd11
+    movdqa     xmm7, kRound34
+
+  wloop:
+    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm1, [eax + esi]
+    pavgb      xmm0, xmm1            // average the two rows
+    pshufb     xmm0, xmm2            // arrange bytes into pairs
+    pmaddubsw  xmm0, xmm5            // weighted sum of each byte pair
+    paddsw     xmm0, xmm7            // add kRound34
+    psrlw      xmm0, 2               // divide by 4
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm1, [eax + esi + 8]
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm3
+    pmaddubsw  xmm0, xmm6
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 8], xmm0
+    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm1, [eax + esi + 16]
+    lea        eax, [eax + 32]       // src_ptr += 32
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm4
+    movdqa     xmm1, kMadd21         // reloaded each pass; not kept in a reg
+    pmaddubsw  xmm0, xmm1
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 16], xmm0
+    lea        edx, [edx + 24]       // dst_ptr += 24
+    sub        ecx, 24               // dst_width -= 24
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+// Note that movdqa+palign may be better than movdqu.
+__declspec(naked)
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // restored before ret
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, kShuf01
+    movdqa     xmm3, kShuf11
+    movdqa     xmm4, kShuf21
+    movdqa     xmm5, kMadd01
+    movdqa     xmm6, kMadd11
+    movdqa     xmm7, kRound34
+
+  wloop:
+    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm1, [eax + esi]
+    pavgb      xmm1, xmm0            // avg(row 0, row 1)
+    pavgb      xmm0, xmm1            // avg(row 0, avg): row 0 weighted 3:1
+    pshufb     xmm0, xmm2            // arrange bytes into pairs
+    pmaddubsw  xmm0, xmm5            // weighted sum of each byte pair
+    paddsw     xmm0, xmm7            // add kRound34
+    psrlw      xmm0, 2               // divide by 4
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm1, [eax + esi + 8]
+    pavgb      xmm1, xmm0
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm3
+    pmaddubsw  xmm0, xmm6
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 8], xmm0
+    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm1, [eax + esi + 16]
+    lea        eax, [eax + 32]       // src_ptr += 32
+    pavgb      xmm1, xmm0
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm4
+    movdqa     xmm1, kMadd21         // reloaded each pass; not kept in a reg
+    pmaddubsw  xmm0, xmm1
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 16], xmm0
+    lea        edx, [edx+24]         // dst_ptr += 24
+    sub        ecx, 24               // dst_width -= 24
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+// 3/8 point sampler
+
+// Scale 32 pixels to 12
+__declspec(naked)
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    movdqa     xmm4, kShuf38a
+    movdqa     xmm5, kShuf38b
+
+  xloop:
+    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
+    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
+    lea        eax, [eax + 32]       // src_ptr += 32
+    pshufb     xmm0, xmm4
+    pshufb     xmm1, xmm5
+    paddusb    xmm0, xmm1            // merge the two shuffled halves
+
+    movq       qword ptr [edx], xmm0  // write 12 pixels
+    movhlps    xmm1, xmm0            // high 8 bytes of xmm0 -> low of xmm1
+    movd       [edx + 8], xmm1       // last 4 of the 12 pixels
+    lea        edx, [edx + 12]       // dst_ptr += 12
+    sub        ecx, 12               // dst_width -= 12
+    jg         xloop
+
+    ret
+  }
+}
+
+// Scale 16x3 pixels to 6x1 with interpolation
+__declspec(naked)
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // restored before ret
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, kShufAc
+    movdqa     xmm3, kShufAc3
+    movdqa     xmm4, kScaleAc33
+    pxor       xmm5, xmm5            // zero, used to widen bytes to words
+
+  xloop:
+    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
+    movdqu     xmm6, [eax + esi]
+    movhlps    xmm1, xmm0            // upper 8 pixels of row 0
+    movhlps    xmm7, xmm6            // upper 8 pixels of row 1
+    punpcklbw  xmm0, xmm5            // bytes -> words
+    punpcklbw  xmm1, xmm5
+    punpcklbw  xmm6, xmm5
+    punpcklbw  xmm7, xmm5
+    paddusw    xmm0, xmm6            // row 0 + row 1
+    paddusw    xmm1, xmm7
+    movdqu     xmm6, [eax + esi * 2]  // row 2
+    lea        eax, [eax + 16]       // src_ptr += 16
+    movhlps    xmm7, xmm6
+    punpcklbw  xmm6, xmm5
+    punpcklbw  xmm7, xmm5
+    paddusw    xmm0, xmm6            // + row 2
+    paddusw    xmm1, xmm7
+
+    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
+    psrldq     xmm0, 2               // shift right by one word
+    paddusw    xmm6, xmm0
+    psrldq     xmm0, 2
+    paddusw    xmm6, xmm0            // each word = sum of 3 adjacent words
+    pshufb     xmm6, xmm2
+
+    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
+    psrldq     xmm1, 2
+    paddusw    xmm7, xmm1
+    psrldq     xmm1, 2
+    paddusw    xmm7, xmm1
+    pshufb     xmm7, xmm3
+    paddusw    xmm6, xmm7            // combine both halves
+
+    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
+    packuswb   xmm6, xmm6
+
+    movd       [edx], xmm6           // write 6 pixels
+    psrlq      xmm6, 16              // drop the first 2 bytes
+    movd       [edx + 2], xmm6       // overlapping store covers bytes 2..5
+    lea        edx, [edx + 6]        // dst_ptr += 6
+    sub        ecx, 6                // dst_width -= 6
+    jg         xloop
+
+    pop        esi
+    ret
+  }
+}
+
+// Scale 16x2 pixels to 6x1 with interpolation
+__declspec(naked)
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // restored before ret
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, kShufAb0
+    movdqa     xmm3, kShufAb1
+    movdqa     xmm4, kShufAb2
+    movdqa     xmm5, kScaleAb2
+
+  xloop:
+    movdqu     xmm0, [eax]           // average 2 rows into xmm0
+    movdqu     xmm1, [eax + esi]
+    lea        eax, [eax + 16]       // src_ptr += 16
+    pavgb      xmm0, xmm1
+
+    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
+    pshufb     xmm1, xmm2
+    movdqa     xmm6, xmm0
+    pshufb     xmm6, xmm3
+    paddusw    xmm1, xmm6            // sum the three shuffled copies
+    pshufb     xmm0, xmm4
+    paddusw    xmm1, xmm0
+
+    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
+    packuswb   xmm1, xmm1
+
+    movd       [edx], xmm1           // write 6 pixels
+    psrlq      xmm1, 16              // drop the first 2 bytes
+    movd       [edx + 2], xmm1       // overlapping store covers bytes 2..5
+    lea        edx, [edx + 6]        // dst_ptr += 6
+    sub        ecx, 6                // dst_width -= 6
+    jg         xloop
+
+    pop        esi
+    ret
+  }
+}
+
+// Reads 16 bytes and accumulates to 16 shorts at a time (saturating add).
+__declspec(naked)
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_ptr
+    mov        edx, [esp + 8]   // dst_ptr
+    mov        ecx, [esp + 12]  // src_width
+    pxor       xmm5, xmm5       // zero, used to widen bytes to words
+
+  // sum rows
+  xloop:
+    movdqu     xmm3, [eax]       // read 16 bytes
+    lea        eax, [eax + 16]   // src_ptr += 16
+    movdqu     xmm0, [edx]       // read 16 words from destination
+    movdqu     xmm1, [edx + 16]
+    movdqa     xmm2, xmm3
+    punpcklbw  xmm2, xmm5        // low 8 bytes -> words
+    punpckhbw  xmm3, xmm5        // high 8 bytes -> words
+    paddusw    xmm0, xmm2        // sum 16 words
+    paddusw    xmm1, xmm3
+    movdqu     [edx], xmm0       // write 16 words to destination
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]   // dst_ptr += 16 words
+    sub        ecx, 16           // src_width -= 16
+    jg         xloop
+    ret
+  }
+}
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time (saturating add).
+__declspec(naked)
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  __asm {
+    mov         eax, [esp + 4]   // src_ptr
+    mov         edx, [esp + 8]   // dst_ptr
+    mov         ecx, [esp + 12]  // src_width
+    vpxor       ymm5, ymm5, ymm5  // zero, used to widen bytes to words
+
+  // sum rows
+  xloop:
+    vmovdqu     ymm3, [eax]       // read 32 bytes
+    lea         eax, [eax + 32]   // src_ptr += 32
+    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
+    vpunpcklbw  ymm2, ymm3, ymm5  // source bytes 0..15 as words
+    vpunpckhbw  ymm3, ymm3, ymm5  // source bytes 16..31 as words
+    vpaddusw    ymm0, ymm2, [edx] // sum 16 words
+    vpaddusw    ymm1, ymm3, [edx + 32]  // sum next 16 words
+    vmovdqu     [edx], ymm0       // write 32 words to destination
+    vmovdqu     [edx + 32], ymm1
+    lea         edx, [edx + 64]   // dst_ptr += 32 words
+    sub         ecx, 32           // src_width -= 32
+    jg          xloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEADDROW_AVX2
+
+// Bilinear column filtering. SSSE3 version.
+__declspec(naked)
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx) {
+  __asm {
+    push       ebx                    // ebx/esi/edi are restored before ret
+    push       esi
+    push       edi
+    mov        edi, [esp + 12 + 4]    // dst_ptr
+    mov        esi, [esp + 12 + 8]    // src_ptr
+    mov        ecx, [esp + 12 + 12]   // dst_width
+    movd       xmm2, [esp + 12 + 16]  // x
+    movd       xmm3, [esp + 12 + 20]  // dx
+    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
+    movd       xmm5, eax
+    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    psrlw      xmm6, 9
+    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    sub        ecx, 2               // bias count by 2 for the 2 pixel loop
+    jl         xloop29              // fewer than 2 pixels
+
+    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    paddd      xmm0, xmm3
+    punpckldq  xmm2, xmm0           // x0 x1
+    punpckldq  xmm3, xmm3           // dx dx
+    paddd      xmm3, xmm3           // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+
+    // 2 Pixel loop.
+  xloop2:
+    movdqa     xmm1, xmm2           // x0, x1 fractions.
+    paddd      xmm2, xmm3           // x += dx
+    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
+    movd       xmm0, ebx
+    psrlw      xmm1, 9              // 7 bit fractions.
+    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
+    movd       xmm4, ebx
+    pshufb     xmm1, xmm5           // 0011
+    punpcklwd  xmm0, xmm4           // x0 pixel pair, x1 pixel pair
+    pxor       xmm1, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
+    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
+    movd       ebx, xmm0
+    mov        [edi], bx            // store 2 pixels
+    lea        edi, [edi + 2]       // dst_ptr += 2
+    sub        ecx, 2               // 2 pixels
+    jge        xloop2
+
+ xloop29:
+
+    add        ecx, 2 - 1           // ecx >= 0 here means 1 pixel is left
+    jl         xloop99
+
+    // 1 pixel remainder
+    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
+    movd       xmm0, ebx
+    psrlw      xmm2, 9              // 7 bit fractions.
+    pshufb     xmm2, xmm5           // 0011
+    pxor       xmm2, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm2           // 16 bit
+    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0           // 8 bits
+    movd       ebx, xmm0
+    mov        [edi], bl            // store 1 pixel
+
+ xloop99:
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+// Reads 16 pixels, duplicates them and writes 32 pixels. x and dx are unused.
+__declspec(naked)
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx) {
+  __asm {
+    mov        edx, [esp + 4]    // dst_ptr
+    mov        eax, [esp + 8]    // src_ptr
+    mov        ecx, [esp + 12]   // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]       // read 16 pixels
+    lea        eax,  [eax + 16]  // src_ptr += 16
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm0        // duplicate each of the low 8 pixels
+    punpckhbw  xmm1, xmm1        // duplicate each of the high 8 pixels
+    movdqu     [edx], xmm0       // write 32 pixels
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]   // dst_ptr += 32
+    sub        ecx, 32           // dst_width -= 32
+    jg         wloop
+
+    ret
+  }
+}
+
+// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)
+__declspec(naked)
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_argb
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_argb
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]           // pixels 0..3
+    movdqu     xmm1, [eax + 16]      // pixels 4..7
+    lea        eax,  [eax + 32]      // src_argb += 8 pixels
+    shufps     xmm0, xmm1, 0xdd      // odd pixels
+    movdqu     [edx], xmm0           // write 4 pixels
+    lea        edx, [edx + 16]       // dst_argb += 4 pixels
+    sub        ecx, 4                // dst_width -= 4
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 8x1 rectangle to 4x1.
+__declspec(naked)
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_argb
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_argb
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]           // pixels 0..3
+    movdqu     xmm1, [eax + 16]      // pixels 4..7
+    lea        eax,  [eax + 32]      // src_argb += 8 pixels
+    movdqa     xmm2, xmm0
+    shufps     xmm0, xmm1, 0x88      // even pixels
+    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    pavgb      xmm0, xmm2            // average each even/odd pair per byte
+    movdqu     [edx], xmm0           // write 4 pixels
+    lea        edx, [edx + 16]       // dst_argb += 4 pixels
+    sub        ecx, 4                // dst_width -= 4
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 8x2 rectangle to 4x1.
+__declspec(naked)
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width) {
+  __asm {
+    push       esi                   // restored before ret
+    mov        eax, [esp + 4 + 4]    // src_argb
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_argb
+    mov        ecx, [esp + 4 + 16]   // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]           // row 0, pixels 0..3
+    movdqu     xmm1, [eax + 16]      // row 0, pixels 4..7
+    movdqu     xmm2, [eax + esi]     // row 1, pixels 0..3
+    movdqu     xmm3, [eax + esi + 16]  // row 1, pixels 4..7
+    lea        eax,  [eax + 32]      // src_argb += 8 pixels
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88      // even pixels
+    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    pavgb      xmm0, xmm2
+    movdqu     [edx], xmm0           // write 4 pixels
+    lea        edx, [edx + 16]       // dst_argb += 4 pixels
+    sub        ecx, 4                // dst_width -= 4
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+// Reads 4 pixels at a time, one every src_stepx pixels, and writes 4 pixels.
+__declspec(naked)
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width) {
+  __asm {
+    push       ebx                   // ebx/edi are restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_argb
+                                     // src_stride ignored
+    mov        ebx, [esp + 8 + 12]   // src_stepx
+    mov        edx, [esp + 8 + 16]   // dst_argb
+    mov        ecx, [esp + 8 + 20]   // dst_width
+    lea        ebx, [ebx * 4]        // src_stepx in bytes (4 bytes per pixel)
+    lea        edi, [ebx + ebx * 2]  // src_stepx * 3 in bytes
+
+  wloop:
+    movd       xmm0, [eax]           // pixel at step 0
+    movd       xmm1, [eax + ebx]     // pixel at step 1
+    punpckldq  xmm0, xmm1
+    movd       xmm2, [eax + ebx * 2]  // pixel at step 2
+    movd       xmm3, [eax + edi]     // pixel at step 3
+    lea        eax,  [eax + ebx * 4]  // src_argb += 4 steps
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm0, xmm2            // 4 pixels
+    movdqu     [edx], xmm0           // write 4 pixels
+    lea        edx, [edx + 16]       // dst_argb += 4 pixels
+    sub        ecx, 4                // dst_width -= 4
+    jg         wloop
+
+    pop        edi
+    pop        ebx
+    ret
+  }
+}
+
+// Blends four 2x2 to 4x1.
+__declspec(naked)
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  __asm {
+    push       ebx                    // ebx/esi/edi are restored before ret
+    push       esi
+    push       edi
+    mov        eax, [esp + 12 + 4]    // src_argb
+    mov        esi, [esp + 12 + 8]    // src_stride
+    mov        ebx, [esp + 12 + 12]   // src_stepx
+    mov        edx, [esp + 12 + 16]   // dst_argb
+    mov        ecx, [esp + 12 + 20]   // dst_width
+    lea        esi, [eax + esi]       // row1 pointer
+    lea        ebx, [ebx * 4]         // src_stepx in bytes (4 bytes per pixel)
+    lea        edi, [ebx + ebx * 2]   // src_stepx * 3 in bytes
+
+  wloop:
+    movq       xmm0, qword ptr [eax]  // row0 4 pairs
+    movhps     xmm0, qword ptr [eax + ebx]
+    movq       xmm1, qword ptr [eax + ebx * 2]
+    movhps     xmm1, qword ptr [eax + edi]
+    lea        eax,  [eax + ebx * 4]  // row0 pointer += 4 steps
+    movq       xmm2, qword ptr [esi]  // row1 4 pairs
+    movhps     xmm2, qword ptr [esi + ebx]
+    movq       xmm3, qword ptr [esi + ebx * 2]
+    movhps     xmm3, qword ptr [esi + edi]
+    lea        esi,  [esi + ebx * 4]  // row1 pointer += 4 steps
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88      // even pixels
+    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    pavgb      xmm0, xmm2
+    movdqu     [edx], xmm0           // write 4 pixels
+    lea        edx, [edx + 16]       // dst_argb += 4 pixels
+    sub        ecx, 4                // dst_width -= 4
+    jg         wloop
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+// Column scaling unfiltered. SSE2 version.
+__declspec(naked)
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  __asm {
+    push       edi                   // edi/esi are restored before ret
+    push       esi
+    mov        edi, [esp + 8 + 4]    // dst_argb
+    mov        esi, [esp + 8 + 8]    // src_argb
+    mov        ecx, [esp + 8 + 12]   // dst_width
+    movd       xmm2, [esp + 8 + 16]  // x
+    movd       xmm3, [esp + 8 + 20]  // dx
+
+    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
+    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
+    paddd      xmm2, xmm0
+    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
+    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
+    paddd      xmm2, xmm0            // x3 x2 x1 x0
+    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
+    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4
+
+    pextrw     eax, xmm2, 1          // get x0 integer.
+    pextrw     edx, xmm2, 3          // get x1 integer.
+
+    cmp        ecx, 0
+    jle        xloop99               // nothing to do for dst_width <= 0
+    sub        ecx, 4                // bias count by 4 for the 4 pixel loop
+    jl         xloop49               // fewer than 4 pixels
+
+    // 4 Pixel loop.
+ xloop4:
+    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
+    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
+    pextrw     eax, xmm2, 5           // get x2 integer.
+    pextrw     edx, xmm2, 7           // get x3 integer.
+    paddd      xmm2, xmm3             // x += dx
+    punpckldq  xmm0, xmm1             // x0 x1
+
+    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
+    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
+    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
+    punpckldq  xmm1, xmm4             // x2 x3
+    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
+    movdqu     [edi], xmm0            // write 4 pixels
+    lea        edi, [edi + 16]        // dst_argb += 4 pixels
+    sub        ecx, 4                 // 4 pixels
+    jge        xloop4
+
+ xloop49:
+    test       ecx, 2                 // low 2 bits of ecx = pixels left (0..3)
+    je         xloop29
+
+    // 2 Pixels.
+    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
+    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
+    pextrw     eax, xmm2, 5           // get x2 integer.
+    punpckldq  xmm0, xmm1             // x0 x1
+
+    movq       qword ptr [edi], xmm0  // write 2 pixels
+    lea        edi, [edi + 8]         // dst_argb += 2 pixels
+
+ xloop29:
+    test       ecx, 1                 // odd pixel left?
+    je         xloop99
+
+    // 1 Pixel.
+    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
+    movd       dword ptr [edi], xmm0  // write 1 pixel
+ xloop99:
+
+    pop        esi
+    pop        edi
+    ret
+  }
+}
+
+// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
+// TODO(fbarchard): Port to Neon
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+// (interleaves source bytes n and n + 4; used by ScaleARGBFilterCols_SSSE3).
+static uvec8 kShuffleColARGB = {
+  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
+  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+};
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+// (source byte 0 fills the low 8 bytes, source byte 4 the high 8 bytes).
+static uvec8 kShuffleFractions = {
+  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+__declspec(naked)
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx) {
+  __asm {
+    push       esi                   // esi/edi are restored before ret
+    push       edi
+    mov        edi, [esp + 8 + 4]    // dst_argb
+    mov        esi, [esp + 8 + 8]    // src_argb
+    mov        ecx, [esp + 8 + 12]   // dst_width
+    movd       xmm2, [esp + 8 + 16]  // x
+    movd       xmm3, [esp + 8 + 20]  // dx
+    movdqa     xmm4, kShuffleColARGB
+    movdqa     xmm5, kShuffleFractions
+    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    psrlw      xmm6, 9
+    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    sub        ecx, 2               // bias count by 2 for the 2 pixel loop
+    jl         xloop29              // fewer than 2 pixels
+
+    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    paddd      xmm0, xmm3
+    punpckldq  xmm2, xmm0           // x0 x1
+    punpckldq  xmm3, xmm3           // dx dx
+    paddd      xmm3, xmm3           // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+
+    // 2 Pixel loop.
+  xloop2:
+    movdqa     xmm1, xmm2           // x0, x1 fractions.
+    paddd      xmm2, xmm3           // x += dx
+    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
+    psrlw      xmm1, 9              // 7 bit fractions.
+    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
+    pshufb     xmm1, xmm5           // 0000000011111111
+    pshufb     xmm0, xmm4           // arrange pixels into pairs
+    pxor       xmm1, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
+    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
+    movq       qword ptr [edi], xmm0  // write 2 pixels
+    lea        edi, [edi + 8]       // dst_argb += 2 pixels
+    sub        ecx, 2               // 2 pixels
+    jge        xloop2
+
+ xloop29:
+
+    add        ecx, 2 - 1           // ecx >= 0 here means 1 pixel is left
+    jl         xloop99
+
+    // 1 pixel remainder
+    psrlw      xmm2, 9              // 7 bit fractions.
+    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
+    pshufb     xmm2, xmm5           // 00000000
+    pshufb     xmm0, xmm4           // arrange pixels into pairs
+    pxor       xmm2, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
+    psrlw      xmm0, 7
+    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
+    movd       [edi], xmm0          // write 1 pixel
+
+ xloop99:
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels. x and dx are unused.
+__declspec(naked)
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {
+  __asm {
+    mov        edx, [esp + 4]    // dst_argb
+    mov        eax, [esp + 8]    // src_argb
+    mov        ecx, [esp + 12]   // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]       // read 4 pixels
+    lea        eax,  [eax + 16]  // src_argb += 4 pixels
+    movdqa     xmm1, xmm0
+    punpckldq  xmm0, xmm0        // duplicate pixels 0 and 1
+    punpckhdq  xmm1, xmm1        // duplicate pixels 2 and 3
+    movdqu     [edx], xmm0       // write 8 pixels
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]   // dst_argb += 8 pixels
+    sub        ecx, 8            // dst_width -= 8
+    jg         wloop
+
+    ret
+  }
+}
+
+// Divide num by div and return as 16.16 fixed point result. div is not checked.
+__declspec(naked)
+int FixedDiv_X86(int num, int div) {
+  __asm {
+    mov        eax, [esp + 4]    // num
+    cdq                          // extend num to 64 bits
+    shld       edx, eax, 16      // 32.16
+    shl        eax, 16           // edx:eax = num << 16
+    idiv       dword ptr [esp + 8]  // eax = (num << 16) / div, returned
+    ret
+  }
+}
+
+// Returns ((num << 16) - 0x00010001) / (div - 1) as a 16.16 fixed point result.
+__declspec(naked)
+int FixedDiv1_X86(int num, int div) {
+  __asm {
+    mov        eax, [esp + 4]    // num
+    mov        ecx, [esp + 8]    // denom
+    cdq                          // extend num to 64 bits
+    shld       edx, eax, 16      // 32.16
+    shl        eax, 16           // edx:eax = num << 16
+    sub        eax, 0x00010001   // 64 bit subtract of 0x00010001 ...
+    sbb        edx, 0            // ... borrow into the high half
+    sub        ecx, 1            // div - 1 (not checked for zero)
+    idiv       ecx               // quotient in eax, returned
+    ret
+  }
+}
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libvpx/third_party/libyuv/source/video_common.cc b/libs/libvpx/third_party/libyuv/source/video_common.cc
new file mode 100644
index 0000000000..379a0669ae
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/video_common.cc
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Number of elements in an array, as an int. Only meaningful for true arrays;
+// a pointer argument would yield sizeof(pointer) / sizeof(element).
+#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
+
+// One row of the alias table: a FourCC value and the FourCC it maps to.
+struct FourCCAliasEntry {
+  uint32 alias;      // Value compared against the lookup key.
+  uint32 canonical;  // Value returned when alias matches.
+};
+
+// Alias table: the first column is an alternate FourCC, the second column is
+// the FourCC it is translated to.
+static const struct FourCCAliasEntry kFourCCAliases[] = {
+  {FOURCC_IYUV, FOURCC_I420},
+  {FOURCC_YU16, FOURCC_I422},
+  {FOURCC_YU24, FOURCC_I444},
+  {FOURCC_YUYV, FOURCC_YUY2},
+  {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
+  {FOURCC_HDYC, FOURCC_UYVY},
+  {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
+  {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
+  {FOURCC_DMB1, FOURCC_MJPG},
+  {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
+  {FOURCC_RGB3, FOURCC_RAW },
+  {FOURCC_BGR3, FOURCC_24BG},
+  {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
+  {FOURCC_CM24, FOURCC_RAW },  // kCMPixelFormat_24RGB
+  {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
+  {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
+  {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
+};
+// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
+//  {FOURCC_BGRA, FOURCC_ARGB},  // kCMPixelFormat_32BGRA
+
+// Maps a FourCC alias to its canonical FourCC by scanning kFourCCAliases;
+// any value that is not listed as an alias is returned unchanged.
+LIBYUV_API
+uint32 CanonicalFourCC(uint32 fourcc) {
+  const struct FourCCAliasEntry* entry = kFourCCAliases;
+  const struct FourCCAliasEntry* const end =
+      kFourCCAliases + ARRAY_SIZE(kFourCCAliases);
+  while (entry != end) {
+    if (entry->alias == fourcc) {
+      return entry->canonical;  // First matching alias wins.
+    }
+    ++entry;
+  }
+  return fourcc;  // Not an alias, so return it as-is.
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/libs/libvpx/third_party/libyuv/source/x86inc.asm b/libs/libvpx/third_party/libyuv/source/x86inc.asm
new file mode 100644
index 0000000000..cb5c32df3a
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/source/x86inc.asm
@@ -0,0 +1,1136 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2012 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*          Anton Mitrofanov <BugMaster@narod.ru>
+;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Henrik Gramner <hengar-6@student.ltu.se>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible.  Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well.  Send patches or ideas
+; to x264-devel@videolan.org .
+
+; Local changes for libyuv:
+; remove %define program_name and references in labels
+; rename cpus to uppercase
+
+; Select the 64-bit calling convention from the output format: WIN64 for the
+; win32 and win64 formats, UNIX64 for every other format. Both stay 0 when
+; ARCH_X86_64 is 0. ARCH_X86_64 itself is expected to be defined before this
+; point (it is not set in this file up to here).
+%define WIN64  0
+%define UNIX64 0
+%if ARCH_X86_64
+    %ifidn __OUTPUT_FORMAT__,win32
+        %define WIN64  1
+    %elifidn __OUTPUT_FORMAT__,win64
+        %define WIN64  1
+    %else
+        %define UNIX64 1
+    %endif
+%endif
+
+; mangle(x) gives the external symbol name for x: a leading underscore is
+; pasted on when PREFIX is defined, otherwise x is used unchanged.
+%ifdef PREFIX
+    %define mangle(x) _ %+ x
+%else
+    %define mangle(x) x
+%endif
+
+; Name of the .rodata section.
+; Kludge: Something on OS X fails to align .rodata even given an align attribute,
+; so use a different read-only section.
+; %1 = section alignment (optional, default 16). The macho and macho64
+; formats get an aligned .text section instead of .rodata; aout gets a plain
+; .text section with no align attribute.
+%macro SECTION_RODATA 0-1 16
+    %ifidn __OUTPUT_FORMAT__,macho64
+        SECTION .text align=%1
+    %elifidn __OUTPUT_FORMAT__,macho
+        SECTION .text align=%1
+        fakegot:
+    %elifidn __OUTPUT_FORMAT__,aout
+        section .text
+    %else
+        SECTION .rodata align=%1
+    %endif
+%endmacro
+
+; Switch to the .text section.
+; %1 = section alignment (optional, default 16).
+; aout does not support align=
+%macro SECTION_TEXT 0-1 16
+    %ifidn __OUTPUT_FORMAT__,aout
+        SECTION .text
+    %else
+        SECTION .text align=%1
+    %endif
+%endmacro
+
+; PIC handling: forced on for WIN64, forced off for x86_32, and otherwise left
+; exactly as it was defined (or not) before this point. When PIC ends up
+; defined, "default rel" is selected.
+%if WIN64
+    %define PIC
+%elif ARCH_X86_64 == 0
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
+    %undef PIC
+%endif
+%ifdef PIC
+    default rel
+%endif
+
+; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
+CPU amdnop
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,0, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
+; which are slow when a normal ret follows a branch.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
+; DECLARE_REG index, reg [, arg_stack_offset]
+; Defines r<index> plus its q/d/w/b/h size aliases in terms of reg.
+; With two parameters the argument lives in the register, so the m form is the
+; dword register name and the mp form is the full register. With a third
+; parameter the m form is the stack slot [rsp/esp + stack_offset + offset] and
+; the mp form is that slot with a qword (x86_64) or dword (x86_32) size.
+%macro DECLARE_REG 2-3
+    %define r%1q %2
+    %define r%1d %2d
+    %define r%1w %2w
+    %define r%1b %2b
+    %define r%1h %2h
+    %if %0 == 2
+        %define r%1m  %2d
+        %define r%1mp %2
+    %elif ARCH_X86_64 ; memory
+        %define r%1m [rsp + stack_offset + %3]
+        %define r%1mp qword r %+ %1m
+    %else
+        %define r%1m [esp + stack_offset + %3]
+        %define r%1mp dword r %+ %1m
+    %endif
+    %define r%1  %2
+%endmacro
+
+; DECLARE_REG_SIZE name, low_byte_reg, high_byte_reg
+; For a 16-bit register name (ax, bx, ...) defines the q/d/w/h/b suffixed
+; aliases under both the r<name> and e<name> spellings. On x86_32 the bare
+; r<name> is additionally mapped to e<name>.
+%macro DECLARE_REG_SIZE 3
+    %define r%1q r%1
+    %define e%1q r%1
+    %define r%1d e%1
+    %define e%1d e%1
+    %define r%1w %1
+    %define e%1w %1
+    %define r%1h %3
+    %define e%1h %3
+    %define r%1b %2
+    %define e%1b %2
+%if ARCH_X86_64 == 0
+    %define r%1  e%1
+%endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+; DECLARE_REG_TMP a, b, c, ...
+; Defines t0 as r<a>, t1 as r<b>, t2 as r<c>, and so on, in argument order.
+%macro DECLARE_REG_TMP 1-*
+    %assign %%i 0
+    %rep %0
+        CAT_XDEFINE t, %%i, r%1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+%endmacro
+
+; DECLARE_REG_TMP_SIZE n0, n1, ...
+; For every index N listed, defines tNq, tNd, tNw, tNh and tNb as tN with the
+; corresponding size suffix pasted on.
+%macro DECLARE_REG_TMP_SIZE 0-*
+    %rep %0
+        %define t%1q t%1 %+ q
+        %define t%1d t%1 %+ d
+        %define t%1w t%1 %+ w
+        %define t%1h t%1 %+ h
+        %define t%1b t%1 %+ b
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64
+    %define gprsize 8
+%else
+    %define gprsize 4
+%endif
+
+; push/pop wrappers that also move stack_offset by gprsize. The stack-based
+; rNm argument defines add stack_offset to their address, so keeping it in
+; step with the stack pointer keeps those defines pointing at the arguments.
+%macro PUSH 1
+    push %1
+    %assign stack_offset stack_offset+gprsize
+%endmacro
+
+%macro POP 1
+    pop %1
+    %assign stack_offset stack_offset-gprsize
+%endmacro
+
+; PUSH_IF_USED n0, n1, ...
+; For every index N listed, PUSH rN when N < regs_used (stack_offset is
+; updated through the PUSH macro).
+%macro PUSH_IF_USED 1-*
+    %rep %0
+        %if %1 < regs_used
+            PUSH r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+; POP_IF_USED n0, n1, ...
+; For every index N listed, pop rN when N < regs_used. This uses the plain
+; pop instruction rather than the POP macro, so stack_offset is not changed.
+%macro POP_IF_USED 1-*
+    %rep %0
+        %if %1 < regs_used
+            pop r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+; LOAD_IF_USED n0, n1, ...
+; For every index N listed, load argument N from its original location (rNmp)
+; into rN when N < num_args.
+%macro LOAD_IF_USED 1-*
+    %rep %0
+        %if %1 < num_args
+            mov r%1, r %+ %1 %+ mp
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+; sub/add wrappers: emit the instruction and, when the destination operand is
+; rsp, adjust stack_offset by the same amount (grown by SUB, shrunk by ADD).
+%macro SUB 2
+    sub %1, %2
+    %ifidn %1, rsp
+        %assign stack_offset stack_offset+(%2)
+    %endif
+%endmacro
+
+%macro ADD 2
+    add %1, %2
+    %ifidn %1, rsp
+        %assign stack_offset stack_offset-(%2)
+    %endif
+%endmacro
+
+; mov dst, src -- emitted only when the two operands are not textually
+; identical.
+%macro movifnidn 2
+    %ifnidn %1, %2
+        mov %1, %2
+    %endif
+%endmacro
+
+; movsxd dst, src -- emitted only when the two operands are not textually
+; identical.
+%macro movsxdifnidn 2
+    %ifnidn %1, %2
+        movsxd %1, %2
+    %endif
+%endmacro
+
+; ASSERT expr -- raises an assembly-time error when expr evaluates to 0.
+%macro ASSERT 1
+    %if (%1) == 0
+        %error assert failed
+    %endif
+%endmacro
+
+; DEFINE_ARGS name0, name1, ...
+; Binds name0 to r0, name1 to r1, ... by defining <name>q/d/w/h/b/m/mp as the
+; matching r<N> forms. The names bound by the previous invocation (recorded in
+; arg_name<N> and n_arg_names) are undefined first, so each call replaces the
+; earlier set of names.
+%macro DEFINE_ARGS 0-*
+    %ifdef n_arg_names
+        %assign %%i 0
+        %rep n_arg_names
+            CAT_UNDEF arg_name %+ %%i, q
+            CAT_UNDEF arg_name %+ %%i, d
+            CAT_UNDEF arg_name %+ %%i, w
+            CAT_UNDEF arg_name %+ %%i, h
+            CAT_UNDEF arg_name %+ %%i, b
+            CAT_UNDEF arg_name %+ %%i, m
+            CAT_UNDEF arg_name %+ %%i, mp
+            CAT_UNDEF arg_name, %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+
+    ; stack_offset is saved, undefined while the names are bound, and put back
+    ; afterwards.
+    %xdefine %%stack_offset stack_offset
+    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+    %assign %%i 0
+    %rep %0
+        %xdefine %1q r %+ %%i %+ q
+        %xdefine %1d r %+ %%i %+ d
+        %xdefine %1w r %+ %%i %+ w
+        %xdefine %1h r %+ %%i %+ h
+        %xdefine %1b r %+ %%i %+ b
+        %xdefine %1m r %+ %%i %+ m
+        %xdefine %1mp r %+ %%i %+ mp
+        CAT_XDEFINE arg_name, %%i, %1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+    %xdefine stack_offset %%stack_offset
+    %assign n_arg_names %0
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0,  rcx
+DECLARE_REG 1,  rdx
+DECLARE_REG 2,  R8
+DECLARE_REG 3,  R9
+DECLARE_REG 4,  R10, 40
+DECLARE_REG 5,  R11, 48
+DECLARE_REG 6,  rax, 56
+DECLARE_REG 7,  rdi, 64
+DECLARE_REG 8,  rsi, 72
+DECLARE_REG 9,  rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
+
+; WIN64 prologue. The xmm register count defaults to 0 when omitted.
+; Steps: record num_args/regs_used, push r7-r14 as far as they are used,
+; spill xmm registers (skipped when mmsize == 8), load arguments 4 and up
+; from their stack slots, then bind the argument names.
+%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+    %if mmsize == 8
+        %assign xmm_regs_used 0
+    %else
+        WIN64_SPILL_XMM %3
+    %endif
+    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+    DEFINE_ARGS %4
+%endmacro
+
+; WIN64_SPILL_XMM n
+; Records n as xmm_regs_used (at most 16). When n > 6, reserves
+; (n-6)*16+16 bytes of stack via SUB (which also grows stack_offset) and
+; stores xmm6 .. xmm(n-1) there, highest register first.
+; NOTE(review): the (~stack_offset&8) term adds 8 to the slot address when
+; bit 3 of stack_offset is clear -- presumably to keep the movdqa addresses
+; 16-byte aligned; confirm.
+%macro WIN64_SPILL_XMM 1
+    %assign xmm_regs_used %1
+    ASSERT xmm_regs_used <= 16
+    %if xmm_regs_used > 6
+        SUB rsp, (xmm_regs_used-6)*16+16
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
+        %endrep
+    %endif
+%endmacro
+
+; WIN64_RESTORE_XMM_INTERNAL base
+; When xmm_regs_used > 6, reloads xmm6 .. xmm(xmm_regs_used-1) from the save
+; area addressed through base and then adds (xmm_regs_used-6)*16+16 to base.
+; Neither stack_offset nor xmm_regs_used is modified here.
+%macro WIN64_RESTORE_XMM_INTERNAL 1
+    %if xmm_regs_used > 6
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
+        %endrep
+        add %1, (xmm_regs_used-6)*16+16
+    %endif
+%endmacro
+
+; WIN64_RESTORE_XMM base
+; Reloads the xmm registers saved by WIN64_SPILL_XMM, releases their stack
+; space, and undoes the spill's bookkeeping (stack_offset, xmm_regs_used).
+; base = register through which the save area is addressed.
+%macro WIN64_RESTORE_XMM 1
+    WIN64_RESTORE_XMM_INTERNAL %1
+    ; WIN64_SPILL_XMM only moves the stack pointer (and stack_offset) when more
+    ; than 6 xmm registers are in use, and then by (xmm_regs_used-6)*16+16
+    ; bytes. Take exactly that amount back off. The parentheses matter:
+    ; "a-(n-6)*16+16" would subtract (n-6)*16 and then add 16.
+    %if xmm_regs_used > 6
+        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
+    %endif
+    %assign xmm_regs_used 0
+%endmacro
+
+; has_epilogue is true when RET has to emit something besides the ret itself.
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
+
+; WIN64 RET: reload spilled xmm registers, pop the registers PROLOGUE may
+; have pushed (in reverse order), emit vzeroupper when mmsize == 32, return.
+; Only the non-bookkeeping restore/pop forms are used, so stack_offset is
+; left unchanged by RET.
+%macro RET 0
+    WIN64_RESTORE_XMM_INTERNAL rsp
+    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+%if mmsize == 32
+    vzeroupper
+%endif
+    ret
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0,  rdi
+DECLARE_REG 1,  rsi
+DECLARE_REG 2,  rdx
+DECLARE_REG 3,  rcx
+DECLARE_REG 4,  R8
+DECLARE_REG 5,  R9
+DECLARE_REG 6,  rax, 8
+DECLARE_REG 7,  R10, 16
+DECLARE_REG 8,  R11, 24
+DECLARE_REG 9,  rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R12, 48
+DECLARE_REG 12, R13, 56
+DECLARE_REG 13, R14, 64
+DECLARE_REG 14, R15, 72
+
+; Unix x86_64 prologue: record num_args/regs_used, push r9-r14 as far as they
+; are used, load arguments 6 and up from their stack slots, then bind the
+; argument names. The xmm register count is accepted but not used here.
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 9, 10, 11, 12, 13, 14
+    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+    DEFINE_ARGS %4
+%endmacro
+
+; has_epilogue is true when RET has to emit something besides the ret itself.
+%define has_epilogue regs_used > 9 || mmsize == 32
+
+; Unix x86_64 RET: pop the registers PROLOGUE may have pushed (in reverse
+; order), emit vzeroupper when mmsize == 32, return. stack_offset is left
+; unchanged because POP_IF_USED uses the plain pop instruction.
+%macro RET 0
+    POP_IF_USED 14, 13, 12, 11, 10, 9
+%if mmsize == 32
+    vzeroupper
+%endif
+    ret
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+; DECLARE_ARG n0, n1, ...
+; For every argument index N listed, defines only the stack forms rNm and
+; rNmp (at [esp + stack_offset + 4*N + 4]); no register is associated.
+%macro DECLARE_ARG 1-*
+    %rep %0
+        %define r%1m [esp + stack_offset + 4*%1 + 4]
+        %define r%1mp dword r%1m
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+; x86_32 prologue: regs_used is capped at 7 before the regs_used >= num_args
+; check, r3-r6 are pushed as far as they are used, arguments 0-6 are loaded
+; from the stack as far as num_args requires, then the argument names are
+; bound. The xmm register count is accepted but not used here.
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    %if regs_used > 7
+        %assign regs_used 7
+    %endif
+    ASSERT regs_used >= num_args
+    PUSH_IF_USED 3, 4, 5, 6
+    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+    DEFINE_ARGS %4
+%endmacro
+
+; has_epilogue is true when RET has to emit something besides the ret itself.
+%define has_epilogue regs_used > 3 || mmsize == 32
+
+; x86_32 RET: pop the registers PROLOGUE may have pushed (in reverse order),
+; emit vzeroupper when mmsize == 32, return. stack_offset is left unchanged
+; because POP_IF_USED uses the plain pop instruction.
+%macro RET 0
+    POP_IF_USED 6, 5, 4, 3
+%if mmsize == 32
+    vzeroupper
+%endif
+    ret
+%endmacro
+
+%endif ;======================================================================
+
+; Outside WIN64 the xmm spill/restore macros expand to nothing, so they can be
+; invoked unconditionally.
+%if WIN64 == 0
+%macro WIN64_SPILL_XMM 1
+%endmacro
+%macro WIN64_RESTORE_XMM 1
+%endmacro
+%endif
+
+; REP_RET: expands to RET when has_epilogue is true, otherwise to the
+; two-byte "rep ret".
+%macro REP_RET 0
+    %if has_epilogue
+        RET
+    %else
+        rep ret
+    %endif
+%endmacro
+
+; TAIL_CALL callee, is_nonadjacent
+; With an epilogue: call the callee, then RET. Without one: jmp to the callee
+; when is_nonadjacent is nonzero, and emit nothing when it is 0, in which
+; case execution falls through to whatever code follows (presumably the
+; callee itself -- confirm at the call sites).
+%macro TAIL_CALL 2 ; callee, is_nonadjacent
+    %if has_epilogue
+        call %1
+        RET
+    %elif %2
+        jmp %1
+    %endif
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+; The name is pasted together with SUFFIX and handed to cglobal_internal,
+; along with the PROLOGUE arguments when any are given.
+%macro cglobal 1-2+ ; name, [PROLOGUE args]
+%if %0 == 1
+    cglobal_internal %1 %+ SUFFIX
+%else
+    cglobal_internal %1 %+ SUFFIX, %2
+%endif
+%endmacro
+; cglobal_internal full_name [, PROLOGUE args]
+; On first use of a name: maps it to its mangled form and marks it as
+; cglobaled_<name>. On every use: records it as current_function, exports it
+; (as a hidden function symbol for the elf output format), aligns to
+; function_align, emits the label, resets the mm register permutation,
+; zeroes stack_offset and runs PROLOGUE when arguments for it were supplied.
+%macro cglobal_internal 1-2+
+    %ifndef cglobaled_%1
+        %xdefine %1 mangle(%1)
+        %xdefine %1.skip_prologue %1 %+ .skip_prologue
+        CAT_XDEFINE cglobaled_, %1, 1
+    %endif
+    %xdefine current_function %1
+    %ifidn __OUTPUT_FORMAT__,elf
+        global %1:function hidden
+    %else
+        global %1
+    %endif
+    align function_align
+    %1:
+    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+    %assign stack_offset 0
+    %if %0 > 1
+        PROLOGUE %2
+    %endif
+%endmacro
+
+; cextern name -- declares an external symbol: maps name to its mangled form,
+; marks it as cglobaled_<name> and emits the extern declaration.
+%macro cextern 1
+    %xdefine %1 mangle(%1)
+    CAT_XDEFINE cglobaled_, %1, 1
+    extern %1
+%endmacro
+
+; Same expansion as cextern in this copy of the file: both only apply
+; mangle(). NOTE(review): the original wording "like cextern, but without the
+; prefix" presumably referred to the program_name prefix that the libyuv
+; local changes removed -- confirm against upstream x86inc.asm.
+%macro cextern_naked 1
+    %xdefine %1 mangle(%1)
+    CAT_XDEFINE cglobaled_, %1, 1
+    extern %1
+%endmacro
+
+; const name, data...
+; Maps name to its mangled form, exports it as a global symbol and emits the
+; label followed by the given data definition.
+%macro const 2+
+    %xdefine %1 mangle(%1)
+    global %1
+    %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is
+; executable by default.
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+; cpuflags
+
+%assign cpuflags_MMX      (1<<0)
+%assign cpuflags_MMX2     (1<<1) | cpuflags_MMX
+%assign cpuflags_3dnow    (1<<2) | cpuflags_MMX
+%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
+%assign cpuflags_SSE      (1<<4) | cpuflags_MMX2
+%assign cpuflags_SSE2     (1<<5) | cpuflags_SSE
+%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2
+%assign cpuflags_SSE3     (1<<7) | cpuflags_SSE2
+%assign cpuflags_SSSE3    (1<<8) | cpuflags_SSE3
+%assign cpuflags_SSE4     (1<<9) | cpuflags_SSSE3
+%assign cpuflags_SSE42    (1<<10)| cpuflags_SSE4
+%assign cpuflags_AVX      (1<<11)| cpuflags_SSE42
+%assign cpuflags_xop      (1<<12)| cpuflags_AVX
+%assign cpuflags_fma4     (1<<13)| cpuflags_AVX
+%assign cpuflags_AVX2     (1<<14)| cpuflags_AVX
+%assign cpuflags_fma3     (1<<15)| cpuflags_AVX
+
+%assign cpuflags_cache32  (1<<16)
+%assign cpuflags_cache64  (1<<17)
+%assign cpuflags_slowctz  (1<<18)
+%assign cpuflags_lzcnt    (1<<19)
+%assign cpuflags_misalign (1<<20)
+%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<22)
+%assign cpuflags_bmi1     (1<<23)
+%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1
+%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1
+
+%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
+%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
+
+; Takes up to 2 cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
+; With arguments: sets cpuname, cpuflags and SUFFIX (an underscore followed by
+; cpuname), enables AVX_enabled when the AVX flag is included, and remaps
+; mova/movu/movnta for pre-SSE2 16-byte code and movu for the aligned and
+; SSE3 variants. With no arguments: SUFFIX becomes empty and cpuname and
+; cpuflags are undefined.
+%macro INIT_CPUFLAGS 0-2
+    %if %0 >= 1
+        %xdefine cpuname %1
+        %assign cpuflags cpuflags_%1
+        %if %0 >= 2
+            %xdefine cpuname %1_%2
+            %assign cpuflags cpuflags | cpuflags_%2
+        %endif
+        %xdefine SUFFIX _ %+ cpuname
+        %if cpuflag(AVX)
+            %assign AVX_enabled 1
+        %endif
+        %if mmsize == 16 && notcpuflag(SSE2)
+            %define mova movaps
+            %define movu movups
+            %define movnta movntps
+        %endif
+        %if cpuflag(aligned)
+            %define movu mova
+        %elifidn %1, SSE3
+            %define movu lddqu
+        %endif
+    %else
+        %xdefine SUFFIX
+        %undef cpuname
+        %undef cpuflags
+    %endif
+%endmacro
+
+; merge MMX and SSE*
+
+; CAT_XDEFINE a, b, value -- %xdefine the identifier formed by pasting a and
+; b together to value.
+%macro CAT_XDEFINE 3
+    %xdefine %1%2 %3
+%endmacro
+
+; CAT_UNDEF a, b -- %undef the identifier formed by pasting a and b together.
+%macro CAT_UNDEF 2
+    %undef %1%2
+%endmacro
+
+; INIT_MMX [cpuflags]
+; Selects 8-byte MMX registers: m0-m7 map to mm0-mm7 (with nmm0-nmm7 as the
+; reverse index), m8-m15 and nmm8-nmm15 are undefined, the mov* aliases pick
+; the MMX forms, and the optional cpuflags are passed on to INIT_CPUFLAGS.
+; RESET_MM_PERMUTATION is set to re-run this macro with the same argument.
+%macro INIT_MMX 0-1+
+    %assign AVX_enabled 0
+    %define RESET_MM_PERMUTATION INIT_MMX %1
+    %define mmsize 8
+    %define num_mmregs 8
+    %define mova movq
+    %define movu movq
+    %define movh movd
+    %define movnta movntq
+    %assign %%i 0
+    %rep 8
+    CAT_XDEFINE m, %%i, mm %+ %%i
+    CAT_XDEFINE nmm, %%i, %%i
+    %assign %%i %%i+1
+    %endrep
+    %rep 8
+    CAT_UNDEF m, %%i
+    CAT_UNDEF nmm, %%i
+    %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+; INIT_XMM [cpuflags]
+; Selects 16-byte XMM registers: m<N> maps to xmm<N> for N below num_mmregs
+; (8, or 16 when ARCH_X86_64), nxmm<N> is the reverse index, the mov* aliases
+; pick the SSE2 integer forms, and the optional cpuflags are passed on to
+; INIT_CPUFLAGS. RESET_MM_PERMUTATION re-runs this macro with the same
+; argument.
+%macro INIT_XMM 0-1+
+    %assign AVX_enabled 0
+    %define RESET_MM_PERMUTATION INIT_XMM %1
+    %define mmsize 16
+    %define num_mmregs 8
+    %if ARCH_X86_64
+    %define num_mmregs 16
+    %endif
+    %define mova movdqa
+    %define movu movdqu
+    %define movh movq
+    %define movnta movntdq
+    %assign %%i 0
+    %rep num_mmregs
+    CAT_XDEFINE m, %%i, xmm %+ %%i
+    CAT_XDEFINE nxmm, %%i, %%i
+    %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+; INIT_YMM [cpuflags]
+; Selects 32-byte YMM registers with AVX_enabled set: m<N> maps to ymm<N> for
+; N below num_mmregs (8, or 16 when ARCH_X86_64), nymm<N> is the reverse
+; index, mova/movu/movnta pick the v-prefixed float forms, movh is
+; undefined, and the optional cpuflags are passed on to INIT_CPUFLAGS.
+; RESET_MM_PERMUTATION re-runs this macro with the same argument.
+%macro INIT_YMM 0-1+
+    %assign AVX_enabled 1
+    %define RESET_MM_PERMUTATION INIT_YMM %1
+    %define mmsize 32
+    %define num_mmregs 8
+    %if ARCH_X86_64
+    %define num_mmregs 16
+    %endif
+    %define mova vmovaps
+    %define movu vmovups
+    %undef movh
+    %define movnta vmovntps
+    %assign %%i 0
+    %rep num_mmregs
+    CAT_XDEFINE m, %%i, ymm %+ %%i
+    CAT_XDEFINE nymm, %%i, %%i
+    %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+INIT_XMM
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
+
+; PERMUTE a0, b0, a1, b1, ...
+; Works in two passes so that all pairs read the old mapping: the first pass
+; snapshots m<b> (and nm<b>) for every pair, the second pass assigns each
+; snapshot to m<a> (and nm<a>) and drops the temporaries.
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+%rep %0/2
+    %xdefine tmp%2 m%2
+    %xdefine ntmp%2 nm%2
+    %rotate 2
+%endrep
+%rep %0/2
+    %xdefine m%1 tmp%2
+    %xdefine nm%1 ntmp%2
+    %undef tmp%2
+    %undef ntmp%2
+    %rotate 2
+%endrep
+%endmacro
+
+; SWAP a, b [, c, ...]
+; Exchanges the register names of each adjacent pair in the list in turn
+; (a with b, then b with c, ...), and refreshes the reverse n<reg> index for
+; both names. Arguments may be indices (0, 1) or m-names (m0, m1).
+%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
+%rep %0-1
+%ifdef m%1
+    %xdefine tmp m%1
+    %xdefine m%1 m%2
+    %xdefine m%2 tmp
+    CAT_XDEFINE n, m%1, %1
+    CAT_XDEFINE n, m%2, %2
+%else
+    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
+    ; Be careful using this mode in nested macros though, as in some cases there may be
+    ; other copies of m# that have already been dereferenced and don't get updated correctly.
+    %xdefine %%n1 n %+ %1
+    %xdefine %%n2 n %+ %2
+    %xdefine tmp m %+ %%n1
+    CAT_XDEFINE m, %%n1, m %+ %%n2
+    CAT_XDEFINE m, %%n2, tmp
+    CAT_XDEFINE n, m %+ %%n1, %%n1
+    CAT_XDEFINE n, m %+ %%n2, %%n2
+%endif
+    %undef tmp
+    %rotate 1
+%endrep
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+; The current m0 .. m(num_mmregs-1) mapping is stored under <name>_m<N>,
+; where name is the optional argument or, when omitted, current_function.
+%macro SAVE_MM_PERMUTATION 0-1
+    %if %0
+        %xdefine %%f %1_m
+    %else
+        %xdefine %%f current_function %+ _m
+    %endif
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE %%f, %%i, m %+ %%i
+    %assign %%i %%i+1
+    %endrep
+%endmacro
+
+; LOAD_MM_PERMUTATION name
+; When a permutation was saved for name (name_m0 is defined), copies
+; name_m<N> back into m<N> for every N below num_mmregs and rebuilds the
+; reverse n<reg> index. Does nothing otherwise.
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
+    %ifdef %1_m0
+        %assign %%i 0
+        %rep num_mmregs
+            CAT_XDEFINE m, %%i, %1_m %+ %%i
+            CAT_XDEFINE n, m %+ %%i, %%i
+        %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1
+    call_internal %1, %1 %+ SUFFIX
+%endmacro
+; call_internal plain_name, suffixed_name
+; Calls plain_name unless only the suffixed name has been declared through
+; cglobal/cextern, in which case the suffixed name is called. Afterwards any
+; register permutation saved for the chosen callee is loaded.
+%macro call_internal 2
+    %xdefine %%i %1
+    %ifndef cglobaled_%1
+        %ifdef cglobaled_%2
+            %xdefine %%i %2
+        %endif
+    %endif
+    call %%i
+    LOAD_MM_PERMUTATION %%i
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
+; add x, 128 is rewritten as sub x, -128; every other operand (numeric or
+; not) is passed through to the plain instruction.
+%macro add 2
+    %ifnum %2
+        %if %2==128
+            sub %1, -128
+        %else
+            add %1, %2
+        %endif
+    %else
+        add %1, %2
+    %endif
+%endmacro
+
+; sub x, 128 is rewritten as add x, -128; every other operand (numeric or
+; not) is passed through to the plain instruction.
+%macro sub 2
+    %ifnum %2
+        %if %2==128
+            add %1, -128
+        %else
+            sub %1, %2
+        %endif
+    %else
+        sub %1, %2
+    %endif
+%endmacro
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+; Register-width lookup: defines sizeofmm<N> = 8 for N in 0..7, and
+; sizeofxmm<N> = 16 and sizeofymm<N> = 32 for N in 0..15.
+%assign i 0
+%rep 16
+    %if i < 8
+        CAT_XDEFINE sizeofmm, i, 8
+    %endif
+    CAT_XDEFINE sizeofxmm, i, 16
+    CAT_XDEFINE sizeofymm, i, 32
+%assign i i+1
+%endrep
+%undef i
+
+; CHECK_AVX_INSTR_EMU {instruction text}, dst, src...
+; Raises an assembly-time error when dst is textually identical to any of the
+; listed source operands; the instruction text is only used in the message.
+%macro CHECK_AVX_INSTR_EMU 3-*
+    %xdefine %%opcode %1
+    %xdefine %%dst %2
+    %rep %0-2
+        %ifidn %%dst, %3
+            %error non-AVX emulation of ``%%opcode'' is not supported
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == number of operands given
+;%5+: operands
+; The operand width comes from sizeof<second operand> when that operand is an
+; identifier, else from sizeof<first operand>, else from mmsize.
+; 32-byte operands always use the v-prefixed instruction. Otherwise, when a
+; separate destination is given and it differs from the first source, the
+; v-prefixed form is used if AVX is enabled and the width is 16; if not, the
+; first source is copied into the destination and the legacy two-operand
+; form is applied to it.
+%macro RUN_AVX_INSTR 6-7+
+    %ifid %6
+        %define %%sizeofreg sizeof%6
+    %elifid %5
+        %define %%sizeofreg sizeof%5
+    %else
+        %define %%sizeofreg mmsize
+    %endif
+    %if %%sizeofreg==32
+        %if %4>=3
+            v%1 %5, %6, %7
+        %else
+            v%1 %5, %6
+        %endif
+    %else
+        %if %%sizeofreg==8
+            %define %%regmov movq
+        %elif %2
+            %define %%regmov movaps
+        %else
+            %define %%regmov movdqa
+        %endif
+
+        %if %4>=3+%3
+            %ifnidn %5, %6
+                %if AVX_enabled && %%sizeofreg==16
+                    v%1 %5, %6, %7
+                %else
+                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
+                    %%regmov %5, %6
+                    %1 %5, %7
+                %endif
+            %else
+                %1 %5, %7
+            %endif
+        %elif %4>=3
+            %1 %5, %6, %7
+        %else
+            %1 %5, %6
+        %endif
+    %endif
+%endmacro
+
+; 3arg AVX ops with a memory arg can only have it in src2,
+; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
+; So, if the op is symmetric and the wrong one is memory, swap them.
+; Parameters 1-7 are forwarded to RUN_AVX_INSTR (5 = dst, 6 = src1,
+; 7 = src2); parameter 8 is 1 when the instruction is symmetric. A swap is
+; only performed for symmetric instructions that are not of the 4-operand
+; kind.
+%macro RUN_AVX_INSTR1 8
+    %assign %%swap 0
+    %if AVX_enabled
+        %ifnid %6
+            %assign %%swap 1
+        %endif
+    %elifnidn %5, %6
+        %ifnid %7
+            %assign %%swap 1
+        %endif
+    %endif
+    %if %%swap && %3 == 0 && %8 == 1
+        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
+    %else
+        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
+    %endif
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
+; Defines a macro named after the instruction that accepts 2 to 5 operands.
+; Unused operand slots default to the marker "fnord", which is how the
+; operand count is detected; the instruction name and the three flags above
+; ride along as the generated macro's parameters 6 to 9.
+%macro AVX_INSTR 4
+    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
+        %ifidn %3, fnord
+            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
+        %elifidn %4, fnord
+            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
+        %elifidn %5, fnord
+            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
+        %else
+            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
+        %endif
+    %endmacro
+%endmacro
+
+AVX_INSTR addpd, 1, 0, 1
+AVX_INSTR addps, 1, 0, 1
+AVX_INSTR addsd, 1, 0, 1
+AVX_INSTR addss, 1, 0, 1
+AVX_INSTR addsubpd, 1, 0, 0
+AVX_INSTR addsubps, 1, 0, 0
+AVX_INSTR andpd, 1, 0, 1
+AVX_INSTR andps, 1, 0, 1
+AVX_INSTR andnpd, 1, 0, 0
+AVX_INSTR andnps, 1, 0, 0
+AVX_INSTR blendpd, 1, 0, 0
+AVX_INSTR blendps, 1, 0, 0
+AVX_INSTR blendvpd, 1, 0, 0
+AVX_INSTR blendvps, 1, 0, 0
+AVX_INSTR cmppd, 1, 0, 0
+AVX_INSTR cmpps, 1, 0, 0
+AVX_INSTR cmpsd, 1, 0, 0
+AVX_INSTR cmpss, 1, 0, 0
+AVX_INSTR cvtdq2ps, 1, 0, 0
+AVX_INSTR cvtps2dq, 1, 0, 0
+AVX_INSTR divpd, 1, 0, 0
+AVX_INSTR divps, 1, 0, 0
+AVX_INSTR divsd, 1, 0, 0
+AVX_INSTR divss, 1, 0, 0
+AVX_INSTR dppd, 1, 1, 0
+AVX_INSTR dpps, 1, 1, 0
+AVX_INSTR haddpd, 1, 0, 0
+AVX_INSTR haddps, 1, 0, 0
+AVX_INSTR hsubpd, 1, 0, 0
+AVX_INSTR hsubps, 1, 0, 0
+AVX_INSTR maxpd, 1, 0, 1
+AVX_INSTR maxps, 1, 0, 1
+AVX_INSTR maxsd, 1, 0, 1
+AVX_INSTR maxss, 1, 0, 1
+AVX_INSTR minpd, 1, 0, 1
+AVX_INSTR minps, 1, 0, 1
+AVX_INSTR minsd, 1, 0, 1
+AVX_INSTR minss, 1, 0, 1
+AVX_INSTR movhlps, 1, 0, 0
+AVX_INSTR movlhps, 1, 0, 0
+AVX_INSTR movsd, 1, 0, 0
+AVX_INSTR movss, 1, 0, 0
+AVX_INSTR mpsadbw, 0, 1, 0
+AVX_INSTR mulpd, 1, 0, 1
+AVX_INSTR mulps, 1, 0, 1
+AVX_INSTR mulsd, 1, 0, 1
+AVX_INSTR mulss, 1, 0, 1
+AVX_INSTR orpd, 1, 0, 1
+AVX_INSTR orps, 1, 0, 1
+AVX_INSTR pabsb, 0, 0, 0
+AVX_INSTR pabsw, 0, 0, 0
+AVX_INSTR pabsd, 0, 0, 0
+AVX_INSTR packsswb, 0, 0, 0
+AVX_INSTR packssdw, 0, 0, 0
+AVX_INSTR packuswb, 0, 0, 0
+AVX_INSTR packusdw, 0, 0, 0
+AVX_INSTR paddb, 0, 0, 1
+AVX_INSTR paddw, 0, 0, 1
+AVX_INSTR paddd, 0, 0, 1
+AVX_INSTR paddq, 0, 0, 1
+AVX_INSTR paddsb, 0, 0, 1
+AVX_INSTR paddsw, 0, 0, 1
+AVX_INSTR paddusb, 0, 0, 1
+AVX_INSTR paddusw, 0, 0, 1
+AVX_INSTR palignr, 0, 1, 0
+AVX_INSTR pand, 0, 0, 1
+AVX_INSTR pandn, 0, 0, 0
+AVX_INSTR pavgb, 0, 0, 1
+AVX_INSTR pavgw, 0, 0, 1
+AVX_INSTR pblendvb, 0, 0, 0
+AVX_INSTR pblendw, 0, 1, 0
+AVX_INSTR pcmpestri, 0, 0, 0
+AVX_INSTR pcmpestrm, 0, 0, 0
+AVX_INSTR pcmpistri, 0, 0, 0
+AVX_INSTR pcmpistrm, 0, 0, 0
+AVX_INSTR pcmpeqb, 0, 0, 1
+AVX_INSTR pcmpeqw, 0, 0, 1
+AVX_INSTR pcmpeqd, 0, 0, 1
+AVX_INSTR pcmpeqq, 0, 0, 1
+AVX_INSTR pcmpgtb, 0, 0, 0
+AVX_INSTR pcmpgtw, 0, 0, 0
+AVX_INSTR pcmpgtd, 0, 0, 0
+AVX_INSTR pcmpgtq, 0, 0, 0
+AVX_INSTR phaddw, 0, 0, 0
+AVX_INSTR phaddd, 0, 0, 0
+AVX_INSTR phaddsw, 0, 0, 0
+AVX_INSTR phsubw, 0, 0, 0
+AVX_INSTR phsubd, 0, 0, 0
+AVX_INSTR phsubsw, 0, 0, 0
+AVX_INSTR pmaddwd, 0, 0, 1
+AVX_INSTR pmaddubsw, 0, 0, 0
+AVX_INSTR pmaxsb, 0, 0, 1
+AVX_INSTR pmaxsw, 0, 0, 1
+AVX_INSTR pmaxsd, 0, 0, 1
+AVX_INSTR pmaxub, 0, 0, 1
+AVX_INSTR pmaxuw, 0, 0, 1
+AVX_INSTR pmaxud, 0, 0, 1
+AVX_INSTR pminsb, 0, 0, 1
+AVX_INSTR pminsw, 0, 0, 1
+AVX_INSTR pminsd, 0, 0, 1
+AVX_INSTR pminub, 0, 0, 1
+AVX_INSTR pminuw, 0, 0, 1
+AVX_INSTR pminud, 0, 0, 1
+AVX_INSTR pmovmskb, 0, 0, 0
+AVX_INSTR pmulhuw, 0, 0, 1
+AVX_INSTR pmulhrsw, 0, 0, 1
+AVX_INSTR pmulhw, 0, 0, 1
+AVX_INSTR pmullw, 0, 0, 1
+AVX_INSTR pmulld, 0, 0, 1
+AVX_INSTR pmuludq, 0, 0, 1
+AVX_INSTR pmuldq, 0, 0, 1
+AVX_INSTR por, 0, 0, 1
+AVX_INSTR psadbw, 0, 0, 1
+AVX_INSTR pshufb, 0, 0, 0
+AVX_INSTR pshufd, 0, 1, 0
+AVX_INSTR pshufhw, 0, 1, 0
+AVX_INSTR pshuflw, 0, 1, 0
+AVX_INSTR psignb, 0, 0, 0
+AVX_INSTR psignw, 0, 0, 0
+AVX_INSTR psignd, 0, 0, 0
+AVX_INSTR psllw, 0, 0, 0
+AVX_INSTR pslld, 0, 0, 0
+AVX_INSTR psllq, 0, 0, 0
+AVX_INSTR pslldq, 0, 0, 0
+AVX_INSTR psraw, 0, 0, 0
+AVX_INSTR psrad, 0, 0, 0
+AVX_INSTR psrlw, 0, 0, 0
+AVX_INSTR psrld, 0, 0, 0
+AVX_INSTR psrlq, 0, 0, 0
+AVX_INSTR psrldq, 0, 0, 0
+AVX_INSTR psubb, 0, 0, 0
+AVX_INSTR psubw, 0, 0, 0
+AVX_INSTR psubd, 0, 0, 0
+AVX_INSTR psubq, 0, 0, 0
+AVX_INSTR psubsb, 0, 0, 0
+AVX_INSTR psubsw, 0, 0, 0
+AVX_INSTR psubusb, 0, 0, 0
+AVX_INSTR psubusw, 0, 0, 0
+AVX_INSTR ptest, 0, 0, 0
+AVX_INSTR punpckhbw, 0, 0, 0
+AVX_INSTR punpckhwd, 0, 0, 0
+AVX_INSTR punpckhdq, 0, 0, 0
+AVX_INSTR punpckhqdq, 0, 0, 0
+AVX_INSTR punpcklbw, 0, 0, 0
+AVX_INSTR punpcklwd, 0, 0, 0
+AVX_INSTR punpckldq, 0, 0, 0
+AVX_INSTR punpcklqdq, 0, 0, 0
+AVX_INSTR pxor, 0, 0, 1
+AVX_INSTR shufps, 1, 1, 0
+AVX_INSTR subpd, 1, 0, 0
+AVX_INSTR subps, 1, 0, 0
+AVX_INSTR subsd, 1, 0, 0
+AVX_INSTR subss, 1, 0, 0
+AVX_INSTR unpckhpd, 1, 0, 0
+AVX_INSTR unpckhps, 1, 0, 0
+AVX_INSTR unpcklpd, 1, 0, 0
+AVX_INSTR unpcklps, 1, 0, 0
+AVX_INSTR xorpd, 1, 0, 1
+AVX_INSTR xorps, 1, 0, 1
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DN
+AVX_INSTR pfadd, 1, 0, 1
+AVX_INSTR pfsub, 1, 0, 0
+AVX_INSTR pfmul, 1, 0, 1
+
+; base-4 constants for shuffles
+; Defines q0000 .. q3333, where qABCD = (A<<6) | (B<<4) | (C<<2) | D; the
+; leading zeros are pasted back on so every name has exactly four digits.
+%assign i 0
+%rep 256
+    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+    %if j < 10
+        CAT_XDEFINE q000, j, i
+    %elif j < 100
+        CAT_XDEFINE q00, j, i
+    %elif j < 1000
+        CAT_XDEFINE q0, j, i
+    %else
+        CAT_XDEFINE q, j, i
+    %endif
+%assign i i+1
+%endrep
+%undef i
+%undef j
+
+; FMA_INSTR name, mul_op, add_op
+; Defines a four-operand macro "name dst, a, b, c". When the xop cpuflag is
+; set it emits the v-prefixed instruction with all four operands; otherwise
+; it emits mul_op dst, a, b followed by add_op dst, c.
+%macro FMA_INSTR 3
+    %macro %1 4-7 %1, %2, %3
+        %if cpuflag(xop)
+            v%5 %1, %2, %3, %4
+        %else
+            %6 %1, %2, %3
+            %7 %1, %4
+        %endif
+    %endmacro
+%endmacro
+
+FMA_INSTR  pmacsdd,  pmulld, paddd
+FMA_INSTR  pmacsww,  pmullw, paddw
+FMA_INSTR pmadcswd, pmaddwd, paddd
+
+; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
+; This lets us use tzcnt without bumping the yasm version requirement yet.
+%define tzcnt rep bsf
diff --git a/libs/libvpx/third_party/x86inc/LICENSE b/libs/libvpx/third_party/x86inc/LICENSE
new file mode 100644
index 0000000000..7d07645a17
--- /dev/null
+++ b/libs/libvpx/third_party/x86inc/LICENSE
@@ -0,0 +1,18 @@
+Copyright (C) 2005-2012 x264 project
+
+Authors: Loren Merritt <lorenm@u.washington.edu>
+         Anton Mitrofanov <BugMaster@narod.ru>
+         Jason Garrett-Glaser <darkshikari@gmail.com>
+         Henrik Gramner <hengar-6@student.ltu.se>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/libs/libvpx/third_party/x86inc/README.libvpx b/libs/libvpx/third_party/x86inc/README.libvpx
new file mode 100644
index 0000000000..e91e305a28
--- /dev/null
+++ b/libs/libvpx/third_party/x86inc/README.libvpx
@@ -0,0 +1,24 @@
+URL: http://git.videolan.org/?p=x264.git
+Version: a95584945dd9ce3acc66c6cd8f6796bc4404d40d
+License: ISC
+License File: LICENSE
+
+Description:
+x264/libav's framework for x86 assembly. Contains a variety of macros and
+defines that help automatically allow assembly to work cross-platform.
+
+Local Modifications:
+Get configuration from vpx_config.asm.
+Prefix functions with vpx by default.
+Manage name mangling (prefixing with '_') manually because 'PREFIX' does not
+  exist in libvpx.
+Expand PIC default to macho64 and respect CONFIG_PIC from libvpx
+Catch all elf formats for 'hidden' status and SECTION notes.
+Avoid 'amdnop' when building with nasm.
+Set 'private_extern' visibility for macho targets.
+Copy PIC 'GLOBAL' macros from x86_abi_support.asm
+Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
+Use .text with no alignment for aout
+Only use 'hidden' visibility with Chromium
+Move '%use smartalign' for nasm out of 'INIT_CPUFLAGS' and before
+  'ALIGNMODE'.
diff --git a/libs/libvpx/third_party/x86inc/x86inc.asm b/libs/libvpx/third_party/x86inc/x86inc.asm
new file mode 100644
index 0000000000..e7d3fa5ebb
--- /dev/null
+++ b/libs/libvpx/third_party/x86inc/x86inc.asm
@@ -0,0 +1,1613 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2015 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*          Anton Mitrofanov <BugMaster@narod.ru>
+;*          Fiona Glaser <fiona@x264.com>
+;*          Henrik Gramner <henrik@gramner.com>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible.  Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well.  Send patches or ideas
+; to x264-devel@videolan.org .
+
+%include "vpx_config.asm"
+
+%ifndef private_prefix ; symbol prefix used by cglobal/cextern/const; kept if already defined
+    %define private_prefix vpx
+%endif
+
+%ifndef public_prefix ; prefix used by cvisible; falls back to private_prefix
+    %define public_prefix private_prefix
+%endif
+
+%ifndef STACK_ALIGNMENT ; alignment ALLOC_STACK compares required_stack_alignment against
+    %if ARCH_X86_64
+        %define STACK_ALIGNMENT 16
+    %else
+        %define STACK_ALIGNMENT 4
+    %endif
+%endif
+
+%define WIN64  0 ; at most one of WIN64/UNIX64 is set to 1 below; both stay 0 when ARCH_X86_64 is 0
+%define UNIX64 0
+%if ARCH_X86_64
+    %ifidn __OUTPUT_FORMAT__,win32
+        %define WIN64  1
+    %elifidn __OUTPUT_FORMAT__,win64
+        %define WIN64  1
+    %elifidn __OUTPUT_FORMAT__,x64
+        %define WIN64  1
+    %else ; every other output format on x86-64 selects UNIX64
+        %define UNIX64 1
+    %endif
+%endif
+
+%ifidn   __OUTPUT_FORMAT__,elf32 ; mangle(x): x is left unchanged for the formats listed here
+    %define mangle(x) x
+%elifidn __OUTPUT_FORMAT__,elf64
+    %define mangle(x) x
+%elifidn __OUTPUT_FORMAT__,x64
+    %define mangle(x) x
+%elifidn __OUTPUT_FORMAT__,win64
+    %define mangle(x) x
+%else ; all remaining formats get a leading underscore pasted onto x
+    %define mangle(x) _ %+ x
+%endif
+
+; In some instances macho32 tables get misaligned when using .rodata.
+; When looking at the disassembly it appears that the offset is either
+; correct or consistently off by 90. Placing them in the .text section
+; works around the issue. It appears to be specific to the way libvpx
+; handles the tables.
+%macro SECTION_RODATA 0-1 16 ; optional %1 = section alignment in bytes (default 16)
+    %ifidn __OUTPUT_FORMAT__,macho32
+        SECTION .text align=%1 ; macho32 places the data in .text, see the note above
+        fakegot:
+    %elifidn __OUTPUT_FORMAT__,aout
+        SECTION .text ; aout: .text as well, and no align= attribute is passed
+    %else
+        SECTION .rodata align=%1
+    %endif
+%endmacro
+
+%macro SECTION_TEXT 0-1 16 ; optional %1 = section alignment in bytes (default 16)
+    %ifidn __OUTPUT_FORMAT__,aout
+        SECTION .text ; aout: no align= attribute is passed
+    %else
+        SECTION .text align=%1
+    %endif
+%endmacro
+
+; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
+; from original code is added in for 64bit.
+%ifidn __OUTPUT_FORMAT__,elf32 ; ABI_IS_32BIT = 1 for the four output formats tested here, 0 otherwise
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,macho32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,win32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,aout
+%define ABI_IS_32BIT 1
+%else
+%define ABI_IS_32BIT 0
+%endif
+
+%if ABI_IS_32BIT
+    %if CONFIG_PIC=1 ; GET_GOT is only defined here for PIC builds targeting elf32 or macho32
+        %ifidn __OUTPUT_FORMAT__,elf32
+            %define GET_GOT_DEFINED 1
+            %define WRT_PLT wrt ..plt
+            %macro GET_GOT 1 ; %1 = register that receives the GOT address; its old value is pushed first
+                extern _GLOBAL_OFFSET_TABLE_
+                push %1
+                call %%get_got ; pushes the address of %%sub_offset as return address
+                %%sub_offset:
+                jmp %%exitGG
+                %%get_got:
+                mov %1, [esp] ; %1 = that return address
+                add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
+                ret
+                %%exitGG:
+                %undef GLOBAL
+                %define GLOBAL(x) x + %1 wrt ..gotoff
+                %undef RESTORE_GOT
+                %define RESTORE_GOT pop %1 ; undoes the push at the top of GET_GOT
+            %endmacro
+        %elifidn __OUTPUT_FORMAT__,macho32
+            %define GET_GOT_DEFINED 1
+            %macro GET_GOT 1 ; %1 = register that receives an anchor address; its old value is pushed first
+                push %1
+                call %%get_got
+                %%get_got:
+                pop  %1 ; %1 = run-time address of %%get_got
+                %undef GLOBAL
+                %define GLOBAL(x) x + %1 - %%get_got ; x addressed relative to that anchor
+                %undef RESTORE_GOT
+                %define RESTORE_GOT pop %1 ; undoes the push at the top of GET_GOT
+            %endmacro
+        %else
+            %define GET_GOT_DEFINED 0
+        %endif
+    %endif
+
+    %if ARCH_X86_64 == 0 ; PIC is never left defined on this path when not building for x86-64
+        %undef PIC
+    %endif
+
+%else
+    %macro GET_GOT 1 ; expands to nothing on this path; GLOBAL() uses rel addressing instead
+    %endmacro
+    %define GLOBAL(x) rel x
+    %define WRT_PLT wrt ..plt
+
+    %if WIN64 ; PIC is forced for WIN64 and macho64, otherwise it follows CONFIG_PIC
+        %define PIC
+    %elifidn __OUTPUT_FORMAT__,macho64
+        %define PIC
+    %elif CONFIG_PIC
+        %define PIC
+    %endif
+%endif
+
+%ifnmacro GET_GOT ; nothing above defined GET_GOT: provide an empty macro and a pass-through GLOBAL()
+    %macro GET_GOT 1
+    %endmacro
+    %define GLOBAL(x) x
+%endif
+%ifndef RESTORE_GOT ; expands to nothing until a GET_GOT invocation redefines it
+    %define RESTORE_GOT
+%endif
+%ifndef WRT_PLT ; expands to nothing where no "wrt ..plt" variant was defined above
+    %define WRT_PLT
+%endif
+
+%ifdef PIC ; switch the assembler to rel as default addressing when PIC is defined
+    default rel
+%endif
+; Done with PIC macros
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = (optional) stack size to be allocated. The stack will be aligned before
+;      allocating the specified stack size. If the required stack alignment is
+;      larger than the known stack alignment the stack will be manually aligned
+;      and an extra register will be allocated to hold the original stack
+;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
+;      register as stack pointer, request a negative stack size.
+; %4+/%5+ = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,7,0x40, dst, src, tmp
+; declares a function (foo) that automatically loads two arguments (dst and
+; src) into registers, uses one additional register (tmp) plus 7 vector
+; registers (m0-m6) and allocates 0x40 bytes of stack space.
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Use this instead of RET if it's a branch target.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
+%macro DECLARE_REG 2-3 ; %1 = argument index N, %2 = register for rN, %3 = optional offset of the argument on the stack
+    %define r%1q %2
+    %define r%1d %2d
+    %define r%1w %2w
+    %define r%1b %2b
+    %define r%1h %2h
+    %if %0 == 2 ; no offset given: rNm/rNmp refer to the register itself
+        %define r%1m  %2d
+        %define r%1mp %2
+    %elif ARCH_X86_64 ; memory
+        %define r%1m [rstk + stack_offset + %3]
+        %define r%1mp qword r %+ %1 %+ m
+    %else
+        %define r%1m [rstk + stack_offset + %3]
+        %define r%1mp dword r %+ %1 %+ m
+    %endif
+    %define r%1  %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3 ; %1 = 16-bit register name (e.g. ax), %2 = its low-byte name, %3 = its high-byte name
+    %define r%1q r%1
+    %define e%1q r%1
+    %define r%1d e%1
+    %define e%1d e%1
+    %define r%1w %1
+    %define e%1w %1
+    %define r%1h %3
+    %define e%1h %3
+    %define r%1b %2
+    %define e%1b %2
+%if ARCH_X86_64 == 0 ; without 64-bit registers the r-prefixed name maps to the e-prefixed register
+    %define r%1  e%1
+%endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-* ; list of register indices: t0 = r<first>, t1 = r<second>, ...
+    %assign %%i 0
+    %rep %0
+        CAT_XDEFINE t, %%i, r%1
+        %assign %%i %%i+1
+        %rotate 1 ; bring the next index into %1
+    %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-* ; for each listed N, define tNq/d/w/h/b as tN pasted with the size suffix
+    %rep %0
+        %define t%1q t%1 %+ q
+        %define t%1d t%1 %+ d
+        %define t%1w t%1 %+ w
+        %define t%1h t%1 %+ h
+        %define t%1b t%1 %+ b
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64 ; gprsize = byte amount PUSH/POP add to or subtract from stack_offset
+    %define gprsize 8
+%else
+    %define gprsize 4
+%endif
+
+%macro PUSH 1 ; push %1 and keep stack_offset in sync
+    push %1
+    %ifidn rstk, rsp ; stack_offset is only tracked while rstk is rsp
+        %assign stack_offset stack_offset+gprsize
+    %endif
+%endmacro
+
+%macro POP 1 ; pop %1 and keep stack_offset in sync
+    pop %1
+    %ifidn rstk, rsp ; stack_offset is only tracked while rstk is rsp
+        %assign stack_offset stack_offset-gprsize
+    %endif
+%endmacro
+
+%macro PUSH_IF_USED 1-* ; list of register indices: PUSH every rN whose index N is below regs_used
+    %rep %0
+        %if %1 < regs_used
+            PUSH r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro POP_IF_USED 1-* ; list of register indices: pop every rN whose index N is below regs_used
+    %rep %0
+        %if %1 < regs_used
+            pop r%1 ; plain pop (not POP), so stack_offset is left unchanged
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-* ; list of argument indices: load rN from rNmp for every N below num_args
+    %rep %0
+        %if %1 < num_args
+            mov r%1, r %+ %1 %+ mp
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro SUB 2 ; sub %1, %2 that also tracks stack_offset
+    sub %1, %2
+    %ifidn %1, rstk ; destination is the current rstk: the stack grew by %2
+        %assign stack_offset stack_offset+(%2)
+    %endif
+%endmacro
+
+%macro ADD 2 ; add %1, %2 that also tracks stack_offset
+    add %1, %2
+    %ifidn %1, rstk ; destination is the current rstk: the stack shrank by %2
+        %assign stack_offset stack_offset-(%2)
+    %endif
+%endmacro
+
+%macro movifnidn 2 ; mov %1, %2, emitted only when the two operands are not textually identical
+    %ifnidn %1, %2
+        mov %1, %2
+    %endif
+%endmacro
+
+%macro movsxdifnidn 2 ; movsxd %1, %2, emitted only when the two operands are not textually identical
+    %ifnidn %1, %2
+        movsxd %1, %2
+    %endif
+%endmacro
+
+%macro ASSERT 1 ; %1 = preprocessor expression; assembling fails with an error when it evaluates to 0
+    %if (%1) == 0
+        %error assert failed
+    %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-* ; list of names: the k-th name (and its q/d/w/h/b/m/mp forms) becomes an alias of r<k>
+    %ifdef n_arg_names ; first undefine every alias created by the previous DEFINE_ARGS
+        %assign %%i 0
+        %rep n_arg_names
+            CAT_UNDEF arg_name %+ %%i, q
+            CAT_UNDEF arg_name %+ %%i, d
+            CAT_UNDEF arg_name %+ %%i, w
+            CAT_UNDEF arg_name %+ %%i, h
+            CAT_UNDEF arg_name %+ %%i, b
+            CAT_UNDEF arg_name %+ %%i, m
+            CAT_UNDEF arg_name %+ %%i, mp
+            CAT_UNDEF arg_name, %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+
+    %xdefine %%stack_offset stack_offset
+    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+    %assign %%i 0
+    %rep %0
+        %xdefine %1q r %+ %%i %+ q
+        %xdefine %1d r %+ %%i %+ d
+        %xdefine %1w r %+ %%i %+ w
+        %xdefine %1h r %+ %%i %+ h
+        %xdefine %1b r %+ %%i %+ b
+        %xdefine %1m r %+ %%i %+ m
+        %xdefine %1mp r %+ %%i %+ mp
+        CAT_XDEFINE arg_name, %%i, %1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+    %xdefine stack_offset %%stack_offset
+    %assign n_arg_names %0 ; remembered so that the next invocation knows how many aliases to undefine
+%endmacro
+
+%define required_stack_alignment ((mmsize + 15) & ~15) ; mmsize rounded up to a multiple of 16
+
+%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
+    %ifnum %1 ; nothing is emitted unless %1 is a number ...
+        %if %1 != 0 ; ... and that number is non-zero
+            %assign %%pad 0
+            %assign stack_size %1
+            %if stack_size < 0 ; the sign only selects where the old rsp is kept (see below); the size is the absolute value
+                %assign stack_size -stack_size
+            %endif
+            %if WIN64
+                %assign %%pad %%pad + 32 ; shadow space
+                %if mmsize != 8
+                    %assign xmm_regs_used %2
+                    %if xmm_regs_used > 8
+                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
+                    %endif
+                %endif
+            %endif
+            %if required_stack_alignment <= STACK_ALIGNMENT
+                ; maintain the current stack alignment
+                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+                SUB rsp, stack_size_padded
+            %else
+                %assign %%reg_num (regs_used - 1) ; the highest-numbered register in use becomes rstk
+                %xdefine rstk r %+ %%reg_num
+                ; align stack, and save original stack location directly above
+                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
+                ; stack in a single instruction (i.e. mov rsp, rstk or mov
+                ; rsp, [rsp+stack_size_padded])
+                %if %1 < 0 ; need to store rsp on stack
+                    %xdefine rstkm [rsp + stack_size + %%pad]
+                    %assign %%pad %%pad + gprsize
+                %else ; can keep rsp in rstk during whole function
+                    %xdefine rstkm rstk
+                %endif
+                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
+                mov rstk, rsp
+                and rsp, ~(required_stack_alignment-1)
+                sub rsp, stack_size_padded
+                movifnidn rstkm, rstk ; emits nothing when rstkm is rstk itself
+            %endif
+            WIN64_PUSH_XMM
+        %endif
+    %endif
+%endmacro
+
+%macro SETUP_STACK_POINTER 1 ; %1 = requested stack size (ignored unless numeric)
+    %ifnum %1
+        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT ; ALLOC_STACK will realign and needs an rstk register
+            %if %1 > 0 ; positive size: reserve one extra register to hold the original stack pointer
+                %assign regs_used (regs_used + 1)
+            %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
+                %warning "Stack pointer will overwrite register argument"
+            %endif
+        %endif
+    %endif
+%endmacro
+
+%macro DEFINE_ARGS_INTERNAL 3+ ; %1 = PROLOGUE's argument count, %2 = PROLOGUE's 4th argument, %3 = everything after it
+    %ifnum %2 ; 4th argument is a number (stack size): the names are in %3
+        DEFINE_ARGS %3
+    %elif %1 == 4 ; 4th argument is the only name
+        DEFINE_ARGS %2
+    %elif %1 > 4 ; 4th argument is the first of several names
+        DEFINE_ARGS %2, %3
+    %endif
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0,  rcx
+DECLARE_REG 1,  rdx
+DECLARE_REG 2,  R8
+DECLARE_REG 3,  R9
+DECLARE_REG 4,  R10, 40
+DECLARE_REG 5,  R11, 48
+DECLARE_REG 6,  rax, 56
+DECLARE_REG 7,  rdi, 64
+DECLARE_REG 8,  rsi, 72
+DECLARE_REG 9,  rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
+
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4 ; may raise regs_used by one, hence the upper-bound check comes after it
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+    ALLOC_STACK %4, %3
+    %if mmsize != 8 && stack_size == 0 ; no stack was allocated, so ALLOC_STACK did not save any xmm registers
+        WIN64_SPILL_XMM %3
+    %endif
+    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%macro WIN64_PUSH_XMM 0 ; store xmm6 and up, as far as xmm_regs_used requires
+    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+    %if xmm_regs_used > 6
+        movaps [rstk + stack_offset +  8], xmm6
+    %endif
+    %if xmm_regs_used > 7
+        movaps [rstk + stack_offset + 24], xmm7
+    %endif
+    %if xmm_regs_used > 8
+        %assign %%i 8
+        %rep xmm_regs_used-8 ; xmm8 and up go to 16-byte slots starting at rsp + stack_size + 32
+            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
+%macro WIN64_SPILL_XMM 1 ; %1 = number of xmm registers used (at most 16)
+    %assign xmm_regs_used %1
+    ASSERT xmm_regs_used <= 16
+    %if xmm_regs_used > 8
+        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
+        %assign %%pad (xmm_regs_used-8)*16 + 32
+        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+        SUB rsp, stack_size_padded
+    %endif
+    WIN64_PUSH_XMM
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 1 ; %1 = register used to address the save area (RET passes rsp)
+    %assign %%pad_size 0
+    %if xmm_regs_used > 8
+        %assign %%i xmm_regs_used
+        %rep xmm_regs_used-8 ; reload xmm8 and up, highest register first
+            %assign %%i %%i-1
+            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
+        %endrep
+    %endif
+    %if stack_size_padded > 0
+        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
+            mov rsp, rstkm ; the stack was realigned: restore the saved stack pointer
+        %else
+            add %1, stack_size_padded
+            %assign %%pad_size stack_size_padded ; %1 moved up, so compensate in the offsets below
+        %endif
+    %endif
+    %if xmm_regs_used > 7
+        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+    %endif
+    %if xmm_regs_used > 6
+        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
+    %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 1 ; like the _INTERNAL variant, but also resets the bookkeeping afterwards
+    WIN64_RESTORE_XMM_INTERNAL %1
+    %assign stack_offset (stack_offset-stack_size_padded)
+    %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 ; true when RET emits more than the return itself
+
+%macro RET 0 ; undo PROLOGUE (xmm registers, stack, pushed GPRs) and return
+    WIN64_RESTORE_XMM_INTERNAL rsp
+    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+%if mmsize == 32 ; emitted whenever ymm registers are the selected register size
+    vzeroupper
+%endif
+    AUTO_REP_RET
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0,  rdi
+DECLARE_REG 1,  rsi
+DECLARE_REG 2,  rdx
+DECLARE_REG 3,  rcx
+DECLARE_REG 4,  R8
+DECLARE_REG 5,  R9
+DECLARE_REG 6,  rax, 8
+DECLARE_REG 7,  R10, 16
+DECLARE_REG 8,  R11, 24
+DECLARE_REG 9,  rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R12, 48
+DECLARE_REG 12, R13, 56
+DECLARE_REG 13, R14, 64
+DECLARE_REG 14, R15, 72
+
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4 ; may raise regs_used by one, hence the upper-bound check comes after it
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 9, 10, 11, 12, 13, 14
+    ALLOC_STACK %4
+    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 ; true when RET emits more than the return itself
+
+%macro RET 0 ; undo PROLOGUE (stack, pushed GPRs) and return
+%if stack_size_padded > 0
+%if required_stack_alignment > STACK_ALIGNMENT
+    mov rsp, rstkm ; the stack was realigned: restore the saved stack pointer
+%else
+    add rsp, stack_size_padded
+%endif
+%endif
+    POP_IF_USED 14, 13, 12, 11, 10, 9
+%if mmsize == 32 ; emitted whenever ymm registers are the selected register size
+    vzeroupper
+%endif
+    AUTO_REP_RET
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-* ; list of argument indices N: define only the stack forms rNm/rNmp, no register
+    %rep %0
+        %define r%1m [rstk + stack_offset + 4*%1 + 4]
+        %define r%1mp dword r%1m
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    %if num_args > 7 ; only r0-r6 are declared as registers here, so clamp both counts to 7
+        %assign num_args 7
+    %endif
+    %if regs_used > 7
+        %assign regs_used 7
+    %endif
+    SETUP_STACK_POINTER %4 ; may raise regs_used by one, hence the upper-bound check comes after it
+    ASSERT regs_used <= 7
+    PUSH_IF_USED 3, 4, 5, 6
+    ALLOC_STACK %4
+    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 ; true when RET emits more than the return itself
+
+%macro RET 0 ; undo PROLOGUE (stack, pushed GPRs) and return
+%if stack_size_padded > 0
+%if required_stack_alignment > STACK_ALIGNMENT
+    mov rsp, rstkm ; the stack was realigned: restore the saved stack pointer
+%else
+    add rsp, stack_size_padded
+%endif
+%endif
+    POP_IF_USED 6, 5, 4, 3
+%if mmsize == 32 ; emitted whenever ymm registers are the selected register size
+    vzeroupper
+%endif
+    AUTO_REP_RET
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0 ; outside WIN64 the xmm save/restore helpers exist but expand to nothing
+%macro WIN64_SPILL_XMM 1
+%endmacro
+%macro WIN64_RESTORE_XMM 1
+%endmacro
+%macro WIN64_PUSH_XMM 0
+%endmacro
+%endif
+
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
+%macro REP_RET 0 ; return that is safe to place at a branch target
+    %if has_epilogue ; RET emits epilogue code before the return in this case
+        RET
+    %else
+        rep ret
+    %endif
+%endmacro
+
+%define last_branch_adr $$ ; redefined by the BRANCH_INSTR wrappers to the address right after the latest branch
+%macro AUTO_REP_RET 0 ; ret, prefixed with rep when it directly follows a wrapped branch instruction
+    %ifndef cpuflags
+        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
+    %elif notcpuflag(ssse3) ; with ssse3 selected the prefix is never emitted (see the note above)
+        times ((last_branch_adr-$)>>31)+1 rep
+    %endif
+    ret
+%endmacro
+
+%macro BRANCH_INSTR 0-* ; list of jump mnemonics: wrap each one in a macro of the same name
+    %rep %0
+        %macro %1 1-2 %1 ; wrapper: %1 = jump target, %2 defaults to the wrapped mnemonic
+            %2 %1
+            %%branch_instr:
+            %xdefine last_branch_adr %%branch_instr ; remember the address right after this branch for AUTO_REP_RET
+        %endmacro
+        %rotate 1
+    %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
+%macro TAIL_CALL 2 ; callee, is_nonadjacent
+    %if has_epilogue ; an epilogue must still run here, so make a real call and return afterwards
+        call %1
+        RET
+    %elif %2 ; no epilogue: jump to the callee; nothing is emitted when is_nonadjacent is 0
+        jmp %1
+    %endif
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
+; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
+    cglobal_internal 1, %1 %+ SUFFIX, %2 ; 1 selects private_prefix
+%endmacro
+%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
+    cglobal_internal 0, %1 %+ SUFFIX, %2 ; 0 selects public_prefix
+%endmacro
+%macro cglobal_internal 2-3+ ; %1 = 1 for private / 0 for public prefix, %2 = name including cpu suffix, %3 = PROLOGUE args
+    %if %1
+        %xdefine %%FUNCTION_PREFIX private_prefix
+        ; libvpx explicitly sets visibility in shared object builds. Avoid
+        ; setting visibility to hidden as it may break builds that split
+        ; sources on e.g., directory boundaries.
+        %ifdef CHROMIUM
+            %xdefine %%VISIBILITY hidden
+        %else
+            %xdefine %%VISIBILITY
+        %endif
+    %else
+        %xdefine %%FUNCTION_PREFIX public_prefix
+        %xdefine %%VISIBILITY
+    %endif
+    %ifndef cglobaled_%2 ; apply prefix and mangling only the first time this name is declared
+        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
+        %xdefine %2.skip_prologue %2 %+ .skip_prologue
+        CAT_XDEFINE cglobaled_, %2, 1
+    %endif
+    %xdefine current_function %2
+    %ifidn __OUTPUT_FORMAT__,elf32
+        global %2:function %%VISIBILITY
+    %elifidn __OUTPUT_FORMAT__,elf64
+        global %2:function %%VISIBILITY
+    %elifidn __OUTPUT_FORMAT__,macho32
+        %ifdef __NASM_VER__ ; the private_extern qualifier is only emitted when __NASM_VER__ is not defined
+            global %2
+        %else
+            global %2:private_extern
+        %endif
+    %elifidn __OUTPUT_FORMAT__,macho64
+        %ifdef __NASM_VER__ ; same as for macho32
+            global %2
+        %else
+            global %2:private_extern
+        %endif
+    %else
+        global %2
+    %endif
+    align function_align
+    %2:
+    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
+    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+    %assign stack_offset 0      ; stack pointer offset relative to the return address
+    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
+    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+    %ifnidn %3, "" ; "" is the placeholder cglobal/cvisible pass when no PROLOGUE args were given
+        PROLOGUE %3
+    %endif
+%endmacro
+
+%macro cextern 1 ; declare external symbol %1, with private_prefix and mangling applied
+    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+    CAT_XDEFINE cglobaled_, %1, 1 ; marks the name as declared, so cglobal_internal will not prefix it again
+    extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1 ; declare external symbol %1, with mangling only
+    %xdefine %1 mangle(%1)
+    CAT_XDEFINE cglobaled_, %1, 1 ; marks the name as declared, so cglobal_internal will not prefix it again
+    extern %1
+%endmacro
+
+%macro const 1-2+ ; %1 = name, %2 = optional data definition placed after the label
+    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+    %ifidn __OUTPUT_FORMAT__,elf32 ; ELF formats export the symbol as hidden data
+        global %1:data hidden
+    %elifidn __OUTPUT_FORMAT__,elf64
+        global %1:data hidden
+    %else
+        global %1
+    %endif
+    %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is
+; executable by default.
+%ifidn __OUTPUT_FORMAT__,elf32
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%elifidn __OUTPUT_FORMAT__,elf64
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+; cpuflags
+
+%assign cpuflags_mmx      (1<<0)
+%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
+%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
+%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
+%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
+%assign cpuflags_sse2     (1<<5) | cpuflags_sse
+%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
+%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
+%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
+%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
+%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
+%assign cpuflags_avx      (1<<11)| cpuflags_sse42
+%assign cpuflags_xop      (1<<12)| cpuflags_avx
+%assign cpuflags_fma4     (1<<13)| cpuflags_avx
+%assign cpuflags_fma3     (1<<14)| cpuflags_avx
+%assign cpuflags_avx2     (1<<15)| cpuflags_fma3
+
+%assign cpuflags_cache32  (1<<16)
+%assign cpuflags_cache64  (1<<17)
+%assign cpuflags_slowctz  (1<<18)
+%assign cpuflags_lzcnt    (1<<19)
+%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<21)
+%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
+%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1
+
+%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
+%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
+
+%ifdef __NASM_VER__
+    %use smartalign
+%endif
+
+; Takes an arbitrary number of cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-* ; list of cpuflag names; with no arguments SUFFIX is empty and cpuflags is 0
+    %xdefine SUFFIX
+    %undef cpuname
+    %assign cpuflags 0
+
+    %if %0 >= 1
+        %rep %0 ; cpuname = the given names joined with underscores, cpuflags = OR of their masks
+            %ifdef cpuname
+                %xdefine cpuname cpuname %+ _%1
+            %else
+                %xdefine cpuname %1
+            %endif
+            %assign cpuflags cpuflags | cpuflags_%1
+            %rotate 1
+        %endrep
+        %xdefine SUFFIX _ %+ cpuname
+
+        %if cpuflag(avx)
+            %assign avx_enabled 1
+        %endif
+        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) ; fall back to the ps forms of the moves
+            %define mova movaps
+            %define movu movups
+            %define movnta movntps
+        %endif
+        %if cpuflag(aligned) ; "aligned" function variant: unaligned moves become aligned ones
+            %define movu mova
+        %elif cpuflag(sse3) && notcpuflag(ssse3)
+            %define movu lddqu
+        %endif
+    %endif
+
+    %ifdef __NASM_VER__ ; nasm: smartalign's ALIGNMODE; otherwise select the nop style through the CPU directive
+        ALIGNMODE k7
+    %elif ARCH_X86_64 || cpuflag(sse2)
+        CPU amdnop
+    %else
+        CPU basicnop
+    %endif
+%endmacro
+
+; Merge mmx and sse*
+; m# is a simd register of the currently selected size
+; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
+; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
+; (All 3 remain in sync through SWAP.)
+
+%macro CAT_XDEFINE 3 ; xdefine the symbol named by %1 and %2 pasted together as %3
+    %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2 ; undefine the symbol named by %1 and %2 pasted together
+    %undef %1%2
+%endmacro
+
+%macro INIT_MMX 0-1+ ; optional cpuflag list: select the 8-byte mm registers as m0-m7
+    %assign avx_enabled 0
+    %define RESET_MM_PERMUTATION INIT_MMX %1 ; re-running this macro restores the identity register mapping
+    %define mmsize 8
+    %define num_mmregs 8
+    %define mova movq
+    %define movu movq
+    %define movh movd
+    %define movnta movntq
+    %assign %%i 0
+    %rep 8
+    CAT_XDEFINE m, %%i, mm %+ %%i
+    CAT_XDEFINE nnmm, %%i, %%i
+    %assign %%i %%i+1
+    %endrep
+    %rep 8 ; %%i continues at 8 here: m8-m15 and nnmm8-nnmm15 are undefined
+    CAT_UNDEF m, %%i
+    CAT_UNDEF nnmm, %%i
+    %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_XMM 0-1+ ; optional cpuflag list: select the 16-byte xmm registers as m0..m(num_mmregs-1)
+    %assign avx_enabled 0
+    %define RESET_MM_PERMUTATION INIT_XMM %1 ; re-running this macro restores the identity register mapping
+    %define mmsize 16
+    %define num_mmregs 8
+    %if ARCH_X86_64 ; 16 registers instead of 8 on x86-64
+    %define num_mmregs 16
+    %endif
+    %define mova movdqa
+    %define movu movdqu
+    %define movh movq
+    %define movnta movntdq
+    %assign %%i 0
+    %rep num_mmregs
+    CAT_XDEFINE m, %%i, xmm %+ %%i
+    CAT_XDEFINE nnxmm, %%i, %%i
+    %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_YMM 0-1+ ; optional cpuflag list: select the 32-byte ymm registers as m0..m(num_mmregs-1)
+    %assign avx_enabled 1
+    %define RESET_MM_PERMUTATION INIT_YMM %1 ; re-running this macro restores the identity register mapping
+    %define mmsize 32
+    %define num_mmregs 8
+    %if ARCH_X86_64 ; 16 registers instead of 8 on x86-64
+    %define num_mmregs 16
+    %endif
+    %define mova movdqa
+    %define movu movdqu
+    %undef movh ; no movh alias is provided in ymm mode
+    %define movnta movntdq
+    %assign %%i 0
+    %rep num_mmregs
+    CAT_XDEFINE m, %%i, ymm %+ %%i
+    CAT_XDEFINE nnymm, %%i, %%i
+    %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+INIT_XMM
+
+%macro DECLARE_MMCAST 1 ; for register index %1: each <prefix><register>%1 name resolves to the narrower of the two register kinds
+    %define  mmmm%1   mm%1
+    %define  mmxmm%1  mm%1
+    %define  mmymm%1  mm%1
+    %define xmmmm%1   mm%1
+    %define xmmxmm%1 xmm%1
+    %define xmmymm%1 xmm%1
+    %define ymmmm%1   mm%1
+    %define ymmxmm%1 xmm%1
+    %define ymmymm%1 ymm%1
+    %define xm%1 xmm %+ m%1
+    %define ym%1 ymm %+ m%1
+%endmacro
+
+%assign i 0
+%rep 16 ; declare the casts for register indices 0-15
+    DECLARE_MMCAST i
+%assign i i+1
+%endrep
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+%rep %0/2 ; pass 1: snapshot the current m%2 of every pair before anything is redefined
+    %xdefine %%tmp%2 m%2
+    %rotate 2
+%endrep
+%rep %0/2 ; pass 2: m%1 takes the snapshot of m%2, and index %1 is recorded under the nn-prefixed name
+    %xdefine m%1 %%tmp%2
+    CAT_XDEFINE nn, m%1, %1
+    %rotate 2
+%endrep
+%endmacro
+
+%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs); takes indices or register names
+%ifnum %1 ; SWAP 0, 1, ...
+    SWAP_INTERNAL_NUM %1, %2
+%else ; SWAP m0, m1, ...
+    SWAP_INTERNAL_NAME %1, %2
+%endif
+%endmacro
+
+%macro SWAP_INTERNAL_NUM 2-* ; arguments are register indices
+    %rep %0-1 ; exchange m%1 and m%2, then rotate one position along the chain
+        %xdefine %%tmp m%1
+        %xdefine m%1 m%2
+        %xdefine m%2 %%tmp
+        CAT_XDEFINE nn, m%1, %1
+        CAT_XDEFINE nn, m%2, %2
+    %rotate 1
+    %endrep
+%endmacro
+
+%macro SWAP_INTERNAL_NAME 2-* ; arguments are register names; each is looked up through its nn-prefixed define
+    %xdefine %%args nn %+ %1
+    %rep %0-1 ; append the lookup of each following name to the argument list
+        %xdefine %%args %%args, nn %+ %2
+    %rotate 1
+    %endrep
+    SWAP_INTERNAL_NUM %%args
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1 ; optional %1: name to save the permutation under
+    %if %0
+        %xdefine %%f %1_m
+    %else ; no name given: save under current_function
+        %xdefine %%f current_function %+ _m
+    %endif
+    %assign %%i 0
+    %rep num_mmregs ; <name>_m# = current definition of m#
+        CAT_XDEFINE %%f, %%i, m %+ %%i
+    %assign %%i %%i+1
+    %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
+    %ifdef %1_m0 ; does nothing unless <name>_m0 is defined
+        %assign %%i 0
+        %rep num_mmregs ; m# = <name>_m#, and index # is recorded under the nn-prefixed name
+            CAT_XDEFINE m, %%i, %1_m %+ %%i
+            CAT_XDEFINE nn, m %+ %%i, %%i
+        %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1 ; %1 = callee name; resolved by call_internal against the plain and SUFFIX-appended forms
+    call_internal %1, %1 %+ SUFFIX
+%endmacro
+%macro call_internal 2 ; %1 = plain name, %2 = name with SUFFIX appended
+    %xdefine %%i %1
+    %ifndef cglobaled_%1 ; plain name has no cglobaled_ define: use the suffixed name if that one does
+        %ifdef cglobaled_%2
+            %xdefine %%i %2
+        %endif
+    %endif
+    call %%i
+    LOAD_MM_PERMUTATION %%i
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
+%macro add 2 ; emits add %1, %2, except that a numeric %2 equal to 128 becomes sub %1, -128
+    %ifnum %2
+        %if %2==128 ; -128 fits a sign-extended 8-bit immediate, +128 does not
+            sub %1, -128
+        %else
+            add %1, %2
+        %endif
+    %else
+        add %1, %2
+    %endif
+%endmacro
+
+%macro sub 2 ; emits sub %1, %2, except that a numeric %2 equal to 128 becomes add %1, -128
+    %ifnum %2
+        %if %2==128 ; -128 fits a sign-extended 8-bit immediate, +128 does not
+            add %1, -128
+        %else
+            sub %1, %2
+        %endif
+    %else
+        sub %1, %2
+    %endif
+%endmacro
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 16 ; sizeofmm0-7 = 8, sizeofxmm0-15 = 16, sizeofymm0-15 = 32
+    %if i < 8
+        CAT_XDEFINE sizeofmm, i, 8
+    %endif
+    CAT_XDEFINE sizeofxmm, i, 16
+    CAT_XDEFINE sizeofymm, i, 32
+%assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-* ; %1 = instruction text for the error, %2 = dst, %3+ = operands that must not be identical to dst
+    %xdefine %%opcode %1
+    %xdefine %%dst %2
+    %rep %0-2 ; compare dst against each remaining operand in turn
+        %ifidn %%dst, %3
+            %error non-avx emulation of ``%%opcode'' is not supported
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+;%6+: operands
+%macro RUN_AVX_INSTR 6-9+
+    %ifnum sizeof%7 ; operand size: from %7 if it has a numeric sizeof define, else from %6, else mmsize
+        %assign __sizeofreg sizeof%7
+    %elifnum sizeof%6
+        %assign __sizeofreg sizeof%6
+    %else
+        %assign __sizeofreg mmsize
+    %endif
+    %assign __emulate_avx 0
+    %if avx_enabled && __sizeofreg >= 16 ; use the v-prefixed mnemonic
+        %xdefine __instr v%1
+    %else
+        %xdefine __instr %1
+        %if %0 >= 8+%4 ; at least 3 operands (4 when %4 is 1): emulate with a mov followed by the instruction
+            %assign __emulate_avx 1
+        %endif
+    %endif
+    %ifnidn %2, fnord ; fnord means no minimal instruction set was given, so nothing is checked
+        %ifdef cpuname
+            %if notcpuflag(%2)
+                %error use of ``%1'' %2 instruction in cpuname function: current_function
+            %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
+                %error use of ``%1'' sse2 instruction in cpuname function: current_function
+            %endif
+        %endif
+    %endif
+
+    %if __emulate_avx
+        %xdefine __src1 %7
+        %xdefine __src2 %8
+        %ifnidn %6, %7 ; dst differs from src1: dst must first be loaded with a copy of a source
+            %if %0 >= 9
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9
+            %else
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8
+            %endif
+            %if %5 && %4 == 0
+                %ifnid %8
+                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
+                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
+                    ; So, if the instruction is commutative with a memory arg, swap them.
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
+                %endif
+            %endif
+            %if __sizeofreg == 8
+                MOVQ %6, __src1
+            %elif %3
+                MOVAPS %6, __src1
+            %else
+                MOVDQA %6, __src1
+            %endif
+        %endif
+        %if %0 >= 9 ; pass the trailing operand %9 through when present
+            %1 %6, __src2, %9
+        %else
+            %1 %6, __src2
+        %endif
+    %elif %0 >= 9
+        __instr %6, %7, %8, %9
+    %elif %0 == 8
+        __instr %6, %7, %8
+    %elif %0 == 7
+        __instr %6, %7
+    %else
+        __instr %6
+    %endif
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-5 fnord, 0, 1, 0
+    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
+        %ifidn %2, fnord ; 1 operand given (operand slots that are not supplied default to fnord)
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
+        %elifidn %3, fnord ; 2 operands
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
+        %elifidn %4, fnord ; 3 operands
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
+        %elifidn %5, fnord ; 4 operands
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
+        %else ; 5 operands
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
+        %endif
+    %endmacro
+%endmacro
+
+; Instructions with both VEX and non-VEX encodings
+; Non-destructive instructions are written without parameters
+AVX_INSTR addpd, sse2, 1, 0, 1
+AVX_INSTR addps, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 1
+AVX_INSTR addss, sse, 1, 0, 1
+AVX_INSTR addsubpd, sse3, 1, 0, 0
+AVX_INSTR addsubps, sse3, 1, 0, 0
+AVX_INSTR aesdec, fnord, 0, 0, 0
+AVX_INSTR aesdeclast, fnord, 0, 0, 0
+AVX_INSTR aesenc, fnord, 0, 0, 0
+AVX_INSTR aesenclast, fnord, 0, 0, 0
+AVX_INSTR aesimc
+AVX_INSTR aeskeygenassist
+AVX_INSTR andnpd, sse2, 1, 0, 0
+AVX_INSTR andnps, sse, 1, 0, 0
+AVX_INSTR andpd, sse2, 1, 0, 1
+AVX_INSTR andps, sse, 1, 0, 1
+AVX_INSTR blendpd, sse4, 1, 0, 0
+AVX_INSTR blendps, sse4, 1, 0, 0
+AVX_INSTR blendvpd, sse4, 1, 0, 0
+AVX_INSTR blendvps, sse4, 1, 0, 0
+AVX_INSTR cmppd, sse2, 1, 1, 0
+AVX_INSTR cmpps, sse, 1, 1, 0
+AVX_INSTR cmpsd, sse2, 1, 1, 0
+AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR comisd, sse2
+AVX_INSTR comiss, sse
+AVX_INSTR cvtdq2pd, sse2
+AVX_INSTR cvtdq2ps, sse2
+AVX_INSTR cvtpd2dq, sse2
+AVX_INSTR cvtpd2ps, sse2
+AVX_INSTR cvtps2dq, sse2
+AVX_INSTR cvtps2pd, sse2
+AVX_INSTR cvtsd2si, sse2
+AVX_INSTR cvtsd2ss, sse2
+AVX_INSTR cvtsi2sd, sse2
+AVX_INSTR cvtsi2ss, sse
+AVX_INSTR cvtss2sd, sse2
+AVX_INSTR cvtss2si, sse
+AVX_INSTR cvttpd2dq, sse2
+AVX_INSTR cvttps2dq, sse2
+AVX_INSTR cvttsd2si, sse2
+AVX_INSTR cvttss2si, sse
+AVX_INSTR divpd, sse2, 1, 0, 0
+AVX_INSTR divps, sse, 1, 0, 0
+AVX_INSTR divsd, sse2, 1, 0, 0
+AVX_INSTR divss, sse, 1, 0, 0
+AVX_INSTR dppd, sse4, 1, 1, 0
+AVX_INSTR dpps, sse4, 1, 1, 0
+AVX_INSTR extractps, sse4
+AVX_INSTR haddpd, sse3, 1, 0, 0
+AVX_INSTR haddps, sse3, 1, 0, 0
+AVX_INSTR hsubpd, sse3, 1, 0, 0
+AVX_INSTR hsubps, sse3, 1, 0, 0
+AVX_INSTR insertps, sse4, 1, 1, 0
+AVX_INSTR lddqu, sse3
+AVX_INSTR ldmxcsr, sse
+AVX_INSTR maskmovdqu, sse2
+AVX_INSTR maxpd, sse2, 1, 0, 1
+AVX_INSTR maxps, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 1
+AVX_INSTR maxss, sse, 1, 0, 1
+AVX_INSTR minpd, sse2, 1, 0, 1
+AVX_INSTR minps, sse, 1, 0, 1
+AVX_INSTR minsd, sse2, 1, 0, 1
+AVX_INSTR minss, sse, 1, 0, 1
+AVX_INSTR movapd, sse2
+AVX_INSTR movaps, sse
+AVX_INSTR movd, mmx
+AVX_INSTR movddup, sse3
+AVX_INSTR movdqa, sse2
+AVX_INSTR movdqu, sse2
+AVX_INSTR movhlps, sse, 1, 0, 0
+AVX_INSTR movhpd, sse2, 1, 0, 0
+AVX_INSTR movhps, sse, 1, 0, 0
+AVX_INSTR movlhps, sse, 1, 0, 0
+AVX_INSTR movlpd, sse2, 1, 0, 0
+AVX_INSTR movlps, sse, 1, 0, 0
+AVX_INSTR movmskpd, sse2
+AVX_INSTR movmskps, sse
+AVX_INSTR movntdq, sse2
+AVX_INSTR movntdqa, sse4
+AVX_INSTR movntpd, sse2
+AVX_INSTR movntps, sse
+AVX_INSTR movq, mmx
+AVX_INSTR movsd, sse2, 1, 0, 0
+AVX_INSTR movshdup, sse3
+AVX_INSTR movsldup, sse3
+AVX_INSTR movss, sse, 1, 0, 0
+AVX_INSTR movupd, sse2
+AVX_INSTR movups, sse
+AVX_INSTR mpsadbw, sse4
+AVX_INSTR mulpd, sse2, 1, 0, 1
+AVX_INSTR mulps, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 1
+AVX_INSTR mulss, sse, 1, 0, 1
+AVX_INSTR orpd, sse2, 1, 0, 1
+AVX_INSTR orps, sse, 1, 0, 1
+AVX_INSTR pabsb, ssse3
+AVX_INSTR pabsd, ssse3
+AVX_INSTR pabsw, ssse3
+AVX_INSTR packsswb, mmx, 0, 0, 0
+AVX_INSTR packssdw, mmx, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR paddb, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
+AVX_INSTR paddd, mmx, 0, 0, 1
+AVX_INSTR paddq, sse2, 0, 0, 1
+AVX_INSTR paddsb, mmx, 0, 0, 1
+AVX_INSTR paddsw, mmx, 0, 0, 1
+AVX_INSTR paddusb, mmx, 0, 0, 1
+AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR palignr, ssse3
+AVX_INSTR pand, mmx, 0, 0, 1
+AVX_INSTR pandn, mmx, 0, 0, 0
+AVX_INSTR pavgb, mmx2, 0, 0, 1
+AVX_INSTR pavgw, mmx2, 0, 0, 1
+AVX_INSTR pblendvb, sse4, 0, 0, 0
+AVX_INSTR pblendw, sse4
+AVX_INSTR pclmulqdq
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pcmpeqb, mmx, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpeqd, mmx, 0, 0, 1
+AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpgtb, mmx, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpgtd, mmx, 0, 0, 0
+AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pextrb, sse4
+AVX_INSTR pextrd, sse4
+AVX_INSTR pextrq, sse4
+AVX_INSTR pextrw, mmx2
+AVX_INSTR phaddw, ssse3, 0, 0, 0
+AVX_INSTR phaddd, ssse3, 0, 0, 0
+AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phminposuw, sse4
+AVX_INSTR phsubw, ssse3, 0, 0, 0
+AVX_INSTR phsubd, ssse3, 0, 0, 0
+AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR pinsrb, sse4
+AVX_INSTR pinsrd, sse4
+AVX_INSTR pinsrq, sse4
+AVX_INSTR pinsrw, mmx2
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
+AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaxsb, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
+AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxub, mmx2, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
+AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pminsb, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
+AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminub, mmx2, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
+AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pmovmskb, mmx2
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxbd, sse4
+AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxwd, sse4
+AVX_INSTR pmovsxwq, sse4
+AVX_INSTR pmovsxdq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxbd, sse4
+AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxwd, sse4
+AVX_INSTR pmovzxwq, sse4
+AVX_INSTR pmovzxdq, sse4
+AVX_INSTR pmuldq, sse4, 0, 0, 1
+AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
+AVX_INSTR pmulhuw, mmx2, 0, 0, 1
+AVX_INSTR pmulhw, mmx, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
+AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmuludq, sse2, 0, 0, 1
+AVX_INSTR por, mmx, 0, 0, 1
+AVX_INSTR psadbw, mmx2, 0, 0, 1
+AVX_INSTR pshufb, ssse3, 0, 0, 0
+AVX_INSTR pshufd, sse2
+AVX_INSTR pshufhw, sse2
+AVX_INSTR pshuflw, sse2
+AVX_INSTR psignb, ssse3, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
+AVX_INSTR psignd, ssse3, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR pslld, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR pslldq, sse2, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psrad, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psrld, mmx, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psubb, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
+AVX_INSTR psubd, mmx, 0, 0, 0
+AVX_INSTR psubq, sse2, 0, 0, 0
+AVX_INSTR psubsb, mmx, 0, 0, 0
+AVX_INSTR psubsw, mmx, 0, 0, 0
+AVX_INSTR psubusb, mmx, 0, 0, 0
+AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR ptest, sse4
+AVX_INSTR punpckhbw, mmx, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
+AVX_INSTR punpckhdq, mmx, 0, 0, 0
+AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklbw, mmx, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
+AVX_INSTR punpckldq, mmx, 0, 0, 0
+AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR pxor, mmx, 0, 0, 1
+AVX_INSTR rcpps, sse, 1, 0, 0
+AVX_INSTR rcpss, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4
+AVX_INSTR roundps, sse4
+AVX_INSTR roundsd, sse4
+AVX_INSTR roundss, sse4
+AVX_INSTR rsqrtps, sse, 1, 0, 0
+AVX_INSTR rsqrtss, sse, 1, 0, 0
+AVX_INSTR shufpd, sse2, 1, 1, 0
+AVX_INSTR shufps, sse, 1, 1, 0
+AVX_INSTR sqrtpd, sse2, 1, 0, 0
+AVX_INSTR sqrtps, sse, 1, 0, 0
+AVX_INSTR sqrtsd, sse2, 1, 0, 0
+AVX_INSTR sqrtss, sse, 1, 0, 0
+AVX_INSTR stmxcsr, sse
+AVX_INSTR subpd, sse2, 1, 0, 0
+AVX_INSTR subps, sse, 1, 0, 0
+AVX_INSTR subsd, sse2, 1, 0, 0
+AVX_INSTR subss, sse, 1, 0, 0
+AVX_INSTR ucomisd, sse2
+AVX_INSTR ucomiss, sse
+AVX_INSTR unpckhpd, sse2, 1, 0, 0
+AVX_INSTR unpckhps, sse, 1, 0, 0
+AVX_INSTR unpcklpd, sse2, 1, 0, 0
+AVX_INSTR unpcklps, sse, 1, 0, 0
+AVX_INSTR xorpd, sse2, 1, 0, 1
+AVX_INSTR xorps, sse, 1, 0, 1
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DN
+AVX_INSTR pfadd, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
+AVX_INSTR pfmul, 3dnow, 1, 0, 1
+
+; base-4 constants for shuffles
+%assign i 0
+%rep 256 ; define q<abcd> = i, where abcd are the four base-4 digits of i (zero-padded to 4 digits)
+    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+    %if j < 10
+        CAT_XDEFINE q000, j, i
+    %elif j < 100
+        CAT_XDEFINE q00, j, i
+    %elif j < 1000
+        CAT_XDEFINE q0, j, i
+    %else
+        CAT_XDEFINE q, j, i
+    %endif
+%assign i i+1
+%endrep
+%undef i
+%undef j
+
+%macro FMA_INSTR 3 ; %1 = instruction to define, %2 then %3 = the two instructions emitted in its place when xop is not enabled
+    %macro %1 4-7 %1, %2, %3
+        %if cpuflag(xop)
+            v%5 %1, %2, %3, %4
+        %elifnidn %1, %4 ; the emulation writes %1 before it reads %4, so the two must differ
+            %6 %1, %2, %3
+            %7 %1, %4
+        %else
+            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
+        %endif
+    %endmacro
+%endmacro
+
+FMA_INSTR  pmacsww,  pmullw, paddw
+FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmadcswd, pmaddwd, paddd
+
+; convert FMA4 to FMA3 if possible
+%macro FMA4_INSTR 4 ; %1 = FMA4 instruction to define, %2-%4 = the three instructions tried when fma4 is not enabled
+    %macro %1 4-8 %1, %2, %3, %4
+        %if cpuflag(fma4)
+            v%5 %1, %2, %3, %4
+        %elifidn %1, %2 ; dst is the same as the second operand
+            v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
+        %elifidn %1, %3 ; dst is the same as the third operand
+            v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
+        %elifidn %1, %4 ; dst is the same as the fourth operand
+            v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
+        %else
+            %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
+        %endif
+    %endmacro
+%endmacro
+
+FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
+FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
+FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
+FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss
+
+FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
+FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
+FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
+FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps
+
+FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
+FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
+FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
+FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss
+
+FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
+FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
+FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
+FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss
+
+FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
+FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
+FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
+FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss
+
+; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug
+%if ARCH_X86_64 == 0
+%macro vpbroadcastq 2 ; %1 = destination register, %2 = source operand
+%if sizeof%1 == 16 ; destination has a 16-byte sizeof define
+    movddup %1, %2
+%else
+    vbroadcastsd %1, %2
+%endif
+%endmacro
+%endif
diff --git a/libs/libvpx/tools/all_builds.py b/libs/libvpx/tools/all_builds.py
new file mode 100755
index 0000000000..d1f0c80c03
--- /dev/null
+++ b/libs/libvpx/tools/all_builds.py
@@ -0,0 +1,72 @@
+#!/usr/bin/python
+
+import getopt
+import subprocess
+import sys
+
+LONG_OPTIONS = ["shard=", "shards="]
+BASE_COMMAND = "./configure --enable-internal-stats --enable-experimental"
+
+def RunCommand(command):
+  """Runs command via the shell; exits the script with status 1 on failure."""
+  returncode = subprocess.call(command, shell=True)
+  if returncode:
+    print("Non-zero return code: " + str(returncode) + " => exiting!")
+    sys.exit(1)
+
+def list_of_experiments():
+  """Returns the experiment names found in ./configure's EXPERIMENT_LIST."""
+  currently_broken = ["csm"]  # listed in configure but skipped here
+  experiments = []
+  list_start = False
+  with open("configure") as configure_file:  # closed even if parsing raises
+    for line in configure_file.read().split("\n"):
+      if line == 'EXPERIMENT_LIST="':
+        list_start = True
+      elif line == '"':
+        list_start = False
+      # Entries are taken from column 4 on (presumably a 4-space indent).
+      elif list_start and line[4:] not in currently_broken:
+        experiments.append(line[4:])
+  return experiments
+
+def main(argv):
+  """Configures and builds every experiment config assigned to this shard."""
+  # Parse arguments
+  options = {"--shard": 0, "--shards": 1}
+  if "--" in argv:
+    opt_end_index = argv.index("--")
+  else:
+    opt_end_index = len(argv)
+  try:
+    # "" (not None) so a stray short option raises GetoptError, not TypeError.
+    o, _ = getopt.getopt(argv[1:opt_end_index], "", LONG_OPTIONS)
+  except getopt.GetoptError as err:
+    print(str(err))
+    print("Usage: %s [--shard=<n> --shards=<n>] -- [configure flag ...]"%argv[0])
+    sys.exit(2)
+
+  options.update(o)
+  extra_args = argv[opt_end_index + 1:]
+
+  # Shard experiment list
+  shard = int(options["--shard"])
+  shards = int(options["--shards"])
+  experiments = list_of_experiments()
+  base_command = " ".join([BASE_COMMAND] + extra_args)
+  configs = [base_command]
+  configs += ["%s --enable-%s" % (base_command, e) for e in experiments]
+  my_configs = [c for i, c in enumerate(configs) if i % shards == shard]
+
+  # Run configs for this shard
+  for config in my_configs:
+    test_build(config)
+
+def test_build(configure_command):
+  """Configures with configure_command, then runs a clean build."""
+  print("\033[34m\033[47mTesting %s\033[0m" % (configure_command))
+  for command in (configure_command, "make clean", "make"):
+    RunCommand(command)
+
+if __name__ == "__main__":
+  main(sys.argv)
diff --git a/libs/libvpx/tools/author_first_release.sh b/libs/libvpx/tools/author_first_release.sh
new file mode 100755
index 0000000000..7b0b797212
--- /dev/null
+++ b/libs/libvpx/tools/author_first_release.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+##
+## List the release each author first contributed to.
+##
+## Usage: author_first_release.sh [TAGS]
+##
+## If the TAGS arguments are unspecified, all tags reported by `git tag`
+## will be considered.
+##
+tags=${@:-$(git tag)}
+for tag in $tags; do
+  git shortlog -n -e -s "$tag" |
+      cut -f2- |
+      awk -v release="${tag#v}" '{print release "\t" $0}'
+done | sort -k2  | uniq -f2
diff --git a/libs/libvpx/tools/cpplint.py b/libs/libvpx/tools/cpplint.py
new file mode 100755
index 0000000000..25fbef73d8
--- /dev/null
+++ b/libs/libvpx/tools/cpplint.py
@@ -0,0 +1,4756 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2009 Google Inc. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#    * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#    * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Does google-lint on c++ files.
+
+The goal of this script is to identify places in the code that *may*
+be in non-compliance with google style.  It does not attempt to fix
+up these problems -- the point is to educate.  It also does not
+attempt to find all problems, or to ensure that everything it does
+find is legitimately a problem.
+
+In particular, we can get very confused by /* and // inside strings!
+We do a small hack, which is to ignore //'s with "'s after them on the
+same line, but it is far from perfect (in either direction).
+"""
+
+import codecs
+import copy
+import getopt
+import math  # for log
+import os
+import re
+import sre_compile
+import string
+import sys
+import unicodedata
+
+
+_USAGE = """
+Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
+                   [--counting=total|toplevel|detailed] [--root=subdir]
+                   [--linelength=digits]
+        <file> [file] ...
+
+  The style guidelines this tries to follow are those in
+    http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
+
+  Every problem is given a confidence score from 1-5, with 5 meaning we are
+  certain of the problem, and 1 meaning it could be a legitimate construct.
+  This will miss some errors, and is not a substitute for a code review.
+
+  To suppress false-positive errors of a certain category, add a
+  'NOLINT(category)' comment to the line.  NOLINT or NOLINT(*)
+  suppresses errors of all categories on that line.
+
+  The files passed in will be linted; at least one file must be provided.
+  Default linted extensions are .cc, .cpp, .cu, .cuh and .h.  Change the
+  extensions with the --extensions flag.
+
+  Flags:
+
+    output=vs7
+      By default, the output is formatted to ease emacs parsing.  Visual Studio
+      compatible output (vs7) may also be used.  Other formats are unsupported.
+
+    verbose=#
+      Specify a number 0-5 to restrict errors to certain verbosity levels.
+
+    filter=-x,+y,...
+      Specify a comma-separated list of category-filters to apply: only
+      error messages whose category names pass the filters will be printed.
+      (Category names are printed with the message and look like
+      "[whitespace/indent]".)  Filters are evaluated left to right.
+      "-FOO" and "FOO" means "do not print categories that start with FOO".
+      "+FOO" means "do print categories that start with FOO".
+
+      Examples: --filter=-whitespace,+whitespace/braces
+                --filter=whitespace,runtime/printf,+runtime/printf_format
+                --filter=-,+build/include_what_you_use
+
+      To see a list of all the categories used in cpplint, pass no arg:
+         --filter=
+
+    counting=total|toplevel|detailed
+      The total number of errors found is always printed. If
+      'toplevel' is provided, then the count of errors in each of
+      the top-level categories like 'build' and 'whitespace' will
+      also be printed. If 'detailed' is provided, then a count
+      is provided for each category like 'build/class'.
+
+    root=subdir
+      The root directory used for deriving header guard CPP variable.
+      By default, the header guard CPP variable is calculated as the relative
+      path to the directory that contains .git, .hg, or .svn.  When this flag
+      is specified, the relative path is calculated from the specified
+      directory. If the specified directory does not exist, this flag is
+      ignored.
+
+      Examples:
+        Assuming that src/.git exists, the header guard CPP variables for
+        src/chrome/browser/ui/browser.h are:
+
+        No flag => CHROME_BROWSER_UI_BROWSER_H_
+        --root=chrome => BROWSER_UI_BROWSER_H_
+        --root=chrome/browser => UI_BROWSER_H_
+
+    linelength=digits
+      This is the allowed line length for the project. The default value is
+      80 characters.
+
+      Examples:
+        --linelength=120
+
+    extensions=extension,extension,...
+      The allowed file extensions that cpplint will check
+
+      Examples:
+        --extensions=hpp,cpp
+"""
+
+# We categorize each error message we print.  Here are the categories.
+# We want an explicit list so we can list them all in cpplint --filter=.
+# If you add a new error message with a new category, add it to the list
+# here!  cpplint_unittest.py should tell you if you forget to do this.
+_ERROR_CATEGORIES = [
+  'build/class',
+  'build/deprecated',
+  'build/endif_comment',
+  'build/explicit_make_pair',
+  'build/forward_decl',
+  'build/header_guard',
+  'build/include',
+  'build/include_alpha',
+  'build/include_order',
+  'build/include_what_you_use',
+  'build/namespaces',
+  'build/printf_format',
+  'build/storage_class',
+  'legal/copyright',
+  'readability/alt_tokens',
+  'readability/braces',
+  'readability/casting',
+  'readability/check',
+  'readability/constructors',
+  'readability/fn_size',
+  'readability/function',
+  'readability/multiline_comment',
+  'readability/multiline_string',
+  'readability/namespace',
+  'readability/nolint',
+  'readability/nul',
+  'readability/streams',
+  'readability/todo',
+  'readability/utf8',
+  'runtime/arrays',
+  'runtime/casting',
+  'runtime/explicit',
+  'runtime/int',
+  'runtime/init',
+  'runtime/invalid_increment',
+  'runtime/member_string_references',
+  'runtime/memset',
+  'runtime/operator',
+  'runtime/printf',
+  'runtime/printf_format',
+  'runtime/references',
+  'runtime/sizeof',
+  'runtime/string',
+  'runtime/threadsafe_fn',
+  'runtime/vlog',
+  'whitespace/blank_line',
+  'whitespace/braces',
+  'whitespace/comma',
+  'whitespace/comments',
+  'whitespace/empty_conditional_body',
+  'whitespace/empty_loop_body',
+  'whitespace/end_of_line',
+  'whitespace/ending_newline',
+  'whitespace/forcolon',
+  'whitespace/indent',
+  'whitespace/line_length',
+  'whitespace/newline',
+  'whitespace/operators',
+  'whitespace/parens',
+  'whitespace/semicolon',
+  'whitespace/tab',
+  'whitespace/todo'
+  ]
+
+# The default state of the category filter. This is overridden by the --filter=
+# flag. By default all errors are on, so only add here categories that should be
+# off by default (i.e., categories that must be enabled by the --filter= flags).
+# All entries here should start with a '-' or '+', as in the --filter= flag.
+_DEFAULT_FILTERS = ['-build/include_alpha']
+
+# We used to check for high-bit characters, but after much discussion we
+# decided those were OK, as long as they were in UTF-8 and didn't represent
+# hard-coded international strings, which belong in a separate i18n file.
+
+
+# C++ headers
+_CPP_HEADERS = frozenset([
+    # Legacy
+    'algobase.h',
+    'algo.h',
+    'alloc.h',
+    'builtinbuf.h',
+    'bvector.h',
+    'complex.h',
+    'defalloc.h',
+    'deque.h',
+    'editbuf.h',
+    'fstream.h',
+    'function.h',
+    'hash_map',
+    'hash_map.h',
+    'hash_set',
+    'hash_set.h',
+    'hashtable.h',
+    'heap.h',
+    'indstream.h',
+    'iomanip.h',
+    'iostream.h',
+    'istream.h',
+    'iterator.h',
+    'list.h',
+    'map.h',
+    'multimap.h',
+    'multiset.h',
+    'ostream.h',
+    'pair.h',
+    'parsestream.h',
+    'pfstream.h',
+    'procbuf.h',
+    'pthread_alloc',
+    'pthread_alloc.h',
+    'rope',
+    'rope.h',
+    'ropeimpl.h',
+    'set.h',
+    'slist',
+    'slist.h',
+    'stack.h',
+    'stdiostream.h',
+    'stl_alloc.h',
+    'stl_relops.h',
+    'streambuf.h',
+    'stream.h',
+    'strfile.h',
+    'strstream.h',
+    'tempbuf.h',
+    'tree.h',
+    'type_traits.h',
+    'vector.h',
+    # 17.6.1.2 C++ library headers
+    'algorithm',
+    'array',
+    'atomic',
+    'bitset',
+    'chrono',
+    'codecvt',
+    'complex',
+    'condition_variable',
+    'deque',
+    'exception',
+    'forward_list',
+    'fstream',
+    'functional',
+    'future',
+    'initializer_list',
+    'iomanip',
+    'ios',
+    'iosfwd',
+    'iostream',
+    'istream',
+    'iterator',
+    'limits',
+    'list',
+    'locale',
+    'map',
+    'memory',
+    'mutex',
+    'new',
+    'numeric',
+    'ostream',
+    'queue',
+    'random',
+    'ratio',
+    'regex',
+    'set',
+    'sstream',
+    'stack',
+    'stdexcept',
+    'streambuf',
+    'string',
+    'strstream',
+    'system_error',
+    'thread',
+    'tuple',
+    'typeindex',
+    'typeinfo',
+    'type_traits',
+    'unordered_map',
+    'unordered_set',
+    'utility',
+    'valarray',
+    'vector',
+    # 17.6.1.2 C++ headers for C library facilities
+    'cassert',
+    'ccomplex',
+    'cctype',
+    'cerrno',
+    'cfenv',
+    'cfloat',
+    'cinttypes',
+    'ciso646',
+    'climits',
+    'clocale',
+    'cmath',
+    'csetjmp',
+    'csignal',
+    'cstdalign',
+    'cstdarg',
+    'cstdbool',
+    'cstddef',
+    'cstdint',
+    'cstdio',
+    'cstdlib',
+    'cstring',
+    'ctgmath',
+    'ctime',
+    'cuchar',
+    'cwchar',
+    'cwctype',
+    ])
+
+# Assertion macros.  These are defined in base/logging.h and
+# testing/base/gunit.h.  Note that the _M versions need to come first
+# for substring matching to work.
+_CHECK_MACROS = [
+    'DCHECK', 'CHECK',
+    'EXPECT_TRUE_M', 'EXPECT_TRUE',
+    'ASSERT_TRUE_M', 'ASSERT_TRUE',
+    'EXPECT_FALSE_M', 'EXPECT_FALSE',
+    'ASSERT_FALSE_M', 'ASSERT_FALSE',
+    ]
+
+# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE
+_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS])
+
+for op, replacement in [('==', 'EQ'), ('!=', 'NE'),
+                        ('>=', 'GE'), ('>', 'GT'),
+                        ('<=', 'LE'), ('<', 'LT')]:
+  _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement
+  _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement
+  _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement
+  _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement
+  _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement
+  _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement
+
+for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'),
+                            ('>=', 'LT'), ('>', 'LE'),
+                            ('<=', 'GT'), ('<', 'GE')]:
+  _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement
+  _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement
+  _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement
+  _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement
+
+# Alternative tokens and their replacements.  For the full list, see section
+# 2.5 Alternative tokens [lex.digraph] in the C++ standard.
+#
+# Digraphs (such as '%:') are not included here since it's a mess to
+# match those on a word boundary.
+_ALT_TOKEN_REPLACEMENT = {
+    'and': '&&',
+    'bitor': '|',
+    'or': '||',
+    'xor': '^',
+    'compl': '~',
+    'bitand': '&',
+    'and_eq': '&=',
+    'or_eq': '|=',
+    'xor_eq': '^=',
+    'not': '!',
+    'not_eq': '!='
+    }
+
+# Compile regular expression matching all the above keywords.  A keyword must
+# follow one of " =()" and precede " ", "(" or end of line; this is meant to
+# avoid matching these keywords outside of boolean expressions.  False
+# positives include C-style multi-line comments and multi-line strings but
+# those have always been troublesome for cpplint.
+_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile(
+    r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)')
+
+
+# These constants define types of headers for use with
+# _IncludeState.CheckNextIncludeOrder().
+_C_SYS_HEADER = 1
+_CPP_SYS_HEADER = 2
+_LIKELY_MY_HEADER = 3
+_POSSIBLE_MY_HEADER = 4
+_OTHER_HEADER = 5
+
+# These constants define the current inline assembly state
+_NO_ASM = 0       # Outside of inline assembly block
+_INSIDE_ASM = 1   # Inside inline assembly block
+_END_ASM = 2      # Last line of inline assembly block
+_BLOCK_ASM = 3    # The whole block is an inline assembly block
+
+# Matches an asm keyword, an optional volatile qualifier, then '{' or '('.
+_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)'
+                        r'(?:\s+(volatile|__volatile__))?'
+                        r'\s*[{(]')
+
+
+_regexp_compile_cache = {}  # regex pattern string -> compiled pattern object
+
+# Finds occurrences of NOLINT or NOLINT(...).
+_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?')
+
+# {str or None: set(int)}: a map from error categories (None stands for every
+# category) to sets of linenumbers on which those errors should be suppressed.
+_error_suppressions = {}
+
+# The root directory used for deriving header guard CPP variable.
+# This is set by --root flag.
+_root = None
+
+# The allowed line length of files.
+# This is set by --linelength flag.
+_line_length = 80
+
+# The allowed extensions for file names
+# This is set by --extensions flag.
+_valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
+
+def ParseNolintSuppressions(filename, raw_line, linenum, error):
+  """Records the NOLINT suppression, if any, found on one line of input.
+
+  Looks for a NOLINT or NOLINT(category) comment in raw_line and adds
+  linenum to the global _error_suppressions store accordingly.  An
+  unrecognized category is reported through the error handler.
+
+  Args:
+    filename: str, the name of the input file.
+    raw_line: str, the line of input text, with comments.
+    linenum: int, the number of the current line.
+    error: function, an error handler.
+  """
+  # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*).
+  matched = _RE_SUPPRESSION.search(raw_line)
+  if not matched:
+    return
+  category = matched.group(1)
+  if category is None or category == '(*)':  # => "suppress all"
+    _error_suppressions.setdefault(None, set()).add(linenum)
+  elif category.startswith('(') and category.endswith(')'):
+    category = category[1:-1]
+    if category in _ERROR_CATEGORIES:
+      _error_suppressions.setdefault(category, set()).add(linenum)
+    else:
+      error(filename, linenum, 'readability/nolint', 5,
+            'Unknown NOLINT error category: %s' % category)
+
+
+def ResetNolintSuppressions():
+  """Resets the set of NOLINT suppressions to empty."""
+  _error_suppressions.clear()
+
+
+def IsErrorSuppressedByNolint(category, linenum):
+  """Returns true if the specified error category is suppressed on this line.
+
+  Consults the global error_suppressions map populated by
+  ParseNolintSuppressions/ResetNolintSuppressions.
+
+  Args:
+    category: str, the category of the error.
+    linenum: int, the current line number.
+  Returns:
+    bool, True iff the error should be suppressed due to a NOLINT comment.
+  """
+  return (linenum in _error_suppressions.get(category, set()) or
+          linenum in _error_suppressions.get(None, set()))
+
+def Match(pattern, s):
+  """Matches the string with the pattern, caching the compiled regexp."""
+  # The regexp compilation caching is inlined in both Match and Search for
+  # performance reasons; factoring it out into a separate function turns out
+  # to be noticeably expensive.
+  if pattern not in _regexp_compile_cache:
+    _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
+  return _regexp_compile_cache[pattern].match(s)
+
+
+def ReplaceAll(pattern, rep, s):
+  """Replaces instances of pattern in a string with a replacement.
+
+  The compiled regex is kept in a cache shared by Match and Search.
+
+  Args:
+    pattern: regex pattern
+    rep: replacement text
+    s: search string
+
+  Returns:
+    string with replacements made (or original string if no replacements)
+  """
+  if pattern not in _regexp_compile_cache:
+    _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
+  return _regexp_compile_cache[pattern].sub(rep, s)
+
+
+def Search(pattern, s):
+  """Searches the string for the pattern, caching the compiled regexp."""
+  if pattern not in _regexp_compile_cache:
+    _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
+  return _regexp_compile_cache[pattern].search(s)
+
+
+class _IncludeState(dict):
+  """Tracks line numbers for includes, and the order in which includes appear.
+
+  As a dict, an _IncludeState object serves as a mapping between include
+  filename and line number on which that file was included.
+
+  Call CheckNextIncludeOrder() once for each header in the file, passing
+  in the type constants defined above. A call made in an illegal order
+  returns a non-empty error message describing the problem.
+
+  """
+  # self._section moves monotonically through these values. If a system
+  # header would move it backwards, CheckNextIncludeOrder returns an error.
+  _INITIAL_SECTION = 0
+  _MY_H_SECTION = 1
+  _C_SECTION = 2
+  _CPP_SECTION = 3
+  _OTHER_H_SECTION = 4
+
+  _TYPE_NAMES = {
+      _C_SYS_HEADER: 'C system header',
+      _CPP_SYS_HEADER: 'C++ system header',
+      _LIKELY_MY_HEADER: 'header this file implements',
+      _POSSIBLE_MY_HEADER: 'header this file may implement',
+      _OTHER_HEADER: 'other header',
+      }
+  _SECTION_NAMES = {
+      _INITIAL_SECTION: "... nothing. (This can't be an error.)",
+      _MY_H_SECTION: 'a header this file implements',
+      _C_SECTION: 'C system header',
+      _CPP_SECTION: 'C++ system header',
+      _OTHER_H_SECTION: 'other header',
+      }
+
+  def __init__(self):
+    dict.__init__(self)
+    self.ResetSection()
+
+  def ResetSection(self):
+    # The current section (one of the _*_SECTION constants above).
+    self._section = self._INITIAL_SECTION
+    # The path of last found header.
+    self._last_header = ''
+
+  def SetLastHeader(self, header_path):
+    self._last_header = header_path
+
+  def CanonicalizeAlphabeticalOrder(self, header_path):
+    """Returns a path canonicalized for alphabetical comparison.
+
+    - replaces "-" with "_" so they both cmp the same.
+    - removes '-inl' since we don't require them to be after the main header.
+    - lowercase everything, just in case.
+
+    Args:
+      header_path: Path to be canonicalized.
+
+    Returns:
+      Canonicalized path.
+    """
+    return header_path.replace('-inl.h', '.h').replace('-', '_').lower()
+
+  def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path):
+    """Check if a header is in alphabetical order with the previous header.
+
+    Args:
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      header_path: Canonicalized header to be checked.
+
+    Returns:
+      Returns true if the header is in alphabetical order.
+    """
+    # If previous section is different from current section, _last_header will
+    # be reset to empty string, so it's always less than current header.
+    #
+    # If previous line was a blank line, assume that the headers are
+    # intentionally sorted the way they are.
+    if (self._last_header > header_path and
+        not Match(r'^\s*$', clean_lines.elided[linenum - 1])):
+      return False
+    return True
+
+  def CheckNextIncludeOrder(self, header_type):
+    """Returns a non-empty error message if the next header is out of order.
+
+    This function also updates the internal state to be ready to check
+    the next include.
+
+    Args:
+      header_type: One of the _XXX_HEADER constants defined above.
+
+    Returns:
+      The empty string if the header is in the right order, or an
+      error message describing what's wrong.
+
+    """
+    error_message = ('Found %s after %s' %
+                     (self._TYPE_NAMES[header_type],
+                      self._SECTION_NAMES[self._section]))
+
+    last_section = self._section
+
+    if header_type == _C_SYS_HEADER:
+      if self._section <= self._C_SECTION:
+        self._section = self._C_SECTION
+      else:
+        self._last_header = ''
+        return error_message
+    elif header_type == _CPP_SYS_HEADER:
+      if self._section <= self._CPP_SECTION:
+        self._section = self._CPP_SECTION
+      else:
+        self._last_header = ''
+        return error_message
+    elif header_type == _LIKELY_MY_HEADER:
+      if self._section <= self._MY_H_SECTION:
+        self._section = self._MY_H_SECTION
+      else:
+        self._section = self._OTHER_H_SECTION
+    elif header_type == _POSSIBLE_MY_HEADER:
+      if self._section <= self._MY_H_SECTION:
+        self._section = self._MY_H_SECTION
+      else:
+        # This will always be the fallback because we're not sure
+        # enough that the header is associated with this file.
+        self._section = self._OTHER_H_SECTION
+    else:
+      assert header_type == _OTHER_HEADER
+      self._section = self._OTHER_H_SECTION
+
+    if last_section != self._section:
+      self._last_header = ''
+
+    return ''
+
+
+class _CppLintState(object):
+  """Maintains module-wide state.."""
+
+  def __init__(self):
+    self.verbose_level = 1  # global setting.
+    self.error_count = 0    # global count of reported errors
+    # filters to apply when emitting error messages
+    self.filters = _DEFAULT_FILTERS[:]
+    self.counting = 'total'  # In what way are we counting errors?
+    self.errors_by_category = {}  # string to int dict storing error counts
+
+    # output format:
+    # "emacs" - format that emacs can parse (default)
+    # "vs7" - format that Microsoft Visual Studio 7 can parse
+    self.output_format = 'emacs'
+
+  def SetOutputFormat(self, output_format):
+    """Sets the output format for errors."""
+    self.output_format = output_format
+
+  def SetVerboseLevel(self, level):
+    """Sets the module's verbosity, and returns the previous setting."""
+    last_verbose_level = self.verbose_level
+    self.verbose_level = level
+    return last_verbose_level
+
+  def SetCountingStyle(self, counting_style):
+    """Sets the module's counting options."""
+    self.counting = counting_style
+
+  def SetFilters(self, filters):
+    """Sets the error-message filters.
+
+    These filters are applied when deciding whether to emit a given
+    error message.
+
+    Args:
+      filters: A string of comma-separated filters (eg "+whitespace/indent").
+               Each filter should start with + or -; else we die.
+
+    Raises:
+      ValueError: The comma-separated filters did not all start with '+' or '-'.
+                  E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter"
+    """
+    # Default filters always have less priority than the flag ones.
+    self.filters = _DEFAULT_FILTERS[:]
+    for filt in filters.split(','):
+      clean_filt = filt.strip()
+      if clean_filt:
+        self.filters.append(clean_filt)
+    for filt in self.filters:
+      if not (filt.startswith('+') or filt.startswith('-')):
+        raise ValueError('Every filter in --filters must start with + or -'
+                         ' (%s does not)' % filt)
+
+  def ResetErrorCounts(self):
+    """Sets the module's error statistic back to zero."""
+    self.error_count = 0
+    self.errors_by_category = {}
+
+  def IncrementErrorCount(self, category):
+    """Bumps the module's error statistic."""
+    self.error_count += 1
+    if self.counting in ('toplevel', 'detailed'):
+      if self.counting != 'detailed':
+        category = category.split('/')[0]
+      if category not in self.errors_by_category:
+        self.errors_by_category[category] = 0
+      self.errors_by_category[category] += 1
+
+  def PrintErrorCounts(self):
+    """Print a summary of errors by category, and the total."""
+    for category, count in self.errors_by_category.iteritems():
+      sys.stderr.write('Category \'%s\' errors found: %d\n' %
+                       (category, count))
+    sys.stderr.write('Total errors found: %d\n' % self.error_count)
+
+_cpplint_state = _CppLintState()
+
+
+def _OutputFormat():
+  """Returns the output format currently stored on _cpplint_state."""
+  return _cpplint_state.output_format
+
+
+def _SetOutputFormat(output_format):
+  """Sets the module's output format by forwarding it to _cpplint_state."""
+  _cpplint_state.SetOutputFormat(output_format)
+
+
+def _VerboseLevel():
+  """Returns the verbosity setting currently stored on _cpplint_state."""
+  return _cpplint_state.verbose_level
+
+
+def _SetVerboseLevel(level):
+  """Sets the module's verbosity to level, and returns the previous setting."""
+  return _cpplint_state.SetVerboseLevel(level)
+
+
+def _SetCountingStyle(level):
+  """Sets the module's counting options; level is the counting style to use."""
+  _cpplint_state.SetCountingStyle(level)
+
+
+def _Filters():
+  """Returns the list of output filters currently stored on _cpplint_state."""
+  return _cpplint_state.filters
+
+
+def _SetFilters(filters):
+  """Sets the module's error-message filters.
+
+  These filters are applied when deciding whether to emit a given
+  error message.
+
+  Args:
+    filters: A string of comma-separated filters (eg "+whitespace/indent").
+             Each filter should start with + or -; else ValueError is raised.
+  """
+  _cpplint_state.SetFilters(filters)
+
+
+class _FunctionState(object):
+  """Tracks current function name and the number of lines in its body."""
+
+  _NORMAL_TRIGGER = 250  # for --v=0, 500 for --v=1, etc.
+  _TEST_TRIGGER = 400    # about 50% more than _NORMAL_TRIGGER.
+
+  def __init__(self):
+    self.in_a_function = False
+    self.lines_in_function = 0
+    self.current_function = ''
+
+  def Begin(self, function_name):
+    """Start analyzing function body.
+
+    Args:
+      function_name: The name of the function being tracked.
+    """
+    self.in_a_function = True
+    self.lines_in_function = 0
+    self.current_function = function_name
+
+  def Count(self):
+    """Count line in current function body."""
+    if self.in_a_function:
+      self.lines_in_function += 1
+
+  def Check(self, error, filename, linenum):
+    """Report if too many lines in function body.
+
+    Args:
+      error: The function to call with any errors found.
+      filename: The name of the current file.
+      linenum: The number of the line to check.
+    """
+    if Match(r'T(EST|est)', self.current_function):
+      base_trigger = self._TEST_TRIGGER
+    else:
+      base_trigger = self._NORMAL_TRIGGER
+    trigger = base_trigger * 2**_VerboseLevel()
+
+    if self.lines_in_function > trigger:
+      error_level = int(math.log(self.lines_in_function / base_trigger, 2))
+      # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ...
+      if error_level > 5:
+        error_level = 5
+      error(filename, linenum, 'readability/fn_size', error_level,
+            'Small and focused functions are preferred:'
+            ' %s has %d non-comment lines'
+            ' (error triggered by exceeding %d lines).'  % (
+                self.current_function, self.lines_in_function, trigger))
+
+  def End(self):
+    """Stop analyzing function body."""
+    self.in_a_function = False
+
+
+class _IncludeError(Exception):
+  """Exception type indicating a problem with the include order in a file."""
+  pass
+
+
+class FileInfo:
+  """Provides utility functions for filenames.
+
+  FileInfo provides easy access to the components of a file's path
+  relative to the project root.
+  """
+
+  def __init__(self, filename):
+    self._filename = filename
+
+  def FullName(self):
+    """Make Windows paths like Unix."""
+    return os.path.abspath(self._filename).replace('\\', '/')
+
+  def RepositoryName(self):
+    """FullName after removing the local path to the repository.
+
+    If we have a real absolute path name here we can try to do something smart:
+    detecting the root of the checkout and truncating /path/to/checkout from
+    the name so that we get header guards that don't include things like
+    "C:\Documents and Settings\..." or "/home/username/..." in them and thus
+    people on different computers who have checked the source out to different
+    locations won't see bogus errors.
+    """
+    fullname = self.FullName()
+
+    if os.path.exists(fullname):
+      project_dir = os.path.dirname(fullname)
+
+      if os.path.exists(os.path.join(project_dir, ".svn")):
+        # If there's a .svn file in the current directory, we recursively look
+        # up the directory tree for the top of the SVN checkout
+        root_dir = project_dir
+        one_up_dir = os.path.dirname(root_dir)
+        while os.path.exists(os.path.join(one_up_dir, ".svn")):
+          root_dir = os.path.dirname(root_dir)
+          one_up_dir = os.path.dirname(one_up_dir)
+
+        prefix = os.path.commonprefix([root_dir, project_dir])
+        return fullname[len(prefix) + 1:]
+
+      # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by
+      # searching up from the current path.
+      root_dir = os.path.dirname(fullname)
+      while (root_dir != os.path.dirname(root_dir) and
+             not os.path.exists(os.path.join(root_dir, ".git")) and
+             not os.path.exists(os.path.join(root_dir, ".hg")) and
+             not os.path.exists(os.path.join(root_dir, ".svn"))):
+        root_dir = os.path.dirname(root_dir)
+
+      if (os.path.exists(os.path.join(root_dir, ".git")) or
+          os.path.exists(os.path.join(root_dir, ".hg")) or
+          os.path.exists(os.path.join(root_dir, ".svn"))):
+        prefix = os.path.commonprefix([root_dir, project_dir])
+        return fullname[len(prefix) + 1:]
+
+    # Don't know what to do; header guard warnings may be wrong...
+    return fullname
+
+  def Split(self):
+    """Splits the file into the directory, basename, and extension.
+
+    For 'chrome/browser/browser.cc', Split() would
+    return ('chrome/browser', 'browser', '.cc')
+
+    Returns:
+      A tuple of (directory, basename, extension).
+    """
+
+    googlename = self.RepositoryName()
+    project, rest = os.path.split(googlename)
+    return (project,) + os.path.splitext(rest)
+
+  def BaseName(self):
+    """File base name - text after the final slash, before the final period."""
+    return self.Split()[1]
+
+  def Extension(self):
+    """File extension - text following the final period."""
+    return self.Split()[2]
+
+  def NoExtension(self):
+    """File has no source file extension."""
+    return '/'.join(self.Split()[0:2])
+
+  def IsSource(self):
+    """File has a source file extension."""
+    return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx')
+
+
+def _ShouldPrintError(category, confidence, linenum):
+  """If confidence >= verbose, category passes filter and is not suppressed."""
+
+  # There are three ways we might decide not to print an error message:
+  # a "NOLINT(category)" comment appears in the source,
+  # the verbosity level isn't high enough, or the filters filter it out.
+  if IsErrorSuppressedByNolint(category, linenum):
+    return False
+  if confidence < _cpplint_state.verbose_level:
+    return False
+
+  is_filtered = False
+  for one_filter in _Filters():
+    if one_filter.startswith('-'):
+      if category.startswith(one_filter[1:]):
+        is_filtered = True
+    elif one_filter.startswith('+'):
+      if category.startswith(one_filter[1:]):
+        is_filtered = False
+    else:
+      assert False  # should have been checked for in SetFilter.
+  if is_filtered:
+    return False
+
+  return True
+
+
+def Error(filename, linenum, category, confidence, message):
+  """Reports a lint error on stderr and counts it.
+
+  Along with the location of the error we log our confidence in it,
+  that is, how certain we are this is a legitimate style regression, and
+  not a misidentification or a use that's sometimes justified.
+
+  False positives can be suppressed by the use of
+  "NOLINT(category)"  comments on the offending line.  These are
+  parsed into _error_suppressions.
+
+  Args:
+    filename: The name of the file containing the error.
+    linenum: The number of the line containing the error.
+    category: A string used to describe the "category" this bug
+      falls under: "whitespace", say, or "runtime".  Categories
+      may have a hierarchy separated by slashes: "whitespace/indent".
+    confidence: A number from 1-5 representing a confidence score for
+      the error, with 5 meaning that we are certain of the problem,
+      and 1 meaning that it could be a legitimate construct.
+    message: The error message.
+  """
+  if not _ShouldPrintError(category, confidence, linenum):
+    return
+  _cpplint_state.IncrementErrorCount(category)
+  if _cpplint_state.output_format == 'vs7':
+    template = '%s(%s):  %s  [%s] [%d]\n'
+  elif _cpplint_state.output_format == 'eclipse':
+    template = '%s:%s: warning: %s  [%s] [%d]\n'
+  else:
+    template = '%s:%s:  %s  [%s] [%d]\n'
+  sys.stderr.write(template % (
+      filename, linenum, message, category, confidence))
+
+
+# Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard.
+_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile(
+    r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)')
+# Matches strings.  Escape codes should already be removed by ESCAPES.
+_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"')
+# Matches characters.  Escape codes should already be removed by ESCAPES.
+_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'")
+# Matches C-style /* ... */ comments that begin and end on the same line.
+# This RE is a little bit more complicated than one might expect, because we
+# also have to decide which surrounding spaces to remove, so that comments
+# inside statements are handled better.
+# The current rule is: We only clear spaces from both sides when we're at the
+# end of the line. Otherwise, we try to remove spaces from the right side,
+# if this doesn't work we try on left side but only if there's a non-word
+# character on the right.
+_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
+    r"""(\s*/\*.*\*/\s*$|
+            /\*.*\*/\s+|
+         \s+/\*.*\*/(?=\W)|
+            /\*.*\*/)""", re.VERBOSE)
+
+
+def IsCppString(line):
+  """Does line terminate so, that the next symbol is in string constant.
+
+  This function does not consider single-line nor multi-line comments.
+
+  Args:
+    line: is a partial line of code starting from the 0..n.
+
+  Returns:
+    True, if next character appended to 'line' is inside a
+    string constant.
+  """
+
+  line = line.replace(r'\\', 'XX')  # after this, \\" does not match to \"
+  return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1
+
+
+def CleanseRawStrings(raw_lines):
+  """Removes C++11 raw strings from lines.
+
+    Before:
+      static const char kData[] = R"(
+          multi-line string
+          )";
+
+    After:
+      static const char kData[] = ""
+          (replaced by blank line)
+          "";
+
+  Args:
+    raw_lines: list of raw lines.
+
+  Returns:
+    list of lines with C++11 raw strings replaced by empty strings.
+  """
+
+  delimiter = None  # closing marker of the raw string we are inside, if any
+  lines_without_raw_strings = []
+  for line in raw_lines:
+    if delimiter:
+      # Inside a raw string, look for the end
+      end = line.find(delimiter)
+      if end >= 0:
+        # Found the end of the string, match leading space for this
+        # line and resume copying the original lines, and also insert
+        # a "" on the last line.
+        leading_space = Match(r'^(\s*)\S', line)
+        line = leading_space.group(1) + '""' + line[end + len(delimiter):]
+        delimiter = None
+      else:
+        # Haven't found the end yet, append a blank line.
+        line = ''
+
+    else:
+      # Look for beginning of a raw string.
+      # See 2.14.15 [lex.string] for syntax.
+      matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
+      if matched:
+        delimiter = ')' + matched.group(2) + '"'  # closing marker: )delim"
+
+        end = matched.group(3).find(delimiter)
+        if end >= 0:
+          # Raw string ended on same line
+          line = (matched.group(1) + '""' +
+                  matched.group(3)[end + len(delimiter):])
+          delimiter = None
+        else:
+          # Start of a multi-line raw string
+          line = matched.group(1) + '""'
+
+    lines_without_raw_strings.append(line)
+
+  # TODO(unknown): if delimiter is not None here, we might want to
+  # emit a warning for unterminated string.
+  return lines_without_raw_strings
+
+
+def FindNextMultiLineCommentStart(lines, lineix):
+  """Find the beginning marker for a multiline comment."""
+  while lineix < len(lines):
+    if lines[lineix].strip().startswith('/*'):
+      # Only return this marker if the comment goes beyond this line
+      if lines[lineix].strip().find('*/', 2) < 0:
+        return lineix
+    lineix += 1
+  return len(lines)
+
+
+def FindNextMultiLineCommentEnd(lines, lineix):
+  """We are inside a comment, find the end marker."""
+  while lineix < len(lines):
+    if lines[lineix].strip().endswith('*/'):
+      return lineix
+    lineix += 1
+  return len(lines)
+
+
+def RemoveMultiLineCommentsFromRange(lines, begin, end):
+  """Clears a range of lines for multi-line comments."""
+  # Having // dummy comments makes the lines non-empty, so we will not get
+  # unnecessary blank line warnings later in the code.
+  for i in range(begin, end):
+    lines[i] = '// dummy'
+
+
+def RemoveMultiLineComments(filename, lines, error):
+  """Replaces each multiline (c-style) comment in lines with dummy lines."""
+  position = 0
+  while position < len(lines):
+    first = FindNextMultiLineCommentStart(lines, position)
+    if first >= len(lines):
+      return
+    last = FindNextMultiLineCommentEnd(lines, first)
+    if last >= len(lines):
+      error(filename, first + 1, 'readability/multiline_comment', 5,
+            'Could not find end of multi-line comment')
+      return
+    RemoveMultiLineCommentsFromRange(lines, first, last + 1)
+    position = last + 1
+
+
+def CleanseComments(line):
+  """Removes //-comments and single-line C-style /* */ comments.
+
+  Args:
+    line: A line of C++ source.
+
+  Returns:
+    The line with single-line comments removed.
+  """
+  commentpos = line.find('//')
+  if commentpos != -1 and not IsCppString(line[:commentpos]):
+    line = line[:commentpos].rstrip()
+  # get rid of /* ... */
+  return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line)
+
+
+class CleansedLines(object):
+  """Holds 3 copies of all lines with different preprocessing applied to them.
+
+  1) elided member contains lines without strings and comments,
+  2) lines member contains lines without comments, and
+  3) raw_lines member contains all the lines without processing.
+  All these three members are of <type 'list'>, and of the same length.
+  """
+
+  def __init__(self, lines):
+    self.elided = []
+    self.lines = []
+    self.raw_lines = lines
+    self.num_lines = len(lines)
+    self.lines_without_raw_strings = CleanseRawStrings(lines)
+    for linenum in range(len(self.lines_without_raw_strings)):
+      self.lines.append(CleanseComments(
+          self.lines_without_raw_strings[linenum]))
+      elided = self._CollapseStrings(self.lines_without_raw_strings[linenum])
+      self.elided.append(CleanseComments(elided))
+
+  def NumLines(self):
+    """Returns the number of lines represented."""
+    return self.num_lines
+
+  @staticmethod
+  def _CollapseStrings(elided):
+    """Collapses strings and chars on a line to simple "" or '' blocks.
+
+    We nix strings first so we're not fooled by text like '"http://"'
+
+    Args:
+      elided: The line being processed.
+
+    Returns:
+      The line with collapsed strings.
+    """
+    if not _RE_PATTERN_INCLUDE.match(elided):
+      # Remove escaped characters first to make quote/single quote collapsing
+      # basic.  Things that look like escaped characters shouldn't occur
+      # outside of strings and chars.
+      elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
+      elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided)
+      elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided)
+    return elided
+
+
+def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
+  """Find the position just after the matching endchar.
+
+  Args:
+    line: a CleansedLines line.
+    startpos: start searching at this position.
+    depth: nesting level at startpos.
+    startchar: expression opening character.
+    endchar: expression closing character.
+
+  Returns:
+    On finding matching endchar: (index just after matching endchar, 0)
+    Otherwise: (-1, new depth at end of this line)
+  """
+  for i in xrange(startpos, len(line)):
+    if line[i] == startchar:
+      depth += 1
+    elif line[i] == endchar:
+      depth -= 1
+      if depth == 0:
+        return (i + 1, 0)
+  return (-1, depth)
+
+
+def CloseExpression(clean_lines, linenum, pos):
+  """Locates the closer matching the ( or { or [ or < found at the input.
+
+  When lines[linenum][pos] is a '(' or '{' or '[' or '<', searches forward
+  from there, over as many lines as needed, for the character closing it.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line holding the opening character.
+    pos: The index of the opening character on that line.
+
+  Returns:
+    A tuple (line, linenum, pos) pointing just *past* the closing character,
+    or (line, len(lines), -1) when no close is found or the input does not
+    point at an opening character.  Strings and comments are ignored while
+    matching, and the line returned is the 'cleansed' line at linenum.
+  """
+
+  line = clean_lines.elided[linenum]
+  startchar = line[pos]
+  if startchar not in '({[<':
+    # Not positioned on an opening character: report "no close found".
+    return (line, clean_lines.NumLines(), -1)
+  # Pick the closer that pairs with the opener.
+  endchar = {'(': ')', '[': ']', '{': '}', '<': '>'}[startchar]
+
+  # Scan the starting line from pos, then each following line from column 0,
+  # carrying the count of still-open startchars from one line to the next.
+  scan_from = pos
+  num_open = 0
+  while True:
+    (end_pos, num_open) = FindEndOfExpressionInLine(
+        line, scan_from, num_open, startchar, endchar)
+    if end_pos > -1:
+      return (line, linenum, end_pos)
+    if linenum >= clean_lines.NumLines() - 1:
+      break
+    # Nothing closed on this line; move on to the start of the next one.
+    linenum += 1
+    line = clean_lines.elided[linenum]
+    scan_from = 0
+
+  # Did not find endchar before end of file, give up
+  return (line, clean_lines.NumLines(), -1)
+
+
+def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar):
+  """Scans one line backward for the startchar balancing the expression.
+
+  Mirror image of FindEndOfExpressionInLine, except that the index returned
+  points *at* the matching character instead of just past it.
+
+  Args:
+    line: a CleansedLines line.
+    endpos: index at which the backward scan begins.
+    depth: nesting level already accumulated after endpos.
+    startchar, endchar: the opening and closing characters to balance.
+
+  Returns:
+    (index of the match, 0) if found, else (-1, depth at start of line).
+  """
+  index = endpos
+  while index >= 0:
+    if line[index] == endchar:
+      depth += 1
+    elif line[index] == startchar:
+      depth -= 1
+      if depth == 0:
+        return (index, 0)
+    index -= 1
+  return (-1, depth)
+
+
+def ReverseCloseExpression(clean_lines, linenum, pos):
+  """Locates the opener matching the ) or } or ] or > found at the input.
+
+  When lines[linenum][pos] is a ')' or '}' or ']' or '>', searches backward
+  from there, over as many lines as needed, for the character opening it.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line holding the closing character.
+    pos: The index of the closing character on that line.
+
+  Returns:
+    A tuple (line, linenum, pos) pointing *at* the opening character, or
+    (line, 0, -1) when no opener is found or the input does not point at a
+    closing character.  Strings and comments are ignored while matching,
+    and the line returned is the 'cleansed' line at linenum.
+  """
+  line = clean_lines.elided[linenum]
+  endchar = line[pos]
+  if endchar not in ')}]>':
+    # Not positioned on a closing character: report "no opener found".
+    return (line, 0, -1)
+  # Pick the opener that pairs with the closer.
+  startchar = {')': '(', ']': '[', '}': '{', '>': '<'}[endchar]
+
+  # Scan the starting line backward from pos, then each preceding line from
+  # its last column, carrying the count of still-unopened endchars along.
+  scan_from = pos
+  num_open = 0
+  while True:
+    (start_pos, num_open) = FindStartOfExpressionInLine(
+        line, scan_from, num_open, startchar, endchar)
+    if start_pos > -1:
+      return (line, linenum, start_pos)
+    if linenum <= 0:
+      break
+    # Nothing opened on this line; move on to the end of the previous one.
+    linenum -= 1
+    line = clean_lines.elided[linenum]
+    scan_from = len(line) - 1
+
+  # Did not find startchar before beginning of file, give up
+  return (line, 0, -1)
+
+
+def CheckForCopyright(filename, lines, error):
+  """Reports a missing copyright notice near the top of the file."""
+
+  # The notice is expected by line 10.  lines[0] is a dummy entry, so the
+  # first ten real lines sit at indices 1 through 10.
+  header = lines[1:11]
+  if any(re.search(r'Copyright', text, re.I) for text in header):
+    return
+  error(filename, 0, 'legal/copyright', 5,
+        'No copyright message found.  '
+        'You should have a line: "Copyright [year] <Copyright Owner>"')
+
+
+def GetHeaderGuardCPPVariable(filename):
+  """Returns the CPP variable that should be used as a header guard.
+
+  Args:
+    filename: The name of a C++ header file.
+
+  Returns:
+    The name from FileInfo.RepositoryName(), less any leading _root prefix,
+    upper-cased, with '-', '.', '/' and whitespace as '_', and '_' appended.
+  """
+  # Restore the original filename when invoked from Emacs's flymake.
+  filename = re.sub(r'_flymake\.h$', '.h', filename)
+  filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
+
+  fileinfo = FileInfo(filename)
+  file_path_from_root = fileinfo.RepositoryName()
+  if _root:
+    # _root and os.sep are literal text: escape them so '.' in _root matches
+    # only itself and a '\' separator is not a dangling regex escape.
+    prefix = '^' + re.escape(_root + os.sep)
+    file_path_from_root = re.sub(prefix, '', file_path_from_root)
+  return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
+
+
+def CheckForHeaderGuard(filename, lines, error):
+  """Verifies the #ifndef / #define / #endif guard of a header file.
+
+  Reports a missing #ifndef or #define, an #ifndef that names something
+  other than the expected CPP variable, a #define that disagrees with the
+  #ifndef, and a last #endif line that is not "#endif  // <variable>".
+  Checking stops at a missing #ifndef, a missing #define, or a mismatch.
+
+  Args:
+    filename: The name of the C++ header file.
+    lines: An array of strings, each representing a line of the file.
+    error: The function to call with any errors found.
+  """
+
+  cppvar = GetHeaderGuardCPPVariable(filename)
+
+  # Collect the argument of the first #ifndef, the argument of the first
+  # #define, and the full text of the last line that starts with #endif.
+  guard = None
+  guard_linenum = 0
+  defined = None
+  last_endif = None
+  last_endif_linenum = 0
+  for linenum, line in enumerate(lines):
+    words = line.split()
+    if len(words) >= 2:
+      directive, argument = words[0], words[1]
+      if directive == '#ifndef' and not guard:
+        guard = argument
+        guard_linenum = linenum
+      if directive == '#define' and not defined:
+        defined = argument
+    if line.startswith('#endif'):
+      last_endif = line
+      last_endif_linenum = linenum
+
+  if not guard:
+    error(filename, 0, 'build/header_guard', 5,
+          'No #ifndef header guard found, suggested CPP variable is: %s' %
+          cppvar)
+    return
+
+  if not defined:
+    error(filename, 0, 'build/header_guard', 5,
+          'No #define header guard found, suggested CPP variable is: %s' %
+          cppvar)
+    return
+
+  # PATH_FILE_H_ is the expected guard.  PATH_FILE_H__ is tolerated for
+  # backward compatibility: it is still reported, but at level 0.
+  if guard != cppvar:
+    error_level = 0 if guard == cppvar + '_' else 5
+    # Presumably lets a NOLINT comment on that line suppress the error.
+    ParseNolintSuppressions(filename, lines[guard_linenum], guard_linenum,
+                            error)
+    error(filename, guard_linenum, 'build/header_guard', error_level,
+          '#ifndef header guard has wrong style, please use: %s' % cppvar)
+
+  if defined != guard:
+    error(filename, 0, 'build/header_guard', 5,
+          '#ifndef and #define don\'t match, suggested CPP variable is: %s' %
+          cppvar)
+    return
+
+  # The same tolerance applies to the comment on the closing #endif.
+  expected_endif = '#endif  // %s' % cppvar
+  if last_endif != expected_endif:
+    error_level = 0 if last_endif == expected_endif + '_' else 5
+    # Presumably lets a NOLINT comment on that line suppress the error.
+    ParseNolintSuppressions(filename, lines[last_endif_linenum],
+                            last_endif_linenum, error)
+    error(filename, last_endif_linenum, 'build/header_guard', error_level,
+          '#endif line should be "#endif  // %s"' % cppvar)
+
+
+def CheckForBadCharacters(filename, lines, error):
+  """Reports every line that holds a bad character.
+
+  Two kinds of bad characters are reported, each at most once per line:
+  1. Unicode replacement characters: either the file contained invalid UTF-8
+  (likely) or real replacement characters (which it shouldn't).  Invalid
+  UTF-8 next to a newline can throw off the line numbering.
+  2. NUL bytes, which are problematic for some tools.
+
+  Args:
+    filename: The name of the current file.
+    lines: An array of strings, each representing a line of the file.
+    error: The function to call with any errors found.
+  """
+  checks = (
+      (u'\ufffd', 'readability/utf8',
+       'Line contains invalid UTF-8 (or Unicode replacement character).'),
+      ('\0', 'readability/nul', 'Line contains NUL byte.'),
+  )
+  for linenum, line in enumerate(lines):
+    for bad_char, category, message in checks:
+      if bad_char in line:
+        error(filename, linenum, category, 5, message)
+
+
+def CheckForNewlineAtEOF(filename, lines, error):
+  """Reports a file whose last line is not terminated by a newline.
+
+  Args:
+    filename: The name of the current file.
+    lines: An array of strings, each representing a line of the file.
+    error: The function to call with any errors found.
+  """
+
+  # lines was built by appending two newlines to the original file and
+  # splitting on \n, so a file ending in \n yields an empty string as the
+  # second-to-last element.  Fewer than three elements means no such element.
+  if len(lines) >= 3 and not lines[-2]:
+    return
+  error(filename, len(lines) - 2, 'whitespace/ending_newline', 5,
+        'Could not find a newline character at the end of the file.')
+
+
+def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error):
+  """Reports /* ... */ comments or "..." strings that run past one line.
+
+  /* ... */ comments are legit inside macros, for one line; otherwise //
+  comments are preferred, so warning about the rest is fine.  Strings may
+  span lines when a backslash ends each line; the C++ style guide does not
+  currently prohibit that, but it is ugly and unnecessary.  This lint
+  program copes poorly with either construct, so both are reported.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  # Drop every \\ (escaped backslash) first.  They are harmless, and the
+  # second backslash of a pair could otherwise be taken as the start of \".
+  text = clean_lines.elided[linenum].replace('\\\\', '')
+
+  comment_opens = text.count('/*')
+  comment_closes = text.count('*/')
+  if comment_opens > comment_closes:
+    error(filename, linenum, 'readability/multiline_comment', 5,
+          'Complex multi-line /*...*/-style comment found. '
+          'Lint may give bogus warnings.  '
+          'Consider replacing these with //-style comments, '
+          'with #if 0...#endif, '
+          'or with more clearly structured multi-line comments.')
+
+  # An odd number of unescaped double quotes means a string is left open.
+  unescaped_quotes = text.count('"') - text.count('\\"')
+  if unescaped_quotes % 2:
+    error(filename, linenum, 'readability/multiline_string', 5,
+          'Multi-line string ("...") found.  This lint script doesn\'t '
+          'do well with such strings, and may give bogus warnings.  '
+          'Use C++11 raw strings or concatenation instead.')
+
+
+# Pairs of (thread-unsafe function, thread-safe replacement).  Every name is
+# spelled up to and including its opening parenthesis.
+#
+# CheckPosixThreading walks the pairs in the order listed here: it searches
+# the line for the first member of a pair and names the second member in the
+# message it reports.
+threading_list = (
+    ('asctime(', 'asctime_r('), ('ctime(', 'ctime_r('),
+    ('getgrgid(', 'getgrgid_r('), ('getgrnam(', 'getgrnam_r('),
+    ('getlogin(', 'getlogin_r('), ('getpwnam(', 'getpwnam_r('),
+    ('getpwuid(', 'getpwuid_r('), ('gmtime(', 'gmtime_r('),
+    ('localtime(', 'localtime_r('), ('rand(', 'rand_r('),
+    ('strtok(', 'strtok_r('), ('ttyname(', 'ttyname_r('),
+    )
+
+
+def CheckPosixThreading(filename, clean_lines, linenum, error):
+  """Checks for calls to thread-unsafe functions.
+
+  Reports, once per line, each threading_list function the line calls and
+  suggests its thread-safe replacement.  Occurrences that directly follow
+  an alphanumeric character, '_', '.' or '>' are skipped.
+
+  Args:
+    filename, clean_lines, linenum: The current file, its CleansedLines
+      instance and the number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+  for single_thread_function, multithread_safe_function in threading_list:
+    # Try every occurrence: in "my_rand(); rand();" the first hit is skipped.
+    ix = line.find(single_thread_function)
+    while ix >= 0:
+      if ix == 0 or (not line[ix - 1].isalnum() and
+                     line[ix - 1] not in ('_', '.', '>')):
+        error(filename, linenum, 'runtime/threadsafe_fn', 2,
+              'Consider using ' + multithread_safe_function +
+              '...) instead of ' + single_thread_function +
+              '...) for improved thread safety.')
+        break
+      ix = line.find(single_thread_function, ix + 1)
+
+
+def CheckVlogArguments(filename, clean_lines, linenum, error):
+  """Checks that VLOG() is given a numeric verbosity level.
+
+  VLOG(2) is fine; VLOG(INFO) and the other symbolic severities are not.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  symbolic_vlog = r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)'
+  if not Search(symbolic_vlog, clean_lines.elided[linenum]):
+    return
+  error(filename, linenum, 'runtime/vlog', 5,
+        'VLOG() should be used with numeric verbosity level.  '
+        'Use LOG() if you want symbolic severity levels.')
+
+
+# Matches a line that starts with a statement such as "*count++;" or
+# "*count--;".  Such a statement moves the pointer instead of changing the
+# value it points at.
+_RE_PATTERN_INVALID_INCREMENT = re.compile(r'^\s*\*\w+(\+\+|--);')
+
+
+def CheckInvalidIncrement(filename, clean_lines, linenum, error):
+  """Checks for the invalid increment statement *count++.
+
+  For example, in
+  void increment_counter(int* count) {
+    *count++;
+  }
+  the statement effectively does count++: it moves the pointer, not the value.
+  It should be written ++*count, (*count)++ or *count += 1.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  if not _RE_PATTERN_INVALID_INCREMENT.match(clean_lines.elided[linenum]):
+    return
+  error(filename, linenum, 'runtime/invalid_increment', 5,
+        'Changing pointer instead of value (or unused value of operator*).')
+
+
+class _BlockInfo(object):
+  """Stores information about a generic block of code."""
+
+  def __init__(self, seen_open_brace):
+    # A new block starts outside inline assembly, with balanced parentheses.
+    self.inline_asm = _NO_ASM
+    self.open_parentheses = 0
+    self.seen_open_brace = seen_open_brace
+
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    """Hook for checks on the text that precedes the opening brace.
+
+    Mainly useful for the text between a class identifier and its "{",
+    usually where the base class is specified.  A generic block has nothing
+    to check there, so this implementation does nothing.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Hook for checks on the text that follows the closing brace.
+
+    Mainly used for checking end of namespace comments.  A generic block has
+    nothing to check there, so this implementation does nothing.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+
+
+class _ClassInfo(_BlockInfo):
+  """Stores information about a class.
+
+  Beyond what _BlockInfo keeps, records the name, the starting line, a guess
+  at the last line, the indentation, the current access level, whether the
+  declaration is a struct, and whether it is derived.
+  """
+
+  def __init__(self, name, class_or_struct, clean_lines, linenum):
+    _BlockInfo.__init__(self, False)
+    self.name = name
+    self.starting_linenum = linenum
+    self.is_derived = False
+
+    # Access starts out public in a struct and private in a class.
+    self.is_struct = class_or_struct == 'struct'
+    self.access = 'public' if self.is_struct else 'private'
+
+    # Record how far the declaration is indented.  raw_lines is used rather
+    # than elided so that leading comments are taken into account.
+    leading_spaces = Match(r'^( *)\S', clean_lines.raw_lines[linenum])
+    self.class_indent = len(leading_spaces.group(1)) if leading_spaces else 0
+
+    # Guess where the class ends: the first line, from the declaration on,
+    # at which the running count of '{' minus '}' is zero.  Code such as
+    #   class A {
+    #   } *x = { ...
+    # confuses this, but it is still good enough for CheckSectionSpacing.
+    self.last_line = 0
+    brace_balance = 0
+    for candidate in range(linenum, clean_lines.NumLines()):
+      text = clean_lines.elided[candidate]
+      brace_balance += text.count('{') - text.count('}')
+      if brace_balance == 0:
+        self.last_line = candidate
+        break
+
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    # A bare ':' (one that is not part of '::') marks a derived class.
+    if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]):
+      self.is_derived = True
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    # The closing brace must line up with the start of the class.  Only a
+    # brace preceded by nothing but spaces is checked, which means that
+    # single-line class definitions are never checked.
+    closing = Match(r'^( *)\}', clean_lines.elided[linenum])
+    if not closing or len(closing.group(1)) == self.class_indent:
+      return
+    # Name the offender with the keyword it was declared with.
+    kind = 'struct ' if self.is_struct else 'class '
+    error(filename, linenum, 'whitespace/indent', 3,
+          'Closing brace should be aligned with beginning of %s' %
+          (kind + self.name))
+
+
+class _NamespaceInfo(_BlockInfo):
+  """Stores information about a namespace."""
+
+  def __init__(self, name, linenum):
+    _BlockInfo.__init__(self, False)
+    self.starting_linenum = linenum
+    # An anonymous namespace is stored with the empty string as its name.
+    self.name = name or ''
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Check end of namespace comments."""
+    line = clean_lines.raw_lines[linenum]
+
+    # A namespace that spans fewer than 10 lines is not required to carry
+    # a terminating comment, so a short namespace without one is accepted
+    # as is.  If it carries one anyway, that comment is still verified
+    # below.
+    #
+    # TODO(unknown): We always want to check end of namespace comments
+    # if a namespace is large, but sometimes we also want to apply the
+    # check if a short namespace contained nontrivial things (something
+    # other than forward declarations).  There is currently no logic on
+    # deciding what these nontrivial things are, so this check is
+    # triggered by namespace size only, which works most of the time.
+    if linenum - self.starting_linenum < 10:
+      if not Match(r'};*\s*(//|/\*).*\bnamespace\b', line):
+        return
+
+    # Look for matching comment at end of namespace.
+    #
+    # C style "/* */" comments are accepted too, so that code closing a
+    # namespace inside a preprocessor macro can be cpplint clean.
+    #
+    # A trailing period, as in "// end of namespace <name>.", is accepted
+    # as well.
+    #
+    # Nothing else is accepted; otherwise an existing comment that is a
+    # substring of the expected namespace could produce a false negative.
+    if self.name:
+      # Named namespace
+      expected = (r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
+                  r'[\*/\.\\\s]*$')
+      message = ('Namespace should be terminated with "// namespace %s"' %
+                 self.name)
+    else:
+      # Anonymous namespace
+      expected = r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$'
+      message = 'Namespace should be terminated with "// namespace"'
+
+    # One match and one report, whichever kind of namespace this is.
+    if not Match(expected, line):
+      error(filename, linenum, 'readability/namespace', 5, message)
+
+
+class _PreprocessorInfo(object):
+  """Stores checkpoints of nesting stacks when #if/#else is seen."""
+
+  def __init__(self, stack_before_if):
+    # Set once the first #else or #elif of this #if has been seen.
+    self.seen_else = False
+
+    # Snapshot of the whole nesting stack taken at the #if.
+    self.stack_before_if = stack_before_if
+
+    # Snapshot of the whole nesting stack taken at the first #else/#elif.
+    self.stack_before_else = []
+
+
+class _NestingState(object):
+  """Holds states related to parsing braces."""
+
+  def __init__(self):
+    # Stack for tracking all braces.  An object is pushed when a namespace
+    # or class head or a plain "{" is seen, and popped at "}" (or at ";" or
+    # ")" when its "{" was never seen).  Only 3 types of objects are possible:
+    # - _ClassInfo: a class or struct.
+    # - _NamespaceInfo: a namespace.
+    # - _BlockInfo: some other type of block.
+    self.stack = []
+
+    # Stack of _PreprocessorInfo objects, one per #if that is still open.
+    self.pp_stack = []
+
+  def SeenOpenBrace(self):
+    """Check if we have seen the opening brace for the innermost block.
+
+    Returns:
+      True if the stack is empty or the innermost block has seen its opening
+      brace, False if the innermost block is still expecting one.
+    """
+    return (not self.stack) or self.stack[-1].seen_open_brace
+
+  def InNamespaceBody(self):
+    """Check if we are currently one level inside a namespace body.
+
+    Returns:
+      True if top of the stack is a namespace block; falsy ([] or False) if not.
+    """
+    return self.stack and isinstance(self.stack[-1], _NamespaceInfo)
+
+  def UpdatePreprocessor(self, line):
+    """Update preprocessor stack.
+
+    We need to handle preprocessors due to classes like this:
+      #ifdef SWIG
+      struct ResultDetailsPageElementExtensionPoint {
+      #else
+      struct ResultDetailsPageElementExtensionPoint : public Extension {
+      #endif
+
+    We make the following assumptions (good enough for most files):
+    - Preprocessor condition evaluates to true from #if up to first
+      #else/#elif/#endif.
+
+    - Preprocessor condition evaluates to false from #else/#elif up
+      to #endif.  We still perform lint checks on these lines, but
+      these do not affect nesting stack.
+
+    Args:
+      line: current line to check.
+    """
+    if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line):
+      # Beginning of #if block, save the nesting stack here.  The saved
+      # stack will allow us to restore the parsing state in the #else case.
+      self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack)))
+    elif Match(r'^\s*#\s*(else|elif)\b', line):
+      # Beginning of #else block
+      if self.pp_stack:
+        if not self.pp_stack[-1].seen_else:
+          # This is the first #else or #elif block.  Remember the
+          # whole nesting stack up to this point.  This is what we
+          # keep after the #endif.
+          self.pp_stack[-1].seen_else = True
+          self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack)
+
+        # Restore the stack to how it was before the #if
+        self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if)
+      else:
+        # TODO(unknown): unexpected #else, issue warning?
+        pass
+    elif Match(r'^\s*#\s*endif\b', line):
+      # End of #if or #else blocks.
+      if self.pp_stack:
+        # If we saw an #else, we will need to restore the nesting
+        # stack to its former state before the #else, otherwise we
+        # will just continue from where we left off.
+        if self.pp_stack[-1].seen_else:
+          # Here we can just use a shallow copy since we are the last
+          # reference to it.
+          self.stack = self.pp_stack[-1].stack_before_else
+        # Drop the corresponding #if
+        self.pp_stack.pop()
+      else:
+        # TODO(unknown): unexpected #endif, issue warning?
+        pass
+
+  def Update(self, filename, clean_lines, linenum, error):
+    """Update nesting state with current line.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    line = clean_lines.elided[linenum]
+
+    # Update pp_stack first
+    self.UpdatePreprocessor(line)
+
+    # Count parentheses.  This is to avoid adding struct arguments to
+    # the nesting stack.
+    if self.stack:
+      inner_block = self.stack[-1]
+      depth_change = line.count('(') - line.count(')')
+      inner_block.open_parentheses += depth_change
+
+      # Also check if we are starting or ending an inline assembly block.
+      if inner_block.inline_asm in (_NO_ASM, _END_ASM):
+        if (depth_change != 0 and
+            inner_block.open_parentheses == 1 and
+            _MATCH_ASM.match(line)):
+          # Enter assembly block
+          inner_block.inline_asm = _INSIDE_ASM
+        else:
+          # Not entering assembly block.  If previous line was _END_ASM,
+          # we will now shift to _NO_ASM state.
+          inner_block.inline_asm = _NO_ASM
+      elif (inner_block.inline_asm == _INSIDE_ASM and
+            inner_block.open_parentheses == 0):
+        # Exit assembly block
+        inner_block.inline_asm = _END_ASM
+
+    # Consume namespace declaration at the beginning of the line.  Do
+    # this in a loop so that we catch same line declarations like this:
+    #   namespace proto2 { namespace bridge { class MessageSet; } }
+    while True:
+      # Match start of namespace.  The "\b\s*" below catches namespace
+      # declarations even if they aren't followed by whitespace; this
+      # is so that we don't confuse our namespace checker.  The
+      # missing spaces will be flagged by CheckSpacing.
+      namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)
+      if not namespace_decl_match:
+        break
+
+      new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum)
+      self.stack.append(new_namespace)
+
+      line = namespace_decl_match.group(2)
+      if line.find('{') != -1:
+        new_namespace.seen_open_brace = True
+        line = line[line.find('{') + 1:]
+
+    # Look for a class declaration in whatever is left of the line
+    # after parsing namespaces.  The regexp accounts for decorated classes
+    # such as in:
+    #   class LOCKABLE API Object {
+    #   };
+    #
+    # Templates with class arguments may confuse the parser, for example:
+    #   template <class T,
+    #             class Comparator = less<T>,
+    #             class Vector = vector<T> >
+    #   class HeapQueue {
+    #
+    # Because this parser has no nesting state about templates, by the
+    # time it saw "class Comparator", it may think that it's a new class.
+    # Nested templates have a similar problem:
+    #   template <
+    #       typename ExportedType,
+    #       typename TupleType,
+    #       template <typename, typename> class ImplTemplate>
+    #
+    # To avoid these cases, we ignore classes that are followed by '=' or '>'
+    class_decl_match = Match(
+        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
+        r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
+        r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line)
+    if (class_decl_match and
+        (not self.stack or self.stack[-1].open_parentheses == 0)):
+      self.stack.append(_ClassInfo(
+          class_decl_match.group(4), class_decl_match.group(2),
+          clean_lines, linenum))
+      line = class_decl_match.group(5)  # whatever follows the class name
+
+    # If we have not yet seen the opening brace for the innermost block,
+    # run checks here.
+    if not self.SeenOpenBrace():
+      self.stack[-1].CheckBegin(filename, clean_lines, linenum, error)
+
+    # Update access control if we are inside a class/struct
+    if self.stack and isinstance(self.stack[-1], _ClassInfo):
+      classinfo = self.stack[-1]
+      access_match = Match(
+          r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?'
+          r':(?:[^:]|$)',
+          line)
+      if access_match:
+        classinfo.access = access_match.group(2)
+
+        # Check that access keywords are indented +1 space.  Skip this
+        # check if the keywords are not preceded by whitespaces.
+        indent = access_match.group(1)
+        if (len(indent) != classinfo.class_indent + 1 and
+            Match(r'^\s*$', indent)):
+          if classinfo.is_struct:
+            parent = 'struct ' + classinfo.name
+          else:
+            parent = 'class ' + classinfo.name
+          slots = ''
+          if access_match.group(3):
+            slots = access_match.group(3)
+          error(filename, linenum, 'whitespace/indent', 3,
+                '%s%s: should be indented +1 space inside %s' % (
+                    access_match.group(2), slots, parent))
+
+    # Consume braces or semicolons from what's left of the line
+    while True:
+      # Match first brace, semicolon, or closed parenthesis.
+      matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line)
+      if not matched:
+        break
+
+      token = matched.group(1)
+      if token == '{':
+        # If namespace or class hasn't seen an opening brace yet, mark
+        # namespace/class head as complete.  Push a new block onto the
+        # stack otherwise.
+        if not self.SeenOpenBrace():
+          self.stack[-1].seen_open_brace = True
+        else:
+          self.stack.append(_BlockInfo(True))
+          if _MATCH_ASM.match(line):
+            self.stack[-1].inline_asm = _BLOCK_ASM
+      elif token == ';' or token == ')':
+        # If we haven't seen an opening brace yet, but we already saw
+        # a semicolon, this is probably a forward declaration.  Pop
+        # the stack for these.
+        #
+        # Similarly, if we haven't seen an opening brace yet, but we
+        # already saw a closing parenthesis, then these are probably
+        # function arguments with extra "class" or "struct" keywords.
+        # Also pop the stack for these.
+        if not self.SeenOpenBrace():
+          self.stack.pop()
+      else:  # token == '}'
+        # Perform end of block checks and pop the stack.
+        if self.stack:
+          self.stack[-1].CheckEnd(filename, clean_lines, linenum, error)
+          self.stack.pop()
+      line = matched.group(2)
+
+  def InnermostClass(self):
+    """Get the class info nearest to the top of the stack.
+
+    Returns:
+      A _ClassInfo object if we are inside a class, or None otherwise.
+    """
+    for i in range(len(self.stack), 0, -1):
+      classinfo = self.stack[i - 1]
+      if isinstance(classinfo, _ClassInfo):
+        return classinfo
+    return None
+
+  def CheckCompletedBlocks(self, filename, error):
+    """Checks that all classes and namespaces have been completely parsed.
+
+    Call this when all lines in a file have been processed.
+    Args:
+      filename: The name of the current file.
+      error: The function to call with any errors found.
+    """
+    # Note: This test can result in false positives if #ifdef constructs
+    # get in the way of brace matching. See the testBuildClass test in
+    # cpplint_unittest.py for an example of this.
+    for obj in self.stack:
+      if isinstance(obj, _ClassInfo):
+        error(filename, obj.starting_linenum, 'build/class', 5,
+              'Failed to find complete declaration of class %s' %
+              obj.name)
+      elif isinstance(obj, _NamespaceInfo):
+        error(filename, obj.starting_linenum, 'build/namespaces', 5,
+              'Failed to find complete declaration of namespace %s' %
+              obj.name)
+
+
+def CheckForNonStandardConstructs(filename, clean_lines, linenum,
+                                  nesting_state, error):
+  r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2.
+
+  Complain about several constructs which gcc-2 accepts, but which are
+  not standard C++.  Warning about these in lint is one way to ease the
+  transition to new compilers.
+  - put storage class first (e.g. "static const" instead of "const static").
+  - "%lld" instead of "%qd" in printf-type functions.
+  - "%1$d" is non-standard in printf-type functions.
+  - "\%" is an undefined character escape sequence.
+  - text after #endif is not allowed.
+  - invalid inner-style forward declaration.
+  - >? and <? operators, and their >?= and <?= cousins.
+
+  Additionally, check for constructor/destructor style violations and reference
+  members, as it is very convenient to do so while checking for
+  gcc-2 compliance.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: A callable to which errors are reported, which takes 5 arguments:
+           filename, line number, category, error level, and message
+  """
+
+  # Remove comments from the line, but leave in strings for now.
+  line = clean_lines.lines[linenum]
+
+  if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line):
+    error(filename, linenum, 'runtime/printf_format', 3,
+          '%q in format strings is deprecated.  Use %ll instead.')
+
+  if Search(r'printf\s*\(.*".*%\d+\$', line):
+    error(filename, linenum, 'runtime/printf_format', 2,
+          '%N$ formats are unconventional.  Try rewriting to avoid them.')
+
+  # Remove escaped backslashes before looking for undefined escapes.
+  line = line.replace('\\\\', '')
+
+  if Search(r'("|\').*\\(%|\[|\(|{)', line):
+    error(filename, linenum, 'build/printf_format', 3,
+          '%, [, (, and { are undefined character escapes.  Unescape them.')
+
+  # For the rest, work with both comments and strings removed.
+  line = clean_lines.elided[linenum]
+
+  if Search(r'\b(const|volatile|void|char|short|int|long'
+            r'|float|double|signed|unsigned'
+            r'|schar|u?int8|u?int16|u?int32|u?int64)'
+            r'\s+(register|static|extern|typedef)\b',
+            line):
+    error(filename, linenum, 'build/storage_class', 5,
+          'Storage class (static, extern, typedef, etc) should be first.')
+
+  if Match(r'\s*#\s*endif\s*[^/\s]+', line):
+    error(filename, linenum, 'build/endif_comment', 5,
+          'Uncommented text after #endif is non-standard.  Use a comment.')
+
+  if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line):
+    error(filename, linenum, 'build/forward_decl', 5,
+          'Inner-style forward declarations are invalid.  Remove this line.')
+
+  if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?',
+            line):
+    error(filename, linenum, 'build/deprecated', 3,
+          '>? and <? (max and min) operators are non-standard and deprecated.')
+
+  if Search(r'^\s*const\s*string\s*&\s*\w+\s*;', line):
+    # TODO(unknown): Could it be expanded safely to arbitrary references,
+    # without triggering too many false positives? The first
+    # attempt triggered 5 warnings for mostly benign code in the regtest, hence
+    # the restriction.
+    # Here's the original regexp, for the reference:
+    # type_name = r'\w+((\s*::\s*\w+)|(\s*<\s*\w+?\s*>))?'
+    # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;'
+    error(filename, linenum, 'runtime/member_string_references', 2,
+          'const string& members are dangerous. It is much better to use '
+          'alternatives, such as pointers or simple constants.')
+
+  # Everything else in this function operates on class declarations.
+  # Return early if the top of the nesting stack is not a class, or if
+  # the class head is not completed yet.
+  classinfo = nesting_state.InnermostClass()
+  if not classinfo or not classinfo.seen_open_brace:
+    return
+
+  # The class may have been declared with namespace or classname qualifiers.
+  # The constructor and destructor will not have those qualifiers.
+  base_classname = classinfo.name.split('::')[-1]
+
+  # Look for single-argument constructors that aren't marked explicit.
+  # Technically a valid construct, but against style.  Copy constructors
+  # (a single reference to the class itself) and "(void)" are exempt.
+  args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)'
+               % re.escape(base_classname),
+               line)
+  if (args and
+      args.group(1) != 'void' and
+      not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&'
+                % re.escape(base_classname), args.group(1).strip())):
+    error(filename, linenum, 'runtime/explicit', 5,
+          'Single-argument constructors should be marked explicit.')
+
+
+def CheckSpacingForFunctionCall(filename, line, linenum, error):
+  """Checks the spacing around the parentheses of function calls.
+
+  Args:
+    filename: The name of the current file.
+    line: The text of the line to check.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  # Since function calls often occur inside if/for/while/switch
+  # expressions - which have their own, more liberal conventions - we
+  # first see if we should be looking inside such an expression for a
+  # function call, to which we can apply more strict standards.
+  fncall = line    # if there's no control flow construct, look at whole line
+  for pattern in (r'\bif\s*\((.*)\)\s*{',
+                  r'\bfor\s*\((.*)\)\s*{',
+                  r'\bwhile\s*\((.*)\)\s*[{;]',
+                  r'\bswitch\s*\((.*)\)\s*{'):
+    match = Search(pattern, line)
+    if match:
+      fncall = match.group(1)    # look inside the parens for function calls
+      break
+
+  # Except in if/for/while/switch, there should never be space
+  # immediately inside parens (eg "f( 3, 4 )").  We make an exception
+  # for nested parens ( (a+b) + c ).  Likewise, there should never be
+  # a space before a ( when it's a function argument.  I assume it's a
+  # function argument when the char before the whitespace is legal in
+  # a function name (alnum + _) and we're not starting a macro. Also ignore
+  # pointers and references to arrays and functions because they're too tricky:
+  # we use a very simple way to recognize these:
+  # " (something)(maybe-something)" or
+  # " (something)(maybe-something," or
+  # " (something)[something]"
+  # Note that we assume the contents of [] to be short enough that
+  # they'll never need to wrap.
+  if (  # Ignore control structures.
+      not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b',
+                 fncall) and
+      # Ignore pointers/references to functions.
+      not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and
+      # Ignore pointers/references to arrays.
+      not Search(r' \([^)]+\)\[[^\]]+\]', fncall)):
+    if Search(r'\w\s*\(\s(?!\s*\\$)', fncall):      # a ( used for a fn call
+      error(filename, linenum, 'whitespace/parens', 4,
+            'Extra space after ( in function call')
+    elif Search(r'\(\s+(?!(\s*\\)|\()', fncall):
+      error(filename, linenum, 'whitespace/parens', 2,
+            'Extra space after (')
+    if (Search(r'\w\s+\(', fncall) and
+        not Search(r'#\s*define|typedef', fncall) and
+        not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)):
+      error(filename, linenum, 'whitespace/parens', 4,
+            'Extra space before ( in function call')
+    # If the ) is followed only by a newline or a { + newline, assume it's
+    # part of a control statement (if/while/etc), and don't complain.
+    if Search(r'[^)]\s+\)\s*[^{\s]', fncall):
+      # If the closing parenthesis is preceded only by whitespace,
+      # try to give a more descriptive error message.
+      if Search(r'^\s+\)', fncall):
+        error(filename, linenum, 'whitespace/parens', 2,
+              'Closing ) should be moved to the previous line')
+      else:
+        error(filename, linenum, 'whitespace/parens', 2,
+              'Extra space before )')
+
+
+def IsBlankLine(line):
+  """Tells whether a line is empty or made up solely of whitespace.
+
+  Args:
+    line: The text of a single line.
+
+  Returns:
+    True if the line is empty or whitespace-only, False otherwise.
+  """
+  if not line:
+    # Empty (or otherwise falsy) input counts as blank.
+    return True
+  return line.isspace()
+
+
+def CheckForFunctionLengths(filename, clean_lines, linenum,
+                            function_state, error):
+  """Reports overly long function bodies.
+
+  For an overview why this is done, see:
+  http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
+
+  Uses a simplistic algorithm assuming other style guidelines
+  (especially spacing) are followed.
+  Only checks unindented functions, so class members are unchecked.
+  Trivial bodies are unchecked, so constructors with huge initializer lists
+  may be missed.
+  Blank/comment lines are not counted so as to avoid encouraging the removal
+  of vertical space and comments just to get through a lint check.
+  NOLINT *on the last line of a function* disables this check.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    function_state: Current function name and lines in body so far.
+    error: The function to call with any errors found.
+  """
+  lines = clean_lines.lines
+  line = lines[linenum]
+  # Accumulates the function header across lines so that the arguments of
+  # TEST-style macros can be appended to the reported function name.
+  joined_line = ''
+
+  starting_func = False
+  regexp = r'(\w(\w|::|\*|\&|\s)*)\('  # decls * & space::name( ...
+  match_result = Match(regexp, line)
+  if match_result:
+    # If the name is all caps and underscores, figure it's a macro and
+    # ignore it, unless it's TEST or TEST_F.
+    function_name = match_result.group(1).split()[-1]
+    if (function_name in ('TEST', 'TEST_F') or
+        not Match(r'[A-Z_]+$', function_name)):
+      starting_func = True
+
+  if starting_func:
+    body_found = False
+    for start_linenum in xrange(linenum, clean_lines.NumLines()):
+      start_line = lines[start_linenum]
+      joined_line += ' ' + start_line.lstrip()
+      if Search(r'(;|})', start_line):  # Declarations and trivial functions
+        body_found = True
+        break                              # ... ignore
+      elif Search(r'{', start_line):
+        body_found = True
+        function = Search(r'((\w|:)*)\(', line).group(1)
+        if Match(r'TEST', function):    # Handle TEST... macros
+          parameter_regexp = Search(r'(\(.*\))', joined_line)
+          if parameter_regexp:             # Ignore bad syntax
+            function += parameter_regexp.group(1)
+        else:
+          function += '()'
+        function_state.Begin(function)
+        break
+    if not body_found:
+      # No body for the function (or evidence of a non-function) was found.
+      error(filename, linenum, 'readability/fn_size', 5,
+            'Lint failed to find start of function body.')
+  elif Match(r'^\}\s*$', line):  # function end
+    function_state.Check(error, filename, linenum)
+    function_state.End()
+  elif not Match(r'^\s*$', line):
+    function_state.Count()  # Count non-blank/non-comment lines.
+
+
+_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?')
+
+
+def CheckComment(comment, filename, linenum, error):
+  """Flags malformed TODO markers at the start of a // comment.
+
+  Args:
+    comment: Comment text, beginning with the // that introduces it.
+    filename: The name of the current file.
+    linenum: The number of the line being checked.
+    error: The function to call with any errors found.
+  """
+  todo = _RE_PATTERN_TODO.match(comment)
+  if not todo:
+    return
+  spaces_before, owner, spaces_after = todo.groups()
+
+  # A single space before TODO is right; none at all is reported elsewhere.
+  if len(spaces_before) > 1:
+    error(filename, linenum, 'whitespace/todo', 2,
+          'Too many spaces before TODO')
+
+  if not owner:
+    error(filename, linenum, 'readability/todo', 2,
+          'Missing username in TODO; it should look like '
+          '"// TODO(my_username): Stuff."')
+
+  # Only one space, or end of comment, may follow; an unmatched group is None.
+  if spaces_after not in (' ', ''):
+    error(filename, linenum, 'whitespace/todo', 2,
+          'TODO(my_username) should be followed by a space')
+
+def CheckAccess(filename, clean_lines, linenum, nesting_state, error):
+  """Verifies that DISALLOW* macros sit in a class's private: section.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  # Match against the elided line so comments and strings cannot trigger it.
+  macro = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
+                 r'DISALLOW_EVIL_CONSTRUCTORS|'
+                 r'DISALLOW_IMPLICIT_CONSTRUCTORS)'),
+                clean_lines.elided[linenum])
+  if not macro:
+    return
+
+  blocks = nesting_state.stack
+  if not blocks or not isinstance(blocks[-1], _ClassInfo):
+    # The macro appears outside a class declaration, or perhaps inside a
+    # function when it belonged in the class declaration.  No warning is
+    # issued: such code has probably failed to compile already.
+    return
+  innermost = blocks[-1]
+  if innermost.access != 'private':
+    error(filename, linenum, 'readability/constructors', 3,
+          '%s must be in the private: section' % macro.group(1))
+
+
+def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
+  """Find the corresponding > to close a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_suffix: Remainder of the current line after the initial <.
+
+  Returns:
+    True if a matching > is found or presumed to exist, False otherwise.
+  """
+  line = init_suffix
+  nesting_stack = ['<']
+  while True:
+    # Find the next operator that can tell us whether < is used as an
+    # opening bracket or as a less-than operator.  We only want to
+    # warn on the latter case.
+    #
+    # We could also check all other operators and terminate the search
+    # early, e.g. if we got something like this "a<b+c", the "<" is
+    # most likely a less-than operator, but then we will get false
+    # positives for default arguments and other template expressions.
+    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(1)
+      line = match.group(2)
+
+      if nesting_stack[-1] == '<':
+        # Expecting closing angle bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator == '>':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma after a bracket, this is most likely a template
+          # argument.  We have not seen a closing angle bracket yet, but
+          # it's probably a few lines later if we look for it, so just
+          # return early here.
+          return True
+        else:
+          # Got some other operator: one of ) ] or ;
+          return False
+
+      else:
+        # Expecting closing parenthesis or closing bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator in (')', ']'):
+          # We don't bother checking for matching () or [].  If we got
+          # something like (] or [), it would have been a syntax error.
+          nesting_stack.pop()
+
+    else:
+      # Scan the next line
+      linenum += 1
+      if linenum >= len(clean_lines.elided):
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all remaining lines and still no matching angle bracket.
+  # Most likely the input was incomplete, otherwise we should have
+  # seen a semicolon and returned early.
+  return True
+
+
+def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
+  """Find the corresponding < that started a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_prefix: Part of the current line before the initial >.
+
+  Returns:
+    True if a matching < is found or presumed to exist, False otherwise.
+  """
+  line = init_prefix
+  nesting_stack = ['>']
+  while True:
+    # Find the previous operator, i.e. the last one on the remaining text
+    match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(2)
+      line = match.group(1)
+
+      if nesting_stack[-1] == '>':
+        # Expecting opening angle bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator == '<':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma before a bracket, this is most likely a
+          # template argument.  The opening angle bracket is probably
+          # there if we look for it, so just return early here.
+          return True
+        else:
+          # Got some other operator: one of ( [ or ;
+          return False
+
+      else:
+        # Expecting opening parenthesis or opening bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator in ('(', '['):
+          nesting_stack.pop()
+
+    else:
+      # Scan the previous line
+      linenum -= 1
+      if linenum < 0:
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all earlier lines and still no matching angle bracket.
+  return False
+
+
+def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for the correctness of various spacing issues in the code.
+
+  Things we check for: spaces around operators, spaces after
+  if/for/while/switch, no spaces around parens in function calls, two
+  spaces between code and comment, don't start a block with a blank
+  line, don't end a function with a blank line, don't add a blank line
+  after public/protected/private, don't have too many blank lines in a row.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+
+  # Don't use "elided" lines here, otherwise we can't check commented lines.
+  # Don't want to use "raw" either, because we don't want to check inside C++11
+  # raw strings.
+  raw = clean_lines.lines_without_raw_strings
+  line = raw[linenum]
+
+  # Before nixing comments, check if the line is blank for no good
+  # reason.  This includes the first line after a block is opened, and
+  # blank lines at the end of a function (ie, right before a line like '}').
+  #
+  # Skip all the blank line checks if we are immediately inside a
+  # namespace body.  In other words, don't issue blank line warnings
+  # for this block:
+  #   namespace {
+  #
+  #   }
+  #
+  # A warning about missing end of namespace comments will be issued instead.
+  if IsBlankLine(line) and not nesting_state.InNamespaceBody():
+    elided = clean_lines.elided
+    prev_line = elided[linenum - 1]
+    prevbrace = prev_line.rfind('{')
+    # TODO(unknown): Don't complain if line before blank line, and line after,
+    #                both start with alnums and are indented the same amount.
+    #                This ignores whitespace at the start of a namespace block
+    #                because those are not usually indented.
+    if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1:
+      # OK, we have a blank line at the start of a code block.  Before we
+      # complain, we check if it is an exception to the rule: The previous
+      # non-empty line has the parameters of a function header that are indented
+      # 4 spaces (because they did not fit in a 80 column line when placed on
+      # the same line as the function name).  We also check for the case where
+      # the previous line is indented 6 spaces, which may happen when the
+      # initializers of a constructor do not fit into a 80 column line.
+      exception = False
+      if Match(r' {6}\w', prev_line):  # Initializer list?
+        # We are looking for the opening column of initializer list, which
+        # should be indented 4 spaces to cause 6 space indentation afterwards.
+        search_position = linenum-2
+        while (search_position >= 0
+               and Match(r' {6}\w', elided[search_position])):
+          search_position -= 1
+        exception = (search_position >= 0
+                     and elided[search_position][:5] == '    :')
+      else:
+        # Search for the function arguments or an initializer list.  We use a
+        # simple heuristic here: If the line is indented 4 spaces; and we have a
+        # closing paren, without the opening paren, followed by an opening brace
+        # or colon (for initializer lists) we assume that it is the last line of
+        # a function header.  If we have a colon indented 4 spaces, it is an
+        # initializer list.
+        exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)',
+                           prev_line)
+                     or Match(r' {4}:', prev_line))
+
+      if not exception:
+        error(filename, linenum, 'whitespace/blank_line', 2,
+              'Redundant blank line at the start of a code block '
+              'should be deleted.')
+    # Ignore blank lines at the end of a block in a long if-else
+    # chain, like this:
+    #   if (condition1) {
+    #     // Something followed by a blank line
+    #
+    #   } else if (condition2) {
+    #     // Something else
+    #   }
+    if linenum + 1 < clean_lines.NumLines():
+      next_line = raw[linenum + 1]
+      if (next_line
+          and Match(r'\s*}', next_line)
+          and next_line.find('} else ') == -1):
+        error(filename, linenum, 'whitespace/blank_line', 3,
+              'Redundant blank line at the end of a code block '
+              'should be deleted.')
+
+    matched = Match(r'\s*(public|protected|private):', prev_line)
+    if matched:
+      error(filename, linenum, 'whitespace/blank_line', 3,
+            'Do not leave a blank line after "%s:"' % matched.group(1))
+
+  # Next, we complain if there's a comment too near the text
+  commentpos = line.find('//')
+  if commentpos != -1:
+    # Check if the // may be in quotes.  If so, ignore it
+    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
+    if (line.count('"', 0, commentpos) -
+        line.count('\\"', 0, commentpos)) % 2 == 0:   # not in quotes
+      # Allow one space for new scopes, two spaces otherwise:
+      if (not Match(r'^\s*{ //', line) and
+          ((commentpos >= 1 and
+            line[commentpos-1] not in string.whitespace) or
+           (commentpos >= 2 and
+            line[commentpos-2] not in string.whitespace))):
+        error(filename, linenum, 'whitespace/comments', 2,
+              'At least two spaces is best between code and comments')
+      # There should always be a space between the // and the comment
+      commentend = commentpos + 2
+      if commentend < len(line) and not line[commentend] == ' ':
+        # but some lines are exceptions -- e.g. if they're big
+        # comment delimiters like:
+        # //----------------------------------------------------------
+        # or are an empty C++ style Doxygen comment, like:
+        # ///
+        # or C++ style Doxygen comments placed after the variable:
+        # ///<  Header comment
+        # //!<  Header comment
+        # or they begin with multiple slashes followed by a space:
+        # //////// Header comment
+        match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or
+                 Search(r'^/$', line[commentend:]) or
+                 Search(r'^!< ', line[commentend:]) or
+                 Search(r'^/< ', line[commentend:]) or
+                 Search(r'^/+ ', line[commentend:]))
+        if not match:
+          error(filename, linenum, 'whitespace/comments', 4,
+                'Should have a space between // and comment')
+      CheckComment(line[commentpos:], filename, linenum, error)
+
+  line = clean_lines.elided[linenum]  # get rid of comments and strings
+
+  # Don't try to do spacing checks for operator methods
+  line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line)
+
+  # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )".
+  # Otherwise not.  Note we only check for non-spaces on *both* sides;
+  # sometimes people put non-spaces on one side when aligning ='s among
+  # many lines (not that this is behavior that I approve of...)
+  if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line):
+    error(filename, linenum, 'whitespace/operators', 4,
+          'Missing spaces around =')
+
+  # It's ok not to have spaces around binary operators like + - * /, but if
+  # there's too little whitespace, we get concerned.  It's hard to tell,
+  # though, so we punt on this one for now.  TODO.
+
+  # You should always have whitespace around binary operators.
+  #
+  # Check <= and >= first to avoid false positives with < and >, then
+  # check non-include lines for spacing around < and >.
+  match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around %s' % match.group(1))
+  # We allow no-spaces around << when used like this: 10<<20, but
+  # not otherwise (particularly, not when used as streams)
+  # Also ignore using ns::operator<<;
+  match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
+  if (match and
+      not (match.group(1).isdigit() and match.group(2).isdigit()) and
+      not (match.group(1) == 'operator' and match.group(2) == ';')):
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around <<')
+  elif not Match(r'#.*include', line):
+    # Avoid false positives on ->
+    reduced_line = line.replace('->', '')
+
+    # Look for < that is not surrounded by spaces.  This is only
+    # triggered if both sides are missing spaces, even though
+    # technically we should flag if at least one side is missing a
+    # space.  This is done to avoid some false positives with shifts.
+    match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
+    if (match and
+        not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around <')
+
+    # Look for > that is not surrounded by spaces.  Similar to the
+    # above, we only trigger if both sides are missing spaces to avoid
+    # false positives with shifts.
+    match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
+    if (match and
+        not FindPreviousMatchingAngleBracket(clean_lines, linenum,
+                                             match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around >')
+
+  # We allow no-spaces around >> for almost anything.  This is because
+  # C++11 allows ">>" to close nested templates, which accounts for
+  # most cases when ">>" is not followed by a space.
+  #
+  # We still warn on ">>" followed by alpha character, because that is
+  # likely due to ">>" being used for right shifts, e.g.:
+  #   value >> alpha
+  #
+  # When ">>" is used to close templates, the alphanumeric letter that
+  # follows would be part of an identifier, and there should still be
+  # a space separating the template type and the identifier.
+  #   type<type<type>> alpha
+  match = Search(r'>>[a-zA-Z_]', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around >>')
+
+  # There shouldn't be space around unary operators
+  match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 4,
+          'Extra space for operator %s' % match.group(1))
+
+  # A pet peeve of mine: no spaces after an if, while, switch, or for
+  match = Search(r' (if\(|for\(|while\(|switch\()', line)
+  if match:
+    error(filename, linenum, 'whitespace/parens', 5,
+          'Missing space before ( in %s' % match.group(1))
+
+  # For if/for/while/switch, the left and right parens should be
+  # consistent about how many spaces are inside the parens, and
+  # there should either be zero or one spaces inside the parens.
+  # We don't want: "if ( foo)" or "if ( foo   )".
+  # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed.
+  match = Search(r'\b(if|for|while|switch)\s*'
+                 r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$',
+                 line)
+  if match:
+    if len(match.group(2)) != len(match.group(4)):
+      if not (match.group(3) == ';' and
+              len(match.group(2)) == 1 + len(match.group(4)) or
+              not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)):
+        error(filename, linenum, 'whitespace/parens', 5,
+              'Mismatching spaces inside () in %s' % match.group(1))
+    if len(match.group(2)) not in [0, 1]:
+      error(filename, linenum, 'whitespace/parens', 5,
+            'Should have zero or one spaces inside ( and ) in %s' %
+            match.group(1))
+
+  # You should always have a space after a comma (either as fn arg or operator)
+  #
+  # This does not apply when the non-space character following the
+  # comma is another comma, since the only time when that happens is
+  # for empty macro arguments.
+  #
+  # We run this check in two passes: first pass on elided lines to
+  # verify that lines contain missing whitespaces, second pass on raw
+  # lines to confirm that those missing whitespaces are not due to
+  # elided comments.
+  if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]):
+    error(filename, linenum, 'whitespace/comma', 3,
+          'Missing space after ,')
+
+  # You should always have a space after a semicolon
+  # except for few corner cases
+  # TODO(unknown): clarify whether 'if (1) { return 1;}' requires one more
+  # space after ;
+  if Search(r';[^\s};\\)/]', line):
+    error(filename, linenum, 'whitespace/semicolon', 3,
+          'Missing space after ;')
+
+  # Next we will look for issues with function calls.
+  CheckSpacingForFunctionCall(filename, line, linenum, error)
+
+  # Except after an opening paren, or after another opening brace (in case of
+  # an initializer list, for instance), you should have spaces before your
+  # braces. And since you should never have braces at the beginning of a line,
+  # this is an easy test.
+  match = Match(r'^(.*[^ ({]){', line)
+  if match:
+    # Try a bit harder to check for brace initialization.  This
+    # happens in one of the following forms:
+    #   Constructor() : initializer_list_{} { ... }
+    #   Constructor{}.MemberFunction()
+    #   Type variable{};
+    #   FunctionCall(type{}, ...);
+    #   LastArgument(..., type{});
+    #   LOG(INFO) << type{} << " ...";
+    #   map_of_type[{...}] = ...;
+    #
+    # We check for the character following the closing brace, and
+    # silence the warning if it's one of those listed above, i.e.
+    # "{.;,)<]".
+    #
+    # To account for nested initializer list, we allow any number of
+    # closing braces up to "{.;,)<]".  We can't simply silence the
+    # warning on first sight of closing brace, because that would
+    # cause false negatives for things that are not initializer lists.
+    #   Silence this:         But not this:
+    #     Outer{                if (...) {
+    #       Inner{...}            if (...){  // Missing space before {
+    #     };                    }
+    #
+    # There is a false negative with this approach if people inserted
+    # spurious semicolons, e.g. "if (cond){};", but we will catch the
+    # spurious semicolon with a separate check.
+    (endline, endlinenum, endpos) = CloseExpression(
+        clean_lines, linenum, len(match.group(1)))
+    trailing_text = ''
+    if endpos > -1:
+      trailing_text = endline[endpos:]
+    for offset in xrange(endlinenum + 1,
+                         min(endlinenum + 3, clean_lines.NumLines() - 1)):
+      trailing_text += clean_lines.elided[offset]
+    if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text):
+      error(filename, linenum, 'whitespace/braces', 5,
+            'Missing space before {')
+
+  # Make sure '} else {' has spaces.
+  if Search(r'}else', line):
+    error(filename, linenum, 'whitespace/braces', 5,
+          'Missing space before else')
+
+  # You shouldn't have spaces before your brackets, except maybe after
+  # 'delete []' or 'new char * []'.
+  if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line):
+    error(filename, linenum, 'whitespace/braces', 5,
+          'Extra space before [')
+
+  # You shouldn't have a space before a semicolon at the end of the line.
+  # There's a special case for "for" since the style guide allows space before
+  # the semicolon there.
+  if Search(r':\s*;\s*$', line):
+    error(filename, linenum, 'whitespace/semicolon', 5,
+          'Semicolon defining empty statement. Use {} instead.')
+  elif Search(r'^\s*;\s*$', line):
+    error(filename, linenum, 'whitespace/semicolon', 5,
+          'Line contains only semicolon. If this should be an empty statement, '
+          'use {} instead.')
+  elif (Search(r'\s+;\s*$', line) and
+        not Search(r'\bfor\b', line)):
+    error(filename, linenum, 'whitespace/semicolon', 5,
+          'Extra space before last semicolon. If this should be an empty '
+          'statement, use {} instead.')
+
+  # In range-based for, we want spaces before and after the colon, but
+  # not around "::" tokens that might appear.
+  if (Search('for *\(.*[^:]:[^: ]', line) or
+      Search('for *\(.*[^: ]:[^:]', line)):
+    error(filename, linenum, 'whitespace/forcolon', 2,
+          'Missing space around colon in range-based for loop')
+
+
+def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
+  """Warns when an access specifier is not preceded by a blank line.
+
+  The only rule enforced here is that "public:", "protected:" and "private:"
+  labels inside a sufficiently large class should have a blank line above them.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    class_info: A _ClassInfo objects.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  class_start = class_info.starting_linenum
+
+  # Classes spanning 25 lines or fewer are exempt.  When the end of the class
+  # was never found, last_line is zero and this guard also bails out.
+  if class_info.last_line - class_start <= 24:
+    return
+
+  # Nothing to do on (or before) the line that opens the class; this covers
+  # one-liners such as "class Foo { public: ... };".
+  if linenum <= class_start:
+    return
+
+  access = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum])
+  if not access:
+    return
+
+  line_above = clean_lines.lines[linenum - 1]
+
+  # A blank line above is exactly what we want.
+  if IsBlankLine(line_above):
+    return
+
+  # A "class"/"struct" keyword above means we are at the start of the class
+  # or right after a forward declaration of an inner class.
+  if Search(r'\b(class|struct)\b', line_above):
+    return
+
+  # A trailing backslash above means we are inside a multi-line macro.
+  if Search(r'\\$', line_above):
+    return
+
+  # The class head may span several lines (multi-line base-specifier list),
+  # so locate the first line ending in "{" between the class start and here.
+  head_end = class_start
+  for candidate in range(class_start, linenum):
+    if Search(r'\{\s*$', clean_lines.lines[candidate]):
+      head_end = candidate
+      break
+
+  # Only complain when the specifier is not directly below the class head.
+  if head_end < linenum - 1:
+    error(filename, linenum, 'whitespace/blank_line', 3,
+          '"%s:" should be preceded by a blank line' % access.group(1))
+
+
+def GetPreviousNonBlankLine(clean_lines, linenum):
+  """Return the most recent non-blank line and its line number.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file contents.
+    linenum: The number of the line to check.
+
+  Returns:
+    A tuple with two elements.  The first element is the contents of the last
+    non-blank line before the current line, or the empty string if this is the
+    first non-blank line.  The second is the line number of that line, or -1
+    if this is the first non-blank line.
+  """
+
+  prevlinenum = linenum - 1
+  while prevlinenum >= 0:
+    prevline = clean_lines.elided[prevlinenum]
+    if not IsBlankLine(prevline):     # if not a blank line...
+      return (prevline, prevlinenum)
+    prevlinenum -= 1
+  return ('', -1)
+
+
+def CheckBraces(filename, clean_lines, linenum, error):
+  """Looks for misplaced braces (e.g. at the end of line).
+
+  The checks performed on the elided form of the line are, in order:
+    - an opening brace alone on its line,
+    - an "else" on the line after a closing brace,
+    - braces on only one side of an "else",
+    - an else clause or do/while body on the same line as its keyword,
+    - a redundant semicolon after a block's closing brace.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  line = clean_lines.elided[linenum]        # get rid of comments and strings
+
+  if Match(r'\s*{\s*$', line):
+    # We allow an open brace to start a line in the case where someone is using
+    # braces in a block to explicitly create a new scope, which is commonly used
+    # to control the lifetime of stack-allocated variables.  Braces are also
+    # used for brace initializers inside function calls.  We don't detect this
+    # perfectly: we just don't complain if the last non-whitespace character on
+    # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the
+    # previous line starts a preprocessor block.
+    prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
+    if (not Search(r'[,;:}{(]\s*$', prevline) and
+        not Match(r'\s*#', prevline)):
+      error(filename, linenum, 'whitespace/braces', 4,
+            '{ should almost always be at the end of the previous line')
+
+  # An else clause should be on the same line as the preceding closing brace.
+  # The word boundary keeps identifiers that merely begin with "else"
+  # (e.g. "elsewhere = 1;") from being mistaken for the keyword.
+  if Match(r'\s*else\b\s*', line):
+    prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
+    if Match(r'\s*}\s*$', prevline):
+      error(filename, linenum, 'whitespace/newline', 4,
+            'An else should appear on the same line as the preceding }')
+
+  # If braces come on one side of an else, they should be on both.
+  # However, we have to worry about "else if" that spans multiple lines!
+  if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
+    if Search(r'}\s*else if([^{]*)$', line):       # could be multi-line if
+      # find the ( after the if
+      pos = line.find('else if')
+      pos = line.find('(', pos)
+      if pos > 0:
+        (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
+        if endline[endpos:].find('{') == -1:    # must be brace after if
+          error(filename, linenum, 'readability/braces', 5,
+                'If an else has a brace on one side, it should have it on both')
+    else:            # common case: else not followed by a multi-line if
+      error(filename, linenum, 'readability/braces', 5,
+            'If an else has a brace on one side, it should have it on both')
+
+  # Likewise, an else should never have the else clause on the same line
+  if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line):
+    error(filename, linenum, 'whitespace/newline', 4,
+          'Else clause should never be on same line as else (use 2 lines)')
+
+  # In the same way, a do/while should never be on one line
+  if Match(r'\s*do [^\s{]', line):
+    error(filename, linenum, 'whitespace/newline', 4,
+          'do/while clauses should not be on a single line')
+
+  # Block bodies should not be followed by a semicolon.  Due to C++11
+  # brace initialization, there are more places where semicolons are
+  # required than not, so we use a whitelist approach to check these
+  # rather than a blacklist.  These are the places where "};" should
+  # be replaced by just "}":
+  # 1. Some flavor of block following closing parenthesis:
+  #    for (;;) {};
+  #    while (...) {};
+  #    switch (...) {};
+  #    Function(...) {};
+  #    if (...) {};
+  #    if (...) else if (...) {};
+  #
+  # 2. else block:
+  #    if (...) else {};
+  #
+  # 3. const member function:
+  #    Function(...) const {};
+  #
+  # 4. Block following some statement:
+  #    x = 42;
+  #    {};
+  #
+  # 5. Block at the beginning of a function:
+  #    Function(...) {
+  #      {};
+  #    }
+  #
+  #    Note that naively checking for the preceding "{" will also match
+  #    braces inside multi-dimensional arrays, but this is fine since
+  #    that expression will not contain semicolons.
+  #
+  # 6. Block following another block:
+  #    while (true) {}
+  #    {};
+  #
+  # 7. End of namespaces:
+  #    namespace {};
+  #
+  #    These semicolons seems far more common than other kinds of
+  #    redundant semicolons, possibly due to people converting classes
+  #    to namespaces.  For now we do not warn for this case.
+  #
+  # Try matching case 1 first.
+  match = Match(r'^(.*\)\s*)\{', line)
+  if match:
+    # Matched closing parenthesis (case 1).  Check the token before the
+    # matching opening parenthesis, and don't warn if it looks like a
+    # macro.  This avoids these false positives:
+    #  - macro that defines a base class
+    #  - multi-line macro that defines a base class
+    #  - macro that defines the whole class-head
+    #
+    # But we still issue warnings for macros that we know are safe to
+    # warn, specifically:
+    #  - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P
+    #  - TYPED_TEST
+    #  - INTERFACE_DEF
+    #  - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED:
+    #
+    # We implement a whitelist of safe macros instead of a blacklist of
+    # unsafe macros, even though the latter appears less frequently in
+    # google code and would have been easier to implement.  This is because
+    # the downside for getting the whitelist wrong means some extra
+    # semicolons, while the downside for getting the blacklist wrong
+    # would result in compile errors.
+    #
+    # In addition to macros, we also don't want to warn on compound
+    # literals.
+    closing_brace_pos = match.group(1).rfind(')')
+    opening_parenthesis = ReverseCloseExpression(
+        clean_lines, linenum, closing_brace_pos)
+    if opening_parenthesis[2] > -1:
+      # Text on the opening-parenthesis line that precedes the parenthesis.
+      line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]]
+      macro = Search(r'\b([A-Z_]+)\s*$', line_prefix)
+      # Suppress the check for unknown ALL_CAPS macros and for compound
+      # literals ("... = (type) {...};").  TEST_P is included in the safe
+      # list so that it agrees with the list documented above.
+      if ((macro and
+           macro.group(1) not in (
+               'TEST', 'TEST_F', 'TEST_P', 'MATCHER', 'MATCHER_P',
+               'TYPED_TEST',
+               'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED',
+               'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or
+          Search(r'\s+=\s*$', line_prefix)):
+        match = None
+
+  else:
+    # Try matching cases 2-3.
+    match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line)
+    if not match:
+      # Try matching cases 4-6.  These are always matched on separate lines.
+      #
+      # Note that we can't simply concatenate the previous line to the
+      # current line and do a single match, otherwise we may output
+      # duplicate warnings for the blank line case:
+      #   if (cond) {
+      #     // blank line
+      #   }
+      prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
+      if prevline and Search(r'[;{}]\s*$', prevline):
+        match = Match(r'^(\s*)\{', line)
+
+  # Check matching closing brace
+  if match:
+    # len(match.group(1)) is the column of the opening brace on this line.
+    (endline, endlinenum, endpos) = CloseExpression(
+        clean_lines, linenum, len(match.group(1)))
+    if endpos > -1 and Match(r'^\s*;', endline[endpos:]):
+      # Current {} pair is eligible for semicolon check, and we have found
+      # the redundant semicolon, output warning here.
+      #
+      # Note: because we are scanning forward for opening braces, and
+      # outputting warnings for the matching closing brace, if there are
+      # nested blocks with trailing semicolons, we will get the error
+      # messages in reversed order.
+      error(filename, endlinenum, 'readability/braces', 4,
+            "You don't need a ; after a }")
+
+
+def CheckEmptyBlockBody(filename, clean_lines, linenum, error):
+  """Flags loops and conditionals whose whole body is a lone semicolon.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  # The keyword must be the first thing on the line (leading whitespace
+  # aside).  That skips most do-while loops, whose "while" normally follows a
+  # closing brace.  "if" is included because an empty conditional body is
+  # likely a mistake.
+  line = clean_lines.elided[linenum]
+  keyword = Match(r'\s*(for|while|if)\s*\(', line)
+  if not keyword:
+    return
+
+  # Locate the parenthesis that closes the condition, possibly on a later line.
+  (end_line, end_linenum, end_pos) = CloseExpression(
+      clean_lines, linenum, line.find('('))
+
+  # Only a semicolon immediately after the closing parenthesis is reported.
+  # Whitespace or a newline before the semicolon is left to the separate
+  # "space before semicolon" check.
+  if end_pos < 0 or not Match(r';', end_line[end_pos:]):
+    return
+
+  if keyword.group(1) == 'if':
+    error(filename, end_linenum, 'whitespace/empty_conditional_body', 5,
+          'Empty conditional bodies should use {}')
+  else:
+    error(filename, end_linenum, 'whitespace/empty_loop_body', 5,
+          'Empty loop bodies should use {} or continue')
+
+
+def CheckCheck(filename, clean_lines, linenum, error):
+  """Checks the use of CHECK and EXPECT macros.
+
+  When a macro listed in _CHECK_MACROS is applied to a single relational
+  comparison in which at least one operand is a literal constant, suggests
+  the comparison-specific macro looked up in _CHECK_REPLACEMENT.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  # Decide the set of replacement macros that should be suggested
+  lines = clean_lines.elided
+  check_macro = None
+  start_pos = -1
+  for macro in _CHECK_MACROS:
+    # Cheap substring test first; the regular expression below confirms it.
+    i = lines[linenum].find(macro)
+    if i >= 0:
+      check_macro = macro
+
+      # Find opening parenthesis.  Do a regular expression match here
+      # to make sure that we are matching the expected CHECK macro, as
+      # opposed to some other macro that happens to contain the CHECK
+      # substring.
+      matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum])
+      if not matched:
+        continue
+      # Column of the "(" that follows the macro name.
+      start_pos = len(matched.group(1))
+      break
+  if not check_macro or start_pos < 0:
+    # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT'
+    return
+
+  # Find end of the boolean expression by matching parentheses
+  (last_line, end_line, end_pos) = CloseExpression(
+      clean_lines, linenum, start_pos)
+  if end_pos < 0:
+    return
+  # Collect the text between the macro's parentheses, joining lines when the
+  # expression spans more than one.
+  if linenum == end_line:
+    expression = lines[linenum][start_pos + 1:end_pos - 1]
+  else:
+    expression = lines[linenum][start_pos + 1:]
+    for i in xrange(linenum + 1, end_line):
+      expression += lines[i]
+    expression += last_line[0:end_pos - 1]
+
+  # Parse expression so that we can take parentheses into account.
+  # This avoids false positives for inputs like "CHECK((a < 4) == b)",
+  # which is not replaceable by CHECK_LE.
+  lhs = ''
+  rhs = ''
+  operator = None
+  while expression:
+    # NOTE(review): "<<" and ">>" are listed before "<<=" and ">>=", so the
+    # longer alternatives can never be selected; the trailing "=" is then
+    # consumed as an ordinary operand character by the fallback below.
+    matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||'
+                    r'==|!=|>=|>|<=|<|\()(.*)$', expression)
+    if matched:
+      token = matched.group(1)
+      if token == '(':
+        # Parenthesized operand
+        expression = matched.group(2)
+        (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')')
+        if end < 0:
+          return  # Unmatched parenthesis
+        lhs += '(' + expression[0:end]
+        expression = expression[end:]
+      elif token in ('&&', '||'):
+        # Logical and/or operators.  This means the expression
+        # contains more than one term, for example:
+        #   CHECK(42 < a && a < b);
+        #
+        # These are not replaceable with CHECK_LE, so bail out early.
+        return
+      elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'):
+        # Non-relational operator
+        lhs += token
+        expression = matched.group(2)
+      else:
+        # Relational operator
+        operator = token
+        rhs = matched.group(2)
+        break
+    else:
+      # Unparenthesized operand.  Instead of appending to lhs one character
+      # at a time, we do another regular expression match to consume several
+      # characters at once if possible.  Trivial benchmark shows that this
+      # is more efficient when the operands are longer than a single
+      # character, which is generally the case.
+      matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression)
+      if not matched:
+        # The next character is an operator character that did not form a
+        # recognized token; consume just that one character.
+        matched = Match(r'^(\s*\S)(.*)$', expression)
+        if not matched:
+          break
+      lhs += matched.group(1)
+      expression = matched.group(2)
+
+  # Only apply checks if we got all parts of the boolean expression
+  if not (lhs and operator and rhs):
+    return
+
+  # Check that rhs do not contain logical operators.  We already know
+  # that lhs is fine since the loop above parses out && and ||.
+  if rhs.find('&&') > -1 or rhs.find('||') > -1:
+    return
+
+  # At least one of the operands must be a constant literal.  This is
+  # to avoid suggesting replacements for unprintable things like
+  # CHECK(variable != iterator)
+  #
+  # The following pattern matches decimal, hex integers, strings, and
+  # characters (in that order).
+  lhs = lhs.strip()
+  rhs = rhs.strip()
+  match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$'
+  if Match(match_constant, lhs) or Match(match_constant, rhs):
+    # Note: since we know both lhs and rhs, we can provide a more
+    # descriptive error message like:
+    #   Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42)
+    # Instead of:
+    #   Consider using CHECK_EQ instead of CHECK(a == b)
+    #
+    # We are still keeping the less descriptive message because if lhs
+    # or rhs gets long, the error message might become unreadable.
+    error(filename, linenum, 'readability/check', 2,
+          'Consider using %s instead of %s(a %s b)' % (
+              _CHECK_REPLACEMENT[check_macro][operator],
+              check_macro, operator))
+
+
+def CheckAltTokens(filename, clean_lines, linenum, error):
+  """Reports alternative operator keywords used in boolean expressions.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  text = clean_lines.elided[linenum]
+
+  # Preprocessor directives are never checked.
+  if Match(r'^\s*#', text):
+    return
+
+  # Skip any line that opens or closes a C-style comment.  This does not
+  # help for a comment that began on an earlier line or ends on a later one,
+  # but it removes most false positives and gives people who put multi-line
+  # comments inside preprocessor macros a way to silence the warning.
+  #
+  # TODO(unknown): remove this once cpplint has better support for
+  # multi-line comments.
+  if '/*' in text or '*/' in text:
+    return
+
+  for hit in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(text):
+    alt_token = hit.group(1)
+    error(filename, linenum, 'readability/alt_tokens', 2,
+          'Use operator %s instead of %s' % (
+              _ALT_TOKEN_REPLACEMENT[alt_token], alt_token))
+
+
+def GetLineWidth(line):
+  """Determines the width of the line in column positions.
+
+  Args:
+    line: A string, which may be a Unicode string.
+
+  Returns:
+    The width of the line in column positions, accounting for Unicode
+    combining characters and wide characters.
+  """
+  if isinstance(line, unicode):
+    width = 0
+    for uc in unicodedata.normalize('NFC', line):
+      if unicodedata.east_asian_width(uc) in ('W', 'F'):
+        width += 2
+      elif not unicodedata.combining(uc):
+        width += 1
+    return width
+  else:
+    return len(line)
+
+
+def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
+               error):
+  """Checks rules from the 'C++ style rules' section of cppguide.html.
+
+  Most of these rules are hard to test (naming, comment style), but we
+  do what we can.  In particular we check for 2-space indents, line lengths,
+  tab usage, spaces inside code, etc.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    file_extension: The extension (without the dot) of the filename.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+
+  # Don't use "elided" lines here, otherwise we can't check commented lines.
+  # Don't want to use "raw" either, because we don't want to check inside C++11
+  # raw strings,
+  raw_lines = clean_lines.lines_without_raw_strings
+  line = raw_lines[linenum]
+
+  if line.find('\t') != -1:
+    error(filename, linenum, 'whitespace/tab', 1,
+          'Tab found; better to use spaces')
+
+  # One or three blank spaces at the beginning of the line is weird; it's
+  # hard to reconcile that with 2-space indents.
+  # NOTE: here are the conditions rob pike used for his tests.  Mine aren't
+  # as sophisticated, but it may be worth becoming so:  RLENGTH==initial_spaces
+  # if(RLENGTH > 20) complain = 0;
+  # if(match($0, " +(error|private|public|protected):")) complain = 0;
+  # if(match(prev, "&& *$")) complain = 0;
+  # if(match(prev, "\\|\\| *$")) complain = 0;
+  # if(match(prev, "[\",=><] *$")) complain = 0;
+  # if(match($0, " <<")) complain = 0;
+  # if(match(prev, " +for \\(")) complain = 0;
+  # if(prevodd && match(prevprev, " +for \\(")) complain = 0;
+  initial_spaces = 0
+  cleansed_line = clean_lines.elided[linenum]
+  while initial_spaces < len(line) and line[initial_spaces] == ' ':
+    initial_spaces += 1
+  if line and line[-1].isspace():
+    error(filename, linenum, 'whitespace/end_of_line', 4,
+          'Line ends in whitespace.  Consider deleting these extra spaces.')
+  # There are certain situations we allow one space, notably for section labels
+  elif ((initial_spaces == 1 or initial_spaces == 3) and
+        not Match(r'\s*\w+\s*:\s*$', cleansed_line)):
+    error(filename, linenum, 'whitespace/indent', 3,
+          'Weird number of spaces at line-start.  '
+          'Are you using a 2-space indent?')
+
+  # Check if the line is a header guard.
+  is_header_guard = False
+  if file_extension == 'h':
+    cppvar = GetHeaderGuardCPPVariable(filename)
+    if (line.startswith('#ifndef %s' % cppvar) or
+        line.startswith('#define %s' % cppvar) or
+        line.startswith('#endif  // %s' % cppvar)):
+      is_header_guard = True
+  # #include lines and header guards can be long, since there's no clean way to
+  # split them.
+  #
+  # URLs can be long too.  It's possible to split these, but it makes them
+  # harder to cut&paste.
+  #
+  # The "$Id:...$" comment may also get very long without it being the
+  # developers fault.
+  if (not line.startswith('#include') and not is_header_guard and
+      not Match(r'^\s*//.*http(s?)://\S*$', line) and
+      not Match(r'^// \$Id:.*#[0-9]+ \$$', line)):
+    line_width = GetLineWidth(line)
+    extended_length = int((_line_length * 1.25))
+    if line_width > extended_length:
+      error(filename, linenum, 'whitespace/line_length', 4,
+            'Lines should very rarely be longer than %i characters' %
+            extended_length)
+    elif line_width > _line_length:
+      error(filename, linenum, 'whitespace/line_length', 2,
+            'Lines should be <= %i characters long' % _line_length)
+
+  if (cleansed_line.count(';') > 1 and
+      # for loops are allowed two ;'s (and may run over two lines).
+      cleansed_line.find('for') == -1 and
+      (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or
+       GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and
+      # It's ok to have many commands in a switch case that fits in 1 line
+      not ((cleansed_line.find('case ') != -1 or
+            cleansed_line.find('default:') != -1) and
+           cleansed_line.find('break;') != -1)):
+    error(filename, linenum, 'whitespace/newline', 0,
+          'More than one command on the same line')
+
+  # Some more style checks
+  CheckBraces(filename, clean_lines, linenum, error)
+  CheckEmptyBlockBody(filename, clean_lines, linenum, error)
+  CheckAccess(filename, clean_lines, linenum, nesting_state, error)
+  CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
+  CheckCheck(filename, clean_lines, linenum, error)
+  CheckAltTokens(filename, clean_lines, linenum, error)
+  classinfo = nesting_state.InnermostClass()
+  if classinfo:
+    CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
+
+
+# Matches a quoted #include of a .h file whose path has no directory
+# component, e.g. '#include "bar.h"' but not '#include "foo/bar.h"'.
+_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"')
+# Matches any #include line.  Group 1 is the opening delimiter ('<' or '"')
+# and group 2 is the included path.
+_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$')
+# Matches the first component of a filename delimited by -s and _s. That is:
+#  _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo'
+#  _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo'
+#  _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo'
+#  _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo'
+_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+')
+
+
+def _DropCommonSuffixes(filename):
+  """Drops common suffixes like _test.cc or -inl.h from filename.
+
+  For example:
+    >>> _DropCommonSuffixes('foo/foo-inl.h')
+    'foo/foo'
+    >>> _DropCommonSuffixes('foo/bar/foo.cc')
+    'foo/bar/foo'
+    >>> _DropCommonSuffixes('foo/foo_internal.h')
+    'foo/foo'
+    >>> _DropCommonSuffixes('foo/foo_unusualinternal.h')
+    'foo/foo_unusualinternal'
+
+  Args:
+    filename: The input filename.
+
+  Returns:
+    The filename with the common suffix removed.
+  """
+  for suffix in ('test.cc', 'regtest.cc', 'unittest.cc',
+                 'inl.h', 'impl.h', 'internal.h'):
+    if (filename.endswith(suffix) and len(filename) > len(suffix) and
+        filename[-len(suffix) - 1] in ('-', '_')):
+      return filename[:-len(suffix) - 1]
+  return os.path.splitext(filename)[0]
+
+
+def _IsTestFilename(filename):
+  """Determines if the given filename has a suffix that identifies it as a test.
+
+  Args:
+    filename: The input filename.
+
+  Returns:
+    True if 'filename' looks like a test, False otherwise.
+  """
+  if (filename.endswith('_test.cc') or
+      filename.endswith('_unittest.cc') or
+      filename.endswith('_regtest.cc')):
+    return True
+  else:
+    return False
+
+
+def _ClassifyInclude(fileinfo, include, is_system):
+  """Figures out what kind of header 'include' is.
+
+  Args:
+    fileinfo: The current file cpplint is running over. A FileInfo instance.
+    include: The path to a #included file.
+    is_system: True if the #include used <> rather than "".
+
+  Returns:
+    One of the _XXX_HEADER constants.
+
+  For example:
+    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True)
+    _C_SYS_HEADER
+    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True)
+    _CPP_SYS_HEADER
+    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False)
+    _LIKELY_MY_HEADER
+    >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'),
+    ...                  'bar/foo_other_ext.h', False)
+    _POSSIBLE_MY_HEADER
+    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False)
+    _OTHER_HEADER
+  """
+  # This is a list of all standard c++ header files, except
+  # those already checked for above.
+  is_cpp_h = include in _CPP_HEADERS
+
+  if is_system:
+    if is_cpp_h:
+      return _CPP_SYS_HEADER
+    else:
+      return _C_SYS_HEADER
+
+  # If the target file and the include we're checking share a
+  # basename when we drop common extensions, and the include
+  # lives in . , then it's likely to be owned by the target file.
+  target_dir, target_base = (
+      os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName())))
+  include_dir, include_base = os.path.split(_DropCommonSuffixes(include))
+  if target_base == include_base and (
+      include_dir == target_dir or
+      include_dir == os.path.normpath(target_dir + '/../public')):
+    return _LIKELY_MY_HEADER
+
+  # If the target and include share some initial basename
+  # component, it's possible the target is implementing the
+  # include, so it's allowed to be first, but we'll never
+  # complain if it's not there.
+  target_first_component = _RE_FIRST_COMPONENT.match(target_base)
+  include_first_component = _RE_FIRST_COMPONENT.match(include_base)
+  if (target_first_component and include_first_component and
+      target_first_component.group(0) ==
+      include_first_component.group(0)):
+    return _POSSIBLE_MY_HEADER
+
+  return _OTHER_HEADER
+
+
+
+def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
+  """Applies the checks that are specific to #include lines.
+
+  Strings on #include lines are NOT removed from elided line, to make
+  certain tasks easier. However, to prevent false positives, checks
+  applicable to #include lines in CheckLanguage must be put here.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    include_state: An _IncludeState instance in which the headers are inserted.
+    error: The function to call with any errors found.
+  """
+  fileinfo = FileInfo(filename)
+  line = clean_lines.lines[linenum]
+
+  # Prefer "foo/bar.h" over a bare "bar.h".
+  if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line):
+    error(filename, linenum, 'build/include', 4,
+          'Include the directory when naming .h files')
+
+  # Everything below only applies to actual #include lines.  The pattern is
+  # anchored with '^', so a single search serves every remaining check.
+  include_match = _RE_PATTERN_INCLUDE.search(line)
+  if not include_match:
+    return
+
+  include = include_match.group(2)
+  is_system = (include_match.group(1) == '<')
+
+  if include in include_state:
+    # Including the same file twice is occasionally fine, but usually not.
+    error(filename, linenum, 'build/include', 4,
+          '"%s" already included at %s:%s' %
+          (include, filename, include_state[include]))
+  else:
+    include_state[include] = linenum
+
+    # We want to ensure that headers appear in the right order:
+    # 1) for foo.cc, foo.h  (preferred location)
+    # 2) c system files
+    # 3) cpp system files
+    # 4) for foo.cc, foo.h  (deprecated location)
+    # 5) other google headers
+    #
+    # Each include is classified as one of those types; include_state
+    # remembers the highest type seen so far and complains when a lower
+    # type shows up afterwards.
+    header_type = _ClassifyInclude(fileinfo, include, is_system)
+    order_problem = include_state.CheckNextIncludeOrder(header_type)
+    if order_problem:
+      error(filename, linenum, 'build/include_order', 4,
+            '%s. Should be: %s.h, c system, c++ system, other.' %
+            (order_problem, fileinfo.BaseName()))
+    canonical_include = include_state.CanonicalizeAlphabeticalOrder(include)
+    if not include_state.IsInAlphabeticalOrder(
+        clean_lines, linenum, canonical_include):
+      error(filename, linenum, 'build/include_alpha', 4,
+            'Include "%s" not in alphabetical order' % include)
+    include_state.SetLastHeader(canonical_include)
+
+  # Look for any of the stream classes that are part of standard C++.
+  # Many unit tests use cout, so test files are exempt.
+  if (Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include) and
+      not _IsTestFilename(filename)):
+    error(filename, linenum, 'readability/streams', 3,
+          'Streams are highly discouraged.')
+
+
+def _GetTextInside(text, start_pattern):
+  r"""Retrieves all the text between matching open and close parentheses.
+
+  Given a string of lines and a regular expression string, retrieve all the text
+  following the expression and between opening punctuation symbols like
+  (, [, or {, and the matching close-punctuation symbol. This properly handles
+  nested occurrences of the punctuations, so for the text like
+    printf(a(), b(c()));
+  a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'.
+  start_pattern must match a string ending with an opening punctuation symbol.
+
+  Args:
+    text: The lines to extract text. Its comments and strings must be elided.
+           It can be single line and can span multiple lines.
+    start_pattern: The regexp string indicating where to start extracting
+                   the text.
+  Returns:
+    The extracted text.
+    None if either the opening string or ending punctuation could not be found.
+  """
+  # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably
+  # rewritten to use _GetTextInside (and use inferior regexp matching today).
+
+  # Give opening punctuations to get the matching close-punctuations.
+  # values() (not the Python 2-only itervalues()) works on Python 2 and 3.
+  matching_punctuation = {'(': ')', '{': '}', '[': ']'}
+  closing_punctuation = set(matching_punctuation.values())
+
+  # Find the position to start extracting text.
+  match = re.search(start_pattern, text, re.M)
+  if not match:  # start_pattern not found in text.
+    return None
+  start_position = match.end(0)
+
+  assert start_position > 0, (
+      'start_pattern must ends with an opening punctuation.')
+  assert text[start_position - 1] in matching_punctuation, (
+      'start_pattern must ends with an opening punctuation.')
+  # Stack of closing punctuations we expect to have in text after position.
+  punctuation_stack = [matching_punctuation[text[start_position - 1]]]
+  position = start_position
+  while punctuation_stack and position < len(text):
+    if text[position] == punctuation_stack[-1]:
+      punctuation_stack.pop()
+    elif text[position] in closing_punctuation:
+      # A closing punctuation without matching opening punctuations.
+      return None
+    elif text[position] in matching_punctuation:
+      punctuation_stack.append(matching_punctuation[text[position]])
+    position += 1
+  if punctuation_stack:
+    # Opening punctuations left without matching close-punctuations.
+    return None
+  return text[start_position:position - 1]  # Exclude the closing punctuation.
+
+
+# Patterns for matching call-by-reference parameters.
+#
+# Supports nested templates up to 2 levels deep using this messy pattern:
+#   < (?: < (?: < [^<>]*
+#               >
+#           |   [^<>] )*
+#         >
+#     |   [^<>] )*
+#   >
+_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*'  # =~ [_[:alpha:]][_[:alnum:]]*
+_RE_PATTERN_TYPE = (
+    r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?'
+    r'(?:\w|'
+    r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|'
+    r'::)+')
+# A by-reference parameter: 'type& identifier [= default]' before ',' or ')'.
+_RE_PATTERN_REF_PARAM = re.compile(
+    r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*'
+    r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]')
+# A call-by-const-reference parameter either ends with 'const& identifier' or
+# looks like 'const type& identifier' (atomic 'type').  Kept as a pattern string.
+_RE_PATTERN_CONST_REF_PARAM = (
+    r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT +
+    r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')')
+
+
+def CheckLanguage(filename, clean_lines, linenum, file_extension,
+                  include_state, nesting_state, error):
+  """Checks rules from the 'C++ language rules' section of cppguide.html.
+
+  Some of these rules are hard to test (function overloading, using
+  uint32 inappropriately), but we do the best we can.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    file_extension: The extension (without the dot) of the filename.
+    include_state: An _IncludeState instance in which the headers are inserted.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+                   Not consulted by the checks in this function.
+    error: The function to call with any errors found.
+  """
+  # If the line is empty or consists of entirely a comment, no need to
+  # check it.
+  line = clean_lines.elided[linenum]
+  if not line:
+    return
+
+  match = _RE_PATTERN_INCLUDE.search(line)
+  if match:
+    CheckIncludeLine(filename, clean_lines, linenum, include_state, error)
+    return
+
+  # Reset include state across preprocessor directives.  This is meant
+  # to silence warnings for conditional includes.
+  if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line):
+    include_state.ResetSection()
+
+  # TODO(unknown): figure out if they're using default arguments in fn proto.
+
+  # Check to see if they're using an conversion function cast.
+  # I just try to capture the most common basic types, though there are more.
+  # Parameterless conversion functions, such as bool(), are allowed as they are
+  # probably a member operator declaration or default constructor.
+  match = Search(
+      r'(\bnew\s+)?\b'  # Grab 'new' operator, if it's there
+      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
+      r'(\([^)].*)', line)
+  if match:
+    matched_new = match.group(1)
+    matched_type = match.group(2)
+    matched_funcptr = match.group(3)
+
+    # gMock methods are defined using some variant of MOCK_METHODx(name, type)
+    # where type may be float(), int(string), etc.  Without context they are
+    # virtually indistinguishable from int(x) casts. Likewise, gMock's
+    # MockCallback takes a template parameter of the form return_type(arg_type),
+    # which looks much like the cast we're trying to detect.
+    #
+    # std::function<> wrapper has a similar problem.
+    #
+    # Return types for function pointers also look like casts if they
+    # don't have an extra space.
+    if (matched_new is None and  # If new operator, then this isn't a cast
+        not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
+             Search(r'\bMockCallback<.*>', line) or
+             Search(r'\bstd::function<.*>', line)) and
+        not (matched_funcptr and
+             Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
+                   matched_funcptr))):
+      # Try a bit harder to catch gmock lines: the only place where
+      # something looks like an old-style cast is where we declare the
+      # return type of the mocked method, and the only time when we
+      # are missing context is if MOCK_METHOD was split across
+      # multiple lines.  The missing MOCK_METHOD is usually one or two
+      # lines back, so scan back one or two lines.
+      #
+      # It's not possible for gmock macros to appear in the first 2
+      # lines, since the class head + section name takes up 2 lines.
+      if (linenum < 2 or
+          not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
+                     clean_lines.elided[linenum - 1]) or
+               Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
+                     clean_lines.elided[linenum - 2]))):
+        error(filename, linenum, 'readability/casting', 4,
+              'Using deprecated casting style.  '
+              'Use static_cast<%s>(...) instead' %
+              matched_type)
+
+  CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
+                  'static_cast',
+                  r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
+
+  # This doesn't catch all cases. Consider (const char * const)"hello".
+  #
+  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
+  # compile).
+  if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
+                     'const_cast', r'\((char\s?\*+\s?)\)\s*"', error):
+    pass
+  else:
+    # Check pointer casts for other than string constants
+    CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
+                    'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error)
+
+  # In addition, we look for people taking the address of a cast.  This
+  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
+  # point where you think.
+  match = Search(
+      r'(?:&\(([^)]+)\)[\w(])|'
+      r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line)
+  if match and match.group(1) != '*':
+    error(filename, linenum, 'runtime/casting', 4,
+          ('Are you taking an address of a cast?  '
+           'This is dangerous: could be a temp var.  '
+           'Take the address before doing the cast, rather than after'))
+
+  # Create an extended_line, the concatenation of the current and next lines,
+  # intended for checks on code that may span more than one line.
+  # NOTE(review): nothing below reads extended_line yet; candidate for removal.
+  if linenum + 1 < clean_lines.NumLines():
+    extended_line = line + clean_lines.elided[linenum + 1]
+  else:
+    extended_line = line
+
+  # Check for people declaring static/global STL strings at the top level.
+  # This is dangerous because the C++ language does not guarantee that
+  # globals with constructors are initialized before the first access.
+  match = Match(
+      r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)',
+      line)
+  # Make sure it's not a function.
+  # Function template specialization looks like: "string foo<Type>(...".
+  # Class template definitions look like: "string Foo<Type>::Method(...".
+  #
+  # Also ignore things that look like operators.  These are matched separately
+  # because operator names cross non-word boundaries.  If we change the pattern
+  # above, we would decrease the accuracy of matching identifiers.
+  if (match and
+      not Search(r'\boperator\W', line) and
+      not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))):
+    error(filename, linenum, 'runtime/string', 4,
+          'For a static/global string constant, use a C style string instead: '
+          '"%schar %s[]".' %
+          (match.group(1), match.group(2)))
+
+  if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line):
+    error(filename, linenum, 'runtime/init', 4,
+          'You seem to be initializing a member variable with itself.')
+
+  if file_extension == 'h':
+    # TODO(unknown): check that 1-arg constructors are explicit.
+    #                How to tell it's a constructor?
+    #                (handled in CheckForNonStandardConstructs for now)
+    # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS
+    #                (level 1 error)
+    pass
+
+  # Check if people are using the verboten C basic types.  The only exception
+  # we regularly allow is "unsigned short port" for port.
+  if Search(r'\bshort port\b', line):
+    if not Search(r'\bunsigned short port\b', line):
+      error(filename, linenum, 'runtime/int', 4,
+            'Use "unsigned short" for ports, not "short"')
+  else:
+    match = Search(r'\b(short|long(?! +double)|long long)\b', line)
+    if match:
+      error(filename, linenum, 'runtime/int', 4,
+            'Use int16/int64/etc, rather than the C type %s' % match.group(1))
+
+  # When snprintf is used, the second argument shouldn't be a literal.
+  match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
+  if match and match.group(2) != '0':
+    # If 2nd arg is zero, snprintf is used to calculate size.
+    error(filename, linenum, 'runtime/printf', 3,
+          'If you can, use sizeof(%s) instead of %s as the 2nd arg '
+          'to snprintf.' % (match.group(1), match.group(2)))
+
+  # Check if some verboten C functions are being used.
+  if Search(r'\bsprintf\b', line):
+    error(filename, linenum, 'runtime/printf', 5,
+          'Never use sprintf.  Use snprintf instead.')
+  match = Search(r'\b(strcpy|strcat)\b', line)
+  if match:
+    error(filename, linenum, 'runtime/printf', 4,
+          'Almost always, snprintf is better than %s' % match.group(1))
+
+  # Check if some verboten operator overloading is going on
+  # TODO(unknown): catch out-of-line unary operator&:
+  #   class X {};
+  #   int operator&(const X& x) { return 42; }  // unary operator&
+  # The trick is it's hard to tell apart from binary operator&:
+  #   class Y { int operator&(const Y& x) { return 23; } }; // binary operator&
+  if Search(r'\boperator\s*&\s*\(\s*\)', line):
+    error(filename, linenum, 'runtime/operator', 4,
+          'Unary operator& is dangerous.  Do not use it.')
+
+  # Check for suspicious usage of "if" like "} if (a == b) {".
+  if Search(r'\}\s*if\s*\(', line):
+    error(filename, linenum, 'readability/braces', 4,
+          'Did you mean "else if"? If not, start a new line for "if".')
+
+  # Check for potential format string bugs like printf(foo).
+  # We constrain the pattern not to pick things like DocidForPrintf(foo).
+  # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str())
+  # TODO(sugawarayu): Catch the following case. Need to change the calling
+  # convention of the whole function to process multiple line to handle it.
+  #   printf(
+  #       boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line);
+  printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(')
+  if printf_args:
+    match = Match(r'([\w.\->()]+)$', printf_args)
+    if match and match.group(1) != '__VA_ARGS__':
+      function_name = re.search(r'\b((?:string)?printf)\s*\(',
+                                line, re.I).group(1)
+      error(filename, linenum, 'runtime/printf', 4,
+            'Potential format string bug. Do %s("%%s", %s) instead.'
+            % (function_name, match.group(1)))
+
+  # Check for potential memset bugs like memset(buf, sizeof(buf), 0).
+  # A fill value that is entirely a literal ('' or an integer) is accepted;
+  # the pattern is anchored as a whole so "4 * n" no longer passes as "4".
+  match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line)
+  if match and not Match(r"^(?:''|-?[0-9]+|0[xX][0-9A-Fa-f]+)[uUlL]*\s*$",
+                         match.group(2)):
+    error(filename, linenum, 'runtime/memset', 4,
+          'Did you mean "memset(%s, 0, %s)"?'
+          % (match.group(1), match.group(2)))
+
+  if Search(r'\busing namespace\b', line):
+    error(filename, linenum, 'build/namespaces', 5,
+          'Do not use namespace using-directives.  '
+          'Use using-declarations instead.')
+
+  # Detect variable-length arrays.
+  match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line)
+  if (match and match.group(2) != 'return' and match.group(2) != 'delete' and
+      match.group(3).find(']') == -1):
+    # Split the size on spaces and arithmetic/shift operators.
+    # If any of the resulting tokens are not compile time constants then
+    # report the error.
+    tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>', match.group(3))
+    is_const = True
+    skip_next = False
+    for tok in tokens:
+      if skip_next:
+        skip_next = False
+        continue
+
+      if Search(r'sizeof\(.+\)', tok): continue
+      if Search(r'arraysize\(\w+\)', tok): continue
+
+      tok = tok.lstrip('(')
+      tok = tok.rstrip(')')
+      if not tok: continue
+      if Match(r'\d+', tok): continue
+      if Match(r'0[xX][0-9a-fA-F]+', tok): continue
+      if Match(r'k[A-Z0-9]\w*', tok): continue
+      if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue
+      if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue
+      # A catch all for tricky sizeof cases, including 'sizeof expression',
+      # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)'
+      # requires skipping the next token because we split on ' ' and '*'.
+      if tok.startswith('sizeof'):
+        skip_next = True
+        continue
+      is_const = False
+      break
+    if not is_const:
+      error(filename, linenum, 'runtime/arrays', 1,
+            'Do not use variable-length arrays.  Use an appropriately named '
+            "('k' followed by CamelCase) compile-time constant for the size.")
+
+  # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or
+  # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing
+  # in the class declaration.
+  match = Match(
+      (r'\s*'
+       r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))'
+       r'\(.*\);$'),
+      line)
+  if match and linenum + 1 < clean_lines.NumLines():
+    next_line = clean_lines.elided[linenum + 1]
+    # We allow some, but not all, declarations of variables to be present
+    # in the statement that defines the class.  The [\w\*,\s]* fragment of
+    # the regular expression below allows users to declare instances of
+    # the class or pointers to instances, but not less common types such
+    # as function pointers or arrays.  It's a tradeoff between allowing
+    # reasonable code and avoiding trying to parse more C++ using regexps.
+    if not Search(r'^\s*}[\w\*,\s]*;', next_line):
+      error(filename, linenum, 'readability/constructors', 3,
+            match.group(1) + ' should be the last thing in the class')
+
+  # Check for use of unnamed namespaces in header files.  Registration
+  # macros are typically OK, so we allow use of "namespace {" on lines
+  # that end with backslashes.
+  if (file_extension == 'h'
+      and Search(r'\bnamespace\s*{', line)
+      and line[-1] != '\\'):
+    error(filename, linenum, 'build/namespaces', 4,
+          'Do not use unnamed namespaces in header files.  See '
+          'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
+          ' for more information.')
+
+def CheckForNonConstReference(filename, clean_lines, linenum,
+                              nesting_state, error):
+  """Check for non-const references.
+
+  Separate from CheckLanguage since it scans backwards from current
+  line, instead of scanning forward.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  # Do nothing if there is no '&' on current line.
+  line = clean_lines.elided[linenum]
+  if '&' not in line:
+    return
+
+  # Long type names may be broken across multiple lines, usually in one
+  # of these forms:
+  #   LongType
+  #       ::LongTypeContinued &identifier
+  #   LongType::
+  #       LongTypeContinued &identifier
+  #   LongType<
+  #       ...>::LongTypeContinued &identifier
+  #
+  # If we detected a type split across two lines, join the previous
+  # line to current line so that we can match const references
+  # accordingly.
+  #
+  # Note that this only scans back one line, since scanning back
+  # arbitrary number of lines would be expensive.  If you have a type
+  # that spans more than 2 lines, please use a typedef.
+  if linenum > 1:
+    previous = None
+    if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line):
+      # previous_line\n + ::current_line
+      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$',
+                        clean_lines.elided[linenum - 1])
+    elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line):
+      # previous_line::\n + current_line
+      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$',
+                        clean_lines.elided[linenum - 1])
+    if previous:
+      line = previous.group(1) + line.lstrip()
+    else:
+      # Check for templated parameter that is split across multiple lines
+      endpos = line.rfind('>')
+      if endpos > -1:
+        (_, startline, startpos) = ReverseCloseExpression(
+            clean_lines, linenum, endpos)
+        if startpos > -1 and startline < linenum:
+          # Found the matching < on an earlier line, collect all
+          # pieces up to current line.
+          line = ''
+          for i in xrange(startline, linenum + 1):
+            line += clean_lines.elided[i].strip()
+
+  # Check for non-const references in function parameters.  A single '&' may
+  # be found in the following places:
+  #   inside expression: binary & for bitwise AND
+  #   inside expression: unary & for taking the address of something
+  #   inside declarators: reference parameter
+  # We will exclude the first two cases by checking that we are not inside a
+  # function body, including one that was just introduced by a trailing '{'.
+  # TODO(unknown): Doesn't account for preprocessor directives.
+  # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
+  check_params = False
+  if not nesting_state.stack:
+    check_params = True  # top level
+  elif (isinstance(nesting_state.stack[-1], _ClassInfo) or
+        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
+    check_params = True  # within class or namespace
+  elif Match(r'.*{\s*$', line):
+    if (len(nesting_state.stack) == 1 or
+        isinstance(nesting_state.stack[-2], _ClassInfo) or
+        isinstance(nesting_state.stack[-2], _NamespaceInfo)):
+      check_params = True  # just opened global/class/namespace block
+  # We allow non-const references in a few standard places, like functions
+  # called "swap()" (optionally with an explicit, possibly qualified, template
+  # argument such as "swap<ns::T>(") or iostream operators like "<<" or ">>".
+  # Do not check those function parameters.
+  # We also accept & in static_assert, which looks like a function but
+  # it's actually a declaration expression.
+  whitelisted_functions = (r'(?:[sS]wap(?:<[\w:]+>)?|'
+                           r'operator\s*[<>][<>]|'
+                           r'static_assert|COMPILE_ASSERT'
+                           r')\s*\(')
+  if Search(whitelisted_functions, line):
+    check_params = False
+  elif not Search(r'\S+\([^)]*$', line):
+    # Don't see a whitelisted function on this line.  Actually we
+    # didn't see any function name on this line, so this is likely a
+    # multi-line parameter list.  Try a bit harder to catch this case.
+    for i in xrange(2):
+      if (linenum > i and
+          Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])):
+        check_params = False
+        break
+
+  if check_params:
+    decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
+    for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
+      if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter):
+        error(filename, linenum, 'runtime/references', 2,
+              'Is this a non-const reference? '
+              'If so, make const or use a pointer: ' +
+              ReplaceAll(' *<', '<', parameter))
+
+
+def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
+                    error):
+  """Checks for a C-style cast by looking for the pattern.
+
+  Args:
+    filename: The name of the current file.
+    linenum: The number of the line to check.
+    line: The line of code to check.
+    raw_line: The raw line of code to check, with comments.
+    cast_type: The string for the C++ cast to recommend.  This is either
+      reinterpret_cast, static_cast, or const_cast, depending.
+    pattern: The regular expression used to find C-style casts.
+    error: The function to call with any errors found.
+
+  Returns:
+    True if an error was emitted.
+    False otherwise.
+  """
+  match = Search(pattern, line)
+  if not match:
+    return False
+
+  # e.g., sizeof(int)
+  sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1])
+  if sizeof_match:
+    error(filename, linenum, 'runtime/sizeof', 1,
+          'Using sizeof(type).  Use sizeof(varname) instead if possible')
+    return True
+
+  # operator++(int) and operator--(int)
+  if (line[0:match.start(1) - 1].endswith(' operator++') or
+      line[0:match.start(1) - 1].endswith(' operator--')):
+    return False
+
+  # A single unnamed argument for a function tends to look like old
+  # style cast.  If we see those, don't issue warnings for deprecated
+  # casts, instead issue warnings for unnamed arguments where
+  # appropriate.
+  #
+  # These are things that we want warnings for, since the style guide
+  # explicitly require all parameters to be named:
+  #   Function(int);
+  #   Function(int) {
+  #   ConstMember(int) const;
+  #   ConstMember(int) const {
+  #   ExceptionMember(int) throw (...);
+  #   ExceptionMember(int) throw (...) {
+  #   PureVirtual(int) = 0;
+  #
+  # These are functions of some sort, where the compiler would be fine
+  # if they had named parameters, but people often omit those
+  # identifiers to reduce clutter:
+  #   (FunctionPointer)(int);
+  #   (FunctionPointer)(int) = value;
+  #   Function((function_pointer_arg)(int))
+  #   <TemplateArgument(int)>;
+  #   <(FunctionPointerTemplateArgument)(int)>;
+  remainder = line[match.end(0):]
+  if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder):
+    # Looks like an unnamed parameter.
+
+    # Don't warn on any kind of template arguments.
+    if Match(r'^\s*>', remainder):
+      return False
+
+    # Don't warn on assignments to function pointers (with any amount of
+    # whitespace before "="), but keep warnings for unnamed parameters to pure
+    # virtual functions.  Note that assignments of "0" to function pointers are
+    # not exempted either; the preferred values are "nullptr" or "NULL".
+    matched_zero = Match(r'^\s*=\s*(\S+)\s*;', remainder)
+    if matched_zero and matched_zero.group(1) != '0':
+      return False
+
+    # Don't warn on function pointer declarations.  For this we need
+    # to check what came before the "(type)" string.
+    if Match(r'.*\)\s*$', line[0:match.start(0)]):
+      return False
+
+    # Don't warn if the parameter is named with block comments, e.g.:
+    #  Function(int /*unused_param*/);
+    if '/*' in raw_line:
+      return False
+
+    # Passed all filters, issue warning here.
+    error(filename, linenum, 'readability/function', 3,
+          'All parameters should be named in a function')
+    return True
+
+  # At this point, all that should be left is actual casts.
+  error(filename, linenum, 'readability/casting', 4,
+        'Using C-style cast.  Use %s<%s>(...) instead' %
+        (cast_type, match.group(1)))
+
+  return True
+
+
+_HEADERS_CONTAINING_TEMPLATES = (  # (header, template names) pairs
+    ('<deque>', ('deque',)),
+    ('<functional>', ('unary_function', 'binary_function',
+                      'plus', 'minus', 'multiplies', 'divides', 'modulus',
+                      'negate',
+                      'equal_to', 'not_equal_to', 'greater', 'less',
+                      'greater_equal', 'less_equal',
+                      'logical_and', 'logical_or', 'logical_not',
+                      'unary_negate', 'not1', 'binary_negate', 'not2',
+                      'bind1st', 'bind2nd',
+                      'pointer_to_unary_function',
+                      'pointer_to_binary_function',
+                      'ptr_fun',
+                      'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t',
+                      'mem_fun_ref_t',
+                      'const_mem_fun_t', 'const_mem_fun1_t',
+                      'const_mem_fun_ref_t', 'const_mem_fun1_ref_t',
+                      'mem_fun_ref',
+                     )),
+    ('<limits>', ('numeric_limits',)),
+    ('<list>', ('list',)),
+    ('<map>', ('map', 'multimap',)),
+    ('<memory>', ('allocator',)),
+    ('<queue>', ('queue', 'priority_queue',)),
+    ('<set>', ('set', 'multiset',)),
+    ('<stack>', ('stack',)),
+    ('<string>', ('char_traits', 'basic_string',)),
+    ('<utility>', ('pair',)),
+    ('<vector>', ('vector',)),
+
+    # gcc extensions.
+    # Note: std::hash is their hash, ::hash is our hash
+    ('<hash_map>', ('hash_map', 'hash_multimap',)),
+    ('<hash_set>', ('hash_set', 'hash_multiset',)),
+    ('<slist>', ('slist',)),
+    )
+
+_RE_PATTERN_STRING = re.compile(r'\bstring\b')  # The bare word "string".
+
+_re_pattern_algorithm_header = []  # (regex, template name, '<algorithm>')
+for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
+                  'transform'):
+  # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+  # type::max().
+  _re_pattern_algorithm_header.append(
+      (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
+       _template,
+       '<algorithm>'))
+
+_re_pattern_templates = []  # (regex, 'name<>', header) for each table entry
+for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
+  for _template in _templates:
+    _re_pattern_templates.append(
+        (re.compile(r'(\<|\b)' + _template + r'\s*\<'),
+         _template + '<>',
+         _header))
+
+
+def FilesBelongToSameModule(filename_cc, filename_h):
+  """Decide whether a .cc file and a header are parts of one module.
+
+  Two files are treated as one 'module' when, after normalization, the
+  source path ends with the header path:
+    - the '.cc' and '.h' extensions are dropped;
+    - a trailing '_unittest' or '_test' is dropped from the source name;
+    - a trailing '-inl' is dropped from the header name;
+    - '/public/' and '/internal/' path components collapse to '/'.
+  So foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc in the same
+  directory match, as do some/path/public/xyzzy and
+  some/path/internal/xyzzy.
+
+  When filename_cc carries a longer path than filename_h (for example
+  '/absolute/path/to/base/sysinfo.cc' including 'base/sysinfo.h'), the
+  leading part of the source path that the header lacks is returned as
+  well.  Callers use that prefix to open the header more robustly, since
+  the real include paths are not known in this context.
+
+  Known bugs: the comparison is a plain suffix test, so tools/base/bar.cc
+  and base/bar.h are reported as the same module.  This yields some
+  false positives, which should be sufficiently rare in practice.
+
+  Args:
+    filename_cc: is the path for the .cc file
+    filename_h: is the path for the header path
+
+  Returns:
+    Tuple with a bool and a string:
+    bool: True if filename_cc and filename_h belong to the same module.
+    string: the additional prefix needed to open the header file.
+  """
+  # Anything that is not a .cc/.h pair cannot form a module.
+  if not (filename_cc.endswith('.cc') and filename_h.endswith('.h')):
+    return (False, '')
+
+  source_stem = filename_cc[:-len('.cc')]
+  for test_suffix in ('_unittest', '_test'):
+    if source_stem.endswith(test_suffix):
+      source_stem = source_stem[:-len(test_suffix)]
+      break  # At most one test suffix is removed.
+
+  header_stem = filename_h[:-len('.h')]
+  if header_stem.endswith('-inl'):
+    header_stem = header_stem[:-len('-inl')]
+
+  for visibility_dir in ('/public/', '/internal/'):
+    source_stem = source_stem.replace(visibility_dir, '/')
+    header_stem = header_stem.replace(visibility_dir, '/')
+
+  if not source_stem.endswith(header_stem):
+    return False, ''
+  return True, source_stem[:-len(header_stem)]
+
+
+def UpdateIncludeState(filename, include_state, io=codecs):
+  """Fill up the include_state with new includes found from the file.
+
+  Args:
+    filename: the name of the header to read.
+    include_state: an _IncludeState instance in which the headers are inserted.
+    io: The io factory to use to read the file. Provided for testability.
+
+  Returns:
+    True if a header was successfully added. False otherwise.
+  """
+  try:
+    headerfile = io.open(filename, 'r', 'utf8', 'replace')
+  except IOError:
+    return False
+  try:
+    linenum = 0
+    for line in headerfile:
+      linenum += 1
+      match = _RE_PATTERN_INCLUDE.search(CleanseComments(line))
+      if match:
+        # Only the key's presence matters here; the value is informational.
+        include_state.setdefault(match.group(2), '%s:%d' % (filename, linenum))
+  finally:
+    # `io` is injectable for tests; tolerate stand-ins that lack close().
+    getattr(headerfile, 'close', lambda: None)()
+  return True
+
+
+def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
+                              io=codecs):
+  """Reports for missing stl includes.
+
+  This function will output warnings to make sure you are including the headers
+  necessary for the stl containers and functions that you use. We only give one
+  reason to include a header. For example, if you use both equal_to<> and
+  less<> in a .h file, only one (the latter in the file) of these will be
+  reported as a reason to include the <functional>.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    include_state: An _IncludeState instance.
+    error: The function to call with any errors found.
+    io: The IO factory to use to read the header file. Provided for unittest
+        injection.
+  """
+  required = {}  # A map of header name to linenumber and the template entity.
+                 # Example of required: { '<functional>': (1219, 'less<>') }
+
+  for linenum in xrange(clean_lines.NumLines()):
+    line = clean_lines.elided[linenum]
+    if not line or line[0] == '#':
+      continue
+
+    # String is special -- it is a non-templatized type in STL.
+    matched = _RE_PATTERN_STRING.search(line)
+    if matched:
+      # Don't warn about strings in non-STL namespaces:
+      # (We check only the first match per line; good enough.)
+      prefix = line[:matched.start()]
+      if prefix.endswith('std::') or not prefix.endswith('::'):
+        required['<string>'] = (linenum, 'string')
+
+    for pattern, template, header in _re_pattern_algorithm_header:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+    # The following check is just a speed up, no semantics are changed.
+    if not '<' in line:  # Reduces the cpu time usage by skipping lines.
+      continue
+
+    for pattern, template, header in _re_pattern_templates:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+  # The policy is that if you #include something in foo.h you don't need to
+  # include it again in foo.cc. Here, we will look at possible includes.
+  # Let's copy the include_state so it is only messed up within this function.
+  include_state = include_state.copy()
+
+  # Did we find the header for this file (if any) and successfully load it?
+  header_found = False
+
+  # Use the absolute path so that matching works properly.
+  abs_filename = FileInfo(filename).FullName()
+
+  # For Emacs's flymake.
+  # If cpplint is invoked from Emacs's flymake, a temporary file is generated
+  # by flymake and that file name might end with '_flymake.cc'. In that case,
+  # restore original file name here so that the corresponding header file can be
+  # found.
+  # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h'
+  # instead of 'foo_flymake.h'
+  abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
+
+  # include_state is modified during iteration, so we iterate over a copy of
+  # the keys.
+  header_keys = include_state.keys()
+  for header in header_keys:
+    (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
+    fullpath = common_path + header
+    if same_module and UpdateIncludeState(fullpath, include_state, io):
+      header_found = True
+
+  # If we can't find the header file for a .cc, assume it's because we don't
+  # know where to look. In that case we'll give up as we're not sure they
+  # didn't include it in the .h file.
+  # TODO(unknown): Do a better job of finding .h files so we are confident that
+  # not having the .h file means there isn't one.
+  if filename.endswith('.cc') and not header_found:
+    return
+
+  # All the lines have been processed, report the errors found.
+  for required_header_unstripped in required:
+    template = required[required_header_unstripped][1]
+    if required_header_unstripped.strip('<>"') not in include_state:
+      error(filename, required[required_header_unstripped][0],
+            'build/include_what_you_use', 4,
+            'Add #include ' + required_header_unstripped + ' for ' + template)
+
+
+_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<')
+
+
+def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
+  """Check that make_pair's template arguments are deduced.
+
+  G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are
+  specified explicitly, and such use isn't intended in any case.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+  match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line)
+  if match:
+    error(filename, linenum, 'build/explicit_make_pair',
+          4,  # 4 = high confidence
+          'For C++11-compatibility, omit template arguments from make_pair'
+          ' OR use pair directly OR if appropriate, construct a pair directly')
+
+
+def ProcessLine(filename, file_extension, clean_lines, line,
+                include_state, function_state, nesting_state, error,
+                extra_check_functions=[]):
+  """Processes a single line in the file.
+
+  Args:
+    filename: Filename of the file that is being processed.
+    file_extension: The extension (dot not included) of the file.
+    clean_lines: An array of strings, each representing a line of the file,
+                 with comments stripped.
+    line: Number of line being processed.
+    include_state: An _IncludeState instance in which the headers are inserted.
+    function_state: A _FunctionState instance which counts function lines, etc.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: A callable to which errors are reported, which takes 5 arguments:
+           filename, line number, category, confidence level, and message
+    extra_check_functions: An array of additional check functions that will be
+                           run on each source line. Each function takes 4
+                           arguments: filename, clean_lines, line, error
+  """
+  raw_lines = clean_lines.raw_lines
+  ParseNolintSuppressions(filename, raw_lines[line], line, error)
+  nesting_state.Update(filename, clean_lines, line, error)
+  if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM:
+    return
+  CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
+  CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
+  CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error)
+  CheckLanguage(filename, clean_lines, line, file_extension, include_state,
+                nesting_state, error)
+  CheckForNonConstReference(filename, clean_lines, line, nesting_state, error)
+  CheckForNonStandardConstructs(filename, clean_lines, line,
+                                nesting_state, error)
+  CheckVlogArguments(filename, clean_lines, line, error)
+  CheckPosixThreading(filename, clean_lines, line, error)
+  CheckInvalidIncrement(filename, clean_lines, line, error)
+  CheckMakePairUsesDeduction(filename, clean_lines, line, error)
+  for check_fn in extra_check_functions:
+    check_fn(filename, clean_lines, line, error)
+
+def ProcessFileData(filename, file_extension, lines, error,
+                    extra_check_functions=[]):
+  """Performs lint checks and reports any errors to the given error function.
+
+  Args:
+    filename: Filename of the file that is being processed.
+    file_extension: The extension (dot not included) of the file.
+    lines: An array of strings, each representing a line of the file, with the
+           last element being empty if the file is terminated with a newline.
+    error: A callable to which errors are reported, which takes 5 arguments:
+           filename, line number, category, confidence level, and message
+    extra_check_functions: An array of additional check functions that will be
+                           run on each source line. Each function takes 4
+                           arguments: filename, clean_lines, line, error
+  """
+  lines = (['// marker so line numbers and indices both start at 1'] + lines +
+           ['// marker so line numbers end in a known way'])
+
+  include_state = _IncludeState()
+  function_state = _FunctionState()
+  nesting_state = _NestingState()
+
+  ResetNolintSuppressions()
+
+  CheckForCopyright(filename, lines, error)
+
+  if file_extension == 'h':
+    CheckForHeaderGuard(filename, lines, error)
+
+  RemoveMultiLineComments(filename, lines, error)
+  clean_lines = CleansedLines(lines)
+  for line in xrange(clean_lines.NumLines()):
+    ProcessLine(filename, file_extension, clean_lines, line,
+                include_state, function_state, nesting_state, error,
+                extra_check_functions)
+  nesting_state.CheckCompletedBlocks(filename, error)
+
+  CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
+
+  # We check here rather than inside ProcessLine so that we see raw
+  # lines rather than "cleaned" lines.
+  CheckForBadCharacters(filename, lines, error)
+
+  CheckForNewlineAtEOF(filename, lines, error)
+
+def ProcessFile(filename, vlevel, extra_check_functions=[]):
+  """Does google-lint on a single file.
+
+  Args:
+    filename: The name of the file to parse.
+
+    vlevel: The level of errors to report.  Every error of confidence
+    >= verbose_level will be reported.  0 is a good default.
+
+    extra_check_functions: An array of additional check functions that will be
+                           run on each source line. Each function takes 4
+                           arguments: filename, clean_lines, line, error
+  """
+
+  _SetVerboseLevel(vlevel)
+
+  try:
+    # Support the UNIX convention of using "-" for stdin.  Note that
+    # we are not opening the file with universal newline support
+    # (which codecs doesn't support anyway), so the resulting lines do
+    # contain trailing '\r' characters if we are reading a file that
+    # has CRLF endings.
+    # If after the split a trailing '\r' is present, it is removed
+    # below. If it is not expected to be present (i.e. os.linesep !=
+    # '\r\n' as in Windows), a warning is issued below if this file
+    # is processed.
+
+    if filename == '-':
+      lines = codecs.StreamReaderWriter(sys.stdin,
+                                        codecs.getreader('utf8'),
+                                        codecs.getwriter('utf8'),
+                                        'replace').read().split('\n')
+    else:
+      lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n')
+
+    carriage_return_found = False
+    # Remove trailing '\r'.
+    for linenum in range(len(lines)):
+      if lines[linenum].endswith('\r'):
+        lines[linenum] = lines[linenum].rstrip('\r')
+        carriage_return_found = True
+
+  except IOError:
+    sys.stderr.write(
+        "Skipping input '%s': Can't open for reading\n" % filename)
+    return
+
+  # Note, if no dot is found, this will give the entire filename as the ext.
+  file_extension = filename[filename.rfind('.') + 1:]
+
+  # When reading from stdin, the extension is unknown, so no cpplint tests
+  # should rely on the extension.
+  if filename != '-' and file_extension not in _valid_extensions:
+    sys.stderr.write('Ignoring %s; not a valid file name '
+                     '(%s)\n' % (filename, ', '.join(_valid_extensions)))
+  else:
+    ProcessFileData(filename, file_extension, lines, Error,
+                    extra_check_functions)
+    if carriage_return_found and os.linesep != '\r\n':
+      # Use 0 for linenum since outputting only one error for potentially
+      # several lines.
+      Error(filename, 0, 'whitespace/newline', 1,
+            'One or more unexpected \\r (^M) found;'
+            'better to use only a \\n')
+
+  sys.stderr.write('Done processing %s\n' % filename)
+
+
+def PrintUsage(message):
+  """Writes the usage text to stderr and exits; never returns.
+
+  Args:
+    message: Optional error text; when truthy it is reported as a fatal error.
+  """
+  sys.stderr.write(_USAGE)
+  if message:
+    sys.exit('\nFATAL ERROR: ' + message)
+  else:
+    sys.exit(1)
+
+
+def PrintCategories():
+  """Writes every entry of _ERROR_CATEGORIES to stderr, then exits with 0.
+
+  These are the categories used to filter messages via --filter.
+  """
+  sys.stderr.write(''.join('  %s\n' % cat for cat in _ERROR_CATEGORIES))
+  sys.exit(0)
+
+
+def ParseArguments(args):
+  """Parses the command line arguments.
+
+  This may set the output format and verbosity level as side-effects.
+
+  Args:
+    args: The command line arguments:
+
+  Returns:
+    The list of filenames to lint.
+  """
+  try:
+    (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=',
+                                                 'counting=',
+                                                 'filter=',
+                                                 'root=',
+                                                 'linelength=',
+                                                 'extensions='])
+  except getopt.GetoptError:
+    PrintUsage('Invalid arguments.')
+
+  verbosity = _VerboseLevel()
+  output_format = _OutputFormat()
+  filters = ''
+  counting_style = ''
+
+  for (opt, val) in opts:
+    if opt == '--help':
+      PrintUsage(None)
+    elif opt == '--output':
+      if val not in ('emacs', 'vs7', 'eclipse'):
+        PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.')
+      output_format = val
+    elif opt == '--verbose':
+      verbosity = int(val)
+    elif opt == '--filter':
+      filters = val
+      if not filters:
+        PrintCategories()
+    elif opt == '--counting':
+      if val not in ('total', 'toplevel', 'detailed'):
+        PrintUsage('Valid counting options are total, toplevel, and detailed')
+      counting_style = val
+    elif opt == '--root':
+      global _root
+      _root = val
+    elif opt == '--linelength':
+      global _line_length
+      try:
+          _line_length = int(val)
+      except ValueError:
+          PrintUsage('Line length must be digits.')
+    elif opt == '--extensions':
+      global _valid_extensions
+      try:
+          _valid_extensions = set(val.split(','))
+      except ValueError:
+          PrintUsage('Extensions must be comma seperated list.')
+
+  if not filenames:
+    PrintUsage('No files were specified.')
+
+  _SetOutputFormat(output_format)
+  _SetVerboseLevel(verbosity)
+  _SetFilters(filters)
+  _SetCountingStyle(counting_style)
+
+  return filenames
+
+
+def main():
+  filenames = ParseArguments(sys.argv[1:])
+
+  # Change stderr to write with replacement characters so we don't die
+  # if we try to print something containing non-ASCII characters.
+  sys.stderr = codecs.StreamReaderWriter(sys.stderr,
+                                         codecs.getreader('utf8'),
+                                         codecs.getwriter('utf8'),
+                                         'replace')
+
+  _cpplint_state.ResetErrorCounts()
+  for filename in filenames:
+    ProcessFile(filename, _cpplint_state.verbose_level)
+  _cpplint_state.PrintErrorCounts()
+
+  sys.exit(_cpplint_state.error_count > 0)
+
+
+if __name__ == '__main__':
+  main()
diff --git a/libs/libvpx/tools/diff.py b/libs/libvpx/tools/diff.py
new file mode 100644
index 0000000000..a96c7db851
--- /dev/null
+++ b/libs/libvpx/tools/diff.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+"""Classes for representing diff pieces."""
+
+__author__ = "jkoleszar@google.com"
+
+import re
+
+
+class DiffLines(object):
+    """A container for one half of a diff."""
+
+    def __init__(self, filename, offset, length):
+        self.filename = filename
+        self.offset = offset
+        self.length = length
+        self.lines = []
+        self.delta_line_nums = []
+
+    def Append(self, line):
+        l = len(self.lines)
+        if line[0] != " ":
+            self.delta_line_nums.append(self.offset + l)
+        self.lines.append(line[1:])
+        assert l+1 <= self.length
+
+    def Complete(self):
+        return len(self.lines) == self.length
+
+    def __contains__(self, item):
+        return item >= self.offset and item <= self.offset + self.length - 1
+
+
+class DiffHunk(object):
+    """A container for one diff hunk, consisting of two DiffLines."""
+
+    def __init__(self, header, file_a, file_b, start_a, len_a, start_b, len_b):
+        self.header = header
+        self.left = DiffLines(file_a, start_a, len_a)
+        self.right = DiffLines(file_b, start_b, len_b)
+        self.lines = []
+
+    def Append(self, line):
+        """Adds a line to the DiffHunk and its DiffLines children."""
+        if line[0] == "-":
+            self.left.Append(line)
+        elif line[0] == "+":
+            self.right.Append(line)
+        elif line[0] == " ":
+            self.left.Append(line)
+            self.right.Append(line)
+        elif line[0] == "\\":
+            # Ignore newline messages from git diff.
+            pass
+        else:
+            assert False, ("Unrecognized character at start of diff line "
+                           "%r" % line[0])
+        self.lines.append(line)
+
+    def Complete(self):
+        return self.left.Complete() and self.right.Complete()
+
+    def __repr__(self):
+        return "DiffHunk(%s, %s, len %d)" % (
+            self.left.filename, self.right.filename,
+            max(self.left.length, self.right.length))
+
+
+def ParseDiffHunks(stream):
+    """Walk a file-like object, yielding DiffHunks as they're parsed."""
+
+    file_regex = re.compile(r"(\+\+\+|---) (\S+)")
+    range_regex = re.compile(r"@@ -(\d+)(,(\d+))? \+(\d+)(,(\d+))?")
+    hunk = None
+    while True:
+        line = stream.readline()
+        if not line:
+            break
+
+        if hunk is None:
+            # Parse file names
+            diff_file = file_regex.match(line)
+            if diff_file:
+              if line.startswith("---"):
+                  a_line = line
+                  a = diff_file.group(2)
+                  continue
+              if line.startswith("+++"):
+                  b_line = line
+                  b = diff_file.group(2)
+                  continue
+
+            # Parse offset/lengths
+            diffrange = range_regex.match(line)
+            if diffrange:
+                if diffrange.group(2):
+                    start_a = int(diffrange.group(1))
+                    len_a = int(diffrange.group(3))
+                else:
+                    start_a = 1
+                    len_a = int(diffrange.group(1))
+
+                if diffrange.group(5):
+                    start_b = int(diffrange.group(4))
+                    len_b = int(diffrange.group(6))
+                else:
+                    start_b = 1
+                    len_b = int(diffrange.group(4))
+
+                header = [a_line, b_line, line]
+                hunk = DiffHunk(header, a, b, start_a, len_a, start_b, len_b)
+        else:
+            # Add the current line to the hunk
+            hunk.Append(line)
+
+            # See if the whole hunk has been parsed. If so, yield it and prepare
+            # for the next hunk.
+            if hunk.Complete():
+                yield hunk
+                hunk = None
+
+    # Partial hunks are a parse error
+    assert hunk is None
diff --git a/libs/libvpx/tools/ftfy.sh b/libs/libvpx/tools/ftfy.sh
new file mode 100755
index 0000000000..29ae95e9ba
--- /dev/null
+++ b/libs/libvpx/tools/ftfy.sh
@@ -0,0 +1,159 @@
+#!/bin/sh
+self="$0"
+dirname_self=$(CDPATH= cd -- "$(dirname -- "$self")" && pwd)  # absolute: we cd later
+# Print the help text to stderr, remove the temp files and exit with status 1.
+usage() {
+  cat <<EOF >&2
+Usage: $self [option]
+
+This script applies a whitespace transformation to the commit at HEAD. If no
+options are given, then the modified files are left in the working tree.
+
+Options:
+  -h, --help     Shows this message
+  -n, --dry-run  Shows a diff of the changes to be made.
+  --amend        Squashes the changes into the commit at HEAD
+                     This option will also reformat the commit message.
+  --commit       Creates a new commit containing only the whitespace changes
+  --msg-only     Reformat the commit message only, ignore the patch itself.
+
+EOF
+  rm -f ${CLEAN_FILES}
+  exit 1
+}
+
+# Write a message to stderr, prefixed with this script's base name.
+log() {
+  echo "${self##*/}: $*" >&2
+}
+
+# Run vpx-astyle.sh on every argument whose name ends in .h, .c or .cc.
+vpx_style() {
+  for f; do
+    case "$f" in
+      *.h|*.c|*.cc)
+        "${dirname_self}"/vpx-astyle.sh "$f"
+        ;;
+    esac
+  done
+}
+
+# Apply patch file $1 with -p1, but only when INTERSECT_RESULT is non-zero.
+apply() {
+  [ $INTERSECT_RESULT -ne 0 ] && patch -p1 < "$1"
+}
+
+# Commit the working-tree changes as a new commit with a derived Change-Id.
+commit() {
+  LAST_CHANGEID=$(git show | awk '/Change-Id:/{print $2}')
+  if [ -z "$LAST_CHANGEID" ]; then
+    log "HEAD doesn't have a Change-Id, unable to generate a new commit"
+    exit 1
+  fi
+
+  # Build a deterministic Change-Id from the parent's
+  NEW_CHANGEID=${LAST_CHANGEID}-styled
+  NEW_CHANGEID=I$(echo $NEW_CHANGEID | git hash-object --stdin)
+
+  # Commit, preserving authorship from the parent commit.
+  git commit -a -C HEAD > /dev/null
+  git commit --amend -F- << EOF
+Cosmetic: Fix whitespace in change $(printf '%.9s' "$LAST_CHANGEID")
+
+Change-Id: ${NEW_CHANGEID}
+EOF
+}
+
+# If diff_msg found a difference, log the commit-message diff (headers cut).
+show_commit_msg_diff() {
+  if [ $DIFF_MSG_RESULT -ne 0 ]; then
+    log "Modified commit message:"
+    diff -u "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" | tail -n +3
+  fi
+}
+
+# Amend HEAD with the new message if the message or the patch changed.
+amend() {
+  show_commit_msg_diff
+  if [ $DIFF_MSG_RESULT -ne 0 ] || [ $INTERSECT_RESULT -ne 0 ]; then
+    git commit -a --amend -F "$NEW_COMMIT_MSG"
+  fi
+}
+
+# Re-wrap HEAD's commit message; DIFF_MSG_RESULT is non-zero if it changed.
+diff_msg() {
+  git log -1 --format=%B > "$ORIG_COMMIT_MSG"
+  "${dirname_self}"/wrap-commit-msg.py \
+      < "$ORIG_COMMIT_MSG" > "$NEW_COMMIT_MSG"
+  cmp -s "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG"
+  DIFF_MSG_RESULT=$?
+}
+
+
+# Temporary files; the names carry the shell PID ($$) to avoid clashes
+ORIG_DIFF=orig.diff.$$
+MODIFIED_DIFF=modified.diff.$$
+FINAL_DIFF=final.diff.$$
+ORIG_COMMIT_MSG=orig.commit-msg.$$
+NEW_COMMIT_MSG=new.commit-msg.$$
+CLEAN_FILES="${ORIG_DIFF} ${MODIFIED_DIFF} ${FINAL_DIFF}"
+CLEAN_FILES="${CLEAN_FILES} ${ORIG_COMMIT_MSG} ${NEW_COMMIT_MSG}"
+
+# Preconditions: at most one argument is accepted
+[ $# -lt 2 ] || usage
+
+# Check that astyle supports pad-header and align-pointer=name
+if ! astyle --pad-header --align-pointer=name < /dev/null; then
+  log "Install astyle v1.24 or newer"
+  exit 1
+fi
+
+if ! git diff --quiet HEAD; then
+  log "Working tree is dirty, commit your changes first"
+  exit 1
+fi
+
+# Need to be in the root
+cd "$(git rev-parse --show-toplevel)"
+
+# Collect the original diff
+git show > "${ORIG_DIFF}"
+
+# Apply the style guide on new and modified files and collect its diff
+for f in $(git diff HEAD^ --name-only -M90 --diff-filter=AM); do
+  case "$f" in
+    third_party/*) continue;;
+  esac
+  vpx_style "$f"
+done
+git diff --no-color --no-ext-diff > "${MODIFIED_DIFF}"
+
+# Intersect the two diffs (exit status 1 means style hunks were printed)
+"${dirname_self}"/intersect-diffs.py \
+    "${ORIG_DIFF}" "${MODIFIED_DIFF}" > "${FINAL_DIFF}"
+INTERSECT_RESULT=$?
+git reset --hard >/dev/null
+
+# Fixup the commit message
+diff_msg
+
+# Handle options
+if [ -n "$1" ]; then
+  case "$1" in
+    -h|--help) usage;;
+    -n|--dry-run) cat "${FINAL_DIFF}"; show_commit_msg_diff;;
+    --commit) apply "${FINAL_DIFF}"; commit;;
+    --amend) apply "${FINAL_DIFF}"; amend;;
+    --msg-only) amend;;
+    *) usage;;
+  esac
+else
+  apply "${FINAL_DIFF}"
+  if ! git diff --quiet; then
+    log "Formatting changes applied, verify and commit."
+    log "See also: http://www.webmproject.org/code/contribute/conventions/"
+    git diff --stat
+  fi
+fi
+
+rm -f ${CLEAN_FILES}
diff --git a/libs/libvpx/tools/gen_authors.sh b/libs/libvpx/tools/gen_authors.sh
new file mode 100755
index 0000000000..4cfd81ec3f
--- /dev/null
+++ b/libs/libvpx/tools/gen_authors.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Organization names are appended by hand after the generated author list.
+
+cat <<EOF
+# This file is automatically generated from the git commit history
+# by tools/gen_authors.sh.
+
+$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v corp.google)
+Google Inc.
+The Mozilla Foundation
+The Xiph.Org Foundation
+EOF
diff --git a/libs/libvpx/tools/intersect-diffs.py b/libs/libvpx/tools/intersect-diffs.py
new file mode 100755
index 0000000000..4dbafa90b7
--- /dev/null
+++ b/libs/libvpx/tools/intersect-diffs.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+"""Calculates the "intersection" of two unified diffs.
+
+Given two diffs, A and B, it finds all hunks in B that had non-context lines
+in A and prints them to stdout. This is useful to determine the hunks in B that
+are relevant to A. The resulting file can be applied with patch(1) on top of A.
+"""
+
+__author__ = "jkoleszar@google.com"
+
+import sys
+
+import diff
+
+
+def FormatDiffHunks(hunks):
+    """Re-serialize a list of DiffHunks."""
+    r = []
+    last_header = None
+    for hunk in hunks:
+        this_header = hunk.header[0:2]
+        if last_header != this_header:
+            r.extend(hunk.header)
+            last_header = this_header
+        else:
+            r.extend(hunk.header[2])
+        r.extend(hunk.lines)
+        r.append("\n")
+    return "".join(r)
+
+
+def ZipHunks(rhs_hunks, lhs_hunks):
+    """Join two hunk lists on filename."""
+    for rhs_hunk in rhs_hunks:
+        rhs_file = rhs_hunk.right.filename.split("/")[1:]
+
+        for lhs_hunk in lhs_hunks:
+            lhs_file = lhs_hunk.left.filename.split("/")[1:]
+            if lhs_file != rhs_file:
+                continue
+            yield (rhs_hunk, lhs_hunk)
+
+
+def main():
+    old_hunks = [x for x in diff.ParseDiffHunks(open(sys.argv[1], "r"))]
+    new_hunks = [x for x in diff.ParseDiffHunks(open(sys.argv[2], "r"))]
+    out_hunks = []
+
+    # Join the right hand side of the older diff with the left hand side of the
+    # newer diff.
+    for old_hunk, new_hunk in ZipHunks(old_hunks, new_hunks):
+        if new_hunk in out_hunks:
+            continue
+        old_lines = old_hunk.right
+        new_lines = new_hunk.left
+
+        # Determine if this hunk overlaps any non-context line from the other
+        for i in old_lines.delta_line_nums:
+            if i in new_lines:
+                out_hunks.append(new_hunk)
+                break
+
+    if out_hunks:
+        print FormatDiffHunks(out_hunks)
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
diff --git a/libs/libvpx/tools/lint-hunks.py b/libs/libvpx/tools/lint-hunks.py
new file mode 100755
index 0000000000..6e25d93624
--- /dev/null
+++ b/libs/libvpx/tools/lint-hunks.py
@@ -0,0 +1,144 @@
+#!/usr/bin/python
+##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+"""Performs style checking on each diff hunk."""
+import getopt
+import os
+import StringIO
+import subprocess
+import sys
+
+import diff
+
+
+SHORT_OPTIONS = "h"
+LONG_OPTIONS = ["help"]
+
+TOPLEVEL_CMD = ["git", "rev-parse", "--show-toplevel"]
+DIFF_CMD = ["git", "diff"]
+DIFF_INDEX_CMD = ["git", "diff-index", "-u", "HEAD", "--"]
+SHOW_CMD = ["git", "show"]
+CPPLINT_FILTERS = ["-readability/casting"]
+
+
+class Usage(Exception):
+    pass
+
+
+class SubprocessException(Exception):
+    def __init__(self, args):
+        msg = "Failed to execute '%s'"%(" ".join(args))
+        super(SubprocessException, self).__init__(msg)
+
+
+class Subprocess(subprocess.Popen):
+    """Adds the notion of an expected returncode to Popen."""
+
+    def __init__(self, args, expected_returncode=0, **kwargs):
+        self._args = args
+        self._expected_returncode = expected_returncode
+        super(Subprocess, self).__init__(args, **kwargs)
+
+    def communicate(self, *args, **kwargs):
+        result = super(Subprocess, self).communicate(*args, **kwargs)
+        if self._expected_returncode is not None:
+            try:
+                ok = self.returncode in self._expected_returncode
+            except TypeError:
+                ok = self.returncode == self._expected_returncode
+            if not ok:
+                raise SubprocessException(self._args)
+        return result
+
+
+def main(argv=None):
+    if argv is None:
+        argv = sys.argv
+    try:
+        try:
+            opts, args = getopt.getopt(argv[1:], SHORT_OPTIONS, LONG_OPTIONS)
+        except getopt.error, msg:
+            raise Usage(msg)
+
+        # process options (only -h/--help is defined)
+        for o, _ in opts:
+            if o in ("-h", "--help"):
+                print __doc__
+                sys.exit(0)
+
+        if args and len(args) > 1:
+            print __doc__
+            sys.exit(0)
+
+        # Find the fully qualified path to the root of the tree
+        tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE)
+        tl = tl.communicate()[0].strip()
+
+        # With a revision argument, diff <rev>^!; otherwise diff-index on HEAD.
+        if args:
+            diff_cmd = DIFF_CMD + [args[0] + "^!"]
+        else:
+            diff_cmd = DIFF_INDEX_CMD
+
+        # Build the command line to execute cpplint
+        cpplint_cmd = [os.path.join(tl, "tools", "cpplint.py"),
+                       "--filter=" + ",".join(CPPLINT_FILTERS),
+                       "-"]
+
+        # Get a list of all affected lines
+        file_affected_line_map = {}
+        p = Subprocess(diff_cmd, stdout=subprocess.PIPE)
+        stdout = p.communicate()[0]
+        for hunk in diff.ParseDiffHunks(StringIO.StringIO(stdout)):
+            filename = hunk.right.filename[2:]
+            if filename not in file_affected_line_map:
+                file_affected_line_map[filename] = set()
+            file_affected_line_map[filename].update(hunk.right.delta_line_nums)
+
+        # Run each affected file through cpplint
+        lint_failed = False
+        for filename, affected_lines in file_affected_line_map.iteritems():
+            if filename.split(".")[-1] not in ("c", "h", "cc"):
+                continue
+
+            if args:
+                # File contents come from git
+                show_cmd = SHOW_CMD + [args[0] + ":" + filename]
+                show = Subprocess(show_cmd, stdout=subprocess.PIPE)
+                lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1),
+                                  stdin=show.stdout, stderr=subprocess.PIPE)
+                lint_out = lint.communicate()[1]
+            else:
+                # File contents come from the working tree
+                lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1),
+                                  stdin=subprocess.PIPE, stderr=subprocess.PIPE)
+                stdin = open(os.path.join(tl, filename)).read()
+                lint_out = lint.communicate(stdin)[1]
+
+            for line in lint_out.split("\n"):
+                fields = line.split(":")
+                if fields[0] != "-":
+                    continue
+                warning_line_num = int(fields[1])
+                if warning_line_num in affected_lines:
+                    print "%s:%d:%s"%(filename, warning_line_num,
+                                      ":".join(fields[2:]))
+                    lint_failed = True
+
+        # Return 1 if any lint warning fell on an affected line
+        if lint_failed:
+            return 1
+
+    except Usage, err:
+        print >>sys.stderr, err
+        print >>sys.stderr, "for help use --help"
+        return 2
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/libs/libvpx/tools/vpx-astyle.sh b/libs/libvpx/tools/vpx-astyle.sh
new file mode 100755
index 0000000000..6340426bdc
--- /dev/null
+++ b/libs/libvpx/tools/vpx-astyle.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+set -e
+astyle --style=java --indent=spaces=2 --indent-switches\
+       --min-conditional-indent=0 \
+       --pad-oper --pad-header --unpad-paren \
+       --align-pointer=name \
+       --indent-preprocessor --convert-tabs --indent-labels \
+       --suffix=none --quiet --max-instatement-indent=80 "$@"
+# Disabled, too greedy?
+#sed -i 's;[[:space:]]\{1,\}\[;[;g' "$@"
+
+sed_i() {
+  # Some seds insist on an explicit (here empty) argument after -i; probe for that error and adapt.
+  if sed -i 2>&1 | grep -q 'requires an argument'; then
+    sed -i '' "$@"
+  else
+    sed -i "$@"
+  fi
+}
+# In-place tidy-up: no blanks before , ; ++; --;, no comma before }, a space after //, one-space indent on access labels, no trailing blanks.
+sed_i -e 's/[[:space:]]\{1,\}\([,;]\)/\1/g' \
+      -e 's/[[:space:]]\{1,\}\([+-]\{2\};\)/\1/g' \
+      -e 's/,[[:space:]]*}/}/g' \
+      -e 's;//\([^/[:space:]].*$\);// \1;g' \
+      -e 's/^\(public\|private\|protected\):$/ \1:/g' \
+      -e 's/[[:space:]]\{1,\}$//g' \
+      "$@"
diff --git a/libs/libvpx/tools/wrap-commit-msg.py b/libs/libvpx/tools/wrap-commit-msg.py
new file mode 100755
index 0000000000..d5b4b046b1
--- /dev/null
+++ b/libs/libvpx/tools/wrap-commit-msg.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+"""Wraps paragraphs of text, preserving manual formatting
+
+This is like fold(1), but has the special convention of not modifying lines
+that start with whitespace. This allows you to intersperse blocks with
+special formatting, like code blocks, with written prose. The prose will
+be wordwrapped, and the manual formatting will be preserved.
+
+ * This won't handle the case of a bulleted (or ordered) list specially, so
+   manual wrapping must be done.
+
+Occasionally it's useful to put something with explicit formatting that
+doesn't look at all like a block of text inline.
+
+  indicator = has_leading_whitespace(line);
+  if (indicator)
+    preserve_formatting(line);
+
+The intent is that this docstring would make it through the transform
+and still be legible and presented as it is in the source. If additional
+cases are handled, update this doc to describe the effect.
+"""
+
+__author__ = "jkoleszar@google.com"
+import textwrap
+import sys
+# Word-wrap one paragraph with textwrap.fill() and end it with a newline; empty input yields "".
+def wrap(text):
+    if text:
+        return textwrap.fill(text, break_long_words=False) + '\n'  # over-long words are kept whole
+    return ""
+
+# Re-wrap fileobj: lines without leading whitespace are gathered and wrapped; all other lines pass through verbatim.
+def main(fileobj):
+    text = ""  # paragraph accumulated so far, not yet wrapped
+    output = ""  # final document, built up in order
+    while True:
+        line = fileobj.readline()
+        if not line:  # readline() returns "" only at end of file
+            break
+
+        if line.lstrip() == line:  # no leading whitespace: part of a paragraph
+            text += line
+        else:  # indented or blank line: keep as-is
+            output += wrap(text)  # flush the pending paragraph first
+            text=""
+            output += line
+    output += wrap(text)  # flush whatever paragraph is still pending at EOF
+
+    # Replace the file or write to stdout.
+    if fileobj == sys.stdin:
+        fileobj = sys.stdout
+    else:
+        fileobj.seek(0)  # rewind and empty the file so the write below replaces its contents
+        fileobj.truncate(0)
+    fileobj.write(output)
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        main(open(sys.argv[1], "r+"))  # "r+": main() reads the file, then rewrites it in place
+    else:
+        main(sys.stdin)  # filter mode: main() writes the result to stdout
diff --git a/libs/libvpx/tools_common.c b/libs/libvpx/tools_common.c
new file mode 100644
index 0000000000..20b259ca94
--- /dev/null
+++ b/libs/libvpx/tools_common.c
@@ -0,0 +1,502 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./tools_common.h"
+
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+#include "vpx/vp8cx.h"
+#endif
+
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
+#include "vpx/vp8dx.h"
+#endif
+
+#if defined(_WIN32) || defined(__OS2__)
+#include <io.h>
+#include <fcntl.h>
+
+#ifdef __OS2__
+#define _setmode    setmode
+#define _fileno     fileno
+#define _O_BINARY   O_BINARY
+#endif
+#endif
+/* Prints an optional "label: " prefix, the printf-style message and a newline to stderr; must expand inside a variadic function whose format parameter is named fmt. */
+#define LOG_ERROR(label) do {\
+  const char *l = label;\
+  va_list ap;\
+  va_start(ap, fmt);\
+  if (l)\
+    fprintf(stderr, "%s: ", l);\
+  vfprintf(stderr, fmt, ap);\
+  fprintf(stderr, "\n");\
+  va_end(ap);\
+} while (0)
+
+/* Puts stream into binary mode on Windows and OS/2 (nothing is done elsewhere) and returns stream. */
+FILE *set_binary_mode(FILE *stream) {
+  (void)stream;  /* keeps stream referenced when the platform branch below is compiled out */
+#if defined(_WIN32) || defined(__OS2__)
+  _setmode(_fileno(stream), _O_BINARY);
+#endif
+  return stream;
+}
+/* Prints the printf-style message to stderr (no prefix), then calls usage_exit(). */
+void die(const char *fmt, ...) {
+  LOG_ERROR(NULL);
+  usage_exit();
+}
+/* Prints "Fatal: " followed by the printf-style message to stderr, then exits with EXIT_FAILURE. */
+void fatal(const char *fmt, ...) {
+  LOG_ERROR("Fatal");
+  exit(EXIT_FAILURE);
+}
+/* Prints "Warning: " followed by the printf-style message to stderr and returns. */
+void warn(const char *fmt, ...) {
+  LOG_ERROR("Warning");
+}
+/* Prints "s: <codec error>" and, when available, the error detail to stdout, then exits with EXIT_FAILURE. */
+void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
+  const char *detail = vpx_codec_error_detail(ctx);
+
+  printf("%s: %s\n", s, vpx_codec_error(ctx));
+  if (detail)  /* detail is optional; print it only when the codec supplied one */
+    printf("    %s\n", detail);
+  exit(EXIT_FAILURE);
+}
+/* Reads the three planes of one frame from input_ctx into yuv_frame; returns nonzero if any fread() came up short. */
+int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) {
+  FILE *f = input_ctx->file;
+  struct FileTypeDetectionBuffer *detect = &input_ctx->detect;
+  int plane = 0;
+  int shortread = 0;
+  const int bytespp = (yuv_frame->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;  /* bytes per sample */
+
+  for (plane = 0; plane < 3; ++plane) {
+    uint8_t *ptr;
+    const int w = vpx_img_plane_width(yuv_frame, plane);
+    const int h = vpx_img_plane_height(yuv_frame, plane);
+    int r;
+
+    /* Determine the correct plane based on the image format. The for-loop
+     * always counts in Y,U,V order, but this may not match the order of
+     * the data on disk.
+     */
+    switch (plane) {
+      case 1:
+        ptr = yuv_frame->planes[
+            yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_V : VPX_PLANE_U];
+        break;
+      case 2:
+        ptr = yuv_frame->planes[
+            yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_U : VPX_PLANE_V];
+        break;
+      default:
+        ptr = yuv_frame->planes[plane];
+    }
+
+    for (r = 0; r < h; ++r) {
+      size_t needed = w * bytespp;  /* bytes still missing from this row */
+      size_t buf_position = 0;  /* bytes of this row already filled from detect->buf */
+      const size_t left = detect->buf_read - detect->position;  /* detect->buf bytes not yet consumed */
+      if (left > 0) {
+        const size_t more = (left < needed) ? left : needed;
+        memcpy(ptr, detect->buf + detect->position, more);  /* drain detect->buf before reading the file */
+        buf_position = more;
+        needed -= more;
+        detect->position += more;
+      }
+      if (needed > 0) {  /* the remainder of the row comes from the file */
+        shortread |= (fread(ptr + buf_position, 1, needed, f) < needed);
+      }
+
+      ptr += yuv_frame->stride[plane];  /* NOTE(review): indexed by the loop plane even when U/V were swapped for YV12 -- correct only if both chroma strides are equal; confirm */
+    }
+  }
+
+  return shortread;
+}
+
+#if CONFIG_ENCODERS
+/* Encoder table: one {name, fourcc, interface getter} entry per CONFIG_*_ENCODER enabled at build time. */
+static const VpxInterface vpx_encoders[] = {
+#if CONFIG_VP10_ENCODER
+  {"vp10", VP10_FOURCC, &vpx_codec_vp10_cx},
+#endif
+
+#if CONFIG_VP8_ENCODER
+  {"vp8", VP8_FOURCC, &vpx_codec_vp8_cx},
+#endif
+
+#if CONFIG_VP9_ENCODER
+  {"vp9", VP9_FOURCC, &vpx_codec_vp9_cx},
+#endif
+};
+/* Returns the number of entries in vpx_encoders. */
+int get_vpx_encoder_count(void) {
+  return sizeof(vpx_encoders) / sizeof(vpx_encoders[0]);
+}
+/* Returns the address of vpx_encoders[i]; i is not range-checked, so callers must keep it below get_vpx_encoder_count(). */
+const VpxInterface *get_vpx_encoder_by_index(int i) {
+  return &vpx_encoders[i];
+}
+/* Returns the first encoder whose name equals name exactly (strcmp), or NULL when none matches. */
+const VpxInterface *get_vpx_encoder_by_name(const char *name) {
+  int i;
+
+  for (i = 0; i < get_vpx_encoder_count(); ++i) {
+    const VpxInterface *encoder = get_vpx_encoder_by_index(i);
+    if (strcmp(encoder->name, name) == 0)
+      return encoder;
+  }
+
+  return NULL;
+}
+
+#endif  // CONFIG_ENCODERS
+
+#if CONFIG_DECODERS
+/* Decoder table: one {name, fourcc, interface getter} entry per CONFIG_*_DECODER enabled at build time. */
+static const VpxInterface vpx_decoders[] = {
+#if CONFIG_VP8_DECODER
+  {"vp8", VP8_FOURCC, &vpx_codec_vp8_dx},
+#endif
+
+#if CONFIG_VP9_DECODER
+  {"vp9", VP9_FOURCC, &vpx_codec_vp9_dx},
+#endif
+
+#if CONFIG_VP10_DECODER
+  {"vp10", VP10_FOURCC, &vpx_codec_vp10_dx},
+#endif
+};
+/* Returns the number of entries in vpx_decoders. */
+int get_vpx_decoder_count(void) {
+  return sizeof(vpx_decoders) / sizeof(vpx_decoders[0]);
+}
+/* Returns the address of vpx_decoders[i]; i is not range-checked, so callers must keep it below get_vpx_decoder_count(). */
+const VpxInterface *get_vpx_decoder_by_index(int i) {
+  return &vpx_decoders[i];
+}
+/* Returns the first decoder whose name equals name exactly (strcmp), or NULL when none matches. */
+const VpxInterface *get_vpx_decoder_by_name(const char *name) {
+  int i;
+
+  for (i = 0; i < get_vpx_decoder_count(); ++i) {
+     const VpxInterface *const decoder = get_vpx_decoder_by_index(i);
+     if (strcmp(decoder->name, name) == 0)
+       return decoder;
+  }
+
+  return NULL;
+}
+/* Returns the first decoder whose fourcc field equals fourcc, or NULL when none matches. */
+const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc) {
+  int i;
+
+  for (i = 0; i < get_vpx_decoder_count(); ++i) {
+    const VpxInterface *const decoder = get_vpx_decoder_by_index(i);
+    if (decoder->fourcc == fourcc)
+      return decoder;
+  }
+
+  return NULL;
+}
+
+#endif  // CONFIG_DECODERS
+/* Returns d_w for plane 0 or unsubsampled chroma, otherwise (d_w + 1) >> x_chroma_shift. */
+// TODO(dkovalev): move this function to vpx_image.{c, h}, so it will be part
+// of vpx_image_t support
+int vpx_img_plane_width(const vpx_image_t *img, int plane) {
+  if (plane > 0 && img->x_chroma_shift > 0)
+    return (img->d_w + 1) >> img->x_chroma_shift;
+  else
+    return img->d_w;
+}
+/* Returns d_h for plane 0 or unsubsampled chroma, otherwise (d_h + 1) >> y_chroma_shift. */
+int vpx_img_plane_height(const vpx_image_t *img, int plane) {
+  if (plane > 0 &&  img->y_chroma_shift > 0)
+    return (img->d_h + 1) >> img->y_chroma_shift;
+  else
+    return img->d_h;
+}
+/* Writes planes 0..2 of img to file one row at a time, skipping stride padding; fwrite() results are not checked. */
+void vpx_img_write(const vpx_image_t *img, FILE *file) {
+  int plane;
+
+  for (plane = 0; plane < 3; ++plane) {
+    const unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int w = vpx_img_plane_width(img, plane) *
+        ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);  /* row length in bytes, not samples */
+    const int h = vpx_img_plane_height(img, plane);
+    int y;
+
+    for (y = 0; y < h; ++y) {
+      fwrite(buf, 1, w, file);
+      buf += stride;  /* advance to the start of the next row */
+    }
+  }
+}
+/* Fills planes 0..2 of img from file one row at a time; returns 1 on success and 0 at the first short read. */
+int vpx_img_read(vpx_image_t *img, FILE *file) {
+  int plane;
+
+  for (plane = 0; plane < 3; ++plane) {
+    unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int w = vpx_img_plane_width(img, plane) *
+        ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);  /* row length in bytes, not samples */
+    const int h = vpx_img_plane_height(img, plane);
+    int y;
+
+    for (y = 0; y < h; ++y) {
+      if (fread(buf, 1, w, file) != (size_t)w)
+        return 0;  /* short read: rows already read stay in img */
+      buf += stride;
+    }
+  }
+
+  return 1;
+}
+/* Returns 10 * log10(samples * peak * peak / sse), capped at kMaxPSNR (100.0); sse <= 0 returns the cap. */
+// TODO(dkovalev) change sse_to_psnr signature: double -> int64_t
+double sse_to_psnr(double samples, double peak, double sse) {
+  static const double kMaxPSNR = 100.0;
+
+  if (sse > 0.0) {
+    const double psnr = 10.0 * log10(samples * peak * peak / sse);
+    return psnr > kMaxPSNR ? kMaxPSNR : psnr;
+  } else {
+    return kMaxPSNR;  /* avoids dividing by a zero (or negative) sse */
+  }
+}
+
+// TODO(debargha): Consolidate the functions below into a separate file.
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                               int input_shift) {
+  // Each 16-bit sample becomes (sample << input_shift) + offset; note the offset is 1 less than half.
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt || input_shift < 0) {
+    fatal("Unsupported image conversion");  /* geometry and format must match; shift must be >= 0 */
+  }
+  switch (src->fmt) {  /* only the 16-bit planar formats listed here are accepted */
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+    case VPX_IMG_FMT_I44016:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {  /* planes 1 and 2 use the chroma-subsampled dimensions */
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src =
+          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      uint16_t *p_dst =
+          (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+      for (x = 0; x < w; x++)
+        *p_dst++ = (*p_src++ << input_shift) + offset;
+    }
+  }
+}
+/* Widens 8-bit src into 16-bit dst: each sample becomes (sample << input_shift) + offset. */
+static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                              int input_shift) {
+  // Note the offset is 1 less than half.
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+      input_shift < 0) {
+    fatal("Unsupported image conversion");  /* dst must be the high-bit-depth counterpart of src */
+  }
+  switch (src->fmt) {  /* only the 8-bit planar formats listed here are accepted */
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I440:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {  /* planes 1 and 2 use the chroma-subsampled dimensions */
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
+      uint16_t *p_dst =
+          (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+      for (x = 0; x < w; x++) {
+        *p_dst++ = (*p_src++ << input_shift) + offset;
+      }
+    }
+  }
+}
+/* Dispatches on src->fmt: sources with VPX_IMG_FMT_HIGHBITDEPTH set go to highbd_img_upshift(), others to lowbd_img_upshift(). */
+void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+                     int input_shift) {
+  if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    highbd_img_upshift(dst, src, input_shift);
+  } else {
+    lowbd_img_upshift(dst, src, input_shift);
+  }
+}
+/* Copies 16-bit src into 8-bit dst, keeping only the low 8 bits of every sample (no shift, no rounding). */
+void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) {
+  int plane;
+  if (dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH != src->fmt ||
+      dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift) {
+    fatal("Unsupported image conversion");  /* src must be the high-bit-depth counterpart of dst */
+  }
+  switch (dst->fmt) {  /* only the 8-bit planar formats listed here are accepted */
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I440:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {  /* planes 1 and 2 use the chroma-subsampled dimensions */
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src =
+          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+      for (x = 0; x < w; x++) {
+        *p_dst++ = (uint8_t)(*p_src++);
+      }
+    }
+  }
+}
+/* Writes every 16-bit sample of src, shifted right by down_shift, into the 16-bit dst of identical format. */
+static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+                                 int down_shift) {
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      dst->fmt != src->fmt || down_shift < 0) {
+    fatal("Unsupported image conversion");  /* geometry and format must match; shift must be >= 0 */
+  }
+  switch (src->fmt) {  /* only the 16-bit planar formats listed here are accepted */
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+    case VPX_IMG_FMT_I44016:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {  /* planes 1 and 2 use the chroma-subsampled dimensions */
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src =
+          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      uint16_t *p_dst =
+          (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+      for (x = 0; x < w; x++)
+        *p_dst++ = *p_src++ >> down_shift;
+    }
+  }
+}
+/* Writes every 16-bit sample of src, shifted right by down_shift, into 8-bit dst; the store narrows to uint8_t. */
+static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+                                int down_shift) {
+  int plane;
+  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+      dst->x_chroma_shift != src->x_chroma_shift ||
+      dst->y_chroma_shift != src->y_chroma_shift ||
+      src->fmt != dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+      down_shift < 0) {
+    fatal("Unsupported image conversion");  /* src must be the high-bit-depth counterpart of dst */
+  }
+  switch (dst->fmt) {  /* only the 8-bit planar formats listed here are accepted */
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I440:
+      break;
+    default:
+      fatal("Unsupported image conversion");
+      break;
+  }
+  for (plane = 0; plane < 3; plane++) {
+    int w = src->d_w;
+    int h = src->d_h;
+    int x, y;
+    if (plane) {  /* planes 1 and 2 use the chroma-subsampled dimensions */
+      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (y = 0; y < h; y++) {
+      uint16_t *p_src =
+          (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+      uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+      for (x = 0; x < w; x++) {
+        *p_dst++ = *p_src++ >> down_shift;
+      }
+    }
+  }
+}
+/* Dispatches on dst->fmt: destinations with VPX_IMG_FMT_HIGHBITDEPTH set go to highbd_img_downshift(), others to lowbd_img_downshift(). */
+void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+                       int down_shift) {
+  if (dst->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    highbd_img_downshift(dst, src, down_shift);
+  } else {
+    lowbd_img_downshift(dst, src, down_shift);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/tools_common.h b/libs/libvpx/tools_common.h
new file mode 100644
index 0000000000..98347b6f27
--- /dev/null
+++ b/libs/libvpx/tools_common.h
@@ -0,0 +1,164 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TOOLS_COMMON_H_
+#define TOOLS_COMMON_H_
+
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_image.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/msvc.h"
+
+#if CONFIG_ENCODERS
+#include "./y4minput.h"
+#endif
+
+#if defined(_MSC_VER)
+/* MSVS uses _f{seek,tell}i64. */
+#define fseeko _fseeki64
+#define ftello _ftelli64
+#elif defined(_WIN32)
+/* MinGW uses f{seek,tell}o64 for large files. */
+#define fseeko fseeko64
+#define ftello ftello64
+#endif  /* _WIN32 */
+
+#if CONFIG_OS_SUPPORT
+#if defined(_MSC_VER)
+#include <io.h>  /* NOLINT */
+#define isatty   _isatty
+#define fileno   _fileno
+#else
+#include <unistd.h>  /* NOLINT */
+#endif  /* _MSC_VER */
+#endif  /* CONFIG_OS_SUPPORT */
+
+/* Use 32-bit file operations in WebM file format when building ARM
+ * executables (.axf) with RVCT. */
+#if !CONFIG_OS_SUPPORT
+#define fseeko fseek
+#define ftello ftell
+#endif  /* CONFIG_OS_SUPPORT */
+
+#define LITERALU64(hi, lo) ((((uint64_t)(hi)) << 32) | (lo))  /* hi:lo as one 64-bit value; args parenthesized so expressions are safe */
+
+#ifndef PATH_MAX
+#define PATH_MAX 512
+#endif
+
+#define IVF_FRAME_HDR_SZ (4 + 8)  /* 4 byte size + 8 byte timestamp */
+#define IVF_FILE_HDR_SZ 32
+
+#define RAW_FRAME_HDR_SZ sizeof(uint32_t)
+
+#define VP8_FOURCC 0x30385056
+#define VP9_FOURCC 0x30395056
+#define VP10_FOURCC 0x303a5056
+
+enum VideoFileType {
+  FILE_TYPE_RAW,
+  FILE_TYPE_IVF,
+  FILE_TYPE_Y4M,
+  FILE_TYPE_WEBM
+};
+
+struct FileTypeDetectionBuffer {
+  char buf[4];  /* NOTE(review): presumably the leading input bytes sniffed for file-type detection -- confirm against the caller that fills it */
+  size_t buf_read;  /* count of valid bytes in buf; read_yuv_frame() treats buf_read - position as still unconsumed */
+  size_t position;  /* offset in buf of the next byte read_yuv_frame() will copy out */
+};
+
+struct VpxRational {
+  int numerator;
+  int denominator;
+};
+
+struct VpxInputContext {
+  const char *filename;
+  FILE *file;  /* stream that read_yuv_frame() reads frame data from */
+  int64_t length;
+  struct FileTypeDetectionBuffer detect;  /* drained by read_yuv_frame() before it reads from file */
+  enum VideoFileType file_type;
+  uint32_t width;
+  uint32_t height;
+  struct VpxRational pixel_aspect_ratio;
+  vpx_img_fmt_t fmt;
+  vpx_bit_depth_t bit_depth;
+  int only_i420;
+  uint32_t fourcc;
+  struct VpxRational framerate;
+#if CONFIG_ENCODERS
+  y4m_input y4m;  /* member exists only when CONFIG_ENCODERS is set, so the struct layout depends on the build */
+#endif
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__)
+#define VPX_NO_RETURN __attribute__((noreturn))
+#else
+#define VPX_NO_RETURN
+#endif
+
+/* Sets a stdio stream into binary mode */
+FILE *set_binary_mode(FILE *stream);
+
+void die(const char *fmt, ...) VPX_NO_RETURN;
+void fatal(const char *fmt, ...) VPX_NO_RETURN;
+void warn(const char *fmt, ...);
+
+void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN;
+
+/* The tool including this file must define usage_exit() */
+void usage_exit(void) VPX_NO_RETURN;
+
+#undef VPX_NO_RETURN
+
+int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
+
+typedef struct VpxInterface {
+  const char *const name;  /* codec name compared by the get_vpx_*_by_name() lookups */
+  const uint32_t fourcc;  /* code compared by get_vpx_decoder_by_fourcc() */
+  vpx_codec_iface_t *(*const codec_interface)(void);  /* (void): a real prototype, so C checks calls just as C++ already does */
+} VpxInterface;
+
+int get_vpx_encoder_count(void);
+const VpxInterface *get_vpx_encoder_by_index(int i);
+const VpxInterface *get_vpx_encoder_by_name(const char *name);
+
+int get_vpx_decoder_count(void);
+const VpxInterface *get_vpx_decoder_by_index(int i);
+const VpxInterface *get_vpx_decoder_by_name(const char *name);
+const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc);
+
+// TODO(dkovalev): move this function to vpx_image.{c, h}, so it will be part
+// of vpx_image_t support
+int vpx_img_plane_width(const vpx_image_t *img, int plane);
+int vpx_img_plane_height(const vpx_image_t *img, int plane);
+void vpx_img_write(const vpx_image_t *img, FILE *file);
+int vpx_img_read(vpx_image_t *img, FILE *file);
+
+double sse_to_psnr(double samples, double peak, double mse);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, int input_shift);
+void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift);
+void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src);
+#endif
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  // TOOLS_COMMON_H_
diff --git a/libs/libvpx/usage.dox b/libs/libvpx/usage.dox
new file mode 100644
index 0000000000..88235202d1
--- /dev/null
+++ b/libs/libvpx/usage.dox
@@ -0,0 +1,136 @@
+/*!\page usage Usage
+
+    The vpx multi-format codec SDK provides a unified interface amongst its
+    supported codecs. This abstraction allows applications using this SDK to
+    easily support multiple video formats with minimal code duplication or
+    "special casing." This section describes the interface common to all codecs.
+    For codec-specific details, see the \ref codecs page.
+
+    The following sections are common to all codecs:
+    - \ref usage_types
+    - \ref usage_features
+    - \ref usage_init
+    - \ref usage_errors
+
+    For more information on decoder and encoder specific usage, see the
+    following pages:
+    \if decoder
+    \li \subpage usage_decode
+    \endif
+    \if encoder
+    \li \subpage usage_encode
+    \endif
+
+    \section usage_types Important Data Types
+    There are two important data structures to consider in this interface.
+
+    \subsection usage_ctxs Contexts
+    A context is a storage area allocated by the calling application that the
+    codec may write into to store details about a single instance of that codec.
+    Most of the context is implementation specific, and thus opaque to the
+    application. The context structure as seen by the application is of fixed
+    size, and thus can be allocated with automatic storage or dynamically
+    on the heap.
+
+    Most operations require an initialized codec context. Codec context
+    instances are codec specific. That is, the codec to be used for the encoded
+    video must be known at initialization time. See #vpx_codec_ctx_t for further
+    information.
+
+    \subsection usage_ifaces Interfaces
+    A codec interface is an opaque structure that controls how function calls
+    into the generic interface are dispatched to their codec-specific
+    implementations. Applications \ref MUSTNOT attempt to examine or override
+    this storage, as it contains internal implementation details likely to
+    change from release to release.
+
+    Each supported codec will expose an interface structure to the application
+    as an <code>extern</code> reference to a structure of the incomplete type
+    #vpx_codec_iface_t.
+
+    \section usage_features Features
+    Several "features" are defined that are optionally implemented by codec
+    algorithms. Indeed, the same algorithm may support different features on
+    different platforms. The purpose of defining these features is that when
+    they are implemented, they conform to a common interface. The features, or
+    capabilities, of an algorithm can be queried from its interface by using
+    the vpx_codec_get_caps() method. Attempts to invoke features not supported
+    by an algorithm will generally result in #VPX_CODEC_INCAPABLE.
+
+    \if decoder
+    Currently defined decoder features include:
+    - \ref usage_cb
+    - \ref usage_postproc
+    \endif
+
+    \section usage_init Initialization
+    To initialize a codec instance, the address of the codec context
+    and interface structures are passed to an initialization function. Depending
+    on the \ref usage_features that the codec supports, the codec could be
+    initialized in different modes.
+
+    To prevent cases of confusion where the ABI of the library changes,
+    the ABI is versioned. The ABI version number must be passed at
+    initialization time to ensure the application is using a header file that
+    matches the library. The current ABI version number is stored in the
+    preprocessor macros #VPX_CODEC_ABI_VERSION, #VPX_ENCODER_ABI_VERSION, and
+    #VPX_DECODER_ABI_VERSION. For convenience, each initialization function has
+    a wrapper macro that inserts the correct version number. These macros are
+    named like the initialization methods, but without the _ver suffix.
+
+
+    The available initialization methods are:
+    \if encoder
+    \li #vpx_codec_enc_init (calls vpx_codec_enc_init_ver())
+    \li #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver())
+    \endif
+    \if decoder
+    \li #vpx_codec_dec_init (calls vpx_codec_dec_init_ver())
+    \endif
+
+
+    \section usage_errors Error Handling
+    Almost all codec functions return an error status of type #vpx_codec_err_t.
+    The semantics of how each error condition should be processed is clearly
+    defined in the definitions of each enumerated value. Error values can be
+    converted into ASCII strings with the vpx_codec_error() and
+    vpx_codec_err_to_string() methods. The difference between these two methods is
+    that vpx_codec_error() returns the error state from an initialized context,
+    whereas vpx_codec_err_to_string() can be used in cases where an error occurs
+    outside any context. The enumerated value returned from the last call can be
+    retrieved from the <code>err</code> member of the decoder context as well.
+    Finally, more detailed error information may be obtained by using
+    the vpx_codec_error_detail() method. Not all errors produce detailed error
+    information.
+
+    In addition to error information, the codec library's build configuration
+    is available at runtime on some platforms. This information can be returned
+    by calling vpx_codec_build_config(), and is formatted as a base64 coded string
+    (composed of characters in the set [a-zA-Z0-9+/]). This information is not
+    useful to an application at runtime, but may be of use to vpx for support.
+
+
+    \section usage_deadline Deadline
+    Both the encoding and decoding functions have a <code>deadline</code>
+    parameter. This parameter indicates the amount of time, in microseconds
+    (us), that the application wants the codec to spend processing before
+    returning. This is a soft deadline -- that is, the semantics of the
+    requested operation take precedence over meeting the deadline. If, for
+    example, an application sets a <code>deadline</code> of 1000us, and the
+    frame takes 2000us to decode, the call to vpx_codec_decode() will return
+    after 2000us. In this case the deadline is not met, but the semantics of the
+    function are preserved. If, for the same frame, an application instead sets
+    a <code>deadline</code> of 5000us, the decoder will see that it has 3000us
+    remaining in its time slice when decoding completes. It could then choose to
+    run a set of \ref usage_postproc filters, and perhaps would return after
+    4000us (instead of the allocated 5000us). In this case the deadline is met,
+    and the semantics of the call are preserved, as before.
+
+    The special value <code>0</code> is reserved to represent an infinite
+    deadline. In this case, the codec will perform as much processing as
+    possible to yield the highest quality frame.
+
+    By convention, the value <code>1</code> is used to mean "return as fast as
+    possible."
+
+*/
diff --git a/libs/libvpx/usage_cx.dox b/libs/libvpx/usage_cx.dox
new file mode 100644
index 0000000000..92b0d34ef4
--- /dev/null
+++ b/libs/libvpx/usage_cx.dox
@@ -0,0 +1,13 @@
+/*! \page usage_encode Encoding
+
+    The vpx_codec_encode() function is at the core of the encode loop. It
+    processes raw images passed by the application, producing packets of
+    compressed data. The <code>deadline</code> parameter controls the amount
+    of time in microseconds the encoder should spend working on the frame. For
+    more information on the <code>deadline</code> parameter, see
+    \ref usage_deadline.
+
+
+    \ref samples
+
+*/
diff --git a/libs/libvpx/usage_dx.dox b/libs/libvpx/usage_dx.dox
new file mode 100644
index 0000000000..883ce24926
--- /dev/null
+++ b/libs/libvpx/usage_dx.dox
@@ -0,0 +1,62 @@
+/*! \page usage_decode Decoding
+
+    The vpx_codec_decode() function is at the core of the decode loop. It
+    processes packets of compressed data passed by the application, producing
+    decoded images. The decoder expects packets to comprise exactly one image
+    frame of data. Packets \ref MUST be passed in decode order. If the
+    application wishes to associate some data with the frame, the
+    <code>user_priv</code> member may be set. The <code>deadline</code>
+    parameter controls the amount of time in microseconds the decoder should
+    spend working on the frame. This is typically used to support adaptive
+    \ref usage_postproc based on the amount of free CPU time. For more
+    information on the <code>deadline</code> parameter, see \ref usage_deadline.
+
+    \ref samples
+
+
+    \section usage_cb Callback Based Decoding
+    There are two methods for the application to access decoded frame data. Some
+    codecs support asynchronous (callback-based) decoding \ref usage_features
+    that allow the application to register a callback to be invoked by the
+    decoder when decoded data becomes available. Decoders are not required to
+    support this feature, however. Like all \ref usage_features, support can be
+    determined by calling vpx_codec_get_caps(). Callbacks are available in both
+    frame-based and slice-based variants. Frame based callbacks conform to the
+    signature of #vpx_codec_put_frame_cb_fn_t and are invoked once the entire
+    frame has been decoded. Slice based callbacks conform to the signature of
+    #vpx_codec_put_slice_cb_fn_t and are invoked after a subsection of the frame
+    is decoded. For example, a slice callback could be issued for each
+    macroblock row. However, the number and size of slices to return is
+    implementation specific. Also, the image data passed in a slice callback is
+    not necessarily in the same memory segment as the data will be when it is
+    assembled into a full frame. For this reason, the application \ref MUST
+    examine the rectangles that describe what data is valid to access and what
+    data has been updated in this call. For all their additional complexity,
+    slice based decoding callbacks provide substantial speed gains to the
+    overall application in some cases, due to improved cache behavior.
+
+
+    \section usage_frame_iter Frame Iterator Based Decoding
+    If the codec does not support callback based decoding, or the application
+    chooses not to make use of that feature, decoded frames are made available
+    through the vpx_codec_get_frame() iterator. The application initializes the
+    iterator storage (of type #vpx_codec_iter_t) to NULL, then calls
+    vpx_codec_get_frame() repeatedly until it returns NULL, indicating that all
+    images have been returned. This process may result in zero, one, or many
+    frames that are ready for display, depending on the codec.
+
+
+    \section usage_postproc Postprocessing
+    Postprocessing is a process that is applied after a frame is decoded to
+    enhance the image's appearance by removing artifacts introduced in the
+    compression process. It is not required to properly decode the frame, and
+    is generally done only when there is enough spare CPU time to execute
+    the required filters. Codecs may support a number of different
+    postprocessing filters, and the available filters may differ from platform
+    to platform. Embedded devices often do not have enough CPU to implement
+    postprocessing in software. The filter selection is generally handled
+    automatically by the codec, depending on the amount of time remaining before
+    hitting the user-specified \ref usage_deadline after decoding the frame.
+
+
+*/
diff --git a/libs/libvpx/video_common.h b/libs/libvpx/video_common.h
new file mode 100644
index 0000000000..44b27a8390
--- /dev/null
+++ b/libs/libvpx/video_common.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VIDEO_COMMON_H_
+#define VIDEO_COMMON_H_
+
+#include "./tools_common.h"
+
+typedef struct {  // Basic stream parameters used by the video reader/writer.
+  uint32_t codec_fourcc;  // FourCC identifying the codec of the stream.
+  int frame_width;  // Frame width (presumably in pixels -- TODO confirm).
+  int frame_height;  // Frame height (presumably in pixels -- TODO confirm).
+  struct VpxRational time_base;  // Time base as numerator/denominator.
+} VpxVideoInfo;
+
+#endif  // VIDEO_COMMON_H_
diff --git a/libs/libvpx/video_reader.c b/libs/libvpx/video_reader.c
new file mode 100644
index 0000000000..39c7edba1e
--- /dev/null
+++ b/libs/libvpx/video_reader.c
@@ -0,0 +1,83 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "./ivfdec.h"
+#include "./video_reader.h"
+
+#include "vpx_ports/mem_ops.h"
+
+static const char *const kIVFSignature = "DKIF";
+
+struct VpxVideoReaderStruct {  // State behind the opaque VpxVideoReader.
+  VpxVideoInfo info;  // Stream parameters parsed from the IVF file header.
+  FILE *file;  // Input file; closed by vpx_video_reader_close().
+  uint8_t *buffer;  // Frame buffer handed to ivf_read_frame(); freed on close.
+  size_t buffer_size;  // Passed to ivf_read_frame() alongside buffer.
+  size_t frame_size;  // Size reported by vpx_video_reader_get_frame().
+};
+
+VpxVideoReader *vpx_video_reader_open(const char *filename) {
+  // Opens an IVF file, validates its 32-byte header and returns a new reader,
+  // or NULL on failure. The file is closed again on every failure path.
+  char header[32];
+  VpxVideoReader *reader = NULL;
+  FILE *const file = fopen(filename, "rb");
+  if (!file)
+    return NULL;  // Can't open file
+
+  // Allocate only for a full header with the "DKIF" signature and version 0.
+  if (fread(header, 1, 32, file) == 32 &&
+      memcmp(kIVFSignature, header, 4) == 0 &&
+      mem_get_le16(header + 4) == 0)
+    reader = calloc(1, sizeof(*reader));
+
+  if (!reader) {
+    fclose(file);  // Bad header or out of memory: don't leak the handle
+    return NULL;
+  }
+
+  reader->file = file;
+  reader->info.codec_fourcc = mem_get_le32(header + 8);
+  reader->info.frame_width = mem_get_le16(header + 12);
+  reader->info.frame_height = mem_get_le16(header + 14);
+  reader->info.time_base.numerator = mem_get_le32(header + 16);
+  reader->info.time_base.denominator = mem_get_le32(header + 20);
+
+  return reader;
+}
+
+void vpx_video_reader_close(VpxVideoReader *reader) {
+  // Releases the file, the frame buffer and the reader itself; NULL is a no-op.
+  if (!reader) return;
+  fclose(reader->file);
+  free(reader->buffer);
+  free(reader);
+}
+
+int vpx_video_reader_read_frame(VpxVideoReader *reader) {
+  return ivf_read_frame(reader->file, &reader->buffer, &reader->frame_size,
+                        &reader->buffer_size) == 0;  // 1 iff the call gave 0
+}
+
+const uint8_t *vpx_video_reader_get_frame(VpxVideoReader *reader,
+                                          size_t *size) {
+  // size is optional: when non-NULL it receives the stored frame size.
+  if (size != NULL) *size = reader->frame_size;
+
+  return reader->buffer;
+}
+
+const VpxVideoInfo *vpx_video_reader_get_info(VpxVideoReader *reader) {
+  return &(reader->info);  // Storage lives inside the reader object.
+}
+
diff --git a/libs/libvpx/video_reader.h b/libs/libvpx/video_reader.h
new file mode 100644
index 0000000000..a62c6d7109
--- /dev/null
+++ b/libs/libvpx/video_reader.h
@@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VIDEO_READER_H_
+#define VIDEO_READER_H_
+
+#include "./video_common.h"
+
+// The following code is work in progress. It is going to support transparent
+// reading of input files. Right now only IVF format is supported for
+// simplicity. The main goal of the API is to be simple and easy to use in
+// example code and in vpxenc/vpxdec later. All low-level details like memory
+// buffer management are hidden from API users.
+struct VpxVideoReaderStruct;
+typedef struct VpxVideoReaderStruct VpxVideoReader;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opens the input file for reading and inspects it to determine file type.
+// Returns an opaque VpxVideoReader* upon success, or NULL upon failure.
+// Right now only IVF format is supported.
+VpxVideoReader *vpx_video_reader_open(const char *filename);
+
+// Frees all resources associated with VpxVideoReader* returned from
+// vpx_video_reader_open() call.
+void vpx_video_reader_close(VpxVideoReader *reader);
+
+// Reads frame from the file and stores it in internal buffer.
+int vpx_video_reader_read_frame(VpxVideoReader *reader);
+
+// Returns the pointer to memory buffer with frame data read by last call to
+// vpx_video_reader_read_frame().
+const uint8_t *vpx_video_reader_get_frame(VpxVideoReader *reader,
+                                          size_t *size);
+
+// Returns a pointer to the VpxVideoInfo describing the opened video file.
+const VpxVideoInfo *vpx_video_reader_get_info(VpxVideoReader *reader);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VIDEO_READER_H_
diff --git a/libs/libvpx/video_writer.c b/libs/libvpx/video_writer.c
new file mode 100644
index 0000000000..3695236bfa
--- /dev/null
+++ b/libs/libvpx/video_writer.c
@@ -0,0 +1,80 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./ivfenc.h"
+#include "./video_writer.h"
+#include "vpx/vpx_encoder.h"
+
+struct VpxVideoWriterStruct {  // State behind the opaque VpxVideoWriter.
+  VpxVideoInfo info;  // Copy of the caller's stream info, reused on close.
+  FILE *file;  // Output file; closed by vpx_video_writer_close().
+  int frame_count;  // Frames written so far; put into the header on close.
+};
+
+static void write_header(FILE *file, const VpxVideoInfo *info,
+                         int frame_count) {
+  // Only the dimensions and time base of the encoder config are filled in.
+  struct vpx_codec_enc_cfg enc_cfg;
+  enc_cfg.g_timebase.num = info->time_base.numerator;
+  enc_cfg.g_timebase.den = info->time_base.denominator;
+  enc_cfg.g_w = info->frame_width;
+  enc_cfg.g_h = info->frame_height;
+  ivf_write_file_header(file, &enc_cfg, info->codec_fourcc, frame_count);
+}
+
+VpxVideoWriter *vpx_video_writer_open(const char *filename,
+                                      VpxContainer container,
+                                      const VpxVideoInfo *info) {
+  // Opens filename for writing and emits a provisional IVF header (frame
+  // count 0). Returns NULL for other containers or on any failure.
+  if (container == kContainerIVF) {
+    VpxVideoWriter *writer = NULL;
+    FILE *const file = fopen(filename, "wb");
+    if (!file)
+      return NULL;
+
+    writer = malloc(sizeof(*writer));
+    if (!writer) {
+      fclose(file);  // Don't leak the handle when allocation fails
+      return NULL;
+    }
+    writer->frame_count = 0;
+    writer->info = *info;
+    writer->file = file;
+    write_header(writer->file, info, 0);
+    return writer;
+  }
+  return NULL;
+}
+
+void vpx_video_writer_close(VpxVideoWriter *writer) {
+  // NULL is a no-op. Otherwise the header at the start of the file is
+  // written again so that it carries the final frame count.
+  if (!writer) return;
+
+  rewind(writer->file);
+  write_header(writer->file, &writer->info, writer->frame_count);
+  fclose(writer->file);
+  free(writer);
+}
+
+int vpx_video_writer_write_frame(VpxVideoWriter *writer,
+                                 const uint8_t *buffer, size_t size,
+                                 int64_t pts) {
+  // Returns 1 on success, 0 on a short write (frame count then unchanged).
+  size_t written;
+  ivf_write_frame_header(writer->file, pts, size);
+  written = fwrite(buffer, 1, size, writer->file);
+  if (written != size) return 0;
+  ++writer->frame_count;
+  return 1;
+}
diff --git a/libs/libvpx/video_writer.h b/libs/libvpx/video_writer.h
new file mode 100644
index 0000000000..5dbfe52ea0
--- /dev/null
+++ b/libs/libvpx/video_writer.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VIDEO_WRITER_H_
+#define VIDEO_WRITER_H_
+
+#include "./video_common.h"
+
+typedef enum {  // Container formats accepted by vpx_video_writer_open().
+  kContainerIVF  // IVF, the only container supported right now.
+} VpxContainer;
+
+struct VpxVideoWriterStruct;
+typedef struct VpxVideoWriterStruct VpxVideoWriter;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Finds and opens writer for specified container format.
+// Returns an opaque VpxVideoWriter* upon success, or NULL upon failure.
+// Right now only IVF format is supported.
+VpxVideoWriter *vpx_video_writer_open(const char *filename,
+                                      VpxContainer container,
+                                      const VpxVideoInfo *info);
+
+// Frees all resources associated with VpxVideoWriter* returned from
+// vpx_video_writer_open() call.
+void vpx_video_writer_close(VpxVideoWriter *writer);
+
+// Writes frame bytes to the file.
+int vpx_video_writer_write_frame(VpxVideoWriter *writer,
+                                 const uint8_t *buffer, size_t size,
+                                 int64_t pts);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VIDEO_WRITER_H_
diff --git a/libs/libvpx/vp10/common/alloccommon.c b/libs/libvpx/vp10/common/alloccommon.c
new file mode 100644
index 0000000000..9ca86e5e58
--- /dev/null
+++ b/libs/libvpx/vp10/common/alloccommon.c
@@ -0,0 +1,164 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp10/common/alloccommon.h"
+#include "vp10/common/blockd.h"
+#include "vp10/common/entropymode.h"
+#include "vp10/common/entropymv.h"
+#include "vp10/common/onyxc_int.h"
+
+void vp10_set_mb_mi(VP10_COMMON *cm, int width, int height) {
+  // Recomputes cm's mi_* and mb_* dimensions from the frame width/height.
+  const int w_aligned = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
+  const int h_aligned = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+
+  cm->mi_rows = h_aligned >> MI_SIZE_LOG2;
+  cm->mi_cols = w_aligned >> MI_SIZE_LOG2;
+  cm->mb_rows = (cm->mi_rows + 1) >> 1;
+  cm->mb_cols = (cm->mi_cols + 1) >> 1;
+  cm->mi_stride = calc_mi_size(cm->mi_cols);
+  cm->MBs = cm->mb_rows * cm->mb_cols;
+}
+
+static int alloc_seg_map(VP10_COMMON *cm, int seg_map_size) {
+  // Allocates every zero-filled ping-pong segmentation map and resets the
+  // current/previous indices. Returns 0 on success, 1 if an allocation
+  // failed (maps allocated before the failure are left for the caller).
+  int idx = 0;
+
+  while (idx < NUM_PING_PONG_BUFFERS) {
+    uint8_t *const map = (uint8_t *)vpx_calloc(seg_map_size, 1);
+    cm->seg_map_array[idx++] = map;
+    if (!map) return 1;
+  }
+
+  cm->seg_map_alloc_size = seg_map_size;
+  cm->seg_map_idx = 0;
+  cm->prev_seg_map_idx = 1;
+  cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+  if (!cm->frame_parallel_decode)
+    cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
+  return 0;
+}
+
+static void free_seg_map(VP10_COMMON *cm) {
+  // Frees every segmentation map and clears the pointers into them.
+  int i;
+  for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+    vpx_free(cm->seg_map_array[i]);
+    cm->seg_map_array[i] = NULL;
+  }
+  // Forget the freed capacity as well; otherwise the size check in
+  // vp10_alloc_context_buffers() could skip reallocating the maps.
+  cm->seg_map_alloc_size = 0;
+  cm->current_frame_seg_map = NULL;
+  if (!cm->frame_parallel_decode)
+    cm->last_frame_seg_map = NULL;
+}
+
+void vp10_free_ref_frame_buffers(BufferPool *pool) {
+  // Per slot: release a still-referenced raw buffer, then free mvs and buf.
+  int b;
+  for (b = 0; b < FRAME_BUFFERS; ++b) {
+    if (pool->frame_bufs[b].ref_count > 0 &&
+        pool->frame_bufs[b].raw_frame_buffer.data != NULL) {
+      pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[b].raw_frame_buffer);
+      pool->frame_bufs[b].ref_count = 0;
+    }
+    vpx_free(pool->frame_bufs[b].mvs);
+    pool->frame_bufs[b].mvs = NULL;
+    vpx_free_frame_buffer(&pool->frame_bufs[b].buf);
+  }
+}
+
+void vp10_free_postproc_buffers(VP10_COMMON *cm) {
+#if CONFIG_VP9_POSTPROC  // Both frees are compiled in only with this flag.
+  vpx_free_frame_buffer(&cm->post_proc_buffer);
+  vpx_free_frame_buffer(&cm->post_proc_buffer_int);
+#else
+  (void)cm;  // Nothing to free; the cast marks cm as intentionally unused.
+#endif
+}
+
+void vp10_free_context_buffers(VP10_COMMON *cm) {
+  cm->free_mi(cm);  // mi storage is released through the callback held in cm.
+  free_seg_map(cm);
+  vpx_free(cm->above_context);
+  cm->above_context = NULL;  // Each pointer is nulled right after its free.
+  vpx_free(cm->above_seg_context);
+  cm->above_seg_context = NULL;  // NOTE(review): alloc_cols is not reset here.
+}
+
+int vp10_alloc_context_buffers(VP10_COMMON *cm, int width, int height) {
+  // Grows the mi, seg-map and above-context buffers that are too small for
+  // width x height. Returns 0, or 1 after freeing all of them on failure.
+  int new_mi_size;
+  vp10_set_mb_mi(cm, width, height);
+  new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+  if (cm->mi_alloc_size < new_mi_size) {
+    cm->free_mi(cm);
+    if (cm->alloc_mi(cm, new_mi_size)) goto fail;
+  }
+
+  if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) {
+    // Create the segmentation map structure and set to 0.
+    free_seg_map(cm);
+    if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail;
+  }
+
+  if (cm->above_context_alloc_cols < cm->mi_cols) {
+    vpx_free(cm->above_context);
+    cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
+        2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
+        sizeof(*cm->above_context));
+    if (!cm->above_context) goto fail;
+    vpx_free(cm->above_seg_context);
+    cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
+        mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context));
+    if (!cm->above_seg_context) goto fail;
+    cm->above_context_alloc_cols = cm->mi_cols;
+  }
+  return 0;
+
+ fail:
+  vp10_free_context_buffers(cm);
+  // Zero the recorded sizes so a later call cannot skip reallocation.
+  cm->seg_map_alloc_size = 0;
+  cm->above_context_alloc_cols = 0;
+  return 1;
+}
+
+void vp10_remove_common(VP10_COMMON *cm) {
+  vp10_free_context_buffers(cm);
+  // Then release cm->fc and cm->frame_contexts, nulling both pointers.
+  vpx_free(cm->fc);
+  cm->fc = NULL;
+  vpx_free(cm->frame_contexts);
+  cm->frame_contexts = NULL;
+}
+
+void vp10_init_context_buffers(VP10_COMMON *cm) {
+  cm->setup_mi(cm);  // Next: zero last_frame_seg_map unless frame-parallel.
+  if (!cm->frame_parallel_decode && cm->last_frame_seg_map)
+    memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
+}
+
+void vp10_swap_current_and_last_seg_map(VP10_COMMON *cm) {
+  // Exchange the two ping-pong indices, then re-derive both map pointers.
+  const int old_current = cm->seg_map_idx;
+  const int old_previous = cm->prev_seg_map_idx;
+  cm->seg_map_idx = old_previous;
+  cm->prev_seg_map_idx = old_current;
+  cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+  cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
+}
diff --git a/libs/libvpx/vp10/common/alloccommon.h b/libs/libvpx/vp10/common/alloccommon.h
new file mode 100644
index 0000000000..5cfe6602d3
--- /dev/null
+++ b/libs/libvpx/vp10/common/alloccommon.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_COMMON_ALLOCCOMMON_H_
+#define VP10_COMMON_ALLOCCOMMON_H_
+
+#define INVALID_IDX -1  // Invalid buffer index.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP10Common;
+struct BufferPool;
+
+void vp10_remove_common(struct VP10Common *cm);
+
+int vp10_alloc_context_buffers(struct VP10Common *cm, int width, int height);
+void vp10_init_context_buffers(struct VP10Common *cm);
+void vp10_free_context_buffers(struct VP10Common *cm);
+
+void vp10_free_ref_frame_buffers(struct BufferPool *pool);
+void vp10_free_postproc_buffers(struct VP10Common *cm);
+
+int vp10_alloc_state_buffers(struct VP10Common *cm, int width, int height);
+void vp10_free_state_buffers(struct VP10Common *cm);
+
+void vp10_set_mb_mi(struct VP10Common *cm, int width, int height);
+
+void vp10_swap_current_and_last_seg_map(struct VP10Common *cm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_ALLOCCOMMON_H_
diff --git a/libs/libvpx/vp10/common/arm/neon/iht4x4_add_neon.c b/libs/libvpx/vp10/common/arm/neon/iht4x4_add_neon.c
new file mode 100644
index 0000000000..bd3e8b30f4
--- /dev/null
+++ b/libs/libvpx/vp10/common/arm/neon/iht4x4_add_neon.c
@@ -0,0 +1,248 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "vp10/common/common.h"
+
+static int16_t sinpi_1_9 = 0x14a3;  // Fixed-point multipliers for the 1-D
+static int16_t sinpi_2_9 = 0x26c9;  // transforms in this file; products are
+static int16_t sinpi_3_9 = 0x3441;  // narrowed with a rounding shift by 14.
+static int16_t sinpi_4_9 = 0x3b6c;
+static int16_t cospi_8_64 = 0x3b21;   // 15137
+static int16_t cospi_16_64 = 0x2d41;  // 11585
+static int16_t cospi_24_64 = 0x187e;  // 6270
+
+static INLINE void TRANSPOSE4X4(  // In-place transpose of a 4x4 int16 block.
+        int16x8_t *q8s16,  // in: rows 0-1, out: columns 0-1
+        int16x8_t *q9s16) {  // in: rows 2-3, out: columns 2-3
+    int32x4_t q8s32, q9s32;
+    int16x4x2_t d0x2s16, d1x2s16;
+    int32x4x2_t q0x2s32;
+    // Step 1: interleave 16-bit lanes of the two rows held in each register.
+    d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
+    d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
+    // Step 2: treat lane pairs as 32-bit elements and interleave those.
+    q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
+    q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
+    q0x2s32 = vtrnq_s32(q8s32, q9s32);
+
+    *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
+    *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
+    return;
+}
+
+static INLINE void GENERATE_COSINE_CONSTANTS(  // Broadcast to all 4 lanes:
+        int16x4_t *d0s16,  // out: cospi_8_64
+        int16x4_t *d1s16,  // out: cospi_16_64
+        int16x4_t *d2s16) {  // out: cospi_24_64
+    *d0s16 = vdup_n_s16(cospi_8_64);
+    *d1s16 = vdup_n_s16(cospi_16_64);
+    *d2s16 = vdup_n_s16(cospi_24_64);
+    return;
+}
+
+static INLINE void GENERATE_SINE_CONSTANTS(  // Broadcast to every lane:
+        int16x4_t *d3s16,  // out: sinpi_1_9
+        int16x4_t *d4s16,  // out: sinpi_2_9
+        int16x4_t *d5s16,  // out: sinpi_4_9
+        int16x8_t *q3s16) {  // out: sinpi_3_9 (8 lanes)
+    *d3s16 = vdup_n_s16(sinpi_1_9);
+    *d4s16 = vdup_n_s16(sinpi_2_9);
+    *q3s16 = vdupq_n_s16(sinpi_3_9);
+    *d5s16 = vdup_n_s16(sinpi_4_9);
+    return;
+}
+
+static INLINE void IDCT4x4_1D(  // One 1-D pass over q8/q9, updated in place.
+        int16x4_t *d0s16,  // in: constant; callers pass cospi_8_64
+        int16x4_t *d1s16,  // in: constant; callers pass cospi_16_64
+        int16x4_t *d2s16,  // in: constant; callers pass cospi_24_64
+        int16x8_t *q8s16,  // in/out: first two 4-lane vectors
+        int16x8_t *q9s16) {  // in/out: last two 4-lane vectors
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
+    int16x4_t d26s16, d27s16, d28s16, d29s16;
+    int32x4_t q10s32, q13s32, q14s32, q15s32;
+    int16x8_t q13s16, q14s16;
+    // Split the two inputs into their four 4-lane halves d16..d19.
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+
+    d23s16 = vadd_s16(d16s16, d18s16);
+    d24s16 = vsub_s16(d16s16, d18s16);
+    // Widening multiplies; q15 and q10 are completed by vmlsl/vmlal below.
+    q15s32 = vmull_s16(d17s16, *d2s16);
+    q10s32 = vmull_s16(d17s16, *d0s16);
+    q13s32 = vmull_s16(d23s16, *d1s16);  // (d16 + d18) * d1
+    q14s32 = vmull_s16(d24s16, *d1s16);  // (d16 - d18) * d1
+    q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);  // d17 * d2 - d19 * d0
+    q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);  // d17 * d0 + d19 * d2
+    // Saturating, rounding narrow by 14 bits back to int16.
+    d26s16 = vqrshrn_n_s32(q13s32, 14);
+    d27s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d28s16 = vqrshrn_n_s32(q10s32, 14);
+    // Final add/subtract; the halves of the difference are then swapped.
+    q13s16 = vcombine_s16(d26s16, d27s16);
+    q14s16 = vcombine_s16(d28s16, d29s16);
+    *q8s16 = vaddq_s16(q13s16, q14s16);
+    *q9s16 = vsubq_s16(q13s16, q14s16);
+    *q9s16 = vcombine_s16(vget_high_s16(*q9s16),
+                          vget_low_s16(*q9s16));  // vswp
+    return;
+}
+
+static INLINE void IADST4x4_1D(  // One 1-D pass over q8/q9, updated in place.
+        int16x4_t *d3s16,  // in: constant; callers pass sinpi_1_9
+        int16x4_t *d4s16,  // in: constant; callers pass sinpi_2_9
+        int16x4_t *d5s16,  // in: constant; callers pass sinpi_4_9
+        int16x8_t *q3s16,  // in: constant; callers pass sinpi_3_9
+        int16x8_t *q8s16,  // in/out: first two 4-lane vectors
+        int16x8_t *q9s16) {  // in/out: last two 4-lane vectors
+    int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
+    int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+    d6s16 = vget_low_s16(*q3s16);  // 4-lane copy of the q3 constant
+    // Split the two inputs into their four 4-lane halves d16..d19.
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    // Widening 16x16 -> 32-bit partial products.
+    q10s32 = vmull_s16(*d3s16, d16s16);
+    q11s32 = vmull_s16(*d4s16, d16s16);
+    q12s32 = vmull_s16(d6s16, d17s16);
+    q13s32 = vmull_s16(*d5s16, d18s16);
+    q14s32 = vmull_s16(*d3s16, d18s16);
+    q15s32 = vmovl_s16(d16s16);
+    q15s32 = vaddw_s16(q15s32, d19s16);
+    q8s32  = vmull_s16(*d4s16, d19s16);
+    q15s32 = vsubw_s16(q15s32, d18s16);  // q15 = d16 + d19 - d18 (32-bit)
+    q9s32  = vmull_s16(*d5s16, d19s16);
+
+    q10s32 = vaddq_s32(q10s32, q13s32);
+    q10s32 = vaddq_s32(q10s32, q8s32);  // d3*d16 + d5*d18 + d4*d19
+    q11s32 = vsubq_s32(q11s32, q14s32);
+    q8s32  = vdupq_n_s32(sinpi_3_9);  // q8s32 is reused as a constant here
+    q11s32 = vsubq_s32(q11s32, q9s32);  // d4*d16 - d3*d18 - d5*d19
+    q15s32 = vmulq_s32(q15s32, q8s32);  // (d16 + d19 - d18) * sinpi_3_9
+    // Combine the sums into the four outputs q13, q14, q15, q10.
+    q13s32 = vaddq_s32(q10s32, q12s32);
+    q10s32 = vaddq_s32(q10s32, q11s32);
+    q14s32 = vaddq_s32(q11s32, q12s32);
+    q10s32 = vsubq_s32(q10s32, q12s32);
+    // Saturating, rounding narrow by 14 bits back to int16.
+    d16s16 = vqrshrn_n_s32(q13s32, 14);
+    d17s16 = vqrshrn_n_s32(q14s32, 14);
+    d18s16 = vqrshrn_n_s32(q15s32, 14);
+    d19s16 = vqrshrn_n_s32(q10s32, 14);
+
+    *q8s16 = vcombine_s16(d16s16, d17s16);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+    return;
+}
+
+void vp10_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
+                            int dest_stride, int tx_type) {
+    uint8x8_t d26u8, d27u8;
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
+    uint32x2_t d26u32, d27u32;
+    int16x8_t q3s16, q8s16, q9s16;
+    uint16x8_t q8u16, q9u16;
+    // Applies the pair of 1-D inverse transforms selected by tx_type to the 16
+    // coefficients at input and adds the result to the 4x4 bytes at dest.
+    d26u32 = d27u32 = vdup_n_u32(0);
+
+    q8s16 = vld1q_s16(input);
+    q9s16 = vld1q_s16(input + 8);
+
+    TRANSPOSE4X4(&q8s16, &q9s16);
+
+    switch (tx_type) {
+      case 0:  // idct_idct is not supported. Fall back to C
+        vp10_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
+        return;
+      case 1:  // iadst_idct
+        // generate constants
+        GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+        break;
+      case 2:  // idct_iadst
+        // generate constants
+        GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+        break;
+      case 3:  // iadst_iadst
+        // generate constants
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+        break;
+      default:  // unsupported tx_type
+        assert(0);
+        break;
+    }
+    // Rounding shift right by 4 of the transform output.
+    q8s16 = vrshrq_n_s16(q8s16, 4);
+    q9s16 = vrshrq_n_s16(q9s16, 4);
+    // Gather the four 4-byte destination rows, dest_stride bytes apart.
+    d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
+    dest += dest_stride;
+    d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
+    dest += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
+    dest += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
+    // Add the residual to the widened destination bytes.
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+    // Saturate the sums back to unsigned 8 bits.
+    d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    // dest still points at the last row, so store the rows in reverse order.
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+    return;
+}
diff --git a/libs/libvpx/vp10/common/arm/neon/iht8x8_add_neon.c b/libs/libvpx/vp10/common/arm/neon/iht8x8_add_neon.c
new file mode 100644
index 0000000000..82d7ccc612
--- /dev/null
+++ b/libs/libvpx/vp10/common/arm/neon/iht8x8_add_neon.c
@@ -0,0 +1,624 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "vp10/common/common.h"
+
+static const int16_t cospi_2_64 = 16305;   // cospi_N_64 is
+static const int16_t cospi_4_64 = 16069;   // round(16384 * cos(N * pi / 64)),
+static const int16_t cospi_6_64 = 15679;   // i.e. a cosine in Q14 fixed point;
+static const int16_t cospi_8_64 = 15137;   // products with these values are
+static const int16_t cospi_10_64 = 14449;  // brought back to int16 with a
+static const int16_t cospi_12_64 = 13623;  // rounding shift by 14 bits.
+static const int16_t cospi_14_64 = 12665;  // The values are only ever read in
+static const int16_t cospi_16_64 = 11585;  // this file, so they are declared
+static const int16_t cospi_18_64 = 10394;  // const.
+static const int16_t cospi_20_64 = 9102;
+static const int16_t cospi_22_64 = 7723;
+static const int16_t cospi_24_64 = 6270;
+static const int16_t cospi_26_64 = 4756;
+static const int16_t cospi_28_64 = 3196;
+static const int16_t cospi_30_64 = 1606;
+
+static INLINE void TRANSPOSE8X8(  // in-place transpose of an 8x8 int16 block
+        int16x8_t *q8s16,   // in/out: the eight 8-lane vectors of the block
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+    d16s16 = vget_low_s16(*q8s16);  // split every vector into 4-lane halves
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
+    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
+    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+    *q12s16 = vcombine_s16(d17s16, d25s16);  // high halves of old q8 and q12
+    *q13s16 = vcombine_s16(d19s16, d27s16);  // high halves of old q9 and q13
+    *q14s16 = vcombine_s16(d21s16, d29s16);  // high halves of old q10 and q14
+    *q15s16 = vcombine_s16(d23s16, d31s16);  // high halves of old q11 and q15
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),  // 32-bit lane transpose
+                        vreinterpretq_s32_s16(*q10s16));
+    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+                        vreinterpretq_s32_s16(*q11s16));
+    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+                        vreinterpretq_s32_s16(*q14s16));
+    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+                        vreinterpretq_s32_s16(*q15s16));
+
+    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+    *q8s16  = q0x2s16.val[0];  // write the transposed vectors back in order
+    *q9s16  = q0x2s16.val[1];
+    *q10s16 = q1x2s16.val[0];
+    *q11s16 = q1x2s16.val[1];
+    *q12s16 = q2x2s16.val[0];
+    *q13s16 = q2x2s16.val[1];
+    *q14s16 = q3x2s16.val[0];
+    *q15s16 = q3x2s16.val[1];
+    return;
+}
+
+static INLINE void IDCT8x8_1D(  // one in-place 1-D 8-point inverse DCT pass
+        int16x8_t *q8s16,   // in/out: q8..q15 are the 8 points; all work is
+        int16x8_t *q9s16,   // lane-wise, so each lane is its own transform
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+    d0s16 = vdup_n_s16(cospi_28_64);  // constants used with q9, q11, q13, q15
+    d1s16 = vdup_n_s16(cospi_4_64);
+    d2s16 = vdup_n_s16(cospi_12_64);
+    d3s16 = vdup_n_s16(cospi_20_64);
+
+    d16s16 = vget_low_s16(*q8s16);  // split the inputs into 4-lane halves
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    q2s32 = vmull_s16(d18s16, d0s16);  // q9 * cospi_28 ...
+    q3s32 = vmull_s16(d19s16, d0s16);
+    q5s32 = vmull_s16(d26s16, d2s16);  // q13 * cospi_12 ...
+    q6s32 = vmull_s16(d27s16, d2s16);
+
+    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);  // ... - q15 * cospi_4
+    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+    q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);  // ... - q11 * cospi_20
+    q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+    d8s16  = vqrshrn_n_s32(q2s32, 14);  // round, >> 14, saturate to int16
+    d9s16  = vqrshrn_n_s32(q3s32, 14);
+    d10s16 = vqrshrn_n_s32(q5s32, 14);
+    d11s16 = vqrshrn_n_s32(q6s32, 14);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    q2s32 = vmull_s16(d18s16, d1s16);  // q9 * cospi_4 ...
+    q3s32 = vmull_s16(d19s16, d1s16);
+    q9s32 = vmull_s16(d26s16, d3s16);  // q13 * cospi_20 ...
+    q13s32 = vmull_s16(d27s16, d3s16);
+
+    q2s32 = vmlal_s16(q2s32, d30s16, d0s16);  // ... + q15 * cospi_28
+    q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);  // ... + q11 * cospi_12
+    q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+    d14s16 = vqrshrn_n_s32(q2s32, 14);
+    d15s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q13s32, 14);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+
+    d0s16 = vdup_n_s16(cospi_16_64);  // d0 is reused for a new constant
+
+    q2s32 = vmull_s16(d16s16, d0s16);  // (q8 + q12) * cospi_16
+    q3s32 = vmull_s16(d17s16, d0s16);
+    q13s32 = vmull_s16(d16s16, d0s16);  // (q8 - q12) * cospi_16
+    q15s32 = vmull_s16(d17s16, d0s16);
+
+    q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+    q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+    q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+    d0s16 = vdup_n_s16(cospi_24_64);  // constants used with q10 and q14
+    d1s16 = vdup_n_s16(cospi_8_64);
+
+    d18s16 = vqrshrn_n_s32(q2s32, 14);
+    d19s16 = vqrshrn_n_s32(q3s32, 14);
+    d22s16 = vqrshrn_n_s32(q13s32, 14);
+    d23s16 = vqrshrn_n_s32(q15s32, 14);
+    *q9s16  = vcombine_s16(d18s16, d19s16);  // *q9/*q11 now hold intermediates
+    *q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q2s32 = vmull_s16(d20s16, d0s16);  // q10 * cospi_24 - q14 * cospi_8
+    q3s32 = vmull_s16(d21s16, d0s16);
+    q8s32 = vmull_s16(d20s16, d1s16);  // q10 * cospi_8 + q14 * cospi_24
+    q12s32 = vmull_s16(d21s16, d1s16);
+
+    q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+    q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+    d26s16 = vqrshrn_n_s32(q2s32, 14);
+    d27s16 = vqrshrn_n_s32(q3s32, 14);
+    d30s16 = vqrshrn_n_s32(q8s32, 14);
+    d31s16 = vqrshrn_n_s32(q12s32, 14);
+    *q13s16 = vcombine_s16(d26s16, d27s16);
+    *q15s16 = vcombine_s16(d30s16, d31s16);
+
+    q0s16 = vaddq_s16(*q9s16, *q15s16);  // butterflies on the four terms above
+    q1s16 = vaddq_s16(*q11s16, *q13s16);
+    q2s16 = vsubq_s16(*q11s16, *q13s16);
+    q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+    *q13s16 = vsubq_s16(q4s16, q5s16);  // butterflies on the q4..q7 terms
+    q4s16   = vaddq_s16(q4s16, q5s16);
+    *q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16   = vaddq_s16(q7s16, q6s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+
+    d16s16 = vdup_n_s16(cospi_16_64);
+
+    q9s32  = vmull_s16(d28s16, d16s16);  // (*q14 - *q13) * cospi_16
+    q10s32 = vmull_s16(d29s16, d16s16);
+    q11s32 = vmull_s16(d28s16, d16s16);  // (*q14 + *q13) * cospi_16
+    q12s32 = vmull_s16(d29s16, d16s16);
+
+    q9s32  = vmlsl_s16(q9s32,  d26s16, d16s16);
+    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+    d10s16 = vqrshrn_n_s32(q9s32, 14);
+    d11s16 = vqrshrn_n_s32(q10s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q12s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    *q8s16  = vaddq_s16(q0s16, q7s16);  // outputs k and 7 - k are the sum and
+    *q9s16  = vaddq_s16(q1s16, q6s16);  // difference of the same two terms
+    *q10s16 = vaddq_s16(q2s16, q5s16);
+    *q11s16 = vaddq_s16(q3s16, q4s16);
+    *q12s16 = vsubq_s16(q3s16, q4s16);
+    *q13s16 = vsubq_s16(q2s16, q5s16);
+    *q14s16 = vsubq_s16(q1s16, q6s16);
+    *q15s16 = vsubq_s16(q0s16, q7s16);
+    return;
+}
+
+static INLINE void IADST8X8_1D(  // one in-place 1-D 8-point inverse ADST pass
+        int16x8_t *q8s16,   // in/out: q8..q15 are the 8 points; all work is
+        int16x8_t *q9s16,   // lane-wise, so each lane is its own transform
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q2s16, q4s16, q5s16, q6s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
+    int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+    d16s16 = vget_low_s16(*q8s16);  // split the inputs into 4-lane halves
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    d14s16 = vdup_n_s16(cospi_2_64);  // constants used with q15 and q8
+    d15s16 = vdup_n_s16(cospi_30_64);
+
+    q1s32 = vmull_s16(d30s16, d14s16);  // q15 * cospi_2 + q8 * cospi_30
+    q2s32 = vmull_s16(d31s16, d14s16);
+    q3s32 = vmull_s16(d30s16, d15s16);  // q15 * cospi_30 - q8 * cospi_2
+    q4s32 = vmull_s16(d31s16, d15s16);
+
+    d30s16 = vdup_n_s16(cospi_18_64);  // q15 halves are consumed: d30/d31 now
+    d31s16 = vdup_n_s16(cospi_14_64);  // carry constants for q11 and q12
+
+    q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
+    q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
+    q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
+    q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
+
+    q5s32 = vmull_s16(d22s16, d30s16);  // q11 * cospi_18 + q12 * cospi_14
+    q6s32 = vmull_s16(d23s16, d30s16);
+    q7s32 = vmull_s16(d22s16, d31s16);  // q11 * cospi_14 - q12 * cospi_18
+    q8s32 = vmull_s16(d23s16, d31s16);
+
+    q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
+    q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
+    q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
+    q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
+
+    q11s32 = vaddq_s32(q1s32, q5s32);  // butterflies on the 32-bit products,
+    q12s32 = vaddq_s32(q2s32, q6s32);  // before any rounding takes place
+    q1s32 = vsubq_s32(q1s32, q5s32);
+    q2s32 = vsubq_s32(q2s32, q6s32);
+
+    d22s16 = vqrshrn_n_s32(q11s32, 14);  // round, >> 14, saturate to int16
+    d23s16 = vqrshrn_n_s32(q12s32, 14);
+    *q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q12s32 = vaddq_s32(q3s32, q7s32);
+    q15s32 = vaddq_s32(q4s32, q8s32);
+    q3s32 = vsubq_s32(q3s32, q7s32);
+    q4s32 = vsubq_s32(q4s32, q8s32);
+
+    d2s16  = vqrshrn_n_s32(q1s32, 14);
+    d3s16  = vqrshrn_n_s32(q2s32, 14);
+    d24s16 = vqrshrn_n_s32(q12s32, 14);
+    d25s16 = vqrshrn_n_s32(q15s32, 14);
+    d6s16  = vqrshrn_n_s32(q3s32, 14);
+    d7s16  = vqrshrn_n_s32(q4s32, 14);
+    *q12s16 = vcombine_s16(d24s16, d25s16);
+
+    d0s16 = vdup_n_s16(cospi_10_64);  // constants used with q13 and q10
+    d1s16 = vdup_n_s16(cospi_22_64);
+    q4s32 = vmull_s16(d26s16, d0s16);  // q13 * cospi_10 + q10 * cospi_22
+    q5s32 = vmull_s16(d27s16, d0s16);
+    q2s32 = vmull_s16(d26s16, d1s16);  // q13 * cospi_22 - q10 * cospi_10
+    q6s32 = vmull_s16(d27s16, d1s16);
+
+    d30s16 = vdup_n_s16(cospi_26_64);  // constants used with q9 and q14
+    d31s16 = vdup_n_s16(cospi_6_64);
+
+    q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
+    q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
+    q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
+    q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
+
+    q0s32 = vmull_s16(d18s16, d30s16);  // q9 * cospi_26 + q14 * cospi_6
+    q13s32 = vmull_s16(d19s16, d30s16);
+
+    q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
+    q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
+
+    q10s32 = vmull_s16(d18s16, d31s16);  // q9 * cospi_6 - q14 * cospi_26
+    q9s32 = vmull_s16(d19s16, d31s16);
+
+    q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
+    q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
+
+    q14s32 = vaddq_s32(q2s32, q10s32);
+    q15s32 = vaddq_s32(q6s32, q9s32);
+    q2s32 = vsubq_s32(q2s32, q10s32);
+    q6s32 = vsubq_s32(q6s32, q9s32);
+
+    d28s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d4s16 = vqrshrn_n_s32(q2s32, 14);
+    d5s16 = vqrshrn_n_s32(q6s32, 14);
+    *q14s16 = vcombine_s16(d28s16, d29s16);
+
+    q9s32 = vaddq_s32(q4s32, q0s32);
+    q10s32 = vaddq_s32(q5s32, q13s32);
+    q4s32 = vsubq_s32(q4s32, q0s32);
+    q5s32 = vsubq_s32(q5s32, q13s32);
+
+    d30s16 = vdup_n_s16(cospi_8_64);  // constants for the next rotation stage
+    d31s16 = vdup_n_s16(cospi_24_64);
+
+    d18s16 = vqrshrn_n_s32(q9s32, 14);
+    d19s16 = vqrshrn_n_s32(q10s32, 14);
+    d8s16 = vqrshrn_n_s32(q4s32, 14);
+    d9s16 = vqrshrn_n_s32(q5s32, 14);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+
+    q5s32 = vmull_s16(d2s16, d30s16);  // rotate the rounded differences
+    q6s32 = vmull_s16(d3s16, d30s16);  // (d2..d9) by cospi_8 / cospi_24
+    q7s32 = vmull_s16(d2s16, d31s16);
+    q0s32 = vmull_s16(d3s16, d31s16);
+
+    q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
+    q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
+    q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
+    q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
+
+    q1s32 = vmull_s16(d4s16, d30s16);
+    q3s32 = vmull_s16(d5s16, d30s16);
+    q10s32 = vmull_s16(d4s16, d31s16);
+    q2s32 = vmull_s16(d5s16, d31s16);
+
+    q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
+    q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
+    q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
+    q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
+
+    *q8s16 = vaddq_s16(*q11s16, *q9s16);  // *q8 receives its final value here
+    *q11s16 = vsubq_s16(*q11s16, *q9s16);
+    q4s16 = vaddq_s16(*q12s16, *q14s16);  // negated into *q15 at the end
+    *q12s16 = vsubq_s16(*q12s16, *q14s16);
+
+    q14s32 = vaddq_s32(q5s32, q1s32);
+    q15s32 = vaddq_s32(q6s32, q3s32);
+    q5s32 = vsubq_s32(q5s32, q1s32);
+    q6s32 = vsubq_s32(q6s32, q3s32);
+
+    d18s16 = vqrshrn_n_s32(q14s32, 14);
+    d19s16 = vqrshrn_n_s32(q15s32, 14);
+    d10s16 = vqrshrn_n_s32(q5s32, 14);
+    d11s16 = vqrshrn_n_s32(q6s32, 14);
+    *q9s16 = vcombine_s16(d18s16, d19s16);  // negated at the end
+
+    q1s32 = vaddq_s32(q7s32, q10s32);
+    q3s32 = vaddq_s32(q0s32, q2s32);
+    q7s32 = vsubq_s32(q7s32, q10s32);
+    q0s32 = vsubq_s32(q0s32, q2s32);
+
+    d28s16 = vqrshrn_n_s32(q1s32, 14);
+    d29s16 = vqrshrn_n_s32(q3s32, 14);
+    d14s16 = vqrshrn_n_s32(q7s32, 14);
+    d15s16 = vqrshrn_n_s32(q0s32, 14);
+    *q14s16 = vcombine_s16(d28s16, d29s16);  // *q14 receives its final value
+
+    d30s16 = vdup_n_s16(cospi_16_64);  // last stage: sums and differences
+
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    q2s32 = vmull_s16(d22s16, d30s16);  // (*q11 + *q12) * cospi_16
+    q3s32 = vmull_s16(d23s16, d30s16);
+    q13s32 = vmull_s16(d22s16, d30s16);  // (*q11 - *q12) * cospi_16
+    q1s32 = vmull_s16(d23s16, d30s16);
+
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
+    q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
+    q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
+    q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
+
+    d4s16 = vqrshrn_n_s32(q2s32, 14);
+    d5s16 = vqrshrn_n_s32(q3s32, 14);
+    d24s16 = vqrshrn_n_s32(q13s32, 14);
+    d25s16 = vqrshrn_n_s32(q1s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);  // negated into *q11 at the end
+    *q12s16 = vcombine_s16(d24s16, d25s16);  // *q12 receives its final value
+
+    q13s32 = vmull_s16(d10s16, d30s16);  // (d10/d11 + d14/d15) * cospi_16
+    q1s32 = vmull_s16(d11s16, d30s16);
+    q11s32 = vmull_s16(d10s16, d30s16);  // (d10/d11 - d14/d15) * cospi_16
+    q0s32 = vmull_s16(d11s16, d30s16);
+
+    q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
+    q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
+    q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
+    q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
+
+    d20s16 = vqrshrn_n_s32(q13s32, 14);
+    d21s16 = vqrshrn_n_s32(q1s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q0s32, 14);
+    *q10s16 = vcombine_s16(d20s16, d21s16);  // *q10 receives its final value
+    q6s16 = vcombine_s16(d12s16, d13s16);  // negated into *q13 at the end
+
+    q5s16 = vdupq_n_s16(0);  // zero vector: 0 - x below negates x
+
+    *q9s16  = vsubq_s16(q5s16, *q9s16);  // the four odd-numbered outputs are
+    *q11s16 = vsubq_s16(q5s16, q2s16);   // stored negated
+    *q13s16 = vsubq_s16(q5s16, q6s16);
+    *q15s16 = vsubq_s16(q5s16, q4s16);
+    return;
+}
+
+void vp10_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
+                            int dest_stride, int tx_type) {
+    int i;  // Purpose: inverse 8x8 hybrid transform (tx_type 1..3) of input,
+    uint8_t *d1, *d2;  // rounded by 5 bits and added to the pixels at dest.
+    uint8x8_t d0u8, d1u8, d2u8, d3u8;
+    uint64x1_t d0u64, d1u64, d2u64, d3u64;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+    q8s16  = vld1q_s16(input);  // eight coefficients per vector, 64 in total
+    q9s16  = vld1q_s16(input + 8);
+    q10s16 = vld1q_s16(input + 8 * 2);
+    q11s16 = vld1q_s16(input + 8 * 3);
+    q12s16 = vld1q_s16(input + 8 * 4);
+    q13s16 = vld1q_s16(input + 8 * 5);
+    q14s16 = vld1q_s16(input + 8 * 6);
+    q15s16 = vld1q_s16(input + 8 * 7);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);  // unused if tx_type is 0
+
+    switch (tx_type) {
+      case 0:  // idct_idct is not supported. Fall back to C
+        vp10_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
+        return;
+        break;  // not reached
+      case 1:  // iadst_idct
+        // The IDCT constants are loaded inside IDCT8x8_1D itself.
+        // GENERATE_IDCT_CONSTANTS
+
+        // first transform rows
+        IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                   &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // The IADST constants are loaded inside IADST8X8_1D itself.
+        // GENERATE_IADST_CONSTANTS
+
+        // then transform columns
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      case 2:  // idct_iadst
+        // The IADST constants are loaded inside IADST8X8_1D itself.
+        // GENERATE_IADST_CONSTANTS
+
+        // first transform rows
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // The IDCT constants are loaded inside IDCT8x8_1D itself.
+        // GENERATE_IDCT_CONSTANTS
+
+        // then transform columns
+        IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                   &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      case 3:  // iadst_iadst
+        // The IADST constants are loaded inside IADST8X8_1D itself.
+        // GENERATE_IADST_CONSTANTS
+
+        // first transform rows
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // then transform columns
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      default:  // invalid tx_type
+        assert(0);
+        break;
+    }
+
+    q8s16 = vrshrq_n_s16(q8s16, 5);  // final rounding shift of every result
+    q9s16 = vrshrq_n_s16(q9s16, 5);
+    q10s16 = vrshrq_n_s16(q10s16, 5);
+    q11s16 = vrshrq_n_s16(q11s16, 5);
+    q12s16 = vrshrq_n_s16(q12s16, 5);
+    q13s16 = vrshrq_n_s16(q13s16, 5);
+    q14s16 = vrshrq_n_s16(q14s16, 5);
+    q15s16 = vrshrq_n_s16(q15s16, 5);
+
+    for (d1 = d2 = dest, i = 0; i < 2; i++) {  // two passes of four rows each
+        if (i != 0) {  // second pass: work on the results held in q12..q15
+            q8s16 = q12s16;
+            q9s16 = q13s16;
+            q10s16 = q14s16;
+            q11s16 = q15s16;
+        }
+
+        d0u64 = vld1_u64((uint64_t *)d1);  // d1 reads 8 pixels per dest row
+        d1 += dest_stride;
+        d1u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d2u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d3u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+
+        q8u16  = vaddw_u8(vreinterpretq_u16_s16(q8s16),  // result + pixels
+                          vreinterpret_u8_u64(d0u64));
+        q9u16  = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                          vreinterpret_u8_u64(d1u64));
+        q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                          vreinterpret_u8_u64(d2u64));
+        q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                          vreinterpret_u8_u64(d3u64));
+
+        d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));  // saturate to uint8
+        d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+        d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));  // d2 trails d1
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+        d2 += dest_stride;
+    }
+    return;
+}
diff --git a/libs/libvpx/vp10/common/blockd.c b/libs/libvpx/vp10/common/blockd.c
new file mode 100644
index 0000000000..b6f910ff68
--- /dev/null
+++ b/libs/libvpx/vp10/common/blockd.c
@@ -0,0 +1,136 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/blockd.h"
+
+PREDICTION_MODE vp10_left_block_mode(const MODE_INFO *cur_mi,
+                                    const MODE_INFO *left_mi, int b) {
+  // Blocks 1 and 3 take the mode of the previous entry in cur_mi->bmi[].
+  if (b != 0 && b != 2) {
+    assert(b == 1 || b == 3);
+    return cur_mi->bmi[b - 1].as_mode;
+  }
+  // Blocks 0 and 2 look at entry b + 1 of left_mi; a missing or inter-coded
+  // left neighbour yields DC_PRED.
+  if (!left_mi || is_inter_block(&left_mi->mbmi)) return DC_PRED;
+  return get_y_mode(left_mi, b + 1);
+}
+
+PREDICTION_MODE vp10_above_block_mode(const MODE_INFO *cur_mi,
+                                     const MODE_INFO *above_mi, int b) {
+  // Blocks 2 and 3 take the mode of the entry two places back in bmi[].
+  if (b != 0 && b != 1) {
+    assert(b == 2 || b == 3);
+    return cur_mi->bmi[b - 2].as_mode;
+  }
+  // Blocks 0 and 1 look at entry b + 2 of above_mi; a missing or inter-coded
+  // above neighbour yields DC_PRED.
+  if (!above_mi || is_inter_block(&above_mi->mbmi)) return DC_PRED;
+  return get_y_mode(above_mi, b + 2);
+}
+
+void vp10_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  // Plane 0 uses the transform size stored in mbmi; the other planes get
+  // theirs from get_uv_tx_size().
+  const TX_SIZE tx_size = (plane == 0) ? mbmi->tx_size
+                                       : get_uv_tx_size(mbmi, pd);
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  // Plane block dimensions and transform width, all in units of 4x4 blocks.
+  const int blocks_w = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int blocks_h = num_4x4_blocks_high_lookup[plane_bsize];
+  const int tx_blocks = 1 << tx_size;
+  const int step = 1 << (tx_size << 1);  // block-index advance per transform
+  // A negative mb_to_right_edge / mb_to_bottom_edge means the current block
+  // extends into the UMV; the visited area is shrunk accordingly so that sub
+  // blocks wholly within the UMV are not visited.
+  const int right_clip = xd->mb_to_right_edge < 0 ?
+      xd->mb_to_right_edge >> (5 + pd->subsampling_x) : 0;
+  const int bottom_clip = xd->mb_to_bottom_edge < 0 ?
+      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y) : 0;
+  const int max_w = blocks_w + right_clip;
+  const int max_h = blocks_h + bottom_clip;
+  // Index units to add after each row to account for the skipped columns.
+  const int row_skip = ((blocks_w - max_w) >> tx_size) * step;
+  int block = 0, row, col;
+
+  for (row = 0; row < max_h; row += tx_blocks) {
+    for (col = 0; col < max_w; col += tx_blocks) {
+      visit(plane, block, row, col, plane_bsize, tx_size, arg);
+      block += step;
+    }
+    block += row_skip;
+  }
+}
+
+void vp10_foreach_transformed_block(const MACROBLOCKD* const xd,
+                                   BLOCK_SIZE bsize,
+                                   foreach_transformed_block_visitor visit,
+                                   void *arg) {
+  // Run the per-plane walk over every plane, in increasing plane order.
+  int p = 0;
+  while (p < MAX_MB_PLANE)
+    vp10_foreach_transformed_block_in_plane(xd, bsize, p++, visit, arg);
+}
+
+void vp10_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
+                      int aoff, int loff) {
+  ENTROPY_CONTEXT *const above = pd->above_context + aoff;
+  ENTROPY_CONTEXT *const left = pd->left_context + loff;
+  const int tx_blocks = 1 << tx_size;  // transform extent in 4x4 units
+  int n;
+
+  // Above context: one entry per 4x4 column covered by the transform.
+  if (!has_eob || xd->mb_to_right_edge >= 0) {
+    // Nothing to flag, or no overhang on the right: fill the whole span.
+    memset(above, has_eob, sizeof(ENTROPY_CONTEXT) * tx_blocks);
+  } else {
+    // Flag only the columns before the right edge and clear the remainder.
+    const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] +
+                            (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+    const int flagged = (tx_blocks + aoff > blocks_wide) ? blocks_wide - aoff
+                                                         : tx_blocks;
+
+    for (n = 0; n < flagged; ++n)
+      above[n] = has_eob;
+    for (n = flagged; n < tx_blocks; ++n)
+      above[n] = 0;
+  }
+
+  // Left context: same scheme along the rows, against the bottom edge.
+  if (!has_eob || xd->mb_to_bottom_edge >= 0) {
+    memset(left, has_eob, sizeof(ENTROPY_CONTEXT) * tx_blocks);
+  } else {
+    // Flag only the rows before the bottom edge and clear the remainder.
+    const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] +
+                            (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+    const int flagged = (tx_blocks + loff > blocks_high) ? blocks_high - loff
+                                                         : tx_blocks;
+
+    for (n = 0; n < flagged; ++n)
+      left[n] = has_eob;
+    for (n = flagged; n < tx_blocks; ++n)
+      left[n] = 0;
+  }
+}
+
+void vp10_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) {
+  int i;
+  // Plane 0 is PLANE_TYPE_Y without subsampling; the others are PLANE_TYPE_UV.
+  for (i = 0; i != MAX_MB_PLANE; ++i) {
+    xd->plane[i].plane_type = (i == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+    xd->plane[i].subsampling_x = (i == 0) ? 0 : ss_x;
+    xd->plane[i].subsampling_y = (i == 0) ? 0 : ss_y;
+  }
+}
diff --git a/libs/libvpx/vp10/common/blockd.h b/libs/libvpx/vp10/common/blockd.h
new file mode 100644
index 0000000000..fce1767963
--- /dev/null
+++ b/libs/libvpx/vp10/common/blockd.h
@@ -0,0 +1,295 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_COMMON_BLOCKD_H_
+#define VP10_COMMON_BLOCKD_H_
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+#include "vpx_scale/yv12config.h"
+
+#include "vp10/common/common_data.h"
+#include "vp10/common/entropy.h"
+#include "vp10/common/entropymode.h"
+#include "vp10/common/mv.h"
+#include "vp10/common/scale.h"
+#include "vp10/common/seg_common.h"
+#include "vp10/common/tile_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_MB_PLANE 3
+
+typedef enum {  // frame kinds; MAX_MB_PLANE above sizes the per-plane arrays
+  KEY_FRAME = 0,
+  INTER_FRAME = 1,
+  FRAME_TYPES,  // number of frame types listed above
+} FRAME_TYPE;
+
+static INLINE int is_inter_mode(PREDICTION_MODE mode) {
+  return !(mode < NEARESTMV || mode > NEWMV);  // inside [NEARESTMV, NEWMV]
+}
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+   modes for the Y blocks to the left and above us; for interframes, there
+   is a single probability table. */
+
+typedef struct {  // one entry of MODE_INFO::bmi[]
+  PREDICTION_MODE as_mode;  // read by get_y_mode() for blocks below BLOCK_8X8
+  int_mv as_mv[2];  // first, second inter predictor motion vectors
+} b_mode_info;
+
+// Note that the rate-distortion optimization loop, bit-stream writer, and
+// decoder implementation modules critically rely on the defined entry values
+// specified herein. They should be refactored concurrently.
+
+#define NONE           -1
+#define INTRA_FRAME     0
+#define LAST_FRAME      1
+#define GOLDEN_FRAME    2
+#define ALTREF_FRAME    3
+#define MAX_REF_FRAMES  4
+typedef int8_t MV_REFERENCE_FRAME;  // type of ref_frame[]; compared to the above
+
+// This structure now relates to 8x8 block regions.
+typedef struct {
+  // Common for both INTER and INTRA blocks
+  BLOCK_SIZE sb_type;  // below BLOCK_8X8, get_y_mode() reads bmi[] instead
+  PREDICTION_MODE mode;
+  TX_SIZE tx_size;  // used directly for plane 0 by the block iterator
+  int8_t skip;
+#if CONFIG_MISC_FIXES
+  int8_t has_no_coeffs;
+#endif
+  int8_t segment_id;  // indexes MACROBLOCKD::lossless[] in get_tx_type()
+  int8_t seg_id_predicted;  // valid only when temporal_update is enabled
+
+  // Only for INTRA blocks
+  PREDICTION_MODE uv_mode;
+
+  // Only for INTER blocks
+  INTERP_FILTER interp_filter;
+  MV_REFERENCE_FRAME ref_frame[2];  // [0] > INTRA_FRAME: inter; [1]: 2nd ref
+  TX_TYPE tx_type;  // returned by get_tx_type() when its checks pass
+
+  // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead.
+  int_mv mv[2];
+} MB_MODE_INFO;
+
+typedef struct MODE_INFO {  // per-block mode data plus four sub-block entries
+  MB_MODE_INFO mbmi;
+  b_mode_info bmi[4];  // indexed by the block number (0..3) in the helpers
+} MODE_INFO;
+
+static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) {
+  // Only blocks below BLOCK_8X8 use the separate mode stored per bmi[] entry.
+  if (mi->mbmi.sb_type >= BLOCK_8X8) return mi->mbmi.mode;
+  return mi->bmi[block].as_mode;
+}
+
+static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
+  return INTRA_FRAME < mbmi->ref_frame[0];  // first reference above INTRA_FRAME
+}
+
+static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
+  return INTRA_FRAME < mbmi->ref_frame[1];  // second reference above INTRA_FRAME
+}
+
+PREDICTION_MODE vp10_left_block_mode(const MODE_INFO *cur_mi,
+                                    const MODE_INFO *left_mi, int b);
+
+PREDICTION_MODE vp10_above_block_mode(const MODE_INFO *cur_mi,
+                                     const MODE_INFO *above_mi, int b);
+
+enum mv_precision {  // NOTE(review): presumably fractional bits of an MV
+  MV_PRECISION_Q3,   // presumably 3 fractional bits - confirm
+  MV_PRECISION_Q4    // presumably 4 fractional bits - confirm
+};
+
+struct buf_2d {  // byte buffer plus stride; used for dst and pre[] per plane
+  uint8_t *buf;
+  int stride;  // NOTE(review): presumably the distance between rows - confirm
+};
+
+struct macroblockd_plane {  // per-plane state; MACROBLOCKD holds MAX_MB_PLANE
+  tran_low_t *dqcoeff;
+  PLANE_TYPE plane_type;  // set by vp10_setup_block_planes()
+  int subsampling_x;  // added to the shift applied to mb_to_right_edge
+  int subsampling_y;  // added to the shift applied to mb_to_bottom_edge
+  struct buf_2d dst;
+  struct buf_2d pre[2];
+  ENTROPY_CONTEXT *above_context;  // written by vp10_set_contexts()
+  ENTROPY_CONTEXT *left_context;   // written by vp10_set_contexts()
+  int16_t seg_dequant[MAX_SEGMENTS][2];
+  uint8_t *color_index_map;
+
+  // number of 4x4s in current block
+  uint16_t n4_w, n4_h;
+  // log2 of n4_w, n4_h
+  uint8_t n4_wl, n4_hl;
+
+  // encoder
+  const int16_t *dequant;
+};
+
+#define BLOCK_OFFSET(x, i) ((x) + (i) * 16)  // x advanced by i groups of 16
+
+typedef struct RefBuffer {  // pointed to by MACROBLOCKD::block_refs[]
+  // TODO(dkovalev): idx is not really required and should be removed, now it
+  // is used in vp10_onyxd_if.c
+  int idx;
+  YV12_BUFFER_CONFIG *buf;  // frame buffer of this reference
+  struct scale_factors sf;  // NOTE(review): presumably scaling for buf - confirm
+} RefBuffer;
+
+typedef struct macroblockd {  // state shared by the per-block helpers
+  struct macroblockd_plane plane[MAX_MB_PLANE];  // indexed by plane number
+  uint8_t bmode_blocks_wl;
+  uint8_t bmode_blocks_hl;
+
+  FRAME_COUNTS *counts;
+  TileInfo tile;
+
+  int mi_stride;
+
+  MODE_INFO **mi;  // mi[0] is the mode info of the current block
+  MODE_INFO *left_mi;
+  MODE_INFO *above_mi;
+  MB_MODE_INFO *left_mbmi;
+  MB_MODE_INFO *above_mbmi;
+
+  int up_available;
+  int left_available;
+
+  /* Distance of MB away from frame edges */
+  int mb_to_left_edge;
+  int mb_to_right_edge;  // negative: the block extends past the right edge
+  int mb_to_top_edge;
+  int mb_to_bottom_edge;  // negative: the block extends past the bottom edge
+
+  FRAME_CONTEXT *fc;
+
+  /* pointers to reference frames */
+  RefBuffer *block_refs[2];
+
+  /* pointer to current frame */
+  const YV12_BUFFER_CONFIG *cur_buf;
+
+  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+
+  PARTITION_CONTEXT *above_seg_context;
+  PARTITION_CONTEXT left_seg_context[8];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  /* Bit depth: 8, 10, 12 */
+  int bd;
+#endif
+
+  int lossless[MAX_SEGMENTS];  // indexed by segment_id; forces DCT_DCT if set
+  int corrupted;
+
+  struct vpx_internal_error_info *error_info;
+} MACROBLOCKD;
+
+static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
+                                     PARTITION_TYPE partition) {
+  return subsize_lookup[partition][bsize];  // plain table lookup, no checks
+}
+
+static const TX_TYPE intra_mode_to_tx_type_context[INTRA_MODES] = {
+  DCT_DCT,    // DC
+  ADST_DCT,   // V
+  DCT_ADST,   // H
+  DCT_DCT,    // D45
+  ADST_ADST,  // D135
+  ADST_DCT,   // D117
+  DCT_ADST,   // D153
+  DCT_ADST,   // D207
+  ADST_DCT,   // D63
+  ADST_ADST,  // TM
+};
+
+static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd,
+                                  int block_idx) {
+  // DCT_DCT unless this is a Y-plane, non-lossless block below TX_32X32.
+  const MB_MODE_INFO *const mode = &xd->mi[0]->mbmi;
+  (void) block_idx;  // unused here; kept in the signature
+
+  if (plane_type != PLANE_TYPE_Y) return DCT_DCT;
+  if (xd->lossless[mode->segment_id]) return DCT_DCT;
+  if (mode->tx_size >= TX_32X32) return DCT_DCT;
+
+  return mode->tx_type;
+}
+
+void vp10_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);
+
+static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize,
+                                          int xss, int yss) {
+  BLOCK_SIZE plane_bsize;
+  // Blocks below BLOCK_8X8 always get TX_4X4.
+  if (bsize < BLOCK_8X8) return TX_4X4;
+  // Otherwise cap y_tx_size at the maximum allowed for the subsampled block.
+  plane_bsize = ss_size_lookup[bsize][xss][yss];
+  return VPXMIN(y_tx_size, max_txsize_lookup[plane_bsize]);
+}
+
+static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi,
+                                     const struct macroblockd_plane *pd) {
+  const int xss = pd->subsampling_x, yss = pd->subsampling_y;
+  return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, xss, yss);
+}
+
+static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
+    const struct macroblockd_plane *pd) {  // bsize mapped by pd's subsampling
+  return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
+}
+
+static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
+  int plane;  // zeroes each plane's above/left contexts over this block
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE pbs = get_plane_block_size(bsize, pd);
+    const int wide = num_4x4_blocks_wide_lookup[pbs];
+    const int high = num_4x4_blocks_high_lookup[pbs];
+    memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * wide);
+    memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * high);
+  }
+}
+
+typedef void (*foreach_transformed_block_visitor)(int plane, int block,
+                                                  int blk_row, int blk_col,
+                                                  BLOCK_SIZE plane_bsize,
+                                                  TX_SIZE tx_size,
+                                                  void *arg);
+
+void vp10_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg);
+
+
+void vp10_foreach_transformed_block(
+    const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
+    foreach_transformed_block_visitor visit, void *arg);
+
+void vp10_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
+                      int aoff, int loff);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_BLOCKD_H_
diff --git a/libs/libvpx/vp10/common/common.h b/libs/libvpx/vp10/common/common.h
new file mode 100644
index 0000000000..4abcbf6332
--- /dev/null
+++ b/libs/libvpx/vp10/common/common.h
@@ -0,0 +1,75 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_COMMON_H_
+#define VP10_COMMON_COMMON_H_
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/bitops.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Fixed-size arrays only (size is taken via sizeof); just assign structs.
+#define vp10_copy(dest, src) {            \
+    assert(sizeof(dest) == sizeof(src)); \
+    memcpy(dest, src, sizeof(src));  \
+  }
+
+// Variably-sized arrays: copies n elements; arguments are fully parenthesized.
+#define vp10_copy_array(dest, src, n) {              \
+    assert(sizeof(*(dest)) == sizeof(*(src)));      \
+    memcpy((dest), (src), (n) * sizeof(*(src)));    \
+  }
+// vp10_zero clears one object; vp10_zero_array clears n elements of dest.
+#define vp10_zero(dest) memset(&(dest), 0, sizeof(dest))
+#define vp10_zero_array(dest, n) memset((dest), 0, (n) * sizeof(*(dest)))
+
+static INLINE int get_unsigned_bits(unsigned int num_values) {
+  return num_values == 0 ? 0 : get_msb(num_values) + 1;  // 0 maps to 0 bits
+}
+
+#if CONFIG_DEBUG  // debug builds append the failing file and line
+#define CHECK_MEM_ERROR(cm, lval, expr) do { \
+  (lval) = (expr); \
+  if (!(lval)) \
+    vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
+                       "Failed to allocate "#lval" at %s:%d", \
+                       __FILE__, __LINE__); \
+  } while (0)
+#else  // otherwise only the stringified lvalue is reported
+#define CHECK_MEM_ERROR(cm, lval, expr) do { \
+  (lval) = (expr); \
+  if (!(lval)) \
+    vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
+                       "Failed to allocate "#lval); \
+  } while (0)
+#endif  // CONFIG_DEBUG
+// TODO(yaowu): validate the usage of these codes or develop new ones.
+#define VP10_SYNC_CODE_0 0x49
+#define VP10_SYNC_CODE_1 0x83
+#define VP10_SYNC_CODE_2 0x43
+
+#define VP9_FRAME_MARKER 0x2
+
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_COMMON_H_
diff --git a/libs/libvpx/vp10/common/common_data.h b/libs/libvpx/vp10/common/common_data.h
new file mode 100644
index 0000000000..334489c9d2
--- /dev/null
+++ b/libs/libvpx/vp10/common/common_data.h
@@ -0,0 +1,177 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_COMMON_DATA_H_
+#define VP10_COMMON_COMMON_DATA_H_
+
+#include "vp10/common/enums.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Log 2 conversion lookup tables for block width and height
+static const uint8_t b_width_log2_lookup[BLOCK_SIZES] =
+  {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};
+static const uint8_t b_height_log2_lookup[BLOCK_SIZES] =
+  {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
+static const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
+  {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16};
+static const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] =
+  {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16};
+// Log 2 conversion lookup tables for modeinfo width and height
+static const uint8_t mi_width_log2_lookup[BLOCK_SIZES] =
+  {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
+static const uint8_t mi_height_log2_lookup[BLOCK_SIZES] =
+  {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3};
+static const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
+  {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
+static const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] =
+  {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
+
+// VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize)))
+static const uint8_t size_group_lookup[BLOCK_SIZES] =
+  {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};
+
+static const uint8_t num_pels_log2_lookup[BLOCK_SIZES] =
+  {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12};
+
+static const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
+  {  // 4X4
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID
+  }, {  // 8X8
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID
+  }, {  // 16X16
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+    PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID
+  }, {  // 32X32
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT,
+    PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID
+  }, {  // 64X64
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ,
+    PARTITION_NONE
+  }
+};
+
+static const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
+  {     // PARTITION_NONE
+    BLOCK_4X4,   BLOCK_4X8,   BLOCK_8X4,
+    BLOCK_8X8,   BLOCK_8X16,  BLOCK_16X8,
+    BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
+    BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+    BLOCK_64X64,
+  }, {  // PARTITION_HORZ
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X4,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X8,    BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X16,   BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_64X32,
+  }, {  // PARTITION_VERT
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_4X8,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X16,    BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X32,   BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X64,
+  }, {  // PARTITION_SPLIT
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_4X4,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X8,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X16,   BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X32,
+  }
+};
+
+static const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
+  TX_4X4,   TX_4X4,   TX_4X4,
+  TX_8X8,   TX_8X8,   TX_8X8,
+  TX_16X16, TX_16X16, TX_16X16,
+  TX_32X32, TX_32X32, TX_32X32, TX_32X32
+};
+
+static const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = {
+    BLOCK_4X4,  // TX_4X4
+    BLOCK_8X8,  // TX_8X8
+    BLOCK_16X16,  // TX_16X16
+    BLOCK_32X32,  // TX_32X32
+};
+
+static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
+  TX_4X4,  // ONLY_4X4
+  TX_8X8,  // ALLOW_8X8
+  TX_16X16,  // ALLOW_16X16
+  TX_32X32,  // ALLOW_32X32
+  TX_32X32,  // TX_MODE_SELECT
+};
+
+static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
+//  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
+//  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
+  {{BLOCK_4X4,   BLOCK_INVALID}, {BLOCK_INVALID, BLOCK_INVALID}},
+  {{BLOCK_4X8,   BLOCK_4X4},     {BLOCK_INVALID, BLOCK_INVALID}},
+  {{BLOCK_8X4,   BLOCK_INVALID}, {BLOCK_4X4,     BLOCK_INVALID}},
+  {{BLOCK_8X8,   BLOCK_8X4},     {BLOCK_4X8,     BLOCK_4X4}},
+  {{BLOCK_8X16,  BLOCK_8X8},     {BLOCK_INVALID, BLOCK_4X8}},
+  {{BLOCK_16X8,  BLOCK_INVALID}, {BLOCK_8X8,     BLOCK_8X4}},
+  {{BLOCK_16X16, BLOCK_16X8},    {BLOCK_8X16,    BLOCK_8X8}},
+  {{BLOCK_16X32, BLOCK_16X16},   {BLOCK_INVALID, BLOCK_8X16}},
+  {{BLOCK_32X16, BLOCK_INVALID}, {BLOCK_16X16,   BLOCK_16X8}},
+  {{BLOCK_32X32, BLOCK_32X16},   {BLOCK_16X32,   BLOCK_16X16}},
+  {{BLOCK_32X64, BLOCK_32X32},   {BLOCK_INVALID, BLOCK_16X32}},
+  {{BLOCK_64X32, BLOCK_INVALID}, {BLOCK_32X32,   BLOCK_32X16}},
+  {{BLOCK_64X64, BLOCK_64X32},   {BLOCK_32X64,   BLOCK_32X32}},
+};
+
+// Each entry holds 4-bit fields in which every bit set to 1 represents a
+// block-size partition: 1111 means we split 64x64, 32x32, 16x16 and 8x8;
+// 1000 means we only split the 64x64 into 32x32.
+static const struct {
+  PARTITION_CONTEXT above;
+  PARTITION_CONTEXT left;
+} partition_context_lookup[BLOCK_SIZES]= {
+  {15, 15},  // 4X4   - {0b1111, 0b1111}
+  {15, 14},  // 4X8   - {0b1111, 0b1110}
+  {14, 15},  // 8X4   - {0b1110, 0b1111}
+  {14, 14},  // 8X8   - {0b1110, 0b1110}
+  {14, 12},  // 8X16  - {0b1110, 0b1100}
+  {12, 14},  // 16X8  - {0b1100, 0b1110}
+  {12, 12},  // 16X16 - {0b1100, 0b1100}
+  {12, 8 },  // 16X32 - {0b1100, 0b1000}
+  {8,  12},  // 32X16 - {0b1000, 0b1100}
+  {8,  8 },  // 32X32 - {0b1000, 0b1000}
+  {8,  0 },  // 32X64 - {0b1000, 0b0000}
+  {0,  8 },  // 64X32 - {0b0000, 0b1000}
+  {0,  0 },  // 64X64 - {0b0000, 0b0000}
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_COMMON_DATA_H_
diff --git a/libs/libvpx/vp10/common/debugmodes.c b/libs/libvpx/vp10/common/debugmodes.c
new file mode 100644
index 0000000000..10fc4d633d
--- /dev/null
+++ b/libs/libvpx/vp10/common/debugmodes.c
@@ -0,0 +1,91 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+
+#include "vp10/common/blockd.h"
+#include "vp10/common/onyxc_int.h"
+
+static void log_frame_info(VP10_COMMON *cm, const char *str, FILE *f) {
+  fprintf(f, "%s", str);  // caller's label, written verbatim before the summary
+  fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame,
+          cm->show_frame, cm->base_qindex);
+}
+/* Prints, row by row, one value per cm->mi_grid_visible entry: an int read at
+ * member_offset bytes into that entry's mbmi. Each row starts with the first
+ * character of descriptor. NOTE(review): members narrower than int would make
+ * the (int*) read pick up neighbouring bytes -- confirm the member types. */
+static void print_mi_data(VP10_COMMON *cm, FILE *file, const char *descriptor,
+                          size_t member_offset) {
+  int mi_row, mi_col;
+  MODE_INFO **mi = cm->mi_grid_visible;
+  int rows = cm->mi_rows;
+  int cols = cm->mi_cols;
+  char prefix = descriptor[0];
+
+  log_frame_info(cm, descriptor, file);
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(file, "%c ", prefix);
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(file, "%2d ",
+              *((int*) ((char *) (&mi[0]->mbmi) +
+                                  member_offset)));
+      mi++;
+    }
+    fprintf(file, "\n");
+    mi += 8;  // NOTE(review): presumably skips padding past mi_cols -- confirm
+  }
+  fprintf(file, "\n");
+}
+/* Appends per-block debug dumps (modes, skips, motion vectors) to `file`. */
+void vp10_print_modes_and_motion_vectors(VP10_COMMON *cm, const char *file) {
+  int mi_row, mi_col;
+  FILE *mvs = fopen(file, "a");
+  MODE_INFO **mi = cm->mi_grid_visible;
+  int rows = cm->mi_rows;
+  int cols = cm->mi_cols;
+
+  if (mvs == NULL) return;  // log file could not be opened; nothing to write
+  print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
+  print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
+  print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
+  print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
+  print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
+
+  // output skip information.
+  log_frame_info(cm, "Skips:", mvs);
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs, "S ");
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[0]->mbmi.skip);
+      mi++;
+    }
+    fprintf(mvs, "\n");
+    mi += 8;
+  }
+  fprintf(mvs, "\n");
+
+  // output motion vectors.
+  log_frame_info(cm, "Vectors ", mvs);
+  mi = cm->mi_grid_visible;
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs, "V ");
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%4d:%4d ", mi[0]->mbmi.mv[0].as_mv.row,
+                               mi[0]->mbmi.mv[0].as_mv.col);
+      mi++;
+    }
+    fprintf(mvs, "\n");
+    mi += 8;
+  }
+  fprintf(mvs, "\n");
+
+  fclose(mvs);
+}
diff --git a/libs/libvpx/vp10/common/entropy.c b/libs/libvpx/vp10/common/entropy.c
new file mode 100644
index 0000000000..3da08a61b0
--- /dev/null
+++ b/libs/libvpx/vp10/common/entropy.c
@@ -0,0 +1,818 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/entropy.h"
+#include "vp10/common/blockd.h"
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx/vpx_integer.h"
+
+// Unconstrained Node Tree
+const vpx_tree_index vp10_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+  2, 6,                                // 0 = LOW_VAL
+  -TWO_TOKEN, 4,                       // 1 = TWO
+  -THREE_TOKEN, -FOUR_TOKEN,           // 2 = THREE
+  8, 10,                               // 3 = HIGH_LOW
+  -CATEGORY1_TOKEN, -CATEGORY2_TOKEN,  // 4 = CAT_ONE
+  12, 14,                              // 5 = CAT_THREEFOUR
+  -CATEGORY3_TOKEN, -CATEGORY4_TOKEN,  // 6 = CAT_THREE
+  -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 7 = CAT_FIVE
+};
+
+const vpx_prob vp10_cat1_prob[] = { 159 };
+const vpx_prob vp10_cat2_prob[] = { 165, 145 };
+const vpx_prob vp10_cat3_prob[] = { 173, 148, 140 };
+const vpx_prob vp10_cat4_prob[] = { 176, 155, 140, 135 };
+const vpx_prob vp10_cat5_prob[] = { 180, 157, 141, 134, 130 };
+const vpx_prob vp10_cat6_prob[] = {
+    254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
+};
+#if CONFIG_VP9_HIGHBITDEPTH
+const vpx_prob vp10_cat1_prob_high10[] = { 159 };
+const vpx_prob vp10_cat2_prob_high10[] = { 165, 145 };
+const vpx_prob vp10_cat3_prob_high10[] = { 173, 148, 140 };
+const vpx_prob vp10_cat4_prob_high10[] = { 176, 155, 140, 135 };
+const vpx_prob vp10_cat5_prob_high10[] = { 180, 157, 141, 134, 130 };
+const vpx_prob vp10_cat6_prob_high10[] = {
+    255, 255, 254, 254, 254, 252, 249, 243,
+    230, 196, 177, 153, 140, 133, 130, 129
+};
+const vpx_prob vp10_cat1_prob_high12[] = { 159 };
+const vpx_prob vp10_cat2_prob_high12[] = { 165, 145 };
+const vpx_prob vp10_cat3_prob_high12[] = { 173, 148, 140 };
+const vpx_prob vp10_cat4_prob_high12[] = { 176, 155, 140, 135 };
+const vpx_prob vp10_cat5_prob_high12[] = { 180, 157, 141, 134, 130 };
+const vpx_prob vp10_cat6_prob_high12[] = {
+    255, 255, 255, 255, 254, 254, 254, 252, 249,
+    243, 230, 196, 177, 153, 140, 133, 130, 129
+};
+#endif
+
+const uint8_t vp10_coefband_trans_8x8plus[1024] = {
+  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+  4, 4, 4, 4, 4, 5,
+  // beyond MAXBAND_INDEX+1 all values are filled as 5
+                    5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+};
+
+const uint8_t vp10_coefband_trans_4x4[16] = {
+  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
+};
+
+const uint8_t vp10_pt_energy_class[ENTROPY_TOKENS] = {
+  0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
+};
+
+// Model obtained from a 2-sided zero-centered distribution derived
+// from a Pareto distribution. The cdf of the distribution is:
+// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+//
+// For a given beta and a given probability of the 1-node, the alpha
+// is first solved, and then the {alpha, beta} pair is used to generate
+// the probabilities for the rest of the nodes.
+
+// beta = 8
+
+// Every odd line in this table can be generated from the even lines
+// by averaging:
+// vp10_pareto8_full[l][node] = (vp10_pareto8_full[l-1][node] +
+//                              vp10_pareto8_full[l+1][node] ) >> 1;
+const vpx_prob vp10_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = {
+  {  3,  86, 128,   6,  86,  23,  88,  29},
+  {  6,  86, 128,  11,  87,  42,  91,  52},
+  {  9,  86, 129,  17,  88,  61,  94,  76},
+  { 12,  86, 129,  22,  88,  77,  97,  93},
+  { 15,  87, 129,  28,  89,  93, 100, 110},
+  { 17,  87, 129,  33,  90, 105, 103, 123},
+  { 20,  88, 130,  38,  91, 118, 106, 136},
+  { 23,  88, 130,  43,  91, 128, 108, 146},
+  { 26,  89, 131,  48,  92, 139, 111, 156},
+  { 28,  89, 131,  53,  93, 147, 114, 163},
+  { 31,  90, 131,  58,  94, 156, 117, 171},
+  { 34,  90, 131,  62,  94, 163, 119, 177},
+  { 37,  90, 132,  66,  95, 171, 122, 184},
+  { 39,  90, 132,  70,  96, 177, 124, 189},
+  { 42,  91, 132,  75,  97, 183, 127, 194},
+  { 44,  91, 132,  79,  97, 188, 129, 198},
+  { 47,  92, 133,  83,  98, 193, 132, 202},
+  { 49,  92, 133,  86,  99, 197, 134, 205},
+  { 52,  93, 133,  90, 100, 201, 137, 208},
+  { 54,  93, 133,  94, 100, 204, 139, 211},
+  { 57,  94, 134,  98, 101, 208, 142, 214},
+  { 59,  94, 134, 101, 102, 211, 144, 216},
+  { 62,  94, 135, 105, 103, 214, 146, 218},
+  { 64,  94, 135, 108, 103, 216, 148, 220},
+  { 66,  95, 135, 111, 104, 219, 151, 222},
+  { 68,  95, 135, 114, 105, 221, 153, 223},
+  { 71,  96, 136, 117, 106, 224, 155, 225},
+  { 73,  96, 136, 120, 106, 225, 157, 226},
+  { 76,  97, 136, 123, 107, 227, 159, 228},
+  { 78,  97, 136, 126, 108, 229, 160, 229},
+  { 80,  98, 137, 129, 109, 231, 162, 231},
+  { 82,  98, 137, 131, 109, 232, 164, 232},
+  { 84,  98, 138, 134, 110, 234, 166, 233},
+  { 86,  98, 138, 137, 111, 235, 168, 234},
+  { 89,  99, 138, 140, 112, 236, 170, 235},
+  { 91,  99, 138, 142, 112, 237, 171, 235},
+  { 93, 100, 139, 145, 113, 238, 173, 236},
+  { 95, 100, 139, 147, 114, 239, 174, 237},
+  { 97, 101, 140, 149, 115, 240, 176, 238},
+  { 99, 101, 140, 151, 115, 241, 177, 238},
+  {101, 102, 140, 154, 116, 242, 179, 239},
+  {103, 102, 140, 156, 117, 242, 180, 239},
+  {105, 103, 141, 158, 118, 243, 182, 240},
+  {107, 103, 141, 160, 118, 243, 183, 240},
+  {109, 104, 141, 162, 119, 244, 185, 241},
+  {111, 104, 141, 164, 119, 244, 186, 241},
+  {113, 104, 142, 166, 120, 245, 187, 242},
+  {114, 104, 142, 168, 121, 245, 188, 242},
+  {116, 105, 143, 170, 122, 246, 190, 243},
+  {118, 105, 143, 171, 122, 246, 191, 243},
+  {120, 106, 143, 173, 123, 247, 192, 244},
+  {121, 106, 143, 175, 124, 247, 193, 244},
+  {123, 107, 144, 177, 125, 248, 195, 244},
+  {125, 107, 144, 178, 125, 248, 196, 244},
+  {127, 108, 145, 180, 126, 249, 197, 245},
+  {128, 108, 145, 181, 127, 249, 198, 245},
+  {130, 109, 145, 183, 128, 249, 199, 245},
+  {132, 109, 145, 184, 128, 249, 200, 245},
+  {134, 110, 146, 186, 129, 250, 201, 246},
+  {135, 110, 146, 187, 130, 250, 202, 246},
+  {137, 111, 147, 189, 131, 251, 203, 246},
+  {138, 111, 147, 190, 131, 251, 204, 246},
+  {140, 112, 147, 192, 132, 251, 205, 247},
+  {141, 112, 147, 193, 132, 251, 206, 247},
+  {143, 113, 148, 194, 133, 251, 207, 247},
+  {144, 113, 148, 195, 134, 251, 207, 247},
+  {146, 114, 149, 197, 135, 252, 208, 248},
+  {147, 114, 149, 198, 135, 252, 209, 248},
+  {149, 115, 149, 199, 136, 252, 210, 248},
+  {150, 115, 149, 200, 137, 252, 210, 248},
+  {152, 115, 150, 201, 138, 252, 211, 248},
+  {153, 115, 150, 202, 138, 252, 212, 248},
+  {155, 116, 151, 204, 139, 253, 213, 249},
+  {156, 116, 151, 205, 139, 253, 213, 249},
+  {158, 117, 151, 206, 140, 253, 214, 249},
+  {159, 117, 151, 207, 141, 253, 215, 249},
+  {161, 118, 152, 208, 142, 253, 216, 249},
+  {162, 118, 152, 209, 142, 253, 216, 249},
+  {163, 119, 153, 210, 143, 253, 217, 249},
+  {164, 119, 153, 211, 143, 253, 217, 249},
+  {166, 120, 153, 212, 144, 254, 218, 250},
+  {167, 120, 153, 212, 145, 254, 219, 250},
+  {168, 121, 154, 213, 146, 254, 220, 250},
+  {169, 121, 154, 214, 146, 254, 220, 250},
+  {171, 122, 155, 215, 147, 254, 221, 250},
+  {172, 122, 155, 216, 147, 254, 221, 250},
+  {173, 123, 155, 217, 148, 254, 222, 250},
+  {174, 123, 155, 217, 149, 254, 222, 250},
+  {176, 124, 156, 218, 150, 254, 223, 250},
+  {177, 124, 156, 219, 150, 254, 223, 250},
+  {178, 125, 157, 220, 151, 254, 224, 251},
+  {179, 125, 157, 220, 151, 254, 224, 251},
+  {180, 126, 157, 221, 152, 254, 225, 251},
+  {181, 126, 157, 221, 152, 254, 225, 251},
+  {183, 127, 158, 222, 153, 254, 226, 251},
+  {184, 127, 158, 223, 154, 254, 226, 251},
+  {185, 128, 159, 224, 155, 255, 227, 251},
+  {186, 128, 159, 224, 155, 255, 227, 251},
+  {187, 129, 160, 225, 156, 255, 228, 251},
+  {188, 130, 160, 225, 156, 255, 228, 251},
+  {189, 131, 160, 226, 157, 255, 228, 251},
+  {190, 131, 160, 226, 158, 255, 228, 251},
+  {191, 132, 161, 227, 159, 255, 229, 251},
+  {192, 132, 161, 227, 159, 255, 229, 251},
+  {193, 133, 162, 228, 160, 255, 230, 252},
+  {194, 133, 162, 229, 160, 255, 230, 252},
+  {195, 134, 163, 230, 161, 255, 231, 252},
+  {196, 134, 163, 230, 161, 255, 231, 252},
+  {197, 135, 163, 231, 162, 255, 231, 252},
+  {198, 135, 163, 231, 162, 255, 231, 252},
+  {199, 136, 164, 232, 163, 255, 232, 252},
+  {200, 136, 164, 232, 164, 255, 232, 252},
+  {201, 137, 165, 233, 165, 255, 233, 252},
+  {201, 137, 165, 233, 165, 255, 233, 252},
+  {202, 138, 166, 233, 166, 255, 233, 252},
+  {203, 138, 166, 233, 166, 255, 233, 252},
+  {204, 139, 166, 234, 167, 255, 234, 252},
+  {205, 139, 166, 234, 167, 255, 234, 252},
+  {206, 140, 167, 235, 168, 255, 235, 252},
+  {206, 140, 167, 235, 168, 255, 235, 252},
+  {207, 141, 168, 236, 169, 255, 235, 252},
+  {208, 141, 168, 236, 170, 255, 235, 252},
+  {209, 142, 169, 237, 171, 255, 236, 252},
+  {209, 143, 169, 237, 171, 255, 236, 252},
+  {210, 144, 169, 237, 172, 255, 236, 252},
+  {211, 144, 169, 237, 172, 255, 236, 252},
+  {212, 145, 170, 238, 173, 255, 237, 252},
+  {213, 145, 170, 238, 173, 255, 237, 252},
+  {214, 146, 171, 239, 174, 255, 237, 253},
+  {214, 146, 171, 239, 174, 255, 237, 253},
+  {215, 147, 172, 240, 175, 255, 238, 253},
+  {215, 147, 172, 240, 175, 255, 238, 253},
+  {216, 148, 173, 240, 176, 255, 238, 253},
+  {217, 148, 173, 240, 176, 255, 238, 253},
+  {218, 149, 173, 241, 177, 255, 239, 253},
+  {218, 149, 173, 241, 178, 255, 239, 253},
+  {219, 150, 174, 241, 179, 255, 239, 253},
+  {219, 151, 174, 241, 179, 255, 239, 253},
+  {220, 152, 175, 242, 180, 255, 240, 253},
+  {221, 152, 175, 242, 180, 255, 240, 253},
+  {222, 153, 176, 242, 181, 255, 240, 253},
+  {222, 153, 176, 242, 181, 255, 240, 253},
+  {223, 154, 177, 243, 182, 255, 240, 253},
+  {223, 154, 177, 243, 182, 255, 240, 253},
+  {224, 155, 178, 244, 183, 255, 241, 253},
+  {224, 155, 178, 244, 183, 255, 241, 253},
+  {225, 156, 178, 244, 184, 255, 241, 253},
+  {225, 157, 178, 244, 184, 255, 241, 253},
+  {226, 158, 179, 244, 185, 255, 242, 253},
+  {227, 158, 179, 244, 185, 255, 242, 253},
+  {228, 159, 180, 245, 186, 255, 242, 253},
+  {228, 159, 180, 245, 186, 255, 242, 253},
+  {229, 160, 181, 245, 187, 255, 242, 253},
+  {229, 160, 181, 245, 187, 255, 242, 253},
+  {230, 161, 182, 246, 188, 255, 243, 253},
+  {230, 162, 182, 246, 188, 255, 243, 253},
+  {231, 163, 183, 246, 189, 255, 243, 253},
+  {231, 163, 183, 246, 189, 255, 243, 253},
+  {232, 164, 184, 247, 190, 255, 243, 253},
+  {232, 164, 184, 247, 190, 255, 243, 253},
+  {233, 165, 185, 247, 191, 255, 244, 253},
+  {233, 165, 185, 247, 191, 255, 244, 253},
+  {234, 166, 185, 247, 192, 255, 244, 253},
+  {234, 167, 185, 247, 192, 255, 244, 253},
+  {235, 168, 186, 248, 193, 255, 244, 253},
+  {235, 168, 186, 248, 193, 255, 244, 253},
+  {236, 169, 187, 248, 194, 255, 244, 253},
+  {236, 169, 187, 248, 194, 255, 244, 253},
+  {236, 170, 188, 248, 195, 255, 245, 253},
+  {236, 170, 188, 248, 195, 255, 245, 253},
+  {237, 171, 189, 249, 196, 255, 245, 254},
+  {237, 172, 189, 249, 196, 255, 245, 254},
+  {238, 173, 190, 249, 197, 255, 245, 254},
+  {238, 173, 190, 249, 197, 255, 245, 254},
+  {239, 174, 191, 249, 198, 255, 245, 254},
+  {239, 174, 191, 249, 198, 255, 245, 254},
+  {240, 175, 192, 249, 199, 255, 246, 254},
+  {240, 176, 192, 249, 199, 255, 246, 254},
+  {240, 177, 193, 250, 200, 255, 246, 254},
+  {240, 177, 193, 250, 200, 255, 246, 254},
+  {241, 178, 194, 250, 201, 255, 246, 254},
+  {241, 178, 194, 250, 201, 255, 246, 254},
+  {242, 179, 195, 250, 202, 255, 246, 254},
+  {242, 180, 195, 250, 202, 255, 246, 254},
+  {242, 181, 196, 250, 203, 255, 247, 254},
+  {242, 181, 196, 250, 203, 255, 247, 254},
+  {243, 182, 197, 251, 204, 255, 247, 254},
+  {243, 183, 197, 251, 204, 255, 247, 254},
+  {244, 184, 198, 251, 205, 255, 247, 254},
+  {244, 184, 198, 251, 205, 255, 247, 254},
+  {244, 185, 199, 251, 206, 255, 247, 254},
+  {244, 185, 199, 251, 206, 255, 247, 254},
+  {245, 186, 200, 251, 207, 255, 247, 254},
+  {245, 187, 200, 251, 207, 255, 247, 254},
+  {246, 188, 201, 252, 207, 255, 248, 254},
+  {246, 188, 201, 252, 207, 255, 248, 254},
+  {246, 189, 202, 252, 208, 255, 248, 254},
+  {246, 190, 202, 252, 208, 255, 248, 254},
+  {247, 191, 203, 252, 209, 255, 248, 254},
+  {247, 191, 203, 252, 209, 255, 248, 254},
+  {247, 192, 204, 252, 210, 255, 248, 254},
+  {247, 193, 204, 252, 210, 255, 248, 254},
+  {248, 194, 205, 252, 211, 255, 248, 254},
+  {248, 194, 205, 252, 211, 255, 248, 254},
+  {248, 195, 206, 252, 212, 255, 249, 254},
+  {248, 196, 206, 252, 212, 255, 249, 254},
+  {249, 197, 207, 253, 213, 255, 249, 254},
+  {249, 197, 207, 253, 213, 255, 249, 254},
+  {249, 198, 208, 253, 214, 255, 249, 254},
+  {249, 199, 209, 253, 214, 255, 249, 254},
+  {250, 200, 210, 253, 215, 255, 249, 254},
+  {250, 200, 210, 253, 215, 255, 249, 254},
+  {250, 201, 211, 253, 215, 255, 249, 254},
+  {250, 202, 211, 253, 215, 255, 249, 254},
+  {250, 203, 212, 253, 216, 255, 249, 254},
+  {250, 203, 212, 253, 216, 255, 249, 254},
+  {251, 204, 213, 253, 217, 255, 250, 254},
+  {251, 205, 213, 253, 217, 255, 250, 254},
+  {251, 206, 214, 254, 218, 255, 250, 254},
+  {251, 206, 215, 254, 218, 255, 250, 254},
+  {252, 207, 216, 254, 219, 255, 250, 254},
+  {252, 208, 216, 254, 219, 255, 250, 254},
+  {252, 209, 217, 254, 220, 255, 250, 254},
+  {252, 210, 217, 254, 220, 255, 250, 254},
+  {252, 211, 218, 254, 221, 255, 250, 254},
+  {252, 212, 218, 254, 221, 255, 250, 254},
+  {253, 213, 219, 254, 222, 255, 250, 254},
+  {253, 213, 220, 254, 222, 255, 250, 254},
+  {253, 214, 221, 254, 223, 255, 250, 254},
+  {253, 215, 221, 254, 223, 255, 250, 254},
+  {253, 216, 222, 254, 224, 255, 251, 254},
+  {253, 217, 223, 254, 224, 255, 251, 254},
+  {253, 218, 224, 254, 225, 255, 251, 254},
+  {253, 219, 224, 254, 225, 255, 251, 254},
+  {254, 220, 225, 254, 225, 255, 251, 254},
+  {254, 221, 226, 254, 225, 255, 251, 254},
+  {254, 222, 227, 255, 226, 255, 251, 254},
+  {254, 223, 227, 255, 226, 255, 251, 254},
+  {254, 224, 228, 255, 227, 255, 251, 254},
+  {254, 225, 229, 255, 227, 255, 251, 254},
+  {254, 226, 230, 255, 228, 255, 251, 254},
+  {254, 227, 230, 255, 229, 255, 251, 254},
+  {255, 228, 231, 255, 230, 255, 251, 254},
+  {255, 229, 232, 255, 230, 255, 251, 254},
+  {255, 230, 233, 255, 231, 255, 252, 254},
+  {255, 231, 234, 255, 231, 255, 252, 254},
+  {255, 232, 235, 255, 232, 255, 252, 254},
+  {255, 233, 236, 255, 232, 255, 252, 254},
+  {255, 235, 237, 255, 233, 255, 252, 254},
+  {255, 236, 238, 255, 234, 255, 252, 254},
+  {255, 238, 240, 255, 235, 255, 252, 255},
+  {255, 239, 241, 255, 235, 255, 252, 254},
+  {255, 241, 243, 255, 236, 255, 252, 254},
+  {255, 243, 245, 255, 237, 255, 252, 254},
+  {255, 246, 247, 255, 239, 255, 253, 255},
+};
+
+static const vp10_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = {
+  {  // Y plane; inner order is [ref type][band][context][model node]
+    {  // Intra
+      {  // Band 0 (3 contexts; every other band has 6)
+        { 195,  29, 183 }, {  84,  49, 136 }, {   8,  42,  71 }
+      }, {  // Band 1
+        {  31, 107, 169 }, {  35,  99, 159 }, {  17,  82, 140 },
+        {   8,  66, 114 }, {   2,  44,  76 }, {   1,  19,  32 }
+      }, {  // Band 2
+        {  40, 132, 201 }, {  29, 114, 187 }, {  13,  91, 157 },
+        {   7,  75, 127 }, {   3,  58,  95 }, {   1,  28,  47 }
+      }, {  // Band 3
+        {  69, 142, 221 }, {  42, 122, 201 }, {  15,  91, 159 },
+        {   6,  67, 121 }, {   1,  42,  77 }, {   1,  17,  31 }
+      }, {  // Band 4
+        { 102, 148, 228 }, {  67, 117, 204 }, {  17,  82, 154 },
+        {   6,  59, 114 }, {   2,  39,  75 }, {   1,  15,  29 }
+      }, {  // Band 5
+        { 156,  57, 233 }, { 119,  57, 212 }, {  58,  48, 163 },
+        {  29,  40, 124 }, {  12,  30,  81 }, {   3,  12,  31 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        { 191, 107, 226 }, { 124, 117, 204 }, {  25,  99, 155 }
+      }, {  // Band 1
+        {  29, 148, 210 }, {  37, 126, 194 }, {   8,  93, 157 },
+        {   2,  68, 118 }, {   1,  39,  69 }, {   1,  17,  33 }
+      }, {  // Band 2
+        {  41, 151, 213 }, {  27, 123, 193 }, {   3,  82, 144 },
+        {   1,  58, 105 }, {   1,  32,  60 }, {   1,  13,  26 }
+      }, {  // Band 3
+        {  59, 159, 220 }, {  23, 126, 198 }, {   4,  88, 151 },
+        {   1,  66, 114 }, {   1,  38,  71 }, {   1,  18,  34 }
+      }, {  // Band 4
+        { 114, 136, 232 }, {  51, 114, 207 }, {  11,  83, 155 },
+        {   3,  56, 105 }, {   1,  33,  65 }, {   1,  17,  34 }
+      }, {  // Band 5
+        { 149,  65, 234 }, { 121,  57, 215 }, {  61,  49, 166 },
+        {  28,  36, 114 }, {  12,  25,  76 }, {   3,  16,  42 }
+      }
+    }
+  }, {  // UV plane
+    {  // Intra
+      {  // Band 0
+        { 214,  49, 220 }, { 132,  63, 188 }, {  42,  65, 137 }
+      }, {  // Band 1
+        {  85, 137, 221 }, { 104, 131, 216 }, {  49, 111, 192 },
+        {  21,  87, 155 }, {   2,  49,  87 }, {   1,  16,  28 }
+      }, {  // Band 2
+        {  89, 163, 230 }, {  90, 137, 220 }, {  29, 100, 183 },
+        {  10,  70, 135 }, {   2,  42,  81 }, {   1,  17,  33 }
+      }, {  // Band 3
+        { 108, 167, 237 }, {  55, 133, 222 }, {  15,  97, 179 },
+        {   4,  72, 135 }, {   1,  45,  85 }, {   1,  19,  38 }
+      }, {  // Band 4
+        { 124, 146, 240 }, {  66, 124, 224 }, {  17,  88, 175 },
+        {   4,  58, 122 }, {   1,  36,  75 }, {   1,  18,  37 }
+      }, {  // Band 5
+        { 141,  79, 241 }, { 126,  70, 227 }, {  66,  58, 182 },
+        {  30,  44, 136 }, {  12,  34,  96 }, {   2,  20,  47 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        { 229,  99, 249 }, { 143, 111, 235 }, {  46, 109, 192 }
+      }, {  // Band 1
+        {  82, 158, 236 }, {  94, 146, 224 }, {  25, 117, 191 },
+        {   9,  87, 149 }, {   3,  56,  99 }, {   1,  33,  57 }
+      }, {  // Band 2
+        {  83, 167, 237 }, {  68, 145, 222 }, {  10, 103, 177 },
+        {   2,  72, 131 }, {   1,  41,  79 }, {   1,  20,  39 }
+      }, {  // Band 3
+        {  99, 167, 239 }, {  47, 141, 224 }, {  10, 104, 178 },
+        {   2,  73, 133 }, {   1,  44,  85 }, {   1,  22,  47 }
+      }, {  // Band 4
+        { 127, 145, 243 }, {  71, 129, 228 }, {  17,  93, 177 },
+        {   3,  61, 124 }, {   1,  41,  84 }, {   1,  21,  52 }
+      }, {  // Band 5
+        { 157,  78, 244 }, { 140,  72, 231 }, {  69,  58, 184 },
+        {  31,  44, 137 }, {  14,  38, 105 }, {   8,  23,  61 }
+      }
+    }
+  }
+};
+
+static const vp10_coeff_probs_model default_coef_probs_8x8[PLANE_TYPES] = {
+  {  // Y plane; inner order is [ref type][band][context][model node]
+    {  // Intra
+      {  // Band 0 (3 contexts; every other band has 6)
+        { 125,  34, 187 }, {  52,  41, 133 }, {   6,  31,  56 }
+      }, {  // Band 1
+        {  37, 109, 153 }, {  51, 102, 147 }, {  23,  87, 128 },
+        {   8,  67, 101 }, {   1,  41,  63 }, {   1,  19,  29 }
+      }, {  // Band 2
+        {  31, 154, 185 }, {  17, 127, 175 }, {   6,  96, 145 },
+        {   2,  73, 114 }, {   1,  51,  82 }, {   1,  28,  45 }
+      }, {  // Band 3
+        {  23, 163, 200 }, {  10, 131, 185 }, {   2,  93, 148 },
+        {   1,  67, 111 }, {   1,  41,  69 }, {   1,  14,  24 }
+      }, {  // Band 4
+        {  29, 176, 217 }, {  12, 145, 201 }, {   3, 101, 156 },
+        {   1,  69, 111 }, {   1,  39,  63 }, {   1,  14,  23 }
+      }, {  // Band 5
+        {  57, 192, 233 }, {  25, 154, 215 }, {   6, 109, 167 },
+        {   3,  78, 118 }, {   1,  48,  69 }, {   1,  21,  29 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        { 202, 105, 245 }, { 108, 106, 216 }, {  18,  90, 144 }
+      }, {  // Band 1
+        {  33, 172, 219 }, {  64, 149, 206 }, {  14, 117, 177 },
+        {   5,  90, 141 }, {   2,  61,  95 }, {   1,  37,  57 }
+      }, {  // Band 2
+        {  33, 179, 220 }, {  11, 140, 198 }, {   1,  89, 148 },
+        {   1,  60, 104 }, {   1,  33,  57 }, {   1,  12,  21 }
+      }, {  // Band 3
+        {  30, 181, 221 }, {   8, 141, 198 }, {   1,  87, 145 },
+        {   1,  58, 100 }, {   1,  31,  55 }, {   1,  12,  20 }
+      }, {  // Band 4
+        {  32, 186, 224 }, {   7, 142, 198 }, {   1,  86, 143 },
+        {   1,  58, 100 }, {   1,  31,  55 }, {   1,  12,  22 }
+      }, {  // Band 5
+        {  57, 192, 227 }, {  20, 143, 204 }, {   3,  96, 154 },
+        {   1,  68, 112 }, {   1,  42,  69 }, {   1,  19,  32 }
+      }
+    }
+  }, {  // UV plane
+    {  // Intra
+      {  // Band 0
+        { 212,  35, 215 }, { 113,  47, 169 }, {  29,  48, 105 }
+      }, {  // Band 1
+        {  74, 129, 203 }, { 106, 120, 203 }, {  49, 107, 178 },
+        {  19,  84, 144 }, {   4,  50,  84 }, {   1,  15,  25 }
+      }, {  // Band 2
+        {  71, 172, 217 }, {  44, 141, 209 }, {  15, 102, 173 },
+        {   6,  76, 133 }, {   2,  51,  89 }, {   1,  24,  42 }
+      }, {  // Band 3
+        {  64, 185, 231 }, {  31, 148, 216 }, {   8, 103, 175 },
+        {   3,  74, 131 }, {   1,  46,  81 }, {   1,  18,  30 }
+      }, {  // Band 4
+        {  65, 196, 235 }, {  25, 157, 221 }, {   5, 105, 174 },
+        {   1,  67, 120 }, {   1,  38,  69 }, {   1,  15,  30 }
+      }, {  // Band 5
+        {  65, 204, 238 }, {  30, 156, 224 }, {   7, 107, 177 },
+        {   2,  70, 124 }, {   1,  42,  73 }, {   1,  18,  34 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        { 225,  86, 251 }, { 144, 104, 235 }, {  42,  99, 181 }
+      }, {  // Band 1
+        {  85, 175, 239 }, { 112, 165, 229 }, {  29, 136, 200 },
+        {  12, 103, 162 }, {   6,  77, 123 }, {   2,  53,  84 }
+      }, {  // Band 2
+        {  75, 183, 239 }, {  30, 155, 221 }, {   3, 106, 171 },
+        {   1,  74, 128 }, {   1,  44,  76 }, {   1,  17,  28 }
+      }, {  // Band 3
+        {  73, 185, 240 }, {  27, 159, 222 }, {   2, 107, 172 },
+        {   1,  75, 127 }, {   1,  42,  73 }, {   1,  17,  29 }
+      }, {  // Band 4
+        {  62, 190, 238 }, {  21, 159, 222 }, {   2, 107, 172 },
+        {   1,  72, 122 }, {   1,  40,  71 }, {   1,  18,  32 }
+      }, {  // Band 5
+        {  61, 199, 240 }, {  27, 161, 226 }, {   4, 113, 180 },
+        {   1,  76, 129 }, {   1,  46,  80 }, {   1,  23,  41 }
+      }
+    }
+  }
+};
+
+static const vp10_coeff_probs_model default_coef_probs_16x16[PLANE_TYPES] = {
+  {  // Y plane; inner order is [ref type][band][context][model node]
+    {  // Intra
+      {  // Band 0 (3 contexts; every other band has 6)
+        {   7,  27, 153 }, {   5,  30,  95 }, {   1,  16,  30 }
+      }, {  // Band 1
+        {  50,  75, 127 }, {  57,  75, 124 }, {  27,  67, 108 },
+        {  10,  54,  86 }, {   1,  33,  52 }, {   1,  12,  18 }
+      }, {  // Band 2
+        {  43, 125, 151 }, {  26, 108, 148 }, {   7,  83, 122 },
+        {   2,  59,  89 }, {   1,  38,  60 }, {   1,  17,  27 }
+      }, {  // Band 3
+        {  23, 144, 163 }, {  13, 112, 154 }, {   2,  75, 117 },
+        {   1,  50,  81 }, {   1,  31,  51 }, {   1,  14,  23 }
+      }, {  // Band 4
+        {  18, 162, 185 }, {   6, 123, 171 }, {   1,  78, 125 },
+        {   1,  51,  86 }, {   1,  31,  54 }, {   1,  14,  23 }
+      }, {  // Band 5
+        {  15, 199, 227 }, {   3, 150, 204 }, {   1,  91, 146 },
+        {   1,  55,  95 }, {   1,  30,  53 }, {   1,  11,  20 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        {  19,  55, 240 }, {  19,  59, 196 }, {   3,  52, 105 }
+      }, {  // Band 1
+        {  41, 166, 207 }, { 104, 153, 199 }, {  31, 123, 181 },
+        {  14, 101, 152 }, {   5,  72, 106 }, {   1,  36,  52 }
+      }, {  // Band 2
+        {  35, 176, 211 }, {  12, 131, 190 }, {   2,  88, 144 },
+        {   1,  60, 101 }, {   1,  36,  60 }, {   1,  16,  28 }
+      }, {  // Band 3
+        {  28, 183, 213 }, {   8, 134, 191 }, {   1,  86, 142 },
+        {   1,  56,  96 }, {   1,  30,  53 }, {   1,  12,  20 }
+      }, {  // Band 4
+        {  20, 190, 215 }, {   4, 135, 192 }, {   1,  84, 139 },
+        {   1,  53,  91 }, {   1,  28,  49 }, {   1,  11,  20 }
+      }, {  // Band 5
+        {  13, 196, 216 }, {   2, 137, 192 }, {   1,  86, 143 },
+        {   1,  57,  99 }, {   1,  32,  56 }, {   1,  13,  24 }
+      }
+    }
+  }, {  // UV plane
+    {  // Intra
+      {  // Band 0
+        { 211,  29, 217 }, {  96,  47, 156 }, {  22,  43,  87 }
+      }, {  // Band 1
+        {  78, 120, 193 }, { 111, 116, 186 }, {  46, 102, 164 },
+        {  15,  80, 128 }, {   2,  49,  76 }, {   1,  18,  28 }
+      }, {  // Band 2
+        {  71, 161, 203 }, {  42, 132, 192 }, {  10,  98, 150 },
+        {   3,  69, 109 }, {   1,  44,  70 }, {   1,  18,  29 }
+      }, {  // Band 3
+        {  57, 186, 211 }, {  30, 140, 196 }, {   4,  93, 146 },
+        {   1,  62, 102 }, {   1,  38,  65 }, {   1,  16,  27 }
+      }, {  // Band 4
+        {  47, 199, 217 }, {  14, 145, 196 }, {   1,  88, 142 },
+        {   1,  57,  98 }, {   1,  36,  62 }, {   1,  15,  26 }
+      }, {  // Band 5
+        {  26, 219, 229 }, {   5, 155, 207 }, {   1,  94, 151 },
+        {   1,  60, 104 }, {   1,  36,  62 }, {   1,  16,  28 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        { 233,  29, 248 }, { 146,  47, 220 }, {  43,  52, 140 }
+      }, {  // Band 1
+        { 100, 163, 232 }, { 179, 161, 222 }, {  63, 142, 204 },
+        {  37, 113, 174 }, {  26,  89, 137 }, {  18,  68,  97 }
+      }, {  // Band 2
+        {  85, 181, 230 }, {  32, 146, 209 }, {   7, 100, 164 },
+        {   3,  71, 121 }, {   1,  45,  77 }, {   1,  18,  30 }
+      }, {  // Band 3
+        {  65, 187, 230 }, {  20, 148, 207 }, {   2,  97, 159 },
+        {   1,  68, 116 }, {   1,  40,  70 }, {   1,  14,  29 }
+      }, {  // Band 4
+        {  40, 194, 227 }, {   8, 147, 204 }, {   1,  94, 155 },
+        {   1,  65, 112 }, {   1,  39,  66 }, {   1,  14,  26 }
+      }, {  // Band 5
+        {  16, 208, 228 }, {   3, 151, 207 }, {   1,  98, 160 },
+        {   1,  67, 117 }, {   1,  41,  74 }, {   1,  17,  31 }
+      }
+    }
+  }
+};
+
+static const vp10_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = {
+  {  // Y plane; inner order is [ref type][band][context][model node]
+    {  // Intra
+      {  // Band 0 (3 contexts; every other band has 6)
+        {  17,  38, 140 }, {   7,  34,  80 }, {   1,  17,  29 }
+      }, {  // Band 1
+        {  37,  75, 128 }, {  41,  76, 128 }, {  26,  66, 116 },
+        {  12,  52,  94 }, {   2,  32,  55 }, {   1,  10,  16 }
+      }, {  // Band 2
+        {  50, 127, 154 }, {  37, 109, 152 }, {  16,  82, 121 },
+        {   5,  59,  85 }, {   1,  35,  54 }, {   1,  13,  20 }
+      }, {  // Band 3
+        {  40, 142, 167 }, {  17, 110, 157 }, {   2,  71, 112 },
+        {   1,  44,  72 }, {   1,  27,  45 }, {   1,  11,  17 }
+      }, {  // Band 4
+        {  30, 175, 188 }, {   9, 124, 169 }, {   1,  74, 116 },
+        {   1,  48,  78 }, {   1,  30,  49 }, {   1,  11,  18 }
+      }, {  // Band 5
+        {  10, 222, 223 }, {   2, 150, 194 }, {   1,  83, 128 },
+        {   1,  48,  79 }, {   1,  27,  45 }, {   1,  11,  17 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        {  36,  41, 235 }, {  29,  36, 193 }, {  10,  27, 111 }
+      }, {  // Band 1
+        {  85, 165, 222 }, { 177, 162, 215 }, { 110, 135, 195 },
+        {  57, 113, 168 }, {  23,  83, 120 }, {  10,  49,  61 }
+      }, {  // Band 2
+        {  85, 190, 223 }, {  36, 139, 200 }, {   5,  90, 146 },
+        {   1,  60, 103 }, {   1,  38,  65 }, {   1,  18,  30 }
+      }, {  // Band 3
+        {  72, 202, 223 }, {  23, 141, 199 }, {   2,  86, 140 },
+        {   1,  56,  97 }, {   1,  36,  61 }, {   1,  16,  27 }
+      }, {  // Band 4
+        {  55, 218, 225 }, {  13, 145, 200 }, {   1,  86, 141 },
+        {   1,  57,  99 }, {   1,  35,  61 }, {   1,  13,  22 }
+      }, {  // Band 5
+        {  15, 235, 212 }, {   1, 132, 184 }, {   1,  84, 139 },
+        {   1,  57,  97 }, {   1,  34,  56 }, {   1,  14,  23 }
+      }
+    }
+  }, {  // UV plane
+    {  // Intra
+      {  // Band 0
+        { 181,  21, 201 }, {  61,  37, 123 }, {  10,  38,  71 }
+      }, {  // Band 1
+        {  47, 106, 172 }, {  95, 104, 173 }, {  42,  93, 159 },
+        {  18,  77, 131 }, {   4,  50,  81 }, {   1,  17,  23 }
+      }, {  // Band 2
+        {  62, 147, 199 }, {  44, 130, 189 }, {  28, 102, 154 },
+        {  18,  75, 115 }, {   2,  44,  65 }, {   1,  12,  19 }
+      }, {  // Band 3
+        {  55, 153, 210 }, {  24, 130, 194 }, {   3,  93, 146 },
+        {   1,  61,  97 }, {   1,  31,  50 }, {   1,  10,  16 }
+      }, {  // Band 4
+        {  49, 186, 223 }, {  17, 148, 204 }, {   1,  96, 142 },
+        {   1,  53,  83 }, {   1,  26,  44 }, {   1,  11,  17 }
+      }, {  // Band 5
+        {  13, 217, 212 }, {   2, 136, 180 }, {   1,  78, 124 },
+        {   1,  50,  83 }, {   1,  29,  49 }, {   1,  14,  23 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        { 197,  13, 247 }, {  82,  17, 222 }, {  25,  17, 162 }
+      }, {  // Band 1
+        { 126, 186, 247 }, { 234, 191, 243 }, { 176, 177, 234 },
+        { 104, 158, 220 }, {  66, 128, 186 }, {  55,  90, 137 }
+      }, {  // Band 2
+        { 111, 197, 242 }, {  46, 158, 219 }, {   9, 104, 171 },
+        {   2,  65, 125 }, {   1,  44,  80 }, {   1,  17,  91 }
+      }, {  // Band 3
+        { 104, 208, 245 }, {  39, 168, 224 }, {   3, 109, 162 },
+        {   1,  79, 124 }, {   1,  50, 102 }, {   1,  43, 102 }
+      }, {  // Band 4
+        {  84, 220, 246 }, {  31, 177, 231 }, {   2, 115, 180 },
+        {   1,  79, 134 }, {   1,  55,  77 }, {   1,  60,  79 }
+      }, {  // Band 5
+        {  43, 243, 240 }, {   8, 180, 217 }, {   1, 115, 166 },
+        {   1,  84, 121 }, {   1,  51,  67 }, {   1,  16,   6 }
+      }
+    }
+  }
+};
+
+static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) {
+  // TODO(aconverse): model[PIVOT_NODE] should never be zero (p == 0 below).
+  // https://code.google.com/p/webm/issues/detail?id=1089
+  const int row = (p != 0) ? p - 1 : 254;
+  memcpy(probs, vp10_pareto8_full[row], sizeof(*probs) * MODEL_NODES);
+}
+
+void vp10_model_to_full_probs(const vpx_prob *model, vpx_prob *full) {
+  // Copy the explicit nodes unless the caller is expanding in place.
+  if (model != full) memcpy(full, model, UNCONSTRAINED_NODES * sizeof(*full));
+  extend_to_full_distribution(full + UNCONSTRAINED_NODES, model[PIVOT_NODE]);
+}
+
+void vp10_default_coef_probs(VP10_COMMON *cm) {  // load built-in defaults
+  vp10_copy(cm->fc->coef_probs[TX_4X4], default_coef_probs_4x4);
+  vp10_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8);
+  vp10_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16);
+  vp10_copy(cm->fc->coef_probs[TX_32X32], default_coef_probs_32x32);
+}
+
+#define COEF_COUNT_SAT 24  // count_sat when neither case below applies
+#define COEF_MAX_UPDATE_FACTOR 112  // update_factor for that same case
+#define COEF_COUNT_SAT_KEY 24  // count_sat when frame_is_intra_only()
+#define COEF_MAX_UPDATE_FACTOR_KEY 112  // update_factor for that case
+#define COEF_COUNT_SAT_AFTER_KEY 24  // count_sat when last frame was a key
+#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128  // update_factor for that case
+
+static void adapt_coef_probs(VP10_COMMON *cm, TX_SIZE tx_size,
+                             unsigned int count_sat,
+                             unsigned int update_factor) {
+  // Merge the saved context's probabilities with this frame's counts.
+  const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+  const vp10_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size];
+  vp10_coeff_probs_model *const probs = cm->fc->coef_probs[tx_size];
+  vp10_coeff_count_model *counts = cm->counts.coef[tx_size];
+  unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
+      cm->counts.eob_branch[tx_size];
+  int plane, ref, band, ctx, node;
+
+  for (plane = 0; plane < PLANE_TYPES; ++plane)
+    for (ref = 0; ref < REF_TYPES; ++ref)
+      for (band = 0; band < COEF_BANDS; ++band)
+        for (ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx) {
+          const int eobs = counts[plane][ref][band][ctx][EOB_MODEL_TOKEN];
+          const int zeros = counts[plane][ref][band][ctx][ZERO_TOKEN];
+          const int ones = counts[plane][ref][band][ctx][ONE_TOKEN];
+          const int twos = counts[plane][ref][band][ctx][TWO_TOKEN];
+          const unsigned int branch_ct[UNCONSTRAINED_NODES][2] = {
+            { eobs, eob_counts[plane][ref][band][ctx] - eobs },
+            { zeros, ones + twos }, { ones, twos }
+          };
+          for (node = 0; node < UNCONSTRAINED_NODES; ++node)
+            probs[plane][ref][band][ctx][node] = merge_probs(
+                pre_probs[plane][ref][band][ctx][node], branch_ct[node],
+                count_sat, update_factor);
+        }
+}
+
+void vp10_adapt_coef_probs(VP10_COMMON *cm) {
+  // Start from the general settings, then override them for intra-only
+  // frames and for a frame whose predecessor was a key frame.
+  unsigned int update_factor = COEF_MAX_UPDATE_FACTOR;
+  unsigned int count_sat = COEF_COUNT_SAT;
+  TX_SIZE tx_size;
+
+  if (frame_is_intra_only(cm)) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
+    count_sat = COEF_COUNT_SAT_KEY;
+  } else if (cm->last_frame_type == KEY_FRAME) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY;  /* adapt quickly */
+    count_sat = COEF_COUNT_SAT_AFTER_KEY;
+  }
+  for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
+    adapt_coef_probs(cm, tx_size, count_sat, update_factor);
+}
diff --git a/libs/libvpx/vp10/common/entropy.h b/libs/libvpx/vp10/common/entropy.h
new file mode 100644
index 0000000000..9a471c8183
--- /dev/null
+++ b/libs/libvpx/vp10/common/entropy.h
@@ -0,0 +1,216 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_ENTROPY_H_
+#define VP10_COMMON_ENTROPY_H_
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/prob.h"
+
+#include "vp10/common/common.h"
+#include "vp10/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DIFF_UPDATE_PROB        252
+#define GROUP_DIFF_UPDATE_PROB  252
+
+// Coefficient token alphabet
+#define ZERO_TOKEN      0   // 0     Extra Bits 0+0
+#define ONE_TOKEN       1   // 1     Extra Bits 0+1
+#define TWO_TOKEN       2   // 2     Extra Bits 0+1
+#define THREE_TOKEN     3   // 3     Extra Bits 0+1
+#define FOUR_TOKEN      4   // 4     Extra Bits 0+1
+#define CATEGORY1_TOKEN 5   // 5-6   Extra Bits 1+1
+#define CATEGORY2_TOKEN 6   // 7-10  Extra Bits 2+1
+#define CATEGORY3_TOKEN 7   // 11-18 Extra Bits 3+1
+#define CATEGORY4_TOKEN 8   // 19-34 Extra Bits 4+1
+#define CATEGORY5_TOKEN 9   // 35-66 Extra Bits 5+1
+#define CATEGORY6_TOKEN 10  // 67+   Extra Bits 14+1
+#define EOB_TOKEN       11  // EOB   Extra Bits 0+0
+
+#define ENTROPY_TOKENS 12
+
+#define ENTROPY_NODES 11
+
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_pt_energy_class[ENTROPY_TOKENS]);
+
+#define CAT1_MIN_VAL    5
+#define CAT2_MIN_VAL    7
+#define CAT3_MIN_VAL   11
+#define CAT4_MIN_VAL   19
+#define CAT5_MIN_VAL   35
+#define CAT6_MIN_VAL   67
+
+// Extra bit probabilities.
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat1_prob[1]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat2_prob[2]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat3_prob[3]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat4_prob[4]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat5_prob[5]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat6_prob[14]);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat1_prob_high10[1]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat2_prob_high10[2]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat3_prob_high10[3]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat4_prob_high10[4]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat5_prob_high10[5]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat6_prob_high10[16]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat1_prob_high12[1]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat2_prob_high12[2]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat3_prob_high12[3]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat4_prob_high12[4]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat5_prob_high12[5]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_cat6_prob_high12[18]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#define EOB_MODEL_TOKEN 3
+
+typedef struct {  // one entry per token, see vp10_extra_bits[]
+  const vpx_tree_index *tree;
+  const vpx_prob *prob;  // presumably a vp10_catN_prob table -- TODO confirm
+  int len;               // presumably the number of extra bits -- TODO confirm
+  int base_val;          // presumably the token's CATn_MIN_VAL -- TODO confirm
+  const int16_t *cost;
+} vp10_extra_bit;
+
+// indexed by token value
+extern const vp10_extra_bit vp10_extra_bits[ENTROPY_TOKENS];
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const vp10_extra_bit vp10_extra_bits_high10[ENTROPY_TOKENS];
+extern const vp10_extra_bit vp10_extra_bits_high12[ENTROPY_TOKENS];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#define DCT_MAX_VALUE           16384
+#if CONFIG_VP9_HIGHBITDEPTH
+#define DCT_MAX_VALUE_HIGH10    65536
+#define DCT_MAX_VALUE_HIGH12   262144
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+/* Coefficients are predicted via a 3-dimensional probability table. */
+
+#define REF_TYPES 2  // intra=0, inter=1
+
+/* Middle dimension reflects the coefficient position within the transform. */
+#define COEF_BANDS 6
+
+/* Inside dimension is a measure of nearby complexity that reflects the energy
+   of nearby nonzero coefficients.  For the first coefficient (DC, unless
+   block type is 0), we look at the (already encoded) blocks above and to the
+   left of the current block.  The context index is then the number (0,1,or 2)
+   of these blocks having nonzero coefficients.
+   After decoding a coefficient, the measure is determined by the size of the
+   most recently decoded coefficient.
+   Note that the intuitive meaning of this measure changes as coefficients
+   are decoded, e.g., prior to the first token, a zero means that my neighbors
+   are empty while, after the first token, because of the use of end-of-block,
+   a zero means we just decoded a zero and hence guarantees that a non-zero
+   coefficient will appear later in this block.  However, this shift
+   in meaning is perfectly OK because our context depends also on the
+   coefficient band (and since zigzag positions 0, 1, and 2 are in
+   distinct bands). */
+
+#define COEFF_CONTEXTS 6
+#define BAND_COEFF_CONTEXTS(band) ((band) == 0 ? 3 : COEFF_CONTEXTS)
+
+// #define ENTROPY_STATS
+
+typedef unsigned int vp10_coeff_count[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
+                                    [ENTROPY_TOKENS];
+typedef unsigned int vp10_coeff_stats[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
+                                    [ENTROPY_NODES][2];
+
+#define SUBEXP_PARAM                4   /* Subexponential code parameter */
+#define MODULUS_PARAM               13  /* Modulus parameter */
+
+struct VP10Common;
+void vp10_default_coef_probs(struct VP10Common *cm);
+void vp10_adapt_coef_probs(struct VP10Common *cm);
+
+// This is the index in the scan order beyond which all coefficients for
+// 8x8 transform and above are in the top band.
+// This macro is currently unused but may be used by certain implementations
+#define MAXBAND_INDEX 21
+
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_coefband_trans_8x8plus[1024]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp10_coefband_trans_4x4[16]);
+
+static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
+  if (tx_size != TX_4X4) return vp10_coefband_trans_8x8plus;  // shared map
+  return vp10_coefband_trans_4x4;
+}
+
+// COEFF_PROB_MODELS lists of probabilities are stored, one for each value of
+// the pivot (ONE) node probability: 1, 2, 3, ..., 254, 255.
+// A list is looked up directly, so nothing is interpolated between lists.
+
+#define COEFF_PROB_MODELS 255
+
+#define UNCONSTRAINED_NODES         3
+
+#define PIVOT_NODE                  2   // which node is pivot
+
+#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
+extern const vpx_tree_index vp10_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
+extern const vpx_prob vp10_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
+
+typedef vpx_prob vp10_coeff_probs_model[REF_TYPES][COEF_BANDS]
+                                      [COEFF_CONTEXTS][UNCONSTRAINED_NODES];
+
+typedef unsigned int vp10_coeff_count_model[REF_TYPES][COEF_BANDS]
+                                          [COEFF_CONTEXTS]
+                                          [UNCONSTRAINED_NODES + 1];
+
+void vp10_model_to_full_probs(const vpx_prob *model, vpx_prob *full);
+
+typedef char ENTROPY_CONTEXT;
+
+static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
+                                           ENTROPY_CONTEXT b) {
+  return !!a + !!b;  // how many of the two contexts are nonzero: 0, 1 or 2
+}
+
+static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
+                                      const ENTROPY_CONTEXT *l) {
+  // Each flag becomes 1 when the context entries read for that side are not
+  // all zero.  NOTE(review): wide loads assume a/l are aligned -- confirm.
+  int above_nz = 0, left_nz = 0;
+  switch (tx_size) {
+    case TX_4X4:  // a single entry per side
+      above_nz = (a[0] != 0);
+      left_nz = (l[0] != 0);
+      break;
+    case TX_8X8:  // entries read as one uint16_t
+      above_nz = (*(const uint16_t *)a != 0);
+      left_nz = (*(const uint16_t *)l != 0);
+      break;
+    case TX_16X16:  // entries read as one uint32_t
+      above_nz = (*(const uint32_t *)a != 0);
+      left_nz = (*(const uint32_t *)l != 0);
+      break;
+    case TX_32X32:  // entries read as one uint64_t
+      above_nz = (*(const uint64_t *)a != 0);
+      left_nz = (*(const uint64_t *)l != 0);
+      break;
+    default:
+      assert(0 && "Invalid transform size.");
+      break;
+  }
+  return above_nz + left_nz;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_ENTROPY_H_
diff --git a/libs/libvpx/vp10/common/entropymode.c b/libs/libvpx/vp10/common/entropymode.c
new file mode 100644
index 0000000000..78f3650f8b
--- /dev/null
+++ b/libs/libvpx/vp10/common/entropymode.c
@@ -0,0 +1,551 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/seg_common.h"
+
+const vpx_prob vp10_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
+  {  // above = dc
+    { 137,  30,  42, 148, 151, 207,  70,  52,  91 },  // left = dc
+    {  92,  45, 102, 136, 116, 180,  74,  90, 100 },  // left = v
+    {  73,  32,  19, 187, 222, 215,  46,  34, 100 },  // left = h
+    {  91,  30,  32, 116, 121, 186,  93,  86,  94 },  // left = d45
+    {  72,  35,  36, 149,  68, 206,  68,  63, 105 },  // left = d135
+    {  73,  31,  28, 138,  57, 124,  55, 122, 151 },  // left = d117
+    {  67,  23,  21, 140, 126, 197,  40,  37, 171 },  // left = d153
+    {  86,  27,  28, 128, 154, 212,  45,  43,  53 },  // left = d207
+    {  74,  32,  27, 107,  86, 160,  63, 134, 102 },  // left = d63
+    {  59,  67,  44, 140, 161, 202,  78,  67, 119 }   // left = tm
+  }, {  // above = v
+    {  63,  36, 126, 146, 123, 158,  60,  90,  96 },  // left = dc
+    {  43,  46, 168, 134, 107, 128,  69, 142,  92 },  // left = v
+    {  44,  29,  68, 159, 201, 177,  50,  57,  77 },  // left = h
+    {  58,  38,  76, 114,  97, 172,  78, 133,  92 },  // left = d45
+    {  46,  41,  76, 140,  63, 184,  69, 112,  57 },  // left = d135
+    {  38,  32,  85, 140,  46, 112,  54, 151, 133 },  // left = d117
+    {  39,  27,  61, 131, 110, 175,  44,  75, 136 },  // left = d153
+    {  52,  30,  74, 113, 130, 175,  51,  64,  58 },  // left = d207
+    {  47,  35,  80, 100,  74, 143,  64, 163,  74 },  // left = d63
+    {  36,  61, 116, 114, 128, 162,  80, 125,  82 }   // left = tm
+  }, {  // above = h
+    {  82,  26,  26, 171, 208, 204,  44,  32, 105 },  // left = dc
+    {  55,  44,  68, 166, 179, 192,  57,  57, 108 },  // left = v
+    {  42,  26,  11, 199, 241, 228,  23,  15,  85 },  // left = h
+    {  68,  42,  19, 131, 160, 199,  55,  52,  83 },  // left = d45
+    {  58,  50,  25, 139, 115, 232,  39,  52, 118 },  // left = d135
+    {  50,  35,  33, 153, 104, 162,  64,  59, 131 },  // left = d117
+    {  44,  24,  16, 150, 177, 202,  33,  19, 156 },  // left = d153
+    {  55,  27,  12, 153, 203, 218,  26,  27,  49 },  // left = d207
+    {  53,  49,  21, 110, 116, 168,  59,  80,  76 },  // left = d63
+    {  38,  72,  19, 168, 203, 212,  50,  50, 107 }   // left = tm
+  }, {  // above = d45
+    { 103,  26,  36, 129, 132, 201,  83,  80,  93 },  // left = dc
+    {  59,  38,  83, 112, 103, 162,  98, 136,  90 },  // left = v
+    {  62,  30,  23, 158, 200, 207,  59,  57,  50 },  // left = h
+    {  67,  30,  29,  84,  86, 191, 102,  91,  59 },  // left = d45
+    {  60,  32,  33, 112,  71, 220,  64,  89, 104 },  // left = d135
+    {  53,  26,  34, 130,  56, 149,  84, 120, 103 },  // left = d117
+    {  53,  21,  23, 133, 109, 210,  56,  77, 172 },  // left = d153
+    {  77,  19,  29, 112, 142, 228,  55,  66,  36 },  // left = d207
+    {  61,  29,  29,  93,  97, 165,  83, 175, 162 },  // left = d63
+    {  47,  47,  43, 114, 137, 181, 100,  99,  95 }   // left = tm
+  }, {  // above = d135
+    {  69,  23,  29, 128,  83, 199,  46,  44, 101 },  // left = dc
+    {  53,  40,  55, 139,  69, 183,  61,  80, 110 },  // left = v
+    {  40,  29,  19, 161, 180, 207,  43,  24,  91 },  // left = h
+    {  60,  34,  19, 105,  61, 198,  53,  64,  89 },  // left = d45
+    {  52,  31,  22, 158,  40, 209,  58,  62,  89 },  // left = d135
+    {  44,  31,  29, 147,  46, 158,  56, 102, 198 },  // left = d117
+    {  35,  19,  12, 135,  87, 209,  41,  45, 167 },  // left = d153
+    {  55,  25,  21, 118,  95, 215,  38,  39,  66 },  // left = d207
+    {  51,  38,  25, 113,  58, 164,  70,  93,  97 },  // left = d63
+    {  47,  54,  34, 146, 108, 203,  72, 103, 151 }   // left = tm
+  }, {  // above = d117
+    {  64,  19,  37, 156,  66, 138,  49,  95, 133 },  // left = dc
+    {  46,  27,  80, 150,  55, 124,  55, 121, 135 },  // left = v
+    {  36,  23,  27, 165, 149, 166,  54,  64, 118 },  // left = h
+    {  53,  21,  36, 131,  63, 163,  60, 109,  81 },  // left = d45
+    {  40,  26,  35, 154,  40, 185,  51,  97, 123 },  // left = d135
+    {  35,  19,  34, 179,  19,  97,  48, 129, 124 },  // left = d117
+    {  36,  20,  26, 136,  62, 164,  33,  77, 154 },  // left = d153
+    {  45,  18,  32, 130,  90, 157,  40,  79,  91 },  // left = d207
+    {  45,  26,  28, 129,  45, 129,  49, 147, 123 },  // left = d63
+    {  38,  44,  51, 136,  74, 162,  57,  97, 121 }   // left = tm
+  }, {  // above = d153
+    {  75,  17,  22, 136, 138, 185,  32,  34, 166 },  // left = dc
+    {  56,  39,  58, 133, 117, 173,  48,  53, 187 },  // left = v
+    {  35,  21,  12, 161, 212, 207,  20,  23, 145 },  // left = h
+    {  56,  29,  19, 117, 109, 181,  55,  68, 112 },  // left = d45
+    {  47,  29,  17, 153,  64, 220,  59,  51, 114 },  // left = d135
+    {  46,  16,  24, 136,  76, 147,  41,  64, 172 },  // left = d117
+    {  34,  17,  11, 108, 152, 187,  13,  15, 209 },  // left = d153
+    {  51,  24,  14, 115, 133, 209,  32,  26, 104 },  // left = d207
+    {  55,  30,  18, 122,  79, 179,  44,  88, 116 },  // left = d63
+    {  37,  49,  25, 129, 168, 164,  41,  54, 148 }   // left = tm
+  }, {  // above = d207
+    {  82,  22,  32, 127, 143, 213,  39,  41,  70 },  // left = dc
+    {  62,  44,  61, 123, 105, 189,  48,  57,  64 },  // left = v
+    {  47,  25,  17, 175, 222, 220,  24,  30,  86 },  // left = h
+    {  68,  36,  17, 106, 102, 206,  59,  74,  74 },  // left = d45
+    {  57,  39,  23, 151,  68, 216,  55,  63,  58 },  // left = d135
+    {  49,  30,  35, 141,  70, 168,  82,  40, 115 },  // left = d117
+    {  51,  25,  15, 136, 129, 202,  38,  35, 139 },  // left = d153
+    {  68,  26,  16, 111, 141, 215,  29,  28,  28 },  // left = d207
+    {  59,  39,  19, 114,  75, 180,  77, 104,  42 },  // left = d63
+    {  40,  61,  26, 126, 152, 206,  61,  59,  93 }   // left = tm
+  }, {  // above = d63
+    {  78,  23,  39, 111, 117, 170,  74, 124,  94 },  // left = dc
+    {  48,  34,  86, 101,  92, 146,  78, 179, 134 },  // left = v
+    {  47,  22,  24, 138, 187, 178,  68,  69,  59 },  // left = h
+    {  56,  25,  33, 105, 112, 187,  95, 177, 129 },  // left = d45
+    {  48,  31,  27, 114,  63, 183,  82, 116,  56 },  // left = d135
+    {  43,  28,  37, 121,  63, 123,  61, 192, 169 },  // left = d117
+    {  42,  17,  24, 109,  97, 177,  56,  76, 122 },  // left = d153
+    {  58,  18,  28, 105, 139, 182,  70,  92,  63 },  // left = d207
+    {  46,  23,  32,  74,  86, 150,  67, 183,  88 },  // left = d63
+    {  36,  38,  48,  92, 122, 165,  88, 137,  91 }   // left = tm
+  }, {  // above = tm
+    {  65,  70,  60, 155, 159, 199,  61,  60,  81 },  // left = dc
+    {  44,  78, 115, 132, 119, 173,  71, 112,  93 },  // left = v
+    {  39,  38,  21, 184, 227, 206,  42,  32,  64 },  // left = h
+    {  58,  47,  36, 124, 137, 193,  80,  82,  78 },  // left = d45
+    {  49,  50,  35, 144,  95, 205,  63,  78,  59 },  // left = d135
+    {  41,  53,  52, 148,  71, 142,  65, 128,  51 },  // left = d117
+    {  40,  36,  28, 143, 143, 202,  40,  55, 137 },  // left = d153
+    {  52,  34,  29, 129, 183, 227,  42,  35,  43 },  // left = d207
+    {  42,  44,  44, 104, 105, 164,  64, 130,  80 },  // left = d63
+    {  43,  81,  53, 140, 169, 204,  68,  84,  72 }   // left = tm
+  }
+};
+
+#if !CONFIG_MISC_FIXES
+const vpx_prob vp10_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
+  { 144,  11,  54, 157, 195, 130,  46,  58, 108 },  // y = dc
+  { 118,  15, 123, 148, 131, 101,  44,  93, 131 },  // y = v
+  { 113,  12,  23, 188, 226, 142,  26,  32, 125 },  // y = h
+  { 120,  11,  50, 123, 163, 135,  64,  77, 103 },  // y = d45
+  { 113,   9,  36, 155, 111, 157,  32,  44, 161 },  // y = d135
+  { 116,   9,  55, 176,  76,  96,  37,  61, 149 },  // y = d117
+  { 115,   9,  28, 141, 161, 167,  21,  25, 193 },  // y = d153
+  { 120,  12,  32, 145, 195, 142,  32,  38,  86 },  // y = d207
+  { 116,  12,  64, 120, 140, 125,  49, 115, 121 },  // y = d63
+  { 102,  19,  66, 162, 182, 122,  35,  59, 128 }   // y = tm
+};
+#endif
+
+static const vpx_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
+  {  65,  32,  18, 144, 162, 194,  41,  51,  98 },  // block_size < 8x8
+  { 132,  68,  18, 165, 217, 196,  45,  40,  78 },  // block_size < 16x16
+  { 173,  80,  19, 176, 240, 193,  64,  35,  46 },  // block_size < 32x32
+  { 221, 135,  38, 194, 248, 121,  96,  85,  29 }   // block_size >= 32x32
+};
+
+static const vpx_prob default_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
+  { 120,   7,  76, 176, 208, 126,  28,  54, 103 },  // y = dc
+  {  48,  12, 154, 155, 139,  90,  34, 117, 119 },  // y = v
+  {  67,   6,  25, 204, 243, 158,  13,  21,  96 },  // y = h
+  {  97,   5,  44, 131, 176, 139,  48,  68,  97 },  // y = d45
+  {  83,   5,  42, 156, 111, 152,  26,  49, 152 },  // y = d135
+  {  80,   5,  58, 178,  74,  83,  33,  62, 145 },  // y = d117
+  {  86,   5,  32, 154, 192, 168,  14,  22, 163 },  // y = d153
+  {  85,   5,  32, 156, 216, 148,  19,  29,  73 },  // y = d207
+  {  77,   7,  64, 116, 132, 122,  37, 126, 120 },  // y = d63
+  { 101,  21, 107, 181, 192, 103,  19,  67, 125 }   // y = tm
+};
+
+#if !CONFIG_MISC_FIXES
+const vpx_prob vp10_kf_partition_probs[PARTITION_CONTEXTS]
+                                     [PARTITION_TYPES - 1] = {
+  // 8x8 -> 4x4
+  { 158,  97,  94 },  // a/l both not split
+  {  93,  24,  99 },  // a split, l not split
+  {  85, 119,  44 },  // l split, a not split
+  {  62,  59,  67 },  // a/l both split
+  // 16x16 -> 8x8
+  { 149,  53,  53 },  // a/l both not split
+  {  94,  20,  48 },  // a split, l not split
+  {  83,  53,  24 },  // l split, a not split
+  {  52,  18,  18 },  // a/l both split
+  // 32x32 -> 16x16
+  { 150,  40,  39 },  // a/l both not split
+  {  78,  12,  26 },  // a split, l not split
+  {  67,  33,  11 },  // l split, a not split
+  {  24,   7,   5 },  // a/l both split
+  // 64x64 -> 32x32
+  { 174,  35,  49 },  // a/l both not split
+  {  68,  11,  27 },  // a split, l not split
+  {  57,  15,   9 },  // l split, a not split
+  {  12,   3,   3 },  // a/l both split
+};
+#endif
+
+static const vpx_prob default_partition_probs[PARTITION_CONTEXTS]
+                                             [PARTITION_TYPES - 1] = {
+  // 8x8 -> 4x4
+  { 199, 122, 141 },  // a/l both not split
+  { 147,  63, 159 },  // a split, l not split
+  { 148, 133, 118 },  // l split, a not split
+  { 121, 104, 114 },  // a/l both split
+  // 16x16 -> 8x8
+  { 174,  73,  87 },  // a/l both not split
+  {  92,  41,  83 },  // a split, l not split
+  {  82,  99,  50 },  // l split, a not split
+  {  53,  39,  39 },  // a/l both split
+  // 32x32 -> 16x16
+  { 177,  58,  59 },  // a/l both not split
+  {  68,  26,  63 },  // a split, l not split
+  {  52,  79,  25 },  // l split, a not split
+  {  17,  14,  12 },  // a/l both split
+  // 64x64 -> 32x32
+  { 222,  34,  30 },  // a/l both not split
+  {  72,  16,  44 },  // a split, l not split
+  {  58,  32,  12 },  // l split, a not split
+  {  10,   7,   6 },  // a/l both split
+};
+
+static const vpx_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
+                                              [INTER_MODES - 1] = {
+  {2,       173,   34},  // 0 = both zero mv
+  {7,       145,   85},  // 1 = one zero mv + one a predicted mv
+  {7,       166,   63},  // 2 = two predicted mvs
+  {7,       94,    66},  // 3 = one predicted/zero and one new mv
+  {8,       64,    46},  // 4 = two new mvs
+  {17,      81,    31},  // 5 = one intra neighbour + x
+  {25,      29,    30},  // 6 = two intra neighbours
+};
+
+/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+const vpx_tree_index vp10_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
+  -DC_PRED, 2,                      /* 0 = DC_NODE */
+  -TM_PRED, 4,                      /* 1 = TM_NODE */
+  -V_PRED, 6,                       /* 2 = V_NODE */
+  8, 12,                            /* 3 = COM_NODE */
+  -H_PRED, 10,                      /* 4 = H_NODE */
+  -D135_PRED, -D117_PRED,           /* 5 = D135_NODE */
+  -D45_PRED, 14,                    /* 6 = D45_NODE */
+  -D63_PRED, 16,                    /* 7 = D63_NODE */
+  -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
+};
+
+const vpx_tree_index vp10_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
+  -INTER_OFFSET(ZEROMV), 2,
+  -INTER_OFFSET(NEARESTMV), 4,
+  -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
+};
+
+const vpx_tree_index vp10_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
+  -PARTITION_NONE, 2,
+  -PARTITION_HORZ, 4,
+  -PARTITION_VERT, -PARTITION_SPLIT
+};
+
+static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
+  9, 102, 187, 225
+};
+
+static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
+  239, 183, 119,  96,  41
+};
+
+static const vpx_prob default_comp_ref_p[REF_CONTEXTS] = {
+  50, 126, 123, 221, 226
+};
+
+static const vpx_prob default_single_ref_p[REF_CONTEXTS][2] = {
+  {  33,  16 },
+  {  77,  74 },
+  { 142, 142 },
+  { 172, 170 },
+  { 238, 247 }
+};
+
+static const struct tx_probs default_tx_probs = {
+  { { 3, 136, 37 },
+    { 5, 52,  13 } },
+
+  { { 20, 152 },
+    { 15, 101 } },
+
+  { { 100 },
+    { 66  } }
+};
+
+// Folds per-size counts into {this size, any larger size} pairs, one pair per
+// branch decision (4x4?, 8x8?, 16x16?). Built from the last branch upwards so
+// each "larger" total is simply the sum of the pair below it.
+void vp10_tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
+                                      unsigned int (*ct_32x32p)[2]) {
+  ct_32x32p[2][0] = tx_count_32x32p[TX_16X16];
+  ct_32x32p[2][1] = tx_count_32x32p[TX_32X32];
+  ct_32x32p[1][0] = tx_count_32x32p[TX_8X8];
+  ct_32x32p[1][1] = ct_32x32p[2][0] + ct_32x32p[2][1];
+  ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
+  ct_32x32p[0][1] = ct_32x32p[1][0] + ct_32x32p[1][1];
+}
+
+void vp10_tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
+                                      unsigned int (*ct_16x16p)[2]) {
+  ct_16x16p[1][0] = tx_count_16x16p[TX_8X8];    // branch 1: stop at 8x8 ...
+  ct_16x16p[1][1] = tx_count_16x16p[TX_16X16];  // ... or go on to 16x16
+  ct_16x16p[0][0] = tx_count_16x16p[TX_4X4];    // branch 0: stop at 4x4 ...
+  ct_16x16p[0][1] = ct_16x16p[1][0] + ct_16x16p[1][1];  // ... or larger
+}
+
+void vp10_tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
+                                    unsigned int (*ct_8x8p)[2]) {
+  (*ct_8x8p)[0] = tx_count_8x8p[TX_4X4];  // the single branch: 4x4 ...
+  (*ct_8x8p)[1] = tx_count_8x8p[TX_8X8];  // ... versus 8x8
+}
+
+static const vpx_prob default_skip_probs[SKIP_CONTEXTS] = {
+  192, 128, 64
+};
+
+static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+                                                    [SWITCHABLE_FILTERS - 1] = {
+  { 235, 162, },
+  { 36, 255, },
+  { 34, 3, },
+  { 149, 144, },
+};
+
+#if CONFIG_MISC_FIXES
+// FIXME(someone) need real defaults here
+static const struct segmentation_probs default_seg_probs = {
+  { 128, 128, 128, 128, 128, 128, 128 },
+  { 128, 128, 128 },
+};
+#endif
+
+const vpx_tree_index vp10_ext_tx_tree[TREE_SIZE(TX_TYPES)] = {
+  -DCT_DCT, 2,
+  -ADST_ADST, 4,
+  -ADST_DCT, -DCT_ADST
+};
+
+static const vpx_prob default_intra_ext_tx_prob[EXT_TX_SIZES]
+                                               [TX_TYPES][TX_TYPES - 1] = {
+  {{240, 85, 128}, {4, 1, 248}, {4, 1, 8}, {4, 248, 128}},
+  {{244, 85, 128}, {8, 2, 248}, {8, 2, 8}, {8, 248, 128}},
+  {{248, 85, 128}, {16, 4, 248}, {16, 4, 8}, {16, 248, 128}},
+};
+
+static const vpx_prob default_inter_ext_tx_prob[EXT_TX_SIZES]
+                                               [TX_TYPES - 1] = {
+  {160, 85, 128},
+  {176, 85, 128},
+  {192, 85, 128},
+};
+
+static void init_mode_probs(FRAME_CONTEXT *fc) {  // fill *fc with the defaults
+  vp10_copy(fc->uv_mode_prob, default_uv_probs);
+  vp10_copy(fc->y_mode_prob, default_if_y_probs);
+  vp10_copy(fc->switchable_interp_prob, default_switchable_interp_prob);
+  vp10_copy(fc->partition_prob, default_partition_probs);
+  vp10_copy(fc->intra_inter_prob, default_intra_inter_p);
+  vp10_copy(fc->comp_inter_prob, default_comp_inter_p);
+  vp10_copy(fc->comp_ref_prob, default_comp_ref_p);
+  vp10_copy(fc->single_ref_prob, default_single_ref_p);
+  fc->tx_probs = default_tx_probs;  // a struct, so plain assignment copies it
+  vp10_copy(fc->skip_probs, default_skip_probs);
+  vp10_copy(fc->inter_mode_probs, default_inter_mode_probs);
+#if CONFIG_MISC_FIXES  // FRAME_CONTEXT has a seg member only in this config
+  vp10_copy(fc->seg.tree_probs, default_seg_probs.tree_probs);
+  vp10_copy(fc->seg.pred_probs, default_seg_probs.pred_probs);
+#endif
+  vp10_copy(fc->intra_ext_tx_prob, default_intra_ext_tx_prob);
+  vp10_copy(fc->inter_ext_tx_prob, default_inter_ext_tx_prob);
+}
+
+const vpx_tree_index vp10_switchable_interp_tree
+                         [TREE_SIZE(SWITCHABLE_FILTERS)] = {
+  -EIGHTTAP, 2,
+  -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
+};
+
+void vp10_adapt_inter_frame_probs(VP10_COMMON *cm) {
+  int i, j;
+  FRAME_CONTEXT *fc = cm->fc;
+  const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+  const FRAME_COUNTS *counts = &cm->counts;
+  // Each fc entry below = pre_fc entry merged with the matching cm->counts.
+  for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+    fc->intra_inter_prob[i] = mode_mv_merge_probs(pre_fc->intra_inter_prob[i],
+                                                  counts->intra_inter[i]);
+  for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+    fc->comp_inter_prob[i] = mode_mv_merge_probs(pre_fc->comp_inter_prob[i],
+                                                 counts->comp_inter[i]);
+  for (i = 0; i < REF_CONTEXTS; i++)
+    fc->comp_ref_prob[i] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i],
+                                               counts->comp_ref[i]);
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      fc->single_ref_prob[i][j] = mode_mv_merge_probs(
+          pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
+  // Tree-coded symbols: one vpx_tree_merge_probs() call per probability set.
+  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+    vpx_tree_merge_probs(vp10_inter_mode_tree, pre_fc->inter_mode_probs[i],
+                counts->inter_mode[i], fc->inter_mode_probs[i]);
+
+  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
+    vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->y_mode_prob[i],
+                counts->y_mode[i], fc->y_mode_prob[i]);
+  // With CONFIG_MISC_FIXES, vp10_adapt_intra_frame_probs() merges these two.
+#if !CONFIG_MISC_FIXES
+  for (i = 0; i < INTRA_MODES; ++i)
+    vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->uv_mode_prob[i],
+                         counts->uv_mode[i], fc->uv_mode_prob[i]);
+
+  for (i = 0; i < PARTITION_CONTEXTS; i++)
+    vpx_tree_merge_probs(vp10_partition_tree, pre_fc->partition_prob[i],
+                         counts->partition[i], fc->partition_prob[i]);
+#endif
+
+  if (cm->interp_filter == SWITCHABLE) {
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+      vpx_tree_merge_probs(vp10_switchable_interp_tree,
+                           pre_fc->switchable_interp_prob[i],
+                           counts->switchable_interp[i],
+                           fc->switchable_interp_prob[i]);
+  }
+}
+
+void vp10_adapt_intra_frame_probs(VP10_COMMON *cm) {
+  int i;
+  FRAME_CONTEXT *fc = cm->fc;
+  const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+  const FRAME_COUNTS *counts = &cm->counts;
+  // Each fc entry below = pre_fc entry merged with the matching cm->counts.
+  if (cm->tx_mode == TX_MODE_SELECT) {
+    int j;
+    unsigned int branch_ct_8x8p[TX_SIZES - 3][2];
+    unsigned int branch_ct_16x16p[TX_SIZES - 2][2];
+    unsigned int branch_ct_32x32p[TX_SIZES - 1][2];
+    // Per-size counts are folded into per-branch count pairs before merging.
+    for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+      vp10_tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
+      for (j = 0; j < TX_SIZES - 3; ++j)
+        fc->tx_probs.p8x8[i][j] = mode_mv_merge_probs(
+            pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]);
+
+      vp10_tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
+      for (j = 0; j < TX_SIZES - 2; ++j)
+        fc->tx_probs.p16x16[i][j] = mode_mv_merge_probs(
+            pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]);
+
+      vp10_tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
+      for (j = 0; j < TX_SIZES - 1; ++j)
+        fc->tx_probs.p32x32[i][j] = mode_mv_merge_probs(
+            pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]);
+    }
+  }
+
+  for (i = 0; i < SKIP_CONTEXTS; ++i)
+    fc->skip_probs[i] = mode_mv_merge_probs(
+        pre_fc->skip_probs[i], counts->skip[i]);
+  // Ext-tx probs: intra keeps TX_TYPES sets per size index, inter keeps one.
+  for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+    int j;
+    for (j = 0; j < TX_TYPES; ++j)
+      vpx_tree_merge_probs(vp10_ext_tx_tree,
+                           pre_fc->intra_ext_tx_prob[i][j],
+                           counts->intra_ext_tx[i][j],
+                           fc->intra_ext_tx_prob[i][j]);
+  }
+  for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+    vpx_tree_merge_probs(vp10_ext_tx_tree,
+                         pre_fc->inter_ext_tx_prob[i],
+                         counts->inter_ext_tx[i],
+                         fc->inter_ext_tx_prob[i]);
+  }
+
+#if CONFIG_MISC_FIXES
+  if (cm->seg.temporal_update) {
+    for (i = 0; i < PREDICTION_PROBS; i++)
+      fc->seg.pred_probs[i] = mode_mv_merge_probs(pre_fc->seg.pred_probs[i],
+                                                  counts->seg.pred[i]);
+    // Segment tree: tree_mispred counts here, tree_total counts otherwise.
+    vpx_tree_merge_probs(vp10_segment_tree, pre_fc->seg.tree_probs,
+                         counts->seg.tree_mispred, fc->seg.tree_probs);
+  } else {
+    vpx_tree_merge_probs(vp10_segment_tree, pre_fc->seg.tree_probs,
+                         counts->seg.tree_total, fc->seg.tree_probs);
+  }
+  // Without CONFIG_MISC_FIXES, vp10_adapt_inter_frame_probs() merges these.
+  for (i = 0; i < INTRA_MODES; ++i)
+    vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->uv_mode_prob[i],
+                         counts->uv_mode[i], fc->uv_mode_prob[i]);
+
+  for (i = 0; i < PARTITION_CONTEXTS; i++)
+    vpx_tree_merge_probs(vp10_partition_tree, pre_fc->partition_prob[i],
+                         counts->partition[i], fc->partition_prob[i]);
+#endif
+}
+
+// Turns mode/ref deltas on, marks them updated and loads the default values.
+static void set_default_lf_deltas(struct loopfilter *lf) {
+  lf->mode_ref_delta_enabled = lf->mode_ref_delta_update = 1;
+
+  // Per-reference deltas: +1 for intra, 0 for last, -1 for golden and altref.
+  lf->ref_deltas[INTRA_FRAME] = 1;
+  lf->ref_deltas[LAST_FRAME] = 0;
+  lf->ref_deltas[GOLDEN_FRAME] = lf->ref_deltas[ALTREF_FRAME] = -1;
+
+  // Mode deltas 0 and 1 both start at zero.
+  lf->mode_deltas[0] = lf->mode_deltas[1] = 0;
+}
+
+void vp10_setup_past_independence(VP10_COMMON *cm) {
+  // Puts cross-frame state back to its defaults: segmentation, loop-filter
+  // deltas, probability contexts and (if allocated) the seg maps and prev_mip.
+  struct loopfilter *const lf = &cm->lf;
+
+  int i;
+  vp10_clearall_segfeatures(&cm->seg);    // segment features: all disabled
+  cm->seg.abs_delta = SEGMENT_DELTADATA;  // delta (not absolute) coding
+
+  if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+    memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+  if (cm->current_frame_seg_map)
+    memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+  // Reset the mode ref deltas for loop filter
+  vp10_zero(lf->last_ref_deltas);
+  vp10_zero(lf->last_mode_deltas);
+  set_default_lf_deltas(lf);
+
+  // To force update of the sharpness
+  lf->last_sharpness_level = -1;
+  // Load the default coefficient, mode and mv probabilities.
+  vp10_default_coef_probs(cm);
+  init_mode_probs(cm->fc);
+  vp10_init_mv_probs(cm);
+  cm->fc->initialized = 1;
+
+  if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
+      cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
+    // Reset all frame contexts.
+    for (i = 0; i < FRAME_CONTEXTS; ++i)
+      cm->frame_contexts[i] = *cm->fc;
+  } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
+    // Reset only the frame context specified in the frame header.
+    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+  }
+
+  // prev_mip will only be allocated in encoder.
+  if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode)
+    memset(cm->prev_mip, 0,
+           cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip));
+
+  cm->frame_context_idx = 0;
+}
diff --git a/libs/libvpx/vp10/common/entropymode.h b/libs/libvpx/vp10/common/entropymode.h
new file mode 100644
index 0000000000..611d3ad132
--- /dev/null
+++ b/libs/libvpx/vp10/common/entropymode.h
@@ -0,0 +1,142 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_ENTROPYMODE_H_
+#define VP10_COMMON_ENTROPYMODE_H_
+
+#include "vp10/common/entropy.h"
+#include "vp10/common/entropymv.h"
+#include "vp10/common/filter.h"
+#include "vp10/common/seg_common.h"
+#include "vpx_dsp/vpx_filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLOCK_SIZE_GROUPS 4
+
+#define TX_SIZE_CONTEXTS 2
+
+#define INTER_OFFSET(mode) ((mode) - NEARESTMV)
+
+struct VP10Common;
+
+struct tx_probs {
+  vpx_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
+  vpx_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
+  vpx_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
+};
+
+struct tx_counts {
+  unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES];
+  unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1];
+  unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
+  unsigned int tx_totals[TX_SIZES];
+};
+
+struct seg_counts {
+  unsigned int tree_total[MAX_SEGMENTS];
+  unsigned int tree_mispred[MAX_SEGMENTS];
+  unsigned int pred[PREDICTION_PROBS][2];
+};
+
+typedef struct frame_contexts {
+  vpx_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
+  vpx_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+  vpx_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+  vp10_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
+  vpx_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+                                 [SWITCHABLE_FILTERS - 1];
+  vpx_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
+  vpx_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
+  vpx_prob comp_inter_prob[COMP_INTER_CONTEXTS];
+  vpx_prob single_ref_prob[REF_CONTEXTS][2];
+  vpx_prob comp_ref_prob[REF_CONTEXTS];
+  struct tx_probs tx_probs;
+  vpx_prob skip_probs[SKIP_CONTEXTS];
+  nmv_context nmvc;
+#if CONFIG_MISC_FIXES
+  struct segmentation_probs seg;
+#endif
+  vpx_prob intra_ext_tx_prob[EXT_TX_SIZES][TX_TYPES][TX_TYPES - 1];
+  vpx_prob inter_ext_tx_prob[EXT_TX_SIZES][TX_TYPES - 1];
+  int initialized;
+} FRAME_CONTEXT;
+
+typedef struct FRAME_COUNTS {
+  unsigned int kf_y_mode[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+  unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
+  unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
+  unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
+  vp10_coeff_count_model coef[TX_SIZES][PLANE_TYPES];
+  unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES]
+                         [COEF_BANDS][COEFF_CONTEXTS];
+  unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
+                                [SWITCHABLE_FILTERS];
+  unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
+  unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
+  unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
+  unsigned int single_ref[REF_CONTEXTS][2][2];
+  unsigned int comp_ref[REF_CONTEXTS][2];
+  struct tx_counts tx;
+  unsigned int skip[SKIP_CONTEXTS][2];
+  nmv_context_counts mv;
+#if CONFIG_MISC_FIXES
+  struct seg_counts seg;
+#endif
+  unsigned int intra_ext_tx[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
+  unsigned int inter_ext_tx[EXT_TX_SIZES][TX_TYPES];
+} FRAME_COUNTS;
+
+extern const vpx_prob vp10_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
+                                        [INTRA_MODES - 1];
+#if !CONFIG_MISC_FIXES
+extern const vpx_prob vp10_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+extern const vpx_prob vp10_kf_partition_probs[PARTITION_CONTEXTS]
+                                            [PARTITION_TYPES - 1];
+#endif
+
+extern const vpx_tree_index vp10_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
+extern const vpx_tree_index vp10_inter_mode_tree[TREE_SIZE(INTER_MODES)];
+extern const vpx_tree_index vp10_partition_tree[TREE_SIZE(PARTITION_TYPES)];
+extern const vpx_tree_index vp10_switchable_interp_tree
+                                [TREE_SIZE(SWITCHABLE_FILTERS)];
+
+
+void vp10_setup_past_independence(struct VP10Common *cm);
+
+void vp10_adapt_intra_frame_probs(struct VP10Common *cm);
+void vp10_adapt_inter_frame_probs(struct VP10Common *cm);
+
+void vp10_tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
+                                      unsigned int (*ct_32x32p)[2]);
+void vp10_tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
+                                      unsigned int (*ct_16x16p)[2]);
+void vp10_tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
+                                    unsigned int (*ct_8x8p)[2]);
+
+extern const vpx_tree_index
+    vp10_ext_tx_tree[TREE_SIZE(TX_TYPES)];
+
+// Returns the smallest i >= 1 such that (1 << i) >= n (so 1 for any n <= 2).
+static INLINE int vp10_ceil_log2(int n) {
+  // p is unsigned: a signed `p << 1` overflows (UB) once n exceeds 1 << 30.
+  unsigned int p = 2;
+  int i = 1;
+  for (; n > 0 && p < (unsigned int)n; p <<= 1) ++i;
+  return i;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_ENTROPYMODE_H_
diff --git a/libs/libvpx/vp10/common/entropymv.c b/libs/libvpx/vp10/common/entropymv.c
new file mode 100644
index 0000000000..a9946ee152
--- /dev/null
+++ b/libs/libvpx/vp10/common/entropymv.c
@@ -0,0 +1,225 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/entropymv.h"
+
+// Integer pel reference mv threshold for use of high-precision 1/8 mv
+#define COMPANDED_MVREF_THRESH 8
+
+const vpx_tree_index vp10_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
+  -MV_JOINT_ZERO, 2,                 // NOTE(review): assumed layout (confirm in
+  -MV_JOINT_HNZVZ, 4,                // vpx_dsp/prob.h): entry <= 0 is leaf -symbol,
+  -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ  // entry > 0 is the index of the next pair
+};
+
+const vpx_tree_index vp10_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
+  -MV_CLASS_0, 2,            // entries come in pairs at indices 0, 2, 4, ...
+  -MV_CLASS_1, 4,            // NOTE(review): assumed layout (confirm in
+  6, 8,                      // vpx_dsp/prob.h): entry <= 0 is leaf -symbol,
+  -MV_CLASS_2, -MV_CLASS_3,  // entry > 0 is the index of the next pair
+  10, 12,
+  -MV_CLASS_4, -MV_CLASS_5,
+  -MV_CLASS_6, 14,
+  16, 18,
+  -MV_CLASS_7, -MV_CLASS_8,
+  -MV_CLASS_9, -MV_CLASS_10,
+};
+
+const vpx_tree_index vp10_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
+  -0, -1,  // a single pair covering the two class-0 integer values 0 and 1
+};
+
+const vpx_tree_index vp10_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = {
+  -0, 2,  // symbols 0..3 are the fractional-pel values ((o >> 1) & 3 in
+  -1, 4,  // inc_mv_component); NOTE(review): assumed layout, entry <= 0 is
+  -2, -3  // leaf -symbol, entry > 0 is the index of the next pair
+};
+
+static const nmv_context default_nmv_context = {
+  {32, 64, 96},                                             // joints
+  {                                                         // comps[0..1]
+    { // Vertical component (comps[0], counted from mv->row)
+      128,                                                  // sign
+      {224, 144, 192, 168, 192, 176, 192, 198, 198, 245},   // class
+      {216},                                                // class0
+      {136, 140, 148, 160, 176, 192, 224, 234, 234, 240},   // bits
+      {{128, 128, 64}, {96, 112, 64}},                      // class0_fp
+      {64, 96, 64},                                         // fp
+      160,                                                  // class0_hp bit
+      128,                                                  // hp
+    },
+    { // Horizontal component (comps[1], counted from mv->col)
+      128,                                                  // sign
+      {216, 128, 176, 160, 176, 176, 192, 198, 198, 208},   // class
+      {208},                                                // class0
+      {136, 140, 148, 160, 176, 192, 224, 234, 234, 240},   // bits
+      {{128, 128, 64}, {96, 112, 64}},                      // class0_fp
+      {64, 96, 64},                                         // fp
+      160,                                                  // class0_hp bit
+      128,                                                  // hp
+    }
+  },
+};
+
+static const uint8_t log_in_base_2[] = {  // [i] == floor(log2(i)) for 1 <= i <= 1024; [0] == 0
+  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
+};
+
+static INLINE int mv_class_base(MV_CLASS_TYPE c) {  // offset origin of class c
+  return (c == MV_CLASS_0) ? 0 : (CLASS0_SIZE << (c + 2));
+}
+
+MV_CLASS_TYPE vp10_get_mv_class(int z, int *offset) {
+  // Values past the lookup range (z >> 3 beyond the table) use the top class.
+  const MV_CLASS_TYPE c = (z < CLASS0_SIZE * 4096) ?
+      (MV_CLASS_TYPE)log_in_base_2[z >> 3] : MV_CLASS_10;
+  if (offset) *offset = z - mv_class_base(c);  // position inside the class
+  return c;
+}
+
+int vp10_use_mv_hp(const MV *ref) {  // nonzero if 1/8-pel precision is usable
+#if !CONFIG_MISC_FIXES
+  return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
+         (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;
+#else
+  (void) ref;
+  return 1;  // with CONFIG_MISC_FIXES the reference size does not matter
+#endif
+}
+
+static void inc_mv_component(int v, nmv_component_counts *comp_counts,
+                             int incr, int usehp) {
+  int s, z, c, o, d, e, f;  /* sign, |v| - 1, class, offset in class, int/hp/frac parts */
+  assert(v != 0);            /* callers only pass nonzero components */
+  s = v < 0;
+  comp_counts->sign[s] += incr;
+  z = (s ? -v : v) - 1;       /* magnitude - 1 */
+
+  c = vp10_get_mv_class(z, &o);
+  comp_counts->classes[c] += incr;
+
+  d = (o >> 3);               /* int mv data */
+  f = (o >> 1) & 3;           /* fractional pel mv data */
+  e = (o & 1);                /* high precision mv data */
+
+  if (c == MV_CLASS_0) {      /* class 0 keeps its own int/fp/hp counters */
+    comp_counts->class0[d] += incr;
+    comp_counts->class0_fp[d][f] += incr;
+    comp_counts->class0_hp[e] += usehp * incr;  /* adds 0 when usehp == 0 */
+  } else {
+    int i;
+    int b = c + CLASS0_BITS - 1;  // number of bits
+    for (i = 0; i < b; ++i)       // one 0/1 counter per integer offset bit
+      comp_counts->bits[i][((d >> i) & 1)] += incr;
+    comp_counts->fp[f] += incr;
+    comp_counts->hp[e] += usehp * incr;         /* adds 0 when usehp == 0 */
+  }
+}
+
+void vp10_inc_mv(const MV *mv, nmv_context_counts *counts, const int usehp) {
+  // Without CONFIG_MISC_FIXES the hp counters are updated unconditionally.
+  const int count_hp = !CONFIG_MISC_FIXES || usehp;
+  MV_JOINT_TYPE j;
+  if (counts == NULL)
+    return;
+
+  j = vp10_get_mv_joint(mv);
+  ++counts->joints[j];
+
+  if (mv_joint_vertical(j))  // row component is nonzero
+    inc_mv_component(mv->row, &counts->comps[0], 1, count_hp);
+
+  if (mv_joint_horizontal(j))  // col component is nonzero
+    inc_mv_component(mv->col, &counts->comps[1], 1, count_hp);
+}
+
+void vp10_adapt_mv_probs(VP10_COMMON *cm, int allow_hp) {
+  int i, j;
+  // Rewrites cm->fc->nmvc from the saved context's probabilities and cm->counts.mv.
+  nmv_context *fc = &cm->fc->nmvc;
+  const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
+  const nmv_context_counts *counts = &cm->counts.mv;
+
+  // NOTE(review): the *_merge_probs helpers are defined elsewhere; presumably
+  // they blend the prior probabilities with the counts -- confirm.
+  vpx_tree_merge_probs(vp10_mv_joint_tree, pre_fc->joints, counts->joints,
+                       fc->joints);
+
+  for (i = 0; i < 2; ++i) {  // both entries of comps[]
+    nmv_component *comp = &fc->comps[i];
+    const nmv_component *pre_comp = &pre_fc->comps[i];
+    const nmv_component_counts *c = &counts->comps[i];
+
+    comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign);
+    vpx_tree_merge_probs(vp10_mv_class_tree, pre_comp->classes, c->classes,
+                         comp->classes);
+    vpx_tree_merge_probs(vp10_mv_class0_tree, pre_comp->class0, c->class0,
+                         comp->class0);
+
+    for (j = 0; j < MV_OFFSET_BITS; ++j)
+      comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
+
+    for (j = 0; j < CLASS0_SIZE; ++j)
+      vpx_tree_merge_probs(vp10_mv_fp_tree, pre_comp->class0_fp[j],
+                           c->class0_fp[j], comp->class0_fp[j]);
+
+    vpx_tree_merge_probs(vp10_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
+
+    if (allow_hp) {  // class0_hp and hp are left untouched otherwise
+      comp->class0_hp = mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
+      comp->hp = mode_mv_merge_probs(pre_comp->hp, c->hp);
+    }
+  }
+}
+
+void vp10_init_mv_probs(VP10_COMMON *cm) {
+  cm->fc->nmvc = default_nmv_context;  // struct copy of the built-in defaults
+}
diff --git a/libs/libvpx/vp10/common/entropymv.h b/libs/libvpx/vp10/common/entropymv.h
new file mode 100644
index 0000000000..d1eb95c579
--- /dev/null
+++ b/libs/libvpx/vp10/common/entropymv.h
@@ -0,0 +1,133 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_COMMON_ENTROPYMV_H_
+#define VP10_COMMON_ENTROPYMV_H_
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/prob.h"
+
+#include "vp10/common/mv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP10Common;
+
+void vp10_init_mv_probs(struct VP10Common *cm);
+
+void vp10_adapt_mv_probs(struct VP10Common *cm, int usehp);
+int vp10_use_mv_hp(const MV *ref);
+
+#define MV_UPDATE_PROB 252
+
+/* Symbols for coding which components are zero jointly */
+#define MV_JOINTS     4
+typedef enum {                   /* value bit 0: hor nonzero, bit 1: vert nonzero */
+  MV_JOINT_ZERO = 0,             /* Zero vector */
+  MV_JOINT_HNZVZ = 1,            /* Vert zero, hor nonzero */
+  MV_JOINT_HZVNZ = 2,            /* Hor zero, vert nonzero */
+  MV_JOINT_HNZVNZ = 3,           /* Both components nonzero */
+} MV_JOINT_TYPE;
+
+static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) {  // vert nonzero?
+  return (type == MV_JOINT_HNZVNZ) || (type == MV_JOINT_HZVNZ);
+}
+
+static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {  // hor nonzero?
+  return (type == MV_JOINT_HNZVNZ) || (type == MV_JOINT_HNZVZ);
+}
+
+/* Symbols for coding magnitude class of nonzero components */
+#define MV_CLASSES     11
+typedef enum {         /* class of a nonzero component, derived from |v| - 1 */
+  MV_CLASS_0 = 0,      /* (0, 2]     integer pel */
+  MV_CLASS_1 = 1,      /* (2, 4]     integer pel */
+  MV_CLASS_2 = 2,      /* (4, 8]     integer pel */
+  MV_CLASS_3 = 3,      /* (8, 16]    integer pel */
+  MV_CLASS_4 = 4,      /* (16, 32]   integer pel */
+  MV_CLASS_5 = 5,      /* (32, 64]   integer pel */
+  MV_CLASS_6 = 6,      /* (64, 128]  integer pel */
+  MV_CLASS_7 = 7,      /* (128, 256] integer pel */
+  MV_CLASS_8 = 8,      /* (256, 512] integer pel */
+  MV_CLASS_9 = 9,      /* (512, 1024] integer pel */
+  MV_CLASS_10 = 10,    /* (1024,2048] integer pel; also used for anything larger */
+} MV_CLASS_TYPE;
+
+#define CLASS0_BITS    1  /* bits at integer precision for class 0 */
+#define CLASS0_SIZE    (1 << CLASS0_BITS)
+#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
+#define MV_FP_SIZE 4
+
+#define MV_MAX_BITS    (MV_CLASSES + CLASS0_BITS + 2)
+#define MV_MAX         ((1 << MV_MAX_BITS) - 1)
+#define MV_VALS        ((MV_MAX << 1) + 1)
+
+#define MV_IN_USE_BITS 14
+#define MV_UPP   ((1 << MV_IN_USE_BITS) - 1)
+#define MV_LOW   (-(1 << MV_IN_USE_BITS))
+
+extern const vpx_tree_index vp10_mv_joint_tree[];
+extern const vpx_tree_index vp10_mv_class_tree[];
+extern const vpx_tree_index vp10_mv_class0_tree[];
+extern const vpx_tree_index vp10_mv_fp_tree[];
+
+typedef struct {  // probabilities for one MV component
+  vpx_prob sign;                                    // sign of the component
+  vpx_prob classes[MV_CLASSES - 1];                 // used with vp10_mv_class_tree
+  vpx_prob class0[CLASS0_SIZE - 1];                 // used with vp10_mv_class0_tree
+  vpx_prob bits[MV_OFFSET_BITS];                    // one per integer offset bit
+  vpx_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];  // fp tree per class-0 int value
+  vpx_prob fp[MV_FP_SIZE - 1];                      // fp tree for the other classes
+  vpx_prob class0_hp;                               // high-precision bit, class 0
+  vpx_prob hp;                                      // high-precision bit, other classes
+} nmv_component;
+
+typedef struct {
+  vpx_prob joints[MV_JOINTS - 1];  // used with vp10_mv_joint_tree
+  nmv_component comps[2];          // [0] vertical (row), [1] horizontal (col)
+} nmv_context;
+
+static INLINE MV_JOINT_TYPE vp10_get_mv_joint(const MV *mv) {
+  // Classify the vector by which of its two components are nonzero.
+  const int hor_nz = mv->col != 0;
+  const int ver_nz = mv->row != 0;
+  if (ver_nz) return hor_nz ? MV_JOINT_HNZVNZ : MV_JOINT_HZVNZ;
+  return hor_nz ? MV_JOINT_HNZVZ : MV_JOINT_ZERO;
+}
+
+MV_CLASS_TYPE vp10_get_mv_class(int z, int *offset);
+
+typedef struct {  // occurrence counters for one MV component
+  unsigned int sign[2];                            // [1] counts negative values
+  unsigned int classes[MV_CLASSES];                // indexed by MV_CLASS_TYPE
+  unsigned int class0[CLASS0_SIZE];                // integer part within class 0
+  unsigned int bits[MV_OFFSET_BITS][2];            // [bit position][bit value]
+  unsigned int class0_fp[CLASS0_SIZE][MV_FP_SIZE]; // [class-0 int value][frac value]
+  unsigned int fp[MV_FP_SIZE];                     // frac value, other classes
+  unsigned int class0_hp[2];                       // hp bit value, class 0
+  unsigned int hp[2];                              // hp bit value, other classes
+} nmv_component_counts;
+
+typedef struct {
+  unsigned int joints[MV_JOINTS];  // indexed by MV_JOINT_TYPE
+  nmv_component_counts comps[2];   // [0] vertical (row), [1] horizontal (col)
+} nmv_context_counts;
+
+void vp10_inc_mv(const MV *mv, nmv_context_counts *mvctx, const int usehp);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_ENTROPYMV_H_
diff --git a/libs/libvpx/vp10/common/enums.h b/libs/libvpx/vp10/common/enums.h
new file mode 100644
index 0000000000..18c7d1629e
--- /dev/null
+++ b/libs/libvpx/vp10/common/enums.h
@@ -0,0 +1,149 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_ENUMS_H_
+#define VP10_COMMON_ENUMS_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MI_SIZE_LOG2 3
+#define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2)  // 64 = 2^6
+
+#define MI_SIZE (1 << MI_SIZE_LOG2)  // pixels per mi-unit
+#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2)  // mi-units per max block
+
+#define MI_MASK (MI_BLOCK_SIZE - 1)
+
+// Bitstream profiles indicated by 2-3 bits in the uncompressed header.
+// 00: Profile 0.  8-bit 4:2:0 only.
+// 10: Profile 1.  8-bit 4:4:4, 4:2:2, and 4:4:0.
+// 01: Profile 2.  10-bit and 12-bit color only, with 4:2:0 sampling.
+// 110: Profile 3. 10-bit and 12-bit color only, with 4:2:2/4:4:4/4:4:0
+//                 sampling.
+// 111: Undefined profile.
+typedef enum BITSTREAM_PROFILE {
+  PROFILE_0,    // 8-bit 4:2:0 only
+  PROFILE_1,    // 8-bit 4:4:4, 4:2:2 and 4:4:0
+  PROFILE_2,    // 10-bit and 12-bit, 4:2:0 only
+  PROFILE_3,    // 10-bit and 12-bit, 4:2:2/4:4:4/4:4:0
+  MAX_PROFILES  // number of defined profiles, not a profile itself
+} BITSTREAM_PROFILE;
+
+#define BLOCK_4X4     0
+#define BLOCK_4X8     1
+#define BLOCK_8X4     2
+#define BLOCK_8X8     3
+#define BLOCK_8X16    4
+#define BLOCK_16X8    5
+#define BLOCK_16X16   6
+#define BLOCK_16X32   7
+#define BLOCK_32X16   8
+#define BLOCK_32X32   9
+#define BLOCK_32X64  10
+#define BLOCK_64X32  11
+#define BLOCK_64X64  12
+#define BLOCK_SIZES  13
+#define BLOCK_INVALID BLOCK_SIZES
+typedef uint8_t BLOCK_SIZE;
+
+typedef enum PARTITION_TYPE {
+  PARTITION_NONE,
+  PARTITION_HORZ,
+  PARTITION_VERT,
+  PARTITION_SPLIT,
+  PARTITION_TYPES,                      // number of real partition types (4)
+  PARTITION_INVALID = PARTITION_TYPES   // sentinel sharing the count's value
+} PARTITION_TYPE;
+
+typedef char PARTITION_CONTEXT;
+#define PARTITION_PLOFFSET   4  // number of probability models per block size
+#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+
+// block transform size
+typedef uint8_t TX_SIZE;
+#define TX_4X4   ((TX_SIZE)0)   // 4x4 transform
+#define TX_8X8   ((TX_SIZE)1)   // 8x8 transform
+#define TX_16X16 ((TX_SIZE)2)   // 16x16 transform
+#define TX_32X32 ((TX_SIZE)3)   // 32x32 transform
+#define TX_SIZES ((TX_SIZE)4)
+
+// frame transform mode
+typedef enum {
+  ONLY_4X4            = 0,        // only 4x4 transform used
+  ALLOW_8X8           = 1,        // allow block transform size up to 8x8
+  ALLOW_16X16         = 2,        // allow block transform size up to 16x16
+  ALLOW_32X32         = 3,        // allow block transform size up to 32x32
+  TX_MODE_SELECT      = 4,        // transform specified for each block
+  TX_MODES            = 5,        // number of modes above, not a mode itself
+} TX_MODE;
+
+typedef enum {
+  DCT_DCT   = 0,                      // DCT  in both horizontal and vertical
+  ADST_DCT  = 1,                      // ADST in vertical, DCT in horizontal
+  DCT_ADST  = 2,                      // DCT  in vertical, ADST in horizontal
+  ADST_ADST = 3,                      // ADST in both directions
+  TX_TYPES = 4                        // number of types above; sizes tx tables
+} TX_TYPE;
+
+#define EXT_TX_SIZES       3  // number of sizes that use extended transforms
+
+typedef enum {             // each value is a distinct single bit
+  VP9_LAST_FLAG = 1 << 0,  // 0x1
+  VP9_GOLD_FLAG = 1 << 1,  // 0x2
+  VP9_ALT_FLAG = 1 << 2,   // 0x4
+} VP9_REFFRAME;
+
+typedef enum {
+  PLANE_TYPE_Y  = 0,
+  PLANE_TYPE_UV = 1,
+  PLANE_TYPES          // number of plane types (2), not a type itself
+} PLANE_TYPE;
+
+#define DC_PRED    0       // Average of above and left pixels
+#define V_PRED     1       // Vertical
+#define H_PRED     2       // Horizontal
+#define D45_PRED   3       // Directional 45  deg = round(arctan(1/1) * 180/pi)
+#define D135_PRED  4       // Directional 135 deg = 180 - 45
+#define D117_PRED  5       // Directional 117 deg = 180 - 63
+#define D153_PRED  6       // Directional 153 deg = 180 - 27
+#define D207_PRED  7       // Directional 207 deg = 180 + 27
+#define D63_PRED   8       // Directional 63  deg = round(arctan(2/1) * 180/pi)
+#define TM_PRED    9       // True-motion
+#define NEARESTMV 10
+#define NEARMV    11
+#define ZEROMV    12
+#define NEWMV     13
+#define MB_MODE_COUNT 14
+typedef uint8_t PREDICTION_MODE;
+
+#define INTRA_MODES (TM_PRED + 1)
+
+#define INTER_MODES (1 + NEWMV - NEARESTMV)
+
+#define SKIP_CONTEXTS 3
+#define INTER_MODE_CONTEXTS 7
+
+/* Limit on MV reference candidates (heading was "Segment Feature Masks") */
+#define MAX_MV_REF_CANDIDATES 2
+
+#define INTRA_INTER_CONTEXTS 4
+#define COMP_INTER_CONTEXTS 5
+#define REF_CONTEXTS 5
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_ENUMS_H_
diff --git a/libs/libvpx/vp10/common/filter.c b/libs/libvpx/vp10/common/filter.c
new file mode 100644
index 0000000000..dda279f132
--- /dev/null
+++ b/libs/libvpx/vp10/common/filter.c
@@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/filter.h"
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                bilinear_filters[SUBPEL_SHIFTS]) = {  // row k: centre taps 128 - 8k, 8k
+  { 0, 0, 0, 128,   0, 0, 0, 0 },
+  { 0, 0, 0, 120,   8, 0, 0, 0 },
+  { 0, 0, 0, 112,  16, 0, 0, 0 },
+  { 0, 0, 0, 104,  24, 0, 0, 0 },
+  { 0, 0, 0,  96,  32, 0, 0, 0 },
+  { 0, 0, 0,  88,  40, 0, 0, 0 },
+  { 0, 0, 0,  80,  48, 0, 0, 0 },
+  { 0, 0, 0,  72,  56, 0, 0, 0 },
+  { 0, 0, 0,  64,  64, 0, 0, 0 },
+  { 0, 0, 0,  56,  72, 0, 0, 0 },
+  { 0, 0, 0,  48,  80, 0, 0, 0 },
+  { 0, 0, 0,  40,  88, 0, 0, 0 },
+  { 0, 0, 0,  32,  96, 0, 0, 0 },
+  { 0, 0, 0,  24, 104, 0, 0, 0 },
+  { 0, 0, 0,  16, 112, 0, 0, 0 },
+  { 0, 0, 0,   8, 120, 0, 0, 0 }
+};
+
+// Lagrangian interpolation filter: 16 rows of 8 taps, every row sums to 128.
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+  { 0,   0,   0, 128,   0,   0,   0,  0},
+  { 0,   1,  -5, 126,   8,  -3,   1,  0},
+  { -1,   3, -10, 122,  18,  -6,   2,  0},
+  { -1,   4, -13, 118,  27,  -9,   3, -1},
+  { -1,   4, -16, 112,  37, -11,   4, -1},
+  { -1,   5, -18, 105,  48, -14,   4, -1},
+  { -1,   5, -19,  97,  58, -16,   5, -1},
+  { -1,   6, -19,  88,  68, -18,   5, -1},
+  { -1,   6, -19,  78,  78, -19,   6, -1},
+  { -1,   5, -18,  68,  88, -19,   6, -1},
+  { -1,   5, -16,  58,  97, -19,   5, -1},
+  { -1,   4, -14,  48, 105, -18,   5, -1},
+  { -1,   4, -11,  37, 112, -16,   4, -1},
+  { -1,   3,  -9,  27, 118, -13,   4, -1},
+  { 0,   2,  -6,  18, 122, -10,   3, -1},
+  { 0,   1,  -3,   8, 126,  -5,   1,  0}
+};
+
+// DCT based filter: 16 rows of 8 taps, every row sums to 128.
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {-1,   3,  -7, 127,   8,  -3,   1, 0},
+  {-2,   5, -13, 125,  17,  -6,   3, -1},
+  {-3,   7, -17, 121,  27, -10,   5, -2},
+  {-4,   9, -20, 115,  37, -13,   6, -2},
+  {-4,  10, -23, 108,  48, -16,   8, -3},
+  {-4,  10, -24, 100,  59, -19,   9, -3},
+  {-4,  11, -24,  90,  70, -21,  10, -4},
+  {-4,  11, -23,  80,  80, -23,  11, -4},
+  {-4,  10, -21,  70,  90, -24,  11, -4},
+  {-3,   9, -19,  59, 100, -24,  10, -4},
+  {-3,   8, -16,  48, 108, -23,  10, -4},
+  {-2,   6, -13,  37, 115, -20,   9, -4},
+  {-2,   5, -10,  27, 121, -17,   7, -3},
+  {-1,   3,  -6,  17, 125, -13,   5, -2},
+  {0,   1,  -3,   8, 127,  -7,   3, -1}
+};
+
+// freqmultiplier = 0.5; 16 rows of 8 taps, every row sums to 128.
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
+  { 0,  0,  0, 128,  0,  0,  0,  0},
+  {-3, -1, 32,  64, 38,  1, -3,  0},
+  {-2, -2, 29,  63, 41,  2, -3,  0},
+  {-2, -2, 26,  63, 43,  4, -4,  0},
+  {-2, -3, 24,  62, 46,  5, -4,  0},
+  {-2, -3, 21,  60, 49,  7, -4,  0},
+  {-1, -4, 18,  59, 51,  9, -4,  0},
+  {-1, -4, 16,  57, 53, 12, -4, -1},
+  {-1, -4, 14,  55, 55, 14, -4, -1},
+  {-1, -4, 12,  53, 57, 16, -4, -1},
+  { 0, -4,  9,  51, 59, 18, -4, -1},
+  { 0, -4,  7,  49, 60, 21, -3, -2},
+  { 0, -4,  5,  46, 62, 24, -3, -2},
+  { 0, -4,  4,  43, 63, 26, -2, -2},
+  { 0, -3,  2,  41, 63, 29, -2, -2},
+  { 0, -3,  1,  38, 64, 32, -1, -3}
+};
+
+
+const InterpKernel *vp10_filter_kernels[4] = {  // order matches filter.h values
+  sub_pel_filters_8,    // [0] EIGHTTAP
+  sub_pel_filters_8lp,  // [1] EIGHTTAP_SMOOTH
+  sub_pel_filters_8s,   // [2] EIGHTTAP_SHARP
+  bilinear_filters      // [3] BILINEAR
+};
diff --git a/libs/libvpx/vp10/common/filter.h b/libs/libvpx/vp10/common/filter.h
new file mode 100644
index 0000000000..826cd0386e
--- /dev/null
+++ b/libs/libvpx/vp10/common/filter.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_FILTER_H_
+#define VP10_COMMON_FILTER_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define EIGHTTAP            0
+#define EIGHTTAP_SMOOTH     1
+#define EIGHTTAP_SHARP      2
+#define SWITCHABLE_FILTERS  3 /* Number of switchable filters */
+#define BILINEAR            3
+// The codec can operate in four possible inter prediction filter modes:
+// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
+#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
+#define SWITCHABLE 4 /* should be the last one */
+
+typedef uint8_t INTERP_FILTER;
+
+extern const InterpKernel *vp10_filter_kernels[4];
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_FILTER_H_
diff --git a/libs/libvpx/vp10/common/frame_buffers.c b/libs/libvpx/vp10/common/frame_buffers.c
new file mode 100644
index 0000000000..794c80fde3
--- /dev/null
+++ b/libs/libvpx/vp10/common/frame_buffers.c
@@ -0,0 +1,86 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/frame_buffers.h"
+#include "vpx_mem/vpx_mem.h"
+
+int vp10_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
+  const int count = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  assert(list != NULL);
+  vp10_free_internal_frame_buffers(list);
+  // (Re)creates the zero-initialized slot array. Returns 0 on success.
+  list->int_fb =
+      (InternalFrameBuffer *)vpx_calloc(count, sizeof(*list->int_fb));
+  // Publish the count only on success so later walks never index NULL.
+  list->num_internal_frame_buffers = (list->int_fb != NULL) ? count : 0;
+  return (list->int_fb == NULL);
+}
+
+void vp10_free_internal_frame_buffers(InternalFrameBufferList *list) {
+  int i;
+
+  assert(list != NULL);
+  // int_fb is NULL while the count is non-zero after a failed allocation.
+  for (i = 0; list->int_fb && i < list->num_internal_frame_buffers; ++i) {
+    vpx_free(list->int_fb[i].data);  // the slot array itself goes just below
+  }
+  vpx_free(list->int_fb);
+  list->int_fb = NULL;
+  list->num_internal_frame_buffers = 0;  // keep the count in sync with int_fb
+}
+
+int vp10_get_frame_buffer(void *cb_priv, size_t min_size,
+                         vpx_codec_frame_buffer_t *fb) {
+  // Hands out the first unused slot of the list passed as cb_priv, holding at
+  // least min_size bytes. Returns 0 on success, -1 on any failure.
+  int i;
+  InternalFrameBufferList *const int_fb_list =
+      (InternalFrameBufferList *)cb_priv;
+  if (int_fb_list == NULL) return -1;
+
+  // Find a free frame buffer.
+  for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) {
+    if (!int_fb_list->int_fb[i].in_use)
+      break;
+  }
+  if (i == int_fb_list->num_internal_frame_buffers) return -1;  // all in use
+
+  if (int_fb_list->int_fb[i].size < min_size) {
+    // A grown buffer was always fully zeroed, so free + calloc is equivalent
+    // to realloc + memset but cannot leak the old block when it fails.
+    vpx_free(int_fb_list->int_fb[i].data);
+    // Zeroing is needed for fixing valgrind error from C loop filter due to
+    // access uninitialized memory in frame border.
+    int_fb_list->int_fb[i].data = (uint8_t *)vpx_calloc(1, min_size);
+    if (!int_fb_list->int_fb[i].data) {
+      int_fb_list->int_fb[i].size = 0;  // never pair a NULL block with a size
+      return -1;
+    }
+    int_fb_list->int_fb[i].size = min_size;
+  }
+
+  fb->data = int_fb_list->int_fb[i].data;
+  fb->size = int_fb_list->int_fb[i].size;
+  int_fb_list->int_fb[i].in_use = 1;
+
+  // Set the frame buffer's private data to point at the internal frame buffer.
+  fb->priv = &int_fb_list->int_fb[i];
+  return 0;
+}
+
+int vp10_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) {
+  // fb->priv is the slot pointer stored by vp10_get_frame_buffer.
+  InternalFrameBuffer *const slot = (InternalFrameBuffer *)fb->priv;
+  (void)cb_priv;  // unused
+  if (slot != NULL) slot->in_use = 0;
+  return 0;
+}
diff --git a/libs/libvpx/vp10/common/frame_buffers.h b/libs/libvpx/vp10/common/frame_buffers.h
new file mode 100644
index 0000000000..729ebafb02
--- /dev/null
+++ b/libs/libvpx/vp10/common/frame_buffers.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_FRAME_BUFFERS_H_
+#define VP10_COMMON_FRAME_BUFFERS_H_
+
+#include "vpx/vpx_frame_buffer.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct InternalFrameBuffer {
+  uint8_t *data;  // block allocated (and grown) by vp10_get_frame_buffer
+  size_t size;    // bytes allocated at data
+  int in_use;     // set to 1 by vp10_get_frame_buffer, 0 by vp10_release_frame_buffer
+} InternalFrameBuffer;
+
+typedef struct InternalFrameBufferList {
+  int num_internal_frame_buffers;  // number of entries in int_fb
+  InternalFrameBuffer *int_fb;     // array set up by vp10_alloc_internal_frame_buffers
+} InternalFrameBufferList;
+
+// Initializes |list|. Returns 0 on success.
+int vp10_alloc_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Free any data allocated to the frame buffers.
+void vp10_free_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Callback used by libvpx to request an external frame buffer. |cb_priv|
+// Callback private data, which points to an InternalFrameBufferList.
+// |min_size| is the minimum size in bytes needed to decode the next frame.
+// |fb| pointer to the frame buffer.
+int vp10_get_frame_buffer(void *cb_priv, size_t min_size,
+                         vpx_codec_frame_buffer_t *fb);
+
+// Callback used by libvpx when there are no references to the frame buffer.
+// |cb_priv| is not used. |fb| pointer to the frame buffer.
+int vp10_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_FRAME_BUFFERS_H_
diff --git a/libs/libvpx/vp10/common/idct.c b/libs/libvpx/vp10/common/idct.c
new file mode 100644
index 0000000000..5ee15c862d
--- /dev/null
+++ b/libs/libvpx/vp10/common/idct.c
@@ -0,0 +1,498 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp10/common/blockd.h"
+#include "vp10/common/idct.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+// Adds the 4x4 inverse hybrid transform of |input| to |dest| through
+// clip_pixel_add. |tx_type| indexes IHT_4 unchecked, so it must be in 0..3.
+void vp10_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                         int tx_type) {
+  static const transform_2d IHT_4[] = {  // built once, like IHT_8/IHT_16
+    { idct4_c,  idct4_c  },  // DCT_DCT  = 0
+    { iadst4_c, idct4_c  },  // ADST_DCT = 1
+    { idct4_c,  iadst4_c },  // DCT_ADST = 2
+    { iadst4_c, iadst4_c }   // ADST_ADST = 3
+  };
+  int i, j;
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[4], temp_out[4];
+
+  // Pass 1: inverse transform each row of |input| into |out|.
+  for (i = 0; i < 4; ++i) {
+    IHT_4[tx_type].rows(input, outptr);
+    input  += 4;
+    outptr += 4;
+  }
+  // Pass 2: inverse transform each column, round by 4 bits, add to |dest|.
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    IHT_4[tx_type].cols(temp_in, temp_out);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
+    }
+  }
+}
+
+static const transform_2d IHT_8[] = {
+  { idct8_c,  idct8_c  },  // DCT_DCT  = 0
+  { iadst8_c, idct8_c  },  // ADST_DCT = 1
+  { idct8_c,  iadst8_c },  // DCT_ADST = 2
+  { iadst8_c, iadst8_c }   // ADST_ADST = 3
+};
+
+void vp10_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                         int tx_type) {
+  int i, j;
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[8], temp_out[8];
+  const transform_2d ht = IHT_8[tx_type];
+
+  // inverse transform row vectors
+  for (i = 0; i < 8; ++i) {
+    ht.rows(input, outptr);
+    input += 8;
+    outptr += 8;
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+    }
+  }
+}
+
+static const transform_2d IHT_16[] = {
+  { idct16_c,  idct16_c  },  // DCT_DCT  = 0
+  { iadst16_c, idct16_c  },  // ADST_DCT = 1
+  { idct16_c,  iadst16_c },  // DCT_ADST = 2
+  { iadst16_c, iadst16_c }   // ADST_ADST = 3
+};
+
+void vp10_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                            int tx_type) {
+  int i, j;
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[16], temp_out[16];
+  const transform_2d ht = IHT_16[tx_type];
+
+  // Rows
+  for (i = 0; i < 16; ++i) {
+    ht.rows(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
+// Inverse 4x4 DCT: eob <= 1 takes the _1 kernel, anything larger the _16 one.
+void vp10_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
+  if (eob <= 1)
+    vpx_idct4x4_1_add(input, dest, stride);
+  else
+    vpx_idct4x4_16_add(input, dest, stride);
+}
+
+
+void vp10_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
+  if (eob <= 1)  // eob of 1 or less: use the _1 variant
+    vpx_iwht4x4_1_add(input, dest, stride);
+  else  // otherwise run the full _16 inverse Walsh-Hadamard variant
+    vpx_iwht4x4_16_add(input, dest, stride);
+}
+
+void vp10_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
+  // When dc is 1, input[0] already holds the reconstructed value, so no
+  // dequantization is needed; dc is then counted in eob, i.e. eob >= 1.
+
+  // With few non-zero DCT coefficients a cheaper kernel suffices, so the
+  // kernel is chosen from eob.
+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp10_short_idct8x8_c.
+  // Combine that with code here.
+  if (eob == 1) {
+    vpx_idct8x8_1_add(input, dest, stride);  // DC only DCT coefficient
+  } else if (eob <= 12) {
+    vpx_idct8x8_12_add(input, dest, stride);
+  } else {
+    vpx_idct8x8_64_add(input, dest, stride);
+  }
+}
+
+void vp10_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+                       int eob) {
+  // Cheaper kernels exist for blocks with few non-zero DCT coefficients;
+  // eob selects among them.
+  if (eob == 1) {
+    vpx_idct16x16_1_add(input, dest, stride);  // DC only DCT coefficient
+  } else if (eob <= 10) {
+    vpx_idct16x16_10_add(input, dest, stride);
+  } else {
+    vpx_idct16x16_256_add(input, dest, stride);
+  }
+}
+
+void vp10_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+                       int eob) {
+  if (eob == 1) {
+    vpx_idct32x32_1_add(input, dest, stride);
+  } else if (eob <= 34) {  // non-zero coeff only in upper-left 8x8
+    vpx_idct32x32_34_add(input, dest, stride);
+  } else {
+    vpx_idct32x32_1024_add(input, dest, stride);
+  }
+}
+
+void vp10_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
+                           int stride, int eob, TX_TYPE tx_type, int lossless) {
+  if (lossless) {
+    assert(tx_type == DCT_DCT);
+    vp10_iwht4x4_add(input, dest, stride, eob);
+  } else {
+    switch (tx_type) {
+      case DCT_DCT:
+        vp10_idct4x4_add(input, dest, stride, eob);
+        break;
+      case ADST_DCT:
+      case DCT_ADST:
+      case ADST_ADST:
+        vp10_iht4x4_16_add(input, dest, stride, tx_type);
+        break;
+      default:
+        assert(0);
+        break;
+    }
+  }
+}
+
+void vp10_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
+                           int stride, int eob, TX_TYPE tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+      vp10_idct8x8_add(input, dest, stride, eob);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_iht8x8_64_add(input, dest, stride, tx_type);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+void vp10_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
+                             int stride, int eob, TX_TYPE tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+      vp10_idct16x16_add(input, dest, stride, eob);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_iht16x16_256_add(input, dest, stride, tx_type);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+void vp10_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
+                             int stride, int eob, TX_TYPE tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+      vp10_idct32x32_add(input, dest, stride, eob);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      assert(0);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// High-bitdepth 4x4 inverse hybrid transform + add. |tx_type| indexes IHT_4
+// unchecked, so it must be in 0..3. |dest8| goes through CONVERT_TO_SHORTPTR.
+void vp10_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int tx_type, int bd) {
+  static const highbd_transform_2d IHT_4[] = {  // built once, not per call
+    { vpx_highbd_idct4_c,  vpx_highbd_idct4_c  },  // DCT_DCT  = 0
+    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c  },  // ADST_DCT = 1
+    { vpx_highbd_idct4_c,  vpx_highbd_iadst4_c },  // DCT_ADST = 2
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c }   // ADST_ADST = 3
+  };
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  int i, j;
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[4], temp_out[4];
+
+  // Pass 1: inverse transform each row of |input| into |out|.
+  for (i = 0; i < 4; ++i) {
+    IHT_4[tx_type].rows(input, outptr, bd);
+    input  += 4;
+    outptr += 4;
+  }
+  // Pass 2: inverse transform each column, round by 4 bits, add to |dest|.
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    IHT_4[tx_type].cols(temp_in, temp_out, bd);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+    }
+  }
+}
+
+static const highbd_transform_2d HIGH_IHT_8[] = {
+  { vpx_highbd_idct8_c,  vpx_highbd_idct8_c  },  // DCT_DCT  = 0
+  { vpx_highbd_iadst8_c, vpx_highbd_idct8_c  },  // ADST_DCT = 1
+  { vpx_highbd_idct8_c,  vpx_highbd_iadst8_c },  // DCT_ADST = 2
+  { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c }   // ADST_ADST = 3
+};
+
+void vp10_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int tx_type, int bd) {
+  int i, j;
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[8], temp_out[8];
+  const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Inverse transform row vectors.
+  for (i = 0; i < 8; ++i) {
+    ht.rows(input, outptr, bd);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Inverse transform column vectors.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    ht.cols(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+    }
+  }
+}
+
+static const highbd_transform_2d HIGH_IHT_16[] = {
+  { vpx_highbd_idct16_c,  vpx_highbd_idct16_c  },  // DCT_DCT  = 0
+  { vpx_highbd_iadst16_c, vpx_highbd_idct16_c  },  // ADST_DCT = 1
+  { vpx_highbd_idct16_c,  vpx_highbd_iadst16_c },  // DCT_ADST = 2
+  { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c }   // ADST_ADST = 3
+};
+
+void vp10_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int tx_type, int bd) {
+  int i, j;
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[16], temp_out[16];
+  const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  for (i = 0; i < 16; ++i) {
+    ht.rows(input, outptr, bd);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    ht.cols(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+    }
+  }
+}
+
+// idct
+void vp10_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  if (eob > 1)
+    vpx_highbd_idct4x4_16_add(input, dest, stride, bd);
+  else
+    vpx_highbd_idct4x4_1_add(input, dest, stride, bd);
+}
+
+
+void vp10_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  if (eob > 1)
+    vpx_highbd_iwht4x4_16_add(input, dest, stride, bd);
+  else
+    vpx_highbd_iwht4x4_1_add(input, dest, stride, bd);
+}
+
+void vp10_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  // If dc is 1, then input[0] is the reconstructed value, do not need
+  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
+
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to decide what to do.
+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp10_short_idct8x8_c.
+  // Combine that with code here.
+  // DC only DCT coefficient
+  if (eob == 1) {
+    vpx_highbd_idct8x8_1_add(input, dest, stride, bd);
+  } else if (eob <= 10) {
+    vpx_highbd_idct8x8_10_add(input, dest, stride, bd);
+  } else {
+    vpx_highbd_idct8x8_64_add(input, dest, stride, bd);
+  }
+}
+
+void vp10_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd) {
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to separate different cases.
+  // DC only DCT coefficient.
+  if (eob == 1) {
+    vpx_highbd_idct16x16_1_add(input, dest, stride, bd);
+  } else if (eob <= 10) {
+    vpx_highbd_idct16x16_10_add(input, dest, stride, bd);
+  } else {
+    vpx_highbd_idct16x16_256_add(input, dest, stride, bd);
+  }
+}
+
+void vp10_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd) {
+  // Pick the high-bitdepth inverse 32x32 DCT kernel from eob.
+  if (eob == 1) {
+    vpx_highbd_idct32x32_1_add(input, dest, stride, bd);
+  } else if (eob <= 34) {  // Non-zero coeff only in upper-left 8x8
+    vpx_highbd_idct32x32_34_add(input, dest, stride, bd);
+  } else {
+    vpx_highbd_idct32x32_1024_add(input, dest, stride, bd);
+  }
+}
+
+void vp10_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
+                                  int stride, int eob, int bd, TX_TYPE tx_type,
+                                  int lossless) {
+  if (lossless) {
+    assert(tx_type == DCT_DCT);
+    vp10_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+  } else {
+    switch (tx_type) {
+      case DCT_DCT:
+        vp10_highbd_idct4x4_add(input, dest, stride, eob, bd);
+        break;
+      case ADST_DCT:
+      case DCT_ADST:
+      case ADST_ADST:
+         vp10_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
+         break;
+      default:
+        assert(0);
+        break;
+    }
+  }
+}
+
+void vp10_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
+                                  int stride, int eob, int bd,
+                                  TX_TYPE tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+      vp10_highbd_idct8x8_add(input, dest, stride, eob, bd);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+void vp10_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
+                                    int stride, int eob, int bd,
+                                    TX_TYPE tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+      vp10_highbd_idct16x16_add(input, dest, stride, eob, bd);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+void vp10_highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
+                                    int stride, int eob, int bd,
+                                    TX_TYPE tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+      vp10_highbd_idct32x32_add(input, dest, stride, eob, bd);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      assert(0);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vp10/common/idct.h b/libs/libvpx/vp10/common/idct.h
new file mode 100644
index 0000000000..088339804d
--- /dev/null
+++ b/libs/libvpx/vp10/common/idct.h
@@ -0,0 +1,82 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_IDCT_H_
+#define VP10_COMMON_IDCT_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vp10/common/common.h"
+#include "vp10/common/enums.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
+
+typedef struct {
+  transform_1d cols, rows;  // vertical and horizontal
+} transform_2d;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*highbd_transform_1d)(const tran_low_t*, tran_low_t*, int bd);
+
+typedef struct {
+  highbd_transform_1d cols, rows;  // vertical and horizontal
+} highbd_transform_2d;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp10_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+void vp10_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+
+void vp10_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
+                           int stride, int eob, TX_TYPE tx_type, int lossless);
+void vp10_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
+                           int stride, int eob, TX_TYPE tx_type);
+void vp10_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
+                             int stride, int eob, TX_TYPE tx_type);
+void vp10_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
+                             int stride, int eob, TX_TYPE tx_type);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp10_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp10_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp10_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd);
+void vp10_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd);
+void vp10_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
+                                  int stride, int eob, int bd, TX_TYPE tx_type,
+                                  int lossless);
+void vp10_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
+                                  int stride, int eob, int bd, TX_TYPE tx_type);
+void vp10_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
+                                    int stride, int eob, int bd,
+                                    TX_TYPE tx_type);
+void vp10_highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
+                                    int stride, int eob, int bd,
+                                    TX_TYPE tx_type);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_IDCT_H_
diff --git a/libs/libvpx/vp10/common/loopfilter.c b/libs/libvpx/vp10/common/loopfilter.c
new file mode 100644
index 0000000000..a1925de55d
--- /dev/null
+++ b/libs/libvpx/vp10/common/loopfilter.c
@@ -0,0 +1,1656 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp10/common/loopfilter.h"
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/reconinter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp10/common/seg_common.h"
+
+// 64 bit masks for left transform size. Each 1 represents a position where
+// we should apply a loop filter across the left border of an 8x8 block
+// boundary.
+//
+// In the case of TX_16X16 (low order byte first) we end up with
+// a mask that looks like this:
+//
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//
+// A loopfilter should be applied to every other 8x8 horizontally.
+static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
+  0xffffffffffffffffULL,  // TX_4X4
+  0xffffffffffffffffULL,  // TX_8x8
+  0x5555555555555555ULL,  // TX_16x16
+  0x1111111111111111ULL,  // TX_32x32
+};
+
+// 64 bit masks for above transform size. Each 1 represents a position where
+// we should apply a loop filter across the top border of an 8x8 block
+// boundary.
+//
+// In the case of TX_32x32 (low order byte first) we end up with
+// a mask that looks like this:
+//
+//    11111111
+//    00000000
+//    00000000
+//    00000000
+//    11111111
+//    00000000
+//    00000000
+//    00000000
+//
+// A loopfilter should be applied to every 4th row vertically.
+static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
+  0xffffffffffffffffULL,  // TX_4X4
+  0xffffffffffffffffULL,  // TX_8x8
+  0x00ff00ff00ff00ffULL,  // TX_16x16
+  0x000000ff000000ffULL,  // TX_32x32
+};
+
+// 64 bit masks for prediction sizes (left). Each 1 represents the position
+// of the left border of an 8x8 block. These are aligned to the right most
+// appropriate bit, and then shifted into place.
+//
+// In the case of BLOCK_16X32 (low order byte first) we end up with
+// a mask that looks like this:
+//
+//  10000000
+//  10000000
+//  10000000
+//  10000000
+//  00000000
+//  00000000
+//  00000000
+//  00000000
+static const uint64_t left_prediction_mask[BLOCK_SIZES] = {
+  0x0000000000000001ULL,  // BLOCK_4X4,
+  0x0000000000000001ULL,  // BLOCK_4X8,
+  0x0000000000000001ULL,  // BLOCK_8X4,
+  0x0000000000000001ULL,  // BLOCK_8X8,
+  0x0000000000000101ULL,  // BLOCK_8X16,
+  0x0000000000000001ULL,  // BLOCK_16X8,
+  0x0000000000000101ULL,  // BLOCK_16X16,
+  0x0000000001010101ULL,  // BLOCK_16X32,
+  0x0000000000000101ULL,  // BLOCK_32X16,
+  0x0000000001010101ULL,  // BLOCK_32X32,
+  0x0101010101010101ULL,  // BLOCK_32X64,
+  0x0000000001010101ULL,  // BLOCK_64X32,
+  0x0101010101010101ULL,  // BLOCK_64X64
+};
+
+// 64 bit mask to shift and set for each prediction size.
+static const uint64_t above_prediction_mask[BLOCK_SIZES] = {
+  0x0000000000000001ULL,  // BLOCK_4X4
+  0x0000000000000001ULL,  // BLOCK_4X8
+  0x0000000000000001ULL,  // BLOCK_8X4
+  0x0000000000000001ULL,  // BLOCK_8X8
+  0x0000000000000001ULL,  // BLOCK_8X16,
+  0x0000000000000003ULL,  // BLOCK_16X8
+  0x0000000000000003ULL,  // BLOCK_16X16
+  0x0000000000000003ULL,  // BLOCK_16X32,
+  0x000000000000000fULL,  // BLOCK_32X16,
+  0x000000000000000fULL,  // BLOCK_32X32,
+  0x000000000000000fULL,  // BLOCK_32X64,
+  0x00000000000000ffULL,  // BLOCK_64X32,
+  0x00000000000000ffULL,  // BLOCK_64X64
+};
+// 64 bit mask to shift and set for each prediction size. A bit is set for
+// each 8x8 block that would be in the left most block of the given block
+// size in the 64x64 block.
+static const uint64_t size_mask[BLOCK_SIZES] = {
+  0x0000000000000001ULL,  // BLOCK_4X4
+  0x0000000000000001ULL,  // BLOCK_4X8
+  0x0000000000000001ULL,  // BLOCK_8X4
+  0x0000000000000001ULL,  // BLOCK_8X8
+  0x0000000000000101ULL,  // BLOCK_8X16,
+  0x0000000000000003ULL,  // BLOCK_16X8
+  0x0000000000000303ULL,  // BLOCK_16X16
+  0x0000000003030303ULL,  // BLOCK_16X32,
+  0x0000000000000f0fULL,  // BLOCK_32X16,
+  0x000000000f0f0f0fULL,  // BLOCK_32X32,
+  0x0f0f0f0f0f0f0f0fULL,  // BLOCK_32X64,
+  0x00000000ffffffffULL,  // BLOCK_64X32,
+  0xffffffffffffffffULL,  // BLOCK_64X64
+};
+
+// These are used for masking the left and above borders.
+static const uint64_t left_border =  0x1111111111111111ULL;
+static const uint64_t above_border = 0x000000ff000000ffULL;
+
+// 16 bit masks for uv transform sizes.
+static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= {
+  0xffff,  // TX_4X4
+  0xffff,  // TX_8x8
+  0x5555,  // TX_16x16
+  0x1111,  // TX_32x32
+};
+
+static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= {
+  0xffff,  // TX_4X4
+  0xffff,  // TX_8x8
+  0x0f0f,  // TX_16x16
+  0x000f,  // TX_32x32
+};
+
+// 16 bit left mask to shift and set for each uv prediction size.
+static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = {
+  0x0001,  // BLOCK_4X4,
+  0x0001,  // BLOCK_4X8,
+  0x0001,  // BLOCK_8X4,
+  0x0001,  // BLOCK_8X8,
+  0x0001,  // BLOCK_8X16,
+  0x0001,  // BLOCK_16X8,
+  0x0001,  // BLOCK_16X16,
+  0x0011,  // BLOCK_16X32,
+  0x0001,  // BLOCK_32X16,
+  0x0011,  // BLOCK_32X32,
+  0x1111,  // BLOCK_32X64
+  0x0011,  // BLOCK_64X32,
+  0x1111,  // BLOCK_64X64
+};
+// 16 bit above mask to shift and set for uv each prediction size.
+static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = {
+  0x0001,  // BLOCK_4X4
+  0x0001,  // BLOCK_4X8
+  0x0001,  // BLOCK_8X4
+  0x0001,  // BLOCK_8X8
+  0x0001,  // BLOCK_8X16,
+  0x0001,  // BLOCK_16X8
+  0x0001,  // BLOCK_16X16
+  0x0001,  // BLOCK_16X32,
+  0x0003,  // BLOCK_32X16,
+  0x0003,  // BLOCK_32X32,
+  0x0003,  // BLOCK_32X64,
+  0x000f,  // BLOCK_64X32,
+  0x000f,  // BLOCK_64X64
+};
+
+// 16 bit mask to shift and set for each uv prediction size
+static const uint16_t size_mask_uv[BLOCK_SIZES] = {
+  0x0001,  // BLOCK_4X4
+  0x0001,  // BLOCK_4X8
+  0x0001,  // BLOCK_8X4
+  0x0001,  // BLOCK_8X8
+  0x0001,  // BLOCK_8X16,
+  0x0001,  // BLOCK_16X8
+  0x0001,  // BLOCK_16X16
+  0x0011,  // BLOCK_16X32,
+  0x0003,  // BLOCK_32X16,
+  0x0033,  // BLOCK_32X32,
+  0x3333,  // BLOCK_32X64,
+  0x00ff,  // BLOCK_64X32,
+  0xffff,  // BLOCK_64X64
+};
+static const uint16_t left_border_uv =  0x1111;
+static const uint16_t above_border_uv = 0x000f;
+
+static const int mode_lf_lut[MB_MODE_COUNT] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // INTRA_MODES
+  1, 1, 0, 1                     // INTER_MODES (ZEROMV == 0)
+};
+
+static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
+  // The shift (0, 1 or 2) depends only on sharpness_lvl, so compute it once.
+  const int shift = (sharpness_lvl > 0) + (sharpness_lvl > 4);
+  int level;
+
+  // Fill in the limit vectors for every possible loop filter level.
+  for (level = 0; level <= MAX_LOOP_FILTER; ++level) {
+    int inside = level >> shift;
+
+    // A non-zero sharpness also caps the inside limit at 9 - sharpness_lvl.
+    if (sharpness_lvl > 0 && inside > 9 - sharpness_lvl)
+      inside = 9 - sharpness_lvl;
+    // The inside limit is never allowed to drop below 1.
+    if (inside < 1)
+      inside = 1;
+
+    // mblim is 2 * (level + 2) plus the inside limit.
+    memset(lfi->lfthr[level].lim, inside, SIMD_WIDTH);
+    memset(lfi->lfthr[level].mblim, 2 * (level + 2) + inside, SIMD_WIDTH);
+  }
+}
+
+static uint8_t get_filter_level(const loop_filter_info_n *lfi_n,
+                                const MB_MODE_INFO *mbmi) {
+  const int mode_idx = mode_lf_lut[mbmi->mode];  // 0 or 1, from mode_lf_lut
+  return lfi_n->lvl[mbmi->segment_id][mbmi->ref_frame[0]][mode_idx];
+}
+
+void vp10_loop_filter_init(VP10_COMMON *cm) {
+  loop_filter_info_n *const info = &cm->lf_info;
+  struct loopfilter *const filter = &cm->lf;
+  int level = 0;
+
+  // Build the limit tables for the current sharpness and record that level.
+  update_sharpness(info, filter->sharpness_level);
+  filter->last_sharpness_level = filter->sharpness_level;
+
+  // Each level's hev threshold vector is filled with level >> 4.
+  for (; level <= MAX_LOOP_FILTER; ++level)
+    memset(info->lfthr[level].hev_thr, level >> 4, SIMD_WIDTH);
+}
+
+void vp10_loop_filter_frame_init(VP10_COMMON *cm, int default_filt_lvl) {
+  int seg_id;
+  // scale is the multiplier for lf_deltas:
+  // the multiplier is 1 when default_filt_lvl is between 0 and 31;
+  // 2 when default_filt_lvl is between 32 and 63
+  const int scale = 1 << (default_filt_lvl >> 5);
+  loop_filter_info_n *const lfi = &cm->lf_info;
+  struct loopfilter *const lf = &cm->lf;
+  const struct segmentation *const seg = &cm->seg;
+
+  // update limits if sharpness has changed
+  if (lf->last_sharpness_level != lf->sharpness_level) {
+    update_sharpness(lfi, lf->sharpness_level);
+    lf->last_sharpness_level = lf->sharpness_level;
+  }
+
+  for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
+    int lvl_seg = default_filt_lvl;  // base level unless the segment overrides
+    if (segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
+      const int data = get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
+      lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ?
+                      data : default_filt_lvl + data,
+                      0, MAX_LOOP_FILTER);
+    }
+
+    if (!lf->mode_ref_delta_enabled) {
+      // we could get rid of this if we assume that deltas are set to
+      // zero when not in use; encoder always uses deltas
+      memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
+    } else {
+      int ref, mode;
+      const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
+      lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
+
+      for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) {
+        for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
+          const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale
+                                        + lf->mode_deltas[mode] * scale;
+          lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+        }
+      }
+    }
+  }
+}
+
+// Runs the vertical loop filters selected by the masks on two block rows.
+static void filter_selectively_vert_row2(int subsampling_factor,
+                                         uint8_t *s, int pitch,
+                                         unsigned int mask_16x16_l,
+                                         unsigned int mask_8x8_l,
+                                         unsigned int mask_4x4_l,
+                                         unsigned int mask_4x4_int_l,
+                                         const loop_filter_info_n *lfi_n,
+                                         const uint8_t *lfl) {
+  const int mask_shift = subsampling_factor ? 4 : 8;  // bits per mask row
+  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;  // one row of bits
+  const int lfl_forward = subsampling_factor ? 4 : 8;  // lfl step to 2nd row
+  // *_0 masks hold the first row's bits, *_1 masks the bits of the row below.
+  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
+  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
+  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
+  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
+  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  unsigned int mask;
+  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
+              mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
+       mask; mask >>= 1) {
+    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+
+    // TODO(yunqingwang): count in loopfilter functions should be removed.
+    if (mask & 1) {
+      if ((mask_16x16_0 | mask_16x16_1) & 1) {
+        if ((mask_16x16_0 & mask_16x16_1) & 1) {  // both rows; lfi0 only
+          vpx_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                   lfi0->hev_thr);
+        } else if (mask_16x16_0 & 1) {
+          vpx_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+                              lfi0->hev_thr);
+        } else {  // second row only
+          vpx_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
+                              lfi1->lim, lfi1->hev_thr);
+        }
+      }
+
+      if ((mask_8x8_0 | mask_8x8_1) & 1) {
+        if ((mask_8x8_0 & mask_8x8_1) & 1) {  // both rows
+          vpx_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                  lfi1->hev_thr);
+        } else if (mask_8x8_0 & 1) {
+          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+                             1);
+        } else {  // second row only
+          vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                             lfi1->hev_thr, 1);
+        }
+      }
+
+      if ((mask_4x4_0 | mask_4x4_1) & 1) {
+        if ((mask_4x4_0 & mask_4x4_1) & 1) {  // both rows
+          vpx_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                  lfi1->hev_thr);
+        } else if (mask_4x4_0 & 1) {
+          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+                             1);
+        } else {  // second row only
+          vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                             lfi1->hev_thr, 1);
+        }
+      }
+
+      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {  // edge 4 pixels in (s + 4)
+        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {  // both rows
+          vpx_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                  lfi1->hev_thr);
+        } else if (mask_4x4_int_0 & 1) {
+          vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                             lfi0->hev_thr, 1);
+        } else {  // second row only
+          vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
+                             lfi1->hev_thr, 1);
+        }
+      }
+    }
+
+    s += 8;  // advance 8 pixels to the next column position
+    lfl += 1;  // next filter-level entry
+    mask_16x16_0 >>= 1;
+    mask_8x8_0 >>= 1;
+    mask_4x4_0 >>= 1;
+    mask_4x4_int_0 >>= 1;
+    mask_16x16_1 >>= 1;
+    mask_8x8_1 >>= 1;
+    mask_4x4_1 >>= 1;
+    mask_4x4_int_1 >>= 1;
+  }
+}
+// High-bitdepth (uint16_t) vertical-edge filtering of two rows of 8x8 blocks.
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_filter_selectively_vert_row2(int subsampling_factor,
+                                                uint16_t *s, int pitch,
+                                                unsigned int mask_16x16_l,
+                                                unsigned int mask_8x8_l,
+                                                unsigned int mask_4x4_l,
+                                                unsigned int mask_4x4_int_l,
+                                                const loop_filter_info_n *lfi_n,
+                                                const uint8_t *lfl, int bd) {
+  const int mask_shift = subsampling_factor ? 4 : 8;  // mask bits per row
+  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int lfl_forward = subsampling_factor ? 4 : 8;  // lfl entries per row
+  // *_0 masks: first block row (at s); *_1 masks: second row (s + 8 * pitch).
+  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
+  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
+  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
+  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
+  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  unsigned int mask;
+  // Walk the columns while any of the eight masks still has a bit set.
+  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
+       mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
+       mask; mask >>= 1) {
+    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+
+    // TODO(yunqingwang): count in loopfilter functions should be removed.
+    if (mask & 1) {  // an edge in row 0 and/or row 1 at this column
+      if ((mask_16x16_0 | mask_16x16_1) & 1) {
+        if ((mask_16x16_0 & mask_16x16_1) & 1) {  // both rows
+          vpx_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                          lfi0->hev_thr, bd);  // lfi1 not passed
+        } else if (mask_16x16_0 & 1) {  // row 0 only
+          vpx_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+                                     lfi0->hev_thr, bd);
+        } else {  // row 1 only
+          vpx_highbd_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
+                                     lfi1->lim, lfi1->hev_thr, bd);
+        }
+      }
+      // 8x8 edges: both rows (dual call), row 0 only, or row 1 only.
+      if ((mask_8x8_0 | mask_8x8_1) & 1) {
+        if ((mask_8x8_0 & mask_8x8_1) & 1) {
+          vpx_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_8x8_0 & 1) {
+          vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {
+          vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+      // 4x4 edges on the block boundary, dispatched the same way.
+      if ((mask_4x4_0 | mask_4x4_1) & 1) {
+        if ((mask_4x4_0 & mask_4x4_1) & 1) {
+          vpx_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_4x4_0 & 1) {
+          vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {
+          vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+      // Internal 4x4 edges, 4 pixels past the block boundary (s + 4).
+      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
+        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
+          vpx_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_4x4_int_0 & 1) {
+          vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {
+          vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+    }
+    // Step to the next 8-pixel column: next lfl entry, next bit of each mask.
+    s += 8;
+    lfl += 1;
+    mask_16x16_0 >>= 1;
+    mask_8x8_0 >>= 1;
+    mask_4x4_0 >>= 1;
+    mask_4x4_int_0 >>= 1;
+    mask_16x16_1 >>= 1;
+    mask_8x8_1 >>= 1;
+    mask_4x4_1 >>= 1;
+    mask_4x4_int_1 >>= 1;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+// Filters the horizontal edges flagged in the masks along one row of blocks.
+static void filter_selectively_horiz(uint8_t *s, int pitch,
+                                     unsigned int mask_16x16,
+                                     unsigned int mask_8x8,
+                                     unsigned int mask_4x4,
+                                     unsigned int mask_4x4_int,
+                                     const loop_filter_info_n *lfi_n,
+                                     const uint8_t *lfl) {
+  unsigned int mask;
+  int count;
+  // count: blocks (mask bits) consumed per pass, 2 after a paired filter.
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+       mask; mask >>= count) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+    count = 1;
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        if ((mask_16x16 & 3) == 3) {  // this block and the next one
+          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 2);
+          count = 2;
+        } else {
+          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 1);
+        }
+      } else if (mask_8x8 & 1) {
+        if ((mask_8x8 & 3) == 3) {  // this block and the next one
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vpx_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, lfin->mblim, lfin->lim,
+                                    lfin->hev_thr);
+          // Internal 4x4 edges lie 4 lines further on (s + 4 * pitch).
+          if ((mask_4x4_int & 3) == 3) {
+            vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                      lfi->lim, lfi->hev_thr, lfin->mblim,
+                                      lfin->lim, lfin->hev_thr);
+          } else {
+            if (mask_4x4_int & 1)  // first block only
+              vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                                   lfi->hev_thr, 1);
+            else if (mask_4x4_int & 2)  // second block only
+              vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                   lfin->lim, lfin->hev_thr, 1);
+          }
+          count = 2;
+        } else {
+          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+          if (mask_4x4_int & 1)
+            vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                                 lfi->hev_thr, 1);
+        }
+      } else if (mask_4x4 & 1) {
+        if ((mask_4x4 & 3) == 3) {  // this block and the next one
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vpx_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, lfin->mblim, lfin->lim,
+                                    lfin->hev_thr);
+          if ((mask_4x4_int & 3) == 3) {
+            vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                      lfi->lim, lfi->hev_thr, lfin->mblim,
+                                      lfin->lim, lfin->hev_thr);
+          } else {
+            if (mask_4x4_int & 1)  // first block only
+              vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                                   lfi->hev_thr, 1);
+            else if (mask_4x4_int & 2)  // second block only
+              vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                   lfin->lim, lfin->hev_thr, 1);
+          }
+          count = 2;
+        } else {
+          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+          if (mask_4x4_int & 1)
+            vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                                 lfi->hev_thr, 1);
+        }
+      } else if (mask_4x4_int & 1) {  // only an internal 4x4 edge here
+        vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                             lfi->hev_thr, 1);
+      }
+    }
+    s += 8 * count;  // 8 pixels per block
+    lfl += count;
+    mask_16x16 >>= count;
+    mask_8x8 >>= count;
+    mask_4x4 >>= count;
+    mask_4x4_int >>= count;
+  }
+}
+// High-bitdepth (uint16_t) horizontal-edge filtering along one row of blocks.
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
+                                            unsigned int mask_16x16,
+                                            unsigned int mask_8x8,
+                                            unsigned int mask_4x4,
+                                            unsigned int mask_4x4_int,
+                                            const loop_filter_info_n *lfi_n,
+                                            const uint8_t *lfl, int bd) {
+  unsigned int mask;
+  int count;
+  // count: blocks (mask bits) consumed per pass, 2 after a paired filter.
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+       mask; mask >>= count) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+    count = 1;
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        if ((mask_16x16 & 3) == 3) {  // this block and the next one
+          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 2, bd);
+          count = 2;
+        } else {
+          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 1, bd);
+        }
+      } else if (mask_8x8 & 1) {
+        if ((mask_8x8 & 3) == 3) {  // this block and the next one
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vpx_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+                                           lfi->hev_thr, lfin->mblim, lfin->lim,
+                                           lfin->hev_thr, bd);
+          // Internal 4x4 edges lie 4 lines further on (s + 4 * pitch).
+          if ((mask_4x4_int & 3) == 3) {
+            vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr,
+                                             lfin->mblim, lfin->lim,
+                                             lfin->hev_thr, bd);
+          } else {
+            if (mask_4x4_int & 1) {  // first block only
+              vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, 1, bd);
+            } else if (mask_4x4_int & 2) {  // second block only
+              vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                          lfin->lim, lfin->hev_thr, 1, bd);
+            }
+          }
+          count = 2;
+        } else {
+          vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1, bd);
+
+          if (mask_4x4_int & 1) {
+            vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                        lfi->lim, lfi->hev_thr, 1, bd);
+          }
+        }
+      } else if (mask_4x4 & 1) {
+        if ((mask_4x4 & 3) == 3) {  // this block and the next one
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vpx_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+                                           lfi->hev_thr, lfin->mblim, lfin->lim,
+                                           lfin->hev_thr, bd);
+          if ((mask_4x4_int & 3) == 3) {
+            vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr,
+                                             lfin->mblim, lfin->lim,
+                                             lfin->hev_thr, bd);
+          } else {
+            if (mask_4x4_int & 1) {  // first block only
+              vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, 1, bd);
+            } else if (mask_4x4_int & 2) {  // second block only
+              vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                          lfin->lim, lfin->hev_thr, 1, bd);
+            }
+          }
+          count = 2;
+        } else {
+          vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1, bd);
+
+          if (mask_4x4_int & 1) {
+            vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                        lfi->lim, lfi->hev_thr, 1, bd);
+          }
+        }
+      } else if (mask_4x4_int & 1) {  // only an internal 4x4 edge here
+        vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, 1, bd);
+      }
+    }
+    s += 8 * count;  // 8 pixels per block
+    lfl += count;
+    mask_16x16 >>= count;
+    mask_8x8 >>= count;
+    mask_4x4 >>= count;
+    mask_4x4_int >>= count;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Adds the loop-filter edges of one block to lfm and records the block's
+// filter level in lfm->lfl_y.
+//
+// Bits are ORed into the y and uv masks according to the block size (32x16,
+// 32x32, etc.), the transform size, whether there were any coefficients
+// encoded, and the loop filter level of the block.
+//
+//   lfi_n     loop filter info the block's filter level is looked up in
+//   mi        mode info of the block to add
+//   shift_y   bit position of the block inside the 64-bit y masks
+//   shift_uv  bit position of the block inside the 16-bit uv masks
+//   lfm       masks being built, updated in place
+//
+// TODO(JBB) Need another function for different resolution color..
+static void build_masks(const loop_filter_info_n *const lfi_n,
+                        const MODE_INFO *mi, const int shift_y,
+                        const int shift_uv,
+                        LOOP_FILTER_MASK *lfm) {
+  const MB_MODE_INFO *mbmi = &mi->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const TX_SIZE tx_y = mbmi->tx_size;
+  // NOTE(review): the 1, 1 arguments look like x/y chroma subsampling - confirm.
+  const TX_SIZE tx_uv = get_uv_tx_size_impl(tx_y, bsize, 1, 1);
+  const int level = get_filter_level(lfi_n, mbmi);
+  int row;
+
+  // If filter level is 0 we don't loop filter.
+  if (level == 0) {
+    return;
+  }
+
+  // Store the level for every 8x8 unit the block covers; lfl_y holds 8
+  // entries per row.
+  for (row = 0; row < num_8x8_blocks_high_lookup[bsize]; ++row) {
+    memset(&lfm->lfl_y[shift_y + row * 8], level,
+           num_8x8_blocks_wide_lookup[bsize]);
+  }
+
+  // Flag the prediction-block edges.
+  // For instance if the block size is 32x16, we'll set:
+  //    above =   1111
+  //              0000
+  //    and
+  //    left  =   1000
+  //          =   1000
+  // NOTE : In this example the low bit is left most ( 1000 ) is stored as
+  //        1,  not 8...
+  //
+  // U and V set things on a 16 bit scale.
+  lfm->above_y[tx_y] |= above_prediction_mask[bsize] << shift_y;
+  lfm->above_uv[tx_uv] |= above_prediction_mask_uv[bsize] << shift_uv;
+  lfm->left_y[tx_y] |= left_prediction_mask[bsize] << shift_y;
+  lfm->left_uv[tx_uv] |= left_prediction_mask_uv[bsize] << shift_uv;
+
+  // If the block has no coefficients and is not intra we skip applying
+  // the loop filter on block edges.
+#if CONFIG_MISC_FIXES
+  if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi)) {
+#else
+  if (mbmi->skip && is_inter_block(mbmi)) {
+#endif
+    return;
+  }
+
+  // Transform edges: the transform masks are laid out for a 64x64 prediction
+  // block, so clip them to this block's footprint and shift them into place.
+  lfm->above_y[tx_y] |=
+      (size_mask[bsize] & above_64x64_txform_mask[tx_y]) << shift_y;
+  lfm->above_uv[tx_uv] |=
+      (size_mask_uv[bsize] & above_64x64_txform_mask_uv[tx_uv]) << shift_uv;
+
+  lfm->left_y[tx_y] |=
+      (size_mask[bsize] & left_64x64_txform_mask[tx_y]) << shift_y;
+  lfm->left_uv[tx_uv] |=
+      (size_mask_uv[bsize] & left_64x64_txform_mask_uv[tx_uv]) << shift_uv;
+
+  // Internal 4x4 boundaries differ from the 4x4 boundaries on the outside
+  // edge of an 8x8: they can be skipped and do not depend on the prediction
+  // block size, so they are kept in masks of their own.
+  if (tx_y == TX_4X4) {
+    lfm->int_4x4_y |= (size_mask[bsize] & 0xffffffffffffffffULL) << shift_y;
+  }
+
+  if (tx_uv == TX_4X4) {
+#if CONFIG_MISC_FIXES
+    // With CONFIG_MISC_FIXES the uv internal edges go to left_int_4x4_uv.
+    lfm->left_int_4x4_uv |= (size_mask_uv[bsize] & 0xffff) << shift_uv;
+#else
+    lfm->int_4x4_uv |= (size_mask_uv[bsize] & 0xffff) << shift_uv;
+#endif
+  }
+}
+
+// Luma-only counterpart of build_masks(): ORs the edges of one block into the
+// y masks of lfm and records its filter level, leaving the u/v masks alone.
+// It exists because for blocks < 16x16 in size, we only update u and v masks
+// on the first block.
+static void build_y_mask(const loop_filter_info_n *const lfi_n,
+                         const MODE_INFO *mi, const int shift_y,
+                         LOOP_FILTER_MASK *lfm) {
+  const MB_MODE_INFO *mbmi = &mi->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const TX_SIZE tx_size = mbmi->tx_size;
+  const int level = get_filter_level(lfi_n, mbmi);
+  int row;
+
+  // If filter level is 0 we don't loop filter.
+  if (level == 0)
+    return;
+
+  // Store the level for every 8x8 unit the block covers; lfl_y holds 8
+  // entries per row.
+  for (row = 0; row < num_8x8_blocks_high_lookup[bsize]; ++row)
+    memset(&lfm->lfl_y[shift_y + row * 8], level,
+           num_8x8_blocks_wide_lookup[bsize]);
+
+  // Prediction-block edges are flagged for every filtered block.
+  lfm->above_y[tx_size] |= above_prediction_mask[bsize] << shift_y;
+  lfm->left_y[tx_size] |= left_prediction_mask[bsize] << shift_y;
+
+  // A non-intra block without coefficients gets no transform-edge bits.
+#if CONFIG_MISC_FIXES
+  if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi)) {
+#else
+  if (mbmi->skip && is_inter_block(mbmi)) {
+#endif
+    return;
+  }
+
+  // Transform edges: the transform masks are laid out for a 64x64 prediction
+  // block, so clip them to this block's footprint and shift them into place.
+  lfm->above_y[tx_size] |=
+      (size_mask[bsize] & above_64x64_txform_mask[tx_size]) << shift_y;
+  lfm->left_y[tx_size] |=
+      (size_mask[bsize] & left_64x64_txform_mask[tx_size]) << shift_y;
+
+  // 4x4 transforms additionally flag the boundaries inside each 8x8 unit.
+  if (tx_size == TX_4X4)
+    lfm->int_4x4_y |= (size_mask[bsize] & 0xffffffffffffffffULL) << shift_y;
+}
+
+// Zeroes *lfm and then sets up the bit masks for the entire 64x64 region
+// represented by mi_row, mi_col; mi points at the region's first mode info.
+// TODO(JBB): This function only works for yv12.
+void vp10_setup_mask(VP10_COMMON *const cm, const int mi_row, const int mi_col,
+                    MODE_INFO **mi, const int mode_info_stride,
+                    LOOP_FILTER_MASK *lfm) {
+  int idx_32, idx_16, idx_8;
+  const loop_filter_info_n *const lfi_n = &cm->lf_info;
+  MODE_INFO **mip = mi;  // current position in the partition walk
+  MODE_INFO **mip2 = mi;  // second part of a block split in two
+
+  // These are offsets to the next mi in the 64x64 block. It is what gets
+  // added to the mi ptr as we go through each loop. It helps us to avoid
+  // setting up special row and column counters for each index. The last step
+  // brings us out back to the starting position.
+  const int offset_32[] = {4, (mode_info_stride << 2) - 4, 4,
+                           -(mode_info_stride << 2) - 4};
+  const int offset_16[] = {2, (mode_info_stride << 1) - 2, 2,
+                           -(mode_info_stride << 1) - 2};
+  const int offset[] = {1, mode_info_stride - 1, 1, -mode_info_stride - 1};
+
+  // Following variables represent shifts to position the current block
+  // mask over the appropriate block. A left shift of 36 (= 4 * 8 + 4) puts
+  // the bits for the final 32 by 32 block in the 64x64 4 rows down and
+  // 4 columns to the right (the y masks hold 8 bits per row, low bit first).
+  const int shift_32_y[] = {0, 4, 32, 36};
+  const int shift_16_y[] = {0, 2, 16, 18};
+  const int shift_8_y[] = {0, 1, 8, 9};
+  const int shift_32_uv[] = {0, 2, 8, 10};
+  const int shift_16_uv[] = {0, 1, 4, 5};
+  int i;
+  const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ?
+                        cm->mi_rows - mi_row : MI_BLOCK_SIZE);
+  const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
+                        cm->mi_cols - mi_col : MI_BLOCK_SIZE);
+  // max_rows / max_cols (above) clamp the region to cm->mi_rows / cm->mi_cols.
+  vp10_zero(*lfm);
+  assert(mip[0] != NULL);
+
+  // TODO(jimbankoski): Try moving most of the following code into decode
+  // loop and storing lfm in the mbmi structure so that we don't have to go
+  // through the recursive loop structure multiple times.
+  switch (mip[0]->mbmi.sb_type) {
+    case BLOCK_64X64:
+      build_masks(lfi_n, mip[0] , 0, 0, lfm);
+      break;
+    case BLOCK_64X32:  // top half, then the bottom half if it is in range
+      build_masks(lfi_n, mip[0], 0, 0, lfm);
+      mip2 = mip + mode_info_stride * 4;
+      if (4 >= max_rows)
+        break;
+      build_masks(lfi_n, mip2[0], 32, 8, lfm);
+      break;
+    case BLOCK_32X64:  // left half, then the right half if it is in range
+      build_masks(lfi_n, mip[0], 0, 0, lfm);
+      mip2 = mip + 4;
+      if (4 >= max_cols)
+        break;
+      build_masks(lfi_n, mip2[0], 4, 2, lfm);
+      break;
+    default:
+      for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
+        const int shift_y = shift_32_y[idx_32];
+        const int shift_uv = shift_32_uv[idx_32];
+        const int mi_32_col_offset = ((idx_32 & 1) << 2);
+        const int mi_32_row_offset = ((idx_32 >> 1) << 2);
+        if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
+          continue;  // whole 32x32 is outside; the loop step still moves mip
+        switch (mip[0]->mbmi.sb_type) {
+          case BLOCK_32X32:
+            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+            break;
+          case BLOCK_32X16:
+            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+            if (mi_32_row_offset + 2 >= max_rows)
+              continue;  // lower 32x16 is out of range
+            mip2 = mip + mode_info_stride * 2;
+            build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm);
+            break;
+          case BLOCK_16X32:
+            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+            if (mi_32_col_offset + 2 >= max_cols)
+              continue;  // right 16x32 is out of range
+            mip2 = mip + 2;
+            build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
+            break;
+          default:
+            for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
+              const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
+              const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
+              const int mi_16_col_offset = mi_32_col_offset +
+                  ((idx_16 & 1) << 1);
+              const int mi_16_row_offset = mi_32_row_offset +
+                  ((idx_16 >> 1) << 1);
+              // Skip 16x16 blocks whose origin is out of range.
+              if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows)
+                continue;
+
+              switch (mip[0]->mbmi.sb_type) {
+                case BLOCK_16X16:
+                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  break;
+                case BLOCK_16X8:
+                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  if (mi_16_row_offset + 1 >= max_rows)
+                    continue;  // lower 16x8 is out of range
+                  mip2 = mip + mode_info_stride;
+                  build_y_mask(lfi_n, mip2[0], shift_y+8, lfm);
+                  break;
+                case BLOCK_8X16:
+                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  if (mi_16_col_offset +1 >= max_cols)
+                    continue;  // right 8x16 is out of range
+                  mip2 = mip + 1;
+                  build_y_mask(lfi_n, mip2[0], shift_y+1, lfm);
+                  break;
+                default: {  // four 8x8 positions; uv masks from the first only
+                  const int shift_y = shift_32_y[idx_32] +
+                                      shift_16_y[idx_16] +
+                                      shift_8_y[0];
+                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  mip += offset[0];
+                  for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
+                    const int shift_y = shift_32_y[idx_32] +
+                                        shift_16_y[idx_16] +
+                                        shift_8_y[idx_8];
+                    const int mi_8_col_offset = mi_16_col_offset +
+                        ((idx_8 & 1));
+                    const int mi_8_row_offset = mi_16_row_offset +
+                        ((idx_8 >> 1));
+
+                    if (mi_8_col_offset >= max_cols ||
+                        mi_8_row_offset >= max_rows)
+                      continue;
+                    build_y_mask(lfi_n, mip[0], shift_y, lfm);
+                  }
+                  break;
+                }
+              }
+            }
+            break;
+        }
+      }
+      break;
+  }
+  // The largest loopfilter we have is 16x16 so we use the 16x16 mask
+  // for 32x32 transforms also.
+  lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
+  lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
+  lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
+  lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
+
+  // We do at least 8 tap filter on every 32x32 even if the transform size
+  // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and
+  // remove it from the 4x4.
+  lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
+  lfm->left_y[TX_4X4] &= ~left_border;
+  lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
+  lfm->above_y[TX_4X4] &= ~above_border;
+  lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
+  lfm->left_uv[TX_4X4] &= ~left_border_uv;
+  lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
+  lfm->above_uv[TX_4X4] &= ~above_border_uv;
+
+  // We do some special edge handling.
+  if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
+    const uint64_t rows = cm->mi_rows - mi_row;
+    // Each mask bit inside the border gets a 1: 8 bits per remaining row for
+    // y, 4 bits per pair of remaining rows (rounded up) for uv.
+    const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);
+    const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);
+
+    // Remove values completely outside our border.
+    for (i = 0; i < TX_32X32; i++) {  // TX_32X32 was merged into TX_16X16
+      lfm->left_y[i] &= mask_y;
+      lfm->above_y[i] &= mask_y;
+      lfm->left_uv[i] &= mask_uv;
+      lfm->above_uv[i] &= mask_uv;
+    }
+    lfm->int_4x4_y &= mask_y;
+#if CONFIG_MISC_FIXES
+    lfm->above_int_4x4_uv = lfm->left_int_4x4_uv & mask_uv;
+#else
+    lfm->int_4x4_uv &= mask_uv;
+#endif
+    // NOTE(review): above_int_4x4_uv is set only in this edge branch - confirm.
+    // We don't apply a wide loop filter on the last uv block row. If set
+    // apply the shorter one instead.
+    if (rows == 1) {
+      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
+      lfm->above_uv[TX_16X16] = 0;
+    }
+    if (rows == 5) {
+      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
+      lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
+    }
+  }
+
+  if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
+    const uint64_t columns = cm->mi_cols - mi_col;
+
+    // Each column inside the border gets a 1, the multiply copies that
+    // pattern into every row of the mask.
+    const uint64_t mask_y  = (((1 << columns) - 1)) * 0x0101010101010101ULL;
+    const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
+
+    // Internal edges are not applied on the last column of the image so
+    // we mask 1 more for the internal edges
+    const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
+
+    // Remove the bits outside the image edge.
+    for (i = 0; i < TX_32X32; i++) {
+      lfm->left_y[i] &= mask_y;
+      lfm->above_y[i] &= mask_y;
+      lfm->left_uv[i] &= mask_uv;
+      lfm->above_uv[i] &= mask_uv;
+    }
+    lfm->int_4x4_y &= mask_y;
+#if CONFIG_MISC_FIXES
+    lfm->left_int_4x4_uv &= mask_uv_int;
+#else
+    lfm->int_4x4_uv &= mask_uv_int;
+#endif
+
+    // We don't apply a wide loop filter on the last uv column. If set
+    // apply the shorter one instead.
+    if (columns == 1) {
+      lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
+      lfm->left_uv[TX_16X16] = 0;
+    }
+    if (columns == 5) {
+      lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
+      lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
+    }
+  }
+  // We don't apply a loop filter on the first column in the image, mask that
+  // out.
+  if (mi_col == 0) {
+    for (i = 0; i < TX_32X32; i++) {
+      lfm->left_y[i] &= 0xfefefefefefefefeULL;
+      lfm->left_uv[i] &= 0xeeee;
+    }
+  }
+
+  // Assert if we try to apply 2 different loop filters at the same position.
+  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8]));
+  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4]));
+  assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4]));
+  assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16]));
+  assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8]));
+  assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
+  assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
+#if CONFIG_MISC_FIXES
+  assert(!(lfm->left_int_4x4_uv & lfm->left_uv[TX_16X16]));
+#else
+  assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16]));
+#endif
+  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
+  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
+  assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
+  assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16]));
+  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
+  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
+  assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
+#if CONFIG_MISC_FIXES
+  assert(!(lfm->above_int_4x4_uv & lfm->above_uv[TX_16X16]));
+#else
+  assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
+#endif
+}
+// Applies the vertical-edge loop filters selected by the four masks to one
+// row of blocks, starting at s and advancing 8 pixels per mask bit.
+static void filter_selectively_vert(uint8_t *s, int pitch,
+                                    unsigned int mask_16x16,
+                                    unsigned int mask_8x8,
+                                    unsigned int mask_4x4,
+                                    unsigned int mask_4x4_int,
+                                    const loop_filter_info_n *lfi_n,
+                                    const uint8_t *lfl) {
+  // Bit 0 of every mask describes the block at s; each pass consumes one bit
+  // from all four masks, so the loop ends once no edge bits remain anywhere.
+  while (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) {
+    // Thresholds are selected by the filter level stored for this block.
+    const loop_filter_thresh *const thr = lfi_n->lfthr + *lfl;
+    // At most one filter runs on the block edge itself; the widest one wins.
+    if (mask_16x16 & 1)
+      vpx_lpf_vertical_16(s, pitch, thr->mblim, thr->lim, thr->hev_thr);
+    else if (mask_8x8 & 1)
+      vpx_lpf_vertical_8(s, pitch, thr->mblim, thr->lim, thr->hev_thr, 1);
+    else if (mask_4x4 & 1)
+      vpx_lpf_vertical_4(s, pitch, thr->mblim, thr->lim, thr->hev_thr, 1);
+    // The internal 4x4 edge, 4 pixels into the block, is filtered on its own.
+    if (mask_4x4_int & 1)
+      vpx_lpf_vertical_4(s + 4, pitch, thr->mblim, thr->lim, thr->hev_thr, 1);
+    // Move on to the next block: next mask bits, next level, next 8 pixels.
+    mask_4x4_int >>= 1;
+    mask_4x4 >>= 1;
+    mask_8x8 >>= 1;
+    mask_16x16 >>= 1;
+    ++lfl;
+    s += 8;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// High-bitdepth twin of filter_selectively_vert(): the same walk over the
+// masks, on 16-bit samples, with bd forwarded to every filter call.
+static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
+                                           unsigned int mask_16x16,
+                                           unsigned int mask_8x8,
+                                           unsigned int mask_4x4,
+                                           unsigned int mask_4x4_int,
+                                           const loop_filter_info_n *lfi_n,
+                                           const uint8_t *lfl, int bd) {
+  // Run until every mask is exhausted; lfl is only read while bits remain.
+  while (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) {
+    const loop_filter_thresh *const thr = &lfi_n->lfthr[*lfl];
+
+    // Left edge of the block: at most one filter, widest size first.
+    if (mask_16x16 & 1)
+      vpx_highbd_lpf_vertical_16(s, pitch, thr->mblim, thr->lim,
+                                 thr->hev_thr, bd);
+    else if (mask_8x8 & 1)
+      vpx_highbd_lpf_vertical_8(s, pitch, thr->mblim, thr->lim,
+                                thr->hev_thr, 1, bd);
+    else if (mask_4x4 & 1)
+      vpx_highbd_lpf_vertical_4(s, pitch, thr->mblim, thr->lim,
+                                thr->hev_thr, 1, bd);
+    // Interior 4x4 edge, four samples to the right of the left edge.
+    if (mask_4x4_int & 1)
+      vpx_highbd_lpf_vertical_4(s + 4, pitch, thr->mblim, thr->lim,
+                                thr->hev_thr, 1, bd);
+    // Step to the next block and expose its bit in every mask.
+    s += 8;
+    ++lfl;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+// Filters one plane of the block at (mi_row, mi_col) for any subsampling.
+void vp10_filter_block_plane_non420(VP10_COMMON *cm,
+                                   struct macroblockd_plane *plane,
+                                   MODE_INFO **mi_8x8,
+                                   int mi_row, int mi_col) {
+  const int ss_x = plane->subsampling_x;
+  const int ss_y = plane->subsampling_y;
+  const int row_step = 1 << ss_y;
+  const int col_step = 1 << ss_x;
+  const int row_step_stride = cm->mi_stride * row_step;
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t* const dst0 = dst->buf;
+  unsigned int mask_16x16[MI_BLOCK_SIZE] = {0};  // Per-row, horizontal pass.
+  unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
+  unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
+  unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};  // Used by both passes.
+  uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE];  // Filter level per block.
+  int r, c;
+  // Vertical pass; it also gathers the row masks for the horizontal pass.
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+    unsigned int mask_16x16_c = 0;  // Left-edge masks for this row only.
+    unsigned int mask_8x8_c = 0;
+    unsigned int mask_4x4_c = 0;
+    unsigned int border_mask;
+
+    // Determine the vertical edges that need filtering
+    for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
+      const MODE_INFO *mi = mi_8x8[c];
+      const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type;
+      const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi);
+      // left edge of current unit is block/partition edge -> no skip
+      const int block_edge_left = (num_4x4_blocks_wide_lookup[sb_type] > 1) ?
+          !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) : 1;
+      const int skip_this_c = skip_this && !block_edge_left;
+      // top edge of current unit is block/partition edge -> no skip
+      const int block_edge_above = (num_4x4_blocks_high_lookup[sb_type] > 1) ?
+          !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1;
+      const int skip_this_r = skip_this && !block_edge_above;
+      const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
+                            ? get_uv_tx_size(&mi[0].mbmi, plane)
+                            : mi[0].mbmi.tx_size;
+      const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
+      const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
+      // skip_border_4x4_*: set on the last mi column/row if subsampled.
+      // Filter level can vary per MI
+      if (!(lfl[(r << 3) + (c >> ss_x)] =
+            get_filter_level(&cm->lf_info, &mi[0].mbmi)))
+        continue;  // Level 0: no masks are built for this mi.
+
+      // Build masks based on the transform size of each block
+      if (tx_size == TX_32X32) {
+        if (!skip_this_c && ((c >> ss_x) & 3) == 0) {
+          if (!skip_border_4x4_c)
+            mask_16x16_c |= 1 << (c >> ss_x);
+          else
+            mask_8x8_c |= 1 << (c >> ss_x);
+        }
+        if (!skip_this_r && ((r >> ss_y) & 3) == 0) {
+          if (!skip_border_4x4_r)
+            mask_16x16[r] |= 1 << (c >> ss_x);
+          else
+            mask_8x8[r] |= 1 << (c >> ss_x);
+        }
+      } else if (tx_size == TX_16X16) {
+        if (!skip_this_c && ((c >> ss_x) & 1) == 0) {
+          if (!skip_border_4x4_c)
+            mask_16x16_c |= 1 << (c >> ss_x);
+          else
+            mask_8x8_c |= 1 << (c >> ss_x);
+        }
+        if (!skip_this_r && ((r >> ss_y) & 1) == 0) {
+          if (!skip_border_4x4_r)
+            mask_16x16[r] |= 1 << (c >> ss_x);
+          else
+            mask_8x8[r] |= 1 << (c >> ss_x);
+        }
+      } else {
+        // force 8x8 filtering on 32x32 boundaries
+        if (!skip_this_c) {
+          if (tx_size == TX_8X8 || ((c >> ss_x) & 3) == 0)
+            mask_8x8_c |= 1 << (c >> ss_x);
+          else
+            mask_4x4_c |= 1 << (c >> ss_x);
+        }
+
+        if (!skip_this_r) {
+          if (tx_size == TX_8X8 || ((r >> ss_y) & 3) == 0)
+            mask_8x8[r] |= 1 << (c >> ss_x);
+          else
+            mask_4x4[r] |= 1 << (c >> ss_x);
+        }
+
+        if (!skip_this && tx_size < TX_8X8 && !skip_border_4x4_c)
+          mask_4x4_int[r] |= 1 << (c >> ss_x);
+      }
+    }
+
+    // Disable filtering on the leftmost column
+    border_mask = ~(mi_col == 0);  // Clears bit 0 only when mi_col == 0.
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_vert(CONVERT_TO_SHORTPTR(dst->buf),
+                                     dst->stride,
+                                     mask_16x16_c & border_mask,
+                                     mask_8x8_c & border_mask,
+                                     mask_4x4_c & border_mask,
+                                     mask_4x4_int[r],
+                                     &cm->lf_info, &lfl[r << 3],
+                                     (int)cm->bit_depth);
+    } else {
+      filter_selectively_vert(dst->buf, dst->stride,
+                              mask_16x16_c & border_mask,
+                              mask_8x8_c & border_mask,
+                              mask_4x4_c & border_mask,
+                              mask_4x4_int[r],
+                              &cm->lf_info, &lfl[r << 3]);
+    }
+#else
+    filter_selectively_vert(dst->buf, dst->stride,
+                            mask_16x16_c & border_mask,
+                            mask_8x8_c & border_mask,
+                            mask_4x4_c & border_mask,
+                            mask_4x4_int[r],
+                            &cm->lf_info, &lfl[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    dst->buf += 8 * dst->stride;  // Down 8 pixel rows for the next pass.
+    mi_8x8 += row_step_stride;  // Advances by (1 << ss_y) mi rows.
+  }
+
+  // Now do horizontal pass
+  dst->buf = dst0;  // Back to the buffer position saved on entry.
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+    const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
+    const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
+
+    unsigned int mask_16x16_r;
+    unsigned int mask_8x8_r;
+    unsigned int mask_4x4_r;
+
+    if (mi_row + r == 0) {  // mi row 0: its top-edge masks are cleared.
+      mask_16x16_r = 0;
+      mask_8x8_r = 0;
+      mask_4x4_r = 0;
+    } else {
+      mask_16x16_r = mask_16x16[r];
+      mask_8x8_r = mask_8x8[r];
+      mask_4x4_r = mask_4x4[r];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                      dst->stride,
+                                      mask_16x16_r,
+                                      mask_8x8_r,
+                                      mask_4x4_r,
+                                      mask_4x4_int_r,
+                                      &cm->lf_info, &lfl[r << 3],
+                                      (int)cm->bit_depth);
+    } else {
+      filter_selectively_horiz(dst->buf, dst->stride,
+                               mask_16x16_r,
+                               mask_8x8_r,
+                               mask_4x4_r,
+                               mask_4x4_int_r,
+                               &cm->lf_info, &lfl[r << 3]);
+    }
+#else
+    filter_selectively_horiz(dst->buf, dst->stride,
+                             mask_16x16_r,
+                             mask_8x8_r,
+                             mask_4x4_r,
+                             mask_4x4_int_r,
+                             &cm->lf_info, &lfl[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    dst->buf += 8 * dst->stride;
+  }
+}
+
+// Loop-filters one plane that has no subsampling, using the luma masks and
+// filter levels held in lfm.
+//
+//   cm      supplies mi_rows, the thresholds in lf_info and, in
+//           high-bitdepth builds, use_highbitdepth and bit_depth.
+//   plane   plane to filter; subsampling_x and subsampling_y must be 0.
+//   mi_row  mode-info row of the block's first row; bounds the rows visited.
+//   lfm     edge masks (left_y, above_y, int_4x4_y) and levels (lfl_y).
+//
+// On return dst->buf has been advanced past the rows visited by the
+// horizontal pass.
+void vp10_filter_block_plane_ss00(VP10_COMMON *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm) {
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t *const plane_start = dst->buf;
+  uint64_t bits_16x16 = lfm->left_y[TX_16X16];
+  uint64_t bits_8x8 = lfm->left_y[TX_8X8];
+  uint64_t bits_4x4 = lfm->left_y[TX_4X4];
+  uint64_t bits_4x4_int = lfm->int_4x4_y;
+  int r;
+
+  assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
+
+  // Vertical edges, two mi rows per iteration: each row owns 8 mask bits, so
+  // the low 16 bits cover the pair and the buffer moves down 16 pixel rows.
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+    const unsigned int pair_16x16 = bits_16x16 & 0xffff;
+    const unsigned int pair_8x8 = bits_8x8 & 0xffff;
+    const unsigned int pair_4x4 = bits_4x4 & 0xffff;
+    const unsigned int pair_4x4_int = bits_4x4_int & 0xffff;
+    const uint8_t *const levels = &lfm->lfl_y[r << 3];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_vert_row2(
+          plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+          pair_16x16, pair_8x8, pair_4x4, pair_4x4_int, &cm->lf_info, levels,
+          (int)cm->bit_depth);
+    } else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    {
+      filter_selectively_vert_row2(
+          plane->subsampling_x, dst->buf, dst->stride, pair_16x16, pair_8x8,
+          pair_4x4, pair_4x4_int, &cm->lf_info, levels);
+    }
+
+    dst->buf += 16 * dst->stride;
+    bits_16x16 >>= 16;
+    bits_8x8 >>= 16;
+    bits_4x4 >>= 16;
+    bits_4x4_int >>= 16;
+  }
+
+  // Horizontal edges, one mi row (8 mask bits, 8 pixel rows) per iteration,
+  // restarting from the buffer position saved on entry.
+  dst->buf = plane_start;
+  bits_16x16 = lfm->above_y[TX_16X16];
+  bits_8x8 = lfm->above_y[TX_8X8];
+  bits_4x4 = lfm->above_y[TX_4X4];
+  bits_4x4_int = lfm->int_4x4_y;
+
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
+    // The top-edge masks are cleared on mi row 0; the interior mask is not.
+    const int first_frame_row = (mi_row + r == 0);
+    const unsigned int row_16x16 = first_frame_row ? 0 : (bits_16x16 & 0xff);
+    const unsigned int row_8x8 = first_frame_row ? 0 : (bits_8x8 & 0xff);
+    const unsigned int row_4x4 = first_frame_row ? 0 : (bits_4x4 & 0xff);
+    const unsigned int row_4x4_int = bits_4x4_int & 0xff;
+    const uint8_t *const levels = &lfm->lfl_y[r << 3];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_horiz(
+          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, row_16x16, row_8x8,
+          row_4x4, row_4x4_int, &cm->lf_info, levels, (int)cm->bit_depth);
+    } else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    {
+      filter_selectively_horiz(dst->buf, dst->stride, row_16x16, row_8x8,
+                               row_4x4, row_4x4_int, &cm->lf_info, levels);
+    }
+
+    dst->buf += 8 * dst->stride;
+    bits_16x16 >>= 8;
+    bits_8x8 >>= 8;
+    bits_4x4 >>= 8;
+    bits_4x4_int >>= 8;
+  }
+}
+
+// Loop-filters one plane whose subsampling_x and subsampling_y are both 1,
+// using the uv masks held in lfm.
+//
+//   cm      supplies mi_rows, the thresholds in lf_info and, in
+//           high-bitdepth builds, use_highbitdepth and bit_depth.
+//   plane   plane to filter; when its plane_type is 1 the luma filter
+//           levels are first subsampled into lfm->lfl_uv.
+//   mi_row  mode-info row of the block's first row; bounds the rows visited.
+//   lfm     edge masks (left_uv, above_uv, interior 4x4 uv) and levels;
+//           lfl_uv is written as described above.
+//
+// On return dst->buf has been advanced past the rows visited by the
+// horizontal pass.
+void vp10_filter_block_plane_ss11(VP10_COMMON *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm) {
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t *const plane_start = dst->buf;
+  uint16_t bits_16x16 = lfm->left_uv[TX_16X16];
+  uint16_t bits_8x8 = lfm->left_uv[TX_8X8];
+  uint16_t bits_4x4 = lfm->left_uv[TX_4X4];
+#if CONFIG_MISC_FIXES
+  uint16_t bits_4x4_int = lfm->left_int_4x4_uv;
+#else
+  uint16_t bits_4x4_int = lfm->int_4x4_uv;
+#endif
+  int r, c;
+
+  assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+
+  // Vertical edges. Each iteration covers four mi rows, which is two rows of
+  // blocks in this plane: 8 mask bits and 16 pixel rows.
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
+    const unsigned int pair_16x16 = bits_16x16 & 0xff;
+    const unsigned int pair_8x8 = bits_8x8 & 0xff;
+    const unsigned int pair_4x4 = bits_4x4 & 0xff;
+    const unsigned int pair_4x4_int = bits_4x4_int & 0xff;
+    uint8_t *const uv_levels = &lfm->lfl_uv[r << 1];
+
+    if (plane->plane_type == 1) {
+      // Take the luma level of every other column of mi rows r and r + 2.
+      const uint8_t *const y_row0 = &lfm->lfl_y[r << 3];
+      const uint8_t *const y_row1 = &lfm->lfl_y[(r + 2) << 3];
+      uint8_t *const uv_row1 = &lfm->lfl_uv[(r + 2) << 1];
+
+      for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
+        uv_levels[c] = y_row0[c << 1];
+        uv_row1[c] = y_row1[c << 1];
+      }
+    }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_vert_row2(
+          plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+          pair_16x16, pair_8x8, pair_4x4, pair_4x4_int, &cm->lf_info,
+          uv_levels, (int)cm->bit_depth);
+    } else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    {
+      filter_selectively_vert_row2(
+          plane->subsampling_x, dst->buf, dst->stride, pair_16x16, pair_8x8,
+          pair_4x4, pair_4x4_int, &cm->lf_info, uv_levels);
+    }
+
+    dst->buf += 16 * dst->stride;
+    bits_16x16 >>= 8;
+    bits_8x8 >>= 8;
+    bits_4x4 >>= 8;
+    bits_4x4_int >>= 8;
+  }
+
+  // Horizontal edges, restarting from the buffer position saved on entry.
+  // Each iteration covers two mi rows: 4 mask bits and 8 pixel rows.
+  dst->buf = plane_start;
+  bits_16x16 = lfm->above_uv[TX_16X16];
+  bits_8x8 = lfm->above_uv[TX_8X8];
+  bits_4x4 = lfm->above_uv[TX_4X4];
+#if CONFIG_MISC_FIXES
+  bits_4x4_int = lfm->above_int_4x4_uv;
+#else
+  bits_4x4_int = lfm->int_4x4_uv;
+#endif
+
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+    // Top-edge masks are cleared on mi row 0; the interior 4x4 mask is
+    // cleared on the last mi row of the frame.
+    const int first_frame_row = (mi_row + r == 0);
+    const int last_frame_row = (mi_row + r == cm->mi_rows - 1);
+    const unsigned int row_16x16 = first_frame_row ? 0 : (bits_16x16 & 0xf);
+    const unsigned int row_8x8 = first_frame_row ? 0 : (bits_8x8 & 0xf);
+    const unsigned int row_4x4 = first_frame_row ? 0 : (bits_4x4 & 0xf);
+    const unsigned int row_4x4_int = last_frame_row ? 0 : (bits_4x4_int & 0xf);
+    const uint8_t *const uv_levels = &lfm->lfl_uv[r << 1];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_horiz(
+          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, row_16x16, row_8x8,
+          row_4x4, row_4x4_int, &cm->lf_info, uv_levels, (int)cm->bit_depth);
+    } else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    {
+      filter_selectively_horiz(dst->buf, dst->stride, row_16x16, row_8x8,
+                               row_4x4, row_4x4_int, &cm->lf_info, uv_levels);
+    }
+
+    dst->buf += 8 * dst->stride;
+    bits_16x16 >>= 4;
+    bits_8x8 >>= 4;
+    bits_4x4 >>= 4;
+    bits_4x4_int >>= 4;
+  }
+}
+
+// Loop-filters mode-info rows [start, stop) of frame_buffer, visiting the
+// frame in steps of MI_BLOCK_SIZE rows and columns. For every block the
+// destination planes and the filter masks are set up first; plane 0 is then
+// filtered with vp10_filter_block_plane_ss00() and, unless y_only is set,
+// the remaining planes with the routine matching the subsampling of
+// planes[1]: ss11 for 1/1, ss00 for 0/0, and the non420 routine for
+// anything else.
+void vp10_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
+                          VP10_COMMON *cm,
+                          struct macroblockd_plane planes[MAX_MB_PLANE],
+                          int start, int stop, int y_only) {
+  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+  enum lf_path path = LF_PATH_SLOW;
+  LOOP_FILTER_MASK lfm;
+  int mi_row, mi_col, plane;
+
+  if (y_only ||
+      (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)) {
+    path = LF_PATH_444;
+  } else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1) {
+    path = LF_PATH_420;
+  }
+
+  for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
+    MODE_INFO **const mi_line = cm->mi_grid_visible + mi_row * cm->mi_stride;
+
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+      MODE_INFO **const mi_sb = mi_line + mi_col;
+
+      vp10_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+      // TODO(JBB): Make setup_mask work for non 420.
+      vp10_setup_mask(cm, mi_row, mi_col, mi_sb, cm->mi_stride, &lfm);
+
+      vp10_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      for (plane = 1; plane < num_planes; ++plane) {
+        struct macroblockd_plane *const pd = &planes[plane];
+
+        if (path == LF_PATH_420)
+          vp10_filter_block_plane_ss11(cm, pd, mi_row, &lfm);
+        else if (path == LF_PATH_444)
+          vp10_filter_block_plane_ss00(cm, pd, mi_row, &lfm);
+        else
+          vp10_filter_block_plane_non420(cm, pd, mi_sb, mi_row, mi_col);
+      }
+    }
+  }
+}
+
+// Filters the whole frame or, when partial_frame is set and the frame has
+// more than 8 mi rows, a band of rows starting near the middle. Does nothing
+void vp10_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
+                           VP10_COMMON *cm, MACROBLOCKD *xd,
+                           int frame_filter_level,
+                           int y_only, int partial_frame) {
+  int first_row = 0;
+  int row_count;
+  if (frame_filter_level == 0) return;  // ... when the level is 0.
+  row_count = cm->mi_rows;
+  if (partial_frame && cm->mi_rows > 8) {
+    // Start at the middle row rounded down to a multiple of 8.
+    first_row = (cm->mi_rows >> 1) & 0xfffffff8;
+    row_count = VPXMAX(cm->mi_rows / 8, 8);
+  }
+  vp10_loop_filter_frame_init(cm, frame_filter_level);
+  vp10_loop_filter_rows(frame, cm, xd->plane, first_row,
+                       first_row + row_count, y_only);
+}
+
+// Points lf_data at frame_buffer and cm, zeroes its row range (start, stop)
+// and y_only flag, and copies the plane descriptors into lf_data->planes.
+void vp10_loop_filter_data_reset(
+    LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+    struct VP10Common *cm,
+    const struct macroblockd_plane planes[MAX_MB_PLANE]) {
+  lf_data->cm = cm;
+  lf_data->frame_buffer = frame_buffer;
+  lf_data->start = lf_data->stop = lf_data->y_only = 0;
+  memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
+}
+
+int vp10_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
+  // Runs vp10_loop_filter_rows() on the range held in lf_data; returns 1.
+  vp10_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                       lf_data->start, lf_data->stop, lf_data->y_only);
+  (void)unused;  // Second argument is never read.
+  return 1; }
diff --git a/libs/libvpx/vp10/common/loopfilter.h b/libs/libvpx/vp10/common/loopfilter.h
new file mode 100644
index 0000000000..8db705aa03
--- /dev/null
+++ b/libs/libvpx/vp10/common/loopfilter.h
@@ -0,0 +1,159 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_LOOPFILTER_H_
+#define VP10_COMMON_LOOPFILTER_H_
+
+#include "vpx_ports/mem.h"
+#include "./vpx_config.h"
+
+#include "vp10/common/blockd.h"
+#include "vp10/common/seg_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LOOP_FILTER 63  // lfthr[] has MAX_LOOP_FILTER + 1 entries.
+#define MAX_SHARPNESS 7
+
+#define SIMD_WIDTH 16  // DECLARE_ALIGNED argument and threshold array length.
+
+#define MAX_MODE_LF_DELTAS      2  // Length of the mode_deltas arrays.
+
+enum lf_path {   // Route for planes 1+ in vp10_loop_filter_rows().
+  LF_PATH_420,   // planes[1] subsampled in x and y: ..._plane_ss11().
+  LF_PATH_444,   // planes[1] not subsampled, or y_only: ..._plane_ss00().
+  LF_PATH_SLOW,  // Any other subsampling: ..._plane_non420().
+};
+
+struct loopfilter {
+  int filter_level;
+  // NOTE(review): the last_* members look like previous-value copies; confirm.
+  int sharpness_level;
+  int last_sharpness_level;
+
+  uint8_t mode_ref_delta_enabled;
+  uint8_t mode_ref_delta_update;
+
+  // 0 = Intra, Last, GF, ARF
+  signed char ref_deltas[MAX_REF_FRAMES];  // One entry per reference frame.
+  signed char last_ref_deltas[MAX_REF_FRAMES];
+
+  // 0 = ZERO_MV, MV
+  signed char mode_deltas[MAX_MODE_LF_DELTAS];
+  signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
+};
+
+// One set of thresholds (mblim, lim, hev_thr) as passed to the vpx_lpf_*
+// filters. Kept aligned so that it can be loaded into vector registers.
+typedef struct {
+  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
+} loop_filter_thresh;
+
+typedef struct {  // Threshold and level lookup tables.
+  loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];  // Indexed by filter level.
+  uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
+} loop_filter_info_n;
+
+// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
+// Each 1 bit represents a position in which we want to apply the loop filter.
+// Left_ entries refer to whether we apply a filter on the border to the
+// left of the block. Above_ entries refer to whether or not to apply a
+// filter on the above border. Int_ entries refer to whether or not to
+// apply a filter on the 4x4 edges within the 8x8 block that each bit
+// represents.
+// Since each transform is accompanied by a potentially different type of
+// loop filter there is a different entry in the array for each transform size.
+typedef struct {
+  uint64_t left_y[TX_SIZES];   // Luma masks: 8 bits per row of blocks.
+  uint64_t above_y[TX_SIZES];
+  uint64_t int_4x4_y;
+  uint16_t left_uv[TX_SIZES];  // Chroma masks: 4 bits per row of blocks.
+  uint16_t above_uv[TX_SIZES];
+#if CONFIG_MISC_FIXES
+  uint16_t left_int_4x4_uv;
+  uint16_t above_int_4x4_uv;
+#else
+  uint16_t int_4x4_uv;
+#endif
+  uint8_t lfl_y[64];   // Filter level per luma block, 8 per row.
+  uint8_t lfl_uv[16];  // Filter level per chroma block, 4 per row.
+} LOOP_FILTER_MASK;
+
+/* assorted loopfilter functions which get used elsewhere */
+struct VP10Common;
+struct macroblockd;
+struct VP9LfSyncData;
+
+// This function sets up the bit masks for the entire 64x64 region represented
+// by mi_row, mi_col.
+void vp10_setup_mask(struct VP10Common *const cm,
+                    const int mi_row, const int mi_col,
+                    MODE_INFO **mi_8x8, const int mode_info_stride,
+                    LOOP_FILTER_MASK *lfm);
+
+void vp10_filter_block_plane_ss00(struct VP10Common *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm);
+
+void vp10_filter_block_plane_ss11(struct VP10Common *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm);
+
+void vp10_filter_block_plane_non420(struct VP10Common *cm,
+                                   struct macroblockd_plane *plane,
+                                   MODE_INFO **mi_8x8,
+                                   int mi_row, int mi_col);
+
+void vp10_loop_filter_init(struct VP10Common *cm);
+
+// Update the loop filter for the current frame.
+// This should be called before vp10_loop_filter_rows();
+// vp10_loop_filter_frame() calls this function directly.
+void vp10_loop_filter_frame_init(struct VP10Common *cm, int default_filt_lvl);
+
+void vp10_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
+                           struct VP10Common *cm,
+                           struct macroblockd *mbd,
+                           int filter_level,
+                           int y_only, int partial_frame);
+
+// Apply the loop filter to mode-info (mi) rows [start, stop) in frame_buffer.
+void vp10_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
+                          struct VP10Common *cm,
+                          struct macroblockd_plane planes[MAX_MB_PLANE],
+                          int start, int stop, int y_only);
+
+typedef struct LoopFilterWorkerData {
+  YV12_BUFFER_CONFIG *frame_buffer;
+  struct VP10Common *cm;
+  struct macroblockd_plane planes[MAX_MB_PLANE];  // Held by value.
+  // start, stop and y_only are passed on to vp10_loop_filter_rows().
+  int start;   // First mi row to filter.
+  int stop;    // End of the mi row range (exclusive).
+  int y_only;  // Non-zero: only plane 0 is filtered.
+} LFWorkerData;
+
+void vp10_loop_filter_data_reset(
+    LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+    struct VP10Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);
+
+// Operates on the rows described by 'lf_data'.
+int vp10_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_LOOPFILTER_H_
diff --git a/libs/libvpx/vp10/common/mfqe.c b/libs/libvpx/vp10/common/mfqe.c
new file mode 100644
index 0000000000..c715ef73eb
--- /dev/null
+++ b/libs/libvpx/vp10/common/mfqe.c
@@ -0,0 +1,394 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/postproc.h"
+
+// TODO(jackychen): Replace this function with SSE2 code. There is
+// one SSE2 implementation in vp8, so will consider how to share it
+// between vp8 and vp9.
+static void filter_by_weight(const uint8_t *src, int src_stride,
+                             uint8_t *dst, int dst_stride,
+                             int block_size, int src_weight) {
+  const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  const int rounding_bit = 1 << (MFQE_PRECISION - 1);
+  int r, c;
+
+  for (r = 0; r < block_size; r++) {
+    for (c = 0; c < block_size; c++) {
+      dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit)
+               >> MFQE_PRECISION;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp10_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride, int src_weight) {
+  filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
+}
+
+void vp10_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 int src_weight) {
+  filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
+static void filter_by_weight32x32(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int weight) {
+  vp10_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight);
+  vp10_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride,
+                            weight);
+  vp10_filter_by_weight16x16(src + src_stride * 16, src_stride,
+                            dst + dst_stride * 16, dst_stride, weight);
+  vp10_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride,
+                            dst + dst_stride * 16 + 16, dst_stride, weight);
+}
+
+static void filter_by_weight64x64(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int weight) {
+  filter_by_weight32x32(src, src_stride, dst, dst_stride, weight);
+  filter_by_weight32x32(src + 32, src_stride, dst + 32,
+                        dst_stride, weight);
+  filter_by_weight32x32(src + src_stride * 32, src_stride,
+                        dst + dst_stride * 32, dst_stride, weight);
+  filter_by_weight32x32(src + src_stride * 32 + 32, src_stride,
+                        dst + dst_stride * 32 + 32, dst_stride, weight);
+}
+
+static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
+                          int yd_stride, const uint8_t *u, const uint8_t *v,
+                          int uv_stride, uint8_t *ud, uint8_t *vd,
+                          int uvd_stride, BLOCK_SIZE block_size,
+                          int weight) {
+  if (block_size == BLOCK_16X16) {
+    vp10_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
+    vp10_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
+    vp10_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
+  } else if (block_size == BLOCK_32X32) {
+    filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
+    vp10_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
+    vp10_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
+  } else if (block_size == BLOCK_64X64) {
+    filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
+    filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
+    filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
+  }
+}
+
+// TODO(jackychen): Determine whether to replace it with assembly code.
+static void copy_mem8x8(const uint8_t *src, int src_stride,
+                        uint8_t *dst, int dst_stride) {
+  int r;
+  for (r = 0; r < 8; r++) {
+    memcpy(dst, src, 8);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void copy_mem16x16(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  int r;
+  for (r = 0; r < 16; r++) {
+    memcpy(dst, src, 16);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void copy_mem32x32(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  copy_mem16x16(src, src_stride, dst, dst_stride);
+  copy_mem16x16(src + 16, src_stride, dst + 16, dst_stride);
+  copy_mem16x16(src + src_stride * 16, src_stride,
+                dst + dst_stride * 16, dst_stride);
+  copy_mem16x16(src + src_stride * 16 + 16, src_stride,
+                dst + dst_stride * 16 + 16, dst_stride);
+}
+
+void copy_mem64x64(const uint8_t *src, int src_stride,
+                   uint8_t *dst, int dst_stride) {
+  const uint8_t *const src_lower = src + src_stride * 32;  // rows 32..63
+  uint8_t *const dst_lower = dst + dst_stride * 32;  // must use dst_stride
+  copy_mem32x32(src, src_stride, dst, dst_stride);
+  copy_mem32x32(src + 32, src_stride, dst + 32, dst_stride);
+  copy_mem32x32(src_lower, src_stride, dst_lower, dst_stride);
+  copy_mem32x32(src_lower + 32, src_stride, dst_lower + 32, dst_stride);
+}
+
+static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                       int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
+                       uint8_t *vd, int yd_stride, int uvd_stride,
+                       BLOCK_SIZE bs) {
+  if (bs == BLOCK_16X16) {
+    copy_mem16x16(y, y_stride, yd, yd_stride);
+    copy_mem8x8(u, uv_stride, ud, uvd_stride);
+    copy_mem8x8(v, uv_stride, vd, uvd_stride);
+  } else if (bs == BLOCK_32X32) {
+    copy_mem32x32(y, y_stride, yd, yd_stride);
+    copy_mem16x16(u, uv_stride, ud, uvd_stride);
+    copy_mem16x16(v, uv_stride, vd, uvd_stride);
+  } else {
+    copy_mem64x64(y, y_stride, yd, yd_stride);
+    copy_mem32x32(u, uv_stride, ud, uvd_stride);
+    copy_mem32x32(v, uv_stride, vd, uvd_stride);
+  }
+}
+
+static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
+  const int adj = qdiff >> MFQE_PRECISION;
+  if (bs == BLOCK_16X16) {
+    *sad_thr = 7 + adj;
+  } else if (bs == BLOCK_32X32) {
+    *sad_thr = 6 + adj;
+  } else {  // BLOCK_64X64
+    *sad_thr = 5 + adj;
+  }
+  *vdiff_thr = 125 + qdiff;
+}
+
+static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
+                       const uint8_t *v, int y_stride, int uv_stride,
+                       uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
+                       int uvd_stride, int qdiff) {
+  int sad, sad_thr, vdiff, vdiff_thr;
+  uint32_t sse;
+
+  get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
+
+  if (bs == BLOCK_16X16) {
+    vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+    sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+  } else if (bs == BLOCK_32X32) {
+    vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+    sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
+  } else /* if (bs == BLOCK_64X64) */ {
+    vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+    sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
+  }
+
+  // vdiff > sad * 3 means vdiff should not be too small, otherwise,
+  // it might be a lighting change in smooth area. When there is a
+  // lighting change in smooth area, it is dangerous to do MFQE.
+  if (sad > 1 && vdiff > sad * 3) {
+    const int weight = 1 << MFQE_PRECISION;
+    int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr);
+    // When ifactor equals weight, no MFQE is done.
+    if (ifactor > weight) {
+      ifactor = weight;
+    }
+    apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
+                  uvd_stride, bs, ifactor);
+  } else {
+    // Copy the block from current frame (i.e., no mfqe is done).
+    copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+               yd_stride, uvd_stride, bs);
+  }
+}
+
+static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
+  // Check the motion in current block(for inter frame),
+  // or check the motion in the correlated block in last frame (for keyframe).
+  const int mv_len_square = mi->mbmi.mv[0].as_mv.row *
+                            mi->mbmi.mv[0].as_mv.row +
+                            mi->mbmi.mv[0].as_mv.col *
+                            mi->mbmi.mv[0].as_mv.col;
+  const int mv_threshold = 100;
+  return mi->mbmi.mode >= NEARESTMV &&  // Not an intra block
+         cur_bs >= BLOCK_16X16 &&
+         mv_len_square <= mv_threshold;
+}
+
+// Process each partition in a super block, recursively.
+static void mfqe_partition(VP10_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
+                           const uint8_t *y, const uint8_t *u,
+                           const uint8_t *v, int y_stride, int uv_stride,
+                           uint8_t *yd, uint8_t *ud, uint8_t *vd,
+                           int yd_stride, int uvd_stride) {
+  int mi_offset, y_offset, uv_offset;
+  const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
+  const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
+  const int bsl = b_width_log2_lookup[bs];
+  PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
+  const BLOCK_SIZE subsize = get_subsize(bs, partition);
+
+  if (cur_bs < BLOCK_8X8) {
+    // If there are blocks smaller than 8x8, it must be on the boundary.
+    return;
+  }
+  // No MFQE on blocks smaller than 16x16
+  if (bs == BLOCK_16X16) {
+    partition = PARTITION_NONE;
+  }
+  if (bs == BLOCK_64X64) {
+    mi_offset = 4;
+    y_offset = 32;
+    uv_offset = 16;
+  } else {
+    mi_offset = 2;
+    y_offset = 16;
+    uv_offset = 8;
+  }
+  switch (partition) {
+    BLOCK_SIZE mfqe_bs, bs_tmp;
+    case PARTITION_HORZ:
+      if (bs == BLOCK_64X64) {
+        mfqe_bs = BLOCK_64X32;
+        bs_tmp = BLOCK_32X32;
+      } else {
+        mfqe_bs = BLOCK_32X16;
+        bs_tmp = BLOCK_16X16;
+      }
+      if (mfqe_decision(mi, mfqe_bs)) {
+        // Do mfqe on the first square partition.
+        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
+        // Do mfqe on the second square partition.
+        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
+      }
+      if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
+        // Do mfqe on the first square partition.
+        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+                   v + uv_offset * uv_stride, y_stride, uv_stride,
+                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
+        // Do mfqe on the second square partition.
+        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+                   u + uv_offset * uv_stride + uv_offset,
+                   v + uv_offset * uv_stride + uv_offset, y_stride,
+                   uv_stride, yd + y_offset * yd_stride + y_offset,
+                   ud + uv_offset * uvd_stride + uv_offset,
+                   vd + uv_offset * uvd_stride + uv_offset,
+                   yd_stride, uvd_stride, qdiff);
+      }
+      break;
+    case PARTITION_VERT:
+      if (bs == BLOCK_64X64) {
+        mfqe_bs = BLOCK_32X64;
+        bs_tmp = BLOCK_32X32;
+      } else {
+        mfqe_bs = BLOCK_16X32;
+        bs_tmp = BLOCK_16X16;
+      }
+      if (mfqe_decision(mi, mfqe_bs)) {
+        // Do mfqe on the first square partition.
+        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
+        // Do mfqe on the second square partition.
+        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+                   v + uv_offset * uv_stride, y_stride, uv_stride,
+                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
+      }
+      if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
+        // Do mfqe on the first square partition.
+        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
+        // Do mfqe on the second square partition.
+        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+                   u + uv_offset * uv_stride + uv_offset,
+                   v + uv_offset * uv_stride + uv_offset, y_stride,
+                   uv_stride, yd + y_offset * yd_stride + y_offset,
+                   ud + uv_offset * uvd_stride + uv_offset,
+                   vd + uv_offset * uvd_stride + uv_offset,
+                   yd_stride, uvd_stride, qdiff);
+      }
+      break;
+    case PARTITION_NONE:
+      if (mfqe_decision(mi, cur_bs)) {
+        // Do mfqe on this partition.
+        mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
+                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
+      } else {
+        // Copy the block from current frame(i.e., no mfqe is done).
+        copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+                   yd_stride, uvd_stride, bs);
+      }
+      break;
+    case PARTITION_SPLIT:
+      // Recursion on four square partitions, e.g. if bs is 64X64,
+      // then look into four 32X32 blocks in it.
+      mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
+                     yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
+                     v + uv_offset, y_stride, uv_stride, yd + y_offset,
+                     ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
+                     y + y_offset * y_stride, u + uv_offset * uv_stride,
+                     v + uv_offset * uv_stride, y_stride, uv_stride,
+                     yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+                     vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
+                     subsize, y + y_offset * y_stride + y_offset,
+                     u + uv_offset * uv_stride + uv_offset,
+                     v + uv_offset * uv_stride + uv_offset, y_stride,
+                     uv_stride, yd + y_offset * yd_stride + y_offset,
+                     ud + uv_offset * uvd_stride + uv_offset,
+                     vd + uv_offset * uvd_stride + uv_offset,
+                     yd_stride, uvd_stride);
+      break;
+    default:
+      assert(0);
+  }
+}
+
+void vp10_mfqe(VP10_COMMON *cm) {
+  int mi_row, mi_col;
+  // Current decoded frame.
+  const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+  // Last decoded frame and will store the MFQE result.
+  YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+  // Loop through each super block.
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+      MODE_INFO *mi;
+      MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
+      // Motion Info in last frame.
+      MODE_INFO *mi_prev = cm->postproc_state.prev_mi +
+                           (mi_row * cm->mi_stride + mi_col);
+      const uint32_t y_stride = show->y_stride;
+      const uint32_t uv_stride = show->uv_stride;
+      const uint32_t yd_stride = dest->y_stride;
+      const uint32_t uvd_stride = dest->uv_stride;
+      const uint32_t row_offset_y = mi_row << 3;
+      const uint32_t row_offset_uv = mi_row << 2;
+      const uint32_t col_offset_y = mi_col << 3;
+      const uint32_t col_offset_uv = mi_col << 2;
+      const uint8_t *y = show->y_buffer + row_offset_y * y_stride +
+                         col_offset_y;
+      const uint8_t *u = show->u_buffer + row_offset_uv * uv_stride +
+                         col_offset_uv;
+      const uint8_t *v = show->v_buffer + row_offset_uv * uv_stride +
+                         col_offset_uv;
+      uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
+      uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride +
+                    col_offset_uv;
+      uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride +
+                    col_offset_uv;
+      if (frame_is_intra_only(cm)) {
+        mi = mi_prev;
+      } else {
+        mi = mi_local;
+      }
+      mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
+                     vd, yd_stride, uvd_stride);
+    }
+  }
+}
diff --git a/libs/libvpx/vp10/common/mfqe.h b/libs/libvpx/vp10/common/mfqe.h
new file mode 100644
index 0000000000..7bedd119f1
--- /dev/null
+++ b/libs/libvpx/vp10/common/mfqe.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_MFQE_H_
+#define VP10_COMMON_MFQE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Multiframe Quality Enhancement.
+// The aim for MFQE is to replace pixel blocks in the current frame with
+// the correlated pixel blocks (with higher quality) in the last frame.
+// The replacement can only be taken in stationary blocks by checking
+// the motion of the blocks and other conditions such as the SAD of
+// the current block and correlated block, the variance of the block
+// difference, etc.
+void vp10_mfqe(struct VP10Common *cm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_MFQE_H_
diff --git a/libs/libvpx/vp10/common/mips/dspr2/itrans16_dspr2.c b/libs/libvpx/vp10/common/mips/dspr2/itrans16_dspr2.c
new file mode 100644
index 0000000000..3d1bd3d906
--- /dev/null
+++ b/libs/libvpx/vp10/common/mips/dspr2/itrans16_dspr2.c
@@ -0,0 +1,108 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp10_rtcd.h"
+#include "vp10/common/common.h"
+#include "vp10/common/blockd.h"
+#include "vp10/common/idct.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+void vp10_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+                                int pitch, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);
+  int16_t *outptr = out;
+  int16_t temp_out[16];
+  uint32_t pos = 45;
+
+  /* bit positon for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:     // DCT in both horizontal and vertical
+      idct16_rows_dspr2(input, outptr, 16);
+      idct16_cols_add_blk_dspr2(out, dest, pitch);
+      break;
+    case ADST_DCT:    // ADST in vertical, DCT in horizontal
+      idct16_rows_dspr2(input, outptr, 16);
+
+      outptr = out;
+
+      for (i = 0; i < 16; ++i) {
+        iadst16_dspr2(outptr, temp_out);
+
+        for (j = 0; j < 16; ++j)
+          dest[j * pitch + i] =
+                    clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                      + dest[j * pitch + i]);
+        outptr += 16;
+      }
+      break;
+    case DCT_ADST:    // DCT in vertical, ADST in horizontal
+    {
+      int16_t temp_in[16 * 16];
+
+      for (i = 0; i < 16; ++i) {
+        /* prefetch row */
+        prefetch_load((const uint8_t *)(input + 16));
+
+        iadst16_dspr2(input, outptr);
+        input += 16;
+        outptr += 16;
+      }
+
+      for (i = 0; i < 16; ++i)
+        for (j = 0; j < 16; ++j)
+            temp_in[j * 16 + i] = out[i * 16 + j];
+
+      idct16_cols_add_blk_dspr2(temp_in, dest, pitch);
+    }
+    break;
+    case ADST_ADST:   // ADST in both directions
+    {
+      int16_t temp_in[16];
+
+      for (i = 0; i < 16; ++i) {
+        /* prefetch row */
+        prefetch_load((const uint8_t *)(input + 16));
+
+        iadst16_dspr2(input, outptr);
+        input += 16;
+        outptr += 16;
+      }
+
+      for (i = 0; i < 16; ++i) {
+        for (j = 0; j < 16; ++j)
+          temp_in[j] = out[j * 16 + i];
+        iadst16_dspr2(temp_in, temp_out);
+        for (j = 0; j < 16; ++j)
+          dest[j * pitch + i] =
+                    clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                      + dest[j * pitch + i]);
+      }
+    }
+    break;
+    default:
+      printf("vp10_short_iht16x16_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vp10/common/mips/dspr2/itrans4_dspr2.c b/libs/libvpx/vp10/common/mips/dspr2/itrans4_dspr2.c
new file mode 100644
index 0000000000..5249287b85
--- /dev/null
+++ b/libs/libvpx/vp10/common/mips/dspr2/itrans4_dspr2.c
@@ -0,0 +1,97 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp10_rtcd.h"
+#include "vp10/common/common.h"
+#include "vp10/common/blockd.h"
+#include "vp10/common/idct.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+void vp10_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+  int16_t *outptr = out;
+  int16_t temp_in[4 * 4], temp_out[4];
+  uint32_t pos = 45;
+
+  /* bit positon for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:   // DCT in both horizontal and vertical
+      vpx_idct4_rows_dspr2(input, outptr);
+      vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      break;
+    case ADST_DCT:  // ADST in vertical, DCT in horizontal
+      vpx_idct4_rows_dspr2(input, outptr);
+
+      outptr = out;
+
+      for (i = 0; i < 4; ++i) {
+        iadst4_dspr2(outptr, temp_out);
+
+        for (j = 0; j < 4; ++j)
+          dest[j * dest_stride + i] =
+                    clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                      + dest[j * dest_stride + i]);
+
+        outptr += 4;
+      }
+      break;
+    case DCT_ADST:  // DCT in vertical, ADST in horizontal
+      for (i = 0; i < 4; ++i) {
+        iadst4_dspr2(input, outptr);
+        input  += 4;
+        outptr += 4;
+      }
+
+      for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 4; ++j) {
+          temp_in[i * 4 + j] = out[j * 4 + i];
+        }
+      }
+      vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      break;
+    case ADST_ADST:  // ADST in both directions
+      for (i = 0; i < 4; ++i) {
+        iadst4_dspr2(input, outptr);
+        input  += 4;
+        outptr += 4;
+      }
+
+      for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 4; ++j)
+          temp_in[j] = out[j * 4 + i];
+        iadst4_dspr2(temp_in, temp_out);
+
+        for (j = 0; j < 4; ++j)
+          dest[j * dest_stride + i] =
+                  clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                      + dest[j * dest_stride + i]);
+      }
+      break;
+    default:
+      printf("vp10_short_iht4x4_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vp10/common/mips/dspr2/itrans8_dspr2.c b/libs/libvpx/vp10/common/mips/dspr2/itrans8_dspr2.c
new file mode 100644
index 0000000000..b25b93aee0
--- /dev/null
+++ b/libs/libvpx/vp10/common/mips/dspr2/itrans8_dspr2.c
@@ -0,0 +1,93 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp10_rtcd.h"
+#include "vp10/common/common.h"
+#include "vp10/common/blockd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+void vp10_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  int16_t *outptr = out;
+  int16_t temp_in[8 * 8], temp_out[8];
+  uint32_t pos = 45;
+
+  /* bit positon for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:     // DCT in both horizontal and vertical
+      idct8_rows_dspr2(input, outptr, 8);
+      idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      break;
+    case ADST_DCT:    // ADST in vertical, DCT in horizontal
+      idct8_rows_dspr2(input, outptr, 8);
+
+      for (i = 0; i < 8; ++i) {
+        iadst8_dspr2(&out[i * 8], temp_out);
+
+        for (j = 0; j < 8; ++j)
+          dest[j * dest_stride + i] =
+                    clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                      + dest[j * dest_stride + i]);
+      }
+      break;
+    case DCT_ADST:    // DCT in vertical, ADST in horizontal
+      for (i = 0; i < 8; ++i) {
+        iadst8_dspr2(input, outptr);
+        input += 8;
+        outptr += 8;
+      }
+
+      for (i = 0; i < 8; ++i) {
+        for (j = 0; j < 8; ++j) {
+          temp_in[i * 8 + j] = out[j * 8 + i];
+        }
+      }
+      idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      break;
+    case ADST_ADST:   // ADST in both directions
+      for (i = 0; i < 8; ++i) {
+        iadst8_dspr2(input, outptr);
+        input += 8;
+        outptr += 8;
+      }
+
+      for (i = 0; i < 8; ++i) {
+        for (j = 0; j < 8; ++j)
+          temp_in[j] = out[j * 8 + i];
+
+        iadst8_dspr2(temp_in, temp_out);
+
+        for (j = 0; j < 8; ++j)
+          dest[j * dest_stride + i] =
+                clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                      + dest[j * dest_stride + i]);
+      }
+      break;
+    default:
+      printf("vp10_short_iht8x8_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vp10/common/mips/msa/idct16x16_msa.c b/libs/libvpx/vp10/common/mips/msa/idct16x16_msa.c
new file mode 100644
index 0000000000..a89e41b3dd
--- /dev/null
+++ b/libs/libvpx/vp10/common/mips/msa/idct16x16_msa.c
@@ -0,0 +1,81 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/enums.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vp10_iht16x16_256_add_msa(const int16_t *input, uint8_t *dst,
+                              int32_t dst_stride, int32_t tx_type) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+  int16_t *out_ptr = &out[0];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      /* transform rows */
+      for (i = 0; i < 2; ++i) {
+        /* process 16 * 8 block */
+        vpx_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+      }
+
+      /* transform columns */
+      for (i = 0; i < 2; ++i) {
+        /* process 8 * 16 block */
+        vpx_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+                                         dst_stride);
+      }
+      break;
+    case ADST_DCT:
+      /* transform rows */
+      for (i = 0; i < 2; ++i) {
+        /* process 16 * 8 block */
+        vpx_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+      }
+
+      /* transform columns */
+      for (i = 0; i < 2; ++i) {
+        vpx_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                          (dst + (i << 3)), dst_stride);
+      }
+      break;
+    case DCT_ADST:
+      /* transform rows */
+      for (i = 0; i < 2; ++i) {
+        /* process 16 * 8 block */
+        vpx_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+      }
+
+      /* transform columns */
+      for (i = 0; i < 2; ++i) {
+        /* process 8 * 16 block */
+        vpx_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+                                         dst_stride);
+      }
+      break;
+    case ADST_ADST:
+      /* transform rows */
+      for (i = 0; i < 2; ++i) {
+        /* process 16 * 8 block */
+        vpx_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+      }
+
+      /* transform columns */
+      for (i = 0; i < 2; ++i) {
+        vpx_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                          (dst + (i << 3)), dst_stride);
+      }
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
diff --git a/libs/libvpx/vp10/common/mips/msa/idct4x4_msa.c b/libs/libvpx/vp10/common/mips/msa/idct4x4_msa.c
new file mode 100644
index 0000000000..866f321ab7
--- /dev/null
+++ b/libs/libvpx/vp10/common/mips/msa/idct4x4_msa.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/enums.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vp10_iht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
+                           int32_t dst_stride, int32_t tx_type) {
+  v8i16 in0, in1, in2, in3;
+
+  /* load vector elements of 4x4 block */
+  LD4x4_SH(input, in0, in1, in2, in3);
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  switch (tx_type) {  /* selects IDCT or IADST for each 1-D pass */
+    case DCT_DCT:
+      /* DCT in horizontal */
+      VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      /* DCT in vertical */
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case ADST_DCT:
+      /* DCT in horizontal */
+      VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      /* ADST in vertical */
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case DCT_ADST:
+      /* ADST in horizontal */
+      VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      /* DCT in vertical */
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case ADST_ADST:
+      /* ADST in horizontal */
+      VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      /* ADST in vertical */
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    default:
+      assert(0);  /* unknown tx_type; no transform is run if NDEBUG */
+      break;
+  }
+
+  /* final rounding (add 2^3, divide by 2^4) and shift */
+  SRARI_H4_SH(in0, in1, in2, in3, 4);
+  /* add the residual to the pixels at dst and store the 4x4 block */
+  ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
diff --git a/libs/libvpx/vp10/common/mips/msa/idct8x8_msa.c b/libs/libvpx/vp10/common/mips/msa/idct8x8_msa.c
new file mode 100644
index 0000000000..726af4e9ec
--- /dev/null
+++ b/libs/libvpx/vp10/common/mips/msa/idct8x8_msa.c
@@ -0,0 +1,80 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/enums.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vp10_iht8x8_64_add_msa(const int16_t *input, uint8_t *dst,
+                           int32_t dst_stride, int32_t tx_type) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+  /* load vector elements of 8x8 block */
+  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+
+  switch (tx_type) {  /* selects IDCT or ADST for each 1-D pass */
+    case DCT_DCT:
+      /* DCT in horizontal */
+      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+      /* DCT in vertical */
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case ADST_DCT:
+      /* DCT in horizontal */
+      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+      /* ADST in vertical */
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case DCT_ADST:
+      /* ADST in horizontal */
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      /* DCT in vertical */
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case ADST_ADST:
+      /* ADST in horizontal */
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      /* ADST in vertical */
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    default:
+      assert(0);  /* unknown tx_type; no transform is run if NDEBUG */
+      break;
+  }
+
+  /* final rounding (add 2^4, divide by 2^5) and shift */
+  SRARI_H4_SH(in0, in1, in2, in3, 5);
+  SRARI_H4_SH(in4, in5, in6, in7, 5);
+
+  /* add block and store 8x8, as two halves of four rows each */
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+  dst += (4 * dst_stride);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
diff --git a/libs/libvpx/vp10/common/mips/msa/mfqe_msa.c b/libs/libvpx/vp10/common/mips/msa/mfqe_msa.c
new file mode 100644
index 0000000000..3a593a1a1c
--- /dev/null
+++ b/libs/libvpx/vp10/common/mips/msa/mfqe_msa.c
@@ -0,0 +1,137 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/onyxc_int.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                    uint8_t *dst_ptr, int32_t dst_stride,
+                                    int32_t src_weight) {
+  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  int32_t row;
+  uint64_t src0_d, src1_d, dst0_d, dst1_d;
+  v16i8 src0 = { 0 };
+  v16i8 src1 = { 0 };
+  v16i8 dst0 = { 0 };
+  v16i8 dst1 = { 0 };
+  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+  /* dst = (src * src_wt + dst * dst_wt) >> MFQE_PRECISION, with rounding */
+  src_wt = __msa_fill_h(src_weight);
+  dst_wt = __msa_fill_h(dst_weight);
+
+  for (row = 2; row--;) {  /* 2 passes x 4 rows */
+    LD2(src_ptr, src_stride, src0_d, src1_d);
+    src_ptr += (2 * src_stride);
+    LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
+    INSERT_D2_SB(src0_d, src1_d, src0);
+    INSERT_D2_SB(dst0_d, dst1_d, dst0);
+
+    LD2(src_ptr, src_stride, src0_d, src1_d);
+    src_ptr += (2 * src_stride);
+    LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
+    INSERT_D2_SB(src0_d, src1_d, src1);
+    INSERT_D2_SB(dst0_d, dst1_d, dst1);
+
+    UNPCK_UB_SH(src0, src_r, src_l);
+    UNPCK_UB_SH(dst0, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+    ST8x2_UB(dst0, dst_ptr, dst_stride);
+    dst_ptr += (2 * dst_stride);
+
+    UNPCK_UB_SH(src1, src_r, src_l);
+    UNPCK_UB_SH(dst1, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+    ST8x2_UB(dst1, dst_ptr, dst_stride);
+    dst_ptr += (2 * dst_stride);
+  }
+}
+
+static void filter_by_weight16x16_msa(const uint8_t *src_ptr,
+                                      int32_t src_stride,
+                                      uint8_t *dst_ptr,
+                                      int32_t dst_stride,
+                                      int32_t src_weight) {
+  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  int32_t row;
+  v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+  /* dst = (src * src_wt + dst * dst_wt) >> MFQE_PRECISION, with rounding */
+  src_wt = __msa_fill_h(src_weight);
+  dst_wt = __msa_fill_h(dst_weight);
+
+  for (row = 4; row--;) {  /* 4 passes x 4 rows */
+    LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
+
+    UNPCK_UB_SH(src0, src_r, src_l);
+    UNPCK_UB_SH(dst0, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+
+    UNPCK_UB_SH(src1, src_r, src_l);
+    UNPCK_UB_SH(dst1, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+
+    UNPCK_UB_SH(src2, src_r, src_l);
+    UNPCK_UB_SH(dst2, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+
+    UNPCK_UB_SH(src3, src_r, src_l);
+    UNPCK_UB_SH(dst3, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+  }
+}
+
+void vp10_filter_by_weight8x8_msa(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 int src_weight) {  /* calls static kernel */
+  filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight);
+}
+
+void vp10_filter_by_weight16x16_msa(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride,
+                                   int src_weight) {  /* calls static kernel */
+  filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight);
+}
diff --git a/libs/libvpx/vp10/common/mv.h b/libs/libvpx/vp10/common/mv.h
new file mode 100644
index 0000000000..b4971a567e
--- /dev/null
+++ b/libs/libvpx/vp10/common/mv.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_MV_H_
+#define VP10_COMMON_MV_H_
+
+#include "vpx/vpx_integer.h"
+
+#include "vp10/common/common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct mv {
+  int16_t row;  /* vertical component (clamped against top/bottom edges) */
+  int16_t col;  /* horizontal component (clamped against left/right edges) */
+} MV;
+
+typedef union int_mv {
+  uint32_t as_int;  /* row and col viewed together as one 32-bit word */
+  MV as_mv;
+} int_mv; /* facilitates faster equality tests and copies */
+
+typedef struct mv32 {
+  int32_t row;  /* same fields as MV, but 32 bits wide */
+  int32_t col;
+} MV32;
+
+static INLINE int is_zero_mv(const MV *mv) {  /* 1 iff both components are 0 */
+  return mv->row == 0 && mv->col == 0;  /* no type-punned uint32_t load */
+}
+
+static INLINE int is_equal_mv(const MV *a, const MV *b) {  /* 1 iff a == b */
+  return a->row == b->row && a->col == b->col;  /* no type-punned load */
+}
+
+static INLINE void clamp_mv(MV *mv, int min_col, int max_col,
+                            int min_row, int max_row) {  // modifies *mv in place
+  mv->col = clamp(mv->col, min_col, max_col);
+  mv->row = clamp(mv->row, min_row, max_row);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_MV_H_
diff --git a/libs/libvpx/vp10/common/mvref_common.c b/libs/libvpx/vp10/common/mvref_common.c
new file mode 100644
index 0000000000..1ef80c21aa
--- /dev/null
+++ b/libs/libvpx/vp10/common/mvref_common.c
@@ -0,0 +1,243 @@
+
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/mvref_common.h"
+
+// Collects candidate reference mvs for ref_frame into mv_ref_list from spatial
+// neighbours and, if enabled, the previous frame; also sets mode_context.
+static void find_mv_refs_idx(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                             MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                             int_mv *mv_ref_list,
+                             int block, int mi_row, int mi_col,
+                             find_mv_refs_sync sync, void *const data,
+                             uint8_t *mode_context) {
+  const int *ref_sign_bias = cm->ref_frame_sign_bias;
+  int i, refmv_count = 0;
+  const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
+  int different_ref_found = 0;
+  int context_counter = 0;
+  const MV_REF *const  prev_frame_mvs = cm->use_prev_frame_mvs ?
+      cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
+  const TileInfo *const tile = &xd->tile;
+  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type] << 3;
+  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type] << 3;
+
+#if !CONFIG_MISC_FIXES
+  // Blank the reference vector list
+  memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+#endif
+
+  // The nearest 2 blocks are treated differently
+  // if the size < 8x8 we get the mv from the bmi substructure,
+  // and we also need to keep a mode count.
+  for (i = 0; i < 2; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
+                                                   xd->mi_stride];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+      // Keep counts for entropy encoding.
+      context_counter += mode_2_counter[candidate->mode];
+      different_ref_found = 1;  // set for any in-bounds neighbour
+
+      if (candidate->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
+                        refmv_count, mv_ref_list, bw, bh, xd, Done);
+      else if (candidate->ref_frame[1] == ref_frame)
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
+                        refmv_count, mv_ref_list, bw, bh, xd, Done);
+    }
+  }
+
+  // Check the rest of the neighbors in much the same way
+  // as before except we don't need to keep track of sub blocks or
+  // mode counts.
+  for (; i < MVREF_NEIGHBOURS; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row *
+                                                    xd->mi_stride]->mbmi;
+      different_ref_found = 1;
+
+      if (candidate->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list,
+                        bw, bh, xd, Done);
+      else if (candidate->ref_frame[1] == ref_frame)
+        ADD_MV_REF_LIST(candidate->mv[1], refmv_count, mv_ref_list,
+                        bw, bh, xd, Done);
+    }
+  }
+
+  // TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast
+  // on windows platform. The sync here is unnecessary if use_prev_frame_mvs
+  // is 0. But after removing it, there will be hang in the unit test on windows
+  // due to several threads waiting for a thread's signal.
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+    if (cm->frame_parallel_decode && sync != NULL) {
+      sync(data, mi_row);
+    }
+#endif
+
+  // Check the last frame's mode and mv info.
+  if (cm->use_prev_frame_mvs) {
+    // Synchronize here for frame parallel decode if sync function is provided.
+    if (cm->frame_parallel_decode && sync != NULL) {
+      sync(data, mi_row);
+    }
+
+    if (prev_frame_mvs->ref_frame[0] == ref_frame) {
+      ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list,
+                      bw, bh, xd, Done);
+    } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
+      ADD_MV_REF_LIST(prev_frame_mvs->mv[1], refmv_count, mv_ref_list,
+                      bw, bh, xd, Done);
+    }
+  }
+
+  // Since we couldn't find 2 mvs from the same reference frame
+  // go back through the neighbors and find motion vectors from
+  // different reference frames.
+  if (different_ref_found) {
+    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+      const POSITION *mv_ref = &mv_ref_search[i];
+      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+        const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row
+                                              * xd->mi_stride]->mbmi;
+
+        // If the candidate is INTRA we don't want to consider its mv.
+        IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
+                                 refmv_count, mv_ref_list, bw, bh, xd, Done);
+      }
+    }
+  }
+
+  // Since we still don't have a candidate we'll try the last frame.
+  if (cm->use_prev_frame_mvs) {
+    if (prev_frame_mvs->ref_frame[0] != ref_frame &&
+        prev_frame_mvs->ref_frame[0] > INTRA_FRAME) {
+      int_mv mv = prev_frame_mvs->mv[0];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done);
+    }
+
+    if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
+#if !CONFIG_MISC_FIXES
+        prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int &&
+#endif
+        prev_frame_mvs->ref_frame[1] != ref_frame) {
+      int_mv mv = prev_frame_mvs->mv[1];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done);
+    }
+  }
+
+ Done:
+
+  mode_context[ref_frame] = counter_to_context[context_counter];
+
+#if CONFIG_MISC_FIXES
+  for (i = refmv_count; i < MAX_MV_REF_CANDIDATES; ++i)
+      mv_ref_list[i].as_int = 0;
+#else
+  // Clamp vectors
+  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
+    clamp_mv_ref(&mv_ref_list[i].as_mv, bw, bh, xd);
+#endif
+}
+
+void vp10_find_mv_refs(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list,
+                      int mi_row, int mi_col,
+                      find_mv_refs_sync sync, void *const data,
+                      uint8_t *mode_context) {  // block = -1: no sub-block
+  find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1,
+                   mi_row, mi_col, sync, data, mode_context);
+}
+
+static void lower_mv_precision(MV *mv, int allow_hp) {
+  // High precision permitted and usable for this vector: leave it untouched.
+  if (allow_hp && vp10_use_mv_hp(mv))
+    return;
+
+  // Otherwise make each odd component even by stepping one unit toward zero.
+  if (mv->row & 1) mv->row += (mv->row > 0) ? -1 : 1;
+  if (mv->col & 1) mv->col += (mv->col > 0) ? -1 : 1;
+}
+
+void vp10_find_best_ref_mvs(int allow_hp, int_mv *mvlist,
+                           int_mv *nearest_mv, int_mv *near_mv) {
+  int_mv *const list_end = mvlist + MAX_MV_REF_CANDIDATES;
+  int_mv *cand;
+  // Lower every candidate to the precision permitted by allow_hp.
+  for (cand = mvlist; cand != list_end; ++cand)
+    lower_mv_precision(&cand->as_mv, allow_hp);
+  // The first two list entries are reported as "nearest" and "near".
+  *nearest_mv = mvlist[0];
+  *near_mv = mvlist[1];
+}
+
+void vp10_append_sub8x8_mvs_for_idx(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                   int block, int ref, int mi_row, int mi_col,
+                                   int_mv *nearest_mv, int_mv *near_mv,
+                                   uint8_t *mode_context) {
+  int_mv mv_list[MAX_MV_REF_CANDIDATES];
+  MODE_INFO *const mi = xd->mi[0];
+  b_mode_info *bmi = mi->bmi;
+  int n;
+
+  assert(MAX_MV_REF_CANDIDATES == 2);
+  // Gather the two block-level candidates (no sync callback is passed).
+  find_mv_refs_idx(cm, xd, mi, mi->mbmi.ref_frame[ref], mv_list, block,
+                   mi_row, mi_col, NULL, NULL, mode_context);
+
+  near_mv->as_int = 0;  // stays zero if no differing candidate is found
+  switch (block) {
+    case 0:  // first sub-block: use the candidate list as is
+      nearest_mv->as_int = mv_list[0].as_int;
+      near_mv->as_int = mv_list[1].as_int;
+      break;
+    case 1:
+    case 2:  // nearest = sub-block 0's mv; near = first differing list entry
+      nearest_mv->as_int = bmi[0].as_mv[ref].as_int;
+      for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n)
+        if (nearest_mv->as_int != mv_list[n].as_int) {
+          near_mv->as_int = mv_list[n].as_int;
+          break;
+        }
+      break;
+    case 3: {  // nearest = sub-block 2's mv; try sub-blocks 1, 0, then list
+      int_mv candidates[2 + MAX_MV_REF_CANDIDATES];
+      candidates[0] = bmi[1].as_mv[ref];
+      candidates[1] = bmi[0].as_mv[ref];
+      candidates[2] = mv_list[0];
+      candidates[3] = mv_list[1];
+
+      nearest_mv->as_int = bmi[2].as_mv[ref].as_int;
+      for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n)
+        if (nearest_mv->as_int != candidates[n].as_int) {
+          near_mv->as_int = candidates[n].as_int;
+          break;
+        }
+      break;
+    }
+    default:
+      assert(0 && "Invalid block index.");
+  }
+}
diff --git a/libs/libvpx/vp10/common/mvref_common.h b/libs/libvpx/vp10/common/mvref_common.h
new file mode 100644
index 0000000000..0a98866149
--- /dev/null
+++ b/libs/libvpx/vp10/common/mvref_common.h
@@ -0,0 +1,239 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP10_COMMON_MVREF_COMMON_H_
+#define VP10_COMMON_MVREF_COMMON_H_
+
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MVREF_NEIGHBOURS 8  // neighbour positions listed per block size
+
+typedef struct position {
+  int row;  // row offset from the current block, in mode-info units
+  int col;  // column offset from the current block, in mode-info units
+} POSITION;
+
+typedef enum {
+  BOTH_ZERO = 0,
+  ZERO_PLUS_PREDICTED = 1,
+  BOTH_PREDICTED = 2,
+  NEW_PLUS_NON_INTRA = 3,
+  BOTH_NEW = 4,
+  INTRA_PLUS_NON_INTRA = 5,
+  BOTH_INTRA = 6,
+  INVALID_CASE = 9  // filler for counter_to_context slots that cannot occur
+} motion_vector_context;
+
+// This is used to figure out a context for the ref blocks. The code flattens
+// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
+// adding 9 for each intra block, 3 for each zero mv and 1 for each new
+// motion vector (NEARESTMV/NEARMV add 0). The summed number is then
+// converted into a context with a single lookup ( counter_to_context ).
+static const int mode_2_counter[MB_MODE_COUNT] = {
+  9,  // DC_PRED
+  9,  // V_PRED
+  9,  // H_PRED
+  9,  // D45_PRED
+  9,  // D135_PRED
+  9,  // D117_PRED
+  9,  // D153_PRED
+  9,  // D207_PRED
+  9,  // D63_PRED
+  9,  // TM_PRED
+  0,  // NEARESTMV
+  0,  // NEARMV
+  3,  // ZEROMV
+  1,  // NEWMV
+};
+
+// The index is the sum of mode_2_counter[] over at most two neighbours, so
+// it lies in 0..18 (18 = two intra blocks). Sums that no pair of modes can
+// produce are filled with INVALID_CASE (enum value 9) and are never used.
+static const int counter_to_context[19] = {
+  BOTH_PREDICTED,  // 0
+  NEW_PLUS_NON_INTRA,  // 1
+  BOTH_NEW,  // 2
+  ZERO_PLUS_PREDICTED,  // 3
+  NEW_PLUS_NON_INTRA,  // 4
+  INVALID_CASE,  // 5
+  BOTH_ZERO,  // 6
+  INVALID_CASE,  // 7
+  INVALID_CASE,  // 8
+  INTRA_PLUS_NON_INTRA,  // 9
+  INTRA_PLUS_NON_INTRA,  // 10
+  INVALID_CASE,  // 11
+  INTRA_PLUS_NON_INTRA,  // 12
+  INVALID_CASE,  // 13
+  INVALID_CASE,  // 14
+  INVALID_CASE,  // 15
+  INVALID_CASE,  // 16
+  INVALID_CASE,  // 17
+  BOTH_INTRA  // 18
+};
+
+static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
+  // 4X4 -- every entry is a {row, col} offset in mode-info units
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 4X8
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X4
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X8
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X16
+  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
+  // 16X8
+  {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
+  // 16X16
+  {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 16X32
+  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
+  // 32X16
+  {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 32X32
+  {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 32X64
+  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
+  // 64X32
+  {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
+  // 64X64
+  {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
+};
+
+static const int idx_n_column_to_subblock[4][2] = {  // [block][col == 0]
+  {1, 2},
+  {1, 3},
+  {3, 2},
+  {3, 3}
+};
+
+// Slack that clamp_mv_ref() adds outside the mb_to_*_edge limits.
+#if CONFIG_MISC_FIXES
+#define MV_BORDER (8 << 3)  // Allow 8 pels in 1/8th pel units
+#else
+#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
+#endif
+
+static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
+#if CONFIG_MISC_FIXES
+  clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER,
+               xd->mb_to_right_edge + bw * 8 + MV_BORDER,
+               xd->mb_to_top_edge - bh * 8 - MV_BORDER,
+               xd->mb_to_bottom_edge + bh * 8 + MV_BORDER);
+#else
+  (void) bw;  // block size only widens the window with CONFIG_MISC_FIXES
+  (void) bh;
+  clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
+               xd->mb_to_right_edge + MV_BORDER,
+               xd->mb_to_top_edge - MV_BORDER,
+               xd->mb_to_bottom_edge + MV_BORDER);
+#endif
+}
+
+// Returns the candidate's sub-block mv when block_idx >= 0 and the candidate
+// is smaller than 8x8; otherwise returns the candidate's whole-block mv.
+static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
+                                      int search_col, int block_idx) {
+  return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
+          ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
+              .as_mv[which_mv]
+          : candidate->mbmi.mv[which_mv];
+}
+
+
+// Negates (does not rescale) the mv when the two frames' sign biases differ.
+static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
+                              const MV_REFERENCE_FRAME this_ref_frame,
+                              const int *ref_sign_bias) {
+  int_mv mv = mbmi->mv[ref];
+  if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
+    mv.as_mv.row *= -1;
+    mv.as_mv.col *= -1;
+  }
+  return mv;
+}
+
+#if CONFIG_MISC_FIXES
+#define CLIP_IN_ADD(mv, bw, bh, xd) clamp_mv_ref(mv, bw, bh, xd)
+#else
+#define CLIP_IN_ADD(mv, bw, bh, xd) do {} while (0)  // clamped after Done
+#endif
+
+// Stores mv in slot refmv_count (0 or 1) of mv_ref_list. If that was slot 1
+// and it differs from slot 0, the list is full: the count becomes 2 and
+// control jumps to Done. Otherwise (first mv or duplicate) the count is 1.
+#define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, bw, bh, xd, Done) \
+  do { \
+    (mv_ref_list)[(refmv_count)] = (mv); \
+    CLIP_IN_ADD(&(mv_ref_list)[(refmv_count)].as_mv, (bw), (bh), (xd)); \
+    if (refmv_count && (mv_ref_list)[1].as_int != (mv_ref_list)[0].as_int) { \
+        (refmv_count) = 2; \
+        goto Done; \
+    } \
+    (refmv_count) = 1; \
+  } while (0)
+
+// For an inter-coded mbmi, add the sign-adjusted mv of every reference that
+// is not ref_frame (without MISC_FIXES an identical second mv is skipped).
+#define IF_DIFF_REF_FRAME_ADD_MV(mbmi, ref_frame, ref_sign_bias, refmv_count, \
+                                 mv_ref_list, bw, bh, xd, Done) \
+  do { \
+    if (is_inter_block(mbmi)) { \
+      if ((mbmi)->ref_frame[0] != ref_frame) \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \
+                        refmv_count, mv_ref_list, bw, bh, xd, Done); \
+      if (has_second_ref(mbmi) && \
+          (CONFIG_MISC_FIXES || \
+           (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) && \
+          (mbmi)->ref_frame[1] != ref_frame) \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \
+                        refmv_count, mv_ref_list, bw, bh, xd, Done); \
+    } \
+  } while (0)
+
+
+// Returns non-zero when the position mi_pos, taken relative to (mi_row,
+// mi_col), lies within rows [0, mi_rows) and within the tile's column range.
+static INLINE int is_inside(const TileInfo *const tile,
+                            int mi_col, int mi_row, int mi_rows,
+                            const POSITION *mi_pos) {
+  const int row = mi_row + mi_pos->row;
+  const int col = mi_col + mi_pos->col;
+  return row >= 0 && row < mi_rows &&
+         col >= tile->mi_col_start && col < tile->mi_col_end;
+}
+
+typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
+void vp10_find_mv_refs(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list, int mi_row, int mi_col,
+                      find_mv_refs_sync sync, void *const data,
+                      uint8_t *mode_context);
+
+// Lowers every candidate in mvlist to the precision allowed by allow_hp, then
+// returns the first two entries as nearest_mv and near_mv. (No SAD scoring is
+// done by the implementation in mvref_common.c.)
+void vp10_find_best_ref_mvs(int allow_hp,
+                           int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv);
+
+void vp10_append_sub8x8_mvs_for_idx(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                   int block, int ref, int mi_row, int mi_col,
+                                   int_mv *nearest_mv, int_mv *near_mv,
+                                   uint8_t *mode_context);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_MVREF_COMMON_H_
diff --git a/libs/libvpx/vp10/common/onyxc_int.h b/libs/libvpx/vp10/common/onyxc_int.h
new file mode 100644
index 0000000000..ffef73312a
--- /dev/null
+++ b/libs/libvpx/vp10/common/onyxc_int.h
@@ -0,0 +1,494 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_ONYXC_INT_H_
+#define VP10_COMMON_ONYXC_INT_H_
+
+#include "./vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_util/vpx_thread.h"
+#include "./vp10_rtcd.h"
+#include "vp10/common/alloccommon.h"
+#include "vp10/common/loopfilter.h"
+#include "vp10/common/entropymv.h"
+#include "vp10/common/entropy.h"
+#include "vp10/common/entropymode.h"
+#include "vp10/common/frame_buffers.h"
+#include "vp10/common/quant_common.h"
+#include "vp10/common/tile_common.h"
+
+#if CONFIG_VP9_POSTPROC
+#include "vp10/common/postproc.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1)
+
+#define REF_FRAMES_LOG2 3
+#define REF_FRAMES (1 << REF_FRAMES_LOG2)
+
+// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
+// in parallel, 3 for scaled references on the encoder.
+// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
+// of framebuffers.
+// TODO(jkoleszar): These 3 extra references could probably come from the
+// normal reference pool.
+#define FRAME_BUFFERS (REF_FRAMES + 7)
+
+#define FRAME_CONTEXTS_LOG2 2
+#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
+
+#define NUM_PING_PONG_BUFFERS 2
+
+typedef enum {
+  SINGLE_REFERENCE,       // 0
+  COMPOUND_REFERENCE,     // 1
+  REFERENCE_MODE_SELECT,  // 2
+  REFERENCE_MODES,        // 3
+} REFERENCE_MODE;
+
+typedef enum {
+  RESET_FRAME_CONTEXT_NONE,     // 0
+  RESET_FRAME_CONTEXT_CURRENT,  // 1
+  RESET_FRAME_CONTEXT_ALL,      // 2
+} RESET_FRAME_CONTEXT_MODE;
+
+typedef enum {
+  /**
+   * Leave the frame context untouched.
+   */
+  REFRESH_FRAME_CONTEXT_OFF = 0,
+  /**
+   * Refresh the frame context with the forward probability updates
+   * that were signaled in the frame header.
+   */
+  REFRESH_FRAME_CONTEXT_FORWARD = 1,
+  /**
+   * Refresh the frame context with backward probability updates
+   * derived from the entropy/counts of the decoded frame.
+   */
+  REFRESH_FRAME_CONTEXT_BACKWARD = 2,
+} REFRESH_FRAME_CONTEXT_MODE;
+
+typedef struct {  // two motion vectors plus two reference frame ids
+  int_mv mv[2];
+  MV_REFERENCE_FRAME ref_frame[2];  // presumably ref_frame[k] belongs to mv[k] - TODO confirm
+} MV_REF;
+
+typedef struct {
+  int ref_count;  // 0 marks the buffer as free (see get_free_fb)
+  MV_REF *mvs;
+  int mi_rows;
+  int mi_cols;
+  vpx_codec_frame_buffer_t raw_frame_buffer;
+  YV12_BUFFER_CONFIG buf;  // returned by get_ref_frame / get_frame_new_buffer
+
+  // The following variables will only be used in frame parallel decode.
+
+  // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
+  // that no FrameWorker owns, or is decoding, this buffer.
+  VPxWorker *frame_worker_owner;
+
+  // row and col indicate which position frame has been decoded to in real
+  // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX
+  // when the frame is fully decoded.
+  int row;
+  int col;
+} RefCntBuffer;
+
+typedef struct BufferPool {
+  // Protect BufferPool from being accessed by several FrameWorkers at
+  // the same time during frame parallel decode.
+  // TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t pool_mutex;  // taken by lock_buffer_pool/unlock_buffer_pool
+#endif
+
+  // Private data associated with the frame buffer callbacks.
+  void *cb_priv;
+
+  vpx_get_frame_buffer_cb_fn_t get_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_fb_cb;
+
+  RefCntBuffer frame_bufs[FRAME_BUFFERS];  // searched by get_free_fb
+
+  // Frame buffers allocated internally by the codec.
+  InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+
+typedef struct VP10Common {
+  struct vpx_internal_error_info  error;
+  vpx_color_space_t color_space;
+  int color_range;
+  int width;
+  int height;
+  int render_width;
+  int render_height;
+  int last_width;
+  int last_height;
+
+  // TODO(jkoleszar): this implies chroma ss right now, but could vary per
+  // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to
+  // support additional planes.
+  int subsampling_x;
+  int subsampling_y;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  int use_highbitdepth;  // Marks if we need to use 16bit frame buffers.
+#endif
+
+  YV12_BUFFER_CONFIG *frame_to_show;
+  RefCntBuffer *prev_frame;
+
+  // TODO(hkuang): Combine this with cur_buf in macroblockd.
+  RefCntBuffer *cur_frame;
+
+  int ref_frame_map[REF_FRAMES]; /* maps reference slot to fb_idx */
+
+  // Prepare ref_frame_map for the next frame.
+  // Only used in frame parallel decode.
+  int next_ref_frame_map[REF_FRAMES];
+
+  // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
+  // roll new_fb_idx into it.
+
+  // Each frame can reference REFS_PER_FRAME buffers
+  RefBuffer frame_refs[REFS_PER_FRAME];
+
+  int new_fb_idx;  // frame_bufs index used by get_frame_new_buffer()
+
+#if CONFIG_VP9_POSTPROC
+  YV12_BUFFER_CONFIG post_proc_buffer;
+  YV12_BUFFER_CONFIG post_proc_buffer_int;
+#endif
+
+  FRAME_TYPE last_frame_type;  /* last frame's frame type for motion search.*/
+  FRAME_TYPE frame_type;
+
+  int show_frame;
+  int last_show_frame;
+  int show_existing_frame;
+
+  // Flag signaling that the frame is encoded using only INTRA modes.
+  uint8_t intra_only;
+  uint8_t last_intra_only;
+
+  int allow_high_precision_mv;
+
+  // Flag signaling which frame contexts should be reset to default values.
+  RESET_FRAME_CONTEXT_MODE reset_frame_context;
+
+  // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
+  // MODE_INFO (8-pixel) units.
+  int MBs;
+  int mb_rows, mi_rows;
+  int mb_cols, mi_cols;
+  int mi_stride;
+
+  /* profile settings */
+  TX_MODE tx_mode;
+
+  int base_qindex;
+  int y_dc_delta_q;
+  int uv_dc_delta_q;
+  int uv_ac_delta_q;
+  int16_t y_dequant[MAX_SEGMENTS][2];
+  int16_t uv_dequant[MAX_SEGMENTS][2];
+
+  /* We allocate a MODE_INFO struct for each macroblock, together with
+     an extra row on top and column on the left to simplify prediction. */
+  int mi_alloc_size;
+  MODE_INFO *mip; /* Base of allocated array */
+  MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
+
+  // TODO(agrange): Move prev_mi into encoder structure.
+  // prev_mip and prev_mi will only be allocated in VP9 encoder.
+  MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
+  MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
+
+  // Separate mi functions between encoder and decoder.
+  int (*alloc_mi)(struct VP10Common *cm, int mi_size);
+  void (*free_mi)(struct VP10Common *cm);
+  void (*setup_mi)(struct VP10Common *cm);
+
+  // Grid of pointers to 8x8 MODE_INFO structs.  Any 8x8 not in the visible
+  // area will be NULL.
+  MODE_INFO **mi_grid_base;
+  MODE_INFO **mi_grid_visible;
+  MODE_INFO **prev_mi_grid_base;
+  MODE_INFO **prev_mi_grid_visible;
+
+  // Whether to use previous frame's motion vectors for prediction.
+  int use_prev_frame_mvs;
+
+  // Persistent mb segment id map used in prediction.
+  int seg_map_idx;
+  int prev_seg_map_idx;
+
+  uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS];
+  uint8_t *last_frame_seg_map;
+  uint8_t *current_frame_seg_map;
+  int seg_map_alloc_size;
+
+  INTERP_FILTER interp_filter;
+
+  loop_filter_info_n lf_info;
+
+  // Flag signaling how frame contexts should be updated at the end of
+  // a frame decode
+  REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
+
+  int ref_frame_sign_bias[MAX_REF_FRAMES];    /* Two state 0, 1 */
+
+  struct loopfilter lf;
+  struct segmentation seg;
+#if !CONFIG_MISC_FIXES
+  struct segmentation_probs segp;
+#endif
+
+  int frame_parallel_decode;  // frame-based threading.
+
+  // Context probabilities for reference frame prediction
+  MV_REFERENCE_FRAME comp_fixed_ref;
+  MV_REFERENCE_FRAME comp_var_ref[2];
+  REFERENCE_MODE reference_mode;
+
+  FRAME_CONTEXT *fc;  /* this frame entropy */
+  FRAME_CONTEXT *frame_contexts;   // FRAME_CONTEXTS
+  unsigned int  frame_context_idx; /* Context to use/update */
+  FRAME_COUNTS counts;
+
+  unsigned int current_video_frame;
+  BITSTREAM_PROFILE profile;
+
+  // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3.
+  vpx_bit_depth_t bit_depth;
+  vpx_bit_depth_t dequant_bit_depth;  // bit_depth of current dequantizer
+
+#if CONFIG_VP9_POSTPROC
+  struct postproc_state  postproc_state;
+#endif
+
+  int error_resilient_mode;
+
+  int log2_tile_cols, log2_tile_rows;
+  int tile_sz_mag;
+  int byte_alignment;
+  int skip_loop_filter;
+
+  // Private data associated with the frame buffer callbacks.
+  void *cb_priv;
+  vpx_get_frame_buffer_cb_fn_t get_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_fb_cb;
+
+  // Handles memory for the codec.
+  InternalFrameBufferList int_frame_buffers;
+
+  // External BufferPool passed from outside.
+  BufferPool *buffer_pool;
+
+  PARTITION_CONTEXT *above_seg_context;
+  ENTROPY_CONTEXT *above_context;
+  int above_context_alloc_cols;
+
+  // scratch memory for intraonly/keyframe forward updates from default tables
+  // - this is intentionally not placed in FRAME_CONTEXT since it's reset upon
+  // each keyframe and not used afterwards; read as [above mode][left mode]
+  vpx_prob kf_y_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1];
+} VP10_COMMON;
+
+// Acquires pool->pool_mutex when CONFIG_MULTITHREAD is set; otherwise a no-op.
+// TODO(hkuang): drop the pool-wide lock once frame ref counts are atomic.
+static void lock_buffer_pool(BufferPool *const pool) {
+#if !CONFIG_MULTITHREAD
+  (void)pool;
+#else
+  pthread_mutex_lock(&pool->pool_mutex);
+#endif
+}
+
+static void unlock_buffer_pool(BufferPool *const pool) {
+#if !CONFIG_MULTITHREAD
+  (void)pool;  // nothing to release when CONFIG_MULTITHREAD is off
+#else
+  pthread_mutex_unlock(&pool->pool_mutex);
+#endif
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP10_COMMON *cm, int index) {
+  int fb_idx;  // frame buffer index stored in the requested reference slot
+  if (index < 0 || index >= REF_FRAMES) return NULL;
+  fb_idx = cm->ref_frame_map[index];
+  if (fb_idx < 0) return NULL;
+  assert(fb_idx < FRAME_BUFFERS);
+  return &cm->buffer_pool->frame_bufs[fb_idx].buf;
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP10_COMMON *cm) {
+  return &(cm->buffer_pool->frame_bufs + cm->new_fb_idx)->buf;
+}
+
+// Finds the first entry of the pool's frame_bufs[] with a zero ref_count,
+// marks it as referenced (ref_count = 1) and returns its index, or
+// INVALID_IDX when no buffer is free. The search runs under the pool lock.
+static INLINE int get_free_fb(VP10_COMMON *cm) {
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+  int free_idx = INVALID_IDX;  // stays INVALID_IDX if every buffer is in use
+  int i;
+
+  lock_buffer_pool(cm->buffer_pool);
+  // Claim the first buffer whose reference count is zero.
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
+    if (frame_bufs[i].ref_count != 0) continue;
+    frame_bufs[i].ref_count = 1;
+    free_idx = i;
+    break;
+  }
+  unlock_buffer_pool(cm->buffer_pool);
+  return free_idx;
+}
+
+// Moves the reference held in *idx over to bufs[new_idx]: the old buffer's
+// count is dropped (if it was valid and positive), the new one is raised.
+static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) {
+  const int old_idx = *idx;
+
+  if (old_idx >= 0 && bufs[old_idx].ref_count > 0)
+    --bufs[old_idx].ref_count;
+  *idx = new_idx;
+  ++bufs[new_idx].ref_count;
+}
+
+static INLINE int mi_cols_aligned_to_sb(int n_mis) {
+  return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);  // presumably rounds n_mis up to a multiple of 1 << MI_BLOCK_SIZE_LOG2 - TODO confirm
+}
+
+static INLINE int frame_is_intra_only(const VP10_COMMON *const cm) {
+  return cm->intra_only || cm->frame_type == KEY_FRAME;  // either one suffices
+}
+
+// Points xd at the frame-level state in cm: per-plane dqcoeff buffer, above
+// entropy context and segment dequant tables, plus the shared contexts.
+static INLINE void vp10_init_macroblockd(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                        tran_low_t *dqcoeff) {
+  // Element offset between planes; keeps the original sizeof() factor.
+  const size_t plane_span =
+      sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
+  int plane;
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const int is_luma = pd->plane_type == PLANE_TYPE_Y;
+    pd->dqcoeff = dqcoeff;
+    xd->above_context[plane] = cm->above_context + plane * plane_span;
+    memcpy(pd->seg_dequant, is_luma ? cm->y_dequant : cm->uv_dequant,
+           sizeof(cm->y_dequant));
+  }
+  xd->fc = cm->fc;
+  xd->above_seg_context = cm->above_seg_context;
+  xd->mi_stride = cm->mi_stride;
+  xd->error_info = &cm->error;
+}
+
+// Sets each plane's above/left context pointers for block (mi_row, mi_col).
+static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
+  const int above_idx = mi_col * 2, left_idx = (mi_row * 2) & 15;
+  int p;
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    struct macroblockd_plane *const pd = xd->plane + p;
+    pd->above_context = xd->above_context[p] + (above_idx >> pd->subsampling_x);
+    pd->left_context = xd->left_context[p] + (left_idx >> pd->subsampling_y);
+  }
+}
+
+// Returns len (given in mi units) increased by MI_BLOCK_SIZE.
+static INLINE int calc_mi_size(int len) {
+  return MI_BLOCK_SIZE + len;
+}
+
+// Updates xd for the block whose top-left corner is (mi_row, mi_col):
+//  - mb_to_{top,bottom,left,right}_edge: offset from the block to each
+//    edge, computed as a count of mi units times MI_SIZE * 8; bh and bw are
+//    subtracted as the block's height and width in mi units.
+//  - up_available / left_available: whether a row above exists
+//    (mi_row != 0) and whether a column to the left exists inside the tile
+//    (mi_col > tile->mi_col_start).
+//  - above_mi / left_mi and the matching *_mbmi pointers, all NULL when the
+//    neighbour is unavailable or its MODE_INFO pointer is NULL.
+static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
+                                  int mi_row, int bh,
+                                  int mi_col, int bw,
+                                  int mi_rows, int mi_cols) {
+  xd->mb_to_top_edge    = -((mi_row * MI_SIZE) * 8);
+  xd->mb_to_left_edge   = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
+  xd->mb_to_right_edge  = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
+
+  // Are edges available for intra prediction?
+  xd->up_available   = (mi_row != 0);
+  xd->left_available = (mi_col > tile->mi_col_start);
+
+  // A neighbour pointer may be NULL even when the edge is available (VP9
+  // encoder's first pass), hence the second check for the mbmi pointers.
+  xd->above_mi   = xd->up_available ? xd->mi[-xd->mi_stride] : NULL;
+  xd->above_mbmi = xd->above_mi ? &xd->above_mi->mbmi : NULL;
+
+  xd->left_mi   = xd->left_available ? xd->mi[-1] : NULL;
+  xd->left_mbmi = xd->left_mi ? &xd->left_mi->mbmi : NULL;
+}
+
+// Selects the row of cm->kf_y_prob addressed by the prediction modes that
+// vp10_above_block_mode() and vp10_left_block_mode() report for `block`.
+static INLINE const vpx_prob *get_y_mode_probs(
+    const VP10_COMMON *cm, const MODE_INFO *mi, const MODE_INFO *above_mi,
+    const MODE_INFO *left_mi, int block) {
+  const PREDICTION_MODE above_mode = vp10_above_block_mode(mi, above_mi, block);
+  const PREDICTION_MODE left_mode = vp10_left_block_mode(mi, left_mi, block);
+  return cm->kf_y_prob[above_mode][left_mode];
+}
+
+// Stamps the partition context of `subsize` over bs above entries starting
+// at mi_col and bs left entries starting at (mi_row & MI_MASK), with
+// bs = num_8x8_blocks_wide_lookup[bsize].
+static INLINE void update_partition_context(MACROBLOCKD *xd,
+                                            int mi_row, int mi_col,
+                                            BLOCK_SIZE subsize,
+                                            BLOCK_SIZE bsize) {
+  // Equals num_4x4_blocks_wide_lookup[bsize] / 2.
+  const int bs = num_8x8_blocks_wide_lookup[bsize];
+  PARTITION_CONTEXT *const above_ctx = &xd->above_seg_context[mi_col];
+  PARTITION_CONTEXT *const left_ctx = &xd->left_seg_context[mi_row & MI_MASK];
+  // Partition bits of block sizes larger than the current one are set to
+  // one, and partition bits of smaller block sizes to zero.
+  memset(above_ctx, partition_context_lookup[subsize].above, bs);
+  memset(left_ctx, partition_context_lookup[subsize].left, bs);
+}
+
+// Builds the partition context index for bsize from bit bsl of the above
+// and left segment contexts at (mi_row, mi_col).
+static INLINE int partition_plane_context(const MACROBLOCKD *xd,
+                                          int mi_row, int mi_col,
+                                          BLOCK_SIZE bsize) {
+  const int bsl = mi_width_log2_lookup[bsize];
+  const int above = (xd->above_seg_context[mi_col] >> bsl) & 1;
+  const int left = (xd->left_seg_context[mi_row & MI_MASK] >> bsl) & 1;
+
+  assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
+  assert(bsl >= 0);
+  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_ONYXC_INT_H_
diff --git a/libs/libvpx/vp10/common/postproc.c b/libs/libvpx/vp10/common/postproc.c
new file mode 100644
index 0000000000..a6ea9c0eff
--- /dev/null
+++ b/libs/libvpx/vp10/common/postproc.c
@@ -0,0 +1,746 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "./vp10_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_scale/yv12config.h"
+
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/postproc.h"
+#include "vp10/common/textblit.h"
+
+#if CONFIG_VP9_POSTPROC
+static const short kernel5[] = {  // 5-tap weights; they sum to 8, matching the >> 3
+  1, 1, 4, 1, 1
+};
+
+const short vp10_rv[] = {  // rounding offsets read by the mbpost_proc_down filters
+  8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
+  0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
+  10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
+  8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+  8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
+  1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
+  3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
+  11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
+  14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
+  4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
+  7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
+  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+  3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
+  11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
+  14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
+  5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
+  0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
+};
+
+static const uint8_t q_diff_thresh = 20;
+static const uint8_t last_q_thresh = 170;
+
+/* Filters a rows x cols plane from src_ptr into dst_ptr with the 5-tap
+ * kernel5, first vertically and then horizontally (in place on dst_ptr).
+ * A pixel is left unfiltered when any tap differs from it by more than
+ * flimit. src_pixels_per_line and dst_pixels_per_line are the row strides
+ * of the source and destination; each is used to advance its own pointer. */
+void vp10_post_proc_down_and_across_c(const uint8_t *src_ptr,
+                                     uint8_t *dst_ptr,
+                                     int src_pixels_per_line,
+                                     int dst_pixels_per_line,
+                                     int rows,
+                                     int cols,
+                                     int flimit) {
+  uint8_t const *p_src;
+  uint8_t *p_dst;
+  int row, col, i, v, kernel;
+  int pitch = src_pixels_per_line;
+  uint8_t d[8];
+
+  for (row = 0; row < rows; row++) {
+    /* post_proc_down for one row */
+    p_src = src_ptr;
+    p_dst = dst_ptr;
+
+    for (col = 0; col < cols; col++) {
+      kernel = 4;  /* rounding term for the >> 3 below */
+      v = p_src[col];
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i * pitch]) > flimit)
+          goto down_skip_convolve;
+        kernel += kernel5[2 + i] * p_src[col + i * pitch];
+      }
+
+      v = (kernel >> 3);
+    down_skip_convolve:
+      p_dst[col] = v;
+    }
+
+    /* now post_proc_across */
+    p_src = dst_ptr;
+    p_dst = dst_ptr;
+
+    /* d[] delays the writes by two columns so taps read unfiltered pixels */
+    for (i = 0; i < 8; i++)
+      d[i] = p_src[i];
+
+    for (col = 0; col < cols; col++) {
+      kernel = 4;
+      v = p_src[col];
+      d[col & 7] = v;
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i]) > flimit)
+          goto across_skip_convolve;
+        kernel += kernel5[2 + i] * p_src[col + i];
+      }
+
+      d[col & 7] = (kernel >> 3);
+    across_skip_convolve:
+      if (col >= 2)
+        p_dst[col - 2] = d[(col - 2) & 7];
+    }
+
+    /* handle the last two pixels */
+    p_dst[col - 2] = d[(col - 2) & 7];
+    p_dst[col - 1] = d[(col - 1) & 7];
+
+    /* next row: each buffer advances by its own stride */
+    src_ptr += pitch;
+    dst_ptr += dst_pixels_per_line;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// 16-bit counterpart of vp10_post_proc_down_and_across_c. For every row the
+// 5-tap kernel5 filter is applied vertically from src_ptr into dst_ptr and
+// then horizontally in place on dst_ptr. A pixel keeps its value when any
+// of its five taps differs from it by more than flimit.
+//   src_pixels_per_line / dst_pixels_per_line: row strides of src and dst
+//   rows / cols: size of the area to filter
+//   flimit: largest tap-to-pixel difference that still allows filtering
+// Taps reach two rows above/below the area in src and two columns to the
+// left/right of it in dst; the first 8 pixels of each dst row are also read.
+void vp10_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
+                                            uint16_t *dst_ptr,
+                                            int src_pixels_per_line,
+                                            int dst_pixels_per_line,
+                                            int rows,
+                                            int cols,
+                                            int flimit) {
+  const int src_stride = src_pixels_per_line;
+  int row, col, tap_idx;
+  uint16_t delayed[8];  // ring buffer holding the pending horizontal outputs
+
+  for (row = 0; row < rows; row++) {
+    const uint16_t *in = src_ptr;
+    uint16_t *const out = dst_ptr;
+
+    // Vertical pass: src row -> dst row.
+    for (col = 0; col < cols; col++) {
+      const int center = in[col];
+      int acc = 4;  // rounding term for the >> 3
+
+      for (tap_idx = -2; tap_idx <= 2; tap_idx++) {
+        const int tap = in[col + tap_idx * src_stride];
+        if (abs(center - tap) > flimit)
+          break;
+        acc += kernel5[2 + tap_idx] * tap;
+      }
+
+      // tap_idx only runs past 2 when no tap exceeded flimit.
+      out[col] = (tap_idx > 2) ? (acc >> 3) : center;
+    }
+
+    // Horizontal pass, in place on the dst row.
+    in = dst_ptr;
+
+    for (tap_idx = 0; tap_idx < 8; tap_idx++)
+      delayed[tap_idx] = in[tap_idx];
+
+    for (col = 0; col < cols; col++) {
+      const int center = in[col];
+      int acc = 4;
+
+      for (tap_idx = -2; tap_idx <= 2; tap_idx++) {
+        const int tap = in[col + tap_idx];
+        if (abs(center - tap) > flimit)
+          break;
+        acc += kernel5[2 + tap_idx] * tap;
+      }
+      delayed[col & 7] = (tap_idx > 2) ? (acc >> 3) : center;
+
+      // Writes trail the reads by two columns.
+      if (col >= 2)
+        out[col - 2] = delayed[(col - 2) & 7];
+    }
+
+    // Flush the last two pending pixels.
+    out[col - 2] = delayed[(col - 2) & 7];
+    out[col - 1] = delayed[(col - 1) & 7];
+
+    // Advance both buffers to the next row.
+    src_ptr += src_stride;
+    dst_ptr += dst_pixels_per_line;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Maps q (floored at 20) to the limit handed to the mbpost_proc filters.
+static int q2mbl(int x) {
+  const int floored = x < 20 ? 20 : x;
+  const int scaled = 50 + (floored - 50) * 10 / 8;
+  return scaled * scaled / 3;
+}
+
+/* In-place horizontal smoothing: a pixel becomes the rounded mean of its
+ * 15-pixel window plus itself when 15 * sumsq - sum^2 of that window is below
+ * flimit. Rows are accessed from s[-8] to s[cols + 14]; s[-8..-1] become 0. */
+void vp10_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
+                                 int rows, int cols, int flimit) {
+  int r, c, i;
+  uint8_t *s = src;
+  uint8_t d[16];
+  for (r = 0; r < rows; r++) {
+    int sumsq = 0;
+    int sum = 0;
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i] * s[i];
+      sum += s[i];
+      d[i + 8] = 0;
+    }
+    /* d[15] is copied to s[-1] at c == 7, before the loop ever stores it. */
+    d[15] = 0;
+
+    for (c = 0; c < cols + 8; c++) {
+      int x = s[c + 7] - s[c - 8];
+      int y = s[c + 7] + s[c - 8];
+      sum += x;
+      sumsq += x * y;
+      d[c & 15] = s[c];
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[c & 15] = (8 + sum + s[c]) >> 4;
+      }
+      s[c - 8] = d[(c - 8) & 15];
+    }
+    s += pitch;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+/* 16-bit counterpart of vp10_mbpost_proc_across_ip_c: in-place horizontal
+ * smoothing where a pixel becomes the rounded mean of its 15-pixel window
+ * plus itself when 15 * sumsq - sum^2 of that window is below flimit.
+ * Each row is accessed from s[-8] to s[cols + 14]; s[-8..-1] end up zero. */
+void vp10_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch,
+                                        int rows, int cols, int flimit) {
+  int r, c, i;
+  uint16_t *s = src;
+  uint16_t d[16];
+
+  for (r = 0; r < rows; r++) {
+    int sumsq = 0;
+    int sum   = 0;
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i] * s[i];
+      sum   += s[i];
+      d[i + 8] = 0;
+    }
+    /* d[15] is copied to s[-1] at c == 7, before the loop ever stores it. */
+    d[15] = 0;
+
+    for (c = 0; c < cols + 8; c++) {
+      int x = s[c + 7] - s[c - 8];
+      int y = s[c + 7] + s[c - 8];
+
+      sum  += x;
+      sumsq += x * y;
+      d[c & 15] = s[c];
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[c & 15] = (8 + sum + s[c]) >> 4;
+      }
+      s[c - 8] = d[(c - 8) & 15];
+    }
+    s += pitch;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+/* In-place vertical smoothing per column: when 15 * sumsq - sum^2 of the
+ * 15-row window is below flimit the pixel becomes (rv + window sum + pixel)
+ * >> 4, rv coming from vp10_rv. Reads rows -8 .. rows + 14 of each column. */
+void vp10_mbpost_proc_down_c(uint8_t *dst, int pitch,
+                            int rows, int cols, int flimit) {
+  int r, c, i;
+  const short *rv3 = &vp10_rv[63 & rand()]; // NOLINT
+  for (c = 0; c < cols; c++) {
+    uint8_t *s = &dst[c];
+    int sumsq = 0;
+    int sum   = 0;
+    uint8_t d[16];
+    const short *rv2 = rv3 + ((c * 17) & 127);
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i * pitch] * s[i * pitch];
+      sum   += s[i * pitch];
+    }
+    for (r = 0; r < rows + 8; r++) {
+      sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+      sum  += s[7 * pitch] - s[-8 * pitch];
+      d[r & 15] = s[0];
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+      }
+      /* d[] holds no output for the rows above the plane until r reaches 8. */
+      if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
+      s += pitch;
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+/* 16-bit counterpart of vp10_mbpost_proc_down_c: in-place vertical smoothing
+ * per column; when 15 * sumsq - sum^2 of the 15-row window is below flimit
+ * the pixel becomes (rv + window sum + pixel) >> 4, rv coming from vp10_rv. */
+void vp10_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch,
+                                   int rows, int cols, int flimit) {
+  int r, c, i;
+  const int16_t *rv3 = &vp10_rv[63 & rand()];  // NOLINT
+  for (c = 0; c < cols; c++) {
+    uint16_t *s = &dst[c];
+    int sumsq = 0;
+    int sum = 0;
+    uint16_t d[16];
+    const int16_t *rv2 = rv3 + ((c * 17) & 127);
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i * pitch] * s[i * pitch];
+      sum += s[i * pitch];
+    }
+    for (r = 0; r < rows + 8; r++) {
+      sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+      sum += s[7 * pitch] - s[-8 * pitch];
+      d[r & 15] = s[0];
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+      }
+      /* d[] holds no output for the rows above the plane until r reaches 8. */
+      if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
+      s += pitch;
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Post-processes `source` into `post`.
+//  - Luma: down-and-across deblock from source to post, then the
+//    mbpost_proc across and down passes in place on post.
+//  - Chroma (u, v): down-and-across deblock only.
+// The deblock limit is a cubic polynomial in q rounded to nearest; the
+// mbpost_proc limit is q2mbl(q). low_var_thresh and flag are unused.
+// With CONFIG_VP9_HIGHBITDEPTH, buffers flagged YV12_FLAG_HIGHBITDEPTH take
+// the same sequence through the highbd entry points.
+static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG   *source,
+                                       YV12_BUFFER_CONFIG   *post,
+                                       int                   q,
+                                       int                   low_var_thresh,
+                                       int                   flag) {
+  const double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+  const int ppl = (int)(level + .5);
+  (void) low_var_thresh;
+  (void) flag;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  // 16-bit samples.
+  if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // Luma deblock.
+    vp10_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->y_buffer),
+                                         CONVERT_TO_SHORTPTR(post->y_buffer),
+                                         source->y_stride, post->y_stride,
+                                         source->y_height, source->y_width,
+                                         ppl);
+
+    // Luma de-macroblock, in place on post.
+    vp10_highbd_mbpost_proc_across_ip(CONVERT_TO_SHORTPTR(post->y_buffer),
+                                     post->y_stride, post->y_height,
+                                     post->y_width, q2mbl(q));
+
+    vp10_highbd_mbpost_proc_down(CONVERT_TO_SHORTPTR(post->y_buffer),
+                                post->y_stride, post->y_height,
+                                post->y_width, q2mbl(q));
+
+    // Chroma deblock.
+    vp10_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->u_buffer),
+                                         CONVERT_TO_SHORTPTR(post->u_buffer),
+                                         source->uv_stride, post->uv_stride,
+                                         source->uv_height, source->uv_width,
+                                         ppl);
+    vp10_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->v_buffer),
+                                         CONVERT_TO_SHORTPTR(post->v_buffer),
+                                         source->uv_stride, post->uv_stride,
+                                         source->uv_height, source->uv_width,
+                                         ppl);
+
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // 8-bit samples: luma deblock.
+  vp10_post_proc_down_and_across(source->y_buffer, post->y_buffer,
+                                source->y_stride, post->y_stride,
+                                source->y_height, source->y_width, ppl);
+
+  // Luma de-macroblock, in place on post.
+  vp10_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+                            post->y_width, q2mbl(q));
+
+  vp10_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+                       post->y_width, q2mbl(q));
+
+  // Chroma deblock.
+  vp10_post_proc_down_and_across(source->u_buffer, post->u_buffer,
+                                source->uv_stride, post->uv_stride,
+                                source->uv_height, source->uv_width, ppl);
+  vp10_post_proc_down_and_across(source->v_buffer, post->v_buffer,
+                                source->uv_stride, post->uv_stride,
+                                source->uv_height, source->uv_width, ppl);
+}
+
+// Runs the down-and-across deblock filter on the y, u and v planes of src,
+// writing to dst. The filter limit is a cubic polynomial in q, rounded to
+// nearest. In high-bit-depth builds src and dst must agree on
+// YV12_FLAG_HIGHBITDEPTH, which selects the 16-bit filter.
+void vp10_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                 int q) {
+  const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+                        + 0.0065 + 0.5);
+  int plane;
+
+  const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+  const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+  const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
+  const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};
+
+  uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+  const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
+           (dst->flags & YV12_FLAG_HIGHBITDEPTH));
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp10_highbd_post_proc_down_and_across(
+          CONVERT_TO_SHORTPTR(srcs[plane]), CONVERT_TO_SHORTPTR(dsts[plane]),
+          src_strides[plane], dst_strides[plane],
+          src_heights[plane], src_widths[plane], ppl);
+      continue;
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    vp10_post_proc_down_and_across(srcs[plane], dsts[plane],
+                                  src_strides[plane], dst_strides[plane],
+                                  src_heights[plane], src_widths[plane], ppl);
+  }
+}
+
+// Runs the down-and-across filter on the interior of the y, u and v planes
+// of src, writing to dst: a 2-pixel frame on every side of each plane is
+// skipped, so the filtered area is (width - 4) x (height - 4). The limit is
+// a cubic polynomial in q, rounded to nearest. In high-bit-depth builds src
+// and dst must agree on YV12_FLAG_HIGHBITDEPTH.
+void vp10_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                 int q) {
+  const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+                        + 0.0065 + 0.5);
+  int plane;
+
+  // Per-plane tables in y, u, v order.
+  const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+  const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+  const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
+  const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};
+
+  uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+  const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const int src_stride = src_strides[plane];
+    const int dst_stride = dst_strides[plane];
+    const int width = src_widths[plane] - 4;
+    const int height = src_heights[plane] - 4;
+    // First pixel inside the skipped frame: two rows down, two columns in.
+    const uint8_t *const src_plane = srcs[plane] + 2 * src_stride + 2;
+    uint8_t *const dst_plane = dsts[plane] + 2 * dst_stride + 2;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
+           (dst->flags & YV12_FLAG_HIGHBITDEPTH));
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      // The offsets above are applied before the CONVERT_TO_SHORTPTR step.
+      vp10_highbd_post_proc_down_and_across(
+          CONVERT_TO_SHORTPTR(src_plane), CONVERT_TO_SHORTPTR(dst_plane),
+          src_stride, dst_stride, height, width, ppl);
+      continue;
+    }
+#endif
+    // 8-bit samples.
+    vp10_post_proc_down_and_across(src_plane, dst_plane,
+                                  src_stride, dst_stride,
+                                  height, width, ppl);
+  }
+}
+
+static double gaussian(double sigma, double mu, double x) {
+  const double d = x - mu, norm = 1 / (sigma * sqrt(2.0 * 3.14159265));
+  return norm * exp(-d * d / (2 * sigma * sigma));  // normal density at x
+}
+
+// Rebuilds state->noise and the clamp arrays for quantizer q and level a.
+static void fillrd(struct postproc_state *state, int q, int a) {
+  // Scratch table of noise values; rand() & 0xff below indexes the first 256.
+  char table[300];
+  double sigma;
+  int filled = 0;
+  int peak;
+  int v, k;
+
+  vpx_clear_system_state();
+
+  // The spread of the distribution grows with a and shrinks as q rises.
+  sigma = a + .5 + .6 * (63 - q) / 63.0;
+
+  // Give every value v in [-32, 32) a run of round(256 * density(v)) slots, so
+  // that a uniformly drawn slot yields v with roughly gaussian probability.
+  for (v = -32; v < 32; v++) {
+    const int run = (int)(0.5 + 256 * gaussian(sigma, 0, v));
+
+    for (k = 0; k < run; k++) {
+      table[filled++] = (char)v;
+    }
+  }
+
+  // Zero whatever part of the first 256 slots the runs did not reach.
+  while (filled < 256) {
+    table[filled++] = 0;
+  }
+
+  // Draw the noise buffer from the table.
+  for (k = 0; k < 3072; k++) {
+    state->noise[k] = table[rand() & 0xff];  // NOLINT
+  }
+
+  // table[0] is the lowest value that received a run (or 0 if none did); its
+  // negation is the clamp margin, replicated across the 16-entry arrays.
+  peak = -table[0];
+  for (k = 0; k < 16; k++) {
+    state->blackclamp[k] = peak;
+    state->whiteclamp[k] = peak;
+    state->bothclamp[k] = 2 * peak;
+  }
+
+  // Remember the parameters this table was built for.
+  state->last_q = q;
+  state->last_noise = a;
+}
+
+void vp10_plane_add_noise_c(uint8_t *start, char *noise,
+                           char blackclamp[16],
+                           char whiteclamp[16],
+                           char bothclamp[16],
+                           unsigned int width, unsigned int height, int pitch) {
+  // Clamp each pixel to [lo, hi] before adding noise so the 8-bit sum cannot
+  // wrap. hi was previously 255 + whiteclamp, a bound no pixel could exceed.
+  const int lo = blackclamp[0];
+  const int hi = 255 - whiteclamp[0];
+  unsigned int i, j;
+
+  (void) bothclamp;  // TODO(jbb): used by the simd code only; normalize.
+  for (i = 0; i < height; i++) {
+    uint8_t *pos = start + i * pitch;
+    // Every row reads the noise buffer from its own random offset (0..255).
+    const char *ref = noise + (rand() & 0xff);  // NOLINT
+
+    for (j = 0; j < width; j++) {
+      int v = pos[j];
+      if (v < lo) v = lo;
+      if (v > hi) v = hi;
+      pos[j] = (uint8_t)(v + ref[j]);
+    }
+  }
+}
+
+static void swap_mi_and_prev_mi(VP10_COMMON *cm) {
+  // Exchange the two mode-info arrays: this frame's mip becomes prev_mip.
+  MODE_INFO *const finished = cm->mip;
+  cm->mip = cm->postproc_state.prev_mip;
+  cm->postproc_state.prev_mip = finished;
+
+  // Re-derive the upper left visible macroblock ptrs from the swapped bases.
+  cm->postproc_state.prev_mi = cm->postproc_state.prev_mip + cm->mi_stride + 1;
+  cm->mi = cm->mip + cm->mi_stride + 1;
+}
+
+int vp10_post_proc_frame(struct VP10Common *cm,
+                        YV12_BUFFER_CONFIG *dest, vp10_ppflags_t *ppflags) {
+  const int q = VPXMIN(105, cm->lf.filter_level * 2);
+  const int flags = ppflags->post_proc_flag;
+  YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer;
+  struct postproc_state *const ppstate = &cm->postproc_state;
+  // Post-processes cm->frame_to_show as ppflags selects; result goes to *dest.
+  if (!cm->frame_to_show)
+    return -1;  // no frame to show
+  // With no flags set, hand back the decoded frame untouched.
+  if (!flags) {
+    *dest = *cm->frame_to_show;
+    return 0;
+  }
+
+  vpx_clear_system_state();
+
+  // Alloc memory for prev_mip in the first frame.
+  if (cm->current_video_frame == 1) {
+    cm->postproc_state.last_base_qindex = cm->base_qindex;
+    cm->postproc_state.last_frame_valid = 1;
+    ppstate->prev_mip = vpx_calloc(cm->mi_alloc_size, sizeof(*cm->mip));
+    if (!ppstate->prev_mip) {
+      return 1;  // prev_mip allocation failed
+    }
+    ppstate->prev_mi = ppstate->prev_mip + cm->mi_stride + 1;
+    memset(ppstate->prev_mip, 0,
+           cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+  }
+
+  // Allocate post_proc_buffer_int if needed.
+  if ((flags & VP9D_MFQE) && !cm->post_proc_buffer_int.buffer_alloc) {
+    if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+      const int width = ALIGN_POWER_OF_TWO(cm->width, 4);
+      const int height = ALIGN_POWER_OF_TWO(cm->height, 4);
+
+      if (vpx_alloc_frame_buffer(&cm->post_proc_buffer_int, width, height,
+                                 cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 cm->use_highbitdepth,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 cm->byte_alignment) < 0) {
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate MFQE framebuffer");
+      }
+      // Pre-fill the intermediate buffer with 128 so that post proc doesn't
+      // pull random data in from edge. The length is this buffer's own
+      // frame_size; post_proc_buffer is only (re)allocated further down.
+      memset(cm->post_proc_buffer_int.buffer_alloc, 128,
+             cm->post_proc_buffer_int.frame_size);
+    }
+  }
+
+  if (vpx_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL) < 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate post-processing buffer");
+
+  if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
+      cm->postproc_state.last_frame_valid && cm->bit_depth == 8 &&
+      cm->postproc_state.last_base_qindex <= last_q_thresh &&
+      cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) {
+    vp10_mfqe(cm);
+    // TODO(jackychen): Consider whether enable deblocking by default
+    // if mfqe is enabled. Need to take both the quality and the speed
+    // into consideration.
+    if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+      vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
+    }
+    if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
+      deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
+                                 q + (ppflags->deblocking_level - 5) * 10,
+                                 1, 0);
+    } else if (flags & VP9D_DEBLOCK) {
+      vp10_deblock(&cm->post_proc_buffer_int, ppbuf, q);
+    } else {
+      vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
+    }
+  } else if (flags & VP9D_DEMACROBLOCK) {
+    deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
+                               q + (ppflags->deblocking_level - 5) * 10, 1, 0);
+  } else if (flags & VP9D_DEBLOCK) {
+    vp10_deblock(cm->frame_to_show, ppbuf, q);
+  } else {
+    vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
+  }
+
+  cm->postproc_state.last_base_qindex = cm->base_qindex;
+  cm->postproc_state.last_frame_valid = 1;
+  // NOTE(review): fillrd() saves 63 - q in last_q but q is compared below.
+  if (flags & VP9D_ADDNOISE) {
+    const int noise_level = ppflags->noise_level;
+    if (ppstate->last_q != q ||
+        ppstate->last_noise != noise_level) {
+      fillrd(ppstate, 63 - q, noise_level);
+    }
+
+    vp10_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
+                        ppstate->whiteclamp, ppstate->bothclamp,
+                        ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
+  }
+
+  *dest = *ppbuf;
+
+  /* handle problem with extending borders */
+  dest->y_width = cm->width;
+  dest->y_height = cm->height;
+  dest->uv_width = dest->y_width >> cm->subsampling_x;
+  dest->uv_height = dest->y_height >> cm->subsampling_y;
+
+  swap_mi_and_prev_mi(cm);
+  return 0;
+}
+#endif  // CONFIG_VP9_POSTPROC
diff --git a/libs/libvpx/vp10/common/postproc.h b/libs/libvpx/vp10/common/postproc.h
new file mode 100644
index 0000000000..e2ce0dcc87
--- /dev/null
+++ b/libs/libvpx/vp10/common/postproc.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_COMMON_POSTPROC_H_
+#define VP10_COMMON_POSTPROC_H_
+
+#include "vpx_ports/mem.h"
+#include "vpx_scale/yv12config.h"
+#include "vp10/common/blockd.h"
+#include "vp10/common/mfqe.h"
+#include "vp10/common/ppflags.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct postproc_state {
+  int last_q;            // q that fillrd() last built the noise table for
+  int last_noise;        // noise level that fillrd() last built the table for
+  char noise[3072];      // noise samples filled in by fillrd()
+  int last_base_qindex;  // cm->base_qindex of the last post-processed frame
+  int last_frame_valid;  // set to 1 after a frame has been post-processed
+  MODE_INFO *prev_mip;   // previous frame's mode info, swapped with cm->mip
+  MODE_INFO *prev_mi;    // prev_mip + mi_stride + 1 (first visible entry)
+  DECLARE_ALIGNED(16, char, blackclamp[16]);  // clamp margin from fillrd()
+  DECLARE_ALIGNED(16, char, whiteclamp[16]);  // same margin as blackclamp
+  DECLARE_ALIGNED(16, char, bothclamp[16]);   // twice that margin
+};
+
+struct VP10Common;
+
+#define MFQE_PRECISION 4
+
+int vp10_post_proc_frame(struct VP10Common *cm,
+                        YV12_BUFFER_CONFIG *dest, vp10_ppflags_t *flags);
+
+void vp10_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
+
+void vp10_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_POSTPROC_H_
diff --git a/libs/libvpx/vp10/common/ppflags.h b/libs/libvpx/vp10/common/ppflags.h
new file mode 100644
index 0000000000..8592fe906a
--- /dev/null
+++ b/libs/libvpx/vp10/common/ppflags.h
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_PPFLAGS_H_
+#define VP10_COMMON_PPFLAGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+  VP9D_NOFILTERING            = 0,       // frame is returned as decoded
+  VP9D_DEBLOCK                = 1 << 0,  // run vp10_deblock()
+  VP9D_DEMACROBLOCK           = 1 << 1,  // run deblock_and_de_macro_block()
+  VP9D_ADDNOISE               = 1 << 2,  // add noise to the Y plane
+  VP9D_DEBUG_TXT_FRAME_INFO   = 1 << 3,
+  VP9D_DEBUG_TXT_MBLK_MODES   = 1 << 4,
+  VP9D_DEBUG_TXT_DC_DIFF      = 1 << 5,
+  VP9D_DEBUG_TXT_RATE_INFO    = 1 << 6,
+  VP9D_DEBUG_DRAW_MV          = 1 << 7,
+  VP9D_DEBUG_CLR_BLK_MODES    = 1 << 8,
+  VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9,
+  VP9D_MFQE                   = 1 << 10  // run vp10_mfqe() when eligible
+};
+
+typedef struct {
+  int post_proc_flag;    // bitwise OR of the VP9D_* flags
+  int deblocking_level;  // shifts q by (level - 5) * 10 when demacroblocking
+  int noise_level;       // noise amount handed to fillrd() for VP9D_ADDNOISE
+} vp10_ppflags_t;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_PPFLAGS_H_
diff --git a/libs/libvpx/vp10/common/pred_common.c b/libs/libvpx/vp10/common/pred_common.c
new file mode 100644
index 0000000000..236ae54661
--- /dev/null
+++ b/libs/libvpx/vp10/common/pred_common.c
@@ -0,0 +1,339 @@
+
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/common.h"
+#include "vp10/common/pred_common.h"
+#include "vp10/common/seg_common.h"
+
+// Derives the interpolation-filter context from the left and above neighbors.
+int vp10_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
+  // A neighbor contributes its filter only when it is available and inter
+  // coded; otherwise it is treated as SWITCHABLE_FILTERS. (Mode info has a
+  // one element border above/left whose prediction flags start at 0.)
+  const MB_MODE_INFO *const left = xd->left_mbmi;
+  const MB_MODE_INFO *const above = xd->above_mbmi;
+  int left_filter = SWITCHABLE_FILTERS;
+  int above_filter = SWITCHABLE_FILTERS;
+
+  if (xd->left_available && is_inter_block(left))
+    left_filter = left->interp_filter;
+  if (xd->up_available && is_inter_block(above))
+    above_filter = above->interp_filter;
+
+  // Agreement, or exactly one informative neighbor, decides the context.
+  if (left_filter == above_filter || above_filter == SWITCHABLE_FILTERS)
+    return left_filter;
+  if (left_filter == SWITCHABLE_FILTERS)
+    return above_filter;
+  return SWITCHABLE_FILTERS;
+}
+
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real macroblocks.
+// The prediction flags in these dummy entries are initialized to 0.
+// 0 - inter/inter, inter/--, --/inter, --/--
+// 1 - intra/inter, inter/intra
+// 2 - intra/--, --/intra
+// 3 - intra/intra
+int vp10_get_intra_inter_context(const MACROBLOCKD *xd) {
+  const int above_ok = xd->up_available;
+  const int left_ok = xd->left_available;
+
+  if (!above_ok && !left_ok)  // no neighbor to learn from
+    return 0;
+
+  if (above_ok && left_ok) {
+    // Count the intra neighbors: 0 -> 0, 1 -> 1, 2 -> 3.
+    const int n_intra = !is_inter_block(xd->above_mbmi) +
+                        !is_inter_block(xd->left_mbmi);
+    return n_intra == 2 ? 3 : n_intra;
+  }
+
+  // Exactly one neighbor: 2 if it is intra, 0 otherwise.
+  return is_inter_block(above_ok ? xd->above_mbmi : xd->left_mbmi) ? 0 : 2;
+}
+
+// Returns the context used to index comp_inter_prob, derived from whether
+// the above/left neighbors use compound prediction and which frame they use.
+int vp10_get_reference_mode_context(const VP10_COMMON *cm,
+                                   const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above = xd->above_mbmi;
+  const MB_MODE_INFO *const left = xd->left_mbmi;
+  const int above_ok = xd->up_available;
+  const int left_ok = xd->left_available;
+  int ctx;
+
+  // Note: the mode info structure has a one element border above and to the
+  // left of the real macroblocks; its prediction flags are initialized to 0.
+  if (above_ok && left_ok) {  // both edges available
+    const int above_comp = has_second_ref(above);
+    const int left_comp = has_second_ref(left);
+
+    if (above_comp && left_comp) {
+      ctx = 4;  // both edges use comp pred (4)
+    } else if (above_comp || left_comp) {
+      // one of two edges uses comp pred (2/3): look at the single-ref edge
+      const MB_MODE_INFO *const single = above_comp ? left : above;
+      ctx = 2 + (single->ref_frame[0] == cm->comp_fixed_ref ||
+                 !is_inter_block(single));
+    } else {
+      // neither edge uses comp pred (0/1)
+      ctx = (above->ref_frame[0] == cm->comp_fixed_ref) ^
+            (left->ref_frame[0] == cm->comp_fixed_ref);
+    }
+  } else if (above_ok || left_ok) {  // one edge available
+    const MB_MODE_INFO *const edge = above_ok ? above : left;
+
+    // edge uses comp pred (3), or does not (0/1 by its first reference)
+    ctx = has_second_ref(edge) ? 3
+                               : edge->ref_frame[0] == cm->comp_fixed_ref;
+  } else {  // no edges available (1)
+    ctx = 1;
+  }
+
+  assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS);
+  return ctx;
+}
+
+// Returns the context used to index comp_ref_prob for the block in xd.
+int vp10_get_pred_context_comp_ref_p(const VP10_COMMON *cm,
+                                    const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Note: the mode info structure has a one element border above and to the
+  // left of the real macroblocks; its prediction flags are initialized to 0.
+  // var_ref_idx (the complement of the fixed reference's sign bias) selects
+  // which ref_frame[] entry of a compound-predicted neighbor is examined.
+  const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+  const int var_ref_idx = !fix_ref_idx;
+
+  if (above_in_image && left_in_image) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra (2)
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+      else  // comp pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx]
+                                    != cm->comp_var_ref[1]);
+    } else {  // inter/inter
+      const int l_sg = !has_second_ref(left_mbmi);
+      const int a_sg = !has_second_ref(above_mbmi);
+      const MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
+                                           : above_mbmi->ref_frame[var_ref_idx];
+      const MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
+                                           : left_mbmi->ref_frame[var_ref_idx];
+
+      if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) {
+        pred_context = 0;  // both neighbors point at comp_var_ref[1]
+      } else if (l_sg && a_sg) {  // single/single
+        if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) ||
+            (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0]))
+          pred_context = 4;
+        else if (vrfa == vrfl)
+          pred_context = 3;
+        else
+          pred_context = 1;
+      } else if (l_sg || a_sg) {  // single/comp
+        const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;  // comp neighbor
+        const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;  // single neighbor
+        if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1])
+          pred_context = 1;
+        else if (rfs == cm->comp_var_ref[1] && vrfc != cm->comp_var_ref[1])
+          pred_context = 2;
+        else
+          pred_context = 4;
+      } else if (vrfa == vrfl) {  // comp/comp
+        pred_context = 4;
+      } else {
+        pred_context = 2;
+      }
+    }
+  } else if (above_in_image || left_in_image) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi)) {
+      pred_context = 2;
+    } else {
+      if (has_second_ref(edge_mbmi))
+        pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
+                              != cm->comp_var_ref[1]);
+      else
+        pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+    }
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+  return pred_context;
+}
+
+int vp10_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+  // Note: the mode info structure has a one element border above and to the
+  // left of the real macroblocks; its prediction flags are initialized to 0.
+  // Every branch below scores how the neighbors reference LAST_FRAME; the
+  // result stays within [0, REF_CONTEXTS), as the closing assert checks.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi))
+        pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+      else
+        pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+                            edge_mbmi->ref_frame[1] == LAST_FRAME);
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mbmi);
+      const int left_has_second = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+      if (above_has_second && left_has_second) {  // comp/comp
+        pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME ||
+                            left0 == LAST_FRAME || left1 == LAST_FRAME);
+      } else if (above_has_second || left_has_second) {  // single/comp
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+        if (rfs == LAST_FRAME)
+          pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+        else
+          pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+      } else {  // single/single
+        pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME);
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+    if (!is_inter_block(edge_mbmi)) {  // intra
+      pred_context = 2;
+    } else {  // inter
+      if (!has_second_ref(edge_mbmi))
+        pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+      else
+        pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+                            edge_mbmi->ref_frame[1] == LAST_FRAME);
+    }
+  } else {  // no edges available
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+int vp10_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // Note: the mode info structure has a one element border above and to the
+  // left of the real macroblocks; its prediction flags are initialized to 0.
+  // The branches below score how the neighbors reference GOLDEN_FRAME, with
+  // LAST_FRAME and ALTREF_FRAME neighbors handled as separate cases.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi)) {
+        if (edge_mbmi->ref_frame[0] == LAST_FRAME)
+          pred_context = 3;
+        else
+          pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+      } else {
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+                                edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
+      }
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mbmi);
+      const int left_has_second = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+      if (above_has_second && left_has_second) {  // comp/comp
+        if (above0 == left0 && above1 == left1)
+          pred_context = 3 * (above0 == GOLDEN_FRAME ||
+                              above1 == GOLDEN_FRAME ||
+                              left0 == GOLDEN_FRAME ||
+                              left1 == GOLDEN_FRAME);
+        else
+          pred_context = 2;
+      } else if (above_has_second || left_has_second) {  // single/comp
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+        if (rfs == GOLDEN_FRAME)
+          pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+        else if (rfs == ALTREF_FRAME)
+          pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
+        else
+          pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+      } else {  // single/single
+        if (above0 == LAST_FRAME && left0 == LAST_FRAME) {
+          pred_context = 3;
+        } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) {
+          const MV_REFERENCE_FRAME edge0 = (above0 == LAST_FRAME) ? left0
+                                                                  : above0;
+          pred_context = 4 * (edge0 == GOLDEN_FRAME);
+        } else {
+          pred_context = 2 * (above0 == GOLDEN_FRAME) +
+                             2 * (left0 == GOLDEN_FRAME);
+        }
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi) ||
+        (edge_mbmi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mbmi)))
+      pred_context = 2;
+    else if (!has_second_ref(edge_mbmi))
+      pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+    else
+      pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+                          edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
diff --git a/libs/libvpx/vp10/common/pred_common.h b/libs/libvpx/vp10/common/pred_common.h
new file mode 100644
index 0000000000..d6d7146d7a
--- /dev/null
+++ b/libs/libvpx/vp10/common/pred_common.h
@@ -0,0 +1,172 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_PRED_COMMON_H_
+#define VP10_COMMON_PRED_COMMON_H_
+
+#include "vp10/common/blockd.h"
+#include "vp10/common/onyxc_int.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE int get_segment_id(const VP10_COMMON *cm,
+                                 const uint8_t *segment_ids,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  const int stride = cm->mi_cols;
+  const int cols = VPXMIN(stride - mi_col, num_8x8_blocks_wide_lookup[bsize]);
+  const int rows = VPXMIN(cm->mi_rows - mi_row,
+                          num_8x8_blocks_high_lookup[bsize]);
+  int lowest = MAX_SEGMENTS;
+  int r, c;
+  // Take the minimum id over the part of the block that lies inside the frame.
+  for (r = 0; r < rows; ++r) {
+    const uint8_t *const row = segment_ids + (mi_row + r) * stride + mi_col;
+    for (c = 0; c < cols; ++c)
+      if (row[c] < lowest) lowest = row[c];
+  }
+  assert(lowest >= 0 && lowest < MAX_SEGMENTS);
+  return lowest;
+}
+
+static INLINE int vp10_get_pred_context_seg_id(const MACROBLOCKD *xd) {
+  // Sum of the neighbors' seg_id_predicted flags; a NULL neighbor adds 0.
+  int ctx = 0;
+  if (xd->above_mi != NULL)
+    ctx += xd->above_mi->mbmi.seg_id_predicted;
+  if (xd->left_mi != NULL)
+    ctx += xd->left_mi->mbmi.seg_id_predicted;
+  return ctx;
+}
+
+static INLINE vpx_prob vp10_get_pred_prob_seg_id(
+    const struct segmentation_probs *segp, const MACROBLOCKD *xd) {
+  return segp->pred_probs[vp10_get_pred_context_seg_id(xd)];  // by context
+}
+
+static INLINE int vp10_get_skip_context(const MACROBLOCKD *xd) {
+  // Sum of the neighbors' skip flags; a NULL neighbor adds 0.
+  int ctx = 0;
+  if (xd->above_mi != NULL) ctx += xd->above_mi->mbmi.skip;
+  if (xd->left_mi != NULL) ctx += xd->left_mi->mbmi.skip;
+  return ctx;
+}
+
+static INLINE vpx_prob vp10_get_skip_prob(const VP10_COMMON *cm,
+                                         const MACROBLOCKD *xd) {
+  return cm->fc->skip_probs[vp10_get_skip_context(xd)];  // by skip context
+}
+
+int vp10_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
+
+int vp10_get_intra_inter_context(const MACROBLOCKD *xd);
+// Looks up intra_inter_prob under the context vp10_get_intra_inter_context().
+static INLINE vpx_prob vp10_get_intra_inter_prob(const VP10_COMMON *cm,
+                                                const MACROBLOCKD *xd) {
+  return cm->fc->intra_inter_prob[vp10_get_intra_inter_context(xd)];
+}
+
+int vp10_get_reference_mode_context(const VP10_COMMON *cm,
+                                    const MACROBLOCKD *xd);
+// Looks up comp_inter_prob under vp10_get_reference_mode_context().
+static INLINE vpx_prob vp10_get_reference_mode_prob(const VP10_COMMON *cm,
+                                                   const MACROBLOCKD *xd) {
+  return cm->fc->comp_inter_prob[vp10_get_reference_mode_context(cm, xd)];
+}
+
+int vp10_get_pred_context_comp_ref_p(const VP10_COMMON *cm,
+                                    const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_comp_ref_p(const VP10_COMMON *cm,
+                                                    const MACROBLOCKD *xd) {
+  // comp_ref_prob entry selected by the neighbor-derived context.
+  return cm->fc->comp_ref_prob[vp10_get_pred_context_comp_ref_p(cm, xd)];
+}
+
+int vp10_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
+// Entry [ctx][0] of single_ref_prob, with ctx from the p1 context function.
+static INLINE vpx_prob vp10_get_pred_prob_single_ref_p1(const VP10_COMMON *cm,
+                                                       const MACROBLOCKD *xd) {
+  return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p1(xd)][0];
+}
+
+int vp10_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
+// Entry [ctx][1] of single_ref_prob, with ctx from the p2 context function.
+static INLINE vpx_prob vp10_get_pred_prob_single_ref_p2(const VP10_COMMON *cm,
+                                                       const MACROBLOCKD *xd) {
+  return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p2(xd)][1];
+}
+
+// Returns the transform-size context (0 or 1) for the block described by xd.
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real blocks; the prediction flags in
+// these dummy entries are initialized to 0.
+static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
+  const int limit = max_txsize_lookup[xd->mi[0]->mbmi.sb_type];
+  const int above_ok = xd->up_available;
+  const int left_ok = xd->left_available;
+  int above_sz = limit;
+  int left_sz = limit;
+
+  // An available, non-skipped neighbor votes with its own transform size.
+  if (above_ok && !xd->above_mbmi->skip)
+    above_sz = (int)xd->above_mbmi->tx_size;
+  if (left_ok && !xd->left_mbmi->skip)
+    left_sz = (int)xd->left_mbmi->tx_size;
+
+  // A missing neighbor mirrors the other side (left is resolved first).
+  if (!left_ok) left_sz = above_sz;
+  if (!above_ok) above_sz = left_sz;
+  return (above_sz + left_sz) > limit;
+}
+
+static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
+                                           const struct tx_probs *tx_probs) {
+  // Pick the probability row for ctx from the table matching max_tx_size.
+  if (max_tx_size == TX_8X8)
+    return tx_probs->p8x8[ctx];
+  if (max_tx_size == TX_16X16)
+    return tx_probs->p16x16[ctx];
+  if (max_tx_size == TX_32X32)
+    return tx_probs->p32x32[ctx];
+
+  // No table exists for any other size.
+  assert(0 && "Invalid max_tx_size.");
+  return NULL;
+}
+// get_tx_probs() with the context derived from xd by get_tx_size_context().
+static INLINE const vpx_prob *get_tx_probs2(TX_SIZE max_tx_size,
+                                            const MACROBLOCKD *xd,
+                                            const struct tx_probs *tx_probs) {
+  return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs);
+}
+
+static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
+                                          struct tx_counts *tx_counts) {
+  // Pick the counter row for ctx from the table matching max_tx_size.
+  if (max_tx_size == TX_8X8)
+    return tx_counts->p8x8[ctx];
+  if (max_tx_size == TX_16X16)
+    return tx_counts->p16x16[ctx];
+  if (max_tx_size == TX_32X32)
+    return tx_counts->p32x32[ctx];
+
+  // No table exists for any other size.
+  assert(0 && "Invalid max_tx_size.");
+  return NULL;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_PRED_COMMON_H_
diff --git a/libs/libvpx/vp10/common/quant_common.c b/libs/libvpx/vp10/common/quant_common.c
new file mode 100644
index 0000000000..edf7394011
--- /dev/null
+++ b/libs/libvpx/vp10/common/quant_common.c
@@ -0,0 +1,278 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/common.h"
+#include "vp10/common/quant_common.h"
+#include "vp10/common/seg_common.h"
+
+static const int16_t dc_qlookup[QINDEX_RANGE] = {  // VPX_BITS_8 DC values
+  4,       8,    8,    9,   10,   11,   12,   12,
+  13,     14,   15,   16,   17,   18,   19,   19,
+  20,     21,   22,   23,   24,   25,   26,   26,
+  27,     28,   29,   30,   31,   32,   32,   33,
+  34,     35,   36,   37,   38,   38,   39,   40,
+  41,     42,   43,   43,   44,   45,   46,   47,
+  48,     48,   49,   50,   51,   52,   53,   53,
+  54,     55,   56,   57,   57,   58,   59,   60,
+  61,     62,   62,   63,   64,   65,   66,   66,
+  67,     68,   69,   70,   70,   71,   72,   73,
+  74,     74,   75,   76,   77,   78,   78,   79,
+  80,     81,   81,   82,   83,   84,   85,   85,
+  87,     88,   90,   92,   93,   95,   96,   98,
+  99,    101,  102,  104,  105,  107,  108,  110,
+  111,   113,  114,  116,  117,  118,  120,  121,
+  123,   125,  127,  129,  131,  134,  136,  138,
+  140,   142,  144,  146,  148,  150,  152,  154,
+  156,   158,  161,  164,  166,  169,  172,  174,
+  177,   180,  182,  185,  187,  190,  192,  195,
+  199,   202,  205,  208,  211,  214,  217,  220,
+  223,   226,  230,  233,  237,  240,  243,  247,
+  250,   253,  257,  261,  265,  269,  272,  276,
+  280,   284,  288,  292,  296,  300,  304,  309,
+  313,   317,  322,  326,  330,  335,  340,  344,
+  349,   354,  359,  364,  369,  374,  379,  384,
+  389,   395,  400,  406,  411,  417,  423,  429,
+  435,   441,  447,  454,  461,  467,  475,  482,
+  489,   497,  505,  513,  522,  530,  539,  549,
+  559,   569,  579,  590,  602,  614,  626,  640,
+  654,   668,  684,  700,  717,  736,  755,  775,
+  796,   819,  843,  869,  896,  925,  955,  988,
+  1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static const int16_t dc_qlookup_10[QINDEX_RANGE] = {  // VPX_BITS_10 DC values
+  4,     9,    10,    13,    15,    17,    20,    22,
+  25,    28,    31,    34,    37,    40,    43,    47,
+  50,    53,    57,    60,    64,    68,    71,    75,
+  78,    82,    86,    90,    93,    97,   101,   105,
+  109,   113,   116,   120,   124,   128,   132,   136,
+  140,   143,   147,   151,   155,   159,   163,   166,
+  170,   174,   178,   182,   185,   189,   193,   197,
+  200,   204,   208,   212,   215,   219,   223,   226,
+  230,   233,   237,   241,   244,   248,   251,   255,
+  259,   262,   266,   269,   273,   276,   280,   283,
+  287,   290,   293,   297,   300,   304,   307,   310,
+  314,   317,   321,   324,   327,   331,   334,   337,
+  343,   350,   356,   362,   369,   375,   381,   387,
+  394,   400,   406,   412,   418,   424,   430,   436,
+  442,   448,   454,   460,   466,   472,   478,   484,
+  490,   499,   507,   516,   525,   533,   542,   550,
+  559,   567,   576,   584,   592,   601,   609,   617,
+  625,   634,   644,   655,   666,   676,   687,   698,
+  708,   718,   729,   739,   749,   759,   770,   782,
+  795,   807,   819,   831,   844,   856,   868,   880,
+  891,   906,   920,   933,   947,   961,   975,   988,
+  1001,  1015,  1030,  1045,  1061,  1076,  1090,  1105,
+  1120,  1137,  1153,  1170,  1186,  1202,  1218,  1236,
+  1253,  1271,  1288,  1306,  1323,  1342,  1361,  1379,
+  1398,  1416,  1436,  1456,  1476,  1496,  1516,  1537,
+  1559,  1580,  1601,  1624,  1647,  1670,  1692,  1717,
+  1741,  1766,  1791,  1817,  1844,  1871,  1900,  1929,
+  1958,  1990,  2021,  2054,  2088,  2123,  2159,  2197,
+  2236,  2276,  2319,  2363,  2410,  2458,  2508,  2561,
+  2616,  2675,  2737,  2802,  2871,  2944,  3020,  3102,
+  3188,  3280,  3375,  3478,  3586,  3702,  3823,  3953,
+  4089,  4236,  4394,  4559,  4737,  4929,  5130,  5347,
+};
+
+static const int16_t dc_qlookup_12[QINDEX_RANGE] = {  // VPX_BITS_12 DC values
+  4,    12,    18,    25,    33,    41,    50,    60,
+  70,    80,    91,   103,   115,   127,   140,   153,
+  166,   180,   194,   208,   222,   237,   251,   266,
+  281,   296,   312,   327,   343,   358,   374,   390,
+  405,   421,   437,   453,   469,   484,   500,   516,
+  532,   548,   564,   580,   596,   611,   627,   643,
+  659,   674,   690,   706,   721,   737,   752,   768,
+  783,   798,   814,   829,   844,   859,   874,   889,
+  904,   919,   934,   949,   964,   978,   993,  1008,
+  1022,  1037,  1051,  1065,  1080,  1094,  1108,  1122,
+  1136,  1151,  1165,  1179,  1192,  1206,  1220,  1234,
+  1248,  1261,  1275,  1288,  1302,  1315,  1329,  1342,
+  1368,  1393,  1419,  1444,  1469,  1494,  1519,  1544,
+  1569,  1594,  1618,  1643,  1668,  1692,  1717,  1741,
+  1765,  1789,  1814,  1838,  1862,  1885,  1909,  1933,
+  1957,  1992,  2027,  2061,  2096,  2130,  2165,  2199,
+  2233,  2267,  2300,  2334,  2367,  2400,  2434,  2467,
+  2499,  2532,  2575,  2618,  2661,  2704,  2746,  2788,
+  2830,  2872,  2913,  2954,  2995,  3036,  3076,  3127,
+  3177,  3226,  3275,  3324,  3373,  3421,  3469,  3517,
+  3565,  3621,  3677,  3733,  3788,  3843,  3897,  3951,
+  4005,  4058,  4119,  4181,  4241,  4301,  4361,  4420,
+  4479,  4546,  4612,  4677,  4742,  4807,  4871,  4942,
+  5013,  5083,  5153,  5222,  5291,  5367,  5442,  5517,
+  5591,  5665,  5745,  5825,  5905,  5984,  6063,  6149,
+  6234,  6319,  6404,  6495,  6587,  6678,  6769,  6867,
+  6966,  7064,  7163,  7269,  7376,  7483,  7599,  7715,
+  7832,  7958,  8085,  8214,  8352,  8492,  8635,  8788,
+  8945,  9104,  9275,  9450,  9639,  9832, 10031, 10245,
+  10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409,
+  12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812,
+  16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387,
+};
+#endif
+
+static const int16_t ac_qlookup[QINDEX_RANGE] = {  // VPX_BITS_8 AC values
+  4,       8,    9,   10,   11,   12,   13,   14,
+  15,     16,   17,   18,   19,   20,   21,   22,
+  23,     24,   25,   26,   27,   28,   29,   30,
+  31,     32,   33,   34,   35,   36,   37,   38,
+  39,     40,   41,   42,   43,   44,   45,   46,
+  47,     48,   49,   50,   51,   52,   53,   54,
+  55,     56,   57,   58,   59,   60,   61,   62,
+  63,     64,   65,   66,   67,   68,   69,   70,
+  71,     72,   73,   74,   75,   76,   77,   78,
+  79,     80,   81,   82,   83,   84,   85,   86,
+  87,     88,   89,   90,   91,   92,   93,   94,
+  95,     96,   97,   98,   99,  100,  101,  102,
+  104,   106,  108,  110,  112,  114,  116,  118,
+  120,   122,  124,  126,  128,  130,  132,  134,
+  136,   138,  140,  142,  144,  146,  148,  150,
+  152,   155,  158,  161,  164,  167,  170,  173,
+  176,   179,  182,  185,  188,  191,  194,  197,
+  200,   203,  207,  211,  215,  219,  223,  227,
+  231,   235,  239,  243,  247,  251,  255,  260,
+  265,   270,  275,  280,  285,  290,  295,  300,
+  305,   311,  317,  323,  329,  335,  341,  347,
+  353,   359,  366,  373,  380,  387,  394,  401,
+  408,   416,  424,  432,  440,  448,  456,  465,
+  474,   483,  492,  501,  510,  520,  530,  540,
+  550,   560,  571,  582,  593,  604,  615,  627,
+  639,   651,  663,  676,  689,  702,  715,  729,
+  743,   757,  771,  786,  801,  816,  832,  848,
+  864,   881,  898,  915,  933,  951,  969,  988,
+  1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151,
+  1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
+  1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567,
+  1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static const int16_t ac_qlookup_10[QINDEX_RANGE] = {  // VPX_BITS_10 AC values
+  4,     9,    11,    13,    16,    18,    21,    24,
+  27,    30,    33,    37,    40,    44,    48,    51,
+  55,    59,    63,    67,    71,    75,    79,    83,
+  88,    92,    96,   100,   105,   109,   114,   118,
+  122,   127,   131,   136,   140,   145,   149,   154,
+  158,   163,   168,   172,   177,   181,   186,   190,
+  195,   199,   204,   208,   213,   217,   222,   226,
+  231,   235,   240,   244,   249,   253,   258,   262,
+  267,   271,   275,   280,   284,   289,   293,   297,
+  302,   306,   311,   315,   319,   324,   328,   332,
+  337,   341,   345,   349,   354,   358,   362,   367,
+  371,   375,   379,   384,   388,   392,   396,   401,
+  409,   417,   425,   433,   441,   449,   458,   466,
+  474,   482,   490,   498,   506,   514,   523,   531,
+  539,   547,   555,   563,   571,   579,   588,   596,
+  604,   616,   628,   640,   652,   664,   676,   688,
+  700,   713,   725,   737,   749,   761,   773,   785,
+  797,   809,   825,   841,   857,   873,   889,   905,
+  922,   938,   954,   970,   986,  1002,  1018,  1038,
+  1058,  1078,  1098,  1118,  1138,  1158,  1178,  1198,
+  1218,  1242,  1266,  1290,  1314,  1338,  1362,  1386,
+  1411,  1435,  1463,  1491,  1519,  1547,  1575,  1603,
+  1631,  1663,  1695,  1727,  1759,  1791,  1823,  1859,
+  1895,  1931,  1967,  2003,  2039,  2079,  2119,  2159,
+  2199,  2239,  2283,  2327,  2371,  2415,  2459,  2507,
+  2555,  2603,  2651,  2703,  2755,  2807,  2859,  2915,
+  2971,  3027,  3083,  3143,  3203,  3263,  3327,  3391,
+  3455,  3523,  3591,  3659,  3731,  3803,  3876,  3952,
+  4028,  4104,  4184,  4264,  4348,  4432,  4516,  4604,
+  4692,  4784,  4876,  4972,  5068,  5168,  5268,  5372,
+  5476,  5584,  5692,  5804,  5916,  6032,  6148,  6268,
+  6388,  6512,  6640,  6768,  6900,  7036,  7172,  7312,
+};
+
+static const int16_t ac_qlookup_12[QINDEX_RANGE] = {  // VPX_BITS_12 AC values
+  4,    13,    19,    27,    35,    44,    54,    64,
+  75,    87,    99,   112,   126,   139,   154,   168,
+  183,   199,   214,   230,   247,   263,   280,   297,
+  314,   331,   349,   366,   384,   402,   420,   438,
+  456,   475,   493,   511,   530,   548,   567,   586,
+  604,   623,   642,   660,   679,   698,   716,   735,
+  753,   772,   791,   809,   828,   846,   865,   884,
+  902,   920,   939,   957,   976,   994,  1012,  1030,
+  1049,  1067,  1085,  1103,  1121,  1139,  1157,  1175,
+  1193,  1211,  1229,  1246,  1264,  1282,  1299,  1317,
+  1335,  1352,  1370,  1387,  1405,  1422,  1440,  1457,
+  1474,  1491,  1509,  1526,  1543,  1560,  1577,  1595,
+  1627,  1660,  1693,  1725,  1758,  1791,  1824,  1856,
+  1889,  1922,  1954,  1987,  2020,  2052,  2085,  2118,
+  2150,  2183,  2216,  2248,  2281,  2313,  2346,  2378,
+  2411,  2459,  2508,  2556,  2605,  2653,  2701,  2750,
+  2798,  2847,  2895,  2943,  2992,  3040,  3088,  3137,
+  3185,  3234,  3298,  3362,  3426,  3491,  3555,  3619,
+  3684,  3748,  3812,  3876,  3941,  4005,  4069,  4149,
+  4230,  4310,  4390,  4470,  4550,  4631,  4711,  4791,
+  4871,  4967,  5064,  5160,  5256,  5352,  5448,  5544,
+  5641,  5737,  5849,  5961,  6073,  6185,  6297,  6410,
+  6522,  6650,  6778,  6906,  7034,  7162,  7290,  7435,
+  7579,  7723,  7867,  8011,  8155,  8315,  8475,  8635,
+  8795,  8956,  9132,  9308,  9484,  9660,  9836, 10028,
+  10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661,
+  11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565,
+  13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806,
+  16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414,
+  18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486,
+  21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070,
+  25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247,
+};
+#endif
+
+int16_t vp10_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) {
+  // Every table is indexed the same way: qindex + delta held to [0, MAXQ].
+  const int q = clamp(qindex + delta, 0, MAXQ);
+#if CONFIG_VP9_HIGHBITDEPTH
+  // One DC table per supported bit depth.
+  switch (bit_depth) {
+    case VPX_BITS_8: return dc_qlookup[q];
+    case VPX_BITS_10: return dc_qlookup_10[q];
+    case VPX_BITS_12: return dc_qlookup_12[q];
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  (void) bit_depth;
+  return dc_qlookup[q];
+#endif
+}
+
+int16_t vp10_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) {
+  // Every table is indexed the same way: qindex + delta held to [0, MAXQ].
+  const int q = clamp(qindex + delta, 0, MAXQ);
+#if CONFIG_VP9_HIGHBITDEPTH
+  // One AC table per supported bit depth.
+  switch (bit_depth) {
+    case VPX_BITS_8: return ac_qlookup[q];
+    case VPX_BITS_10: return ac_qlookup_10[q];
+    case VPX_BITS_12: return ac_qlookup_12[q];
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  (void) bit_depth;
+  return ac_qlookup[q];
+#endif
+}
+
+int vp10_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex) {
+  int seg_qindex;
+  // No active ALT_Q feature on this segment: base_qindex passes through.
+  if (!segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) return base_qindex;
+
+  // ALT_Q data is absolute under SEGMENT_ABSDATA, otherwise a delta.
+  seg_qindex = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+  if (seg->abs_delta != SEGMENT_ABSDATA) seg_qindex += base_qindex;
+  return clamp(seg_qindex, 0, MAXQ);
+}
+
diff --git a/libs/libvpx/vp10/common/quant_common.h b/libs/libvpx/vp10/common/quant_common.h
new file mode 100644
index 0000000000..6813e1734c
--- /dev/null
+++ b/libs/libvpx/vp10/common/quant_common.h
@@ -0,0 +1,36 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_QUANT_COMMON_H_
+#define VP10_COMMON_QUANT_COMMON_H_
+
+#include "vpx/vpx_codec.h"
+#include "vp10/common/seg_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0  // lowest qindex
+#define MAXQ 255  // highest qindex; upper clamp bound in quant_common.c
+#define QINDEX_RANGE (MAXQ - MINQ + 1)  // count of qindex values (table size)
+#define QINDEX_BITS 8
+
+int16_t vp10_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
+int16_t vp10_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
+
+int vp10_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_QUANT_COMMON_H_
diff --git a/libs/libvpx/vp10/common/reconinter.c b/libs/libvpx/vp10/common/reconinter.c
new file mode 100644
index 0000000000..fdcb9673cf
--- /dev/null
+++ b/libs/libvpx/vp10/common/reconinter.c
@@ -0,0 +1,266 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_scale_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#include "vp10/common/blockd.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/common/reconintra.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
+                                      uint8_t *dst, int dst_stride,
+                                      const MV *src_mv,
+                                      const struct scale_factors *sf,
+                                      int w, int h, int ref,
+                                      const InterpKernel *kernel,
+                                      enum mv_precision precision,
+                                      int x, int y, int bd) {
+  // Vectors not already MV_PRECISION_Q4 are doubled before being scaled.
+  const int to_q4 = (precision == MV_PRECISION_Q4) ? 1 : 2;
+  const MV mv_q4 = { src_mv->row * to_q4, src_mv->col * to_q4 };
+  const MV32 scaled = vp10_scale_mv(&mv_q4, x, y, sf);
+  // Whole-sample part moves the source pointer; the fraction picks the kernel.
+  const uint8_t *const ref_px = src + (scaled.row >> SUBPEL_BITS) * src_stride +
+                                (scaled.col >> SUBPEL_BITS);
+
+  high_inter_predictor(ref_px, src_stride, dst, dst_stride,
+                       scaled.col & SUBPEL_MASK, scaled.row & SUBPEL_MASK, sf,
+                       w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp10_build_inter_predictor(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride,
+                               const MV *src_mv,
+                               const struct scale_factors *sf,
+                               int w, int h, int ref,
+                               const InterpKernel *kernel,
+                               enum mv_precision precision,
+                               int x, int y) {
+  // Vectors not already MV_PRECISION_Q4 are doubled before being scaled.
+  const int to_q4 = (precision == MV_PRECISION_Q4) ? 1 : 2;
+  const MV mv_q4 = { src_mv->row * to_q4, src_mv->col * to_q4 };
+  const MV32 scaled = vp10_scale_mv(&mv_q4, x, y, sf);
+  // Whole-sample part moves the source pointer; the fraction picks the kernel.
+  const uint8_t *const ref_px = src + (scaled.row >> SUBPEL_BITS) * src_stride +
+                                (scaled.col >> SUBPEL_BITS);
+
+  inter_predictor(ref_px, src_stride, dst, dst_stride,
+                  scaled.col & SUBPEL_MASK, scaled.row & SUBPEL_MASK, sf,
+                  w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4);
+}
+
+void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+                                   int bw, int bh,
+                                   int x, int y, int w, int h,
+                                   int mi_x, int mi_y) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MODE_INFO *mi = xd->mi[0];
+  const int is_compound = has_second_ref(&mi->mbmi);
+  const InterpKernel *kernel = vp10_filter_kernels[mi->mbmi.interp_filter];
+  int ref;
+
+  for (ref = 0; ref < 1 + is_compound; ++ref) {  // one pass per reference
+    const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+    struct buf_2d *const pre_buf = &pd->pre[ref];
+    struct buf_2d *const dst_buf = &pd->dst;
+    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+    const MV mv = mi->mbmi.sb_type < BLOCK_8X8  // sub-8x8: per-block vectors
+               ? average_split_mvs(pd, mi, ref, block)
+               : mi->mbmi.mv[ref].as_mv;
+
+    // TODO(jkoleszar): This clamping is done in the incorrect place for the
+    // scaling case. It needs to be done on the scaled MV, not the pre-scaling
+    // MV. Note however that it performs the subsampling aware scaling so
+    // that the result is always q4, i.e. from here on the vector's
+    // mv_precision is MV_PRECISION_Q4.
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+                                               pd->subsampling_x,
+                                               pd->subsampling_y);
+
+    uint8_t *pre;
+    MV32 scaled_mv;
+    int xs, ys, subpel_x, subpel_y;
+    const int is_scaled = vp10_is_scaled(sf);
+    // Scaled refs map offset and vector through sf; otherwise the step is 16.
+    if (is_scaled) {
+      pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
+      scaled_mv = vp10_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+      xs = sf->x_step_q4;
+      ys = sf->y_step_q4;
+    } else {
+      pre = pre_buf->buf + (y * pre_buf->stride + x);
+      scaled_mv.row = mv_q4.row;
+      scaled_mv.col = mv_q4.col;
+      xs = ys = 16;
+    }
+    subpel_x = scaled_mv.col & SUBPEL_MASK;  // fractional part of the vector
+    subpel_y = scaled_mv.row & SUBPEL_MASK;
+    pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride  // whole samples
+           + (scaled_mv.col >> SUBPEL_BITS);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                           subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
+                           xd->bd);
+    } else {
+      inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                      subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
+    }
+#else
+    inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                    subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+}
+
+// Predicts one sub-8x8 unit (bmi index i, at row ir / column ic) of `plane`
+// from each reference of the current mode info, using that unit's own vector
+// at MV_PRECISION_Q3.
+void vp10_build_inter_predictor_sub8x8(MACROBLOCKD *xd, int plane,
+                                       int i, int ir, int ic,
+                                       int mi_row, int mi_col) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  MODE_INFO *const mi = xd->mi[0];
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
+  const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+  // Position handed to the predictor: block origin plus 4 per unit of ic / ir.
+  const int x = mi_col * MI_SIZE + 4 * ic;
+  const int y = mi_row * MI_SIZE + 4 * ir;
+  // ir and ic are likewise scaled by 4 (<< 2) when offsetting into buffers.
+  uint8_t *const dst = pd->dst.buf + ((ir * pd->dst.stride + ic) << 2);
+  const InterpKernel *kernel = vp10_filter_kernels[mi->mbmi.interp_filter];
+  const int num_refs = 1 + has_second_ref(&mi->mbmi);
+  int ref;
+
+  // One prediction pass per reference; a second one only for compound blocks.
+  for (ref = 0; ref < num_refs; ++ref) {
+    const struct buf_2d *const pre_buf = &pd->pre[ref];
+    const uint8_t *const pre =
+        pre_buf->buf + ((ir * pre_buf->stride + ic) << 2);
+    const MV *const mv = &mi->bmi[i].as_mv[ref].as_mv;
+    const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+#if CONFIG_VP9_HIGHBITDEPTH
+    // High-bitdepth frames take the highbd predictor, which also needs xd->bd.
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp10_highbd_build_inter_predictor(pre, pre_buf->stride,
+                                        dst, pd->dst.stride, mv, sf,
+                                        width, height, ref, kernel,
+                                        MV_PRECISION_Q3, x, y, xd->bd);
+    } else {
+      vp10_build_inter_predictor(pre, pre_buf->stride, dst, pd->dst.stride,
+                                 mv, sf, width, height, ref, kernel,
+                                 MV_PRECISION_Q3, x, y);
+    }
+#else
+    vp10_build_inter_predictor(pre, pre_buf->stride, dst, pd->dst.stride,
+                               mv, sf, width, height, ref, kernel,
+                               MV_PRECISION_Q3, x, y);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+}
+
+static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                              int mi_row, int mi_col,
+                                              int plane_from, int plane_to) {
+  int plane;
+  const int mi_x = mi_col * MI_SIZE;  // mi units scaled by MI_SIZE
+  const int mi_y = mi_row * MI_SIZE;
+  for (plane = plane_from; plane <= plane_to; ++plane) {  // inclusive range
+    const struct macroblockd_plane *pd = &xd->plane[plane];
+    const int bw = 4 * num_4x4_blocks_wide_lookup[bsize] >> pd->subsampling_x;
+    const int bh = 4 * num_4x4_blocks_high_lookup[bsize] >> pd->subsampling_y;
+    // Sub-8x8 blocks are predicted per split unit; larger ones in one call.
+    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
+      const PARTITION_TYPE bp = bsize - xd->mi[0]->mbmi.sb_type;
+      const int have_vsplit = bp != PARTITION_HORZ;
+      const int have_hsplit = bp != PARTITION_VERT;
+      const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
+      const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
+      const int pw = 8 >> (have_vsplit | pd->subsampling_x);
+      const int ph = 8 >> (have_hsplit | pd->subsampling_y);
+      int x, y;  // column / row index of the unit being predicted
+      assert(bp != PARTITION_NONE && bp < PARTITION_TYPES);
+      assert(bsize == BLOCK_8X8);
+      assert(pw * num_4x4_w == bw && ph * num_4x4_h == bh);  // units tile block
+      for (y = 0; y < num_4x4_h; ++y)
+        for (x = 0; x < num_4x4_w; ++x)
+           build_inter_predictors(xd, plane, y * 2 + x, bw, bh,
+                                  4 * x, 4 * y, pw, ph, mi_x, mi_y);
+    } else {
+      build_inter_predictors(xd, plane, 0, bw, bh,
+                             0, 0, bw, bh, mi_x, mi_y);
+    }
+  }
+}
+
+void vp10_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize) {  // plane 0 only
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
+}
+
+void vp10_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize, int plane) {  // one plane
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, plane, plane);
+}
+
+void vp10_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
+                                    MAX_MB_PLANE - 1);  // planes 1 .. last
+}
+
+void vp10_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
+                                    MAX_MB_PLANE - 1);  // every plane
+}
+
+void vp10_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col) {
+  // Per-plane buffer and stride; the u and v planes both use uv_stride.
+  uint8_t *const plane_buf[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+                                             src->v_buffer };
+  const int plane_stride[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+                                           src->uv_stride };
+  int p;
+
+  for (p = 0; p < MAX_MB_PLANE; ++p)
+    setup_pred_plane(&planes[p].dst, plane_buf[p], plane_stride[p], mi_row,
+                     mi_col, NULL, planes[p].subsampling_x,
+                     planes[p].subsampling_y);
+}
+
+void vp10_setup_pre_planes(MACROBLOCKD *xd, int idx,
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col,
+                          const struct scale_factors *sf) {
+  // A NULL src is a no-op: pre[idx] of every plane is left as it was.
+  if (src != NULL) {
+    uint8_t *const plane_buf[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+                                               src->v_buffer };
+    const int plane_stride[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+                                             src->uv_stride };
+    int p;
+    for (p = 0; p < MAX_MB_PLANE; ++p)
+      setup_pred_plane(&xd->plane[p].pre[idx], plane_buf[p], plane_stride[p],
+                       mi_row, mi_col, sf, xd->plane[p].subsampling_x,
+                       xd->plane[p].subsampling_y);
+  }
+}
diff --git a/libs/libvpx/vp10/common/reconinter.h b/libs/libvpx/vp10/common/reconinter.h
new file mode 100644
index 0000000000..5678f473f6
--- /dev/null
+++ b/libs/libvpx/vp10/common/reconinter.h
@@ -0,0 +1,200 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_RECONINTER_H_
+#define VP10_COMMON_RECONINTER_H_
+
+#include "vp10/common/filter.h"
+#include "vp10/common/onyxc_int.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void inter_predictor(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride,
+                                   const int subpel_x, const int subpel_y,
+                                   const struct scale_factors *sf,
+                                   int w, int h, int ref,
+                                   const InterpKernel *kernel, int xs, int ys) {
+  // predict[][][] is selected by: x fraction present, y fraction present, ref.
+  const int frac_x = subpel_x != 0, frac_y = subpel_y != 0;
+  sf->predict[frac_x][frac_y][ref](src, src_stride, dst, dst_stride,
+                                   kernel[subpel_x], xs, kernel[subpel_y], ys,
+                                   w, h);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void high_inter_predictor(const uint8_t *src, int src_stride,
+                                        uint8_t *dst, int dst_stride,
+                                        const int subpel_x, const int subpel_y,
+                                        const struct scale_factors *sf,
+                                        int w, int h, int ref,
+                                        const InterpKernel *kernel,
+                                        int xs, int ys, int bd) {
+  const int frac_x = subpel_x != 0, frac_y = subpel_y != 0;  // table selectors
+  sf->highbd_predict[frac_x][frac_y][ref](
+      src, src_stride, dst, dst_stride,
+      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE int round_mv_comp_q4(int value) {
+  return (value + (value < 0 ? -2 : 2)) / 4;  // value / 4, ties away from 0
+}
+
+static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
+  int row_sum = 0, col_sum = 0, b;
+  MV res;
+  for (b = 0; b < 4; ++b) {  // add up the vectors of bmi[0..3]
+    row_sum += mi->bmi[b].as_mv[idx].as_mv.row;
+    col_sum += mi->bmi[b].as_mv[idx].as_mv.col;
+  }
+  res.row = round_mv_comp_q4(row_sum);
+  res.col = round_mv_comp_q4(col_sum);
+  return res;
+}
+
+static INLINE int round_mv_comp_q2(int value) {
+  return (value + (value < 0 ? -1 : 1)) / 2;  // value / 2, ties away from 0
+}
+
+static MV mi_mv_pred_q2(const MODE_INFO *mi, int idx, int block0, int block1) {
+  const MV *const a = &mi->bmi[block0].as_mv[idx].as_mv;
+  const MV *const b = &mi->bmi[block1].as_mv[idx].as_mv;
+  MV res = { round_mv_comp_q2(a->row + b->row),
+             round_mv_comp_q2(a->col + b->col) };
+  return res;  // component-wise rounded mean of the two vectors
+}
+
+// TODO(jkoleszar): yet another mv clamping function :-(
+static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
+                                           const MV *src_mv,
+                                           int bw, int bh, int ss_x, int ss_y) {
+  // If the MV points so far into the UMV border that no visible pixels
+  // are used for reconstruction, the subpel part of the MV can be
+  // discarded and the MV limited to 16 pixels with equivalent results.
+  const int spel_left = (VP9_INTERP_EXTEND + bw) << SUBPEL_BITS;
+  const int spel_right = spel_left - SUBPEL_SHIFTS;
+  const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS;
+  const int spel_bottom = spel_top - SUBPEL_SHIFTS;
+  MV clamped_mv = {  // doubled in each direction that is not subsampled
+    src_mv->row * (1 << (1 - ss_y)),
+    src_mv->col * (1 << (1 - ss_x))
+  };
+  assert(ss_x <= 1);  // keeps the shift count 1 - ss_x non-negative
+  assert(ss_y <= 1);  // keeps the shift count 1 - ss_y non-negative
+  // Edge distances get the same factor as the vector before the margins apply.
+  clamp_mv(&clamped_mv,
+           xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
+           xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
+           xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
+           xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom);
+
+  return clamped_mv;
+}
+
+// Vector used for sub-block `block` of a split block in plane pd: the block's
+// own vector, or a mean over the blocks that subsampling folds together.
+static INLINE MV average_split_mvs(const struct macroblockd_plane *pd,
+                                   const MODE_INFO *mi, int ref, int block) {
+  const int sub_x = pd->subsampling_x > 0;
+  const int sub_y = pd->subsampling_y > 0;
+  MV res;
+
+  if (sub_x && sub_y) {
+    // Both directions subsampled: mean of all four sub-block vectors.
+    res = mi_mv_pred_q4(mi, ref);
+  } else if (sub_x) {
+    // Horizontal only: mean of `block` and block + 1.
+    res = mi_mv_pred_q2(mi, ref, block, block + 1);
+  } else if (sub_y) {
+    // Vertical only: mean of `block` and block + 2.
+    res = mi_mv_pred_q2(mi, ref, block, block + 2);
+  } else {
+    // No subsampling: the block's own vector is used unchanged.
+    res = mi->bmi[block].as_mv[ref].as_mv;
+  }
+  return res;
+}
+
+void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+                                   int bw, int bh,
+                                   int x, int y, int w, int h,
+                                   int mi_x, int mi_y);
+
+void vp10_build_inter_predictor_sub8x8(MACROBLOCKD *xd, int plane,
+                                       int i, int ir, int ic,
+                                       int mi_row, int mi_col);
+
+void vp10_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize);
+
+void vp10_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize, int plane);
+
+void vp10_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize);
+
+void vp10_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize);
+
+void vp10_build_inter_predictor(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride,
+                               const MV *mv_q3,
+                               const struct scale_factors *sf,
+                               int w, int h, int do_avg,
+                               const InterpKernel *kernel,
+                               enum mv_precision precision,
+                               int x, int y);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
+                                      uint8_t *dst, int dst_stride,
+                                      const MV *mv_q3,
+                                      const struct scale_factors *sf,
+                                      int w, int h, int do_avg,
+                                      const InterpKernel *kernel,
+                                      enum mv_precision precision,
+                                      int x, int y, int bd);
+#endif
+
+static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
+                                       const struct scale_factors *sf) {
+  if (!sf) return y_offset * stride + x_offset;  // no scaling requested
+  x_offset = sf->scale_value_x(x_offset, sf);
+  return sf->scale_value_y(y_offset, sf) * stride + x_offset;
+}
+
+static INLINE void setup_pred_plane(struct buf_2d *dst,
+                                    uint8_t *src, int stride,
+                                    int mi_row, int mi_col,
+                                    const struct scale_factors *scale,
+                                    int subsampling_x, int subsampling_y) {
+  const int col_px = (mi_col * MI_SIZE) >> subsampling_x;  // plane-local x
+  const int row_px = (mi_row * MI_SIZE) >> subsampling_y;  // plane-local y
+  dst->stride = stride;
+  dst->buf = src + scaled_buffer_offset(col_px, row_px, stride, scale);
+}
+
+void vp10_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col);
+
+void vp10_setup_pre_planes(MACROBLOCKD *xd, int idx,
+                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+                          const struct scale_factors *sf);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_RECONINTER_H_
diff --git a/libs/libvpx/vp10/common/reconintra.c b/libs/libvpx/vp10/common/reconintra.c
new file mode 100644
index 0000000000..e9e3949ad7
--- /dev/null
+++ b/libs/libvpx/vp10/common/reconintra.c
@@ -0,0 +1,793 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_once.h"
+
+#include "vp10/common/reconintra.h"
+#include "vp10/common/onyxc_int.h"
+
+#if CONFIG_MISC_FIXES  // edge-requirement flags combined in extend_modes[]
+enum {
+  NEED_LEFT = 1 << 1,
+  NEED_ABOVE = 1 << 2,
+  NEED_ABOVERIGHT = 1 << 3,
+  NEED_ABOVELEFT = 1 << 4,
+  NEED_BOTTOMLEFT = 1 << 5,
+};
+
+static const uint8_t extend_modes[INTRA_MODES] = {  // indexed by mode
+  NEED_ABOVE | NEED_LEFT,                   // DC
+  NEED_ABOVE,                               // V
+  NEED_LEFT,                                // H
+  NEED_ABOVE | NEED_ABOVERIGHT,             // D45
+  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // D135
+  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // D117
+  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // D153
+  NEED_LEFT | NEED_BOTTOMLEFT,              // D207
+  NEED_ABOVE | NEED_ABOVERIGHT,             // D63
+  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // TM
+};
+#else  // without CONFIG_MISC_FIXES only three flags exist
+enum {
+  NEED_LEFT = 1 << 1,
+  NEED_ABOVE = 1 << 2,
+  NEED_ABOVERIGHT = 1 << 3,
+};
+
+static const uint8_t extend_modes[INTRA_MODES] = {  // indexed by mode
+  NEED_ABOVE | NEED_LEFT,       // DC
+  NEED_ABOVE,                   // V
+  NEED_LEFT,                    // H
+  NEED_ABOVERIGHT,              // D45
+  NEED_LEFT | NEED_ABOVE,       // D135
+  NEED_LEFT | NEED_ABOVE,       // D117
+  NEED_LEFT | NEED_ABOVE,       // D153
+  NEED_LEFT,                    // D207
+  NEED_ABOVERIGHT,              // D63
+  NEED_LEFT | NEED_ABOVE,       // TM
+};
+#endif
+
+#if CONFIG_MISC_FIXES
+static const uint8_t orders_64x64[1] = { 0 };  // one table per block size
+static const uint8_t orders_64x32[2] = { 0, 1 };
+static const uint8_t orders_32x64[2] = { 0, 1 };
+static const uint8_t orders_32x32[4] = {
+  0, 1,
+  2, 3,
+};
+static const uint8_t orders_32x16[8] = {
+  0, 2,
+  1, 3,
+  4, 6,
+  5, 7,
+};
+static const uint8_t orders_16x32[8] = {
+  0, 1, 2, 3,
+  4, 5, 6, 7,
+};
+static const uint8_t orders_16x16[16] = {
+  0,   1,  4,  5,
+  2,   3,  6,  7,
+  8,   9, 12, 13,
+  10, 11, 14, 15,
+};
+static const uint8_t orders_16x8[32] = {
+  0,   2,  8, 10,
+  1,   3,  9, 11,
+  4,   6, 12, 14,
+  5,   7, 13, 15,
+  16, 18, 24, 26,
+  17, 19, 25, 27,
+  20, 22, 28, 30,
+  21, 23, 29, 31,
+};
+static const uint8_t orders_8x16[32] = {
+  0,   1,  2,  3,  8,  9, 10, 11,
+  4,   5,  6,  7, 12, 13, 14, 15,
+  16, 17, 18, 19, 24, 25, 26, 27,
+  20, 21, 22, 23, 28, 29, 30, 31,
+};
+static const uint8_t orders_8x8[64] = {
+  0,   1,  4,  5, 16, 17, 20, 21,
+  2,   3,  6,  7, 18, 19, 22, 23,
+  8,   9, 12, 13, 24, 25, 28, 29,
+  10, 11, 14, 15, 26, 27, 30, 31,
+  32, 33, 36, 37, 48, 49, 52, 53,
+  34, 35, 38, 39, 50, 51, 54, 55,
+  40, 41, 44, 45, 56, 57, 60, 61,
+  42, 43, 46, 47, 58, 59, 62, 63,
+};
+static const uint8_t *const orders[BLOCK_SIZES] = {  // indexed by bsize
+  orders_8x8, orders_8x8, orders_8x8, orders_8x8,  // presumably sub-8x8 + 8x8
+  orders_8x16, orders_16x8, orders_16x16,
+  orders_16x32, orders_32x16, orders_32x32,
+  orders_32x64, orders_64x32, orders_64x64,
+};
+
+static int vp10_has_right(BLOCK_SIZE bsize, int mi_row, int mi_col,
+                          int right_available,
+                          TX_SIZE txsz, int y, int x, int ss_x) {
+  // Reports (non-zero) whether pixels to the right of the transform block at
+  // (x, y) inside a bsize block may be used as above-right reference.
+  const int wl = mi_width_log2_lookup[bsize];
+  const int blk_w = 1 << (wl + 1 - ss_x);
+  const int tx_step = 1 << txsz;
+  // A transform block that does not touch the block's right edge has its
+  // right neighbour inside the same block.
+  const int inside = x + tx_step < blk_w;
+
+  if (inside || y != 0) {
+    // Rows below the first never look outside the block.
+    return inside;
+  } else {
+    const int hl = mi_height_log2_lookup[bsize];
+    const uint8_t *const tbl = orders[bsize];
+    // Block position, in units of its own size, within the 8x8-mi group.
+    const int blk_row = (mi_row & 7) >> hl;
+    const int blk_col = (mi_col & 7) >> wl;
+
+    // Top row of the group: rely on the caller-supplied availability.
+    if (blk_row == 0)
+      return right_available;
+    // Last column of the group: there is no top-right block inside it.
+    if (((blk_col + 1) << wl) >= 8)
+      return 0;
+
+    // Usable only when the top-right block has the smaller order value.
+    return tbl[(blk_row << (3 - wl)) + blk_col] >
+               tbl[((blk_row - 1) << (3 - wl)) + blk_col + 1] &&
+           right_available;
+  }
+}
+
+static int vp10_has_bottom(BLOCK_SIZE bsize, int mi_row, int mi_col,
+                           int bottom_available, TX_SIZE txsz,
+                           int y, int x, int ss_y) {
+  // Reports (non-zero) whether pixels below-left of the transform block at
+  // (x, y) inside a bsize block may be used as reference.
+  const int wl = mi_width_log2_lookup[bsize];
+  const int hl = mi_height_log2_lookup[bsize];
+  const int blk_h = 1 << (hl + 1 - ss_y);
+  const int tx_step = 1 << txsz;
+  const uint8_t *const tbl = orders[bsize];
+  // Block position, in units of its own size, within the 8x8-mi group.
+  const int blk_row = (mi_row & 7) >> hl;
+  const int blk_col = (mi_col & 7) >> wl;
+
+  // Only the leftmost transform column is ever reported as available.
+  if (x != 0)
+    return 0;
+  // First column of the group: needs bottom_available and room below.
+  if (blk_col == 0)
+    return bottom_available &&
+           (blk_row << (hl + !ss_y)) + y + tx_step < (8 << !ss_y);
+  // Last row of the group: there is no bottom-left block inside it.
+  if (((blk_row + 1) << hl) >= 8)
+    return 0;
+  // Not on the block's bottom edge: treated as available.
+  if (y + tx_step < blk_h)
+    return 1;
+  // Usable only when the bottom-left block has the smaller order value.
+  return tbl[((blk_row + 1) << (3 - wl)) + blk_col - 1] <
+             tbl[(blk_row << (3 - wl)) + blk_col] &&
+         bottom_available;
+}
+#endif
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES];  // [mode][tx_size]
+static intra_pred_fn dc_pred[2][2][TX_SIZES];  // [left][top][tx_size]
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
+                                   const uint16_t *above, const uint16_t *left,
+                                   int bd);
+static intra_high_pred_fn pred_high[INTRA_MODES][4];  // [mode][tx_size]
+static intra_high_pred_fn dc_pred_high[2][2][4];  // [left][top][tx_size]
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static void vp10_init_intra_predictors_internal(void) {
+#define INIT_NO_4X4(p, type) \
+  p[TX_8X8] = vpx_##type##_predictor_8x8; \
+  p[TX_16X16] = vpx_##type##_predictor_16x16; \
+  p[TX_32X32] = vpx_##type##_predictor_32x32
+
+#define INIT_ALL_SIZES(p, type) \
+  p[TX_4X4] = vpx_##type##_predictor_4x4; \
+  INIT_NO_4X4(p, type)
+  // pred[mode][tx_size]: one predictor per mode and transform size.
+  INIT_ALL_SIZES(pred[V_PRED], v);
+  INIT_ALL_SIZES(pred[H_PRED], h);
+#if CONFIG_MISC_FIXES
+  INIT_ALL_SIZES(pred[D207_PRED], d207e);
+  INIT_ALL_SIZES(pred[D45_PRED], d45e);
+  INIT_ALL_SIZES(pred[D63_PRED], d63e);
+#else
+  INIT_ALL_SIZES(pred[D207_PRED], d207);
+  INIT_ALL_SIZES(pred[D45_PRED], d45);
+  INIT_ALL_SIZES(pred[D63_PRED], d63);
+#endif
+  INIT_ALL_SIZES(pred[D117_PRED], d117);
+  INIT_ALL_SIZES(pred[D135_PRED], d135);
+  INIT_ALL_SIZES(pred[D153_PRED], d153);
+  INIT_ALL_SIZES(pred[TM_PRED], tm);
+  // DC variants are indexed [left available][above available].
+  INIT_ALL_SIZES(dc_pred[0][0], dc_128);
+  INIT_ALL_SIZES(dc_pred[0][1], dc_top);
+  INIT_ALL_SIZES(dc_pred[1][0], dc_left);
+  INIT_ALL_SIZES(dc_pred[1][1], dc);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
+  INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
+#if CONFIG_MISC_FIXES
+  INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207e);
+  INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45e);
+  INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63);  // 8-bit uses d63e; confirm
+#else
+  INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207);
+  INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45);
+  INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63);
+#endif
+  INIT_ALL_SIZES(pred_high[D117_PRED], highbd_d117);
+  INIT_ALL_SIZES(pred_high[D135_PRED], highbd_d135);
+  INIT_ALL_SIZES(pred_high[D153_PRED], highbd_d153);
+  INIT_ALL_SIZES(pred_high[TM_PRED], highbd_tm);
+
+  INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128);
+  INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top);
+  INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left);
+  INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#undef INIT_ALL_SIZES
+#undef INIT_NO_4X4
+}
+
+#if CONFIG_MISC_FIXES
+static INLINE void memset16(uint16_t *dst, int val, int n) {
+  // Store val (converted to uint16_t) into each of the n entries at dst.
+  for (; n != 0; --n) *dst++ = val;
+}
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void build_intra_predictors_high(const MACROBLOCKD *xd,
+                                        const uint8_t *ref8,
+                                        int ref_stride,
+                                        uint8_t *dst8,
+                                        int dst_stride,
+                                        PREDICTION_MODE mode,
+                                        TX_SIZE tx_size,
+#if CONFIG_MISC_FIXES
+                                        int n_top_px, int n_topright_px,
+                                        int n_left_px, int n_bottomleft_px,
+#else
+                                        int up_available,
+                                        int left_available,
+                                        int right_available,
+#endif
+                                        int x, int y,
+                                        int plane, int bd) {
+  int i;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+#if CONFIG_MISC_FIXES
+  DECLARE_ALIGNED(16, uint16_t, left_col[64]);  // up to 2 * bs entries written
+#else
+  DECLARE_ALIGNED(16, uint16_t, left_col[32]);  // at most bs entries written
+#endif
+  DECLARE_ALIGNED(16, uint16_t, above_data[64 + 16]);
+  uint16_t *above_row = above_data + 16;  // offset so above_row[-1] is valid
+  const uint16_t *const_above_row = above_row;
+  const int bs = 4 << tx_size;
+#if CONFIG_MISC_FIXES
+  const uint16_t *above_ref = ref - ref_stride;
+#else
+  int frame_width, frame_height;
+  int x0, y0;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+#endif
+  const int need_left = extend_modes[mode] & NEED_LEFT;
+  const int need_above = extend_modes[mode] & NEED_ABOVE;
+  const int need_aboveright = extend_modes[mode] & NEED_ABOVERIGHT;
+  int base = 128 << (bd - 8);  // 128 scaled up to bd bits
+  // 127 127 127 .. 127 127 127 127 127 127
+  // 129  A   B  ..  Y   Z
+  // 129  C   D  ..  W   X
+  // 129  E   F  ..  U   V
+  // 129  G   H  ..  S   T   T   T   T   T
+
+#if CONFIG_MISC_FIXES
+  (void) x;
+  (void) y;
+  (void) plane;
+  (void) need_left;
+  (void) need_above;
+  (void) need_aboveright;
+
+  // NEED_LEFT: copy the left column (plus bottom-left), then pad the rest.
+  if (extend_modes[mode] & NEED_LEFT) {
+    const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+    i = 0;
+    if (n_left_px > 0) {
+      for (; i < n_left_px; i++)
+        left_col[i] = ref[i * ref_stride - 1];
+      if (need_bottom && n_bottomleft_px > 0) {
+        assert(i == bs);
+        for (; i < bs + n_bottomleft_px; i++)
+          left_col[i] = ref[i * ref_stride - 1];
+      }
+      if (i < (bs << need_bottom))
+        memset16(&left_col[i], left_col[i - 1], (bs << need_bottom) - i);
+    } else {
+      memset16(left_col, base + 1, bs << need_bottom);
+    }
+  }
+
+  // NEED_ABOVE: copy the above row (plus above-right), then pad the rest.
+  if (extend_modes[mode] & NEED_ABOVE) {
+    const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+    if (n_top_px > 0) {
+      memcpy(above_row, above_ref, n_top_px * 2);
+      i = n_top_px;
+      if (need_right && n_topright_px > 0) {
+        assert(n_top_px == bs);
+        memcpy(above_row + bs, above_ref + bs, n_topright_px * 2);
+        i += n_topright_px;
+      }
+      if (i < (bs << need_right))
+        memset16(&above_row[i], above_row[i - 1], (bs << need_right) - i);
+    } else {
+      memset16(above_row, base - 1, bs << need_right);
+    }
+  }
+
+  if (extend_modes[mode] & NEED_ABOVELEFT) {
+    above_row[-1] = n_top_px > 0 ?
+        (n_left_px > 0 ? above_ref[-1] : base + 1) : base - 1;
+  }
+#else
+  // Get current frame pointer, width and height.
+  if (plane == 0) {
+    frame_width = xd->cur_buf->y_width;
+    frame_height = xd->cur_buf->y_height;
+  } else {
+    frame_width = xd->cur_buf->uv_width;
+    frame_height = xd->cur_buf->uv_height;
+  }
+
+  // Get block position in current frame.
+  x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+  y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+  // NEED_LEFT
+  if (need_left) {
+    if (left_available) {
+      if (xd->mb_to_bottom_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (y0 + bs <= frame_height) {
+          for (i = 0; i < bs; ++i)
+            left_col[i] = ref[i * ref_stride - 1];
+        } else {
+          const int extend_bottom = frame_height - y0;
+          for (i = 0; i < extend_bottom; ++i)
+            left_col[i] = ref[i * ref_stride - 1];
+          for (; i < bs; ++i)
+            left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        for (i = 0; i < bs; ++i)
+          left_col[i] = ref[i * ref_stride - 1];
+      }
+    } else {
+      // TODO(Peter): this value should probably change for high bitdepth
+      vpx_memset16(left_col, base + 1, bs);
+    }
+  }
+
+  // NEED_ABOVE
+  if (need_above) {
+    if (up_available) {
+      const uint16_t *above_ref = ref - ref_stride;
+      if (xd->mb_to_right_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (x0 + bs <= frame_width) {
+          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+        } else if (x0 <= frame_width) {
+          const int r = frame_width - x0;
+          memcpy(above_row, above_ref, r * sizeof(above_row[0]));
+          vpx_memset16(above_row + r, above_row[r - 1], x0 + bs - frame_width);
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        if (bs == 4 && right_available && left_available) {
+          const_above_row = above_ref;
+        } else {
+          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+        }
+      }
+      above_row[-1] = left_available ? above_ref[-1] : (base + 1);
+    } else {
+      vpx_memset16(above_row, base - 1, bs);
+      above_row[-1] = base - 1;
+    }
+  }
+
+  // NEED_ABOVERIGHT
+  if (need_aboveright) {
+    if (up_available) {
+      const uint16_t *above_ref = ref - ref_stride;
+      if (xd->mb_to_right_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (x0 + 2 * bs <= frame_width) {
+          if (right_available && bs == 4) {
+            memcpy(above_row, above_ref, 2 * bs * sizeof(above_row[0]));
+          } else {
+            memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+          }
+        } else if (x0 + bs <= frame_width) {
+          const int r = frame_width - x0;
+          if (right_available && bs == 4) {
+            memcpy(above_row, above_ref, r * sizeof(above_row[0]));
+            vpx_memset16(above_row + r, above_row[r - 1],
+                         x0 + 2 * bs - frame_width);
+          } else {
+            memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+          }
+        } else if (x0 <= frame_width) {
+          const int r = frame_width - x0;
+          memcpy(above_row, above_ref, r * sizeof(above_row[0]));
+          vpx_memset16(above_row + r, above_row[r - 1],
+                       x0 + 2 * bs - frame_width);
+        }
+        // TODO(Peter) this value should probably change for high bitdepth
+        above_row[-1] = left_available ? above_ref[-1] : (base + 1);
+      } else {
+        /* faster path if the block does not need extension */
+        if (bs == 4 && right_available && left_available) {
+          const_above_row = above_ref;
+        } else {
+          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+          if (bs == 4 && right_available)
+            memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0]));
+          else
+            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+          // TODO(Peter): this value should probably change for high bitdepth
+          above_row[-1] = left_available ? above_ref[-1] : (base + 1);
+        }
+      }
+    } else {
+      vpx_memset16(above_row, base - 1, bs * 2);
+      // TODO(Peter): this value should probably change for high bitdepth
+      above_row[-1] = base - 1;
+    }
+  }
+#endif
+
+  // predict: DC picks a variant by edge availability, other modes by mode.
+  if (mode == DC_PRED) {
+#if CONFIG_MISC_FIXES
+    dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride,
+                                                       const_above_row,
+                                                       left_col, xd->bd);
+#else
+    dc_pred_high[left_available][up_available][tx_size](dst, dst_stride,
+                                                        const_above_row,
+                                                        left_col, xd->bd);
+#endif
+  } else {
+    pred_high[mode][tx_size](dst, dst_stride, const_above_row, left_col,
+                             xd->bd);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
+                                   int ref_stride, uint8_t *dst, int dst_stride,
+                                   PREDICTION_MODE mode, TX_SIZE tx_size,
+#if CONFIG_MISC_FIXES
+                                   int n_top_px, int n_topright_px,
+                                   int n_left_px, int n_bottomleft_px,
+#else
+                                   int up_available, int left_available,
+                                   int right_available,
+#endif
+                                   int x, int y, int plane) {
+  int i;
+#if CONFIG_MISC_FIXES
+  DECLARE_ALIGNED(16, uint8_t, left_col[64]);  // up to 2 * bs entries written
+  const uint8_t *above_ref = ref - ref_stride;
+#else
+  DECLARE_ALIGNED(16, uint8_t, left_col[32]);  // at most bs entries written
+  int frame_width, frame_height;
+  int x0, y0;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+#endif
+  DECLARE_ALIGNED(16, uint8_t, above_data[64 + 16]);
+  uint8_t *above_row = above_data + 16;  // offset so above_row[-1] is valid
+  const uint8_t *const_above_row = above_row;
+  const int bs = 4 << tx_size;
+
+  // 127 127 127 .. 127 127 127 127 127 127
+  // 129  A   B  ..  Y   Z
+  // 129  C   D  ..  W   X
+  // 129  E   F  ..  U   V
+  // 129  G   H  ..  S   T   T   T   T   T
+  // ..
+
+#if CONFIG_MISC_FIXES
+  (void) xd;
+  (void) x;
+  (void) y;
+  (void) plane;
+  assert(n_top_px >= 0);
+  assert(n_topright_px >= 0);
+  assert(n_left_px >= 0);
+  assert(n_bottomleft_px >= 0);
+#else
+  // Get current frame pointer, width and height.
+  if (plane == 0) {
+    frame_width = xd->cur_buf->y_width;
+    frame_height = xd->cur_buf->y_height;
+  } else {
+    frame_width = xd->cur_buf->uv_width;
+    frame_height = xd->cur_buf->uv_height;
+  }
+
+  // Get block position in current frame.
+  x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+  y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+#endif
+
+  // NEED_LEFT: copy the left column (plus bottom-left), then pad the rest.
+  if (extend_modes[mode] & NEED_LEFT) {
+#if CONFIG_MISC_FIXES
+    const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+    i = 0;
+    if (n_left_px > 0) {
+      for (; i < n_left_px; i++)
+        left_col[i] = ref[i * ref_stride - 1];
+      if (need_bottom && n_bottomleft_px > 0) {
+        assert(i == bs);
+        for (; i < bs + n_bottomleft_px; i++)
+          left_col[i] = ref[i * ref_stride - 1];
+      }
+      if (i < (bs << need_bottom))
+        memset(&left_col[i], left_col[i - 1], (bs << need_bottom) - i);
+    } else {
+      memset(left_col, 129, bs << need_bottom);
+    }
+#else
+    if (left_available) {
+      if (xd->mb_to_bottom_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (y0 + bs <= frame_height) {
+          for (i = 0; i < bs; ++i)
+            left_col[i] = ref[i * ref_stride - 1];
+        } else {
+          const int extend_bottom = frame_height - y0;
+          for (i = 0; i < extend_bottom; ++i)
+            left_col[i] = ref[i * ref_stride - 1];
+          for (; i < bs; ++i)
+            left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        for (i = 0; i < bs; ++i)
+          left_col[i] = ref[i * ref_stride - 1];
+      }
+    } else {
+      memset(left_col, 129, bs);
+    }
+#endif
+  }
+
+  // NEED_ABOVE: copy the above row (plus above-right), then pad the rest.
+  if (extend_modes[mode] & NEED_ABOVE) {
+#if CONFIG_MISC_FIXES
+    const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+    if (n_top_px > 0) {
+      memcpy(above_row, above_ref, n_top_px);
+      i = n_top_px;
+      if (need_right && n_topright_px > 0) {
+        assert(n_top_px == bs);
+        memcpy(above_row + bs, above_ref + bs, n_topright_px);
+        i += n_topright_px;
+      }
+      if (i < (bs << need_right))
+        memset(&above_row[i], above_row[i - 1], (bs << need_right) - i);
+    } else {
+      memset(above_row, 127, bs << need_right);
+    }
+#else
+    if (up_available) {
+      const uint8_t *above_ref = ref - ref_stride;
+      if (xd->mb_to_right_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (x0 + bs <= frame_width) {
+          memcpy(above_row, above_ref, bs);
+        } else if (x0 <= frame_width) {
+          const int r = frame_width - x0;
+          memcpy(above_row, above_ref, r);
+          memset(above_row + r, above_row[r - 1], x0 + bs - frame_width);
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        if (bs == 4 && right_available && left_available) {
+          const_above_row = above_ref;
+        } else {
+          memcpy(above_row, above_ref, bs);
+        }
+      }
+      above_row[-1] = left_available ? above_ref[-1] : 129;
+    } else {
+      memset(above_row, 127, bs);
+      above_row[-1] = 127;
+    }
+#endif
+  }
+
+#if CONFIG_MISC_FIXES
+  if (extend_modes[mode] & NEED_ABOVELEFT) {
+    above_row[-1] = n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : 129) : 127;
+  }
+#else
+  // NEED_ABOVERIGHT
+  if (extend_modes[mode] & NEED_ABOVERIGHT) {
+    if (up_available) {
+      const uint8_t *above_ref = ref - ref_stride;
+      if (xd->mb_to_right_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (x0 + 2 * bs <= frame_width) {
+          if (right_available && bs == 4) {
+            memcpy(above_row, above_ref, 2 * bs);
+          } else {
+            memcpy(above_row, above_ref, bs);
+            memset(above_row + bs, above_row[bs - 1], bs);
+          }
+        } else if (x0 + bs <= frame_width) {
+          const int r = frame_width - x0;
+          if (right_available && bs == 4) {
+            memcpy(above_row, above_ref, r);
+            memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
+          } else {
+            memcpy(above_row, above_ref, bs);
+            memset(above_row + bs, above_row[bs - 1], bs);
+          }
+        } else if (x0 <= frame_width) {
+          const int r = frame_width - x0;
+          memcpy(above_row, above_ref, r);
+          memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        if (bs == 4 && right_available && left_available) {
+          const_above_row = above_ref;
+        } else {
+          memcpy(above_row, above_ref, bs);
+          if (bs == 4 && right_available)
+            memcpy(above_row + bs, above_ref + bs, bs);
+          else
+            memset(above_row + bs, above_row[bs - 1], bs);
+        }
+      }
+      above_row[-1] = left_available ? above_ref[-1] : 129;
+    } else {
+      memset(above_row, 127, bs * 2);
+      above_row[-1] = 127;
+    }
+  }
+#endif
+
+  // predict: DC picks a variant by edge availability, other modes by mode.
+  if (mode == DC_PRED) {
+#if CONFIG_MISC_FIXES
+    dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride,
+                                                  const_above_row, left_col);
+#else
+    dc_pred[left_available][up_available][tx_size](dst, dst_stride,
+                                                   const_above_row, left_col);
+#endif
+  } else {
+    pred[mode][tx_size](dst, dst_stride, const_above_row, left_col);
+  }
+}
+
+void vp10_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, int bhl_in,
+                             TX_SIZE tx_size, PREDICTION_MODE mode,
+                             const uint8_t *ref, int ref_stride,
+                             uint8_t *dst, int dst_stride,
+                             int aoff, int loff, int plane) {
+  const int txw = (1 << tx_size);  // transform size in 4-pixel units
+  const int have_top = loff || xd->up_available;
+  const int have_left = aoff || xd->left_available;
+  const int x = aoff * 4;  // pixel offset of the transform block
+  const int y = loff * 4;
+#if CONFIG_MISC_FIXES
+  const int bw = VPXMAX(2, 1 << bwl_in);
+  const int bh = VPXMAX(2, 1 << bhl_in);
+  const int mi_row = -xd->mb_to_top_edge >> 6;
+  const int mi_col = -xd->mb_to_left_edge >> 6;
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int right_available =
+      mi_col + (bw >> !pd->subsampling_x) < xd->tile.mi_col_end;
+  const int have_right = vp10_has_right(bsize, mi_row, mi_col,
+                                        right_available,
+                                        tx_size, loff, aoff,
+                                        pd->subsampling_x);
+  const int have_bottom = vp10_has_bottom(bsize, mi_row, mi_col,
+                                          xd->mb_to_bottom_edge > 0,
+                                          tx_size, loff, aoff,
+                                          pd->subsampling_y);
+  const int wpx = 4 * bw;
+  const int hpx = 4 * bh;
+  const int txpx = 4 * txw;
+  // xr/yd: presumably pixels left of the frame edge past this tx block; verify.
+  int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + (wpx - x - txpx);
+  int yd =
+      (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + (hpx - y - txpx);
+#else
+  const int bw = (1 << bwl_in);
+  const int have_right = (aoff + txw) < bw;
+#endif  // CONFIG_MISC_FIXES
+
+#if CONFIG_MISC_FIXES
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,
+                                tx_size,
+                                have_top ? VPXMIN(txpx, xr + txpx) : 0,
+                                have_top && have_right ? VPXMIN(txpx, xr) : 0,
+                                have_left ? VPXMIN(txpx, yd + txpx) : 0,
+                                have_bottom && have_left ? VPXMIN(txpx, yd) : 0,
+                                x, y, plane, xd->bd);
+    return;
+  }
+#endif
+  build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode,
+                         tx_size,
+                         have_top ? VPXMIN(txpx, xr + txpx) : 0,
+                         have_top && have_right ? VPXMIN(txpx, xr) : 0,
+                         have_left ? VPXMIN(txpx, yd + txpx) : 0,
+                         have_bottom && have_left ? VPXMIN(txpx, yd) : 0,
+                         x, y, plane);
+#else  // CONFIG_MISC_FIXES
+  (void) bhl_in;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,
+                                tx_size, have_top, have_left, have_right,
+                                x, y, plane, xd->bd);
+    return;
+  }
+#endif
+  build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
+                         have_top, have_left, have_right, x, y, plane);
+#endif  // CONFIG_MISC_FIXES
+}
+
+void vp10_init_intra_predictors(void) {
+  once(vp10_init_intra_predictors_internal);  // table setup through once()
+}
diff --git a/libs/libvpx/vp10/common/reconintra.h b/libs/libvpx/vp10/common/reconintra.h
new file mode 100644
index 0000000000..f451fb8f70
--- /dev/null
+++ b/libs/libvpx/vp10/common/reconintra.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_RECONINTRA_H_
+#define VP10_COMMON_RECONINTRA_H_
+
+#include "vpx/vpx_integer.h"
+#include "vp10/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_init_intra_predictors(void);
+
+void vp10_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, int bhl_in,
+                             TX_SIZE tx_size, PREDICTION_MODE mode,
+                             const uint8_t *ref, int ref_stride,
+                             uint8_t *dst, int dst_stride,
+                             int aoff, int loff, int plane);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_RECONINTRA_H_
diff --git a/libs/libvpx/vp10/common/scale.c b/libs/libvpx/vp10/common/scale.c
new file mode 100644
index 0000000000..ce6062c195
--- /dev/null
+++ b/libs/libvpx/vp10/common/scale.c
@@ -0,0 +1,166 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp10/common/filter.h"
+#include "vp10/common/scale.h"
+#include "vpx_dsp/vpx_filter.h"
+
+static INLINE int scaled_x(int val, const struct scale_factors *sf) {
+  return (int)(((int64_t)val * sf->x_scale_fp) >> REF_SCALE_SHIFT);
+}
+
+static INLINE int scaled_y(int val, const struct scale_factors *sf) {
+  return (int)(((int64_t)val * sf->y_scale_fp) >> REF_SCALE_SHIFT);
+}
+
+static int unscaled_value(int val, const struct scale_factors *sf) {
+  (void)sf;  // unused; signature matches the scale_value_x/y callbacks
+  return val;
+}
+
+static int get_fixed_point_scale_factor(int other_size, int this_size) {
+  // Computed once per reference frame so that decoding and encoding (and
+  // hardware, via its driver) scale by multiply-and-shift, not division.
+  // NOTE(review): the shift happens in int; confirm it cannot overflow.
+  const int other_size_fp = other_size << REF_SCALE_SHIFT;
+  return other_size_fp / this_size;
+}
+
+MV32 vp10_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) {
+  // Low SUBPEL_MASK bits of the scaled position, added to each component.
+  const int frac_x = scaled_x(x << SUBPEL_BITS, sf) & SUBPEL_MASK;
+  const int frac_y = scaled_y(y << SUBPEL_BITS, sf) & SUBPEL_MASK;
+  const int scaled_row = scaled_y(mv->row, sf) + frac_y;
+  const int scaled_col = scaled_x(mv->col, sf) + frac_x;
+  const MV32 scaled = { scaled_row, scaled_col };
+  return scaled;
+}
+
+// Fills *sf for predicting from an other_w x other_h reference into a
+// this_w x this_h frame: fixed-point scale factors, q4 step sizes, the
+// value-scaling callbacks and the predict[][][] convolve table (plus
+// highbd_predict[][][] when CONFIG_VP9_HIGHBITDEPTH is set and use_highbd is
+// nonzero). When valid_ref_frame_size() rejects the sizes, only the two
+// scale factors are written, both as REF_INVALID_SCALE.
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_setup_scale_factors_for_frame(struct scale_factors *sf,
+                                       int other_w, int other_h,
+                                       int this_w, int this_h,
+                                       int use_highbd) {
+#else
+void vp10_setup_scale_factors_for_frame(struct scale_factors *sf,
+                                       int other_w, int other_h,
+                                       int this_w, int this_h) {
+#endif
+  int x_unscaled, y_unscaled;
+
+  if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
+    sf->x_scale_fp = sf->y_scale_fp = REF_INVALID_SCALE;
+    return;
+  }
+
+  sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
+  sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
+  sf->x_step_q4 = scaled_x(16, sf);
+  sf->y_step_q4 = scaled_y(16, sf);
+  // A step of exactly 16 means there is no scaling along that axis.
+  x_unscaled = (sf->x_step_q4 == 16);
+  y_unscaled = (sf->y_step_q4 == 16);
+
+  // Values pass through unchanged unless vp10_is_scaled() reports that at
+  // least one factor differs from REF_NO_SCALE.
+  sf->scale_value_x = vp10_is_scaled(sf) ? scaled_x : unscaled_value;
+  sf->scale_value_y = vp10_is_scaled(sf) ? scaled_y : unscaled_value;
+
+  // TODO(agrange): Investigate the best choice of functions to use here
+  // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
+  // to do at full-pel offsets. The current selection, where the filter is
+  // applied in one direction only, and not at all for 0,0, seems to give the
+  // best quality, but it may be worth trying an additional mode that does
+  // do the filtering on full-pel.
+  if (x_unscaled && y_unscaled) {
+    // No scaling in either direction.
+    sf->predict[0][0][0] = vpx_convolve_copy;
+    sf->predict[0][0][1] = vpx_convolve_avg;
+    sf->predict[0][1][0] = vpx_convolve8_vert;
+    sf->predict[0][1][1] = vpx_convolve8_avg_vert;
+    sf->predict[1][0][0] = vpx_convolve8_horiz;
+    sf->predict[1][0][1] = vpx_convolve8_avg_horiz;
+  } else if (x_unscaled) {
+    // No scaling in x direction. Must always scale in the y direction.
+    sf->predict[0][0][0] = vpx_convolve8_vert;
+    sf->predict[0][0][1] = vpx_convolve8_avg_vert;
+    sf->predict[0][1][0] = vpx_convolve8_vert;
+    sf->predict[0][1][1] = vpx_convolve8_avg_vert;
+    sf->predict[1][0][0] = vpx_convolve8;
+    sf->predict[1][0][1] = vpx_convolve8_avg;
+  } else if (y_unscaled) {
+    // No scaling in the y direction. Must always scale in the x direction.
+    sf->predict[0][0][0] = vpx_convolve8_horiz;
+    sf->predict[0][0][1] = vpx_convolve8_avg_horiz;
+    sf->predict[0][1][0] = vpx_convolve8;
+    sf->predict[0][1][1] = vpx_convolve8_avg;
+    sf->predict[1][0][0] = vpx_convolve8_horiz;
+    sf->predict[1][0][1] = vpx_convolve8_avg_horiz;
+  } else {
+    // Must always scale in both directions.
+    sf->predict[0][0][0] = vpx_convolve8;
+    sf->predict[0][0][1] = vpx_convolve8_avg;
+    sf->predict[0][1][0] = vpx_convolve8;
+    sf->predict[0][1][1] = vpx_convolve8_avg;
+    sf->predict[1][0][0] = vpx_convolve8;
+    sf->predict[1][0][1] = vpx_convolve8_avg;
+  }
+  // 2D subpel motion always gets filtered in both directions
+  sf->predict[1][1][0] = vpx_convolve8;
+  sf->predict[1][1][1] = vpx_convolve8_avg;
+#if CONFIG_VP9_HIGHBITDEPTH
+  // Same selection again for the high-bitdepth table, only when requested.
+  if (use_highbd) {
+    if (x_unscaled && y_unscaled) {
+      // No scaling in either direction.
+      sf->highbd_predict[0][0][0] = vpx_highbd_convolve_copy;
+      sf->highbd_predict[0][0][1] = vpx_highbd_convolve_avg;
+      sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert;
+      sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert;
+      sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz;
+      sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz;
+    } else if (x_unscaled) {
+      // No scaling in x direction. Must always scale in the y direction.
+      sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_vert;
+      sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_vert;
+      sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert;
+      sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert;
+      sf->highbd_predict[1][0][0] = vpx_highbd_convolve8;
+      sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg;
+    } else if (y_unscaled) {
+      // No scaling in the y direction. Must always scale in the x direction.
+      sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_horiz;
+      sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_horiz;
+      sf->highbd_predict[0][1][0] = vpx_highbd_convolve8;
+      sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg;
+      sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz;
+      sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz;
+    } else {
+      // Must always scale in both directions.
+      sf->highbd_predict[0][0][0] = vpx_highbd_convolve8;
+      sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg;
+      sf->highbd_predict[0][1][0] = vpx_highbd_convolve8;
+      sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg;
+      sf->highbd_predict[1][0][0] = vpx_highbd_convolve8;
+      sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg;
+    }
+    // 2D subpel motion always gets filtered in both directions.
+    sf->highbd_predict[1][1][0] = vpx_highbd_convolve8;
+    sf->highbd_predict[1][1][1] = vpx_highbd_convolve8_avg;
+  }
+#endif
+}
diff --git a/libs/libvpx/vp10/common/scale.h b/libs/libvpx/vp10/common/scale.h
new file mode 100644
index 0000000000..833f6c4119
--- /dev/null
+++ b/libs/libvpx/vp10/common/scale.h
@@ -0,0 +1,75 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_SCALE_H_
+#define VP10_COMMON_SCALE_H_
+
+#include "vp10/common/mv.h"
+#include "vpx_dsp/vpx_convolve.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define REF_SCALE_SHIFT 14  // fractional bits of x_scale_fp / y_scale_fp
+#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)  // fixed-point 1.0 (same size)
+#define REF_INVALID_SCALE (-1)  // marks a rejected reference size
+
+struct scale_factors {
+  int x_scale_fp;   // horizontal fixed point scale factor
+  int y_scale_fp;   // vertical fixed point scale factor
+  int x_step_q4;    // horizontal step; scale.c treats 16 as no x scaling
+  int y_step_q4;    // vertical step; scale.c treats 16 as no y scaling
+  // Set in scale.c to the scaling helpers, or to an identity when unscaled.
+  int (*scale_value_x)(int val, const struct scale_factors *sf);
+  int (*scale_value_y)(int val, const struct scale_factors *sf);
+  // Convolve function tables, filled in by scale.c from the step sizes.
+  convolve_fn_t predict[2][2][2];  // horiz, vert, avg
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_convolve_fn_t highbd_predict[2][2][2];  // horiz, vert, avg
+#endif
+};
+
+MV32 vp10_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Fills *sf; a nonzero use_highbd also fills sf->highbd_predict.
+void vp10_setup_scale_factors_for_frame(struct scale_factors *sf,
+                                       int other_w, int other_h, int this_w,
+                                       int this_h, int use_highbd);
+#else
+// Fills *sf for an other_w x other_h reference and a this_w x this_h frame.
+void vp10_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
+                                       int other_h, int this_w, int this_h);
+#endif
+
+static INLINE int vp10_is_valid_scale(const struct scale_factors *sf) {
+  return !(sf->x_scale_fp == REF_INVALID_SCALE ||
+           sf->y_scale_fp == REF_INVALID_SCALE);  // neither factor invalid
+}
+
+static INLINE int vp10_is_scaled(const struct scale_factors *sf) {
+  if (!vp10_is_valid_scale(sf)) return 0;
+  return sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE;
+}
+
+static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
+                                      int this_width, int this_height) {
+  // Reference more than twice this frame's size on either axis: reject.
+  if (2 * this_width < ref_width || 2 * this_height < ref_height) return 0;
+  // Otherwise this frame may be at most 16x the reference on each axis.
+  return this_width <= 16 * ref_width && this_height <= 16 * ref_height;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_SCALE_H_
diff --git a/libs/libvpx/vp10/common/scan.c b/libs/libvpx/vp10/common/scan.c
new file mode 100644
index 0000000000..7217f6d045
--- /dev/null
+++ b/libs/libvpx/vp10/common/scan.c
@@ -0,0 +1,727 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/scan.h"
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = {
+  0,  4,  1,  5,
+  8,  2, 12,  9,
+  3,  6, 13, 10,
+  7, 14, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = {
+  0,  4,  8,  1,
+  12,  5,  9,  2,
+  13,  6, 10,  3,
+  7, 14, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, row_scan_4x4[16]) = {
+  0,  1,  4,  2,
+  5,  3,  6,  8,
+  9,  7, 12, 10,
+  13, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
+  0,  8,  1, 16,  9,  2, 17, 24,
+  10,  3, 18, 25, 32, 11,  4, 26,
+  33, 19, 40, 12, 34, 27,  5, 41,
+  20, 48, 13, 35, 42, 28, 21,  6,
+  49, 56, 36, 43, 29,  7, 14, 50,
+  57, 44, 22, 37, 15, 51, 58, 30,
+  45, 23, 52, 59, 38, 31, 60, 53,
+  46, 39, 61, 54, 47, 62, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = {
+  0,  8, 16,  1, 24,  9, 32, 17,
+  2, 40, 25, 10, 33, 18, 48,  3,
+  26, 41, 11, 56, 19, 34,  4, 49,
+  27, 42, 12, 35, 20, 57, 50, 28,
+  5, 43, 13, 36, 58, 51, 21, 44,
+  6, 29, 59, 37, 14, 52, 22,  7,
+  45, 60, 30, 15, 38, 53, 23, 46,
+  31, 61, 39, 54, 47, 62, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, row_scan_8x8[64]) = {
+  0,  1,  2,  8,  9,  3, 16, 10,
+  4, 17, 11, 24,  5, 18, 25, 12,
+  19, 26, 32,  6, 13, 20, 33, 27,
+  7, 34, 40, 21, 28, 41, 14, 35,
+  48, 42, 29, 36, 49, 22, 43, 15,
+  56, 37, 50, 44, 30, 57, 23, 51,
+  58, 45, 38, 52, 31, 59, 53, 46,
+  60, 39, 61, 47, 54, 55, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = {
+  0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
+  50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
+  98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
+  100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146,
+  55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25,
+  133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119,
+  26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194,
+  180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59,
+  12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13,
+  226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169,
+  242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108,
+  77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140,
+  230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141,
+  63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142,
+  219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159,
+  251,
+  190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239,
+  255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = {
+  0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
+  34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
+  67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
+  146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85,
+  22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179,
+  225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24,
+  87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227,
+  88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167,
+  213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229,
+  74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59,
+  200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170,
+  60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202,
+  233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125,
+  62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79,
+  126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205,
+  236,
+  159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239,
+  255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, row_scan_16x16[256]) = {
+  0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
+  49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
+  23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
+  25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100,
+  13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102,
+  144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160,
+  89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176,
+  75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136,
+  165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166,
+  167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108,
+  197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170,
+  124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186,
+  156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110,
+  157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111,
+  158,
+  188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220,
+  175,
+  190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254,
+  255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
+  0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160,
+  129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193,
+  68, 131, 37, 100,
+  225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38,
+  258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321,
+  102, 352, 8, 197,
+  71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292,
+  135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293,
+  41, 417, 199, 136,
+  262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105,
+  419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169,
+  295, 420, 106, 451,
+  481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421,
+  75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391,
+  453, 139, 44, 234,
+  484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108,
+  546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577,
+  486, 77, 204, 362,
+  608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173,
+  610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17,
+  111, 238, 48, 143,
+  80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51,
+  83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424,
+  393, 300, 269, 176, 145,
+  52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301,
+  270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581,
+  550, 519, 488, 457, 426, 395,
+  364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737,
+  706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241,
+  210, 179, 117, 86, 55, 738, 707,
+  614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491,
+  367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676,
+  645, 552, 521, 428, 397, 304,
+  273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553,
+  522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26,
+  864, 833, 802, 771, 740, 709,
+  678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306,
+  275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741,
+  710, 679, 617, 586, 555, 493,
+  462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835,
+  742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867,
+  743, 619, 495, 371, 247, 123,
+  896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680,
+  649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929,
+  898, 836, 805, 774, 712, 681,
+  650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154,
+  92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682,
+  651, 620, 589, 558, 527,
+  496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124,
+  93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590,
+  559, 497, 466, 435, 373,
+  342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715,
+  622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623,
+  499, 375, 251, 127,
+  900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560,
+  529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716,
+  685, 654, 592, 561,
+  530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903,
+  872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469,
+  438, 407, 376, 345,
+  314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718,
+  687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998,
+  967, 874, 843, 750,
+  719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503,
+  379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657,
+  564, 533, 440, 409,
+  316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534,
+  472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783,
+  752, 721, 690, 659,
+  628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970,
+  939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381,
+  350, 319, 1002, 971,
+  878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631,
+  507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568,
+  537, 444, 413, 972,
+  941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414,
+  1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601,
+  570, 539, 508, 477,
+  446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571,
+  509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479,
+  1007, 883, 759, 635, 511,
+  912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945,
+  914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915,
+  884, 853, 822, 791,
+  760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823,
+  761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607,
+  1011, 887, 763, 639,
+  916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825,
+  794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733,
+  702, 671, 1013, 982,
+  951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015,
+  891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798,
+  1016, 985, 954, 923,
+  892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863,
+  1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021,
+  990, 959, 1022, 991, 1023,
+};
+
+// Neighbor tables for the scans above: for each position in scan order, the
+// indices of the neighboring coefficients used for it (e.g. coefficient 5
+// of the 4x4 default scan -> {1, 4}), followed by one extra trailing group.
+// NOTE(review): the old "5-tuple / raster order / -1" text did not match.
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 1, 4, 4, 4, 1, 1, 8, 8, 5, 8, 2, 2, 2, 5, 9, 12, 6, 9,
+  3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 4, 4, 0, 0, 8, 8, 1, 1, 5, 5, 1, 1, 9, 9, 2, 2, 6, 6, 2, 2, 3,
+  3, 10, 10, 7, 7, 11, 11, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 1, 1, 4, 4, 2, 2, 5, 5, 4, 4, 8, 8, 6, 6, 8, 8, 9, 9, 12,
+  12, 10, 10, 13, 13, 14, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 8, 8, 0, 0, 16, 16, 1, 1, 24, 24, 9, 9, 1, 1, 32, 32, 17, 17, 2,
+  2, 25, 25, 10, 10, 40, 40, 2, 2, 18, 18, 33, 33, 3, 3, 48, 48, 11, 11, 26,
+  26, 3, 3, 41, 41, 19, 19, 34, 34, 4, 4, 27, 27, 12, 12, 49, 49, 42, 42, 20,
+  20, 4, 4, 35, 35, 5, 5, 28, 28, 50, 50, 43, 43, 13, 13, 36, 36, 5, 5, 21, 21,
+  51, 51, 29, 29, 6, 6, 44, 44, 14, 14, 6, 6, 37, 37, 52, 52, 22, 22, 7, 7, 30,
+  30, 45, 45, 15, 15, 38, 38, 23, 23, 53, 53, 31, 31, 46, 46, 39, 39, 54, 54,
+  47, 47, 55, 55, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 0, 0, 8, 8, 2, 2, 8, 8, 9, 9, 3, 3, 16, 16, 10, 10, 16, 16,
+  4, 4, 17, 17, 24, 24, 11, 11, 18, 18, 25, 25, 24, 24, 5, 5, 12, 12, 19, 19,
+  32, 32, 26, 26, 6, 6, 33, 33, 32, 32, 20, 20, 27, 27, 40, 40, 13, 13, 34, 34,
+  40, 40, 41, 41, 28, 28, 35, 35, 48, 48, 21, 21, 42, 42, 14, 14, 48, 48, 36,
+  36, 49, 49, 43, 43, 29, 29, 56, 56, 22, 22, 50, 50, 57, 57, 44, 44, 37, 37,
+  51, 51, 30, 30, 58, 58, 52, 52, 45, 45, 59, 59, 38, 38, 60, 60, 46, 46, 53,
+  53, 54, 54, 61, 61, 62, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 9, 16, 16, 16, 2, 9, 2, 2, 10, 17, 17,
+  24, 24, 24, 3, 10, 3, 3, 18, 25, 25, 32, 11, 18, 32, 32, 4, 11, 26, 33, 19,
+  26, 4, 4, 33, 40, 12, 19, 40, 40, 5, 12, 27, 34, 34, 41, 20, 27, 13, 20, 5,
+  5, 41, 48, 48, 48, 28, 35, 35, 42, 21, 28, 6, 6, 6, 13, 42, 49, 49, 56, 36,
+  43, 14, 21, 29, 36, 7, 14, 43, 50, 50, 57, 22, 29, 37, 44, 15, 22, 44, 51,
+  51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45, 31, 38, 53, 60, 46, 53, 39,
+  46, 54, 61, 47, 54, 55, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 48, 48, 1, 1, 64, 64,
+  17, 17, 80, 80, 33, 33, 1, 1, 49, 49, 96, 96, 2, 2, 65, 65,
+  18, 18, 112, 112, 34, 34, 81, 81, 2, 2, 50, 50, 128, 128, 3, 3,
+  97, 97, 19, 19, 66, 66, 144, 144, 82, 82, 35, 35, 113, 113, 3, 3,
+  51, 51, 160, 160, 4, 4, 98, 98, 129, 129, 67, 67, 20, 20, 83, 83,
+  114, 114, 36, 36, 176, 176, 4, 4, 145, 145, 52, 52, 99, 99, 5, 5,
+  130, 130, 68, 68, 192, 192, 161, 161, 21, 21, 115, 115, 84, 84, 37, 37,
+  146, 146, 208, 208, 53, 53, 5, 5, 100, 100, 177, 177, 131, 131, 69, 69,
+  6, 6, 224, 224, 116, 116, 22, 22, 162, 162, 85, 85, 147, 147, 38, 38,
+  193, 193, 101, 101, 54, 54, 6, 6, 132, 132, 178, 178, 70, 70, 163, 163,
+  209, 209, 7, 7, 117, 117, 23, 23, 148, 148, 7, 7, 86, 86, 194, 194,
+  225, 225, 39, 39, 179, 179, 102, 102, 133, 133, 55, 55, 164, 164, 8, 8,
+  71, 71, 210, 210, 118, 118, 149, 149, 195, 195, 24, 24, 87, 87, 40, 40,
+  56, 56, 134, 134, 180, 180, 226, 226, 103, 103, 8, 8, 165, 165, 211, 211,
+  72, 72, 150, 150, 9, 9, 119, 119, 25, 25, 88, 88, 196, 196, 41, 41,
+  135, 135, 181, 181, 104, 104, 57, 57, 227, 227, 166, 166, 120, 120, 151, 151,
+  197, 197, 73, 73, 9, 9, 212, 212, 89, 89, 136, 136, 182, 182, 10, 10,
+  26, 26, 105, 105, 167, 167, 228, 228, 152, 152, 42, 42, 121, 121, 213, 213,
+  58, 58, 198, 198, 74, 74, 137, 137, 183, 183, 168, 168, 10, 10, 90, 90,
+  229, 229, 11, 11, 106, 106, 214, 214, 153, 153, 27, 27, 199, 199, 43, 43,
+  184, 184, 122, 122, 169, 169, 230, 230, 59, 59, 11, 11, 75, 75, 138, 138,
+  200, 200, 215, 215, 91, 91, 12, 12, 28, 28, 185, 185, 107, 107, 154, 154,
+  44, 44, 231, 231, 216, 216, 60, 60, 123, 123, 12, 12, 76, 76, 201, 201,
+  170, 170, 232, 232, 139, 139, 92, 92, 13, 13, 108, 108, 29, 29, 186, 186,
+  217, 217, 155, 155, 45, 45, 13, 13, 61, 61, 124, 124, 14, 14, 233, 233,
+  77, 77, 14, 14, 171, 171, 140, 140, 202, 202, 30, 30, 93, 93, 109, 109,
+  46, 46, 156, 156, 62, 62, 187, 187, 15, 15, 125, 125, 218, 218, 78, 78,
+  31, 31, 172, 172, 47, 47, 141, 141, 94, 94, 234, 234, 203, 203, 63, 63,
+  110, 110, 188, 188, 157, 157, 126, 126, 79, 79, 173, 173, 95, 95, 219, 219,
+  142, 142, 204, 204, 235, 235, 111, 111, 158, 158, 127, 127, 189, 189, 220,
+  220, 143, 143, 174, 174, 205, 205, 236, 236, 159, 159, 190, 190, 221, 221,
+  175, 175, 237, 237, 206, 206, 222, 222, 191, 191, 238, 238, 207, 207, 223,
+  223, 239, 239, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 16, 16, 3, 3, 17, 17,
+  16, 16, 4, 4, 32, 32, 18, 18, 5, 5, 33, 33, 32, 32, 19, 19,
+  48, 48, 6, 6, 34, 34, 20, 20, 49, 49, 48, 48, 7, 7, 35, 35,
+  64, 64, 21, 21, 50, 50, 36, 36, 64, 64, 8, 8, 65, 65, 51, 51,
+  22, 22, 37, 37, 80, 80, 66, 66, 9, 9, 52, 52, 23, 23, 81, 81,
+  67, 67, 80, 80, 38, 38, 10, 10, 53, 53, 82, 82, 96, 96, 68, 68,
+  24, 24, 97, 97, 83, 83, 39, 39, 96, 96, 54, 54, 11, 11, 69, 69,
+  98, 98, 112, 112, 84, 84, 25, 25, 40, 40, 55, 55, 113, 113, 99, 99,
+  12, 12, 70, 70, 112, 112, 85, 85, 26, 26, 114, 114, 100, 100, 128, 128,
+  41, 41, 56, 56, 71, 71, 115, 115, 13, 13, 86, 86, 129, 129, 101, 101,
+  128, 128, 72, 72, 130, 130, 116, 116, 27, 27, 57, 57, 14, 14, 87, 87,
+  42, 42, 144, 144, 102, 102, 131, 131, 145, 145, 117, 117, 73, 73, 144, 144,
+  88, 88, 132, 132, 103, 103, 28, 28, 58, 58, 146, 146, 118, 118, 43, 43,
+  160, 160, 147, 147, 89, 89, 104, 104, 133, 133, 161, 161, 119, 119, 160, 160,
+  74, 74, 134, 134, 148, 148, 29, 29, 59, 59, 162, 162, 176, 176, 44, 44,
+  120, 120, 90, 90, 105, 105, 163, 163, 177, 177, 149, 149, 176, 176, 135, 135,
+  164, 164, 178, 178, 30, 30, 150, 150, 192, 192, 75, 75, 121, 121, 60, 60,
+  136, 136, 193, 193, 106, 106, 151, 151, 179, 179, 192, 192, 45, 45, 165, 165,
+  166, 166, 194, 194, 91, 91, 180, 180, 137, 137, 208, 208, 122, 122, 152, 152,
+  208, 208, 195, 195, 76, 76, 167, 167, 209, 209, 181, 181, 224, 224, 107, 107,
+  196, 196, 61, 61, 153, 153, 224, 224, 182, 182, 168, 168, 210, 210, 46, 46,
+  138, 138, 92, 92, 183, 183, 225, 225, 211, 211, 240, 240, 197, 197, 169, 169,
+  123, 123, 154, 154, 198, 198, 77, 77, 212, 212, 184, 184, 108, 108, 226, 226,
+  199, 199, 62, 62, 227, 227, 241, 241, 139, 139, 213, 213, 170, 170, 185, 185,
+  155, 155, 228, 228, 242, 242, 124, 124, 93, 93, 200, 200, 243, 243, 214, 214,
+  215, 215, 229, 229, 140, 140, 186, 186, 201, 201, 78, 78, 171, 171, 109, 109,
+  156, 156, 244, 244, 216, 216, 230, 230, 94, 94, 245, 245, 231, 231, 125, 125,
+  202, 202, 246, 246, 232, 232, 172, 172, 217, 217, 141, 141, 110, 110, 157,
+  157, 187, 187, 247, 247, 126, 126, 233, 233, 218, 218, 248, 248, 188, 188,
+  203, 203, 142, 142, 173, 173, 158, 158, 249, 249, 234, 234, 204, 204, 219,
+  219, 174, 174, 189, 189, 250, 250, 220, 220, 190, 190, 205, 205, 235, 235,
+  206, 206, 236, 236, 251, 251, 221, 221, 252, 252, 222, 222, 237, 237, 238,
+  238, 253, 253, 254, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 32, 32, 17, 32,
+  2, 17, 2, 2, 48, 48, 18, 33, 33, 48, 3, 18, 49, 64, 64, 64,
+  34, 49, 3, 3, 19, 34, 50, 65, 4, 19, 65, 80, 80, 80, 35, 50,
+  4, 4, 20, 35, 66, 81, 81, 96, 51, 66, 96, 96, 5, 20, 36, 51,
+  82, 97, 21, 36, 67, 82, 97, 112, 5, 5, 52, 67, 112, 112, 37, 52,
+  6, 21, 83, 98, 98, 113, 68, 83, 6, 6, 113, 128, 22, 37, 53, 68,
+  84, 99, 99, 114, 128, 128, 114, 129, 69, 84, 38, 53, 7, 22, 7, 7,
+  129, 144, 23, 38, 54, 69, 100, 115, 85, 100, 115, 130, 144, 144, 130, 145,
+  39, 54, 70, 85, 8, 23, 55, 70, 116, 131, 101, 116, 145, 160, 24, 39,
+  8, 8, 86, 101, 131, 146, 160, 160, 146, 161, 71, 86, 40, 55, 9, 24,
+  117, 132, 102, 117, 161, 176, 132, 147, 56, 71, 87, 102, 25, 40, 147, 162,
+  9, 9, 176, 176, 162, 177, 72, 87, 41, 56, 118, 133, 133, 148, 103, 118,
+  10, 25, 148, 163, 57, 72, 88, 103, 177, 192, 26, 41, 163, 178, 192, 192,
+  10, 10, 119, 134, 73, 88, 149, 164, 104, 119, 134, 149, 42, 57, 178, 193,
+  164, 179, 11, 26, 58, 73, 193, 208, 89, 104, 135, 150, 120, 135, 27, 42,
+  74, 89, 208, 208, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43, 58,
+  11, 11, 136, 151, 90, 105, 151, 166, 180, 195, 59, 74, 121, 136, 209, 224,
+  195, 210, 224, 224, 166, 181, 106, 121, 75, 90, 12, 27, 181, 196, 12, 12,
+  210, 225, 152, 167, 167, 182, 137, 152, 28, 43, 196, 211, 122, 137, 91, 106,
+  225, 240, 44, 59, 13, 28, 107, 122, 182, 197, 168, 183, 211, 226, 153, 168,
+  226, 241, 60, 75, 197, 212, 138, 153, 29, 44, 76, 91, 13, 13, 183, 198,
+  123, 138, 45, 60, 212, 227, 198, 213, 154, 169, 169, 184, 227, 242, 92, 107,
+  61, 76, 139, 154, 14, 29, 14, 14, 184, 199, 213, 228, 108, 123, 199, 214,
+  228, 243, 77, 92, 30, 45, 170, 185, 155, 170, 185, 200, 93, 108, 124, 139,
+  214, 229, 46, 61, 200, 215, 229, 244, 15, 30, 109, 124, 62, 77, 140, 155,
+  215, 230, 31, 46, 171, 186, 186, 201, 201, 216, 78, 93, 230, 245, 125, 140,
+  47, 62, 216, 231, 156, 171, 94, 109, 231, 246, 141, 156, 63, 78, 202, 217,
+  187, 202, 110, 125, 217, 232, 172, 187, 232, 247, 79, 94, 157, 172, 126, 141,
+  203, 218, 95, 110, 233, 248, 218, 233, 142, 157, 111, 126, 173, 188, 188, 203,
+  234, 249, 219, 234, 127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235,
+  250, 174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190, 190, 205,
+  236, 251, 206, 221, 237, 252, 191, 206, 222, 237, 207, 222, 238, 253, 223,
+  238, 239, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 64, 64, 33, 64,
+  2, 33, 96, 96, 2, 2, 65, 96, 34, 65, 128, 128, 97, 128, 3, 34,
+  66, 97, 3, 3, 35, 66, 98, 129, 129, 160, 160, 160, 4, 35, 67, 98,
+  192, 192, 4, 4, 130, 161, 161, 192, 36, 67, 99, 130, 5, 36, 68, 99,
+  193, 224, 162, 193, 224, 224, 131, 162, 37, 68, 100, 131, 5, 5, 194, 225,
+  225, 256, 256, 256, 163, 194, 69, 100, 132, 163, 6, 37, 226, 257, 6, 6,
+  195, 226, 257, 288, 101, 132, 288, 288, 38, 69, 164, 195, 133, 164, 258, 289,
+  227, 258, 196, 227, 7, 38, 289, 320, 70, 101, 320, 320, 7, 7, 165, 196,
+  39, 70, 102, 133, 290, 321, 259, 290, 228, 259, 321, 352, 352, 352, 197, 228,
+  134, 165, 71, 102, 8, 39, 322, 353, 291, 322, 260, 291, 103, 134, 353, 384,
+  166, 197, 229, 260, 40, 71, 8, 8, 384, 384, 135, 166, 354, 385, 323, 354,
+  198, 229, 292, 323, 72, 103, 261, 292, 9, 40, 385, 416, 167, 198, 104, 135,
+  230, 261, 355, 386, 416, 416, 293, 324, 324, 355, 9, 9, 41, 72, 386, 417,
+  199, 230, 136, 167, 417, 448, 262, 293, 356, 387, 73, 104, 387, 418, 231, 262,
+  10, 41, 168, 199, 325, 356, 418, 449, 105, 136, 448, 448, 42, 73, 294, 325,
+  200, 231, 10, 10, 357, 388, 137, 168, 263, 294, 388, 419, 74, 105, 419, 450,
+  449, 480, 326, 357, 232, 263, 295, 326, 169, 200, 11, 42, 106, 137, 480, 480,
+  450, 481, 358, 389, 264, 295, 201, 232, 138, 169, 389, 420, 43, 74, 420, 451,
+  327, 358, 11, 11, 481, 512, 233, 264, 451, 482, 296, 327, 75, 106, 170, 201,
+  482, 513, 512, 512, 390, 421, 359, 390, 421, 452, 107, 138, 12, 43, 202, 233,
+  452, 483, 265, 296, 328, 359, 139, 170, 44, 75, 483, 514, 513, 544, 234, 265,
+  297, 328, 422, 453, 12, 12, 391, 422, 171, 202, 76, 107, 514, 545, 453, 484,
+  544, 544, 266, 297, 203, 234, 108, 139, 329, 360, 298, 329, 140, 171, 515,
+  546, 13, 44, 423, 454, 235, 266, 545, 576, 454, 485, 45, 76, 172, 203, 330,
+  361, 576, 576, 13, 13, 267, 298, 546, 577, 77, 108, 204, 235, 455, 486, 577,
+  608, 299, 330, 109, 140, 547, 578, 14, 45, 14, 14, 141, 172, 578, 609, 331,
+  362, 46, 77, 173, 204, 15, 15, 78, 109, 205, 236, 579, 610, 110, 141, 15, 46,
+  142, 173, 47, 78, 174, 205, 16, 16, 79, 110, 206, 237, 16, 47, 111, 142,
+  48, 79, 143, 174, 80, 111, 175, 206, 17, 48, 17, 17, 207, 238, 49, 80,
+  81, 112, 18, 18, 18, 49, 50, 81, 82, 113, 19, 50, 51, 82, 83, 114, 608, 608,
+  484, 515, 360, 391, 236, 267, 112, 143, 19, 19, 640, 640, 609, 640, 516, 547,
+  485, 516, 392, 423, 361, 392, 268, 299, 237, 268, 144, 175, 113, 144, 20, 51,
+  20, 20, 672, 672, 641, 672, 610, 641, 548, 579, 517, 548, 486, 517, 424, 455,
+  393, 424, 362, 393, 300, 331, 269, 300, 238, 269, 176, 207, 145, 176, 114,
+  145, 52, 83, 21, 52, 21, 21, 704, 704, 673, 704, 642, 673, 611, 642, 580,
+  611, 549, 580, 518, 549, 487, 518, 456, 487, 425, 456, 394, 425, 363, 394,
+  332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208, 146, 177, 115,
+  146, 84, 115, 53, 84, 22, 53, 22, 22, 705, 736, 674, 705, 643, 674, 581, 612,
+  550, 581, 519, 550, 457, 488, 426, 457, 395, 426, 333, 364, 302, 333, 271,
+  302, 209, 240, 178, 209, 147, 178, 85, 116, 54, 85, 23, 54, 706, 737, 675,
+  706, 582, 613, 551, 582, 458, 489, 427, 458, 334, 365, 303, 334, 210, 241,
+  179, 210, 86, 117, 55, 86, 707, 738, 583, 614, 459, 490, 335, 366, 211, 242,
+  87, 118, 736, 736, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147, 23, 23,
+  768, 768, 737, 768, 644, 675, 613, 644, 520, 551, 489, 520, 396, 427, 365,
+  396, 272, 303, 241, 272, 148, 179, 117, 148, 24, 55, 24, 24, 800, 800, 769,
+  800, 738, 769, 676, 707, 645, 676, 614, 645, 552, 583, 521, 552, 490, 521,
+  428, 459, 397, 428, 366, 397, 304, 335, 273, 304, 242, 273, 180, 211, 149,
+  180, 118, 149, 56, 87, 25, 56, 25, 25, 832, 832, 801, 832, 770, 801, 739,
+  770, 708, 739, 677, 708, 646, 677, 615, 646, 584, 615, 553, 584, 522, 553,
+  491, 522, 460, 491, 429, 460, 398, 429, 367, 398, 336, 367, 305, 336, 274,
+  305, 243, 274, 212, 243, 181, 212, 150, 181, 119, 150, 88, 119, 57, 88, 26,
+  57, 26, 26, 833, 864, 802, 833, 771, 802, 709, 740, 678, 709, 647, 678, 585,
+  616, 554, 585, 523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337,
+  275, 306, 213, 244, 182, 213, 151, 182, 89, 120, 58, 89, 27, 58, 834, 865,
+  803, 834, 710, 741, 679, 710, 586, 617, 555, 586, 462, 493, 431, 462, 338,
+  369, 307, 338, 214, 245, 183, 214, 90, 121, 59, 90, 835, 866, 711, 742, 587,
+  618, 463, 494, 339, 370, 215, 246, 91, 122, 864, 864, 740, 771, 616, 647,
+  492, 523, 368, 399, 244, 275, 120, 151, 27, 27, 896, 896, 865, 896, 772, 803,
+  741, 772, 648, 679, 617, 648, 524, 555, 493, 524, 400, 431, 369, 400, 276,
+  307, 245, 276, 152, 183, 121, 152, 28, 59, 28, 28, 928, 928, 897, 928, 866,
+  897, 804, 835, 773, 804, 742, 773, 680, 711, 649, 680, 618, 649, 556, 587,
+  525, 556, 494, 525, 432, 463, 401, 432, 370, 401, 308, 339, 277, 308, 246,
+  277, 184, 215, 153, 184, 122, 153, 60, 91, 29, 60, 29, 29, 960, 960, 929,
+  960, 898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774, 712, 743,
+  681, 712, 650, 681, 619, 650, 588, 619, 557, 588, 526, 557, 495, 526, 464,
+  495, 433, 464, 402, 433, 371, 402, 340, 371, 309, 340, 278, 309, 247, 278,
+  216, 247, 185, 216, 154, 185, 123, 154, 92, 123, 61, 92, 30, 61, 30, 30,
+  961, 992, 930, 961, 899, 930, 837, 868, 806, 837, 775, 806, 713, 744, 682,
+  713, 651, 682, 589, 620, 558, 589, 527, 558, 465, 496, 434, 465, 403, 434,
+  341, 372, 310, 341, 279, 310, 217, 248, 186, 217, 155, 186, 93, 124, 62, 93,
+  31, 62, 962, 993, 931, 962, 838, 869, 807, 838, 714, 745, 683, 714, 590, 621,
+  559, 590, 466, 497, 435, 466, 342, 373, 311, 342, 218, 249, 187, 218, 94,
+  125, 63, 94, 963, 994, 839, 870, 715, 746, 591, 622, 467, 498, 343, 374, 219,
+  250, 95, 126, 868, 899, 744, 775, 620, 651, 496, 527, 372, 403, 248, 279,
+  124, 155, 900, 931, 869, 900, 776, 807, 745, 776, 652, 683, 621, 652, 528,
+  559, 497, 528, 404, 435, 373, 404, 280, 311, 249, 280, 156, 187, 125, 156,
+  932, 963, 901, 932, 870, 901, 808, 839, 777, 808, 746, 777, 684, 715, 653,
+  684, 622, 653, 560, 591, 529, 560, 498, 529, 436, 467, 405, 436, 374, 405,
+  312, 343, 281, 312, 250, 281, 188, 219, 157, 188, 126, 157, 964, 995, 933,
+  964, 902, 933, 871, 902, 840, 871, 809, 840, 778, 809, 747, 778, 716, 747,
+  685, 716, 654, 685, 623, 654, 592, 623, 561, 592, 530, 561, 499, 530, 468,
+  499, 437, 468, 406, 437, 375, 406, 344, 375, 313, 344, 282, 313, 251, 282,
+  220, 251, 189, 220, 158, 189, 127, 158, 965, 996, 934, 965, 903, 934, 841,
+  872, 810, 841, 779, 810, 717, 748, 686, 717, 655, 686, 593, 624, 562, 593,
+  531, 562, 469, 500, 438, 469, 407, 438, 345, 376, 314, 345, 283, 314, 221,
+  252, 190, 221, 159, 190, 966, 997, 935, 966, 842, 873, 811, 842, 718, 749,
+  687, 718, 594, 625, 563, 594, 470, 501, 439, 470, 346, 377, 315, 346, 222,
+  253, 191, 222, 967, 998, 843, 874, 719, 750, 595, 626, 471, 502, 347, 378,
+  223, 254, 872, 903, 748, 779, 624, 655, 500, 531, 376, 407, 252, 283, 904,
+  935, 873, 904, 780, 811, 749, 780, 656, 687, 625, 656, 532, 563, 501, 532,
+  408, 439, 377, 408, 284, 315, 253, 284, 936, 967, 905, 936, 874, 905, 812,
+  843, 781, 812, 750, 781, 688, 719, 657, 688, 626, 657, 564, 595, 533, 564,
+  502, 533, 440, 471, 409, 440, 378, 409, 316, 347, 285, 316, 254, 285, 968,
+  999, 937, 968, 906, 937, 875, 906, 844, 875, 813, 844, 782, 813, 751, 782,
+  720, 751, 689, 720, 658, 689, 627, 658, 596, 627, 565, 596, 534, 565, 503,
+  534, 472, 503, 441, 472, 410, 441, 379, 410, 348, 379, 317, 348, 286, 317,
+  255, 286, 969, 1000, 938, 969, 907, 938, 845, 876, 814, 845, 783, 814, 721,
+  752, 690, 721, 659, 690, 597, 628, 566, 597, 535, 566, 473, 504, 442, 473,
+  411, 442, 349, 380, 318, 349, 287, 318, 970, 1001, 939, 970, 846, 877, 815,
+  846, 722, 753, 691, 722, 598, 629, 567, 598, 474, 505, 443, 474, 350, 381,
+  319, 350, 971, 1002, 847, 878, 723, 754, 599, 630, 475, 506, 351, 382, 876,
+  907, 752, 783, 628, 659, 504, 535, 380, 411, 908, 939, 877, 908, 784, 815,
+  753, 784, 660, 691, 629, 660, 536, 567, 505, 536, 412, 443, 381, 412, 940,
+  971, 909, 940, 878, 909, 816, 847, 785, 816, 754, 785, 692, 723, 661, 692,
+  630, 661, 568, 599, 537, 568, 506, 537, 444, 475, 413, 444, 382, 413, 972,
+  1003, 941, 972, 910, 941, 879, 910, 848, 879, 817, 848, 786, 817, 755, 786,
+  724, 755, 693, 724, 662, 693, 631, 662, 600, 631, 569, 600, 538, 569, 507,
+  538, 476, 507, 445, 476, 414, 445, 383, 414, 973, 1004, 942, 973, 911, 942,
+  849, 880, 818, 849, 787, 818, 725, 756, 694, 725, 663, 694, 601, 632, 570,
+  601, 539, 570, 477, 508, 446, 477, 415, 446, 974, 1005, 943, 974, 850, 881,
+  819, 850, 726, 757, 695, 726, 602, 633, 571, 602, 478, 509, 447, 478, 975,
+  1006, 851, 882, 727, 758, 603, 634, 479, 510, 880, 911, 756, 787, 632, 663,
+  508, 539, 912, 943, 881, 912, 788, 819, 757, 788, 664, 695, 633, 664, 540,
+  571, 509, 540, 944, 975, 913, 944, 882, 913, 820, 851, 789, 820, 758, 789,
+  696, 727, 665, 696, 634, 665, 572, 603, 541, 572, 510, 541, 976, 1007, 945,
+  976, 914, 945, 883, 914, 852, 883, 821, 852, 790, 821, 759, 790, 728, 759,
+  697, 728, 666, 697, 635, 666, 604, 635, 573, 604, 542, 573, 511, 542, 977,
+  1008, 946, 977, 915, 946, 853, 884, 822, 853, 791, 822, 729, 760, 698, 729,
+  667, 698, 605, 636, 574, 605, 543, 574, 978, 1009, 947, 978, 854, 885, 823,
+  854, 730, 761, 699, 730, 606, 637, 575, 606, 979, 1010, 855, 886, 731, 762,
+  607, 638, 884, 915, 760, 791, 636, 667, 916, 947, 885, 916, 792, 823, 761,
+  792, 668, 699, 637, 668, 948, 979, 917, 948, 886, 917, 824, 855, 793, 824,
+  762, 793, 700, 731, 669, 700, 638, 669, 980, 1011, 949, 980, 918, 949, 887,
+  918, 856, 887, 825, 856, 794, 825, 763, 794, 732, 763, 701, 732, 670, 701,
+  639, 670, 981, 1012, 950, 981, 919, 950, 857, 888, 826, 857, 795, 826, 733,
+  764, 702, 733, 671, 702, 982, 1013, 951, 982, 858, 889, 827, 858, 734, 765,
+  703, 734, 983, 1014, 859, 890, 735, 766, 888, 919, 764, 795, 920, 951, 889,
+  920, 796, 827, 765, 796, 952, 983, 921, 952, 890, 921, 828, 859, 797, 828,
+  766, 797, 984, 1015, 953, 984, 922, 953, 891, 922, 860, 891, 829, 860, 798,
+  829, 767, 798, 985, 1016, 954, 985, 923, 954, 861, 892, 830, 861, 799, 830,
+  986, 1017, 955, 986, 862, 893, 831, 862, 987, 1018, 863, 894, 892, 923, 924,
+  955, 893, 924, 956, 987, 925, 956, 894, 925, 988, 1019, 957, 988, 926, 957,
+  895, 926, 989, 1020, 958, 989, 927, 958, 990, 1021, 959, 990, 991, 1022, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_4x4[16]) = {
+  0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_col_iscan_4x4[16]) = {
+  0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_row_iscan_4x4[16]) = {
+  0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_col_iscan_8x8[64]) = {
+  0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51,
+  2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56,
+  6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60,
+  14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_row_iscan_8x8[64]) = {
+  0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39,
+  6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52,
+  18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59,
+  32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_8x8[64]) = {
+  0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44,
+  3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53,
+  12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60,
+  25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_col_iscan_16x16[256]) = {
+  0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198,
+  1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212,
+  2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216,
+  3, 10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218,
+  5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223,
+  7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 166, 182, 200, 215, 228,
+  9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230,
+  13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235,
+  17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237,
+  22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240,
+  27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244,
+  33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247,
+  42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251,
+  50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253,
+  57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254,
+  65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_row_iscan_16x16[256]) = {
+  0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76, 86,
+  3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99, 115, 130,
+  8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103, 119, 142, 167,
+  14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100, 116, 135, 161, 185,
+  21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94, 112, 133, 154, 179, 205,
+  28, 34, 39, 45, 50, 58, 67, 77, 87, 96, 106, 121, 146, 169, 196, 212,
+  41, 46, 49, 56, 63, 70, 79, 90, 98, 107, 122, 138, 159, 182, 207, 222,
+  52, 57, 62, 69, 75, 83, 93, 102, 110, 120, 134, 150, 176, 195, 215, 226,
+  66, 71, 78, 82, 91, 97, 108, 113, 127, 136, 148, 168, 188, 202, 221, 232,
+  80, 89, 92, 101, 105, 114, 125, 131, 139, 151, 162, 177, 192, 208, 223, 234,
+  95, 104, 109, 117, 123, 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239,
+  111, 118, 124, 129, 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240,
+  243, 126, 132, 137, 145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237,
+  244, 246, 141, 149, 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238,
+  242, 249, 251, 152, 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236,
+  245, 247, 252, 253, 158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235,
+  241, 248, 250, 254, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_16x16[256]) = {
+  0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166, 179,
+  1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154, 178, 196,
+  3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148, 164, 186, 201,
+  6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127, 153, 169, 193, 208,
+  10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114, 133, 161, 176, 198, 214,
+  15, 21, 26, 34, 43, 52, 65, 77, 91, 106, 120, 140, 165, 185, 205, 221,
+  22, 27, 32, 41, 48, 60, 73, 85, 99, 116, 130, 151, 175, 190, 211, 225,
+  29, 35, 42, 49, 59, 69, 81, 95, 108, 125, 139, 155, 182, 197, 217, 229,
+  38, 45, 51, 61, 68, 80, 93, 105, 118, 134, 150, 168, 191, 207, 223, 234,
+  50, 56, 63, 74, 83, 94, 109, 117, 129, 147, 163, 177, 199, 213, 228, 238,
+  62, 70, 76, 87, 97, 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242,
+  75, 82, 90, 102, 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245,
+  89, 100, 111, 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250,
+  103, 115, 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248,
+  252, 121, 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244,
+  251, 254, 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247,
+  249, 253, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_32x32[1024]) = {
+  0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, 170, 193, 204,
+  210, 219, 229, 233, 245, 257, 275, 299, 342, 356, 377, 405, 455, 471, 495,
+  527, 1, 4, 8, 15, 22, 30, 45, 58, 74, 92, 112, 133, 158, 184, 203, 215, 222,
+  228, 234, 237, 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551,
+  3, 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189, 208, 217, 224,
+  231, 235, 238, 273, 297, 316, 329, 375, 403, 425, 440, 493, 525, 550, 567,
+  6, 11, 16, 23, 31, 43, 60, 73, 90, 109, 126, 150, 173, 196, 211, 220, 226,
+  232, 236, 239, 296, 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575,
+  9, 14, 19, 29, 37, 50, 65, 78, 95, 116, 134, 157, 179, 201, 214, 223, 244,
+  255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523, 582, 596, 617, 645,
+  13, 20, 26, 35, 44, 54, 72, 85, 105, 123, 140, 163, 182, 205, 216, 225,
+  254, 271, 294, 314, 353, 373, 400, 423, 468, 491, 522, 548, 595, 616, 644,
+  666, 21, 27, 33, 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227,
+  270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615, 643, 665,
+  680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139, 159, 178, 197, 212, 221, 230,
+  292, 312, 326, 334, 398, 421, 437, 446, 520, 546, 564, 574, 642, 664, 679,
+  687, 34, 40, 46, 56, 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291,
+  340, 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705, 723,
+  747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177, 194, 252, 268, 290,
+  311, 351, 370, 396, 420, 466, 488, 518, 545, 593, 613, 640, 663, 704, 722,
+  746, 765, 51, 59, 66, 76, 89, 99, 119, 131, 149, 168, 181, 200, 267, 289,
+  310, 325, 369, 395, 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721,
+  745, 764, 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207, 288,
+  309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638, 661, 677, 686,
+  744, 763, 776, 783, 70, 79, 86, 97, 108, 122, 137, 155, 242, 251, 266, 287,
+  339, 350, 368, 393, 452, 465, 486, 515, 580, 592, 611, 637, 692, 703, 720,
+  743, 788, 798, 813, 833, 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286,
+  308, 349, 367, 392, 417, 464, 485, 514, 542, 591, 610, 636, 660, 702, 719,
+  742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169, 185, 264,
+  285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561, 609, 635, 659, 676,
+  718, 741, 761, 775, 811, 831, 847, 858, 117, 128, 136, 148, 160, 175, 188,
+  198, 284, 306, 322, 332, 390, 415, 433, 444, 512, 540, 560, 572, 634, 658,
+  675, 685, 740, 760, 774, 782, 830, 846, 857, 863, 135, 146, 152, 165, 241,
+  249, 263, 283, 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633,
+  691, 701, 717, 739, 787, 796, 810, 829, 867, 875, 887, 903, 153, 166, 174,
+  183, 248, 262, 282, 305, 347, 364, 388, 414, 462, 482, 510, 539, 589, 607,
+  632, 657, 700, 716, 738, 759, 795, 809, 828, 845, 874, 886, 902, 915, 176,
+  187, 195, 202, 261, 281, 304, 321, 363, 387, 413, 432, 481, 509, 538, 559,
+  606, 631, 656, 674, 715, 737, 758, 773, 808, 827, 844, 856, 885, 901, 914,
+  923, 192, 199, 206, 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537,
+  558, 571, 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900,
+  913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461, 480, 507,
+  578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807, 825, 866, 873, 884,
+  899, 930, 936, 945, 957, 246, 259, 278, 302, 345, 361, 384, 411, 460, 479,
+  506, 536, 587, 604, 628, 654, 698, 713, 734, 756, 793, 806, 824, 842, 872,
+  883, 898, 912, 935, 944, 956, 966, 258, 277, 301, 319, 360, 383, 410, 430,
+  478, 505, 535, 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841,
+  854, 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382, 409,
+  429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754, 770, 780, 822,
+  840, 853, 861, 896, 910, 920, 926, 954, 964, 971, 975, 336, 344, 359, 381,
+  449, 459, 477, 503, 577, 586, 602, 625, 689, 697, 711, 731, 785, 792, 804,
+  821, 865, 871, 881, 895, 929, 934, 942, 953, 977, 981, 987, 995, 343, 358,
+  380, 408, 458, 476, 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791,
+  803, 820, 839, 870, 880, 894, 909, 933, 941, 952, 963, 980, 986, 994, 1001,
+  357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709, 729, 752,
+  769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951, 962, 970, 985, 993,
+  1000, 1005, 378, 406, 427, 441, 500, 531, 554, 569, 622, 649, 669, 682, 728,
+  751, 768, 779, 818, 837, 851, 860, 892, 907, 918, 925, 950, 961, 969, 974,
+  992, 999, 1004, 1007, 448, 457, 474, 499, 576, 584, 599, 621, 688, 695, 708,
+  727, 784, 790, 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979,
+  984, 991, 1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648,
+  694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931, 938, 948,
+  960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497, 529, 553, 597,
+  619, 647, 668, 706, 725, 749, 767, 799, 815, 835, 850, 876, 889, 905, 917,
+  937, 947, 959, 968, 982, 989, 997, 1003, 1011, 1015, 1019, 1022, 496, 528,
+  552, 568, 618, 646, 667, 681, 724, 748, 766, 778, 814, 834, 849, 859, 888,
+  904, 916, 924, 946, 958, 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021,
+  1023,
+};
+
+const scan_order vp10_default_scan_orders[TX_SIZES] = {  // {scan, iscan, nbrs}
+  {default_scan_4x4,   vp10_default_iscan_4x4,   default_scan_4x4_neighbors},
+  {default_scan_8x8,   vp10_default_iscan_8x8,   default_scan_8x8_neighbors},
+  {default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors},
+  {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
+};
+
+const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES] = {  // [size][type]
+  {  // TX_4X4: default, row, col, default scan (one entry per TX_TYPE)
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
+    {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors}
+  }, {  // TX_8X8: same default, row, col, default layout
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
+    {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors}
+  }, {  // TX_16X16: same default, row, col, default layout
+    {default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors},
+    {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
+    {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors}
+  }, {  // TX_32X32: the default scan is used for every TX_TYPE
+    {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
+  }
+};
diff --git a/libs/libvpx/vp10/common/scan.h b/libs/libvpx/vp10/common/scan.h
new file mode 100644
index 0000000000..f5a020f1e7
--- /dev/null
+++ b/libs/libvpx/vp10/common/scan.h
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_SCAN_H_
+#define VP10_COMMON_SCAN_H_
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#include "vp10/common/enums.h"
+#include "vp10/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_NEIGHBORS 2
+
+typedef struct {  // initialized positionally in scan.c: keep the field order
+  const int16_t *scan;       // scan table (presumably scan index -> position)
+  const int16_t *iscan;      // companion table; presumably the inverse mapping
+  const int16_t *neighbors;  // MAX_NEIGHBORS token_cache indices per index c
+} scan_order;
+
+extern const scan_order vp10_default_scan_orders[TX_SIZES];
+extern const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES];
+
+static INLINE int get_coef_context(const int16_t *neighbors,
+                                   const uint8_t *token_cache, int c) {
+  const int16_t *const nb = &neighbors[MAX_NEIGHBORS * c];  // pair for c
+  return (token_cache[nb[0]] + token_cache[nb[1]] + 1) >> 1;  // rounded mean
+}
+
+static INLINE const scan_order *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
+  return vp10_scan_orders[tx_size] + tx_type;  // row = size, column = type
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_SCAN_H_
diff --git a/libs/libvpx/vp10/common/seg_common.c b/libs/libvpx/vp10/common/seg_common.c
new file mode 100644
index 0000000000..1bf09b9a0f
--- /dev/null
+++ b/libs/libvpx/vp10/common/seg_common.c
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/blockd.h"
+#include "vp10/common/loopfilter.h"
+#include "vp10/common/seg_common.h"
+#include "vp10/common/quant_common.h"
+
+static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 };
+// Both tables are indexed by SEG_LVL_FEATURES: ALT_Q, ALT_LF, REF_FRAME, SKIP.
+static const int seg_feature_data_max[SEG_LVL_MAX] = {
+  MAXQ, MAX_LOOP_FILTER, 3, 0 };
+
+// These functions provide access to new segment level features.
+// Eventually these functions may be "optimized out" but for the moment,
+// the coding mechanism is still subject to change so these provide a
+// convenient single point of change.
+
+void vp10_clearall_segfeatures(struct segmentation *seg) {
+  vp10_zero(seg->feature_mask);  // per-segment enable bits
+  vp10_zero(seg->feature_data);  // per-segment, per-feature values
+}
+
+void vp10_enable_segfeature(struct segmentation *seg, int segment_id,
+                           SEG_LVL_FEATURES feature_id) {
+  seg->feature_mask[segment_id] |= 1 << feature_id;  // set the feature's bit
+}
+
+int vp10_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
+  return seg_feature_data_max[feature_id];  // no bounds check on feature_id
+}
+
+int vp10_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
+  return seg_feature_data_signed[feature_id];  // 1 = negatives allowed
+}
+
+void vp10_set_segdata(struct segmentation *seg, int segment_id,
+                     SEG_LVL_FEATURES feature_id, int seg_data) {
+  assert(seg_data <= seg_feature_data_max[feature_id]);
+  if (seg_data < 0) {  // negative values are only legal for signed features
+    assert(seg_feature_data_signed[feature_id]);
+    assert(-seg_data <= seg_feature_data_max[feature_id]);
+  }
+  // The asserts are the only range check; the value is stored unclamped.
+  seg->feature_data[segment_id][feature_id] = seg_data;
+}
+
+const vpx_tree_index vp10_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
+  2,  4,  6,  8, 10, 12,          // presumably links to later entry pairs
+  0, -1, -2, -3, -4, -5, -6, -7   // presumably leaves, stored as -segment_id
+};
+
+
+// TBD? Functions to read and write segment data with range / validity checking
diff --git a/libs/libvpx/vp10/common/seg_common.h b/libs/libvpx/vp10/common/seg_common.h
new file mode 100644
index 0000000000..cd38e8ee0d
--- /dev/null
+++ b/libs/libvpx/vp10/common/seg_common.h
@@ -0,0 +1,88 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_SEG_COMMON_H_
+#define VP10_COMMON_SEG_COMMON_H_
+
+#include "vpx_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SEGMENT_DELTADATA   0
+#define SEGMENT_ABSDATA     1
+
+#define MAX_SEGMENTS     8
+#define SEG_TREE_PROBS   (MAX_SEGMENTS-1)
+
+#define PREDICTION_PROBS 3
+
+// Segment level features; the values index the per-feature tables.
+typedef enum {
+  SEG_LVL_ALT_Q = 0,               // Alternate quantizer (signed, max MAXQ)
+  SEG_LVL_ALT_LF = 1,              // Alternate loop filter value (signed)
+  SEG_LVL_REF_FRAME = 2,           // Optional segment reference frame (0..3)
+  SEG_LVL_SKIP = 3,                // Optional segment (0,0) + skip mode
+  SEG_LVL_MAX = 4                  // Number of features supported
+} SEG_LVL_FEATURES;
+
+
+struct segmentation {
+  uint8_t enabled;          // when 0, segfeature_active() always returns 0
+  uint8_t update_map;
+  uint8_t update_data;
+  uint8_t abs_delta;        // presumably SEGMENT_DELTADATA or SEGMENT_ABSDATA
+  uint8_t temporal_update;
+
+  int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];  // [segment_id][feature_id]
+  unsigned int feature_mask[MAX_SEGMENTS];  // bit feature_id set = enabled
+};
+
+struct segmentation_probs {
+  vpx_prob tree_probs[SEG_TREE_PROBS];    // MAX_SEGMENTS - 1 entries
+  vpx_prob pred_probs[PREDICTION_PROBS];  // PREDICTION_PROBS (3) entries
+};
+
+static INLINE int segfeature_active(const struct segmentation *seg,
+                                    int segment_id,
+                                    SEG_LVL_FEATURES feature_id) {
+  if (!seg->enabled) return 0;  // segmentation off: no feature is active
+  return (seg->feature_mask[segment_id] & (1 << feature_id)) != 0;
+}
+
+void vp10_clearall_segfeatures(struct segmentation *seg);
+
+void vp10_enable_segfeature(struct segmentation *seg,
+                           int segment_id,
+                           SEG_LVL_FEATURES feature_id);
+
+int vp10_seg_feature_data_max(SEG_LVL_FEATURES feature_id);
+
+int vp10_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
+
+void vp10_set_segdata(struct segmentation *seg,
+                     int segment_id,
+                     SEG_LVL_FEATURES feature_id,
+                     int seg_data);
+
+static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
+                              SEG_LVL_FEATURES feature_id) {
+  return seg->feature_data[segment_id][feature_id];  // feature_mask not checked
+}
+
+extern const vpx_tree_index vp10_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_SEG_COMMON_H_
+
diff --git a/libs/libvpx/vp10/common/textblit.c b/libs/libvpx/vp10/common/textblit.c
new file mode 100644
index 0000000000..2e8811ea50
--- /dev/null
+++ b/libs/libvpx/vp10/common/textblit.c
@@ -0,0 +1,120 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "vp10/common/textblit.h"
+
+static const int font[] = {  // index = ch - ' '; 5 bits per glyph column
+  0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
+  0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
+  0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
+  0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
+  0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
+  0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
+  0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
+  0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
+  0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
+};
+
+static void plot(int x, int y, unsigned char *image, int pitch) {
+  image[y * pitch + x] ^= 0xFF;  // invert the pixel at column x, row y
+}
+
+void vp10_blit_text(const char *msg, unsigned char *address, const int pitch) {
+  // Draws msg left to right starting at address. Each character fills a
+  // 7-column x 5-row cell (row stride = pitch) with 255 for set font bits and
+  // 0 otherwise; lower-case letters reuse the upper-case glyphs and anything
+  // else outside ' '..'Z' is drawn as font[0]. No clipping is performed.
+  const char *ch;
+  unsigned char *cell = address;
+
+  for (ch = msg; *ch != 0; ++ch, cell += 7) {
+    int glyph, col, row;
+
+    if (*ch >= ' ' && *ch <= 'Z') {
+      glyph = font[*ch - ' '];
+    } else if (*ch >= 'a' && *ch <= 'z') {
+      glyph = font[*ch - 'a' + 'A' - ' '];
+    } else {
+      glyph = font[0];
+    }
+    for (col = 6; col >= 0; col--) {
+      const int column_bits = glyph >> (col * 5);
+      for (row = 0; row < 5; row++)
+        cell[row * pitch + col] = (column_bits & (1 << row)) ? 255 : 0;
+    }
+  }
+}
+
+
+
+/* Bresenham line algorithm.
+ *
+ * Inverts (XOR with 255, via plot()) each pixel of the segment running from
+ * (x0, y0) to (x1, y1); both end points are drawn.
+ *
+ * x0, x1 - column of the first / second end point. Note that both x values
+ *          precede the y values in the argument list.
+ * y0, y1 - row of the first / second end point.
+ * image  - buffer origin; pixel (x, y) lives at image[x + y * pitch].
+ * pitch  - offset between vertically adjacent pixels, in array elements.
+ *
+ * The line is stepped along whichever axis has the larger extent, so one
+ * pixel is produced per step. Nothing is clipped: every pixel of the segment
+ * must lie inside the buffer.
+ */
+void vp10_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
+                   int pitch) {
+  const int steep = abs(y1 - y0) > abs(x1 - x0);
+  /* "a" is the major axis (y when steep, x otherwise), "b" the minor one. */
+  int a_begin = steep ? y0 : x0;
+  int a_end = steep ? y1 : x1;
+  int b_begin = steep ? x0 : y0;
+  int b_end = steep ? x1 : y1;
+  int a_span;
+  int b_span;
+  int b_step;
+  int err;
+  int a, b;
+
+  /* Always iterate with the major coordinate increasing. */
+  if (a_begin > a_end) {
+    const int old_a_begin = a_begin;
+    const int old_b_begin = b_begin;
+    a_begin = a_end;
+    a_end = old_a_begin;
+    b_begin = b_end;
+    b_end = old_b_begin;
+  }
+
+  a_span = a_end - a_begin;
+  b_span = abs(b_end - b_begin);
+  b_step = (b_begin < b_end) ? 1 : -1;
+  /* The error term starts at half the major span. */
+  err = a_span / 2;
+  b = b_begin;
+
+  for (a = a_begin; a <= a_end; a++) {
+    /* plot() takes (x, y): undo the axis exchange for steep lines. */
+    if (steep) {
+      plot(b, a, image, pitch);
+    } else {
+      plot(a, b, image, pitch);
+    }
+
+    /* Advance the minor coordinate once the error term drops below zero. */
+    err -= b_span;
+    if (err < 0) {
+      b += b_step;
+      err += a_span;
+    }
+  }
+}
diff --git a/libs/libvpx/vp10/common/textblit.h b/libs/libvpx/vp10/common/textblit.h
new file mode 100644
index 0000000000..c37140d0f1
--- /dev/null
+++ b/libs/libvpx/vp10/common/textblit.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_TEXTBLIT_H_
+#define VP10_COMMON_TEXTBLIT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Renders |msg| into |address|; consecutive rows are |pitch| bytes apart.
+void vp10_blit_text(const char *msg, unsigned char *address, int pitch);
+// Draws the line from (x0, y0) to (x1, y1); note the x0, x1, y0, y1 order.
+void vp10_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
+                   int pitch);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_TEXTBLIT_H_
diff --git a/libs/libvpx/vp10/common/thread_common.c b/libs/libvpx/vp10/common/thread_common.c
new file mode 100644
index 0000000000..0c7a1c22a8
--- /dev/null
+++ b/libs/libvpx/vp10/common/thread_common.c
@@ -0,0 +1,459 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp10/common/entropymode.h"
+#include "vp10/common/thread_common.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/common/loopfilter.h"
+
+#if CONFIG_MULTITHREAD
+// Acquires |mutex|. Spins on pthread_mutex_trylock() for a bounded number of
+// attempts before falling back to a blocking pthread_mutex_lock().
+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
+  const int kMaxTryLocks = 4000;
+  int attempts;
+
+  for (attempts = 0; attempts < kMaxTryLocks; ++attempts) {
+    // trylock returns 0 once the lock has been taken.
+    if (pthread_mutex_trylock(mutex) == 0)
+      return;
+  }
+
+  // Every non-blocking attempt failed; wait for the lock.
+  pthread_mutex_lock(mutex);
+}
+#endif  // CONFIG_MULTITHREAD
+
+static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+  const int nsync = lf_sync->sync_range;
+  // Wait only below row 0, and only where c has no bits set under nsync - 1.
+  if (r && !(c & (nsync - 1))) {
+    pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];
+    mutex_lock(mutex);
+    // Sleep until row r - 1 has published a column of at least c + nsync.
+    while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
+      pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);
+    }
+    pthread_mutex_unlock(mutex);
+  }
+#else
+  (void)lf_sync;  // Without threads there is nothing to wait for.
+  (void)r;
+  (void)c;
+#endif  // CONFIG_MULTITHREAD
+}
+
+static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
+                              const int sb_cols) {
+#if CONFIG_MULTITHREAD
+  const int nsync = lf_sync->sync_range;
+  int cur;
+  // Only signal when there are enough filtered SB for next row to run.
+  int sig = 1;
+  // Before the last column, progress is published every nsync columns.
+  if (c < sb_cols - 1) {
+    cur = c;
+    if (c % nsync)
+      sig = 0;
+  } else {
+    cur = sb_cols + nsync;  // Last column: past c + nsync for any c < sb_cols.
+  }
+  // Record the column under the row's mutex, then signal the row's condition.
+  if (sig) {
+    mutex_lock(&lf_sync->mutex_[r]);
+
+    lf_sync->cur_sb_col[r] = cur;
+
+    pthread_cond_signal(&lf_sync->cond_[r]);
+    pthread_mutex_unlock(&lf_sync->mutex_[r]);
+  }
+#else
+  (void)lf_sync;
+  (void)r;
+  (void)c;
+  (void)sb_cols;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Per-worker row loopfilter: filters SB rows from |start| up to |stop|.
+static INLINE
+void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,
+                             VP10_COMMON *const cm,
+                             struct macroblockd_plane planes[MAX_MB_PLANE],
+                             int start, int stop, int y_only,
+                             VP9LfSync *const lf_sync) {
+  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  int mi_row, mi_col;
+  enum lf_path path;
+  if (y_only)
+    path = LF_PATH_444;
+  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+    path = LF_PATH_420;
+  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+    path = LF_PATH_444;
+  else
+    path = LF_PATH_SLOW;
+  // NOTE(review): the row step assumes num_workers workers were launched.
+  for (mi_row = start; mi_row < stop;
+       mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
+    MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+      const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
+      const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
+      LOOP_FILTER_MASK lfm;
+      int plane;
+      // Wait for the row above before filtering this superblock.
+      sync_read(lf_sync, r, c);
+
+      vp10_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+      // TODO(JBB): Make setup_mask work for non 420.
+      vp10_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+                     &lfm);
+
+      vp10_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      for (plane = 1; plane < num_planes; ++plane) {
+        switch (path) {
+          case LF_PATH_420:
+            vp10_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_444:
+            vp10_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_SLOW:
+            vp10_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                          mi_row, mi_col);
+            break;
+        }
+      }
+      // Publish this row's progress for the row below.
+      sync_write(lf_sync, r, c, sb_cols);
+    }
+  }
+}
+
+// Row-based multi-threaded loopfilter hook; always reports success (1).
+static int loop_filter_row_worker(VP9LfSync *const lf_sync,
+                                  LFWorkerData *const lf_data) {
+  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                          lf_data->start, lf_data->stop, lf_data->y_only,
+                          lf_sync);
+  return 1;
+}
+// Filters rows |start|..|stop| on up to |nworkers| workers and waits for them.
+static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,
+                                VP10_COMMON *cm,
+                                struct macroblockd_plane planes[MAX_MB_PLANE],
+                                int start, int stop, int y_only,
+                                VPxWorker *workers, int nworkers,
+                                VP9LfSync *lf_sync) {
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  // Number of superblock rows and cols
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  // Decoder may allocate more threads than number of tiles based on user's
+  // input.
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int num_workers = VPXMIN(nworkers, tile_cols);
+  int i;
+  // Workers step by lf_sync->num_workers rows, so the two counts must match.
+  if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+      num_workers != lf_sync->num_workers) {
+    vp10_loop_filter_dealloc(lf_sync);
+    vp10_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+  }
+
+  // Initialize cur_sb_col to -1 for all SB rows.
+  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
+  // Set up loopfilter thread data.
+  // The decoder is capping num_workers because it has been observed that using
+  // more threads on the loopfilter than there are cores will hurt performance
+  // on Android. This is because the system will only schedule the tile decode
+  // workers on cores equal to the number of tile columns. Then if the decoder
+  // tries to use more threads for the loopfilter, it will hurt performance
+  // because of contention. If the multithreading code changes in the future
+  // then the number of workers used by the loopfilter should be revisited.
+  for (i = 0; i < num_workers; ++i) {
+    VPxWorker *const worker = &workers[i];
+    LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+    worker->hook = (VPxWorkerHook)loop_filter_row_worker;
+    worker->data1 = lf_sync;
+    worker->data2 = lf_data;
+
+    // Loopfilter data: worker i starts i superblock rows after |start|.
+    vp10_loop_filter_data_reset(lf_data, frame, cm, planes);
+    lf_data->start = start + i * MI_BLOCK_SIZE;
+    lf_data->stop = stop;
+    lf_data->y_only = y_only;
+
+    // Start loopfiltering; the last worker goes through execute(), not launch().
+    if (i == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+
+  // Wait till all rows are finished
+  for (i = 0; i < num_workers; ++i) {
+    winterface->sync(&workers[i]);
+  }
+}
+
+void vp10_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+                              VP10_COMMON *cm,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
+                              int frame_filter_level,
+                              int y_only, int partial_frame,
+                              VPxWorker *workers, int num_workers,
+                              VP9LfSync *lf_sync) {
+  int start_mi_row, end_mi_row, mi_rows_to_filter;
+  // A filter level of 0 means there is nothing to do.
+  if (!frame_filter_level) return;
+  // Default: all rows. Partial: start at the middle row, rounded down to 8.
+  start_mi_row = 0;
+  mi_rows_to_filter = cm->mi_rows;
+  if (partial_frame && cm->mi_rows > 8) {
+    start_mi_row = cm->mi_rows >> 1;
+    start_mi_row &= 0xfffffff8;
+    mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8);
+  }
+  end_mi_row = start_mi_row + mi_rows_to_filter;
+  vp10_loop_filter_frame_init(cm, frame_filter_level);
+
+  loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row,
+                      y_only, workers, num_workers, lf_sync);
+}
+
+// Picks the sync range (nsync) for the row loopfilter from the frame width.
+static INLINE int get_sync_range(int width) {
+  // nsync numbers are picked by testing. For example, for 4k
+  // video, using 4 gives best performance.
+  int nsync = 8;  // Anything wider than 4096.
+
+  // Narrower widths override the value, widest threshold first.
+  if (width <= 4096) nsync = 4;
+  if (width <= 1280) nsync = 2;
+  if (width < 640) nsync = 1;
+
+  return nsync;
+}
+
+// Allocate lf row synchronization: |rows| sync slots, |num_workers| lfdata.
+void vp10_loop_filter_alloc(VP9LfSync *lf_sync, VP10_COMMON *cm, int rows,
+                           int width, int num_workers) {
+  lf_sync->rows = rows;
+#if CONFIG_MULTITHREAD
+  {
+    int i;
+    // One mutex and one condition variable per row.
+    CHECK_MEM_ERROR(cm, lf_sync->mutex_,
+                    vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
+    if (lf_sync->mutex_) {
+      for (i = 0; i < rows; ++i) {
+        pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+      }
+    }
+
+    CHECK_MEM_ERROR(cm, lf_sync->cond_,
+                    vpx_malloc(sizeof(*lf_sync->cond_) * rows));
+    if (lf_sync->cond_) {
+      for (i = 0; i < rows; ++i) {
+        pthread_cond_init(&lf_sync->cond_[i], NULL);
+      }
+    }
+  }
+#endif  // CONFIG_MULTITHREAD
+  // One LFWorkerData per worker.
+  CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+                  vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
+  lf_sync->num_workers = num_workers;
+  // One progress counter per row.
+  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
+                  vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
+
+  // Set up nsync from the frame width.
+  lf_sync->sync_range = get_sync_range(width);
+}
+
+// Deallocate lf synchronization related mutex and data. NULL is a no-op.
+void vp10_loop_filter_dealloc(VP9LfSync *lf_sync) {
+#if CONFIG_MULTITHREAD
+  int row;
+#endif  // CONFIG_MULTITHREAD
+
+  if (lf_sync == NULL) return;
+
+#if CONFIG_MULTITHREAD
+  if (lf_sync->mutex_ != NULL) {
+    for (row = 0; row < lf_sync->rows; ++row)
+      pthread_mutex_destroy(&lf_sync->mutex_[row]);
+    vpx_free(lf_sync->mutex_);
+  }
+  if (lf_sync->cond_ != NULL) {
+    for (row = 0; row < lf_sync->rows; ++row)
+      pthread_cond_destroy(&lf_sync->cond_[row]);
+    vpx_free(lf_sync->cond_);
+  }
+#endif  // CONFIG_MULTITHREAD
+  vpx_free(lf_sync->lfdata);
+  vpx_free(lf_sync->cur_sb_col);
+  // Zero the structure: this call may come from a resize, in which case it
+  // is followed by an _alloc() which may fail.
+  vp10_zero(*lf_sync);
+}
+
+// Accumulate frame counts: adds the counters in |counts| into cm->counts.
+void vp10_accumulate_frame_counts(VP10_COMMON *cm, FRAME_COUNTS *counts,
+                                 int is_dec) {
+  int i, j, k, l, m;
+
+  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
+    for (j = 0; j < INTRA_MODES; j++)
+      cm->counts.y_mode[i][j] += counts->y_mode[i][j];
+
+  for (i = 0; i < INTRA_MODES; i++)
+    for (j = 0; j < INTRA_MODES; j++)
+      cm->counts.uv_mode[i][j] += counts->uv_mode[i][j];
+
+  for (i = 0; i < PARTITION_CONTEXTS; i++)
+    for (j = 0; j < PARTITION_TYPES; j++)
+      cm->counts.partition[i][j] += counts->partition[i][j];
+
+  if (is_dec) {
+    int n;
+    for (i = 0; i < TX_SIZES; i++)
+      for (j = 0; j < PLANE_TYPES; j++)
+        for (k = 0; k < REF_TYPES; k++)
+          for (l = 0; l < COEF_BANDS; l++)
+            for (m = 0; m < COEFF_CONTEXTS; m++) {
+              cm->counts.eob_branch[i][j][k][l][m] +=
+                  counts->eob_branch[i][j][k][l][m];
+              for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
+                cm->counts.coef[i][j][k][l][m][n] +=
+                    counts->coef[i][j][k][l][m][n];
+            }
+  } else {
+    for (i = 0; i < TX_SIZES; i++)
+      for (j = 0; j < PLANE_TYPES; j++)
+        for (k = 0; k < REF_TYPES; k++)
+          for (l = 0; l < COEF_BANDS; l++)
+            for (m = 0; m < COEFF_CONTEXTS; m++)
+              cm->counts.eob_branch[i][j][k][l][m] +=
+                  counts->eob_branch[i][j][k][l][m];
+                // In the encoder, cm->counts.coef is only updated at frame
+                // level, so there is no need to accumulate it here.
+                // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
+                //   cm->counts.coef[i][j][k][l][m][n] +=
+                //       counts->coef[i][j][k][l][m][n];
+  }
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+    for (j = 0; j < SWITCHABLE_FILTERS; j++)
+      cm->counts.switchable_interp[i][j] += counts->switchable_interp[i][j];
+
+  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+    for (j = 0; j < INTER_MODES; j++)
+      cm->counts.inter_mode[i][j] += counts->inter_mode[i][j];
+
+  for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.intra_inter[i][j] += counts->intra_inter[i][j];
+
+  for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.comp_inter[i][j] += counts->comp_inter[i][j];
+
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      for (k = 0; k < 2; k++)
+      cm->counts.single_ref[i][j][k] += counts->single_ref[i][j][k];
+
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.comp_ref[i][j] += counts->comp_ref[i][j];
+
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    for (j = 0; j < TX_SIZES; j++)
+      cm->counts.tx.p32x32[i][j] += counts->tx.p32x32[i][j];
+
+    for (j = 0; j < TX_SIZES - 1; j++)
+      cm->counts.tx.p16x16[i][j] += counts->tx.p16x16[i][j];
+
+    for (j = 0; j < TX_SIZES - 2; j++)
+      cm->counts.tx.p8x8[i][j] += counts->tx.p8x8[i][j];
+  }
+
+  for (i = 0; i < TX_SIZES; i++)
+    cm->counts.tx.tx_totals[i] += counts->tx.tx_totals[i];
+
+  for (i = 0; i < SKIP_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.skip[i][j] += counts->skip[i][j];
+
+  for (i = 0; i < MV_JOINTS; i++)
+    cm->counts.mv.joints[i] += counts->mv.joints[i];
+
+  for (k = 0; k < 2; k++) {
+    nmv_component_counts *comps = &cm->counts.mv.comps[k];
+    nmv_component_counts *comps_t = &counts->mv.comps[k];
+
+    for (i = 0; i < 2; i++) {
+      comps->sign[i] += comps_t->sign[i];
+      comps->class0_hp[i] += comps_t->class0_hp[i];
+      comps->hp[i] += comps_t->hp[i];
+    }
+
+    for (i = 0; i < MV_CLASSES; i++)
+      comps->classes[i] += comps_t->classes[i];
+
+    for (i = 0; i < CLASS0_SIZE; i++) {
+      comps->class0[i] += comps_t->class0[i];
+      for (j = 0; j < MV_FP_SIZE; j++)
+        comps->class0_fp[i][j] += comps_t->class0_fp[i][j];
+    }
+
+    for (i = 0; i < MV_OFFSET_BITS; i++)
+      for (j = 0; j < 2; j++)
+        comps->bits[i][j] += comps_t->bits[i][j];
+
+    for (i = 0; i < MV_FP_SIZE; i++)
+      comps->fp[i] += comps_t->fp[i];
+  }
+
+  for (i = 0; i < EXT_TX_SIZES; i++) {
+    int j;  // NOTE(review): shadows the outer j; harmless but redundant.
+    for (j = 0; j < TX_TYPES; ++j)
+      for (k = 0; k < TX_TYPES; k++)
+        cm->counts.intra_ext_tx[i][j][k] += counts->intra_ext_tx[i][j][k];
+  }
+  for (i = 0; i < EXT_TX_SIZES; i++) {
+    for (k = 0; k < TX_TYPES; k++)
+      cm->counts.inter_ext_tx[i][k] += counts->inter_ext_tx[i][k];
+  }
+
+#if CONFIG_MISC_FIXES
+  for (i = 0; i < PREDICTION_PROBS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.seg.pred[i][j] += counts->seg.pred[i][j];
+
+  for (i = 0; i < MAX_SEGMENTS; i++) {
+    cm->counts.seg.tree_total[i] += counts->seg.tree_total[i];
+    cm->counts.seg.tree_mispred[i] += counts->seg.tree_mispred[i];
+  }
+#endif
+}
diff --git a/libs/libvpx/vp10/common/thread_common.h b/libs/libvpx/vp10/common/thread_common.h
new file mode 100644
index 0000000000..a401ddcb2e
--- /dev/null
+++ b/libs/libvpx/vp10/common/thread_common.h
@@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_LOOPFILTER_THREAD_H_
+#define VP10_COMMON_LOOPFILTER_THREAD_H_
+#include "./vpx_config.h"
+#include "vp10/common/loopfilter.h"
+#include "vpx_util/vpx_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP10Common;
+struct FRAME_COUNTS;
+
+// Loopfilter row synchronization
+typedef struct VP9LfSyncData {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex_;  // Array with one mutex per row.
+  pthread_cond_t *cond_;    // Array with one condition variable per row.
+#endif
+  // Allocate memory to store the loop-filtered superblock index in each row.
+  int *cur_sb_col;
+  // The optimal sync_range for different resolution and platform should be
+  // determined by testing. Currently, it is chosen to be a power-of-2 number.
+  int sync_range;
+  int rows;  // Number of rows the per-row arrays above were sized for.
+
+  // Row-based parallel loopfilter data
+  LFWorkerData *lfdata;
+  int num_workers;  // Number of entries in lfdata.
+} VP9LfSync;
+
+// Allocate loopfilter row synchronization for |rows| rows and |num_workers|.
+void vp10_loop_filter_alloc(VP9LfSync *lf_sync, struct VP10Common *cm, int rows,
+                           int width, int num_workers);
+
+// Deallocate loopfilter synchronization related mutex and data.
+void vp10_loop_filter_dealloc(VP9LfSync *lf_sync);
+
+// Multi-threaded loopfilter using the tile threads; no-op at filter level 0.
+void vp10_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+                              struct VP10Common *cm,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
+                              int frame_filter_level,
+                              int y_only, int partial_frame,
+                              VPxWorker *workers, int num_workers,
+                              VP9LfSync *lf_sync);
+// Adds the counters in |counts| into the counters held by |cm|.
+void vp10_accumulate_frame_counts(struct VP10Common *cm,
+                                 struct FRAME_COUNTS *counts, int is_dec);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_LOOPFILTER_THREAD_H_
diff --git a/libs/libvpx/vp10/common/tile_common.c b/libs/libvpx/vp10/common/tile_common.c
new file mode 100644
index 0000000000..4d92b4c6bf
--- /dev/null
+++ b/libs/libvpx/vp10/common/tile_common.c
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/tile_common.h"
+#include "vp10/common/onyxc_int.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#define MIN_TILE_WIDTH_B64 4
+#define MAX_TILE_WIDTH_B64 64
+
+static int get_tile_offset(int idx, int mis, int log2) {
+  // Offset of boundary |idx|, in the units of |mis| and clamped to |mis|.
+  const int num_sbs = mi_cols_aligned_to_sb(mis) >> MI_BLOCK_SIZE_LOG2;
+  return VPXMIN(((idx * num_sbs) >> log2) << MI_BLOCK_SIZE_LOG2, mis);
+}
+// Sets tile->mi_row_start and tile->mi_row_end for tile row |row|.
+void vp10_tile_set_row(TileInfo *tile, const VP10_COMMON *cm, int row) {
+  tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows);
+  tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows);
+}
+// Sets tile->mi_col_start and tile->mi_col_end for tile column |col|.
+void vp10_tile_set_col(TileInfo *tile, const VP10_COMMON *cm, int col) {
+  tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols);
+  tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols);
+}
+// Sets all four bounds of |tile| for the tile at (|row|, |col|).
+void vp10_tile_init(TileInfo *tile, const VP10_COMMON *cm, int row, int col) {
+  vp10_tile_set_row(tile, cm, row);
+  vp10_tile_set_col(tile, cm, col);
+}
+
+static int get_min_log2_tile_cols(const int sb64_cols) {
+  // Smallest shift with (MAX_TILE_WIDTH_B64 << shift) >= sb64_cols.
+  int shift;
+  for (shift = 0; (MAX_TILE_WIDTH_B64 << shift) < sb64_cols; ++shift) {}
+  return shift;
+}
+
+static int get_max_log2_tile_cols(const int sb64_cols) {
+  // First shift (from 1) with (sb64_cols >> shift) under the minimum, less 1.
+  int shift;
+  for (shift = 1; (sb64_cols >> shift) >= MIN_TILE_WIDTH_B64; ++shift) {}
+  return shift - 1;
+}
+// Writes the minimum and maximum log2 tile-column counts for |mi_cols|.
+void vp10_get_tile_n_bits(int mi_cols,
+                         int *min_log2_tile_cols, int *max_log2_tile_cols) {
+  const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols);
+  *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols);
+  assert(*min_log2_tile_cols <= *max_log2_tile_cols);
+}
diff --git a/libs/libvpx/vp10/common/tile_common.h b/libs/libvpx/vp10/common/tile_common.h
new file mode 100644
index 0000000000..09cf060d8a
--- /dev/null
+++ b/libs/libvpx/vp10/common/tile_common.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_TILE_COMMON_H_
+#define VP10_COMMON_TILE_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP10Common;
+
+typedef struct TileInfo {
+  int mi_row_start, mi_row_end;  // Set by vp10_tile_set_row().
+  int mi_col_start, mi_col_end;  // Set by vp10_tile_set_col().
+} TileInfo;
+
+// initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on
+// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)'
+void vp10_tile_init(TileInfo *tile, const struct VP10Common *cm,
+                   int row, int col);
+// Row-only and column-only counterparts of vp10_tile_init().
+void vp10_tile_set_row(TileInfo *tile, const struct VP10Common *cm, int row);
+void vp10_tile_set_col(TileInfo *tile, const struct VP10Common *cm, int col);
+// Writes the minimum and maximum log2 tile-column counts for |mi_cols|.
+void vp10_get_tile_n_bits(int mi_cols,
+                         int *min_log2_tile_cols, int *max_log2_tile_cols);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_TILE_COMMON_H_
diff --git a/libs/libvpx/vp10/common/vp10_fwd_txfm.c b/libs/libvpx/vp10/common/vp10_fwd_txfm.c
new file mode 100644
index 0000000000..3211cd0828
--- /dev/null
+++ b/libs/libvpx/vp10/common/vp10_fwd_txfm.c
@@ -0,0 +1,824 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/vp10_fwd_txfm.h"
+
+void vp10_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  // Forward 4x4 DCT of the |stride|-spaced block at |input| into |output|.
+  // The 2D transform is done with two similar passes. The first transforms
+  // the columns and stores the results transposed. The second does the same
+  // to the intermediate buffer, which transforms the rows and transposes the
+  // results back into normal/row positions. Each source pointer is advanced
+  // only inside the pass that reads it, so the NULL |in| is never offset.
+  int pass;
+  // We need an intermediate buffer between passes.
+  tran_low_t intermediate[4 * 4];
+  const int16_t *in_pass0 = input;
+  const tran_low_t *in = NULL;
+  tran_low_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    tran_high_t in_high[4];    // canbe16
+    tran_high_t step[4];       // canbe16
+    tran_high_t temp1, temp2;  // needs32
+    int i;
+    for (i = 0; i < 4; ++i) {
+      // Load inputs.
+      if (0 == pass) {
+        in_high[0] = in_pass0[0 * stride] * 16;
+        in_high[1] = in_pass0[1 * stride] * 16;
+        in_high[2] = in_pass0[2 * stride] * 16;
+        in_high[3] = in_pass0[3 * stride] * 16;
+        if (i == 0 && in_high[0]) {
+          in_high[0] += 1;
+        }
+        in_pass0++;  // Next column.
+      } else {
+        in_high[0] = in[0 * 4];
+        in_high[1] = in[1 * 4];
+        in_high[2] = in[2 * 4];
+        in_high[3] = in[3 * 4];
+        in++;  // Next column (which is a transposed row).
+      }
+      // Transform.
+      step[0] = in_high[0] + in_high[3];
+      step[1] = in_high[1] + in_high[2];
+      step[2] = in_high[1] - in_high[2];
+      step[3] = in_high[0] - in_high[3];
+      temp1 = (step[0] + step[1]) * cospi_16_64;
+      temp2 = (step[0] - step[1]) * cospi_16_64;
+      out[0] = (tran_low_t)fdct_round_shift(temp1);
+      out[2] = (tran_low_t)fdct_round_shift(temp2);
+      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+      out[1] = (tran_low_t)fdct_round_shift(temp1);
+      out[3] = (tran_low_t)fdct_round_shift(temp2);
+      // Each transformed column is stored as one row of four outputs.
+      out += 4;
+    }
+    // Setup in/out for next pass.
+    in = intermediate;
+    out = output;
+  }
+
+  {
+    int i, j;
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+    }
+  }
+}
+
+void vp10_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  // Sums the 4x4 block; writes twice the sum to output[0] and 0 to output[1].
+  int r, c;
+  tran_low_t sum = 0;
+  for (r = 0; r < 4; ++r)
+    for (c = 0; c < 4; ++c) sum += input[r * stride + c];
+  // Multiply rather than shift: left-shifting a negative sum is undefined.
+  output[0] = sum * 2;
+  output[1] = 0;
+}
+
+void vp10_fdct8x8_c(const int16_t *input,
+    tran_low_t *final_output, int stride) {
+  int i, j;
+  tran_low_t intermediate[64];
+  int pass;
+  tran_low_t *output = intermediate;
+  const tran_low_t *in = NULL;
+
+  // Two passes: columns of |input|, then columns of the transposed result.
+  for (pass = 0; pass < 2; ++pass) {
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
+
+    int i;  // NOTE(review): shadows the outer i; harmless.
+    for (i = 0; i < 8; i++) {
+      // stage 1
+      if (pass == 0) {
+        s0 = (input[0 * stride] + input[7 * stride]) * 4;
+        s1 = (input[1 * stride] + input[6 * stride]) * 4;
+        s2 = (input[2 * stride] + input[5 * stride]) * 4;
+        s3 = (input[3 * stride] + input[4 * stride]) * 4;
+        s4 = (input[3 * stride] - input[4 * stride]) * 4;
+        s5 = (input[2 * stride] - input[5 * stride]) * 4;
+        s6 = (input[1 * stride] - input[6 * stride]) * 4;
+        s7 = (input[0 * stride] - input[7 * stride]) * 4;
+        ++input;
+      } else {
+        s0 = in[0 * 8] + in[7 * 8];
+        s1 = in[1 * 8] + in[6 * 8];
+        s2 = in[2 * 8] + in[5 * 8];
+        s3 = in[3 * 8] + in[4 * 8];
+        s4 = in[3 * 8] - in[4 * 8];
+        s5 = in[2 * 8] - in[5 * 8];
+        s6 = in[1 * 8] - in[6 * 8];
+        s7 = in[0 * 8] - in[7 * 8];
+        ++in;
+      }
+
+      // fdct4(step, step);
+      x0 = s0 + s3;
+      x1 = s1 + s2;
+      x2 = s1 - s2;
+      x3 = s0 - s3;
+      t0 = (x0 + x1) * cospi_16_64;
+      t1 = (x0 - x1) * cospi_16_64;
+      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+      output[0] = (tran_low_t)fdct_round_shift(t0);
+      output[2] = (tran_low_t)fdct_round_shift(t2);
+      output[4] = (tran_low_t)fdct_round_shift(t1);
+      output[6] = (tran_low_t)fdct_round_shift(t3);
+
+      // Stage 2
+      t0 = (s6 - s5) * cospi_16_64;
+      t1 = (s6 + s5) * cospi_16_64;
+      t2 = fdct_round_shift(t0);
+      t3 = fdct_round_shift(t1);
+
+      // Stage 3
+      x0 = s4 + t2;
+      x1 = s4 - t2;
+      x2 = s7 - t3;
+      x3 = s7 + t3;
+
+      // Stage 4
+      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+      output[1] = (tran_low_t)fdct_round_shift(t0);
+      output[3] = (tran_low_t)fdct_round_shift(t2);
+      output[5] = (tran_low_t)fdct_round_shift(t1);
+      output[7] = (tran_low_t)fdct_round_shift(t3);
+      output += 8;
+    }
+    in  = intermediate;
+    output = final_output;
+  }
+
+  // Final scaling: divide every output coefficient by 2.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      final_output[j + i * 8] /= 2;
+  }
+}
+
+void vp10_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  // Writes the sum of the 8x8 block to output[0] and zero to output[1].
+  tran_low_t total = 0;
+  int row, col;
+  for (row = 0; row < 8; ++row) {
+    for (col = 0; col < 8; ++col) total += input[row * stride + col];
+  }
+  output[1] = 0;
+  output[0] = total;
+}
+
+void vp10_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  int pass;
+  // We need an intermediate buffer between passes.
+  tran_low_t intermediate[256];
+  const int16_t *in_pass0 = input;
+  const tran_low_t *in = NULL;
+  tran_low_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    tran_high_t step1[8];      // canbe16
+    tran_high_t step2[8];      // canbe16
+    tran_high_t step3[8];      // canbe16
+    tran_high_t input[8];      // canbe16
+    tran_high_t temp1, temp2;  // needs32
+    int i;
+    for (i = 0; i < 16; i++) {
+      if (0 == pass) {
+        // Calculate input for the first 8 results.
+        input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
+        input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
+        input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
+        input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
+        input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
+        input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
+        input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
+        input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
+        // Calculate input for the next 8 results.
+        step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
+        step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
+        step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
+        step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
+        step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
+        step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
+        step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
+        step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
+      } else {
+        // Calculate input for the first 8 results.
+        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
+        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
+        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
+        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
+        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
+        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
+        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
+        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
+        // Calculate input for the next 8 results.
+        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
+        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
+        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
+        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
+        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
+        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
+        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
+        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
+      }
+      // Work on the first eight values; fdct8(input, even_results);
+      {
+        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+        tran_high_t t0, t1, t2, t3;                  // needs32
+        tran_high_t x0, x1, x2, x3;                  // canbe16
+
+        // stage 1
+        s0 = input[0] + input[7];
+        s1 = input[1] + input[6];
+        s2 = input[2] + input[5];
+        s3 = input[3] + input[4];
+        s4 = input[3] - input[4];
+        s5 = input[2] - input[5];
+        s6 = input[1] - input[6];
+        s7 = input[0] - input[7];
+
+        // fdct4(step, step);
+        x0 = s0 + s3;
+        x1 = s1 + s2;
+        x2 = s1 - s2;
+        x3 = s0 - s3;
+        t0 = (x0 + x1) * cospi_16_64;
+        t1 = (x0 - x1) * cospi_16_64;
+        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
+        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+        out[0] = (tran_low_t)fdct_round_shift(t0);
+        out[4] = (tran_low_t)fdct_round_shift(t2);
+        out[8] = (tran_low_t)fdct_round_shift(t1);
+        out[12] = (tran_low_t)fdct_round_shift(t3);
+
+        // Stage 2
+        t0 = (s6 - s5) * cospi_16_64;
+        t1 = (s6 + s5) * cospi_16_64;
+        t2 = fdct_round_shift(t0);
+        t3 = fdct_round_shift(t1);
+
+        // Stage 3
+        x0 = s4 + t2;
+        x1 = s4 - t2;
+        x2 = s7 - t3;
+        x3 = s7 + t3;
+
+        // Stage 4
+        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+        out[2] = (tran_low_t)fdct_round_shift(t0);
+        out[6] = (tran_low_t)fdct_round_shift(t2);
+        out[10] = (tran_low_t)fdct_round_shift(t1);
+        out[14] = (tran_low_t)fdct_round_shift(t3);
+      }
+      // Work on the next eight values; step1 -> odd_results
+      {
+        // step 2
+        temp1 = (step1[5] - step1[2]) * cospi_16_64;
+        temp2 = (step1[4] - step1[3]) * cospi_16_64;
+        step2[2] = fdct_round_shift(temp1);
+        step2[3] = fdct_round_shift(temp2);
+        temp1 = (step1[4] + step1[3]) * cospi_16_64;
+        temp2 = (step1[5] + step1[2]) * cospi_16_64;
+        step2[4] = fdct_round_shift(temp1);
+        step2[5] = fdct_round_shift(temp2);
+        // step 3
+        step3[0] = step1[0] + step2[3];
+        step3[1] = step1[1] + step2[2];
+        step3[2] = step1[1] - step2[2];
+        step3[3] = step1[0] - step2[3];
+        step3[4] = step1[7] - step2[4];
+        step3[5] = step1[6] - step2[5];
+        step3[6] = step1[6] + step2[5];
+        step3[7] = step1[7] + step2[4];
+        // step 4
+        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
+        temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
+        step2[1] = fdct_round_shift(temp1);
+        step2[2] = fdct_round_shift(temp2);
+        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
+        step2[5] = fdct_round_shift(temp1);
+        step2[6] = fdct_round_shift(temp2);
+        // step 5
+        step1[0] = step3[0] + step2[1];
+        step1[1] = step3[0] - step2[1];
+        step1[2] = step3[3] + step2[2];
+        step1[3] = step3[3] - step2[2];
+        step1[4] = step3[4] - step2[5];
+        step1[5] = step3[4] + step2[5];
+        step1[6] = step3[7] - step2[6];
+        step1[7] = step3[7] + step2[6];
+        // step 6
+        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
+        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+        out[1] = (tran_low_t)fdct_round_shift(temp1);
+        out[9] = (tran_low_t)fdct_round_shift(temp2);
+        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
+        out[5] = (tran_low_t)fdct_round_shift(temp1);
+        out[13] = (tran_low_t)fdct_round_shift(temp2);
+        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
+        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+        out[3] = (tran_low_t)fdct_round_shift(temp1);
+        out[11] = (tran_low_t)fdct_round_shift(temp2);
+        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
+        out[7] = (tran_low_t)fdct_round_shift(temp1);
+        out[15] = (tran_low_t)fdct_round_shift(temp2);
+      }
+      // Do next column (which is a transposed row in second/horizontal pass)
+      in++;
+      in_pass0++;
+      out += 16;
+    }
+    // Setup in/out for next pass.
+    in = intermediate;
+    out = output;
+  }
+}
+
+void vp10_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  // DC-only 16x16 forward DCT: output[0] = (sum of all 256 inputs) >> 1.
+  // Sum in int, not tran_low_t: NOTE(review) tran_low_t is presumably 16-bit
+  // without CONFIG_VP9_HIGHBITDEPTH, which 256 int16_t terms can overflow.
+  int r, c, sum = 0;
+  for (r = 0; r < 16; ++r)
+    for (c = 0; c < 16; ++c) sum += input[r * stride + c];
+  output[0] = (tran_low_t)(sum >> 1);
+  output[1] = 0;
+}
+
+static INLINE tran_high_t dct_32_round(tran_high_t input) {
+  // Returns ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); used by vp10_fdct32.
+  // TODO(debargha, peter.derivaz): Find new bounds for the disabled range
+  // check on the result, and make the bounds consts:
+  //   assert(-131072 <= result && result <= 131071);
+  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+}
+
+static INLINE tran_high_t half_round_shift(tran_high_t input) {
+  // Scale down by 4: add a bias of 1 (2 for negative inputs), then shift by 2.
+  return (input + 1 + (input < 0)) >> 2;
+}
+
+void vp10_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
+  tran_high_t step[32];  // scratch: stages alternate between step[] and output[]
+  // Stage 1: sums input[k] + input[31 - k] for k < 16, differences input[31 - k] - input[k] above.
+  step[0] = input[0] + input[(32 - 1)];
+  step[1] = input[1] + input[(32 - 2)];
+  step[2] = input[2] + input[(32 - 3)];
+  step[3] = input[3] + input[(32 - 4)];
+  step[4] = input[4] + input[(32 - 5)];
+  step[5] = input[5] + input[(32 - 6)];
+  step[6] = input[6] + input[(32 - 7)];
+  step[7] = input[7] + input[(32 - 8)];
+  step[8] = input[8] + input[(32 - 9)];
+  step[9] = input[9] + input[(32 - 10)];
+  step[10] = input[10] + input[(32 - 11)];
+  step[11] = input[11] + input[(32 - 12)];
+  step[12] = input[12] + input[(32 - 13)];
+  step[13] = input[13] + input[(32 - 14)];
+  step[14] = input[14] + input[(32 - 15)];
+  step[15] = input[15] + input[(32 - 16)];
+  step[16] = -input[16] + input[(32 - 17)];
+  step[17] = -input[17] + input[(32 - 18)];
+  step[18] = -input[18] + input[(32 - 19)];
+  step[19] = -input[19] + input[(32 - 20)];
+  step[20] = -input[20] + input[(32 - 21)];
+  step[21] = -input[21] + input[(32 - 22)];
+  step[22] = -input[22] + input[(32 - 23)];
+  step[23] = -input[23] + input[(32 - 24)];
+  step[24] = -input[24] + input[(32 - 25)];
+  step[25] = -input[25] + input[(32 - 26)];
+  step[26] = -input[26] + input[(32 - 27)];
+  step[27] = -input[27] + input[(32 - 28)];
+  step[28] = -input[28] + input[(32 - 29)];
+  step[29] = -input[29] + input[(32 - 30)];
+  step[30] = -input[30] + input[(32 - 31)];
+  step[31] = -input[31] + input[(32 - 32)];
+
+  // Stage 2: fold step[0..15] the same way; 16-19 and 28-31 are copied, 20-27 are paired and scaled by cospi_16_64.
+  output[0] = step[0] + step[16 - 1];
+  output[1] = step[1] + step[16 - 2];
+  output[2] = step[2] + step[16 - 3];
+  output[3] = step[3] + step[16 - 4];
+  output[4] = step[4] + step[16 - 5];
+  output[5] = step[5] + step[16 - 6];
+  output[6] = step[6] + step[16 - 7];
+  output[7] = step[7] + step[16 - 8];
+  output[8] = -step[8] + step[16 - 9];
+  output[9] = -step[9] + step[16 - 10];
+  output[10] = -step[10] + step[16 - 11];
+  output[11] = -step[11] + step[16 - 12];
+  output[12] = -step[12] + step[16 - 13];
+  output[13] = -step[13] + step[16 - 14];
+  output[14] = -step[14] + step[16 - 15];
+  output[15] = -step[15] + step[16 - 16];
+
+  output[16] = step[16];
+  output[17] = step[17];
+  output[18] = step[18];
+  output[19] = step[19];
+
+  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
+  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
+  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
+  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
+
+  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
+  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
+  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
+  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
+
+  output[28] = step[28];
+  output[29] = step[29];
+  output[30] = step[30];
+  output[31] = step[31];
+
+  // If `round` is set, reduce the magnitude by 4 (half_round_shift on all 32
+  // stage-2 values), hence the intermediate values are within the range of 16 bits.
+  if (round) {
+    output[0] = half_round_shift(output[0]);
+    output[1] = half_round_shift(output[1]);
+    output[2] = half_round_shift(output[2]);
+    output[3] = half_round_shift(output[3]);
+    output[4] = half_round_shift(output[4]);
+    output[5] = half_round_shift(output[5]);
+    output[6] = half_round_shift(output[6]);
+    output[7] = half_round_shift(output[7]);
+    output[8] = half_round_shift(output[8]);
+    output[9] = half_round_shift(output[9]);
+    output[10] = half_round_shift(output[10]);
+    output[11] = half_round_shift(output[11]);
+    output[12] = half_round_shift(output[12]);
+    output[13] = half_round_shift(output[13]);
+    output[14] = half_round_shift(output[14]);
+    output[15] = half_round_shift(output[15]);
+
+    output[16] = half_round_shift(output[16]);
+    output[17] = half_round_shift(output[17]);
+    output[18] = half_round_shift(output[18]);
+    output[19] = half_round_shift(output[19]);
+    output[20] = half_round_shift(output[20]);
+    output[21] = half_round_shift(output[21]);
+    output[22] = half_round_shift(output[22]);
+    output[23] = half_round_shift(output[23]);
+    output[24] = half_round_shift(output[24]);
+    output[25] = half_round_shift(output[25]);
+    output[26] = half_round_shift(output[26]);
+    output[27] = half_round_shift(output[27]);
+    output[28] = half_round_shift(output[28]);
+    output[29] = half_round_shift(output[29]);
+    output[30] = half_round_shift(output[30]);
+    output[31] = half_round_shift(output[31]);
+  }
+
+  // Stage 3: fold output[0..7]; 8-9 and 14-15 are copied, 10-13 scaled by cospi_16_64; butterflies within 16-23 and 24-31.
+  step[0] = output[0] + output[(8 - 1)];
+  step[1] = output[1] + output[(8 - 2)];
+  step[2] = output[2] + output[(8 - 3)];
+  step[3] = output[3] + output[(8 - 4)];
+  step[4] = -output[4] + output[(8 - 5)];
+  step[5] = -output[5] + output[(8 - 6)];
+  step[6] = -output[6] + output[(8 - 7)];
+  step[7] = -output[7] + output[(8 - 8)];
+  step[8] = output[8];
+  step[9] = output[9];
+  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
+  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
+  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
+  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
+  step[14] = output[14];
+  step[15] = output[15];
+
+  step[16] = output[16] + output[23];
+  step[17] = output[17] + output[22];
+  step[18] = output[18] + output[21];
+  step[19] = output[19] + output[20];
+  step[20] = -output[20] + output[19];
+  step[21] = -output[21] + output[18];
+  step[22] = -output[22] + output[17];
+  step[23] = -output[23] + output[16];
+  step[24] = -output[24] + output[31];
+  step[25] = -output[25] + output[30];
+  step[26] = -output[26] + output[29];
+  step[27] = -output[27] + output[28];
+  step[28] = output[28] + output[27];
+  step[29] = output[29] + output[26];
+  step[30] = output[30] + output[25];
+  step[31] = output[31] + output[24];
+
+  // Stage 4
+  output[0] = step[0] + step[3];
+  output[1] = step[1] + step[2];
+  output[2] = -step[2] + step[1];
+  output[3] = -step[3] + step[0];
+  output[4] = step[4];
+  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
+  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
+  output[7] = step[7];
+  output[8] = step[8] + step[11];
+  output[9] = step[9] + step[10];
+  output[10] = -step[10] + step[9];
+  output[11] = -step[11] + step[8];
+  output[12] = -step[12] + step[15];
+  output[13] = -step[13] + step[14];
+  output[14] = step[14] + step[13];
+  output[15] = step[15] + step[12];
+
+  output[16] = step[16];
+  output[17] = step[17];
+  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
+  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
+  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
+  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
+  output[22] = step[22];
+  output[23] = step[23];
+  output[24] = step[24];
+  output[25] = step[25];
+  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
+  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
+  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
+  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
+  output[30] = step[30];
+  output[31] = step[31];
+
+  // Stage 5
+  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
+  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
+  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
+  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
+  step[4] = output[4] + output[5];
+  step[5] = -output[5] + output[4];
+  step[6] = -output[6] + output[7];
+  step[7] = output[7] + output[6];
+  step[8] = output[8];
+  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
+  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
+  step[11] = output[11];
+  step[12] = output[12];
+  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
+  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
+  step[15] = output[15];
+
+  step[16] = output[16] + output[19];
+  step[17] = output[17] + output[18];
+  step[18] = -output[18] + output[17];
+  step[19] = -output[19] + output[16];
+  step[20] = -output[20] + output[23];
+  step[21] = -output[21] + output[22];
+  step[22] = output[22] + output[21];
+  step[23] = output[23] + output[20];
+  step[24] = output[24] + output[27];
+  step[25] = output[25] + output[26];
+  step[26] = -output[26] + output[25];
+  step[27] = -output[27] + output[24];
+  step[28] = -output[28] + output[31];
+  step[29] = -output[29] + output[30];
+  step[30] = output[30] + output[29];
+  step[31] = output[31] + output[28];
+
+  // Stage 6: entries 0-3 are already final and are only copied from here on.
+  output[0] = step[0];
+  output[1] = step[1];
+  output[2] = step[2];
+  output[3] = step[3];
+  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
+  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
+  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
+  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
+  output[8] = step[8] + step[9];
+  output[9] = -step[9] + step[8];
+  output[10] = -step[10] + step[11];
+  output[11] = step[11] + step[10];
+  output[12] = step[12] + step[13];
+  output[13] = -step[13] + step[12];
+  output[14] = -step[14] + step[15];
+  output[15] = step[15] + step[14];
+
+  output[16] = step[16];
+  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
+  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
+  output[19] = step[19];
+  output[20] = step[20];
+  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
+  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
+  output[23] = step[23];
+  output[24] = step[24];
+  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
+  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
+  output[27] = step[27];
+  output[28] = step[28];
+  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
+  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
+  output[31] = step[31];
+
+  // Stage 7: entries 0-7 are copied through; 8-15 get their last rotation.
+  step[0] = output[0];
+  step[1] = output[1];
+  step[2] = output[2];
+  step[3] = output[3];
+  step[4] = output[4];
+  step[5] = output[5];
+  step[6] = output[6];
+  step[7] = output[7];
+  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
+  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
+  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
+  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
+  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
+  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
+  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
+  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
+
+  step[16] = output[16] + output[17];
+  step[17] = -output[17] + output[16];
+  step[18] = -output[18] + output[19];
+  step[19] = output[19] + output[18];
+  step[20] = output[20] + output[21];
+  step[21] = -output[21] + output[20];
+  step[22] = -output[22] + output[23];
+  step[23] = output[23] + output[22];
+  step[24] = output[24] + output[25];
+  step[25] = -output[25] + output[24];
+  step[26] = -output[26] + output[27];
+  step[27] = output[27] + output[26];
+  step[28] = output[28] + output[29];
+  step[29] = -output[29] + output[28];
+  step[30] = -output[30] + output[31];
+  step[31] = output[31] + output[30];
+
+  // Final stage --- output indices are bit-reversed (step[k] lands at the 5-bit reversal of k).
+  output[0]  = step[0];
+  output[16] = step[1];
+  output[8]  = step[2];
+  output[24] = step[3];
+  output[4]  = step[4];
+  output[20] = step[5];
+  output[12] = step[6];
+  output[28] = step[7];
+  output[2]  = step[8];
+  output[18] = step[9];
+  output[10] = step[10];
+  output[26] = step[11];
+  output[6]  = step[12];
+  output[22] = step[13];
+  output[14] = step[14];
+  output[30] = step[15];
+
+  output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
+  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
+  output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
+  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
+  output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
+  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
+  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
+  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
+  output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
+  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
+  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
+  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
+  output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
+  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
+  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
+  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
+}
+
+void vp10_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+  // Full 32x32 forward DCT: a column pass into `coeffs`, then a row pass.
+  tran_high_t coeffs[32 * 32], src[32], dst[32];
+  int col, row, k;
+
+  // Column pass: pre-scale by 4, transform, shift by 2 (extra +1 if positive).
+  for (col = 0; col < 32; ++col) {
+    for (k = 0; k < 32; ++k) src[k] = input[k * stride + col] * 4;
+    vp10_fdct32(src, dst, 0);
+    for (k = 0; k < 32; ++k) {
+      const tran_high_t v = dst[k];
+      coeffs[k * 32 + col] = (v + 1 + (v > 0)) >> 2;
+    }
+  }
+
+  // Row pass: transform each intermediate row, shift by 2 (extra +1 if negative).
+  for (row = 0; row < 32; ++row) {
+    for (k = 0; k < 32; ++k) src[k] = coeffs[row * 32 + k];
+    vp10_fdct32(src, dst, 0);
+    for (k = 0; k < 32; ++k) {
+      const tran_high_t v = dst[k];
+      out[row * 32 + k] = (tran_low_t)((v + 1 + (v < 0)) >> 2);
+    }
+  }
+}
+
+// Note that although we use dct_32_round in dct32 computation flow,
+// this 2d fdct32x32 for rate-distortion optimization loop is operating
+// within 16 bits precision.
+void vp10_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
+  // 32x32 forward DCT for the RD loop: the row pass calls vp10_fdct32 with
+  // round == 1 and stores its results without a trailing shift.
+  tran_high_t coeffs[32 * 32], src[32], dst[32];
+  int col, row, k;
+
+  // Column pass: pre-scale by 4, transform, shift by 2 (extra +1 if positive).
+  for (col = 0; col < 32; ++col) {
+    for (k = 0; k < 32; ++k) src[k] = input[k * stride + col] * 4;
+    vp10_fdct32(src, dst, 0);
+    // TODO(cd): see quality impact of only doing
+    //           coeffs[k * 32 + col] = (dst[k] + 1) >> 2;
+    //           PS: also change code in vp10_dsp/x86/vp10_dct_sse2.c
+    for (k = 0; k < 32; ++k) {
+      const tran_high_t v = dst[k];
+      coeffs[k * 32 + col] = (v + 1 + (v > 0)) >> 2;
+    }
+  }
+
+  // Row pass: rounding happens inside vp10_fdct32 (round == 1).
+  for (row = 0; row < 32; ++row) {
+    for (k = 0; k < 32; ++k) src[k] = coeffs[row * 32 + k];
+    vp10_fdct32(src, dst, 1);
+    for (k = 0; k < 32; ++k)
+      out[row * 32 + k] = (tran_low_t)dst[k];
+  }
+}
+
+void vp10_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  // DC-only 32x32 forward DCT: output[0] = (sum of all 1024 inputs) >> 3.
+  // Sum in int, not tran_low_t: NOTE(review) tran_low_t is presumably 16-bit
+  // without CONFIG_VP9_HIGHBITDEPTH, which 1024 int16_t terms can overflow.
+  int r, c, sum = 0;
+  for (r = 0; r < 32; ++r)
+    for (c = 0; c < 32; ++c) sum += input[r * stride + c];
+  output[0] = (tran_low_t)(sum >> 3);
+  output[1] = 0;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH  // highbd entry points: each forwards its arguments unchanged to the vp10_fdct*_c function of the same size
+void vp10_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  vp10_fdct4x4_c(input, output, stride);
+}
+
+void vp10_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+                          int stride) {
+  vp10_fdct8x8_c(input, final_output, stride);
+}
+
+void vp10_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+                            int stride) {
+  vp10_fdct8x8_1_c(input, final_output, stride);
+}
+
+void vp10_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
+                            int stride) {
+  vp10_fdct16x16_c(input, output, stride);
+}
+
+void vp10_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
+                              int stride) {
+  vp10_fdct16x16_1_c(input, output, stride);
+}
+
+void vp10_highbd_fdct32x32_c(const int16_t *input,
+    tran_low_t *out, int stride) {
+  vp10_fdct32x32_c(input, out, stride);
+}
+
+void vp10_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+                               int stride) {
+  vp10_fdct32x32_rd_c(input, out, stride);
+}
+
+void vp10_highbd_fdct32x32_1_c(const int16_t *input,
+    tran_low_t *out, int stride) {
+  vp10_fdct32x32_1_c(input, out, stride);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vp10/common/vp10_fwd_txfm.h b/libs/libvpx/vp10/common/vp10_fwd_txfm.h
new file mode 100644
index 0000000000..46dbf3dd04
--- /dev/null
+++ b/libs/libvpx/vp10/common/vp10_fwd_txfm.h
@@ -0,0 +1,18 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_VP10_FWD_TXFM_H_
+#define VP10_COMMON_VP10_FWD_TXFM_H_
+
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/fwd_txfm.h"
+
+void vp10_fdct32(const tran_high_t *input, tran_high_t *output, int round);
+#endif  // VP10_COMMON_VP10_FWD_TXFM_H_
diff --git a/libs/libvpx/vp10/common/vp10_inv_txfm.c b/libs/libvpx/vp10/common/vp10_inv_txfm.c
new file mode 100644
index 0000000000..403b209a20
--- /dev/null
+++ b/libs/libvpx/vp10/common/vp10_inv_txfm.c
@@ -0,0 +1,2499 @@
+/*
+ *
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "vp10/common/vp10_inv_txfm.h"
+
+void vp10_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+   0.5 shifts per pixel. Rows are transformed first, then columns are added to dest. */
+  int i;
+  tran_low_t output[16];  // row-pass results, consumed by the column pass
+  tran_high_t a1, b1, c1, d1, e1;
+  const tran_low_t *ip = input;
+  tran_low_t *op = output;
+
+  for (i = 0; i < 4; i++) {  // pass 1: one row of 4 coefficients per iteration
+    a1 = ip[0] >> UNIT_QUANT_SHIFT;  // note the a, c, d, b load order
+    c1 = ip[1] >> UNIT_QUANT_SHIFT;
+    d1 = ip[2] >> UNIT_QUANT_SHIFT;
+    b1 = ip[3] >> UNIT_QUANT_SHIFT;
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;  // shared term from which b1 and c1 are rebuilt
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    op[0] = WRAPLOW(a1, 8);
+    op[1] = WRAPLOW(b1, 8);
+    op[2] = WRAPLOW(c1, 8);
+    op[3] = WRAPLOW(d1, 8);
+    ip += 4;
+    op += 4;
+  }
+
+  ip = output;  // pass 2 reads the intermediate block column-wise (stride 4)
+  for (i = 0; i < 4; i++) {  // pass 2: one column per iteration, no input shift
+    a1 = ip[4 * 0];
+    c1 = ip[4 * 1];
+    d1 = ip[4 * 2];
+    b1 = ip[4 * 3];
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);  // add into the existing pixels
+    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
+    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
+    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
+
+    ip++;
+    dest++;
+  }
+}
+
+void vp10_iwht4x4_1_add_c(const tran_low_t *in,
+                          uint8_t *dest,
+                          int dest_stride) {
+  // DC-only inverse WHT: only in[0] is read. It is split into a "first"
+  // share and a "rest" share, once per dimension, and added to the 4x4 block.
+  tran_low_t row[4];
+  tran_high_t first, rest;
+  int col, k;
+
+  // Horizontal split of the DC term.
+  first = in[0] >> UNIT_QUANT_SHIFT;
+  rest = first >> 1;
+  first -= rest;
+  row[0] = WRAPLOW(first, 8);
+  row[1] = row[2] = row[3] = WRAPLOW(rest, 8);
+
+  // Vertical split per column; rows 1-3 of a column receive the same delta.
+  for (col = 0; col < 4; ++col) {
+    uint8_t *const px = dest + col;
+    rest = row[col] >> 1;
+    first = row[col] - rest;
+    px[0] = clip_pixel_add(px[0], first);
+    for (k = 1; k < 4; ++k) {
+      px[dest_stride * k] = clip_pixel_add(px[dest_stride * k], rest);
+    }
+  }
+}
+
+void vp10_idct4_c(const tran_low_t *input, tran_low_t *output) {
+  // 4-point inverse DCT. The even half comes from input[0]/input[2], the odd
+  // half from input[1]/input[3]; every input is read before output[] is written.
+  tran_low_t even_sum, even_diff, odd_lo, odd_hi;
+  tran_high_t acc;
+  acc = (input[0] + input[2]) * cospi_16_64;
+  even_sum = WRAPLOW(dct_const_round_shift(acc), 8);
+  acc = (input[0] - input[2]) * cospi_16_64;
+  even_diff = WRAPLOW(dct_const_round_shift(acc), 8);
+  acc = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  odd_lo = WRAPLOW(dct_const_round_shift(acc), 8);
+  acc = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  odd_hi = WRAPLOW(dct_const_round_shift(acc), 8);
+
+  output[0] = WRAPLOW(even_sum + odd_hi, 8);
+  output[3] = WRAPLOW(even_sum - odd_hi, 8);
+  output[1] = WRAPLOW(even_diff + odd_lo, 8);
+  output[2] = WRAPLOW(even_diff - odd_lo, 8);
+}
+
+void vp10_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  // Full 4x4 inverse DCT: 1-D transform of every row, then of every column,
+  // adding ROUND_POWER_OF_TWO(value, 4) to dest through clip_pixel_add().
+  tran_low_t rows_done[4 * 4];
+  tran_low_t col_in[4], col_out[4];
+  int r, c;
+
+  // Row pass.
+  for (r = 0; r < 4; ++r) {
+    vp10_idct4_c(input + 4 * r, rows_done + 4 * r);
+  }
+
+  // Column pass, accumulated into the destination block.
+  for (c = 0; c < 4; ++c) {
+    for (r = 0; r < 4; ++r)
+      col_in[r] = rows_done[r * 4 + c];
+    vp10_idct4_c(col_in, col_out);
+    for (r = 0; r < 4; ++r) {
+      uint8_t *const px = &dest[r * stride + c];
+      const tran_high_t delta = ROUND_POWER_OF_TWO(col_out[r], 4);
+      *px = clip_pixel_add(*px, delta);
+    }
+  }
+}
+
+void vp10_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
+                         int dest_stride) {
+  // DC-only 4x4 inverse DCT: input[0] goes through the cospi_16_64 scaling
+  // twice; ROUND_POWER_OF_TWO(..., 4) of that is added to all 16 pixels.
+  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  tran_high_t delta;
+  int row, col;
+
+  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64), 8);
+  delta = ROUND_POWER_OF_TWO(dc, 4);
+  for (row = 0; row < 4; ++row) {
+    uint8_t *const line = dest + row * dest_stride;
+    for (col = 0; col < 4; ++col)
+      line[col] = clip_pixel_add(line[col], delta);
+  }
+}
+
+void vp10_idct8_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[8], step2[8];  // stage buffers; output[] is written only in stage 4
+  tran_high_t temp1, temp2;  // products before dct_const_round_shift()
+  // stage 1: even inputs are copied (reordered); odd pairs 1/7 and 5/3 are rotated
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 2: rotations on step1[0..3]; add/subtract pairs on step1[4..7]
+  temp1 = (step1[0] + step1[2]) * cospi_16_64;
+  temp2 = (step1[0] - step1[2]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  // stage 3: butterfly on step2[0..3]; step2[5]/step2[6] are combined and scaled by cospi_16_64
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  // stage 4: output[k] = step1[k] + step1[7 - k] and output[7 - k] = step1[k] - step1[7 - k], k = 0..3
+  output[0] = WRAPLOW(step1[0] + step1[7], 8);
+  output[1] = WRAPLOW(step1[1] + step1[6], 8);
+  output[2] = WRAPLOW(step1[2] + step1[5], 8);
+  output[3] = WRAPLOW(step1[3] + step1[4], 8);
+  output[4] = WRAPLOW(step1[3] - step1[4], 8);
+  output[5] = WRAPLOW(step1[2] - step1[5], 8);
+  output[6] = WRAPLOW(step1[1] - step1[6], 8);
+  output[7] = WRAPLOW(step1[0] - step1[7], 8);
+}
+
+void vp10_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  // Full 8x8 inverse DCT: 1-D transform of every row, then of every column,
+  // adding ROUND_POWER_OF_TWO(value, 5) to dest through clip_pixel_add().
+  tran_low_t rows_done[8 * 8];
+  tran_low_t col_in[8], col_out[8];
+  int r, c;
+
+  // Row pass.
+  for (r = 0; r < 8; ++r) {
+    vp10_idct8_c(input + 8 * r, rows_done + 8 * r);
+  }
+
+  // Column pass, accumulated into the destination block.
+  for (c = 0; c < 8; ++c) {
+    for (r = 0; r < 8; ++r)
+      col_in[r] = rows_done[r * 8 + c];
+    vp10_idct8_c(col_in, col_out);
+    for (r = 0; r < 8; ++r) {
+      uint8_t *const px = &dest[r * stride + c];
+      const tran_high_t delta = ROUND_POWER_OF_TWO(col_out[r], 5);
+      *px = clip_pixel_add(*px, delta);
+    }
+  }
+}
+
+void vp10_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  // DC-only 8x8 inverse DCT: scale input[0] by cospi_16_64 twice, take
+  // ROUND_POWER_OF_TWO(..., 5), and add the result to each of the 64 pixels.
+  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  tran_high_t delta;
+  int row, col;
+  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64), 8);
+  delta = ROUND_POWER_OF_TWO(dc, 5);
+  for (row = 0; row < 8; ++row)
+    for (col = 0; col < 8; ++col)
+      dest[row * stride + col] = clip_pixel_add(dest[row * stride + col], delta);
+}
+
+// 1-D 4-point inverse ADST (going by the function name): reads input[0..3]
+// and writes output[0..3]. An all-zero input short-circuits to an all-zero
+// output.
+void vp10_iadst4_c(const tran_low_t *input, tran_low_t *output) {
+  const tran_low_t in0 = input[0];
+  const tran_low_t in1 = input[1];
+  const tran_low_t in2 = input[2];
+  const tran_low_t in3 = input[3];
+  tran_high_t sum_a, sum_b, mid, mix, tail;
+
+  if ((in0 | in1 | in2 | in3) == 0) {
+    int k;
+    for (k = 0; k < 4; ++k)
+      output[k] = 0;
+    return;
+  }
+
+  // Weighted combinations of inputs 0, 2 and 3.
+  sum_a = sinpi_1_9 * in0 + sinpi_4_9 * in2 + sinpi_2_9 * in3;
+  sum_b = sinpi_2_9 * in0 - sinpi_1_9 * in2 - sinpi_4_9 * in3;
+
+  // Input 1 contributes through sinpi_3_9 only.
+  mid = sinpi_3_9 * in1;
+
+  // The unweighted mix of inputs 0, 2 and 3 is scaled by sinpi_3_9 as well.
+  mix = in0 - in2 + in3;
+  tail = sinpi_3_9 * mix;
+
+  // With a 1-D scaling factor of sqrt(2), the dynamic range here is
+  // 14 bits (input) + 14 bits (multiplication scaling) + 1 bit (addition),
+  // i.e. 29 bits, which leaves a 15-bit output.
+  output[0] = WRAPLOW(dct_const_round_shift(sum_a + mid), 8);
+  output[1] = WRAPLOW(dct_const_round_shift(sum_b + mid), 8);
+  output[2] = WRAPLOW(dct_const_round_shift(tail), 8);
+  output[3] = WRAPLOW(dct_const_round_shift(sum_a + sum_b - mid), 8);
+}
+
+void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+  /*
+   * 1-D 8-point inverse ADST (going by the function name). Reads input[0..7]
+   * and writes output[0..7].
+   *
+   * The inputs are loaded in interleaved order: x0, x2, x4, x6 come from
+   * input[7], [5], [3], [1] and x1, x3, x5, x7 from input[0], [2], [4], [6].
+   * The x and s values are reused from stage to stage, so statement order
+   * matters.
+   *
+   * NOTE(review): the s temporaries are plain int and every product is cast
+   * with (int), whereas vp10_iadst16_c keeps its s temporaries in
+   * tran_high_t. If tran_high_t is wider than int these casts narrow the
+   * products -- confirm that this is intended.
+   */
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
+  /* All-zero input: write eight zeros and return without running the stages. */
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = 0;
+    return;
+  }
+
+  /* stage 1: each pair (x0,x1), (x2,x3), (x4,x5), (x6,x7) is combined with a
+   * pair of cospi constants (c_a * x_even + c_b * x_odd and
+   * c_b * x_even - c_a * x_odd); then x_k and x_(k+4) become the sum and
+   * difference of s_k and s_(k+4), each passed through
+   * dct_const_round_shift and WRAPLOW. */
+  s0 = (int)(cospi_2_64  * x0 + cospi_30_64 * x1);
+  s1 = (int)(cospi_30_64 * x0 - cospi_2_64  * x1);
+  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
+  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
+  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
+  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
+  s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
+  s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);
+
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
+
+  /* stage 2: x0..x3 are copied to s0..s3 with no multiplication, and their
+   * sums/differences get WRAPLOW only; s4..s7 are cospi_8_64 / cospi_24_64
+   * products of x4..x7, and their sums/differences also go through
+   * dct_const_round_shift. */
+  s0 = (int)x0;
+  s1 = (int)x1;
+  s2 = (int)x2;
+  s3 = (int)x3;
+  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
+  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
+  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
+  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
+
+  x0 = WRAPLOW(s0 + s2, 8);
+  x1 = WRAPLOW(s1 + s3, 8);
+  x2 = WRAPLOW(s0 - s2, 8);
+  x3 = WRAPLOW(s1 - s3, 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+
+  /* stage 3: the pairs (x2,x3) and (x6,x7) are replaced by cospi_16_64 times
+   * their sum and difference; x0, x1, x4 and x5 are left as they are. */
+  s2 = (int)(cospi_16_64 * (x2 + x3));
+  s3 = (int)(cospi_16_64 * (x2 - x3));
+  s6 = (int)(cospi_16_64 * (x6 + x7));
+  s7 = (int)(cospi_16_64 * (x6 - x7));
+
+  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+  /* Results are written in permuted order; x4, x2, x7 and x1 are negated. */
+  output[0] = WRAPLOW(x0, 8);
+  output[1] = WRAPLOW(-x4, 8);
+  output[2] = WRAPLOW(x6, 8);
+  output[3] = WRAPLOW(-x2, 8);
+  output[4] = WRAPLOW(x3, 8);
+  output[5] = WRAPLOW(-x7, 8);
+  output[6] = WRAPLOW(x5, 8);
+  output[7] = WRAPLOW(-x1, 8);
+}
+
+// Reduced 8x8 inverse transform for blocks whose non-zero coefficients are
+// confined to the first four rows. Only those rows get a row pass; the rest
+// of the zero-initialised intermediate buffer stays zero. The column pass and
+// the update of the |stride|-pitched block at |dest| cover all eight columns.
+void vp10_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  tran_low_t row_pass[8 * 8] = { 0 };
+  tran_low_t column[8], column_out[8];
+  int r, c;
+
+  // Row pass over rows 0..3 only.
+  for (r = 0; r < 4; ++r)
+    vp10_idct8_c(input + 8 * r, row_pass + 8 * r);
+
+  // Column pass: gather column c, transform it, then update dest.
+  for (c = 0; c < 8; ++c) {
+    for (r = 0; r < 8; ++r)
+      column[r] = row_pass[8 * r + c];
+
+    vp10_idct8_c(column, column_out);
+
+    for (r = 0; r < 8; ++r) {
+      uint8_t *const pixel = &dest[r * stride + c];
+      *pixel = clip_pixel_add(*pixel, ROUND_POWER_OF_TWO(column_out[r], 5));
+    }
+  }
+}
+
+void vp10_idct16_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
+  /*
+   * 1-D 16-point inverse DCT (going by the function name). Reads input[0..15]
+   * and writes output[0..15].
+   *
+   * The work is split into seven stages that alternate between step1[] and
+   * step2[] as source and destination; each stage consumes the array the
+   * previous stage produced, so the stage order must be preserved.
+   *
+   * Products with the cospi_*_64 constants are accumulated in the tran_high_t
+   * temporaries temp1/temp2 and stored through
+   * WRAPLOW(dct_const_round_shift(...), 8); plain sums and differences are
+   * stored through WRAPLOW(..., 8) only. Both helpers are defined elsewhere --
+   * presumably they remove the constants' fixed-point scale and wrap to the
+   * coefficient range; confirm against their definitions.
+   */
+  /* stage 1: copy the inputs into step1[] in bit-reversed index order
+   * (0, 8, 4, 12, 2, 10, ...). The indices are spelled N/2 -- presumably to
+   * mirror the index list in stage 1 of vp10_idct32_c; confirm. */
+  step1[0] = input[0/2];
+  step1[1] = input[16/2];
+  step1[2] = input[8/2];
+  step1[3] = input[24/2];
+  step1[4] = input[4/2];
+  step1[5] = input[20/2];
+  step1[6] = input[12/2];
+  step1[7] = input[28/2];
+  step1[8] = input[2/2];
+  step1[9] = input[18/2];
+  step1[10] = input[10/2];
+  step1[11] = input[26/2];
+  step1[12] = input[6/2];
+  step1[13] = input[22/2];
+  step1[14] = input[14/2];
+  step1[15] = input[30/2];
+
+  /* stage 2: elements 0..7 pass through unchanged; the pairs (8,15), (9,14),
+   * (10,13) and (11,12) are each combined with a pair of cospi constants. */
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  /* stage 3: elements 0..3 pass through; the pairs (4,7) and (5,6) are
+   * combined with cospi constant pairs; elements 8..15 become sums and
+   * differences of the adjacent pairs (8,9), (10,11), (12,13), (14,15). */
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+
+  /* stage 4: (0,1) become their sum and difference scaled by cospi_16_64;
+   * (2,3) are combined with cospi_24_64 / cospi_8_64; 4..7 become sums and
+   * differences of the adjacent pairs (4,5), (6,7); 8, 11, 12 and 15 pass
+   * through; the pairs (9,14) and (10,13) are combined with
+   * cospi_8_64 / cospi_24_64. */
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  /* stage 5: 0..3 become sums and differences of (0,3) and (1,2); 4 and 7
+   * pass through; (5,6) become their difference and sum scaled by
+   * cospi_16_64; 8..15 become sums and differences of (8,11), (9,10),
+   * (12,15) and (13,14). */
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+
+  /* stage 6: 0..7 become sums and differences of the mirrored pairs (0,7),
+   * (1,6), (2,5), (3,4); 8, 9, 14 and 15 pass through; the pairs (10,13) and
+   * (11,12) become their difference and sum scaled by cospi_16_64. */
+  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  /* stage 7: for i = 0..7, output[i] = step2[i] + step2[15 - i] and
+   * output[15 - i] = step2[i] - step2[15 - i]. */
+  output[0] = WRAPLOW(step2[0] + step2[15], 8);
+  output[1] = WRAPLOW(step2[1] + step2[14], 8);
+  output[2] = WRAPLOW(step2[2] + step2[13], 8);
+  output[3] = WRAPLOW(step2[3] + step2[12], 8);
+  output[4] = WRAPLOW(step2[4] + step2[11], 8);
+  output[5] = WRAPLOW(step2[5] + step2[10], 8);
+  output[6] = WRAPLOW(step2[6] + step2[9], 8);
+  output[7] = WRAPLOW(step2[7] + step2[8], 8);
+  output[8] = WRAPLOW(step2[7] - step2[8], 8);
+  output[9] = WRAPLOW(step2[6] - step2[9], 8);
+  output[10] = WRAPLOW(step2[5] - step2[10], 8);
+  output[11] = WRAPLOW(step2[4] - step2[11], 8);
+  output[12] = WRAPLOW(step2[3] - step2[12], 8);
+  output[13] = WRAPLOW(step2[2] - step2[13], 8);
+  output[14] = WRAPLOW(step2[1] - step2[14], 8);
+  output[15] = WRAPLOW(step2[0] - step2[15], 8);
+}
+
+// 2-D 16x16 inverse transform over all 256 coefficients. vp10_idct16_c is run
+// on each 16-entry row of |input|, then on each column of that intermediate
+// result; every column output goes through ROUND_POWER_OF_TWO(.., 6) and is
+// folded into the |stride|-pitched pixel block at |dest| via clip_pixel_add.
+void vp10_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  tran_low_t row_pass[16 * 16];
+  tran_low_t column[16], column_out[16];
+  int r, c;
+
+  // Row pass: row r of the input becomes row r of row_pass.
+  for (r = 0; r < 16; ++r)
+    vp10_idct16_c(input + 16 * r, row_pass + 16 * r);
+
+  // Column pass: gather column c, transform it, then update dest.
+  for (c = 0; c < 16; ++c) {
+    for (r = 0; r < 16; ++r)
+      column[r] = row_pass[16 * r + c];
+
+    vp10_idct16_c(column, column_out);
+
+    for (r = 0; r < 16; ++r) {
+      uint8_t *const pixel = &dest[r * stride + c];
+      *pixel = clip_pixel_add(*pixel, ROUND_POWER_OF_TWO(column_out[r], 6));
+    }
+  }
+}
+
+void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+  /*
+   * 1-D 16-point inverse ADST (going by the function name). Reads
+   * input[0..15] and writes output[0..15].
+   *
+   * The inputs are loaded in interleaved order: the even-numbered x values
+   * take input[15], [13], ..., [1] and the odd-numbered x values take
+   * input[0], [2], ..., [14]. The x and s values are reused from stage to
+   * stage, so statement order matters.
+   *
+   * Values that are products with cospi_*_64 constants are stored through
+   * WRAPLOW(dct_const_round_shift(...), 8); values that are plain sums and
+   * differences are stored through WRAPLOW only (stage 2) or
+   * WRAPLOW(check_range(...), 8) (stage 3). These helpers are defined
+   * elsewhere; confirm their exact behaviour against their definitions.
+   */
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
+  /* All-zero input: write sixteen zeros and return without running the
+   * stages. */
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = output[8]
+              = output[9] = output[10] = output[11] = output[12]
+              = output[13] = output[14] = output[15] = 0;
+    return;
+  }
+
+  /* stage 1: each pair (x0,x1), (x2,x3), ..., (x14,x15) is combined with a
+   * pair of odd-numbered cospi constants (1/31, 5/27, 9/23, 13/19, 17/15,
+   * 21/11, 25/7, 29/3); then, for k = 0..7, x_k and x_(k+8) become the
+   * round-shifted sum and difference of s_k and s_(k+8). */
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
+  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
+  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
+
+  /* stage 2: x0..x7 are copied to s0..s7 with no multiplication; x8..x15 are
+   * combined pairwise with cospi_4_64 / cospi_28_64 and
+   * cospi_20_64 / cospi_12_64. Sums and differences are then taken at a
+   * distance of four: s_k with s_(k+4) for k = 0..3 and k = 8..11. */
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = WRAPLOW(s0 + s4, 8);
+  x1 = WRAPLOW(s1 + s5, 8);
+  x2 = WRAPLOW(s2 + s6, 8);
+  x3 = WRAPLOW(s3 + s7, 8);
+  x4 = WRAPLOW(s0 - s4, 8);
+  x5 = WRAPLOW(s1 - s5, 8);
+  x6 = WRAPLOW(s2 - s6, 8);
+  x7 = WRAPLOW(s3 - s7, 8);
+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
+
+  /* stage 3: x0..x3 and x8..x11 are copied to s with no multiplication;
+   * x4..x7 and x12..x15 are combined pairwise with
+   * cospi_8_64 / cospi_24_64. Sums and differences are then taken at a
+   * distance of two (s0 with s2, s1 with s3, and so on). The unmultiplied
+   * sums go through check_range here, unlike those in stage 2. */
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = WRAPLOW(check_range(s0 + s2), 8);
+  x1 = WRAPLOW(check_range(s1 + s3), 8);
+  x2 = WRAPLOW(check_range(s0 - s2), 8);
+  x3 = WRAPLOW(check_range(s1 - s3), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+  x8 = WRAPLOW(check_range(s8 + s10), 8);
+  x9 = WRAPLOW(check_range(s9 + s11), 8);
+  x10 = WRAPLOW(check_range(s8 - s10), 8);
+  x11 = WRAPLOW(check_range(s9 - s11), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
+
+  /* stage 4: the pairs (x2,x3), (x6,x7), (x10,x11) and (x14,x15) are
+   * replaced by cospi_16_64 times their sum and difference, with the signs
+   * written out below; x0, x1, x4, x5, x8, x9, x12 and x13 are left as they
+   * are. */
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s15), 8);
+  /* Results are written in permuted order; x8, x4, x13 and x1 are negated. */
+  output[0] = WRAPLOW(x0, 8);
+  output[1] = WRAPLOW(-x8, 8);
+  output[2] = WRAPLOW(x12, 8);
+  output[3] = WRAPLOW(-x4, 8);
+  output[4] = WRAPLOW(x6, 8);
+  output[5] = WRAPLOW(x14, 8);
+  output[6] = WRAPLOW(x10, 8);
+  output[7] = WRAPLOW(x2, 8);
+  output[8] = WRAPLOW(x3, 8);
+  output[9] = WRAPLOW(x11, 8);
+  output[10] = WRAPLOW(x15, 8);
+  output[11] = WRAPLOW(x7, 8);
+  output[12] = WRAPLOW(x5, 8);
+  output[13] = WRAPLOW(-x13, 8);
+  output[14] = WRAPLOW(x9, 8);
+  output[15] = WRAPLOW(-x1, 8);
+}
+
+// Reduced 16x16 inverse transform for blocks whose non-zero coefficients all
+// sit in the upper-left 4x4 area. Only the first four rows get a row pass;
+// the rest of the zero-initialised intermediate buffer stays zero. The column
+// pass and the update of the |stride|-pitched block at |dest| cover all
+// sixteen columns.
+void vp10_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  tran_low_t row_pass[16 * 16] = { 0 };
+  tran_low_t column[16], column_out[16];
+  int r, c;
+
+  // Row pass over rows 0..3 only.
+  for (r = 0; r < 4; ++r)
+    vp10_idct16_c(input + 16 * r, row_pass + 16 * r);
+
+  // Column pass: gather column c, transform it, then update dest.
+  for (c = 0; c < 16; ++c) {
+    for (r = 0; r < 16; ++r)
+      column[r] = row_pass[16 * r + c];
+
+    vp10_idct16_c(column, column_out);
+
+    for (r = 0; r < 16; ++r) {
+      uint8_t *const pixel = &dest[r * stride + c];
+      *pixel = clip_pixel_add(*pixel, ROUND_POWER_OF_TWO(column_out[r], 6));
+    }
+  }
+}
+
+// DC-only 16x16 variant: only input[0] is read. It is multiplied by
+// cospi_16_64 and narrowed twice (once per 1-D pass), reduced with
+// ROUND_POWER_OF_TWO(.., 6), and the resulting single value is applied to all
+// 256 pixels of the |stride|-pitched block at |dest| through clip_pixel_add.
+void vp10_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  const tran_low_t first_pass =
+      WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  const tran_low_t second_pass =
+      WRAPLOW(dct_const_round_shift(first_pass * cospi_16_64), 8);
+  const tran_high_t delta = ROUND_POWER_OF_TWO(second_pass, 6);
+  int row, col;
+
+  for (row = 0; row < 16; ++row) {
+    uint8_t *const line = dest + row * stride;
+    for (col = 0; col < 16; ++col)
+      line[col] = clip_pixel_add(line[col], delta);
+  }
+}
+
+void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[32], step2[32];
+  tran_high_t temp1, temp2;
+
+  // stage 1
+  step1[0] = input[0];
+  step1[1] = input[16];
+  step1[2] = input[8];
+  step1[3] = input[24];
+  step1[4] = input[4];
+  step1[5] = input[20];
+  step1[6] = input[12];
+  step1[7] = input[28];
+  step1[8] = input[2];
+  step1[9] = input[18];
+  step1[10] = input[10];
+  step1[11] = input[26];
+  step1[12] = input[6];
+  step1[13] = input[22];
+  step1[14] = input[14];
+  step1[15] = input[30];
+
+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
+  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
+  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
+  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
+  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
+  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
+  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
+  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
+  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
+  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
+  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
+  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
+  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
+  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
+  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
+  step2[31] = WRAPLOW(step1[30] + step1[31], 8);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
+  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
+  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
+  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
+  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
+  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
+  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
+  step2[23] = WRAPLOW(step1[20] + step1[23], 8);
+
+  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
+  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
+  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
+  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
+  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
+  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
+  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
+  step2[31] = WRAPLOW(step1[28] + step1[31], 8);
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
+  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
+  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
+  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
+  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
+  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
+  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
+  step2[23] = WRAPLOW(step1[16] - step1[23], 8);
+
+  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
+  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
+  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
+  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
+  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
+  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
+  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
+  step2[31] = WRAPLOW(step1[24] + step1[31], 8);
+
+  // stage 7
+  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
+  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
+  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
+  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
+  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
+  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
+  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
+  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
+  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
+  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
+  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
+  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
+  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
+  step1[15] = WRAPLOW(step2[0] - step2[15], 8);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  step1[18] = step2[18];
+  step1[19] = step2[19];
+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[28] = step2[28];
+  step1[29] = step2[29];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // final stage
+  output[0] = WRAPLOW(step1[0] + step1[31], 8);
+  output[1] = WRAPLOW(step1[1] + step1[30], 8);
+  output[2] = WRAPLOW(step1[2] + step1[29], 8);
+  output[3] = WRAPLOW(step1[3] + step1[28], 8);
+  output[4] = WRAPLOW(step1[4] + step1[27], 8);
+  output[5] = WRAPLOW(step1[5] + step1[26], 8);
+  output[6] = WRAPLOW(step1[6] + step1[25], 8);
+  output[7] = WRAPLOW(step1[7] + step1[24], 8);
+  output[8] = WRAPLOW(step1[8] + step1[23], 8);
+  output[9] = WRAPLOW(step1[9] + step1[22], 8);
+  output[10] = WRAPLOW(step1[10] + step1[21], 8);
+  output[11] = WRAPLOW(step1[11] + step1[20], 8);
+  output[12] = WRAPLOW(step1[12] + step1[19], 8);
+  output[13] = WRAPLOW(step1[13] + step1[18], 8);
+  output[14] = WRAPLOW(step1[14] + step1[17], 8);
+  output[15] = WRAPLOW(step1[15] + step1[16], 8);
+  output[16] = WRAPLOW(step1[15] - step1[16], 8);
+  output[17] = WRAPLOW(step1[14] - step1[17], 8);
+  output[18] = WRAPLOW(step1[13] - step1[18], 8);
+  output[19] = WRAPLOW(step1[12] - step1[19], 8);
+  output[20] = WRAPLOW(step1[11] - step1[20], 8);
+  output[21] = WRAPLOW(step1[10] - step1[21], 8);
+  output[22] = WRAPLOW(step1[9] - step1[22], 8);
+  output[23] = WRAPLOW(step1[8] - step1[23], 8);
+  output[24] = WRAPLOW(step1[7] - step1[24], 8);
+  output[25] = WRAPLOW(step1[6] - step1[25], 8);
+  output[26] = WRAPLOW(step1[5] - step1[26], 8);
+  output[27] = WRAPLOW(step1[4] - step1[27], 8);
+  output[28] = WRAPLOW(step1[3] - step1[28], 8);
+  output[29] = WRAPLOW(step1[2] - step1[29], 8);
+  output[30] = WRAPLOW(step1[1] - step1[30], 8);
+  output[31] = WRAPLOW(step1[0] - step1[31], 8);
+}
+
+// Full 32x32 inverse transform: runs vp10_idct32_c over all 32 rows, then
+// over all 32 columns, and adds the rounded result to the pixels in dest.
+//   input  - 32x32 coefficients, 32 consecutive values per row.
+//   dest   - destination pixels, updated in place.
+//   stride - distance, in pixels, between consecutive rows of dest.
+void vp10_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+                              int stride) {
+  tran_low_t out[32 * 32];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    // OR every coefficient of the row together to detect an all-zero row.
+    // The accumulator has the coefficient type on purpose: a narrower
+    // accumulator drops the high bits when tran_low_t is wider than 16 bits,
+    // so a row whose non-zero bits all sit above bit 15 would wrongly be
+    // treated as empty and zeroed instead of transformed.
+    tran_low_t row_bits = 0;
+    for (j = 0; j < 32; ++j)
+      row_bits |= input[j];
+
+    if (row_bits)
+      vp10_idct32_c(input, outptr);
+    else
+      memset(outptr, 0, sizeof(tran_low_t) * 32);  // skip the 1-D transform
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    // Gather column i of the row-pass result into a contiguous buffer.
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    vp10_idct32_c(temp_in, temp_out);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
+// Reduced 32x32 inverse transform: only the first 8 rows are run through the
+// row pass (the remaining rows of the intermediate buffer stay zero), then
+// every column is transformed and added to dest.
+//   input  - 32x32 coefficients, 32 consecutive values per row.
+//   dest   - destination pixels, updated in place.
+//   stride - distance, in pixels, between consecutive rows of dest.
+void vp10_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  tran_low_t rows[32 * 32] = {0};
+  tran_low_t column[32], residual[32];
+  int r, c;
+
+  // Row pass.
+  // only upper-left 8x8 has non-zero coeff
+  for (r = 0; r < 8; ++r)
+    vp10_idct32_c(input + 32 * r, rows + 32 * r);
+
+  // Column pass.
+  for (c = 0; c < 32; ++c) {
+    for (r = 0; r < 32; ++r)
+      column[r] = rows[r * 32 + c];
+    vp10_idct32_c(column, residual);
+    for (r = 0; r < 32; ++r) {
+      uint8_t *const px = &dest[r * stride + c];
+      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(residual[r], 6));
+    }
+  }
+}
+
+// DC-only 32x32 inverse transform: derives a single value from input[0]
+// (two cospi_16_64 scalings followed by a rounding shift of 6) and adds it
+// to every pixel of the 32x32 block at dest.
+//   input  - coefficient block; only input[0] is read.
+//   dest   - destination pixels, updated in place.
+//   stride - distance, in pixels, between consecutive rows of dest.
+void vp10_idct32x32_1_add_c(const tran_low_t *input,
+                            uint8_t *dest,
+                            int stride) {
+  int row, col;
+  tran_high_t dc_add;
+  tran_low_t dc;
+
+  dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64), 8);
+  dc_add = ROUND_POWER_OF_TWO(dc, 6);
+
+  for (row = 0; row < 32; ++row, dest += stride) {
+    for (col = 0; col < 32; ++col)
+      dest[col] = clip_pixel_add(dest[col], dc_add);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// 4x4 inverse Walsh-Hadamard transform for the high-bit-depth path.
+// The first pass works on the rows of input (each coefficient is shifted
+// right by UNIT_QUANT_SHIFT first); the second pass works on the columns of
+// that result and adds it to the 16-bit pixels behind dest8.
+//   input  - 16 coefficients, 4 per row.
+//   dest8  - destination; converted with CONVERT_TO_SHORTPTR before use.
+//   stride - distance, in pixels, between consecutive destination rows.
+//   bd     - forwarded to WRAPLOW and highbd_clip_pixel_add.
+void vp10_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+     0.5 shifts per pixel. */
+  tran_low_t rows[16];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  tran_high_t a, b, c, d, mid;
+  int n;
+
+  // Pass 1: one row of coefficients per iteration.
+  for (n = 0; n < 4; n++) {
+    const tran_low_t *src = input + 4 * n;
+    tran_low_t *dst = rows + 4 * n;
+
+    a = src[0] >> UNIT_QUANT_SHIFT;
+    c = src[1] >> UNIT_QUANT_SHIFT;
+    d = src[2] >> UNIT_QUANT_SHIFT;
+    b = src[3] >> UNIT_QUANT_SHIFT;
+    a += c;
+    d -= b;
+    mid = (a - d) >> 1;
+    b = mid - b;
+    c = mid - c;
+    a -= b;
+    d += c;
+    dst[0] = WRAPLOW(a, bd);
+    dst[1] = WRAPLOW(b, bd);
+    dst[2] = WRAPLOW(c, bd);
+    dst[3] = WRAPLOW(d, bd);
+  }
+
+  // Pass 2: one column of the intermediate block per iteration.
+  for (n = 0; n < 4; n++) {
+    const tran_low_t *col = rows + n;
+    uint16_t *px = dest + n;
+
+    a = col[4 * 0];
+    c = col[4 * 1];
+    d = col[4 * 2];
+    b = col[4 * 3];
+    a += c;
+    d -= b;
+    mid = (a - d) >> 1;
+    b = mid - b;
+    c = mid - c;
+    a -= b;
+    d += c;
+    px[stride * 0] = highbd_clip_pixel_add(px[stride * 0], a, bd);
+    px[stride * 1] = highbd_clip_pixel_add(px[stride * 1], b, bd);
+    px[stride * 2] = highbd_clip_pixel_add(px[stride * 2], c, bd);
+    px[stride * 3] = highbd_clip_pixel_add(px[stride * 3], d, bd);
+  }
+}
+
+// DC-only 4x4 inverse Walsh-Hadamard transform for the high-bit-depth path.
+// Only in[0] is read. It is split into two parts for the first pass, and
+// each first-pass value is split again and added down one destination column.
+//   in          - coefficient block; only in[0] is read.
+//   dest8       - destination; converted with CONVERT_TO_SHORTPTR before use.
+//   dest_stride - distance, in pixels, between consecutive destination rows.
+//   bd          - forwarded to WRAPLOW and highbd_clip_pixel_add.
+void vp10_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+                                int dest_stride, int bd) {
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t first_pass[4];
+  tran_high_t top, rest;
+  int col;
+  (void) bd;
+
+  // First pass: element 0 gets the larger share, elements 1..3 the same rest.
+  top = in[0] >> UNIT_QUANT_SHIFT;
+  rest = top >> 1;
+  top -= rest;
+  first_pass[0] = WRAPLOW(top, bd);
+  first_pass[3] = WRAPLOW(rest, bd);
+  first_pass[2] = first_pass[3];
+  first_pass[1] = first_pass[2];
+
+  // Second pass: repeat the same split down each destination column.
+  for (col = 0; col < 4; col++) {
+    uint16_t *px = dest + col;
+
+    rest = first_pass[col] >> 1;
+    top = first_pass[col] - rest;
+    px[dest_stride * 0] = highbd_clip_pixel_add(px[dest_stride * 0], top, bd);
+    px[dest_stride * 1] = highbd_clip_pixel_add(px[dest_stride * 1], rest, bd);
+    px[dest_stride * 2] = highbd_clip_pixel_add(px[dest_stride * 2], rest, bd);
+    px[dest_stride * 3] = highbd_clip_pixel_add(px[dest_stride * 3], rest, bd);
+  }
+}
+
+// 4-point 1-D inverse DCT for the high-bit-depth path.
+// All four inputs are consumed before any output is written, so the
+// function may be called with output == input.
+//   input  - 4 coefficients.
+//   output - 4 results.
+//   bd     - forwarded to WRAPLOW and highbd_dct_const_round_shift.
+void vp10_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t prod_a, prod_b;
+  tran_low_t even_sum, even_diff, odd_lo, odd_hi;
+  (void) bd;
+
+  // Even inputs (0 and 2): sum/difference scaled by cospi_16_64.
+  prod_a = (input[0] + input[2]) * cospi_16_64;
+  prod_b = (input[0] - input[2]) * cospi_16_64;
+  even_sum = WRAPLOW(highbd_dct_const_round_shift(prod_a, bd), bd);
+  even_diff = WRAPLOW(highbd_dct_const_round_shift(prod_b, bd), bd);
+
+  // Odd inputs (1 and 3): rotation by the cospi_24_64 / cospi_8_64 pair.
+  prod_a = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  prod_b = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  odd_lo = WRAPLOW(highbd_dct_const_round_shift(prod_a, bd), bd);
+  odd_hi = WRAPLOW(highbd_dct_const_round_shift(prod_b, bd), bd);
+
+  // Final butterfly combining the two halves.
+  output[0] = WRAPLOW(even_sum + odd_hi, bd);
+  output[1] = WRAPLOW(even_diff + odd_lo, bd);
+  output[2] = WRAPLOW(even_diff - odd_lo, bd);
+  output[3] = WRAPLOW(even_sum - odd_hi, bd);
+}
+
+// Full 4x4 inverse DCT for the high-bit-depth path: transforms the four
+// rows, then the four columns, and adds the result (rounding shift of 4) to
+// the 16-bit pixels behind dest8.
+//   input  - 16 coefficients, 4 per row.
+//   dest8  - destination; converted with CONVERT_TO_SHORTPTR before use.
+//   stride - distance, in pixels, between consecutive destination rows.
+//   bd     - forwarded to the 1-D transform and highbd_clip_pixel_add.
+void vp10_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t rows[4 * 4];
+  tran_low_t column[4], residual[4];
+  int r, c;
+
+  // Row pass.
+  for (r = 0; r < 4; ++r)
+    vp10_highbd_idct4_c(input + 4 * r, rows + 4 * r, bd);
+
+  // Column pass.
+  for (c = 0; c < 4; ++c) {
+    for (r = 0; r < 4; ++r)
+      column[r] = rows[r * 4 + c];
+    vp10_highbd_idct4_c(column, residual, bd);
+    for (r = 0; r < 4; ++r) {
+      uint16_t *const px = &dest[r * stride + c];
+      *px = highbd_clip_pixel_add(
+          *px, ROUND_POWER_OF_TWO(residual[r], 4), bd);
+    }
+  }
+}
+
+// DC-only 4x4 inverse DCT for the high-bit-depth path: derives one value
+// from input[0] (two cospi_16_64 scalings, then a rounding shift of 4) and
+// adds it to every pixel of the 4x4 block.
+//   input       - coefficient block; only input[0] is read.
+//   dest8       - destination; converted with CONVERT_TO_SHORTPTR before use.
+//   dest_stride - distance, in pixels, between consecutive destination rows.
+//   bd          - forwarded to the rounding helpers and highbd_clip_pixel_add.
+void vp10_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int dest_stride, int bd) {
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  tran_high_t dc_add;
+  tran_low_t dc;
+  int row, col;
+
+  dc = WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  dc = WRAPLOW(highbd_dct_const_round_shift(dc * cospi_16_64, bd), bd);
+  dc_add = ROUND_POWER_OF_TWO(dc, 4);
+
+  for (row = 0; row < 4; row++) {
+    for (col = 0; col < 4; col++)
+      dest[col] = highbd_clip_pixel_add(dest[col], dc_add, bd);
+    dest += dest_stride;
+  }
+}
+
+// 8-point 1-D inverse DCT for the high-bit-depth path.
+//   input  - 8 coefficients.
+//   output - 8 results.
+//   bd     - forwarded to WRAPLOW, highbd_dct_const_round_shift and the
+//            4-point transform.
+void vp10_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
+  // stage 1
+  // Even-indexed inputs (0, 2, 4, 6) are placed in step1[0..3] in the order
+  // the 4-point transform below expects.
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  // Odd-indexed inputs are rotated in pairs: (1, 7) and (5, 3).
+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 2 & stage 3 - even half
+  // In-place call: step1[0..3] is both the input and the output buffer.
+  vp10_highbd_idct4_c(step1, step1, bd);
+
+  // stage 2 - odd half
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  // stage 3 - odd half
+  // Elements 4 and 7 pass through; 5 and 6 are combined and scaled by
+  // cospi_16_64.
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[7] = step2[7];
+
+  // stage 4
+  // Final butterfly: output[k] and output[7 - k] are the sum and difference
+  // of step1[k] and step1[7 - k].
+  output[0] = WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = WRAPLOW(step1[1] + step1[6], bd);
+  output[2] = WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = WRAPLOW(step1[3] + step1[4], bd);
+  output[4] = WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = WRAPLOW(step1[2] - step1[5], bd);
+  output[6] = WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = WRAPLOW(step1[0] - step1[7], bd);
+}
+
+// Full 8x8 inverse DCT for the high-bit-depth path: transforms all eight
+// rows, then all eight columns, and adds the result (rounding shift of 5)
+// to the 16-bit pixels behind dest8.
+//   input  - 64 coefficients, 8 per row.
+//   dest8  - destination; converted with CONVERT_TO_SHORTPTR before use.
+//   stride - distance, in pixels, between consecutive destination rows.
+//   bd     - forwarded to the 1-D transform and highbd_clip_pixel_add.
+void vp10_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t rows[8 * 8];
+  tran_low_t column[8], residual[8];
+  int r, c;
+
+  // Row pass.
+  for (r = 0; r < 8; ++r)
+    vp10_highbd_idct8_c(input + 8 * r, rows + 8 * r, bd);
+
+  // Column pass.
+  for (c = 0; c < 8; ++c) {
+    for (r = 0; r < 8; ++r)
+      column[r] = rows[r * 8 + c];
+    vp10_highbd_idct8_c(column, residual, bd);
+    for (r = 0; r < 8; ++r) {
+      uint16_t *const px = &dest[r * stride + c];
+      *px = highbd_clip_pixel_add(
+          *px, ROUND_POWER_OF_TWO(residual[r], 5), bd);
+    }
+  }
+}
+
+// DC-only 8x8 inverse DCT for the high-bit-depth path: derives one value
+// from input[0] (two cospi_16_64 scalings, then a rounding shift of 5) and
+// adds it to every pixel of the 8x8 block.
+//   input  - coefficient block; only input[0] is read.
+//   dest8  - destination; converted with CONVERT_TO_SHORTPTR before use.
+//   stride - distance, in pixels, between consecutive destination rows.
+//   bd     - forwarded to the rounding helpers and highbd_clip_pixel_add.
+void vp10_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int bd) {
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  tran_high_t dc_add;
+  tran_low_t dc;
+  int row, col;
+
+  dc = WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  dc = WRAPLOW(highbd_dct_const_round_shift(dc * cospi_16_64, bd), bd);
+  dc_add = ROUND_POWER_OF_TWO(dc, 5);
+
+  for (row = 0; row < 8; ++row, dest += stride) {
+    for (col = 0; col < 8; ++col)
+      dest[col] = highbd_clip_pixel_add(dest[col], dc_add, bd);
+  }
+}
+
+// 4-point 1-D inverse ADST for the high-bit-depth path.
+//   input  - 4 coefficients.
+//   output - 4 results; set to all zeros when every input is zero.
+//   bd     - forwarded to WRAPLOW and highbd_dct_const_round_shift.
+void vp10_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_low_t x0 = input[0];
+  tran_low_t x1 = input[1];
+  tran_low_t x2 = input[2];
+  tran_low_t x3 = input[3];
+  (void) bd;
+
+  // Shortcut: an all-zero input yields an all-zero output.
+  if (!(x0 | x1 | x2 | x3)) {
+    memset(output, 0, 4 * sizeof(*output));
+    return;
+  }
+
+  // Products of the inputs with the sinpi_k_9 constants.
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  // The sum is formed in tran_low_t and only then widened to tran_high_t.
+  s7 = (tran_high_t)(x0 - x2 + x3);
+
+  // Combine the partial products. s3 must take over the old s2 before s2 is
+  // overwritten on the following line.
+  s0 = s0 + s3 + s5;
+  s1 = s1 - s4 - s6;
+  s3 = s2;
+  s2 = sinpi_3_9 * s7;
+
+  // NOTE(review): the range figures below look like they describe 8-bit
+  // content; confirm whether they still hold for bd > 8.
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
+  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
+  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
+  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
+}
+
+// 8-point 1-D inverse ADST for the high-bit-depth path.
+//   input  - 8 coefficients.
+//   output - 8 results; set to all zeros when every input is zero.
+//   bd     - forwarded to WRAPLOW and highbd_dct_const_round_shift.
+void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // Inputs are loaded in permuted order: 7, 0, 5, 2, 3, 4, 1, 6.
+  tran_low_t x0 = input[7];
+  tran_low_t x1 = input[0];
+  tran_low_t x2 = input[5];
+  tran_low_t x3 = input[2];
+  tran_low_t x4 = input[3];
+  tran_low_t x5 = input[4];
+  tran_low_t x6 = input[1];
+  tran_low_t x7 = input[6];
+  (void) bd;
+
+  // Shortcut: an all-zero input yields an all-zero output.
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    memset(output, 0, 8 * sizeof(*output));
+    return;
+  }
+
+  // stage 1
+  // Rotate each (x[2k], x[2k+1]) pair, then butterfly s[k] with s[k+4].
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
+  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
+
+  // stage 2
+  // x0..x3 pass through unscaled, so their butterflies below need no
+  // rounding shift; x4..x7 are rotated first and are rounded afterwards.
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = WRAPLOW(s0 + s2, bd);
+  x1 = WRAPLOW(s1 + s3, bd);
+  x2 = WRAPLOW(s0 - s2, bd);
+  x3 = WRAPLOW(s1 - s3, bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+
+  // stage 3
+  // Only the pairs (x2, x3) and (x6, x7) are modified in this stage.
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+
+  // Write the results in permuted order; odd-indexed outputs are negated.
+  output[0] = WRAPLOW(x0, bd);
+  output[1] = WRAPLOW(-x4, bd);
+  output[2] = WRAPLOW(x6, bd);
+  output[3] = WRAPLOW(-x2, bd);
+  output[4] = WRAPLOW(x3, bd);
+  output[5] = WRAPLOW(-x7, bd);
+  output[6] = WRAPLOW(x5, bd);
+  output[7] = WRAPLOW(-x1, bd);
+}
+
+// Reduced 8x8 inverse DCT for the high-bit-depth path: only the first four
+// rows go through the row pass (the remaining rows of the intermediate
+// buffer stay zero), then every column is transformed and added to the
+// 16-bit pixels behind dest8 with a rounding shift of 5.
+//   input  - 64 coefficients, 8 per row.
+//   dest8  - destination; converted with CONVERT_TO_SHORTPTR before use.
+//   stride - distance, in pixels, between consecutive destination rows.
+//   bd     - forwarded to the 1-D transform and highbd_clip_pixel_add.
+void vp10_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t rows[8 * 8] = { 0 };
+  tran_low_t column[8], residual[8];
+  int r, c;
+
+  // Row pass.
+  // Only the first 4 rows have non-zero coefficients.
+  for (r = 0; r < 4; ++r)
+    vp10_highbd_idct8_c(input + 8 * r, rows + 8 * r, bd);
+
+  // Column pass.
+  for (c = 0; c < 8; ++c) {
+    for (r = 0; r < 8; ++r)
+      column[r] = rows[r * 8 + c];
+    vp10_highbd_idct8_c(column, residual, bd);
+    for (r = 0; r < 8; ++r) {
+      uint16_t *const px = &dest[r * stride + c];
+      *px = highbd_clip_pixel_add(
+          *px, ROUND_POWER_OF_TWO(residual[r], 5), bd);
+    }
+  }
+}
+
+// 16-point 1-D inverse DCT for the high-bit-depth path.
+// The computation alternates between the two scratch arrays step1 and step2,
+// one stage at a time, and writes the final butterfly straight to output.
+//   input  - 16 coefficients.
+//   output - 16 results.
+//   bd     - forwarded to WRAPLOW and highbd_dct_const_round_shift.
+void vp10_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
+  (void) bd;
+
+  // stage 1
+  // The "N/2" index expressions evaluate to 0, 8, 4, 12, 2, 10, 6, 14,
+  // 1, 9, 5, 13, 3, 11, 7, 15, i.e. the inputs are loaded in bit-reversed
+  // order.
+  step1[0] = input[0/2];
+  step1[1] = input[16/2];
+  step1[2] = input[8/2];
+  step1[3] = input[24/2];
+  step1[4] = input[4/2];
+  step1[5] = input[20/2];
+  step1[6] = input[12/2];
+  step1[7] = input[28/2];
+  step1[8] = input[2/2];
+  step1[9] = input[18/2];
+  step1[10] = input[10/2];
+  step1[11] = input[26/2];
+  step1[12] = input[6/2];
+  step1[13] = input[22/2];
+  step1[14] = input[14/2];
+  step1[15] = input[30/2];
+
+  // stage 2
+  // Elements 0..7 pass through; elements 8..15 are rotated in the pairs
+  // (8, 15), (9, 14), (10, 13) and (11, 12).
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 3
+  // Elements 0..3 pass through; 4..7 are rotated in the pairs (4, 7) and
+  // (5, 6); 8..15 are combined by add/subtract butterflies.
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+  // stage 4
+  // Elements 0..3 are rotated, 4..7 are butterflied, and of 8..15 only the
+  // pairs (9, 14) and (10, 13) are rotated; 8, 11, 12 and 15 pass through.
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  // Butterflies on 0..3 and 8..15; within 4..7 only the pair (5, 6) is
+  // combined and scaled by cospi_16_64.
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+  // stage 6
+  // Butterfly across 0..7; within 8..15 the pairs (10, 13) and (11, 12) are
+  // combined and scaled by cospi_16_64, the rest pass through.
+  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  // Final butterfly: output[k] and output[15 - k] are the sum and difference
+  // of step2[k] and step2[15 - k].
+  output[0] = WRAPLOW(step2[0] + step2[15], bd);
+  output[1] = WRAPLOW(step2[1] + step2[14], bd);
+  output[2] = WRAPLOW(step2[2] + step2[13], bd);
+  output[3] = WRAPLOW(step2[3] + step2[12], bd);
+  output[4] = WRAPLOW(step2[4] + step2[11], bd);
+  output[5] = WRAPLOW(step2[5] + step2[10], bd);
+  output[6] = WRAPLOW(step2[6] + step2[9], bd);
+  output[7] = WRAPLOW(step2[7] + step2[8], bd);
+  output[8] = WRAPLOW(step2[7] - step2[8], bd);
+  output[9] = WRAPLOW(step2[6] - step2[9], bd);
+  output[10] = WRAPLOW(step2[5] - step2[10], bd);
+  output[11] = WRAPLOW(step2[4] - step2[11], bd);
+  output[12] = WRAPLOW(step2[3] - step2[12], bd);
+  output[13] = WRAPLOW(step2[2] - step2[13], bd);
+  output[14] = WRAPLOW(step2[1] - step2[14], bd);
+  output[15] = WRAPLOW(step2[0] - step2[15], bd);
+}
+
+// Full 16x16 inverse DCT for the high-bit-depth path: transforms all 16
+// rows, then all 16 columns, and adds the result (rounding shift of 6) to
+// the 16-bit pixels behind dest8.
+//   input  - 256 coefficients, 16 per row.
+//   dest8  - destination; converted with CONVERT_TO_SHORTPTR before use.
+//   stride - distance, in pixels, between consecutive destination rows.
+//   bd     - forwarded to the 1-D transform and highbd_clip_pixel_add.
+void vp10_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t rows[16 * 16];
+  tran_low_t column[16], residual[16];
+  int r, c;
+
+  // Row pass.
+  for (r = 0; r < 16; ++r)
+    vp10_highbd_idct16_c(input + 16 * r, rows + 16 * r, bd);
+
+  // Column pass.
+  for (c = 0; c < 16; ++c) {
+    for (r = 0; r < 16; ++r)
+      column[r] = rows[r * 16 + c];
+    vp10_highbd_idct16_c(column, residual, bd);
+    for (r = 0; r < 16; ++r) {
+      uint16_t *const px = &dest[r * stride + c];
+      *px = highbd_clip_pixel_add(
+          *px, ROUND_POWER_OF_TWO(residual[r], 6), bd);
+    }
+  }
+}
+
+// 16-point 1-D inverse ADST for the high-bit-depth path.
+//   input  - 16 coefficients.
+//   output - 16 results; set to all zeros when every input is zero.
+//   bd     - forwarded to WRAPLOW and highbd_dct_const_round_shift.
+void vp10_highbd_iadst16_c(const tran_low_t *input,
+                           tran_low_t *output,
+                           int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+  // Inputs are loaded in permuted order: even x's take input[15], [13], ...
+  // (descending odd indices) and odd x's take input[0], [2], ... (ascending
+  // even indices).
+  tran_low_t x0 = input[15];
+  tran_low_t x1 = input[0];
+  tran_low_t x2 = input[13];
+  tran_low_t x3 = input[2];
+  tran_low_t x4 = input[11];
+  tran_low_t x5 = input[4];
+  tran_low_t x6 = input[9];
+  tran_low_t x7 = input[6];
+  tran_low_t x8 = input[7];
+  tran_low_t x9 = input[8];
+  tran_low_t x10 = input[5];
+  tran_low_t x11 = input[10];
+  tran_low_t x12 = input[3];
+  tran_low_t x13 = input[12];
+  tran_low_t x14 = input[1];
+  tran_low_t x15 = input[14];
+  (void) bd;
+
+  // Shortcut: an all-zero input yields an all-zero output.
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    memset(output, 0, 16 * sizeof(*output));
+    return;
+  }
+
+  // stage 1
+  // Rotate each (x[2k], x[2k+1]) pair, then butterfly s[k] with s[k+8].
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
+  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
+  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
+  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
+
+  // stage 2
+  // x0..x7 pass through unscaled, so their butterflies below need no
+  // rounding shift; x8..x15 are rotated first and are rounded afterwards.
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = WRAPLOW(s0 + s4, bd);
+  x1 = WRAPLOW(s1 + s5, bd);
+  x2 = WRAPLOW(s2 + s6, bd);
+  x3 = WRAPLOW(s3 + s7, bd);
+  x4 = WRAPLOW(s0 - s4, bd);
+  x5 = WRAPLOW(s1 - s5, bd);
+  x6 = WRAPLOW(s2 - s6, bd);
+  x7 = WRAPLOW(s3 - s7, bd);
+  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
+  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
+
+  // stage 3
+  // x0..x3 and x8..x11 pass through unscaled; x4..x7 and x12..x15 are
+  // rotated by the cospi_8_64 / cospi_24_64 pair and rounded afterwards.
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+  x0 = WRAPLOW(s0 + s2, bd);
+  x1 = WRAPLOW(s1 + s3, bd);
+  x2 = WRAPLOW(s0 - s2, bd);
+  x3 = WRAPLOW(s1 - s3, bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+  x8 = WRAPLOW(s8 + s10, bd);
+  x9 = WRAPLOW(s9 + s11, bd);
+  x10 = WRAPLOW(s8 - s10, bd);
+  x11 = WRAPLOW(s9 - s11, bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
+
+  // stage 4
+  // Only the pairs (2, 3), (6, 7), (10, 11) and (14, 15) are modified; the
+  // signs differ from pair to pair, so the lines are not interchangeable.
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (-x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (-x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
+
+  // Write the results in permuted order; outputs 1, 3, 13 and 15 are negated.
+  output[0] = WRAPLOW(x0, bd);
+  output[1] = WRAPLOW(-x8, bd);
+  output[2] = WRAPLOW(x12, bd);
+  output[3] = WRAPLOW(-x4, bd);
+  output[4] = WRAPLOW(x6, bd);
+  output[5] = WRAPLOW(x14, bd);
+  output[6] = WRAPLOW(x10, bd);
+  output[7] = WRAPLOW(x2, bd);
+  output[8] = WRAPLOW(x3, bd);
+  output[9] = WRAPLOW(x11, bd);
+  output[10] = WRAPLOW(x15, bd);
+  output[11] = WRAPLOW(x7, bd);
+  output[12] = WRAPLOW(x5, bd);
+  output[13] = WRAPLOW(-x13, bd);
+  output[14] = WRAPLOW(x9, bd);
+  output[15] = WRAPLOW(-x1, bd);
+}
+
+void vp10_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int bd) {
+  // Intermediate row-pass results; zero-initialized because only the first
+  // 4 rows are ever written below.
+  tran_low_t rows[16 * 16] = { 0 };
+  tran_low_t col_in[16], col_out[16];
+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);
+  int r, c;
+
+  // Row pass. All non-zero dct coefficients sit in the upper-left 4x4 area,
+  // so only the first 4 rows need to be transformed.
+  for (r = 0; r < 4; ++r)
+    vp10_highbd_idct16_c(input + 16 * r, rows + 16 * r, bd);
+
+  // Column pass: gather one column, transform it, then scale it down with
+  // ROUND_POWER_OF_TWO(.., 6) and add it in via highbd_clip_pixel_add().
+  for (c = 0; c < 16; ++c) {
+    for (r = 0; r < 16; ++r)
+      col_in[r] = rows[16 * r + c];
+    vp10_highbd_idct16_c(col_in, col_out, bd);
+
+    for (r = 0; r < 16; ++r) {
+      uint16_t *const px = &pixels[r * stride + c];
+      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6), bd);
+    }
+  }
+}
+
+void vp10_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bd) {
+  // Only input[0] is read: it is multiplied by cospi_16_64 twice, scaled with
+  // ROUND_POWER_OF_TWO(.., 6), and added to every pixel of the 16x16 block.
+  uint16_t *row = CONVERT_TO_SHORTPTR(dest8);
+  tran_high_t offset;
+  tran_low_t dc;
+  int x, y;
+
+  dc = WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  dc = WRAPLOW(highbd_dct_const_round_shift(dc * cospi_16_64, bd), bd);
+  offset = ROUND_POWER_OF_TWO(dc, 6);
+  for (y = 0; y < 16; ++y, row += stride)
+    for (x = 0; x < 16; ++x)
+      row[x] = highbd_clip_pixel_add(row[x], offset, bd);
+}
+
+static void highbd_idct32_c(const tran_low_t *input,
+                            tran_low_t *output, int bd) {
+  tran_low_t step1[32], step2[32];  // successive stages alternate between these
+  tran_high_t temp1, temp2;  // cospi-weighted pair terms ("mix") before rounding
+  (void) bd;  // NOTE(review): bd is used below, so this cast looks redundant
+
+  // stage 1: reorder even inputs into [0..15]; mix odd inputs into [16..31]
+  step1[0] = input[0];
+  step1[1] = input[16];
+  step1[2] = input[8];
+  step1[3] = input[24];
+  step1[4] = input[4];
+  step1[5] = input[20];
+  step1[6] = input[12];
+  step1[7] = input[28];
+  step1[8] = input[2];
+  step1[9] = input[18];
+  step1[10] = input[10];
+  step1[11] = input[26];
+  step1[12] = input[6];
+  step1[13] = input[22];
+  step1[14] = input[14];
+  step1[15] = input[30];
+
+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 2: copy [0..7]; mix pairs in [8..15]; add/sub pairs in [16..31]
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
+  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
+  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
+  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
+  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
+  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
+  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
+  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
+  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
+  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
+  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
+  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
+  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
+  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
+  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
+  step2[31] = WRAPLOW(step1[30] + step1[31], bd);
+
+  // stage 3: copy [0..3]; mix [4..7]; add/sub [8..15]; mix within [16..31]
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  // stage 4: mix [0..3] and [9,10,13,14]; add/sub [4..7] and [16..31]
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
+  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
+  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
+  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
+  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
+  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
+  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
+  step2[23] = WRAPLOW(step1[20] + step1[23], bd);
+
+  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
+  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
+  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
+  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
+  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
+  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
+  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
+  step2[31] = WRAPLOW(step1[28] + step1[31], bd);
+
+  // stage 5: add/sub [0..3] and [8..15]; mix [5,6], [18..21] and [26..29]
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6: add/sub [0..7] and [16..31]; mix [10..13] with cospi_16_64
+  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
+  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
+  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
+  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
+  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
+  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
+  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
+  step2[23] = WRAPLOW(step1[16] - step1[23], bd);
+
+  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
+  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
+  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
+  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
+  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
+  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
+  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
+  step2[31] = WRAPLOW(step1[24] + step1[31], bd);
+
+  // stage 7: add/sub [0..15]; mix [20..27] with cospi_16_64
+  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
+  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
+  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
+  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
+  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
+  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
+  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
+  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
+  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
+  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
+  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
+  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
+  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
+  step1[15] = WRAPLOW(step2[0] - step2[15], bd);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  step1[18] = step2[18];
+  step1[19] = step2[19];
+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[28] = step2[28];
+  step1[29] = step2[29];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // final stage: output[k], output[31 - k] = step1[k] +/- step1[31 - k]
+  output[0] = WRAPLOW(step1[0] + step1[31], bd);
+  output[1] = WRAPLOW(step1[1] + step1[30], bd);
+  output[2] = WRAPLOW(step1[2] + step1[29], bd);
+  output[3] = WRAPLOW(step1[3] + step1[28], bd);
+  output[4] = WRAPLOW(step1[4] + step1[27], bd);
+  output[5] = WRAPLOW(step1[5] + step1[26], bd);
+  output[6] = WRAPLOW(step1[6] + step1[25], bd);
+  output[7] = WRAPLOW(step1[7] + step1[24], bd);
+  output[8] = WRAPLOW(step1[8] + step1[23], bd);
+  output[9] = WRAPLOW(step1[9] + step1[22], bd);
+  output[10] = WRAPLOW(step1[10] + step1[21], bd);
+  output[11] = WRAPLOW(step1[11] + step1[20], bd);
+  output[12] = WRAPLOW(step1[12] + step1[19], bd);
+  output[13] = WRAPLOW(step1[13] + step1[18], bd);
+  output[14] = WRAPLOW(step1[14] + step1[17], bd);
+  output[15] = WRAPLOW(step1[15] + step1[16], bd);
+  output[16] = WRAPLOW(step1[15] - step1[16], bd);
+  output[17] = WRAPLOW(step1[14] - step1[17], bd);
+  output[18] = WRAPLOW(step1[13] - step1[18], bd);
+  output[19] = WRAPLOW(step1[12] - step1[19], bd);
+  output[20] = WRAPLOW(step1[11] - step1[20], bd);
+  output[21] = WRAPLOW(step1[10] - step1[21], bd);
+  output[22] = WRAPLOW(step1[9] - step1[22], bd);
+  output[23] = WRAPLOW(step1[8] - step1[23], bd);
+  output[24] = WRAPLOW(step1[7] - step1[24], bd);
+  output[25] = WRAPLOW(step1[6] - step1[25], bd);
+  output[26] = WRAPLOW(step1[5] - step1[26], bd);
+  output[27] = WRAPLOW(step1[4] - step1[27], bd);
+  output[28] = WRAPLOW(step1[3] - step1[28], bd);
+  output[29] = WRAPLOW(step1[2] - step1[29], bd);
+  output[30] = WRAPLOW(step1[1] - step1[30], bd);
+  output[31] = WRAPLOW(step1[0] - step1[31], bd);
+}
+
+void vp10_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+                                     int stride, int bd) {
+  // No initializer needed: the row pass below writes all 32 rows.
+  tran_low_t rows[32 * 32];
+  tran_low_t col_in[32], col_out[32];
+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);
+  int r, c;
+
+  // Row pass.
+  for (r = 0; r < 32; ++r) {
+    const tran_low_t *const src = input + 32 * r;
+    tran_low_t *const dst = rows + 32 * r;
+    tran_low_t any_set = 0;
+
+    // OR all 32 coefficients together: the result is non-zero exactly when
+    // at least one coefficient of this row is non-zero.
+    for (c = 0; c < 32; ++c)
+      any_set |= src[c];
+
+    // An all-zero row is zero-filled directly instead of being transformed.
+    if (any_set)
+      highbd_idct32_c(src, dst, bd);
+    else
+      memset(dst, 0, sizeof(tran_low_t) * 32);
+  }
+
+  // Column pass: gather one column, transform it, then scale it down with
+  // ROUND_POWER_OF_TWO(.., 6) and add it in via highbd_clip_pixel_add().
+  for (c = 0; c < 32; ++c) {
+    for (r = 0; r < 32; ++r)
+      col_in[r] = rows[32 * r + c];
+    highbd_idct32_c(col_in, col_out, bd);
+
+    for (r = 0; r < 32; ++r) {
+      uint16_t *const px = &pixels[r * stride + c];
+      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6), bd);
+    }
+  }
+}
+
+void vp10_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int bd) {
+  // Zero-initialized: the row pass below only fills the first 8 rows.
+  tran_low_t rows[32 * 32] = {0};
+  tran_low_t col_in[32], col_out[32];
+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);
+  int r, c;
+
+  // Row pass. Only the upper-left 8x8 has non-zero coefficients, so just the
+  // first 8 rows are transformed.
+  for (r = 0; r < 8; ++r)
+    highbd_idct32_c(input + 32 * r, rows + 32 * r, bd);
+
+  // Column pass: gather one column, transform it, then scale it down with
+  // ROUND_POWER_OF_TWO(.., 6) and add it in via highbd_clip_pixel_add().
+  for (c = 0; c < 32; ++c) {
+    for (r = 0; r < 32; ++r)
+      col_in[r] = rows[32 * r + c];
+    highbd_idct32_c(col_in, col_out, bd);
+
+    for (r = 0; r < 32; ++r) {
+      uint16_t *const px = &pixels[r * stride + c];
+      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6), bd);
+    }
+  }
+}
+
+void vp10_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bd) {
+  // Only input[0] is read: it is multiplied by cospi_16_64 twice, scaled with
+  // ROUND_POWER_OF_TWO(.., 6), and added to every pixel of the 32x32 block.
+  uint16_t *row = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t dc;
+  int offset;
+  int x, y;
+
+  dc = WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  dc = WRAPLOW(highbd_dct_const_round_shift(dc * cospi_16_64, bd), bd);
+  offset = ROUND_POWER_OF_TWO(dc, 6);
+
+  for (y = 0; y < 32; ++y, row += stride)
+    for (x = 0; x < 32; ++x)
+      row[x] = highbd_clip_pixel_add(row[x], offset, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vp10/common/vp10_inv_txfm.h b/libs/libvpx/vp10/common/vp10_inv_txfm.h
new file mode 100644
index 0000000000..52611acbd2
--- /dev/null
+++ b/libs/libvpx/vp10/common/vp10_inv_txfm.h
@@ -0,0 +1,122 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_INV_TXFM_H_
+#define VPX_DSP_INV_TXFM_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE tran_low_t check_range(tran_high_t input) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  // For valid VP9 input streams, intermediate stage coefficients should always
+  // stay within the range of a signed 16 bit integer. Coefficients can go out
+  // of this range for invalid/corrupt VP9 streams. However, strictly checking
+  // this range for every intermediate coefficient can be burdensome for a
+  // decoder, therefore the following assertions are only enabled when
+  // configured with --enable-coefficient-range-checking.
+  assert(INT16_MIN <= input);
+  assert(input <= INT16_MAX);
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+  return (tran_low_t)input;  // plain narrowing cast; unchecked unless enabled
+}
+
+// Applies ROUND_POWER_OF_TWO by DCT_CONST_BITS, then narrows via check_range().
+static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+  return check_range(ROUND_POWER_OF_TWO(input, DCT_CONST_BITS));
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t highbd_check_range(tran_high_t input,
+                                            int bd) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+  // stay within the ranges:
+  // - 8 bit: signed 16 bit integer
+  // - 10 bit: signed 18 bit integer
+  // - 12 bit: signed 20 bit integer
+  const int32_t int_max = (1 << (7 + bd)) - 1;  // max signed (8 + bd)-bit value
+  const int32_t int_min = -int_max - 1;  // matching minimum
+  assert(int_min <= input);
+  assert(input <= int_max);
+  (void) int_min;  // presumably avoids an unused warning if assert() is a no-op
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+  (void) bd;  // bd is otherwise unreferenced when range checking is off
+  return (tran_low_t)input;  // plain narrowing cast
+}
+
+// Applies ROUND_POWER_OF_TWO by DCT_CONST_BITS, then highbd_check_range().
+static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
+                                                      int bd) {
+  return highbd_check_range(ROUND_POWER_OF_TWO(input, DCT_CONST_BITS), bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_EMULATE_HARDWARE
+// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
+// non-normative method to handle overflows. A stream that causes
+// overflows in the inverse transform is considered invalid in VP9,
+// and a hardware implementer is free to choose any reasonable
+// method to handle overflows. However to aid in hardware
+// verification they can use a specific implementation of the
+// WRAPLOW() macro below that is identical to their intended
+// hardware implementation (and also use configure options to trigger
+// the C-implementation of the transform).
+//
+// The particular WRAPLOW implementation below performs strict
+// overflow wrapping to match common hardware implementations.
+// bd of 8 uses trans_low with 16bits, need to remove 16bits
+// bd of 10 uses trans_low with 18bits, need to remove 14bits
+// bd of 12 uses trans_low with 20bits, need to remove 12bits
+// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
+#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - (bd))) >> (24 - (bd)))
+#else
+#define WRAPLOW(x, bd) ((int32_t)(x))
+#endif  // CONFIG_EMULATE_HARDWARE
+
+void vp10_idct4_c(const tran_low_t *input, tran_low_t *output);
+void vp10_idct8_c(const tran_low_t *input, tran_low_t *output);
+void vp10_idct16_c(const tran_low_t *input, tran_low_t *output);
+void vp10_idct32_c(const tran_low_t *input, tran_low_t *output);
+void vp10_iadst4_c(const tran_low_t *input, tran_low_t *output);
+void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output);
+void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp10_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp10_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+void vp10_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vp10_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+                                             int bd) {
+  const tran_high_t wrapped = WRAPLOW(trans, bd);  // wrap the residual first
+  return clip_pixel_highbd(WRAPLOW(dest + wrapped, bd), bd);
+}
+#endif
+
+static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
+  const tran_high_t wrapped = WRAPLOW(trans, 8);  // wrap the residual first
+  return clip_pixel(WRAPLOW(dest + wrapped, 8));
+}
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // VPX_DSP_INV_TXFM_H_
diff --git a/libs/libvpx/vp10/common/vp10_rtcd.c b/libs/libvpx/vp10/common/vp10_rtcd.c
new file mode 100644
index 0000000000..36b294ae8a
--- /dev/null
+++ b/libs/libvpx/vp10/common/vp10_rtcd.c
@@ -0,0 +1,19 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vp10_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+void vp10_rtcd(void) {
+    // TODO(JBB): Remove this once, by ensuring that both the encoder and
+    // decoder setup functions are protected by once();
+    once(setup_rtcd_internal);
+}
diff --git a/libs/libvpx/vp10/common/vp10_rtcd_defs.pl b/libs/libvpx/vp10/common/vp10_rtcd_defs.pl
new file mode 100644
index 0000000000..9860baedfe
--- /dev/null
+++ b/libs/libvpx/vp10/common/vp10_rtcd_defs.pl
@@ -0,0 +1,656 @@
+sub vp10_common_forward_decls() {  # prints the C text below verbatim: includes, then struct/union forward decls
+print <<EOF
+/*
+ * VP10
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp10/common/common.h"
+#include "vp10/common/enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+forward_decls qw/vp10_common_forward_decls/;  # passes the sub's name to forward_decls (defined outside this file)
+
+# x86inc.asm has specific constraints, so the ISA tags that depend on it are
+# kept in variables of their own; that makes them easy to disable together.
+# Every tag starts out as '' (empty), which avoids tricky else conditions.
+($mmx_x86inc, $sse_x86inc, $sse2_x86inc, $ssse3_x86inc, $avx_x86inc,
+  $avx2_x86inc) = ('') x 6;
+($mmx_x86_64_x86inc, $sse_x86_64_x86inc, $sse2_x86_64_x86inc,
+  $ssse3_x86_64_x86inc, $avx_x86_64_x86inc, $avx2_x86_64_x86inc) = ('') x 6;
+
+if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
+  # CONFIG_USE_X86INC is "yes": each plain tag becomes the name of its ISA.
+  ($mmx_x86inc, $sse_x86inc, $sse2_x86inc) = ('mmx', 'sse', 'sse2');
+  ($ssse3_x86inc, $avx_x86inc, $avx2_x86inc) = ('ssse3', 'avx', 'avx2');
+
+  if ($opts{arch} eq "x86_64") {
+    # The *_x86_64_x86inc tags are only filled in when the arch option is
+    # x86_64; on any other arch they stay empty.
+    ($mmx_x86_64_x86inc, $sse_x86_64_x86inc, $sse2_x86_64_x86inc) =
+      ('mmx', 'sse', 'sse2');
+    ($ssse3_x86_64_x86inc, $avx_x86_64_x86inc, $avx2_x86_64_x86inc) =
+      ('ssse3', 'avx', 'avx2');
+  }
+}
+
+# functions that are 64 bit only: ISA name when arch is x86_64, '' otherwise.
+{
+  my $is_x86_64 = ($opts{arch} eq "x86_64");
+  $mmx_x86_64   = $is_x86_64 ? 'mmx'   : '';
+  $sse2_x86_64  = $is_x86_64 ? 'sse2'  : '';
+  $ssse3_x86_64 = $is_x86_64 ? 'ssse3' : '';
+  $avx_x86_64   = $is_x86_64 ? 'avx'   : '';
+  $avx2_x86_64  = $is_x86_64 ? 'avx2'  : '';
+}
+
+#
+# post proc
+#
+if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {  # registered only when CONFIG_VP9_POSTPROC is "yes"
+  add_proto qw/void vp10_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit";
+  specialize qw/vp10_mbpost_proc_down sse2/;
+  $vp10_mbpost_proc_down_sse2 = 'vp10_mbpost_proc_down_xmm';  # presumably the symbol used for the sse2 version; consumer is outside this file
+
+  add_proto qw/void vp10_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit";
+  specialize qw/vp10_mbpost_proc_across_ip sse2/;
+  $vp10_mbpost_proc_across_ip_sse2 = 'vp10_mbpost_proc_across_ip_xmm';
+
+  add_proto qw/void vp10_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
+  specialize qw/vp10_post_proc_down_and_across sse2/;
+  $vp10_post_proc_down_and_across_sse2 = 'vp10_post_proc_down_and_across_xmm';
+
+  add_proto qw/void vp10_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+  specialize qw/vp10_plane_add_noise sse2/;
+  $vp10_plane_add_noise_sse2 = 'vp10_plane_add_noise_wmt';
+
+  add_proto qw/void vp10_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+  specialize qw/vp10_filter_by_weight16x16 sse2 msa/;
+
+  add_proto qw/void vp10_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+  specialize qw/vp10_filter_by_weight8x8 sse2 msa/;
+}
+
+#
+# dct
+#
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  # Note: as optimized versions of these functions are added, we need to add a check to ensure
+  # that when CONFIG_EMULATE_HARDWARE is on, they default to the C versions only.
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht4x4_16_add/;
+
+    add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht8x8_64_add/;
+
+    add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp10_iht16x16_256_add/;
+
+    add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct4x4/;
+
+    add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct4x4_1/;
+
+    add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct8x8/;
+
+    add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct8x8_1/;
+
+    add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct16x16/;
+
+    add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct16x16_1/;
+
+    add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32/;
+
+    add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32_rd/;
+
+    add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32_1/;
+
+    add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct4x4/;
+
+    add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct8x8/;
+
+    add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct8x8_1/;
+
+    add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct16x16/;
+
+    add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct16x16_1/;
+
+    add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct32x32/;
+
+    add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct32x32_rd/;
+
+    add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct32x32_1/;
+  } else {
+    add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht4x4_16_add sse2/;
+
+    add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht8x8_64_add sse2/;
+
+    add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp10_iht16x16_256_add/;
+
+    add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct4x4 sse2/;
+
+    add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct4x4_1 sse2/;
+
+    add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct8x8 sse2/;
+
+    add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct8x8_1 sse2/;
+
+    add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct16x16 sse2/;
+
+    add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct16x16_1 sse2/;
+
+    add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32 sse2/;
+
+    add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32_rd sse2/;
+
+    add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32_1 sse2/;
+
+    add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct4x4 sse2/;
+
+    add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct8x8 sse2/;
+
+    add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct8x8_1/;
+
+    add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct16x16 sse2/;
+
+    add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct16x16_1/;
+
+    add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct32x32 sse2/;
+
+    add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct32x32_rd sse2/;
+
+    add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_highbd_fdct32x32_1/;
+  }
+} else {
+  # Force C versions if CONFIG_EMULATE_HARDWARE is "yes"
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht4x4_16_add/;
+
+    add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht8x8_64_add/;
+
+    add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp10_iht16x16_256_add/;
+
+    add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct4x4/;
+
+    add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct4x4_1/;
+
+    add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct8x8/;
+
+    add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct8x8_1/;
+
+    add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct16x16/;
+
+    add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct16x16_1/;
+
+    add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32/;
+
+    add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32_rd/;
+
+    add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32_1/;
+  } else {
+    add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht4x4_16_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp10_iht8x8_64_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp10_iht16x16_256_add sse2 dspr2 msa/;
+
+    add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct4x4 sse2/;
+
+    add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct4x4_1 sse2/;
+
+    add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct8x8 sse2/;
+
+    add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct8x8_1 sse2/;
+
+    add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct16x16 sse2/;
+
+    add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct16x16_1 sse2/;
+
+    add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32 sse2/;
+
+    add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32_rd sse2/;
+
+    add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp10_fdct32x32_1 sse2/;
+  }
+}
+
+# High bitdepth functions
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #
+  # Sub Pixel Filters
+  #
+  add_proto qw/void vp10_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp10_highbd_convolve_copy/;
+
+  add_proto qw/void vp10_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp10_highbd_convolve_avg/;
+
+  add_proto qw/void vp10_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp10_highbd_convolve8/, "$sse2_x86_64";
+
+  add_proto qw/void vp10_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp10_highbd_convolve8_horiz/, "$sse2_x86_64";
+
+  add_proto qw/void vp10_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp10_highbd_convolve8_vert/, "$sse2_x86_64";
+
+  add_proto qw/void vp10_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp10_highbd_convolve8_avg/, "$sse2_x86_64";
+
+  add_proto qw/void vp10_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp10_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
+
+  add_proto qw/void vp10_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp10_highbd_convolve8_avg_vert/, "$sse2_x86_64";
+
+  #
+  # post proc
+  #
+  if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+    add_proto qw/void vp10_highbd_mbpost_proc_down/, "uint16_t *dst, int pitch, int rows, int cols, int flimit";
+    specialize qw/vp10_highbd_mbpost_proc_down/;
+
+    add_proto qw/void vp10_highbd_mbpost_proc_across_ip/, "uint16_t *src, int pitch, int rows, int cols, int flimit";
+    specialize qw/vp10_highbd_mbpost_proc_across_ip/;
+
+    add_proto qw/void vp10_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
+    specialize qw/vp10_highbd_post_proc_down_and_across/;
+
+    add_proto qw/void vp10_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+    specialize qw/vp10_highbd_plane_add_noise/;
+  }
+
+  #
+  # dct
+  #
+  # Note: as optimized versions of these functions are added, we need to add a check to ensure
+  # that when CONFIG_EMULATE_HARDWARE is on, they default to the C versions only.
+  add_proto qw/void vp10_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  specialize qw/vp10_highbd_iht4x4_16_add/;
+
+  add_proto qw/void vp10_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  specialize qw/vp10_highbd_iht8x8_64_add/;
+
+  add_proto qw/void vp10_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+  specialize qw/vp10_highbd_iht16x16_256_add/;
+}
+
+#
+# Encoder functions below this point.
+#
+if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
+
+# ENCODEMB INVOKE
+
+#
+# Denoiser
+#
+if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
+  add_proto qw/int vp10_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude";
+  specialize qw/vp10_denoiser_filter sse2/;
+}
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+# The transform coefficients are held in 32-bit values, so the
+# assembler code for vp10_block_error can no longer be used.
+  add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp10_block_error/;
+
+  add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp10_quantize_fp/;
+
+  add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp10_quantize_fp_32x32/;
+
+  add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp10_fdct8x8_quant/;
+} else {
+  add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp10_block_error avx2 msa/, "$sse2_x86inc";
+
+  add_proto qw/int64_t vp10_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+  specialize qw/vp10_block_error_fp neon/, "$sse2_x86inc";
+
+  add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp10_quantize_fp neon sse2/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp10_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp10_fdct8x8_quant sse2 ssse3 neon/;
+}
+
+# fdct functions
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht4x4 sse2/;
+
+  add_proto qw/void vp10_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht8x8 sse2/;
+
+  add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht16x16 sse2/;
+
+  add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp10_fwht4x4/, "$mmx_x86inc";
+} else {
+  add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht4x4 sse2 msa/;
+
+  add_proto qw/void vp10_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht8x8 sse2 msa/;
+
+  add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht16x16 sse2 msa/;
+
+  add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
+}
+
+# Inverse transform
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  # Note: as optimized versions of these functions are added, we need to add a check to ensure
+  # that when CONFIG_EMULATE_HARDWARE is on, they default to the C versions only.
+  add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp10_idct4x4_1_add/;
+
+  add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp10_idct4x4_16_add/;
+
+  add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp10_idct8x8_1_add/;
+
+  add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp10_idct8x8_64_add/;
+
+  add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp10_idct8x8_12_add/;
+
+  add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp10_idct16x16_1_add/;
+
+  add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp10_idct16x16_256_add/;
+
+  add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp10_idct16x16_10_add/;
+
+  add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp10_idct32x32_1024_add/;
+
+  add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp10_idct32x32_34_add/;
+
+  add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp10_idct32x32_1_add/;
+
+  add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp10_iwht4x4_1_add/;
+
+  add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp10_iwht4x4_16_add/;
+
+  add_proto qw/void vp10_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp10_highbd_idct4x4_1_add/;
+
+  add_proto qw/void vp10_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp10_highbd_idct8x8_1_add/;
+
+  add_proto qw/void vp10_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp10_highbd_idct16x16_1_add/;
+
+  add_proto qw/void vp10_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp10_highbd_idct32x32_1024_add/;
+
+  add_proto qw/void vp10_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp10_highbd_idct32x32_34_add/;
+
+  add_proto qw/void vp10_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp10_highbd_idct32x32_1_add/;
+
+  add_proto qw/void vp10_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp10_highbd_iwht4x4_1_add/;
+
+  add_proto qw/void vp10_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vp10_highbd_iwht4x4_16_add/;
+
+  # Force C versions if CONFIG_EMULATE_HARDWARE is "yes"
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vp10_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp10_highbd_idct4x4_16_add/;
+
+    add_proto qw/void vp10_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp10_highbd_idct8x8_64_add/;
+
+    add_proto qw/void vp10_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp10_highbd_idct8x8_10_add/;
+
+    add_proto qw/void vp10_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp10_highbd_idct16x16_256_add/;
+
+    add_proto qw/void vp10_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp10_highbd_idct16x16_10_add/;
+  } else {
+    add_proto qw/void vp10_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp10_highbd_idct4x4_16_add sse2/;
+
+    add_proto qw/void vp10_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp10_highbd_idct8x8_64_add sse2/;
+
+    add_proto qw/void vp10_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp10_highbd_idct8x8_10_add sse2/;
+
+    add_proto qw/void vp10_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp10_highbd_idct16x16_256_add sse2/;
+
+    add_proto qw/void vp10_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vp10_highbd_idct16x16_10_add sse2/;
+  }  # CONFIG_EMULATE_HARDWARE
+} else {
+  # Force C versions if CONFIG_EMULATE_HARDWARE is "yes"
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct4x4_1_add/;
+
+    add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct4x4_16_add/;
+
+    add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct8x8_1_add/;
+
+    add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct8x8_64_add/;
+
+    add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct8x8_12_add/;
+
+    add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct16x16_1_add/;
+
+    add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct16x16_256_add/;
+
+    add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct16x16_10_add/;
+
+    add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct32x32_1024_add/;
+
+    add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct32x32_34_add/;
+
+    add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct32x32_1_add/;
+
+    add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_iwht4x4_1_add/;
+
+    add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_iwht4x4_16_add/;
+  } else {
+    add_proto qw/void vp10_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct4x4_1_add sse2/;
+
+    add_proto qw/void vp10_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct4x4_16_add sse2/;
+
+    add_proto qw/void vp10_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct8x8_1_add sse2/;
+
+    add_proto qw/void vp10_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct8x8_64_add sse2/;
+
+    add_proto qw/void vp10_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct8x8_12_add sse2/;
+
+    add_proto qw/void vp10_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct16x16_1_add sse2/;
+
+    add_proto qw/void vp10_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct16x16_256_add sse2/;
+
+    add_proto qw/void vp10_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct16x16_10_add sse2/;
+
+    add_proto qw/void vp10_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct32x32_1024_add sse2/;
+
+    add_proto qw/void vp10_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct32x32_34_add sse2/;
+
+    add_proto qw/void vp10_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_idct32x32_1_add sse2/;
+
+    add_proto qw/void vp10_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_iwht4x4_1_add/;
+
+    add_proto qw/void vp10_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vp10_iwht4x4_16_add/;
+  }  # CONFIG_EMULATE_HARDWARE
+}  # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Motion search
+#
+add_proto qw/int vp10_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv";
+specialize qw/vp10_full_search_sad sse3 sse4_1/;
+$vp10_full_search_sad_sse3   = 'vp10_full_search_sadx3';  # presumably the symbol used for the sse3 version; consumer is outside this file
+$vp10_full_search_sad_sse4_1 = 'vp10_full_search_sadx8';  # likewise for sse4_1
+
+add_proto qw/int vp10_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
+specialize qw/vp10_diamond_search_sad/;
+
+add_proto qw/int vp10_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
+specialize qw/vp10_full_range_search/;
+
+add_proto qw/void vp10_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+specialize qw/vp10_temporal_filter_apply sse2 msa/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+
+  # ENCODEMB INVOKE
+
+  add_proto qw/int64_t vp10_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+  specialize qw/vp10_highbd_block_error sse2/;
+
+  add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp10_highbd_quantize_fp/;
+
+  add_proto qw/void vp10_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp10_highbd_quantize_fp_32x32/;
+
+  # fdct functions
+  add_proto qw/void vp10_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_highbd_fht4x4/;
+
+  add_proto qw/void vp10_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_highbd_fht8x8/;
+
+  add_proto qw/void vp10_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_highbd_fht16x16/;
+
+  add_proto qw/void vp10_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp10_highbd_fwht4x4/;
+
+  add_proto qw/void vp10_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+  specialize qw/vp10_highbd_temporal_filter_apply/;
+
+}
+# End vp10 high bitdepth encoder functions
+
+}
+# end encoder functions
+1;
diff --git a/libs/libvpx/vp10/common/x86/idct_intrin_sse2.c b/libs/libvpx/vp10/common/x86/idct_intrin_sse2.c
new file mode 100644
index 0000000000..a2c674b802
--- /dev/null
+++ b/libs/libvpx/vp10/common/x86/idct_intrin_sse2.c
@@ -0,0 +1,180 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
+
+// Inverse 4x4 hybrid transform with reconstruction (SSE2).
+// Loads the coefficient block from input, runs the two 1-D inverse passes
+// selected by tx_type, rounds the result as (x + 8) >> 4, adds it to the 4x4
+// pixel block at dest (rows are stride bytes apart) and writes the sum back
+// with unsigned 8-bit saturation. Each dest row is accessed as one int.
+//   tx_type: 0 = DCT_DCT, 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST.
+void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+                             int tx_type) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i eight = _mm_set1_epi16(8);
+  uint8_t *const row1 = dest + stride;
+  uint8_t *const row2 = dest + stride * 2;
+  uint8_t *const row3 = dest + stride * 3;
+  __m128i in[2];
+  __m128i pix01, pix23;
+
+  in[0] = load_input_data(input);
+  in[1] = load_input_data(input + 8);
+
+  if (tx_type < 0 || tx_type > 3) {
+    assert(0);  // unknown transform type: neither pass is run
+  } else {
+    // First pass: ADST for DCT_ADST / ADST_ADST (bit 1 set), DCT otherwise.
+    if (tx_type & 2) {
+      iadst4_sse2(in);
+    } else {
+      idct4_sse2(in);
+    }
+
+    // Second pass: ADST for ADST_DCT / ADST_ADST (bit 0 set), DCT otherwise.
+    if (tx_type & 1) {
+      iadst4_sse2(in);
+    } else {
+      idct4_sse2(in);
+    }
+  }
+
+  // Final round and shift: (x + 8) >> 4.
+  in[0] = _mm_srai_epi16(_mm_add_epi16(in[0], eight), 4);
+  in[1] = _mm_srai_epi16(_mm_add_epi16(in[1], eight), 4);
+
+  // Gather the four 4-byte pixel rows: rows 0/1 in pix01, rows 2/3 in pix23.
+  pix01 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)dest),
+                             _mm_cvtsi32_si128(*(const int *)row1));
+  pix23 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)row2),
+                             _mm_cvtsi32_si128(*(const int *)row3));
+
+  // Widen the pixels to 16 bits and add the residual.
+  pix01 = _mm_add_epi16(_mm_unpacklo_epi8(pix01, zero), in[0]);
+  pix23 = _mm_add_epi16(_mm_unpacklo_epi8(pix23, zero), in[1]);
+
+  // Saturate back to bytes; the 16 bytes now hold rows 0..3 in order.
+  pix01 = _mm_packus_epi16(pix01, pix23);
+
+  // Store one 4-byte row at a time; each byte shift moves the next row into
+  // the low dword of the register.
+  *(int *)dest = _mm_cvtsi128_si32(pix01);
+  pix01 = _mm_srli_si128(pix01, 4);
+  *(int *)row1 = _mm_cvtsi128_si32(pix01);
+  pix01 = _mm_srli_si128(pix01, 4);
+  *(int *)row2 = _mm_cvtsi128_si32(pix01);
+  pix01 = _mm_srli_si128(pix01, 4);
+  *(int *)row3 = _mm_cvtsi128_si32(pix01);
+}
+
+// Inverse 8x8 hybrid transform with reconstruction (SSE2).
+// Loads eight coefficient vectors, 8 coefficients apart, from input, runs the
+// two 1-D inverse passes selected by tx_type, rounds the result as
+// (x + 16) >> 5 using a saturating add, and hands every vector together with
+// the matching row of dest (rows are stride bytes apart) to the
+// RECON_AND_STORE macro, which is defined outside this file.
+//   tx_type: 0 = DCT_DCT, 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST.
+void vp10_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+                             int tx_type) {
+  // NOTE(review): zero is never named in this function body; presumably the
+  // RECON_AND_STORE macro refers to it by name - confirm before renaming or
+  // removing it. It is therefore kept under exactly this name and declared
+  // before the first macro use.
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+  __m128i in[8];
+
+  // Load input data.
+  in[0] = load_input_data(input + 8 * 0);
+  in[1] = load_input_data(input + 8 * 1);
+  in[2] = load_input_data(input + 8 * 2);
+  in[3] = load_input_data(input + 8 * 3);
+  in[4] = load_input_data(input + 8 * 4);
+  in[5] = load_input_data(input + 8 * 5);
+  in[6] = load_input_data(input + 8 * 6);
+  in[7] = load_input_data(input + 8 * 7);
+
+  if (tx_type < 0 || tx_type > 3) {
+    assert(0);  // unknown transform type: neither pass is run
+  } else {
+    // First pass: ADST for DCT_ADST / ADST_ADST (bit 1 set), DCT otherwise.
+    if (tx_type & 2) {
+      iadst8_sse2(in);
+    } else {
+      idct8_sse2(in);
+    }
+
+    // Second pass: ADST for ADST_DCT / ADST_ADST (bit 0 set), DCT otherwise.
+    if (tx_type & 1) {
+      iadst8_sse2(in);
+    } else {
+      idct8_sse2(in);
+    }
+  }
+
+  // Final rounding and shift: (x + 16) >> 5; the add saturates at the int16
+  // limits instead of wrapping.
+  in[0] = _mm_srai_epi16(_mm_adds_epi16(in[0], final_rounding), 5);
+  in[1] = _mm_srai_epi16(_mm_adds_epi16(in[1], final_rounding), 5);
+  in[2] = _mm_srai_epi16(_mm_adds_epi16(in[2], final_rounding), 5);
+  in[3] = _mm_srai_epi16(_mm_adds_epi16(in[3], final_rounding), 5);
+  in[4] = _mm_srai_epi16(_mm_adds_epi16(in[4], final_rounding), 5);
+  in[5] = _mm_srai_epi16(_mm_adds_epi16(in[5], final_rounding), 5);
+  in[6] = _mm_srai_epi16(_mm_adds_epi16(in[6], final_rounding), 5);
+  in[7] = _mm_srai_epi16(_mm_adds_epi16(in[7], final_rounding), 5);
+
+  // Reconstruction: one RECON_AND_STORE per row of dest.
+  RECON_AND_STORE(dest + 0 * stride, in[0]);
+  RECON_AND_STORE(dest + 1 * stride, in[1]);
+  RECON_AND_STORE(dest + 2 * stride, in[2]);
+  RECON_AND_STORE(dest + 3 * stride, in[3]);
+  RECON_AND_STORE(dest + 4 * stride, in[4]);
+  RECON_AND_STORE(dest + 5 * stride, in[5]);
+  RECON_AND_STORE(dest + 6 * stride, in[6]);
+  RECON_AND_STORE(dest + 7 * stride, in[7]);
+}
+
+// Inverse 16x16 hybrid transform with reconstruction (SSE2).
+// The coefficients are loaded as two buffers (input and input + 8), both 1-D
+// inverse passes selected by tx_type operate on the pair, and the buffers are
+// then written to dest and dest + 8 with the given stride.
+//   tx_type: 0 = DCT_DCT, 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST.
+void vp10_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+                                int stride, int tx_type) {
+  __m128i half0[16], half1[16];
+
+  load_buffer_8x16(input, half0);
+  load_buffer_8x16(input + 8, half1);
+
+  if (tx_type < 0 || tx_type > 3) {
+    assert(0);  // unknown transform type: neither pass is run
+  } else {
+    // First pass: ADST for DCT_ADST / ADST_ADST (bit 1 set), DCT otherwise.
+    if (tx_type & 2) {
+      iadst16_sse2(half0, half1);
+    } else {
+      idct16_sse2(half0, half1);
+    }
+
+    // Second pass: ADST for ADST_DCT / ADST_ADST (bit 0 set), DCT otherwise.
+    if (tx_type & 1) {
+      iadst16_sse2(half0, half1);
+    } else {
+      idct16_sse2(half0, half1);
+    }
+  }
+
+  // The second buffer goes 8 bytes to the right of the first one in dest.
+  write_buffer_8x16(dest, half0, stride);
+  write_buffer_8x16(dest + 8, half1, stride);
+}
diff --git a/libs/libvpx/vp10/common/x86/mfqe_sse2.asm b/libs/libvpx/vp10/common/x86/mfqe_sse2.asm
new file mode 100644
index 0000000000..e714d06dbf
--- /dev/null
+++ b/libs/libvpx/vp10/common/x86/mfqe_sse2.asm
@@ -0,0 +1,287 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+;  This file is a duplicate of mfqe_sse2.asm in VP8.
+;  TODO(jackychen): Find a way to fix the duplicate.
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp10_filter_by_weight16x16_sse2 - blend a 16x16 block of src into dst
+;(
+;    unsigned char *src,        ; source rows, read with movdqa (16-byte aligned)
+;    int            src_stride, ; bytes between source rows
+;    unsigned char *dst,        ; blended in place, movdqa (16-byte aligned)
+;    int            dst_stride, ; bytes between destination rows
+;    int            src_weight  ; weight of src; dst gets tMFQE - src_weight
+;)
+global sym(vp10_filter_by_weight16x16_sse2) PRIVATE
+sym(vp10_filter_by_weight16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]       ; 8 words of 0x10
+    psubw       xmm1, xmm0                  ; dst_weight = 16 - src_weight
+
+    mov         rax, arg(0)                 ; src
+    movsxd      rsi, dword arg(1)           ; src_stride: int, sign-extend it
+    mov         rdx, arg(2)                 ; dst
+    movsxd      rdi, dword arg(3)           ; dst_stride: int, sign-extend it
+
+    mov         rcx, 16                     ; loop count
+    pxor        xmm6, xmm6                  ; zero, for byte -> word unpacking
+
+.combine:       ; per row: dst = (src*src_weight + dst*dst_weight + 8) >> 4
+    movdqa      xmm2, [rax]                 ; 16 src pixels
+    movdqa      xmm4, [rdx]                 ; 16 dst pixels
+    add         rax, rsi                    ; next src row
+
+    ; src * src_weight
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm6                  ; low 8 pixels as words
+    punpckhbw   xmm3, xmm6                  ; high 8 pixels as words
+    pmullw      xmm2, xmm0
+    pmullw      xmm3, xmm0
+
+    ; dst * dst_weight
+    movdqa      xmm5, xmm4
+    punpcklbw   xmm4, xmm6
+    punpckhbw   xmm5, xmm6
+    pmullw      xmm4, xmm1
+    pmullw      xmm5, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+    paddw       xmm2, [GLOBAL(tMFQE_round)] ; + 8
+    paddw       xmm3, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+    psrlw       xmm3, 4
+
+    packuswb    xmm2, xmm3                  ; back to 16 bytes
+    movdqa      [rdx], xmm2
+    add         rdx, rdi                    ; next dst row
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp10_filter_by_weight8x8_sse2 - blend an 8x8 block of src into dst
+;(
+;    unsigned char *src,        ; source rows, 8 bytes each (movq loads)
+;    int            src_stride, ; bytes between source rows
+;    unsigned char *dst,        ; blended in place, 8 bytes per row
+;    int            dst_stride, ; bytes between destination rows
+;    int            src_weight  ; weight of src; dst gets tMFQE - src_weight
+;)
+global sym(vp10_filter_by_weight8x8_sse2) PRIVATE
+sym(vp10_filter_by_weight8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]       ; 8 words of 0x10
+    psubw       xmm1, xmm0                  ; dst_weight = 16 - src_weight
+
+    mov         rax, arg(0)                 ; src
+    movsxd      rsi, dword arg(1)           ; src_stride: int, sign-extend it
+    mov         rdx, arg(2)                 ; dst
+    movsxd      rdi, dword arg(3)           ; dst_stride: int, sign-extend it
+
+    mov         rcx, 8                      ; loop count
+    pxor        xmm4, xmm4                  ; zero, for byte -> word unpacking
+
+.combine:       ; per row: dst = (src*src_weight + dst*dst_weight + 8) >> 4
+    movq        xmm2, [rax]                 ; 8 src pixels
+    movq        xmm3, [rdx]                 ; 8 dst pixels
+    add         rax, rsi                    ; next src row
+
+    ; src * src_weight
+    punpcklbw   xmm2, xmm4
+    pmullw      xmm2, xmm0
+
+    ; dst * dst_weight
+    punpcklbw   xmm3, xmm4
+    pmullw      xmm3, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm3
+    paddw       xmm2, [GLOBAL(tMFQE_round)] ; + 8
+    psrlw       xmm2, 4
+
+    packuswb    xmm2, xmm4                  ; back to 8 bytes (high half zero)
+    movq        [rdx], xmm2
+    add         rdx, rdi                    ; next dst row
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp10_variance_and_sad_16x16_sse2 | arg
+;(
+;    unsigned char *src1,          0
+;    int            stride1,       1
+;    unsigned char *src2,          2
+;    int            stride2,       3
+;    unsigned int  *variance,      4   out: scaled variance of src2
+;    unsigned int  *sad,           5   out: (SAD(src1, src2) + 128) >> 8
+;)
+global sym(vp10_variance_and_sad_16x16_sse2) PRIVATE
+sym(vp10_variance_and_sad_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rax,        arg(0)          ; src1
+    movsxd      rcx,        dword arg(1)    ; stride1: int, sign-extend it
+    mov         rdx,        arg(2)          ; src2
+    movsxd      rdi,        dword arg(3)    ; stride2: int, sign-extend it
+
+    mov         rsi,        16              ; block height
+
+    ; Prep accumulator registers
+    pxor        xmm3, xmm3                  ; SAD
+    pxor        xmm4, xmm4                  ; sum of src2
+    pxor        xmm5, xmm5                  ; sum of src2^2
+
+    ; Because we're working with the actual output frames
+    ; we can't depend on any kind of data alignment.
+.accumulate:
+    movdqu      xmm0, [rax]                 ; src1 (unaligned-safe load)
+    movdqu      xmm1, [rdx]                 ; src2 (unaligned-safe load)
+    add         rax, rcx                    ; src1 + stride1
+    add         rdx, rdi                    ; src2 + stride2
+
+    ; SAD(src1, src2)
+    psadbw      xmm0, xmm1
+    paddusw     xmm3, xmm0
+
+    ; SUM(src2)
+    pxor        xmm2, xmm2
+    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
+    paddusw     xmm4, xmm2
+
+    ; pmaddubsw would be ideal if it took two unsigned values. instead,
+    ; it expects a signed and an unsigned value. so instead we zero extend
+    ; and operate on words.
+    pxor        xmm2, xmm2
+    movdqa      xmm0, xmm1
+    punpcklbw   xmm0, xmm2
+    punpckhbw   xmm1, xmm2
+    pmaddwd     xmm0, xmm0                  ; pairwise sums of squares (dwords)
+    pmaddwd     xmm1, xmm1
+    paddd       xmm5, xmm0
+    paddd       xmm5, xmm1
+
+    sub         rsi,        1
+    jnz         .accumulate
+
+    ; phaddd only operates on adjacent double words.
+    ; Finalize SAD and store
+    movdqa      xmm0, xmm3
+    psrldq      xmm0, 8                     ; bring the high-half partial sum down
+    paddusw     xmm0, xmm3
+    paddd       xmm0, [GLOBAL(t128)]        ; + 128 (rounding)
+    psrld       xmm0, 8
+
+    mov         rax,  arg(5)
+    movd        [rax], xmm0                 ; *sad
+
+    ; Accumulate sum of src2
+    movdqa      xmm0, xmm4
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm4
+    ; Square src2. Ignore high value
+    pmuludq     xmm0, xmm0
+    psrld       xmm0, 8                     ; sum^2 >> 8
+
+    ; phaddw could be used to sum adjacent values but we want
+    ; all the values summed. promote to doubles, accumulate,
+    ; shift and sum
+    pxor        xmm2, xmm2
+    movdqa      xmm1, xmm5
+    punpckldq   xmm1, xmm2
+    punpckhdq   xmm5, xmm2
+    paddd       xmm1, xmm5
+    movdqa      xmm2, xmm1
+    psrldq      xmm1, 8
+    paddd       xmm1, xmm2                  ; low dword = sum of src2^2
+
+    psubd       xmm1, xmm0                  ; sum(src2^2) - (sum(src2)^2 >> 8)
+
+    ; (variance + 128) >> 8
+    paddd       xmm1, [GLOBAL(t128)]
+    psrld       xmm1, 8
+    mov         rax,  arg(4)
+
+    movd        [rax], xmm1                 ; *variance
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+t128:   ; 16-byte constant with value 128: the rounding term added before ">> 8"
+%ifndef __NASM_VER__
+    ddq 128     ; assemblers other than NASM: one 128-bit datum
+%elif CONFIG_BIG_ENDIAN
+    dq  0, 128  ; NASM: the same value spelled as two 64-bit halves
+%else
+    dq  128, 0  ; NASM: low half first
+%endif
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+    times 8 dw 0x10     ; total weight (16) in every word lane
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+    times 8 dw 0x08     ; rounding term (8) in every word lane
diff --git a/libs/libvpx/vp10/common/x86/postproc_sse2.asm b/libs/libvpx/vp10/common/x86/postproc_sse2.asm
new file mode 100644
index 0000000000..d5f8e927b1
--- /dev/null
+++ b/libs/libvpx/vp10/common/x86/postproc_sse2.asm
@@ -0,0 +1,694 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp10_post_proc_down_and_across_xmm - 5-tap vertical, then horizontal blur
+;(
+;    unsigned char *src_ptr,         ; source rows (rows -2..+2 are read)
+;    unsigned char *dst_ptr,         ; output rows; across pass runs in place
+;    int src_pixels_per_line,        ; source pitch in bytes
+;    int dst_pixels_per_line,        ; destination pitch in bytes
+;    int rows,                       ; number of rows to process
+;    int cols,                       ; columns, handled in groups of 8
+;    int flimit                      ; keep the pixel if a neighbour differs more
+;)
+global sym(vp10_post_proc_down_and_across_xmm) PRIVATE
+sym(vp10_post_proc_down_and_across_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    ALIGN_STACK 16, rax
+    ; move the global rd onto the stack, since we don't have enough registers
+    ; to do PIC addressing
+    movdqa      xmm0, [GLOBAL(rd42)]
+    sub         rsp, 16
+    movdqa      [rsp], xmm0
+%define RD42 [rsp]
+%else
+%define RD42 [GLOBAL(rd42)]
+%endif
+
+
+        movd        xmm2,       dword ptr arg(6) ;flimit
+        punpcklwd   xmm2,       xmm2
+        punpckldq   xmm2,       xmm2
+        punpcklqdq  xmm2,       xmm2             ; xmm2 = flimit in all 8 words
+
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(1) ;dst_ptr
+
+        movsxd      rcx,        DWORD PTR arg(4) ;rows
+        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line (source pitch)
+        pxor        xmm0,       xmm0              ; xmm0 = 0, used to unpack bytes
+
+.nextrow:
+
+        xor         rdx,        rdx       ; clear out rdx for use as loop counter
+.nextcol:
+        movq        xmm3,       QWORD PTR [rsi]         ; r0 p0..p7 (bytes)
+        punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p7 (words)
+        movdqa      xmm1,       xmm3                    ; xmm1 = r0 p0..p7
+        psllw       xmm3,       2                       ; xmm3 = 4 * r0
+
+        movq        xmm5,       QWORD PTR [rsi + rax]   ; r1 p0..p7 (bytes)
+        punpcklbw   xmm5,       xmm0                    ; xmm5 = r1 p0..p7 (words)
+        paddusw     xmm3,       xmm5                    ; xmm3 += r1
+
+        ; thresholding
+        movdqa      xmm7,       xmm1                    ; xmm7 = r0
+        psubusw     xmm7,       xmm5                    ; xmm7 = sat(r0 - r1)
+        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r1 - r0)
+        paddusw     xmm7,       xmm5                    ; xmm7 = abs(r0 - r1)
+        pcmpgtw     xmm7,       xmm2                    ; mask: abs > flimit
+
+        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; r2 p0..p7 (bytes)
+        punpcklbw   xmm5,       xmm0                    ; xmm5 = r2 p0..p7 (words)
+        paddusw     xmm3,       xmm5                    ; xmm3 += r2
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; xmm6 = r0
+        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r2)
+        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r2 - r0)
+        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r2)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+
+        neg         rax                                 ; rax = -pitch (rows above)
+        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; r-2 p0..p7 (bytes)
+        punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 p0..p7 (words)
+        paddusw     xmm3,       xmm5                    ; xmm3 += r-2
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; xmm6 = r0
+        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r-2)
+        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r-2 - r0)
+        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r-2)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+        movq        xmm4,       QWORD PTR [rsi+rax]     ; r-1 p0..p7 (bytes)
+        punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 p0..p7 (words)
+        paddusw     xmm3,       xmm4                    ; xmm3 += r-1
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; xmm6 = r0
+        psubusw     xmm6,       xmm4                    ; xmm6 = sat(r0 - r-1)
+        psubusw     xmm4,       xmm1                    ; xmm4 = sat(r-1 - r0)
+        paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 - r-1)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+
+        paddusw     xmm3,       RD42                    ; xmm3 += round value
+        psraw       xmm3,       3                       ; xmm3 /= 8
+
+        pand        xmm1,       xmm7                    ; xmm1 select vals > thresh from source
+        pandn       xmm7,       xmm3                    ; xmm7 select vals < thresh from blurred result
+        paddusw     xmm1,       xmm7                    ; combination
+
+        packuswb    xmm1,       xmm0                    ; pack to bytes
+        movq        QWORD PTR [rdi], xmm1             ; write 8 pixels to dst
+
+        neg         rax                   ; pitch is positive
+        add         rsi,        8
+        add         rdi,        8
+
+        add         rdx,        8
+        cmp         edx,        dword arg(5) ;cols
+
+        jl          .nextcol
+
+        ; done with the all cols, start the across filtering in place
+        sub         rsi,        rdx
+        sub         rdi,        rdx
+
+        xor         rdx,        rdx
+        movq        mm0,        QWORD PTR [rdi-8]; mm0 = pending store, starts as the 8 bytes left of the row
+
+.acrossnextcol:
+        movq        xmm7,       QWORD PTR [rdi +rdx -2]   ; p-2..p5
+        movd        xmm4,       DWORD PTR [rdi +rdx +6]   ; p6..p9
+
+        pslldq      xmm4,       8
+        por         xmm4,       xmm7              ; xmm4 = p-2..p9 (12 bytes)
+
+        movdqa      xmm3,       xmm4
+        psrldq      xmm3,       2
+        punpcklbw   xmm3,       xmm0              ; xmm3 = p0..p7 (words)
+        movdqa      xmm1,       xmm3              ; xmm1 = p0..p7
+        psllw       xmm3,       2                 ; xmm3 = 4 * p0..p7
+
+
+        movdqa      xmm5,       xmm4
+        psrldq      xmm5,       3
+        punpcklbw   xmm5,       xmm0              ; xmm5 = p1..p8
+        paddusw     xmm3,       xmm5              ; xmm3 += p1..p8
+
+        ; thresholding
+        movdqa      xmm7,       xmm1              ; xmm7 = p0..p7
+        psubusw     xmm7,       xmm5              ; xmm7 = sat(p0..p7 - p1..p8)
+        psubusw     xmm5,       xmm1              ; xmm5 = sat(p1..p8 - p0..p7)
+        paddusw     xmm7,       xmm5              ; xmm7 = abs(p0..p7 - p1..p8)
+        pcmpgtw     xmm7,       xmm2
+
+        movdqa      xmm5,       xmm4
+        psrldq      xmm5,       4
+        punpcklbw   xmm5,       xmm0              ; xmm5 = p2..p9
+        paddusw     xmm3,       xmm5              ; xmm3 += p2..p9
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
+        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p2..p9)
+        psubusw     xmm5,       xmm1              ; xmm5 = sat(p2..p9 - p0..p7)
+        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p2..p9)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+
+        movdqa      xmm5,       xmm4              ; xmm5 = p-2..p9 (bytes)
+        punpcklbw   xmm5,       xmm0              ; xmm5 = p-2..p5 (words)
+        paddusw     xmm3,       xmm5              ; xmm3 += p-2..p5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
+        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p-2..p5)
+        psubusw     xmm5,       xmm1              ; xmm5 = sat(p-2..p5 - p0..p7)
+        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p-2..p5)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+        psrldq      xmm4,       1                   ; xmm4 = p-1..p9 (bytes)
+        punpcklbw   xmm4,       xmm0              ; xmm4 = p-1..p6 (words)
+        paddusw     xmm3,       xmm4              ; xmm3 += p-1..p6
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
+        psubusw     xmm6,       xmm4              ; xmm6 = sat(p0..p7 - p-1..p6)
+        psubusw     xmm4,       xmm1              ; xmm4 = sat(p-1..p6 - p0..p7)
+        paddusw     xmm6,       xmm4              ; xmm6 = abs(p0..p7 - p-1..p6)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+        paddusw     xmm3,       RD42              ; xmm3 += round value
+        psraw       xmm3,       3                 ; xmm3 /= 8
+
+        pand        xmm1,       xmm7              ; xmm1 select vals > thresh from source
+        pandn       xmm7,       xmm3              ; xmm7 select vals < thresh from blurred result
+        paddusw     xmm1,       xmm7              ; combination
+
+        packuswb    xmm1,       xmm0              ; pack to bytes
+        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store the previous group of 8 bytes
+        movdq2q     mm0,        xmm1              ; hold this group until the next pass
+
+        add         rdx,        8
+        cmp         edx,        dword arg(5) ;cols
+        jl          .acrossnextcol;
+
+        ; last 8 pixels
+        movq        QWORD PTR [rdi+rdx-8],  mm0
+
+        ; done with this row
+        add         rsi,rax               ; next line
+        mov         eax, dword arg(3) ;dst_pixels_per_line (destination pitch)
+        add         rdi,rax               ; next destination
+        mov         eax, dword arg(2) ;src_pixels_per_line (reload source pitch)
+
+        dec         rcx                   ; decrement count
+        jnz         .nextrow              ; next row
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    add rsp,16
+    pop rsp
+%endif
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef RD42
+
+
+;void vp10_mbpost_proc_down_xmm(unsigned char *dst,
+;                            int pitch, int rows, int cols,int flimit)
+extern sym(vp10_rv)     ; table of words, indexed by (row & 127), added before >> 4
+global sym(vp10_mbpost_proc_down_xmm) PRIVATE
+sym(vp10_mbpost_proc_down_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 128+16
+
+    ; unsigned char d[16][8] at [rsp]: ring of 16 filtered rows, 8 bytes each
+    ; create flimit4 (flimit in four dwords) at [rsp+128]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp+128], eax
+    mov         [rsp+128+4], eax
+    mov         [rsp+128+8], eax
+    mov         [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+    lea         r8,       [GLOBAL(sym(vp10_rv))]
+%endif
+
+    ;rows +=8; (stores lag 8 rows behind, see the end of .loop_row)
+    add         dword arg(2), 8
+
+    ;for(c=0; c<cols; c+=8)
+.loop_col:
+            mov         rsi,        arg(0) ; s
+            pxor        xmm0,       xmm0        ; zero, for unpacking
+
+            movsxd      rax,        dword ptr arg(1) ;pitch       ;
+            neg         rax                                     ; rax = -pitch
+
+            lea         rsi,        [rsi + rax*8];              ; rsi = s - pitch*8
+            neg         rax                                     ; rax = +pitch
+
+            pxor        xmm5,       xmm5        ; running sum (8 words)
+            pxor        xmm6,       xmm6        ; running sum of squares, cols 0..3
+
+            pxor        xmm7,       xmm7        ; running sum of squares, cols 4..7
+            mov         rdi,        rsi
+
+            mov         rcx,        15          ; prime the window with 15 rows
+
+.loop_initvar:
+            movq        xmm1,       QWORD PTR [rdi];
+            punpcklbw   xmm1,       xmm0        ;
+
+            paddw       xmm5,       xmm1        ; sum += row
+            pmullw      xmm1,       xmm1        ; row^2
+
+            movdqa      xmm2,       xmm1        ;
+            punpcklwd   xmm1,       xmm0        ;
+
+            punpckhwd   xmm2,       xmm0        ;
+            paddd       xmm6,       xmm1        ;
+
+            paddd       xmm7,       xmm2        ;
+            lea         rdi,        [rdi+rax]   ; next row
+
+            dec         rcx
+            jne         .loop_initvar
+            ;save the var and sum
+            xor         rdx,        rdx         ; rdx = row counter r
+.loop_row:
+            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
+            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
+
+            punpcklbw   xmm1,       xmm0
+            punpcklbw   xmm2,       xmm0
+
+            paddw       xmm5,       xmm2        ; sum += entering row
+            psubw       xmm5,       xmm1        ; sum -= leaving row
+
+            pmullw      xmm2,       xmm2
+            movdqa      xmm4,       xmm2
+
+            punpcklwd   xmm2,       xmm0
+            punpckhwd   xmm4,       xmm0
+
+            paddd       xmm6,       xmm2        ; sumsq += entering row^2
+            paddd       xmm7,       xmm4
+
+            pmullw      xmm1,       xmm1
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm0
+            psubd       xmm6,       xmm1        ; sumsq -= leaving row^2
+
+            punpckhwd   xmm2,       xmm0
+            psubd       xmm7,       xmm2
+
+            movdqa      xmm3,       xmm6
+            pslld       xmm3,       4
+
+            psubd       xmm3,       xmm6        ; xmm3 = 15 * sumsq (cols 0..3)
+            movdqa      xmm1,       xmm5
+
+            movdqa      xmm4,       xmm5
+            pmullw      xmm1,       xmm1        ; low 16 bits of sum*sum
+
+            pmulhw      xmm4,       xmm4        ; high 16 bits of sum*sum
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm4        ; sum*sum as dwords, cols 0..3
+            punpckhwd   xmm2,       xmm4        ; sum*sum as dwords, cols 4..7
+
+            movdqa      xmm4,       xmm7
+            pslld       xmm4,       4
+
+            psubd       xmm4,       xmm7        ; xmm4 = 15 * sumsq (cols 4..7)
+
+            psubd       xmm3,       xmm1
+            psubd       xmm4,       xmm2
+
+            psubd       xmm3,       flimit4
+            psubd       xmm4,       flimit4
+
+            psrad       xmm3,       31          ; all ones where
+            psrad       xmm4,       31          ; 15*sumsq - sum^2 < flimit
+
+            packssdw    xmm3,       xmm4
+            packsswb    xmm3,       xmm0        ; per-byte filter mask
+
+            movq        xmm1,       QWORD PTR [rsi+rax*8]   ; current row r
+
+            movq        xmm2,       xmm1        ; keep the unfiltered pixels
+            punpcklbw   xmm1,       xmm0
+
+            paddw       xmm1,       xmm5        ; pixel + window sum
+            mov         rcx,        rdx
+
+            and         rcx,        127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+            push        rax
+            lea         rax,        [GLOBAL(sym(vp10_rv))]
+            movdqu      xmm4,       [rax + rcx*2] ;vp10_rv[rcx*2]
+            pop         rax
+%elif ABI_IS_32BIT=0
+            movdqu      xmm4,       [r8 + rcx*2] ;vp10_rv[rcx*2]
+%else
+            movdqu      xmm4,       [sym(vp10_rv) + rcx*2]
+%endif
+
+            paddw       xmm1,       xmm4
+            ;paddw     xmm1,       eight8s
+            psraw       xmm1,       4
+
+            packuswb    xmm1,       xmm0
+            pand        xmm1,       xmm3        ; filtered value where mask set
+
+            pandn       xmm3,       xmm2        ; original pixel elsewhere
+            por         xmm1,       xmm3
+
+            and         rcx,        15
+            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
+
+            mov         rcx,        rdx
+            sub         rcx,        8
+
+            and         rcx,        15
+            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
+
+            movq        [rsi],      mm0         ; write back row r-8
+            lea         rsi,        [rsi+rax]
+
+            lea         rdi,        [rdi+rax]
+            add         rdx,        1
+
+            cmp         edx,        dword arg(2) ;rows
+            jl          .loop_row
+
+        ; s += 8, done at pointer width: a dword add on the stored argument
+        ; would lose the carry into the upper half of a 64-bit pointer.
+        mov         rcx,        arg(0)          ; rcx is dead here
+        add         rcx,        8
+        mov         arg(0),     rcx ; s += 8
+        sub         dword arg(3), 8 ; cols -= 8
+        cmp         dword arg(3), 0
+        jg          .loop_col
+
+    add         rsp, 128+16
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
+
+;void vp10_mbpost_proc_across_ip_xmm(unsigned char *src,
+;                                int pitch, int rows, int cols,int flimit)
+global sym(vp10_mbpost_proc_across_ip_xmm) PRIVATE
+sym(vp10_mbpost_proc_across_ip_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; In-place row filter: a pixel whose 15-tap window has 15*sumsq - sum*sum < flimit becomes (s + sum + 8) >> 4.
+    ALIGN_STACK 16, rax
+    sub         rsp, 16             ; 16 bytes of scratch that hold flimit4
+    ; arg(0)=s, arg(1)=pitch, arg(2)=rows, arg(3)=cols, arg(4)=flimit; the arg(0) and arg(2) slots are updated in place
+    ; create flimit4 at [rsp]: flimit replicated into four dwords
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp], eax
+    mov         [rsp+4], eax
+    mov         [rsp+8], eax
+    mov         [rsp+12], eax
+%define flimit4 [rsp]
+
+    ; one trip through .ip_row_loop per row; arg(0) is advanced by pitch after each trip
+    ;for(r=0;r<rows;r++)
+.ip_row_loop:
+        ; prime the window: sum (ecx) and sumsq (edx) over the 15 pixels s[-8..6]
+        xor         rdx,    rdx ;sumsq=0;
+        xor         rcx,    rcx ;sum=0;
+        mov         rsi,    arg(0); s
+        mov         rdi,    -8
+.ip_var_loop:
+        ;for(i=-8;i<=6;i++)
+        ;{
+        ;    sumsq += s[i]*s[i];
+        ;    sum   += s[i];
+        ;}
+        movzx       eax, byte [rsi+rdi]
+        add         ecx, eax            ; sum += s[i]
+        mul         al                  ; ax = s[i]*s[i]; upper half of eax is still zero from movzx
+        add         edx, eax            ; sumsq += s[i]*s[i]
+        add         rdi, 1
+        cmp         rdi, 6
+        jle         .ip_var_loop
+
+            ; lane 0 of xmm7/xmm6 = initial sumsq/sum; movd zeroes the upper lanes
+            ;mov         rax,    sumsq
+            ;movd        xmm7,   rax
+            movd        xmm7,   edx
+
+            ;mov         rax,    sum
+            ;movd        xmm6,   rax
+            movd        xmm6,   ecx
+
+            mov         rsi,    arg(0) ;s
+            xor         rcx,    rcx     ; c = 0 (column offset within the row)
+
+            movsxd      rdx,    dword arg(3) ;cols
+            add         rdx,    8       ; loop bound is cols+8 because the stores below lag 8 pixels behind c
+            pxor        mm0,    mm0     ; mm0/mm1: two-deep queue of pending outputs, zero at row start
+            pxor        mm1,    mm1
+
+            pxor        xmm0,   xmm0    ; all-zero register used by the unpack/pack steps
+.nextcol4:
+            ; each pass handles columns c..c+3; the window slides by adding s[c+7+k] and dropping s[c-8+k]
+            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
+            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
+
+            punpcklbw   xmm1,   xmm0                    ; expanding
+            punpcklbw   xmm2,   xmm0                    ; expanding
+
+            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
+            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
+
+            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
+            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
+
+            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
+            pmaddwd     xmm1,   xmm2                    ; (new+old)*(new-old) = new^2 - old^2 for each column
+
+            paddd       xmm6,   xmm2                    ; lane 0: sum   += delta for column c
+            paddd       xmm7,   xmm1                    ; lane 0: sumsq += delta for column c
+
+            pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 (total for column c) to all lanes
+            pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 (total for column c) to all lanes
+
+            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
+            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
+
+            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 (sumsq deltas)
+            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 (sum deltas)
+
+            paddd       xmm6,   xmm4
+            paddd       xmm7,   xmm3
+
+            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 (sumsq deltas)
+            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 (sum deltas)
+
+            paddd       xmm7,   xmm3
+            paddd       xmm6,   xmm4
+
+            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   0000  10--5 (sumsq deltas)
+            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   0000  10--5 (sum deltas)
+
+            paddd       xmm7,   xmm3
+            paddd       xmm6,   xmm4
+            ; lane k of xmm6/xmm7 now holds sum/sumsq over s[c+k-7..c+k+7]
+            movdqa      xmm3,   xmm6
+            pmaddwd     xmm3,   xmm3                    ; sum*sum (sum of 15 bytes fits in the low word)
+
+            movdqa      xmm5,   xmm7
+            pslld       xmm5,   4                       ; 16*sumsq
+
+            psubd       xmm5,   xmm7                    ; 15*sumsq
+            psubd       xmm5,   xmm3                    ; 15*sumsq - sum*sum
+
+            psubd       xmm5,   flimit4
+            psrad       xmm5,   31                      ; sign mask: all ones where 15*sumsq - sum*sum < flimit
+
+            packssdw    xmm5,   xmm0                    ; narrow the mask to one byte per pixel
+            packsswb    xmm5,   xmm0
+
+            movd        xmm1,   DWORD PTR [rsi+rcx]     ; the four pixels s[c..c+3]
+            movq        xmm2,   xmm1                    ; keep an unfiltered copy
+
+            punpcklbw   xmm1,   xmm0
+            punpcklwd   xmm1,   xmm0
+
+            paddd       xmm1,   xmm6                    ; s + sum
+            paddd       xmm1,   [GLOBAL(four8s)]        ; + 8 (rounding term)
+
+            psrad       xmm1,   4                       ; (s + sum + 8) >> 4
+            packssdw    xmm1,   xmm0
+
+            packuswb    xmm1,   xmm0
+            pand        xmm1,   xmm5                    ; filtered value where the mask is set
+
+            pandn       xmm5,   xmm2                    ; original pixel where the mask is clear
+            por         xmm5,   xmm1
+            ; mm0 is the output from two passes ago; it is zero for the first two passes, so s[-8..-1] is overwritten with zeros
+            movd        [rsi+rcx-8],  mm0
+            movq        mm0,    mm1
+
+            movdq2q     mm1,    xmm5
+            psrldq      xmm7,   12                      ; carry lane 3 (latest running total) down to lane 0
+
+            psrldq      xmm6,   12                      ; same carry for the running sum
+            add         rcx,    4
+
+            cmp         rcx,    rdx
+            jl          .nextcol4
+
+        ;s+=pitch;
+        movsxd rax, dword arg(1)
+        add    arg(0), rax
+
+        sub dword arg(2), 1 ;rows-=1
+        cmp dword arg(2), 0
+        jg .ip_row_loop
+    ; NOTE(review): mm0/mm1 are used above and no emms is issued here; presumably the caller clears MMX state -- confirm
+    add         rsp, 16             ; release the flimit4 scratch
+    pop         rsp                 ; presumably restores the rsp saved by ALIGN_STACK -- confirm against the macro
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
+
+;void vp10_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
+;                            unsigned char blackclamp[16],
+;                            unsigned char whiteclamp[16],
+;                            unsigned char bothclamp[16],
+;                            unsigned int width, unsigned int height, int pitch)
+global sym(vp10_plane_add_noise_wmt) PRIVATE
+sym(vp10_plane_add_noise_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; For each row: clamp the pixels, then add noise bytes read from noise + (rand() & 0xff), 16 bytes at a time.
+.addnoise_loop:
+    call sym(LIBVPX_RAND) WRT_PLT
+    mov     rcx, arg(1) ;noise
+    and     rax, 0xff           ; random offset 0..255 into the noise buffer
+    add     rcx, rax            ; rcx = noise + offset
+
+    ; we rely on the fact that the clamping vectors are stored contiguously
+    ; in black/white/both order. Note that we have to reload this here because
+    ; rdx could be trashed by rand()
+    mov     rdx, arg(2) ; blackclamp
+    ; whiteclamp/bothclamp are addressed as [rdx+16]/[rdx+32]; arg(3) and arg(4) are never read
+
+            mov     rdi, rcx            ; rdi = noise source for this row
+            movsxd  rcx, dword arg(5) ;[Width]
+            mov     rsi, arg(0) ;Pos
+            xor         rax,rax         ; byte offset within the row
+            ; do-while over 16-byte groups: every row is processed in whole multiples of 16 bytes
+.addnoise_nextset:
+            movdqu      xmm1,[rsi+rax]         ; get the source
+            ; the clamp tables are memory operands of non-VEX SSE2 ops, which require 16-byte alignment
+            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
+            paddusb     xmm1, [rdx+32] ;bothclamp
+            psubusb     xmm1, [rdx+16] ;whiteclamp
+
+            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
+            paddb       xmm1,xmm2              ; add it in (wrapping byte add)
+            movdqu      [rsi+rax],xmm1         ; store the result
+
+            add         rax,16                 ; move to the next 16 bytes of the row
+
+            cmp         rax, rcx
+            jl          .addnoise_nextset
+
+    movsxd  rax, dword arg(7) ; Pitch
+    add     arg(0), rax ; Start += Pitch
+    sub     dword arg(6), 1   ; Height -= 1
+    jg      .addnoise_loop    ; flags come from the sub above
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+rd42:
+    times 8 dw 0x04         ; eight words of 4; not referenced by the two routines directly above
+four8s:
+    times 4 dd 8            ; four dwords of 8: rounding term added before the >> 4 in vp10_mbpost_proc_across_ip_xmm
diff --git a/libs/libvpx/vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h b/libs/libvpx/vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h
new file mode 100644
index 0000000000..2490973e34
--- /dev/null
+++ b/libs/libvpx/vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h
@@ -0,0 +1,3154 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/vp10_fwd_txfm.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+// TODO(jingning) The high bit-depth version needs re-work for performance.
+// The current SSE2 implementation also cross-references the static
+// functions in the C implementation file.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16  // saturating 16-bit add
+#define SUB_EPI16 _mm_subs_epi16  // saturating 16-bit subtract
+#if FDCT32x32_HIGH_PRECISION
+// C fallback for the second (row) pass: runs vp10_fdct32 on each column of
+// intermediate[32 * 32] and stores (x + 1 + (x < 0)) >> 2 row-major in out.
+void vp10_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
+  int col, k;
+  for (col = 0; col < 32; ++col) {
+    tran_high_t src[32], dst[32];
+    for (k = 0; k < 32; ++k) src[k] = intermediate[k * 32 + col];
+    vp10_fdct32(src, dst, 0);
+    for (k = 0; k < 32; ++k)
+      out[col * 32 + k] = (tran_low_t)((dst[k] + 1 + (dst[k] < 0)) >> 2);
+  }
+}
+  #define HIGH_FDCT32x32_2D_C vp10_highbd_fdct32x32_c  // pass 0 fallback
+  #define HIGH_FDCT32x32_2D_ROWS_C vp10_fdct32x32_rows_c  // pass 1 fallback
+#else  // !FDCT32x32_HIGH_PRECISION
+// Row-pass C fallback used when FDCT32x32_HIGH_PRECISION is 0: runs
+// vp10_fdct32(.., 1) on each column of intermediate[32 * 32], no rescaling.
+void vp10_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
+  int col, k;
+  for (col = 0; col < 32; ++col) {
+    tran_high_t src[32], dst[32];
+    for (k = 0; k < 32; ++k) src[k] = intermediate[k * 32 + col];
+    vp10_fdct32(src, dst, 1);
+    for (k = 0; k < 32; ++k) out[col * 32 + k] = (tran_low_t)dst[k];
+  }
+}
+  #define HIGH_FDCT32x32_2D_C vp10_highbd_fdct32x32_rd_c  // pass 0 fallback
+  #define HIGH_FDCT32x32_2D_ROWS_C vp10_fdct32x32_rd_rows_c  // pass 1 fallback
+#endif  // FDCT32x32_HIGH_PRECISION
+#else  // !DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_add_epi16  // wrapping 16-bit add
+#define SUB_EPI16 _mm_sub_epi16  // wrapping 16-bit subtract
+#endif  // DCT_HIGH_BIT_DEPTH
+
+
+void FDCT32x32_2D(const int16_t *input,
+                  tran_low_t *output_org, int stride) {
+  // Calculate pre-multiplied strides
+  const int str1 = stride;
+  const int str2 = 2 * stride;
+  const int str3 = 2 * stride + str1;
+  // We need an intermediate buffer between passes.
+  DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64,   cospi_24_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64,  cospi_8_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64,  cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64,  cospi_12_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64,   cospi_28_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64,  cospi_4_64);
+  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64,  cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64,  cospi_18_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64,  cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64,   cospi_26_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64,  cospi_6_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64,  cospi_22_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64,  cospi_14_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64,   cospi_30_64);
+  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64,  cospi_1_64);
+  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64,  cospi_17_64);
+  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64,  cospi_9_64);
+  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64,   cospi_25_64);
+  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64,  cospi_7_64);
+  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64,   cospi_23_64);
+  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64,  cospi_15_64);
+  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64,   cospi_31_64);
+  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64,  cospi_5_64);
+  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64,  cospi_21_64);
+  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64,  cospi_13_64);
+  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64,   cospi_29_64);
+  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64,  cospi_3_64);
+  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64,  cospi_19_64);
+  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64,  cospi_11_64);
+  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64,   cospi_27_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i kOne  = _mm_set1_epi16(1);
+  // Do the two transform/transpose passes
+  int pass;
+#if DCT_HIGH_BIT_DEPTH
+  int overflow;
+#endif
+  for (pass = 0; pass < 2; ++pass) {
+    // We process eight columns (transposed rows in second pass) at a time.
+    int column_start;
+    for (column_start = 0; column_start < 32; column_start += 8) {
+      __m128i step1[32];
+      __m128i step2[32];
+      __m128i step3[32];
+      __m128i out[32];
+      // Stage 1
+      // Note: even though all the loads below are aligned, using the aligned
+      //       intrinsic make the code slightly slower.
+      if (0 == pass) {
+        const int16_t *in  = &input[column_start];
+        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
+        // Note: the next four blocks could be in a loop. That would help the
+        //       instruction cache but is actually slower.
+        {
+          const int16_t *ina =  in +  0 * str1;
+          const int16_t *inb =  in + 31 * str1;
+          __m128i *step1a = &step1[ 0];
+          __m128i *step1b = &step1[31];
+          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
+          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
+          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
+          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
+          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
+          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
+          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
+          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
+          step1a[ 0] = _mm_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm_add_epi16(ina3, inb3);
+          step1b[-3] = _mm_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in +  4 * str1;
+          const int16_t *inb =  in + 27 * str1;
+          __m128i *step1a = &step1[ 4];
+          __m128i *step1b = &step1[27];
+          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
+          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
+          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
+          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
+          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
+          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
+          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
+          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
+          step1a[ 0] = _mm_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm_add_epi16(ina3, inb3);
+          step1b[-3] = _mm_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in +  8 * str1;
+          const int16_t *inb =  in + 23 * str1;
+          __m128i *step1a = &step1[ 8];
+          __m128i *step1b = &step1[23];
+          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
+          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
+          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
+          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
+          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
+          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
+          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
+          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
+          step1a[ 0] = _mm_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm_add_epi16(ina3, inb3);
+          step1b[-3] = _mm_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in + 12 * str1;
+          const int16_t *inb =  in + 19 * str1;
+          __m128i *step1a = &step1[12];
+          __m128i *step1b = &step1[19];
+          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
+          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
+          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
+          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
+          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
+          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
+          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
+          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
+          step1a[ 0] = _mm_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm_add_epi16(ina3, inb3);
+          step1b[-3] = _mm_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+        }
+      } else {
+        int16_t *in = &intermediate[column_start];
+        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
+        // Note: using the same approach as above to have common offset is
+        //       counter-productive as all offsets can be calculated at compile
+        //       time.
+        // Note: the next four blocks could be in a loop. That would help the
+        //       instruction cache but is actually slower.
+        {
+          __m128i in00  = _mm_loadu_si128((const __m128i *)(in +  0 * 32));
+          __m128i in01  = _mm_loadu_si128((const __m128i *)(in +  1 * 32));
+          __m128i in02  = _mm_loadu_si128((const __m128i *)(in +  2 * 32));
+          __m128i in03  = _mm_loadu_si128((const __m128i *)(in +  3 * 32));
+          __m128i in28  = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
+          __m128i in29  = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
+          __m128i in30  = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
+          __m128i in31  = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
+          step1[0] = ADD_EPI16(in00, in31);
+          step1[1] = ADD_EPI16(in01, in30);
+          step1[2] = ADD_EPI16(in02, in29);
+          step1[3] = ADD_EPI16(in03, in28);
+          step1[28] = SUB_EPI16(in03, in28);
+          step1[29] = SUB_EPI16(in02, in29);
+          step1[30] = SUB_EPI16(in01, in30);
+          step1[31] = SUB_EPI16(in00, in31);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
+                                             &step1[3], &step1[28], &step1[29],
+                                             &step1[30], &step1[31]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          __m128i in04  = _mm_loadu_si128((const __m128i *)(in +  4 * 32));
+          __m128i in05  = _mm_loadu_si128((const __m128i *)(in +  5 * 32));
+          __m128i in06  = _mm_loadu_si128((const __m128i *)(in +  6 * 32));
+          __m128i in07  = _mm_loadu_si128((const __m128i *)(in +  7 * 32));
+          __m128i in24  = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
+          __m128i in25  = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
+          __m128i in26  = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
+          __m128i in27  = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
+          step1[4] = ADD_EPI16(in04, in27);
+          step1[5] = ADD_EPI16(in05, in26);
+          step1[6] = ADD_EPI16(in06, in25);
+          step1[7] = ADD_EPI16(in07, in24);
+          step1[24] = SUB_EPI16(in07, in24);
+          step1[25] = SUB_EPI16(in06, in25);
+          step1[26] = SUB_EPI16(in05, in26);
+          step1[27] = SUB_EPI16(in04, in27);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
+                                             &step1[7], &step1[24], &step1[25],
+                                             &step1[26], &step1[27]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          __m128i in08  = _mm_loadu_si128((const __m128i *)(in +  8 * 32));
+          __m128i in09  = _mm_loadu_si128((const __m128i *)(in +  9 * 32));
+          __m128i in10  = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
+          __m128i in11  = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
+          __m128i in20  = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
+          __m128i in21  = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
+          __m128i in22  = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
+          __m128i in23  = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
+          step1[8] = ADD_EPI16(in08, in23);
+          step1[9] = ADD_EPI16(in09, in22);
+          step1[10] = ADD_EPI16(in10, in21);
+          step1[11] = ADD_EPI16(in11, in20);
+          step1[20] = SUB_EPI16(in11, in20);
+          step1[21] = SUB_EPI16(in10, in21);
+          step1[22] = SUB_EPI16(in09, in22);
+          step1[23] = SUB_EPI16(in08, in23);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
+                                             &step1[11], &step1[20], &step1[21],
+                                             &step1[22], &step1[23]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          __m128i in12  = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
+          __m128i in13  = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
+          __m128i in14  = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
+          __m128i in15  = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
+          __m128i in16  = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
+          __m128i in17  = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
+          __m128i in18  = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
+          __m128i in19  = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
+          step1[12] = ADD_EPI16(in12, in19);
+          step1[13] = ADD_EPI16(in13, in18);
+          step1[14] = ADD_EPI16(in14, in17);
+          step1[15] = ADD_EPI16(in15, in16);
+          step1[16] = SUB_EPI16(in15, in16);
+          step1[17] = SUB_EPI16(in14, in17);
+          step1[18] = SUB_EPI16(in13, in18);
+          step1[19] = SUB_EPI16(in12, in19);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
+                                             &step1[15], &step1[16], &step1[17],
+                                             &step1[18], &step1[19]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+      }
+      // Stage 2
+      {
+        step2[0] = ADD_EPI16(step1[0], step1[15]);
+        step2[1] = ADD_EPI16(step1[1], step1[14]);
+        step2[2] = ADD_EPI16(step1[2], step1[13]);
+        step2[3] = ADD_EPI16(step1[3], step1[12]);
+        step2[4] = ADD_EPI16(step1[4], step1[11]);
+        step2[5] = ADD_EPI16(step1[5], step1[10]);
+        step2[6] = ADD_EPI16(step1[6], step1[ 9]);
+        step2[7] = ADD_EPI16(step1[7], step1[ 8]);
+        step2[8] = SUB_EPI16(step1[7], step1[ 8]);
+        step2[9] = SUB_EPI16(step1[6], step1[ 9]);
+        step2[10] = SUB_EPI16(step1[5], step1[10]);
+        step2[11] = SUB_EPI16(step1[4], step1[11]);
+        step2[12] = SUB_EPI16(step1[3], step1[12]);
+        step2[13] = SUB_EPI16(step1[2], step1[13]);
+        step2[14] = SUB_EPI16(step1[1], step1[14]);
+        step2[15] = SUB_EPI16(step1[0], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step2[0], &step2[1], &step2[2], &step2[3],
+            &step2[4], &step2[5], &step2[6], &step2[7],
+            &step2[8], &step2[9], &step2[10], &step2[11],
+            &step2[12], &step2[13], &step2[14], &step2[15]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
+        const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
+        const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
+        const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
+        const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
+        const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
+        const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
+        const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
+        const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
+        const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
+        const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
+        const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
+        const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
+        const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
+        const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
+        const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
+        const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
+        const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
+        const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
+        const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
+        const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
+        const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
+        const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
+        const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
+        const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
+        const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
+        const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
+        const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
+        const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
+        const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
+        const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
+        const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
+        const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
+        const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
+        const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
+        const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
+        const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
+        const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
+        const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
+        // Combine
+        step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
+        step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
+        step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
+        step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
+        step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
+        step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
+        step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
+        step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
+                                           &step2[23], &step2[24], &step2[25],
+                                           &step2[26], &step2[27]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+
+#if !FDCT32x32_HIGH_PRECISION
+      // dump the magnitude by half, hence the intermediate values are within
+      // the range of 16 bits.
+      if (1 == pass) {
+        __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero);
+        __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero);
+        __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero);
+        __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero);
+        __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero);
+        __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero);
+        __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero);
+        __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero);
+        __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
+        __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
+        __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
+        __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
+        __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
+        __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
+        __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
+        __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
+        __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
+        __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
+        __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
+        __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
+        __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
+        __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
+        __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
+        __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
+        __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
+        __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
+        __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
+        __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
+        __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
+        __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
+        __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
+        __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
+
+        step2[0] = SUB_EPI16(step2[ 0], s3_00_0);
+        step2[1] = SUB_EPI16(step2[ 1], s3_01_0);
+        step2[2] = SUB_EPI16(step2[ 2], s3_02_0);
+        step2[3] = SUB_EPI16(step2[ 3], s3_03_0);
+        step2[4] = SUB_EPI16(step2[ 4], s3_04_0);
+        step2[5] = SUB_EPI16(step2[ 5], s3_05_0);
+        step2[6] = SUB_EPI16(step2[ 6], s3_06_0);
+        step2[7] = SUB_EPI16(step2[ 7], s3_07_0);
+        step2[8] = SUB_EPI16(step2[ 8], s2_08_0);
+        step2[9] = SUB_EPI16(step2[ 9], s2_09_0);
+        step2[10] = SUB_EPI16(step2[10], s3_10_0);
+        step2[11] = SUB_EPI16(step2[11], s3_11_0);
+        step2[12] = SUB_EPI16(step2[12], s3_12_0);
+        step2[13] = SUB_EPI16(step2[13], s3_13_0);
+        step2[14] = SUB_EPI16(step2[14], s2_14_0);
+        step2[15] = SUB_EPI16(step2[15], s2_15_0);
+        step1[16] = SUB_EPI16(step1[16], s3_16_0);
+        step1[17] = SUB_EPI16(step1[17], s3_17_0);
+        step1[18] = SUB_EPI16(step1[18], s3_18_0);
+        step1[19] = SUB_EPI16(step1[19], s3_19_0);
+        step2[20] = SUB_EPI16(step2[20], s3_20_0);
+        step2[21] = SUB_EPI16(step2[21], s3_21_0);
+        step2[22] = SUB_EPI16(step2[22], s3_22_0);
+        step2[23] = SUB_EPI16(step2[23], s3_23_0);
+        step2[24] = SUB_EPI16(step2[24], s3_24_0);
+        step2[25] = SUB_EPI16(step2[25], s3_25_0);
+        step2[26] = SUB_EPI16(step2[26], s3_26_0);
+        step2[27] = SUB_EPI16(step2[27], s3_27_0);
+        step1[28] = SUB_EPI16(step1[28], s3_28_0);
+        step1[29] = SUB_EPI16(step1[29], s3_29_0);
+        step1[30] = SUB_EPI16(step1[30], s3_30_0);
+        step1[31] = SUB_EPI16(step1[31], s3_31_0);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x32(
+            &step2[0], &step2[1], &step2[2], &step2[3],
+            &step2[4], &step2[5], &step2[6], &step2[7],
+            &step2[8], &step2[9], &step2[10], &step2[11],
+            &step2[12], &step2[13], &step2[14], &step2[15],
+            &step1[16], &step1[17], &step1[18], &step1[19],
+            &step2[20], &step2[21], &step2[22], &step2[23],
+            &step2[24], &step2[25], &step2[26], &step2[27],
+            &step1[28], &step1[29], &step1[30], &step1[31]);
+        if (overflow) {
+          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+        step2[0] = _mm_add_epi16(step2[ 0], kOne);
+        step2[1] = _mm_add_epi16(step2[ 1], kOne);
+        step2[2] = _mm_add_epi16(step2[ 2], kOne);
+        step2[3] = _mm_add_epi16(step2[ 3], kOne);
+        step2[4] = _mm_add_epi16(step2[ 4], kOne);
+        step2[5] = _mm_add_epi16(step2[ 5], kOne);
+        step2[6] = _mm_add_epi16(step2[ 6], kOne);
+        step2[7] = _mm_add_epi16(step2[ 7], kOne);
+        step2[8] = _mm_add_epi16(step2[ 8], kOne);
+        step2[9] = _mm_add_epi16(step2[ 9], kOne);
+        step2[10] = _mm_add_epi16(step2[10], kOne);
+        step2[11] = _mm_add_epi16(step2[11], kOne);
+        step2[12] = _mm_add_epi16(step2[12], kOne);
+        step2[13] = _mm_add_epi16(step2[13], kOne);
+        step2[14] = _mm_add_epi16(step2[14], kOne);
+        step2[15] = _mm_add_epi16(step2[15], kOne);
+        step1[16] = _mm_add_epi16(step1[16], kOne);
+        step1[17] = _mm_add_epi16(step1[17], kOne);
+        step1[18] = _mm_add_epi16(step1[18], kOne);
+        step1[19] = _mm_add_epi16(step1[19], kOne);
+        step2[20] = _mm_add_epi16(step2[20], kOne);
+        step2[21] = _mm_add_epi16(step2[21], kOne);
+        step2[22] = _mm_add_epi16(step2[22], kOne);
+        step2[23] = _mm_add_epi16(step2[23], kOne);
+        step2[24] = _mm_add_epi16(step2[24], kOne);
+        step2[25] = _mm_add_epi16(step2[25], kOne);
+        step2[26] = _mm_add_epi16(step2[26], kOne);
+        step2[27] = _mm_add_epi16(step2[27], kOne);
+        step1[28] = _mm_add_epi16(step1[28], kOne);
+        step1[29] = _mm_add_epi16(step1[29], kOne);
+        step1[30] = _mm_add_epi16(step1[30], kOne);
+        step1[31] = _mm_add_epi16(step1[31], kOne);
+
+        step2[0] = _mm_srai_epi16(step2[ 0], 2);
+        step2[1] = _mm_srai_epi16(step2[ 1], 2);
+        step2[2] = _mm_srai_epi16(step2[ 2], 2);
+        step2[3] = _mm_srai_epi16(step2[ 3], 2);
+        step2[4] = _mm_srai_epi16(step2[ 4], 2);
+        step2[5] = _mm_srai_epi16(step2[ 5], 2);
+        step2[6] = _mm_srai_epi16(step2[ 6], 2);
+        step2[7] = _mm_srai_epi16(step2[ 7], 2);
+        step2[8] = _mm_srai_epi16(step2[ 8], 2);
+        step2[9] = _mm_srai_epi16(step2[ 9], 2);
+        step2[10] = _mm_srai_epi16(step2[10], 2);
+        step2[11] = _mm_srai_epi16(step2[11], 2);
+        step2[12] = _mm_srai_epi16(step2[12], 2);
+        step2[13] = _mm_srai_epi16(step2[13], 2);
+        step2[14] = _mm_srai_epi16(step2[14], 2);
+        step2[15] = _mm_srai_epi16(step2[15], 2);
+        step1[16] = _mm_srai_epi16(step1[16], 2);
+        step1[17] = _mm_srai_epi16(step1[17], 2);
+        step1[18] = _mm_srai_epi16(step1[18], 2);
+        step1[19] = _mm_srai_epi16(step1[19], 2);
+        step2[20] = _mm_srai_epi16(step2[20], 2);
+        step2[21] = _mm_srai_epi16(step2[21], 2);
+        step2[22] = _mm_srai_epi16(step2[22], 2);
+        step2[23] = _mm_srai_epi16(step2[23], 2);
+        step2[24] = _mm_srai_epi16(step2[24], 2);
+        step2[25] = _mm_srai_epi16(step2[25], 2);
+        step2[26] = _mm_srai_epi16(step2[26], 2);
+        step2[27] = _mm_srai_epi16(step2[27], 2);
+        step1[28] = _mm_srai_epi16(step1[28], 2);
+        step1[29] = _mm_srai_epi16(step1[29], 2);
+        step1[30] = _mm_srai_epi16(step1[30], 2);
+        step1[31] = _mm_srai_epi16(step1[31], 2);
+      }
+#endif  // !FDCT32x32_HIGH_PRECISION
+
+#if FDCT32x32_HIGH_PRECISION
+      if (pass == 0) {
+#endif
+      // Stage 3
+      {
+        step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
+        step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
+        step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
+        step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
+        step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
+        step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
+        step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
+        step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
+                                           &step3[3], &step3[4], &step3[5],
+                                           &step3[6], &step3[7]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+        const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+        const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+        const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+        const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+        const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+        const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+        const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+        const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+        const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+        const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+        const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+        const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+        const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+        const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+        const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+        const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+        const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+        const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+        // Combine
+        step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
+        step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
+        step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
+        step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&step3[10], &step3[11],
+                                           &step3[12], &step3[13]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        step3[16] = ADD_EPI16(step2[23], step1[16]);
+        step3[17] = ADD_EPI16(step2[22], step1[17]);
+        step3[18] = ADD_EPI16(step2[21], step1[18]);
+        step3[19] = ADD_EPI16(step2[20], step1[19]);
+        step3[20] = SUB_EPI16(step1[19], step2[20]);
+        step3[21] = SUB_EPI16(step1[18], step2[21]);
+        step3[22] = SUB_EPI16(step1[17], step2[22]);
+        step3[23] = SUB_EPI16(step1[16], step2[23]);
+        step3[24] = SUB_EPI16(step1[31], step2[24]);
+        step3[25] = SUB_EPI16(step1[30], step2[25]);
+        step3[26] = SUB_EPI16(step1[29], step2[26]);
+        step3[27] = SUB_EPI16(step1[28], step2[27]);
+        step3[28] = ADD_EPI16(step2[27], step1[28]);
+        step3[29] = ADD_EPI16(step2[26], step1[29]);
+        step3[30] = ADD_EPI16(step2[25], step1[30]);
+        step3[31] = ADD_EPI16(step2[24], step1[31]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step3[16], &step3[17], &step3[18], &step3[19],
+            &step3[20], &step3[21], &step3[22], &step3[23],
+            &step3[24], &step3[25], &step3[26], &step3[27],
+            &step3[28], &step3[29], &step3[30], &step3[31]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+
+      // Stage 4
+      {
+        step1[0] = ADD_EPI16(step3[ 3], step3[ 0]);
+        step1[1] = ADD_EPI16(step3[ 2], step3[ 1]);
+        step1[2] = SUB_EPI16(step3[ 1], step3[ 2]);
+        step1[3] = SUB_EPI16(step3[ 0], step3[ 3]);
+        step1[8] = ADD_EPI16(step3[11], step2[ 8]);
+        step1[9] = ADD_EPI16(step3[10], step2[ 9]);
+        step1[10] = SUB_EPI16(step2[ 9], step3[10]);
+        step1[11] = SUB_EPI16(step2[ 8], step3[11]);
+        step1[12] = SUB_EPI16(step2[15], step3[12]);
+        step1[13] = SUB_EPI16(step2[14], step3[13]);
+        step1[14] = ADD_EPI16(step3[13], step2[14]);
+        step1[15] = ADD_EPI16(step3[12], step2[15]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step1[0], &step1[1], &step1[2], &step1[3],
+            &step1[4], &step1[5], &step1[6], &step1[7],
+            &step1[8], &step1[9], &step1[10], &step1[11],
+            &step1[12], &step1[13], &step1[14], &step1[15]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
+        const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
+        const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
+        const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
+        const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
+        const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
+        const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
+        const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
+        const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+        // Combine
+        step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
+        step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
+        const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
+        const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
+        const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
+        const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
+        const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
+        const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
+        const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
+        const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
+        const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
+        const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
+        const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
+        const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
+        const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
+        const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
+        const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
+        const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
+        const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
+        const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
+        const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
+        const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
+        const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
+        const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
+        const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
+        // dct_const_round_shift
+        const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
+        const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
+        const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
+        const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
+        const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
+        const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
+        const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
+        const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
+        const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
+        const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
+        const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
+        const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
+        const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
+        const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
+        const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
+        const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+        // Combine
+        step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
+        step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
+        step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
+        step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
+        step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
+        step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
+        step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
+        step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
+                                           &step1[21], &step1[26], &step1[27],
+                                           &step1[28], &step1[29]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Stage 5
+      {
+        step2[4] = ADD_EPI16(step1[5], step3[4]);
+        step2[5] = SUB_EPI16(step3[4], step1[5]);
+        step2[6] = SUB_EPI16(step3[7], step1[6]);
+        step2[7] = ADD_EPI16(step1[6], step3[7]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&step2[4], &step2[5],
+                                           &step2[6], &step2[7]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
+        const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
+        const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
+        const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
+        const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
+        const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
+        const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
+        const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
+        const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
+        const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
+        const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
+        const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
+        // dct_const_round_shift
+        const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
+        const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
+        const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
+        const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
+        const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
+        const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
+        const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
+        const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
+        // Combine
+        out[ 0] = _mm_packs_epi32(out_00_6, out_00_7);
+        out[16] = _mm_packs_epi32(out_16_6, out_16_7);
+        out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
+        out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&out[0], &out[16],
+                                           &out[8], &out[24]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);
+        const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]);
+        const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
+        const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
+        const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
+        const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
+        const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
+        const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
+        const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
+        const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
+        const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
+        const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
+        // dct_const_round_shift
+        const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
+        const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
+        const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
+        const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
+        const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
+        const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
+        const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
+        const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
+        // Combine
+        step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7);
+        step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
+        step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
+        step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&step2[9], &step2[10],
+                                           &step2[13], &step2[14]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        step2[16] = ADD_EPI16(step1[19], step3[16]);
+        step2[17] = ADD_EPI16(step1[18], step3[17]);
+        step2[18] = SUB_EPI16(step3[17], step1[18]);
+        step2[19] = SUB_EPI16(step3[16], step1[19]);
+        step2[20] = SUB_EPI16(step3[23], step1[20]);
+        step2[21] = SUB_EPI16(step3[22], step1[21]);
+        step2[22] = ADD_EPI16(step1[21], step3[22]);
+        step2[23] = ADD_EPI16(step1[20], step3[23]);
+        step2[24] = ADD_EPI16(step1[27], step3[24]);
+        step2[25] = ADD_EPI16(step1[26], step3[25]);
+        step2[26] = SUB_EPI16(step3[25], step1[26]);
+        step2[27] = SUB_EPI16(step3[24], step1[27]);
+        step2[28] = SUB_EPI16(step3[31], step1[28]);
+        step2[29] = SUB_EPI16(step3[30], step1[29]);
+        step2[30] = ADD_EPI16(step1[29], step3[30]);
+        step2[31] = ADD_EPI16(step1[28], step3[31]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step2[16], &step2[17], &step2[18], &step2[19],
+            &step2[20], &step2[21], &step2[22], &step2[23],
+            &step2[24], &step2[25], &step2[26], &step2[27],
+            &step2[28], &step2[29], &step2[30], &step2[31]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Stage 6
+      {
+        const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+        const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+        const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+        const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+        const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+        const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+        const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+        const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+        const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
+        const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
+        const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
+        const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
+        const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
+        const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
+        const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
+        const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
+        // dct_const_round_shift
+        const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
+        const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
+        const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
+        const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
+        const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
+        const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
+        const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
+        const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
+        // Combine
+        out[4] = _mm_packs_epi32(out_04_6, out_04_7);
+        out[20] = _mm_packs_epi32(out_20_6, out_20_7);
+        out[12] = _mm_packs_epi32(out_12_6, out_12_7);
+        out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&out[4], &out[20],
+                                           &out[12], &out[28]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        step3[8] = ADD_EPI16(step2[ 9], step1[ 8]);
+        step3[9] = SUB_EPI16(step1[ 8], step2[ 9]);
+        step3[10] = SUB_EPI16(step1[11], step2[10]);
+        step3[11] = ADD_EPI16(step2[10], step1[11]);
+        step3[12] = ADD_EPI16(step2[13], step1[12]);
+        step3[13] = SUB_EPI16(step1[12], step2[13]);
+        step3[14] = SUB_EPI16(step1[15], step2[14]);
+        step3[15] = ADD_EPI16(step2[14], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
+                                           &step3[11], &step3[12], &step3[13],
+                                           &step3[14], &step3[15]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
+        const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
+        const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
+        const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
+        const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
+        const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
+        const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
+        const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
+        const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
+        const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
+        const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
+        const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
+        const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
+        const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
+        const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
+        const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
+        const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
+        const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
+        const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
+        const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
+        const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
+        const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
+        const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
+        const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
+        // dct_const_round_shift
+        const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
+        const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
+        const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
+        const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
+        const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
+        const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
+        const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
+        const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
+        const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
+        const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
+        const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
+        const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
+        const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
+        const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
+        const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
+        const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
+        // Combine
+        step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
+        step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
+        step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
+        step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
+        // Combine
+        step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
+        step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
+        step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
+        step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
+                                           &step3[22], &step3[25], &step3[26],
+                                           &step3[29], &step3[30]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Stage 7
+      {
+        const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]);
+        const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]);
+        const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]);
+        const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]);
+        const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
+        const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
+        const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
+        const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
+        const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
+        const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
+        const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
+        const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
+        const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
+        const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
+        const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
+        const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
+        const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
+        const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
+        const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
+        const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
+        const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
+        const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
+        const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
+        const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
+        // dct_const_round_shift
+        const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
+        const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
+        const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
+        const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
+        const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
+        const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
+        const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
+        const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
+        const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
+        const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
+        const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
+        const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
+        const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
+        const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
+        const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
+        const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
+        // Combine
+        out[ 2] = _mm_packs_epi32(out_02_6, out_02_7);
+        out[18] = _mm_packs_epi32(out_18_6, out_18_7);
+        out[10] = _mm_packs_epi32(out_10_6, out_10_7);
+        out[26] = _mm_packs_epi32(out_26_6, out_26_7);
+        out[ 6] = _mm_packs_epi32(out_06_6, out_06_7);
+        out[22] = _mm_packs_epi32(out_22_6, out_22_7);
+        out[14] = _mm_packs_epi32(out_14_6, out_14_7);
+        out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+                                           &out[26], &out[6], &out[22],
+                                           &out[14], &out[30]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        step1[16] = ADD_EPI16(step3[17], step2[16]);
+        step1[17] = SUB_EPI16(step2[16], step3[17]);
+        step1[18] = SUB_EPI16(step2[19], step3[18]);
+        step1[19] = ADD_EPI16(step3[18], step2[19]);
+        step1[20] = ADD_EPI16(step3[21], step2[20]);
+        step1[21] = SUB_EPI16(step2[20], step3[21]);
+        step1[22] = SUB_EPI16(step2[23], step3[22]);
+        step1[23] = ADD_EPI16(step3[22], step2[23]);
+        step1[24] = ADD_EPI16(step3[25], step2[24]);
+        step1[25] = SUB_EPI16(step2[24], step3[25]);
+        step1[26] = SUB_EPI16(step2[27], step3[26]);
+        step1[27] = ADD_EPI16(step3[26], step2[27]);
+        step1[28] = ADD_EPI16(step3[29], step2[28]);
+        step1[29] = SUB_EPI16(step2[28], step3[29]);
+        step1[30] = SUB_EPI16(step2[31], step3[30]);
+        step1[31] = ADD_EPI16(step3[30], step2[31]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step1[16], &step1[17], &step1[18], &step1[19],
+            &step1[20], &step1[21], &step1[22], &step1[23],
+            &step1[24], &step1[25], &step1[26], &step1[27],
+            &step1[28], &step1[29], &step1[30], &step1[31]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Final stage --- output indices are bit-reversed.
+      {
+        const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
+        const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
+        const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
+        const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
+        const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
+        const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
+        const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
+        const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
+        const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
+        const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
+        const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
+        const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
+        const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
+        const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
+        const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
+        const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
+        const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
+        const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
+        const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
+        const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
+        const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
+        const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
+        const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
+        const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
+        // dct_const_round_shift
+        const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
+        const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
+        const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
+        const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
+        const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
+        const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
+        const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
+        const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
+        const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
+        const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
+        const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
+        const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
+        const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
+        const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
+        const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
+        const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
+        // Combine
+        out[ 1] = _mm_packs_epi32(out_01_6, out_01_7);
+        out[17] = _mm_packs_epi32(out_17_6, out_17_7);
+        out[ 9] = _mm_packs_epi32(out_09_6, out_09_7);
+        out[25] = _mm_packs_epi32(out_25_6, out_25_7);
+        out[ 7] = _mm_packs_epi32(out_07_6, out_07_7);
+        out[23] = _mm_packs_epi32(out_23_6, out_23_7);
+        out[15] = _mm_packs_epi32(out_15_6, out_15_7);
+        out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+                                           &out[25], &out[7], &out[23],
+                                           &out[15], &out[31]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
+        const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
+        const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
+        const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
+        const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
+        const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
+        const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
+        const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
+        const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
+        const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
+        const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
+        const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
+        const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
+        const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
+        const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
+        const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
+        const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
+        const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
+        const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
+        const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
+        const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
+        const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
+        const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
+        const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
+        // dct_const_round_shift
+        const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
+        const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
+        const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
+        const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
+        const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
+        const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
+        const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
+        const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
+        const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
+        const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
+        const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
+        const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
+        const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
+        const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
+        const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
+        const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
+        // Combine
+        out[ 5] = _mm_packs_epi32(out_05_6, out_05_7);
+        out[21] = _mm_packs_epi32(out_21_6, out_21_7);
+        out[13] = _mm_packs_epi32(out_13_6, out_13_7);
+        out[29] = _mm_packs_epi32(out_29_6, out_29_7);
+        out[ 3] = _mm_packs_epi32(out_03_6, out_03_7);
+        out[19] = _mm_packs_epi32(out_19_6, out_19_7);
+        out[11] = _mm_packs_epi32(out_11_6, out_11_7);
+        out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+                                           &out[29], &out[3], &out[19],
+                                           &out[11], &out[27]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+#if FDCT32x32_HIGH_PRECISION
+      } else {
+        __m128i lstep1[64], lstep2[64], lstep3[64];
+        __m128i u[32], v[32], sign[16];
+        const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
+        // start using 32-bit operations
+        // stage 3
+        {
+          // expanding to 32-bit length prior to addition operations
+          lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero);
+          lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero);
+          lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero);
+          lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero);
+          lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero);
+          lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero);
+          lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero);
+          lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero);
+          lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero);
+          lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero);
+          lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero);
+          lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero);
+          lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero);
+          lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero);
+          lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero);
+          lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero);
+          lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne);
+          lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne);
+          lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne);
+          lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne);
+          lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne);
+          lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne);
+          lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne);
+          lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne);
+          lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne);
+          lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne);
+          lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
+          lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
+          lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
+          lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
+          lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
+          lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
+
+          lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
+          lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
+          lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]);
+          lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]);
+          lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]);
+          lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]);
+          lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]);
+          lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]);
+          lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]);
+          lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]);
+          lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]);
+          lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]);
+          lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]);
+          lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]);
+          lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]);
+          lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]);
+        }
+        {
+          const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+          const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+          const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+          const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+          const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+          const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+          const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+          const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+          const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+          const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+          const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+          const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+          // dct_const_round_shift
+          const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+          const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+          const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+          const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+          const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+          const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+          const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+          const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+          lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+          lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+          lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+          lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+          lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+          lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+          lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+          lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+        }
+        {
+          lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
+          lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
+          lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
+          lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
+          lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
+          lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
+          lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
+          lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
+          lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
+          lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
+          lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
+          lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
+          lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
+          lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
+          lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
+          lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
+          lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
+          lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
+          lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
+          lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
+          lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
+          lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
+          lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
+          lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
+          lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
+          lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
+          lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
+          lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
+          lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
+          lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
+          lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
+          lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
+
+          lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
+          lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
+          lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
+          lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
+          lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
+          lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
+          lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
+          lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
+          lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
+          lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
+          lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
+          lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
+          lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
+          lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
+          lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
+          lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
+          lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
+          lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
+          lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
+          lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
+          lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
+          lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
+          lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
+          lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
+          lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
+          lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
+          lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
+          lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
+          lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
+          lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
+          lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
+          lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
+
+          lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
+          lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
+
+          lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
+          lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
+          lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
+          lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
+          lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
+          lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
+          lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
+          lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
+          lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
+          lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
+          lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
+          lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
+          lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
+          lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
+          lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
+          lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
+          lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
+          lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
+          lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
+          lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
+          lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
+          lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
+          lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
+          lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
+          lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
+          lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
+          lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
+          lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
+          lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
+          lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
+        }
+
+        // stage 4
+        {
+          // expanding to 32-bit length prior to addition operations
+          lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero);
+          lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero);
+          lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero);
+          lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero);
+          lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
+          lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
+          lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
+          lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
+          lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
+          lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
+          lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
+          lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
+          lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
+          lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
+          lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
+          lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
+
+          lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]);
+          lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]);
+          lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]);
+          lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]);
+          lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]);
+          lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]);
+          lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]);
+          lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]);
+          lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
+          lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
+          lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
+          lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
+          lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
+          lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
+          lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
+          lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
+          lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
+          lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
+          lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
+          lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
+          lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
+          lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
+          lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
+          lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
+        }
+        {
+        // to be continued...
+        //
+        const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+        const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+
+        u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+        u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+        u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+        u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+        // TODO(jingning): manually inline k_madd_epi32 to further hide
+        // instruction latency.
+        v[0] = k_madd_epi32(u[0], k32_p16_m16);
+        v[1] = k_madd_epi32(u[1], k32_p16_m16);
+        v[2] = k_madd_epi32(u[2], k32_p16_m16);
+        v[3] = k_madd_epi32(u[3], k32_p16_m16);
+        v[4] = k_madd_epi32(u[0], k32_p16_p16);
+        v[5] = k_madd_epi32(u[1], k32_p16_p16);
+        v[6] = k_madd_epi32(u[2], k32_p16_p16);
+        v[7] = k_madd_epi32(u[3], k32_p16_p16);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3],
+                                            &v[4], &v[5], &v[6], &v[7], &kZero);
+        if (overflow) {
+          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+        u[0] = k_packs_epi64(v[0], v[1]);
+        u[1] = k_packs_epi64(v[2], v[3]);
+        u[2] = k_packs_epi64(v[4], v[5]);
+        u[3] = k_packs_epi64(v[6], v[7]);
+
+        v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+        v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+        v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+        v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+        lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+        lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+        lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+        lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+        }
+        {
+          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+          u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
+          u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
+          u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
+          u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
+          u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
+          u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
+          u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
+          u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
+          u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
+          u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
+          u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
+          u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
+          u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
+          u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
+          u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
+          u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+          v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24);
+          v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24);
+          v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24);
+          v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24);
+          v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24);
+          v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24);
+          v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24);
+          v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24);
+          v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08);
+          v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08);
+          v[10] = k_madd_epi32(u[10], k32_m24_m08);
+          v[11] = k_madd_epi32(u[11], k32_m24_m08);
+          v[12] = k_madd_epi32(u[12], k32_m24_m08);
+          v[13] = k_madd_epi32(u[13], k32_m24_m08);
+          v[14] = k_madd_epi32(u[14], k32_m24_m08);
+          v[15] = k_madd_epi32(u[15], k32_m24_m08);
+          v[16] = k_madd_epi32(u[12], k32_m08_p24);
+          v[17] = k_madd_epi32(u[13], k32_m08_p24);
+          v[18] = k_madd_epi32(u[14], k32_m08_p24);
+          v[19] = k_madd_epi32(u[15], k32_m08_p24);
+          v[20] = k_madd_epi32(u[ 8], k32_m08_p24);
+          v[21] = k_madd_epi32(u[ 9], k32_m08_p24);
+          v[22] = k_madd_epi32(u[10], k32_m08_p24);
+          v[23] = k_madd_epi32(u[11], k32_m08_p24);
+          v[24] = k_madd_epi32(u[ 4], k32_p24_p08);
+          v[25] = k_madd_epi32(u[ 5], k32_p24_p08);
+          v[26] = k_madd_epi32(u[ 6], k32_p24_p08);
+          v[27] = k_madd_epi32(u[ 7], k32_p24_p08);
+          v[28] = k_madd_epi32(u[ 0], k32_p24_p08);
+          v[29] = k_madd_epi32(u[ 1], k32_p24_p08);
+          v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
+          v[31] = k_madd_epi32(u[ 3], k32_p24_p08);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64(v[10], v[11]);
+          u[ 6] = k_packs_epi64(v[12], v[13]);
+          u[ 7] = k_packs_epi64(v[14], v[15]);
+          u[ 8] = k_packs_epi64(v[16], v[17]);
+          u[ 9] = k_packs_epi64(v[18], v[19]);
+          u[10] = k_packs_epi64(v[20], v[21]);
+          u[11] = k_packs_epi64(v[22], v[23]);
+          u[12] = k_packs_epi64(v[24], v[25]);
+          u[13] = k_packs_epi64(v[26], v[27]);
+          u[14] = k_packs_epi64(v[28], v[29]);
+          u[15] = k_packs_epi64(v[30], v[31]);
+
+          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+          lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+          lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+          lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+          lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+          lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+          lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+          lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+          lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+          lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+          lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+          lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+          lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+          lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+          lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+          lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+        }
+        // stage 5
+        {
+          lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]);
+          lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]);
+          lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]);
+          lstep2[11] = _mm_sub_epi32(lstep3[ 9], lstep1[11]);
+          lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
+          lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
+          lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
+          lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
+        }
+        {
+          const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+          const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+
+          u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
+          u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
+          u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
+          u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
+          u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
+          u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
+          u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
+          u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+          // TODO(jingning): manually inline k_madd_epi32 to further hide
+          // instruction latency.
+          v[ 0] = k_madd_epi32(u[0], k32_p16_p16);
+          v[ 1] = k_madd_epi32(u[1], k32_p16_p16);
+          v[ 2] = k_madd_epi32(u[2], k32_p16_p16);
+          v[ 3] = k_madd_epi32(u[3], k32_p16_p16);
+          v[ 4] = k_madd_epi32(u[0], k32_p16_m16);
+          v[ 5] = k_madd_epi32(u[1], k32_p16_m16);
+          v[ 6] = k_madd_epi32(u[2], k32_p16_m16);
+          v[ 7] = k_madd_epi32(u[3], k32_p16_m16);
+          v[ 8] = k_madd_epi32(u[4], k32_p24_p08);
+          v[ 9] = k_madd_epi32(u[5], k32_p24_p08);
+          v[10] = k_madd_epi32(u[6], k32_p24_p08);
+          v[11] = k_madd_epi32(u[7], k32_p24_p08);
+          v[12] = k_madd_epi32(u[4], k32_m08_p24);
+          v[13] = k_madd_epi32(u[5], k32_m08_p24);
+          v[14] = k_madd_epi32(u[6], k32_m08_p24);
+          v[15] = k_madd_epi32(u[7], k32_m08_p24);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_16(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[0] = k_packs_epi64(v[0], v[1]);
+          u[1] = k_packs_epi64(v[2], v[3]);
+          u[2] = k_packs_epi64(v[4], v[5]);
+          u[3] = k_packs_epi64(v[6], v[7]);
+          u[4] = k_packs_epi64(v[8], v[9]);
+          u[5] = k_packs_epi64(v[10], v[11]);
+          u[6] = k_packs_epi64(v[12], v[13]);
+          u[7] = k_packs_epi64(v[14], v[15]);
+
+          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+          sign[0] = _mm_cmplt_epi32(u[0], kZero);
+          sign[1] = _mm_cmplt_epi32(u[1], kZero);
+          sign[2] = _mm_cmplt_epi32(u[2], kZero);
+          sign[3] = _mm_cmplt_epi32(u[3], kZero);
+          sign[4] = _mm_cmplt_epi32(u[4], kZero);
+          sign[5] = _mm_cmplt_epi32(u[5], kZero);
+          sign[6] = _mm_cmplt_epi32(u[6], kZero);
+          sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+          u[0] = _mm_sub_epi32(u[0], sign[0]);
+          u[1] = _mm_sub_epi32(u[1], sign[1]);
+          u[2] = _mm_sub_epi32(u[2], sign[2]);
+          u[3] = _mm_sub_epi32(u[3], sign[3]);
+          u[4] = _mm_sub_epi32(u[4], sign[4]);
+          u[5] = _mm_sub_epi32(u[5], sign[5]);
+          u[6] = _mm_sub_epi32(u[6], sign[6]);
+          u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+          u[0] = _mm_add_epi32(u[0], K32One);
+          u[1] = _mm_add_epi32(u[1], K32One);
+          u[2] = _mm_add_epi32(u[2], K32One);
+          u[3] = _mm_add_epi32(u[3], K32One);
+          u[4] = _mm_add_epi32(u[4], K32One);
+          u[5] = _mm_add_epi32(u[5], K32One);
+          u[6] = _mm_add_epi32(u[6], K32One);
+          u[7] = _mm_add_epi32(u[7], K32One);
+
+          u[0] = _mm_srai_epi32(u[0], 2);
+          u[1] = _mm_srai_epi32(u[1], 2);
+          u[2] = _mm_srai_epi32(u[2], 2);
+          u[3] = _mm_srai_epi32(u[3], 2);
+          u[4] = _mm_srai_epi32(u[4], 2);
+          u[5] = _mm_srai_epi32(u[5], 2);
+          u[6] = _mm_srai_epi32(u[6], 2);
+          u[7] = _mm_srai_epi32(u[7], 2);
+
+          // Combine
+          out[ 0] = _mm_packs_epi32(u[0], u[1]);
+          out[16] = _mm_packs_epi32(u[2], u[3]);
+          out[ 8] = _mm_packs_epi32(u[4], u[5]);
+          out[24] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&out[0], &out[16],
+                                             &out[8], &out[24]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+          u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
+          u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
+          u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
+          u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
+          u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
+          u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
+          u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
+          u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+          v[0] = k_madd_epi32(u[0], k32_m08_p24);
+          v[1] = k_madd_epi32(u[1], k32_m08_p24);
+          v[2] = k_madd_epi32(u[2], k32_m08_p24);
+          v[3] = k_madd_epi32(u[3], k32_m08_p24);
+          v[4] = k_madd_epi32(u[4], k32_m24_m08);
+          v[5] = k_madd_epi32(u[5], k32_m24_m08);
+          v[6] = k_madd_epi32(u[6], k32_m24_m08);
+          v[7] = k_madd_epi32(u[7], k32_m24_m08);
+          v[ 8] = k_madd_epi32(u[4], k32_m08_p24);
+          v[ 9] = k_madd_epi32(u[5], k32_m08_p24);
+          v[10] = k_madd_epi32(u[6], k32_m08_p24);
+          v[11] = k_madd_epi32(u[7], k32_m08_p24);
+          v[12] = k_madd_epi32(u[0], k32_p24_p08);
+          v[13] = k_madd_epi32(u[1], k32_p24_p08);
+          v[14] = k_madd_epi32(u[2], k32_p24_p08);
+          v[15] = k_madd_epi32(u[3], k32_p24_p08);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_16(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[0] = k_packs_epi64(v[0], v[1]);
+          u[1] = k_packs_epi64(v[2], v[3]);
+          u[2] = k_packs_epi64(v[4], v[5]);
+          u[3] = k_packs_epi64(v[6], v[7]);
+          u[4] = k_packs_epi64(v[8], v[9]);
+          u[5] = k_packs_epi64(v[10], v[11]);
+          u[6] = k_packs_epi64(v[12], v[13]);
+          u[7] = k_packs_epi64(v[14], v[15]);
+
+          u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+          lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+          lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+          lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+          lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+          lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+          lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+          lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+        }
+        {
+          lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
+          lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
+          lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
+          lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
+          lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
+          lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
+          lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
+          lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
+          lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
+          lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
+          lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
+          lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
+          lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
+          lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
+          lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
+          lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
+          lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
+          lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
+          lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
+          lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
+          lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
+          lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
+          lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
+          lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
+          lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
+          lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
+          lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
+          lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
+          lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
+          lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
+          lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
+          lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
+        }
+        // stage 6
+        {
+          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+
+          u[0] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+          u[1] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+          u[2] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+          u[3] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+          u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
+          u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+          u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+          u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+          u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
+          u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+          u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+          u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+          u[12] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+          u[13] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+          u[14] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+          u[15] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+
+          v[0] = k_madd_epi32(u[0], k32_p28_p04);
+          v[1] = k_madd_epi32(u[1], k32_p28_p04);
+          v[2] = k_madd_epi32(u[2], k32_p28_p04);
+          v[3] = k_madd_epi32(u[3], k32_p28_p04);
+          v[4] = k_madd_epi32(u[4], k32_p12_p20);
+          v[5] = k_madd_epi32(u[5], k32_p12_p20);
+          v[6] = k_madd_epi32(u[6], k32_p12_p20);
+          v[7] = k_madd_epi32(u[7], k32_p12_p20);
+          v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);
+          v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
+          v[10] = k_madd_epi32(u[10], k32_m20_p12);
+          v[11] = k_madd_epi32(u[11], k32_m20_p12);
+          v[12] = k_madd_epi32(u[12], k32_m04_p28);
+          v[13] = k_madd_epi32(u[13], k32_m04_p28);
+          v[14] = k_madd_epi32(u[14], k32_m04_p28);
+          v[15] = k_madd_epi32(u[15], k32_m04_p28);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_16(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[0] = k_packs_epi64(v[0], v[1]);
+          u[1] = k_packs_epi64(v[2], v[3]);
+          u[2] = k_packs_epi64(v[4], v[5]);
+          u[3] = k_packs_epi64(v[6], v[7]);
+          u[4] = k_packs_epi64(v[8], v[9]);
+          u[5] = k_packs_epi64(v[10], v[11]);
+          u[6] = k_packs_epi64(v[12], v[13]);
+          u[7] = k_packs_epi64(v[14], v[15]);
+
+          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+          sign[0] = _mm_cmplt_epi32(u[0], kZero);
+          sign[1] = _mm_cmplt_epi32(u[1], kZero);
+          sign[2] = _mm_cmplt_epi32(u[2], kZero);
+          sign[3] = _mm_cmplt_epi32(u[3], kZero);
+          sign[4] = _mm_cmplt_epi32(u[4], kZero);
+          sign[5] = _mm_cmplt_epi32(u[5], kZero);
+          sign[6] = _mm_cmplt_epi32(u[6], kZero);
+          sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+          u[0] = _mm_sub_epi32(u[0], sign[0]);
+          u[1] = _mm_sub_epi32(u[1], sign[1]);
+          u[2] = _mm_sub_epi32(u[2], sign[2]);
+          u[3] = _mm_sub_epi32(u[3], sign[3]);
+          u[4] = _mm_sub_epi32(u[4], sign[4]);
+          u[5] = _mm_sub_epi32(u[5], sign[5]);
+          u[6] = _mm_sub_epi32(u[6], sign[6]);
+          u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+          u[0] = _mm_add_epi32(u[0], K32One);
+          u[1] = _mm_add_epi32(u[1], K32One);
+          u[2] = _mm_add_epi32(u[2], K32One);
+          u[3] = _mm_add_epi32(u[3], K32One);
+          u[4] = _mm_add_epi32(u[4], K32One);
+          u[5] = _mm_add_epi32(u[5], K32One);
+          u[6] = _mm_add_epi32(u[6], K32One);
+          u[7] = _mm_add_epi32(u[7], K32One);
+
+          u[0] = _mm_srai_epi32(u[0], 2);
+          u[1] = _mm_srai_epi32(u[1], 2);
+          u[2] = _mm_srai_epi32(u[2], 2);
+          u[3] = _mm_srai_epi32(u[3], 2);
+          u[4] = _mm_srai_epi32(u[4], 2);
+          u[5] = _mm_srai_epi32(u[5], 2);
+          u[6] = _mm_srai_epi32(u[6], 2);
+          u[7] = _mm_srai_epi32(u[7], 2);
+
+          out[ 4] = _mm_packs_epi32(u[0], u[1]);
+          out[20] = _mm_packs_epi32(u[2], u[3]);
+          out[12] = _mm_packs_epi32(u[4], u[5]);
+          out[28] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&out[4], &out[20],
+                                             &out[12], &out[28]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
+          lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
+          lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
+          lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
+          lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
+          lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
+          lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
+          lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
+          lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
+          lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
+          lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
+          lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
+          lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
+          lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
+          lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
+          lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
+        }
+        {
+          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+          const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
+          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+          const __m128i k32_m12_m20 = pair_set_epi32(-cospi_12_64,
+                                                     -cospi_20_64);
+          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+
+          u[ 0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
+          u[ 1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
+          u[ 2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
+          u[ 3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
+          u[ 4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
+          u[ 5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
+          u[ 6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
+          u[ 7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
+          u[ 8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
+          u[ 9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
+          u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
+          u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
+          u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
+          u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
+          u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
+          u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+          v[ 0] = k_madd_epi32(u[ 0], k32_m04_p28);
+          v[ 1] = k_madd_epi32(u[ 1], k32_m04_p28);
+          v[ 2] = k_madd_epi32(u[ 2], k32_m04_p28);
+          v[ 3] = k_madd_epi32(u[ 3], k32_m04_p28);
+          v[ 4] = k_madd_epi32(u[ 4], k32_m28_m04);
+          v[ 5] = k_madd_epi32(u[ 5], k32_m28_m04);
+          v[ 6] = k_madd_epi32(u[ 6], k32_m28_m04);
+          v[ 7] = k_madd_epi32(u[ 7], k32_m28_m04);
+          v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);
+          v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
+          v[10] = k_madd_epi32(u[10], k32_m20_p12);
+          v[11] = k_madd_epi32(u[11], k32_m20_p12);
+          v[12] = k_madd_epi32(u[12], k32_m12_m20);
+          v[13] = k_madd_epi32(u[13], k32_m12_m20);
+          v[14] = k_madd_epi32(u[14], k32_m12_m20);
+          v[15] = k_madd_epi32(u[15], k32_m12_m20);
+          v[16] = k_madd_epi32(u[12], k32_m20_p12);
+          v[17] = k_madd_epi32(u[13], k32_m20_p12);
+          v[18] = k_madd_epi32(u[14], k32_m20_p12);
+          v[19] = k_madd_epi32(u[15], k32_m20_p12);
+          v[20] = k_madd_epi32(u[ 8], k32_p12_p20);
+          v[21] = k_madd_epi32(u[ 9], k32_p12_p20);
+          v[22] = k_madd_epi32(u[10], k32_p12_p20);
+          v[23] = k_madd_epi32(u[11], k32_p12_p20);
+          v[24] = k_madd_epi32(u[ 4], k32_m04_p28);
+          v[25] = k_madd_epi32(u[ 5], k32_m04_p28);
+          v[26] = k_madd_epi32(u[ 6], k32_m04_p28);
+          v[27] = k_madd_epi32(u[ 7], k32_m04_p28);
+          v[28] = k_madd_epi32(u[ 0], k32_p28_p04);
+          v[29] = k_madd_epi32(u[ 1], k32_p28_p04);
+          v[30] = k_madd_epi32(u[ 2], k32_p28_p04);
+          v[31] = k_madd_epi32(u[ 3], k32_p28_p04);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64(v[10], v[11]);
+          u[ 6] = k_packs_epi64(v[12], v[13]);
+          u[ 7] = k_packs_epi64(v[14], v[15]);
+          u[ 8] = k_packs_epi64(v[16], v[17]);
+          u[ 9] = k_packs_epi64(v[18], v[19]);
+          u[10] = k_packs_epi64(v[20], v[21]);
+          u[11] = k_packs_epi64(v[22], v[23]);
+          u[12] = k_packs_epi64(v[24], v[25]);
+          u[13] = k_packs_epi64(v[26], v[27]);
+          u[14] = k_packs_epi64(v[28], v[29]);
+          u[15] = k_packs_epi64(v[30], v[31]);
+
+          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          lstep3[34] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+          lstep3[35] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+          lstep3[36] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+          lstep3[37] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+          lstep3[42] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+          lstep3[43] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+          lstep3[44] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+          lstep3[45] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+          lstep3[50] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+          lstep3[51] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+          lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+          lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+          lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+          lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+          lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+          lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+        }
+        // stage 7
+        {
+          const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
+          const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
+          const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
+          const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64,  cospi_26_64);
+          const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
+          const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
+          const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
+          const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
+
+          u[ 0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
+          u[ 1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
+          u[ 2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
+          u[ 3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
+          u[ 4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
+          u[ 5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
+          u[ 6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
+          u[ 7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
+          u[ 8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
+          u[ 9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
+          u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
+          u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
+          u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
+          u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
+          u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
+          u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+          v[ 0] = k_madd_epi32(u[ 0], k32_p30_p02);
+          v[ 1] = k_madd_epi32(u[ 1], k32_p30_p02);
+          v[ 2] = k_madd_epi32(u[ 2], k32_p30_p02);
+          v[ 3] = k_madd_epi32(u[ 3], k32_p30_p02);
+          v[ 4] = k_madd_epi32(u[ 4], k32_p14_p18);
+          v[ 5] = k_madd_epi32(u[ 5], k32_p14_p18);
+          v[ 6] = k_madd_epi32(u[ 6], k32_p14_p18);
+          v[ 7] = k_madd_epi32(u[ 7], k32_p14_p18);
+          v[ 8] = k_madd_epi32(u[ 8], k32_p22_p10);
+          v[ 9] = k_madd_epi32(u[ 9], k32_p22_p10);
+          v[10] = k_madd_epi32(u[10], k32_p22_p10);
+          v[11] = k_madd_epi32(u[11], k32_p22_p10);
+          v[12] = k_madd_epi32(u[12], k32_p06_p26);
+          v[13] = k_madd_epi32(u[13], k32_p06_p26);
+          v[14] = k_madd_epi32(u[14], k32_p06_p26);
+          v[15] = k_madd_epi32(u[15], k32_p06_p26);
+          v[16] = k_madd_epi32(u[12], k32_m26_p06);
+          v[17] = k_madd_epi32(u[13], k32_m26_p06);
+          v[18] = k_madd_epi32(u[14], k32_m26_p06);
+          v[19] = k_madd_epi32(u[15], k32_m26_p06);
+          v[20] = k_madd_epi32(u[ 8], k32_m10_p22);
+          v[21] = k_madd_epi32(u[ 9], k32_m10_p22);
+          v[22] = k_madd_epi32(u[10], k32_m10_p22);
+          v[23] = k_madd_epi32(u[11], k32_m10_p22);
+          v[24] = k_madd_epi32(u[ 4], k32_m18_p14);
+          v[25] = k_madd_epi32(u[ 5], k32_m18_p14);
+          v[26] = k_madd_epi32(u[ 6], k32_m18_p14);
+          v[27] = k_madd_epi32(u[ 7], k32_m18_p14);
+          v[28] = k_madd_epi32(u[ 0], k32_m02_p30);
+          v[29] = k_madd_epi32(u[ 1], k32_m02_p30);
+          v[30] = k_madd_epi32(u[ 2], k32_m02_p30);
+          v[31] = k_madd_epi32(u[ 3], k32_m02_p30);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64(v[10], v[11]);
+          u[ 6] = k_packs_epi64(v[12], v[13]);
+          u[ 7] = k_packs_epi64(v[14], v[15]);
+          u[ 8] = k_packs_epi64(v[16], v[17]);
+          u[ 9] = k_packs_epi64(v[18], v[19]);
+          u[10] = k_packs_epi64(v[20], v[21]);
+          u[11] = k_packs_epi64(v[22], v[23]);
+          u[12] = k_packs_epi64(v[24], v[25]);
+          u[13] = k_packs_epi64(v[26], v[27]);
+          u[14] = k_packs_epi64(v[28], v[29]);
+          u[15] = k_packs_epi64(v[30], v[31]);
+
+          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
+          v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
+          v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
+          v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
+          v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
+          v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
+          v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
+          v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
+          v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
+          v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+          v[10] = _mm_cmplt_epi32(u[10], kZero);
+          v[11] = _mm_cmplt_epi32(u[11], kZero);
+          v[12] = _mm_cmplt_epi32(u[12], kZero);
+          v[13] = _mm_cmplt_epi32(u[13], kZero);
+          v[14] = _mm_cmplt_epi32(u[14], kZero);
+          v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+          u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm_sub_epi32(u[10], v[10]);
+          u[11] = _mm_sub_epi32(u[11], v[11]);
+          u[12] = _mm_sub_epi32(u[12], v[12]);
+          u[13] = _mm_sub_epi32(u[13], v[13]);
+          u[14] = _mm_sub_epi32(u[14], v[14]);
+          u[15] = _mm_sub_epi32(u[15], v[15]);
+
+          v[ 0] = _mm_add_epi32(u[ 0], K32One);
+          v[ 1] = _mm_add_epi32(u[ 1], K32One);
+          v[ 2] = _mm_add_epi32(u[ 2], K32One);
+          v[ 3] = _mm_add_epi32(u[ 3], K32One);
+          v[ 4] = _mm_add_epi32(u[ 4], K32One);
+          v[ 5] = _mm_add_epi32(u[ 5], K32One);
+          v[ 6] = _mm_add_epi32(u[ 6], K32One);
+          v[ 7] = _mm_add_epi32(u[ 7], K32One);
+          v[ 8] = _mm_add_epi32(u[ 8], K32One);
+          v[ 9] = _mm_add_epi32(u[ 9], K32One);
+          v[10] = _mm_add_epi32(u[10], K32One);
+          v[11] = _mm_add_epi32(u[11], K32One);
+          v[12] = _mm_add_epi32(u[12], K32One);
+          v[13] = _mm_add_epi32(u[13], K32One);
+          v[14] = _mm_add_epi32(u[14], K32One);
+          v[15] = _mm_add_epi32(u[15], K32One);
+
+          u[ 0] = _mm_srai_epi32(v[ 0], 2);
+          u[ 1] = _mm_srai_epi32(v[ 1], 2);
+          u[ 2] = _mm_srai_epi32(v[ 2], 2);
+          u[ 3] = _mm_srai_epi32(v[ 3], 2);
+          u[ 4] = _mm_srai_epi32(v[ 4], 2);
+          u[ 5] = _mm_srai_epi32(v[ 5], 2);
+          u[ 6] = _mm_srai_epi32(v[ 6], 2);
+          u[ 7] = _mm_srai_epi32(v[ 7], 2);
+          u[ 8] = _mm_srai_epi32(v[ 8], 2);
+          u[ 9] = _mm_srai_epi32(v[ 9], 2);
+          u[10] = _mm_srai_epi32(v[10], 2);
+          u[11] = _mm_srai_epi32(v[11], 2);
+          u[12] = _mm_srai_epi32(v[12], 2);
+          u[13] = _mm_srai_epi32(v[13], 2);
+          u[14] = _mm_srai_epi32(v[14], 2);
+          u[15] = _mm_srai_epi32(v[15], 2);
+
+          out[ 2] = _mm_packs_epi32(u[0], u[1]);
+          out[18] = _mm_packs_epi32(u[2], u[3]);
+          out[10] = _mm_packs_epi32(u[4], u[5]);
+          out[26] = _mm_packs_epi32(u[6], u[7]);
+          out[ 6] = _mm_packs_epi32(u[8], u[9]);
+          out[22] = _mm_packs_epi32(u[10], u[11]);
+          out[14] = _mm_packs_epi32(u[12], u[13]);
+          out[30] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+                                             &out[26], &out[6], &out[22],
+                                             &out[14], &out[30]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
+          lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
+          lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
+          lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
+          lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
+          lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
+          lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
+          lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
+          lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
+          lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
+          lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
+          lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
+          lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
+          lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
+          lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
+          lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
+          lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
+          lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
+          lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
+          lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
+          lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
+          lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
+          lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
+          lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
+          lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
+          lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
+          lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
+          lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
+          lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
+          lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
+          lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
+          lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
+        }
+        // stage 8
+        {
+          const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
+          const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
+          const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
+          const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
+          const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
+          const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
+          const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
+          const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
+
+          u[ 0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
+          u[ 1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
+          u[ 2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
+          u[ 3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
+          u[ 4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
+          u[ 5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
+          u[ 6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
+          u[ 7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
+          u[ 8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
+          u[ 9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
+          u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
+          u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
+          u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
+          u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
+          u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
+          u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+          v[ 0] = k_madd_epi32(u[ 0], k32_p31_p01);
+          v[ 1] = k_madd_epi32(u[ 1], k32_p31_p01);
+          v[ 2] = k_madd_epi32(u[ 2], k32_p31_p01);
+          v[ 3] = k_madd_epi32(u[ 3], k32_p31_p01);
+          v[ 4] = k_madd_epi32(u[ 4], k32_p15_p17);
+          v[ 5] = k_madd_epi32(u[ 5], k32_p15_p17);
+          v[ 6] = k_madd_epi32(u[ 6], k32_p15_p17);
+          v[ 7] = k_madd_epi32(u[ 7], k32_p15_p17);
+          v[ 8] = k_madd_epi32(u[ 8], k32_p23_p09);
+          v[ 9] = k_madd_epi32(u[ 9], k32_p23_p09);
+          v[10] = k_madd_epi32(u[10], k32_p23_p09);
+          v[11] = k_madd_epi32(u[11], k32_p23_p09);
+          v[12] = k_madd_epi32(u[12], k32_p07_p25);
+          v[13] = k_madd_epi32(u[13], k32_p07_p25);
+          v[14] = k_madd_epi32(u[14], k32_p07_p25);
+          v[15] = k_madd_epi32(u[15], k32_p07_p25);
+          v[16] = k_madd_epi32(u[12], k32_m25_p07);
+          v[17] = k_madd_epi32(u[13], k32_m25_p07);
+          v[18] = k_madd_epi32(u[14], k32_m25_p07);
+          v[19] = k_madd_epi32(u[15], k32_m25_p07);
+          v[20] = k_madd_epi32(u[ 8], k32_m09_p23);
+          v[21] = k_madd_epi32(u[ 9], k32_m09_p23);
+          v[22] = k_madd_epi32(u[10], k32_m09_p23);
+          v[23] = k_madd_epi32(u[11], k32_m09_p23);
+          v[24] = k_madd_epi32(u[ 4], k32_m17_p15);
+          v[25] = k_madd_epi32(u[ 5], k32_m17_p15);
+          v[26] = k_madd_epi32(u[ 6], k32_m17_p15);
+          v[27] = k_madd_epi32(u[ 7], k32_m17_p15);
+          v[28] = k_madd_epi32(u[ 0], k32_m01_p31);
+          v[29] = k_madd_epi32(u[ 1], k32_m01_p31);
+          v[30] = k_madd_epi32(u[ 2], k32_m01_p31);
+          v[31] = k_madd_epi32(u[ 3], k32_m01_p31);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64(v[10], v[11]);
+          u[ 6] = k_packs_epi64(v[12], v[13]);
+          u[ 7] = k_packs_epi64(v[14], v[15]);
+          u[ 8] = k_packs_epi64(v[16], v[17]);
+          u[ 9] = k_packs_epi64(v[18], v[19]);
+          u[10] = k_packs_epi64(v[20], v[21]);
+          u[11] = k_packs_epi64(v[22], v[23]);
+          u[12] = k_packs_epi64(v[24], v[25]);
+          u[13] = k_packs_epi64(v[26], v[27]);
+          u[14] = k_packs_epi64(v[28], v[29]);
+          u[15] = k_packs_epi64(v[30], v[31]);
+
+          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
+          v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
+          v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
+          v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
+          v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
+          v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
+          v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
+          v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
+          v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
+          v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+          v[10] = _mm_cmplt_epi32(u[10], kZero);
+          v[11] = _mm_cmplt_epi32(u[11], kZero);
+          v[12] = _mm_cmplt_epi32(u[12], kZero);
+          v[13] = _mm_cmplt_epi32(u[13], kZero);
+          v[14] = _mm_cmplt_epi32(u[14], kZero);
+          v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+          u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm_sub_epi32(u[10], v[10]);
+          u[11] = _mm_sub_epi32(u[11], v[11]);
+          u[12] = _mm_sub_epi32(u[12], v[12]);
+          u[13] = _mm_sub_epi32(u[13], v[13]);
+          u[14] = _mm_sub_epi32(u[14], v[14]);
+          u[15] = _mm_sub_epi32(u[15], v[15]);
+
+          v[0] = _mm_add_epi32(u[0], K32One);
+          v[1] = _mm_add_epi32(u[1], K32One);
+          v[2] = _mm_add_epi32(u[2], K32One);
+          v[3] = _mm_add_epi32(u[3], K32One);
+          v[4] = _mm_add_epi32(u[4], K32One);
+          v[5] = _mm_add_epi32(u[5], K32One);
+          v[6] = _mm_add_epi32(u[6], K32One);
+          v[7] = _mm_add_epi32(u[7], K32One);
+          v[8] = _mm_add_epi32(u[8], K32One);
+          v[9] = _mm_add_epi32(u[9], K32One);
+          v[10] = _mm_add_epi32(u[10], K32One);
+          v[11] = _mm_add_epi32(u[11], K32One);
+          v[12] = _mm_add_epi32(u[12], K32One);
+          v[13] = _mm_add_epi32(u[13], K32One);
+          v[14] = _mm_add_epi32(u[14], K32One);
+          v[15] = _mm_add_epi32(u[15], K32One);
+
+          u[0] = _mm_srai_epi32(v[0], 2);
+          u[1] = _mm_srai_epi32(v[1], 2);
+          u[2] = _mm_srai_epi32(v[2], 2);
+          u[3] = _mm_srai_epi32(v[3], 2);
+          u[4] = _mm_srai_epi32(v[4], 2);
+          u[5] = _mm_srai_epi32(v[5], 2);
+          u[6] = _mm_srai_epi32(v[6], 2);
+          u[7] = _mm_srai_epi32(v[7], 2);
+          u[8] = _mm_srai_epi32(v[8], 2);
+          u[9] = _mm_srai_epi32(v[9], 2);
+          u[10] = _mm_srai_epi32(v[10], 2);
+          u[11] = _mm_srai_epi32(v[11], 2);
+          u[12] = _mm_srai_epi32(v[12], 2);
+          u[13] = _mm_srai_epi32(v[13], 2);
+          u[14] = _mm_srai_epi32(v[14], 2);
+          u[15] = _mm_srai_epi32(v[15], 2);
+
+          out[ 1] = _mm_packs_epi32(u[0], u[1]);
+          out[17] = _mm_packs_epi32(u[2], u[3]);
+          out[ 9] = _mm_packs_epi32(u[4], u[5]);
+          out[25] = _mm_packs_epi32(u[6], u[7]);
+          out[ 7] = _mm_packs_epi32(u[8], u[9]);
+          out[23] = _mm_packs_epi32(u[10], u[11]);
+          out[15] = _mm_packs_epi32(u[12], u[13]);
+          out[31] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+                                             &out[25], &out[7], &out[23],
+                                             &out[15], &out[31]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
+          const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
+          const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
+          const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
+          const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
+          const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
+          const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
+          const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
+
+          u[ 0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
+          u[ 1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
+          u[ 2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
+          u[ 3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
+          u[ 4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
+          u[ 5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
+          u[ 6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
+          u[ 7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
+          u[ 8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
+          u[ 9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
+          u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
+          u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
+          u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
+          u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
+          u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
+          u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+          v[ 0] = k_madd_epi32(u[ 0], k32_p27_p05);
+          v[ 1] = k_madd_epi32(u[ 1], k32_p27_p05);
+          v[ 2] = k_madd_epi32(u[ 2], k32_p27_p05);
+          v[ 3] = k_madd_epi32(u[ 3], k32_p27_p05);
+          v[ 4] = k_madd_epi32(u[ 4], k32_p11_p21);
+          v[ 5] = k_madd_epi32(u[ 5], k32_p11_p21);
+          v[ 6] = k_madd_epi32(u[ 6], k32_p11_p21);
+          v[ 7] = k_madd_epi32(u[ 7], k32_p11_p21);
+          v[ 8] = k_madd_epi32(u[ 8], k32_p19_p13);
+          v[ 9] = k_madd_epi32(u[ 9], k32_p19_p13);
+          v[10] = k_madd_epi32(u[10], k32_p19_p13);
+          v[11] = k_madd_epi32(u[11], k32_p19_p13);
+          v[12] = k_madd_epi32(u[12], k32_p03_p29);
+          v[13] = k_madd_epi32(u[13], k32_p03_p29);
+          v[14] = k_madd_epi32(u[14], k32_p03_p29);
+          v[15] = k_madd_epi32(u[15], k32_p03_p29);
+          v[16] = k_madd_epi32(u[12], k32_m29_p03);
+          v[17] = k_madd_epi32(u[13], k32_m29_p03);
+          v[18] = k_madd_epi32(u[14], k32_m29_p03);
+          v[19] = k_madd_epi32(u[15], k32_m29_p03);
+          v[20] = k_madd_epi32(u[ 8], k32_m13_p19);
+          v[21] = k_madd_epi32(u[ 9], k32_m13_p19);
+          v[22] = k_madd_epi32(u[10], k32_m13_p19);
+          v[23] = k_madd_epi32(u[11], k32_m13_p19);
+          v[24] = k_madd_epi32(u[ 4], k32_m21_p11);
+          v[25] = k_madd_epi32(u[ 5], k32_m21_p11);
+          v[26] = k_madd_epi32(u[ 6], k32_m21_p11);
+          v[27] = k_madd_epi32(u[ 7], k32_m21_p11);
+          v[28] = k_madd_epi32(u[ 0], k32_m05_p27);
+          v[29] = k_madd_epi32(u[ 1], k32_m05_p27);
+          v[30] = k_madd_epi32(u[ 2], k32_m05_p27);
+          v[31] = k_madd_epi32(u[ 3], k32_m05_p27);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64(v[10], v[11]);
+          u[ 6] = k_packs_epi64(v[12], v[13]);
+          u[ 7] = k_packs_epi64(v[14], v[15]);
+          u[ 8] = k_packs_epi64(v[16], v[17]);
+          u[ 9] = k_packs_epi64(v[18], v[19]);
+          u[10] = k_packs_epi64(v[20], v[21]);
+          u[11] = k_packs_epi64(v[22], v[23]);
+          u[12] = k_packs_epi64(v[24], v[25]);
+          u[13] = k_packs_epi64(v[26], v[27]);
+          u[14] = k_packs_epi64(v[28], v[29]);
+          u[15] = k_packs_epi64(v[30], v[31]);
+
+          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
+          v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
+          v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
+          v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
+          v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
+          v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
+          v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
+          v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
+          v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
+          v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+          v[10] = _mm_cmplt_epi32(u[10], kZero);
+          v[11] = _mm_cmplt_epi32(u[11], kZero);
+          v[12] = _mm_cmplt_epi32(u[12], kZero);
+          v[13] = _mm_cmplt_epi32(u[13], kZero);
+          v[14] = _mm_cmplt_epi32(u[14], kZero);
+          v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+          u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm_sub_epi32(u[10], v[10]);
+          u[11] = _mm_sub_epi32(u[11], v[11]);
+          u[12] = _mm_sub_epi32(u[12], v[12]);
+          u[13] = _mm_sub_epi32(u[13], v[13]);
+          u[14] = _mm_sub_epi32(u[14], v[14]);
+          u[15] = _mm_sub_epi32(u[15], v[15]);
+
+          v[0] = _mm_add_epi32(u[0], K32One);
+          v[1] = _mm_add_epi32(u[1], K32One);
+          v[2] = _mm_add_epi32(u[2], K32One);
+          v[3] = _mm_add_epi32(u[3], K32One);
+          v[4] = _mm_add_epi32(u[4], K32One);
+          v[5] = _mm_add_epi32(u[5], K32One);
+          v[6] = _mm_add_epi32(u[6], K32One);
+          v[7] = _mm_add_epi32(u[7], K32One);
+          v[8] = _mm_add_epi32(u[8], K32One);
+          v[9] = _mm_add_epi32(u[9], K32One);
+          v[10] = _mm_add_epi32(u[10], K32One);
+          v[11] = _mm_add_epi32(u[11], K32One);
+          v[12] = _mm_add_epi32(u[12], K32One);
+          v[13] = _mm_add_epi32(u[13], K32One);
+          v[14] = _mm_add_epi32(u[14], K32One);
+          v[15] = _mm_add_epi32(u[15], K32One);
+
+          u[0] = _mm_srai_epi32(v[0], 2);
+          u[1] = _mm_srai_epi32(v[1], 2);
+          u[2] = _mm_srai_epi32(v[2], 2);
+          u[3] = _mm_srai_epi32(v[3], 2);
+          u[4] = _mm_srai_epi32(v[4], 2);
+          u[5] = _mm_srai_epi32(v[5], 2);
+          u[6] = _mm_srai_epi32(v[6], 2);
+          u[7] = _mm_srai_epi32(v[7], 2);
+          u[8] = _mm_srai_epi32(v[8], 2);
+          u[9] = _mm_srai_epi32(v[9], 2);
+          u[10] = _mm_srai_epi32(v[10], 2);
+          u[11] = _mm_srai_epi32(v[11], 2);
+          u[12] = _mm_srai_epi32(v[12], 2);
+          u[13] = _mm_srai_epi32(v[13], 2);
+          u[14] = _mm_srai_epi32(v[14], 2);
+          u[15] = _mm_srai_epi32(v[15], 2);
+
+          out[ 5] = _mm_packs_epi32(u[0], u[1]);
+          out[21] = _mm_packs_epi32(u[2], u[3]);
+          out[13] = _mm_packs_epi32(u[4], u[5]);
+          out[29] = _mm_packs_epi32(u[6], u[7]);
+          out[ 3] = _mm_packs_epi32(u[8], u[9]);
+          out[19] = _mm_packs_epi32(u[10], u[11]);
+          out[11] = _mm_packs_epi32(u[12], u[13]);
+          out[27] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+                                             &out[29], &out[3], &out[19],
+                                             &out[11], &out[27]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+      }
+#endif  // FDCT32x32_HIGH_PRECISION
+      // Transpose the results, do it as four 8x8 transposes.
+      {
+        int transpose_block;
+        int16_t *output0 = &intermediate[column_start * 32];
+        tran_low_t *output1 = &output_org[column_start * 32];
+        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+          __m128i *this_out = &out[8 * transpose_block];
+          // 00 01 02 03 04 05 06 07
+          // 10 11 12 13 14 15 16 17
+          // 20 21 22 23 24 25 26 27
+          // 30 31 32 33 34 35 36 37
+          // 40 41 42 43 44 45 46 47
+          // 50 51 52 53 54 55 56 57
+          // 60 61 62 63 64 65 66 67
+          // 70 71 72 73 74 75 76 77
+          const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
+          const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
+          const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
+          const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
+          const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
+          const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
+          const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
+          const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
+          // 00 10 01 11 02 12 03 13
+          // 20 30 21 31 22 32 23 33
+          // 04 14 05 15 06 16 07 17
+          // 24 34 25 35 26 36 27 37
+          // 40 50 41 51 42 52 43 53
+          // 60 70 61 71 62 72 63 73
+          // 44 54 45 55 46 56 47 57
+          // 64 74 65 75 66 76 67 77
+          const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+          const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+          const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+          const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+          const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+          const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+          const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+          const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+          // 00 10 20 30 01 11 21 31
+          // 40 50 60 70 41 51 61 71
+          // 02 12 22 32 03 13 23 33
+          // 42 52 62 72 43 53 63 73
+          // 04 14 24 34 05 15 25 35
+          // 44 54 64 74 45 55 65 75
+          // 06 16 26 36 07 17 27 37
+          // 46 56 66 76 47 57 67 77
+          __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+          __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+          __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+          __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+          __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+          __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+          __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+          __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+          // 00 10 20 30 40 50 60 70
+          // 01 11 21 31 41 51 61 71
+          // 02 12 22 32 42 52 62 72
+          // 03 13 23 33 43 53 63 73
+          // 04 14 24 34 44 54 64 74
+          // 05 15 25 35 45 55 65 75
+          // 06 16 26 36 46 56 66 76
+          // 07 17 27 37 47 57 67 77
+          if (0 == pass) {
+            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+            // TODO(cd): see quality impact of only doing
+            //           output[j] = (output[j] + 1) >> 2;
+            //           which would remove the code between here ...
+            __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
+            __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
+            __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
+            __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
+            __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
+            __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
+            __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
+            __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
+            tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
+            tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
+            tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
+            tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
+            tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
+            tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
+            tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
+            tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
+            //           ... and here.
+            //           PS: also change code in vp9/encoder/vp9_dct.c
+            tr2_0 = _mm_add_epi16(tr2_0, kOne);
+            tr2_1 = _mm_add_epi16(tr2_1, kOne);
+            tr2_2 = _mm_add_epi16(tr2_2, kOne);
+            tr2_3 = _mm_add_epi16(tr2_3, kOne);
+            tr2_4 = _mm_add_epi16(tr2_4, kOne);
+            tr2_5 = _mm_add_epi16(tr2_5, kOne);
+            tr2_6 = _mm_add_epi16(tr2_6, kOne);
+            tr2_7 = _mm_add_epi16(tr2_7, kOne);
+            tr2_0 = _mm_srai_epi16(tr2_0, 2);
+            tr2_1 = _mm_srai_epi16(tr2_1, 2);
+            tr2_2 = _mm_srai_epi16(tr2_2, 2);
+            tr2_3 = _mm_srai_epi16(tr2_3, 2);
+            tr2_4 = _mm_srai_epi16(tr2_4, 2);
+            tr2_5 = _mm_srai_epi16(tr2_5, 2);
+            tr2_6 = _mm_srai_epi16(tr2_6, 2);
+            tr2_7 = _mm_srai_epi16(tr2_7, 2);
+          }
+          // Note: even though all these stores are aligned, using the aligned
+          //       intrinsic makes the code slightly slower.
+          if (pass == 0) {
+            _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
+            _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
+            _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
+            _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
+            _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
+            _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
+            _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
+            _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
+            // Process next 8x8
+            output0 += 8;
+          } else {
+            storeu_output(&tr2_0, (output1 + 0 * 32));
+            storeu_output(&tr2_1, (output1 + 1 * 32));
+            storeu_output(&tr2_2, (output1 + 2 * 32));
+            storeu_output(&tr2_3, (output1 + 3 * 32));
+            storeu_output(&tr2_4, (output1 + 4 * 32));
+            storeu_output(&tr2_5, (output1 + 5 * 32));
+            storeu_output(&tr2_6, (output1 + 6 * 32));
+            storeu_output(&tr2_7, (output1 + 7 * 32));
+            // Process next 8x8
+            output1 += 8;
+          }
+        }
+      }
+    }
+  }
+}  // NOLINT
+
+#undef ADD_EPI16
+#undef SUB_EPI16
+#undef HIGH_FDCT32x32_2D_C
+#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/libs/libvpx/vp10/common/x86/vp10_fwd_txfm_impl_sse2.h b/libs/libvpx/vp10/common/x86/vp10_fwd_txfm_impl_sse2.h
new file mode 100644
index 0000000000..69889e2e98
--- /dev/null
+++ b/libs/libvpx/vp10/common/x86/vp10_fwd_txfm_impl_sse2.h
@@ -0,0 +1,1027 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
+
+// TODO(jingning) The high bit-depth functions need rework for performance.
+// After we properly fix the high bit-depth function implementations, this
+// file's dependency should be substantially simplified.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16  // saturating signed 16-bit add
+#define SUB_EPI16 _mm_subs_epi16  // saturating signed 16-bit subtract
+
+#else
+#define ADD_EPI16 _mm_add_epi16  // wrapping 16-bit add
+#define SUB_EPI16 _mm_sub_epi16  // wrapping 16-bit subtract
+#endif
+
+void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
+  // 4x4 forward 2D DCT: 4 vertical 1D transforms followed by 4 horizontal
+  // 1D transforms.  Four rows of 4 int16 values are loaded from `input`
+  // (rows `stride` elements apart); results are written to `output` via
+  // storeu_output().  The multiplies and adds are as given by Chen, Smith
+  // and Fralick ('77); the data-movement commands were minimized by hand.
+  // In the comments, the 16 inputs are referred to as i0 through iF (in
+  // raster order); intermediate variables a*, b*, c* correspond to the
+  // in-place computations mapped to input locations; the outputs o0 through
+  // oF are labeled according to the output locations.
+
+  // Constants
+  // These are the coefficients used for the multiplies.
+  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+  // where cospi_N_64 = cos(N pi /64)
+  const __m128i k__cospi_A = octa_set_epi16(cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_B = octa_set_epi16(cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_C = octa_set_epi16(cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_D = octa_set_epi16(cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_E = octa_set_epi16(cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_F = octa_set_epi16(cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_G = octa_set_epi16(cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64,
+                                            -cospi_8_64, -cospi_24_64,
+                                            -cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_H = octa_set_epi16(cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            -cospi_24_64, cospi_8_64,
+                                            -cospi_24_64, cospi_8_64);
+
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // This second rounding constant saves doing some extra adds at the end
+  const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
+                                               +(DCT_CONST_ROUNDING << 1));
+  const int DCT_CONST_BITS2 =  DCT_CONST_BITS + 2;  // folds in the final >> 2
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  __m128i in0, in1;
+#if DCT_HIGH_BIT_DEPTH
+  __m128i cmp0, cmp1;
+  int test, overflow;
+#endif
+
+  // Load inputs.
+  in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
+  in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
+  in1  = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+                                                 (input +  2 * stride)));
+  in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+                                                 (input +  3 * stride)));
+  // in0 = [i0 i1 i2 i3 iC iD iE iF]
+  // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+#if DCT_HIGH_BIT_DEPTH
+  // Use the optimised code only if all inputs are in [-1024, 1023]
+  cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
+                       _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
+  cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
+                       _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
+  test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
+  if (test) {  // some input is out of range: use the C implementation
+    vpx_highbd_fdct4x4_c(input, output, stride);
+    return;
+  }
+#endif  // DCT_HIGH_BIT_DEPTH
+
+  // multiply by 16 to give some extra precision
+  in0 = _mm_slli_epi16(in0, 4);
+  in1 = _mm_slli_epi16(in1, 4);
+  // if (i == 0 && input[0]) input[0] += 1;
+  // add 1 to the upper left pixel if it is non-zero, which helps reduce
+  // the round-trip error
+  {
+    // The mask will only contain whether the first value is zero; all
+    // other comparisons will fail, as something shifted by 4 (above << 4)
+    // can never be equal to one. To increment in the non-zero case, we
+    // add the mask and one for the first element:
+    //   - if zero, mask = -1, v = v - 1 + 1 = v
+    //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+    __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+    in0 = _mm_add_epi16(in0, mask);
+    in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+  }
+  // There are 4 total stages, alternating between an add/subtract stage
+  // and a multiply-and-add stage.
+  {
+    // Stage 1: Add/subtract
+
+    // in0 = [i0 i1 i2 i3 iC iD iE iF]
+    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
+    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
+    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+    // r1 = [iC i8 iD i9 iE iA iF iB]
+    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+    // r3 = [iC i8 iD i9 iF iB iE iA]
+
+    const __m128i t0 = _mm_add_epi16(r2, r3);
+    const __m128i t1 = _mm_sub_epi16(r2, r3);
+    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+    // t1 = [aC a8 aD a9 aF aB aE aA]
+
+    // Stage 2: multiply by constants (which gets us into 32 bits).
+    // The constants needed here are:
+    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+    // Then add and right-shift to get back to 16-bit range
+    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+    // w0 = [b0 b1 b7 b6]
+    // w1 = [b8 b9 bF bE]
+    // w2 = [b4 b5 b3 b2]
+    // w3 = [bC bD bB bA]
+    const __m128i x0 = _mm_packs_epi32(w0, w1);
+    const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+    overflow = check_epi16_overflow_x2(&x0, &x1);
+    if (overflow) {
+      vpx_highbd_fdct4x4_c(input, output, stride);
+      return;
+    }
+#endif  // DCT_HIGH_BIT_DEPTH
+    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+    // x1 = [b4 b5 b3 b2 bC bD bB bA]
+    in0 = _mm_shuffle_epi32(x0, 0xD8);
+    in1 = _mm_shuffle_epi32(x1, 0x8D);
+    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+    // in1 = [b3 b2 bB bA b4 b5 bC bD]
+  }
+  {
+    // vertical DCTs finished. Now we do the horizontal DCTs.
+    // Stage 3: Add/subtract
+
+    const __m128i t0 = ADD_EPI16(in0, in1);
+    const __m128i t1 = SUB_EPI16(in0, in1);
+    // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
+    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
+#if DCT_HIGH_BIT_DEPTH
+    overflow = check_epi16_overflow_x2(&t0, &t1);
+    if (overflow) {
+      vpx_highbd_fdct4x4_c(input, output, stride);
+      return;
+    }
+#endif  // DCT_HIGH_BIT_DEPTH
+
+    // Stage 4: multiply by constants (which gets us into 32 bits).
+    {
+      // The constants needed here are:
+      // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+      // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+      // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+      // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+      const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+      const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+      // Then add and right-shift to get back to 16-bit range,
+      // but this combines the final right-shift as well to save operations.
+      // This unusual rounding operation is to maintain bit-accurate
+      // compatibility with the C version of this function, which has two
+      // rounding steps in a row.
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+      // w0 = [o0 o4 o8 oC]
+      // w1 = [o2 o6 oA oE]
+      // w2 = [o1 o5 o9 oD]
+      // w3 = [o3 o7 oB oF]
+      // remember the o's are numbered according to the correct output location
+      const __m128i x0 = _mm_packs_epi32(w0, w1);
+      const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+      overflow = check_epi16_overflow_x2(&x0, &x1);
+      if (overflow) {
+        vpx_highbd_fdct4x4_c(input, output, stride);
+        return;
+      }
+#endif  // DCT_HIGH_BIT_DEPTH
+      {
+        // x0 = [o0 o4 o8 oC o2 o6 oA oE]
+        // x1 = [o1 o5 o9 oD o3 o7 oB oF]
+        const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
+        const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
+        // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
+        // y1 = [o2 o3 o6 o7 oA oB oE oF]
+        in0 = _mm_unpacklo_epi32(y0, y1);
+        // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
+        in1 = _mm_unpackhi_epi32(y0, y1);
+        // in1 = [o8 o9 oA oB oC oD oE oF]
+      }
+    }
+  }
+  // The post-condition (v + 1) >> 2 is now incorporated into the previous
+  // add and right-shift commands.  Only 2 store instructions are needed
+  // because output rows 1/3 are stored just after rows 0/2.
+  storeu_output(&in0, output + 0 * 4);
+  storeu_output(&in1, output + 2 * 4);
+}
+
+
+void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+#if DCT_HIGH_BIT_DEPTH
+  int overflow;
+#endif
+  // Load input
+  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  // Pre-condition input (shift by two)
+  in0 = _mm_slli_epi16(in0, 2);
+  in1 = _mm_slli_epi16(in1, 2);
+  in2 = _mm_slli_epi16(in2, 2);
+  in3 = _mm_slli_epi16(in3, 2);
+  in4 = _mm_slli_epi16(in4, 2);
+  in5 = _mm_slli_epi16(in5, 2);
+  in6 = _mm_slli_epi16(in6, 2);
+  in7 = _mm_slli_epi16(in7, 2);
+
+  // We do two passes, first the columns, then the rows. The results of the
+  // first pass are transposed so that the same column code can be reused. The
+  // results of the second pass are also transposed so that the rows (processed
+  // as columns) are put back in row positions.
+  for (pass = 0; pass < 2; pass++) {
+    // To store results of each pass before the transpose.
+    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+    // Add/subtract
+    const __m128i q0 = ADD_EPI16(in0, in7);
+    const __m128i q1 = ADD_EPI16(in1, in6);
+    const __m128i q2 = ADD_EPI16(in2, in5);
+    const __m128i q3 = ADD_EPI16(in3, in4);
+    const __m128i q4 = SUB_EPI16(in3, in4);
+    const __m128i q5 = SUB_EPI16(in2, in5);
+    const __m128i q6 = SUB_EPI16(in1, in6);
+    const __m128i q7 = SUB_EPI16(in0, in7);
+#if DCT_HIGH_BIT_DEPTH
+    if (pass == 1) {
+      overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+                                         &q4, &q5, &q6, &q7);
+      if (overflow) {
+        vpx_highbd_fdct8x8_c(input, output, stride);
+        return;
+      }
+    }
+#endif  // DCT_HIGH_BIT_DEPTH
+    // Work on first four results
+    {
+      // Add/subtract
+      const __m128i r0 = ADD_EPI16(q0, q3);
+      const __m128i r1 = ADD_EPI16(q1, q2);
+      const __m128i r2 = SUB_EPI16(q1, q2);
+      const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+      overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+      if (overflow) {
+        vpx_highbd_fdct8x8_c(input, output, stride);
+        return;
+      }
+#endif  // DCT_HIGH_BIT_DEPTH
+      // Interleave to do the multiply by constants which gets us into 32bits
+      {
+        const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+        const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+        const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+        const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+        const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+        const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+        const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+        const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+        const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+        const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+        const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+        const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+        // dct_const_round_shift
+        const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+        const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+        const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+        const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+        const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+        const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+        const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+        const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+        const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+        const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+        const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+        const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+        const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+        const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+        const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+        const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+        // Combine
+        res0 = _mm_packs_epi32(w0, w1);
+        res4 = _mm_packs_epi32(w2, w3);
+        res2 = _mm_packs_epi32(w4, w5);
+        res6 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
+        if (overflow) {
+          vpx_highbd_fdct8x8_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+    }
+    // Work on next four results
+    {
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+      // dct_const_round_shift
+      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+      // Combine
+      const __m128i r0 = _mm_packs_epi32(s0, s1);
+      const __m128i r1 = _mm_packs_epi32(s2, s3);
+#if DCT_HIGH_BIT_DEPTH
+      overflow = check_epi16_overflow_x2(&r0, &r1);
+      if (overflow) {
+        vpx_highbd_fdct8x8_c(input, output, stride);
+        return;
+      }
+#endif  // DCT_HIGH_BIT_DEPTH
+      {
+        // Add/subtract
+        const __m128i x0 = ADD_EPI16(q4, r0);
+        const __m128i x1 = SUB_EPI16(q4, r0);
+        const __m128i x2 = SUB_EPI16(q7, r1);
+        const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+        if (overflow) {
+          vpx_highbd_fdct8x8_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+        // Interleave to do the multiply by constants which gets us into 32bits
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+          // Combine
+          res1 = _mm_packs_epi32(w0, w1);
+          res7 = _mm_packs_epi32(w2, w3);
+          res5 = _mm_packs_epi32(w4, w5);
+          res3 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
+          if (overflow) {
+            vpx_highbd_fdct8x8_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+      }
+    }
+    // Transpose the 8x8.
+    {
+      // 00 01 02 03 04 05 06 07
+      // 10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27
+      // 30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47
+      // 50 51 52 53 54 55 56 57
+      // 60 61 62 63 64 65 66 67
+      // 70 71 72 73 74 75 76 77
+      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+      // 00 10 01 11 02 12 03 13
+      // 20 30 21 31 22 32 23 33
+      // 04 14 05 15 06 16 07 17
+      // 24 34 25 35 26 36 27 37
+      // 40 50 41 51 42 52 43 53
+      // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+      // 64 74 65 75 66 76 67 77
+      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+      // 00 10 20 30 01 11 21 31
+      // 40 50 60 70 41 51 61 71
+      // 02 12 22 32 03 13 23 33
+      // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+      // 06 16 26 36 07 17 27 37
+      // 46 56 66 76 47 57 67 77
+      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }
+  // Post-condition output and store it
+  {
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+    in0 = _mm_sub_epi16(in0, sign_in0);
+    in1 = _mm_sub_epi16(in1, sign_in1);
+    in2 = _mm_sub_epi16(in2, sign_in2);
+    in3 = _mm_sub_epi16(in3, sign_in3);
+    in4 = _mm_sub_epi16(in4, sign_in4);
+    in5 = _mm_sub_epi16(in5, sign_in5);
+    in6 = _mm_sub_epi16(in6, sign_in6);
+    in7 = _mm_sub_epi16(in7, sign_in7);
+    in0 = _mm_srai_epi16(in0, 1);
+    in1 = _mm_srai_epi16(in1, 1);
+    in2 = _mm_srai_epi16(in2, 1);
+    in3 = _mm_srai_epi16(in3, 1);
+    in4 = _mm_srai_epi16(in4, 1);
+    in5 = _mm_srai_epi16(in5, 1);
+    in6 = _mm_srai_epi16(in6, 1);
+    in7 = _mm_srai_epi16(in7, 1);
+    // store results
+    store_output(&in0, (output + 0 * 8));
+    store_output(&in1, (output + 1 * 8));
+    store_output(&in2, (output + 2 * 8));
+    store_output(&in3, (output + 3 * 8));
+    store_output(&in4, (output + 4 * 8));
+    store_output(&in5, (output + 5 * 8));
+    store_output(&in6, (output + 6 * 8));
+    store_output(&in7, (output + 7 * 8));
+  }
+}
+
+void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
+  // 2D forward 16x16 DCT done as two nearly identical passes. Pass 0 loads
+  // sixteen rows of `input` (`stride` int16_t apart), transforms the columns
+  // and transposes the results. Pass 1 reads those back from `intermediate`,
+  // so it transforms the original rows, and transposes again so the results
+  // return to normal/row positions. When DCT_HIGH_BIT_DEPTH is nonzero, any
+  // detected overflow hands the whole block to vpx_highbd_fdct16x16_c().
+  int pass;  // 0 = first (column) pass, 1 = second (row) pass
+  // Scratch buffer of 256 int16_t values linking the two passes.
+  DECLARE_ALIGNED(16, int16_t, intermediate[256]);
+  const int16_t *in = input;  // read cursor; moves 8 columns per group
+  int16_t *out0 = intermediate;  // advanced only during pass 0
+  tran_low_t *out1 = output;  // advanced only during pass 1
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kOne = _mm_set1_epi16(1);  // the +1 of the pass-1 (x + 1) >> 2
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    // We process eight columns (transposed rows in second pass) at a time.
+    int column_start;
+#if DCT_HIGH_BIT_DEPTH
+    int overflow;
+#endif
+    for (column_start = 0; column_start < 16; column_start += 8) {
+      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+      __m128i step1_0, step1_1, step1_2, step1_3;
+      __m128i step1_4, step1_5, step1_6, step1_7;
+      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+      __m128i step3_0, step3_1, step3_2, step3_3;
+      __m128i step3_4, step3_5, step3_6, step3_7;
+      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+      // Load sixteen rows of eight 16-bit values and pre-condition them.
+      if (0 == pass) {
+        in00  = _mm_load_si128((const __m128i *)(in +  0 * stride));
+        in01  = _mm_load_si128((const __m128i *)(in +  1 * stride));
+        in02  = _mm_load_si128((const __m128i *)(in +  2 * stride));
+        in03  = _mm_load_si128((const __m128i *)(in +  3 * stride));
+        in04  = _mm_load_si128((const __m128i *)(in +  4 * stride));
+        in05  = _mm_load_si128((const __m128i *)(in +  5 * stride));
+        in06  = _mm_load_si128((const __m128i *)(in +  6 * stride));
+        in07  = _mm_load_si128((const __m128i *)(in +  7 * stride));
+        in08  = _mm_load_si128((const __m128i *)(in +  8 * stride));
+        in09  = _mm_load_si128((const __m128i *)(in +  9 * stride));
+        in10  = _mm_load_si128((const __m128i *)(in + 10 * stride));
+        in11  = _mm_load_si128((const __m128i *)(in + 11 * stride));
+        in12  = _mm_load_si128((const __m128i *)(in + 12 * stride));
+        in13  = _mm_load_si128((const __m128i *)(in + 13 * stride));
+        in14  = _mm_load_si128((const __m128i *)(in + 14 * stride));
+        in15  = _mm_load_si128((const __m128i *)(in + 15 * stride));
+        // x = x << 2 (pass 0 scales the caller's input up by four)
+        in00 = _mm_slli_epi16(in00, 2);
+        in01 = _mm_slli_epi16(in01, 2);
+        in02 = _mm_slli_epi16(in02, 2);
+        in03 = _mm_slli_epi16(in03, 2);
+        in04 = _mm_slli_epi16(in04, 2);
+        in05 = _mm_slli_epi16(in05, 2);
+        in06 = _mm_slli_epi16(in06, 2);
+        in07 = _mm_slli_epi16(in07, 2);
+        in08 = _mm_slli_epi16(in08, 2);
+        in09 = _mm_slli_epi16(in09, 2);
+        in10 = _mm_slli_epi16(in10, 2);
+        in11 = _mm_slli_epi16(in11, 2);
+        in12 = _mm_slli_epi16(in12, 2);
+        in13 = _mm_slli_epi16(in13, 2);
+        in14 = _mm_slli_epi16(in14, 2);
+        in15 = _mm_slli_epi16(in15, 2);
+      } else {
+        in00  = _mm_load_si128((const __m128i *)(in +  0 * 16));
+        in01  = _mm_load_si128((const __m128i *)(in +  1 * 16));
+        in02  = _mm_load_si128((const __m128i *)(in +  2 * 16));
+        in03  = _mm_load_si128((const __m128i *)(in +  3 * 16));
+        in04  = _mm_load_si128((const __m128i *)(in +  4 * 16));
+        in05  = _mm_load_si128((const __m128i *)(in +  5 * 16));
+        in06  = _mm_load_si128((const __m128i *)(in +  6 * 16));
+        in07  = _mm_load_si128((const __m128i *)(in +  7 * 16));
+        in08  = _mm_load_si128((const __m128i *)(in +  8 * 16));
+        in09  = _mm_load_si128((const __m128i *)(in +  9 * 16));
+        in10  = _mm_load_si128((const __m128i *)(in + 10 * 16));
+        in11  = _mm_load_si128((const __m128i *)(in + 11 * 16));
+        in12  = _mm_load_si128((const __m128i *)(in + 12 * 16));
+        in13  = _mm_load_si128((const __m128i *)(in + 13 * 16));
+        in14  = _mm_load_si128((const __m128i *)(in + 14 * 16));
+        in15  = _mm_load_si128((const __m128i *)(in + 15 * 16));
+        // x = (x + 1) >> 2 (pass 1 scales the pass-0 results back down)
+        in00 = _mm_add_epi16(in00, kOne);
+        in01 = _mm_add_epi16(in01, kOne);
+        in02 = _mm_add_epi16(in02, kOne);
+        in03 = _mm_add_epi16(in03, kOne);
+        in04 = _mm_add_epi16(in04, kOne);
+        in05 = _mm_add_epi16(in05, kOne);
+        in06 = _mm_add_epi16(in06, kOne);
+        in07 = _mm_add_epi16(in07, kOne);
+        in08 = _mm_add_epi16(in08, kOne);
+        in09 = _mm_add_epi16(in09, kOne);
+        in10 = _mm_add_epi16(in10, kOne);
+        in11 = _mm_add_epi16(in11, kOne);
+        in12 = _mm_add_epi16(in12, kOne);
+        in13 = _mm_add_epi16(in13, kOne);
+        in14 = _mm_add_epi16(in14, kOne);
+        in15 = _mm_add_epi16(in15, kOne);
+        in00 = _mm_srai_epi16(in00, 2);
+        in01 = _mm_srai_epi16(in01, 2);
+        in02 = _mm_srai_epi16(in02, 2);
+        in03 = _mm_srai_epi16(in03, 2);
+        in04 = _mm_srai_epi16(in04, 2);
+        in05 = _mm_srai_epi16(in05, 2);
+        in06 = _mm_srai_epi16(in06, 2);
+        in07 = _mm_srai_epi16(in07, 2);
+        in08 = _mm_srai_epi16(in08, 2);
+        in09 = _mm_srai_epi16(in09, 2);
+        in10 = _mm_srai_epi16(in10, 2);
+        in11 = _mm_srai_epi16(in11, 2);
+        in12 = _mm_srai_epi16(in12, 2);
+        in13 = _mm_srai_epi16(in13, 2);
+        in14 = _mm_srai_epi16(in14, 2);
+        in15 = _mm_srai_epi16(in15, 2);
+      }
+      in += 8;  // step to the next group of eight columns
+      // Input for the first 8 results: in[i] combined with in[15 - i].
+      {
+        input0 = ADD_EPI16(in00, in15);
+        input1 = ADD_EPI16(in01, in14);
+        input2 = ADD_EPI16(in02, in13);
+        input3 = ADD_EPI16(in03, in12);
+        input4 = ADD_EPI16(in04, in11);
+        input5 = ADD_EPI16(in05, in10);
+        input6 = ADD_EPI16(in06, in09);
+        input7 = ADD_EPI16(in07, in08);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
+                                           &input4, &input5, &input6, &input7);
+        if (overflow) {
+          vpx_highbd_fdct16x16_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Input for the next 8 results: in[7 - i] combined with in[8 + i].
+      {
+        step1_0 = SUB_EPI16(in07, in08);
+        step1_1 = SUB_EPI16(in06, in09);
+        step1_2 = SUB_EPI16(in05, in10);
+        step1_3 = SUB_EPI16(in04, in11);
+        step1_4 = SUB_EPI16(in03, in12);
+        step1_5 = SUB_EPI16(in02, in13);
+        step1_6 = SUB_EPI16(in01, in14);
+        step1_7 = SUB_EPI16(in00, in15);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+                                           &step1_2, &step1_3,
+                                           &step1_4, &step1_5,
+                                           &step1_6, &step1_7);
+        if (overflow) {
+          vpx_highbd_fdct16x16_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Work on the first eight values; fdct8(input, even_results);
+      {
+        // Add/subtract
+        const __m128i q0 = ADD_EPI16(input0, input7);
+        const __m128i q1 = ADD_EPI16(input1, input6);
+        const __m128i q2 = ADD_EPI16(input2, input5);
+        const __m128i q3 = ADD_EPI16(input3, input4);
+        const __m128i q4 = SUB_EPI16(input3, input4);
+        const __m128i q5 = SUB_EPI16(input2, input5);
+        const __m128i q6 = SUB_EPI16(input1, input6);
+        const __m128i q7 = SUB_EPI16(input0, input7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+                                           &q4, &q5, &q6, &q7);
+        if (overflow) {
+          vpx_highbd_fdct16x16_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+        // Work on first four results (res00, res08, res04, res12)
+        {
+          // Add/subtract
+          const __m128i r0 = ADD_EPI16(q0, q3);
+          const __m128i r1 = ADD_EPI16(q1, q2);
+          const __m128i r2 = SUB_EPI16(q1, q2);
+          const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          {
+            const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+            const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+            const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+            const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+            res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+            res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+            res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+            res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+            overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
+            if (overflow) {
+              vpx_highbd_fdct16x16_c(input, output, stride);
+              return;
+            }
+#endif  // DCT_HIGH_BIT_DEPTH
+          }
+        }
+        // Work on next four results (res02, res14, res10, res06)
+        {
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+          const __m128i r0 = mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
+                                              &k__DCT_CONST_ROUNDING,
+                                              DCT_CONST_BITS);
+          const __m128i r1 = mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
+                                              &k__DCT_CONST_ROUNDING,
+                                              DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x2(&r0, &r1);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          {
+            // Add/subtract
+            const __m128i x0 = ADD_EPI16(q4, r0);
+            const __m128i x1 = SUB_EPI16(q4, r0);
+            const __m128i x2 = SUB_EPI16(q7, r1);
+            const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+            overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+            if (overflow) {
+              vpx_highbd_fdct16x16_c(input, output, stride);
+              return;
+            }
+#endif  // DCT_HIGH_BIT_DEPTH
+            // Interleave to do the multiply by constants which gets us
+            // into 32 bits.
+            {
+              const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+              const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+              const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+              const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+              res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+              res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+              res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+              res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+              overflow = check_epi16_overflow_x4(&res02, &res14,
+                                                 &res10, &res06);
+              if (overflow) {
+                vpx_highbd_fdct16x16_c(input, output, stride);
+                return;
+              }
+#endif  // DCT_HIGH_BIT_DEPTH
+            }
+          }
+        }
+      }
+      // Work on the next eight values; step1 -> odd_results
+      {
+        // step 2: combine step1_2..step1_5 with the cospi_16_64 constant pairs
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+          step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5,
+                                             &step2_4);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        // step 3: add/subtract step1_0, 1, 6, 7 with the step 2 outputs
+        {
+          step3_0 = ADD_EPI16(step1_0, step2_3);
+          step3_1 = ADD_EPI16(step1_1, step2_2);
+          step3_2 = SUB_EPI16(step1_1, step2_2);
+          step3_3 = SUB_EPI16(step1_0, step2_3);
+          step3_4 = SUB_EPI16(step1_7, step2_4);
+          step3_5 = SUB_EPI16(step1_6, step2_5);
+          step3_6 = ADD_EPI16(step1_6, step2_5);
+          step3_7 = ADD_EPI16(step1_7, step2_4);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step3_0, &step3_1,
+                                             &step3_2, &step3_3,
+                                             &step3_4, &step3_5,
+                                             &step3_6, &step3_7);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        // step 4: combine step3_1, 2, 5, 6 with cospi_8_64/cospi_24_64 pairs
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+          step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6,
+                                             &step2_5);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        // step 5: add/subtract step3_0, 3, 4, 7 with the step 4 outputs
+        {
+          step1_0 = ADD_EPI16(step3_0, step2_1);
+          step1_1 = SUB_EPI16(step3_0, step2_1);
+          step1_2 = ADD_EPI16(step3_3, step2_2);
+          step1_3 = SUB_EPI16(step3_3, step2_2);
+          step1_4 = SUB_EPI16(step3_4, step2_5);
+          step1_5 = ADD_EPI16(step3_4, step2_5);
+          step1_6 = SUB_EPI16(step3_7, step2_6);
+          step1_7 = ADD_EPI16(step3_7, step2_6);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+                                             &step1_2, &step1_3,
+                                             &step1_4, &step1_5,
+                                             &step1_6, &step1_7);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        // step 6: final constant multiplies giving the odd-numbered results
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+          res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+          res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+      }
+      // Transpose the results, do it as two 8x8 transposes.
+      transpose_and_output8x8(&res00, &res01, &res02, &res03,
+                              &res04, &res05, &res06, &res07,
+                              pass, out0, out1);
+      transpose_and_output8x8(&res08, &res09, &res10, &res11,
+                              &res12, &res13, &res14, &res15,
+                              pass, out0 + 8, out1 + 8);
+      if (pass == 0) {
+        out0 += 8*16;
+      } else {
+        out1 += 8*16;
+      }
+    }
+    // Setup in/out for next pass: pass 1 reads from the intermediate buffer.
+    in = intermediate;
+  }
+}
+
+#undef ADD_EPI16
+#undef SUB_EPI16
diff --git a/libs/libvpx/vp10/common/x86/vp10_fwd_txfm_sse2.c b/libs/libvpx/vp10/common/x86/vp10_fwd_txfm_sse2.c
new file mode 100644
index 0000000000..032c3ccd1d
--- /dev/null
+++ b/libs/libvpx/vp10/common/x86/vp10_fwd_txfm_sse2.c
@@ -0,0 +1,271 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+
+// DC-only 4x4 forward transform: sums the 16 input residuals, doubles the
+// total and passes the vector (result in 32-bit lane 0) to store_output().
+// NOTE(review): store_output() is declared elsewhere; its layout is not shown.
+void vp10_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+  const __m128i kZero = _mm_setzero_si128();
+  // Each row supplies four 16-bit samples (the low 64 bits of a register).
+  const __m128i row0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  const __m128i row1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+  const __m128i row2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+  const __m128i row3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+  // Pack rows (0,3) and (1,2) into one register each and add them lane-wise.
+  const __m128i rows03 = _mm_unpacklo_epi64(row0, row3);
+  const __m128i rows12 = _mm_unpacklo_epi64(row1, row2);
+  const __m128i sum16 = _mm_add_epi16(rows03, rows12);
+  // Widen to 32 bits: park each value in the upper half of a 32-bit lane, then
+  // arithmetic-shift it back down so the sign is extended.
+  const __m128i lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(kZero, sum16), 16);
+  const __m128i hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(kZero, sum16), 16);
+  const __m128i sum32 = _mm_add_epi32(lo32, hi32);
+  // Fold the four 32-bit partial sums so that lane 0 holds the grand total.
+  const __m128i lo_half = _mm_unpacklo_epi32(sum32, kZero);
+  const __m128i hi_half = _mm_unpackhi_epi32(sum32, kZero);
+  const __m128i folded = _mm_add_epi32(lo_half, hi_half);
+  const __m128i total = _mm_add_epi32(folded, _mm_srli_si128(folded, 8));
+  // The value handed on is twice the sum.
+  __m128i dc = _mm_slli_epi32(total, 1);
+  store_output(&dc, output);
+}
+
+// DC-only 8x8 forward transform: sums all 64 input residuals and passes the
+// vector holding that total (in 32-bit lane 0) to store_output(). No further
+// scaling is applied here. Rows are `stride` int16_t elements apart.
+// NOTE(review): store_output() is declared outside this file; how it lays the
+// vector out in `output` is not visible here.
+void vp10_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+  const __m128i kZero = _mm_setzero_si128();
+  __m128i acc16 = kZero;  // running 16-bit sum of each of the 8 columns
+  __m128i lo32, hi32, sum32;
+  __m128i lo_half, hi_half, folded, upper, dc;
+  int row;
+
+  // Accumulate the eight rows lane-wise. _mm_load_si128 is an aligned load, so
+  // every row must start on a 16-byte boundary. 16-bit adds wrap around, so
+  // the order in which the rows are summed does not change the result.
+  for (row = 0; row < 8; ++row) {
+    const __m128i v = _mm_load_si128((const __m128i *)(input + row * stride));
+    acc16 = _mm_add_epi16(acc16, v);
+  }
+
+  // Widen the column sums to 32 bits: park each one in the upper half of a
+  // 32-bit lane, then arithmetic-shift right by 16 to sign-extend it.
+  lo32 = _mm_unpacklo_epi16(kZero, acc16);
+  hi32 = _mm_unpackhi_epi16(kZero, acc16);
+  lo32 = _mm_srai_epi32(lo32, 16);
+  hi32 = _mm_srai_epi32(hi32, 16);
+  sum32 = _mm_add_epi32(lo32, hi32);
+
+  // Fold the four 32-bit partial sums: interleaving with zero spreads them
+  // over 64-bit slots, then two adds leave the grand total in lane 0.
+  lo_half = _mm_unpacklo_epi32(sum32, kZero);
+  hi_half = _mm_unpackhi_epi32(sum32, kZero);
+  folded = _mm_add_epi32(lo_half, hi_half);
+  upper = _mm_srli_si128(folded, 8);
+  dc = _mm_add_epi32(folded, upper);
+
+  // Lane 0 of dc now holds the sum of the whole block.
+  store_output(&dc, output);
+}
+
+// DC-only 16x16 forward transform: sums all 256 input residuals, halves the
+// total (arithmetic shift right by 1) and passes the vector holding the result
+// (in 32-bit lane 0) to store_output(). Rows are `stride` int16_t elements
+// apart and are read with aligned 128-bit loads.
+void vp10_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  const __m128i kZero = _mm_setzero_si128();
+  __m128i acc16 = kZero;  // running 16-bit sums, one per lane
+  __m128i r0, r1, r2, r3;
+  __m128i lo32, hi32, sum32;
+  __m128i lo_half, hi_half, folded, upper, dc;
+  int half;
+
+  // The block is processed as two 8-column halves of 16 rows each. 16-bit adds
+  // wrap around, so the grouping of the additions does not affect the result.
+  for (half = 0; half < 2; ++half) {
+    const int16_t *col = input + 8 * half;
+
+    // Rows 0-3.
+    r0 = _mm_load_si128((const __m128i *)(col + 0 * stride));
+    r1 = _mm_load_si128((const __m128i *)(col + 1 * stride));
+    r2 = _mm_load_si128((const __m128i *)(col + 2 * stride));
+    r3 = _mm_load_si128((const __m128i *)(col + 3 * stride));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(r0, r1));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(r2, r3));
+
+    // Rows 4-7.
+    r0 = _mm_load_si128((const __m128i *)(col + 4 * stride));
+    r1 = _mm_load_si128((const __m128i *)(col + 5 * stride));
+    r2 = _mm_load_si128((const __m128i *)(col + 6 * stride));
+    r3 = _mm_load_si128((const __m128i *)(col + 7 * stride));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(r0, r1));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(r2, r3));
+
+    // Rows 8-11.
+    r0 = _mm_load_si128((const __m128i *)(col + 8 * stride));
+    r1 = _mm_load_si128((const __m128i *)(col + 9 * stride));
+    r2 = _mm_load_si128((const __m128i *)(col + 10 * stride));
+    r3 = _mm_load_si128((const __m128i *)(col + 11 * stride));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(r0, r1));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(r2, r3));
+
+    // Rows 12-15.
+    r0 = _mm_load_si128((const __m128i *)(col + 12 * stride));
+    r1 = _mm_load_si128((const __m128i *)(col + 13 * stride));
+    r2 = _mm_load_si128((const __m128i *)(col + 14 * stride));
+    r3 = _mm_load_si128((const __m128i *)(col + 15 * stride));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(r0, r1));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(r2, r3));
+  }
+
+  // Widen the lane sums to 32 bits: park each one in the upper half of a
+  // 32-bit lane, then arithmetic-shift right by 16 to sign-extend it.
+  lo32 = _mm_unpacklo_epi16(kZero, acc16);
+  hi32 = _mm_unpackhi_epi16(kZero, acc16);
+  lo32 = _mm_srai_epi32(lo32, 16);
+  hi32 = _mm_srai_epi32(hi32, 16);
+  sum32 = _mm_add_epi32(lo32, hi32);
+
+  // Fold the four 32-bit partial sums into lane 0, then apply the >> 1 scale.
+  lo_half = _mm_unpacklo_epi32(sum32, kZero);
+  hi_half = _mm_unpackhi_epi32(sum32, kZero);
+  folded = _mm_add_epi32(lo_half, hi_half);
+  upper = _mm_srli_si128(folded, 8);
+  dc = _mm_add_epi32(folded, upper);
+  dc = _mm_srai_epi32(dc, 1);
+  store_output(&dc, output);
+}
+
+// DC-only 32x32 forward transform: sums all 1024 input residuals, divides the
+// total by 8 (arithmetic shift right by 3) and passes the vector holding the
+// result (in 32-bit lane 0) to store_output(). Rows are `stride` int16_t
+// elements apart and are read with aligned 128-bit loads.
+void vp10_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  const __m128i kZero = _mm_setzero_si128();
+  const int16_t *src = input;  // start of the row being read
+  __m128i acc16 = kZero;       // running 16-bit sums, one per lane
+  __m128i c0, c1, c2, c3;
+  __m128i lo32, hi32, sum32;
+  __m128i lo_half, hi_half, folded, upper, dc;
+  int group;
+
+  // Eight groups of four rows; every row is four registers (32 samples) wide.
+  // 16-bit adds wrap around, so regrouping the additions is result-neutral.
+  for (group = 0; group < 8; ++group) {
+    // First row of the group.
+    c0 = _mm_load_si128((const __m128i *)(src + 0));
+    c1 = _mm_load_si128((const __m128i *)(src + 8));
+    c2 = _mm_load_si128((const __m128i *)(src + 16));
+    c3 = _mm_load_si128((const __m128i *)(src + 24));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(c0, c1));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(c2, c3));
+    src += stride;
+
+    // Second row.
+    c0 = _mm_load_si128((const __m128i *)(src + 0));
+    c1 = _mm_load_si128((const __m128i *)(src + 8));
+    c2 = _mm_load_si128((const __m128i *)(src + 16));
+    c3 = _mm_load_si128((const __m128i *)(src + 24));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(c0, c1));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(c2, c3));
+    src += stride;
+
+    // Third row.
+    c0 = _mm_load_si128((const __m128i *)(src + 0));
+    c1 = _mm_load_si128((const __m128i *)(src + 8));
+    c2 = _mm_load_si128((const __m128i *)(src + 16));
+    c3 = _mm_load_si128((const __m128i *)(src + 24));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(c0, c1));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(c2, c3));
+    src += stride;
+
+    // Fourth row.
+    c0 = _mm_load_si128((const __m128i *)(src + 0));
+    c1 = _mm_load_si128((const __m128i *)(src + 8));
+    c2 = _mm_load_si128((const __m128i *)(src + 16));
+    c3 = _mm_load_si128((const __m128i *)(src + 24));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(c0, c1));
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(c2, c3));
+    src += stride;
+  }
+
+  // Widen the lane sums to 32 bits: park each one in the upper half of a
+  // 32-bit lane, then arithmetic-shift right by 16 to sign-extend it.
+  lo32 = _mm_unpacklo_epi16(kZero, acc16);
+  hi32 = _mm_unpackhi_epi16(kZero, acc16);
+  lo32 = _mm_srai_epi32(lo32, 16);
+  hi32 = _mm_srai_epi32(hi32, 16);
+  sum32 = _mm_add_epi32(lo32, hi32);
+
+  // Fold the four 32-bit partial sums into lane 0, then apply the >> 3 scale.
+  lo_half = _mm_unpacklo_epi32(sum32, kZero);
+  hi_half = _mm_unpackhi_epi32(sum32, kZero);
+  folded = _mm_add_epi32(lo_half, hi_half);
+  upper = _mm_srli_si128(folded, 8);
+  dc = _mm_add_epi32(folded, upper);
+  dc = _mm_srai_epi32(dc, 3);
+  store_output(&dc, output);
+}
+// The impl headers are re-included with different macro settings per variant.
+#define DCT_HIGH_BIT_DEPTH 0  // redefined to 1 for the second group below
+#define FDCT4x4_2D vp10_fdct4x4_sse2
+#define FDCT8x8_2D vp10_fdct8x8_sse2
+#define FDCT16x16_2D vp10_fdct16x16_sse2
+#include "vp10/common/x86/vp10_fwd_txfm_impl_sse2.h"
+#undef  FDCT4x4_2D
+#undef  FDCT8x8_2D
+#undef  FDCT16x16_2D
+
+#define FDCT32x32_2D vp10_fdct32x32_rd_sse2  // "_rd" name: HIGH_PRECISION is 0
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h"
+#undef  FDCT32x32_2D
+#undef  FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D vp10_fdct32x32_sse2  // same header, HIGH_PRECISION is 1
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h"  // NOLINT
+#undef  FDCT32x32_2D
+#undef  FDCT32x32_HIGH_PRECISION
+#undef  DCT_HIGH_BIT_DEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define DCT_HIGH_BIT_DEPTH 1  // repeat the three inclusions as vp10_highbd_*
+#define FDCT4x4_2D vp10_highbd_fdct4x4_sse2
+#define FDCT8x8_2D vp10_highbd_fdct8x8_sse2
+#define FDCT16x16_2D vp10_highbd_fdct16x16_sse2
+#include "vp10/common/x86/vp10_fwd_txfm_impl_sse2.h" // NOLINT
+#undef  FDCT4x4_2D
+#undef  FDCT8x8_2D
+#undef  FDCT16x16_2D
+
+#define FDCT32x32_2D vp10_highbd_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef  FDCT32x32_2D
+#undef  FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D vp10_highbd_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vp10/common/x86/vp10_fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef  FDCT32x32_2D
+#undef  FDCT32x32_HIGH_PRECISION
+#undef  DCT_HIGH_BIT_DEPTH
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vp10/common/x86/vp10_inv_txfm_sse2.c b/libs/libvpx/vp10/common/x86/vp10_inv_txfm_sse2.c
new file mode 100644
index 0000000000..b25e22e0ee
--- /dev/null
+++ b/libs/libvpx/vp10/common/x86/vp10_inv_txfm_sse2.c
@@ -0,0 +1,4058 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/x86/vp10_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+// Adds four 16-bit residuals (in_x) to four pixels at dest, clamping to 0..255.
+#define RECON_AND_STORE4X4(dest, in_x) \
+{  /* needs a zeroed __m128i named `zero` in the enclosing scope */ \
+  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); /* load 4 bytes */ \
+  d0 = _mm_unpacklo_epi8(d0, zero); /* widen bytes to 16-bit lanes */ \
+  d0 = _mm_add_epi16(in_x, d0); \
+  d0 = _mm_packus_epi16(d0, d0); /* unsigned saturation back to bytes */ \
+  *(int *)(dest) = _mm_cvtsi128_si32(d0); /* store 4 bytes */ \
+}
+// 2-D 4x4 inverse DCT of `input`; the rounded result is added to `dest`.
+void vp10_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i eight = _mm_set1_epi16(8);  // rounding bias for the final >> 4
+  const __m128i cst = _mm_setr_epi16(  // (a, b) multiplier pairs for madd
+      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
+      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i input0, input1, input2, input3;
+
+  // Rows: first 1-D pass. Aligned loads of coefficients 0..7 and 8..15.
+  input0 = _mm_load_si128((const __m128i *)input);
+  input2 = _mm_load_si128((const __m128i *)(input + 8));
+
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0 (lanes listed from high to low)
+  input0 = _mm_shufflelo_epi16(input0, 0xd8);  // 0xd8 swaps elements 1 and 2
+  input0 = _mm_shufflehi_epi16(input0, 0xd8);
+  input2 = _mm_shufflelo_epi16(input2, 0xd8);
+  input2 = _mm_shufflehi_epi16(input2, 0xd8);
+
+  input1 = _mm_unpackhi_epi32(input0, input0);  // duplicate each 32-bit pair
+  input0 = _mm_unpacklo_epi32(input0, input0);
+  input3 = _mm_unpackhi_epi32(input2, input2);
+  input2 = _mm_unpacklo_epi32(input2, input2);
+
+  // Stage 1: lanes = (i0+i2)*c16, (i0-i2)*c16, i1*c24-i3*c8, i1*c8+i3*c24
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2: pack back to 16 bits (signed saturation)
+  input0 = _mm_packs_epi32(input0, input1);
+  input1 = _mm_packs_epi32(input2, input3);
+
+  // Transpose
+  input2 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpackhi_epi16(input0, input1);
+  input0 = _mm_unpacklo_epi32(input2, input3);
+  input1 = _mm_unpackhi_epi32(input2, input3);
+
+  // Swap column 2 and column 3 (the two 64-bit halves of input1), which gives:
+  // input2: column1, column 0;  input3: column2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Columns: second 1-D pass, same sequence as the row pass.
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0 (lanes listed from high to low)
+  input0 = _mm_unpacklo_epi32(input2, input2);
+  input1 = _mm_unpackhi_epi32(input2, input2);
+  input2 = _mm_unpackhi_epi32(input3, input3);
+  input3 = _mm_unpacklo_epi32(input3, input3);
+
+  // Stage 1: multiply-add by the constant pairs, then round and shift
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2: pack back to 16 bits (signed saturation)
+  input0 = _mm_packs_epi32(input0, input2);
+  input1 = _mm_packs_epi32(input1, input3);
+
+  // Transpose
+  input2 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpackhi_epi16(input0, input1);
+  input0 = _mm_unpacklo_epi32(input2, input3);
+  input1 = _mm_unpackhi_epi32(input2, input3);
+
+  // Swap column 2 and column 3 (the two 64-bit halves of input1), which gives:
+  // input2: column1, column 0;  input3: column2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Final round and shift: (x + 8) >> 4
+  input2 = _mm_add_epi16(input2, eight);
+  input3 = _mm_add_epi16(input3, eight);
+
+  input2 = _mm_srai_epi16(input2, 4);
+  input3 = _mm_srai_epi16(input3, 4);
+
+  // Reconstruction and Store: four destination rows of four bytes each
+  {
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+    d0 = _mm_unpacklo_epi32(d0,
+                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+    d2 = _mm_unpacklo_epi32(  // row 3 first, then row 2
+        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
+    d0 = _mm_unpacklo_epi8(d0, zero);
+    d2 = _mm_unpacklo_epi8(d2, zero);
+    d0 = _mm_add_epi16(d0, input2);
+    d2 = _mm_add_epi16(d2, input3);
+    d0 = _mm_packus_epi16(d0, d2);  // bytes now ordered: row 0, 1, 3, 2
+    // store row 0
+    *(int *)dest = _mm_cvtsi128_si32(d0);
+    // store row 1
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+    // store row 3 (d2 was packed as row 3 followed by row 2)
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+    // store row 2
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+  }
+}
+
+// DC-only 4x4 inverse transform: derives one rounded value from input[0] and
+// adds it to every pixel of the 4x4 block at dest via RECON_AND_STORE4X4.
+void vp10_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();  // read by RECON_AND_STORE4X4
+  __m128i dc_value;
+  int dc, row;
+
+  // Multiply by cospi_16_64 twice, round-shifting each time, then round >> 4.
+  dc = dct_const_round_shift(input[0] * cospi_16_64);
+  dc = dct_const_round_shift(dc * cospi_16_64);
+  dc = ROUND_POWER_OF_TWO(dc, 4);
+  dc_value = _mm_set1_epi16(dc);
+  for (row = 0; row < 4; ++row) {
+    RECON_AND_STORE4X4(dest + row * stride, dc_value);
+  }
+}
+
+// In-place transpose of a 4x4 block of 16-bit values held in res[0..1].
+static INLINE void transpose_4x4(__m128i *res) {
+  const __m128i lo_mix = _mm_unpacklo_epi16(res[0], res[1]);
+  const __m128i hi_mix = _mm_unpackhi_epi16(res[0], res[1]);
+  res[0] = _mm_unpacklo_epi16(lo_mix, hi_mix);
+  res[1] = _mm_unpackhi_epi16(lo_mix, hi_mix);
+}
+// 1-D 4-point inverse DCT over a 4x4 block in in[0..1]; transposes first.
+void vp10_idct4_sse2(__m128i *in) {
+  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8];  // only the first four entries of each are used here
+
+  transpose_4x4(in);
+  // stage 1: interleave the two registers, then multiply-add constant pairs
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+  u[0] = _mm_packs_epi32(v[0], v[1]);
+  u[1] = _mm_packs_epi32(v[3], v[2]);  // note: v[3] goes in the low half
+
+  // stage 2: butterfly; the difference then has its 64-bit halves swapped
+  in[0] = _mm_add_epi16(u[0], u[1]);
+  in[1] = _mm_sub_epi16(u[0], u[1]);
+  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
+}
+// 1-D 4-point inverse ADST over a 4x4 block in in[0..1]; transposes first.
+void vp10_iadst4_sse2(__m128i *in) {
+  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
+  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
+  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
+  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
+  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8], in7;
+
+  transpose_4x4(in);
+  in7 = _mm_srli_si128(in[1], 8);  // upper four lanes of in[1] moved down
+  in7 = _mm_add_epi16(in7, in[0]);
+  in7 = _mm_sub_epi16(in7, in[1]);  // only the low four lanes are used below
+
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
+  u[2] = _mm_unpacklo_epi16(in7, kZero);  // paired with 0: madd is a multiply
+  u[3] = _mm_unpackhi_epi16(in[0], kZero);
+
+  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
+  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
+  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
+  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
+  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
+  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
+
+  u[0] = _mm_add_epi32(v[0], v[1]);
+  u[1] = _mm_add_epi32(v[3], v[4]);
+  u[2] = v[2];
+  u[3] = _mm_add_epi32(u[0], u[1]);
+  u[4] = _mm_slli_epi32(v[5], 2);  // 4 * v[5]
+  u[5] = _mm_add_epi32(u[3], v[5]);
+  u[6] = _mm_sub_epi32(u[5], u[4]);  // u[0] + u[1] - 3 * v[5]
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[1]);  // back to 16 bits, signed saturation
+  in[1] = _mm_packs_epi32(u[2], u[3]);
+}
+// 8x8 transpose of 16-bit lanes: in0..in7 are rows, out0..out7 the columns.
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
+                      out0, out1, out2, out3, out4, out5, out6, out7) \
+  {  /* outs may alias ins: every input is read before any out is written */ \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
+    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
+    /* second level: interleave 32-bit pairs */         \
+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+    /* third level: join 64-bit halves into full columns */ \
+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
+  }
+// Interleaves four registers into out0..out3 (call sites are not shown here).
+#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
+                         out0, out1, out2, out3) \
+  {  /* NOTE(review): operand order below is deliberate-looking; confirm */ \
+    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
+    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
+    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
+    /* high(tmp0,tmp1) with low(tmp1,tmp0); low(tmp2,tmp3) with high(tmp3,tmp2) */ \
+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+    /* join 64-bit halves */ \
+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+  }
+// Transposes the low four lanes of in0..in3 (a 4x4 block) into out0 and out1.
+#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
+  {  /* the upper four lanes of every input are ignored */ \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* columns 0 and 1 */ \
+    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* columns 2 and 3 */ \
+  }
+// For two (lo, hi) pairs of interleaved 16-bit inputs: madd with two constants
+// each, round, shift by DCT_CONST_BITS and pack to 16 bits -> res0..res3.
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
+                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
+  {   /* uses tmp0..tmp7 and `rounding` from the enclosing scope */ \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      tmp4 = _mm_madd_epi16(lo_1, cst2); \
+      tmp5 = _mm_madd_epi16(hi_1, cst2); \
+      tmp6 = _mm_madd_epi16(lo_1, cst3); \
+      tmp7 = _mm_madd_epi16(hi_1, cst3); \
+      /* add the rounding term */ \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      tmp4 = _mm_add_epi32(tmp4, rounding); \
+      tmp5 = _mm_add_epi32(tmp5, rounding); \
+      tmp6 = _mm_add_epi32(tmp6, rounding); \
+      tmp7 = _mm_add_epi32(tmp7, rounding); \
+      /* arithmetic shift back down */ \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+      /* pack lo/hi halves to 16 bits with signed saturation */ \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+      res2 = _mm_packs_epi32(tmp4, tmp5); \
+      res3 = _mm_packs_epi32(tmp6, tmp7); \
+  }
+// Single-pair form of MULTIPLICATION_AND_ADD: one (lo, hi) input, two results.
+#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
+  {   /* uses tmp0..tmp3 and `rounding` from the enclosing scope */ \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      /* add the rounding term */ \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      /* arithmetic shift back down */ \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      /* pack lo/hi halves to 16 bits with signed saturation */ \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+  }
+// One 1-D 8-point inverse DCT pass over eight registers, in four stages.
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
+              out0, out1, out2, out3, out4, out5, out6, out7)  \
+  { /* caller declares stg1_*, stg2_*, rounding, tmp0..7, stp1_*, stp2_* */ \
+  /* Stage1: odd inputs (1,7) and (3,5) -> stp1_4, stp1_7, stp1_5, stp1_6 */ \
+  { \
+    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
+    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
+    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
+    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
+    \
+    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
+                          stg1_1, stg1_2, stg1_3, stp1_4,      \
+                          stp1_7, stp1_5, stp1_6)              \
+  } \
+    \
+  /* Stage2: even inputs (0,4) and (2,6) -> stp2_0..3; odd-half butterflies */ \
+  { \
+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
+    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
+    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
+    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
+    \
+    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
+                           stg2_1, stg2_2, stg2_3, stp2_0,     \
+                           stp2_1, stp2_2, stp2_3)             \
+    /* saturating 16-bit adds/subs from here on */ \
+    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
+    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
+    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
+    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+  } \
+    \
+  /* Stage3: even-half butterflies; rotate (stp2_6, stp2_5) by stg2_1/stg2_0 */ \
+  { \
+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+    \
+    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
+    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
+    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
+    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+    \
+    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
+    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
+    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
+    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
+    \
+    tmp0 = _mm_add_epi32(tmp0, rounding); \
+    tmp1 = _mm_add_epi32(tmp1, rounding); \
+    tmp2 = _mm_add_epi32(tmp2, rounding); \
+    tmp3 = _mm_add_epi32(tmp3, rounding); \
+    \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+    \
+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  } \
+  \
+  /* Stage4: final butterflies; out[k] and out[7-k] are sum and difference */ \
+  out0 = _mm_adds_epi16(stp1_0, stp2_7); \
+  out1 = _mm_adds_epi16(stp1_1, stp1_6); \
+  out2 = _mm_adds_epi16(stp1_2, stp1_5); \
+  out3 = _mm_adds_epi16(stp1_3, stp2_4); \
+  out4 = _mm_subs_epi16(stp1_3, stp2_4); \
+  out5 = _mm_subs_epi16(stp1_2, stp1_5); \
+  out6 = _mm_subs_epi16(stp1_1, stp1_6); \
+  out7 = _mm_subs_epi16(stp1_0, stp2_7); \
+  }
+// 2-D 8x8 inverse DCT of all 64 coefficients; the result is added to `dest`.
+void vp10_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();  // presumably for RECON_AND_STORE
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);  // bias for the >> 5
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+
+  // Load input data: eight aligned loads of eight coefficients each.
+  in0 = _mm_load_si128((const __m128i *)input);
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+
+  // 2-D: two identical passes, each a transpose followed by a 1-D transform
+  for (i = 0; i < 2; i++) {
+    // 8x8 Transpose is copied from vp10_fdct8x8_sse2()
+    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
+                  in0, in1, in2, in3, in4, in5, in6, in7);
+
+    // 4-stage 1D vp10_idct8x8 (the stg*/stp*/tmp* locals are used by name)
+    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+          in0, in1, in2, in3, in4, in5, in6, in7);
+  }
+
+  // Final rounding and shift: (x + 16) >> 5, with a saturating add
+  in0 = _mm_adds_epi16(in0, final_rounding);
+  in1 = _mm_adds_epi16(in1, final_rounding);
+  in2 = _mm_adds_epi16(in2, final_rounding);
+  in3 = _mm_adds_epi16(in3, final_rounding);
+  in4 = _mm_adds_epi16(in4, final_rounding);
+  in5 = _mm_adds_epi16(in5, final_rounding);
+  in6 = _mm_adds_epi16(in6, final_rounding);
+  in7 = _mm_adds_epi16(in7, final_rounding);
+
+  in0 = _mm_srai_epi16(in0, 5);
+  in1 = _mm_srai_epi16(in1, 5);
+  in2 = _mm_srai_epi16(in2, 5);
+  in3 = _mm_srai_epi16(in3, 5);
+  in4 = _mm_srai_epi16(in4, 5);
+  in5 = _mm_srai_epi16(in5, 5);
+  in6 = _mm_srai_epi16(in6, 5);
+  in7 = _mm_srai_epi16(in7, 5);
+  // RECON_AND_STORE is not defined in this file; one call per destination row.
+  RECON_AND_STORE(dest + 0 * stride, in0);
+  RECON_AND_STORE(dest + 1 * stride, in1);
+  RECON_AND_STORE(dest + 2 * stride, in2);
+  RECON_AND_STORE(dest + 3 * stride, in3);
+  RECON_AND_STORE(dest + 4 * stride, in4);
+  RECON_AND_STORE(dest + 5 * stride, in5);
+  RECON_AND_STORE(dest + 6 * stride, in6);
+  RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+// DC-only 8x8 inverse transform: derives one rounded value from input[0] and
+// applies it to each of the eight rows at dest through RECON_AND_STORE.
+// NOTE(review): RECON_AND_STORE is defined outside this file.
+void vp10_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();  // presumably for RECON_AND_STORE
+  __m128i dc_value;
+  int dc, row;
+
+  // Multiply by cospi_16_64 twice, round-shifting each time, then round >> 5.
+  dc = dct_const_round_shift(input[0] * cospi_16_64);
+  dc = dct_const_round_shift(dc * cospi_16_64);
+  dc = ROUND_POWER_OF_TWO(dc, 5);
+  dc_value = _mm_set1_epi16(dc);
+
+  // Every row receives the same broadcast value.
+  for (row = 0; row < 8; ++row) {
+    RECON_AND_STORE(dest + row * stride, dc_value);
+  }
+}
+
+
+void vp10_idct8_sse2(__m128i *in) {  // in-place: transpose, then 1-D IDCT8
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  // 8x8 Transpose is copied from vp10_fdct8x8_sse2(); in[] -> in0..in7
+  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
+                in0, in1, in2, in3, in4, in5, in6, in7);
+
+  // 4-stage 1D vp10_idct8x8; results are written back into in[0..7].
+  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
+}
+
+// 1-D 8-point inverse ADST applied in place to the eight vectors in[0..7].
+// The input is transposed first, then run through three butterfly stages.
+// Each multiply stage widens to 32 bits with _mm_madd_epi16, adds
+// DCT_CONST_ROUNDING, shifts right by DCT_CONST_BITS and packs back to
+// 16 bits with signed saturation (_mm_packs_epi32).
+void vp10_iadst8_sse2(__m128i *in) {
+  // Coefficient pairs for _mm_madd_epi16; the name suffix encodes the sign
+  // (p = plus, m = minus) and the cospi index of each element of the pair.
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__const_0 = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // transpose
+  array_transpose_8x8(in, in);
+
+  // properly aligned for butterfly input
+  in0 = in[7];
+  in1 = in[0];
+  in2 = in[5];
+  in3 = in[2];
+  in4 = in[3];
+  in5 = in[4];
+  in6 = in[1];
+  in7 = in[6];
+
+  // column transformation
+  // stage 1
+  // interleave and multiply/add into 32-bit integer
+  s0 = _mm_unpacklo_epi16(in0, in1);
+  s1 = _mm_unpackhi_epi16(in0, in1);
+  s2 = _mm_unpacklo_epi16(in2, in3);
+  s3 = _mm_unpackhi_epi16(in2, in3);
+  s4 = _mm_unpacklo_epi16(in4, in5);
+  s5 = _mm_unpackhi_epi16(in4, in5);
+  s6 = _mm_unpacklo_epi16(in6, in7);
+  s7 = _mm_unpackhi_epi16(in6, in7);
+
+  // even-numbered u hold the low-half products, odd-numbered the high-half
+  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+  // addition
+  // w0..w7 are the sums u[k] + u[k + 8]; w8..w15 the differences
+  w0 = _mm_add_epi32(u0, u8);
+  w1 = _mm_add_epi32(u1, u9);
+  w2 = _mm_add_epi32(u2, u10);
+  w3 = _mm_add_epi32(u3, u11);
+  w4 = _mm_add_epi32(u4, u12);
+  w5 = _mm_add_epi32(u5, u13);
+  w6 = _mm_add_epi32(u6, u14);
+  w7 = _mm_add_epi32(u7, u15);
+  w8 = _mm_sub_epi32(u0, u8);
+  w9 = _mm_sub_epi32(u1, u9);
+  w10 = _mm_sub_epi32(u2, u10);
+  w11 = _mm_sub_epi32(u3, u11);
+  w12 = _mm_sub_epi32(u4, u12);
+  w13 = _mm_sub_epi32(u5, u13);
+  w14 = _mm_sub_epi32(u6, u14);
+  w15 = _mm_sub_epi32(u7, u15);
+
+  // shift and rounding
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+  // back to 16-bit and pack 8 integers into __m128i
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[1] = _mm_packs_epi32(u2, u3);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[3] = _mm_packs_epi32(u6, u7);
+  in[4] = _mm_packs_epi32(u8, u9);
+  in[5] = _mm_packs_epi32(u10, u11);
+  in[6] = _mm_packs_epi32(u12, u13);
+  in[7] = _mm_packs_epi32(u14, u15);
+
+  // stage 2
+  // in[0..3] only need 16-bit add/sub; in[4..7] go through another
+  // multiply/round/pack pass below.
+  s0 = _mm_add_epi16(in[0], in[2]);
+  s1 = _mm_add_epi16(in[1], in[3]);
+  s2 = _mm_sub_epi16(in[0], in[2]);
+  s3 = _mm_sub_epi16(in[1], in[3]);
+  u0 = _mm_unpacklo_epi16(in[4], in[5]);
+  u1 = _mm_unpackhi_epi16(in[4], in[5]);
+  u2 = _mm_unpacklo_epi16(in[6], in[7]);
+  u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+  w0 = _mm_add_epi32(v0, v4);
+  w1 = _mm_add_epi32(v1, v5);
+  w2 = _mm_add_epi32(v2, v6);
+  w3 = _mm_add_epi32(v3, v7);
+  w4 = _mm_sub_epi32(v0, v4);
+  w5 = _mm_sub_epi32(v1, v5);
+  w6 = _mm_sub_epi32(v2, v6);
+  w7 = _mm_sub_epi32(v3, v7);
+
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+  s4 = _mm_packs_epi32(u0, u1);
+  s5 = _mm_packs_epi32(u2, u3);
+  s6 = _mm_packs_epi32(u4, u5);
+  s7 = _mm_packs_epi32(u6, u7);
+
+  // stage 3
+  // only the (s2, s3) and (s6, s7) pairs are rotated by cospi_16_64;
+  // s0, s1, s4 and s5 pass through to the output unchanged.
+  u0 = _mm_unpacklo_epi16(s2, s3);
+  u1 = _mm_unpackhi_epi16(s2, s3);
+  u2 = _mm_unpacklo_epi16(s6, s7);
+  u3 = _mm_unpackhi_epi16(s6, s7);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  s2 = _mm_packs_epi32(v0, v1);
+  s3 = _mm_packs_epi32(v2, v3);
+  s6 = _mm_packs_epi32(v4, v5);
+  s7 = _mm_packs_epi32(v6, v7);
+
+  // write results back in output order; odd-indexed outputs are negated
+  // (computed as 0 - x)
+  in[0] = s0;
+  in[1] = _mm_sub_epi16(k__const_0, s4);
+  in[2] = s6;
+  in[3] = _mm_sub_epi16(k__const_0, s2);
+  in[4] = s3;
+  in[5] = _mm_sub_epi16(k__const_0, s7);
+  in[6] = s5;
+  in[7] = _mm_sub_epi16(k__const_0, s1);
+}
+
+// Reduced 8x8 inverse DCT whose result is added to the 8 rows at dest.
+// Only the first four 8-coefficient rows of input are loaded, and after the
+// transpose only two vectors are kept; all remaining inputs are treated as
+// zero. (The "_12" in the name presumably refers to the maximum number of
+// non-zero coefficients this path is meant for -- confirm with the caller.)
+// input is read with aligned loads (_mm_load_si128).
+void vp10_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // 1 << 4 is the rounding offset for the final arithmetic shift right by 5.
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  // Several of these locals are not referenced directly below; presumably
+  // the TRANSPOSE_* / IDCT8 macros use them by name -- confirm before
+  // renaming or removing any.
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  // Rows. Load 4-row input data.
+  in0 = _mm_load_si128((const __m128i *)input);
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+
+  // 8x4 Transpose
+  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
+  // Stage1
+  {
+    // The partner operand of each pair is zero, so every madd below is
+    // effectively a single multiply per 32-bit lane.
+    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
+    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
+
+    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
+    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
+    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
+    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    // Two 4-lane results are packed side by side into one 8-lane vector.
+    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
+  }
+
+  // Stage2
+  {
+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
+    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
+
+    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
+    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
+    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
+    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
+    // note the swapped order (tmp6 first) relative to the stp2_0 pack above
+    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
+
+    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
+    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
+
+    stp2_4 = tmp0;
+    // split the two 64-bit halves of tmp1 into separate vectors
+    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
+    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
+  }
+
+  // Stage3
+  {
+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
+
+    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
+    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
+
+    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
+    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
+
+    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
+    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
+  }
+
+  // Stage4
+  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
+  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
+  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
+  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
+
+  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
+
+  // Second 1-D pass: inputs 4..7 are passed as zero.
+  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
+        in0, in1, in2, in3, in4, in5, in6, in7);
+  // Final rounding and shift
+  in0 = _mm_adds_epi16(in0, final_rounding);
+  in1 = _mm_adds_epi16(in1, final_rounding);
+  in2 = _mm_adds_epi16(in2, final_rounding);
+  in3 = _mm_adds_epi16(in3, final_rounding);
+  in4 = _mm_adds_epi16(in4, final_rounding);
+  in5 = _mm_adds_epi16(in5, final_rounding);
+  in6 = _mm_adds_epi16(in6, final_rounding);
+  in7 = _mm_adds_epi16(in7, final_rounding);
+
+  in0 = _mm_srai_epi16(in0, 5);
+  in1 = _mm_srai_epi16(in1, 5);
+  in2 = _mm_srai_epi16(in2, 5);
+  in3 = _mm_srai_epi16(in3, 5);
+  in4 = _mm_srai_epi16(in4, 5);
+  in5 = _mm_srai_epi16(in5, 5);
+  in6 = _mm_srai_epi16(in6, 5);
+  in7 = _mm_srai_epi16(in7, 5);
+
+  // Add each result row to the corresponding row at dest.
+  RECON_AND_STORE(dest + 0 * stride, in0);
+  RECON_AND_STORE(dest + 1 * stride, in1);
+  RECON_AND_STORE(dest + 2 * stride, in2);
+  RECON_AND_STORE(dest + 3 * stride, in3);
+  RECON_AND_STORE(dest + 4 * stride, in4);
+  RECON_AND_STORE(dest + 5 * stride, in5);
+  RECON_AND_STORE(dest + 6 * stride, in6);
+  RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+// IDCT16: stages 2 through 6 of a 16-point 1-D inverse DCT, written as a
+// statement macro that works on names from the enclosing scope.
+//
+// Reads:   in[0..15], rounding, and the stg2_0..stg2_7, stg3_0..stg3_3,
+//          stg4_0..stg4_7 and stg6_0 coefficient pairs.
+// Writes:  stp1_*, stp2_*, stp1_8_0, stp1_12_0 and tmp0..tmp3 (the
+//          MULTIPLICATION_AND_ADD helper may use further temporaries --
+//          see its definition).
+// Result:  after stage 6 the values for the final butterfly are left in
+//          stp2_0..stp2_7, stp1_8, stp1_9, stp2_10..stp2_13, stp1_14 and
+//          stp1_15; the final add/sub stage is left to the caller.
+#define IDCT16 \
+  /* Stage2 */ \
+  { \
+    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
+    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
+    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
+    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
+    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
+    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
+    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
+    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
+    \
+    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
+                           stg2_0, stg2_1, stg2_2, stg2_3, \
+                           stp2_8, stp2_15, stp2_9, stp2_14) \
+    \
+    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
+                           stg2_4, stg2_5, stg2_6, stg2_7, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
+  } \
+    \
+  /* Stage3 */ \
+  { \
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
+    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
+    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
+    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
+    \
+    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
+                           stg3_0, stg3_1, stg3_2, stg3_3, \
+                           stp1_4, stp1_7, stp1_5, stp1_6) \
+    \
+    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
+    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
+    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+    \
+    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
+    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+  } \
+  \
+  /* Stage4 */ \
+  { \
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
+    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
+    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
+    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
+    \
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+    \
+    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
+                           stg4_0, stg4_1, stg4_2, stg4_3, \
+                           stp2_0, stp2_1, stp2_2, stp2_3) \
+    \
+    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+    \
+    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+                           stg4_4, stg4_5, stg4_6, stg4_7, \
+                           stp2_9, stp2_14, stp2_10, stp2_13) \
+  } \
+    \
+  /* Stage5 */ \
+  { \
+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+    \
+    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+    \
+    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+    \
+    tmp0 = _mm_add_epi32(tmp0, rounding); \
+    tmp1 = _mm_add_epi32(tmp1, rounding); \
+    tmp2 = _mm_add_epi32(tmp2, rounding); \
+    tmp3 = _mm_add_epi32(tmp3, rounding); \
+    \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+    \
+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+    \
+    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
+    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
+    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
+    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+    \
+    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
+    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
+    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+  } \
+    \
+  /* Stage6 */ \
+  { \
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+    \
+    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+    \
+    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                           stg6_0, stg4_0, stg6_0, stg4_0, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
+  }
+
+// IDCT16_10: reduced form of the 16-point 1-D inverse DCT stages 2 through 6
+// that reads only in[0..3]; every other input is replaced by `zero`, so
+// several butterflies collapse to plain copies.
+//
+// Reads:   in[0], in[1], in[2], in[3], zero, rounding, and the coefficient
+//          pairs stg2_0, stg2_1, stg2_6, stg2_7, stg3_0, stg3_1,
+//          stg4_0, stg4_1, stg4_4..stg4_7 and stg6_0.
+// Writes:  stp1_*, stp2_*, stp1_8_0, stp1_12_0 and tmp0..tmp3 (the
+//          MULTIPLICATION_AND_ADD / MULTIPLICATION_AND_ADD_2 helpers may use
+//          further temporaries -- see their definitions).
+// All of these names must be declared in the enclosing scope.
+#define IDCT16_10 \
+    /* Stage2 */ \
+    { \
+      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
+      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
+      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
+      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
+      \
+      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
+                             stg2_0, stg2_1, stg2_6, stg2_7, \
+                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
+    } \
+      \
+    /* Stage3 */ \
+    { \
+      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
+      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
+      \
+      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
+                               stg3_0, stg3_1,  \
+                               stp2_4, stp2_7) \
+      \
+      stp1_9  =  stp1_8_0; \
+      stp1_10 =  stp1_11;  \
+      \
+      stp1_13 = stp1_12_0; \
+      stp1_14 = stp1_15;   \
+    } \
+    \
+    /* Stage4 */ \
+    { \
+      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
+      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
+      \
+      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+      \
+      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
+                               stg4_0, stg4_1, \
+                               stp1_0, stp1_1) \
+      stp2_5 = stp2_4; \
+      stp2_6 = stp2_7; \
+      \
+      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+                             stg4_4, stg4_5, stg4_6, stg4_7, \
+                             stp2_9, stp2_14, stp2_10, stp2_13) \
+    } \
+      \
+    /* Stage5 */ \
+    { \
+      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+      \
+      stp1_2 = stp1_1; \
+      stp1_3 = stp1_0; \
+      \
+      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      \
+      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+      \
+      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
+      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
+      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
+      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+      \
+      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
+      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
+      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+    } \
+      \
+    /* Stage6 */ \
+    { \
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+      \
+      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+      \
+      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                             stg6_0, stg4_0, stg6_0, stg4_0, \
+                             stp2_10, stp2_13, stp2_11, stp2_12) \
+    }
+
+// Full 16x16 inverse DCT whose result is added to the 16x16 block at dest
+// (rows are `stride` bytes apart).
+//
+// Pass 1 runs twice, each time consuming 128 coefficients from input
+// (aligned loads), applying the IDCT16 stages plus the final butterfly, and
+// storing 16 vectors into l (first iteration) or r (second iteration).
+// Pass 2 runs twice, each time transposing 8 vectors from l and 8 from r,
+// applying the same 1-D transform, rounding ((x + 32) >> 6) and adding the
+// 16 result rows to an 8-byte-wide strip of dest.
+void vp10_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
+                                int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // 1 << 5 is the rounding offset for the final arithmetic shift right by 6.
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  // Not referenced directly below; presumably used by RECON_AND_STORE by
+  // name -- confirm against the macro definition.
+  const __m128i zero = _mm_setzero_si128();
+
+  // Coefficient pairs referenced by name inside the IDCT16 macro.
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  // l and r hold the pass-1 results of the first and second iteration;
+  // curr1 points at whichever of the two is being filled.
+  __m128i in[16], l[16], r[16], *curr1;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_8_0, stp1_12_0;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+
+  curr1 = l;
+  for (i = 0; i < 2; i++) {
+    // 1-D vp10_idct
+
+    // Load input data.
+    // Consecutive 8-coefficient groups alternate between in[k] and
+    // in[k + 8]: even-numbered groups fill in[0..7], odd-numbered groups
+    // fill in[8..15].
+    in[0] = _mm_load_si128((const __m128i *)input);
+    in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+    in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+    in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+    in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+    in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+    in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+    in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+    in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
+    in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
+    in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
+    in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
+    in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
+    in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
+    in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
+    in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
+
+    array_transpose_8x8(in, in);
+    array_transpose_8x8(in + 8, in + 8);
+
+    IDCT16
+
+    // Stage7
+    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
+    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
+    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
+    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
+    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
+    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
+    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
+    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
+    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    // Second iteration writes to r and reads the next 128 coefficients.
+    curr1 = r;
+    input += 128;
+  }
+  for (i = 0; i < 2; i++) {
+    int j;
+    // 1-D vp10_idct
+    array_transpose_8x8(l + i * 8, in);
+    array_transpose_8x8(r + i * 8, in + 8);
+
+    IDCT16
+
+    // 2-D
+    in[0] = _mm_add_epi16(stp2_0, stp1_15);
+    in[1] = _mm_add_epi16(stp2_1, stp1_14);
+    in[2] = _mm_add_epi16(stp2_2, stp2_13);
+    in[3] = _mm_add_epi16(stp2_3, stp2_12);
+    in[4] = _mm_add_epi16(stp2_4, stp2_11);
+    in[5] = _mm_add_epi16(stp2_5, stp2_10);
+    in[6] = _mm_add_epi16(stp2_6, stp1_9);
+    in[7] = _mm_add_epi16(stp2_7, stp1_8);
+    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    for (j = 0; j < 16; ++j) {
+      // Final rounding and shift
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    // Move on to the next 8-byte-wide strip of the destination block.
+    dest += 8;
+  }
+}
+
+// DC-only 16x16 inverse DCT: derives one value from input[0] and adds it to
+// all 16 rows at dest, processed as two strips that are 8 bytes apart.
+void vp10_idct16x16_1_add_sse2(const int16_t *input,
+                               uint8_t *dest,
+                               int stride) {
+  // `zero` is not referenced directly below; presumably RECON_AND_STORE
+  // expands to code that uses it by name -- confirm against the macro.
+  const __m128i zero = _mm_setzero_si128();
+  __m128i dc_value;
+  int strip, row;
+  int a = dct_const_round_shift(input[0] * cospi_16_64);
+
+  // Second cospi_16_64 multiply, then the final rounding shift by 6 bits.
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 6);
+
+  // Broadcast the DC term to all eight 16-bit lanes.
+  dc_value = _mm_set1_epi16(a);
+
+  for (strip = 0; strip < 2; ++strip) {
+    for (row = 0; row < 16; ++row) {
+      RECON_AND_STORE(dest + row * stride, dc_value);
+    }
+    dest += 8;
+  }
+}
+
+static void vp10_iadst16_8col(__m128i *in) {
+  // Perform a 16x16 1-D ADST (the inverse transform, going by the function name) on 8 columns, in place: in[0..15] are read as vectors of 16-bit lanes and overwritten with the result.
+  __m128i s[16], x[16], u[32], v[32];  // s/x hold packed 16-bit stage values; u/v are scratch reused by every stage
+  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);  // each constant is a 16-bit coefficient pair fed to _mm_madd_epi16; pair_set_epi16 presumably repeats the pair across the register -- confirm
+  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);  // same coefficient in every 16-bit lane
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);  // added before every >> DCT_CONST_BITS below
+  const __m128i kZero = _mm_set1_epi16(0);  // zero vector, used to negate four outputs at the end
+
+  u[0] = _mm_unpacklo_epi16(in[15], in[0]);  // stage 1 (unlabelled in the original): interleave input pairs (15,0), (13,2), ..., (1,14); lo/hi cover lanes 0-3 / 4-7
+  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);  // per 32-bit lane: first * c0 + second * c1 of each interleaved pair
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+  u[0] = _mm_add_epi32(v[0], v[16]);  // butterfly in 32 bits: sums v[i] + v[i + 16] ...
+  u[1] = _mm_add_epi32(v[1], v[17]);
+  u[2] = _mm_add_epi32(v[2], v[18]);
+  u[3] = _mm_add_epi32(v[3], v[19]);
+  u[4] = _mm_add_epi32(v[4], v[20]);
+  u[5] = _mm_add_epi32(v[5], v[21]);
+  u[6] = _mm_add_epi32(v[6], v[22]);
+  u[7] = _mm_add_epi32(v[7], v[23]);
+  u[8] = _mm_add_epi32(v[8], v[24]);
+  u[9] = _mm_add_epi32(v[9], v[25]);
+  u[10] = _mm_add_epi32(v[10], v[26]);
+  u[11] = _mm_add_epi32(v[11], v[27]);
+  u[12] = _mm_add_epi32(v[12], v[28]);
+  u[13] = _mm_add_epi32(v[13], v[29]);
+  u[14] = _mm_add_epi32(v[14], v[30]);
+  u[15] = _mm_add_epi32(v[15], v[31]);
+  u[16] = _mm_sub_epi32(v[0], v[16]);  // ... and differences v[i] - v[i + 16]
+  u[17] = _mm_sub_epi32(v[1], v[17]);
+  u[18] = _mm_sub_epi32(v[2], v[18]);
+  u[19] = _mm_sub_epi32(v[3], v[19]);
+  u[20] = _mm_sub_epi32(v[4], v[20]);
+  u[21] = _mm_sub_epi32(v[5], v[21]);
+  u[22] = _mm_sub_epi32(v[6], v[22]);
+  u[23] = _mm_sub_epi32(v[7], v[23]);
+  u[24] = _mm_sub_epi32(v[8], v[24]);
+  u[25] = _mm_sub_epi32(v[9], v[25]);
+  u[26] = _mm_sub_epi32(v[10], v[26]);
+  u[27] = _mm_sub_epi32(v[11], v[27]);
+  u[28] = _mm_sub_epi32(v[12], v[28]);
+  u[29] = _mm_sub_epi32(v[13], v[29]);
+  u[30] = _mm_sub_epi32(v[14], v[30]);
+  u[31] = _mm_sub_epi32(v[15], v[31]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);  // round: add the rounding constant ...
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);  // ... then arithmetic shift right by DCT_CONST_BITS
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);  // narrow back to 16 bits with signed saturation; each s[i] rejoins lanes 0-3 and 4-7
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_packs_epi32(u[8], u[9]);
+  s[5] = _mm_packs_epi32(u[10], u[11]);
+  s[6] = _mm_packs_epi32(u[12], u[13]);
+  s[7] = _mm_packs_epi32(u[14], u[15]);
+  s[8] = _mm_packs_epi32(u[16], u[17]);
+  s[9] = _mm_packs_epi32(u[18], u[19]);
+  s[10] = _mm_packs_epi32(u[20], u[21]);
+  s[11] = _mm_packs_epi32(u[22], u[23]);
+  s[12] = _mm_packs_epi32(u[24], u[25]);
+  s[13] = _mm_packs_epi32(u[26], u[27]);
+  s[14] = _mm_packs_epi32(u[28], u[29]);
+  s[15] = _mm_packs_epi32(u[30], u[31]);
+
+  // stage 2: only s[8..15] go through the multiply-add; s[0..7] get plain add/sub butterflies further down
+  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], v[8]);  // 32-bit butterflies: v[i] + v[i + 8], then v[i] - v[i + 8]
+  u[1] = _mm_add_epi32(v[1], v[9]);
+  u[2] = _mm_add_epi32(v[2], v[10]);
+  u[3] = _mm_add_epi32(v[3], v[11]);
+  u[4] = _mm_add_epi32(v[4], v[12]);
+  u[5] = _mm_add_epi32(v[5], v[13]);
+  u[6] = _mm_add_epi32(v[6], v[14]);
+  u[7] = _mm_add_epi32(v[7], v[15]);
+  u[8] = _mm_sub_epi32(v[0], v[8]);
+  u[9] = _mm_sub_epi32(v[1], v[9]);
+  u[10] = _mm_sub_epi32(v[2], v[10]);
+  u[11] = _mm_sub_epi32(v[3], v[11]);
+  u[12] = _mm_sub_epi32(v[4], v[12]);
+  u[13] = _mm_sub_epi32(v[5], v[13]);
+  u[14] = _mm_sub_epi32(v[6], v[14]);
+  u[15] = _mm_sub_epi32(v[7], v[15]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);  // round and shift, same scheme as stage 1
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+  x[0] = _mm_add_epi16(s[0], s[4]);  // no multiply for s[0..7]: 16-bit add/sub (wrapping, not saturating)
+  x[1] = _mm_add_epi16(s[1], s[5]);
+  x[2] = _mm_add_epi16(s[2], s[6]);
+  x[3] = _mm_add_epi16(s[3], s[7]);
+  x[4] = _mm_sub_epi16(s[0], s[4]);
+  x[5] = _mm_sub_epi16(s[1], s[5]);
+  x[6] = _mm_sub_epi16(s[2], s[6]);
+  x[7] = _mm_sub_epi16(s[3], s[7]);
+  x[8] = _mm_packs_epi32(u[0], u[1]);  // narrow the multiplied half to 16 bits with signed saturation
+  x[9] = _mm_packs_epi32(u[2], u[3]);
+  x[10] = _mm_packs_epi32(u[4], u[5]);
+  x[11] = _mm_packs_epi32(u[6], u[7]);
+  x[12] = _mm_packs_epi32(u[8], u[9]);
+  x[13] = _mm_packs_epi32(u[10], u[11]);
+  x[14] = _mm_packs_epi32(u[12], u[13]);
+  x[15] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3: x[4..7] and x[12..15] go through the cospi_8/cospi_24 multiply-add; x[0..3] and x[8..11] are only added/subtracted
+  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], v[4]);  // 32-bit butterflies inside each group of eight: v[i] +/- v[i + 4]
+  u[1] = _mm_add_epi32(v[1], v[5]);
+  u[2] = _mm_add_epi32(v[2], v[6]);
+  u[3] = _mm_add_epi32(v[3], v[7]);
+  u[4] = _mm_sub_epi32(v[0], v[4]);
+  u[5] = _mm_sub_epi32(v[1], v[5]);
+  u[6] = _mm_sub_epi32(v[2], v[6]);
+  u[7] = _mm_sub_epi32(v[3], v[7]);
+  u[8] = _mm_add_epi32(v[8], v[12]);
+  u[9] = _mm_add_epi32(v[9], v[13]);
+  u[10] = _mm_add_epi32(v[10], v[14]);
+  u[11] = _mm_add_epi32(v[11], v[15]);
+  u[12] = _mm_sub_epi32(v[8], v[12]);
+  u[13] = _mm_sub_epi32(v[9], v[13]);
+  u[14] = _mm_sub_epi32(v[10], v[14]);
+  u[15] = _mm_sub_epi32(v[11], v[15]);
+
+  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);  // from here on the rounding add is done in place in u[], and the shift lands in v[]
+  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_add_epi16(x[0], x[2]);  // s[] is rebuilt from x[]: wrapping 16-bit add/sub for the unmultiplied values, saturating packs for the multiplied ones
+  s[1] = _mm_add_epi16(x[1], x[3]);
+  s[2] = _mm_sub_epi16(x[0], x[2]);
+  s[3] = _mm_sub_epi16(x[1], x[3]);
+  s[4] = _mm_packs_epi32(v[0], v[1]);
+  s[5] = _mm_packs_epi32(v[2], v[3]);
+  s[6] = _mm_packs_epi32(v[4], v[5]);
+  s[7] = _mm_packs_epi32(v[6], v[7]);
+  s[8] = _mm_add_epi16(x[8], x[10]);
+  s[9] = _mm_add_epi16(x[9], x[11]);
+  s[10] = _mm_sub_epi16(x[8], x[10]);
+  s[11] = _mm_sub_epi16(x[9], x[11]);
+  s[12] = _mm_packs_epi32(v[8], v[9]);
+  s[13] = _mm_packs_epi32(v[10], v[11]);
+  s[14] = _mm_packs_epi32(v[12], v[13]);
+  s[15] = _mm_packs_epi32(v[14], v[15]);
+
+  // stage 4: the pairs (s[2],s[3]), (s[6],s[7]), (s[10],s[11]), (s[14],s[15]) are combined with +/-cospi_16_64 weights; the other s[] values pass through to the output unchanged or negated
+  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);  // no butterfly in this stage: round and shift the products directly
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  in[0] = s[0];  // store the outputs in their final order; in[1], in[3], in[13] and in[15] are negated as 0 - s (wrapping subtract)
+  in[1] = _mm_sub_epi16(kZero, s[8]);
+  in[2] = s[12];
+  in[3] = _mm_sub_epi16(kZero, s[4]);
+  in[4] = _mm_packs_epi32(v[4], v[5]);
+  in[5] = _mm_packs_epi32(v[12], v[13]);
+  in[6] = _mm_packs_epi32(v[8], v[9]);
+  in[7] = _mm_packs_epi32(v[0], v[1]);
+  in[8] = _mm_packs_epi32(v[2], v[3]);
+  in[9] = _mm_packs_epi32(v[10], v[11]);
+  in[10] = _mm_packs_epi32(v[14], v[15]);
+  in[11] = _mm_packs_epi32(v[6], v[7]);
+  in[12] = s[5];
+  in[13] = _mm_sub_epi16(kZero, s[13]);
+  in[14] = s[9];
+  in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+static void vp10_idct16_8col(__m128i *in) {
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i v[16], u[16], s[16], t[16];
+
+  // stage 1
+  s[0] = in[0];
+  s[1] = in[8];
+  s[2] = in[4];
+  s[3] = in[12];
+  s[4] = in[2];
+  s[5] = in[10];
+  s[6] = in[6];
+  s[7] = in[14];
+  s[8] = in[1];
+  s[9] = in[9];
+  s[10] = in[5];
+  s[11] = in[13];
+  s[12] = in[3];
+  s[13] = in[11];
+  s[14] = in[7];
+  s[15] = in[15];
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
+  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
+  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
+  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[8]  = _mm_packs_epi32(u[0], u[1]);
+  s[15] = _mm_packs_epi32(u[2], u[3]);
+  s[9]  = _mm_packs_epi32(u[4], u[5]);
+  s[14] = _mm_packs_epi32(u[6], u[7]);
+  s[10] = _mm_packs_epi32(u[8], u[9]);
+  s[13] = _mm_packs_epi32(u[10], u[11]);
+  s[11] = _mm_packs_epi32(u[12], u[13]);
+  s[12] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  t[0] = s[0];
+  t[1] = s[1];
+  t[2] = s[2];
+  t[3] = s[3];
+  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
+  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
+  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
+  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[4] = _mm_packs_epi32(u[0], u[1]);
+  t[7] = _mm_packs_epi32(u[2], u[3]);
+  t[5] = _mm_packs_epi32(u[4], u[5]);
+  t[6] = _mm_packs_epi32(u[6], u[7]);
+  t[8] = _mm_add_epi16(s[8], s[9]);
+  t[9] = _mm_sub_epi16(s[8], s[9]);
+  t[10] = _mm_sub_epi16(s[11], s[10]);
+  t[11] = _mm_add_epi16(s[10], s[11]);
+  t[12] = _mm_add_epi16(s[12], s[13]);
+  t[13] = _mm_sub_epi16(s[12], s[13]);
+  t[14] = _mm_sub_epi16(s[15], s[14]);
+  t[15] = _mm_add_epi16(s[14], s[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
+  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
+  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
+  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
+  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
+  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
+  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
+  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_add_epi16(t[4], t[5]);
+  s[5] = _mm_sub_epi16(t[4], t[5]);
+  s[6] = _mm_sub_epi16(t[7], t[6]);
+  s[7] = _mm_add_epi16(t[6], t[7]);
+  s[8] = t[8];
+  s[15] = t[15];
+  s[9]  = _mm_packs_epi32(u[8], u[9]);
+  s[14] = _mm_packs_epi32(u[10], u[11]);
+  s[10] = _mm_packs_epi32(u[12], u[13]);
+  s[13] = _mm_packs_epi32(u[14], u[15]);
+  s[11] = t[11];
+  s[12] = t[12];
+
+  // stage 5
+  t[0] = _mm_add_epi16(s[0], s[3]);
+  t[1] = _mm_add_epi16(s[1], s[2]);
+  t[2] = _mm_sub_epi16(s[1], s[2]);
+  t[3] = _mm_sub_epi16(s[0], s[3]);
+  t[4] = s[4];
+  t[7] = s[7];
+
+  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
+  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  t[5] = _mm_packs_epi32(u[0], u[1]);
+  t[6] = _mm_packs_epi32(u[2], u[3]);
+
+  t[8] = _mm_add_epi16(s[8], s[11]);
+  t[9] = _mm_add_epi16(s[9], s[10]);
+  t[10] = _mm_sub_epi16(s[9], s[10]);
+  t[11] = _mm_sub_epi16(s[8], s[11]);
+  t[12] = _mm_sub_epi16(s[15], s[12]);
+  t[13] = _mm_sub_epi16(s[14], s[13]);
+  t[14] = _mm_add_epi16(s[13], s[14]);
+  t[15] = _mm_add_epi16(s[12], s[15]);
+
+  // stage 6
+  s[0] = _mm_add_epi16(t[0], t[7]);
+  s[1] = _mm_add_epi16(t[1], t[6]);
+  s[2] = _mm_add_epi16(t[2], t[5]);
+  s[3] = _mm_add_epi16(t[3], t[4]);
+  s[4] = _mm_sub_epi16(t[3], t[4]);
+  s[5] = _mm_sub_epi16(t[2], t[5]);
+  s[6] = _mm_sub_epi16(t[1], t[6]);
+  s[7] = _mm_sub_epi16(t[0], t[7]);
+  s[8] = t[8];
+  s[9] = t[9];
+
+  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
+  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
+  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
+  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  s[10] = _mm_packs_epi32(u[0], u[1]);
+  s[13] = _mm_packs_epi32(u[2], u[3]);
+  s[11] = _mm_packs_epi32(u[4], u[5]);
+  s[12] = _mm_packs_epi32(u[6], u[7]);
+  s[14] = t[14];
+  s[15] = t[15];
+
+  // stage 7
+  in[0] = _mm_add_epi16(s[0], s[15]);
+  in[1] = _mm_add_epi16(s[1], s[14]);
+  in[2] = _mm_add_epi16(s[2], s[13]);
+  in[3] = _mm_add_epi16(s[3], s[12]);
+  in[4] = _mm_add_epi16(s[4], s[11]);
+  in[5] = _mm_add_epi16(s[5], s[10]);
+  in[6] = _mm_add_epi16(s[6], s[9]);
+  in[7] = _mm_add_epi16(s[7], s[8]);
+  in[8] = _mm_sub_epi16(s[7], s[8]);
+  in[9] = _mm_sub_epi16(s[6], s[9]);
+  in[10] = _mm_sub_epi16(s[5], s[10]);
+  in[11] = _mm_sub_epi16(s[4], s[11]);
+  in[12] = _mm_sub_epi16(s[3], s[12]);
+  in[13] = _mm_sub_epi16(s[2], s[13]);
+  in[14] = _mm_sub_epi16(s[1], s[14]);
+  in[15] = _mm_sub_epi16(s[0], s[15]);
+}
+
+void vp10_idct16_sse2(__m128i *in0, __m128i *in1) {  // both arrays are updated
+  array_transpose_16x16(in0, in1);  // presumably a 16x16 transpose over both
+  vp10_idct16_8col(in0);  // 1-D idct16 helper, run on each array after that
+  vp10_idct16_8col(in1);  // NOTE(review): 16 vectors per array — confirm
+}
+
+void vp10_iadst16_sse2(__m128i *in0, __m128i *in1) {  // both arrays are updated
+  array_transpose_16x16(in0, in1);  // presumably a 16x16 transpose over both
+  vp10_iadst16_8col(in0);  // 1-D iadst16 helper, run on each array after that
+  vp10_iadst16_8col(in1);  // NOTE(review): 16 vectors per array — confirm
+}
+
+void vp10_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
+                               int stride) {  // sparse-input 16x16 inverse DCT
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);  // pre-shift bias
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);  // bias for final >> 6
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);  // stgN_*:
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);  // weight pairs
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);  // fed to
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);  // madd_epi16
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  __m128i in[16], l[16];  // in: working vectors; l: first-pass output
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_8_0, stp1_12_0;  // some are only touched inside IDCT16_10
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;  // 32-bit scratch
+  int i;
+  // First 1-D inverse DCT. Only four vectors are read (offsets 0/16/32/48).
+  // Load input data; _mm_load_si128 needs 16-byte aligned addresses.
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+
+  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);  // result: in[0..1]
+
+  // Stage2: weight the high halves of in[0]/in[1] (paired with zero) by stg2_*.
+  {
+    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
+    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
+
+    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
+    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
+    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
+    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp5 = _mm_add_epi32(tmp5, rounding);
+    tmp7 = _mm_add_epi32(tmp7, rounding);
+
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+    stp2_8  = _mm_packs_epi32(tmp0, tmp2);  // tmp0 -> low 64 bits, tmp2 -> high
+    stp2_11 = _mm_packs_epi32(tmp5, tmp7);  // tmp5 -> low 64 bits, tmp7 -> high
+  }
+
+  // Stage3: weight the low half of in[1] by stg3_*; peel off Stage2 high halves.
+  {
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
+
+    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
+    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);  // high 64 bits of stp2_11
+    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);  // high 64 bits of stp2_8
+
+    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+  }
+
+  // Stage4: weight low half of in[0] by stg4_0/1; 9/14 and 10/13 by stg4_4..7.
+  {
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
+
+    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
+    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
+    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
+    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
+    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
+    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp5 = _mm_add_epi32(tmp5, rounding);
+    tmp7 = _mm_add_epi32(tmp7, rounding);
+
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+    stp1_0 = _mm_packs_epi32(tmp0, tmp0);  // same four values in both halves
+    stp1_1 = _mm_packs_epi32(tmp2, tmp2);  // same four values in both halves
+    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
+    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
+
+    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);  // high 64 bits of stp1_4
+  }
+
+  // Stage5 and Stage6: butterflies on stp2_8..11, then split 64-bit halves.
+  {
+    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
+    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
+    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
+    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
+
+    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);  // low halves -> terms 8..11
+    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
+    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
+    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
+
+    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);  // high halves -> terms 12..15
+    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
+    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
+    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
+  }
+
+  // Stage6: weight the 6/5, 10/13, 11/12 pairs, then butterfly into stp2_0..7.
+  {
+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+
+    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
+    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
+    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
+    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
+    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
+    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
+
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp1_6 = _mm_packs_epi32(tmp3, tmp1);  // tmp3 -> low half, tmp1 -> high
+
+    stp2_10 = _mm_packs_epi32(tmp0, zero);  // high 64 bits are zero
+    stp2_13 = _mm_packs_epi32(tmp2, zero);
+    stp2_11 = _mm_packs_epi32(tmp4, zero);
+    stp2_12 = _mm_packs_epi32(tmp6, zero);
+
+    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
+    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
+    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
+    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
+
+    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);  // each stp2_k keeps one 64-bit
+    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);  // half of a sum or difference
+    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
+    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
+    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
+    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
+    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
+    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
+  }
+
+  // Stage7. Left 8x16 only. l[k] and l[15 - k] are a sum/difference pair.
+  l[0] = _mm_add_epi16(stp2_0, stp1_15);
+  l[1] = _mm_add_epi16(stp2_1, stp1_14);
+  l[2] = _mm_add_epi16(stp2_2, stp2_13);
+  l[3] = _mm_add_epi16(stp2_3, stp2_12);
+  l[4] = _mm_add_epi16(stp2_4, stp2_11);
+  l[5] = _mm_add_epi16(stp2_5, stp2_10);
+  l[6] = _mm_add_epi16(stp2_6, stp1_9);
+  l[7] = _mm_add_epi16(stp2_7, stp1_8);
+  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+  // Second 1-D inverse transform, performed per 8x16 block (two passes).
+  for (i = 0; i < 2; i++) {
+    int j;
+    array_transpose_4X8(l + 8 * i, in);  // pass i consumes l[8 * i ...]
+
+    IDCT16_10  // macro defined elsewhere; presumably fills stp1_*/stp2_* from in
+
+    // Stage7: in[k] and in[15 - k] are a sum/difference pair.
+    in[0] = _mm_add_epi16(stp2_0, stp1_15);
+    in[1] = _mm_add_epi16(stp2_1, stp1_14);
+    in[2] = _mm_add_epi16(stp2_2, stp2_13);
+    in[3] = _mm_add_epi16(stp2_3, stp2_12);
+    in[4] = _mm_add_epi16(stp2_4, stp2_11);
+    in[5] = _mm_add_epi16(stp2_5, stp2_10);
+    in[6] = _mm_add_epi16(stp2_6, stp1_9);
+    in[7] = _mm_add_epi16(stp2_7, stp1_8);
+    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    for (j = 0; j < 16; ++j) {
+      // Final rounding and shift: saturating add of 32, then arithmetic >> 6.
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);  // row j of this pass
+    }
+
+    dest += 8;  // second pass starts 8 bytes further along each row
+  }
+}
+
+#define LOAD_DQCOEFF(reg, input) /* aligned 128-bit load, then input += 8 */ \
+  {  \
+    (reg) = _mm_load_si128((const __m128i *)(input)); \
+    (input) += 8; /* input must be a modifiable pointer lvalue */ \
+  }
+
+#define IDCT32_34 /* needs in[], stg*, stp1_*, stp2_*, tmp0-3, rounding */ \
+/* Stage1: reads only in[0..7]; odd inputs are paired with zero partners */ \
+{ \
+  const __m128i zero = _mm_setzero_si128(); \
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
+  \
+  const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
+  \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
+  \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
+                         stg1_1, stp1_16, stp1_31); \
+  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
+                         stg1_7, stp1_19, stp1_28); \
+  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
+                         stg1_9, stp1_20, stp1_27); \
+  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
+                         stg1_15, stp1_23, stp1_24); \
+} \
+\
+/* Stage2: in[2]/in[6] vs. zero; Stage1 outputs are copied into stp2_* */ \
+{ \
+  const __m128i zero = _mm_setzero_si128(); \
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
+  \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
+                         stg2_1, stp2_8, stp2_15); \
+  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
+                         stg2_7, stp2_11, stp2_12); \
+  \
+  stp2_16 = stp1_16; \
+  stp2_19 = stp1_19; \
+  \
+  stp2_20 = stp1_20; \
+  stp2_23 = stp1_23; \
+  \
+  stp2_24 = stp1_24; \
+  stp2_27 = stp1_27; \
+  \
+  stp2_28 = stp1_28; \
+  stp2_31 = stp1_31; \
+} \
+\
+/* Stage3: in[4] vs. zero; the 16..31 pairs are built from stp1_* only */ \
+{ \
+  const __m128i zero = _mm_setzero_si128(); \
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
+  \
+  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
+  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
+                         stg3_1, stp1_4, stp1_7); \
+  \
+  stp1_8 = stp2_8; \
+  stp1_11 = stp2_11; \
+  stp1_12 = stp2_12; \
+  stp1_15 = stp2_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+                         stp1_18, stp1_29) \
+  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+                         stp1_22, stp1_25) \
+  \
+  stp1_16 = stp2_16; \
+  stp1_31 = stp2_31; \
+  stp1_19 = stp2_19; \
+  stp1_20 = stp2_20; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_27 = stp2_27; \
+  stp1_28 = stp2_28; \
+} \
+\
+/* Stage4: in[0] vs. zero; 4/5 and 6/7 are plain copies of stp1_4, stp1_7 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128(); \
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
+  \
+  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
+  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
+                         stg4_1, stp2_0, stp2_1); \
+  \
+  stp2_4 = stp1_4; \
+  stp2_5 = stp1_4; \
+  stp2_6 = stp1_7; \
+  stp2_7 = stp1_7; \
+  \
+  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+                         stp2_10, stp2_13) \
+  \
+  stp2_8 = stp1_8; \
+  stp2_15 = stp1_15; \
+  stp2_11 = stp1_11; \
+  stp2_12 = stp1_12; \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5: 0..3 are copies of stp2_0/stp2_1; the 6/5 pair uses tmp0..tmp3 */ \
+{ \
+  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  stp1_0 = stp2_0; \
+  stp1_1 = stp2_1; \
+  stp1_2 = stp2_1; \
+  stp1_3 = stp2_0; \
+  \
+  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+  \
+  tmp0 = _mm_add_epi32(tmp0, rounding); \
+  tmp1 = _mm_add_epi32(tmp1, rounding); \
+  tmp2 = _mm_add_epi32(tmp2, rounding); \
+  tmp3 = _mm_add_epi32(tmp3, rounding); \
+  \
+  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+  \
+  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  \
+  stp1_4 = stp2_4; \
+  stp1_7 = stp2_7; \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  \
+  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  \
+  stp1_22 = stp2_22; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_25 = stp2_25; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+} \
+\
+/* Stage6: butterflies on 0..7 and 16..31; 10/13 and 11/12 are re-weighted */ \
+{ \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+  \
+  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+  \
+  stp2_8 = stp1_8; \
+  stp2_9 = stp1_9; \
+  stp2_14 = stp1_14; \
+  stp2_15 = stp1_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+                         stp2_13, stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+  \
+  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7: outputs of this macro are left in stp1_0 .. stp1_31 */ \
+{ \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  stp1_18 = stp2_18; \
+  stp1_19 = stp2_19; \
+  \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+  \
+  stp1_28 = stp2_28; \
+  stp1_29 = stp2_29; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+}
+
+
+#define IDCT32 \
+/* Stage1 */ \
+{ \
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
+  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
+  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
+  \
+  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
+  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
+  const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
+  \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
+  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
+  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
+  \
+  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
+  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
+  \
+  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
+                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
+                         stp1_17, stp1_30) \
+  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
+                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
+                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
+                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+} \
+\
+/* Stage2 */ \
+{ \
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
+  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
+  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
+  \
+  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
+  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
+  \
+  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
+                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
+                         stp2_14) \
+  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
+                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
+                         stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
+  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
+  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
+  \
+  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
+  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
+  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
+  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
+  \
+  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
+  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
+  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
+} \
+\
+/* Stage3 */ \
+{ \
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
+  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
+  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
+  \
+  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
+  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  \
+  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
+                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
+                         stp1_6) \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
+  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
+  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+  \
+  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+                         stp1_18, stp1_29) \
+  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+                         stp1_22, stp1_25) \
+  \
+  stp1_16 = stp2_16; \
+  stp1_31 = stp2_31; \
+  stp1_19 = stp2_19; \
+  stp1_20 = stp2_20; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_27 = stp2_27; \
+  stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
+  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
+  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
+  \
+  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  \
+  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
+                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
+                         stp2_2, stp2_3) \
+  \
+  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+  \
+  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+                         stp2_10, stp2_13) \
+  \
+  stp2_8 = stp1_8; \
+  stp2_15 = stp1_15; \
+  stp2_11 = stp1_11; \
+  stp2_12 = stp1_12; \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+  \
+  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+  \
+  tmp0 = _mm_add_epi32(tmp0, rounding); \
+  tmp1 = _mm_add_epi32(tmp1, rounding); \
+  tmp2 = _mm_add_epi32(tmp2, rounding); \
+  tmp3 = _mm_add_epi32(tmp3, rounding); \
+  \
+  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+  \
+  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  \
+  stp1_4 = stp2_4; \
+  stp1_7 = stp2_7; \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  \
+  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  \
+  stp1_22 = stp2_22; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_25 = stp2_25; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+  \
+  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+  \
+  stp2_8 = stp1_8; \
+  stp2_9 = stp1_9; \
+  stp2_14 = stp1_14; \
+  stp2_15 = stp1_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+                         stp2_13, stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+  \
+  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  stp1_18 = stp2_18; \
+  stp1_19 = stp2_19; \
+  \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+  \
+  stp1_28 = stp2_28; \
+  stp1_29 = stp2_29; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+}
+
+// 32x32 inverse DCT for input whose only non-zero coeffs are the top-left 8x8.
+void vp10_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
+                               int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1<<5);  // 32 = half of 1 << 6
+
+  // Constant pairs for each stage, referenced by name from the IDCT32_34 macro.
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[32], col[32];  // in: 1-D input; col: first-pass output
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+          stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;  // macro scratch
+  int i;
+
+  // Load only the top-left 8x8: 8 coeffs from each of the first 8 rows of 32.
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[1] = _mm_load_si128((const __m128i *)(input + 32));
+  in[2] = _mm_load_si128((const __m128i *)(input + 64));
+  in[3] = _mm_load_si128((const __m128i *)(input + 96));
+  in[4] = _mm_load_si128((const __m128i *)(input + 128));
+  in[5] = _mm_load_si128((const __m128i *)(input + 160));
+  in[6] = _mm_load_si128((const __m128i *)(input + 192));
+  in[7] = _mm_load_si128((const __m128i *)(input + 224));
+
+  for (i = 8; i < 32; ++i) {  // the remaining 24 registers are zeroed
+    in[i] = _mm_setzero_si128();
+  }
+
+  array_transpose_8x8(in, in);
+  // TODO(hkuang): The following transposes are unnecessary, but removing them
+  // leads to a performance drop on some devices.
+  array_transpose_8x8(in + 8, in + 8);
+  array_transpose_8x8(in + 16, in + 16);
+  array_transpose_8x8(in + 24, in + 24);
+
+  IDCT32_34  // first pass; its last stage writes stp1_0..stp1_31
+
+  // 1_D: Final add/sub butterfly; store the 32 intermediate results in col[].
+  col[0] = _mm_add_epi16(stp1_0, stp1_31);
+  col[1] = _mm_add_epi16(stp1_1, stp1_30);
+  col[2] = _mm_add_epi16(stp1_2, stp1_29);
+  col[3] = _mm_add_epi16(stp1_3, stp1_28);
+  col[4] = _mm_add_epi16(stp1_4, stp1_27);
+  col[5] = _mm_add_epi16(stp1_5, stp1_26);
+  col[6] = _mm_add_epi16(stp1_6, stp1_25);
+  col[7] = _mm_add_epi16(stp1_7, stp1_24);
+  col[8] = _mm_add_epi16(stp1_8, stp1_23);
+  col[9] = _mm_add_epi16(stp1_9, stp1_22);
+  col[10] = _mm_add_epi16(stp1_10, stp1_21);
+  col[11] = _mm_add_epi16(stp1_11, stp1_20);
+  col[12] = _mm_add_epi16(stp1_12, stp1_19);
+  col[13] = _mm_add_epi16(stp1_13, stp1_18);
+  col[14] = _mm_add_epi16(stp1_14, stp1_17);
+  col[15] = _mm_add_epi16(stp1_15, stp1_16);
+  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
+  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
+  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
+  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
+  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
+  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
+  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
+  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
+  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
+  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
+  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
+  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
+  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
+  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
+  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
+  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
+  for (i = 0; i < 4; i++) {  // dest advances by 8 per pass
+    int j;
+    const __m128i zero = _mm_setzero_si128();  // likely for RECON_AND_STORE
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(col + i * 8, in);
+    IDCT32_34  // second pass over the transposed first-pass output
+
+    // 2_D: Calculate the results and store them to destination.
+    in[0] = _mm_add_epi16(stp1_0, stp1_31);
+    in[1] = _mm_add_epi16(stp1_1, stp1_30);
+    in[2] = _mm_add_epi16(stp1_2, stp1_29);
+    in[3] = _mm_add_epi16(stp1_3, stp1_28);
+    in[4] = _mm_add_epi16(stp1_4, stp1_27);
+    in[5] = _mm_add_epi16(stp1_5, stp1_26);
+    in[6] = _mm_add_epi16(stp1_6, stp1_25);
+    in[7] = _mm_add_epi16(stp1_7, stp1_24);
+    in[8] = _mm_add_epi16(stp1_8, stp1_23);
+    in[9] = _mm_add_epi16(stp1_9, stp1_22);
+    in[10] = _mm_add_epi16(stp1_10, stp1_21);
+    in[11] = _mm_add_epi16(stp1_11, stp1_20);
+    in[12] = _mm_add_epi16(stp1_12, stp1_19);
+    in[13] = _mm_add_epi16(stp1_13, stp1_18);
+    in[14] = _mm_add_epi16(stp1_14, stp1_17);
+    in[15] = _mm_add_epi16(stp1_15, stp1_16);
+    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+    for (j = 0; j < 32; ++j) {
+      // Final rounding and shift: saturating add of 32, then >> 6
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
+
+// 32x32 inverse DCT over the whole coefficient block (two 1-D IDCT32 passes).
+void vp10_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
+                                 int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);  // half of 1 << 6
+  const __m128i zero = _mm_setzero_si128();
+  // vp10_idct constants for each stage
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[32], col[128], zero_idx[16];  // col: 4 x 32 first-pass results
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+          stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i, j, i32;
+
+  for (i = 0; i < 4; i++) {
+    i32 = (i << 5);  // base index of this pass's 32 results in col[]
+    // First 1-D vp10_idct
+    // Load input data (LOAD_DQCOEFF presumably advances input; TODO confirm).
+    LOAD_DQCOEFF(in[0], input);
+    LOAD_DQCOEFF(in[8], input);
+    LOAD_DQCOEFF(in[16], input);
+    LOAD_DQCOEFF(in[24], input);
+    LOAD_DQCOEFF(in[1], input);
+    LOAD_DQCOEFF(in[9], input);
+    LOAD_DQCOEFF(in[17], input);
+    LOAD_DQCOEFF(in[25], input);
+    LOAD_DQCOEFF(in[2], input);
+    LOAD_DQCOEFF(in[10], input);
+    LOAD_DQCOEFF(in[18], input);
+    LOAD_DQCOEFF(in[26], input);
+    LOAD_DQCOEFF(in[3], input);
+    LOAD_DQCOEFF(in[11], input);
+    LOAD_DQCOEFF(in[19], input);
+    LOAD_DQCOEFF(in[27], input);
+
+    LOAD_DQCOEFF(in[4], input);
+    LOAD_DQCOEFF(in[12], input);
+    LOAD_DQCOEFF(in[20], input);
+    LOAD_DQCOEFF(in[28], input);
+    LOAD_DQCOEFF(in[5], input);
+    LOAD_DQCOEFF(in[13], input);
+    LOAD_DQCOEFF(in[21], input);
+    LOAD_DQCOEFF(in[29], input);
+    LOAD_DQCOEFF(in[6], input);
+    LOAD_DQCOEFF(in[14], input);
+    LOAD_DQCOEFF(in[22], input);
+    LOAD_DQCOEFF(in[30], input);
+    LOAD_DQCOEFF(in[7], input);
+    LOAD_DQCOEFF(in[15], input);
+    LOAD_DQCOEFF(in[23], input);
+    LOAD_DQCOEFF(in[31], input);
+
+    // OR all 32 registers together to check whether every entry is zero
+    zero_idx[0] = _mm_or_si128(in[0], in[1]);
+    zero_idx[1] = _mm_or_si128(in[2], in[3]);
+    zero_idx[2] = _mm_or_si128(in[4], in[5]);
+    zero_idx[3] = _mm_or_si128(in[6], in[7]);
+    zero_idx[4] = _mm_or_si128(in[8], in[9]);
+    zero_idx[5] = _mm_or_si128(in[10], in[11]);
+    zero_idx[6] = _mm_or_si128(in[12], in[13]);
+    zero_idx[7] = _mm_or_si128(in[14], in[15]);
+    zero_idx[8] = _mm_or_si128(in[16], in[17]);
+    zero_idx[9] = _mm_or_si128(in[18], in[19]);
+    zero_idx[10] = _mm_or_si128(in[20], in[21]);
+    zero_idx[11] = _mm_or_si128(in[22], in[23]);
+    zero_idx[12] = _mm_or_si128(in[24], in[25]);
+    zero_idx[13] = _mm_or_si128(in[26], in[27]);
+    zero_idx[14] = _mm_or_si128(in[28], in[29]);
+    zero_idx[15] = _mm_or_si128(in[30], in[31]);
+
+    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+    // All loaded values are zero: write zeros to col[] and skip the transform.
+    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
+      col[i32 + 0] = _mm_setzero_si128();
+      col[i32 + 1] = _mm_setzero_si128();
+      col[i32 + 2] = _mm_setzero_si128();
+      col[i32 + 3] = _mm_setzero_si128();
+      col[i32 + 4] = _mm_setzero_si128();
+      col[i32 + 5] = _mm_setzero_si128();
+      col[i32 + 6] = _mm_setzero_si128();
+      col[i32 + 7] = _mm_setzero_si128();
+      col[i32 + 8] = _mm_setzero_si128();
+      col[i32 + 9] = _mm_setzero_si128();
+      col[i32 + 10] = _mm_setzero_si128();
+      col[i32 + 11] = _mm_setzero_si128();
+      col[i32 + 12] = _mm_setzero_si128();
+      col[i32 + 13] = _mm_setzero_si128();
+      col[i32 + 14] = _mm_setzero_si128();
+      col[i32 + 15] = _mm_setzero_si128();
+      col[i32 + 16] = _mm_setzero_si128();
+      col[i32 + 17] = _mm_setzero_si128();
+      col[i32 + 18] = _mm_setzero_si128();
+      col[i32 + 19] = _mm_setzero_si128();
+      col[i32 + 20] = _mm_setzero_si128();
+      col[i32 + 21] = _mm_setzero_si128();
+      col[i32 + 22] = _mm_setzero_si128();
+      col[i32 + 23] = _mm_setzero_si128();
+      col[i32 + 24] = _mm_setzero_si128();
+      col[i32 + 25] = _mm_setzero_si128();
+      col[i32 + 26] = _mm_setzero_si128();
+      col[i32 + 27] = _mm_setzero_si128();
+      col[i32 + 28] = _mm_setzero_si128();
+      col[i32 + 29] = _mm_setzero_si128();
+      col[i32 + 30] = _mm_setzero_si128();
+      col[i32 + 31] = _mm_setzero_si128();
+      continue;
+    }
+
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(in, in);
+    array_transpose_8x8(in + 8, in + 8);
+    array_transpose_8x8(in + 16, in + 16);
+    array_transpose_8x8(in + 24, in + 24);
+
+    IDCT32  // macro defined elsewhere; presumably fills stp1_0..stp1_31
+
+    // 1_D: Store 32 intermediate results for each 8x32 block.
+    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+  }
+  for (i = 0; i < 4; i++) {
+    // Second 1-D vp10_idct
+    j = i << 3;  // offset of this 8-register group within each 32-entry run
+
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(col + j, in);
+    array_transpose_8x8(col + j + 32, in + 8);
+    array_transpose_8x8(col + j + 64, in + 16);
+    array_transpose_8x8(col + j + 96, in + 24);
+
+    IDCT32
+
+    // 2_D: Calculate the results and store them to destination.
+    in[0] = _mm_add_epi16(stp1_0, stp1_31);
+    in[1] = _mm_add_epi16(stp1_1, stp1_30);
+    in[2] = _mm_add_epi16(stp1_2, stp1_29);
+    in[3] = _mm_add_epi16(stp1_3, stp1_28);
+    in[4] = _mm_add_epi16(stp1_4, stp1_27);
+    in[5] = _mm_add_epi16(stp1_5, stp1_26);
+    in[6] = _mm_add_epi16(stp1_6, stp1_25);
+    in[7] = _mm_add_epi16(stp1_7, stp1_24);
+    in[8] = _mm_add_epi16(stp1_8, stp1_23);
+    in[9] = _mm_add_epi16(stp1_9, stp1_22);
+    in[10] = _mm_add_epi16(stp1_10, stp1_21);
+    in[11] = _mm_add_epi16(stp1_11, stp1_20);
+    in[12] = _mm_add_epi16(stp1_12, stp1_19);
+    in[13] = _mm_add_epi16(stp1_13, stp1_18);
+    in[14] = _mm_add_epi16(stp1_14, stp1_17);
+    in[15] = _mm_add_epi16(stp1_15, stp1_16);
+    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+    for (j = 0; j < 32; ++j) {
+      // Final rounding and shift: saturating add of 32, then >> 6
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
+
+// DC-only 32x32 inverse transform: only input[0] is read, and the one value
+// derived from it is passed to RECON_AND_STORE for every row of the block.
+void vp10_idct32x32_1_add_sse2(const int16_t *input,
+                               uint8_t *dest,
+                               int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();  // likely for RECON_AND_STORE
+  int a, i;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);  // scale by cospi_16_64
+  a = dct_const_round_shift(a * cospi_16_64);  // ... and a second time
+  a = ROUND_POWER_OF_TWO(a, 6);
+  dc_value = _mm_set1_epi16(a);  // same value in all eight 16-bit lanes
+  for (i = 0; i < 4; ++i) {  // four passes, dest advanced by 8 each time
+    int j;
+    for (j = 0; j < 32; ++j) {
+      RECON_AND_STORE(dest + j * stride, dc_value);
+    }
+    dest += 8;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Clamps each signed 16-bit lane of value to [0, cap]; cap is built below.
+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
+  const __m128i all_zero = _mm_set1_epi16(0);
+  const __m128i all_one = _mm_set1_epi16(1);
+  // cap = (1 << bd) - 1, computed with 16-bit lane arithmetic.
+  const __m128i cap = _mm_subs_epi16(_mm_slli_epi16(all_one, bd), all_one);
+  const __m128i over = _mm_cmpgt_epi16(value, cap);  // lanes above cap
+  // Take cap in the lanes flagged by over and value in all the others.
+  const __m128i capped =
+      _mm_or_si128(_mm_andnot_si128(over, value), _mm_and_si128(over, cap));
+  return _mm_and_si128(capped, _mm_cmpgt_epi16(capped, all_zero));
+}
+
+// Adds the inverse 4x4 DCT of input to the 16-bit pixels behind dest8.
+// Each 1-D pass runs in 16-bit SSE2 when its inputs all lie within
+// +/-12043; a pass whose inputs exceed that uses vp10_highbd_idct4_c.
+//   input  - 16 coefficients, read four per row
+//   dest8  - destination, converted with CONVERT_TO_SHORTPTR
+//   stride - distance between destination rows, in 16-bit pixels
+//   bd     - bit depth forwarded to clamp_high_sse2 and the C helpers
+void vp10_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i eight = _mm_set1_epi16(8);
+  const __m128i upper = _mm_set1_epi16(12043);
+  const __m128i lower = _mm_set1_epi16(-12043);
+  tran_low_t out[4 * 4];  // row-pass output consumed by the C column pass
+  __m128i inptr[4];
+  __m128i sign_bits[2];
+  __m128i too_big, too_small;
+  int optimised_cols = 0;
+  int out_of_range;
+  int i, j;
+
+  // Fetch the coefficients, four 32-bit values per unaligned load.
+  inptr[0] = _mm_loadu_si128((const __m128i *)input);
+  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
+  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
+  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
+
+  // Narrow to 16 bits with signed saturation; two rows per register.
+  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
+  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
+
+  // Non-zero when any lane is above upper or below lower.
+  too_big = _mm_cmpgt_epi16(_mm_max_epi16(inptr[0], inptr[1]), upper);
+  too_small = _mm_cmplt_epi16(_mm_min_epi16(inptr[0], inptr[1]), lower);
+  out_of_range = _mm_movemask_epi8(_mm_or_si128(too_big, too_small));
+
+  if (out_of_range) {
+    // Row pass in C, one row of four coefficients per call.
+    for (i = 0; i < 4; ++i) {
+      vp10_highbd_idct4_c(input + 4 * i, out + 4 * i, bd);
+    }
+  } else {
+    // Row pass in SSE2.
+    vp10_idct4_sse2(inptr);
+
+    // Repeat the range test on the row results.
+    too_big = _mm_cmpgt_epi16(_mm_max_epi16(inptr[0], inptr[1]), upper);
+    too_small = _mm_cmplt_epi16(_mm_min_epi16(inptr[0], inptr[1]), lower);
+    out_of_range = _mm_movemask_epi8(_mm_or_si128(too_big, too_small));
+
+    if (!out_of_range) {
+      optimised_cols = 1;
+    } else {
+      // Hand over to the C column pass: transpose, then widen each lane
+      // to 32 bits by interleaving it with its sign mask, and fill out[].
+      transpose_4x4(inptr);
+      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
+      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
+      _mm_storeu_si128((__m128i *)out,
+                       _mm_unpacklo_epi16(inptr[0], sign_bits[0]));
+      _mm_storeu_si128((__m128i *)(out + 4),
+                       _mm_unpackhi_epi16(inptr[0], sign_bits[0]));
+      _mm_storeu_si128((__m128i *)(out + 8),
+                       _mm_unpacklo_epi16(inptr[1], sign_bits[1]));
+      _mm_storeu_si128((__m128i *)(out + 12),
+                       _mm_unpackhi_epi16(inptr[1], sign_bits[1]));
+    }
+  }
+
+  if (!optimised_cols) {
+    // Column pass in C.
+    tran_low_t temp_in[4], temp_out[4];
+    for (i = 0; i < 4; ++i) {
+      // Gather column i of the row-pass output.
+      for (j = 0; j < 4; ++j) {
+        temp_in[j] = out[j * 4 + i];
+      }
+      vp10_highbd_idct4_c(temp_in, temp_out, bd);
+      // Add the rounded result to the destination via highbd_clip_pixel_add.
+      for (j = 0; j < 4; ++j) {
+        uint16_t *const px = &dest[j * stride + i];
+        *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(temp_out[j], 4),
+                                    bd);
+      }
+    }
+  } else {
+    __m128i d0, d2;
+
+    // Column pass in SSE2.
+    vp10_idct4_sse2(inptr);
+
+    // Final round and shift: (x + 8) >> 4.
+    inptr[0] = _mm_srai_epi16(_mm_add_epi16(inptr[0], eight), 4);
+    inptr[1] = _mm_srai_epi16(_mm_add_epi16(inptr[1], eight), 4);
+
+    // Destination rows 0 and 1 go into d0, rows 2 and 3 into d2.
+    d0 = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((const __m128i *)dest),
+        _mm_loadl_epi64((const __m128i *)(dest + stride)));
+    d2 = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((const __m128i *)(dest + stride * 2)),
+        _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+
+    // Saturating add of the residual, then clamp via clamp_high_sse2.
+    d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
+    d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
+
+    // Store: low 64 bits hold the first row of each pair, high the second.
+    _mm_storel_epi64((__m128i *)dest, d0);
+    _mm_storel_epi64((__m128i *)(dest + stride), _mm_srli_si128(d0, 8));
+    _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+    _mm_storel_epi64((__m128i *)(dest + stride * 3), _mm_srli_si128(d2, 8));
+  }
+}
+
+// Adds the inverse 8x8 DCT of input to the 16-bit pixels behind dest8.
+// Each 1-D pass runs in 16-bit SSE2 when its inputs all lie within
+// +/-6201; a pass whose inputs exceed that uses vp10_highbd_idct8_c.
+//   input  - 64 coefficients, read eight per row
+//   dest8  - destination, converted with CONVERT_TO_SHORTPTR
+//   stride - distance between destination rows, in 16-bit pixels
+//   bd     - bit depth forwarded to clamp_high_sse2 and the C helpers
+void vp10_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i sixteen = _mm_set1_epi16(16);
+  const __m128i upper = _mm_set1_epi16(6201);
+  const __m128i lower = _mm_set1_epi16(-6201);
+  tran_low_t out[8 * 8];  // row-pass output consumed by the C column pass
+  __m128i inptr[8];
+  __m128i largest, smallest;
+  int optimised_cols = 0;
+  int out_of_range;
+  int i, j;
+
+  // Load each row of eight 32-bit coefficients, saturating them to 16 bits.
+  for (i = 0; i < 8; ++i) {
+    const tran_low_t *const row = input + 8 * i;
+    inptr[i] = _mm_packs_epi32(_mm_loadu_si128((const __m128i *)row),
+                               _mm_loadu_si128((const __m128i *)(row + 4)));
+  }
+
+  // Lane-wise extremes over all rows; a non-zero mask means out of range.
+  largest = inptr[0];
+  smallest = inptr[0];
+  for (i = 1; i < 8; ++i) {
+    largest = _mm_max_epi16(largest, inptr[i]);
+    smallest = _mm_min_epi16(smallest, inptr[i]);
+  }
+  out_of_range = _mm_movemask_epi8(_mm_or_si128(
+      _mm_cmpgt_epi16(largest, upper), _mm_cmplt_epi16(smallest, lower)));
+
+  if (out_of_range) {
+    // Row pass in C, one row of eight coefficients per call.
+    for (i = 0; i < 8; ++i) {
+      vp10_highbd_idct8_c(input + 8 * i, out + 8 * i, bd);
+    }
+  } else {
+    // Row pass in SSE2.
+    vp10_idct8_sse2(inptr);
+
+    // Repeat the range test on the row results.
+    largest = inptr[0];
+    smallest = inptr[0];
+    for (i = 1; i < 8; ++i) {
+      largest = _mm_max_epi16(largest, inptr[i]);
+      smallest = _mm_min_epi16(smallest, inptr[i]);
+    }
+    out_of_range = _mm_movemask_epi8(_mm_or_si128(
+        _mm_cmpgt_epi16(largest, upper), _mm_cmplt_epi16(smallest, lower)));
+
+    if (!out_of_range) {
+      optimised_cols = 1;
+    } else {
+      // Transpose, sign-extend each lane to 32 bits and fill out[] for C.
+      array_transpose_8x8(inptr, inptr);
+      for (i = 0; i < 8; ++i) {
+        const __m128i sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+        tran_low_t *const dst = out + 8 * i;
+        _mm_storeu_si128((__m128i *)dst,
+                         _mm_unpacklo_epi16(inptr[i], sign_bits));
+        _mm_storeu_si128((__m128i *)(dst + 4),
+                         _mm_unpackhi_epi16(inptr[i], sign_bits));
+      }
+    }
+  }
+
+  if (!optimised_cols) {
+    // Column pass in C: gather column i, transform, round, add and clip.
+    tran_low_t temp_in[8], temp_out[8];
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j) {
+        temp_in[j] = out[j * 8 + i];
+      }
+      vp10_highbd_idct8_c(temp_in, temp_out, bd);
+      for (j = 0; j < 8; ++j) {
+        uint16_t *const px = &dest[j * stride + i];
+        *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(temp_out[j], 5),
+                                    bd);
+      }
+    }
+  } else {
+    // Column pass in SSE2.
+    vp10_idct8_sse2(inptr);
+    // Per row: (x + 16) >> 5, saturating add to the pixels, clamp, store.
+    for (i = 0; i < 8; ++i) {
+      uint16_t *const px = dest + stride * i;
+      const __m128i res = _mm_srai_epi16(_mm_add_epi16(inptr[i], sixteen), 5);
+      const __m128i pixels = _mm_loadu_si128((const __m128i *)px);
+      _mm_storeu_si128((__m128i *)px,
+                       clamp_high_sse2(_mm_adds_epi16(pixels, res), bd));
+    }
+  }
+}
+
+void vp10_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[8 * 8] = { 0 };  // zeroed: the row pass fills rows 0-3 only
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[8];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i sixteen = _mm_set1_epi16(16);  // rounding term for the >> 5
+  const __m128i max = _mm_set1_epi16(6201);    // values beyond +/-6201 send a
+  const __m128i min = _mm_set1_epi16(-6201);   // pass to the C fallback below
+  int optimised_cols = 0;
+
+  // Load the 8x8 input and pack each row to 16 bits (signed saturation)
+  for (i = 0; i < 8; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  // Only the first 4 rows have non-zero coeffs, so only they are checked
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 4; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);  // non-zero if any lane is out of range
+
+  if (!test) {
+    // Do the row transform
+    vp10_idct8_sse2(inptr);
+
+    // Find the min & max for the column transform
+    // N.B. Only first 4 cols contain non-zero coeffs (all 8 regs are checked)
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 8; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      // Out of range: widen 4 transposed rows into out[] for the C column pass
+      array_transpose_4X8(inptr, inptr);
+      for (i = 0; i < 4; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // sign-extension mask
+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform (first 4 rows only)
+    for (i = 0; i < 4; ++i) {
+      vp10_highbd_idct8_c(input, outptr, bd);
+      input += 8;
+      outptr += 8;
+    }
+  }
+
+  if (optimised_cols) {
+    vp10_idct8_sse2(inptr);
+
+    // Final round & shift ((x + 16) >> 5), add to dest, clamp and store
+    {
+      __m128i d[8];
+      for (i = 0; i < 8; i++) {
+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        inptr[i] = _mm_srai_epi16(inptr[i], 5);
+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform on the 32-bit values in out[]
+    tran_low_t temp_in[8], temp_out[8];
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j * 8 + i];
+      vp10_highbd_idct8_c(temp_in, temp_out, bd);
+      for (j = 0; j < 8; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      }
+    }
+  }
+}
+
+void vp10_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                       int stride, int bd) {
+  tran_low_t out[16 * 16];  // filled by the row pass when the C column pass runs
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[32];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_set1_epi16(32);  // rounding term for the >> 6
+  const __m128i max = _mm_set1_epi16(3155);     // values beyond +/-3155 send a
+  const __m128i min = _mm_set1_epi16(-3155);    // pass to the C fallback below
+  int optimised_cols = 0;
+
+  // Load input & pack to 16 bits: inptr[i] = cols 0-7, inptr[i + 16] = cols 8-15
+  for (i = 0; i < 16; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 32; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);  // non-zero if any lane is out of range
+
+  if (!test) {
+    // Do the row transform
+    vp10_idct16_sse2(inptr, inptr + 16);
+
+    // Find the min & max for the column transform
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 32; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {  // out of range: widen to 32 bits in out[] for the C pass
+      array_transpose_16x16(inptr, inptr + 16);
+      for (i = 0; i < 16; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // sign-extension mask
+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 16; ++i) {
+      vp10_highbd_idct16_c(input, outptr, bd);
+      input += 16;
+      outptr += 16;
+    }
+  }
+
+  if (optimised_cols) {
+    vp10_idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift ((x + 32) >> 6), add to dest, clamp and store
+    {
+      __m128i d[2];
+      for (i = 0; i < 16; i++) {
+        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
+        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
+        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform on the 32-bit values in out[]
+    tran_low_t temp_in[16], temp_out[16];
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j * 16 + i];
+      vp10_highbd_idct16_c(temp_in, temp_out, bd);
+      for (j = 0; j < 16; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      }
+    }
+  }
+}
+
+void vp10_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                      int stride, int bd) {
+  tran_low_t out[16 * 16] = { 0 };  // zeroed: the row pass fills rows 0-3 only
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[32];
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_set1_epi16(32);  // rounding term for the >> 6
+  const __m128i max = _mm_set1_epi16(3155);     // values beyond +/-3155 send a
+  const __m128i min = _mm_set1_epi16(-3155);    // pass to the C fallback below
+  int optimised_cols = 0;
+
+  // Load input & pack to 16 bits: inptr[i] = cols 0-7, inptr[i + 16] = cols 8-15
+  for (i = 0; i < 16; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  // Since all non-zero dct coefficients are in upper-left 4x4 area,
+  // we only need to consider first 4 rows here.
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 4; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);  // non-zero if any lane is out of range
+
+  if (!test) {
+    // Do the row transform (N.B. This transposes inptr)
+    vp10_idct16_sse2(inptr, inptr + 16);
+
+    // Find the min & max for the column transform
+    // N.B. Only first 4 cols contain non-zero coeffs (regs 0-15 are checked)
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 16; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      // Use fact only first 4 rows contain non-zero coeffs
+      array_transpose_8x8(inptr, inptr);
+      array_transpose_8x8(inptr + 8, inptr + 16);
+      for (i = 0; i < 4; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // sign-extension mask
+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform (first 4 rows only)
+    for (i = 0; i < 4; ++i) {
+      vp10_highbd_idct16_c(input, outptr, bd);
+      input += 16;
+      outptr += 16;
+    }
+  }
+
+  if (optimised_cols) {
+    vp10_idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift ((x + 32) >> 6), add to dest, clamp and store
+    {
+      __m128i d[2];
+      for (i = 0; i < 16; i++) {
+        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
+        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
+        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform on the 32-bit values in out[]
+    tran_low_t temp_in[16], temp_out[16];
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j * 16 + i];
+      vp10_highbd_idct16_c(temp_in, temp_out, bd);
+      for (j = 0; j < 16; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      }
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vp10/common/x86/vp10_inv_txfm_sse2.h b/libs/libvpx/vp10/common/x86/vp10_inv_txfm_sse2.h
new file mode 100644
index 0000000000..b79781aeeb
--- /dev/null
+++ b/libs/libvpx/vp10/common/x86/vp10_inv_txfm_sse2.h
@@ -0,0 +1,184 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_
+#define VPX_DSP_X86_INV_TXFM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp10/common/vp10_inv_txfm.h"
+
+// Transposes the 8x8 block of 16-bit lanes in in[0..7] into res[0..7].
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+  // Second stage: interleave the 16-bit pairs above as 32-bit units.
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+  // Every in[] read is done by now, so res may be the same array as in.
+  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
+  {                                                     \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    /* NOTE(review): results land in in0/in1; out0/out1 are never used. */ \
+    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
+    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
+  }
+
+static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  // Only the low four 16-bit lanes of each in[0..7] were read above.
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  // Only out[0..3] are written; every in[] read is done, so out may alias in.
+  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+}
+
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+  // In-place transpose of a 16x16 block of 16-bit lanes stored as two
+  // 16-register halves, res0[0..15] and res1[0..15], via four 8x8 tiles.
+  __m128i parked[8];
+  int k;
+
+  array_transpose_8x8(res0, res0);          // diagonal tile, in place
+  array_transpose_8x8(res1, parked);        // res1[0..7] -> scratch
+  array_transpose_8x8(res0 + 8, res1);      // res0[8..15] -> res1[0..7]
+  array_transpose_8x8(res1 + 8, res1 + 8);  // diagonal tile, in place
+
+  // The scratch tile takes the slot that res0[8..15] just vacated.
+  for (k = 0; k < 8; ++k) {
+    res0[8 + k] = parked[k];
+  }
+}
+
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
+  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
+  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
+  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
+  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
+  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
+  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
+  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
+  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));
+  // Aligned loads: each input + k * 16 address must be 16-byte aligned.
+  in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
+  in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
+  in[10]  = _mm_load_si128((const __m128i *)(input + 10 * 16));
+  in[11]  = _mm_load_si128((const __m128i *)(input + 11 * 16));
+  in[12]  = _mm_load_si128((const __m128i *)(input + 12 * 16));
+  in[13]  = _mm_load_si128((const __m128i *)(input + 13 * 16));
+  in[14]  = _mm_load_si128((const __m128i *)(input + 14 * 16));
+  in[15]  = _mm_load_si128((const __m128i *)(input + 15 * 16));
+}
+
+#define RECON_AND_STORE(dest, in_x) \
+  { /* expects a zeroed __m128i named `zero` in the expanding scope */ \
+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); /* 8 bytes of dest */ \
+      d0 = _mm_unpacklo_epi8(d0, zero); /* widen bytes to 16-bit lanes */ \
+      d0 = _mm_add_epi16(in_x, d0); \
+      d0 = _mm_packus_epi16(d0, d0); /* saturate back to unsigned bytes */ \
+      _mm_storel_epi64((__m128i *)(dest), d0); \
+  }
+
+static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
+  const __m128i final_rounding = _mm_set1_epi16(1<<5);
+  const __m128i zero = _mm_setzero_si128();  // referenced by RECON_AND_STORE
+  // Final rounding and shift: (x + 32) >> 6, using a saturating add
+  in[0] = _mm_adds_epi16(in[0], final_rounding);
+  in[1] = _mm_adds_epi16(in[1], final_rounding);
+  in[2] = _mm_adds_epi16(in[2], final_rounding);
+  in[3] = _mm_adds_epi16(in[3], final_rounding);
+  in[4] = _mm_adds_epi16(in[4], final_rounding);
+  in[5] = _mm_adds_epi16(in[5], final_rounding);
+  in[6] = _mm_adds_epi16(in[6], final_rounding);
+  in[7] = _mm_adds_epi16(in[7], final_rounding);
+  in[8] = _mm_adds_epi16(in[8], final_rounding);
+  in[9] = _mm_adds_epi16(in[9], final_rounding);
+  in[10] = _mm_adds_epi16(in[10], final_rounding);
+  in[11] = _mm_adds_epi16(in[11], final_rounding);
+  in[12] = _mm_adds_epi16(in[12], final_rounding);
+  in[13] = _mm_adds_epi16(in[13], final_rounding);
+  in[14] = _mm_adds_epi16(in[14], final_rounding);
+  in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+  in[0] = _mm_srai_epi16(in[0], 6);
+  in[1] = _mm_srai_epi16(in[1], 6);
+  in[2] = _mm_srai_epi16(in[2], 6);
+  in[3] = _mm_srai_epi16(in[3], 6);
+  in[4] = _mm_srai_epi16(in[4], 6);
+  in[5] = _mm_srai_epi16(in[5], 6);
+  in[6] = _mm_srai_epi16(in[6], 6);
+  in[7] = _mm_srai_epi16(in[7], 6);
+  in[8] = _mm_srai_epi16(in[8], 6);
+  in[9] = _mm_srai_epi16(in[9], 6);
+  in[10] = _mm_srai_epi16(in[10], 6);
+  in[11] = _mm_srai_epi16(in[11], 6);
+  in[12] = _mm_srai_epi16(in[12], 6);
+  in[13] = _mm_srai_epi16(in[13], 6);
+  in[14] = _mm_srai_epi16(in[14], 6);
+  in[15] = _mm_srai_epi16(in[15], 6);
+  // Add in[k] to the 8 bytes at dest + k * stride and store them back.
+  RECON_AND_STORE(dest +  0 * stride, in[0]);
+  RECON_AND_STORE(dest +  1 * stride, in[1]);
+  RECON_AND_STORE(dest +  2 * stride, in[2]);
+  RECON_AND_STORE(dest +  3 * stride, in[3]);
+  RECON_AND_STORE(dest +  4 * stride, in[4]);
+  RECON_AND_STORE(dest +  5 * stride, in[5]);
+  RECON_AND_STORE(dest +  6 * stride, in[6]);
+  RECON_AND_STORE(dest +  7 * stride, in[7]);
+  RECON_AND_STORE(dest +  8 * stride, in[8]);
+  RECON_AND_STORE(dest +  9 * stride, in[9]);
+  RECON_AND_STORE(dest + 10 * stride, in[10]);
+  RECON_AND_STORE(dest + 11 * stride, in[11]);
+  RECON_AND_STORE(dest + 12 * stride, in[12]);
+  RECON_AND_STORE(dest + 13 * stride, in[13]);
+  RECON_AND_STORE(dest + 14 * stride, in[14]);
+  RECON_AND_STORE(dest + 15 * stride, in[15]);
+}
+
+void idct4_sse2(__m128i *in);
+void idct8_sse2(__m128i *in);
+void idct16_sse2(__m128i *in0, __m128i *in1);
+void iadst4_sse2(__m128i *in);
+void iadst8_sse2(__m128i *in);
+void iadst16_sse2(__m128i *in0, __m128i *in1);
+
+#endif  // VPX_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/libs/libvpx/vp10/decoder/decodeframe.c b/libs/libvpx/vp10/decoder/decodeframe.c
new file mode 100644
index 0000000000..1c3f182390
--- /dev/null
+++ b/libs/libvpx/vp10/decoder/decodeframe.c
@@ -0,0 +1,2433 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdlib.h>  // qsort()
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_dsp/bitreader_buffer.h"
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/mem_ops.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_thread.h"
+
+#include "vp10/common/alloccommon.h"
+#include "vp10/common/common.h"
+#include "vp10/common/entropy.h"
+#include "vp10/common/entropymode.h"
+#include "vp10/common/idct.h"
+#include "vp10/common/thread_common.h"
+#include "vp10/common/pred_common.h"
+#include "vp10/common/quant_common.h"
+#include "vp10/common/reconintra.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/common/seg_common.h"
+#include "vp10/common/tile_common.h"
+
+#include "vp10/decoder/decodeframe.h"
+#include "vp10/decoder/detokenize.h"
+#include "vp10/decoder/decodemv.h"
+#include "vp10/decoder/decoder.h"
+#include "vp10/decoder/dsubexp.h"
+
+#define MAX_VP9_HEADER_SIZE 80
+
+static int is_compound_reference_allowed(const VP10_COMMON *cm) {
+  // Returns 1 only for frames that are not intra-only and in which some
+  // reference slot 2..REFS_PER_FRAME has a sign bias different from slot 1.
+  int ref;
+  if (!frame_is_intra_only(cm)) {
+    for (ref = 2; ref <= REFS_PER_FRAME; ++ref)
+      if (cm->ref_frame_sign_bias[ref] != cm->ref_frame_sign_bias[1]) return 1;
+  }
+  return 0;
+}
+
+static void setup_compound_reference_mode(VP10_COMMON *cm) {
+  // LAST_FRAME pairs with GOLDEN_FRAME (tried first) or ALTREF_FRAME when
+  // their sign biases agree, leaving the third frame fixed; otherwise
+  // LAST_FRAME is fixed and GOLDEN_FRAME/ALTREF_FRAME form the variable pair.
+  const int last_bias = cm->ref_frame_sign_bias[LAST_FRAME];
+  const int golden_eq = cm->ref_frame_sign_bias[GOLDEN_FRAME] == last_bias;
+  const int altref_eq = cm->ref_frame_sign_bias[ALTREF_FRAME] == last_bias;
+
+  if (golden_eq)
+    cm->comp_fixed_ref = ALTREF_FRAME;
+  else if (altref_eq)
+    cm->comp_fixed_ref = GOLDEN_FRAME;
+  else
+    cm->comp_fixed_ref = LAST_FRAME;
+  cm->comp_var_ref[0] = (golden_eq || altref_eq) ? LAST_FRAME : GOLDEN_FRAME;
+  cm->comp_var_ref[1] = golden_eq ? GOLDEN_FRAME : ALTREF_FRAME;
+}
+
+static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
+  return !(len == 0 || len > (size_t)(end - start));  // non-empty and in range
+}
+
+static int decode_unsigned_max(struct vpx_read_bit_buffer *rb, int max) {
+  const int raw = vpx_rb_read_literal(rb, get_unsigned_bits(max));
+  return raw <= max ? raw : max;  // cap the decoded literal at max
+}
+
+#if CONFIG_MISC_FIXES
+static TX_MODE read_tx_mode(struct vpx_read_bit_buffer *rb) {
+  if (vpx_rb_read_bit(rb)) return TX_MODE_SELECT;  // flag bit set
+  return vpx_rb_read_literal(rb, 2);               // else a 2-bit literal
+}
+#else
+static TX_MODE read_tx_mode(vpx_reader *r) {
+  const TX_MODE base = vpx_read_literal(r, 2);
+  // ALLOW_32X32 is followed by one extra bit that is added to the mode.
+  return base == ALLOW_32X32 ? base + vpx_read_bit(r) : base;
+}
+#endif
+
+static void read_tx_mode_probs(struct tx_probs *tx_probs, vpx_reader *r) {
+  int ctx, k;  // read order: every p8x8 entry, then p16x16, then p32x32
+  for (ctx = 0; ctx < TX_SIZE_CONTEXTS; ++ctx) {
+    for (k = 0; k < TX_SIZES - 3; ++k)
+      vp10_diff_update_prob(r, &tx_probs->p8x8[ctx][k]);
+  }
+  for (ctx = 0; ctx < TX_SIZE_CONTEXTS; ++ctx) {
+    for (k = 0; k < TX_SIZES - 2; ++k)
+      vp10_diff_update_prob(r, &tx_probs->p16x16[ctx][k]);
+  }
+  for (ctx = 0; ctx < TX_SIZE_CONTEXTS; ++ctx) {
+    for (k = 0; k < TX_SIZES - 1; ++k)
+      vp10_diff_update_prob(r, &tx_probs->p32x32[ctx][k]);
+  }
+}
+
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+  int ctx, f;  // one vp10_diff_update_prob() call per (context, filter) entry
+  for (ctx = 0; ctx < SWITCHABLE_FILTER_CONTEXTS; ++ctx)
+    for (f = 0; f < SWITCHABLE_FILTERS - 1; ++f)
+      vp10_diff_update_prob(r, &fc->switchable_interp_prob[ctx][f]);
+}
+
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+  int ctx, m;  // one vp10_diff_update_prob() call per (context, mode) entry
+  for (ctx = 0; ctx < INTER_MODE_CONTEXTS; ++ctx)
+    for (m = 0; m < INTER_MODES - 1; ++m)
+      vp10_diff_update_prob(r, &fc->inter_mode_probs[ctx][m]);
+}
+
+#if CONFIG_MISC_FIXES
+static REFERENCE_MODE read_frame_reference_mode(const VP10_COMMON *cm,
+    struct vpx_read_bit_buffer *rb) {
+  // No bits are read unless compound references are allowed.
+  if (!is_compound_reference_allowed(cm)) return SINGLE_REFERENCE;
+
+  // First bit set: REFERENCE_MODE_SELECT. Otherwise a second bit chooses
+  // COMPOUND_REFERENCE (set) or SINGLE_REFERENCE (clear).
+  if (vpx_rb_read_bit(rb)) return REFERENCE_MODE_SELECT;
+  return vpx_rb_read_bit(rb) ? COMPOUND_REFERENCE : SINGLE_REFERENCE;
+}
+#else
+static REFERENCE_MODE read_frame_reference_mode(const VP10_COMMON *cm,
+                                                vpx_reader *r) {
+  // No bits are read unless compound references are allowed.
+  if (!is_compound_reference_allowed(cm)) return SINGLE_REFERENCE;
+
+  // First bit clear: SINGLE_REFERENCE. Otherwise a second bit chooses
+  // REFERENCE_MODE_SELECT (set) or COMPOUND_REFERENCE (clear).
+  if (!vpx_read_bit(r)) return SINGLE_REFERENCE;
+  return vpx_read_bit(r) ? REFERENCE_MODE_SELECT : COMPOUND_REFERENCE;
+}
+#endif
+
+static void read_frame_reference_mode_probs(VP10_COMMON *cm, vpx_reader *r) {
+  FRAME_CONTEXT *const fc = cm->fc;
+  const REFERENCE_MODE mode = cm->reference_mode;
+  int ctx, k;  // tables are read in order: comp_inter, single_ref, comp_ref
+  if (mode == REFERENCE_MODE_SELECT) {
+    for (ctx = 0; ctx < COMP_INTER_CONTEXTS; ++ctx)
+      vp10_diff_update_prob(r, &fc->comp_inter_prob[ctx]);
+  }
+  if (mode != COMPOUND_REFERENCE) {
+    for (ctx = 0; ctx < REF_CONTEXTS; ++ctx)
+      for (k = 0; k < 2; ++k)
+        vp10_diff_update_prob(r, &fc->single_ref_prob[ctx][k]);
+  }
+  if (mode != SINGLE_REFERENCE) {
+    for (ctx = 0; ctx < REF_CONTEXTS; ++ctx)
+      vp10_diff_update_prob(r, &fc->comp_ref_prob[ctx]);
+  }
+}
+
+static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) {
+  int k;  // one update decision is read for each of p[0..n-1]
+  for (k = 0; k < n; ++k) {
+#if CONFIG_MISC_FIXES
+    vp10_diff_update_prob(r, p + k);
+#else
+    if (vpx_read(r, MV_UPDATE_PROB)) p[k] = (vpx_read_literal(r, 7) << 1) | 1;
+#endif
+  }
+}
+
+static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) {
+  int axis, c;
+
+  // Joint probabilities come first.
+  update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
+  // Then sign, classes, class0 and bits for both components.
+  for (axis = 0; axis < 2; ++axis) {
+    nmv_component *const comp = &ctx->comps[axis];
+    update_mv_probs(&comp->sign, 1, r);
+    update_mv_probs(comp->classes, MV_CLASSES - 1, r);
+    update_mv_probs(comp->class0, CLASS0_SIZE - 1, r);
+    update_mv_probs(comp->bits, MV_OFFSET_BITS, r);
+  }
+  // Then the class0_fp and fp tables for both components.
+  for (axis = 0; axis < 2; ++axis) {
+    nmv_component *const comp = &ctx->comps[axis];
+    for (c = 0; c < CLASS0_SIZE; ++c)
+      update_mv_probs(comp->class0_fp[c], MV_FP_SIZE - 1, r);
+    update_mv_probs(comp->fp, 3, r);
+  }
+  // class0_hp and hp are read only when allow_hp is set.
+  if (!allow_hp) return;
+  for (axis = 0; axis < 2; ++axis) {
+    nmv_component *const comp = &ctx->comps[axis];
+    update_mv_probs(&comp->class0_hp, 1, r);
+    update_mv_probs(&comp->hp, 1, r);
+  }
+}
+
+static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
+                                          const TX_SIZE tx_size,
+                                          uint8_t *dst, int stride,
+                                          int eob, int block) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block);
+  const int seg_id = xd->mi[0]->mbmi.segment_id;
+  tran_low_t *const dqcoeff = pd->dqcoeff;
+  int num_cleared;
+  if (eob <= 0) return;  // nothing to add and nothing to reset
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_4X4:
+        vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, xd->bd,
+                                     tx_type, xd->lossless[seg_id]);
+        break;
+      case TX_8X8:
+        vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, xd->bd,
+                                     tx_type);
+        break;
+      case TX_16X16:
+        vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, xd->bd,
+                                       tx_type);
+        break;
+      case TX_32X32:
+        vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, xd->bd,
+                                       tx_type);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+        return;
+    }
+  } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    switch (tx_size) {
+      case TX_4X4:
+        vp10_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, tx_type,
+                              xd->lossless[seg_id]);
+        break;
+      case TX_8X8:
+        vp10_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, tx_type);
+        break;
+      case TX_16X16:
+        vp10_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, tx_type);
+        break;
+      case TX_32X32:
+        vp10_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, tx_type);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+        return;
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Zero the dequantized coefficients; each case picks how many entries.
+  if (eob == 1)
+    num_cleared = 1;
+  else if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
+    num_cleared = 4 * (4 << tx_size);
+  else if (tx_size == TX_32X32 && eob <= 34)
+    num_cleared = 256;
+  else
+    num_cleared = 16 << (tx_size << 1);
+  memset(dqcoeff, 0, num_cleared * sizeof(dqcoeff[0]));
+}
+// Adds the inverse transform of pd->dqcoeff to dst, then clears dqcoeff.
+static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
+                                          const TX_TYPE tx_type,
+                                          const TX_SIZE tx_size,
+                                          uint8_t *dst, int stride,
+                                          int eob) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int seg_id = xd->mi[0]->mbmi.segment_id;
+  if (eob > 0) {  // nothing is transformed or cleared when eob <= 0
+    tran_low_t *const dqcoeff = pd->dqcoeff;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {  // highbd kernels
+      switch (tx_size) {
+        case TX_4X4:  // only the 4x4 kernels receive the lossless flag
+          vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, xd->bd,
+                                       tx_type, xd->lossless[seg_id]);
+          break;
+        case TX_8X8:
+          vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, xd->bd,
+                                       tx_type);
+          break;
+        case TX_16X16:
+          vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, xd->bd,
+                                         tx_type);
+          break;
+        case TX_32X32:
+          vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, xd->bd,
+                                         tx_type);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
+          return;  // leaves dqcoeff untouched
+      }
+    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      switch (tx_size) {
+        case TX_4X4:
+          vp10_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, tx_type,
+                                xd->lossless[seg_id]);
+          break;
+        case TX_8X8:
+          vp10_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, tx_type);
+          break;
+        case TX_16X16:
+          vp10_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, tx_type);
+          break;
+        case TX_32X32:
+          vp10_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, tx_type);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
+          return;  // leaves dqcoeff untouched
+      }
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    // Zero dqcoeff again; the span depends on eob, tx_type and tx_size.
+    if (eob == 1) {
+      dqcoeff[0] = 0;
+    } else {
+      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
+        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+      else if (tx_size == TX_32X32 && eob <= 34)
+        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+      else  // presumably the whole block of coefficients -- TODO confirm
+        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+    }
+  }
+}
+
+// Intra-predicts one transform block in place and, unless the block is
+// marked as skipped, decodes its tokens and adds the residual to it.
+static void predict_and_reconstruct_intra_block(
+    MACROBLOCKD *const xd, vpx_reader *r, MB_MODE_INFO *const mbmi, int plane,
+    int row, int col, TX_SIZE tx_size) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int is_luma = (plane == 0);
+  const int block_idx = (row << 1) + col;
+  uint8_t *const dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
+  PREDICTION_MODE mode;
+  // Luma of a sub-8x8 block takes its mode from the per-sub-block bmi[].
+  if (is_luma && mbmi->sb_type < BLOCK_8X8)
+    mode = xd->mi[0]->bmi[block_idx].as_mode;
+  else
+    mode = is_luma ? mbmi->mode : mbmi->uv_mode;
+
+  vp10_predict_intra_block(xd, pd->n4_wl, pd->n4_hl, tx_size, mode,
+                           dst, pd->dst.stride, dst, pd->dst.stride,
+                           col, row, plane);
+
+  if (!mbmi->skip) {
+    const PLANE_TYPE plane_type = is_luma ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+    const TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx);
+    const scan_order *const sc = get_scan(tx_size, tx_type);
+    const int eob = vp10_decode_block_tokens(xd, plane, sc, col, row, tx_size,
+                                             r, mbmi->segment_id);
+    inverse_transform_block_intra(xd, plane, tx_type, tx_size, dst,
+                                  pd->dst.stride, eob);
+  }
+}
+
+// Decodes the tokens of one inter transform block, passes them on to
+// inverse_transform_block_inter(), and returns the block's eob.
+static int reconstruct_inter_block(MACROBLOCKD *const xd, vpx_reader *r,
+                                   MB_MODE_INFO *const mbmi, int plane,
+                                   int row, int col, TX_SIZE tx_size) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int block_idx = (row << 1) + col;
+  const PLANE_TYPE plane_type = plane ? PLANE_TYPE_UV : PLANE_TYPE_Y;
+  const TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx);
+  const scan_order *const sc = get_scan(tx_size, tx_type);
+  const int eob = vp10_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
+                                           mbmi->segment_id);
+  uint8_t *const dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
+  inverse_transform_block_inter(xd, plane, tx_size, dst, pd->dst.stride, eob,
+                                block_idx);
+  return eob;
+}
+
+// Copies the b_w x b_h block whose top-left corner is (x, y) of a w x h
+// plane into dst, repeating the nearest edge pixel for positions outside
+// the plane. src points at (x, y) itself, which may lie outside the plane.
+static void build_mc_border(const uint8_t *src, int src_stride,
+                            uint8_t *dst, int dst_stride,
+                            int x, int y, int b_w, int b_h, int w, int h) {
+  // The left / copied / right split is identical for every row.
+  int pad_left = x < 0 ? -x : 0;
+  int pad_right = x + b_w > w ? x + b_w - w : 0;
+  int body;
+  int rows_left = b_h;
+  // Rewind src to pixel (0, 0) of the plane.
+  const uint8_t *line = src - x - y * src_stride;
+
+  if (pad_left > b_w)
+    pad_left = b_w;
+  if (pad_right > b_w)
+    pad_right = b_w;
+  body = b_w - pad_left - pad_right;
+
+  // Move to the first row to read: y clamped to [0, h - 1].
+  if (y >= h)
+    line += (h - 1) * src_stride;
+  else if (y > 0)
+    line += y * src_stride;
+
+  do {
+    if (pad_left)
+      memset(dst, line[0], pad_left);
+    if (body)
+      memcpy(dst + pad_left, line + x + pad_left, body);
+    if (pad_right)
+      memset(dst + pad_left + body, line[w - 1], pad_right);
+
+    dst += dst_stride;
+    ++y;
+
+    // Advance only while inside the plane; otherwise repeat the edge row.
+    if (y > 0 && y < h)
+      line += src_stride;
+  } while (--rows_left);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// 16-bit counterpart of build_mc_border(): src8 is turned into a uint16_t
+// pointer with CONVERT_TO_SHORTPTR and dst receives uint16_t samples, with
+// the nearest edge sample repeated for positions outside the w x h plane.
+static void high_build_mc_border(const uint8_t *src8, int src_stride,
+                                 uint16_t *dst, int dst_stride,
+                                 int x, int y, int b_w, int b_h,
+                                 int w, int h) {
+  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+  // The left / copied / right split is identical for every row.
+  int pad_left = x < 0 ? -x : 0;
+  int pad_right = x + b_w > w ? x + b_w - w : 0;
+  int body;
+  int rows_left = b_h;
+  // Rewind src to sample (0, 0) of the plane.
+  const uint16_t *line = src - x - y * src_stride;
+
+  if (pad_left > b_w)
+    pad_left = b_w;
+  if (pad_right > b_w)
+    pad_right = b_w;
+  body = b_w - pad_left - pad_right;
+
+  // Move to the first row to read: y clamped to [0, h - 1].
+  if (y >= h)
+    line += (h - 1) * src_stride;
+  else if (y > 0)
+    line += y * src_stride;
+
+  do {
+    if (pad_left)
+      vpx_memset16(dst, line[0], pad_left);
+    if (body)
+      memcpy(dst + pad_left, line + x + pad_left, body * sizeof(uint16_t));
+    if (pad_right)
+      vpx_memset16(dst + pad_left + body, line[w - 1], pad_right);
+
+    dst += dst_stride;
+    ++y;
+
+    // Advance only while inside the plane; otherwise repeat the edge row.
+    if (y > 0 && y < h)
+      line += src_stride;
+  } while (--rows_left);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+// Border-extends a reference block into a local buffer and predicts from it.
+#if CONFIG_VP9_HIGHBITDEPTH
+static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
+                               int x0, int y0, int b_w, int b_h,
+                               int frame_width, int frame_height,
+                               int border_offset,
+                               uint8_t *const dst, int dst_buf_stride,
+                               int subpel_x, int subpel_y,
+                               const InterpKernel *kernel,
+                               const struct scale_factors *sf,
+                               MACROBLOCKD *xd,
+                               int w, int h, int ref, int xs, int ys) {
+  DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
+  const uint8_t *buf_ptr;  // scratch buffer + border_offset
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w,
+                         x0, y0, b_w, b_h, frame_width, frame_height);
+    buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset;
+  } else {  // 8-bit frame: the same scratch storage is used as bytes
+    build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w,
+                    x0, y0, b_w, b_h, frame_width, frame_height);
+    buf_ptr = ((uint8_t *)mc_buf_high) + border_offset;
+  }
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {  // scratch stride is b_w
+    high_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+  } else {
+    inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+                    subpel_y, sf, w, h, ref, kernel, xs, ys);
+  }
+}
+#else  // 8-bit only build: same steps, without the xd parameter
+static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
+                               int x0, int y0, int b_w, int b_h,
+                               int frame_width, int frame_height,
+                               int border_offset,
+                               uint8_t *const dst, int dst_buf_stride,
+                               int subpel_x, int subpel_y,
+                               const InterpKernel *kernel,
+                               const struct scale_factors *sf,
+                               int w, int h, int ref, int xs, int ys) {
+  DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
+  const uint8_t *buf_ptr;  // scratch buffer + border_offset
+
+  build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w,
+                  x0, y0, b_w, b_h, frame_width, frame_height);
+  buf_ptr = mc_buf + border_offset;
+
+  inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+                  subpel_y, sf, w, h, ref, kernel, xs, ys);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+// Predicts one w x h block of a plane from one reference frame into dst_buf.
+static void dec_build_inter_predictors(VP10Decoder *const pbi, MACROBLOCKD *xd,
+                                       int plane, int bw, int bh, int x,
+                                       int y, int w, int h, int mi_x, int mi_y,
+                                       const InterpKernel *kernel,
+                                       const struct scale_factors *sf,
+                                       struct buf_2d *pre_buf,
+                                       struct buf_2d *dst_buf, const MV* mv,
+                                       RefCntBuffer *ref_frame_buf,
+                                       int is_scaled, int ref) {
+  VP10_COMMON *const cm = &pbi->common;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+  MV32 scaled_mv;
+  int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
+      buf_stride, subpel_x, subpel_y;
+  uint8_t *ref_frame, *buf_ptr;
+
+  // Get reference frame pointer, width and height.
+  if (plane == 0) {
+    frame_width = ref_frame_buf->buf.y_crop_width;
+    frame_height = ref_frame_buf->buf.y_crop_height;
+    ref_frame = ref_frame_buf->buf.y_buffer;
+  } else {
+    frame_width = ref_frame_buf->buf.uv_crop_width;
+    frame_height = ref_frame_buf->buf.uv_crop_height;
+    ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
+                         : ref_frame_buf->buf.v_buffer;
+  }
+
+  if (is_scaled) {
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh,
+                                               pd->subsampling_x,
+                                               pd->subsampling_y);
+    // Co-ordinate of containing block to pixel precision.
+    int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+    int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+
+    // Co-ordinate of the block to 1/16th pixel precision.
+    x0_16 = (x_start + x) << SUBPEL_BITS;
+    y0_16 = (y_start + y) << SUBPEL_BITS;
+
+    // Co-ordinate of current block in reference frame
+    // to 1/16th pixel precision.
+    x0_16 = sf->scale_value_x(x0_16, sf);
+    y0_16 = sf->scale_value_y(y0_16, sf);
+
+    // Map the top left corner of the block into the reference frame.
+    x0 = sf->scale_value_x(x_start + x, sf);
+    y0 = sf->scale_value_y(y_start + y, sf);
+
+    // Scale the MV and incorporate the sub-pixel offset of the block
+    // in the reference frame.
+    scaled_mv = vp10_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+    xs = sf->x_step_q4;
+    ys = sf->y_step_q4;
+  } else {
+    // Co-ordinate of the block itself (containing block + x/y), in pixels.
+    x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+    y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+    // Co-ordinate of the block to 1/16th pixel precision.
+    x0_16 = x0 << SUBPEL_BITS;
+    y0_16 = y0 << SUBPEL_BITS;
+
+    scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y));
+    scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x));
+    xs = ys = 16;  // unscaled step; 16 presumably == SUBPEL_SHIFTS
+  }
+  subpel_x = scaled_mv.col & SUBPEL_MASK;  // fractional part of the MV
+  subpel_y = scaled_mv.row & SUBPEL_MASK;
+
+  // Calculate the top left corner of the best matching block in the
+  // reference frame.
+  x0 += scaled_mv.col >> SUBPEL_BITS;
+  y0 += scaled_mv.row >> SUBPEL_BITS;
+  x0_16 += scaled_mv.col;
+  y0_16 += scaled_mv.row;
+
+  // Get reference block pointer.
+  buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+  buf_stride = pre_buf->stride;
+
+  // Do border extension if there is motion or the
+  // width/height is not a multiple of 8 pixels.
+  if (is_scaled || scaled_mv.col || scaled_mv.row ||
+      (frame_width & 0x7) || (frame_height & 0x7)) {
+    int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;  // bottom row
+
+    // Get reference block bottom right horizontal coordinate.
+    int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
+    int x_pad = 0, y_pad = 0;  // set when the block is widened for filtering
+
+    if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+      x0 -= VP9_INTERP_EXTEND - 1;
+      x1 += VP9_INTERP_EXTEND;
+      x_pad = 1;
+    }
+
+    if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+      y0 -= VP9_INTERP_EXTEND - 1;
+      y1 += VP9_INTERP_EXTEND;
+      y_pad = 1;
+    }
+
+    // Wait until reference block is ready. Pad 7 more pixels as last 7
+    // pixels of each superblock row can be changed by next superblock row.
+    if (cm->frame_parallel_decode)
+      vp10_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+                            VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+
+    // Skip border extension if block is inside the frame.
+    if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
+        y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
+      // Extend the border. The 3s presumably equal VP9_INTERP_EXTEND - 1.
+      const uint8_t *const buf_ptr1 = ref_frame + y0 * buf_stride + x0;
+      const int b_w = x1 - x0 + 1;
+      const int b_h = y1 - y0 + 1;
+      const int border_offset = y_pad * 3 * b_w + x_pad * 3;
+
+      extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h,
+                         frame_width, frame_height, border_offset,
+                         dst, dst_buf->stride,
+                         subpel_x, subpel_y,
+                         kernel, sf,
+#if CONFIG_VP9_HIGHBITDEPTH
+                         xd,
+#endif
+                         w, h, ref, xs, ys);
+      return;  // prediction was made from the extended copy
+    }
+  } else {  // unscaled, zero MV, frame size a multiple of 8
+    // Wait until reference block is ready. Pad 7 more pixels as last 7
+    // pixels of each superblock row can be changed by next superblock row.
+     if (cm->frame_parallel_decode) {
+       const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
+       vp10_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+                             VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+     }
+  }
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+  } else {
+    inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                    subpel_y, sf, w, h, ref, kernel, xs, ys);
+  }
+#else
+  inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                  subpel_y, sf, w, h, ref, kernel, xs, ys);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+// Builds the inter prediction of the current block: for every plane and
+// each of its one or two references, calls dec_build_inter_predictors().
+static void dec_build_inter_predictors_sb(VP10Decoder *const pbi,
+                                          MACROBLOCKD *xd,
+                                          int mi_row, int mi_col) {
+  const MODE_INFO *const mi = xd->mi[0];
+  const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
+  const InterpKernel *kernel = vp10_filter_kernels[mi->mbmi.interp_filter];
+  const int num_refs = 1 + has_second_ref(&mi->mbmi);
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+  int plane, ref;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    struct buf_2d *const dst_buf = &pd->dst;
+    // Four times the plane's n4_w / n4_h counts.
+    const int plane_w = 4 * pd->n4_w;
+    const int plane_h = 4 * pd->n4_h;
+
+    for (ref = 0; ref < num_refs; ++ref) {
+      const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+      struct buf_2d *const pre_buf = &pd->pre[ref];
+      RefCntBuffer *const ref_frame_buf =
+          &pbi->common.buffer_pool->frame_bufs[xd->block_refs[ref]->idx];
+      const int is_scaled = vp10_is_scaled(sf);
+
+      if (sb_type >= BLOCK_8X8) {
+        // One motion vector covers the whole block.
+        const MV mv = mi->mbmi.mv[ref].as_mv;
+        dec_build_inter_predictors(pbi, xd, plane, plane_w, plane_h, 0, 0,
+                                   plane_w, plane_h, mi_x, mi_y, kernel, sf,
+                                   pre_buf, dst_buf, &mv, ref_frame_buf,
+                                   is_scaled, ref);
+      } else {
+        // Sub-8x8: each sub-block gets its vector from average_split_mvs().
+        const PARTITION_TYPE bp = BLOCK_8X8 - sb_type;
+        const int have_vsplit = bp != PARTITION_HORZ;
+        const int have_hsplit = bp != PARTITION_VERT;
+        const int cols = 2 >> ((!have_vsplit) | pd->subsampling_x);
+        const int rows = 2 >> ((!have_hsplit) | pd->subsampling_y);
+        const int pw = 8 >> (have_vsplit | pd->subsampling_x);
+        const int ph = 8 >> (have_hsplit | pd->subsampling_y);
+        int bx, by;
+        for (by = 0; by < rows; ++by) {
+          for (bx = 0; bx < cols; ++bx) {
+            const MV mv = average_split_mvs(pd, mi, ref, by * 2 + bx);
+            dec_build_inter_predictors(pbi, xd, plane, plane_w, plane_h,
+                                       4 * bx, 4 * by, pw, ph, mi_x, mi_y,
+                                       kernel, sf, pre_buf, dst_buf, &mv,
+                                       ref_frame_buf, is_scaled, ref);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Chroma transform size: mbmi->tx_size capped by min(n4_wl, n4_hl).
+static INLINE TX_SIZE dec_get_uv_tx_size(const MB_MODE_INFO *mbmi,
+                                         int n4_wl, int n4_hl) {
+  const int cap = VPXMIN(n4_wl, n4_hl);  // smaller log2 plane dimension
+  return VPXMIN(mbmi->tx_size, cap);
+}
+
+// Zeroes n4_w above-context and n4_h left-context entries in every plane.
+static INLINE void dec_reset_skip_context(MACROBLOCKD *xd) {
+  struct macroblockd_plane *pd = &xd->plane[0];
+  for (; pd < &xd->plane[MAX_MB_PLANE]; ++pd) {
+    memset(pd->above_context, 0, pd->n4_w * sizeof(ENTROPY_CONTEXT));
+    memset(pd->left_context, 0, pd->n4_h * sizeof(ENTROPY_CONTEXT));
+  }
+}
+
+static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl,
+                         int bhl) {  // fills n4_w/n4_h/n4_wl/n4_hl per plane
+  struct macroblockd_plane *pd = &xd->plane[0];
+  for (; pd < &xd->plane[MAX_MB_PLANE]; ++pd) {
+    pd->n4_w = (bw << 1) >> pd->subsampling_x;  // bw, bh are doubled first
+    pd->n4_h = (bh << 1) >> pd->subsampling_y;
+    pd->n4_wl = bwl - pd->subsampling_x;  // log2 sizes just lose the shift
+    pd->n4_hl = bhl - pd->subsampling_y;
+  }
+}
+
+// Points xd at the mode info of the block at (mi_row, mi_col), then sets up
+// plane sizes, contexts, edge distances and dst planes. Returns its mbmi.
+static MB_MODE_INFO *set_offsets(VP10_COMMON *const cm, MACROBLOCKD *const xd,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                 int bw, int bh, int x_mis, int y_mis,
+                                 int bwl, int bhl) {
+  const int stride = cm->mi_stride;
+  const int offset = mi_row * stride + mi_col;
+  int r, c;
+
+  xd->mi = cm->mi_grid_visible + offset;
+  xd->mi[0] = &cm->mi[offset];
+  // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
+  // passing bsize from decode_partition().
+  xd->mi[0]->mbmi.sb_type = bsize;
+  // Alias the other x_mis * y_mis grid cells to entry 0 (already set above).
+  for (r = 0; r < y_mis; ++r)
+    for (c = !r; c < x_mis; ++c) xd->mi[r * stride + c] = xd->mi[0];
+
+  set_plane_n4(xd, bw, bh, bwl, bhl);
+  set_skip_context(xd, mi_row, mi_col);
+
+  // Distance of Mb to the various image edges. These are specified to 8th pel
+  // as they are always compared to values that are in 1/8th pel units
+  set_mi_row_col(xd, &xd->tile, mi_row, bh, mi_col, bw, cm->mi_rows,
+                 cm->mi_cols);
+  vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+  return &xd->mi[0]->mbmi;
+}
+// Decodes one block: mode info, then prediction and residual per plane.
+static void decode_block(VP10Decoder *const pbi, MACROBLOCKD *const xd,
+                         int mi_row, int mi_col,
+                         vpx_reader *r, BLOCK_SIZE bsize,
+                         int bwl, int bhl) {
+  VP10_COMMON *const cm = &pbi->common;
+  const int less8x8 = bsize < BLOCK_8X8;
+  const int bw = 1 << (bwl - 1);  // block width in mi units
+  const int bh = 1 << (bhl - 1);  // block height in mi units
+  const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);  // clipped to the frame
+  const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
+
+  MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
+                                   bw, bh, x_mis, y_mis, bwl, bhl);
+  // With chroma subsampling, bsize must map to a valid chroma block size.
+  if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
+    const BLOCK_SIZE uv_subsize =
+        ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y];
+    if (uv_subsize == BLOCK_INVALID)
+      vpx_internal_error(xd->error_info,
+                         VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
+  }
+
+  vp10_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+
+  if (mbmi->skip) {
+    dec_reset_skip_context(xd);
+  }
+
+  if (!is_inter_block(mbmi)) {  // intra: predict + reconstruct per tx block
+    int plane;
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const TX_SIZE tx_size =
+          plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+                  : mbmi->tx_size;
+      const int num_4x4_w = pd->n4_w;
+      const int num_4x4_h = pd->n4_h;
+      const int step = (1 << tx_size);  // row/col advance per transform block
+      int row, col;
+      const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
+          0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+      const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
+          0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+      // max_blocks_* drop the 4x4 columns/rows past the right/bottom edge.
+      for (row = 0; row < max_blocks_high; row += step)
+        for (col = 0; col < max_blocks_wide; col += step)
+          predict_and_reconstruct_intra_block(xd, r, mbmi, plane,
+                                              row, col, tx_size);
+    }
+  } else {
+    // Prediction
+    dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
+
+    // Reconstruction
+    if (!mbmi->skip) {
+      int eobtotal = 0;  // sum of the eobs of every transform block
+      int plane;
+
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        const struct macroblockd_plane *const pd = &xd->plane[plane];
+        const TX_SIZE tx_size =
+            plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+                    : mbmi->tx_size;
+        const int num_4x4_w = pd->n4_w;
+        const int num_4x4_h = pd->n4_h;
+        const int step = (1 << tx_size);
+        int row, col;
+        const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
+            0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+        const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
+            0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+        for (row = 0; row < max_blocks_high; row += step)
+          for (col = 0; col < max_blocks_wide; col += step)
+            eobtotal += reconstruct_inter_block(xd, r, mbmi, plane, row, col,
+                                                tx_size);
+      }
+
+      if (!less8x8 && eobtotal == 0)  // no coefficients in any plane
+#if CONFIG_MISC_FIXES
+        mbmi->has_no_coeffs = 1;  // skip loopfilter
+#else
+        mbmi->skip = 1;  // skip loopfilter
+#endif
+    }
+  }
+
+  xd->corrupted |= vpx_reader_has_error(r);
+}
+
+// Partition probability context: bit bsl of the above and of the left
+// partition context, combined and offset by bsl * PARTITION_PLOFFSET.
+static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd,
+                                              int mi_row, int mi_col,
+                                              int bsl) {
+  const int above_bits = xd->above_seg_context[mi_col];
+  const int left_bits = xd->left_seg_context[mi_row & MI_MASK];
+  const int above = (above_bits >> bsl) & 1;
+  const int left = (left_bits >> bsl) & 1;
+  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
+// Writes the partition context of subsize over bw bytes of the above
+// context and bw bytes of the left context, starting at this block.
+static INLINE void dec_update_partition_context(MACROBLOCKD *xd,
+                                                int mi_row, int mi_col,
+                                                BLOCK_SIZE subsize,
+                                                int bw) {
+  // The stored values set the partition bits of block sizes larger than
+  // the current one to one, and those of smaller block sizes to zero.
+  memset(&xd->above_seg_context[mi_col],
+         partition_context_lookup[subsize].above, bw);
+  memset(&xd->left_seg_context[mi_row & MI_MASK],
+         partition_context_lookup[subsize].left, bw);
+}
+
+// Reads the partition type of a block. With both has_rows and has_cols set
+// the full tree is read; with only one set a single bit chooses between
+// SPLIT and HORZ/VERT; with neither, SPLIT is implied and nothing is read.
+static PARTITION_TYPE read_partition(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                     int mi_row, int mi_col, vpx_reader *r,
+                                     int has_rows, int has_cols, int bsl) {
+  const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl);
+  const vpx_prob *const probs = cm->fc->partition_prob[ctx];
+  PARTITION_TYPE type = PARTITION_SPLIT;
+
+  if (has_cols) {
+    if (has_rows)
+      type = (PARTITION_TYPE)vpx_read_tree(r, vp10_partition_tree, probs);
+    else if (!vpx_read(r, probs[1]))
+      type = PARTITION_HORZ;
+  } else if (has_rows && !vpx_read(r, probs[2])) {
+    type = PARTITION_VERT;
+  }
+
+  if (xd->counts) ++xd->counts->partition[ctx][type];
+  return type;
+}
+// Reads the partition type at (mi_row, mi_col) and decodes its sub-blocks.
+// TODO(slavarnway): eliminate bsize and subsize in future commits
+static void decode_partition(VP10Decoder *const pbi, MACROBLOCKD *const xd,
+                             int mi_row, int mi_col,
+                             vpx_reader* r, BLOCK_SIZE bsize, int n4x4_l2) {
+  VP10_COMMON *const cm = &pbi->common;
+  const int n8x8_l2 = n4x4_l2 - 1;
+  const int num_8x8_wh = 1 << n8x8_l2;  // block size in mi rows/cols
+  const int hbs = num_8x8_wh >> 1;  // half of that; 0 at the last level
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+  const int has_rows = (mi_row + hbs) < cm->mi_rows;  // lower half in frame
+  const int has_cols = (mi_col + hbs) < cm->mi_cols;  // right half in frame
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;  // block starts outside the frame: nothing is read
+
+  partition = read_partition(cm, xd, mi_row, mi_col, r, has_rows, has_cols,
+                             n8x8_l2);
+  subsize = subsize_lookup[partition][bsize];  // get_subsize(bsize, partition);
+  if (!hbs) {  // no further recursion: a single decode_block() call
+    // calculate bmode block dimensions (log 2)
+    xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
+    xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
+    decode_block(pbi, xd, mi_row, mi_col, r, subsize, 1, 1);
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2);
+        break;
+      case PARTITION_HORZ:  // second half only if it starts inside the frame
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2);
+        if (has_rows)
+          decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize, n4x4_l2,
+                       n8x8_l2);
+        break;
+      case PARTITION_VERT:  // second half only if it starts inside the frame
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2);
+        if (has_cols)
+          decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2,
+                       n4x4_l2);
+        break;
+      case PARTITION_SPLIT:  // recurse into the four quadrants
+        decode_partition(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize,
+                         n8x8_l2);
+        break;
+      default:
+        assert(0 && "Invalid partition type");
+    }
+  }
+
+  // update partition context
+  if (bsize >= BLOCK_8X8 &&
+      (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
+    dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
+}
+
+// Checks that read_size bytes can be read at data and initialises the bool
+// decoder r over them; both failures are reported via vpx_internal_error().
+static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end,
+                                size_t read_size,
+                                struct vpx_internal_error_info *error_info,
+                                vpx_reader *r, vpx_decrypt_cb decrypt_cb,
+                                void *decrypt_state) {
+  // A partition that cannot be fully read is reported as a corrupt frame.
+  if (!read_is_valid(data, read_size, data_end)) {
+    vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt tile length");
+  }
+  // A non-zero result from vpx_reader_init() is treated as a failure.
+  if (vpx_reader_init(r, data, read_size, decrypt_cb, decrypt_state)) {
+    vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate bool decoder %d", 1);
+  }
+}
+
+// Reads a flag bit; when set, diff-updates every coefficient probability.
+static void read_coef_probs_common(vp10_coeff_probs_model *coef_probs,
+                                   vpx_reader *r) {
+  int plane, ref, band, ctx, node;
+  if (!vpx_read_bit(r)) return;
+  for (plane = 0; plane < PLANE_TYPES; ++plane)
+    for (ref = 0; ref < REF_TYPES; ++ref)
+      for (band = 0; band < COEF_BANDS; ++band)
+        for (ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx)
+          for (node = 0; node < UNCONSTRAINED_NODES; ++node)
+            vp10_diff_update_prob(r, &coef_probs[plane][ref][band][ctx][node]);
+}
+
+static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
+                            vpx_reader *r) {
+  // One update pass per transform size, TX_4X4 up to tx_mode's largest.
+  const TX_SIZE last = tx_mode_to_biggest_tx_size[tx_mode];
+  TX_SIZE size = TX_4X4;
+  for (; size <= last; ++size) read_coef_probs_common(fc->coef_probs[size], r);
+}
+// Parses the segmentation syntax from rb into cm->seg (and cm->segp).
+static void setup_segmentation(VP10_COMMON *const cm,
+                               struct vpx_read_bit_buffer *rb) {
+  struct segmentation *const seg = &cm->seg;
+#if !CONFIG_MISC_FIXES
+  struct segmentation_probs *const segp = &cm->segp;
+#endif
+  int i, j;
+
+  seg->update_map = 0;
+  seg->update_data = 0;
+
+  seg->enabled = vpx_rb_read_bit(rb);
+  if (!seg->enabled)
+    return;  // nothing else is read when segmentation is off
+
+  // Segmentation map update
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+    seg->update_map = 1;  // forced on, no bit is read
+  } else {
+    seg->update_map = vpx_rb_read_bit(rb);
+  }
+  if (seg->update_map) {
+#if !CONFIG_MISC_FIXES
+    for (i = 0; i < SEG_TREE_PROBS; i++)
+      segp->tree_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
+                                                : MAX_PROB;
+#endif
+    if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+      seg->temporal_update = 0;  // forced off, no bit is read
+    } else {
+      seg->temporal_update = vpx_rb_read_bit(rb);
+    }
+#if !CONFIG_MISC_FIXES
+    if (seg->temporal_update) {
+      for (i = 0; i < PREDICTION_PROBS; i++)
+        segp->pred_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
+                                                  : MAX_PROB;
+    } else {
+      for (i = 0; i < PREDICTION_PROBS; i++)
+        segp->pred_probs[i] = MAX_PROB;
+    }
+#endif
+  }
+
+  // Segmentation data update
+  seg->update_data = vpx_rb_read_bit(rb);
+  if (seg->update_data) {
+    seg->abs_delta = vpx_rb_read_bit(rb);
+
+    vp10_clearall_segfeatures(seg);
+
+    for (i = 0; i < MAX_SEGMENTS; i++) {
+      for (j = 0; j < SEG_LVL_MAX; j++) {
+        int data = 0;
+        const int feature_enabled = vpx_rb_read_bit(rb);
+        if (feature_enabled) {
+          vp10_enable_segfeature(seg, i, j);
+          data = decode_unsigned_max(rb, vp10_seg_feature_data_max(j));
+          if (vp10_is_segfeature_signed(j))
+            data = vpx_rb_read_bit(rb) ? -data : data;  // sign bit follows
+        }
+        vp10_set_segdata(seg, i, j, data);  // 0 when the feature is off
+      }
+    }
+  }
+}
+
+// Reads the loop filter level, sharpness and (optionally) the per-reference
+// and per-mode delta updates from 'rb' into 'lf'.
+static void setup_loopfilter(struct loopfilter *lf,
+                             struct vpx_read_bit_buffer *rb) {
+  int idx;
+
+  lf->filter_level = vpx_rb_read_literal(rb, 6);
+  lf->sharpness_level = vpx_rb_read_literal(rb, 3);
+
+  // Deltas applied at the MB level based on mode or ref frame. The update
+  // flag stays 0 unless deltas are enabled and the bitstream says otherwise.
+  lf->mode_ref_delta_update = 0;
+
+  lf->mode_ref_delta_enabled = vpx_rb_read_bit(rb);
+  if (!lf->mode_ref_delta_enabled)
+    return;
+
+  lf->mode_ref_delta_update = vpx_rb_read_bit(rb);
+  if (!lf->mode_ref_delta_update)
+    return;
+
+  // Each delta is preceded by a one-bit "present" flag; absent entries keep
+  // their previous value.
+  for (idx = 0; idx < MAX_REF_FRAMES; idx++) {
+    if (vpx_rb_read_bit(rb))
+      lf->ref_deltas[idx] = vpx_rb_read_inv_signed_literal(rb, 6);
+  }
+
+  for (idx = 0; idx < MAX_MODE_LF_DELTAS; idx++) {
+    if (vpx_rb_read_bit(rb))
+      lf->mode_deltas[idx] = vpx_rb_read_inv_signed_literal(rb, 6);
+  }
+}
+
+// Reads an optional quantizer delta: one "present" bit, followed (if set) by
+// a signed literal of 6 bits with CONFIG_MISC_FIXES or 4 bits without.
+// Returns 0 when the delta is absent.
+static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {
+  const int delta_bits = CONFIG_MISC_FIXES ? 6 : 4;
+
+  if (!vpx_rb_read_bit(rb))
+    return 0;
+  return vpx_rb_read_inv_signed_literal(rb, delta_bits);
+}
+
+// Reads the base quantizer index and the three quantizer deltas from 'rb'
+// (in that order) and records the bit depth used for dequantization.
+static void setup_quantization(VP10_COMMON *const cm,
+                               struct vpx_read_bit_buffer *rb) {
+  // The declarations run top to bottom, which fixes the bit-read order.
+  const int base_qindex = vpx_rb_read_literal(rb, QINDEX_BITS);
+  const int y_dc_delta = read_delta_q(rb);
+  const int uv_dc_delta = read_delta_q(rb);
+  const int uv_ac_delta = read_delta_q(rb);
+
+  cm->base_qindex = base_qindex;
+  cm->y_dc_delta_q = y_dc_delta;
+  cm->uv_dc_delta_q = uv_dc_delta;
+  cm->uv_ac_delta_q = uv_ac_delta;
+  cm->dequant_bit_depth = cm->bit_depth;
+}
+
+// Builds the y/uv dequantization tables. With segmentation enabled every
+// segment gets its own entry based on its segment qindex; otherwise only
+// entry 0 is filled from the base qindex (the rest are don't-cares).
+static void setup_segmentation_dequant(VP10_COMMON *const cm) {
+  const int seg_enabled = cm->seg.enabled;
+  const int num_entries = seg_enabled ? MAX_SEGMENTS : 1;
+  int seg_id;
+
+  for (seg_id = 0; seg_id < num_entries; ++seg_id) {
+    const int qindex =
+        seg_enabled ? vp10_get_qindex(&cm->seg, seg_id, cm->base_qindex)
+                    : cm->base_qindex;
+
+    // Index 0 holds the DC quantizer, index 1 the AC quantizer.
+    cm->y_dequant[seg_id][0] =
+        vp10_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth);
+    cm->y_dequant[seg_id][1] = vp10_ac_quant(qindex, 0, cm->bit_depth);
+    cm->uv_dequant[seg_id][0] =
+        vp10_dc_quant(qindex, cm->uv_dc_delta_q, cm->bit_depth);
+    cm->uv_dequant[seg_id][1] =
+        vp10_ac_quant(qindex, cm->uv_ac_delta_q, cm->bit_depth);
+  }
+}
+
+// Reads the frame-level interpolation filter: a set first bit selects
+// SWITCHABLE, otherwise the filter is the following 2-bit literal.
+static INTERP_FILTER read_interp_filter(struct vpx_read_bit_buffer *rb) {
+  if (vpx_rb_read_bit(rb))
+    return SWITCHABLE;
+  return (INTERP_FILTER)vpx_rb_read_literal(rb, 2);
+}
+
+// Sets the render size to the coded frame size, then overrides it with an
+// explicit size from 'rb' when the leading flag bit is set.
+static void setup_render_size(VP10_COMMON *cm,
+                              struct vpx_read_bit_buffer *rb) {
+  int has_explicit_size;
+
+  cm->render_width = cm->width;
+  cm->render_height = cm->height;
+
+  has_explicit_size = vpx_rb_read_bit(rb);
+  if (!has_explicit_size)
+    return;
+
+  vp10_read_frame_size(rb, &cm->render_width, &cm->render_height);
+}
+
+// Replaces the current frame's motion-vector buffer with a zeroed one sized
+// for cm->mi_rows x cm->mi_cols and records those dimensions on the frame.
+//
+// The allocation goes through CHECK_MEM_ERROR (as the other allocations in
+// this file do) so that a failed vpx_calloc() is reported instead of silently
+// leaving cur_frame->mvs NULL for later code to dereference.
+static void resize_mv_buffer(VP10_COMMON *cm) {
+  vpx_free(cm->cur_frame->mvs);
+  cm->cur_frame->mi_rows = cm->mi_rows;
+  cm->cur_frame->mi_cols = cm->mi_cols;
+  CHECK_MEM_ERROR(cm, cm->cur_frame->mvs,
+                  (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                                       sizeof(*cm->cur_frame->mvs)));
+}
+
+// Brings the decoder's context buffers and the current frame's MV buffer in
+// line with a frame of 'width' x 'height'. Context buffers are reallocated
+// only when the new size needs more mode-info rows or columns than are
+// currently available; otherwise just the mode-info dimensions are updated.
+static void resize_context_buffers(VP10_COMMON *cm, int width, int height) {
+#if CONFIG_SIZE_LIMIT
+  if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Dimensions of %dx%d beyond allowed size of %dx%d.",
+                       width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
+#endif
+  if (cm->width != width || cm->height != height) {
+    const int rows_needed =
+        ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+    const int cols_needed =
+        ALIGN_POWER_OF_TWO(width,  MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+    // Allocations in vp10_alloc_context_buffers() depend on individual
+    // dimensions as well as the overall size, so both must fit.
+    const int fits_existing =
+        cols_needed <= cm->mi_cols && rows_needed <= cm->mi_rows;
+
+    if (fits_existing) {
+      vp10_set_mb_mi(cm, width, height);
+    } else if (vp10_alloc_context_buffers(cm, width, height)) {
+      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                         "Failed to allocate context buffers");
+    }
+    vp10_init_context_buffers(cm);
+    cm->width = width;
+    cm->height = height;
+  }
+
+  // Keep the existing MV buffer when it is present and already big enough.
+  if (cm->cur_frame->mvs != NULL && cm->mi_rows <= cm->cur_frame->mi_rows &&
+      cm->mi_cols <= cm->cur_frame->mi_cols) {
+    return;
+  }
+  resize_mv_buffer(cm);
+}
+
+// Reads an explicit frame size (and render size) from 'rb', resizes the
+// context buffers to match, (re)allocates the new frame buffer under the
+// buffer-pool lock, and stamps the frame with the current format settings.
+static void setup_frame_size(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *new_frame;
+  int width, height;
+  int alloc_failed;
+
+  vp10_read_frame_size(rb, &width, &height);
+  resize_context_buffers(cm, width, height);
+  setup_render_size(cm, rb);
+
+  lock_buffer_pool(pool);
+  new_frame = &pool->frame_bufs[cm->new_fb_idx];
+  alloc_failed = vpx_realloc_frame_buffer(
+      get_frame_new_buffer(cm), cm->width, cm->height,
+      cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+      cm->use_highbitdepth,
+#endif
+      VP9_DEC_BORDER_IN_PIXELS,
+      cm->byte_alignment,
+      &new_frame->raw_frame_buffer, pool->get_fb_cb,
+      pool->cb_priv);
+  if (alloc_failed) {
+    // Release the lock before reporting the error.
+    unlock_buffer_pool(pool);
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate frame buffer");
+  }
+  unlock_buffer_pool(pool);
+
+  new_frame->buf.subsampling_x = cm->subsampling_x;
+  new_frame->buf.subsampling_y = cm->subsampling_y;
+  new_frame->buf.bit_depth = (unsigned int)cm->bit_depth;
+  new_frame->buf.color_space = cm->color_space;
+  new_frame->buf.color_range = cm->color_range;
+  new_frame->buf.render_width  = cm->render_width;
+  new_frame->buf.render_height = cm->render_height;
+}
+
+// Returns non-zero when a reference frame's bit depth and chroma subsampling
+// (x and y) all match those of the frame being decoded.
+static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth,
+                                          int ref_xss, int ref_yss,
+                                          vpx_bit_depth_t this_bit_depth,
+                                          int this_xss, int this_yss) {
+  if (ref_bit_depth != this_bit_depth)
+    return 0;
+  if (ref_xss != this_xss)
+    return 0;
+  return ref_yss == this_yss;
+}
+
+// Determines the size of the frame being decoded for a frame that uses
+// references: the size is copied from the first reference whose flag bit is
+// set in 'rb', or read explicitly when no flag is set. The size is then
+// validated against the reference frames, the context buffers are resized,
+// and the new frame buffer is (re)allocated and stamped with the current
+// format settings.
+static void setup_frame_size_with_refs(VP10_COMMON *cm,
+                                       struct vpx_read_bit_buffer *rb) {
+  int width, height;
+  int found = 0, i;
+  int has_valid_ref_frame = 0;
+  BufferPool *const pool = cm->buffer_pool;
+  // One flag bit per reference: the first set bit selects the reference whose
+  // cropped luma size is reused, and no further flag bits are read.
+  // NOTE(review): frame_refs[i].buf is dereferenced without a NULL check here
+  // and in the validation loops below; key frames set it to NULL in
+  // read_uncompressed_header -- confirm the caller always repopulates
+  // frame_refs before calling this function.
+  for (i = 0; i < REFS_PER_FRAME; ++i) {
+    if (vpx_rb_read_bit(rb)) {
+      YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
+      width = buf->y_crop_width;
+      height = buf->y_crop_height;
+#if CONFIG_MISC_FIXES
+      cm->render_width = buf->render_width;
+      cm->render_height = buf->render_height;
+#endif
+      found = 1;
+      break;
+    }
+  }
+
+  // No reference selected: the size is coded explicitly.
+  if (!found) {
+    vp10_read_frame_size(rb, &width, &height);
+#if CONFIG_MISC_FIXES
+    setup_render_size(cm, rb);
+#endif
+  }
+
+  if (width <= 0 || height <= 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid frame size");
+
+  // Check to make sure at least one of frames that this frame references
+  // has valid dimensions.
+  for (i = 0; i < REFS_PER_FRAME; ++i) {
+    RefBuffer *const ref_frame = &cm->frame_refs[i];
+    has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_crop_width,
+                                                ref_frame->buf->y_crop_height,
+                                                width, height);
+  }
+  if (!has_valid_ref_frame)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Referenced frame has invalid size");
+  // Unlike the size check above, every reference must match the current
+  // bit depth and chroma subsampling.
+  for (i = 0; i < REFS_PER_FRAME; ++i) {
+    RefBuffer *const ref_frame = &cm->frame_refs[i];
+    if (!valid_ref_frame_img_fmt(
+            ref_frame->buf->bit_depth,
+            ref_frame->buf->subsampling_x,
+            ref_frame->buf->subsampling_y,
+            cm->bit_depth,
+            cm->subsampling_x,
+            cm->subsampling_y))
+      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Referenced frame has incompatible color format");
+  }
+
+  resize_context_buffers(cm, width, height);
+  // Without CONFIG_MISC_FIXES the render size is read here, after the frame
+  // size, regardless of whether a reference supplied the size.
+#if !CONFIG_MISC_FIXES
+  setup_render_size(cm, rb);
+#endif
+
+  // The frame buffer is (re)allocated while holding the buffer-pool lock; the
+  // lock is released before the allocation failure is reported.
+  lock_buffer_pool(pool);
+  if (vpx_realloc_frame_buffer(
+          get_frame_new_buffer(cm), cm->width, cm->height,
+          cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+          cm->use_highbitdepth,
+#endif
+          VP9_DEC_BORDER_IN_PIXELS,
+          cm->byte_alignment,
+          &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+          pool->cb_priv)) {
+    unlock_buffer_pool(pool);
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate frame buffer");
+  }
+  unlock_buffer_pool(pool);
+
+  // Record the current format settings on the new frame buffer.
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
+  pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
+  pool->frame_bufs[cm->new_fb_idx].buf.render_width  = cm->render_width;
+  pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
+}
+
+// Reads the tile layout from 'rb': log2 of the tile column count (a run of
+// increment bits above the minimum for this frame width), log2 of the tile
+// row count (0, 1 or 2) and, with CONFIG_MISC_FIXES, the tile size magnitude.
+static void setup_tile_info(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
+  int min_log2_tile_cols, max_log2_tile_cols;
+  int increments_left;
+
+  vp10_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+
+  // columns: start at the minimum and add one per set bit, stopping at the
+  // first clear bit or once the maximum has been reached.
+  cm->log2_tile_cols = min_log2_tile_cols;
+  increments_left = max_log2_tile_cols - min_log2_tile_cols;
+  for (;;) {
+    if (increments_left-- == 0)
+      break;
+    if (!vpx_rb_read_bit(rb))
+      break;
+    cm->log2_tile_cols++;
+  }
+
+  if (cm->log2_tile_cols > 6)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid number of tile columns");
+
+  // rows: a second bit is only present when the first one is set.
+  cm->log2_tile_rows = vpx_rb_read_bit(rb);
+  if (cm->log2_tile_rows != 0)
+    cm->log2_tile_rows = cm->log2_tile_rows + vpx_rb_read_bit(rb);
+
+#if CONFIG_MISC_FIXES
+  // tile size magnitude: only coded when the frame has more than one tile.
+  if (cm->log2_tile_rows > 0 || cm->log2_tile_cols > 0) {
+    cm->tile_sz_mag = vpx_rb_read_literal(rb, 2);
+  }
+#else
+  cm->tile_sz_mag = 3;
+#endif
+}
+
+// Location of one tile's data inside the compressed frame, as filled in by
+// get_tile_buffer() / get_tile_buffers().
+typedef struct TileBuffer {
+  // First byte of the tile's data (just past its size field, if it has one).
+  const uint8_t *data;
+  // Length of the tile's data in bytes.
+  size_t size;
+  int col;  // only used with multi-threaded decoding
+} TileBuffer;
+
+// Reads a (mag + 1)-byte value from 'data': a single byte for mag 0, or the
+// 16/24/32-bit little-endian helpers for mag 1..3. Any other 'mag' trips the
+// assertion (and yields -1 when assertions are compiled out).
+static int mem_get_varsize(const uint8_t *data, const int mag) {
+  if (mag == 0)
+    return data[0];
+  if (mag == 1)
+    return mem_get_le16(data);
+  if (mag == 2)
+    return mem_get_le24(data);
+  if (mag == 3)
+    return mem_get_le32(data);
+
+  assert("Invalid tile size marker value" && 0);
+
+  return -1;
+}
+
+// Locates the next tile in the packet: stores its start and size in 'buf' and
+// advances '*data' past it.
+//
+// Every tile except the last is preceded by a size field of
+// (tile_sz_mag + 1) bytes, which is decrypted first when 'decrypt_cb' is set.
+// The last tile has no size field and extends to 'data_end'. Truncated or
+// inconsistent sizes are reported through 'error_info'.
+static void get_tile_buffer(const uint8_t *const data_end,
+                            const int tile_sz_mag, int is_last,
+                            struct vpx_internal_error_info *error_info,
+                            const uint8_t **data,
+                            vpx_decrypt_cb decrypt_cb, void *decrypt_state,
+                            TileBuffer *buf) {
+  size_t size;
+
+  if (!is_last) {
+    // Width of the size field actually present in the bitstream. Validate
+    // exactly that many bytes: demanding a fixed 4 bytes would reject valid
+    // short packets whenever the size field is narrower than 4 bytes.
+    const int size_bytes = tile_sz_mag + 1;
+
+    if (!read_is_valid(*data, size_bytes, data_end))
+      vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+                         "Truncated packet or corrupt tile length");
+
+    if (decrypt_cb) {
+      // Decrypt only the size field into a scratch buffer; 4 bytes covers
+      // the largest supported magnitude.
+      uint8_t be_data[4];
+      decrypt_cb(decrypt_state, *data, be_data, size_bytes);
+      size = mem_get_varsize(be_data, tile_sz_mag) + CONFIG_MISC_FIXES;
+    } else {
+      size = mem_get_varsize(*data, tile_sz_mag) + CONFIG_MISC_FIXES;
+    }
+    *data += size_bytes;
+
+    if (size > (size_t)(data_end - *data))
+      vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+                         "Truncated packet or corrupt tile size");
+  } else {
+    size = data_end - *data;
+  }
+
+  buf->data = *data;
+  buf->size = size;
+
+  *data += size;
+}
+
+// Walks the packet in raster order (row by row, column by column) and fills
+// tile_buffers[row][col] with the location of each tile's data. Only the very
+// last tile of the frame is treated as "last" (i.e. has no size field).
+static void get_tile_buffers(VP10Decoder *pbi,
+                             const uint8_t *data, const uint8_t *data_end,
+                             int tile_cols, int tile_rows,
+                             TileBuffer (*tile_buffers)[1 << 6]) {
+  const int last_row = tile_rows - 1;
+  const int last_col = tile_cols - 1;
+  int row, col;
+
+  for (row = 0; row < tile_rows; ++row) {
+    for (col = 0; col < tile_cols; ++col) {
+      TileBuffer *const buf = &tile_buffers[row][col];
+      const int is_last = (row == last_row) && (col == last_col);
+
+      buf->col = col;
+      get_tile_buffer(data_end, pbi->common.tile_sz_mag,
+                      is_last, &pbi->common.error, &data,
+                      pbi->decrypt_cb, pbi->decrypt_state, buf);
+    }
+  }
+}
+
+// Decodes all tiles of the current frame on the calling thread. When loop
+// filtering is enabled it is run through pbi->lf_worker, trailing the decode
+// by one superblock row. Returns the end position reported by the last tile's
+// bit reader.
+static const uint8_t *decode_tiles(VP10Decoder *pbi,
+                                   const uint8_t *data,
+                                   const uint8_t *data_end) {
+  VP10_COMMON *const cm = &pbi->common;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  // Sized for the maximum layout asserted below: 4 rows x 64 columns.
+  TileBuffer tile_buffers[4][1 << 6];
+  int tile_row, tile_col;
+  int mi_row, mi_col;
+  TileData *tile_data = NULL;
+
+  // Allocate the loop filter worker's data the first time filtering is
+  // needed; a worker thread is only set up when max_threads > 1.
+  if (cm->lf.filter_level && !cm->skip_loop_filter &&
+      pbi->lf_worker.data1 == NULL) {
+    CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
+                    vpx_memalign(32, sizeof(LFWorkerData)));
+    pbi->lf_worker.hook = (VPxWorkerHook)vp10_loop_filter_worker;
+    if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
+      vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                         "Loop filter thread creation failed");
+    }
+  }
+
+  if (cm->lf.filter_level && !cm->skip_loop_filter) {
+    LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+    // Be sure to sync as we might be resuming after a failed frame decode.
+    winterface->sync(&pbi->lf_worker);
+    vp10_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm,
+                               pbi->mb.plane);
+  }
+
+  assert(tile_rows <= 4);
+  assert(tile_cols <= (1 << 6));
+
+  // Note: this memset assumes above_context[0], [1] and [2]
+  // are allocated as part of the same buffer.
+  memset(cm->above_context, 0,
+         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
+
+  memset(cm->above_seg_context, 0,
+         sizeof(*cm->above_seg_context) * aligned_cols);
+
+  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
+
+  // (Re)allocate the per-tile state whenever the tile count changes.
+  if (pbi->tile_data == NULL ||
+      (tile_cols * tile_rows) != pbi->total_tiles) {
+    vpx_free(pbi->tile_data);
+    CHECK_MEM_ERROR(
+        cm,
+        pbi->tile_data,
+        vpx_memalign(32, tile_cols * tile_rows * (sizeof(*pbi->tile_data))));
+    pbi->total_tiles = tile_rows * tile_cols;
+  }
+
+  // Load all tile information into tile_data.
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      const TileBuffer *const buf = &tile_buffers[tile_row][tile_col];
+      tile_data = pbi->tile_data + tile_cols * tile_row + tile_col;
+      tile_data->cm = cm;
+      // Each tile starts from a copy of the decoder's macroblock state.
+      tile_data->xd = pbi->mb;
+      tile_data->xd.corrupted = 0;
+      // Symbol counts are only gathered for backward context refresh.
+      tile_data->xd.counts =
+          cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD ?
+              &cm->counts : NULL;
+      vp10_zero(tile_data->dqcoeff);
+      vp10_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col);
+      setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
+                          &tile_data->bit_reader, pbi->decrypt_cb,
+                          pbi->decrypt_state);
+      vp10_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
+      tile_data->xd.plane[0].color_index_map = tile_data->color_index_map[0];
+      tile_data->xd.plane[1].color_index_map = tile_data->color_index_map[1];
+    }
+  }
+
+  // Decode one superblock row at a time across all tile columns so that the
+  // loop filter can follow one row behind.
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    TileInfo tile;
+    vp10_tile_set_row(&tile, cm, tile_row);
+    for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
+         mi_row += MI_BLOCK_SIZE) {
+      for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+        // inv_tile_order walks the tile columns right-to-left.
+        const int col = pbi->inv_tile_order ?
+                        tile_cols - tile_col - 1 : tile_col;
+        tile_data = pbi->tile_data + tile_cols * tile_row + col;
+        vp10_tile_set_col(&tile, tile_data->cm, col);
+        vp10_zero(tile_data->xd.left_context);
+        vp10_zero(tile_data->xd.left_seg_context);
+        for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
+             mi_col += MI_BLOCK_SIZE) {
+          decode_partition(pbi, &tile_data->xd, mi_row,
+                           mi_col, &tile_data->bit_reader, BLOCK_64X64, 4);
+        }
+        pbi->mb.corrupted |= tile_data->xd.corrupted;
+        if (pbi->mb.corrupted)
+            vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                               "Failed to decode tile data");
+      }
+      // Loopfilter one row.
+      // NOTE(review): the two 'continue's below also skip the
+      // vp10_frameworker_broadcast() at the bottom of this loop body for the
+      // affected rows -- confirm that is intended.
+      if (cm->lf.filter_level && !cm->skip_loop_filter) {
+        const int lf_start = mi_row - MI_BLOCK_SIZE;
+        LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+
+        // delay the loopfilter by 1 macroblock row.
+        if (lf_start < 0) continue;
+
+        // decoding has completed: finish up the loop filter in this thread.
+        if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue;
+
+        winterface->sync(&pbi->lf_worker);
+        lf_data->start = lf_start;
+        lf_data->stop = mi_row;
+        if (pbi->max_threads > 1) {
+          winterface->launch(&pbi->lf_worker);
+        } else {
+          winterface->execute(&pbi->lf_worker);
+        }
+      }
+      // After loopfiltering, the last 7 row pixels in each superblock row may
+      // still be changed by the longest loopfilter of the next superblock
+      // row.
+      if (cm->frame_parallel_decode)
+        vp10_frameworker_broadcast(pbi->cur_buf,
+                                  mi_row << MI_BLOCK_SIZE_LOG2);
+    }
+  }
+
+  // Loopfilter remaining rows in the frame.
+  if (cm->lf.filter_level && !cm->skip_loop_filter) {
+    LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+    winterface->sync(&pbi->lf_worker);
+    // Resume where the last scheduled pass stopped and run to the frame end.
+    lf_data->start = lf_data->stop;
+    lf_data->stop = cm->mi_rows;
+    winterface->execute(&pbi->lf_worker);
+  }
+
+  // Get last tile data.
+  tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
+
+  if (cm->frame_parallel_decode)
+    vp10_frameworker_broadcast(pbi->cur_buf, INT_MAX);
+  return vpx_reader_find_end(&tile_data->bit_reader);
+}
+
+// Worker hook that decodes every superblock of one tile ('tile') using the
+// per-worker state in 'tile_data'. Returns non-zero on success. Returns 0
+// when the tile ended up marked corrupted, including when an error jumped
+// back to the setjmp() below.
+static int tile_worker_hook(TileWorkerData *const tile_data,
+                            const TileInfo *const tile) {
+  int mi_row, mi_col;
+
+  // Error landing point: control re-enters here with a non-zero value when
+  // something jumps to tile_data->error_info.jmp.
+  if (setjmp(tile_data->error_info.jmp)) {
+    tile_data->error_info.setjmp = 0;
+    tile_data->xd.corrupted = 1;
+    return 0;
+  }
+
+  // Mark the jump buffer as armed and route this tile's errors to its own
+  // error_info.
+  tile_data->error_info.setjmp = 1;
+  tile_data->xd.error_info = &tile_data->error_info;
+
+  for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+       mi_row += MI_BLOCK_SIZE) {
+    // The left contexts are cleared at the start of every superblock row.
+    vp10_zero(tile_data->xd.left_context);
+    vp10_zero(tile_data->xd.left_seg_context);
+    for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+         mi_col += MI_BLOCK_SIZE) {
+      decode_partition(tile_data->pbi, &tile_data->xd,
+                       mi_row, mi_col, &tile_data->bit_reader,
+                       BLOCK_64X64, 4);
+    }
+  }
+  return !tile_data->xd.corrupted;
+}
+
+// qsort() comparator that orders TileBuffers by size, largest first.
+//
+// Returns a negative value when 'a' is larger than 'b', a positive value when
+// it is smaller, and 0 when the sizes are equal. The sizes are compared
+// directly rather than subtracted: a size_t difference truncated to int can
+// come out with the wrong sign (or as 0 for unequal sizes) when the sizes
+// differ by more than INT_MAX, which would break the ordering qsort() needs.
+static int compare_tile_buffers(const void *a, const void *b) {
+  const TileBuffer *const buf1 = (const TileBuffer*)a;
+  const TileBuffer *const buf2 = (const TileBuffer*)b;
+  return (buf1->size < buf2->size) - (buf1->size > buf2->size);
+}
+
+// Decodes the tile columns of a single-tile-row frame using a pool of tile
+// workers. Tiles are handed out in groups of up to num_workers; the last
+// tile of each group runs on the calling thread. Returns the end position
+// reported by the bit reader of the tile in the final column (NULL if that
+// tile was never assigned).
+static const uint8_t *decode_tiles_mt(VP10Decoder *pbi,
+                                      const uint8_t *data,
+                                      const uint8_t *data_end) {
+  VP10_COMMON *const cm = &pbi->common;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  const uint8_t *bit_reader_end = NULL;
+  const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  // max_threads rounded down to an even number, capped at the tile count.
+  // NOTE(review): if this is 0 (max_threads < 2) the dispatch loop below
+  // never advances 'n' -- presumably callers only take this path with
+  // max_threads > 1; confirm.
+  const int num_workers = VPXMIN(pbi->max_threads & ~1, tile_cols);
+  TileBuffer tile_buffers[1][1 << 6];
+  int n;
+  int final_worker = -1;
+
+  assert(tile_cols <= (1 << 6));
+  assert(tile_rows == 1);
+  (void)tile_rows;
+
+  // TODO(jzern): See if we can remove the restriction of passing in max
+  // threads to the decoder.
+  // One-time creation of the worker pool and its per-worker data.
+  if (pbi->num_tile_workers == 0) {
+    const int num_threads = pbi->max_threads & ~1;
+    int i;
+    CHECK_MEM_ERROR(cm, pbi->tile_workers,
+                    vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
+    // Ensure tile data offsets will be properly aligned. This may fail on
+    // platforms without DECLARE_ALIGNED().
+    assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
+    CHECK_MEM_ERROR(cm, pbi->tile_worker_data,
+                    vpx_memalign(32, num_threads *
+                                 sizeof(*pbi->tile_worker_data)));
+    CHECK_MEM_ERROR(cm, pbi->tile_worker_info,
+                    vpx_malloc(num_threads * sizeof(*pbi->tile_worker_info)));
+    for (i = 0; i < num_threads; ++i) {
+      VPxWorker *const worker = &pbi->tile_workers[i];
+      ++pbi->num_tile_workers;
+
+      winterface->init(worker);
+      // reset() is skipped for the last worker slot.
+      if (i < num_threads - 1 && !winterface->reset(worker)) {
+        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                           "Tile decoder thread creation failed");
+      }
+    }
+  }
+
+  // Reset tile decoding hook
+  for (n = 0; n < num_workers; ++n) {
+    VPxWorker *const worker = &pbi->tile_workers[n];
+    winterface->sync(worker);
+    worker->hook = (VPxWorkerHook)tile_worker_hook;
+    worker->data1 = &pbi->tile_worker_data[n];
+    worker->data2 = &pbi->tile_worker_info[n];
+  }
+
+  // Note: this memset assumes above_context[0], [1] and [2]
+  // are allocated as part of the same buffer.
+  memset(cm->above_context, 0,
+         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
+  memset(cm->above_seg_context, 0,
+         sizeof(*cm->above_seg_context) * aligned_mi_cols);
+
+  // Load tile data into tile_buffers
+  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
+
+  // Sort the buffers based on size in descending order.
+  qsort(tile_buffers[0], tile_cols, sizeof(tile_buffers[0][0]),
+        compare_tile_buffers);
+
+  // Rearrange the tile buffers such that per-tile group the largest, and
+  // presumably the most difficult, tile will be decoded in the main thread.
+  // This should help minimize the number of instances where the main thread is
+  // waiting for a worker to complete.
+  {
+    int group_start = 0;
+    while (group_start < tile_cols) {
+      const TileBuffer largest = tile_buffers[0][group_start];
+      const int group_end = VPXMIN(group_start + num_workers, tile_cols) - 1;
+      // Shift the rest of the group down one slot and put the largest last.
+      memmove(tile_buffers[0] + group_start, tile_buffers[0] + group_start + 1,
+              (group_end - group_start) * sizeof(tile_buffers[0][0]));
+      tile_buffers[0][group_end] = largest;
+      group_start = group_end + 1;
+    }
+  }
+
+  // Initialize thread frame counts.
+  if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+    int i;
+
+    for (i = 0; i < num_workers; ++i) {
+      TileWorkerData *const tile_data =
+          (TileWorkerData*)pbi->tile_workers[i].data1;
+      vp10_zero(tile_data->counts);
+    }
+  }
+
+  // Dispatch tiles in groups of up to num_workers until all are decoded.
+  n = 0;
+  while (n < tile_cols) {
+    int i;
+    for (i = 0; i < num_workers && n < tile_cols; ++i) {
+      VPxWorker *const worker = &pbi->tile_workers[i];
+      TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
+      TileInfo *const tile = (TileInfo*)worker->data2;
+      TileBuffer *const buf = &tile_buffers[0][n];
+
+      tile_data->pbi = pbi;
+      // Each worker starts from a copy of the decoder's macroblock state.
+      tile_data->xd = pbi->mb;
+      tile_data->xd.corrupted = 0;
+      // Counts go to the worker's private copy and are merged afterwards.
+      tile_data->xd.counts =
+          cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD ?
+              &tile_data->counts : NULL;
+      vp10_zero(tile_data->dqcoeff);
+      // buf->col is the tile's original column; sorting changed its position
+      // in tile_buffers.
+      vp10_tile_init(tile, cm, 0, buf->col);
+      vp10_tile_init(&tile_data->xd.tile, cm, 0, buf->col);
+      setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
+                          &tile_data->bit_reader, pbi->decrypt_cb,
+                          pbi->decrypt_state);
+      vp10_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
+      tile_data->xd.plane[0].color_index_map = tile_data->color_index_map[0];
+      tile_data->xd.plane[1].color_index_map = tile_data->color_index_map[1];
+
+      worker->had_error = 0;
+      // The last tile of the group is run with execute(); the others are
+      // handed off with launch().
+      if (i == num_workers - 1 || n == tile_cols - 1) {
+        winterface->execute(worker);
+      } else {
+        winterface->launch(worker);
+      }
+
+      // Remember which worker got the final tile column; its bit reader
+      // supplies the return value.
+      if (buf->col == tile_cols - 1) {
+        final_worker = i;
+      }
+
+      ++n;
+    }
+
+    // Wait for every worker used in this group.
+    for (; i > 0; --i) {
+      VPxWorker *const worker = &pbi->tile_workers[i - 1];
+      // TODO(jzern): The tile may have specific error data associated with
+      // its vpx_internal_error_info which could be propagated to the main info
+      // in cm. Additionally once the threads have been synced and an error is
+      // detected, there's no point in continuing to decode tiles.
+      pbi->mb.corrupted |= !winterface->sync(worker);
+    }
+    if (final_worker > -1) {
+      TileWorkerData *const tile_data =
+          (TileWorkerData*)pbi->tile_workers[final_worker].data1;
+      bit_reader_end = vpx_reader_find_end(&tile_data->bit_reader);
+      final_worker = -1;
+    }
+
+    // Accumulate thread frame counts.
+    if (n >= tile_cols &&
+        cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+      for (i = 0; i < num_workers; ++i) {
+        TileWorkerData *const tile_data =
+            (TileWorkerData*)pbi->tile_workers[i].data1;
+        vp10_accumulate_frame_counts(cm, &tile_data->counts, 1);
+      }
+    }
+  }
+
+  return bit_reader_end;
+}
+
+// Callback that reports a truncated packet on the VP10_COMMON passed in
+// through the opaque 'data' pointer.
+static void error_handler(void *data) {
+  vpx_internal_error(&((VP10_COMMON *)data)->error, VPX_CODEC_CORRUPT_FRAME,
+                     "Truncated packet");
+}
+
+// Reads bit depth, color space, color range and chroma subsampling from 'rb'
+// into 'cm', according to cm->profile. Combinations the profile does not
+// allow, and set reserved bits, are reported as unsupported bitstreams.
+static void read_bitdepth_colorspace_sampling(
+    VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
+  const int is_high_profile = cm->profile >= PROFILE_2;
+  // Profiles 1 and 3 code (or imply) non-4:2:0 subsampling.
+  const int codes_subsampling =
+      cm->profile == PROFILE_1 || cm->profile == PROFILE_3;
+
+  // Profiles below 2 are always 8-bit; higher profiles spend one bit on
+  // choosing between 10 and 12 bits.
+  if (is_high_profile)
+    cm->bit_depth = vpx_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;
+  else
+    cm->bit_depth = VPX_BITS_8;
+#if CONFIG_VP9_HIGHBITDEPTH
+  cm->use_highbitdepth = is_high_profile;
+#endif
+
+  cm->color_space = vpx_rb_read_literal(rb, 3);
+  if (cm->color_space == VPX_CS_SRGB) {
+    if (!codes_subsampling) {
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "4:4:4 color not supported in profile 0 or 2");
+    } else {
+      // Note if colorspace is SRGB then 4:4:4 chroma sampling is assumed.
+      // 4:2:2 or 4:4:0 chroma sampling is not allowed.
+      cm->subsampling_y = cm->subsampling_x = 0;
+      if (vpx_rb_read_bit(rb))
+        vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Reserved bit set");
+    }
+  } else {
+    // [16,235] (including xvycc) vs [0,255] range
+    cm->color_range = vpx_rb_read_bit(rb);
+    if (!codes_subsampling) {
+      cm->subsampling_y = cm->subsampling_x = 1;
+    } else {
+      cm->subsampling_x = vpx_rb_read_bit(rb);
+      cm->subsampling_y = vpx_rb_read_bit(rb);
+      if (cm->subsampling_x == 1 && cm->subsampling_y == 1)
+        vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                           "4:2:0 color not supported in profile 1 or 3");
+      if (vpx_rb_read_bit(rb))
+        vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Reserved bit set");
+    }
+  }
+}
+
+static size_t read_uncompressed_header(VP10Decoder *pbi,
+                                       struct vpx_read_bit_buffer *rb) {
+  VP10_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = pool->frame_bufs;
+  int i, mask, ref_index = 0;
+  size_t sz;
+
+  cm->last_frame_type = cm->frame_type;
+  cm->last_intra_only = cm->intra_only;
+
+  if (vpx_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Invalid frame marker");
+
+  cm->profile = vp10_read_profile(rb);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->profile >= MAX_PROFILES)
+    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                       "Unsupported bitstream profile");
+#else
+  if (cm->profile >= PROFILE_2)
+    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                       "Unsupported bitstream profile");
+#endif
+
+  cm->show_existing_frame = vpx_rb_read_bit(rb);
+  if (cm->show_existing_frame) {
+    // Show an existing frame directly.
+    const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)];
+    lock_buffer_pool(pool);
+    if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+      unlock_buffer_pool(pool);
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Buffer %d does not contain a decoded frame",
+                         frame_to_show);
+    }
+
+    ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+    unlock_buffer_pool(pool);
+    pbi->refresh_frame_flags = 0;
+    cm->lf.filter_level = 0;
+    cm->show_frame = 1;
+
+    if (cm->frame_parallel_decode) {
+      for (i = 0; i < REF_FRAMES; ++i)
+        cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
+    }
+    return 0;
+  }
+
+  cm->frame_type = (FRAME_TYPE) vpx_rb_read_bit(rb);
+  cm->show_frame = vpx_rb_read_bit(rb);
+  cm->error_resilient_mode = vpx_rb_read_bit(rb);
+
+  if (cm->frame_type == KEY_FRAME) {
+    if (!vp10_read_sync_code(rb))
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Invalid frame sync code");
+
+    read_bitdepth_colorspace_sampling(cm, rb);
+    pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+
+    for (i = 0; i < REFS_PER_FRAME; ++i) {
+      cm->frame_refs[i].idx = INVALID_IDX;
+      cm->frame_refs[i].buf = NULL;
+    }
+
+    setup_frame_size(cm, rb);
+    if (pbi->need_resync) {
+      memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+      pbi->need_resync = 0;
+    }
+  } else {
+    cm->intra_only = cm->show_frame ? 0 : vpx_rb_read_bit(rb);
+
+    if (cm->error_resilient_mode) {
+        cm->reset_frame_context = RESET_FRAME_CONTEXT_ALL;
+    } else {
+#if CONFIG_MISC_FIXES
+      if (cm->intra_only) {
+          cm->reset_frame_context =
+              vpx_rb_read_bit(rb) ? RESET_FRAME_CONTEXT_ALL
+                                  : RESET_FRAME_CONTEXT_CURRENT;
+      } else {
+          cm->reset_frame_context =
+              vpx_rb_read_bit(rb) ? RESET_FRAME_CONTEXT_CURRENT
+                                  : RESET_FRAME_CONTEXT_NONE;
+          if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT)
+            cm->reset_frame_context =
+                  vpx_rb_read_bit(rb) ? RESET_FRAME_CONTEXT_ALL
+                                      : RESET_FRAME_CONTEXT_CURRENT;
+      }
+#else
+      static const RESET_FRAME_CONTEXT_MODE reset_frame_context_conv_tbl[4] = {
+        RESET_FRAME_CONTEXT_NONE, RESET_FRAME_CONTEXT_NONE,
+        RESET_FRAME_CONTEXT_CURRENT, RESET_FRAME_CONTEXT_ALL
+      };
+
+      cm->reset_frame_context =
+          reset_frame_context_conv_tbl[vpx_rb_read_literal(rb, 2)];
+#endif
+    }
+
+    if (cm->intra_only) {
+      if (!vp10_read_sync_code(rb))
+        vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Invalid frame sync code");
+#if CONFIG_MISC_FIXES
+      read_bitdepth_colorspace_sampling(cm, rb);
+#else
+      if (cm->profile > PROFILE_0) {
+        read_bitdepth_colorspace_sampling(cm, rb);
+      } else {
+        // NOTE: The intra-only frame header does not include the specification
+        // of either the color format or color sub-sampling in profile 0. VP9
+        // specifies that the default color format should be YUV 4:2:0 in this
+        // case (normative).
+        cm->color_space = VPX_CS_BT_601;
+        cm->color_range = 0;
+        cm->subsampling_y = cm->subsampling_x = 1;
+        cm->bit_depth = VPX_BITS_8;
+#if CONFIG_VP9_HIGHBITDEPTH
+        cm->use_highbitdepth = 0;
+#endif
+      }
+#endif
+
+      pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
+      setup_frame_size(cm, rb);
+      if (pbi->need_resync) {
+        memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+        pbi->need_resync = 0;
+      }
+    } else if (pbi->need_resync != 1) {  /* Skip if need resync */
+      pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
+      for (i = 0; i < REFS_PER_FRAME; ++i) {
+        const int ref = vpx_rb_read_literal(rb, REF_FRAMES_LOG2);
+        const int idx = cm->ref_frame_map[ref];
+        RefBuffer *const ref_frame = &cm->frame_refs[i];
+        ref_frame->idx = idx;
+        ref_frame->buf = &frame_bufs[idx].buf;
+        cm->ref_frame_sign_bias[LAST_FRAME + i] = vpx_rb_read_bit(rb);
+      }
+
+      setup_frame_size_with_refs(cm, rb);
+
+      cm->allow_high_precision_mv = vpx_rb_read_bit(rb);
+      cm->interp_filter = read_interp_filter(rb);
+
+      for (i = 0; i < REFS_PER_FRAME; ++i) {
+        RefBuffer *const ref_buf = &cm->frame_refs[i];
+#if CONFIG_VP9_HIGHBITDEPTH
+        vp10_setup_scale_factors_for_frame(&ref_buf->sf,
+                                          ref_buf->buf->y_crop_width,
+                                          ref_buf->buf->y_crop_height,
+                                          cm->width, cm->height,
+                                          cm->use_highbitdepth);
+#else
+        vp10_setup_scale_factors_for_frame(&ref_buf->sf,
+                                          ref_buf->buf->y_crop_width,
+                                          ref_buf->buf->y_crop_height,
+                                          cm->width, cm->height);
+#endif
+      }
+    }
+  }
+#if CONFIG_VP9_HIGHBITDEPTH
+  get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
+#endif
+  get_frame_new_buffer(cm)->color_space = cm->color_space;
+  get_frame_new_buffer(cm)->color_range = cm->color_range;
+  get_frame_new_buffer(cm)->render_width  = cm->render_width;
+  get_frame_new_buffer(cm)->render_height = cm->render_height;
+
+  if (pbi->need_resync) {
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Keyframe / intra-only frame required to reset decoder"
+                       " state");
+  }
+
+  if (!cm->error_resilient_mode) {
+    cm->refresh_frame_context =
+        vpx_rb_read_bit(rb) ? REFRESH_FRAME_CONTEXT_FORWARD
+                            : REFRESH_FRAME_CONTEXT_OFF;
+    if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD) {
+        cm->refresh_frame_context =
+            vpx_rb_read_bit(rb) ? REFRESH_FRAME_CONTEXT_FORWARD
+                                : REFRESH_FRAME_CONTEXT_BACKWARD;
+#if !CONFIG_MISC_FIXES
+    } else {
+      vpx_rb_read_bit(rb);  // parallel decoding mode flag
+#endif
+    }
+  } else {
+    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_OFF;
+  }
+
+  // This flag will be overridden by the call to vp10_setup_past_independence
+  // below, forcing the use of context 0 for those frame types.
+  cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
+
+  // Generate next_ref_frame_map.
+  lock_buffer_pool(pool);
+  for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+    if (mask & 1) {
+      cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+      ++frame_bufs[cm->new_fb_idx].ref_count;
+    } else {
+      cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+    }
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0)
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+    ++ref_index;
+  }
+
+  for (; ref_index < REF_FRAMES; ++ref_index) {
+    cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0)
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+  }
+  unlock_buffer_pool(pool);
+  pbi->hold_ref_buf = 1;
+
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode)
+    vp10_setup_past_independence(cm);
+
+  setup_loopfilter(&cm->lf, rb);
+  setup_quantization(cm, rb);
+#if CONFIG_VP9_HIGHBITDEPTH
+  xd->bd = (int)cm->bit_depth;
+#endif
+
+  setup_segmentation(cm, rb);
+
+  {
+    int i;
+    for (i = 0; i < MAX_SEGMENTS; ++i) {
+      const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
+          vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
+          cm->base_qindex;
+      xd->lossless[i] = qindex == 0 &&
+          cm->y_dc_delta_q == 0 &&
+          cm->uv_dc_delta_q == 0 &&
+          cm->uv_ac_delta_q == 0;
+    }
+  }
+
+  setup_segmentation_dequant(cm);
+#if CONFIG_MISC_FIXES
+  cm->tx_mode = (!cm->seg.enabled && xd->lossless[0]) ? ONLY_4X4
+                                                      : read_tx_mode(rb);
+  cm->reference_mode = read_frame_reference_mode(cm, rb);
+#endif
+
+  setup_tile_info(cm, rb);
+  sz = vpx_rb_read_literal(rb, 16);
+
+  if (sz == 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid header size");
+
+  return sz;
+}
+
+// Reads the optional updates of the extended-transform probabilities:
+// first the intra table, then the inter table.
+static void read_ext_tx_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+  int tx, nom, t;
+
+  // One flag per table tells whether any per-entry updates are coded for it.
+  if (vpx_read(r, GROUP_DIFF_UPDATE_PROB))
+    for (tx = TX_4X4; tx < EXT_TX_SIZES; ++tx)
+      for (nom = 0; nom < TX_TYPES; ++nom)
+        for (t = 0; t < TX_TYPES - 1; ++t)
+          vp10_diff_update_prob(r, &fc->intra_ext_tx_prob[tx][nom][t]);
+  if (vpx_read(r, GROUP_DIFF_UPDATE_PROB))
+    for (tx = TX_4X4; tx < EXT_TX_SIZES; ++tx)
+      for (t = 0; t < TX_TYPES - 1; ++t)
+        vp10_diff_update_prob(r, &fc->inter_ext_tx_prob[tx][t]);
+}
+
+static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data,
+                                  size_t partition_size) {
+  VP10_COMMON *const cm = &pbi->common;
+#if !CONFIG_MISC_FIXES
+  MACROBLOCKD *const xd = &pbi->mb;
+#endif
+  FRAME_CONTEXT *const fc = cm->fc;
+  vpx_reader r;
+  int k, i, j;
+  // Parses the compressed header: probability updates for the frame context.
+  if (vpx_reader_init(&r, data, partition_size, pbi->decrypt_cb,
+                      pbi->decrypt_state))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate bool decoder 0");
+  // tx_mode is read here only when CONFIG_MISC_FIXES is off.
+#if !CONFIG_MISC_FIXES
+  cm->tx_mode = xd->lossless[0] ? ONLY_4X4 : read_tx_mode(&r);
+#endif
+  if (cm->tx_mode == TX_MODE_SELECT)
+    read_tx_mode_probs(&fc->tx_probs, &r);
+  read_coef_probs(fc, cm->tx_mode, &r);
+
+  for (k = 0; k < SKIP_CONTEXTS; ++k)
+    vp10_diff_update_prob(&r, &fc->skip_probs[k]);
+
+#if CONFIG_MISC_FIXES
+  if (cm->seg.enabled) {
+    if (cm->seg.temporal_update) {
+      for (k = 0; k < PREDICTION_PROBS; k++)
+        vp10_diff_update_prob(&r, &cm->fc->seg.pred_probs[k]);
+    }
+    for (k = 0; k < MAX_SEGMENTS - 1; k++)
+      vp10_diff_update_prob(&r, &cm->fc->seg.tree_probs[k]);
+  }
+
+  for (j = 0; j < INTRA_MODES; j++)
+    for (i = 0; i < INTRA_MODES - 1; ++i)
+      vp10_diff_update_prob(&r, &fc->uv_mode_prob[j][i]);
+
+  for (j = 0; j < PARTITION_CONTEXTS; ++j)
+    for (i = 0; i < PARTITION_TYPES - 1; ++i)
+      vp10_diff_update_prob(&r, &fc->partition_prob[j][i]);
+#endif
+  // Intra-only frames: seed kf_y_prob from the vp10_kf_y_mode_prob table.
+  if (frame_is_intra_only(cm)) {
+    vp10_copy(cm->kf_y_prob, vp10_kf_y_mode_prob);
+#if CONFIG_MISC_FIXES
+    for (k = 0; k < INTRA_MODES; k++)
+      for (j = 0; j < INTRA_MODES; j++)
+        for (i = 0; i < INTRA_MODES - 1; ++i)
+          vp10_diff_update_prob(&r, &cm->kf_y_prob[k][j][i]);
+#endif
+  } else {
+    nmv_context *const nmvc = &fc->nmvc;
+
+    read_inter_mode_probs(fc, &r);
+
+    if (cm->interp_filter == SWITCHABLE)
+      read_switchable_interp_probs(fc, &r);
+
+    for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+      vp10_diff_update_prob(&r, &fc->intra_inter_prob[i]);
+
+#if !CONFIG_MISC_FIXES
+    cm->reference_mode = read_frame_reference_mode(cm, &r);
+#endif
+    if (cm->reference_mode != SINGLE_REFERENCE)
+      setup_compound_reference_mode(cm);
+    read_frame_reference_mode_probs(cm, &r);
+
+    for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
+      for (i = 0; i < INTRA_MODES - 1; ++i)
+        vp10_diff_update_prob(&r, &fc->y_mode_prob[j][i]);
+
+#if !CONFIG_MISC_FIXES
+    for (j = 0; j < PARTITION_CONTEXTS; ++j)
+      for (i = 0; i < PARTITION_TYPES - 1; ++i)
+        vp10_diff_update_prob(&r, &fc->partition_prob[j][i]);
+#endif
+
+    read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
+    read_ext_tx_probs(fc, &r);
+  }
+  // A nonzero result is treated by vp10_decode_frame() as a corrupt header.
+  return vpx_reader_has_error(&r);
+}
+
+#ifdef NDEBUG
+#define debug_check_frame_counts(cm) (void)0
+#else  // !NDEBUG
+// Debug-only sanity check: the frame must not use backward refresh (unless it
+// is error resilient) and all of the cm->counts tables checked below are zero.
+static void debug_check_frame_counts(const VP10_COMMON *const cm) {
+  FRAME_COUNTS zero_counts;
+  vp10_zero(zero_counts);
+  assert(cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_BACKWARD ||
+         cm->error_resilient_mode);
+  assert(!memcmp(cm->counts.y_mode, zero_counts.y_mode,
+                 sizeof(cm->counts.y_mode)));
+  assert(!memcmp(cm->counts.uv_mode, zero_counts.uv_mode,
+                 sizeof(cm->counts.uv_mode)));
+  assert(!memcmp(cm->counts.partition, zero_counts.partition,
+                 sizeof(cm->counts.partition)));
+  assert(!memcmp(cm->counts.coef, zero_counts.coef,
+                 sizeof(cm->counts.coef)));
+  assert(!memcmp(cm->counts.eob_branch, zero_counts.eob_branch,
+                 sizeof(cm->counts.eob_branch)));
+  assert(!memcmp(cm->counts.switchable_interp, zero_counts.switchable_interp,
+                 sizeof(cm->counts.switchable_interp)));
+  assert(!memcmp(cm->counts.inter_mode, zero_counts.inter_mode,
+                 sizeof(cm->counts.inter_mode)));
+  assert(!memcmp(cm->counts.intra_inter, zero_counts.intra_inter,
+                 sizeof(cm->counts.intra_inter)));
+  assert(!memcmp(cm->counts.comp_inter, zero_counts.comp_inter,
+                 sizeof(cm->counts.comp_inter)));
+  assert(!memcmp(cm->counts.single_ref, zero_counts.single_ref,
+                 sizeof(cm->counts.single_ref)));
+  assert(!memcmp(cm->counts.comp_ref, zero_counts.comp_ref,
+                 sizeof(cm->counts.comp_ref)));
+  assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx)));
+  assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip)));
+  assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv)));
+  assert(!memcmp(cm->counts.intra_ext_tx, zero_counts.intra_ext_tx,
+                 sizeof(cm->counts.intra_ext_tx)));
+  assert(!memcmp(cm->counts.inter_ext_tx, zero_counts.inter_ext_tx,
+                 sizeof(cm->counts.inter_ext_tx)));
+}
+#endif  // NDEBUG
+
+// Prepares |rb| for reading the frame header and returns it. When a decrypt
+// callback is installed, at most MAX_VP9_HEADER_SIZE bytes are decrypted into
+// |clear_data| and |rb| reads that copy; otherwise it reads |data| directly.
+static struct vpx_read_bit_buffer *init_read_bit_buffer(
+    VP10Decoder *pbi, struct vpx_read_bit_buffer *rb, const uint8_t *data,
+    const uint8_t *data_end, uint8_t clear_data[MAX_VP9_HEADER_SIZE]) {
+  rb->bit_offset = 0;
+  rb->error_handler = error_handler;
+  rb->error_handler_data = &pbi->common;
+  rb->bit_buffer = data;
+  rb->bit_buffer_end = data_end;
+
+  if (pbi->decrypt_cb) {
+    const int len = (int)VPXMIN(MAX_VP9_HEADER_SIZE, data_end - data);
+    pbi->decrypt_cb(pbi->decrypt_state, data, clear_data, len);
+    rb->bit_buffer = clear_data;
+    rb->bit_buffer_end = clear_data + len;
+  }
+  return rb;
+}
+
+//------------------------------------------------------------------------------
+
+int vp10_read_sync_code(struct vpx_read_bit_buffer *const rb) {
+  if (vpx_rb_read_literal(rb, 8) != VP10_SYNC_CODE_0) return 0;  // byte 0
+  if (vpx_rb_read_literal(rb, 8) != VP10_SYNC_CODE_1) return 0;  // byte 1
+  return vpx_rb_read_literal(rb, 8) == VP10_SYNC_CODE_2;         // byte 2
+}
+
+void vp10_read_frame_size(struct vpx_read_bit_buffer *rb, int *width,
+                          int *height) {
+  *width = 1 + vpx_rb_read_literal(rb, 16);   // coded as width - 1
+  *height = 1 + vpx_rb_read_literal(rb, 16);  // coded as height - 1
+}
+
+// Reads the profile: two bits, low bit first, plus one more bit if above 2.
+BITSTREAM_PROFILE vp10_read_profile(struct vpx_read_bit_buffer *rb) {
+  const int bit0 = vpx_rb_read_bit(rb);
+  int profile = bit0 | (vpx_rb_read_bit(rb) << 1);
+  if (profile > 2) profile += vpx_rb_read_bit(rb);
+  return (BITSTREAM_PROFILE)profile;
+}
+
+void vp10_decode_frame(VP10Decoder *pbi,
+                      const uint8_t *data, const uint8_t *data_end,
+                      const uint8_t **p_data_end) {
+  VP10_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  struct vpx_read_bit_buffer rb;
+  int context_updated = 0;
+  uint8_t clear_data[MAX_VP9_HEADER_SIZE];
+  const size_t first_partition_size = read_uncompressed_header(pbi,
+      init_read_bit_buffer(pbi, &rb, data, data_end, clear_data));
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
+  xd->cur_buf = new_fb;
+  // The uncompressed header was already parsed by the initializer above.
+  if (!first_partition_size) {
+    // Size 0 means show_existing_frame: nothing else to decode.
+    *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2);
+    return;
+  }
+  // Step past the uncompressed header; the compressed header follows it.
+  data += vpx_rb_bytes_read(&rb);
+  if (!read_is_valid(data, first_partition_size, data_end))
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt header length");
+
+  cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+                           cm->width == cm->last_width &&
+                           cm->height == cm->last_height &&
+                           !cm->last_intra_only &&
+                           cm->last_show_frame &&
+                           (cm->last_frame_type != KEY_FRAME);
+
+  vp10_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
+
+  *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+  if (!cm->fc->initialized)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Uninitialized entropy context.");
+
+  vp10_zero(cm->counts);
+
+  xd->corrupted = 0;
+  new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
+  if (new_fb->corrupted)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Decode failed. Frame data header is corrupted.");
+
+  if (cm->lf.filter_level && !cm->skip_loop_filter) {
+    vp10_loop_filter_frame_init(cm, cm->lf.filter_level);
+  }
+
+  // If encoded in frame parallel mode, frame context is ready after decoding
+  // the frame header.
+  if (cm->frame_parallel_decode &&
+      cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_BACKWARD) {
+    VPxWorker *const worker = pbi->frame_worker_owner;
+    FrameWorkerData *const frame_worker_data = worker->data1;
+    if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD) {
+      context_updated = 1;
+      cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+    }
+    vp10_frameworker_lock_stats(worker);
+    pbi->cur_buf->row = -1;
+    pbi->cur_buf->col = -1;
+    frame_worker_data->frame_context_ready = 1;
+    // Signal the main thread that context is ready.
+    vp10_frameworker_signal_stats(worker);
+    vp10_frameworker_unlock_stats(worker);
+  }
+
+  if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
+    // Multi-threaded tile decoder
+    *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
+    if (!xd->corrupted) {
+      if (!cm->skip_loop_filter) {
+        // If multiple threads are used to decode tiles, then we use those
+        // threads to do parallel loopfiltering.
+        vp10_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane,
+                                 cm->lf.filter_level, 0, 0, pbi->tile_workers,
+                                 pbi->num_tile_workers, &pbi->lf_row_sync);
+      }
+    } else {
+      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Decode failed. Frame data is corrupted.");
+
+    }
+  } else {
+    *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
+  }
+
+  if (!xd->corrupted) {
+    if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+      vp10_adapt_coef_probs(cm);
+#if CONFIG_MISC_FIXES
+      vp10_adapt_intra_frame_probs(cm);
+#endif
+
+      if (!frame_is_intra_only(cm)) {
+#if !CONFIG_MISC_FIXES
+        vp10_adapt_intra_frame_probs(cm);
+#endif
+        vp10_adapt_inter_frame_probs(cm);
+        vp10_adapt_mv_probs(cm, cm->allow_high_precision_mv);
+      }
+    } else {
+      debug_check_frame_counts(cm);
+    }
+  } else {
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Decode failed. Frame data is corrupted.");
+  }
+
+  // Non frame parallel update frame context here.
+  if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF &&
+      !context_updated)
+    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+}
diff --git a/libs/libvpx/vp10/decoder/decodeframe.h b/libs/libvpx/vp10/decoder/decodeframe.h
new file mode 100644
index 0000000000..770ae154e5
--- /dev/null
+++ b/libs/libvpx/vp10/decoder/decodeframe.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_DECODER_DECODEFRAME_H_
+#define VP10_DECODER_DECODEFRAME_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP10Decoder;
+struct vpx_read_bit_buffer;
+
+int vp10_read_sync_code(struct vpx_read_bit_buffer *const rb);
+void vp10_read_frame_size(struct vpx_read_bit_buffer *rb,
+                         int *width, int *height);
+BITSTREAM_PROFILE vp10_read_profile(struct vpx_read_bit_buffer *rb);
+
+void vp10_decode_frame(struct VP10Decoder *pbi,
+                      const uint8_t *data, const uint8_t *data_end,
+                      const uint8_t **p_data_end);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_DECODER_DECODEFRAME_H_
diff --git a/libs/libvpx/vp10/decoder/decodemv.c b/libs/libvpx/vp10/decoder/decodemv.c
new file mode 100644
index 0000000000..01b796c106
--- /dev/null
+++ b/libs/libvpx/vp10/decoder/decodemv.c
@@ -0,0 +1,704 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/common.h"
+#include "vp10/common/entropy.h"
+#include "vp10/common/entropymode.h"
+#include "vp10/common/entropymv.h"
+#include "vp10/common/mvref_common.h"
+#include "vp10/common/pred_common.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/common/seg_common.h"
+
+#include "vp10/decoder/decodemv.h"
+#include "vp10/decoder/decodeframe.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Decodes one intra prediction mode from the intra-mode tree using probs |p|.
+static PREDICTION_MODE read_intra_mode(vpx_reader *r, const vpx_prob *p) {
+  return (PREDICTION_MODE)vpx_read_tree(r, vp10_intra_mode_tree, p);
+}
+
+// Reads a luma intra mode for |size_group| and counts it when counts are kept.
+static PREDICTION_MODE read_intra_mode_y(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                         vpx_reader *r, int size_group) {
+  const vpx_prob *const probs = cm->fc->y_mode_prob[size_group];
+  const PREDICTION_MODE y_mode = read_intra_mode(r, probs);
+  if (xd->counts) ++xd->counts->y_mode[size_group][y_mode];
+  return y_mode;
+}
+
+// Reads the chroma intra mode, conditioned on the luma mode |y_mode|, and
+// records it in the frame counts when counts are kept.
+static PREDICTION_MODE read_intra_mode_uv(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                          vpx_reader *r,
+                                          PREDICTION_MODE y_mode) {
+  const vpx_prob *const probs = cm->fc->uv_mode_prob[y_mode];
+  const PREDICTION_MODE uv_mode = read_intra_mode(r, probs);
+  if (xd->counts) ++xd->counts->uv_mode[y_mode][uv_mode];
+  return uv_mode;
+}
+
+// Reads an inter mode for context |ctx|. The tree yields an offset that is
+// added to NEARESTMV; the counts are indexed by the raw offset.
+static PREDICTION_MODE read_inter_mode(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                       vpx_reader *r, int ctx) {
+  const vpx_prob *const probs = cm->fc->inter_mode_probs[ctx];
+  const int offset = vpx_read_tree(r, vp10_inter_mode_tree, probs);
+
+  if (xd->counts) ++xd->counts->inter_mode[ctx][offset];
+  return NEARESTMV + offset;
+}
+
+static int read_segment_id(vpx_reader *r,
+                           const struct segmentation_probs *segp) {
+  return vpx_read_tree(r, vp10_segment_tree, segp->tree_probs);  // segment id
+}
+
+// Reads an explicitly coded transform size as up to three successive
+// increments; how many are read is bounded by |max_tx_size|.
+static TX_SIZE read_selected_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                     TX_SIZE max_tx_size, vpx_reader *r) {
+  const int ctx = get_tx_size_context(xd);
+  const vpx_prob *probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
+  int size = vpx_read(r, probs[0]);
+
+  if (size != TX_4X4 && max_tx_size >= TX_16X16) {
+    size += vpx_read(r, probs[1]);
+    if (size != TX_8X8 && max_tx_size >= TX_32X32)
+      size += vpx_read(r, probs[2]);
+  }
+  if (xd->counts) ++get_tx_counts(max_tx_size, ctx, &xd->counts->tx)[size];
+  return (TX_SIZE)size;
+}
+
+// Picks the block's transform size: 4x4 if its segment is lossless, a coded
+// size when selection applies, else min(block maximum, tx-mode maximum).
+static TX_SIZE read_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd,
+                            int allow_select, vpx_reader *r) {
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const TX_SIZE largest = max_txsize_lookup[mbmi->sb_type];
+  const int coded = allow_select && cm->tx_mode == TX_MODE_SELECT &&
+                    mbmi->sb_type >= BLOCK_8X8;
+  if (xd->lossless[mbmi->segment_id]) return TX_4X4;
+  if (coded) return read_selected_tx_size(cm, xd, largest, r);
+  return VPXMIN(largest, tx_mode_to_biggest_tx_size[cm->tx_mode]);
+}
+
+// Returns the minimum id over an x_mis-by-y_mis region of |segment_ids| that
+// starts at |mi_offset| and has a row stride of cm->mi_cols.
+static int dec_get_segment_id(const VP10_COMMON *cm, const uint8_t *segment_ids,
+                              int mi_offset, int x_mis, int y_mis) {
+  int row, col, lowest = INT_MAX;
+  for (row = 0; row < y_mis; ++row) {
+    const uint8_t *const ids = segment_ids + mi_offset + row * cm->mi_cols;
+    for (col = 0; col < x_mis; ++col) lowest = VPXMIN(lowest, ids[col]);
+  }
+  assert(lowest >= 0 && lowest < MAX_SEGMENTS);
+  return lowest;
+}
+
+// Writes |segment_id| to every entry of the x_mis-by-y_mis region of the
+// current frame's segment map that starts at |mi_offset|.
+static void set_segment_id(VP10_COMMON *cm, int mi_offset,
+                           int x_mis, int y_mis, int segment_id) {
+  int i, j;
+  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+  for (i = 0; i < y_mis; ++i)
+    for (j = 0; j < x_mis; ++j)
+      cm->current_frame_seg_map[mi_offset + i * cm->mi_cols + j] = segment_id;
+}
+
+// Reads the segment id of a block in an intra frame and writes it into the
+// current frame's segment map. Returns 0, reading nothing, when segmentation
+// is disabled.
+static int read_intra_segment_id(VP10_COMMON *const cm, MACROBLOCKD *const xd,
+                                 int mi_offset, int x_mis, int y_mis,
+                                 vpx_reader *r) {
+  const struct segmentation *const seg = &cm->seg;
+  int id;
+#if CONFIG_MISC_FIXES
+  struct segmentation_probs *const segp = &cm->fc->seg;
+#else
+  struct segmentation_probs *const segp = &cm->segp;
+  (void)xd;  // xd is only needed for the counts kept under CONFIG_MISC_FIXES
+#endif
+
+  if (!seg->enabled)
+    return 0;  // Default for disabled segmentation
+
+  // This path expects an explicitly coded map without temporal prediction.
+  assert(seg->update_map && !seg->temporal_update);
+
+  id = read_segment_id(r, segp);
+#if CONFIG_MISC_FIXES
+  if (xd->counts)
+    ++xd->counts->seg.tree_total[id];
+#endif
+  set_segment_id(cm, mi_offset, x_mis, y_mis, id);
+  return id;
+}
+
+// Copies a region of segment ids from the last map (or zeros if it is NULL).
+static void copy_segment_id(
+    const VP10_COMMON *cm, const uint8_t *last_segment_ids,
+    uint8_t *current_segment_ids, int mi_offset, int x_mis, int y_mis) {
+  int i, j;
+  for (i = 0; i < y_mis; ++i)
+    for (j = 0; j < x_mis; ++j) {
+      const int pos = mi_offset + i * cm->mi_cols + j;
+      current_segment_ids[pos] = last_segment_ids ? last_segment_ids[pos] : 0;
+    }
+}
+
+static int read_inter_segment_id(VP10_COMMON *const cm, MACROBLOCKD *const xd,
+                                 int mi_row, int mi_col, vpx_reader *r) {
+  struct segmentation *const seg = &cm->seg;
+#if CONFIG_MISC_FIXES
+  FRAME_COUNTS *counts = xd->counts;
+  struct segmentation_probs *const segp = &cm->fc->seg;
+#else
+  struct segmentation_probs *const segp = &cm->segp;
+#endif
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int predicted_segment_id, segment_id;
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = xd->plane[0].n4_w >> 1;
+  const int bh = xd->plane[0].n4_h >> 1;
+  // Clamp the block extent so it does not run past the frame's mi grid.
+  // TODO(slavarnway): move x_mis, y_mis into xd ?????
+  const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
+
+  if (!seg->enabled)
+    return 0;  // Default for disabled segmentation
+  // Predict from the previous frame's map: the smallest id over the block.
+  predicted_segment_id = cm->last_frame_seg_map ?
+      dec_get_segment_id(cm, cm->last_frame_seg_map, mi_offset, x_mis, y_mis) :
+      0;
+  // No map update coded: carry the previous ids over, return the prediction.
+  if (!seg->update_map) {
+    copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+                    mi_offset, x_mis, y_mis);
+    return predicted_segment_id;
+  }
+  // With temporal update, a coded flag says whether the prediction is kept.
+  if (seg->temporal_update) {
+    const int ctx = vp10_get_pred_context_seg_id(xd);
+    const vpx_prob pred_prob = segp->pred_probs[ctx];
+    mbmi->seg_id_predicted = vpx_read(r, pred_prob);
+#if CONFIG_MISC_FIXES
+    if (counts)
+      ++counts->seg.pred[ctx][mbmi->seg_id_predicted];
+#endif
+    if (mbmi->seg_id_predicted) {
+      segment_id = predicted_segment_id;
+    } else {
+      segment_id = read_segment_id(r, segp);
+#if CONFIG_MISC_FIXES
+      if (counts)
+        ++counts->seg.tree_mispred[segment_id];
+#endif
+    }
+  } else {
+    segment_id = read_segment_id(r, segp);
+#if CONFIG_MISC_FIXES
+    if (counts)
+      ++counts->seg.tree_total[segment_id];
+#endif
+  }
+  set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+  return segment_id;
+}
+
+// Returns the block's skip flag: forced to 1 when the segment has the skip
+// feature active, otherwise read from the bitstream (and counted).
+static int read_skip(VP10_COMMON *cm, const MACROBLOCKD *xd,
+                     int segment_id, vpx_reader *r) {
+  int ctx, skip;
+
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) return 1;
+
+  ctx = vp10_get_skip_context(xd);
+  skip = vpx_read(r, cm->fc->skip_probs[ctx]);
+  if (xd->counts) ++xd->counts->skip[ctx][skip];
+  return skip;
+}
+
+static void read_intra_frame_mode_info(VP10_COMMON *const cm,
+                                       MACROBLOCKD *const xd,
+                                       int mi_row, int mi_col, vpx_reader *r) {
+  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MODE_INFO *above_mi = xd->above_mi;
+  const MODE_INFO *left_mi  = xd->left_mi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  int i;
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = xd->plane[0].n4_w >> 1;
+  const int bh = xd->plane[0].n4_h >> 1;
+
+  // TODO(slavarnway): move x_mis, y_mis into xd ?????
+  const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
+  // Syntax order: segment id, skip flag, tx size, then the prediction modes.
+  mbmi->segment_id = read_intra_segment_id(cm, xd, mi_offset, x_mis, y_mis, r);
+  mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+  mbmi->tx_size = read_tx_size(cm, xd, 1, r);
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->ref_frame[1] = NONE;
+  // Sub-8x8 sizes code a mode per sub-block; mbmi->mode mirrors bmi[3].
+  switch (bsize) {
+    case BLOCK_4X4:
+      for (i = 0; i < 4; ++i)
+        mi->bmi[i].as_mode =
+            read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, i));
+      mbmi->mode = mi->bmi[3].as_mode;
+      break;
+    case BLOCK_4X8:
+      mi->bmi[0].as_mode = mi->bmi[2].as_mode =
+          read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+      mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
+          read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 1));
+      break;
+    case BLOCK_8X4:
+      mi->bmi[0].as_mode = mi->bmi[1].as_mode =
+          read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+      mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
+          read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 2));
+      break;
+    default:
+      mbmi->mode = read_intra_mode(r,
+          get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+  }
+
+  mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+  // The transform type is coded only under the conditions below; else DCT_DCT.
+  if (mbmi->tx_size < TX_32X32 &&
+      cm->base_qindex > 0 && !mbmi->skip &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    FRAME_COUNTS *counts = xd->counts;
+    TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
+    mbmi->tx_type = vpx_read_tree(
+        r, vp10_ext_tx_tree,
+        cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]);
+    if (counts)
+      ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
+  } else {
+    mbmi->tx_type = DCT_DCT;
+  }
+}
+
+// Decodes one motion-vector difference component: sign, magnitude class,
+// integer offset, fractional bits and (optionally) the high-precision bit.
+static int read_mv_component(vpx_reader *r,
+                             const nmv_component *mvcomp, int usehp) {
+  const int negative = vpx_read(r, mvcomp->sign);
+  const int mv_class = vpx_read_tree(r, vp10_mv_class_tree, mvcomp->classes);
+  const int is_class0 = (mv_class == MV_CLASS_0);
+  int base = 0, whole = 0, frac, hp_bit = 1, bit;
+
+  // Integer part: class 0 codes it with a tree, other classes bit by bit.
+  if (is_class0) {
+    whole = vpx_read_tree(r, vp10_mv_class0_tree, mvcomp->class0);
+  } else {
+    const int num_bits = mv_class + CLASS0_BITS - 1;
+    for (bit = 0; bit < num_bits; ++bit)
+      whole |= vpx_read(r, mvcomp->bits[bit]) << bit;
+    base = CLASS0_SIZE << (mv_class + 2);
+  }
+
+  // Fractional part
+  if (is_class0)
+    frac = vpx_read_tree(r, vp10_mv_fp_tree, mvcomp->class0_fp[whole]);
+  else
+    frac = vpx_read_tree(r, vp10_mv_fp_tree, mvcomp->fp);
+
+  // High precision part; stays at its default of 1 when hp is not used.
+  if (usehp)
+    hp_bit = vpx_read(r, is_class0 ? mvcomp->class0_hp : mvcomp->hp);
+
+  // Result
+  base += ((whole << 3) | (frac << 1) | hp_bit) + 1;
+  return negative ? -base : base;
+}
+
+// Reads a motion vector as a delta against |ref| and stores the sum in |mv|.
+// The joint type says which of the row/column components are coded at all.
+static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref,
+                           const nmv_context *ctx,
+                           nmv_context_counts *counts, int allow_hp) {
+  const int joint = vpx_read_tree(r, vp10_mv_joint_tree, ctx->joints);
+  const MV_JOINT_TYPE joint_type = (MV_JOINT_TYPE)joint;
+  const int use_hp = allow_hp && vp10_use_mv_hp(ref);
+  MV delta = {0, 0};
+
+  if (mv_joint_vertical(joint_type))
+    delta.row = read_mv_component(r, &ctx->comps[0], use_hp);
+  if (mv_joint_horizontal(joint_type))
+    delta.col = read_mv_component(r, &ctx->comps[1], use_hp);
+  vp10_inc_mv(&delta, counts, use_hp);
+
+  mv->row = ref->row + delta.row;
+  mv->col = ref->col + delta.col;
+}
+
+// Returns the block's reference mode. It is read from the bitstream only
+// when the frame uses REFERENCE_MODE_SELECT; otherwise the frame's mode.
+static REFERENCE_MODE read_block_reference_mode(VP10_COMMON *cm,
+                                                const MACROBLOCKD *xd,
+                                                vpx_reader *r) {
+  FRAME_COUNTS *const counts = xd->counts;
+  REFERENCE_MODE mode;
+  int ctx;
+  if (cm->reference_mode != REFERENCE_MODE_SELECT) return cm->reference_mode;
+  ctx = vp10_get_reference_mode_context(cm, xd);
+  // A single bit selects SINGLE_REFERENCE or COMPOUND_REFERENCE.
+  mode = (REFERENCE_MODE)vpx_read(r, cm->fc->comp_inter_prob[ctx]);
+  if (counts) ++counts->comp_inter[ctx][mode];
+  return mode;
+}
+
+// Reads the reference frame(s) of an inter block into ref_frame[0..1].
+static void read_ref_frames(VP10_COMMON *const cm, MACROBLOCKD *const xd,
+                            vpx_reader *r,
+                            int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
+  FRAME_CONTEXT *const fc = cm->fc;
+  FRAME_COUNTS *counts = xd->counts;
+
+  // A segment-level reference frame replaces any per-block signalling.
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
+                                                   SEG_LVL_REF_FRAME);
+    ref_frame[1] = NONE;
+    return;
+  }
+  // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
+  switch (read_block_reference_mode(cm, xd, r)) {
+    case COMPOUND_REFERENCE: {
+      const int fixed_slot = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+      const int ctx = vp10_get_pred_context_comp_ref_p(cm, xd);
+      const int var_bit = vpx_read(r, fc->comp_ref_prob[ctx]);
+      if (counts) ++counts->comp_ref[ctx][var_bit];
+      ref_frame[fixed_slot] = cm->comp_fixed_ref;
+      ref_frame[!fixed_slot] = cm->comp_var_ref[var_bit];
+      break;
+    }
+    case SINGLE_REFERENCE: {
+      const int ctx0 = vp10_get_pred_context_single_ref_p1(xd);
+      const int bit0 = vpx_read(r, fc->single_ref_prob[ctx0][0]);
+      if (counts) ++counts->single_ref[ctx0][0][bit0];
+      if (bit0) {
+        const int ctx1 = vp10_get_pred_context_single_ref_p2(xd);
+        const int bit1 = vpx_read(r, fc->single_ref_prob[ctx1][1]);
+        if (counts) ++counts->single_ref[ctx1][1][bit1];
+        ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
+      } else {
+        ref_frame[0] = LAST_FRAME;
+      }
+      ref_frame[1] = NONE;
+      break;
+    }
+    default: assert(0 && "Invalid prediction mode.");
+  }
+}
+
+
+// Reads the block's interpolation filter type and updates the counts. The
+// probabilities are indexed by vp10_get_pred_context_switchable_interp().
+static INLINE INTERP_FILTER read_switchable_interp_filter(
+    VP10_COMMON *const cm, MACROBLOCKD *const xd, vpx_reader *r) {
+  FRAME_COUNTS *const counts = xd->counts;
+  const int ctx = vp10_get_pred_context_switchable_interp(xd);
+  const INTERP_FILTER filter = (INTERP_FILTER)vpx_read_tree(
+      r, vp10_switchable_interp_tree, cm->fc->switchable_interp_prob[ctx]);
+
+  if (counts) ++counts->switchable_interp[ctx][filter];
+  return filter;
+}
+
+// Reads the luma and chroma prediction modes of an intra block within an
+// inter frame. Blocks smaller than 8x8 store per-sub-block luma modes.
+static void read_intra_block_mode_info(VP10_COMMON *const cm,
+                                       MACROBLOCKD *const xd, MODE_INFO *mi,
+                                       vpx_reader *r) {
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  int blk;
+
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->ref_frame[1] = NONE;
+
+  if (bsize == BLOCK_4X4) {
+    // Four separately coded modes; the block mode is that of entry 3.
+    for (blk = 0; blk < 4; ++blk)
+      mi->bmi[blk].as_mode = read_intra_mode_y(cm, xd, r, 0);
+    mbmi->mode = mi->bmi[3].as_mode;
+  } else if (bsize == BLOCK_4X8 || bsize == BLOCK_8X4) {
+    // Two coded modes, each shared by a pair of entries: 4X8 pairs them as
+    // (0,2) and (1,3), 8X4 as (0,1) and (2,3). The second one is also the
+    // block mode.
+    const int partner = (bsize == BLOCK_4X8) ? 2 : 1;
+    mi->bmi[0].as_mode = mi->bmi[partner].as_mode =
+        read_intra_mode_y(cm, xd, r, 0);
+    mi->bmi[3 - partner].as_mode = mi->bmi[3].as_mode = mbmi->mode =
+        read_intra_mode_y(cm, xd, r, 0);
+  } else {
+    // Single mode; the probability group comes from size_group_lookup[].
+    mbmi->mode = read_intra_mode_y(cm, xd, r, size_group_lookup[bsize]);
+  }
+
+  // The chroma mode is read last and depends on the block's luma mode.
+  mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+}
+
+static INLINE int is_mv_valid(const MV *mv) {  // 1 iff strictly in range
+  const int row_ok = mv->row > MV_LOW && mv->row < MV_UPP;
+  return row_ok && mv->col > MV_LOW && mv->col < MV_UPP;
+}
+
+// Fills mv[] according to the inter prediction mode. Returns 1 on success;
+// 0 when mode is not an inter mode or a decoded NEWMV is out of range.
+// (nearest_mv/near_mv supply NEARESTMV/NEARMV; ZEROMV stores zero.)
+static INLINE int assign_mv(VP10_COMMON *cm, MACROBLOCKD *xd,
+                            PREDICTION_MODE mode,
+                            int_mv mv[2], int_mv ref_mv[2],
+                            int_mv nearest_mv[2], int_mv near_mv[2],
+                            int is_compound, int allow_hp, vpx_reader *r) {
+  if (mode == NEWMV) {
+    // Explicitly coded vectors: one delta per reference, each relative to
+    // its ref_mv predictor. All vectors are read even once one is invalid.
+    FRAME_COUNTS *counts = xd->counts;
+    nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+    int all_valid = 1;
+    int ref;
+
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
+      read_mv(r, &mv[ref].as_mv, &ref_mv[ref].as_mv, &cm->fc->nmvc, mv_counts,
+              allow_hp);
+      all_valid = all_valid && is_mv_valid(&mv[ref].as_mv);
+    }
+    return all_valid;
+  }
+
+  if (mode == NEARESTMV || mode == NEARMV) {
+    // Copy the already-derived candidate(s) verbatim.
+    const int_mv *const src = (mode == NEARESTMV) ? nearest_mv : near_mv;
+    mv[0].as_int = src[0].as_int;
+    if (is_compound)
+      mv[1].as_int = src[1].as_int;
+    return 1;
+  }
+
+  if (mode == ZEROMV) {
+    mv[0].as_int = 0;
+    if (is_compound)
+      mv[1].as_int = 0;
+    return 1;
+  }
+
+  // Any other mode cannot be handled here.
+  return 0;
+}
+
+// Returns nonzero if the block is inter coded: dictated by the segment's
+// reference-frame feature when active, otherwise read from the bitstream.
+static int read_is_inter_block(VP10_COMMON *const cm, MACROBLOCKD *const xd,
+                               int segment_id, vpx_reader *r) {
+  FRAME_COUNTS *const counts = xd->counts;
+  int ctx, is_inter;
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME))
+    return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
+  ctx = vp10_get_intra_inter_context(xd);
+  is_inter = vpx_read(r, cm->fc->intra_inter_prob[ctx]);
+  if (counts) ++counts->intra_inter[ctx][is_inter];
+  return is_inter;
+}
+
+static void fpm_sync(void *const data, int mi_row) {  // data: VP10Decoder *
+  VP10Decoder *const dec = (VP10Decoder *)data;
+  const int row = mi_row << MI_BLOCK_SIZE_LOG2;  // passed on to the wait call
+  vp10_frameworker_wait(dec->frame_worker_owner, dec->common.prev_frame, row);
+}
+
+static void read_inter_block_mode_info(VP10Decoder *const pbi,
+                                       MACROBLOCKD *const xd,
+                                       MODE_INFO *const mi,
+                                       int mi_row, int mi_col, vpx_reader *r) {
+  VP10_COMMON *const cm = &pbi->common;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int allow_hp = cm->allow_high_precision_mv;
+  int_mv nearestmv[2], nearmv[2];
+  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+  int ref, is_compound;
+  uint8_t inter_mode_ctx[MAX_REF_FRAMES];
+  // Reads an inter block's reference frames, mode(s), filter and MVs.
+  read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
+  is_compound = has_second_ref(mbmi);
+  // Per reference: validate its scaling, set up planes, gather candidates.
+  for (ref = 0; ref < 1 + is_compound; ++ref) {
+    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+    RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+    xd->block_refs[ref] = ref_buf;
+    if ((!vp10_is_valid_scale(&ref_buf->sf)))
+      vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Reference frame has invalid dimensions");
+    vp10_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
+                         &ref_buf->sf);
+    vp10_find_mv_refs(cm, xd, mi, frame, ref_mvs[frame],
+                     mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx);
+  }
+  // The segment skip feature forces ZEROMV; it is an error on sub-8x8 blocks.
+  if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    mbmi->mode = ZEROMV;
+    if (bsize < BLOCK_8X8) {
+        vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Invalid usage of segement feature on small blocks");
+        return;
+    }
+  } else {
+    if (bsize >= BLOCK_8X8)
+      mbmi->mode = read_inter_mode(cm, xd, r,
+                                   inter_mode_ctx[mbmi->ref_frame[0]]);
+  }
+  // Pick the nearest/near predictors unless the whole block is ZEROMV.
+  if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
+      vp10_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[ref]],
+                             &nearestmv[ref], &nearmv[ref]);
+    }
+  }
+  // The filter is read per block only when the frame says SWITCHABLE.
+  mbmi->interp_filter = (cm->interp_filter == SWITCHABLE)
+                      ? read_switchable_interp_filter(cm, xd, r)
+                      : cm->interp_filter;
+  // Sub-8x8: each sub-block codes its own mode and motion vector(s).
+  if (bsize < BLOCK_8X8) {
+    const int num_4x4_w = 1 << xd->bmode_blocks_wl;
+    const int num_4x4_h = 1 << xd->bmode_blocks_hl;
+    int idx, idy;
+    PREDICTION_MODE b_mode;
+    int_mv nearest_sub8x8[2], near_sub8x8[2];
+    for (idy = 0; idy < 2; idy += num_4x4_h) {
+      for (idx = 0; idx < 2; idx += num_4x4_w) {
+        int_mv block[2];
+        const int j = idy * 2 + idx;
+        b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx[mbmi->ref_frame[0]]);
+        // NEAREST/NEAR use candidates derived for this sub-block index.
+        if (b_mode == NEARESTMV || b_mode == NEARMV) {
+          uint8_t dummy_mode_ctx[MAX_REF_FRAMES];
+          for (ref = 0; ref < 1 + is_compound; ++ref)
+            vp10_append_sub8x8_mvs_for_idx(cm, xd, j, ref, mi_row, mi_col,
+                                          &nearest_sub8x8[ref],
+                                          &near_sub8x8[ref],
+                                          dummy_mode_ctx);
+        }
+        // NEWMV deltas are coded against the block-level nearestmv.
+        if (!assign_mv(cm, xd, b_mode, block, nearestmv,
+                       nearest_sub8x8, near_sub8x8,
+                       is_compound, allow_hp, r)) {
+          xd->corrupted |= 1;
+          break;  // NOTE(review): leaves only the inner idx loop
+        };
+
+        mi->bmi[j].as_mv[0].as_int = block[0].as_int;
+        if (is_compound)
+          mi->bmi[j].as_mv[1].as_int = block[1].as_int;
+        // Copy to the other entries covered when the sub-block spans two.
+        if (num_4x4_h == 2)
+          mi->bmi[j + 2] = mi->bmi[j];
+        if (num_4x4_w == 2)
+          mi->bmi[j + 1] = mi->bmi[j];
+      }
+    }
+    // The block-level mode and MVs mirror the last sub-block (entry 3).
+    mi->mbmi.mode = b_mode;
+
+    mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+    mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+  } else {
+    xd->corrupted |= !assign_mv(cm, xd, mbmi->mode, mbmi->mv, nearestmv,
+                                nearestmv, nearmv, is_compound, allow_hp, r);
+  }
+}
+
+// Reads the mode info of one block in an inter frame, in syntax order:
+// segment id, skip, intra/inter flag, tx size, prediction info, tx type.
+static void read_inter_frame_mode_info(VP10Decoder *const pbi,
+                                       MACROBLOCKD *const xd,
+                                       int mi_row, int mi_col, vpx_reader *r) {
+  VP10_COMMON *const cm = &pbi->common;
+  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  FRAME_COUNTS *counts;
+  int is_inter;
+
+  mbmi->mv[0].as_int = 0;
+  mbmi->mv[1].as_int = 0;
+  mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
+  mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+  is_inter = read_is_inter_block(cm, xd, mbmi->segment_id, r);
+  mbmi->tx_size = read_tx_size(cm, xd, !(mbmi->skip && is_inter), r);
+
+  if (is_inter)
+    read_inter_block_mode_info(pbi, xd, mi, mi_row, mi_col, r);
+  else
+    read_intra_block_mode_info(cm, xd, mi, r);
+  // No tx type is coded for TX_32X32 and up, qindex <= 0 or skipped blocks.
+  if (mbmi->tx_size >= TX_32X32 || cm->base_qindex <= 0 || mbmi->skip ||
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    mbmi->tx_type = DCT_DCT;
+    return;
+  }
+  counts = xd->counts;
+  if (is_inter) {
+    mbmi->tx_type = vpx_read_tree(r, vp10_ext_tx_tree,
+                                  cm->fc->inter_ext_tx_prob[mbmi->tx_size]);
+    if (counts) ++counts->inter_ext_tx[mbmi->tx_size][mbmi->tx_type];
+  } else {
+    const TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
+    mbmi->tx_type = vpx_read_tree(
+        r, vp10_ext_tx_tree,
+        cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]);
+    if (counts)
+      ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
+  }
+}
+
+// Reads one block's mode info. For inter frames the block's reference frames
+// and MVs are also copied to the x_mis by y_mis cells it covers in cur_frame.
+void vp10_read_mode_info(VP10Decoder *const pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col, vpx_reader *r,
+                        int x_mis, int y_mis) {
+  VP10_COMMON *const cm = &pbi->common;
+  MODE_INFO *const mi = xd->mi[0];
+  MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int row, col;
+
+  if (frame_is_intra_only(cm)) {
+    read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
+    return;
+  }
+  read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
+  for (row = 0; row < y_mis; ++row) {
+    for (col = 0; col < x_mis; ++col) {
+      MV_REF *const cell = frame_mvs + row * cm->mi_cols + col;
+      cell->ref_frame[0] = mi->mbmi.ref_frame[0];
+      cell->ref_frame[1] = mi->mbmi.ref_frame[1];
+      cell->mv[0].as_int = mi->mbmi.mv[0].as_int;
+      cell->mv[1].as_int = mi->mbmi.mv[1].as_int;
+    }
+  }
+}
diff --git a/libs/libvpx/vp10/decoder/decodemv.h b/libs/libvpx/vp10/decoder/decodemv.h
new file mode 100644
index 0000000000..6653be5f69
--- /dev/null
+++ b/libs/libvpx/vp10/decoder/decodemv.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_DECODER_DECODEMV_H_
+#define VP10_DECODER_DECODEMV_H_
+
+#include "vpx_dsp/bitreader.h"
+
+#include "vp10/decoder/decoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_read_mode_info(VP10Decoder *const pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col, vpx_reader *r,
+                        int x_mis, int y_mis);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_DECODER_DECODEMV_H_
diff --git a/libs/libvpx/vp10/decoder/decoder.c b/libs/libvpx/vp10/decoder/decoder.c
new file mode 100644
index 0000000000..d8864d22e6
--- /dev/null
+++ b/libs/libvpx/vp10/decoder/decoder.c
@@ -0,0 +1,522 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_ports/vpx_once.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_thread.h"
+
+#include "vp10/common/alloccommon.h"
+#include "vp10/common/loopfilter.h"
+#include "vp10/common/onyxc_int.h"
+#if CONFIG_VP9_POSTPROC
+#include "vp10/common/postproc.h"
+#endif
+#include "vp10/common/quant_common.h"
+#include "vp10/common/reconintra.h"
+
+#include "vp10/decoder/decodeframe.h"
+#include "vp10/decoder/decoder.h"
+#include "vp10/decoder/detokenize.h"
+
+// Runs the rtcd setup routines and initialises the intra predictors; the
+// init_done flag makes repeat calls no-ops. Invoked through once().
+static void initialize_dec(void) {
+  static volatile int init_done = 0;
+  if (init_done) return;
+  vp10_rtcd();
+  vpx_dsp_rtcd();
+  vpx_scale_rtcd();
+  vp10_init_intra_predictors();
+  init_done = 1;
+}
+
+static void vp10_dec_setup_mi(VP10_COMMON *cm) {  // installed as cm->setup_mi
+  memset(cm->mi_grid_base, 0,
+         cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+  cm->mi = cm->mip + cm->mi_stride + 1;  // skip one row and one column
+}
+
+// Allocates the zero-filled MODE_INFO array and the MODE_INFO pointer grid,
+// mi_size entries each. Returns 0 on success and 1 on failure.
+static int vp10_dec_alloc_mi(VP10_COMMON *cm, int mi_size) {
+  cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip));
+  if (!cm->mip) return 1;
+  cm->mi_alloc_size = mi_size;
+  // If this fails mip stays allocated; vp10_dec_free_mi() can release it.
+  cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+  return cm->mi_grid_base ? 0 : 1;
+}
+
+static void vp10_dec_free_mi(VP10_COMMON *cm) {  // installed as cm->free_mi
+  vpx_free(cm->mi_grid_base);
+  vpx_free(cm->mip);
+  cm->mi_grid_base = NULL;
+  cm->mip = NULL;
+}
+
+// Allocates and initialises a decoder that draws its frame buffers from
+// pool. Returns NULL when an allocation fails.
+VP10Decoder *vp10_decoder_create(BufferPool *const pool) {
+  VP10Decoder *volatile const pbi = vpx_memalign(32, sizeof(*pbi));
+  VP10_COMMON *volatile const cm = pbi ? &pbi->common : NULL;
+
+  if (!cm)
+    return NULL;
+
+  vp10_zero(*pbi);
+
+  if (setjmp(cm->error.jmp)) {
+    cm->error.setjmp = 0;
+    // vp10_decoder_remove() does not release the frame contexts, so free
+    // whichever of them was allocated (the other is still NULL).
+    vpx_free(cm->fc);
+    vpx_free(cm->frame_contexts);
+    vp10_decoder_remove(pbi);
+    return NULL;
+  }
+  cm->error.setjmp = 1;
+
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(cm, cm->frame_contexts,
+                  (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
+                  sizeof(*cm->frame_contexts)));
+
+  pbi->need_resync = 1;
+  once(initialize_dec);
+
+  // Initialize the references to not point to any frame buffers.
+  memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+  memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+
+  cm->current_video_frame = 0;
+  pbi->ready_for_new_data = 1;
+  pbi->common.buffer_pool = pool;
+  cm->bit_depth = VPX_BITS_8;
+  cm->dequant_bit_depth = VPX_BITS_8;
+
+  cm->alloc_mi = vp10_dec_alloc_mi;
+  cm->free_mi = vp10_dec_free_mi;
+  cm->setup_mi = vp10_dec_setup_mi;
+  vp10_loop_filter_init(cm);
+  cm->error.setjmp = 0;
+  vpx_get_worker_interface()->init(&pbi->lf_worker);
+  return pbi;
+}
+
+// Ends the decoder's workers and frees the decoder together with its tile
+// and loop-filter allocations. A NULL pbi is ignored.
+void vp10_decoder_remove(VP10Decoder *pbi) {
+  int worker_idx;
+
+  if (!pbi) return;
+
+  // Stop the loop-filter worker and the tile workers, freeing their data.
+  vpx_get_worker_interface()->end(&pbi->lf_worker);
+  vpx_free(pbi->lf_worker.data1);
+  vpx_free(pbi->tile_data);
+  for (worker_idx = 0; worker_idx < pbi->num_tile_workers; ++worker_idx)
+    vpx_get_worker_interface()->end(&pbi->tile_workers[worker_idx]);
+  vpx_free(pbi->tile_worker_data);
+  vpx_free(pbi->tile_worker_info);
+  vpx_free(pbi->tile_workers);
+
+  // The row-sync state is only released when tile workers were counted.
+  if (pbi->num_tile_workers > 0)
+    vp10_loop_filter_dealloc(&pbi->lf_row_sync);
+
+  vpx_free(pbi);
+}
+
+static int equal_dimensions(const YV12_BUFFER_CONFIG *a,
+                            const YV12_BUFFER_CONFIG *b) {
+  if (a->y_height != b->y_height || a->y_width != b->y_width) return 0;
+  return a->uv_height == b->uv_height && a->uv_width == b->uv_width;
+}
+
+// Copies the decoder's 'last' reference frame into sd, which must have the
+// same dimensions. Any flag other than VP9_LAST_FLAG is reported as an error.
+vpx_codec_err_t vp10_copy_reference_dec(VP10Decoder *pbi,
+                                       VP9_REFFRAME ref_frame_flag,
+                                       YV12_BUFFER_CONFIG *sd) {
+  VP10_COMMON *cm = &pbi->common;
+  const YV12_BUFFER_CONFIG *cfg;
+
+  /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
+   * encoder is using the frame buffers for. This is just a stub to keep the
+   * vpxenc --test-decode functionality working, and will be replaced in a
+   * later commit that adds VP9-specific controls for this functionality. */
+  if (ref_frame_flag != VP9_LAST_FLAG) {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame");
+    return cm->error.error_code;
+  }
+  cfg = get_ref_frame(cm, 0);
+  if (cfg == NULL) {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "No 'last' reference frame");
+    return VPX_CODEC_ERROR;
+  }
+  if (equal_dimensions(cfg, sd))
+    vp8_yv12_copy_frame(cfg, sd);
+  else
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  return cm->error.error_code;
+}
+
+
+// Replaces the LAST/GOLDEN/ALTREF reference selected by ref_frame_flag with
+// a copy of sd held in a newly acquired frame buffer. Returns an error code.
+vpx_codec_err_t vp10_set_reference_dec(VP10_COMMON *cm,
+                                      VP9_REFFRAME ref_frame_flag,
+                                      YV12_BUFFER_CONFIG *sd) {
+  RefBuffer *ref_buf = NULL;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+
+  // TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
+  // encoder is using the frame buffers for. This is just a stub to keep the
+  // vpxenc --test-decode functionality working, and will be replaced in a
+  // later commit that adds VP9-specific controls for this functionality.
+  if (ref_frame_flag == VP9_LAST_FLAG) {
+    ref_buf = &cm->frame_refs[0];
+  } else if (ref_frame_flag == VP9_GOLD_FLAG) {
+    ref_buf = &cm->frame_refs[1];
+  } else if (ref_frame_flag == VP9_ALT_FLAG) {
+    ref_buf = &cm->frame_refs[2];
+  } else {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "Invalid reference frame");
+    return cm->error.error_code;
+  }
+
+  if (!equal_dimensions(ref_buf->buf, sd)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  } else {
+    int *ref_fb_ptr = &ref_buf->idx;
+    // Find an empty frame buffer. The index just obtained is the one that
+    // is used to index frame_bufs below, so that is the value to validate.
+    const int free_fb = get_free_fb(cm);
+    if (free_fb == INVALID_IDX)
+      return VPX_CODEC_MEM_ERROR;
+    // Decrease ref_count since it will be increased again in
+    // ref_cnt_fb() below.
+    --frame_bufs[free_fb].ref_count;
+    // Manage the reference counters and copy image.
+    ref_cnt_fb(frame_bufs, ref_fb_ptr, free_fb);
+    ref_buf->buf = &frame_bufs[*ref_fb_ptr].buf;
+    vp8_yv12_copy_frame(sd, ref_buf->buf);
+  }
+
+  return cm->error.error_code;
+}
+
+/* Applies any signaled reference buffer updates after a frame is decoded. */
+static void swap_frame_buffers(VP10Decoder *pbi) {
+  int ref_index = 0, mask;
+  VP10_COMMON *const cm = &pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+  // Slots covered by refresh_frame_flags: drop holds, install the new map.
+  lock_buffer_pool(pool);
+  for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+    const int old_idx = cm->ref_frame_map[ref_index];
+    // Current thread releases the holding of reference frame.
+    decrease_ref_count(old_idx, frame_bufs, pool);
+
+    // Release the reference frame in reference map.
+    if ((mask & 1) && old_idx >= 0) {
+      decrease_ref_count(old_idx, frame_bufs, pool);
+    }
+    cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+    ++ref_index;
+  }
+
+  // Current thread releases the holding of reference frame.
+  for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+    const int old_idx = cm->ref_frame_map[ref_index];
+    decrease_ref_count(old_idx, frame_bufs, pool);
+    cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+  }
+  unlock_buffer_pool(pool);
+  pbi->hold_ref_buf = 0;
+  cm->frame_to_show = get_frame_new_buffer(cm);
+  // NOTE(review): presumably drops the decode-time hold on the new frame.
+  if (!cm->frame_parallel_decode || !cm->show_frame) {
+    lock_buffer_pool(pool);
+    --frame_bufs[cm->new_fb_idx].ref_count;
+    unlock_buffer_pool(pool);
+  }
+
+  // Invalidate these references until the next frame starts.
+  for (ref_index = 0; ref_index < 3; ref_index++)  // frame_refs[0..2]
+    cm->frame_refs[ref_index].idx = -1;
+}
+
+// Decodes one compressed frame of size bytes starting at *psource. Returns 0
+// on success, -1 when decoding raised an error (see cm->error) and
+// VPX_CODEC_MEM_ERROR when no free frame buffer could be found.
+int vp10_receive_compressed_data(VP10Decoder *pbi,
+                                size_t size, const uint8_t **psource) {
+  VP10_COMMON *volatile const cm = &pbi->common;
+  BufferPool *volatile const pool = cm->buffer_pool;
+  RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs;
+  const uint8_t *source = *psource;
+  cm->error.error_code = VPX_CODEC_OK;
+
+  if (size == 0) {
+    // This is used to signal that we are missing frames.
+    // We do not know if the missing frame(s) was supposed to update
+    // any of the reference buffers, but we act conservative and
+    // mark only the last buffer as corrupted.
+    //
+    // TODO(jkoleszar): Error concealment is undefined and non-normative
+    // at this point, but if it becomes so, [0] may not always be the correct
+    // thing to do here.
+    if (cm->frame_refs[0].idx > 0) {
+      assert(cm->frame_refs[0].buf != NULL);
+      cm->frame_refs[0].buf->corrupted = 1;
+    }
+  }
+
+  pbi->ready_for_new_data = 0;
+
+  // Check if the previous frame was a frame without any references to it.
+  // Release frame buffer if not decoding in frame parallel mode.
+  if (!cm->frame_parallel_decode && cm->new_fb_idx >= 0
+      && frame_bufs[cm->new_fb_idx].ref_count == 0)
+    pool->release_fb_cb(pool->cb_priv,
+                        &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+  // Find a free frame buffer. Return error if can not find any.
+  cm->new_fb_idx = get_free_fb(cm);
+  if (cm->new_fb_idx == INVALID_IDX) {
+    // Nothing was decoded, so no frame is pending (as in the error path).
+    pbi->ready_for_new_data = 1;
+    return VPX_CODEC_MEM_ERROR;
+  }
+  // Assign a MV array to the frame buffer.
+  cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+  pbi->hold_ref_buf = 0;
+  if (cm->frame_parallel_decode) {
+    VPxWorker *const worker = pbi->frame_worker_owner;
+    vp10_frameworker_lock_stats(worker);
+    frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
+    // Reset decoding progress.
+    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+    pbi->cur_buf->row = -1;
+    pbi->cur_buf->col = -1;
+    vp10_frameworker_unlock_stats(worker);
+  } else {
+    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+  }
+
+  if (setjmp(cm->error.jmp)) {
+    const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+    int i;
+
+    cm->error.setjmp = 0;
+    pbi->ready_for_new_data = 1;
+
+    // Synchronize all threads immediately as a subsequent decode call may
+    // cause a resize invalidating some allocations.
+    winterface->sync(&pbi->lf_worker);
+    for (i = 0; i < pbi->num_tile_workers; ++i) {
+      winterface->sync(&pbi->tile_workers[i]);
+    }
+
+    lock_buffer_pool(pool);
+    // Release all the reference buffers if worker thread is holding them.
+    if (pbi->hold_ref_buf == 1) {
+      int ref_index = 0, mask;
+      for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+        const int old_idx = cm->ref_frame_map[ref_index];
+        // Current thread releases the holding of reference frame.
+        decrease_ref_count(old_idx, frame_bufs, pool);
+
+        // Release the reference frame in reference map.
+        if ((mask & 1) && old_idx >= 0) {
+          decrease_ref_count(old_idx, frame_bufs, pool);
+        }
+        ++ref_index;
+      }
+
+      // Current thread releases the holding of reference frame.
+      for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+        const int old_idx = cm->ref_frame_map[ref_index];
+        decrease_ref_count(old_idx, frame_bufs, pool);
+      }
+      pbi->hold_ref_buf = 0;
+    }
+    // Release current frame.
+    decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+    unlock_buffer_pool(pool);
+
+    vpx_clear_system_state();
+    return -1;
+  }
+
+  cm->error.setjmp = 1;
+  vp10_decode_frame(pbi, source, source + size, psource);
+  swap_frame_buffers(pbi);
+  vpx_clear_system_state();
+
+  if (!cm->show_existing_frame) {
+    cm->last_show_frame = cm->show_frame;
+    cm->prev_frame = cm->cur_frame;
+    if (cm->seg.enabled && !cm->frame_parallel_decode)
+      vp10_swap_current_and_last_seg_map(cm);
+  }
+
+  // Update progress in frame parallel decode.
+  if (cm->frame_parallel_decode) {
+    // Need to lock the mutex here as another thread may
+    // be accessing this buffer.
+    VPxWorker *const worker = pbi->frame_worker_owner;
+    FrameWorkerData *const frame_worker_data = worker->data1;
+    vp10_frameworker_lock_stats(worker);
+
+    if (cm->show_frame) {
+      cm->current_video_frame++;
+    }
+    frame_worker_data->frame_decoded = 1;
+    frame_worker_data->frame_context_ready = 1;
+    vp10_frameworker_signal_stats(worker);
+    vp10_frameworker_unlock_stats(worker);
+  } else {
+    cm->last_width = cm->width;
+    cm->last_height = cm->height;
+    if (cm->show_frame) {
+      cm->current_video_frame++;
+    }
+  }
+
+  cm->error.setjmp = 0;
+  return 0;
+}
+
+// Outputs the most recently decoded frame in *sd (post-processed when
+// CONFIG_VP9_POSTPROC is set). Returns 0 on success, -1 if nothing to show.
+int vp10_get_raw_frame(VP10Decoder *pbi, YV12_BUFFER_CONFIG *sd,
+                      vp10_ppflags_t *flags) {
+  VP10_COMMON *const cm = &pbi->common;
+  int ret = -1;
+#if !CONFIG_VP9_POSTPROC
+  (void)flags;  // only used by the post-processing path; never dereferenced
+#endif
+
+  if (pbi->ready_for_new_data == 1)
+    return ret;
+
+  // The pending frame is consumed here whether or not it gets shown.
+  pbi->ready_for_new_data = 1;
+  /* no raw frame to show!!! */
+  if (!cm->show_frame)
+    return ret;
+
+#if CONFIG_VP9_POSTPROC
+  if (!cm->show_existing_frame) {
+    ret = vp10_post_proc_frame(cm, sd, flags);
+  } else {
+    *sd = *cm->frame_to_show;
+    ret = 0;
+  }
+#else
+  *sd = *cm->frame_to_show;
+  ret = 0;
+#endif  // CONFIG_VP9_POSTPROC
+  vpx_clear_system_state();
+  return ret;
+}
+
+// Looks for a superframe index at the end of the chunk. On VPX_CODEC_OK,
+// *count is the number of frames (0 if there is no index) and sizes[] holds
+// their sizes; VPX_CODEC_CORRUPT_FRAME flags a truncated or mismatched index.
+vpx_codec_err_t vp10_parse_superframe_index(const uint8_t *data,
+                                           size_t data_sz,
+                                           uint32_t sizes[8], int *count,
+                                           vpx_decrypt_cb decrypt_cb,
+                                           void *decrypt_state) {
+  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
+  // it is a super frame index. If the last byte of real video compression
+  // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
+  // not the associated matching marker byte at the front of the index we have
+  // an invalid bitstream and need to return an error.
+  uint8_t marker;
+#if CONFIG_MISC_FIXES
+  size_t frame_sz_sum = 0;
+#endif
+
+  assert(data_sz);
+  marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
+  *count = 0;
+
+  if ((marker & 0xe0) == 0xc0) {
+    const uint32_t frames = (marker & 0x7) + 1;
+    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+    const uint32_t index_frames = frames - CONFIG_MISC_FIXES;  // sizes stored
+    const size_t index_sz = 2 + mag * index_frames;
+
+    // This chunk is marked as having a superframe index but doesn't have
+    // enough data for it, thus it's an invalid superframe index.
+    if (data_sz < index_sz)
+      return VPX_CODEC_CORRUPT_FRAME;
+
+    {
+      const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
+                                          data + data_sz - index_sz);
+      // This chunk is marked as having a superframe index but doesn't have
+      // the matching marker byte at the front of the index therefore it's an
+      // invalid chunk.
+      if (marker != marker2)
+        return VPX_CODEC_CORRUPT_FRAME;
+    }
+    {
+      // Found a valid superframe index.
+      uint32_t i, j;
+      const uint8_t *x = &data[data_sz - index_sz + 1];
+      // Frames has a maximum of 8 and mag has a maximum of 4. Only the bytes
+      // that belong to the index are decrypted, so the read stays inside data.
+      uint8_t clear_buffer[32];
+      assert(sizeof(clear_buffer) >= frames * mag);
+      if (decrypt_cb) {
+        decrypt_cb(decrypt_state, x, clear_buffer, index_frames * mag);
+        x = clear_buffer;
+      }
+
+      for (i = 0; i < index_frames; ++i) {
+        uint32_t this_sz = 0;
+        for (j = 0; j < mag; ++j)
+          this_sz |= ((uint32_t)(*x++)) << (j * 8);  // shift as unsigned
+        this_sz += CONFIG_MISC_FIXES;
+        sizes[i] = this_sz;
+#if CONFIG_MISC_FIXES
+        frame_sz_sum += this_sz;
+#endif
+      }
+#if CONFIG_MISC_FIXES
+      sizes[i] = data_sz - index_sz - frame_sz_sum;
+#endif
+      *count = frames;
+    }
+  }
+  return VPX_CODEC_OK;
+}
diff --git a/libs/libvpx/vp10/decoder/decoder.h b/libs/libvpx/vp10/decoder/decoder.h
new file mode 100644
index 0000000000..72a6310202
--- /dev/null
+++ b/libs/libvpx/vp10/decoder/decoder.h
@@ -0,0 +1,141 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_DECODER_DECODER_H_
+#define VP10_DECODER_DECODER_H_
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_codec.h"
+#include "vpx_dsp/bitreader.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx_util/vpx_thread.h"
+
+#include "vp10/common/thread_common.h"
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/ppflags.h"
+#include "vp10/decoder/dthread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(hkuang): combine this with TileWorkerData (same reader/xd/buffers).
+typedef struct TileData {
+  VP10_COMMON *cm;  // NOTE(review): presumably the decoder's common state.
+  vpx_reader bit_reader;
+  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);  // 1024 coefficients.
+  DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]);  // 2 x 4096 bytes.
+} TileData;
+
+typedef struct TileWorkerData {  // NOTE(review): presumably one per tile worker.
+  struct VP10Decoder *pbi;  // Decoder this worker data refers to.
+  vpx_reader bit_reader;
+  FRAME_COUNTS counts;  // Held by value, i.e. one private copy per instance.
+  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);  // 1024 coefficients.
+  DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]);  // 2 x 4096 bytes.
+  struct vpx_internal_error_info error_info;
+} TileWorkerData;
+
+typedef struct VP10Decoder {  // Decoder instance state ("pbi" elsewhere).
+  DECLARE_ALIGNED(16, MACROBLOCKD, mb);
+
+  DECLARE_ALIGNED(16, VP10_COMMON, common);
+
+  int ready_for_new_data;
+
+  int refresh_frame_flags;
+
+  // TODO(hkuang): Combine this with cur_buf in macroblockd as they are
+  // the same.
+  RefCntBuffer *cur_buf;   //  Current decoding frame buffer.
+
+  VPxWorker *frame_worker_owner;   // frame_worker that owns this pbi.
+  VPxWorker lf_worker;  // NOTE(review): presumably the loop-filter worker.
+  VPxWorker *tile_workers;
+  TileWorkerData *tile_worker_data;
+  TileInfo *tile_worker_info;
+  int num_tile_workers;  // presumably the length of tile_workers[] - confirm.
+
+  TileData *tile_data;
+  int total_tiles;  // presumably the length of tile_data[] - confirm.
+
+  VP9LfSync lf_row_sync;
+
+  vpx_decrypt_cb decrypt_cb;  // Decryption hook; presumably NULL when unused.
+  void *decrypt_state;  // Opaque state, presumably handed to decrypt_cb.
+
+  int max_threads;
+  int inv_tile_order;
+  int need_resync;  // wait for key/intra-only frame.
+  int hold_ref_buf;  // hold the reference buffer.
+} VP10Decoder;
+
+int vp10_receive_compressed_data(struct VP10Decoder *pbi,
+                                size_t size, const uint8_t **dest);
+
+int vp10_get_raw_frame(struct VP10Decoder *pbi, YV12_BUFFER_CONFIG *sd,
+                      vp10_ppflags_t *flags);
+
+vpx_codec_err_t vp10_copy_reference_dec(struct VP10Decoder *pbi,
+                                       VP9_REFFRAME ref_frame_flag,
+                                       YV12_BUFFER_CONFIG *sd);
+
+vpx_codec_err_t vp10_set_reference_dec(VP10_COMMON *cm,
+                                      VP9_REFFRAME ref_frame_flag,
+                                      YV12_BUFFER_CONFIG *sd);
+
+// Returns the byte at |data|. When a decryption callback is installed the
+// byte is first run through it (one byte, using |decrypt_state|).
+static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb,
+                                  void *decrypt_state,
+                                  const uint8_t *data) {
+  uint8_t marker;
+  if (!decrypt_cb) return *data;  // No callback: the byte is used as-is.
+  decrypt_cb(decrypt_state, data, &marker, 1);
+  return marker;
+}
+
+// This function is exposed for use in tests, as well as the inlined function
+// "read_marker".
+vpx_codec_err_t vp10_parse_superframe_index(const uint8_t *data,
+                                           size_t data_sz,
+                                           uint32_t sizes[8], int *count,
+                                           vpx_decrypt_cb decrypt_cb,
+                                           void *decrypt_state);
+
+struct VP10Decoder *vp10_decoder_create(BufferPool *const pool);
+
+void vp10_decoder_remove(struct VP10Decoder *pbi);
+
+// Drops one reference from frame_bufs[idx]; a negative idx is ignored. Once
+// the count reaches zero the raw buffer is handed back via release_fb_cb.
+static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
+                                      BufferPool *const pool) {
+  RefCntBuffer *buf;
+  if (idx < 0) return;
+  buf = &frame_bufs[idx];
+  --buf->ref_count;
+  // A worker may only get a free framebuffer index when calling get_free_fb,
+  // but the private buffer is not set up until the header is decoded. After
+  // an error during header decoding priv is unset and nothing is released.
+  if (buf->ref_count == 0 && buf->raw_frame_buffer.priv)
+    pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_DECODER_DECODER_H_
diff --git a/libs/libvpx/vp10/decoder/detokenize.c b/libs/libvpx/vp10/decoder/detokenize.c
new file mode 100644
index 0000000000..d39e3dc06c
--- /dev/null
+++ b/libs/libvpx/vp10/decoder/detokenize.c
@@ -0,0 +1,276 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp10/common/blockd.h"
+#include "vp10/common/common.h"
+#include "vp10/common/entropy.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#include "vp10/common/idct.h"
+#endif
+
+#include "vp10/decoder/detokenize.h"
+
+#define EOB_CONTEXT_NODE            0
+#define ZERO_CONTEXT_NODE           1
+#define ONE_CONTEXT_NODE            2
+#define LOW_VAL_CONTEXT_NODE        0
+#define TWO_CONTEXT_NODE            1
+#define THREE_CONTEXT_NODE          2
+#define HIGH_LOW_CONTEXT_NODE       3
+#define CAT_ONE_CONTEXT_NODE        4
+#define CAT_THREEFOUR_CONTEXT_NODE  5
+#define CAT_THREE_CONTEXT_NODE      6
+#define CAT_FIVE_CONTEXT_NODE       7
+// Counts |token| in coef_counts[band][ctx] when the caller's counts is set.
+#define INCREMENT_COUNT(token)                              \
+  do {                                                      \
+     if (counts)                                            \
+       ++coef_counts[band][ctx][token];                     \
+  } while (0)
+
+// Decodes n bits MSB first; bit k is read with probability probs[k].
+static INLINE int read_coeff(const vpx_prob *probs, int n, vpx_reader *r) {
+  int k = 0, bits = 0;
+  while (k < n) { bits = (bits << 1) | vpx_read(r, probs[k]); ++k; }
+  return bits;
+}
+// Decodes the tokens of one transform block into dqcoeff and returns the eob.
+static int decode_coefs(const MACROBLOCKD *xd,
+                        PLANE_TYPE type,
+                        tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
+                        int ctx, const int16_t *scan, const int16_t *nb,
+                        vpx_reader *r) {
+  FRAME_COUNTS *counts = xd->counts;  // NULL disables all counting below.
+  const int max_eob = 16 << (tx_size << 1);  // 16 * 4^tx_size positions.
+  const FRAME_CONTEXT *const fc = xd->fc;
+  const int ref = is_inter_block(&xd->mi[0]->mbmi);
+  int band, c = 0;
+  const vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+      fc->coef_probs[tx_size][type][ref];
+  const vpx_prob *prob;
+  unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
+  unsigned int (*eob_branch_count)[COEFF_CONTEXTS];
+  uint8_t token_cache[32 * 32];
+  const uint8_t *band_translate = get_band_translate(tx_size);
+  const int dq_shift = (tx_size == TX_32X32);  // 32x32 output is halved.
+  int v, token;
+  int16_t dqv = dq[0];  // dq[0] for the first position only, dq[1] afterwards.
+  const uint8_t *cat1_prob;
+  const uint8_t *cat2_prob;
+  const uint8_t *cat3_prob;
+  const uint8_t *cat4_prob;
+  const uint8_t *cat5_prob;
+  const uint8_t *cat6_prob;
+  // The count tables are only looked up (and only used) when counts is set.
+  if (counts) {
+    coef_counts = counts->coef[tx_size][type][ref];
+    eob_branch_count = counts->eob_branch[tx_size][type][ref];
+  }
+  // Pick the category extra-bits tables (per bit depth when that is enabled).
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->bd > VPX_BITS_8) {
+    if (xd->bd == VPX_BITS_10) {
+      cat1_prob = vp10_cat1_prob_high10;
+      cat2_prob = vp10_cat2_prob_high10;
+      cat3_prob = vp10_cat3_prob_high10;
+      cat4_prob = vp10_cat4_prob_high10;
+      cat5_prob = vp10_cat5_prob_high10;
+      cat6_prob = vp10_cat6_prob_high10;
+    } else {
+      cat1_prob = vp10_cat1_prob_high12;
+      cat2_prob = vp10_cat2_prob_high12;
+      cat3_prob = vp10_cat3_prob_high12;
+      cat4_prob = vp10_cat4_prob_high12;
+      cat5_prob = vp10_cat5_prob_high12;
+      cat6_prob = vp10_cat6_prob_high12;
+    }
+  } else {
+    cat1_prob = vp10_cat1_prob;
+    cat2_prob = vp10_cat2_prob;
+    cat3_prob = vp10_cat3_prob;
+    cat4_prob = vp10_cat4_prob;
+    cat5_prob = vp10_cat5_prob;
+    cat6_prob = vp10_cat6_prob;
+  }
+#else
+  cat1_prob = vp10_cat1_prob;
+  cat2_prob = vp10_cat2_prob;
+  cat3_prob = vp10_cat3_prob;
+  cat4_prob = vp10_cat4_prob;
+  cat5_prob = vp10_cat5_prob;
+  cat6_prob = vp10_cat6_prob;
+#endif
+  // One pass per non-zero coefficient: EOB check, zero run, then the value.
+  while (c < max_eob) {
+    int val = -1;
+    band = *band_translate++;
+    prob = coef_probs[band][ctx];
+    if (counts)
+      ++eob_branch_count[band][ctx];
+    if (!vpx_read(r, prob[EOB_CONTEXT_NODE])) {
+      INCREMENT_COUNT(EOB_MODEL_TOKEN);
+      break;
+    }
+    // Zero run: no EOB check is read between consecutive zero tokens.
+    while (!vpx_read(r, prob[ZERO_CONTEXT_NODE])) {
+      INCREMENT_COUNT(ZERO_TOKEN);
+      dqv = dq[1];
+      token_cache[scan[c]] = 0;
+      ++c;
+      if (c >= max_eob)
+        return c;  // zero tokens at the end (no eob token)
+      ctx = get_coef_context(nb, token_cache, c);
+      band = *band_translate++;
+      prob = coef_probs[band][ctx];
+    }
+    // Magnitude: ONE, or a larger token read from vp10_coef_con_tree.
+    if (!vpx_read(r, prob[ONE_CONTEXT_NODE])) {
+      INCREMENT_COUNT(ONE_TOKEN);
+      token = ONE_TOKEN;
+      val = 1;
+    } else {
+      INCREMENT_COUNT(TWO_TOKEN);
+      token = vpx_read_tree(r, vp10_coef_con_tree,
+                            vp10_pareto8_full[prob[PIVOT_NODE] - 1]);
+      switch (token) {
+        case TWO_TOKEN:
+        case THREE_TOKEN:
+        case FOUR_TOKEN:
+          val = token;
+          break;
+        case CATEGORY1_TOKEN:
+          val = CAT1_MIN_VAL + read_coeff(cat1_prob, 1, r);
+          break;
+        case CATEGORY2_TOKEN:
+          val = CAT2_MIN_VAL + read_coeff(cat2_prob, 2, r);
+          break;
+        case CATEGORY3_TOKEN:
+          val = CAT3_MIN_VAL + read_coeff(cat3_prob, 3, r);
+          break;
+        case CATEGORY4_TOKEN:
+          val = CAT4_MIN_VAL + read_coeff(cat4_prob, 4, r);
+          break;
+        case CATEGORY5_TOKEN:
+          val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r);
+          break;
+        case CATEGORY6_TOKEN: {
+#if CONFIG_MISC_FIXES
+          const int skip_bits = TX_SIZES - 1 - tx_size;
+#else
+          const int skip_bits = 0;
+#endif
+          const uint8_t *cat6p = cat6_prob + skip_bits;
+#if CONFIG_VP9_HIGHBITDEPTH
+          switch (xd->bd) {
+            case VPX_BITS_8:
+              val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
+              break;
+            case VPX_BITS_10:
+              val = CAT6_MIN_VAL + read_coeff(cat6p, 16 - skip_bits, r);
+              break;
+            case VPX_BITS_12:
+              val = CAT6_MIN_VAL + read_coeff(cat6p, 18 - skip_bits, r);
+              break;
+            default:
+              assert(0);
+              return -1;
+          }
+#else
+          val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
+#endif
+          break;
+        }
+      }
+    }
+    v = (val * dqv) >> dq_shift;  // Dequantized magnitude; sign is read next.
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_VP9_HIGHBITDEPTH
+    dqcoeff[scan[c]] = highbd_check_range((vpx_read_bit(r) ? -v : v),
+                                          xd->bd);
+#else
+    dqcoeff[scan[c]] = check_range(vpx_read_bit(r) ? -v : v);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#else
+    dqcoeff[scan[c]] = vpx_read_bit(r) ? -v : v;
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+    token_cache[scan[c]] = vp10_pt_energy_class[token];
+    ++c;
+    ctx = get_coef_context(nb, token_cache, c);
+    dqv = dq[1];
+  }
+
+  return c;
+}
+
+// TODO(slavarnway): Decode version of vp10_set_context.  Modify vp10_set_context
+// after testing is complete, then delete this version.
+//
+// Helper for one direction (above or left): writes |count| contexts at |ctx|.
+// |to_edge| is xd->mb_to_right_edge / mb_to_bottom_edge, |n4| is pd->n4_w /
+// pd->n4_h, |ss| the matching subsampling shift and |off| the context offset.
+// When |has_eob| is set and |to_edge| is negative, entries past the limit
+// derived from |to_edge| are written as 0 instead of |has_eob|.
+static void dec_set_context_run(ENTROPY_CONTEXT *ctx, int count, int has_eob,
+                                int to_edge, int n4, int ss, int off) {
+  if (!has_eob || to_edge >= 0) {
+    memset(ctx, has_eob, sizeof(ENTROPY_CONTEXT) * count);
+  } else {
+    const int limit = n4 + (to_edge >> (5 + ss));
+    // Number of leading entries that still receive has_eob.
+    int kept = count;
+    int i;
+    if (kept + off > limit)
+      kept = limit - off;
+
+    for (i = 0; i < kept; ++i)
+      ctx[i] = has_eob;
+    for (i = kept; i < count; ++i)
+      ctx[i] = 0;
+  }
+}
+
+// Updates the above and left entropy contexts of |pd| for a transform block of
+// size |tx_size| at context offsets (aoff, loff); |has_eob| is the new value.
+static
+void dec_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+                      TX_SIZE tx_size, int has_eob,
+                      int aoff, int loff) {
+  // The transform spans this many context entries in each direction.
+  const int tx_size_in_blocks = 1 << tx_size;
+
+  // above
+  dec_set_context_run(pd->above_context + aoff, tx_size_in_blocks, has_eob,
+                      xd->mb_to_right_edge, pd->n4_w, pd->subsampling_x, aoff);
+
+  // left
+  dec_set_context_run(pd->left_context + loff, tx_size_in_blocks, has_eob,
+                      xd->mb_to_bottom_edge, pd->n4_h, pd->subsampling_y, loff);
+}
+
+// Decodes the coefficient tokens of one transform block of |plane| located at
+// context offsets (x, y), then refreshes the above/left entropy contexts.
+// Returns decode_coefs()'s result unchanged.
+int vp10_decode_block_tokens(MACROBLOCKD *xd, int plane, const scan_order *sc,
+                            int x, int y, TX_SIZE tx_size, vpx_reader *r,
+                            int seg_id) {
+  struct macroblockd_plane *const plane_data = &xd->plane[plane];
+  const int entropy_ctx = get_entropy_context(
+      tx_size, plane_data->above_context + x, plane_data->left_context + y);
+  const int num_coefs = decode_coefs(
+      xd, plane_data->plane_type, plane_data->dqcoeff, tx_size,
+      plane_data->seg_dequant[seg_id], entropy_ctx, sc->scan, sc->neighbors, r);
+  dec_set_contexts(xd, plane_data, tx_size, num_coefs > 0, x, y);
+  return num_coefs;
+}
+
+
diff --git a/libs/libvpx/vp10/decoder/detokenize.h b/libs/libvpx/vp10/decoder/detokenize.h
new file mode 100644
index 0000000000..c3fd90a728
--- /dev/null
+++ b/libs/libvpx/vp10/decoder/detokenize.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_DECODER_DETOKENIZE_H_
+#define VP10_DECODER_DETOKENIZE_H_
+
+#include "vpx_dsp/bitreader.h"
+#include "vp10/decoder/decoder.h"
+#include "vp10/common/scan.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int vp10_decode_block_tokens(MACROBLOCKD *xd,
+                            int plane, const scan_order *sc,
+                            int x, int y,
+                            TX_SIZE tx_size, vpx_reader *r,
+                            int seg_id);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_DECODER_DETOKENIZE_H_
diff --git a/libs/libvpx/vp10/decoder/dsubexp.c b/libs/libvpx/vp10/decoder/dsubexp.c
new file mode 100644
index 0000000000..36c1917bc2
--- /dev/null
+++ b/libs/libvpx/vp10/decoder/dsubexp.c
@@ -0,0 +1,79 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/entropy.h"
+
+#include "vp10/decoder/dsubexp.h"
+
+// Undoes the recentering of v around m: values above 2 * m are returned
+// unchanged; otherwise odd v map below m and even v map to m + v / 2.
+static int inv_recenter_nonneg(int v, int m) {
+  if (v <= 2 * m) return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1);
+  return v;
+}
+
+// Reads a literal of width 7; once it reaches m one further bit extends it.
+static int decode_uniform(vpx_reader *r) {
+  const int m = (1 << 8) - 191 + CONFIG_MISC_FIXES;
+  const int v = vpx_read_literal(r, 8 - 1);
+  return (v >= m) ? (v << 1) - m + vpx_read_bit(r) : v;
+}
+
+// Applies the decoded delta index v to probability m and returns the updated
+// value: v is remapped through inv_map_table, then recentered around m - 1.
+static int inv_remap_prob(int v, int m) {
+  static const uint8_t inv_map_table[MAX_PROB - CONFIG_MISC_FIXES] = {
+      7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176, 189,
+    202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,  10,  11,
+     12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,
+     28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
+     44,  45,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  60,
+     61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  73,  74,  75,  76,
+     77,  78,  79,  80,  81,  82,  83,  84,  86,  87,  88,  89,  90,  91,  92,
+     93,  94,  95,  96,  97,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
+    109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125,
+    126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141,
+    142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157,
+    158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173,
+    174, 175, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190,
+    191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
+    207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222,
+    223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
+    239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,
+#if !CONFIG_MISC_FIXES
+    253
+#endif
+  };
+  assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0])));
+  v = inv_map_table[v];
+  m--;  // Work on the zero-based probability from here on.
+  if ((m << 1) <= MAX_PROB)
+    return 1 + inv_recenter_nonneg(v, m);
+  return MAX_PROB - inv_recenter_nonneg(v, MAX_PROB - 1 - m);
+}
+
+// Terminated sub-exponential code. Each leading 1 bit skips to the next range:
+// a literal of width 4, a literal of width 4 plus 16, a literal of width 5
+// plus 32, and after three 1 bits the uniform code plus 64.
+static int decode_term_subexp(vpx_reader *r) {
+  if (!vpx_read_bit(r)) return vpx_read_literal(r, 4);
+  if (!vpx_read_bit(r)) return 16 + vpx_read_literal(r, 4);
+  if (!vpx_read_bit(r)) return 32 + vpx_read_literal(r, 5);
+  return 64 + decode_uniform(r);
+}
+// If the update flag is set, decodes a delta index and remaps *p in place.
+void vp10_diff_update_prob(vpx_reader *r, vpx_prob* p) {
+  int delta_index;
+  if (!vpx_read(r, DIFF_UPDATE_PROB)) return;
+  delta_index = decode_term_subexp(r);
+  *p = (vpx_prob)inv_remap_prob(delta_index, *p);
+}
diff --git a/libs/libvpx/vp10/decoder/dsubexp.h b/libs/libvpx/vp10/decoder/dsubexp.h
new file mode 100644
index 0000000000..1a7ed99104
--- /dev/null
+++ b/libs/libvpx/vp10/decoder/dsubexp.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_DECODER_DSUBEXP_H_
+#define VP10_DECODER_DSUBEXP_H_
+
+#include "vpx_dsp/bitreader.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_diff_update_prob(vpx_reader *r, vpx_prob* p);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_DECODER_DSUBEXP_H_
diff --git a/libs/libvpx/vp10/decoder/dthread.c b/libs/libvpx/vp10/decoder/dthread.c
new file mode 100644
index 0000000000..4206adcb61
--- /dev/null
+++ b/libs/libvpx/vp10/decoder/dthread.c
@@ -0,0 +1,189 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/decoder/dthread.h"
+#include "vp10/decoder/decoder.h"
+
+// #define DEBUG_THREAD
+
+// TODO(hkuang): Clean up all the #ifdef in this file.
+// Locks the stats mutex of |worker|'s data (no-op without CONFIG_MULTITHREAD).
+void vp10_frameworker_lock_stats(VPxWorker *const worker) {
+#if !CONFIG_MULTITHREAD
+  (void)worker;
+#else
+  pthread_mutex_lock(&((FrameWorkerData *)worker->data1)->stats_mutex);
+#endif
+}
+
+// Unlocks the stats mutex of |worker|'s data (no-op without CONFIG_MULTITHREAD).
+void vp10_frameworker_unlock_stats(VPxWorker *const worker) {
+#if !CONFIG_MULTITHREAD
+  (void)worker;
+#else
+  pthread_mutex_unlock(&((FrameWorkerData *)worker->data1)->stats_mutex);
+#endif
+}
+
+// Wakes threads waiting on |worker|'s stats_cond: a broadcast, or a single
+// signal in the _WIN32 build without pthread.h. No-op without multithreading.
+void vp10_frameworker_signal_stats(VPxWorker *const worker) {
+#if !CONFIG_MULTITHREAD
+  (void)worker;
+#else
+  FrameWorkerData *const data = (FrameWorkerData *)worker->data1;
+// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper.
+#if !defined(_WIN32) || HAVE_PTHREAD_H
+  pthread_cond_broadcast(&data->stats_cond);
+#else
+  pthread_cond_signal(&data->stats_cond);
+#endif
+#endif  // !CONFIG_MULTITHREAD
+}
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define BUILDING_WITH_TSAN
+#endif
+#endif
+
+// Blocks until ref_buf is decoded to |row|, is no longer its owner's current
+// buffer, or is marked corrupted (then an error is raised on |worker|'s pbi).
+// TODO(hkuang): worker is only needed for debug output and error reporting.
+void vp10_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf,
+                          int row) {
+#if CONFIG_MULTITHREAD
+  if (!ref_buf) return;
+#ifndef BUILDING_WITH_TSAN
+  // The following line of code will get harmless tsan error but it is the key
+  // to get best performance.
+  if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return;
+#endif
+  {
+    // Find the worker thread that owns the reference frame. If the reference
+    // frame has been fully decoded, it may not have owner.
+    // NOTE(review): ref_worker is dereferenced unconditionally below; confirm
+    // frame_worker_owner cannot be NULL when this point is reached.
+    VPxWorker *const ref_worker = ref_buf->frame_worker_owner;
+    FrameWorkerData *const ref_worker_data =
+        (FrameWorkerData *)ref_worker->data1;
+    const VP10Decoder *const pbi = ref_worker_data->pbi;
+    int corrupted;
+#ifdef DEBUG_THREAD
+    {
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      printf("%d %p worker is waiting for %d %p worker (%d)  ref %d \r\n",
+             worker_data->worker_id, worker, ref_worker_data->worker_id,
+             ref_buf->frame_worker_owner, row, ref_buf->row);
+    }
+#endif
+    vp10_frameworker_lock_stats(ref_worker);
+    while (ref_buf->row < row && pbi->cur_buf == ref_buf &&
+           ref_buf->buf.corrupted != 1) {
+      pthread_cond_wait(&ref_worker_data->stats_cond,
+                        &ref_worker_data->stats_mutex);
+    }
+    // Read the flag under the lock, then unlock exactly once: should
+    // vpx_internal_error() return, a second unlock would hit a free mutex.
+    corrupted = (ref_buf->buf.corrupted == 1);
+    vp10_frameworker_unlock_stats(ref_worker);
+    if (corrupted) {
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      vpx_internal_error(&worker_data->pbi->common.error,
+                         VPX_CODEC_CORRUPT_FRAME,
+                         "Worker %p failed to decode frame", worker);
+    }
+  }
+#else
+  (void)worker;
+  (void)ref_buf;
+  (void)row;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Publishes decode progress: stores |row| in buf->row under the owning
+// worker's stats lock and signals waiters. No-op without CONFIG_MULTITHREAD.
+void vp10_frameworker_broadcast(RefCntBuffer *const buf, int row) {
+#if !CONFIG_MULTITHREAD
+  (void)buf;
+  (void)row;
+#else
+  VPxWorker *const owner = buf->frame_worker_owner;
+#ifdef DEBUG_THREAD
+  {
+    FrameWorkerData *const owner_data = (FrameWorkerData *)owner->data1;
+    printf("%d %p worker decode to (%d) \r\n", owner_data->worker_id,
+           buf->frame_worker_owner, row);
+  }
+#endif
+  // The row is updated while the lock is held, then waiters are woken.
+  vp10_frameworker_lock_stats(owner);
+  buf->row = row;
+  vp10_frameworker_signal_stats(owner);
+  vp10_frameworker_unlock_stats(owner);
+#endif  // !CONFIG_MULTITHREAD
+}
+// Seeds dst_worker's decoder with src_worker's frame state (multithread only).
+void vp10_frameworker_copy_context(VPxWorker *const dst_worker,
+                                  VPxWorker *const src_worker) {
+#if CONFIG_MULTITHREAD
+  FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
+  FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
+  VP10_COMMON *const src_cm = &src_worker_data->pbi->common;
+  VP10_COMMON *const dst_cm = &dst_worker_data->pbi->common;
+  int i;
+
+  // Wait until source frame's context is ready.
+  vp10_frameworker_lock_stats(src_worker);
+  while (!src_worker_data->frame_context_ready) {
+    pthread_cond_wait(&src_worker_data->stats_cond,
+        &src_worker_data->stats_mutex);
+  }
+  // The segmentation map and resync flag are read while the lock is held.
+  dst_cm->last_frame_seg_map = src_cm->seg.enabled ?
+      src_cm->current_frame_seg_map : src_cm->last_frame_seg_map;
+  dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
+  vp10_frameworker_unlock_stats(src_worker);
+  // Everything below is copied after the stats lock has been released.
+  dst_cm->bit_depth = src_cm->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+  dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
+#endif
+  dst_cm->prev_frame = src_cm->show_existing_frame ?
+                       src_cm->prev_frame : src_cm->cur_frame;
+  dst_cm->last_width = !src_cm->show_existing_frame ?
+                       src_cm->width : src_cm->last_width;
+  dst_cm->last_height = !src_cm->show_existing_frame ?
+                        src_cm->height : src_cm->last_height;
+  dst_cm->subsampling_x = src_cm->subsampling_x;
+  dst_cm->subsampling_y = src_cm->subsampling_y;
+  dst_cm->frame_type = src_cm->frame_type;
+  dst_cm->last_show_frame = !src_cm->show_existing_frame ?
+                            src_cm->show_frame : src_cm->last_show_frame;
+  for (i = 0; i < REF_FRAMES; ++i)
+    dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
+  // Loop-filter state, segmentation parameters and all frame contexts.
+  memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
+         (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
+  dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
+  dst_cm->lf.filter_level = src_cm->lf.filter_level;
+  memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_FRAMES);
+  memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+  dst_cm->seg = src_cm->seg;
+  memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
+         FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0]));
+#else
+  (void) dst_worker;
+  (void) src_worker;
+#endif  // CONFIG_MULTITHREAD
+}
diff --git a/libs/libvpx/vp10/decoder/dthread.h b/libs/libvpx/vp10/decoder/dthread.h
new file mode 100644
index 0000000000..1b0dc0191d
--- /dev/null
+++ b/libs/libvpx/vp10/decoder/dthread.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_DECODER_DTHREAD_H_
+#define VP10_DECODER_DTHREAD_H_
+
+#include "./vpx_config.h"
+#include "vpx_util/vpx_thread.h"
+#include "vpx/internal/vpx_codec_internal.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP10Common;
+struct VP10Decoder;
+
+// WorkerData for the FrameWorker thread. It contains all the information of
+// the worker and decode structures for decoding a frame.
+typedef struct FrameWorkerData {
+  struct VP10Decoder *pbi;  // Decoder instance attached to this worker.
+  const uint8_t *data;
+  const uint8_t *data_end;
+  size_t data_size;
+  void *user_priv;
+  int result;
+  int worker_id;  // Printed in the DEBUG_THREAD traces.
+  int received_frame;
+
+  // scratch_buffer is used in frame parallel mode only.
+  // It is used to make a copy of the compressed data.
+  uint8_t *scratch_buffer;
+  size_t scratch_buffer_size;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t stats_mutex;  // Taken by vp10_frameworker_lock_stats().
+  pthread_cond_t stats_cond;    // Signalled by vp10_frameworker_signal_stats().
+#endif
+
+  int frame_context_ready;  // Current frame's context is ready to read.
+  int frame_decoded;        // Finished decoding current frame.
+} FrameWorkerData;
+
+void vp10_frameworker_lock_stats(VPxWorker *const worker);
+void vp10_frameworker_unlock_stats(VPxWorker *const worker);
+void vp10_frameworker_signal_stats(VPxWorker *const worker);
+
+// Wait until ref_buf has been decoded to row in real pixel unit.
+// Note: worker may already finish decoding ref_buf and release it in order to
+// start decoding next frame. So need to check whether worker is still decoding
+// ref_buf.
+void vp10_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf,
+                          int row);
+
+// FrameWorker broadcasts its decoding progress so other workers that are
+// waiting on it can resume decoding.
+void vp10_frameworker_broadcast(RefCntBuffer *const buf, int row);
+
+// Copy necessary decoding context from src worker to dst worker.
+void vp10_frameworker_copy_context(VPxWorker *const dst_worker,
+                                  VPxWorker *const src_worker);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // VP10_DECODER_DTHREAD_H_
diff --git a/libs/libvpx/vp10/encoder/aq_complexity.c b/libs/libvpx/vp10/encoder/aq_complexity.c
new file mode 100644
index 0000000000..2506a4e552
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/aq_complexity.c
@@ -0,0 +1,165 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp10/encoder/aq_complexity.h"
+#include "vp10/encoder/aq_variance.h"
+#include "vp10/encoder/encodeframe.h"
+#include "vp10/common/seg_common.h"
+#include "vp10/encoder/segmentation.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/system_state.h"
+
+#define AQ_C_SEGMENTS  5    // Segments per strength (table columns)
+#define DEFAULT_AQ2_SEG 3   // Neutral Q segment
+#define AQ_C_STRENGTHS 3    // Strength levels (table rows)
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+  { {1.75, 1.25, 1.05, 1.00, 0.90},
+    {2.00, 1.50, 1.15, 1.00, 0.85},
+    {2.50, 1.75, 1.25, 1.00, 0.80} };
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+  { {0.15, 0.30, 0.55, 2.00, 100.0},
+    {0.20, 0.40, 0.65, 2.00, 100.0},
+    {0.25, 0.50, 0.75, 2.00, 100.0} };
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+  { {-4.0, -3.0, -2.0, 100.00, 100.0},
+    {-3.5, -2.5, -1.5, 100.00, 100.0},
+    {-3.0, -2.0, -1.0, 100.00, 100.0} };
+
+#define DEFAULT_COMPLEXITY 64  // Not referenced elsewhere in this file.
+
+
+static int get_aq_c_strength(int q_index, vpx_bit_depth_t bit_depth) {
+  // Approximate base quantizer (truncated to int) picks strength 0, 1 or 2.
+  const int approx_q = vp10_ac_quant(q_index, 0, bit_depth) / 4;
+  return (approx_q > 25) ? 2 : ((approx_q > 10) ? 1 : 0);
+}
+
+// Sets up the complexity-AQ segments: on selected frames, resets the segment
+// map to the neutral segment and gives every other segment its own Q delta.
+void vp10_setup_in_frame_q_adj(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
+  int aq_strength;
+  int seg_id;
+
+  // Floating point is used below; make SURE that is safe.
+  vpx_clear_system_state();
+
+  // Nothing to do unless the frame is intra-only, error resilient mode is
+  // on, the alt-ref frame is being refreshed, or the golden frame is being
+  // refreshed while rc.is_src_frame_alt_ref is not set.
+  if (!frame_is_intra_only(cm) && !cm->error_resilient_mode &&
+      !cpi->refresh_alt_ref_frame &&
+      !(cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))
+    return;
+
+  aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+
+  // Start from a map in which every entry is the neutral segment.
+  memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols);
+  vp10_clearall_segfeatures(seg);
+
+  // Segmentation only makes sense if the target bits per SB is above a
+  // threshold. Below this the overheads will usually outweigh any benefit.
+  if (cpi->rc.sb64_target_rate < 256) {
+    vp10_disable_segmentation(seg);
+    return;
+  }
+
+  vp10_enable_segmentation(seg);
+  seg->abs_delta = SEGMENT_DELTADATA;  // Select delta coding method.
+
+  // The neutral segment keeps its "Q" feature off, so it uses the baseline Q.
+  vp10_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
+
+  // Use the remaining segments for in frame Q adjustment.
+  for (seg_id = 0; seg_id < AQ_C_SEGMENTS; ++seg_id) {
+    int delta;
+    if (seg_id != DEFAULT_AQ2_SEG) {
+      delta = vp10_compute_qdelta_by_rate(
+          &cpi->rc, cm->frame_type, cm->base_qindex,
+          aq_c_q_adj_factor[aq_strength][seg_id], cm->bit_depth);
+
+      // For AQ complexity mode, we don't allow Q0 in a segment if the base
+      // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a
+      // segment Q delta is sometimes applied without going back around the
+      // rd loop, which could give an illegal partition size / q combination.
+      if (cm->base_qindex != 0 && cm->base_qindex + delta == 0)
+        delta = -cm->base_qindex + 1;
+
+      if (cm->base_qindex + delta > 0) {
+        vp10_enable_segfeature(seg, seg_id, SEG_LVL_ALT_Q);
+        vp10_set_segdata(seg, seg_id, SEG_LVL_ALT_Q, delta);
+      }
+    }
+  }
+}
+
+#define DEFAULT_LV_THRESH 10.0      // Low-variance threshold outside pass 2.
+#define MIN_DEFAULT_LV_THRESH 8.0   // Lower bound on the pass-2 threshold.
+#define VAR_STRENGTH_STEP 0.25      // Not referenced in this file.
+// Select a segment for the current block.
+// The choice of segment for a block depends on the ratio of the projected
+// bits for the block vs a target average and its spatial complexity.
+// cpi:            encoder instance; its segmentation_map is written.
+// mb:             macroblock context, set up on cpi->Source at the block.
+// bs:             size of the block being classified.
+// mi_row, mi_col: block position, in the units of cm->mi_rows/mi_cols.
+// projected_rate: projected bits for the block, compared against a target
+//                 expressed in bits * 256.
+void vp10_caq_select_segment(VP10_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+                            int mi_row, int mi_col, int projected_rate) {
+  VP10_COMMON *const cm = &cpi->common;
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+  const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+  const int xmis = VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
+  const int ymis = VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
+  // Rate depends on fraction of a SB64 in frame (xmis * ymis / (bw * bh)).
+  // It is converted to bits * 256 units. The product is formed in 64 bits
+  // so that a large sb64_target_rate cannot overflow int arithmetic.
+  const int64_t scaled_rate =
+      (int64_t)cpi->rc.sb64_target_rate * xmis * ymis * 256 / (bw * bh);
+  const int target_rate = (int)VPXMIN(scaled_rate, INT_MAX);
+  const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+  double logvar;
+  double low_var_thresh;
+  int x, y, i;
+  // Stays at the last segment if no threshold pair in the loop below matches.
+  unsigned char segment = AQ_C_SEGMENTS - 1;
+
+  vpx_clear_system_state();
+  low_var_thresh = (cpi->oxcf.pass == 2)
+    ? VPXMAX(cpi->twopass.mb_av_energy, MIN_DEFAULT_LV_THRESH)
+    : DEFAULT_LV_THRESH;
+
+  vp10_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
+  logvar = vp10_log_block_var(cpi, mb, bs);
+
+  for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+    // Test rate against a threshold value and variance against a threshold.
+    // Increasing segment number (higher variance and complexity) = higher Q.
+    if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) &&
+        (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+      segment = i;
+      break;
+    }
+  }
+
+  // Fill in the entries in the segment map corresponding to this block.
+  for (y = 0; y < ymis; y++) {
+    for (x = 0; x < xmis; x++) {
+      cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
+    }
+  }
+}
diff --git a/libs/libvpx/vp10/encoder/aq_complexity.h b/libs/libvpx/vp10/encoder/aq_complexity.h
new file mode 100644
index 0000000000..f9de2ada3e
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/aq_complexity.h
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_ENCODER_AQ_COMPLEXITY_H_
+#define VP10_ENCODER_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp10/common/enums.h"
+
+struct VP10_COMP;
+struct macroblock;
+
+// Select a segment for the current Block.
+void vp10_caq_select_segment(struct VP10_COMP *cpi, struct macroblock *,
+                            BLOCK_SIZE bs,
+                            int mi_row, int mi_col, int projected_rate);
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+void vp10_setup_in_frame_q_adj(struct VP10_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_AQ_COMPLEXITY_H_
diff --git a/libs/libvpx/vp10/encoder/aq_cyclicrefresh.c b/libs/libvpx/vp10/encoder/aq_cyclicrefresh.c
new file mode 100644
index 0000000000..660670ccea
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/aq_cyclicrefresh.c
@@ -0,0 +1,567 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp10/common/seg_common.h"
+#include "vp10/encoder/aq_cyclicrefresh.h"
+#include "vp10/encoder/ratectrl.h"
+#include "vp10/encoder/segmentation.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/system_state.h"
+
+struct CYCLIC_REFRESH {
+  // Percentage of blocks per frame that are targeted as candidates
+  // for cyclic refresh.
+  int percent_refresh;
+  // Maximum q-delta as percentage of base q.
+  int max_qdelta_perc;
+  // Superblock starting index for cycling through the frame.
+  int sb_index;
+  // Controls how long a block will need to wait to be refreshed again, in
+  // excess of the cycle time, i.e., in the case of all zero motion, a block
+  // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+  int time_for_refresh;
+  // Target number of (8x8) blocks that are set for delta-q.
+  int target_num_seg_blocks;
+  // Actual number of (8x8) blocks that delta-q was applied to, per segment.
+  int actual_num_seg1_blocks;
+  int actual_num_seg2_blocks;
+  // RD mult. parameters for segment 1.
+  int rdmult;
+  // Cyclic refresh map.
+  signed char *map;
+  // Map of the last q a block was coded at.
+  uint8_t *last_coded_q_map;
+  // Thresholds applied to the projected rate/distortion of the coding block,
+  // when deciding whether block should be refreshed.
+  int64_t thresh_rate_sb;
+  int64_t thresh_dist_sb;
+  // Threshold applied to the motion vector (in units of 1/8 pel) of the
+  // coding block, when deciding whether block should be refreshed.
+  int16_t motion_thresh;
+  // Rate target ratio to set q delta.
+  double rate_ratio_qdelta;
+  // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+  int rate_boost_fac;
+  double low_content_avg;  // Running average of the low-content fraction.
+  int qindex_delta[3];     // Q index delta, indexed by CR segment id.
+};
+
+// Allocates a CYCLIC_REFRESH and its two per-block maps; NULL on failure.
+CYCLIC_REFRESH *vp10_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+  CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
+  const size_t num_blocks = (size_t)mi_rows * mi_cols;  // size_t product.
+  const size_t q_map_size = num_blocks * sizeof(*cr->last_coded_q_map);
+  if (cr == NULL) return NULL;
+  cr->map = vpx_calloc(num_blocks, sizeof(*cr->map));
+  if (cr->map == NULL) {
+    vpx_free(cr);
+    return NULL;
+  }
+  cr->last_coded_q_map = vpx_malloc(q_map_size);
+  if (cr->last_coded_q_map == NULL) {
+    vpx_free(cr->map);  // Do not leak the refresh map on this path.
+    vpx_free(cr);
+    return NULL;
+  }
+  assert(MAXQ <= 255);  // MAXQ is stored in one-byte map entries.
+  memset(cr->last_coded_q_map, MAXQ, q_map_size);
+  return cr;
+}
+
+void vp10_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
+  if (cr == NULL) return;  // Tolerate a failed or absent allocation.
+  vpx_free(cr->map);
+  vpx_free(cr->last_coded_q_map);
+  vpx_free(cr);
+}
+
+// Decide from the bitrate whether cyclic refresh should stay enabled.
+// Returns 1 to apply cyclic refresh, 0 to turn it off.
+static int apply_cyclic_refresh_bitrate(const VP10_COMMON *cm,
+                                        const RATE_CONTROL *rc) {
+  // Number of (8x8) blocks in frame = mi_rows * mi_cols.
+  const int num_8x8_blocks = cm->mi_rows * cm->mi_cols;
+  // Segment map bit cost should scale with the number of seg blocks, so the
+  // average bits available per frame (avg_frame_bandwidth) are compared with
+  // a fraction of the block count. This turns refresh off at target bitrates
+  // of (at 30fps) ~12kbps for CIF, 36kbps for VGA, 100kbps for HD/720p.
+  const float min_bits_per_block = 0.25;
+  const int bitrate_too_low =
+      rc->avg_frame_bandwidth < min_bits_per_block * num_8x8_blocks;
+
+  // Also turn off at very small frame sizes, to avoid refreshing too large a
+  // fraction of superblocks per frame. The threshold is less than QCIF.
+  const int frame_too_small = num_8x8_blocks / 64 < 5;
+
+  return !(bitrate_too_low || frame_too_small);
+}
+
+// Classify a coding block of size bsize for refresh (lower-qp coding).
+// The decision uses the block size, the coding mode and motion vector, and
+// the projected rate/distortion. Returns the CR_SEGMENT_ID_* value chosen
+// for the block.
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
+                                const MB_MODE_INFO *mbmi,
+                                int64_t rate,
+                                int64_t dist,
+                                int bsize) {
+  const MV mv = mbmi->mv[0].as_mv;
+  const int is_inter = is_inter_block(mbmi);
+  const int large_mv =
+      mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+      mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh;
+
+  // Reject the block for lower-qp coding (base segment) if the projected
+  // distortion is above the threshold and the mode either uses a large mv
+  // or is an intra-mode.
+  if (dist > cr->thresh_dist_sb && (large_mv || !is_inter))
+    return CR_SEGMENT_ID_BASE;
+
+  // Bigger inter blocks with zero motion and a rate under the threshold get
+  // the more aggressive delta-q, provided rate_boost_fac exceeds 10.
+  if (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb && is_inter &&
+      mbmi->mv[0].as_int == 0 && cr->rate_boost_fac > 10)
+    return CR_SEGMENT_ID_BOOST2;
+
+  // Otherwise accept the block for the regular refresh segment.
+  return CR_SEGMENT_ID_BOOST1;
+}
+
+// Compute the delta-q for a segment from the rate factor, limiting any
+// decrease in q to max_qdelta_perc percent of q.
+static int compute_deltaq(const VP10_COMP *cpi, int q, double rate_factor) {
+  const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const int max_drop = cr->max_qdelta_perc * q / 100;
+  const int rate_deltaq = vp10_compute_qdelta_by_rate(
+      &cpi->rc, cpi->common.frame_type, q, rate_factor,
+      cpi->common.bit_depth);
+  // A drop larger than the cap is replaced by the cap itself.
+  return (-rate_deltaq > max_drop) ? -cr->max_qdelta_perc * q / 100
+                                   : rate_deltaq;
+}
+
+// For the just encoded frame, estimate the bits, incorporating the delta-q
+// of the boosted segments through a segment weighted average. Note this
+// function is called in the postencode (called from
+// rc_update_rate_correction_factors()).
+int vp10_cyclic_refresh_estimate_bits_at_q(const VP10_COMP *cpi,
+                                          double correction_factor) {
+  const VP10_COMMON *const cm = &cpi->common;
+  const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const int mbs = cm->MBs;
+  const int num_8x8 = mbs << 2;
+  // Weight for non-base segments: use actual number of blocks refreshed in
+  // previous/just encoded frame. Note number of blocks here is in 8x8 units.
+  const double w_boost1 = (double)cr->actual_num_seg1_blocks / num_8x8;
+  const double w_boost2 = (double)cr->actual_num_seg2_blocks / num_8x8;
+  const double w_base = 1.0 - w_boost1 - w_boost2;
+  // Take segment weighted average for estimated bits.
+  return (int)(
+      w_base *
+          vp10_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
+                                 correction_factor, cm->bit_depth) +
+      w_boost1 *
+          vp10_estimate_bits_at_q(cm->frame_type,
+                                 cm->base_qindex + cr->qindex_delta[1], mbs,
+                                 correction_factor, cm->bit_depth) +
+      w_boost2 *
+          vp10_estimate_bits_at_q(cm->frame_type,
+                                 cm->base_qindex + cr->qindex_delta[2], mbs,
+                                 correction_factor, cm->bit_depth));
+}
+
+// Prior to encoding the frame, estimate the bits per mb for a given q = i and
+// its corresponding delta-q (for segment 1). This function is called in
+// rc_regulate_q() to set the base qp index.
+// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or
+// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding.
+int vp10_cyclic_refresh_rc_bits_per_mb(const VP10_COMP *cpi, int i,
+                                      double correction_factor) {
+  const VP10_COMMON *const cm = &cpi->common;
+  const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const int num_8x8 = cm->MBs << 2;
+  // Weight for segment prior to encoding: half the sum of the target number
+  // for the frame to be encoded and the actual counts from the previous frame.
+  const int boosted_blocks = (cr->target_num_seg_blocks +
+      cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) >> 1;
+  const double boosted_weight = (double)boosted_blocks / num_8x8;
+
+  // q index after applying the delta-q that corresponds to qindex i.
+  const int boosted_q = i + compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+
+  // Take segment weighted average for bits per mb.
+  return (int)(
+      (1.0 - boosted_weight) * vp10_rc_bits_per_mb(
+          cm->frame_type, i, correction_factor, cm->bit_depth) +
+      boosted_weight * vp10_rc_bits_per_mb(
+          cm->frame_type, boosted_q, correction_factor, cm->bit_depth));
+}
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void vp10_cyclic_refresh_update_segment(VP10_COMP *const cpi,
+                                       MB_MODE_INFO *const mbmi,
+                                       int mi_row, int mi_col,
+                                       BLOCK_SIZE bsize,
+                                       int64_t rate,
+                                       int64_t dist,
+                                       int skip) {
+  const VP10_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+  const int block_index = mi_row * cm->mi_cols + mi_col;
+  const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist,
+                                                      bsize);
+  // Default is to not update the refresh map.
+  int new_map_value = cr->map[block_index];
+  int x = 0; int y = 0;
+
+  // If this block is labeled for refresh, check if we should reset the
+  // segment_id.
+  if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+    mbmi->segment_id = refresh_this_block;
+    // Reset segment_id to base if the block will be skipped.
+    if (skip)
+      mbmi->segment_id = CR_SEGMENT_ID_BASE;
+  }
+
+  // Update the cyclic refresh map, to be used for setting segmentation map
+  // for the next frame. If the block will be refreshed this frame, mark it
+  // as clean. The magnitude of the -ve influences how long before we consider
+  // it for refresh again.
+  if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+    new_map_value = -cr->time_for_refresh;
+  } else if (refresh_this_block) {
+    // Else if it is accepted as a candidate for refresh and is currently
+    // marked as a non-candidate (1), mark it as a candidate for cleanup at a
+    // future time (0); otherwise leave its map value unchanged.
+    if (cr->map[block_index] == 1)
+      new_map_value = 0;
+  } else {
+    // Mark it as a block that is not a candidate for refresh.
+    new_map_value = 1;
+  }
+
+  // Update entries in the cyclic refresh map with new_map_value, and
+  // copy mbmi->segment_id into global segmentation map.
+  for (y = 0; y < ymis; y++)
+    for (x = 0; x < xmis; x++) {
+      int map_offset = block_index + y * cm->mi_cols + x;
+      cr->map[map_offset] = new_map_value;
+      cpi->segmentation_map[map_offset] = mbmi->segment_id;
+      // Inter skip blocks were clearly not coded at the current qindex, so
+      // don't update the map for them. For cases where motion is non-zero or
+      // the reference frame isn't the previous frame, the previous value in
+      // the map for this spatial location is not entirely correct.
+      if (!is_inter_block(mbmi) || !skip)
+        cr->last_coded_q_map[map_offset] = clamp(
+            cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
+    }
+}
+
+// Recount the blocks whose segment id is BOOST1 or BOOST2 in the segment map.
+void vp10_cyclic_refresh_postencode(VP10_COMP *const cpi) {
+  const VP10_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  int boost1_blocks = 0, boost2_blocks = 0;
+  int row, col;
+  for (row = 0; row < cm->mi_rows; ++row) {
+    const unsigned char *const seg_row =
+        cpi->segmentation_map + row * cm->mi_cols;
+    for (col = 0; col < cm->mi_cols; ++col) {
+      const int seg_id = cyclic_refresh_segment_id(seg_row[col]);
+      boost1_blocks += (seg_id == CR_SEGMENT_ID_BOOST1);
+      boost2_blocks += (seg_id == CR_SEGMENT_ID_BOOST2);
+    }
+  }
+  cr->actual_num_seg1_blocks = boost1_blocks;
+  cr->actual_num_seg2_blocks = boost2_blocks;
+}
+
+// Set golden frame update interval, for 1 pass CBR mode.
+void vp10_cyclic_refresh_set_golden_update(VP10_COMP *const cpi) {
+  const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  // The refresh period is (100 / percent_refresh) frames; the minimum
+  // gf_interval for GF update is 4 such periods, or 40 when percent_refresh
+  // is not positive. Depending on past encoding stats, GF flag may be reset
+  // and update may not occur until next baseline_gf_interval.
+  const int refresh_period_multiple = 4;
+  cpi->rc.baseline_gf_interval = (cr->percent_refresh > 0)
+      ? refresh_period_multiple * (100 / cr->percent_refresh)
+      : 40;
+}
+
+// Update some encoding stats (from the just encoded frame). If this frame's
+// background has high motion, refresh the golden frame. Otherwise, if the
+// golden reference is to be updated check if we should NOT update the golden
+// ref.
+void vp10_cyclic_refresh_check_golden_update(VP10_COMP *const cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  int mi_row, mi_col;
+  double fraction_low = 0.0;
+  int low_content_frame = 0;
+
+  MODE_INFO **mi = cm->mi_grid_visible;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int rows = cm->mi_rows, cols = cm->mi_cols;
+  int cnt1 = 0, cnt2 = 0;
+  int force_gf_refresh = 0;
+
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      int16_t abs_mvr = mi[0]->mbmi.mv[0].as_mv.row >= 0 ?
+          mi[0]->mbmi.mv[0].as_mv.row : -1 * mi[0]->mbmi.mv[0].as_mv.row;
+      int16_t abs_mvc = mi[0]->mbmi.mv[0].as_mv.col >= 0 ?
+          mi[0]->mbmi.mv[0].as_mv.col : -1 * mi[0]->mbmi.mv[0].as_mv.col;
+
+      // Calculate the motion of the background.
+      if (abs_mvr <= 16 && abs_mvc <= 16) {
+        cnt1++;
+        if (abs_mvr == 0 && abs_mvc == 0)
+          cnt2++;
+      }
+      mi++;
+
+      // Accumulate low_content_frame.
+      if (cr->map[mi_row * cols + mi_col] < 1)
+        low_content_frame++;
+    }
+    mi += 8;  // NOTE(review): presumably skips per-row grid padding; confirm.
+  }
+
+  // For video conference clips, if the background has high motion in current
+  // frame because of the camera movement, set this frame as the golden frame:
+  // more than 70% of blocks with small motion (cnt1, scaled by 100 to compare
+  // against a percentage) and under 5% of those with zero motion (cnt2). Also
+  // force a golden update if the resolution will change (resize_pending != 0).
+  if (cpi->resize_pending != 0 ||
+     (cnt1 * 100 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
+    vp10_cyclic_refresh_set_golden_update(cpi);
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+    if (rc->frames_till_gf_update_due > rc->frames_to_key)
+      rc->frames_till_gf_update_due = rc->frames_to_key;
+    cpi->refresh_golden_frame = 1;
+    force_gf_refresh = 1;
+  }
+
+  fraction_low =
+      (double)low_content_frame / (rows * cols);
+  // Update average.
+  cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
+  if (!force_gf_refresh && cpi->refresh_golden_frame == 1) {
+    // Don't update golden reference if the amount of low_content for the
+    // current encoded frame is small, or if the recursive average of the
+    // low_content over the update interval window falls below threshold.
+    if (fraction_low < 0.8 || cr->low_content_avg < 0.7)
+      cpi->refresh_golden_frame = 0;
+    // Reset for next interval.
+    cr->low_content_avg = fraction_low;
+  }
+}
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
+static void cyclic_refresh_update_map(VP10_COMP *const cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  unsigned char *const seg_map = cpi->segmentation_map;
+  int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+  int xmis, ymis, x, y;
+  memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
+  sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+  sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+  sbs_in_frame = sb_cols * sb_rows;
+  // Number of target blocks to get the q delta (segment 1).
+  block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+  // Set the segmentation map: cycle through the superblocks, starting at
+  // cr->sb_index, and stopping when either block_count blocks have been found
+  // to be refreshed, or we have passed through whole frame.
+  assert(cr->sb_index < sbs_in_frame);
+  i = cr->sb_index;
+  cr->target_num_seg_blocks = 0;
+  do {
+    int sum_map = 0;
+    // Get the mi_row/mi_col corresponding to superblock index i.
+    int sb_row_index = (i / sb_cols);
+    int sb_col_index = i - sb_row_index * sb_cols;
+    int mi_row = sb_row_index * MI_BLOCK_SIZE;
+    int mi_col = sb_col_index * MI_BLOCK_SIZE;
+    int qindex_thresh =
+        cpi->oxcf.content == VP9E_CONTENT_SCREEN
+            ? vp10_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
+            : 0;
+    assert(mi_row >= 0 && mi_row < cm->mi_rows);
+    assert(mi_col >= 0 && mi_col < cm->mi_cols);
+    bl_index = mi_row * cm->mi_cols + mi_col;
+    // Loop through all 8x8 blocks in superblock and update map.
+    xmis =
+        VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]);
+    ymis =
+        VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]);
+    for (y = 0; y < ymis; y++) {
+      for (x = 0; x < xmis; x++) {
+        const int bl_index2 = bl_index + y * cm->mi_cols + x;
+        // A clean-up candidate (map == 0) last coded above qindex_thresh
+        // counts towards boost/refresh (segment 1); the segment id may get
+        // reset to 0 later if block gets coded anything other than ZEROMV.
+        if (cr->map[bl_index2] == 0) {
+          if (cr->last_coded_q_map[bl_index2] > qindex_thresh)
+            sum_map++;
+        } else if (cr->map[bl_index2] < 0) {
+          cr->map[bl_index2]++;
+        }
+      }
+    }
+    // Enforce constant segment over superblock.
+    // If at least half of its blocks were counted, set them all to BOOST1.
+    if (sum_map >= xmis * ymis / 2) {
+      for (y = 0; y < ymis; y++)
+        for (x = 0; x < xmis; x++) {
+          seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
+        }
+      cr->target_num_seg_blocks += xmis * ymis;
+    }
+    i++;
+    if (i == sbs_in_frame) {
+      i = 0;
+    }
+  } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+  cr->sb_index = i;
+}
+
+// Set cyclic refresh parameters: fixed refresh percentage, q-delta cap and
+// refresh wait time, plus a rate ratio, motion threshold and boost factor
+// chosen from the frame count since the key frame, frame size and bitrate.
+void vp10_cyclic_refresh_update_parameters(VP10_COMP *const cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP10_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  // Low resolutions (at most 352x288) at low bitrates get their own motion
+  // threshold and boost factor.
+  const int low_res_low_rate = cm->width <= 352 && cm->height <= 288 &&
+                               rc->avg_frame_bandwidth < 3400;
+
+  cr->percent_refresh = 10;
+  cr->max_qdelta_perc = 50;
+  cr->time_for_refresh = 0;
+
+  // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
+  // periods of the refresh cycle, after a key frame.
+  cr->rate_ratio_qdelta =
+      (rc->frames_since_key < 4 * cr->percent_refresh) ? 3.0 : 2.0;
+
+  // Adjust some parameters for low resolutions at low bitrates.
+  cr->motion_thresh = low_res_low_rate ? 4 : 32;
+  cr->rate_boost_fac = low_res_low_rate ? 10 : 17;
+}
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void vp10_cyclic_refresh_setup(VP10_COMP *const cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  struct segmentation *const seg = &cm->seg;
+  const int apply_cyclic_refresh  = apply_cyclic_refresh_bitrate(cm, rc);
+  if (cm->current_video_frame == 0)
+    cr->low_content_avg = 0.0;
+  // Don't apply refresh on key frames or when the bitrate check rejects it.
+  if (!apply_cyclic_refresh || cm->frame_type == KEY_FRAME) {
+    // Set segmentation map to 0 and disable.
+    unsigned char *const seg_map = cpi->segmentation_map;
+    memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+    vp10_disable_segmentation(&cm->seg);
+    if (cm->frame_type == KEY_FRAME) {
+      memset(cr->last_coded_q_map, MAXQ,
+             cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
+      cr->sb_index = 0;
+    }
+    return;
+  } else {
+    int qindex_delta = 0;
+    int qindex2;
+    const double q = vp10_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
+    vpx_clear_system_state();
+    // Set rate threshold to a multiple of the target rate: sb64_target_rate
+    // is scaled by 256 (<< 8) and then multiplied by 4 (<< 2).
+    cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
+    // Distortion threshold, quadratic in Q, scale factor to be adjusted.
+    // q will not exceed 457, so (q * q) is within 32bit; see:
+    // vp10_convert_qindex_to_q(), vp10_ac_quant(), ac_qlookup*[].
+    cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
+
+    // Set up segmentation.
+    // Enable it and clear all segment features.
+    vp10_enable_segmentation(&cm->seg);
+    vp10_clearall_segfeatures(seg);
+    // Select delta coding method.
+    seg->abs_delta = SEGMENT_DELTADATA;
+
+    // Note: setting temporal_update has no effect, as the seg-map coding method
+    // (temporal or spatial) is determined in vp10_choose_segmap_coding_method(),
+    // based on the coding cost of each method. For error_resilient mode on the
+    // last_frame_seg_map is set to 0, so if temporal coding is used, it is
+    // relative to 0 previous map.
+    // seg->temporal_update = 0;
+
+    // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+    vp10_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+    // Use segment BOOST1 for in-frame Q adjustment.
+    vp10_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+    // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+    vp10_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
+
+    // Set the q delta for segment BOOST1.
+    qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
+    cr->qindex_delta[1] = qindex_delta;
+
+    // Compute rd-mult for segment BOOST1.
+    qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
+
+    cr->rdmult = vp10_compute_rd_mult(cpi, qindex2);
+
+    vp10_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
+
+    // Set a more aggressive (higher) q delta for segment BOOST2.
+    qindex_delta = compute_deltaq(
+        cpi, cm->base_qindex,
+        VPXMIN(CR_MAX_RATE_TARGET_RATIO,
+               0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
+    cr->qindex_delta[2] = qindex_delta;
+    vp10_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+    // Update the segmentation and refresh map.
+    cyclic_refresh_update_map(cpi);
+  }
+}
+
+int vp10_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+  return cr->rdmult;  // RD multiplier stored for segment BOOST1.
+}
+
+void vp10_cyclic_refresh_reset_resize(VP10_COMP *const cpi) {
+  CYCLIC_REFRESH *const refresh = cpi->cyclic_refresh;
+  // Zero the refresh map, restart at superblock 0, request a golden refresh.
+  memset(refresh->map, 0, cpi->common.mi_rows * cpi->common.mi_cols);
+  refresh->sb_index = 0;
+  cpi->refresh_golden_frame = 1;
+}
diff --git a/libs/libvpx/vp10/encoder/aq_cyclicrefresh.h b/libs/libvpx/vp10/encoder/aq_cyclicrefresh.h
new file mode 100644
index 0000000000..f6714c5c8d
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/aq_cyclicrefresh.h
@@ -0,0 +1,98 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_ENCODER_AQ_CYCLICREFRESH_H_
+#define VP10_ENCODER_AQ_CYCLICREFRESH_H_
+
+#include "vp10/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE    0
+#define CR_SEGMENT_ID_BOOST1  1
+#define CR_SEGMENT_ID_BOOST2  2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+struct VP10_COMP;
+
+struct CYCLIC_REFRESH;
+typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
+
+CYCLIC_REFRESH *vp10_cyclic_refresh_alloc(int mi_rows, int mi_cols);
+
+void vp10_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+
+// Estimate the bits, incorporating the delta-q from segment 1, after encoding
+// the frame.
+int vp10_cyclic_refresh_estimate_bits_at_q(const struct VP10_COMP *cpi,
+                                          double correction_factor);
+
+// Estimate the bits per mb, for a given q = i and a corresponding delta-q
+// (for segment 1), prior to encoding the frame.
+int vp10_cyclic_refresh_rc_bits_per_mb(const struct VP10_COMP *cpi, int i,
+                                      double correction_factor);
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void vp10_cyclic_refresh_update_segment(struct VP10_COMP *const cpi,
+                                       MB_MODE_INFO *const mbmi,
+                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                       int64_t rate, int64_t dist, int skip);
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+void vp10_cyclic_refresh_update__map(struct VP10_COMP *const cpi);
+
+// Update the actual number of blocks to which the segment delta q was applied.
+void vp10_cyclic_refresh_postencode(struct VP10_COMP *const cpi);
+
+// Set golden frame update interval, for 1 pass CBR mode.
+void vp10_cyclic_refresh_set_golden_update(struct VP10_COMP *const cpi);
+
+// Check if we should not update golden reference, based on past refresh stats.
+void vp10_cyclic_refresh_check_golden_update(struct VP10_COMP *const cpi);
+
+// Set/update global/frame level refresh parameters.
+void vp10_cyclic_refresh_update_parameters(struct VP10_COMP *const cpi);
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void vp10_cyclic_refresh_setup(struct VP10_COMP *const cpi);
+
+int vp10_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+void vp10_cyclic_refresh_reset_resize(struct VP10_COMP *const cpi);
+
+// Returns 1 when segment_id is one of the two boosted segments, 0 otherwise.
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+  const int is_boost1 = (segment_id == CR_SEGMENT_ID_BOOST1);
+  const int is_boost2 = (segment_id == CR_SEGMENT_ID_BOOST2);
+  return is_boost1 || is_boost2;
+}
+
+// Maps segment_id onto the cyclic refresh ids: BOOST1 and BOOST2 are returned
+// unchanged, every other value collapses to BASE.
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+  return cyclic_refresh_segment_id_boosted(segment_id) ? segment_id
+                                                       : CR_SEGMENT_ID_BASE;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_AQ_CYCLICREFRESH_H_
diff --git a/libs/libvpx/vp10/encoder/aq_variance.c b/libs/libvpx/vp10/encoder/aq_variance.c
new file mode 100644
index 0000000000..bed5162fb2
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/aq_variance.c
@@ -0,0 +1,206 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vpx_ports/mem.h"
+
+#include "vp10/encoder/aq_variance.h"
+
+#include "vp10/common/seg_common.h"
+#include "vp10/encoder/ratectrl.h"
+#include "vp10/encoder/rd.h"
+#include "vp10/encoder/segmentation.h"
+#include "vpx_ports/system_state.h"
+
+#define ENERGY_MIN (-4)
+#define ENERGY_MAX (1)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN +  1)
+#define ENERGY_IN_BOUNDS(energy)\
+  assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+static const double rate_ratio[MAX_SEGMENTS] =
+  {2.5, 2.0, 1.5, 1.0, 0.75, 1.0, 1.0, 1.0};
+static const int segment_id[ENERGY_SPAN] = {0, 1, 1, 2, 3, 4};
+
+#define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
+
+DECLARE_ALIGNED(16, static const uint8_t, vp10_64_zeros[64]) = {0};
+#if CONFIG_VP9_HIGHBITDEPTH
+DECLARE_ALIGNED(16, static const uint16_t, vp10_highbd_64_zeros[64]) = {0};
+#endif
+
+unsigned int vp10_vaq_segment_id(int energy) {
+  ENERGY_IN_BOUNDS(energy);  // Asserts ENERGY_MIN <= energy <= ENERGY_MAX.
+  return segment_id[energy - ENERGY_MIN];
+}
+
+void vp10_vaq_frame_setup(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
+  int segment;
+
+  // Segment deltas are only rebuilt on intra-only frames, in error resilient
+  // mode, on alt-ref refreshes, and on golden refreshes whose source frame is
+  // not itself the alt-ref; every other frame is left untouched.
+  if (!frame_is_intra_only(cm) && !cm->error_resilient_mode &&
+      !cpi->refresh_alt_ref_frame &&
+      !(cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))
+    return;
+
+  vp10_enable_segmentation(seg);
+  vp10_clearall_segfeatures(seg);
+  seg->abs_delta = SEGMENT_DELTADATA;
+
+  vpx_clear_system_state();
+
+  for (segment = 0; segment < MAX_SEGMENTS; ++segment) {
+    const double ratio = rate_ratio[segment];
+    int delta = vp10_compute_qdelta_by_rate(
+        &cpi->rc, cm->frame_type, cm->base_qindex, ratio, cm->bit_depth);
+
+    // A segment may not land on qindex 0 unless the base qindex is 0 too:
+    // qindex 0 (lossless) implies 4x4 encoding only, and in AQ mode a segment
+    // Q delta is sometimes applied without going back around the rd loop,
+    // which could pair an illegal partition size with that q.
+    if (cm->base_qindex != 0 && cm->base_qindex + delta == 0)
+      delta = 1 - cm->base_qindex;
+
+    // A rate ratio of exactly 1.0 needs no SEG_LVL_ALT_Q override.
+    if (ratio != 1.0) {
+      vp10_set_segdata(seg, segment, SEG_LVL_ALT_Q, delta);
+      vp10_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+    }
+  }
+}
+
+/* TODO(agrange, paulwilkins): block_variance() calls the unoptimized
+ * aq_variance() and aq_highbd_8_variance() helpers below. It should not.
+ */
+static void aq_variance(const uint8_t *a, int a_stride,
+                        const uint8_t *b, int b_stride,
+                        int w, int h, unsigned int *sse, int *sum) {
+  // Over the w x h region: *sum receives the total of (a - b) and *sse the
+  // total of (a - b)^2, stepping each buffer by its own stride per row.
+  unsigned int sq_total = 0;
+  int diff_total = 0;
+  int row, col;
+
+  for (row = 0; row < h; ++row, a += a_stride, b += b_stride) {
+    for (col = 0; col < w; ++col) {
+      const int delta = a[col] - b[col];
+      diff_total += delta;
+      sq_total += delta * delta;
+    }
+  }
+  *sum = diff_total;
+  *sse = sq_total;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void aq_highbd_variance64(const uint8_t *a8, int a_stride,
+                                 const uint8_t *b8, int b_stride,
+                                 int w, int h, uint64_t *sse, uint64_t *sum) {
+  // Same accumulation as aq_variance(), but over 16-bit samples (a8 / b8 are
+  // unwrapped with CONVERT_TO_SHORTPTR) and into 64-bit totals.
+  const uint16_t *src = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *ref = CONVERT_TO_SHORTPTR(b8);
+  uint64_t sq_total = 0, diff_total = 0;
+  int row, col;
+
+  for (row = 0; row < h; ++row, src += a_stride, ref += b_stride) {
+    for (col = 0; col < w; ++col) {
+      const int delta = src[col] - ref[col];
+      diff_total += delta;
+      sq_total += delta * delta;
+    }
+  }
+  *sum = diff_total;
+  *sse = sq_total;
+}
+
+static void aq_highbd_8_variance(const uint8_t *a8, int a_stride,
+                                 const uint8_t *b8, int b_stride,
+                                 int w, int h, unsigned int *sse, int *sum) {
+  // Accumulate in 64 bits, then narrow (truncating) to the 32-bit outputs.
+  uint64_t wide_sse = 0, wide_sum = 0;
+  aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &wide_sse, &wide_sum);
+  *sum = (int)wide_sum;
+  *sse = (unsigned int)wide_sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static unsigned int block_variance(VP10_COMP *cpi, MACROBLOCK *x,
+                                   BLOCK_SIZE bs) {  // plane-0 source variance
+  MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int var, sse;
+  int right_overflow = (xd->mb_to_right_edge < 0) ?
+      ((-xd->mb_to_right_edge) >> 3) : 0;  // amount past the right edge, or 0
+  int bottom_overflow = (xd->mb_to_bottom_edge < 0) ?
+      ((-xd->mb_to_bottom_edge) >> 3) : 0;  // amount past the bottom edge, or 0
+
+  if (right_overflow || bottom_overflow) {  // block overhangs: use clipped size
+    const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
+    const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
+    int avg;  // holds the sample sum (the reference block is all zeros)
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                           CONVERT_TO_BYTEPTR(vp10_highbd_64_zeros), 0, bw, bh,
+                           &sse, &avg);
+      sse >>= 2 * (xd->bd - 8);  // presumably rescales to 8-bit range
+      avg >>= (xd->bd - 8);
+    } else {
+      aq_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                  vp10_64_zeros, 0, bw, bh, &sse, &avg);
+    }
+#else
+    aq_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                vp10_64_zeros, 0, bw, bh, &sse, &avg);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    var = sse - (((int64_t)avg * avg) / (bw * bh));
+    return (256 * var) / (bw * bh);  // variance per sample, times 256
+  } else {  // block fully inside: use fn_ptr[bs].vf against the zero block
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+                               x->plane[0].src.stride,
+                               CONVERT_TO_BYTEPTR(vp10_highbd_64_zeros),
+                               0, &sse);
+    } else {
+      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+                               x->plane[0].src.stride,
+                               vp10_64_zeros, 0, &sse);
+    }
+#else
+    var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+                             x->plane[0].src.stride,
+                             vp10_64_zeros, 0, &sse);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    return (256 * var) >> num_pels_log2_lookup[bs];  // presumably same scaling
+  }
+}
+
+double vp10_log_block_var(VP10_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  const unsigned int variance = block_variance(cpi, x, bs);
+  vpx_clear_system_state();  // Called before the floating-point log() below.
+  return log(1.0 + variance);
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
+int vp10_block_energy(VP10_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  double midpoint, offset;
+  vpx_clear_system_state();  // Kept ahead of the double math below.
+  // Pass 2 centres on twopass.mb_av_energy; otherwise the default midpoint.
+  midpoint =
+      (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT;
+  offset = vp10_log_block_var(cpi, x, bs) - midpoint;
+  return clamp((int)round(offset), ENERGY_MIN, ENERGY_MAX);
+}
diff --git a/libs/libvpx/vp10/encoder/aq_variance.h b/libs/libvpx/vp10/encoder/aq_variance.h
new file mode 100644
index 0000000000..318f5f27f1
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/aq_variance.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_ENCODER_AQ_VARIANCE_H_
+#define VP10_ENCODER_AQ_VARIANCE_H_
+
+#include "vp10/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vp10_vaq_segment_id(int energy);
+void vp10_vaq_frame_setup(VP10_COMP *cpi);
+
+int vp10_block_energy(VP10_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+double vp10_log_block_var(VP10_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_AQ_VARIANCE_H_
diff --git a/libs/libvpx/vp10/encoder/arm/neon/dct_neon.c b/libs/libvpx/vp10/encoder/arm/neon/dct_neon.c
new file mode 100644
index 0000000000..b37a2ff3a9
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/arm/neon/dct_neon.c
@@ -0,0 +1,36 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vp10/common/blockd.h"
+#include "vpx_dsp/txfm_common.h"
+
+void vp10_fdct8x8_quant_neon(const int16_t *input, int stride,
+                            int16_t* coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t* zbin_ptr,
+                            const int16_t* round_ptr, const int16_t* quant_ptr,
+                            const int16_t* quant_shift_ptr,
+                            int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr,
+                            const int16_t* dequant_ptr, uint16_t* eob_ptr,
+                            const int16_t* scan_ptr,
+                            const int16_t* iscan_ptr) {
+  int16_t temp_buffer[64];  // Receives the 8x8 forward transform output.
+  (void)coeff_ptr;  // Unused: the transform output stays in temp_buffer.
+
+  vpx_fdct8x8_neon(input, temp_buffer, stride);
+  vp10_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr,
+                       quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+                       dequant_ptr, eob_ptr, scan_ptr, iscan_ptr);
+}
diff --git a/libs/libvpx/vp10/encoder/arm/neon/error_neon.c b/libs/libvpx/vp10/encoder/arm/neon/error_neon.c
new file mode 100644
index 0000000000..009520aeed
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/arm/neon/error_neon.c
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp10_rtcd.h"
+
+int64_t vp10_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff,
+                                int block_size) {
+  int64x2_t error = vdupq_n_s64(0);  // Two 64-bit running totals.
+
+  assert(block_size >= 8);  // The do/while body always runs at least once.
+  assert((block_size % 8) == 0);  // Each pass consumes exactly 8 values.
+
+  do {
+    const int16x8_t c = vld1q_s16(coeff);
+    const int16x8_t d = vld1q_s16(dqcoeff);
+    const int16x8_t diff = vsubq_s16(c, d);
+    const int16x4_t diff_lo = vget_low_s16(diff);
+    const int16x4_t diff_hi = vget_high_s16(diff);
+    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
+    // accumulating them in 64-bits.
+    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
+    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
+    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
+    error = vaddq_s64(error, err2);
+    coeff += 8;
+    dqcoeff += 8;
+    block_size -= 8;
+  } while (block_size != 0);
+
+  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);  // Fold lanes.
+}
diff --git a/libs/libvpx/vp10/encoder/arm/neon/quantize_neon.c b/libs/libvpx/vp10/encoder/arm/neon/quantize_neon.c
new file mode 100644
index 0000000000..9354ced699
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/arm/neon/quantize_neon.c
@@ -0,0 +1,118 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp10/common/quant_common.h"
+#include "vp10/common/seg_common.h"
+
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/quantize.h"
+#include "vp10/encoder/rd.h"
+
+void vp10_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
+                          int skip_block, const int16_t *zbin_ptr,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+                          int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                          uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)scan;
+
+  if (!skip_block) {
+    // Quantization pass: quantize eight coefficients per step and track the
+    // largest (iscan + 1) among the nonzero results as the end-of-block.
+    int i;
+    const int16x8_t v_zero = vdupq_n_s16(0);
+    const int16x8_t v_one = vdupq_n_s16(1);
+    int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+    int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
+    int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
+    int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
+    // adjust for dc
+    v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
+    v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+    v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+    // process dc and the first seven ac coeffs
+    {
+      const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
+      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+      const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
+                                           vget_low_s16(v_quant));
+      const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
+                                           vget_high_s16(v_quant));
+      const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16),
+                                            vshrn_n_s32(v_tmp_hi, 16));
+      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+      vst1q_s16(&qcoeff_ptr[0], v_qcoeff);
+      vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff);
+      v_round = vmovq_n_s16(round_ptr[1]);  // back to the AC value in all lanes
+      v_quant = vmovq_n_s16(quant_ptr[1]);
+      v_dequant = vmovq_n_s16(dequant_ptr[1]);
+    }
+    // now process the rest of the ac coeffs
+    for (i = 8; i < count; i += 8) {
+      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+      const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
+                                           vget_low_s16(v_quant));
+      const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
+                                           vget_high_s16(v_quant));
+      const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16),
+                                            vshrn_n_s32(v_tmp_hi, 16));
+      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+      vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+      vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+    }
+    {  // Horizontal max across the eight lanes yields the eob.
+      const int16x4_t v_eobmax_3210 =
+          vmax_s16(vget_low_s16(v_eobmax_76543210),
+                   vget_high_s16(v_eobmax_76543210));
+      const int64x1_t v_eobmax_xx32 =
+          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+      const int16x4_t v_eobmax_tmp =
+          vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+      const int64x1_t v_eobmax_xxx3 =
+          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+      const int16x4_t v_eobmax_final =
+          vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+      *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+    }
+  } else {
+    memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+    memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+    *eob_ptr = 0;
+  }
+}
diff --git a/libs/libvpx/vp10/encoder/bitstream.c b/libs/libvpx/vp10/encoder/bitstream.c
new file mode 100644
index 0000000000..04ce61d55a
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/bitstream.c
@@ -0,0 +1,1573 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "vpx/vpx_encoder.h"
+#include "vpx_dsp/bitwriter_buffer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem_ops.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp10/common/entropy.h"
+#include "vp10/common/entropymode.h"
+#include "vp10/common/entropymv.h"
+#include "vp10/common/mvref_common.h"
+#include "vp10/common/pred_common.h"
+#include "vp10/common/seg_common.h"
+#include "vp10/common/tile_common.h"
+
+#include "vp10/encoder/cost.h"
+#include "vp10/encoder/bitstream.h"
+#include "vp10/encoder/encodemv.h"
+#include "vp10/encoder/mcomp.h"
+#include "vp10/encoder/segmentation.h"
+#include "vp10/encoder/subexp.h"
+#include "vp10/encoder/tokenize.h"
+
+static const struct vp10_token intra_mode_encodings[INTRA_MODES] = {
+  {0, 1}, {6, 3}, {28, 5}, {30, 5}, {58, 6}, {59, 6}, {126, 7}, {127, 7},
+  {62, 6}, {2, 2}};
+static const struct vp10_token switchable_interp_encodings[SWITCHABLE_FILTERS] =
+  {{0, 1}, {2, 2}, {3, 2}};
+static const struct vp10_token partition_encodings[PARTITION_TYPES] =
+  {{0, 1}, {2, 2}, {6, 3}, {7, 3}};
+static const struct vp10_token inter_mode_encodings[INTER_MODES] =
+  {{2, 2}, {6, 3}, {0, 1}, {7, 3}};
+
+static struct vp10_token ext_tx_encodings[TX_TYPES];
+
+void vp10_encode_token_init(void) {
+  vp10_tokens_from_tree(ext_tx_encodings, vp10_ext_tx_tree);  // Build table.
+}
+
+static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode,
+                             const vpx_prob *probs) {  // mode = table index
+  vp10_write_token(w, vp10_intra_mode_tree, probs, &intra_mode_encodings[mode]);
+}
+
+static void write_inter_mode(vpx_writer *w, PREDICTION_MODE mode,
+                             const vpx_prob *probs) {
+  assert(is_inter_mode(mode));  // The table is indexed via INTER_OFFSET(mode).
+  vp10_write_token(w, vp10_inter_mode_tree, probs,
+                  &inter_mode_encodings[INTER_OFFSET(mode)]);
+}
+
+static void encode_unsigned_max(struct vpx_write_bit_buffer *wb,
+                                int data, int max) {  // width is set by max
+  vpx_wb_write_literal(wb, data, get_unsigned_bits(max));
+}
+
+static void prob_diff_update(const vpx_tree_index *tree,
+                             vpx_prob probs[/*n - 1*/],
+                             const unsigned int counts[/*n - 1*/],
+                             int n, vpx_writer *w) {
+  unsigned int branch_ct[32][2];  // One count pair per probability; hence 32.
+  const int num_probs = n - 1;
+  int k;
+
+  assert(n <= 32);  // branch_ct cannot hold more.
+  vp10_tree_probs_from_distribution(tree, branch_ct, counts);
+
+  for (k = 0; k < num_probs; ++k)
+    vp10_cond_prob_diff_update(w, probs + k, branch_ct[k]);
+}
+
+static int prob_diff_update_savings(const vpx_tree_index *tree,
+                                    vpx_prob probs[/*n - 1*/],
+                                    const unsigned int counts[/*n - 1*/],
+                                    int n) {
+  unsigned int branch_ct[32][2];  // One count pair per probability; hence 32.
+  const int num_probs = n - 1;
+  int total = 0;
+  int k;
+
+  assert(n <= 32);  // branch_ct cannot hold more.
+  vp10_tree_probs_from_distribution(tree, branch_ct, counts);
+
+  // Sum the per-probability savings reported for a differential update.
+  for (k = 0; k < num_probs; ++k)
+    total += vp10_cond_prob_diff_update_savings(probs + k, branch_ct[k]);
+  return total;
+}
+
+static void write_selected_tx_size(const VP10_COMMON *cm,
+                                   const MACROBLOCKD *xd, vpx_writer *w) {
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const TX_SIZE tx_size = mbmi->tx_size;
+  const TX_SIZE max_tx_size = max_txsize_lookup[mbmi->sb_type];
+  const vpx_prob *const tx_probs =
+      get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
+  // One "larger than this size?" bit per level, until the size is settled.
+  vpx_write(w, tx_size != TX_4X4, tx_probs[0]);
+  if (tx_size == TX_4X4 || max_tx_size < TX_16X16) return;
+  vpx_write(w, tx_size != TX_8X8, tx_probs[1]);
+  if (tx_size == TX_8X8 || max_tx_size < TX_32X32) return;
+  vpx_write(w, tx_size != TX_16X16, tx_probs[2]);
+}
+
+static int write_skip(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                      int segment_id, const MODE_INFO *mi, vpx_writer *w) {
+  int skip_flag;
+  // With the segment-level skip feature active, 1 is returned and nothing
+  // is written to the bitstream.
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) return 1;
+  skip_flag = mi->mbmi.skip;
+  vpx_write(w, skip_flag, vp10_get_skip_prob(cm, xd));
+  return skip_flag;
+}
+
+static void update_skip_probs(VP10_COMMON *cm, vpx_writer *w,
+                              FRAME_COUNTS *counts) {
+  int ctx = 0;
+  // Run the conditional differential update for each skip context.
+  for (; ctx < SKIP_CONTEXTS; ++ctx)
+    vp10_cond_prob_diff_update(w, cm->fc->skip_probs + ctx, counts->skip[ctx]);
+}
+
+static void update_switchable_interp_probs(VP10_COMMON *cm, vpx_writer *w,
+                                           FRAME_COUNTS *counts) {
+  int ctx;  // One differential tree update per filter context.
+  for (ctx = 0; ctx < SWITCHABLE_FILTER_CONTEXTS; ++ctx)
+    prob_diff_update(
+        vp10_switchable_interp_tree, cm->fc->switchable_interp_prob[ctx],
+        counts->switchable_interp[ctx], SWITCHABLE_FILTERS, w);
+}
+
+static void update_ext_tx_probs(VP10_COMMON *cm, vpx_writer *w) {
+  // A group update is only worthwhile when it saves more than the difference
+  // between coding the update flag as one and as zero.
+  const int min_savings = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) -
+                          vp10_cost_zero(GROUP_DIFF_UPDATE_PROB);
+  int tx, type, total, update;
+
+  // Intra: one probability set per [size][type] entry.
+  total = 0;
+  for (tx = TX_4X4; tx < EXT_TX_SIZES; ++tx) {
+    for (type = 0; type < TX_TYPES; ++type) {
+      total += prob_diff_update_savings(
+          vp10_ext_tx_tree, cm->fc->intra_ext_tx_prob[tx][type],
+          cm->counts.intra_ext_tx[tx][type], TX_TYPES);
+    }
+  }
+
+  update = total > min_savings;
+  vpx_write(w, update, GROUP_DIFF_UPDATE_PROB);
+  for (tx = TX_4X4; update && tx < EXT_TX_SIZES; ++tx) {
+    for (type = 0; type < TX_TYPES; ++type) {
+      prob_diff_update(vp10_ext_tx_tree,
+                       cm->fc->intra_ext_tx_prob[tx][type],
+                       cm->counts.intra_ext_tx[tx][type], TX_TYPES, w);
+    }
+  }
+
+  // Inter: one probability set per size.
+  total = 0;
+  for (tx = TX_4X4; tx < EXT_TX_SIZES; ++tx) {
+    total += prob_diff_update_savings(vp10_ext_tx_tree,
+                                      cm->fc->inter_ext_tx_prob[tx],
+                                      cm->counts.inter_ext_tx[tx], TX_TYPES);
+  }
+
+  update = total > min_savings;
+  vpx_write(w, update, GROUP_DIFF_UPDATE_PROB);
+  for (tx = TX_4X4; update && tx < EXT_TX_SIZES; ++tx) {
+    prob_diff_update(vp10_ext_tx_tree, cm->fc->inter_ext_tx_prob[tx],
+                     cm->counts.inter_ext_tx[tx], TX_TYPES, w);
+  }
+}
+
+static void pack_mb_tokens(vpx_writer *w,
+                           TOKENEXTRA **tp, const TOKENEXTRA *const stop,
+                           vpx_bit_depth_t bit_depth, const TX_SIZE tx) {
+  TOKENEXTRA *p = *tp;  // Cursor into the caller's token list.
+#if !CONFIG_MISC_FIXES
+  (void) tx;
+#endif
+
+  while (p < stop && p->token != EOSB_TOKEN) {
+    const int t = p->token;
+    const struct vp10_token *const a = &vp10_coef_encodings[t];
+    int i = 0;
+    int v = a->value;
+    int n = a->len;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const vp10_extra_bit *b;
+    if (bit_depth == VPX_BITS_12)
+      b = &vp10_extra_bits_high12[t];
+    else if (bit_depth == VPX_BITS_10)
+      b = &vp10_extra_bits_high10[t];
+    else
+      b = &vp10_extra_bits[t];
+#else
+    const vp10_extra_bit *const b = &vp10_extra_bits[t];
+    (void) bit_depth;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    /* skip one or two nodes */
+    if (p->skip_eob_node) {
+      n -= p->skip_eob_node;
+      i = 2 * p->skip_eob_node;
+    }
+
+    // TODO(jbb): expanding this can lead to big gains.  It allows
+    // much better branch prediction and would enable us to avoid numerous
+    // lookups and compares.
+
+    // If we have a token that's in the constrained set, the coefficient tree
+    // is split into two treed writes.  The first treed write takes care of the
+    // unconstrained nodes.  The second treed write takes care of the
+    // constrained nodes.
+    if (t >= TWO_TOKEN && t < EOB_TOKEN) {
+      int len = UNCONSTRAINED_NODES - p->skip_eob_node;
+      int bits = v >> (n - len);
+      vp10_write_tree(w, vp10_coef_tree, p->context_tree, bits, len, i);
+      vp10_write_tree(w, vp10_coef_con_tree,
+                     vp10_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
+                     v, n - len, 0);
+    } else {
+      vp10_write_tree(w, vp10_coef_tree, p->context_tree, v, n, i);
+    }
+
+    if (b->base_val) {
+      const int e = p->extra, l = b->len;
+#if CONFIG_MISC_FIXES
+      int skip_bits =
+          (b->base_val == CAT6_MIN_VAL) ? TX_SIZES - 1 - tx : 0;
+#else
+      int skip_bits = 0;
+#endif
+
+      if (l) {
+        const unsigned char *pb = b->prob;
+        int v = e >> 1;  // NOTE: v, n and i here shadow the outer locals.
+        int n = l;              /* number of bits in v, assumed nonzero */
+        int i = 0;
+
+        do {
+          const int bb = (v >> --n) & 1;
+          if (skip_bits) {
+            skip_bits--;
+            assert(!bb);  // A skipped bit is expected to be zero.
+          } else {
+            vpx_write(w, bb, pb[i >> 1]);
+          }
+          i = b->tree[i + bb];
+        } while (n);
+      }
+
+      vpx_write_bit(w, e & 1);  // Bit 0 of the extra field is written last.
+    }
+    ++p;
+  }
+
+  *tp = p;  // Hand the advanced cursor back to the caller.
+}
+
+static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
+                             const struct segmentation_probs *segp,
+                             int segment_id) {
+  if (!seg->enabled || !seg->update_map) return;  // Nothing is coded.
+  vp10_write_tree(w, vp10_segment_tree, segp->tree_probs, segment_id, 3, 0);
+}
+
+// This function encodes the reference frame
+static void write_ref_frames(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                             vpx_writer *w) {
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int is_compound = has_second_ref(mbmi);
+  const int segment_id = mbmi->segment_id;
+
+  // A segment that pins the reference frame leaves nothing to code here.
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    assert(!is_compound);
+    assert(mbmi->ref_frame[0] ==
+               get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+    return;
+  }
+
+  // Code compound vs. single only when the choice is made per block.
+  if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+    vpx_write(w, is_compound, vp10_get_reference_mode_prob(cm, xd));
+  } else {
+    assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE));
+  }
+
+  if (is_compound) {  // One bit: is ref_frame[0] the golden frame?
+    vpx_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
+              vp10_get_pred_prob_comp_ref_p(cm, xd));
+  } else {
+    // Single: LAST vs. other first, then GOLDEN vs. the remaining frame.
+    const int not_last = mbmi->ref_frame[0] != LAST_FRAME;
+    vpx_write(w, not_last, vp10_get_pred_prob_single_ref_p1(cm, xd));
+    if (not_last) {
+      vpx_write(w, mbmi->ref_frame[0] != GOLDEN_FRAME,
+                vp10_get_pred_prob_single_ref_p2(cm, xd));
+    }
+  }
+}
+/* Writes the mode info of one block in a frame that is not intra-only
+ * (write_modes_b() calls this when frame_is_intra_only() is false).
+ * Symbols are emitted to |w| in this order: segment id (when the map is being
+ * updated, optionally behind the temporal prediction flag), skip flag,
+ * intra/inter flag (omitted when the segment has SEG_LVL_REF_FRAME active),
+ * selected transform size when applicable, then either the intra modes or the
+ * reference frames / inter mode(s) / interpolation filter / new motion
+ * vectors, and last the extended transform type when the final condition
+ * holds.
+ * Side effect: increments cpi->interp_filter_selected[0][] for the block's
+ * filter when the frame-level filter is SWITCHABLE.
+ */
+static void pack_inter_mode_mvs(VP10_COMP *cpi, const MODE_INFO *mi,
+                                vpx_writer *w) {
+  VP10_COMMON *const cm = &cpi->common;
+  const nmv_context *nmvc = &cm->fc->nmvc;
+  const MACROBLOCK *const x = &cpi->td.mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct segmentation *const seg = &cm->seg;
+#if CONFIG_MISC_FIXES
+  const struct segmentation_probs *const segp = &cm->fc->seg;
+#else
+  const struct segmentation_probs *const segp = &cm->segp;
+#endif
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const PREDICTION_MODE mode = mbmi->mode;
+  const int segment_id = mbmi->segment_id;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int allow_hp = cm->allow_high_precision_mv;
+  const int is_inter = is_inter_block(mbmi);
+  const int is_compound = has_second_ref(mbmi);
+  int skip, ref;
+
+  if (seg->update_map) {
+    if (seg->temporal_update) {
+      const int pred_flag = mbmi->seg_id_predicted;
+      vpx_prob pred_prob = vp10_get_pred_prob_seg_id(segp, xd);
+      vpx_write(w, pred_flag, pred_prob);
+      if (!pred_flag)  // id not predicted: code it explicitly
+        write_segment_id(w, seg, segp, segment_id);
+    } else {
+      write_segment_id(w, seg, segp, segment_id);
+    }
+  }
+
+  skip = write_skip(cm, xd, segment_id, mi, w);
+
+  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+    vpx_write(w, is_inter, vp10_get_intra_inter_prob(cm, xd));
+
+  if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
+      !(is_inter && skip) && !xd->lossless[segment_id]) {
+    write_selected_tx_size(cm, xd, w);
+  }
+
+  if (!is_inter) {
+    if (bsize >= BLOCK_8X8) {
+      write_intra_mode(w, mode, cm->fc->y_mode_prob[size_group_lookup[bsize]]);
+    } else {
+      int idx, idy;
+      const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+      const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+      for (idy = 0; idy < 2; idy += num_4x4_h) {
+        for (idx = 0; idx < 2; idx += num_4x4_w) {
+          const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
+          write_intra_mode(w, b_mode, cm->fc->y_mode_prob[0]);
+        }
+      }
+    }
+    write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]);
+  } else {
+    const int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+    const vpx_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
+    write_ref_frames(cm, xd, w);
+
+    // If segment skip is not enabled code the mode (sub-8x8 blocks code one
+    // mode per sub-block further down instead).
+    if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+      if (bsize >= BLOCK_8X8) {
+        write_inter_mode(w, mode, inter_probs);
+      }
+    }
+
+    if (cm->interp_filter == SWITCHABLE) {
+      const int ctx = vp10_get_pred_context_switchable_interp(xd);
+      vp10_write_token(w, vp10_switchable_interp_tree,
+                      cm->fc->switchable_interp_prob[ctx],
+                      &switchable_interp_encodings[mbmi->interp_filter]);
+      ++cpi->interp_filter_selected[0][mbmi->interp_filter];
+    } else {
+      assert(mbmi->interp_filter == cm->interp_filter);
+    }
+
+    if (bsize < BLOCK_8X8) {
+      const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+      const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+      int idx, idy;
+      for (idy = 0; idy < 2; idy += num_4x4_h) {
+        for (idx = 0; idx < 2; idx += num_4x4_w) {
+          const int j = idy * 2 + idx;
+          const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+          write_inter_mode(w, b_mode, inter_probs);
+          if (b_mode == NEWMV) {
+            for (ref = 0; ref < 1 + is_compound; ++ref)
+              vp10_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
+                            &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv,
+                            nmvc, allow_hp);
+          }
+        }
+      }
+    } else {
+      if (mode == NEWMV) {
+        for (ref = 0; ref < 1 + is_compound; ++ref)
+          vp10_encode_mv(cpi, w, &mbmi->mv[ref].as_mv,
+                        &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv, nmvc,
+                        allow_hp);
+      }
+    }
+  }
+  if (mbmi->tx_size < TX_32X32 &&
+      cm->base_qindex > 0 && !mbmi->skip &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    if (is_inter) {
+      vp10_write_token(
+          w, vp10_ext_tx_tree,
+          cm->fc->inter_ext_tx_prob[mbmi->tx_size],
+          &ext_tx_encodings[mbmi->tx_type]);
+    } else {
+      vp10_write_token(
+          w, vp10_ext_tx_tree,
+          cm->fc->intra_ext_tx_prob[mbmi->tx_size]
+                                   [intra_mode_to_tx_type_context[mbmi->mode]],
+          &ext_tx_encodings[mbmi->tx_type]);
+    }
+  } else {
+    if (!mbmi->skip)  // type is not coded, so it must be the default
+      assert(mbmi->tx_type == DCT_DCT);
+  }
+}
+
+// Writes the mode info of one block in an intra-only frame (write_modes_b()
+// calls this when frame_is_intra_only() is true). Emitted in order: segment
+// id when the map is updated, skip flag, transform size when it is
+// selectable, the luma mode(s), the chroma mode and, if eligible, the
+// extended transform type.
+static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                              MODE_INFO **mi_8x8, vpx_writer *w) {
+  const MODE_INFO *const mi = mi_8x8[0];
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const struct segmentation *const seg = &cm->seg;
+#if CONFIG_MISC_FIXES
+  const struct segmentation_probs *const seg_probs = &cm->fc->seg;
+#else
+  const struct segmentation_probs *const seg_probs = &cm->segp;
+#endif
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int is_sub8x8 = bsize < BLOCK_8X8;
+
+  if (seg->update_map)
+    write_segment_id(w, seg, seg_probs, mbmi->segment_id);
+
+  write_skip(cm, xd, mbmi->segment_id, mi, w);
+
+  if (!is_sub8x8 && cm->tx_mode == TX_MODE_SELECT &&
+      !xd->lossless[mbmi->segment_id]) {
+    write_selected_tx_size(cm, xd, w);
+  }
+
+  if (is_sub8x8) {
+    // One luma mode per sub-block, stepping through the 2x2 grid of 4x4
+    // units by the sub-block dimensions.
+    const int step_w = num_4x4_blocks_wide_lookup[bsize];
+    const int step_h = num_4x4_blocks_high_lookup[bsize];
+    int row, col;
+
+    for (row = 0; row < 2; row += step_h) {
+      for (col = 0; col < 2; col += step_w) {
+        const int block = row * 2 + col;
+        write_intra_mode(w, mi->bmi[block].as_mode,
+                         get_y_mode_probs(cm, mi, above_mi, left_mi, block));
+      }
+    }
+  } else {
+    write_intra_mode(w, mbmi->mode,
+                     get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+  }
+
+  write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mbmi->mode]);
+
+  // The extended transform type is coded only below 32x32, at a non-zero
+  // base q index, for non-skipped blocks outside skip segments.
+  if (mbmi->tx_size >= TX_32X32 || cm->base_qindex <= 0 || mbmi->skip)
+    return;
+  if (segfeature_active(seg, mbmi->segment_id, SEG_LVL_SKIP))
+    return;
+
+  vp10_write_token(
+      w, vp10_ext_tx_tree,
+      cm->fc->intra_ext_tx_prob[mbmi->tx_size]
+                               [intra_mode_to_tx_type_context[mbmi->mode]],
+      &ext_tx_encodings[mbmi->tx_type]);
+}
+
+// Writes one block located at (mi_row, mi_col): points xd->mi and
+// cpi->td.mb.mbmi_ext at the block, writes its mode info and then, unless the
+// block is marked as skipped, its tokens for every plane. Each plane's token
+// run is expected to stop at an EOSB_TOKEN, which is consumed here.
+static void write_modes_b(VP10_COMP *cpi, const TileInfo *const tile,
+                          vpx_writer *w, TOKENEXTRA **tok,
+                          const TOKENEXTRA *const tok_end,
+                          int mi_row, int mi_col) {
+  const VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  MODE_INFO *mi;
+  int plane;
+
+  xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+  mi = xd->mi[0];
+
+  cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+
+  set_mi_row_col(xd, tile,
+                 mi_row, num_8x8_blocks_high_lookup[mi->mbmi.sb_type],
+                 mi_col, num_8x8_blocks_wide_lookup[mi->mbmi.sb_type],
+                 cm->mi_rows, cm->mi_cols);
+
+  if (!frame_is_intra_only(cm))
+    pack_inter_mode_mvs(cpi, mi, w);
+  else
+    write_mb_modes_kf(cm, xd, xd->mi, w);
+
+  if (mi->mbmi.skip)
+    return;
+
+  assert(*tok < tok_end);
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    TX_SIZE tx;
+
+    // Plane 0 uses the block's transform size, the others the derived
+    // chroma transform size.
+    if (plane == 0)
+      tx = mi->mbmi.tx_size;
+    else
+      tx = get_uv_tx_size(&mi->mbmi, &xd->plane[plane]);
+
+    pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+    assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+    (*tok)++;
+  }
+}
+
+// Writes partition type |p| of the |bsize| block at (mi_row, mi_col).
+// |hbs| is the offset (in mode-info units) used to test whether the second
+// half of the block still lies inside the frame. When both halves fit, the
+// full partition token is written. When only one dimension fits, a single
+// bit tells PARTITION_SPLIT from the one other allowed type. When neither
+// fits, PARTITION_SPLIT is implied and nothing is written.
+static void write_partition(const VP10_COMMON *const cm,
+                            const MACROBLOCKD *const xd,
+                            int hbs, int mi_row, int mi_col,
+                            PARTITION_TYPE p, BLOCK_SIZE bsize, vpx_writer *w) {
+  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+  const vpx_prob *const probs = cm->fc->partition_prob[ctx];
+  const int rows_fit = (mi_row + hbs) < cm->mi_rows;
+  const int cols_fit = (mi_col + hbs) < cm->mi_cols;
+
+  if (rows_fit) {
+    if (cols_fit) {
+      vp10_write_token(w, vp10_partition_tree, probs, &partition_encodings[p]);
+    } else {
+      assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+      vpx_write(w, p == PARTITION_SPLIT, probs[2]);
+    }
+  } else if (cols_fit) {
+    assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+    vpx_write(w, p == PARTITION_SPLIT, probs[1]);
+  } else {
+    assert(p == PARTITION_SPLIT);
+  }
+}
+/* Recursively writes the partition tree and the blocks of the |bsize| square
+ * at (mi_row, mi_col).  Returns immediately when the position lies outside
+ * the frame.  The partition type is looked up from the sb_type stored at the
+ * position and written first; the covered blocks are then written with
+ * write_modes_b(), or, for PARTITION_SPLIT with sub-blocks of at least 8x8,
+ * by recursing into the four quadrants.  The second half of a HORZ / VERT
+ * partition is skipped when it starts outside the frame.  Finally the
+ * partition context is updated, except after a split of a block larger than
+ * 8x8 (presumably because the recursive calls have already done so).
+ */
+static void write_modes_sb(VP10_COMP *cpi,
+                           const TileInfo *const tile, vpx_writer *w,
+                           TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+                           int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  const VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+  const int bsl = b_width_log2_lookup[bsize];
+  const int bs = (1 << bsl) / 4;  // offset to the second half / quadrants
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+  const MODE_INFO *m = NULL;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+
+  partition = partition_lookup[bsl][m->mbmi.sb_type];
+  write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
+  subsize = get_subsize(bsize, partition);
+  if (subsize < BLOCK_8X8) {
+    write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        break;
+      case PARTITION_HORZ:
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        if (mi_row + bs < cm->mi_rows)
+          write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col);
+        break;
+      case PARTITION_VERT:
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        if (mi_col + bs < cm->mi_cols)
+          write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs);
+        break;
+      case PARTITION_SPLIT:
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs,
+                       subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col,
+                       subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
+                       subsize);
+        break;
+      default:
+        assert(0);
+    }
+  }
+
+  // update partition context
+  if (bsize >= BLOCK_8X8 &&
+      (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+}
+
+// Writes every 64x64 superblock of |tile| in raster order. The left
+// segmentation context is cleared at the start of each superblock row.
+static void write_modes(VP10_COMP *cpi,
+                        const TileInfo *const tile, vpx_writer *w,
+                        TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) {
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  int sb_row, sb_col;
+
+  for (sb_row = tile->mi_row_start; sb_row < tile->mi_row_end;
+       sb_row += MI_BLOCK_SIZE) {
+    vp10_zero(xd->left_seg_context);
+
+    for (sb_col = tile->mi_col_start; sb_col < tile->mi_col_end;
+         sb_col += MI_BLOCK_SIZE) {
+      write_modes_sb(cpi, tile, w, tok, tok_end, sb_row, sb_col,
+                     BLOCK_64X64);
+    }
+  }
+}
+
+// Derives, for transform size |tx_size|, the per-node branch counts
+// (|coef_branch_ct|) and model probabilities (|coef_probs|) from the
+// coefficient counts gathered in cpi->td.rd_counts. The "1" branch count of
+// node 0 is replaced by the EOB branch count minus the node's "0" branch
+// count before the probabilities are computed.
+static void build_tree_distribution(VP10_COMP *cpi, TX_SIZE tx_size,
+                                    vp10_coeff_stats *coef_branch_ct,
+                                    vp10_coeff_probs_model *coef_probs) {
+  vp10_coeff_count *coef_counts = cpi->td.rd_counts.coef_counts[tx_size];
+  unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
+      cpi->common.counts.eob_branch[tx_size];
+  int plane, ref, band, ctx, node;
+
+  for (plane = 0; plane < PLANE_TYPES; ++plane) {
+    for (ref = 0; ref < REF_TYPES; ++ref) {
+      for (band = 0; band < COEF_BANDS; ++band) {
+        for (ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx) {
+          vp10_tree_probs_from_distribution(
+              vp10_coef_tree,
+              coef_branch_ct[plane][ref][band][ctx],
+              coef_counts[plane][ref][band][ctx]);
+
+          coef_branch_ct[plane][ref][band][ctx][0][1] =
+              eob_branch_ct[plane][ref][band][ctx] -
+              coef_branch_ct[plane][ref][band][ctx][0][0];
+
+          for (node = 0; node < UNCONSTRAINED_NODES; ++node) {
+            coef_probs[plane][ref][band][ctx][node] = get_binary_prob(
+                coef_branch_ct[plane][ref][band][ctx][node][0],
+                coef_branch_ct[plane][ref][band][ctx][node][1]);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Writes the coefficient probability updates for one transform size to |bc|
+// and applies every accepted update to cpi->common.fc->coef_probs[tx_size].
+//
+// Only the first UNCONSTRAINED_NODES probabilities of each context are
+// considered. For PIVOT_NODE the model-based savings search is used (with
+// cpi->sf.coeff_prob_appx_step as step size); the other nodes use the plain
+// search. An update is sent only when the search reports positive savings
+// and the searched probability differs from the current one.
+//
+// cpi->sf.use_fast_coef_updates selects the strategy:
+//   TWO_LOOP         - a dry run totals the savings; if no node would update
+//                      or the total is negative a single 0 bit is written,
+//                      otherwise a 1 bit followed by one flag per node.
+//   ONE_LOOP_REDUCED - single pass; the leading 1 bit and the deferred
+//                      "no update" flags are emitted once the first update
+//                      is found, or a single 0 bit if none is found.
+//
+// |frame_branch_ct| holds the branch counts for this frame and
+// |new_coef_probs| the candidate probabilities derived from them.
+static void update_coef_probs_common(vpx_writer* const bc, VP10_COMP *cpi,
+                                     TX_SIZE tx_size,
+                                     vp10_coeff_stats *frame_branch_ct,
+                                     vp10_coeff_probs_model *new_coef_probs) {
+  vp10_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
+  const vpx_prob upd = DIFF_UPDATE_PROB;
+  const int entropy_nodes_update = UNCONSTRAINED_NODES;
+  int i, j, k, l, t;
+  int stepsize = cpi->sf.coeff_prob_appx_step;
+
+  switch (cpi->sf.use_fast_coef_updates) {
+    case TWO_LOOP: {
+      /* dry run to see if there is any update at all needed */
+      int savings = 0;
+      int update[2] = {0, 0};
+      for (i = 0; i < PLANE_TYPES; ++i) {
+        for (j = 0; j < REF_TYPES; ++j) {
+          for (k = 0; k < COEF_BANDS; ++k) {
+            for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+              for (t = 0; t < entropy_nodes_update; ++t) {
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                const vpx_prob oldp = old_coef_probs[i][j][k][l][t];
+                int s;
+                int u = 0;
+                if (t == PIVOT_NODE)
+                  s = vp10_prob_diff_update_savings_search_model(
+                      frame_branch_ct[i][j][k][l][0],
+                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                else
+                  s = vp10_prob_diff_update_savings_search(
+                      frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
+                if (s > 0 && newp != oldp)
+                  u = 1;
+                /* every node costs its update flag; updates add savings */
+                if (u)
+                  savings += s - (int)(vp10_cost_zero(upd));
+                else
+                  savings -= (int)(vp10_cost_zero(upd));
+                update[u]++;
+              }
+            }
+          }
+        }
+      }
+
+      // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
+      /* Is coef updated at all */
+      if (update[1] == 0 || savings < 0) {
+        vpx_write_bit(bc, 0);
+        return;
+      }
+      vpx_write_bit(bc, 1);
+      for (i = 0; i < PLANE_TYPES; ++i) {
+        for (j = 0; j < REF_TYPES; ++j) {
+          for (k = 0; k < COEF_BANDS; ++k) {
+            for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+              // calc probs and branch cts for this frame only
+              for (t = 0; t < entropy_nodes_update; ++t) {
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
+                int s;
+                int u = 0;
+                if (t == PIVOT_NODE)
+                  s = vp10_prob_diff_update_savings_search_model(
+                      frame_branch_ct[i][j][k][l][0],
+                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                else
+                  s = vp10_prob_diff_update_savings_search(
+                      frame_branch_ct[i][j][k][l][t],
+                      *oldp, &newp, upd);
+                if (s > 0 && newp != *oldp)
+                  u = 1;
+                vpx_write(bc, u, upd);
+                if (u) {
+                  /* send/use new probability */
+                  vp10_write_prob_diff_update(bc, newp, *oldp);
+                  *oldp = newp;
+                }
+              }
+            }
+          }
+        }
+      }
+      return;
+    }
+
+    case ONE_LOOP_REDUCED: {
+      int updates = 0;
+      /* "no update" flags held back until the first real update appears */
+      int noupdates_before_first = 0;
+      for (i = 0; i < PLANE_TYPES; ++i) {
+        for (j = 0; j < REF_TYPES; ++j) {
+          for (k = 0; k < COEF_BANDS; ++k) {
+            for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+              // calc probs and branch cts for this frame only
+              for (t = 0; t < entropy_nodes_update; ++t) {
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
+                int s;
+                int u = 0;
+
+                if (t == PIVOT_NODE) {
+                  s = vp10_prob_diff_update_savings_search_model(
+                      frame_branch_ct[i][j][k][l][0],
+                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                } else {
+                  s = vp10_prob_diff_update_savings_search(
+                      frame_branch_ct[i][j][k][l][t],
+                      *oldp, &newp, upd);
+                }
+
+                if (s > 0 && newp != *oldp)
+                  u = 1;
+                updates += u;
+                if (u == 0 && updates == 0) {
+                  noupdates_before_first++;
+                  continue;
+                }
+                if (u == 1 && updates == 1) {
+                  int v;
+                  // first update: emit the header bit and the flags that
+                  // were held back so far
+                  vpx_write_bit(bc, 1);
+                  for (v = 0; v < noupdates_before_first; ++v)
+                    vpx_write(bc, 0, upd);
+                }
+                vpx_write(bc, u, upd);
+                if (u) {
+                  /* send/use new probability */
+                  vp10_write_prob_diff_update(bc, newp, *oldp);
+                  *oldp = newp;
+                }
+              }
+            }
+          }
+        }
+      }
+      if (updates == 0) {
+        vpx_write_bit(bc, 0);  // no updates
+      }
+      return;
+    }
+    default:
+      assert(0);
+  }
+}
+
+// Writes coefficient probability updates for every transform size from
+// TX_4X4 up to the largest one allowed by the frame's tx_mode. A size gets a
+// single 0 bit (no update) when its tx_totals count is at most 20, or when it
+// is 16x16 or larger while the transform size search method is USE_TX_8X8.
+static void update_coef_probs(VP10_COMP *cpi, vpx_writer* w) {
+  const TX_MODE tx_mode = cpi->common.tx_mode;
+  const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+  TX_SIZE tx_size;
+
+  for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) {
+    vp10_coeff_stats frame_branch_ct[PLANE_TYPES];
+    vp10_coeff_probs_model frame_coef_probs[PLANE_TYPES];
+    const int few_samples = cpi->td.counts->tx.tx_totals[tx_size] <= 20;
+    const int large_tx_with_8x8_search =
+        tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8;
+
+    if (few_samples || large_tx_with_8x8_search) {
+      vpx_write_bit(w, 0);
+      continue;
+    }
+
+    build_tree_distribution(cpi, tx_size, frame_branch_ct,
+                            frame_coef_probs);
+    update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
+                             frame_coef_probs);
+  }
+}
+
+// Writes the loop filter parameters: level (6 bits), sharpness (3 bits), the
+// delta-enabled flag and, when deltas are enabled and flagged for update,
+// every reference and mode delta that changed since it was last sent. Sent
+// deltas are recorded in lf->last_ref_deltas / lf->last_mode_deltas.
+static void encode_loopfilter(struct loopfilter *lf,
+                              struct vpx_write_bit_buffer *wb) {
+  int idx;
+
+  vpx_wb_write_literal(wb, lf->filter_level, 6);
+  vpx_wb_write_literal(wb, lf->sharpness_level, 3);
+
+  vpx_wb_write_bit(wb, lf->mode_ref_delta_enabled);
+  if (!lf->mode_ref_delta_enabled)
+    return;
+
+  vpx_wb_write_bit(wb, lf->mode_ref_delta_update);
+  if (!lf->mode_ref_delta_update)
+    return;
+
+  // Per reference frame deltas.
+  for (idx = 0; idx < MAX_REF_FRAMES; idx++) {
+    const int ref_delta = lf->ref_deltas[idx];
+    const int ref_changed = ref_delta != lf->last_ref_deltas[idx];
+
+    vpx_wb_write_bit(wb, ref_changed);
+    if (!ref_changed)
+      continue;
+    lf->last_ref_deltas[idx] = ref_delta;
+    vpx_wb_write_inv_signed_literal(wb, ref_delta, 6);
+  }
+
+  // Per mode deltas.
+  for (idx = 0; idx < MAX_MODE_LF_DELTAS; idx++) {
+    const int mode_delta = lf->mode_deltas[idx];
+    const int mode_changed = mode_delta != lf->last_mode_deltas[idx];
+
+    vpx_wb_write_bit(wb, mode_changed);
+    if (!mode_changed)
+      continue;
+    lf->last_mode_deltas[idx] = mode_delta;
+    vpx_wb_write_inv_signed_literal(wb, mode_delta, 6);
+  }
+}
+
+// Writes a quantizer delta: a presence bit, followed (only when the delta is
+// non-zero) by the value as an inverse signed literal whose width is 6 with
+// CONFIG_MISC_FIXES and 4 otherwise.
+static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
+  if (delta_q == 0) {
+    vpx_wb_write_bit(wb, 0);
+    return;
+  }
+
+  vpx_wb_write_bit(wb, 1);
+  vpx_wb_write_inv_signed_literal(wb, delta_q, CONFIG_MISC_FIXES ? 6 : 4);
+}
+/* Writes the frame quantization parameters to |wb|: the base q index as a
+ * QINDEX_BITS-wide literal, followed by the luma DC, chroma DC and chroma AC
+ * deltas, each coded with write_delta_q().
+ */
+static void encode_quantization(const VP10_COMMON *const cm,
+                                struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
+  write_delta_q(wb, cm->y_dc_delta_q);
+  write_delta_q(wb, cm->uv_dc_delta_q);
+  write_delta_q(wb, cm->uv_ac_delta_q);
+}
+/* Writes the segmentation syntax to the bit buffer |wb|.
+ * Layout: the enabled flag; then, if enabled, the map-update flag (written
+ * only for frames that are neither intra-only nor error-resilient, otherwise
+ * asserted to be 1), the temporal-update flag under the same condition
+ * (otherwise asserted to be 0), and finally the per-segment feature data when
+ * seg->update_data is set.  Without CONFIG_MISC_FIXES the tree and prediction
+ * probabilities are also written here, each as an update bit plus an 8-bit
+ * literal when it differs from MAX_PROB.
+ * Side effect: calls vp10_choose_segmap_coding_method(cm, xd) when the map is
+ * being updated, before the temporal-update flag is written.
+ */
+static void encode_segmentation(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                struct vpx_write_bit_buffer *wb) {
+  int i, j;
+
+  const struct segmentation *seg = &cm->seg;
+#if !CONFIG_MISC_FIXES
+  const struct segmentation_probs *segp = &cm->segp;
+#endif
+
+  vpx_wb_write_bit(wb, seg->enabled);
+  if (!seg->enabled)
+    return;
+
+  // Segmentation map
+  if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
+    vpx_wb_write_bit(wb, seg->update_map);
+  } else {
+    assert(seg->update_map == 1);
+  }
+  if (seg->update_map) {
+    // Select the coding strategy (temporal or spatial)
+    vp10_choose_segmap_coding_method(cm, xd);
+#if !CONFIG_MISC_FIXES
+    // Write out probabilities used to decode unpredicted macro-block segments
+    for (i = 0; i < SEG_TREE_PROBS; i++) {
+      const int prob = segp->tree_probs[i];
+      const int update = prob != MAX_PROB;
+      vpx_wb_write_bit(wb, update);
+      if (update)
+        vpx_wb_write_literal(wb, prob, 8);
+    }
+#endif
+
+    // Write out the chosen coding method.
+    if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
+      vpx_wb_write_bit(wb, seg->temporal_update);
+    } else {
+      assert(seg->temporal_update == 0);
+    }
+
+#if !CONFIG_MISC_FIXES
+    if (seg->temporal_update) {
+      for (i = 0; i < PREDICTION_PROBS; i++) {
+        const int prob = segp->pred_probs[i];
+        const int update = prob != MAX_PROB;
+        vpx_wb_write_bit(wb, update);
+        if (update)
+          vpx_wb_write_literal(wb, prob, 8);
+      }
+    }
+#endif
+  }
+
+  // Segmentation data
+  vpx_wb_write_bit(wb, seg->update_data);
+  if (seg->update_data) {
+    vpx_wb_write_bit(wb, seg->abs_delta);
+
+    for (i = 0; i < MAX_SEGMENTS; i++) {
+      for (j = 0; j < SEG_LVL_MAX; j++) {
+        const int active = segfeature_active(seg, i, j);
+        vpx_wb_write_bit(wb, active);
+        if (active) {
+          const int data = get_segdata(seg, i, j);
+          const int data_max = vp10_seg_feature_data_max(j);
+
+          if (vp10_is_segfeature_signed(j)) {
+            encode_unsigned_max(wb, abs(data), data_max);  // magnitude first
+            vpx_wb_write_bit(wb, data < 0);  // then the sign bit
+          } else {
+            encode_unsigned_max(wb, data, data_max);
+          }
+        }
+      }
+    }
+  }
+}
+
+#if CONFIG_MISC_FIXES
+// Sends probability updates for the segmentation symbols. Nothing is written
+// when segmentation is disabled. With temporal update the prediction-flag
+// probabilities are updated first and the tree probabilities are updated from
+// the mispredicted counts; otherwise the tree probabilities are updated from
+// the total counts.
+static void update_seg_probs(VP10_COMP *cpi, vpx_writer *w) {
+  VP10_COMMON *cm = &cpi->common;
+
+  if (!cm->seg.enabled)
+    return;
+
+  if (!cm->seg.temporal_update) {
+    prob_diff_update(vp10_segment_tree, cm->fc->seg.tree_probs,
+        cm->counts.seg.tree_total, MAX_SEGMENTS, w);
+  } else {
+    int p;
+
+    for (p = 0; p < PREDICTION_PROBS; p++) {
+      vp10_cond_prob_diff_update(w, &cm->fc->seg.pred_probs[p],
+          cm->counts.seg.pred[p]);
+    }
+
+    prob_diff_update(vp10_segment_tree, cm->fc->seg.tree_probs,
+        cm->counts.seg.tree_mispred, MAX_SEGMENTS, w);
+  }
+}
+
+// Writes the frame transform mode to the bit buffer: one bit telling whether
+// the mode is TX_MODE_SELECT, followed by the mode as a 2-bit literal when it
+// is not.
+static void write_txfm_mode(TX_MODE mode, struct vpx_write_bit_buffer *wb) {
+  const int is_select = (mode == TX_MODE_SELECT);
+
+  vpx_wb_write_bit(wb, is_select);
+  if (!is_select) {
+    vpx_wb_write_literal(wb, mode, 2);
+  }
+}
+#else
+// Writes the frame transform mode with the arithmetic writer: the mode
+// clamped to ALLOW_32X32 as a 2-bit literal, then, for modes at or above
+// ALLOW_32X32, one bit telling whether the mode is TX_MODE_SELECT.
+static void write_txfm_mode(TX_MODE mode, struct vpx_writer *wb) {
+  vpx_write_literal(wb, VPXMIN(mode, ALLOW_32X32), 2);
+
+  if (mode < ALLOW_32X32)
+    return;
+
+  vpx_write_bit(wb, mode == TX_MODE_SELECT);
+}
+#endif
+
+
+// Sends conditional probability updates for the transform size symbols,
+// converting the per-context counts in |counts| to branch counts first.
+// Does nothing unless the frame uses TX_MODE_SELECT.
+static void update_txfm_probs(VP10_COMMON *cm, vpx_writer *w,
+                              FRAME_COUNTS *counts) {
+  unsigned int branch_8x8[TX_SIZES - 3][2];
+  unsigned int branch_16x16[TX_SIZES - 2][2];
+  unsigned int branch_32x32[TX_SIZES - 1][2];
+  int ctx, node;
+
+  if (cm->tx_mode != TX_MODE_SELECT)
+    return;
+
+  for (ctx = 0; ctx < TX_SIZE_CONTEXTS; ctx++) {
+    vp10_tx_counts_to_branch_counts_8x8(counts->tx.p8x8[ctx], branch_8x8);
+    for (node = 0; node < TX_SIZES - 3; node++) {
+      vp10_cond_prob_diff_update(w, &cm->fc->tx_probs.p8x8[ctx][node],
+                                branch_8x8[node]);
+    }
+  }
+
+  for (ctx = 0; ctx < TX_SIZE_CONTEXTS; ctx++) {
+    vp10_tx_counts_to_branch_counts_16x16(counts->tx.p16x16[ctx],
+                                         branch_16x16);
+    for (node = 0; node < TX_SIZES - 2; node++) {
+      vp10_cond_prob_diff_update(w, &cm->fc->tx_probs.p16x16[ctx][node],
+                                branch_16x16[node]);
+    }
+  }
+
+  for (ctx = 0; ctx < TX_SIZE_CONTEXTS; ctx++) {
+    vp10_tx_counts_to_branch_counts_32x32(counts->tx.p32x32[ctx],
+                                         branch_32x32);
+    for (node = 0; node < TX_SIZES - 1; node++) {
+      vp10_cond_prob_diff_update(w, &cm->fc->tx_probs.p32x32[ctx][node],
+                                branch_32x32[node]);
+    }
+  }
+}
+
+// Writes the frame-level interpolation filter: one bit telling whether it is
+// SWITCHABLE, followed by the filter as a 2-bit literal when it is not.
+static void write_interp_filter(INTERP_FILTER filter,
+                                struct vpx_write_bit_buffer *wb) {
+  const int is_switchable = (filter == SWITCHABLE);
+
+  vpx_wb_write_bit(wb, is_switchable);
+  if (!is_switchable) {
+    vpx_wb_write_literal(wb, filter, 2);
+  }
+}
+
+// When the frame-level filter is SWITCHABLE but the counts show that exactly
+// one filter was actually selected across all contexts, replaces the
+// frame-level filter with that one. Otherwise leaves cm->interp_filter alone.
+static void fix_interp_filter(VP10_COMMON *cm, FRAME_COUNTS *counts) {
+  int per_filter[SWITCHABLE_FILTERS];
+  int filter, ctx;
+  int filters_used = 0;
+
+  if (cm->interp_filter != SWITCHABLE)
+    return;
+
+  // Total the selections of each filter over every context.
+  for (filter = 0; filter < SWITCHABLE_FILTERS; ++filter) {
+    per_filter[filter] = 0;
+    for (ctx = 0; ctx < SWITCHABLE_FILTER_CONTEXTS; ++ctx)
+      per_filter[filter] += counts->switchable_interp[ctx][filter];
+    if (per_filter[filter] > 0)
+      ++filters_used;
+  }
+
+  if (filters_used != 1)
+    return;
+
+  // Only one filter is used, so signal it at frame level.
+  for (filter = 0; filter < SWITCHABLE_FILTERS; ++filter) {
+    if (per_filter[filter]) {
+      cm->interp_filter = filter;
+      break;
+    }
+  }
+}
+
+// Writes the tile layout. The log2 tile column count is sent as a run of 1
+// bits, one per step above the minimum allowed for this frame width, closed
+// by a 0 bit unless the maximum was reached. The log2 tile row count is sent
+// as one bit (zero or not) plus, when non-zero, a second bit (one or not).
+static void write_tile_info(const VP10_COMMON *const cm,
+                            struct vpx_write_bit_buffer *wb) {
+  const int log2_rows = cm->log2_tile_rows;
+  int min_log2_cols, max_log2_cols, extra;
+
+  vp10_get_tile_n_bits(cm->mi_cols, &min_log2_cols, &max_log2_cols);
+
+  // columns
+  for (extra = cm->log2_tile_cols - min_log2_cols; extra != 0; --extra)
+    vpx_wb_write_bit(wb, 1);
+
+  if (cm->log2_tile_cols < max_log2_cols)
+    vpx_wb_write_bit(wb, 0);
+
+  // rows
+  vpx_wb_write_bit(wb, log2_rows != 0);
+  if (log2_rows != 0)
+    vpx_wb_write_bit(wb, log2_rows != 1);
+}
+
+// Builds the reference refresh mask for the current frame by shifting each
+// refresh flag to the bit position of the buffer index it targets.
+static int get_refresh_mask(VP10_COMP *cpi) {
+  int arf_idx;
+
+  if (vp10_preserve_existing_gf(cpi)) {
+    // The existing golden frame is being kept as the new ARF. For now it
+    // stays in the GF slot, and a golden refresh with the current decoded
+    // frame is directed at the ARF slot instead. The gld_fb_idx / alt_fb_idx
+    // swap that completes this happens later, in
+    // vp10_encoder.c:vp10_update_reference_frames(), so that it is done
+    // outside of the recode loop.
+    // Note: this is specific to using the ARF as a forward reference and
+    // needs generalizing for other uses (like RTC/temporal scalability).
+    return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+           (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+  }
+
+  arf_idx = cpi->alt_fb_idx;
+  if (cpi->multi_arf_allowed && cpi->oxcf.pass == 2) {
+    // In the second pass with multiple ARFs allowed, the ARF slot comes from
+    // the GF group.
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    arf_idx = gf_group->arf_update_idx[gf_group->index];
+  }
+
+  return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+         (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
+         (cpi->refresh_alt_ref_frame << arf_idx);
+}
+/* Encodes every tile of the frame into |data_ptr|, one after another in
+ * row-major tile order.  Every tile except the last one is preceded by a
+ * 4-byte size field written with mem_put_le32(); the stored size is the
+ * writer's final position minus CONFIG_MISC_FIXES.  The above segmentation
+ * context is cleared once before the first tile.
+ * Returns the total number of bytes written, size fields included.
+ * |*max_tile_sz| receives the largest stored tile size (0 when no size field
+ * was written, i.e. for a single tile).
+ */
+static size_t encode_tiles(VP10_COMP *cpi, uint8_t *data_ptr,
+                           unsigned int *max_tile_sz) {
+  VP10_COMMON *const cm = &cpi->common;
+  vpx_writer residual_bc;
+  int tile_row, tile_col;
+  TOKENEXTRA *tok_end;
+  size_t total_size = 0;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  unsigned int max_tile = 0;
+
+  memset(cm->above_seg_context, 0,
+         sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
+
+  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+      int tile_idx = tile_row * tile_cols + tile_col;
+      TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
+
+      tok_end = cpi->tile_tok[tile_row][tile_col] +
+          cpi->tok_count[tile_row][tile_col];
+
+      if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
+        vpx_start_encode(&residual_bc, data_ptr + total_size + 4);  // room for the size field
+      else
+        vpx_start_encode(&residual_bc, data_ptr + total_size);  // last tile: no size field
+
+      write_modes(cpi, &cpi->tile_data[tile_idx].tile_info,
+                  &residual_bc, &tok, tok_end);
+      assert(tok == tok_end);
+      vpx_stop_encode(&residual_bc);
+      if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
+        unsigned int tile_sz;
+
+        // size of this tile
+        assert(residual_bc.pos > 0);
+        tile_sz = residual_bc.pos - CONFIG_MISC_FIXES;
+        mem_put_le32(data_ptr + total_size, tile_sz);
+        max_tile = max_tile > tile_sz ? max_tile : tile_sz;
+        total_size += 4;
+      }
+
+      total_size += residual_bc.pos;
+    }
+  }
+  *max_tile_sz = max_tile;
+
+  return total_size;
+}
+
+// Writes the render size: a flag telling whether it differs from the coded
+// frame size and, only if it does, the render width and height, each minus
+// one, as 16-bit literals.
+static void write_render_size(const VP10_COMMON *cm,
+                              struct vpx_write_bit_buffer *wb) {
+  const int width_differs = cm->width != cm->render_width;
+  const int height_differs = cm->height != cm->render_height;
+  const int scaling_active = width_differs || height_differs;
+
+  vpx_wb_write_bit(wb, scaling_active);
+  if (!scaling_active)
+    return;
+
+  vpx_wb_write_literal(wb, cm->render_width - 1, 16);
+  vpx_wb_write_literal(wb, cm->render_height - 1, 16);
+}
+/* Writes the coded frame size to |wb| as width - 1 and height - 1, each a
+ * 16-bit literal, followed by the render size via write_render_size().
+ */
+static void write_frame_size(const VP10_COMMON *cm,
+                             struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, cm->width - 1, 16);
+  vpx_wb_write_literal(wb, cm->height - 1, 16);
+
+  write_render_size(cm, wb);
+}
+
+// Writes the frame size relative to the reference frames. For each reference
+// from LAST_FRAME to ALTREF_FRAME one bit tells whether the current frame has
+// the same size as that reference (with CONFIG_MISC_FIXES the render size
+// must match as well); the scan stops at the first match. If no reference
+// matches, the width and height (minus one) are written explicitly as 16-bit
+// literals. The render size is written only on a mismatch with
+// CONFIG_MISC_FIXES, and unconditionally without it.
+static void write_frame_size_with_refs(VP10_COMP *cpi,
+                                       struct vpx_write_bit_buffer *wb) {
+  VP10_COMMON *const cm = &cpi->common;
+  MV_REFERENCE_FRAME ref_frame;
+  int found = 0;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_buffer(cpi, ref_frame);
+
+    if (ref_buf != NULL) {
+      found = cm->width == ref_buf->y_crop_width &&
+              cm->height == ref_buf->y_crop_height;
+#if CONFIG_MISC_FIXES
+      found = found && cm->render_width == ref_buf->render_width &&
+              cm->render_height == ref_buf->render_height;
+#endif
+    }
+
+    vpx_wb_write_bit(wb, found);
+    if (found)
+      break;
+  }
+
+  if (!found) {
+    vpx_wb_write_literal(wb, cm->width - 1, 16);
+    vpx_wb_write_literal(wb, cm->height - 1, 16);
+  }
+
+#if CONFIG_MISC_FIXES
+  if (!found)
+    write_render_size(cm, wb);
+#else
+  write_render_size(cm, wb);
+#endif
+}
+
+static void write_sync_code(struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, VP10_SYNC_CODE_0, 8);  // three sync bytes,
+  vpx_wb_write_literal(wb, VP10_SYNC_CODE_1, 8);  // 8 bits each, always
+  vpx_wb_write_literal(wb, VP10_SYNC_CODE_2, 8);  // in this order
+}
+
+// Writes the bitstream profile as a short literal: two bits for profiles
+// 0 through 2 and three bits for profile 3. Any other value writes nothing
+// and trips the assert.
+static void write_profile(BITSTREAM_PROFILE profile,
+                          struct vpx_write_bit_buffer *wb) {
+  if (profile == PROFILE_0) {
+    vpx_wb_write_literal(wb, 0, 2);
+  } else if (profile == PROFILE_1) {
+    vpx_wb_write_literal(wb, 2, 2);
+  } else if (profile == PROFILE_2) {
+    vpx_wb_write_literal(wb, 1, 2);
+  } else if (profile == PROFILE_3) {
+    // The only profile that takes a third bit.
+    vpx_wb_write_literal(wb, 6, 3);
+  } else {
+    // Not one of the four known profiles.
+    assert(0);
+  }
+}
+
+static void write_bitdepth_colorspace_sampling(
+    VP10_COMMON *const cm, struct vpx_write_bit_buffer *wb) {
+  if (cm->profile >= PROFILE_2) {  // bit depth is coded for these only
+    assert(cm->bit_depth > VPX_BITS_8);
+    vpx_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);  // 0: 10-bit
+  }
+  vpx_wb_write_literal(wb, cm->color_space, 3);
+  if (cm->color_space != VPX_CS_SRGB) {
+    // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+    vpx_wb_write_bit(wb, cm->color_range);
+    if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
+      assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
+      vpx_wb_write_bit(wb, cm->subsampling_x);
+      vpx_wb_write_bit(wb, cm->subsampling_y);
+      vpx_wb_write_bit(wb, 0);  // unused
+    } else {  // other profiles: subsampling is asserted, not written
+      assert(cm->subsampling_x == 1 && cm->subsampling_y == 1);
+    }
+  } else {  // sRGB: no range or subsampling bits are written
+    assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3);
+    vpx_wb_write_bit(wb, 0);  // unused
+  }
+}
+
+static void write_uncompressed_header(VP10_COMP *cpi,
+                                      struct vpx_write_bit_buffer *wb) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  // Fixed prefix: frame marker, profile, then four single-bit fields.
+  vpx_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
+
+  write_profile(cm->profile, wb);
+
+  vpx_wb_write_bit(wb, 0);  // show_existing_frame
+  vpx_wb_write_bit(wb, cm->frame_type);
+  vpx_wb_write_bit(wb, cm->show_frame);
+  vpx_wb_write_bit(wb, cm->error_resilient_mode);
+
+  if (cm->frame_type == KEY_FRAME) {
+    write_sync_code(wb);
+    write_bitdepth_colorspace_sampling(cm, wb);
+    write_frame_size(cm, wb);
+  } else {
+    if (!cm->show_frame)
+      vpx_wb_write_bit(wb, cm->intra_only);
+    // Reset-context bits are not written in error-resilient mode.
+    if (!cm->error_resilient_mode) {
+#if CONFIG_MISC_FIXES
+      if (cm->intra_only) {
+        vpx_wb_write_bit(wb,
+                         cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
+      } else {
+        vpx_wb_write_bit(wb,
+                         cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE);
+        if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE)
+          vpx_wb_write_bit(wb,
+                           cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
+      }
+#else
+      static const int reset_frame_context_conv_tbl[3] = { 0, 2, 3 };
+
+      vpx_wb_write_literal(wb,
+          reset_frame_context_conv_tbl[cm->reset_frame_context], 2);
+#endif
+    }
+
+    if (cm->intra_only) {
+      write_sync_code(wb);
+
+#if CONFIG_MISC_FIXES
+      write_bitdepth_colorspace_sampling(cm, wb);
+#else
+      // Note for profile 0, 420 8bpp is assumed.
+      if (cm->profile > PROFILE_0) {
+        write_bitdepth_colorspace_sampling(cm, wb);
+      }
+#endif
+
+      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      write_frame_size(cm, wb);
+    } else {
+      MV_REFERENCE_FRAME ref_frame;
+      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+        assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+        vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+                             REF_FRAMES_LOG2);
+        vpx_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
+      }
+
+      write_frame_size_with_refs(cpi, wb);
+
+      vpx_wb_write_bit(wb, cm->allow_high_precision_mv);
+
+      fix_interp_filter(cm, cpi->td.counts);
+      write_interp_filter(cm->interp_filter, wb);
+    }
+  }
+  // Refresh-context bits are likewise skipped in error-resilient mode.
+  if (!cm->error_resilient_mode) {
+    vpx_wb_write_bit(wb,
+                     cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF);
+#if CONFIG_MISC_FIXES
+    if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF)
+#endif
+      vpx_wb_write_bit(wb, cm->refresh_frame_context !=
+                               REFRESH_FRAME_CONTEXT_BACKWARD);
+  }
+
+  vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
+
+  encode_loopfilter(&cm->lf, wb);
+  encode_quantization(cm, wb);
+  encode_segmentation(cm, xd, wb);
+#if CONFIG_MISC_FIXES
+  if (!cm->seg.enabled && xd->lossless[0])
+    cm->tx_mode = TX_4X4;  // lossless, no segmentation: forced, not written
+  else
+    write_txfm_mode(cm->tx_mode, wb);
+  if (cpi->allow_comp_inter_inter) {
+    const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
+    const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
+
+    vpx_wb_write_bit(wb, use_hybrid_pred);
+    if (!use_hybrid_pred)
+      vpx_wb_write_bit(wb, use_compound_pred);
+  }
+#endif
+
+  write_tile_info(cm, wb);
+}
+
+static size_t write_compressed_header(VP10_COMP *cpi, uint8_t *data) {
+  VP10_COMMON *const cm = &cpi->common;
+  FRAME_CONTEXT *const fc = cm->fc;
+  FRAME_COUNTS *counts = cpi->td.counts;
+  vpx_writer header_bc;
+  int i;
+#if CONFIG_MISC_FIXES
+  int j;
+#endif
+  // All syntax below is written through header_bc into data.
+  vpx_start_encode(&header_bc, data);
+
+#if !CONFIG_MISC_FIXES
+  if (cpi->td.mb.e_mbd.lossless[0]) {
+    cm->tx_mode = TX_4X4;
+  } else {
+    write_txfm_mode(cm->tx_mode, &header_bc);
+    update_txfm_probs(cm, &header_bc, counts);
+  }
+#else
+  update_txfm_probs(cm, &header_bc, counts);
+#endif
+  update_coef_probs(cpi, &header_bc);
+  update_skip_probs(cm, &header_bc, counts);
+#if CONFIG_MISC_FIXES
+  update_seg_probs(cpi, &header_bc);
+
+  for (i = 0; i < INTRA_MODES; ++i)
+    prob_diff_update(vp10_intra_mode_tree, fc->uv_mode_prob[i],
+                     counts->uv_mode[i], INTRA_MODES, &header_bc);
+
+  for (i = 0; i < PARTITION_CONTEXTS; ++i)
+    prob_diff_update(vp10_partition_tree, fc->partition_prob[i],
+                     counts->partition[i], PARTITION_TYPES, &header_bc);
+#endif
+  // Intra-only frames start from the default kf_y_prob table.
+  if (frame_is_intra_only(cm)) {
+    vp10_copy(cm->kf_y_prob, vp10_kf_y_mode_prob);
+#if CONFIG_MISC_FIXES
+    for (i = 0; i < INTRA_MODES; ++i)
+      for (j = 0; j < INTRA_MODES; ++j)
+        prob_diff_update(vp10_intra_mode_tree, cm->kf_y_prob[i][j],
+                         counts->kf_y_mode[i][j], INTRA_MODES, &header_bc);
+#endif
+  } else {
+    for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+      prob_diff_update(vp10_inter_mode_tree, cm->fc->inter_mode_probs[i],
+                       counts->inter_mode[i], INTER_MODES, &header_bc);
+
+    if (cm->interp_filter == SWITCHABLE)
+      update_switchable_interp_probs(cm, &header_bc, counts);
+
+    for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+      vp10_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
+                                counts->intra_inter[i]);
+
+    if (cpi->allow_comp_inter_inter) {
+      const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
+#if !CONFIG_MISC_FIXES
+      const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
+
+      vpx_write_bit(&header_bc, use_compound_pred);
+      if (use_compound_pred) {
+        vpx_write_bit(&header_bc, use_hybrid_pred);
+        if (use_hybrid_pred)
+          for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+            vp10_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
+                                      counts->comp_inter[i]);
+      }
+#else
+      if (use_hybrid_pred)
+        for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+          vp10_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
+                                     counts->comp_inter[i]);
+#endif
+    }
+
+    if (cm->reference_mode != COMPOUND_REFERENCE) {
+      for (i = 0; i < REF_CONTEXTS; i++) {
+        vp10_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
+                                  counts->single_ref[i][0]);
+        vp10_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
+                                  counts->single_ref[i][1]);
+      }
+    }
+
+    if (cm->reference_mode != SINGLE_REFERENCE)
+      for (i = 0; i < REF_CONTEXTS; i++)
+        vp10_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
+                                  counts->comp_ref[i]);
+
+    for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+      prob_diff_update(vp10_intra_mode_tree, cm->fc->y_mode_prob[i],
+                       counts->y_mode[i], INTRA_MODES, &header_bc);
+
+#if !CONFIG_MISC_FIXES
+    for (i = 0; i < PARTITION_CONTEXTS; ++i)
+      prob_diff_update(vp10_partition_tree, fc->partition_prob[i],
+                       counts->partition[i], PARTITION_TYPES, &header_bc);
+#endif
+
+    vp10_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc,
+                        &counts->mv);
+    update_ext_tx_probs(cm, &header_bc);
+  }
+  // The caller stores the returned size in 16 bits, hence the assert.
+  vpx_stop_encode(&header_bc);
+  assert(header_bc.pos <= 0xffff);
+
+  return header_bc.pos;
+}
+
+#if CONFIG_MISC_FIXES
+static int remux_tiles(uint8_t *dest, const int sz,
+                       const int n_tiles, const int mag) {
+  int rpos = 0, wpos = 0, n;  // read / write offsets into dest
+  // In-place pass: every 4-byte size prefix shrinks to mag + 1 bytes.
+  for (n = 0; n < n_tiles; n++) {
+    int tile_sz;
+    // The last tile carries no prefix and takes all remaining input.
+    if (n == n_tiles - 1) {
+      tile_sz = sz - rpos;
+    } else {
+      tile_sz = mem_get_le32(&dest[rpos]) + 1;  // prefix stores size - 1
+      rpos += 4;
+      switch (mag) {
+        case 0:
+          dest[wpos] = tile_sz - 1;
+          break;
+        case 1:
+          mem_put_le16(&dest[wpos], tile_sz - 1);
+          break;
+        case 2:
+          mem_put_le24(&dest[wpos], tile_sz - 1);
+          break;
+        case 3:  // remuxing should only happen if mag < 3
+        default:
+          assert("Invalid value for tile size magnitude" && 0);
+      }
+      wpos += mag + 1;
+    }
+    // Ranges can overlap (same buffer, wpos <= rpos), hence memmove.
+    memmove(&dest[wpos], &dest[rpos], tile_sz);
+    wpos += tile_sz;
+    rpos += tile_sz;
+  }
+  // The output must be strictly shorter, and all input consumed.
+  assert(rpos > wpos);
+  assert(rpos == sz);
+
+  return wpos;  // new total size in bytes
+}
+#endif
+
+void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size) {
+  uint8_t *data = dest;
+  size_t first_part_size, uncompressed_hdr_size, data_sz;
+  struct vpx_write_bit_buffer wb = {data, 0};
+  struct vpx_write_bit_buffer saved_wb;
+  unsigned int max_tile;
+#if CONFIG_MISC_FIXES
+  VP10_COMMON *const cm = &cpi->common;
+  const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
+  const int have_tiles = n_log2_tiles > 0;
+#else
+  const int have_tiles = 0;  // we have tiles, but we don't want to write a
+                             // tile size marker in the header
+#endif
+  // Output order: uncompressed header, compressed header, tile data.
+  write_uncompressed_header(cpi, &wb);
+  saved_wb = wb;  // remembers where the placeholder bits start
+  // First-part size is not known yet: write zeros now, patch them later.
+  vpx_wb_write_literal(&wb, 0, 16 + have_tiles * 2);
+
+  uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
+  data += uncompressed_hdr_size;
+
+  vpx_clear_system_state();
+
+  first_part_size = write_compressed_header(cpi, data);
+  data += first_part_size;
+
+  data_sz = encode_tiles(cpi, data, &max_tile);
+#if CONFIG_MISC_FIXES
+  if (max_tile > 0) {
+    int mag;
+    unsigned int mask;
+
+    // Choose the (tile size) magnitude
+    for (mag = 0, mask = 0xff; mag < 4; mag++) {
+      if (max_tile <= mask)
+        break;
+      mask <<= 8;
+      mask |= 0xff;
+    }
+    assert(n_log2_tiles > 0);
+    vpx_wb_write_literal(&saved_wb, mag, 2);  // patches the 2 marker bits
+    if (mag < 3)
+      data_sz = remux_tiles(data, (int)data_sz, 1 << n_log2_tiles, mag);
+  } else {
+    assert(n_log2_tiles == 0);
+  }
+#endif
+  data += data_sz;
+
+  // TODO(jbb): Figure out what to do if first_part_size > 16 bits.
+  vpx_wb_write_literal(&saved_wb, (int)first_part_size, 16);
+  // Total bytes produced, counted from the start of dest.
+  *size = data - dest;
+}
diff --git a/libs/libvpx/vp10/encoder/bitstream.h b/libs/libvpx/vp10/encoder/bitstream.h
new file mode 100644
index 0000000000..b1da89f1d7
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/bitstream.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_ENCODER_BITSTREAM_H_
+#define VP10_ENCODER_BITSTREAM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp10/encoder/encoder.h"
+
+void vp10_encode_token_init(void);  // (void) makes this a real C prototype
+void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size);
+
+static INLINE int vp10_preserve_existing_gf(VP10_COMP *cpi) {
+  if (cpi->multi_arf_allowed || !cpi->refresh_golden_frame) return 0;
+  return cpi->rc.is_src_frame_alt_ref != 0;  // 1 only when all three hold
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_BITSTREAM_H_
diff --git a/libs/libvpx/vp10/encoder/block.h b/libs/libvpx/vp10/encoder/block.h
new file mode 100644
index 0000000000..ab0252baae
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/block.h
@@ -0,0 +1,143 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_BLOCK_H_
+#define VP10_ENCODER_BLOCK_H_
+
+#include "vp10/common/entropymv.h"
+#include "vp10/common/entropy.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  unsigned int sse;
+  int sum;
+  unsigned int var;
+} diff;
+
+struct macroblock_plane {
+  DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
+  tran_low_t *qcoeff;
+  tran_low_t *coeff;
+  uint16_t *eobs;
+  struct buf_2d src;
+
+  // Quantizer settings
+  int16_t *quant_fp;
+  int16_t *round_fp;
+  int16_t *quant;
+  int16_t *quant_shift;
+  int16_t *zbin;
+  int16_t *round;
+
+  int64_t quant_thred[2];
+};
+
+/* The [2] dimension is for whether we skip the EOB node (i.e. if previous
+ * coefficient in this block was zero) or not. */
+typedef unsigned int vp10_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][2]
+                                   [COEFF_CONTEXTS][ENTROPY_TOKENS];
+
+typedef struct {
+  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+  uint8_t mode_context[MAX_REF_FRAMES];
+} MB_MODE_INFO_EXT;
+
+typedef struct macroblock MACROBLOCK;
+struct macroblock {
+  struct macroblock_plane plane[MAX_MB_PLANE];
+
+  MACROBLOCKD e_mbd;
+  MB_MODE_INFO_EXT *mbmi_ext;
+  int skip_block;
+  int select_tx_size;
+  int skip_recode;
+  int skip_optimize;
+  int q_index;
+
+  int errorperbit;
+  int sadperbit16;
+  int sadperbit4;
+  int rddiv;
+  int rdmult;
+  int mb_energy;
+  int * m_search_count_ptr;
+  int * ex_search_count_ptr;
+
+  // These are set to their default values at the beginning, and then adjusted
+  // further in the encoding process.
+  BLOCK_SIZE min_partition_size;
+  BLOCK_SIZE max_partition_size;
+
+  int mv_best_ref_index[MAX_REF_FRAMES];
+  unsigned int max_mv_context[MAX_REF_FRAMES];
+  unsigned int source_variance;
+  unsigned int pred_sse[MAX_REF_FRAMES];
+  int pred_mv_sad[MAX_REF_FRAMES];
+
+  int nmvjointcost[MV_JOINTS];
+  int *nmvcost[2];
+  int *nmvcost_hp[2];
+  int **mvcost;
+
+  int nmvjointsadcost[MV_JOINTS];
+  int *nmvsadcost[2];
+  int *nmvsadcost_hp[2];
+  int **mvsadcost;
+
+  // These define limits to motion vector components to prevent them
+  // from extending outside the UMV borders
+  int mv_col_min;
+  int mv_col_max;
+  int mv_row_min;
+  int mv_row_max;
+
+  // Notes transform blocks where no coefficients are coded.
+  // Set during mode selection. Read during block encoding.
+  uint8_t zcoeff_blk[TX_SIZES][256];
+
+  int skip;
+
+  int encode_breakout;
+
+  // note that token_costs is the cost when eob node is skipped
+  vp10_coeff_cost token_costs[TX_SIZES];
+
+  int optimize;
+
+  // indicate if it is in the rd search loop or encoding process
+  int use_lp32x32fdct;
+
+  // use fast quantization process
+  int quant_fp;
+
+  // skip forward transform and quantization
+  uint8_t skip_txfm[MAX_MB_PLANE << 2];
+  #define SKIP_TXFM_NONE 0
+  #define SKIP_TXFM_AC_DC 1
+  #define SKIP_TXFM_AC_ONLY 2
+
+  int64_t bsse[MAX_MB_PLANE << 2];
+
+  // Used to store sub partition's choices.
+  MV pred_mv[MAX_REF_FRAMES];
+
+  // Strong color activity detection. Used in RTC coding mode to enhance
+  // the visual quality at the boundary of moving color objects.
+  uint8_t color_sensitivity[2];
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_BLOCK_H_
diff --git a/libs/libvpx/vp10/encoder/blockiness.c b/libs/libvpx/vp10/encoder/blockiness.c
new file mode 100644
index 0000000000..ede13e0e59
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/blockiness.c
@@ -0,0 +1,141 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp10/common/common.h"
+#include "vp10/common/filter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+static int horizontal_filter(const uint8_t *s) {
+  return 2 * (s[1] - s[-2]) + 6 * (s[-1] - s[0]);  // taps at -2, -1, 0, +1
+}
+
+static int vertical_filter(const uint8_t *s, int p) {
+  return 2 * (s[p] - s[-2 * p]) + 6 * (s[-p] - s[0]);  // p is the row pitch
+}
+
+static int variance(int sum, int sum_squared, int size) {
+  return sum_squared / size - (sum / size) * (sum / size);  // E[x^2]-E[x]^2
+}
+// Calculate a blockiness level for a vertical block edge.
+// This function returns a new blockiness metric that's defined as
+
+//              p0 p1 p2 p3
+//              q0 q1 q2 q3
+// block edge ->
+//              r0 r1 r2 r3
+//              s0 s1 s2 s3
+
+// blockiness =  p0*-2+q0*6+r0*-6+s0*2 +
+//               p1*-2+q1*6+r1*-6+s1*2 +
+//               p2*-2+q2*6+r2*-6+s2*2 +
+//               p3*-2+q3*6+r3*-6+s3*2 ;
+
+// reconstructed_blockiness = max(abs(blockiness from reconstructed buffer) -
+//                                abs(blockiness from source buffer), 0)
+//
+// I make the assumption that flat blocks are much more visible than high
+// contrast blocks. As such, I scale the result of the blockiness calc
+// by dividing the blockiness by the variance of the pixels on either side
+// of the edge as follows:
+// var_0 = (q0^2+q1^2+q2^2+q3^2) / 4 - ((q0 + q1 + q2 + q3) / 4 )^2
+// var_1 = (r0^2+r1^2+r2^2+r3^2) / 4 - ((r0 + r1 + r2 + r3) / 4 )^2
+// The returned blockiness is the scaled value
+// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ;
+static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r,
+                               int rp, int size) {
+  int src_edge = 0, rec_edge = 0;  // summed edge-filter responses
+  int cur_sum = 0, cur_sq = 0;     // statistics of s[0] over the rows
+  int prev_sum = 0, prev_sq = 0;   // statistics of s[-1] over the rows
+  int divisor;
+  int row;
+
+  for (row = 0; row < size; ++row) {
+    const int cur = s[0];
+    const int prev = s[-1];
+    src_edge += horizontal_filter(s);
+    rec_edge += horizontal_filter(r);
+    cur_sum += cur;
+    cur_sq += cur * cur;
+    prev_sum += prev;
+    prev_sq += prev * prev;
+    s += sp;
+    r += rp;
+  }
+  divisor = 1 + variance(cur_sum, cur_sq, size) +
+            variance(prev_sum, prev_sq, size);
+  src_edge = abs(src_edge);
+  rec_edge = abs(rec_edge);
+
+  // Score only the blockiness that r shows in excess of s.
+  if (rec_edge <= src_edge) return 0;
+  return (rec_edge - src_edge) / divisor;
+}
+
+// Same measure for a horizontal block edge: walks size columns, comparing
+// the row at the edge (s[0]) with the row directly above it (s[-sp]).
+static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r,
+                                 int rp, int size) {
+  int src_edge = 0, rec_edge = 0;   // summed edge-filter responses
+  int cur_sum = 0, cur_sq = 0;      // statistics of the edge row
+  int above_sum = 0, above_sq = 0;  // statistics of the row above
+  int divisor;
+  int col;
+
+  for (col = 0; col < size; ++col) {
+    const int cur = s[col];
+    const int above = s[col - sp];
+    src_edge += vertical_filter(s + col, sp);
+    rec_edge += vertical_filter(r + col, rp);
+    cur_sum += cur;
+    cur_sq += cur * cur;
+    above_sum += above;
+    above_sq += above * above;
+  }
+
+  divisor = 1 + variance(cur_sum, cur_sq, size) +
+            variance(above_sum, above_sq, size);
+  src_edge = abs(src_edge);
+  rec_edge = abs(rec_edge);
+
+  // Only the excess of r over s counts; the variance-based divisor makes
+  // the same excess weigh less where the two rows vary a lot.
+  if (rec_edge <= src_edge) return 0;
+  return (rec_edge - src_edge) / divisor;
+}
+
+// Frame-average blockiness on a 4x4 grid; 0 for frames under 16 pixels.
+double vp10_get_blockiness(const unsigned char *img1, int img1_pitch,
+                          const unsigned char *img2, int img2_pitch,
+                          int width, int height ) {
+  const int num_blocks = width * height / 16;  // integer count, as before
+  double blockiness = 0;
+  int i, j;
+  vpx_clear_system_state();
+  if (num_blocks <= 0) return 0;  // the final division used to be 0 / 0
+  for (i = 0; i < height; i += 4, img1 += img1_pitch * 4,
+       img2 += img2_pitch * 4) {
+    for (j = 0; j < width; j += 4) {
+      if (i > 0 && j > 0) {  // the filters read pixels before the edge
+        blockiness += blockiness_vertical(img1 + j, img1_pitch,
+                                          img2 + j, img2_pitch, 4);
+        blockiness += blockiness_horizontal(img1 + j, img1_pitch,
+                                            img2 + j, img2_pitch, 4);
+      }
+    }
+  }
+  return blockiness / num_blocks;
+}
diff --git a/libs/libvpx/vp10/encoder/context_tree.c b/libs/libvpx/vp10/encoder/context_tree.c
new file mode 100644
index 0000000000..6c056d28e1
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/context_tree.c
@@ -0,0 +1,166 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/encoder/context_tree.h"
+#include "vp10/encoder/encoder.h"
+
+static const BLOCK_SIZE square[] = {
+  BLOCK_8X8,
+  BLOCK_16X16,
+  BLOCK_32X32,
+  BLOCK_64X64,
+};
+
+static void alloc_mode_context(VP10_COMMON *cm, int num_4x4_blk,
+                               PICK_MODE_CONTEXT *ctx) {
+  const int num_blk = (num_4x4_blk < 4 ? 4 : num_4x4_blk);  // at least 4
+  const int num_pix = num_blk << 4;  // 16 entries per block
+  int i, k;
+  ctx->num_4x4_blk = num_blk;
+  // Per-block flags first, then three buffer slots for every plane.
+  CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+                  vpx_calloc(num_blk, sizeof(uint8_t)));
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (k = 0; k < 3; ++k) {
+      CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
+                      vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
+      CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
+                      vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
+      CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
+                      vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
+      CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
+                      vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
+      ctx->coeff_pbuf[i][k]   = ctx->coeff[i][k];  // *_pbuf start as aliases
+      ctx->qcoeff_pbuf[i][k]  = ctx->qcoeff[i][k];
+      ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
+      ctx->eobs_pbuf[i][k]    = ctx->eobs[i][k];
+    }
+  }
+}
+
+// Frees every buffer ctx points to and resets those pointers to NULL.
+static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
+  int plane, slot, map;
+  vpx_free(ctx->zcoeff_blk);
+  ctx->zcoeff_blk = NULL;
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    for (slot = 0; slot < 3; ++slot) {
+      vpx_free(ctx->coeff[plane][slot]);
+      vpx_free(ctx->qcoeff[plane][slot]);
+      vpx_free(ctx->dqcoeff[plane][slot]);
+      vpx_free(ctx->eobs[plane][slot]);
+      ctx->coeff[plane][slot] = NULL;
+      ctx->qcoeff[plane][slot] = NULL;
+      ctx->dqcoeff[plane][slot] = NULL;
+      ctx->eobs[plane][slot] = NULL;
+    }
+  }
+  for (map = 0; map < 2; ++map) {
+    vpx_free(ctx->color_index_map[map]);
+    ctx->color_index_map[map] = NULL;
+  }
+}
+
+static void alloc_tree_contexts(VP10_COMMON *cm, PC_TREE *tree,
+                                int num_4x4_blk) {
+  const int half_blk = num_4x4_blk / 2;  // size of each rectangular half
+  alloc_mode_context(cm, num_4x4_blk, &tree->none);
+  alloc_mode_context(cm, half_blk, &tree->horizontal[0]);
+  alloc_mode_context(cm, half_blk, &tree->vertical[0]);
+  if (num_4x4_blk <= 4) {  // smallest size: second halves are just zeroed
+    memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1]));
+    memset(&tree->vertical[1], 0, sizeof(tree->vertical[1]));
+  } else {
+    alloc_mode_context(cm, half_blk, &tree->horizontal[1]);
+    alloc_mode_context(cm, half_blk, &tree->vertical[1]);
+  }
+}
+
+// Releases the five mode contexts embedded in one tree node.
+static void free_tree_contexts(PC_TREE *tree) {
+  int half;
+  free_mode_context(&tree->none);
+  for (half = 0; half < 2; ++half) free_mode_context(&tree->horizontal[half]);
+  for (half = 0; half < 2; ++half) free_mode_context(&tree->vertical[half]);
+}
+
+// Sets up the tree of contexts for td. Every square partition level gets
+// contexts for none, horizontal, vertical and split, together with a
+// block_size value. Nodes are stored leaves first (64, then 16, 4 and 1);
+// the last of them becomes td->pc_root.
+void vp10_setup_pc_tree(VP10_COMMON *cm, ThreadData *td) {
+  int i, j;
+  const int leaf_nodes = 64;
+  const int tree_nodes = 64 + 16 + 4 + 1;
+  int pc_tree_index = 0;
+  PC_TREE *this_pc;
+  PICK_MODE_CONTEXT *this_leaf;
+  int square_index = 1;
+  int nodes;
+  // Only the old arrays are released here, not buffers their entries own.
+  vpx_free(td->leaf_tree);
+  CHECK_MEM_ERROR(cm, td->leaf_tree, vpx_calloc(leaf_nodes,
+                                                sizeof(*td->leaf_tree)));
+  vpx_free(td->pc_tree);
+  CHECK_MEM_ERROR(cm, td->pc_tree, vpx_calloc(tree_nodes,
+                                              sizeof(*td->pc_tree)));
+
+  this_pc = &td->pc_tree[0];
+  this_leaf = &td->leaf_tree[0];
+
+  // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same
+  // context so we only need to allocate 1 for each 8x8 block.
+  for (i = 0; i < leaf_nodes; ++i)
+    alloc_mode_context(cm, 1, &td->leaf_tree[i]);
+
+  // Sets up all the leaf nodes in the tree.
+  for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
+    PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+    tree->block_size = square[0];
+    alloc_tree_contexts(cm, tree, 4);
+    tree->leaf_split[0] = this_leaf++;
+    for (j = 1; j < 4; j++)
+      tree->leaf_split[j] = tree->leaf_split[0];
+  }
+
+  // Each node has 4 leaf nodes, fill each block_size level of the tree
+  // from leafs to the root.
+  for (nodes = 16; nodes > 0; nodes >>= 2) {
+    for (i = 0; i < nodes; ++i) {
+      PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+      alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
+      tree->block_size = square[square_index];
+      for (j = 0; j < 4; j++)
+        tree->split[j] = this_pc++;
+      ++pc_tree_index;
+    }
+    ++square_index;
+  }
+  td->pc_root = &td->pc_tree[tree_nodes - 1];
+  td->pc_root[0].none.best_mode_index = 2;  // NOTE(review): why 2? confirm
+}
+
+// Frees the contexts and arrays built by vp10_setup_pc_tree(). An array
+// that is still NULL (setup never ran, or failed part-way) is skipped
+// instead of being dereferenced.
+void vp10_free_pc_tree(ThreadData *td) {
+  const int tree_nodes = 64 + 16 + 4 + 1;
+  int i;
+
+  for (i = 0; td->leaf_tree != NULL && i < 64; ++i)
+    free_mode_context(&td->leaf_tree[i]);
+  for (i = 0; td->pc_tree != NULL && i < tree_nodes; ++i)
+    free_tree_contexts(&td->pc_tree[i]);
+
+  vpx_free(td->pc_tree);
+  td->pc_tree = NULL;
+  vpx_free(td->leaf_tree);
+  td->leaf_tree = NULL;
+}
diff --git a/libs/libvpx/vp10/encoder/context_tree.h b/libs/libvpx/vp10/encoder/context_tree.h
new file mode 100644
index 0000000000..2a0fffbfb2
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/context_tree.h
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_CONTEXT_TREE_H_
+#define VP10_ENCODER_CONTEXT_TREE_H_
+
+#include "vp10/common/blockd.h"
+#include "vp10/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP10_COMP;
+struct VP10Common;
+struct ThreadData;
+
+// Structure to hold snapshot of coding context during the mode picking process
+typedef struct {
+  MODE_INFO mic;
+  MB_MODE_INFO_EXT mbmi_ext;
+  uint8_t *zcoeff_blk;
+  uint8_t *color_index_map[2];
+  tran_low_t *coeff[MAX_MB_PLANE][3];
+  tran_low_t *qcoeff[MAX_MB_PLANE][3];
+  tran_low_t *dqcoeff[MAX_MB_PLANE][3];
+  uint16_t *eobs[MAX_MB_PLANE][3];
+
+  // dual buffer pointers, 0: in use, 1: best in store
+  tran_low_t *coeff_pbuf[MAX_MB_PLANE][3];
+  tran_low_t *qcoeff_pbuf[MAX_MB_PLANE][3];
+  tran_low_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
+  uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
+
+  int is_coded;
+  int num_4x4_blk;
+  int skip;
+  int pred_pixel_ready;
+  // For current partition, only if all Y, U, and V transform blocks'
+  // coefficients are quantized to 0, skippable is set to 0.
+  int skippable;
+  uint8_t skip_txfm[MAX_MB_PLANE << 2];
+  int best_mode_index;
+  int hybrid_pred_diff;
+  int comp_pred_diff;
+  int single_pred_diff;
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+
+  // TODO(jingning) Use RD_COST struct here instead. This involves a broader
+  // scope of refactoring.
+  int rate;
+  int64_t dist;
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  unsigned int newmv_sse;
+  unsigned int zeromv_sse;
+  PREDICTION_MODE best_sse_inter_mode;
+  int_mv best_sse_mv;
+  MV_REFERENCE_FRAME best_reference_frame;
+  MV_REFERENCE_FRAME best_zeromv_reference_frame;
+#endif
+
+  // motion vector cache for adaptive motion search control in partition
+  // search loop
+  MV pred_mv[MAX_REF_FRAMES];
+  INTERP_FILTER pred_interp_filter;
+} PICK_MODE_CONTEXT;
+
+typedef struct PC_TREE {
+  int index;
+  PARTITION_TYPE partitioning;
+  BLOCK_SIZE block_size;
+  PICK_MODE_CONTEXT none;
+  PICK_MODE_CONTEXT horizontal[2];
+  PICK_MODE_CONTEXT vertical[2];
+  union {
+    struct PC_TREE *split[4];
+    PICK_MODE_CONTEXT *leaf_split[4];
+  };
+} PC_TREE;
+
+void vp10_setup_pc_tree(struct VP10Common *cm, struct ThreadData *td);
+void vp10_free_pc_tree(struct ThreadData *td);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif /* VP10_ENCODER_CONTEXT_TREE_H_ */
diff --git a/libs/libvpx/vp10/encoder/cost.c b/libs/libvpx/vp10/encoder/cost.c
new file mode 100644
index 0000000000..aab826322b
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/cost.c
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+
+#include "vp10/encoder/cost.h"
+// Cost table indexed by 8-bit probability; roughly -256 * log2(i / 256).
+const unsigned int vp10_prob_cost[256] = {
+  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161,
+  1129, 1099, 1072, 1046, 1023, 1000, 979,  959,  940,  922,  905,  889,
+  873,  858,  843,  829,  816,  803,  790,  778,  767,  755,  744,  733,
+  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
+  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,
+  534,  528,  522,  516,  511,  505,  499,  494,  488,  483,  477,  472,
+  467,  462,  457,  452,  447,  442,  437,  433,  428,  424,  419,  415,
+  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
+  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,
+  317,  314,  311,  307,  304,  301,  297,  294,  291,  288,  285,  281,
+  278,  275,  272,  269,  266,  263,  260,  257,  255,  252,  249,  246,
+  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
+  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,
+  181,  179,  177,  174,  172,  170,  168,  165,  163,  161,  159,  156,
+  154,  152,  150,  148,  145,  143,  141,  139,  137,  135,  133,  131,
+  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
+  105,  103,  101,  99,   97,   95,   93,   92,   90,   88,   86,   84,
+  82,   81,   79,   77,   75,   73,   72,   70,   68,   66,   65,   63,
+  61,   60,   58,   56,   55,   53,   51,   50,   48,   46,   45,   43,
+  41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
+  22,   21,   19,   18,   16,   15,   13,   12,   10,   9,    7,    6,
+  4,    3,    1,    1};
+// Recursively stores in costs[] the summed cost of each leaf below node i.
+static void cost(int *costs, vpx_tree tree, const vpx_prob *probs,
+                 int i, int c) {
+  // A node owns the two tree slots i and i + 1 and the one prob at i / 2.
+  const vpx_prob node_prob = probs[i / 2];
+  int bit = 0;
+  while (bit < 2) {
+    const vpx_tree_index next = tree[i + bit];
+    const int total = c + vp10_cost_bit(node_prob, bit);
+
+    // Entries <= 0 are leaves: the negated entry indexes costs[].
+    if (next > 0) cost(costs, tree, probs, next, total);
+    else costs[-next] = total;
+    ++bit;
+  }
+}
+// Fills costs[] for every leaf, walking from tree node 0 with zero base cost.
+void vp10_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree) {
+  cost(costs, tree, probs, 0, 0);
+}
+// Costs the leaf at tree[0] directly, then walks from node 2 at zero cost.
+void vp10_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree) {
+  const vpx_tree_index first_leaf = tree[0];
+  assert(first_leaf <= 0 && tree[1] > 0);
+  costs[-first_leaf] = vp10_cost_bit(probs[0], 0);
+  cost(costs, tree, probs, 2, 0);
+}
diff --git a/libs/libvpx/vp10/encoder/cost.h b/libs/libvpx/vp10/encoder/cost.h
new file mode 100644
index 0000000000..b9619c6b1b
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/cost.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_COST_H_
+#define VP10_ENCODER_COST_H_
+
+#include "vpx_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const unsigned int vp10_prob_cost[256];
+// Cost of a 0 bit under prob, read directly from vp10_prob_cost.
+#define vp10_cost_zero(prob) (vp10_prob_cost[prob])
+// Cost of a 1 bit: the zero-bit cost of the complemented probability.
+#define vp10_cost_one(prob) vp10_cost_zero(vpx_complement(prob))
+// Cost of `bit` under prob: prob is complemented when bit is nonzero.
+#define vp10_cost_bit(prob, bit) vp10_cost_zero((bit) ? vpx_complement(prob) \
+                                                    : (prob))
+// Cost of ct[0] zero bits plus ct[1] one bits, all coded with probability p.
+static INLINE unsigned int cost_branch256(const unsigned int ct[2],
+                                          vpx_prob p) {
+  return ct[1] * vp10_cost_one(p) + ct[0] * vp10_cost_zero(p);
+}
+// Sums the cost of the low `len` bits of `bits`, MSB first, down the tree.
+static INLINE int treed_cost(vpx_tree tree, const vpx_prob *probs,
+                             int bits, int len) {
+  vpx_tree_index node = 0;
+  int total = 0;
+
+  // The body runs before the test, so callers must pass len >= 1.
+  do {
+    const int bit = (bits >> --len) & 1;
+    total += vp10_cost_bit(probs[node >> 1], bit);
+    node = tree[node + bit];
+  } while (len != 0);
+  return total;
+}
+
+void vp10_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree);
+void vp10_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_COST_H_
diff --git a/libs/libvpx/vp10/encoder/dct.c b/libs/libvpx/vp10/encoder/dct.c
new file mode 100644
index 0000000000..132a141741
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/dct.c
@@ -0,0 +1,1303 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vp10/common/blockd.h"
+#include "vp10/common/idct.h"
+#include "vpx_dsp/fwd_txfm.h"
+#include "vpx_ports/mem.h"
+// Range-check hook; the assert loop is compiled out by #if 0, so it is a no-op.
+static INLINE void range_check(const tran_low_t *input, const int size,
+                               const int bit) {
+#if 0  // CONFIG_COEFFICIENT_RANGE_CHECKING
+// TODO(angiebird): the range_check is not used because the bit range
+// in fdct# is not correct. Since we are going to merge in a new version
+// of fdct# from nextgenv2, we won't fix the incorrect bit range now.
+  int i;
+  for (i = 0; i < size; ++i) {
+    assert(abs(input[i]) < (1 << bit));
+  }
+#else
+  (void)input;
+  (void)size;
+  (void)bit;
+#endif
+}
+// 4-point forward DCT; output[] is also scratch, so it cannot alias input[].
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t temp;
+  tran_low_t step[4];
+
+  // stage 0
+  range_check(input, 4, 14);
+
+  // stage 1: sums and differences of the mirrored pairs (0,3) and (1,2)
+  output[0] = input[0] + input[3];
+  output[1] = input[1] + input[2];
+  output[2] = input[1] - input[2];
+  output[3] = input[0] - input[3];
+
+  range_check(output, 4, 15);
+
+  // stage 2: cospi rotations, each rounded by fdct_round_shift into step[]
+  temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
+  step[0] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
+  step[1] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
+  step[2] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
+  step[3] = (tran_low_t)fdct_round_shift(temp);
+
+  range_check(step, 4, 16);
+
+  // stage 3: reorder step[] into the final output order (0, 2, 1, 3)
+  output[0] = step[0];
+  output[1] = step[2];
+  output[2] = step[1];
+  output[3] = step[3];
+
+  range_check(output, 4, 16);
+}
+// 8-point forward DCT; output[] is also scratch, so it cannot alias input[].
+static void fdct8(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t temp;
+  tran_low_t step[8];
+
+  // stage 0
+  range_check(input, 8, 13);
+
+  // stage 1: sums and differences of the mirrored pairs input[k], input[7 - k]
+  output[0] = input[0] + input[7];
+  output[1] = input[1] + input[6];
+  output[2] = input[2] + input[5];
+  output[3] = input[3] + input[4];
+  output[4] = input[3] - input[4];
+  output[5] = input[2] - input[5];
+  output[6] = input[1] - input[6];
+  output[7] = input[0] - input[7];
+
+  range_check(output, 8, 14);
+
+  // stage 2: butterfly 0..3; rotate 5 and 6 by cospi_16_64; 4 and 7 pass through
+  step[0] = output[0] + output[3];
+  step[1] = output[1] + output[2];
+  step[2] = output[1] - output[2];
+  step[3] = output[0] - output[3];
+  step[4] = output[4];
+  temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
+  step[5] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
+  step[6] = (tran_low_t)fdct_round_shift(temp);
+  step[7] = output[7];
+
+  range_check(step, 8, 15);
+
+  // stage 3: rotate 0..3; add/subtract within 4..7
+  temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
+  output[0] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
+  output[1] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+  output[2] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
+  output[3] = (tran_low_t)fdct_round_shift(temp);
+  output[4] = step[4] + step[5];
+  output[5] = step[4] - step[5];
+  output[6] = step[7] - step[6];
+  output[7] = step[7] + step[6];
+
+  range_check(output, 8, 16);
+
+  // stage 4: 0..3 pass through; rotate the pairs (4,7) and (5,6)
+  step[0] = output[0];
+  step[1] = output[1];
+  step[2] = output[2];
+  step[3] = output[3];
+  temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
+  step[4] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
+  step[5] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
+  step[6] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
+  step[7] = (tran_low_t)fdct_round_shift(temp);
+
+  range_check(step, 8, 16);
+
+  // stage 5: reorder step[] into the final output order
+  output[0] = step[0];
+  output[1] = step[4];
+  output[2] = step[2];
+  output[3] = step[6];
+  output[4] = step[1];
+  output[5] = step[5];
+  output[6] = step[3];
+  output[7] = step[7];
+
+  range_check(output, 8, 16);
+}
+// 16-point forward DCT; output[] is also scratch, so it cannot alias input[].
+static void fdct16(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t temp;
+  tran_low_t step[16];
+
+  // stage 0
+  range_check(input, 16, 13);
+
+  // stage 1: sums and differences of the mirrored pairs input[k], input[15 - k]
+  output[0] = input[0] + input[15];
+  output[1] = input[1] + input[14];
+  output[2] = input[2] + input[13];
+  output[3] = input[3] + input[12];
+  output[4] = input[4] + input[11];
+  output[5] = input[5] + input[10];
+  output[6] = input[6] + input[9];
+  output[7] = input[7] + input[8];
+  output[8] = input[7] - input[8];
+  output[9] = input[6] - input[9];
+  output[10] = input[5] - input[10];
+  output[11] = input[4] - input[11];
+  output[12] = input[3] - input[12];
+  output[13] = input[2] - input[13];
+  output[14] = input[1] - input[14];
+  output[15] = input[0] - input[15];
+
+  range_check(output, 16, 14);
+
+  // stage 2: butterfly 0..7; rotate 10..13 by cospi_16_64; 8, 9, 14, 15 pass
+  step[0] = output[0] + output[7];
+  step[1] = output[1] + output[6];
+  step[2] = output[2] + output[5];
+  step[3] = output[3] + output[4];
+  step[4] = output[3] - output[4];
+  step[5] = output[2] - output[5];
+  step[6] = output[1] - output[6];
+  step[7] = output[0] - output[7];
+  step[8] = output[8];
+  step[9] = output[9];
+  temp = output[10] * -cospi_16_64 + output[13] * cospi_16_64;
+  step[10] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[11] * -cospi_16_64 + output[12] * cospi_16_64;
+  step[11] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[12] * cospi_16_64 + output[11] * cospi_16_64;
+  step[12] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[13] * cospi_16_64 + output[10] * cospi_16_64;
+  step[13] = (tran_low_t)fdct_round_shift(temp);
+  step[14] = output[14];
+  step[15] = output[15];
+
+  range_check(step, 16, 15);
+
+  // stage 3: butterfly 0..3; rotate 5 and 6; add/subtract within 8..15
+  output[0] = step[0] + step[3];
+  output[1] = step[1] + step[2];
+  output[2] = step[1] - step[2];
+  output[3] = step[0] - step[3];
+  output[4] = step[4];
+  temp = step[5] * -cospi_16_64 + step[6] * cospi_16_64;
+  output[5] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[6] * cospi_16_64 + step[5] * cospi_16_64;
+  output[6] = (tran_low_t)fdct_round_shift(temp);
+  output[7] = step[7];
+  output[8] = step[8] + step[11];
+  output[9] = step[9] + step[10];
+  output[10] = step[9] - step[10];
+  output[11] = step[8] - step[11];
+  output[12] = step[15] - step[12];
+  output[13] = step[14] - step[13];
+  output[14] = step[14] + step[13];
+  output[15] = step[15] + step[12];
+
+  range_check(output, 16, 16);
+
+  // stage 4: rotate 0..3; add/subtract within 4..7; rotate 9, 10, 13, 14
+  temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
+  step[0] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
+  step[1] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
+  step[2] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
+  step[3] = (tran_low_t)fdct_round_shift(temp);
+  step[4] = output[4] + output[5];
+  step[5] = output[4] - output[5];
+  step[6] = output[7] - output[6];
+  step[7] = output[7] + output[6];
+  step[8] = output[8];
+  temp = output[9] * -cospi_8_64 + output[14] * cospi_24_64;
+  step[9] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[10] * -cospi_24_64 + output[13] * -cospi_8_64;
+  step[10] = (tran_low_t)fdct_round_shift(temp);
+  step[11] = output[11];
+  step[12] = output[12];
+  temp = output[13] * cospi_24_64 + output[10] * -cospi_8_64;
+  step[13] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[14] * cospi_8_64 + output[9] * cospi_24_64;
+  step[14] = (tran_low_t)fdct_round_shift(temp);
+  step[15] = output[15];
+
+  range_check(step, 16, 16);
+
+  // stage 5: 0..3 pass through; rotate 4..7; add/subtract within 8..15
+  output[0] = step[0];
+  output[1] = step[1];
+  output[2] = step[2];
+  output[3] = step[3];
+  temp = step[4] * cospi_28_64 + step[7] * cospi_4_64;
+  output[4] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[5] * cospi_12_64 + step[6] * cospi_20_64;
+  output[5] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[6] * cospi_12_64 + step[5] * -cospi_20_64;
+  output[6] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[7] * cospi_28_64 + step[4] * -cospi_4_64;
+  output[7] = (tran_low_t)fdct_round_shift(temp);
+  output[8] = step[8] + step[9];
+  output[9] = step[8] - step[9];
+  output[10] = step[11] - step[10];
+  output[11] = step[11] + step[10];
+  output[12] = step[12] + step[13];
+  output[13] = step[12] - step[13];
+  output[14] = step[15] - step[14];
+  output[15] = step[15] + step[14];
+
+  range_check(output, 16, 16);
+
+  // stage 6: 0..7 pass through; rotate the mirrored pairs within 8..15
+  step[0] = output[0];
+  step[1] = output[1];
+  step[2] = output[2];
+  step[3] = output[3];
+  step[4] = output[4];
+  step[5] = output[5];
+  step[6] = output[6];
+  step[7] = output[7];
+  temp = output[8] * cospi_30_64 + output[15] * cospi_2_64;
+  step[8] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[9] * cospi_14_64 + output[14] * cospi_18_64;
+  step[9] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[10] * cospi_22_64 + output[13] * cospi_10_64;
+  step[10] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[11] * cospi_6_64 + output[12] * cospi_26_64;
+  step[11] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[12] * cospi_6_64 + output[11] * -cospi_26_64;
+  step[12] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[13] * cospi_22_64 + output[10] * -cospi_10_64;
+  step[13] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[14] * cospi_14_64 + output[9] * -cospi_18_64;
+  step[14] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[15] * cospi_30_64 + output[8] * -cospi_2_64;
+  step[15] = (tran_low_t)fdct_round_shift(temp);
+
+  range_check(step, 16, 16);
+
+  // stage 7: reorder step[] into the final output order
+  output[0] = step[0];
+  output[1] = step[8];
+  output[2] = step[4];
+  output[3] = step[12];
+  output[4] = step[2];
+  output[5] = step[10];
+  output[6] = step[6];
+  output[7] = step[14];
+  output[8] = step[1];
+  output[9] = step[9];
+  output[10] = step[5];
+  output[11] = step[13];
+  output[12] = step[3];
+  output[13] = step[11];
+  output[14] = step[7];
+  output[15] = step[15];
+
+  range_check(output, 16, 16);
+}
+
+/* TODO(angiebird): Unify this with vp10_fwd_txfm.c: vp10_fdct32
+static void fdct32(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t temp;
+  tran_low_t step[32];
+
+  // stage 0
+  range_check(input, 32, 14);
+
+  // stage 1
+  output[0] = input[0] + input[31];
+  output[1] = input[1] + input[30];
+  output[2] = input[2] + input[29];
+  output[3] = input[3] + input[28];
+  output[4] = input[4] + input[27];
+  output[5] = input[5] + input[26];
+  output[6] = input[6] + input[25];
+  output[7] = input[7] + input[24];
+  output[8] = input[8] + input[23];
+  output[9] = input[9] + input[22];
+  output[10] = input[10] + input[21];
+  output[11] = input[11] + input[20];
+  output[12] = input[12] + input[19];
+  output[13] = input[13] + input[18];
+  output[14] = input[14] + input[17];
+  output[15] = input[15] + input[16];
+  output[16] = input[15] - input[16];
+  output[17] = input[14] - input[17];
+  output[18] = input[13] - input[18];
+  output[19] = input[12] - input[19];
+  output[20] = input[11] - input[20];
+  output[21] = input[10] - input[21];
+  output[22] = input[9] - input[22];
+  output[23] = input[8] - input[23];
+  output[24] = input[7] - input[24];
+  output[25] = input[6] - input[25];
+  output[26] = input[5] - input[26];
+  output[27] = input[4] - input[27];
+  output[28] = input[3] - input[28];
+  output[29] = input[2] - input[29];
+  output[30] = input[1] - input[30];
+  output[31] = input[0] - input[31];
+
+  range_check(output, 32, 15);
+
+  // stage 2
+  step[0] = output[0] + output[15];
+  step[1] = output[1] + output[14];
+  step[2] = output[2] + output[13];
+  step[3] = output[3] + output[12];
+  step[4] = output[4] + output[11];
+  step[5] = output[5] + output[10];
+  step[6] = output[6] + output[9];
+  step[7] = output[7] + output[8];
+  step[8] = output[7] - output[8];
+  step[9] = output[6] - output[9];
+  step[10] = output[5] - output[10];
+  step[11] = output[4] - output[11];
+  step[12] = output[3] - output[12];
+  step[13] = output[2] - output[13];
+  step[14] = output[1] - output[14];
+  step[15] = output[0] - output[15];
+  step[16] = output[16];
+  step[17] = output[17];
+  step[18] = output[18];
+  step[19] = output[19];
+  temp = output[20] * -cospi_16_64 + output[27] * cospi_16_64;
+  step[20] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[21] * -cospi_16_64 + output[26] * cospi_16_64;
+  step[21] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[22] * -cospi_16_64 + output[25] * cospi_16_64;
+  step[22] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[23] * -cospi_16_64 + output[24] * cospi_16_64;
+  step[23] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[24] * cospi_16_64 + output[23] * cospi_16_64;
+  step[24] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[25] * cospi_16_64 + output[22] * cospi_16_64;
+  step[25] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[26] * cospi_16_64 + output[21] * cospi_16_64;
+  step[26] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[27] * cospi_16_64 + output[20] * cospi_16_64;
+  step[27] = (tran_low_t)fdct_round_shift(temp);
+  step[28] = output[28];
+  step[29] = output[29];
+  step[30] = output[30];
+  step[31] = output[31];
+
+  range_check(step, 32, 16);
+
+  // stage 3
+  output[0] = step[0] + step[7];
+  output[1] = step[1] + step[6];
+  output[2] = step[2] + step[5];
+  output[3] = step[3] + step[4];
+  output[4] = step[3] - step[4];
+  output[5] = step[2] - step[5];
+  output[6] = step[1] - step[6];
+  output[7] = step[0] - step[7];
+  output[8] = step[8];
+  output[9] = step[9];
+  temp = step[10] * -cospi_16_64 + step[13] * cospi_16_64;
+  output[10] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[11] * -cospi_16_64 + step[12] * cospi_16_64;
+  output[11] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[12] * cospi_16_64 + step[11] * cospi_16_64;
+  output[12] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[13] * cospi_16_64 + step[10] * cospi_16_64;
+  output[13] = (tran_low_t)fdct_round_shift(temp);
+  output[14] = step[14];
+  output[15] = step[15];
+  output[16] = step[16] + step[23];
+  output[17] = step[17] + step[22];
+  output[18] = step[18] + step[21];
+  output[19] = step[19] + step[20];
+  output[20] = step[19] - step[20];
+  output[21] = step[18] - step[21];
+  output[22] = step[17] - step[22];
+  output[23] = step[16] - step[23];
+  output[24] = step[31] - step[24];
+  output[25] = step[30] - step[25];
+  output[26] = step[29] - step[26];
+  output[27] = step[28] - step[27];
+  output[28] = step[28] + step[27];
+  output[29] = step[29] + step[26];
+  output[30] = step[30] + step[25];
+  output[31] = step[31] + step[24];
+
+  range_check(output, 32, 17);
+
+  // stage 4
+  step[0] = output[0] + output[3];
+  step[1] = output[1] + output[2];
+  step[2] = output[1] - output[2];
+  step[3] = output[0] - output[3];
+  step[4] = output[4];
+  temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
+  step[5] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
+  step[6] = (tran_low_t)fdct_round_shift(temp);
+  step[7] = output[7];
+  step[8] = output[8] + output[11];
+  step[9] = output[9] + output[10];
+  step[10] = output[9] - output[10];
+  step[11] = output[8] - output[11];
+  step[12] = output[15] - output[12];
+  step[13] = output[14] - output[13];
+  step[14] = output[14] + output[13];
+  step[15] = output[15] + output[12];
+  step[16] = output[16];
+  step[17] = output[17];
+  temp = output[18] * -cospi_8_64 + output[29] * cospi_24_64;
+  step[18] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[19] * -cospi_8_64 + output[28] * cospi_24_64;
+  step[19] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[20] * -cospi_24_64 + output[27] * -cospi_8_64;
+  step[20] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[21] * -cospi_24_64 + output[26] * -cospi_8_64;
+  step[21] = (tran_low_t)fdct_round_shift(temp);
+  step[22] = output[22];
+  step[23] = output[23];
+  step[24] = output[24];
+  step[25] = output[25];
+  temp = output[26] * cospi_24_64 + output[21] * -cospi_8_64;
+  step[26] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[27] * cospi_24_64 + output[20] * -cospi_8_64;
+  step[27] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[28] * cospi_8_64 + output[19] * cospi_24_64;
+  step[28] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[29] * cospi_8_64 + output[18] * cospi_24_64;
+  step[29] = (tran_low_t)fdct_round_shift(temp);
+  step[30] = output[30];
+  step[31] = output[31];
+
+  range_check(step, 32, 18);
+
+  // stage 5
+  temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
+  output[0] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
+  output[1] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+  output[2] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
+  output[3] = (tran_low_t)fdct_round_shift(temp);
+  output[4] = step[4] + step[5];
+  output[5] = step[4] - step[5];
+  output[6] = step[7] - step[6];
+  output[7] = step[7] + step[6];
+  output[8] = step[8];
+  temp = step[9] * -cospi_8_64 + step[14] * cospi_24_64;
+  output[9] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[10] * -cospi_24_64 + step[13] * -cospi_8_64;
+  output[10] = (tran_low_t)fdct_round_shift(temp);
+  output[11] = step[11];
+  output[12] = step[12];
+  temp = step[13] * cospi_24_64 + step[10] * -cospi_8_64;
+  output[13] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[14] * cospi_8_64 + step[9] * cospi_24_64;
+  output[14] = (tran_low_t)fdct_round_shift(temp);
+  output[15] = step[15];
+  output[16] = step[16] + step[19];
+  output[17] = step[17] + step[18];
+  output[18] = step[17] - step[18];
+  output[19] = step[16] - step[19];
+  output[20] = step[23] - step[20];
+  output[21] = step[22] - step[21];
+  output[22] = step[22] + step[21];
+  output[23] = step[23] + step[20];
+  output[24] = step[24] + step[27];
+  output[25] = step[25] + step[26];
+  output[26] = step[25] - step[26];
+  output[27] = step[24] - step[27];
+  output[28] = step[31] - step[28];
+  output[29] = step[30] - step[29];
+  output[30] = step[30] + step[29];
+  output[31] = step[31] + step[28];
+
+  range_check(output, 32, 18);
+
+  // stage 6
+  step[0] = output[0];
+  step[1] = output[1];
+  step[2] = output[2];
+  step[3] = output[3];
+  temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
+  step[4] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
+  step[5] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
+  step[6] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
+  step[7] = (tran_low_t)fdct_round_shift(temp);
+  step[8] = output[8] + output[9];
+  step[9] = output[8] - output[9];
+  step[10] = output[11] - output[10];
+  step[11] = output[11] + output[10];
+  step[12] = output[12] + output[13];
+  step[13] = output[12] - output[13];
+  step[14] = output[15] - output[14];
+  step[15] = output[15] + output[14];
+  step[16] = output[16];
+  temp = output[17] * -cospi_4_64 + output[30] * cospi_28_64;
+  step[17] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[18] * -cospi_28_64 + output[29] * -cospi_4_64;
+  step[18] = (tran_low_t)fdct_round_shift(temp);
+  step[19] = output[19];
+  step[20] = output[20];
+  temp = output[21] * -cospi_20_64 + output[26] * cospi_12_64;
+  step[21] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[22] * -cospi_12_64 + output[25] * -cospi_20_64;
+  step[22] = (tran_low_t)fdct_round_shift(temp);
+  step[23] = output[23];
+  step[24] = output[24];
+  temp = output[25] * cospi_12_64 + output[22] * -cospi_20_64;
+  step[25] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[26] * cospi_20_64 + output[21] * cospi_12_64;
+  step[26] = (tran_low_t)fdct_round_shift(temp);
+  step[27] = output[27];
+  step[28] = output[28];
+  temp = output[29] * cospi_28_64 + output[18] * -cospi_4_64;
+  step[29] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[30] * cospi_4_64 + output[17] * cospi_28_64;
+  step[30] = (tran_low_t)fdct_round_shift(temp);
+  step[31] = output[31];
+
+  range_check(step, 32, 18);
+
+  // stage 7
+  output[0] = step[0];
+  output[1] = step[1];
+  output[2] = step[2];
+  output[3] = step[3];
+  output[4] = step[4];
+  output[5] = step[5];
+  output[6] = step[6];
+  output[7] = step[7];
+  temp = step[8] * cospi_30_64 + step[15] * cospi_2_64;
+  output[8] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[9] * cospi_14_64 + step[14] * cospi_18_64;
+  output[9] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[10] * cospi_22_64 + step[13] * cospi_10_64;
+  output[10] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[11] * cospi_6_64 + step[12] * cospi_26_64;
+  output[11] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[12] * cospi_6_64 + step[11] * -cospi_26_64;
+  output[12] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[13] * cospi_22_64 + step[10] * -cospi_10_64;
+  output[13] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[14] * cospi_14_64 + step[9] * -cospi_18_64;
+  output[14] = (tran_low_t)fdct_round_shift(temp);
+  temp = step[15] * cospi_30_64 + step[8] * -cospi_2_64;
+  output[15] = (tran_low_t)fdct_round_shift(temp);
+  output[16] = step[16] + step[17];
+  output[17] = step[16] - step[17];
+  output[18] = step[19] - step[18];
+  output[19] = step[19] + step[18];
+  output[20] = step[20] + step[21];
+  output[21] = step[20] - step[21];
+  output[22] = step[23] - step[22];
+  output[23] = step[23] + step[22];
+  output[24] = step[24] + step[25];
+  output[25] = step[24] - step[25];
+  output[26] = step[27] - step[26];
+  output[27] = step[27] + step[26];
+  output[28] = step[28] + step[29];
+  output[29] = step[28] - step[29];
+  output[30] = step[31] - step[30];
+  output[31] = step[31] + step[30];
+
+  range_check(output, 32, 18);
+
+  // stage 8
+  step[0] = output[0];
+  step[1] = output[1];
+  step[2] = output[2];
+  step[3] = output[3];
+  step[4] = output[4];
+  step[5] = output[5];
+  step[6] = output[6];
+  step[7] = output[7];
+  step[8] = output[8];
+  step[9] = output[9];
+  step[10] = output[10];
+  step[11] = output[11];
+  step[12] = output[12];
+  step[13] = output[13];
+  step[14] = output[14];
+  step[15] = output[15];
+  temp = output[16] * cospi_31_64 + output[31] * cospi_1_64;
+  step[16] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[17] * cospi_15_64 + output[30] * cospi_17_64;
+  step[17] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[18] * cospi_23_64 + output[29] * cospi_9_64;
+  step[18] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[19] * cospi_7_64 + output[28] * cospi_25_64;
+  step[19] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[20] * cospi_27_64 + output[27] * cospi_5_64;
+  step[20] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[21] * cospi_11_64 + output[26] * cospi_21_64;
+  step[21] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[22] * cospi_19_64 + output[25] * cospi_13_64;
+  step[22] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[23] * cospi_3_64 + output[24] * cospi_29_64;
+  step[23] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[24] * cospi_3_64 + output[23] * -cospi_29_64;
+  step[24] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[25] * cospi_19_64 + output[22] * -cospi_13_64;
+  step[25] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[26] * cospi_11_64 + output[21] * -cospi_21_64;
+  step[26] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[27] * cospi_27_64 + output[20] * -cospi_5_64;
+  step[27] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[28] * cospi_7_64 + output[19] * -cospi_25_64;
+  step[28] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[29] * cospi_23_64 + output[18] * -cospi_9_64;
+  step[29] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[30] * cospi_15_64 + output[17] * -cospi_17_64;
+  step[30] = (tran_low_t)fdct_round_shift(temp);
+  temp = output[31] * cospi_31_64 + output[16] * -cospi_1_64;
+  step[31] = (tran_low_t)fdct_round_shift(temp);
+
+  range_check(step, 32, 18);
+
+  // stage 9
+  output[0] = step[0];
+  output[1] = step[16];
+  output[2] = step[8];
+  output[3] = step[24];
+  output[4] = step[4];
+  output[5] = step[20];
+  output[6] = step[12];
+  output[7] = step[28];
+  output[8] = step[2];
+  output[9] = step[18];
+  output[10] = step[10];
+  output[11] = step[26];
+  output[12] = step[6];
+  output[13] = step[22];
+  output[14] = step[14];
+  output[15] = step[30];
+  output[16] = step[1];
+  output[17] = step[17];
+  output[18] = step[9];
+  output[19] = step[25];
+  output[20] = step[5];
+  output[21] = step[21];
+  output[22] = step[13];
+  output[23] = step[29];
+  output[24] = step[3];
+  output[25] = step[19];
+  output[26] = step[11];
+  output[27] = step[27];
+  output[28] = step[7];
+  output[29] = step[23];
+  output[30] = step[15];
+  output[31] = step[31];
+
+  range_check(output, 32, 18);
+}
+*/
+// 4-point forward ADST (per the name): sinpi-weighted sums, rounded at the end.
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t x0, x1, x2, x3;
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {  // all-zero input: emit zeros, skip the math
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_4_9 * x0;
+  s2 = sinpi_2_9 * x1;
+  s3 = sinpi_1_9 * x1;
+  s4 = sinpi_3_9 * x2;
+  s5 = sinpi_4_9 * x3;
+  s6 = sinpi_2_9 * x3;
+  s7 = x0 + x1 - x3;
+
+  x0 = s0 + s2 + s5;
+  x1 = sinpi_3_9 * s7;
+  x2 = s1 - s3 + s6;
+  x3 = s4;
+
+  s0 = x0 + x3;
+  s1 = x1;
+  s2 = x2 - x3;
+  s3 = x2 - x0 + x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  output[0] = (tran_low_t)fdct_round_shift(s0);
+  output[1] = (tran_low_t)fdct_round_shift(s1);
+  output[2] = (tran_low_t)fdct_round_shift(s2);
+  output[3] = (tran_low_t)fdct_round_shift(s3);
+}
+// 8-point forward ADST (per the name): three stages over permuted inputs.
+static void fadst8(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
+
+  // stage 1: pairwise cospi rotations, then rounded sums and differences
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = fdct_round_shift(s0 + s4);
+  x1 = fdct_round_shift(s1 + s5);
+  x2 = fdct_round_shift(s2 + s6);
+  x3 = fdct_round_shift(s3 + s7);
+  x4 = fdct_round_shift(s0 - s4);
+  x5 = fdct_round_shift(s1 - s5);
+  x6 = fdct_round_shift(s2 - s6);
+  x7 = fdct_round_shift(s3 - s7);
+
+  // stage 2: x0..x3 combine unrounded; x4..x7 are rotated, then rounded
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = fdct_round_shift(s4 + s6);
+  x5 = fdct_round_shift(s5 + s7);
+  x6 = fdct_round_shift(s4 - s6);
+  x7 = fdct_round_shift(s5 - s7);
+
+  // stage 3: scale the (x2,x3) and (x6,x7) sums/differences by cospi_16_64
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = fdct_round_shift(s2);
+  x3 = fdct_round_shift(s3);
+  x6 = fdct_round_shift(s6);
+  x7 = fdct_round_shift(s7);
+
+  output[0] = (tran_low_t)x0;
+  output[1] = (tran_low_t)-x4;  // odd-indexed outputs are negated
+  output[2] = (tran_low_t)x6;
+  output[3] = (tran_low_t)-x2;
+  output[4] = (tran_low_t)x3;
+  output[5] = (tran_low_t)-x7;
+  output[6] = (tran_low_t)x5;
+  output[7] = (tran_low_t)-x1;
+}
+
+static void fadst16(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+  // 16-point ADST kernel (see FHT_16); inputs are loaded in permuted order.
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
+
+  // stage 1: cospi-weighted input pairs, then rounded sums and differences
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = fdct_round_shift(s0 + s8);
+  x1 = fdct_round_shift(s1 + s9);
+  x2 = fdct_round_shift(s2 + s10);
+  x3 = fdct_round_shift(s3 + s11);
+  x4 = fdct_round_shift(s4 + s12);
+  x5 = fdct_round_shift(s5 + s13);
+  x6 = fdct_round_shift(s6 + s14);
+  x7 = fdct_round_shift(s7 + s15);
+  x8  = fdct_round_shift(s0 - s8);
+  x9  = fdct_round_shift(s1 - s9);
+  x10 = fdct_round_shift(s2 - s10);
+  x11 = fdct_round_shift(s3 - s11);
+  x12 = fdct_round_shift(s4 - s12);
+  x13 = fdct_round_shift(s5 - s13);
+  x14 = fdct_round_shift(s6 - s14);
+  x15 = fdct_round_shift(s7 - s15);
+
+  // stage 2: x0..x7 pass through; x8..x15 are re-weighted and recombined
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = s0 + s4;
+  x1 = s1 + s5;
+  x2 = s2 + s6;
+  x3 = s3 + s7;
+  x4 = s0 - s4;
+  x5 = s1 - s5;
+  x6 = s2 - s6;
+  x7 = s3 - s7;
+  x8 = fdct_round_shift(s8 + s12);
+  x9 = fdct_round_shift(s9 + s13);
+  x10 = fdct_round_shift(s10 + s14);
+  x11 = fdct_round_shift(s11 + s15);
+  x12 = fdct_round_shift(s8 - s12);
+  x13 = fdct_round_shift(s9 - s13);
+  x14 = fdct_round_shift(s10 - s14);
+  x15 = fdct_round_shift(s11 - s15);
+
+  // stage 3: within each half, x4..x7 and x12..x15 get cospi_8/24 weights
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = fdct_round_shift(s4 + s6);
+  x5 = fdct_round_shift(s5 + s7);
+  x6 = fdct_round_shift(s4 - s6);
+  x7 = fdct_round_shift(s5 - s7);
+  x8 = s8 + s10;
+  x9 = s9 + s11;
+  x10 = s8 - s10;
+  x11 = s9 - s11;
+  x12 = fdct_round_shift(s12 + s14);
+  x13 = fdct_round_shift(s13 + s15);
+  x14 = fdct_round_shift(s12 - s14);
+  x15 = fdct_round_shift(s13 - s15);
+
+  // stage 4: cospi_16_64 scaling of the remaining pair sums and differences
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = fdct_round_shift(s2);
+  x3 = fdct_round_shift(s3);
+  x6 = fdct_round_shift(s6);
+  x7 = fdct_round_shift(s7);
+  x10 = fdct_round_shift(s10);
+  x11 = fdct_round_shift(s11);
+  x14 = fdct_round_shift(s14);
+  x15 = fdct_round_shift(s15);
+  // Permuted store; outputs 1, 3, 13 and 15 are negated.
+  output[0] = (tran_low_t)x0;
+  output[1] = (tran_low_t)-x8;
+  output[2] = (tran_low_t)x12;
+  output[3] = (tran_low_t)-x4;
+  output[4] = (tran_low_t)x6;
+  output[5] = (tran_low_t)x14;
+  output[6] = (tran_low_t)x10;
+  output[7] = (tran_low_t)x2;
+  output[8] = (tran_low_t)x3;
+  output[9] = (tran_low_t)x11;
+  output[10] = (tran_low_t)x15;
+  output[11] = (tran_low_t)x7;
+  output[12] = (tran_low_t)x5;
+  output[13] = (tran_low_t)-x13;
+  output[14] = (tran_low_t)x9;
+  output[15] = (tran_low_t)-x1;
+}
+
+static const transform_2d FHT_4[] = {  // indexed by tx_type in vp10_fht4x4_c
+  { fdct4,  fdct4  },  // DCT_DCT  = 0
+  { fadst4, fdct4  },  // ADST_DCT = 1
+  { fdct4,  fadst4 },  // DCT_ADST = 2
+  { fadst4, fadst4 }   // ADST_ADST = 3
+};
+
+static const transform_2d FHT_8[] = {  // indexed by tx_type in vp10_fht8x8_c
+  { fdct8,  fdct8  },  // DCT_DCT  = 0
+  { fadst8, fdct8  },  // ADST_DCT = 1
+  { fdct8,  fadst8 },  // DCT_ADST = 2
+  { fadst8, fadst8 }   // ADST_ADST = 3
+};
+
+static const transform_2d FHT_16[] = {  // indexed by tx_type (fht16x16)
+  { fdct16,  fdct16  },  // DCT_DCT  = 0
+  { fadst16, fdct16  },  // ADST_DCT = 1
+  { fdct16,  fadst16 },  // DCT_ADST = 2
+  { fadst16, fadst16 }   // ADST_ADST = 3
+};
+
+void vp10_fht4x4_c(const int16_t *input, tran_low_t *output,
+                  int stride, int tx_type) {
+  tran_low_t col_pass[4 * 4];
+  tran_low_t line_in[4], line_out[4];
+  int r, c;
+
+  // Plain DCT/DCT is delegated to the dedicated 4x4 forward DCT.
+  if (tx_type == DCT_DCT) {
+    vpx_fdct4x4_c(input, output, stride);
+    return;
+  }
+
+  // Pass 1: column transform of every input column, pre-scaled by 16.
+  for (c = 0; c < 4; ++c) {
+    for (r = 0; r < 4; ++r)
+      line_in[r] = input[r * stride + c] * 16;
+    // The first sample of the first column gets +1 when it is non-zero.
+    if (c == 0 && line_in[0])
+      line_in[0] += 1;
+    FHT_4[tx_type].cols(line_in, line_out);
+    for (r = 0; r < 4; ++r)
+      col_pass[r * 4 + c] = line_out[r];
+  }
+
+  // Pass 2: row transform of the intermediate, stored as (v + 1) >> 2.
+  for (r = 0; r < 4; ++r) {
+    FHT_4[tx_type].rows(&col_pass[r * 4], line_out);
+    for (c = 0; c < 4; ++c)
+      output[c + r * 4] = (line_out[c] + 1) >> 2;
+  }
+}
+
+void vp10_fdct8x8_quant_c(const int16_t *input, int stride,
+                         tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         int skip_block,
+                         const int16_t *zbin_ptr, const int16_t *round_ptr,
+                         const int16_t *quant_ptr,
+                         const int16_t *quant_shift_ptr,
+                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                         const int16_t *dequant_ptr,
+                         uint16_t *eob_ptr,
+                         const int16_t *scan, const int16_t *iscan) {
+  int eob = -1;
+  // 8x8 forward DCT into coeff_ptr, then quantize into qcoeff/dqcoeff.
+  int i, j;
+  tran_low_t intermediate[64];
+
+  // Transform columns (input scaled by 4) into the intermediate buffer
+  {
+    tran_low_t *output = intermediate;
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
+
+    int i;  // NOTE: shadows the outer i for the duration of this scope
+    for (i = 0; i < 8; i++) {
+      // stage 1
+      s0 = (input[0 * stride] + input[7 * stride]) * 4;
+      s1 = (input[1 * stride] + input[6 * stride]) * 4;
+      s2 = (input[2 * stride] + input[5 * stride]) * 4;
+      s3 = (input[3 * stride] + input[4 * stride]) * 4;
+      s4 = (input[3 * stride] - input[4 * stride]) * 4;
+      s5 = (input[2 * stride] - input[5 * stride]) * 4;
+      s6 = (input[1 * stride] - input[6 * stride]) * 4;
+      s7 = (input[0 * stride] - input[7 * stride]) * 4;
+
+      // fdct4(step, step);
+      x0 = s0 + s3;
+      x1 = s1 + s2;
+      x2 = s1 - s2;
+      x3 = s0 - s3;
+      t0 = (x0 + x1) * cospi_16_64;
+      t1 = (x0 - x1) * cospi_16_64;
+      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+      output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
+      output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
+      output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
+      output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
+
+      // stage 2
+      t0 = (s6 - s5) * cospi_16_64;
+      t1 = (s6 + s5) * cospi_16_64;
+      t2 = fdct_round_shift(t0);
+      t3 = fdct_round_shift(t1);
+
+      // stage 3
+      x0 = s4 + t2;
+      x1 = s4 - t2;
+      x2 = s7 - t3;
+      x3 = s7 + t3;
+
+      // stage 4
+      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+      output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
+      output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
+      output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
+      output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
+      input++;
+      output++;
+    }
+  }
+
+  // Rows: transform each intermediate row, then halve every coefficient
+  for (i = 0; i < 8; ++i) {
+    fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
+    for (j = 0; j < 8; ++j)
+      coeff_ptr[j + i * 8] /= 2;
+  }
+
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Quantization pass over all n_coeffs positions in scan order. Entry 0 of
+    // round/quant/dequant is used when rc == 0, entry 1 for every other rc.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+      // Track the last scan position with a non-zero quantized value.
+      if (tmp)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+void vp10_fht8x8_c(const int16_t *input, tran_low_t *output,
+                  int stride, int tx_type) {
+  tran_low_t col_pass[64];
+  tran_low_t line_in[8], line_out[8];
+  int r, c;
+
+  // Plain DCT/DCT is delegated to the dedicated 8x8 forward DCT.
+  if (tx_type == DCT_DCT) {
+    vpx_fdct8x8_c(input, output, stride);
+    return;
+  }
+
+  // Pass 1: column transform of every input column, pre-scaled by 4.
+  for (c = 0; c < 8; ++c) {
+    for (r = 0; r < 8; ++r)
+      line_in[r] = input[r * stride + c] * 4;
+    FHT_8[tx_type].cols(line_in, line_out);
+    for (r = 0; r < 8; ++r)
+      col_pass[r * 8 + c] = line_out[r];
+  }
+
+  // Pass 2: row transform of the intermediate; each result is stored as
+  // (v + (v < 0)) >> 1.
+  for (r = 0; r < 8; ++r) {
+    FHT_8[tx_type].rows(&col_pass[r * 8], line_out);
+    for (c = 0; c < 8; ++c)
+      output[c + r * 8] = (line_out[c] + (line_out[c] < 0)) >> 1;
+  }
+}
+
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+   pixel. */
+void vp10_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  int i;
+  tran_high_t a1, b1, c1, d1, e1;
+  const int16_t *ip_pass0 = input;
+  const tran_low_t *ip = NULL;
+  tran_low_t *op = output;
+  // Pass 1: transform each input column into the matching output column.
+  for (i = 0; i < 4; i++) {
+    a1 = ip_pass0[0 * stride];
+    b1 = ip_pass0[1 * stride];
+    c1 = ip_pass0[2 * stride];
+    d1 = ip_pass0[3 * stride];
+
+    a1 += b1;
+    d1 = d1 - c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    op[0] = (tran_low_t)a1;
+    op[4] = (tran_low_t)c1;
+    op[8] = (tran_low_t)d1;
+    op[12] = (tran_low_t)b1;
+
+    ip_pass0++;
+    op++;
+  }
+  ip = output;
+  op = output;
+  // Pass 2: transform each row of output in place, times UNIT_QUANT_FACTOR.
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0];
+    b1 = ip[1];
+    c1 = ip[2];
+    d1 = ip[3];
+
+    a1 += b1;
+    d1 -= c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+    op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+    op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+    op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
+
+    ip += 4;
+    op += 4;
+  }
+}
+
+void vp10_fht16x16_c(const int16_t *input, tran_low_t *output,
+                    int stride, int tx_type) {
+  tran_low_t col_pass[256];
+  tran_low_t line_in[16], line_out[16];
+  int r, c;
+
+  // Plain DCT/DCT is delegated to the dedicated 16x16 forward DCT.
+  if (tx_type == DCT_DCT) {
+    vpx_fdct16x16_c(input, output, stride);
+    return;
+  }
+
+  // Pass 1: column transform of every input column, pre-scaled by 4; each
+  // result is reduced to (v + 1 + (v < 0)) >> 2 before it is stored.
+  for (c = 0; c < 16; ++c) {
+    for (r = 0; r < 16; ++r)
+      line_in[r] = input[r * stride + c] * 4;
+    FHT_16[tx_type].cols(line_in, line_out);
+    for (r = 0; r < 16; ++r)
+      col_pass[r * 16 + c] = (line_out[r] + 1 + (line_out[r] < 0)) >> 2;
+  }
+
+  // Pass 2: row transform of the intermediate, stored without rescaling.
+  for (r = 0; r < 16; ++r) {
+    FHT_16[tx_type].rows(&col_pass[r * 16], line_out);
+    for (c = 0; c < 16; ++c)
+      output[c + r * 16] = line_out[c];
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH  // each highbd_* below forwards its args unchanged
+void vp10_highbd_fht4x4_c(const int16_t *input, tran_low_t *output,
+                         int stride, int tx_type) {
+  vp10_fht4x4_c(input, output, stride, tx_type);
+}
+
+void vp10_highbd_fht8x8_c(const int16_t *input, tran_low_t *output,
+                         int stride, int tx_type) {
+  vp10_fht8x8_c(input, output, stride, tx_type);
+}
+
+void vp10_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  vp10_fwht4x4_c(input, output, stride);
+}
+
+void vp10_highbd_fht16x16_c(const int16_t *input, tran_low_t *output,
+                           int stride, int tx_type) {
+  vp10_fht16x16_c(input, output, stride, tx_type);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vp10/encoder/denoiser.c b/libs/libvpx/vp10/encoder/denoiser.c
new file mode 100644
index 0000000000..e5d8157a4a
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/denoiser.c
@@ -0,0 +1,500 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/encoder/context_tree.h"
+#include "vp10/encoder/denoiser.h"
+
+/* The VP9 denoiser is a work-in-progress. It currently is only designed to work
+ * with speed 6, though it (inexplicably) seems to also work with speed 5 (one
+ * would need to modify the source code in vp10_pickmode.c and vp10_encoder.c to
+ * make the calls to the vp10_denoiser_* functions when in speed 5).
+ *
+ * The implementation is very similar to that of the VP8 denoiser. While
+ * choosing the motion vectors / reference frames, the denoiser is run, and if
+ * it did not modify the signal too much, the denoised block is copied to the
+ * signal.
+ */
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
+#endif
+
+static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  (void)bs;  // the block size does not influence this threshold
+  return increase_denoising ? 4 : 3;
+}
+
+static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  (void)bs;
+  (void)increase_denoising;
+  return 4;  // constant; the filter returns COPY_BLOCK once delta reaches it
+}
+
+static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  (void)bs;
+  (void)increase_denoising;
+  return 625;  // constant; callers compare it with mv_row^2 + mv_col^2
+}
+
+static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  return (increase_denoising ? 60 : 40) << num_pels_log2_lookup[bs];
+}  // i.e. 40 (60 when increased) times 2^num_pels_log2_lookup[bs]
+
+static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
+                           int motion_magnitude) {
+  // Above the noise-motion limit the threshold drops to zero; otherwise it
+  // is 20 times (1 << num_pels_log2_lookup[bs]).
+  const int limit = noise_motion_thresh(bs, increase_denoising);
+  if (motion_magnitude > limit)
+    return 0;
+  return (1 << num_pels_log2_lookup[bs]) * 20;
+}
+
+int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+}  // NOTE(review): non-static, unlike its siblings; confirm external users
+
+static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  return (increase_denoising ? 3 : 2) << num_pels_log2_lookup[bs];
+}  // i.e. 2 (3 when increased) times 2^num_pels_log2_lookup[bs]
+
+// TODO(jackychen): If increase_denoising is enabled in the future,
+// we might need to update the code for calculating 'total_adj' in
+// case the C code is not bit-exact with corresponding sse2 code.
+int vp10_denoiser_filter_c(const uint8_t *sig, int sig_stride,
+                          const uint8_t *mc_avg,
+                          int mc_avg_stride,
+                          uint8_t *avg, int avg_stride,
+                          int increase_denoising,
+                          BLOCK_SIZE bs,
+                          int motion_magnitude) {
+  int r, c;
+  const uint8_t *sig_start = sig;
+  const uint8_t *mc_avg_start = mc_avg;
+  uint8_t *avg_start = avg;
+  int diff, adj, absdiff, delta;
+  int adj_val[] = {3, 4, 6};  // step for |diff| in 4..7, 8..15, and above
+  int total_adj = 0;
+  int shift_inc = 1;
+  // Writes the filtered block to avg; returns FILTER_BLOCK or COPY_BLOCK.
+  // If motion_magnitude is small, making the denoiser more aggressive by
+  // increasing the adjustment for each level. Add another increment for
+  // blocks that are labeled for increase denoising.
+  if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+    if (increase_denoising) {
+      shift_inc = 2;
+    }
+    adj_val[0] += shift_inc;
+    adj_val[1] += shift_inc;
+    adj_val[2] += shift_inc;
+  }
+
+  // First attempt to apply a strong temporal denoising filter.
+  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+    for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) {
+      diff = mc_avg[c] - sig[c];
+      absdiff = abs(diff);
+
+      if (absdiff <= absdiff_thresh(bs, increase_denoising)) {
+        avg[c] = mc_avg[c];
+        total_adj += diff;
+      } else {
+        switch (absdiff) {
+          case 4: case 5: case 6: case 7:
+            adj = adj_val[0];
+            break;
+          case 8: case 9: case 10: case 11:
+          case 12: case 13: case 14: case 15:
+            adj = adj_val[1];
+            break;
+          default:
+            adj = adj_val[2];
+        }
+        if (diff > 0) {
+          avg[c] = VPXMIN(UINT8_MAX, sig[c] + adj);
+          total_adj += adj;
+        } else {
+          avg[c] = VPXMAX(0, sig[c] - adj);
+          total_adj -= adj;
+        }
+      }
+    }
+    sig += sig_stride;
+    avg += avg_stride;
+    mc_avg += mc_avg_stride;
+  }
+
+  // If the strong filter did not modify the signal too much, we're all set.
+  if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) {
+    return FILTER_BLOCK;
+  }
+
+  // Otherwise, we try to dampen the filter if the delta is not too high.
+  delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising))
+           >> num_pels_log2_lookup[bs]) + 1;
+
+  if (delta >= delta_thresh(bs, increase_denoising)) {
+    return COPY_BLOCK;
+  }
+  // Second pass over the same block, restarting from the saved pointers.
+  mc_avg =  mc_avg_start;
+  avg = avg_start;
+  sig = sig_start;
+  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+    for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) {
+      diff = mc_avg[c] - sig[c];
+      adj = abs(diff);
+      if (adj > delta) {
+        adj = delta;
+      }
+      if (diff > 0) {
+        // Diff positive means we made positive adjustment above
+        // (in first try/attempt), so now make negative adjustment to bring
+        // denoised signal down.
+        avg[c] = VPXMAX(0, avg[c] - adj);
+        total_adj -= adj;
+      } else {
+        // Diff negative means we made negative adjustment above
+        // (in first try/attempt), so now make positive adjustment to bring
+        // denoised signal up.
+        avg[c] = VPXMIN(UINT8_MAX, avg[c] + adj);
+        total_adj += adj;
+      }
+    }
+    sig += sig_stride;
+    avg += avg_stride;
+    mc_avg += mc_avg_stride;
+  }
+
+  // We can use the filter if it has been sufficiently dampened
+  if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) {
+    return FILTER_BLOCK;
+  }
+  return COPY_BLOCK;
+}
+
+static uint8_t *block_start(uint8_t *framebuf, int stride,
+                            int mi_row, int mi_col) {
+  return framebuf + (stride * mi_row * 8) + (mi_col * 8);
+}  // NOTE(review): the factor 8 is presumably pixels per mi unit; confirm
+
+static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
+                                                         MACROBLOCK *mb,
+                                                         BLOCK_SIZE bs,
+                                                         int increase_denoising,
+                                                         int mi_row,
+                                                         int mi_col,
+                                                         PICK_MODE_CONTEXT *ctx,
+                                                         int *motion_magnitude
+                                                         ) {
+  int mv_col, mv_row;
+  int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
+  MV_REFERENCE_FRAME frame;
+  MACROBLOCKD *filter_mbd = &mb->e_mbd;
+  MB_MODE_INFO *mbmi = &filter_mbd->mi[0]->mbmi;
+  MB_MODE_INFO saved_mbmi;
+  int i, j;
+  struct buf_2d saved_dst[MAX_MB_PLANE];
+  struct buf_2d saved_pre[MAX_MB_PLANE][2];  // 2 pre buffers
+  // Reports the squared length of the best-SSE motion vector to the caller.
+  mv_col = ctx->best_sse_mv.as_mv.col;
+  mv_row = ctx->best_sse_mv.as_mv.row;
+  *motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+  frame = ctx->best_reference_frame;
+
+  saved_mbmi = *mbmi;
+
+  // If the best reference frame uses inter-prediction and there is enough of a
+  // difference in sum-squared-error, use it.
+  if (frame != INTRA_FRAME &&
+      sse_diff > sse_diff_thresh(bs, increase_denoising, *motion_magnitude)) {
+    mbmi->ref_frame[0] = ctx->best_reference_frame;
+    mbmi->mode = ctx->best_sse_inter_mode;
+    mbmi->mv[0] = ctx->best_sse_mv;
+  } else {
+    // Otherwise, use the zero reference frame.
+    frame = ctx->best_zeromv_reference_frame;
+
+    mbmi->ref_frame[0] = ctx->best_zeromv_reference_frame;
+    mbmi->mode = ZEROMV;
+    mbmi->mv[0].as_int = 0;
+
+    ctx->best_sse_inter_mode = ZEROMV;
+    ctx->best_sse_mv.as_int = 0;
+    ctx->newmv_sse = ctx->zeromv_sse;
+  }
+
+  if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
+    // Restore everything to its original state
+    *mbmi = saved_mbmi;
+    return COPY_BLOCK;
+  }
+  if (*motion_magnitude >
+     (noise_motion_thresh(bs, increase_denoising) << 3)) {
+    // Restore everything to its original state
+    *mbmi = saved_mbmi;
+    return COPY_BLOCK;
+  }
+
+  // We will restore these after motion compensation.
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (j = 0; j < 2; ++j) {
+      saved_pre[i][j] = filter_mbd->plane[i].pre[j];
+    }
+    saved_dst[i] = filter_mbd->plane[i].dst;
+  }
+
+  // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
+  // struct.
+  for (j = 0; j < 2; ++j) {
+    filter_mbd->plane[0].pre[j].buf =
+        block_start(denoiser->running_avg_y[frame].y_buffer,
+                    denoiser->running_avg_y[frame].y_stride,
+                    mi_row, mi_col);
+    filter_mbd->plane[0].pre[j].stride =
+        denoiser->running_avg_y[frame].y_stride;
+    filter_mbd->plane[1].pre[j].buf =
+        block_start(denoiser->running_avg_y[frame].u_buffer,
+                    denoiser->running_avg_y[frame].uv_stride,
+                    mi_row, mi_col);
+    filter_mbd->plane[1].pre[j].stride =
+        denoiser->running_avg_y[frame].uv_stride;
+    filter_mbd->plane[2].pre[j].buf =
+        block_start(denoiser->running_avg_y[frame].v_buffer,
+                    denoiser->running_avg_y[frame].uv_stride,
+                    mi_row, mi_col);
+    filter_mbd->plane[2].pre[j].stride =
+        denoiser->running_avg_y[frame].uv_stride;
+  }
+  filter_mbd->plane[0].dst.buf =
+      block_start(denoiser->mc_running_avg_y.y_buffer,
+                  denoiser->mc_running_avg_y.y_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride;
+  filter_mbd->plane[1].dst.buf =
+      block_start(denoiser->mc_running_avg_y.u_buffer,
+                  denoiser->mc_running_avg_y.uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride;
+  filter_mbd->plane[2].dst.buf =
+      block_start(denoiser->mc_running_avg_y.v_buffer,
+                  denoiser->mc_running_avg_y.uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride;
+
+  // Predict into the mc_running_avg_y destination set up above. The predictor
+  // is assumed to take the block position in mode-info units (see its
+  // declaration in reconinter.h), so pass mi_row/mi_col, not the MV parts.
+  vp10_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs);
+
+  // Restore everything to its original state
+  *mbmi = saved_mbmi;
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (j = 0; j < 2; ++j) {
+      filter_mbd->plane[i].pre[j] = saved_pre[i][j];
+    }
+    filter_mbd->plane[i].dst = saved_dst[i];
+  }
+
+  return FILTER_BLOCK;
+}
+
+void vp10_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
+                          int mi_row, int mi_col, BLOCK_SIZE bs,
+                          PICK_MODE_CONTEXT *ctx) {
+  const YV12_BUFFER_CONFIG *const run_avg =
+      &denoiser->running_avg_y[INTRA_FRAME];
+  const YV12_BUFFER_CONFIG *const pred = &denoiser->mc_running_avg_y;
+  uint8_t *const avg_blk =
+      block_start(run_avg->y_buffer, run_avg->y_stride, mi_row, mi_col);
+  uint8_t *const pred_blk =
+      block_start(pred->y_buffer, pred->y_stride, mi_row, mi_col);
+  const struct buf_2d src = mb->plane[0].src;
+  const int width = num_4x4_blocks_wide_lookup[bs] << 2;
+  const int height = num_4x4_blocks_high_lookup[bs] << 2;
+  int motion_magnitude = 0;
+  VP9_DENOISER_DECISION decision;
+
+  // Motion compensation first; the temporal filter runs only if it passed.
+  decision = perform_motion_compensation(denoiser, mb, bs,
+                                         denoiser->increase_denoising,
+                                         mi_row, mi_col, ctx,
+                                         &motion_magnitude);
+  if (decision == FILTER_BLOCK) {
+    decision = vp10_denoiser_filter(src.buf, src.stride,
+                                    pred_blk, pred->y_stride,
+                                    avg_blk, run_avg->y_stride,
+                                    0, bs, motion_magnitude);
+  }
+  if (decision == FILTER_BLOCK) {
+    // Accepted: overwrite the source block with the denoised average.
+    vpx_convolve_copy(avg_blk, run_avg->y_stride, src.buf, src.stride,
+                      NULL, 0, NULL, 0, width, height);
+  } else {
+    // COPY_BLOCK: refresh the running average from the unfiltered source.
+    vpx_convolve_copy(src.buf, src.stride, avg_blk, run_avg->y_stride,
+                      NULL, 0, NULL, 0, width, height);
+  }
+}
+
+static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) {
+  // Copies the y plane only, one row at a time, using each buffer's stride.
+  const uint8_t *from = src.y_buffer;
+  uint8_t *to = dest.y_buffer;
+  int rows_left;
+  assert(dest.y_width == src.y_width);
+  assert(dest.y_height == src.y_height);
+
+  for (rows_left = dest.y_height; rows_left > 0; --rows_left) {
+    memcpy(to, from, dest.y_width);
+    to += dest.y_stride;
+    from += src.y_stride;
+  }
+}
+
+static void swap_frame_buffer(YV12_BUFFER_CONFIG *dest,
+                              YV12_BUFFER_CONFIG *src) {
+  uint8_t *const src_luma = src->y_buffer;  // only y_buffer is exchanged
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+  src->y_buffer = dest->y_buffer;
+  dest->y_buffer = src_luma;
+}
+
+void vp10_denoiser_update_frame_info(VP9_DENOISER *denoiser,
+                                    YV12_BUFFER_CONFIG src,
+                                    FRAME_TYPE frame_type,
+                                    int refresh_alt_ref_frame,
+                                    int refresh_golden_frame,
+                                    int refresh_last_frame) {
+  YV12_BUFFER_CONFIG *const bufs = denoiser->running_avg_y;
+  int ref;
+
+  if (frame_type == KEY_FRAME) {
+    // Seed every slot from index 1 up with src; INTRA_FRAME is left alone.
+    for (ref = 1; ref < MAX_REF_FRAMES; ++ref)
+      copy_frame(bufs[ref], src);
+    return;
+  }
+
+  // Non-key frames: each refreshed reference trades y buffers with the
+  // INTRA_FRAME slot, in ALTREF, GOLDEN, LAST order.
+  if (refresh_alt_ref_frame) {
+    swap_frame_buffer(&bufs[ALTREF_FRAME], &bufs[INTRA_FRAME]);
+  }
+  if (refresh_golden_frame) {
+    swap_frame_buffer(&bufs[GOLDEN_FRAME], &bufs[INTRA_FRAME]);
+  }
+  if (refresh_last_frame) {
+    swap_frame_buffer(&bufs[LAST_FRAME], &bufs[INTRA_FRAME]);
+  }
+}
+
+void vp10_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
+  ctx->zeromv_sse = UINT_MAX;  // any smaller SSE seen later replaces it
+  ctx->newmv_sse = UINT_MAX;   // likewise for the non-zero-motion candidate
+}
+
+void vp10_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
+                                     PREDICTION_MODE mode,
+                                     PICK_MODE_CONTEXT *ctx) {
+  // TODO(tkopp): Use both MVs if possible
+  if (mbmi->mv[0].as_int == 0) {
+    if (sse < ctx->zeromv_sse) {  // best zero-motion candidate so far
+      ctx->zeromv_sse = sse;
+      ctx->best_zeromv_reference_frame = mbmi->ref_frame[0];
+    }
+  } else if (sse < ctx->newmv_sse) {  // best non-zero-motion candidate
+    ctx->newmv_sse = sse;
+    ctx->best_sse_inter_mode = mode;
+    ctx->best_sse_mv = mbmi->mv[0];
+    ctx->best_reference_frame = mbmi->ref_frame[0];
+  }
+}
+
+int vp10_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
+                       int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       int border) {
+  int i, fail;
+  const int legacy_byte_alignment = 0;
+  assert(denoiser != NULL);
+  // Allocates every denoiser frame buffer; returns 0 on success, 1 on failure.
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    fail = vpx_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
+                                  ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                  use_highbitdepth,
+#endif
+                                  border, legacy_byte_alignment);
+    if (fail) {
+      vp10_denoiser_free(denoiser);
+      return 1;
+    }
+#ifdef OUTPUT_YUV_DENOISED
+    make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+  }
+  // Separate buffer that receives the motion-compensated prediction.
+  fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height,
+                                ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                use_highbitdepth,
+#endif
+                                border, legacy_byte_alignment);
+  if (fail) {
+    vp10_denoiser_free(denoiser);
+    return 1;
+  }
+#ifdef OUTPUT_YUV_DENOISED
+  make_grayscale(&denoiser->mc_running_avg_y);  // i is past the array here
+#endif
+  denoiser->increase_denoising = 0;
+  denoiser->frame_buffer_initialized = 1;
+
+  return 0;
+}
+
+void vp10_denoiser_free(VP9_DENOISER *denoiser) {
+  int i;
+  if (denoiser == NULL) {  // must be tested before the first dereference
+    return;
+  }
+  denoiser->frame_buffer_initialized = 0;
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    vpx_free_frame_buffer(&denoiser->running_avg_y[i]);
+  }
+  vpx_free_frame_buffer(&denoiser->mc_running_avg_y);
+}
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
+  // Fill the u and v planes with the constant UINT8_MAX / 2.
+  const uint8_t neutral = UINT8_MAX / 2;
+  uint8_t *u_row = yuv->u_buffer;
+  uint8_t *v_row = yuv->v_buffer;
+  int row, col;
+
+  for (row = 0; row < yuv->uv_height; ++row) {
+    for (col = 0; col < yuv->uv_width; ++col)
+      u_row[col] = v_row[col] = neutral;
+    u_row += yuv->uv_stride;
+    v_row += yuv->uv_stride;
+  }
+}
+#endif
diff --git a/libs/libvpx/vp10/encoder/denoiser.h b/libs/libvpx/vp10/encoder/denoiser.h
new file mode 100644
index 0000000000..e543fb05fa
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/denoiser.h
@@ -0,0 +1,69 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_DENOISER_H_
+#define VP9_ENCODER_DENOISER_H_
+
+#include "vp10/encoder/block.h"
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
+
+typedef enum vp10_denoiser_decision {
+  COPY_BLOCK,   // NOTE(review): presumably "leave the block unfiltered"
+  FILTER_BLOCK  // NOTE(review): presumably "use the denoised block" - confirm
+} VP9_DENOISER_DECISION;
+
+typedef struct vp10_denoiser {
+  YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];  // all set up by *_alloc()
+  YV12_BUFFER_CONFIG mc_running_avg_y;  // allocated with the same dimensions
+  int increase_denoising;               // reset to 0 by vp10_denoiser_alloc()
+  int frame_buffer_initialized;  // 1 after a successful alloc, 0 after free
+} VP9_DENOISER;
+
+void vp10_denoiser_update_frame_info(VP9_DENOISER *denoiser,
+                                    YV12_BUFFER_CONFIG src,  // passed by value
+                                    FRAME_TYPE frame_type,
+                                    int refresh_alt_ref_frame,
+                                    int refresh_golden_frame,
+                                    int refresh_last_frame);
+
+void vp10_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
+                          int mi_row, int mi_col, BLOCK_SIZE bs,
+                          PICK_MODE_CONTEXT *ctx);
+
+void vp10_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
+
+void vp10_denoiser_update_frame_stats(MB_MODE_INFO *mbmi,
+                                     unsigned int sse, PREDICTION_MODE mode,
+                                     PICK_MODE_CONTEXT *ctx);
+
+int vp10_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
+                       int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       int border);  // returns 0 on success, 1 on failure
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising);  // no vp10_
+#endif
+
+void vp10_denoiser_free(VP9_DENOISER *denoiser);  // frees all frame buffers
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_DENOISER_H_
diff --git a/libs/libvpx/vp10/encoder/encodeframe.c b/libs/libvpx/vp10/encoder/encodeframe.c
new file mode 100644
index 0000000000..26ce5a1ebe
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/encodeframe.c
@@ -0,0 +1,3039 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp10/common/common.h"
+#include "vp10/common/entropy.h"
+#include "vp10/common/entropymode.h"
+#include "vp10/common/idct.h"
+#include "vp10/common/mvref_common.h"
+#include "vp10/common/pred_common.h"
+#include "vp10/common/quant_common.h"
+#include "vp10/common/reconintra.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/common/seg_common.h"
+#include "vp10/common/tile_common.h"
+
+#include "vp10/encoder/aq_complexity.h"
+#include "vp10/encoder/aq_cyclicrefresh.h"
+#include "vp10/encoder/aq_variance.h"
+#include "vp10/encoder/encodeframe.h"
+#include "vp10/encoder/encodemb.h"
+#include "vp10/encoder/encodemv.h"
+#include "vp10/encoder/ethread.h"
+#include "vp10/encoder/extend.h"
+#include "vp10/encoder/rd.h"
+#include "vp10/encoder/rdopt.h"
+#include "vp10/encoder/segmentation.h"
+#include "vp10/encoder/tokenize.h"
+
+static void encode_superblock(VP10_COMP *cpi, ThreadData * td,
+                              TOKENEXTRA **t, int output_enabled,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              PICK_MODE_CONTEXT *ctx);
+
+// This is used as a reference when computing the source variance for the
+//  purposes of activity masking.
+// Eventually this should be replaced by custom no-reference routines,
+//  which will be faster.
+static const uint8_t VP9_VAR_OFFS[64] = {  // all 128; read with stride 0
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static const uint16_t VP9_HIGH_VAR_OFFS_8[64] = {  // bd == 8 and fallback
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_10[64] = {  // bd == 10: 128 * 4 = 512
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = {  // bd == 12: 128 * 16 = 2048
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16
+};
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Variance of |ref| vs. the flat VP9_VAR_OFFS block, normalised for size |bs|.
+unsigned int vp10_get_sby_perpixel_variance(
+    VP10_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs) {
+  unsigned int unused_sse;
+  const unsigned int total =
+      cpi->fn_ptr[bs].vf(ref->buf, ref->stride, VP9_VAR_OFFS, 0, &unused_sse);
+  return ROUND_POWER_OF_TWO(total, num_pels_log2_lookup[bs]);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// High-bitdepth counterpart of vp10_get_sby_perpixel_variance(): the flat
+// reference table is chosen by bit depth |bd| (10, 12, anything else -> 8).
+unsigned int vp10_high_get_sby_perpixel_variance(
+    VP10_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd) {
+  const uint16_t *flat_ref;
+  unsigned int total, unused_sse;
+  switch (bd) {
+    case 10:
+      flat_ref = VP9_HIGH_VAR_OFFS_10;
+      break;
+    case 12:
+      flat_ref = VP9_HIGH_VAR_OFFS_12;
+      break;
+    case 8:
+    default:
+      flat_ref = VP9_HIGH_VAR_OFFS_8;
+      break;
+  }
+  // CONVERT_TO_BYTEPTR adapts the uint16_t table to the vf() argument type.
+  total = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                             CONVERT_TO_BYTEPTR(flat_ref), 0, &unused_sse);
+  return ROUND_POWER_OF_TWO(total, num_pels_log2_lookup[bs]);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Variance of |ref| vs. the co-located LAST_FRAME luma, normalised for |bs|.
+static unsigned int get_sby_perpixel_diff_variance(
+    VP10_COMP *cpi, const struct buf_2d *ref, int mi_row, int mi_col,
+    BLOCK_SIZE bs) {
+  const YV12_BUFFER_CONFIG *const prev = get_ref_frame_buffer(cpi, LAST_FRAME);
+  const uint8_t *prev_luma;
+  unsigned int unused_sse, total;
+  assert(prev != NULL);
+  prev_luma =
+      &prev->y_buffer[mi_row * MI_SIZE * prev->y_stride + mi_col * MI_SIZE];
+  total = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, prev_luma, prev->y_stride,
+                             &unused_sse);
+  return ROUND_POWER_OF_TWO(total, num_pels_log2_lookup[bs]);
+}
+
+// Picks a fixed partition size for the superblock from the 64x64 per-pixel
+// variance against LAST_FRAME; higher variance selects a smaller block:
+//   [0, 8) -> 64x64, [8, 128) -> 32x32, [128, 2048) -> 16x16, else 8x8.
+static BLOCK_SIZE get_rd_var_based_fixed_partition(VP10_COMP *cpi,
+                                                   MACROBLOCK *x,
+                                                   int mi_row,
+                                                   int mi_col) {
+  const unsigned int diff_var = get_sby_perpixel_diff_variance(
+      cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64);
+
+  // Tested from the largest threshold down; the bands do not overlap.
+  if (diff_var >= 2048) return BLOCK_8X8;
+  if (diff_var >= 128) return BLOCK_16X16;
+  if (diff_var >= 8) return BLOCK_32X32;
+  return BLOCK_64X64;
+}
+
+// Lightweight counterpart of set_offsets(): only points xd->mi, xd->mi[0] and
+// x->mbmi_ext at the entries for (mi_row, mi_col).
+static INLINE void set_mode_info_offsets(
+    VP10_COMP *const cpi, MACROBLOCK *const x, MACROBLOCKD *const xd,
+    int mi_row, int mi_col) {
+  VP10_COMMON *const cm = &cpi->common;
+  // The mi arrays use xd->mi_stride as row pitch; mbmi_ext uses cm->mi_cols.
+  const int grid_pos = xd->mi_stride * mi_row + mi_col;
+  const int ext_pos = mi_row * cm->mi_cols + mi_col;
+  xd->mi = &cm->mi_grid_visible[grid_pos];
+  xd->mi[0] = &cm->mi[grid_pos];
+  x->mbmi_ext = &cpi->mbmi_ext_base[ext_pos];
+}
+
+// Full per-block setup for the block at (mi_row, mi_col) of size |bsize|:
+// mode-info and plane pointers, MV limits, RD constants, segment id, tile.
+static void set_offsets(VP10_COMP *cpi, const TileInfo *const tile,
+                        MACROBLOCK *const x, int mi_row, int mi_col,
+                        BLOCK_SIZE bsize) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct segmentation *const seg = &cm->seg;
+  const int bw_mi = num_8x8_blocks_wide_lookup[bsize];
+  const int bh_mi = num_8x8_blocks_high_lookup[bsize];
+  MB_MODE_INFO *mbmi;
+
+  set_skip_context(xd, mi_row, mi_col);
+  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+  mbmi = &xd->mi[0]->mbmi;  // valid only after set_mode_info_offsets()
+
+  // Destination plane pointers.
+  vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+
+  // Motion vector limits: vectors beyond this range do not produce a
+  // new/different prediction block.
+  x->mv_row_min = -(((mi_row + bh_mi) * MI_SIZE) + VP9_INTERP_EXTEND);
+  x->mv_col_min = -(((mi_col + bw_mi) * MI_SIZE) + VP9_INTERP_EXTEND);
+  x->mv_row_max = (cm->mi_rows - mi_row) * MI_SIZE + VP9_INTERP_EXTEND;
+  x->mv_col_max = (cm->mi_cols - mi_col) * MI_SIZE + VP9_INTERP_EXTEND;
+
+  // Distance of the block to the frame edges, in 1/8th pel units. The block
+  // position has to be aligned to the block's own size.
+  assert(!(mi_col & (bw_mi - 1)) && !(mi_row & (bh_mi - 1)));
+  set_mi_row_col(xd, tile, mi_row, bh_mi, mi_col, bw_mi,
+                 cm->mi_rows, cm->mi_cols);
+
+  // Source plane pointers.
+  vp10_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+
+  // R/D setup.
+  x->rddiv = cpi->rd.RDDIV;
+  x->rdmult = cpi->rd.RDMULT;
+
+  // Segment id, plus the quantizer / breakout settings that depend on it.
+  if (!seg->enabled) {
+    mbmi->segment_id = 0;
+    x->encode_breakout = cpi->encode_breakout;
+  } else {
+    if (cpi->oxcf.aq_mode != VARIANCE_AQ) {
+      const uint8_t *const seg_map =
+          seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+      mbmi->segment_id = get_segment_id(cm, seg_map, bsize, mi_row, mi_col);
+    }
+    vp10_init_plane_quantizers(cpi, x);
+    x->encode_breakout = cpi->segment_encode_breakout[mbmi->segment_id];
+  }
+
+  // vp10_append_sub8x8_mvs_for_idx() and vp10_find_best_ref_mvs() need this.
+  xd->tile = *tile;
+}
+
+// Points xd at the mode info for (mi_row, mi_col) and stores |bsize| as its
+// sb_type; positions outside the mi_rows x mi_cols grid are left untouched.
+static void set_block_size(VP10_COMP *const cpi, MACROBLOCK *const x,
+                           MACROBLOCKD *const xd, int mi_row, int mi_col,
+                           BLOCK_SIZE bsize) {
+  const VP10_COMMON *const cm = &cpi->common;
+  if (mi_col >= cm->mi_cols || mi_row >= cm->mi_rows) return;
+  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+  xd->mi[0]->mbmi.sb_type = bsize;
+}
+
+typedef struct {
+  int64_t sum_square_error;  // sum of squared errors over the samples
+  int64_t sum_error;         // sum of errors over the samples
+  int log2_count;            // log2 of the number of samples summed
+  int variance;              // written by get_variance() only
+} var;
+
+typedef struct {
+  var none;     // whole block: vert[0] + vert[1]
+  var horz[2];  // [0] = children 0 + 1, [1] = children 2 + 3
+  var vert[2];  // [0] = children 0 + 2, [1] = children 1 + 3
+} partition_variance;
+
+typedef struct {
+  partition_variance part_variances;
+  var split[4];  // leaves: plain sums, no partition_variance of their own
+} v4x4;
+
+typedef struct {
+  partition_variance part_variances;
+  v4x4 split[4];  // child k sits at column (k & 1), row (k >> 1)
+} v8x8;
+
+typedef struct {
+  partition_variance part_variances;
+  v8x8 split[4];  // child k sits at column (k & 1), row (k >> 1)
+} v16x16;
+
+typedef struct {
+  partition_variance part_variances;
+  v16x16 split[4];  // child k sits at column (k & 1), row (k >> 1)
+} v32x32;
+
+typedef struct {
+  partition_variance part_variances;
+  v32x32 split[4];  // child k sits at column (k & 1), row (k >> 1)
+} v64x64;
+
+typedef struct {
+  partition_variance *part_variances;  // the node's own sums
+  var *split[4];                       // whole-block sums of the 4 children
+} variance_node;
+
+typedef enum {
+  V16X16,
+  V32X32,
+  V64X64,
+} TREE_LEVEL;  // NOTE(review): no use visible nearby - confirm it is needed
+
+// Points |node| at the sums of |data| (a |bsize| tree) and of its 4 children.
+static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
+  int child;
+  node->part_variances = NULL;  // stays NULL if |bsize| is not handled below
+  switch (bsize) {
+    case BLOCK_64X64: {
+      v64x64 *const tree = (v64x64 *)data;
+      node->part_variances = &tree->part_variances;
+      for (child = 0; child < 4; ++child)
+        node->split[child] = &tree->split[child].part_variances.none;
+      break;
+    }
+    case BLOCK_32X32: {
+      v32x32 *const tree = (v32x32 *)data;
+      node->part_variances = &tree->part_variances;
+      for (child = 0; child < 4; ++child)
+        node->split[child] = &tree->split[child].part_variances.none;
+      break;
+    }
+    case BLOCK_16X16: {
+      v16x16 *const tree = (v16x16 *)data;
+      node->part_variances = &tree->part_variances;
+      for (child = 0; child < 4; ++child)
+        node->split[child] = &tree->split[child].part_variances.none;
+      break;
+    }
+    case BLOCK_8X8: {
+      v8x8 *const tree = (v8x8 *)data;
+      node->part_variances = &tree->part_variances;
+      for (child = 0; child < 4; ++child)
+        node->split[child] = &tree->split[child].part_variances.none;
+      break;
+    }
+    case BLOCK_4X4: {
+      v4x4 *const tree = (v4x4 *)data;
+      node->part_variances = &tree->part_variances;
+      for (child = 0; child < 4; ++child)
+        node->split[child] = &tree->split[child];  // children are plain var
+      break;
+    }
+    default:
+      assert(0);
+      break;
+  }
+}
+
+// Stores sum of squares |s2|, sum |s| and log2 count |c| in |v|; v->variance
+// is not touched.
+static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
+  v->log2_count = c; v->sum_error = s;
+  v->sum_square_error = s2;
+}
+
+static void get_variance(var *v) {  // 256 * (s2 - (s * s >> n)) >> n
+  const int64_t sq_mean = (v->sum_error * v->sum_error) >> v->log2_count;
+  v->variance = (int)((256 * (v->sum_square_error - sq_mean)) >> v->log2_count);
+}
+
+static void sum_2_variances(const var *a, const var *b, var *r) {  // r = a + b
+  const int64_t total_sq = a->sum_square_error + b->sum_square_error;
+  assert(a->log2_count == b->log2_count);  // operands must be the same size
+  fill_variance(total_sq, a->sum_error + b->sum_error, a->log2_count + 1, r);
+}
+
+static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
+  variance_node node;  // pointer view of |data|: own sums + 4 children's sums
+  memset(&node, 0, sizeof(node));
+  tree_to_node(data, bsize, &node);
+  sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
+  sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
+  sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]);
+  sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]);
+  sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1],
+                  &node.part_variances->none);  // whole block = both halves
+}
+
+// Decides from the variance tree |data| whether the |bsize| block at
+// (mi_row, mi_col) can be coded without a four-way split.
+//
+// Returns 1 after recording a partition through set_block_size(): either the
+// whole block, or two PARTITION_VERT / PARTITION_HORZ halves. Returns 0 when
+// no partition was recorded, which is always the case when |force_split| is 1
+// or |bsize| is below |bsize_min|. At |bsize_min| only the whole-block test
+// is made. All variances are compared against |threshold|; |x| and |xd| are
+// only used for set_block_size() and the plane[1] block-size check.
+static int set_vt_partitioning(VP10_COMP *cpi, MACROBLOCK *const x,
+                               MACROBLOCKD *const xd, void *data,
+                               BLOCK_SIZE bsize, int mi_row, int mi_col,
+                               int64_t threshold, BLOCK_SIZE bsize_min,
+                               int force_split) {
+  VP10_COMMON *const cm = &cpi->common;
+  const int is_key_frame = (cm->frame_type == KEY_FRAME);
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  const int block_width = num_8x8_blocks_wide_lookup[bsize];
+  const int block_height = num_8x8_blocks_high_lookup[bsize];
+  // Non-zero when the middle column / row of the block is inside the frame.
+  const int mid_col_in_frame = mi_col + block_width / 2 < cm->mi_cols;
+  const int mid_row_in_frame = mi_row + block_height / 2 < cm->mi_rows;
+  variance_node vt;
+  partition_variance *pv;
+
+  assert(block_height == block_width);
+  tree_to_node(data, bsize, &vt);
+  pv = vt.part_variances;
+
+  // Nothing to decide when the caller forces a split or the block is already
+  // smaller than the smallest size this function may select.
+  if (force_split == 1 || bsize < bsize_min) return 0;
+
+  // Otherwise the variance was already computed to set the force_split.
+  if (low_res || is_key_frame) get_variance(&pv->none);
+
+  // For key frame: take split for bsize above 32X32 or very high variance.
+  // This shortcut is not applied at bsize_min.
+  if (bsize > bsize_min && is_key_frame &&
+      (bsize > BLOCK_32X32 || pv->none.variance > (threshold << 4)))
+    return 0;
+
+  // If variance is low, take the bsize (no split).
+  if (mid_col_in_frame && mid_row_in_frame &&
+      pv->none.variance < threshold) {
+    set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+    return 1;
+  }
+
+  // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling) there is no
+  // check for vert/horiz split as too few samples for variance.
+  if (bsize == bsize_min) return 0;
+
+  // Check vertical split.
+  if (mid_row_in_frame) {
+    // Two side-by-side halves; the right one starts half a block to the right.
+    const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
+    get_variance(&pv->vert[0]);
+    get_variance(&pv->vert[1]);
+    if (pv->vert[0].variance < threshold &&
+        pv->vert[1].variance < threshold &&
+        get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+      set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
+      return 1;
+    }
+  }
+
+  // Check horizontal split.
+  if (mid_col_in_frame) {
+    // Two stacked halves; the lower one starts half a block further down.
+    const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
+    get_variance(&pv->horz[0]);
+    get_variance(&pv->horz[1]);
+    if (pv->horz[0].variance < threshold &&
+        pv->horz[1].variance < threshold &&
+        get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+      set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+// Derives the variance split thresholds for quantizer index |q|:
+//   thresholds[0] - 64x64, [1] - 32x32, [2] - 16x16, [3] - 8x8 (split to 4x4).
+// thresholds[3] is written on key frames only; on other frames it keeps
+// whatever value the caller's array already holds.
+static void set_vbp_thresholds(VP10_COMP *cpi, int64_t thresholds[], int q) {
+  VP10_COMMON *const cm = &cpi->common;
+  const int is_key_frame = (cm->frame_type == KEY_FRAME);
+  const int scale = is_key_frame ? 20 : 1;
+  const int64_t base = (int64_t)(scale * cpi->y_dequant[q][1]);
+
+  if (is_key_frame) {
+    thresholds[0] = base;
+    thresholds[1] = base >> 2;
+    thresholds[2] = base >> 2;
+    thresholds[3] = base << 2;
+    return;
+  }
+
+  if (cm->width <= 352 && cm->height <= 288) {
+    thresholds[0] = base >> 2;
+    thresholds[1] = base;
+    thresholds[2] = base << 3;
+  } else {
+    const int is_full_hd = (cm->width >= 1920 && cm->height >= 1080);
+    thresholds[0] = base;
+    thresholds[1] = ((is_full_hd ? 7 : 5) * base) >> 2;
+    thresholds[2] = base << cpi->oxcf.speed;
+  }
+}
+
+// Refreshes cpi's variance-partition settings for quantizer index |q|; a no-op
+// unless the search type is VAR_BASED_PARTITION or REFERENCE_PARTITION.
+void vp10_set_variance_partition_thresholds(VP10_COMP *cpi, int q) {
+  VP10_COMMON *const cm = &cpi->common;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  if (sf->partition_search_type != VAR_BASED_PARTITION &&
+      sf->partition_search_type != REFERENCE_PARTITION)
+    return;
+
+  set_vbp_thresholds(cpi, cpi->vbp_thresholds, q);
+  // The thresholds below are not changed locally.
+  cpi->vbp_threshold_minmax = 15 + (q >> 3);
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->vbp_threshold_sad = 0;
+    cpi->vbp_bsize_min = BLOCK_8X8;
+  } else {
+    if (cm->width <= 352 && cm->height <= 288)
+      cpi->vbp_threshold_sad = 100;
+    else
+      cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000 ?
+          (cpi->y_dequant[q][1] << 1) : 1000;
+    cpi->vbp_bsize_min = BLOCK_16X16;
+  }
+}
+
+// Measures how unevenly the (max - min) ranges reported by vpx_minmax_8x8()
+// are spread over the four 8x8 sub-blocks of the 16x16 block at
+// (x16_idx, y16_idx): returns the largest range minus the smallest one.
+// Sub-blocks starting outside pixels_wide x pixels_high are ignored.
+static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
+                              int dp, int x16_idx, int y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+                              int highbd_flag,
+#endif
+                              int pixels_wide, int pixels_high) {
+  int largest_range = 0;
+  int smallest_range = 255;
+  int k;
+  // Loop over the 4 8x8 subblocks.
+  for (k = 0; k < 4; k++) {
+    const int x8_idx = x16_idx + ((k & 1) << 3);
+    const int y8_idx = y16_idx + ((k >> 1) << 3);
+    int lo = 0, hi = 0, range;
+    // Skip sub-blocks that start at or beyond pixels_wide / pixels_high.
+    if (x8_idx >= pixels_wide || y8_idx >= pixels_high) continue;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+      vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+                            d + y8_idx * dp + x8_idx, dp,
+                            &lo, &hi);
+    } else {
+      vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+                     d + y8_idx * dp + x8_idx, dp,
+                     &lo, &hi);
+    }
+#else
+    vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+                   d + y8_idx * dp + x8_idx, dp,
+                   &lo, &hi);
+#endif
+    range = hi - lo;
+    if (range > largest_range) largest_range = range;
+    if (range < smallest_range) smallest_range = range;
+  }
+  return largest_range - smallest_range;
+}
+
+// Seeds the four 4x4 leaves under the 8x8 node |vst| at (x8_idx, y8_idx) with
+// single-sample sums of avg(source) - avg(reference); on key frames the
+// reference average is fixed at 128. Out-of-range leaves get zero sums.
+static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
+                                 int dp, int x8_idx, int y8_idx, v8x8 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 int highbd_flag,
+#endif
+                                 int pixels_wide, int pixels_high,
+                                 int is_key_frame) {
+  int k;
+  for (k = 0; k < 4; k++) {
+    const int x4_idx = x8_idx + ((k & 1) << 2);
+    const int y4_idx = y8_idx + ((k >> 1) << 2);
+    unsigned int sse = 0;
+    int sum = 0;
+    if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+      const uint8_t *const src_blk = s + y4_idx * sp + x4_idx;
+      const int ref_off = y4_idx * dp + x4_idx;  // only applied to |d| if used
+      int s_avg, d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+        s_avg = vpx_highbd_avg_4x4(src_blk, sp);
+        if (!is_key_frame) d_avg = vpx_highbd_avg_4x4(d + ref_off, dp);
+      } else {
+        s_avg = vpx_avg_4x4(src_blk, sp);
+        if (!is_key_frame) d_avg = vpx_avg_4x4(d + ref_off, dp);
+      }
+#else
+      s_avg = vpx_avg_4x4(src_blk, sp);
+      if (!is_key_frame) d_avg = vpx_avg_4x4(d + ref_off, dp);
+#endif
+      sum = s_avg - d_avg;
+      sse = sum * sum;
+    }
+    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+  }
+}
+
+// Seeds the four 8x8 children of the 16x16 node |vst| at (x16_idx, y16_idx)
+// with single-sample sums of avg(source) - avg(reference); on key frames the
+// reference average is fixed at 128. Out-of-range children get zero sums.
+static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
+                                 int dp, int x16_idx, int y16_idx, v16x16 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 int highbd_flag,
+#endif
+                                 int pixels_wide, int pixels_high,
+                                 int is_key_frame) {
+  int k;
+  for (k = 0; k < 4; k++) {
+    const int x8_idx = x16_idx + ((k & 1) << 3);
+    const int y8_idx = y16_idx + ((k >> 1) << 3);
+    unsigned int sse = 0;
+    int sum = 0;
+    if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+      const uint8_t *const src_blk = s + y8_idx * sp + x8_idx;
+      const int ref_off = y8_idx * dp + x8_idx;  // only applied to |d| if used
+      int s_avg, d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+        s_avg = vpx_highbd_avg_8x8(src_blk, sp);
+        if (!is_key_frame) d_avg = vpx_highbd_avg_8x8(d + ref_off, dp);
+      } else {
+        s_avg = vpx_avg_8x8(src_blk, sp);
+        if (!is_key_frame) d_avg = vpx_avg_8x8(d + ref_off, dp);
+      }
+#else
+      s_avg = vpx_avg_8x8(src_blk, sp);
+      if (!is_key_frame) d_avg = vpx_avg_8x8(d + ref_off, dp);
+#endif
+      sum = s_avg - d_avg;
+      sse = sum * sum;
+    }
+    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+  }
+}
+
+// This function chooses partitioning based on the variance between source and
+// reconstructed last, where variance is computed for down-sampled inputs.
+static int choose_partitioning(VP10_COMP *cpi,
+                                const TileInfo *const tile,
+                                MACROBLOCK *x,
+                                int mi_row, int mi_col) {
+  VP10_COMMON * const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int i, j, k, m;
+  v64x64 vt;
+  v16x16 vt2[16];
+  int force_split[21];
+  uint8_t *s;
+  const uint8_t *d;
+  int sp;
+  int dp;
+  int pixels_wide = 64, pixels_high = 64;
+  int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
+      cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
+
+  // Always use 4x4 partition for key frame.
+  const int is_key_frame = (cm->frame_type == KEY_FRAME);
+  const int use_4x4_partition = is_key_frame;
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  int variance4x4downsample[16];
+
+  int segment_id = CR_SEGMENT_ID_BASE;
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+    const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
+                                                    cm->last_frame_seg_map;
+    segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+
+    if (cyclic_refresh_segment_id_boosted(segment_id)) {
+      int q = vp10_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+      set_vbp_thresholds(cpi, thresholds, q);
+    }
+  }
+
+  set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+
+  if (xd->mb_to_right_edge < 0)
+    pixels_wide += (xd->mb_to_right_edge >> 3);
+  if (xd->mb_to_bottom_edge < 0)
+    pixels_high += (xd->mb_to_bottom_edge >> 3);
+
+  s = x->plane[0].src.buf;
+  sp = x->plane[0].src.stride;
+
+  if (!is_key_frame) {
+    MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+    unsigned int uv_sad;
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+
+    const YV12_BUFFER_CONFIG *yv12_g = NULL;
+    unsigned int y_sad, y_sad_g;
+    const BLOCK_SIZE bsize = BLOCK_32X32
+        + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows);
+
+    assert(yv12 != NULL);
+    yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+
+    if (yv12_g && yv12_g != yv12) {
+      vp10_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                           &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+      y_sad_g = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
+                                       x->plane[0].src.stride,
+                                       xd->plane[0].pre[0].buf,
+                                       xd->plane[0].pre[0].stride);
+    } else {
+      y_sad_g = UINT_MAX;
+    }
+
+    vp10_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                         &cm->frame_refs[LAST_FRAME - 1].sf);
+    mbmi->ref_frame[0] = LAST_FRAME;
+    mbmi->ref_frame[1] = NONE;
+    mbmi->sb_type = BLOCK_64X64;
+    mbmi->mv[0].as_int = 0;
+    mbmi->interp_filter = BILINEAR;
+
+    y_sad = vp10_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+    if (y_sad_g < y_sad) {
+      vp10_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                           &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+      mbmi->ref_frame[0] = GOLDEN_FRAME;
+      mbmi->mv[0].as_int = 0;
+      y_sad = y_sad_g;
+    } else {
+      x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv;
+    }
+
+    vp10_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
+
+    for (i = 1; i <= 2; ++i) {
+      struct macroblock_plane  *p = &x->plane[i];
+      struct macroblockd_plane *pd = &xd->plane[i];
+      const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+
+      if (bs == BLOCK_INVALID)
+        uv_sad = UINT_MAX;
+      else
+        uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
+                                     pd->dst.buf, pd->dst.stride);
+
+      x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
+    }
+
+    d = xd->plane[0].dst.buf;
+    dp = xd->plane[0].dst.stride;
+
+    // If the y_sad is very small, take 64x64 as partition and exit.
+    // Don't check on boosted segment for now, as 64x64 is suppressed there.
+    if (segment_id == CR_SEGMENT_ID_BASE &&
+        y_sad < cpi->vbp_threshold_sad) {
+      const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+      const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64];
+      if (mi_col + block_width / 2 < cm->mi_cols &&
+          mi_row + block_height / 2 < cm->mi_rows) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64);
+        return 0;
+      }
+    }
+  } else {
+    d = VP9_VAR_OFFS;
+    dp = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      switch (xd->bd) {
+        case 10:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10);
+          break;
+        case 12:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12);
+          break;
+        case 8:
+        default:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8);
+          break;
+      }
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+
+  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
+  // 5-20 for the 16x16 blocks.
+  force_split[0] = 0;
+  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
+  // for splits.
+  for (i = 0; i < 4; i++) {
+    const int x32_idx = ((i & 1) << 5);
+    const int y32_idx = ((i >> 1) << 5);
+    const int i2 = i << 2;
+    force_split[i + 1] = 0;
+    for (j = 0; j < 4; j++) {
+      const int x16_idx = x32_idx + ((j & 1) << 4);
+      const int y16_idx = y32_idx + ((j >> 1) << 4);
+      const int split_index = 5 + i2 + j;
+      v16x16 *vst = &vt.split[i].split[j];
+      force_split[split_index] = 0;
+      variance4x4downsample[i2 + j] = 0;
+      if (!is_key_frame) {
+        fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+                            xd->cur_buf->flags,
+#endif
+                            pixels_wide,
+                            pixels_high,
+                            is_key_frame);
+        fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
+        get_variance(&vt.split[i].split[j].part_variances.none);
+        if (vt.split[i].split[j].part_variances.none.variance >
+            thresholds[2]) {
+          // 16X16 variance is above threshold for split, so force split to 8x8
+          // for this 16x16 block (this also forces splits for upper levels).
+          force_split[split_index] = 1;
+          force_split[i + 1] = 1;
+          force_split[0] = 1;
+        } else if (vt.split[i].split[j].part_variances.none.variance >
+                   thresholds[1] &&
+                   !cyclic_refresh_segment_id_boosted(segment_id)) {
+          // We have some nominal amount of 16x16 variance (based on average),
+          // compute the minmax over the 8x8 sub-blocks, and if above threshold,
+          // force split to 8x8 block for this 16x16 block.
+          int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                          xd->cur_buf->flags,
+#endif
+                                          pixels_wide, pixels_high);
+          if (minmax > cpi->vbp_threshold_minmax) {
+            force_split[split_index] = 1;
+            force_split[i + 1] = 1;
+            force_split[0] = 1;
+          }
+        }
+      }
+      if (is_key_frame || (low_res &&
+          vt.split[i].split[j].part_variances.none.variance >
+          (thresholds[1] << 1))) {
+        force_split[split_index] = 0;
+        // Go down to 4x4 down-sampling for variance.
+        variance4x4downsample[i2 + j] = 1;
+        for (k = 0; k < 4; k++) {
+          int x8_idx = x16_idx + ((k & 1) << 3);
+          int y8_idx = y16_idx + ((k >> 1) << 3);
+          v8x8 *vst2 = is_key_frame ? &vst->split[k] :
+              &vt2[i2 + j].split[k];
+          fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               xd->cur_buf->flags,
+#endif
+                               pixels_wide,
+                               pixels_high,
+                               is_key_frame);
+        }
+      }
+    }
+  }
+
+  // Fill the rest of the variance tree by summing split partition values.
+  for (i = 0; i < 4; i++) {
+    const int i2 = i << 2;
+    for (j = 0; j < 4; j++) {
+      if (variance4x4downsample[i2 + j] == 1) {
+        v16x16 *vtemp = (!is_key_frame) ? &vt2[i2 + j] :
+            &vt.split[i].split[j];
+        for (m = 0; m < 4; m++)
+          fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
+        fill_variance_tree(vtemp, BLOCK_16X16);
+      }
+    }
+    fill_variance_tree(&vt.split[i], BLOCK_32X32);
+    // If variance of this 32x32 block is above the threshold, force the block
+    // to split. This also forces a split on the upper (64x64) level.
+    if (!force_split[i + 1]) {
+      get_variance(&vt.split[i].part_variances.none);
+      if (vt.split[i].part_variances.none.variance > thresholds[1]) {
+        force_split[i + 1] = 1;
+        force_split[0] = 1;
+      }
+    }
+  }
+  if (!force_split[0]) {
+    fill_variance_tree(&vt, BLOCK_64X64);
+    get_variance(&vt.part_variances.none);
+  }
+
+  // Now go through the entire structure, splitting every block size until
+  // we get to one that's got a variance lower than our threshold.
+  if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
+      !set_vt_partitioning(cpi, x, xd, &vt, BLOCK_64X64, mi_row, mi_col,
+                           thresholds[0], BLOCK_16X16, force_split[0])) {
+    for (i = 0; i < 4; ++i) {
+      const int x32_idx = ((i & 1) << 2);
+      const int y32_idx = ((i >> 1) << 2);
+      const int i2 = i << 2;
+      if (!set_vt_partitioning(cpi, x, xd, &vt.split[i], BLOCK_32X32,
+                               (mi_row + y32_idx), (mi_col + x32_idx),
+                               thresholds[1], BLOCK_16X16,
+                               force_split[i + 1])) {
+        for (j = 0; j < 4; ++j) {
+          const int x16_idx = ((j & 1) << 1);
+          const int y16_idx = ((j >> 1) << 1);
+          // For inter frames: if variance4x4downsample[] == 1 for this 16x16
+          // block, then the variance is based on 4x4 down-sampling, so use vt2
+          // in set_vt_partitioning(), otherwise use vt.
+          v16x16 *vtemp = (!is_key_frame &&
+                           variance4x4downsample[i2 + j] == 1) ?
+                           &vt2[i2 + j] : &vt.split[i].split[j];
+          if (!set_vt_partitioning(cpi, x, xd, vtemp, BLOCK_16X16,
+                                   mi_row + y32_idx + y16_idx,
+                                   mi_col + x32_idx + x16_idx,
+                                   thresholds[2],
+                                   cpi->vbp_bsize_min,
+                                   force_split[5 + i2  + j])) {
+            for (k = 0; k < 4; ++k) {
+              const int x8_idx = (k & 1);
+              const int y8_idx = (k >> 1);
+              if (use_4x4_partition) {
+                if (!set_vt_partitioning(cpi, x, xd, &vtemp->split[k],
+                                         BLOCK_8X8,
+                                         mi_row + y32_idx + y16_idx + y8_idx,
+                                         mi_col + x32_idx + x16_idx + x8_idx,
+                                         thresholds[3], BLOCK_8X8, 0)) {
+                  set_block_size(cpi, x, xd,
+                                 (mi_row + y32_idx + y16_idx + y8_idx),
+                                 (mi_col + x32_idx + x16_idx + x8_idx),
+                                 BLOCK_4X4);
+                }
+              } else {
+                set_block_size(cpi, x, xd,
+                               (mi_row + y32_idx + y16_idx + y8_idx),
+                               (mi_col + x32_idx + x16_idx + x8_idx),
+                               BLOCK_8X8);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return 0;
+}
+// Applies the mode decision saved in ctx to the block at (mi_row, mi_col).
+static void update_state(VP10_COMP *cpi, ThreadData *td,
+                         PICK_MODE_CONTEXT *ctx,
+                         int mi_row, int mi_col, BLOCK_SIZE bsize,
+                         int output_enabled) {
+  int i, x_idx, y;
+  VP10_COMMON *const cm = &cpi->common;
+  RD_COUNTS *const rdc = &td->rd_counts;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  MODE_INFO *mi = &ctx->mic;  // mode info saved in the context
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *mi_addr = xd->mi[0];  // destination for the saved mode info
+  const struct segmentation *const seg = &cm->seg;
+  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+  const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);  // clipped to frame
+  const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);  // clipped to frame
+  MV_REF *const frame_mvs =
+      cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int w, h;
+
+  const int mis = cm->mi_stride;
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  int max_plane;
+
+  assert(mi->mbmi.sb_type == bsize);
+  // Publish the saved mode info and its extension data for this block.
+  *mi_addr = *mi;
+  *x->mbmi_ext = ctx->mbmi_ext;
+
+  // If segmentation is in use, refresh this block's segment id.
+  if (seg->enabled) {
+    // For in frame complexity AQ copy the segment id from the segment map.
+    if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      mi_addr->mbmi.segment_id =
+        get_segment_id(cm, map, bsize, mi_row, mi_col);
+    }
+    // Else for cyclic refresh mode update the segment map, set the segment id
+    // and then update the quantizer.
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+      vp10_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row,
+                                        mi_col, bsize, ctx->rate, ctx->dist,
+                                        x->skip);
+    }
+  }
+  // Planes below max_plane use buffer set 1; the remaining planes use set 2.
+  max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
+  for (i = 0; i < max_plane; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][1];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+    p[i].eobs = ctx->eobs_pbuf[i][1];
+  }
+
+  for (i = max_plane; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][2];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
+    p[i].eobs = ctx->eobs_pbuf[i][2];
+  }
+
+  for (i = 0; i < 2; ++i)
+    pd[i].color_index_map = ctx->color_index_map[i];
+
+  // Point each mode info entry covered by the block (and accepted by the
+  // right/bottom edge test below) at mi_addr.
+  for (y = 0; y < mi_height; y++)
+    for (x_idx = 0; x_idx < mi_width; x_idx++)
+      if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx
+        && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+        xd->mi[x_idx + y * mis] = mi_addr;
+      }
+
+  if (cpi->oxcf.aq_mode)
+    vp10_init_plane_quantizers(cpi, x);
+  // Sub-8x8 inter blocks: the block-level MVs are those of sub-block 3.
+  if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
+    mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+    mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+  }
+
+  x->skip = ctx->skip;
+  memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
+         sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+  // Everything below (stats, counts, frame MVs) requires output_enabled.
+  if (!output_enabled)
+    return;
+
+#if CONFIG_INTERNAL_STATS
+  if (frame_is_intra_only(cm)) {
+    static const int kf_mode_index[] = {
+      THR_DC        /*DC_PRED*/,
+      THR_V_PRED    /*V_PRED*/,
+      THR_H_PRED    /*H_PRED*/,
+      THR_D45_PRED  /*D45_PRED*/,
+      THR_D135_PRED /*D135_PRED*/,
+      THR_D117_PRED /*D117_PRED*/,
+      THR_D153_PRED /*D153_PRED*/,
+      THR_D207_PRED /*D207_PRED*/,
+      THR_D63_PRED  /*D63_PRED*/,
+      THR_TM        /*TM_PRED*/,
+    };
+    ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]];
+  } else {
+    // Note how often each mode chosen as best
+    ++cpi->mode_chosen_counts[ctx->best_mode_index];
+  }
+#endif
+  if (!frame_is_intra_only(cm)) {
+    if (is_inter_block(mbmi)) {
+      vp10_update_mv_count(td);
+      // NOTE(review): the local ctx below shadows the ctx parameter.
+      if (cm->interp_filter == SWITCHABLE) {
+        const int ctx = vp10_get_pred_context_switchable_interp(xd);
+        ++td->counts->switchable_interp[ctx][mbmi->interp_filter];
+      }
+    }
+
+    rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+    rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+    rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+      rdc->filter_diff[i] += ctx->best_filter_diff[i];
+  }
+  // Record this block's reference frames and MVs in the frame MV buffer.
+  for (h = 0; h < y_mis; ++h) {
+    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+    for (w = 0; w < x_mis; ++w) {
+      MV_REF *const mv = frame_mv + w;
+      mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+      mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+      mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+      mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+    }
+  }
+}
+
+// Makes src the current frame for x and points each plane's source at it.
+void vp10_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  uint8_t *const bufs[3] = { src->y_buffer, src->u_buffer, src->v_buffer };
+  const int strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+  int plane;
+  xd->cur_buf = src;
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    setup_pred_plane(&x->plane[plane].src, bufs[plane], strides[plane], mi_row,
+                     mi_col, NULL, pd->subsampling_x, pd->subsampling_y);
+  }
+}
+
+// Re-initialises x's plane quantizers, then returns vp10_compute_rd_mult()
+// for segment_id's qindex offset by the frame's y_dc_delta_q.
+static int set_segment_rdmult(VP10_COMP *const cpi, MACROBLOCK *const x,
+                              int8_t segment_id) {
+  VP10_COMMON *const common = &cpi->common;
+  int qindex;
+  vp10_init_plane_quantizers(cpi, x);
+  vpx_clear_system_state();
+  qindex = vp10_get_qindex(&common->seg, segment_id, common->base_qindex);
+  return vp10_compute_rd_mult(cpi, qindex + common->y_dc_delta_q);
+}
+// Picks the coding mode for one block; the RD cost is returned in rd_cost.
+static void rd_pick_sb_modes(VP10_COMP *cpi,
+                             TileDataEnc *tile_data,
+                             MACROBLOCK *const x,
+                             int mi_row, int mi_col, RD_COST *rd_cost,
+                             BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                             int64_t best_rd) {
+  VP10_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
+  int i, orig_rdmult;
+
+  vpx_clear_system_state();
+
+  // Use the lower precision, but faster, 32x32 fdct for mode selection.
+  x->use_lp32x32fdct = 1;
+
+  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+  mbmi = &xd->mi[0]->mbmi;
+  mbmi->sb_type = bsize;
+  // The mode search works in buffer set 0 of the pick-mode context.
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][0];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+    p[i].eobs = ctx->eobs_pbuf[i][0];
+  }
+
+  for (i = 0; i < 2; ++i)
+    pd[i].color_index_map = ctx->color_index_map[i];
+
+  ctx->is_coded = 0;
+  ctx->skippable = 0;
+  ctx->pred_pixel_ready = 0;
+  x->skip_recode = 0;
+
+  // Set to zero to make sure we do not use the previous encoded frame stats
+  mbmi->skip = 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    x->source_variance =
+        vp10_high_get_sby_perpixel_variance(cpi, &x->plane[0].src,
+                                            bsize, xd->bd);
+  } else {
+    x->source_variance =
+      vp10_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  }
+#else
+  x->source_variance =
+    vp10_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  orig_rdmult = x->rdmult;
+  // The active AQ mode may pick a segment id and a segment-specific rdmult.
+  if (aq_mode == VARIANCE_AQ) {
+    const int energy = bsize <= BLOCK_16X16 ? x->mb_energy
+                                            : vp10_block_energy(cpi, x, bsize);
+    if (cm->frame_type == KEY_FRAME ||
+        cpi->refresh_alt_ref_frame ||
+        (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+      mbmi->segment_id = vp10_vaq_segment_id(energy);
+    } else {
+      const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
+                                                    : cm->last_frame_seg_map;
+      mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+    }
+    x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+  } else if (aq_mode == COMPLEXITY_AQ) {
+    x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+  } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+    const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
+                                                  : cm->last_frame_seg_map;
+    // If segment is boosted, use rdmult for that segment.
+    if (cyclic_refresh_segment_id_boosted(
+            get_segment_id(cm, map, bsize, mi_row, mi_col)))
+      x->rdmult = vp10_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+  }
+
+  // Find best coding mode & reconstruct the MB so it is available
+  // as a predictor for MBs that follow in the SB
+  if (frame_is_intra_only(cm)) {
+    vp10_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
+  } else {
+    if (bsize >= BLOCK_8X8) {
+      if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+        vp10_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
+                                           ctx, best_rd);
+      else
+        vp10_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col,
+                                  rd_cost, bsize, ctx, best_rd);
+    } else {
+      vp10_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col,
+                                    rd_cost, bsize, ctx, best_rd);
+    }
+  }
+
+
+  // Examine the resulting rate and, for complexity AQ, make a segment choice.
+  if ((rd_cost->rate != INT_MAX) &&
+      (aq_mode == COMPLEXITY_AQ) && (bsize >= BLOCK_16X16) &&
+      (cm->frame_type == KEY_FRAME ||
+       cpi->refresh_alt_ref_frame ||
+       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
+    vp10_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+  }
+  // Undo any AQ-specific rdmult change made above.
+  x->rdmult = orig_rdmult;
+
+  // TODO(jingning) The rate-distortion optimization flow needs to be
+  // refactored to provide proper exit/return handle.
+  if (rd_cost->rate == INT_MAX)
+    rd_cost->rdcost = INT64_MAX;
+  // Keep the result in the pick-mode context; update_state() reads it.
+  ctx->rate = rd_cost->rate;
+  ctx->dist = rd_cost->dist;
+}
+// Accumulates reference and inter-mode counts for the block at xd->mi[0].
+static void update_stats(VP10_COMMON *cm, ThreadData *td) {
+  const MACROBLOCK *x = &td->mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MODE_INFO *const mi = xd->mi[0];
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  // Intra-only frames update none of these counters.
+  if (!frame_is_intra_only(cm)) {
+    FRAME_COUNTS *const counts = td->counts;
+    const int inter_block = is_inter_block(mbmi);
+    const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id,
+                                                 SEG_LVL_REF_FRAME);
+    if (!seg_ref_active) {
+      counts->intra_inter[vp10_get_intra_inter_context(xd)][inter_block]++;
+      // If the segment reference feature is enabled we have only a single
+      // reference frame allowed for the segment so exclude it from
+      // the reference frame counts used to work out probabilities.
+      if (inter_block) {
+        const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+        if (cm->reference_mode == REFERENCE_MODE_SELECT)
+          counts->comp_inter[vp10_get_reference_mode_context(cm, xd)]
+                            [has_second_ref(mbmi)]++;
+
+        if (has_second_ref(mbmi)) {
+          counts->comp_ref[vp10_get_pred_context_comp_ref_p(cm, xd)]
+                          [ref0 == GOLDEN_FRAME]++;
+        } else {
+          counts->single_ref[vp10_get_pred_context_single_ref_p1(xd)][0]
+                            [ref0 != LAST_FRAME]++;
+          if (ref0 != LAST_FRAME)
+            counts->single_ref[vp10_get_pred_context_single_ref_p2(xd)][1]
+                              [ref0 != GOLDEN_FRAME]++;
+        }
+      }
+    }
+    if (inter_block &&
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      const int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+      if (bsize >= BLOCK_8X8) {
+        const PREDICTION_MODE mode = mbmi->mode;
+        ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+      } else {
+        const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+        const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+        int idx, idy;
+        for (idy = 0; idy < 2; idy += num_4x4_h) {  // 2x2 grid, block steps
+          for (idx = 0; idx < 2; idx += num_4x4_w) {
+            const int j = idy * 2 + idx;  // index into mi->bmi[]
+            const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+            ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+          }
+        }
+      }
+    }
+  }
+}
+// Copies the saved contexts in a, l, sa and sl back into xd for this block.
+static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col,
+                            ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                            ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                            PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+                            BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int p;  // plane index
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  int mi_height = num_8x8_blocks_high_lookup[bsize];
+  for (p = 0; p < MAX_MB_PLANE; p++) {
+    memcpy(  // above entropy context; offset and size shifted by subsampling_x
+        xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
+        a + num_4x4_blocks_wide * p,
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+        xd->plane[p].subsampling_x);
+    memcpy(  // left entropy context; offset and size shifted by subsampling_y
+        xd->left_context[p]
+            + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+        l + num_4x4_blocks_high * p,
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+        xd->plane[p].subsampling_y);
+  }
+  memcpy(xd->above_seg_context + mi_col, sa,  // above partition context
+         sizeof(*xd->above_seg_context) * mi_width);
+  memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,  // left partition ctx
+         sizeof(xd->left_seg_context[0]) * mi_height);
+}
+// Saves xd's above/left entropy and partition contexts into a, l, sa, sl.
+static void save_context(MACROBLOCK *const x, int mi_row, int mi_col,
+                         ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                         ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                         PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+                         BLOCK_SIZE bsize) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  int p;  // plane index
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  int mi_height = num_8x8_blocks_high_lookup[bsize];
+
+  // Buffer the above/left entropy context of the block being searched.
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    memcpy(  // above context; offset and size shifted by subsampling_x
+        a + num_4x4_blocks_wide * p,
+        xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+        xd->plane[p].subsampling_x);
+    memcpy(  // left context; offset and size shifted by subsampling_y
+        l + num_4x4_blocks_high * p,
+        xd->left_context[p]
+            + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+        xd->plane[p].subsampling_y);
+  }
+  memcpy(sa, xd->above_seg_context + mi_col,  // above partition context
+         sizeof(*xd->above_seg_context) * mi_width);
+  memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),  // left partition ctx
+         sizeof(xd->left_seg_context[0]) * mi_height);
+}
+
+// Encodes one block: sets the offsets, applies the mode saved in ctx,
+// encodes the block and, when output is enabled, updates the counts.
+static void encode_b(VP10_COMP *cpi, const TileInfo *const tile,
+                     ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col,
+                     int output_enabled, BLOCK_SIZE bsize,
+                     PICK_MODE_CONTEXT *ctx) {
+  set_offsets(cpi, tile, &td->mb, mi_row, mi_col, bsize);
+  update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled);
+  encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+
+  if (!output_enabled)
+    return;
+  update_stats(&cpi->common, td);
+}
+// Recursively encodes the block of bsize at (mi_row, mi_col) per pc_tree.
+static void encode_sb(VP10_COMP *cpi, ThreadData *td,
+                      const TileInfo *const tile,
+                      TOKENEXTRA **tp, int mi_row, int mi_col,
+                      int output_enabled, BLOCK_SIZE bsize,
+                      PC_TREE *pc_tree) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  // hbs is the mi_row/mi_col step to the second half of the block.
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  int ctx;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize = bsize;
+  // Blocks that start outside the frame are skipped entirely.
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  if (bsize >= BLOCK_8X8) {
+    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+    subsize = get_subsize(bsize, pc_tree->partitioning);
+  } else {
+    ctx = 0;
+    subsize = BLOCK_4X4;
+  }
+
+  partition = partition_lookup[bsl][subsize];
+  if (output_enabled && bsize != BLOCK_4X4)
+    td->counts->partition[ctx][partition]++;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+               &pc_tree->none);
+      break;
+    case PARTITION_VERT:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+               &pc_tree->vertical[0]);
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+        encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled,
+                 subsize, &pc_tree->vertical[1]);
+      }
+      break;
+    case PARTITION_HORZ:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+               &pc_tree->horizontal[0]);
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+        encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled,
+                 subsize, &pc_tree->horizontal[1]);
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {
+        encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+                 pc_tree->leaf_split[0]);
+      } else {
+        encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
+                  pc_tree->split[0]);
+        encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
+                  subsize, pc_tree->split[1]);
+        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
+                  subsize, pc_tree->split[2]);
+        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+                  subsize, pc_tree->split[3]);
+      }
+      break;
+    default:
+      assert(0 && "Invalid partition type.");
+      break;
+  }
+  // A split above 8x8 is not updated here; the recursive calls handle it.
+  if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+}
+
+// Returns the first size in the sequence bsize, bsize - 3, ... whose height
+// and width in 8x8 units fit in rows_left x cols_left. *bh and *bw receive
+// the dimensions of each size tried; with nothing left they are not written.
+static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
+                                      int rows_left, int cols_left,
+                                      int *bh, int *bw) {
+  // No rows or columns remain: just cap the size at 8x8.
+  if (rows_left <= 0 || cols_left <= 0)
+    return VPXMIN(bsize, BLOCK_8X8);
+
+  while (bsize > 0) {
+    *bh = num_8x8_blocks_high_lookup[bsize];
+    *bw = num_8x8_blocks_wide_lookup[bsize];
+    if (*bh <= rows_left && *bw <= cols_left)
+      break;
+    bsize -= 3;  // next candidate size
+  }
+  return bsize;
+}
+
+// Sets block sizes in an SB64 for which fewer than 8x8 mode info units remain.
+static void set_partial_b64x64_partition(MODE_INFO *mi, int mis,
+    int bh_in, int bw_in, int row8x8_remaining, int col8x8_remaining,
+    BLOCK_SIZE bsize, MODE_INFO **mi_8x8) {
+  int row, col, vstep = bh_in;
+  for (row = 0; row < MI_BLOCK_SIZE; row += vstep) {
+    int hstep = bw_in;  // find_partition_size() may change both steps
+    for (col = 0; col < MI_BLOCK_SIZE; col += hstep) {
+      MODE_INFO *const entry = mi + row * mis + col;
+      mi_8x8[row * mis + col] = entry;
+      entry->mbmi.sb_type = find_partition_size(bsize,
+          row8x8_remaining - row, col8x8_remaining - col, &vstep, &hstep);
+    }
+  }
+}
+
+// Tries to give the mode info entries of the SB64 at (mi_row, mi_col) the
+// single block size bsize. When fewer than MI_BLOCK_SIZE rows or columns
+// remain before the end of the tile, the requested size may not fit, so
+// the work is handed to set_partial_b64x64_partition(), which chooses sizes
+// from what is left.
+static void set_fixed_partitioning(VP10_COMP *cpi, const TileInfo *const tile,
+                                   MODE_INFO **mi_8x8, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
+  VP10_COMMON *const cm = &cpi->common;
+  const int stride = cm->mi_stride;
+  const int row8x8_remaining = tile->mi_row_end - mi_row;
+  const int col8x8_remaining = tile->mi_col_end - mi_col;
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  MODE_INFO *const sb_origin = cm->mi + mi_row * stride + mi_col;
+  int r, c;
+
+  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+  if (row8x8_remaining < MI_BLOCK_SIZE || col8x8_remaining < MI_BLOCK_SIZE) {
+    // Partial SB64: let the helper choose sizes from what remains.
+    set_partial_b64x64_partition(sb_origin, stride, bh, bw, row8x8_remaining,
+                                 col8x8_remaining, bsize, mi_8x8);
+    return;
+  }
+
+  // Whole SB64 is available: stamp bsize at every bh x bw grid position.
+  for (r = 0; r < MI_BLOCK_SIZE; r += bh) {
+    for (c = 0; c < MI_BLOCK_SIZE; c += bw) {
+      MODE_INFO *const entry = sb_origin + r * stride + c;
+      mi_8x8[r * stride + c] = entry;
+      entry->mbmi.sb_type = bsize;
+    }
+  }
+}
+
+static void rd_use_partition(VP10_COMP *cpi,
+                             ThreadData *td,
+                             TileDataEnc *tile_data,
+                             MODE_INFO **mi_8x8, TOKENEXTRA **tp,
+                             int mi_row, int mi_col,
+                             BLOCK_SIZE bsize,
+                             int *rate, int64_t *dist,
+                             int do_recon, PC_TREE *pc_tree) {
+  VP10_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mis = cm->mi_stride;
+  const int bsl = b_width_log2_lookup[bsize];
+  const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2;
+  const int bss = (1 << bsl) / 4;
+  int i, pl;
+  PARTITION_TYPE partition = PARTITION_NONE;
+  BLOCK_SIZE subsize;
+  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  RD_COST last_part_rdc, none_rdc, chosen_rdc;
+  BLOCK_SIZE sub_subsize = BLOCK_4X4;
+  int splits_below = 0;
+  BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
+  int do_partition_search = 1;
+  PICK_MODE_CONTEXT *ctx = &pc_tree->none;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  assert(num_4x4_blocks_wide_lookup[bsize] ==
+         num_4x4_blocks_high_lookup[bsize]);
+
+  vp10_rd_cost_reset(&last_part_rdc);
+  vp10_rd_cost_reset(&none_rdc);
+  vp10_rd_cost_reset(&chosen_rdc);
+
+  partition = partition_lookup[bsl][bs_type];
+  subsize = get_subsize(bsize, partition);
+
+  pc_tree->partitioning = partition;
+  save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+  if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+    x->mb_energy = vp10_block_energy(cpi, x, bsize);
+  }
+
+  if (do_partition_search &&
+      cpi->sf.partition_search_type == SEARCH_PARTITION &&
+      cpi->sf.adjust_partitioning_from_last_frame) {
+    // Check if any of the sub blocks are further split.
+    if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
+      sub_subsize = get_subsize(subsize, PARTITION_SPLIT);
+      splits_below = 1;
+      for (i = 0; i < 4; i++) {
+        int jj = i >> 1, ii = i & 0x01;
+        MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss];
+        if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
+          splits_below = 0;
+        }
+      }
+    }
+
+    // If partition is not none try none unless each of the 4 splits are split
+    // even further..
+    if (partition != PARTITION_NONE && !splits_below &&
+        mi_row + (mi_step >> 1) < cm->mi_rows &&
+        mi_col + (mi_step >> 1) < cm->mi_cols) {
+      pc_tree->partitioning = PARTITION_NONE;
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+                       ctx, INT64_MAX);
+
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+
+      if (none_rdc.rate < INT_MAX) {
+        none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, none_rdc.rate,
+                                 none_rdc.dist);
+      }
+
+      restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+      mi_8x8[0]->mbmi.sb_type = bs_type;
+      pc_tree->partitioning = partition;
+    }
+  }
+
+  switch (partition) {
+    case PARTITION_NONE:
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                       bsize, ctx, INT64_MAX);
+      break;
+    case PARTITION_HORZ:
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                       subsize, &pc_tree->horizontal[0],
+                       INT64_MAX);
+      if (last_part_rdc.rate != INT_MAX &&
+          bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) {
+        RD_COST tmp_rdc;
+        PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
+        vp10_rd_cost_init(&tmp_rdc);
+        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+        rd_pick_sb_modes(cpi, tile_data, x,
+                         mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
+                         subsize, &pc_tree->horizontal[1], INT64_MAX);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          vp10_rd_cost_reset(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
+      }
+      break;
+    case PARTITION_VERT:
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                       subsize, &pc_tree->vertical[0], INT64_MAX);
+      if (last_part_rdc.rate != INT_MAX &&
+          bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
+        RD_COST tmp_rdc;
+        PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
+        vp10_rd_cost_init(&tmp_rdc);
+        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+        rd_pick_sb_modes(cpi, tile_data, x,
+                         mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
+                         subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
+                         INT64_MAX);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          vp10_rd_cost_reset(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {
+        rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                         subsize, pc_tree->leaf_split[0], INT64_MAX);
+        break;
+      }
+      last_part_rdc.rate = 0;
+      last_part_rdc.dist = 0;
+      last_part_rdc.rdcost = 0;
+      for (i = 0; i < 4; i++) {
+        int x_idx = (i & 1) * (mi_step >> 1);
+        int y_idx = (i >> 1) * (mi_step >> 1);
+        int jj = i >> 1, ii = i & 0x01;
+        RD_COST tmp_rdc;
+        if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+          continue;
+
+        vp10_rd_cost_init(&tmp_rdc);
+        rd_use_partition(cpi, td, tile_data,
+                         mi_8x8 + jj * bss * mis + ii * bss, tp,
+                         mi_row + y_idx, mi_col + x_idx, subsize,
+                         &tmp_rdc.rate, &tmp_rdc.dist,
+                         i != 3, pc_tree->split[i]);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          vp10_rd_cost_reset(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+      }
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+  if (last_part_rdc.rate < INT_MAX) {
+    last_part_rdc.rate += cpi->partition_cost[pl][partition];
+    last_part_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                  last_part_rdc.rate, last_part_rdc.dist);
+  }
+
+  if (do_partition_search
+      && cpi->sf.adjust_partitioning_from_last_frame
+      && cpi->sf.partition_search_type == SEARCH_PARTITION
+      && partition != PARTITION_SPLIT && bsize > BLOCK_8X8
+      && (mi_row + mi_step < cm->mi_rows ||
+          mi_row + (mi_step >> 1) == cm->mi_rows)
+      && (mi_col + mi_step < cm->mi_cols ||
+          mi_col + (mi_step >> 1) == cm->mi_cols)) {
+    BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
+    chosen_rdc.rate = 0;
+    chosen_rdc.dist = 0;
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+    pc_tree->partitioning = PARTITION_SPLIT;
+
+    // Split partition.
+    for (i = 0; i < 4; i++) {
+      int x_idx = (i & 1) * (mi_step >> 1);
+      int y_idx = (i >> 1) * (mi_step >> 1);
+      RD_COST tmp_rdc;
+      ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+      PARTITION_CONTEXT sl[8], sa[8];
+
+      if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+        continue;
+
+      save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+      pc_tree->split[i]->partitioning = PARTITION_NONE;
+      rd_pick_sb_modes(cpi, tile_data, x,
+                       mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+                       split_subsize, &pc_tree->split[i]->none, INT64_MAX);
+
+      restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+      if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+        vp10_rd_cost_reset(&chosen_rdc);
+        break;
+      }
+
+      chosen_rdc.rate += tmp_rdc.rate;
+      chosen_rdc.dist += tmp_rdc.dist;
+
+      if (i != 3)
+        encode_sb(cpi, td, tile_info, tp,  mi_row + y_idx, mi_col + x_idx, 0,
+                  split_subsize, pc_tree->split[i]);
+
+      pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
+                                   split_subsize);
+      chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+    }
+    pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+    if (chosen_rdc.rate < INT_MAX) {
+      chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+      chosen_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                 chosen_rdc.rate, chosen_rdc.dist);
+    }
+  }
+
+  // If last_part is better set the partitioning to that.
+  if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
+    mi_8x8[0]->mbmi.sb_type = bsize;
+    if (bsize >= BLOCK_8X8)
+      pc_tree->partitioning = partition;
+    chosen_rdc = last_part_rdc;
+  }
+  // If none was better set the partitioning to that.
+  if (none_rdc.rdcost < chosen_rdc.rdcost) {
+    if (bsize >= BLOCK_8X8)
+      pc_tree->partitioning = PARTITION_NONE;
+    chosen_rdc = none_rdc;
+  }
+
+  restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+  // We must have chosen a partitioning and encoding or we'll fail later on.
+  // No other opportunities for success.
+  if (bsize == BLOCK_64X64)
+    assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+  if (do_recon) {
+    int output_enabled = (bsize == BLOCK_64X64);
+    encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
+              pc_tree);
+  }
+
+  *rate = chosen_rdc.rate;
+  *dist = chosen_rdc.dist;
+}
+
+static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
+  BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,    // Relaxed lower bound, indexed by
+  BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,    // an observed minimum BLOCK_SIZE
+  BLOCK_8X8,   BLOCK_8X8,   BLOCK_8X8,    // (three entries per row, in enum
+  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,  // order): the looked-up value
+  BLOCK_16X16                             // replaces the observed minimum.
+};
+
+static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
+  BLOCK_8X8,   BLOCK_16X16, BLOCK_16X16,  // Relaxed upper bound, indexed by
+  BLOCK_16X16, BLOCK_32X32, BLOCK_32X32,  // an observed maximum BLOCK_SIZE
+  BLOCK_32X32, BLOCK_64X64, BLOCK_64X64,  // (three entries per row, in enum
+  BLOCK_64X64, BLOCK_64X64, BLOCK_64X64,  // order): the looked-up value
+  BLOCK_64X64                             // replaces the observed maximum.
+};
+
+
+// Scans the MI_BLOCK_SIZE x MI_BLOCK_SIZE grid of mode_info pointers that
+// starts at mi_8x8 (rows are xd->mi_stride entries apart) and folds every
+// sb_type found into *min_block_size / *max_block_size. A NULL entry is
+// counted as block size 0.
+//
+// Both bounds must be initialized by the caller: they are only ever widened,
+// so successive calls accumulate a range over several regions. bs_hist[]
+// receives one increment per visited entry, indexed by its sb_type.
+static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8,
+                                        BLOCK_SIZE *min_block_size,
+                                        BLOCK_SIZE *max_block_size,
+                                        int bs_hist[BLOCK_SIZES]) {
+  const int stride = xd->mi_stride;
+  int row, col;
+
+  for (row = 0; row < MI_BLOCK_SIZE; ++row) {
+    MODE_INFO **const row_mi = mi_8x8 + row * stride;
+
+    for (col = 0; col < MI_BLOCK_SIZE; ++col) {
+      BLOCK_SIZE sb_type = 0;
+
+      if (row_mi[col]) sb_type = row_mi[col]->mbmi.sb_type;
+      bs_hist[sb_type]++;
+      *min_block_size = VPXMIN(*min_block_size, sb_type);
+      *max_block_size = VPXMAX(*max_block_size, sb_type);
+    }
+  }
+}
+
+// For each block size, the next square block size less than or equal to it.
+static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
+  BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,        // Indexed by BLOCK_SIZE, three
+  BLOCK_8X8, BLOCK_8X8, BLOCK_8X8,        // entries per row in enum order.
+  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
+  BLOCK_32X32, BLOCK_32X32, BLOCK_32X32,
+  BLOCK_64X64
+};
+
+// Chooses the partition size range [*min_block_size, *max_block_size] for the
+// SB64 at (mi_row, mi_col) from the block sizes used by the left and above
+// SB64s and, on non-key frames, by the co-located SB64 of the previous frame
+// (see the get_sb_partition_size_range() calls below).
+static void rd_auto_partition_range(VP10_COMP *cpi, const TileInfo *const tile,
+                                    MACROBLOCKD *const xd,
+                                    int mi_row, int mi_col,
+                                    BLOCK_SIZE *min_block_size,
+                                    BLOCK_SIZE *max_block_size) {
+  VP10_COMMON *const cm = &cpi->common;
+  MODE_INFO **const cur_mi = xd->mi;
+  const int stride = xd->mi_stride;
+  const int has_left = xd->left_available && cur_mi[-1];
+  const int has_above = xd->up_available && cur_mi[-stride];
+  const int not_key_frame = cm->frame_type != KEY_FRAME;
+  // With nothing to predict from, every partition size stays allowed.
+  BLOCK_SIZE lo = BLOCK_4X4;
+  BLOCK_SIZE hi = BLOCK_64X64;
+  int size_hist[BLOCK_SIZES] = {0};
+  int bh, bw;
+
+  if (has_left || has_above || not_key_frame) {
+    // Begin with an inverted (empty) range; every call to
+    // get_sb_partition_size_range() below widens it with what it observes.
+    lo = BLOCK_64X64;
+    hi = BLOCK_4X4;
+
+    // Co-located SB64 of the previous frame.
+    if (not_key_frame) {
+      get_sb_partition_size_range(
+          xd, &cm->prev_mi_grid_visible[mi_row * stride + mi_col], &lo, &hi,
+          size_hist);
+    }
+
+    // SB64 to the left.
+    if (has_left) {
+      get_sb_partition_size_range(xd, &cur_mi[-MI_BLOCK_SIZE], &lo, &hi,
+                                  size_hist);
+    }
+
+    // SB64 above.
+    if (has_above) {
+      get_sb_partition_size_range(xd, &cur_mi[-stride * MI_BLOCK_SIZE], &lo,
+                                  &hi, size_hist);
+    }
+
+    // The "relaxed" mode replaces both observed bounds with their entries in
+    // the min/max lookup tables.
+    if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
+      lo = min_partition_size[lo];
+      hi = max_partition_size[hi];
+    }
+  }
+
+  // Check border cases: the maximum taken from the neighbors may not be legal
+  // given the rows / columns that are left in the tile.
+  hi = find_partition_size(hi, tile->mi_row_end - mi_row,
+                           tile->mi_col_end - mi_col, &bh, &bw);
+
+  // Blocks on the edge of the active image (the real picture edge or
+  // formatting bars) may always go down to the smallest size; elsewhere the
+  // minimum is capped by both the maximum and the speed-feature limit.
+  if (vp10_active_edge_sb(cpi, mi_row, mi_col)) {
+    lo = BLOCK_4X4;
+  } else {
+    const BLOCK_SIZE capped = VPXMIN(lo, hi);
+    lo = VPXMIN(cpi->sf.rd_auto_partition_min_limit, capped);
+  }
+
+  // With square-only partitioning, lower the minimum if needed so that at
+  // least one square size no larger than the maximum remains searchable.
+  if (cpi->sf.use_square_partition_only && next_square_size[hi] < lo)
+    lo = next_square_size[hi];
+
+  *min_block_size = lo;
+  *max_block_size = hi;
+}
+
+// TODO(jingning) refactor functions setting partition search range
+// Sets [*min_bs, *max_bs] from the block sizes found in the co-located area of
+// the previous frame, the column of blocks to the left and the row above. A
+// missing (NULL) mode_info counts as bsize itself.
+static void set_partition_range(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                BLOCK_SIZE *min_bs, BLOCK_SIZE *max_bs) {
+  const int stride = cm->mi_stride;
+  const int cols = num_8x8_blocks_wide_lookup[bsize];
+  const int rows = num_8x8_blocks_high_lookup[bsize];
+  MODE_INFO **prev_mi = &cm->prev_mi_grid_visible[stride * mi_row + mi_col];
+  MODE_INFO *cand;
+  BLOCK_SIZE cand_bs;
+  BLOCK_SIZE lo = BLOCK_64X64, hi = BLOCK_4X4;  // inverted (empty) range
+  int r, c;
+
+  // Previous frame, co-located area (NOTE(review): test looks always true).
+  if (prev_mi) {
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; ++c) {
+        cand = prev_mi[r * stride + c];
+        cand_bs = cand ? cand->mbmi.sb_type : bsize;
+        lo = VPXMIN(lo, cand_bs);
+        hi = VPXMAX(hi, cand_bs);
+      }
+    }
+  }
+
+  // Column immediately to the left of the block, when available.
+  for (r = 0; xd->left_available && r < rows; ++r) {
+    cand = xd->mi[r * stride - 1];
+    cand_bs = cand ? cand->mbmi.sb_type : bsize;
+    lo = VPXMIN(lo, cand_bs);
+    hi = VPXMAX(hi, cand_bs);
+  }
+
+  // Row immediately above the block, when available.
+  for (c = 0; xd->up_available && c < cols; ++c) {
+    cand = xd->mi[c - stride];
+    cand_bs = cand ? cand->mbmi.sb_type : bsize;
+    lo = VPXMIN(lo, cand_bs);
+    hi = VPXMAX(hi, cand_bs);
+  }
+
+  // When only one size was observed, relax both bounds via the lookup tables.
+  if (lo == hi) {
+    lo = min_partition_size[lo];
+    hi = max_partition_size[hi];
+  }
+
+  *min_bs = lo;
+  *max_bs = hi;
+}
+
+static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+  memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));  // save x's into ctx
+}
+
+static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+  memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));  // restore from ctx
+}
+
+#if CONFIG_FP_MB_STATS
+const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] =  // width in 16x16 units
+  {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4};
+const int num_16x16_blocks_high_lookup[BLOCK_SIZES] =  // height in 16x16 units
+  {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4};
+const int qindex_skip_threshold_lookup[BLOCK_SIZES] =  // base_qindex must exceed
+  {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120};
+const int qindex_split_threshold_lookup[BLOCK_SIZES] =  // base_qindex must be below
+  {0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120};
+const int complexity_16x16_blocks_threshold[BLOCK_SIZES] =  // none_complexity cap
+  {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6};
+
+typedef enum {  // coarse motion direction decoded from a first-pass stats byte
+  MV_ZERO = 0,
+  MV_LEFT = 1,   // MV_LEFT/MV_RIGHT and MV_UP/MV_DOWN differ by exactly 2,
+  MV_UP = 2,     // which get_motion_inconsistency() relies on.
+  MV_RIGHT = 3,
+  MV_DOWN = 4,
+  MV_INVALID     // never returned by get_motion_direction_fp()
+} MOTION_DIRECTION;
+
+static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) {
+  // Maps a first-pass stats byte to a coarse motion direction. The masks
+  // are tested in a fixed priority order (zero, left, right, up), so the
+  // first matching bit wins.
+  if (fp_byte & FPMB_MOTION_ZERO_MASK) return MV_ZERO;
+  if (fp_byte & FPMB_MOTION_LEFT_MASK) return MV_LEFT;
+  if (fp_byte & FPMB_MOTION_RIGHT_MASK) return MV_RIGHT;
+  if (fp_byte & FPMB_MOTION_UP_MASK) return MV_UP;
+
+  // No tested bit set: reported as downward motion (there is no explicit
+  // check of a "down" mask here).
+  return MV_DOWN;
+}
+
+static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
+                                           MOTION_DIRECTION that_mv) {
+  // 0: same, 2: opposite (left/right or up/down), 1: otherwise. MV_ZERO has
+  // no opposite; without the guard MV_ZERO vs MV_UP would differ by 2 too.
+  if (this_mv == that_mv) return 0;
+  if (this_mv == MV_ZERO || that_mv == MV_ZERO) return 1;
+  return abs(this_mv - that_mv) == 2 ? 2 : 1;
+}
+#endif
+
+// Recursive RD search over NONE/SPLIT/HORZ/VERT for one square block; the best
+// cost is written to *rd_cost. TODO(jingning,jimbankoski,rbultje): properly
+// skip unlikely partition types based on previous RD results, for speed-up.
+static void rd_pick_partition(VP10_COMP *cpi, ThreadData *td,
+                              TileDataEnc *tile_data,
+                              TOKENEXTRA **tp, int mi_row, int mi_col,
+                              BLOCK_SIZE bsize, RD_COST *rd_cost,
+                              int64_t best_rd, PC_TREE *pc_tree) {
+  VP10_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
+  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  TOKENEXTRA *tp_orig = *tp;
+  PICK_MODE_CONTEXT *ctx = &pc_tree->none;
+  int i, pl;
+  BLOCK_SIZE subsize;
+  RD_COST this_rdc, sum_rdc, best_rdc;
+  int do_split = bsize >= BLOCK_8X8;  // splitting is only tried from 8x8 up
+  int do_rect = 1;
+
+  // Override skipping rectangular partition operations for edge blocks
+  const int force_horz_split = (mi_row + mi_step >= cm->mi_rows);
+  const int force_vert_split = (mi_col + mi_step >= cm->mi_cols);
+  const int xss = x->e_mbd.plane[1].subsampling_x;
+  const int yss = x->e_mbd.plane[1].subsampling_y;
+
+  BLOCK_SIZE min_size = x->min_partition_size;
+  BLOCK_SIZE max_size = x->max_partition_size;
+
+#if CONFIG_FP_MB_STATS
+  unsigned int src_diff_var = UINT_MAX;
+  int none_complexity = 0;
+#endif
+
+  int partition_none_allowed = !force_horz_split && !force_vert_split;
+  int partition_horz_allowed = !force_vert_split && yss <= xss &&
+                               bsize >= BLOCK_8X8;
+  int partition_vert_allowed = !force_horz_split && xss <= yss &&
+                               bsize >= BLOCK_8X8;
+  (void) *tp_orig;  // NOTE(review): tp_orig is otherwise only read in asserts
+
+  assert(num_8x8_blocks_wide_lookup[bsize] ==
+             num_8x8_blocks_high_lookup[bsize]);
+
+  vp10_rd_cost_init(&this_rdc);
+  vp10_rd_cost_init(&sum_rdc);
+  vp10_rd_cost_reset(&best_rdc);
+  best_rdc.rdcost = best_rd;  // caller's budget: only cheaper results count
+
+  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+  if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode)
+    x->mb_energy = vp10_block_energy(cpi, x, bsize);
+
+  if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
+    int cb_partition_search_ctrl = ((pc_tree->index == 0 || pc_tree->index == 3)
+        + get_chessboard_index(cm->current_video_frame)) & 0x1;
+
+    if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size)
+      set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
+  }
+
+  // Determine partition types in search according to the speed features.
+  // The threshold set here has to be of square block size.
+  if (cpi->sf.auto_min_max_partition_size) {
+    partition_none_allowed &= (bsize <= max_size && bsize >= min_size);
+    partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) ||
+                                force_horz_split);
+    partition_vert_allowed &= ((bsize <= max_size && bsize > min_size) ||
+                                force_vert_split);
+    do_split &= bsize > min_size;
+  }
+  if (cpi->sf.use_square_partition_only) {
+    partition_horz_allowed &= force_horz_split;
+    partition_vert_allowed &= force_vert_split;
+  }
+
+  save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+#if CONFIG_FP_MB_STATS
+  if (cpi->use_fp_mb_stats) {
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+    src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src,
+                                                  mi_row, mi_col, bsize);
+  }
+#endif
+
+#if CONFIG_FP_MB_STATS
+  // Decide whether we shall split directly and skip searching NONE by using
+  // the first pass block statistics
+  if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_split &&
+      partition_none_allowed && src_diff_var > 4 &&
+      cm->base_qindex < qindex_split_threshold_lookup[bsize]) {
+    int mb_row = mi_row >> 1;
+    int mb_col = mi_col >> 1;
+    int mb_row_end =
+        VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+    int mb_col_end =
+        VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+    int r, c;
+
+    // compute a complexity measure, basically measure inconsistency of motion
+    // vectors obtained from the first pass in the current block
+    for (r = mb_row; r < mb_row_end ; r++) {
+      for (c = mb_col; c < mb_col_end; c++) {
+        const int mb_index = r * cm->mb_cols + c;
+
+        MOTION_DIRECTION this_mv;
+        MOTION_DIRECTION right_mv;
+        MOTION_DIRECTION bottom_mv;
+
+        this_mv =
+            get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]);
+
+        // to its right
+        if (c != mb_col_end - 1) {
+          right_mv = get_motion_direction_fp(
+              cpi->twopass.this_frame_mb_stats[mb_index + 1]);
+          none_complexity += get_motion_inconsistency(this_mv, right_mv);
+        }
+
+        // to its bottom
+        if (r != mb_row_end - 1) {
+          bottom_mv = get_motion_direction_fp(
+              cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]);
+          none_complexity += get_motion_inconsistency(this_mv, bottom_mv);
+        }
+
+        // do not count its left and top neighbors to avoid double counting
+      }
+    }
+
+    if (none_complexity > complexity_16x16_blocks_threshold[bsize]) {
+      partition_none_allowed = 0;
+    }
+  }
+#endif
+
+  // PARTITION_NONE
+  if (partition_none_allowed) {
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col,
+                     &this_rdc, bsize, ctx, best_rdc.rdcost);
+    if (this_rdc.rate != INT_MAX) {
+      if (bsize >= BLOCK_8X8) {
+        pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+        this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+        this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                 this_rdc.rate, this_rdc.dist);
+      }
+
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr;
+        int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr;
+
+        best_rdc = this_rdc;
+        if (bsize >= BLOCK_8X8)
+          pc_tree->partitioning = PARTITION_NONE;
+
+        // Adjust dist breakout threshold according to the partition size.
+        dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] +
+            b_height_log2_lookup[bsize]);
+
+        rate_breakout_thr *= num_pels_log2_lookup[bsize];
+
+        // If all y, u, v transform blocks in this partition are skippable, and
+        // the dist & rate are within the thresholds, the partition search is
+        // terminated for current branch of the partition search tree.
+        // The dist & rate thresholds are set to 0 at speed 0 to disable the
+        // early termination at that speed.
+        if (!x->e_mbd.lossless[xd->mi[0]->mbmi.segment_id] &&
+            (ctx->skippable && best_rdc.dist < dist_breakout_thr &&
+            best_rdc.rate < rate_breakout_thr)) {
+          do_split = 0;
+          do_rect = 0;
+        }
+
+#if CONFIG_FP_MB_STATS
+        // Check if every 16x16 first pass block statistics has zero
+        // motion and the corresponding first pass residue is small enough.
+        // If that is the case, check the difference variance between the
+        // current frame and the last frame. If the variance is small enough,
+        // stop further splitting in RD optimization
+        if (cpi->use_fp_mb_stats && do_split != 0 &&
+            cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
+          int mb_row = mi_row >> 1;
+          int mb_col = mi_col >> 1;
+          int mb_row_end =
+              VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+          int mb_col_end =
+              VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+          int r, c;
+
+          int skip = 1;
+          for (r = mb_row; r < mb_row_end; r++) {
+            for (c = mb_col; c < mb_col_end; c++) {
+              const int mb_index = r * cm->mb_cols + c;
+              if (!(cpi->twopass.this_frame_mb_stats[mb_index] &
+                    FPMB_MOTION_ZERO_MASK) ||
+                  !(cpi->twopass.this_frame_mb_stats[mb_index] &
+                    FPMB_ERROR_SMALL_MASK)) {
+                skip = 0;
+                break;
+              }
+            }
+            if (skip == 0) {
+              break;
+            }
+          }
+          if (skip) {
+            if (src_diff_var == UINT_MAX) {
+              set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+              src_diff_var = get_sby_perpixel_diff_variance(
+                  cpi, &x->plane[0].src, mi_row, mi_col, bsize);
+            }
+            if (src_diff_var < 8) {
+              do_split = 0;
+              do_rect = 0;
+            }
+          }
+        }
+#endif
+      }
+    }
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+
+  // store estimated motion vector
+  if (cpi->sf.adaptive_motion_search)
+    store_pred_mv(x, ctx);
+
+  // PARTITION_SPLIT
+  // TODO(jingning): use the motion vectors given by the above search as
+  // the starting point of motion search in the following partition type check.
+  if (do_split) {
+    subsize = get_subsize(bsize, PARTITION_SPLIT);
+    if (bsize == BLOCK_8X8) {
+      i = 4;  // lets the "i == 4" completeness check below pass
+      if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
+        pc_tree->leaf_split[0]->pred_interp_filter =
+            ctx->mic.mbmi.interp_filter;
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+                       pc_tree->leaf_split[0], best_rdc.rdcost);
+      if (sum_rdc.rate == INT_MAX)
+        sum_rdc.rdcost = INT64_MAX;
+    } else {
+      for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
+      const int x_idx = (i & 1) * mi_step;
+      const int y_idx = (i >> 1) * mi_step;
+
+        if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+          continue;
+
+        if (cpi->sf.adaptive_motion_search)
+          load_pred_mv(x, ctx);
+
+        pc_tree->split[i]->index = i;
+        rd_pick_partition(cpi, td, tile_data, tp,
+                          mi_row + y_idx, mi_col + x_idx,
+                          subsize, &this_rdc,
+                          best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
+
+        if (this_rdc.rate == INT_MAX) {
+          sum_rdc.rdcost = INT64_MAX;
+          break;
+        } else {
+          sum_rdc.rate += this_rdc.rate;
+          sum_rdc.dist += this_rdc.dist;
+          sum_rdc.rdcost += this_rdc.rdcost;
+        }
+      }
+    }
+
+    if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) {
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                              sum_rdc.rate, sum_rdc.dist);
+
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
+        pc_tree->partitioning = PARTITION_SPLIT;
+      }
+    } else {
+      // skip rectangular partition test when larger block size
+      // gives better rd cost
+      if (cpi->sf.less_rectangular_check)
+        do_rect &= !partition_none_allowed;
+    }
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+
+  // PARTITION_HORZ
+  if (partition_horz_allowed &&
+      (do_rect || vp10_active_h_edge(cpi, mi_row, mi_step))) {
+      subsize = get_subsize(bsize, PARTITION_HORZ);
+    if (cpi->sf.adaptive_motion_search)
+      load_pred_mv(x, ctx);
+    if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+        partition_none_allowed)
+      pc_tree->horizontal[0].pred_interp_filter =
+          ctx->mic.mbmi.interp_filter;
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+                     &pc_tree->horizontal[0], best_rdc.rdcost);
+
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
+        bsize > BLOCK_8X8) {
+      PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
+      update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+      // NOTE(review): ctx now shadows the outer ctx (&pc_tree->none) below.
+      if (cpi->sf.adaptive_motion_search)
+        load_pred_mv(x, ctx);
+      if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+          partition_none_allowed)  // NOTE(review): dead; bsize > BLOCK_8X8
+        pc_tree->horizontal[1].pred_interp_filter =
+            ctx->mic.mbmi.interp_filter;
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col,
+                       &this_rdc, subsize, &pc_tree->horizontal[1],
+                       best_rdc.rdcost - sum_rdc.rdcost);
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
+      }
+    }
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
+        pc_tree->partitioning = PARTITION_HORZ;
+      }
+    }
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+  // PARTITION_VERT
+  if (partition_vert_allowed &&
+      (do_rect || vp10_active_v_edge(cpi, mi_col, mi_step))) {
+      subsize = get_subsize(bsize, PARTITION_VERT);
+
+    if (cpi->sf.adaptive_motion_search)
+      load_pred_mv(x, ctx);
+    if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+        partition_none_allowed)
+      pc_tree->vertical[0].pred_interp_filter =
+          ctx->mic.mbmi.interp_filter;
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+                     &pc_tree->vertical[0], best_rdc.rdcost);
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols &&
+        bsize > BLOCK_8X8) {
+      update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
+      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
+                        &pc_tree->vertical[0]);
+
+      if (cpi->sf.adaptive_motion_search)
+        load_pred_mv(x, ctx);
+      if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+          partition_none_allowed)  // NOTE(review): dead; bsize > BLOCK_8X8
+        pc_tree->vertical[1].pred_interp_filter =
+            ctx->mic.mbmi.interp_filter;
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step,
+                       &this_rdc, subsize,
+                       &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost);
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
+      }
+    }
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                              sum_rdc.rate, sum_rdc.dist);
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
+        pc_tree->partitioning = PARTITION_VERT;
+      }
+    }
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+
+  // TODO(jbb): This code added so that we avoid static analysis
+  // warning related to the fact that best_rd isn't used after this
+  // point.  This code should be refactored so that the duplicate
+  // checks occur in some sub function and thus are used...
+  (void) best_rd;
+  *rd_cost = best_rdc;
+
+  // Encode the chosen partitioning (not for quadrant 3; output only at 64x64).
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
+      pc_tree->index != 3) {
+    int output_enabled = (bsize == BLOCK_64X64);
+    encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+              bsize, pc_tree);
+  }
+
+  if (bsize == BLOCK_64X64) {
+    assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip));
+    assert(best_rdc.rate < INT_MAX);
+    assert(best_rdc.dist < INT64_MAX);
+  } else {
+    assert(tp_orig == *tp);
+  }
+}
+
+// Encodes every superblock of one superblock row of a tile: clears the left
+// contexts, then codes each superblock with the applicable partitioning.
+static void encode_rd_sb_row(VP10_COMP *cpi, ThreadData *td,
+                             TileDataEnc *tile_data, int mi_row,
+                             TOKENEXTRA **tp) {
+  VP10_COMMON *const cm = &cpi->common;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int mi_col;
+
+  // A new SB row starts with zeroed left-neighbour contexts.
+  memset(&xd->left_context, 0, sizeof(xd->left_context));
+  memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+
+  // Step through the row one superblock (MI_BLOCK_SIZE units) at a time.
+  mi_col = tile_info->mi_col_start;
+  for (; mi_col < tile_info->mi_col_end; mi_col += MI_BLOCK_SIZE) {
+    const struct segmentation *const seg = &cm->seg;
+    MODE_INFO **mi = cm->mi_grid_visible + (cm->mi_stride * mi_row + mi_col);
+    RD_COST dummy_rdc;
+    int64_t dummy_dist;
+    int dummy_rate;
+    int seg_skip = 0;
+    int i;
+
+    if (sf->adaptive_pred_interp_filter) {
+      // Reset every stored interpolation-filter prediction to SWITCHABLE.
+      for (i = 0; i < 64; ++i) {
+        td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
+        td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+        td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+        td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+        td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+      }
+    }
+
+    vp10_zero(x->pred_mv);
+    td->pc_root->index = 0;
+
+    // An active SEG_LVL_SKIP on this superblock's segment forces 64x64 below.
+    if (seg->enabled) {
+      const uint8_t *seg_map = cm->last_frame_seg_map;
+      if (seg->update_map) seg_map = cpi->segmentation_map;
+      seg_skip = segfeature_active(
+          seg, get_segment_id(cm, seg_map, BLOCK_64X64, mi_row, mi_col),
+          SEG_LVL_SKIP);
+    }
+
+    x->source_variance = UINT_MAX;
+    if (seg_skip || sf->partition_search_type == FIXED_PARTITION) {
+      // Fixed partitioning; skipped segments always use one 64x64 block.
+      BLOCK_SIZE bsize = sf->always_this_block_size;
+      if (seg_skip) bsize = BLOCK_64X64;
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+      set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
+    } else if (cpi->partition_search_skippable_frame) {
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+      set_fixed_partitioning(
+          cpi, tile_info, mi, mi_row, mi_col,
+          get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col));
+      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
+    } else if (cm->frame_type != KEY_FRAME &&
+               sf->partition_search_type == VAR_BASED_PARTITION) {
+      choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
+    } else {
+      // Full RD partition search, optionally bounded by automatic limits.
+      if (sf->auto_min_max_partition_size) {
+        set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+        rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
+                                &x->min_partition_size,
+                                &x->max_partition_size);
+      }
+      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
+                        &dummy_rdc, INT64_MAX, td->pc_root);
+    }
+  }
+}
+
+// Points the macroblock at the frame's source planes and zeroes the above
+// entropy/segmentation contexts before a frame is encoded.
+static void init_encode_frame_mb_context(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int sb_aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+
+  // Copy data over into macro block data structures.
+  vp10_setup_src_planes(x, cpi->Source, 0, 0);
+  vp10_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
+
+  // Note: this memset assumes above_context[0], [1] and [2]
+  // are allocated as part of the same buffer.
+  memset(xd->above_context[0], 0,
+         sizeof(*xd->above_context[0]) * 2 * sb_aligned_cols * MAX_MB_PLANE);
+  memset(xd->above_seg_context, 0,
+         sizeof(*xd->above_seg_context) * sb_aligned_cols);
+}
+
+// True iff >= 2 ref flags are set and segment 1 has no SEG_LVL_REF_FRAME.
+static int check_dual_ref_flags(VP10_COMP *cpi) {
+  const int flags = cpi->ref_frame_flags;
+  int enabled_refs = 0;
+  if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) return 0;
+  if (flags & VP9_GOLD_FLAG) ++enabled_refs;
+  if (flags & VP9_LAST_FLAG) ++enabled_refs;
+  if (flags & VP9_ALT_FLAG) ++enabled_refs;
+  return enabled_refs >= 2;
+}
+
+// Clamps tx_size of each entry in the visible mode-info grid to max_tx_size.
+static void reset_skip_tx_size(VP10_COMMON *cm, TX_SIZE max_tx_size) {
+  int row, col;
+
+  for (row = 0; row < cm->mi_rows; ++row) {
+    MODE_INFO **const row_mi = cm->mi_grid_visible + row * cm->mi_stride;
+    for (col = 0; col < cm->mi_cols; ++col) {
+      MB_MODE_INFO *const mbmi = &row_mi[col]->mbmi;
+      if (mbmi->tx_size > max_tx_size) mbmi->tx_size = max_tx_size;
+    }
+  }
+}
+
+// Picks the MV_REFERENCE_FRAME category used to index per-frame-type RD
+// thresholds: INTRA for intra-only frames, ALTREF when a golden refresh
+// comes from an alt-ref source frame, GOLDEN for any other golden/alt-ref
+// refresh, and LAST otherwise.
+static MV_REFERENCE_FRAME get_frame_type(const VP10_COMP *cpi) {
+  if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME;
+  if (cpi->refresh_golden_frame)
+    return cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : GOLDEN_FRAME;
+  return cpi->refresh_alt_ref_frame ? GOLDEN_FRAME : LAST_FRAME;
+}
+
+// Chooses the frame's transform mode from the lossless flag of segment 0
+// and the configured transform-size search method.
+static TX_MODE select_tx_mode(const VP10_COMP *cpi, MACROBLOCKD *const xd) {
+  const int method = cpi->sf.tx_size_search_method;
+
+  if (xd->lossless[0]) return ONLY_4X4;
+  if (method == USE_LARGESTALL) return ALLOW_32X32;
+  if (method == USE_FULL_RD || method == USE_TX_8X8) return TX_MODE_SELECT;
+  // Any other search method keeps the mode already stored on the frame.
+  return cpi->common.tx_mode;
+}
+
+// (Re)allocates the per-tile encoder state when more tiles are needed than
+// are allocated, then sets up every tile's TileInfo and token start pointer.
+void vp10_init_tile_data(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int num_tiles = tile_cols * tile_rows;
+  TOKENEXTRA *tok_base = cpi->tile_tok[0][0];
+  int prev_tile_tokens = 0;
+  int tile_row, tile_col, t;
+
+  if (cpi->tile_data == NULL || cpi->allocated_tiles < num_tiles) {
+    if (cpi->tile_data != NULL) vpx_free(cpi->tile_data);
+    CHECK_MEM_ERROR(cm, cpi->tile_data,
+                    vpx_malloc(num_tiles * sizeof(*cpi->tile_data)));
+    cpi->allocated_tiles = num_tiles;
+    // New tiles get thresh_freq_fact of 32 and an identity mode map.
+    for (t = 0; t < num_tiles; ++t) {
+      TileDataEnc *const tile_data = &cpi->tile_data[t];
+      int bs, mode;
+      for (bs = 0; bs < BLOCK_SIZES; ++bs) {
+        for (mode = 0; mode < MAX_MODES; ++mode) {
+          tile_data->thresh_freq_fact[bs][mode] = 32;
+          tile_data->mode_map[bs][mode] = mode;
+        }
+      }
+    }
+  }
+
+  // Each tile's tokens start where the previous tile's allocation ends.
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      TileInfo *const tile_info =
+          &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
+      vp10_tile_init(tile_info, cm, tile_row, tile_col);
+      tok_base += prev_tile_tokens;
+      cpi->tile_tok[tile_row][tile_col] = tok_base;
+      prev_tile_tokens = allocated_tokens(*tile_info);
+    }
+  }
+}
+
+// Encodes one tile superblock row by superblock row and records how many
+// tokens the tile produced.
+void vp10_encode_tile(VP10_COMP *cpi, ThreadData *td,
+                     int tile_row, int tile_col) {
+  const int tile_cols = 1 << cpi->common.log2_tile_cols;
+  TileDataEnc *const this_tile =
+      &cpi->tile_data[tile_row * tile_cols + tile_col];
+  const TileInfo *const tile_info = &this_tile->tile_info;
+  TOKENEXTRA *const tok_start = cpi->tile_tok[tile_row][tile_col];
+  TOKENEXTRA *tok = tok_start;
+  int mi_row = tile_info->mi_row_start;
+
+  // Set up pointers to per thread motion search counters.
+  td->mb.m_search_count_ptr = &td->rd_counts.m_search_count;
+  td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
+
+  for (; mi_row < tile_info->mi_row_end; mi_row += MI_BLOCK_SIZE)
+    encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+
+  // The tile must not have written more tokens than were reserved for it.
+  cpi->tok_count[tile_row][tile_col] = (unsigned int)(tok - tok_start);
+  assert(tok - tok_start <= allocated_tokens(*tile_info));
+}
+
+// Single-threaded path: initialises tile data, then encodes all tiles in
+// row-major order using the encoder's own thread data.
+static void encode_tiles(VP10_COMP *cpi) {
+  const int tile_cols = 1 << cpi->common.log2_tile_cols;
+  const int tile_rows = 1 << cpi->common.log2_tile_rows;
+  int row, col;
+
+  vp10_init_tile_data(cpi);
+  for (row = 0; row < tile_rows; ++row)
+    for (col = 0; col < tile_cols; ++col)
+      vp10_encode_tile(cpi, &cpi->td, row, col);
+}
+
+#if CONFIG_FP_MB_STATS
+// Points *this_frame_mb_stats at the current frame's first-pass MB stats.
+// Returns 1 on success, or EOF when that position lies past mb_stats_end.
+static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
+                            VP10_COMMON *cm, uint8_t **this_frame_mb_stats) {
+  uint8_t *const frame_stats =
+      firstpass_mb_stats->mb_stats_start +
+      cm->current_video_frame * cm->MBs * sizeof(uint8_t);
+
+  if (frame_stats > firstpass_mb_stats->mb_stats_end) return EOF;
+  *this_frame_mb_stats = frame_stats;
+  return 1;
+}
+#endif
+
+// Encodes one frame with the current mode settings: resets the per-frame
+// counters, derives lossless flags and the transform mode, sets up
+// quantizer/RD state, then encodes all tiles while timing the work.
+static void encode_frame_internal(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  ThreadData *const td = &cpi->td;
+  RD_COUNTS *const rdc = &td->rd_counts;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int seg;
+
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
+
+  vp10_zero(*td->counts);
+  vp10_zero(rdc->coef_counts);
+  vp10_zero(rdc->comp_pred_diff);
+  vp10_zero(rdc->filter_diff);
+  rdc->m_search_count = 0;   // Count of motion search hits.
+  rdc->ex_search_count = 0;  // Exhaustive mesh search hits.
+
+  // A segment is lossless when its qindex and all delta-q values are zero.
+  for (seg = 0; seg < MAX_SEGMENTS; ++seg) {
+    int qindex = cm->base_qindex;
+    if (CONFIG_MISC_FIXES && cm->seg.enabled)
+      qindex = vp10_get_qindex(&cm->seg, seg, cm->base_qindex);
+    xd->lossless[seg] = qindex == 0 && cm->y_dc_delta_q == 0 &&
+                        cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+  }
+
+  if (xd->lossless[0] && !cm->seg.enabled) x->optimize = 0;
+
+  cm->tx_mode = select_tx_mode(cpi, xd);
+
+  vp10_frame_init_quantizer(cpi);
+  vp10_initialize_rd_consts(cpi);
+  vp10_initialize_me_consts(cpi, x, cm->base_qindex);
+  init_encode_frame_mb_context(cpi);
+  // Previous-frame MVs need an unchanged frame size, no error resilience,
+  // a non-intra-only frame and a shown last frame; else prev_mi is NULL.
+  cm->use_prev_frame_mvs =
+      !cm->error_resilient_mode && cm->width == cm->last_width &&
+      cm->height == cm->last_height && !cm->intra_only && cm->last_show_frame;
+  if (cm->use_prev_frame_mvs)
+    cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+  else
+    cm->prev_mi = NULL;
+
+  x->quant_fp = cpi->sf.use_quant_fp;
+  vp10_zero(x->skip_txfm);
+
+  {
+    struct vpx_usec_timer emr_timer;
+    vpx_usec_timer_start(&emr_timer);
+
+#if CONFIG_FP_MB_STATS
+    if (cpi->use_fp_mb_stats)
+      input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm,
+                       &cpi->twopass.this_frame_mb_stats);
+#endif
+
+    // Use the multi-threaded tile encoder only when both the thread budget
+    // and the number of tile columns exceed one.
+    if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
+      vp10_encode_tiles_mt(cpi);
+    else
+      encode_tiles(cpi);
+
+    vpx_usec_timer_mark(&emr_timer);
+    cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
+  }
+
+#if 0
+  // Keep record of the total distortion this time around for future use
+  cpi->last_frame_distortion = cpi->frame_distortion;
+#endif
+}
+
+// Chooses the frame-level interpolation filter from per-filter thresholds.
+// A fixed filter is returned only when its threshold beats the others it is
+// compared with (smooth is never chosen for alt-ref frames); else SWITCHABLE.
+static INTERP_FILTER get_interp_filter(
+    const int64_t threshes[SWITCHABLE_FILTER_CONTEXTS], int is_alt_ref) {
+  const int64_t smooth = threshes[EIGHTTAP_SMOOTH];
+  const int64_t regular = threshes[EIGHTTAP];
+  const int64_t sharp = threshes[EIGHTTAP_SHARP];
+  const int64_t last_thr = threshes[SWITCHABLE - 1];
+
+  if (!is_alt_ref && smooth > regular && smooth > sharp && smooth > last_thr)
+    return EIGHTTAP_SMOOTH;
+  if (sharp > regular && sharp > last_thr) return EIGHTTAP_SHARP;
+  if (regular > last_thr) return EIGHTTAP;
+  return SWITCHABLE;
+}
+
+// Encodes one frame.  Decides whether compound prediction is allowed and,
+// when frame_parameter_update is set, picks the reference mode and
+// interpolation filter from running per-frame-type thresholds, encodes,
+// then updates the thresholds and narrows reference/tx modes from counts.
+void vp10_encode_frame(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+
+  // In the longer term the encoder should be generalized to match the
+  // decoder such that we allow compound where one of the 3 buffers has a
+  // different sign bias and that buffer is then the fixed ref. However, this
+  // requires further work in the rd loop. For now the only supported encoder
+  // side behavior is where the ALT ref buffer has opposite sign bias to
+  // the other two.
+  cpi->allow_comp_inter_inter = 0;
+  if (!frame_is_intra_only(cm) &&
+      cm->ref_frame_sign_bias[ALTREF_FRAME] !=
+          cm->ref_frame_sign_bias[GOLDEN_FRAME] &&
+      cm->ref_frame_sign_bias[ALTREF_FRAME] !=
+          cm->ref_frame_sign_bias[LAST_FRAME]) {
+    cpi->allow_comp_inter_inter = 1;
+    cm->comp_fixed_ref = ALTREF_FRAME;
+    cm->comp_var_ref[0] = LAST_FRAME;
+    cm->comp_var_ref[1] = GOLDEN_FRAME;
+  }
+
+  if (!cpi->sf.frame_parameter_update) {
+    cm->reference_mode = SINGLE_REFERENCE;
+    encode_frame_internal(cpi);
+  } else {
+    RD_OPT *const rd_opt = &cpi->rd;
+    RD_COUNTS *const rdc = &cpi->td.rd_counts;
+    FRAME_COUNTS *counts = cpi->td.counts;
+    int i;
+
+    // A single RD pass is made over the whole frame, assuming compound,
+    // single or hybrid prediction according to whatever has worked best
+    // for this type of frame in the past.
+    // The pass also estimates whether another coding mode would have
+    // worked better than the one used and, if so, remembers that for
+    // subsequent frames.
+    // The same analysis is applied to transform size selection.
+    const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
+    int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
+    int64_t *const filter_thrs = rd_opt->filter_threshes[frame_type];
+    const int is_alt_ref = frame_type == ALTREF_FRAME;
+
+    /* prediction (compound, single or hybrid) mode selection */
+    cm->reference_mode = REFERENCE_MODE_SELECT;
+    if (is_alt_ref || !cpi->allow_comp_inter_inter) {
+      cm->reference_mode = SINGLE_REFERENCE;
+    } else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
+               mode_thrs[COMPOUND_REFERENCE] >
+                   mode_thrs[REFERENCE_MODE_SELECT] &&
+               check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) {
+      cm->reference_mode = COMPOUND_REFERENCE;
+    } else if (mode_thrs[SINGLE_REFERENCE] >
+               mode_thrs[REFERENCE_MODE_SELECT]) {
+      cm->reference_mode = SINGLE_REFERENCE;
+    }
+
+    if (cm->interp_filter == SWITCHABLE)
+      cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
+
+    encode_frame_internal(cpi);
+
+    for (i = 0; i < REFERENCE_MODES; ++i)
+      mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
+
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+      filter_thrs[i] = (filter_thrs[i] + rdc->filter_diff[i] / cm->MBs) / 2;
+
+    if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+      // If one of the two comp_inter outcomes was never counted, fix the
+      // frame to the other reference mode and clear the counts.
+      int num_single = 0;
+      int num_comp = 0;
+
+      for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
+        num_single += counts->comp_inter[i][0];
+        num_comp += counts->comp_inter[i][1];
+      }
+
+      if (num_comp == 0 || num_single == 0) {
+        cm->reference_mode =
+            num_comp == 0 ? SINGLE_REFERENCE : COMPOUND_REFERENCE;
+        vp10_zero(counts->comp_inter);
+      }
+    }
+
+    if (cm->tx_mode == TX_MODE_SELECT) {
+      // *_own: tallied in the same-size table; *_big: in a larger-size table.
+      int n4x4 = 0, n32x32 = 0;
+      int n8x8_own = 0, n8x8_big = 0;
+      int n16x16_own = 0, n16x16_big = 0;
+
+      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+        n4x4 += counts->tx.p32x32[i][TX_4X4];
+        n8x8_big += counts->tx.p32x32[i][TX_8X8];
+        n16x16_big += counts->tx.p32x32[i][TX_16X16];
+        n32x32 += counts->tx.p32x32[i][TX_32X32];
+
+        n4x4 += counts->tx.p16x16[i][TX_4X4];
+        n8x8_big += counts->tx.p16x16[i][TX_8X8];
+        n16x16_own += counts->tx.p16x16[i][TX_16X16];
+
+        n4x4 += counts->tx.p8x8[i][TX_4X4];
+        n8x8_own += counts->tx.p8x8[i][TX_8X8];
+      }
+      // Leave TX_MODE_SELECT when the counts show a fixed mode suffices.
+      if (!n4x4 && !n16x16_big && !n16x16_own && !n32x32) {
+        cm->tx_mode = ALLOW_8X8;
+        reset_skip_tx_size(cm, TX_8X8);
+      } else if (!n8x8_own && !n16x16_own && !n8x8_big && !n16x16_big &&
+                 !n32x32) {
+        cm->tx_mode = ONLY_4X4;
+        reset_skip_tx_size(cm, TX_4X4);
+      } else if (!n8x8_big && !n16x16_big && !n4x4) {
+        cm->tx_mode = ALLOW_32X32;
+      } else if (!n32x32 && !n8x8_big && !n4x4) {
+        cm->tx_mode = ALLOW_16X16;
+        reset_skip_tx_size(cm, TX_16X16);
+      }
+    }
+  }
+}
+
+// Adds one block's intra luma and chroma mode choices to the frame counts.
+static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi,
+                            const MODE_INFO *above_mi, const MODE_INFO *left_mi,
+                            const int intraonly) {
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+
+  if (bsize >= BLOCK_8X8) {
+    if (intraonly) {
+      const PREDICTION_MODE above = vp10_above_block_mode(mi, above_mi, 0);
+      const PREDICTION_MODE left = vp10_left_block_mode(mi, left_mi, 0);
+      ++counts->kf_y_mode[above][left][mbmi->mode];
+    } else {
+      ++counts->y_mode[size_group_lookup[bsize]][mbmi->mode];
+    }
+  } else {
+    const int step_x = num_4x4_blocks_wide_lookup[bsize];
+    const int step_y = num_4x4_blocks_high_lookup[bsize];
+    int bx, by;
+    for (by = 0; by < 2; by += step_y)
+      for (bx = 0; bx < 2; bx += step_x) {
+        const int bidx = by * 2 + bx;
+        const PREDICTION_MODE bmode = mi->bmi[bidx].as_mode;
+        if (!intraonly) {
+          ++counts->y_mode[0][bmode];
+        } else {
+          const PREDICTION_MODE a = vp10_above_block_mode(mi, above_mi, bidx);
+          const PREDICTION_MODE l = vp10_left_block_mode(mi, left_mi, bidx);
+          ++counts->kf_y_mode[a][l][bmode];
+        }
+      }
+  }
+
+  ++counts->uv_mode[mbmi->mode][mbmi->uv_mode];
+}
+
+// Encodes and tokenizes one block using the modes already stored for it
+// (per-plane intra coding, or inter prediction + vp10_encode_sb).  Counts
+// and final transform sizes are recorded only when output_enabled is set.
+static void encode_superblock(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+                              int output_enabled, int mi_row, int mi_col,
+                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO **mi_8x8 = xd->mi;
+  MODE_INFO *mi = mi_8x8[0];
+  MB_MODE_INFO *mbmi = &mi->mbmi;
+  const BLOCK_SIZE coded_bsize = VPXMAX(bsize, BLOCK_8X8);
+  const int seg_skip =
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+
+  // Recoding may be skipped only for >= 8x8 blocks with a fixed tx size,
+  // when neither complexity nor cyclic-refresh AQ is in use.
+  x->skip_recode = !x->select_tx_size && mbmi->sb_type >= BLOCK_8X8 &&
+                   cpi->sf.allow_skip_recode &&
+                   cpi->oxcf.aq_mode != COMPLEXITY_AQ &&
+                   cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ;
+  if (!x->skip_recode) memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+
+  x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
+  x->skip_optimize = ctx->is_coded;
+  ctx->is_coded = 1;
+
+  if (is_inter_block(mbmi)) {
+    const int num_refs = 1 + has_second_ref(mbmi);
+    int ref;
+    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    for (ref = 0; ref < num_refs; ++ref) {
+      YV12_BUFFER_CONFIG *const ref_buf =
+          get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
+      assert(ref_buf != NULL);
+      vp10_setup_pre_planes(xd, ref, ref_buf, mi_row, mi_col,
+                           &xd->block_refs[ref]->sf);
+    }
+    // The luma predictor is rebuilt unless a reusable one is ready; a
+    // skipped segment always rebuilds it.
+    if (seg_skip || !(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready))
+      vp10_build_inter_predictors_sby(xd, mi_row, mi_col, coded_bsize);
+    vp10_build_inter_predictors_sbuv(xd, mi_row, mi_col, coded_bsize);
+
+    vp10_encode_sb(x, coded_bsize);
+    vp10_tokenize_sb(cpi, td, t, !output_enabled, coded_bsize);
+  } else {
+    int plane;
+    mbmi->skip = 1;
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+      vp10_encode_intra_block_plane(x, coded_bsize, plane);
+    if (output_enabled)
+      sum_intra_stats(td->counts, mi, xd->above_mi, xd->left_mi,
+                      frame_is_intra_only(cm));
+    vp10_tokenize_sb(cpi, td, t, !output_enabled, coded_bsize);
+  }
+
+  if (!output_enabled) return;
+
+  if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 &&
+      !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
+    // Count the block's transform size in the table for its context.
+    ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd),
+                    &td->counts->tx)[mbmi->tx_size];
+  } else {
+    // Store the derived tx_size in every in-frame mode-info unit covered.
+    const int mis = cm->mi_stride;
+    const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+    const int mi_height = num_8x8_blocks_high_lookup[bsize];
+    TX_SIZE tx_size;
+    int row, col;
+    // The new intra coding scheme requires no change of transform size
+    if (is_inter_block(&mi->mbmi))
+      tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+                       max_txsize_lookup[bsize]);
+    else
+      tx_size = (bsize >= BLOCK_8X8) ? mbmi->tx_size : TX_4X4;
+
+    for (row = 0; row < mi_height && mi_row + row < cm->mi_rows; ++row)
+      for (col = 0; col < mi_width && mi_col + col < cm->mi_cols; ++col)
+        mi_8x8[mis * row + col]->mbmi.tx_size = tx_size;
+  }
+
+  ++td->counts->tx.tx_totals[mbmi->tx_size];
+  ++td->counts->tx.tx_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
+  if (mbmi->tx_size < TX_32X32 && cm->base_qindex > 0 && !mbmi->skip &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    if (is_inter_block(mbmi))
+      ++td->counts->inter_ext_tx[mbmi->tx_size][mbmi->tx_type];
+    else
+      ++td->counts->intra_ext_tx[mbmi->tx_size]
+                                [intra_mode_to_tx_type_context[mbmi->mode]]
+                                [mbmi->tx_type];
+  }
+}
diff --git a/libs/libvpx/vp10/encoder/encodeframe.h b/libs/libvpx/vp10/encoder/encodeframe.h
new file mode 100644
index 0000000000..fbb81f8b17
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/encodeframe.h
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_ENCODER_ENCODEFRAME_H_
+#define VP10_ENCODER_ENCODEFRAME_H_
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct macroblock;
+struct yv12_buffer_config;
+struct VP10_COMP;
+struct ThreadData;
+
+// Constants used in SOURCE_VAR_BASED_PARTITION
+#define VAR_HIST_MAX_BG_VAR 1000
+#define VAR_HIST_FACTOR 10
+#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1)  // = 101
+#define VAR_HIST_LARGE_CUT_OFF 75
+#define VAR_HIST_SMALL_CUT_OFF 45
+
+void vp10_setup_src_planes(struct macroblock *x,
+                          const struct yv12_buffer_config *src,
+                          int mi_row, int mi_col);
+
+void vp10_encode_frame(struct VP10_COMP *cpi);  // encodes one whole frame
+
+void vp10_init_tile_data(struct VP10_COMP *cpi);  // set up per-tile state
+void vp10_encode_tile(struct VP10_COMP *cpi, struct ThreadData *td,
+                     int tile_row, int tile_col);  // encodes a single tile
+
+void vp10_set_variance_partition_thresholds(struct VP10_COMP *cpi, int q);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_ENCODEFRAME_H_
diff --git a/libs/libvpx/vp10/encoder/encodemb.c b/libs/libvpx/vp10/encoder/encodemb.c
new file mode 100644
index 0000000000..92ba4ddb44
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/encodemb.c
@@ -0,0 +1,1133 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp10/common/idct.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/common/reconintra.h"
+#include "vp10/common/scan.h"
+
+#include "vp10/encoder/encodemb.h"
+#include "vp10/encoder/rd.h"
+#include "vp10/encoder/tokenize.h"
+
+struct optimize_ctx {
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];  // per plane; presumably above ctx
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];  // per plane; presumably left ctx
+};
+
+// Runs the subtract-block kernel for one plane, writing into p->src_diff.
+void vp10_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+  struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  const int w = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int h = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vpx_highbd_subtract_block(h, w, p->src_diff, w, p->src.buf, p->src.stride,
+                              pd->dst.buf, pd->dst.stride, x->e_mbd.bd);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  vpx_subtract_block(h, w, p->src_diff, w, p->src.buf, p->src.stride,
+                     pd->dst.buf, pd->dst.stride);
+}
+
+#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)  // DM and D unused
+
+typedef struct vp10_token_state {
+  int           rate;   // accumulated rate from this node onwards
+  int           error;  // accumulated squared error from this node onwards
+  int           next;   // index of the following trellis node
+  int16_t       token;  // token assigned to this node
+  short         qc;     // quantized coefficient value for this node
+} vp10_token_state;
+
+// TODO(jimbankoski): experiment to find optimal RD numbers.
+static const int plane_rd_mult[PLANE_TYPES] = { 4, 2 };  // scales rdmult
+// Sets rd_cost0/1 via RDCOST; on a tie, re-ranks both using RDTRUNC.
+#define UPDATE_RD_COST()\
+{\
+  rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
+  rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
+  if (rd_cost0 == rd_cost1) {\
+    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
+    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
+  }\
+}
+
+// Context for position idx + 1 as if `token` sat at idx: the cache entry is
+// overridden for the lookup, then restored.  Place holder for now.
+static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb,
+                                     int idx, int token,
+                                     uint8_t *token_cache) {
+  uint8_t *const slot = &token_cache[scan[idx]];
+  const uint8_t saved = *slot;
+  int ctx;
+  *slot = vp10_pt_energy_class[token];
+  ctx = get_coef_context(nb, token_cache, idx + 1);
+  *slot = saved;
+  return ctx;
+}
+
+static int optimize_b(MACROBLOCK *mb, int plane, int block,
+                      TX_SIZE tx_size, int ctx) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  struct macroblock_plane *const p = &mb->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ref = is_inter_block(&xd->mi[0]->mbmi);
+  vp10_token_state tokens[1025][2];
+  unsigned best_index[1025][2];
+  uint8_t token_cache[1024];
+  const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const int eob = p->eobs[block];
+  const PLANE_TYPE type = pd->plane_type;
+  const int default_eob = 16 << (tx_size << 1);
+  const int mul = 1 + (tx_size == TX_32X32);
+  const int16_t *dequant_ptr = pd->dequant;
+  const uint8_t *const band_translate = get_band_translate(tx_size);
+  TX_TYPE tx_type = get_tx_type(type, xd, block);
+  const scan_order *const so = get_scan(tx_size, tx_type);
+  const int16_t *const scan = so->scan;
+  const int16_t *const nb = so->neighbors;
+  int next = eob, sz = 0;
+  int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
+  int64_t rd_cost0, rd_cost1;
+  int rate0, rate1, error0, error1;
+  int16_t t0, t1;
+  EXTRABIT e0;
+  int best, band, pt, i, final_eob;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16_t *cat6_high_cost = vp10_get_high_cost_table(xd->bd);
+#else
+  const int16_t *cat6_high_cost = vp10_get_high_cost_table(8);
+#endif
+
+  assert((!type && !plane) || (type && plane));
+  assert(eob <= default_eob);
+
+  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+  if (!ref)
+    rdmult = (rdmult * 9) >> 4;
+
+  /* Initialize the sentinel node of the trellis. */
+  tokens[eob][0].rate = 0;
+  tokens[eob][0].error = 0;
+  tokens[eob][0].next = default_eob;
+  tokens[eob][0].token = EOB_TOKEN;
+  tokens[eob][0].qc = 0;
+  tokens[eob][1] = tokens[eob][0];
+
+  for (i = 0; i < eob; i++)
+    token_cache[scan[i]] =
+        vp10_pt_energy_class[vp10_get_token(qcoeff[scan[i]])];
+
+  for (i = eob; i-- > 0;) {
+    int base_bits, d2, dx;
+    const int rc = scan[i];
+    int x = qcoeff[rc];
+    /* Only add a trellis state for non-zero coefficients. */
+    if (x) {
+      int shortcut = 0;
+      error0 = tokens[next][0].error;
+      error1 = tokens[next][1].error;
+      /* Evaluate the first possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+      vp10_get_token_extra(x, &t0, &e0);
+      /* Consider both possible successor states. */
+      if (next < default_eob) {
+        band = band_translate[i + 1];
+        pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
+        rate0 += mb->token_costs[tx_size][type][ref][band][0][pt]
+                                [tokens[next][0].token];
+        rate1 += mb->token_costs[tx_size][type][ref][band][0][pt]
+                                [tokens[next][1].token];
+      }
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = vp10_get_cost(t0, e0, cat6_high_cost);
+      dx = mul * (dqcoeff[rc] - coeff[rc]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        dx >>= xd->bd - 8;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      d2 = dx * dx;
+      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][0].error = d2 + (best ? error1 : error0);
+      tokens[i][0].next = next;
+      tokens[i][0].token = t0;
+      tokens[i][0].qc = x;
+      best_index[i][0] = best;
+
+      /* Evaluate the second possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+
+      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
+          (abs(x) * dequant_ptr[rc != 0] < abs(coeff[rc]) * mul +
+                                               dequant_ptr[rc != 0]))
+        shortcut = 1;
+      else
+        shortcut = 0;
+
+      if (shortcut) {
+        sz = -(x < 0);
+        x -= 2 * sz + 1;
+      }
+
+      /* Consider both possible successor states. */
+      if (!x) {
+        /* If we reduced this coefficient to zero, check to see if
+         *  we need to move the EOB back here.
+         */
+        t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+        t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+        e0 = 0;
+      } else {
+        vp10_get_token_extra(x, &t0, &e0);
+        t1 = t0;
+      }
+      if (next < default_eob) {
+        band = band_translate[i + 1];
+        if (t0 != EOB_TOKEN) {
+          pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
+          rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
+                                  [tokens[next][0].token];
+        }
+        if (t1 != EOB_TOKEN) {
+          pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
+          rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
+                                  [tokens[next][1].token];
+        }
+      }
+
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = vp10_get_cost(t0, e0, cat6_high_cost);
+
+      if (shortcut) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
+        } else {
+          dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+        }
+#else
+        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        d2 = dx * dx;
+      }
+      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][1].error = d2 + (best ? error1 : error0);
+      tokens[i][1].next = next;
+      tokens[i][1].token = best ? t1 : t0;
+      tokens[i][1].qc = x;
+      best_index[i][1] = best;
+      /* Finally, make this the new head of the trellis. */
+      next = i;
+    } else {
+      /* There's no choice to make for a zero coefficient, so we don't
+       *  add a new trellis node, but we do need to update the costs.
+       */
+      band = band_translate[i + 1];
+      t0 = tokens[next][0].token;
+      t1 = tokens[next][1].token;
+      /* Update the cost of each path if we're past the EOB token. */
+      if (t0 != EOB_TOKEN) {
+        tokens[next][0].rate +=
+            mb->token_costs[tx_size][type][ref][band][1][0][t0];
+        tokens[next][0].token = ZERO_TOKEN;
+      }
+      if (t1 != EOB_TOKEN) {
+        tokens[next][1].rate +=
+            mb->token_costs[tx_size][type][ref][band][1][0][t1];
+        tokens[next][1].token = ZERO_TOKEN;
+      }
+      best_index[i][0] = best_index[i][1] = 0;
+      /* Don't update next, because we didn't add a new node. */
+    }
+  }
+
+  /* Now pick the best path through the whole trellis. */
+  band = band_translate[i + 1];
+  rate0 = tokens[next][0].rate;
+  rate1 = tokens[next][1].rate;
+  error0 = tokens[next][0].error;
+  error1 = tokens[next][1].error;
+  t0 = tokens[next][0].token;
+  t1 = tokens[next][1].token;
+  rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0];
+  rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1];
+  UPDATE_RD_COST();
+  best = rd_cost1 < rd_cost0;
+  final_eob = -1;
+  memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
+  memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
+  for (i = next; i < eob; i = next) {
+    const int x = tokens[i][best].qc;
+    const int rc = scan[i];
+    if (x) {
+      final_eob = i;
+    }
+
+    qcoeff[rc] = x;
+    dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
+
+    next = tokens[i][best].next;
+    best = best_index[i][best];
+  }
+  final_eob++;
+
+  mb->plane[plane].eobs[block] = final_eob;
+  return final_eob;
+}
+
+/* 32x32 forward DCT; a non-zero rd_transform selects the _rd kernel. */
+static INLINE void fdct32x32(int rd_transform, const int16_t *src,
+                             tran_low_t *dst, int src_stride) {
+  if (!rd_transform)
+    vpx_fdct32x32(src, dst, src_stride);
+  else
+    vpx_fdct32x32_rd(src, dst, src_stride);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+/* High-bitdepth 32x32 forward DCT.  A non-zero rd_transform selects the _rd
+ * kernel; zero selects the regular one. */
+static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
+                                    tran_low_t *dst, int src_stride) {
+  if (!rd_transform) vpx_highbd_fdct32x32(src, dst, src_stride);
+  else vpx_highbd_fdct32x32_rd(src, dst, src_stride);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+/* 4x4 forward transform: lossless uses vp10_fwht4x4, else tx_type decides. */
+void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                       int diff_stride, TX_TYPE tx_type, int lossless) {
+  if (lossless) {
+    vp10_fwht4x4(src_diff, coeff, diff_stride);
+    return;
+  }
+  switch (tx_type) {
+    case DCT_DCT:
+      vpx_fdct4x4(src_diff, coeff, diff_stride);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
+      break;
+    default:
+      assert(0);  /* unknown transform type */
+  }
+}
+
+/* 8x8 forward transform.
+ * Every recognised tx_type (including plain DCT_DCT) is routed through
+ * vp10_fht8x8; anything else trips the assert and transforms nothing. */
+static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, TX_TYPE tx_type) {
+  const int known_type = tx_type == DCT_DCT || tx_type == ADST_DCT ||
+                         tx_type == DCT_ADST || tx_type == ADST_ADST;
+
+  if (!known_type) {
+    assert(0);
+    return;
+  }
+  vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
+}
+
+/* 16x16 forward transform.
+ * Every recognised tx_type (including plain DCT_DCT) is routed through
+ * vp10_fht16x16; anything else trips the assert and transforms nothing. */
+static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+                           int diff_stride, TX_TYPE tx_type) {
+  const int known_type = tx_type == DCT_DCT || tx_type == ADST_DCT ||
+                         tx_type == DCT_ADST || tx_type == ADST_ADST;
+
+  if (!known_type) {
+    assert(0);
+    return;
+  }
+  vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
+}
+
+/* 32x32 forward transform.
+ *
+ * Only DCT_DCT is implemented at this size; rd_transform is passed through
+ * to fdct32x32.  The hybrid types (ADST_DCT, DCT_ADST, ADST_ADST) and any
+ * unrecognised value trip the assert and leave coeff untouched.
+ */
+static void fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
+                           tran_low_t *coeff, int diff_stride,
+                           TX_TYPE tx_type) {
+  if (tx_type != DCT_DCT) {
+    /* Hybrid or unknown transform type. */
+    assert(0);
+    return;
+  }
+
+  fdct32x32(rd_transform, src_diff, coeff, diff_stride);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+/* High-bitdepth 4x4 forward transform; lossless must come with DCT_DCT. */
+void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                              int diff_stride, TX_TYPE tx_type, int lossless) {
+  if (lossless) {
+    assert(tx_type == DCT_DCT);
+    vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
+    return;
+  }
+  switch (tx_type) {
+    case DCT_DCT:
+      vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
+      break;
+    default:
+      assert(0);  /* unknown transform type */
+  }
+}
+
+/* High-bitdepth 8x8 forward transform, dispatched on tx_type:
+ *   DCT_DCT                        -> vpx_highbd_fdct8x8
+ *   ADST_DCT, DCT_ADST, ADST_ADST  -> vp10_highbd_fht8x8
+ * Any other value trips the assert and performs no transform.
+ */
+static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TX_TYPE tx_type) {
+  if (tx_type == DCT_DCT) {
+    vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+  } else if (tx_type == ADST_DCT || tx_type == DCT_ADST ||
+             tx_type == ADST_ADST) {
+    vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
+  } else {
+    assert(0);  /* unknown transform type */
+  }
+}
+
+/* High-bitdepth 16x16 forward transform, dispatched on tx_type:
+ *   DCT_DCT                        -> vpx_highbd_fdct16x16
+ *   ADST_DCT, DCT_ADST, ADST_ADST  -> vp10_highbd_fht16x16
+ * Any other value trips the assert and performs no transform.
+ */
+static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TX_TYPE tx_type) {
+  if (tx_type == DCT_DCT) {
+    vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+  } else if (tx_type == ADST_DCT || tx_type == DCT_ADST ||
+             tx_type == ADST_ADST) {
+    vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
+  } else {
+    assert(0);  /* unknown transform type */
+  }
+}
+
+/* High-bitdepth 32x32 forward transform.
+ *
+ * Only DCT_DCT is implemented at this size; rd_transform is passed through
+ * to highbd_fdct32x32.  The hybrid types (ADST_DCT, DCT_ADST, ADST_ADST) and
+ * any unrecognised value trip the assert and leave coeff untouched.
+ */
+static void highbd_fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
+                                  tran_low_t *coeff, int diff_stride,
+                                  TX_TYPE tx_type) {
+  if (tx_type != DCT_DCT) {
+    /* Hybrid or unknown transform type. */
+    assert(0);
+    return;
+  }
+
+  highbd_fdct32x32(rd_transform, src_diff, coeff, diff_stride);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+/* Forward transform of one block, quantized with the *_quantize_fp kernels. */
+void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block,
+                         int blk_row, int blk_col,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block);  // feeds get_scan only
+  const scan_order *const scan_order = get_scan(tx_size, tx_type);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int16_t *src_diff;
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+  // blk_row/blk_col count 4-sample units, hence the factor of 4 above.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vp10_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                     p->round_fp, p->quant_fp, p->quant_shift,
+                                     qcoeff, dqcoeff, pd->dequant,
+                                     eob, scan_order->scan,
+                                     scan_order->iscan);
+        break;
+      case TX_16X16:
+        vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+        vp10_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:
+        vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+        vp10_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:  // lossless segments use fwht instead of fdct
+        if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+          vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
+        } else {
+          vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
+        }
+        vp10_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);
+    }
+    return;  // high-bitdepth block is done
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // Non-highbd path; as above, the transform itself never looks at tx_type.
+  switch (tx_size) {
+    case TX_32X32:
+      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+      vp10_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, eob, scan_order->scan,
+                            scan_order->iscan);
+      break;
+    case TX_16X16:
+      vpx_fdct16x16(src_diff, coeff, diff_stride);
+      vp10_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    case TX_8X8:  // one call transforms and quantizes
+      vp10_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
+                        x->skip_block, p->zbin, p->round_fp,
+                        p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                        pd->dequant, eob,
+                        scan_order->scan, scan_order->iscan);
+      break;
+    case TX_4X4:  // lossless segments use fwht instead of fdct
+      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+        vp10_fwht4x4(src_diff, coeff, diff_stride);
+      } else {
+        vpx_fdct4x4(src_diff, coeff, diff_stride);
+      }
+      vp10_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+/* Fast path: *_1 forward transform plus DC-only quantization of one block. */
+void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+                         int blk_row, int blk_col,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int16_t *src_diff;
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+  // Quantizer inputs below are element [0] of quant_fp/dequant plus p->round.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
+                                     p->quant_fp[0], qcoeff, dqcoeff,
+                                     pd->dequant[0], eob);
+        break;
+      case TX_16X16:
+        vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      case TX_8X8:
+        vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      case TX_4X4:  // 4x4 runs the full fwht/fdct (no *_1 call)
+        if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+          vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
+        } else {
+          vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
+        }
+        vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      default:
+        assert(0);
+    }
+    return;  // high-bitdepth block is done
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // Non-highbd path, same structure as the CONFIG_VP9_HIGHBITDEPTH branch.
+  switch (tx_size) {
+    case TX_32X32:
+      vpx_fdct32x32_1(src_diff, coeff, diff_stride);
+      vpx_quantize_dc_32x32(coeff, x->skip_block, p->round,
+                            p->quant_fp[0], qcoeff, dqcoeff,
+                            pd->dequant[0], eob);
+      break;
+    case TX_16X16:
+      vpx_fdct16x16_1(src_diff, coeff, diff_stride);
+      vpx_quantize_dc(coeff, 256, x->skip_block, p->round,
+                     p->quant_fp[0], qcoeff, dqcoeff,
+                     pd->dequant[0], eob);
+      break;
+    case TX_8X8:
+      vpx_fdct8x8_1(src_diff, coeff, diff_stride);
+      vpx_quantize_dc(coeff, 64, x->skip_block, p->round,
+                      p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
+      break;
+    case TX_4X4:  // 4x4 runs the full fwht/fdct (no *_1 call)
+      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+        vp10_fwht4x4(src_diff, coeff, diff_stride);
+      } else {
+        vpx_fdct4x4(src_diff, coeff, diff_stride);
+      }
+      vpx_quantize_dc(coeff, 16, x->skip_block, p->round,
+                      p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+
+/* Full forward transform (honouring tx_type) and *_quantize_b of one block. */
+void vp10_xform_quant(MACROBLOCK *x, int plane, int block,
+                      int blk_row, int blk_col,
+                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
+  const scan_order *const scan_order = get_scan(tx_size, tx_type);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int16_t *src_diff;
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+  // blk_row/blk_col count 4-sample units, hence the factor of 4 above.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+     switch (tx_size) {
+      case TX_32X32:
+        highbd_fwd_txfm_32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride,
+                         tx_type);
+        vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                    p->round, p->quant, p->quant_shift, qcoeff,
+                                    dqcoeff, pd->dequant, eob,
+                                    scan_order->scan, scan_order->iscan);
+        break;
+      case TX_16X16:
+        highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
+        vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:
+        highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
+        vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:  // the lossless flag is forwarded to the 4x4 helper
+        vp10_highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
+                                 xd->lossless[xd->mi[0]->mbmi.segment_id]);
+        vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);
+    }
+    return;  // high-bitdepth block is done
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // Non-highbd path: fwd_txfm_* helper, then the matching vpx_quantize_b*.
+  switch (tx_size) {
+    case TX_32X32:
+      fwd_txfm_32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride, tx_type);
+      vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+                           p->quant, p->quant_shift, qcoeff, dqcoeff,
+                           pd->dequant, eob, scan_order->scan,
+                           scan_order->iscan);
+      break;
+    case TX_16X16:
+      fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
+      vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, eob,
+                     scan_order->scan, scan_order->iscan);
+      break;
+    case TX_8X8:
+      fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
+      vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, eob,
+                     scan_order->scan, scan_order->iscan);
+      break;
+    case TX_4X4:  // the lossless flag is forwarded to the 4x4 helper
+      vp10_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
+                        xd->lossless[xd->mi[0]->mbmi.segment_id]);
+      vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, eob,
+                     scan_order->scan, scan_order->iscan);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+/* Per-transform-block callback: transform/quantize, optimize, reconstruct. */
+static void encode_block(int plane, int block, int blk_row, int blk_col,
+                         BLOCK_SIZE plane_bsize,
+                         TX_SIZE tx_size, void *arg) {
+  struct encode_b_args *const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx *const ctx = args->ctx;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint8_t *dst;
+  ENTROPY_CONTEXT *a, *l;
+  TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block);
+  dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
+  a = &ctx->ta[plane][blk_col];
+  l = &ctx->tl[plane][blk_row];
+  // a and l are the entropy-context slots selected by blk_col and blk_row.
+  // TODO(jingning): per transformed block zero forcing only enabled for
+  // luma component. will integrate chroma components as well.
+  if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+    p->eobs[block] = 0;
+    *a = *l = 0;
+    return;
+  }
+
+  if (!x->skip_recode) {
+    if (x->quant_fp) {
+      // Encoding process for rtc mode
+      if (x->skip_txfm[0] == SKIP_TXFM_AC_DC && plane == 0) {
+        // skip forward transform
+        p->eobs[block] = 0;
+        *a = *l = 0;
+        return;
+      } else {
+        vp10_xform_quant_fp(x, plane, block, blk_row, blk_col,
+                            plane_bsize, tx_size);
+      }
+    } else {
+      if (max_txsize_lookup[plane_bsize] == tx_size) {
+        int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
+        if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) {
+          // full forward transform and quantization
+          vp10_xform_quant(x, plane, block, blk_row, blk_col,
+                           plane_bsize, tx_size);
+        } else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) {
+          // fast path forward transform and quantization
+          vp10_xform_quant_dc(x, plane, block, blk_row, blk_col,
+                              plane_bsize, tx_size);
+        } else {
+          // skip forward transform
+          p->eobs[block] = 0;
+          *a = *l = 0;
+          return;
+        }
+      } else {
+        vp10_xform_quant(x, plane, block, blk_row, blk_col,
+                         plane_bsize, tx_size);
+      }
+    }
+  }
+  // Run optimize_b when enabled; either way a/l end up holding (eob > 0).
+  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
+    const int ctx = combine_entropy_contexts(*a, *l);  // shadows the outer ctx
+    *a = *l = optimize_b(x, plane, block, tx_size, ctx) > 0;
+  } else {
+    *a = *l = p->eobs[block] > 0;
+  }
+  // Any nonzero eob clears the skip flag shared through args->skip.
+  if (p->eobs[block])
+    *(args->skip) = 0;
+  // An all-zero block needs no inverse transform.
+  if (p->eobs[block] == 0)
+    return;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, pd->dst.stride,
+                                       p->eobs[block], xd->bd, tx_type);
+        break;
+      case TX_16X16:
+        vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, pd->dst.stride,
+                                       p->eobs[block], xd->bd, tx_type);
+        break;
+      case TX_8X8:
+        vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, pd->dst.stride,
+                                     p->eobs[block], xd->bd, tx_type);
+        break;
+      case TX_4X4:
+        // this is like vp10_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride,
+                                     p->eobs[block], xd->bd, tx_type,
+                                     xd->lossless[xd->mi[0]->mbmi.segment_id]);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+        break;
+    }
+
+    return;  // high-bitdepth reconstruction is done
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // Non-highbd reconstruction through the vp10_inv_txfm_add_* kernels.
+  switch (tx_size) {
+    case TX_32X32:
+      vp10_inv_txfm_add_32x32(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+                              tx_type);
+      break;
+    case TX_16X16:
+      vp10_inv_txfm_add_16x16(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+                              tx_type);
+      break;
+    case TX_8X8:
+      vp10_inv_txfm_add_8x8(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+                            tx_type);
+      break;
+    case TX_4X4:
+      // this is like vp10_short_idct4x4 but has a special case around eob<=1
+      // which is significant (not just an optimization) for the lossless
+      // case.
+      vp10_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+                            tx_type, xd->lossless[xd->mi[0]->mbmi.segment_id]);
+      break;
+    default:
+      assert(0 && "Invalid transform size");
+      break;
+  }
+}
+
+/* Pass-1 block encoder: transform/quantize, then rebuild the block with the
+ * 4x4 inverse kernels (the iwht4x4 ones when xd->lossless[0] is set). */
+static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               void *arg) {
+  MACROBLOCK *const x = (MACROBLOCK *)arg;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const int stride = pd->dst.stride;
+  uint8_t *const dst = &pd->dst.buf[4 * blk_row * stride + 4 * blk_col];
+  int eob;
+
+  vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
+
+  eob = p->eobs[block];
+  // Blocks with no coded coefficients need no reconstruction.
+  if (eob <= 0) return;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (xd->lossless[0])
+      vp10_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+    else
+      vp10_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  if (xd->lossless[0])
+    vp10_iwht4x4_add(dqcoeff, dst, stride, eob);
+  else
+    vp10_idct4x4_add(dqcoeff, dst, stride, eob);
+}
+
+void vp10_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  vp10_subtract_plane(x, bsize, 0);  // plane 0 is the Y plane
+  vp10_foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1, x);
+}
+
+/* Encodes every plane of the current block.  mbmi->skip starts at 1 and is
+ * cleared by encode_block as soon as any transform block has coefficients. */
+void vp10_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip};
+  int plane;
+
+  mbmi->skip = 1;
+  if (x->skip) return;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    if (!x->skip_recode) vp10_subtract_plane(x, bsize, plane);
+    // Entropy contexts are only needed when optimize_b will run.
+    if (x->optimize && !(x->skip_recode && x->skip_optimize)) {
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      TX_SIZE tx_size = mbmi->tx_size;
+      if (plane) tx_size = get_uv_tx_size(mbmi, pd);
+      vp10_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane],
+                                ctx.tl[plane]);
+    }
+
+    vp10_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
+                                            &arg);
+  }
+}
+
+void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+                             BLOCK_SIZE plane_bsize,
+                             TX_SIZE tx_size, void *arg) {
+  struct encode_b_args* const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
+  const scan_order *const scan_order = get_scan(tx_size, tx_type);
+  PREDICTION_MODE mode;
+  const int bwl = b_width_log2_lookup[plane_bsize];
+  const int bhl = b_height_log2_lookup[plane_bsize];
+  const int diff_stride = 4 * (1 << bwl);
+  uint8_t *src, *dst;
+  int16_t *src_diff;
+  uint16_t *eob = &p->eobs[block];
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
+  src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+  mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
+  vp10_predict_intra_block(xd, bwl, bhl, tx_size, mode, dst, dst_stride,
+                          dst, dst_stride, blk_col, blk_row, plane);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        if (!x->skip_recode) {
+          vpx_highbd_subtract_block(32, 32, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          highbd_fwd_txfm_32x32(x->use_lp32x32fdct, src_diff, coeff,
+                                diff_stride, tx_type);
+          vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                      p->round, p->quant, p->quant_shift,
+                                      qcoeff, dqcoeff, pd->dequant, eob,
+                                      scan_order->scan, scan_order->iscan);
+        }
+        if (*eob)
+          vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, xd->bd,
+                                         tx_type);
+        break;
+      case TX_16X16:
+        if (!x->skip_recode) {
+          vpx_highbd_subtract_block(16, 16, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
+          vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                                p->quant, p->quant_shift, qcoeff, dqcoeff,
+                                pd->dequant, eob,
+                                scan_order->scan, scan_order->iscan);
+        }
+        if (*eob)
+          vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, xd->bd,
+                                         tx_type);
+        break;
+      case TX_8X8:
+        if (!x->skip_recode) {
+          vpx_highbd_subtract_block(8, 8, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
+          vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                                p->quant, p->quant_shift, qcoeff, dqcoeff,
+                                pd->dequant, eob,
+                                scan_order->scan, scan_order->iscan);
+        }
+        if (*eob)
+          vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, xd->bd,
+                                       tx_type);
+        break;
+      case TX_4X4:
+        if (!x->skip_recode) {
+          vpx_highbd_subtract_block(4, 4, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          vp10_highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
+                                   xd->lossless[mbmi->segment_id]);
+          vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                                p->quant, p->quant_shift, qcoeff, dqcoeff,
+                                pd->dequant, eob,
+                                scan_order->scan, scan_order->iscan);
+        }
+
+        if (*eob)
+          // this is like vp10_short_idct4x4 but has a special case around
+          // eob<=1 which is significant (not just an optimization) for the
+          // lossless case.
+          vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, dst_stride, *eob, xd->bd,
+                                       tx_type, xd->lossless[mbmi->segment_id]);
+        break;
+      default:
+        assert(0);
+        return;
+    }
+    if (*eob)
+      *(args->skip) = 0;
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  switch (tx_size) {
+    case TX_32X32:
+      if (!x->skip_recode) {
+        vpx_subtract_block(32, 32, src_diff, diff_stride,
+                           src, src_stride, dst, dst_stride);
+        fwd_txfm_32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride,
+                       tx_type);
+        vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+                             p->quant, p->quant_shift, qcoeff, dqcoeff,
+                             pd->dequant, eob, scan_order->scan,
+                             scan_order->iscan);
+      }
+      if (*eob)
+        vp10_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, tx_type);
+      break;
+    case TX_16X16:
+      if (!x->skip_recode) {
+        vpx_subtract_block(16, 16, src_diff, diff_stride,
+                           src, src_stride, dst, dst_stride);
+        fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
+        vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                       p->quant, p->quant_shift, qcoeff, dqcoeff,
+                       pd->dequant, eob, scan_order->scan,
+                       scan_order->iscan);
+      }
+      if (*eob)
+        vp10_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, tx_type);
+      break;
+    case TX_8X8:
+      if (!x->skip_recode) {
+        vpx_subtract_block(8, 8, src_diff, diff_stride,
+                           src, src_stride, dst, dst_stride);
+        fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
+        vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
+                       p->quant_shift, qcoeff, dqcoeff,
+                       pd->dequant, eob, scan_order->scan,
+                       scan_order->iscan);
+      }
+      if (*eob)
+        vp10_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, tx_type);
+      break;
+    case TX_4X4:
+      if (!x->skip_recode) {
+        vpx_subtract_block(4, 4, src_diff, diff_stride,
+                           src, src_stride, dst, dst_stride);
+        vp10_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
+                          xd->lossless[mbmi->segment_id]);
+        vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
+                       p->quant_shift, qcoeff, dqcoeff,
+                       pd->dequant, eob, scan_order->scan,
+                       scan_order->iscan);
+      }
+
+      if (*eob) {
+        // this is like vp10_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        vp10_inv_txfm_add_4x4(dqcoeff, dst, dst_stride, *eob, tx_type,
+                              xd->lossless[mbmi->segment_id]);
+      }
+      break;
+    default:
+      assert(0);
+      break;
+  }
+  if (*eob)
+    *(args->skip) = 0;
+}
+
+void vp10_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip};  // ctx = NULL
+  // Hand vp10_encode_block_intra, with arg, to the per-plane block iterator.
+  vp10_foreach_transformed_block_in_plane(xd, bsize, plane,
+                                          vp10_encode_block_intra, &arg);
+}
diff --git a/libs/libvpx/vp10/encoder/encodemb.h b/libs/libvpx/vp10/encoder/encodemb.h
new file mode 100644
index 0000000000..2e6516e0b0
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/encodemb.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_ENCODEMB_H_
+#define VP10_ENCODER_ENCODEMB_H_
+
+#include "./vpx_config.h"
+#include "vp10/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct encode_b_args {  // bundle passed as the void *arg of per-block callbacks
+  MACROBLOCK *x;  // macroblock being encoded
+  struct optimize_ctx *ctx;  // may be NULL (the intra plane encoder passes NULL)
+  int8_t *skip;  // intra encode path stores 0 here when a block's eob is nonzero
+};
+void vp10_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp10_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block,
+                         int blk_row, int blk_col,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+                         int blk_row, int blk_col,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+void vp10_xform_quant(MACROBLOCK *x, int plane, int block,
+                      int blk_row, int blk_col,
+                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+
+void vp10_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+
+void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+                             BLOCK_SIZE plane_bsize,
+                             TX_SIZE tx_size, void *arg);
+
+void vp10_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+
+void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                       int diff_stride, TX_TYPE tx_type, int lossless);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                              int diff_stride, TX_TYPE tx_type, int lossless);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_ENCODEMB_H_
diff --git a/libs/libvpx/vp10/encoder/encodemv.c b/libs/libvpx/vp10/encoder/encodemv.c
new file mode 100644
index 0000000000..0736c65b3f
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/encodemv.c
@@ -0,0 +1,274 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vp10/common/common.h"
+#include "vp10/common/entropymode.h"
+
+#include "vp10/encoder/cost.h"
+#include "vp10/encoder/encodemv.h"
+#include "vp10/encoder/subexp.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+
+static struct vp10_token mv_joint_encodings[MV_JOINTS];
+static struct vp10_token mv_class_encodings[MV_CLASSES];
+static struct vp10_token mv_fp_encodings[MV_FP_SIZE];
+static struct vp10_token mv_class0_encodings[CLASS0_SIZE];
+
+void vp10_entropy_mv_init(void) {  // sets up the four static token tables
+  vp10_tokens_from_tree(mv_joint_encodings, vp10_mv_joint_tree);
+  vp10_tokens_from_tree(mv_class_encodings, vp10_mv_class_tree);
+  vp10_tokens_from_tree(mv_class0_encodings, vp10_mv_class0_tree);
+  vp10_tokens_from_tree(mv_fp_encodings, vp10_mv_fp_tree);
+}
+
+static void encode_mv_component(vpx_writer* w, int comp,
+                                const nmv_component* mvcomp, int usehp) {
+  int offset;  // set by vp10_get_mv_class(); bit 0 = hp, bits 1-2 = fr, rest = d
+  const int sign = comp < 0;
+  const int mag = sign ? -comp : comp;
+  const int mv_class = vp10_get_mv_class(mag - 1, &offset);
+  const int d = offset >> 3;                // int mv data
+  const int fr = (offset >> 1) & 3;         // fractional mv data
+  const int hp = offset & 1;                // high precision mv data
+  // Writes, in order: sign, class, integer bits, fractional bits, optional hp.
+  assert(comp != 0);
+
+  // Sign
+  vpx_write(w, sign, mvcomp->sign);
+
+  // Class
+  vp10_write_token(w, vp10_mv_class_tree, mvcomp->classes,
+                  &mv_class_encodings[mv_class]);
+
+  // Integer bits
+  if (mv_class == MV_CLASS_0) {
+    vp10_write_token(w, vp10_mv_class0_tree, mvcomp->class0,
+                    &mv_class0_encodings[d]);
+  } else {
+    int i;
+    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
+    for (i = 0; i < n; ++i)  // least significant bit of d first
+      vpx_write(w, (d >> i) & 1, mvcomp->bits[i]);
+  }
+
+  // Fractional bits (class 0 uses a probability set selected by d)
+  vp10_write_token(w, vp10_mv_fp_tree,
+                  mv_class == MV_CLASS_0 ?  mvcomp->class0_fp[d] : mvcomp->fp,
+                  &mv_fp_encodings[fr]);
+
+  // High precision bit (class 0 has its own probability)
+  if (usehp)
+    vpx_write(w, hp,
+              mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp);
+}
+
+// Fills mvcost[-MV_MAX..MV_MAX] with the cost of each MV component value.
+static void build_nmv_component_cost_table(int *mvcost,
+                                           const nmv_component* const mvcomp,
+                                           int usehp) {
+  int i, v;
+  int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+  int bits_cost[MV_OFFSET_BITS][2];
+  int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
+  int class0_hp_cost[2], hp_cost[2];
+  // First derive per-symbol costs from the component's probabilities.
+  sign_cost[0] = vp10_cost_zero(mvcomp->sign);
+  sign_cost[1] = vp10_cost_one(mvcomp->sign);
+  vp10_cost_tokens(class_cost, mvcomp->classes, vp10_mv_class_tree);
+  vp10_cost_tokens(class0_cost, mvcomp->class0, vp10_mv_class0_tree);
+  for (i = 0; i < MV_OFFSET_BITS; ++i) {
+    bits_cost[i][0] = vp10_cost_zero(mvcomp->bits[i]);
+    bits_cost[i][1] = vp10_cost_one(mvcomp->bits[i]);
+  }
+
+  for (i = 0; i < CLASS0_SIZE; ++i)
+    vp10_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], vp10_mv_fp_tree);
+  vp10_cost_tokens(fp_cost, mvcomp->fp, vp10_mv_fp_tree);
+
+  if (usehp) {  // the hp cost arrays stay unset, and unread, otherwise
+    class0_hp_cost[0] = vp10_cost_zero(mvcomp->class0_hp);
+    class0_hp_cost[1] = vp10_cost_one(mvcomp->class0_hp);
+    hp_cost[0] = vp10_cost_zero(mvcomp->hp);
+    hp_cost[1] = vp10_cost_one(mvcomp->hp);
+  }
+  mvcost[0] = 0;
+  for (v = 1; v <= MV_MAX; ++v) {  // accumulate the cost of magnitude v
+    int z, c, o, d, e, f, cost = 0;
+    z = v - 1;  // same "magnitude - 1" input encode_mv_component() uses
+    c = vp10_get_mv_class(z, &o);
+    cost += class_cost[c];
+    d = (o >> 3);               /* int mv data */
+    f = (o >> 1) & 3;           /* fractional pel mv data */
+    e = (o & 1);                /* high precision mv data */
+    if (c == MV_CLASS_0) {
+      cost += class0_cost[d];
+    } else {
+      int i, b;
+      b = c + CLASS0_BITS - 1;  /* number of bits */
+      for (i = 0; i < b; ++i)
+        cost += bits_cost[i][((d >> i) & 1)];
+    }
+    if (c == MV_CLASS_0) {
+      cost += class0_fp_cost[d][f];
+    } else {
+      cost += fp_cost[f];
+    }
+    if (usehp) {
+      if (c == MV_CLASS_0) {
+        cost += class0_hp_cost[e];
+      } else {
+        cost += hp_cost[e];
+      }
+    }
+    mvcost[v] = cost + sign_cost[0];
+    mvcost[-v] = cost + sign_cost[1];  // mvcost must allow negative indices
+  }
+}
+
+static void update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
+                      vpx_prob upd_p) {  // conditionally updates *cur_p from ct
+#if CONFIG_MISC_FIXES
+  (void) upd_p;  // unused in this configuration
+  vp10_cond_prob_diff_update(w, cur_p, ct);
+#else
+  const vpx_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;  // forced odd
+  const int update = cost_branch256(ct, *cur_p) + vp10_cost_zero(upd_p) >
+                     cost_branch256(ct, new_p) + vp10_cost_one(upd_p) + 7 * 256;
+  vpx_write(w, update, upd_p);  // update flag
+  if (update) {
+    *cur_p = new_p;
+    vpx_write_literal(w, new_p >> 1, 7);  // upper 7 bits; low bit is always 1
+  }
+#endif
+}
+
+static void write_mv_update(const vpx_tree_index *tree,
+                            vpx_prob probs[/*n - 1*/],
+                            const unsigned int counts[/*n - 1*/],
+                            int n, vpx_writer *w) {
+  int i;
+  unsigned int branch_ct[32][2];
+  // Runs update_mv() on each of the n - 1 probabilities of one token tree.
+  // Assuming max number of probabilities <= 32
+  assert(n <= 32);
+
+  vp10_tree_probs_from_distribution(tree, branch_ct, counts);
+  for (i = 0; i < n - 1; ++i)
+    update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
+}
+
+void vp10_write_nmv_probs(VP10_COMMON *cm, int usehp, vpx_writer *w,
+                         nmv_context_counts *const counts) {
+  int i, j;
+  nmv_context *const mvc = &cm->fc->nmvc;
+  // Joint probabilities first.
+  write_mv_update(vp10_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
+  // Then, per component: sign, classes, class0 and the integer offset bits.
+  for (i = 0; i < 2; ++i) {
+    nmv_component *comp = &mvc->comps[i];
+    nmv_component_counts *comp_counts = &counts->comps[i];
+
+    update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB);
+    write_mv_update(vp10_mv_class_tree, comp->classes, comp_counts->classes,
+                    MV_CLASSES, w);
+    write_mv_update(vp10_mv_class0_tree, comp->class0, comp_counts->class0,
+                    CLASS0_SIZE, w);
+    for (j = 0; j < MV_OFFSET_BITS; ++j)
+      update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB);
+  }
+  // Fractional probabilities (class0_fp and fp) for each component.
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j)
+      write_mv_update(vp10_mv_fp_tree, mvc->comps[i].class0_fp[j],
+                      counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
+
+    write_mv_update(vp10_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
+                    MV_FP_SIZE, w);
+  }
+  // High-precision probabilities are only processed when usehp is set.
+  if (usehp) {
+    for (i = 0; i < 2; ++i) {
+      update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp,
+                MV_UPDATE_PROB);
+      update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB);
+    }
+  }
+}
+
+void vp10_encode_mv(VP10_COMP* cpi, vpx_writer* w,
+                   const MV* mv, const MV* ref,
+                   const nmv_context* mvctx, int usehp) {
+  const MV diff = {mv->row - ref->row,
+                   mv->col - ref->col};
+  const MV_JOINT_TYPE j = vp10_get_mv_joint(&diff);
+  usehp = usehp && vp10_use_mv_hp(ref);  // hp also depends on ref
+  // Joint token first, then the row and/or column difference it selects.
+  vp10_write_token(w, vp10_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
+  if (mv_joint_vertical(j))
+    encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
+
+  if (mv_joint_horizontal(j))
+    encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
+
+  // If auto_mv_step_size is enabled then keep track of the largest
+  // motion vector component used (taken from mv itself, not from diff).
+  if (cpi->sf.mv.auto_mv_step_size) {
+    unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3;
+    cpi->max_mv_magnitude = VPXMAX(maxv, cpi->max_mv_magnitude);
+  }
+}
+
+void vp10_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+                              const nmv_context* ctx, int usehp) {
+  vp10_cost_tokens(mvjoint, ctx->joints, vp10_mv_joint_tree);  // joint costs
+  build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp);  // comps[0]
+  build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp);  // comps[1]
+}
+
+static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
+                    const int_mv mvs[2],
+                    nmv_context_counts *counts) {
+  int i;
+  // One count per reference in use: mvs[i] minus that reference's ref_mvs[..][0].
+  for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+    const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+    const MV diff = {mvs[i].as_mv.row - ref->row,
+                     mvs[i].as_mv.col - ref->col};
+    vp10_inc_mv(&diff, counts, vp10_use_mv_hp(ref));
+  }
+}
+
+void vp10_update_mv_count(ThreadData *td) {
+  const MACROBLOCKD *xd = &td->mb.e_mbd;
+  const MODE_INFO *mi = xd->mi[0];
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MB_MODE_INFO_EXT *mbmi_ext = td->mb.mbmi_ext;
+  // Only NEWMV modes are counted; blocks below 8x8 are checked per sub-block.
+  if (mbmi->sb_type < BLOCK_8X8) {
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi->sb_type];
+    int idx, idy;
+
+    for (idy = 0; idy < 2; idy += num_4x4_h) {
+      for (idx = 0; idx < 2; idx += num_4x4_w) {
+        const int i = idy * 2 + idx;  // index into the 2x2 bmi[] array
+        if (mi->bmi[i].as_mode == NEWMV)
+          inc_mvs(mbmi, mbmi_ext, mi->bmi[i].as_mv, &td->counts->mv);
+      }
+    }
+  } else {
+    if (mbmi->mode == NEWMV)
+      inc_mvs(mbmi, mbmi_ext, mbmi->mv, &td->counts->mv);
+  }
+}
+
diff --git a/libs/libvpx/vp10/encoder/encodemv.h b/libs/libvpx/vp10/encoder/encodemv.h
new file mode 100644
index 0000000000..006f6d7c71
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/encodemv.h
@@ -0,0 +1,38 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_ENCODER_ENCODEMV_H_
+#define VP10_ENCODER_ENCODEMV_H_
+
+#include "vp10/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_entropy_mv_init(void);  // sets up encodemv.c's static token tables
+// Runs the probability-update pass over every MV sub-model (hp only if usehp).
+void vp10_write_nmv_probs(VP10_COMMON *cm, int usehp, vpx_writer *w,
+                         nmv_context_counts *const counts);
+// Writes mv - ref: the joint token, then the row/column parts it selects.
+void vp10_encode_mv(VP10_COMP *cpi, vpx_writer* w, const MV* mv, const MV* ref,
+                   const nmv_context* mvctx, int usehp);
+// Fills the joint cost array and both per-component tables from mvctx.
+void vp10_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+                              const nmv_context* mvctx, int usehp);
+// Adds the current block's NEWMV differences to td->counts->mv.
+void vp10_update_mv_count(ThreadData *td);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_ENCODEMV_H_
diff --git a/libs/libvpx/vp10/encoder/encoder.c b/libs/libvpx/vp10/encoder/encoder.c
new file mode 100644
index 0000000000..d3a4dc12e4
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/encoder.c
@@ -0,0 +1,4479 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "./vpx_config.h"
+
+#include "vp10/common/alloccommon.h"
+#include "vp10/common/filter.h"
+#include "vp10/common/idct.h"
+#if CONFIG_VP9_POSTPROC
+#include "vp10/common/postproc.h"
+#endif
+#include "vp10/common/reconinter.h"
+#include "vp10/common/reconintra.h"
+#include "vp10/common/tile_common.h"
+
+#include "vp10/encoder/aq_complexity.h"
+#include "vp10/encoder/aq_cyclicrefresh.h"
+#include "vp10/encoder/aq_variance.h"
+#include "vp10/encoder/bitstream.h"
+#include "vp10/encoder/context_tree.h"
+#include "vp10/encoder/encodeframe.h"
+#include "vp10/encoder/encodemv.h"
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/ethread.h"
+#include "vp10/encoder/firstpass.h"
+#include "vp10/encoder/mbgraph.h"
+#include "vp10/encoder/picklpf.h"
+#include "vp10/encoder/ratectrl.h"
+#include "vp10/encoder/rd.h"
+#include "vp10/encoder/resize.h"
+#include "vp10/encoder/segmentation.h"
+#include "vp10/encoder/skin_detection.h"
+#include "vp10/encoder/speed_features.h"
+#include "vp10/encoder/temporal_filter.h"
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx/internal/vpx_psnr.h"
+#if CONFIG_INTERNAL_STATS
+#include "vpx_dsp/ssim.h"
+#endif
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/vpx_scale.h"
+
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
+
+#define SHARP_FILTER_QTHRESH 0          /* Q threshold for 8-tap sharp filter */
+
+#define ALTREF_HIGH_PRECISION_MV 1      // Whether to use high precision mv
+                                         //  for altref computation.
+#define HIGH_PRECISION_MV_QTHRESH 200   // Q threshold for high precision
+                                         // mv. Choose a very high value for
+                                         // now so that HIGH_PRECISION is always
+                                         // chosen.
+// #define OUTPUT_YUV_REC
+
+#ifdef OUTPUT_YUV_DENOISED
+FILE *yuv_denoised_file = NULL;
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+FILE *yuv_skinmap_file = NULL;
+#endif
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#endif
+
+#if 0
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
+  switch (mode) {  // writes the pair (*hr, *hs) for mode, e.g. FOURFIVE -> 4, 5
+    case NORMAL:
+      *hr = 1;
+      *hs = 1;
+      break;
+    case FOURFIVE:
+      *hr = 4;
+      *hs = 5;
+      break;
+    case THREEFIVE:
+      *hr = 3;
+      *hs = 5;
+    break;
+    case ONETWO:
+      *hr = 1;
+      *hs = 2;
+    break;
+    default:  // unknown mode: outputs are set to 1, 1 before asserting
+      *hr = 1;
+      *hs = 1;
+       assert(0);
+      break;
+  }
+}
+
+// Mark all inactive blocks as active. Other segmentation features may be set
+// so memset cannot be used, instead only inactive blocks should be reset.
+static void suppress_active_map(VP10_COMP *cpi) {
+  unsigned char *const seg_map = cpi->segmentation_map;
+  int i;
+  if (cpi->active_map.enabled || cpi->active_map.update)  // otherwise a no-op
+    for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+      if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
+        seg_map[i] = AM_SEGMENT_ID_ACTIVE;  // other segment ids are left alone
+}
+
+static void apply_active_map(VP10_COMP *cpi) {
+  struct segmentation *const seg = &cpi->common.seg;
+  unsigned char *const seg_map = cpi->segmentation_map;
+  const unsigned char *const active_map = cpi->active_map.map;
+  int i;
+  // Applies a pending active-map change to the segmentation map and features.
+  assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+  // Intra-only frames switch the map off and force the update path below.
+  if (frame_is_intra_only(&cpi->common)) {
+    cpi->active_map.enabled = 0;
+    cpi->active_map.update = 1;
+  }
+
+  if (cpi->active_map.update) {
+    if (cpi->active_map.enabled) {
+      for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+        if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
+      vp10_enable_segmentation(seg);
+      vp10_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+      vp10_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+      // Setting the data to -MAX_LOOP_FILTER will result in the computed loop
+      // filter level being zero regardless of the value of seg->abs_delta.
+      vp10_set_segdata(seg, AM_SEGMENT_ID_INACTIVE,
+                      SEG_LVL_ALT_LF, -MAX_LOOP_FILTER);
+    } else {  // map disabled: drop the inactive segment's features
+      vp10_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+      vp10_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+      if (seg->enabled) {
+        seg->update_data = 1;
+        seg->update_map = 1;
+      }
+    }
+    cpi->active_map.update = 0;  // pending change has been consumed
+  }
+}
+
+int vp10_set_active_map(VP10_COMP* cpi,
+                       unsigned char* new_map_16x16,
+                       int rows,
+                       int cols) {  // returns 0, or -1 on a size mismatch
+  if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
+    unsigned char *const active_map_8x8 = cpi->active_map.map;
+    const int mi_rows = cpi->common.mi_rows;
+    const int mi_cols = cpi->common.mi_cols;
+    cpi->active_map.update = 1;  // picked up later by apply_active_map()
+    if (new_map_16x16) {
+      int r, c;
+      for (r = 0; r < mi_rows; ++r) {
+        for (c = 0; c < mi_cols; ++c) {
+          active_map_8x8[r * mi_cols + c] =
+              new_map_16x16[(r >> 1) * cols + (c >> 1)]  // 2x2 mi per entry
+                  ? AM_SEGMENT_ID_ACTIVE
+                  : AM_SEGMENT_ID_INACTIVE;
+        }
+      }
+      cpi->active_map.enabled = 1;
+    } else {
+      cpi->active_map.enabled = 0;  // a NULL map disables the active map
+    }
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+int vp10_get_active_map(VP10_COMP* cpi,
+                       unsigned char* new_map_16x16,
+                       int rows,
+                       int cols) {  // returns 0, or -1 on bad size / NULL map
+  if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols &&
+      new_map_16x16) {
+    unsigned char* const seg_map_8x8 = cpi->segmentation_map;
+    const int mi_rows = cpi->common.mi_rows;
+    const int mi_cols = cpi->common.mi_cols;
+    memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);  // 1 if off
+    if (cpi->active_map.enabled) {
+      int r, c;
+      for (r = 0; r < mi_rows; ++r) {
+        for (c = 0; c < mi_cols; ++c) {
+          // Cyclic refresh segments are considered active despite not having
+          // AM_SEGMENT_ID_ACTIVE
+          new_map_16x16[(r >> 1) * cols + (c >> 1)] |=
+              seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE;
+        }
+      }
+    }
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+void vp10_set_high_precision_mv(VP10_COMP *cpi, int allow_high_precision_mv) {
+  MACROBLOCK *const mb = &cpi->td.mb;
+  cpi->common.allow_high_precision_mv = allow_high_precision_mv;
+  if (cpi->common.allow_high_precision_mv) {  // select matching cost tables
+    mb->mvcost = mb->nmvcost_hp;
+    mb->mvsadcost = mb->nmvsadcost_hp;
+  } else {
+    mb->mvcost = mb->nmvcost;
+    mb->mvsadcost = mb->nmvsadcost;
+  }
+}
+
+static void setup_frame(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  // Set up entropy context depending on frame type. The decoder mandates
+  // the use of the default context, index 0, for keyframes and inter
+  // frames where the error_resilient_mode or intra_only flag is set. For
+  // other inter-frames the encoder currently uses only two contexts;
+  // context 1 for ALTREF frames and context 0 for the others.
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+    vp10_setup_past_independence(cm);
+  } else {
+    cm->frame_context_idx = cpi->refresh_alt_ref_frame;
+  }
+  // Key frames refresh golden/altref and zero all of interp_filter_selected.
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->refresh_golden_frame = 1;
+    cpi->refresh_alt_ref_frame = 1;
+    vp10_zero(cpi->interp_filter_selected);
+  } else {
+    *cm->fc = cm->frame_contexts[cm->frame_context_idx];  // load saved context
+    vp10_zero(cpi->interp_filter_selected[0]);  // only entry 0 is zeroed
+  }
+}
+
+static void vp10_enc_setup_mi(VP10_COMMON *cm) {
+  int i;
+  cm->mi = cm->mip + cm->mi_stride + 1;  // skip one border row and column
+  memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+  cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+  // Clear top border row
+  memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride);
+  // Clear left border column
+  for (i = 1; i < cm->mi_rows + 1; ++i)
+    memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip));
+
+  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+  // The current grid is cleared; the previous grid's contents are kept.
+  memset(cm->mi_grid_base, 0,
+         cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+}
+
+static int vp10_enc_alloc_mi(VP10_COMMON *cm, int mi_size) {  // 0 ok, 1 fail
+  cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip));
+  if (!cm->mip)
+    return 1;
+  cm->prev_mip = vpx_calloc(mi_size, sizeof(*cm->prev_mip));
+  if (!cm->prev_mip)
+    return 1;
+  cm->mi_alloc_size = mi_size;
+  // NOTE(review): failure paths return without freeing; confirm caller frees.
+  cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+  if (!cm->mi_grid_base)
+    return 1;
+  cm->prev_mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+  if (!cm->prev_mi_grid_base)
+    return 1;
+
+  return 0;
+}
+
+static void vp10_enc_free_mi(VP10_COMMON *cm) {  // frees and NULLs all four
+  vpx_free(cm->mip);
+  cm->mip = NULL;
+  vpx_free(cm->prev_mip);
+  cm->prev_mip = NULL;
+  vpx_free(cm->mi_grid_base);
+  cm->mi_grid_base = NULL;
+  vpx_free(cm->prev_mi_grid_base);
+  cm->prev_mi_grid_base = NULL;
+}
+
+static void vp10_swap_mi_and_prev_mi(VP10_COMMON *cm) {
+  // Current mip will be the prev_mip for the next frame.
+  MODE_INFO **temp_base = cm->prev_mi_grid_base;
+  MODE_INFO *temp = cm->prev_mip;
+  cm->prev_mip = cm->mip;
+  cm->mip = temp;
+
+  // Update the upper left visible macroblock ptrs.
+  cm->mi = cm->mip + cm->mi_stride + 1;
+  cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+  // Swap the grid base pointers the same way and refresh the visible ptrs.
+  cm->prev_mi_grid_base = cm->mi_grid_base;
+  cm->mi_grid_base = temp_base;
+  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+}
+
+void vp10_initialize_enc(void) {  // runs the global init calls below once
+  static volatile int init_done = 0;
+  // NOTE(review): plain flag, no lock; concurrent first calls look racy.
+  if (!init_done) {
+    vp10_rtcd();
+    vpx_dsp_rtcd();
+    vpx_scale_rtcd();
+    vp10_init_intra_predictors();
+    vp10_init_me_luts();
+    vp10_rc_init_minq_luts();
+    vp10_entropy_mv_init();
+    vp10_temporal_filter_init();
+    vp10_encode_token_init();
+    init_done = 1;
+  }
+}
+
+static void dealloc_compressor_data(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  // Releases the buffers below; pointers freed with vpx_free are reset.
+  vpx_free(cpi->mbmi_ext_base);
+  cpi->mbmi_ext_base = NULL;
+
+  vpx_free(cpi->tile_data);
+  cpi->tile_data = NULL;
+
+  // Delete segmentation map
+  vpx_free(cpi->segmentation_map);
+  cpi->segmentation_map = NULL;
+  vpx_free(cpi->coding_context.last_frame_seg_map_copy);
+  cpi->coding_context.last_frame_seg_map_copy = NULL;
+
+  vpx_free(cpi->nmvcosts[0]);
+  vpx_free(cpi->nmvcosts[1]);
+  cpi->nmvcosts[0] = NULL;
+  cpi->nmvcosts[1] = NULL;
+
+  vpx_free(cpi->nmvcosts_hp[0]);
+  vpx_free(cpi->nmvcosts_hp[1]);
+  cpi->nmvcosts_hp[0] = NULL;
+  cpi->nmvcosts_hp[1] = NULL;
+
+  vpx_free(cpi->nmvsadcosts[0]);
+  vpx_free(cpi->nmvsadcosts[1]);
+  cpi->nmvsadcosts[0] = NULL;
+  cpi->nmvsadcosts[1] = NULL;
+
+  vpx_free(cpi->nmvsadcosts_hp[0]);
+  vpx_free(cpi->nmvsadcosts_hp[1]);
+  cpi->nmvsadcosts_hp[0] = NULL;
+  cpi->nmvsadcosts_hp[1] = NULL;
+
+  vp10_cyclic_refresh_free(cpi->cyclic_refresh);
+  cpi->cyclic_refresh = NULL;
+
+  vpx_free(cpi->active_map.map);
+  cpi->active_map.map = NULL;
+
+  vp10_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_VP9_POSTPROC
+  vp10_free_postproc_buffers(cm);
+#endif
+  vp10_free_context_buffers(cm);
+
+  vpx_free_frame_buffer(&cpi->last_frame_uf);
+  vpx_free_frame_buffer(&cpi->scaled_source);
+  vpx_free_frame_buffer(&cpi->scaled_last_source);
+  vpx_free_frame_buffer(&cpi->alt_ref_buffer);
+  vp10_lookahead_destroy(cpi->lookahead);
+
+  vpx_free(cpi->tile_tok[0][0]);
+  cpi->tile_tok[0][0] = 0;  // i.e. reset the freed pointer
+
+  vp10_free_pc_tree(&cpi->td);
+
+  if (cpi->source_diff_var != NULL) {
+    vpx_free(cpi->source_diff_var);
+    cpi->source_diff_var = NULL;
+  }
+}
+
+static void save_coding_context(VP10_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  VP10_COMMON *cm = &cpi->common;
+
+  // Stores a snapshot of key state variables which can subsequently be
+  // restored with a call to restore_coding_context. These functions are
+  // intended for use in a re-code loop in vp10_compress_frame where the
+  // quantizer value is adjusted between loop iterations.
+  vp10_copy(cc->nmvjointcost,  cpi->td.mb.nmvjointcost);
+  // MV cost tables: MV_VALS entries each, for both components.
+  memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
+         MV_VALS * sizeof(*cpi->nmvcosts[0]));
+  memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
+         MV_VALS * sizeof(*cpi->nmvcosts[1]));
+  memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
+         MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
+  memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
+         MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
+
+#if !CONFIG_MISC_FIXES
+  vp10_copy(cc->segment_pred_probs, cm->segp.pred_probs);
+#endif
+  // Segment map copy: mi_rows * mi_cols bytes.
+  memcpy(cpi->coding_context.last_frame_seg_map_copy,
+         cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
+
+  vp10_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
+  vp10_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
+
+  cc->fc = *cm->fc;  // whole frame context, by value
+}
+
+static void restore_coding_context(VP10_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  VP10_COMMON *cm = &cpi->common;
+
+  // Restore key state variables to the snapshot state stored in the
+  // previous call to save_coding_context.
+  vp10_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost);
+  // MV cost tables: MV_VALS entries each, for both components.
+  memcpy(cpi->nmvcosts[0], cc->nmvcosts[0], MV_VALS * sizeof(*cc->nmvcosts[0]));
+  memcpy(cpi->nmvcosts[1], cc->nmvcosts[1], MV_VALS * sizeof(*cc->nmvcosts[1]));
+  memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
+         MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
+  memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
+         MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
+
+#if !CONFIG_MISC_FIXES
+  vp10_copy(cm->segp.pred_probs, cc->segment_pred_probs);
+#endif
+  // Segment map copy: mi_rows * mi_cols bytes.
+  memcpy(cm->last_frame_seg_map,
+         cpi->coding_context.last_frame_seg_map_copy,
+         (cm->mi_rows * cm->mi_cols));
+
+  vp10_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
+  vp10_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
+
+  *cm->fc = cc->fc;  // whole frame context, by value
+}
+
+static void configure_static_seg_features(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  struct segmentation *const seg = &cm->seg;
+
+  int high_q = (int)(rc->avg_q > 48.0);  // 1 when rc->avg_q is above 48.0
+  int qi_delta;
+  // Segmentation is set up per frame type: key frame, alt ref refresh, or
+  // any other frame. Key frame: disable it and clear everything down.
+  if (cm->frame_type == KEY_FRAME) {
+    // Zero the global segmentation map (mi_rows * mi_cols bytes).
+    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    seg->update_map = 0;
+    seg->update_data = 0;
+    cpi->static_mb_pct = 0;
+
+    // Disable segmentation
+    vp10_disable_segmentation(seg);
+
+    // Clear down the segment features.
+    vp10_clearall_segfeatures(seg);
+  } else if (cpi->refresh_alt_ref_frame) {
+    // Alt ref frame refresh: begin with the same reset as for a key frame,
+    // starting with the global segmentation map.
+    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    seg->update_map = 0;
+    seg->update_data = 0;
+    cpi->static_mb_pct = 0;
+
+    // Disable segmentation and individual segment features by default
+    vp10_disable_segmentation(seg);
+    vp10_clearall_segfeatures(seg);
+
+    // Scan frames from current to arf frame.
+    // This function re-enables segmentation if appropriate.
+    vp10_update_mbgraph_stats(cpi);
+
+    // If segmentation was enabled set those features needed for the
+    // arf itself.
+    if (seg->enabled) {
+      seg->update_map = 1;
+      seg->update_data = 1;
+      // Segment 1: ALT_Q = qdelta(avg_q -> 0.875 * avg_q) - 2, ALT_LF = -2.
+      qi_delta = vp10_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875,
+                                    cm->bit_depth);
+      vp10_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+      vp10_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
+
+      vp10_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+      vp10_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+
+      // Where relevant assume segment data is delta data
+      seg->abs_delta = SEGMENT_DELTADATA;
+    }
+  } else if (seg->enabled) {
+    // Any other frame, and only when segmentation is already enabled.
+
+    // First normal frame in a valid gf or alt ref group
+    if (rc->frames_since_golden == 0) {
+      // Set up segment features for normal frames in an arf group
+      if (rc->source_alt_ref_active) {
+        seg->update_map = 0;
+        seg->update_data = 1;
+        seg->abs_delta = SEGMENT_DELTADATA;
+        // Segment 1: ALT_Q = qdelta(avg_q -> 1.125 * avg_q) + 2, ALT_LF = -2.
+        qi_delta = vp10_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125,
+                                      cm->bit_depth);
+        vp10_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
+        vp10_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+
+        vp10_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
+        vp10_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+
+        // High Q or static_mb_pct == 100: ALTREF_FRAME and skip for segment 1.
+        if (high_q || (cpi->static_mb_pct == 100)) {
+          vp10_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+          vp10_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+          vp10_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+        }
+      } else {
+        // Disable segmentation and clear down features if alt ref
+        // is not active for this group
+
+        vp10_disable_segmentation(seg);
+
+        memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+
+        seg->update_map = 0;
+        seg->update_data = 0;
+
+        vp10_clearall_segfeatures(seg);
+      }
+    } else if (rc->is_src_frame_alt_ref) {
+      // Special case where we are coding over the top of a previous
+      // alt ref frame: both segments are pointed at ALTREF_FRAME and the
+      // segment data is flagged for update.
+
+      // Enable ref frame features for segment 0 as well
+      vp10_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
+      vp10_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+
+      // All mbs should use ALTREF_FRAME
+      vp10_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
+      vp10_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+      vp10_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
+      vp10_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+
+      // Skip all MBs if high Q (0,0 mv and skip coeffs)
+      if (high_q) {
+        vp10_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+        vp10_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+      }
+      // Flag the segment data for update; update_map is left as it was.
+      seg->update_data = 1;
+    } else {
+      // All other frames.
+
+      // No updates.. leave things as they are.
+      seg->update_map = 0;
+      seg->update_data = 0;
+    }
+  }
+}
+
+// Stores the segment id of every mi_grid_visible entry in last_frame_seg_map.
+static void update_reference_segmentation_map(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  MODE_INFO **grid_row = cm->mi_grid_visible;  // rows are mi_stride apart
+  uint8_t *map_row = cm->last_frame_seg_map;   // rows are mi_cols apart
+  int r, c;
+
+  for (r = 0; r < cm->mi_rows; ++r) {
+    for (c = 0; c < cm->mi_cols; ++c)
+      map_row[c] = grid_row[c]->mbmi.segment_id;
+
+    grid_row += cm->mi_stride;
+    map_row += cm->mi_cols;
+  }
+}
+
+// Makes sure cpi->lookahead exists and (re)allocates the alt ref buffer.
+static void alloc_raw_frame_buffers(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  const VP10EncoderConfig *const cfg = &cpi->oxcf;
+
+  if (cpi->lookahead == NULL)
+    cpi->lookahead = vp10_lookahead_init(cfg->width, cfg->height,
+                                         cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                         cm->use_highbitdepth,
+#endif
+                                         cfg->lag_in_frames);
+  if (cpi->lookahead == NULL)
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate lag buffers");
+
+  // TODO(agrange) Check if ARF is enabled and skip allocation if not.
+  if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer, cfg->width, cfg->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL) != 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate altref buffer");
+}
+
+// (Re)allocates last_frame_uf, scaled_source and scaled_last_source at
+// cm->width x cm->height, raising VPX_CODEC_MEM_ERROR when a call fails.
+static void alloc_util_frame_buffers(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+
+  if (vpx_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL) != 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate last frame buffer");
+
+  if (vpx_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL) != 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate scaled source buffer");
+
+  if (vpx_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL) != 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate scaled last source buffer");
+}
+
+
+// Allocates mi_cols * mi_rows elements for cpi->mbmi_ext_base.
+// Returns 0 on success and 1 when the allocation fails.
+static int alloc_context_buffers_ext(VP10_COMP *cpi) {
+  const VP10_COMMON *const cm = &cpi->common;
+  const int num_mi = cm->mi_cols * cm->mi_rows;
+
+  cpi->mbmi_ext_base = vpx_calloc(num_mi, sizeof(*cpi->mbmi_ext_base));
+
+  return cpi->mbmi_ext_base == NULL;
+}
+
+// Allocates the encoder buffers whose size depends on the frame dimensions.
+void vp10_alloc_compressor_data(VP10_COMP *cpi) {
+  VP10_COMMON *cm = &cpi->common;
+  unsigned int tokens;
+
+  vp10_alloc_context_buffers(cm, cm->width, cm->height);
+  // A NULL mbmi_ext_base would be memset in update_frame_size(); report it.
+  if (alloc_context_buffers_ext(cpi))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate mbmi_ext_base");
+
+  vpx_free(cpi->tile_tok[0][0]);
+  tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
+  CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+                  vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
+  vp10_setup_pc_tree(&cpi->common, &cpi->td);
+}
+
+void vp10_new_framerate(VP10_COMP *cpi, double framerate) {
+  cpi->framerate = (framerate < 0.1) ? 30.0 : framerate;  // 30 if below 0.1
+  vp10_rc_update_framerate(cpi);
+}
+
+// Clamps tile_columns into log2_tile_cols and copies tile_rows as is.
+static void set_tile_limits(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  int min_log2, max_log2;
+
+  vp10_get_tile_n_bits(cm->mi_cols, &min_log2, &max_log2);
+  cm->log2_tile_cols = clamp(oxcf->tile_columns, min_log2, max_log2);
+  cm->log2_tile_rows = oxcf->tile_rows;
+}
+
+// Re-runs the size dependent setup for cm->width x cm->height, zeroes
+// mbmi_ext_base and refreshes the tile limits.
+static void update_frame_size(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+
+  vp10_set_mb_mi(cm, cm->width, cm->height);
+  vp10_init_context_buffers(cm);
+  vp10_init_macroblockd(cm, &cpi->td.mb.e_mbd, NULL);
+  memset(cpi->mbmi_ext_base, 0,
+         cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
+  set_tile_limits(cpi);
+}
+
+static void init_buffer_indices(VP10_COMP *cpi) {
+  cpi->alt_fb_idx = 2;  // the three *_fb_idx fields start as 0, 1 and 2
+  cpi->gld_fb_idx = 1;
+  cpi->lst_fb_idx = 0;
+}
+
+// Initial configuration: stores *oxcf, copies its profile, bit depth, colour
+// and size settings into the common state, allocates the size dependent
+// data, runs vp10_change_config() and resets a few encoder fields.
+static void init_config(struct VP10_COMP *cpi, VP10EncoderConfig *oxcf) {
+  VP10_COMMON *const cm = &cpi->common;
+
+  cpi->oxcf = *oxcf;
+  cpi->framerate = oxcf->init_framerate;
+
+  // The allocation call below reads cm->width and cm->height.
+  cm->width = oxcf->width;
+  cm->height = oxcf->height;
+  cm->profile = oxcf->profile;
+  cm->bit_depth = oxcf->bit_depth;
+  cm->color_space = oxcf->color_space;
+  cm->color_range = oxcf->color_range;
+#if CONFIG_VP9_HIGHBITDEPTH
+  cm->use_highbitdepth = oxcf->use_highbitdepth;
+#endif
+  vp10_alloc_compressor_data(cpi);
+
+  cpi->td.counts = &cm->counts;  // single thread case: use counts in common
+  // change includes all joint functionality
+  vp10_change_config(cpi, oxcf);
+
+  cpi->ref_frame_flags = 0;
+  cpi->static_mb_pct = 0;
+  init_buffer_indices(cpi);
+}
+
+// Derives each buffer level as its *_ms setting * target_bandwidth / 1000.
+static void set_rc_buffer_sizes(RATE_CONTROL *rc,
+                                const VP10EncoderConfig *oxcf) {
+  const int64_t rate = oxcf->target_bandwidth;
+  const int64_t start_ms = oxcf->starting_buffer_level_ms;
+  const int64_t optimal_ms = oxcf->optimal_buffer_level_ms;
+  const int64_t max_ms = oxcf->maximum_buffer_size_ms;
+  const int64_t fallback = rate / 8;  // used when a level is configured as 0
+
+  rc->starting_buffer_level = start_ms * rate / 1000;
+  rc->optimal_buffer_level = optimal_ms ? optimal_ms * rate / 1000 : fallback;
+  rc->maximum_buffer_size = max_ms ? max_ms * rate / 1000 : fallback;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
+    cpi->fn_ptr[BT].sdf = SDF;       /* SAD; cpi comes from the call site */ \
+    cpi->fn_ptr[BT].sdaf = SDAF;     /* SAD, _avg variant */ \
+    cpi->fn_ptr[BT].vf = VF;         /* variance */ \
+    cpi->fn_ptr[BT].svf = SVF;       /* sub-pixel variance */ \
+    cpi->fn_ptr[BT].svaf = SVAF;     /* sub-pixel avg variance */ \
+    cpi->fn_ptr[BT].sdx3f = SDX3F;   /* x3 SAD; NULL for some block sizes */ \
+    cpi->fn_ptr[BT].sdx8f = SDX8F;   /* x8 SAD; NULL for some block sizes */ \
+    cpi->fn_ptr[BT].sdx4df = SDX4DF; /* x4d SAD; uses add no ';' of their own */
+
+#define MAKE_BFP_SAD_WRAPPER(fnname) \
+static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+                                   int source_stride, \
+                                   const uint8_t *ref_ptr, \
+                                   int ref_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+} \
+static unsigned int fnname##_bits10(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+} \
+static unsigned int fnname##_bits12(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+}
+
+#define MAKE_BFP_SADAVG_WRAPPER(fnname) static unsigned int \
+fnname##_bits8(const uint8_t *src_ptr, \
+               int source_stride, \
+               const uint8_t *ref_ptr, \
+               int ref_stride, \
+               const uint8_t *second_pred) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+} \
+static unsigned int fnname##_bits10(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride, \
+                                    const uint8_t *second_pred) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+                second_pred) >> 2; \
+} \
+static unsigned int fnname##_bits12(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride, \
+                                    const uint8_t *second_pred) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+                second_pred) >> 4; \
+}
+
+#define MAKE_BFP_SAD3_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+                           int source_stride, \
+                           const uint8_t *ref_ptr, \
+                           int  ref_stride, \
+                           unsigned int *sad_array) {  \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 3; i++) \
+    sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 3; i++) \
+    sad_array[i] >>= 4; \
+}
+
+#define MAKE_BFP_SAD8_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+                           int source_stride, \
+                           const uint8_t *ref_ptr, \
+                           int  ref_stride, \
+                           unsigned int *sad_array) {  \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 8; i++) \
+    sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t *ref_ptr, \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 8; i++) \
+    sad_array[i] >>= 4; \
+}
+#define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, \
+                           int source_stride, \
+                           const uint8_t* const ref_ptr[], \
+                           int  ref_stride, \
+                           unsigned int *sad_array) {  \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t* const ref_ptr[], \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 4; i++) \
+  sad_array[i] >>= 2; \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, \
+                            int source_stride, \
+                            const uint8_t* const ref_ptr[], \
+                            int  ref_stride, \
+                            unsigned int *sad_array) {  \
+  int i; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (i = 0; i < 4; i++) \
+  sad_array[i] >>= 4; \
+}
+// Instantiate the _bits8/_bits10/_bits12 wrappers for each listed function.
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)
+
+static void  highbd_set_var_fns(VP10_COMP *const cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  if (cm->use_highbitdepth) {
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        HIGHBD_BFP(BLOCK_32X16,
+                   vpx_highbd_sad32x16_bits8,
+                   vpx_highbd_sad32x16_avg_bits8,
+                   vpx_highbd_8_variance32x16,
+                   vpx_highbd_8_sub_pixel_variance32x16,
+                   vpx_highbd_8_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vpx_highbd_sad16x32_bits8,
+                   vpx_highbd_sad16x32_avg_bits8,
+                   vpx_highbd_8_variance16x32,
+                   vpx_highbd_8_sub_pixel_variance16x32,
+                   vpx_highbd_8_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad16x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vpx_highbd_sad64x32_bits8,
+                   vpx_highbd_sad64x32_avg_bits8,
+                   vpx_highbd_8_variance64x32,
+                   vpx_highbd_8_sub_pixel_variance64x32,
+                   vpx_highbd_8_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad64x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vpx_highbd_sad32x64_bits8,
+                   vpx_highbd_sad32x64_avg_bits8,
+                   vpx_highbd_8_variance32x64,
+                   vpx_highbd_8_sub_pixel_variance32x64,
+                   vpx_highbd_8_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x64x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vpx_highbd_sad32x32_bits8,
+                   vpx_highbd_sad32x32_avg_bits8,
+                   vpx_highbd_8_variance32x32,
+                   vpx_highbd_8_sub_pixel_variance32x32,
+                   vpx_highbd_8_sub_pixel_avg_variance32x32,
+                   vpx_highbd_sad32x32x3_bits8,
+                   vpx_highbd_sad32x32x8_bits8,
+                   vpx_highbd_sad32x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vpx_highbd_sad64x64_bits8,
+                   vpx_highbd_sad64x64_avg_bits8,
+                   vpx_highbd_8_variance64x64,
+                   vpx_highbd_8_sub_pixel_variance64x64,
+                   vpx_highbd_8_sub_pixel_avg_variance64x64,
+                   vpx_highbd_sad64x64x3_bits8,
+                   vpx_highbd_sad64x64x8_bits8,
+                   vpx_highbd_sad64x64x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vpx_highbd_sad16x16_bits8,
+                   vpx_highbd_sad16x16_avg_bits8,
+                   vpx_highbd_8_variance16x16,
+                   vpx_highbd_8_sub_pixel_variance16x16,
+                   vpx_highbd_8_sub_pixel_avg_variance16x16,
+                   vpx_highbd_sad16x16x3_bits8,
+                   vpx_highbd_sad16x16x8_bits8,
+                   vpx_highbd_sad16x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vpx_highbd_sad16x8_bits8,
+                   vpx_highbd_sad16x8_avg_bits8,
+                   vpx_highbd_8_variance16x8,
+                   vpx_highbd_8_sub_pixel_variance16x8,
+                   vpx_highbd_8_sub_pixel_avg_variance16x8,
+                   vpx_highbd_sad16x8x3_bits8,
+                   vpx_highbd_sad16x8x8_bits8,
+                   vpx_highbd_sad16x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vpx_highbd_sad8x16_bits8,
+                   vpx_highbd_sad8x16_avg_bits8,
+                   vpx_highbd_8_variance8x16,
+                   vpx_highbd_8_sub_pixel_variance8x16,
+                   vpx_highbd_8_sub_pixel_avg_variance8x16,
+                   vpx_highbd_sad8x16x3_bits8,
+                   vpx_highbd_sad8x16x8_bits8,
+                   vpx_highbd_sad8x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vpx_highbd_sad8x8_bits8,
+                   vpx_highbd_sad8x8_avg_bits8,
+                   vpx_highbd_8_variance8x8,
+                   vpx_highbd_8_sub_pixel_variance8x8,
+                   vpx_highbd_8_sub_pixel_avg_variance8x8,
+                   vpx_highbd_sad8x8x3_bits8,
+                   vpx_highbd_sad8x8x8_bits8,
+                   vpx_highbd_sad8x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vpx_highbd_sad8x4_bits8,
+                   vpx_highbd_sad8x4_avg_bits8,
+                   vpx_highbd_8_variance8x4,
+                   vpx_highbd_8_sub_pixel_variance8x4,
+                   vpx_highbd_8_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vpx_highbd_sad8x4x8_bits8,
+                   vpx_highbd_sad8x4x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vpx_highbd_sad4x8_bits8,
+                   vpx_highbd_sad4x8_avg_bits8,
+                   vpx_highbd_8_variance4x8,
+                   vpx_highbd_8_sub_pixel_variance4x8,
+                   vpx_highbd_8_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vpx_highbd_sad4x8x8_bits8,
+                   vpx_highbd_sad4x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vpx_highbd_sad4x4_bits8,
+                   vpx_highbd_sad4x4_avg_bits8,
+                   vpx_highbd_8_variance4x4,
+                   vpx_highbd_8_sub_pixel_variance4x4,
+                   vpx_highbd_8_sub_pixel_avg_variance4x4,
+                   vpx_highbd_sad4x4x3_bits8,
+                   vpx_highbd_sad4x4x8_bits8,
+                   vpx_highbd_sad4x4x4d_bits8)
+        break;
+
+      case VPX_BITS_10:
+        HIGHBD_BFP(BLOCK_32X16,
+                   vpx_highbd_sad32x16_bits10,
+                   vpx_highbd_sad32x16_avg_bits10,
+                   vpx_highbd_10_variance32x16,
+                   vpx_highbd_10_sub_pixel_variance32x16,
+                   vpx_highbd_10_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vpx_highbd_sad16x32_bits10,
+                   vpx_highbd_sad16x32_avg_bits10,
+                   vpx_highbd_10_variance16x32,
+                   vpx_highbd_10_sub_pixel_variance16x32,
+                   vpx_highbd_10_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad16x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vpx_highbd_sad64x32_bits10,
+                   vpx_highbd_sad64x32_avg_bits10,
+                   vpx_highbd_10_variance64x32,
+                   vpx_highbd_10_sub_pixel_variance64x32,
+                   vpx_highbd_10_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad64x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vpx_highbd_sad32x64_bits10,
+                   vpx_highbd_sad32x64_avg_bits10,
+                   vpx_highbd_10_variance32x64,
+                   vpx_highbd_10_sub_pixel_variance32x64,
+                   vpx_highbd_10_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x64x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vpx_highbd_sad32x32_bits10,
+                   vpx_highbd_sad32x32_avg_bits10,
+                   vpx_highbd_10_variance32x32,
+                   vpx_highbd_10_sub_pixel_variance32x32,
+                   vpx_highbd_10_sub_pixel_avg_variance32x32,
+                   vpx_highbd_sad32x32x3_bits10,
+                   vpx_highbd_sad32x32x8_bits10,
+                   vpx_highbd_sad32x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vpx_highbd_sad64x64_bits10,
+                   vpx_highbd_sad64x64_avg_bits10,
+                   vpx_highbd_10_variance64x64,
+                   vpx_highbd_10_sub_pixel_variance64x64,
+                   vpx_highbd_10_sub_pixel_avg_variance64x64,
+                   vpx_highbd_sad64x64x3_bits10,
+                   vpx_highbd_sad64x64x8_bits10,
+                   vpx_highbd_sad64x64x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vpx_highbd_sad16x16_bits10,
+                   vpx_highbd_sad16x16_avg_bits10,
+                   vpx_highbd_10_variance16x16,
+                   vpx_highbd_10_sub_pixel_variance16x16,
+                   vpx_highbd_10_sub_pixel_avg_variance16x16,
+                   vpx_highbd_sad16x16x3_bits10,
+                   vpx_highbd_sad16x16x8_bits10,
+                   vpx_highbd_sad16x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vpx_highbd_sad16x8_bits10,
+                   vpx_highbd_sad16x8_avg_bits10,
+                   vpx_highbd_10_variance16x8,
+                   vpx_highbd_10_sub_pixel_variance16x8,
+                   vpx_highbd_10_sub_pixel_avg_variance16x8,
+                   vpx_highbd_sad16x8x3_bits10,
+                   vpx_highbd_sad16x8x8_bits10,
+                   vpx_highbd_sad16x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vpx_highbd_sad8x16_bits10,
+                   vpx_highbd_sad8x16_avg_bits10,
+                   vpx_highbd_10_variance8x16,
+                   vpx_highbd_10_sub_pixel_variance8x16,
+                   vpx_highbd_10_sub_pixel_avg_variance8x16,
+                   vpx_highbd_sad8x16x3_bits10,
+                   vpx_highbd_sad8x16x8_bits10,
+                   vpx_highbd_sad8x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vpx_highbd_sad8x8_bits10,
+                   vpx_highbd_sad8x8_avg_bits10,
+                   vpx_highbd_10_variance8x8,
+                   vpx_highbd_10_sub_pixel_variance8x8,
+                   vpx_highbd_10_sub_pixel_avg_variance8x8,
+                   vpx_highbd_sad8x8x3_bits10,
+                   vpx_highbd_sad8x8x8_bits10,
+                   vpx_highbd_sad8x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vpx_highbd_sad8x4_bits10,
+                   vpx_highbd_sad8x4_avg_bits10,
+                   vpx_highbd_10_variance8x4,
+                   vpx_highbd_10_sub_pixel_variance8x4,
+                   vpx_highbd_10_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vpx_highbd_sad8x4x8_bits10,
+                   vpx_highbd_sad8x4x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vpx_highbd_sad4x8_bits10,
+                   vpx_highbd_sad4x8_avg_bits10,
+                   vpx_highbd_10_variance4x8,
+                   vpx_highbd_10_sub_pixel_variance4x8,
+                   vpx_highbd_10_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vpx_highbd_sad4x8x8_bits10,
+                   vpx_highbd_sad4x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vpx_highbd_sad4x4_bits10,
+                   vpx_highbd_sad4x4_avg_bits10,
+                   vpx_highbd_10_variance4x4,
+                   vpx_highbd_10_sub_pixel_variance4x4,
+                   vpx_highbd_10_sub_pixel_avg_variance4x4,
+                   vpx_highbd_sad4x4x3_bits10,
+                   vpx_highbd_sad4x4x8_bits10,
+                   vpx_highbd_sad4x4x4d_bits10)
+        break;
+
+      case VPX_BITS_12:
+        HIGHBD_BFP(BLOCK_32X16,
+                   vpx_highbd_sad32x16_bits12,
+                   vpx_highbd_sad32x16_avg_bits12,
+                   vpx_highbd_12_variance32x16,
+                   vpx_highbd_12_sub_pixel_variance32x16,
+                   vpx_highbd_12_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vpx_highbd_sad16x32_bits12,
+                   vpx_highbd_sad16x32_avg_bits12,
+                   vpx_highbd_12_variance16x32,
+                   vpx_highbd_12_sub_pixel_variance16x32,
+                   vpx_highbd_12_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad16x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vpx_highbd_sad64x32_bits12,
+                   vpx_highbd_sad64x32_avg_bits12,
+                   vpx_highbd_12_variance64x32,
+                   vpx_highbd_12_sub_pixel_variance64x32,
+                   vpx_highbd_12_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad64x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vpx_highbd_sad32x64_bits12,
+                   vpx_highbd_sad32x64_avg_bits12,
+                   vpx_highbd_12_variance32x64,
+                   vpx_highbd_12_sub_pixel_variance32x64,
+                   vpx_highbd_12_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x64x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vpx_highbd_sad32x32_bits12,
+                   vpx_highbd_sad32x32_avg_bits12,
+                   vpx_highbd_12_variance32x32,
+                   vpx_highbd_12_sub_pixel_variance32x32,
+                   vpx_highbd_12_sub_pixel_avg_variance32x32,
+                   vpx_highbd_sad32x32x3_bits12,
+                   vpx_highbd_sad32x32x8_bits12,
+                   vpx_highbd_sad32x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vpx_highbd_sad64x64_bits12,
+                   vpx_highbd_sad64x64_avg_bits12,
+                   vpx_highbd_12_variance64x64,
+                   vpx_highbd_12_sub_pixel_variance64x64,
+                   vpx_highbd_12_sub_pixel_avg_variance64x64,
+                   vpx_highbd_sad64x64x3_bits12,
+                   vpx_highbd_sad64x64x8_bits12,
+                   vpx_highbd_sad64x64x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vpx_highbd_sad16x16_bits12,
+                   vpx_highbd_sad16x16_avg_bits12,
+                   vpx_highbd_12_variance16x16,
+                   vpx_highbd_12_sub_pixel_variance16x16,
+                   vpx_highbd_12_sub_pixel_avg_variance16x16,
+                   vpx_highbd_sad16x16x3_bits12,
+                   vpx_highbd_sad16x16x8_bits12,
+                   vpx_highbd_sad16x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vpx_highbd_sad16x8_bits12,
+                   vpx_highbd_sad16x8_avg_bits12,
+                   vpx_highbd_12_variance16x8,
+                   vpx_highbd_12_sub_pixel_variance16x8,
+                   vpx_highbd_12_sub_pixel_avg_variance16x8,
+                   vpx_highbd_sad16x8x3_bits12,
+                   vpx_highbd_sad16x8x8_bits12,
+                   vpx_highbd_sad16x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vpx_highbd_sad8x16_bits12,
+                   vpx_highbd_sad8x16_avg_bits12,
+                   vpx_highbd_12_variance8x16,
+                   vpx_highbd_12_sub_pixel_variance8x16,
+                   vpx_highbd_12_sub_pixel_avg_variance8x16,
+                   vpx_highbd_sad8x16x3_bits12,
+                   vpx_highbd_sad8x16x8_bits12,
+                   vpx_highbd_sad8x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vpx_highbd_sad8x8_bits12,
+                   vpx_highbd_sad8x8_avg_bits12,
+                   vpx_highbd_12_variance8x8,
+                   vpx_highbd_12_sub_pixel_variance8x8,
+                   vpx_highbd_12_sub_pixel_avg_variance8x8,
+                   vpx_highbd_sad8x8x3_bits12,
+                   vpx_highbd_sad8x8x8_bits12,
+                   vpx_highbd_sad8x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vpx_highbd_sad8x4_bits12,
+                   vpx_highbd_sad8x4_avg_bits12,
+                   vpx_highbd_12_variance8x4,
+                   vpx_highbd_12_sub_pixel_variance8x4,
+                   vpx_highbd_12_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vpx_highbd_sad8x4x8_bits12,
+                   vpx_highbd_sad8x4x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vpx_highbd_sad4x8_bits12,
+                   vpx_highbd_sad4x8_avg_bits12,
+                   vpx_highbd_12_variance4x8,
+                   vpx_highbd_12_sub_pixel_variance4x8,
+                   vpx_highbd_12_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vpx_highbd_sad4x8x8_bits12,
+                   vpx_highbd_sad4x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vpx_highbd_sad4x4_bits12,
+                   vpx_highbd_sad4x4_avg_bits12,
+                   vpx_highbd_12_variance4x4,
+                   vpx_highbd_12_sub_pixel_variance4x4,
+                   vpx_highbd_12_sub_pixel_avg_variance4x4,
+                   vpx_highbd_sad4x4x3_bits12,
+                   vpx_highbd_sad4x4x8_bits12,
+                   vpx_highbd_sad4x4x4d_bits12)
+        break;
+
+      default:
+        assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+                    "VPX_BITS_10 or VPX_BITS_12");
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Releases and re-creates the per-frame maps owned by the encoder instance.
+// The three byte maps are sized as cm->mi_rows * cm->mi_cols entries of one
+// byte each and come back zeroed; the cyclic refresh state is rebuilt from
+// the same dimensions. Allocation results are checked by CHECK_MEM_ERROR.
+static void realloc_segmentation_maps(VP10_COMP *cpi) {
+  VP10_COMMON *const common = &cpi->common;
+
+  // Encoder segmentation map, all entries start at 0.
+  vpx_free(cpi->segmentation_map);
+  CHECK_MEM_ERROR(common, cpi->segmentation_map,
+                  vpx_calloc(common->mi_rows * common->mi_cols, 1));
+
+  // State used for cyclic background refresh; only freed if one exists.
+  if (cpi->cyclic_refresh) {
+    vp10_cyclic_refresh_free(cpi->cyclic_refresh);
+  }
+  CHECK_MEM_ERROR(common, cpi->cyclic_refresh,
+                  vp10_cyclic_refresh_alloc(common->mi_rows,
+                                            common->mi_cols));
+
+  // Map used to mark inactive areas.
+  vpx_free(cpi->active_map.map);
+  CHECK_MEM_ERROR(common, cpi->active_map.map,
+                  vpx_calloc(common->mi_rows * common->mi_cols, 1));
+
+  // Scratch copy of the last frame's segmentation map, kept in the coding
+  // context so that it can be saved and restored.
+  vpx_free(cpi->coding_context.last_frame_seg_map_copy);
+  CHECK_MEM_ERROR(common, cpi->coding_context.last_frame_seg_map_copy,
+                  vpx_calloc(common->mi_rows * common->mi_cols, 1));
+}
+
+// Applies the encoder configuration in `oxcf` to an existing compressor
+// instance. Stream properties, rate-control limits and frame dimensions are
+// refreshed from the configuration, and the context buffers are reallocated
+// when the new frame size exceeds cpi->initial_width / cpi->initial_height.
+void vp10_change_config(struct VP10_COMP *cpi, const VP10EncoderConfig *oxcf) {
+  VP10_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int seg;
+
+  cm->profile = oxcf->profile;
+  cm->bit_depth = oxcf->bit_depth;
+  cm->color_space = oxcf->color_space;
+  cm->color_range = oxcf->color_range;
+
+  // Profiles up to PROFILE_1 must be 8-bit; higher profiles must not be.
+  if (cm->profile <= PROFILE_1)
+    assert(cm->bit_depth == VPX_BITS_8);
+  else
+    assert(cm->bit_depth > VPX_BITS_8);
+
+  // Keep a private copy of the configuration inside the instance.
+  cpi->oxcf = *oxcf;
+#if CONFIG_VP9_HIGHBITDEPTH
+  cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // One-pass constant-quality mode uses the fixed golden-frame interval;
+  // everything else starts from the midpoint of the allowed range.
+  if (oxcf->pass == 0 && oxcf->rc_mode == VPX_Q) {
+    rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+  } else {
+    rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+  }
+
+  cpi->refresh_golden_frame = 0;
+  cpi->refresh_last_frame = 1;
+
+  // Error-resilient mode takes precedence over frame-parallel decoding when
+  // choosing how the frame context is refreshed.
+  if (oxcf->error_resilient_mode)
+    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_OFF;
+  else if (oxcf->frame_parallel_decoding_mode)
+    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD;
+  else
+    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_BACKWARD;
+  cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
+
+  vp10_reset_segment_features(cm);
+  vp10_set_high_precision_mv(cpi, 0);
+
+  // Every segment starts from the configured encode breakout value.
+  for (seg = 0; seg < MAX_SEGMENTS; ++seg)
+    cpi->segment_encode_breakout[seg] = cpi->oxcf.encode_breakout;
+  cpi->encode_breakout = cpi->oxcf.encode_breakout;
+
+  set_rc_buffer_sizes(rc, &cpi->oxcf);
+
+  // maximum_buffer_size may have changed with the configuration, so clip
+  // the running buffer levels to the new maximum.
+  rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
+  rc->buffer_level = VPXMIN(rc->buffer_level, rc->maximum_buffer_size);
+
+  // Recompute the frame-rate dependent rate control values.
+  vp10_new_framerate(cpi, cpi->framerate);
+
+  // Absolute upper and lower quality limits.
+  rc->worst_quality = cpi->oxcf.worst_allowed_q;
+  rc->best_quality = cpi->oxcf.best_allowed_q;
+
+  cm->interp_filter = cpi->sf.default_interp_filter;
+
+  // Fall back to the coded size unless both render dimensions are positive.
+  if (cpi->oxcf.render_width <= 0 || cpi->oxcf.render_height <= 0) {
+    cm->render_width = cpi->oxcf.width;
+    cm->render_height = cpi->oxcf.height;
+  } else {
+    cm->render_width = cpi->oxcf.render_width;
+    cm->render_height = cpi->oxcf.render_height;
+  }
+  cm->width = cpi->oxcf.width;
+  cm->height = cpi->oxcf.height;
+
+  // Growing past the recorded initial size requires fresh buffers; clearing
+  // the initial size afterwards marks it as no longer set.
+  if (cpi->initial_width &&
+      (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) {
+    vp10_free_context_buffers(cm);
+    vp10_alloc_compressor_data(cpi);
+    realloc_segmentation_maps(cpi);
+    cpi->initial_height = 0;
+    cpi->initial_width = 0;
+  }
+  update_frame_size(cpi);
+
+  cpi->alt_ref_source = NULL;
+  rc->is_src_frame_alt_ref = 0;
+
+#if 0
+  // Experimental RD Code
+  cpi->frame_distortion = 0;
+  cpi->last_frame_distortion = 0;
+#endif
+
+  set_tile_limits(cpi);
+
+  cpi->ext_refresh_frame_flags_pending = 0;
+  cpi->ext_refresh_frame_context_pending = 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_set_var_fns(cpi);
+#endif
+}
+
+// Fallback definition used when M_LOG2_E is not already defined. The value is
+// the natural logarithm of 2 (ln 2), i.e. the divisor that turns a natural
+// logarithm into a base-2 logarithm -- not log2(e), despite the name.
+#ifndef M_LOG2_E
+#define M_LOG2_E 0.693147180559945309417
+#endif
+// Base-2 logarithm computed as log(x) / ln 2. Because this is a function-like
+// macro, it takes over every log2f(...) call written after this point in the
+// file. Note that the divisor is cast to float before the division.
+#define log2f(x) (log (x) / (float) M_LOG2_E)
+
+// Fills the four-entry joint SAD cost table: entry 0 is 600 and entries 1-3
+// are 300 each. `mvjointsadcost` must have room for four ints.
+static void cal_nmvjointsadcost(int *mvjointsadcost) {
+  static const int kJointSadCost[4] = { 600, 300, 300, 300 };
+  int j;
+
+  for (j = 0; j < 4; ++j)
+    mvjointsadcost[j] = kJointSadCost[j];
+}
+
+// Fills both SAD cost tables symmetrically around index 0. Index 0 costs
+// nothing; for each magnitude m in 1..MV_MAX the entries at +m and -m receive
+// (int)(256 * 2 * (log2(8 * m) + 0.6)). Both table pointers must therefore
+// be addressable from -MV_MAX to +MV_MAX.
+static void cal_nmvsadcosts(int *mvsadcost[2]) {
+  int mag = 0;
+
+  mvsadcost[1][0] = 0;
+  mvsadcost[0][0] = 0;
+
+  do {
+    double scaled;
+    int comp;
+
+    ++mag;
+    scaled = 256 * (2 * (log2f(8 * mag) + .6));
+
+    // Same cost for both components and both signs of this magnitude.
+    for (comp = 0; comp < 2; ++comp) {
+      mvsadcost[comp][mag] = (int)scaled;
+      mvsadcost[comp][-mag] = (int)scaled;
+    }
+  } while (mag < MV_MAX);
+}
+
+// High-precision variant of the SAD cost table setup. The costs are computed
+// with exactly the same formula and index range as cal_nmvsadcosts(), so the
+// work is delegated to it; this entry point is kept separate so that callers
+// can keep distinguishing the two tables.
+static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
+  cal_nmvsadcosts(mvsadcost);
+}
+
+
+// Allocates and initialises a new encoder instance from `oxcf`, attaching the
+// caller-supplied buffer pool `pool`.
+//
+// Returns the new instance, or NULL when the instance itself cannot be
+// allocated. A failure that longjmps to cm->error.jmp while cm->error.setjmp
+// is set lands in the setjmp() handler below, which tears the partially built
+// instance down with vp10_remove_compressor() and also returns NULL.
+VP10_COMP *vp10_create_compressor(VP10EncoderConfig *oxcf,
+                                BufferPool *const pool) {
+  unsigned int i;
+  VP10_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP10_COMP));
+  VP10_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
+
+  if (!cm)
+    return NULL;
+
+  vp10_zero(*cpi);
+
+  // Error landing pad: reached via longjmp from any failure below.
+  if (setjmp(cm->error.jmp)) {
+    cm->error.setjmp = 0;
+    vp10_remove_compressor(cpi);
+    return NULL;
+  }
+
+  cm->error.setjmp = 1;
+  cm->alloc_mi = vp10_enc_alloc_mi;
+  cm->free_mi = vp10_enc_free_mi;
+  cm->setup_mi = vp10_enc_setup_mi;
+
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(cm, cm->frame_contexts,
+                  (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
+                  sizeof(*cm->frame_contexts)));
+
+  cpi->resize_state = 0;
+  cpi->resize_avg_qp = 0;
+  cpi->resize_buffer_underflow = 0;
+  cpi->common.buffer_pool = pool;
+
+  init_config(cpi, oxcf);
+  vp10_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+
+  cm->current_video_frame = 0;
+  cpi->partition_search_skippable_frame = 0;
+  cpi->tile_data = NULL;
+
+  realloc_segmentation_maps(cpi);
+
+  // Motion vector cost tables, MV_VALS entries each. The working pointers
+  // set up further down point at index MV_MAX of these buffers so that the
+  // tables can be indexed with negative offsets.
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
+
+  // One zeroed stats buffer of cm->MBs entries per mbgraph_stats slot.
+  for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
+                   sizeof(cpi->mbgraph_stats[0])); i++) {
+    CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats,
+                    vpx_calloc(cm->MBs *
+                               sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
+  }
+
+#if CONFIG_FP_MB_STATS
+  cpi->use_fp_mb_stats = 0;
+  if (cpi->use_fp_mb_stats) {
+    // a place holder used to store the first pass mb stats in the first pass
+    CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf,
+                    vpx_calloc(cm->MBs * sizeof(uint8_t), 1));
+  } else {
+    cpi->twopass.frame_mb_stats_buf = NULL;
+  }
+#endif
+
+  cpi->refresh_alt_ref_frame = 0;
+  cpi->multi_arf_last_grp_enabled = 0;
+
+  cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+  cpi->b_calculate_ssimg = 0;
+  cpi->b_calculate_blockiness = 1;
+  cpi->b_calculate_consistency = 1;
+  cpi->total_inconsistency = 0;
+  cpi->psnr.worst = 100.0;
+  cpi->worst_ssim = 100.0;
+
+  cpi->count = 0;
+  cpi->bytes = 0;
+
+  if (cpi->b_calculate_psnr) {
+    cpi->total_sq_error = 0;
+    cpi->total_samples = 0;
+
+    cpi->totalp_sq_error = 0;
+    cpi->totalp_samples = 0;
+
+    cpi->tot_recode_hits = 0;
+    cpi->summed_quality = 0;
+    cpi->summed_weights = 0;
+    cpi->summedp_quality = 0;
+    cpi->summedp_weights = 0;
+  }
+
+  if (cpi->b_calculate_ssimg) {
+    cpi->ssimg.worst= 100.0;
+  }
+  cpi->fastssim.worst = 100.0;
+
+  cpi->psnrhvs.worst = 100.0;
+
+  if (cpi->b_calculate_blockiness) {
+    cpi->total_blockiness = 0;
+    cpi->worst_blockiness = 0.0;
+  }
+
+  if (cpi->b_calculate_consistency) {
+    // Checked like every other allocation in this function, so that a failed
+    // allocation takes the common error path instead of leaving a NULL
+    // pointer behind.
+    CHECK_MEM_ERROR(cm, cpi->ssim_vars,
+                    vpx_malloc(sizeof(*cpi->ssim_vars) *
+                               4 * cpi->common.mi_rows * cpi->common.mi_cols));
+    cpi->worst_consistency = 100.0;
+  }
+
+#endif
+
+  cpi->first_time_stamp_ever = INT64_MAX;
+
+  // Point the working cost tables at the middle (index MV_MAX) of the
+  // buffers allocated above, then fill in the SAD cost tables.
+  cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost);
+  cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
+  cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+  cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
+  cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
+  cal_nmvsadcosts(cpi->td.mb.nmvsadcost);
+
+  cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+  cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+  cpi->td.mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
+  cpi->td.mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
+  cal_nmvsadcosts_hp(cpi->td.mb.nmvsadcost_hp);
+
+  // Optional debug dumps, compiled in only when the matching OUTPUT_YUV_*
+  // macro is defined.
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#ifdef OUTPUT_YUV_DENOISED
+  yuv_denoised_file = fopen("denoised.yuv", "ab");
+#endif
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+  yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
+#ifdef OUTPUT_YUV_REC
+  yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+
+#if 0
+  framepsnr = fopen("framepsnr.stt", "a");
+  kf_list = fopen("kf_list.stt", "w");
+#endif
+
+  cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
+
+  if (oxcf->pass == 1) {
+    vp10_init_first_pass(cpi);
+  } else if (oxcf->pass == 2) {
+    // The caller-provided stats buffer is treated as an array of
+    // FIRSTPASS_STATS packets; stats_in_end points at the last one.
+    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+
+#if CONFIG_FP_MB_STATS
+    if (cpi->use_fp_mb_stats) {
+      const size_t psz = cpi->common.MBs * sizeof(uint8_t);
+      const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz);
+
+      cpi->twopass.firstpass_mb_stats.mb_stats_start =
+          oxcf->firstpass_mb_stats_in.buf;
+      cpi->twopass.firstpass_mb_stats.mb_stats_end =
+          cpi->twopass.firstpass_mb_stats.mb_stats_start +
+          (ps - 1) * cpi->common.MBs * sizeof(uint8_t);
+    }
+#endif
+
+    cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+    cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+    cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
+
+    vp10_init_second_pass(cpi);
+  }
+
+  vp10_set_speed_features_framesize_independent(cpi);
+  vp10_set_speed_features_framesize_dependent(cpi);
+
+  // Allocate memory to store variances for a frame.
+  CHECK_MEM_ERROR(cm, cpi->source_diff_var,
+                  vpx_calloc(cm->MBs, sizeof(diff)));
+  cpi->source_var_thresh = 0;
+  cpi->frames_till_next_var_check = 0;
+
+  // BFP installs the SAD / variance function pointers for one block size
+  // (BT) into cpi->fn_ptr. A NULL argument leaves that entry unset for the
+  // block size in question.
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF)\
+    cpi->fn_ptr[BT].sdf            = SDF; \
+    cpi->fn_ptr[BT].sdaf           = SDAF; \
+    cpi->fn_ptr[BT].vf             = VF; \
+    cpi->fn_ptr[BT].svf            = SVF; \
+    cpi->fn_ptr[BT].svaf           = SVAF; \
+    cpi->fn_ptr[BT].sdx3f          = SDX3F; \
+    cpi->fn_ptr[BT].sdx8f          = SDX8F; \
+    cpi->fn_ptr[BT].sdx4df         = SDX4DF;
+
+  BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
+      vpx_variance32x16, vpx_sub_pixel_variance32x16,
+      vpx_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
+
+  BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
+      vpx_variance16x32, vpx_sub_pixel_variance16x32,
+      vpx_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
+
+  BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
+      vpx_variance64x32, vpx_sub_pixel_variance64x32,
+      vpx_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
+
+  BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
+      vpx_variance32x64, vpx_sub_pixel_variance32x64,
+      vpx_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
+
+  BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
+      vpx_variance32x32, vpx_sub_pixel_variance32x32,
+      vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
+      vpx_sad32x32x4d)
+
+  BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
+      vpx_variance64x64, vpx_sub_pixel_variance64x64,
+      vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
+      vpx_sad64x64x4d)
+
+  BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
+      vpx_variance16x16, vpx_sub_pixel_variance16x16,
+      vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
+      vpx_sad16x16x4d)
+
+  BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
+      vpx_variance16x8, vpx_sub_pixel_variance16x8,
+      vpx_sub_pixel_avg_variance16x8,
+      vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
+
+  BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
+      vpx_variance8x16, vpx_sub_pixel_variance8x16,
+      vpx_sub_pixel_avg_variance8x16,
+      vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
+
+  BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
+      vpx_variance8x8, vpx_sub_pixel_variance8x8,
+      vpx_sub_pixel_avg_variance8x8,
+      vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
+
+  BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
+      vpx_variance8x4, vpx_sub_pixel_variance8x4,
+      vpx_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
+
+  BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
+      vpx_variance4x8, vpx_sub_pixel_variance4x8,
+      vpx_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
+
+  BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
+      vpx_variance4x4, vpx_sub_pixel_variance4x4,
+      vpx_sub_pixel_avg_variance4x4,
+      vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_set_var_fns(cpi);
+#endif
+
+  /* vp10_init_quantizer() is first called here. Add check in
+   * vp10_frame_init_quantizer() so that vp10_init_quantizer is only
+   * called later when needed. This will avoid unnecessary calls of
+   * vp10_init_quantizer() for every frame.
+   */
+  vp10_init_quantizer(cpi);
+
+  vp10_loop_filter_init(cm);
+
+  // Construction succeeded: disarm the error landing pad.
+  cm->error.setjmp = 0;
+
+  return cpi;
+}
+// Appends the string T to the text already held in H. H must be a char array
+// rather than a pointer, because the remaining space is derived from
+// sizeof(H). T is passed to snprintf() as the format string.
+#define SNPRINT(H, T) \
+  snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+// Same as SNPRINT, but T is a format string taking exactly one argument, V.
+#define SNPRINT2(H, T, V) \
+  snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
+
+// Destroys an encoder instance and releases everything it owns. Passing NULL
+// is a no-op.
+//
+// When CONFIG_INTERNAL_STATS is enabled and at least one frame was coded, a
+// summary line is appended to "opsnr.stt" first (skipped for first-pass
+// encodes, and skipped when the file cannot be opened).
+void vp10_remove_compressor(VP10_COMP *cpi) {
+  VP10_COMMON *cm;
+  unsigned int i;
+  int t;
+
+  if (!cpi)
+    return;
+
+  cm = &cpi->common;
+  if (cm->current_video_frame > 0) {
+#if CONFIG_INTERNAL_STATS
+    vpx_clear_system_state();
+
+    if (cpi->oxcf.pass != 1) {
+      char headings[512] = {0};
+      char results[512] = {0};
+      FILE *f = fopen("opsnr.stt", "a");
+      double time_encoded = (cpi->last_end_time_stamp_seen
+                             - cpi->first_time_stamp_ever) / 10000000.000;
+      double total_encode_time = (cpi->time_receive_data +
+                                  cpi->time_compress_data)   / 1000.000;
+      const double dr =
+          (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
+      const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+
+      // fopen() returns NULL on failure; the statistics are best-effort
+      // output, so they are simply skipped when the file is unavailable.
+      if (f != NULL && cpi->b_calculate_psnr) {
+        const double total_psnr =
+            vpx_sse_to_psnr((double)cpi->total_samples, peak,
+                            (double)cpi->total_sq_error);
+        const double totalp_psnr =
+            vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
+                            (double)cpi->totalp_sq_error);
+        const double total_ssim = 100 * pow(cpi->summed_quality /
+                                            cpi->summed_weights, 8.0);
+        const double totalp_ssim = 100 * pow(cpi->summedp_quality /
+                                             cpi->summedp_weights, 8.0);
+
+        // The heading and result rows are built side by side so that the
+        // optional columns appended below stay aligned.
+        snprintf(headings, sizeof(headings),
+                 "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+                 "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+                 "WstPsnr\tWstSsim\tWstFast\tWstHVS");
+        snprintf(results, sizeof(results),
+                 "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                 "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                 "%7.3f\t%7.3f\t%7.3f\t%7.3f",
+                 dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr,
+                 cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr,
+                 total_ssim, totalp_ssim,
+                 cpi->fastssim.stat[ALL] / cpi->count,
+                 cpi->psnrhvs.stat[ALL] / cpi->count,
+                 cpi->psnr.worst, cpi->worst_ssim, cpi->fastssim.worst,
+                 cpi->psnrhvs.worst);
+
+        if (cpi->b_calculate_blockiness) {
+          SNPRINT(headings, "\t  Block\tWstBlck");
+          SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
+          SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
+        }
+
+        if (cpi->b_calculate_consistency) {
+          double consistency =
+              vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
+                              (double)cpi->total_inconsistency);
+
+          SNPRINT(headings, "\tConsist\tWstCons");
+          SNPRINT2(results, "\t%7.3f", consistency);
+          SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
+        }
+
+        if (cpi->b_calculate_ssimg) {
+          SNPRINT(headings, "\t  SSIMG\tWtSSIMG");
+          SNPRINT2(results, "\t%7.3f", cpi->ssimg.stat[ALL] / cpi->count);
+          SNPRINT2(results, "\t%7.3f", cpi->ssimg.worst);
+        }
+
+        fprintf(f, "%s\t    Time\n", headings);
+        fprintf(f, "%s\t%8.0f\n", results, total_encode_time);
+      }
+
+      if (f != NULL)
+        fclose(f);
+    }
+
+#endif
+
+#if 0
+    {
+      printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+      printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
+      printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame,
+             cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000,
+             cpi->time_compress_data / 1000,
+             (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+    }
+#endif
+  }
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  vp10_denoiser_free(&(cpi->denoiser));
+#endif
+
+  for (t = 0; t < cpi->num_workers; ++t) {
+    VPxWorker *const worker = &cpi->workers[t];
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+    // Deallocate allocated threads.
+    vpx_get_worker_interface()->end(worker);
+
+    // Deallocate allocated thread data. The last worker's data is not freed
+    // here -- NOTE(review): presumably it is owned by cpi itself; confirm.
+    if (t < cpi->num_workers - 1) {
+      vpx_free(thread_data->td->counts);
+      vp10_free_pc_tree(thread_data->td);
+      vpx_free(thread_data->td);
+    }
+  }
+  vpx_free(cpi->tile_thr_data);
+  vpx_free(cpi->workers);
+
+  if (cpi->num_workers > 1)
+    vp10_loop_filter_dealloc(&cpi->lf_row_sync);
+
+  dealloc_compressor_data(cpi);
+
+  for (i = 0; i < sizeof(cpi->mbgraph_stats) /
+                  sizeof(cpi->mbgraph_stats[0]); ++i) {
+    vpx_free(cpi->mbgraph_stats[i].mb_stats);
+  }
+
+#if CONFIG_FP_MB_STATS
+  if (cpi->use_fp_mb_stats) {
+    vpx_free(cpi->twopass.frame_mb_stats_buf);
+    cpi->twopass.frame_mb_stats_buf = NULL;
+  }
+#endif
+
+  // cm lives inside *cpi, so everything that uses it must run before the
+  // instance itself is freed.
+  vp10_remove_common(cm);
+  vp10_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_VP9_POSTPROC
+  vp10_free_postproc_buffers(cm);
+#endif
+  vpx_free(cpi);
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#ifdef OUTPUT_YUV_DENOISED
+  fclose(yuv_denoised_file);
+#endif
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+  fclose(yuv_skinmap_file);
+#endif
+#ifdef OUTPUT_YUV_REC
+  fclose(yuv_rec_file);
+#endif
+
+#if 0
+
+  if (keyfile)
+    fclose(keyfile);
+
+  if (framepsnr)
+    fclose(framepsnr);
+
+  if (kf_list)
+    fclose(kf_list);
+
+#endif
+}
+
+/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
+ * and highbd_8_variance(). It should not.
+ */
+// Computes, over a w x h region of 8-bit samples, the sum of the differences
+// a - b (written to *sum) and the sum of the squared differences (written to
+// *sse). Both outputs are reset to zero before accumulating.
+static void encoder_variance(const uint8_t *a, int  a_stride,
+                             const uint8_t *b, int  b_stride,
+                             int  w, int  h, unsigned int *sse, int *sum) {
+  int row;
+
+  *sse = 0;
+  *sum = 0;
+
+  // Both row pointers advance by their own stride after every row.
+  for (row = 0; row < h; ++row, a += a_stride, b += b_stride) {
+    int col;
+
+    for (col = 0; col < w; ++col) {
+      const int delta = a[col] - b[col];
+      *sse += delta * delta;
+      *sum += delta;
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// 64-bit accumulator version of encoder_variance() for buffers that are
+// accessed as 16-bit samples through CONVERT_TO_SHORTPTR(). Writes the sum
+// of differences to *sum and the sum of squared differences to *sse; both
+// are reset to zero first.
+static void encoder_highbd_variance64(const uint8_t *a8, int  a_stride,
+                                      const uint8_t *b8, int  b_stride,
+                                      int w, int h, uint64_t *sse,
+                                      uint64_t *sum) {
+  uint16_t *row_a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *row_b = CONVERT_TO_SHORTPTR(b8);
+  int row;
+
+  *sse = 0;
+  *sum = 0;
+
+  // Strides are applied to the 16-bit pointers, i.e. counted in samples.
+  for (row = 0; row < h; ++row, row_a += a_stride, row_b += b_stride) {
+    int col;
+
+    for (col = 0; col < w; ++col) {
+      const int delta = row_a[col] - row_b[col];
+      *sse += delta * delta;
+      *sum += delta;
+    }
+  }
+}
+
+// Adapter with the same output types as encoder_variance(): runs
+// encoder_highbd_variance64() and narrows its 64-bit results to
+// unsigned int (*sse) and int (*sum).
+static void encoder_highbd_8_variance(const uint8_t *a8, int  a_stride,
+                                      const uint8_t *b8, int  b_stride,
+                                      int w, int h,
+                                      unsigned int *sse, int *sum) {
+  uint64_t wide_sum = 0;
+  uint64_t wide_sse = 0;
+
+  encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
+                            &wide_sse, &wide_sum);
+
+  *sum = (int)wide_sum;
+  *sse = (unsigned int)wide_sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Returns the sum of squared differences between two 8-bit planes of
+// width x height samples. The area that divides evenly into 16x16 blocks is
+// handled by vpx_mse16x16(); the leftover right-hand columns and bottom rows
+// are handled by encoder_variance().
+static int64_t get_sse(const uint8_t *a, int a_stride,
+                       const uint8_t *b, int b_stride,
+                       int width, int height) {
+  const int rem_w = width % 16;
+  const int rem_h = height % 16;
+  const int blocks_across = width / 16;
+  const int blocks_down = height / 16;
+  int64_t acc = 0;
+  unsigned int part_sse = 0;
+  int part_sum = 0;  // required by encoder_variance(), otherwise unused
+  int bx, by;
+
+  // Right-hand strip: the last rem_w columns over the full height.
+  if (rem_w > 0) {
+    encoder_variance(&a[width - rem_w], a_stride, &b[width - rem_w], b_stride,
+                     rem_w, height, &part_sse, &part_sum);
+    acc += part_sse;
+  }
+
+  // Bottom strip: the last rem_h rows, excluding the corner already counted
+  // by the right-hand strip.
+  if (rem_h > 0) {
+    encoder_variance(&a[(height - rem_h) * a_stride], a_stride,
+                     &b[(height - rem_h) * b_stride], b_stride,
+                     width - rem_w, rem_h, &part_sse, &part_sum);
+    acc += part_sse;
+  }
+
+  // Whole 16x16 blocks, one block row at a time.
+  for (by = 0; by < blocks_down; ++by, a += 16 * a_stride, b += 16 * b_stride) {
+    for (bx = 0; bx < blocks_across; ++bx) {
+      vpx_mse16x16(a + 16 * bx, a_stride, b + 16 * bx, b_stride, &part_sse);
+      acc += part_sse;
+    }
+  }
+
+  return acc;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Returns the sum of squared differences between two planes accessed as
+// 16-bit samples, after shifting every sample right by `input_shift` bits.
+static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+                                    const uint8_t *b8, int b_stride,
+                                    int width, int height,
+                                    unsigned int input_shift) {
+  const uint16_t *row_a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *row_b = CONVERT_TO_SHORTPTR(b8);
+  int64_t acc = 0;
+  int row;
+
+  for (row = 0; row < height; ++row, row_a += a_stride, row_b += b_stride) {
+    int col;
+
+    for (col = 0; col < width; ++col) {
+      // Widen before squaring so the product is formed in 64 bits.
+      const int64_t delta =
+          (row_a[col] >> input_shift) - (row_b[col] >> input_shift);
+      acc += delta * delta;
+    }
+  }
+
+  return acc;
+}
+
+// High bit depth counterpart of get_sse(): whole 16x16 blocks go through
+// vpx_highbd_8_mse16x16(), while the leftover right-hand columns and bottom
+// rows go through encoder_highbd_8_variance().
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
+                              const uint8_t *b, int b_stride,
+                              int width, int height) {
+  const int rem_w = width % 16;
+  const int rem_h = height % 16;
+  const int blocks_across = width / 16;
+  const int blocks_down = height / 16;
+  int64_t acc = 0;
+  unsigned int part_sse = 0;
+  int part_sum = 0;  // required by the variance helper, otherwise unused
+  int bx, by;
+
+  // Right-hand strip: the last rem_w columns over the full height.
+  if (rem_w > 0) {
+    encoder_highbd_8_variance(&a[width - rem_w], a_stride,
+                              &b[width - rem_w], b_stride,
+                              rem_w, height, &part_sse, &part_sum);
+    acc += part_sse;
+  }
+
+  // Bottom strip: the last rem_h rows, excluding the corner already counted
+  // by the right-hand strip.
+  if (rem_h > 0) {
+    encoder_highbd_8_variance(&a[(height - rem_h) * a_stride], a_stride,
+                              &b[(height - rem_h) * b_stride], b_stride,
+                              width - rem_w, rem_h, &part_sse, &part_sum);
+    acc += part_sse;
+  }
+
+  // Whole 16x16 blocks, one block row at a time.
+  for (by = 0; by < blocks_down; ++by, a += 16 * a_stride, b += 16 * b_stride) {
+    for (bx = 0; bx < blocks_across; ++bx) {
+      vpx_highbd_8_mse16x16(a + 16 * bx, a_stride, b + 16 * bx, b_stride,
+                            &part_sse);
+      acc += part_sse;
+    }
+  }
+
+  return acc;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// PSNR results for one pair of frames. Every array uses the same layout:
+// slot 0 is the aggregate over all three planes, slots 1..3 are the Y, U and
+// V planes respectively.
+typedef struct {
+  double psnr[4];       // total/y/u/v: PSNR value
+  uint64_t sse[4];      // total/y/u/v: sum of squared errors
+  uint32_t samples[4];  // total/y/u/v: number of samples compared
+} PSNR_STATS;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Fills `psnr` with per-plane and aggregate SSE, sample count and PSNR for
+// frames `a` and `b`. Plane sizes come from the crop dimensions of `a`.
+//
+// The peak value is (1 << in_bit_depth) - 1. For buffers flagged as high bit
+// depth, samples are shifted right by (bit_depth - in_bit_depth) before the
+// comparison when that difference is non-zero; unflagged buffers use the
+// 8-bit get_sse() path.
+static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b,
+                             PSNR_STATS *psnr,
+                             unsigned int bit_depth,
+                             unsigned int in_bit_depth) {
+  const uint8_t *planes_a[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
+  const uint8_t *planes_b[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
+  const int strides_a[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+  const int strides_b[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+  const int plane_w[3] =
+      { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+  const int plane_h[3] =
+      { a->y_crop_height, a->uv_crop_height, a->uv_crop_height };
+  const int is_highbd = (a->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  const unsigned int input_shift = bit_depth - in_bit_depth;
+  const double peak = (double)((1 << in_bit_depth) - 1);
+  uint64_t sse_all = 0;
+  uint32_t samples_all = 0;
+  int plane;
+
+  for (plane = 0; plane < 3; ++plane) {
+    const int slot = 1 + plane;  // slot 0 is reserved for the aggregate
+    const int w = plane_w[plane];
+    const int h = plane_h[plane];
+    const uint32_t samples = w * h;
+    uint64_t sse;
+
+    if (!is_highbd) {
+      sse = get_sse(planes_a[plane], strides_a[plane],
+                    planes_b[plane], strides_b[plane], w, h);
+    } else if (input_shift) {
+      sse = highbd_get_sse_shift(planes_a[plane], strides_a[plane],
+                                 planes_b[plane], strides_b[plane], w, h,
+                                 input_shift);
+    } else {
+      sse = highbd_get_sse(planes_a[plane], strides_a[plane],
+                           planes_b[plane], strides_b[plane], w, h);
+    }
+
+    psnr->sse[slot] = sse;
+    psnr->samples[slot] = samples;
+    psnr->psnr[slot] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+    sse_all += sse;
+    samples_all += samples;
+  }
+
+  psnr->sse[0] = sse_all;
+  psnr->samples[0] = samples_all;
+  psnr->psnr[0] = vpx_sse_to_psnr((double)samples_all, peak,
+                                  (double)sse_all);
+}
+
+#else  // !CONFIG_VP9_HIGHBITDEPTH
+// Peak-255 variant: fills *psnr with per-plane and total SSE/samples/PSNR.
+static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+                      PSNR_STATS *psnr) {
+  static const double peak = 255.0;
+  const int widths[3]        = {
+      a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
+  const int heights[3]       = {
+      a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
+  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
+  const int a_strides[3]     = {a->y_stride, a->uv_stride, a->uv_stride};
+  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
+  const int b_strides[3]     = {b->y_stride, b->uv_stride, b->uv_stride};
+  int i;
+  uint64_t total_sse = 0;
+  uint32_t total_samples = 0;
+  // Plane order is Y, U, V; the cropped dimensions are taken from a only.
+  for (i = 0; i < 3; ++i) {
+    const int w = widths[i];
+    const int h = heights[i];
+    const uint32_t samples = w * h;
+    const uint64_t sse = get_sse(a_planes[i], a_strides[i],
+                                 b_planes[i], b_strides[i],
+                                 w, h);
+    psnr->sse[1 + i] = sse;
+    psnr->samples[1 + i] = samples;
+    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+    // Fold this plane into the totals reported in slot 0.
+    total_sse += sse;
+    total_samples += samples;
+  }
+
+  psnr->sse[0] = total_sse;
+  psnr->samples[0] = total_samples;
+  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
+                                  (double)total_sse);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+// Measures PSNR of cpi->Source against frame_to_show and queues a PSNR packet.
+static void generate_psnr_packet(VP10_COMP *cpi) {
+  struct vpx_codec_cx_pkt pkt;
+  int i;
+  PSNR_STATS psnr;
+#if CONFIG_VP9_HIGHBITDEPTH
+  calc_highbd_psnr(cpi->Source, cpi->common.frame_to_show, &psnr,
+                   cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+#else
+  calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr);
+#endif
+  // Copy all four slots of each statistic into the packet payload.
+  for (i = 0; i < 4; ++i) {
+    pkt.data.psnr.samples[i] = psnr.samples[i];
+    pkt.data.psnr.sse[i] = psnr.sse[i];
+    pkt.data.psnr.psnr[i] = psnr.psnr[i];
+  }
+  pkt.kind = VPX_CODEC_PSNR_PKT;
+  vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+// Stores ref_frame_flags in cpi; returns -1 and changes nothing if it is > 7.
+int vp10_use_as_reference(VP10_COMP *cpi, int ref_frame_flags) {
+  if (ref_frame_flags > 7)  // NOTE(review): negative values are not rejected
+    return -1;
+
+  cpi->ref_frame_flags = ref_frame_flags;
+  return 0;
+}
+// Sets the three ext_refresh_* requests from the flag bits; marks them pending.
+void vp10_update_reference(VP10_COMP *cpi, int ref_frame_flags) {
+  cpi->ext_refresh_golden_frame = (ref_frame_flags & VP9_GOLD_FLAG) != 0;
+  cpi->ext_refresh_alt_ref_frame = (ref_frame_flags & VP9_ALT_FLAG) != 0;
+  cpi->ext_refresh_last_frame = (ref_frame_flags & VP9_LAST_FLAG) != 0;
+  cpi->ext_refresh_frame_flags_pending = 1;
+}
+// Looks up the buffer for a single VP9_*_FLAG value; NULL if it matches none.
+static YV12_BUFFER_CONFIG *get_vp10_ref_frame_buffer(VP10_COMP *cpi,
+                                VP9_REFFRAME ref_frame_flag) {
+  MV_REFERENCE_FRAME ref_frame = NONE;
+  if (ref_frame_flag == VP9_LAST_FLAG)
+    ref_frame = LAST_FRAME;
+  else if (ref_frame_flag == VP9_GOLD_FLAG)
+    ref_frame = GOLDEN_FRAME;
+  else if (ref_frame_flag == VP9_ALT_FLAG)
+    ref_frame = ALTREF_FRAME;
+  // Exact matches only: a combination of flag bits leaves ref_frame at NONE.
+  return ref_frame == NONE ? NULL : get_ref_frame_buffer(cpi, ref_frame);
+}
+// Copies the reference selected by ref_frame_flag into sd; -1 if none found.
+int vp10_copy_reference_enc(VP10_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                           YV12_BUFFER_CONFIG *sd) {
+  YV12_BUFFER_CONFIG *cfg = get_vp10_ref_frame_buffer(cpi, ref_frame_flag);
+  if (cfg) {
+    vp8_yv12_copy_frame(cfg, sd);  // reference -> caller's buffer
+    return 0;
+  } else {
+    return -1;
+  }
+}
+// Overwrites the reference selected by ref_frame_flag with sd; -1 if none found.
+int vp10_set_reference_enc(VP10_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                          YV12_BUFFER_CONFIG *sd) {
+  YV12_BUFFER_CONFIG *cfg = get_vp10_ref_frame_buffer(cpi, ref_frame_flag);
+  if (cfg) {
+    vp8_yv12_copy_frame(sd, cfg);  // caller's buffer -> reference
+    return 0;
+  } else {
+    return -1;
+  }
+}
+// Records the requested frame-context refresh value, marks it pending; returns 0.
+int vp10_update_entropy(VP10_COMP * cpi, int update) {
+  cpi->ext_refresh_frame_context = update;
+  cpi->ext_refresh_frame_context_pending = 1;
+  return 0;
+}
+
+#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
+// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
+// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
+// not denoise the UV channels at this time. If ever we implement UV channel
+// denoising we will have to modify this.
+void vp10_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
+  uint8_t *src = s->y_buffer;
+  int h = s->y_height;
+  // Y plane: write y_width bytes per row, then step by the row stride.
+  do {
+    fwrite(src, s->y_width, 1, f);
+    src += s->y_stride;
+  } while (--h);  // NOTE(review): do/while assumes at least one row
+  // U plane.
+  src = s->u_buffer;
+  h = s->uv_height;
+
+  do {
+    fwrite(src, s->uv_width, 1, f);
+    src += s->uv_stride;
+  } while (--h);
+  // V plane.
+  src = s->v_buffer;
+  h = s->uv_height;
+
+  do {
+    fwrite(src, s->uv_width, 1, f);
+    src += s->uv_stride;
+  } while (--h);
+}
+#endif
+// Writes cm->frame_to_show to yuv_rec_file row by row: Y, then U, then V.
+#ifdef OUTPUT_YUV_REC
+void vp10_write_yuv_rec_frame(VP10_COMMON *cm) {
+  YV12_BUFFER_CONFIG *s = cm->frame_to_show;
+  uint8_t *src = s->y_buffer;
+  int h = cm->height;
+  // NOTE(review): luma rows are counted by cm->height, row width by s->y_width.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
+    // High bit depth path: each row is written as y_width * 2 bytes.
+    do {
+      fwrite(src16, s->y_width, 2,  yuv_rec_file);
+      src16 += s->y_stride;
+    } while (--h);
+    // U plane.
+    src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
+    h = s->uv_height;
+
+    do {
+      fwrite(src16, s->uv_width, 2,  yuv_rec_file);
+      src16 += s->uv_stride;
+    } while (--h);
+    // V plane.
+    src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
+    h = s->uv_height;
+
+    do {
+      fwrite(src16, s->uv_width, 2, yuv_rec_file);
+      src16 += s->uv_stride;
+    } while (--h);
+
+    fflush(yuv_rec_file);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // One-byte-per-sample path: Y rows first.
+  do {
+    fwrite(src, s->y_width, 1,  yuv_rec_file);
+    src += s->y_stride;
+  } while (--h);
+  // U plane.
+  src = s->u_buffer;
+  h = s->uv_height;
+
+  do {
+    fwrite(src, s->uv_width, 1,  yuv_rec_file);
+    src += s->uv_stride;
+  } while (--h);
+  // V plane.
+  src = s->v_buffer;
+  h = s->uv_height;
+
+  do {
+    fwrite(src, s->uv_width, 1, yuv_rec_file);
+    src += s->uv_stride;
+  } while (--h);
+
+  fflush(yuv_rec_file);
+}
+#endif
+// Resizes every plane of src into dst with the resize_plane helpers.
+#if CONFIG_VP9_HIGHBITDEPTH
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                                YV12_BUFFER_CONFIG *dst,
+                                                int bd) {
+#else
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                                YV12_BUFFER_CONFIG *dst) {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t
+  int i;
+  const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+  const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+  const int src_widths[3] = {src->y_crop_width, src->uv_crop_width,
+                             src->uv_crop_width };
+  const int src_heights[3] = {src->y_crop_height, src->uv_crop_height,
+                              src->uv_crop_height};
+  uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+  const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
+  const int dst_widths[3] = {dst->y_crop_width, dst->uv_crop_width,
+                             dst->uv_crop_width};
+  const int dst_heights[3] = {dst->y_crop_height, dst->uv_crop_height,
+                              dst->uv_crop_height};
+  // Resize each plane from its cropped size, then extend dst's borders.
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp10_highbd_resize_plane(srcs[i], src_heights[i], src_widths[i],
+                              src_strides[i], dsts[i], dst_heights[i],
+                              dst_widths[i], dst_strides[i], bd);
+    } else {
+      vp10_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+                       dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
+    }
+#else
+    vp10_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+                     dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+  vpx_extend_frame_borders(dst);
+}
+// Scales src into dst tile by tile with the EIGHTTAP kernel, then extends dst.
+#if CONFIG_VP9_HIGHBITDEPTH
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst, int bd) {
+#else
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst) {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+  const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+  uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+  const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
+  const InterpKernel *const kernel = vp10_filter_kernels[EIGHTTAP];
+  int x, y, i;
+  // x and y step over dst in 16-pixel luma units; each plane is done per tile.
+  for (y = 0; y < dst_h; y += 16) {
+    for (x = 0; x < dst_w; x += 16) {
+      for (i = 0; i < MAX_MB_PLANE; ++i) {
+        const int factor = (i == 0 || i == 3 ? 1 : 2);  // planes 1, 2 halved
+        const int x_q4 = x * (16 / factor) * src_w / dst_w;
+        const int y_q4 = y * (16 / factor) * src_h / dst_h;
+        const int src_stride = src_strides[i];
+        const int dst_stride = dst_strides[i];
+        const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h *
+                                     src_stride + (x / factor) * src_w / dst_w;
+        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+        // NOTE(review): factor 2 presumably assumes 4:2:0 chroma; confirm.
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+          vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+                               kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+                               kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+                               16 / factor, 16 / factor, bd);
+        } else {
+          vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+                        kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+                        kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+                        16 / factor, 16 / factor);
+        }
+#else
+        vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+                      kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+                      kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+                      16 / factor, 16 / factor);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+  }
+
+  vpx_extend_frame_borders(dst);
+}
+// Returns 1 if an unscaled frame at/above its rf-level max q is still too big.
+static int scale_down(VP10_COMP *cpi, int q) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  int scale = 0;
+  assert(frame_is_kf_gf_arf(cpi));
+  // Threshold: SCALE_STEP1 multiplier times max(frame target, avg bandwidth).
+  if (rc->frame_size_selector == UNSCALED &&
+      q >= rc->rf_level_maxq[gf_group->rf_level[gf_group->index]]) {
+    const int max_size_thresh = (int)(rate_thresh_mult[SCALE_STEP1]
+        * VPXMAX(rc->this_frame_target, rc->avg_frame_bandwidth));
+    scale = rc->projected_frame_size > max_size_thresh ? 1 : 0;
+  }
+  return scale;
+}
+
+// Function to test for conditions that indicate we should loop
+// back and recode a frame.
+static int recode_loop_test(VP10_COMP *cpi,
+                            int high_limit, int low_limit,
+                            int q, int maxq, int minq) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
+  int force_recode = 0;
+  // Only consider a recode if the frame hit max size or sf.recode_loop allows.
+  if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+      (cpi->sf.recode_loop == ALLOW_RECODE) ||
+      (frame_is_kfgfarf &&
+       (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
+    if (frame_is_kfgfarf &&
+        (oxcf->resize_mode == RESIZE_DYNAMIC) &&
+        scale_down(cpi, q)) {
+        // Code this group at a lower resolution.
+        cpi->resize_pending = 1;
+        return 1;
+    }
+
+    // TODO(agrange) high_limit could be greater than the scale-down threshold.
+    if ((rc->projected_frame_size > high_limit && q < maxq) ||
+        (rc->projected_frame_size < low_limit && q > minq)) {
+      force_recode = 1;
+    } else if (cpi->oxcf.rc_mode == VPX_CQ) {
+      // Deal with frame undershoot and whether or not we are
+      // below the automatically set cq level.
+      if (q > oxcf->cq_level &&
+          rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) {
+        force_recode = 1;
+      }
+    }
+  }
+  return force_recode;
+}
+// Re-points the reference map slots at the new frame per the refresh flags.
+void vp10_update_reference_frames(VP10_COMP *cpi) {
+  VP10_COMMON * const cm = &cpi->common;
+  BufferPool *const pool = cm->buffer_pool;
+
+  // At this point the new frame has been encoded.
+  // If any buffer copy / swapping is signaled it should be done here.
+  if (cm->frame_type == KEY_FRAME) {
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+  } else if (vp10_preserve_existing_gf(cpi)) {
+    // We have decided to preserve the previously existing golden frame as our
+    // new ARF frame. However, in the short term in function
+    // vp10_bitstream.c::get_refresh_mask() we left it in the GF slot and, if
+    // we're updating the GF with the current decoded frame, we save it to the
+    // ARF slot instead.
+    // We now have to update the ARF with the current frame and swap gld_fb_idx
+    // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF
+    // slot and, if we're updating the GF, the current frame becomes the new GF.
+    int tmp;
+
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+
+    tmp = cpi->alt_fb_idx;
+    cpi->alt_fb_idx = cpi->gld_fb_idx;
+    cpi->gld_fb_idx = tmp;
+  } else { /* For non key/golden frames */
+    if (cpi->refresh_alt_ref_frame) {
+      int arf_idx = cpi->alt_fb_idx;
+      if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
+        const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+        arf_idx = gf_group->arf_update_idx[gf_group->index];
+      }
+      // Re-point the chosen ARF slot; copy entry 0 of the filter table to it.
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+      memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
+             cpi->interp_filter_selected[0],
+             sizeof(cpi->interp_filter_selected[0]));
+    }
+
+    if (cpi->refresh_golden_frame) {
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+      if (!cpi->rc.is_src_frame_alt_ref)
+        memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+               cpi->interp_filter_selected[0],
+               sizeof(cpi->interp_filter_selected[0]));
+      else
+        memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+               cpi->interp_filter_selected[ALTREF_FRAME],
+               sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
+    }
+  }
+  // The LAST slot is handled the same way for every frame type.
+  if (cpi->refresh_last_frame) {
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
+    if (!cpi->rc.is_src_frame_alt_ref)
+      memcpy(cpi->interp_filter_selected[LAST_FRAME],
+             cpi->interp_filter_selected[0],
+             sizeof(cpi->interp_filter_selected[0]));
+  }
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    vp10_denoiser_update_frame_info(&cpi->denoiser,
+                                   *cpi->Source,
+                                   cpi->common.frame_type,
+                                   cpi->refresh_alt_ref_frame,
+                                   cpi->refresh_golden_frame,
+                                   cpi->refresh_last_frame);
+  }
+#endif
+}
+// Picks a loop filter level, filters frame_to_show, extends its inner borders.
+static void loopfilter_frame(VP10_COMP *cpi, VP10_COMMON *cm) {
+  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+  struct loopfilter *lf = &cm->lf;
+  if (is_lossless_requested(&cpi->oxcf)) {
+    lf->filter_level = 0;  // lossless requested: filtering is skipped below
+  } else {
+    struct vpx_usec_timer timer;
+
+    vpx_clear_system_state();
+    // Time the level search; the elapsed time accumulates in time_pick_lpf.
+    vpx_usec_timer_start(&timer);
+
+    vp10_pick_filter_level(cpi->Source, cpi, cpi->sf.lpf_pick);
+
+    vpx_usec_timer_mark(&timer);
+    cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+  }
+  // Use the multi-threaded filter when more than one worker is configured.
+  if (lf->filter_level > 0) {
+    if (cpi->num_workers > 1)
+      vp10_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
+                               lf->filter_level, 0, 0,
+                               cpi->workers, cpi->num_workers,
+                               &cpi->lf_row_sync);
+    else
+      vp10_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+  }
+
+  vpx_extend_frame_inner_borders(cm->frame_to_show);
+}
+// (Re)allocates a buffer's MV array if missing or smaller than cm's mi grid.
+static INLINE void alloc_frame_mvs(const VP10_COMMON *cm,
+                                   int buffer_idx) {
+  RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
+  if (new_fb_ptr->mvs == NULL ||
+      new_fb_ptr->mi_rows < cm->mi_rows ||
+      new_fb_ptr->mi_cols < cm->mi_cols) {
+    vpx_free(new_fb_ptr->mvs);
+    new_fb_ptr->mvs =
+      (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                           sizeof(*new_fb_ptr->mvs));  // NOTE(review): unchecked
+    new_fb_ptr->mi_rows = cm->mi_rows;
+    new_fb_ptr->mi_cols = cm->mi_cols;
+  }
+}
+// For each enabled reference, sets scaled_ref_idx to a buffer of cm's size.
+void vp10_scale_references(VP10_COMP *cpi) {
+  VP10_COMMON *cm = &cpi->common;
+  MV_REFERENCE_FRAME ref_frame;
+  const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    // Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1).
+    if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
+      BufferPool *const pool = cm->buffer_pool;
+      const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi,
+                                                                 ref_frame);
+
+      if (ref == NULL) {
+        cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+        continue;
+      }
+      // Size mismatch: scale into a (possibly newly acquired) frame buffer.
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+        RefCntBuffer *new_fb_ptr = NULL;
+        int force_scaling = 0;
+        int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
+        if (new_fb == INVALID_IDX) {
+          new_fb = get_free_fb(cm);
+          force_scaling = 1;
+        }
+        if (new_fb == INVALID_IDX)
+          return;  // still no buffer: remaining references are not processed
+        new_fb_ptr = &pool->frame_bufs[new_fb];
+        if (force_scaling ||
+            new_fb_ptr->buf.y_crop_width != cm->width ||
+            new_fb_ptr->buf.y_crop_height != cm->height) {
+          vpx_realloc_frame_buffer(&new_fb_ptr->buf,
+                                   cm->width, cm->height,
+                                   cm->subsampling_x, cm->subsampling_y,
+                                   cm->use_highbitdepth,
+                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                   NULL, NULL, NULL);
+          scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth);
+          cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+          alloc_frame_mvs(cm, new_fb);
+        }
+#else
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+        RefCntBuffer *new_fb_ptr = NULL;
+        int force_scaling = 0;
+        int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
+        if (new_fb == INVALID_IDX) {
+          new_fb = get_free_fb(cm);
+          force_scaling = 1;
+        }
+        if (new_fb == INVALID_IDX)
+          return;  // still no buffer: remaining references are not processed
+        new_fb_ptr = &pool->frame_bufs[new_fb];
+        if (force_scaling ||
+            new_fb_ptr->buf.y_crop_width != cm->width ||
+            new_fb_ptr->buf.y_crop_height != cm->height) {
+          vpx_realloc_frame_buffer(&new_fb_ptr->buf,
+                                   cm->width, cm->height,
+                                   cm->subsampling_x, cm->subsampling_y,
+                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                   NULL, NULL, NULL);
+          scale_and_extend_frame(ref, &new_fb_ptr->buf);
+          cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+          alloc_frame_mvs(cm, new_fb);
+        }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+        RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
+        buf->buf.y_crop_width = ref->y_crop_width;
+        buf->buf.y_crop_height = ref->y_crop_height;
+        cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
+        ++buf->ref_count;  // same size: the reference buffer itself is recorded
+      }
+    } else {
+      if (cpi->oxcf.pass != 0)
+        cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+    }
+  }
+}
+// Drops ref counts held via scaled_ref_idx (selectively when oxcf.pass == 0).
+static void release_scaled_references(VP10_COMP *cpi) {
+  VP10_COMMON *cm = &cpi->common;
+  int i;
+  if (cpi->oxcf.pass == 0) {
+    // Only release scaled references under certain conditions:
+    // if reference will be updated, or if scaled reference has same resolution.
+    int refresh[3];
+    refresh[0] = (cpi->refresh_last_frame) ? 1 : 0;
+    refresh[1] = (cpi->refresh_golden_frame) ? 1 : 0;
+    refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
+    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+      const int idx = cpi->scaled_ref_idx[i - 1];
+      RefCntBuffer *const buf = idx != INVALID_IDX ?
+          &cm->buffer_pool->frame_bufs[idx] : NULL;
+      const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i);
+      if (buf != NULL &&
+          (refresh[i - 1] ||  // NOTE(review): ref is used below unchecked
+          (buf->buf.y_crop_width == ref->y_crop_width &&
+           buf->buf.y_crop_height == ref->y_crop_height))) {
+        --buf->ref_count;
+        cpi->scaled_ref_idx[i -1] = INVALID_IDX;
+      }
+    }
+  } else {  // other passes: release every valid entry unconditionally
+    for (i = 0; i < MAX_REF_FRAMES; ++i) {
+      const int idx = cpi->scaled_ref_idx[i];
+      RefCntBuffer *const buf = idx != INVALID_IDX ?
+          &cm->buffer_pool->frame_bufs[idx] : NULL;
+      if (buf != NULL) {
+        --buf->ref_count;
+        cpi->scaled_ref_idx[i] = INVALID_IDX;
+      }
+    }
+  }
+}
+// Collapses full token counts; THREE_TOKEN..EOB_TOKEN-1 are added to TWO_TOKEN.
+static void full_to_model_count(unsigned int *model_count,
+                                unsigned int *full_count) {
+  int n;
+  model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
+  model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
+  model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
+  for (n = THREE_TOKEN; n < EOB_TOKEN; ++n)
+    model_count[TWO_TOKEN] += full_count[n];
+  model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN];  // EOB keeps own slot
+}
+// Applies full_to_model_count to each [plane type][ref type][band][context].
+static void full_to_model_counts(vp10_coeff_count_model *model_count,
+                                 vp10_coeff_count *full_count) {
+  int i, j, k, l;
+
+  for (i = 0; i < PLANE_TYPES; ++i)
+    for (j = 0; j < REF_TYPES; ++j)
+      for (k = 0; k < COEF_BANDS; ++k)
+        for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)  // context count per band
+          full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]);
+}
+// Debug-only per-frame stats dump to tmp.stt; compiled out by the "#if 0".
+#if 0 && CONFIG_INTERNAL_STATS
+static void output_frame_level_debug_stats(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
+  int64_t recon_err;
+  // NOTE(review): the fopen() result above is used without a NULL check.
+  vpx_clear_system_state();
+
+  recon_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+  // A line is written only when the remaining coded error is non-zero.
+  if (cpi->twopass.total_left_stats.coded_error != 0.0)
+    fprintf(f, "%10u %dx%d  %10d %10d %d %d %10d %10d %10d %10d"
+       "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
+       "%10"PRId64" %10"PRId64" %10d "
+       "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
+        "%6d %6d %5d %5d %5d "
+        "%10"PRId64" %10.3lf"
+        "%10lf %8u %10"PRId64" %10d %10d %10d\n",
+        cpi->common.current_video_frame,
+        cm->width, cm->height,
+        cpi->td.rd_counts.m_search_count,
+        cpi->td.rd_counts.ex_search_count,
+        cpi->rc.source_alt_ref_pending,
+        cpi->rc.source_alt_ref_active,
+        cpi->rc.this_frame_target,
+        cpi->rc.projected_frame_size,
+        cpi->rc.projected_frame_size / cpi->common.MBs,
+        (cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
+        cpi->rc.vbr_bits_off_target,
+        cpi->rc.vbr_bits_off_target_fast,
+        cpi->twopass.extend_minq,
+        cpi->twopass.extend_minq_fast,
+        cpi->rc.total_target_vs_actual,
+        (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
+        cpi->rc.total_actual_bits, cm->base_qindex,
+        vp10_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
+        (double)vp10_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
+        vp10_convert_qindex_to_q(cpi->twopass.active_worst_quality,
+                                cm->bit_depth),
+        cpi->rc.avg_q,
+        vp10_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth),
+        cpi->refresh_last_frame, cpi->refresh_golden_frame,
+        cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
+        cpi->twopass.bits_left,
+        cpi->twopass.total_left_stats.coded_error,
+        cpi->twopass.bits_left /
+            (1 + cpi->twopass.total_left_stats.coded_error),
+        cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
+        cpi->twopass.kf_zeromotion_pct,
+        cpi->twopass.fr_content_type);
+
+  fclose(f);
+  // Per-mode counts dump to Modes.stt; disabled by the constant condition.
+  if (0) {
+    FILE *const fmodes = fopen("Modes.stt", "a");
+    int i;
+
+    fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
+            cm->frame_type, cpi->refresh_golden_frame,
+            cpi->refresh_alt_ref_frame);
+
+    for (i = 0; i < MAX_MODES; ++i)
+      fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+    fprintf(fmodes, "\n");
+
+    fclose(fmodes);
+  }
+}
+#endif
+// Sets mv_step_param from min(width, height) and, optionally, the last max MV.
+static void set_mv_search_params(VP10_COMP *cpi) {
+  const VP10_COMMON *const cm = &cpi->common;
+  const unsigned int max_mv_def = VPXMIN(cm->width, cm->height);
+
+  // Default based on max resolution.
+  cpi->mv_step_param = vp10_init_search_range(max_mv_def);
+
+  if (cpi->sf.mv.auto_mv_step_size) {
+    if (frame_is_intra_only(cm)) {
+      // Initialize max_mv_magnitude for use in the first INTER frame
+      // after a key/intra-only frame.
+      cpi->max_mv_magnitude = max_mv_def;
+    } else {
+      if (cm->show_frame) {
+        // Allow mv_steps to correspond to twice the max mv magnitude found
+        // in the previous frame, capped by the default max_mv_magnitude based
+        // on resolution.
+        cpi->mv_step_param = vp10_init_search_range(
+            VPXMIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+      }
+      cpi->max_mv_magnitude = 0;  // reset on every non-intra-only frame
+    }
+  }
+}
+// Applies size-independent speed features, RD thresholds and interp filter.
+static void set_size_independent_vars(VP10_COMP *cpi) {
+  vp10_set_speed_features_framesize_independent(cpi);
+  vp10_set_rd_speed_thresholds(cpi);
+  vp10_set_rd_speed_thresholds_sub8x8(cpi);
+  cpi->common.interp_filter = cpi->sf.default_interp_filter;
+}
+// Size-dependent setup; returns q and its bounds through the out-parameters.
+static void set_size_dependent_vars(VP10_COMP *cpi, int *q,
+                                    int *bottom_index, int *top_index) {
+  VP10_COMMON *const cm = &cpi->common;
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+
+  // Setup variables that depend on the dimensions of the frame.
+  vp10_set_speed_features_framesize_dependent(cpi);
+
+  // Decide q and q bounds.
+  *q = vp10_rc_pick_q_and_bounds(cpi, bottom_index, top_index);
+  // High precision MVs are enabled only below the q threshold.
+  if (!frame_is_intra_only(cm)) {
+    vp10_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
+  }
+
+  // Configure experimental use of segmentation for enhanced coding of
+  // static regions if indicated.
+  // Only allowed in the second pass of a two pass encode, as it requires
+  // lagged coding, and if the relevant speed feature flag is set.
+  if (oxcf->pass == 2 && cpi->sf.static_segmentation)
+    configure_static_seg_features(cpi);
+
+#if CONFIG_VP9_POSTPROC
+  if (oxcf->noise_sensitivity > 0) {
+    int l = 0;  // stays 0 for noise_sensitivity values above 6
+    switch (oxcf->noise_sensitivity) {
+      case 1:
+        l = 20;
+        break;
+      case 2:
+        l = 40;
+        break;
+      case 3:
+        l = 60;
+        break;
+      case 4:
+      case 5:
+        l = 100;
+        break;
+      case 6:
+        l = 150;
+        break;
+    }
+    vp10_denoise(cpi->Source, cpi->Source, l);  // same buffer as src and dst
+  }
+#endif  // CONFIG_VP9_POSTPROC
+}
+// Initialises cpi->ss_cfg for NSTEP or DIAMOND search; other methods: no-op.
+static void init_motion_estimation(VP10_COMP *cpi) {
+  int y_stride = cpi->scaled_source.y_stride;
+  // The luma stride of scaled_source is passed to both initialisers.
+  if (cpi->sf.mv.search_method == NSTEP) {
+    vp10_init3smotion_compensation(&cpi->ss_cfg, y_stride);
+  } else if (cpi->sf.mv.search_method == DIAMOND) {
+    vp10_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+  }
+}
+// Applies any pending resize, sizes the new frame buffer, sets up ref scaling.
+static void set_frame_size(VP10_COMP *cpi) {
+  int ref_frame;
+  VP10_COMMON *const cm = &cpi->common;
+  VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  // Two-pass VBR: fixed resize on frame 0, or a pending dynamic resize.
+  if (oxcf->pass == 2 &&
+      oxcf->rc_mode == VPX_VBR &&
+      ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) ||
+        (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) {
+    vp10_calculate_coded_size(
+        cpi, &oxcf->scaled_frame_width, &oxcf->scaled_frame_height);
+
+    // There has been a change in frame size.
+    vp10_set_size_literal(cpi, oxcf->scaled_frame_width,
+                         oxcf->scaled_frame_height);
+  }
+  // One-pass CBR dynamic resize: 1 scales by num/den, -1 restores oxcf size.
+  if (oxcf->pass == 0 &&
+      oxcf->rc_mode == VPX_CBR &&
+      oxcf->resize_mode == RESIZE_DYNAMIC) {
+      if (cpi->resize_pending == 1) {
+        oxcf->scaled_frame_width =
+            (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den;
+        oxcf->scaled_frame_height =
+            (cm->height * cpi->resize_scale_num) /cpi->resize_scale_den;
+      } else if (cpi->resize_pending == -1) {
+        // Go back up to original size.
+        oxcf->scaled_frame_width = oxcf->width;
+        oxcf->scaled_frame_height = oxcf->height;
+      }
+      if (cpi->resize_pending != 0) {
+        // There has been a change in frame size.
+        vp10_set_size_literal(cpi,
+                             oxcf->scaled_frame_width,
+                             oxcf->scaled_frame_height);
+
+        // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+        set_mv_search_params(cpi);
+      }
+  }
+
+  if (oxcf->pass == 2) {
+    vp10_set_target_rate(cpi);
+  }
+
+  alloc_frame_mvs(cm, cm->new_fb_idx);
+
+  // Reset the frame pointers to the current frame size.
+  vpx_realloc_frame_buffer(get_frame_new_buffer(cm),
+                           cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                           cm->use_highbitdepth,
+#endif
+                           VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                           NULL, NULL, NULL);
+
+  alloc_util_frame_buffers(cpi);
+  init_motion_estimation(cpi);
+  // Refresh cm->frame_refs: buffer index, buffer pointer and scale factors.
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
+    const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+
+    ref_buf->idx = buf_idx;
+
+    if (buf_idx != INVALID_IDX) {
+      YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+      ref_buf->buf = buf;
+#if CONFIG_VP9_HIGHBITDEPTH
+      vp10_setup_scale_factors_for_frame(&ref_buf->sf,
+                                        buf->y_crop_width, buf->y_crop_height,
+                                        cm->width, cm->height,
+                                        (buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
+                                            1 : 0);
+#else
+      vp10_setup_scale_factors_for_frame(&ref_buf->sf,
+                                        buf->y_crop_width, buf->y_crop_height,
+                                        cm->width, cm->height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      if (vp10_is_scaled(&ref_buf->sf))
+        vpx_extend_frame_borders(buf);
+    } else {
+      ref_buf->buf = NULL;
+    }
+  }
+
+  set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
+// Encodes the current frame exactly once: the frame is sized, the source (and
+// the last source, when there is one) is scaled to the coded size, q is
+// chosen and the frame is encoded. No re-encode is attempted afterwards.
+static void encode_without_recode_loop(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  // Only q is consumed below; the two bounds merely satisfy the out-parameters
+  // of set_size_dependent_vars().
+  int q = 0;
+  int bottom_index = 0;
+  int top_index = 0;
+  int use_fast_scaling;
+
+  vpx_clear_system_state();
+
+  set_frame_size(cpi);
+
+  // The faster scaler is limited to 1 pass CBR with dynamic resize, and only
+  // when the unscaled source is exactly twice the coded size in both
+  // dimensions (2x2 scaling).
+  use_fast_scaling =
+      cpi->oxcf.pass == 0 &&
+      cpi->oxcf.rc_mode == VPX_CBR &&
+      cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
+      cpi->un_scaled_source->y_width == (cm->width << 1) &&
+      cpi->un_scaled_source->y_height == (cm->height << 1);
+
+  if (use_fast_scaling) {
+    cpi->Source = vp10_scale_if_required_fast(cm, cpi->un_scaled_source,
+                                              &cpi->scaled_source);
+    if (cpi->unscaled_last_source != NULL) {
+      cpi->Last_Source =
+          vp10_scale_if_required_fast(cm, cpi->unscaled_last_source,
+                                      &cpi->scaled_last_source);
+    }
+  } else {
+    cpi->Source = vp10_scale_if_required(cm, cpi->un_scaled_source,
+                                         &cpi->scaled_source);
+    if (cpi->unscaled_last_source != NULL) {
+      cpi->Last_Source =
+          vp10_scale_if_required(cm, cpi->unscaled_last_source,
+                                 &cpi->scaled_last_source);
+    }
+  }
+
+  // Reference scaling is skipped for intra-only frames.
+  if (!frame_is_intra_only(cm))
+    vp10_scale_references(cpi);
+
+  set_size_independent_vars(cpi);
+  set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+  vp10_set_quantizer(cm, q);
+  vp10_set_variance_partition_thresholds(cpi, q);
+
+  setup_frame(cpi);
+
+  suppress_active_map(cpi);
+  // At most one of the adaptive quantization setups runs per frame; the
+  // variance adaptive and in frame q adjustment experiments are mutually
+  // exclusive.
+  switch (cpi->oxcf.aq_mode) {
+    case VARIANCE_AQ:
+      vp10_vaq_frame_setup(cpi);
+      break;
+    case COMPLEXITY_AQ:
+      vp10_setup_in_frame_q_adj(cpi);
+      break;
+    case CYCLIC_REFRESH_AQ:
+      vp10_cyclic_refresh_setup(cpi);
+      break;
+    default:
+      break;
+  }
+  apply_active_map(cpi);
+
+  // transform / motion compensation build reconstruction frame
+  vp10_encode_frame(cpi);
+
+  // For 1 pass CBR with cyclic refresh on a non-key frame: update some stats
+  // from cyclic refresh and check whether the golden reference update should
+  // be skipped.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+      cm->frame_type != KEY_FRAME &&
+      cpi->oxcf.pass == 0 &&
+      cpi->oxcf.rc_mode == VPX_CBR) {
+    vp10_cyclic_refresh_check_golden_update(cpi);
+  }
+
+  // Update the skip mb flag probabilities based on the distribution
+  // seen in the last encoder iteration.
+  // update_base_skip_probs(cpi);
+  vpx_clear_system_state();
+}
+
+// Encodes the current frame and, when the projected size misses its target,
+// re-encodes it with an adjusted q (and possibly a new frame size) until no
+// further recode is requested.
+//
+//   cpi  - encoder instance; rate control state in cpi->rc is read and
+//          updated as the loop runs.
+//   size - receives the byte count of the dummy bitstream pack.
+//   dest - output buffer handed to vp10_pack_bitstream() for the dummy pack.
+//
+// The loop ends when `loop` is 0 at the bottom of an iteration: VPX_Q mode,
+// q unchanged after adjustment, recode_loop_test() returning 0, or the
+// overlay-frame special case.
+static void encode_with_recode_loop(VP10_COMP *cpi,
+                                    size_t *size,
+                                    uint8_t *dest) {
+  VP10_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int bottom_index, top_index;
+  int loop_count = 0;         // Total number of recode iterations so far.
+  int loop_at_this_size = 0;  // Iterations since the last size (re)config.
+  int loop = 0;               // Non-zero when another iteration is needed.
+  int overshoot_seen = 0;
+  int undershoot_seen = 0;
+  // Both limits are assigned on the first iteration (loop_count == 0) by
+  // vp10_rc_compute_frame_size_bounds() before they are read.
+  int frame_over_shoot_limit;
+  int frame_under_shoot_limit;
+  int q = 0, q_low = 0, q_high = 0;
+
+  set_size_independent_vars(cpi);
+
+  do {
+    vpx_clear_system_state();
+
+    set_frame_size(cpi);
+
+    // (Re)initialize everything that depends on the frame size: on the first
+    // iteration, and again whenever a resize is pending.
+    if (loop_count == 0 || cpi->resize_pending != 0) {
+      set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+      // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+      set_mv_search_params(cpi);
+
+      // Reset the loop state for new frame size.
+      overshoot_seen = 0;
+      undershoot_seen = 0;
+
+      // Reconfiguration for change in frame size has concluded.
+      cpi->resize_pending = 0;
+
+      q_low = bottom_index;
+      q_high = top_index;
+
+      loop_at_this_size = 0;
+    }
+
+    // Decide frame size bounds first time through.
+    if (loop_count == 0) {
+      vp10_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+                                       &frame_under_shoot_limit,
+                                       &frame_over_shoot_limit);
+    }
+
+    cpi->Source = vp10_scale_if_required(cm, cpi->un_scaled_source,
+                                      &cpi->scaled_source);
+
+    if (cpi->unscaled_last_source != NULL)
+      cpi->Last_Source = vp10_scale_if_required(cm, cpi->unscaled_last_source,
+                                               &cpi->scaled_last_source);
+
+    if (frame_is_intra_only(cm) == 0) {
+      // References scaled on a previous iteration are released before they
+      // are scaled again.
+      if (loop_count > 0) {
+        release_scaled_references(cpi);
+      }
+      vp10_scale_references(cpi);
+    }
+
+    vp10_set_quantizer(cm, q);
+
+    // setup_frame() runs only once per frame, not once per recode iteration.
+    if (loop_count == 0)
+      setup_frame(cpi);
+
+    // Variance adaptive and in frame q adjustment experiments are mutually
+    // exclusive.
+    if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+      vp10_vaq_frame_setup(cpi);
+    } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+      vp10_setup_in_frame_q_adj(cpi);
+    }
+
+    // transform / motion compensation build reconstruction frame
+    vp10_encode_frame(cpi);
+
+    // Update the skip mb flag probabilities based on the distribution
+    // seen in the last encoder iteration.
+    // update_base_skip_probs(cpi);
+
+    vpx_clear_system_state();
+
+    // Dummy pack of the bitstream using up to date stats to get an
+    // accurate estimate of output frame size to determine if we need
+    // to recode.
+    if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
+      // The coding context is saved and restored around the pack so the
+      // dummy pack does not disturb it.
+      save_coding_context(cpi);
+      vp10_pack_bitstream(cpi, dest, size);
+
+      // *size is in bytes; projected_frame_size is kept in bits.
+      rc->projected_frame_size = (int)(*size) << 3;
+      restore_coding_context(cpi);
+
+      if (frame_over_shoot_limit == 0)
+        frame_over_shoot_limit = 1;
+    }
+
+    if (cpi->oxcf.rc_mode == VPX_Q) {
+      // Never recode in VPX_Q mode.
+      loop = 0;
+    } else {
+      if ((cm->frame_type == KEY_FRAME) &&
+           rc->this_key_frame_forced &&
+           (rc->projected_frame_size < rc->max_frame_bandwidth)) {
+        // Forced key frame: steer q using the luma SSE of this key frame
+        // relative to cpi->ambient_err.
+        int last_q = q;
+        int64_t kf_err;
+
+        int64_t high_err_target = cpi->ambient_err;
+        int64_t low_err_target = cpi->ambient_err >> 1;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          kf_err = vp10_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        } else {
+          kf_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        }
+#else
+        kf_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+        // Prevent possible divide by zero error below for perfect KF
+        // (kf_err becomes 1 when it was 0, and is otherwise unchanged).
+        kf_err += !kf_err;
+
+        // The key frame is not good enough or we can afford
+        // to make it better without undue risk of popping.
+        if ((kf_err > high_err_target &&
+             rc->projected_frame_size <= frame_over_shoot_limit) ||
+            (kf_err > low_err_target &&
+             rc->projected_frame_size <= frame_under_shoot_limit)) {
+          // Lower q_high
+          q_high = q > q_low ? q - 1 : q_low;
+
+          // Adjust Q
+          q = (int)((q * high_err_target) / kf_err);
+          q = VPXMIN(q, (q_high + q_low) >> 1);
+        } else if (kf_err < low_err_target &&
+                   rc->projected_frame_size >= frame_under_shoot_limit) {
+          // The key frame is much better than the previous frame
+          // Raise q_low
+          q_low = q < q_high ? q + 1 : q_high;
+
+          // Adjust Q
+          q = (int)((q * low_err_target) / kf_err);
+          q = VPXMIN(q, (q_high + q_low + 1) >> 1);
+        }
+
+        // Clamp Q to upper and lower limits:
+        q = clamp(q, q_low, q_high);
+
+        // Recode only if q actually changed.
+        loop = q != last_q;
+      } else if (recode_loop_test(
+          cpi, frame_over_shoot_limit, frame_under_shoot_limit,
+          q, VPXMAX(q_high, top_index), bottom_index)) {
+        // Is the projected frame size out of range and are we allowed
+        // to attempt to recode.
+        int last_q = q;
+        int retries = 0;
+
+        if (cpi->resize_pending == 1) {
+          // Change in frame size so go back around the recode loop.
+          cpi->rc.frame_size_selector =
+              SCALE_STEP1 - cpi->rc.frame_size_selector;
+          cpi->rc.next_frame_size_selector = cpi->rc.frame_size_selector;
+
+#if CONFIG_INTERNAL_STATS
+          ++cpi->tot_recode_hits;
+#endif
+          ++loop_count;
+          loop = 1;
+          // `continue` in a do/while jumps to the `while (loop)` test; loop
+          // was just set to 1, so another iteration always follows.
+          // NOTE(review): loop_at_this_size is not incremented on this path;
+          // presumably the size-dependent reset at the top of the loop zeroes
+          // it because resize_pending is still set - confirm against
+          // set_frame_size().
+          continue;
+        }
+
+        // Frame size out of permitted range:
+        // Update correction factor & compute new Q to try...
+
+        // Frame is too large
+        if (rc->projected_frame_size > rc->this_frame_target) {
+          // Special case if the projected size is > the max allowed.
+          if (rc->projected_frame_size >= rc->max_frame_bandwidth)
+            q_high = rc->worst_quality;
+
+          // Raise q_low to at least the current value
+          q_low = q < q_high ? q + 1 : q_high;
+
+          if (undershoot_seen || loop_at_this_size > 1) {
+            // Update the rate correction factors, then bisect [q_low, q_high]
+            // (rounding up).
+            vp10_rc_update_rate_correction_factors(cpi);
+
+            q = (q_high + q_low + 1) / 2;
+          } else {
+            // Update the rate correction factors, then ask rate control for
+            // a new q.
+            vp10_rc_update_rate_correction_factors(cpi);
+
+            q = vp10_rc_regulate_q(cpi, rc->this_frame_target,
+                                   bottom_index, VPXMAX(q_high, top_index));
+
+            // Retry (at most 10 times) while the regulated q is still below
+            // q_low.
+            while (q < q_low && retries < 10) {
+              vp10_rc_update_rate_correction_factors(cpi);
+              q = vp10_rc_regulate_q(cpi, rc->this_frame_target,
+                                     bottom_index, VPXMAX(q_high, top_index));
+              retries++;
+            }
+          }
+
+          overshoot_seen = 1;
+        } else {
+          // Frame is too small
+          q_high = q > q_low ? q - 1 : q_low;
+
+          if (overshoot_seen || loop_at_this_size > 1) {
+            // Bisect [q_low, q_high] (rounding down).
+            vp10_rc_update_rate_correction_factors(cpi);
+            q = (q_high + q_low) / 2;
+          } else {
+            vp10_rc_update_rate_correction_factors(cpi);
+            q = vp10_rc_regulate_q(cpi, rc->this_frame_target,
+                                   bottom_index, top_index);
+            // Special case reset for qlow for constrained quality.
+            // This should only trigger where there is very substantial
+            // undershoot on a frame and the auto cq level is above
+            // the user passed in value.
+            if (cpi->oxcf.rc_mode == VPX_CQ &&
+                q < q_low) {
+              q_low = q;
+            }
+
+            // Retry (at most 10 times) while the regulated q is still above
+            // q_high.
+            while (q > q_high && retries < 10) {
+              vp10_rc_update_rate_correction_factors(cpi);
+              q = vp10_rc_regulate_q(cpi, rc->this_frame_target,
+                                     bottom_index, top_index);
+              retries++;
+            }
+          }
+
+          undershoot_seen = 1;
+        }
+
+        // Clamp Q to upper and lower limits:
+        q = clamp(q, q_low, q_high);
+
+        // Recode only if q actually changed.
+        loop = (q != last_q);
+      } else {
+        loop = 0;
+      }
+    }
+
+    // Special case for overlay frame.
+    if (rc->is_src_frame_alt_ref &&
+        rc->projected_frame_size < rc->max_frame_bandwidth)
+      loop = 0;
+
+    if (loop) {
+      ++loop_count;
+      ++loop_at_this_size;
+
+#if CONFIG_INTERNAL_STATS
+      ++cpi->tot_recode_hits;
+#endif
+    }
+  } while (loop);
+}
+
+// Returns the set of VP9_*_FLAG reference flags for the current buffer
+// assignment. All three flags start set; the golden flag is dropped when
+// golden maps to the same buffer as last or when no golden update is ever
+// due, and the alt-ref flag is dropped when alt-ref maps to the same buffer
+// as last or golden.
+static int get_ref_frame_flags(const VP10_COMP *cpi) {
+  const int *const map = cpi->common.ref_frame_map;
+  const int last_buf = map[cpi->lst_fb_idx];
+  const int gold_buf = map[cpi->gld_fb_idx];
+  const int alt_buf = map[cpi->alt_fb_idx];
+  int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
+  if (gold_buf == last_buf ||
+      cpi->rc.frames_till_gf_update_due == INT_MAX) {
+    flags &= ~VP9_GOLD_FLAG;
+  }
+
+  if (alt_buf == last_buf || gold_buf == alt_buf) {
+    flags &= ~VP9_ALT_FLAG;
+  }
+
+  return flags;
+}
+
+// Applies pending externally supplied overrides (set through the
+// vp10_update_reference() and vp10_update_entropy() calls) on top of the
+// defaults. Each override is one-shot: its pending flag is cleared when it is
+// applied, so it is only valid for the next frame passed to
+// encode_frame_to_data_rate().
+static void set_ext_overrides(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+
+  if (cpi->ext_refresh_frame_context_pending) {
+    cpi->ext_refresh_frame_context_pending = 0;
+    cm->refresh_frame_context = cpi->ext_refresh_frame_context;
+  }
+
+  if (cpi->ext_refresh_frame_flags_pending) {
+    cpi->ext_refresh_frame_flags_pending = 0;
+    cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
+    cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
+    cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
+  }
+}
+
+// Returns the frame buffer to encode from: `unscaled` itself when its luma
+// dimensions already equal the coded size (mi_cols/mi_rows * MI_SIZE),
+// otherwise `scaled`, filled through vpx_scale_frame() and with its borders
+// extended. The scaler arguments used here are for 2x2 scaling down.
+YV12_BUFFER_CONFIG *vp10_scale_if_required_fast(VP10_COMMON *cm,
+                                               YV12_BUFFER_CONFIG *unscaled,
+                                               YV12_BUFFER_CONFIG *scaled) {
+  // Nothing to do when the source already has the coded dimensions.
+  if (cm->mi_cols * MI_SIZE == unscaled->y_width &&
+      cm->mi_rows * MI_SIZE == unscaled->y_height) {
+    return unscaled;
+  }
+
+  vpx_scale_frame(unscaled, scaled, unscaled->y_buffer, 9, 2, 1,
+                  2, 1, 0);
+  vpx_extend_frame_borders(scaled);
+  return scaled;
+}
+
+// Returns the frame buffer to encode from: `unscaled` itself when its luma
+// dimensions already equal the coded size (mi_cols/mi_rows * MI_SIZE),
+// otherwise `scaled`, filled by scale_and_extend_frame_nonnormative().
+YV12_BUFFER_CONFIG *vp10_scale_if_required(VP10_COMMON *cm,
+                                          YV12_BUFFER_CONFIG *unscaled,
+                                          YV12_BUFFER_CONFIG *scaled) {
+  // Nothing to do when the source already has the coded dimensions.
+  if (cm->mi_cols * MI_SIZE == unscaled->y_width &&
+      cm->mi_rows * MI_SIZE == unscaled->y_height) {
+    return unscaled;
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
+#else
+  scale_and_extend_frame_nonnormative(unscaled, scaled);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  return scaled;
+}
+
+// Sets cm->ref_frame_sign_bias[ALTREF_FRAME] for the current frame.
+//
+// The bias is 1 only while a source alt-ref is active and either the alt-ref
+// is not being refreshed by this frame or, for two-pass encodes with
+// multi-arf allowed, the current gf-group entry has rf_level GF_ARF_LOW.
+static void set_arf_sign_bias(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  int arf_sign_bias = 0;
+
+  if (cpi->rc.source_alt_ref_active) {
+    if (!cpi->refresh_alt_ref_frame) {
+      arf_sign_bias = 1;
+    } else if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
+      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+      arf_sign_bias = (gf_group->rf_level[gf_group->index] == GF_ARF_LOW);
+    }
+  }
+
+  cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
+}
+
+// Builds a bit mask with one bit (1 << filter) per interpolation filter in
+// EIGHTTAP..EIGHTTAP_SHARP. A filter's bit is set when LAST_FRAME has
+// selections recorded but none for that filter, and for each of GOLDEN_FRAME
+// and ALTREF_FRAME the filter accounts for less than 1/50th of that
+// reference's selections (or the reference has no selections at all).
+// Returns 0 when the last frame was a key frame or the alt-ref is being
+// refreshed.
+static int setup_interp_filter_search_mask(VP10_COMP *cpi) {
+  int ref_total[MAX_REF_FRAMES] = {0};
+  INTERP_FILTER ifilter;
+  MV_REFERENCE_FRAME ref;
+  int mask = 0;
+
+  if (cpi->common.last_frame_type == KEY_FRAME ||
+      cpi->refresh_alt_ref_frame) {
+    return 0;
+  }
+
+  // Total number of selections recorded per reference frame.
+  for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) {
+      ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
+    }
+  }
+
+  for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) {
+    // LAST_FRAME must have selections, and none of them for this filter.
+    if (!ref_total[LAST_FRAME] ||
+        cpi->interp_filter_selected[LAST_FRAME][ifilter] != 0) {
+      continue;
+    }
+
+    // GOLDEN_FRAME: either unused, or this filter is below the 1/50 share.
+    if (ref_total[GOLDEN_FRAME] != 0 &&
+        cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50
+          >= ref_total[GOLDEN_FRAME]) {
+      continue;
+    }
+
+    // ALTREF_FRAME: same test as for GOLDEN_FRAME.
+    if (ref_total[ALTREF_FRAME] != 0 &&
+        cpi->interp_filter_selected[ALTREF_FRAME][ifilter] * 50
+          >= ref_total[ALTREF_FRAME]) {
+      continue;
+    }
+
+    mask |= 1 << ifilter;
+  }
+
+  return mask;
+}
+
+// Encodes one frame end to end and updates encoder state for the next one.
+//
+// Steps, in order: apply external overrides, set the alt-ref sign bias, set
+// up intra-only/key-frame state, optionally drop the frame (1 pass CBR),
+// encode (with or without the recode loop), pick the loop filter, pack the
+// bitstream, update reference frames, adapt probabilities, run the
+// post-encode rate control update and finally reset the one-shot flags.
+//
+//   cpi         - encoder instance.
+//   size        - receives the size reported by vp10_pack_bitstream().
+//   dest        - output buffer the bitstream is packed into.
+//   frame_flags - receives cpi->frame_flags with FRAMEFLAGS_KEY set when the
+//                 frame was coded as a key frame and cleared otherwise.
+//
+// NOTE: when the frame is dropped this function returns early without
+// writing *size or *frame_flags.
+static void encode_frame_to_data_rate(VP10_COMP *cpi,
+                                      size_t *size,
+                                      uint8_t *dest,
+                                      unsigned int *frame_flags) {
+  VP10_COMMON *const cm = &cpi->common;
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  struct segmentation *const seg = &cm->seg;
+  TX_SIZE t;
+
+  set_ext_overrides(cpi);
+  vpx_clear_system_state();
+
+  // Set the arf sign bias for this frame.
+  set_arf_sign_bias(cpi);
+
+  // Set default state for segment based loop filter update flags.
+  cm->lf.mode_ref_delta_update = 0;
+
+  // The interpolation filter search mask is only computed for pass 2 with
+  // adaptive filter search enabled.
+  if (cpi->oxcf.pass == 2 &&
+      cpi->sf.adaptive_interp_filter_search)
+    cpi->sf.interp_filter_search_mask =
+        setup_interp_filter_search_mask(cpi);
+
+  // Set various flags etc to special state if it is a key frame.
+  if (frame_is_intra_only(cm)) {
+    // Reset the loop filter deltas and segmentation map.
+    vp10_reset_segment_features(cm);
+
+    // If segmentation is enabled force a map update for key frames.
+    if (seg->enabled) {
+      seg->update_map = 1;
+      seg->update_data = 1;
+    }
+
+    // The alternate reference frame cannot be active for a key frame.
+    cpi->rc.source_alt_ref_active = 0;
+
+    cm->error_resilient_mode = oxcf->error_resilient_mode;
+
+    // By default, encoder assumes decoder can use prev_mi.
+    if (cm->error_resilient_mode) {
+      cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
+      cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_OFF;
+    } else if (cm->intra_only) {
+      // Only reset the current context.
+      cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT;
+    }
+  }
+
+  // For 1 pass CBR, check if we are dropping this frame.
+  // Never drop on key frame.
+  if (oxcf->pass == 0 &&
+      oxcf->rc_mode == VPX_CBR &&
+      cm->frame_type != KEY_FRAME) {
+    if (vp10_rc_drop_frame(cpi)) {
+      // Dropped: update rate control, count the frame and skip everything
+      // else, including the writes to *size and *frame_flags.
+      vp10_rc_postencode_update_drop_frame(cpi);
+      ++cm->current_video_frame;
+      return;
+    }
+  }
+
+  vpx_clear_system_state();
+
+#if CONFIG_INTERNAL_STATS
+  memset(cpi->mode_chosen_counts, 0,
+         MAX_MODES * sizeof(*cpi->mode_chosen_counts));
+#endif
+
+  // The speed feature recode_loop decides whether re-encoding is allowed.
+  if (cpi->sf.recode_loop == DISALLOW_RECODE) {
+    encode_without_recode_loop(cpi);
+  } else {
+    encode_with_recode_loop(cpi, size, dest);
+  }
+
+  // Optional debug dumps, compiled in only when the OUTPUT_YUV_* macros are
+  // defined.
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#ifdef OUTPUT_YUV_DENOISED
+  if (oxcf->noise_sensitivity > 0) {
+    vp10_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
+                            yuv_denoised_file);
+  }
+#endif
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+  if (cpi->common.current_video_frame > 1) {
+    vp10_compute_skin_map(cpi, yuv_skinmap_file);
+  }
+#endif
+
+  // Special case code to reduce pulsing when key frames are forced at a
+  // fixed interval. Note the reconstruction error if it is the frame before
+  // the force key frame
+  if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      cpi->ambient_err = vp10_highbd_get_y_sse(cpi->Source,
+                                              get_frame_new_buffer(cm));
+    } else {
+      cpi->ambient_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+    }
+#else
+    cpi->ambient_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+
+  // If the encoder forced a KEY_FRAME decision
+  if (cm->frame_type == KEY_FRAME)
+    cpi->refresh_last_frame = 1;
+
+  // The newly coded buffer becomes the frame to show; copy the colour and
+  // render-size properties onto it.
+  cm->frame_to_show = get_frame_new_buffer(cm);
+  cm->frame_to_show->color_space = cm->color_space;
+  cm->frame_to_show->color_range = cm->color_range;
+  cm->frame_to_show->render_width  = cm->render_width;
+  cm->frame_to_show->render_height = cm->render_height;
+
+  // Pick the loop filter level for the frame.
+  loopfilter_frame(cpi, cm);
+
+  // build the bitstream
+  vp10_pack_bitstream(cpi, dest, size);
+
+  if (cm->seg.update_map)
+    update_reference_segmentation_map(cpi);
+
+  // Scaled references are only held for non intra-only frames.
+  if (frame_is_intra_only(cm) == 0) {
+    release_scaled_references(cpi);
+  }
+  vp10_update_reference_frames(cpi);
+
+  // Run full_to_model_counts() for every transform size from TX_4X4 to
+  // TX_32X32.
+  for (t = TX_4X4; t <= TX_32X32; t++)
+    full_to_model_counts(cpi->td.counts->coef[t],
+                         cpi->td.rd_counts.coef_counts[t]);
+
+  // Probability adaptation only runs with backward frame context refresh.
+  if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+    vp10_adapt_coef_probs(cm);
+#if CONFIG_MISC_FIXES
+    vp10_adapt_intra_frame_probs(cm);
+#else
+    if (!frame_is_intra_only(cm))
+      vp10_adapt_intra_frame_probs(cm);
+#endif
+  }
+
+  if (!frame_is_intra_only(cm)) {
+    if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+      vp10_adapt_inter_frame_probs(cm);
+      vp10_adapt_mv_probs(cm, cm->allow_high_precision_mv);
+    }
+  }
+
+  // Mirror the golden / alt-ref refresh decisions into cpi->frame_flags.
+  if (cpi->refresh_golden_frame == 1)
+    cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
+  else
+    cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+
+  if (cpi->refresh_alt_ref_frame == 1)
+    cpi->frame_flags |= FRAMEFLAGS_ALTREF;
+  else
+    cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+  cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+
+  cm->last_frame_type = cm->frame_type;
+
+  vp10_rc_postencode_update(cpi, *size);
+
+  // Frame level debug stats output, permanently compiled out.
+#if 0
+  output_frame_level_debug_stats(cpi);
+#endif
+
+  if (cm->frame_type == KEY_FRAME) {
+    // Tell the caller that the frame was coded as a key frame
+    *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY;
+  } else {
+    *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+  }
+
+  // Clear the one shot update flags for segmentation map and mode/ref loop
+  // filter deltas.
+  cm->seg.update_map = 0;
+  cm->seg.update_data = 0;
+  cm->lf.mode_ref_delta_update = 0;
+
+  // keep track of the last coded dimensions
+  cm->last_width = cm->width;
+  cm->last_height = cm->height;
+
+  // reset to normal state now that we are done.
+  if (!cm->show_existing_frame)
+    cm->last_show_frame = cm->show_frame;
+
+  if (cm->show_frame) {
+    vp10_swap_mi_and_prev_mi(cm);
+    // Don't increment frame counters if this was an altref buffer
+    // update not a real frame
+    ++cm->current_video_frame;
+  }
+  cm->prev_frame = cm->cur_frame;
+}
+
+// Fetches the one-pass rate control parameters for the configured rate
+// control mode (the CBR variant for VPX_CBR, the VBR variant for everything
+// else) and then encodes the frame.
+static void Pass0Encode(VP10_COMP *cpi, size_t *size, uint8_t *dest,
+                        unsigned int *frame_flags) {
+  const int is_cbr = (cpi->oxcf.rc_mode == VPX_CBR);
+
+  if (!is_cbr) {
+    vp10_rc_get_one_pass_vbr_params(cpi);
+  } else {
+    vp10_rc_get_one_pass_cbr_params(cpi);
+  }
+
+  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+}
+
+// Enables encode breakout, encodes the frame and then runs the two-pass
+// post-encode update.
+static void Pass2Encode(VP10_COMP *cpi,
+                        size_t *size,
+                        uint8_t *dest,
+                        unsigned int *frame_flags) {
+  cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
+
+  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+  vp10_twopass_postencode_update(cpi);
+}
+
+// Resets the reference bookkeeping: every one of the REF_FRAMES map slots is
+// marked INVALID_IDX, the ref_count of the first REF_FRAMES pool buffers is
+// zeroed, and the new frame buffer index is invalidated.
+static void init_ref_frame_bufs(VP10_COMMON *cm) {
+  BufferPool *const pool = cm->buffer_pool;
+  int slot = 0;
+
+  while (slot < REF_FRAMES) {
+    pool->frame_bufs[slot].ref_count = 0;
+    cm->ref_frame_map[slot] = INVALID_IDX;
+    ++slot;
+  }
+
+  cm->new_fb_idx = INVALID_IDX;
+}
+
+// (Re)allocates the raw, reference and utility frame buffers when no initial
+// width has been recorded yet, or when the incoming chroma subsampling (or,
+// in high bit depth builds, the high bit depth flag) differs from the
+// current configuration. The new format is stored in cm and the initial
+// width, height and MB count are recorded from cm.
+static void check_initial_width(VP10_COMP *cpi,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                int use_highbitdepth,
+#endif
+                                int subsampling_x, int subsampling_y) {
+  VP10_COMMON *const cm = &cpi->common;
+  int needs_init = !cpi->initial_width;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  needs_init = needs_init || cm->use_highbitdepth != use_highbitdepth;
+#endif
+  needs_init = needs_init ||
+               cm->subsampling_x != subsampling_x ||
+               cm->subsampling_y != subsampling_y;
+
+  // Already initialized with a matching format: nothing to do.
+  if (!needs_init)
+    return;
+
+  cm->subsampling_x = subsampling_x;
+  cm->subsampling_y = subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+  cm->use_highbitdepth = use_highbitdepth;
+#endif
+
+  alloc_raw_frame_buffers(cpi);
+  init_ref_frame_bufs(cm);
+  alloc_util_frame_buffers(cpi);
+
+  init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
+
+  cpi->initial_width = cm->width;
+  cpi->initial_height = cm->height;
+  cpi->initial_mbs = cm->MBs;
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// Allocates the denoiser buffers at the current frame size and subsampling
+// when noise sensitivity is enabled and the denoiser's frame buffers have
+// not been initialized yet.
+static void setup_denoiser_buffer(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+
+  // Denoising disabled, or buffers already in place.
+  if (cpi->oxcf.noise_sensitivity <= 0 ||
+      cpi->denoiser.frame_buffer_initialized) {
+    return;
+  }
+
+  vp10_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
+                     cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                     cm->use_highbitdepth,
+#endif
+                     VP9_ENC_BORDER_IN_PIXELS);
+}
+#endif
+
+// Accepts one raw input frame and queues it in the encoder's lookahead.
+//
+//   cpi         - encoder instance.
+//   frame_flags - flags forwarded to vp10_lookahead_push() with the frame.
+//   sd          - the raw frame; its subsampling (and, in high bit depth
+//                 builds, its high bit depth flag) drives buffer
+//                 (re)allocation via check_initial_width().
+//   time_stamp  - start timestamp forwarded to vp10_lookahead_push().
+//   end_time    - end timestamp forwarded to vp10_lookahead_push().
+//
+// Returns 0 on success and -1 when the lookahead push fails, when the
+// subsampling does not match the profile, or when control comes back through
+// the setjmp() on cm->error.jmp.
+int vp10_receive_raw_frame(VP10_COMP *cpi, unsigned int frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time) {
+  // cm and res are volatile-qualified and live across the setjmp() below.
+  VP10_COMMON *volatile const cm = &cpi->common;
+  struct vpx_usec_timer timer;
+  volatile int res = 0;
+  const int subsampling_x = sd->subsampling_x;
+  const int subsampling_y = sd->subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+#endif
+
+  // Error landing point: a longjmp to cm->error.jmp resumes here with a
+  // non-zero value, and the call fails with -1.
+  if (setjmp(cm->error.jmp)) {
+    cm->error.setjmp = 0;
+    return -1;
+  }
+  cm->error.setjmp = 1;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+#else
+  check_initial_width(cpi, subsampling_x, subsampling_y);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
+  // Time spent pushing the frame is accumulated in cpi->time_receive_data.
+  vpx_usec_timer_start(&timer);
+
+  if (vp10_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+#if CONFIG_VP9_HIGHBITDEPTH
+                         use_highbitdepth,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                         frame_flags))
+    res = -1;
+  vpx_usec_timer_mark(&timer);
+  cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
+
+  // Profiles 0 and 2 require 4:2:0 input (subsampling 1x1); profiles 1 and 3
+  // require anything else.
+  // NOTE(review): these checks run after the frame has already been pushed
+  // to the lookahead. Presumably vpx_internal_error() longjmps to the
+  // setjmp() above while cm->error.setjmp is 1 - confirm.
+  if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) &&
+      (subsampling_x != 1 || subsampling_y != 1)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+                       "Non-4:2:0 color format requires profile 1 or 3");
+    res = -1;
+  }
+  if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) &&
+      (subsampling_x == 1 && subsampling_y == 1)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+                       "4:2:0 color format requires profile 0 or 2");
+    res = -1;
+  }
+
+  cm->error.setjmp = 0;
+  return res;
+}
+
+
+// Returns 1 when the current frame is a key frame, refreshes any of the
+// last / golden / alt-ref buffers, refreshes the frame context, or carries a
+// loop filter delta or segmentation (map or data) update; 0 otherwise.
+static int frame_is_reference(const VP10_COMP *cpi) {
+  const VP10_COMMON *cm = &cpi->common;
+
+  if (cm->frame_type == KEY_FRAME)
+    return 1;
+
+  if (cpi->refresh_last_frame ||
+      cpi->refresh_golden_frame ||
+      cpi->refresh_alt_ref_frame) {
+    return 1;
+  }
+
+  if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF)
+    return 1;
+
+  if (cm->lf.mode_ref_delta_update)
+    return 1;
+
+  return cm->seg.update_map || cm->seg.update_data;
+}
+
+// Updates the encoder frame rate from the timestamps of `source`, then
+// records the source's start/end timestamps as the last ones seen.
+//
+// The first frame, or a duration change of at least 10% relative to the
+// previous frame, switches the frame rate directly to
+// 10000000.0 / this_duration. Otherwise this frame's duration is averaged
+// into the running rate over an interval capped at 10000000.0 timestamp
+// units.
+static void adjust_frame_rate(VP10_COMP *cpi,
+                              const struct lookahead_entry *source) {
+  const int is_first_frame =
+      (source->ts_start == cpi->first_time_stamp_ever);
+  int64_t this_duration;
+  int step;
+
+  if (is_first_frame) {
+    this_duration = source->ts_end - source->ts_start;
+    step = 1;
+  } else {
+    const int64_t last_duration =
+        cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+
+    this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
+
+    // Non-zero once the duration has moved by at least 10%; stays 0 when
+    // there is no previous duration to compare against.
+    step = last_duration
+               ? (int)((this_duration - last_duration) * 10 / last_duration)
+               : 0;
+  }
+
+  // A zero duration leaves the frame rate untouched.
+  if (this_duration) {
+    double new_framerate;
+
+    if (step) {
+      new_framerate = 10000000.0 / this_duration;
+    } else {
+      // Average this frame's rate into the last second's average
+      // frame rate. If we haven't seen 1 second yet, then average
+      // over the whole interval seen.
+      const double interval = VPXMIN(
+          (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
+      double avg_duration = 10000000.0 / cpi->framerate;
+      avg_duration *= (interval - avg_duration + this_duration);
+      avg_duration /= interval;
+
+      new_framerate = 10000000.0 / avg_duration;
+    }
+
+    vp10_new_framerate(cpi, new_framerate);
+  }
+
+  cpi->last_time_stamp_seen = source->ts_start;
+  cpi->last_end_time_stamp_seen = source->ts_end;
+}
+
+// Returns the offset of the source frame used as the arf midpoint, or 0 when
+// the frame about to be coded is not an alt ref.
+//
+// In pass 2 the offset comes from the current gf-group entry when that entry
+// is an ARF_UPDATE; otherwise it is rc->frames_till_gf_update_due while a
+// source alt-ref is pending.
+static int get_arf_src_index(VP10_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  if (!is_altref_enabled(cpi))
+    return 0;
+
+  if (cpi->oxcf.pass == 2) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+    if (gf_group->update_type[gf_group->index] != ARF_UPDATE)
+      return 0;
+
+    return gf_group->arf_src_offset[gf_group->index];
+  }
+
+  if (rc->source_alt_ref_pending)
+    return rc->frames_till_gf_update_due;
+
+  return 0;
+}
+
+// Decides whether `source` is to be coded as an ARF overlay frame and stores
+// the answer in rc->is_src_frame_alt_ref. In pass 2 the current gf-group
+// entry decides (OVERLAY_UPDATE); otherwise the frame is an overlay when it
+// is the recorded alt-ref source. For an overlay frame the recorded alt-ref
+// source is cleared and the last-frame refresh is turned off.
+static void check_src_altref(VP10_COMP *cpi,
+                             const struct lookahead_entry *source) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int is_overlay;
+
+  if (cpi->oxcf.pass == 2) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    is_overlay = (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
+  } else {
+    is_overlay = cpi->alt_ref_source && (source == cpi->alt_ref_source);
+  }
+
+  rc->is_src_frame_alt_ref = is_overlay;
+
+  if (!is_overlay)
+    return;
+
+  // Current frame is an ARF overlay frame.
+  cpi->alt_ref_source = NULL;
+
+  // Don't refresh the last buffer for an ARF overlay frame. It will
+  // become the GF so preserve last as an alternative prediction option.
+  cpi->refresh_last_frame = 0;
+}
+
+#if CONFIG_INTERNAL_STATS
+// Defined elsewhere; declared here for the internal stats code.
+extern double vp10_get_blockiness(const unsigned char *img1, int img1_pitch,
+                                 const unsigned char *img2, int img2_pitch,
+                                 int width, int height);
+
+// Accumulates one frame's per-plane and combined scores into `s`, and lowers
+// s->worst to `all` when `all` is the smaller of the two.
+static void adjust_image_stat(double y, double u, double v, double all,
+                              ImageStat *s) {
+  s->worst = VPXMIN(s->worst, all);
+
+  s->stat[ALL] += all;
+  s->stat[Y] += y;
+  s->stat[U] += u;
+  s->stat[V] += v;
+}
+#endif  // CONFIG_INTERNAL_STATS
+
+int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
+                            size_t *size, uint8_t *dest,
+                            int64_t *time_stamp, int64_t *time_end, int flush) {
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  VP10_COMMON *const cm = &cpi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  RATE_CONTROL *const rc = &cpi->rc;
+  struct vpx_usec_timer  cmptimer;
+  YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+  struct lookahead_entry *last_source = NULL;
+  struct lookahead_entry *source = NULL;
+  int arf_src_index;
+  int i;
+  // Picks the next source (ARF or lookahead frame) and runs the pass encoder.
+  vpx_usec_timer_start(&cmptimer);
+
+  vp10_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
+
+  // Is multi-arf enabled.
+  // Note that at the moment multi_arf is only configured for 2 pass VBR
+  if ((oxcf->pass == 2) && (cpi->oxcf.enable_auto_arf > 1))
+    cpi->multi_arf_allowed = 1;
+  else
+    cpi->multi_arf_allowed = 0;
+
+  // Normal defaults
+  cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
+  cm->refresh_frame_context =
+      oxcf->error_resilient_mode ? REFRESH_FRAME_CONTEXT_OFF :
+          oxcf->frame_parallel_decoding_mode ? REFRESH_FRAME_CONTEXT_FORWARD
+                                             : REFRESH_FRAME_CONTEXT_BACKWARD;
+
+  cpi->refresh_last_frame = 1;
+  cpi->refresh_golden_frame = 0;
+  cpi->refresh_alt_ref_frame = 0;
+
+  // Should we encode an arf frame.
+  arf_src_index = get_arf_src_index(cpi);
+
+  if (arf_src_index) {
+    assert(arf_src_index <= rc->frames_to_key);
+
+    if ((source = vp10_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+      cpi->alt_ref_source = source;
+
+      if (oxcf->arnr_max_frames > 0) {
+        // Produce the filtered ARF frame.
+        vp10_temporal_filter(cpi, arf_src_index);
+        vpx_extend_frame_borders(&cpi->alt_ref_buffer);
+        force_src_buffer = &cpi->alt_ref_buffer;
+      }
+
+      cm->show_frame = 0;
+      cm->intra_only = 0;
+      cpi->refresh_alt_ref_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_last_frame = 0;
+      rc->is_src_frame_alt_ref = 0;
+      rc->source_alt_ref_pending = 0;
+    } else {
+      rc->source_alt_ref_pending = 0;  // ARF source not found in the lookahead
+    }
+  }
+
+  if (!source) {
+    // Get last frame source.
+    if (cm->current_video_frame > 0) {
+      if ((last_source = vp10_lookahead_peek(cpi->lookahead, -1)) == NULL)
+        return -1;  // previous source frame is missing
+    }
+
+    // Read in the source frame.
+    source = vp10_lookahead_pop(cpi->lookahead, flush);
+
+    if (source != NULL) {
+      cm->show_frame = 1;
+      cm->intra_only = 0;
+
+      // Check to see if the frame should be encoded as an arf overlay.
+      check_src_altref(cpi, source);
+    }
+  }
+
+  if (source) {
+    cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer
+                                                           : &source->img;
+
+    cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
+
+    *time_stamp = source->ts_start;
+    *time_end = source->ts_end;
+    *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+  } else {
+    *size = 0;
+    if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
+      vp10_end_first_pass(cpi);    /* get last stats packet */
+      cpi->twopass.first_pass_done = 1;
+    }
+    return -1;  // no source frame available to encode
+  }
+
+  if (source->ts_start < cpi->first_time_stamp_ever) {
+    cpi->first_time_stamp_ever = source->ts_start;
+    cpi->last_end_time_stamp_seen = source->ts_start;
+  }
+
+  // Clear down mmx registers
+  vpx_clear_system_state();
+
+  // adjust frame rates based on timestamps given
+  if (cm->show_frame) {
+    adjust_frame_rate(cpi, source);
+  }
+
+  // Find a free buffer for the new frame, releasing the reference previously
+  // held.
+  if (cm->new_fb_idx != INVALID_IDX) {
+    --pool->frame_bufs[cm->new_fb_idx].ref_count;
+  }
+  cm->new_fb_idx = get_free_fb(cm);
+
+  if (cm->new_fb_idx == INVALID_IDX)
+    return -1;  // no free frame buffer
+
+  cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+
+  if (cpi->multi_arf_allowed) {
+    if (cm->frame_type == KEY_FRAME) {
+      init_buffer_indices(cpi);
+    } else if (oxcf->pass == 2) {
+      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+      cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index];
+    }
+  }
+
+  // Start with a 0 size frame.
+  *size = 0;
+
+  cpi->frame_flags = *frame_flags;
+
+  if (oxcf->pass == 2) {
+    vp10_rc_get_second_pass_params(cpi);
+  } else if (oxcf->pass == 1) {
+    set_frame_size(cpi);
+  }
+
+  if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) {
+    for (i = 0; i < MAX_REF_FRAMES; ++i)
+      cpi->scaled_ref_idx[i] = INVALID_IDX;
+  }
+
+  if (oxcf->pass == 1) {
+    cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(oxcf);
+    vp10_first_pass(cpi, source);
+  } else if (oxcf->pass == 2) {
+    Pass2Encode(cpi, size, dest, frame_flags);
+  } else {
+    // One pass encode
+    Pass0Encode(cpi, size, dest, frame_flags);
+  }
+
+  if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF)
+    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;  // store context
+
+  // No frame encoded, or frame was dropped, release scaled references.
+  if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
+    release_scaled_references(cpi);
+  }
+
+  if (*size > 0) {
+    cpi->droppable = !frame_is_reference(cpi);
+  }
+
+  vpx_usec_timer_mark(&cmptimer);
+  cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+
+  if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
+    generate_psnr_packet(cpi);
+
+#if CONFIG_INTERNAL_STATS
+  // Accumulate per-frame quality statistics (skipped during the first pass).
+  if (oxcf->pass != 1) {
+    double samples = 0.0;
+    cpi->bytes += (int)(*size);
+
+    if (cm->show_frame) {
+      cpi->count++;
+
+      if (cpi->b_calculate_psnr) {
+        YV12_BUFFER_CONFIG *orig = cpi->Source;
+        YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+        YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;  // NOTE(review): written here only under CONFIG_VP9_POSTPROC
+        PSNR_STATS psnr;
+#if CONFIG_VP9_HIGHBITDEPTH
+        calc_highbd_psnr(orig, recon, &psnr, cpi->td.mb.e_mbd.bd,
+                         cpi->oxcf.input_bit_depth);
+#else
+        calc_psnr(orig, recon, &psnr);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+        adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3],
+                          psnr.psnr[0], &cpi->psnr);
+        cpi->total_sq_error += psnr.sse[0];
+        cpi->total_samples += psnr.samples[0];
+        samples = psnr.samples[0];
+
+        {
+          PSNR_STATS psnr2;
+          double frame_ssim2 = 0, weight = 0;
+#if CONFIG_VP9_POSTPROC
+          if (vpx_alloc_frame_buffer(&cm->post_proc_buffer,
+                                     recon->y_crop_width, recon->y_crop_height,
+                                     cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                     cm->use_highbitdepth,
+#endif
+                                     VP9_ENC_BORDER_IN_PIXELS,
+                                     cm->byte_alignment) < 0) {
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate post processing buffer");
+          }
+
+          vp10_deblock(cm->frame_to_show, &cm->post_proc_buffer,
+                      cm->lf.filter_level * 10 / 6);
+#endif  // CONFIG_VP9_POSTPROC
+          vpx_clear_system_state();
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          calc_highbd_psnr(orig, pp, &psnr2, cpi->td.mb.e_mbd.bd,
+                           cpi->oxcf.input_bit_depth);
+#else
+          calc_psnr(orig, pp, &psnr2);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+          cpi->totalp_sq_error += psnr2.sse[0];
+          cpi->totalp_samples += psnr2.samples[0];
+          adjust_image_stat(psnr2.psnr[1], psnr2.psnr[2], psnr2.psnr[3],
+                            psnr2.psnr[0], &cpi->psnrp);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (cm->use_highbitdepth) {
+            frame_ssim2 = vpx_highbd_calc_ssim(orig, recon, &weight,
+                                               (int)cm->bit_depth);
+          } else {
+            frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
+          }
+#else
+          frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+          cpi->worst_ssim= VPXMIN(cpi->worst_ssim, frame_ssim2);
+          cpi->summed_quality += frame_ssim2 * weight;
+          cpi->summed_weights += weight;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (cm->use_highbitdepth) {
+            frame_ssim2 = vpx_highbd_calc_ssim(
+                orig, &cm->post_proc_buffer, &weight, (int)cm->bit_depth);
+          } else {
+            frame_ssim2 = vpx_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+          }
+#else
+          frame_ssim2 = vpx_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+          cpi->summedp_quality += frame_ssim2 * weight;
+          cpi->summedp_weights += weight;
+#if 0  // Disabled debug dump; y2, u2, v2 and frame_psnr2 are not declared here.
+          {
+            FILE *f = fopen("q_used.stt", "a");
+            fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
+                    cpi->common.current_video_frame, y2, u2, v2,
+                    frame_psnr2, frame_ssim2);
+            fclose(f);
+          }
+#endif
+        }
+      }
+      if (cpi->b_calculate_blockiness) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!cm->use_highbitdepth)
+#endif
+        {
+          double frame_blockiness = vp10_get_blockiness(
+              cpi->Source->y_buffer, cpi->Source->y_stride,
+              cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+              cpi->Source->y_width, cpi->Source->y_height);
+          cpi->worst_blockiness =
+              VPXMAX(cpi->worst_blockiness, frame_blockiness);
+          cpi->total_blockiness += frame_blockiness;
+        }
+      }
+
+      if (cpi->b_calculate_consistency) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!cm->use_highbitdepth)
+#endif
+        {
+          double this_inconsistency = vpx_get_ssim_metrics(
+              cpi->Source->y_buffer, cpi->Source->y_stride,
+              cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+              cpi->Source->y_width, cpi->Source->y_height, cpi->ssim_vars,
+              &cpi->metrics, 1);
+
+          const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+          double consistency = vpx_sse_to_psnr(samples, peak,
+                                             (double)cpi->total_inconsistency);
+          if (consistency > 0.0)
+            cpi->worst_consistency =
+                VPXMIN(cpi->worst_consistency, consistency);
+          cpi->total_inconsistency += this_inconsistency;  // after the PSNR above
+        }
+      }
+
+      if (cpi->b_calculate_ssimg) {
+        double y, u, v, frame_all;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          frame_all = vpx_highbd_calc_ssimg(cpi->Source, cm->frame_to_show, &y,
+                                            &u, &v, (int)cm->bit_depth);
+        } else {
+          frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u,
+                                     &v);
+        }
+#else
+        frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        adjust_image_stat(y, u, v, frame_all, &cpi->ssimg);
+      }
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!cm->use_highbitdepth)
+#endif
+      {
+        double y, u, v, frame_all;
+        frame_all = vpx_calc_fastssim(cpi->Source, cm->frame_to_show, &y, &u,
+                                      &v);
+        adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
+        /* TODO(JBB): add 10/12 bit support */
+      }
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!cm->use_highbitdepth)
+#endif
+      {
+        double y, u, v, frame_all;
+        frame_all = vpx_psnrhvs(cpi->Source, cm->frame_to_show, &y, &u, &v);
+        adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
+      }
+    }
+  }
+#endif  // CONFIG_INTERNAL_STATS
+
+  vpx_clear_system_state();
+  return 0;  // a source frame was handed to the pass-specific encoder
+}
+
+// Fills |dest| with a preview of the frame to show. Returns -1 if the frame
+// is not shown or (without post-processing) no frame is available.
+int vp10_get_preview_raw_frame(VP10_COMP *cpi, YV12_BUFFER_CONFIG *dest,
+                              vp10_ppflags_t *flags) {
+  VP10_COMMON *const common = &cpi->common;
+  int status;
+#if !CONFIG_VP9_POSTPROC
+  (void)flags;  // only the post-processing path reads the flags
+#endif
+
+  // Nothing to preview for a frame that is not shown.
+  if (!common->show_frame) return -1;
+
+#if CONFIG_VP9_POSTPROC
+  status = vp10_post_proc_frame(common, dest, flags);
+#else
+  status = common->frame_to_show ? 0 : -1;
+  if (status == 0) {
+    // Copy the buffer descriptor, then overwrite the plane dimensions.
+    *dest = *common->frame_to_show;
+    dest->y_width = common->width;
+    dest->y_height = common->height;
+    dest->uv_width = common->width >> common->subsampling_x;
+    dest->uv_height = common->height >> common->subsampling_y;
+  }
+#endif  // !CONFIG_VP9_POSTPROC
+  vpx_clear_system_state();
+  return status;
+}
+
+// Scales the configured frame size by the given VPX_SCALING ratios.
+// Returns -1 if either mode is above ONETWO, 0 otherwise.
+int vp10_set_internal_size(VP10_COMP *cpi,
+                          VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
+  VP10_COMMON *const cm = &cpi->common;
+  int h_num = 0, h_den = 0;  // horizontal ratio from Scale2Ratio()
+  int v_num = 0, v_den = 0;  // vertical ratio from Scale2Ratio()
+
+  if (!(horiz_mode <= ONETWO && vert_mode <= ONETWO)) return -1;
+
+  Scale2Ratio(horiz_mode, &h_num, &h_den);
+  Scale2Ratio(vert_mode, &v_num, &v_den);
+
+  // Ceiling division: adding (den - 1) rounds up to the next whole number.
+  cm->width = (h_den - 1 + cpi->oxcf.width * h_num) / h_den;
+  cm->height = (v_den - 1 + cpi->oxcf.height * v_num) / v_den;
+  assert(cm->width <= cpi->initial_width);
+  assert(cm->height <= cpi->initial_height);
+
+  update_frame_size(cpi);
+  return 0;
+}
+
+// Sets the coded frame size directly. A zero |width| or |height| leaves that
+// dimension unchanged; requests above the initial size are clamped. Returns 0.
+int vp10_set_size_literal(VP10_COMP *cpi, unsigned int width,
+                         unsigned int height) {
+  VP10_COMMON *cm = &cpi->common;
+#if CONFIG_VP9_HIGHBITDEPTH
+  check_initial_width(cpi, cm->use_highbitdepth, 1, 1);
+#else
+  check_initial_width(cpi, 1, 1);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
+
+  // Compare as unsigned before storing into the int fields: a request above
+  // INT_MAX would otherwise turn negative and slip past the clamp.
+  if (width > (unsigned int)cpi->initial_width) {
+    cm->width = cpi->initial_width;
+    printf("Warning: Desired width too large, changed to %d\n", cm->width);
+  } else if (width) {
+    cm->width = (int)width;
+  }
+
+  if (height > (unsigned int)cpi->initial_height) {
+    cm->height = cpi->initial_height;
+    printf("Warning: Desired height too large, changed to %d\n", cm->height);
+  } else if (height) {
+    cm->height = (int)height;
+  }
+  assert(cm->width <= cpi->initial_width);
+  assert(cm->height <= cpi->initial_height);
+
+  update_frame_size(cpi);
+  return 0;
+}
+
+int64_t vp10_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                      const YV12_BUFFER_CONFIG *b) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  // Result of get_sse() on the Y planes over a's cropped size (same as b's).
+  return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                 a->y_crop_width, a->y_crop_height);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp10_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+  // Both inputs must be high-bitdepth; highbd_get_sse() runs on the Y planes.
+  return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                        a->y_crop_width, a->y_crop_height);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+int vp10_get_quantizer(VP10_COMP *cpi) {
+  return cpi->common.base_qindex;  // accessor for the common base_qindex field
+}
+
+// Applies per-frame VP8_EFLAG_* requests: which references may be used,
+// which are updated, and whether entropy updates are disabled.
+void vp10_apply_encoding_flags(VP10_COMP *cpi, vpx_enc_frame_flags_t flags) {
+  const vpx_enc_frame_flags_t ref_request =
+      flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF |
+               VP8_EFLAG_NO_REF_ARF);
+  const vpx_enc_frame_flags_t upd_request =
+      flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF |
+               VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF |
+               VP8_EFLAG_FORCE_ARF);
+
+  if (ref_request) {
+    // Start from mask 7 and toggle the bit of each excluded reference.
+    int usable = 7;
+
+    if (flags & VP8_EFLAG_NO_REF_LAST) usable ^= VP9_LAST_FLAG;
+    if (flags & VP8_EFLAG_NO_REF_GF) usable ^= VP9_GOLD_FLAG;
+    if (flags & VP8_EFLAG_NO_REF_ARF) usable ^= VP9_ALT_FLAG;
+
+    vp10_use_as_reference(cpi, usable);
+  }
+
+  if (upd_request) {
+    // The FORCE_GF / FORCE_ARF bits only trigger this call; they do not
+    // alter the mask themselves.
+    int refreshed = 7;
+
+    if (flags & VP8_EFLAG_NO_UPD_LAST) refreshed ^= VP9_LAST_FLAG;
+    if (flags & VP8_EFLAG_NO_UPD_GF) refreshed ^= VP9_GOLD_FLAG;
+    if (flags & VP8_EFLAG_NO_UPD_ARF) refreshed ^= VP9_ALT_FLAG;
+
+    vp10_update_reference(cpi, refreshed);
+  }
+
+  // Entropy updates are switched off only on explicit request.
+  if (!(flags & VP8_EFLAG_NO_UPD_ENTROPY)) return;
+  vp10_update_entropy(cpi, 0);
+}
diff --git a/libs/libvpx/vp10/encoder/encoder.h b/libs/libvpx/vp10/encoder/encoder.h
new file mode 100644
index 0000000000..bd6a00932f
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/encoder.h
@@ -0,0 +1,648 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_ENCODER_H_
+#define VP10_ENCODER_ENCODER_H_
+
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "vpx/vp8cx.h"
+
+#include "vp10/common/alloccommon.h"
+#include "vp10/common/ppflags.h"
+#include "vp10/common/entropymode.h"
+#include "vp10/common/thread_common.h"
+#include "vp10/common/onyxc_int.h"
+
+#include "vp10/encoder/aq_cyclicrefresh.h"
+#include "vp10/encoder/context_tree.h"
+#include "vp10/encoder/encodemb.h"
+#include "vp10/encoder/firstpass.h"
+#include "vp10/encoder/lookahead.h"
+#include "vp10/encoder/mbgraph.h"
+#include "vp10/encoder/mcomp.h"
+#include "vp10/encoder/quantize.h"
+#include "vp10/encoder/ratectrl.h"
+#include "vp10/encoder/rd.h"
+#include "vp10/encoder/speed_features.h"
+#include "vp10/encoder/tokenize.h"
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#include "vp10/encoder/denoiser.h"
+#endif
+
+#if CONFIG_INTERNAL_STATS
+#include "vpx_dsp/ssim.h"
+#endif
+#include "vpx_dsp/variance.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_util/vpx_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {  // Type of VP10_COMP::coding_context.
+  int nmvjointcost[MV_JOINTS];
+  int nmvcosts[2][MV_VALS];
+  int nmvcosts_hp[2][MV_VALS];
+
+#if !CONFIG_MISC_FIXES
+  vpx_prob segment_pred_probs[PREDICTION_PROBS];
+#endif  // !CONFIG_MISC_FIXES
+
+  unsigned char *last_frame_seg_map_copy;
+
+  // 0 = Intra, Last, GF, ARF
+  signed char last_ref_lf_deltas[MAX_REF_FRAMES];
+  // 0 = ZERO_MV, MV
+  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+
+  FRAME_CONTEXT fc;
+} CODING_CONTEXT;
+
+
+typedef enum {  // Type of VP10_COMP::allow_encode_breakout.
+  // encode_breakout is disabled.
+  ENCODE_BREAKOUT_DISABLED = 0,
+  // encode_breakout is enabled.
+  ENCODE_BREAKOUT_ENABLED = 1,
+  // encode_breakout is enabled with small max_thresh limit.
+  ENCODE_BREAKOUT_LIMITED = 2
+} ENCODE_BREAKOUT_TYPE;
+
+typedef enum {  // Scaling modes accepted by vp10_set_internal_size().
+  NORMAL      = 0,
+  FOURFIVE    = 1,
+  THREEFIVE   = 2,
+  ONETWO      = 3  // largest value vp10_set_internal_size() accepts
+} VPX_SCALING;
+
+typedef enum {  // Type of VP10EncoderConfig::mode.
+  // Good Quality Fast Encoding. The encoder balances quality with the amount of
+  // time it takes to encode the output. Speed setting controls how fast.
+  GOOD,
+
+  // The encoder places priority on the quality of the output over encoding
+  // speed. The output is compressed at the highest possible quality. This
+  // option takes the longest amount of time to encode. Speed setting ignored.
+  BEST,
+
+  // Realtime/Live Encoding. This mode is optimized for realtime encoding (for
+  // example, capturing a television signal or feed from a live camera). Speed
+  // setting controls how fast.
+  REALTIME
+} MODE;
+
+typedef enum {  // Single-bit flags that can be OR-ed into a frame_flags value.
+  FRAMEFLAGS_KEY    = 1 << 0,
+  FRAMEFLAGS_GOLDEN = 1 << 1,
+  FRAMEFLAGS_ALTREF = 1 << 2  // no trailing comma: keeps C89/C++98 happy
+} FRAMETYPE_FLAGS;
+
+typedef enum {  // Type of VP10EncoderConfig::aq_mode.
+  NO_AQ = 0,
+  VARIANCE_AQ = 1,
+  COMPLEXITY_AQ = 2,
+  CYCLIC_REFRESH_AQ = 3,
+  AQ_MODE_COUNT  // This should always be the last member of the enum
+} AQ_MODE;
+
+typedef enum {  // Type of VP10EncoderConfig::resize_mode.
+  RESIZE_NONE = 0,    // No frame resizing allowed.
+  RESIZE_FIXED = 1,   // All frames are coded at the specified dimension.
+  RESIZE_DYNAMIC = 2  // Coded size of each frame is determined by the codec.
+} RESIZE_TYPE;
+
+typedef struct VP10EncoderConfig {
+  BITSTREAM_PROFILE profile;
+  vpx_bit_depth_t bit_depth;     // Codec bit-depth.
+  int width;  // width of data passed to the compressor
+  int height;  // height of data passed to the compressor
+  unsigned int input_bit_depth;  // Input bit depth.
+  double init_framerate;  // set to passed in framerate
+  int64_t target_bandwidth;  // bandwidth to be used in kilobits per second (NOTE(review): confirm unit)
+
+  int noise_sensitivity;  // pre processing blur: recommendation 0
+  int sharpness;  // sharpening output: recommendation 0
+  int speed;
+  // maximum allowed bitrate for any intra frame in % of bitrate target.
+  unsigned int rc_max_intra_bitrate_pct;
+  // maximum allowed bitrate for any inter frame in % of bitrate target.
+  unsigned int rc_max_inter_bitrate_pct;
+  // percent of rate boost for golden frame in CBR mode.
+  unsigned int gf_cbr_boost_pct;
+
+  MODE mode;
+  int pass;  // 0: one-pass encode, 1: first pass, 2: second pass
+
+  // Key Framing Operations
+  int auto_key;  // autodetect cut scenes and set the keyframes
+  int key_freq;  // maximum distance to key frame.
+
+  int lag_in_frames;  // how many frames lag before we start encoding
+
+  // ----------------------------------------------------------------
+  // DATARATE CONTROL OPTIONS
+
+  // vbr, cbr, constrained quality or constant quality
+  enum vpx_rc_mode rc_mode;
+
+  // buffer targeting aggressiveness
+  int under_shoot_pct;
+  int over_shoot_pct;
+
+  // buffering parameters
+  int64_t starting_buffer_level_ms;
+  int64_t optimal_buffer_level_ms;
+  int64_t maximum_buffer_size_ms;
+
+  // Frame drop threshold.
+  int drop_frames_water_mark;
+
+  // controlling quality
+  int fixed_q;
+  int worst_allowed_q;
+  int best_allowed_q;  // best and worst both 0 => lossless requested
+  int cq_level;
+  AQ_MODE aq_mode;  // Adaptive Quantization mode
+
+  // Internal frame size scaling.
+  RESIZE_TYPE resize_mode;
+  int scaled_frame_width;
+  int scaled_frame_height;
+
+  // Enable feature to reduce the frame quantization every x frames.
+  int frame_periodic_boost;
+
+  // two pass datarate control
+  int two_pass_vbrbias;        // two pass datarate control tweaks
+  int two_pass_vbrmin_section;
+  int two_pass_vbrmax_section;
+  // END DATARATE CONTROL OPTIONS
+  // ----------------------------------------------------------------
+
+  int enable_auto_arf;  // non-zero enables ARFs; > 1 allows multi-ARF in pass 2
+
+  int encode_breakout;  // early breakout : for video conf recommend 800
+
+  /* Bitfield defining the error resiliency features to enable.
+   * Can provide decodable frames after losses in previous
+   * frames and decodable partitions after losses in the same frame.
+   */
+  unsigned int error_resilient_mode;
+
+  /* Bitfield defining the parallel decoding mode where the
+   * decoding in successive frames may be conducted in parallel
+   * just by decoding the frame headers.
+   */
+  unsigned int frame_parallel_decoding_mode;
+
+  int arnr_max_frames;  // > 0 enables temporal filtering of the ARF source
+  int arnr_strength;
+
+  int min_gf_interval;
+  int max_gf_interval;
+
+  int tile_columns;
+  int tile_rows;
+
+  int max_threads;
+
+  vpx_fixed_buf_t two_pass_stats_in;
+  struct vpx_codec_pkt_list *output_pkt_list;
+
+#if CONFIG_FP_MB_STATS
+  vpx_fixed_buf_t firstpass_mb_stats_in;
+#endif
+
+  vp8e_tuning tuning;
+  vp9e_tune_content content;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int use_highbitdepth;
+#endif
+  vpx_color_space_t color_space;
+  int color_range;
+  int render_width;
+  int render_height;
+} VP10EncoderConfig;
+
+static INLINE int is_lossless_requested(const VP10EncoderConfig *cfg) {
+  return !(cfg->best_allowed_q || cfg->worst_allowed_q);  // both q limits 0
+}
+
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {  // Element type of VP10_COMP::tile_data.
+  TileInfo tile_info;
+  int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+  int mode_map[BLOCK_SIZES][MAX_MODES];
+} TileDataEnc;
+
+typedef struct RD_COUNTS {  // Per-thread counters, held in ThreadData.
+  vp10_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+  int64_t comp_pred_diff[REFERENCE_MODES];
+  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  int m_search_count;   // count of all motion searches
+  int ex_search_count;  // count of exhaustive mesh searches
+} RD_COUNTS;
+
+typedef struct ThreadData {  // Type of VP10_COMP::td.
+  MACROBLOCK mb;
+  RD_COUNTS rd_counts;
+  FRAME_COUNTS *counts;
+
+  PICK_MODE_CONTEXT *leaf_tree;
+  PC_TREE *pc_tree;
+  PC_TREE *pc_root;
+} ThreadData;
+
+struct EncWorkerData;
+
+typedef struct ActiveMap {  // Type of VP10_COMP::active_map.
+  int enabled;
+  int update;
+  unsigned char *map;  // presumably set via vp10_set_active_map() - TODO confirm
+} ActiveMap;
+
+typedef enum {  // Indices into ImageStat::stat.
+  Y,
+  U,
+  V,
+  ALL  // last index; ImageStat::stat has ALL + 1 entries
+} STAT_TYPE;
+
+typedef struct IMAGE_STAT {
+  double stat[ALL+1];  // running sums indexed by STAT_TYPE (Y, U, V, ALL)
+  double worst;        // minimum "all" value passed to adjust_image_stat()
+} ImageStat;
+
+typedef struct VP10_COMP {
+  QUANTS quants;
+  ThreadData td;
+  MB_MODE_INFO_EXT *mbmi_ext_base;
+  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
+  VP10_COMMON common;
+  VP10EncoderConfig oxcf;
+  struct lookahead_ctx    *lookahead;
+  struct lookahead_entry  *alt_ref_source;  // lookahead entry used as ARF source
+
+  YV12_BUFFER_CONFIG *Source;  // frame currently being encoded
+  YV12_BUFFER_CONFIG *Last_Source;  // NULL for first frame and alt_ref frames
+  YV12_BUFFER_CONFIG *un_scaled_source;
+  YV12_BUFFER_CONFIG scaled_source;
+  YV12_BUFFER_CONFIG *unscaled_last_source;  // NULL if no previous source
+  YV12_BUFFER_CONFIG scaled_last_source;
+
+  TileDataEnc *tile_data;
+  int allocated_tiles;  // Keep track of memory allocated for tiles.
+
+  // For a still frame, this flag is set to 1 to skip partition search.
+  int partition_search_skippable_frame;
+
+  int scaled_ref_idx[MAX_REF_FRAMES];  // entries are reset to INVALID_IDX
+  int lst_fb_idx;
+  int gld_fb_idx;
+  int alt_fb_idx;
+
+  int refresh_last_frame;
+  int refresh_golden_frame;
+  int refresh_alt_ref_frame;
+
+  int ext_refresh_frame_flags_pending;
+  int ext_refresh_last_frame;
+  int ext_refresh_golden_frame;
+  int ext_refresh_alt_ref_frame;
+
+  int ext_refresh_frame_context_pending;
+  int ext_refresh_frame_context;
+
+  YV12_BUFFER_CONFIG last_frame_uf;
+
+  TOKENEXTRA *tile_tok[4][1 << 6];
+  unsigned int tok_count[4][1 << 6];
+
+  // Ambient reconstruction err target for force key frames
+  int64_t ambient_err;
+
+  RD_OPT rd;
+
+  CODING_CONTEXT coding_context;
+
+  int *nmvcosts[2];
+  int *nmvcosts_hp[2];
+  int *nmvsadcosts[2];
+  int *nmvsadcosts_hp[2];
+
+  int64_t last_time_stamp_seen;
+  int64_t last_end_time_stamp_seen;
+  int64_t first_time_stamp_ever;
+
+  RATE_CONTROL rc;
+  double framerate;
+
+  int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE];
+
+  struct vpx_codec_pkt_list  *output_pkt_list;
+
+  MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
+  int mbgraph_n_frames;             // number of frames filled in the above
+  int static_mb_pct;                // % forced skip mbs by segmentation
+  int ref_frame_flags;
+
+  SPEED_FEATURES sf;
+
+  unsigned int max_mv_magnitude;
+  int mv_step_param;
+
+  int allow_comp_inter_inter;
+
+  // Default value is 1. From first pass stats, encode_breakout may be disabled.
+  ENCODE_BREAKOUT_TYPE allow_encode_breakout;
+
+  // Get threshold from external input. A suggested threshold is 800 for HD
+  // clips, and 300 for < HD clips.
+  int encode_breakout;
+
+  unsigned char *segmentation_map;
+
+  // segment threshold for encode breakout
+  int  segment_encode_breakout[MAX_SEGMENTS];
+
+  CYCLIC_REFRESH *cyclic_refresh;
+  ActiveMap active_map;
+
+  fractional_mv_step_fp *find_fractional_mv_step;
+  vp10_full_search_fn_t full_search_sad;
+  vp10_diamond_search_fn_t diamond_search_sad;
+  vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
+  uint64_t time_receive_data;
+  uint64_t time_compress_data;  // total time in vp10_get_compressed_data()
+  uint64_t time_pick_lpf;
+  uint64_t time_encode_sb_row;
+
+#if CONFIG_FP_MB_STATS
+  int use_fp_mb_stats;
+#endif
+
+  TWO_PASS twopass;
+
+  YV12_BUFFER_CONFIG alt_ref_buffer;  // temporally filtered ARF frame
+
+
+#if CONFIG_INTERNAL_STATS
+  unsigned int mode_chosen_counts[MAX_MODES];
+
+  int    count;  // shown frames counted for the stats
+  uint64_t total_sq_error;
+  uint64_t total_samples;
+  ImageStat psnr;
+
+  uint64_t totalp_sq_error;
+  uint64_t totalp_samples;
+  ImageStat psnrp;
+
+  double total_blockiness;
+  double worst_blockiness;
+
+  int    bytes;  // compressed bytes accumulated when pass != 1
+  double summed_quality;
+  double summed_weights;
+  double summedp_quality;
+  double summedp_weights;
+  unsigned int tot_recode_hits;
+  double worst_ssim;
+
+  ImageStat ssimg;
+  ImageStat fastssim;
+  ImageStat psnrhvs;
+
+  int b_calculate_ssimg;
+  int b_calculate_blockiness;
+
+  int b_calculate_consistency;
+
+  double total_inconsistency;
+  double worst_consistency;
+  Ssimv *ssim_vars;
+  Metrics metrics;
+#endif  // CONFIG_INTERNAL_STATS
+  int b_calculate_psnr;
+
+  int droppable;  // set when the coded frame is not a reference
+
+  int initial_width;
+  int initial_height;
+  int initial_mbs;  // Number of MBs in the full-size frame; to be used to
+                    // normalize the firstpass stats. This will differ from the
+                    // number of MBs in the current frame when the frame is
+                    // scaled.
+
+  // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type.
+  diff *source_diff_var;
+  // The threshold used in SOURCE_VAR_BASED_PARTITION search type.
+  unsigned int source_var_thresh;
+  int frames_till_next_var_check;
+
+  int frame_flags;  // caller's flags for the current frame
+
+  search_site_config ss_cfg;
+
+  int mbmode_cost[INTRA_MODES];
+  unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
+  int intra_uv_mode_cost[INTRA_MODES][INTRA_MODES];
+  int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+  int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+  int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
+
+  int multi_arf_allowed;  // 1 only for pass 2 with enable_auto_arf > 1
+  int multi_arf_enabled;
+  int multi_arf_last_grp_enabled;
+
+  int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
+  int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES];
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  VP9_DENOISER denoiser;
+#endif
+
+  int resize_pending;
+  int resize_state;
+  int resize_scale_num;
+  int resize_scale_den;
+  int resize_avg_qp;
+  int resize_buffer_underflow;
+  int resize_count;
+
+  // VAR_BASED_PARTITION thresholds
+  // 0 - threshold_64x64; 1 - threshold_32x32;
+  // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+  int64_t vbp_thresholds[4];
+  int64_t vbp_threshold_minmax;
+  int64_t vbp_threshold_sad;
+  BLOCK_SIZE vbp_bsize_min;
+
+  // Multi-threading
+  int num_workers;
+  VPxWorker *workers;
+  struct EncWorkerData *tile_thr_data;
+  VP9LfSync lf_row_sync;
+} VP10_COMP;
+
+void vp10_initialize_enc(void);
+
+struct VP10_COMP *vp10_create_compressor(VP10EncoderConfig *oxcf,
+                                       BufferPool *const pool);
+void vp10_remove_compressor(VP10_COMP *cpi);
+
+void vp10_change_config(VP10_COMP *cpi, const VP10EncoderConfig *oxcf);
+
+// Receives a frame's worth of data. The caller can assume that a copy of this
+// frame is made, not just a copy of the pointer.
+int vp10_receive_raw_frame(VP10_COMP *cpi, unsigned int frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time_stamp);
+// Encodes the next frame; returns 0, or -1 if no frame could be encoded.
+int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
+                            size_t *size, uint8_t *dest,
+                            int64_t *time_stamp, int64_t *time_end, int flush);
+// Returns -1 if the current frame is not shown.
+int vp10_get_preview_raw_frame(VP10_COMP *cpi, YV12_BUFFER_CONFIG *dest,
+                              vp10_ppflags_t *flags);
+
+int vp10_use_as_reference(VP10_COMP *cpi, int ref_frame_flags);
+
+void vp10_update_reference(VP10_COMP *cpi, int ref_frame_flags);
+
+int vp10_copy_reference_enc(VP10_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                           YV12_BUFFER_CONFIG *sd);
+
+int vp10_set_reference_enc(VP10_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                          YV12_BUFFER_CONFIG *sd);
+
+int vp10_update_entropy(VP10_COMP *cpi, int update);
+
+int vp10_set_active_map(VP10_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int vp10_get_active_map(VP10_COMP *cpi, unsigned char *map, int rows, int cols);
+// Returns -1 if either scaling mode is above ONETWO, 0 otherwise.
+int vp10_set_internal_size(VP10_COMP *cpi,
+                          VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
+// A zero width or height leaves that dimension unchanged; returns 0.
+int vp10_set_size_literal(VP10_COMP *cpi, unsigned int width,
+                         unsigned int height);
+// Returns cpi->common.base_qindex.
+int vp10_get_quantizer(struct VP10_COMP *cpi);
+
+static INLINE int frame_is_kf_gf_arf(const VP10_COMP *cpi) {
+  // A golden refresh counts only if the source is not itself the alt-ref.
+  const int gf = cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref;
+  return gf || cpi->refresh_alt_ref_frame || frame_is_intra_only(&cpi->common);
+}
+
+// Maps a reference frame to its slot index: LAST and GOLDEN have dedicated
+// slots, every other value resolves to the ALTREF slot.
+static INLINE int get_ref_frame_map_idx(const VP10_COMP *cpi,
+                                        MV_REFERENCE_FRAME ref_frame) {
+  switch (ref_frame) {
+    case LAST_FRAME: return cpi->lst_fb_idx;
+    case GOLDEN_FRAME: return cpi->gld_fb_idx;
+    default: return cpi->alt_fb_idx;
+  }
+}
+
+static INLINE int get_ref_frame_buf_idx(const VP10_COMP *const cpi,
+                                        int ref_frame) {
+  const int slot = get_ref_frame_map_idx(cpi, ref_frame);
+  if (slot == INVALID_IDX) return INVALID_IDX;  // reference slot unassigned
+  return cpi->common.ref_frame_map[slot];
+}
+
+// Returns the pool buffer backing |ref_frame|, or NULL if it has none.
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
+    VP10_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+  const int pool_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+  if (pool_idx == INVALID_IDX) return NULL;
+  return &cpi->common.buffer_pool->frame_bufs[pool_idx].buf;
+}
+
+static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
+  // TODO(JBB): double check we can't exceed this token count if we have a
+  // 32x32 transform crossing a boundary at a multiple of 16.
+  // Per 16x16 MB and plane: up to one token per pixel plus head room of one
+  // EOSB token per 8x8 block; three planes assumed at full resolution.
+  const int tokens_per_mb_plane = 16 * 16 + 4;
+  return mb_rows * mb_cols * tokens_per_mb_plane * 3;
+}
+
+// Get the allocated token size for a tile. It does the same calculation as in
+// the frame token allocation.
+static INLINE int allocated_tokens(TileInfo tile) {
+  // Halve the mi extents, rounding up, before sizing the token buffer.
+  const int mi_rows = tile.mi_row_end - tile.mi_row_start;
+  const int mi_cols = tile.mi_col_end - tile.mi_col_start;
+  return get_token_alloc((mi_rows + 1) >> 1, (mi_cols + 1) >> 1);
+}
+
+int64_t vp10_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp10_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp10_alloc_compressor_data(VP10_COMP *cpi);
+
+void vp10_scale_references(VP10_COMP *cpi);
+
+void vp10_update_reference_frames(VP10_COMP *cpi);
+
+void vp10_set_high_precision_mv(VP10_COMP *cpi, int allow_high_precision_mv);
+
+YV12_BUFFER_CONFIG *vp10_scale_if_required_fast(VP10_COMMON *cm,
+                                               YV12_BUFFER_CONFIG *unscaled,
+                                               YV12_BUFFER_CONFIG *scaled);
+
+YV12_BUFFER_CONFIG *vp10_scale_if_required(VP10_COMMON *cm,
+                                          YV12_BUFFER_CONFIG *unscaled,
+                                          YV12_BUFFER_CONFIG *scaled);
+// Applies the per-frame VP8_EFLAG_* reference, update and entropy flags.
+void vp10_apply_encoding_flags(VP10_COMP *cpi, vpx_enc_frame_flags_t flags);
+
+static INLINE int is_altref_enabled(const VP10_COMP *const cpi) {
+  if (cpi->oxcf.mode == REALTIME || cpi->oxcf.lag_in_frames <= 0) return 0;
+  return cpi->oxcf.enable_auto_arf != 0;  // ARFs need lag and non-realtime
+}
+
+static INLINE void set_ref_ptrs(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                MV_REFERENCE_FRAME ref0,
+                                MV_REFERENCE_FRAME ref1) {
+  const int idx0 = ref0 >= LAST_FRAME ? ref0 - LAST_FRAME : 0;  // else slot 0
+  const int idx1 = ref1 >= LAST_FRAME ? ref1 - LAST_FRAME : 0;
+  xd->block_refs[0] = &cm->frame_refs[idx0];
+  xd->block_refs[1] = &cm->frame_refs[idx1];
+}
+
+static INLINE int get_chessboard_index(const int frame_index) {
+  return (frame_index & 1);  // lowest bit of the frame index
+}
+
+static INLINE int *cond_cost_list(const struct VP10_COMP *cpi, int *cost_list) {
+  return cpi->sf.mv.subpel_search_method == SUBPEL_TREE ? NULL : cost_list;
+}
+
+void vp10_new_framerate(VP10_COMP *cpi, double framerate);
+// Row-major index of (sl, tl) in a table with num_tl entries per sl row.
+#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_ENCODER_H_
diff --git a/libs/libvpx/vp10/encoder/ethread.c b/libs/libvpx/vp10/encoder/ethread.c
new file mode 100644
index 0000000000..ad47ccf043
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/ethread.c
@@ -0,0 +1,168 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/encoder/encodeframe.h"
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/ethread.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Adds the rate-distortion counters collected in td_t into td. Only the
+// rd_counts member of td is written; td_t is read but not modified.
+static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+  int tx, plane, ref, band, ctx, tok;
+  int idx;
+
+  // Counts of all motion searches and exhaustive mesh searches.
+  td->rd_counts.m_search_count += td_t->rd_counts.m_search_count;
+  td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count;
+
+  for (idx = 0; idx < REFERENCE_MODES; ++idx) {
+    td->rd_counts.comp_pred_diff[idx] += td_t->rd_counts.comp_pred_diff[idx];
+  }
+
+  for (idx = 0; idx < SWITCHABLE_FILTER_CONTEXTS; ++idx) {
+    td->rd_counts.filter_diff[idx] += td_t->rd_counts.filter_diff[idx];
+  }
+
+  // Element-wise sum over every dimension of the coefficient count table.
+  for (tx = 0; tx < TX_SIZES; ++tx) {
+    for (plane = 0; plane < PLANE_TYPES; ++plane) {
+      for (ref = 0; ref < REF_TYPES; ++ref) {
+        for (band = 0; band < COEF_BANDS; ++band) {
+          for (ctx = 0; ctx < COEFF_CONTEXTS; ++ctx) {
+            for (tok = 0; tok < ENTROPY_TOKENS; ++tok) {
+              td->rd_counts.coef_counts[tx][plane][ref][band][ctx][tok] +=
+                  td_t->rd_counts.coef_counts[tx][plane][ref][band][ctx][tok];
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Worker entry point: encodes every tile whose linear index equals
+// thread_data->start plus a multiple of cpi->num_workers. The linear index is
+// split into a row and a column using the number of tile columns. The second
+// argument is ignored. Always returns 0.
+static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
+  VP10_COMP *const cpi = thread_data->cpi;
+  const VP10_COMMON *const cm = &cpi->common;
+  const int cols = 1 << cm->log2_tile_cols;
+  const int rows = 1 << cm->log2_tile_rows;
+  const int tile_count = rows * cols;
+  int tile = thread_data->start;
+
+  (void) unused;
+
+  while (tile < tile_count) {
+    vp10_encode_tile(cpi, thread_data->td, tile / cols, tile % cols);
+    tile += cpi->num_workers;
+  }
+
+  return 0;
+}
+
+// Encodes the tiles of the current frame with a pool of workers. Worker i is
+// given start tile i, and enc_worker_hook() then advances it by
+// cpi->num_workers tiles at a time. On the first call (cpi->num_workers == 0)
+// the worker array and the per-worker thread data are allocated; later calls
+// reuse them. After all workers have been synced, the frame counts and
+// rate-distortion counters of the non-main workers are accumulated.
+void vp10_encode_tiles_mt(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  // Never use more workers than tile columns or than oxcf.max_threads.
+  const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
+  int i;
+
+  vp10_init_tile_data(cpi);
+
+  // Only run once to create threads and allocate thread data.
+  if (cpi->num_workers == 0) {
+    int allocated_workers = num_workers;
+
+    CHECK_MEM_ERROR(cm, cpi->workers,
+                    vpx_malloc(allocated_workers * sizeof(*cpi->workers)));
+
+    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+                    vpx_calloc(allocated_workers,
+                    sizeof(*cpi->tile_thr_data)));
+
+    for (i = 0; i < allocated_workers; i++) {
+      VPxWorker *const worker = &cpi->workers[i];
+      EncWorkerData *thread_data = &cpi->tile_thr_data[i];
+
+      // cpi->num_workers ends up equal to allocated_workers after this loop.
+      ++cpi->num_workers;
+      winterface->init(worker);
+
+      // Every slot except the last one gets its own ThreadData.
+      if (i < allocated_workers - 1) {
+        thread_data->cpi = cpi;
+
+        // Allocate thread data.
+        CHECK_MEM_ERROR(cm, thread_data->td,
+                        vpx_memalign(32, sizeof(*thread_data->td)));
+        vp10_zero(*thread_data->td);
+
+        // Set up pc_tree.
+        thread_data->td->leaf_tree = NULL;
+        thread_data->td->pc_tree = NULL;
+        vp10_setup_pc_tree(cm, thread_data->td);
+
+        // Allocate frame counters in thread data.
+        CHECK_MEM_ERROR(cm, thread_data->td->counts,
+                        vpx_calloc(1, sizeof(*thread_data->td->counts)));
+
+        // Create threads
+        if (!winterface->reset(worker))
+          vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                             "Tile encoder thread creation failed");
+      } else {
+        // Main thread acts as a worker and uses the thread data in cpi.
+        thread_data->cpi = cpi;
+        thread_data->td = &cpi->td;
+      }
+
+      winterface->sync(worker);
+    }
+  }
+
+  // NOTE(review): the loops below iterate over the local num_workers, while
+  // the array sizes and the "main thread" slot (index cpi->num_workers - 1)
+  // were fixed by the first call. This presumably relies on num_workers being
+  // the same on every call -- confirm that max_threads and the tile column
+  // count cannot change between frames.
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *thread_data;
+
+    worker->hook = (VPxWorkerHook)enc_worker_hook;
+    worker->data1 = &cpi->tile_thr_data[i];
+    worker->data2 = NULL;
+    thread_data = (EncWorkerData*)worker->data1;
+
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+      thread_data->td->rd_counts = cpi->td.rd_counts;
+    }
+    // Skip the copy when the worker's counts already alias cpi->common.counts.
+    if (thread_data->td->counts != &cpi->common.counts) {
+      memcpy(thread_data->td->counts, &cpi->common.counts,
+             sizeof(cpi->common.counts));
+    }
+  }
+
+  // Encode a frame
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
+    // The last allocated slot is the one that uses cpi->td (see above).
+    // Presumably execute() runs the hook on the calling thread while launch()
+    // hands it to the worker's own thread -- confirm against the
+    // VPxWorkerInterface documentation.
+    if (i == cpi->num_workers - 1)
+      winterface->execute(worker);
+    else
+      winterface->launch(worker);
+  }
+
+  // Encoding ends.
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    winterface->sync(worker);
+  }
+
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+
+    // Accumulate counters.
+    // The last slot works directly on cpi->td, so it has nothing to merge.
+    if (i < cpi->num_workers - 1) {
+      vp10_accumulate_frame_counts(cm, thread_data->td->counts, 0);
+      accumulate_rd_opt(&cpi->td, thread_data->td);
+    }
+  }
+}
diff --git a/libs/libvpx/vp10/encoder/ethread.h b/libs/libvpx/vp10/encoder/ethread.h
new file mode 100644
index 0000000000..d72816cd5c
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/ethread.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_ETHREAD_H_
+#define VP10_ENCODER_ETHREAD_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP10_COMP;
+struct ThreadData;
+
+// Per-worker argument block used by the multi-threaded tile encoder.
+typedef struct EncWorkerData {
+  // Encoder instance the worker operates on.
+  struct VP10_COMP *cpi;
+  // Thread data the worker encodes with.
+  struct ThreadData *td;
+  // Linear index of the first tile this worker encodes.
+  int start;
+} EncWorkerData;
+
+// Encodes the tiles of the current frame using multiple workers.
+void vp10_encode_tiles_mt(struct VP10_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_ETHREAD_H_
diff --git a/libs/libvpx/vp10/encoder/extend.c b/libs/libvpx/vp10/encoder/extend.c
new file mode 100644
index 0000000000..4c8ce3b572
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/extend.c
@@ -0,0 +1,201 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp10/common/common.h"
+#include "vp10/encoder/extend.h"
+
+// Copies a w x h block of 8-bit samples from src to dst and pads it on all
+// four sides by replicating the edge samples.
+//   src, src_pitch  source block and its row pitch
+//   dst, dst_pitch  destination of the block's top-left sample and row pitch
+//   extend_*        number of border rows/columns to write on each side
+// The border is written outside the w x h area of dst, so dst must have that
+// much room around it.
+static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
+                                  uint8_t *dst, int dst_pitch,
+                                  int w, int h,
+                                  int extend_top, int extend_left,
+                                  int extend_bottom, int extend_right) {
+  const int linesize = extend_left + w + extend_right;
+  // Start of the widened first row, i.e. including the left border.
+  uint8_t *const left_edge = dst - extend_left;
+  const uint8_t *const last_row = left_edge + dst_pitch * (h - 1);
+  const uint8_t *in_row = src;
+  uint8_t *out_row = left_edge;
+  uint8_t *top = left_edge - dst_pitch * extend_top;
+  uint8_t *bottom = left_edge + dst_pitch * h;
+  int r;
+
+  // Pass 1: for every row write left border, payload, right border.
+  for (r = 0; r < h; ++r) {
+    memset(out_row, in_row[0], extend_left);
+    memcpy(out_row + extend_left, in_row, w);
+    memset(out_row + extend_left + w, in_row[w - 1], extend_right);
+    in_row += src_pitch;
+    out_row += dst_pitch;
+  }
+
+  // Pass 2: replicate the widened first row upwards ...
+  for (r = 0; r < extend_top; ++r) {
+    memcpy(top, left_edge, linesize);
+    top += dst_pitch;
+  }
+
+  // ... and the widened last row downwards.
+  for (r = 0; r < extend_bottom; ++r) {
+    memcpy(bottom, last_row, linesize);
+    bottom += dst_pitch;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// 16-bit variant of copy_and_extend_plane(): src8 and dst8 are converted with
+// CONVERT_TO_SHORTPTR and handled as uint16_t samples. Copies a w x h block
+// and pads it on all four sides by replicating the edge samples. Pitches and
+// extents are used as counts of 16-bit samples.
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+                                         uint8_t *dst8, int dst_pitch,
+                                         int w, int h,
+                                         int extend_top, int extend_left,
+                                         int extend_bottom, int extend_right) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  const int linesize = extend_left + w + extend_right;
+  // Start of the widened first row, i.e. including the left border.
+  uint16_t *const left_edge = dst - extend_left;
+  const uint16_t *const last_row = left_edge + dst_pitch * (h - 1);
+  const uint16_t *in_row = src;
+  uint16_t *out_row = left_edge;
+  uint16_t *top = left_edge - dst_pitch * extend_top;
+  uint16_t *bottom = left_edge + dst_pitch * h;
+  int r;
+
+  // Pass 1: for every row write left border, payload, right border.
+  for (r = 0; r < h; ++r) {
+    vpx_memset16(out_row, in_row[0], extend_left);
+    memcpy(out_row + extend_left, in_row, w * sizeof(in_row[0]));
+    vpx_memset16(out_row + extend_left + w, in_row[w - 1], extend_right);
+    in_row += src_pitch;
+    out_row += dst_pitch;
+  }
+
+  // Pass 2: replicate the widened first row upwards ...
+  for (r = 0; r < extend_top; ++r) {
+    memcpy(top, left_edge, linesize * sizeof(left_edge[0]));
+    top += dst_pitch;
+  }
+
+  // ... and the widened last row downwards.
+  for (r = 0; r < extend_bottom; ++r) {
+    memcpy(bottom, last_row, linesize * sizeof(last_row[0]));
+    bottom += dst_pitch;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Copies the cropped Y, U and V planes of src into dst and replicates the
+// edge samples into a border around each plane. When the source is flagged
+// YV12_FLAG_HIGHBITDEPTH (and high bit depth is compiled in), the 16-bit plane
+// routine is used instead of the 8-bit one.
+void vp10_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst) {
+  // Altref filtering assumes a 16 pixel extension above and to the left.
+  const int top_y = 16;
+  const int left_y = 16;
+  // Motion estimation may use src block variance with block sizes up to
+  // 64x64, so the right and bottom edges reach the next multiple of 64 or
+  // 16 pixels past the frame, whichever is greater.
+  const int padded_w =
+      VPXMAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6));
+  const int padded_h =
+      VPXMAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6));
+  const int right_y = padded_w - src->y_crop_width;
+  const int bottom_y = padded_h - src->y_crop_height;
+  // 1 when the chroma plane is smaller than luma in that dimension, else 0.
+  const int ss_x = (src->uv_width != src->y_width);
+  const int ss_y = (src->uv_height != src->y_height);
+  // Chroma borders are the luma borders shifted by the subsampling.
+  const int top_uv = top_y >> ss_y;
+  const int left_uv = left_y >> ss_x;
+  const int bottom_uv = bottom_y >> ss_y;
+  const int right_uv = right_y >> ss_x;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+    highbd_copy_and_extend_plane(src->y_buffer, src->y_stride,
+                                 dst->y_buffer, dst->y_stride,
+                                 src->y_crop_width, src->y_crop_height,
+                                 top_y, left_y, bottom_y, right_y);
+
+    highbd_copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                                 dst->u_buffer, dst->uv_stride,
+                                 src->uv_crop_width, src->uv_crop_height,
+                                 top_uv, left_uv, bottom_uv, right_uv);
+
+    highbd_copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                                 dst->v_buffer, dst->uv_stride,
+                                 src->uv_crop_width, src->uv_crop_height,
+                                 top_uv, left_uv, bottom_uv, right_uv);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  copy_and_extend_plane(src->y_buffer, src->y_stride,
+                        dst->y_buffer, dst->y_stride,
+                        src->y_crop_width, src->y_crop_height,
+                        top_y, left_y, bottom_y, right_y);
+
+  copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                        dst->u_buffer, dst->uv_stride,
+                        src->uv_crop_width, src->uv_crop_height,
+                        top_uv, left_uv, bottom_uv, right_uv);
+
+  copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                        dst->v_buffer, dst->uv_stride,
+                        src->uv_crop_width, src->uv_crop_height,
+                        top_uv, left_uv, bottom_uv, right_uv);
+}
+
+// Copies the rectangle (srcx, srcy, srcw, srch) of src into the same position
+// in dst. A side of the rectangle is only extended into the border when it
+// lies on the corresponding frame boundary. Chroma offsets and sizes are
+// derived by halving the luma values. Only the 8-bit plane routine is used.
+void vp10_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw) {
+  // Which frame boundaries does the rectangle touch?
+  const int at_top = (srcy == 0);
+  const int at_left = (srcx == 0);
+  const int at_bottom = (srcy + srch == src->y_height);
+  const int at_right = (srcx + srcw == src->y_width);
+
+  // Luma border sizes; zero for sides that are not on a frame boundary.
+  const int et_y = at_top ? dst->border : 0;
+  const int el_y = at_left ? dst->border : 0;
+  const int eb_y =
+      at_bottom ? dst->border + dst->y_height - src->y_height : 0;
+  const int er_y =
+      at_right ? dst->border + dst->y_width - src->y_width : 0;
+
+  // Chroma border sizes and rectangle size.
+  const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
+  const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
+  const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
+  const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
+  const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
+  const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
+
+  // Offsets of the rectangle's top-left sample inside each plane.
+  const int src_y_offset = srcy * src->y_stride + srcx;
+  const int dst_y_offset = srcy * dst->y_stride + srcx;
+  const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+  const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+
+  copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
+                        dst->y_buffer + dst_y_offset, dst->y_stride,
+                        srcw, srch,
+                        et_y, el_y, eb_y, er_y);
+
+  copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
+                        dst->u_buffer + dst_uv_offset, dst->uv_stride,
+                        srcw_uv, srch_uv,
+                        et_uv, el_uv, eb_uv, er_uv);
+
+  copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
+                        dst->v_buffer + dst_uv_offset, dst->uv_stride,
+                        srcw_uv, srch_uv,
+                        et_uv, el_uv, eb_uv, er_uv);
+}
diff --git a/libs/libvpx/vp10/encoder/extend.h b/libs/libvpx/vp10/encoder/extend.h
new file mode 100644
index 0000000000..6f502ef6a0
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/extend.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_EXTEND_H_
+#define VP10_ENCODER_EXTEND_H_
+
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// Copies the cropped Y, U and V planes of src into dst and replicates the
+// edge samples into a border around each plane.
+void vp10_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst);
+
+// Copies only the rectangle at (srcx, srcy) of size srcw x srch from src to
+// dst. A side is extended into the border only when it lies on the
+// corresponding frame boundary.
+void vp10_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_EXTEND_H_
diff --git a/libs/libvpx/vp10/encoder/firstpass.c b/libs/libvpx/vp10/encoder/firstpass.c
new file mode 100644
index 0000000000..bc1ce001bb
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/firstpass.c
@@ -0,0 +1,2667 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_scale/yv12config.h"
+
+#include "vp10/common/entropymv.h"
+#include "vp10/common/quant_common.h"
+#include "vp10/common/reconinter.h"  // vp10_setup_dst_planes()
+#include "vp10/encoder/aq_variance.h"
+#include "vp10/encoder/block.h"
+#include "vp10/encoder/encodeframe.h"
+#include "vp10/encoder/encodemb.h"
+#include "vp10/encoder/encodemv.h"
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/extend.h"
+#include "vp10/encoder/firstpass.h"
+#include "vp10/encoder/mcomp.h"
+#include "vp10/encoder/quantize.h"
+#include "vp10/encoder/rd.h"
+#include "vpx_dsp/variance.h"
+
+#define OUTPUT_FPF          0
+#define ARF_STATS_OUTPUT    0
+
+#define GROUP_ADAPTIVE_MAXQ 1
+
+#define BOOST_BREAKOUT      12.5
+#define BOOST_FACTOR        12.5
+#define ERR_DIVISOR         128.0
+#define FACTOR_PT_LOW       0.70
+#define FACTOR_PT_HIGH      0.90
+#define FIRST_PASS_Q        10.0
+#define GF_MAX_BOOST        96.0
+#define INTRA_MODE_PENALTY  1024
+#define KF_MAX_BOOST        128.0
+#define MIN_ARF_GF_BOOST    240
+#define MIN_DECAY_FACTOR    0.01
+#define MIN_KF_BOOST        300
+#define NEW_MV_MODE_PENALTY 32
+#define DARK_THRESH         64
+#define DEFAULT_GRP_WEIGHT  1.0
+#define RC_FACTOR_MIN       0.75
+#define RC_FACTOR_MAX       1.75
+
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+#define NCOUNT_FRAME_II_THRESH 5.0
+
+// Moves x away from zero by 0.000001 (negative values become more negative,
+// zero and positive values become more positive) so that the result is safe
+// to use as a divisor.
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
+
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+
+// Moves the first-pass stats read cursor (p->stats_in) to the given entry.
+// No file I/O takes place: the stats are an in-memory array and this only
+// stores the pointer.
+static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) {
+  p->stats_in = position;
+}
+
+// Returns the stats entry located offset entries away from the current read
+// position without moving that position. Returns NULL when a non-negative
+// offset lands at or beyond stats_in_end, or a negative offset lands before
+// stats_in_start.
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
+  const FIRSTPASS_STATS *const target = p->stats_in + offset;
+
+  if (offset >= 0) {
+    if (target >= p->stats_in_end)
+      return NULL;
+  } else if (target < p->stats_in_start) {
+    return NULL;
+  }
+
+  return target;
+}
+
+// Copies the entry at the current read position into *fps and advances the
+// position by one. Returns 1 on success, or EOF (leaving *fps and the
+// position untouched) when the position is already at or past stats_in_end.
+static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
+  if (p->stats_in < p->stats_in_end) {
+    *fps = *p->stats_in;
+    ++p->stats_in;
+    return 1;
+  }
+
+  return EOF;
+}
+
+// Adds a VPX_CODEC_STATS_PKT packet whose buffer points at stats (size
+// sizeof(FIRSTPASS_STATS)) to pktlist. When OUTPUT_FPF is enabled the same
+// stats are also appended as one text line to "firstpass.stt" for debugging.
+static void output_stats(FIRSTPASS_STATS *stats,
+                         struct vpx_codec_pkt_list *pktlist) {
+  struct vpx_codec_cx_pkt pkt;
+  pkt.kind = VPX_CODEC_STATS_PKT;
+  pkt.data.twopass_stats.buf = stats;
+  pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+  vpx_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#if OUTPUT_FPF
+  {
+    FILE *fpfile = fopen("firstpass.stt", "a");
+
+    // The dump is best effort: skip it if the file cannot be opened instead
+    // of passing a NULL stream to fprintf().
+    if (fpfile != NULL) {
+      // Exactly one conversion per argument (22 in total), in argument
+      // order, with a space between every pair of columns. The previous
+      // format had only 21 conversions, so stats->duration was never written
+      // and the precisions of the trailing columns were shifted by one.
+      fprintf(fpfile,
+              "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf "
+              "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf "
+              "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf "
+              "%12.4lf\n",
+              stats->frame,
+              stats->weight,
+              stats->intra_error,
+              stats->coded_error,
+              stats->sr_coded_error,
+              stats->pcnt_inter,
+              stats->pcnt_motion,
+              stats->pcnt_second_ref,
+              stats->pcnt_neutral,
+              stats->intra_skip_pct,
+              stats->inactive_zone_rows,
+              stats->inactive_zone_cols,
+              stats->MVr,
+              stats->mvr_abs,
+              stats->MVc,
+              stats->mvc_abs,
+              stats->MVrv,
+              stats->MVcv,
+              stats->mv_in_out_count,
+              stats->new_mv_count,
+              stats->count,
+              stats->duration);
+      fclose(fpfile);
+    }
+  }
+#endif
+}
+
+#if CONFIG_FP_MB_STATS
+// Adds a VPX_CODEC_FPMB_STATS_PKT packet to pktlist. The packet's buffer
+// points at this_frame_mb_stats and its size is cm->initial_mbs bytes.
+static void output_fpmb_stats(uint8_t *this_frame_mb_stats,
+                              VP10_COMMON *cm,
+                              struct vpx_codec_pkt_list *pktlist) {
+  struct vpx_codec_cx_pkt mb_pkt;
+
+  mb_pkt.data.firstpass_mb_stats.sz = cm->initial_mbs * sizeof(uint8_t);
+  mb_pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
+  mb_pkt.kind = VPX_CODEC_FPMB_STATS_PKT;
+
+  vpx_codec_pkt_list_add(pktlist, &mb_pkt);
+}
+#endif
+
+// Resets a stats record to its neutral state: every field listed below is
+// set to 0.0 except duration, which is set to 1.0.
+static void zero_stats(FIRSTPASS_STATS *section) {
+  // Frame identity and weighting.
+  section->frame = 0.0;
+  section->weight = 0.0;
+
+  // Error terms.
+  section->intra_error = 0.0;
+  section->coded_error = 0.0;
+  section->sr_coded_error = 0.0;
+
+  // Percentages and inactive-zone measures.
+  section->pcnt_inter = 0.0;
+  section->pcnt_motion = 0.0;
+  section->pcnt_second_ref = 0.0;
+  section->pcnt_neutral = 0.0;
+  section->intra_skip_pct = 0.0;
+  section->inactive_zone_rows = 0.0;
+  section->inactive_zone_cols = 0.0;
+
+  // Motion vector statistics.
+  section->MVr = 0.0;
+  section->mvr_abs = 0.0;
+  section->MVc = 0.0;
+  section->mvc_abs = 0.0;
+  section->MVrv = 0.0;
+  section->MVcv = 0.0;
+  section->mv_in_out_count = 0.0;
+  section->new_mv_count = 0.0;
+
+  // Bookkeeping. Note the non-zero duration.
+  section->count = 0.0;
+  section->duration = 1.0;
+}
+
+// Adds every field of frame to the corresponding field of section.
+static void accumulate_stats(FIRSTPASS_STATS *section,
+                             const FIRSTPASS_STATS *frame) {
+  // Frame identity and weighting.
+  section->frame += frame->frame;
+  section->weight += frame->weight;
+
+  // Error terms.
+  section->intra_error += frame->intra_error;
+  section->coded_error += frame->coded_error;
+  section->sr_coded_error += frame->sr_coded_error;
+
+  // Percentages and inactive-zone measures.
+  section->pcnt_inter += frame->pcnt_inter;
+  section->pcnt_motion += frame->pcnt_motion;
+  section->pcnt_second_ref += frame->pcnt_second_ref;
+  section->pcnt_neutral += frame->pcnt_neutral;
+  section->intra_skip_pct += frame->intra_skip_pct;
+  section->inactive_zone_rows += frame->inactive_zone_rows;
+  section->inactive_zone_cols += frame->inactive_zone_cols;
+
+  // Motion vector statistics.
+  section->MVr += frame->MVr;
+  section->mvr_abs += frame->mvr_abs;
+  section->MVc += frame->MVc;
+  section->mvc_abs += frame->mvc_abs;
+  section->MVrv += frame->MVrv;
+  section->MVcv += frame->MVcv;
+  section->mv_in_out_count += frame->mv_in_out_count;
+  section->new_mv_count += frame->new_mv_count;
+
+  // Bookkeeping.
+  section->count += frame->count;
+  section->duration += frame->duration;
+}
+
+// Subtracts every field of frame from the corresponding field of section;
+// the inverse of accumulate_stats().
+static void subtract_stats(FIRSTPASS_STATS *section,
+                           const FIRSTPASS_STATS *frame) {
+  // Frame identity and weighting.
+  section->frame -= frame->frame;
+  section->weight -= frame->weight;
+
+  // Error terms.
+  section->intra_error -= frame->intra_error;
+  section->coded_error -= frame->coded_error;
+  section->sr_coded_error -= frame->sr_coded_error;
+
+  // Percentages and inactive-zone measures.
+  section->pcnt_inter -= frame->pcnt_inter;
+  section->pcnt_motion -= frame->pcnt_motion;
+  section->pcnt_second_ref -= frame->pcnt_second_ref;
+  section->pcnt_neutral -= frame->pcnt_neutral;
+  section->intra_skip_pct -= frame->intra_skip_pct;
+  section->inactive_zone_rows -= frame->inactive_zone_rows;
+  section->inactive_zone_cols -= frame->inactive_zone_cols;
+
+  // Motion vector statistics.
+  section->MVr -= frame->MVr;
+  section->mvr_abs -= frame->mvr_abs;
+  section->MVc -= frame->MVc;
+  section->mvc_abs -= frame->mvc_abs;
+  section->MVrv -= frame->MVrv;
+  section->MVcv -= frame->MVcv;
+  section->mv_in_out_count -= frame->mv_in_out_count;
+  section->new_mv_count -= frame->new_mv_count;
+
+  // Bookkeeping.
+  section->count -= frame->count;
+  section->duration -= frame->duration;
+}
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other 0 energy areas.
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+// Returns the fraction of the frame treated as active, clamped to
+// [MIN_ACTIVE_AREA, MAX_ACTIVE_AREA]. Two shares are removed from 1.0: half
+// of intra_skip_pct, and twice inactive_zone_rows divided by the number of
+// macroblock rows.
+static double calculate_active_area(const VP10_COMP *cpi,
+                                    const FIRSTPASS_STATS *this_frame) {
+  const double skip_share = this_frame->intra_skip_pct / 2;
+  const double inactive_share =
+      (this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows;
+
+  return fclamp(1.0 - (skip_share + inactive_share),
+                MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
+
+// Calculate a modified Error used in distributing bits between easier and
+// harder frames.
+#define ACT_AREA_CORRECTION 0.5
+static double calculate_modified_err(const VP10_COMP *cpi,
+                                     const TWO_PASS *twopass,
+                                     const VP10EncoderConfig *oxcf,
+                                     const FIRSTPASS_STATS *this_frame) {
+  const FIRSTPASS_STATS *const stats = &twopass->total_stats;
+  const double av_weight = stats->weight / stats->count;
+  const double av_err = (stats->coded_error * av_weight) / stats->count;
+  double modified_error =
+    av_err * pow(this_frame->coded_error * this_frame->weight /
+                 DOUBLE_DIVIDE_CHECK(av_err), oxcf->two_pass_vbrbias / 100.0);
+
+  // Correction for active area. Frames with a reduced active area
+  // (eg due to formatting bars) have a higher error per mb for the
+  // remaining active MBs. The correction here assumes that coding
+  // 0.5N blocks of complexity 2X is a little easier than coding N
+  // blocks of complexity X.
+  modified_error *=
+    pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+
+  return fclamp(modified_error,
+                twopass->modified_error_min, twopass->modified_error_max);
+}
+
+// Returns the maximum target rate per frame: two_pass_vbrmax_section percent
+// of the average frame bandwidth, limited to [0, rc->max_frame_bandwidth].
+// The product is formed in 64 bits before dividing by 100.
+static int frame_max_bits(const RATE_CONTROL *rc,
+                          const VP10EncoderConfig *oxcf) {
+  const int64_t scaled = ((int64_t)rc->avg_frame_bandwidth *
+                              (int64_t)oxcf->two_pass_vbrmax_section) / 100;
+
+  if (scaled < 0)
+    return 0;
+  if (scaled > rc->max_frame_bandwidth)
+    return (int)rc->max_frame_bandwidth;
+
+  return (int)scaled;
+}
+
+// Prepares the first pass by resetting the accumulated total stats.
+void vp10_init_first_pass(VP10_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->twopass;
+
+  zero_stats(&twopass->total_stats);
+}
+
+// Finishes the first pass by emitting the accumulated total stats as a stats
+// packet on the encoder's output packet list.
+void vp10_end_first_pass(VP10_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->twopass;
+
+  output_stats(&twopass->total_stats, cpi->output_pkt_list);
+}
+
+// Selects the MSE function matching bsize. BLOCK_8X8, BLOCK_16X8 and
+// BLOCK_8X16 have dedicated functions; every other size uses the 16x16 one.
+static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+  if (bsize == BLOCK_8X8)
+    return vpx_mse8x8;
+  if (bsize == BLOCK_16X8)
+    return vpx_mse16x8;
+  if (bsize == BLOCK_8X16)
+    return vpx_mse8x16;
+
+  return vpx_mse16x16;
+}
+
+// Returns the sum of squared errors between the src and ref blocks, as
+// written to the sse output of the MSE function chosen for bsize. The
+// function's own return value is not used.
+static unsigned int get_prediction_error(BLOCK_SIZE bsize,
+                                         const struct buf_2d *src,
+                                         const struct buf_2d *ref) {
+  unsigned int sse;
+
+  get_block_variance_fn(bsize)(src->buf, src->stride,
+                               ref->buf, ref->stride, &sse);
+  return sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Selects the high-bit-depth MSE function for bsize and bit depth bd.
+// bd == 10 and bd == 12 pick the 10- and 12-bit families; any other value
+// picks the 8-bit family. Within a family, BLOCK_8X8, BLOCK_16X8 and
+// BLOCK_8X16 have dedicated functions and every other size uses 16x16.
+static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+                                                      int bd) {
+  if (bd == 10) {
+    if (bsize == BLOCK_8X8)
+      return vpx_highbd_10_mse8x8;
+    if (bsize == BLOCK_16X8)
+      return vpx_highbd_10_mse16x8;
+    if (bsize == BLOCK_8X16)
+      return vpx_highbd_10_mse8x16;
+    return vpx_highbd_10_mse16x16;
+  }
+
+  if (bd == 12) {
+    if (bsize == BLOCK_8X8)
+      return vpx_highbd_12_mse8x8;
+    if (bsize == BLOCK_16X8)
+      return vpx_highbd_12_mse16x8;
+    if (bsize == BLOCK_8X16)
+      return vpx_highbd_12_mse8x16;
+    return vpx_highbd_12_mse16x16;
+  }
+
+  // Any other bit depth falls back to the 8-bit functions.
+  if (bsize == BLOCK_8X8)
+    return vpx_highbd_8_mse8x8;
+  if (bsize == BLOCK_16X8)
+    return vpx_highbd_8_mse16x8;
+  if (bsize == BLOCK_8X16)
+    return vpx_highbd_8_mse8x16;
+  return vpx_highbd_8_mse16x16;
+}
+
+// High-bit-depth counterpart of get_prediction_error(): returns the sum of
+// squared errors between the src and ref blocks, as written to the sse
+// output of the MSE function chosen for bsize and bit depth bd.
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+                                                const struct buf_2d *src,
+                                                const struct buf_2d *ref,
+                                                int bd) {
+  unsigned int sse;
+
+  highbd_get_block_variance_fn(bsize, bd)(src->buf, src->stride,
+                                          ref->buf, ref->stride, &sse);
+  return sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Refines the motion search range for the first pass from the frame size.
+// Returns the smallest shift sr for which the smaller of the initial width
+// and height, shifted left by sr, is no longer below MAX_FULL_PEL_VAL.
+// NOTE(review): the loop condition can never become false if that dimension
+// is 0; presumably callers guarantee positive dimensions -- confirm.
+static int get_search_range(const VP10_COMP *cpi) {
+  const int dim = VPXMIN(cpi->initial_width, cpi->initial_height);
+  int sr;
+
+  for (sr = 0; (dim << sr) < MAX_FULL_PEL_VAL; ++sr) {
+  }
+
+  return sr;
+}
+
+// Runs the first-pass motion search for the current block of x around ref_mv.
+// A series of diamond searches is performed with increasing step parameter.
+// Each result is re-scored with vp10_get_mvpred_var() using an MSE function
+// and NEW_MV_MODE_PENALTY is added to the score. *best_mv and
+// *best_motion_err are only updated when a search beats the incoming value
+// of *best_motion_err.
+static void first_pass_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
+                                     const MV *ref_mv, MV *best_mv,
+                                     int *best_motion_err) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV tmp_mv = {0, 0};
+  // NOTE(review): the >> 3 presumably converts 1/8-pel units to full-pel
+  // units -- confirm against the MV definition.
+  MV ref_mv_full = {ref_mv->row >> 3, ref_mv->col >> 3};
+  int num00, tmp_err, n;
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  // Local copy, so overriding vf below does not modify cpi->fn_ptr.
+  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+  const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
+
+  // Raise the initial step parameter by the search range and reduce the
+  // number of further steps by the same amount.
+  int step_param = 3;
+  int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+  const int sr = get_search_range(cpi);
+  step_param += sr;
+  further_steps -= sr;
+
+  // Override the default variance function to use MSE.
+  v_fn_ptr.vf = get_block_variance_fn(bsize);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Center the initial step/diamond search on best mv.
+  tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
+                                    step_param,
+                                    x->sadperbit16, &num00, &v_fn_ptr, ref_mv);
+  // INT_MAX is left untouched; any other result is re-scored.
+  if (tmp_err < INT_MAX)
+    tmp_err = vp10_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
+  // Add the penalty only when it cannot overflow int.
+  if (tmp_err < INT_MAX - new_mv_mode_penalty)
+    tmp_err += new_mv_mode_penalty;
+
+  if (tmp_err < *best_motion_err) {
+    *best_motion_err = tmp_err;
+    *best_mv = tmp_mv;
+  }
+
+  // Carry out further step/diamond searches as necessary.
+  n = num00;
+  num00 = 0;
+
+  while (n < further_steps) {
+    ++n;
+
+    // A non-zero num00 from the last search makes the next num00 steps be
+    // skipped without searching.
+    if (num00) {
+      --num00;
+    } else {
+      tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
+                                        step_param + n, x->sadperbit16,
+                                        &num00, &v_fn_ptr, ref_mv);
+      if (tmp_err < INT_MAX)
+        tmp_err = vp10_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
+      if (tmp_err < INT_MAX - new_mv_mode_penalty)
+        tmp_err += new_mv_mode_penalty;
+
+      if (tmp_err < *best_motion_err) {
+        *best_motion_err = tmp_err;
+        *best_mv = tmp_mv;
+      }
+    }
+  }
+}
+
+// Picks the block size for the macroblock at (mb_row, mb_col). The width is
+// 16 only when 2 * mb_col + 1 is still below cm->mi_cols, otherwise 8. The
+// height is chosen the same way from mb_row and cm->mi_rows.
+static BLOCK_SIZE get_bsize(const VP10_COMMON *cm, int mb_row, int mb_col) {
+  const int full_width = (2 * mb_col + 1 < cm->mi_cols);
+  const int full_height = (2 * mb_row + 1 < cm->mi_rows);
+
+  if (full_width)
+    return full_height ? BLOCK_16X16 : BLOCK_16X8;
+
+  return full_height ? BLOCK_8X16 : BLOCK_8X8;
+}
+
+// Returns the smallest qindex whose converted quantizer value, for the given
+// bit depth, is at least FIRST_PASS_Q. If no index qualifies, the last valid
+// index (QINDEX_RANGE - 1) is returned instead.
+static int find_fp_qindex(vpx_bit_depth_t bit_depth) {
+  int qindex = 0;
+
+  while (qindex < QINDEX_RANGE &&
+         !(vp10_convert_qindex_to_q(qindex, bit_depth) >= FIRST_PASS_Q)) {
+    ++qindex;
+  }
+
+  // Ran off the end of the table: fall back to the final entry.
+  return (qindex == QINDEX_RANGE) ? qindex - 1 : qindex;
+}
+
+// Chooses the frame type for the first pass: a key frame for the very first
+// frame or when FRAMEFLAGS_KEY is set (unless an alt-ref refresh is pending),
+// an inter frame otherwise. Periodic key frames are disabled by pushing
+// frames_to_key out to INT_MAX.
+static void set_first_pass_params(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  const int key_requested =
+      cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY);
+
+  cm->frame_type = (!cpi->refresh_alt_ref_frame && key_requested)
+                       ? KEY_FRAME
+                       : INTER_FRAME;
+
+  cpi->rc.frames_to_key = INT_MAX;
+}
+
+#define UL_INTRA_THRESH 50
+#define INVALID_ROW -1
+void vp10_first_pass(VP10_COMP *cpi, const struct lookahead_entry *source) {
+  int mb_row, mb_col;
+  MACROBLOCK *const x = &cpi->td.mb;
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TileInfo tile;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
+  int i;
+
+  int recon_yoffset, recon_uvoffset;
+  int64_t intra_error = 0;
+  int64_t coded_error = 0;
+  int64_t sr_coded_error = 0;
+
+  int sum_mvr = 0, sum_mvc = 0;
+  int sum_mvr_abs = 0, sum_mvc_abs = 0;
+  int64_t sum_mvrs = 0, sum_mvcs = 0;
+  int mvcount = 0;
+  int intercount = 0;
+  int second_ref_count = 0;
+  const int intrapenalty = INTRA_MODE_PENALTY;
+  double neutral_count;
+  int intra_skip_count = 0;
+  int image_data_start_row = INVALID_ROW;
+  int new_mv_count = 0;
+  int sum_in_vectors = 0;
+  MV lastmv = {0, 0};
+  TWO_PASS *twopass = &cpi->twopass;
+  const MV zero_mv = {0, 0};
+  int recon_y_stride, recon_uv_stride, uv_mb_height;
+
+  YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+  YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+  YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
+  const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+  double intra_factor;
+  double brightness_factor;
+  BufferPool *const pool = cm->buffer_pool;
+
+  // First pass code requires valid last and new frame buffers.
+  assert(new_yv12 != NULL);
+  assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
+
+#if CONFIG_FP_MB_STATS
+  if (cpi->use_fp_mb_stats) {
+    vp10_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
+  }
+#endif
+
+  vpx_clear_system_state();
+
+  intra_factor = 0.0;
+  brightness_factor = 0.0;
+  neutral_count = 0.0;
+
+  set_first_pass_params(cpi);
+  vp10_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
+
+  vp10_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+
+  vp10_setup_src_planes(x, cpi->Source, 0, 0);
+  vp10_setup_dst_planes(xd->plane, new_yv12, 0, 0);
+
+  if (!frame_is_intra_only(cm)) {
+    vp10_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
+  }
+
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
+
+  vp10_frame_init_quantizer(cpi);
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][1];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+    p[i].eobs = ctx->eobs_pbuf[i][1];
+  }
+  x->skip_recode = 0;
+
+  vp10_init_mv_probs(cm);
+  vp10_initialize_rd_consts(cpi);
+
+  // Tiling is ignored in the first pass.
+  vp10_tile_init(&tile, cm, 0, 0);
+
+  recon_y_stride = new_yv12->y_stride;
+  recon_uv_stride = new_yv12->uv_stride;
+  uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
+
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+    MV best_ref_mv = {0, 0};
+
+    // Reset above block coeffs.
+    xd->up_available = (mb_row != 0);
+    recon_yoffset = (mb_row * recon_y_stride * 16);
+    recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
+
+    // Set up limit values for motion vectors to prevent them extending
+    // outside the UMV borders.
+    x->mv_row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
+    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
+                    + BORDER_MV_PIXELS_B16;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+      int this_error;
+      const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+      const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+      double log_intra;
+      int level_sample;
+
+#if CONFIG_FP_MB_STATS
+      const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
+
+      vpx_clear_system_state();
+
+      xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
+      xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
+      xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
+      xd->left_available = (mb_col != 0);
+      xd->mi[0]->mbmi.sb_type = bsize;
+      xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+      set_mi_row_col(xd, &tile,
+                     mb_row << 1, num_8x8_blocks_high_lookup[bsize],
+                     mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
+                     cm->mi_rows, cm->mi_cols);
+
+      // Do intra 16x16 prediction.
+      xd->mi[0]->mbmi.segment_id = 0;
+      xd->mi[0]->mbmi.mode = DC_PRED;
+      xd->mi[0]->mbmi.tx_size = use_dc_pred ?
+         (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
+      vp10_encode_intra_block_plane(x, bsize, 0);
+      this_error = vpx_get_mb_ss(x->plane[0].src_diff);
+
+      // Keep a record of blocks that have almost no intra error residual
+      // (i.e. are in effect completely flat and untextured in the intra
+      // domain). In natural videos this is uncommon, but it is much more
+      // common in animations, graphics and screen content, so may be used
+      // as a signal to detect these types of content.
+      if (this_error < UL_INTRA_THRESH) {
+        ++intra_skip_count;
+      } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
+        image_data_start_row = mb_row;
+      }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        switch (cm->bit_depth) {
+          case VPX_BITS_8:
+            break;
+          case VPX_BITS_10:
+            this_error >>= 4;
+            break;
+          case VPX_BITS_12:
+            this_error >>= 8;
+            break;
+          default:
+            assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+                        "VPX_BITS_10 or VPX_BITS_12");
+            return;
+        }
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      vpx_clear_system_state();
+      log_intra = log(this_error + 1.0);
+      if (log_intra < 10.0)
+        intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+      else
+        intra_factor += 1.0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth)
+        level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+      else
+        level_sample = x->plane[0].src.buf[0];
+#else
+      level_sample = x->plane[0].src.buf[0];
+#endif
+      if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
+        brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
+      else
+        brightness_factor += 1.0;
+
+      // Intrapenalty below deals with situations where the intra and inter
+      // error scores are very low (e.g. a plain black frame).
+      // We do not have special cases in first pass for 0,0 and nearest etc so
+      // all inter modes carry an overhead cost estimate for the mv.
+      // When the error score is very low this causes us to pick all or lots of
+      // INTRA modes and throw lots of key frames.
+      // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+      this_error += intrapenalty;
+
+      // Accumulate the intra error.
+      intra_error += (int64_t)this_error;
+
+#if CONFIG_FP_MB_STATS
+      if (cpi->use_fp_mb_stats) {
+        // initialization
+        cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+      }
+#endif
+
+      // Set up limit values for motion vectors to prevent them extending
+      // outside the UMV borders.
+      x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
+      x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
+
+      // Other than for the first frame do a motion search.
+      if (cm->current_video_frame > 0) {
+        int tmp_err, motion_error, raw_motion_error;
+        // Assume 0,0 motion with no mv overhead.
+        MV mv = {0, 0} , tmp_mv = {0, 0};
+        struct buf_2d unscaled_last_source_buf_2d;
+
+        xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          motion_error = highbd_get_prediction_error(
+              bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+        } else {
+          motion_error = get_prediction_error(
+              bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+        }
+#else
+        motion_error = get_prediction_error(
+            bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+        // Compute the motion error of the 0,0 motion using the last source
+        // frame as the reference. Skip the further motion search on
+        // reconstructed frame if this error is small.
+        unscaled_last_source_buf_2d.buf =
+            cpi->unscaled_last_source->y_buffer + recon_yoffset;
+        unscaled_last_source_buf_2d.stride =
+            cpi->unscaled_last_source->y_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          raw_motion_error = highbd_get_prediction_error(
+              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+        } else {
+          raw_motion_error = get_prediction_error(
+              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);
+        }
+#else
+        raw_motion_error = get_prediction_error(
+            bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+        // TODO(pengchong): Replace the hard-coded threshold
+        if (raw_motion_error > 25) {
+          // Test last reference frame using the previous best mv as the
+          // starting point (best reference) for the search.
+          first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
+
+          // If the current best reference mv is not centered on 0,0 then do a
+          // 0,0 based search as well.
+          if (!is_zero_mv(&best_ref_mv)) {
+            tmp_err = INT_MAX;
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
+
+            if (tmp_err < motion_error) {
+              motion_error = tmp_err;
+              mv = tmp_mv;
+            }
+          }
+
+          // Search in an older reference frame.
+          if ((cm->current_video_frame > 1) && gld_yv12 != NULL) {
+            // Assume 0,0 motion with no mv overhead.
+            int gf_motion_error;
+
+            xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+#if CONFIG_VP9_HIGHBITDEPTH
+            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+              gf_motion_error = highbd_get_prediction_error(
+                  bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+            } else {
+              gf_motion_error = get_prediction_error(
+                  bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+            }
+#else
+            gf_motion_error = get_prediction_error(
+                bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
+                                     &gf_motion_error);
+
+            if (gf_motion_error < motion_error && gf_motion_error < this_error)
+              ++second_ref_count;
+
+            // Reset to last frame as reference buffer.
+            xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+            xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+            xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+            // In accumulating a score for the older reference frame take the
+            // best of the motion predicted score and the intra coded error
+            // (just as will be done for) accumulation of "coded_error" for
+            // the last frame.
+            if (gf_motion_error < this_error)
+              sr_coded_error += gf_motion_error;
+            else
+              sr_coded_error += this_error;
+          } else {
+            sr_coded_error += motion_error;
+          }
+        } else {
+          sr_coded_error += motion_error;
+        }
+
+        // Start by assuming that intra mode is best.
+        best_ref_mv.row = 0;
+        best_ref_mv.col = 0;
+
+#if CONFIG_FP_MB_STATS
+        if (cpi->use_fp_mb_stats) {
+          // intra predication statistics
+          cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+          if (this_error > FPMB_ERROR_LARGE_TH) {
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
+          } else if (this_error < FPMB_ERROR_SMALL_TH) {
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
+          }
+        }
+#endif
+
+        if (motion_error <= this_error) {
+          vpx_clear_system_state();
+
+          // Keep a count of cases where the inter and intra were very close
+          // and very low. This helps with scene cut detection for example in
+          // cropped clips with black bars at the sides or top and bottom.
+          if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+              (this_error < (2 * intrapenalty))) {
+            neutral_count += 1.0;
+          // Also track cases where the intra is not much worse than the inter
+          // and use this in limiting the GF/arf group length.
+          } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+                     (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+            neutral_count += (double)motion_error /
+                             DOUBLE_DIVIDE_CHECK((double)this_error);
+          }
+
+          mv.row *= 8;
+          mv.col *= 8;
+          this_error = motion_error;
+          xd->mi[0]->mbmi.mode = NEWMV;
+          xd->mi[0]->mbmi.mv[0].as_mv = mv;
+          xd->mi[0]->mbmi.tx_size = TX_4X4;
+          xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
+          xd->mi[0]->mbmi.ref_frame[1] = NONE;
+          vp10_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
+          vp10_encode_sby_pass1(x, bsize);
+          sum_mvr += mv.row;
+          sum_mvr_abs += abs(mv.row);
+          sum_mvc += mv.col;
+          sum_mvc_abs += abs(mv.col);
+          sum_mvrs += mv.row * mv.row;
+          sum_mvcs += mv.col * mv.col;
+          ++intercount;
+
+          best_ref_mv = mv;
+
+#if CONFIG_FP_MB_STATS
+          if (cpi->use_fp_mb_stats) {
+            // inter predication statistics
+            cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+            cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+            if (this_error > FPMB_ERROR_LARGE_TH) {
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                  FPMB_ERROR_LARGE_MASK;
+            } else if (this_error < FPMB_ERROR_SMALL_TH) {
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                  FPMB_ERROR_SMALL_MASK;
+            }
+          }
+#endif
+
+          if (!is_zero_mv(&mv)) {
+            ++mvcount;
+
+#if CONFIG_FP_MB_STATS
+            if (cpi->use_fp_mb_stats) {
+              cpi->twopass.frame_mb_stats_buf[mb_index] &=
+                  ~FPMB_MOTION_ZERO_MASK;
+              // check estimated motion direction
+              if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) {
+                // right direction
+                cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                    FPMB_MOTION_RIGHT_MASK;
+              } else if (mv.as_mv.row < 0 &&
+                         abs(mv.as_mv.row) >= abs(mv.as_mv.col)) {
+                // up direction
+                cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                    FPMB_MOTION_UP_MASK;
+              } else if (mv.as_mv.col < 0 &&
+                         abs(mv.as_mv.col) >= abs(mv.as_mv.row)) {
+                // left direction
+                cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                    FPMB_MOTION_LEFT_MASK;
+              } else {
+                // down direction
+                cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                    FPMB_MOTION_DOWN_MASK;
+              }
+            }
+#endif
+
+            // Non-zero vector, was it different from the last non zero vector?
+            if (!is_equal_mv(&mv, &lastmv))
+              ++new_mv_count;
+            lastmv = mv;
+
+            // Does the row vector point inwards or outwards?
+            if (mb_row < cm->mb_rows / 2) {
+              if (mv.row > 0)
+                --sum_in_vectors;
+              else if (mv.row < 0)
+                ++sum_in_vectors;
+            } else if (mb_row > cm->mb_rows / 2) {
+              if (mv.row > 0)
+                ++sum_in_vectors;
+              else if (mv.row < 0)
+                --sum_in_vectors;
+            }
+
+            // Does the col vector point inwards or outwards?
+            if (mb_col < cm->mb_cols / 2) {
+              if (mv.col > 0)
+                --sum_in_vectors;
+              else if (mv.col < 0)
+                ++sum_in_vectors;
+            } else if (mb_col > cm->mb_cols / 2) {
+              if (mv.col > 0)
+                ++sum_in_vectors;
+              else if (mv.col < 0)
+                --sum_in_vectors;
+            }
+          }
+        }
+      } else {
+        sr_coded_error += (int64_t)this_error;
+      }
+      coded_error += (int64_t)this_error;
+
+      // Adjust to the next column of MBs.
+      x->plane[0].src.buf += 16;
+      x->plane[1].src.buf += uv_mb_height;
+      x->plane[2].src.buf += uv_mb_height;
+
+      recon_yoffset += 16;
+      recon_uvoffset += uv_mb_height;
+    }
+
+    // Adjust to the next row of MBs.
+    x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
+    x->plane[1].src.buf += uv_mb_height * x->plane[1].src.stride -
+                           uv_mb_height * cm->mb_cols;
+    x->plane[2].src.buf += uv_mb_height * x->plane[1].src.stride -
+                           uv_mb_height * cm->mb_cols;
+
+    vpx_clear_system_state();
+  }
+
+  // Clamp the image start to rows/2. This number of rows is discarded top
+  // and bottom as dead data so rows / 2 means the frame is blank.
+  if ((image_data_start_row > cm->mb_rows / 2) ||
+      (image_data_start_row == INVALID_ROW)) {
+    image_data_start_row = cm->mb_rows / 2;
+  }
+  // Exclude any image dead zone
+  if (image_data_start_row > 0) {
+    intra_skip_count =
+        VPXMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
+  }
+
+  {
+    FIRSTPASS_STATS fps;
+    // The minimum error here insures some bit allocation to frames even
+    // in static regions. The allocation per MB declines for larger formats
+    // where the typical "real" energy per MB also falls.
+    // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+    // number of mbs is proportional to the image area.
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                        ? cpi->initial_mbs : cpi->common.MBs;
+    const double min_err = 200 * sqrt(num_mbs);
+
+    intra_factor = intra_factor / (double)num_mbs;
+    brightness_factor = brightness_factor / (double)num_mbs;
+    fps.weight = intra_factor * brightness_factor;
+
+    fps.frame = cm->current_video_frame;
+    fps.coded_error = (double)(coded_error >> 8) + min_err;
+    fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
+    fps.intra_error = (double)(intra_error >> 8) + min_err;
+    fps.count = 1.0;
+    fps.pcnt_inter = (double)intercount / num_mbs;
+    fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
+    fps.pcnt_neutral = (double)neutral_count / num_mbs;
+    fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
+    fps.inactive_zone_rows = (double)image_data_start_row;
+    fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
+
+    if (mvcount > 0) {
+      fps.MVr = (double)sum_mvr / mvcount;
+      fps.mvr_abs = (double)sum_mvr_abs / mvcount;
+      fps.MVc = (double)sum_mvc / mvcount;
+      fps.mvc_abs = (double)sum_mvc_abs / mvcount;
+      fps.MVrv = ((double)sum_mvrs -
+                  ((double)sum_mvr * sum_mvr / mvcount)) / mvcount;
+      fps.MVcv = ((double)sum_mvcs -
+                  ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
+      fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
+      fps.new_mv_count = new_mv_count;
+      fps.pcnt_motion = (double)mvcount / num_mbs;
+    } else {
+      fps.MVr = 0.0;
+      fps.mvr_abs = 0.0;
+      fps.MVc = 0.0;
+      fps.mvc_abs = 0.0;
+      fps.MVrv = 0.0;
+      fps.MVcv = 0.0;
+      fps.mv_in_out_count = 0.0;
+      fps.new_mv_count = 0.0;
+      fps.pcnt_motion = 0.0;
+    }
+
+    // TODO(paulwilkins):  Handle the case when duration is set to 0, or
+    // something less than the full time between subsequent values of
+    // cpi->source_time_stamp.
+    fps.duration = (double)(source->ts_end - source->ts_start);
+
+    // Don't want to do output stats with a stack variable!
+    twopass->this_frame_stats = fps;
+    output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
+    accumulate_stats(&twopass->total_stats, &fps);
+
+#if CONFIG_FP_MB_STATS
+    if (cpi->use_fp_mb_stats) {
+      output_fpmb_stats(twopass->frame_mb_stats_buf, cm, cpi->output_pkt_list);
+    }
+#endif
+  }
+
+  // Copy the previous Last Frame back into gf and and arf buffers if
+  // the prediction is good enough... but also don't allow it to lag too far.
+  if ((twopass->sr_update_lag > 3) ||
+      ((cm->current_video_frame > 0) &&
+       (twopass->this_frame_stats.pcnt_inter > 0.20) &&
+       ((twopass->this_frame_stats.intra_error /
+         DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
+    if (gld_yv12 != NULL) {
+      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+                 cm->ref_frame_map[cpi->lst_fb_idx]);
+    }
+    twopass->sr_update_lag = 1;
+  } else {
+    ++twopass->sr_update_lag;
+  }
+
+  vpx_extend_frame_borders(new_yv12);
+
+  // The frame we just compressed now becomes the last frame.
+  ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+             cm->new_fb_idx);
+
+  // Special case for the first frame. Copy into the GF buffer as a second
+  // reference.
+  if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) {
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+               cm->ref_frame_map[cpi->lst_fb_idx]);
+  }
+
+  // Use this to see what the first pass reconstruction looks like.
+  if (0) {
+    char filename[512];
+    FILE *recon_file;
+    snprintf(filename, sizeof(filename), "enc%04d.yuv",
+             (int)cm->current_video_frame);
+
+    if (cm->current_video_frame == 0)
+      recon_file = fopen(filename, "wb");
+    else
+      recon_file = fopen(filename, "ab");
+
+    (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
+    fclose(recon_file);
+  }
+
+  ++cm->current_video_frame;
+}
+
+// Computes (err_per_mb / err_divisor) raised to a quantizer-dependent power
+// and clamps the result to [0.05, 5.0]. The power is
+// q_value * 0.01 + pt_low, capped at pt_high, where q_value is the
+// quantizer value that qindex q converts to at the given bit depth.
+static double calc_correction_factor(double err_per_mb,
+                                     double err_divisor,
+                                     double pt_low,
+                                     double pt_high,
+                                     int q,
+                                     vpx_bit_depth_t bit_depth) {
+  const double base = err_per_mb / err_divisor;
+  const double uncapped_exponent =
+      vp10_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low;
+  const double exponent = VPXMIN(uncapped_exponent, pt_high);
+
+  // A negative base is only tolerated when the exponent is not below 1.0.
+  assert(!(exponent < 1.0) || base >= 0.0);
+
+  return fclamp(pow(base, exponent), 0.05, 5.0);
+}
+
+// Larger image formats are expected to be a little harder to code relatively
+// given the same prediction error score. This in part at least relates to the
+// increased size and hence coding cost of motion vectors.
+#define EDIV_SIZE_FACTOR 800
+
+static int get_twopass_worst_quality(const VP10_COMP *cpi,
+                                     const double section_err,
+                                     double inactive_zone,
+                                     int section_target_bandwidth,
+                                     double group_weight_factor) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+
+  inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
+
+  if (section_target_bandwidth <= 0) {
+    return rc->worst_quality;  // Highest value allowed
+  } else {
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                        ? cpi->initial_mbs : cpi->common.MBs;
+    const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+    const double av_err_per_mb = section_err / active_mbs;
+    const double speed_term = 1.0 + 0.04 * oxcf->speed;
+    const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
+    const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth <<
+                                         BPER_MB_NORMBITS) / active_mbs;
+
+    int q;
+
+    // Try and pick a max Q that will be high enough to encode the
+    // content at the given rate.
+    for (q = rc->best_quality; q < rc->worst_quality; ++q) {
+      const double factor =
+          calc_correction_factor(av_err_per_mb,
+                                 ERR_DIVISOR - ediv_size_correction,
+                                 FACTOR_PT_LOW, FACTOR_PT_HIGH, q,
+                                 cpi->common.bit_depth);
+      const int bits_per_mb =
+        vp10_rc_bits_per_mb(INTER_FRAME, q,
+                           factor * speed_term * group_weight_factor,
+                           cpi->common.bit_depth);
+      if (bits_per_mb <= target_norm_bits_per_mb)
+        break;
+    }
+
+    // Restriction on active max q for constrained quality mode.
+    if (cpi->oxcf.rc_mode == VPX_CQ)
+      q = VPXMAX(q, oxcf->cq_level);
+    return q;
+  }
+}
+
+// Fills rc->rf_level_maxq[] for every rate factor level from INTER_NORMAL
+// upward: worst_quality plus that level's qdelta, but never below
+// best_quality.
+static void setup_rf_level_maxq(VP10_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int level;
+
+  for (level = INTER_NORMAL; level < RATE_FACTOR_LEVELS; ++level) {
+    const int level_qdelta =
+        vp10_frame_type_qdelta(cpi, level, rc->worst_quality);
+    rc->rf_level_maxq[level] =
+        VPXMAX(rc->worst_quality + level_qdelta, rc->best_quality);
+  }
+}
+
+// Precomputes the frame dimensions for each of the FRAME_SCALE_STEPS scaling
+// steps (dimension * 16 / frame_scale_factor[step]) and then initializes the
+// per-level maximum q table.
+void vp10_init_subsampling(VP10_COMP *cpi) {
+  const VP10_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int frame_w = cm->width;
+  const int frame_h = cm->height;
+  int step = 0;
+
+  // Note: the scaled dimensions are not rounded, so odd sizes can result.
+  while (step < FRAME_SCALE_STEPS) {
+    rc->frame_width[step] = (frame_w * 16) / frame_scale_factor[step];
+    rc->frame_height[step] = (frame_h * 16) / frame_scale_factor[step];
+    ++step;
+  }
+
+  setup_rf_level_maxq(cpi);
+}
+
+// Writes the frame width and height stored for the currently selected
+// frame size (rc->frame_size_selector) to the two output pointers.
+void vp10_calculate_coded_size(VP10_COMP *cpi,
+                          int *scaled_frame_width,
+                          int *scaled_frame_height) {
+  const RATE_CONTROL *const rate_ctrl = &cpi->rc;
+
+  *scaled_frame_width =
+      rate_ctrl->frame_width[rate_ctrl->frame_size_selector];
+  *scaled_frame_height =
+      rate_ctrl->frame_height[rate_ctrl->frame_size_selector];
+}
+
+// Prepares the two-pass state for the second pass.
+//
+// Both running totals are zeroed first; if no first pass stats are available
+// (stats_in_end is NULL) nothing else is done. Otherwise the totals are
+// loaded from the record at stats_in_end, and the frame rate, bit budget,
+// modified error limits and assorted rate control counters are derived from
+// them.
+void vp10_init_second_pass(VP10_COMP *cpi) {
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  FIRSTPASS_STATS *const totals = &twopass->total_stats;
+  const FIRSTPASS_STATS *frame_stats;
+  double avg_error;
+  double modified_error_total = 0.0;
+
+  zero_stats(&twopass->total_stats);
+  zero_stats(&twopass->total_left_stats);
+
+  if (!twopass->stats_in_end)
+    return;
+
+  *totals = *twopass->stats_in_end;
+  twopass->total_left_stats = *totals;
+
+  // Individual frame durations may vary because the source frame rate need
+  // not be constant, so the rate used before the first second-pass frame is
+  // only an estimate. The summed duration, however, comes from the actual
+  // durations of every frame seen in the first pass.
+  vp10_new_framerate(cpi, 10000000.0 * totals->count / totals->duration);
+  twopass->bits_left = (int64_t)(totals->duration * oxcf->target_bandwidth /
+                       10000000.0);
+
+  // Tracks how far the second reference update is lagging behind.
+  twopass->sr_update_lag = 1;
+
+  // Walk the first pass stats and total up the modified error produced by
+  // the bias/power function used for bit allocation.
+  avg_error = totals->coded_error / DOUBLE_DIVIDE_CHECK(totals->count);
+  twopass->modified_error_min =
+      (avg_error * oxcf->two_pass_vbrmin_section) / 100;
+  twopass->modified_error_max =
+      (avg_error * oxcf->two_pass_vbrmax_section) / 100;
+  for (frame_stats = twopass->stats_in; frame_stats < twopass->stats_in_end;
+       ++frame_stats) {
+    modified_error_total +=
+        calculate_modified_err(cpi, twopass, oxcf, frame_stats);
+  }
+  twopass->modified_error_left = modified_error_total;
+
+  // Clear the VBR off-target counters and the rate error estimate.
+  cpi->rc.vbr_bits_off_target = 0;
+  cpi->rc.vbr_bits_off_target_fast = 0;
+  cpi->rc.rate_error_estimate = 0;
+
+  // Static sequence monitor variables.
+  twopass->kf_zeromotion_pct = 100;
+  twopass->last_kfgroup_zeromotion_pct = 100;
+
+  if (oxcf->resize_mode != RESIZE_NONE)
+    vp10_init_subsampling(cpi);
+}
+
+// Weights of the three terms subtracted from 1.0 to form the decay rate.
+#define SR_DIFF_PART 0.0015
+#define MOTION_AMP_PART 0.003
+#define INTRA_PART 0.005
+// Upper bound applied to the inter-percentage floor of the result.
+#define DEFAULT_DECAY_LIMIT 0.75
+// Per-MB second-ref error difference above which decay is computed at all.
+#define LOW_SR_DIFF_TRHESH 0.1
+// Cap on the per-MB second-ref error difference.
+#define SR_DIFF_MAX 128.0
+
+// Computes a decay rate for the frame from the per-macroblock gap between
+// the second-reference and last-frame coded errors, the motion amplitude and
+// the share of intra blocks. The result is never lower than
+// min(DEFAULT_DECAY_LIMIT, modified inter percentage).
+static double get_sr_decay_rate(const VP10_COMP *cpi,
+                                const FIRSTPASS_STATS *frame) {
+  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                      ? cpi->initial_mbs : cpi->common.MBs;
+  const double motion_amplitude_factor =
+    frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
+  const double intra_inter_ratio =
+      frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error);
+  double sr_diff =
+      (frame->sr_coded_error - frame->coded_error) / num_mbs;
+  double modified_pct_inter = frame->pcnt_inter;
+  double modified_pcnt_intra;
+  double sr_decay = 1.0;
+
+  // When intra is not much worse than inter overall, neutral blocks are not
+  // counted as inter.
+  if (intra_inter_ratio < (double)NCOUNT_FRAME_II_THRESH)
+    modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
+  modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+  if (sr_diff > LOW_SR_DIFF_TRHESH) {
+    sr_diff = VPXMIN(sr_diff, SR_DIFF_MAX);
+    sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
+               (MOTION_AMP_PART * motion_amplitude_factor) -
+               (INTRA_PART * modified_pcnt_intra);
+  }
+
+  return VPXMAX(sr_decay, VPXMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
+}
+
+// Returns the smaller of the frame's second-reference decay rate and its
+// zero-motion percentage (pcnt_inter minus pcnt_motion).
+static double get_zero_motion_factor(const VP10_COMP *cpi,
+                                     const FIRSTPASS_STATS *frame) {
+  const double decay_rate = get_sr_decay_rate(cpi, frame);
+  const double static_pct = frame->pcnt_inter - frame->pcnt_motion;
+
+  return VPXMIN(decay_rate, static_pct);
+}
+
+#define ZM_POWER_FACTOR 0.75
+
+// Blend of the sr decay rate and a zero-motion term, floored at the latter.
+static double get_prediction_decay_rate(const VP10_COMP *cpi,
+                                        const FIRSTPASS_STATS *next_frame) {
+  const double static_pct = next_frame->pcnt_inter - next_frame->pcnt_motion;
+  const double zm_factor = 0.95 * pow(static_pct, ZM_POWER_FACTOR);
+  const double sr_rate = get_sr_decay_rate(cpi, next_frame);
+  const double blended = sr_rate + ((1.0 - sr_rate) * zm_factor);
+
+  return VPXMAX(zm_factor, blended);
+}
+
+// Detects a complex transition that settles into a static section, e.g. a
+// slide show with a fade between slides. Used to improve kf and gf
+// positioning. Returns 1 when such a transition is found, otherwise 0.
+static int detect_transition_to_still(VP10_COMP *cpi,
+                                      int frame_interval, int still_interval,
+                                      double loop_decay_rate,
+                                      double last_decay_rate) {
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int ahead = 0;
+
+  // Only a group longer than the minimum gf interval whose decay rate has
+  // just gone from below 0.9 to at least 0.999 qualifies: that points to a
+  // still image following a fade or similar transition rather than a clean
+  // scene cut.
+  if (!(frame_interval > rc->min_gf_interval) ||
+      !(loop_decay_rate >= 0.999) ||
+      !(last_decay_rate < 0.9)) {
+    return 0;
+  }
+
+  // Look ahead: each of the still_interval stats entries from stats_in must
+  // exist and have (pcnt_inter - pcnt_motion) of at least 0.999.
+  while (ahead < still_interval) {
+    const FIRSTPASS_STATS *const stats = &twopass->stats_in[ahead];
+
+    if (stats >= twopass->stats_in_end ||
+        stats->pcnt_inter - stats->pcnt_motion < 0.999) {
+      break;
+    }
+    ++ahead;
+  }
+  return ahead == still_interval;
+}
+
+// Flash detection. A flash shows up as a high pcnt_second_ref relative to
+// pcnt_inter in the frame that follows the flash frame, so the offset
+// passed in should point at that following frame.
+static int detect_flash(const TWO_PASS *twopass, int offset) {
+  const FIRSTPASS_STATS *const stats = read_frame_stats(twopass, offset);
+
+  // No stats at this offset: nothing to detect.
+  if (stats == NULL)
+    return 0;
+
+  // A brief break in prediction followed by frames that are well predicted
+  // from an earlier (pre flash) frame: second ref usage beats inter usage.
+  return stats->pcnt_second_ref > stats->pcnt_inter &&
+         stats->pcnt_second_ref >= 0.5;
+}
+
+// Fold one frame's motion stats into the GF/ARF boost accumulators.
+static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
+                                          double *mv_in_out,
+                                          double *mv_in_out_accumulator,
+                                          double *abs_mv_in_out_accumulator,
+                                          double *mv_ratio_accumulator) {
+  const double pct = stats->pcnt_motion;
+  double row_ratio, col_ratio;
+
+  // Motion in/out of frame: report this frame's value and accumulate both
+  // its signed and absolute contributions.
+  *mv_in_out = stats->mv_in_out_count * pct;
+  *mv_in_out_accumulator += *mv_in_out;
+  *abs_mv_in_out_accumulator += fabs(*mv_in_out);
+
+  // The uniformity measure below is skipped unless pcnt_motion exceeds 0.05.
+  if (!(pct > 0.05))
+    return;
+
+  // Ratio of abs(mv) to mv per component, each capped at the abs value.
+  row_ratio = fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
+  col_ratio = fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
+  if (!(row_ratio < stats->mvr_abs)) row_ratio = stats->mvr_abs;
+  if (!(col_ratio < stats->mvc_abs)) col_ratio = stats->mvc_abs;
+  *mv_ratio_accumulator += pct * row_ratio;
+  *mv_ratio_accumulator += pct * col_ratio;
+}
+
+#define BASELINE_ERR_PER_MB 1000.0
+static double calc_frame_boost(VP10_COMP *cpi,
+                               const FIRSTPASS_STATS *this_frame,
+                               double this_frame_mv_in_out,
+                               double max_boost) {
+  const double lq =
+    vp10_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
+                            cpi->common.bit_depth);
+  const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5);
+  // Net motion into the frame (e.g. zoom out) scales the boost up by twice
+  // the in/out value; net motion out of it (zoom in) scales it down by half
+  // the in/out value. this_frame_mv_in_out ranges over -1.0 to +1.0.
+  const double mv_in_out_scale = (this_frame_mv_in_out > 0.0)
+                                     ? (this_frame_mv_in_out * 2.0)
+                                     : (this_frame_mv_in_out / 2.0);
+  double boost;
+  int active_mbs = cpi->common.MBs;
+
+  if (cpi->oxcf.resize_mode != RESIZE_NONE) active_mbs = cpi->initial_mbs;
+
+  // Correct for any inactive region in the image.
+  active_mbs =
+      (int)VPXMAX(1, active_mbs * calculate_active_area(cpi, this_frame));
+
+  // The underlying boost is based on the inter error ratio, then q corrected.
+  boost = (BASELINE_ERR_PER_MB * active_mbs) /
+          DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+  boost = boost * BOOST_FACTOR * boost_q_correction;
+  boost += boost * mv_in_out_scale;
+
+  return VPXMIN(boost, max_boost * boost_q_correction);
+}
+
+static int calc_arf_boost(VP10_COMP *cpi, int offset,
+                          int f_frames, int b_frames,
+                          int *f_boost, int *b_boost) {
+  TWO_PASS *const twopass = &cpi->twopass;
+  int i;
+  double boost_score = 0.0;
+  double mv_ratio_accumulator = 0.0;
+  double decay_accumulator = 1.0;
+  double this_frame_mv_in_out = 0.0;
+  double mv_in_out_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
+  int arf_boost;
+  int flash_detected = 0;
+  // Forward pass: sum the decayed frame boosts of up to f_frames stats entries
+  // from the proposed arf/next gf position (offset); stored in *f_boost.
+  for (i = 0; i < f_frames; ++i) {
+    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    if (this_frame == NULL)
+      break;
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(this_frame,
+                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
+                                  &abs_mv_in_out_accumulator,
+                                  &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
+
+    // Accumulate the effect of prediction quality decay.
+    if (!flash_detected) {
+      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+                          ? MIN_DECAY_FACTOR : decay_accumulator;
+    }
+
+    boost_score += decay_accumulator * calc_frame_boost(cpi, this_frame,
+                                                        this_frame_mv_in_out,
+                                                        GF_MAX_BOOST);
+  }
+
+  *f_boost = (int)boost_score;
+
+  // Reset for backward looking loop.
+  boost_score = 0.0;
+  mv_ratio_accumulator = 0.0;
+  decay_accumulator = 1.0;
+  this_frame_mv_in_out = 0.0;
+  mv_in_out_accumulator = 0.0;
+  abs_mv_in_out_accumulator = 0.0;
+
+  // Backward pass: the same sum over up to b_frames entries before offset.
+  for (i = -1; i >= -b_frames; --i) {
+    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    if (this_frame == NULL)
+      break;
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(this_frame,
+                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
+                                  &abs_mv_in_out_accumulator,
+                                  &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
+
+    // Cumulative effect of prediction quality decay.
+    if (!flash_detected) {
+      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+                              ? MIN_DECAY_FACTOR : decay_accumulator;
+    }
+
+    boost_score += decay_accumulator * calc_frame_boost(cpi, this_frame,
+                                                        this_frame_mv_in_out,
+                                                        GF_MAX_BOOST);
+  }
+  *b_boost = (int)boost_score;
+  // Combined boost, floored at 20 per requested frame and MIN_ARF_GF_BOOST.
+  arf_boost = (*f_boost + *b_boost);
+  if (arf_boost < ((b_frames + f_frames) * 20))
+    arf_boost = ((b_frames + f_frames) * 20);
+  arf_boost = VPXMAX(arf_boost, MIN_ARF_GF_BOOST);
+
+  return arf_boost;
+}
+
+// Ratio of summed intra error to summed coded error over a section of at
+// most section_length frames; used in setting the max loop filter.
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+                                         const FIRSTPASS_STATS *end,
+                                         int section_length) {
+  double intra_sum = 0.0;
+  double coded_sum = 0.0;
+  const FIRSTPASS_STATS *frame;
+  int count;
+
+  // Stop at whichever comes first: the end pointer or the section length.
+  for (frame = begin, count = 0; frame < end && count < section_length;
+       ++frame, ++count) {
+    intra_sum += frame->intra_error;
+    coded_sum += frame->coded_error;
+  }
+  return (int)(intra_sum / DOUBLE_DIVIDE_CHECK(coded_sum));
+}
+
+// Calculate the total bits to allocate in this GF/ARF group.
+static int64_t calculate_total_gf_group_bits(VP10_COMP *cpi,
+                                             double gf_group_err) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+  int64_t group_bits = 0;
+  int64_t rate_cap;
+
+  // Share of the remaining kf group bits in proportion to this group's share
+  // of the remaining kf group error.
+  if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+    group_bits = (int64_t)(twopass->kf_group_bits *
+                           (gf_group_err / twopass->kf_group_error_left));
+  }
+
+  // Clamp odd edge cases.
+  if (group_bits < 0)
+    group_bits = 0;
+  else if (group_bits > twopass->kf_group_bits)
+    group_bits = twopass->kf_group_bits;
+
+  // Clip based on user supplied data rate variability limit.
+  rate_cap = (int64_t)max_bits * rc->baseline_gf_interval;
+  if (group_bits > rate_cap) group_bits = rate_cap;
+  return group_bits;
+}
+
+// Calculate the number of extra bits to assign to boosted frames in a group.
+static int calculate_boost_bits(int frame_count,
+                                int boost, int64_t total_group_bits) {
+  int allocation_chunks;
+  int boost_bits;
+
+  // Invalid inputs (which could arise e.g. through rounding errors) get 0.
+  if (boost == 0 || total_group_bits <= 0 || frame_count <= 0)
+    return 0;
+
+  allocation_chunks = (frame_count * 100) + boost;
+
+  // Prevent overflow: scale both terms down by the same divisor.
+  if (boost > 1023) {
+    const int divisor = boost >> 10;
+    boost /= divisor;
+    allocation_chunks /= divisor;
+  }
+
+  boost_bits = (int)(((int64_t)boost * total_group_bits) / allocation_chunks);
+  return VPXMAX(boost_bits, 0);
+}
+
+// Current limit on maximum number of active arfs in a GF/ARF group.
+#define MAX_ACTIVE_ARFS 2
+#define ARF_SLOT1 2
+#define ARF_SLOT2 3
+// Single point of choice for the arf buffers: writes the two slot numbers
+// into arf_buffer_indices[0..1]. The values are currently fixed.
+static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) {
+  unsigned char *slot = arf_buffer_indices;
+  *slot++ = ARF_SLOT1;
+  *slot = ARF_SLOT2;
+}
+
+static void allocate_gf_group_bits(VP10_COMP *cpi, int64_t gf_group_bits,
+                                   double group_error, int gf_arf_bits) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  FIRSTPASS_STATS frame_stats;
+  int i;
+  int frame_index = 1;  // Index 0 is the gf / overlay frame (or key frame).
+  int target_frame_size;
+  int key_frame;
+  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+  int64_t total_group_bits = gf_group_bits;
+  double modified_err = 0.0;
+  double err_fraction;
+  int mid_boost_bits = 0;
+  int mid_frame_idx;
+  unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
+  // Fills twopass->gf_group with per-frame update types and bit allocations.
+  key_frame = cpi->common.frame_type == KEY_FRAME;
+
+  get_arf_buffer_indices(arf_buffer_indices);
+
+  // For key frames the frame target rate is already set and it
+  // is also the golden frame.
+  if (!key_frame) {
+    if (rc->source_alt_ref_active) {
+      gf_group->update_type[0] = OVERLAY_UPDATE;
+      gf_group->rf_level[0] = INTER_NORMAL;
+      gf_group->bit_allocation[0] = 0;
+    } else {
+      gf_group->update_type[0] = GF_UPDATE;
+      gf_group->rf_level[0] = GF_ARF_STD;
+      gf_group->bit_allocation[0] = gf_arf_bits;
+    }
+    gf_group->arf_update_idx[0] = arf_buffer_indices[0];
+    gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
+
+    // Step over the golden frame / overlay frame
+    if (EOF == input_stats(twopass, &frame_stats))
+      return;
+  }
+
+  // Deduct the boost bits for arf (or gf if it is not a key frame)
+  // from the group total.
+  if (rc->source_alt_ref_pending || !key_frame)
+    total_group_bits -= gf_arf_bits;
+
+  // Store the bits to spend on the ARF if there is one.
+  if (rc->source_alt_ref_pending) {
+    gf_group->update_type[frame_index] = ARF_UPDATE;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
+    gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+    gf_group->arf_src_offset[frame_index] =
+        (unsigned char)(rc->baseline_gf_interval - 1);
+
+    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+    gf_group->arf_ref_idx[frame_index] =
+      arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
+                         rc->source_alt_ref_active];
+    ++frame_index;
+
+    if (cpi->multi_arf_enabled) {
+      // Set aside a slot for a level 1 arf.
+      gf_group->update_type[frame_index] = ARF_UPDATE;
+      gf_group->rf_level[frame_index] = GF_ARF_LOW;
+      gf_group->arf_src_offset[frame_index] =
+        (unsigned char)((rc->baseline_gf_interval >> 1) - 1);
+      gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1];
+      gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
+      ++frame_index;
+    }
+  }
+
+  // Define middle frame
+  mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
+
+  // Allocate bits to the other frames in the group.
+  for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
+    int arf_idx = 0;
+    if (EOF == input_stats(twopass, &frame_stats))
+      break;
+
+    modified_err = calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
+
+    if (group_error > 0)
+      err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error);
+    else
+      err_fraction = 0.0;
+
+    target_frame_size = (int)((double)total_group_bits * err_fraction);
+
+    if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
+      mid_boost_bits += (target_frame_size >> 4);  // 1/16 set aside per frame.
+      target_frame_size -= (target_frame_size >> 4);
+
+      if (frame_index <= mid_frame_idx)
+        arf_idx = 1;
+    }
+    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
+    gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
+
+    target_frame_size = clamp(target_frame_size, 0,
+                              VPXMIN(max_bits, (int)total_group_bits));
+
+    gf_group->update_type[frame_index] = LF_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
+
+    gf_group->bit_allocation[frame_index] = target_frame_size;
+    ++frame_index;
+  }
+
+  // Note:
+  // We need to configure the frame at the end of the sequence + 1 that will be
+  // the start frame for the next group. Otherwise prior to the call to
+  // vp10_rc_get_second_pass_params() the data will be undefined.
+  gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+  gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
+
+  if (rc->source_alt_ref_pending) {
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
+
+    // Final setup for second arf and its overlay.
+    if (cpi->multi_arf_enabled) {
+      gf_group->bit_allocation[2] =  // Index 2 is the level 1 arf slot.
+          gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits;
+      gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE;
+      gf_group->bit_allocation[mid_frame_idx] = 0;
+    }
+  } else {
+    gf_group->update_type[frame_index] = GF_UPDATE;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
+  }
+
+  // Note whether multi-arf was enabled this group for next time.
+  cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled;
+}
+
+// Analyse and define a gf/arf group: choose its length, boost and bit budget.
+static void define_gf_group(VP10_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  VP10_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  FIRSTPASS_STATS next_frame;
+  const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+  int i;
+
+  double boost_score = 0.0;
+  double old_boost_score = 0.0;
+  double gf_group_err = 0.0;
+#if GROUP_ADAPTIVE_MAXQ
+  double gf_group_raw_error = 0.0;
+#endif
+  double gf_group_skip_pct = 0.0;
+  double gf_group_inactive_zone_rows = 0.0;
+  double gf_first_frame_err = 0.0;
+  double mod_frame_err = 0.0;
+
+  double mv_ratio_accumulator = 0.0;
+  double decay_accumulator = 1.0;
+  double zero_motion_accumulator = 1.0;
+
+  double loop_decay_rate = 1.00;
+  double last_loop_decay_rate = 1.00;
+
+  double this_frame_mv_in_out = 0.0;
+  double mv_in_out_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
+  double mv_ratio_accumulator_thresh;
+  unsigned int allow_alt_ref = is_altref_enabled(cpi);
+
+  int f_boost = 0;
+  int b_boost = 0;
+  int flash_detected;
+  int active_max_gf_interval;
+  int active_min_gf_interval;
+  int64_t gf_group_bits;
+  double gf_group_error_left;
+  int gf_arf_bits;
+  const int is_key_frame = frame_is_intra_only(cm);
+  const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
+
+  // Reset the GF group data structures unless this is a key
+  // frame in which case it will already have been done.
+  if (is_key_frame == 0) {
+    vp10_zero(twopass->gf_group);
+  }
+
+  vpx_clear_system_state();
+  vp10_zero(next_frame);
+
+  // Load stats for the current frame.
+  mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+  // Note the error of the frame at the start of the group. This will be
+  // the GF frame error if we code a normal gf.
+  gf_first_frame_err = mod_frame_err;
+
+  // If this is a key frame or the overlay from a previous arf then
+  // the error score / cost of this frame has already been accounted for.
+  if (arf_active_or_kf) {
+    gf_group_err -= gf_first_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+    gf_group_raw_error -= this_frame->coded_error;
+#endif
+    gf_group_skip_pct -= this_frame->intra_skip_pct;
+    gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
+  }
+
+  // Motion breakout threshold for loop below depends on image size.
+  mv_ratio_accumulator_thresh =
+      (cpi->initial_height + cpi->initial_width) / 4.0;
+
+  // Set a maximum and minimum interval for the GF group.
+  // If the image appears almost completely static we can extend beyond this.
+  {
+    int int_max_q =
+      (int)(vp10_convert_qindex_to_q(twopass->active_worst_quality,
+                                     cpi->common.bit_depth));
+    int int_lbq =
+      (int)(vp10_convert_qindex_to_q(rc->last_boosted_qindex,
+                                     cpi->common.bit_depth));
+    active_min_gf_interval = rc->min_gf_interval + VPXMIN(2, int_max_q / 200);
+    if (active_min_gf_interval > rc->max_gf_interval)
+      active_min_gf_interval = rc->max_gf_interval;
+
+    if (cpi->multi_arf_allowed) {
+      active_max_gf_interval = rc->max_gf_interval;
+    } else {
+      // The value chosen depends on the active Q range. At low Q we have
+      // bits to spare and are better with a smaller interval and smaller boost.
+      // At high Q when there are few bits to spare we are better with a longer
+      // interval to spread the cost of the GF.
+      active_max_gf_interval = 12 + VPXMIN(4, (int_lbq / 6));
+
+      // We have: active_min_gf_interval <= rc->max_gf_interval
+      if (active_max_gf_interval < active_min_gf_interval)
+        active_max_gf_interval = active_min_gf_interval;
+      else if (active_max_gf_interval > rc->max_gf_interval)
+        active_max_gf_interval = rc->max_gf_interval;
+    }
+  }
+  // Scan forward through the stats to find where this group should end.
+  i = 0;
+  while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
+    ++i;
+
+    // Accumulate error score of frames in this gf group.
+    mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+    gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+    gf_group_raw_error += this_frame->coded_error;
+#endif
+    gf_group_skip_pct += this_frame->intra_skip_pct;
+    gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
+
+    if (EOF == input_stats(twopass, &next_frame))
+      break;
+
+    // Test for the case where there is a brief flash but the prediction
+    // quality back to an earlier frame is then restored.
+    flash_detected = detect_flash(twopass, 0);
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(&next_frame,
+                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
+                                  &abs_mv_in_out_accumulator,
+                                  &mv_ratio_accumulator);
+
+    // Accumulate the effect of prediction quality decay.
+    if (!flash_detected) {
+      last_loop_decay_rate = loop_decay_rate;
+      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+      decay_accumulator = decay_accumulator * loop_decay_rate;
+
+      // Monitor for static sections.
+      zero_motion_accumulator = VPXMIN(
+          zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+
+      // Break clause to detect very still sections after motion. For example,
+      // a static image after a fade or other transition.
+      if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
+                                     last_loop_decay_rate)) {
+        allow_alt_ref = 0;
+        break;
+      }
+    }
+
+    // Calculate a boost number for this frame.
+    boost_score += decay_accumulator * calc_frame_boost(cpi, &next_frame,
+                                                        this_frame_mv_in_out,
+                                                        GF_MAX_BOOST);
+
+    // Break out conditions.
+    if (
+      // Break at active_max_gf_interval unless almost totally static.
+      (i >= (active_max_gf_interval + arf_active_or_kf) &&
+            zero_motion_accumulator < 0.995) ||
+      (
+        // Don't break out with a very short interval.
+        (i >= active_min_gf_interval + arf_active_or_kf) &&
+        (!flash_detected) &&
+        ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
+         (abs_mv_in_out_accumulator > 3.0) ||
+         (mv_in_out_accumulator < -2.0) ||
+         ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
+      boost_score = old_boost_score;
+      break;
+    }
+
+    *this_frame = next_frame;
+    old_boost_score = boost_score;
+  }
+
+  twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
+
+  // Was the group length constrained by the requirement for a new KF?
+  rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+
+  // Should we use the alternate reference frame.
+  if (allow_alt_ref &&
+    (i < cpi->oxcf.lag_in_frames) &&
+    (i >= rc->min_gf_interval)) {
+    // Calculate the boost for alt ref.
+    rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
+      &b_boost);
+    rc->source_alt_ref_pending = 1;
+    // Test to see if multi arf is appropriate. NOTE(review): this reads
+    // rc->baseline_gf_interval before it is assigned for this group below.
+    cpi->multi_arf_enabled =
+      (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
+      (zero_motion_accumulator < 0.995)) ? 1 : 0;
+  } else {
+    rc->gfu_boost = VPXMAX((int)boost_score, MIN_ARF_GF_BOOST);
+    rc->source_alt_ref_pending = 0;
+  }
+
+  // Set the interval until the next gf.
+  rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+
+  rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+  // Reset the file position.
+  reset_fpf_position(twopass, start_pos);
+
+  // Calculate the bits to be allocated to the gf/arf group as a whole
+  gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+
+#if GROUP_ADAPTIVE_MAXQ
+  // Calculate an estimate of the maxq needed for the group.
+  // We are more aggressive about correcting for sections
+  // where there could be significant overshoot than for easier
+  // sections where we do not wish to risk creating an overshoot
+  // of the allocated bit budget.
+  if ((cpi->oxcf.rc_mode != VPX_Q) && (rc->baseline_gf_interval > 1)) {
+    const int vbr_group_bits_per_frame =
+      (int)(gf_group_bits / rc->baseline_gf_interval);
+    const double group_av_err = gf_group_raw_error  / rc->baseline_gf_interval;
+    const double group_av_skip_pct =
+      gf_group_skip_pct / rc->baseline_gf_interval;
+    const double group_av_inactive_zone =
+      ((gf_group_inactive_zone_rows * 2) /
+       (rc->baseline_gf_interval * (double)cm->mb_rows));
+
+    int tmp_q;
+    // rc factor is a weight factor that corrects for local rate control drift.
+    double rc_factor = 1.0;
+    if (rc->rate_error_estimate > 0) {
+      rc_factor = VPXMAX(RC_FACTOR_MIN,
+                         (double)(100 - rc->rate_error_estimate) / 100.0);
+    } else {
+      rc_factor = VPXMIN(RC_FACTOR_MAX,
+                         (double)(100 - rc->rate_error_estimate) / 100.0);
+    }
+    tmp_q =
+      get_twopass_worst_quality(cpi, group_av_err,
+                                (group_av_skip_pct + group_av_inactive_zone),
+                                vbr_group_bits_per_frame,
+                                twopass->kfgroup_inter_fraction * rc_factor);
+    twopass->active_worst_quality =
+      VPXMAX(tmp_q, twopass->active_worst_quality >> 1);
+  }
+#endif
+
+  // Calculate the extra bits to be used for boosted frame(s)
+  gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
+                                     rc->gfu_boost, gf_group_bits);
+
+  // Adjust KF group bits and error remaining.
+  twopass->kf_group_error_left -= (int64_t)gf_group_err;
+
+  // If this is an arf update we want to remove the score for the overlay
+  // frame at the end which will usually be very cheap to code.
+  // The overlay frame has already, in effect, been coded so we want to spread
+  // the remaining bits among the other frames.
+  // For normal GFs remove the score for the GF itself unless this is
+  // also a key frame in which case it has already been accounted for.
+  if (rc->source_alt_ref_pending) {
+    gf_group_error_left = gf_group_err - mod_frame_err;
+  } else if (is_key_frame == 0) {
+    gf_group_error_left = gf_group_err - gf_first_frame_err;
+  } else {
+    gf_group_error_left = gf_group_err;
+  }
+
+  // Allocate bits to each of the frames in the GF group.
+  allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits);
+
+  // Reset the file position.
+  reset_fpf_position(twopass, start_pos);
+
+  // Calculate a section intra ratio used in setting max loop filter.
+  if (cpi->common.frame_type != KEY_FRAME) {
+    twopass->section_intra_rating =
+        calculate_section_intra_ratio(start_pos, twopass->stats_in_end,
+                                      rc->baseline_gf_interval);
+  }
+
+  if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+    // Default to starting GF groups at normal frame size.
+    cpi->rc.next_frame_size_selector = UNSCALED;
+  }
+}
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+#define SECOND_REF_USEAGE_THRESH 0.1
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letter box
+// format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case even if the frame is not a scene cut coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0
+// Returns 1 if this_frame looks like a viable key frame, otherwise 0.
+static int test_candidate_kf(TWO_PASS *twopass,
+                             const FIRSTPASS_STATS *last_frame,
+                             const FIRSTPASS_STATS *this_frame,
+                             const FIRSTPASS_STATS *next_frame) {
+  int is_viable_kf = 0;
+  double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+  double modified_pcnt_inter =
+    this_frame->pcnt_inter - this_frame->pcnt_neutral;
+
+  // Does the frame satisfy the primary criteria of a key frame?
+  // See above for an explanation of the test criteria.
+  // If so, then examine how well it predicts subsequent frames.
+  if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+      (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+      ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+       ((pcnt_intra > MIN_INTRA_LEVEL) &&
+        (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
+        ((this_frame->intra_error /
+          DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+          KF_II_ERR_THRESHOLD) &&
+        ((fabs(last_frame->coded_error - this_frame->coded_error) /
+          DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+          ERR_CHANGE_THRESHOLD) ||
+         (fabs(last_frame->intra_error - this_frame->intra_error) /
+          DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+          ERR_CHANGE_THRESHOLD) ||
+         ((next_frame->intra_error /
+          DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+          II_IMPROVEMENT_THRESHOLD))))) {
+    int i;
+    const FIRSTPASS_STATS *start_pos = twopass->stats_in;
+    FIRSTPASS_STATS local_next_frame = *next_frame;
+    double boost_score = 0.0;
+    double old_boost_score = 0.0;
+    double decay_accumulator = 1.0;
+
+    // Examine how well the key frame predicts subsequent frames.
+    for (i = 0; i < 16; ++i) {
+      double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
+                             DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+
+      if (next_iiratio > KF_II_MAX)
+        next_iiratio = KF_II_MAX;
+
+      // Cumulative effect of decay in prediction quality.
+      if (local_next_frame.pcnt_inter > 0.85)
+        decay_accumulator *= local_next_frame.pcnt_inter;
+      else
+        decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
+
+      // Keep a running total.
+      boost_score += (decay_accumulator * next_iiratio);
+
+      // Test various breakout clauses.
+      if ((local_next_frame.pcnt_inter < 0.05) ||
+          (next_iiratio < 1.5) ||
+          (((local_next_frame.pcnt_inter -
+             local_next_frame.pcnt_neutral) < 0.20) &&
+           (next_iiratio < 3.0)) ||
+          ((boost_score - old_boost_score) < 3.0) ||
+          (local_next_frame.intra_error < 200)) {
+        break;
+      }
+
+      old_boost_score = boost_score;
+
+      // Get the next frame details
+      if (EOF == input_stats(twopass, &local_next_frame))
+        break;
+    }
+
+    // If there is tolerable prediction for at least the next 3 frames then
+    // break out else discard this potential key frame and move on.
+    if (boost_score > 30.0 && (i > 3)) {
+      is_viable_kf = 1;
+    } else {
+      // Reset the file position
+      reset_fpf_position(twopass, start_pos);
+
+      is_viable_kf = 0;
+    }
+  }
+
+  return is_viable_kf;
+}
+
+// Number of most recent per-frame decay rates that are multiplied together
+// when testing for a transition to a still section.
+#define FRAMES_TO_CHECK_DECAY 8
+
+// Defines the next key frame group, starting at the key frame whose stats are
+// in *this_frame.
+//
+// Walks the first pass stats to decide how many frames the group spans
+// (rc->frames_to_key), records whether the following key frame is forced
+// (rc->next_key_frame_forced), assigns a bit budget to the group
+// (twopass->kf_group_bits) and to the key frame itself
+// (gf_group->bit_allocation[0]), and derives rc->kf_boost.
+//
+// *this_frame is overwritten with later frames' stats while scanning, so a
+// caller that still needs the key frame's own stats must keep a copy.
+// The stats read position is rewound with reset_fpf_position() to the value
+// it had on entry before the function returns.
+static void find_next_key_frame(VP10_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  int i, j;
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  const FIRSTPASS_STATS first_frame = *this_frame;
+  const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+  FIRSTPASS_STATS next_frame;
+  FIRSTPASS_STATS last_frame;
+  int kf_bits = 0;
+  int loop_decay_counter = 0;
+  double decay_accumulator = 1.0;
+  double av_decay_accumulator = 0.0;
+  double zero_motion_accumulator = 1.0;
+  double boost_score = 0.0;
+  double kf_mod_err = 0.0;
+  double kf_group_err = 0.0;
+  double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+
+  vp10_zero(next_frame);
+
+  cpi->common.frame_type = KEY_FRAME;
+
+  // Reset the GF group data structures.
+  vp10_zero(*gf_group);
+
+  // Is this a forced key frame by interval.
+  rc->this_key_frame_forced = rc->next_key_frame_forced;
+
+  // Clear the alt ref active flag and last group multi arf flags as they
+  // can never be set for a key frame.
+  rc->source_alt_ref_active = 0;
+  cpi->multi_arf_last_grp_enabled = 0;
+
+  // KF is always a GF so clear frames till next gf counter.
+  rc->frames_till_gf_update_due = 0;
+
+  rc->frames_to_key = 1;
+
+  twopass->kf_group_bits = 0;        // Total bits available to kf group
+  twopass->kf_group_error_left = 0;  // Group modified error score.
+
+  kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+  // Initialize the decay rates for the recent frames to check
+  for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+    recent_loop_decay[j] = 1.0;
+
+  // Find the next keyframe.
+  i = 0;
+  while (twopass->stats_in < twopass->stats_in_end &&
+         rc->frames_to_key < cpi->oxcf.key_freq) {
+    // Accumulate kf group error.
+    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+    // Load the next frame's stats.
+    last_frame = *this_frame;
+    input_stats(twopass, this_frame);
+
+    // Provided that we are not at the end of the file...
+    if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
+      double loop_decay_rate;
+
+      // Check for a scene cut.
+      if (test_candidate_kf(twopass, &last_frame, this_frame,
+                            twopass->stats_in))
+        break;
+
+      // How fast is the prediction quality decaying?
+      loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
+
+      // We want to know something about the recent past... rather than
+      // as used elsewhere where we are concerned with decay in prediction
+      // quality since the last GF or KF.
+      recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
+      decay_accumulator = 1.0;
+      for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+        decay_accumulator *= recent_loop_decay[j];
+
+      // Special check for transition or high motion followed by a
+      // static scene.
+      if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
+                                     loop_decay_rate, decay_accumulator))
+        break;
+
+      // Step on to the next frame.
+      ++rc->frames_to_key;
+
+      // If we don't have a real key frame within the next two
+      // key_freq intervals then break out of the loop.
+      if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq)
+        break;
+    } else {
+      ++rc->frames_to_key;
+    }
+    ++i;
+  }
+
+  // If there is a max kf interval set by the user we must obey it.
+  // We already breakout of the loop above at 2x max.
+  // This code centers the extra kf if the actual natural interval
+  // is between 1x and 2x.
+  if (cpi->oxcf.auto_key &&
+      rc->frames_to_key > cpi->oxcf.key_freq) {
+    FIRSTPASS_STATS tmp_frame = first_frame;
+
+    rc->frames_to_key /= 2;
+
+    // Reset to the start of the group.
+    reset_fpf_position(twopass, start_position);
+
+    kf_group_err = 0.0;
+
+    // Rescan to get the correct error data for the forced kf group.
+    for (i = 0; i < rc->frames_to_key; ++i) {
+      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
+      input_stats(twopass, &tmp_frame);
+    }
+    rc->next_key_frame_forced = 1;
+  } else if (twopass->stats_in == twopass->stats_in_end ||
+             rc->frames_to_key >= cpi->oxcf.key_freq) {
+    rc->next_key_frame_forced = 1;
+  } else {
+    rc->next_key_frame_forced = 0;
+  }
+
+  // Special case for the last key frame of the file.
+  if (twopass->stats_in >= twopass->stats_in_end) {
+    // Accumulate kf group error.
+    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+  }
+
+  // Calculate the number of bits that should be assigned to the kf group.
+  if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+    // Maximum number of bits for a single normal frame (not key frame).
+    const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+
+    // Maximum number of bits allocated to the key frame group.
+    int64_t max_grp_bits;
+
+    // Default allocation based on bits left and relative
+    // complexity of the section.
+    twopass->kf_group_bits = (int64_t)(twopass->bits_left *
+       (kf_group_err / twopass->modified_error_left));
+
+    // Clip based on maximum per frame rate defined by the user.
+    max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
+    if (twopass->kf_group_bits > max_grp_bits)
+      twopass->kf_group_bits = max_grp_bits;
+  } else {
+    twopass->kf_group_bits = 0;
+  }
+  twopass->kf_group_bits = VPXMAX(0, twopass->kf_group_bits);
+
+  // Reset the first pass file position.
+  reset_fpf_position(twopass, start_position);
+
+  // Scan through the kf group collating various stats used to determine
+  // how many bits to spend on it.
+  decay_accumulator = 1.0;
+  boost_score = 0.0;
+  for (i = 0; i < (rc->frames_to_key - 1); ++i) {
+    if (EOF == input_stats(twopass, &next_frame))
+      break;
+
+    // Monitor for static sections.
+    zero_motion_accumulator = VPXMIN(
+        zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+
+    // Not all frames in the group are necessarily used in calculating boost.
+    if ((i <= rc->max_gf_interval) ||
+        ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
+      // NOTE(review): the boost is computed from *this_frame rather than
+      // from the next_frame record just read - confirm this is intended.
+      const double frame_boost =
+        calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST);
+
+      // How fast is prediction quality decaying.
+      if (!detect_flash(twopass, 0)) {
+        const double loop_decay_rate =
+          get_prediction_decay_rate(cpi, &next_frame);
+        decay_accumulator *= loop_decay_rate;
+        decay_accumulator = VPXMAX(decay_accumulator, MIN_DECAY_FACTOR);
+        av_decay_accumulator += decay_accumulator;
+        ++loop_decay_counter;
+      }
+      boost_score += (decay_accumulator * frame_boost);
+    }
+  }
+  // Average the accumulated decay. The counter stays at zero when no frame
+  // reached the decay update above (for example a one frame group), and
+  // dividing then would produce NaN and an undefined conversion to int
+  // below. In that case the average is left at 0.0 so that the minimum
+  // boost clamps decide the result.
+  if (loop_decay_counter > 0)
+    av_decay_accumulator /= (double)loop_decay_counter;
+
+  reset_fpf_position(twopass, start_position);
+
+  // Store the zero motion percentage
+  twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+  // Calculate a section intra ratio used in setting max loop filter.
+  twopass->section_intra_rating =
+      calculate_section_intra_ratio(start_position, twopass->stats_in_end,
+                                    rc->frames_to_key);
+
+  // Apply various clamps for min and max boost
+  rc->kf_boost = (int)(av_decay_accumulator * boost_score);
+  rc->kf_boost = VPXMAX(rc->kf_boost, (rc->frames_to_key * 3));
+  rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_BOOST);
+
+  // Work out how many bits to allocate for the key frame itself.
+  kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
+                                  rc->kf_boost, twopass->kf_group_bits);
+
+  // Work out the fraction of the kf group bits reserved for the inter frames
+  // within the group after discounting the bits for the kf itself.
+  if (twopass->kf_group_bits) {
+    twopass->kfgroup_inter_fraction =
+      (double)(twopass->kf_group_bits - kf_bits) /
+      (double)twopass->kf_group_bits;
+  } else {
+    twopass->kfgroup_inter_fraction = 1.0;
+  }
+
+  twopass->kf_group_bits -= kf_bits;
+
+  // Save the bits to spend on the key frame.
+  gf_group->bit_allocation[0] = kf_bits;
+  gf_group->update_type[0] = KF_UPDATE;
+  gf_group->rf_level[0] = KF_STD;
+
+  // Note the total error score of the kf group minus the key frame itself.
+  // The destination field is 64 bits wide, so convert at that width instead
+  // of narrowing through int first.
+  twopass->kf_group_error_left = (int64_t)(kf_group_err - kf_mod_err);
+
+  // Adjust the count of total modified error left.
+  // The count of bits left is adjusted elsewhere based on real coded frame
+  // sizes.
+  twopass->modified_error_left -= kf_group_err;
+
+  if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+    // Default to normal-sized frame on keyframes.
+    cpi->rc.next_frame_size_selector = UNSCALED;
+  }
+}
+
+// Choose which reference buffers (last, golden, alt ref) are refreshed once
+// the current frame has been encoded. The choice follows the update type
+// stored for the current index of the GF group. An unrecognised update type
+// trips an assert and leaves the refresh flags as they were.
+static void configure_buffer_updates(VP10_COMP *cpi) {
+  const GF_GROUP *const group = &cpi->twopass.gf_group;
+  const FRAME_UPDATE_TYPE type = group->update_type[group->index];
+
+  // Only an overlay frame is coded from the alt ref source; see below.
+  cpi->rc.is_src_frame_alt_ref = 0;
+
+  if (type == KF_UPDATE) {
+    // Key frame: every reference buffer is replaced.
+    cpi->refresh_last_frame = 1;
+    cpi->refresh_golden_frame = 1;
+    cpi->refresh_alt_ref_frame = 1;
+  } else if (type == LF_UPDATE) {
+    // Ordinary inter frame: only the last frame buffer.
+    cpi->refresh_last_frame = 1;
+    cpi->refresh_golden_frame = 0;
+    cpi->refresh_alt_ref_frame = 0;
+  } else if (type == GF_UPDATE) {
+    // Golden frame: last and golden buffers.
+    cpi->refresh_last_frame = 1;
+    cpi->refresh_golden_frame = 1;
+    cpi->refresh_alt_ref_frame = 0;
+  } else if (type == OVERLAY_UPDATE) {
+    // Overlay: golden buffer only, and flag the source as an alt ref.
+    cpi->refresh_last_frame = 0;
+    cpi->refresh_golden_frame = 1;
+    cpi->refresh_alt_ref_frame = 0;
+    cpi->rc.is_src_frame_alt_ref = 1;
+  } else if (type == ARF_UPDATE) {
+    // Alt ref frame: alt ref buffer only.
+    cpi->refresh_last_frame = 0;
+    cpi->refresh_golden_frame = 0;
+    cpi->refresh_alt_ref_frame = 1;
+  } else {
+    assert(0);
+  }
+}
+
+// Returns 1 when the partition search may be skipped for the current frame
+// (the partition size is then assigned according to the variance), else 0.
+// That is the case for a non intra-only frame when the first pass stats of
+// the two preceding entries and of the current entry all show no non-zero
+// motion vectors, i.e. pcnt_inter - pcnt_motion == 1 for each of them.
+static int is_skippable_frame(const VP10_COMP *cpi) {
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const FIRSTPASS_STATS *const cur = twopass->stats_in;
+
+  if (frame_is_intra_only(&cpi->common))
+    return 0;
+
+  // The two previous entries and the current entry must all lie inside the
+  // stats buffer before they are dereferenced.
+  if (!(cur - 2 > twopass->stats_in_start))
+    return 0;
+  if (!(cur < twopass->stats_in_end))
+    return 0;
+
+  return (cur - 1)->pcnt_inter - (cur - 1)->pcnt_motion == 1 &&
+         (cur - 2)->pcnt_inter - (cur - 2)->pcnt_motion == 1 &&
+         cur->pcnt_inter - cur->pcnt_motion == 1;
+}
+
+// Sets up the second pass rate control parameters for the frame about to be
+// encoded.
+//
+// Does nothing when no first pass stats are attached (twopass->stats_in is
+// NULL) or when the stats are exhausted. For an ARF_UPDATE frame only the
+// buffer updates and the target size are set; no stats record is consumed.
+// Otherwise one stats record is read, a new key frame group and/or GF group
+// is defined when due, and rc->base_frame_target, twopass->mb_av_energy and
+// twopass->total_left_stats are updated.
+void vp10_rc_get_second_pass_params(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  int frames_left;
+  FIRSTPASS_STATS this_frame;
+
+  int target_rate;
+
+  frames_left = (int)(twopass->total_stats.count -
+                cm->current_video_frame);
+
+  if (!twopass->stats_in)
+    return;
+
+  // If this is an arf frame then we dont want to read the stats file or
+  // advance the input pointer as we already have what we need.
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+    configure_buffer_updates(cpi);
+    target_rate = gf_group->bit_allocation[gf_group->index];
+    target_rate = vp10_rc_clamp_pframe_target_size(cpi, target_rate);
+    rc->base_frame_target = target_rate;
+
+    cm->frame_type = INTER_FRAME;
+
+    // Do the firstpass stats indicate that this frame is skippable for the
+    // partition search?
+    if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+      cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+    }
+
+    return;
+  }
+
+  vpx_clear_system_state();
+
+  if (cpi->oxcf.rc_mode == VPX_Q) {
+    twopass->active_worst_quality = cpi->oxcf.cq_level;
+  } else if (cm->current_video_frame == 0) {
+    // Special case code for first frame.
+    // NOTE(review): frames_left equals total_stats.count here; an empty
+    // stats file (count == 0) would make this an integer division by zero.
+    // Confirm that callers never reach this point with empty stats.
+    const int section_target_bandwidth = (int)(twopass->bits_left /
+                                               frames_left);
+    const double section_length = twopass->total_left_stats.count;
+    const double section_error =
+      twopass->total_left_stats.coded_error / section_length;
+    const double section_intra_skip =
+      twopass->total_left_stats.intra_skip_pct / section_length;
+    const double section_inactive_zone =
+      (twopass->total_left_stats.inactive_zone_rows * 2) /
+      ((double)cm->mb_rows * section_length);
+    const int tmp_q =
+      get_twopass_worst_quality(cpi, section_error,
+                                section_intra_skip + section_inactive_zone,
+                                section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
+    twopass->active_worst_quality = tmp_q;
+    twopass->baseline_active_worst_quality = tmp_q;
+    rc->ni_av_qi = tmp_q;
+    rc->last_q[INTER_FRAME] = tmp_q;
+    rc->avg_q = vp10_convert_qindex_to_q(tmp_q, cm->bit_depth);
+    rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+    rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
+  }
+  vp10_zero(this_frame);
+  if (EOF == input_stats(twopass, &this_frame))
+    return;
+
+  // Set the frame content type flag.
+  if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
+    twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
+  else
+    twopass->fr_content_type = FC_NORMAL;
+
+  // Keyframe and section processing.
+  if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+    // find_next_key_frame() overwrites the stats it is handed while it
+    // scans ahead, so keep a copy and restore it afterwards.
+    FIRSTPASS_STATS this_frame_copy;
+    this_frame_copy = this_frame;
+    // Define next KF group and assign bits to it.
+    find_next_key_frame(cpi, &this_frame);
+    this_frame = this_frame_copy;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+
+  // Define a new GF/ARF group. (Should always enter here for key frames).
+  if (rc->frames_till_gf_update_due == 0) {
+    define_gf_group(cpi, &this_frame);
+
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+#if ARF_STATS_OUTPUT
+    {
+      FILE *fpfile;
+      fpfile = fopen("arf.stt", "a");
+      ++arf_count;
+      // The debug dump is best effort: skip it when the file cannot be
+      // opened rather than handing a NULL stream to fprintf().
+      if (fpfile != NULL) {
+        fprintf(fpfile, "%10d %10ld %10d %10d %10ld\n",
+                cm->current_video_frame, rc->frames_till_gf_update_due,
+                rc->kf_boost, arf_count, rc->gfu_boost);
+
+        fclose(fpfile);
+      }
+    }
+#endif
+  }
+
+  configure_buffer_updates(cpi);
+
+  // Do the firstpass stats indicate that this frame is skippable for the
+  // partition search?
+  if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+  }
+
+  target_rate = gf_group->bit_allocation[gf_group->index];
+  if (cpi->common.frame_type == KEY_FRAME)
+    target_rate = vp10_rc_clamp_iframe_target_size(cpi, target_rate);
+  else
+    target_rate = vp10_rc_clamp_pframe_target_size(cpi, target_rate);
+
+  rc->base_frame_target = target_rate;
+
+  {
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                        ? cpi->initial_mbs : cpi->common.MBs;
+    // The multiplication by 256 reverses a scaling factor of (>> 8)
+    // applied when combining MB error values for the frame.
+    twopass->mb_av_energy =
+      log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
+  }
+
+  // Update the total stats remaining structure.
+  subtract_stats(&twopass->total_left_stats, &this_frame);
+}
+
+// Upper clamp applied to twopass->extend_minq (the CQ variant is used when
+// the rate control mode is VPX_CQ).
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+// A frame whose projected size is below base_frame_target divided by this
+// ratio is treated as a large undershoot.
+#define HIGH_UNDERSHOOT_RATIO 2
+// Post encode update of the two pass rate control state.
+//
+// Accounts the frame's target bits against twopass->bits_left and the key
+// frame group budget, recomputes rc->rate_error_estimate, advances the GF
+// group index and, when rate control is drifting, adjusts the extend_minq /
+// extend_maxq / extend_minq_fast offsets.
+void vp10_twopass_postencode_update(VP10_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->twopass;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int bits_used = rc->base_frame_target;
+
+  // VBR correction is done through rc->vbr_bits_off_target. Based on the
+  // sign of this value, a limited % adjustment is made to the target rate
+  // of subsequent frames, to try and push it back towards 0. This method
+  // is designed to prevent extreme behaviour at the end of a clip
+  // or group of frames.
+  rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+  twopass->bits_left = VPXMAX(twopass->bits_left - bits_used, 0);
+
+  // Calculate the pct rc error.
+  if (rc->total_actual_bits) {
+    rc->rate_error_estimate =
+      (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
+    rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
+  } else {
+    rc->rate_error_estimate = 0;
+  }
+
+  // Key frame bits were already removed from the group budget when the
+  // group was defined, so only other frames are charged here.
+  if (cpi->common.frame_type != KEY_FRAME) {
+    twopass->kf_group_bits -= bits_used;
+    twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+  }
+  twopass->kf_group_bits = VPXMAX(twopass->kf_group_bits, 0);
+
+  // Increment the gf group index ready for the next frame.
+  ++twopass->gf_group.index;
+
+  // If the rate control is drifting consider adjustment to min or maxq.
+  if ((cpi->oxcf.rc_mode != VPX_Q) &&
+      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
+      !cpi->rc.is_src_frame_alt_ref) {
+    const int maxq_adj_limit =
+      rc->worst_quality - twopass->active_worst_quality;
+    const int minq_adj_limit =
+        (cpi->oxcf.rc_mode == VPX_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+
+    // Undershoot.
+    if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+      --twopass->extend_maxq;
+      if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+        ++twopass->extend_minq;
+    // Overshoot.
+    } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+      --twopass->extend_minq;
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        ++twopass->extend_maxq;
+    } else {
+      // Adjustment for extreme local overshoot.
+      if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+          rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+        ++twopass->extend_maxq;
+
+      // Unwind undershoot or overshoot adjustment.
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        --twopass->extend_minq;
+      else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+        --twopass->extend_maxq;
+    }
+
+    // Keep both offsets within [0, limit] after the +/-1 steps above.
+    twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
+    twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+    // If there is a big and unexpected undershoot then feed the extra
+    // bits back in quickly. One situation where this may happen is if a
+    // frame is unexpectedly almost perfectly predicted by the ARF or GF
+    // but not very well predicted by the previous frame.
+    if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+      int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+      if (rc->projected_frame_size < fast_extra_thresh) {
+        // Bank the shortfall, capped at four average frames' worth of bits.
+        rc->vbr_bits_off_target_fast +=
+          fast_extra_thresh - rc->projected_frame_size;
+        rc->vbr_bits_off_target_fast =
+          VPXMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+
+        // Fast adaptation of minQ if necessary to use up the extra bits.
+        if (rc->avg_frame_bandwidth) {
+          twopass->extend_minq_fast =
+            (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
+        }
+        // The fast and the regular minq extensions together may not exceed
+        // minq_adj_limit.
+        twopass->extend_minq_fast = VPXMIN(
+            twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+      } else if (rc->vbr_bits_off_target_fast) {
+        twopass->extend_minq_fast = VPXMIN(
+            twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+      } else {
+        twopass->extend_minq_fast = 0;
+      }
+    }
+  }
+}
diff --git a/libs/libvpx/vp10/encoder/firstpass.h b/libs/libvpx/vp10/encoder/firstpass.h
new file mode 100644
index 0000000000..68a88879c4
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/firstpass.h
@@ -0,0 +1,166 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_FIRSTPASS_H_
+#define VP10_ENCODER_FIRSTPASS_H_
+
+#include "vp10/encoder/lookahead.h"
+#include "vp10/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_FP_MB_STATS
+
+// Single bit masks that together fit in one byte.
+// NOTE(review): presumably these are per macroblock flags stored in the
+// uint8_t stats buffers below - confirm against the code that writes them.
+#define FPMB_DCINTRA_MASK 0x01
+
+#define FPMB_MOTION_ZERO_MASK 0x02
+#define FPMB_MOTION_LEFT_MASK 0x04
+#define FPMB_MOTION_RIGHT_MASK 0x08
+#define FPMB_MOTION_UP_MASK 0x10
+#define FPMB_MOTION_DOWN_MASK 0x20
+
+#define FPMB_ERROR_SMALL_MASK 0x40
+#define FPMB_ERROR_LARGE_MASK 0x80
+// NOTE(review): presumably the error thresholds that select the SMALL and
+// LARGE error flags above - confirm against the users of these macros.
+#define FPMB_ERROR_SMALL_TH 2000
+#define FPMB_ERROR_LARGE_TH 48000
+
+// Start and end pointers delimiting a byte buffer of macroblock stats.
+typedef struct {
+  uint8_t *mb_stats_start;
+  uint8_t *mb_stats_end;
+} FIRSTPASS_MB_STATS;
+#endif
+
+// Compared against TWO_PASS::gf_zeromotion_pct after encoding a frame: the
+// min/max Q drift adjustments are only considered below this value.
+#define VLOW_MOTION_THRESHOLD 950
+
+// One record of first pass statistics. Every field is a double. The same
+// layout is used for single frame records and for accumulated totals
+// (TWO_PASS::total_stats, TWO_PASS::total_left_stats).
+typedef struct {
+  double frame;
+  double weight;
+  double intra_error;      // Compared as a ratio against coded_error.
+  double coded_error;
+  double sr_coded_error;
+  double pcnt_inter;       // Fraction; 1.0 - pcnt_inter is the intra share.
+  double pcnt_motion;      // pcnt_inter - pcnt_motion == 1 means no motion.
+  double pcnt_second_ref;
+  double pcnt_neutral;
+  double intra_skip_pct;   // Tested against FC_ANIMATION_THRESH.
+  double inactive_zone_rows;  // Image mask rows top and bottom.
+  double inactive_zone_cols;  // Image mask columns at left and right edges.
+  double MVr;
+  double mvr_abs;
+  double MVc;
+  double mvc_abs;
+  double MVrv;
+  double MVcv;
+  double mv_in_out_count;
+  double new_mv_count;
+  double duration;
+  double count;            // In a totals record: the number of frames.
+} FIRSTPASS_STATS;
+
+// How a frame in a GF group updates the reference buffers. The refresh
+// flags listed here are the ones the encoder's two pass code sets for each
+// type.
+typedef enum {
+  KF_UPDATE = 0,       // Key frame: refresh last, golden and alt ref.
+  LF_UPDATE = 1,       // Refresh the last frame buffer only.
+  GF_UPDATE = 2,       // Refresh last and golden.
+  ARF_UPDATE = 3,      // Refresh the alt ref buffer only.
+  OVERLAY_UPDATE = 4,  // Refresh golden only; source frame is the alt ref.
+  FRAME_UPDATE_TYPES = 5  // Number of update types (not a real type).
+} FRAME_UPDATE_TYPE;
+
+// A frame whose first pass intra_skip_pct is at or above this value is
+// classified as FC_GRAPHICS_ANIMATION, otherwise as FC_NORMAL.
+#define FC_ANIMATION_THRESH 0.15
+// Content classification of the current frame.
+typedef enum {
+  FC_NORMAL = 0,
+  FC_GRAPHICS_ANIMATION = 1,
+  FRAME_CONTENT_TYPES = 2  // Number of content types (not a real type).
+} FRAME_CONTENT_TYPE;
+
+// Per frame plan for a golden frame group. All arrays have
+// (MAX_LAG_BUFFERS * 2) + 1 entries and are indexed by position in the
+// group; entry 0 describes the key frame when the group starts with one.
+typedef struct {
+  // Position of the current frame in the arrays below; incremented after
+  // each frame has been encoded.
+  unsigned char index;
+  RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
+  // Selects which reference buffers the frame refreshes.
+  FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
+  // NOTE(review): the three arf_* arrays are presumably only meaningful for
+  // alt ref frames - confirm against the code that fills them.
+  unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
+  // Target number of bits for each frame, before clamping.
+  int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
+} GF_GROUP;
+
+// State of the two pass rate control.
+typedef struct {
+  // Intra ratio of the current key frame group, used in setting the max
+  // loop filter.
+  unsigned int section_intra_rating;
+  FIRSTPASS_STATS total_stats;
+  FIRSTPASS_STATS this_frame_stats;
+  // Read cursor into the first pass stats, bounded by stats_in_start and
+  // stats_in_end (the end pointer itself is not a valid record).
+  const FIRSTPASS_STATS *stats_in;
+  const FIRSTPASS_STATS *stats_in_start;
+  const FIRSTPASS_STATS *stats_in_end;
+  // Totals for the frames not yet processed; each frame's stats are
+  // subtracted as it is handled in the second pass.
+  FIRSTPASS_STATS total_left_stats;
+  int first_pass_done;
+  // Bits still available; reduced by each frame's base target after it is
+  // encoded and never allowed to go below zero.
+  int64_t bits_left;
+  double modified_error_min;
+  double modified_error_max;
+  // Modified error not yet assigned to a key frame group.
+  double modified_error_left;
+  // log(intra_error * 256 / number of macroblocks + 1) for the current frame.
+  double mb_av_energy;
+
+#if CONFIG_FP_MB_STATS
+  uint8_t *frame_mb_stats_buf;
+  uint8_t *this_frame_mb_stats;
+  FIRSTPASS_MB_STATS firstpass_mb_stats;
+#endif
+  // An indication of the content type of the current frame
+  FRAME_CONTENT_TYPE fr_content_type;
+
+  // Projected total bits available for a key frame group of frames
+  int64_t kf_group_bits;
+
+  // Error score of frames still to be coded in kf group
+  int64_t kf_group_error_left;
+
+  // The fraction for a kf groups total bits allocated to the inter frames
+  double kfgroup_inter_fraction;
+
+  int sr_update_lag;
+
+  // Minimum zero motion factor seen across the key frame group, times 100.
+  int kf_zeromotion_pct;
+  int last_kfgroup_zeromotion_pct;
+  int gf_zeromotion_pct;
+  int active_worst_quality;
+  int baseline_active_worst_quality;
+  // Q range extensions adjusted after each frame when rate control drifts;
+  // extend_minq and extend_maxq are clamped to be non-negative.
+  int extend_minq;
+  int extend_maxq;
+  int extend_minq_fast;
+
+  GF_GROUP gf_group;
+} TWO_PASS;
+
+struct VP10_COMP;
+
+// First pass entry points.
+void vp10_init_first_pass(struct VP10_COMP *cpi);
+void vp10_rc_get_first_pass_params(struct VP10_COMP *cpi);
+void vp10_first_pass(struct VP10_COMP *cpi,
+                     const struct lookahead_entry *source);
+void vp10_end_first_pass(struct VP10_COMP *cpi);
+
+// Second pass entry points.
+void vp10_init_second_pass(struct VP10_COMP *cpi);
+void vp10_rc_get_second_pass_params(struct VP10_COMP *cpi);
+
+// Post encode update of the rate control parameters for 2-pass
+void vp10_twopass_postencode_update(struct VP10_COMP *cpi);
+
+void vp10_init_subsampling(struct VP10_COMP *cpi);
+
+void vp10_calculate_coded_size(struct VP10_COMP *cpi,
+                               int *scaled_frame_width,
+                               int *scaled_frame_height);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_FIRSTPASS_H_
diff --git a/libs/libvpx/vp10/encoder/lookahead.c b/libs/libvpx/vp10/encoder/lookahead.c
new file mode 100644
index 0000000000..dce0139038
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/lookahead.c
@@ -0,0 +1,245 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+
+#include "vp10/common/common.h"
+
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/extend.h"
+#include "vp10/encoder/lookahead.h"
+
+/* Fetch the entry stored at *idx, then step *idx on by one slot, wrapping
+ * back to the start of the ring once it reaches ctx->max_sz. */
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
+                                   unsigned int *idx) {
+  const unsigned int cur = *idx;
+  unsigned int next = cur + 1;
+
+  assert(cur < ctx->max_sz);
+  if (next >= ctx->max_sz)
+    next -= ctx->max_sz;
+  *idx = next;
+  return ctx->buf + cur;
+}
+
+
+/* Release a lookahead queue: every slot's frame buffer, the slot array and
+ * the context itself. A NULL context is accepted and ignored. */
+void vp10_lookahead_destroy(struct lookahead_ctx *ctx) {
+  unsigned int slot;
+
+  if (!ctx)
+    return;
+
+  if (ctx->buf) {
+    for (slot = 0; slot < ctx->max_sz; ++slot)
+      vpx_free_frame_buffer(&ctx->buf[slot].img);
+    free(ctx->buf);
+  }
+  free(ctx);
+}
+
+
+/* Create a lookahead queue whose slots each hold a frame buffer of the given
+ * dimensions. The requested depth is limited to [1, MAX_LAG_BUFFERS] and
+ * MAX_PRE_FRAMES extra slots are added for previous source frames.
+ * Returns NULL if any allocation fails; nothing is leaked in that case. */
+struct lookahead_ctx *vp10_lookahead_init(unsigned int width,
+                                         unsigned int height,
+                                         unsigned int subsampling_x,
+                                         unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                         int use_highbitdepth,
+#endif
+                                         unsigned int depth) {
+  const int legacy_byte_alignment = 0;
+  struct lookahead_ctx *ctx;
+  unsigned int slot;
+
+  // Bound the queue depth, then make room for the retained past frames.
+  depth = clamp(depth, 1, MAX_LAG_BUFFERS);
+  depth += MAX_PRE_FRAMES;
+
+  ctx = calloc(1, sizeof(*ctx));
+  if (!ctx)
+    return NULL;
+
+  ctx->max_sz = depth;
+  ctx->buf = calloc(depth, sizeof(*ctx->buf));
+  if (!ctx->buf) {
+    vp10_lookahead_destroy(ctx);
+    return NULL;
+  }
+
+  for (slot = 0; slot < depth; ++slot) {
+    if (vpx_alloc_frame_buffer(&ctx->buf[slot].img,
+                               width, height, subsampling_x, subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS,
+                               legacy_byte_alignment)) {
+      // Slots allocated so far are released together with the context.
+      vp10_lookahead_destroy(ctx);
+      return NULL;
+    }
+  }
+  return ctx;
+}
+
+// The partial (active map driven) copy path is compiled out; see the TODO
+// inside vp10_lookahead_push().
+#define USE_PARTIAL_COPY 0
+
+// Copies a source frame into the next free slot of the lookahead queue and
+// records its timestamps and flags.
+//
+// If the source is larger than the slot's current frame buffer the buffer is
+// reallocated; if only the crop dimensions differ they are updated in place.
+//
+// Returns 0 on success. Returns 1 when the queue is full or when the frame
+// buffer reallocation fails; in both cases the queue is left unchanged.
+int vp10_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,
+                       int64_t ts_start, int64_t ts_end,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       unsigned int flags) {
+  struct lookahead_entry *buf;
+#if USE_PARTIAL_COPY
+  int row, col, active_end;
+  int mb_rows = (src->y_height + 15) >> 4;
+  int mb_cols = (src->y_width + 15) >> 4;
+#endif
+  int width = src->y_crop_width;
+  int height = src->y_crop_height;
+  int uv_width = src->uv_crop_width;
+  int uv_height = src->uv_crop_height;
+  int subsampling_x = src->subsampling_x;
+  int subsampling_y = src->subsampling_y;
+  int larger_dimensions, new_dimensions;
+  // Write position before the slot is reserved, kept so that the
+  // reservation can be undone if the slot cannot be prepared.
+  const unsigned int saved_write_idx = ctx->write_idx;
+
+  if (ctx->sz + 1  + MAX_PRE_FRAMES > ctx->max_sz)
+    return 1;
+  ctx->sz++;
+  buf = pop(ctx, &ctx->write_idx);
+
+  new_dimensions = width != buf->img.y_crop_width ||
+                   height != buf->img.y_crop_height ||
+                   uv_width != buf->img.uv_crop_width ||
+                   uv_height != buf->img.uv_crop_height;
+  larger_dimensions = width > buf->img.y_width ||
+                      height > buf->img.y_height ||
+                      uv_width > buf->img.uv_width ||
+                      uv_height > buf->img.uv_height;
+  assert(!larger_dimensions || new_dimensions);
+
+#if USE_PARTIAL_COPY
+  // TODO(jkoleszar): This is disabled for now, as
+  // vp10_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
+
+  // Only do this partial copy if the following conditions are all met:
+  // 1. Lookahead queue has has size of 1.
+  // 2. Active map is provided.
+  // 3. This is not a key frame, golden nor altref frame.
+  if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
+    for (row = 0; row < mb_rows; ++row) {
+      col = 0;
+
+      while (1) {
+        // Find the first active macroblock in this row.
+        for (; col < mb_cols; ++col) {
+          if (active_map[col])
+            break;
+        }
+
+        // No more active macroblock in this row.
+        if (col == mb_cols)
+          break;
+
+        // Find the end of active region in this row.
+        active_end = col;
+
+        for (; active_end < mb_cols; ++active_end) {
+          if (!active_map[active_end])
+            break;
+        }
+
+        // Only copy this active region.
+        vp10_copy_and_extend_frame_with_rect(src, &buf->img,
+                                            row << 4,
+                                            col << 4, 16,
+                                            (active_end - col) << 4);
+
+        // Start again from the end of this active region.
+        col = active_end;
+      }
+
+      active_map += mb_cols;
+    }
+  } else {
+#endif
+    if (larger_dimensions) {
+      YV12_BUFFER_CONFIG new_img;
+      memset(&new_img, 0, sizeof(new_img));
+      if (vpx_alloc_frame_buffer(&new_img,
+                                 width, height, subsampling_x, subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 use_highbitdepth,
+#endif
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 0)) {
+        // Nothing was stored: give the reserved slot back so that a stale
+        // frame is not later handed out as if it had been queued.
+        ctx->write_idx = saved_write_idx;
+        ctx->sz--;
+        return 1;
+      }
+      vpx_free_frame_buffer(&buf->img);
+      buf->img = new_img;
+    } else if (new_dimensions) {
+      buf->img.y_crop_width = src->y_crop_width;
+      buf->img.y_crop_height = src->y_crop_height;
+      buf->img.uv_crop_width = src->uv_crop_width;
+      buf->img.uv_crop_height = src->uv_crop_height;
+      buf->img.subsampling_x = src->subsampling_x;
+      buf->img.subsampling_y = src->subsampling_y;
+    }
+    // Partial copy not implemented yet
+    vp10_copy_and_extend_frame(src, &buf->img);
+#if USE_PARTIAL_COPY
+  }
+#endif
+
+  buf->ts_start = ts_start;
+  buf->ts_end = ts_end;
+  buf->flags = flags;
+  return 0;
+}
+
+
+/* Remove and return the oldest queued entry, or NULL when nothing is handed
+ * out. Without drain an entry is only released once the queue holds exactly
+ * max_sz - MAX_PRE_FRAMES entries; with drain any queued entry is released.
+ * A NULL or empty context yields NULL. */
+struct lookahead_entry *vp10_lookahead_pop(struct lookahead_ctx *ctx,
+                                          int drain) {
+  struct lookahead_entry *entry;
+
+  if (!ctx || !ctx->sz)
+    return NULL;
+  if (!drain && ctx->sz != ctx->max_sz - MAX_PRE_FRAMES)
+    return NULL;
+
+  entry = pop(ctx, &ctx->read_idx);
+  ctx->sz--;
+  return entry;
+}
+
+
+/* Return the entry at an offset from the read position without removing it.
+ * index >= 0 looks forward and is valid while index < ctx->sz; index < 0
+ * looks back at already consumed entries, at most MAX_PRE_FRAMES of them.
+ * Returns NULL when the offset is out of range. */
+struct lookahead_entry *vp10_lookahead_peek(struct lookahead_ctx *ctx,
+                                           int index) {
+  int slot = index;
+
+  if (index >= 0) {
+    // Forward peek
+    if (index >= (int)ctx->sz)
+      return NULL;
+    slot += ctx->read_idx;
+    if (slot >= (int)ctx->max_sz)
+      slot -= ctx->max_sz;
+    return ctx->buf + slot;
+  }
+
+  // Backward peek
+  if (-index > MAX_PRE_FRAMES)
+    return NULL;
+  slot += ctx->read_idx;
+  if (slot < 0)
+    slot += ctx->max_sz;
+  return ctx->buf + slot;
+}
+
+/* Report how many entries are currently queued. */
+unsigned int vp10_lookahead_depth(struct lookahead_ctx *ctx) {
+  const unsigned int queued = ctx->sz;
+
+  return queued;
+}
diff --git a/libs/libvpx/vp10/encoder/lookahead.h b/libs/libvpx/vp10/encoder/lookahead.h
new file mode 100644
index 0000000000..22429aeeb0
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/lookahead.h
@@ -0,0 +1,119 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_LOOKAHEAD_H_
+#define VP10_ENCODER_LOOKAHEAD_H_
+
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LAG_BUFFERS 25
+
+struct lookahead_entry {
+  YV12_BUFFER_CONFIG  img;       /* Copy of the enqueued source frame */
+  int64_t             ts_start;  /* Timestamp for the start of the frame */
+  int64_t             ts_end;    /* Timestamp for the end of the frame */
+  unsigned int        flags;     /* Flags passed in when it was pushed */
+};
+
+// The max of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
+
+struct lookahead_ctx {
+  unsigned int max_sz;         /* Absolute size of the queue */
+  unsigned int sz;             /* Number of buffers currently in the queue */
+  unsigned int read_idx;       /* Read index */
+  unsigned int write_idx;      /* Write index */
+  struct lookahead_entry *buf; /* Buffer list */
+};
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ */
+struct lookahead_ctx *vp10_lookahead_init(unsigned int width,
+                                         unsigned int height,
+                                         unsigned int subsampling_x,
+                                         unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                         int use_highbitdepth,
+#endif
+                                         unsigned int depth);
+
+
+/**\brief Destroys the lookahead stage
+ */
+void vp10_lookahead_destroy(struct lookahead_ctx *ctx);
+
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * Note that this function takes no active_map argument, so callers cannot
+ * restrict the copy to active macroblocks through this interface.
+ *
+ * \param[in] ctx               Pointer to the lookahead context
+ * \param[in] src               Pointer to the image to enqueue
+ * \param[in] ts_start          Timestamp for the start of this frame
+ * \param[in] ts_end            Timestamp for the end of this frame
+ * \param[in] use_highbitdepth  Present only with CONFIG_VP9_HIGHBITDEPTH
+ * \param[in] flags             Flags set on this frame
+ */
+int vp10_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+                       int64_t ts_start, int64_t ts_end,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       unsigned int flags);
+
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] drain     Flag indicating the buffer should be drained
+ *                      (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ */
+struct lookahead_entry *vp10_lookahead_pop(struct lookahead_ctx *ctx,
+                                          int drain);
+
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] index     Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ */
+struct lookahead_entry *vp10_lookahead_peek(struct lookahead_ctx *ctx,
+                                           int index);
+
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ */
+unsigned int vp10_lookahead_depth(struct lookahead_ctx *ctx);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_LOOKAHEAD_H_
diff --git a/libs/libvpx/vp10/encoder/mbgraph.c b/libs/libvpx/vp10/encoder/mbgraph.c
new file mode 100644
index 0000000000..ed0f53909f
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/mbgraph.c
@@ -0,0 +1,417 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/system_state.h"
+#include "vp10/encoder/segmentation.h"
+#include "vp10/encoder/mcomp.h"
+#include "vp10/common/blockd.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/common/reconintra.h"
+
+
+static unsigned int do_16x16_motion_iteration(VP10_COMP *cpi,
+                                              const MV *ref_mv,
+                                              MV *dst_mv,
+                                              int mb_row,
+                                              int mb_col) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  const vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+  // One search pass seeded with ref_mv: hex search, sub-pel step, then SAD.
+  const int tmp_col_min = x->mv_col_min;  // MV limits, restored before return
+  const int tmp_col_max = x->mv_col_max;
+  const int tmp_row_min = x->mv_row_min;
+  const int tmp_row_max = x->mv_row_max;
+  MV ref_full;
+  int cost_list[5];
+
+  // Step parameter from the speed features, capped at MAX_MVSEARCH_STEPS - 2
+  int step_param = mv_sf->reduce_first_step_size;
+  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+  // Narrow the MV limits to the window around ref_mv (undone further down).
+  vp10_set_mv_search_range(x, ref_mv);
+  // Search start position: ref_mv with its low 3 bits shifted out.
+  ref_full.col = ref_mv->col >> 3;
+  ref_full.row = ref_mv->row >> 3;
+
+  /*cpi->sf.search_method == HEX*/
+  vp10_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
+                 cond_cost_list(cpi, cost_list),
+                 &v_fn_ptr, 0, ref_mv, dst_mv);
+
+  // Sub-pixel refinement of dst_mv; it always runs because the threshold
+  // if (bestsme > error_thresh && bestsme < INT_MAX)
+  {
+    int distortion;
+    unsigned int sse;
+    cpi->find_fractional_mv_step(
+        x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
+        &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
+        cond_cost_list(cpi, cost_list),
+        NULL, NULL,
+        &distortion, &sse, NULL, 0, 0);
+  }
+  // Record the MV on the block's mode info, then build its inter prediction.
+  xd->mi[0]->mbmi.mode = NEWMV;
+  xd->mi[0]->mbmi.mv[0].as_mv = *dst_mv;
+
+  vp10_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
+
+  /* restore UMV window */
+  x->mv_col_min = tmp_col_min;
+  x->mv_col_max = tmp_col_max;
+  x->mv_row_min = tmp_row_min;
+  x->mv_row_max = tmp_row_max;
+  // Result: 16x16 SAD of source vs. xd's dst (presumably the new prediction).
+  return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                      xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+}
+
+static int do_16x16_motion_search(VP10_COMP *cpi, const MV *ref_mv,
+                                  int_mv *dst_mv, int mb_row, int mb_col) {
+  // Picks the lowest-SAD candidate among the zero MV, a search seeded with
+  // ref_mv and, when ref_mv is non-zero, a search seeded with (0,0).
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const MV zero_mv = {0, 0};
+  unsigned int best_err, cand_err;
+  MV cand_mv;
+
+  // Baseline candidate: the zero MV, scored as the SAD of the source block
+  // against the block at the start of the pre[0] buffer.
+  // FIXME should really use something like near/nearest MV and/or MV prediction
+  best_err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                          xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+  dst_mv->as_int = 0;
+
+  // Search seeded with the caller's reference MV; adopt its result only when
+  // it is strictly better than the current best.
+  cand_err = do_16x16_motion_iteration(cpi, ref_mv, &cand_mv, mb_row, mb_col);
+  if (cand_err < best_err) {
+    best_err = cand_err;
+    dst_mv->as_mv = cand_mv;
+  }
+
+  // If the reference MV is not (0,0), also try a search seeded with (0,0).
+  if (ref_mv->row != 0 || ref_mv->col != 0) {
+    cand_err = do_16x16_motion_iteration(cpi, &zero_mv, &cand_mv,
+                                         mb_row, mb_col);
+    if (cand_err < best_err) {
+      best_err = cand_err;
+      dst_mv->as_mv = cand_mv;
+    }
+  }
+
+  return best_err;
+}
+
+static int do_16x16_zerozero_search(VP10_COMP *cpi, int_mv *dst_mv) {
+  // Evaluates only the zero MV: the result is the 16x16 SAD between the
+  // source block and the block at the start of the pre[0] buffer.
+  // FIXME should really use something like near/nearest MV and/or MV prediction
+  const MACROBLOCK *const mb = &cpi->td.mb;
+  const MACROBLOCKD *const mbd = &mb->e_mbd;
+  const unsigned int zero_mv_sad =
+      vpx_sad16x16(mb->plane[0].src.buf, mb->plane[0].src.stride,
+                   mbd->plane[0].pre[0].buf, mbd->plane[0].pre[0].stride);
+
+  dst_mv->as_int = 0;
+
+  return zero_mv_sad;
+}
+static int find_best_16x16_intra(VP10_COMP *cpi, PREDICTION_MODE *pbest_mode) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  PREDICTION_MODE winner = -1, candidate;
+  unsigned int lowest_sad = INT_MAX;
+
+  // Rough estimate only: each mode from DC_PRED through TM_PRED is predicted
+  // as a single 16x16 block and scored by SAD; 4x4 is intentionally not tried.
+  for (candidate = DC_PRED; candidate <= TM_PRED; candidate++) {
+    unsigned int sad;
+
+    xd->mi[0]->mbmi.mode = candidate;
+    vp10_predict_intra_block(xd, 2, 2, TX_16X16, candidate,
+                            x->plane[0].src.buf, x->plane[0].src.stride,
+                            xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+                            0, 0, 0);
+    sad = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                       xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+
+    // Strict comparison: on a tie the earlier mode is kept.
+    if (sad >= lowest_sad)
+      continue;
+    lowest_sad = sad;
+    winner = candidate;
+  }
+
+  if (pbest_mode)
+    *pbest_mode = winner;
+
+  return lowest_sad;
+}
+
+static void update_mbgraph_mb_stats
+(
+  VP10_COMP *cpi,
+  MBGRAPH_MB_STATS *stats,          // output: errors and MV/mode per reference
+  YV12_BUFFER_CONFIG *buf,          // source frame
+  int mb_y_offset,                  // luma offset, applied to every buffer
+  YV12_BUFFER_CONFIG *golden_ref,   // may be NULL
+  const MV *prev_golden_ref_mv,     // seed MV for the golden search
+  YV12_BUFFER_CONFIG *alt_ref,      // may be NULL
+  int mb_row,
+  int mb_col
+) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int intra_error;
+  VP10_COMMON *cm = &cpi->common;
+  // Fills *stats with the intra, golden and alt-ref errors of one macroblock.
+  // FIXME in practice we're completely ignoring chroma here
+  x->plane[0].src.buf = buf->y_buffer + mb_y_offset;
+  x->plane[0].src.stride = buf->y_stride;
+  // Predictions are written into the new frame buffer at the same offset.
+  xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset;
+  xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride;
+
+  // do intra 16x16 prediction
+  intra_error = find_best_16x16_intra(cpi,
+                                      &stats->ref[INTRA_FRAME].m.mode);
+  if (intra_error <= 0)  // keep the stored intra error at 1 or more
+    intra_error = 1;
+  stats->ref[INTRA_FRAME].err = intra_error;
+
+  // Golden frame MV search if golden_ref is set; else INT_MAX error, zero MV
+  if (golden_ref) {
+    int g_motion_error;
+    xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset;
+    xd->plane[0].pre[0].stride = golden_ref->y_stride;
+    g_motion_error = do_16x16_motion_search(cpi,
+                                            prev_golden_ref_mv,
+                                            &stats->ref[GOLDEN_FRAME].m.mv,
+                                            mb_row, mb_col);
+    stats->ref[GOLDEN_FRAME].err = g_motion_error;
+  } else {
+    stats->ref[GOLDEN_FRAME].err = INT_MAX;
+    stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
+  }
+
+  // Alt-ref frame, if alt_ref is set: only the zero MV is scored (there is no
+  // motion search here); otherwise record an INT_MAX error and a zero MV.
+  if (alt_ref) {
+    int a_motion_error;
+    xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
+    xd->plane[0].pre[0].stride = alt_ref->y_stride;
+    a_motion_error = do_16x16_zerozero_search(cpi,
+                                              &stats->ref[ALTREF_FRAME].m.mv);
+
+    stats->ref[ALTREF_FRAME].err = a_motion_error;
+  } else {
+    stats->ref[ALTREF_FRAME].err = INT_MAX;
+    stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
+  }
+}
+
+static void update_mbgraph_frame_stats(VP10_COMP *cpi,
+                                       MBGRAPH_FRAME_STATS *stats,
+                                       YV12_BUFFER_CONFIG *buf,
+                                       YV12_BUFFER_CONFIG *golden_ref,
+                                       YV12_BUFFER_CONFIG *alt_ref) {
+  // Fills stats->mb_stats[] for one source frame: every 16x16 macroblock of
+  // buf is scored against intra prediction, golden_ref and alt_ref by
+  // update_mbgraph_mb_stats(). golden_ref and alt_ref may each be NULL; this
+  // function only forwards them and never dereferences them itself.
+  //
+  // NOTE(review): the callee addresses all three buffers with the single
+  // offset computed here from buf->y_stride, so the reference frames are
+  // presumably expected to share buf's luma stride - confirm.
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  VP10_COMMON *const cm = &cpi->common;
+  int mb_col, mb_row, offset = 0;
+  int mb_y_offset = 0;
+  MV gld_top_mv = {0, 0};
+  MODE_INFO mi_local;
+
+  vp10_zero(mi_local);
+  // Set up limit values for motion vectors to prevent them extending outside
+  // the UMV borders.
+  x->mv_row_min     = -BORDER_MV_PIXELS_B16;
+  x->mv_row_max     = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16;
+  xd->up_available  = 0;
+  xd->plane[0].dst.stride  = buf->y_stride;
+  xd->plane[0].pre[0].stride  = buf->y_stride;
+  xd->plane[1].dst.stride = buf->uv_stride;
+  xd->mi[0] = &mi_local;
+  mi_local.mbmi.sb_type = BLOCK_16X16;
+  mi_local.mbmi.ref_frame[0] = LAST_FRAME;
+  mi_local.mbmi.ref_frame[1] = NONE;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    MV gld_left_mv = gld_top_mv;  // each row starts from the row above's MV
+    int mb_y_in_offset  = mb_y_offset;
+
+    // Set up limit values for motion vectors to prevent them extending outside
+    // the UMV borders.
+    x->mv_col_min      = -BORDER_MV_PIXELS_B16;
+    x->mv_col_max      = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16;
+    xd->left_available = 0;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
+
+      update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
+                              golden_ref, &gld_left_mv, alt_ref,
+                              mb_row, mb_col);
+      gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv;
+      if (mb_col == 0) {  // first column's MV seeds the next row
+        gld_top_mv = gld_left_mv;
+      }
+      xd->left_available = 1;
+      mb_y_in_offset    += 16;
+      x->mv_col_min     -= 16;
+      x->mv_col_max     -= 16;
+    }
+    xd->up_available = 1;
+    mb_y_offset     += buf->y_stride * 16;
+    x->mv_row_min   -= 16;
+    x->mv_row_max   -= 16;
+    offset          += cm->mb_cols;
+  }
+}
+
+// void separate_arf_mbs_byzz
+static void separate_arf_mbs(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  int mb_col, mb_row, offset, i;
+  int mi_row, mi_col;
+  int ncnt[4] = { 0 };  // [0]/[1]: MI units placed in segment 0/1
+  int n_frames = cpi->mbgraph_n_frames;
+  // Builds cpi->segmentation_map from the stats in cpi->mbgraph_stats.
+  int *arf_not_zz;  // per-MB count of frames that failed the alt-ref test
+
+  CHECK_MEM_ERROR(cm, arf_not_zz,
+                  vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz),
+                             1));
+
+  // We are not interested in results beyond the alt ref itself.
+  if (n_frames > cpi->rc.frames_till_gf_update_due)
+    n_frames = cpi->rc.frames_till_gf_update_due;
+
+  // Count, per MB, the analysed frames in which the alt-ref test fails
+  for (i = n_frames - 1; i >= 0; i--) {
+    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+
+    for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
+         offset += cm->mb_cols, mb_row++) {
+      for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+        MBGRAPH_MB_STATS *mb_stats = &frame_stats->mb_stats[offset + mb_col];
+
+        int altref_err = mb_stats->ref[ALTREF_FRAME].err;
+        int intra_err  = mb_stats->ref[INTRA_FRAME ].err;
+        int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
+
+        // Fails if altref err exceeds 1000, the intra err or the golden err.
+        if (altref_err > 1000 ||
+            altref_err > intra_err ||
+            altref_err > golden_err) {
+          arf_not_zz[offset + mb_col]++;
+        }
+      }
+    }
+  }
+
+  // arf_not_zz is indexed by MB, but this loop is indexed by MI to avoid out
+  // of bound access in segmentation_map
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+      // If any of the blocks in the sequence failed then the MB
+      // goes in segment 0
+      if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) {
+        ncnt[0]++;
+        cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0;
+      } else {
+        cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 1;
+        ncnt[1]++;
+      }
+    }
+  }
+
+  // Only bother with segmentation if over 10% of the MBs in static segment
+  // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
+  if (1) {  // the threshold test above is disabled: always taken
+    // Note % of blocks that are marked as static
+    if (cm->MBs)
+      cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
+
+    // This error case should not be reachable as this function should
+    // never be called with the common data structure uninitialized.
+    else
+      cpi->static_mb_pct = 0;
+
+    vp10_enable_segmentation(&cm->seg);
+  } else {
+    cpi->static_mb_pct = 0;
+    vp10_disable_segmentation(&cm->seg);
+  }
+
+  // Free locally allocated storage
+  vpx_free(arf_not_zz);
+}
+
+void vp10_update_mbgraph_stats(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  int i, n_frames = vp10_lookahead_depth(cpi->lookahead);
+  YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+  // Gathers per-MB stats for the queued lookahead frames, then segments them.
+  assert(golden_ref != NULL);
+
+  // we need to look ahead beyond where the ARF transitions into
+  // being a GF - so exit if we don't look ahead beyond that
+  if (n_frames <= cpi->rc.frames_till_gf_update_due)
+    return;
+  // Analyse at most MAX_LAG_BUFFERS frames.
+  if (n_frames > MAX_LAG_BUFFERS)
+    n_frames = MAX_LAG_BUFFERS;
+  // Clear the per-frame stats that are about to be filled.
+  cpi->mbgraph_n_frames = n_frames;
+  for (i = 0; i < n_frames; i++) {
+    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+    memset(frame_stats->mb_stats, 0,
+           cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
+  }
+
+  // do motion search to find contribution of each reference to data
+  // later on in this GF group
+  // FIXME really, the GF/last MC search should be done forward, and
+  // the ARF MC search backwards, to get optimal results for MV caching
+  for (i = 0; i < n_frames; i++) {
+    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+    struct lookahead_entry *q_cur = vp10_lookahead_peek(cpi->lookahead, i);
+
+    assert(q_cur != NULL);
+    // cpi->Source is passed in the alt_ref slot.
+    update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img,
+                               golden_ref, cpi->Source);
+  }
+
+  vpx_clear_system_state();
+  // Turn the gathered stats into the segmentation map.
+  separate_arf_mbs(cpi);
+}
diff --git a/libs/libvpx/vp10/encoder/mbgraph.h b/libs/libvpx/vp10/encoder/mbgraph.h
new file mode 100644
index 0000000000..3408464c55
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/mbgraph.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_MBGRAPH_H_
+#define VP10_ENCODER_MBGRAPH_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  struct {
+    int err;                 // error recorded for this reference
+    union {
+      int_mv mv;             // motion vector (GOLDEN_FRAME / ALTREF_FRAME)
+      PREDICTION_MODE mode;  // best intra mode (INTRA_FRAME)
+    } m;
+  } ref[MAX_REF_FRAMES];     // indexed by reference frame
+} MBGRAPH_MB_STATS;
+
+typedef struct {
+  MBGRAPH_MB_STATS *mb_stats;  // per-MB stats, row-major over mb_cols
+} MBGRAPH_FRAME_STATS;
+
+struct VP10_COMP;
+
+void vp10_update_mbgraph_stats(struct VP10_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_MBGRAPH_H_
diff --git a/libs/libvpx/vp10/encoder/mcomp.c b/libs/libvpx/vp10/encoder/mcomp.c
new file mode 100644
index 0000000000..2c1c591c50
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/mcomp.c
@@ -0,0 +1,2498 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp10/common/common.h"
+#include "vp10/common/reconinter.h"
+
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/mcomp.h"
+
+// #define NEW_DIAMOND_SEARCH
+
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+                                             const MV *mv) {
+  return buf->buf + (mv->row * buf->stride + mv->col);  // row-major offset
+}
+
+void vp10_set_mv_search_range(MACROBLOCK *x, const MV *mv) {
+  // Window of +/-MAX_FULL_PEL_VAL around mv >> 3; the lower edges move up by
+  // one when the low 3 bits of the corresponding mv component are non-zero.
+  const int col_center = mv->col >> 3;
+  const int row_center = mv->row >> 3;
+  const int col_lo = col_center - MAX_FULL_PEL_VAL + ((mv->col & 7) ? 1 : 0);
+  const int row_lo = row_center - MAX_FULL_PEL_VAL + ((mv->row & 7) ? 1 : 0);
+  const int col_hi = col_center + MAX_FULL_PEL_VAL;
+  const int row_hi = row_center + MAX_FULL_PEL_VAL;
+
+  // Keep the window strictly inside (MV_LOW >> 3, MV_UPP >> 3).
+  const int col_min = VPXMAX(col_lo, (MV_LOW >> 3) + 1);
+  const int row_min = VPXMAX(row_lo, (MV_LOW >> 3) + 1);
+  const int col_max = VPXMIN(col_hi, (MV_UPP >> 3) - 1);
+  const int row_max = VPXMIN(row_hi, (MV_UPP >> 3) - 1);
+
+  // Intersect with the current UMV window: the limits can only tighten.
+  x->mv_col_min = VPXMAX(x->mv_col_min, col_min);
+  x->mv_col_max = VPXMIN(x->mv_col_max, col_max);
+  x->mv_row_min = VPXMAX(x->mv_row_min, row_min);
+  x->mv_row_max = VPXMIN(x->mv_row_max, row_max);
+}
+
+int vp10_init_search_range(int size) {
+  // Smallest shift sr with (max(16, size) << sr) >= MAX_FULL_PEL_VAL; the
+  // result is then capped at MAX_MVSEARCH_STEPS - 2.
+  const int min_size = VPXMAX(16, size);
+  int sr;
+
+  for (sr = 0; (min_size << sr) < MAX_FULL_PEL_VAL; sr++) {
+  }
+
+  return VPXMIN(sr, MAX_MVSEARCH_STEPS - 2);
+}
+
+static INLINE int mv_cost(const MV *mv,
+                          const int *joint_cost, int *const comp_cost[2]) {
+  return joint_cost[vp10_get_mv_joint(mv)] +  // joint term, then row and col
+             comp_cost[0][mv->row] + comp_cost[1][mv->col];  // component terms
+}
+
+int vp10_mv_bit_cost(const MV *mv, const MV *ref,
+                    const int *mvjcost, int *mvcost[2], int weight) {
+  // Cost of the (mv - ref) difference, times weight, rounded by a 7-bit shift.
+  const MV delta = { mv->row - ref->row, mv->col - ref->col };
+  return ROUND_POWER_OF_TWO(mv_cost(&delta, mvjcost, mvcost) * weight, 7);
+}
+
+static int mv_err_cost(const MV *mv, const MV *ref,
+                       const int *mvjcost, int *mvcost[2],
+                       int error_per_bit) {
+  // Cost of (mv - ref) times error_per_bit, rounded by a 13-bit shift.
+  MV delta;
+  if (!mvcost) return 0;  // no cost table: the cost term is zero
+  delta.row = mv->row - ref->row;
+  delta.col = mv->col - ref->col;
+  return ROUND_POWER_OF_TWO(mv_cost(&delta, mvjcost, mvcost) *
+                                error_per_bit, 13);
+}
+
+static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
+                          int error_per_bit) {
+  // Cost of (mv - ref) from x's SAD cost tables, times error_per_bit, >> 8.
+  const MV delta = { mv->row - ref->row, mv->col - ref->col };
+  const int bits = mv_cost(&delta, x->nmvjointsadcost, x->nmvsadcost);
+  return ROUND_POWER_OF_TWO(bits * error_per_bit, 8);
+}
+
+void vp10_init_dsmotion_compensation(search_site_config *cfg, int stride) {
+  // Site 0 is the zero offset; every step then adds 4 sites at row -/+ len
+  // and col -/+ len, with len halving from MAX_FIRST_STEP while positive.
+  search_site *site = cfg->ss;
+  int len;
+  site->mv.col = site->mv.row = 0;
+  site->offset = 0;
+  ++site;
+  for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
+    const MV step_mvs[] = {{-len, 0}, {len, 0}, {0, -len}, {0, len}};
+    int k;
+    for (k = 0; k < 4; ++k, ++site) {
+      site->mv = step_mvs[k];
+      site->offset = site->mv.row * stride + site->mv.col;
+    }
+  }
+
+  cfg->ss_count = (int)(site - cfg->ss);
+  cfg->searches_per_step = 4;
+}
+
+void vp10_init3smotion_compensation(search_site_config *cfg, int stride) {
+  // Site 0 is the zero offset; every step then adds 8 sites (4 axis-aligned
+  // and 4 diagonal), with len halving from MAX_FIRST_STEP while positive.
+  search_site *site = cfg->ss;
+  int len;
+  site->mv.col = site->mv.row = 0;
+  site->offset = 0;
+  ++site;
+  for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
+    const MV step_mvs[8] = {
+      {-len,  0  }, {len,  0  }, { 0,   -len}, {0,    len},
+      {-len, -len}, {-len, len}, {len,  -len}, {len,  len}
+    };
+    int k;
+    for (k = 0; k < 8; ++k, ++site) {
+      site->mv = step_mvs[k];
+      site->offset = site->mv.row * stride + site->mv.col;
+    }
+  }
+
+  cfg->ss_count = (int)(site - cfg->ss);
+  cfg->searches_per_step = 8;
+}
+
+/*
+ * To avoid the penalty for crossing cache-line read, preload the reference
+ * area in a small buffer, which is aligned to make sure there won't be crossing
+ * cache-line read while reading from this buffer. This reduced the cpu
+ * cycles spent on reading ref data in sub-pixel filter functions.
+ * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
+ * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
+ * could reduce the area.
+ */
+
+/* estimated cost of a motion vector (r,c) */
+#define MVC(r, c)                                       \
+    (mvcost ?                                           \
+     ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +         \
+       mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
+      error_per_bit + 4096) >> 13 : 0)
+
+
+// Low 3 bits of a motion vector component: the offset given to svf/svaf.
+static INLINE int sp(int x) {
+  return x & 7;
+}
+
+static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
+  return &buf[(r >> 3) * stride + (c >> 3)];  // r, c with low 3 bits dropped
+}
+
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    if (second_pred == NULL)                                           \
+      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+                             src_stride, &sse);                        \
+    else                                                               \
+      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+                              z, src_stride, &sse, second_pred);       \
+    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+
+#define FIRST_LEVEL_CHECKS                              \
+  {                                                     \
+    unsigned int left, right, up, down, diag;           \
+    CHECK_BETTER(left, tr, tc - hstep);                 \
+    CHECK_BETTER(right, tr, tc + hstep);                \
+    CHECK_BETTER(up, tr - hstep, tc);                   \
+    CHECK_BETTER(down, tr + hstep, tc);                 \
+    whichdir = (left < right ? 0 : 1) +                 \
+               (up < down ? 0 : 2);                     \
+    switch (whichdir) {                                 \
+      case 0:                                           \
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);     \
+        break;                                          \
+      case 1:                                           \
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);     \
+        break;                                          \
+      case 2:                                           \
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);     \
+        break;                                          \
+      case 3:                                           \
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);     \
+        break;                                          \
+    }                                                   \
+  }
+
+#define SECOND_LEVEL_CHECKS                             \
+  {                                                     \
+    int kr, kc;                                         \
+    unsigned int second;                                \
+    if (tr != br && tc != bc) {                         \
+      kr = br - tr;                                     \
+      kc = bc - tc;                                     \
+      CHECK_BETTER(second, tr + kr, tc + 2 * kc);       \
+      CHECK_BETTER(second, tr + 2 * kr, tc + kc);       \
+    } else if (tr == br && tc != bc) {                  \
+      kc = bc - tc;                                     \
+      CHECK_BETTER(second, tr + hstep, tc + 2 * kc);    \
+      CHECK_BETTER(second, tr - hstep, tc + 2 * kc);    \
+      switch (whichdir) {                               \
+        case 0:                                         \
+        case 1:                                         \
+          CHECK_BETTER(second, tr + hstep, tc + kc);    \
+          break;                                        \
+        case 2:                                         \
+        case 3:                                         \
+          CHECK_BETTER(second, tr - hstep, tc + kc);    \
+          break;                                        \
+      }                                                 \
+    } else if (tr != br && tc == bc) {                  \
+      kr = br - tr;                                     \
+      CHECK_BETTER(second, tr + 2 * kr, tc + hstep);    \
+      CHECK_BETTER(second, tr + 2 * kr, tc - hstep);    \
+      switch (whichdir) {                               \
+        case 0:                                         \
+        case 2:                                         \
+          CHECK_BETTER(second, tr + kr, tc + hstep);    \
+          break;                                        \
+        case 1:                                         \
+        case 3:                                         \
+          CHECK_BETTER(second, tr + kr, tc - hstep);    \
+          break;                                        \
+      }                                                 \
+    }                                                   \
+  }
+
+// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST is a rewrite of
+// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
+// later in the same way.
+#define SECOND_LEVEL_CHECKS_BEST                        \
+  {                                                     \
+    unsigned int second;                                \
+    int br0 = br;                                       \
+    int bc0 = bc;                                       \
+    assert(tr == br || tc == bc);                       \
+    if (tr == br && tc != bc) {                         \
+      kc = bc - tc;                                     \
+    } else if (tr != br && tc == bc) {                  \
+      kr = br - tr;                                     \
+    }                                                   \
+    CHECK_BETTER(second, br0 + kr, bc0);                \
+    CHECK_BETTER(second, br0, bc0 + kc);                \
+    if (br0 != br || bc0 != bc) {                       \
+      CHECK_BETTER(second, br0 + kr, bc0 + kc);         \
+    }                                                   \
+  }
+
+#define SETUP_SUBPEL_SEARCH /* declares the working locals of the sub-pel searches; expects x, bestmv, ref_mv, iters_per_step in scope */ \
+  const uint8_t *const z = x->plane[0].src.buf;                            \
+  const int src_stride = x->plane[0].src.stride;                           \
+  const MACROBLOCKD *xd = &x->e_mbd;                                       \
+  unsigned int besterr = INT_MAX;                                          \
+  unsigned int sse;                                                        \
+  unsigned int whichdir;                                                   \
+  int thismse;                                                             \
+  const unsigned int halfiters = iters_per_step; /* all three stages share one iteration count */ \
+  const unsigned int quarteriters = iters_per_step;                        \
+  const unsigned int eighthiters = iters_per_step;                         \
+  const int y_stride = xd->plane[0].pre[0].stride;                         \
+  const int offset = bestmv->row * y_stride + bestmv->col; /* computed before bestmv is multiplied by 8 below */ \
+  const uint8_t *const y = xd->plane[0].pre[0].buf;                        \
+                                                                           \
+  int rr = ref_mv->row;                                                    \
+  int rc = ref_mv->col;                                                    \
+  int br = bestmv->row * 8; /* best row/col so far, in 1/8 of bestmv's input unit */ \
+  int bc = bestmv->col * 8;                                                \
+  int hstep = 4; /* first step: 4 of those 1/8 units */ \
+  const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); /* window: x's MV limits * 8, clipped to ref_mv +/- MV_MAX */ \
+  const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);        \
+  const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);        \
+  const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);        \
+  int tr = br; /* position currently being tested */ \
+  int tc = bc;                                                             \
+                                                                           \
+  bestmv->row *= 8; /* the caller's MV is rescaled in place */ \
+  bestmv->col *= 8;
+
+// Scores the starting position of a sub-pel search.
+// Measures the variance (vfp->vf) of the prediction at y + offset against
+// src; when second_pred is non-NULL the prediction is first averaged with it.
+// Stores the variance in *distortion (vfp->vf also receives sse1) and
+// returns the variance plus mv_err_cost() of bestmv relative to ref_mv.
+static unsigned int setup_center_error(const MACROBLOCKD *xd,
+                                       const MV *bestmv, const MV *ref_mv,
+                                       int error_per_bit,
+                                       const vp9_variance_fn_ptr_t *vfp,
+                                       const uint8_t *const src,
+                                       const int src_stride,
+                                       const uint8_t *const y, int y_stride,
+                                       const uint8_t *second_pred,
+                                       int w, int h, int offset,
+                                       int *mvjcost, int *mvcost[2],
+                                       unsigned int *sse1, int *distortion) {
+  unsigned int center_var;
+  if (second_pred == NULL) {
+    // Single prediction: compare the reference block directly.
+    center_var = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
+  } else {
+    // Compound prediction: build the averaged block in a 64x64 scratch buffer.
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
+      vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+                               y_stride);
+      center_var = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src,
+                           src_stride, sse1);
+    } else {
+      DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+      vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+      center_var = vfp->vf(comp_pred, w, src, src_stride, sse1);
+    }
+#else
+    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+    vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+    center_var = vfp->vf(comp_pred, w, src, src_stride, sse1);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+#if !CONFIG_VP9_HIGHBITDEPTH
+  (void) xd;
+#endif
+  *distortion = center_var;
+  return center_var +
+         mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+}
+
+static INLINE int divide_and_round(const int n, const int d) {
+  return (n + (((n < 0) != (d < 0)) ? -(d / 2) : d / 2)) / d;  // n / d to nearest
+}
+
+// Nonzero iff cost_list[0] is strictly below each of cost_list[1..4].
+static INLINE int is_cost_list_wellbehaved(int *cost_list) {
+  const int center = cost_list[0];
+  return center < cost_list[1] && center < cost_list[2] &&
+         center < cost_list[3] && center < cost_list[4];
+}
+
+// Estimates where the cost surface has its minimum, as offsets (*ir, *ic) in
+// units of 1/2^bits, from five costs S0..S4 = cost_list[0..4] sampled at
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0).
+// Model: S = A(x - x0)^2 + B(y - y0)^2 + C, whose minimum lies at
+//   x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),  y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// The code is an integer version of those formulas, rounded to nearest.
+// Both denominators must be nonzero (true when S0 is strictly the smallest).
+static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) {
+  // Folding the factor 1/2 into the 2^bits scale leaves 2^(bits - 1).
+  const int half_scale = 1 << (bits - 1);
+  *ic = divide_and_round((cost_list[1] - cost_list[3]) * half_scale,
+                         cost_list[1] - 2 * cost_list[0] + cost_list[3]);
+  *ir = divide_and_round((cost_list[4] - cost_list[2]) * half_scale,
+                         cost_list[4] - 2 * cost_list[0] + cost_list[2]);
+}
+
+int vp10_find_best_sub_pixel_tree_pruned_evenmore(  // refines bestmv in place and returns the cost of the result
+    const MACROBLOCK *x,  // source/prediction buffers and MV limits are read from here
+    MV *bestmv, const MV *ref_mv,  // bestmv: start position in; result out, in units 8x finer than the input
+    int allow_hp,  // with vp10_use_mv_hp(ref_mv) and forced_stop == 0, enables the last stage
+    int error_per_bit,
+    const vp9_variance_fn_ptr_t *vfp,
+    int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
+    int iters_per_step,  // values > 1 enable the SECOND_LEVEL_CHECKS passes
+    int *cost_list,  // optional five costs, center first; may be NULL
+    int *mvjcost, int *mvcost[2],
+    int *distortion,
+    unsigned int *sse1,
+    const uint8_t *second_pred,  // optional second predictor; NULL when unused
+    int w, int h) {
+  SETUP_SUBPEL_SEARCH;  // declares br/bc, tr/tc, hstep, besterr, bounds, ... and multiplies *bestmv by 8
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+  (void) halfiters;  // the (void) casts have no effect; presumably kept to silence unused-variable warnings
+  (void) quarteriters;
+  (void) eighthiters;
+  (void) whichdir;
+  (void) allow_hp;
+  (void) forced_stop;
+  (void) hstep;
+
+  if (cost_list &&
+      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX &&
+      is_cost_list_wellbehaved(cost_list)) {  // all five costs valid and the center strictly lowest
+    int ir, ic;
+    unsigned int minpt;
+    get_cost_surf_min(cost_list, &ir, &ic, 2);  // bits == 2: ir/ic come back in quarter units
+    if (ir != 0 || ic != 0) {
+      CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);  // * 2 converts quarter units to the eighth units of tr/tc
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;  // macro defined earlier in the file, outside this view
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+
+    tr = br;  // restart the next stage from the current best
+    tc = bc;
+
+    // Each subsequent iteration checks at least one point in common with
+    // the last iteration could be 2 ( if diag selected) 1/4 pel
+    // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+    if (forced_stop != 2) {
+      hstep >>= 1;  // 4 -> 2
+      FIRST_LEVEL_CHECKS;
+      if (quarteriters > 1) {
+        SECOND_LEVEL_CHECKS;
+      }
+    }
+  }
+
+  tr = br;
+  tc = bc;
+
+  if (allow_hp && vp10_use_mv_hp(ref_mv) && forced_stop == 0) {
+    hstep >>= 1;  // NOTE(review): after the cost_list path hstep is still 4 here, so this stage steps by 2, not 1 - confirm intended
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  bestmv->row = br;  // publish the refined position
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;  // a component ended more than MAX_FULL_PEL_VAL << 3 away from ref_mv
+
+  return besterr;
+}
+
+int vp10_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,  // refines bestmv in place; returns the cost of the result
+                                             MV *bestmv, const MV *ref_mv,  // bestmv: start in; result out, in units 8x finer
+                                             int allow_hp,  // with vp10_use_mv_hp(ref_mv) and forced_stop == 0, enables the last stage
+                                             int error_per_bit,
+                                             const vp9_variance_fn_ptr_t *vfp,
+                                             int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
+                                             int iters_per_step,  // values > 1 enable SECOND_LEVEL_CHECKS
+                                             int *cost_list,  // optional five costs, center first; may be NULL
+                                             int *mvjcost, int *mvcost[2],
+                                             int *distortion,
+                                             unsigned int *sse1,
+                                             const uint8_t *second_pred,  // optional second predictor; NULL when unused
+                                             int w, int h) {
+  SETUP_SUBPEL_SEARCH;  // declares br/bc, tr/tc, hstep, besterr, bounds, ... and multiplies *bestmv by 8
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+  if (cost_list &&
+      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX &&
+      is_cost_list_wellbehaved(cost_list)) {  // all five costs valid and the center strictly lowest
+    unsigned int minpt;
+    int ir, ic;
+    get_cost_surf_min(cost_list, &ir, &ic, 1);  // bits == 1: ir/ic come back in half units
+    if (ir != 0 || ic != 0) {
+      CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep);  // hstep is 4 here, i.e. one half unit
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;  // macro defined earlier in the file, outside this view
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration could be 2 ( if diag selected) 1/4 pel
+
+  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+  if (forced_stop != 2) {
+    tr = br;  // restart from the current best
+    tc = bc;
+    hstep >>= 1;  // 4 -> 2
+    FIRST_LEVEL_CHECKS;
+    if (quarteriters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  if (allow_hp && vp10_use_mv_hp(ref_mv) && forced_stop == 0) {
+    tr = br;
+    tc = bc;
+    hstep >>= 1;  // 2 -> 1 (forced_stop == 0 implies the block above ran)
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;  // publish the refined position
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;  // a component ended more than MAX_FULL_PEL_VAL << 3 away from ref_mv
+
+  return besterr;
+}
+
+int vp10_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,  // refines bestmv in place; returns the cost of the result
+                                        MV *bestmv, const MV *ref_mv,  // bestmv: start in; result out, in units 8x finer
+                                        int allow_hp,  // with vp10_use_mv_hp(ref_mv) and forced_stop == 0, enables the last stage
+                                        int error_per_bit,
+                                        const vp9_variance_fn_ptr_t *vfp,
+                                        int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
+                                        int iters_per_step,  // values > 1 enable SECOND_LEVEL_CHECKS
+                                        int *cost_list,  // optional five costs, center first; may be NULL
+                                        int *mvjcost, int *mvcost[2],
+                                        int *distortion,
+                                        unsigned int *sse1,
+                                        const uint8_t *second_pred,  // optional second predictor; NULL when unused
+                                        int w, int h) {
+  SETUP_SUBPEL_SEARCH;  // declares br/bc, tr/tc, hstep, whichdir, besterr, bounds, ... and multiplies *bestmv by 8
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+  if (cost_list &&
+      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX) {  // all five costs valid: probe only the cheaper quadrant
+    unsigned int left, right, up, down, diag;
+    whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +  // bit 0 clear: entry 1 (col - 1) is cheaper than entry 3 (col + 1)
+               (cost_list[2] < cost_list[4] ? 0 : 2);  // bit 1 clear: entry 2 (row + 1) is cheaper than entry 4 (row - 1)
+    switch (whichdir) {
+      case 0:  // left and down
+        CHECK_BETTER(left, tr, tc - hstep);
+        CHECK_BETTER(down, tr + hstep, tc);
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 1:  // right and down
+        CHECK_BETTER(right, tr, tc + hstep);
+        CHECK_BETTER(down, tr + hstep, tc);
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+      case 2:  // left and up
+        CHECK_BETTER(left, tr, tc - hstep);
+        CHECK_BETTER(up, tr - hstep, tc);
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 3:  // right and up
+        CHECK_BETTER(right, tr, tc + hstep);
+        CHECK_BETTER(up, tr - hstep, tc);
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;  // macro defined earlier in the file, outside this view
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  tr = br;  // restart the next stage from the current best
+  tc = bc;
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration could be 2 ( if diag selected) 1/4 pel
+
+  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+  if (forced_stop != 2) {
+    hstep >>= 1;  // 4 -> 2
+    FIRST_LEVEL_CHECKS;
+    if (quarteriters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+
+  if (allow_hp && vp10_use_mv_hp(ref_mv) && forced_stop == 0) {
+    hstep >>= 1;  // 2 -> 1 (forced_stop == 0 implies the block above ran)
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;  // publish the refined position
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;  // a component ended more than MAX_FULL_PEL_VAL << 3 away from ref_mv
+
+  return besterr;
+}
+
+static const MV search_step_table[12] = {  // {row, col} offsets; vp10_find_best_sub_pixel_tree consumes them four per stage
+    // left, right, up, down
+    {0, -4}, {0, 4}, {-4, 0}, {4, 0},  // step 4
+    {0, -2}, {0, 2}, {-2, 0}, {2, 0},  // step 2
+    {0, -1}, {0, 1}, {-1, 0}, {1, 0}  // step 1
+};
+
+int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x,  // refines bestmv in place; returns the cost of the result
+                                 MV *bestmv, const MV *ref_mv,  // bestmv: start in; result out, in units 8x finer
+                                 int allow_hp,  // with vp10_use_mv_hp(ref_mv), permits the third (step 1) stage
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int forced_stop,  // number of stages run is 3 - forced_stop (capped below)
+                                 int iters_per_step,  // values > 1 enable SECOND_LEVEL_CHECKS_BEST
+                                 int *cost_list,  // unused by this variant
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion,  // out: error (without MV cost) of the best position
+                                 unsigned int *sse1,  // out: SSE reported by vfp for the best position
+                                 const uint8_t *second_pred,  // optional second predictor; NULL selects vfp->svf over vfp->svaf
+                                 int w, int h) {
+  const uint8_t *const z = x->plane[0].src.buf;
+  const uint8_t *const src_address = z;
+  const int src_stride = x->plane[0].src.stride;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  int thismse;
+  const int y_stride = xd->plane[0].pre[0].stride;
+  const int offset = bestmv->row * y_stride + bestmv->col;  // computed before bestmv is multiplied by 8 below
+  const uint8_t *const y = xd->plane[0].pre[0].buf;
+
+  int rr = ref_mv->row;
+  int rc = ref_mv->col;
+  int br = bestmv->row * 8;  // best position so far, in 1/8 of bestmv's input unit
+  int bc = bestmv->col * 8;
+  int hstep = 4;  // halved after every stage: 4, 2, 1
+  int iter, round = 3 - forced_stop;
+  const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);  // window: x's MV limits * 8, clipped to ref_mv +/- MV_MAX
+  const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+  const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+  const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+  int tr = br;
+  int tc = bc;
+  const MV *search_step = search_step_table;  // advanced by 4 entries per stage
+  int idx, best_idx = -1;  // best_idx: 0..3 = left/right/up/down, 4 = diagonal, -1 = center kept
+  unsigned int cost_array[5];  // costs of the four axis probes and the diagonal probe
+  int kr, kc;
+
+  if (!(allow_hp && vp10_use_mv_hp(ref_mv)))  // without high precision at most two stages run
+    if (round == 3)
+      round = 2;
+
+  bestmv->row *= 8;
+  bestmv->col *= 8;
+
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+
+  (void) cost_list;  // to silence compiler warning
+
+  for (iter = 0; iter < round; ++iter) {
+    // Check vertical and horizontal sub-pixel positions.
+    for (idx = 0; idx < 4; ++idx) {
+      tr = br + search_step[idx].row;
+      tc = bc + search_step[idx].col;
+      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);  // >> 3 keeps the whole-sample part
+        MV this_mv;
+        this_mv.row = tr;
+        this_mv.col = tc;
+        if (second_pred == NULL)
+          thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),  // sp() is defined outside this view
+                             src_address, src_stride, &sse);
+        else
+          thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                              src_address, src_stride, &sse, second_pred);
+        cost_array[idx] = thismse +
+            mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+        if (cost_array[idx] < besterr) {
+          best_idx = idx;
+          besterr = cost_array[idx];
+          *distortion = thismse;
+          *sse1 = sse;
+        }
+      } else {
+        cost_array[idx] = INT_MAX;  // outside the window: never selected
+      }
+    }
+
+    // Check diagonal sub-pixel position
+    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);  // toward the cheaper of left/right
+    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);  // toward the cheaper of up/down
+
+    tc = bc + kc;
+    tr = br + kr;
+    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+      const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+      MV this_mv = {tr, tc};
+      if (second_pred == NULL)
+        thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+                           src_address, src_stride, &sse);
+      else
+        thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                            src_address, src_stride, &sse, second_pred);
+      cost_array[4] = thismse +
+          mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+      if (cost_array[4] < besterr) {
+        best_idx = 4;
+        besterr = cost_array[4];
+        *distortion = thismse;
+        *sse1 = sse;
+      }
+    } else {
+      cost_array[4] = INT_MAX;  // slot 4 is the diagonal; indexed explicitly rather than via the finished loop counter
+    }
+
+    if (best_idx < 4 && best_idx >= 0) {
+      br += search_step[best_idx].row;
+      bc += search_step[best_idx].col;
+    } else if (best_idx == 4) {
+      br = tr;  // tr/tc still hold the diagonal position
+      bc = tc;
+    }
+
+    if (iters_per_step > 1 && best_idx != -1)  // second level only when the best moved this stage
+      SECOND_LEVEL_CHECKS_BEST;
+
+    tr = br;
+    tc = bc;
+
+    search_step += 4;  // next group of four offsets in search_step_table
+    hstep >>= 1;
+    best_idx = -1;
+  }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration could be 2 ( if diag selected) 1/4 pel
+
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;  // publish the refined position
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;  // a component ended more than MAX_FULL_PEL_VAL << 3 away from ref_mv
+
+  return besterr;
+}
+
+#undef MVC
+#undef PRE
+#undef CHECK_BETTER  // a different, argument-less CHECK_BETTER is defined below for the pattern searches
+
+// Nonzero iff (row, col) stays inside x's MV limits when moved by up to
+// `range` in each direction. Bitwise & is used, so all four tests are evaluated.
+static INLINE int check_bounds(const MACROBLOCK *x, int row, int col,
+                               int range) {
+  const int rows_ok = ((row - range) >= x->mv_row_min) & ((row + range) <= x->mv_row_max);
+  return rows_ok & ((col - range) >= x->mv_col_min) & ((col + range) <= x->mv_col_max);
+}
+
+static INLINE int is_mv_in(const MACROBLOCK *x, const MV *mv) {  // nonzero iff mv is within x's inclusive MV limits
+  return !(mv->col < x->mv_col_min || mv->col > x->mv_col_max ||
+           mv->row < x->mv_row_min || mv->row > x->mv_row_max);
+}
+
+#define CHECK_BETTER /* records candidate i if it beats bestsad; uses the caller's thissad, bestsad, best_site, i, this_mv, fcenter_mv, x, use_mvcost, sad_per_bit */ \
+  {\
+    if (thissad < bestsad) { /* cheap reject before the MV cost is added */ \
+      if (use_mvcost) \
+        thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);\
+      if (thissad < bestsad) { /* still better once the MV cost is included */ \
+        bestsad = thissad;\
+        best_site = i;\
+      }\
+    }\
+  }
+
+#define MAX_PATTERN_SCALES         11  // size of the per-scale num_candidates/candidates tables
+#define MAX_PATTERN_CANDIDATES      8  // max number of candidates per scale
+#define PATTERN_CANDIDATES_REF      3  // number of refinement candidates
+
+// Fills cost_list[0..4] with costs around the integer-pel position best_mv:
+// index 0 is best_mv itself, indices 1..4 are the neighbors at {row, col}
+// deltas {0, -1}, {1, 0}, {0, 1} and {-1, 0}.
+// Every cost is fn_ptr->vf() of the source block against the prediction at
+// that position plus a motion-vector cost measured from ref_mv >> 3. The
+// center uses mvsad_err_cost() with sadpb; the neighbors use mv_err_cost()
+// with x->nmvjointcost, x->mvcost and x->errorperbit. A neighbor outside x's
+// MV limits is not evaluated and gets INT_MAX.
+// Nothing is returned; the results are delivered through cost_list, which
+// must have room for five ints.
+static INLINE void calc_int_cost_list(const MACROBLOCK *x,
+                                      const MV *ref_mv,
+                                      int sadpb,
+                                      const vp9_variance_fn_ptr_t *fn_ptr,
+                                      const MV *best_mv,
+                                      int *cost_list) {
+  static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const pre = &x->e_mbd.plane[0].pre[0];
+  const MV fcenter_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+  const int br = best_mv->row;
+  const int bc = best_mv->col;
+  MV center;
+  unsigned int sse;
+  int all_in_range;
+  int n;
+
+  center.row = br;
+  center.col = bc;
+  cost_list[0] = fn_ptr->vf(src->buf, src->stride,
+                            get_buf_from_mv(pre, &center),
+                            pre->stride, &sse) +
+      mvsad_err_cost(x, &center, &fcenter_mv, sadpb);
+
+  // If the whole one-away neighborhood is in range, the per-neighbor test
+  // below is skipped.
+  all_in_range = check_bounds(x, br, bc, 1);
+  for (n = 0; n < 4; ++n) {
+    const MV nb = {br + neighbors[n].row, bc + neighbors[n].col};
+    if (all_in_range || is_mv_in(x, &nb)) {
+      cost_list[n + 1] = fn_ptr->vf(src->buf, src->stride,
+                                    get_buf_from_mv(pre, &nb),
+                                    pre->stride, &sse) +
+          mv_err_cost(&nb, &fcenter_mv, x->nmvjointcost, x->mvcost,
+                      x->errorperbit);
+    } else {
+      cost_list[n + 1] = INT_MAX;
+    }
+  }
+}
+
+// Generic pattern search over multiple scales: scale s offers num_candidates[s]
+// offsets taken from candidates[s]. Starts at ref_mv (clamped in place to x's
+// MV limits), writes the best position to *best_mv and returns its cost: SAD
+// plus mvsad_err_cost relative to center_mv >> 3 (for candidates the MV cost
+// is added only when use_mvcost is set). A non-NULL cost_list is filled too.
+static int vp10_pattern_search(const MACROBLOCK *x,
+                              MV *ref_mv,  // start position; modified by clamp_mv below
+                              int search_param,  // index into search_param_to_steps: selects the largest scale
+                              int sad_per_bit,
+                              int do_init_search,  // nonzero: first probe every scale up to the limit to pick the starting one
+                              int *cost_list,  // optional output of five costs; may be NULL
+                              const vp9_variance_fn_ptr_t *vfp,
+                              int use_mvcost,  // read by CHECK_BETTER
+                              const MV *center_mv,
+                              MV *best_mv,  // out: best position found
+                              const int num_candidates[MAX_PATTERN_SCALES],
+                              const MV candidates[MAX_PATTERN_SCALES]
+                                                 [MAX_PATTERN_CANDIDATES]) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+    10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+  };
+  int i, s, t;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  int br, bc;  // best row/col so far
+  int bestsad = INT_MAX;
+  int thissad;
+  int k = -1;  // index of the last winning candidate within its scale
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  int best_init_s = search_param_to_steps[search_param];
+  // adjust ref_mv to make sure it is within MV range
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  br = ref_mv->row;
+  bc = ref_mv->col;
+
+  // Work out the start point for the search
+  bestsad = vfp->sdf(what->buf, what->stride,
+                     get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+
+  // Search all possible scales upto the search param around the center point
+  // pick the scale of the point that is best as the starting scale of
+  // further steps around it.
+  if (do_init_search) {
+    s = best_init_s;
+    best_init_s = -1;  // stays -1 if no candidate at any scale beats the center
+    for (t = 0; t <= s; ++t) {
+      int best_site = -1;
+      if (check_bounds(x, br, bc, 1 << t)) {  // within 1 << t of the center everything is in range: no per-candidate test
+        for (i = 0; i < num_candidates[t]; i++) {
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride);
+          CHECK_BETTER
+        }
+      } else {
+        for (i = 0; i < num_candidates[t]; i++) {
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
+          if (!is_mv_in(x, &this_mv))
+            continue;
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride);
+          CHECK_BETTER
+        }
+      }
+      if (best_site == -1) {
+        continue;
+      } else {
+        best_init_s = t;  // bestsad carries over, so a later scale wins only by being strictly better
+        k = best_site;
+      }
+    }
+    if (best_init_s != -1) {
+      br += candidates[best_init_s][k].row;
+      bc += candidates[best_init_s][k].col;
+    }
+  }
+
+  // If the center point is still the best, just skip this and move to
+  // the refinement step.
+  if (best_init_s != -1) {
+    int best_site = -1;
+    s = best_init_s;
+
+    do {  // one pass per scale, from best_init_s down to 0
+      // No need to search all 6 points the 1st time if initial search was used
+      if (!do_init_search || s != best_init_s) {
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site == -1) {
+          continue;  // in a do/while this jumps to the `s--` test, i.e. on to the next smaller scale
+        } else {
+          br += candidates[s][best_site].row;
+          bc += candidates[s][best_site].col;
+          k = best_site;
+        }
+      }
+
+      do {  // keep stepping at this scale while one of the three probes improves
+        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+        best_site = -1;
+        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;  // candidate before k, wrapping around
+        next_chkpts_indices[1] = k;
+        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;  // candidate after k, wrapping around
+
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site != -1) {
+          k = next_chkpts_indices[best_site];  // best_site indexes the three probes; map it back to a candidate index
+          br += candidates[s][k].row;
+          bc += candidates[s][k].col;
+        }
+      } while (best_site != -1);
+    } while (s--);
+  }
+
+  // Returns the one-away integer pel sad values around the best as follows:
+  // cost_list[0]: cost at the best integer pel
+  // cost_list[1]: cost at delta {0, -1} (left)   from the best integer pel
+  // cost_list[2]: cost at delta { 1, 0} (bottom) from the best integer pel
+  // cost_list[3]: cost at delta { 0, 1} (right)  from the best integer pel
+  // cost_list[4]: cost at delta {-1, 0} (top)    from the best integer pel
+  if (cost_list) {
+    const MV best_mv = { br, bc };  // local that shadows the best_mv parameter inside this block only
+    calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list);  // NOTE(review): fcenter_mv is already >> 3 and calc_int_cost_list shifts its ref_mv by 3 again - confirm intended
+  }
+  best_mv->row = br;  // here best_mv is the output parameter again
+  best_mv->col = bc;
+  return bestsad;
+}
+
+// A specialized function where the smallest scale search candidates
+// are 4 1-away neighbors, and cost_list is non-null
+// TODO(debargha): Merge this function with the one above. Also remove
+// use_mvcost option since it is always 1, to save unnecessary branches.
+static int vp10_pattern_search_sad(const MACROBLOCK *x,
+                                  MV *ref_mv,
+                                  int search_param,
+                                  int sad_per_bit,
+                                  int do_init_search,
+                                  int *cost_list,
+                                  const vp9_variance_fn_ptr_t *vfp,
+                                  int use_mvcost,
+                                  const MV *center_mv,
+                                  MV *best_mv,
+                                  const int num_candidates[MAX_PATTERN_SCALES],
+                                  const MV candidates[MAX_PATTERN_SCALES]
+                                                     [MAX_PATTERN_CANDIDATES]) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+    10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+  };
+  int i, s, t;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  int br, bc;
+  int bestsad = INT_MAX;
+  int thissad;
+  int k = -1;
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  int best_init_s = search_param_to_steps[search_param];
+  // adjust ref_mv to make sure it is within MV range
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  br = ref_mv->row;
+  bc = ref_mv->col;
+  if (cost_list != NULL) {
+    cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
+        INT_MAX;
+  }
+
+  // Work out the start point for the search
+  bestsad = vfp->sdf(what->buf, what->stride,
+                     get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+
+  // Search all possible scales upto the search param around the center point
+  // pick the scale of the point that is best as the starting scale of
+  // further steps around it.
+  if (do_init_search) {
+    s = best_init_s;
+    best_init_s = -1;
+    for (t = 0; t <= s; ++t) {
+      int best_site = -1;
+      if (check_bounds(x, br, bc, 1 << t)) {
+        for (i = 0; i < num_candidates[t]; i++) {
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride);
+          CHECK_BETTER
+        }
+      } else {
+        for (i = 0; i < num_candidates[t]; i++) {
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
+          if (!is_mv_in(x, &this_mv))
+            continue;
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride);
+          CHECK_BETTER
+        }
+      }
+      if (best_site == -1) {
+        continue;
+      } else {
+        best_init_s = t;
+        k = best_site;
+      }
+    }
+    if (best_init_s != -1) {
+      br += candidates[best_init_s][k].row;
+      bc += candidates[best_init_s][k].col;
+    }
+  }
+
+  // If the center point is still the best, just skip this and move to
+  // the refinement step.
+  if (best_init_s != -1) {
+    int do_sad = (num_candidates[0] == 4 && cost_list != NULL);
+    int best_site = -1;
+    s = best_init_s;
+
+    for (; s >= do_sad; s--) {
+      if (!do_init_search || s != best_init_s) {
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site == -1) {
+          continue;
+        } else {
+          br += candidates[s][best_site].row;
+          bc += candidates[s][best_site].col;
+          k = best_site;
+        }
+      }
+
+      do {
+        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+        best_site = -1;
+        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+        next_chkpts_indices[1] = k;
+        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site != -1) {
+          k = next_chkpts_indices[best_site];
+          br += candidates[s][k].row;
+          bc += candidates[s][k].col;
+        }
+      } while (best_site != -1);
+    }
+
+    // Note: If we enter the if below, then cost_list must be non-NULL.
+    if (s == 0) {
+      cost_list[0] = bestsad;
+      if (!do_init_search || s != best_init_s) {
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            cost_list[i + 1] =
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            cost_list[i + 1] =
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site != -1) {
+          br += candidates[s][best_site].row;
+          bc += candidates[s][best_site].col;
+          k = best_site;
+        }
+      }
+      while (best_site != -1) {
+        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+        best_site = -1;
+        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+        next_chkpts_indices[1] = k;
+        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+        cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+        cost_list[((k + 2) % 4) + 1] = cost_list[0];
+        cost_list[0] = bestsad;
+
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            cost_list[next_chkpts_indices[i] + 1] =
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            if (!is_mv_in(x, &this_mv)) {
+              cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
+              continue;
+            }
+            cost_list[next_chkpts_indices[i] + 1] =
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site != -1) {
+          k = next_chkpts_indices[best_site];
+          br += candidates[s][k].row;
+          bc += candidates[s][k].col;
+        }
+      }
+    }
+  }
+
+  // Returns the one-away integer pel sad values around the best as follows:
+  // cost_list[0]: sad at the best integer pel
+  // cost_list[1]: sad at delta {0, -1} (left)   from the best integer pel
+  // cost_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel
+  // cost_list[3]: sad at delta { 0, 1} (right)  from the best integer pel
+  // cost_list[4]: sad at delta {-1, 0} (top)    from the best integer pel
+  if (cost_list) {
+    static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
+    if (cost_list[0] == INT_MAX) {
+      cost_list[0] = bestsad;
+      if (check_bounds(x, br, bc, 1)) {
+        for (i = 0; i < 4; i++) {
+          const MV this_mv = { br + neighbors[i].row,
+                               bc + neighbors[i].col };
+          cost_list[i + 1] = vfp->sdf(what->buf, what->stride,
+                                     get_buf_from_mv(in_what, &this_mv),
+                                     in_what->stride);
+        }
+      } else {
+        for (i = 0; i < 4; i++) {
+          const MV this_mv = {br + neighbors[i].row,
+            bc + neighbors[i].col};
+          if (!is_mv_in(x, &this_mv))
+            cost_list[i + 1] = INT_MAX;
+          else
+            cost_list[i + 1] = vfp->sdf(what->buf, what->stride,
+                                       get_buf_from_mv(in_what, &this_mv),
+                                       in_what->stride);
+        }
+      }
+    } else {
+      if (use_mvcost) {
+        for (i = 0; i < 4; i++) {
+          const MV this_mv = {br + neighbors[i].row,
+            bc + neighbors[i].col};
+          if (cost_list[i + 1] != INT_MAX) {
+            cost_list[i + 1] +=
+                mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+          }
+        }
+      }
+    }
+  }
+  best_mv->row = br;
+  best_mv->col = bc;
+  return bestsad;
+}
+
+int vp10_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
+                        const MV *center_mv, const vp9_variance_fn_ptr_t *vfp,
+                        int use_mvcost) {
+  // Scores best_mv: vfp->vf() of the source block against the reference block
+  // it addresses, plus mv_err_cost() relative to center_mv if use_mvcost != 0.
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV scaled_mv = {best_mv->row * 8, best_mv->col * 8};
+  unsigned int unused, total;  // 'unused' only receives vf()'s out-param
+  total = vfp->vf(src->buf, src->stride, get_buf_from_mv(ref, best_mv),
+                  ref->stride, &unused);
+  if (use_mvcost) total += mv_err_cost(&scaled_mv, center_mv, x->nmvjointcost,
+                                       x->mvcost, x->errorperbit);
+  return total;
+}
+
+int vp10_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
+                           const MV *center_mv, const uint8_t *second_pred,
+                           const vp9_variance_fn_ptr_t *vfp, int use_mvcost) {
+  // Scores best_mv through vfp->svaf(), which also takes second_pred; the two
+  // literal 0 arguments are presumably zero sub-pel offsets (TODO confirm).
+  // mv_err_cost() relative to center_mv is added when use_mvcost != 0.
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV scaled_mv = {best_mv->row * 8, best_mv->col * 8};
+  unsigned int unused, total;  // 'unused' only receives svaf()'s out-param
+  total = vfp->svaf(get_buf_from_mv(ref, best_mv), ref->stride, 0, 0,
+                    src->buf, src->stride, &unused, second_pred);
+  if (use_mvcost) total += mv_err_cost(&scaled_mv, center_mv, x->nmvjointcost,
+                                       x->mvcost, x->errorperbit);
+  return total;
+}
+
+int vp10_hex_search(const MACROBLOCK *x,
+                   MV *ref_mv,
+                   int search_param,
+                   int sad_per_bit,
+                   int do_init_search,
+                   int *cost_list,
+                   const vp9_variance_fn_ptr_t *vfp,
+                   int use_mvcost,
+                   const MV *center_mv, MV *best_mv) {
+  // Hex search: runs vp10_pattern_search() with the tables below. Scale 0 has
+  // the 8 closest points; larger scales have 6 points in a hexagon shape.
+  static const int hex_num_candidates[MAX_PATTERN_SCALES] = {
+    8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6  // one count per scale
+  };
+  // {row, col} offsets per scale; the largest step at each scale is 2^scale
+  static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+    {{-1, -1}, {0, -1}, {1, -1}, {1, 0}, {1, 1}, { 0, 1}, { -1, 1}, {-1, 0}},
+    {{-1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0}},
+    {{-2, -4}, {2, -4}, {4, 0}, {2, 4}, { -2, 4}, { -4, 0}},
+    {{-4, -8}, {4, -8}, {8, 0}, {4, 8}, { -4, 8}, { -8, 0}},
+    {{-8, -16}, {8, -16}, {16, 0}, {8, 16}, { -8, 16}, { -16, 0}},
+    {{-16, -32}, {16, -32}, {32, 0}, {16, 32}, { -16, 32}, { -32, 0}},
+    {{-32, -64}, {32, -64}, {64, 0}, {32, 64}, { -32, 64}, { -64, 0}},
+    {{-64, -128}, {64, -128}, {128, 0}, {64, 128}, { -64, 128}, { -128, 0}},
+    {{-128, -256}, {128, -256}, {256, 0}, {128, 256}, { -128, 256}, { -256, 0}},
+    {{-256, -512}, {256, -512}, {512, 0}, {256, 512}, { -256, 512}, { -512, 0}},
+    {{-512, -1024}, {512, -1024}, {1024, 0}, {512, 1024}, { -512, 1024},
+      { -1024, 0}},
+  };
+  return vp10_pattern_search(x, ref_mv, search_param, sad_per_bit,
+                            do_init_search, cost_list, vfp, use_mvcost,
+                            center_mv, best_mv,
+                            hex_num_candidates, hex_candidates);
+}
+
+int vp10_bigdia_search(const MACROBLOCK *x,
+                      MV *ref_mv,
+                      int search_param,
+                      int sad_per_bit,
+                      int do_init_search,
+                      int *cost_list,
+                      const vp9_variance_fn_ptr_t *vfp,
+                      int use_mvcost,
+                      const MV *center_mv,
+                      MV *best_mv) {
+  // Big-diamond search: runs vp10_pattern_search_sad() with the tables below.
+  // Scale 0 has the 4 closest points; larger scales have 8 in a diamond shape.
+  static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
+    4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,  // one count per scale
+  };
+  // {row, col} offsets per scale; the largest step at each scale is 2^scale
+  static const MV bigdia_candidates[MAX_PATTERN_SCALES]
+                                   [MAX_PATTERN_CANDIDATES] = {
+    {{0, -1}, {1, 0}, { 0, 1}, {-1, 0}},
+    {{-1, -1}, {0, -2}, {1, -1}, {2, 0}, {1, 1}, {0, 2}, {-1, 1}, {-2, 0}},
+    {{-2, -2}, {0, -4}, {2, -2}, {4, 0}, {2, 2}, {0, 4}, {-2, 2}, {-4, 0}},
+    {{-4, -4}, {0, -8}, {4, -4}, {8, 0}, {4, 4}, {0, 8}, {-4, 4}, {-8, 0}},
+    {{-8, -8}, {0, -16}, {8, -8}, {16, 0}, {8, 8}, {0, 16}, {-8, 8}, {-16, 0}},
+    {{-16, -16}, {0, -32}, {16, -16}, {32, 0}, {16, 16}, {0, 32},
+      {-16, 16}, {-32, 0}},
+    {{-32, -32}, {0, -64}, {32, -32}, {64, 0}, {32, 32}, {0, 64},
+      {-32, 32}, {-64, 0}},
+    {{-64, -64}, {0, -128}, {64, -64}, {128, 0}, {64, 64}, {0, 128},
+      {-64, 64}, {-128, 0}},
+    {{-128, -128}, {0, -256}, {128, -128}, {256, 0}, {128, 128}, {0, 256},
+      {-128, 128}, {-256, 0}},
+    {{-256, -256}, {0, -512}, {256, -256}, {512, 0}, {256, 256}, {0, 512},
+      {-256, 256}, {-512, 0}},
+    {{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024},
+      {-512, 512}, {-1024, 0}},
+  };
+  return vp10_pattern_search_sad(x, ref_mv, search_param, sad_per_bit,
+                                do_init_search, cost_list, vfp, use_mvcost,
+                                center_mv, best_mv,
+                                bigdia_num_candidates, bigdia_candidates);
+}
+
+int vp10_square_search(const MACROBLOCK *x,
+                      MV *ref_mv,
+                      int search_param,
+                      int sad_per_bit,
+                      int do_init_search,
+                      int *cost_list,
+                      const vp9_variance_fn_ptr_t *vfp,
+                      int use_mvcost,
+                      const MV *center_mv,
+                      MV *best_mv) {
+  // Square search via vp10_pattern_search(): 8 points in a square per scale
+  static const int square_num_candidates[MAX_PATTERN_SCALES] = {
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,  // one count per scale
+  };
+  // {row, col} offsets per scale; the largest step at each scale is 2^scale
+  static const MV square_candidates[MAX_PATTERN_SCALES]
+                                   [MAX_PATTERN_CANDIDATES] = {
+    {{-1, -1}, {0, -1}, {1, -1}, {1, 0}, {1, 1}, {0, 1}, {-1, 1}, {-1, 0}},
+    {{-2, -2}, {0, -2}, {2, -2}, {2, 0}, {2, 2}, {0, 2}, {-2, 2}, {-2, 0}},
+    {{-4, -4}, {0, -4}, {4, -4}, {4, 0}, {4, 4}, {0, 4}, {-4, 4}, {-4, 0}},
+    {{-8, -8}, {0, -8}, {8, -8}, {8, 0}, {8, 8}, {0, 8}, {-8, 8}, {-8, 0}},
+    {{-16, -16}, {0, -16}, {16, -16}, {16, 0}, {16, 16}, {0, 16},
+      {-16, 16}, {-16, 0}},
+    {{-32, -32}, {0, -32}, {32, -32}, {32, 0}, {32, 32}, {0, 32},
+      {-32, 32}, {-32, 0}},
+    {{-64, -64}, {0, -64}, {64, -64}, {64, 0}, {64, 64}, {0, 64},
+      {-64, 64}, {-64, 0}},
+    {{-128, -128}, {0, -128}, {128, -128}, {128, 0}, {128, 128}, {0, 128},
+      {-128, 128}, {-128, 0}},
+    {{-256, -256}, {0, -256}, {256, -256}, {256, 0}, {256, 256}, {0, 256},
+      {-256, 256}, {-256, 0}},
+    {{-512, -512}, {0, -512}, {512, -512}, {512, 0}, {512, 512}, {0, 512},
+      {-512, 512}, {-512, 0}},
+    {{-1024, -1024}, {0, -1024}, {1024, -1024}, {1024, 0}, {1024, 1024},
+      {0, 1024}, {-1024, 1024}, {-1024, 0}},
+  };
+  return vp10_pattern_search(x, ref_mv, search_param, sad_per_bit,
+                            do_init_search, cost_list, vfp, use_mvcost,
+                            center_mv, best_mv,
+                            square_num_candidates, square_candidates);
+}
+
+// Hex search with a floor on search_param: the larger of search_param and
+// MAX_MVSEARCH_STEPS - 2 is what vp10_hex_search() receives. All other
+// arguments are forwarded unchanged.
+int vp10_fast_hex_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
+                         int sad_per_bit,
+                         int do_init_search,  // must be zero for fast_hex
+                         int *cost_list, const vp9_variance_fn_ptr_t *vfp,
+                         int use_mvcost, const MV *center_mv, MV *best_mv) {
+  const int param_floor = MAX_MVSEARCH_STEPS - 2;
+  const int effective_param = VPXMAX(param_floor, search_param);
+  return vp10_hex_search(x, ref_mv, effective_param, sad_per_bit,
+                         do_init_search, cost_list, vfp, use_mvcost, center_mv,
+                         best_mv);
+}
+
+// Big-diamond search with a floor on search_param: the larger of search_param
+// and MAX_MVSEARCH_STEPS - 2 is what vp10_bigdia_search() receives. All other
+// arguments are forwarded unchanged.
+int vp10_fast_dia_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
+                         int sad_per_bit, int do_init_search, int *cost_list,
+                         const vp9_variance_fn_ptr_t *vfp, int use_mvcost,
+                         const MV *center_mv, MV *best_mv) {
+  const int param_floor = MAX_MVSEARCH_STEPS - 2;
+  const int effective_param = VPXMAX(param_floor, search_param);
+
+  return vp10_bigdia_search(x, ref_mv, effective_param, sad_per_bit,
+                            do_init_search, cost_list, vfp, use_mvcost,
+                            center_mv, best_mv);
+}
+
+#undef CHECK_BETTER
+
+// Exhaustive motion search around a given centre position with a given
+// step size. center_mv, clamped to the x->mv_* limits, is the centre; offsets
+// up to +/-range from it are visited, clipped to those limits. ref_mv is the
+// reference passed to mvsad_err_cost(). *best_mv receives the best position;
+// the return value is that position's SAD plus MV cost.
+static int exhuastive_mesh_search(const MACROBLOCK *x,
+                                  MV *ref_mv, MV *best_mv,
+                                  int range, int step, int sad_per_bit,
+                                  const vp9_variance_fn_ptr_t *fn_ptr,
+                                  const MV *center_mv) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  MV fcenter_mv = {center_mv->row, center_mv->col};
+  unsigned int best_sad = INT_MAX;
+  int r, c, i;
+  int start_col, end_col, start_row, end_row;
+  int col_step = (step > 1) ? step : 4;  // step == 1 scans 4 columns per pass
+
+  assert(step >= 1);
+  clamp_mv(&fcenter_mv, x->mv_col_min, x->mv_col_max,
+           x->mv_row_min, x->mv_row_max);
+  *best_mv = fcenter_mv;
+  best_sad = fn_ptr->sdf(what->buf, what->stride,
+             get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
+             mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit);
+  start_row = VPXMAX(-range, x->mv_row_min - fcenter_mv.row);
+  start_col = VPXMAX(-range, x->mv_col_min - fcenter_mv.col);
+  end_row = VPXMIN(range, x->mv_row_max - fcenter_mv.row);
+  end_col = VPXMIN(range, x->mv_col_max - fcenter_mv.col);
+
+  for (r = start_row; r <= end_row; r += step) {
+    for (c = start_col; c <= end_col; c += col_step) {
+      // Step > 1 means we are not checking every location in this pass.
+      if (step > 1) {
+        const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c};
+        unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                           get_buf_from_mv(in_what, &mv), in_what->stride);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
+            *best_mv = mv;
+          }
+        }
+      } else {
+        // 4 sads in a single call if we are checking every location
+        if (c + 3 <= end_col) {
+          unsigned int sads[4];
+          const uint8_t *addrs[4];
+          for (i = 0; i < 4; ++i) {
+            const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+            addrs[i] = get_buf_from_mv(in_what, &mv);
+          }
+          fn_ptr->sdx4df(what->buf, what->stride, addrs,
+                         in_what->stride, sads);
+          for (i = 0; i < 4; ++i) {
+            if (sads[i] < best_sad) {
+              const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+              const unsigned int sad = sads[i] +
+                  mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
+            }
+          }
+        } else {  // fewer than 4 columns remain before end_col
+          for (i = 0; i <= end_col - c; ++i) {  // end_col itself is included
+            const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+            unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                get_buf_from_mv(in_what, &mv), in_what->stride);
+            if (sad < best_sad) {
+              sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return best_sad;
+}
+
+int vp10_diamond_search_sad_c(const MACROBLOCK *x,
+                             const search_site_config *cfg,
+                             MV *ref_mv, MV *best_mv, int search_param,
+                             int sad_per_bit, int *num00,
+                             const vp9_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv) {
+  int i, j, step;
+  // Step search: every step tests cfg->searches_per_step sites around best_mv.
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const uint8_t *in_what;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *best_address;
+  // Returns the best SAD + MV cost; *best_mv ends at the matching position.
+  unsigned int bestsad = INT_MAX;
+  int best_site = 0;  // index into ss[] of the most recent improvement
+  int last_site = 0;  // best_site as of the last move of best_mv
+
+  int ref_row;
+  int ref_col;
+
+  // search_param determines the length of the initial step and hence the number
+  // of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel...
+  const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
+  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  ref_row = ref_mv->row;
+  ref_col = ref_mv->col;
+  *num00 = 0;  // counts steps after which the start point is still the best
+  best_mv->row = ref_row;
+  best_mv->col = ref_col;
+
+  // Work out the start point for the search
+  in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+  best_address = in_what;
+
+  // Check the starting position
+  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+                + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+
+  i = 1;  // ss[0] is never tested here (presumably the centre site -- confirm)
+
+  for (step = 0; step < tot_steps; step++) {
+    int all_in = 1, t;
+
+    // all_in is true if the first four sites of this step (presumably its
+    // extreme up/down/left/right points -- confirm) lie strictly in bounds.
+    all_in &= ((best_mv->row + ss[i].mv.row) > x->mv_row_min);
+    all_in &= ((best_mv->row + ss[i + 1].mv.row) < x->mv_row_max);
+    all_in &= ((best_mv->col + ss[i + 2].mv.col) > x->mv_col_min);
+    all_in &= ((best_mv->col + ss[i + 3].mv.col) < x->mv_col_max);
+
+    // If all the pixels are within the bounds we don't check whether the
+    // search point is valid in this loop,  otherwise we check each point
+    // for validity..
+    if (all_in) {
+      unsigned int sad_array[4];
+
+      for (j = 0; j < cfg->searches_per_step; j += 4) {
+        unsigned char const *block_offset[4];
+
+        for (t = 0; t < 4; t++)
+          block_offset[t] = ss[i + t].offset + best_address;
+
+        fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+                       sad_array);
+
+        for (t = 0; t < 4; t++, i++) {
+          if (sad_array[t] < bestsad) {
+            const MV this_mv = {best_mv->row + ss[i].mv.row,
+                                best_mv->col + ss[i].mv.col};
+            sad_array[t] += mvsad_err_cost(x, &this_mv, &fcenter_mv,
+                                           sad_per_bit);
+            if (sad_array[t] < bestsad) {
+              bestsad = sad_array[t];
+              best_site = i;
+            }
+          }
+        }
+      }
+    } else {
+      for (j = 0; j < cfg->searches_per_step; j++) {
+        // Trap illegal vectors
+        const MV this_mv = {best_mv->row + ss[i].mv.row,
+                            best_mv->col + ss[i].mv.col};
+
+        if (is_mv_in(x, &this_mv)) {
+          const uint8_t *const check_here = ss[i].offset + best_address;
+          unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
+                                             in_what_stride);
+
+          if (thissad < bestsad) {
+            thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_site = i;
+            }
+          }
+        }
+        i++;
+      }
+    }
+    if (best_site != last_site) {  // this step found a better site: move there
+      best_mv->row += ss[best_site].mv.row;
+      best_mv->col += ss[best_site].mv.col;
+      best_address += ss[best_site].offset;
+      last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)  // keep stepping the same way while it improves
+      while (1) {
+        const MV this_mv = {best_mv->row + ss[best_site].mv.row,
+                            best_mv->col + ss[best_site].mv.col};
+        if (is_mv_in(x, &this_mv)) {
+          const uint8_t *const check_here = ss[best_site].offset + best_address;
+          unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
+                                             in_what_stride);
+          if (thissad < bestsad) {
+            thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_mv->row += ss[best_site].mv.row;
+              best_mv->col += ss[best_site].mv.col;
+              best_address += ss[best_site].offset;
+              continue;
+            }
+          }
+        }
+        break;
+      }
+#endif
+    } else if (best_address == in_what) {  // no move yet: still at the start
+      (*num00)++;
+    }
+  }
+  return bestsad;
+}
+
+// Coarse-to-fine 1-D match of src against windows of ref, scored with
+// vpx_vector_var(); a lower score is treated as a better match.
+//
+//   ref  candidate data; windows start anywhere from ref[0] to ref[bw]
+//   src  data to match, passed unchanged to every vpx_vector_var() call
+//   bwl  size selector: bw = 4 << bwl, and bwl itself is also forwarded to
+//        vpx_vector_var(). The caller in this file passes entries of
+//        b_width_log2_lookup / b_height_log2_lookup.
+//
+// Search schedule:
+//   1. Coarse pass: every 16th window start in [0, bw].
+//   2. Refinement: for radius 8, 4, 2, 1 probe (anchor - radius) and then
+//      (anchor + radius), where anchor is the best start found before that
+//      radius began. Probes outside [0, bw] are skipped.
+// Only a strictly lower score replaces the current best, so the earliest
+// candidate wins ties.
+//
+// NOTE(review): ref has to stay readable for a window starting at ref[bw];
+// how long a window vpx_vector_var() reads is not visible here -- confirm
+// against the buffer sizes used by the caller.
+//
+// Returns the best window start minus bw / 2, i.e. the displacement of the
+// match relative to the middle of the [0, bw] range.
+// vp10_int_pro_motion_estimation() stores this value directly as the column
+// or row of its motion vector estimate.
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+  const int bw = 4 << bwl;
+  int lowest_score = INT_MAX;
+  int best_pos = 0;
+  int pos, radius, side;
+
+  // Coarse pass over the whole range in steps of 16. best_pos stays 0 if no
+  // candidate scores below INT_MAX.
+  for (pos = 0; pos <= bw; pos += 16) {
+    const int score = vpx_vector_var(&ref[pos], src, bwl);
+
+    if (score < lowest_score) {
+      lowest_score = score;
+      best_pos = pos;
+    }
+  }
+
+  // Halve the probe distance on every level: 8, 4, 2, 1.
+  for (radius = 8; radius >= 1; radius >>= 1) {
+    // Both probes of a level are measured from the same anchor, even when
+    // the first probe has already improved on it.
+    const int anchor = best_pos;
+
+    for (side = -1; side <= 1; side += 2) {
+      const int probe = anchor + side * radius;
+      int score;
+
+      // check limit
+      if (probe < 0 || probe > bw)
+        continue;
+
+      score = vpx_vector_var(&ref[probe], src, bwl);
+      if (score < lowest_score) {
+        lowest_score = score;
+        best_pos = probe;
+      }
+    }
+  }
+
+  // Re-centre: window start bw / 2 corresponds to a displacement of zero.
+  return best_pos - (bw >> 1);
+}
+
+static const MV search_pos[4] = {  // {row, col}: row-1, col-1, col+1, row+1
+    {-1, 0}, {0, -1}, {0, 1}, {1, 0},
+};
+
+unsigned int vp10_int_pro_motion_estimation(const VP10_COMP *cpi, MACROBLOCK *x,
+                                           BLOCK_SIZE bsize,
+                                           int mi_row, int mi_col) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};  // saved pre[0] planes
+  DECLARE_ALIGNED(16, int16_t, hbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, vbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
+  DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
+  int idx;
+  const int bw = 4 << b_width_log2_lookup[bsize];
+  const int bh = 4 << b_height_log2_lookup[bsize];
+  const int search_width = bw << 1;
+  const int search_height = bh << 1;
+  const int src_stride = x->plane[0].src.stride;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+  uint8_t const *ref_buf, *src_buf;
+  MV *tmp_mv = &xd->mi[0]->mbmi.mv[0].as_mv;  // the estimate is written here
+  unsigned int best_sad, tmp_sad, this_sad[4];
+  MV this_mv;
+  const int norm_factor = 3 + (bw >> 5);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      vp10_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
+  // Estimates an MV from 1-D reference sets; returns the SAD at that MV.
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+    vp10_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH  // always returns; the code below is then dead
+  {
+    unsigned int this_sad;  // shadows the this_sad[4] array declared above
+    tmp_mv->row = 0;
+    tmp_mv->col = 0;
+    this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+                                      xd->plane[0].pre[0].buf, ref_stride);
+
+    if (scaled_ref_frame) {
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        xd->plane[i].pre[0] = backup_yv12[i];
+    }
+    return this_sad;
+  }
+#endif
+
+  // Set up prediction 1-D reference set
+  ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);  // start bw/2 columns left
+  for (idx = 0; idx < search_width; idx += 16) {
+    vpx_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+    ref_buf += 16;
+  }
+
+  ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;  // bh/2 rows up
+  for (idx = 0; idx < search_height; ++idx) {
+    vbuf[idx] = vpx_int_pro_col(ref_buf, bw) >> norm_factor;
+    ref_buf += ref_stride;
+  }
+
+  // Set up src 1-D reference set
+  for (idx = 0; idx < bw; idx += 16) {
+    src_buf = x->plane[0].src.buf + idx;
+    vpx_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+  }
+
+  src_buf = x->plane[0].src.buf;
+  for (idx = 0; idx < bh; ++idx) {
+    src_vbuf[idx] = vpx_int_pro_col(src_buf, bw) >> norm_factor;
+    src_buf += src_stride;
+  }
+
+  // Find the best match per 1-D search
+  tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
+  tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);
+
+  this_mv = *tmp_mv;
+  src_buf = x->plane[0].src.buf;
+  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+  best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+  // SADs of the four 1-away neighbours, in the same order as search_pos[].
+  {
+    const uint8_t * const pos[4] = {
+        ref_buf - ref_stride,
+        ref_buf - 1,
+        ref_buf + 1,
+        ref_buf + ref_stride,
+    };
+
+    cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
+  }
+
+  for (idx = 0; idx < 4; ++idx) {
+    if (this_sad[idx] < best_sad) {
+      best_sad = this_sad[idx];
+      tmp_mv->row = search_pos[idx].row + this_mv.row;
+      tmp_mv->col = search_pos[idx].col + this_mv.col;
+    }
+  }
+  // Also try one diagonal: step toward the lower-SAD neighbour on each axis.
+  if (this_sad[0] < this_sad[3])
+    this_mv.row -= 1;
+  else
+    this_mv.row += 1;
+
+  if (this_sad[1] < this_sad[2])
+    this_mv.col -= 1;
+  else
+    this_mv.col += 1;
+
+  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+
+  tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride,
+                                   ref_buf, ref_stride);
+  if (best_sad > tmp_sad) {
+    *tmp_mv = this_mv;
+    best_sad = tmp_sad;
+  }
+  // Scale the result by 8 (presumably full-pel -> 1/8-pel units; confirm).
+  tmp_mv->row *= 8;
+  tmp_mv->col *= 8;
+
+  if (scaled_ref_frame) {  // undo the plane swap done on entry
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  return best_sad;
+}
+
+/* Runs cpi->diamond_search_sad at step_param and then at larger step
+ * parameters, rescoring each valid result with vp10_get_mvpred_var; the
+ * cheapest vector goes to *dst_mv and its cost is returned.
+ * do_refine: If last step (1-away) of n-step search doesn't pick the center
+ *            point as the best match, we will do a final 1-away diamond
+ *            refining search. */
+int vp10_full_pixel_diamond(const VP10_COMP *cpi, MACROBLOCK *x,
+                           MV *mvp_full, int step_param,
+                           int sadpb, int further_steps, int do_refine,
+                           int *cost_list,
+                           const vp9_variance_fn_ptr_t *fn_ptr,
+                           const MV *ref_mv, MV *dst_mv) {
+  MV cand_mv;
+  int cand_cost, step, skip = 0;
+  int best_cost = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &cand_mv,
+                                          step_param, sadpb, &step,
+                                          fn_ptr, ref_mv);
+  if (best_cost != INT_MAX)
+    best_cost = vp10_get_mvpred_var(x, &cand_mv, ref_mv, fn_ptr, 1);
+  *dst_mv = cand_mv;
+
+  // No more n-step searches will run; decide if refining is still needed.
+  if (step > further_steps) do_refine = 0;
+
+  while (step < further_steps) {
+    ++step;
+    // A pending skip count (the search's num00 output) consumes this step.
+    if (skip != 0) {
+      --skip;
+      continue;
+    }
+
+    cand_cost = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &cand_mv,
+                                        step_param + step, sadpb, &skip,
+                                        fn_ptr, ref_mv);
+    if (cand_cost != INT_MAX)
+      cand_cost = vp10_get_mvpred_var(x, &cand_mv, ref_mv, fn_ptr, 1);
+
+    // check to see if refining search is needed.
+    if (skip > further_steps - step) do_refine = 0;
+
+    if (cand_cost < best_cost) {
+      best_cost = cand_cost;
+      *dst_mv = cand_mv;
+    }
+  }
+
+  // final 1-away diamond refining search
+  if (do_refine) {
+    const int search_range = 8;
+    MV refined_mv = *dst_mv;
+    cand_cost = vp10_refining_search_sad(x, &refined_mv, sadpb, search_range,
+                                         fn_ptr, ref_mv);
+    if (cand_cost != INT_MAX)
+      cand_cost = vp10_get_mvpred_var(x, &refined_mv, ref_mv, fn_ptr, 1);
+    if (cand_cost < best_cost) {
+      best_cost = cand_cost;
+      *dst_mv = refined_mv;
+    }
+  }
+
+  // Return cost list.
+  if (cost_list)
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+  return best_cost;
+}
+
+#define MIN_RANGE 7
+#define MAX_RANGE 256
+#define MIN_INTERVAL 1
+// Runs a limited range exhaustive mesh search using the pattern set in
+// cpi->sf.mesh_patterns. Writes the resulting vector to *dst_mv and returns
+// its cost; returns INT_MAX without touching *dst_mv if the first pattern's
+// range or interval is illegal.
+static int full_pixel_exhaustive(VP10_COMP *cpi, MACROBLOCK *x,
+                                 MV *centre_mv_full, int sadpb, int *cost_list,
+                                 const vp9_variance_fn_ptr_t *fn_ptr,
+                                 const MV *ref_mv, MV *dst_mv) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MV full_ref_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+  MV best_mv = {centre_mv_full->row, centre_mv_full->col};
+  int range = sf->mesh_patterns[0].range;
+  int interval = sf->mesh_patterns[0].interval;
+  int interval_divisor, max_abs_mv, best_cost, pass;
+
+  // Keep track of number of exhaustive calls (this frame in this thread).
+  ++(*x->ex_search_count_ptr);
+
+  // Trap illegal values for interval and range for this function.
+  if (range < MIN_RANGE || range > MAX_RANGE) return INT_MAX;
+  if (interval < MIN_INTERVAL || interval > range) return INT_MAX;
+
+  interval_divisor = range / interval;
+
+  // Check size of proposed first range against magnitude of the centre
+  // value used as a starting point.
+  max_abs_mv = VPXMAX(abs(best_mv.row), abs(best_mv.col));
+  range = VPXMAX(range, (5 * max_abs_mv) / 4);
+  range = VPXMIN(range, MAX_RANGE);
+  interval = VPXMAX(interval, range / interval_divisor);
+
+  // initial search
+  best_cost = exhuastive_mesh_search(x, &full_ref_mv, &best_mv, range,
+                                     interval, sadpb, fn_ptr, &best_mv);
+
+  // Progressive searches with range and step size decreasing each time
+  // till we reach a step size of 1. Then break out.
+  if (interval > MIN_INTERVAL && range > MIN_RANGE) {
+    for (pass = 1; pass < MAX_MESH_STEP; ++pass) {
+      const int pass_range = sf->mesh_patterns[pass].range;
+      const int pass_interval = sf->mesh_patterns[pass].interval;
+
+      best_cost = exhuastive_mesh_search(x, &full_ref_mv, &best_mv,
+                                         pass_range, pass_interval, sadpb,
+                                         fn_ptr, &best_mv);
+      if (pass_interval == 1) break;
+    }
+  }
+
+  // Rescore the surviving vector with the variance-based cost.
+  if (best_cost != INT_MAX)
+    best_cost = vp10_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
+  *dst_mv = best_mv;
+
+  // Return cost list.
+  if (cost_list) {
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+  }
+  return best_cost;
+}
+
+// Scores every position within `distance` of ref_mv (clamped to x's mv limits)
+// as SAD + mv cost; writes the cheapest to *best_mv and returns its cost.
+int vp10_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
+                          int sad_per_bit, int distance,
+                          const vp9_variance_fn_ptr_t *fn_ptr,
+                          const MV *center_mv, MV *best_mv) {
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  const int row_lo = VPXMAX(ref_mv->row - distance, x->mv_row_min);
+  const int row_hi = VPXMIN(ref_mv->row + distance, x->mv_row_max);
+  const int col_lo = VPXMAX(ref_mv->col - distance, x->mv_col_min);
+  const int col_hi = VPXMIN(ref_mv->col + distance, x->mv_col_max);
+  int row, col;
+  int best_cost = fn_ptr->sdf(src->buf, src->stride,
+                              get_buf_from_mv(ref, ref_mv), ref->stride) +
+                  mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+  *best_mv = *ref_mv;
+
+  for (row = row_lo; row < row_hi; ++row) {
+    for (col = col_lo; col < col_hi; ++col) {
+      const MV mv = {row, col};
+      const int cost = fn_ptr->sdf(src->buf, src->stride,
+                                   get_buf_from_mv(ref, &mv), ref->stride) +
+                       mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+      if (cost >= best_cost) continue;
+      best_cost = cost;
+      *best_mv = mv;
+    }
+  }
+  return best_cost;
+}
+
+// Exhaustive full-pel search that scores three adjacent columns per call via
+// fn_ptr->sdx3f when it is available and finishes each row with fn_ptr->sdf.
+// The mv cost is added only to SADs already below the best cost so far.
+// Stores the cheapest position in *best_mv and returns its cost.
+int vp10_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
+                          int sad_per_bit, int distance,
+                          const vp9_variance_fn_ptr_t *fn_ptr,
+                          const MV *center_mv, MV *best_mv) {
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  const int row_lo = VPXMAX(ref_mv->row - distance, x->mv_row_min);
+  const int row_hi = VPXMIN(ref_mv->row + distance, x->mv_row_max);
+  const int col_lo = VPXMAX(ref_mv->col - distance, x->mv_col_min);
+  const int col_hi = VPXMIN(ref_mv->col + distance, x->mv_col_max);
+  // The cost of the starting vector seeds the search.
+  unsigned int best_cost =
+      fn_ptr->sdf(src->buf, src->stride, get_buf_from_mv(ref, ref_mv),
+                  ref->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+  int row;
+  *best_mv = *ref_mv;
+
+  for (row = row_lo; row < row_hi; ++row) {
+    int col = col_lo;
+    const uint8_t *ref_ptr = &ref->buf[row * ref->stride + col];
+
+    if (fn_ptr->sdx3f != NULL) {
+      // Batches of three columns while at least three remain in the row.
+      for (; col + 2 < col_hi; col += 3, ref_ptr += 3) {
+        DECLARE_ALIGNED(16, uint32_t, sads[3]);
+        int i;
+
+        fn_ptr->sdx3f(src->buf, src->stride, ref_ptr, ref->stride, sads);
+
+        for (i = 0; i < 3; ++i) {
+          const MV mv = {row, col + i};
+          unsigned int cost = sads[i];
+          if (cost >= best_cost) continue;
+          cost += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+          if (cost < best_cost) {
+            best_cost = cost;
+            *best_mv = mv;
+          }
+        }
+      }
+    }
+
+    // Remaining columns, one sdf call each.
+    for (; col < col_hi; ++col, ++ref_ptr) {
+      const MV mv = {row, col};
+      unsigned int cost =
+          fn_ptr->sdf(src->buf, src->stride, ref_ptr, ref->stride);
+      if (cost >= best_cost) continue;
+      cost += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+      if (cost < best_cost) {
+        best_cost = cost;
+        *best_mv = mv;
+      }
+    }
+  }
+
+  return best_cost;
+}
+
+// Exhaustive full-pel search that scores columns in batches of eight through
+// fn_ptr->sdx8f, then batches of three through fn_ptr->sdx3f (each only when
+// the pointer is set), and finishes each row with single fn_ptr->sdf calls.
+// The mv cost is added only to SADs already below the best cost so far.
+// Stores the cheapest position in *best_mv and returns its cost.
+int vp10_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
+                          int sad_per_bit, int distance,
+                          const vp9_variance_fn_ptr_t *fn_ptr,
+                          const MV *center_mv, MV *best_mv) {
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  const int row_lo = VPXMAX(ref_mv->row - distance, x->mv_row_min);
+  const int row_hi = VPXMIN(ref_mv->row + distance, x->mv_row_max);
+  const int col_lo = VPXMAX(ref_mv->col - distance, x->mv_col_min);
+  const int col_hi = VPXMIN(ref_mv->col + distance, x->mv_col_max);
+  // The cost of the starting vector seeds the search.
+  unsigned int best_cost =
+      fn_ptr->sdf(src->buf, src->stride, get_buf_from_mv(ref, ref_mv),
+                  ref->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+  int row;
+
+  *best_mv = *ref_mv;
+
+  for (row = row_lo; row < row_hi; ++row) {
+    int col = col_lo;
+    const uint8_t *ref_ptr = &ref->buf[row * ref->stride + col];
+
+    // Stage 1: eight columns per sdx8f call.
+    if (fn_ptr->sdx8f != NULL) {
+      for (; col + 7 < col_hi; col += 8, ref_ptr += 8) {
+        DECLARE_ALIGNED(16, uint32_t, sads[8]);
+        int i;
+
+        fn_ptr->sdx8f(src->buf, src->stride, ref_ptr, ref->stride, sads);
+
+        for (i = 0; i < 8; ++i) {
+          const MV mv = {row, col + i};
+          unsigned int cost = sads[i];
+          // Only compute the mv cost when the raw SAD is already competitive.
+          if (cost >= best_cost) continue;
+          cost += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+          if (cost < best_cost) {
+            best_cost = cost;
+            *best_mv = mv;
+          }
+        }
+      }
+    }
+
+    // Stage 2: three columns per sdx3f call.
+    if (fn_ptr->sdx3f != NULL) {
+      for (; col + 2 < col_hi; col += 3, ref_ptr += 3) {
+        DECLARE_ALIGNED(16, uint32_t, sads[3]);
+        int i;
+
+        fn_ptr->sdx3f(src->buf, src->stride, ref_ptr, ref->stride, sads);
+
+        for (i = 0; i < 3; ++i) {
+          const MV mv = {row, col + i};
+          unsigned int cost = sads[i];
+          if (cost >= best_cost) continue;
+          cost += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+          if (cost < best_cost) {
+            best_cost = cost;
+            *best_mv = mv;
+          }
+        }
+      }
+    }
+
+    // Stage 3: whatever is left, one column per sdf call.
+    for (; col < col_hi; ++col, ++ref_ptr) {
+      const MV mv = {row, col};
+      unsigned int cost =
+          fn_ptr->sdf(src->buf, src->stride, ref_ptr, ref->stride);
+      if (cost >= best_cost) continue;
+      cost += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+      if (cost < best_cost) {
+        best_cost = cost;
+        *best_mv = mv;
+      }
+    }
+  }
+
+  return best_cost;
+}
+
+// Iterative 1-away refinement: up to search_range times, scores the four
+// direct neighbours of *ref_mv (SAD plus mv cost) and moves *ref_mv to the
+// best one that beats the current cost; stops when none does. Uses one
+// sdx4df call when all four neighbours are strictly inside the mv limits,
+// otherwise per-neighbour sdf calls guarded by is_mv_in. Returns the cost.
+int vp10_refining_search_sad(const MACROBLOCK *x,
+                            MV *ref_mv, int error_per_bit,
+                            int search_range,
+                            const vp9_variance_fn_ptr_t *fn_ptr,
+                            const MV *center_mv) {
+  const MV offsets[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  const uint8_t *best_ptr = get_buf_from_mv(ref, ref_mv);
+  unsigned int best_cost =
+      fn_ptr->sdf(src->buf, src->stride, best_ptr, ref->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+  int iter, k;
+
+  for (iter = 0; iter < search_range; ++iter) {
+    int winner = -1;
+    const int all_in = ((ref_mv->row - 1) > x->mv_row_min) &
+                       ((ref_mv->row + 1) < x->mv_row_max) &
+                       ((ref_mv->col - 1) > x->mv_col_min) &
+                       ((ref_mv->col + 1) < x->mv_col_max);
+
+    if (all_in) {
+      // Same order as offsets[]: row - 1, col - 1, col + 1, row + 1.
+      const uint8_t *const positions[4] = {
+        best_ptr - ref->stride,
+        best_ptr - 1,
+        best_ptr + 1,
+        best_ptr + ref->stride
+      };
+      unsigned int sads[4];
+
+      fn_ptr->sdx4df(src->buf, src->stride, positions, ref->stride, sads);
+      for (k = 0; k < 4; ++k) {
+        MV mv;
+        if (sads[k] >= best_cost) continue;
+        mv.row = ref_mv->row + offsets[k].row;
+        mv.col = ref_mv->col + offsets[k].col;
+        sads[k] += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+        if (sads[k] < best_cost) {
+          best_cost = sads[k];
+          winner = k;
+        }
+      }
+    } else {
+      for (k = 0; k < 4; ++k) {
+        const MV mv = {ref_mv->row + offsets[k].row,
+                       ref_mv->col + offsets[k].col};
+        unsigned int cost;
+
+        if (!is_mv_in(x, &mv)) continue;
+        cost = fn_ptr->sdf(src->buf, src->stride, get_buf_from_mv(ref, &mv),
+                           ref->stride);
+        if (cost >= best_cost) continue;
+        cost += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+        if (cost < best_cost) {
+          best_cost = cost;
+          winner = k;
+        }
+      }
+    }
+
+    if (winner == -1) break;
+
+    ref_mv->row += offsets[winner].row;
+    ref_mv->col += offsets[winner].col;
+    best_ptr = get_buf_from_mv(ref, ref_mv);
+  }
+
+  return best_cost;
+}
+
+// This function is called when we do joint motion search in comp_inter_inter
+// mode. Up to search_range times it scores the eight neighbours of *ref_mv
+// with fn_ptr->sdaf against second_pred (plus mv cost) and steps *ref_mv to
+// the cheapest improving one, stopping when none improves. Returns the cost.
+int vp10_refining_search_8p_c(const MACROBLOCK *x,
+                             MV *ref_mv, int error_per_bit,
+                             int search_range,
+                             const vp9_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv,
+                             const uint8_t *second_pred) {
+  const MV offsets[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
+                         {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int best_cost =
+      fn_ptr->sdaf(src->buf, src->stride, get_buf_from_mv(ref, ref_mv),
+                   ref->stride, second_pred) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+  int iter, k;
+
+  for (iter = 0; iter < search_range; ++iter) {
+    int winner = -1;
+
+    for (k = 0; k < 8; ++k) {
+      const MV mv = {ref_mv->row + offsets[k].row,
+                     ref_mv->col + offsets[k].col};
+      unsigned int cost;
+
+      if (!is_mv_in(x, &mv)) continue;
+      cost = fn_ptr->sdaf(src->buf, src->stride, get_buf_from_mv(ref, &mv),
+                          ref->stride, second_pred);
+      if (cost >= best_cost) continue;
+      cost += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+      if (cost < best_cost) {
+        best_cost = cost;
+        winner = k;
+      }
+    }
+
+    if (winner == -1) break;
+
+    ref_mv->row += offsets[winner].row;
+    ref_mv->col += offsets[winner].col;
+  }
+
+  return best_cost;
+}
+
+#define MIN_EX_SEARCH_LIMIT 128
+// True when exhaustive search is enabled, under budget and not alt-ref source.
+static int is_exhaustive_allowed(VP10_COMP *cpi, MACROBLOCK *x) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const int max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT,
+      (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+  if (!sf->allow_exhaustive_searches) return 0;
+  if (sf->exhaustive_searches_thresh >= INT_MAX) return 0;
+  if (*x->ex_search_count_ptr > max_ex) return 0;
+  return !cpi->rc.is_src_frame_alt_ref;
+}
+
+// Runs the full-pixel search chosen by cpi->sf.mv.search_method from mvp_full
+// and writes the result to *tmp_mv. Returns the search cost, rescored with
+// vp10_get_mvpred_var for non-NSTEP methods when rd && cost < var_max.
+int vp10_full_pixel_search(VP10_COMP *cpi, MACROBLOCK *x,
+                          BLOCK_SIZE bsize, MV *mvp_full,
+                          int step_param, int error_per_bit,
+                          int *cost_list,
+                          const MV *ref_mv, MV *tmp_mv,
+                          int var_max, int rd) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const SEARCH_METHODS method = sf->mv.search_method;
+  vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+  int var = 0;
+  if (cost_list) {
+    int i;
+    for (i = 0; i < 5; ++i) cost_list[i] = INT_MAX;
+  }
+
+  // Keep track of number of searches (this frame in this thread).
+  ++(*x->m_search_count_ptr);
+
+  switch (method) {
+    case FAST_DIAMOND:
+      var = vp10_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+                                cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case FAST_HEX:
+      var = vp10_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+                                cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case HEX:
+      var = vp10_hex_search(x, mvp_full, step_param, error_per_bit, 1,
+                           cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case SQUARE:
+      var = vp10_square_search(x, mvp_full, step_param, error_per_bit, 1,
+                              cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case BIGDIA:
+      var = vp10_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
+                              cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case NSTEP:
+      var = vp10_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+                                   MAX_MVSEARCH_STEPS - 1 - step_param,
+                                   1, cost_list, fn_ptr, ref_mv, tmp_mv);
+
+      // Should we allow a follow on exhaustive search?
+      if (is_exhaustive_allowed(cpi, x)) {
+        int64_t exhaustive_thr = sf->exhaustive_searches_thresh;
+        exhaustive_thr >>= 8 - (b_width_log2_lookup[bsize] +
+                                b_height_log2_lookup[bsize]);
+
+        // Threshold variance for an exhaustive full search.
+        if (var > exhaustive_thr) {
+          MV tmp_mv_ex;
+          const int var_ex = full_pixel_exhaustive(cpi, x, tmp_mv,
+                                                   error_per_bit, cost_list,
+                                                   fn_ptr, ref_mv, &tmp_mv_ex);
+
+          // NOTE(review): cost_list is also written by the exhaustive search,
+          // so it may describe tmp_mv_ex even when that result is rejected.
+          if (var_ex < var) {
+            var = var_ex;
+            *tmp_mv = tmp_mv_ex;
+          }
+        }
+      }
+      break;
+    default:
+      assert(0 && "Invalid search method.");
+      break;
+  }
+
+  if (method != NSTEP && rd && var < var_max)
+    var = vp10_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1);
+
+  return var;
+}
diff --git a/libs/libvpx/vp10/encoder/mcomp.h b/libs/libvpx/vp10/encoder/mcomp.h
new file mode 100644
index 0000000000..9d1ab2aabe
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/mcomp.h
@@ -0,0 +1,165 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_ENCODER_MCOMP_H_
+#define VP10_ENCODER_MCOMP_H_
+
+#include "vp10/encoder/block.h"
+#include "vpx_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The maximum number of steps in a step search given the largest
+// allowed initial step.
+#define MAX_MVSEARCH_STEPS 11
+// Largest full-pel motion vector magnitude, in whole pixels:
+// 2^(MAX_MVSEARCH_STEPS - 1) - 1 = 1023, i.e. the range [-1023, 1023].
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
+// Maximum size of the first step in full-pel units (1024).
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
+// Allowed motion vector pixel distance outside the image border
+// for 16x16 blocks.
+#define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND)
+
+// One site of a motion search pattern.
+typedef struct search_site {
+  MV mv;       // presumably the site's displacement -- TODO confirm
+  int offset;  // presumably the buffer offset matching mv -- TODO confirm
+} search_site;
+
+typedef struct search_site_config {  // passed to the diamond searches as cfg
+  search_site ss[8 * MAX_MVSEARCH_STEPS + 1];  // room for 8 sites per step + 1
+  int ss_count;           // presumably the number of valid entries in ss
+  int searches_per_step;  // presumably sites evaluated per step -- confirm
+} search_site_config;
+
+void vp10_init_dsmotion_compensation(search_site_config *cfg, int stride);
+void vp10_init3smotion_compensation(search_site_config *cfg,  int stride);
+
+void vp10_set_mv_search_range(MACROBLOCK *x, const MV *mv);
+int vp10_mv_bit_cost(const MV *mv, const MV *ref,
+                    const int *mvjcost, int *mvcost[2], int weight);
+
+// Compute variance + MV rate cost for a given MV (_av_: with second_pred).
+int vp10_get_mvpred_var(const MACROBLOCK *x,
+                       const MV *best_mv, const MV *center_mv,
+                       const vp9_variance_fn_ptr_t *vfp,
+                       int use_mvcost);
+int vp10_get_mvpred_av_var(const MACROBLOCK *x,
+                          const MV *best_mv, const MV *center_mv,
+                          const uint8_t *second_pred,
+                          const vp9_variance_fn_ptr_t *vfp,
+                          int use_mvcost);
+
+struct VP10_COMP;
+struct SPEED_FEATURES;
+
+int vp10_init_search_range(int size);
+// 1-away refinement of *ref_mv over its four direct neighbours; returns cost.
+int vp10_refining_search_sad(const struct macroblock *x,
+                            struct mv *ref_mv,
+                            int sad_per_bit, int distance,
+                            const struct vp9_variance_vtable *fn_ptr,
+                            const struct mv *center_mv);
+
+// Runs a sequence of diamond searches for RD; best MV is left in *dst_mv.
+int vp10_full_pixel_diamond(const struct VP10_COMP *cpi, MACROBLOCK *x,
+                           MV *mvp_full, int step_param,
+                           int sadpb, int further_steps, int do_refine,
+                           int *cost_list,
+                           const vp9_variance_fn_ptr_t *fn_ptr,
+                           const MV *ref_mv, MV *dst_mv);
+
+// Performs integral-projection-based motion estimation; returns the best SAD.
+unsigned int vp10_int_pro_motion_estimation(const struct VP10_COMP *cpi,
+                                           MACROBLOCK *x,
+                                           BLOCK_SIZE bsize,
+                                           int mi_row, int mi_col);
+
+typedef int (integer_mv_pattern_search_fn) (  // full-pel pattern search
+    const MACROBLOCK *x,
+    MV *ref_mv,
+    int search_param,
+    int error_per_bit,
+    int do_init_search,
+    int *cost_list,
+    const vp9_variance_fn_ptr_t *vf,
+    int use_mvcost,
+    const MV *center_mv,
+    MV *best_mv);
+// Pattern searches that vp10_full_pixel_search dispatches to.
+integer_mv_pattern_search_fn vp10_hex_search;
+integer_mv_pattern_search_fn vp10_bigdia_search;
+integer_mv_pattern_search_fn vp10_square_search;
+integer_mv_pattern_search_fn vp10_fast_hex_search;
+integer_mv_pattern_search_fn vp10_fast_dia_search;
+
+typedef int (fractional_mv_step_fp) (  // shared by the sub_pixel searches
+    const MACROBLOCK *x,
+    MV *bestmv, const MV *ref_mv,
+    int allow_hp,
+    int error_per_bit,
+    const vp9_variance_fn_ptr_t *vfp,
+    int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
+    int iters_per_step,
+    int *cost_list,
+    int *mvjcost, int *mvcost[2],
+    int *distortion, unsigned int *sse1,
+    const uint8_t *second_pred,
+    int w, int h);
+// Sub-pixel search variants, all declared with the signature above.
+extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree_pruned_evenmore;
+// Pointer type matching the vp10_full_search_sad* functions.
+typedef int (*vp10_full_search_fn_t)(const MACROBLOCK *x,
+                                    const MV *ref_mv, int sad_per_bit,
+                                    int distance,
+                                    const vp9_variance_fn_ptr_t *fn_ptr,
+                                    const MV *center_mv, MV *best_mv);
+// Pointer type matching vp10_refining_search_sad.
+typedef int (*vp10_refining_search_fn_t)(const MACROBLOCK *x,
+                                        MV *ref_mv, int sad_per_bit,
+                                        int distance,
+                                        const vp9_variance_fn_ptr_t *fn_ptr,
+                                        const MV *center_mv);
+// Pointer type for diamond searches; presumably cpi->diamond_search_sad's.
+typedef int (*vp10_diamond_search_fn_t)(const MACROBLOCK *x,
+                                       const search_site_config *cfg,
+                                       MV *ref_mv, MV *best_mv,
+                                       int search_param, int sad_per_bit,
+                                       int *num00,
+                                       const vp9_variance_fn_ptr_t *fn_ptr,
+                                       const MV *center_mv);
+// 8-neighbour refinement scored with fn_ptr->sdaf against second_pred.
+int vp10_refining_search_8p_c(const MACROBLOCK *x,
+                             MV *ref_mv, int error_per_bit,
+                             int search_range,
+                             const vp9_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv, const uint8_t *second_pred);
+
+struct VP10_COMP;  // repeats the forward declaration made earlier in this file
+// Top-level full-pel search; dispatches on cpi->sf.mv.search_method.
+int vp10_full_pixel_search(struct VP10_COMP *cpi, MACROBLOCK *x,
+                          BLOCK_SIZE bsize, MV *mvp_full,
+                          int step_param, int error_per_bit,
+                          int *cost_list,
+                          const MV *ref_mv, MV *tmp_mv,
+                          int var_max, int rd);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_MCOMP_H_
diff --git a/libs/libvpx/vp10/encoder/mips/msa/error_msa.c b/libs/libvpx/vp10/encoder/mips/msa/error_msa.c
new file mode 100644
index 0000000000..dacca32c05
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/mips/msa/error_msa.c
@@ -0,0 +1,114 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp10_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize)                                   \
+static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr,     \
+                                             const int16_t *dq_coeff_ptr,  \
+                                             int64_t *ssz) {               \
+  int64_t err = 0;                                                         \
+  uint32_t loop_cnt;                                                       \
+  v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h;                             \
+  v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w;                              \
+  v2i64 sq_coeff_r, sq_coeff_l;                                            \
+  v2i64 err0, err_dup0, err1, err_dup1;                                    \
+  /* NOTE(review): looks like err = sum((c - dq)^2), *ssz = sum(c^2). */   \
+  coeff = LD_SH(coeff_ptr);                                                \
+  dq_coeff = LD_SH(dq_coeff_ptr);                                          \
+  UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                                \
+  ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                      \
+  HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                       \
+  DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w,                  \
+              sq_coeff_r, sq_coeff_l);                                     \
+  DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1);                 \
+  /* Second group of 8 coefficients: DPADD adds to the accumulators. */    \
+  coeff = LD_SH(coeff_ptr + 8);                                            \
+  dq_coeff = LD_SH(dq_coeff_ptr + 8);                                      \
+  UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                                \
+  ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                      \
+  HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                       \
+  DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);              \
+  DPADD_SD2_SD(diff_r, diff_l, err0, err1);                                \
+                                                                           \
+  coeff_ptr += 16;                                                         \
+  dq_coeff_ptr += 16;                                                      \
+  /* Remaining BSize - 16 coefficients, 16 per iteration. */               \
+  for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) {                       \
+    coeff = LD_SH(coeff_ptr);                                              \
+    dq_coeff = LD_SH(dq_coeff_ptr);                                        \
+    UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                              \
+    ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                    \
+    HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                     \
+    DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);            \
+    DPADD_SD2_SD(diff_r, diff_l, err0, err1);                              \
+                                                                           \
+    coeff = LD_SH(coeff_ptr + 8);                                          \
+    dq_coeff = LD_SH(dq_coeff_ptr + 8);                                    \
+    UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                              \
+    ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                    \
+    HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                     \
+    DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);            \
+    DPADD_SD2_SD(diff_r, diff_l, err0, err1);                              \
+                                                                           \
+    coeff_ptr += 16;                                                       \
+    dq_coeff_ptr += 16;                                                    \
+  }                                                                        \
+  /* Add lane 1 into lane 0, then *ssz = right + left sq_coeff totals. */  \
+  err_dup0 = __msa_splati_d(sq_coeff_r, 1);                                \
+  err_dup1 = __msa_splati_d(sq_coeff_l, 1);                                \
+  sq_coeff_r += err_dup0;                                                  \
+  sq_coeff_l += err_dup1;                                                  \
+  *ssz = __msa_copy_s_d(sq_coeff_r, 0);                                    \
+  *ssz += __msa_copy_s_d(sq_coeff_l, 0);                                   \
+  /* Same lane fold for err0/err1; their sum is the return value. */       \
+  err_dup0 = __msa_splati_d(err0, 1);                                      \
+  err_dup1 = __msa_splati_d(err1, 1);                                      \
+  err0 += err_dup0;                                                        \
+  err1 += err_dup1;                                                        \
+  err = __msa_copy_s_d(err0, 0);                                           \
+  err += __msa_copy_s_d(err1, 0);                                          \
+                                                                           \
+  return err;                                                              \
+}
+/* No ';' after these: each expands to a complete function definition. */
+BLOCK_ERROR_BLOCKSIZE_MSA(16)
+BLOCK_ERROR_BLOCKSIZE_MSA(64)
+BLOCK_ERROR_BLOCKSIZE_MSA(256)
+BLOCK_ERROR_BLOCKSIZE_MSA(1024)
+
+/* Dispatches to the MSA kernel generated for blk_size (16, 64, 256 or 1024
+ * coefficients) and falls back to vp10_block_error_c for any other size.
+ * The kernels read both buffers through int16_t pointers. Returns the value
+ * of whichever function ran; ssz is forwarded to it. */
+int64_t vp10_block_error_msa(const tran_low_t *coeff_ptr,
+                            const tran_low_t *dq_coeff_ptr,
+                            intptr_t blk_size, int64_t *ssz) {
+  const int16_t *const coeff = (const int16_t *)coeff_ptr;
+  const int16_t *const dq_coeff = (const int16_t *)dq_coeff_ptr;
+
+  switch (blk_size) {
+    case 16:
+      return block_error_16size_msa(coeff, dq_coeff, ssz);
+    case 64:
+      return block_error_64size_msa(coeff, dq_coeff, ssz);
+    case 256:
+      return block_error_256size_msa(coeff, dq_coeff, ssz);
+    case 1024:
+      return block_error_1024size_msa(coeff, dq_coeff, ssz);
+    default:
+      break;
+  }
+
+  /* No specialised kernel for this size: use the generic C version, which
+   * takes the original tran_low_t pointers. */
+  return vp10_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);
+}
diff --git a/libs/libvpx/vp10/encoder/mips/msa/fdct16x16_msa.c b/libs/libvpx/vp10/encoder/mips/msa/fdct16x16_msa.c
new file mode 100644
index 0000000000..d78fc6473e
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/mips/msa/fdct16x16_msa.c
@@ -0,0 +1,507 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/enums.h"
+#include "vp10/encoder/mips/msa/fdct_msa.h"
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+
+static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,
+                                   const int32_t *const0, int16_t *int_buf) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+  v4i32 k0, k1, k2, k3;
+  /* First half of the 16-point column ADST on one 8-lane-wide strip. */
+  /* load input data (rows are `stride` elements apart) */
+  r0 = LD_SH(input);
+  r15 = LD_SH(input + 15 * stride);
+  r7 = LD_SH(input + 7 * stride);
+  r8 = LD_SH(input + 8 * stride);
+  SLLI_4V(r0, r15, r7, r8, 2);
+
+  /* stage 1: k0..k3 are read from const0 in steps of four int32 words */
+  LD_SW2(const0, 4, k0, k1);
+  LD_SW2(const0 + 8, 4, k2, k3);
+  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+  r3 = LD_SH(input + 3 * stride);
+  r4 = LD_SH(input + 4 * stride);
+  r11 = LD_SH(input + 11 * stride);
+  r12 = LD_SH(input + 12 * stride);
+  SLLI_4V(r3, r4, r11, r12, 2);
+
+  LD_SW2(const0 + 4 * 4, 4, k0, k1);
+  LD_SW2(const0 + 4 * 6, 4, k2, k3);
+  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+  /* stage 2 (results go to int_buf, one v8i16 per 8-element slot) */
+  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+  ST_SH2(tp0, tp2, int_buf, 8);
+  ST_SH2(tp1, tp3, int_buf + 4 * 8, 8);
+
+  LD_SW2(const0 + 4 * 8, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 10);
+  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+  ST_SH2(h0, h1, int_buf + 8 * 8, 8);
+  ST_SH2(h3, h2, int_buf + 12 * 8, 8);
+  /* remaining input rows: 9, 6, 1, 14 and then 13, 2, 5, 10 */
+  r9 = LD_SH(input + 9 * stride);
+  r6 = LD_SH(input + 6 * stride);
+  r1 = LD_SH(input + stride);
+  r14 = LD_SH(input + 14 * stride);
+  SLLI_4V(r9, r6, r1, r14, 2);
+
+  LD_SW2(const0 + 4 * 11, 4, k0, k1);
+  LD_SW2(const0 + 4 * 13, 4, k2, k3);
+  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+
+  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+  r13 = LD_SH(input + 13 * stride);
+  r2 = LD_SH(input + 2 * stride);
+  r5 = LD_SH(input + 5 * stride);
+  r10 = LD_SH(input + 10 * stride);
+  SLLI_4V(r13, r2, r5, r10, 2);
+
+  LD_SW2(const0 + 4 * 15, 4, k0, k1);
+  LD_SW2(const0 + 4 * 17, 4, k2, k3);
+  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+
+  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+
+  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0,
+                                   int16_t *out) {
+  int16_t *out_ptr = out + 128;  /* 8 rows of 16 elements further on */
+  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+  v4i32 k0, k1, k2, k3;
+  /* Finishes the column ADST: reads int_buf, writes 16 rows at pitch 16. */
+  LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15);
+  LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7);
+  LD_SW2(const0 + 4 * 19, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 21);
+  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+  tp0 = LD_SH(int_buf + 4 * 8);
+  tp1 = LD_SH(int_buf + 5 * 8);
+  tp3 = LD_SH(int_buf + 10 * 8);
+  tp2 = LD_SH(int_buf + 14 * 8);
+  LD_SW2(const0 + 4 * 22, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 24);
+  MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+  out4 = -out4;
+  ST_SH(out4, (out + 3 * 16));
+  ST_SH(out5, (out_ptr + 4 * 16));
+  /* the same constants (k0..k2) are reused for the next MADD_BF */
+  h1 = LD_SH(int_buf + 9 * 8);
+  h3 = LD_SH(int_buf + 12 * 8);
+  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+  out13 = -out13;
+  ST_SH(out12, (out + 2 * 16));
+  ST_SH(out13, (out_ptr + 5 * 16));
+
+  tp0 = LD_SH(int_buf);
+  tp1 = LD_SH(int_buf + 8);
+  tp2 = LD_SH(int_buf + 2 * 8);
+  tp3 = LD_SH(int_buf + 6 * 8);
+
+  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+  out1 = -out1;
+  ST_SH(out0, (out));
+  ST_SH(out1, (out_ptr + 7 * 16));
+
+  h0 = LD_SH(int_buf + 8 * 8);
+  h2 = LD_SH(int_buf + 13 * 8);
+
+  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+  out8 = -out8;
+  ST_SH(out8, (out + 16));
+  ST_SH(out9, (out_ptr + 6 * 16));
+
+  /* stage 4: fills the eight rows not stored above */
+  LD_SW2(const0 + 4 * 25, 4, k0, k1);
+  LD_SW2(const0 + 4 * 27, 4, k2, k3);
+  MADD_SHORT(h10, h11, k1, k2, out2, out3);
+  ST_SH(out2, (out + 7 * 16));
+  ST_SH(out3, (out_ptr));
+
+  MADD_SHORT(out6, out7, k0, k3, out6, out7);
+  ST_SH(out6, (out + 4 * 16));
+  ST_SH(out7, (out_ptr + 3 * 16));
+
+  MADD_SHORT(out10, out11, k0, k3, out10, out11);
+  ST_SH(out10, (out + 6 * 16));
+  ST_SH(out11, (out_ptr + 16));
+
+  MADD_SHORT(out14, out15, k1, k2, out14, out15);
+  ST_SH(out14, (out + 5 * 16));
+  ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+  /* Transposes a 16x16 block (pitch 16) as four 8x8 tiles, runs
+   * FDCT_POSTPROC_2V_NEG_H on each vector, and stores the tiles at pitch 8. */
+  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  FDCT_POSTPROC_2V_NEG_H(r0, r1);
+  FDCT_POSTPROC_2V_NEG_H(r2, r3);
+  FDCT_POSTPROC_2V_NEG_H(r4, r5);
+  FDCT_POSTPROC_2V_NEG_H(r6, r7);
+  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+  out += 64;
+
+  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  FDCT_POSTPROC_2V_NEG_H(r8, r9);
+  FDCT_POSTPROC_2V_NEG_H(r10, r11);
+  FDCT_POSTPROC_2V_NEG_H(r12, r13);
+  FDCT_POSTPROC_2V_NEG_H(r14, r15);
+  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+  out += 64;
+
+  /* load input data: the next eight rows (8 * 16 elements further on) */
+  input += 128;
+  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  FDCT_POSTPROC_2V_NEG_H(r0, r1);
+  FDCT_POSTPROC_2V_NEG_H(r2, r3);
+  FDCT_POSTPROC_2V_NEG_H(r4, r5);
+  FDCT_POSTPROC_2V_NEG_H(r6, r7);
+  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+  out += 64;
+
+  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  FDCT_POSTPROC_2V_NEG_H(r8, r9);
+  FDCT_POSTPROC_2V_NEG_H(r10, r11);
+  FDCT_POSTPROC_2V_NEG_H(r12, r13);
+  FDCT_POSTPROC_2V_NEG_H(r14, r15);
+  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+}
+
+static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
+                                   int16_t *int_buf) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+  v4i32 k0, k1, k2, k3;
+  /* First half of the 16-point row ADST; input is 16 vectors at pitch 8. */
+  /* load input data (no SLLI_4V pre-scaling here, unlike the column pass) */
+  r0 = LD_SH(input);
+  r7 = LD_SH(input + 7 * 8);
+  r8 = LD_SH(input + 8 * 8);
+  r15 = LD_SH(input + 15 * 8);
+
+  /* stage 1: k0..k3 are read from const0 in steps of four int32 words */
+  LD_SW2(const0, 4, k0, k1);
+  LD_SW2(const0 + 4 * 2, 4, k2, k3);
+  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+  r3 = LD_SH(input + 3 * 8);
+  r4 = LD_SH(input + 4 * 8);
+  r11 = LD_SH(input + 11 * 8);
+  r12 = LD_SH(input + 12 * 8);
+
+  LD_SW2(const0 + 4 * 4, 4, k0, k1);
+  LD_SW2(const0 + 4 * 6, 4, k2, k3);
+  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+  /* stage 2 (results go to int_buf, one v8i16 per 8-element slot) */
+  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+  ST_SH2(tp0, tp1, int_buf, 4 * 8);
+  ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);
+
+  LD_SW2(const0 + 4 * 8, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 10);
+  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+  ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
+  ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);
+  /* remaining input vectors: 1, 6, 9, 14 and then 2, 5, 10, 13 */
+  r1 = LD_SH(input + 8);
+  r6 = LD_SH(input + 6 * 8);
+  r9 = LD_SH(input + 9 * 8);
+  r14 = LD_SH(input + 14 * 8);
+
+  LD_SW2(const0 + 4 * 11, 4, k0, k1);
+  LD_SW2(const0 + 4 * 13, 4, k2, k3);
+  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+  r2 = LD_SH(input + 2 * 8);
+  r5 = LD_SH(input + 5 * 8);
+  r10 = LD_SH(input + 10 * 8);
+  r13 = LD_SH(input + 13 * 8);
+
+  LD_SW2(const0 + 4 * 15, 4, k0, k1);
+  LD_SW2(const0 + 4 * 17, 4, k2, k3);
+  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0,
+                                   int16_t *out) {
+  int16_t *out_ptr = out + 8;  /* 8 elements into each pitch-16 row */
+  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+  v4i32 k0, k1, k2, k3;
+  /* Finishes the row ADST: reads int_buf, writes 16 vectors at pitch 16. */
+  g13 = LD_SH(int_buf + 3 * 8);
+  g15 = LD_SH(int_buf + 7 * 8);
+  g5 = LD_SH(int_buf + 11 * 8);
+  g7 = LD_SH(int_buf + 15 * 8);
+
+  LD_SW2(const0 + 4 * 19, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 21);
+  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+  tp0 = LD_SH(int_buf + 4 * 8);
+  tp1 = LD_SH(int_buf + 5 * 8);
+  tp3 = LD_SH(int_buf + 10 * 8);
+  tp2 = LD_SH(int_buf + 14 * 8);
+
+  LD_SW2(const0 + 4 * 22, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 24);
+  MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+  out4 = -out4;
+  ST_SH(out4, (out + 3 * 16));
+  ST_SH(out5, (out_ptr + 4 * 16));
+  /* the same constants (k0..k2) are reused for the next MADD_BF */
+  h1 = LD_SH(int_buf + 9 * 8);
+  h3 = LD_SH(int_buf + 12 * 8);
+  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+  out13 = -out13;
+  ST_SH(out12, (out + 2 * 16));
+  ST_SH(out13, (out_ptr + 5 * 16));
+
+  tp0 = LD_SH(int_buf);
+  tp1 = LD_SH(int_buf + 8);
+  tp2 = LD_SH(int_buf + 2 * 8);
+  tp3 = LD_SH(int_buf + 6 * 8);
+
+  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+  out1 = -out1;
+  ST_SH(out0, (out));
+  ST_SH(out1, (out_ptr + 7 * 16));
+
+  h0 = LD_SH(int_buf + 8 * 8);
+  h2 = LD_SH(int_buf + 13 * 8);
+  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+  out8 = -out8;
+  ST_SH(out8, (out + 16));
+  ST_SH(out9, (out_ptr + 6 * 16));
+
+  /* stage 4: fills the eight vectors not stored above */
+  LD_SW2(const0 + 4 * 25, 4, k0, k1);
+  LD_SW2(const0 + 4 * 27, 4, k2, k3);
+  MADD_SHORT(h10, h11, k1, k2, out2, out3);
+  ST_SH(out2, (out + 7 * 16));
+  ST_SH(out3, (out_ptr));
+
+  MADD_SHORT(out6, out7, k0, k3, out6, out7);
+  ST_SH(out6, (out + 4 * 16));
+  ST_SH(out7, (out_ptr + 3 * 16));
+
+  MADD_SHORT(out10, out11, k0, k3, out10, out11);
+  ST_SH(out10, (out + 6 * 16));
+  ST_SH(out11, (out_ptr + 16));
+
+  MADD_SHORT(out14, out15, k1, k2, out14, out15);
+  ST_SH(out14, (out + 5 * 16));
+  ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_transpose_msa(int16_t *input, int16_t *out) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+  /* Transposes 8x8 tiles of the row-pass result, one 128-element half at a
+   * time; LD_SH16 presumably sends even vectors to l0..l7, odd to l8..l15. */
+  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11,
+          l4, l12, l5, l13, l6, l14, l7, l15);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+  out += 16 * 8;
+
+  /* load input data: second half, 128 elements further on */
+  input += 128;
+  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11,
+          l4, l12, l5, l13, l6, l14, l7, l15);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+}
+
+static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {
+  int16_t *temp = intermediate;
+  int16_t *out = output;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11;
+  v8i16 in12, in13, in14, in15;
+  /* Row pass for ADST_DCT: transpose, post-process, FDCT8x16 even/odd. */
+  LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+  temp = intermediate + 8;
+  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                     in8, in9, in10, in11, in12, in13, in14, in15);
+  FDCT_POSTPROC_2V_NEG_H(in0, in1);
+  FDCT_POSTPROC_2V_NEG_H(in2, in3);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  FDCT_POSTPROC_2V_NEG_H(in6, in7);
+  FDCT_POSTPROC_2V_NEG_H(in8, in9);
+  FDCT_POSTPROC_2V_NEG_H(in10, in11);
+  FDCT_POSTPROC_2V_NEG_H(in12, in13);
+  FDCT_POSTPROC_2V_NEG_H(in14, in15);
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  temp = intermediate;  /* spill in8..in15; reloaded for FDCT8x16_ODD */
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16);
+  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  temp = intermediate;
+  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15,
+               in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3,
+                     tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3);
+  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16);
+  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7,
+                     tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7);
+  out = output + 8;  /* second group of 8 elements in each pitch-16 row */
+  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16);
+}
+
+/* 16x16 forward hybrid transform.  tx_type selects DCT or ADST for each
+ * direction; the column pass runs first, then the row pass, into output. */
+void vp10_fht16x16_msa(const int16_t *input, int16_t *output,
+                      int32_t stride, int32_t tx_type) {
+  DECLARE_ALIGNED(32, int16_t, tmp[256]);
+  DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
+  int32_t i;
+  static const int32_t const_arr[29 * 4] = {  /* 29 values x4, built once */
+    52707308, 52707308, 52707308, 52707308,
+    -1072430300, -1072430300, -1072430300, -1072430300,
+    795618043, 795618043, 795618043, 795618043,
+    -721080468, -721080468, -721080468, -721080468,
+    459094491, 459094491, 459094491, 459094491,
+    -970646691, -970646691, -970646691, -970646691,
+    1010963856, 1010963856, 1010963856, 1010963856,
+    -361743294, -361743294, -361743294, -361743294,
+    209469125, 209469125, 209469125, 209469125,
+    -1053094788, -1053094788, -1053094788, -1053094788,
+    1053160324, 1053160324, 1053160324, 1053160324,
+    639644520, 639644520, 639644520, 639644520,
+    -862444000, -862444000, -862444000, -862444000,
+    1062144356, 1062144356, 1062144356, 1062144356,
+    -157532337, -157532337, -157532337, -157532337,
+    260914709, 260914709, 260914709, 260914709,
+    -1041559667, -1041559667, -1041559667, -1041559667,
+    920985831, 920985831, 920985831, 920985831,
+    -551995675, -551995675, -551995675, -551995675,
+    596522295, 596522295, 596522295, 596522295,
+    892853362, 892853362, 892853362, 892853362,
+    -892787826, -892787826, -892787826, -892787826,
+    410925857, 410925857, 410925857, 410925857,
+    -992012162, -992012162, -992012162, -992012162,
+    992077698, 992077698, 992077698, 992077698,
+    759246145, 759246145, 759246145, 759246145,
+    -759180609, -759180609, -759180609, -759180609,
+    -759222975, -759222975, -759222975, -759222975,
+    759288511, 759288511, 759288511, 759288511 };
+
+  switch (tx_type) {
+    case DCT_DCT:
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+      }
+
+      /* row transform */
+      for (i = 0; i < 2; ++i) {
+        fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+      }
+      break;
+    case ADST_DCT:
+      /* column transform, one 8-column strip per iteration */
+      for (i = 0; i < 2; ++i) {
+        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, tmp_buf);
+        fadst16_cols_step2_msa(tmp_buf, const_arr, tmp + (i << 3));
+      }
+
+      /* row transform */
+      for (i = 0; i < 2; ++i) {
+        postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+      }
+      break;
+    case DCT_ADST:
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+      }
+
+      fadst16_transpose_postproc_msa(tmp, trans_buf);
+
+      /* row transform, one 128-element half per iteration */
+      for (i = 0; i < 2; ++i) {
+        fadst16_rows_step1_msa(trans_buf + (i << 7), const_arr, tmp_buf);
+        fadst16_rows_step2_msa(tmp_buf, const_arr, tmp + (i << 7));
+      }
+
+      fadst16_transpose_msa(tmp, output);
+      break;
+    case ADST_ADST:
+      /* column transform, one 8-column strip per iteration */
+      for (i = 0; i < 2; ++i) {
+        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, tmp_buf);
+        fadst16_cols_step2_msa(tmp_buf, const_arr, tmp + (i << 3));
+      }
+
+      fadst16_transpose_postproc_msa(tmp, trans_buf);
+
+      /* row transform, one 128-element half per iteration */
+      for (i = 0; i < 2; ++i) {
+        fadst16_rows_step1_msa(trans_buf + (i << 7), const_arr, tmp_buf);
+        fadst16_rows_step2_msa(tmp_buf, const_arr, tmp + (i << 7));
+      }
+
+      fadst16_transpose_msa(tmp, output);
+      break;
+    default:
+      /* unknown tx_type: output is left untouched */
+      assert(0);
+      break;
+  }
+}
diff --git a/libs/libvpx/vp10/encoder/mips/msa/fdct4x4_msa.c b/libs/libvpx/vp10/encoder/mips/msa/fdct4x4_msa.c
new file mode 100644
index 0000000000..37269f0a43
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/mips/msa/fdct4x4_msa.c
@@ -0,0 +1,99 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/enums.h"
+#include "vp10/encoder/mips/msa/fdct_msa.h"
+
+void vp10_fwht4x4_msa(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  v8i16 in0, in1, in2, in3, in4;
+  /* 4x4 "fwht" transform: two add/sub passes with a transpose in between. */
+  LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+  in0 += in1;
+  in3 -= in2;
+  in4 = (in0 - in3) >> 1;
+  SUB2(in4, in1, in4, in2, in1, in2);
+  in0 -= in2;
+  in3 += in1;
+
+  TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+  /* second pass: same arithmetic on the transposed vectors */
+  in0 += in2;
+  in1 -= in3;
+  in4 = (in0 - in1) >> 1;
+  SUB2(in4, in2, in4, in3, in2, in3);
+  in0 -= in3;
+  in1 += in2;
+  /* final scaling (SLLI_4V by 2), transpose back, then store */
+  SLLI_4V(in0, in1, in2, in3, 2);
+
+  TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);
+  /* each vector is written to its own 4-element output row */
+  ST4x2_UB(in0, output, 4);
+  ST4x2_UB(in3, output + 4, 4);
+  ST4x2_UB(in1, output + 8, 4);
+  ST4x2_UB(in2, output + 12, 4);
+}
+
+void vp10_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride,
+                    int32_t tx_type) {
+  v8i16 in0, in1, in2, in3;
+  /* 4x4 forward hybrid transform; tx_type picks DCT or ADST for each pass. */
+  LD_SH4(input, stride, in0, in1, in2, in3);
+
+  /* fdct4 pre-process: SLLI_4V by 4, then a conditional +1 on in0 */
+  {
+    v8i16 temp, mask;
+    v16i8 zero = { 0 };
+    v16i8 one = __msa_ldi_b(1);
+    /* temp flags the non-zero lanes of in0; mask presumably keeps lane 0 */
+    mask = (v8i16)__msa_sldi_b(zero, one, 15);
+    SLLI_4V(in0, in1, in2, in3, 4);
+    temp = __msa_ceqi_h(in0, 0);
+    temp = (v8i16)__msa_xori_b((v16u8)temp, 255);
+    temp = mask & temp;
+    in0 += temp;
+  }
+  /* each case: first 1-D pass, transpose, second 1-D pass (all in place) */
+  switch (tx_type) {
+    case DCT_DCT:
+      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case ADST_DCT:
+      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case DCT_ADST:
+      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case ADST_ADST:
+      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+  /* transpose back, ADD4 adds 1, SRA_4V shifts by 2, then pack and store */
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  SRA_4V(in0, in1, in2, in3, 2);
+  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
+  ST_SH2(in0, in2, output, 8);
+}
diff --git a/libs/libvpx/vp10/encoder/mips/msa/fdct8x8_msa.c b/libs/libvpx/vp10/encoder/mips/msa/fdct8x8_msa.c
new file mode 100644
index 0000000000..4283eb946c
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/mips/msa/fdct8x8_msa.c
@@ -0,0 +1,66 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp10/common/enums.h"
+#include "vp10/encoder/mips/msa/fdct_msa.h"
+
+void vp10_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride,
+                    int32_t tx_type) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  /* 8x8 forward hybrid transform; inputs first go through SLLI_4V by 2. */
+  LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+  SLLI_4V(in0, in1, in2, in3, 2);
+  SLLI_4V(in4, in5, in6, in7, 2);
+  /* each case: first 1-D pass, transpose, second 1-D pass (all in place) */
+  switch (tx_type) {
+    case DCT_DCT:
+      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case ADST_DCT:
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case DCT_ADST:
+      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case ADST_ADST:
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+  /* transpose back, apply SRLI_AVE_S_4V_H, store 8 vectors at pitch 8 */
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
+}
diff --git a/libs/libvpx/vp10/encoder/mips/msa/fdct_msa.h b/libs/libvpx/vp10/encoder/mips/msa/fdct_msa.h
new file mode 100644
index 0000000000..d7d40cb72c
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/mips/msa/fdct_msa.h
@@ -0,0 +1,117 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+#define VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+#include "vpx_dsp/mips/txfm_macros_msa.h"
+#include "vpx_ports/mem.h"
+
+#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,                   \
+                  out0, out1, out2, out3, out4, out5, out6, out7) {         \
+  v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                        \
+  v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                         \
+  v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64,      \
+                     cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 };  \
+  v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64,    \
+                     cospi_24_64, -cospi_24_64, 0, 0 };                     \
+  /* 8-point ADST on v8i16 vectors; in0..in7 double as scratch storage. */  \
+  SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                           \
+  cnst2_m = -cnst0_m;                                                       \
+  ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);        \
+  SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                           \
+  cnst4_m = -cnst2_m;                                                       \
+  ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);        \
+  /* first stage on the (in0, in7) and (in4, in3) pairs */                  \
+  ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                    \
+  ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                    \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,            \
+                        cnst1_m, cnst2_m, cnst3_m, in7, in0,                \
+                        in4, in3);                                          \
+  /* same stage on (in2, in5) and (in6, in1), coeff0_m lanes 2, 5, 6, 1 */  \
+  SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                           \
+  cnst2_m = -cnst0_m;                                                       \
+  ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);        \
+  SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                           \
+  cnst4_m = -cnst2_m;                                                       \
+  ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);        \
+                                                                            \
+  ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                    \
+  ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                    \
+                                                                            \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,            \
+                        cnst1_m, cnst2_m, cnst3_m, in5, in2,                \
+                        in6, in1);                                          \
+  BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                    \
+  out7 = -s0_m;                                                             \
+  out0 = s1_m;                                                              \
+  /* next stage: constants are coeff1_m lanes 0, 4, 1, 5 */                 \
+  SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m);   \
+                                                                            \
+  ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);        \
+  cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                \
+  cnst1_m = cnst0_m;                                                        \
+                                                                            \
+  ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                    \
+  ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                    \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,            \
+                        cnst2_m, cnst3_m, cnst1_m, out1, out6,              \
+                        s0_m, s1_m);                                        \
+  /* last stage: coeff1_m lanes 2, 3 (cospi_16_64 and its negative) */      \
+  SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                           \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                \
+                                                                            \
+  ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                    \
+  ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                                  \
+  out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                    \
+  out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                    \
+  out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);                    \
+  out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);                    \
+  /* sign fix-up: out1, out3, out5 are negated (out7 already was) */        \
+  out1 = -out1;                                                             \
+  out3 = -out3;                                                             \
+  out5 = -out5;                                                             \
+}
+
+#define VP9_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v4i32 s0_m, s1_m, s2_m, s3_m, constant_m;                       \
+  v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m;                       \
+  /* 4-point ADST evaluated in 32-bit lanes (v4i32) */            \
+  UNPCK_R_SH_SW(in0, in0_r_m);                                    \
+  UNPCK_R_SH_SW(in1, in1_r_m);                                    \
+  UNPCK_R_SH_SW(in2, in2_r_m);                                    \
+  UNPCK_R_SH_SW(in3, in3_r_m);                                    \
+  /* MUL2: s1 = in0 * sinpi_4_9, s0 = in3 * sinpi_4_9 */          \
+  constant_m = __msa_fill_w(sinpi_4_9);                           \
+  MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m);     \
+                                                                  \
+  constant_m = __msa_fill_w(sinpi_1_9);                           \
+  s0_m += in0_r_m * constant_m;                                   \
+  s1_m -= in1_r_m * constant_m;                                   \
+                                                                  \
+  constant_m = __msa_fill_w(sinpi_2_9);                           \
+  s0_m += in1_r_m * constant_m;                                   \
+  s1_m += in3_r_m * constant_m;                                   \
+                                                                  \
+  s2_m = in0_r_m + in1_r_m - in3_r_m;                             \
+  /* s3 = in2 * sinpi_3_9; in1_r_m becomes s2 * sinpi_3_9 */      \
+  constant_m = __msa_fill_w(sinpi_3_9);                           \
+  MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m);     \
+                                                                  \
+  in0_r_m = s0_m + s3_m;                                          \
+  s2_m = s1_m - s3_m;                                             \
+  s3_m = s1_m - s0_m + s3_m;                                      \
+  /* SRARI by DCT_CONST_BITS, then pack to 16-bit lanes */        \
+  SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS);      \
+  PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m,     \
+              s3_m, s3_m, out0, out1, out2, out3);                \
+}
+#endif  /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */
diff --git a/libs/libvpx/vp10/encoder/mips/msa/temporal_filter_msa.c b/libs/libvpx/vp10/encoder/mips/msa/temporal_filter_msa.c
new file mode 100644
index 0000000000..5d4558b94c
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/mips/msa/temporal_filter_msa.c
@@ -0,0 +1,289 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp10_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+/* Accumulates filter weights into cnt and weight * frm2 pixel into acc. */
+static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
+                                            uint32_t stride,
+                                            uint8_t *frm2_ptr,
+                                            int32_t filt_sth,
+                                            int32_t filt_wgt,
+                                            uint32_t *acc,
+                                            uint16_t *cnt) {
+  uint32_t row;
+  uint64_t f0, f1, f2, f3;
+  v16i8 frm2, frm1 = { 0 };
+  v16i8 frm4, frm3 = { 0 };
+  v16u8 frm_r, frm_l;
+  v8i16 frm2_r, frm2_l;
+  v8i16 diff0, diff1, mod0_h, mod1_h;
+  v4i32 cnst3, cnst16, filt_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+  v4i32 acc0, acc1, acc2, acc3;
+  v8i16 cnt0, cnt1;
+  /* Vector copies of the weight, the shift amount and the constants 3, 16. */
+  filt_wt = __msa_fill_w(filt_wgt);
+  strength = __msa_fill_w(filt_sth);
+  cnst3 = __msa_ldi_w(3);
+  cnst16 = __msa_ldi_w(16);
+  /* Two passes; each advances frm1 by 4 rows and frm2 by 32 bytes. */
+  for (row = 2; row--;) {
+    LD4(frm1_ptr, stride, f0, f1, f2, f3);
+    frm1_ptr += (4 * stride);
+
+    LD_SB2(frm2_ptr, 16, frm2, frm4);
+    frm2_ptr += 32;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+    /* f0/f1 go into frm1 and f2/f3 into frm3 (two 64-bit rows per vector). */
+    INSERT_D2_SB(f0, f1, frm1);
+    INSERT_D2_SB(f2, f3, frm3);
+    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+    /* Lane masks for mod < 16; they zero the lanes where 16 - mod <= 0. */
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+    /* Scale by the filter weight, pack to halfwords and add into cnt. */
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+    /* Multiply the weights by the widened frm2 pixels and add into acc. */
+    UNPCK_UB_SH(frm2, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+    /* Same steps for frm3/frm4 using the next 16 acc and cnt entries. */
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+  }
+}
+/* 16-wide variant: each of the 8 passes handles two 16-pixel frm1 rows. */
+static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr,
+                                             uint32_t stride,
+                                             uint8_t *frm2_ptr,
+                                             int32_t filt_sth,
+                                             int32_t filt_wgt,
+                                             uint32_t *acc,
+                                             uint16_t *cnt) {
+  uint32_t row;
+  v16i8 frm1, frm2, frm3, frm4;
+  v16u8 frm_r, frm_l;
+  v16i8 zero = { 0 };
+  v8u16 frm2_r, frm2_l;
+  v8i16 diff0, diff1, mod0_h, mod1_h;
+  v4i32 cnst3, cnst16, filt_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+  v4i32 acc0, acc1, acc2, acc3;
+  v8i16 cnt0, cnt1;
+
+  filt_wt = __msa_fill_w(filt_wgt);
+  strength = __msa_fill_w(filt_sth);
+  cnst3 = __msa_ldi_w(3);
+  cnst16 = __msa_ldi_w(16);
+  /* frm1_ptr moves by 2 * stride and frm2_ptr by 32 bytes per pass. */
+  for (row = 8; row--;) {
+    LD_SB2(frm1_ptr, stride, frm1, frm3);
+    frm1_ptr += stride;
+
+    LD_SB2(frm2_ptr, 16, frm2, frm4);
+    frm2_ptr += 16;
+    /* acc0/acc1 hold entries 0..7 (acc), acc2/acc3 entries 8..15 (acc + 8). */
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+    /* Lane masks for mod < 16; they zero the lanes where 16 - mod <= 0. */
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+    /* Scale by the filter weight, pack to halfwords and add into cnt. */
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+    /* Multiply the weights by the widened frm2 pixels and add into acc. */
+    ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+    /* Same steps for frm3/frm4 using the next 16 acc and cnt entries. */
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+
+    frm1_ptr += stride;
+    frm2_ptr += 16;
+  }
+}
+/* MSA if blk_w * blk_h is 8 or 16, else C. NOTE(review): product, not dims? */
+void vp10_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
+                                   uint8_t *frame2_ptr, uint32_t blk_w,
+                                   uint32_t blk_h, int32_t strength,
+                                   int32_t filt_wgt, uint32_t *accu,
+                                   uint16_t *cnt) {
+  if (8 == (blk_w * blk_h)) {
+    temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr,
+                                    strength, filt_wgt, accu, cnt);
+  } else if (16 == (blk_w * blk_h)) {
+    temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr,
+                                     strength, filt_wgt, accu, cnt);
+  } else {
+    vp10_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
+                                strength, filt_wgt, accu, cnt);
+  }
+}
diff --git a/libs/libvpx/vp10/encoder/picklpf.c b/libs/libvpx/vp10/encoder/picklpf.c
new file mode 100644
index 0000000000..045e03d1d0
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/picklpf.c
@@ -0,0 +1,193 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp10/common/loopfilter.h"
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/quant_common.h"
+
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/picklpf.h"
+#include "vp10/encoder/quantize.h"
+
+static int get_max_filter_level(const VP10_COMP *cpi) {
+  // Only a second pass with an intra rating above 8 lowers the ceiling.
+  const int two_pass = (cpi->oxcf.pass == 2);
+  if (two_pass && cpi->twopass.section_intra_rating > 8) {
+    return MAX_LOOP_FILTER * 3 / 4;
+  }
+  return MAX_LOOP_FILTER;
+}
+
+
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+                                VP10_COMP *const cpi,
+                                int filt_level, int partial_frame) {
+  VP10_COMMON *const cm = &cpi->common;
+  int64_t sse;
+
+  // Apply the trial level to frame_to_show; use the workers when available.
+  if (cpi->num_workers <= 1) {
+    vp10_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+                          1, partial_frame);
+  } else {
+    vp10_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
+                             filt_level, 1, partial_frame,
+                             cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
+  }
+
+  // Y-plane SSE between sd and the filtered frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+  sse = cm->use_highbitdepth ? vp10_highbd_get_y_sse(sd, cm->frame_to_show)
+                             : vp10_get_y_sse(sd, cm->frame_to_show);
+#else
+  sse = vp10_get_y_sse(sd, cm->frame_to_show);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Copy the saved unfiltered Y plane back before the next trial.
+  vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+
+  return sse;
+}
+// Step search over filter levels using the error from try_filter_frame().
+static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP10_COMP *cpi,
+                               int partial_frame) {
+  const VP10_COMMON *const cm = &cpi->common;
+  const struct loopfilter *const lf = &cm->lf;
+  const int min_filter_level = 0;
+  const int max_filter_level = get_max_filter_level(cpi);
+  int filt_direction = 0;
+  int64_t best_err;
+  int filt_best;
+
+  // Start the search at the previous frame filter level unless it is now out of
+  // range.
+  int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
+  int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+  // Sum squared error cached per filter level (negative = not computed yet)
+  int64_t ss_err[MAX_LOOP_FILTER + 1];
+
+  // 0xFF in every byte makes each int64_t entry read as -1
+  memset(ss_err, 0xFF, sizeof(ss_err));
+
+  // Save the unfiltered recon buffer; try_filter_frame() restores from it.
+  vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+
+  best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame);
+  filt_best = filt_mid;
+  ss_err[filt_mid] = best_err;
+
+  while (filter_step > 0) {
+    const int filt_high = VPXMIN(filt_mid + filter_step, max_filter_level);
+    const int filt_low = VPXMAX(filt_mid - filter_step, min_filter_level);
+
+    // Bias against raising loop filter in favor of lowering it.
+    int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+
+    if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
+      bias = (bias * cpi->twopass.section_intra_rating) / 20;
+
+    // Halve the bias when the transform mode is not ONLY_4X4.
+    if (cm->tx_mode != ONLY_4X4)
+      bias >>= 1;
+
+    if (filt_direction <= 0 && filt_low != filt_mid) {
+      // Get Low filter error score
+      if (ss_err[filt_low] < 0) {
+        ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame);
+      }
+      // If value is close to the best so far then bias towards a lower loop
+      // filter value.
+      if ((ss_err[filt_low] - bias) < best_err) {
+        // Was it actually better than the previous best?
+        if (ss_err[filt_low] < best_err)
+          best_err = ss_err[filt_low];
+
+        filt_best = filt_low;
+      }
+    }
+
+    // Now look at filt_high
+    if (filt_direction >= 0 && filt_high != filt_mid) {
+      if (ss_err[filt_high] < 0) {
+        ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
+      }
+      // Was it better than the previous best?
+      if (ss_err[filt_high] < (best_err - bias)) {
+        best_err = ss_err[filt_high];
+        filt_best = filt_high;
+      }
+    }
+
+    // Halve the step when the best level did not move; otherwise follow it.
+    if (filt_best == filt_mid) {
+      filter_step /= 2;
+      filt_direction = 0;
+    } else {
+      filt_direction = (filt_best < filt_mid) ? -1 : 1;
+      filt_mid = filt_best;
+    }
+  }
+
+  return filt_best;
+}
+
+void vp10_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP10_COMP *cpi,
+                           LPF_PICK_METHOD method) {
+  VP10_COMMON *const cm = &cpi->common;
+  struct loopfilter *const lf = &cm->lf;
+
+  // Key frames force sharpness 0; other frames use the configured value.
+  lf->sharpness_level = (cm->frame_type == KEY_FRAME) ? 0 : cpi->oxcf.sharpness;
+  if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) {
+    // Minimal method with a non-zero level currently set: clear it.
+    lf->filter_level = 0;
+  } else if (method < LPF_PICK_FROM_Q) {
+    // Search-based methods; the sub-image method sets partial_frame.
+    const int partial_frame = (method == LPF_PICK_FROM_SUBIMAGE);
+    lf->filter_level = search_filter_level(sd, cpi, partial_frame);
+  } else {
+    const int level_floor = 0;
+    const int level_ceil = get_max_filter_level(cpi);
+    const int q = vp10_ac_quant(cm->base_qindex, 0, cm->bit_depth);
+    // Fixed-point form of the linear fit filt_guess = q * 0.316206 + 3.87252,
+    // which was determined from the results of the searched level.
+#if CONFIG_VP9_HIGHBITDEPTH
+    int filt_guess;
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+        break;
+      case VPX_BITS_10:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+        break;
+      case VPX_BITS_12:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+        break;
+      default:
+        assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 "
+                    "or VPX_BITS_12");
+        return;
+    }
+#else
+    int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    if (cm->frame_type == KEY_FRAME) filt_guess -= 4;
+    lf->filter_level = clamp(filt_guess, level_floor, level_ceil);
+  }
+}
diff --git a/libs/libvpx/vp10/encoder/picklpf.h b/libs/libvpx/vp10/encoder/picklpf.h
new file mode 100644
index 0000000000..21a8758ef4
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/picklpf.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_ENCODER_PICKLPF_H_
+#define VP10_ENCODER_PICKLPF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp10/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct VP10_COMP;
+
+void vp10_pick_filter_level(const struct yv12_buffer_config *sd,
+                           struct VP10_COMP *cpi, LPF_PICK_METHOD method);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_PICKLPF_H_
diff --git a/libs/libvpx/vp10/encoder/quantize.c b/libs/libvpx/vp10/encoder/quantize.c
new file mode 100644
index 0000000000..86b324f1a1
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/quantize.c
@@ -0,0 +1,389 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp10/common/quant_common.h"
+#include "vp10/common/seg_common.h"
+
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/quantize.h"
+#include "vp10/encoder/rd.h"
+
+void vp10_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                       int skip_block,
+                       const int16_t *zbin_ptr, const int16_t *round_ptr,
+                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                       const int16_t *dequant_ptr,
+                       uint16_t *eob_ptr,
+                       const int16_t *scan, const int16_t *iscan) {
+  int idx, last_nz = -1;
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  // Walk the coefficients in scan order; a skipped block stays all-zero.
+  for (idx = 0; !skip_block && idx < n_coeffs; idx++) {
+    const int rc = scan[idx];
+    const int is_ac = (rc != 0);  // table index: 0 for position 0, else 1
+    const int coeff = coeff_ptr[rc];
+    const int sign = (coeff >> 31);
+    const int magnitude = (coeff ^ sign) - sign;
+    int level;
+
+    // Add the rounding term, clamp to int16 range, multiply and drop 16 bits.
+    level = clamp(magnitude + round_ptr[is_ac], INT16_MIN, INT16_MAX);
+    level = (level * quant_ptr[is_ac]) >> 16;
+
+    // Re-apply the sign, then scale back by the dequantizer.
+    qcoeff_ptr[rc] = (level ^ sign) - sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[is_ac];
+
+    if (level) last_nz = idx;
+  }
+  *eob_ptr = last_nz + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
+                              intptr_t count,
+                              int skip_block,
+                              const int16_t *zbin_ptr,
+                              const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr,
+                              tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr,
+                              uint16_t *eob_ptr,
+                              const int16_t *scan,
+                              const int16_t *iscan) {
+  int idx;
+  int last_nz = -1;
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+  // Walk the coefficients in scan order; a skipped block stays all-zero.
+  for (idx = 0; !skip_block && idx < count; idx++) {
+    const int rc = scan[idx];
+    const int is_ac = (rc != 0);
+    const int coeff = coeff_ptr[rc];
+    const int sign = (coeff >> 31);
+    const int magnitude = (coeff ^ sign) - sign;
+    // The quantizer multiply is carried out on a 64-bit value.
+    const int64_t rounded = magnitude + round_ptr[is_ac];
+    const uint32_t level = (uint32_t)((rounded * quant_ptr[is_ac]) >> 16);
+
+    qcoeff_ptr[rc] = (tran_low_t)((level ^ sign) - sign);
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[is_ac];
+
+    if (level) last_nz = idx;
+  }
+  *eob_ptr = last_nz + 1;
+}
+#endif
+
+// TODO(jingning) Refactor this file and combine functions with similar
+// operations.
+void vp10_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block,
+                             const int16_t *zbin_ptr, const int16_t *round_ptr,
+                             const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr,
+                             uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan) {
+  int idx, last_nz = -1;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  // Walk the coefficients in scan order; a skipped block stays all-zero.
+  for (idx = 0; !skip_block && idx < n_coeffs; idx++) {
+    const int rc = scan[idx];
+    const int is_ac = (rc != 0);
+    const int coeff = coeff_ptr[rc];
+    const int sign = (coeff >> 31);
+    int magnitude = (coeff ^ sign) - sign;
+    int level;
+
+    // Magnitudes below a quarter of the dequant value are left at zero.
+    if (magnitude < (dequant_ptr[is_ac] >> 2)) continue;
+
+    magnitude += ROUND_POWER_OF_TWO(round_ptr[is_ac], 1);
+    magnitude = clamp(magnitude, INT16_MIN, INT16_MAX);
+    level = (magnitude * quant_ptr[is_ac]) >> 15;
+    qcoeff_ptr[rc] = (level ^ sign) - sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[is_ac] / 2;
+
+    if (level) last_nz = idx;
+  }
+  *eob_ptr = last_nz + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
+                                    intptr_t n_coeffs, int skip_block,
+                                    const int16_t *zbin_ptr,
+                                    const int16_t *round_ptr,
+                                    const int16_t *quant_ptr,
+                                    const int16_t *quant_shift_ptr,
+                                    tran_low_t *qcoeff_ptr,
+                                    tran_low_t *dqcoeff_ptr,
+                                    const int16_t *dequant_ptr,
+                                    uint16_t *eob_ptr,
+                                    const int16_t *scan, const int16_t *iscan) {
+  int idx, last_nz = -1;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  // Walk the coefficients in scan order; a skipped block stays all-zero.
+  for (idx = 0; !skip_block && idx < n_coeffs; idx++) {
+    const int rc = scan[idx];
+    const int is_ac = (rc != 0);
+    const int coeff = coeff_ptr[rc];
+    const int sign = (coeff >> 31);
+    const int magnitude = (coeff ^ sign) - sign;
+    int64_t rounded;
+    uint32_t level;
+
+    // Magnitudes below a quarter of the dequant value are left at zero.
+    if (magnitude < (dequant_ptr[is_ac] >> 2)) continue;
+
+    rounded = magnitude + ROUND_POWER_OF_TWO(round_ptr[is_ac], 1);
+    level = (uint32_t)((rounded * quant_ptr[is_ac]) >> 15);
+    qcoeff_ptr[rc] = (tran_low_t)((level ^ sign) - sign);
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[is_ac] / 2;
+
+    if (level) last_nz = idx;
+  }
+  *eob_ptr = last_nz + 1;
+}
+#endif
+// Quantizes the 16 coefficients of one block of `plane` via vpx_quantize_b.
+void vp10_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
+                                const int16_t *scan, const int16_t *iscan) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  // Buffers flagged YV12_FLAG_HIGHBITDEPTH use the highbd call and return.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block),
+                          16, x->skip_block,
+                          p->zbin, p->round, p->quant, p->quant_shift,
+                          BLOCK_OFFSET(p->qcoeff, block),
+                          BLOCK_OFFSET(pd->dqcoeff, block),
+                          pd->dequant, &p->eobs[block],
+                          scan, iscan);
+    return;
+  }
+#endif
+  vpx_quantize_b(BLOCK_OFFSET(p->coeff, block),
+                 16, x->skip_block,
+                 p->zbin, p->round, p->quant, p->quant_shift,
+                 BLOCK_OFFSET(p->qcoeff, block),
+                 BLOCK_OFFSET(pd->dqcoeff, block),
+                 pd->dequant, &p->eobs[block], scan, iscan);
+}
+
+static void invert_quant(int16_t *quant, int16_t *shift, int d) {
+  // log2_d counts how many right shifts bring d down to 1 (or below).
+  unsigned remaining = d;
+  unsigned recip;
+  int log2_d = 0;
+  while (remaining > 1) { remaining >>= 1; ++log2_d; }
+  recip = 1 + (1 << (16 + log2_d)) / d;
+  *quant = (int16_t)(recip - (1 << 16));
+  *shift = 1 << (16 - log2_d);
+}
+
+static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {
+  const int quant = vp10_dc_quant(q, 0, bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+  int threshold;
+  switch (bit_depth) {
+    case VPX_BITS_8: threshold = 148; break;
+    case VPX_BITS_10: threshold = 592; break;
+    case VPX_BITS_12: threshold = 2368; break;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  const int threshold = 148;
+  (void) bit_depth;
+#endif
+  // q == 0 always yields 64; otherwise 84 below the threshold and 80 above.
+  return q == 0 ? 64 : (quant < threshold ? 84 : 80);
+}
+// Fills cpi->quants and the y/uv dequant tables for each q in QINDEX_RANGE.
+void vp10_init_quantizer(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  QUANTS *const quants = &cpi->quants;
+  int i, q, quant;
+
+  for (q = 0; q < QINDEX_RANGE; q++) {
+    const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
+    const int qrounding_factor = q == 0 ? 64 : 48;
+    // i == 0 fills the DC entry, i == 1 the AC entry.
+    for (i = 0; i < 2; ++i) {
+      int qrounding_factor_fp = i == 0 ? 48 : 42;
+      if (q == 0)
+        qrounding_factor_fp = 64;
+
+      // y: DC uses vp10_dc_quant with y_dc_delta_q, AC uses vp10_ac_quant
+      quant = i == 0 ? vp10_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth)
+                     : vp10_ac_quant(q, 0, cm->bit_depth);
+      invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
+      quants->y_quant_fp[q][i] = (1 << 16) / quant;
+      quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
+      quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+      quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
+      cpi->y_dequant[q][i] = quant;
+
+      // uv: same layout, using the uv_dc_delta_q / uv_ac_delta_q offsets
+      quant = i == 0 ? vp10_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth)
+                     : vp10_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth);
+      invert_quant(&quants->uv_quant[q][i],
+                   &quants->uv_quant_shift[q][i], quant);
+      quants->uv_quant_fp[q][i] = (1 << 16) / quant;
+      quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
+      quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+      quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
+      cpi->uv_dequant[q][i] = quant;
+    }
+    // Entries 2..7 of every row replicate the AC entry (index 1).
+    for (i = 2; i < 8; i++) {
+      quants->y_quant[q][i] = quants->y_quant[q][1];
+      quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+      quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
+      quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+      quants->y_zbin[q][i] = quants->y_zbin[q][1];
+      quants->y_round[q][i] = quants->y_round[q][1];
+      cpi->y_dequant[q][i] = cpi->y_dequant[q][1];
+
+      quants->uv_quant[q][i] = quants->uv_quant[q][1];
+      quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
+      quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1];
+      quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
+      quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
+      quants->uv_round[q][i] = quants->uv_round[q][1];
+      cpi->uv_dequant[q][i] = cpi->uv_dequant[q][1];
+    }
+  }
+}
+// Points x's per-plane quantizer tables at the rows for the segment's qindex.
+void vp10_init_plane_quantizers(VP10_COMP *cpi, MACROBLOCK *x) {
+  const VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  QUANTS *const quants = &cpi->quants;
+  const int segment_id = xd->mi[0]->mbmi.segment_id;
+  const int qindex = vp10_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+  const int rdmult = vp10_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
+  int i;
+
+  // Y: plane 0 uses the y_* tables
+  x->plane[0].quant = quants->y_quant[qindex];
+  x->plane[0].quant_fp = quants->y_quant_fp[qindex];
+  x->plane[0].round_fp = quants->y_round_fp[qindex];
+  x->plane[0].quant_shift = quants->y_quant_shift[qindex];
+  x->plane[0].zbin = quants->y_zbin[qindex];
+  x->plane[0].round = quants->y_round[qindex];
+  xd->plane[0].dequant = cpi->y_dequant[qindex];
+  // quant_thred[] holds the squares of zbin[0] and zbin[1].
+  x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
+  x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1];
+
+  // UV: planes 1 and 2 share the uv_* tables
+  for (i = 1; i < 3; i++) {
+    x->plane[i].quant = quants->uv_quant[qindex];
+    x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
+    x->plane[i].round_fp = quants->uv_round_fp[qindex];
+    x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
+    x->plane[i].zbin = quants->uv_zbin[qindex];
+    x->plane[i].round = quants->uv_round[qindex];
+    xd->plane[i].dequant = cpi->uv_dequant[qindex];
+
+    x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
+    x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
+  }
+
+  x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+  x->q_index = qindex;
+  // errorperbit is rdmult >> 6, bumped from 0 to 1 so it is never zero.
+  x->errorperbit = rdmult >> 6;
+  x->errorperbit += (x->errorperbit == 0);
+
+  vp10_initialize_me_consts(cpi, x, x->q_index);
+}
+// Runs vp10_init_plane_quantizers() on the encoder's own macroblock, td.mb.
+void vp10_frame_init_quantizer(VP10_COMP *cpi) {
+  vp10_init_plane_quantizers(cpi, &cpi->td.mb);
+}
+
+void vp10_set_quantizer(VP10_COMMON *cm, int q) {
+  // The quantizer tables must be rebuilt with vp10_init_quantizer() whenever
+  // a delta_q value changes; this call resets all three deltas to zero.
+  cm->uv_ac_delta_q = 0;
+  cm->uv_dc_delta_q = 0;
+  cm->y_dc_delta_q = 0;
+  cm->base_qindex = q;
+}
+
+// Maps the external 0-63 quantizer value to the internal qindex (0-255):
+// multiples of 4 up to 244, then 249 and 255 for the last two entries.
+static const int quantizer_to_qindex[] = {
+  0,    4,   8,  12,  16,  20,  24,  28,
+  32,   36,  40,  44,  48,  52,  56,  60,
+  64,   68,  72,  76,  80,  84,  88,  92,
+  96,  100, 104, 108, 112, 116, 120, 124,
+  128, 132, 136, 140, 144, 148, 152, 156,
+  160, 164, 168, 172, 176, 180, 184, 188,
+  192, 196, 200, 204, 208, 212, 216, 220,
+  224, 228, 232, 236, 240, 244, 249, 255,
+};
+// No range check here: quantizer must be 0..63 (the table has 64 entries).
+int vp10_quantizer_to_qindex(int quantizer) {
+  return quantizer_to_qindex[quantizer];
+}
+
+int vp10_qindex_to_quantizer(int qindex) {
+  // Smallest quantizer whose table entry reaches qindex; 63 if none does.
+  int quantizer = 0;
+
+  while (quantizer < 64 && quantizer_to_qindex[quantizer] < qindex)
+    ++quantizer;
+
+  return quantizer < 64 ? quantizer : 63;
+}
diff --git a/libs/libvpx/vp10/encoder/quantize.h b/libs/libvpx/vp10/encoder/quantize.h
new file mode 100644
index 0000000000..b44088ecc6
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/quantize.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_QUANTIZE_H_
+#define VP10_ENCODER_QUANTIZE_H_
+
+#include "./vpx_config.h"
+#include "vp10/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {  // Quantizer lookup tables, one 8-entry row per qindex.
+  DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
+
+  // TODO(jingning): in progress of re-working the quantization. will decide
+  // if we want to deprecate the current use of y_quant.
+  DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]);
+
+  DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
+} QUANTS;
+
+void vp10_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
+                                const int16_t *scan, const int16_t *iscan);
+
+struct VP10_COMP;
+struct VP10Common;
+// Runs vp10_init_plane_quantizers() on the encoder's own cpi->td.mb.
+void vp10_frame_init_quantizer(struct VP10_COMP *cpi);
+
+void vp10_init_plane_quantizers(struct VP10_COMP *cpi, MACROBLOCK *x);
+
+void vp10_init_quantizer(struct VP10_COMP *cpi);
+// Sets cm->base_qindex to q and zeroes the y_dc/uv_dc/uv_ac delta_q values.
+void vp10_set_quantizer(struct VP10Common *cm, int q);
+// Maps an external quantizer (0..63, not range checked) to a qindex.
+int vp10_quantizer_to_qindex(int quantizer);
+// Returns the smallest quantizer whose qindex is >= qindex (63 at most).
+int vp10_qindex_to_quantizer(int qindex);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_QUANTIZE_H_
diff --git a/libs/libvpx/vp10/encoder/ratectrl.c b/libs/libvpx/vp10/encoder/ratectrl.c
new file mode 100644
index 0000000000..6068775942
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/ratectrl.c
@@ -0,0 +1,1781 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp10/common/alloccommon.h"
+#include "vp10/encoder/aq_cyclicrefresh.h"
+#include "vp10/common/common.h"
+#include "vp10/common/entropymode.h"
+#include "vp10/common/quant_common.h"
+#include "vp10/common/seg_common.h"
+
+#include "vp10/encoder/encodemv.h"
+#include "vp10/encoder/ratectrl.h"
+
+// Max rate target for 1080P and below encodes under normal circumstances
+// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
+
+#define DEFAULT_KF_BOOST 2000
+#define DEFAULT_GF_BOOST 2000
+
+#define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
+
+#define MIN_BPB_FACTOR 0.005
+#define MAX_BPB_FACTOR 50
+
+#define FRAME_OVERHEAD_BITS 200
+
+#if CONFIG_VP9_HIGHBITDEPTH  // name = name_8 / name_10 / name_12 by bit depth
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+  do { \
+    switch (bit_depth) { \
+      case VPX_BITS_8: \
+        name = name##_8; \
+        break; \
+      case VPX_BITS_10: \
+        name = name##_10; \
+        break; \
+      case VPX_BITS_12: \
+        name = name##_12; \
+        break; \
+      default: \
+        assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10" \
+                    " or VPX_BITS_12"); \
+        name = NULL; \
+    } \
+  } while (0)
+#else  // no high bit depth: bit_depth is ignored and name = name_8
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+  do { \
+    (void) bit_depth; \
+    name = name##_8; \
+  } while (0)
+#endif
+
+// Tables relating active max Q to active min Q (see vp10_rc_init_minq_luts)
+static int kf_low_motion_minq_8[QINDEX_RANGE];
+static int kf_high_motion_minq_8[QINDEX_RANGE];
+static int arfgf_low_motion_minq_8[QINDEX_RANGE];
+static int arfgf_high_motion_minq_8[QINDEX_RANGE];
+static int inter_minq_8[QINDEX_RANGE];
+static int rtc_minq_8[QINDEX_RANGE];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static int kf_low_motion_minq_10[QINDEX_RANGE];
+static int kf_high_motion_minq_10[QINDEX_RANGE];
+static int arfgf_low_motion_minq_10[QINDEX_RANGE];
+static int arfgf_high_motion_minq_10[QINDEX_RANGE];
+static int inter_minq_10[QINDEX_RANGE];
+static int rtc_minq_10[QINDEX_RANGE];
+static int kf_low_motion_minq_12[QINDEX_RANGE];
+static int kf_high_motion_minq_12[QINDEX_RANGE];
+static int arfgf_low_motion_minq_12[QINDEX_RANGE];
+static int arfgf_high_motion_minq_12[QINDEX_RANGE];
+static int inter_minq_12[QINDEX_RANGE];
+static int rtc_minq_12[QINDEX_RANGE];
+#endif
+// Boost thresholds passed as the low/high bounds to get_active_quality().
+static int gf_high = 2000;
+static int gf_low = 400;
+static int kf_high = 5000;
+static int kf_low = 400;
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The formulae were derived from computing a 3rd order polynomial best
+// fit to the original data (after plotting real maxq vs minq (not q index))
+static int get_minq_index(double maxq, double x3, double x2, double x1,
+                          vpx_bit_depth_t bit_depth) {
+  // Target min q: the cubic fit evaluated at maxq, capped at maxq itself.
+  const double minqtarget = VPXMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq);
+  int qindex = 0;
+
+  // Special case: a target at or below q 2.0 maps straight to index 0,
+  // covering the step from q2.0 down to lossless mode (q 1.0).
+  if (minqtarget <= 2.0)
+    return 0;
+
+  // Otherwise advance to the first qindex whose real q reaches the target.
+  while (qindex < QINDEX_RANGE &&
+         !(minqtarget <= vp10_convert_qindex_to_q(qindex, bit_depth)))
+    ++qindex;
+  return qindex < QINDEX_RANGE ? qindex : QINDEX_RANGE - 1;
+}
+
+static void init_minq_luts(int *kf_low_m, int *kf_high_m,
+                           int *arfgf_low, int *arfgf_high,
+                           int *inter, int *rtc, vpx_bit_depth_t bit_depth) {
+  int qi = QINDEX_RANGE;  // each table: minq index from a cubic fit of max q
+  while (qi-- > 0) {  // entries are independent, so fill order is free
+    const double q = vp10_convert_qindex_to_q(qi, bit_depth);
+    rtc[qi] = get_minq_index(q, 0.00000271, -0.00113, 0.70, bit_depth);
+    inter[qi] = get_minq_index(q, 0.00000271, -0.00113, 0.90, bit_depth);
+    arfgf_high[qi] = get_minq_index(q, 0.0000021, -0.00125, 0.55, bit_depth);
+    arfgf_low[qi] = get_minq_index(q, 0.0000015, -0.0009, 0.30, bit_depth);
+    kf_high_m[qi] = get_minq_index(q, 0.0000021, -0.00125, 0.55, bit_depth);
+    kf_low_m[qi] = get_minq_index(q, 0.000001, -0.0004, 0.150, bit_depth);
+  }
+}
+
+void vp10_rc_init_minq_luts(void) {  // fills the static minq tables
+  init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
+                 arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
+                 inter_minq_8, rtc_minq_8, VPX_BITS_8);
+#if CONFIG_VP9_HIGHBITDEPTH
+  init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10,
+                 arfgf_low_motion_minq_10, arfgf_high_motion_minq_10,
+                 inter_minq_10, rtc_minq_10, VPX_BITS_10);
+  init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12,
+                 arfgf_low_motion_minq_12, arfgf_high_motion_minq_12,
+                 inter_minq_12, rtc_minq_12, VPX_BITS_12);
+#endif
+}
+
+// These functions use formulaic calculations to make playing with the
+// quantizer tables easier. If necessary they can be replaced by lookup
+// tables if and when things settle down in the experimental bitstream
+double vp10_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) {
+  // Real Q is the vp10_ac_quant() value divided by 4, 16 or 64 (8/10/12-bit),
+  // i.e. scaled down to match old Q values.
+#if CONFIG_VP9_HIGHBITDEPTH
+  const double divisor = bit_depth == VPX_BITS_8 ? 4.0 :
+                         bit_depth == VPX_BITS_10 ? 16.0 :
+                         bit_depth == VPX_BITS_12 ? 64.0 : 0.0;
+
+  // A zero divisor marks a bit depth none of the cases above recognised.
+  if (divisor == 0.0) {
+    assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+    return -1.0;
+  }
+  return vp10_ac_quant(qindex, 0, bit_depth) / divisor;
+#else
+  return vp10_ac_quant(qindex, 0, bit_depth) / 4.0;
+#endif
+}
+
+int vp10_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor,
+                       vpx_bit_depth_t bit_depth) {
+  const double q = vp10_convert_qindex_to_q(qindex, bit_depth);
+  // Baseline enumerator: 2700000 for key frames, 1800000 otherwise.
+  const int base = (frame_type == KEY_FRAME) ? 2700000 : 1800000;
+  // q based adjustment: add (base * q), truncated to int, shifted right 12.
+  const int enumerator = base + ((int)(base * q) >> 12);
+
+  assert(correction_factor <= MAX_BPB_FACTOR &&
+         correction_factor >= MIN_BPB_FACTOR);
+  return (int)(enumerator * correction_factor / q);
+}
+
+int vp10_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
+                           double correction_factor,
+                           vpx_bit_depth_t bit_depth) {
+  // bits-per-MB * mbs, shifted in 64 bits *before* narrowing to int.
+  const uint64_t bits = mbs * (uint64_t)vp10_rc_bits_per_mb(
+      frame_type, q, correction_factor, bit_depth);
+  return VPXMAX(FRAME_OVERHEAD_BITS, (int)(bits >> BPER_MB_NORMBITS));
+}
+
+int vp10_rc_clamp_pframe_target_size(const VP10_COMP *const cpi, int target) {
+  // Raises an inter-frame bit target to the minimum, then clips it to
+  // max_frame_bandwidth and to the optional rc_max_inter_bitrate_pct cap.
+  const RATE_CONTROL *rc = &cpi->rc;
+  const VP10EncoderConfig *oxcf = &cpi->oxcf;
+  // Minimum: the larger of min_frame_bandwidth and avg_frame_bandwidth >> 5.
+  const int min_frame_target = VPXMAX(rc->min_frame_bandwidth,
+                                      rc->avg_frame_bandwidth >> 5);
+  // If there is an active ARF at this location use the minimum bits even for
+  // a constructed arf; the active maximum quantizer still lets it spend more.
+  if (target < min_frame_target ||
+      (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref))
+    target = min_frame_target;
+  // Clip the frame target to the maximum allowed value.
+  if (target > rc->max_frame_bandwidth)
+    target = rc->max_frame_bandwidth;
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    // 64-bit product: avg_frame_bandwidth * pct can overflow 32-bit math.
+    const int64_t max_rate = (int64_t)rc->avg_frame_bandwidth *
+                             oxcf->rc_max_inter_bitrate_pct / 100;
+    target = (int)VPXMIN(target, max_rate);  // never larger than target
+  }
+  return target;
+}
+
+int vp10_rc_clamp_iframe_target_size(const VP10_COMP *const cpi, int target) {
+  // Caps a target at rc_max_intra_bitrate_pct percent of avg_frame_bandwidth
+  // (64-bit product, so it does not wrap in 32 bits), then at the frame max.
+  const RATE_CONTROL *rc = &cpi->rc;
+  const VP10EncoderConfig *oxcf = &cpi->oxcf;
+  if (oxcf->rc_max_intra_bitrate_pct) {
+    const int64_t max_rate = (int64_t)rc->avg_frame_bandwidth *
+                             oxcf->rc_max_intra_bitrate_pct / 100;
+    target = (int)VPXMIN(target, max_rate);
+  }
+  return VPXMIN(target, rc->max_frame_bandwidth);
+}
+
+// Update the buffer level: leaky bucket model.
+static void update_buffer_level(VP10_COMP *cpi, int encoded_frame_size) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int shown = cpi->common.show_frame;
+
+  // Every frame drains the bits it cost. Only a shown frame also adds one
+  // frame of average bandwidth, so non-viewable frames are pure overhead.
+  rc->bits_off_target +=
+      (shown ? rc->avg_frame_bandwidth : 0) - encoded_frame_size;
+
+  // Clip the buffer level to the maximum specified buffer size.
+  if (rc->bits_off_target > rc->maximum_buffer_size)
+    rc->bits_off_target = rc->maximum_buffer_size;
+
+  rc->buffer_level = rc->bits_off_target;
+}
+
+int vp10_rc_get_default_min_gf_interval(
+    int width, int height, double framerate) {
+  // Assume we do not need any constraint lower than 4K 20 fps
+  static const double factor_safe = 3840 * 2160 * 20.0;
+  const double factor = width * height * framerate;
+  // Baseline: one eighth of the frame rate, kept within the GF limits.
+  const int base_interval =
+      clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
+  // Above the safe pixel rate, stretch MIN_GF_INTERVAL proportionally.
+  // Note this logic makes: 4K24: 5, 4K30: 6, 4K60: 12.
+  const int scaled_interval =
+      (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5);
+
+  if (factor <= factor_safe)
+    return base_interval;
+
+  return VPXMAX(base_interval, scaled_interval);
+}
+
+int vp10_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
+  // 0.75 * framerate capped at MAX_GF_INTERVAL, odd values bumped up to even.
+  const int capped = VPXMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
+  return VPXMAX(capped + (capped & 0x01), min_gf_interval);
+}
+
+void vp10_rc_init(const VP10EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
+  const int worst_q = oxcf->worst_allowed_q;
+  const int best_q = oxcf->best_allowed_q;
+  // One-pass CBR seeds the average qindex with the worst allowed q; every
+  // other mode starts midway between the best and worst allowed q.
+  const int one_pass_cbr = (pass == 0 && oxcf->rc_mode == VPX_CBR);
+  const int start_q = one_pass_cbr ? worst_q : (worst_q + best_q) / 2;
+  int level;
+
+  rc->avg_frame_qindex[KEY_FRAME] = start_q;
+  rc->avg_frame_qindex[INTER_FRAME] = start_q;
+  rc->last_q[KEY_FRAME] = best_q;
+  rc->last_q[INTER_FRAME] = worst_q;
+
+  // The buffer model starts from rc->starting_buffer_level.
+  rc->bits_off_target = rc->starting_buffer_level;
+  rc->buffer_level = rc->starting_buffer_level;
+
+  // Rolling bit counters are seeded with one frame of average bandwidth.
+  rc->rolling_target_bits = rc->avg_frame_bandwidth;
+  rc->rolling_actual_bits = rc->avg_frame_bandwidth;
+  rc->long_rolling_target_bits = rc->avg_frame_bandwidth;
+  rc->long_rolling_actual_bits = rc->avg_frame_bandwidth;
+
+  rc->total_actual_bits = 0;
+  rc->total_target_bits = 0;
+  rc->total_target_vs_actual = 0;
+
+  rc->frames_since_key = 8;  // Sensible default for first frame.
+  rc->this_key_frame_forced = 0;
+  rc->next_key_frame_forced = 0;
+  rc->source_alt_ref_pending = 0;
+  rc->source_alt_ref_active = 0;
+
+  rc->frames_till_gf_update_due = 0;
+  rc->ni_av_qi = worst_q;
+  rc->ni_tot_qi = 0;
+  rc->ni_frames = 0;
+
+  rc->tot_q = 0.0;
+  rc->avg_q = vp10_convert_qindex_to_q(worst_q, oxcf->bit_depth);
+
+  for (level = RATE_FACTOR_LEVELS - 1; level >= 0; --level)
+    rc->rate_correction_factors[level] = 1.0;
+
+  // A configured GF interval of 0 means "use the computed default".
+  rc->min_gf_interval = oxcf->min_gf_interval;
+  if (!rc->min_gf_interval)
+    rc->min_gf_interval = vp10_rc_get_default_min_gf_interval(
+        oxcf->width, oxcf->height, oxcf->init_framerate);
+  rc->max_gf_interval = oxcf->max_gf_interval;
+  if (!rc->max_gf_interval)
+    rc->max_gf_interval = vp10_rc_get_default_max_gf_interval(
+        oxcf->init_framerate, rc->min_gf_interval);
+  rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+}
+
+int vp10_rc_drop_frame(VP10_COMP *cpi) {
+  const VP10EncoderConfig *oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int drop_mark;
+
+  // Frame dropping is off when no water mark is configured.
+  if (!oxcf->drop_frames_water_mark)
+    return 0;
+
+  // Always drop if buffer is below 0.
+  if (rc->buffer_level < 0)
+    return 1;
+
+  // If buffer is below drop_mark, for now just drop every other frame
+  // (starting with the next frame) until it increases back over drop_mark.
+  drop_mark = (int)(oxcf->drop_frames_water_mark *
+      rc->optimal_buffer_level / 100);
+  if (rc->buffer_level > drop_mark) {
+    if (rc->decimation_factor > 0)
+      --rc->decimation_factor;
+  } else if (rc->decimation_factor == 0) {
+    rc->decimation_factor = 1;
+  }
+
+  // No decimation in effect: reset the counter and keep the frame.
+  if (rc->decimation_factor <= 0) {
+    rc->decimation_count = 0;
+    return 0;
+  }
+  // Drop while the counter runs down, then keep one frame and re-arm it.
+  if (rc->decimation_count > 0) {
+    --rc->decimation_count;
+    return 1;
+  }
+  rc->decimation_count = rc->decimation_factor;
+  return 0;
+}
+
+static double get_rate_correction_factor(const VP10_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  // Source slot: key frame, two-pass GF-group level, or (otherwise) a GF/ARF
+  // refresh that is not an alt-ref overlay versus a normal inter frame.
+  RATE_FACTOR_LEVEL slot = INTER_NORMAL;
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    slot = KF_STD;
+  } else if (cpi->oxcf.pass == 2) {
+    slot = cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+  } else if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+             !rc->is_src_frame_alt_ref &&
+             (cpi->oxcf.rc_mode != VPX_CBR ||
+              cpi->oxcf.gf_cbr_boost_pct > 20)) {
+    slot = GF_ARF_STD;
+  }
+  // Apply the size-dependent multiplier, then clamp to the legal range.
+  return fclamp(rc->rate_correction_factors[slot] *
+                    rcf_mult[rc->frame_size_selector],
+                MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+}
+
+static void set_rate_correction_factor(VP10_COMP *cpi, double factor) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  // Destination slot: key frame, two-pass GF-group level, or (otherwise) a
+  // GF/ARF refresh that is not an alt-ref overlay versus a normal inter frame.
+  RATE_FACTOR_LEVEL slot = INTER_NORMAL;
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    slot = KF_STD;
+  } else if (cpi->oxcf.pass == 2) {
+    slot = cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+  } else if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+             !rc->is_src_frame_alt_ref &&
+             (cpi->oxcf.rc_mode != VPX_CBR ||
+              cpi->oxcf.gf_cbr_boost_pct > 20)) {
+    slot = GF_ARF_STD;
+  }
+
+  // Normalize RCF to account for the size-dependent scaling factor.
+  factor /= rcf_mult[cpi->rc.frame_size_selector];
+
+  rc->rate_correction_factors[slot] =
+      fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+}
+
+void vp10_rc_update_rate_correction_factors(VP10_COMP *cpi) {
+  const VP10_COMMON *const cm = &cpi->common;
+  int correction_factor = 100;
+  double rate_correction_factor = get_rate_correction_factor(cpi);
+  double adjustment_limit;
+  // Actual-vs-projected frame size, in percent, drives the update below.
+  int projected_size_based_on_q = 0;
+
+  // Do not update the rate factors for arf overlay frames.
+  if (cpi->rc.is_src_frame_alt_ref)
+    return;
+
+  // Clear down mmx registers to allow floating point in what follows
+  vpx_clear_system_state();
+
+  // Work out how big we would have expected the frame to be at this Q given
+  // the current correction factor.
+  // NOTE(review): the estimate is stored in an int, not kept in double.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
+    projected_size_based_on_q =
+        vp10_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+  } else {
+    projected_size_based_on_q = vp10_estimate_bits_at_q(cpi->common.frame_type,
+                                                       cm->base_qindex,
+                                                       cm->MBs,
+                                                       rate_correction_factor,
+                                                       cm->bit_depth);
+  }
+  // Work out a size correction factor: actual size as a % of the projection.
+  if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
+    correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) /
+                        projected_size_based_on_q);
+
+  // Damping: apply 25% of the error when it is small, rising to 75% once the
+  // actual/projected ratio is a factor of 10 or more off (|log10| >= 1).
+  adjustment_limit = 0.25 +
+      0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor)));
+  // Keep the last two base q values and over/undershoot signs as history.
+  cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+  cpi->rc.q_1_frame = cm->base_qindex;
+  cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+  if (correction_factor > 110)
+    cpi->rc.rc_1_frame = -1;
+  else if (correction_factor < 90)
+    cpi->rc.rc_1_frame = 1;
+  else
+    cpi->rc.rc_1_frame = 0;
+
+  if (correction_factor > 102) {
+    // Frame came out larger than projected: raise the factor (damped).
+    correction_factor = (int)(100 + ((correction_factor - 100) *
+                                  adjustment_limit));
+    rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+    // Keep rate_correction_factor within limits
+    if (rate_correction_factor > MAX_BPB_FACTOR)
+      rate_correction_factor = MAX_BPB_FACTOR;
+  } else if (correction_factor < 99) {
+    // Frame came out smaller than projected: lower the factor (damped).
+    correction_factor = (int)(100 - ((100 - correction_factor) *
+                                  adjustment_limit));
+    rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+
+    // Keep rate_correction_factor within limits
+    if (rate_correction_factor < MIN_BPB_FACTOR)
+      rate_correction_factor = MIN_BPB_FACTOR;
+  }
+
+  set_rate_correction_factor(cpi, rate_correction_factor);
+}
+
+
+int vp10_rc_regulate_q(const VP10_COMP *cpi, int target_bits_per_frame,
+                      int active_best_quality, int active_worst_quality) {
+  const VP10_COMMON *const cm = &cpi->common;
+  int q = active_worst_quality;
+  int last_error = INT_MAX;
+  int i, target_bits_per_mb, bits_per_mb_at_this_q;
+  const double correction_factor = get_rate_correction_factor(cpi);
+  // Picks the qindex whose estimated bits per MB best matches the target.
+  // Target bits per MB, scaled up by BPER_MB_NORMBITS; the shift is done in
+  // 64 bits before dividing by the MB count.
+  target_bits_per_mb =
+      ((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs;
+
+  i = active_best_quality;
+  // Scan upward from the best quality until the estimate fits the target.
+  do {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+      bits_per_mb_at_this_q =
+          (int)vp10_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
+    } else {
+      bits_per_mb_at_this_q = (int)vp10_rc_bits_per_mb(cm->frame_type, i,
+                                                      correction_factor,
+                                                      cm->bit_depth);
+    }
+
+    if (bits_per_mb_at_this_q <= target_bits_per_mb) {
+      if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+        q = i;
+      else
+        q = i - 1;
+
+      break;
+    } else {
+      last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+    }
+  } while (++i <= active_worst_quality);
+
+  // In CBR mode, this makes sure q is between oscillating Qs to prevent
+  // resonance.
+  if (cpi->oxcf.rc_mode == VPX_CBR &&
+      (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
+      cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
+    q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
+              VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
+  }
+  return q;
+}
+
+static int get_active_quality(int q, int gfu_boost, int low, int high,
+                              int *low_motion_minq, int *high_motion_minq) {
+  const int lo_q = low_motion_minq[q];
+  const int hi_q = high_motion_minq[q];
+  const int gap = high - low;
+
+  // A boost above `high` selects the low-motion table entry, one below
+  // `low` the high-motion entry.
+  if (gfu_boost > high) return lo_q;
+  if (gfu_boost < low) return hi_q;
+  // Otherwise interpolate between the two entries, with a gap / 2 rounding
+  // term added before the integer division.
+  return lo_q + ((high - gfu_boost) * (hi_q - lo_q) + (gap >> 1)) / gap;
+}
+
+static int get_kf_active_quality(const RATE_CONTROL *const rc, int q,
+                                 vpx_bit_depth_t bit_depth) {
+  int *kf_low_motion_minq;   // local names must match the table families:
+  int *kf_high_motion_minq;  // ASSIGN_MINQ_TABLE pastes a _8/_10/_12 suffix
+  ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
+  ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
+  return get_active_quality(q, rc->kf_boost, kf_low, kf_high,
+                            kf_low_motion_minq, kf_high_motion_minq);
+}
+
+static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
+                                 vpx_bit_depth_t bit_depth) {
+  int *arfgf_low_motion_minq;   // local names must match the table families:
+  int *arfgf_high_motion_minq;  // ASSIGN_MINQ_TABLE pastes a _8/_10/_12 suffix
+  ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
+  ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+  return get_active_quality(q, rc->gfu_boost, gf_low, gf_high,
+                            arfgf_low_motion_minq, arfgf_high_motion_minq);
+}
+
+static int calc_active_worst_quality_one_pass_vbr(const VP10_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const unsigned int curr_frame = cpi->common.current_video_frame;
+  const int last_kf_q = rc->last_q[KEY_FRAME];
+  const int last_inter_q = rc->last_q[INTER_FRAME];
+  // GF/ARF refresh whose source frame is not the alt-ref itself.
+  const int gf_arf_update =
+      !rc->is_src_frame_alt_ref &&
+      (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame);
+  int active_worst_quality;
+
+  // Non-key frames at frame 1 base the limit on the last key-frame q.
+  if (cpi->common.frame_type == KEY_FRAME)
+    active_worst_quality = curr_frame == 0 ? rc->worst_quality : last_kf_q * 2;
+  else if (curr_frame == 1)
+    active_worst_quality = gf_arf_update ? last_kf_q * 5 / 4 : last_kf_q * 2;
+  else
+    active_worst_quality = gf_arf_update ? last_inter_q : last_inter_q * 2;
+  return VPXMIN(active_worst_quality, rc->worst_quality);
+}
+
+// Adjust active_worst_quality level based on buffer level.
+static int calc_active_worst_quality_one_pass_cbr(const VP10_COMP *cpi) {
+  // Adjust active_worst_quality: If buffer is above the optimal/target level,
+  // bring active_worst_quality down depending on fullness of buffer.
+  // If buffer is below the optimal level, let the active_worst_quality go from
+  // ambient Q (at buffer = optimal level) to worst_quality level
+  // (at buffer = critical level).
+  const VP10_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *rc = &cpi->rc;
+  // Buffer level below which we push active_worst to worst_quality.
+  int64_t critical_level = rc->optimal_buffer_level >> 3;
+  int64_t buff_lvl_step = 0;
+  int adjustment = 0;
+  int active_worst_quality;
+  int ambient_qp;
+  if (cm->frame_type == KEY_FRAME)
+    return rc->worst_quality;
+  // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME]
+  // for the first few frames following key frame. These are both initialized
+  // to worst_quality and updated with (3/4, 1/4) average in postencode_update.
+  // So for first few frames following key, the qp of that key frame is weighted
+  // into the active_worst_quality setting.
+  ambient_qp = (cm->current_video_frame < 5) ?
+                   VPXMIN(rc->avg_frame_qindex[INTER_FRAME],
+                          rc->avg_frame_qindex[KEY_FRAME]) :
+                   rc->avg_frame_qindex[INTER_FRAME];
+  active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 / 4);
+  if (rc->buffer_level > rc->optimal_buffer_level) {
+    // Adjust down.
+    // Maximum limit for down adjustment: one third of active_worst_quality.
+    int max_adjustment_down = active_worst_quality / 3;
+    if (max_adjustment_down) {
+      buff_lvl_step = ((rc->maximum_buffer_size -
+                        rc->optimal_buffer_level) / max_adjustment_down);
+      if (buff_lvl_step)
+        adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) /
+                            buff_lvl_step);
+      active_worst_quality -= adjustment;
+    }
+  } else if (rc->buffer_level > critical_level) {
+    // Adjust up from ambient Q.
+    if (critical_level) {  // else the ambient_qp * 5 / 4 based value is kept
+      buff_lvl_step = (rc->optimal_buffer_level - critical_level);
+      if (buff_lvl_step) {
+        adjustment = (int)((rc->worst_quality - ambient_qp) *
+                           (rc->optimal_buffer_level - rc->buffer_level) /
+                           buff_lvl_step);
+      }
+      active_worst_quality = ambient_qp + adjustment;
+    }
+  } else {
+    // Set to worst_quality if buffer is below critical level.
+    active_worst_quality = rc->worst_quality;
+  }
+  return active_worst_quality;
+}
+
+static int rc_pick_q_and_bounds_one_pass_cbr(const VP10_COMP *cpi,
+                                             int *bottom_index,
+                                             int *top_index) {
+  const VP10_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int active_best_quality;
+  int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+  int q;
+  int *rtc_minq;
+  ASSIGN_MINQ_TABLE(cm->bit_depth, rtc_minq);
+  // Pick active_best_quality by frame class: intra, GF/ARF refresh, or other.
+  if (frame_is_intra_only(cm)) {
+    active_best_quality = rc->best_quality;
+    // Handle the special case for key frames forced when we have reached
+    // the maximum key frame interval. Here force the Q to a range
+    // based on the ambient Q to reduce the risk of popping.
+    if (rc->this_key_frame_forced) {
+      int qindex = rc->last_boosted_qindex;
+      double last_boosted_q = vp10_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex = vp10_compute_qdelta(rc, last_boosted_q,
+                                            (last_boosted_q * 0.75),
+                                            cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    } else if (cm->current_video_frame > 0) {
+      // not first frame of one pass and kf_boost is set
+      double q_adj_factor = 1.0;
+      double q_val;
+
+      active_best_quality =
+          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME],
+                                cm->bit_depth);
+
+      // Allow somewhat lower kf minq with small image formats.
+      if ((cm->width * cm->height) <= (352 * 288)) {
+        q_adj_factor -= 0.25;
+      }
+
+      // Convert the adjustment factor to a qindex delta
+      // on active_best_quality.
+      q_val = vp10_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+      active_best_quality += vp10_compute_qdelta(rc, q_val,
+                                                q_val * q_adj_factor,
+                                                cm->bit_depth);
+    }
+  } else if (!rc->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    // Use the lower of active_worst_quality and recent
+    // average Q as basis for GF/ARF best Q limit unless last frame was
+    // a key frame.
+    if (rc->frames_since_key > 1 &&
+        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = rc->avg_frame_qindex[INTER_FRAME];
+    } else {
+      q = active_worst_quality;
+    }
+    active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+  } else {
+    // Use the lower of active_worst_quality and recent/average Q.
+    if (cm->current_video_frame > 1) {
+      if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+        active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]];
+      else
+        active_best_quality = rtc_minq[active_worst_quality];
+    } else {
+      if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality)
+        active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]];
+      else
+        active_best_quality = rtc_minq[active_worst_quality];
+    }
+  }
+
+  // Clip the active best and worst quality values to limits
+  active_best_quality = clamp(active_best_quality,
+                              rc->best_quality, rc->worst_quality);
+  active_worst_quality = clamp(active_worst_quality,
+                               active_best_quality, rc->worst_quality);
+
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+  // Limit Q range for the adaptive loop.
+  if (cm->frame_type == KEY_FRAME &&
+      !rc->this_key_frame_forced  &&
+      !(cm->current_video_frame == 0)) {
+    int qdelta = 0;
+    vpx_clear_system_state();
+    qdelta = vp10_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                        active_worst_quality, 2.0,
+                                        cm->bit_depth);
+    *top_index = active_worst_quality + qdelta;
+    *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
+  }
+#endif
+
+  // Special case code to try and match quality with forced key frames
+  if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) {
+    q = rc->last_boosted_qindex;
+  } else {
+    q = vp10_rc_regulate_q(cpi, rc->this_frame_target,
+                          active_best_quality, active_worst_quality);
+    if (q > *top_index) {
+      // Special case when we are targeting the max allowed rate
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
+        *top_index = q;
+      else
+        q = *top_index;
+    }
+  }
+  assert(*top_index <= rc->worst_quality &&
+         *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+  return q;
+}
+
+static int get_active_cq_level(const RATE_CONTROL *rc,
+                               const VP10EncoderConfig *const oxcf) {
+  // Spend ratio (actual/target bits) below which the CQ level is scaled down.
+  static const double cq_adjust_threshold = 0.1;
+  double spend_ratio;
+  // Only CQ mode with a positive running target adjusts the configured level.
+  if (oxcf->rc_mode != VPX_CQ || rc->total_target_bits <= 0)
+    return oxcf->cq_level;
+  spend_ratio = (double)rc->total_actual_bits / rc->total_target_bits;
+  if (spend_ratio >= cq_adjust_threshold) return oxcf->cq_level;
+  // Scale the level linearly with how far spending is below the threshold.
+  return (int)(oxcf->cq_level * spend_ratio / cq_adjust_threshold);
+}
+
+static int rc_pick_q_and_bounds_one_pass_vbr(const VP10_COMP *cpi,
+                                             int *bottom_index,
+                                             int *top_index) {
+  const VP10_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  const int cq_level = get_active_cq_level(rc, oxcf);
+  int active_best_quality;
+  int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
+  int q;  // Returned frame Q; also scratch for the GF/ARF basis below.
+  int *inter_minq;  // Set by ASSIGN_MINQ_TABLE from cm->bit_depth.
+  ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+
+  if (frame_is_intra_only(cm)) {
+    if (oxcf->rc_mode == VPX_Q) {
+      int qindex = cq_level;  // The double q below shadows the outer int q.
+      double q = vp10_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex = vp10_compute_qdelta(rc, q, q * 0.25,
+                                             cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    } else if (rc->this_key_frame_forced) {
+      int qindex = rc->last_boosted_qindex;
+      double last_boosted_q = vp10_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex = vp10_compute_qdelta(rc, last_boosted_q,
+                                             last_boosted_q * 0.75,
+                                             cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    } else {
+      // Not first frame of one pass and kf_boost is set.
+      double q_adj_factor = 1.0;
+      double q_val;
+
+      active_best_quality =
+          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME],
+                                cm->bit_depth);
+
+      // Allow somewhat lower kf minq with small image formats.
+      if ((cm->width * cm->height) <= (352 * 288)) {
+        q_adj_factor -= 0.25;
+      }
+
+      // Convert the adjustment factor to a qindex delta
+      // on active_best_quality.
+      q_val = vp10_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+      active_best_quality += vp10_compute_qdelta(rc, q_val,
+                                                 q_val * q_adj_factor,
+                                                 cm->bit_depth);
+    }
+  } else if (!rc->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    // Use the lower of active_worst_quality and recent
+    // average Q as basis for GF/ARF best Q limit unless last frame was
+    // a key frame.
+    if (rc->frames_since_key > 1 &&
+        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = rc->avg_frame_qindex[INTER_FRAME];
+    } else {
+      q = rc->avg_frame_qindex[KEY_FRAME];
+    }
+    // For constrained quality don't allow Q less than the cq level.
+    if (oxcf->rc_mode == VPX_CQ) {
+      if (q < cq_level)
+        q = cq_level;
+
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+
+      // Constrained quality use slightly lower active best.
+      active_best_quality = active_best_quality * 15 / 16;
+
+    } else if (oxcf->rc_mode == VPX_Q) {
+      int qindex = cq_level;  // The double q below shadows the outer int q.
+      double q = vp10_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex;
+      if (cpi->refresh_alt_ref_frame)
+        delta_qindex = vp10_compute_qdelta(rc, q, q * 0.40, cm->bit_depth);
+      else
+        delta_qindex = vp10_compute_qdelta(rc, q, q * 0.50, cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    } else {
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+    }
+  } else {
+    if (oxcf->rc_mode == VPX_Q) {
+      int qindex = cq_level;  // The double q below shadows the outer int q.
+      double q = vp10_convert_qindex_to_q(qindex, cm->bit_depth);
+      double delta_rate[FIXED_GF_INTERVAL] =
+          {0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0};
+      int delta_qindex =
+          vp10_compute_qdelta(rc, q,
+                              q * delta_rate[cm->current_video_frame %
+                              FIXED_GF_INTERVAL], cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    } else {
+      // Base best Q on the average Q: inter average after frame 1, else key.
+      if (cm->current_video_frame > 1)
+        active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]];
+      else
+        active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
+      // For the constrained quality mode we don't want
+      // q to fall below the cq level.
+      if ((oxcf->rc_mode == VPX_CQ) &&
+          (active_best_quality < cq_level)) {
+        active_best_quality = cq_level;
+      }
+    }
+  }
+
+  // Clip the active best and worst quality values to limits
+  active_best_quality = clamp(active_best_quality,
+                              rc->best_quality, rc->worst_quality);
+  active_worst_quality = clamp(active_worst_quality,
+                               active_best_quality, rc->worst_quality);
+
+  // Report the Q range to the caller: top is the worst, bottom the best Q.
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+  {
+    int qdelta = 0;
+    vpx_clear_system_state();
+
+    // Limit Q range for the adaptive loop.
+    if (cm->frame_type == KEY_FRAME &&
+        !rc->this_key_frame_forced &&
+        !(cm->current_video_frame == 0)) {
+      qdelta = vp10_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                          active_worst_quality, 2.0,
+                                          cm->bit_depth);
+    } else if (!rc->is_src_frame_alt_ref &&
+               (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+      qdelta = vp10_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                          active_worst_quality, 1.75,
+                                          cm->bit_depth);
+    }
+    // The adjusted top of the range never drops below the bottom.
+    *top_index = active_worst_quality + qdelta;
+    *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
+  }
+#endif
+
+  if (oxcf->rc_mode == VPX_Q) {
+    q = active_best_quality;
+  // Special case code to try and match quality with forced key frames
+  } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) {
+    q = rc->last_boosted_qindex;
+  } else {
+    q = vp10_rc_regulate_q(cpi, rc->this_frame_target,
+                          active_best_quality, active_worst_quality);
+    if (q > *top_index) {
+      // Special case when we are targeting the max allowed rate
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
+        *top_index = q;
+      else
+        q = *top_index;
+    }
+  }
+
+  assert(*top_index <= rc->worst_quality &&
+         *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+  return q;
+}
+
+int vp10_frame_type_qdelta(const VP10_COMP *cpi, int rf_level, int q) {
+  // Per-level rate multiplier and frame type, both indexed by rf_level.
+  static const double level_rate_ratio[RATE_FACTOR_LEVELS] = {
+    1.00,  // INTER_NORMAL
+    1.00,  // INTER_HIGH
+    1.50,  // GF_ARF_LOW
+    1.75,  // GF_ARF_STD
+    2.00,  // KF_STD
+  };
+  static const FRAME_TYPE level_frame_type[RATE_FACTOR_LEVELS] =
+      {INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME};
+  // Qindex delta that scales the bits/MB estimate at q by the level's ratio.
+  return vp10_compute_qdelta_by_rate(&cpi->rc, level_frame_type[rf_level], q,
+                                     level_rate_ratio[rf_level],
+                                     cpi->common.bit_depth);
+}
+
+#define STATIC_MOTION_THRESH 95
+static int rc_pick_q_and_bounds_two_pass(const VP10_COMP *cpi,
+                                         int *bottom_index,
+                                         int *top_index) {
+  const VP10_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  const int cq_level = get_active_cq_level(rc, oxcf);
+  int active_best_quality;
+  int active_worst_quality = cpi->twopass.active_worst_quality;
+  int q;  // Returned frame Q; also scratch for the GF/ARF basis below.
+  int *inter_minq;  // Set by ASSIGN_MINQ_TABLE from cm->bit_depth.
+  ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+
+  if (frame_is_intra_only(cm)) {
+    // Handle the special case for key frames forced when we have reached
+    // the maximum key frame interval. Here force the Q to a range
+    // based on the ambient Q to reduce the risk of popping.
+    if (rc->this_key_frame_forced) {
+      double last_boosted_q;
+      int delta_qindex;
+      int qindex;
+
+      if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+        qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+        active_best_quality = qindex;
+        last_boosted_q = vp10_convert_qindex_to_q(qindex, cm->bit_depth);
+        delta_qindex = vp10_compute_qdelta(rc, last_boosted_q,
+                                              last_boosted_q * 1.25,
+                                              cm->bit_depth);
+        active_worst_quality =
+            VPXMIN(qindex + delta_qindex, active_worst_quality);
+      } else {
+        qindex = rc->last_boosted_qindex;
+        last_boosted_q = vp10_convert_qindex_to_q(qindex, cm->bit_depth);
+        delta_qindex = vp10_compute_qdelta(rc, last_boosted_q,
+                                              last_boosted_q * 0.75,
+                                              cm->bit_depth);
+        active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+      }
+    } else {
+      // Not forced keyframe.
+      double q_adj_factor = 1.0;
+      double q_val;
+      // Baseline value derived from cpi->active_worst_quality and kf boost.
+      active_best_quality = get_kf_active_quality(rc, active_worst_quality,
+                                                  cm->bit_depth);
+
+      // Allow somewhat lower kf minq with small image formats.
+      if ((cm->width * cm->height) <= (352 * 288)) {
+        q_adj_factor -= 0.25;
+      }
+
+      // Make a further adjustment based on the kf zero motion measure.
+      q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+      // Convert the adjustment factor to a qindex delta
+      // on active_best_quality.
+      q_val = vp10_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+      active_best_quality += vp10_compute_qdelta(rc, q_val,
+                                                q_val * q_adj_factor,
+                                                cm->bit_depth);
+    }
+  } else if (!rc->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    // Use the lower of active_worst_quality and recent
+    // average Q as basis for GF/ARF best Q limit unless last frame was
+    // a key frame.
+    if (rc->frames_since_key > 1 &&
+        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = rc->avg_frame_qindex[INTER_FRAME];
+    } else {
+      q = active_worst_quality;
+    }
+    // For constrained quality don't allow Q less than the cq level.
+    if (oxcf->rc_mode == VPX_CQ) {
+      if (q < cq_level)
+        q = cq_level;
+
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+
+      // Constrained quality use slightly lower active best.
+      active_best_quality = active_best_quality * 15 / 16;
+
+    } else if (oxcf->rc_mode == VPX_Q) {
+      if (!cpi->refresh_alt_ref_frame) {
+        active_best_quality = cq_level;
+      } else {
+        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+
+        // Modify best quality for second level arfs. For mode VPX_Q this
+        // becomes the baseline frame q.
+        if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+          active_best_quality = (active_best_quality + cq_level + 1) / 2;
+      }
+    } else {
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+    }
+  } else {
+    if (oxcf->rc_mode == VPX_Q) {
+      active_best_quality = cq_level;
+    } else {
+      active_best_quality = inter_minq[active_worst_quality];
+
+      // For the constrained quality mode we don't want
+      // q to fall below the cq level.
+      if ((oxcf->rc_mode == VPX_CQ) &&
+          (active_best_quality < cq_level)) {
+        active_best_quality = cq_level;
+      }
+    }
+  }
+
+  // Extension to max or min Q if undershoot or overshoot is outside
+  // the permitted range.
+  if ((cpi->oxcf.rc_mode != VPX_Q) &&
+      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+    if (frame_is_intra_only(cm) ||
+        (!rc->is_src_frame_alt_ref &&
+         (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
+      active_best_quality -=
+        (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
+      active_worst_quality += (cpi->twopass.extend_maxq / 2);
+    } else {
+      active_best_quality -=
+        (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
+      active_worst_quality += cpi->twopass.extend_maxq;
+    }
+  }
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+  vpx_clear_system_state();
+  // Static forced key frames Q restrictions dealt with elsewhere.
+  if (!(frame_is_intra_only(cm)) ||
+      !rc->this_key_frame_forced ||
+      (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+    int qdelta = vp10_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
+                                       active_worst_quality);
+    active_worst_quality = VPXMAX(active_worst_quality + qdelta,
+                                  active_best_quality);
+  }
+#endif
+
+  // Modify active_best_quality for downscaled normal frames.
+  if (rc->frame_size_selector != UNSCALED && !frame_is_kf_gf_arf(cpi)) {
+    int qdelta = vp10_compute_qdelta_by_rate(rc, cm->frame_type,
+                                            active_best_quality, 2.0,
+                                            cm->bit_depth);
+    active_best_quality =
+        VPXMAX(active_best_quality + qdelta, rc->best_quality);
+  }
+
+  active_best_quality = clamp(active_best_quality,
+                              rc->best_quality, rc->worst_quality);
+  active_worst_quality = clamp(active_worst_quality,
+                               active_best_quality, rc->worst_quality);
+
+  if (oxcf->rc_mode == VPX_Q) {
+    q = active_best_quality;
+  // Special case code to try and match quality with forced key frames.
+  } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
+    // If static since last kf use better of last boosted and last kf q.
+    if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+      q = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+    } else {
+      q = rc->last_boosted_qindex;
+    }
+  } else {
+    q = vp10_rc_regulate_q(cpi, rc->this_frame_target,
+                          active_best_quality, active_worst_quality);
+    if (q > active_worst_quality) {
+      // Special case when we are targeting the max allowed rate.
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
+        active_worst_quality = q;
+      else
+        q = active_worst_quality;
+    }
+  }
+  // Keep q inside the final range; this also covers the forced key frame path.
+  q = clamp(q, active_best_quality, active_worst_quality);
+
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+
+  assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+  return q;
+}
+
+int vp10_rc_pick_q_and_bounds(const VP10_COMP *cpi,
+                             int *bottom_index, int *top_index) {
+  // Dispatch to the Q picker matching the pass and rate control mode; each
+  // picker writes the Q range through bottom_index / top_index.
+
+  // Any non-zero pass value goes to the two-pass picker.
+  if (cpi->oxcf.pass != 0)
+    return rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index);
+
+  // One pass: CBR has a dedicated picker; all other modes share the VBR one.
+  if (cpi->oxcf.rc_mode == VPX_CBR)
+    return rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index);
+  return rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index);
+}
+
+void vp10_rc_compute_frame_size_bounds(const VP10_COMP *cpi,
+                                      int frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit) {
+  int tolerance;
+  if (cpi->oxcf.rc_mode == VPX_Q) {  // VPX_Q mode: no frame size limits.
+    *frame_under_shoot_limit = 0;
+    *frame_over_shoot_limit  = INT_MAX;
+    return;
+  }
+  // recode_tolerance percent of target; the extra 200 keeps a minimum range.
+  tolerance = (cpi->sf.recode_tolerance * frame_target) / 100;
+  *frame_under_shoot_limit = VPXMAX(frame_target - tolerance - 200, 0);
+  *frame_over_shoot_limit = VPXMIN(frame_target + tolerance + 200,
+                                   cpi->rc.max_frame_bandwidth);
+}
+
+void vp10_rc_set_frame_target(VP10_COMP *cpi, int target) {
+  const VP10_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int downscaling = cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
+                          rc->frame_size_selector != UNSCALED;
+
+  // While dynamically down-scaled, the target is scaled by the multiplier
+  // registered for the current frame size selector.
+  rc->this_frame_target = downscaling
+      ? (int)(target * rate_thresh_mult[rc->frame_size_selector])
+      : target;
+
+  // Target rate per SB64 (including partial SB64s).
+  rc->sb64_target_rate = ((int64_t)rc->this_frame_target * 64 * 64) /
+                             (cm->width * cm->height);
+}
+
+static void update_alt_ref_frame_stats(VP10_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  // Mark the alt ref as done (0 means no further alt refs pending) and flag
+  // the alternate reference frame as active.
+  rc->source_alt_ref_pending = 0;
+  rc->source_alt_ref_active = 1;
+
+  // this frame refreshes means next frames don't unless specified by user
+  rc->frames_since_golden = 0;
+}
+
+static void update_golden_frame_stats(VP10_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int clear_arf_active;
+
+  // An alt ref refresh without a golden refresh leaves every counter alone.
+  if (!cpi->refresh_golden_frame && cpi->refresh_alt_ref_frame)
+    return;
+
+  // Decrement count down till next gf
+  if (rc->frames_till_gf_update_due > 0)
+    rc->frames_till_gf_update_due--;
+
+  // Without a golden refresh only the frames-since-golden count advances.
+  if (!cpi->refresh_golden_frame) {
+    rc->frames_since_golden++;
+    return;
+  }
+
+  // this frame refreshes means next frames don't unless specified by user
+  rc->frames_since_golden = 0;
+
+  // If we are not using alt ref in the up and coming group clear the arf
+  // active flag. In multi arf group case (pass 2), if the index is not 0
+  // then we are overlaying a mid group arf so should not reset the flag.
+  clear_arf_active =
+      !rc->source_alt_ref_pending &&
+      (cpi->oxcf.pass != 2 || cpi->twopass.gf_group.index == 0);
+  if (clear_arf_active)
+    rc->source_alt_ref_active = 0;
+}
+
+void vp10_rc_postencode_update(VP10_COMP *cpi, uint64_t bytes_used) {
+  const VP10_COMMON *const cm = &cpi->common;
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int qindex = cm->base_qindex;
+
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+    vp10_cyclic_refresh_postencode(cpi);
+  }
+
+  // Update rate control heuristics; bytes_used is converted to bits here.
+  rc->projected_frame_size = (int)(bytes_used << 3);
+
+  // Post encode loop adjustment of Q prediction.
+  vp10_rc_update_rate_correction_factors(cpi);
+
+  // Keep a record of last Q and ambient average Q.
+  if (cm->frame_type == KEY_FRAME) {
+    rc->last_q[KEY_FRAME] = qindex;
+    rc->avg_frame_qindex[KEY_FRAME] =
+        ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+  } else {
+    if (!rc->is_src_frame_alt_ref &&
+        !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+      rc->last_q[INTER_FRAME] = qindex;
+      rc->avg_frame_qindex[INTER_FRAME] =
+        ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+      rc->ni_frames++;
+      rc->tot_q += vp10_convert_qindex_to_q(qindex, cm->bit_depth);
+      rc->avg_q = rc->tot_q / rc->ni_frames;
+      // Calculate the average Q for normal inter frames (not key or GFU
+      // frames).
+      rc->ni_tot_qi += qindex;
+      rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+    }
+  }
+
+  // Keep record of last boosted (KF/GF/ARF) Q value.
+  // If the current frame is coded at a lower Q then we also update it.
+  // If all mbs in this group are skipped only update if the Q value is
+  // better than that already stored.
+  // This is used to help set quality in forced key frames to reduce popping
+  if ((qindex < rc->last_boosted_qindex) ||
+      (cm->frame_type == KEY_FRAME) ||
+      (!rc->constrained_gf_group &&
+       (cpi->refresh_alt_ref_frame ||
+        (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+    rc->last_boosted_qindex = qindex;
+  }
+  if (cm->frame_type == KEY_FRAME)
+    rc->last_kf_qindex = qindex;
+
+  update_buffer_level(cpi, rc->projected_frame_size);
+
+  // Rolling monitors of whether we are over or underspending used to help
+  // regulate min and Max Q in two pass.
+  if (cm->frame_type != KEY_FRAME) {
+    rc->rolling_target_bits = ROUND_POWER_OF_TWO(
+        rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+    rc->rolling_actual_bits = ROUND_POWER_OF_TWO(
+        rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+    rc->long_rolling_target_bits = ROUND_POWER_OF_TWO(
+        rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
+    rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO(
+        rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
+  }
+
+  // Running totals: bits actually spent, and target bits (shown frames only).
+  rc->total_actual_bits += rc->projected_frame_size;
+  rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
+
+  rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
+
+  if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
+      (cm->frame_type != KEY_FRAME))
+    // Update the alternate reference frame stats as appropriate.
+    update_alt_ref_frame_stats(cpi);
+  else
+    // Update the Golden frame stats as appropriate.
+    update_golden_frame_stats(cpi);
+
+  // A key frame resets frames_since_key; shown frames advance both counters.
+  if (cm->frame_type == KEY_FRAME)
+    rc->frames_since_key = 0;
+  if (cm->show_frame) {
+    rc->frames_since_key++;
+    rc->frames_to_key--;
+  }
+
+  // Trigger the resizing of the next frame if it is scaled (pass != 0 only).
+  if (oxcf->pass != 0) {
+    cpi->resize_pending =
+        rc->next_frame_size_selector != rc->frame_size_selector;
+    rc->frame_size_selector = rc->next_frame_size_selector;
+  }
+}
+
+void vp10_rc_postencode_update_drop_frame(VP10_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  update_buffer_level(cpi, 0);  // A dropped frame adds zero bits.
+  rc->rc_2_frame = 0;
+  rc->rc_1_frame = 0;
+  rc->frames_since_key++;  // The key frame counters still advance.
+  rc->frames_to_key--;
+}
+
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS   1
+
+static int calc_pframe_target_size_one_pass_vbr(const VP10_COMP *const cpi) {
+  static const int af_ratio = 10;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int64_t target;  // 64-bit: the products below can exceed INT_MAX.
+#if USE_ALTREF_FOR_ONE_PASS
+  // A golden / alt-ref refresh gets af_ratio times the share of other frames.
+  target = (int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval;
+  if (!rc->is_src_frame_alt_ref &&
+      (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))
+    target *= af_ratio;
+  target /= (rc->baseline_gf_interval + af_ratio - 1);
+#else
+  target = rc->avg_frame_bandwidth;
+#endif
+  return vp10_rc_clamp_pframe_target_size(cpi, (int)VPXMIN(target, INT_MAX));
+}
+
+static int calc_iframe_target_size_one_pass_vbr(const VP10_COMP *const cpi) {
+  static const int kf_ratio = 25;
+  // 64-bit product saturated at INT_MAX: bandwidth * kf_ratio can overflow.
+  const int64_t target = (int64_t)cpi->rc.avg_frame_bandwidth * kf_ratio;
+  return vp10_rc_clamp_iframe_target_size(cpi, (int)VPXMIN(target, INT_MAX));
+}
+
+void vp10_rc_get_one_pass_vbr_params(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target;  // Frame bit target handed to vp10_rc_set_frame_target().
+  // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+  if (!cpi->refresh_alt_ref_frame &&
+      (cm->current_video_frame == 0 ||
+       (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+       rc->frames_to_key == 0 ||
+       (cpi->oxcf.auto_key && 0))) {  // "&& 0": this term is always false.
+    cm->frame_type = KEY_FRAME;
+    rc->this_key_frame_forced = cm->current_video_frame != 0 &&
+                                rc->frames_to_key == 0;
+    rc->frames_to_key = cpi->oxcf.key_freq;  // Restart key frame countdown.
+    rc->kf_boost = DEFAULT_KF_BOOST;
+    rc->source_alt_ref_active = 0;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+  if (rc->frames_till_gf_update_due == 0) {  // Start a new golden frame group.
+    rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+    if (rc->frames_till_gf_update_due > rc->frames_to_key) {
+      rc->frames_till_gf_update_due = rc->frames_to_key;
+      rc->constrained_gf_group = 1;
+    } else {
+      rc->constrained_gf_group = 0;
+    }
+    cpi->refresh_golden_frame = 1;
+    rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
+    rc->gfu_boost = DEFAULT_GF_BOOST;
+  }
+  if (cm->frame_type == KEY_FRAME)
+    target = calc_iframe_target_size_one_pass_vbr(cpi);
+  else
+    target = calc_pframe_target_size_one_pass_vbr(cpi);
+  vp10_rc_set_frame_target(cpi, target);
+}
+
+static int calc_pframe_target_size_one_pass_cbr(const VP10_COMP *cpi) {
+  const VP10EncoderConfig *oxcf = &cpi->oxcf;
+  const RATE_CONTROL *rc = &cpi->rc;
+  const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
+  const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
+  int min_frame_target =
+      VPXMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+  int64_t target;  // 64-bit math; saturated to INT_MAX on return.
+
+  if (oxcf->gf_cbr_boost_pct) {
+    const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+    const int frame_pct = cpi->refresh_golden_frame ? af_ratio_pct : 100;
+    // Golden frames get af_ratio_pct percent of the share of other frames.
+    target = (int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval *
+             frame_pct /
+             (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+  } else {
+    target = rc->avg_frame_bandwidth;
+  }
+
+  if (diff > 0) {
+    // Lower the target bandwidth for this frame.
+    const int pct_low = (int)VPXMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+    target -= (target * pct_low) / 200;
+  } else if (diff < 0) {
+    // Increase the target bandwidth for this frame.
+    const int pct_high =
+        (int)VPXMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+    target += (target * pct_high) / 200;
+  }
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    const int64_t max_rate = (int64_t)rc->avg_frame_bandwidth *
+                             oxcf->rc_max_inter_bitrate_pct / 100;
+    target = VPXMIN(target, max_rate);
+  }
+  return (int)VPXMIN(VPXMAX(min_frame_target, target), INT_MAX);
+}
+
+static int calc_iframe_target_size_one_pass_cbr(const VP10_COMP *cpi) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  int64_t target;  // 64-bit; saturated to INT_MAX before it is passed on.
+  if (cpi->common.current_video_frame == 0) {
+    // First frame: spend half of the starting buffer level.
+    target = rc->starting_buffer_level / 2;
+  } else {
+    int kf_boost = 32;
+    double framerate = cpi->framerate;
+    // Boost: at least 32; scaled down if frames_since_key < framerate / 2.
+    kf_boost = VPXMAX(kf_boost, (int)(2 * framerate - 16));
+    if (rc->frames_since_key <  framerate / 2) {
+      kf_boost = (int)(kf_boost * rc->frames_since_key /
+                       (framerate / 2));
+    }
+    target = ((int64_t)(16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+  }
+  return vp10_rc_clamp_iframe_target_size(cpi, (int)VPXMIN(target, INT_MAX));
+}
+
+void vp10_rc_get_one_pass_cbr_params(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target;  // Frame bit target handed to vp10_rc_set_frame_target().
+  // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+  if ((cm->current_video_frame == 0 ||
+      (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+      rc->frames_to_key == 0 ||
+      (cpi->oxcf.auto_key && 0))) {  // "&& 0": this term is always false.
+    cm->frame_type = KEY_FRAME;
+    rc->this_key_frame_forced = cm->current_video_frame != 0 &&
+                                rc->frames_to_key == 0;
+    rc->frames_to_key = cpi->oxcf.key_freq;  // Restart key frame countdown.
+    rc->kf_boost = DEFAULT_KF_BOOST;
+    rc->source_alt_ref_active = 0;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+  if (rc->frames_till_gf_update_due == 0) {  // Start a new golden frame group.
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp10_cyclic_refresh_set_golden_update(cpi);
+    else
+      rc->baseline_gf_interval =
+          (rc->min_gf_interval + rc->max_gf_interval) / 2;
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+    if (rc->frames_till_gf_update_due > rc->frames_to_key)
+      rc->frames_till_gf_update_due = rc->frames_to_key;
+    cpi->refresh_golden_frame = 1;
+    rc->gfu_boost = DEFAULT_GF_BOOST;
+  }
+
+  // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+  // should be done here, before the frame qp is selected.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    vp10_cyclic_refresh_update_parameters(cpi);
+
+  if (cm->frame_type == KEY_FRAME)
+    target = calc_iframe_target_size_one_pass_cbr(cpi);
+  else
+    target = calc_pframe_target_size_one_pass_cbr(cpi);
+
+  vp10_rc_set_frame_target(cpi, target);
+  // Only dynamic resize mode asks the resizer; otherwise nothing is pending.
+  if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC)
+    cpi->resize_pending = vp10_resize_one_pass_cbr(cpi);
+  else
+    cpi->resize_pending = 0;
+}
+
+int vp10_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                       vpx_bit_depth_t bit_depth) {
+  const int last_index = rc->worst_quality - 1;  // Searches stop here.
+  int from_index = rc->best_quality;
+  int to_index = rc->best_quality;
+
+  // With an empty index range both ends resolve to worst_quality: no delta.
+  if (rc->best_quality >= rc->worst_quality)
+    return 0;
+
+  // Lowest index whose q value reaches qstart (last_index if none does).
+  while (from_index < last_index &&
+         !(vp10_convert_qindex_to_q(from_index, bit_depth) >= qstart))
+    ++from_index;
+
+  // Same search for qtarget.
+  while (to_index < last_index &&
+         !(vp10_convert_qindex_to_q(to_index, bit_depth) >= qtarget))
+    ++to_index;
+
+  return to_index - from_index;
+}
+
+int vp10_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+                               int qindex, double rate_target_ratio,
+                               vpx_bit_depth_t bit_depth) {
+  // Projected bits per MB at qindex, then that value scaled by the
+  // requested ratio (truncated to int).
+  const int bits_at_qindex =
+      vp10_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth);
+  const int wanted_bits = (int)(rate_target_ratio * bits_at_qindex);
+  int candidate;
+
+  // Scan upward from best_quality for the first index whose projected bits
+  // per MB do not exceed the wanted value.
+  for (candidate = rc->best_quality; candidate < rc->worst_quality;
+       ++candidate) {
+    if (vp10_rc_bits_per_mb(frame_type, candidate, 1.0, bit_depth) <=
+        wanted_bits) {
+      return candidate - qindex;
+    }
+  }
+
+  // No index qualified: measure the delta against worst_quality.
+  return rc->worst_quality - qindex;
+}
+
+void vp10_rc_set_gf_interval_range(const VP10_COMP *const cpi,
+                                  RATE_CONTROL *const rc) {
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+
+  // One-pass fixed-Q test mode pins every interval to the same value.
+  if (oxcf->pass == 0 && oxcf->rc_mode == VPX_Q) {
+    rc->max_gf_interval = FIXED_GF_INTERVAL;
+    rc->min_gf_interval = FIXED_GF_INTERVAL;
+    rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+    return;
+  }
+
+  // Start from the configured gf/arf interval limits; 0 requests the default.
+  rc->max_gf_interval = oxcf->max_gf_interval;
+  rc->min_gf_interval = oxcf->min_gf_interval;
+  if (rc->min_gf_interval == 0)
+    rc->min_gf_interval = vp10_rc_get_default_min_gf_interval(
+        oxcf->width, oxcf->height, cpi->framerate);
+  if (rc->max_gf_interval == 0)
+    rc->max_gf_interval = vp10_rc_get_default_max_gf_interval(
+        cpi->framerate, rc->min_gf_interval);
+
+  // Extended interval for genuinely static scenes, capped at
+  // lag_in_frames - 1 when alt-ref is enabled.
+  rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
+  if (is_altref_enabled(cpi) &&
+      rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
+    rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+
+  // Cap max at the static-scene ceiling.
+  if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+    rc->max_gf_interval = rc->static_scene_max_gf_interval;
+  // Clamp min to max.
+  rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval);
+}
+
+// Recomputes the avg/min/max frame bandwidth in cpi->rc from the configured
+// target bandwidth and cpi->framerate, then refreshes the gf interval range.
+void vp10_rc_update_framerate(VP10_COMP *cpi) {
+  const VP10_COMMON *const cm = &cpi->common;
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int vbr_max_bits;
+
+  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+  // Scale in 64 bits (as for vbr_max_bits) so the product cannot overflow int.
+  rc->min_frame_bandwidth = (int)(((int64_t)rc->avg_frame_bandwidth *
+                                   oxcf->two_pass_vbrmin_section) / 100);
+  rc->min_frame_bandwidth =
+      VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
+  // A maximum bitrate for a frame is defined. The baseline aligns with HW
+  // implementations that can decode 1080P content at up to MAX_MB_RATE bits
+  // per 16x16 MB (averaged over a frame). The limit is extended if a very
+  // high rate is given on the command line, or if the rate cannot be achieved
+  // because of a user specified max q (e.g. a lossless encode).
+  vbr_max_bits = (int)(((int64_t)rc->avg_frame_bandwidth *
+                     oxcf->two_pass_vbrmax_section) / 100);
+  rc->max_frame_bandwidth =
+      VPXMAX(VPXMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
+  vp10_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR: adjusts *this_frame_target based on error from previous frames.
+static void vbr_rate_correction(VP10_COMP *cpi, int *this_frame_target) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
+  int max_delta;  // Cap on the correction applied in either direction.
+  double position_factor = 1.0;
+
+  // How far through the clip are we.
+  // This number is used to damp the per frame rate correction.
+  // Range 0 - 1.0
+  if (cpi->twopass.total_stats.count) {
+    position_factor = sqrt((double)cpi->common.current_video_frame /
+                           cpi->twopass.total_stats.count);
+  }
+  max_delta = (int)(position_factor *
+                    ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+  // vbr_bits_off_target > 0 means we have extra bits to spend
+  if (vbr_bits_off_target > 0) {
+    *this_frame_target +=
+      (vbr_bits_off_target > max_delta) ? max_delta
+                                        : (int)vbr_bits_off_target;
+  } else {  // No surplus: lower the target, again by at most max_delta.
+    *this_frame_target -=
+      (vbr_bits_off_target < -max_delta) ? max_delta
+                                         : (int)-vbr_bits_off_target;
+  }
+
+  // Fast redistribution of bits arising from massive local undershoot.
+  // Don't do it for kf, arf, gf or overlay frames.
+  if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
+      rc->vbr_bits_off_target_fast) {
+    int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target);
+    int fast_extra_bits;
+    fast_extra_bits = (int)VPXMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits = (int)VPXMIN(
+        fast_extra_bits,
+        VPXMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
+    *this_frame_target += (int)fast_extra_bits;
+    rc->vbr_bits_off_target_fast -= fast_extra_bits;  // These bits are spent.
+  }
+}
+
+void vp10_set_target_rate(VP10_COMP *cpi) {
+  int frame_target = cpi->rc.base_frame_target;
+  const int uses_vbr_correction =
+      cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ;
+  // VBR and CQ modes compensate for prior over or under shoot.
+  if (uses_vbr_correction)
+    vbr_rate_correction(cpi, &frame_target);
+  vp10_rc_set_frame_target(cpi, frame_target);
+}
+
+// Check if we should resize, based on average QP from past x frames.
+// Only allow for resize at most one scale down for now, scaling factor is 2.
+int vp10_resize_one_pass_cbr(VP10_COMP *cpi) {
+  const VP10_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int resize_now = 0;  // 1: scale down, -1: scale back up, 0: no change.
+  cpi->resize_scale_num = 1;
+  cpi->resize_scale_den = 1;
+  // Don't resize on key frame; reset the counters on key frame.
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->resize_avg_qp = 0;
+    cpi->resize_count = 0;
+    return 0;
+  }
+  // Resize based on average buffer underflow and QP over some window.
+  // Ignore samples close to key frame, since QP is usually high after key.
+  if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
+    const int window = (int)(5 * cpi->framerate);  // Frames per decision.
+    cpi->resize_avg_qp += cm->base_qindex;
+    if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
+      ++cpi->resize_buffer_underflow;
+    ++cpi->resize_count;
+    // Check for resize action every "window" frames.
+    if (cpi->resize_count >= window) {
+      int avg_qp = cpi->resize_avg_qp / cpi->resize_count;
+      // Resize down if buffer level has underflowed sufficient amount in past
+      // window, and we are at original resolution.
+      // Resize back up if average QP is low, and we are currently in a resized
+      // down state.
+      if (cpi->resize_state == 0 &&
+          cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) {
+        resize_now = 1;
+        cpi->resize_state = 1;
+      } else if (cpi->resize_state == 1 &&
+                 avg_qp < 40 * cpi->rc.worst_quality / 100) {
+        resize_now = -1;
+        cpi->resize_state = 0;
+      }
+      // Reset for next window measurement.
+      cpi->resize_avg_qp = 0;
+      cpi->resize_count = 0;
+      cpi->resize_buffer_underflow = 0;
+    }
+  }
+  // If decision is to resize, reset some quantities, and check if we should
+  // reduce rate correction factor.
+  if (resize_now != 0) {
+    int target_bits_per_frame;
+    int active_worst_quality;
+    int qindex;
+    int tot_scale_change;
+    // For now, resize is by 1/2 x 1/2.
+    cpi->resize_scale_num = 1;
+    cpi->resize_scale_den = 2;
+    tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) /
+        (cpi->resize_scale_num * cpi->resize_scale_num);  // 4 for 1/2 x 1/2.
+    // Reset buffer level to optimal, update target size.
+    rc->buffer_level = rc->optimal_buffer_level;
+    rc->bits_off_target = rc->optimal_buffer_level;
+    rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi);
+    // Reset cyclic refresh parameters.
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
+      vp10_cyclic_refresh_reset_resize(cpi);
+    // Get the projected qindex, based on the scaled target frame size (scaled
+    // so target_bits_per_mb in vp10_rc_regulate_q will be correct target).
+    target_bits_per_frame = (resize_now == 1) ?
+        rc->this_frame_target * tot_scale_change :
+        rc->this_frame_target / tot_scale_change;
+    active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+    qindex = vp10_rc_regulate_q(cpi,
+                               target_bits_per_frame,
+                               rc->best_quality,
+                               active_worst_quality);
+    // If resize is down, check if projected q index is close to worst_quality,
+    // and if so, reduce the rate correction factor (since likely can afford
+    // lower q for resized frame).
+    if (resize_now == 1 &&
+        qindex > 90 * cpi->rc.worst_quality / 100) {
+      rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+    }
+    // If resize is back up, check if projected q index is too much above the
+    // current base_qindex, and if so, reduce the rate correction factor
+    // (since prefer to keep q for resized frame at least close to previous q).
+    if (resize_now == -1 &&
+       qindex > 130 * cm->base_qindex / 100) {
+      rc->rate_correction_factors[INTER_NORMAL] *= 0.9;
+    }
+  }
+  return resize_now;
+}
diff --git a/libs/libvpx/vp10/encoder/ratectrl.h b/libs/libvpx/vp10/encoder/ratectrl.h
new file mode 100644
index 0000000000..0b9fd456df
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/ratectrl.h
@@ -0,0 +1,262 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_ENCODER_RATECTRL_H_
+#define VP10_ENCODER_RATECTRL_H_
+
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+
+#include "vp10/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS    9
+
+#define MIN_GF_INTERVAL     4
+#define MAX_GF_INTERVAL     16
+#define FIXED_GF_INTERVAL   8    // Used in some testing modes only
+
+typedef enum {  // Indexes RATE_CONTROL's per-level arrays.
+  INTER_NORMAL = 0,
+  INTER_HIGH = 1,
+  GF_ARF_LOW = 2,
+  GF_ARF_STD = 3,
+  KF_STD = 4,
+  RATE_FACTOR_LEVELS = 5  // Count of the levels above; an array dimension.
+} RATE_FACTOR_LEVEL;
+
+// Internal frame scaling level.
+typedef enum {
+  UNSCALED = 0,     // Frame is unscaled.
+  SCALE_STEP1 = 1,  // First-level down-scaling.
+  FRAME_SCALE_STEPS  // Number of levels; sizes the per-level tables below.
+} FRAME_SCALE_LEVEL;
+
+// Frame dimensions multiplier wrt the native frame size, in 1/16ths,
+// specified for the scale-up case.
+// e.g. 24 => 16/24 = 2/3 of native size. The restriction to 1/16th is
+// intended to match the capabilities of the normative scaling filters,
+// giving precedence to the up-scaling accuracy.
+static const int frame_scale_factor[FRAME_SCALE_STEPS] = {16, 24};
+
+// Multiplier of the target rate to be used as threshold for triggering scaling.
+static const double rate_thresh_mult[FRAME_SCALE_STEPS] = {1.0, 2.0};
+
+// Scale dependent Rate Correction Factor multipliers. Compensates for the
+// greater number of bits per pixel generated in down-scaled frames.
+static const double rcf_mult[FRAME_SCALE_STEPS] = {1.0, 2.0};
+
+typedef struct {
+  // Rate targeting variables
+  int base_frame_target;           // A baseline frame target before adjustment
+                                   // for previous under or over shoot.
+  int this_frame_target;           // Actual frame target after rc adjustment.
+  int projected_frame_size;
+  int sb64_target_rate;
+  int last_q[FRAME_TYPES];         // Separate values for Intra/Inter
+  int last_boosted_qindex;         // Last boosted GF/KF/ARF q
+  int last_kf_qindex;              // Q index of the last key frame coded.
+
+  int gfu_boost;
+  int last_boost;
+  int kf_boost;
+
+  double rate_correction_factors[RATE_FACTOR_LEVELS];  // By RATE_FACTOR_LEVEL.
+
+  int frames_since_golden;
+  int frames_till_gf_update_due;
+  int min_gf_interval;
+  int max_gf_interval;
+  int static_scene_max_gf_interval;  // Upper cap applied to max_gf_interval.
+  int baseline_gf_interval;
+  int constrained_gf_group;
+  int frames_to_key;
+  int frames_since_key;
+  int this_key_frame_forced;
+  int next_key_frame_forced;
+  int source_alt_ref_pending;
+  int source_alt_ref_active;
+  int is_src_frame_alt_ref;
+
+  int avg_frame_bandwidth;  // Average frame size target for clip
+  int min_frame_bandwidth;  // Minimum allocation used for any frame
+  int max_frame_bandwidth;  // Maximum burst rate allowed for a frame.
+
+  int ni_av_qi;
+  int ni_tot_qi;
+  int ni_frames;
+  int avg_frame_qindex[FRAME_TYPES];
+  double tot_q;
+  double avg_q;
+
+  int64_t buffer_level;
+  int64_t bits_off_target;
+  int64_t vbr_bits_off_target;  // > 0 means there are extra bits to spend.
+  int64_t vbr_bits_off_target_fast;
+
+  int decimation_factor;
+  int decimation_count;
+
+  int rolling_target_bits;
+  int rolling_actual_bits;
+
+  int long_rolling_target_bits;
+  int long_rolling_actual_bits;
+
+  int rate_error_estimate;
+
+  int64_t total_actual_bits;
+  int64_t total_target_bits;
+  int64_t total_target_vs_actual;
+
+  int worst_quality;  // Upper end of the q index range searched.
+  int best_quality;   // Lower end (search start) of the q index range.
+
+  int64_t starting_buffer_level;
+  int64_t optimal_buffer_level;
+  int64_t maximum_buffer_size;
+
+  // rate control history for last frame(1) and the frame before(2).
+  // -1: undershot
+  //  1: overshoot
+  //  0: not initialized.
+  int rc_1_frame;
+  int rc_2_frame;
+  int q_1_frame;
+  int q_2_frame;
+
+  // Auto frame-scaling variables.
+  FRAME_SCALE_LEVEL frame_size_selector;
+  FRAME_SCALE_LEVEL next_frame_size_selector;
+  int frame_width[FRAME_SCALE_STEPS];
+  int frame_height[FRAME_SCALE_STEPS];
+  int rf_level_maxq[RATE_FACTOR_LEVELS];
+} RATE_CONTROL;
+
+struct VP10_COMP;
+struct VP10EncoderConfig;
+
+void vp10_rc_init(const struct VP10EncoderConfig *oxcf, int pass,
+                 RATE_CONTROL *rc);
+
+int vp10_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+                           double correction_factor,
+                           vpx_bit_depth_t bit_depth);
+
+double vp10_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth);
+
+void vp10_rc_init_minq_luts(void);
+
+int vp10_rc_get_default_min_gf_interval(int width, int height, double framerate);
+// Note vp10_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in (as the argument named min_frame_rate below) to ensure that
+// the max_gf_interval returned is at least as big as that.
+int vp10_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
+
+// Generally at the high level, the following flow is expected
+// to be enforced for rate control:
+// First call per frame, one of:
+//   vp10_rc_get_one_pass_vbr_params()
+//   vp10_rc_get_one_pass_cbr_params()
+//   vp10_rc_get_first_pass_params()
+//   vp10_rc_get_second_pass_params()
+// depending on the usage to set the rate control encode parameters desired.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by one of:
+//   vp10_rc_postencode_update()
+//   vp10_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the vp10_rc_get_..._params() functions and
+// updated during the vp10_rc_postencode_update...() functions.
+// The only exceptions are vp10_rc_drop_frame() and
+// vp10_rc_update_rate_correction_factors() functions.
+
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
+void vp10_rc_get_one_pass_vbr_params(struct VP10_COMP *cpi);
+void vp10_rc_get_one_pass_cbr_params(struct VP10_COMP *cpi);
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void vp10_rc_postencode_update(struct VP10_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void vp10_rc_postencode_update_drop_frame(struct VP10_COMP *cpi);
+
+// Updates rate correction factors
+// Changes only the rate correction factors in the rate control structure.
+void vp10_rc_update_rate_correction_factors(struct VP10_COMP *cpi);
+
+// Decide if we should drop this frame: For 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int vp10_rc_drop_frame(struct VP10_COMP *cpi);
+
+// Computes frame size bounds.
+void vp10_rc_compute_frame_size_bounds(const struct VP10_COMP *cpi,
+                                      int this_frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit);
+
+// Picks q and q bounds given the target for bits
+int vp10_rc_pick_q_and_bounds(const struct VP10_COMP *cpi,
+                             int *bottom_index,
+                             int *top_index);
+
+// Estimates q to achieve a target bits per frame
+int vp10_rc_regulate_q(const struct VP10_COMP *cpi, int target_bits_per_frame,
+                      int active_best_quality, int active_worst_quality);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int vp10_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor, vpx_bit_depth_t bit_depth);
+
+// Clamping utilities for bitrate targets for iframes and pframes.
+int vp10_rc_clamp_iframe_target_size(const struct VP10_COMP *const cpi,
+                                    int target);
+int vp10_rc_clamp_pframe_target_size(const struct VP10_COMP *const cpi,
+                                    int target);
+// Utility to set frame_target into the RATE_CONTROL structure
+// This function is called only from the vp10_rc_get_..._params() functions.
+void vp10_rc_set_frame_target(struct VP10_COMP *cpi, int target);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+int vp10_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                       vpx_bit_depth_t bit_depth);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int vp10_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+                               int qindex, double rate_target_ratio,
+                               vpx_bit_depth_t bit_depth);
+
+int vp10_frame_type_qdelta(const struct VP10_COMP *cpi, int rf_level, int q);
+
+void vp10_rc_update_framerate(struct VP10_COMP *cpi);
+
+void vp10_rc_set_gf_interval_range(const struct VP10_COMP *const cpi,
+                                  RATE_CONTROL *const rc);
+
+void vp10_set_target_rate(struct VP10_COMP *cpi);
+
+int vp10_resize_one_pass_cbr(struct VP10_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_RATECTRL_H_
diff --git a/libs/libvpx/vp10/encoder/rd.c b/libs/libvpx/vp10/encoder/rd.c
new file mode 100644
index 0000000000..f4fdb2417c
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/rd.c
@@ -0,0 +1,673 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vp10_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/bitops.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp10/common/common.h"
+#include "vp10/common/entropy.h"
+#include "vp10/common/entropymode.h"
+#include "vp10/common/mvref_common.h"
+#include "vp10/common/pred_common.h"
+#include "vp10/common/quant_common.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/common/reconintra.h"
+#include "vp10/common/seg_common.h"
+
+#include "vp10/encoder/cost.h"
+#include "vp10/encoder/encodemb.h"
+#include "vp10/encoder/encodemv.h"
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/mcomp.h"
+#include "vp10/encoder/quantize.h"
+#include "vp10/encoder/ratectrl.h"
+#include "vp10/encoder/rd.h"
+#include "vp10/encoder/tokenize.h"
+
+#define RD_THRESH_POW      1.25
+#define RD_MULT_EPB_RATIO  64
+
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+void vp10_rd_cost_reset(RD_COST *rd_cost) {
+  rd_cost->rdcost = INT64_MAX;  // Each field is set to a MAX sentinel.
+  rd_cost->dist = INT64_MAX;
+  rd_cost->rate = INT_MAX;
+}
+
+void vp10_rd_cost_init(RD_COST *rd_cost) {
+  // Clear all three fields.
+  rd_cost->rdcost = rd_cost->dist = 0;
+  rd_cost->rate = 0;
+}
+
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for block size.
+// The factors here are scaled by 4, i.e. << 2 (2 = x0.5, 32 = x8 etc).
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
+  2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32
+};
+
+static void fill_mode_costs(VP10_COMP *cpi) {
+  const FRAME_CONTEXT *const ctx = cpi->common.fc;
+  int m, n;
+
+  // y_mode_costs come from the vp10_kf_y_mode_prob table, not the frame context.
+  for (m = 0; m < INTRA_MODES; ++m)
+    for (n = 0; n < INTRA_MODES; ++n)
+      vp10_cost_tokens(cpi->y_mode_costs[m][n], vp10_kf_y_mode_prob[m][n],
+                       vp10_intra_mode_tree);
+
+  // The remaining costs are derived from the current frame context.
+  vp10_cost_tokens(cpi->mbmode_cost, ctx->y_mode_prob[1], vp10_intra_mode_tree);
+  for (m = 0; m < INTRA_MODES; ++m)
+    vp10_cost_tokens(cpi->intra_uv_mode_cost[m], ctx->uv_mode_prob[m],
+                     vp10_intra_mode_tree);
+
+  for (m = 0; m < SWITCHABLE_FILTER_CONTEXTS; ++m)
+    vp10_cost_tokens(cpi->switchable_interp_costs[m],
+                     ctx->switchable_interp_prob[m], vp10_switchable_interp_tree);
+
+  // Extended transform-type costs: intra tables first, then inter.
+  for (m = TX_4X4; m < EXT_TX_SIZES; ++m)
+    for (n = 0; n < TX_TYPES; ++n)
+      vp10_cost_tokens(cpi->intra_tx_type_costs[m][n],
+                       ctx->intra_ext_tx_prob[m][n], vp10_ext_tx_tree);
+
+  for (m = TX_4X4; m < EXT_TX_SIZES; ++m)
+    vp10_cost_tokens(cpi->inter_tx_type_costs[m], ctx->inter_ext_tx_prob[m],
+                     vp10_ext_tx_tree);
+}
+
+static void fill_token_costs(vp10_coeff_cost *c,
+                             vp10_coeff_probs_model (*p)[PLANE_TYPES]) {
+  int plane, ref, band, ctx;  // Loop indices, named after their bounds.
+  TX_SIZE tx;
+  for (tx = TX_4X4; tx <= TX_32X32; ++tx)
+    for (plane = 0; plane < PLANE_TYPES; ++plane)
+      for (ref = 0; ref < REF_TYPES; ++ref)
+        for (band = 0; band < COEF_BANDS; ++band)
+          for (ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx) {
+            vpx_prob full[ENTROPY_NODES];  // Expanded from the model probs.
+            vp10_model_to_full_probs(p[tx][plane][ref][band][ctx], full);
+            vp10_cost_tokens((int *)c[tx][plane][ref][band][0][ctx], full,
+                             vp10_coef_tree);
+            vp10_cost_tokens_skip((int *)c[tx][plane][ref][band][1][ctx], full,
+                                  vp10_coef_tree);
+            assert(c[tx][plane][ref][band][0][ctx][EOB_TOKEN] ==
+                   c[tx][plane][ref][band][1][ctx][EOB_TOKEN]);
+          }
+}
+
+// Values are now correlated to quantizer.
+static int sad_per_bit16lut_8[QINDEX_RANGE];
+static int sad_per_bit4lut_8[QINDEX_RANGE];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static int sad_per_bit16lut_10[QINDEX_RANGE];
+static int sad_per_bit4lut_10[QINDEX_RANGE];
+static int sad_per_bit16lut_12[QINDEX_RANGE];
+static int sad_per_bit4lut_12[QINDEX_RANGE];
+#endif
+
+static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
+                            vpx_bit_depth_t bit_depth) {
+  int qi = 0;
+  // The sad lut tables use a formulaic calculation for now, which makes it
+  // easier to resolve the impact of experimental quantizer table changes.
+  while (qi < range) {
+    const double real_q = vp10_convert_qindex_to_q(qi, bit_depth);
+    bit16lut[qi] = (int)(0.0418 * real_q + 2.4107);
+    bit4lut[qi] = (int)(0.063 * real_q + 2.742);
+    ++qi;
+  }
+}
+
+void vp10_init_me_luts(void) {  // Fills the static sad_per_bit LUTs.
+  init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
+                  VPX_BITS_8);
+#if CONFIG_VP9_HIGHBITDEPTH
+  init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
+                  VPX_BITS_10);
+  init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
+                  VPX_BITS_12);
+#endif
+}
+
+static const int rd_boost_factor[16] = {
+  64, 32, 32, 32, 24, 16, 12, 12,
+  8, 8, 4, 4, 2, 2, 1, 0
+};
+static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
+  128, 144, 128, 128, 144
+};
+
+int vp10_compute_rd_mult(const VP10_COMP *cpi, int qindex) {
+  const int64_t q = vp10_dc_quant(qindex, 0, cpi->common.bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+  int64_t rdmult = 0;
+  switch (cpi->common.bit_depth) {
+    case VPX_BITS_8:
+      rdmult = 88 * q * q / 24;
+      break;
+    case VPX_BITS_10:
+      rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4);
+      break;
+    case VPX_BITS_12:
+      rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8);
+      break;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  int64_t rdmult = 88 * q * q / 24;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
+    const int boost_index = VPXMIN(15, (cpi->rc.gfu_boost / 100));
+
+    rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;  // 128 == x1.0
+    rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
+  }
+  if (rdmult < 1)  // Never return a multiplier below 1.
+    rdmult = 1;
+  return (int)rdmult;
+}
+
+static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
+  double q;  // vp10_dc_quant() / 4, 16 or 64 for 8, 10 or 12 bit.
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      q = vp10_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
+      break;
+    case VPX_BITS_10:
+      q = vp10_dc_quant(qindex, 0, VPX_BITS_10) / 16.0;
+      break;
+    case VPX_BITS_12:
+      q = vp10_dc_quant(qindex, 0, VPX_BITS_12) / 64.0;
+      break;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  (void) bit_depth;
+  q = vp10_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // TODO(debargha): Adjust the function below.
+  return VPXMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);  // Floored at 8.
+}
+
+void vp10_initialize_me_consts(VP10_COMP *cpi, MACROBLOCK *x, int qindex) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (cpi->common.bit_depth) {  // Pick the LUT pair for this bit depth.
+    case VPX_BITS_8:
+      x->sadperbit16 = sad_per_bit16lut_8[qindex];
+      x->sadperbit4 = sad_per_bit4lut_8[qindex];
+      break;
+    case VPX_BITS_10:
+      x->sadperbit16 = sad_per_bit16lut_10[qindex];
+      x->sadperbit4 = sad_per_bit4lut_10[qindex];
+      break;
+    case VPX_BITS_12:
+      x->sadperbit16 = sad_per_bit16lut_12[qindex];
+      x->sadperbit4 = sad_per_bit4lut_12[qindex];
+      break;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+  }
+#else
+  (void)cpi;  // Only the 8-bit tables exist in this configuration.
+  x->sadperbit16 = sad_per_bit16lut_8[qindex];
+  x->sadperbit4 = sad_per_bit4lut_8[qindex];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static void set_block_thresholds(const VP10_COMMON *cm, RD_OPT *rd) {
+  int i, bsize, segment_id;
+
+  for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
+    const int qindex =
+        clamp(vp10_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
+              cm->y_dc_delta_q, 0, MAXQ);
+    const int q = compute_rd_thresh_factor(qindex, cm->bit_depth);
+
+    for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
+      // Threshold here seems unnecessarily harsh but fine given actual
+      // range of values used for cpi->sf.thresh_mult[].
+      const int t = q * rd_thresh_block_size_factor[bsize];
+      const int thresh_max = INT_MAX / t;  // Smaller multipliers can't overflow.
+
+      if (bsize >= BLOCK_8X8) {  // Per-mode here; sub-8x8 sizes are per-ref.
+        for (i = 0; i < MAX_MODES; ++i)
+          rd->threshes[segment_id][bsize][i] =
+              rd->thresh_mult[i] < thresh_max
+                  ? rd->thresh_mult[i] * t / 4
+                  : INT_MAX;  // Saturate instead of overflowing.
+      } else {
+        for (i = 0; i < MAX_REFS; ++i)
+          rd->threshes[segment_id][bsize][i] =
+              rd->thresh_mult_sub8x8[i] < thresh_max
+                  ? rd->thresh_mult_sub8x8[i] * t / 4
+                  : INT_MAX;
+      }
+    }
+  }
+}
+
+void vp10_initialize_rd_consts(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->td.mb;
+  RD_OPT *const rd = &cpi->rd;
+  int i;
+
+  vpx_clear_system_state();
+
+  rd->RDDIV = RDDIV_BITS;  // In bits (to multiply D by 128).
+  rd->RDMULT = vp10_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
+
+  x->errorperbit = rd->RDMULT / RD_MULT_EPB_RATIO;
+  x->errorperbit += (x->errorperbit == 0);  // Bump a zero value up to 1.
+
+  x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
+                       cm->frame_type != KEY_FRAME) ? 0 : 1;
+
+  set_block_thresholds(cm, rd);
+
+  fill_token_costs(x->token_costs, cm->fc->coef_probs);
+
+  if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
+      cm->frame_type == KEY_FRAME) {
+    for (i = 0; i < PARTITION_CONTEXTS; ++i)
+      vp10_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
+                      vp10_partition_tree);
+  }
+
+  fill_mode_costs(cpi);
+
+  if (!frame_is_intra_only(cm)) {  // MV and inter-mode costs, inter frames only.
+    vp10_build_nmv_cost_table(x->nmvjointcost,
+                             cm->allow_high_precision_mv ? x->nmvcost_hp
+                                                         : x->nmvcost,
+                             &cm->fc->nmvc, cm->allow_high_precision_mv);
+
+    for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+      vp10_cost_tokens((int *)cpi->inter_mode_cost[i],
+                      cm->fc->inter_mode_probs[i], vp10_inter_mode_tree);
+  }
+}
+
+static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
+  // NOTE: The tables below must be of the same size.
+
+  // The functions described below are sampled at the four most significant
+  // bits of x^2 + 8 / 256.
+
+  // Normalized rate:
+  // This table models the rate for a Laplacian source with given variance
+  // when quantized with a uniform quantizer with given stepsize. The
+  // closed form expression is:
+  // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+  // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+  // and H(x) is the binary entropy function.
+  static const int rate_tab_q10[] = {
+    65536,  6086,  5574,  5275,  5063,  4899,  4764,  4651,
+     4553,  4389,  4255,  4142,  4044,  3958,  3881,  3811,
+     3748,  3635,  3538,  3453,  3376,  3307,  3244,  3186,
+     3133,  3037,  2952,  2877,  2809,  2747,  2690,  2638,
+     2589,  2501,  2423,  2353,  2290,  2232,  2179,  2130,
+     2084,  2001,  1928,  1862,  1802,  1748,  1698,  1651,
+     1608,  1530,  1460,  1398,  1342,  1290,  1243,  1199,
+     1159,  1086,  1021,   963,   911,   864,   821,   781,
+      745,   680,   623,   574,   530,   490,   455,   424,
+      395,   345,   304,   269,   239,   213,   190,   171,
+      154,   126,   104,    87,    73,    61,    52,    44,
+       38,    28,    21,    16,    12,    10,     8,     6,
+        5,     3,     2,     1,     1,     1,     0,     0,
+  };
+  // Normalized distortion:
+  // This table models the normalized distortion for a Laplacian source
+  // with given variance when quantized with a uniform quantizer
+  // with given stepsize. The closed form expression is:
+  // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+  // where x = qpstep / sqrt(variance).
+  // Note the actual distortion is Dn * variance.
+  static const int dist_tab_q10[] = {
+       0,     0,     1,     1,     1,     2,     2,     2,
+       3,     3,     4,     5,     5,     6,     7,     7,
+       8,     9,    11,    12,    13,    15,    16,    17,
+      18,    21,    24,    26,    29,    31,    34,    36,
+      39,    44,    49,    54,    59,    64,    69,    73,
+      78,    88,    97,   106,   115,   124,   133,   142,
+     151,   167,   184,   200,   215,   231,   245,   260,
+     274,   301,   327,   351,   375,   397,   418,   439,
+     458,   495,   528,   559,   587,   613,   637,   659,
+     680,   717,   749,   777,   801,   823,   842,   859,
+     874,   899,   919,   936,   949,   960,   969,   977,
+     983,   994,  1001,  1006,  1010,  1013,  1015,  1017,
+    1018,  1020,  1022,  1022,  1023,  1023,  1023,  1024,
+  };
+  static const int xsq_iq_q10[] = {
+         0,      4,      8,     12,     16,     20,     24,     28,
+        32,     40,     48,     56,     64,     72,     80,     88,
+        96,    112,    128,    144,    160,    176,    192,    208,
+       224,    256,    288,    320,    352,    384,    416,    448,
+       480,    544,    608,    672,    736,    800,    864,    928,
+       992,   1120,   1248,   1376,   1504,   1632,   1760,   1888,
+      2016,   2272,   2528,   2784,   3040,   3296,   3552,   3808,
+      4064,   4576,   5088,   5600,   6112,   6624,   7136,   7648,
+      8160,   9184,  10208,  11232,  12256,  13280,  14304,  15328,
+     16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,
+     32736,  36832,  40928,  45024,  49120,  53216,  57312,  61408,
+     65504,  73696,  81888,  90080,  98272, 106464, 114656, 122848,
+    131040, 147424, 163808, 180192, 196576, 212960, 229344, 245728,
+  };
+  const int tmp = (xsq_q10 >> 2) + 8;
+  const int k = get_msb(tmp) - 3;
+  const int xq = (k << 3) + ((tmp >> k) & 0x7);  // Index into the tables.
+  const int one_q10 = 1 << 10;
+  const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
+  const int b_q10 = one_q10 - a_q10;  // a_q10 + b_q10 == one_q10.
+  *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
+  *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
+}
+
+void vp10_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
+                                  unsigned int qstep, int *rate,
+                                  int64_t *dist) {
+  // Laplacian-source model of rate and distortion under a uniform quantizer
+  // of step qstep; closed forms are in Hang and Chen, "Source Model for
+  // transform video coder and its application - Part I: Fundamental
+  // Theory", IEEE Trans. Circ. Sys. for Video Tech., April 1997.
+  static const uint32_t MAX_XSQ_Q10 = 245727;
+  int norm_rate_q10, norm_dist_q10;
+  uint64_t ratio_q10;
+
+  if (var == 0) {  // Zero variance is assigned zero rate and distortion.
+    *rate = 0;
+    *dist = 0;
+    return;
+  }
+  // qstep^2 * 2^n_log2 / var in Q10, rounded to nearest, clamped for lookup.
+  ratio_q10 = (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
+  model_rd_norm((int)VPXMIN(ratio_q10, MAX_XSQ_Q10), &norm_rate_q10,
+                &norm_dist_q10);
+  *rate = ((norm_rate_q10 << n_log2) + 2) >> 2;
+  *dist = (var * (int64_t)norm_dist_q10 + 512) >> 10;
+}
+
+void vp10_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              const struct macroblockd_plane *pd,
+                              ENTROPY_CONTEXT t_above[16],
+                              ENTROPY_CONTEXT t_left[16]) {
+  // TX_4X4 copies the contexts; larger sizes keep one nonzero flag per tx.
+  const BLOCK_SIZE plane_bs = get_plane_block_size(bsize, pd);
+  const int cols = num_4x4_blocks_wide_lookup[plane_bs];
+  const int rows = num_4x4_blocks_high_lookup[plane_bs];
+  const ENTROPY_CONTEXT *const a = pd->above_context;
+  const ENTROPY_CONTEXT *const l = pd->left_context;
+  int k;
+  switch (tx_size) {
+    case TX_4X4:
+      memcpy(t_above, a, sizeof(ENTROPY_CONTEXT) * cols);
+      memcpy(t_left, l, sizeof(ENTROPY_CONTEXT) * rows);
+      break;
+    case TX_8X8:  // NOTE(review): wide loads assume 1-byte ENTROPY_CONTEXT
+      for (k = 0; k < cols; k += 2)
+        t_above[k] = (*(const uint16_t *)(a + k) != 0);
+      for (k = 0; k < rows; k += 2)
+        t_left[k] = (*(const uint16_t *)(l + k) != 0);
+      break;
+    case TX_16X16:
+      for (k = 0; k < cols; k += 4)
+        t_above[k] = (*(const uint32_t *)(a + k) != 0);
+      for (k = 0; k < rows; k += 4)
+        t_left[k] = (*(const uint32_t *)(l + k) != 0);
+      break;
+    case TX_32X32:
+      for (k = 0; k < cols; k += 8)
+        t_above[k] = (*(const uint64_t *)(a + k) != 0);
+      for (k = 0; k < rows; k += 8)
+        t_left[k] = (*(const uint64_t *)(l + k) != 0);
+      break;
+    default:
+      assert(0 && "Invalid transform size.");
+      break;
+  }
+}
+
+void vp10_mv_pred(VP10_COMP *cpi, MACROBLOCK *x,
+                 uint8_t *ref_y_buffer, int ref_y_stride,
+                 int ref_frame, BLOCK_SIZE block_size) {
+  // One extra candidate (x->pred_mv) is evaluated when adaptive motion
+  // search is on and the block is smaller than the maximum partition size.
+  const int cand_count = MAX_MV_REF_CANDIDATES +
+                         (cpi->sf.adaptive_motion_search &&
+                          block_size < x->max_partition_size);
+  const int nearest_is_near =
+      x->mbmi_ext->ref_mvs[ref_frame][0].as_int ==
+          x->mbmi_ext->ref_mvs[ref_frame][1].as_int;
+  uint8_t *const src = x->plane[0].src.buf;
+  int lowest_sad = INT_MAX, lowest_idx = 0;
+  int largest_mv = 0, have_zero = 0;
+  int idx;
+  MV cand[3];
+
+  cand[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv;
+  cand[1] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv;
+  cand[2] = x->pred_mv[ref_frame];
+  assert(cand_count <= (int)(sizeof(cand) / sizeof(cand[0])));
+
+  // Get the sad for each candidate reference mv.
+  for (idx = 0; idx < cand_count; ++idx) {
+    const MV *const mv = &cand[idx];
+    int row_fp, col_fp, at_zero, sad;
+
+    // Skip the second candidate when it duplicates the first.
+    if (idx == 1 && nearest_is_near)
+      continue;
+    row_fp = (mv->row + 3 + (mv->row >= 0)) >> 3;
+    col_fp = (mv->col + 3 + (mv->col >= 0)) >> 3;
+    largest_mv = VPXMAX(largest_mv, VPXMAX(abs(mv->row), abs(mv->col)) >> 3);
+
+    // Evaluate the (0, 0) position at most once.
+    at_zero = (row_fp == 0 && col_fp == 0);
+    if (at_zero && have_zero)
+      continue;
+    have_zero |= at_zero;
+
+    // Find sad for current vector.
+    sad = cpi->fn_ptr[block_size].sdf(
+        src, x->plane[0].src.stride,
+        &ref_y_buffer[ref_y_stride * row_fp + col_fp], ref_y_stride);
+    // Note if it is the best so far.
+    if (sad < lowest_sad) {
+      lowest_sad = sad;
+      lowest_idx = idx;
+    }
+  }
+
+  // Note the index of the mv that worked best in the reference list.
+  x->mv_best_ref_index[ref_frame] = lowest_idx;
+  x->max_mv_context[ref_frame] = largest_mv;
+  x->pred_mv_sad[ref_frame] = lowest_sad;
+}
+
+void vp10_setup_pred_block(const MACROBLOCKD *xd,
+                          struct buf_2d dst[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col,
+                          const struct scale_factors *scale,
+                          const struct scale_factors *scale_uv) {
+  int plane;
+  // Y has its own stride and scale; U and V share uv_stride and scale_uv.
+  dst[0].buf = src->y_buffer;
+  dst[1].buf = src->u_buffer;
+  dst[2].buf = src->v_buffer;
+  dst[0].stride = src->y_stride;
+  dst[1].stride = dst[2].stride = src->uv_stride;
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    struct buf_2d *const out = &dst[plane];
+    setup_pred_plane(out, out->buf, out->stride, mi_row, mi_col,
+                     plane ? scale_uv : scale, xd->plane[plane].subsampling_x,
+                     xd->plane[plane].subsampling_y);
+  }
+}
+
+int vp10_raster_block_offset(BLOCK_SIZE plane_bsize,
+                            int raster_block, int stride) {
+  const int width_log2 = b_width_log2_lookup[plane_bsize];  // units per row
+  const int row = raster_block >> width_log2;
+  const int col = raster_block & ((1 << width_log2) - 1);
+  return 4 * row * stride + 4 * col;
+}
+
+int16_t* vp10_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+                                       int raster_block, int16_t *base) {
+  const int row_len = 4 * num_4x4_blocks_wide_lookup[plane_bsize];  // stride
+  return &base[vp10_raster_block_offset(plane_bsize, raster_block, row_len)];
+}
+
+YV12_BUFFER_CONFIG *vp10_get_scaled_ref_frame(const VP10_COMP *cpi,
+                                             int ref_frame) {
+  // A scaled buffer is reported only when its index is valid and differs
+  // from the index of the reference's regular buffer; otherwise NULL.
+  const int scaled = cpi->scaled_ref_idx[ref_frame - 1];
+  const int regular = get_ref_frame_buf_idx(cpi, ref_frame);
+  if (scaled == INVALID_IDX || scaled == regular) return NULL;
+  return &cpi->common.buffer_pool->frame_bufs[scaled].buf;
+}
+
+int vp10_get_switchable_rate(const VP10_COMP *cpi,
+                             const MACROBLOCKD *const xd) {
+  const int ctx = vp10_get_pred_context_switchable_interp(xd);  // cost row
+  const int filter = xd->mi[0]->mbmi.interp_filter;
+  return SWITCHABLE_INTERP_RATE_FACTOR *
+             cpi->switchable_interp_costs[ctx][filter];
+}
+
+void vp10_set_rd_speed_thresholds(VP10_COMP *cpi) {
+  // Fills cpi->rd.thresh_mult[]: every mode starts from a common baseline
+  // and then receives either a fixed value or an offset on top of it.
+  int *const mult = cpi->rd.thresh_mult;
+  const int baseline = cpi->oxcf.mode == BEST ? -500 : 0;
+  const int nearest = cpi->sf.adaptive_rd_thresh ? 300 : 0;
+  int mode;
+
+  // Set baseline threshold values.
+  for (mode = 0; mode < MAX_MODES; ++mode)
+    mult[mode] = baseline;
+
+  // Single-reference NEAREST modes get an absolute value, not an offset.
+  mult[THR_NEARESTMV] = nearest;
+  mult[THR_NEARESTG] = nearest;
+  mult[THR_NEARESTA] = nearest;
+
+  // Every other mode is offset from the baseline.
+  mult[THR_DC] += 1000;
+
+  mult[THR_NEWMV] += 1000;
+  mult[THR_NEWA] += 1000;
+  mult[THR_NEWG] += 1000;
+
+  mult[THR_NEARMV] += 1000;
+  mult[THR_NEARA] += 1000;
+  mult[THR_COMP_NEARESTLA] += 1000;
+  mult[THR_COMP_NEARESTGA] += 1000;
+
+  mult[THR_TM] += 1000;
+
+  mult[THR_COMP_NEARLA] += 1500;
+  mult[THR_COMP_NEWLA] += 2000;
+  mult[THR_NEARG] += 1000;
+  mult[THR_COMP_NEARGA] += 1500;
+  mult[THR_COMP_NEWGA] += 2000;
+
+  mult[THR_ZEROMV] += 2000;
+  mult[THR_ZEROG] += 2000;
+  mult[THR_ZEROA] += 2000;
+  mult[THR_COMP_ZEROLA] += 2500;
+  mult[THR_COMP_ZEROGA] += 2500;
+
+  // Intra modes other than DC and TM.
+  mult[THR_H_PRED] += 2000;
+  mult[THR_V_PRED] += 2000;
+  mult[THR_D45_PRED] += 2500;
+  mult[THR_D135_PRED] += 2500;
+  mult[THR_D117_PRED] += 2500;
+  mult[THR_D153_PRED] += 2500;
+  mult[THR_D207_PRED] += 2500;
+  mult[THR_D63_PRED] += 2500;
+}
+
+void vp10_set_rd_speed_thresholds_sub8x8(VP10_COMP *cpi) {
+  static const int kSub8x8Mult[2][MAX_REFS] = {  // [mode == BEST][ref]
+    { 2500, 2500, 2500, 4500, 4500, 2500 },
+    { 2000, 2000, 2000, 4000, 4000, 2000 },
+  };
+  const int *const row = kSub8x8Mult[cpi->oxcf.mode == BEST];
+  memcpy(cpi->rd.thresh_mult_sub8x8, row, sizeof(kSub8x8Mult[0]));
+}
+
+void vp10_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
+                               int bsize, int best_mode_index) {
+  // Updates the factors of block sizes bsize - 1 .. bsize + 2 (clamped to
+  // BLOCK_4X4 .. BLOCK_64X64): the best mode decays, all others grow.
+  const int mode_count = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
+  const BLOCK_SIZE first_bs = VPXMAX(bsize - 1, BLOCK_4X4);
+  const BLOCK_SIZE last_bs = VPXMIN(bsize + 2, BLOCK_64X64);
+  int mode;
+  if (rd_thresh <= 0) return;
+  for (mode = 0; mode < mode_count; ++mode) {
+    BLOCK_SIZE bs;
+    for (bs = first_bs; bs <= last_bs; ++bs) {
+      int *const fact = &factor_buf[bs][mode];
+      if (mode != best_mode_index)
+        *fact = VPXMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT);
+      else
+        *fact -= (*fact >> 4);
+    }
+  }
+}
+
+int vp10_get_intra_cost_penalty(int qindex, int qdelta,
+                               vpx_bit_depth_t bit_depth) {
+  // The penalty scales the dc quantizer q: 20 * q for 8-bit (and whenever
+  // high bit depth is compiled out), smaller multiples for 10 and 12 bits.
+  const int dc_q = vp10_dc_quant(qindex, qdelta, bit_depth);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (bit_depth == VPX_BITS_10)
+    return 5 * dc_q;
+  if (bit_depth == VPX_BITS_12)
+    return ROUND_POWER_OF_TWO(5 * dc_q, 2);
+  if (bit_depth != VPX_BITS_8) {
+    assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+    return -1;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  return 20 * dc_q;
+}
+
diff --git a/libs/libvpx/vp10/encoder/rd.h b/libs/libvpx/vp10/encoder/rd.h
new file mode 100644
index 0000000000..cd58bf84f2
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/rd.h
@@ -0,0 +1,189 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_RD_H_
+#define VP10_ENCODER_RD_H_
+
+#include <limits.h>
+
+#include "vp10/common/blockd.h"
+
+#include "vp10/encoder/block.h"
+#include "vp10/encoder/context_tree.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDDIV_BITS          7
+// Rate R weighted by RM / 256 (rounded) plus distortion D shifted by DM.
+#define RDCOST(RM, DM, R, D) \
+  (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((D) << (DM)))
+#define QIDX_SKIP_THRESH     115
+
+#define MV_COST_WEIGHT      108
+#define MV_COST_WEIGHT_SUB  120
+
+#define INVALID_MV 0x80008000
+
+#define MAX_MODES 30
+#define MAX_REFS  6
+
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC      1
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION vp10_mode_order[MAX_MODES] used in the rd code.
+typedef enum {
+  THR_NEARESTMV,  // 0: NEARESTMV, LAST_FRAME
+  THR_NEARESTA,   // 1: NEARESTMV, ALTREF_FRAME
+  THR_NEARESTG,   // 2: NEARESTMV, GOLDEN_FRAME
+
+  THR_DC,  // 3: DC_PRED, INTRA_FRAME
+
+  THR_NEWMV,  // 4: NEWMV, LAST_FRAME
+  THR_NEWA,   // 5: NEWMV, ALTREF_FRAME
+  THR_NEWG,   // 6: NEWMV, GOLDEN_FRAME
+
+  THR_NEARMV,  // 7: NEARMV, LAST_FRAME
+  THR_NEARA,   // 8: NEARMV, ALTREF_FRAME
+  THR_NEARG,   // 9: NEARMV, GOLDEN_FRAME
+
+  THR_ZEROMV,  // 10: ZEROMV, LAST_FRAME
+  THR_ZEROG,   // 11: ZEROMV, GOLDEN_FRAME
+  THR_ZEROA,   // 12: ZEROMV, ALTREF_FRAME
+
+  THR_COMP_NEARESTLA,  // 13: NEARESTMV, LAST_FRAME + ALTREF_FRAME
+  THR_COMP_NEARESTGA,  // 14: NEARESTMV, GOLDEN_FRAME + ALTREF_FRAME
+
+  THR_TM,  // 15: TM_PRED, INTRA_FRAME
+
+  THR_COMP_NEARLA,  // 16: NEARMV, LAST_FRAME + ALTREF_FRAME
+  THR_COMP_NEWLA,   // 17: NEWMV, LAST_FRAME + ALTREF_FRAME
+  THR_COMP_NEARGA,  // 18: NEARMV, GOLDEN_FRAME + ALTREF_FRAME
+  THR_COMP_NEWGA,   // 19: NEWMV, GOLDEN_FRAME + ALTREF_FRAME
+
+  THR_COMP_ZEROLA,  // 20: ZEROMV, LAST_FRAME + ALTREF_FRAME
+  THR_COMP_ZEROGA,  // 21: ZEROMV, GOLDEN_FRAME + ALTREF_FRAME
+
+  THR_H_PRED,     // 22: H_PRED, INTRA_FRAME
+  THR_V_PRED,     // 23: V_PRED, INTRA_FRAME
+  THR_D135_PRED,  // 24: D135_PRED, INTRA_FRAME
+  THR_D207_PRED,  // 25: D207_PRED, INTRA_FRAME
+  THR_D153_PRED,  // 26: D153_PRED, INTRA_FRAME
+  THR_D63_PRED,   // 27: D63_PRED, INTRA_FRAME
+  THR_D117_PRED,  // 28: D117_PRED, INTRA_FRAME
+  THR_D45_PRED,   // 29: D45_PRED, INTRA_FRAME
+} THR_MODES;
+
+typedef enum {  // NOTE(review): appears to parallel vp10_ref_order - confirm
+  THR_LAST,
+  THR_GOLD,
+  THR_ALTR,
+  THR_COMP_LA,
+  THR_COMP_GA,
+  THR_INTRA,
+} THR_MODES_SUB8X8;
+
+typedef struct RD_OPT {
+  // Thresh_mult is used to set a threshold for the rd score. A higher value
+  // means that we will accept the best mode so far more often. This number
+  // is used in combination with the current block size, and thresh_freq_fact
+  // to pick a threshold.
+  int thresh_mult[MAX_MODES];  // set by vp10_set_rd_speed_thresholds()
+  int thresh_mult_sub8x8[MAX_REFS];  // vp10_set_rd_speed_thresholds_sub8x8()
+
+  int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
+
+  int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
+
+  int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
+
+  int RDMULT;
+  int RDDIV;
+} RD_OPT;
+
+typedef struct RD_COST {  // reset/initialized by the two functions below
+  int rate;
+  int64_t dist;
+  int64_t rdcost;  // NOTE(review): presumably combines rate and dist - confirm
+} RD_COST;
+
+// Reset the rate distortion cost values to maximum (invalid) value.
+void vp10_rd_cost_reset(RD_COST *rd_cost);
+// Initialize the rate distortion cost values to zero.
+void vp10_rd_cost_init(RD_COST *rd_cost);
+
+struct TileInfo;
+struct TileDataEnc;
+struct VP10_COMP;
+struct macroblock;
+
+int vp10_compute_rd_mult(const struct VP10_COMP *cpi, int qindex);
+
+void vp10_initialize_rd_consts(struct VP10_COMP *cpi);
+
+void vp10_initialize_me_consts(struct VP10_COMP *cpi,
+                               MACROBLOCK *x, int qindex);
+
+void vp10_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
+                                  unsigned int qstep, int *rate,
+                                  int64_t *dist);  // n_log2 = log2(#pixels)
+
+int vp10_get_switchable_rate(const struct VP10_COMP *cpi,
+                            const MACROBLOCKD *const xd);
+
+int vp10_raster_block_offset(BLOCK_SIZE plane_bsize,
+                            int raster_block, int stride);
+
+int16_t* vp10_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+                                       int raster_block, int16_t *base);
+
+YV12_BUFFER_CONFIG *vp10_get_scaled_ref_frame(const struct VP10_COMP *cpi,
+                                             int ref_frame);
+
+void vp10_init_me_luts(void);
+
+void vp10_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              const struct macroblockd_plane *pd,
+                              ENTROPY_CONTEXT t_above[16],
+                              ENTROPY_CONTEXT t_left[16]);
+
+void vp10_set_rd_speed_thresholds(struct VP10_COMP *cpi);
+
+void vp10_set_rd_speed_thresholds_sub8x8(struct VP10_COMP *cpi);
+
+void vp10_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh,
+                               int bsize, int best_mode_index);
+// True when best_rd is below thresh * thresh_fact / 32 or thresh is INT_MAX.
+static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
+                                      int thresh_fact) {
+  return thresh == INT_MAX || best_rd < (((int64_t)thresh * thresh_fact) >> 5);
+}
+
+void vp10_mv_pred(struct VP10_COMP *cpi, MACROBLOCK *x,
+                 uint8_t *ref_y_buffer, int ref_y_stride,
+                 int ref_frame, BLOCK_SIZE block_size);
+
+void vp10_setup_pred_block(const MACROBLOCKD *xd,
+                          struct buf_2d dst[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col,
+                          const struct scale_factors *scale,
+                          const struct scale_factors *scale_uv);
+
+int vp10_get_intra_cost_penalty(int qindex, int qdelta,
+                               vpx_bit_depth_t bit_depth);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_RD_H_
diff --git a/libs/libvpx/vp10/encoder/rdopt.c b/libs/libvpx/vp10/encoder/rdopt.c
new file mode 100644
index 0000000000..c62da964aa
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/rdopt.c
@@ -0,0 +1,4310 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp10/common/common.h"
+#include "vp10/common/entropy.h"
+#include "vp10/common/entropymode.h"
+#include "vp10/common/idct.h"
+#include "vp10/common/mvref_common.h"
+#include "vp10/common/pred_common.h"
+#include "vp10/common/quant_common.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/common/reconintra.h"
+#include "vp10/common/scan.h"
+#include "vp10/common/seg_common.h"
+
+#include "vp10/encoder/cost.h"
+#include "vp10/encoder/encodemb.h"
+#include "vp10/encoder/encodemv.h"
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/mcomp.h"
+#include "vp10/encoder/quantize.h"
+#include "vp10/encoder/ratectrl.h"
+#include "vp10/encoder/rd.h"
+#include "vp10/encoder/rdopt.h"
+#include "vp10/encoder/aq_variance.h"
+
+#define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << INTRA_FRAME))
+#define GOLDEN_FRAME_MODE_MASK  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << INTRA_FRAME))
+#define ALT_REF_MODE_MASK       ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
+                                 (1 << INTRA_FRAME))
+
+#define SECOND_REF_FRAME_MASK   ((1 << ALTREF_FRAME) | 0x01)
+
+#define MIN_EARLY_TERM_INDEX    3
+#define NEW_MV_DISCOUNT_FACTOR  8
+
+const double ext_tx_th = 0.99;
+
+typedef struct {  // element type of vp10_mode_order[]
+  PREDICTION_MODE mode;
+  MV_REFERENCE_FRAME ref_frame[2];  // [1] is NONE in single-reference entries
+} MODE_DEFINITION;
+
+typedef struct {  // element type of vp10_ref_order[]
+  MV_REFERENCE_FRAME ref_frame[2];  // [1] is NONE in single-reference entries
+} REF_DEFINITION;
+
+struct rdcost_block_args {  // state handed to block_rd_txfm() via void *arg
+  MACROBLOCK *x;
+  ENTROPY_CONTEXT t_above[16];  // rate_block() offsets this by blk_col
+  ENTROPY_CONTEXT t_left[16];   // rate_block() offsets this by blk_row
+  int this_rate;
+  int64_t this_dist;
+  int64_t this_sse;
+  int64_t this_rd;
+  int64_t best_rd;
+  int exit_early;  // when set, block_rd_txfm() returns immediately
+  int use_fast_coef_costing;  // forwarded to cost_coeffs()
+  const scan_order *so;  // its scan and neighbors go to cost_coeffs()
+  uint8_t skippable;
+};
+
+#define LAST_NEW_MV_INDEX 6  // index of {NEWMV, {GOLDEN_FRAME, NONE}} below
+static const MODE_DEFINITION vp10_mode_order[MAX_MODES] = {
+  {NEARESTMV, {LAST_FRAME,   NONE}},  // THR_NEARESTMV
+  {NEARESTMV, {ALTREF_FRAME, NONE}},  // THR_NEARESTA
+  {NEARESTMV, {GOLDEN_FRAME, NONE}},  // THR_NEARESTG
+
+  {DC_PRED,   {INTRA_FRAME,  NONE}},  // THR_DC
+
+  {NEWMV,     {LAST_FRAME,   NONE}},  // THR_NEWMV
+  {NEWMV,     {ALTREF_FRAME, NONE}},  // THR_NEWA
+  {NEWMV,     {GOLDEN_FRAME, NONE}},  // THR_NEWG
+
+  {NEARMV,    {LAST_FRAME,   NONE}},  // THR_NEARMV
+  {NEARMV,    {ALTREF_FRAME, NONE}},  // THR_NEARA
+  {NEARMV,    {GOLDEN_FRAME, NONE}},  // THR_NEARG
+
+  {ZEROMV,    {LAST_FRAME,   NONE}},  // THR_ZEROMV
+  {ZEROMV,    {GOLDEN_FRAME, NONE}},  // THR_ZEROG
+  {ZEROMV,    {ALTREF_FRAME, NONE}},  // THR_ZEROA
+
+  {NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},  // THR_COMP_NEARESTLA
+  {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},  // THR_COMP_NEARESTGA
+
+  {TM_PRED,   {INTRA_FRAME,  NONE}},  // THR_TM
+
+  {NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},  // THR_COMP_NEARLA
+  {NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},  // THR_COMP_NEWLA
+  {NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},  // THR_COMP_NEARGA
+  {NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},  // THR_COMP_NEWGA
+
+  {ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},  // THR_COMP_ZEROLA
+  {ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},  // THR_COMP_ZEROGA
+
+  {H_PRED,    {INTRA_FRAME,  NONE}},  // THR_H_PRED
+  {V_PRED,    {INTRA_FRAME,  NONE}},  // THR_V_PRED
+  {D135_PRED, {INTRA_FRAME,  NONE}},  // THR_D135_PRED
+  {D207_PRED, {INTRA_FRAME,  NONE}},  // THR_D207_PRED
+  {D153_PRED, {INTRA_FRAME,  NONE}},  // THR_D153_PRED
+  {D63_PRED,  {INTRA_FRAME,  NONE}},  // THR_D63_PRED
+  {D117_PRED, {INTRA_FRAME,  NONE}},  // THR_D117_PRED
+  {D45_PRED,  {INTRA_FRAME,  NONE}},  // THR_D45_PRED
+};
+
+static const REF_DEFINITION vp10_ref_order[MAX_REFS] = {  // single, then comp
+  {{LAST_FRAME,   NONE}},
+  {{GOLDEN_FRAME, NONE}},
+  {{ALTREF_FRAME, NONE}},
+  {{LAST_FRAME,   ALTREF_FRAME}},
+  {{GOLDEN_FRAME, ALTREF_FRAME}},
+  {{INTRA_FRAME,  NONE}},  // intra is the last entry
+};
+
+static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+                           int m, int n, int min_plane, int max_plane) {
+  // Per plane: load the live buffer pointers from ctx slot m, then exchange
+  // slots m and n, so the live pointers end up matching slot n.
+  int plane;
+
+  for (plane = min_plane; plane < max_plane; ++plane) {
+    struct macroblock_plane *const mb = &x->plane[plane];
+    struct macroblockd_plane *const mbd = &x->e_mbd.plane[plane];
+
+    mb->coeff = ctx->coeff_pbuf[plane][m];
+    ctx->coeff_pbuf[plane][m] = ctx->coeff_pbuf[plane][n];
+    ctx->coeff_pbuf[plane][n] = mb->coeff;
+    mb->qcoeff = ctx->qcoeff_pbuf[plane][m];
+    ctx->qcoeff_pbuf[plane][m] = ctx->qcoeff_pbuf[plane][n];
+    ctx->qcoeff_pbuf[plane][n] = mb->qcoeff;
+    mbd->dqcoeff = ctx->dqcoeff_pbuf[plane][m];
+    ctx->dqcoeff_pbuf[plane][m] = ctx->dqcoeff_pbuf[plane][n];
+    ctx->dqcoeff_pbuf[plane][n] = mbd->dqcoeff;
+    mb->eobs = ctx->eobs_pbuf[plane][m];
+    ctx->eobs_pbuf[plane][m] = ctx->eobs_pbuf[plane][n];
+    ctx->eobs_pbuf[plane][n] = mb->eobs;
+  }
+}
+
+static void model_rd_for_sb(VP10_COMP *cpi, BLOCK_SIZE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int *out_rate_sum, int64_t *out_dist_sum,
+                            int *skip_txfm_sb, int64_t *skip_sse_sb) {
+  // Models rate and distortion for the block from per-plane prediction SSE.
+  // Transform coeffs are 8 times an orthogonal transform, so the quantizer
+  // step is too: it is shifted down by dequant_shift before being modeled.
+  int i;
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
+  const int ref = xd->mi[0]->mbmi.ref_frame[0];
+  unsigned int sse;
+  unsigned int var = 0;
+  unsigned int sum_sse = 0;
+  int64_t total_sse = 0;
+  int skip_flag = 1;  // stays set only if every unit is a low-error skip
+  const int shift = 6;
+  int rate;
+  int64_t dist;
+  const int dequant_shift =
+#if CONFIG_VP9_HIGHBITDEPTH
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
+          xd->bd - 5 :
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+          3;
+
+  x->pred_sse[ref] = 0;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+    const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+    const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
+    const int64_t dc_thr = p->quant_thred[0] >> shift;
+    const int64_t ac_thr = p->quant_thred[1] >> shift;
+    // The low thresholds are used to measure if the prediction errors are
+    // low enough so that we can skip the mode search.
+    const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2);
+    const int64_t low_ac_thr = VPXMIN(80, ac_thr >> 2);
+    int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+    int bh = 1 << (b_height_log2_lookup[bs] - b_height_log2_lookup[unit_size]);
+    int idx, idy;
+    int lw = b_width_log2_lookup[unit_size] + 2;
+    int lh = b_height_log2_lookup[unit_size] + 2;
+
+    sum_sse = 0;
+
+    for (idy = 0; idy < bh; ++idy) {
+      for (idx = 0; idx < bw; ++idx) {
+        uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
+        uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lw);
+        int block_idx = (idy << 1) + idx;  // NOTE(review): assumes <= 2x2 units
+        int low_err_skip = 0;
+
+        var = cpi->fn_ptr[unit_size].vf(src, p->src.stride,
+                                        dst, pd->dst.stride, &sse);
+        x->bsse[(i << 2) + block_idx] = sse;
+        sum_sse += sse;
+
+        x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_NONE;
+        if (!x->select_tx_size) {
+          // Check if all ac coefficients can be quantized to zero.
+          if (var < ac_thr || var == 0) {
+            x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_ONLY;
+
+            // Check if dc coefficient can be quantized to zero.
+            if (sse - var < dc_thr || sse == var) {
+              x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_DC;
+
+              if (!sse || (var < low_ac_thr && sse - var < low_dc_thr))
+                low_err_skip = 1;
+            }
+          }
+        }
+
+        if (skip_flag && !low_err_skip)
+          skip_flag = 0;
+
+        if (i == 0)
+          x->pred_sse[ref] += sse;
+      }
+    }
+
+    total_sse += sum_sse;
+
+    // Fast approximate the modelling function.
+    if (cpi->sf.simple_model_rd_from_var) {
+      int64_t simple_rate;  // named apart from the outer int rate
+      const int64_t square_error = sum_sse;
+      int quantizer = (pd->dequant[1] >> dequant_shift);
+
+      if (quantizer < 120)
+        simple_rate = (square_error * (280 - quantizer)) >> 8;
+      else
+        simple_rate = 0;
+      dist = (square_error * quantizer) >> 8;
+      rate_sum += simple_rate;
+      dist_sum += dist;
+    } else {
+      vp10_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
+                                   pd->dequant[1] >> dequant_shift,
+                                   &rate, &dist);
+      rate_sum += rate;
+      dist_sum += dist;
+    }
+  }
+
+  *skip_txfm_sb = skip_flag;
+  *skip_sse_sb = total_sse << 4;
+  *out_rate_sum = (int)rate_sum;
+  *out_dist_sum = dist_sum << 4;
+}
+
+int64_t vp10_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                          intptr_t block_size, int64_t *ssz) {
+  // Returns sum((coeff - dqcoeff)^2) and stores sum(coeff^2) in *ssz.
+  int i;
+  int64_t error = 0, sqcoeff = 0;
+  for (i = 0; i < block_size; i++) {
+    // Widen before subtracting/multiplying so the squares cannot overflow int.
+    const int64_t diff = (int64_t)coeff[i] - dqcoeff[i];
+    error += diff * diff;
+    sqcoeff += (int64_t)coeff[i] * coeff[i];
+  }
+  *ssz = sqcoeff;
+  return error;
+}
+
+int64_t vp10_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+                             int block_size) {
+  // Returns the sum of squared differences between coeff and dqcoeff.
+  int i;
+  int64_t error = 0;
+  for (i = 0; i < block_size; i++) {
+    // int16 inputs can differ by up to 65535, whose square overflows int.
+    const int64_t diff = (int64_t)coeff[i] - dqcoeff[i];
+    error += diff * diff;
+  }
+  return error;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp10_highbd_block_error_c(const tran_low_t *coeff,
+                                 const tran_low_t *dqcoeff,
+                                 intptr_t block_size,
+                                 int64_t *ssz, int bd) {
+  // Accumulates both sums in 64 bits, then scales each down by
+  // 2 * (bd - 8) bits with rounding.
+  const int extra_bits = 2 * (bd - 8);
+  const int half = extra_bits > 0 ? 1 << (extra_bits - 1) : 0;
+  int64_t sse = 0, energy = 0;
+  int k;
+
+  for (k = 0; k < block_size; ++k) {
+    const int64_t delta = coeff[k] - dqcoeff[k];
+    sse += delta * delta;
+    energy += (int64_t)coeff[k] * (int64_t)coeff[k];
+  }
+  assert(sse >= 0 && energy >= 0);
+
+  *ssz = (energy + half) >> extra_bits;
+  return (sse + half) >> extra_bits;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+/* The trailing '0' is a terminator which is used inside cost_coeffs() to
+ * decide whether to include the cost of a trailing EOB node or not (i.e. we
+ * can skip this if the last coefficient in this transform block, e.g. the
+ * 16th coefficient in a 4x4 block or the 64th coefficient in an 8x8 block,
+ * was non-zero). */
+static const int16_t band_counts[TX_SIZES][8] = {
+  { 1, 2, 3, 4,  3,   16 - 13, 0 },  // counts sum to 16
+  { 1, 2, 3, 4, 11,   64 - 21, 0 },  // counts sum to 64
+  { 1, 2, 3, 4, 11,  256 - 21, 0 },  // counts sum to 256
+  { 1, 2, 3, 4, 11, 1024 - 21, 0 },  // counts sum to 1024
+};
+static int cost_coeffs(MACROBLOCK *x,
+                       int plane, int block,
+                       ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
+                       TX_SIZE tx_size,
+                       const int16_t *scan, const int16_t *nb,
+                       int use_fast_coef_costing) {  // returns the token cost
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const struct macroblock_plane *p = &x->plane[plane];
+  const struct macroblockd_plane *pd = &xd->plane[plane];
+  const PLANE_TYPE type = pd->plane_type;
+  const int16_t *band_count = &band_counts[tx_size][1];  // dc band skipped
+  const int eob = p->eobs[block];
+  const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+                   x->token_costs[tx_size][type][is_inter_block(mbmi)];
+  uint8_t token_cache[32 * 32];  // energy classes read by get_coef_context()
+  int pt = combine_entropy_contexts(*A, *L);
+  int c, cost;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16_t *cat6_high_cost = vp10_get_high_cost_table(xd->bd);
+#else
+  const int16_t *cat6_high_cost = vp10_get_high_cost_table(8);
+#endif
+
+  // Check for consistency of tx_size with mode info
+  assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
+                              : get_uv_tx_size(mbmi, pd) == tx_size);
+
+  if (eob == 0) {
+    // single eob token
+    cost = token_costs[0][0][pt][EOB_TOKEN];
+    c = 0;
+  } else {
+    int band_left = *band_count++;
+
+    // dc token
+    int v = qcoeff[0];
+    int16_t prev_t;
+    EXTRABIT e;
+    vp10_get_token_extra(v, &prev_t, &e);
+    cost = (*token_costs)[0][pt][prev_t] +
+        vp10_get_cost(prev_t, e, cat6_high_cost);
+
+    token_cache[0] = vp10_pt_energy_class[prev_t];
+    ++token_costs;  // move on to the costs of the next band
+
+    // ac tokens
+    for (c = 1; c < eob; c++) {
+      const int rc = scan[c];
+      int16_t t;
+
+      v = qcoeff[rc];
+      vp10_get_token_extra(v, &t, &e);
+      if (use_fast_coef_costing) {  // context from prev_t, no neighbor lookup
+        cost += (*token_costs)[!prev_t][!prev_t][t] +
+            vp10_get_cost(t, e, cat6_high_cost);
+      } else {
+        pt = get_coef_context(nb, token_cache, c);
+        cost += (*token_costs)[!prev_t][pt][t] +
+            vp10_get_cost(t, e, cat6_high_cost);
+        token_cache[rc] = vp10_pt_energy_class[t];
+      }
+      prev_t = t;
+      if (!--band_left) {  // band exhausted: load the next band's count
+        band_left = *band_count++;
+        ++token_costs;
+      }
+    }
+
+    // eob token
+    if (band_left) {  // zero only once the band_counts terminator is reached
+      if (use_fast_coef_costing) {
+        cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
+      } else {
+        pt = get_coef_context(nb, token_cache, c);
+        cost += (*token_costs)[0][pt][EOB_TOKEN];
+      }
+    }
+  }
+
+  // Both contexts record whether any coefficient was coded (c > 0).
+  *A = *L = (c > 0);
+
+  return cost;
+}
+
+static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
+                       int64_t *out_dist, int64_t *out_sse) {
+  // Block error and sum of squared coefficients over 16 << (2 * tx_size)
+  // coefficients; both results are shifted right by 2 except for TX_32X32.
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int coeff_count = 16 << (tx_size << 1);
+  const int down = (tx_size == TX_32X32) ? 0 : 2;
+  tran_low_t *const src = BLOCK_OFFSET(x->plane[plane].coeff, block);
+  tran_low_t *const deq = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
+  int64_t energy, err;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+  err = vp10_highbd_block_error(src, deq, coeff_count, &energy, bd);
+#else
+  err = vp10_block_error(src, deq, coeff_count, &energy);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  *out_dist = err >> down;
+  *out_sse = energy >> down;
+}
+
+static int rate_block(int plane, int block, int blk_row, int blk_col,
+                      TX_SIZE tx_size, struct rdcost_block_args* args) {
+  const scan_order *const so = args->so;  // scan order stored in |args|
+  return cost_coeffs(args->x, plane, block, &args->t_above[blk_col],
+                     &args->t_left[blk_row], tx_size, so->scan, so->neighbors,
+                     args->use_fast_coef_costing);
+}
+
+static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
+                          BLOCK_SIZE plane_bsize,
+                          TX_SIZE tx_size, void *arg) {
+  struct rdcost_block_args *args = arg;  // running totals for the plane
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int64_t rd1, rd2, rd;
+  int rate;
+  int64_t dist;
+  int64_t sse;
+
+  if (args->exit_early)  // an earlier block already exceeded best_rd
+    return;
+
+  if (!is_inter_block(mbmi)) {
+    struct encode_b_args arg = {x, NULL, &mbmi->skip};  // shadows param |arg|
+    vp10_encode_block_intra(plane, block, blk_row, blk_col,
+                            plane_bsize, tx_size, &arg);
+    dist_block(x, plane, block, tx_size, &dist, &sse);
+  } else if (max_txsize_lookup[plane_bsize] == tx_size) {
+    if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
+        SKIP_TXFM_NONE) {
+      // full forward transform and quantization
+      vp10_xform_quant(x, plane, block, blk_row, blk_col,
+                       plane_bsize, tx_size);
+      dist_block(x, plane, block, tx_size, &dist, &sse);
+    } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
+               SKIP_TXFM_AC_ONLY) {
+      // DC only: start from the stored block sse (<< 4), then correct for DC
+      tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
+      tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
+      vp10_xform_quant_dc(x, plane, block, blk_row, blk_col,
+                          plane_bsize, tx_size);
+      sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
+      dist = sse;
+      if (x->plane[plane].eobs[block]) {
+        const int64_t orig_sse = (int64_t)coeff[0] * coeff[0];
+        const int64_t resd_sse = coeff[0] - dqcoeff[0];
+        int64_t dc_correct = orig_sse - resd_sse * resd_sse;
+#if CONFIG_VP9_HIGHBITDEPTH
+        dc_correct >>= ((xd->bd - 8) * 2);
+#endif
+        if (tx_size != TX_32X32)
+          dc_correct >>= 2;
+
+        dist = VPXMAX(0, sse - dc_correct);
+      }
+    } else {
+      // SKIP_TXFM_AC_DC: no forward transform; the block is marked as having
+      // no coefficients and the stored block sse (<< 4) is the distortion
+      x->plane[plane].eobs[block] = 0;
+      sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
+      dist = sse;
+    }
+  } else {
+    // full forward transform and quantization
+    vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
+    dist_block(x, plane, block, tx_size, &dist, &sse);
+  }
+
+  rd = RDCOST(x->rdmult, x->rddiv, 0, dist);  // rd with zero rate
+  if (args->this_rd + rd > args->best_rd) {
+    args->exit_early = 1;
+    return;
+  }
+
+  rate = rate_block(plane, block, blk_row, blk_col, tx_size, args);
+  rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist);
+  rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);  // zero rate, distortion == sse
+
+  // TODO(jingning): temporarily enabled only for luma component
+  rd = VPXMIN(rd1, rd2);
+  if (plane == 0)
+    x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
+        (rd1 > rd2 && !xd->lossless[mbmi->segment_id]);
+
+  args->this_rate += rate;
+  args->this_dist += dist;
+  args->this_sse += sse;
+  args->this_rd += rd;
+
+  if (args->this_rd > args->best_rd) {
+    args->exit_early = 1;
+    return;
+  }
+
+  args->skippable &= !x->plane[plane].eobs[block];  // 1 only if all eobs 0
+}
+
+static void txfm_rd_in_plane(MACROBLOCK *x,
+                             int *rate, int64_t *distortion,
+                             int *skippable, int64_t *sse,
+                             int64_t ref_best_rd, int plane,
+                             BLOCK_SIZE bsize, TX_SIZE tx_size,
+                             int use_fast_coef_casting) {
+  // Accumulates rate/distortion/sse/skippable over every transform block of
+  // |plane|; reports INT_MAX/INT64_MAX costs when the walk exits early.
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const plane_d = &xd->plane[plane];
+  struct rdcost_block_args rd_args;
+  vp10_zero(rd_args);
+  rd_args.x = x;
+  rd_args.skippable = 1;
+  rd_args.best_rd = ref_best_rd;
+  rd_args.use_fast_coef_costing = use_fast_coef_casting;
+
+  // Plane 0 records the transform size under test in the mode info.
+  if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size;
+
+  vp10_get_entropy_contexts(bsize, tx_size, plane_d,
+                            rd_args.t_above, rd_args.t_left);
+  rd_args.so = get_scan(tx_size, get_tx_type(plane_d->plane_type, xd, 0));
+
+  vp10_foreach_transformed_block_in_plane(xd, bsize, plane,
+                                          block_rd_txfm, &rd_args);
+  if (rd_args.exit_early) {
+    *rate       = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse        = INT64_MAX;
+    *skippable  = 0;
+    return;
+  }
+  *rate       = rd_args.this_rate;
+  *distortion = rd_args.this_dist;
+  *sse        = rd_args.this_sse;
+  *skippable  = rd_args.skippable;
+}
+
+static void choose_largest_tx_size(VP10_COMP *cpi, MACROBLOCK *x,
+                                   int *rate, int64_t *distortion,
+                                   int *skip, int64_t *sse,
+                                   int64_t ref_best_rd,
+                                   BLOCK_SIZE bs) {
+  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+  VP10_COMMON *const cm = &cpi->common;
+  const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+
+  TX_TYPE tx_type, best_tx_type = DCT_DCT;
+  int r, s;
+  int64_t d, psse, this_rd, best_rd = INT64_MAX;
+  vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
+  int  s0 = vp10_cost_bit(skip_prob, 0);
+  int  s1 = vp10_cost_bit(skip_prob, 1);
+  const int is_inter = is_inter_block(mbmi);
+
+  mbmi->tx_size = VPXMIN(max_tx_size, largest_tx_size);
+  if (mbmi->tx_size < TX_32X32 &&
+      !xd->lossless[mbmi->segment_id]) {  // search the transform types
+    for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+      mbmi->tx_type = tx_type;
+      txfm_rd_in_plane(x, &r, &d, &s,
+                       &psse, ref_best_rd, 0, bs, mbmi->tx_size,
+                       cpi->sf.use_fast_coef_costing);
+      if (r == INT_MAX)  // early exit: no valid cost for this type
+        continue;
+      if (is_inter)
+        r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+      else
+        r += cpi->intra_tx_type_costs[mbmi->tx_size]
+                                     [intra_mode_to_tx_type_context[mbmi->mode]]
+                                     [mbmi->tx_type];
+      if (s)
+        this_rd = RDCOST(x->rdmult, x->rddiv, s1, psse);
+      else
+        this_rd = RDCOST(x->rdmult, x->rddiv, r + s0, d);
+      if (is_inter && !xd->lossless[mbmi->segment_id] && !s)
+        this_rd = VPXMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, psse));
+
+      if (this_rd < ((best_tx_type == DCT_DCT) ? ext_tx_th : 1) * best_rd) {
+        best_rd = this_rd;
+        best_tx_type = mbmi->tx_type;
+      }
+    }
+  }
+  mbmi->tx_type = best_tx_type;  // final pass with the winning tx type
+  txfm_rd_in_plane(x, rate, distortion, skip,
+                   sse, ref_best_rd, 0, bs,
+                   mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+  if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id] &&
+      *rate != INT_MAX) {  // add the tx type cost to a valid rate
+    if (is_inter)
+      *rate += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+    else
+      *rate += cpi->intra_tx_type_costs[mbmi->tx_size]
+          [intra_mode_to_tx_type_context[mbmi->mode]]
+          [mbmi->tx_type];
+  }
+}
+
+static void choose_smallest_tx_size(VP10_COMP *cpi, MACROBLOCK *x,
+                                    int *rate, int64_t *distortion,
+                                    int *skip, int64_t *sse,
+                                    int64_t ref_best_rd,
+                                    BLOCK_SIZE bs) {
+  // Forces the transform size to TX_4X4 and evaluates plane 0 with it.
+  // The transform type stored in the mode info is left untouched.
+  const int fast_costing = cpi->sf.use_fast_coef_costing;
+  MB_MODE_INFO *const mode_info = &x->e_mbd.mi[0]->mbmi;
+
+  mode_info->tx_size = TX_4X4;
+  txfm_rd_in_plane(x, rate, distortion, skip, sse, ref_best_rd,
+                   /*plane=*/0, bs, TX_4X4, fast_costing);
+}
+
+static void choose_tx_size_from_rd(VP10_COMP *cpi, MACROBLOCK *x,
+                                   int *rate,
+                                   int64_t *distortion,
+                                   int *skip,
+                                   int64_t *psse,
+                                   int64_t ref_best_rd,
+                                   BLOCK_SIZE bs) {
+  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
+  int r, s;
+  int64_t d, sse;
+  int64_t rd = INT64_MAX;
+  int n, m;
+  int s0, s1;
+  int64_t best_rd = INT64_MAX, last_rd = INT64_MAX;
+  TX_SIZE best_tx = max_tx_size;
+  int start_tx, end_tx;
+  const int tx_select = cm->tx_mode == TX_MODE_SELECT;
+  TX_TYPE tx_type, best_tx_type = DCT_DCT;
+  const int is_inter = is_inter_block(mbmi);
+
+  const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
+  assert(skip_prob > 0);
+  s0 = vp10_cost_bit(skip_prob, 0);
+  s1 = vp10_cost_bit(skip_prob, 1);
+
+  if (tx_select) {
+    start_tx = max_tx_size;  // search from the largest size ...
+    end_tx = 0;  // ... down to size 0
+  } else {  // a single size, fixed by tx_mode and the block size
+    const TX_SIZE chosen_tx_size =
+        VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]);
+    start_tx = chosen_tx_size;
+    end_tx = chosen_tx_size;
+  }
+
+  *distortion = INT64_MAX;  // outputs keep these values if nothing is chosen
+  *rate       = INT_MAX;
+  *skip       = 0;
+  *psse       = INT64_MAX;
+
+  for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+    last_rd = INT64_MAX;
+    for (n = start_tx; n >= end_tx; --n) {
+      int r_tx_size = 0;  // bit cost for size n, summed from tx_probs
+      for (m = 0; m <= n - (n == (int) max_tx_size); ++m) {
+        if (m == n)
+          r_tx_size += vp10_cost_zero(tx_probs[m]);
+        else
+          r_tx_size += vp10_cost_one(tx_probs[m]);
+      }
+
+      if (n >= TX_32X32 && tx_type != DCT_DCT) {  // 32x32 is DCT_DCT only
+        continue;
+      }
+      mbmi->tx_type = tx_type;
+      txfm_rd_in_plane(x, &r, &d, &s,
+                       &sse, ref_best_rd, 0, bs, n,
+                       cpi->sf.use_fast_coef_costing);
+      if (n < TX_32X32 &&
+          !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+          r != INT_MAX) {
+        if (is_inter)
+          r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+        else
+          r += cpi->intra_tx_type_costs[mbmi->tx_size]
+              [intra_mode_to_tx_type_context[mbmi->mode]]
+              [mbmi->tx_type];
+      }
+
+      if (r == INT_MAX)  // no valid cost for this size/type pair
+        continue;
+
+      if (s) {
+        if (is_inter) {
+          rd = RDCOST(x->rdmult, x->rddiv, s1, sse);
+        } else {
+          rd =  RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select, sse);
+        }
+      } else {
+        rd = RDCOST(x->rdmult, x->rddiv, r + s0 + r_tx_size * tx_select, d);
+      }
+
+      if (tx_select && !(s && is_inter))
+        r += r_tx_size;
+
+      if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !s)
+        rd = VPXMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, sse));
+
+      // Early termination in transform size search.
+      if (cpi->sf.tx_size_search_breakout &&
+          (rd == INT64_MAX ||
+           (s == 1 && tx_type != DCT_DCT && n < start_tx) ||
+           (n < (int) max_tx_size && rd > last_rd)))
+        break;
+
+      last_rd = rd;
+      if (rd <
+          (is_inter && best_tx_type == DCT_DCT ? ext_tx_th : 1) *
+          best_rd) {
+        best_tx = n;
+        best_rd = rd;
+        *distortion = d;
+        *rate       = r;
+        *skip       = s;
+        *psse       = sse;
+        best_tx_type = mbmi->tx_type;
+      }
+    }
+  }
+
+  mbmi->tx_size = best_tx;  // commit the winning size and type
+  mbmi->tx_type = best_tx_type;
+  if (mbmi->tx_size >= TX_32X32)
+    assert(mbmi->tx_type == DCT_DCT);
+  txfm_rd_in_plane(x, &r, &d, &s,
+                   &sse, ref_best_rd, 0, bs, best_tx,
+                   cpi->sf.use_fast_coef_costing);
+}
+
+static void super_block_yrd(VP10_COMP *cpi, MACROBLOCK *x, int *rate,
+                            int64_t *distortion, int *skip,
+                            int64_t *psse, BLOCK_SIZE bs,
+                            int64_t ref_best_rd) {
+  // Picks the tx-size strategy for plane 0.  A NULL |psse| discards the SSE.
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int64_t scratch_sse;
+  int64_t *const sse_out = (psse != NULL) ? psse : &scratch_sse;
+
+  assert(bs == xd->mi[0]->mbmi.sb_type);
+  if (CONFIG_MISC_FIXES && xd->lossless[0]) {
+    choose_smallest_tx_size(cpi, x, rate, distortion, skip, sse_out,
+                            ref_best_rd, bs);
+  } else if (cpi->sf.tx_size_search_method != USE_LARGESTALL &&
+             !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+    choose_tx_size_from_rd(cpi, x, rate, distortion, skip, sse_out,
+                           ref_best_rd, bs);
+  } else {
+    choose_largest_tx_size(cpi, x, rate, distortion, skip, sse_out,
+                           ref_best_rd, bs);
+  }
+}
+
+static int conditional_skipintra(PREDICTION_MODE mode,
+                                 PREDICTION_MODE best_intra_mode) {
+  // Returns 1 when |mode| is one of the four oblique modes below and
+  // |best_intra_mode| is neither of the two modes paired with it; else 0.
+  PREDICTION_MODE near_a, near_b;
+
+  switch (mode) {
+    case D117_PRED:
+      near_a = V_PRED; near_b = D135_PRED; break;
+    case D63_PRED:
+      near_a = V_PRED; near_b = D45_PRED; break;
+    case D207_PRED:
+      near_a = H_PRED; near_b = D45_PRED; break;
+    case D153_PRED:
+      near_a = H_PRED; near_b = D135_PRED; break;
+    default:
+      return 0;
+  }
+  return best_intra_mode != near_a && best_intra_mode != near_b;
+}
+
+static int64_t rd_pick_intra4x4block(VP10_COMP *cpi, MACROBLOCK *x,
+                                     int row, int col,
+                                     PREDICTION_MODE *best_mode,
+                                     const int *bmode_costs,
+                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                                     int *bestrate, int *bestratey,
+                                     int64_t *bestdistortion,
+                                     BLOCK_SIZE bsize, int64_t rd_thresh) {
+  // Tries each intra mode on the plane-0 sub-block at (row, col); returns rd.
+  PREDICTION_MODE mode;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int64_t best_rd = rd_thresh;  // unchanged if no mode beats rd_thresh
+  struct macroblock_plane *p = &x->plane[0];
+  struct macroblockd_plane *pd = &xd->plane[0];
+  const int src_stride = p->src.stride, dst_stride = pd->dst.stride;
+  const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4];
+  // dst.buf has its own stride; every later dst_init access uses dst_stride.
+  uint8_t *dst_init = &pd->dst.buf[row * 4 * dst_stride + col * 4];
+  ENTROPY_CONTEXT ta[2], tempa[2], tl[2], templ[2];
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  int idx, idy;
+  uint8_t best_dst[8 * 8];
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint16_t best_dst16[8 * 8];
+#endif
+
+  memcpy(ta, a, sizeof(ta));
+  memcpy(tl, l, sizeof(tl));
+  xd->mi[0]->mbmi.tx_size = TX_4X4;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+      int64_t this_rd;
+      int ratey = 0;
+      int64_t distortion = 0;
+      int rate = bmode_costs[mode];
+
+      if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
+        continue;
+
+      // Only do the oblique modes if the best so far is
+      // one of the neighboring directional modes
+      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+        if (conditional_skipintra(mode, *best_mode))
+            continue;
+      }
+
+      memcpy(tempa, ta, sizeof(ta));
+      memcpy(templ, tl, sizeof(tl));
+
+      for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+        for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
+          const int block = (row + idy) * 2 + (col + idx);
+          const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+          uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+          int16_t *const src_diff = vp10_raster_block_offset_int16(BLOCK_8X8,
+                                                                  block,
+                                                                  p->src_diff);
+          tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+          xd->mi[0]->bmi[block].as_mode = mode;
+          vp10_predict_intra_block(xd, 1, 1, TX_4X4, mode, dst, dst_stride,
+                                  dst, dst_stride,
+                                  col + idx, row + idy, 0);
+          vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
+                                    dst, dst_stride, xd->bd);
+          if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+            TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
+            const scan_order *so = get_scan(TX_4X4, tx_type);
+            vp10_highbd_fwd_txfm_4x4(src_diff, coeff, 8, DCT_DCT, 1);
+            vp10_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                                 so->scan, so->neighbors,
+                                 cpi->sf.use_fast_coef_costing);
+            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+              goto next_highbd;
+            vp10_highbd_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block),
+                                         dst, dst_stride, p->eobs[block],
+                                         xd->bd, DCT_DCT, 1);
+          } else {
+            int64_t unused;
+            TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
+            const scan_order *so = get_scan(TX_4X4, tx_type);
+            vp10_highbd_fwd_txfm_4x4(src_diff, coeff, 8, tx_type, 0);
+            vp10_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                                 so->scan, so->neighbors,
+                                 cpi->sf.use_fast_coef_costing);
+            distortion += vp10_highbd_block_error(
+                coeff, BLOCK_OFFSET(pd->dqcoeff, block),
+                16, &unused, xd->bd) >> 2;
+            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+              goto next_highbd;
+            vp10_highbd_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block),
+                                         dst, dst_stride, p->eobs[block],
+                                         xd->bd, tx_type, 0);
+          }
+        }
+      }
+
+      rate += ratey;
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+      if (this_rd < best_rd) {
+        *bestrate = rate;
+        *bestratey = ratey;
+        *bestdistortion = distortion;
+        best_rd = this_rd;
+        *best_mode = mode;
+        memcpy(a, tempa, sizeof(tempa));
+        memcpy(l, templ, sizeof(templ));
+        for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+          memcpy(best_dst16 + idy * 8,
+                 CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+                 num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+        }
+      }
+    next_highbd:
+      {}
+    }
+    if (best_rd >= rd_thresh)
+      return best_rd;
+
+    for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+      memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+             best_dst16 + idy * 8,
+             num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+    }
+
+    return best_rd;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+    int64_t this_rd;
+    int ratey = 0;
+    int64_t distortion = 0;
+    int rate = bmode_costs[mode];
+
+    if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
+      continue;
+
+    // Only do the oblique modes if the best so far is
+    // one of the neighboring directional modes
+    if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+      if (conditional_skipintra(mode, *best_mode))
+          continue;
+    }
+
+    memcpy(tempa, ta, sizeof(ta));
+    memcpy(templ, tl, sizeof(tl));
+
+    for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+      for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
+        const int block = (row + idy) * 2 + (col + idx);
+        const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+        uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+        int16_t *const src_diff =
+            vp10_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
+        tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+        xd->mi[0]->bmi[block].as_mode = mode;
+        vp10_predict_intra_block(xd, 1, 1, TX_4X4, mode, dst, dst_stride,
+                                dst, dst_stride, col + idx, row + idy, 0);
+        vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
+
+        if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+          TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
+          const scan_order *so = get_scan(TX_4X4, tx_type);
+          vp10_fwd_txfm_4x4(src_diff, coeff, 8, DCT_DCT, 1);
+          vp10_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                               so->scan, so->neighbors,
+                               cpi->sf.use_fast_coef_costing);
+          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+            goto next;
+          vp10_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block),
+                                dst, dst_stride, p->eobs[block], DCT_DCT, 1);
+        } else {
+          int64_t unused;
+          TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
+          const scan_order *so = get_scan(TX_4X4, tx_type);
+          vp10_fwd_txfm_4x4(src_diff, coeff, 8, tx_type, 0);
+          vp10_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                             so->scan, so->neighbors,
+                             cpi->sf.use_fast_coef_costing);
+          distortion += vp10_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
+                                        16, &unused) >> 2;
+          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+            goto next;
+          vp10_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block),
+                                dst, dst_stride, p->eobs[block], tx_type, 0);
+        }
+      }
+    }
+
+    rate += ratey;
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+    if (this_rd < best_rd) {
+      *bestrate = rate;
+      *bestratey = ratey;
+      *bestdistortion = distortion;
+      best_rd = this_rd;
+      *best_mode = mode;
+      memcpy(a, tempa, sizeof(tempa));
+      memcpy(l, templ, sizeof(templ));
+      for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
+        memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
+               num_4x4_blocks_wide * 4);
+    }
+  next:
+    {}
+  }
+
+  if (best_rd >= rd_thresh)
+    return best_rd;
+
+  for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
+    memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
+           num_4x4_blocks_wide * 4);
+
+  return best_rd;
+}
+
+static int64_t rd_pick_intra_sub_8x8_y_mode(VP10_COMP *cpi, MACROBLOCK *mb,
+                                            int *rate, int *rate_y,
+                                            int64_t *distortion,
+                                            int64_t best_rd) {
+  int i, j;
+  const MACROBLOCKD *const xd = &mb->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  const MODE_INFO *above_mi = xd->above_mi;
+  const MODE_INFO *left_mi = xd->left_mi;
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  int idx, idy;
+  int cost = 0;
+  int64_t total_distortion = 0;
+  int tot_rate_y = 0;
+  int64_t total_rd = 0;
+  ENTROPY_CONTEXT t_above[4], t_left[4];
+  const int *bmode_costs = cpi->mbmode_cost;
+
+  memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
+  memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
+
+  // TODO(any): Add search of the tx_type to improve rd performance at the
+  // expense of speed.  For now the transform type is fixed to DCT_DCT.
+  mic->mbmi.tx_type = DCT_DCT;
+
+  // Rate, distortion and rd are accumulated over the sub-blocks; the search
+  // gives up with INT64_MAX as soon as the running rd reaches best_rd.
+  // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
+  for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+    for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+      PREDICTION_MODE best_mode = DC_PRED;
+      int r = INT_MAX, ry = INT_MAX;
+      int64_t d = INT64_MAX, this_rd = INT64_MAX;
+      i = idy * 2 + idx;
+      if (cpi->common.frame_type == KEY_FRAME) {
+        const PREDICTION_MODE A = vp10_above_block_mode(mic, above_mi, i);
+        const PREDICTION_MODE L = vp10_left_block_mode(mic, left_mi, i);
+
+        bmode_costs  = cpi->y_mode_costs[A][L];
+      }
+
+      this_rd = rd_pick_intra4x4block(cpi, mb, idy, idx, &best_mode,
+                                      bmode_costs, t_above + idx, t_left + idy,
+                                      &r, &ry, &d, bsize, best_rd - total_rd);
+      if (this_rd >= best_rd - total_rd)  // nothing fit the remaining rd
+        return INT64_MAX;
+
+      total_rd += this_rd;
+      cost += r;
+      total_distortion += d;
+      tot_rate_y += ry;
+
+      mic->bmi[i].as_mode = best_mode;  // then copy to the other covered units
+      for (j = 1; j < num_4x4_blocks_high; ++j)
+        mic->bmi[i + j * 2].as_mode = best_mode;
+      for (j = 1; j < num_4x4_blocks_wide; ++j)
+        mic->bmi[i + j].as_mode = best_mode;
+
+      if (total_rd >= best_rd)
+        return INT64_MAX;
+    }
+  }
+
+  *rate = cost;
+  *rate_y = tot_rate_y;
+  *distortion = total_distortion;
+  mic->mbmi.mode = mic->bmi[3].as_mode;  // block mode = mode of bmi[3]
+
+  return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
+}
+
+// This function is used only for intra_only frames
+static int64_t rd_pick_intra_sby_mode(VP10_COMP *cpi, MACROBLOCK *x,
+                                      int *rate, int *rate_tokenonly,
+                                      int64_t *distortion, int *skippable,
+                                      BLOCK_SIZE bsize,
+                                      int64_t best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mic->mbmi;
+  // Mode costs are indexed by the above and left block modes.
+  const PREDICTION_MODE A = vp10_above_block_mode(mic, xd->above_mi, 0);
+  const PREDICTION_MODE L = vp10_left_block_mode(mic, xd->left_mi, 0);
+  const int *const bmode_costs = cpi->y_mode_costs[A][L];
+
+  // Winner so far; these defaults are written back if no mode beats best_rd.
+  PREDICTION_MODE winner_mode = DC_PRED;
+  TX_SIZE winner_tx_size = TX_4X4;
+  TX_TYPE winner_tx_type = DCT_DCT;
+  PREDICTION_MODE mode;
+
+  memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+
+  /* Y Search for intra prediction mode */
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    int tokenonly_rate, total_rate, skip;
+    int64_t dist, cand_rd;
+
+    mbmi->mode = mode;
+    super_block_yrd(cpi, x, &tokenonly_rate, &dist, &skip, NULL, bsize,
+                    best_rd);
+
+    // An INT_MAX rate marks a candidate with no valid cost.
+    if (tokenonly_rate != INT_MAX) {
+      total_rate = tokenonly_rate + bmode_costs[mode];
+      cand_rd = RDCOST(x->rdmult, x->rddiv, total_rate, dist);
+      if (cand_rd < best_rd) {
+        best_rd         = cand_rd;
+        winner_mode     = mode;
+        winner_tx_size  = mbmi->tx_size;
+        winner_tx_type  = mbmi->tx_type;
+        *rate           = total_rate;
+        *rate_tokenonly = tokenonly_rate;
+        *distortion     = dist;
+        *skippable      = skip;
+      }
+    }
+  }
+
+  mbmi->mode    = winner_mode;
+  mbmi->tx_size = winner_tx_size;
+  mbmi->tx_type = winner_tx_type;
+
+  return best_rd;
+}
+
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+static int super_block_uvrd(const VP10_COMP *cpi, MACROBLOCK *x,
+                            int *rate, int64_t *distortion, int *skippable,
+                            int64_t *sse, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
+  // A negative rd budget marks the cost invalid before any plane is tried.
+  int is_cost_valid = (ref_best_rd >= 0);
+  int plane;
+
+  // Inter blocks with a valid budget get vp10_subtract_plane() run on every
+  // plane from 1 upwards before the transform search.
+  if (is_inter_block(mbmi) && is_cost_valid) {
+    for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+      vp10_subtract_plane(x, bsize, plane);
+  }
+
+  *rate = 0;
+  *distortion = 0;
+  *sse = 0;
+  *skippable = 1;
+
+  // Sum the per-plane results; one invalid plane invalidates the total.
+  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+    int plane_rate = 0, plane_skip = 1;
+    int64_t plane_dist = 0, plane_sse = 0;
+    txfm_rd_in_plane(x, &plane_rate, &plane_dist, &plane_skip, &plane_sse,
+                     ref_best_rd, plane, bsize, uv_tx_size,
+                     cpi->sf.use_fast_coef_costing);
+    if (plane_rate == INT_MAX) {
+      is_cost_valid = 0;
+      break;
+    }
+    *rate += plane_rate;
+    *distortion += plane_dist;
+    *sse += plane_sse;
+    *skippable &= plane_skip;
+  }
+
+  if (is_cost_valid)
+    return 1;
+
+  // reset cost value
+  *rate = INT_MAX;
+  *distortion = INT64_MAX;
+  *sse = INT64_MAX;
+  *skippable = 0;
+  return 0;
+}
+
+static int64_t rd_pick_intra_sbuv_mode(VP10_COMP *cpi, MACROBLOCK *x,
+                                       PICK_MODE_CONTEXT *ctx,
+                                       int *rate, int *rate_tokenonly,
+                                       int64_t *distortion, int *skippable,
+                                       BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  PREDICTION_MODE mode;
+  PREDICTION_MODE mode_selected = DC_PRED;
+  int64_t best_rd = INT64_MAX, this_rd;
+  int this_rate_tokenonly, this_rate, s;
+  int64_t this_distortion, this_sse;
+
+  memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+    if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
+      continue;  // mode is masked out for this max_tx_size
+
+    xd->mi[0]->mbmi.uv_mode = mode;
+
+    if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                          &this_distortion, &s, &this_sse, bsize, best_rd))
+      continue;  // no valid rd cost for this mode
+    this_rate = this_rate_tokenonly +
+        cpi->intra_uv_mode_cost[xd->mi[0]->mbmi.mode][mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < best_rd) {
+      mode_selected   = mode;
+      best_rd         = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+      *skippable      = s;
+      if (!x->select_tx_size)
+        swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
+    }
+  }
+
+  xd->mi[0]->mbmi.uv_mode = mode_selected;  // DC_PRED if no mode won
+  return best_rd;
+}
+
+static int64_t rd_sbuv_dcpred(const VP10_COMP *cpi, MACROBLOCK *x,
+                              int *rate, int *rate_tokenonly,
+                              int64_t *distortion, int *skippable,
+                              BLOCK_SIZE bsize) {
+  // RD cost of uv_mode DC_PRED, evaluated with INT64_MAX as the rd limit.
+  MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
+  int64_t sse_unused;
+  memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+  mbmi->uv_mode = DC_PRED;
+  super_block_uvrd(cpi, x, rate_tokenonly, distortion, skippable,
+                   &sse_unused, bsize, INT64_MAX);
+  *rate = cpi->intra_uv_mode_cost[mbmi->mode][DC_PRED] + *rate_tokenonly;
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+static void choose_intra_uv_mode(VP10_COMP *cpi, MACROBLOCK *const x,
+                                 PICK_MODE_CONTEXT *ctx,
+                                 BLOCK_SIZE bsize, TX_SIZE max_tx_size,
+                                 int *rate_uv, int *rate_uv_tokenonly,
+                                 int64_t *dist_uv, int *skip_uv,
+                                 PREDICTION_MODE *mode_uv) {
+  // Block sizes below 8x8 are promoted to BLOCK_8X8 for the chroma search.
+  const BLOCK_SIZE uv_bsize = bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize;
+  if (!cpi->sf.use_uv_intra_rd_estimate) {
+    // Proper rd search over the chroma intra modes allowed for max_tx_size.
+    rd_pick_intra_sbuv_mode(cpi, x, ctx, rate_uv, rate_uv_tokenonly, dist_uv,
+                            skip_uv, uv_bsize, max_tx_size);
+  } else {
+    // Speed flag set: use an estimated rd for uv_intra based on DC_PRED only.
+    rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
+                   uv_bsize);
+  }
+  // Both helpers leave their chosen mode in mbmi.uv_mode.
+  *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
+}
+
+static int cost_mv_ref(const VP10_COMP *cpi, PREDICTION_MODE mode,
+                       int mode_context) {  // looks up cpi->inter_mode_cost
+  assert(is_inter_mode(mode));  // the table is indexed by INTER_OFFSET(mode)
+  return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
+}
+
+static int set_and_cost_bmi_mvs(VP10_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                                int i,
+                                PREDICTION_MODE mode, int_mv this_mv[2],
+                                int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+                                int_mv seg_mvs[MAX_REF_FRAMES],
+                                int_mv *best_ref_mv[2], const int *mvjcost,
+                                int *mvcost[2]) {
+  // Resolves the motion vector(s) that `mode` implies for sub-block i into
+  // this_mv, records them together with the mode in xd->mi[0]->bmi[] for every
+  // 4x4 unit the sub-block covers, and returns the rate: the mode cost plus,
+  // for NEWMV only, vp10_mv_bit_cost() of each vector against best_ref_mv.
+
+  MODE_INFO *const mi = xd->mi[0];
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const int blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  const int blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+
+  // One pass per reference frame in use: 1 for single, 2 for compound.
+  const int num_refs = 1 + has_second_ref(mbmi);
+  int mv_rate = 0;
+  int ref;
+  int row, col;
+
+  if (mode == NEWMV) {
+    // Searched vectors come from seg_mvs and are the only ones that cost bits.
+    for (ref = 0; ref < num_refs; ++ref) {
+      this_mv[ref].as_int = seg_mvs[mbmi->ref_frame[ref]].as_int;
+      mv_rate += vp10_mv_bit_cost(&this_mv[ref].as_mv,
+                                  &best_ref_mv[ref]->as_mv, mvjcost, mvcost,
+                                  MV_COST_WEIGHT_SUB);
+    }
+  } else if (mode == NEARMV || mode == NEARESTMV) {
+    // Candidate vectors were prepared by the caller in frame_mv.
+    for (ref = 0; ref < num_refs; ++ref)
+      this_mv[ref].as_int = frame_mv[mode][mbmi->ref_frame[ref]].as_int;
+  } else if (mode == ZEROMV) {
+    for (ref = 0; ref < num_refs; ++ref)
+      this_mv[ref].as_int = 0;
+  }
+  // Any other mode leaves this_mv exactly as the caller passed it in.
+
+  for (ref = 0; ref < num_refs; ++ref)
+    mi->bmi[i].as_mv[ref].as_int = this_mv[ref].as_int;
+  mi->bmi[i].as_mode = mode;
+
+  // Replicate entry i across the sub-block; bmi[] is indexed two per row.
+  // memmove (not memcpy) because the first copy has source == destination.
+  for (row = 0; row < blocks_high; ++row)
+    for (col = 0; col < blocks_wide; ++col)
+      memmove(&mi->bmi[i + row * 2 + col], &mi->bmi[i], sizeof(mi->bmi[i]));
+
+  // The mode cost is looked up with the context of the first reference frame.
+  return cost_mv_ref(cpi, mode, x->mbmi_ext->mode_context[mbmi->ref_frame[0]]) +
+         mv_rate;
+}
+
+// Builds the inter prediction for sub-8x8 luma block i, then transforms and
+// quantizes its residual one 4x4 unit at a time, accumulating rate, distortion
+// and SSE. Returns the block's RD cost, or INT64_MAX (outputs left unwritten)
+// once min(coded cost, zero-rate SSE cost) reaches best_yrd.
+static int64_t encode_inter_mb_segment(VP10_COMP *cpi, MACROBLOCK *x,
+                                       int64_t best_yrd, int i,
+                                       int *labelyrate, int64_t *distortion,
+                                       int64_t *sse, ENTROPY_CONTEXT *ta,
+                                       ENTROPY_CONTEXT *tl, int ir, int ic,
+                                       int mi_row, int mi_col) {
+  int k;
+  MACROBLOCKD *xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  struct macroblock_plane *const p = &x->plane[0];
+  MODE_INFO *const mi = xd->mi[0];
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
+  const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+  int idx, idy;
+  void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
+
+  const uint8_t *const src =
+      &p->src.buf[vp10_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+  uint8_t *const dst = &pd->dst.buf[vp10_raster_block_offset(BLOCK_8X8, i,
+                                                            pd->dst.stride)];
+  int64_t thisdistortion = 0, thissse = 0;
+  int thisrate = 0;
+  TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i);
+  const scan_order *so = get_scan(TX_4X4, tx_type);
+
+  vp10_build_inter_predictor_sub8x8(xd, 0, i, ir, ic, mi_row, mi_col);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    fwd_txm4x4 = xd->lossless[mi->mbmi.segment_id] ? vp10_highbd_fwht4x4
+                                                   : vpx_highbd_fdct4x4;
+  } else {
+    fwd_txm4x4 = xd->lossless[mi->mbmi.segment_id] ? vp10_fwht4x4 : vpx_fdct4x4;
+  }
+#else
+  fwd_txm4x4 = xd->lossless[mi->mbmi.segment_id] ? vp10_fwht4x4 : vpx_fdct4x4;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vpx_highbd_subtract_block(
+        height, width, vp10_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+        8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
+  } else {
+    vpx_subtract_block(
+        height, width, vp10_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+        8, src, p->src.stride, dst, pd->dst.stride);
+  }
+#else
+  vpx_subtract_block(height, width,
+                     vp10_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+                     8, src, p->src.stride, dst, pd->dst.stride);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // k is the raster index (two per row) of the 4x4 unit being coded.
+  for (idy = 0; idy < height / 4; ++idy) {
+    for (idx = 0; idx < width / 4; ++idx) {
+      int64_t ssz, rd, rd1, rd2;
+      tran_low_t* coeff;
+
+      k = i + (idy * 2 + idx);  // absolute, so offsets never accumulate
+      coeff = BLOCK_OFFSET(p->coeff, k);
+      fwd_txm4x4(vp10_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
+                 coeff, 8);
+      vp10_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        thisdistortion += vp10_highbd_block_error(coeff,
+                                                 BLOCK_OFFSET(pd->dqcoeff, k),
+                                                 16, &ssz, xd->bd);
+      } else {
+        thisdistortion += vp10_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
+                                          16, &ssz);
+      }
+#else
+      thisdistortion += vp10_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
+                                        16, &ssz);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      thissse += ssz;
+      thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
+                              so->scan, so->neighbors,
+                              cpi->sf.use_fast_coef_costing);
+      rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
+      rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
+      rd = VPXMIN(rd1, rd2);
+      if (rd >= best_yrd)  // already no better than the caller's budget
+        return INT64_MAX;
+    }
+  }
+
+  *distortion = thisdistortion >> 2;
+  *labelyrate = thisrate;
+  *sse = thissse >> 2;
+
+  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
+}
+
+typedef struct {  // RD statistics of one (4x4 position, inter mode) pair
+  int eobs;  // p->eobs[] value captured after the block was encoded
+  int brate;  // mode/mv rate; byrate is added once the block is encoded
+  int byrate;  // coefficient rate (labelyrate) from encode_inter_mb_segment()
+  int64_t bdist;  // distortion from encode_inter_mb_segment()
+  int64_t bsse;  // sse from encode_inter_mb_segment()
+  int64_t brdcost;  // rd cost of the pair; INT64_MAX when not usable
+  int_mv mvs[2];  // mv per reference frame for this mode
+  ENTROPY_CONTEXT ta[2];  // above entropy context, seeded from t_above
+  ENTROPY_CONTEXT tl[2];  // left entropy context, seeded from t_left
+} SEG_RDSTAT;
+
+typedef struct {  // One rd_pick_best_sub8x8_mode() result slot (per filter_idx)
+  int_mv *ref_mv[2];  // best_ref_mv and second_best_ref_mv from the caller
+  int_mv mvp;  // mv predictor that seeds the NEWMV search
+
+  int64_t segment_rd;  // in: rd budget (best_rd); out: total rd or INT64_MAX
+  int r;  // total rate of the chosen modes
+  int64_t d;  // total distortion of the chosen modes
+  int64_t sse;  // total sse of the chosen modes
+  int segment_yrate;  // sum of the chosen modes' byrate
+  PREDICTION_MODE modes[4];  // mode chosen for each 4x4 position
+  SEG_RDSTAT rdstat[4][INTER_MODES];  // stats per 4x4 position and inter mode
+  int mvthresh;  // caller-supplied threshold; label_mv_thresh derives from it
+} BEST_SEG_INFO;
+
+static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
+  // Nonzero when mv, shifted right by 3, falls outside x's mv row/col limits.
+  const int row = mv->row >> 3, col = mv->col >> 3;
+  const int row_ok = row >= x->mv_row_min && row <= x->mv_row_max;
+  return !(row_ok && col >= x->mv_col_min && col <= x->mv_col_max);
+}
+
+static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
+  // Moves the luma source and prediction pointers to sub-block i of the 8x8;
+  // mi_buf_restore() puts the saved originals back.
+  struct macroblock_plane *const src_plane = &x->plane[0];
+  struct macroblockd_plane *const ref_plane = &x->e_mbd.plane[0];
+  const int num_refs = 1 + has_second_ref(&x->e_mbd.mi[0]->mbmi);
+  int ref;
+  src_plane->src.buf +=
+      vp10_raster_block_offset(BLOCK_8X8, i, src_plane->src.stride);
+  assert(((intptr_t)ref_plane->pre[0].buf & 0x7) == 0);
+  for (ref = 0; ref < num_refs; ++ref)
+    ref_plane->pre[ref].buf +=
+        vp10_raster_block_offset(BLOCK_8X8, i, ref_plane->pre[ref].stride);
+}
+
+static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
+                                  struct buf_2d orig_pre[2]) {
+  // Puts back the luma buffers that were saved before mi_buf_shift() ran.
+  struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
+  x->plane[0].src = orig_src;
+  pd->pre[0] = orig_pre[0];
+  if (has_second_ref(&x->e_mbd.mi[0]->mbmi)) pd->pre[1] = orig_pre[1];
+}
+
+static INLINE int mv_has_subpel(const MV *mv) {
+  return ((mv->row | mv->col) & 0x0F) != 0;  // any of the low 4 bits set
+}
+
+// Returns 0 if this_mode signals all-zero motion that another of NEARESTMV/
+// NEARMV/ZEROMV signals more cheaply (ZEROMV also loses ties), 1 otherwise.
+// TODO(aconverse): Find out if this is still productive then clean up or remove
+static int check_best_zero_mv(
+    const VP10_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
+    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int this_mode,
+    const MV_REFERENCE_FRAME ref_frames[2]) {
+  const MV_REFERENCE_FRAME rf0 = ref_frames[0];
+  const MV_REFERENCE_FRAME rf1 = ref_frames[1];
+  int near_cost, nearest_cost, zero_cost;
+
+  // Only the three modes that can express zero motion are ever rejected.
+  if (this_mode != NEARMV && this_mode != NEARESTMV && this_mode != ZEROMV)
+    return 1;
+  // ... and only when every reference in use really has a zero vector.
+  if (frame_mv[this_mode][rf0].as_int != 0 ||
+      (rf1 != NONE && frame_mv[this_mode][rf1].as_int != 0))
+    return 1;
+
+  near_cost = cost_mv_ref(cpi, NEARMV, mode_context[rf0]);
+  nearest_cost = cost_mv_ref(cpi, NEARESTMV, mode_context[rf0]);
+  zero_cost = cost_mv_ref(cpi, ZEROMV, mode_context[rf0]);
+
+  // NEARMV / NEARESTMV survive unless ZEROMV is strictly cheaper.
+  if (this_mode == NEARMV) return near_cost <= zero_cost;
+  if (this_mode == NEARESTMV) return nearest_cost <= zero_cost;
+
+  assert(this_mode == ZEROMV);
+  // ZEROMV is dropped when NEARESTMV or NEARMV is zero for every reference in
+  // use and costs no more than ZEROMV itself.
+  return !((zero_cost >= nearest_cost &&
+            frame_mv[NEARESTMV][rf0].as_int == 0 &&
+            (rf1 == NONE || frame_mv[NEARESTMV][rf1].as_int == 0)) ||
+           (zero_cost >= near_cost && frame_mv[NEARMV][rf0].as_int == 0 &&
+            (rf1 == NONE || frame_mv[NEARMV][rf1].as_int == 0)));
+}
+
+static void joint_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
+                                BLOCK_SIZE bsize,
+                                int_mv *frame_mv,
+                                int mi_row, int mi_col,
+                                int_mv single_newmv[MAX_REF_FRAMES],
+                                int *rate_mv) {
+  const VP10_COMMON *const cm = &cpi->common;
+  const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+  const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const int refs[2] = {mbmi->ref_frame[0],
+                       mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]};
+  int_mv ref_mv[2];
+  int ite, ref;
+  const InterpKernel *kernel = vp10_filter_kernels[mbmi->interp_filter];
+  struct scale_factors sf;
+  // Joint motion search for compound mode: alternately re-search each
+  // reference's mv against the other reference's prediction (see loop below).
+  struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+  int last_besterr[2] = {INT_MAX, INT_MAX};
+  const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+    vp10_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
+    vp10_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
+  };
+
+  // Prediction buffer from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+  uint8_t *second_pred;
+#else
+  DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  for (ref = 0; ref < 2; ++ref) {
+    ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
+
+    if (scaled_ref_frame[ref]) {
+      int i;
+      // Swap out the reference frame for a version that's been scaled to
+      // match the resolution of the current frame, allowing the existing
+      // motion search code to be used without additional modifications.
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        backup_yv12[ref][i] = xd->plane[i].pre[ref];
+      vp10_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+                           NULL);
+    }
+
+    frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
+  }
+
+  // Since we have scaled the reference frames to match the size of the current
+  // frame we must use a unit scaling factor during mode selection.
+#if CONFIG_VP9_HIGHBITDEPTH
+  vp10_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+                                    cm->width, cm->height,
+                                    cm->use_highbitdepth);
+#else
+  vp10_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+                                    cm->width, cm->height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Allow joint search multiple times iteratively for each reference frame
+  // and break out of the search loop if it couldn't find a better mv.
+  for (ite = 0; ite < 4; ite++) {
+    struct buf_2d ref_yv12[2];
+    int bestsme = INT_MAX;
+    int sadpb = x->sadperbit16;
+    MV tmp_mv;
+    int search_range = 3;
+
+    int tmp_col_min = x->mv_col_min;
+    int tmp_col_max = x->mv_col_max;
+    int tmp_row_min = x->mv_row_min;
+    int tmp_row_max = x->mv_row_max;
+    int id = ite % 2;  // Even iterations search in the first reference frame,
+                       // odd iterations search in the second. The predictor
+                       // found for the 'other' reference frame is factored in.
+
+    // Initialized here because of compiler problem in Visual Studio.
+    ref_yv12[0] = xd->plane[0].pre[0];
+    ref_yv12[1] = xd->plane[0].pre[1];
+
+    // Get the prediction block from the 'other' reference frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
+      vp10_highbd_build_inter_predictor(ref_yv12[!id].buf,
+                                       ref_yv12[!id].stride,
+                                       second_pred, pw,
+                                       &frame_mv[refs[!id]].as_mv,
+                                       &sf, pw, ph, 0,
+                                       kernel, MV_PRECISION_Q3,
+                                       mi_col * MI_SIZE, mi_row * MI_SIZE,
+                                       xd->bd);
+    } else {
+      second_pred = (uint8_t *)second_pred_alloc_16;
+      vp10_build_inter_predictor(ref_yv12[!id].buf,
+                                ref_yv12[!id].stride,
+                                second_pred, pw,
+                                &frame_mv[refs[!id]].as_mv,
+                                &sf, pw, ph, 0,
+                                kernel, MV_PRECISION_Q3,
+                                mi_col * MI_SIZE, mi_row * MI_SIZE);
+    }
+#else
+    vp10_build_inter_predictor(ref_yv12[!id].buf,
+                              ref_yv12[!id].stride,
+                              second_pred, pw,
+                              &frame_mv[refs[!id]].as_mv,
+                              &sf, pw, ph, 0,
+                              kernel, MV_PRECISION_Q3,
+                              mi_col * MI_SIZE, mi_row * MI_SIZE);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    // Do compound motion search on the current reference frame.
+    if (id)
+      xd->plane[0].pre[0] = ref_yv12[id];
+    vp10_set_mv_search_range(x, &ref_mv[id].as_mv);
+
+    // Use the mv result from the single mode as mv predictor.
+    tmp_mv = frame_mv[refs[id]].as_mv;
+
+    tmp_mv.col >>= 3;
+    tmp_mv.row >>= 3;
+
+    // Small-range full-pixel motion search.
+    bestsme = vp10_refining_search_8p_c(x, &tmp_mv, sadpb,
+                                       search_range,
+                                       &cpi->fn_ptr[bsize],
+                                       &ref_mv[id].as_mv, second_pred);
+    if (bestsme < INT_MAX)
+      bestsme = vp10_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
+                                      second_pred, &cpi->fn_ptr[bsize], 1);
+
+    x->mv_col_min = tmp_col_min;
+    x->mv_col_max = tmp_col_max;
+    x->mv_row_min = tmp_row_min;
+    x->mv_row_max = tmp_row_max;
+
+    if (bestsme < INT_MAX) {
+      int dis; /* TODO: use dis in distortion calculation later. */
+      unsigned int sse;
+      bestsme = cpi->find_fractional_mv_step(
+          x, &tmp_mv,
+          &ref_mv[id].as_mv,
+          cpi->common.allow_high_precision_mv,
+          x->errorperbit,
+          &cpi->fn_ptr[bsize],
+          0, cpi->sf.mv.subpel_iters_per_step,
+          NULL,
+          x->nmvjointcost, x->mvcost,
+          &dis, &sse, second_pred,
+          pw, ph);
+    }
+
+    // Restore the pointer to the first (possibly scaled) prediction buffer.
+    if (id)
+      xd->plane[0].pre[0] = ref_yv12[0];
+    // Keep the new mv only if it lowered the error; otherwise stop iterating.
+    if (bestsme < last_besterr[id]) {
+      frame_mv[refs[id]].as_mv = tmp_mv;
+      last_besterr[id] = bestsme;
+    } else {
+      break;
+    }
+  }
+  // Report the summed bit cost of both final mvs through *rate_mv.
+  *rate_mv = 0;
+
+  for (ref = 0; ref < 2; ++ref) {
+    if (scaled_ref_frame[ref]) {
+      // Restore the prediction frame pointers to their unscaled versions.
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        xd->plane[i].pre[ref] = backup_yv12[ref][i];
+    }
+
+    *rate_mv += vp10_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+                                &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
+                                x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+  }
+}
+
+static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x,
+                                        int_mv *best_ref_mv,
+                                        int_mv *second_best_ref_mv,
+                                        int64_t best_rd, int *returntotrate,
+                                        int *returnyrate,
+                                        int64_t *returndistortion,
+                                        int *skippable, int64_t *psse,
+                                        int mvthresh,
+                                        int_mv seg_mvs[4][MAX_REF_FRAMES],
+                                        BEST_SEG_INFO *bsi_buf, int filter_idx,
+                                        int mi_row, int mi_col) {
+  int i;
+  BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mi[0];
+  MB_MODE_INFO *mbmi = &mi->mbmi;
+  int mode_idx;
+  int k, br = 0, idx, idy;
+  int64_t bd = 0, block_sse = 0;
+  PREDICTION_MODE this_mode;
+  VP10_COMMON *cm = &cpi->common;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int label_count = 4;
+  int64_t this_segment_rd = 0;
+  int label_mv_thresh;
+  int segmentyrate = 0;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  ENTROPY_CONTEXT t_above[2], t_left[2];
+  int subpelmv = 1, have_ref = 0;
+  const int has_second_rf = has_second_ref(mbmi);
+  const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  // Chooses the best inter mode/mv per sub-8x8 block; returns the total rd.
+  vp10_zero(*bsi);
+  // bsi (slot filter_idx of bsi_buf) collects the results; seed it first.
+  bsi->segment_rd = best_rd;
+  bsi->ref_mv[0] = best_ref_mv;
+  bsi->ref_mv[1] = second_best_ref_mv;
+  bsi->mvp.as_int = best_ref_mv->as_int;
+  bsi->mvthresh = mvthresh;
+
+  for (i = 0; i < 4; i++)
+    bsi->modes[i] = ZEROMV;
+
+  memcpy(t_above, pd->above_context, sizeof(t_above));
+  memcpy(t_left, pd->left_context, sizeof(t_left));
+
+  // Per-block threshold below which no NEWMV search is attempted. The scale
+  // factor is 1, making it roughly equal to the macroblock threshold; a
+  // factor of 64 would make it so large that mvs on segments were very
+  // rarely checked.
+  label_mv_thresh = 1 * bsi->mvthresh / label_count;
+
+  // Visit each sub-block; the loop steps are the block size in 4x4 units.
+  for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+    for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+      // TODO(jingning,rbultje): rewrite the rate-distortion optimization
+      // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
+      int_mv mode_mv[MB_MODE_COUNT][2];
+      int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+      PREDICTION_MODE mode_selected = ZEROMV;
+      int64_t best_rd = INT64_MAX;  // NOTE: shadows the best_rd parameter
+      const int i = idy * 2 + idx;  // NOTE: shadows the function-level i
+      int ref;
+
+      for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+        const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+        frame_mv[ZEROMV][frame].as_int = 0;
+        vp10_append_sub8x8_mvs_for_idx(cm, xd, i, ref, mi_row, mi_col,
+                                      &frame_mv[NEARESTMV][frame],
+                                      &frame_mv[NEARMV][frame],
+                                      mbmi_ext->mode_context);
+      }
+
+      // search for the best motion vector on this segment
+      for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+        const struct buf_2d orig_src = x->plane[0].src;
+        struct buf_2d orig_pre[2];
+
+        mode_idx = INTER_OFFSET(this_mode);
+        bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
+        if (!(inter_mode_mask & (1 << this_mode)))
+          continue;
+
+        if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
+                                this_mode, mbmi->ref_frame))
+          continue;
+
+        memcpy(orig_pre, pd->pre, sizeof(orig_pre));
+        memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
+               sizeof(bsi->rdstat[i][mode_idx].ta));
+        memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
+               sizeof(bsi->rdstat[i][mode_idx].tl));
+
+        // motion search for newmv (single predictor case only)
+        if (!has_second_rf && this_mode == NEWMV &&
+            seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
+          MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
+          int step_param = 0;
+          int bestsme = INT_MAX;
+          int sadpb = x->sadperbit4;
+          MV mvp_full;
+          int max_mv;
+          int cost_list[5];
+
+          /* Is the best so far sufficiently good that we can't justify doing
+           * a new motion search? */
+          if (best_rd < label_mv_thresh)
+            break;
+
+          if (cpi->oxcf.mode != BEST) {
+            // use previous block's result as next block's MV predictor.
+            if (i > 0) {
+              bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
+              if (i == 2)
+                bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
+            }
+          }
+          if (i == 0)
+            max_mv = x->max_mv_context[mbmi->ref_frame[0]];
+          else
+            max_mv =
+                VPXMAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
+
+          if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+            // Take wtd average of the step_params based on the last frame's
+            // max mv magnitude and the best ref mvs of the current block for
+            // the given reference.
+            step_param = (vp10_init_search_range(max_mv) +
+                              cpi->mv_step_param) / 2;
+          } else {
+            step_param = cpi->mv_step_param;
+          }
+
+          mvp_full.row = bsi->mvp.as_mv.row >> 3;
+          mvp_full.col = bsi->mvp.as_mv.col >> 3;
+
+          if (cpi->sf.adaptive_motion_search) {
+            mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
+            mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
+            step_param = VPXMAX(step_param, 8);
+          }
+
+          // adjust src pointer for this block
+          mi_buf_shift(x, i);
+
+          vp10_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
+
+          bestsme = vp10_full_pixel_search(
+              cpi, x, bsize, &mvp_full, step_param, sadpb,
+              cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
+              &bsi->ref_mv[0]->as_mv, new_mv,
+              INT_MAX, 1);
+
+          if (bestsme < INT_MAX) {
+            int distortion;
+            cpi->find_fractional_mv_step(
+                x,
+                new_mv,
+                &bsi->ref_mv[0]->as_mv,
+                cm->allow_high_precision_mv,
+                x->errorperbit, &cpi->fn_ptr[bsize],
+                cpi->sf.mv.subpel_force_stop,
+                cpi->sf.mv.subpel_iters_per_step,
+                cond_cost_list(cpi, cost_list),
+                x->nmvjointcost, x->mvcost,
+                &distortion,
+                &x->pred_sse[mbmi->ref_frame[0]],
+                NULL, 0, 0);
+
+            // save motion search result for use in compound prediction
+            seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
+          }
+
+          if (cpi->sf.adaptive_motion_search)
+            x->pred_mv[mbmi->ref_frame[0]] = *new_mv;
+
+          // restore src pointers
+          mi_buf_restore(x, orig_src, orig_pre);
+        }
+
+        if (has_second_rf) {
+          if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
+              seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
+            continue;
+        }
+
+        if (has_second_rf && this_mode == NEWMV &&
+            mbmi->interp_filter == EIGHTTAP) {
+          // adjust src pointers
+          mi_buf_shift(x, i);
+          if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+            int rate_mv;
+            joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
+                                mi_row, mi_col, seg_mvs[i],
+                                &rate_mv);
+            seg_mvs[i][mbmi->ref_frame[0]].as_int =
+                frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
+            seg_mvs[i][mbmi->ref_frame[1]].as_int =
+                frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
+          }
+          // restore src pointers
+          mi_buf_restore(x, orig_src, orig_pre);
+        }
+
+        bsi->rdstat[i][mode_idx].brate =
+            set_and_cost_bmi_mvs(cpi, x, xd, i, this_mode, mode_mv[this_mode],
+                                 frame_mv, seg_mvs[i], bsi->ref_mv,
+                                 x->nmvjointcost, x->mvcost);
+
+        for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+          bsi->rdstat[i][mode_idx].mvs[ref].as_int =
+              mode_mv[this_mode][ref].as_int;
+          if (num_4x4_blocks_wide > 1)
+            bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int =
+                mode_mv[this_mode][ref].as_int;
+          if (num_4x4_blocks_high > 1)
+            bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int =
+                mode_mv[this_mode][ref].as_int;
+        }
+
+        // Trap vectors that reach beyond the UMV borders
+        if (mv_check_bounds(x, &mode_mv[this_mode][0].as_mv) ||
+            (has_second_rf &&
+             mv_check_bounds(x, &mode_mv[this_mode][1].as_mv)))
+          continue;
+
+        if (filter_idx > 0) {
+          BEST_SEG_INFO *ref_bsi = bsi_buf;
+          subpelmv = 0;
+          have_ref = 1;
+
+          for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+            subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
+            have_ref &= mode_mv[this_mode][ref].as_int ==
+                ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
+          }
+
+          if (filter_idx > 1 && !subpelmv && !have_ref) {
+            ref_bsi = bsi_buf + 1;
+            have_ref = 1;
+            for (ref = 0; ref < 1 + has_second_rf; ++ref)
+              have_ref &= mode_mv[this_mode][ref].as_int ==
+                  ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
+          }
+
+          if (!subpelmv && have_ref &&
+              ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
+            memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
+                   sizeof(SEG_RDSTAT));
+            if (num_4x4_blocks_wide > 1)
+              bsi->rdstat[i + 1][mode_idx].eobs =
+                  ref_bsi->rdstat[i + 1][mode_idx].eobs;
+            if (num_4x4_blocks_high > 1)
+              bsi->rdstat[i + 2][mode_idx].eobs =
+                  ref_bsi->rdstat[i + 2][mode_idx].eobs;
+
+            if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
+              mode_selected = this_mode;
+              best_rd = bsi->rdstat[i][mode_idx].brdcost;
+            }
+            continue;
+          }
+        }
+
+        bsi->rdstat[i][mode_idx].brdcost =
+            encode_inter_mb_segment(cpi, x,
+                                    bsi->segment_rd - this_segment_rd, i,
+                                    &bsi->rdstat[i][mode_idx].byrate,
+                                    &bsi->rdstat[i][mode_idx].bdist,
+                                    &bsi->rdstat[i][mode_idx].bsse,
+                                    bsi->rdstat[i][mode_idx].ta,
+                                    bsi->rdstat[i][mode_idx].tl,
+                                    idy, idx,
+                                    mi_row, mi_col);
+        if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
+          bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
+                                            bsi->rdstat[i][mode_idx].brate, 0);
+          bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
+          bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
+          if (num_4x4_blocks_wide > 1)
+            bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
+          if (num_4x4_blocks_high > 1)
+            bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
+        }
+
+        if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
+          mode_selected = this_mode;
+          best_rd = bsi->rdstat[i][mode_idx].brdcost;
+        }
+      } /*for each 4x4 mode*/
+
+      if (best_rd == INT64_MAX) {
+        int iy, midx;
+        for (iy = i + 1; iy < 4; ++iy)
+          for (midx = 0; midx < INTER_MODES; ++midx)
+            bsi->rdstat[iy][midx].brdcost = INT64_MAX;
+        bsi->segment_rd = INT64_MAX;
+        return INT64_MAX;
+      }
+
+      mode_idx = INTER_OFFSET(mode_selected);
+      memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
+      memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
+
+      set_and_cost_bmi_mvs(cpi, x, xd, i, mode_selected, mode_mv[mode_selected],
+                           frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
+                           x->mvcost);
+
+      br += bsi->rdstat[i][mode_idx].brate;
+      bd += bsi->rdstat[i][mode_idx].bdist;
+      block_sse += bsi->rdstat[i][mode_idx].bsse;
+      segmentyrate += bsi->rdstat[i][mode_idx].byrate;
+      this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
+
+      if (this_segment_rd > bsi->segment_rd) {
+        int iy, midx;
+        for (iy = i + 1; iy < 4; ++iy)
+          for (midx = 0; midx < INTER_MODES; ++midx)
+            bsi->rdstat[iy][midx].brdcost = INT64_MAX;
+        bsi->segment_rd = INT64_MAX;
+        return INT64_MAX;
+      }
+    }
+  } /* for each label */
+
+  bsi->r = br;
+  bsi->d = bd;
+  bsi->segment_yrate = segmentyrate;
+  bsi->segment_rd = this_segment_rd;
+  bsi->sse = block_sse;
+
+  // update the coding decisions
+  for (k = 0; k < 4; ++k)
+    bsi->modes[k] = mi->bmi[k].as_mode;
+
+  if (bsi->segment_rd > best_rd)  // best_rd is the function parameter here
+    return INT64_MAX;
+  /* set it to the best */
+  for (i = 0; i < 4; i++) {
+    mode_idx = INTER_OFFSET(bsi->modes[i]);
+    mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
+    if (has_second_ref(mbmi))
+      mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
+    x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
+    mi->bmi[i].as_mode = bsi->modes[i];
+  }
+
+  /*
+   * used to set mbmi->mv.as_int
+   */
+  *returntotrate = bsi->r;
+  *returndistortion = bsi->d;
+  *returnyrate = bsi->segment_yrate;
+  *skippable = vp10_is_skippable_in_plane(x, BLOCK_8X8, 0);
+  *psse = bsi->sse;
+  mbmi->mode = bsi->modes[3];
+
+  return bsi->segment_rd;
+}
+// Fills ref_costs_single/ref_costs_comp with per-reference signalling costs.
+static void estimate_ref_frame_costs(const VP10_COMMON *cm,
+                                     const MACROBLOCKD *xd,
+                                     int segment_id,
+                                     unsigned int *ref_costs_single,
+                                     unsigned int *ref_costs_comp,
+                                     vpx_prob *comp_mode_p) {
+  int seg_ref_active = segfeature_active(&cm->seg, segment_id,
+                                         SEG_LVL_REF_FRAME);
+  if (seg_ref_active) {  // SEG_LVL_REF_FRAME is active: all costs are zeroed.
+    memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
+    memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
+    *comp_mode_p = 128;
+  } else {
+    vpx_prob intra_inter_p = vp10_get_intra_inter_prob(cm, xd);
+    vpx_prob comp_inter_p = 128;
+    // *comp_mode_p mirrors comp_inter_p: 128 unless REFERENCE_MODE_SELECT.
+    if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+      comp_inter_p = vp10_get_reference_mode_prob(cm, xd);
+      *comp_mode_p = comp_inter_p;
+    } else {
+      *comp_mode_p = 128;
+    }
+
+    ref_costs_single[INTRA_FRAME] = vp10_cost_bit(intra_inter_p, 0);
+    // Single-reference costs; fixed at 512 when mode is COMPOUND_REFERENCE.
+    if (cm->reference_mode != COMPOUND_REFERENCE) {
+      vpx_prob ref_single_p1 = vp10_get_pred_prob_single_ref_p1(cm, xd);
+      vpx_prob ref_single_p2 = vp10_get_pred_prob_single_ref_p2(cm, xd);
+      unsigned int base_cost = vp10_cost_bit(intra_inter_p, 1);
+      // Under REFERENCE_MODE_SELECT, add the cost of bit 0 with comp_inter_p.
+      if (cm->reference_mode == REFERENCE_MODE_SELECT)
+        base_cost += vp10_cost_bit(comp_inter_p, 0);
+
+      ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
+          ref_costs_single[ALTREF_FRAME] = base_cost;
+      ref_costs_single[LAST_FRAME]   += vp10_cost_bit(ref_single_p1, 0);
+      ref_costs_single[GOLDEN_FRAME] += vp10_cost_bit(ref_single_p1, 1);
+      ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p1, 1);
+      ref_costs_single[GOLDEN_FRAME] += vp10_cost_bit(ref_single_p2, 0);
+      ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p2, 1);
+    } else {
+      ref_costs_single[LAST_FRAME]   = 512;
+      ref_costs_single[GOLDEN_FRAME] = 512;
+      ref_costs_single[ALTREF_FRAME] = 512;
+    }
+    if (cm->reference_mode != SINGLE_REFERENCE) {
+      vpx_prob ref_comp_p = vp10_get_pred_prob_comp_ref_p(cm, xd);
+      unsigned int base_cost = vp10_cost_bit(intra_inter_p, 1);
+      // Under REFERENCE_MODE_SELECT, add the cost of bit 1 with comp_inter_p.
+      if (cm->reference_mode == REFERENCE_MODE_SELECT)
+        base_cost += vp10_cost_bit(comp_inter_p, 1);
+
+      ref_costs_comp[LAST_FRAME]   = base_cost + vp10_cost_bit(ref_comp_p, 0);
+      ref_costs_comp[GOLDEN_FRAME] = base_cost + vp10_cost_bit(ref_comp_p, 1);
+    } else {
+      ref_costs_comp[LAST_FRAME]   = 512;
+      ref_costs_comp[GOLDEN_FRAME] = 512;
+    }
+  }
+}
+// Records the chosen mode's state in ctx so the encoder can restore it later.
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+                         int mode_index,
+                         int64_t comp_pred_diff[REFERENCE_MODES],
+                         int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
+                         int skippable) {
+  const size_t filter_diff_bytes =
+      SWITCHABLE_FILTER_CONTEXTS * sizeof(best_filter_diff[0]);
+
+  // Mode decision and skip flags.
+  ctx->best_mode_index = mode_index;
+  ctx->skippable = skippable;
+  ctx->skip = x->skip;
+  // Full copies of the mode info and its extension data.
+  ctx->mbmi_ext = *x->mbmi_ext;
+  ctx->mic = *x->e_mbd.mi[0];
+  // Prediction-type rd differences, narrowed to int.
+  ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
+  ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
+  ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
+  memcpy(ctx->best_filter_diff, best_filter_diff, filter_diff_bytes);
+}
+// Prepares ref_frame's prediction planes and its nearest/near MV candidates.
+static void setup_buffer_inter(VP10_COMP *cpi, MACROBLOCK *x,
+                               MV_REFERENCE_FRAME ref_frame,
+                               BLOCK_SIZE block_size,
+                               int mi_row, int mi_col,
+                               int_mv frame_nearest_mv[MAX_REF_FRAMES],
+                               int_mv frame_near_mv[MAX_REF_FRAMES],
+                               struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
+  const VP10_COMMON *cm = &cpi->common;
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
+  const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  // NOTE(review): ref_frame - 1 above presumes an inter reference; confirm.
+  assert(yv12 != NULL);
+
+  // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
+  // use the UV scaling factors.
+  vp10_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
+
+  // Gets an initial list of candidate vectors from neighbours and orders them
+  vp10_find_mv_refs(cm, xd, mi, ref_frame, candidates, mi_row, mi_col,
+                   NULL, NULL, mbmi_ext->mode_context);
+
+  // Candidate refinement carried out at encoder and decoder
+  vp10_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
+                         &frame_nearest_mv[ref_frame],
+                         &frame_near_mv[ref_frame]);
+
+  // Further refinement that is encode side only to test the top few candidates
+  // in full and choose the best as the centre point for subsequent searches.
+  // The current implementation doesn't support scaling.
+  if (!vp10_is_scaled(sf) && block_size >= BLOCK_8X8)
+    vp10_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
+                ref_frame, block_size);
+}
+// Full-pel then sub-pel NEWMV search for ref_frame[0]; INVALID_MV if skipped.
+static void single_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
+                                 BLOCK_SIZE bsize,
+                                 int mi_row, int mi_col,
+                                 int_mv *tmp_mv, int *rate_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const VP10_COMMON *cm = &cpi->common;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
+  int bestsme = INT_MAX;
+  int step_param;
+  int sadpb = x->sadperbit16;
+  MV mvp_full;
+  int ref = mbmi->ref_frame[0];
+  MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+  // Caller's MV limits, put back once the full-pixel search has run.
+  int tmp_col_min = x->mv_col_min;
+  int tmp_col_max = x->mv_col_max;
+  int tmp_row_min = x->mv_row_min;
+  int tmp_row_max = x->mv_row_max;
+  int cost_list[5];
+
+  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp10_get_scaled_ref_frame(cpi,
+                                                                        ref);
+
+  MV pred_mv[3];
+  pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+  pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
+  pred_mv[2] = x->pred_mv[ref];
+
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+
+    vp10_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+  // Work out the size of the first step in the mv step search.
+  // 0 here is maximum length first step. 1 is VPXMAX >> 1 etc.
+  if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+    // Take wtd average of the step_params based on the last frame's
+    // max mv magnitude and that based on the best ref mvs of the current
+    // block for the given reference.
+    step_param = (vp10_init_search_range(x->max_mv_context[ref]) +
+                    cpi->mv_step_param) / 2;
+  } else {
+    step_param = cpi->mv_step_param;
+  }
+
+  if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
+    int boffset =
+        2 * (b_width_log2_lookup[BLOCK_64X64] -
+             VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+    step_param = VPXMAX(step_param, boffset);
+  }
+
+  if (cpi->sf.adaptive_motion_search) {
+    int bwl = b_width_log2_lookup[bsize];
+    int bhl = b_height_log2_lookup[bsize];
+    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
+
+    if (tlevel < 5)
+      step_param += 2;
+
+    // prev_mv_sad is not setup for dynamically scaled frames.
+    if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+      int i;
+      for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+        if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+          x->pred_mv[ref].row = 0;
+          x->pred_mv[ref].col = 0;
+          tmp_mv->as_int = INVALID_MV;
+
+          if (scaled_ref_frame) {
+            int plane;
+            for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+              xd->plane[plane].pre[0] = backup_yv12[plane];
+          }
+          return;  // *rate_mv is left unset on this path.
+        }
+      }
+    }
+  }
+
+  // Narrow the MV limits only after the early-out above; restored post-search.
+  vp10_set_mv_search_range(x, &ref_mv);
+
+  mvp_full = pred_mv[x->mv_best_ref_index[ref]];
+  mvp_full.col >>= 3;
+  mvp_full.row >>= 3;
+
+  bestsme = vp10_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+                                  cond_cost_list(cpi, cost_list),
+                                  &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
+
+  x->mv_col_min = tmp_col_min;
+  x->mv_col_max = tmp_col_max;
+  x->mv_row_min = tmp_row_min;
+  x->mv_row_max = tmp_row_max;
+
+  if (bestsme < INT_MAX) {
+    int dis;  /* TODO: use dis in distortion calculation later. */
+    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
+                                 cm->allow_high_precision_mv,
+                                 x->errorperbit,
+                                 &cpi->fn_ptr[bsize],
+                                 cpi->sf.mv.subpel_force_stop,
+                                 cpi->sf.mv.subpel_iters_per_step,
+                                 cond_cost_list(cpi, cost_list),
+                                 x->nmvjointcost, x->mvcost,
+                                 &dis, &x->pred_sse[ref], NULL, 0, 0);
+  }
+  *rate_mv = vp10_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
+                             x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+
+  if (cpi->sf.adaptive_motion_search)
+    x->pred_mv[ref] = tmp_mv->as_mv;
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+}
+
+// Points every plane's destination buffer and stride back at the saved
+// orig_dst / orig_dst_stride values.
+static INLINE void restore_dst_buf(MACROBLOCKD *xd,
+                                   uint8_t *orig_dst[MAX_MB_PLANE],
+                                   int orig_dst_stride[MAX_MB_PLANE]) {
+  int plane;
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    xd->plane[plane].dst.stride = orig_dst_stride[plane];
+    xd->plane[plane].dst.buf = orig_dst[plane];
+  }
+}
+
+// In some situations we want to discount the apparent cost of a new motion
+// vector. Where there is a subtle motion field, and especially where spatial
+// complexity is low, a single block may not be able to cover the cost of a
+// new motion vector even if that vector reduces distortion. Once established,
+// however, the vector may be reused through the nearest and near mv modes to
+// reduce distortion in subsequent blocks and improve visual quality.
+// Returns 1 when the discount applies, 0 otherwise.
+static int discount_newmv_test(const VP10_COMP *cpi,
+                               int this_mode,
+                               int_mv this_mv,
+                               int_mv (*mode_mv)[MAX_REF_FRAMES],
+                               int ref_frame) {
+  if (cpi->rc.is_src_frame_alt_ref) return 0;
+  if (this_mode != NEWMV || this_mv.as_int == 0) return 0;
+  if (mode_mv[NEARESTMV][ref_frame].as_int != 0 &&
+      mode_mv[NEARESTMV][ref_frame].as_int != INVALID_MV)
+    return 0;
+  return (mode_mv[NEARMV][ref_frame].as_int == 0) ||
+         (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV);
+}
+// Clamp margins: (border - interp extend) << 3; presumably 1/8-pel MV units.
+#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS - VP9_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\
+                                VP9_INTERP_EXTEND) << 3)
+
+// TODO(jingning): this mv clamping function should be block size dependent.
+static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
+  clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+               xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+               xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+               xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+}
+// Rates one inter mode. Returns INT64_MAX if the mode is rejected, else 0.
+static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
+                                 BLOCK_SIZE bsize,
+                                 int *rate2, int64_t *distortion,
+                                 int *skippable,
+                                 int *rate_y, int *rate_uv,
+                                 int *disable_skip,
+                                 int_mv (*mode_mv)[MAX_REF_FRAMES],
+                                 int mi_row, int mi_col,
+                                 int_mv single_newmv[MAX_REF_FRAMES],
+                                 INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
+                                 int (*single_skippable)[MAX_REF_FRAMES],
+                                 int64_t *psse,
+                                 const int64_t ref_best_rd,
+                                 int64_t *mask_filter,
+                                 int64_t filter_cache[]) {
+  VP10_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const int is_comp_pred = has_second_ref(mbmi);
+  const int this_mode = mbmi->mode;
+  int_mv *frame_mv = mode_mv[this_mode];
+  int i;
+  int refs[2] = { mbmi->ref_frame[0],
+    (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+  int_mv cur_mv[2];
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * 64 * 64]);
+  uint8_t *tmp_buf;
+#else
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int pred_exists = 0;
+  int intpel_mv;
+  int64_t rd, tmp_rd, best_rd = INT64_MAX;
+  int best_needs_copy = 0;
+  uint8_t *orig_dst[MAX_MB_PLANE];
+  int orig_dst_stride[MAX_MB_PLANE];
+  int rs = 0;
+  INTERP_FILTER best_filter = SWITCHABLE;
+  uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0};
+  int64_t bsse[MAX_MB_PLANE << 2] = {0};
+
+  int bsl = mi_width_log2_lookup[bsize];
+  int pred_filter_search = cpi->sf.cb_pred_filter_search ?
+      (((mi_row + mi_col) >> bsl) +
+       get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
+
+  int skip_txfm_sb = 0;
+  int64_t skip_sse_sb = INT64_MAX;
+  int64_t distortion_y = 0, distortion_uv = 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
+  } else {
+    tmp_buf = (uint8_t *)tmp_buf16;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // Optionally seed best_filter from the above/left neighbours' filters.
+  if (pred_filter_search) {
+    INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
+    if (xd->up_available)
+      af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
+    if (xd->left_available)
+      lf = xd->mi[-1]->mbmi.interp_filter;
+
+    if ((this_mode != NEWMV) || (af == lf))
+      best_filter = af;
+  }
+
+  if (is_comp_pred) {
+    if (frame_mv[refs[0]].as_int == INVALID_MV ||
+        frame_mv[refs[1]].as_int == INVALID_MV)
+      return INT64_MAX;
+
+    if (cpi->sf.adaptive_mode_search) {
+      if (single_filter[this_mode][refs[0]] ==
+          single_filter[this_mode][refs[1]])
+        best_filter = single_filter[this_mode][refs[0]];
+    }
+  }
+
+  if (this_mode == NEWMV) {
+    int rate_mv;
+    if (is_comp_pred) {
+      // Initialize mv using single prediction mode result.
+      frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+      frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+      if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+        joint_motion_search(cpi, x, bsize, frame_mv,
+                            mi_row, mi_col, single_newmv, &rate_mv);
+      } else {
+        rate_mv  = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv,
+                                   &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+        rate_mv += vp10_mv_bit_cost(&frame_mv[refs[1]].as_mv,
+                                   &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+      }
+      *rate2 += rate_mv;
+    } else {
+      int_mv tmp_mv;
+      single_motion_search(cpi, x, bsize, mi_row, mi_col,
+                           &tmp_mv, &rate_mv);
+      if (tmp_mv.as_int == INVALID_MV)
+        return INT64_MAX;
+
+      frame_mv[refs[0]].as_int =
+          xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+      single_newmv[refs[0]].as_int = tmp_mv.as_int;
+
+      // Estimate the rate implications of a new mv but discount this
+      // under certain circumstances where we want to help initiate a weak
+      // motion field, where the distortion gain for a single block may not
+      // be enough to overcome the cost of a new mv.
+      if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
+        *rate2 += VPXMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+      } else {
+        *rate2 += rate_mv;
+      }
+    }
+  }
+
+  for (i = 0; i < is_comp_pred + 1; ++i) {
+    cur_mv[i] = frame_mv[refs[i]];
+    // Clip "next_nearest" so that it does not extend too far out of image
+    if (this_mode != NEWMV)
+      clamp_mv2(&cur_mv[i].as_mv, xd);
+
+    if (mv_check_bounds(x, &cur_mv[i].as_mv))
+      return INT64_MAX;
+    mbmi->mv[i].as_int = cur_mv[i].as_int;
+  }
+
+  // do first prediction into the destination buffer. Do the next
+  // prediction into a temporary buffer. Then keep track of which one
+  // of these currently holds the best predictor, and use the other
+  // one for future predictions. In the end, copy from tmp_buf to
+  // dst if necessary.
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    orig_dst[i] = xd->plane[i].dst.buf;
+    orig_dst_stride[i] = xd->plane[i].dst.stride;
+  }
+
+  // We don't include the cost of the second reference here, because there
+  // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
+  // words if you present them in that order, the second one is always known
+  // if the first is known.
+  //
+  // Under some circumstances we discount the cost of new mv mode to encourage
+  // initiation of a motion field.
+  if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]],
+                          mode_mv, refs[0])) {
+    *rate2 += VPXMIN(cost_mv_ref(cpi, this_mode,
+                                 mbmi_ext->mode_context[refs[0]]),
+                     cost_mv_ref(cpi, NEARESTMV,
+                                 mbmi_ext->mode_context[refs[0]]));
+  } else {
+    *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]);
+  }
+
+  if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
+      mbmi->mode != NEARESTMV)
+    return INT64_MAX;
+
+  pred_exists = 0;
+  // Are all MVs integer pel for Y and UV
+  intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
+  if (is_comp_pred)
+    intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
+
+  // Search for best switchable filter by checking the variance of
+  // pred error irrespective of whether the filter will be used
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    filter_cache[i] = INT64_MAX;
+
+  if (cm->interp_filter != BILINEAR) {
+    if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
+      best_filter = EIGHTTAP;
+    } else if (best_filter == SWITCHABLE) {
+      int newbest;
+      int tmp_rate_sum = 0;
+      int64_t tmp_dist_sum = 0;
+
+      for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+        int j;
+        int64_t rs_rd;
+        int tmp_skip_sb = 0;
+        int64_t tmp_skip_sse = INT64_MAX;
+
+        mbmi->interp_filter = i;
+        rs = vp10_get_switchable_rate(cpi, xd);
+        rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+
+        if (i > 0 && intpel_mv) {
+          rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
+          filter_cache[i] = rd;
+          filter_cache[SWITCHABLE_FILTERS] =
+              VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+          if (cm->interp_filter == SWITCHABLE)
+            rd += rs_rd;
+          *mask_filter = VPXMAX(*mask_filter, rd);
+        } else {
+          int rate_sum = 0;
+          int64_t dist_sum = 0;
+          if (i > 0 && cpi->sf.adaptive_interp_filter_search &&
+              (cpi->sf.interp_filter_search_mask & (1 << i))) {
+            rate_sum = INT_MAX;
+            dist_sum = INT64_MAX;
+            continue;
+          }
+
+          if ((cm->interp_filter == SWITCHABLE &&
+               (!i || best_needs_copy)) ||
+              (cm->interp_filter != SWITCHABLE &&
+               (cm->interp_filter == mbmi->interp_filter ||
+                (i == 0 && intpel_mv)))) {
+            restore_dst_buf(xd, orig_dst, orig_dst_stride);
+          } else {
+            for (j = 0; j < MAX_MB_PLANE; j++) {
+              xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
+              xd->plane[j].dst.stride = 64;
+            }
+          }
+          vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum,
+                          &tmp_skip_sb, &tmp_skip_sse);
+
+          rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
+          filter_cache[i] = rd;
+          filter_cache[SWITCHABLE_FILTERS] =
+              VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+          if (cm->interp_filter == SWITCHABLE)
+            rd += rs_rd;
+          *mask_filter = VPXMAX(*mask_filter, rd);
+
+          if (i == 0 && intpel_mv) {
+            tmp_rate_sum = rate_sum;
+            tmp_dist_sum = dist_sum;
+          }
+        }
+
+        if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+          if (rd / 2 > ref_best_rd) {
+            restore_dst_buf(xd, orig_dst, orig_dst_stride);
+            return INT64_MAX;
+          }
+        }
+        newbest = i == 0 || rd < best_rd;
+
+        if (newbest) {
+          best_rd = rd;
+          best_filter = mbmi->interp_filter;
+          if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
+            best_needs_copy = !best_needs_copy;
+        }
+
+        if ((cm->interp_filter == SWITCHABLE && newbest) ||
+            (cm->interp_filter != SWITCHABLE &&
+             cm->interp_filter == mbmi->interp_filter)) {
+          pred_exists = 1;
+          tmp_rd = best_rd;
+
+          skip_txfm_sb = tmp_skip_sb;
+          skip_sse_sb = tmp_skip_sse;
+          memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+          memcpy(bsse, x->bsse, sizeof(bsse));
+        }
+      }
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
+    }
+  }
+  // Set the appropriate filter
+  mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
+      cm->interp_filter : best_filter;
+  rs = cm->interp_filter == SWITCHABLE ? vp10_get_switchable_rate(cpi, xd) : 0;
+
+  if (pred_exists) {
+    if (best_needs_copy) {
+      // again temporarily set the buffers to local memory to prevent a memcpy
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
+        xd->plane[i].dst.stride = 64;
+      }
+    }
+    rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0);
+  } else {
+    int tmp_rate;
+    int64_t tmp_dist;
+    // Handles the special case when a filter that is not in the
+    // switchable list (ex. bilinear) is indicated at the frame level, or
+    // skip condition holds.
+    vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
+                    &skip_txfm_sb, &skip_sse_sb);
+    rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+    memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+    memcpy(bsse, x->bsse, sizeof(bsse));
+  }
+  // Remember the filter chosen for this single-reference mode/ref pair.
+  if (!is_comp_pred)
+    single_filter[this_mode][refs[0]] = mbmi->interp_filter;
+
+  if (cpi->sf.adaptive_mode_search)
+    if (is_comp_pred)
+      if (single_skippable[this_mode][refs[0]] &&
+          single_skippable[this_mode][refs[1]])
+        memset(skip_txfm, SKIP_TXFM_AC_DC, sizeof(skip_txfm));
+
+  if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+    // if current pred_error modeled rd is substantially more than the best
+    // so far, do not bother doing full rd
+    if (rd / 2 > ref_best_rd) {
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
+      return INT64_MAX;
+    }
+  }
+
+  if (cm->interp_filter == SWITCHABLE)
+    *rate2 += rs;
+
+  memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
+  memcpy(x->bsse, bsse, sizeof(bsse));
+
+  if (!skip_txfm_sb) {
+    int skippable_y, skippable_uv;
+    int64_t sseuv = INT64_MAX;
+    int64_t rdcosty = INT64_MAX;
+
+    // Y cost and distortion
+    vp10_subtract_plane(x, bsize, 0);
+    super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
+                    bsize, ref_best_rd);
+
+    if (*rate_y == INT_MAX) {
+      *rate2 = INT_MAX;
+      *distortion = INT64_MAX;
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
+      return INT64_MAX;
+    }
+
+    *rate2 += *rate_y;
+    *distortion += distortion_y;
+
+    rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
+    rdcosty = VPXMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
+
+    if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
+                          &sseuv, bsize, ref_best_rd - rdcosty)) {
+      *rate2 = INT_MAX;
+      *distortion = INT64_MAX;
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
+      return INT64_MAX;
+    }
+
+    *psse += sseuv;
+    *rate2 += *rate_uv;
+    *distortion += distortion_uv;
+    *skippable = skippable_y && skippable_uv;
+  } else {
+    x->skip = 1;
+    *disable_skip = 1;
+
+    // The cost of skip bit needs to be added.
+    *rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+
+    *distortion = skip_sse_sb;
+  }
+  // Cache single-ref skippability; the compound path above reads it.
+  if (!is_comp_pred)
+    single_skippable[this_mode][refs[0]] = *skippable;
+
+  restore_dst_buf(xd, orig_dst, orig_dst_stride);
+  return 0;  // The rate-distortion cost will be re-calculated by caller.
+}
+// Picks the intra Y and UV modes for the block and fills rd_cost and ctx.
+void vp10_rd_pick_intra_mode_sb(VP10_COMP *cpi, MACROBLOCK *x,
+                               RD_COST *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = xd->plane;
+  int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
+  int y_skip = 0, uv_skip = 0;
+  int64_t dist_y = 0, dist_uv = 0;
+  TX_SIZE max_uv_tx_size;
+  ctx->skip = 0;
+  xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+  xd->mi[0]->mbmi.ref_frame[1] = NONE;
+  // If the Y search cannot beat best_rd, flag failure via rate = INT_MAX.
+  if (bsize >= BLOCK_8X8) {
+    if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                               &dist_y, &y_skip, bsize,
+                               best_rd) >= best_rd) {
+      rd_cost->rate = INT_MAX;
+      return;
+    }
+  } else {
+    y_skip = 0;
+    if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                                     &dist_y, best_rd) >= best_rd) {
+      rd_cost->rate = INT_MAX;
+      return;
+    }
+  }
+  max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize,
+                                       pd[1].subsampling_x,
+                                       pd[1].subsampling_y);
+  rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
+                          &dist_uv, &uv_skip, VPXMAX(BLOCK_8X8, bsize),
+                          max_uv_tx_size);
+  // When both planes skip, drop the token-only rates and code skip = 1.
+  if (y_skip && uv_skip) {
+    rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
+                    vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+    rd_cost->dist = dist_y + dist_uv;
+  } else {
+    rd_cost->rate = rate_y + rate_uv +
+                      vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+    rd_cost->dist = dist_y + dist_uv;
+  }
+  // Snapshot the chosen mode info into ctx and compute the final rd cost.
+  ctx->mic = *xd->mi[0];
+  ctx->mbmi_ext = *x->mbmi_ext;
+  rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+}
+
+// Applies a bias to *this_rd based on the relative variance of the source
+// and the reconstruction, scaling it up by var_factor percent (may be 0).
+#define LOW_VAR_THRESH 16
+#define VLOW_ADJ_MAX 25
+#define VHIGH_ADJ_MAX 8
+static void rd_variance_adjustment(VP10_COMP *cpi,
+                                   MACROBLOCK *x,
+                                   BLOCK_SIZE bsize,
+                                   int64_t *this_rd,
+                                   MV_REFERENCE_FRAME ref_frame,
+                                   unsigned int source_variance) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  unsigned int recon_variance;
+  unsigned int absvar_diff = 0;
+  int64_t var_error = 0;
+  int64_t var_factor = 0;
+  // An INT64_MAX rd is left untouched.
+  if (*this_rd == INT64_MAX)
+    return;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    recon_variance =
+      vp10_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize, xd->bd);
+  } else {
+    recon_variance =
+      vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+  }
+#else
+  recon_variance =
+    vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // var_error is 0 for equal variances and approaches 100 as they diverge.
+  if ((source_variance + recon_variance) > LOW_VAR_THRESH) {
+    absvar_diff = (source_variance > recon_variance)
+      ? (source_variance - recon_variance)
+      : (recon_variance - source_variance);
+
+    var_error = ((int64_t)200 * source_variance * recon_variance) /
+      (((int64_t)source_variance * source_variance) +
+       ((int64_t)recon_variance * recon_variance));
+    var_error = 100 - var_error;
+  }
+
+  // Source variance above a threshold and ref frame is intra.
+  // This case is targeted mainly at discouraging intra modes that give rise
+  // to a predictor with a low spatial complexity compared to the source.
+  if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) &&
+      (source_variance > recon_variance)) {
+    var_factor = VPXMIN(absvar_diff, VPXMIN(VLOW_ADJ_MAX, var_error));
+  // A second possible case of interest is where the source variance
+  // is very low and we wish to discourage false texture or motion trails.
+  } else if ((source_variance < (LOW_VAR_THRESH >> 1)) &&
+             (recon_variance > source_variance)) {
+    var_factor = VPXMIN(absvar_diff, VPXMIN(VHIGH_ADJ_MAX, var_error));
+  }
+  *this_rd += (*this_rd * var_factor) / 100;
+}
+
+// Do we have an internal image edge (e.g. formatting bars)? Only reported
+// when oxcf.pass == 2 and the frame stats show inactive rows or columns.
+int vp10_internal_image_edge(VP10_COMP *cpi) {
+  if (cpi->oxcf.pass != 2) return 0;
+  return (cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
+         (cpi->twopass.this_frame_stats.inactive_zone_cols > 0);
+}
+
+// Tests whether the mi rows [mi_row, mi_row + mi_step) include the top or
+// bottom boundary of the active image. These are normally the frame's own
+// edges, but in pass 2 they are moved inward by any inactive rows recorded
+// in the frame stats (e.g. formatting bars embedded in the stream).
+int vp10_active_h_edge(VP10_COMP *cpi, int mi_row, int mi_step) {
+  int first_row = 0;
+  int last_row = cpi->common.mi_rows;
+
+  if (cpi->oxcf.pass == 2) {
+    // The stats count inactive rows in MBs; doubling converts to mi units.
+    // The same inset is applied at both the top and the bottom.
+    const int inset =
+        (int)(cpi->twopass.this_frame_stats.inactive_zone_rows * 2);
+
+    first_row = inset;
+    // Never let the bottom boundary cross above the top one.
+    last_row = VPXMAX(first_row, last_row - inset);
+  }
+
+  // Either boundary falling inside the span makes this an edge block.
+  if ((first_row >= mi_row) && (first_row < (mi_row + mi_step)))
+    return 1;
+  if ((last_row >= mi_row) && (last_row < (mi_row + mi_step)))
+    return 1;
+  return 0;
+}
+
+// Tests whether the mi columns [mi_col, mi_col + mi_step) include the left
+// or right boundary of the active image. These are normally the frame's own
+// edges, but in pass 2 they are moved inward by any inactive columns
+// recorded in the frame stats (e.g. formatting bars in the stream).
+int vp10_active_v_edge(VP10_COMP *cpi, int mi_col, int mi_step) {
+  int first_col = 0;
+  int last_col = cpi->common.mi_cols;
+
+  if (cpi->oxcf.pass == 2) {
+    // The stats count inactive columns in MBs; doubling converts to mi
+    // units. The same inset is applied on both the left and the right.
+    const int inset =
+        (int)(cpi->twopass.this_frame_stats.inactive_zone_cols * 2);
+
+    first_col = inset;
+    // Never let the right boundary cross to the left of the left one.
+    last_col = VPXMAX(first_col, last_col - inset);
+  }
+
+  // Either boundary falling inside the span makes this an edge block.
+  if ((first_col >= mi_col) && (first_col < (mi_col + mi_step)))
+    return 1;
+  if ((last_col >= mi_col) && (last_col < (mi_col + mi_step)))
+    return 1;
+  return 0;
+}
+
+// Nonzero when the superblock at (mi_row, mi_col) touches a horizontal or
+// vertical boundary of the active image. That is the real frame edge
+// unless formatting bars are embedded in the stream.
+int vp10_active_edge_sb(VP10_COMP *cpi, int mi_row, int mi_col) {
+  if (vp10_active_h_edge(cpi, mi_row, MI_BLOCK_SIZE))
+    return 1;
+  return vp10_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE) != 0;
+}
+// Picks the lowest-RD mode from vp10_mode_order; results go to rd_cost and ctx.
+void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
+                                TileDataEnc *tile_data,
+                                MACROBLOCK *x,
+                                int mi_row, int mi_col,
+                                RD_COST *rd_cost, BLOCK_SIZE bsize,
+                                PICK_MODE_CONTEXT *ctx,
+                                int64_t best_rd_so_far) {
+  VP10_COMMON *const cm = &cpi->common;
+  RD_OPT *const rd_opt = &cpi->rd;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const struct segmentation *const seg = &cm->seg;
+  PREDICTION_MODE this_mode;
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+  unsigned char segment_id = mbmi->segment_id;
+  int comp_pred, i, k;
+  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
+  INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
+  int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+  int64_t best_rd = best_rd_so_far;  // lowest RD cost accepted so far
+  int64_t best_pred_diff[REFERENCE_MODES];
+  int64_t best_pred_rd[REFERENCE_MODES];
+  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  MB_MODE_INFO best_mbmode;
+  int best_mode_skippable = 0;
+  int midx, best_mode_index = -1;  // -1 until some mode is accepted
+  unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+  vpx_prob comp_mode_p;
+  int64_t best_intra_rd = INT64_MAX;
+  unsigned int best_pred_sse = UINT_MAX;
+  PREDICTION_MODE best_intra_mode = DC_PRED;
+  int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
+  int64_t dist_uv[TX_SIZES];
+  int skip_uv[TX_SIZES];
+  PREDICTION_MODE mode_uv[TX_SIZES];
+  const int intra_cost_penalty = vp10_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  int best_skip2 = 0;
+  uint8_t ref_frame_skip_mask[2] = { 0 };
+  uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
+  int mode_skip_start = sf->mode_skip_start + 1;
+  const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
+  const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+  int64_t mode_threshold[MAX_MODES];
+  int *mode_map = tile_data->mode_map[bsize];
+  const int mode_search_skip_flags = sf->mode_search_skip_flags;
+  int64_t mask_filter = 0;
+  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+
+  vp10_zero(best_mbmode);
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    filter_cache[i] = INT64_MAX;
+
+  estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+                           &comp_mode_p);
+
+  for (i = 0; i < REFERENCE_MODES; ++i)
+    best_pred_rd[i] = INT64_MAX;
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+    best_filter_rd[i] = INT64_MAX;
+  for (i = 0; i < TX_SIZES; i++)
+    rate_uv_intra[i] = INT_MAX;
+  for (i = 0; i < MAX_REF_FRAMES; ++i)
+    x->pred_sse[i] = INT_MAX;
+  for (i = 0; i < MB_MODE_COUNT; ++i) {
+    for (k = 0; k < MAX_REF_FRAMES; ++k) {
+      single_inter_filter[i][k] = SWITCHABLE;
+      single_skippable[i][k] = 0;
+    }
+  }
+
+  rd_cost->rate = INT_MAX;  // replaced once a mode is accepted
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    x->pred_mv_sad[ref_frame] = INT_MAX;
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
+      setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+                         frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+    }
+    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+    frame_mv[ZEROMV][ref_frame].as_int = 0;
+  }
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
+      // Skip checking missing references in both single and compound reference
+      // modes. Note that a mode will be skipped iff both reference frames
+      // are masked out.
+      ref_frame_skip_mask[0] |= (1 << ref_frame);
+      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+    } else {
+      for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+        // Skip fixed mv modes for poor references
+        if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
+          mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+          break;
+        }
+      }
+    }
+    // If the segment reference frame feature is enabled, mask out every
+    // reference frame other than the one the segment specifies.
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+      ref_frame_skip_mask[0] |= (1 << ref_frame);
+      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+    }
+  }
+
+  // Disable this drop out case if the ref frame
+  // segment level feature is enabled for this segment. This is to
+  // prevent the possibility that we end up unable to pick any mode.
+  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+    // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+    // unless ARNR filtering is enabled in which case we want
+    // an unfiltered alternative. We allow near/nearest as well
+    // because they may result in zero-zero MVs but be cheaper.
+    if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+      ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << GOLDEN_FRAME);
+      ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+      mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
+      if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
+      if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
+    }
+  }
+
+  if (cpi->rc.is_src_frame_alt_ref) {
+    if (sf->alt_ref_search_fp) {
+      mode_skip_mask[ALTREF_FRAME] = 0;
+      ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
+      ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+    }
+  }
+
+  if (sf->alt_ref_search_fp)
+    if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
+      if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
+        mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
+
+  if (sf->adaptive_mode_search) {
+    if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
+        cpi->rc.frames_since_golden >= 3)
+      if (x->pred_mv_sad[GOLDEN_FRAME] > (x->pred_mv_sad[LAST_FRAME] << 1))
+        mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
+  }
+
+  if (bsize > sf->max_intra_bsize) {
+    ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
+    ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
+  }
+
+  mode_skip_mask[INTRA_FRAME] |=
+      ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+
+  for (i = 0; i <= LAST_NEW_MV_INDEX; ++i)
+    mode_threshold[i] = 0;
+  for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
+    mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
+
+  midx =  sf->schedule_mode_search ? mode_skip_start : 0;  // 0: no sorting
+  while (midx > 4) {  // sort mode_map[4..midx) by ascending mode_threshold
+    uint8_t end_pos = 0;
+    for (i = 5; i < midx; ++i) {
+      if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) {
+        uint8_t tmp = mode_map[i];
+        mode_map[i] = mode_map[i - 1];
+        mode_map[i - 1] = tmp;
+        end_pos = i;
+      }
+    }
+    midx = end_pos;
+  }
+
+  for (midx = 0; midx < MAX_MODES; ++midx) {
+    int mode_index = mode_map[midx];
+    int mode_excluded = 0;
+    int64_t this_rd = INT64_MAX;
+    int disable_skip = 0;
+    int compmode_cost = 0;
+    int rate2 = 0, rate_y = 0, rate_uv = 0;
+    int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+    int skippable = 0;
+    int this_skip2 = 0;
+    int64_t total_sse = INT64_MAX;
+    int early_term = 0;
+
+    this_mode = vp10_mode_order[mode_index].mode;
+    ref_frame = vp10_mode_order[mode_index].ref_frame[0];
+    second_ref_frame = vp10_mode_order[mode_index].ref_frame[1];
+
+    // Look at the reference frame of the best mode so far and set the
+    // skip mask to look at a subset of the remaining modes.
+    if (midx == mode_skip_start && best_mode_index >= 0) {
+      switch (best_mbmode.ref_frame[0]) {
+        case INTRA_FRAME:
+          break;
+        case LAST_FRAME:
+          ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+          break;
+        case GOLDEN_FRAME:
+          ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+          break;
+        case ALTREF_FRAME:
+          ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK;
+          break;
+        case NONE:
+        case MAX_REF_FRAMES:
+          assert(0 && "Invalid Reference frame");
+          break;
+      }
+    }
+
+    if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+        (ref_frame_skip_mask[1] & (1 << VPXMAX(0, second_ref_frame))))
+      continue;  // both reference frames are masked out
+
+    if (mode_skip_mask[ref_frame] & (1 << this_mode))
+      continue;  // this prediction mode is masked for ref_frame
+
+    // Test best rd so far against threshold for trying this mode.
+    if (best_mode_skippable && sf->schedule_mode_search)
+      mode_threshold[mode_index] <<= 1;
+
+    if (best_rd < mode_threshold[mode_index])
+      continue;
+
+    comp_pred = second_ref_frame > INTRA_FRAME;  // has a second inter reference
+    if (comp_pred) {
+      if (!cpi->allow_comp_inter_inter)
+        continue;
+
+      // Skip compound inter modes if the second reference is not available.
+      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
+        continue;
+
+      // Do not allow compound prediction if the segment level reference frame
+      // feature is in use as in this case there can only be one reference.
+      if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+        continue;
+
+      if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+          best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME)
+        continue;
+
+      mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+    } else {
+      if (ref_frame != INTRA_FRAME)
+        mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
+    }
+
+    if (ref_frame == INTRA_FRAME) {
+      if (sf->adaptive_mode_search)
+        if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
+          continue;
+
+      if (this_mode != DC_PRED) {
+        // Disable intra modes other than DC_PRED for blocks with low variance
+        // Threshold for intra skipping based on source variance
+        // TODO(debargha): Specialize the threshold for super block sizes
+        const unsigned int skip_intra_var_thresh = 64;
+        if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+            x->source_variance < skip_intra_var_thresh)
+          continue;
+        // Skip modes D45_PRED..TM_PRED when the best mode so far uses an
+        // inter reference frame.
+        if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+            (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
+          if (best_mode_index >= 0 &&
+              best_mbmode.ref_frame[0] > INTRA_FRAME)
+            continue;
+        }
+        if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+          if (conditional_skipintra(this_mode, best_intra_mode))
+              continue;
+        }
+      }
+    } else {
+      const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
+      if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
+                              this_mode, ref_frames))
+        continue;
+    }
+
+    mbmi->mode = this_mode;
+    mbmi->uv_mode = DC_PRED;
+    mbmi->ref_frame[0] = ref_frame;
+    mbmi->ref_frame[1] = second_ref_frame;
+    // Evaluate all sub-pel filters irrespective of whether we can use
+    // them for this frame.
+    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                          : cm->interp_filter;
+    mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+    // Select prediction reference frames.
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred)
+        xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+    }
+
+    if (ref_frame == INTRA_FRAME) {
+      TX_SIZE uv_tx;
+      struct macroblockd_plane *const pd = &xd->plane[1];
+      memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                      NULL, bsize, best_rd);
+      if (rate_y == INT_MAX)
+        continue;
+
+      uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
+                                  pd->subsampling_y);
+      if (rate_uv_intra[uv_tx] == INT_MAX) {
+        choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
+                             &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
+                             &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
+      }
+
+      rate_uv = rate_uv_tokenonly[uv_tx];
+      distortion_uv = dist_uv[uv_tx];
+      skippable = skippable && skip_uv[uv_tx];
+      mbmi->uv_mode = mode_uv[uv_tx];
+
+      rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
+      if (this_mode != DC_PRED && this_mode != TM_PRED)
+        rate2 += intra_cost_penalty;
+      distortion2 = distortion_y + distortion_uv;
+    } else {
+      this_rd = handle_inter_mode(cpi, x, bsize,
+                                  &rate2, &distortion2, &skippable,
+                                  &rate_y, &rate_uv,
+                                  &disable_skip, frame_mv,
+                                  mi_row, mi_col,
+                                  single_newmv, single_inter_filter,
+                                  single_skippable, &total_sse, best_rd,
+                                  &mask_filter, filter_cache);
+      if (this_rd == INT64_MAX)
+        continue;
+
+      compmode_cost = vp10_cost_bit(comp_mode_p, comp_pred);
+
+      if (cm->reference_mode == REFERENCE_MODE_SELECT)
+        rate2 += compmode_cost;
+    }
+
+    // Estimate the reference frame signaling cost and add it
+    // to the rolling cost variable.
+    if (comp_pred) {
+      rate2 += ref_costs_comp[ref_frame];
+    } else {
+      rate2 += ref_costs_single[ref_frame];
+    }
+
+    if (!disable_skip) {
+      if (skippable) {
+        // Back out the coefficient coding costs
+        rate2 -= (rate_y + rate_uv);
+
+        // Cost the skip mb case
+        rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+      } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) {
+        if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+            RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+          // Add in the cost of the no skip flag.
+          rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+        } else {
+          // FIXME(rbultje) make this work for splitmv also
+          rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+          distortion2 = total_sse;
+          assert(total_sse >= 0);
+          rate2 -= (rate_y + rate_uv);
+          this_skip2 = 1;
+        }
+      } else {
+        // Add in the cost of the no skip flag.
+        rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+      }
+
+      // Calculate the final RD estimate for this mode.
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+    }
+
+    // Apply an adjustment to the rd value based on the similarity of the
+    // source variance and reconstructed variance.
+    rd_variance_adjustment(cpi, x, bsize, &this_rd,
+                           ref_frame, x->source_variance);
+
+    if (ref_frame == INTRA_FRAME) {
+      // Keep record of best intra rd
+      if (this_rd < best_intra_rd) {
+        best_intra_rd = this_rd;
+        best_intra_mode = mbmi->mode;
+      }
+    }
+
+    if (!disable_skip && ref_frame == INTRA_FRAME) {
+      for (i = 0; i < REFERENCE_MODES; ++i)
+        best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+        best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
+    }
+
+    // Did this mode help, i.e. is it the new best mode?
+    if (this_rd < best_rd || x->skip) {
+      int max_plane = MAX_MB_PLANE;
+      if (!mode_excluded) {
+        // Note index of best mode so far
+        best_mode_index = mode_index;
+
+        if (ref_frame == INTRA_FRAME) {
+          /* required for left and above block mv */
+          mbmi->mv[0].as_int = 0;
+          max_plane = 1;
+        } else {
+          best_pred_sse = x->pred_sse[ref_frame];
+        }
+
+        rd_cost->rate = rate2;
+        rd_cost->dist = distortion2;
+        rd_cost->rdcost = this_rd;
+        best_rd = this_rd;
+        best_mbmode = *mbmi;
+        best_skip2 = this_skip2;
+        best_mode_skippable = skippable;
+
+        if (!x->select_tx_size)
+          swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
+        memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+               sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+
+        // TODO(debargha): enhance this test with a better distortion prediction
+        // based on qp, activity mask and history
+        if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+            (mode_index > MIN_EARLY_TERM_INDEX)) {
+          int qstep = xd->plane[0].dequant[1];
+          // TODO(debargha): Enhance this by specializing for each mode_index
+          int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            qstep >>= (xd->bd - 8);
+          }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+          if (x->source_variance < UINT_MAX) {
+            const int var_adjust = (x->source_variance < 16);
+            scale -= var_adjust;
+          }
+          if (ref_frame > INTRA_FRAME &&
+              distortion2 * scale < qstep * qstep) {
+            early_term = 1;
+          }
+        }
+      }
+    }
+
+    /* keep record of best compound/single-only prediction */
+    if (!disable_skip && ref_frame != INTRA_FRAME) {
+      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+      if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+        single_rate = rate2 - compmode_cost;
+        hybrid_rate = rate2;
+      } else {
+        single_rate = rate2;
+        hybrid_rate = rate2 + compmode_cost;
+      }
+
+      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+      if (!comp_pred) {
+        if (single_rd < best_pred_rd[SINGLE_REFERENCE])
+          best_pred_rd[SINGLE_REFERENCE] = single_rd;
+      } else {
+        if (single_rd < best_pred_rd[COMPOUND_REFERENCE])
+          best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+      }
+      if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+        best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+
+      /* keep record of best filter type */
+      if (!mode_excluded && cm->interp_filter != BILINEAR) {
+        int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
+                              SWITCHABLE_FILTERS : cm->interp_filter];
+
+        for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+          int64_t adj_rd;
+          if (ref == INT64_MAX)
+            adj_rd = 0;
+          else if (filter_cache[i] == INT64_MAX)
+            // when early termination is triggered, the encoder does not have
+            // access to the rate-distortion cost. it only knows that the cost
+            // should be above the maximum valid value. hence it takes the known
+            // maximum plus an arbitrary constant as the rate-distortion cost.
+            adj_rd = mask_filter - ref + 10;
+          else
+            adj_rd = filter_cache[i] - ref;
+
+          adj_rd += this_rd;
+          best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
+        }
+      }
+    }
+
+    if (early_term)
+      break;
+
+    if (x->skip && !comp_pred)
+      break;
+  }
+
+  // The inter modes' rate costs are not calculated precisely in some cases.
+  // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
+  // ZEROMV. Here, checks are added for those cases, and the mode decisions
+  // are corrected.
+  if (best_mbmode.mode == NEWMV) {
+    const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0],
+        best_mbmode.ref_frame[1]};
+    int comp_pred_mode = refs[1] > INTRA_FRAME;
+
+    if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+        ((comp_pred_mode && frame_mv[NEARESTMV][refs[1]].as_int ==
+            best_mbmode.mv[1].as_int) || !comp_pred_mode))
+      best_mbmode.mode = NEARESTMV;
+    else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+        ((comp_pred_mode && frame_mv[NEARMV][refs[1]].as_int ==
+            best_mbmode.mv[1].as_int) || !comp_pred_mode))
+      best_mbmode.mode = NEARMV;
+    else if (best_mbmode.mv[0].as_int == 0 &&
+        ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) || !comp_pred_mode))
+      best_mbmode.mode = ZEROMV;
+  }
+
+  if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;  // no mode improved on best_rd_so_far
+  }
+
+  // If we used an estimate for the uv intra rd in the loop above...
+  if (sf->use_uv_intra_rd_estimate) {
+    // Do Intra UV best rd mode selection if best mode choice above was intra.
+    if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
+      TX_SIZE uv_tx_size;
+      *mbmi = best_mbmode;
+      uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
+      rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
+                              &rate_uv_tokenonly[uv_tx_size],
+                              &dist_uv[uv_tx_size],
+                              &skip_uv[uv_tx_size],
+                              bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize,
+                              uv_tx_size);
+    }
+  }
+
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter == best_mbmode.interp_filter) ||
+         !is_inter_block(&best_mbmode));
+
+  if (!cpi->rc.is_src_frame_alt_ref)
+    vp10_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                              sf->adaptive_rd_thresh, bsize, best_mode_index);
+
+  // Commit the best mode found to the macroblock mode info.
+  *mbmi = best_mbmode;
+  x->skip |= best_skip2;
+
+  for (i = 0; i < REFERENCE_MODES; ++i) {
+    if (best_pred_rd[i] == INT64_MAX)
+      best_pred_diff[i] = INT_MIN;
+    else
+      best_pred_diff[i] = best_rd - best_pred_rd[i];
+  }
+
+  if (!x->skip) {
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+      if (best_filter_rd[i] == INT64_MAX)
+        best_filter_diff[i] = 0;
+      else
+        best_filter_diff[i] = best_rd - best_filter_rd[i];
+    }
+    if (cm->interp_filter == SWITCHABLE)
+      assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
+  } else {
+    vp10_zero(best_filter_diff);
+  }
+
+  // TODO(yunqingwang): Moving this line in front of the above best_filter_diff
+  // updating code causes PSNR loss. Need to figure out the conflict.
+  x->skip |= best_mode_skippable;
+
+  if (!x->skip && !x->select_tx_size) {
+    int has_high_freq_coeff = 0;
+    int plane;
+    int max_plane = is_inter_block(&xd->mi[0]->mbmi)
+                        ? MAX_MB_PLANE : 1;
+    for (plane = 0; plane < max_plane; ++plane) {
+      x->plane[plane].eobs = ctx->eobs_pbuf[plane][1];
+      has_high_freq_coeff |= vp10_has_high_freq_in_plane(x, bsize, plane);
+    }
+
+    for (plane = max_plane; plane < MAX_MB_PLANE; ++plane) {
+      x->plane[plane].eobs = ctx->eobs_pbuf[plane][2];
+      has_high_freq_coeff |= vp10_has_high_freq_in_plane(x, bsize, plane);
+    }
+
+    best_mode_skippable |= !has_high_freq_coeff;
+  }
+
+  assert(best_mode_index >= 0);
+
+  store_coding_context(x, ctx, best_mode_index, best_pred_diff,
+                       best_filter_diff, best_mode_skippable);
+}
+
+// Mode decision for a block whose segment has SEG_LVL_SKIP active. The block
+// is forced to a skipped ZEROMV/LAST_FRAME mode with zero distortion; rd_cost
+// gets rate INT_MAX / rdcost INT64_MAX unless it beats best_rd_so_far.
+void vp10_rd_pick_inter_mode_sb_seg_skip(VP10_COMP *cpi,
+                                        TileDataEnc *tile_data,
+                                        MACROBLOCK *x,
+                                        RD_COST *rd_cost,
+                                        BLOCK_SIZE bsize,
+                                        PICK_MODE_CONTEXT *ctx,
+                                        int64_t best_rd_so_far) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const unsigned char seg_id = mbmi->segment_id;
+  const int is_compound = 0;
+  const int64_t zero_dist = 0;
+  unsigned int single_ref_cost[MAX_REF_FRAMES], comp_ref_cost[MAX_REF_FRAMES];
+  int64_t pred_diff[REFERENCE_MODES];
+  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  vpx_prob comp_mode_p;
+  INTERP_FILTER chosen_filter = SWITCHABLE;
+  int64_t skip_rd;
+  int i, rate = 0;
+
+  estimate_ref_frame_costs(cm, xd, seg_id, single_ref_cost, comp_ref_cost,
+                           &comp_mode_p);
+
+  // Reset prediction stats; pred_mv_sad is only reset from LAST_FRAME upward.
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    x->pred_sse[i] = INT_MAX;
+    if (i >= LAST_FRAME)
+      x->pred_mv_sad[i] = INT_MAX;
+  }
+
+  rd_cost->rate = INT_MAX;
+  assert(segfeature_active(&cm->seg, seg_id, SEG_LVL_SKIP));
+
+  mbmi->mode = ZEROMV;
+  mbmi->uv_mode = DC_PRED;
+  mbmi->ref_frame[0] = LAST_FRAME;
+  mbmi->ref_frame[1] = NONE;
+  mbmi->mv[0].as_int = 0;
+  x->skip = 1;
+
+  if (cm->interp_filter != BILINEAR) {
+    chosen_filter = EIGHTTAP;
+    if (cm->interp_filter == SWITCHABLE &&
+        x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
+      // Keep the filter that is cheapest to signal; the first wins on ties.
+      int filter_rate, lowest_rate = INT_MAX;
+      for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+        mbmi->interp_filter = i;
+        filter_rate = vp10_get_switchable_rate(cpi, xd);
+        if (filter_rate < lowest_rate) {
+          lowest_rate = filter_rate;
+          chosen_filter = mbmi->interp_filter;
+        }
+      }
+    }
+  }
+
+  // Commit the filter; only a switchable frame pays to signal it.
+  if (cm->interp_filter != SWITCHABLE) {
+    mbmi->interp_filter = cm->interp_filter;
+  } else {
+    mbmi->interp_filter = chosen_filter;
+    rate += vp10_get_switchable_rate(cpi, xd);
+  }
+
+  if (cm->reference_mode == REFERENCE_MODE_SELECT)
+    rate += vp10_cost_bit(comp_mode_p, is_compound);
+
+  // Add the reference frame signaling cost, then form the RD cost.
+  rate += single_ref_cost[LAST_FRAME];
+  skip_rd = RDCOST(x->rdmult, x->rddiv, rate, zero_dist);
+
+  rd_cost->dist = zero_dist;
+  if (skip_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
+  rd_cost->rate = rate;
+  rd_cost->rdcost = skip_rd;
+
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter == mbmi->interp_filter));
+
+  vp10_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                            cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
+
+  vp10_zero(pred_diff);
+  vp10_zero(filter_diff);
+  if (!x->select_tx_size)
+    swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
+  store_coding_context(x, ctx, THR_ZEROMV, pred_diff, filter_diff, 0);
+}
+
+void vp10_rd_pick_inter_mode_sub8x8(VP10_COMP *cpi,  // sub-8x8 RD mode search
+                                   TileDataEnc *tile_data,
+                                   MACROBLOCK *x,
+                                   int mi_row, int mi_col,
+                                   RD_COST *rd_cost,  // out: best rate/dist/rdcost
+                                   BLOCK_SIZE bsize,
+                                   PICK_MODE_CONTEXT *ctx,
+                                   int64_t best_rd_so_far) {  // RD cost to beat
+  VP10_COMMON *const cm = &cpi->common;
+  RD_OPT *const rd_opt = &cpi->rd;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const struct segmentation *const seg = &cm->seg;
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+  unsigned char segment_id = mbmi->segment_id;
+  int comp_pred, i;
+  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };  // indexed by ref_frame
+  int64_t best_rd = best_rd_so_far;
+  int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
+  int64_t best_pred_diff[REFERENCE_MODES];
+  int64_t best_pred_rd[REFERENCE_MODES];
+  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  MB_MODE_INFO best_mbmode;
+  int ref_index, best_ref_index = 0;
+  unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+  vpx_prob comp_mode_p;
+  INTERP_FILTER tmp_best_filter = SWITCHABLE;
+  int rate_uv_intra, rate_uv_tokenonly;
+  int64_t dist_uv;
+  int skip_uv;
+  PREDICTION_MODE mode_uv = DC_PRED;
+  const int intra_cost_penalty = vp10_get_intra_cost_penalty(
+    cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  int_mv seg_mvs[4][MAX_REF_FRAMES];
+  b_mode_info best_bmodes[4];
+  int best_skip2 = 0;
+  int ref_frame_skip_mask[2] = { 0 };  // [0]: first ref, [1]: second ref
+  int64_t mask_filter = 0;  // largest filter-search rd seen (never reset)
+  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+  int internal_active_edge =
+    vp10_active_edge_sb(cpi, mi_row, mi_col) && vp10_internal_image_edge(cpi);
+
+  memset(x->zcoeff_blk[TX_4X4], 0, 4);
+  vp10_zero(best_mbmode);
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    filter_cache[i] = INT64_MAX;
+
+  for (i = 0; i < 4; i++) {
+    int j;
+    for (j = 0; j < MAX_REF_FRAMES; j++)
+      seg_mvs[i][j].as_int = INVALID_MV;
+  }
+
+  estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+                           &comp_mode_p);
+
+  for (i = 0; i < REFERENCE_MODES; ++i)
+    best_pred_rd[i] = INT64_MAX;
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+    best_filter_rd[i] = INT64_MAX;
+  rate_uv_intra = INT_MAX;
+
+  rd_cost->rate = INT_MAX;
+  // Prepare each enabled reference (setup_buffer_inter); mask out the others.
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+                         frame_mv[NEARESTMV], frame_mv[NEARMV],
+                         yv12_mb);
+    } else {
+      ref_frame_skip_mask[0] |= (1 << ref_frame);
+      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+    }
+    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+    frame_mv[ZEROMV][ref_frame].as_int = 0;
+  }
+  // Main search: one iteration per vp10_ref_order[] entry.
+  for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
+    int mode_excluded = 0;
+    int64_t this_rd = INT64_MAX;
+    int disable_skip = 0;
+    int compmode_cost = 0;
+    int rate2 = 0, rate_y = 0, rate_uv = 0;
+    int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+    int skippable = 0;
+    int i;  // shadows the function-scope i
+    int this_skip2 = 0;
+    int64_t total_sse = INT_MAX;
+    int early_term = 0;
+
+    ref_frame = vp10_ref_order[ref_index].ref_frame[0];
+    second_ref_frame = vp10_ref_order[ref_index].ref_frame[1];
+
+    // Once the first three entries are done (ref_index == 3), prune the
+    // remaining candidates based on the best mode's reference frame so far.
+    if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) {
+      if (ref_index == 3) {
+        switch (best_mbmode.ref_frame[0]) {
+          case INTRA_FRAME:
+            break;
+          case LAST_FRAME:
+            ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+            break;
+          case GOLDEN_FRAME:
+            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+            break;
+          case ALTREF_FRAME:
+            ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME);
+            break;
+          case NONE:
+          case MAX_REF_FRAMES:
+            assert(0 && "Invalid Reference frame");
+            break;
+        }
+      }
+    }
+
+    if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+        (ref_frame_skip_mask[1] & (1 << VPXMAX(0, second_ref_frame))))
+      continue;
+
+    // Skip if rd_less_than_thresh() rejects it (not applied at active edges).
+    if (!internal_active_edge &&
+        rd_less_than_thresh(best_rd,
+                            rd_opt->threshes[segment_id][bsize][ref_index],
+                            tile_data->thresh_freq_fact[bsize][ref_index]))
+      continue;
+
+    comp_pred = second_ref_frame > INTRA_FRAME;
+    if (comp_pred) {
+      if (!cpi->allow_comp_inter_inter)
+        continue;
+      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
+        continue;
+      // Do not allow compound prediction if the segment level reference frame
+      // feature is in use as in this case there can only be one reference.
+      if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+        continue;
+
+      if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+          best_mbmode.ref_frame[0] == INTRA_FRAME)
+        continue;
+    }
+
+    // TODO(jingning, jkoleszar): scaling reference frame not supported for
+    // sub8x8 blocks.
+    if (ref_frame > INTRA_FRAME &&
+        vp10_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+      continue;
+
+    if (second_ref_frame > INTRA_FRAME &&
+        vp10_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
+      continue;
+
+    if (comp_pred)
+      mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+    else if (ref_frame != INTRA_FRAME)
+      mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
+
+    // If the segment-level reference frame feature is active, skip every
+    // entry whose first reference differs from the segment's reference.
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+      continue;
+    // The drop-out below is only applied when the segment-level reference
+    // frame feature is off for this segment, to prevent the possibility
+    // that we end up unable to pick any mode.
+    } else if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+      // Skip this entry when cpi->rc.is_src_frame_alt_ref is set and ARNR
+      // filtering is off (arnr_max_frames == 0). NOTE(review): the condition
+      // tests neither the mode nor ref_frame, so in that case every entry,
+      // intra included, is dropped by this branch.
+      if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+        continue;
+    }
+
+    mbmi->tx_size = TX_4X4;
+    mbmi->uv_mode = DC_PRED;
+    mbmi->ref_frame[0] = ref_frame;
+    mbmi->ref_frame[1] = second_ref_frame;
+    // Evaluate all sub-pel filters irrespective of whether we can use
+    // them for this frame.
+    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                          : cm->interp_filter;
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+    // Select prediction reference frames.
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred)
+        xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+    }
+
+    if (ref_frame == INTRA_FRAME) {
+      int rate;
+      if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
+                                       &distortion_y, best_rd) >= best_rd)
+        continue;
+      rate2 += rate;
+      rate2 += intra_cost_penalty;
+      distortion2 += distortion_y;
+
+      if (rate_uv_intra == INT_MAX) {  // chroma intra: computed on first use
+        choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4,
+                             &rate_uv_intra,
+                             &rate_uv_tokenonly,
+                             &dist_uv, &skip_uv,
+                             &mode_uv);
+      }
+      rate2 += rate_uv_intra;
+      rate_uv = rate_uv_tokenonly;
+      distortion2 += dist_uv;
+      distortion_uv = dist_uv;
+      mbmi->uv_mode = mode_uv;
+    } else {
+      int rate;
+      int64_t distortion;
+      int64_t this_rd_thresh;  // THR_LAST / THR_GOLD / THR_ALTR by ref_frame
+      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
+      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
+      int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
+      int tmp_best_skippable = 0;
+      int switchable_filter_index;
+      int_mv *second_ref = comp_pred ?
+                             &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL;
+      b_mode_info tmp_best_bmodes[16];
+      MB_MODE_INFO tmp_best_mbmode;
+      BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
+      int pred_exists = 0;  // set once a filter-search result has been saved
+      int uv_skippable;
+
+      this_rd_thresh = (ref_frame == LAST_FRAME) ?
+          rd_opt->threshes[segment_id][bsize][THR_LAST] :
+          rd_opt->threshes[segment_id][bsize][THR_ALTR];
+      this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
+      rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+        filter_cache[i] = INT64_MAX;
+
+      // TODO(any): Add search of the tx_type to improve rd performance at the
+      // expense of speed.
+      mbmi->tx_type = DCT_DCT;
+
+      if (cm->interp_filter != BILINEAR) {
+        tmp_best_filter = EIGHTTAP;
+        if (x->source_variance < sf->disable_filter_search_var_thresh) {
+          tmp_best_filter = EIGHTTAP;  // (already EIGHTTAP from above)
+        } else if (sf->adaptive_pred_interp_filter == 1 &&
+                   ctx->pred_interp_filter < SWITCHABLE) {
+          tmp_best_filter = ctx->pred_interp_filter;
+        } else if (sf->adaptive_pred_interp_filter == 2) {
+          tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
+                              ctx->pred_interp_filter : 0;
+        } else {  // no shortcut applies: try every switchable filter
+          for (switchable_filter_index = 0;
+               switchable_filter_index < SWITCHABLE_FILTERS;
+               ++switchable_filter_index) {
+            int newbest, rs;
+            int64_t rs_rd;
+            MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+            mbmi->interp_filter = switchable_filter_index;
+            tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
+                                              &mbmi_ext->ref_mvs[ref_frame][0],
+                                              second_ref, best_yrd, &rate,
+                                              &rate_y, &distortion,
+                                              &skippable, &total_sse,
+                                              (int) this_rd_thresh, seg_mvs,
+                                              bsi, switchable_filter_index,
+                                              mi_row, mi_col);
+
+            if (tmp_rd == INT64_MAX)
+              continue;
+            rs = vp10_get_switchable_rate(cpi, xd);
+            rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+            filter_cache[switchable_filter_index] = tmp_rd;
+            filter_cache[SWITCHABLE_FILTERS] =
+                VPXMIN(filter_cache[SWITCHABLE_FILTERS], tmp_rd + rs_rd);
+            if (cm->interp_filter == SWITCHABLE)
+              tmp_rd += rs_rd;
+
+            mask_filter = VPXMAX(mask_filter, tmp_rd);
+
+            newbest = (tmp_rd < tmp_best_rd);
+            if (newbest) {
+              tmp_best_filter = mbmi->interp_filter;
+              tmp_best_rd = tmp_rd;
+            }
+            if ((newbest && cm->interp_filter == SWITCHABLE) ||
+                (mbmi->interp_filter == cm->interp_filter &&
+                 cm->interp_filter != SWITCHABLE)) {
+              tmp_best_rdu = tmp_rd;
+              tmp_best_rate = rate;
+              tmp_best_ratey = rate_y;
+              tmp_best_distortion = distortion;
+              tmp_best_sse = total_sse;
+              tmp_best_skippable = skippable;
+              tmp_best_mbmode = *mbmi;
+              for (i = 0; i < 4; i++) {
+                tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
+                x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
+              }
+              pred_exists = 1;
+              if (switchable_filter_index == 0 &&
+                  sf->use_rd_breakout &&
+                  best_rd < INT64_MAX) {
+                if (tmp_best_rdu / 2 > best_rd) {
+                  // skip searching the other filters if the first is
+                  // already substantially larger than the best so far
+                  tmp_best_filter = mbmi->interp_filter;
+                  tmp_best_rdu = INT64_MAX;
+                  break;
+                }
+              }
+            }
+          }  // switchable_filter_index loop
+        }
+      }
+
+      if (tmp_best_rdu == INT64_MAX && pred_exists)  // rd breakout fired above
+        continue;
+
+      mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
+                             tmp_best_filter : cm->interp_filter);
+      if (!pred_exists) {
+        // No result was saved by the filter search above (e.g. frame-level
+        // BILINEAR, or a filter picked without searching): evaluate it now.
+        tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
+                                          &x->mbmi_ext->ref_mvs[ref_frame][0],
+                                          second_ref, best_yrd, &rate, &rate_y,
+                                          &distortion, &skippable, &total_sse,
+                                          (int) this_rd_thresh, seg_mvs, bsi, 0,
+                                          mi_row, mi_col);
+        if (tmp_rd == INT64_MAX)
+          continue;
+      } else {
+        total_sse = tmp_best_sse;
+        rate = tmp_best_rate;
+        rate_y = tmp_best_ratey;
+        distortion = tmp_best_distortion;
+        skippable = tmp_best_skippable;
+        *mbmi = tmp_best_mbmode;
+        for (i = 0; i < 4; i++)
+          xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
+      }
+
+      rate2 += rate;
+      distortion2 += distortion;
+
+      if (cm->interp_filter == SWITCHABLE)
+        rate2 += vp10_get_switchable_rate(cpi, xd);
+
+      if (!mode_excluded)
+        mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
+                                  : cm->reference_mode == COMPOUND_REFERENCE;
+
+      compmode_cost = vp10_cost_bit(comp_mode_p, comp_pred);
+      // tmp_best_rdu is reused: best_rd minus the smaller RD cost below.
+      tmp_best_rdu = best_rd -
+          VPXMIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
+                 RDCOST(x->rdmult, x->rddiv, 0, total_sse));
+
+      if (tmp_best_rdu > 0) {
+        // Only evaluate UV while the RD cost so far is still below best_rd;
+        // otherwise chroma is not searched for this candidate.
+        vp10_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
+                                        BLOCK_8X8);
+        memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+        if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+                              &uv_sse, BLOCK_8X8, tmp_best_rdu))
+          continue;
+
+        rate2 += rate_uv;
+        distortion2 += distortion_uv;
+        skippable = skippable && uv_skippable;
+        total_sse += uv_sse;
+      }
+    }
+
+    if (cm->reference_mode == REFERENCE_MODE_SELECT)
+      rate2 += compmode_cost;
+
+    // Estimate the reference frame signaling cost and add it
+    // to the rolling cost variable.
+    if (second_ref_frame > INTRA_FRAME) {
+      rate2 += ref_costs_comp[ref_frame];
+    } else {
+      rate2 += ref_costs_single[ref_frame];
+    }
+
+    if (!disable_skip) {  // disable_skip is never set here, so always taken
+      // Skip is never coded at the segment level for sub8x8 blocks and instead
+      // always coded in the bitstream at the mode info level.
+
+      if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) {
+        if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+            RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+          // Add in the cost of the no skip flag.
+          rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+        } else {
+          // FIXME(rbultje) make this work for splitmv also
+          rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+          distortion2 = total_sse;
+          assert(total_sse >= 0);
+          rate2 -= (rate_y + rate_uv);
+          rate_y = 0;
+          rate_uv = 0;
+          this_skip2 = 1;
+        }
+      } else {
+        // Add in the cost of the no skip flag.
+        rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+      }
+
+      // Calculate the final RD estimate for this mode.
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+    }
+
+    if (!disable_skip && ref_frame == INTRA_FRAME) {
+      for (i = 0; i < REFERENCE_MODES; ++i)
+        best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+        best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
+    }
+
+    // Is this the new best mode (lower rd, or x->skip is set)?
+    if (this_rd < best_rd || x->skip) {
+      if (!mode_excluded) {
+        int max_plane = MAX_MB_PLANE;
+        // Note index of best mode so far
+        best_ref_index = ref_index;
+
+        if (ref_frame == INTRA_FRAME) {
+          /* required for left and above block mv */
+          mbmi->mv[0].as_int = 0;
+          max_plane = 1;
+        }
+
+        rd_cost->rate = rate2;
+        rd_cost->dist = distortion2;
+        rd_cost->rdcost = this_rd;
+        best_rd = this_rd;
+        best_yrd = best_rd -
+                   RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
+        best_mbmode = *mbmi;
+        best_skip2 = this_skip2;
+        if (!x->select_tx_size)
+          swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
+        memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
+               sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+
+        for (i = 0; i < 4; i++)
+          best_bmodes[i] = xd->mi[0]->bmi[i];
+
+        // TODO(debargha): enhance this test with a better distortion prediction
+        // based on qp, activity mask and history
+        if ((sf->mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+            (ref_index > MIN_EARLY_TERM_INDEX)) {
+          int qstep = xd->plane[0].dequant[1];
+          // TODO(debargha): Enhance this by specializing for each mode_index
+          int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            qstep >>= (xd->bd - 8);
+          }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+          if (x->source_variance < UINT_MAX) {
+            const int var_adjust = (x->source_variance < 16);
+            scale -= var_adjust;
+          }
+          if (ref_frame > INTRA_FRAME &&
+              distortion2 * scale < qstep * qstep) {
+            early_term = 1;
+          }
+        }
+      }
+    }
+
+    /* keep record of best compound/single-only prediction */
+    if (!disable_skip && ref_frame != INTRA_FRAME) {
+      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+      if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+        single_rate = rate2 - compmode_cost;
+        hybrid_rate = rate2;
+      } else {
+        single_rate = rate2;
+        hybrid_rate = rate2 + compmode_cost;
+      }
+
+      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+      if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE])
+        best_pred_rd[SINGLE_REFERENCE] = single_rd;
+      else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE])
+        best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+
+      if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+        best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+    }
+
+    /* keep record of best filter type */
+    if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
+        cm->interp_filter != BILINEAR) {
+      int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
+                              SWITCHABLE_FILTERS : cm->interp_filter];
+      int64_t adj_rd;
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+        if (ref == INT64_MAX)
+          adj_rd = 0;
+        else if (filter_cache[i] == INT64_MAX)
+          // when early termination is triggered, the encoder does not have
+          // access to the rate-distortion cost. it only knows that the cost
+          // should be above the maximum valid value. hence it takes the known
+          // maximum plus an arbitrary constant as the rate-distortion cost.
+          adj_rd = mask_filter - ref + 10;
+        else
+          adj_rd = filter_cache[i] - ref;
+
+        adj_rd += this_rd;
+        best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
+      }
+    }
+
+    if (early_term)
+      break;
+
+    if (x->skip && !comp_pred)
+      break;
+  }
+  // Nothing beat best_rd_so_far: flag failure (rd_cost->dist is not reset).
+  if (best_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
+
+  // If we used an estimate for the uv intra rd in the loop above...
+  if (sf->use_uv_intra_rd_estimate) {
+    // Do Intra UV best rd mode selection if best mode choice above was intra.
+    if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
+      *mbmi = best_mbmode;
+      rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra,
+                              &rate_uv_tokenonly,
+                              &dist_uv,
+                              &skip_uv,
+                              BLOCK_8X8, TX_4X4);
+    }
+  }
+  // NOTE(review): unreachable; best_rd < best_rd_so_far after the check above.
+  if (best_rd == INT64_MAX) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->dist = INT64_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
+
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter == best_mbmode.interp_filter) ||
+         !is_inter_block(&best_mbmode));
+
+  vp10_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                            sf->adaptive_rd_thresh, bsize, best_ref_index);
+
+  // Copy the winning mode info back into the live mbmi.
+  *mbmi = best_mbmode;
+  x->skip |= best_skip2;
+  if (!is_inter_block(&best_mbmode)) {
+    for (i = 0; i < 4; i++)
+      xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
+  } else {
+    for (i = 0; i < 4; ++i)
+      memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
+    // The block-level MVs mirror those of the last sub-block (index 3).
+    mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
+    mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
+  }
+
+  for (i = 0; i < REFERENCE_MODES; ++i) {
+    if (best_pred_rd[i] == INT64_MAX)
+      best_pred_diff[i] = INT_MIN;
+    else
+      best_pred_diff[i] = best_rd - best_pred_rd[i];
+  }
+
+  if (!x->skip) {
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+      if (best_filter_rd[i] == INT64_MAX)
+        best_filter_diff[i] = 0;
+      else
+        best_filter_diff[i] = best_rd - best_filter_rd[i];
+    }
+    if (cm->interp_filter == SWITCHABLE)
+      assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
+  } else {
+    vp10_zero(best_filter_diff);
+  }
+
+  store_coding_context(x, ctx, best_ref_index,
+                       best_pred_diff, best_filter_diff, 0);
+}
diff --git a/libs/libvpx/vp10/encoder/rdopt.h b/libs/libvpx/vp10/encoder/rdopt.h
new file mode 100644
index 0000000000..b1a8036279
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/rdopt.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_RDOPT_H_
+#define VP10_ENCODER_RDOPT_H_
+
+#include "vp10/common/blockd.h"
+
+#include "vp10/encoder/block.h"
+#include "vp10/encoder/context_tree.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward declarations for the struct tags used by the prototypes below, so
+// this header does not depend on the order in which encoder headers are seen.
+struct RD_COST;
+struct TileDataEnc;
+struct TileInfo;
+struct VP10_COMP;
+struct macroblock;
+
+void vp10_rd_pick_intra_mode_sb(struct VP10_COMP *cpi, struct macroblock *x,
+                               struct RD_COST *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+
+// The compressor is taken by struct tag here too, like every other prototype.
+unsigned int vp10_get_sby_perpixel_variance(struct VP10_COMP *cpi,
+                                           const struct buf_2d *ref,
+                                           BLOCK_SIZE bs);
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp10_high_get_sby_perpixel_variance(struct VP10_COMP *cpi,
+                                                const struct buf_2d *ref,
+                                                BLOCK_SIZE bs, int bd);
+#endif
+
+void vp10_rd_pick_inter_mode_sb(
+    struct VP10_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x,
+    int mi_row, int mi_col, struct RD_COST *rd_cost, BLOCK_SIZE bsize,
+    PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+void vp10_rd_pick_inter_mode_sb_seg_skip(
+    struct VP10_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x,
+    struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+    int64_t best_rd_so_far);
+
+// Frame-edge queries. The sub-8x8 search skips its RD-threshold pruning when
+// vp10_active_edge_sb() && vp10_internal_image_edge() is true.
+int vp10_internal_image_edge(struct VP10_COMP *cpi);
+int vp10_active_h_edge(struct VP10_COMP *cpi, int mi_row, int mi_step);
+int vp10_active_v_edge(struct VP10_COMP *cpi, int mi_col, int mi_step);
+int vp10_active_edge_sb(struct VP10_COMP *cpi, int mi_row, int mi_col);
+
+// RD mode search for a sub-8x8 block, iterating over vp10_ref_order[]. On
+// return rd_cost->rate is INT_MAX (and rd_cost->rdcost INT64_MAX) when no
+// candidate improved on best_rd_so_far.
+void vp10_rd_pick_inter_mode_sub8x8(
+    struct VP10_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x,
+    int mi_row, int mi_col, struct RD_COST *rd_cost, BLOCK_SIZE bsize,
+    PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_RDOPT_H_
diff --git a/libs/libvpx/vp10/encoder/resize.c b/libs/libvpx/vp10/encoder/resize.c
new file mode 100644
index 0000000000..5572c17ad7
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/resize.c
@@ -0,0 +1,928 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_ports/mem.h"
+#include "vp10/common/common.h"
+#include "vp10/encoder/resize.h"
+
+#define FILTER_BITS 7  // the kernel rows below each sum to 1 << FILTER_BITS
+
+#define INTERP_TAPS 8  // coefficients per interp_kernel
+#define SUBPEL_BITS 5  // each filter table holds 1 << SUBPEL_BITS kernels
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define INTERP_PRECISION_BITS 32
+// One kernel: INTERP_TAPS signed 16-bit coefficients.
+typedef int16_t interp_kernel[INTERP_TAPS];
+
+// 0.5-band interpolation kernels (integer pels filtered too); rows sum to 128.
+static const interp_kernel filteredinterp_filters500[1 << SUBPEL_BITS] = {
+  { -3, 0, 35, 64, 35, 0, -3, 0 },
+  { -3, -1, 34, 64, 36, 1, -3, 0 },
+  { -3, -1, 32, 64, 38, 1, -3, 0 },
+  { -2, -2, 31, 63, 39, 2, -3, 0 },
+  { -2, -2, 29, 63, 41, 2, -3, 0 },
+  { -2, -2, 28, 63, 42, 3, -4, 0 },
+  { -2, -3, 27, 63, 43, 4, -4, 0 },
+  { -2, -3, 25, 62, 45, 5, -4, 0 },
+  { -2, -3, 24, 62, 46, 5, -4, 0 },
+  { -2, -3, 23, 61, 47, 6, -4, 0 },
+  { -2, -3, 21, 60, 49, 7, -4, 0 },
+  { -1, -4, 20, 60, 50, 8, -4, -1 },
+  { -1, -4, 19, 59, 51, 9, -4, -1 },
+  { -1, -4, 17, 58, 52, 10, -4, 0 },
+  { -1, -4, 16, 57, 53, 12, -4, -1 },
+  { -1, -4, 15, 56, 54, 13, -4, -1 },
+  { -1, -4, 14, 55, 55, 14, -4, -1 },
+  { -1, -4, 13, 54, 56, 15, -4, -1 },
+  { -1, -4, 12, 53, 57, 16, -4, -1 },
+  { 0, -4, 10, 52, 58, 17, -4, -1 },
+  { -1, -4, 9, 51, 59, 19, -4, -1 },
+  { -1, -4, 8, 50, 60, 20, -4, -1 },
+  { 0, -4, 7, 49, 60, 21, -3, -2 },
+  { 0, -4, 6, 47, 61, 23, -3, -2 },
+  { 0, -4, 5, 46, 62, 24, -3, -2 },
+  { 0, -4, 5, 45, 62, 25, -3, -2 },
+  { 0, -4, 4, 43, 63, 27, -3, -2 },
+  { 0, -4, 3, 42, 63, 28, -2, -2 },
+  { 0, -3, 2, 41, 63, 29, -2, -2 },
+  { 0, -3, 2, 39, 63, 31, -2, -2 },
+  { 0, -3, 1, 38, 64, 32, -1, -3 },
+  { 0, -3, 1, 36, 64, 34, -1, -3 }
+};
+
+// 0.625-band interpolation kernels (integer pels filtered too); rows sum to 128.
+static const interp_kernel filteredinterp_filters625[1 << SUBPEL_BITS] = {
+  { -1, -8, 33, 80, 33, -8, -1, 0 },
+  { -1, -8, 30, 80, 35, -8, -1, 1 },
+  { -1, -8, 28, 80, 37, -7, -2, 1 },
+  { 0, -8, 26, 79, 39, -7, -2, 1 },
+  { 0, -8, 24, 79, 41, -7, -2, 1 },
+  { 0, -8, 22, 78, 43, -6, -2, 1 },
+  { 0, -8, 20, 78, 45, -5, -3, 1 },
+  { 0, -8, 18, 77, 48, -5, -3, 1 },
+  { 0, -8, 16, 76, 50, -4, -3, 1 },
+  { 0, -8, 15, 75, 52, -3, -4, 1 },
+  { 0, -7, 13, 74, 54, -3, -4, 1 },
+  { 0, -7, 11, 73, 56, -2, -4, 1 },
+  { 0, -7, 10, 71, 58, -1, -4, 1 },
+  { 1, -7, 8, 70, 60, 0, -5, 1 },
+  { 1, -6, 6, 68, 62, 1, -5, 1 },
+  { 1, -6, 5, 67, 63, 2, -5, 1 },
+  { 1, -6, 4, 65, 65, 4, -6, 1 },
+  { 1, -5, 2, 63, 67, 5, -6, 1 },
+  { 1, -5, 1, 62, 68, 6, -6, 1 },
+  { 1, -5, 0, 60, 70, 8, -7, 1 },
+  { 1, -4, -1, 58, 71, 10, -7, 0 },
+  { 1, -4, -2, 56, 73, 11, -7, 0 },
+  { 1, -4, -3, 54, 74, 13, -7, 0 },
+  { 1, -4, -3, 52, 75, 15, -8, 0 },
+  { 1, -3, -4, 50, 76, 16, -8, 0 },
+  { 1, -3, -5, 48, 77, 18, -8, 0 },
+  { 1, -3, -5, 45, 78, 20, -8, 0 },
+  { 1, -2, -6, 43, 78, 22, -8, 0 },
+  { 1, -2, -7, 41, 79, 24, -8, 0 },
+  { 1, -2, -7, 39, 79, 26, -8, 0 },
+  { 1, -2, -7, 37, 80, 28, -8, -1 },
+  { 1, -1, -8, 35, 80, 30, -8, -1 },
+};
+
+// 0.75-band interpolation kernels (integer pels filtered too); rows sum to 128.
+static const interp_kernel filteredinterp_filters750[1 << SUBPEL_BITS] = {
+  { 2, -11, 25, 96, 25, -11, 2, 0 },
+  { 2, -11, 22, 96, 28, -11, 2, 0 },
+  { 2, -10, 19, 95, 31, -11, 2, 0 },
+  { 2, -10, 17, 95, 34, -12, 2, 0 },
+  { 2, -9, 14, 94, 37, -12, 2, 0 },
+  { 2, -8, 12, 93, 40, -12, 1, 0 },
+  { 2, -8, 9, 92, 43, -12, 1, 1 },
+  { 2, -7, 7, 91, 46, -12, 1, 0 },
+  { 2, -7, 5, 90, 49, -12, 1, 0 },
+  { 2, -6, 3, 88, 52, -12, 0, 1 },
+  { 2, -5, 1, 86, 55, -12, 0, 1 },
+  { 2, -5, -1, 84, 58, -11, 0, 1 },
+  { 2, -4, -2, 82, 61, -11, -1, 1 },
+  { 2, -4, -4, 80, 64, -10, -1, 1 },
+  { 1, -3, -5, 77, 67, -9, -1, 1 },
+  { 1, -3, -6, 75, 70, -8, -2, 1 },
+  { 1, -2, -7, 72, 72, -7, -2, 1 },
+  { 1, -2, -8, 70, 75, -6, -3, 1 },
+  { 1, -1, -9, 67, 77, -5, -3, 1 },
+  { 1, -1, -10, 64, 80, -4, -4, 2 },
+  { 1, -1, -11, 61, 82, -2, -4, 2 },
+  { 1, 0, -11, 58, 84, -1, -5, 2 },
+  { 1, 0, -12, 55, 86, 1, -5, 2 },
+  { 1, 0, -12, 52, 88, 3, -6, 2 },
+  { 0, 1, -12, 49, 90, 5, -7, 2 },
+  { 0, 1, -12, 46, 91, 7, -7, 2 },
+  { 1, 1, -12, 43, 92, 9, -8, 2 },
+  { 0, 1, -12, 40, 93, 12, -8, 2 },
+  { 0, 2, -12, 37, 94, 14, -9, 2 },
+  { 0, 2, -12, 34, 95, 17, -10, 2 },
+  { 0, 2, -11, 31, 95, 19, -10, 2 },
+  { 0, 2, -11, 28, 96, 22, -11, 2 }
+};
+
+// 0.875-band interpolation kernels (integer pels filtered too); rows sum to 128.
+static const interp_kernel filteredinterp_filters875[1 << SUBPEL_BITS] = {
+  { 3, -8, 13, 112, 13, -8, 3, 0 },
+  { 3, -7, 10, 112, 17, -9, 3, -1 },
+  { 2, -6, 7, 111, 21, -9, 3, -1 },
+  { 2, -5, 4, 111, 24, -10, 3, -1 },
+  { 2, -4, 1, 110, 28, -11, 3, -1 },
+  { 1, -3, -1, 108, 32, -12, 4, -1 },
+  { 1, -2, -3, 106, 36, -13, 4, -1 },
+  { 1, -1, -6, 105, 40, -14, 4, -1 },
+  { 1, -1, -7, 102, 44, -14, 4, -1 },
+  { 1, 0, -9, 100, 48, -15, 4, -1 },
+  { 1, 1, -11, 97, 53, -16, 4, -1 },
+  { 0, 1, -12, 95, 57, -16, 4, -1 },
+  { 0, 2, -13, 91, 61, -16, 4, -1 },
+  { 0, 2, -14, 88, 65, -16, 4, -1 },
+  { 0, 3, -15, 84, 69, -17, 4, 0 },
+  { 0, 3, -16, 81, 73, -16, 3, 0 },
+  { 0, 3, -16, 77, 77, -16, 3, 0 },
+  { 0, 3, -16, 73, 81, -16, 3, 0 },
+  { 0, 4, -17, 69, 84, -15, 3, 0 },
+  { -1, 4, -16, 65, 88, -14, 2, 0 },
+  { -1, 4, -16, 61, 91, -13, 2, 0 },
+  { -1, 4, -16, 57, 95, -12, 1, 0 },
+  { -1, 4, -16, 53, 97, -11, 1, 1 },
+  { -1, 4, -15, 48, 100, -9, 0, 1 },
+  { -1, 4, -14, 44, 102, -7, -1, 1 },
+  { -1, 4, -14, 40, 105, -6, -1, 1 },
+  { -1, 4, -13, 36, 106, -3, -2, 1 },
+  { -1, 4, -12, 32, 108, -1, -3, 1 },
+  { -1, 3, -11, 28, 110, 1, -4, 2 },
+  { -1, 3, -10, 24, 111, 4, -5, 2 },
+  { -1, 3, -9, 21, 111, 7, -6, 2 },
+  { -1, 3, -9, 17, 112, 10, -7, 3 }
+};
+
+// Interpolation kernels (full-band), one row per sub-pel phase as indexed by interpolate(); row 0 is a lone tap of 128, so integer pixels are not filtered.
+static const interp_kernel filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {0,   1,  -3, 128,   3,  -1,   0, 0},
+  {-1,   2,  -6, 127,   7,  -2,   1, 0},
+  {-1,   3,  -9, 126,  12,  -4,   1, 0},
+  {-1,   4, -12, 125,  16,  -5,   1, 0},
+  {-1,   4, -14, 123,  20,  -6,   2, 0},
+  {-1,   5, -15, 120,  25,  -8,   2, 0},
+  {-1,   5, -17, 118,  30,  -9,   3, -1},
+  {-1,   6, -18, 114,  35, -10,   3, -1},
+  {-1,   6, -19, 111,  41, -12,   3, -1},
+  {-1,   6, -20, 107,  46, -13,   4, -1},
+  {-1,   6, -21, 103,  52, -14,   4, -1},
+  {-1,   6, -21,  99,  57, -16,   5, -1},
+  {-1,   6, -21,  94,  63, -17,   5, -1},
+  {-1,   6, -20,  89,  68, -18,   5, -1},
+  {-1,   6, -20,  84,  73, -19,   6, -1},
+  {-1,   6, -20,  79,  79, -20,   6, -1},
+  {-1,   6, -19,  73,  84, -20,   6, -1},
+  {-1,   5, -18,  68,  89, -20,   6, -1},
+  {-1,   5, -17,  63,  94, -21,   6, -1},
+  {-1,   5, -16,  57,  99, -21,   6, -1},
+  {-1,   4, -14,  52, 103, -21,   6, -1},
+  {-1,   4, -13,  46, 107, -20,   6, -1},
+  {-1,   3, -12,  41, 111, -19,   6, -1},
+  {-1,   3, -10,  35, 114, -18,   6, -1},
+  {-1,   3,  -9,  30, 118, -17,   5, -1},
+  {0,   2,  -8,  25, 120, -15,   5, -1},
+  {0,   2,  -6,  20, 123, -14,   4, -1},
+  {0,   1,  -5,  16, 125, -12,   4, -1},
+  {0,   1,  -4,  12, 126,  -9,   3, -1},
+  {0,   1,  -2,   7, 127,  -6,   2, -1},
+  {0,   0,  -1,   3, 128,  -3,   1, 0}
+};
+
+// Half-kernels for factor-of-2 downsampling; the down2_* routines mirror each one around the output position.
+static const int16_t vp10_down2_symeven_half_filter[] = {56, 12, -3, -1};  // tap j weights input[i - j] and input[i + 1 + j]
+static const int16_t vp10_down2_symodd_half_filter[] = {64, 35, 0, -3};  // tap 0 is the centre; tap j weights input[i - j] and input[i + j]
+
+static const interp_kernel *choose_interp_filter(int inlength, int outlength) {
+  // Pick a kernel bank from the output/input length ratio, compared in
+  // sixteenths. Thresholds are tested from the largest ratio downwards and
+  // the first one met wins; anything below 9/16 gets the 500 bank.
+  // Returns a pointer to the first row of the chosen bank.
+  const int scaled_out = outlength * 16;
+
+  if (scaled_out >= inlength * 16) return filteredinterp_filters1000;
+  if (scaled_out >= inlength * 13) return filteredinterp_filters875;
+  if (scaled_out >= inlength * 11) return filteredinterp_filters750;
+  if (scaled_out >= inlength * 9) return filteredinterp_filters625;
+  return filteredinterp_filters500;
+}
+// 1-D resampler: writes outlength samples to output, each a kernel-weighted sum of input samples; reads are clamped to [0, inlength - 1].
+static void interpolate(const uint8_t *const input, int inlength,
+                        uint8_t *output, int outlength) {
+  const int64_t delta = (((uint64_t)inlength << 32) + outlength / 2) /
+      outlength;  // input step per output sample, rounded, scaled by 2^32
+  const int64_t offset = inlength > outlength ?
+      (((int64_t)(inlength - outlength) << 31) + outlength / 2) / outlength :
+      -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / outlength;
+  uint8_t *optr = output;
+  int x, x1, x2, sum, k, int_pel, sub_pel;
+  int64_t y;
+  // offset is the position of output 0: (inlength - outlength) / (2 * outlength), in the same 2^32 scale.
+  const interp_kernel *interp_filters =
+      choose_interp_filter(inlength, outlength);
+  // Find [x1, x2], the run of outputs whose taps all land inside the input.
+  x = 0;
+  y = offset;
+  while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+    x++;
+    y += delta;
+  }
+  x1 = x;  // first output whose lowest tap index is >= 0
+  x = outlength - 1;
+  y = delta * x + offset;
+  while ((y >> INTERP_PRECISION_BITS) +
+         (int64_t)(INTERP_TAPS / 2) >= inlength) {
+    x--;
+    y -= delta;
+  }
+  x2 = x;  // last output whose highest tap index is < inlength
+  if (x1 > x2) {  // no clamp-free run: clamp both ends for every output
+    for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];  // kernel row for this fractional phase
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k) {
+        const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
+        sum += filter[k] * input[(pk < 0 ? 0 :
+                                  (pk >= inlength ? inlength - 1 : pk))];
+      }
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+  } else {
+    // Initial part: only the left edge can be overrun, so clamp at index 0.
+    for (x = 0, y = offset; x < x1; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ?
+                                  0 :
+                                  int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+    // Middle part: every tap is in range, so no clamping is done.
+    for (; x <= x2; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+    // End part: only the right edge can be overrun, so clamp at inlength - 1.
+    for (; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >=
+                                  inlength ?  inlength - 1 :
+                                  int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+  }
+}
+// 2:1 decimation, one output per even i: a mirrored filter centred between input[i] and input[i + 1], indices clamped at the ends.
+static void down2_symeven(const uint8_t *const input, int length,
+                          uint8_t *output) {
+  // Actual filter len = 2 * filter_len_half (the "/ 2" below is the size of one int16_t tap).
+  const int16_t *filter = vp10_down2_symeven_half_filter;
+  const int filter_len_half = sizeof(vp10_down2_symeven_half_filter) / 2;
+  int i, j;
+  uint8_t *optr = output;
+  int l1 = filter_len_half;  // below l1 the left taps can run past index 0
+  int l2 = (length - filter_len_half);  // from l2 on the right taps can run past the end
+  l1 += (l1 & 1);  // round both bounds up to even to match the i += 2 stride
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length: the edge regions overlap, clamp both sides.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));  // rounding term for the shift below
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  } else {
+    // Initial part: clamp only the left-hand index.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // Middle part: all taps in range, no clamping.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // End part: clamp only the right-hand index.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  }
+}
+// 2:1 decimation, one output per even i: a mirrored filter centred on input[i] (filter[0] is the centre tap), indices clamped at the ends.
+static void down2_symodd(const uint8_t *const input, int length,
+                         uint8_t *output) {
+  // Actual filter len = 2 * filter_len_half - 1 (the "/ 2" below is the size of one int16_t tap).
+  const int16_t *filter = vp10_down2_symodd_half_filter;
+  const int filter_len_half = sizeof(vp10_down2_symodd_half_filter) / 2;
+  int i, j;
+  uint8_t *optr = output;
+  int l1 = filter_len_half - 1;  // below l1 the left taps can run past index 0
+  int l2 = (length - filter_len_half + 1);  // from l2 on the right taps can run past the end
+  l1 += (l1 & 1);  // round both bounds up to even to match the i += 2 stride
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length: the edge regions overlap, clamp both sides.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];  // rounding term plus centre tap
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  } else {
+    // Initial part: clamp only the left-hand index.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // Middle part: all taps in range, no clamping.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // End part: clamp only the right-hand index.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  }
+}
+// Returns the length left after `steps` 2:1 passes, each pass rounding up.
+static int get_down2_length(int length, int steps) {
+  int s;
+  for (s = 0; s < steps; ++s) length = (length + 1) >> 1;
+  return length;
+}
+
+// Counts how many 2:1 passes can run while the result stays >= out_length.
+static int get_down2_steps(int in_length, int out_length) {
+  int steps = 0;
+  while (get_down2_length(in_length, 1) >= out_length) {
+    ++steps;
+    in_length = get_down2_length(in_length, 1);
+    if (in_length <= 1) break;  // 1 halves to 1 again: would never terminate
+  }
+  return steps;
+}
+// Resizes `length` samples to `olength`: halves the line while the result
+// stays >= olength, then interpolate()s any remaining fractional ratio.
+// `buf` is scratch space for >= `length` samples; NULL allocates a temporary.
+static void resize_multistep(const uint8_t *const input,
+                             int length,
+                             uint8_t *output,
+                             int olength,
+                             uint8_t *buf) {
+  int steps;
+  if (length == olength) {
+    memcpy(output, input, sizeof(output[0]) * length);
+    return;
+  }
+  steps = get_down2_steps(length, olength);
+
+  if (steps > 0) {
+    int s;
+    uint8_t *out = NULL;
+    uint8_t *tmpbuf = buf ? NULL : (uint8_t *)malloc(sizeof(uint8_t) * length);
+    uint8_t *const otmp = buf ? buf : tmpbuf;  // prefer the caller's scratch
+    uint8_t *otmp2;
+    int filteredlength = length;
+    if (otmp == NULL) {  // out of memory: fall back to one direct resample
+      interpolate(input, length, output, olength);
+      return;
+    }
+    otmp2 = otmp + get_down2_length(length, 1);  // second ping-pong region
+    for (s = 0; s < steps; ++s) {
+      const int proj_filteredlength = get_down2_length(filteredlength, 1);
+      const uint8_t *const in = (s == 0 ? input : out);
+      if (s == steps - 1 && proj_filteredlength == olength)
+        out = output;
+      else
+        out = (s & 1 ? otmp2 : otmp);
+      if (filteredlength & 1)
+        down2_symodd(in, filteredlength, out);
+      else
+        down2_symeven(in, filteredlength, out);
+      filteredlength = proj_filteredlength;
+    }
+    if (filteredlength != olength) {
+      interpolate(out, filteredlength, output, olength);
+    }
+    free(tmpbuf);  // NULL (a no-op) when the caller's buffer was used
+  } else {
+    interpolate(input, length, output, olength);
+  }
+}
+
+static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) {
+  // Gather one image column, whose samples sit `stride` apart starting at
+  // img, into the first `len` entries of the contiguous arr[].
+  const uint8_t *src = img;
+  int n;
+  for (n = 0; n < len; ++n, src += stride)
+    arr[n] = *src;
+}
+
+static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
+  // Scatter the first `len` entries of the contiguous arr[] into one image
+  // column: entry n is written `n * stride` samples after img.
+  uint8_t *dst = img;
+  int n;
+  for (n = 0; n < len; ++n, dst += stride)
+    *dst = arr[n];
+}
+// Resizes a height x width plane into a height2 x width2 plane: every row is
+// resized into intbuf, then every column. Nothing is written if malloc fails.
+void vp10_resize_plane(const uint8_t *const input,
+                      int height,
+                      int width,
+                      int in_stride,
+                      uint8_t *output,
+                      int height2,
+                      int width2,
+                      int out_stride) {
+  int i;
+  uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height);
+  uint8_t *tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) *
+                                      (width < height ? height : width));
+  uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2));
+  const int ok = intbuf != NULL && tmpbuf != NULL && arrbuf != NULL;
+  assert(width > 0 && height > 0);
+  assert(width2 > 0 && height2 > 0);
+  for (i = 0; ok && i < height; ++i)  // `ok` gates both passes
+    resize_multistep(input + in_stride * i, width,
+                        intbuf + width2 * i, width2, tmpbuf);
+  for (i = 0; ok && i < width2; ++i) {
+    fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+    resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf);
+    fill_arr_to_col(output + i, out_stride, height2, arrbuf + height);
+  }
+  free(intbuf);
+  free(tmpbuf);
+  free(arrbuf);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_interpolate(const uint16_t *const input, int inlength,
+                               uint16_t *output, int outlength, int bd) {
+  const int64_t delta =
+      (((uint64_t)inlength << 32) + outlength / 2) / outlength;  // input step per output, scaled by 2^32
+  const int64_t offset = inlength > outlength ?
+      (((int64_t)(inlength - outlength) << 31) + outlength / 2) / outlength :
+      -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / outlength;
+  uint16_t *optr = output;
+  int x, x1, x2, sum, k, int_pel, sub_pel;
+  int64_t y;
+  // 16-bit twin of interpolate(): same stepping and clamping; `bd` is only forwarded to clip_pixel_highbd().
+  const interp_kernel *interp_filters =
+      choose_interp_filter(inlength, outlength);
+  // Find [x1, x2], the run of outputs whose taps all land inside the input.
+  x = 0;
+  y = offset;
+  while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+    x++;
+    y += delta;
+  }
+  x1 = x;  // first output whose lowest tap index is >= 0
+  x = outlength - 1;
+  y = delta * x + offset;
+  while ((y >> INTERP_PRECISION_BITS) +
+         (int64_t)(INTERP_TAPS / 2) >= inlength) {
+    x--;
+    y -= delta;
+  }
+  x2 = x;  // last output whose highest tap index is < inlength
+  if (x1 > x2) {  // no clamp-free run: clamp both ends for every output
+    for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];  // kernel row for this fractional phase
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k) {
+        const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
+        sum += filter[k] *
+            input[(pk < 0 ? 0 : (pk >= inlength ? inlength - 1 : pk))];
+      }
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+  } else {
+    // Initial part: only the left edge can be overrun, so clamp at index 0.
+    for (x = 0, y = offset; x < x1; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] *
+            input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ?
+                   0 : int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+    // Middle part: every tap is in range, so no clamping is done.
+    for (; x <= x2; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+    // End part: only the right edge can be overrun, so clamp at inlength - 1.
+    for (; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >=
+                                  inlength ?  inlength - 1 :
+                                  int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+  }
+}
+// 16-bit 2:1 decimation, one output per even i: mirrored filter centred between input[i] and input[i + 1]; `bd` is forwarded to clip_pixel_highbd().
+static void highbd_down2_symeven(const uint16_t *const input, int length,
+                                 uint16_t *output, int bd) {
+  // Actual filter len = 2 * filter_len_half. NOTE(review): `static` on this local pointer looks unnecessary; down2_symeven() uses a plain local.
+  static const int16_t *filter = vp10_down2_symeven_half_filter;
+  const int filter_len_half = sizeof(vp10_down2_symeven_half_filter) / 2;
+  int i, j;
+  uint16_t *optr = output;
+  int l1 = filter_len_half;  // below l1 the left taps can run past index 0
+  int l2 = (length - filter_len_half);  // from l2 on the right taps can run past the end
+  l1 += (l1 & 1);  // round both bounds up to even to match the i += 2 stride
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length: the edge regions overlap, clamp both sides.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));  // rounding term for the shift below
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  } else {
+    // Initial part: clamp only the left-hand index.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // Middle part: all taps in range, no clamping.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // End part: clamp only the right-hand index.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  }
+}
+// 16-bit 2:1 decimation, one output per even i: mirrored filter centred on input[i] (filter[0] is the centre tap); `bd` is forwarded to clip_pixel_highbd().
+static void highbd_down2_symodd(const uint16_t *const input, int length,
+                              uint16_t *output, int bd) {
+  // Actual filter len = 2 * filter_len_half - 1. NOTE(review): `static` on this local pointer looks unnecessary; down2_symodd() uses a plain local.
+  static const int16_t *filter = vp10_down2_symodd_half_filter;
+  const int filter_len_half = sizeof(vp10_down2_symodd_half_filter) / 2;
+  int i, j;
+  uint16_t *optr = output;
+  int l1 = filter_len_half - 1;  // below l1 the left taps can run past index 0
+  int l2 = (length - filter_len_half + 1);  // from l2 on the right taps can run past the end
+  l1 += (l1 & 1);  // round both bounds up to even to match the i += 2 stride
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length: the edge regions overlap, clamp both sides.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];  // rounding term plus centre tap
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  } else {
+    // Initial part: clamp only the left-hand index.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // Middle part: all taps in range, no clamping.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // End part: clamp only the right-hand index.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  }
+}
+// 16-bit resize of `length` samples to `olength`: halve while >= olength, then
+// interpolate. `buf` is scratch for >= `length` samples; NULL allocates one.
+static void highbd_resize_multistep(const uint16_t *const input,
+                                    int length,
+                                    uint16_t *output,
+                                    int olength,
+                                    uint16_t *buf,
+                                    int bd) {
+  int steps;
+  if (length == olength) {
+    memcpy(output, input, sizeof(output[0]) * length);
+    return;
+  }
+  steps = get_down2_steps(length, olength);
+
+  if (steps > 0) {
+    int s;
+    uint16_t *out = NULL;
+    uint16_t *tmpbuf =
+        buf ? NULL : (uint16_t *)malloc(sizeof(uint16_t) * length);
+    uint16_t *const otmp = buf ? buf : tmpbuf;  // prefer the caller's scratch
+    uint16_t *otmp2;
+    int filteredlength = length;
+    if (otmp == NULL) {  // out of memory: fall back to one direct resample
+      highbd_interpolate(input, length, output, olength, bd);
+      return;
+    }
+    otmp2 = otmp + get_down2_length(length, 1);  // second ping-pong region
+    for (s = 0; s < steps; ++s) {
+      const int proj_filteredlength = get_down2_length(filteredlength, 1);
+      const uint16_t *const in = (s == 0 ? input : out);
+      if (s == steps - 1 && proj_filteredlength == olength)
+        out = output;
+      else
+        out = (s & 1 ? otmp2 : otmp);
+      if (filteredlength & 1)
+        highbd_down2_symodd(in, filteredlength, out, bd);
+      else
+        highbd_down2_symeven(in, filteredlength, out, bd);
+      filteredlength = proj_filteredlength;
+    }
+    if (filteredlength != olength) {
+      highbd_interpolate(out, filteredlength, output, olength, bd);
+    }
+    free(tmpbuf);  // NULL (a no-op) when the caller's buffer was used
+  } else {
+    highbd_interpolate(input, length, output, olength, bd);
+  }
+}
+
+static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  // Gather one 16-bit image column, whose samples sit `stride` apart
+  // starting at img, into the first `len` entries of the contiguous arr[].
+  const uint16_t *src = img;
+  int n;
+  for (n = 0; n < len; ++n, src += stride)
+    arr[n] = *src;
+}
+
+static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  // Scatter the first `len` entries of the contiguous arr[] into one 16-bit
+  // image column: entry n is written `n * stride` samples after img.
+  uint16_t *dst = img;
+  int n;
+  for (n = 0; n < len; ++n, dst += stride)
+    *dst = arr[n];
+}
+// 16-bit plane resize, rows first and then columns; nothing is written if malloc fails.
+void vp10_highbd_resize_plane(const uint8_t *const input,
+                             int height,
+                             int width,
+                             int in_stride,
+                             uint8_t *output,
+                             int height2,
+                             int width2,
+                             int out_stride,
+                             int bd) {
+  int i;
+  uint16_t *intbuf = (uint16_t *)malloc(sizeof(uint16_t) * width2 * height);
+  uint16_t *tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) *
+                                        (width < height ? height : width));
+  uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * (height + height2));
+  const int ok = intbuf != NULL && tmpbuf != NULL && arrbuf != NULL;
+  for (i = 0; ok && i < height; ++i)  // `ok` gates both passes
+    highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width,
+                            intbuf + width2 * i, width2, tmpbuf, bd);
+  for (i = 0; ok && i < width2; ++i) {
+    highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+    highbd_resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf,
+                            bd);
+    highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2,
+                           arrbuf + height);
+  }
+  free(intbuf);
+  free(tmpbuf);
+  free(arrbuf);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp10_resize_frame420(const uint8_t *const y,
+                         int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride,
+                         int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth) {
+  // 4:2:0 - luma at full size, both chroma planes at half size each way.
+  const int ch = height / 2, cw = width / 2;
+  const int och = oheight / 2, ocw = owidth / 2;
+  vp10_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+  vp10_resize_plane(u, ch, cw, uv_stride, ou, och, ocw, ouv_stride);
+  vp10_resize_plane(v, ch, cw, uv_stride, ov, och, ocw, ouv_stride);
+}
+
+void vp10_resize_frame422(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride,
+                         int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth) {
+  // 4:2:2 - chroma keeps the full height but only half the width.
+  const int cw = width / 2;
+  const int ocw = owidth / 2;
+  vp10_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+  vp10_resize_plane(u, height, cw, uv_stride, ou, oheight, ocw, ouv_stride);
+  vp10_resize_plane(v, height, cw, uv_stride, ov, oheight, ocw, ouv_stride);
+}
+
+void vp10_resize_frame444(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride,
+                         int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth) {
+  // 4:4:4 - all three planes are resized with the same dimensions.
+  vp10_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+  vp10_resize_plane(u, height, width, uv_stride,
+                    ou, oheight, owidth, ouv_stride);
+  vp10_resize_plane(v, height, width, uv_stride,
+                    ov, oheight, owidth, ouv_stride);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_resize_frame420(const uint8_t *const y,
+                                int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride,
+                                int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  const int ch = height / 2, cw = width / 2;      // 4:2:0 source chroma size
+  const int och = oheight / 2, ocw = owidth / 2;  // 4:2:0 output chroma size
+  vp10_highbd_resize_plane(y, height, width, y_stride,
+                           oy, oheight, owidth, oy_stride, bd);
+  vp10_highbd_resize_plane(u, ch, cw, uv_stride, ou, och, ocw, ouv_stride, bd);
+  vp10_highbd_resize_plane(v, ch, cw, uv_stride, ov, och, ocw, ouv_stride, bd);
+}
+// High-bitdepth 4:2:2 frame resize: luma at full size, chroma at full height and half width; bd is passed through to each plane resize.
+void vp10_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride,
+                                int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  vp10_highbd_resize_plane(y, height, width, y_stride,
+                          oy, oheight, owidth, oy_stride, bd);
+  vp10_highbd_resize_plane(u, height, width / 2, uv_stride,
+                          ou, oheight, owidth / 2, ouv_stride, bd);
+  vp10_highbd_resize_plane(v, height, width / 2, uv_stride,
+                          ov, oheight, owidth / 2, ouv_stride, bd);
+}
+// High-bitdepth 4:4:4 frame resize: all three planes use the same dimensions; bd is passed through to each plane resize.
+void vp10_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride,
+                                int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  vp10_highbd_resize_plane(y, height, width, y_stride,
+                          oy, oheight, owidth, oy_stride, bd);
+  vp10_highbd_resize_plane(u, height, width, uv_stride,
+                          ou, oheight, owidth, ouv_stride, bd);
+  vp10_highbd_resize_plane(v, height, width, uv_stride,
+                          ov, oheight, owidth, ouv_stride, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vp10/encoder/resize.h b/libs/libvpx/vp10/encoder/resize.h
new file mode 100644
index 0000000000..bf6377097e
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/resize.h
@@ -0,0 +1,133 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_RESIZE_H_
+#define VP10_ENCODER_RESIZE_H_
+
+#include <stdio.h>
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_resize_plane(const uint8_t *const input,  // source plane
+                      int height,                   // source rows
+                      int width,                    // source columns
+                      int in_stride,                // offset between source rows
+                      uint8_t *output,              // destination plane
+                      int height2,                  // destination rows
+                      int width2,                   // destination columns
+                      int out_stride);              // offset between destination rows
+void vp10_resize_frame420(const uint8_t *const y,  // 4:2:0: chroma resized at half width and half height
+                         int y_stride,            // offset between source luma rows
+                         const uint8_t *const u,  // source chroma planes
+                         const uint8_t *const v,
+                         int uv_stride,           // row offset shared by u and v
+                         int height,              // source luma rows
+                         int width,               // source luma columns
+                         uint8_t *oy,             // destination luma plane
+                         int oy_stride,           // offset between destination luma rows
+                         uint8_t *ou,             // destination chroma planes
+                         uint8_t *ov,
+                         int ouv_stride,          // row offset shared by ou and ov
+                         int oheight,             // destination luma rows
+                         int owidth);             // destination luma columns
+void vp10_resize_frame422(const uint8_t *const y,  // 4:2:2: chroma resized at half width, full height
+                         int y_stride,
+                         const uint8_t *const u,
+                         const uint8_t *const v,
+                         int uv_stride,
+                         int height,              // source luma rows
+                         int width,               // source luma columns
+                         uint8_t *oy,
+                         int oy_stride,
+                         uint8_t *ou,
+                         uint8_t *ov,
+                         int ouv_stride,
+                         int oheight,             // destination luma rows
+                         int owidth);             // destination luma columns
+void vp10_resize_frame444(const uint8_t *const y,  // 4:4:4: all three planes resized at the same size
+                         int y_stride,
+                         const uint8_t *const u,
+                         const uint8_t *const v,
+                         int uv_stride,
+                         int height,              // source rows, every plane
+                         int width,               // source columns, every plane
+                         uint8_t *oy,
+                         int oy_stride,
+                         uint8_t *ou,
+                         uint8_t *ov,
+                         int ouv_stride,
+                         int oheight,             // destination rows, every plane
+                         int owidth);             // destination columns, every plane
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_resize_plane(const uint8_t *const input,  // source plane; accessed via CONVERT_TO_SHORTPTR()
+                             int height,                   // source rows
+                             int width,                    // source columns
+                             int in_stride,                // offset between source rows
+                             uint8_t *output,              // destination plane; accessed via CONVERT_TO_SHORTPTR()
+                             int height2,                  // destination rows
+                             int width2,                   // destination columns
+                             int out_stride,               // offset between destination rows
+                             int bd);                      // forwarded to clip_pixel_highbd(); presumably the bit depth
+void vp10_highbd_resize_frame420(const uint8_t *const y,  // 4:2:0: chroma resized at half width and half height
+                                int y_stride,            // offset between source luma rows
+                                const uint8_t *const u,  // source chroma planes
+                                const uint8_t *const v,
+                                int uv_stride,           // row offset shared by u and v
+                                int height,              // source luma rows
+                                int width,               // source luma columns
+                                uint8_t *oy,             // destination luma plane
+                                int oy_stride,           // offset between destination luma rows
+                                uint8_t *ou,             // destination chroma planes
+                                uint8_t *ov,
+                                int ouv_stride,          // row offset shared by ou and ov
+                                int oheight,             // destination luma rows
+                                int owidth,              // destination luma columns
+                                int bd);                 // passed to every plane resize
+void vp10_highbd_resize_frame422(const uint8_t *const y,  // 4:2:2: chroma resized at half width, full height
+                                int y_stride,
+                                const uint8_t *const u,
+                                const uint8_t *const v,
+                                int uv_stride,
+                                int height,              // source luma rows
+                                int width,               // source luma columns
+                                uint8_t *oy,
+                                int oy_stride,
+                                uint8_t *ou,
+                                uint8_t *ov,
+                                int ouv_stride,
+                                int oheight,             // destination luma rows
+                                int owidth,              // destination luma columns
+                                int bd);                 // passed to every plane resize
+void vp10_highbd_resize_frame444(const uint8_t *const y,  // 4:4:4: all three planes resized at the same size
+                                int y_stride,
+                                const uint8_t *const u,
+                                const uint8_t *const v,
+                                int uv_stride,
+                                int height,              // source rows, every plane
+                                int width,               // source columns, every plane
+                                uint8_t *oy,
+                                int oy_stride,
+                                uint8_t *ou,
+                                uint8_t *ov,
+                                int ouv_stride,
+                                int oheight,             // destination rows, every plane
+                                int owidth,              // destination columns, every plane
+                                int bd);                 // passed to every plane resize
+#endif    // CONFIG_VP9_HIGHBITDEPTH
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif    // VP10_ENCODER_RESIZE_H_
diff --git a/libs/libvpx/vp10/encoder/segmentation.c b/libs/libvpx/vp10/encoder/segmentation.c
new file mode 100644
index 0000000000..677910fa37
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/segmentation.c
@@ -0,0 +1,331 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp10/common/pred_common.h"
+#include "vp10/common/tile_common.h"
+
+#include "vp10/encoder/cost.h"
+#include "vp10/encoder/segmentation.h"
+#include "vp10/encoder/subexp.h"
+
+// Switches segmentation on and raises both update flags along with it.
+void vp10_enable_segmentation(struct segmentation *seg) {
+  // One chained store: enabled, update_map and update_data all become 1.
+  seg->update_data = seg->update_map = seg->enabled = 1;
+}
+
+// Switches segmentation off and lowers both update flags along with it.
+void vp10_disable_segmentation(struct segmentation *seg) {
+  // One chained store: enabled, update_map and update_data all become 0.
+  seg->update_data = seg->update_map = seg->enabled = 0;
+}
+
+// Records abs_delta on seg and overwrites seg->feature_data with the caller's
+// table; exactly sizeof(seg->feature_data) bytes are read from feature_data.
+void vp10_set_segment_data(struct segmentation *seg, signed char *feature_data,
+                          unsigned char abs_delta) {
+  seg->abs_delta = abs_delta;
+  memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
+}
+void vp10_disable_segfeature(struct segmentation *seg, int segment_id,
+                            SEG_LVL_FEATURES feature_id) {
+  seg->feature_mask[segment_id] &= ~(1 << feature_id);  // clear that one bit
+}
+
+void vp10_clear_segdata(struct segmentation *seg, int segment_id,
+                       SEG_LVL_FEATURES feature_id) {
+  seg->feature_data[segment_id][feature_id] = 0;  // mask bit is not touched
+}
+
+// Fills segment_tree_probs[0..6] from the eight per-segment counts.
+static void calc_segtree_probs(unsigned *segcounts,
+    vpx_prob *segment_tree_probs, const vpx_prob *cur_tree_probs) {
+  // cc[] holds the sums of adjacent count pairs, ccc[] the sums of each half.
+  const unsigned cc[4] = {
+    segcounts[0] + segcounts[1], segcounts[2] + segcounts[3],
+    segcounts[4] + segcounts[5], segcounts[6] + segcounts[7]
+  };
+  const unsigned ccc[2] = { cc[0] + cc[1], cc[2] + cc[3] };
+#if CONFIG_MISC_FIXES
+  int i;
+#endif
+  // Node 0 uses the halves, nodes 1-2 the pair sums, nodes 3-6 single counts.
+  segment_tree_probs[0] = get_binary_prob(ccc[0], ccc[1]);
+  segment_tree_probs[1] = get_binary_prob(cc[0], cc[1]);
+  segment_tree_probs[2] = get_binary_prob(cc[2], cc[3]);
+  segment_tree_probs[3] = get_binary_prob(segcounts[0], segcounts[1]);
+  segment_tree_probs[4] = get_binary_prob(segcounts[2], segcounts[3]);
+  segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]);
+  segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]);
+
+#if CONFIG_MISC_FIXES
+  for (i = 0; i < 7; i++) {  // ct points at the two counts behind node i
+    const unsigned *ct = i == 0 ? ccc : i < 3 ? cc + (i & 2)
+        : segcounts + (i - 3) * 2;
+    vp10_prob_diff_update_savings_search(ct,
+        cur_tree_probs[i], &segment_tree_probs[i], DIFF_UPDATE_PROB);
+  }
+#else
+  (void) cur_tree_probs;  // not needed in this configuration
+#endif
+}
+
+// Estimates the cost of coding a segment map with the given counts and probs
+static int cost_segmap(unsigned *segcounts, vpx_prob *probs) {
+  const int n01 = segcounts[0] + segcounts[1];
+  const int n23 = segcounts[2] + segcounts[3];
+  const int n45 = segcounts[4] + segcounts[5];
+  const int n67 = segcounts[6] + segcounts[7];
+  const int n_lo = n01 + n23;
+  const int n_hi = n45 + n67;
+
+  // Root node: ids 0-3 take the zero branch, ids 4-7 the one branch
+  int total = n_lo * vp10_cost_zero(probs[0]) +
+              n_hi * vp10_cost_one(probs[0]);
+
+  // Lower half of the tree, skipped entirely when it received no counts
+  if (n_lo > 0) {
+    total += n01 * vp10_cost_zero(probs[1]) +
+             n23 * vp10_cost_one(probs[1]);
+
+    if (n01 > 0)
+      total += segcounts[0] * vp10_cost_zero(probs[3]) +
+               segcounts[1] * vp10_cost_one(probs[3]);
+    if (n23 > 0)
+      total += segcounts[2] * vp10_cost_zero(probs[4]) +
+               segcounts[3] * vp10_cost_one(probs[4]);
+  }
+  // Upper half of the tree, same pattern with probs[2], probs[5], probs[6]
+  if (n_hi > 0) {
+    total += n45 * vp10_cost_zero(probs[2]) +
+             n67 * vp10_cost_one(probs[2]);
+
+    if (n45 > 0)
+      total += segcounts[4] * vp10_cost_zero(probs[5]) +
+               segcounts[5] * vp10_cost_one(probs[5]);
+    if (n67 > 0)
+      total += segcounts[6] * vp10_cost_zero(probs[6]) +
+               segcounts[7] * vp10_cost_one(probs[6]);
+  }
+
+  return total;
+}
+
+static void count_segs(const VP10_COMMON *cm, MACROBLOCKD *xd,
+                       const TileInfo *tile, MODE_INFO **mi,
+                       unsigned *no_pred_segcounts,
+                       unsigned (*temporal_predictor_count)[2],
+                       unsigned *t_unpred_seg_counts,
+                       int bw, int bh, int mi_row, int mi_col) {
+  int seg_id;
+  // Tallies one block's segment id into the counters passed by the caller.
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;  // block starts outside the frame
+
+  xd->mi = mi;
+  seg_id = xd->mi[0]->mbmi.segment_id;
+
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+  // Every block counts towards the "no prediction" histogram.
+  no_pred_segcounts[seg_id]++;
+
+  // The temporal statistics are gathered on non-key frames only.
+  if (cm->frame_type != KEY_FRAME) {
+    const BLOCK_SIZE block_size = xd->mi[0]->mbmi.sb_type;
+    // Segment id predicted for this block from the last frame's map.
+    const int predicted_id = get_segment_id(cm, cm->last_frame_seg_map,
+                                            block_size, mi_row, mi_col);
+    const int hit = predicted_id == seg_id;
+    const int ctx = vp10_get_pred_context_seg_id(xd);
+
+    // Record the outcome on the block itself and in the per-context
+    // hit/miss table.
+    xd->mi[0]->mbmi.seg_id_predicted = hit;
+    temporal_predictor_count[ctx][hit]++;
+
+    // Misses additionally go into t_unpred_seg_counts.
+    if (!hit)
+      t_unpred_seg_counts[seg_id]++;
+  }
+}
+
+static void count_segs_sb(const VP10_COMMON *cm, MACROBLOCKD *xd,
+                          const TileInfo *tile, MODE_INFO **mi,
+                          unsigned *no_pred_segcounts,
+                          unsigned (*temporal_predictor_count)[2],
+                          unsigned *t_unpred_seg_counts,
+                          int mi_row, int mi_col,
+                          BLOCK_SIZE bsize) {
+  const int mis = cm->mi_stride;
+  int bw, bh;
+  const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
+  // Walks the partition at (mi_row, mi_col), calling count_segs() per block.
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+  // bw/bh: size of the block stored at mi[0]; bs: size of the area examined.
+  bw = num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type];
+  bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type];
+
+  if (bw == bs && bh == bs) {  // one block covers the whole area
+    count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+  } else if (bw == bs && bh < bs) {  // full width: upper half, then lower half
+    count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+    count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+               temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+               mi_row + hbs, mi_col);
+  } else if (bw < bs && bh == bs) {  // full height: left half, then right half
+    count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+    count_segs(cm, xd, tile, mi + hbs,
+               no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts,
+               hbs, bs, mi_row, mi_col + hbs);
+  } else {  // smaller in both directions: recurse into the four quadrants
+    const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
+    int n;
+
+    assert(bw < bs && bh < bs);
+
+    for (n = 0; n < 4; n++) {
+      const int mi_dc = hbs * (n & 1);   // column offset: 0 or hbs
+      const int mi_dr = hbs * (n >> 1);  // row offset: 0 or hbs
+
+      count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc],
+                    no_pred_segcounts, temporal_predictor_count,
+                    t_unpred_seg_counts,
+                    mi_row + mi_dr, mi_col + mi_dc, subsize);
+    }
+  }
+}
+
+void vp10_choose_segmap_coding_method(VP10_COMMON *cm, MACROBLOCKD *xd) {
+  struct segmentation *seg = &cm->seg;
+#if CONFIG_MISC_FIXES
+  struct segmentation_probs *segp = &cm->fc->seg;
+#else
+  struct segmentation_probs *segp = &cm->segp;
+#endif
+  // Chooses seg->temporal_update by costing the segment map both ways.
+  int no_pred_cost;
+  int t_pred_cost = INT_MAX;  // stays INT_MAX if the temporal path is skipped
+
+  int i, tile_col, mi_row, mi_col;
+  // Counters: cm->counts.seg with CONFIG_MISC_FIXES, zeroed locals otherwise.
+#if CONFIG_MISC_FIXES
+  unsigned (*temporal_predictor_count)[2] = cm->counts.seg.pred;
+  unsigned *no_pred_segcounts = cm->counts.seg.tree_total;
+  unsigned *t_unpred_seg_counts = cm->counts.seg.tree_mispred;
+#else
+  unsigned temporal_predictor_count[PREDICTION_PROBS][2] = { { 0 } };
+  unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 };
+  unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
+#endif
+
+  vpx_prob no_pred_tree[SEG_TREE_PROBS];
+  vpx_prob t_pred_tree[SEG_TREE_PROBS];
+  vpx_prob t_nopred_prob[PREDICTION_PROBS];
+
+#if CONFIG_MISC_FIXES
+  (void) xd;  // NOTE(review): xd is still passed to count_segs_sb() below
+#else
+  // Set default state for the segment tree probabilities and the
+  // temporal coding probabilities
+  memset(segp->tree_probs, 255, sizeof(segp->tree_probs));
+  memset(segp->pred_probs, 255, sizeof(segp->pred_probs));
+#endif
+
+  // First of all generate stats regarding how well the last segment map
+  // predicts this one, one tile column at a time
+  for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
+    TileInfo tile;
+    MODE_INFO **mi_ptr;
+    vp10_tile_init(&tile, cm, 0, tile_col);
+    // Visit the tile in steps of 8 mi rows and columns (one BLOCK_64X64).
+    mi_ptr = cm->mi_grid_visible + tile.mi_col_start;
+    for (mi_row = 0; mi_row < cm->mi_rows;
+         mi_row += 8, mi_ptr += 8 * cm->mi_stride) {
+      MODE_INFO **mi = mi_ptr;
+      for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
+           mi_col += 8, mi += 8)
+        count_segs_sb(cm, xd, &tile, mi, no_pred_segcounts,
+                      temporal_predictor_count, t_unpred_seg_counts,
+                      mi_row, mi_col, BLOCK_64X64);
+    }
+  }
+
+  // Work out probability tree for coding segments without prediction
+  // and the cost.
+  calc_segtree_probs(no_pred_segcounts, no_pred_tree, segp->tree_probs);
+  no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree);
+
+  // Intra-only frames and error resilient mode cannot use temporal prediction
+  if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
+    // Work out probability tree for coding those segments not
+    // predicted using the temporal method and the cost.
+    calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs);
+    t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree);
+
+    // Add in the cost of the signaling for each prediction context.
+    for (i = 0; i < PREDICTION_PROBS; i++) {
+      const int count0 = temporal_predictor_count[i][0];
+      const int count1 = temporal_predictor_count[i][1];
+
+#if CONFIG_MISC_FIXES
+      vp10_prob_diff_update_savings_search(temporal_predictor_count[i],
+                                           segp->pred_probs[i],
+                                           &t_nopred_prob[i], DIFF_UPDATE_PROB);
+#else
+      t_nopred_prob[i] = get_binary_prob(count0, count1);
+#endif
+
+      // Add in the predictor signaling cost
+      t_pred_cost += count0 * vp10_cost_zero(t_nopred_prob[i]) +
+                     count1 * vp10_cost_one(t_nopred_prob[i]);
+    }
+  }
+
+  // Now choose which coding method to use: temporal only if strictly cheaper.
+  if (t_pred_cost < no_pred_cost) {
+    assert(!cm->error_resilient_mode);
+    seg->temporal_update = 1;
+#if !CONFIG_MISC_FIXES
+    memcpy(segp->tree_probs, t_pred_tree, sizeof(t_pred_tree));
+    memcpy(segp->pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
+#endif
+  } else {
+    seg->temporal_update = 0;
+#if !CONFIG_MISC_FIXES
+    memcpy(segp->tree_probs, no_pred_tree, sizeof(no_pred_tree));
+#endif
+  }
+}
+
+void vp10_reset_segment_features(VP10_COMMON *cm) {
+  struct segmentation *seg = &cm->seg;
+#if !CONFIG_MISC_FIXES
+  struct segmentation_probs *segp = &cm->segp;
+#endif
+
+  // Default state for the MB feature flags: enabled, update_map and
+  // update_data all go back to 0, which is what the disable helper stores.
+  vp10_disable_segmentation(seg);
+#if !CONFIG_MISC_FIXES
+  // Every byte of the tree probabilities is set to 255.
+  memset(segp->tree_probs, 255, sizeof(segp->tree_probs));
+#endif
+  vp10_clearall_segfeatures(seg);
+}
diff --git a/libs/libvpx/vp10/encoder/segmentation.h b/libs/libvpx/vp10/encoder/segmentation.h
new file mode 100644
index 0000000000..b8e6c06c69
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/segmentation.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_ENCODER_SEGMENTATION_H_
+#define VP10_ENCODER_SEGMENTATION_H_
+
+#include "vp10/common/blockd.h"
+#include "vp10/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_enable_segmentation(struct segmentation *seg);
+void vp10_disable_segmentation(struct segmentation *seg);
+// enable/disable store 1/0 in enabled, update_map and update_data of seg.
+void vp10_disable_segfeature(struct segmentation *seg,
+                            int segment_id,
+                            SEG_LVL_FEATURES feature_id);  // clears mask bit
+void vp10_clear_segdata(struct segmentation *seg,
+                       int segment_id,
+                       SEG_LVL_FEATURES feature_id);  // zeroes the data entry
+
+// The values given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
+// SEGMENT_ALT_LF)
+// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for
+// SEGMENT_ALT_LF)
+//
+// abs_delta = SEGMENT_DELTADATA (use deltas) or SEGMENT_ABSDATA (use the
+// absolute values given).
+void vp10_set_segment_data(struct segmentation *seg, signed char *feature_data,
+                          unsigned char abs_delta);
+// Sets cm->seg.temporal_update to 1 or 0 after costing both map codings.
+void vp10_choose_segmap_coding_method(VP10_COMMON *cm, MACROBLOCKD *xd);
+// Zeroes enabled, update_map and update_data and clears all seg features.
+void vp10_reset_segment_features(VP10_COMMON *cm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_SEGMENTATION_H_
diff --git a/libs/libvpx/vp10/encoder/skin_detection.c b/libs/libvpx/vp10/encoder/skin_detection.c
new file mode 100644
index 0000000000..9aac477a8a
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/skin_detection.c
@@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp10/common/blockd.h"
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/skin_detection.h"
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[2] = {7463, 9614};                 // q6
+static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157};  // q16
+static const int skin_threshold = 1570636;                    // q18
+
+// Thresholds on luminance.
+static const int y_low = 20;
+static const int y_high = 220;
+
+// Evaluates the Mahalanobis distance measure of (cb, cr) from skin_mean, with
+// skin_inv_cov as the 2x2 weight matrix; everything stays in integers.
+static int evaluate_skin_color_difference(int cb, int cr) {
+  // Offsets from the model mean after scaling the inputs by 64 (q6).
+  const int cb_off_q6 = (cb << 6) - skin_mean[0];
+  const int cr_off_q6 = (cr << 6) - skin_mean[1];
+  // Products of the offsets, then rounded and shifted right by 10 bits.
+  const int cb_sq_q12 = cb_off_q6 * cb_off_q6;
+  const int cross_q12 = cb_off_q6 * cr_off_q6;
+  const int cr_sq_q12 = cr_off_q6 * cr_off_q6;
+  const int cb_sq_q2 = (cb_sq_q12 + (1 << 9)) >> 10;
+  const int cross_q2 = (cross_q12 + (1 << 9)) >> 10;
+  const int cr_sq_q2 = (cr_sq_q12 + (1 << 9)) >> 10;
+  return skin_inv_cov[0] * cb_sq_q2 + skin_inv_cov[1] * cross_q2 +
+         skin_inv_cov[2] * cross_q2 + skin_inv_cov[3] * cr_sq_q2;
+}
+
+int vp10_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr) {
+  // Luma outside [y_low, y_high] is rejected before the chroma model runs.
+  const int luma_ok = y >= y_low && y <= y_high;
+
+  return luma_ok && evaluate_skin_color_difference(cb, cr) < skin_threshold;
+}
+
+#ifdef OUTPUT_YUV_SKINMAP
+// Debug aid: builds a skin map frame and hands it to vp10_write_yuv_frame_420.
+void vp10_compute_skin_map(VP10_COMP *const cpi, FILE *yuv_skinmap_file) {
+  int i, j, mi_row, mi_col;
+  VP10_COMMON *const cm = &cpi->common;
+  uint8_t *y;
+  const uint8_t *src_y = cpi->Source->y_buffer;
+  const uint8_t *src_u = cpi->Source->u_buffer;
+  const uint8_t *src_v = cpi->Source->v_buffer;
+  const int src_ystride = cpi->Source->y_stride;
+  const int src_uvstride = cpi->Source->uv_stride;
+  YV12_BUFFER_CONFIG skinmap;
+  memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG));
+  if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height,
+                             cm->subsampling_x, cm->subsampling_y,
+                             VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment)) {
+    vpx_free_frame_buffer(&skinmap);  // allocation failed: nothing is written
+    return;
+  }
+  memset(skinmap.buffer_alloc, 128, skinmap.frame_size);
+  y = skinmap.y_buffer;
+  // Loop through 8x8 blocks and set skin map based on center pixel of block.
+  // Set y to white for a skin block, otherwise copy the source luma. The map
+  // is addressed with its own stride; rightmost/bottom blocks are ignored.
+  for (mi_row = 0; mi_row < cm->mi_rows - 1; ++mi_row) {
+    for (mi_col = 0; mi_col < cm->mi_cols - 1; ++mi_col) {
+      // Use middle pixel for each 8x8 block for skin detection.
+      // NOTE(review): the chroma offsets assume 4:2:0 subsampling -- confirm.
+      const uint8_t ysource = src_y[4 * src_ystride + 4];
+      const uint8_t usource = src_u[2 * src_uvstride + 2];
+      const uint8_t vsource = src_v[2 * src_uvstride + 2];
+      const int is_skin = vp10_skin_pixel(ysource, usource, vsource);
+      for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+          if (is_skin)
+            y[i * skinmap.y_stride + j] = 255;
+          else
+            y[i * skinmap.y_stride + j] = src_y[i * src_ystride + j];
+        }
+      }
+      y += 8;
+      src_y += 8;
+      src_u += 4;
+      src_v += 4;
+    }
+    y += (skinmap.y_stride << 3) - ((cm->mi_cols - 1) << 3);  // next block row
+    src_y += (src_ystride << 3) - ((cm->mi_cols - 1) << 3);
+    src_u += (src_uvstride << 2) - ((cm->mi_cols - 1) << 2);
+    src_v += (src_uvstride << 2) - ((cm->mi_cols - 1) << 2);
+  }
+  vp10_write_yuv_frame_420(&skinmap, yuv_skinmap_file);
+  vpx_free_frame_buffer(&skinmap);
+}
+#endif
diff --git a/libs/libvpx/vp10/encoder/skin_detection.h b/libs/libvpx/vp10/encoder/skin_detection.h
new file mode 100644
index 0000000000..26b7d5e7c6
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/skin_detection.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_SKIN_MAP_H_
+#define VP10_ENCODER_SKIN_MAP_H_
+
+#include "vp10/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP10_COMP;
+
+// #define OUTPUT_YUV_SKINMAP
+
+int vp10_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr);
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source; uses the struct tag declared above.
+void vp10_compute_skin_map(struct VP10_COMP *const cpi, FILE *yuv_skinmap_file);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_SKIN_MAP_H_
diff --git a/libs/libvpx/vp10/encoder/speed_features.c b/libs/libvpx/vp10/encoder/speed_features.c
new file mode 100644
index 0000000000..ce0aebeab0
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/speed_features.c
@@ -0,0 +1,580 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/speed_features.h"
+#include "vp10/encoder/rdopt.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Mesh search patterns for various speed settings
+static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] =
+    {{64, 4}, {28, 2}, {15, 1}, {7, 1}};
+
+#define MAX_MESH_SPEED 5  // Max speed setting for mesh motion method
+static MESH_PATTERN good_quality_mesh_patterns[MAX_MESH_SPEED + 1]
+                                              [MAX_MESH_STEP] =
+    {{{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+     {{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+     {{64, 8},  {14, 2}, {7, 1},  {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+    };
+static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] =
+    {50, 25, 15, 5, 1, 1};
+
+// Intra only frames, golden frames (except alt ref overlays) and alt ref
+// frames tend to be coded at a higher than ambient quality ("boosted").
+static int frame_is_boosted(const VP10_COMP *cpi) {
+  return frame_is_kf_gf_arf(cpi);  // the whole test lives in this helper
+}
+
+// Returns the partition size down to which the auto partition code will
+// always search (it can go lower), chosen from the image dimensions. The
+// logic: how offensive ringing artefacts are depends partly on the screen
+// area over which they propagate. Propagation is limited by the transform
+// block size, but the screen area taken up by a given block size is larger
+// for a small image format stretched to full screen.
+static BLOCK_SIZE set_partition_min_limit(VP10_COMMON *const cm) {
+  const unsigned int frame_area = (cm->width * cm->height);
+
+  // Select the block size from the image format size; the two thresholds
+  // are the pixel counts of 720P and 1080P frames.
+  if (frame_area < 1280 * 720)
+    return BLOCK_4X4;  // formats smaller in area than 720P
+
+  // Format >= 720P and < 1080P
+  if (frame_area < 1920 * 1080)
+    return BLOCK_8X8;
+
+  // Formats 1080P and up
+  return BLOCK_16X16;
+}
+
+// Frame-size dependent part of the good quality speed features. The speed
+// blocks run in ascending order, so a higher level overwrites a lower one.
+static void set_good_speed_feature_framesize_dependent(VP10_COMP *cpi,
+                                                       SPEED_FEATURES *sf,
+                                                       int speed) {
+  VP10_COMMON *const cm = &cpi->common;
+  const int is_720p_or_larger = VPXMIN(cm->width, cm->height) >= 720;
+
+  if (speed >= 1) {
+    if (is_720p_or_larger) {
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+      sf->partition_search_breakout_dist_thr = (1 << 23);
+    } else {
+      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+      sf->partition_search_breakout_dist_thr = (1 << 21);
+    }
+  }
+
+  if (speed >= 2) {
+    if (is_720p_or_larger) {
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+      sf->adaptive_pred_interp_filter = 0;
+      sf->partition_search_breakout_dist_thr = (1 << 24);
+      sf->partition_search_breakout_rate_thr = 120;
+    } else {
+      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+      sf->partition_search_breakout_dist_thr = (1 << 22);
+      sf->partition_search_breakout_rate_thr = 100;
+    }
+    sf->rd_auto_partition_min_limit = set_partition_min_limit(cm);
+  }
+
+  if (speed >= 3) {
+    if (is_720p_or_larger) {
+      sf->disable_split_mask = DISABLE_ALL_SPLIT;
+      sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
+      sf->partition_search_breakout_dist_thr = (1 << 25);
+      sf->partition_search_breakout_rate_thr = 200;
+    } else {
+      sf->max_intra_bsize = BLOCK_32X32;
+      sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+      sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
+      sf->partition_search_breakout_dist_thr = (1 << 23);
+      sf->partition_search_breakout_rate_thr = 120;
+    }
+  }
+
+  // For a two pass clip that fits the criteria for animated or graphics
+  // content, or whose image edge is internal to the coded area, put
+  // disable_split_mask back to DISABLE_COMPOUND_SPLIT (speed 4 overrides it).
+  if ((speed >= 1) && (cpi->oxcf.pass == 2) &&
+      ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+       (vp10_internal_image_edge(cpi)))) {
+    sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+  }
+
+  if (speed >= 4) {
+    sf->partition_search_breakout_dist_thr =
+        is_720p_or_larger ? (1 << 26) : (1 << 24);
+    sf->disable_split_mask = DISABLE_ALL_SPLIT;
+  }
+}
+
+static void set_good_speed_feature(VP10_COMP *cpi, VP10_COMMON *cm,
+                                   SPEED_FEATURES *sf, int speed) {
+  const int boosted = frame_is_boosted(cpi);
+  // Set at every speed; the speed >= N blocks below override or extend them.
+  sf->adaptive_rd_thresh = 1;
+  sf->allow_skip_recode = 1;
+
+  if (speed >= 1) {
+    if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+        vp10_internal_image_edge(cpi)) {
+      sf->use_square_partition_only = !frame_is_boosted(cpi);
+    } else {
+      sf->use_square_partition_only = !frame_is_intra_only(cm);
+    }
+
+    sf->less_rectangular_check  = 1;
+
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->mv.auto_mv_step_size = 1;
+    sf->adaptive_rd_thresh = 2;
+    sf->mv.subpel_iters_per_step = 1;
+    sf->mode_skip_start = 10;
+    sf->adaptive_pred_interp_filter = 1;
+
+    sf->recode_loop = ALLOW_RECODE_KFARFGF;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+
+    sf->tx_size_search_breakout = 1;
+    sf->partition_search_breakout_rate_thr = 80;
+  }
+
+  if (speed >= 2) {
+    sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
+                                                      : USE_LARGESTALL;
+
+    sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
+                                 FLAG_SKIP_INTRA_DIRMISMATCH |
+                                 FLAG_SKIP_INTRA_BESTINTER |
+                                 FLAG_SKIP_COMP_BESTINTRA |
+                                 FLAG_SKIP_INTRA_LOWVAR;
+    sf->disable_filter_search_var_thresh = 100;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+    sf->allow_partition_search_skip = 1;
+  }
+
+  if (speed >= 3) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
+                                                        : USE_LARGESTALL;
+    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
+    sf->adaptive_pred_interp_filter = 0;
+    sf->adaptive_mode_search = 1;
+    sf->cb_partition_search = !boosted;
+    sf->cb_pred_filter_search = 1;
+    sf->alt_ref_search_fp = 1;
+    sf->recode_loop = ALLOW_RECODE_KFMAXBW;
+    sf->adaptive_rd_thresh = 3;
+    sf->mode_skip_start = 6;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
+    sf->adaptive_interp_filter_search = 1;
+  }
+
+  if (speed >= 4) {
+    sf->use_square_partition_only = 1;
+    sf->tx_size_search_method = USE_LARGESTALL;
+    sf->mv.search_method = BIGDIA;
+    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+    sf->adaptive_rd_thresh = 4;
+    if (cm->frame_type != KEY_FRAME)
+      sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE;  // keeps prior flags
+    sf->disable_filter_search_var_thresh = 200;
+    sf->use_lp32x32fdct = 1;
+    sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
+    sf->use_fast_coef_costing = 1;
+    sf->partition_search_breakout_rate_thr = 300;
+  }
+
+  if (speed >= 5) {
+    int i;
+    sf->optimize_coefficients = 0;
+    sf->mv.search_method = HEX;
+    sf->disable_filter_search_var_thresh = 500;
+    for (i = 0; i < TX_SIZES; ++i) {  // every transform size: INTRA_DC only
+      sf->intra_y_mode_mask[i] = INTRA_DC;
+      sf->intra_uv_mode_mask[i] = INTRA_DC;
+    }
+    sf->partition_search_breakout_rate_thr = 500;
+    sf->mv.reduce_first_step_size = 1;
+    sf->simple_model_rd_from_var = 1;
+  }
+}
+
+// Frame-size dependent part of the real-time speed features. The speed
+// blocks run in ascending order, so a higher level overwrites a lower one.
+static void set_rt_speed_feature_framesize_dependent(VP10_COMP *cpi,
+    SPEED_FEATURES *sf, int speed) {
+  VP10_COMMON *const cm = &cpi->common;
+  const int is_720p_or_larger = VPXMIN(cm->width, cm->height) >= 720;
+
+  if (speed >= 1) {
+    if (is_720p_or_larger) {
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+    } else {
+      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+    }
+  }
+
+  if (speed >= 2) {
+    if (is_720p_or_larger) {
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+    } else {
+      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+    }
+  }
+
+  // From speed 5 on only the breakout thresholds are set here.
+  if (speed >= 5) {
+    sf->partition_search_breakout_dist_thr =
+        is_720p_or_larger ? (1 << 25) : (1 << 23);
+  }
+
+  if (speed >= 7) {
+    sf->encode_breakout_thresh = is_720p_or_larger ? 800 : 300;
+  }
+}
+
+static void set_rt_speed_feature(VP10_COMP *cpi, SPEED_FEATURES *sf,
+                                 int speed, vp9e_tune_content content) {
+  VP10_COMMON *const cm = &cpi->common;
+  const int is_keyframe = cm->frame_type == KEY_FRAME;
+  const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
+  sf->static_segmentation = 0;
+  sf->adaptive_rd_thresh = 1;
+  sf->use_fast_coef_costing = 1;
+  sf->allow_exhaustive_searches = 0;
+  sf->exhaustive_searches_thresh = INT_MAX;
+
+  if (speed >= 1) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->less_rectangular_check = 1;
+    sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
+                                                        : USE_LARGESTALL;
+
+    sf->use_rd_breakout = 1;
+
+    sf->adaptive_motion_search = 1;
+    sf->adaptive_pred_interp_filter = 1;
+    sf->mv.auto_mv_step_size = 1;
+    sf->adaptive_rd_thresh = 2;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+  }
+
+  if (speed >= 2) {
+    sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
+                                 FLAG_SKIP_INTRA_DIRMISMATCH |
+                                 FLAG_SKIP_INTRA_BESTINTER |
+                                 FLAG_SKIP_COMP_BESTINTRA |
+                                 FLAG_SKIP_INTRA_LOWVAR;
+    sf->adaptive_pred_interp_filter = 2;
+    sf->disable_filter_search_var_thresh = 50;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+    sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
+    sf->adjust_partitioning_from_last_frame = 1;
+    sf->last_partitioning_redo_frequency = 3;
+    sf->use_lp32x32fdct = 1;
+    sf->mode_skip_start = 11;
+    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+  }
+
+  if (speed >= 3) {
+    sf->use_square_partition_only = 1;
+    sf->disable_filter_search_var_thresh = 100;
+    sf->use_uv_intra_rd_estimate = 1;
+    sf->mv.subpel_iters_per_step = 1;
+    sf->adaptive_rd_thresh = 4;
+    sf->mode_skip_start = 6;
+    sf->allow_skip_recode = 0;
+    sf->optimize_coefficients = 0;
+    sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    sf->lpf_pick = LPF_PICK_FROM_Q;
+  }
+
+  if (speed >= 4) {
+    int i;
+    sf->last_partitioning_redo_frequency = 4;
+    sf->adaptive_rd_thresh = 5;
+    sf->use_fast_coef_costing = 0;
+    sf->auto_min_max_partition_size = STRICT_NEIGHBORING_MIN_MAX;
+    sf->adjust_partitioning_from_last_frame =
+        cm->last_frame_type != cm->frame_type || (0 ==
+        (frames_since_key + 1) % sf->last_partitioning_redo_frequency);
+    sf->mv.subpel_force_stop = 1;
+    for (i = 0; i < TX_SIZES; i++) {
+      sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
+      sf->intra_uv_mode_mask[i] = INTRA_DC;
+    }
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+    sf->frame_parameter_update = 0;
+    sf->mv.search_method = FAST_HEX;
+
+    sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
+    sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
+    sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
+    sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
+    sf->max_intra_bsize = BLOCK_32X32;
+    sf->allow_skip_recode = 1;
+  }
+
+  if (speed >= 5) {
+    sf->use_quant_fp = !is_keyframe;
+    sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX
+                                                  : STRICT_NEIGHBORING_MIN_MAX;
+    sf->default_max_partition_size = BLOCK_32X32;
+    sf->default_min_partition_size = BLOCK_8X8;
+    sf->force_frame_boost = is_keyframe ||
+        (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
+    sf->max_delta_qindex = is_keyframe ? 20 : 15;
+    sf->partition_search_type = REFERENCE_PARTITION;
+    sf->allow_skip_recode = 0;
+    sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO;
+    sf->adaptive_rd_thresh = 2;
+    // This feature is only enabled when partition search is disabled.
+    sf->reuse_inter_pred_sby = 1;
+    sf->partition_search_breakout_rate_thr = 200;
+    sf->coeff_prob_appx_step = 4;
+    sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED;
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH;
+    sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
+    sf->simple_model_rd_from_var = 1;
+
+    if (!is_keyframe) {
+      int i;
+      if (content == VP9E_CONTENT_SCREEN) {
+        for (i = 0; i < BLOCK_SIZES; ++i)
+          sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V;
+      } else {
+        for (i = 0; i < BLOCK_SIZES; ++i)
+          if (i >= BLOCK_16X16)
+            sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
+          else
+            // Use H and V intra mode for block sizes <= 16X16.
+            sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V;
+      }
+    }
+  }
+
+  if (speed >= 6) {
+    // Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION.
+    sf->partition_search_type = VAR_BASED_PARTITION;
+    // Turn on this to use non-RD key frame coding mode.
+    sf->mv.search_method = NSTEP;
+    sf->mv.reduce_first_step_size = 1;
+  }
+
+  if (speed >= 7) {
+    sf->adaptive_rd_thresh = 3;
+    sf->mv.search_method = FAST_DIAMOND;
+    sf->mv.fullpel_search_step_param = 10;
+  }
+  if (speed >= 8) {
+    sf->adaptive_rd_thresh = 4;
+    sf->mv.subpel_force_stop = 2;
+    sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
+  }
+}
+
+void vp10_set_speed_features_framesize_dependent(VP10_COMP *cpi) {
+  // Applies the frame-size-dependent preset for the configured mode and speed,
+  // then reconciles settings that depend on the resulting values.
+  SPEED_FEATURES *const sf = &cpi->sf;
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  RD_OPT *const rd = &cpi->rd;
+  int ref;
+
+  if (oxcf->mode == REALTIME)
+    set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
+  else if (oxcf->mode == GOOD)
+    set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
+
+  // adaptive_pred_interp_filter is forced off when all splits are disabled.
+  if (sf->disable_split_mask == DISABLE_ALL_SPLIT)
+    sf->adaptive_pred_interp_filter = 0;
+
+  // REALTIME only: lift a non-zero encode_breakout up to the sf threshold.
+  if (oxcf->mode == REALTIME && cpi->encode_breakout &&
+      cpi->encode_breakout < sf->encode_breakout_thresh)
+    cpi->encode_breakout = sf->encode_breakout_thresh;
+
+  // Check for masked out split cases.
+  for (ref = 0; ref < MAX_REFS; ++ref) {
+    const int ref_bit = 1 << ref;
+    if (sf->disable_split_mask & ref_bit) rd->thresh_mult_sub8x8[ref] = INT_MAX;
+  }
+}
+
+void vp10_set_speed_features_framesize_independent(VP10_COMP *cpi) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->td.mb;
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  int i;
+
+  // Best quality defaults; the mode-specific presets below override them.
+  sf->frame_parameter_update = 1;
+  sf->mv.search_method = NSTEP;
+  sf->recode_loop = ALLOW_RECODE;
+  sf->mv.subpel_search_method = SUBPEL_TREE;
+  sf->mv.subpel_iters_per_step = 2;
+  sf->mv.subpel_force_stop = 0;
+  sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf);
+  sf->mv.reduce_first_step_size = 0;
+  sf->coeff_prob_appx_step = 1;
+  sf->mv.auto_mv_step_size = 0;
+  sf->mv.fullpel_search_step_param = 6;
+  sf->comp_inter_joint_search_thresh = BLOCK_4X4;
+  sf->adaptive_rd_thresh = 0;
+  sf->tx_size_search_method = USE_FULL_RD;
+  sf->use_lp32x32fdct = 0;
+  sf->adaptive_motion_search = 0;
+  sf->adaptive_pred_interp_filter = 0;
+  sf->adaptive_mode_search = 0;
+  sf->cb_pred_filter_search = 0;
+  sf->cb_partition_search = 0;
+  sf->alt_ref_search_fp = 0;
+  sf->use_quant_fp = 0;
+  sf->partition_search_type = SEARCH_PARTITION;
+  sf->less_rectangular_check = 0;
+  sf->use_square_partition_only = 0;
+  sf->auto_min_max_partition_size = NOT_IN_USE;
+  sf->rd_auto_partition_min_limit = BLOCK_4X4;
+  sf->default_max_partition_size = BLOCK_64X64;
+  sf->default_min_partition_size = BLOCK_4X4;
+  sf->adjust_partitioning_from_last_frame = 0;
+  sf->last_partitioning_redo_frequency = 4;
+  sf->disable_split_mask = 0;
+  sf->mode_search_skip_flags = 0;
+  sf->force_frame_boost = 0;
+  sf->max_delta_qindex = 0;
+  sf->disable_filter_search_var_thresh = 0;
+  sf->adaptive_interp_filter_search = 0;
+  sf->allow_partition_search_skip = 0;
+
+  for (i = 0; i < TX_SIZES; i++) {
+    sf->intra_y_mode_mask[i] = INTRA_ALL;
+    sf->intra_uv_mode_mask[i] = INTRA_ALL;
+  }
+  sf->use_rd_breakout = 0;
+  sf->use_uv_intra_rd_estimate = 0;
+  sf->allow_skip_recode = 0;
+  sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+  sf->use_fast_coef_updates = TWO_LOOP;
+  sf->use_fast_coef_costing = 0;
+  sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
+  sf->schedule_mode_search = 0;
+  for (i = 0; i < BLOCK_SIZES; ++i)
+    sf->inter_mode_mask[i] = INTER_ALL;
+  sf->max_intra_bsize = BLOCK_64X64;
+  sf->reuse_inter_pred_sby = 0;
+  // This setting only takes effect when partition_search_type is set
+  // to FIXED_PARTITION.
+  sf->always_this_block_size = BLOCK_16X16;
+  sf->search_type_check_frequency = 50;
+  sf->encode_breakout_thresh = 0;
+  // Recode loop tolerance %.
+  sf->recode_tolerance = 25;
+  sf->default_interp_filter = SWITCHABLE;
+  sf->tx_size_search_breakout = 0;
+  sf->partition_search_breakout_dist_thr = 0;
+  sf->partition_search_breakout_rate_thr = 0;
+  sf->simple_model_rd_from_var = 0;
+
+  if (oxcf->mode == REALTIME)
+    set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
+  else if (oxcf->mode == GOOD)
+    set_good_speed_feature(cpi, cm, sf, oxcf->speed);
+
+  cpi->full_search_sad = vp10_full_search_sad;
+  cpi->diamond_search_sad = vp10_diamond_search_sad;
+  // NOTE(review): set unconditionally, so any preset value is overwritten.
+  sf->allow_exhaustive_searches = 1;
+  if (oxcf->mode == BEST) {
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+      sf->exhaustive_searches_thresh = (1 << 20);
+    else
+      sf->exhaustive_searches_thresh = (1 << 21);
+    sf->max_exaustive_pct = 100;
+    for (i = 0; i < MAX_MESH_STEP; ++i) {
+      sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
+      sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
+    }
+  } else {
+    int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+      sf->exhaustive_searches_thresh = (1 << 22);
+    else
+      sf->exhaustive_searches_thresh = (1 << 23);
+    sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
+    if (speed > 0)
+      sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
+
+    for (i = 0; i < MAX_MESH_STEP; ++i) {
+      sf->mesh_patterns[i].range =
+          good_quality_mesh_patterns[speed][i].range;
+      sf->mesh_patterns[i].interval =
+          good_quality_mesh_patterns[speed][i].interval;
+    }
+  }
+
+  // Slow quant, dct and trellis not worthwhile for first pass
+  // so make sure they are always turned off.
+  if (oxcf->pass == 1)
+    sf->optimize_coefficients = 0;
+
+  // No recode or coefficient optimization for 1 pass (pass == 0).
+  if (oxcf->pass == 0) {
+    sf->recode_loop = DISALLOW_RECODE;
+    sf->optimize_coefficients = 0;
+  }
+  // Bind the sub-pixel search routine matching the selected method.
+  if (sf->mv.subpel_search_method == SUBPEL_TREE) {
+    cpi->find_fractional_mv_step = vp10_find_best_sub_pixel_tree;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
+    cpi->find_fractional_mv_step = vp10_find_best_sub_pixel_tree_pruned;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+    cpi->find_fractional_mv_step = vp10_find_best_sub_pixel_tree_pruned_more;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) {
+    cpi->find_fractional_mv_step = vp10_find_best_sub_pixel_tree_pruned_evenmore;
+  }
+  // x->optimize mirrors optimize_coefficients but stays 0 when pass == 1.
+  x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
+
+  x->min_partition_size = sf->default_min_partition_size;
+  x->max_partition_size = sf->default_max_partition_size;
+  // Without frame_periodic_boost no base qindex fluctuation is allowed.
+  if (!cpi->oxcf.frame_periodic_boost) {
+    sf->max_delta_qindex = 0;
+  }
+}
diff --git a/libs/libvpx/vp10/encoder/speed_features.h b/libs/libvpx/vp10/encoder/speed_features.h
new file mode 100644
index 0000000000..3b91999298
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/speed_features.h
@@ -0,0 +1,438 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_SPEED_FEATURES_H_
+#define VP10_ENCODER_SPEED_FEATURES_H_
+
+#include "vp10/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {  // Bit masks of intra prediction modes (bit n set = mode n enabled).
+  INTRA_ALL       = (1 << DC_PRED) |
+                    (1 << V_PRED) | (1 << H_PRED) |
+                    (1 << D45_PRED) | (1 << D135_PRED) |
+                    (1 << D117_PRED) | (1 << D153_PRED) |
+                    (1 << D207_PRED) | (1 << D63_PRED) |
+                    (1 << TM_PRED),
+  INTRA_DC        = (1 << DC_PRED),
+  INTRA_DC_TM     = (1 << DC_PRED) | (1 << TM_PRED),
+  INTRA_DC_H_V    = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+  INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) |
+                    (1 << H_PRED)
+};
+
+enum {  // Bit masks of inter prediction modes, for inter_mode_mask[].
+  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
+  INTER_NEAREST = (1 << NEARESTMV),
+  INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
+  INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV),
+  INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV),
+  INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+  INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV),
+};
+
+enum {  // Bit masks over THR_* indices, assigned to disable_split_mask.
+  DISABLE_ALL_INTER_SPLIT   = (1 << THR_COMP_GA) |
+                              (1 << THR_COMP_LA) |
+                              (1 << THR_ALTR) |
+                              (1 << THR_GOLD) |
+                              (1 << THR_LAST),
+
+  DISABLE_ALL_SPLIT         = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+  DISABLE_COMPOUND_SPLIT    = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+  LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) |
+                              (1 << THR_COMP_LA) |
+                              (1 << THR_ALTR) |
+                              (1 << THR_GOLD)
+};
+
+typedef enum {  // Motion search patterns selectable via mv.search_method.
+  DIAMOND = 0,
+  NSTEP = 1,
+  HEX = 2,
+  BIGDIA = 3,
+  SQUARE = 4,
+  FAST_HEX = 5,
+  FAST_DIAMOND = 6
+} SEARCH_METHODS;
+
+typedef enum {  // Values for SPEED_FEATURES.recode_loop.
+  // No recode.
+  DISALLOW_RECODE = 0,
+  // Allow recode for KF and exceeding maximum frame bandwidth.
+  ALLOW_RECODE_KFMAXBW = 1,
+  // Allow recode only for KF/ARF/GF frames.
+  ALLOW_RECODE_KFARFGF = 2,
+  // Allow recode for all frames based on bitrate constraints.
+  ALLOW_RECODE = 3,
+} RECODE_LOOP_TYPE;
+
+typedef enum {  // Values for mv.subpel_search_method.
+  SUBPEL_TREE = 0,
+  SUBPEL_TREE_PRUNED = 1,           // Prunes 1/2-pel searches
+  SUBPEL_TREE_PRUNED_MORE = 2,      // Prunes 1/2-pel searches more aggressively
+  SUBPEL_TREE_PRUNED_EVENMORE = 3,  // Prunes 1/2- and 1/4-pel searches
+  // Other methods to come
+} SUBPEL_SEARCH_METHODS;
+
+typedef enum {  // Values for SPEED_FEATURES.lf_motion_threshold.
+  NO_MOTION_THRESHOLD = 0,
+  LOW_MOTION_THRESHOLD = 7
+} MOTION_THRESHOLD;
+
+typedef enum {  // Values for SPEED_FEATURES.tx_size_search_method.
+  USE_FULL_RD = 0,
+  USE_LARGESTALL,
+  USE_TX_8X8
+} TX_SIZE_SEARCH_METHOD;
+
+typedef enum {  // Values for SPEED_FEATURES.auto_min_max_partition_size.
+  NOT_IN_USE = 0,
+  RELAXED_NEIGHBORING_MIN_MAX = 1,
+  STRICT_NEIGHBORING_MIN_MAX = 2
+} AUTO_MIN_MAX_MODE;
+
+typedef enum {  // Values for SPEED_FEATURES.lpf_pick.
+  // Try the full image with different values.
+  LPF_PICK_FROM_FULL_IMAGE,
+  // Try a small portion of the image with different values.
+  LPF_PICK_FROM_SUBIMAGE,
+  // Estimate the level based on quantizer and frame type.
+  LPF_PICK_FROM_Q,
+  // Pick 0 to disable LPF if LPF was enabled last frame.
+  LPF_PICK_MINIMAL_LPF
+} LPF_PICK_METHOD;
+
+typedef enum {  // Bit flags OR-ed into SPEED_FEATURES.mode_search_skip_flags.
+  // Terminate search early based on distortion so far compared to
+  // qp step, distortion in the neighborhood of the frame, etc.
+  FLAG_EARLY_TERMINATE = 1 << 0,
+
+  // Skips comp inter modes if the best so far is an intra mode.
+  FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
+
+  // Skips oblique intra modes if the best so far is an inter mode.
+  FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
+
+  // Skips oblique intra modes at angles 27, 63, 117, 153 if the best
+  // intra so far is not one of the neighboring directions.
+  FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
+
+  // Skips intra modes other than DC_PRED if the source variance is small.
+  FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
+} MODE_SEARCH_SKIP_LOGIC;
+
+typedef enum {  // Bit flags, one per filter, for interp_filter_search_mask.
+  FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP,
+  FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
+  FLAG_SKIP_EIGHTTAP_SHARP = 1 << EIGHTTAP_SHARP,
+} INTERP_FILTER_MASK;
+
+typedef enum {  // Values for SPEED_FEATURES.partition_search_type.
+  // Search partitions using RD criterion
+  SEARCH_PARTITION,
+
+  // Always use a fixed size partition
+  FIXED_PARTITION,
+
+  REFERENCE_PARTITION,
+
+  // Use an arbitrary partitioning scheme based on source variance within
+  // a 64X64 SB
+  VAR_BASED_PARTITION,
+
+  // Use non-fixed partitions based on source variance
+  SOURCE_VAR_BASED_PARTITION
+} PARTITION_SEARCH_TYPE;
+
+typedef enum {  // Values for SPEED_FEATURES.use_fast_coef_updates.
+  // Does a dry run to see if any of the contexts need to be updated or not,
+  // before the final run.
+  TWO_LOOP = 0,
+
+  // No dry run, also only half the coef contexts and bands are updated.
+  // The rest are not updated at all.
+  ONE_LOOP_REDUCED = 1
+} FAST_COEFF_UPDATE;
+
+typedef struct MV_SPEED_FEATURES {
+  // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
+  SEARCH_METHODS search_method;
+
+  // This parameter controls which step in the n-step process we start at.
+  // It's changed adaptively based on circumstances.
+  int reduce_first_step_size;
+
+  // If this is set to 1, we limit the motion search range to 2 times the
+  // largest motion vector found in the last frame.
+  int auto_mv_step_size;
+
+  // Subpel search method. SUBPEL_TREE does a subpixel logarithmic search
+  // that keeps stepping at 1/2 pixel units until you stop getting a gain,
+  // and then goes on to 1/4 and repeats the same process. Along the way it
+  // skips many diagonals. The pruned variants search fewer positions.
+  SUBPEL_SEARCH_METHODS subpel_search_method;
+
+  // Maximum number of steps in logarithmic subpel search before giving up.
+  int subpel_iters_per_step;
+
+  // Control when to stop subpel search
+  int subpel_force_stop;
+
+  // This variable sets the step_param used in full pel motion search.
+  int fullpel_search_step_param;
+} MV_SPEED_FEATURES;
+
+#define MAX_MESH_STEP 4  // Entries in SPEED_FEATURES.mesh_patterns[].
+
+typedef struct MESH_PATTERN {  // One entry of an exhaustive mesh search.
+  int range;
+  int interval;
+} MESH_PATTERN;
+
+typedef struct SPEED_FEATURES {
+  MV_SPEED_FEATURES mv;
+
+  // Frame level coding parameter update
+  int frame_parameter_update;
+
+  RECODE_LOOP_TYPE recode_loop;
+
+  // Trellis (dynamic programming) optimization of quantized values (+1, 0).
+  int optimize_coefficients;
+
+  // Always set to 0. If on it enables 0 cost background transmission
+  // (except for the initial transmission of the segmentation). The feature is
+  // disabled because the addition of very large block sizes make the
+  // backgrounds very cheap to encode, and the segmentation we have
+  // adds overhead.
+  int static_segmentation;
+
+  // If 1 we iterate finding a best reference for 2 ref frames together - via
+  // a log search that iterates 4 times (check around mv for last for best
+  // error of combined predictor then check around mv for alt). If 0 we
+  // just use the best motion vector found for each frame by itself.
+  BLOCK_SIZE comp_inter_joint_search_thresh;
+
+  // This variable is used to cap the maximum number of times we skip testing a
+  // mode to be evaluated. A high value means we will be faster.
+  int adaptive_rd_thresh;
+
+  // Speed feature to allow or disallow skipping of recode at block
+  // level within a frame.
+  int allow_skip_recode;
+
+  // Coefficient probability model approximation step size
+  int coeff_prob_appx_step;
+
+  // The threshold is to determine how slow the motion is, it is used when
+  // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
+  MOTION_THRESHOLD lf_motion_threshold;
+
+  // Determine which method we use to determine transform size. We can choose
+  // between options like full rd, largest for prediction size, largest
+  // for intra and model coefs for the rest.
+  TX_SIZE_SEARCH_METHOD tx_size_search_method;
+
+  // Low precision 32x32 fdct keeps everything in 16 bits and thus is less
+  // precise but significantly faster than the non lp version.
+  int use_lp32x32fdct;
+
+  // After looking at the first set of modes (set by index here), skip
+  // checking modes for reference frames that don't match the reference frame
+  // of the best so far.
+  int mode_skip_start;
+
+  PARTITION_SEARCH_TYPE partition_search_type;
+
+  // Used if partition_search_type = FIXED_PARTITION
+  BLOCK_SIZE always_this_block_size;
+
+  // Skip rectangular partition test when partition type none gives better
+  // rd than partition type split.
+  int less_rectangular_check;
+
+  // Disable testing non square partitions. (eg 16x32)
+  int use_square_partition_only;
+
+  // Sets min and max partition sizes for this 64x64 region based on the
+  // same 64x64 in last encoded frame, and the left and above neighbor.
+  AUTO_MIN_MAX_MODE auto_min_max_partition_size;
+  // Ensures the rd based auto partition search will always
+  // go down at least to the specified level.
+  BLOCK_SIZE rd_auto_partition_min_limit;
+
+  // Min and max partition size we enable (block_size) as per auto
+  // min max, but also used by adjust partitioning, and pick_partitioning.
+  BLOCK_SIZE default_min_partition_size;
+  BLOCK_SIZE default_max_partition_size;
+
+  // Whether or not we allow partitions one smaller or one greater than the last
+  // frame's partitioning. Only used if use_lastframe_partitioning is set.
+  int adjust_partitioning_from_last_frame;
+
+  // How frequently we redo the partitioning from scratch. Only used if
+  // use_lastframe_partitioning is set.
+  int last_partitioning_redo_frequency;
+
+  // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
+  // it always, to allow it for only Last frame and Intra, disable it for all
+  // inter modes or to enable it always.
+  int disable_split_mask;
+
+  // TODO(jingning): combine the related motion search speed features
+  // This allows us to use motion search at other sizes as a starting
+  // point for this motion search and limits the search range around it.
+  int adaptive_motion_search;
+
+  // Flag for allowing some use of exhaustive searches.
+  int allow_exhaustive_searches;
+
+  // Threshold for allowing exhaustive motion search.
+  int exhaustive_searches_thresh;
+
+  // Maximum number of exhaustive searches for a frame (TODO confirm: percent).
+  int max_exaustive_pct;
+
+  // Pattern to be used for any exhaustive mesh searches.
+  MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
+  int schedule_mode_search;
+
+  // Allows sub 8x8 modes to use the prediction filter that was determined
+  // best for 8x8 mode. If set to 0 we always re check all the filters for
+  // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
+  // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
+  int adaptive_pred_interp_filter;
+
+  // Adaptive prediction mode search
+  int adaptive_mode_search;
+
+  // Chessboard pattern prediction filter type search
+  int cb_pred_filter_search;
+
+  int cb_partition_search;
+
+  int alt_ref_search_fp;
+
+  // Fast quantization process path
+  int use_quant_fp;
+
+  // Use finer quantizer in every other few frames that run variable block
+  // partition type search.
+  int force_frame_boost;
+
+  // Maximally allowed base quantization index fluctuation.
+  int max_delta_qindex;
+
+  // Implements various heuristics to skip searching modes
+  // The heuristics selected are based on flags
+  // defined in the MODE_SEARCH_SKIP_LOGIC enum
+  unsigned int mode_search_skip_flags;
+
+  // A source variance threshold below which filter search is disabled
+  // Choose a very large value (UINT_MAX) to use 8-tap always
+  unsigned int disable_filter_search_var_thresh;
+
+  // These bit masks allow you to enable or disable intra modes for each
+  // transform size separately.
+  int intra_y_mode_mask[TX_SIZES];
+  int intra_uv_mode_mask[TX_SIZES];
+
+  // These bit masks allow you to enable or disable intra modes for each
+  // prediction block size separately.
+  int intra_y_mode_bsize_mask[BLOCK_SIZES];
+
+  // This variable enables an early break out of mode testing if the model for
+  // rd built from the prediction signal indicates a value that's much
+  // higher than the best rd we've seen so far.
+  int use_rd_breakout;
+
+  // This enables us to use an estimate for intra rd based on dc mode rather
+  // than choosing an actual uv mode in the stage of encoding before the actual
+  // final encode.
+  int use_uv_intra_rd_estimate;
+
+  // This feature controls how the loop filter level is determined.
+  LPF_PICK_METHOD lpf_pick;
+
+  // This feature limits the number of coefficients updates we actually do
+  // by only looking at counts from 1/2 the bands.
+  FAST_COEFF_UPDATE use_fast_coef_updates;
+
+  // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
+  // modes are used in order from LSB to MSB for each BLOCK_SIZE.
+  int inter_mode_mask[BLOCK_SIZES];
+
+  // This feature controls whether we do the expensive context update and
+  // calculation in the rd coefficient costing loop.
+  int use_fast_coef_costing;
+
+  // This feature controls the tolerance vs target used in deciding whether to
+  // recode a frame. It has no meaning if recode is disabled.
+  int recode_tolerance;
+
+  // This variable controls the maximum block size where intra blocks can be
+  // used in inter frames.
+  // TODO(aconverse): Fold this into one of the other many mode skips
+  BLOCK_SIZE max_intra_bsize;
+
+  // The frequency that we check if SOURCE_VAR_BASED_PARTITION or
+  // FIXED_PARTITION search type should be used.
+  int search_type_check_frequency;
+
+  // When partition is pre-set, the inter prediction result from pick_inter_mode
+  // can be reused in final block encoding process. It is enabled only for real-
+  // time mode speed 6.
+  int reuse_inter_pred_sby;
+
+  // This variable sets the encode_breakout threshold. Currently, it is only
+  // enabled in real time mode.
+  int encode_breakout_thresh;
+
+  // default interp filter choice
+  INTERP_FILTER default_interp_filter;
+
+  // Early termination in transform size search, which only applies while
+  // tx_size_search_method is USE_FULL_RD.
+  int tx_size_search_breakout;
+
+  // adaptive interp_filter search to allow skip of certain filter types.
+  int adaptive_interp_filter_search;
+
+  // mask for skip evaluation of certain interp_filter type.
+  INTERP_FILTER_MASK interp_filter_search_mask;
+
+  // Partition search early breakout thresholds.
+  int64_t partition_search_breakout_dist_thr;
+  int partition_search_breakout_rate_thr;
+
+  // Allow skipping partition search for still image frame
+  int allow_partition_search_skip;
+
+  // Fast approximation of vp10_model_rd_from_var_lapndz
+  int simple_model_rd_from_var;
+} SPEED_FEATURES;
+
+struct VP10_COMP;
+// Both setters fill in cpi->sf from the encoder configuration in cpi->oxcf.
+void vp10_set_speed_features_framesize_independent(struct VP10_COMP *cpi);
+void vp10_set_speed_features_framesize_dependent(struct VP10_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_SPEED_FEATURES_H_
diff --git a/libs/libvpx/vp10/encoder/subexp.c b/libs/libvpx/vp10/encoder/subexp.c
new file mode 100644
index 0000000000..eccee8e747
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/subexp.c
@@ -0,0 +1,207 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vpx_dsp/bitwriter.h"
+
+#include "vp10/common/common.h"
+#include "vp10/common/entropy.h"
+#include "vp10/encoder/cost.h"
+#include "vp10/encoder/subexp.h"
+// Cost of coding a 1 instead of a 0 at probability `upd` (caller's variable).
+#define vp10_cost_upd256  ((int)(vp10_cost_one(upd) - vp10_cost_zero(upd)))
+
+static const uint8_t update_bits[255] = {  // Bits used per coded delta index.
+   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 11 - CONFIG_MISC_FIXES,
+          11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  0,
+};
+
+// Maps v to a non-negative code centred on m: small distances get small codes.
+static int recenter_nonneg(int v, int m) {
+  const int twice_m = m << 1;
+  if (v > twice_m) return v;  // beyond the folded range: passed through
+  // Inside the folded range: even codes at or above m, odd codes below it.
+  return (v >= m) ? ((v - m) << 1)
+                  : (((m - v) << 1) - 1);
+}
+
+static int remap_prob(int v, int m) {
+  // Lookup from the recentred delta (minus one) to the coded delta index.
+  static const uint8_t map_table[MAX_PROB - 1] = {
+    // generated by:
+    //   map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM);
+     20,  21,  22,  23,  24,  25,   0,  26,  27,  28,  29,  30,  31,  32,  33,
+     34,  35,  36,  37,   1,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+     48,  49,   2,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,
+      3,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,   4,  74,
+     75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,   5,  86,  87,  88,
+     89,  90,  91,  92,  93,  94,  95,  96,  97,   6,  98,  99, 100, 101, 102,
+    103, 104, 105, 106, 107, 108, 109,   7, 110, 111, 112, 113, 114, 115, 116,
+    117, 118, 119, 120, 121,   8, 122, 123, 124, 125, 126, 127, 128, 129, 130,
+    131, 132, 133,   9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
+    145,  10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,  11,
+    158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,  12, 170, 171,
+    172, 173, 174, 175, 176, 177, 178, 179, 180, 181,  13, 182, 183, 184, 185,
+    186, 187, 188, 189, 190, 191, 192, 193,  14, 194, 195, 196, 197, 198, 199,
+    200, 201, 202, 203, 204, 205,  15, 206, 207, 208, 209, 210, 211, 212, 213,
+    214, 215, 216, 217,  16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
+    228, 229,  17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
+     18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,  19,
+  };
+  const int v0 = v - 1;
+  const int m0 = m - 1;
+  // Recenter directly, or mirrored about MAX_PROB - 1 when 2 * m0 > MAX_PROB.
+  const int delta = ((m0 << 1) <= MAX_PROB)
+      ? recenter_nonneg(v0, m0)
+      : recenter_nonneg(MAX_PROB - 1 - v0, MAX_PROB - 1 - m0);
+
+  // NOTE(review): v == m would index map_table[-1]; callers here pass v != m.
+  return map_table[delta - 1];
+}
+
+static int prob_diff_update_cost(vpx_prob newp, vpx_prob oldp) {
+  // Bits for the remapped delta, times 256 like the cost_branch256() values.
+  return update_bits[remap_prob(newp, oldp)] * 256;
+}
+
+static void encode_uniform(vpx_writer *w, int v) {
+  const int bits = 8;
+  const int cutoff = (1 << bits) - 191 + CONFIG_MISC_FIXES;
+  const int excess = v - cutoff;
+  // Below the cutoff: bits - 1 bits. Otherwise one extra low-order bit follows.
+  const int head = (excess < 0) ? v : cutoff + (excess >> 1);
+
+  vpx_write_literal(w, head, bits - 1);
+  if (excess >= 0) vpx_write_literal(w, excess & 1, 1);
+}
+
+static INLINE int write_bit_gte(vpx_writer *w, int word, int test) {
+  const int reached = word >= test;
+  vpx_write_literal(w, reached, 1);
+  return reached;
+}
+
+static void encode_term_subexp(vpx_writer *w, int word) {
+  if (!write_bit_gte(w, word, 16))
+    vpx_write_literal(w, word, 4);       // below 16: 4-bit literal
+  else if (!write_bit_gte(w, word, 32))
+    vpx_write_literal(w, word - 16, 4);  // 16..31: 4-bit offset
+  else if (!write_bit_gte(w, word, 64))
+    vpx_write_literal(w, word - 32, 5);  // 32..63: 5-bit offset
+  else
+    encode_uniform(w, word - 64);        // 64 and up
+}
+
+void vp10_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) {
+  // Writes the remapped (newp, oldp) delta index via encode_term_subexp().
+  encode_term_subexp(w, remap_prob(newp, oldp));
+}
+
+int vp10_prob_diff_update_savings_search(const unsigned int *ct,
+                                        vpx_prob oldp, vpx_prob *bestp,
+                                        vpx_prob upd) {
+  // Scan from *bestp towards oldp for the largest net saving.
+  const int direction = (*bestp > oldp) ? -1 : 1;
+  const int baseline = cost_branch256(ct, oldp);
+  vpx_prob cand = *bestp, winner = oldp;
+  int max_savings = 0;
+  while (cand != oldp) {
+    const int upd_cost = prob_diff_update_cost(cand, oldp) + vp10_cost_upd256;
+    const int net = baseline - cost_branch256(ct, cand) - upd_cost;
+    if (net > max_savings) {
+      max_savings = net;
+      winner = cand;
+    }
+    cand += direction;
+  }
+  *bestp = winner;
+  return max_savings;
+}
+
+int vp10_prob_diff_update_savings_search_model(const unsigned int *ct,
+                                              const vpx_prob *oldp,
+                                              vpx_prob *bestp,
+                                              vpx_prob upd,
+                                              int stepsize) {
+  int i, old_b, new_b, update_b, savings, bestsavings;
+  int newp;
+  const int step_sign = *bestp > oldp[PIVOT_NODE] ? -1 : 1;
+  const int step = stepsize * step_sign;
+  vpx_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
+  vp10_model_to_full_probs(oldp, oldplist);
+  memcpy(newplist, oldp, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
+  for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
+    old_b += cost_branch256(ct + 2 * i, oldplist[i]);
+  old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
+  // old_b now holds the cost of the pivot and derived nodes under oldp.
+  bestsavings = 0;
+  bestnewp = oldp[PIVOT_NODE];
+
+  assert(stepsize > 0);
+  // Step the pivot from *bestp towards oldp[PIVOT_NODE], stopping short of it.
+  for (newp = *bestp; (newp - oldp[PIVOT_NODE]) * step_sign < 0;
+      newp += step) {
+    if (newp < 1 || newp > 255)
+      continue;  // outside 1..255: skip this step
+    newplist[PIVOT_NODE] = newp;
+    vp10_model_to_full_probs(newplist, newplist);  // expanded in place
+    for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+      new_b += cost_branch256(ct + 2 * i, newplist[i]);
+    new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
+    update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
+        vp10_cost_upd256;
+    savings = old_b - new_b - update_b;
+    if (savings > bestsavings) {
+      bestsavings = savings;
+      bestnewp = newp;
+    }
+  }
+
+  *bestp = bestnewp;
+  return bestsavings;
+}
+
+void vp10_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp,
+                               const unsigned int ct[2]) {
+  // Always writes the update flag; the new probability is sent and stored
+  // into *oldp only when the search reports a positive saving.
+  const vpx_prob upd = DIFF_UPDATE_PROB;
+  vpx_prob newp = get_binary_prob(ct[0], ct[1]);
+  const int do_update =
+      vp10_prob_diff_update_savings_search(ct, *oldp, &newp, upd) > 0;
+  assert(newp >= 1);
+  vpx_write(w, do_update, upd);
+  if (do_update) {
+    vp10_write_prob_diff_update(w, newp, *oldp);
+    *oldp = newp;
+  }
+}
+
+int vp10_cond_prob_diff_update_savings(vpx_prob *oldp,
+                                       const unsigned int ct[2]) {
+  // Reports the saving the best update would bring; nothing is written and
+  // *oldp is only read.
+  vpx_prob newp = get_binary_prob(ct[0], ct[1]);
+  return vp10_prob_diff_update_savings_search(ct, *oldp, &newp,
+                                              DIFF_UPDATE_PROB);
+}
diff --git a/libs/libvpx/vp10/encoder/subexp.h b/libs/libvpx/vp10/encoder/subexp.h
new file mode 100644
index 0000000000..091334f1f2
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/subexp.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_ENCODER_SUBEXP_H_
+#define VP10_ENCODER_SUBEXP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vpx_dsp/prob.h"  // NOTE(review): included inside extern "C"; confirm
+
+struct vpx_writer;  // forward declaration; only used via pointer below
+
+void vp10_write_prob_diff_update(struct vpx_writer *w,
+                                vpx_prob newp, vpx_prob oldp);
+// cond_prob_diff_update: codes a flag, plus the new *oldp when savings > 0.
+void vp10_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp,
+                               const unsigned int ct[2]);
+
+int vp10_prob_diff_update_savings_search(const unsigned int *ct,
+                                        vpx_prob oldp, vpx_prob *bestp,
+                                        vpx_prob upd);
+
+// Variant taking a probability array in oldp; stepsize must be > 0 (asserted).
+int vp10_prob_diff_update_savings_search_model(const unsigned int *ct,
+                                              const vpx_prob *oldp,
+                                              vpx_prob *bestp,
+                                              vpx_prob upd,
+                                              int stepsize);
+// cond_prob_diff_update_savings: returns those savings; writes nothing.
+int vp10_cond_prob_diff_update_savings(vpx_prob *oldp,
+                                       const unsigned int ct[2]);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_SUBEXP_H_
diff --git a/libs/libvpx/vp10/encoder/temporal_filter.c b/libs/libvpx/vp10/encoder/temporal_filter.c
new file mode 100644
index 0000000000..5278d3b736
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/temporal_filter.c
@@ -0,0 +1,702 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <limits.h>
+
+#include "vp10/common/alloccommon.h"
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/quant_common.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/encoder/extend.h"
+#include "vp10/encoder/firstpass.h"
+#include "vp10/encoder/mcomp.h"
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/quantize.h"
+#include "vp10/encoder/ratectrl.h"
+#include "vp10/encoder/segmentation.h"
+#include "vp10/encoder/temporal_filter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/vpx_scale.h"
+
+static int fixed_divide[512];  // [i] = 0x80000 / i, [0] = 0; see vp10_temporal_filter_init()
+// Builds Y/U/V predictors for one MB into pred[0], pred[256] and pred[512].
+static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
+                                            uint8_t *y_mb_ptr,
+                                            uint8_t *u_mb_ptr,
+                                            uint8_t *v_mb_ptr,
+                                            int stride,
+                                            int uv_block_width,
+                                            int uv_block_height,
+                                            int mv_row,
+                                            int mv_col,
+                                            uint8_t *pred,
+                                            struct scale_factors *scale,
+                                            int x, int y) {
+  const int which_mv = 0;
+  const MV mv = { mv_row, mv_col };
+  const InterpKernel *const kernel =
+    vp10_filter_kernels[xd->mi[0]->mbmi.interp_filter];
+
+  enum mv_precision mv_precision_uv;
+  int uv_stride;
+  if (uv_block_width == 8) {  // chroma is half the width of the 16-wide luma
+    uv_stride = (stride + 1) >> 1;  // half the luma stride, rounded up
+    mv_precision_uv = MV_PRECISION_Q4;
+  } else {
+    uv_stride = stride;
+    mv_precision_uv = MV_PRECISION_Q3;
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp10_highbd_build_inter_predictor(y_mb_ptr, stride,
+                                     &pred[0], 16,
+                                     &mv,
+                                     scale,
+                                     16, 16,
+                                     which_mv,
+                                     kernel, MV_PRECISION_Q3, x, y, xd->bd);
+
+    vp10_highbd_build_inter_predictor(u_mb_ptr, uv_stride,
+                                     &pred[256], uv_block_width,
+                                     &mv,
+                                     scale,
+                                     uv_block_width, uv_block_height,
+                                     which_mv,
+                                     kernel, mv_precision_uv, x, y, xd->bd);
+
+    vp10_highbd_build_inter_predictor(v_mb_ptr, uv_stride,
+                                     &pred[512], uv_block_width,
+                                     &mv,
+                                     scale,
+                                     uv_block_width, uv_block_height,
+                                     which_mv,
+                                     kernel, mv_precision_uv, x, y, xd->bd);
+    return;  // high-bitdepth path done; skip the 8-bit predictors below
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  vp10_build_inter_predictor(y_mb_ptr, stride,
+                            &pred[0], 16,
+                            &mv,
+                            scale,
+                            16, 16,
+                            which_mv,
+                            kernel, MV_PRECISION_Q3, x, y);
+
+  vp10_build_inter_predictor(u_mb_ptr, uv_stride,
+                            &pred[256], uv_block_width,
+                            &mv,
+                            scale,
+                            uv_block_width, uv_block_height,
+                            which_mv,
+                            kernel, mv_precision_uv, x, y);
+
+  vp10_build_inter_predictor(v_mb_ptr, uv_stride,
+                            &pred[512], uv_block_width,
+                            &mv,
+                            scale,
+                            uv_block_width, uv_block_height,
+                            which_mv,
+                            kernel, mv_precision_uv, x, y);
+}
+
+// Fills fixed_divide[] with 0x80000 / i for i in [1, 511]; entry 0 is set to 0.
+void vp10_temporal_filter_init(void) {
+  int divisor = 0;
+  fixed_divide[divisor] = 0;
+  while (++divisor < 512)
+    fixed_divide[divisor] = 0x80000 / divisor;
+}
+// Adds frame2 pixels, weighted by similarity to frame1, to accumulator/count.
+void vp10_temporal_filter_apply_c(uint8_t *frame1,
+                                 unsigned int stride,
+                                 uint8_t *frame2,
+                                 unsigned int block_width,
+                                 unsigned int block_height,
+                                 int strength,
+                                 int filter_weight,
+                                 unsigned int *accumulator,
+                                 uint16_t *count) {
+  unsigned int i, j, k;
+  int modifier;
+  int byte = 0;  // offset into frame1 (strided); frame2 is read contiguously
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+  for (i = 0, k = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++, k++) {
+      int src_byte = frame1[byte];
+      int pixel_value = *frame2++;
+
+      modifier   = src_byte - pixel_value;
+      // This is an integer approximation of:
+      // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
+      modifier  *= modifier;
+      modifier  *= 3;
+      modifier  += rounding;
+      modifier >>= strength;
+
+      if (modifier > 16)
+        modifier = 16;
+
+      modifier = 16 - modifier;  // 16 for a close match, 0 for a poor one
+      modifier *= filter_weight;
+
+      count[k] += modifier;
+      accumulator[k] += modifier * pixel_value;
+
+      byte++;
+    }
+
+    byte += stride - block_width;  // skip to the start of the next frame1 row
+  }
+}
+// 16-bit-sample twin of vp10_temporal_filter_apply_c (via CONVERT_TO_SHORTPTR).
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_temporal_filter_apply_c(uint8_t *frame1_8,
+                                        unsigned int stride,
+                                        uint8_t *frame2_8,
+                                        unsigned int block_width,
+                                        unsigned int block_height,
+                                        int strength,
+                                        int filter_weight,
+                                        unsigned int *accumulator,
+                                        uint16_t *count) {
+  uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
+  uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
+  unsigned int i, j, k;
+  int modifier;
+  int byte = 0;  // offset into frame1 (strided); frame2 is read contiguously
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+  for (i = 0, k = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++, k++) {
+      int src_byte = frame1[byte];
+      int pixel_value = *frame2++;
+
+      modifier   = src_byte - pixel_value;
+      // This is an integer approximation of:
+      // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
+      modifier *= modifier;
+      modifier *= 3;
+      modifier += rounding;
+      modifier >>= strength;
+
+      if (modifier > 16)
+        modifier = 16;
+
+      modifier = 16 - modifier;  // 16 for a close match, 0 for a poor one
+      modifier *= filter_weight;
+
+      count[k] += modifier;
+      accumulator[k] += modifier * pixel_value;
+
+      byte++;
+    }
+
+    byte += stride - block_width;  // skip to the start of the next frame1 row
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+// Motion search of frame_ptr_buf against arf_frame_buf; returns the final error.
+static int temporal_filter_find_matching_mb_c(VP10_COMP *cpi,
+                                              uint8_t *arf_frame_buf,
+                                              uint8_t *frame_ptr_buf,
+                                              int stride) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  int step_param;
+  int sadpb = x->sadperbit16;
+  int bestsme = INT_MAX;
+  int distortion;
+  unsigned int sse;
+  int cost_list[5];
+
+  MV best_ref_mv1 = {0, 0};
+  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+  MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv;  // presumably the search output (caller reads it)
+
+  // Save input state
+  struct buf_2d src = x->plane[0].src;
+  struct buf_2d pre = xd->plane[0].pre[0];
+
+  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+  // Setup frame pointers
+  x->plane[0].src.buf = arf_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = frame_ptr_buf;
+  xd->plane[0].pre[0].stride = stride;
+
+  step_param = mv_sf->reduce_first_step_size;
+  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+  // Ignore mv costing. NOTE(review): presumably via the 0 flag; no NULL is passed.
+  vp10_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
+                 cond_cost_list(cpi, cost_list),
+                 &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv);
+
+  // Ignore mv costing by sending NULL pointer instead of cost array
+  bestsme = cpi->find_fractional_mv_step(x, ref_mv,
+                                         &best_ref_mv1,
+                                         cpi->common.allow_high_precision_mv,
+                                         x->errorperbit,
+                                         &cpi->fn_ptr[BLOCK_16X16],
+                                         0, mv_sf->subpel_iters_per_step,
+                                         cond_cost_list(cpi, cost_list),
+                                         NULL, NULL,
+                                         &distortion, &sse, NULL, 0, 0);
+
+  // Restore input state
+  x->plane[0].src = src;
+  xd->plane[0].pre[0] = pre;
+
+  return bestsme;
+}
+// Filters each 16x16 MB of frames[alt_ref_index] into cpi->alt_ref_buffer.
+static void temporal_filter_iterate_c(VP10_COMP *cpi,
+                                      YV12_BUFFER_CONFIG **frames,
+                                      int frame_count,
+                                      int alt_ref_index,
+                                      int strength,
+                                      struct scale_factors *scale) {
+  int byte;
+  int frame;
+  int mb_col, mb_row;
+  unsigned int filter_weight;
+  int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4;  // rounded up
+  int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
+  int mb_y_offset = 0;
+  int mb_uv_offset = 0;
+  DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]);  // Y +0, U +256, V +512
+  DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]);
+  MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
+  YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
+  uint8_t *dst1, *dst2;
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t,  predictor16[16 * 16 * 3]);
+  DECLARE_ALIGNED(16, uint8_t,  predictor8[16 * 16 * 3]);
+  uint8_t *predictor;
+#else
+  DECLARE_ALIGNED(16, uint8_t,  predictor[16 * 16 * 3]);
+#endif
+  const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
+  const int mb_uv_width  = 16 >> mbd->plane[1].subsampling_x;
+
+  // Save input state
+  uint8_t* input_buffer[MAX_MB_PLANE];
+  int i;  // NOTE(review): shadowed by the per-MB 'int i, j, k' declared below
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    predictor = CONVERT_TO_BYTEPTR(predictor16);
+  } else {
+    predictor = predictor8;
+  }
+#endif
+
+  for (i = 0; i < MAX_MB_PLANE; i++)
+    input_buffer[i] = mbd->plane[i].pre[0].buf;
+
+  for (mb_row = 0; mb_row < mb_rows; mb_row++) {
+    // Source frames are extended to 16 pixels. This is different than
+    //  L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS)
+    // A 6/8 tap filter is used for motion search.  This requires 2 pixels
+    //  before and 3 pixels after.  So the largest Y mv on a border would
+    //  then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the
+    //  Y and therefore only extended by 8.  The largest mv that a UV block
+    //  can support is 8 - VP9_INTERP_EXTEND.  A UV mv is half of a Y mv.
+    //  (16 - VP9_INTERP_EXTEND) >> 1 which is greater than
+    //  8 - VP9_INTERP_EXTEND.
+    // To keep the mv in play for both Y and UV planes the max that it
+    //  can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1).
+    cpi->td.mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND));
+    cpi->td.mb.mv_row_max = ((mb_rows - 1 - mb_row) * 16)
+                         + (17 - 2 * VP9_INTERP_EXTEND);
+
+    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
+      int i, j, k;
+      int stride;
+
+      memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
+      memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
+
+      cpi->td.mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
+      cpi->td.mb.mv_col_max = ((mb_cols - 1 - mb_col) * 16)
+                           + (17 - 2 * VP9_INTERP_EXTEND);
+
+      for (frame = 0; frame < frame_count; frame++) {
+        const int thresh_low  = 10000;
+        const int thresh_high = 20000;
+
+        if (frames[frame] == NULL)
+          continue;
+
+        mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
+        mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0;
+
+        if (frame == alt_ref_index) {
+          filter_weight = 2;  // the ARF itself: no search, zero MV, full weight
+        } else {
+          // Find best match in this frame by MC
+          int err = temporal_filter_find_matching_mb_c(cpi,
+              frames[alt_ref_index]->y_buffer + mb_y_offset,
+              frames[frame]->y_buffer + mb_y_offset,
+              frames[frame]->y_stride);
+
+          // Assign higher weight to matching MB if its error
+          // score is lower. If not applying MC default behavior
+          // is to weight all MBs equal.
+          filter_weight = err < thresh_low
+                          ? 2 : err < thresh_high ? 1 : 0;
+        }
+
+        if (filter_weight != 0) {
+          // Construct the predictors
+          temporal_filter_predictors_mb_c(mbd,
+              frames[frame]->y_buffer + mb_y_offset,
+              frames[frame]->u_buffer + mb_uv_offset,
+              frames[frame]->v_buffer + mb_uv_offset,
+              frames[frame]->y_stride,
+              mb_uv_width, mb_uv_height,
+              mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
+              mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
+              predictor, scale,
+              mb_col * 16, mb_row * 16);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            int adj_strength = strength + 2 * (mbd->bd - 8);
+            // Apply the filter (YUV)
+            vp10_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
+                                             f->y_stride,
+                                             predictor, 16, 16, adj_strength,
+                                             filter_weight,
+                                             accumulator, count);
+            vp10_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
+                                             f->uv_stride, predictor + 256,
+                                             mb_uv_width, mb_uv_height,
+                                             adj_strength,
+                                             filter_weight, accumulator + 256,
+                                             count + 256);
+            vp10_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
+                                             f->uv_stride, predictor + 512,
+                                             mb_uv_width, mb_uv_height,
+                                             adj_strength, filter_weight,
+                                             accumulator + 512, count + 512);
+          } else {
+            // Apply the filter (YUV)
+            vp10_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+                                      predictor, 16, 16,
+                                      strength, filter_weight,
+                                      accumulator, count);
+            vp10_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+                                      predictor + 256,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 256,
+                                      count + 256);
+            vp10_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+                                      predictor + 512,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 512,
+                                      count + 512);
+          }
+#else
+          // Apply the filter (YUV)
+          vp10_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+                                    predictor, 16, 16,
+                                    strength, filter_weight,
+                                    accumulator, count);
+          vp10_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+                                    predictor + 256,
+                                    mb_uv_width, mb_uv_height, strength,
+                                    filter_weight, accumulator + 256,
+                                    count + 256);
+          vp10_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+                                    predictor + 512,
+                                    mb_uv_width, mb_uv_height, strength,
+                                    filter_weight, accumulator + 512,
+                                    count + 512);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        uint16_t *dst1_16;
+        uint16_t *dst2_16;
+        // Normalize filter output to produce AltRef frame
+        dst1 = cpi->alt_ref_buffer.y_buffer;
+        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+        stride = cpi->alt_ref_buffer.y_stride;
+        byte = mb_y_offset;
+        for (i = 0, k = 0; i < 16; i++) {
+          for (j = 0; j < 16; j++, k++) {
+            unsigned int pval = accumulator[k] + (count[k] >> 1);  // + count/2 rounds
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;  // fixed_divide[] is scaled by 0x80000 == 1 << 19
+
+            dst1_16[byte] = (uint16_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+
+          byte += stride - 16;
+        }
+
+        dst1 = cpi->alt_ref_buffer.u_buffer;
+        dst2 = cpi->alt_ref_buffer.v_buffer;
+        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+        dst2_16 = CONVERT_TO_SHORTPTR(dst2);
+        stride = cpi->alt_ref_buffer.uv_stride;
+        byte = mb_uv_offset;
+        for (i = 0, k = 256; i < mb_uv_height; i++) {
+          for (j = 0; j < mb_uv_width; j++, k++) {
+            int m = k + 256;
+
+            // U
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+            dst1_16[byte] = (uint16_t)pval;
+
+            // V
+            pval = accumulator[m] + (count[m] >> 1);
+            pval *= fixed_divide[count[m]];
+            pval >>= 19;
+            dst2_16[byte] = (uint16_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+
+          byte += stride - mb_uv_width;
+        }
+      } else {
+        // Normalize filter output to produce AltRef frame
+        dst1 = cpi->alt_ref_buffer.y_buffer;
+        stride = cpi->alt_ref_buffer.y_stride;
+        byte = mb_y_offset;
+        for (i = 0, k = 0; i < 16; i++) {
+          for (j = 0; j < 16; j++, k++) {
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+
+            dst1[byte] = (uint8_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+          byte += stride - 16;
+        }
+
+        dst1 = cpi->alt_ref_buffer.u_buffer;
+        dst2 = cpi->alt_ref_buffer.v_buffer;
+        stride = cpi->alt_ref_buffer.uv_stride;
+        byte = mb_uv_offset;
+        for (i = 0, k = 256; i < mb_uv_height; i++) {
+          for (j = 0; j < mb_uv_width; j++, k++) {
+            int m = k + 256;
+
+            // U
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+            dst1[byte] = (uint8_t)pval;
+
+            // V
+            pval = accumulator[m] + (count[m] >> 1);
+            pval *= fixed_divide[count[m]];
+            pval >>= 19;
+            dst2[byte] = (uint8_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+          byte += stride - mb_uv_width;
+        }
+      }
+#else
+      // Normalize filter output to produce AltRef frame
+      dst1 = cpi->alt_ref_buffer.y_buffer;
+      stride = cpi->alt_ref_buffer.y_stride;
+      byte = mb_y_offset;
+      for (i = 0, k = 0; i < 16; i++) {
+        for (j = 0; j < 16; j++, k++) {
+          unsigned int pval = accumulator[k] + (count[k] >> 1);  // + count/2 rounds
+          pval *= fixed_divide[count[k]];
+          pval >>= 19;  // fixed_divide[] is scaled by 0x80000 == 1 << 19
+
+          dst1[byte] = (uint8_t)pval;
+
+          // move to next pixel
+          byte++;
+        }
+        byte += stride - 16;
+      }
+
+      dst1 = cpi->alt_ref_buffer.u_buffer;
+      dst2 = cpi->alt_ref_buffer.v_buffer;
+      stride = cpi->alt_ref_buffer.uv_stride;
+      byte = mb_uv_offset;
+      for (i = 0, k = 256; i < mb_uv_height; i++) {
+        for (j = 0; j < mb_uv_width; j++, k++) {
+          int m = k + 256;
+
+          // U
+          unsigned int pval = accumulator[k] + (count[k] >> 1);
+          pval *= fixed_divide[count[k]];
+          pval >>= 19;
+          dst1[byte] = (uint8_t)pval;
+
+          // V
+          pval = accumulator[m] + (count[m] >> 1);
+          pval *= fixed_divide[count[m]];
+          pval >>= 19;
+          dst2[byte] = (uint8_t)pval;
+
+          // move to next pixel
+          byte++;
+        }
+        byte += stride - mb_uv_width;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      mb_y_offset += 16;
+      mb_uv_offset += mb_uv_width;
+    }
+    mb_y_offset += 16 * (f->y_stride - mb_cols);  // down 16 rows, back 16 * mb_cols
+    mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
+  }
+
+  // Restore input state
+  for (i = 0; i < MAX_MB_PLANE; i++)
+    mbd->plane[i].pre[0].buf = input_buffer[i];
+}
+
+// Apply buffer limits and context specific adjustments to arnr filter.
+static void adjust_arnr_filter(VP10_COMP *cpi,
+                               int distance, int group_boost,
+                               int *arnr_frames, int *arnr_strength) {
+  const VP10EncoderConfig *const oxcf = &cpi->oxcf;
+  const int frames_after_arf =
+      vp10_lookahead_depth(cpi->lookahead) - distance - 1;
+  int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1;
+  int frames_bwd;
+  int q, frames, strength;
+
+  // Define the forward and backwards filter limits for this arnr group.
+  if (frames_fwd > frames_after_arf)
+    frames_fwd = frames_after_arf;
+  if (frames_fwd > distance)
+    frames_fwd = distance;
+
+  frames_bwd = frames_fwd;
+
+  // For even length filter there is one more frame backward
+  // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+  if (frames_bwd < distance)
+    frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1;  // +1 iff max_frames is even
+
+  // Set the baseline active filter size.
+  frames = frames_bwd + 1 + frames_fwd;
+
+  // Adjust the strength based on active max q.
+  if (cpi->common.current_video_frame > 1)
+    q = ((int)vp10_convert_qindex_to_q(
+        cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth));
+  else
+    q = ((int)vp10_convert_qindex_to_q(
+        cpi->rc.avg_frame_qindex[KEY_FRAME], cpi->common.bit_depth));
+  if (q > 16) {
+    strength = oxcf->arnr_strength;
+  } else {
+    strength = oxcf->arnr_strength - ((16 - q) / 2);  // weaker filter at low q
+    if (strength < 0)
+      strength = 0;
+  }
+
+  // Adjust number of frames in filter and strength based on gf boost level.
+  if (frames > group_boost / 150) {
+    frames = group_boost / 150;
+    frames += !(frames & 1);  // bump an even count up to the next odd value
+  }
+
+  if (strength > group_boost / 300) {
+    strength = group_boost / 300;
+  }
+
+  // Adjustments for second level arf in multi arf case.
+  if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) {
+      strength >>= 1;  // halve the strength when the level is not GF_ARF_STD
+    }
+  }
+
+  *arnr_frames = frames;  // outputs: number of frames to blur ...
+  *arnr_strength = strength;  // ... and the filter strength to use
+}
+// Filters the lookahead frame at `distance` into cpi->alt_ref_buffer.
+void vp10_temporal_filter(VP10_COMP *cpi, int distance) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int frame;
+  int frames_to_blur;
+  int start_frame;
+  int strength;
+  int frames_to_blur_backward;
+  int frames_to_blur_forward;
+  struct scale_factors sf;
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = {NULL};
+
+  // Apply context specific adjustments to the arnr filter parameters.
+  adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
+  frames_to_blur_backward = (frames_to_blur / 2);
+  frames_to_blur_forward = ((frames_to_blur - 1) / 2);
+  start_frame = distance + frames_to_blur_forward;  // largest lookahead index used
+
+  // Setup frame pointers, NULL indicates frame not included in filter.
+  for (frame = 0; frame < frames_to_blur; ++frame) {
+    const int which_buffer = start_frame - frame;
+    struct lookahead_entry *buf = vp10_lookahead_peek(cpi->lookahead,
+                                                     which_buffer);
+    frames[frames_to_blur - 1 - frame] = &buf->img;  // NOTE(review): buf is not NULL-checked
+  }
+
+  if (frames_to_blur > 0) {
+    // Setup scaling factors. Scaling on each of the arnr frames is not
+    // supported.
+    // ARF is produced at the native frame size and resized when coded.
+#if CONFIG_VP9_HIGHBITDEPTH
+    vp10_setup_scale_factors_for_frame(&sf,
+                                      frames[0]->y_crop_width,
+                                      frames[0]->y_crop_height,
+                                      frames[0]->y_crop_width,
+                                      frames[0]->y_crop_height,
+                                      cpi->common.use_highbitdepth);
+#else
+    vp10_setup_scale_factors_for_frame(&sf,
+                                      frames[0]->y_crop_width,
+                                      frames[0]->y_crop_height,
+                                      frames[0]->y_crop_width,
+                                      frames[0]->y_crop_height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+
+  temporal_filter_iterate_c(cpi, frames, frames_to_blur,
+                            frames_to_blur_backward, strength, &sf);  // ARF is frames[frames_to_blur_backward]
+}
diff --git a/libs/libvpx/vp10/encoder/temporal_filter.h b/libs/libvpx/vp10/encoder/temporal_filter.h
new file mode 100644
index 0000000000..6e331e6ad0
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/temporal_filter.h
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_TEMPORAL_FILTER_H_
+#define VP10_ENCODER_TEMPORAL_FILTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// NOTE(review): VP10_COMP is neither declared nor included by this header.
+void vp10_temporal_filter_init(void);  // fills the fixed_divide[] lookup table
+void vp10_temporal_filter(VP10_COMP *cpi, int distance);  // writes cpi->alt_ref_buffer
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_TEMPORAL_FILTER_H_
diff --git a/libs/libvpx/vp10/encoder/tokenize.c b/libs/libvpx/vp10/encoder/tokenize.c
new file mode 100644
index 0000000000..a665a3cfea
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/tokenize.c
@@ -0,0 +1,643 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp10/common/entropy.h"
+#include "vp10/common/pred_common.h"
+#include "vp10/common/scan.h"
+#include "vp10/common/seg_common.h"
+
+#include "vp10/encoder/cost.h"
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/tokenize.h"
+
+static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {  // 133 entries
+  {9, 63}, {9, 61}, {9, 59}, {9, 57}, {9, 55}, {9, 53}, {9, 51}, {9, 49},
+  {9, 47}, {9, 45}, {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33},
+  {9, 31}, {9, 29}, {9, 27}, {9, 25}, {9, 23}, {9, 21}, {9, 19}, {9, 17},
+  {9, 15}, {9, 13}, {9, 11}, {9, 9}, {9, 7}, {9, 5}, {9, 3}, {9, 1},
+  {8, 31}, {8, 29}, {8, 27}, {8, 25}, {8, 23}, {8, 21},
+  {8, 19}, {8, 17}, {8, 15}, {8, 13}, {8, 11}, {8, 9},
+  {8, 7}, {8, 5}, {8, 3}, {8, 1},
+  {7, 15}, {7, 13}, {7, 11}, {7, 9}, {7, 7}, {7, 5}, {7, 3}, {7, 1},
+  {6, 7}, {6, 5}, {6, 3}, {6, 1}, {5, 3}, {5, 1},
+  {4, 1}, {3, 1}, {2, 1}, {1, 1}, {0, 0},  // {0, 0} is the midpoint, index 66
+  {1, 0},  {2, 0}, {3, 0}, {4, 0},
+  {5, 0}, {5, 2}, {6, 0}, {6, 2}, {6, 4}, {6, 6},
+  {7, 0}, {7, 2}, {7, 4}, {7, 6}, {7, 8}, {7, 10}, {7, 12}, {7, 14},
+  {8, 0}, {8, 2}, {8, 4}, {8, 6}, {8, 8}, {8, 10}, {8, 12},
+  {8, 14}, {8, 16}, {8, 18}, {8, 20}, {8, 22}, {8, 24},
+  {8, 26}, {8, 28}, {8, 30}, {9, 0}, {9, 2},
+  {9, 4}, {9, 6}, {9, 8}, {9, 10}, {9, 12}, {9, 14}, {9, 16},
+  {9, 18}, {9, 20}, {9, 22}, {9, 24}, {9, 26}, {9, 28},
+  {9, 30}, {9, 32}, {9, 34}, {9, 36}, {9, 38}, {9, 40},
+  {9, 42}, {9, 44}, {9, 46}, {9, 48}, {9, 50}, {9, 52},
+  {9, 54}, {9, 56}, {9, 58}, {9, 60}, {9, 62}
+};
+const TOKENVALUE *vp10_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
+    (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
+    / 2;  // points at {0, 0}: offsets -66..66 are valid (signed value? confirm)
+
+// Node k = entries 2k, 2k+1 (old CONTEXT_NODE numbering); -TOKEN marks a leaf.
+const vpx_tree_index vp10_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+  -EOB_TOKEN, 2,                       // 0  = EOB
+  -ZERO_TOKEN, 4,                      // 1  = ZERO
+  -ONE_TOKEN, 6,                       // 2  = ONE
+  8, 12,                               // 3  = LOW_VAL
+  -TWO_TOKEN, 10,                      // 4  = TWO
+  -THREE_TOKEN, -FOUR_TOKEN,           // 5  = THREE
+  14, 16,                              // 6  = HIGH_LOW
+  -CATEGORY1_TOKEN, -CATEGORY2_TOKEN,  // 7  = CAT_ONE
+  18, 20,                              // 8  = CAT_THREEFOUR
+  -CATEGORY3_TOKEN, -CATEGORY4_TOKEN,  // 9  = CAT_THREE
+  -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 10 = CAT_FIVE
+};
+
+static const vpx_tree_index cat1[2] = {0, 0};  // 1 node: one per extra bit
+static const vpx_tree_index cat2[4] = {2, 2, 0, 0};  // both branches -> next
+static const vpx_tree_index cat3[6] = {2, 2, 4, 4, 0, 0};
+static const vpx_tree_index cat4[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vpx_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vpx_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
+    14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0};  // 14 nodes
+
+static const int16_t zero_cost[] = {0};  // ZERO_TOKEN / EOB_TOKEN rows
+static const int16_t sign_cost[] = {255, 257};  // ONE..FOUR token rows
+static const int16_t cat1_cost[] = {429, 431, 616, 618};  // 2 << 1 entries
+static const int16_t cat2_cost[] = {624, 626, 727, 729, 848, 850, 951, 953};
+static const int16_t cat3_cost[] = {  // 2 << 3 = 16 entries
+  820, 822, 893, 895, 940, 942, 1013, 1015, 1096, 1098, 1169, 1171, 1216, 1218,
+  1289, 1291
+};
+static const int16_t cat4_cost[] = {  // 2 << 4 = 32 entries
+  1032, 1034, 1075, 1077, 1105, 1107, 1148, 1150, 1194, 1196, 1237, 1239,
+  1267, 1269, 1310, 1312, 1328, 1330, 1371, 1373, 1401, 1403, 1444, 1446,
+  1490, 1492, 1533, 1535, 1563, 1565, 1606, 1608
+};
+static const int16_t cat5_cost[] = {  // 2 << 5 = 64 entries
+  1269, 1271, 1283, 1285, 1306, 1308, 1320,
+  1322, 1347, 1349, 1361, 1363, 1384, 1386, 1398, 1400, 1443, 1445, 1457,
+  1459, 1480, 1482, 1494, 1496, 1521, 1523, 1535, 1537, 1558, 1560, 1572,
+  1574, 1592, 1594, 1606, 1608, 1629, 1631, 1643, 1645, 1670, 1672, 1684,
+  1686, 1707, 1709, 1721, 1723, 1766, 1768, 1780, 1782, 1803, 1805, 1817,
+  1819, 1844, 1846, 1858, 1860, 1881, 1883, 1895, 1897
+};
+const int16_t vp10_cat6_low_cost[256] = {
+  1638, 1640, 1646, 1648, 1652, 1654, 1660, 1662,
+  1670, 1672, 1678, 1680, 1684, 1686, 1692, 1694, 1711, 1713, 1719, 1721,
+  1725, 1727, 1733, 1735, 1743, 1745, 1751, 1753, 1757, 1759, 1765, 1767,
+  1787, 1789, 1795, 1797, 1801, 1803, 1809, 1811, 1819, 1821, 1827, 1829,
+  1833, 1835, 1841, 1843, 1860, 1862, 1868, 1870, 1874, 1876, 1882, 1884,
+  1892, 1894, 1900, 1902, 1906, 1908, 1914, 1916, 1940, 1942, 1948, 1950,
+  1954, 1956, 1962, 1964, 1972, 1974, 1980, 1982, 1986, 1988, 1994, 1996,
+  2013, 2015, 2021, 2023, 2027, 2029, 2035, 2037, 2045, 2047, 2053, 2055,
+  2059, 2061, 2067, 2069, 2089, 2091, 2097, 2099, 2103, 2105, 2111, 2113,
+  2121, 2123, 2129, 2131, 2135, 2137, 2143, 2145, 2162, 2164, 2170, 2172,
+  2176, 2178, 2184, 2186, 2194, 2196, 2202, 2204, 2208, 2210, 2216, 2218,
+  2082, 2084, 2090, 2092, 2096, 2098, 2104, 2106, 2114, 2116, 2122, 2124,
+  2128, 2130, 2136, 2138, 2155, 2157, 2163, 2165, 2169, 2171, 2177, 2179,
+  2187, 2189, 2195, 2197, 2201, 2203, 2209, 2211, 2231, 2233, 2239, 2241,
+  2245, 2247, 2253, 2255, 2263, 2265, 2271, 2273, 2277, 2279, 2285, 2287,
+  2304, 2306, 2312, 2314, 2318, 2320, 2326, 2328, 2336, 2338, 2344, 2346,
+  2350, 2352, 2358, 2360, 2384, 2386, 2392, 2394, 2398, 2400, 2406, 2408,
+  2416, 2418, 2424, 2426, 2430, 2432, 2438, 2440, 2457, 2459, 2465, 2467,
+  2471, 2473, 2479, 2481, 2489, 2491, 2497, 2499, 2503, 2505, 2511, 2513,
+  2533, 2535, 2541, 2543, 2547, 2549, 2555, 2557, 2565, 2567, 2573, 2575,
+  2579, 2581, 2587, 2589, 2606, 2608, 2614, 2616, 2620, 2622, 2628, 2630,
+  2638, 2640, 2646, 2648, 2652, 2654, 2660, 2662
+};
+const int16_t vp10_cat6_high_cost[128] = {
+  72, 892, 1183, 2003, 1448, 2268, 2559, 3379,
+  1709, 2529, 2820, 3640, 3085, 3905, 4196, 5016, 2118, 2938, 3229, 4049,
+  3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
+  2118, 2938, 3229, 4049, 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686,
+  5131, 5951, 6242, 7062, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
+  5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 2118, 2938, 3229, 4049,
+  3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
+  4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471, 5801, 6621, 6912, 7732,
+  7177, 7997, 8288, 9108, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
+  5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 6210, 7030, 7321, 8141,
+  7586, 8406, 8697, 9517, 7847, 8667, 8958, 9778, 9223, 10043, 10334, 11154
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+const int16_t vp10_cat6_high10_high_cost[512] = {
+  74, 894, 1185, 2005, 1450, 2270, 2561,
+  3381, 1711, 2531, 2822, 3642, 3087, 3907, 4198, 5018, 2120, 2940, 3231,
+  4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
+  7064, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
+  5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 2120, 2940, 3231,
+  4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
+  7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914,
+  7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
+  8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
+  11156, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
+  5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277,
+  6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
+  9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+  9780, 9225, 10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
+  8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
+  11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+  9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
+  10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 2120,
+  2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133,
+  5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
+  6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542,
+  6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212,
+  7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
+  10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
+  6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323, 8143, 7588,
+  8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 6212,
+  7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
+  10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454, 10745, 11565,
+  9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 4166, 4986, 5277,
+  6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
+  9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+  9780, 9225, 10045, 10336, 11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699,
+  9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369,
+  10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091,
+  12382, 13202, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669,
+  8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
+  10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 8258,
+  9078, 9369, 10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826,
+  11271, 12091, 12382, 13202, 10304, 11124, 11415, 12235, 11680, 12500, 12791,
+  13611, 11941, 12761, 13052, 13872, 13317, 14137, 14428, 15248,
+};
+const int16_t vp10_cat6_high12_high_cost[2048] = {
+  76, 896, 1187, 2007, 1452, 2272, 2563,
+  3383, 1713, 2533, 2824, 3644, 3089, 3909, 4200, 5020, 2122, 2942, 3233,
+  4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
+  7066, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
+  5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 2122, 2942, 3233,
+  4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
+  7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+  7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
+  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+  11158, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
+  5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279,
+  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
+  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+  11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 2122,
+  2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135,
+  5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
+  6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544,
+  6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214,
+  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+  10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
+  6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590,
+  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214,
+  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+  10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279,
+  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
+  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+  12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
+  8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 2122, 2942,
+  3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955,
+  6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+  6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364,
+  6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034,
+  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+  10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+  6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
+  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
+  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+  10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279, 6099,
+  5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112,
+  6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
+  9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521,
+  7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191,
+  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+  13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 4168, 4988,
+  5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001,
+  8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
+  8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410,
+  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080,
+  9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273,
+  12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
+  8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
+  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+  11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
+  12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 6214,
+  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+  10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371,
+  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+  12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+  12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191,
+  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+  13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+  13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682,
+  12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250,
+  12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
+  15920, 15365, 16185, 16476, 17296, 2122, 2942, 3233, 4053, 3498, 4318, 4609,
+  5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279,
+  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+  9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+  7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279,
+  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+  9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
+  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+  12384, 13204, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+  6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
+  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
+  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+  10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145,
+  7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
+  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+  11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
+  11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
+  13319, 14139, 14430, 15250, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475,
+  5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145,
+  7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
+  6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
+  9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034,
+  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+  10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191,
+  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+  13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+  13054, 13874, 13319, 14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590,
+  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
+  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+  14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
+  12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
+  14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+  12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
+  13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
+  17296, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+  7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325,
+  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+  11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
+  11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590,
+  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
+  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+  14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
+  8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
+  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+  11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
+  12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260,
+  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126,
+  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+  14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659,
+  13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296, 6214, 7034, 7325,
+  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+  11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
+  11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636,
+  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+  10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054,
+  13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456,
+  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
+  11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
+  13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172,
+  13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365,
+  16185, 16476, 17296, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
+  12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
+  14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+  12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
+  13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
+  17296, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+  13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728,
+  14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296,
+  12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
+  15920, 15365, 16185, 16476, 17296, 14398, 15218, 15509, 16329, 15774, 16594,
+  16885, 17705, 16035, 16855, 17146, 17966, 17411, 18231, 18522, 19342
+};
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH  // trees for the high10/high12 extra-bit tables
+static const vpx_tree_index cat1_high10[2] = {0, 0};
+static const vpx_tree_index cat2_high10[4] = {2, 2, 0, 0};
+static const vpx_tree_index cat3_high10[6] = {2, 2, 4, 4, 0, 0};
+static const vpx_tree_index cat4_high10[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vpx_tree_index cat5_high10[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vpx_tree_index cat6_high10[32] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+  12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
+  30, 30, 0, 0};  // 16 nodes
+static const vpx_tree_index cat1_high12[2] = {0, 0};
+static const vpx_tree_index cat2_high12[4] = {2, 2, 0, 0};
+static const vpx_tree_index cat3_high12[6] = {2, 2, 4, 4, 0, 0};
+static const vpx_tree_index cat4_high12[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vpx_tree_index cat5_high12[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vpx_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+  12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
+  30, 30, 32, 32, 34, 34, 0, 0};  // 18 nodes
+#endif
+// Row = token; columns look like: tree, probs, #extra bits, min value, costs.
+const vp10_extra_bit vp10_extra_bits[ENTROPY_TOKENS] = {
+  {0, 0, 0, 0, zero_cost},                              // ZERO_TOKEN
+  {0, 0, 0, 1, sign_cost},                              // ONE_TOKEN
+  {0, 0, 0, 2, sign_cost},                              // TWO_TOKEN
+  {0, 0, 0, 3, sign_cost},                              // THREE_TOKEN
+  {0, 0, 0, 4, sign_cost},                              // FOUR_TOKEN
+  {cat1, vp10_cat1_prob, 1,  CAT1_MIN_VAL, cat1_cost},  // CATEGORY1_TOKEN
+  {cat2, vp10_cat2_prob, 2,  CAT2_MIN_VAL, cat2_cost},  // CATEGORY2_TOKEN
+  {cat3, vp10_cat3_prob, 3,  CAT3_MIN_VAL, cat3_cost},  // CATEGORY3_TOKEN
+  {cat4, vp10_cat4_prob, 4,  CAT4_MIN_VAL, cat4_cost},  // CATEGORY4_TOKEN
+  {cat5, vp10_cat5_prob, 5,  CAT5_MIN_VAL, cat5_cost},  // CATEGORY5_TOKEN
+  {cat6, vp10_cat6_prob, 14, CAT6_MIN_VAL, 0},          // CATEGORY6_TOKEN
+  {0, 0, 0, 0, zero_cost}                               // EOB_TOKEN
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH  // as vp10_extra_bits, but CAT6 uses 16/18 bits
+const vp10_extra_bit vp10_extra_bits_high10[ENTROPY_TOKENS] = {
+  {0, 0, 0, 0, zero_cost},                                            // ZERO
+  {0, 0, 0, 1, sign_cost},                                            // ONE
+  {0, 0, 0, 2, sign_cost},                                            // TWO
+  {0, 0, 0, 3, sign_cost},                                            // THREE
+  {0, 0, 0, 4, sign_cost},                                            // FOUR
+  {cat1_high10, vp10_cat1_prob_high10, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
+  {cat2_high10, vp10_cat2_prob_high10, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
+  {cat3_high10, vp10_cat3_prob_high10, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
+  {cat4_high10, vp10_cat4_prob_high10, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
+  {cat5_high10, vp10_cat5_prob_high10, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
+  {cat6_high10, vp10_cat6_prob_high10, 16, CAT6_MIN_VAL, 0},          // CAT6
+  {0, 0, 0, 0, zero_cost}                                             // EOB
+};
+const vp10_extra_bit vp10_extra_bits_high12[ENTROPY_TOKENS] = {
+  {0, 0, 0, 0, zero_cost},                                            // ZERO
+  {0, 0, 0, 1, sign_cost},                                            // ONE
+  {0, 0, 0, 2, sign_cost},                                            // TWO
+  {0, 0, 0, 3, sign_cost},                                            // THREE
+  {0, 0, 0, 4, sign_cost},                                            // FOUR
+  {cat1_high12, vp10_cat1_prob_high12, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
+  {cat2_high12, vp10_cat2_prob_high12, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
+  {cat3_high12, vp10_cat3_prob_high12, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
+  {cat4_high12, vp10_cat4_prob_high12, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
+  {cat5_high12, vp10_cat5_prob_high12, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
+  {cat6_high12, vp10_cat6_prob_high12, 18, CAT6_MIN_VAL, 0},          // CAT6
+  {0, 0, 0, 0, zero_cost}                                             // EOB
+};
+#endif
+// {code bits, length}: each token's branch path through vp10_coef_tree.
+const struct vp10_token vp10_coef_encodings[ENTROPY_TOKENS] = {
+  {2, 2}, {6, 3}, {28, 5}, {58, 6}, {59, 6}, {60, 6}, {61, 6}, {124, 7},
+  {125, 7}, {126, 7}, {127, 7}, {0, 1}
+};
+
+
+struct tokenize_b_args {
+  VP10_COMP *cpi;   // source of coef_probs and segmentation data in tokenize_b
+  ThreadData *td;   // supplies the macroblock (td->mb) and the counters
+  TOKENEXTRA **tp;  // in/out: next free token slot; tokenize_b advances it
+};
+
+// Per-transform-block visitor: passes "block has coefficients" (eob > 0) on
+// to vp10_set_contexts without emitting any tokens.
+static void set_entropy_context_b(int plane, int block, int blk_row,
+                                  int blk_col, BLOCK_SIZE plane_bsize,
+                                  TX_SIZE tx_size, void *arg) {
+  struct tokenize_b_args *const ctx = arg;
+  MACROBLOCK *const mb = &ctx->td->mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  const int has_coeffs = mb->plane[plane].eobs[block] > 0;
+  // Column is passed before row here, unlike this function's own parameters.
+  vp10_set_contexts(mbd, &mbd->plane[plane], plane_bsize, tx_size, has_coeffs,
+                    blk_col, blk_row);
+}
+
+// Stores one token (with extra bits) at *t, advances *t, counts the token.
+static INLINE void add_token(TOKENEXTRA **t, const vpx_prob *context_tree,
+                             int32_t extra, uint8_t token,
+                             uint8_t skip_eob_node, unsigned int *counts) {
+  TOKENEXTRA *const slot = (*t)++;  // claim the slot, then step the cursor
+  slot->token = token;
+  slot->extra = extra;
+  slot->context_tree = context_tree;
+  slot->skip_eob_node = skip_eob_node;
+  counts[token] += 1;
+}
+
+// Same as add_token() except the record's `extra` field is left unwritten;
+// tokenize_b uses it for ZERO_TOKEN and EOB_TOKEN.
+static INLINE void add_token_no_extra(
+    TOKENEXTRA **t, const vpx_prob *context_tree, uint8_t token,
+    uint8_t skip_eob_node, unsigned int *counts) {
+  TOKENEXTRA *const slot = (*t)++;
+  slot->token = token;
+  slot->context_tree = context_tree;
+  slot->skip_eob_node = skip_eob_node;
+  counts[token] += 1;
+}
+
+static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
+                             TX_SIZE tx_size) {
+  if (segfeature_active(seg, segment_id, SEG_LVL_SKIP)) return 0;  // skipped
+  return 16 << (tx_size << 1);  // 16 coefficients at size 0, x4 per size step
+}
+
+// Per-transform-block visitor: turns the block's quantized coefficients into
+// TOKENEXTRA records at *args->tp (advancing it), updates the token and
+// EOB-branch counters, then passes "c > 0" to vp10_set_contexts.
+static void tokenize_b(int plane, int block, int blk_row, int blk_col,
+                       BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+  struct tokenize_b_args *const args = arg;
+  VP10_COMP *const cpi = args->cpi;
+  ThreadData *const td = args->td;
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
+  struct macroblock_plane *const p = &td->mb.plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const PLANE_TYPE type = pd->plane_type;
+  const int eob = p->eobs[block];
+  const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  const scan_order *const so = get_scan(tx_size, get_tx_type(type, xd, block));
+  const int16_t *const scan = so->scan;
+  const int16_t *const nb = so->neighbors;
+  const int ref = is_inter_block(mbmi);
+  // Counter and probability tables for this (tx_size, plane type, ref) triple,
+  // still to be indexed by [band][context].
+  unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+      td->rd_counts.coef_counts[tx_size][type][ref];
+  vpx_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+      cpi->common.fc->coef_probs[tx_size][type][ref];
+  unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
+      td->counts->eob_branch[tx_size][type][ref];
+  const uint8_t *const band = get_band_translate(tx_size);
+  const int seg_eob = get_tx_eob(&cpi->common.seg, mbmi->segment_id, tx_size);
+  uint8_t token_cache[32 * 32];  // per-position values fed to get_coef_context
+  TOKENEXTRA *t = *args->tp;     // next token slot to fill
+  int pt = get_entropy_context(tx_size, pd->above_context + blk_col,
+                               pd->left_context + blk_row);
+  int c = 0;  // current position in scan order
+
+  while (c < eob) {
+    int16_t token;
+    EXTRABIT extra;
+    int skip_eob = 0;
+    int v;
+
+    // Emit the zeros in front of the next nonzero coefficient. Once a zero
+    // has been emitted, later tokens of this pass are flagged skip_eob and
+    // no longer bump eob_branch.
+    // NOTE(review): relies on the coefficient at scan position eob - 1 being
+    // nonzero to stop this run before c reaches eob; not checked here.
+    for (v = qcoeff[scan[c]]; v == 0; v = qcoeff[scan[c]]) {
+      add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN, skip_eob,
+                         counts[band[c]][pt]);
+      eob_branch[band[c]][pt] += !skip_eob;
+      skip_eob = 1;
+      token_cache[scan[c]] = 0;
+      ++c;
+      pt = get_coef_context(nb, token_cache, c);
+    }
+
+    // Nonzero coefficient: look up its token and extra bits, then emit it.
+    vp10_get_token_extra(v, &token, &extra);
+    add_token(&t, coef_probs[band[c]][pt], extra, (uint8_t)token,
+              (uint8_t)skip_eob, counts[band[c]][pt]);
+    eob_branch[band[c]][pt] += !skip_eob;
+    token_cache[scan[c]] = vp10_pt_energy_class[token];
+    ++c;
+    pt = get_coef_context(nb, token_cache, c);
+  }
+
+  // An explicit EOB token is emitted only when fewer than seg_eob
+  // coefficients were coded.
+  if (c < seg_eob) {
+    add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN, 0,
+                       counts[band[c]][pt]);
+    ++eob_branch[band[c]][pt];
+  }
+
+  *args->tp = t;
+  vp10_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, blk_col, blk_row);
+}
+
+struct is_skippable_args {
+  uint16_t *eobs;  // per-block eob values of one plane, indexed by block
+  int *skippable;  // result flag updated by the per-block callbacks
+};
+static void is_skippable(int plane, int block, int blk_row, int blk_col,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                         void *argv) {
+  struct is_skippable_args *const state = argv;
+  (void)blk_col;
+  (void)blk_row;
+  (void)tx_size;
+  (void)plane_bsize;
+  (void)plane;
+  *state->skippable &= (state->eobs[block] == 0);  // any coefficient clears it
+}
+
+// TODO(yaowu): rewrite and optimize this function to remove the usage of
+//              vp10_foreach_transform_block() and simplify is_skippable().
+int vp10_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+  int all_zero = 1;  // cleared as soon as one block has a non-zero eob
+  struct is_skippable_args st = {x->plane[plane].eobs, &all_zero};
+  MACROBLOCKD *const xd = &x->e_mbd;
+  vp10_foreach_transformed_block_in_plane(xd, bsize, plane, is_skippable, &st);
+  return all_zero;
+}
+
+static void has_high_freq_coeff(int plane, int block, int blk_row, int blk_col,
+                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                void *argv) {
+  struct is_skippable_args *const state = argv;
+  // eob thresholds: 3 for 4x4 transforms, 10 for every other size.
+  const int eob_limit = (tx_size != TX_4X4) ? 10 : 3;
+  (void)blk_col;
+  (void)blk_row;
+  (void)plane_bsize;
+  (void)plane;
+  if (state->eobs[block] > eob_limit) *state->skippable |= 1;
+}
+
+int vp10_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+  int found = 0;  // set once any block's eob exceeds its size threshold
+  struct is_skippable_args state = {x->plane[plane].eobs, &found};
+  vp10_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane,
+                                          has_high_freq_coeff, &state);
+  return found;
+}
+
+void vp10_tokenize_sb(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+                     int dry_run, BLOCK_SIZE bsize) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int ctx = vp10_get_skip_context(xd);
+  const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id,
+                                          SEG_LVL_SKIP);
+  struct tokenize_b_args arg = {cpi, td, t};
+  if (mbmi->skip) {  // skipped block: no tokens, only counts and contexts
+    if (!dry_run)
+      td->counts->skip[ctx][1] += skip_inc;  // 0 when SEG_LVL_SKIP is active
+    reset_skip_context(xd, bsize);
+    return;
+  }
+
+  if (!dry_run) {
+    int plane;
+
+    td->counts->skip[ctx][0] += skip_inc;
+
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      vp10_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
+                                              &arg);
+      (*t)->token = EOSB_TOKEN;  // terminate this plane's token run
+      (*t)++;
+    }
+  } else {  // dry run: tokenize_b is not called; no counts are touched here
+    vp10_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
+  }
+}
diff --git a/libs/libvpx/vp10/encoder/tokenize.h b/libs/libvpx/vp10/encoder/tokenize.h
new file mode 100644
index 0000000000..5bad415a9a
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/tokenize.h
@@ -0,0 +1,112 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_TOKENIZE_H_
+#define VP10_ENCODER_TOKENIZE_H_
+
+#include "vp10/common/entropy.h"
+
+#include "vp10/encoder/block.h"
+#include "vp10/encoder/treewriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define EOSB_TOKEN 127     // Not signalled, encoder only
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  typedef int32_t EXTRABIT;
+#else
+  typedef int16_t EXTRABIT;
+#endif
+
+
+typedef struct {
+  int16_t token;   // token id looked up for a coefficient value
+  EXTRABIT extra;  // extra-bits value that accompanies the token
+} TOKENVALUE;
+
+typedef struct {
+  const vpx_prob *context_tree;  // presumably the coef_probs row — confirm
+  EXTRABIT extra;
+  uint8_t token;  // token id; EOSB_TOKEN ends a plane's run in the encoder
+  uint8_t skip_eob_node;  // presumably add_token()'s skip_eob flag — confirm
+} TOKENEXTRA;
+
+extern const vpx_tree_index vp10_coef_tree[];
+extern const vpx_tree_index vp10_coef_con_tree[];
+extern const struct vp10_token vp10_coef_encodings[];
+
+int vp10_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+int vp10_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+
+struct VP10_COMP;
+struct ThreadData;
+
+void vp10_tokenize_sb(struct VP10_COMP *cpi, struct ThreadData *td,
+                     TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+
+extern const int16_t *vp10_dct_value_cost_ptr;
+/* TODO: The Token field should be broken out into a separate char array to
+ *  improve cache locality, since it's needed for costing when the rest of the
+ *  fields are not.
+ */
+extern const TOKENVALUE *vp10_dct_value_tokens_ptr;
+extern const TOKENVALUE *vp10_dct_cat_lt_10_value_tokens;
+extern const int16_t vp10_cat6_low_cost[256];
+extern const int16_t vp10_cat6_high_cost[128];
+extern const int16_t vp10_cat6_high10_high_cost[512];
+extern const int16_t vp10_cat6_high12_high_cost[2048];
+static INLINE int16_t vp10_get_cost(int16_t token, EXTRABIT extrabits,
+                                   const int16_t *cat6_high_table) {
+  if (token != CATEGORY6_TOKEN) return vp10_extra_bits[token].cost[extrabits];
+  // Category 6: cost of the low byte plus cost of the remaining high bits.
+  return vp10_cat6_low_cost[extrabits & 0xff] +
+         cat6_high_table[extrabits >> 8];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE const int16_t* vp10_get_high_cost_table(int bit_depth) {
+  if (bit_depth == 8) return vp10_cat6_high_cost;
+  if (bit_depth == 10) return vp10_cat6_high10_high_cost;
+  return vp10_cat6_high12_high_cost;  // every other depth: 12-bit table
+}
+#else
+static INLINE const int16_t* vp10_get_high_cost_table(int bit_depth) {
+  (void) bit_depth;  // unused: this build always returns the same table
+  return vp10_cat6_high_cost;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void vp10_get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
+  if (v > -CAT6_MIN_VAL && v < CAT6_MIN_VAL) {  // table range (v may be < 0)
+    const TOKENVALUE *const entry = &vp10_dct_cat_lt_10_value_tokens[v];
+    *token = entry->token;
+    *extra = entry->extra;
+    return;
+  }
+  *token = CATEGORY6_TOKEN;
+  // extra = 2 * (|v| - CAT6_MIN_VAL), with bit 0 set when v is negative.
+  *extra = (v >= CAT6_MIN_VAL) ? 2 * v - 2 * CAT6_MIN_VAL
+                               : -2 * v - 2 * CAT6_MIN_VAL + 1;
+}
+static INLINE int16_t vp10_get_token(int v) {
+  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL)
+    return CATEGORY6_TOKEN;  // same token vp10_get_token_extra() reports
+  return vp10_dct_cat_lt_10_value_tokens[v].token;  // table is indexed by v
+}
+
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_TOKENIZE_H_
diff --git a/libs/libvpx/vp10/encoder/treewriter.c b/libs/libvpx/vp10/encoder/treewriter.c
new file mode 100644
index 0000000000..1f42f32a11
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/treewriter.c
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/encoder/treewriter.h"
+
+static void tree2tok(struct vp10_token *tokens, const vpx_tree_index *tree,
+                     int i, int v, int l) {
+  v += v;  // shift the code left by one to make room for this level's bit
+  ++l;     // codes at this depth are one bit longer
+
+  do {
+    const vpx_tree_index j = tree[i++];
+    if (j <= 0) {  // non-positive entry: leaf for token -j
+      tokens[-j].value = v;
+      tokens[-j].len = l;
+    } else {  // positive entry: index of the child node, recurse into it
+      tree2tok(tokens, tree, j, v, l);
+    }
+  } while (++v & 1);  // runs twice: branch bit 0, then branch bit 1
+}
+
+void vp10_tokens_from_tree(struct vp10_token *tokens,
+                          const vpx_tree_index *tree) {
+  tree2tok(tokens, tree, 0, 0, 0);  // start at node 0 with an empty code
+}
+
+static unsigned int convert_distribution(unsigned int i, vpx_tree tree,
+                                         unsigned int branch_ct[][2],
+                                         const unsigned int num_events[]) {
+  unsigned int total[2];  // event totals under branch 0 and branch 1
+  unsigned int b;
+
+  for (b = 0; b < 2; ++b) {
+    const vpx_tree_index next = tree[i + b];
+    // Non-positive entries are leaves naming event -next; positive entries
+    // index the child node, whose subtree total is computed recursively.
+    if (next <= 0)
+      total[b] = num_events[-next];
+    else
+      total[b] = convert_distribution(next, tree, branch_ct, num_events);
+  }
+  branch_ct[i >> 1][0] = total[0];
+  branch_ct[i >> 1][1] = total[1];
+  return total[0] + total[1];
+}
+
+void vp10_tree_probs_from_distribution(vpx_tree tree,
+                                      unsigned int branch_ct[/* n-1 */][2],
+                                      const unsigned int num_events[/* n */]) {
+  convert_distribution(0, tree, branch_ct, num_events);  // from node 0
+}
diff --git a/libs/libvpx/vp10/encoder/treewriter.h b/libs/libvpx/vp10/encoder/treewriter.h
new file mode 100644
index 0000000000..6b76a03e47
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/treewriter.h
@@ -0,0 +1,51 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_TREEWRITER_H_
+#define VP10_ENCODER_TREEWRITER_H_
+
+#include "vpx_dsp/bitwriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_tree_probs_from_distribution(vpx_tree tree,
+                                      unsigned int branch_ct[ /* n - 1 */ ][2],
+                                      const unsigned int num_events[ /* n */ ]);
+
+struct vp10_token {
+  int value;  // code bits, passed as `bits` to vp10_write_tree()
+  int len;    // number of bits in value, passed as `len`
+};
+
+void vp10_tokens_from_tree(struct vp10_token*, const vpx_tree_index *);
+
+static INLINE void vp10_write_tree(vpx_writer *w, const vpx_tree_index *tree,
+                                  const vpx_prob *probs, int bits, int len,
+                                  vpx_tree_index i) {
+  do {  // consumes `bits` from bit (len - 1) down to bit 0; runs at least once
+    const int branch = (bits >> (len - 1)) & 1;
+    vpx_write(w, branch, probs[i >> 1]);
+    i = tree[i + branch];
+  } while (--len);
+}
+
+static INLINE void vp10_write_token(vpx_writer *w, const vpx_tree_index *tree,
+                                   const vpx_prob *probs,
+                                   const struct vp10_token *token) {
+  vp10_write_tree(w, tree, probs, token->value, token->len, 0);  // from root
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_TREEWRITER_H_
diff --git a/libs/libvpx/vp10/encoder/x86/dct_mmx.asm b/libs/libvpx/vp10/encoder/x86/dct_mmx.asm
new file mode 100644
index 0000000000..2327fe9e6c
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/x86/dct_mmx.asm
@@ -0,0 +1,104 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp10
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m1 ;a1 += b1
+  movq            m4,        m0
+  psubw           m3,        m2 ;d1 -= c1
+  psubw           m4,        m3
+  psraw           m4,        1  ;e1 = (a1 - d1) >> 1
+  movq            m5,        m4
+  psubw           m5,        m1 ;b1
+  psubw           m4,        m2 ;c1
+  psubw           m0,        m4 ;a1 -= c1
+  paddw           m3,        m5 ;d1 += b1
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+  movq            m4,        m0 ; m4/m5: scratch copies of m0/m2
+  movq            m5,        m2
+  punpcklwd       m4,        m1 ; interleave 16-bit words of row pairs
+  punpckhwd       m0,        m1
+  punpcklwd       m5,        m3
+  punpckhwd       m2,        m3
+  movq            m1,        m4
+  movq            m3,        m0
+  punpckldq       m1,        m5 ; interleave the resulting 32-bit pairs
+  punpckhdq       m4,        m5
+  punpckldq       m3,        m2
+  punpckhdq       m0,        m2
+  SWAP            2, 3, 0, 1, 4 ; rename registers; presumably back to m0-m3
+%endmacro
+
+INIT_MMX mmx
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+  lea             r3q,       [inputq + strideq*4] ; r3 = input + 4*stride bytes
+  movq            m0,        [inputq] ;a1
+  movq            m1,        [inputq + strideq*2] ;b1
+  movq            m2,        [r3q] ;c1
+  movq            m3,        [r3q + strideq*2] ;d1
+
+  TRANSFORM_COLS  ; first 1-D pass
+  TRANSPOSE_4X4
+  TRANSFORM_COLS  ; second 1-D pass on the transposed data
+  TRANSPOSE_4X4
+
+  psllw           m0,        2 ; scale every 16-bit result by 4
+  psllw           m1,        2
+  psllw           m2,        2
+  psllw           m3,        2
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  pxor            m4,             m4 ; m4/m5 become sign masks (0 > x) used
+  pxor            m5,             m5 ; to widen int16 results to int32
+  pcmpgtw         m4,             m0
+  pcmpgtw         m5,             m1
+  movq            m6,             m0
+  movq            m7,             m1
+  punpcklwd       m0,             m4 ; low two words, sign-extended
+  punpcklwd       m1,             m5
+  punpckhwd       m6,             m4 ; high two words, sign-extended
+  punpckhwd       m7,             m5
+  movq            [outputq],      m0
+  movq            [outputq + 8],  m6
+  movq            [outputq + 16], m1
+  movq            [outputq + 24], m7
+  pxor            m4,             m4 ; same widening for m2 and m3
+  pxor            m5,             m5
+  pcmpgtw         m4,             m2
+  pcmpgtw         m5,             m3
+  movq            m6,             m2
+  movq            m7,             m3
+  punpcklwd       m2,             m4
+  punpcklwd       m3,             m5
+  punpckhwd       m6,             m4
+  punpckhwd       m7,             m5
+  movq            [outputq + 32], m2
+  movq            [outputq + 40], m6
+  movq            [outputq + 48], m3
+  movq            [outputq + 56], m7
+%else
+  movq            [outputq],      m0 ; 16-bit output: four words per register
+  movq            [outputq + 8],  m1
+  movq            [outputq + 16], m2
+  movq            [outputq + 24], m3
+%endif
+
+  RET
diff --git a/libs/libvpx/vp10/encoder/x86/dct_sse2.c b/libs/libvpx/vp10/encoder/x86/dct_sse2.c
new file mode 100644
index 0000000000..e1111570a2
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/x86/dct_sse2.c
@@ -0,0 +1,2058 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride) {
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  __m128i mask;
+
+  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));  // 4 x int16
+  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+
+  in[0] = _mm_slli_epi16(in[0], 4);  // scale every sample by 16
+  in[1] = _mm_slli_epi16(in[1], 4);
+  in[2] = _mm_slli_epi16(in[2], 4);
+  in[3] = _mm_slli_epi16(in[3], 4);
+
+  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);  // lane 0: -1 if it is 0
+  in[0] = _mm_add_epi16(in[0], mask);
+  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);  // net: lane 0 += 1 if != 0
+}
+
+static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
+  const __m128i rounding = _mm_set1_epi16(1);
+  // Pack the low 64 bits of res[0]/res[1] and of res[2]/res[3] together.
+  __m128i rows01 = _mm_unpacklo_epi64(res[0], res[1]);
+  __m128i rows23 = _mm_unpacklo_epi64(res[2], res[3]);
+  // Final scaling: (x + 1) >> 2 on every 16-bit lane.
+  rows01 = _mm_srai_epi16(_mm_add_epi16(rows01, rounding), 2);
+  rows23 = _mm_srai_epi16(_mm_add_epi16(rows23, rounding), 2);
+  store_output(&rows01, output);
+  store_output(&rows23, output + 8);
+}
+
+static INLINE void transpose_4x4(__m128i *res) {
+  // On entry:  res[0] = 00 01 02 03 20 21 22 23
+  //            res[1] = 10 11 12 13 30 31 32 33
+  const __m128i lo16 = _mm_unpacklo_epi16(res[0], res[1]);
+  const __m128i hi16 = _mm_unpackhi_epi16(res[0], res[1]);
+  // lo16 = 00 10 01 11 02 12 03 13
+  // hi16 = 20 30 21 31 22 32 23 33
+  const __m128i cols01 = _mm_unpacklo_epi32(lo16, hi16);
+  const __m128i cols23 = _mm_unpackhi_epi32(lo16, hi16);
+
+  // cols01 = 00 10 20 30 01 11 21 31
+  // cols23 = 02 12 22 32 03 13 23 33
+  // Only the first four 16-bit integers of each output register are used.
+  res[0] = cols01;
+  res[2] = cols23;
+  res[1] = _mm_unpackhi_epi64(cols01, cols01);
+  res[3] = _mm_unpackhi_epi64(cols23, cols23);
+}
+
+static void fdct4_sse2(__m128i *in) {
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u[4], v[4];
+  u[0]=_mm_unpacklo_epi16(in[0], in[1]);  // interleave in[0] with in[1]
+  u[1]=_mm_unpacklo_epi16(in[3], in[2]);  // interleave in[3] with in[2]
+
+  v[0] = _mm_add_epi16(u[0], u[1]);  // (in0 + in3, in1 + in2) pairs
+  v[1] = _mm_sub_epi16(u[0], u[1]);  // (in0 - in3, in1 - in2) pairs
+
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
+  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
+  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);  // round, then shift
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[1]);  // outputs 0 and 2, saturated to int16
+  in[1] = _mm_packs_epi32(u[2], u[3]);  // outputs 1 and 3, saturated to int16
+  transpose_4x4(in);
+}
+
+static void fadst4_sse2(__m128i *in) {
+  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
+  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
+  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
+  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
+  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8];
+  __m128i in7 = _mm_add_epi16(in[0], in[1]);
+
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
+  u[2] = _mm_unpacklo_epi16(in7, kZero);    // zero partner: madd scales in7
+  u[3] = _mm_unpacklo_epi16(in[2], kZero);  // likewise for in[2]
+  u[4] = _mm_unpacklo_epi16(in[3], kZero);  // likewise for in[3]
+
+  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
+  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
+  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
+  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
+  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
+  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
+  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);  // sinpi_3_9 * in[3]
+
+  u[0] = _mm_add_epi32(v[0], v[1]);
+  u[1] = _mm_sub_epi32(v[2], v[6]);
+  u[2] = _mm_add_epi32(v[3], v[4]);
+  u[3] = _mm_sub_epi32(u[2], u[0]);
+  u[4] = _mm_slli_epi32(v[5], 2);    // 4 * v[5]
+  u[5] = _mm_sub_epi32(u[4], v[5]);  // 3 * v[5]
+  u[6] = _mm_add_epi32(u[3], u[5]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);  // round, then shift
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[2]);  // saturating pack to int16
+  in[1] = _mm_packs_epi32(u[1], u[3]);
+  transpose_4x4(in);
+}
+
+void vp10_fht4x4_sse2(const int16_t *input, tran_low_t *output,
+                     int stride, int tx_type) {
+  __m128i in[4];
+
+  switch (tx_type) {
+    case DCT_DCT:  // delegated entirely to the vpx_dsp 4x4 forward DCT
+      vpx_fdct4x4_sse2(input, output, stride);
+      break;
+    case ADST_DCT:  // ADST pass first, then DCT pass
+      load_buffer_4x4(input, in, stride);
+      fadst4_sse2(in);
+      fdct4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case DCT_ADST:  // DCT pass first, then ADST pass
+      load_buffer_4x4(input, in, stride);
+      fdct4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case ADST_ADST:  // ADST in both passes
+      load_buffer_4x4(input, in, stride);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+   default:  // unknown tx_type: asserts; output is left unwritten
+     assert(0);
+     break;
+  }
+}
+
+void vp10_fdct8x8_quant_sse2(const int16_t *input, int stride,
+                            int16_t* coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t* zbin_ptr,
+                            const int16_t* round_ptr, const int16_t* quant_ptr,
+                            const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+                            int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                            uint16_t* eob_ptr,
+                            const int16_t* scan_ptr,
+                            const int16_t* iscan_ptr) {
+  __m128i zero;
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // Load input
+  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  __m128i *in[8];
+  int index = 0;
+
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)coeff_ptr;
+
+  // Pre-condition input (shift by two)
+  in0 = _mm_slli_epi16(in0, 2);
+  in1 = _mm_slli_epi16(in1, 2);
+  in2 = _mm_slli_epi16(in2, 2);
+  in3 = _mm_slli_epi16(in3, 2);
+  in4 = _mm_slli_epi16(in4, 2);
+  in5 = _mm_slli_epi16(in5, 2);
+  in6 = _mm_slli_epi16(in6, 2);
+  in7 = _mm_slli_epi16(in7, 2);
+
+  in[0] = &in0;
+  in[1] = &in1;
+  in[2] = &in2;
+  in[3] = &in3;
+  in[4] = &in4;
+  in[5] = &in5;
+  in[6] = &in6;
+  in[7] = &in7;
+
+  // We do two passes, first the columns, then the rows. The results of the
+  // first pass are transposed so that the same column code can be reused. The
+  // results of the second pass are also transposed so that the rows (processed
+  // as columns) are put back in row positions.
+  for (pass = 0; pass < 2; pass++) {
+    // To store results of each pass before the transpose.
+    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+    // Add/subtract
+    const __m128i q0 = _mm_add_epi16(in0, in7);
+    const __m128i q1 = _mm_add_epi16(in1, in6);
+    const __m128i q2 = _mm_add_epi16(in2, in5);
+    const __m128i q3 = _mm_add_epi16(in3, in4);
+    const __m128i q4 = _mm_sub_epi16(in3, in4);
+    const __m128i q5 = _mm_sub_epi16(in2, in5);
+    const __m128i q6 = _mm_sub_epi16(in1, in6);
+    const __m128i q7 = _mm_sub_epi16(in0, in7);
+    // Work on first four results
+    {
+      // Add/subtract
+      const __m128i r0 = _mm_add_epi16(q0, q3);
+      const __m128i r1 = _mm_add_epi16(q1, q2);
+      const __m128i r2 = _mm_sub_epi16(q1, q2);
+      const __m128i r3 = _mm_sub_epi16(q0, q3);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+      res0 = _mm_packs_epi32(w0, w1);
+      res4 = _mm_packs_epi32(w2, w3);
+      res2 = _mm_packs_epi32(w4, w5);
+      res6 = _mm_packs_epi32(w6, w7);
+    }
+    // Work on next four results
+    {
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+      // dct_const_round_shift
+      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+      // Combine
+      const __m128i r0 = _mm_packs_epi32(s0, s1);
+      const __m128i r1 = _mm_packs_epi32(s2, s3);
+      // Add/subtract
+      const __m128i x0 = _mm_add_epi16(q4, r0);
+      const __m128i x1 = _mm_sub_epi16(q4, r0);
+      const __m128i x2 = _mm_sub_epi16(q7, r1);
+      const __m128i x3 = _mm_add_epi16(q7, r1);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+      res1 = _mm_packs_epi32(w0, w1);
+      res7 = _mm_packs_epi32(w2, w3);
+      res5 = _mm_packs_epi32(w4, w5);
+      res3 = _mm_packs_epi32(w6, w7);
+    }
+    // Transpose the 8x8.
+    {
+      // 00 01 02 03 04 05 06 07
+      // 10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27
+      // 30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47
+      // 50 51 52 53 54 55 56 57
+      // 60 61 62 63 64 65 66 67
+      // 70 71 72 73 74 75 76 77
+      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+      // 00 10 01 11 02 12 03 13
+      // 20 30 21 31 22 32 23 33
+      // 04 14 05 15 06 16 07 17
+      // 24 34 25 35 26 36 27 37
+      // 40 50 41 51 42 52 43 53
+      // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+      // 64 74 65 75 66 76 67 77
+      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+      // 00 10 20 30 01 11 21 31
+      // 40 50 60 70 41 51 61 71
+      // 02 12 22 32 03 13 23 33
+      // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+      // 06 16 26 36 07 17 27 37
+      // 46 56 66 76 47 57 67 77
+      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }
+  // Post-condition output and store it
+  {
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+    in0 = _mm_sub_epi16(in0, sign_in0);
+    in1 = _mm_sub_epi16(in1, sign_in1);
+    in2 = _mm_sub_epi16(in2, sign_in2);
+    in3 = _mm_sub_epi16(in3, sign_in3);
+    in4 = _mm_sub_epi16(in4, sign_in4);
+    in5 = _mm_sub_epi16(in5, sign_in5);
+    in6 = _mm_sub_epi16(in6, sign_in6);
+    in7 = _mm_sub_epi16(in7, sign_in7);
+    in0 = _mm_srai_epi16(in0, 1);
+    in1 = _mm_srai_epi16(in1, 1);
+    in2 = _mm_srai_epi16(in2, 1);
+    in3 = _mm_srai_epi16(in3, 1);
+    in4 = _mm_srai_epi16(in4, 1);
+    in5 = _mm_srai_epi16(in5, 1);
+    in6 = _mm_srai_epi16(in6, 1);
+    in7 = _mm_srai_epi16(in7, 1);
+  }
+
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+  zero = _mm_setzero_si128();
+
+  if (!skip_block) {
+    __m128i eob;
+    __m128i round, quant, dequant;
+    {
+      __m128i coeff0, coeff1;
+
+      // Setup global values
+      {
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        // Do DC and first 15 AC
+        coeff0 = *in[0];
+        coeff1 = *in[1];
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // AC only loop
+    index = 2;
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+
+        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
+        coeff0 = *in[index];
+        coeff1 = *in[index + 1];
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+      index += 2;
+    }
+
+    // Accumulate EOB
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);
+    }
+  } else {
+    do {
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}
+
+// Reads eight rows of eight 16-bit samples from `input` (consecutive rows
+// are `stride` elements apart) into in[0..7], multiplying every lane by 4
+// via a left shift of 2.
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+                                   int stride) {
+  int row;
+
+  for (row = 0; row < 8; ++row) {
+    // _mm_load_si128 is the aligned load, so each row address must be
+    // 16-byte aligned.
+    const __m128i samples =
+        _mm_load_si128((const __m128i *)(input + row * stride));
+    in[row] = _mm_slli_epi16(samples, 2);
+  }
+}
+
+// Scales the eight vectors res[0..7] down in place, lane by lane (16-bit):
+//   bit == 1 : x = (x - sign) >> 1
+//   bit == 2 : x = (x + 1 - sign) >> 2
+//   otherwise: x = (x - sign) >> 2
+// where `sign` is x >> 15 (arithmetic), i.e. -1 for negative lanes and 0
+// for the rest, so negative lanes are bumped by one before shifting.
+static INLINE void right_shift_8x8(__m128i *res, const int bit) {
+  const __m128i one = _mm_set1_epi16(1);
+  int row;
+
+  for (row = 0; row < 8; ++row) {
+    __m128i lanes = res[row];
+    // Capture the sign mask before any adjustment touches the value.
+    const __m128i sign_mask = _mm_srai_epi16(lanes, 15);
+
+    if (bit == 2) {
+      lanes = _mm_add_epi16(lanes, one);
+    }
+    lanes = _mm_sub_epi16(lanes, sign_mask);
+
+    if (bit == 1) {
+      res[row] = _mm_srai_epi16(lanes, 1);
+    } else {
+      res[row] = _mm_srai_epi16(lanes, 2);
+    }
+  }
+}
+
+// Hands each of the eight vectors res[0..7] to store_output(), targeting
+// successive rows of `output` that lie `stride` elements apart.
+static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
+                                    int stride) {
+  int row;
+
+  for (row = 0; row < 8; ++row) {
+    store_output(res + row, output + row * stride);
+  }
+}
+
+// Transposes an 8x8 block of 16-bit lanes: in[r] holds row r on entry and
+// res[c] holds column c on return. Every in[] row is read before the first
+// store to res[], so callers may pass the same array for both arguments.
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+  // Pass 1: interleave the 16-bit lanes of each adjacent pair of rows.
+  // "lo" vectors cover columns 0-3, "hi" vectors cover columns 4-7.
+  const __m128i r01_lo = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i r01_hi = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i r23_lo = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i r23_hi = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i r45_lo = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i r45_hi = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i r67_lo = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i r67_hi = _mm_unpackhi_epi16(in[6], in[7]);
+  // Pass 2: interleave 32-bit lanes so that each vector carries two
+  // columns of four consecutive rows ("top" = rows 0-3, "bot" = rows 4-7).
+  //   top_c01: 00 10 20 30 01 11 21 31    bot_c01: 40 50 60 70 41 51 61 71
+  //   top_c23: 02 12 22 32 03 13 23 33    bot_c23: 42 52 62 72 43 53 63 73
+  //   top_c45: 04 14 24 34 05 15 25 35    bot_c45: 44 54 64 74 45 55 65 75
+  //   top_c67: 06 16 26 36 07 17 27 37    bot_c67: 46 56 66 76 47 57 67 77
+  const __m128i top_c01 = _mm_unpacklo_epi32(r01_lo, r23_lo);
+  const __m128i top_c23 = _mm_unpackhi_epi32(r01_lo, r23_lo);
+  const __m128i top_c45 = _mm_unpacklo_epi32(r01_hi, r23_hi);
+  const __m128i top_c67 = _mm_unpackhi_epi32(r01_hi, r23_hi);
+  const __m128i bot_c01 = _mm_unpacklo_epi32(r45_lo, r67_lo);
+  const __m128i bot_c23 = _mm_unpackhi_epi32(r45_lo, r67_lo);
+  const __m128i bot_c45 = _mm_unpacklo_epi32(r45_hi, r67_hi);
+  const __m128i bot_c67 = _mm_unpackhi_epi32(r45_hi, r67_hi);
+  // Pass 3: glue the matching 64-bit halves together; the low halves give
+  // the even output row of each column pair, the high halves the odd one.
+  res[0] = _mm_unpacklo_epi64(top_c01, bot_c01);  // 00 10 20 30 40 50 60 70
+  res[1] = _mm_unpackhi_epi64(top_c01, bot_c01);  // 01 11 21 31 41 51 61 71
+  res[2] = _mm_unpacklo_epi64(top_c23, bot_c23);  // 02 12 22 32 42 52 62 72
+  res[3] = _mm_unpackhi_epi64(top_c23, bot_c23);  // 03 13 23 33 43 53 63 73
+  res[4] = _mm_unpacklo_epi64(top_c45, bot_c45);  // 04 14 24 34 44 54 64 74
+  res[5] = _mm_unpackhi_epi64(top_c45, bot_c45);  // 05 15 25 35 45 55 65 75
+  res[6] = _mm_unpacklo_epi64(top_c67, bot_c67);  // 06 16 26 36 46 56 66 76
+  res[7] = _mm_unpackhi_epi64(top_c67, bot_c67);  // 07 17 27 37 47 57 67 77
+}
+
+// One pass of an 8-point butterfly transform (a forward DCT, going by the
+// function name and the cospi_* constants) over the eight vectors in[0..7],
+// each holding eight 16-bit lanes. All eight lanes are processed in
+// parallel. Results are written back into in[0..7] and the 8x8 block is
+// then transposed in place, so the output layout is the transpose of the
+// input layout.
+// NOTE(review): the transpose presumably lets a second call process the
+// other dimension of the block — confirm against the callers.
+static void fdct8_sse2(__m128i *in) {
+  // constants
+  // Each k__cospi_* vector repeats a pair of 16-bit coefficients so that
+  // _mm_madd_epi16 on interleaved inputs yields a two-term weighted sum per
+  // 32-bit lane.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // stage 1
+  // s0..s3: sums of mirrored inputs (in[k] + in[7-k]);
+  // s4..s7: the matching differences (in[k] - in[7-k]).
+  s0 = _mm_add_epi16(in[0], in[7]);
+  s1 = _mm_add_epi16(in[1], in[6]);
+  s2 = _mm_add_epi16(in[2], in[5]);
+  s3 = _mm_add_epi16(in[3], in[4]);
+  s4 = _mm_sub_epi16(in[3], in[4]);
+  s5 = _mm_sub_epi16(in[2], in[5]);
+  s6 = _mm_sub_epi16(in[1], in[6]);
+  s7 = _mm_sub_epi16(in[0], in[7]);
+
+  // Second butterfly on the sums; u0..u3 feed the even outputs
+  // in[0], in[2], in[4], in[6].
+  u0 = _mm_add_epi16(s0, s3);
+  u1 = _mm_add_epi16(s1, s2);
+  u2 = _mm_sub_epi16(s1, s2);
+  u3 = _mm_sub_epi16(s0, s3);
+  // interleave and perform butterfly multiplication/addition
+  v0 = _mm_unpacklo_epi16(u0, u1);
+  v1 = _mm_unpackhi_epi16(u0, u1);
+  v2 = _mm_unpacklo_epi16(u2, u3);
+  v3 = _mm_unpackhi_epi16(u2, u3);
+
+  // Even-numbered u's hold the low four lanes, odd-numbered the high four.
+  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
+  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
+  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
+  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
+  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
+  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
+  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
+  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
+
+  // shift and rounding
+  // Add the rounding constant, then arithmetic-shift right by
+  // DCT_CONST_BITS to drop the fixed-point scale of the coefficients.
+  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // Narrow back to 16 bits (signed saturation) and store the even outputs.
+  // in[] may be overwritten here: the odd half below only reads s4..s7.
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[4] = _mm_packs_epi32(u2, u3);
+  in[6] = _mm_packs_epi32(u6, u7);
+
+  // stage 2
+  // interleave and perform butterfly multiplication/addition
+  // Rotate the middle differences s6 and s5 with the cospi_16_64 pairs.
+  u0 = _mm_unpacklo_epi16(s6, s5);
+  u1 = _mm_unpackhi_epi16(s6, s5);
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+
+  // shift and rounding
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+
+  // u0 is the p16_m16 product, u1 the p16_p16 product, both back in 16 bits.
+  u0 = _mm_packs_epi32(v0, v1);
+  u1 = _mm_packs_epi32(v2, v3);
+
+  // stage 3
+  // Combine the outer differences s4 and s7 with the rotated middle terms.
+  s0 = _mm_add_epi16(s4, u0);
+  s1 = _mm_sub_epi16(s4, u0);
+  s2 = _mm_sub_epi16(s7, u1);
+  s3 = _mm_add_epi16(s7, u1);
+
+  // stage 4
+  // Final rotations producing the odd outputs in[1], in[3], in[5], in[7].
+  u0 = _mm_unpacklo_epi16(s0, s3);
+  u1 = _mm_unpackhi_epi16(s0, s3);
+  u2 = _mm_unpacklo_epi16(s1, s2);
+  u3 = _mm_unpackhi_epi16(s1, s2);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
+  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
+  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
+  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
+  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
+  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
+  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
+  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
+
+  // shift and rounding
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  // Narrow to 16 bits (signed saturation) and store the odd outputs.
+  in[1] = _mm_packs_epi32(v0, v1);
+  in[3] = _mm_packs_epi32(v4, v5);
+  in[5] = _mm_packs_epi32(v2, v3);
+  in[7] = _mm_packs_epi32(v6, v7);
+
+  // transpose
+  // Source and destination are the same array; array_transpose_8x8 is
+  // called in place here.
+  array_transpose_8x8(in, in);
+}
+
+// One pass of an 8-point butterfly transform (a forward ADST, going by the
+// function name) over the eight vectors in[0..7], each holding eight 16-bit
+// lanes that are processed in parallel. Intermediate products are kept in
+// 32-bit lanes, rounded with DCT_CONST_ROUNDING / DCT_CONST_BITS, and packed
+// back to 16 bits with signed saturation. Results are written back into
+// in[0..7] and the 8x8 block is transposed in place before returning.
+static void fadst8_sse2(__m128i *in) {
+  // Constants
+  // Each k__cospi_* vector repeats a pair of 16-bit coefficients for use
+  // with _mm_madd_epi16 on interleaved inputs.
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__const_0 = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // properly aligned for butterfly input
+  // Permute the inputs into the order the first butterfly stage consumes
+  // them (pairs 7/0, 5/2, 3/4, 1/6). Copies are taken because in[] is
+  // overwritten with intermediate results further down.
+  in0  = in[7];
+  in1  = in[0];
+  in2  = in[5];
+  in3  = in[2];
+  in4  = in[3];
+  in5  = in[4];
+  in6  = in[1];
+  in7  = in[6];
+
+  // column transformation
+  // stage 1
+  // interleave and multiply/add into 32-bit integer
+  // Even-numbered s/u registers carry the low four lanes, odd-numbered ones
+  // the high four lanes of the same quantity.
+  s0 = _mm_unpacklo_epi16(in0, in1);
+  s1 = _mm_unpackhi_epi16(in0, in1);
+  s2 = _mm_unpacklo_epi16(in2, in3);
+  s3 = _mm_unpackhi_epi16(in2, in3);
+  s4 = _mm_unpacklo_epi16(in4, in5);
+  s5 = _mm_unpackhi_epi16(in4, in5);
+  s6 = _mm_unpacklo_epi16(in6, in7);
+  s7 = _mm_unpackhi_epi16(in6, in7);
+
+  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+  // addition
+  // Butterfly in 32-bit precision: w0..w7 are sums, w8..w15 differences of
+  // the rotated terms u[k] and u[k + 8].
+  w0 = _mm_add_epi32(u0, u8);
+  w1 = _mm_add_epi32(u1, u9);
+  w2 = _mm_add_epi32(u2, u10);
+  w3 = _mm_add_epi32(u3, u11);
+  w4 = _mm_add_epi32(u4, u12);
+  w5 = _mm_add_epi32(u5, u13);
+  w6 = _mm_add_epi32(u6, u14);
+  w7 = _mm_add_epi32(u7, u15);
+  w8 = _mm_sub_epi32(u0, u8);
+  w9 = _mm_sub_epi32(u1, u9);
+  w10 = _mm_sub_epi32(u2, u10);
+  w11 = _mm_sub_epi32(u3, u11);
+  w12 = _mm_sub_epi32(u4, u12);
+  w13 = _mm_sub_epi32(u5, u13);
+  w14 = _mm_sub_epi32(u6, u14);
+  w15 = _mm_sub_epi32(u7, u15);
+
+  // shift and rounding
+  // Add the rounding constant, then arithmetic-shift right by
+  // DCT_CONST_BITS to drop the fixed-point scale of the coefficients.
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+  // back to 16-bit and pack 8 integers into __m128i
+  // in[] is reused as scratch storage from here on; the original inputs
+  // were copied into in0..in7 above.
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[1] = _mm_packs_epi32(u2, u3);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[3] = _mm_packs_epi32(u6, u7);
+  in[4] = _mm_packs_epi32(u8, u9);
+  in[5] = _mm_packs_epi32(u10, u11);
+  in[6] = _mm_packs_epi32(u12, u13);
+  in[7] = _mm_packs_epi32(u14, u15);
+
+  // stage 2
+  // The first four scratch vectors only need a 16-bit add/sub butterfly;
+  // the last four are rotated by the cospi_8/cospi_24 pairs below.
+  s0 = _mm_add_epi16(in[0], in[2]);
+  s1 = _mm_add_epi16(in[1], in[3]);
+  s2 = _mm_sub_epi16(in[0], in[2]);
+  s3 = _mm_sub_epi16(in[1], in[3]);
+  u0 = _mm_unpacklo_epi16(in[4], in[5]);
+  u1 = _mm_unpackhi_epi16(in[4], in[5]);
+  u2 = _mm_unpacklo_epi16(in[6], in[7]);
+  u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+  // 32-bit butterfly on the rotated terms: w0..w3 sums, w4..w7 differences.
+  w0 = _mm_add_epi32(v0, v4);
+  w1 = _mm_add_epi32(v1, v5);
+  w2 = _mm_add_epi32(v2, v6);
+  w3 = _mm_add_epi32(v3, v7);
+  w4 = _mm_sub_epi32(v0, v4);
+  w5 = _mm_sub_epi32(v1, v5);
+  w6 = _mm_sub_epi32(v2, v6);
+  w7 = _mm_sub_epi32(v3, v7);
+
+  // Round and shift, as after stage 1.
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+  s4 = _mm_packs_epi32(u0, u1);
+  s5 = _mm_packs_epi32(u2, u3);
+  s6 = _mm_packs_epi32(u4, u5);
+  s7 = _mm_packs_epi32(u6, u7);
+
+  // stage 3
+  // Only the pairs (s2, s3) and (s6, s7) are rotated by cospi_16_64;
+  // s0, s1, s4 and s5 pass through to the output stage unchanged.
+  u0 = _mm_unpacklo_epi16(s2, s3);
+  u1 = _mm_unpackhi_epi16(s2, s3);
+  u2 = _mm_unpacklo_epi16(s6, s7);
+  u3 = _mm_unpackhi_epi16(s6, s7);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+  // Round and shift, as after the earlier stages.
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  s2 = _mm_packs_epi32(v0, v1);
+  s3 = _mm_packs_epi32(v2, v3);
+  s6 = _mm_packs_epi32(v4, v5);
+  s7 = _mm_packs_epi32(v6, v7);
+
+  // FIXME(jingning): do subtract using bit inversion?
+  // Output permutation; odd-indexed outputs are negated by subtracting
+  // from zero.
+  in[0] = s0;
+  in[1] = _mm_sub_epi16(k__const_0, s4);
+  in[2] = s6;
+  in[3] = _mm_sub_epi16(k__const_0, s2);
+  in[4] = s3;
+  in[5] = _mm_sub_epi16(k__const_0, s7);
+  in[6] = s5;
+  in[7] = _mm_sub_epi16(k__const_0, s1);
+
+  // transpose
+  // Source and destination are the same array; array_transpose_8x8 is
+  // called in place here.
+  array_transpose_8x8(in, in);
+}
+
+// 8x8 forward hybrid transform dispatcher.
+//   DCT_DCT   : delegated entirely to vpx_fdct8x8_sse2().
+//   ADST_DCT  : fadst8 pass, then fdct8 pass.
+//   DCT_ADST  : fdct8 pass, then fadst8 pass.
+//   ADST_ADST : two fadst8 passes.
+// For the three hybrid types the block is loaded from `input` (row pitch
+// `stride`), run through the two passes, scaled with right_shift_8x8(in, 1)
+// and written to `output` with a row pitch of 8. Any other tx_type trips
+// assert(0) and leaves `output` untouched.
+void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
+                     int stride, int tx_type) {
+  __m128i in[8];
+
+  if (tx_type == DCT_DCT) {
+    vpx_fdct8x8_sse2(input, output, stride);
+    return;
+  }
+
+  if (tx_type != ADST_DCT && tx_type != DCT_ADST && tx_type != ADST_ADST) {
+    assert(0);
+    return;
+  }
+
+  load_buffer_8x8(input, in, stride);
+
+  // First pass: only DCT_ADST starts with the DCT.
+  if (tx_type == DCT_ADST) {
+    fdct8_sse2(in);
+  } else {
+    fadst8_sse2(in);
+  }
+
+  // Second pass: only ADST_DCT finishes with the DCT.
+  if (tx_type == ADST_DCT) {
+    fdct8_sse2(in);
+  } else {
+    fadst8_sse2(in);
+  }
+
+  right_shift_8x8(in, 1);
+  write_buffer_8x8(output, in, 8);
+}
+
+// Loads a 16x16 block as four 8x8 tiles through load_buffer_8x8():
+// in0[0..15] receives columns 0-7 of rows 0-15, in1[0..15] receives
+// columns 8-15 of rows 0-15.
+static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
+                                     __m128i *in1, int stride) {
+  const int16_t *const left_cols = input;
+  const int16_t *const right_cols = input + 8;
+  const int lower_rows = 8 * stride;
+
+  // columns 0-7: upper tile, then lower tile
+  load_buffer_8x8(left_cols, in0, stride);
+  load_buffer_8x8(left_cols + lower_rows, in0 + 8, stride);
+
+  // columns 8-15: upper tile, then lower tile
+  load_buffer_8x8(right_cols, in1, stride);
+  load_buffer_8x8(right_cols + lower_rows, in1 + 8, stride);
+}
+
+// Stores a 16x16 block as four 8x8 tiles through write_buffer_8x8():
+// in0[0..15] supplies columns 0-7 of rows 0-15, in1[0..15] supplies
+// columns 8-15 of rows 0-15.
+static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
+                                      __m128i *in1, int stride) {
+  tran_low_t *const left_cols = output;
+  tran_low_t *const right_cols = output + 8;
+  const int lower_rows = 8 * stride;
+
+  // columns 0-7: upper tile, then lower tile
+  write_buffer_8x8(left_cols, in0, stride);
+  write_buffer_8x8(left_cols + lower_rows, in0 + 8, stride);
+
+  // columns 8-15: upper tile, then lower tile
+  write_buffer_8x8(right_cols, in1, stride);
+  write_buffer_8x8(right_cols + lower_rows, in1 + 8, stride);
+}
+
+// Transposes a 16x16 block held as two 16-vector column halves (res0 and
+// res1) by transposing its four 8x8 tiles and swapping the two
+// off-diagonal ones.
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+  __m128i saved_tile[8];
+  int k;
+
+  // Diagonal tile: transposed where it sits.
+  array_transpose_8x8(res0, res0);
+  // Park the transposed res1[0..7] tile, since the next call overwrites
+  // res1[0..7] with the transposed res0[8..15] tile.
+  array_transpose_8x8(res1, saved_tile);
+  array_transpose_8x8(res0 + 8, res1);
+  // Other diagonal tile: transposed where it sits.
+  array_transpose_8x8(res1 + 8, res1 + 8);
+
+  // Drop the parked tile into the slot vacated in res0[8..15].
+  for (k = 0; k < 8; ++k) {
+    res0[8 + k] = saved_tile[k];
+  }
+}
+
+// Applies right_shift_8x8(..., 2) to all sixteen vectors of res0 and then
+// to all sixteen vectors of res1, eight vectors at a time.
+static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
+  int offset;
+
+  for (offset = 0; offset < 16; offset += 8) {
+    right_shift_8x8(res0 + offset, 2);
+  }
+  for (offset = 0; offset < 16; offset += 8) {
+    right_shift_8x8(res1 + offset, 2);
+  }
+}
+
+static void fdct16_8col(__m128i *in) {
+  // perform 16x16 1-D DCT for 8 columns; in[0..15] is transformed in place
+  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  // stage 1: sums of mirrored inputs, i[k] = in[k] + in[15 - k]
+  i[0] = _mm_add_epi16(in[0], in[15]);
+  i[1] = _mm_add_epi16(in[1], in[14]);
+  i[2] = _mm_add_epi16(in[2], in[13]);
+  i[3] = _mm_add_epi16(in[3], in[12]);
+  i[4] = _mm_add_epi16(in[4], in[11]);
+  i[5] = _mm_add_epi16(in[5], in[10]);
+  i[6] = _mm_add_epi16(in[6], in[9]);
+  i[7] = _mm_add_epi16(in[7], in[8]);
+  // differences of mirrored inputs, s[k] = in[7 - k] - in[8 + k]
+  s[0] = _mm_sub_epi16(in[7], in[8]);
+  s[1] = _mm_sub_epi16(in[6], in[9]);
+  s[2] = _mm_sub_epi16(in[5], in[10]);
+  s[3] = _mm_sub_epi16(in[4], in[11]);
+  s[4] = _mm_sub_epi16(in[3], in[12]);
+  s[5] = _mm_sub_epi16(in[2], in[13]);
+  s[6] = _mm_sub_epi16(in[1], in[14]);
+  s[7] = _mm_sub_epi16(in[0], in[15]);
+  // butterfly the sums: p[k] = i[k] + i[7-k], p[7-k] = i[k] - i[7-k], k < 4
+  p[0] = _mm_add_epi16(i[0], i[7]);
+  p[1] = _mm_add_epi16(i[1], i[6]);
+  p[2] = _mm_add_epi16(i[2], i[5]);
+  p[3] = _mm_add_epi16(i[3], i[4]);
+  p[4] = _mm_sub_epi16(i[3], i[4]);
+  p[5] = _mm_sub_epi16(i[2], i[5]);
+  p[6] = _mm_sub_epi16(i[1], i[6]);
+  p[7] = _mm_sub_epi16(i[0], i[7]);
+  // same on p[0..3]: u[k] = p[k] + p[3-k], u[3-k] = p[k] - p[3-k], k < 2
+  u[0] = _mm_add_epi16(p[0], p[3]);
+  u[1] = _mm_add_epi16(p[1], p[2]);
+  u[2] = _mm_sub_epi16(p[1], p[2]);
+  u[3] = _mm_sub_epi16(p[0], p[3]);
+  // interleave 16-bit lanes so that each madd below sums a two-term product
+  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
+  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
+  v[3] = _mm_unpackhi_epi16(u[2], u[3]);
+  // multiply by the cosine constant pairs, accumulating into 32-bit lanes
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
+  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
+  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
+  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
+  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
+  // round: add DCT_CONST_ROUNDING, then shift right by DCT_CONST_BITS
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  // pack to 16 bits with signed saturation; store outputs 0, 4, 8 and 12
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[4] = _mm_packs_epi32(u[4], u[5]);
+  in[8] = _mm_packs_epi32(u[2], u[3]);
+  in[12] = _mm_packs_epi32(u[6], u[7]);
+  // combine p[5] and p[6] using the +/-cospi_16_64 constants
+  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+  u[0] = _mm_packs_epi32(v[0], v[1]);
+  u[1] = _mm_packs_epi32(v[2], v[3]);
+  // butterfly the rounded results with p[4] and p[7]
+  t[0] = _mm_add_epi16(p[4], u[0]);
+  t[1] = _mm_sub_epi16(p[4], u[0]);
+  t[2] = _mm_sub_epi16(p[7], u[1]);
+  t[3] = _mm_add_epi16(p[7], u[1]);
+
+  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
+  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
+  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
+  u[3] = _mm_unpackhi_epi16(t[1], t[2]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  // store outputs 2, 6, 10 and 14
+  in[2] = _mm_packs_epi32(v[0], v[1]);
+  in[6] = _mm_packs_epi32(v[4], v[5]);
+  in[10] = _mm_packs_epi32(v[2], v[3]);
+  in[14] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 2: combine (s[2], s[5]) and (s[3], s[4]) using +/-cospi_16_64
+  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[3] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[2] = _mm_packs_epi32(v[0], v[1]);
+  t[3] = _mm_packs_epi32(v[2], v[3]);
+  t[4] = _mm_packs_epi32(v[4], v[5]);
+  t[5] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 3: butterfly s[0], s[1], s[6], s[7] with the stage 2 results t[2..5]
+  p[0] = _mm_add_epi16(s[0], t[3]);
+  p[1] = _mm_add_epi16(s[1], t[2]);
+  p[2] = _mm_sub_epi16(s[1], t[2]);
+  p[3] = _mm_sub_epi16(s[0], t[3]);
+  p[4] = _mm_sub_epi16(s[7], t[4]);
+  p[5] = _mm_sub_epi16(s[6], t[5]);
+  p[6] = _mm_add_epi16(s[6], t[5]);
+  p[7] = _mm_add_epi16(s[7], t[4]);
+
+  // stage 4: combine (p[1], p[6]) and (p[2], p[5]) using cospi_8/24 constants
+  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
+  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
+  u[3] = _mm_unpackhi_epi16(p[2], p[5]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[1] = _mm_packs_epi32(v[0], v[1]);
+  t[2] = _mm_packs_epi32(v[2], v[3]);
+  t[5] = _mm_packs_epi32(v[4], v[5]);
+  t[6] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 5: butterfly p[0], p[3], p[4], p[7] with the stage 4 results
+  s[0] = _mm_add_epi16(p[0], t[1]);
+  s[1] = _mm_sub_epi16(p[0], t[1]);
+  s[2] = _mm_add_epi16(p[3], t[2]);
+  s[3] = _mm_sub_epi16(p[3], t[2]);
+  s[4] = _mm_sub_epi16(p[4], t[5]);
+  s[5] = _mm_add_epi16(p[4], t[5]);
+  s[6] = _mm_sub_epi16(p[7], t[6]);
+  s[7] = _mm_add_epi16(p[7], t[6]);
+
+  // stage 6: pair s[k] with s[7 - k] and apply the final cosine constants
+  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
+  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
+  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
+  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
+  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[7] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
+  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
+  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
+  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
+  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
+  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
+  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
+  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
+  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
+  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
+  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+  // pack to 16 bits with signed saturation; these are the odd outputs
+  in[1]  = _mm_packs_epi32(v[0], v[1]);
+  in[9]  = _mm_packs_epi32(v[2], v[3]);
+  in[5]  = _mm_packs_epi32(v[4], v[5]);
+  in[13] = _mm_packs_epi32(v[6], v[7]);
+  in[3]  = _mm_packs_epi32(v[8], v[9]);
+  in[11] = _mm_packs_epi32(v[10], v[11]);
+  in[7]  = _mm_packs_epi32(v[12], v[13]);
+  in[15] = _mm_packs_epi32(v[14], v[15]);
+}
+
+static void fadst16_8col(__m128i *in) {
+  // perform 16x16 1-D ADST for 8 columns
+  __m128i s[16], x[16], u[32], v[32];
+  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kZero = _mm_set1_epi16(0);
+
+  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+  u[0] = _mm_add_epi32(v[0], v[16]);
+  u[1] = _mm_add_epi32(v[1], v[17]);
+  u[2] = _mm_add_epi32(v[2], v[18]);
+  u[3] = _mm_add_epi32(v[3], v[19]);
+  u[4] = _mm_add_epi32(v[4], v[20]);
+  u[5] = _mm_add_epi32(v[5], v[21]);
+  u[6] = _mm_add_epi32(v[6], v[22]);
+  u[7] = _mm_add_epi32(v[7], v[23]);
+  u[8] = _mm_add_epi32(v[8], v[24]);
+  u[9] = _mm_add_epi32(v[9], v[25]);
+  u[10] = _mm_add_epi32(v[10], v[26]);
+  u[11] = _mm_add_epi32(v[11], v[27]);
+  u[12] = _mm_add_epi32(v[12], v[28]);
+  u[13] = _mm_add_epi32(v[13], v[29]);
+  u[14] = _mm_add_epi32(v[14], v[30]);
+  u[15] = _mm_add_epi32(v[15], v[31]);
+  u[16] = _mm_sub_epi32(v[0], v[16]);
+  u[17] = _mm_sub_epi32(v[1], v[17]);
+  u[18] = _mm_sub_epi32(v[2], v[18]);
+  u[19] = _mm_sub_epi32(v[3], v[19]);
+  u[20] = _mm_sub_epi32(v[4], v[20]);
+  u[21] = _mm_sub_epi32(v[5], v[21]);
+  u[22] = _mm_sub_epi32(v[6], v[22]);
+  u[23] = _mm_sub_epi32(v[7], v[23]);
+  u[24] = _mm_sub_epi32(v[8], v[24]);
+  u[25] = _mm_sub_epi32(v[9], v[25]);
+  u[26] = _mm_sub_epi32(v[10], v[26]);
+  u[27] = _mm_sub_epi32(v[11], v[27]);
+  u[28] = _mm_sub_epi32(v[12], v[28]);
+  u[29] = _mm_sub_epi32(v[13], v[29]);
+  u[30] = _mm_sub_epi32(v[14], v[30]);
+  u[31] = _mm_sub_epi32(v[15], v[31]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_packs_epi32(u[8], u[9]);
+  s[5] = _mm_packs_epi32(u[10], u[11]);
+  s[6] = _mm_packs_epi32(u[12], u[13]);
+  s[7] = _mm_packs_epi32(u[14], u[15]);
+  s[8] = _mm_packs_epi32(u[16], u[17]);
+  s[9] = _mm_packs_epi32(u[18], u[19]);
+  s[10] = _mm_packs_epi32(u[20], u[21]);
+  s[11] = _mm_packs_epi32(u[22], u[23]);
+  s[12] = _mm_packs_epi32(u[24], u[25]);
+  s[13] = _mm_packs_epi32(u[26], u[27]);
+  s[14] = _mm_packs_epi32(u[28], u[29]);
+  s[15] = _mm_packs_epi32(u[30], u[31]);
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], v[8]);
+  u[1] = _mm_add_epi32(v[1], v[9]);
+  u[2] = _mm_add_epi32(v[2], v[10]);
+  u[3] = _mm_add_epi32(v[3], v[11]);
+  u[4] = _mm_add_epi32(v[4], v[12]);
+  u[5] = _mm_add_epi32(v[5], v[13]);
+  u[6] = _mm_add_epi32(v[6], v[14]);
+  u[7] = _mm_add_epi32(v[7], v[15]);
+  u[8] = _mm_sub_epi32(v[0], v[8]);
+  u[9] = _mm_sub_epi32(v[1], v[9]);
+  u[10] = _mm_sub_epi32(v[2], v[10]);
+  u[11] = _mm_sub_epi32(v[3], v[11]);
+  u[12] = _mm_sub_epi32(v[4], v[12]);
+  u[13] = _mm_sub_epi32(v[5], v[13]);
+  u[14] = _mm_sub_epi32(v[6], v[14]);
+  u[15] = _mm_sub_epi32(v[7], v[15]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+  x[0] = _mm_add_epi16(s[0], s[4]);
+  x[1] = _mm_add_epi16(s[1], s[5]);
+  x[2] = _mm_add_epi16(s[2], s[6]);
+  x[3] = _mm_add_epi16(s[3], s[7]);
+  x[4] = _mm_sub_epi16(s[0], s[4]);
+  x[5] = _mm_sub_epi16(s[1], s[5]);
+  x[6] = _mm_sub_epi16(s[2], s[6]);
+  x[7] = _mm_sub_epi16(s[3], s[7]);
+  x[8] = _mm_packs_epi32(u[0], u[1]);
+  x[9] = _mm_packs_epi32(u[2], u[3]);
+  x[10] = _mm_packs_epi32(u[4], u[5]);
+  x[11] = _mm_packs_epi32(u[6], u[7]);
+  x[12] = _mm_packs_epi32(u[8], u[9]);
+  x[13] = _mm_packs_epi32(u[10], u[11]);
+  x[14] = _mm_packs_epi32(u[12], u[13]);
+  x[15] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], v[4]);
+  u[1] = _mm_add_epi32(v[1], v[5]);
+  u[2] = _mm_add_epi32(v[2], v[6]);
+  u[3] = _mm_add_epi32(v[3], v[7]);
+  u[4] = _mm_sub_epi32(v[0], v[4]);
+  u[5] = _mm_sub_epi32(v[1], v[5]);
+  u[6] = _mm_sub_epi32(v[2], v[6]);
+  u[7] = _mm_sub_epi32(v[3], v[7]);
+  u[8] = _mm_add_epi32(v[8], v[12]);
+  u[9] = _mm_add_epi32(v[9], v[13]);
+  u[10] = _mm_add_epi32(v[10], v[14]);
+  u[11] = _mm_add_epi32(v[11], v[15]);
+  u[12] = _mm_sub_epi32(v[8], v[12]);
+  u[13] = _mm_sub_epi32(v[9], v[13]);
+  u[14] = _mm_sub_epi32(v[10], v[14]);
+  u[15] = _mm_sub_epi32(v[11], v[15]);
+
+  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_add_epi16(x[0], x[2]);
+  s[1] = _mm_add_epi16(x[1], x[3]);
+  s[2] = _mm_sub_epi16(x[0], x[2]);
+  s[3] = _mm_sub_epi16(x[1], x[3]);
+  s[4] = _mm_packs_epi32(v[0], v[1]);
+  s[5] = _mm_packs_epi32(v[2], v[3]);
+  s[6] = _mm_packs_epi32(v[4], v[5]);
+  s[7] = _mm_packs_epi32(v[6], v[7]);
+  s[8] = _mm_add_epi16(x[8], x[10]);
+  s[9] = _mm_add_epi16(x[9], x[11]);
+  s[10] = _mm_sub_epi16(x[8], x[10]);
+  s[11] = _mm_sub_epi16(x[9], x[11]);
+  s[12] = _mm_packs_epi32(v[8], v[9]);
+  s[13] = _mm_packs_epi32(v[10], v[11]);
+  s[14] = _mm_packs_epi32(v[12], v[13]);
+  s[15] = _mm_packs_epi32(v[14], v[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  in[0] = s[0];
+  in[1] = _mm_sub_epi16(kZero, s[8]);
+  in[2] = s[12];
+  in[3] = _mm_sub_epi16(kZero, s[4]);
+  in[4] = _mm_packs_epi32(v[4], v[5]);
+  in[5] = _mm_packs_epi32(v[12], v[13]);
+  in[6] = _mm_packs_epi32(v[8], v[9]);
+  in[7] = _mm_packs_epi32(v[0], v[1]);
+  in[8] = _mm_packs_epi32(v[2], v[3]);
+  in[9] = _mm_packs_epi32(v[10], v[11]);
+  in[10] = _mm_packs_epi32(v[14], v[15]);
+  in[11] = _mm_packs_epi32(v[6], v[7]);
+  in[12] = s[5];
+  in[13] = _mm_sub_epi16(kZero, s[13]);
+  in[14] = s[9];
+  in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+static void fdct16_sse2(__m128i *in0, __m128i *in1) {
+  fdct16_8col(in0);  // presumably a 16-point 1-D DCT on the 8 columns in in0
+  fdct16_8col(in1);  // same for the other 8 columns, held in in1
+  array_transpose_16x16(in0, in1);  // transpose so a second call does the other dimension
+}
+
+static void fadst16_sse2(__m128i *in0, __m128i *in1) {
+  fadst16_8col(in0);  // in-place on in0[0..15]; presumably a 16-point 1-D ADST
+  fadst16_8col(in1);  // same for the other 8 columns, held in in1
+  array_transpose_16x16(in0, in1);  // transpose so a second call does the other dimension
+}
+
+void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
+                       int stride, int tx_type) {
+  // Hybrid 16x16 forward transform. Apart from DCT_DCT, it runs two 1-D passes
+  // (each finishing with a transpose) with right_shift_16x16 in between; the
+  // first pass is ADST unless tx_type is DCT_ADST, the second unless ADST_DCT.
+  const int first_is_adst = tx_type != DCT_ADST;
+  const int second_is_adst = tx_type != ADST_DCT;
+  __m128i in0[16], in1[16];
+
+  if (tx_type == DCT_DCT) {
+    // Pure DCT is handled by a single dedicated call.
+    vpx_fdct16x16_sse2(input, output, stride);
+    return;
+  }
+  if (tx_type != ADST_DCT && tx_type != DCT_ADST && tx_type != ADST_ADST) {
+    // Unknown transform type: trap in debug builds, otherwise do nothing.
+    assert(0);
+    return;
+  }
+
+  load_buffer_16x16(input, in0, in1, stride);
+  if (first_is_adst) {
+    fadst16_sse2(in0, in1);
+  } else {
+    fdct16_sse2(in0, in1);
+  }
+  right_shift_16x16(in0, in1);
+  if (second_is_adst) {
+    fadst16_sse2(in0, in1);
+  } else {
+    fdct16_sse2(in0, in1);
+  }
+  write_buffer_16x16(output, in0, in1, 16);
+}
diff --git a/libs/libvpx/vp10/encoder/x86/dct_ssse3.c b/libs/libvpx/vp10/encoder/x86/dct_ssse3.c
new file mode 100644
index 0000000000..df298d8711
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/x86/dct_ssse3.c
@@ -0,0 +1,472 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#if defined(_MSC_VER) && _MSC_VER <= 1500
+// Need to include math.h before calling tmmintrin.h/intrin.h
+// in certain versions of MSVS.
+#include <math.h>
+#endif
+#include <tmmintrin.h>  // SSSE3
+
+#include "./vp10_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+// Fused 8x8 forward DCT, quantization, dequantization and eob search.
+void vp10_fdct8x8_quant_ssse3(const int16_t *input, int stride,
+                             int16_t* coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t* zbin_ptr,
+                             const int16_t* round_ptr, const int16_t* quant_ptr,
+                             const int16_t* quant_shift_ptr,
+                             int16_t* qcoeff_ptr,
+                             int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                             uint16_t* eob_ptr,
+                             const int16_t* scan_ptr,
+                             const int16_t* iscan_ptr) {
+  __m128i zero;
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);  // presumably 2 * cospi_16_64, pre-scaled for mulhrs - confirm
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // Load input (aligned loads: one 8-sample row per register)
+  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  __m128i *in[8];  // pointers to in0..in7 so the quantizer can index rows
+  int index = 0;
+
+  (void)scan_ptr;  // unused in this implementation
+  (void)zbin_ptr;  // unused in this implementation
+  (void)quant_shift_ptr;  // unused in this implementation
+  (void)coeff_ptr;  // unused: transform output is quantized straight from registers
+
+  // Pre-condition input (shift left by two, i.e. multiply by 4)
+  in0 = _mm_slli_epi16(in0, 2);
+  in1 = _mm_slli_epi16(in1, 2);
+  in2 = _mm_slli_epi16(in2, 2);
+  in3 = _mm_slli_epi16(in3, 2);
+  in4 = _mm_slli_epi16(in4, 2);
+  in5 = _mm_slli_epi16(in5, 2);
+  in6 = _mm_slli_epi16(in6, 2);
+  in7 = _mm_slli_epi16(in7, 2);
+
+  in[0] = &in0;
+  in[1] = &in1;
+  in[2] = &in2;
+  in[3] = &in3;
+  in[4] = &in4;
+  in[5] = &in5;
+  in[6] = &in6;
+  in[7] = &in7;
+
+  // We do two passes, first the columns, then the rows. The results of the
+  // first pass are transposed so that the same column code can be reused. The
+  // results of the second pass are also transposed so that the rows (processed
+  // as columns) are put back in row positions.
+  for (pass = 0; pass < 2; pass++) {
+    // To store results of each pass before the transpose.
+    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+    // Add/subtract
+    const __m128i q0 = _mm_add_epi16(in0, in7);
+    const __m128i q1 = _mm_add_epi16(in1, in6);
+    const __m128i q2 = _mm_add_epi16(in2, in5);
+    const __m128i q3 = _mm_add_epi16(in3, in4);
+    const __m128i q4 = _mm_sub_epi16(in3, in4);
+    const __m128i q5 = _mm_sub_epi16(in2, in5);
+    const __m128i q6 = _mm_sub_epi16(in1, in6);
+    const __m128i q7 = _mm_sub_epi16(in0, in7);
+    // Work on first four results (even outputs: res0, res2, res4, res6)
+    {
+      // Add/subtract
+      const __m128i r0 = _mm_add_epi16(q0, q3);
+      const __m128i r1 = _mm_add_epi16(q1, q2);
+      const __m128i r2 = _mm_sub_epi16(q1, q2);
+      const __m128i r3 = _mm_sub_epi16(q0, q3);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+      // dct_const_round_shift: add the rounding constant, then shift right
+
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine: pack the 32-bit results back to 16 bits with signed saturation
+
+      res0 = _mm_packs_epi32(w0, w1);
+      res4 = _mm_packs_epi32(w2, w3);
+      res2 = _mm_packs_epi32(w4, w5);
+      res6 = _mm_packs_epi32(w6, w7);
+    }
+    // Work on next four results (odd outputs: res1, res3, res5, res7)
+    {
+      // Butterfly q5/q6, then scale with a rounding 16-bit multiply (mulhrs)
+      const __m128i d0 = _mm_sub_epi16(q6, q5);
+      const __m128i d1 = _mm_add_epi16(q6, q5);
+      const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
+      const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
+
+      // Add/subtract
+      const __m128i x0 = _mm_add_epi16(q4, r0);
+      const __m128i x1 = _mm_sub_epi16(q4, r0);
+      const __m128i x2 = _mm_sub_epi16(q7, r1);
+      const __m128i x3 = _mm_add_epi16(q7, r1);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+      // dct_const_round_shift: add the rounding constant, then shift right
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine: pack the 32-bit results back to 16 bits with signed saturation
+      res1 = _mm_packs_epi32(w0, w1);
+      res7 = _mm_packs_epi32(w2, w3);
+      res5 = _mm_packs_epi32(w4, w5);
+      res3 = _mm_packs_epi32(w6, w7);
+    }
+    // Transpose the 8x8.
+    {
+      // 00 01 02 03 04 05 06 07
+      // 10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27
+      // 30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47
+      // 50 51 52 53 54 55 56 57
+      // 60 61 62 63 64 65 66 67
+      // 70 71 72 73 74 75 76 77
+      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+      // 00 10 01 11 02 12 03 13
+      // 20 30 21 31 22 32 23 33
+      // 04 14 05 15 06 16 07 17
+      // 24 34 25 35 26 36 27 37
+      // 40 50 41 51 42 52 43 53
+      // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+      // 64 74 65 75 66 76 67 77
+      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+      // 00 10 20 30 01 11 21 31
+      // 40 50 60 70 41 51 61 71
+      // 02 12 22 32 03 13 23 33
+      // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+      // 06 16 26 36 07 17 27 37
+      // 46 56 66 76 47 57 67 77
+      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }
+  // Post-condition output and store it
+  {
+    // Post-condition (division by two)
+    //    divide signed 16-bit numbers by two using shifts, rounding toward 0
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+    in0 = _mm_sub_epi16(in0, sign_in0);
+    in1 = _mm_sub_epi16(in1, sign_in1);
+    in2 = _mm_sub_epi16(in2, sign_in2);
+    in3 = _mm_sub_epi16(in3, sign_in3);
+    in4 = _mm_sub_epi16(in4, sign_in4);
+    in5 = _mm_sub_epi16(in5, sign_in5);
+    in6 = _mm_sub_epi16(in6, sign_in6);
+    in7 = _mm_sub_epi16(in7, sign_in7);
+    in0 = _mm_srai_epi16(in0, 1);
+    in1 = _mm_srai_epi16(in1, 1);
+    in2 = _mm_srai_epi16(in2, 1);
+    in3 = _mm_srai_epi16(in3, 1);
+    in4 = _mm_srai_epi16(in4, 1);
+    in5 = _mm_srai_epi16(in5, 1);
+    in6 = _mm_srai_epi16(in6, 1);
+    in7 = _mm_srai_epi16(in7, 1);
+  }
+
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;  // pointers now sit at the end; index counts up from -n to 0
+  zero = _mm_setzero_si128();
+
+  if (!skip_block) {
+    __m128i eob;
+    __m128i round, quant, dequant, thr;
+    int16_t nzflag;
+    {
+      __m128i coeff0, coeff1;
+
+      // Setup global values
+      {
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        // Do DC and first 15 AC
+        coeff0 = *in[0];
+        coeff1 = *in[1];
+
+        // Poor man's sign extract: sign = x >> 15, then |x| = (x ^ sign) - sign
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);  // from here on use the upper-half values (presumably AC)
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);  // same upper-half broadcast as round
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);  // same upper-half broadcast as round
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one (subtract the -1 mask) to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // AC only loop: 16 coefficients (two rows) per iteration
+    index = 2;
+    thr = _mm_srai_epi16(dequant, 1);  // skip threshold: dequant / 2
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+
+        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
+        coeff0 = *in[index];
+        coeff1 = *in[index + 1];
+
+        // Poor man's sign extract: sign = x >> 15, then |x| = (x ^ sign) - sign
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+            _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));  // nonzero if any |coeff| > thr
+
+        if (nzflag) {
+          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+          // Reinsert signs
+          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        } else {  // nothing above thr: store zeros and skip the eob scan
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+        }
+      }
+
+      if (nzflag) {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one (subtract the -1 mask) to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+      index += 2;
+    }
+
+    // Accumulate EOB: horizontal max over the eight 16-bit lanes, read from lane 1
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);
+    }
+  } else {  // skip_block: zero both output buffers and report eob 0
+    do {
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}
diff --git a/libs/libvpx/vp10/encoder/x86/denoiser_sse2.c b/libs/libvpx/vp10/encoder/x86/denoiser_sse2.c
new file mode 100644
index 0000000000..047974ef80
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/x86/denoiser_sse2.c
@@ -0,0 +1,375 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "./vp10_rtcd.h"
+
+#include "vpx_ports/emmintrin_compat.h"
+#include "vpx/vpx_integer.h"
+#include "vp10/common/reconinter.h"
+#include "vp10/encoder/context_tree.h"
+#include "vp10/encoder/denoiser.h"
+#include "vpx_mem/vpx_mem.h"
+
+// Horizontally add the 16 signed bytes held in acc_diff and return the total.
+static INLINE int sum_diff_16x1(__m128i acc_diff) {
+  // Sign-extend each byte to 16 bits: duplicate it into both halves of a
+  // word, then arithmetic-shift the word right by 8.
+  const __m128i dup_lo = _mm_unpacklo_epi8(acc_diff, acc_diff);
+  const __m128i dup_hi = _mm_unpackhi_epi8(acc_diff, acc_diff);
+  const __m128i words = _mm_add_epi16(_mm_srai_epi16(dup_lo, 8),
+                                      _mm_srai_epi16(dup_hi, 8));
+  // Multiplying by 1 with madd adds adjacent words into four 32-bit sums.
+  const __m128i sum4 = _mm_madd_epi16(words, _mm_set1_epi16(1));
+  // Fold the upper 64 bits onto the lower, then lane 1 onto lane 0.
+  const __m128i sum2 = _mm_add_epi32(sum4, _mm_srli_si128(sum4, 8));
+  const __m128i sum1 = _mm_add_epi32(sum2, _mm_srli_si128(sum2, 4));
+  return _mm_cvtsi128_si32(sum1);
+}
+
+// Denoise a 16x1 vector: stores the filtered pixels and returns acc_diff.
+static INLINE __m128i vp10_denoiser_16x1_sse2(const uint8_t *sig,
+                                             const uint8_t *mc_running_avg_y,
+                                             uint8_t *running_avg_y,
+                                             const __m128i *k_0,
+                                             const __m128i *k_4,
+                                             const __m128i *k_8,
+                                             const __m128i *k_16,
+                                             const __m128i *l3,
+                                             const __m128i *l32,
+                                             const __m128i *l21,
+                                             __m128i acc_diff) {
+  // Calculate differences (unaligned 16-byte loads, one byte per pixel).
+  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+  const __m128i v_mc_running_avg_y =
+      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+  __m128i v_running_avg_y;
+  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);  // mc - sig, floored at 0
+  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);  // sig - mc, floored at 0
+  // Obtain the sign. FF where pdiff equals *k_0 (callers pass zero: mc <= sig).
+  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
+  // Clamp absolute difference to *k_16 to be used to get mask. Doing this
+  // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
+  const __m128i clamped_absdiff =
+      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
+  // Get masks for l2 l1 and l0 adjustments (FF where absdiff is below the k).
+  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
+  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
+  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
+  // Get adjustments for l2, l1, and l0.
+  __m128i adj2 = _mm_and_si128(mask2, *l32);
+  const __m128i adj1 = _mm_and_si128(mask1, *l21);
+  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+  __m128i adj,  padj, nadj;
+
+  // Combine the adjustments and get absolute adjustments.
+  adj2 = _mm_add_epi8(adj2, adj1);
+  adj = _mm_sub_epi8(*l3, adj2);  // *l3 less the level deltas selected above
+  adj = _mm_andnot_si128(mask0, adj);  // below *k_4 the level value is dropped...
+  adj = _mm_or_si128(adj, adj0);  // ...and the absolute difference is used instead
+
+  // Restore the sign and get positive and negative adjustments.
+  padj = _mm_andnot_si128(diff_sign, adj);  // lanes where diff_sign is clear
+  nadj = _mm_and_si128(diff_sign, adj);  // lanes where diff_sign is set
+
+  // Calculate filtered value: sig + padj - nadj with unsigned saturation.
+  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+  // Accumulate the signed adjustments with signed saturation. NOTE(review): the
+  // original note said adjustments are <= 7; callers here pass *l3 up to 8.
+  acc_diff = _mm_adds_epi8(acc_diff, padj);
+  acc_diff = _mm_subs_epi8(acc_diff, nadj);
+  return acc_diff;
+}
+
+// Denoise a 16x1 vector with a weaker filter; updates running_avg_y in place.
+static INLINE __m128i vp10_denoiser_adj_16x1_sse2(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y,
+    uint8_t *running_avg_y, const __m128i k_0,
+    const __m128i k_delta, __m128i acc_diff) {
+  __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
+  // Calculate differences.
+  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+  const __m128i v_mc_running_avg_y =
+      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);  // mc - sig, floored at 0
+  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);  // sig - mc, floored at 0
+  // Obtain the sign. FF where pdiff equals k_0 (callers pass zero: mc <= sig).
+  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+  // Clamp absolute difference to delta to get the adjustment.
+  const __m128i adj =
+      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+  // Restore the sign and get positive and negative adjustments.
+  __m128i padj, nadj;
+  padj = _mm_andnot_si128(diff_sign, adj);
+  nadj = _mm_and_si128(diff_sign, adj);
+  // Calculate filtered value: the stored average minus padj plus nadj.
+  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
+  v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
+  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+  // Accumulate the adjustments with the same signs as applied just above.
+  acc_diff = _mm_subs_epi8(acc_diff, padj);
+  acc_diff = _mm_adds_epi8(acc_diff, nadj);
+  return acc_diff;
+}
+
+// Denoiser for 4xM and 8xM blocks. Returns FILTER_BLOCK or COPY_BLOCK.
+static int vp10_denoiser_NxM_sse2_small(
+    const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y,
+    int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride,
+    int increase_denoising, BLOCK_SIZE bs, int motion_magnitude, int width) {
+  int sum_diff_thresh, r, sum_diff = 0;
+  const int shift_inc  = (increase_denoising &&
+                          motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+                         1 : 0;
+  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+  __m128i acc_diff = _mm_setzero_si128();
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Modify each level's adjustment according to motion_magnitude.
+  const __m128i l3 = _mm_set1_epi8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+  // Difference between level 3 and level 2 is 2.
+  const __m128i l32 = _mm_set1_epi8(2);
+  // Difference between level 2 and level 1 is 1.
+  const __m128i l21 = _mm_set1_epi8(1);
+  const uint8_t shift = (width == 4) ? 2 : 1;
+  // Each pass packs (1 << shift) rows of width pixels into one 16-byte vector.
+  for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
+    memcpy(sig_buffer[r], sig, width);
+    memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+    memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+    memcpy(mc_running_buffer[r] + width,
+           mc_running_avg_y + mc_avg_y_stride, width);
+    memcpy(running_buffer[r], running_avg_y, width);
+    memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+    if (width == 4) {  // two more rows are needed to fill 16 bytes
+      memcpy(sig_buffer[r] + width * 2, sig + sig_stride * 2, width);
+      memcpy(sig_buffer[r] + width * 3, sig + sig_stride * 3, width);
+      memcpy(mc_running_buffer[r] + width * 2,
+             mc_running_avg_y + mc_avg_y_stride * 2, width);
+      memcpy(mc_running_buffer[r] + width * 3,
+             mc_running_avg_y + mc_avg_y_stride * 3, width);
+      memcpy(running_buffer[r] + width * 2,
+             running_avg_y + avg_y_stride * 2, width);
+      memcpy(running_buffer[r] + width * 3,
+             running_avg_y + avg_y_stride * 3, width);
+    }
+    acc_diff = vp10_denoiser_16x1_sse2(sig_buffer[r],
+                                      mc_running_buffer[r],
+                                      running_buffer[r],
+                                      &k_0, &k_4, &k_8, &k_16,
+                                      &l3, &l32, &l21, acc_diff);
+    memcpy(running_avg_y, running_buffer[r], width);  // scatter the result rows back
+    memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
+    if (width == 4) {
+      memcpy(running_avg_y + avg_y_stride * 2,
+             running_buffer[r] + width * 2, width);
+      memcpy(running_avg_y + avg_y_stride * 3,
+             running_buffer[r] + width * 3, width);
+    }
+    // Update pointers for next iteration: skip the rows just consumed.
+    sig += (sig_stride << shift);
+    mc_running_avg_y += (mc_avg_y_stride << shift);
+    running_avg_y += (avg_y_stride << shift);
+  }
+
+  {
+    sum_diff = sum_diff_16x1(acc_diff);
+    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising),
+      // check if we can still apply some (weaker) temporal filtering to
+      // this block, that would otherwise not be denoised at all. Simplest
+      // is to apply an additional adjustment to running_avg_y to bring it
+      // closer to sig. The adjustment is capped by a maximum delta, and
+      // chosen such that in most cases the resulting sum_diff will be
+      // within the acceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over the
+      // threshold.
+      const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
+                         num_pels_log2_lookup[bs]) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const __m128i k_delta = _mm_set1_epi8(delta);
+        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);  // rewind
+        for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
+          acc_diff = vp10_denoiser_adj_16x1_sse2(
+              sig_buffer[r], mc_running_buffer[r], running_buffer[r],
+              k_0, k_delta, acc_diff);  // reuses the rows packed in the first pass
+          memcpy(running_avg_y, running_buffer[r], width);
+          memcpy(running_avg_y + avg_y_stride,
+                 running_buffer[r] + width, width);
+          if (width == 4) {
+            memcpy(running_avg_y + avg_y_stride * 2,
+                   running_buffer[r] + width * 2, width);
+            memcpy(running_avg_y + avg_y_stride * 3,
+                   running_buffer[r] + width * 3, width);
+          }
+          // Update pointers for next iteration.
+          running_avg_y += (avg_y_stride << shift);
+        }
+        sum_diff = sum_diff_16x1(acc_diff);
+        if (abs(sum_diff) > sum_diff_thresh) {  // still over the threshold
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+// Denoiser for 16xM, 32xM and 64xM blocks. Returns FILTER_BLOCK or COPY_BLOCK.
+static int vp10_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
+                                     const uint8_t *mc_running_avg_y,
+                                     int mc_avg_y_stride,
+                                     uint8_t *running_avg_y,
+                                     int avg_y_stride,
+                                     int increase_denoising, BLOCK_SIZE bs,
+                                     int motion_magnitude) {
+  int sum_diff_thresh, r, c, sum_diff = 0;
+  const int shift_inc  = (increase_denoising &&
+                          motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+                         1 : 0;
+  __m128i acc_diff[4][4];  // one accumulator per 16-column by 16-row tile
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Modify each level's adjustment according to motion_magnitude.
+  const __m128i l3 = _mm_set1_epi8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+  // Difference between level 3 and level 2 is 2.
+  const __m128i l32 = _mm_set1_epi8(2);
+  // Difference between level 2 and level 1 is 1.
+  const __m128i l21 = _mm_set1_epi8(1);
+
+  for (c = 0; c < 4; ++c) {
+    for (r = 0; r < 4; ++r) {
+      acc_diff[c][r] = _mm_setzero_si128();
+    }
+  }
+
+  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+    for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
+      acc_diff[c>>4][r>>4] = vp10_denoiser_16x1_sse2(
+          sig, mc_running_avg_y, running_avg_y, &k_0, &k_4,
+          &k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
+      // Update pointers for next iteration.
+      sig += 16;
+      mc_running_avg_y += 16;
+      running_avg_y += 16;
+    }
+
+    if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {  // last row of a tile
+      for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
+        sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+      }
+    }
+
+    // Update pointers for next iteration: undo the column steps, add a stride.
+    sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
+    mc_running_avg_y = mc_running_avg_y -
+                       16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+                       mc_avg_y_stride;
+    running_avg_y = running_avg_y -
+                    16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+                    avg_y_stride;
+  }
+
+  {
+    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
+                         num_pels_log2_lookup[bs]) + 1;
+
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const __m128i k_delta = _mm_set1_epi8(delta);
+        sig -= sig_stride * (4 << b_height_log2_lookup[bs]);  // rewind to row 0
+        mc_running_avg_y -= mc_avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        sum_diff = 0;  // recomputed below; acc_diff keeps its first-pass values
+        for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+          for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
+            acc_diff[c>>4][r>>4] = vp10_denoiser_adj_16x1_sse2(
+                sig, mc_running_avg_y, running_avg_y, k_0,
+                k_delta, acc_diff[c>>4][r>>4]);
+            // Update pointers for next iteration.
+            sig += 16;
+            mc_running_avg_y += 16;
+            running_avg_y += 16;
+          }
+
+          if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
+            for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
+              sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+            }
+          }
+          sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
+          mc_running_avg_y = mc_running_avg_y -
+                             16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+                             mc_avg_y_stride;
+          running_avg_y = running_avg_y -
+                          16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+                          avg_y_stride;
+        }
+        if (abs(sum_diff) > sum_diff_thresh) {  // still over the threshold
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+int vp10_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
+                             const uint8_t *mc_avg,
+                             int mc_avg_stride,
+                             uint8_t *avg, int avg_stride,
+                             int increase_denoising,
+                             BLOCK_SIZE bs,
+                             int motion_magnitude) {
+  // Pick the kernel from the block size. The small kernel additionally needs
+  // the block width; small_width stays 0 for sizes it does not handle.
+  int small_width = 0;
+  if (bs == BLOCK_4X4 || bs == BLOCK_4X8) {
+    small_width = 4;
+  } else if (bs == BLOCK_8X4 || bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+    small_width = 8;
+  }
+  if (small_width != 0) {
+    return vp10_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride,
+                                       avg, avg_stride, increase_denoising,
+                                       bs, motion_magnitude, small_width);
+  }
+  // The 16-, 32- and 64-wide sizes listed here go to the big kernel.
+  if (bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 ||
+      bs == BLOCK_32X16 || bs == BLOCK_32X32 || bs == BLOCK_32X64 ||
+      bs == BLOCK_64X32 || bs == BLOCK_64X64) {
+    return vp10_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride,
+                                     avg, avg_stride, increase_denoising,
+                                     bs, motion_magnitude);
+  }
+  // Any other block size yields COPY_BLOCK.
+  return COPY_BLOCK;
+}
diff --git a/libs/libvpx/vp10/encoder/x86/error_intrin_avx2.c b/libs/libvpx/vp10/encoder/x86/error_intrin_avx2.c
new file mode 100644
index 0000000000..9766be27bf
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/x86/error_intrin_avx2.c
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+
+#include "./vp10_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+int64_t vp10_block_error_avx2(const int16_t *coeff,
+                             const int16_t *dqcoeff,
+                             intptr_t block_size,
+                             int64_t *ssz) {
+  // Returns the sum of (dqcoeff[k] - coeff[k])^2 over the block and stores
+  // the sum of coeff[k]^2 through ssz. The loop steps 16 coefficients at a
+  // time, so a full 16-wide vector is read on every pass.
+  const __m256i zero = _mm256_setzero_si256();
+  // Each accumulator carries four 64-bit partial sums.
+  __m256i err_acc = zero;
+  __m256i sq_acc = zero;
+  __m128i err_sum, sq_sum;
+  int64_t sse;
+  int i;
+
+  for (i = 0; i < block_size; i += 16) {
+    // Unaligned 32-byte loads of 16 coefficients from each input.
+    const __m256i c = _mm256_loadu_si256((const __m256i *)(coeff + i));
+    const __m256i dq = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
+
+    // 16-bit difference dqcoeff - coeff.
+    const __m256i diff = _mm256_sub_epi16(dq, c);
+
+    // madd of a vector with itself squares each word and adds neighbouring
+    // pairs, leaving eight 32-bit sums.
+    const __m256i diff_sq = _mm256_madd_epi16(diff, diff);
+    const __m256i c_sq = _mm256_madd_epi16(c, c);
+
+    // Interleave with zero to widen every 32-bit sum to 64 bits.
+    const __m256i diff_sq_lo = _mm256_unpacklo_epi32(diff_sq, zero);
+    const __m256i diff_sq_hi = _mm256_unpackhi_epi32(diff_sq, zero);
+    const __m256i c_sq_lo = _mm256_unpacklo_epi32(c_sq, zero);
+    const __m256i c_sq_hi = _mm256_unpackhi_epi32(c_sq, zero);
+
+    // Add the widened sums into the running 64-bit totals.
+    err_acc = _mm256_add_epi64(err_acc, diff_sq_lo);
+    sq_acc = _mm256_add_epi64(sq_acc, c_sq_lo);
+    err_acc = _mm256_add_epi64(err_acc, diff_sq_hi);
+    sq_acc = _mm256_add_epi64(sq_acc, c_sq_hi);
+  }
+
+  // Inside each 128-bit lane, fold the upper 64-bit sum onto the lower one.
+  err_acc = _mm256_add_epi64(err_acc, _mm256_srli_si256(err_acc, 8));
+  sq_acc = _mm256_add_epi64(sq_acc, _mm256_srli_si256(sq_acc, 8));
+
+  // Add the two 128-bit lanes; the grand total is then in the low 64 bits.
+  err_sum = _mm_add_epi64(_mm256_castsi256_si128(err_acc),
+                          _mm256_extractf128_si256(err_acc, 1));
+  sq_sum = _mm_add_epi64(_mm256_castsi256_si128(sq_acc),
+                         _mm256_extractf128_si256(sq_acc, 1));
+
+  // Store the low 64 bits of each total: locally for the return value and
+  // through ssz for the caller.
+  _mm_storel_epi64((__m128i *)(&sse), err_sum);
+  _mm_storel_epi64((__m128i *)(ssz), sq_sum);
+
+  return sse;
+}
diff --git a/libs/libvpx/vp10/encoder/x86/error_sse2.asm b/libs/libvpx/vp10/encoder/x86/error_sse2.asm
new file mode 100644
index 0000000000..0772da418e
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/x86/error_sse2.asm
@@ -0,0 +1,122 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp10
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; int64_t vp10_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+;                         int64_t *ssz)
+; Returns sum((dqcoeff - coeff)^2); stores sum(coeff^2) through ssz.
+INIT_XMM sse2
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m6, m6                 ; ssz accumulator
+  pxor      m5, m5                 ; dedicated zero register
+  lea     uqcq, [uqcq+sizeq*2]     ; point one past the last coeff
+  lea     dqcq, [dqcq+sizeq*2]     ; point one past the last dqcoeff
+  neg    sizeq                     ; negative element index, counts up to 0
+.loop:
+  mova      m2, [uqcq+sizeq*2]           ; coeff, first vector
+  mova      m0, [dqcq+sizeq*2]           ; dqcoeff, first vector
+  mova      m3, [uqcq+sizeq*2+mmsize]    ; coeff, second vector
+  mova      m1, [dqcq+sizeq*2+mmsize]    ; dqcoeff, second vector
+  psubw     m0, m2                 ; dqcoeff - coeff
+  psubw     m1, m3
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+  pmaddwd   m0, m0                 ; squared differences, summed in pairs
+  pmaddwd   m1, m1
+  pmaddwd   m2, m2                 ; squared coeffs, summed in pairs
+  pmaddwd   m3, m3
+  ; accumulate in 64bit (interleave each dword with zero, then paddq)
+  punpckldq m7, m0, m5
+  punpckhdq m0, m5
+  paddq     m4, m7
+  punpckldq m7, m1, m5
+  paddq     m4, m0
+  punpckhdq m1, m5
+  paddq     m4, m7
+  punpckldq m7, m2, m5
+  paddq     m4, m1
+  punpckhdq m2, m5
+  paddq     m6, m7
+  punpckldq m7, m3, m5
+  paddq     m6, m2
+  punpckhdq m3, m5
+  paddq     m6, m7
+  paddq     m6, m3
+  add    sizeq, mmsize             ; two vectors of words = mmsize elements
+  jl .loop
+
+  ; accumulate horizontally and store in return value
+  movhlps   m5, m4                 ; high qword of sse into low qword of m5
+  movhlps   m7, m6                 ; high qword of ssz into low qword of m7
+  paddq     m4, m5
+  paddq     m6, m7
+%if ARCH_X86_64
+  movq    rax, m4                  ; 64-bit return value
+  movq [sszq], m6                  ; *ssz
+%else
+  mov     eax, sszm                ; load the ssz pointer argument
+  pshufd   m5, m4, 0x1             ; dword 1 of the sum into dword 0 of m5
+  movq  [eax], m6                  ; *ssz
+  movd    eax, m4                  ; low 32 bits of the return value
+  movd    edx, m5                  ; high 32 bits of the return value
+%endif
+  RET
+
+; Compute the sum of squared difference between two int16_t vectors.
+; int64_t vp10_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
+;                            intptr_t block_size)
+; Same loop as block_error above, without the coeff^2 (ssz) accumulation.
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m5, m5                 ; dedicated zero register
+  lea     uqcq, [uqcq+sizeq*2]     ; point one past the last coeff
+  lea     dqcq, [dqcq+sizeq*2]     ; point one past the last dqcoeff
+  neg    sizeq                     ; negative element index, counts up to 0
+.loop:
+  mova      m2, [uqcq+sizeq*2]           ; coeff, first vector
+  mova      m0, [dqcq+sizeq*2]           ; dqcoeff, first vector
+  mova      m3, [uqcq+sizeq*2+mmsize]    ; coeff, second vector
+  mova      m1, [dqcq+sizeq*2+mmsize]    ; dqcoeff, second vector
+  psubw     m0, m2                 ; dqcoeff - coeff
+  psubw     m1, m3
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+  pmaddwd   m0, m0                 ; squared differences, summed in pairs
+  pmaddwd   m1, m1
+  ; accumulate in 64bit (m3 is reused as scratch from here on)
+  punpckldq m3, m0, m5
+  punpckhdq m0, m5
+  paddq     m4, m3
+  punpckldq m3, m1, m5
+  paddq     m4, m0
+  punpckhdq m1, m5
+  paddq     m4, m3
+  paddq     m4, m1
+  add    sizeq, mmsize             ; two vectors of words = mmsize elements
+  jl .loop
+
+  ; accumulate horizontally and store in return value
+  movhlps   m5, m4                 ; high qword of the sum into low qword of m5
+  paddq     m4, m5
+%if ARCH_X86_64
+  movq    rax, m4                  ; 64-bit return value
+%else
+  pshufd   m5, m4, 0x1             ; dword 1 of the sum into dword 0 of m5
+  movd    eax, m4                  ; low 32 bits of the return value
+  movd    edx, m5                  ; high 32 bits of the return value
+%endif
+  RET
diff --git a/libs/libvpx/vp10/encoder/x86/highbd_block_error_intrin_sse2.c b/libs/libvpx/vp10/encoder/x86/highbd_block_error_intrin_sse2.c
new file mode 100644
index 0000000000..6b4cf50994
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -0,0 +1,71 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "vp10/common/common.h"
+
+int64_t vp10_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bps) {
+  // Returns sum((coeff - dqcoeff)^2) and stores sum(coeff^2) in *ssz; both are
+  // rounded and shifted right by 2 * (bps - 8). Handles 8 values per pass.
+  int i, j, test;
+  uint32_t temp[4];
+  __m128i cmp0, cmp1, cmp2, cmp3;
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bps - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+  const __m128i max = _mm_set1_epi32(0x3fff);   // largest 15-bit signed value
+  const __m128i min = _mm_set1_epi32(-0x4000);  // smallest 15-bit signed value
+
+  for (i = 0; i < block_size; i+=8) {
+    // Load the data into xmm registers (aligned 16-byte loads)
+    __m128i mm_coeff = _mm_load_si128((__m128i*) (coeff + i));
+    __m128i mm_coeff2 = _mm_load_si128((__m128i*) (coeff + i + 4));
+    __m128i mm_dqcoeff = _mm_load_si128((__m128i*) (dqcoeff + i));
+    __m128i mm_dqcoeff2 = _mm_load_si128((__m128i*) (dqcoeff + i + 4));
+    // Check if any values require more than 15 bit (outside [min, max])
+    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+            _mm_cmplt_epi32(mm_coeff, min));
+    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+            _mm_cmplt_epi32(mm_coeff2, min));
+    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+            _mm_cmplt_epi32(mm_dqcoeff, min));
+    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+            _mm_cmplt_epi32(mm_dqcoeff2, min));
+    test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1),
+            _mm_or_si128(cmp2, cmp3)));
+    if (!test) {  // every input fits in 15 bits: pack to int16 and use madd
+      __m128i mm_diff, error_sse2, sqcoeff_sse2;
+      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);  // cannot wrap in 16 bits
+      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
+      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+      _mm_storeu_si128((__m128i*)temp, error_sse2);
+      error = error + temp[0] + temp[1] + temp[2] + temp[3];
+      _mm_storeu_si128((__m128i*)temp, sqcoeff_sse2);
+      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+    } else {  // scalar fallback; widen to 64 bits before any arithmetic
+      for (j = 0; j < 8; j++) {
+        const int64_t diff = (int64_t)coeff[i + j] - (int64_t)dqcoeff[i + j];
+        error +=  diff * diff;
+        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+      }
+    }
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;  // round to nearest, then scale down
+  sqcoeff = (sqcoeff + rounding) >> shift;
+  *ssz = sqcoeff;
+  return error;
+}
diff --git a/libs/libvpx/vp10/encoder/x86/quantize_sse2.c b/libs/libvpx/vp10/encoder/x86/quantize_sse2.c
new file mode 100644
index 0000000000..dabd3bd127
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/x86/quantize_sse2.c
@@ -0,0 +1,211 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./vp10_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vp10_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
+                          int skip_block, const int16_t* zbin_ptr,
+                          const int16_t* round_ptr, const int16_t* quant_ptr,
+                          const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+                          int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                          uint16_t* eob_ptr,
+                          const int16_t* scan_ptr,
+                          const int16_t* iscan_ptr) {
+  __m128i zero;
+  __m128i thr;
+  int16_t nzflag;
+  (void)scan_ptr;  // unused
+  (void)zbin_ptr;  // unused
+  (void)quant_shift_ptr;  // unused
+  // Quantizes coeff_ptr into qcoeff_ptr/dqcoeff_ptr, 16 values per step; sets *eob_ptr.
+  coeff_ptr += n_coeffs;
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;  // pointers now sit at the array ends; index counts up to 0
+  zero = _mm_setzero_si128();
+
+  if (!skip_block) {
+    __m128i eob;
+    __m128i round, quant, dequant;
+    {
+      __m128i coeff0, coeff1;
+
+      // Setup global values
+      {
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        // Do DC and first 15 AC
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract, then absolute value via (x ^ sign) - sign
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);  // upper half replicated for all later lanes
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);  // high 16 bits of the product
+        quant = _mm_unpackhi_epi64(quant, quant);  // upper half replicated for all later lanes
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);  // coeff0/1 now hold dequantized values
+        dequant = _mm_unpackhi_epi64(dequant, dequant);  // upper half replicated for all later lanes
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob: largest (iscan + 1) over lanes whose dequantized value is nonzero
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts (subtracting the -1 mask)
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    thr = _mm_srai_epi16(dequant, 1);  // half of the (now replicated) dequant value
+
+    // AC only loop
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract, then absolute value via (x ^ sign) - sign
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+            _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+        if (nzflag) {  // at least one magnitude exceeds thr: quantize all 16
+          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+          // Reinsert signs
+          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        } else {  // no magnitude exceeds thr: write zeros for all 16 lanes
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+        }
+      }
+
+      if (nzflag) {
+        // Scan for eob (skipped when the whole group was zeroed above)
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts (subtracting the -1 mask)
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // Accumulate EOB: horizontal max of the eight 16-bit lanes, read from lane 1
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);
+    }
+  } else {
+    do {  // skip_block: zero both outputs, 16 values per pass
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}
diff --git a/libs/libvpx/vp10/encoder/x86/quantize_ssse3_x86_64.asm b/libs/libvpx/vp10/encoder/x86/quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000000..b8fefa2f16
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/x86/quantize_ssse3_x86_64.asm
@@ -0,0 +1,201 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp10
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro QUANTIZE_FP 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, \
+                                eob, scan, iscan
+  cmp                    dword skipm, 0
+  jne .blank
+  ; macro args: 1st = function name suffix (fp / fp_32x32), 2nd = GPR count for cglobal
+  ; actual quantize loop - setup pointers, rounders, etc.
+  movifnidn                   coeffq, coeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, dequantmp
+  movifnidn                    zbinq, zbinmp
+  movifnidn                   roundq, roundmp
+  movifnidn                   quantq, quantmp
+  mova                            m1, [roundq]             ; m1 = round
+  mova                            m2, [quantq]             ; m2 = quant
+%ifidn %1, fp_32x32
+  pcmpeqw                         m5, m5                   ; m5 = all ones
+  psrlw                           m5, 15                   ; m5 = 1 in every word
+  paddw                           m1, m5
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
+  mova                            m3, [r2q]                ; m3 = dequant
+  mov                             r3, qcoeffmp             ; r3 = qcoeff
+  mov                             r4, dqcoeffmp            ; r4 = dqcoeff
+  mov                             r5, iscanmp              ; r5 = iscan
+%ifidn %1, fp_32x32
+  psllw                           m2, 1                    ; m2 = quant * 2
+%endif
+  pxor                            m5, m5                   ; m5 = dedicated zero
+
+  lea                         coeffq, [  coeffq+ncoeffq*2] ; point every array at its end ...
+  lea                            r5q, [  r5q+ncoeffq*2]
+  lea                            r3q, [ r3q+ncoeffq*2]
+  lea                            r4q, [r4q+ncoeffq*2]
+  neg                        ncoeffq                       ; ... and index with -n counting up to 0
+
+  ; get DC and first 15 AC coeffs
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpeqw                         m7, m7                   ; m7 = -1 in every word
+
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1                   ; m1 = its high qword in both halves
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2                   ; m2 = its high qword in both halves
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  mova            [r3q+ncoeffq*2+ 0], m8                   ; store qcoeff
+  mova            [r3q+ncoeffq*2+16], m13                  ; store qcoeff
+%ifidn %1, fp_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
+  pmullw                          m8, m3                   ; m8 = qcoeff * dequant (low 16 bits)
+  punpckhqdq                      m3, m3                   ; m3 = its high qword in both halves
+  pmullw                         m13, m3                   ; m13 = qcoeff * dequant (low 16 bits)
+%ifidn %1, fp_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+  psrlw                           m0, m3, 2                ; m0 = dequant >> 2 (loop skip threshold)
+%else
+  psrlw                           m0, m3, 1                ; m0 = dequant >> 1 (loop skip threshold)
+%endif
+  mova            [r4q+ncoeffq*2+ 0], m8                   ; store dqcoeff
+  mova            [r4q+ncoeffq*2+16], m13                  ; store dqcoeff
+  pcmpeqw                         m8, m5                   ; m8 = (dqcoeff == 0)
+  pcmpeqw                        m13, m5                   ; m13 = (dqcoeff == 0)
+  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = iscan[i]
+  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = iscan[i]
+  psubw                           m6, m7                   ; m6 = iscan[i] + 1
+  psubw                          m11, m7                   ; m11 = iscan[i] + 1
+  pandn                           m8, m6                   ; m8 = iscan[i] + 1 where nonzero, else 0
+  pandn                          m13, m11                  ; m13 = iscan[i] + 1 where nonzero, else 0
+  pmaxsw                          m8, m13                  ; m8 = running per-lane max eob
+  add                        ncoeffq, mmsize               ; 16 coefficients consumed
+  jz .accumulate_eob
+
+.ac_only_loop:
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+
+  pcmpgtw                         m7, m6,  m0              ; m7 = abs(c[i]) > threshold
+  pcmpgtw                        m12, m11, m0              ; m12 = abs(c[i]) > threshold
+  pmovmskb                       r6d, m7
+  pmovmskb                       r2d, m12
+
+  or                              r6, r2
+  jz .skip_iter                                            ; nothing above threshold: write zeros
+
+  pcmpeqw                         m7, m7                   ; m7 = -1 in every word
+
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  psignw                         m14, m9                   ; m14 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  mova            [r3q+ncoeffq*2+ 0], m14                  ; store qcoeff
+  mova            [r3q+ncoeffq*2+16], m13                  ; store qcoeff
+%ifidn %1, fp_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
+  pmullw                         m14, m3                   ; m14 = qcoeff * dequant (low 16 bits)
+  pmullw                         m13, m3                   ; m13 = qcoeff * dequant (low 16 bits)
+%ifidn %1, fp_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
+  mova            [r4q+ncoeffq*2+ 0], m14                  ; store dqcoeff
+  mova            [r4q+ncoeffq*2+16], m13                  ; store dqcoeff
+  pcmpeqw                        m14, m5                   ; m14 = (dqcoeff == 0)
+  pcmpeqw                        m13, m5                   ; m13 = (dqcoeff == 0)
+  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = iscan[i]
+  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = iscan[i]
+  psubw                           m6, m7                   ; m6 = iscan[i] + 1
+  psubw                          m11, m7                   ; m11 = iscan[i] + 1
+  pandn                          m14, m6                   ; m14 = iscan[i] + 1 where nonzero, else 0
+  pandn                          m13, m11                  ; m13 = iscan[i] + 1 where nonzero, else 0
+  pmaxsw                          m8, m14
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+
+  jmp .accumulate_eob
+.skip_iter:
+  mova            [r3q+ncoeffq*2+ 0], m5                   ; qcoeff = 0
+  mova            [r3q+ncoeffq*2+16], m5
+  mova            [r4q+ncoeffq*2+ 0], m5                   ; dqcoeff = 0
+  mova            [r4q+ncoeffq*2+16], m5
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+
+.accumulate_eob:
+  ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  pextrw                          r6, m8, 0
+  mov                           [r2], r6w                  ; eob is 16 bits wide: store one word only
+  RET
+
+  ; skip-block, i.e. just write all zeroes
+.blank:
+  mov                             r0, dqcoeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, qcoeffmp
+  mov                             r3, eobmp
+
+  lea                            r0q, [r0q+ncoeffq*2]
+  lea                            r2q, [r2q+ncoeffq*2]
+  neg                        ncoeffq
+  pxor                            m7, m7
+.blank_loop:
+  mova            [r0q+ncoeffq*2+ 0], m7
+  mova            [r0q+ncoeffq*2+16], m7
+  mova            [r2q+ncoeffq*2+ 0], m7
+  mova            [r2q+ncoeffq*2+16], m7
+  add                        ncoeffq, mmsize
+  jl .blank_loop
+  mov                     word [r3q], 0
+  RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FP fp, 7
+QUANTIZE_FP fp_32x32, 7
diff --git a/libs/libvpx/vp10/encoder/x86/ssim_opt_x86_64.asm b/libs/libvpx/vp10/encoder/x86/ssim_opt_x86_64.asm
new file mode 100644
index 0000000000..b45f0095d8
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/x86/ssim_opt_x86_64.asm
@@ -0,0 +1,216 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; TABULATE_SSIM - adds xmm3 (s) and xmm4 (r) into sum_s,sum_r,sum_sq_s,sum_sq_r,sum_sxr; clobbers xmm1-xmm3
+%macro TABULATE_SSIM 0
+        paddusw         xmm15, xmm3  ; sum_s
+        paddusw         xmm14, xmm4  ; sum_r
+        movdqa          xmm1, xmm3   ; copy s so xmm3 is still available for s*r
+        pmaddwd         xmm1, xmm1   ; s*s, adjacent products summed into dwords
+        paddd           xmm13, xmm1 ; sum_sq_s
+        movdqa          xmm2, xmm4   ; copy r so xmm4 is still available for s*r
+        pmaddwd         xmm2, xmm2   ; r*r, adjacent products summed into dwords
+        paddd           xmm12, xmm2 ; sum_sq_r
+        pmaddwd         xmm3, xmm4   ; s*r, adjacent products summed into dwords
+        paddd           xmm11, xmm3  ; sum_sxr
+%endmacro
+
+; Sum the four dwords of %1 into its low qword (needs xmm0 == 0, clobbers xmm2)
+%macro SUM_ACROSS_Q 1
+        movdqa          xmm2,%1
+        punpckldq       %1,xmm0      ; low two dwords zero-extended to qwords
+        punpckhdq       xmm2,xmm0    ; high two dwords zero-extended to qwords
+        paddq           %1,xmm2      ; two qword partial sums
+        movdqa          xmm2,%1
+        punpcklqdq      %1,xmm0      ; keep low qword, zero the high one
+        punpckhqdq      xmm2,xmm0    ; move high qword down, zero the high one
+        paddq           %1,xmm2      ; total is now in the low qword
+%endmacro
+
+; Sum the eight words of %1 into its low qword (needs xmm0 == 0, clobbers xmm1, xmm2)
+%macro SUM_ACROSS_W 1
+        movdqa          xmm1, %1
+        punpcklwd       %1,xmm0      ; low four words zero-extended to dwords
+        punpckhwd       xmm1,xmm0    ; high four words zero-extended to dwords
+        paddd           %1, xmm1     ; four dword partial sums
+        SUM_ACROSS_Q    %1
+%endmacro
+;void vp10_ssim_parms_16x16_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+; NOTE(review): each result is stored with movd (32 bits) - confirm the pointee width in the C prototype.
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb. At this point this is just meant to be a first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vp10_ssim_parms_16x16_sse2) PRIVATE
+sym(vp10_ssim_parms_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0   ;zero, used to widen bytes/words
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 16      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels (16 bytes each, unaligned loads)
+    movdqu          xmm5, [rsi]
+    movdqu          xmm6, [rdi]
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpckhbw       xmm3, xmm0 ; high_s
+    punpckhbw       xmm4, xmm0 ; high_r
+
+    TABULATE_SSIM
+
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movd            [rdi], xmm15; *sum_s (low 32 bits of the total)
+    mov             rdi,arg(5)
+    movd            [rdi], xmm14; *sum_r
+    mov             rdi,arg(6)
+    movd            [rdi], xmm13; *sum_sq_s
+    mov             rdi,arg(7)
+    movd            [rdi], xmm12; *sum_sq_r
+    mov             rdi,arg(8)
+    movd            [rdi], xmm11; *sum_sxr
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp10_ssim_parms_8x8_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+; NOTE(review): each result is stored with movd (32 bits) - confirm the pointee width in the C prototype.
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb. At this point this is just meant to be a first pass for calculating
+; all the parms needed for 8x8 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vp10_ssim_parms_8x8_sse2) PRIVATE
+sym(vp10_ssim_parms_8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0   ;zero, used to widen bytes/words
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 8      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels (8 bytes each)
+    movq            xmm3, [rsi]
+    movq            xmm4, [rdi]
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movd            [rdi], xmm15; *sum_s (low 32 bits of the total)
+    mov             rdi,arg(5)
+    movd            [rdi], xmm14; *sum_r
+    mov             rdi,arg(6)
+    movd            [rdi], xmm13; *sum_sq_s
+    mov             rdi,arg(7)
+    movd            [rdi], xmm12; *sum_sq_r
+    mov             rdi,arg(8)
+    movd            [rdi], xmm11; *sum_sxr
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vp10/encoder/x86/temporal_filter_apply_sse2.asm b/libs/libvpx/vp10/encoder/x86/temporal_filter_apply_sse2.asm
new file mode 100644
index 0000000000..7171807133
--- /dev/null
+++ b/libs/libvpx/vp10/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,212 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp10_temporal_filter_apply_sse2 | arg
+;  (unsigned char  *frame1,           |  0
+;   unsigned int    stride,           |  1
+;   unsigned char  *frame2,           |  2
+;   unsigned int    block_width,      |  3
+;   unsigned int    block_height,     |  4
+;   int             strength,         |  5
+;   int             filter_weight,    |  6
+;   unsigned int   *accumulator,      |  7
+;   unsigned short *count)            |  8
+global sym(vp10_temporal_filter_apply_sse2) PRIVATE
+sym(vp10_temporal_filter_apply_sse2):
+
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ALIGN_STACK 16, rax
+    %define block_width    0    ; byte offsets of the locals from rsp
+    %define block_height  16
+    %define strength      32
+    %define filter_weight 48
+    %define rounding_bit  64
+    %define rbp_backup    80
+    %define stack_size    96
+    sub         rsp,           stack_size
+    mov         [rsp + rbp_backup], rbp ; rbp is reused below to hold the stride
+    ; end prolog
+
+        mov         edx,            arg(3) ; block_width (32-bit load clears the upper half of rdx)
+        mov         [rsp + block_width], rdx
+        mov         edx,            arg(4) ; block_height
+        mov         [rsp + block_height], rdx
+        movd        xmm6,           arg(5)
+        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+        ; calculate the rounding bit outside the loop
+        ; 0x8000 >> (16 - strength)
+        mov         rdx,            16
+        sub         rdx,            arg(5) ; 16 - strength
+        movq        xmm4,           rdx    ; can't use rdx w/ shift
+        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
+        psrlw       xmm5,           xmm4
+        movdqa      [rsp + rounding_bit], xmm5
+
+        mov         rsi,            arg(0) ; src/frame1
+        mov         rdx,            arg(2) ; predictor frame
+        mov         rdi,            arg(7) ; accumulator
+        mov         rax,            arg(8) ; count
+
+        ; dup the filter weight and store for later
+        movd        xmm0,           arg(6) ; filter_weight
+        pshuflw     xmm0,           xmm0, 0 ; low word copied to words 0-3
+        punpcklwd   xmm0,           xmm0   ; ... and on to all 8 words
+        movdqa      [rsp + filter_weight], xmm0
+
+        mov         rbp,            arg(1) ; stride
+        pxor        xmm7,           xmm7   ; zero for extraction
+
+        mov         rcx,            [rsp + block_width]
+        imul        rcx,            [rsp + block_height] ; rcx = block_width * block_height
+        add         rcx,            rdx    ; rcx = end of predictor block (loop sentinel)
+        cmp         dword ptr [rsp + block_width], 8
+        jne         .temporal_filter_apply_load_16
+
+.temporal_filter_apply_load_8:
+        movq        xmm0,           [rsi]  ; first row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        movq        xmm1,           [rsi]  ; second row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
+        jmp         .temporal_filter_apply_load_finished
+
+.temporal_filter_apply_load_16:
+        movdqa      xmm0,           [rsi]  ; src (frame1)
+        lea         rsi,            [rsi + rbp] ; += stride
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
+
+.temporal_filter_apply_load_finished:
+        movdqa      xmm2,           [rdx]  ; predictor (frame2)
+        movdqa      xmm3,           xmm2
+        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
+
+        ; modifier = src_byte - pixel_value
+        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
+        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
+
+        ; modifier *= modifier
+        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
+        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2
+
+        ; modifier *= 3
+        pmullw      xmm0,           [GLOBAL(_const_3w)]
+        pmullw      xmm1,           [GLOBAL(_const_3w)]
+
+        ; modifier += 0x8000 >> (16 - strength)
+        paddw       xmm0,           [rsp + rounding_bit]
+        paddw       xmm1,           [rsp + rounding_bit]
+
+        ; modifier >>= strength
+        psrlw       xmm0,           [rsp + strength]
+        psrlw       xmm1,           [rsp + strength]
+
+        ; modifier = 16 - modifier
+        ; saturation takes care of modifier > 16
+        movdqa      xmm3,           [GLOBAL(_const_16w)]
+        movdqa      xmm2,           [GLOBAL(_const_16w)]
+        psubusw     xmm3,           xmm1
+        psubusw     xmm2,           xmm0
+
+        ; modifier *= filter_weight
+        pmullw      xmm2,           [rsp + filter_weight]
+        pmullw      xmm3,           [rsp + filter_weight]
+
+        ; count
+        movdqa      xmm4,           [rax]
+        movdqa      xmm5,           [rax+16]
+        ; += modifier
+        paddw       xmm4,           xmm2
+        paddw       xmm5,           xmm3
+        ; write back
+        movdqa      [rax],          xmm4
+        movdqa      [rax+16],       xmm5
+        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
+
+        ; load and extract the predictor up to shorts
+        pxor        xmm7,           xmm7
+        movdqa      xmm0,           [rdx]
+        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
+
+        ; modifier *= pixel_value
+        pmullw      xmm0,           xmm2
+        pmullw      xmm1,           xmm3
+
+        ; expand to double words
+        movdqa      xmm2,           xmm0
+        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
+        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
+        movdqa      xmm3,           xmm1
+        punpcklwd   xmm1,           xmm7   ; [ 8-11]
+        punpckhwd   xmm3,           xmm7   ; [12-15]
+
+        ; accumulator
+        movdqa      xmm4,           [rdi]
+        movdqa      xmm5,           [rdi+16]
+        movdqa      xmm6,           [rdi+32]
+        movdqa      xmm7,           [rdi+48] ; xmm7 stops being zero here; re-zeroed before looping
+        ; += modifier
+        paddd       xmm4,           xmm0
+        paddd       xmm5,           xmm2
+        paddd       xmm6,           xmm1
+        paddd       xmm7,           xmm3
+        ; write back
+        movdqa      [rdi],          xmm4
+        movdqa      [rdi+16],       xmm5
+        movdqa      [rdi+32],       xmm6
+        movdqa      [rdi+48],       xmm7
+        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+        cmp         rdx,            rcx    ; reached the end of the predictor block?
+        je          .temporal_filter_apply_epilog
+        pxor        xmm7,           xmm7   ; zero for extraction
+        cmp         dword ptr [rsp + block_width], 16
+        je          .temporal_filter_apply_load_16
+        jmp         .temporal_filter_apply_load_8
+
+.temporal_filter_apply_epilog:
+    ; begin epilog
+    mov         rbp,            [rsp + rbp_backup]
+    add         rsp,            stack_size
+    pop         rsp ; NOTE(review): presumably undoes ALIGN_STACK - confirm in x86_abi_support.asm
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+_const_3w:                  ; 8 words of 3 (modifier *= 3)
+    times 8 dw 3
+align 16
+_const_top_bit:             ; 8 words of 0x8000, shifted right to form the rounding bit
+    times 8 dw 1<<15
+align 16
+_const_16w:                 ; 8 words of 16 (modifier = 16 - modifier)
+    times 8 dw 16
diff --git a/libs/libvpx/vp10/exports_dec b/libs/libvpx/vp10/exports_dec
new file mode 100644
index 0000000000..71c8369ba4
--- /dev/null
+++ b/libs/libvpx/vp10/exports_dec
@@ -0,0 +1,2 @@
+data vpx_codec_vp10_dx_algo
+text vpx_codec_vp10_dx
diff --git a/libs/libvpx/vp10/exports_enc b/libs/libvpx/vp10/exports_enc
new file mode 100644
index 0000000000..d1644f2605
--- /dev/null
+++ b/libs/libvpx/vp10/exports_enc
@@ -0,0 +1,2 @@
+data vpx_codec_vp10_cx_algo
+text vpx_codec_vp10_cx
diff --git a/libs/libvpx/vp10/vp10_common.mk b/libs/libvpx/vp10/vp10_common.mk
new file mode 100644
index 0000000000..2eb348873b
--- /dev/null
+++ b/libs/libvpx/vp10/vp10_common.mk
@@ -0,0 +1,104 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+# Unconditional entries of the VP10 common source list.
+VP10_COMMON_SRCS-yes += vp10_common.mk vp10_iface_common.h \
+                        common/ppflags.h \
+                        common/alloccommon.c \
+                        common/blockd.c \
+                        common/debugmodes.c \
+                        common/entropy.c \
+                        common/entropymode.c \
+                        common/entropymv.c \
+                        common/frame_buffers.c \
+                        common/frame_buffers.h \
+                        common/alloccommon.h \
+                        common/blockd.h \
+                        common/common.h \
+                        common/entropy.h \
+                        common/entropymode.h \
+                        common/entropymv.h \
+                        common/enums.h \
+                        common/filter.h \
+                        common/filter.c \
+                        common/idct.h \
+                        common/idct.c \
+                        common/vp10_inv_txfm.h \
+                        common/vp10_inv_txfm.c \
+                        common/loopfilter.h \
+                        common/thread_common.h \
+                        common/mv.h \
+                        common/onyxc_int.h \
+                        common/pred_common.h \
+                        common/pred_common.c \
+                        common/quant_common.h \
+                        common/reconinter.h \
+                        common/reconintra.h \
+                        common/vp10_rtcd.c \
+                        common/vp10_rtcd_defs.pl \
+                        common/scale.h \
+                        common/scale.c \
+                        common/seg_common.h \
+                        common/seg_common.c \
+                        common/textblit.h \
+                        common/tile_common.h \
+                        common/tile_common.c \
+                        common/loopfilter.c \
+                        common/thread_common.c \
+                        common/mvref_common.c \
+                        common/mvref_common.h \
+                        common/quant_common.c \
+                        common/reconinter.c \
+                        common/reconintra.c
+# textblit.c lands in the -yes list only when CONFIG_POSTPROC_VISUALIZER expands to yes.
+VP10_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
+VP10_COMMON_SRCS-yes += common/common_data.h \
+                        common/scan.c common/scan.h \
+                        common/vp10_fwd_txfm.h \
+                        common/vp10_fwd_txfm.c
+
+# Entries keyed on CONFIG_VP9_POSTPROC.
+VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/postproc.h common/postproc.c \
+                                           common/mfqe.h \
+                                           common/mfqe.c
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm \
+                                 common/x86/postproc_sse2.asm
+endif  # CONFIG_VP9_POSTPROC == yes
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP10_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/itrans4_dspr2.c \
+                                  common/mips/dspr2/itrans8_dspr2.c \
+                                  common/mips/dspr2/itrans16_dspr2.c
+endif  # CONFIG_VP9_HIGHBITDEPTH != yes
+
+# MIPS MSA entries
+VP10_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct4x4_msa.c \
+                                common/mips/msa/idct8x8_msa.c \
+                                common/mips/msa/idct16x16_msa.c
+
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP10_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
+endif  # CONFIG_VP9_POSTPROC == yes
+
+# x86 SSE2 entries
+VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c common/x86/vp10_fwd_txfm_sse2.c \
+                                 common/x86/vp10_fwd_dct32x32_impl_sse2.h \
+                                 common/x86/vp10_fwd_txfm_impl_sse2.h
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht4x4_add_neon.c \
+                                 common/arm/neon/iht8x8_add_neon.c
+endif  # CONFIG_VP9_HIGHBITDEPTH != yes
+
+VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_inv_txfm_sse2.c common/x86/vp10_inv_txfm_sse2.h
+
+# rtcd_h_template is not defined in this file; it must be provided by the including makefile.
+$(eval $(call rtcd_h_template,vp10_rtcd,vp10/common/vp10_rtcd_defs.pl))
diff --git a/libs/libvpx/vp10/vp10_cx_iface.c b/libs/libvpx/vp10/vp10_cx_iface.c
new file mode 100644
index 0000000000..63d3adc1ff
--- /dev/null
+++ b/libs/libvpx/vp10/vp10_cx_iface.c
@@ -0,0 +1,1395 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_ports/vpx_once.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "./vpx_version.h"
+#include "vp10/encoder/encoder.h"
+#include "vpx/vp8cx.h"
+#include "vp10/encoder/firstpass.h"
+#include "vp10/vp10_iface_common.h"
+
+struct vp10_extracfg {
+  int                         cpu_used;  // available cpu percentage in 1/16
+  unsigned int                enable_auto_alt_ref;  // validated to 0..2
+  unsigned int                noise_sensitivity;    // validated to 0..6
+  unsigned int                sharpness;            // validated to 0..7
+  unsigned int                static_thresh;  // copied to encode_breakout
+  unsigned int                tile_columns;         // validated to 0..6
+  unsigned int                tile_rows;            // validated to 0..2
+  unsigned int                arnr_max_frames;      // validated to 0..15
+  unsigned int                arnr_strength;        // validated to 0..6
+  unsigned int                min_gf_interval;  // 0 -> default decision
+  unsigned int                max_gf_interval;  // 0 -> default, else >= 2
+  vp8e_tuning                 tuning;  // VP8_TUNE_SSIM is rejected
+  unsigned int                cq_level;  // constrained quality level
+  unsigned int                rc_max_intra_bitrate_pct;
+  unsigned int                rc_max_inter_bitrate_pct;
+  unsigned int                gf_cbr_boost_pct;
+  unsigned int                lossless;  // boolean (0 or 1)
+  unsigned int                frame_parallel_decoding_mode;
+  AQ_MODE                     aq_mode;  // validated to 0..AQ_MODE_COUNT - 1
+  unsigned int                frame_periodic_boost;  // validated to 0..1
+  vpx_bit_depth_t             bit_depth;
+  vp9e_tune_content           content;
+  vpx_color_space_t           color_space;
+  int                         color_range;  // validated to 0..1
+  int                         render_width;
+  int                         render_height;
+};
+
+static struct vp10_extracfg default_extra_cfg = {
+  0,                          // cpu_used
+  1,                          // enable_auto_alt_ref
+  0,                          // noise_sensitivity
+  0,                          // sharpness
+  0,                          // static_thresh
+  6,                          // tile_columns
+  0,                          // tile_rows
+  7,                          // arnr_max_frames
+  5,                          // arnr_strength
+  0,                          // min_gf_interval; 0 -> default decision
+  0,                          // max_gf_interval; 0 -> default decision
+  VP8_TUNE_PSNR,              // tuning
+  10,                         // cq_level
+  0,                          // rc_max_intra_bitrate_pct
+  0,                          // rc_max_inter_bitrate_pct
+  0,                          // gf_cbr_boost_pct
+  0,                          // lossless
+  1,                          // frame_parallel_decoding_mode
+  NO_AQ,                      // aq_mode
+  0,                          // frame_periodic_boost
+  VPX_BITS_8,                 // bit_depth
+  VP9E_CONTENT_DEFAULT,       // content
+  VPX_CS_UNKNOWN,             // color_space
+  0,                          // color_range
+  0,                          // render_width
+  0,                          // render_height
+};
+
+struct vpx_codec_alg_priv {
+  vpx_codec_priv_t        base;
+  vpx_codec_enc_cfg_t     cfg;  // internal copy of the caller's config
+  struct vp10_extracfg    extra_cfg;  // settings changed via ctrl_set_*()
+  VP10EncoderConfig       oxcf;  // filled in by set_encoder_config()
+  VP10_COMP               *cpi;  // from vp10_create_compressor()
+  unsigned char           *cx_data;  // released with free() on destroy
+  size_t                  cx_data_sz;
+  unsigned char           *pending_cx_data;
+  size_t                  pending_cx_data_sz;
+  int                     pending_frame_count;  // asserted to be 1..8
+  size_t                  pending_frame_sizes[8];
+#if !CONFIG_MISC_FIXES
+  size_t                  pending_frame_magnitude;
+#endif
+  vpx_image_t             preview_img;
+  vpx_enc_frame_flags_t   next_frame_flags;  // e.g. VPX_EFLAG_FORCE_KF
+  vp8_postproc_cfg_t      preview_ppcfg;
+  vpx_codec_pkt_list_decl(256) pkt_list;
+  unsigned int            fixed_kf_cntr;
+  vpx_codec_priv_output_cx_pkt_cb_pair_t output_cx_pkt_cb;
+  // BufferPool that holds all reference frames.
+  BufferPool              *buffer_pool;
+};
+
+// Maps a public vpx_ref_frame_type_t value onto the matching VP9_*_FLAG.
+// Any other value trips the assert and falls back to VP9_LAST_FLAG.
+static VP9_REFFRAME ref_frame_to_vp10_reframe(vpx_ref_frame_type_t frame) {
+  if (frame == VP8_LAST_FRAME)
+    return VP9_LAST_FLAG;
+  if (frame == VP8_GOLD_FRAME)
+    return VP9_GOLD_FLAG;
+  if (frame == VP8_ALTR_FRAME)
+    return VP9_ALT_FLAG;
+  assert(0 && "Invalid Reference Frame");
+  return VP9_LAST_FLAG;
+}
+
+// Returns error->error_code; on failure it also publishes the detail text
+// (NULL when none is attached) through ctx->base.err_detail.
+static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
+    const struct vpx_internal_error_info *error) {
+  const vpx_codec_err_t code = error->error_code;
+  if (code == VPX_CODEC_OK) return code;
+  ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+  return code;
+}
+
+// ERROR(str) needs a `ctx` in scope and returns from the enclosing function.
+#undef ERROR
+#define ERROR(str) do {\
+    ctx->base.err_detail = str;\
+    return VPX_CODEC_INVALID_PARAM;\
+  } while (0)
+// Range checks built on ERROR(); every macro argument is parenthesized.
+#define RANGE_CHECK(p, memb, lo, hi) do {\
+    if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \
+      ERROR(#memb " out of range ["#lo".."#hi"]");\
+  } while (0)
+
+#define RANGE_CHECK_HI(p, memb, hi) do {\
+    if (!((p)->memb <= (hi))) \
+      ERROR(#memb " out of range [.."#hi"]");\
+  } while (0)
+
+#define RANGE_CHECK_LO(p, memb, lo) do {\
+    if (!((p)->memb >= (lo))) \
+      ERROR(#memb " out of range ["#lo"..]");\
+  } while (0)
+
+#define RANGE_CHECK_BOOL(p, memb) do {\
+    if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+  } while (0)
+
+static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
+                                       const vpx_codec_enc_cfg_t *cfg,
+                                       const struct vp10_extracfg *extra_cfg) {
+  RANGE_CHECK(cfg, g_w,                   1, 65535);  // 16 bits available
+  RANGE_CHECK(cfg, g_h,                   1, 65535);  // 16 bits available
+  RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
+  RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
+  RANGE_CHECK_HI(cfg, g_profile,          3);
+
+  RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
+  RANGE_CHECK_HI(cfg, rc_min_quantizer,   cfg->rc_max_quantizer);
+  RANGE_CHECK_BOOL(extra_cfg, lossless);
+  RANGE_CHECK(extra_cfg, aq_mode,           0, AQ_MODE_COUNT - 1);
+  RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1);
+  RANGE_CHECK_HI(cfg, g_threads,          64);
+  RANGE_CHECK_HI(cfg, g_lag_in_frames,    MAX_LAG_BUFFERS);
+  RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_Q);
+  RANGE_CHECK_HI(cfg, rc_undershoot_pct,  100);
+  RANGE_CHECK_HI(cfg, rc_overshoot_pct,   100);
+  RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+  RANGE_CHECK(cfg, kf_mode,               VPX_KF_DISABLED, VPX_KF_AUTO);
+  RANGE_CHECK_BOOL(cfg,                   rc_resize_allowed);
+  RANGE_CHECK_HI(cfg, rc_dropframe_thresh,   100);
+  RANGE_CHECK_HI(cfg, rc_resize_up_thresh,   100);
+  RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
+  RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
+  RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
+  RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
+  if (extra_cfg->max_gf_interval > 0) {  // 0 means "default decision"
+    RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1));
+  }
+  if (extra_cfg->min_gf_interval > 0 && extra_cfg->max_gf_interval > 0) {
+    RANGE_CHECK(extra_cfg, max_gf_interval, extra_cfg->min_gf_interval,
+      (MAX_LAG_BUFFERS - 1));
+  }
+
+  if (cfg->rc_resize_allowed == 1) {  // scaled size may not exceed g_w x g_h
+    RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w);
+    RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h);
+  }
+
+  // Spatial/temporal scalability are not yet supported in VP10.
+  // Only accept the default value for range checking.
+  RANGE_CHECK(cfg, ss_number_layers, 1, 1);
+  RANGE_CHECK(cfg, ts_number_layers, 1, 1);
+  // VP9 does not support a lower bound on the keyframe interval in
+  // automatic keyframe placement mode.
+  if (cfg->kf_mode != VPX_KF_DISABLED &&
+      cfg->kf_min_dist != cfg->kf_max_dist &&
+      cfg->kf_min_dist > 0)
+    ERROR("kf_min_dist not supported in auto mode, use 0 "
+          "or kf_max_dist instead.");
+
+  RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
+  RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
+  RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
+  RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
+  RANGE_CHECK(extra_cfg, tile_rows, 0, 2);
+  RANGE_CHECK_HI(extra_cfg, sharpness, 7);
+  RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15);
+  RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
+  RANGE_CHECK(extra_cfg, cq_level, 0, 63);
+  RANGE_CHECK(cfg, g_bit_depth, VPX_BITS_8, VPX_BITS_12);
+  RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
+  RANGE_CHECK(extra_cfg, content,
+              VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1);
+
+  // TODO(yaowu): remove this when ssim tuning is implemented for vp9
+  if (extra_cfg->tuning == VP8_TUNE_SSIM)
+      ERROR("Option --tune=ssim is not currently supported in VP9.");
+
+  if (cfg->g_pass == VPX_RC_LAST_PASS) {  // needs complete first-pass stats
+    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
+    const FIRSTPASS_STATS *stats;
+
+    if (cfg->rc_twopass_stats_in.buf == NULL)
+      ERROR("rc_twopass_stats_in.buf not set.");
+
+    if (cfg->rc_twopass_stats_in.sz % packet_sz)
+      ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
+
+    if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
+      ERROR("rc_twopass_stats_in requires at least two packets.");
+
+    stats =
+        (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + n_packets - 1;
+    // Last packet: its rounded count field must equal n_packets - 1.
+    if ((int)(stats->count + 0.5) != n_packets - 1)
+      ERROR("rc_twopass_stats_in missing EOS stats packet");
+  }
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+  if (cfg->g_profile > (unsigned int)PROFILE_1) {
+    ERROR("Profile > 1 not supported in this build configuration");
+  }
+#endif
+  if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+      cfg->g_bit_depth > VPX_BITS_8) {
+    ERROR("Codec high bit-depth not supported in profile < 2");
+  }
+  if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+      cfg->g_input_bit_depth > 8) {
+    ERROR("Source high bit-depth not supported in profile < 2");
+  }
+  if (cfg->g_profile > (unsigned int)PROFILE_1 &&
+      cfg->g_bit_depth == VPX_BITS_8) {
+    ERROR("Codec bit-depth 8 not supported in profile > 1");
+  }
+  RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB);
+  RANGE_CHECK(extra_cfg, color_range, 0, 1);
+  return VPX_CODEC_OK;  // every check above passed
+}
+
+// Accepts img only if its pixel format is allowed for the configured
+// profile and its display size (d_w x d_h) equals the configured g_w x g_h.
+// Any violation is reported through ERROR().
+static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
+                                    const vpx_image_t *img) {
+  switch (img->fmt) {
+    case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I42016:
+      break;
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I440:
+      if (ctx->cfg.g_profile != (unsigned int)PROFILE_1)
+        ERROR("Invalid image format. I422, I444, I440 images are "
+              "not supported in profile.");
+      break;
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+    case VPX_IMG_FMT_I44016:
+      if (ctx->cfg.g_profile != (unsigned int)PROFILE_1 &&
+          ctx->cfg.g_profile != (unsigned int)PROFILE_3)
+        ERROR("Invalid image format. 16-bit I422, I444, I440 images are "
+              "not supported in profile.");
+      break;
+    default:
+      ERROR("Invalid image format. Only YV12, I420, I422, I444 images are "
+            "supported.");
+  }
+
+  if (!(img->d_w == ctx->cfg.g_w && img->d_h == ctx->cfg.g_h))
+    ERROR("Image size must match encoder init configuration size");
+
+  return VPX_CODEC_OK;
+}
+
+static int get_image_bps(const vpx_image_t *img) {
+  switch (img->fmt) {
+    case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_I420: return 12;
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I440: return 16;
+    case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I42016: return 24;
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44016: return 32;
+    case VPX_IMG_FMT_I44416: return 48;
+    default: assert(0 && "Invalid image format"); break;
+  }
+  return 0;  // only reached for a format not listed above
+}
+
+static vpx_codec_err_t set_encoder_config(
+  VP10EncoderConfig *oxcf,
+  const vpx_codec_enc_cfg_t *cfg,
+  const struct vp10_extracfg *extra_cfg) {
+  const int is_vbr = cfg->rc_end_usage == VPX_VBR;
+  oxcf->profile = cfg->g_profile;
+  oxcf->max_threads = (int)cfg->g_threads;
+  oxcf->width   = cfg->g_w;
+  oxcf->height  = cfg->g_h;
+  oxcf->bit_depth = cfg->g_bit_depth;
+  oxcf->input_bit_depth = cfg->g_input_bit_depth;
+  // Frame rate is den/num of the timebase; anything above 180 becomes 30.
+  oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+  if (oxcf->init_framerate > 180)
+    oxcf->init_framerate = 30;
+
+  oxcf->mode = GOOD;  // pick_quickcompress_mode() may change this later
+
+  switch (cfg->g_pass) {
+    case VPX_RC_ONE_PASS:
+      oxcf->pass = 0;
+      break;
+    case VPX_RC_FIRST_PASS:
+      oxcf->pass = 1;
+      break;
+    case VPX_RC_LAST_PASS:
+      oxcf->pass = 2;
+      break;
+  }
+
+  oxcf->lag_in_frames = cfg->g_pass == VPX_RC_FIRST_PASS ? 0
+                                                         : cfg->g_lag_in_frames;
+  oxcf->rc_mode = cfg->rc_end_usage;
+
+  // Convert target bandwidth from Kbit/s to Bit/s
+  oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
+  oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
+  oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
+  oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
+
+  oxcf->best_allowed_q =
+      extra_cfg->lossless ? 0 : vp10_quantizer_to_qindex(cfg->rc_min_quantizer);
+  oxcf->worst_allowed_q =
+      extra_cfg->lossless ? 0 : vp10_quantizer_to_qindex(cfg->rc_max_quantizer);
+  oxcf->cq_level        = vp10_quantizer_to_qindex(extra_cfg->cq_level);
+  oxcf->fixed_q = -1;
+
+  oxcf->under_shoot_pct         = cfg->rc_undershoot_pct;
+  oxcf->over_shoot_pct          = cfg->rc_overshoot_pct;
+
+  oxcf->scaled_frame_width  = cfg->rc_scaled_width;
+  oxcf->scaled_frame_height = cfg->rc_scaled_height;
+  if (cfg->rc_resize_allowed == 1) {  // a zero scaled dimension -> dynamic
+    oxcf->resize_mode =
+        (oxcf->scaled_frame_width == 0 || oxcf->scaled_frame_height == 0) ?
+            RESIZE_DYNAMIC : RESIZE_FIXED;
+  } else {
+    oxcf->resize_mode = RESIZE_NONE;
+  }
+  // VBR uses fixed buffer levels; other modes take them from cfg.
+  oxcf->maximum_buffer_size_ms   = is_vbr ? 240000 : cfg->rc_buf_sz;
+  oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
+  oxcf->optimal_buffer_level_ms  = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
+
+  oxcf->drop_frames_water_mark   = cfg->rc_dropframe_thresh;
+
+  oxcf->two_pass_vbrbias         = cfg->rc_2pass_vbr_bias_pct;
+  oxcf->two_pass_vbrmin_section  = cfg->rc_2pass_vbr_minsection_pct;
+  oxcf->two_pass_vbrmax_section  = cfg->rc_2pass_vbr_maxsection_pct;
+  // Auto keyframes need VPX_KF_AUTO and kf_min_dist != kf_max_dist.
+  oxcf->auto_key               = cfg->kf_mode == VPX_KF_AUTO &&
+                                 cfg->kf_min_dist != cfg->kf_max_dist;
+
+  oxcf->key_freq               = cfg->kf_max_dist;
+
+  oxcf->speed                  =  abs(extra_cfg->cpu_used);  // sign dropped
+  oxcf->encode_breakout        =  extra_cfg->static_thresh;
+  oxcf->enable_auto_arf        =  extra_cfg->enable_auto_alt_ref;
+  oxcf->noise_sensitivity      =  extra_cfg->noise_sensitivity;
+  oxcf->sharpness              =  extra_cfg->sharpness;
+
+  oxcf->two_pass_stats_in      =  cfg->rc_twopass_stats_in;
+
+#if CONFIG_FP_MB_STATS
+  oxcf->firstpass_mb_stats_in  = cfg->rc_firstpass_mb_stats_in;
+#endif
+
+  oxcf->color_space = extra_cfg->color_space;
+  oxcf->color_range = extra_cfg->color_range;
+  oxcf->render_width  = extra_cfg->render_width;
+  oxcf->render_height = extra_cfg->render_height;
+  oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
+  oxcf->arnr_strength   = extra_cfg->arnr_strength;
+  oxcf->min_gf_interval = extra_cfg->min_gf_interval;
+  oxcf->max_gf_interval = extra_cfg->max_gf_interval;
+
+  oxcf->tuning = extra_cfg->tuning;
+  oxcf->content = extra_cfg->content;
+
+  oxcf->tile_columns = extra_cfg->tile_columns;
+  oxcf->tile_rows    = extra_cfg->tile_rows;
+
+  oxcf->error_resilient_mode         = cfg->g_error_resilient;
+  oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
+
+  oxcf->aq_mode = extra_cfg->aq_mode;
+
+  oxcf->frame_periodic_boost =  extra_cfg->frame_periodic_boost;
+
+  /*
+  printf("Current VP9 Settings: \n");
+  printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
+  printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
+  printf("sharpness: %d\n",    oxcf->sharpness);
+  printf("cpu_used: %d\n",  oxcf->cpu_used);
+  printf("Mode: %d\n",     oxcf->mode);
+  printf("auto_key: %d\n",  oxcf->auto_key);
+  printf("key_freq: %d\n", oxcf->key_freq);
+  printf("end_usage: %d\n", oxcf->end_usage);
+  printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
+  printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
+  printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
+  printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);
+  printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
+  printf("fixed_q: %d\n",  oxcf->fixed_q);
+  printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
+  printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
+  printf("allow_spatial_resampling: %d\n", oxcf->allow_spatial_resampling);
+  printf("scaled_frame_width: %d\n", oxcf->scaled_frame_width);
+  printf("scaled_frame_height: %d\n", oxcf->scaled_frame_height);
+  printf("two_pass_vbrbias: %d\n",  oxcf->two_pass_vbrbias);
+  printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
+  printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
+  printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
+  printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf);
+  printf("Version: %d\n", oxcf->Version);
+  printf("encode_breakout: %d\n", oxcf->encode_breakout);
+  printf("error resilient: %d\n", oxcf->error_resilient_mode);
+  printf("frame parallel detokenization: %d\n",
+         oxcf->frame_parallel_decoding_mode);
+  */
+  return VPX_CODEC_OK;  // the only value this function returns
+}
+
+static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
+                                          const vpx_codec_enc_cfg_t  *cfg) {
+  vpx_codec_err_t res;
+  int force_key = 0;  // set when the next frame must be a key frame
+
+  if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
+    if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
+      ERROR("Cannot change width or height after initialization");
+    if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) ||
+        (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+        (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+      force_key = 1;
+  }
+
+  // Prevent increasing lag_in_frames. This check is stricter than it needs
+  // to be -- the limit is not increasing past the first lag_in_frames
+  // value, but we don't track the initial config, only the last successful
+  // config.
+  if (cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames)
+    ERROR("Cannot increase lag_in_frames");
+
+  res = validate_config(ctx, cfg, &ctx->extra_cfg);
+
+  if (res == VPX_CODEC_OK) {  // commit only a fully validated config
+    ctx->cfg = *cfg;
+    set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+    // On profile change, request a key frame
+    force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
+    vp10_change_config(ctx->cpi, &ctx->oxcf);
+  }
+  // NOTE(review): applied even when validate_config() rejected cfg.
+  if (force_key)
+    ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF;
+
+  return res;
+}
+
+// Control handler: stores vp10_get_quantizer(ctx->cpi) in the caller's int.
+static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  int *const out_q = va_arg(args, int *);
+  if (out_q == NULL) return VPX_CODEC_INVALID_PARAM;
+  *out_q = vp10_get_quantizer(ctx->cpi);
+  return VPX_CODEC_OK;
+}
+
+// Like ctrl_get_quantizer, but passed through vp10_qindex_to_quantizer().
+static vpx_codec_err_t ctrl_get_quantizer64(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  int *const out_q = va_arg(args, int *);
+  if (out_q == NULL) return VPX_CODEC_INVALID_PARAM;
+  *out_q = vp10_qindex_to_quantizer(vp10_get_quantizer(ctx->cpi));
+  return VPX_CODEC_OK;
+}
+
+// Validates extra_cfg against ctx->cfg; stores and applies it on success.
+static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx,
+                                        const struct vp10_extracfg *extra_cfg) {
+  const vpx_codec_err_t status = validate_config(ctx, &ctx->cfg, extra_cfg);
+  if (status != VPX_CODEC_OK) return status;
+  ctx->extra_cfg = *extra_cfg;
+  set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+  vp10_change_config(ctx->cpi, &ctx->oxcf);
+  return status;
+}
+
+static vpx_codec_err_t ctrl_set_cpuused(vpx_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.cpu_used = CAST(VP8E_SET_CPUUSED, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_enable_auto_alt_ref(vpx_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.enable_auto_alt_ref = CAST(VP8E_SET_ENABLEAUTOALTREF, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_noise_sensitivity(vpx_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.noise_sensitivity = CAST(VP9E_SET_NOISE_SENSITIVITY, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_sharpness(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.sharpness = CAST(VP8E_SET_SHARPNESS, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_static_thresh(vpx_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.static_thresh = CAST(VP8E_SET_STATIC_THRESHOLD, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_tile_columns(vpx_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.tile_columns = CAST(VP9E_SET_TILE_COLUMNS, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_tile_rows(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.tile_rows = CAST(VP9E_SET_TILE_ROWS, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_arnr_max_frames(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.arnr_max_frames = CAST(VP8E_SET_ARNR_MAXFRAMES, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_arnr_strength(vpx_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.arnr_strength = CAST(VP8E_SET_ARNR_STRENGTH, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_arnr_type(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  (void)args;  // accepted but ignored: nothing is read or stored here
+  (void)ctx;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_tuning(vpx_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.tuning = CAST(VP8E_SET_TUNING, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_cq_level(vpx_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.cq_level = CAST(VP8E_SET_CQ_LEVEL, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+// Control handler: updates rc_max_intra_bitrate_pct via update_extra_cfg().
+static vpx_codec_err_t ctrl_set_rc_max_intra_bitrate_pct(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;
+  next.rc_max_intra_bitrate_pct = CAST(VP8E_SET_MAX_INTRA_BITRATE_PCT, args);
+  return update_extra_cfg(ctx, &next);
+}
+
+// Control handler: updates rc_max_inter_bitrate_pct via update_extra_cfg().
+static vpx_codec_err_t ctrl_set_rc_max_inter_bitrate_pct(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;
+  next.rc_max_inter_bitrate_pct = CAST(VP8E_SET_MAX_INTER_BITRATE_PCT, args);
+  return update_extra_cfg(ctx, &next);
+}
+
+// Control handler: updates gf_cbr_boost_pct via update_extra_cfg().
+static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;
+  next.gf_cbr_boost_pct = CAST(VP9E_SET_GF_CBR_BOOST_PCT, args);
+  return update_extra_cfg(ctx, &next);
+}
+
+static vpx_codec_err_t ctrl_set_lossless(vpx_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.lossless = CAST(VP9E_SET_LOSSLESS, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_frame_parallel_decoding_mode(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.frame_parallel_decoding_mode =
+      CAST(VP9E_SET_FRAME_PARALLEL_DECODING, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.aq_mode = CAST(VP9E_SET_AQ_MODE, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_min_gf_interval(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.min_gf_interval = CAST(VP9E_SET_MIN_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_max_gf_interval(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.max_gf_interval = CAST(VP9E_SET_MAX_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t ctrl_set_frame_periodic_boost(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct vp10_extracfg next = ctx->extra_cfg;  // edit a copy of the settings
+  next.frame_periodic_boost = CAST(VP9E_SET_FRAME_PERIODIC_BOOST, args);
+  return update_extra_cfg(ctx, &next);  // validates before committing
+}
+
+static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
+                                    vpx_codec_priv_enc_mr_cfg_t *data) {
+  vpx_codec_err_t res = VPX_CODEC_OK;
+  (void)data;  // unused by this encoder
+
+  if (ctx->priv == NULL) {  // set up private state only once per context
+    vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv));
+    if (priv == NULL)
+      return VPX_CODEC_MEM_ERROR;
+
+    ctx->priv = (vpx_codec_priv_t *)priv;
+    ctx->priv->init_flags = ctx->init_flags;
+    ctx->priv->enc.total_encoders = 1;
+    priv->buffer_pool =
+        (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
+    if (priv->buffer_pool == NULL)
+      return VPX_CODEC_MEM_ERROR;  // NOTE(review): priv stays on ctx->priv
+
+#if CONFIG_MULTITHREAD
+    if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) {
+      return VPX_CODEC_MEM_ERROR;
+    }
+#endif
+
+    if (ctx->config.enc) {
+      // Update the reference to the config structure to an internal copy.
+      priv->cfg = *ctx->config.enc;
+      ctx->config.enc = &priv->cfg;
+    }
+
+    priv->extra_cfg = default_extra_cfg;
+    once(vp10_initialize_enc);
+
+    res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
+
+    if (res == VPX_CODEC_OK) {  // create the compressor for valid configs only
+      set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+#if CONFIG_VP9_HIGHBITDEPTH
+      priv->oxcf.use_highbitdepth =
+          (ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+#endif
+      priv->cpi = vp10_create_compressor(&priv->oxcf, priv->buffer_pool);
+      if (priv->cpi == NULL)
+        res = VPX_CODEC_MEM_ERROR;
+      else
+        priv->cpi->output_pkt_list = &priv->pkt_list.head;
+    }
+  }
+
+  return res;  // VPX_CODEC_OK when ctx->priv already existed
+}
+
+static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
+  free(ctx->cx_data);
+  vp10_remove_compressor(ctx->cpi);
+#if CONFIG_MULTITHREAD  // buffer_pool is NULL if encoder_init's calloc failed
+  if (ctx->buffer_pool) pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+  vpx_free(ctx->buffer_pool);
+  vpx_free(ctx);  // ctx is the priv block vpx_calloc'ed by encoder_init
+  return VPX_CODEC_OK;
+}
+
+// Chooses BEST/GOOD/REALTIME from the pass type and the caller's deadline,
+// and applies it through vp10_change_config() only when the mode changes.
+static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+                                    unsigned long duration,
+                                    unsigned long deadline) {
+  MODE new_mode = BEST;  // also the result for VPX_RC_FIRST_PASS
+
+  switch (ctx->cfg.g_pass) {
+    case VPX_RC_ONE_PASS:
+      if (deadline > 0) {
+        const vpx_rational_t *const tb = &ctx->cfg.g_timebase;
+
+        // Frame duration in microseconds rather than timebase units.
+        const uint64_t duration_us = (uint64_t)duration * 1000000 *
+           (uint64_t)tb->num / (uint64_t)tb->den;
+
+        // A deadline longer than the frame's display time selects good
+        // quality; anything tighter selects realtime.
+        new_mode = (deadline > duration_us) ? GOOD : REALTIME;
+      }
+      break;
+    case VPX_RC_LAST_PASS:
+      if (deadline > 0) new_mode = GOOD;
+      break;
+    case VPX_RC_FIRST_PASS:
+      break;
+  }
+
+  if (ctx->oxcf.mode == new_mode)
+    return;
+  ctx->oxcf.mode = new_mode;
+  vp10_change_config(ctx->cpi, &ctx->oxcf);
+}
+
+// Turn on to test if supplemental superframe data breaks decoding
+// #define TEST_SUPPLEMENTAL_SUPERFRAME_DATA
+static int write_superframe_index(vpx_codec_alg_priv_t *ctx) {
+  uint8_t marker = 0xc0;  // low 3 bits: frame count - 1; mag goes at bit 3
+  unsigned int mask;
+  int mag, index_sz;
+#if CONFIG_MISC_FIXES
+  int i;
+  size_t max_frame_sz = 0;
+#endif
+
+  assert(ctx->pending_frame_count);
+  assert(ctx->pending_frame_count <= 8);
+
+  // Add the number of frames to the marker byte
+  marker |= ctx->pending_frame_count - 1;
+#if CONFIG_MISC_FIXES
+  for (i = 0; i < ctx->pending_frame_count - 1; i++) {
+    const size_t frame_sz = (unsigned int) ctx->pending_frame_sizes[i] - 1;
+    max_frame_sz = frame_sz > max_frame_sz ? frame_sz : max_frame_sz;
+  }
+#endif
+
+  // Choose the magnitude: each frame size is stored in (mag + 1) bytes
+  for (mag = 0, mask = 0xff; mag < 4; mag++) {
+#if CONFIG_MISC_FIXES
+    if (max_frame_sz <= mask)
+      break;
+#else
+    if (ctx->pending_frame_magnitude < mask)
+      break;
+#endif
+    mask <<= 8;
+    mask |= 0xff;
+  }
+  marker |= mag << 3;
+
+  // Write the index: marker, (mag + 1) bytes per stored size, marker.
+  index_sz = 2 + (mag + 1) * (ctx->pending_frame_count - CONFIG_MISC_FIXES);
+  if (ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz) {
+    uint8_t *x = ctx->pending_cx_data + ctx->pending_cx_data_sz;
+    int i, j;
+#ifdef TEST_SUPPLEMENTAL_SUPERFRAME_DATA
+    uint8_t marker_test = 0xc0;
+    int mag_test = 2;     // 1 - 4
+    int frames_test = 4;  // 1 - 8
+    int index_sz_test = 2 + mag_test * frames_test;
+    marker_test |= frames_test - 1;
+    marker_test |= (mag_test - 1) << 3;
+    *x++ = marker_test;
+    for (i = 0; i < mag_test * frames_test; ++i)
+      *x++ = 0;  // fill up with arbitrary data
+    *x++ = marker_test;
+    ctx->pending_cx_data_sz += index_sz_test;
+    printf("Added supplemental superframe data\n");
+#endif
+
+    *x++ = marker;
+    for (i = 0; i < ctx->pending_frame_count - CONFIG_MISC_FIXES; i++) {
+      unsigned int this_sz;
+
+      assert(ctx->pending_frame_sizes[i] > 0);
+      this_sz = (unsigned int)ctx->pending_frame_sizes[i] - CONFIG_MISC_FIXES;
+      for (j = 0; j <= mag; j++) {  // least significant byte first
+        *x++ = this_sz & 0xff;
+        this_sz >>= 8;
+      }
+    }
+    *x++ = marker;
+    ctx->pending_cx_data_sz += index_sz;
+#ifdef TEST_SUPPLEMENTAL_SUPERFRAME_DATA
+    index_sz += index_sz_test;
+#endif
+  }
+  return index_sz;  // returned even when the index did not fit
+}
+
+// vp9 uses 10,000,000 ticks/second as time stamp
+#define TICKS_PER_SEC 10000000LL
+// Converts |n| timebase units to ticks: n * TICKS_PER_SEC * num / den.
+static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase,
+                                       int64_t n) {
+  return n * TICKS_PER_SEC * timebase->num / timebase->den;
+}
+
+static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase,
+                                       int64_t n) {
+  const int64_t bias = TICKS_PER_SEC * timebase->num / 2 - 1;  // rounding term
+  return (n * timebase->den + bias) / timebase->num / TICKS_PER_SEC;
+}
+
+static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP10_COMP *cpi,
+                                                   unsigned int lib_flags) {
+  // Library flags are shifted up by 16 bits; the public key-frame and
+  // droppable markers are then OR-ed into the result.
+  const vpx_codec_frame_flags_t key_bit =
+      (lib_flags & FRAMEFLAGS_KEY) ? VPX_FRAME_IS_KEY : 0;
+  const vpx_codec_frame_flags_t droppable_bit =
+      cpi->droppable ? VPX_FRAME_IS_DROPPABLE : 0;
+
+  return (vpx_codec_frame_flags_t)(lib_flags << 16) |
+         key_bit | droppable_bit;
+}
+
+static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
+                                      const vpx_image_t *img,
+                                      vpx_codec_pts_t pts,
+                                      unsigned long duration,
+                                      vpx_enc_frame_flags_t flags,
+                                      unsigned long deadline) {
+  vpx_codec_err_t res = VPX_CODEC_OK;
+  VP10_COMP *const cpi = ctx->cpi;
+  const vpx_rational_t *const timebase = &ctx->cfg.g_timebase;
+  size_t data_sz;
+  // |img| may be NULL; then no new frame is submitted, only output is drained.
+  if (img != NULL) {
+    res = validate_img(ctx, img);
+    // TODO(jzern) the checks related to cpi's validity should be treated as a
+    // failure condition, encoder setup is done fully in init() currently.
+    if (res == VPX_CODEC_OK && cpi != NULL) {
+      // There's no codec control for multiple alt-refs so check the encoder
+      // instance for its status to determine the compressed data size.
+      data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 *
+                (cpi->multi_arf_allowed ? 8 : 2);
+      if (data_sz < 4096)
+        data_sz = 4096;
+      if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
+        ctx->cx_data_sz = data_sz;
+        free(ctx->cx_data);
+        ctx->cx_data = (unsigned char*)malloc(ctx->cx_data_sz);
+        if (ctx->cx_data == NULL) {
+          return VPX_CODEC_MEM_ERROR;
+        }
+      }
+    }
+  }
+
+  pick_quickcompress_mode(ctx, duration, deadline);
+  vpx_codec_pkt_list_init(&ctx->pkt_list);
+
+  // Handle Flags
+  if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF)) ||
+       ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF))) {
+    ctx->base.err_detail = "Conflicting flags.";
+    return VPX_CODEC_INVALID_PARAM;
+  }
+
+  vp10_apply_encoding_flags(cpi, flags);
+
+  // Handle fixed keyframe intervals
+  if (ctx->cfg.kf_mode == VPX_KF_AUTO &&
+      ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
+    if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
+      flags |= VPX_EFLAG_FORCE_KF;
+      ctx->fixed_kf_cntr = 1;
+    }
+  }
+
+  // Submit the frame (if any) and collect the compressed output.
+  if (res == VPX_CODEC_OK && cpi != NULL) {
+    unsigned int lib_flags = 0;
+    YV12_BUFFER_CONFIG sd;
+    int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
+    int64_t dst_end_time_stamp =
+        timebase_units_to_ticks(timebase, pts + duration);
+    size_t size, cx_data_sz;
+    unsigned char *cx_data;
+
+    // Set up internal flags
+    if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
+      cpi->b_calculate_psnr = 1;
+
+    if (img != NULL) {
+      res = image2yuvconfig(img, &sd);
+
+      // Store the original flags in to the frame buffer. Will extract the
+      // key frame flag when we actually encode this frame.
+      if (vp10_receive_raw_frame(cpi, flags | ctx->next_frame_flags,
+                                &sd, dst_time_stamp, dst_end_time_stamp)) {
+        res = update_error_state(ctx, &cpi->common.error);
+      }
+      ctx->next_frame_flags = 0;
+    }
+
+    cx_data = ctx->cx_data;
+    cx_data_sz = ctx->cx_data_sz;
+
+    /* Any pending invisible frames? */
+    if (ctx->pending_cx_data) {
+      memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz);
+      ctx->pending_cx_data = cx_data;
+      cx_data += ctx->pending_cx_data_sz;
+      cx_data_sz -= ctx->pending_cx_data_sz;
+
+      /* TODO: this is a minimal check, the underlying codec doesn't respect
+       * the buffer size anyway.
+       */
+      if (cx_data_sz < ctx->cx_data_sz / 2) {
+        ctx->base.err_detail = "Compressed data buffer too small";
+        return VPX_CODEC_ERROR;
+      }
+    }
+
+    while (cx_data_sz >= ctx->cx_data_sz / 2 &&
+           -1 != vp10_get_compressed_data(cpi, &lib_flags, &size,
+                                         cx_data, &dst_time_stamp,
+                                         &dst_end_time_stamp, !img)) {
+      if (size) {
+        vpx_codec_cx_pkt_t pkt;
+
+        // Pack invisible frames with the next visible frame
+        if (!cpi->common.show_frame) {
+          if (ctx->pending_cx_data == 0)
+            ctx->pending_cx_data = cx_data;
+          ctx->pending_cx_data_sz += size;
+          ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+#if !CONFIG_MISC_FIXES
+          ctx->pending_frame_magnitude |= size;
+#endif
+          cx_data += size;
+          cx_data_sz -= size;
+
+          if (ctx->output_cx_pkt_cb.output_cx_pkt) {
+            pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+            pkt.data.frame.pts = ticks_to_timebase_units(timebase,
+                                                         dst_time_stamp);
+            pkt.data.frame.duration =
+               (unsigned long)ticks_to_timebase_units(timebase,
+                   dst_end_time_stamp - dst_time_stamp);
+            pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+            pkt.data.frame.buf = ctx->pending_cx_data;
+            pkt.data.frame.sz  = size;  // NOTE(review): partition_id is left unset on this path
+            ctx->pending_cx_data = NULL;
+            ctx->pending_cx_data_sz = 0;
+            ctx->pending_frame_count = 0;
+#if !CONFIG_MISC_FIXES
+            ctx->pending_frame_magnitude = 0;
+#endif
+            ctx->output_cx_pkt_cb.output_cx_pkt(
+                &pkt, ctx->output_cx_pkt_cb.user_priv);
+          }
+          continue;
+        }
+
+        // Add the frame packet to the list of returned packets.
+        pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+        pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp);
+        pkt.data.frame.duration =
+           (unsigned long)ticks_to_timebase_units(timebase,
+               dst_end_time_stamp - dst_time_stamp);
+        pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+
+        if (ctx->pending_cx_data) {
+          ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+#if !CONFIG_MISC_FIXES
+          ctx->pending_frame_magnitude |= size;
+#endif
+          ctx->pending_cx_data_sz += size;
+          // Write the superframe index only when no output callback is set.
+          if (!ctx->output_cx_pkt_cb.output_cx_pkt)
+            size += write_superframe_index(ctx);
+          pkt.data.frame.buf = ctx->pending_cx_data;
+          pkt.data.frame.sz  = ctx->pending_cx_data_sz;
+          ctx->pending_cx_data = NULL;
+          ctx->pending_cx_data_sz = 0;
+          ctx->pending_frame_count = 0;
+#if !CONFIG_MISC_FIXES
+          ctx->pending_frame_magnitude = 0;
+#endif
+        } else {
+          pkt.data.frame.buf = cx_data;
+          pkt.data.frame.sz  = size;
+        }
+        pkt.data.frame.partition_id = -1;
+
+        if(ctx->output_cx_pkt_cb.output_cx_pkt)
+          ctx->output_cx_pkt_cb.output_cx_pkt(&pkt,
+                                              ctx->output_cx_pkt_cb.user_priv);
+        else
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
+        cx_data += size;
+        cx_data_sz -= size;
+      }
+    }
+  }
+
+  return res;
+}
+
+static const vpx_codec_cx_pkt_t *encoder_get_cxdata(vpx_codec_alg_priv_t *ctx,
+                                                    vpx_codec_iter_t *iter) {
+  return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);  // walks the list encoder_encode() fills
+}
+
+static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *);
+  YV12_BUFFER_CONFIG sd;
+
+  if (frame == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  // Describe the caller's image as a YV12 buffer and pass it to the encoder
+  // for the reference selected by frame->frame_type.
+  image2yuvconfig(&frame->img, &sd);
+  vp10_set_reference_enc(ctx->cpi, ref_frame_to_vp10_reframe(frame->frame_type),
+                        &sd);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *);
+  YV12_BUFFER_CONFIG sd;
+
+  if (frame == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  // Describe the caller's image as a YV12 buffer and pass it to the encoder's
+  // copy routine for the reference selected by frame->frame_type.
+  image2yuvconfig(&frame->img, &sd);
+  vp10_copy_reference_enc(ctx->cpi,
+                         ref_frame_to_vp10_reframe(frame->frame_type), &sd);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  vp9_ref_frame_t *const frame = va_arg(args, vp9_ref_frame_t *);
+  YV12_BUFFER_CONFIG *fb;
+
+  if (frame == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+  // Look up the buffer for frame->idx; a NULL result is reported as an error.
+  fb = get_ref_frame(&ctx->cpi->common, frame->idx);
+  if (fb == NULL)
+    return VPX_CODEC_ERROR;
+  yuvconfig2image(&frame->img, fb, NULL);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+#if CONFIG_VP9_POSTPROC
+  vp8_postproc_cfg_t *config = va_arg(args, vp8_postproc_cfg_t *);
+  if (config == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+  // Keep a copy of the settings in the codec context.
+  ctx->preview_ppcfg = *config;
+  return VPX_CODEC_OK;
+#else
+  // Without CONFIG_VP9_POSTPROC this control only reports INCAPABLE.
+  (void)ctx;
+  (void)args;
+  return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+
+static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) {
+  YV12_BUFFER_CONFIG sd;
+  vp10_ppflags_t flags;
+  vp10_zero(flags);
+
+  // Stored post-processing settings are forwarded only when a flag is set;
+  // otherwise the zeroed structure above is passed as is.
+  if (ctx->preview_ppcfg.post_proc_flag) {
+    flags.post_proc_flag   = ctx->preview_ppcfg.post_proc_flag;
+    flags.deblocking_level = ctx->preview_ppcfg.deblocking_level;
+    flags.noise_level      = ctx->preview_ppcfg.noise_level;
+  }
+
+  if (vp10_get_preview_raw_frame(ctx->cpi, &sd, &flags) != 0)
+    return NULL;
+  yuvconfig2image(&ctx->preview_img, &sd, NULL);
+  return &ctx->preview_img;
+}
+
+static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  (void)ctx;
+  (void)args;
+  // Stub: both arguments are ignored and every request is rejected.
+  // TODO(yaowu): Need to re-implement and test for VP9.
+  return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *);
+
+  if (map == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  // A non-zero result from the encoder is reported to the caller as an
+  // invalid parameter; zero means the map was taken.
+  if (vp10_set_active_map(ctx->cpi, map->active_map,
+                          (int)map->rows, (int)map->cols))
+    return VPX_CODEC_INVALID_PARAM;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *);
+
+  if (map == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  // A non-zero result from the encoder is reported to the caller as an
+  // invalid parameter; zero means the query succeeded.
+  if (vp10_get_active_map(ctx->cpi, map->active_map,
+                          (int)map->rows, (int)map->cols))
+    return VPX_CODEC_INVALID_PARAM;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
+  int rc;
+
+  if (mode == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  // Zero from the encoder means the requested scaling pair was accepted.
+  rc = vp10_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode,
+                              (VPX_SCALING)mode->v_scaling_mode);
+  return rc ? VPX_CODEC_INVALID_PARAM : VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_register_cx_callback(vpx_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  vpx_codec_priv_output_cx_pkt_cb_pair_t *cbp =
+      (vpx_codec_priv_output_cx_pkt_cb_pair_t *)va_arg(args, void *);
+  if (cbp == NULL) return VPX_CODEC_INVALID_PARAM;  // reject NULL, don't crash
+  ctx->output_cx_pkt_cb.output_cx_pkt = cbp->output_cx_pkt;  // packet hook
+  ctx->output_cx_pkt_cb.user_priv = cbp->user_priv;  // handed back to the hook
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct vp10_extracfg pending = ctx->extra_cfg;  // edit a copy of the settings
+  pending.content = CAST(VP9E_SET_TUNE_CONTENT, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+static vpx_codec_err_t ctrl_set_color_space(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct vp10_extracfg pending = ctx->extra_cfg;  // edit a copy of the settings
+  pending.color_space = CAST(VP9E_SET_COLOR_SPACE, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+static vpx_codec_err_t ctrl_set_color_range(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct vp10_extracfg pending = ctx->extra_cfg;  // edit a copy of the settings
+  pending.color_range = CAST(VP9E_SET_COLOR_RANGE, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+static vpx_codec_err_t ctrl_set_render_size(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct vp10_extracfg pending = ctx->extra_cfg;
+  const int *const dims = va_arg(args, int *);  // [0] = width, [1] = height
+  pending.render_width  = dims[0];  // NOTE(review): dims is not NULL-checked
+  pending.render_height = dims[1];
+  return update_extra_cfg(ctx, &pending);
+}
+
+static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {  // control id -> handler
+  {VP8_COPY_REFERENCE,                ctrl_copy_reference},
+
+  // Setters
+  {VP8_SET_REFERENCE,                 ctrl_set_reference},
+  {VP8_SET_POSTPROC,                  ctrl_set_previewpp},
+  {VP8E_SET_ROI_MAP,                  ctrl_set_roi_map},
+  {VP8E_SET_ACTIVEMAP,                ctrl_set_active_map},
+  {VP8E_SET_SCALEMODE,                ctrl_set_scale_mode},
+  {VP8E_SET_CPUUSED,                  ctrl_set_cpuused},
+  {VP8E_SET_ENABLEAUTOALTREF,         ctrl_set_enable_auto_alt_ref},
+  {VP8E_SET_SHARPNESS,                ctrl_set_sharpness},
+  {VP8E_SET_STATIC_THRESHOLD,         ctrl_set_static_thresh},
+  {VP9E_SET_TILE_COLUMNS,             ctrl_set_tile_columns},
+  {VP9E_SET_TILE_ROWS,                ctrl_set_tile_rows},
+  {VP8E_SET_ARNR_MAXFRAMES,           ctrl_set_arnr_max_frames},
+  {VP8E_SET_ARNR_STRENGTH,            ctrl_set_arnr_strength},
+  {VP8E_SET_ARNR_TYPE,                ctrl_set_arnr_type},
+  {VP8E_SET_TUNING,                   ctrl_set_tuning},
+  {VP8E_SET_CQ_LEVEL,                 ctrl_set_cq_level},
+  {VP8E_SET_MAX_INTRA_BITRATE_PCT,    ctrl_set_rc_max_intra_bitrate_pct},
+  {VP9E_SET_MAX_INTER_BITRATE_PCT,    ctrl_set_rc_max_inter_bitrate_pct},
+  {VP9E_SET_GF_CBR_BOOST_PCT,         ctrl_set_rc_gf_cbr_boost_pct},
+  {VP9E_SET_LOSSLESS,                 ctrl_set_lossless},
+  {VP9E_SET_FRAME_PARALLEL_DECODING,  ctrl_set_frame_parallel_decoding_mode},
+  {VP9E_SET_AQ_MODE,                  ctrl_set_aq_mode},
+  {VP9E_SET_FRAME_PERIODIC_BOOST,     ctrl_set_frame_periodic_boost},
+  {VP9E_REGISTER_CX_CALLBACK,         ctrl_register_cx_callback},
+  {VP9E_SET_TUNE_CONTENT,             ctrl_set_tune_content},
+  {VP9E_SET_COLOR_SPACE,              ctrl_set_color_space},
+  {VP9E_SET_COLOR_RANGE,              ctrl_set_color_range},
+  {VP9E_SET_NOISE_SENSITIVITY,        ctrl_set_noise_sensitivity},
+  {VP9E_SET_MIN_GF_INTERVAL,          ctrl_set_min_gf_interval},
+  {VP9E_SET_MAX_GF_INTERVAL,          ctrl_set_max_gf_interval},
+  {VP9E_SET_RENDER_SIZE,              ctrl_set_render_size},
+
+  // Getters
+  {VP8E_GET_LAST_QUANTIZER,           ctrl_get_quantizer},
+  {VP8E_GET_LAST_QUANTIZER_64,        ctrl_get_quantizer64},
+  {VP9_GET_REFERENCE,                 ctrl_get_reference},
+  {VP9E_GET_ACTIVEMAP,                ctrl_get_active_map},
+
+  { -1, NULL},  // last entry; presumably the end-of-table marker -- TODO confirm
+};
+
+static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
+  {
+    0,  // presumably the usage id this default configuration is for -- TODO confirm
+    {  // NOLINT
+      0,                  // g_usage
+      8,                  // g_threads
+      0,                  // g_profile
+
+      320,                // g_w
+      240,                // g_h
+      VPX_BITS_8,         // g_bit_depth
+      8,                  // g_input_bit_depth
+
+      {1, 30},            // g_timebase
+
+      0,                  // g_error_resilient
+
+      VPX_RC_ONE_PASS,    // g_pass
+
+      25,                 // g_lag_in_frames
+
+      0,                  // rc_dropframe_thresh
+      0,                  // rc_resize_allowed
+      0,                  // rc_scaled_width
+      0,                  // rc_scaled_height
+      60,                 // rc_resize_down_threshold (NOTE(review): confirm up/down order against the struct)
+      30,                 // rc_resize_up_threshold
+
+      VPX_VBR,            // rc_end_usage
+      {NULL, 0},          // rc_twopass_stats_in
+      {NULL, 0},          // rc_firstpass_mb_stats_in
+      256,                // rc_target_bandwidth
+      0,                  // rc_min_quantizer
+      63,                 // rc_max_quantizer
+      25,                 // rc_undershoot_pct
+      25,                 // rc_overshoot_pct
+
+      6000,               // rc_max_buffer_size
+      4000,               // rc_buffer_initial_size
+      5000,               // rc_buffer_optimal_size
+
+      50,                 // rc_two_pass_vbrbias
+      0,                  // rc_two_pass_vbrmin_section
+      2000,               // rc_two_pass_vbrmax_section
+
+      // keyframing settings (kf)
+      VPX_KF_AUTO,        // kf_mode
+      0,                  // kf_min_dist
+      9999,               // kf_max_dist
+
+      // TODO(yunqingwang): Spatial/temporal scalability are not supported
+      // in VP10. The following 10 parameters are not used, which should
+      // be removed later.
+      1,                      // ss_number_layers
+      {0},                    // presumably ss_enable_auto_alt_ref -- TODO confirm
+      {0},                    // ss_target_bitrate
+      1,                      // ts_number_layers
+      {0},                    // ts_target_bitrate
+      {0},                    // ts_rate_decimator
+      0,                      // ts_periodicity
+      {0},                    // ts_layer_id
+      {0},                  // layer_target_bitrate
+      0                     // temporal_layering_mode
+    }
+  },
+};
+
+#ifndef VERSION_STRING  // fall back to an empty suffix for the name below
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp10_cx) = {
+  "WebM Project VP10 Encoder" VERSION_STRING,
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+#if CONFIG_VP9_HIGHBITDEPTH
+  VPX_CODEC_CAP_HIGHBITDEPTH |
+#endif
+  VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,  // vpx_codec_caps_t
+  encoder_init,       // vpx_codec_init_fn_t
+  encoder_destroy,    // vpx_codec_destroy_fn_t
+  encoder_ctrl_maps,  // vpx_codec_ctrl_fn_map_t
+  {  // NOLINT
+    NULL,  // vpx_codec_peek_si_fn_t (decoder entry points are all NULL here)
+    NULL,  // vpx_codec_get_si_fn_t
+    NULL,  // vpx_codec_decode_fn_t
+    NULL,  // vpx_codec_frame_get_fn_t
+    NULL   // vpx_codec_set_fb_fn_t
+  },
+  {  // NOLINT
+    1,                      // 1 cfg map
+    encoder_usage_cfg_map,  // vpx_codec_enc_cfg_map_t
+    encoder_encode,         // vpx_codec_encode_fn_t
+    encoder_get_cxdata,     // vpx_codec_get_cx_data_fn_t
+    encoder_set_config,     // vpx_codec_enc_config_set_fn_t
+    NULL,        // vpx_codec_get_global_headers_fn_t (not provided)
+    encoder_get_preview,    // vpx_codec_get_preview_frame_fn_t
+    NULL         // vpx_codec_enc_mr_get_mem_loc_fn_t (not provided)
+  }
+};
diff --git a/libs/libvpx/vp10/vp10_dx_iface.c b/libs/libvpx/vp10/vp10_dx_iface.c
new file mode 100644
index 0000000000..33337a4bd2
--- /dev/null
+++ b/libs/libvpx/vp10/vp10_dx_iface.c
@@ -0,0 +1,1132 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_version.h"
+
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_decoder.h"
+#include "vpx_dsp/bitreader_buffer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/vpx_thread.h"
+
+#include "vp10/common/alloccommon.h"
+#include "vp10/common/frame_buffers.h"
+
+#include "vp10/decoder/decoder.h"
+#include "vp10/decoder/decodeframe.h"
+
+#include "vp10/vp10_iface_common.h"
+
+#define VP9_CAP_POSTPROC (CONFIG_VP9_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
+// The VP10 stream info is the generic codec stream info under another name.
+typedef vpx_codec_stream_info_t vp10_stream_info_t;
+
+// This limit is due to framebuffer numbers.
+// TODO(hkuang): Remove this limit after implementing ondemand framebuffers.
+#define FRAME_CACHE_SIZE 6   // Cache maximum 6 decoded frames.
+
+typedef struct cache_frame {
+  int fb_idx;       // presumably the frame buffer index behind |img| -- TODO confirm
+  vpx_image_t img;  // image descriptor stored with the cache entry
+} cache_frame;
+
+struct vpx_codec_alg_priv {
+  vpx_codec_priv_t        base;
+  vpx_codec_dec_cfg_t     cfg;  // private copy of the caller's config (decoder_init)
+  vp10_stream_info_t       si;  // si.sz is set in decoder_init()
+  int                     postproc_cfg_set;
+  vp8_postproc_cfg_t      postproc_cfg;
+  vpx_decrypt_cb          decrypt_cb;
+  void                    *decrypt_state;
+  vpx_image_t             img;
+  int                     img_avail;
+  int                     flushed;  // cleared in decoder_init()
+  int                     invert_tile_order;
+  int                     last_show_frame;  // Index of last output frame.
+  int                     byte_alignment;  // copied to each worker in init_buffer_callbacks()
+  int                     skip_loop_filter;  // copied to each worker in init_buffer_callbacks()
+
+  // Frame parallel related.
+  int                     frame_parallel_decode;  // frame-based threading.
+  VPxWorker               *frame_workers;
+  int                     num_frame_workers;  // capped at MAX_DECODE_THREADS in init_decoder()
+  int                     next_submit_worker_id;
+  int                     last_submit_worker_id;
+  int                     next_output_worker_id;
+  int                     available_threads;
+  cache_frame             frame_cache[FRAME_CACHE_SIZE];
+  int                     frame_cache_write;
+  int                     frame_cache_read;
+  int                     num_cache_frames;
+  int                     need_resync;      // wait for key/intra-only frame
+  // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
+  BufferPool              *buffer_pool;
+
+  // External frame buffer info to save for VP9 common.
+  void *ext_priv;  // Private data associated with the external frame buffers.
+  vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+};
+
+static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
+                                    vpx_codec_priv_enc_mr_cfg_t *data) {
+  // Only the vpx_codec_alg_priv_t structure itself is allocated here; more
+  // memory may be required once the stream information becomes known.
+  vpx_codec_alg_priv_t *priv;
+  (void)data;
+
+  if (ctx->priv)  // private state already exists: nothing to do
+    return VPX_CODEC_OK;
+
+  priv = (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv));
+  if (priv == NULL)
+    return VPX_CODEC_MEM_ERROR;
+
+  ctx->priv = (vpx_codec_priv_t *)priv;
+  ctx->priv->init_flags = ctx->init_flags;
+  priv->si.sz = sizeof(priv->si);
+  priv->flushed = 0;
+  // Only do frame parallel decode when threads > 1 and it was requested.
+  priv->frame_parallel_decode =
+      (ctx->config.dec && (ctx->config.dec->threads > 1) &&
+       (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING)) ? 1 : 0;
+  // Keep a private copy of the decoder config and point the context at it.
+  if (ctx->config.dec) {
+    priv->cfg = *ctx->config.dec;
+    ctx->config.dec = &priv->cfg;
+  }
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
+  if (ctx->frame_workers != NULL) {
+    int i;
+    for (i = 0; i < ctx->num_frame_workers; ++i) {
+      VPxWorker *const worker = &ctx->frame_workers[i];
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      vpx_get_worker_interface()->end(worker);  // end the worker before freeing its data
+      vp10_remove_common(&frame_worker_data->pbi->common);
+#if CONFIG_VP9_POSTPROC
+      vp10_free_postproc_buffers(&frame_worker_data->pbi->common);
+#endif
+      vp10_decoder_remove(frame_worker_data->pbi);
+      vpx_free(frame_worker_data->scratch_buffer);
+#if CONFIG_MULTITHREAD
+      pthread_mutex_destroy(&frame_worker_data->stats_mutex);
+      pthread_cond_destroy(&frame_worker_data->stats_cond);
+#endif
+      vpx_free(frame_worker_data);
+    }
+#if CONFIG_MULTITHREAD
+    pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+  }
+  // Release reference buffers and the internal buffer list if a pool exists.
+  if (ctx->buffer_pool) {
+    vp10_free_ref_frame_buffers(ctx->buffer_pool);
+    vp10_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
+  }
+  // Finally free the worker array, the pool and the context itself.
+  vpx_free(ctx->frame_workers);
+  vpx_free(ctx->buffer_pool);
+  vpx_free(ctx);
+  return VPX_CODEC_OK;
+}
+
+static int parse_bitdepth_colorspace_sampling(
+    BITSTREAM_PROFILE profile, struct vpx_read_bit_buffer *rb) {
+  // Profiles 1 and 3 carry extra bits after the color space field.
+  const int has_extra_bits = (profile == PROFILE_1 || profile == PROFILE_3);
+  vpx_color_space_t color_space;
+
+  if (profile >= PROFILE_2)
+    rb->bit_offset += 1;  // Bit-depth 10 or 12.
+  color_space = (vpx_color_space_t)vpx_rb_read_literal(rb, 3);
+  if (color_space == VPX_CS_SRGB) {
+    // RGB is only available in version 1.
+    if (!has_extra_bits)
+      return 0;
+    rb->bit_offset += 1;  // unused
+    return 1;
+  }
+
+  rb->bit_offset += 1;  // [16,235] (including xvycc) vs [0,255] range.
+  if (has_extra_bits)
+    rb->bit_offset += 3;  // subsampling x/y (2 bits) + unused (1 bit).
+  return 1;
+}
+
+static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
+                                                unsigned int data_sz,
+                                                vpx_codec_stream_info_t *si,
+                                                int *is_intra_only,
+                                                vpx_decrypt_cb decrypt_cb,
+                                                void *decrypt_state) {
+  int intra_only_flag = 0;
+  uint8_t clear_buffer[9];
+  // Parses the start of the frame header to fill si->is_kf, si->w and si->h.
+  if (data + data_sz <= data)  // empty input or pointer wrap-around
+    return VPX_CODEC_INVALID_PARAM;
+
+  si->is_kf = 0;
+  si->w = si->h = 0;
+  // With a decrypt callback, parse a decrypted copy of the leading bytes.
+  if (decrypt_cb) {
+    data_sz = VPXMIN(sizeof(clear_buffer), data_sz);
+    decrypt_cb(decrypt_state, data, clear_buffer, data_sz);
+    data = clear_buffer;
+  }
+
+  {
+    int show_frame;
+    int error_resilient;
+    struct vpx_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+    const int frame_marker = vpx_rb_read_literal(&rb, 2);
+    const BITSTREAM_PROFILE profile = vp10_read_profile(&rb);
+
+    if (frame_marker != VP9_FRAME_MARKER)
+      return VPX_CODEC_UNSUP_BITSTREAM;
+
+    if (profile >= MAX_PROFILES)
+      return VPX_CODEC_UNSUP_BITSTREAM;
+
+    if ((profile >= 2 && data_sz <= 1) || data_sz < 1)
+      return VPX_CODEC_UNSUP_BITSTREAM;
+
+    if (vpx_rb_read_bit(&rb)) {  // show an existing frame
+      vpx_rb_read_literal(&rb, 3);  // Frame buffer to show.
+      return VPX_CODEC_OK;  // si keeps is_kf = 0 and w = h = 0 on this path
+    }
+
+    if (data_sz <= 8)  // anything past this point needs more than 8 bytes
+      return VPX_CODEC_UNSUP_BITSTREAM;
+
+    si->is_kf = !vpx_rb_read_bit(&rb);
+    show_frame = vpx_rb_read_bit(&rb);
+    error_resilient = vpx_rb_read_bit(&rb);
+
+    if (si->is_kf) {
+      if (!vp10_read_sync_code(&rb))
+        return VPX_CODEC_UNSUP_BITSTREAM;
+
+      if (!parse_bitdepth_colorspace_sampling(profile, &rb))
+        return VPX_CODEC_UNSUP_BITSTREAM;
+      vp10_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
+    } else {
+      intra_only_flag = show_frame ? 0 : vpx_rb_read_bit(&rb);
+
+      rb.bit_offset += error_resilient ? 0 : 2;  // reset_frame_context
+
+      if (intra_only_flag) {
+        if (!vp10_read_sync_code(&rb))
+          return VPX_CODEC_UNSUP_BITSTREAM;
+        if (profile > PROFILE_0) {
+          if (!parse_bitdepth_colorspace_sampling(profile, &rb))
+            return VPX_CODEC_UNSUP_BITSTREAM;
+        }
+        rb.bit_offset += REF_FRAMES;  // refresh_frame_flags
+        vp10_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
+      }
+    }
+  }
+  if (is_intra_only != NULL)  // optional output
+    *is_intra_only = intra_only_flag;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t decoder_peek_si(const uint8_t *data,
+                                       unsigned int data_sz,
+                                       vpx_codec_stream_info_t *si) {
+  return decoder_peek_si_internal(data, data_sz, si, NULL, NULL, NULL);  // no intra-only output, no decryption
+}
+
+static vpx_codec_err_t decoder_get_si(vpx_codec_alg_priv_t *ctx,
+                                      vpx_codec_stream_info_t *si) {
+  // Copy the VP10 layout if the caller's struct has room, else the generic one.
+  size_t sz = sizeof(vpx_codec_stream_info_t);
+  if (si->sz >= sizeof(vp10_stream_info_t))
+    sz = sizeof(vp10_stream_info_t);
+  memcpy(si, &ctx->si, sz);
+  si->sz = (unsigned int)sz;
+  return VPX_CODEC_OK;
+}
+
+static void set_error_detail(vpx_codec_alg_priv_t *ctx,
+                             const char *const error) {
+  ctx->base.err_detail = error;  // stores the pointer only; the text is not copied
+}
+
+static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
+                           const struct vpx_internal_error_info *error) {
+  const vpx_codec_err_t code = error->error_code;
+  if (code)  // on failure publish the recorded detail text, or NULL if none
+    set_error_detail(ctx, error->has_detail ? error->detail : NULL);
+  return code;
+}
+
+static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
+  // External callbacks are used only when both of them were supplied.
+  const int use_ext_fb =
+      ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL;
+  int i;
+
+  for (i = 0; i < ctx->num_frame_workers; ++i) {
+    FrameWorkerData *const frame_worker_data =
+        (FrameWorkerData *)ctx->frame_workers[i].data1;
+    VP10_COMMON *const cm = &frame_worker_data->pbi->common;
+    BufferPool *const pool = cm->buffer_pool;
+    cm->new_fb_idx = INVALID_IDX;
+    cm->byte_alignment = ctx->byte_alignment;
+    cm->skip_loop_filter = ctx->skip_loop_filter;
+    if (use_ext_fb) {
+      pool->get_fb_cb = ctx->get_ext_fb_cb;
+      pool->release_fb_cb = ctx->release_ext_fb_cb;
+      pool->cb_priv = ctx->ext_priv;
+      continue;
+    }
+    // Otherwise fall back to the internal frame buffer list.
+    pool->get_fb_cb = vp10_get_frame_buffer;
+    pool->release_fb_cb = vp10_release_frame_buffer;
+    if (vp10_alloc_internal_frame_buffers(&pool->int_frame_buffers))
+      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                         "Failed to initialize internal frame buffers");
+    pool->cb_priv = &pool->int_frame_buffers;
+  }
+}
+
+static void set_default_ppflags(vp8_postproc_cfg_t *cfg) {
+  cfg->post_proc_flag = VP8_DEBLOCK | VP8_DEMACROBLOCK;  // default flag set
+  cfg->deblocking_level = 4;
+  cfg->noise_level = 0;
+}
+
+static void set_ppflags(const vpx_codec_alg_priv_t *ctx,
+                        vp10_ppflags_t *flags) {
+  // Mirror the stored post-processing configuration into |flags|.
+  const vp8_postproc_cfg_t *const cfg = &ctx->postproc_cfg;
+  flags->post_proc_flag = cfg->post_proc_flag;
+  flags->deblocking_level = cfg->deblocking_level;
+  flags->noise_level = cfg->noise_level;
+}
+
+static int frame_worker_hook(void *arg1, void *arg2) {
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
+  const uint8_t *data = frame_worker_data->data;
+  (void)arg2;
+  // Decodes the worker's compressed data; returns 1 on success, 0 on failure.
+  frame_worker_data->result =
+      vp10_receive_compressed_data(frame_worker_data->pbi,
+                                  frame_worker_data->data_size,
+                                  &data);
+  frame_worker_data->data_end = data;  // |data| was passed by address above
+
+  if (frame_worker_data->pbi->common.frame_parallel_decode) {
+    // In frame parallel decoding, a worker thread must successfully decode all
+    // the compressed data.
+    if (frame_worker_data->result != 0 ||
+        frame_worker_data->data + frame_worker_data->data_size - 1 > data) {
+      VPxWorker *const worker = frame_worker_data->pbi->frame_worker_owner;
+      BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool;
+      // Signal all the other threads that are waiting for this frame.
+      vp10_frameworker_lock_stats(worker);
+      frame_worker_data->frame_context_ready = 1;
+      lock_buffer_pool(pool);
+      frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
+      unlock_buffer_pool(pool);
+      frame_worker_data->pbi->need_resync = 1;
+      vp10_frameworker_signal_stats(worker);
+      vp10_frameworker_unlock_stats(worker);
+      return 0;
+    }
+  } else if (frame_worker_data->result != 0) {
+    // Check decode result in serial decode.
+    frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
+    frame_worker_data->pbi->need_resync = 1;
+  }
+  return !frame_worker_data->result;
+}
+
+static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
+  int i;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+
+  ctx->last_show_frame = -1;  // users of this field test for >= 0 before use
+  ctx->next_submit_worker_id = 0;
+  ctx->last_submit_worker_id = 0;
+  ctx->next_output_worker_id = 0;
+  ctx->frame_cache_read = 0;
+  ctx->frame_cache_write = 0;
+  ctx->num_cache_frames = 0;
+  ctx->need_resync = 1;  // cleared later by check_resync()
+  ctx->num_frame_workers =
+      (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads: 1;  // serial: one
+  if (ctx->num_frame_workers > MAX_DECODE_THREADS)
+    ctx->num_frame_workers = MAX_DECODE_THREADS;  // upper clamp only
+  ctx->available_threads = ctx->num_frame_workers;
+  ctx->flushed = 0;
+
+  ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
+  if (ctx->buffer_pool == NULL)
+    return VPX_CODEC_MEM_ERROR;  // the only failure here with no detail text
+
+#if CONFIG_MULTITHREAD
+    if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
+      set_error_detail(ctx, "Failed to allocate buffer pool mutex");
+      return VPX_CODEC_MEM_ERROR;  // NOTE(review): no cleanup on error returns here
+    }
+#endif
+
+  ctx->frame_workers = (VPxWorker *)
+      vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
+  if (ctx->frame_workers == NULL) {
+    set_error_detail(ctx, "Failed to allocate frame_workers");
+    return VPX_CODEC_MEM_ERROR;
+  }
+
+  for (i = 0; i < ctx->num_frame_workers; ++i) {  // one decoder per worker
+    VPxWorker *const worker = &ctx->frame_workers[i];
+    FrameWorkerData *frame_worker_data = NULL;
+    winterface->init(worker);
+    worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
+    if (worker->data1 == NULL) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data");
+      return VPX_CODEC_MEM_ERROR;
+    }
+    frame_worker_data = (FrameWorkerData *)worker->data1;
+    frame_worker_data->pbi = vp10_decoder_create(ctx->buffer_pool);
+    if (frame_worker_data->pbi == NULL) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data");  // NOTE(review): same text as above, different failing call
+      return VPX_CODEC_MEM_ERROR;
+    }
+    frame_worker_data->pbi->frame_worker_owner = worker;
+    frame_worker_data->worker_id = i;
+    frame_worker_data->scratch_buffer = NULL;  // grown on demand in decode_one()
+    frame_worker_data->scratch_buffer_size = 0;
+    frame_worker_data->frame_context_ready = 0;
+    frame_worker_data->received_frame = 0;
+#if CONFIG_MULTITHREAD
+    if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
+      return VPX_CODEC_MEM_ERROR;
+    }
+
+    if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
+      return VPX_CODEC_MEM_ERROR;
+    }
+#endif
+    // If decoding in serial mode, FrameWorker thread could create tile worker
+    // thread or loopfilter thread, so only then is cfg.threads passed down.
+    frame_worker_data->pbi->max_threads =
+        (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
+
+    frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+    frame_worker_data->pbi->common.frame_parallel_decode =
+        ctx->frame_parallel_decode;
+    worker->hook = (VPxWorkerHook)frame_worker_hook;
+    if (!winterface->reset(worker)) {  // zero return is treated as failure
+      set_error_detail(ctx, "Frame Worker thread creation failed");
+      return VPX_CODEC_MEM_ERROR;
+    }
+  }
+
+  // If postprocessing was enabled by the application and a
+  // configuration has not been provided, default it.
+  if (!ctx->postproc_cfg_set &&
+      (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
+    set_default_ppflags(&ctx->postproc_cfg);
+
+  init_buffer_callbacks(ctx);  // needs the workers created above
+
+  return VPX_CODEC_OK;
+}
+
+static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
+                                const VP10Decoder *const pbi) {
+  // Only relevant while the context waits and the worker itself is in sync.
+  if (ctx->need_resync != 1 || pbi->need_resync != 0) return;
+  // A key frame or intra only frame is what clears the resync flag.
+  if (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME)
+    ctx->need_resync = 0;
+}
+
+static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
+                                  const uint8_t **data, unsigned int data_sz,
+                                  void *user_priv, int64_t deadline) {  // one frame
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  (void)deadline;  // unused
+
+  // Determine the stream parameters. Note that we rely on peek_si to
+  // validate that we have a buffer that does not wrap around the top
+  // of the heap.
+  if (!ctx->si.h) {
+    int is_intra_only = 0;
+    const vpx_codec_err_t res =
+        decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only,
+                                 ctx->decrypt_cb, ctx->decrypt_state);
+    if (res != VPX_CODEC_OK)
+      return res;
+
+    if (!ctx->si.is_kf && !is_intra_only)  // first frame must be key or intra-only
+      return VPX_CODEC_ERROR;
+  }
+
+  if (!ctx->frame_parallel_decode) {
+    VPxWorker *const worker = ctx->frame_workers;  // serial mode uses worker 0
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    frame_worker_data->data = *data;
+    frame_worker_data->data_size = data_sz;
+    frame_worker_data->user_priv = user_priv;
+    frame_worker_data->received_frame = 1;
+
+    // Set these even if already initialized.  The caller may have changed the
+    // decrypt config between frames.
+    frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
+    frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
+
+    worker->had_error = 0;
+    winterface->execute(worker);
+
+    // Update data pointer after decode.
+    *data = frame_worker_data->data_end;
+
+    if (worker->had_error)
+      return update_error_state(ctx, &frame_worker_data->pbi->common.error);
+
+    check_resync(ctx, frame_worker_data->pbi);
+  } else {
+    VPxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    // Copy context from last worker thread to next worker thread.
+    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+      vp10_frameworker_copy_context(
+          &ctx->frame_workers[ctx->next_submit_worker_id],
+          &ctx->frame_workers[ctx->last_submit_worker_id]);
+
+    frame_worker_data->pbi->ready_for_new_data = 0;
+    // Copy the compressed data into worker's internal buffer.
+    // TODO(hkuang): Will all the workers allocate the same size
+    // as the size of the first intra frame be better? This will
+    // avoid too many deallocate and allocate.
+    if (frame_worker_data->scratch_buffer_size < data_sz) {
+      uint8_t *const grown =  // temporary: pointer and size must change together
+          (uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz);
+      if (grown == NULL) {  // assumes realloc-like: old block is left untouched
+        set_error_detail(ctx, "Failed to reallocate scratch buffer");
+        return VPX_CODEC_MEM_ERROR;
+      }
+      frame_worker_data->scratch_buffer = grown;
+      frame_worker_data->scratch_buffer_size = data_sz;
+    }
+    frame_worker_data->data_size = data_sz;
+    memcpy(frame_worker_data->scratch_buffer, *data, data_sz);
+    frame_worker_data->frame_decoded = 0;
+    frame_worker_data->frame_context_ready = 0;
+    frame_worker_data->received_frame = 1;
+    frame_worker_data->data = frame_worker_data->scratch_buffer;
+    frame_worker_data->user_priv = user_priv;
+
+    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+      ctx->last_submit_worker_id =
+          (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers;
+
+    ctx->next_submit_worker_id =
+        (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers;
+    --ctx->available_threads;  // given back when the worker is synced
+    worker->had_error = 0;
+    winterface->launch(worker);  // *data is not advanced on this path
+  }
+
+  return VPX_CODEC_OK;
+}
+
+static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
+  YV12_BUFFER_CONFIG sd;
+  vp10_ppflags_t flags = {0, 0, 0};  // all-zero; set_ppflags() is not called here
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  ctx->next_output_worker_id =
+      (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+  // TODO(hkuang): Add worker error handling here (sync's result is ignored).
+  winterface->sync(worker);
+  frame_worker_data->received_frame = 0;
+  ++ctx->available_threads;  // the synced worker can take a new frame again
+
+  check_resync(ctx, frame_worker_data->pbi);
+
+  if (vp10_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {  // got one
+    VP10_COMMON *const cm = &frame_worker_data->pbi->common;
+    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+    ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx;
+    yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd,
+                    frame_worker_data->user_priv);
+    ctx->frame_cache[ctx->frame_cache_write].img.fb_priv =
+        frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+    ctx->frame_cache_write =
+        (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;  // wraps around
+    ++ctx->num_cache_frames;  // callers check < FRAME_CACHE_SIZE before calling
+  }
+}
+
+static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
+                                      const uint8_t *data, unsigned int data_sz,
+                                      void *user_priv, long deadline) {
+  const uint8_t *data_start = data;  // start of the next frame to decode
+  const uint8_t * const data_end = data + data_sz;
+  vpx_codec_err_t res;  // shadowed by the inner declarations of res below
+  uint32_t frame_sizes[8];
+  int frame_count;  // filled in by vp10_parse_superframe_index()
+
+  if (data == NULL && data_sz == 0) {  // a NULL/0 call only marks the flush
+    ctx->flushed = 1;
+    return VPX_CODEC_OK;
+  }
+
+  // Reset flushed when receiving a valid frame.
+  ctx->flushed = 0;
+
+  // Initialize the decoder workers on the first frame.
+  if (ctx->frame_workers == NULL) {
+    const vpx_codec_err_t res = init_decoder(ctx);
+    if (res != VPX_CODEC_OK)
+      return res;
+  }
+
+  res = vp10_parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
+                                   ctx->decrypt_cb, ctx->decrypt_state);
+  if (res != VPX_CODEC_OK)
+    return res;
+
+  if (ctx->frame_parallel_decode) {
+    // Decode in frame parallel mode. When decoding in this mode, the frame
+    // passed to the decoder must be either a normal frame or a superframe with
+    // superframe index so the decoder could get each frame's start position
+    // in the superframe.
+    if (frame_count > 0) {
+      int i;
+
+      for (i = 0; i < frame_count; ++i) {
+        const uint8_t *data_start_copy = data_start;
+        const uint32_t frame_size = frame_sizes[i];
+        if (data_start < data
+            || frame_size > (uint32_t) (data_end - data_start)) {  // past the end
+          set_error_detail(ctx, "Invalid frame size in index");
+          return VPX_CODEC_CORRUPT_FRAME;
+        }
+
+        if (ctx->available_threads == 0) {
+          // No more threads for decoding. Wait until the next output worker
+          // finishes decoding. Then copy the decoded frame into cache.
+          if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+            wait_worker_and_cache_frame(ctx);
+          } else {
+            // TODO(hkuang): Add unit test to test this path.
+            set_error_detail(ctx, "Frame output cache is full.");
+            return VPX_CODEC_ERROR;
+          }
+        }
+
+        res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
+                         deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+        data_start += frame_size;  // advance by the size taken from the index
+      }
+    } else {
+      if (ctx->available_threads == 0) {
+        // No more threads for decoding. Wait until the next output worker
+        // finishes decoding. Then copy the decoded frame into cache.
+        if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+          wait_worker_and_cache_frame(ctx);
+        } else {
+          // TODO(hkuang): Add unit test to test this path.
+          set_error_detail(ctx, "Frame output cache is full.");
+          return VPX_CODEC_ERROR;
+        }
+      }
+
+      res = decode_one(ctx, &data, data_sz, user_priv, deadline);
+      if (res != VPX_CODEC_OK)
+        return res;
+    }
+  } else {
+    // Decode in serial mode.
+    if (frame_count > 0) {
+      int i;
+
+      for (i = 0; i < frame_count; ++i) {
+        const uint8_t *data_start_copy = data_start;
+        const uint32_t frame_size = frame_sizes[i];
+        vpx_codec_err_t res;  // shadows the function-level res
+        if (data_start < data
+            || frame_size > (uint32_t) (data_end - data_start)) {
+          set_error_detail(ctx, "Invalid frame size in index");
+          return VPX_CODEC_CORRUPT_FRAME;
+        }
+
+        res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
+                         deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+
+        data_start += frame_size;
+      }
+    } else {
+      while (data_start < data_end) {  // no index: decode until data runs out
+        const uint32_t frame_size = (uint32_t) (data_end - data_start);
+        const vpx_codec_err_t res = decode_one(ctx, &data_start, frame_size,
+                                               user_priv, deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+
+        // Account for suboptimal termination by the encoder.
+        while (data_start < data_end) {  // skip bytes whose marker reads as 0
+          const uint8_t marker = read_marker(ctx->decrypt_cb,
+                                             ctx->decrypt_state, data_start);
+          if (marker)
+            break;
+          ++data_start;
+        }
+      }
+    }
+  }
+
+  return res;  // outer res; every failure above has already returned
+}
+
+static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) {
+  RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs;
+  // Decrease reference count of last output frame in frame parallel mode.
+  if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {  // else no-op
+    BufferPool *const pool = ctx->buffer_pool;
+    lock_buffer_pool(pool);  // the count is changed under the pool lock
+    decrease_ref_count(ctx->last_show_frame, frame_bufs, pool);
+    unlock_buffer_pool(pool);
+  }
+}
+
+static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
+                                      vpx_codec_iter_t *iter) {
+  vpx_image_t *img = NULL;
+
+  // Only return a frame when all the workers are busy or the
+  // application flushed the decoder in frame parallel decode.
+  if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
+      !ctx->flushed) {
+    return NULL;
+  }
+
+  // Output the frames in the cache first.
+  if (ctx->num_cache_frames > 0) {
+    release_last_output_frame(ctx);
+    ctx->last_show_frame  = ctx->frame_cache[ctx->frame_cache_read].fb_idx;
+    if (ctx->need_resync)
+      return NULL;  // NOTE(review): cache entry is not consumed on this path
+    img = &ctx->frame_cache[ctx->frame_cache_read].img;
+    ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE;
+    --ctx->num_cache_frames;
+    return img;
+  }
+
+  // iter acts as a flip flop, so an image is only returned on the first
+  // call to get_frame. NOTE(review): *iter is only read here, never written.
+  if (*iter == NULL && ctx->frame_workers != NULL) {
+    do {
+      YV12_BUFFER_CONFIG sd;
+      vp10_ppflags_t flags = {0, 0, 0};
+      const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+      VPxWorker *const worker =
+          &ctx->frame_workers[ctx->next_output_worker_id];
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      ctx->next_output_worker_id =
+          (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+      if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
+        set_ppflags(ctx, &flags);  // otherwise flags stay all-zero
+      // Wait for the frame from worker thread.
+      if (winterface->sync(worker)) {
+        // Check if worker has received any frames.
+        if (frame_worker_data->received_frame == 1) {
+          ++ctx->available_threads;
+          frame_worker_data->received_frame = 0;
+          check_resync(ctx, frame_worker_data->pbi);
+        }
+        if (vp10_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
+          VP10_COMMON *const cm = &frame_worker_data->pbi->common;
+          RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+          release_last_output_frame(ctx);
+          ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
+          if (ctx->need_resync)
+            return NULL;  // frame is withheld while a resync is pending
+          yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
+          ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+          img = &ctx->img;
+          return img;
+        }
+      } else {
+        // Decoding failed. Release the worker thread.
+        frame_worker_data->received_frame = 0;
+        ++ctx->available_threads;
+        ctx->need_resync = 1;
+        if (ctx->flushed != 1)
+          return NULL;  // when flushed, keep going through the other workers
+      }
+    } while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
+  }
+  return NULL;
+}
+
+static vpx_codec_err_t decoder_set_fb_fn(
+    vpx_codec_alg_priv_t *ctx,
+    vpx_get_frame_buffer_cb_fn_t cb_get,
+    vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  if (cb_get == NULL || cb_release == NULL) {  // both callbacks are required
+    return VPX_CODEC_INVALID_PARAM;
+  } else if (ctx->frame_workers == NULL) {
+    // The decoder has not been initialized yet, so the frame buffer functions
+    // can still be installed; once it has been, changes are rejected below.
+    ctx->get_ext_fb_cb = cb_get;
+    ctx->release_ext_fb_cb = cb_release;
+    ctx->ext_priv = cb_priv;
+    return VPX_CODEC_OK;
+  }
+
+  return VPX_CODEC_ERROR;  // workers already exist
+}
+
+static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {  // VP8_SET_REFERENCE
+  vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *);
+  FrameWorkerData *frame_worker_data;
+  YV12_BUFFER_CONFIG sd;  // filled from data->img by image2yuvconfig()
+
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
+  if (data == NULL) return VPX_CODEC_INVALID_PARAM;
+  // Workers are only created on the first decode call; report an error
+  // instead of dereferencing a NULL worker when called before that.
+  if (ctx->frame_workers == NULL) return VPX_CODEC_ERROR;
+
+  frame_worker_data = (FrameWorkerData *)ctx->frame_workers->data1;
+  image2yuvconfig(&data->img, &sd);
+  return vp10_set_reference_dec(&frame_worker_data->pbi->common,
+                               (VP9_REFFRAME)data->frame_type, &sd);
+}
+
+static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {  // VP8_COPY_REFERENCE
+  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+  FrameWorkerData *frame_worker_data;
+  YV12_BUFFER_CONFIG sd;  // filled from data->img by image2yuvconfig()
+
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
+  if (data == NULL) return VPX_CODEC_INVALID_PARAM;
+  // Workers are only created on the first decode call; report an error
+  // instead of dereferencing a NULL worker when called before that.
+  if (ctx->frame_workers == NULL) return VPX_CODEC_ERROR;
+
+  frame_worker_data = (FrameWorkerData *)ctx->frame_workers->data1;
+  image2yuvconfig(&data->img, &sd);
+  return vp10_copy_reference_dec(frame_worker_data->pbi,
+                                (VP9_REFFRAME)data->frame_type, &sd);
+}
+
+static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {  // VP9_GET_REFERENCE
+  vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
+  FrameWorkerData *frame_worker_data;
+  YV12_BUFFER_CONFIG *fb;  // buffer looked up for data->idx
+
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
+  if (data == NULL) return VPX_CODEC_INVALID_PARAM;
+  // No worker exists before the first decode call; fail instead of crashing.
+  if (ctx->frame_workers == NULL) return VPX_CODEC_ERROR;
+
+  frame_worker_data = (FrameWorkerData *)ctx->frame_workers->data1;
+  fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx);
+  if (fb == NULL) return VPX_CODEC_ERROR;
+  yuvconfig2image(&data->img, fb, NULL);  // user_priv argument is NULL here
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx,
+                                         va_list args) {  // VP8_SET_POSTPROC
+#if CONFIG_VP9_POSTPROC  // NOTE(review): VP9 macro gates this VP10 handler
+  vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
+
+  if (data) {
+    ctx->postproc_cfg_set = 1;  // stops init_decoder() from applying defaults
+    ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);  // stored by value
+    return VPX_CODEC_OK;
+  } else {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+#else
+  (void)ctx;
+  (void)args;
+  return VPX_CODEC_INCAPABLE;  // built without the postproc option
+#endif
+}
+
+static vpx_codec_err_t ctrl_set_dbg_options(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {  // VP8_SET_DBG_* ids
+  (void)ctx;
+  (void)args;
+  return VPX_CODEC_INCAPABLE;  // arguments are ignored; always this result
+}
+
+static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  int *const update_info = va_arg(args, int *);  // output; NULL is rejected
+
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
+  if (update_info) {
+    if (ctx->frame_workers) {  // workers exist only after init_decoder()
+      VPxWorker *const worker = ctx->frame_workers;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      *update_info = frame_worker_data->pbi->refresh_frame_flags;
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
+  }
+
+  return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  int *corrupted = va_arg(args, int *);  // output; NULL is rejected
+
+  if (corrupted) {
+    if (ctx->frame_workers) {  // no frame-parallel restriction in this handler
+      VPxWorker *const worker = ctx->frame_workers;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      RefCntBuffer *const frame_bufs =
+          frame_worker_data->pbi->common.buffer_pool->frame_bufs;
+      if (frame_worker_data->pbi->common.frame_to_show == NULL)
+        return VPX_CODEC_ERROR;
+      if (ctx->last_show_frame >= 0)  // NOTE(review): else *corrupted is unwritten
+        *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
+  }
+
+  return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  int *const frame_size = va_arg(args, int *);  // output; two ints are written
+
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
+  if (frame_size) {
+    if (ctx->frame_workers) {  // workers exist only after init_decoder()
+      VPxWorker *const worker = ctx->frame_workers;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const VP10_COMMON *const cm = &frame_worker_data->pbi->common;
+      frame_size[0] = cm->width;  // [0] = width, [1] = height
+      frame_size[1] = cm->height;
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
+  }
+
+  return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  int *const render_size = va_arg(args, int *);  // output; two ints are written
+
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
+  if (render_size) {
+    if (ctx->frame_workers) {  // workers exist only after init_decoder()
+      VPxWorker *const worker = ctx->frame_workers;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const VP10_COMMON *const cm = &frame_worker_data->pbi->common;
+      render_size[0] = cm->render_width;  // [0] = width, [1] = height
+      render_size[1] = cm->render_height;
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
+  }
+
+  return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {  // VP9D_GET_BIT_DEPTH
+  unsigned int *const bit_depth = va_arg(args, unsigned int *);
+  FrameWorkerData *frame_worker_data;
+
+  if (bit_depth == NULL) return VPX_CODEC_INVALID_PARAM;
+
+  // Test the worker array itself. The address of one of its elements is
+  // only NULL when the array is NULL and the index happens to be 0, so it
+  // is not a reliable check that the decoder has been initialized.
+  if (ctx->frame_workers == NULL) return VPX_CODEC_ERROR;
+
+  // Read from the worker that is next in line for output.
+  frame_worker_data = (FrameWorkerData *)
+      ctx->frame_workers[ctx->next_output_worker_id].data1;
+  *bit_depth = frame_worker_data->pbi->common.bit_depth;
+
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  ctx->invert_tile_order = va_arg(args, int);  // copied to pbi in init_decoder()
+  return VPX_CODEC_OK;  // stored on ctx only; existing workers are not touched
+}
+
+static vpx_codec_err_t ctrl_set_decryptor(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {  // VPXD_SET_DECRYPTOR
+  vpx_decrypt_init *const cfg = va_arg(args, vpx_decrypt_init *);
+  ctx->decrypt_state = (cfg != NULL) ? cfg->decrypt_state : NULL;
+  ctx->decrypt_cb = (cfg != NULL) ? cfg->decrypt_cb : NULL;  // NULL arg clears
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  const int legacy_byte_alignment = 0;  // 0 is accepted without the checks below
+  const int min_byte_alignment = 32;
+  const int max_byte_alignment = 1024;
+  const int byte_alignment = va_arg(args, int);
+
+  if (byte_alignment != legacy_byte_alignment &&
+      (byte_alignment < min_byte_alignment ||
+       byte_alignment > max_byte_alignment ||
+       (byte_alignment & (byte_alignment - 1)) != 0))  // not a power of two
+    return VPX_CODEC_INVALID_PARAM;
+
+  ctx->byte_alignment = byte_alignment;
+  if (ctx->frame_workers) {  // NOTE(review): only frame_workers[0] is updated
+    VPxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data =
+        (FrameWorkerData *)worker->data1;
+    frame_worker_data->pbi->common.byte_alignment = byte_alignment;
+  }
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  ctx->skip_loop_filter = va_arg(args, int);  // stored without validation
+
+  if (ctx->frame_workers) {  // NOTE(review): only frame_workers[0] is updated
+    VPxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter;
+  }
+
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {  // control id -> handler
+  {VP8_COPY_REFERENCE,            ctrl_copy_reference},
+
+  // Setters (the four VP8_SET_DBG_* ids share one handler)
+  {VP8_SET_REFERENCE,             ctrl_set_reference},
+  {VP8_SET_POSTPROC,              ctrl_set_postproc},
+  {VP8_SET_DBG_COLOR_REF_FRAME,   ctrl_set_dbg_options},
+  {VP8_SET_DBG_COLOR_MB_MODES,    ctrl_set_dbg_options},
+  {VP8_SET_DBG_COLOR_B_MODES,     ctrl_set_dbg_options},
+  {VP8_SET_DBG_DISPLAY_MV,        ctrl_set_dbg_options},
+  {VP9_INVERT_TILE_DECODE_ORDER,  ctrl_set_invert_tile_order},
+  {VPXD_SET_DECRYPTOR,            ctrl_set_decryptor},
+  {VP9_SET_BYTE_ALIGNMENT,        ctrl_set_byte_alignment},
+  {VP9_SET_SKIP_LOOP_FILTER,      ctrl_set_skip_loop_filter},
+
+  // Getters (each writes its result through a pointer taken from the va_list)
+  {VP8D_GET_LAST_REF_UPDATES,     ctrl_get_last_ref_updates},
+  {VP8D_GET_FRAME_CORRUPTED,      ctrl_get_frame_corrupted},
+  {VP9_GET_REFERENCE,             ctrl_get_reference},
+  {VP9D_GET_DISPLAY_SIZE,         ctrl_get_render_size},
+  {VP9D_GET_BIT_DEPTH,            ctrl_get_bit_depth},
+  {VP9D_GET_FRAME_SIZE,           ctrl_get_frame_size},
+
+  { -1, NULL},  // last entry; presumably the end-of-table marker, confirm
+};
+
+#ifndef VERSION_STRING  // fall back to an empty suffix when none is defined
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp10_dx) = {
+  "WebM Project VP10 Decoder" VERSION_STRING,  // adjacent literals concatenate
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+  VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC |
+      VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER,  // vpx_codec_caps_t
+  decoder_init,       // vpx_codec_init_fn_t
+  decoder_destroy,    // vpx_codec_destroy_fn_t
+  decoder_ctrl_maps,  // vpx_codec_ctrl_fn_map_t
+  { // NOLINT (decoder entry points)
+    decoder_peek_si,    // vpx_codec_peek_si_fn_t
+    decoder_get_si,     // vpx_codec_get_si_fn_t
+    decoder_decode,     // vpx_codec_decode_fn_t
+    decoder_get_frame,  // vpx_codec_frame_get_fn_t
+    decoder_set_fb_fn,  // vpx_codec_set_fb_fn_t
+  },
+  { // NOLINT (encoder entry points: zero and NULL throughout)
+    0,
+    NULL,  // vpx_codec_enc_cfg_map_t
+    NULL,  // vpx_codec_encode_fn_t
+    NULL,  // vpx_codec_get_cx_data_fn_t
+    NULL,  // vpx_codec_enc_config_set_fn_t
+    NULL,  // vpx_codec_get_global_headers_fn_t
+    NULL,  // vpx_codec_get_preview_frame_fn_t
+    NULL   // vpx_codec_enc_mr_get_mem_loc_fn_t
+  }
+};
diff --git a/libs/libvpx/vp10/vp10_iface_common.h b/libs/libvpx/vp10/vp10_iface_common.h
new file mode 100644
index 0000000000..b2b4b7d8fc
--- /dev/null
+++ b/libs/libvpx/vp10/vp10_iface_common.h
@@ -0,0 +1,136 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP10_VP10_IFACE_COMMON_H_
+#define VP10_VP10_IFACE_COMMON_H_
+
+#include "vpx_ports/mem.h"
+
+static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG  *yv12,
+                            void *user_priv) {
+  /* Populate every vpx_image_t field by hand from the frame buffer.
+   * vpx_img_wrap() is not used because it cannot express independent
+   * Y/U/V strides or the other alignment adjustments that a
+   * YV12_BUFFER_CONFIG may carry. */
+  // Format and bits-per-sample follow from the chroma subsampling flags.
+  if (yv12->subsampling_y) {
+    if (yv12->subsampling_x) {
+      img->fmt = VPX_IMG_FMT_I420;
+      img->bps = 12;
+    } else {
+      img->fmt = VPX_IMG_FMT_I440;
+      img->bps = 16;
+    }
+  } else {
+    if (yv12->subsampling_x) {
+      img->fmt = VPX_IMG_FMT_I422;
+      img->bps = 16;
+    } else {
+      img->fmt = VPX_IMG_FMT_I444;
+      img->bps = 24;
+    }
+  }
+  img->x_chroma_shift = yv12->subsampling_x;
+  img->y_chroma_shift = yv12->subsampling_y;
+  img->bit_depth = 8;
+  img->cs = yv12->color_space;
+  img->range = yv12->color_range;
+  img->w = yv12->y_stride;
+  img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
+  img->d_w = yv12->y_crop_width;
+  img->d_h = yv12->y_crop_height;
+  img->r_w = yv12->render_width;
+  img->r_h = yv12->render_height;
+  img->planes[VPX_PLANE_Y] = yv12->y_buffer;
+  img->stride[VPX_PLANE_Y] = yv12->y_stride;
+  img->planes[VPX_PLANE_U] = yv12->u_buffer;
+  img->stride[VPX_PLANE_U] = yv12->uv_stride;
+  img->planes[VPX_PLANE_V] = yv12->v_buffer;
+  img->stride[VPX_PLANE_V] = yv12->uv_stride;
+  img->planes[VPX_PLANE_ALPHA] = NULL;
+  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // High bit depth: vpx_image_t wants a pointer to the first byte of
+    // each plane and strides counted in bytes, hence the doubling.
+    img->bit_depth = yv12->bit_depth;
+    img->fmt = (vpx_img_fmt_t)(img->fmt | VPX_IMG_FMT_HIGHBITDEPTH);
+    img->planes[VPX_PLANE_Y] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+    img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride;
+    img->planes[VPX_PLANE_U] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+    img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride;
+    img->planes[VPX_PLANE_V] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+    img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride;
+    img->planes[VPX_PLANE_ALPHA] = NULL;
+    img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // img_data points at yv12's allocation; both ownership flags are cleared.
+  img->img_data = yv12->buffer_alloc;
+  img->img_data_owner = 0;
+  img->self_allocd = 0;
+  img->user_priv = user_priv;
+}
+
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+                                       YV12_BUFFER_CONFIG *yv12) {
+  // Geometry: the luma plane and its crop size both take the image's
+  // display size; the render size is copied through unchanged.
+  yv12->y_width  = img->d_w;
+  yv12->y_height = img->d_h;
+  yv12->y_crop_width  = img->d_w;
+  yv12->y_crop_height = img->d_h;
+  yv12->render_width  = img->r_w;
+  yv12->render_height = img->r_h;
+
+  // Chroma planes are halved (rounding up) only when the shift is exactly 1.
+  if (img->x_chroma_shift == 1)
+    yv12->uv_width = (1 + yv12->y_width) / 2;
+  else
+    yv12->uv_width = yv12->y_width;
+  if (img->y_chroma_shift == 1)
+    yv12->uv_height = (1 + yv12->y_height) / 2;
+  else
+    yv12->uv_height = yv12->y_height;
+  yv12->uv_crop_width = yv12->uv_width;
+  yv12->uv_crop_height = yv12->uv_height;
+  yv12->subsampling_x = img->x_chroma_shift;
+  yv12->subsampling_y = img->y_chroma_shift;
+
+  yv12->color_space = img->cs;
+  yv12->color_range = img->range;
+  yv12->y_buffer = img->planes[VPX_PLANE_Y];
+  yv12->u_buffer = img->planes[VPX_PLANE_U];
+  yv12->v_buffer = img->planes[VPX_PLANE_V];
+  yv12->y_stride = img->stride[VPX_PLANE_Y];
+  yv12->uv_stride = img->stride[VPX_PLANE_U];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (!(img->fmt & VPX_IMG_FMT_HIGHBITDEPTH)) {
+    yv12->flags = 0;
+  } else {
+    // vpx_image_t planes are uint8 addresses and its strides count uint8s,
+    // whereas YV12_BUFFER_CONFIG buffers and strides are in uint16 units.
+    // Convert the plane pointers with CONVERT_TO_BYTEPTR and halve the
+    // strides so address calculations in the main body of code work.
+    yv12->flags = YV12_FLAG_HIGHBITDEPTH;
+    yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
+    yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
+    yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
+    yv12->y_stride >>= 1;
+    yv12->uv_stride >>= 1;
+  }
+  yv12->border  = (yv12->y_stride - img->w) / 2;
+#else
+  yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  return VPX_CODEC_OK;
+}
+
+#endif  // VP10_VP10_IFACE_COMMON_H_
diff --git a/libs/libvpx/vp10/vp10cx.mk b/libs/libvpx/vp10/vp10cx.mk
new file mode 100644
index 0000000000..dc3b27139c
--- /dev/null
+++ b/libs/libvpx/vp10/vp10cx.mk
@@ -0,0 +1,128 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+VP10_CX_EXPORTS += exports_enc
+# Start from the sources (and removals) listed in the VP10_COMMON_* variables.
+VP10_CX_SRCS-yes += $(VP10_COMMON_SRCS-yes)
+VP10_CX_SRCS-no  += $(VP10_COMMON_SRCS-no)
+VP10_CX_SRCS_REMOVE-yes += $(VP10_COMMON_SRCS_REMOVE-yes)
+VP10_CX_SRCS_REMOVE-no  += $(VP10_COMMON_SRCS_REMOVE-no)
+
+VP10_CX_SRCS-yes += vp10_cx_iface.c
+# Encoder sources. A CONFIG_*/HAVE_* suffix picks the list by its value (presumably yes/no).
+VP10_CX_SRCS-yes += encoder/bitstream.c
+VP10_CX_SRCS-yes += encoder/context_tree.c
+VP10_CX_SRCS-yes += encoder/context_tree.h
+VP10_CX_SRCS-yes += encoder/cost.h
+VP10_CX_SRCS-yes += encoder/cost.c
+VP10_CX_SRCS-yes += encoder/dct.c
+VP10_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/denoiser.c
+VP10_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/denoiser.h
+VP10_CX_SRCS-yes += encoder/encodeframe.c
+VP10_CX_SRCS-yes += encoder/encodeframe.h
+VP10_CX_SRCS-yes += encoder/encodemb.c
+VP10_CX_SRCS-yes += encoder/encodemv.c
+VP10_CX_SRCS-yes += encoder/ethread.h
+VP10_CX_SRCS-yes += encoder/ethread.c
+VP10_CX_SRCS-yes += encoder/extend.c
+VP10_CX_SRCS-yes += encoder/firstpass.c
+VP10_CX_SRCS-yes += encoder/block.h
+VP10_CX_SRCS-yes += encoder/bitstream.h
+VP10_CX_SRCS-yes += encoder/encodemb.h
+VP10_CX_SRCS-yes += encoder/encodemv.h
+VP10_CX_SRCS-yes += encoder/extend.h
+VP10_CX_SRCS-yes += encoder/firstpass.h
+VP10_CX_SRCS-yes += encoder/lookahead.c
+VP10_CX_SRCS-yes += encoder/lookahead.h
+VP10_CX_SRCS-yes += encoder/mcomp.h
+VP10_CX_SRCS-yes += encoder/encoder.h
+VP10_CX_SRCS-yes += encoder/quantize.h
+VP10_CX_SRCS-yes += encoder/ratectrl.h
+VP10_CX_SRCS-yes += encoder/rd.h
+VP10_CX_SRCS-yes += encoder/rdopt.h
+VP10_CX_SRCS-yes += encoder/tokenize.h
+VP10_CX_SRCS-yes += encoder/treewriter.h
+VP10_CX_SRCS-yes += encoder/mcomp.c
+VP10_CX_SRCS-yes += encoder/encoder.c
+VP10_CX_SRCS-yes += encoder/picklpf.c
+VP10_CX_SRCS-yes += encoder/picklpf.h
+VP10_CX_SRCS-yes += encoder/quantize.c
+VP10_CX_SRCS-yes += encoder/ratectrl.c
+VP10_CX_SRCS-yes += encoder/rd.c
+VP10_CX_SRCS-yes += encoder/rdopt.c
+VP10_CX_SRCS-yes += encoder/segmentation.c
+VP10_CX_SRCS-yes += encoder/segmentation.h
+VP10_CX_SRCS-yes += encoder/speed_features.c
+VP10_CX_SRCS-yes += encoder/speed_features.h
+VP10_CX_SRCS-yes += encoder/subexp.c
+VP10_CX_SRCS-yes += encoder/subexp.h
+VP10_CX_SRCS-yes += encoder/resize.c
+VP10_CX_SRCS-yes += encoder/resize.h
+VP10_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/blockiness.c
+
+VP10_CX_SRCS-yes += encoder/tokenize.c
+VP10_CX_SRCS-yes += encoder/treewriter.c
+VP10_CX_SRCS-yes += encoder/aq_variance.c
+VP10_CX_SRCS-yes += encoder/aq_variance.h
+VP10_CX_SRCS-yes += encoder/aq_cyclicrefresh.c
+VP10_CX_SRCS-yes += encoder/aq_cyclicrefresh.h
+VP10_CX_SRCS-yes += encoder/aq_complexity.c
+VP10_CX_SRCS-yes += encoder/aq_complexity.h
+VP10_CX_SRCS-yes += encoder/skin_detection.c
+VP10_CX_SRCS-yes += encoder/skin_detection.h
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP10_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
+VP10_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
+endif
+VP10_CX_SRCS-yes += encoder/temporal_filter.c
+VP10_CX_SRCS-yes += encoder/temporal_filter.h
+VP10_CX_SRCS-yes += encoder/mbgraph.c
+VP10_CX_SRCS-yes += encoder/mbgraph.h
+# x86 sources, gated on the HAVE_* / CONFIG_* / ARCH_* settings.
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/highbd_block_error_intrin_sse2.c
+endif
+
+ifeq ($(CONFIG_USE_X86INC),yes)
+VP10_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/error_sse2.asm
+endif
+
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3_x86_64.asm
+endif
+endif
+
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.c
+VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
+
+ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoiser_sse2.c
+endif
+
+VP10_CX_SRCS-$(HAVE_AVX2) += encoder/x86/error_intrin_avx2.c
+# ARM NEON sources; the dct/error files are left out when CONFIG_VP9_HIGHBITDEPTH is yes.
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/dct_neon.c
+VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/error_neon.c
+endif
+VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/quantize_neon.c
+# MIPS MSA sources.
+VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/error_msa.c
+VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct4x4_msa.c
+VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct8x8_msa.c
+VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct16x16_msa.c
+VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct_msa.h
+VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
+# Finally, strip every entry named in VP10_CX_SRCS_REMOVE-yes from the -yes list.
+VP10_CX_SRCS-yes := $(filter-out $(VP10_CX_SRCS_REMOVE-yes),$(VP10_CX_SRCS-yes))
diff --git a/libs/libvpx/vp10/vp10dx.mk b/libs/libvpx/vp10/vp10dx.mk
new file mode 100644
index 0000000000..fce6d0d7d7
--- /dev/null
+++ b/libs/libvpx/vp10/vp10dx.mk
@@ -0,0 +1,33 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+VP10_DX_EXPORTS += exports_dec
+# Start from the sources (and removals) listed in the VP10_COMMON_* variables.
+VP10_DX_SRCS-yes += $(VP10_COMMON_SRCS-yes)
+VP10_DX_SRCS-no  += $(VP10_COMMON_SRCS-no)
+VP10_DX_SRCS_REMOVE-yes += $(VP10_COMMON_SRCS_REMOVE-yes)
+VP10_DX_SRCS_REMOVE-no  += $(VP10_COMMON_SRCS_REMOVE-no)
+
+VP10_DX_SRCS-yes += vp10_dx_iface.c
+# Decoder sources, all appended unconditionally to the -yes list.
+VP10_DX_SRCS-yes += decoder/decodemv.c
+VP10_DX_SRCS-yes += decoder/decodeframe.c
+VP10_DX_SRCS-yes += decoder/decodeframe.h
+VP10_DX_SRCS-yes += decoder/detokenize.c
+VP10_DX_SRCS-yes += decoder/decodemv.h
+VP10_DX_SRCS-yes += decoder/detokenize.h
+VP10_DX_SRCS-yes += decoder/dthread.c
+VP10_DX_SRCS-yes += decoder/dthread.h
+VP10_DX_SRCS-yes += decoder/decoder.c
+VP10_DX_SRCS-yes += decoder/decoder.h
+VP10_DX_SRCS-yes += decoder/dsubexp.c
+VP10_DX_SRCS-yes += decoder/dsubexp.h
+# Finally, strip every entry named in VP10_DX_SRCS_REMOVE-yes from the -yes list.
+VP10_DX_SRCS-yes := $(filter-out $(VP10_DX_SRCS_REMOVE-yes),$(VP10_DX_SRCS-yes))
diff --git a/libs/libvpx/vp8/common/alloccommon.c b/libs/libvpx/vp8/common/alloccommon.c
new file mode 100644
index 0000000000..8dfd4ce203
--- /dev/null
+++ b/libs/libvpx/vp8/common/alloccommon.c
@@ -0,0 +1,190 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "alloccommon.h"
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxc_int.h"
+#include "findnearmv.h"
+#include "entropymode.h"
+#include "systemdependent.h"
+
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
+{
+    int fb;
+    /* Release every entry of yv12_fb, then the temp_scale_frame buffer. */
+    for (fb = 0; fb < NUM_YV12_BUFFERS; ++fb)
+        vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[fb]);
+    vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
+#if CONFIG_POSTPROC
+    vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+    if (oci->post_proc_buffer_int_used != 0)
+        vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int);
+    vpx_free(oci->pp_limits_buffer);
+    oci->pp_limits_buffer = NULL;
+#endif
+
+    /* Free the remaining arrays and reset each pointer right away so no
+     * stale address is left behind in the context. */
+    vpx_free(oci->above_context);
+    oci->above_context = NULL;
+    vpx_free(oci->mip);
+    oci->mip = NULL;
+#if CONFIG_ERROR_CONCEALMENT
+    vpx_free(oci->prev_mip);
+    oci->prev_mip = NULL;
+#endif
+}
+/* Allocate all buffers for a width x height frame; returns 0 on success, 1 on failure. */
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
+{
+    int i;
+
+    vp8_de_alloc_frame_buffers(oci);    /* drop any previous allocation first */
+
+    /* our internal buffers are always multiples of 16 */
+    if ((width & 0xf) != 0)
+        width += 16 - (width & 0xf);
+
+    if ((height & 0xf) != 0)
+        height += 16 - (height & 0xf);
+
+    /* Allocate each frame buffer with a zero reference count and cleared flags. */
+    for (i = 0; i < NUM_YV12_BUFFERS; i++)
+    {
+        oci->fb_idx_ref_cnt[i] = 0;
+        oci->yv12_fb[i].flags = 0;
+        if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0)
+            goto allocation_fail;
+    }
+    /* Buffers 0..3 become the new/lst/gld/alt indices, one reference each. */
+    oci->new_fb_idx = 0;
+    oci->lst_fb_idx = 1;
+    oci->gld_fb_idx = 2;
+    oci->alt_fb_idx = 3;
+
+    oci->fb_idx_ref_cnt[0] = 1;
+    oci->fb_idx_ref_cnt[1] = 1;
+    oci->fb_idx_ref_cnt[2] = 1;
+    oci->fb_idx_ref_cnt[3] = 1;
+
+    if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame,   width, 16, VP8BORDERINPIXELS) < 0)
+        goto allocation_fail;
+    /* Macroblock grid: the (rounded-up) width and height in units of 16 pixels. */
+    oci->mb_rows = height >> 4;
+    oci->mb_cols = width >> 4;
+    oci->MBs = oci->mb_rows * oci->mb_cols;
+    oci->mode_info_stride = oci->mb_cols + 1;
+    oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+    if (!oci->mip)
+        goto allocation_fail;
+    /* mi skips the first row and the first column of the mip array. */
+    oci->mi = oci->mip + oci->mode_info_stride + 1;
+
+    /* Allocation of previous mode info will be done in vp8_decode_frame()
+     * as it is a decoder only data */
+
+    oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
+
+    if (!oci->above_context)
+        goto allocation_fail;
+
+#if CONFIG_POSTPROC
+    if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0)
+        goto allocation_fail;
+
+    oci->post_proc_buffer_int_used = 0;
+    memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
+    memset(oci->post_proc_buffer.buffer_alloc, 128,
+           oci->post_proc_buffer.frame_size);
+
+    /* Allocate buffer to store post-processing filter coefficients.
+     *
+     * Note: Round up mb_cols to support SIMD reads
+     */
+    oci->pp_limits_buffer = vpx_memalign(16, 24 * ((oci->mb_cols + 1) & ~1));
+    if (!oci->pp_limits_buffer)
+        goto allocation_fail;
+#endif
+
+    return 0;
+
+allocation_fail:
+    vp8_de_alloc_frame_buffers(oci);    /* release whatever was allocated so far */
+    return 1;
+}
+
+void vp8_setup_version(VP8_COMMON *cm)
+{
+    /* Begin with the version 0 profile. Versions 4,5,6,7 are reserved
+     * for future use and, like any other unlisted value, end up with
+     * exactly these settings. */
+    cm->no_lpf = 0;
+    cm->filter_type = NORMAL_LOOPFILTER;
+    cm->use_bilinear_mc_filter = 0;
+    cm->full_pixel = 0;
+
+    /* Versions 1-3 then override only the fields that differ. */
+    switch (cm->version)
+    {
+    case 1:
+        /* simple loop filter type, bilinear mc filter */
+        cm->filter_type = SIMPLE_LOOPFILTER;
+        cm->use_bilinear_mc_filter = 1;
+        break;
+
+    case 2:
+        /* no_lpf set, bilinear mc filter */
+        cm->no_lpf = 1;
+        cm->use_bilinear_mc_filter = 1;
+        break;
+
+    case 3:
+        /* no_lpf set, simple loop filter type, bilinear mc filter, full_pixel */
+        cm->no_lpf = 1;
+        cm->filter_type = SIMPLE_LOOPFILTER;
+        cm->use_bilinear_mc_filter = 1;
+        cm->full_pixel = 1;
+        break;
+
+    default:
+        break;
+    }
+}
+void vp8_create_common(VP8_COMMON *oci)
+{
+    /* Machine-specific configuration, then the default mode probabilities. */
+    vp8_machine_specific_config(oci);
+    vp8_init_mbmode_probs(oci);
+    vp8_default_bmode_probs(oci->fc.bmode_prob);
+
+    /* Buffer-to-buffer copying is disabled by default. */
+    oci->copy_buffer_to_arf = 0;
+    oci->copy_buffer_to_gf = 0;
+
+    /* Every reference frame sign bias starts out as zero. */
+    memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+
+    oci->clamp_type = RECON_CLAMP_REQUIRED;
+    oci->multi_token_partition = ONE_PARTITION;
+    oci->mb_no_coeff_skip = 1;
+    oci->full_pixel = 0;
+    oci->use_bilinear_mc_filter = 0;
+    oci->filter_type = NORMAL_LOOPFILTER;
+    oci->no_lpf = 0;
+}
+/* Tear down a VP8_COMMON: its only action is to release the frame buffers. */
+void vp8_remove_common(VP8_COMMON *oci)
+{
+    vp8_de_alloc_frame_buffers(oci);
+}
diff --git a/libs/libvpx/vp8/common/alloccommon.h b/libs/libvpx/vp8/common/alloccommon.h
new file mode 100644
index 0000000000..93e99d76b1
--- /dev/null
+++ b/libs/libvpx/vp8/common/alloccommon.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ALLOCCOMMON_H_
+#define VP8_COMMON_ALLOCCOMMON_H_
+
+#include "onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Setup, teardown and frame-buffer management for a VP8_COMMON context. */
+void vp8_create_common(VP8_COMMON *oci);  /* set default field values */
+void vp8_remove_common(VP8_COMMON *oci);  /* releases the frame buffers */
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci);  /* frees buffers, NULLs pointers */
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height);  /* 0 = ok, 1 = failure */
+void vp8_setup_version(VP8_COMMON *oci);  /* derives filter settings from oci->version */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_ALLOCCOMMON_H_
diff --git a/libs/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm b/libs/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm
new file mode 100644
index 0000000000..9704b42105
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm
@@ -0,0 +1,237 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_filter_block2d_bil_first_pass_armv6|
+    EXPORT  |vp8_filter_block2d_bil_second_pass_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;-------------------------------------
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
+;-------------------------------------
+; The output is stored transposed in the output array to make second pass filtering easy.
+|vp8_filter_block2d_bil_first_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}             ; save 9 registers (36 bytes)
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    mov     r12, r3                         ; outer-loop counter
+
+    add     r7, r2, r4                      ; preload next row
+    pld     [r0, r7]
+
+    sub     r2, r2, r4                      ; src increment for height loop
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+
+    mov     r3, r3, lsl #1                  ; height*2
+    add     r3, r3, #2                      ; plus 2 to make output buffer 4-byte aligned since height is actually (height+1)
+
+    mov     r11, r1                         ; save dst_ptr for each row
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+    ldrb    r6, [r0]                        ; load source data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+    ldrb    r9, [r0, #3]
+    ldrb    r10, [r0, #4]
+
+    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
+    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
+
+    smuad   r6, r6, r5                      ; apply the filter
+    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
+    smuad   r7, r7, r5
+    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
+
+    smuad   r8, r8, r5
+    smuad   r9, r9, r5
+
+    add     r0, r0, #4
+    subs    lr, lr, #1
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #16, r6, asr #7
+    usat    r7, #16, r7, asr #7
+
+    strh    r6, [r1], r3                    ; result is transposed and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strh    r7, [r1], r3
+    add     r9, r9, #0x40
+    usat    r8, #16, r8, asr #7
+    usat    r9, #16, r9, asr #7
+
+    strh    r8, [r1], r3                    ; result is transposed and stored
+
+    ldrneb  r6, [r0]                        ; load source data
+    strh    r9, [r1], r3
+
+    ldrneb  r7, [r0, #1]
+    ldrneb  r8, [r0, #2]
+
+    bne     bil_width_loop_1st_v6
+
+    add     r0, r0, r2                      ; move to next input row
+    subs    r12, r12, #1
+
+    add     r9, r2, r4, lsl #1              ; adding back block width
+    pld     [r0, r9]                        ; preload next row
+
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_1st_v6
+
+    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+    mov     lr, r4, lsr #2                  ; loop counter
+
+|bil_width_loop_null_1st|
+    ldrb    r6, [r0]                        ; load data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    ldrb    r9, [r0, #3]
+
+    strh    r6, [r1], r3                    ; store it to immediate buffer
+    add     r0, r0, #4
+    strh    r7, [r1], r3
+    subs    lr, lr, #1
+    strh    r8, [r1], r3
+    strh    r9, [r1], r3
+
+    bne     bil_width_loop_null_1st
+
+    subs    r12, r12, #1
+    add     r0, r0, r2                      ; move to next input line
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_null_1st
+
+    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
+
+    ENDP  ; |vp8_filter_block2d_bil_first_pass_armv6|
+
+
+;---------------------------------
+; r0    unsigned short *src_ptr,
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
+;---------------------------------
+|vp8_filter_block2d_bil_second_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}             ; save 9 registers (36 bytes)
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
+    mov     r11, r1
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+    ldr     r6, [r0]                        ; load the data
+    ldr     r8, [r0, #4]
+    ldrh    r10, [r0, #8]
+    mov     lr, r3, lsr #2                  ; loop counter
+
+|bil_width_loop_2nd|
+    pkhtb   r7, r6, r8                      ; src[1] | src[2]
+    pkhtb   r9, r8, r10                     ; src[3] | src[4]
+
+    smuad   r6, r6, r5                      ; apply filter
+    smuad   r8, r8, r5                      ; apply filter
+
+    subs    lr, lr, #1
+
+    smuadx  r7, r7, r5                      ; apply filter
+    smuadx  r9, r9, r5                      ; apply filter
+
+    add     r0, r0, #8
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #8, r6, asr #7
+    usat    r7, #8, r7, asr #7
+    strb    r6, [r1], r2                    ; the result is transposed back and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strb    r7, [r1], r2
+    add     r9, r9, #0x40
+    usat    r8, #8, r8, asr #7
+    usat    r9, #8, r9, asr #7
+    strb    r8, [r1], r2                    ; the result is transposed back and stored
+
+    ldrne   r6, [r0]                        ; load data
+    strb    r9, [r1], r2
+    ldrne   r8, [r0, #4]
+    ldrneh  r10, [r0, #8]
+
+    bne     bil_width_loop_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4                      ; update src for next row
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_2nd
+    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+    mov     lr, r3, lsr #2                  ; loop counter
+
+|bil_width_loop_null_2nd|
+    ldr     r6, [r0], #4                    ; load data
+    subs    lr, lr, #1
+    ldr     r8, [r0], #4
+
+    strb    r6, [r1], r2                    ; store data
+    mov     r7, r6, lsr #16
+    strb    r7, [r1], r2
+    mov     r9, r8, lsr #16
+    strb    r8, [r1], r2
+    strb    r9, [r1], r2
+
+    bne     bil_width_loop_null_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4                      ; update src for next row
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_null_2nd
+
+    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
+    ENDP  ; |vp8_filter_block2d_bil_second_pass_armv6|
+
+    END
diff --git a/libs/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm b/libs/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm
new file mode 100644
index 0000000000..abf048c2fa
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm
@@ -0,0 +1,186 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_copy_mem16x16_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem16x16_v6| PROC
+    stmdb       sp!, {r4 - r7}
+    ;push   {r4-r7}
+    ; NOTE(review): only src (r0) alignment is tested below; dst (r2) alignment is not checked - confirm callers
+    ;preload
+    pld     [r0, #31]                ; preload for next 16x16 block
+
+    ands    r4, r0, #15              ; src 16-byte aligned?
+    beq     copy_mem16x16_fast
+
+    ands    r4, r0, #7               ; src 8-byte aligned?
+    beq     copy_mem16x16_8
+
+    ands    r4, r0, #3               ; src 4-byte aligned?
+    beq     copy_mem16x16_4
+
+    ;copy one byte each time
+    ldrb    r4, [r0]
+    ldrb    r5, [r0, #1]
+    ldrb    r6, [r0, #2]
+    ldrb    r7, [r0, #3]
+
+    mov     r12, #16                 ; row counter
+
+copy_mem16x16_1_loop
+    strb    r4, [r2]
+    strb    r5, [r2, #1]
+    strb    r6, [r2, #2]
+    strb    r7, [r2, #3]
+
+    ldrb    r4, [r0, #4]
+    ldrb    r5, [r0, #5]
+    ldrb    r6, [r0, #6]
+    ldrb    r7, [r0, #7]
+
+    subs    r12, r12, #1             ; sets the flags used by ldrneb/bne below
+
+    strb    r4, [r2, #4]
+    strb    r5, [r2, #5]
+    strb    r6, [r2, #6]
+    strb    r7, [r2, #7]
+
+    ldrb    r4, [r0, #8]
+    ldrb    r5, [r0, #9]
+    ldrb    r6, [r0, #10]
+    ldrb    r7, [r0, #11]
+
+    strb    r4, [r2, #8]
+    strb    r5, [r2, #9]
+    strb    r6, [r2, #10]
+    strb    r7, [r2, #11]
+
+    ldrb    r4, [r0, #12]
+    ldrb    r5, [r0, #13]
+    ldrb    r6, [r0, #14]
+    ldrb    r7, [r0, #15]
+
+    add     r0, r0, r1               ; next src row
+
+    strb    r4, [r2, #12]
+    strb    r5, [r2, #13]
+    strb    r6, [r2, #14]
+    strb    r7, [r2, #15]
+
+    add     r2, r2, r3               ; next dst row
+
+    ldrneb  r4, [r0]
+    ldrneb  r5, [r0, #1]
+    ldrneb  r6, [r0, #2]
+    ldrneb  r7, [r0, #3]
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+
+    bne     copy_mem16x16_1_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr                   ; return
+
+;copy 4 bytes each time
+copy_mem16x16_4
+    ldr     r4, [r0]
+    ldr     r5, [r0, #4]
+    ldr     r6, [r0, #8]
+    ldr     r7, [r0, #12]
+
+    mov     r12, #16                 ; row counter
+
+copy_mem16x16_4_loop
+    subs    r12, r12, #1
+    add     r0, r0, r1
+
+    str     r4, [r2]
+    str     r5, [r2, #4]
+    str     r6, [r2, #8]
+    str     r7, [r2, #12]
+
+    add     r2, r2, r3
+
+    ldrne   r4, [r0]
+    ldrne   r5, [r0, #4]
+    ldrne   r6, [r0, #8]
+    ldrne   r7, [r0, #12]
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+
+    bne     copy_mem16x16_4_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr                   ; return
+
+;copy 8 bytes each time
+copy_mem16x16_8
+    sub     r1, r1, #16              ; ldmia/stmia below write back 16 bytes per row,
+    sub     r3, r3, #16              ; so take that off both strides up front
+
+    mov     r12, #16                 ; row counter
+
+copy_mem16x16_8_loop
+    ldmia   r0!, {r4-r5}
+    ;ldm        r0, {r4-r5}
+    ldmia   r0!, {r6-r7}
+
+    add     r0, r0, r1
+
+    stmia   r2!, {r4-r5}
+    subs    r12, r12, #1
+    ;stm        r2, {r4-r5}
+    stmia   r2!, {r6-r7}
+
+    add     r2, r2, r3
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+    bne     copy_mem16x16_8_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr                   ; return
+
+;copy 16 bytes each time
+copy_mem16x16_fast
+    ;sub        r1, r1, #16
+    ;sub        r3, r3, #16
+
+    mov     r12, #16                 ; row counter
+
+copy_mem16x16_fast_loop
+    ldmia   r0, {r4-r7}              ; no writeback, so strides are used unadjusted
+    ;ldm        r0, {r4-r7}
+    add     r0, r0, r1
+
+    subs    r12, r12, #1
+    stmia   r2, {r4-r7}
+    ;stm        r2, {r4-r7}
+    add     r2, r2, r3
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+    bne     copy_mem16x16_fast_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr                   ; return
+
+    ENDP  ; |vp8_copy_mem16x16_v6|
+
+    END
diff --git a/libs/libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm b/libs/libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm
new file mode 100644
index 0000000000..d8362ef052
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm
@@ -0,0 +1,128 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_copy_mem8x4_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x4_v6| PROC              ; copy 4 rows of 8 bytes from src (r0) to dst (r2)
+    ;push   {r4-r5}
+    stmdb  sp!, {r4-r5}                 ; save r4-r5, used below as copy scratch
+
+    ;preload
+    pld     [r0]                        ; src row 0
+    pld     [r0, r1]                    ; src row 1
+    pld     [r0, r1, lsl #1]            ; src row 2
+
+    ands    r4, r0, #7                  ; src 8-byte aligned? (only src is tested, not dst)
+    beq     copy_mem8x4_fast
+
+    ands    r4, r0, #3                  ; src 4-byte aligned?
+    beq     copy_mem8x4_4
+
+    ;copy 1 byte each time
+    ldrb    r4, [r0]                    ; prime bytes 0-1 of the first row
+    ldrb    r5, [r0, #1]
+
+    mov     r12, #4                     ; row counter
+
+copy_mem8x4_1_loop
+    strb    r4, [r2]
+    strb    r5, [r2, #1]
+
+    ldrb    r4, [r0, #2]
+    ldrb    r5, [r0, #3]
+
+    subs    r12, r12, #1                ; sets Z for the ldrneb/bne below; nothing after it alters flags
+
+    strb    r4, [r2, #2]
+    strb    r5, [r2, #3]
+
+    ldrb    r4, [r0, #4]
+    ldrb    r5, [r0, #5]
+
+    strb    r4, [r2, #4]
+    strb    r5, [r2, #5]
+
+    ldrb    r4, [r0, #6]
+    ldrb    r5, [r0, #7]
+
+    add     r0, r0, r1                  ; src += src_stride
+
+    strb    r4, [r2, #6]
+    strb    r5, [r2, #7]
+
+    add     r2, r2, r3                  ; dst += dst_stride
+
+    ldrneb  r4, [r0]                    ; prime next row; skipped after the last row
+    ldrneb  r5, [r0, #1]
+
+    bne     copy_mem8x4_1_loop
+
+    ldmia       sp!, {r4 - r5}          ; restore r4-r5
+    ;pop        {r4-r5}
+    mov     pc, lr                      ; return
+
+;copy 4 bytes each time
+copy_mem8x4_4
+    ldr     r4, [r0]                    ; prime both words of the first row
+    ldr     r5, [r0, #4]
+
+    mov     r12, #4                     ; row counter
+
+copy_mem8x4_4_loop
+    subs    r12, r12, #1                ; flags consumed by ldrne/bne below
+    add     r0, r0, r1                  ; src += src_stride
+
+    str     r4, [r2]
+    str     r5, [r2, #4]
+
+    add     r2, r2, r3                  ; dst += dst_stride
+
+    ldrne   r4, [r0]                    ; load next row; skipped after the last row
+    ldrne   r5, [r0, #4]
+
+    bne     copy_mem8x4_4_loop
+
+    ldmia  sp!, {r4-r5}                 ; restore r4-r5
+    ;pop        {r4-r5}
+    mov     pc, lr                      ; return
+
+;copy 8 bytes each time
+copy_mem8x4_fast
+    ;sub        r1, r1, #8
+    ;sub        r3, r3, #8
+
+    mov     r12, #4                     ; row counter
+
+copy_mem8x4_fast_loop
+    ldmia   r0, {r4-r5}                 ; load one 8-byte row (no writeback)
+    ;ldm        r0, {r4-r5}
+    add     r0, r0, r1                  ; src += src_stride
+
+    subs    r12, r12, #1
+    stmia   r2, {r4-r5}                 ; store the row (no writeback)
+    ;stm        r2, {r4-r5}
+    add     r2, r2, r3                  ; dst += dst_stride
+
+    bne     copy_mem8x4_fast_loop
+
+    ldmia  sp!, {r4-r5}                 ; restore r4-r5
+    ;pop        {r4-r5}
+    mov     pc, lr                      ; return
+
+    ENDP  ; |vp8_copy_mem8x4_v6|
+
+    END
diff --git a/libs/libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm b/libs/libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm
new file mode 100644
index 0000000000..c6a60c610b
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm
@@ -0,0 +1,128 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_copy_mem8x8_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x8_v6| PROC              ; copy 8 rows of 8 bytes from src (r0) to dst (r2)
+    ;push   {r4-r5}
+    stmdb  sp!, {r4-r5}                 ; save r4-r5, used below as copy scratch
+
+    ;preload
+    pld     [r0]                        ; src row 0
+    pld     [r0, r1]                    ; src row 1
+    pld     [r0, r1, lsl #1]            ; src row 2
+
+    ands    r4, r0, #7                  ; src 8-byte aligned? (only src is tested, not dst)
+    beq     copy_mem8x8_fast
+
+    ands    r4, r0, #3                  ; src 4-byte aligned?
+    beq     copy_mem8x8_4
+
+    ;copy 1 byte each time
+    ldrb    r4, [r0]                    ; prime bytes 0-1 of the first row
+    ldrb    r5, [r0, #1]
+
+    mov     r12, #8                     ; row counter
+
+copy_mem8x8_1_loop
+    strb    r4, [r2]
+    strb    r5, [r2, #1]
+
+    ldrb    r4, [r0, #2]
+    ldrb    r5, [r0, #3]
+
+    subs    r12, r12, #1                ; sets Z for the ldrneb/bne below; nothing after it alters flags
+
+    strb    r4, [r2, #2]
+    strb    r5, [r2, #3]
+
+    ldrb    r4, [r0, #4]
+    ldrb    r5, [r0, #5]
+
+    strb    r4, [r2, #4]
+    strb    r5, [r2, #5]
+
+    ldrb    r4, [r0, #6]
+    ldrb    r5, [r0, #7]
+
+    add     r0, r0, r1                  ; src += src_stride
+
+    strb    r4, [r2, #6]
+    strb    r5, [r2, #7]
+
+    add     r2, r2, r3                  ; dst += dst_stride
+
+    ldrneb  r4, [r0]                    ; prime next row; skipped after the last row
+    ldrneb  r5, [r0, #1]
+
+    bne     copy_mem8x8_1_loop
+
+    ldmia       sp!, {r4 - r5}          ; restore r4-r5
+    ;pop        {r4-r5}
+    mov     pc, lr                      ; return
+
+;copy 4 bytes each time
+copy_mem8x8_4
+    ldr     r4, [r0]                    ; prime both words of the first row
+    ldr     r5, [r0, #4]
+
+    mov     r12, #8                     ; row counter
+
+copy_mem8x8_4_loop
+    subs    r12, r12, #1                ; flags consumed by ldrne/bne below
+    add     r0, r0, r1                  ; src += src_stride
+
+    str     r4, [r2]
+    str     r5, [r2, #4]
+
+    add     r2, r2, r3                  ; dst += dst_stride
+
+    ldrne   r4, [r0]                    ; load next row; skipped after the last row
+    ldrne   r5, [r0, #4]
+
+    bne     copy_mem8x8_4_loop
+
+    ldmia       sp!, {r4 - r5}          ; restore r4-r5
+    ;pop        {r4-r5}
+    mov     pc, lr                      ; return
+
+;copy 8 bytes each time
+copy_mem8x8_fast
+    ;sub        r1, r1, #8
+    ;sub        r3, r3, #8
+
+    mov     r12, #8                     ; row counter
+
+copy_mem8x8_fast_loop
+    ldmia   r0, {r4-r5}                 ; load one 8-byte row (no writeback)
+    ;ldm        r0, {r4-r5}
+    add     r0, r0, r1                  ; src += src_stride
+
+    subs    r12, r12, #1
+    stmia   r2, {r4-r5}                 ; store the row (no writeback)
+    ;stm        r2, {r4-r5}
+    add     r2, r2, r3                  ; dst += dst_stride
+
+    bne     copy_mem8x8_fast_loop
+
+    ldmia  sp!, {r4-r5}                 ; restore r4-r5
+    ;pop        {r4-r5}
+    mov     pc, lr                      ; return
+
+    ENDP  ; |vp8_copy_mem8x8_v6|
+
+    END
diff --git a/libs/libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm b/libs/libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
new file mode 100644
index 0000000000..9aa659fa70
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
@@ -0,0 +1,70 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+    EXPORT  |vp8_dc_only_idct_add_v6|
+
+    AREA    |.text|, CODE, READONLY
+
+;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
+;                             int pred_stride, unsigned char *dst_ptr,
+;                             int dst_stride)
+; r0  input_dc
+; r1  pred_ptr
+; r2  pred_stride
+; r3  dst_ptr
+; sp  dst_stride (first stack argument)
+
+|vp8_dc_only_idct_add_v6| PROC            ; dst[4x4] = clamp_0_255(pred[4x4] + ((input_dc + 4) >> 3))
+    stmdb       sp!, {r4 - r7}            ; save 4 registers (16 bytes)
+
+    add         r0, r0, #4                ; input_dc += 4
+    ldr         r12, c0x0000FFFF          ; mask for the low halfword
+    ldr         r4, [r1], r2              ; pred row 0 (4 pixels), then pred_ptr += pred_stride
+    and         r0, r12, r0, asr #3       ; a1 = ((input_dc + 4) >> 3) & 0xFFFF
+    ldr         r6, [r1], r2              ; pred row 1
+    orr         r0, r0, r0, lsl #16       ; a1 | a1
+
+    ldr         r12, [sp, #16]            ; dst stride (above the 16 bytes pushed on entry)
+
+    uxtab16     r5, r0, r4                ; a1+2 | a1+0
+    uxtab16     r4, r0, r4, ror #8        ; a1+3 | a1+1
+    uxtab16     r7, r0, r6                ; same for row 1: pixels 2 | 0
+    uxtab16     r6, r0, r6, ror #8        ; row 1: pixels 3 | 1
+    usat16      r5, #8, r5                ; clamp each halfword to 0..255
+    usat16      r4, #8, r4
+    usat16      r7, #8, r7
+    usat16      r6, #8, r6
+    orr         r5, r5, r4, lsl #8        ; re-interleave into 4 packed bytes (row 0)
+    orr         r7, r7, r6, lsl #8        ; row 1
+    ldr         r4, [r1], r2              ; pred row 2
+    str         r5, [r3], r12             ; store row 0, dst_ptr += dst_stride
+    ldr         r6, [r1]                  ; pred row 3
+    str         r7, [r3], r12             ; store row 1
+
+    uxtab16     r5, r0, r4                ; rows 2 and 3: same sequence as above
+    uxtab16     r4, r0, r4, ror #8
+    uxtab16     r7, r0, r6
+    uxtab16     r6, r0, r6, ror #8
+    usat16      r5, #8, r5
+    usat16      r4, #8, r4
+    usat16      r7, #8, r7
+    usat16      r6, #8, r6
+    orr         r5, r5, r4, lsl #8
+    orr         r7, r7, r6, lsl #8
+    str         r5, [r3], r12             ; store row 2
+    str         r7, [r3]                  ; store row 3
+
+    ldmia       sp!, {r4 - r7}            ; restore saved registers
+    bx          lr
+
+    ENDP  ; |vp8_dc_only_idct_add_v6|
+
+; Constant Pool
+c0x0000FFFF DCD 0x0000FFFF
+    END
diff --git a/libs/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm b/libs/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
new file mode 100644
index 0000000000..db48ded582
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
@@ -0,0 +1,190 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+    EXPORT |vp8_dequant_idct_add_v6|
+
+    AREA |.text|, CODE, READONLY
+;void vp8_dequant_idct_add_v6(short *input, short *dq,
+;                             unsigned char *dest, int stride)
+; r0 = q (input coefficients; dequantized in place, zeroed on exit)
+; r1 = dq
+; r2 = dst
+; r3 = stride
+
+|vp8_dequant_idct_add_v6| PROC      ; dequantize 16 coeffs, inverse-transform, add to dst with clamping
+    stmdb   sp!, {r4-r11, lr}
+
+    ldr     r4, [r0]                ;input
+    ldr     r5, [r1], #4            ;dq
+
+    sub     sp, sp, #4              ; one stack slot for stride
+    str     r3, [sp]                ; r3 is reused below for a constant
+
+    mov     r12, #4                 ; 4 iterations x 4 coefficients
+
+vp8_dequant_add_loop
+    smulbb  r6, r4, r5              ; low halfwords:  input * dq
+    smultt  r7, r4, r5              ; high halfwords: input * dq
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2            ; write products back over the input
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    subs    r12, r12, #1            ; flags consumed by ldrne/bne below
+
+    ldrne   r4, [r0, #4]            ; next pair; skipped on the last iteration
+    ldrne   r5, [r1], #4
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    bne     vp8_dequant_add_loop
+
+    sub     r0, r0, #32             ; rewind to the start of the 16 coefficients
+    mov     r1, r0                  ; r1 = write pointer for the first pass (in place)
+
+; short_idct4x4llm_v6_dual
+    ldr     r3, cospi8sqrt2minus1
+    ldr     r4, sinpi8sqrt2
+    ldr     r6, [r0, #8]            ; byte offset 8 (second row of shorts), two columns packed
+    mov     r5, #2                  ; two iterations, two packed 16-bit columns each
+vp8_dequant_idct_loop1_v6
+    ldr     r12, [r0, #24]          ; byte offset 24 (fourth row)
+    ldr     r14, [r0, #16]          ; byte offset 16 (third row)
+    smulwt  r9, r3, r6
+    smulwb  r7, r3, r6
+    smulwt  r10, r4, r6
+    smulwb  r8, r4, r6
+    pkhbt   r7, r7, r9, lsl #16     ; repack the two 32-bit products into halfwords
+    smulwt  r11, r3, r12
+    pkhbt   r8, r8, r10, lsl #16
+    uadd16  r6, r6, r7
+    smulwt  r7, r4, r12
+    smulwb  r9, r3, r12
+    smulwb  r10, r4, r12
+    subs    r5, r5, #1              ; flags consumed by ldrne/bne at the loop end
+    pkhbt   r9, r9, r11, lsl #16
+    ldr     r11, [r0], #4           ; byte offset 0 (first row); advance to the next column pair
+    pkhbt   r10, r10, r7, lsl #16
+    uadd16  r7, r12, r9
+    usub16  r7, r8, r7
+    uadd16  r6, r6, r10
+    uadd16  r10, r11, r14
+    usub16  r8, r11, r14
+    uadd16  r9, r10, r6
+    usub16  r10, r10, r6
+    uadd16  r6, r8, r7
+    usub16  r7, r8, r7
+    str     r6, [r1, #8]            ; results overwrite the coefficient buffer
+    ldrne   r6, [r0, #8]            ; prefetch for the second iteration only
+    str     r7, [r1, #16]
+    str     r10, [r1, #24]
+    str     r9, [r1], #4
+    bne     vp8_dequant_idct_loop1_v6
+
+    mov     r5, #2                  ; two iterations, each producing two dst rows
+    sub     r0, r1, #8              ; r1 advanced by 8 above; back to the buffer start
+vp8_dequant_idct_loop2_v6
+    ldr     r6, [r0], #4            ; four consecutive words = 8 shorts
+    ldr     r7, [r0], #4
+    ldr     r8, [r0], #4
+    ldr     r9, [r0], #4
+    smulwt  r1, r3, r6
+    smulwt  r12, r4, r6
+    smulwt  lr, r3, r8
+    smulwt  r10, r4, r8
+    pkhbt   r11, r8, r6, lsl #16
+    pkhbt   r1, lr, r1, lsl #16
+    pkhbt   r12, r10, r12, lsl #16
+    pkhtb   r6, r6, r8, asr #16
+    uadd16  r6, r1, r6
+    pkhbt   lr, r9, r7, lsl #16
+    uadd16  r10, r11, lr
+    usub16  lr, r11, lr
+    pkhtb   r8, r7, r9, asr #16
+    subs    r5, r5, #1              ; flags consumed by the bne at the loop end
+    smulwt  r1, r3, r8
+    smulwb  r7, r3, r8
+    smulwt  r11, r4, r8
+    smulwb  r9, r4, r8
+    pkhbt   r1, r7, r1, lsl #16
+    uadd16  r8, r1, r8
+    pkhbt   r11, r9, r11, lsl #16
+    usub16  r1, r12, r8
+    uadd16  r8, r11, r6
+    ldr     r9, c0x00040004         ; rounding term (+4 per halfword) for the >> 3 below
+    ldr     r12, [sp]               ; get stride from stack
+    uadd16  r6, r10, r8
+    usub16  r7, r10, r8
+    uadd16  r7, r7, r9
+    uadd16  r6, r6, r9
+    uadd16  r10, r14, r1            ; r14 is lr, written by the usub16 above
+    usub16  r1, r14, r1
+    uadd16  r10, r10, r9
+    uadd16  r1, r1, r9
+    ldr     r11, [r2]               ; load input from dst
+    mov     r8, r7, asr #3
+    pkhtb   r9, r8, r10, asr #19
+    mov     r8, r1, asr #3
+    pkhtb   r8, r8, r6, asr #19
+    uxtb16  lr, r11, ror #8         ; dst bytes 1 and 3 as halfwords
+    qadd16  r9, r9, lr
+    uxtb16  lr, r11                 ; dst bytes 0 and 2 as halfwords
+    qadd16  r8, r8, lr
+    usat16  r9, #8, r9              ; clamp each halfword to 0..255
+    usat16  r8, #8, r8
+    orr     r9, r8, r9, lsl #8      ; re-interleave into 4 packed bytes
+    ldr     r11, [r2, r12]          ; load input from dst
+    mov     r7, r7, lsl #16
+    mov     r1, r1, lsl #16
+    mov     r10, r10, lsl #16
+    mov     r6, r6, lsl #16
+    mov     r7, r7, asr #3
+    pkhtb   r7, r7, r10, asr #19
+    mov     r1, r1, asr #3
+    pkhtb   r1, r1, r6, asr #19
+    uxtb16  r8, r11, ror #8
+    qadd16  r7, r7, r8
+    uxtb16  r8, r11
+    qadd16  r1, r1, r8
+    usat16  r7, #8, r7
+    usat16  r1, #8, r1
+    orr     r1, r1, r7, lsl #8
+    str     r9, [r2], r12           ; store output to dst
+    str     r1, [r2], r12           ; store output to dst
+    bne     vp8_dequant_idct_loop2_v6
+
+; memset
+    sub     r0, r0, #32             ; back to the start of the coefficient buffer
+    add     sp, sp, #4              ; release the stride slot
+
+    mov     r12, #0
+    str     r12, [r0]               ; zero all 32 bytes of the input buffer
+    str     r12, [r0, #4]
+    str     r12, [r0, #8]
+    str     r12, [r0, #12]
+    str     r12, [r0, #16]
+    str     r12, [r0, #20]
+    str     r12, [r0, #24]
+    str     r12, [r0, #28]
+
+    ldmia   sp!, {r4 - r11, pc}     ; restore and return
+    ENDP    ; |vp8_dequant_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B   ; 20091
+sinpi8sqrt2       DCD 0x00008A8C   ; 35468
+c0x00040004       DCD 0x00040004
+
+    END
diff --git a/libs/libvpx/vp8/common/arm/armv6/dequantize_v6.asm b/libs/libvpx/vp8/common/arm/armv6/dequantize_v6.asm
new file mode 100644
index 0000000000..72f7e0ee57
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/dequantize_v6.asm
@@ -0,0 +1,69 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequantize_b_loop_v6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------
+;void   vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+; r0    short *Q,
+; r1    short *DQC
+; r2    short *DQ
+|vp8_dequantize_b_loop_v6| PROC     ; DQ[i] = Q[i] * DQC[i] for 16 shorts
+    stmdb   sp!, {r4-r9, lr}
+
+    ldr     r3, [r0]                ;load Q
+    ldr     r4, [r1]                ;load DQC
+    ldr     r5, [r0, #4]
+    ldr     r6, [r1, #4]
+
+    mov     r12, #2                 ;loop counter (8 coefficients per iteration)
+
+dequant_loop
+    smulbb  r7, r3, r4              ;multiply
+    smultt  r8, r3, r4
+    smulbb  r9, r5, r6
+    smultt  lr, r5, r6
+
+    ldr     r3, [r0, #8]            ; next two words of Q and DQC
+    ldr     r4, [r1, #8]
+    ldr     r5, [r0, #12]
+    ldr     r6, [r1, #12]
+
+    strh    r7, [r2], #2            ;store result
+    smulbb  r7, r3, r4              ;multiply
+    strh    r8, [r2], #2
+    smultt  r8, r3, r4
+    strh    r9, [r2], #2
+    smulbb  r9, r5, r6
+    strh    lr, [r2], #2
+    smultt  lr, r5, r6
+
+    subs    r12, r12, #1            ; flags consumed by ldrne/bne below
+
+    add     r0, r0, #16             ; advance Q and DQC by 8 shorts
+    add     r1, r1, #16
+
+    ldrne       r3, [r0]            ; loads for the next iteration; skipped on the last one
+    strh    r7, [r2], #2            ;store result
+    ldrne       r4, [r1]
+    strh    r8, [r2], #2
+    ldrne       r5, [r0, #4]
+    strh    r9, [r2], #2
+    ldrne       r6, [r1, #4]
+    strh    lr, [r2], #2
+
+    bne     dequant_loop
+
+    ldmia   sp!, {r4-r9, pc}        ; restore and return
+    ENDP    ;|vp8_dequantize_b_loop_v6|
+
+    END
diff --git a/libs/libvpx/vp8/common/arm/armv6/filter_v6.asm b/libs/libvpx/vp8/common/arm/armv6/filter_v6.asm
new file mode 100644
index 0000000000..eb4b75bd80
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/filter_v6.asm
@@ -0,0 +1,624 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_filter_block2d_first_pass_armv6|
+    EXPORT  |vp8_filter_block2d_first_pass_16x16_armv6|
+    EXPORT  |vp8_filter_block2d_first_pass_8x8_armv6|
+    EXPORT  |vp8_filter_block2d_second_pass_armv6|
+    EXPORT  |vp8_filter4_block2d_second_pass_armv6|
+    EXPORT  |vp8_filter_block2d_first_pass_only_armv6|
+    EXPORT  |vp8_filter_block2d_second_pass_only_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------------
+; r0    unsigned char *src_ptr
+; r1    short         *output_ptr
+; r2    unsigned int src_pixels_per_line
+; r3    unsigned int output_width
+; stack unsigned int output_height
+; stack const short *vp8_filter
+;-------------------------------------
+; vp8_filter the input and put in the output array.  Apply the 6 tap FIR filter with
+; the output being a 2 byte value and the input being a 1 byte value.
+|vp8_filter_block2d_first_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4                      ; one stack slot for the output pointer
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_6|                          ; two adjacent output pixels per iteration
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1                      ; decrement width count (low part of r7)
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000                ; decrement height (top part of counter)
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_6
+
+    add     sp, sp, #4                      ; release the output pointer slot
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+; --------------------------
+; 16x16 version: same code as vp8_filter_block2d_first_pass_armv6 plus pld preloads
+; -----------------------------
+|vp8_filter_block2d_first_pass_16x16_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    add     r4, r2, #18                     ; preload next row
+    pld     [r0, r4]
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4                      ; one stack slot for the output pointer
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_16_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_16_6|                       ; two adjacent output pixels per iteration
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1                      ; decrement width count (low part of r7)
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_16_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000                ; decrement height (top part of counter)
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r11, r2, #34                    ; adding back block width(=16)
+    pld     [r0, r11]                       ; preload next row
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_16_6
+
+    add     sp, sp, #4                      ; release the output pointer slot
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+; --------------------------
+; 8x8 version: same code as vp8_filter_block2d_first_pass_armv6 plus pld preloads
+; -----------------------------
+|vp8_filter_block2d_first_pass_8x8_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    add     r4, r2, #10                     ; preload next row
+    pld     [r0, r4]
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4                      ; one stack slot for the output pointer
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_8_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_8_6|                        ; two adjacent output pixels per iteration
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1                      ; decrement width count (low part of r7)
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_8_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000                ; decrement height (top part of counter)
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r11, r2, #18                    ; adding back block width(=8)
+    pld     [r0, r11]                       ; preload next row
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_8_6
+
+    add     sp, sp, #4                      ; release the output pointer slot
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+;---------------------------------
+; r0    short         *src_ptr,
+; r1    unsigned char *output_ptr,
+; r2    unsigned int output_pitch,
+; r3    unsigned int cnt,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter_block2d_second_pass_armv6| PROC ; filter 16-bit src, store clamped bytes to output
+    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed
+
+    ldr     r11, [sp, #36]                  ; vp8_filter address
+    sub     sp, sp, #4                      ; one stack slot for the output pointer
+    mov     r7, r3, lsl #16                 ; height is top part of counter
+    str     r1, [sp]                        ; push destination to stack
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    pkhbt   r12, r5, r4                     ; pack the filter differently
+    pkhbt   r11, r6, r5
+
+    sub     r0, r0, #4                      ; offset input buffer
+
+|height_loop_2nd|
+    ldr     r8, [r0]                        ; load the data
+    ldr     r9, [r0, #4]
+    orr     r7, r7, r3, lsr #1              ; loop counter
+
+|width_loop_2nd|                            ; two output bytes per iteration
+    smuad   lr, r4, r8                      ; apply filter
+    sub     r7, r7, #1                      ; decrement inner count (low part of r7)
+    smulbt  r8, r4, r8
+
+    ldr     r10, [r0, #8]
+
+    smlad   lr, r5, r9, lr
+    smladx  r8, r12, r9, r8
+
+    ldrh    r9, [r0, #12]
+
+    smlad   lr, r6, r10, lr
+    smladx  r8, r11, r10, r8
+
+    add     r0, r0, #4                      ; advance src by two shorts
+    smlatb  r10, r6, r9, r8
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ands    r8, r7, #0xff                   ; test inner loop counter
+    usat    lr, #8, lr, asr #7
+    add     r10, r10, #0x40
+    strb    lr, [r1], r2                    ; the result is transposed back and stored
+    usat    r10, #8, r10, asr #7
+
+    ldrne   r8, [r0]                        ; load data for next loop
+    ldrne   r9, [r0, #4]
+    strb    r10, [r1], r2
+
+    bne     width_loop_2nd
+
+    ldr     r1, [sp]                        ; update dst for next loop
+    subs    r7, r7, #0x10000                ; decrement outer count (top part of counter)
+    add     r0, r0, #16                     ; update src for next loop
+    add     r1, r1, #1                      ; next output column
+    str     r1, [sp]
+
+    bne     height_loop_2nd
+
+    add     sp, sp, #4                      ; release the output pointer slot
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+;---------------------------------
+; r0    short         *src_ptr,
+; r1    unsigned char *output_ptr,
+; r2    unsigned int output_pitch,
+; r3    unsigned int cnt,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter4_block2d_second_pass_armv6| PROC ; second pass with four products per output
+    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed
+
+    ldr     r11, [sp, #36]                  ; vp8_filter address
+    mov     r7, r3, lsl #16                 ; height is top part of counter
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    add     lr, r1, r3                      ; save final destination pointer
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    pkhbt   r12, r5, r4                     ; pack the filter differently
+    pkhbt   r11, r6, r5
+    mov     r4, #0x40                       ; rounding factor (for smlad{x})
+
+|height_loop_2nd_4|
+    ldrd    r8, r9, [r0, #-4]               ; load the data
+    orr     r7, r7, r3, lsr #1              ; loop counter
+
+|width_loop_2nd_4|                          ; two output bytes per iteration
+    ldr     r10, [r0, #4]!                  ; pre-increment src by two shorts
+    smladx  r6, r9, r12, r4                 ; apply filter
+    pkhbt   r8, r9, r8
+    smlad   r5, r8, r12, r4
+    pkhbt   r8, r10, r9
+    smladx  r6, r10, r11, r6
+    sub     r7, r7, #1                      ; decrement inner count (low part of r7)
+    smlad   r5, r8, r11, r5
+
+    mov     r8, r9                          ; shift the data for the next loop
+    mov     r9, r10
+
+    usat    r6, #8, r6, asr #7              ; shift and clamp
+    usat    r5, #8, r5, asr #7
+
+    strb    r5, [r1], r2                    ; the result is transposed back and stored
+    tst     r7, #0xff                       ; test inner loop counter
+    strb    r6, [r1], r2
+
+    bne     width_loop_2nd_4
+
+    subs    r7, r7, #0x10000                ; decrement outer count (top part of counter)
+    add     r0, r0, #16                     ; update src for next loop
+    sub     r1, lr, r7, lsr #16             ; update dst for next loop
+
+    bne     height_loop_2nd_4
+
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+;------------------------------------
+; r0    unsigned char *src_ptr
+; r1    unsigned char *output_ptr,
+; r2    unsigned int src_pixels_per_line
+; r3    unsigned int cnt, (used as both the row count and the row width)
+; stack unsigned int output_pitch,
+; stack const short *vp8_filter
+;------------------------------------
+|vp8_filter_block2d_first_pass_only_armv6| PROC ; six-tap filter along each row, bytes in and out
+    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed
+
+    add     r7, r2, r3                      ; preload next row
+    add     r7, r7, #2
+    pld     [r0, r7]
+
+    ldr     r4, [sp, #36]                   ; output pitch
+    ldr     r11, [sp, #40]                  ; HFilter address
+    sub     sp, sp, #8                      ; two stack slots, filled below
+
+    mov     r7, r3                          ; row counter
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    sub     r4, r4, r3                      ; the inner loop advances dst by cnt bytes
+    str     r4, [sp]                        ; save modified output pitch
+    str     r2, [sp, #4]                    ; save modified src stride
+
+    mov     r2, #0x40                       ; rounding term, used as the first accumulator
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+; six tap filter
+|height_loop_1st_only_6|
+    ldrb    r8, [r0, #-2]                   ; load data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+
+    mov     r12, r3, lsr #1                 ; loop counter
+
+|width_loop_1st_only_6|                     ; two adjacent output pixels per iteration
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+;;  smuad   lr, lr, r4
+    smlad   lr, lr, r4, r2                  ; first tap pair plus rounding
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+;;  smuad   r8, r8, r4
+    smlad   r8, r8, r4, r2                  ; first tap pair plus rounding
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    subs    r12, r12, #1                    ; flags consumed by ldrneb/bne below
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r10, r10, r6, r8
+
+;;  add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7              ; shift and clamp (rounding already added)
+;;  add     r10, r10, #0x40
+    strb    lr, [r1], #1                    ; store the result
+    usat    r10, #8, r10, asr #7
+
+    ldrneb  r9, [r0, #-1]
+    strb    r10, [r1], #1
+    ldrneb  r10, [r0], #2
+
+    bne     width_loop_1st_only_6
+
+    ldr     lr, [sp]                        ; load back output pitch
+    ldr     r12, [sp, #4]                   ; load back modified src stride
+    subs    r7, r7, #1                      ; decrement row counter
+    add     r0, r0, r12                     ; update src for next loop
+
+    add     r11, r12, r3                    ; preload next row
+    add     r11, r11, #2
+    pld     [r0, r11]
+
+    add     r1, r1, lr                      ; update dst for next loop
+
+    bne     height_loop_1st_only_6
+
+    add     sp, sp, #8                      ; release the two stack slots
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vp8_filter_block2d_first_pass_only_armv6|
+
+
+; Vertical-only 6-tap filter, one column at a time: each output byte = (sum + 0x40) >> 7, saturated to 0..255
+; r0    unsigned char *src_ptr,           (rows from two above src_ptr onwards are read)
+; r1    unsigned char *output_ptr,
+; r2    unsigned int src_pixels_per_line
+; r3    unsigned int cnt,                 (column count and rows per column; two rows per inner pass)
+; stack unsigned int output_pitch,
+; stack const short *vp8_filter           (read as three words of packed 16-bit tap pairs)
+;------------------------------------
+|vp8_filter_block2d_second_pass_only_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; VFilter address
+    ldr     r12, [sp, #36]                  ; output pitch
+
+    mov     r7, r3, lsl #16                 ; columns left, kept in the top halfword of the counter
+    sub     r0, r0, r2, lsl #1              ; need 6 elements for filtering, 2 before, 3 after
+
+    sub     sp, sp, #8                      ; two scratch words: column base pointers for src and dst
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r0, [sp]                        ; save r0 to stack
+    str     r1, [sp, #4]                    ; save dst to stack
+
+; six tap filter
+|width_loop_2nd_only_6|
+    ldrb    r8, [r0], r2                    ; load data
+    orr     r7, r7, r3                      ; loop counter: rows left go in the low bits
+    ldrb    r9, [r0], r2
+    ldrb    r10, [r0], r2
+
+|height_loop_2nd_only_6|
+    ; filter first column in this inner loop, then move to the next column.
+    ldrb    r11, [r0], r2
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0], r2
+
+    smuad   lr, lr, r4                      ; first tap pair, upper output row
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4                      ; first tap pair, lower output row
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0], r2
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0]
+
+    sub     r7, r7, #2                      ; two rows produced per pass
+    sub     r0, r0, r2, lsl #2              ; step back four rows: net advance is two rows per pass
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r10, r10, r6, r8
+
+    ands    r9, r7, #0xff                   ; rows left in this column; flags drive ldrneb and bne below
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0], r2                    ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r10, r10, #0x40
+    strb    lr, [r1], r12                   ; store the result for the column
+    usat    r10, #8, r10, asr #7
+
+    ldrneb  r9, [r0], r2
+    strb    r10, [r1], r12
+    ldrneb  r10, [r0], r2
+
+    bne     height_loop_2nd_only_6
+
+    ldr     r0, [sp]                        ; reload the column base pointers
+    ldr     r1, [sp, #4]
+    subs    r7, r7, #0x10000                ; one column done (top halfword of the counter)
+    add     r0, r0, #1                      ; move to filter next column
+    str     r0, [sp]
+    add     r1, r1, #1
+    str     r1, [sp, #4]
+
+    bne     width_loop_2nd_only_6
+
+    add     sp, sp, #8
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vp8_filter_block2d_second_pass_only_armv6|
+
+    END
diff --git a/libs/libvpx/vp8/common/arm/armv6/idct_blk_v6.c b/libs/libvpx/vp8/common/arm/armv6/idct_blk_v6.c
new file mode 100644
index 0000000000..c94f84a62b
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/idct_blk_v6.c
@@ -0,0 +1,115 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+/* Process 16 4x4 blocks laid out as 4 rows of 4: coefficients are 16 shorts apart in q, pixels 4 bytes apart in dst.
+ * Per block: eobs > 1 -> full dequant+IDCT add; eobs == 1 -> DC-only add, then q[0..1] of that block are zeroed. */
+void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
+                                     unsigned char *dst,
+                                     int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)   /* one row of four blocks per iteration */
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_v6 (q, dq, dst, stride);
+        else if (eobs[0] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[0]*dq[0], dst, stride, dst, stride);
+            q[0] = q[1] = 0;   /* element stores: no int-typed access to a short array */
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_v6 (q+16, dq, dst+4, stride);
+        else if (eobs[1] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[16]*dq[0], dst+4, stride, dst+4, stride);
+            q[16] = q[17] = 0;
+        }
+
+        if (eobs[2] > 1)
+            vp8_dequant_idct_add_v6 (q+32, dq, dst+8, stride);
+        else if (eobs[2] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[32]*dq[0], dst+8, stride, dst+8, stride);
+            q[32] = q[33] = 0;
+        }
+
+        if (eobs[3] > 1)
+            vp8_dequant_idct_add_v6 (q+48, dq, dst+12, stride);
+        else if (eobs[3] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[48]*dq[0], dst+12, stride,dst+12,stride);
+            q[48] = q[49] = 0;
+        }
+
+        q    += 64;          /* four blocks of 16 coefficients */
+        dst  += 4*stride;    /* next row of blocks */
+        eobs += 4;
+    }
+}
+/* Same per-block rule as the y variant, for 2x2 blocks into dstu then 2x2 into dstv; q and eobs run on from U into V. */
+void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq,
+                                      unsigned char *dstu,
+                                      unsigned char *dstv,
+                                      int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 2; i++)   /* two rows of two blocks into dstu */
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_v6 (q, dq, dstu, stride);
+        else if (eobs[0] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstu, stride, dstu, stride);
+            q[0] = q[1] = 0;   /* element stores: no int-typed access to a short array */
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_v6 (q+16, dq, dstu+4, stride);
+        else if (eobs[1] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstu+4, stride,
+                                                  dstu+4, stride);
+            q[16] = q[17] = 0;
+        }
+
+        q    += 32;          /* two blocks of 16 coefficients */
+        dstu += 4*stride;
+        eobs += 2;
+    }
+
+    for (i = 0; i < 2; i++)   /* two rows of two blocks into dstv */
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_v6 (q, dq, dstv, stride);
+        else if (eobs[0] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstv, stride, dstv, stride);
+            q[0] = q[1] = 0;
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_v6 (q+16, dq, dstv+4, stride);
+        else if (eobs[1] == 1)
+        {
+            vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstv+4, stride,
+                                                  dstv+4, stride);
+            q[16] = q[17] = 0;
+        }
+
+        q    += 32;
+        dstv += 4*stride;
+        eobs += 2;
+    }
+}
diff --git a/libs/libvpx/vp8/common/arm/armv6/idct_v6.asm b/libs/libvpx/vp8/common/arm/armv6/idct_v6.asm
new file mode 100644
index 0000000000..b4d44cbeba
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/idct_v6.asm
@@ -0,0 +1,202 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_idct4x4llm_v6_dual|
+
+    AREA    |.text|, CODE, READONLY
+
+
+; void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
+;                             unsigned char *dst, int stride)
+; r0    short* input                 (also used as scratch: pass 1 results are written back into it)
+; r1    unsigned char* pred
+; r2    int pitch                    (step between pred rows)
+; r3    unsigned char* dst
+; sp    int stride                   (step between dst rows; read from [sp, #40] after the prologue)
+; Two passes of two iterations each, two halfword lanes at a time; the loop counter is bit 31 of r5.
+|vp8_short_idct4x4llm_v6_dual| PROC
+    stmdb   sp!, {r4-r11, lr}
+
+    sub     sp, sp, #4              ; one scratch word, used to save the input pointer for pass 2
+
+    mov     r4, #0x00008A00         ; sin
+    orr     r4, r4, #0x0000008C     ; sinpi8sqrt2
+
+    mov     r5, #0x00004E00         ; cos
+    orr     r5, r5, #0x0000007B     ; cospi8sqrt2minus1
+    orr     r5, r5, #1<<31          ; loop counter on top bit
+
+loop1_dual
+    ldr     r6, [r0, #(4*2)]        ; i5 | i4
+    ldr     r12, [r0, #(12*2)]      ; i13|i12
+    ldr     r14, [r0, #(8*2)]       ; i9 | i8
+
+    smulbt  r9, r5, r6              ; (ip[5] * cospi8sqrt2minus1) >> 16
+    smulbb  r7, r5, r6              ; (ip[4] * cospi8sqrt2minus1) >> 16
+    smulwt  r10, r4, r6             ; (ip[5] * sinpi8sqrt2) >> 16
+    smulwb  r8, r4, r6              ; (ip[4] * sinpi8sqrt2) >> 16
+
+    smulbt  r11, r5, r12            ; (ip[13] * cospi8sqrt2minus1) >> 16
+    pkhtb   r7, r9, r7, asr #16     ; 5c | 4c
+    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
+    uadd16  r6, r6, r7              ; 5c+5 | 4c+4
+
+    smulwt  r7, r4, r12             ; (ip[13] * sinpi8sqrt2) >> 16
+    smulbb  r9, r5, r12             ; (ip[12] * cospi8sqrt2minus1) >> 16
+    smulwb  r10, r4, r12            ; (ip[12] * sinpi8sqrt2) >> 16
+
+    subs    r5, r5, #1<<31          ; i--  (carry set on the first pass, clear on the second)
+
+    pkhtb   r9, r11, r9, asr #16    ; 13c | 12c
+    ldr     r11, [r0]               ; i1 | i0
+    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
+    uadd16  r7, r12, r9             ; 13c+13 | 12c+12
+
+    usub16  r7, r8, r7              ; c
+    uadd16  r6, r6, r10             ; d
+    uadd16  r10, r11, r14           ; a
+    usub16  r8, r11, r14            ; b
+
+    uadd16  r9, r10, r6             ; a+d
+    usub16  r10, r10, r6            ; a-d
+    uadd16  r6, r8, r7              ; b+c
+    usub16  r7, r8, r7              ; b-c
+
+    ; use input buffer to store intermediate results
+    str      r6, [r0, #(4*2)]       ; o5 | o4
+    str      r7, [r0, #(8*2)]       ; o9 | o8
+    str      r10,[r0, #(12*2)]      ; o13|o12
+    str      r9, [r0], #4           ; o1 | o0
+
+    bcs loop1_dual
+
+    sub     r0, r0, #8              ; reset input/output
+    str     r0, [sp]                ; saved because r0 is overwritten with data inside loop2
+
+loop2_dual
+
+    ldr     r6, [r0, #(4*2)]        ; i5 | i4
+    ldr     r12,[r0, #(2*2)]        ; i3 | i2
+    ldr     r14,[r0, #(6*2)]        ; i7 | i6
+    ldr     r0, [r0, #(0*2)]        ; i1 | i0
+
+    smulbt  r9, r5, r6              ; (ip[5] * cospi8sqrt2minus1) >> 16
+    smulbt  r7, r5, r0              ; (ip[1] * cospi8sqrt2minus1) >> 16
+    smulwt  r10, r4, r6             ; (ip[5] * sinpi8sqrt2) >> 16
+    smulwt  r8, r4, r0              ; (ip[1] * sinpi8sqrt2) >> 16
+
+    pkhbt   r11, r6, r0, lsl #16    ; i0 | i4
+    pkhtb   r7, r7, r9, asr #16     ; 1c | 5c
+    pkhtb   r0, r0, r6, asr #16     ; i1 | i5
+    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1
+
+    uadd16  r0, r7, r0              ; 1c+1 | 5c+5 = temp2
+    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6
+    uadd16  r10, r11, r9            ; a
+    usub16  r9, r11, r9             ; b
+    pkhtb   r6, r12, r14, asr #16   ; i3 | i7
+
+    subs    r5, r5, #1<<31          ; i--  (carry is consumed by the bcs at the bottom of the loop)
+
+    smulbt  r7, r5, r6              ; (ip[3] * cospi8sqrt2minus1) >> 16
+    smulwt  r11, r4, r6             ; (ip[3] * sinpi8sqrt2) >> 16
+    smulbb  r12, r5, r6             ; (ip[7] * cospi8sqrt2minus1) >> 16
+    smulwb  r14, r4, r6             ; (ip[7] * sinpi8sqrt2) >> 16
+
+    pkhtb   r7, r7, r12, asr #16    ; 3c | 7c
+    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1
+
+    uadd16  r6, r7, r6              ; 3c+3 | 7c+7 = temp2
+    usub16  r12, r8, r6             ; c (o1 | o5)
+    uadd16  r6, r11, r0             ; d (o3 | o7)
+    uadd16  r7, r10, r6             ; a+d
+
+    mov     r8, #4                  ; set up 4's
+    orr     r8, r8, #0x40000        ; 4|4
+
+    usub16  r6, r10, r6             ; a-d
+    uadd16  r6, r6, r8              ; a-d+4, 3|7
+    uadd16  r7, r7, r8              ; a+d+4, 0|4
+    uadd16  r10, r9, r12            ; b+c
+    usub16  r0, r9, r12             ; b-c
+    uadd16  r10, r10, r8            ; b+c+4, 1|5
+    uadd16  r8, r0, r8              ; b-c+4, 2|6
+
+    ldr     lr, [sp, #40]           ; dst stride
+
+    ldrb    r0, [r1]                ; pred p0
+    ldrb    r11, [r1, #1]           ; pred p1
+    ldrb    r12, [r1, #2]           ; pred p2
+
+    add     r0, r0, r7, asr #19     ; p0 + o0  (asr 19 = top halfword >> 3)
+    add     r11, r11, r10, asr #19  ; p1 + o1
+    add     r12, r12, r8, asr #19   ; p2 + o2
+
+    usat    r0, #8, r0              ; d0 = clip8(p0 + o0)
+    usat    r11, #8, r11            ; d1 = clip8(p1 + o1)
+    usat    r12, #8, r12            ; d2 = clip8(p2 + o2)
+
+    add     r0, r0, r11, lsl #8     ; |--|--|d1|d0|
+
+    ldrb    r11, [r1, #3]           ; pred p3
+
+    add     r0, r0, r12, lsl #16    ; |--|d2|d1|d0|
+
+    add     r11, r11, r6, asr #19   ; p3 + o3
+
+    sxth    r7, r7                  ; keep the low halfword (second row of the pair), sign-extended
+    sxth    r10, r10                ; same for the o5 lane
+
+    usat    r11, #8, r11            ; d3 = clip8(p3 + o3)
+
+    sxth    r8, r8                  ; same for the o6 lane
+    sxth    r6, r6                  ; same for the o7 lane
+
+    add     r0, r0, r11, lsl #24    ; |d3|d2|d1|d0|
+
+    ldrb    r12, [r1, r2]!          ; pred p4  (writeback moves r1 to the next pred row)
+    str     r0, [r3], lr            ; store four output pixels, advance dst by stride
+    ldrb    r11, [r1, #1]           ; pred p5
+
+    add     r12, r12, r7, asr #3    ; p4 + o4
+    add     r11, r11, r10, asr #3   ; p5 + o5
+
+    usat    r12, #8, r12            ; d4 = clip8(p4 + o4)
+    usat    r11, #8, r11            ; d5 = clip8(p5 + o5)
+
+    ldrb    r7, [r1, #2]            ; pred p6
+    ldrb    r10, [r1, #3]           ; pred p7
+
+    add     r12, r12, r11, lsl #8   ; |--|--|d5|d4|
+
+    add     r7, r7, r8, asr #3      ; p6 + o6
+    add     r10, r10, r6, asr #3    ; p7 + o7
+
+    ldr     r0, [sp]                ; load input pointer
+
+    usat    r7, #8, r7              ; d6 = clip8(p6 + o6)
+    usat    r10, #8, r10            ; d7 = clip8(p7 + o7)
+
+    add     r12, r12, r7, lsl #16   ; |--|d6|d5|d4|
+    add     r12, r12, r10, lsl #24  ; |d7|d6|d5|d4|
+
+    str     r12, [r3], lr
+    add     r0, r0, #16             ; input for the second iteration (the saved pointer is not updated)
+    add     r1, r1, r2              ; pred + pitch
+
+    bcs loop2_dual
+
+    add     sp, sp, #4              ; release the scratch word that held the input pointer
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+    END
diff --git a/libs/libvpx/vp8/common/arm/armv6/iwalsh_v6.asm b/libs/libvpx/vp8/common/arm/armv6/iwalsh_v6.asm
new file mode 100644
index 0000000000..31ef09cada
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/iwalsh_v6.asm
@@ -0,0 +1,136 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_inv_walsh4x4_v6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)   NOTE(review): r0 is never written below, so the short return type looks stale - confirm
+|vp8_short_inv_walsh4x4_v6| PROC
+    ; Reads 16 shorts from input (r0); writes 16 results through r1, one every 32 bytes (every 16th short).
+    stmdb       sp!, {r4 - r12, lr}
+
+    ldr         r2, [r0, #0]         ; [1  |  0]
+    ldr         r3, [r0, #4]         ; [3  |  2]
+    ldr         r4, [r0, #8]         ; [5  |  4]
+    ldr         r5, [r0, #12]        ; [7  |  6]
+    ldr         r6, [r0, #16]        ; [9  |  8]
+    ldr         r7, [r0, #20]        ; [11 | 10]
+    ldr         r8, [r0, #24]        ; [13 | 12]
+    ldr         r9, [r0, #28]        ; [15 | 14]
+
+    qadd16      r10, r2, r8          ; a1 [1+13  |  0+12]
+    qadd16      r11, r4, r6          ; b1 [5+9   |  4+8]
+    qsub16      r12, r4, r6          ; c1 [5-9   |  4-8]
+    qsub16      lr, r2, r8           ; d1 [1-13  |  0-12]
+
+    qadd16      r2, r10, r11         ; a1 + b1 [1  |  0]
+    qadd16      r4, r12, lr          ; c1 + d1 [5  |  4]
+    qsub16      r6, r10, r11         ; a1 - b1 [9  |  8]
+    qsub16      r8, lr, r12          ; d1 - c1 [13 | 12]
+
+    qadd16      r10, r3, r9          ; a1 [3+15  |  2+14]
+    qadd16      r11, r5, r7          ; b1 [7+11  |  6+10]
+    qsub16      r12, r5, r7          ; c1 [7-11  |  6-10]
+    qsub16      lr, r3, r9           ; d1 [3-15  |  2-14]
+
+    qadd16      r3, r10, r11         ; a1 + b1 [3  |  2]
+    qadd16      r5, r12, lr          ; c1 + d1 [7  |  6]
+    qsub16      r7, r10, r11         ; a1 - b1 [11 | 10]
+    qsub16      r9, lr, r12          ; d1 - c1 [15 | 14]
+
+    ; first transform complete
+
+    qsubaddx    r10, r2, r3          ; [c1|a1] [1-2   |   0+3]
+    qaddsubx    r11, r2, r3          ; [b1|d1] [1+2   |   0-3]
+    qsubaddx    r12, r4, r5          ; [c1|a1] [5-6   |   4+7]
+    qaddsubx    lr, r4, r5           ; [b1|d1] [5+6   |   4-7]
+
+    qaddsubx    r2, r10, r11         ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r3, r11, r10         ; [a2|d2] [b1+a1 | d1-c1]
+    ldr         r10, c0x00030003     ; rounding bias of 3 in both halfwords
+    qaddsubx    r4, r12, lr          ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r5, lr, r12          ; [a2|d2] [b1+a1 | d1-c1]
+
+    qadd16      r2, r2, r10          ; [b2+3|c2+3]
+    qadd16      r3, r3, r10          ; [a2+3|d2+3]
+    qadd16      r4, r4, r10          ; [b2+3|c2+3]
+    qadd16      r5, r5, r10          ; [a2+3|d2+3]
+
+    asr         r12, r3, #19         ; [0]  (asr 19 = top halfword >> 3)
+    strh        r12, [r1], #32       ; post-increment by 32 bytes after every store
+    asr         lr, r2, #19          ; [1]
+    strh        lr, [r1], #32
+    sxth        r2, r2               ; low halfwords: sign-extend first, then >> 3
+    sxth        r3, r3
+    asr         r2, r2, #3           ; [2]
+    strh        r2, [r1], #32
+    asr         r3, r3, #3           ; [3]
+    strh        r3, [r1], #32
+
+    asr         r12, r5, #19         ; [4]
+    strh        r12, [r1], #32
+    asr         lr, r4, #19          ; [5]
+    strh        lr, [r1], #32
+    sxth        r4, r4
+    sxth        r5, r5
+    asr         r4, r4, #3           ; [6]
+    strh        r4, [r1], #32
+    asr         r5, r5, #3           ; [7]
+    strh        r5, [r1], #32
+
+    qsubaddx    r2, r6, r7           ; [c1|a1] [9-10  |  8+11]
+    qaddsubx    r3, r6, r7           ; [b1|d1] [9+10  |  8-11]
+    qsubaddx    r4, r8, r9           ; [c1|a1] [13-14 | 12+15]
+    qaddsubx    r5, r8, r9           ; [b1|d1] [13+14 | 12-15]
+
+    qaddsubx    r6, r2, r3           ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r7, r3, r2           ; [a2|d2] [b1+a1 | d1-c1]
+    qaddsubx    r8, r4, r5           ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r9, r5, r4           ; [a2|d2] [b1+a1 | d1-c1]
+
+    qadd16      r6, r6, r10          ; [b2+3|c2+3]
+    qadd16      r7, r7, r10          ; [a2+3|d2+3]
+    qadd16      r8, r8, r10          ; [b2+3|c2+3]
+    qadd16      r9, r9, r10          ; [a2+3|d2+3]
+
+    asr         r12, r7, #19         ; [8]
+    strh        r12, [r1], #32
+    asr         lr, r6, #19          ; [9]
+    strh        lr, [r1], #32
+    sxth        r6, r6
+    sxth        r7, r7
+    asr         r6, r6, #3           ; [10]
+    strh        r6, [r1], #32
+    asr         r7, r7, #3           ; [11]
+    strh        r7, [r1], #32
+
+    asr         r12, r9, #19         ; [12]
+    strh        r12, [r1], #32
+    asr         lr, r8, #19          ; [13]
+    strh        lr, [r1], #32
+    sxth        r8, r8
+    sxth        r9, r9
+    asr         r8, r8, #3           ; [14]
+    strh        r8, [r1], #32
+    asr         r9, r9, #3           ; [15]
+    strh        r9, [r1], #32
+
+    ldmia       sp!, {r4 - r12, pc}
+    ENDP        ; |vp8_short_inv_walsh4x4_v6|
+
+
+; Constant Pool: loaded PC-relative by the routine above
+c0x00030003 DCD 0x00030003           ; 3 in each halfword: rounding bias added before the >> 3
+    END
diff --git a/libs/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm b/libs/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm
new file mode 100644
index 0000000000..1cbbbcdef5
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm
@@ -0,0 +1,1282 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_loop_filter_horizontal_edge_armv6|
+    EXPORT |vp8_mbloop_filter_horizontal_edge_armv6|
+    EXPORT |vp8_loop_filter_vertical_edge_armv6|
+    EXPORT |vp8_mbloop_filter_vertical_edge_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+    MACRO
+    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3  (4x4 byte transpose; the input registers are overwritten)
+    ; a0: 03 02 01 00
+    ; a1: 13 12 11 10
+    ; a2: 23 22 21 20
+    ; a3: 33 32 31 30
+    ;     b3 b2 b1 b0    (each output register receives one column of the layout above)
+
+    uxtb16      $b1, $a1                    ; xx 12 xx 10
+    uxtb16      $b0, $a0                    ; xx 02 xx 00
+    uxtb16      $b3, $a3                    ; xx 32 xx 30
+    uxtb16      $b2, $a2                    ; xx 22 xx 20
+    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
+    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
+
+    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
+    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
+    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
+    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
+    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
+    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
+
+    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
+    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
+
+    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
+    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
+    MEND
+
+
+src         RN  r0                          ; pixel pointer (first argument of the filters below)
+pstep       RN  r1                          ; byte step between rows (second argument)
+count       RN  r5                          ; loop counter, loaded from the stack by each routine
+
+;r0     unsigned char *src_ptr,
+;r1     int src_pixel_step,
+;r2     const char *blimit,                 (only the first byte is read; it is replicated to all four byte lanes)
+;r3     const char *limit,                  (same: first byte only, replicated)
+;stack  const char *thresh,                 (same: first byte only, replicated)
+;stack  int  count                          (doubled below: every loop pass handles four pixels)
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    sub         src, src, pstep, lsl #2     ; back up src by 4 rows, to p3
+    ldr         count, [sp, #40]            ; count for 8-in-parallel
+    ldr         r6, [sp, #36]               ; load thresh address
+    sub         sp, sp, #16                 ; create temp buffer
+
+    ldr         r9, [src], pstep            ; p3
+    ldrb        r4, [r2]                    ; blimit
+    ldr         r10, [src], pstep           ; p2
+    ldrb        r2, [r3]                    ; limit
+    ldr         r11, [src], pstep           ; p1
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8
+    mov         count, count, lsl #1        ; 4-in-parallel
+    orr         r4, r4, r4, lsl #16         ; r4 = blimit in all four bytes
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16         ; r2 = limit in all four bytes
+    orr         r3, r3, r3, lsl #16         ; r3 = thresh in all four bytes
+
+|Hnext8|
+    ; vp8_filter_mask() function
+    ; calculate breakout conditions
+    ldr         r12, [src], pstep           ; p0
+
+    uqsub8      r6, r9, r10                 ; p3 - p2
+    uqsub8      r7, r10, r9                 ; p2 - p3
+    uqsub8      r8, r10, r11                ; p2 - p1
+    uqsub8      r10, r11, r10               ; p1 - p2
+
+    orr         r6, r6, r7                  ; abs (p3-p2)
+    orr         r8, r8, r10                 ; abs (p2-p1)
+    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
+    uqsub8      r8, r8, r2                  ; compare to limit
+    uqsub8      r6, r11, r12                ; p1 - p0
+    orr         lr, lr, r8
+    uqsub8      r7, r12, r11                ; p0 - p1
+    ldr         r9, [src], pstep            ; q0
+    ldr         r10, [src], pstep           ; q1
+    orr         r6, r6, r7                  ; abs (p1-p0)
+    uqsub8      r7, r6, r2                  ; compare to limit
+    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
+    orr         lr, lr, r7
+
+    uqsub8      r6, r11, r10                ; p1 - q1
+    uqsub8      r7, r10, r11                ; q1 - p1
+    uqsub8      r11, r12, r9                ; p0 - q0
+    uqsub8      r12, r9, r12                ; q0 - p0
+    orr         r6, r6, r7                  ; abs (p1-q1)
+    ldr         r7, c0x7F7F7F7F
+    orr         r12, r11, r12               ; abs (p0-q0)
+    ldr         r11, [src], pstep           ; q2
+    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
+    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
+    uqsub8      r7, r9, r10                 ; q0 - q1
+    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
+    uqsub8      r6, r10, r9                 ; q1 - q0
+    uqsub8      r12, r12, r4                ; compare to blimit (r4)
+    uqsub8      r9, r11, r10                ; q2 - q1
+
+    orr         lr, lr, r12
+
+    ldr         r12, [src], pstep           ; q3
+    uqsub8      r10, r10, r11               ; q1 - q2
+    orr         r6, r7, r6                  ; abs (q1-q0)
+    orr         r10, r9, r10                ; abs (q2-q1)
+    uqsub8      r7, r6, r2                  ; compare to limit
+    uqsub8      r10, r10, r2                ; compare to limit
+    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
+    orr         lr, lr, r7
+    orr         lr, lr, r10
+
+    uqsub8      r10, r12, r11               ; q3 - q2
+    uqsub8      r9, r11, r12                ; q2 - q3
+
+    mvn         r11, #0                     ; r11 == -1
+
+    orr         r10, r10, r9                ; abs (q3-q2)
+    uqsub8      r10, r10, r2                ; compare to limit
+
+    mov         r12, #0
+    orr         lr, lr, r10                 ; lr byte is nonzero where any limit was exceeded
+    sub         src, src, pstep, lsl #2     ; src was 8 rows past p3; now at the q0 row
+
+    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
+    sel         lr, r11, r12                ; filter mask: lr
+
+    cmp         lr, #0
+    beq         hskip_filter                 ; skip filtering
+
+    sub         src, src, pstep, lsl #1     ; back up 2 more rows (6 in total) so src is at the p1 row
+
+    ;vp8_hevmask() function
+    ;calculate high edge variance
+    orr         r10, r6, r8                 ; calculate vp8_hevmask
+
+    ldr         r7, [src], pstep            ; p1
+
+    usub8       r10, r12, r10               ; use usub8 instead of ssub8
+    sel         r6, r12, r11                ; obtain vp8_hevmask: r6
+
+    ;vp8_filter() function
+    ldr         r8, [src], pstep            ; p0
+    ldr         r12, c0x80808080            ; r12 keeps this sign-flip constant until the stores below
+    ldr         r9, [src], pstep            ; q0
+    ldr         r10, [src], pstep           ; q1
+
+    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
+    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
+    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
+    eor         r10, r10, r12               ; q1 offset to convert to a signed value
+
+    str         r9, [sp]                    ; store qs0 temporarily
+    str         r8, [sp, #4]                ; store ps0 temporarily
+    str         r10, [sp, #8]               ; store qs1 temporarily
+    str         r7, [sp, #12]               ; store ps1 temporarily
+
+    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
+    qsub8       r8, r9, r8                  ; qs0 - ps0 (saturating); added to vp8_filter three times below
+
+    and         r7, r7, r6                  ; vp8_filter (r7) &= hev
+
+    qadd8       r7, r7, r8
+    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8
+
+    qadd8       r7, r7, r8
+    ldr         r10, c0x04040404
+
+    qadd8       r7, r7, r8
+    and         r7, r7, lr                  ; vp8_filter &= mask;
+
+    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
+    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+    qadd8       r7 , r7 , r10               ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+
+    mov         r9, #0                      ; zero operand: each shadd8 with 0 is a per-byte signed >> 1
+    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
+    shadd8      r7 , r7 , r9                ; vp8_filter >>= 3
+    shadd8      r8 , r8 , r9
+    shadd8      r7 , r7 , r9
+    shadd8      lr , r8 , r9                ; lr: Filter2
+    shadd8      r7 , r7 , r9                ; r7: filter
+
+    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
+    ;sel        lr, r11, r9
+    ;usub8      r8, r10, r8
+    ;sel        r8, r11, r9
+    ;and        r8, r8, lr                  ; -1 for each element that equals 4
+
+    ;calculate output
+    ;qadd8      lr, r8, r7                  ; u = vp8_signed_char_clamp(s + vp8_filter)
+
+    ldr         r8, [sp]                    ; load qs0
+    ldr         r9, [sp, #4]                ; load ps0
+
+    ldr         r10, c0x01010101
+
+    qsub8       r8 ,r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)
+
+    ;end of modification for vp8
+
+    mov         lr, #0
+    sadd8       r7, r7 , r10                ; vp8_filter += 1
+    shadd8      r7, r7, lr                  ; vp8_filter >>= 1
+
+    ldr         r11, [sp, #12]              ; load ps1
+    ldr         r10, [sp, #8]               ; load qs1
+
+    bic         r7, r7, r6                  ; vp8_filter &= ~hev
+    sub         src, src, pstep, lsl #2     ; back to the p1 row for the stores
+
+    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+    qsub8       r10, r10,r7                 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+
+    eor         r11, r11, r12               ; *op1 = u^0x80
+    str         r11, [src], pstep           ; store op1
+    eor         r9, r9, r12                 ; *op0 = u^0x80
+    str         r9, [src], pstep            ; store op0 result
+    eor         r8, r8, r12                 ; *oq0 = u^0x80
+    str         r8, [src], pstep            ; store oq0 result
+    eor         r10, r10, r12               ; *oq1 = u^0x80
+    str         r10, [src], pstep           ; store oq1
+
+    sub         src, src, pstep, lsl #1     ; to the q0 row, where the skip path also arrives
+
+|hskip_filter|
+    add         src, src, #4                ; next group of four pixels
+    sub         src, src, pstep, lsl #2     ; and back up to its p3 row
+
+    subs        count, count, #1
+
+    ldrne       r9, [src], pstep            ; p3
+    ldrne       r10, [src], pstep           ; p2
+    ldrne       r11, [src], pstep           ; p1
+
+    bne         Hnext8
+
+    add         sp, sp, #16
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_loop_filter_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_mbloop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}         ; push 9 registers (36 bytes)
+    ; Each pass loads 8 rows (p3..q3) as 32-bit words, 4 pixels wide, and may rewrite rows p2..q2.
+    sub         src, src, pstep, lsl #2     ; step src back 4 rows (4 * pstep) to the p3 row
+    ldr         count, [sp, #40]            ; count for 8-in-parallel (second word above the saved registers)
+    ldr         r6, [sp, #36]               ; load thresh address (first word above the saved registers)
+    sub         sp, sp, #16                 ; create temp buffer: [sp]=qs0, [sp,#4]=ps0, [sp,#8]=qs1, [sp,#12]=ps1
+
+    ldr         r9, [src], pstep            ; p3
+    ldrb        r4, [r2]                    ; blimit
+    ldr         r10, [src], pstep           ; p2
+    ldrb        r2, [r3]                    ; limit
+    ldr         r11, [src], pstep           ; p1
+    orr         r4, r4, r4, lsl #8          ; replicate blimit byte (with the lsl #16 below) into all 4 lanes
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8          ; replicate limit byte the same way
+    mov         count, count, lsl #1        ; double count: each pass is 4-in-parallel
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8          ; replicate thresh byte the same way
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
+
+|MBHnext8|
+
+    ; vp8_filter_mask() function
+    ; calculate breakout conditions
+    ldr         r12, [src], pstep           ; p0
+
+    uqsub8      r6, r9, r10                 ; p3 - p2
+    uqsub8      r7, r10, r9                 ; p2 - p3
+    uqsub8      r8, r10, r11                ; p2 - p1
+    uqsub8      r10, r11, r10               ; p1 - p2
+
+    orr         r6, r6, r7                  ; abs (p3-p2)
+    orr         r8, r8, r10                 ; abs (p2-p1)
+    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
+    uqsub8      r8, r8, r2                  ; compare to limit
+
+    uqsub8      r6, r11, r12                ; p1 - p0
+    orr         lr, lr, r8
+    uqsub8      r7, r12, r11                ; p0 - p1
+    ldr         r9, [src], pstep            ; q0
+    ldr         r10, [src], pstep           ; q1
+    orr         r6, r6, r7                  ; abs (p1-p0)
+    uqsub8      r7, r6, r2                  ; compare to limit
+    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
+    orr         lr, lr, r7
+
+    uqsub8      r6, r11, r10                ; p1 - q1
+    uqsub8      r7, r10, r11                ; q1 - p1
+    uqsub8      r11, r12, r9                ; p0 - q0
+    uqsub8      r12, r9, r12                ; q0 - p0
+    orr         r6, r6, r7                  ; abs (p1-q1)
+    ldr         r7, c0x7F7F7F7F             ; lane mask for the per-byte >> 1 below
+    orr         r12, r11, r12               ; abs (p0-q0)
+    ldr         r11, [src], pstep           ; q2
+    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
+    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
+    uqsub8      r7, r9, r10                 ; q0 - q1
+    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
+    uqsub8      r6, r10, r9                 ; q1 - q0
+    uqsub8      r12, r12, r4                ; compare to blimit (r4)
+    uqsub8      r9, r11, r10                ; q2 - q1
+
+    orr         lr, lr, r12
+
+    ldr         r12, [src], pstep           ; q3
+
+    uqsub8      r10, r10, r11               ; q1 - q2
+    orr         r6, r7, r6                  ; abs (q1-q0)
+    orr         r10, r9, r10                ; abs (q2-q1)
+    uqsub8      r7, r6, r2                  ; compare to limit
+    uqsub8      r10, r10, r2                ; compare to limit
+    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
+    orr         lr, lr, r7
+    orr         lr, lr, r10
+
+    uqsub8      r10, r12, r11               ; q3 - q2
+    uqsub8      r9, r11, r12                ; q2 - q3
+
+    mvn         r11, #0                     ; r11 == -1
+
+    orr         r10, r10, r9                ; abs (q3-q2)
+    uqsub8      r10, r10, r2                ; compare to limit
+
+    mov         r12, #0                     ; r12 == 0
+
+    orr         lr, lr, r10
+
+    usub8       lr, r12, lr                 ; 0 - lr per byte (usub8, not ssub8): GE flag set only in lanes where lr == 0
+    sel         lr, r11, r12                ; filter mask: lr = 0xFF in lanes that stayed within every limit, else 0
+
+    cmp         lr, #0
+    beq         mbhskip_filter               ; skip filtering
+
+    ;vp8_hevmask() function
+    ;calculate high edge variance
+    sub         src, src, pstep, lsl #2     ; step src back 6 rows in total (4 here + 2 below) to the p1 row
+    sub         src, src, pstep, lsl #1
+
+    orr         r10, r6, r8                 ; combine the two saved thresh comparisons (q1-q0 and p1-p0)
+    ldr         r7, [src], pstep            ; p1
+
+    usub8       r10, r12, r10               ; GE flag set in lanes where neither difference exceeded thresh
+    sel         r6, r12, r11                ; hev mask: r6 = 0xFF in lanes that exceeded thresh, else 0
+
+    ;vp8_mbfilter() function
+    ;p2, q2 are only needed at the end. Don't need to load them in now.
+    ldr         r8, [src], pstep            ; p0
+    ldr         r12, c0x80808080            ; XOR constant used below to convert lanes to signed values
+    ldr         r9, [src], pstep            ; q0
+    ldr         r10, [src]                  ; q1 (no post-increment: src stays on the q1 row)
+
+    eor         r7, r7, r12                 ; ps1
+    eor         r8, r8, r12                 ; ps0
+    eor         r9, r9, r12                 ; qs0
+    eor         r10, r10, r12               ; qs1
+
+    qsub8       r12, r9, r8                 ; r12 = vp8_signed_char_clamp(qs0 - ps0), added three times below
+    str         r7, [sp, #12]               ; store ps1 temporarily
+    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
+    str         r10, [sp, #8]               ; store qs1 temporarily
+    qadd8       r7, r7, r12                 ; + (qs0 - ps0)
+    str         r9, [sp]                    ; store qs0 temporarily
+    qadd8       r7, r7, r12                 ; + (qs0 - ps0)
+    str         r8, [sp, #4]                ; store ps0 temporarily
+    qadd8       r7, r7, r12                 ; vp8_filter: r7 = clamp(ps1-qs1) + 3 * (qs0 - ps0), saturating at each step
+
+    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
+    ldr         r9, c0x04040404             ; r9 = 4
+
+    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)
+
+    mov         r12, r7                     ; Filter2: r12
+    and         r12, r12, r6                ; Filter2 &= hev
+
+    ;modify code for vp8
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
+    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
+
+    mov         r10, #0
+    shadd8      r8 , r8 , r10               ; Filter1 >>= 3 (three signed halvings: shadd8 with 0)
+    shadd8      r12 , r12 , r10             ; Filter2 >>= 3 (three signed halvings: shadd8 with 0)
+    shadd8      r8 , r8 , r10
+    shadd8      r12 , r12 , r10
+    shadd8      r8 , r8 , r10               ; r8: Filter1
+    shadd8      r12 , r12 , r10             ; r12: Filter2
+
+    ldr         r9, [sp]                    ; load qs0
+    ldr         r11, [sp, #4]               ; load ps0
+
+    qsub8       r9 , r9, r8                 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
+    ;qadd8      r12 , r12 , r9              ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+    ;mov            r10, #0
+    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
+    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
+    ;sel            lr, r11, r10
+    ;shadd8     r12 , r12 , r10
+    ;usub8      r8, r9, r8
+    ;sel            r8, r11, r10
+    ;ldr            r9, [sp]                    ; load qs0
+    ;ldr            r11, [sp, #4]               ; load ps0
+    ;shadd8     r12 , r12 , r10
+    ;and            r8, r8, lr                  ; -1 for each element that equals 4
+    ;qadd8      r10, r8, r12                ; u = vp8_signed_char_clamp(s + Filter2)
+    ;qsub8      r9 , r9, r12                ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+    ;qadd8      r11, r11, r10               ; ps0 = vp8_signed_char_clamp(ps0 + u)
+
+    ;end of modification for vp8
+
+    bic         r12, r7, r6                 ; vp8_filter &= ~hev    ( r6 is free)
+    ;mov        r12, r7
+
+    ;roughly 3/7th difference across boundary
+    mov         lr, #0x1b                   ; 27
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12                     ; sign-extend lanes 0 and 2 into halfwords
+    sxtb16      r10, r12, ror #8            ; sign-extend lanes 1 and 3 into halfwords
+    smlabb      r8, r6, lr, r7              ; lane0 * 27 + 63
+    smlatb      r6, r6, lr, r7              ; lane2 * 27 + 63
+    smlabb      r7, r10, lr, r7             ; lane1 * 27 + 63 (r7 no longer holds 63 after this)
+    smultb      r10, r10, lr                ; lane3 * 27; the 63 is added separately below
+    ssat        r8, #8, r8, asr #7          ; >> 7, then saturate to signed 8 bits (same for the ssat lines below)
+    ssat        r6, #8, r6, asr #7
+    add         r10, r10, #63               ; lane3 * 27 + 63
+    ssat        r7, #8, r7, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    ldr         lr, c0x80808080
+
+    pkhbt       r6, r8, r6, lsl #16         ; r6 = lane2 result : lane0 result (halfwords)
+    pkhbt       r10, r7, r10, lsl #16       ; r10 = lane3 result : lane1 result (halfwords)
+    uxtb16      r6, r6                      ; keep only the low byte of each halfword
+    uxtb16      r10, r10
+
+    sub         src, src, pstep             ; step back from the q1 row to the q0 row
+
+    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
+    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
+    eor         r8, r8, lr                  ; *oq0 = s^0x80
+    str         r8, [src]                   ; store *oq0
+    sub         src, src, pstep             ; step back to the p0 row
+    eor         r10, r10, lr                ; *op0 = s^0x80
+    str         r10, [src]                  ; store *op0
+
+    ;roughly 2/7th difference across boundary
+    mov         lr, #0x12                   ; 18
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12                     ; sign-extend lanes 0 and 2 into halfwords
+    sxtb16      r10, r12, ror #8            ; sign-extend lanes 1 and 3 into halfwords
+    smlabb      r8, r6, lr, r7              ; lane0 * 18 + 63
+    smlatb      r6, r6, lr, r7              ; lane2 * 18 + 63
+    smlabb      r9, r10, lr, r7             ; lane1 * 18 + 63
+    smlatb      r10, r10, lr, r7            ; lane3 * 18 + 63
+    ssat        r8, #8, r8, asr #7          ; >> 7, then saturate to signed 8 bits (same for the ssat lines below)
+    ssat        r6, #8, r6, asr #7
+    ssat        r9, #8, r9, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    ldr         lr, c0x80808080
+
+    pkhbt       r6, r8, r6, lsl #16         ; r6 = lane2 result : lane0 result (halfwords)
+    pkhbt       r10, r9, r10, lsl #16       ; r10 = lane3 result : lane1 result (halfwords)
+
+    ldr         r9, [sp, #8]                ; load qs1
+    ldr         r11, [sp, #12]              ; load ps1
+
+    uxtb16      r6, r6                      ; keep only the low byte of each halfword
+    uxtb16      r10, r10
+
+    sub         src, src, pstep             ; step back to the p1 row
+
+    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+    qadd8       r11, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
+    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
+    eor         r11, r11, lr                ; *op1 = s^0x80
+    str         r11, [src], pstep           ; store *op1
+    eor         r8, r8, lr                  ; *oq1 = s^0x80
+    add         src, src, pstep, lsl #1     ; skip the p0 and q0 rows: src is now on the q1 row
+
+    mov         r7, #0x3f                   ; 63
+
+    str         r8, [src], pstep            ; store *oq1
+
+    ;roughly 1/7th difference across boundary
+    mov         lr, #0x9                    ; 9
+    ldr         r9, [src]                   ; load q2
+
+    sxtb16      r6, r12                     ; sign-extend lanes 0 and 2 into halfwords
+    sxtb16      r10, r12, ror #8            ; sign-extend lanes 1 and 3 into halfwords
+    smlabb      r8, r6, lr, r7              ; lane0 * 9 + 63
+    smlatb      r6, r6, lr, r7              ; lane2 * 9 + 63
+    smlabb      r12, r10, lr, r7            ; lane1 * 9 + 63 (overwrites the filter value in r12; it is not read again)
+    smlatb      r10, r10, lr, r7            ; lane3 * 9 + 63
+    ssat        r8, #8, r8, asr #7          ; >> 7, then saturate to signed 8 bits (same for the ssat lines below)
+    ssat        r6, #8, r6, asr #7
+    ssat        r12, #8, r12, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    sub         src, src, pstep, lsl #2     ; step back 5 rows in total (4 here + 1 below) from q2 to the p2 row
+
+    pkhbt       r6, r8, r6, lsl #16         ; r6 = lane2 result : lane0 result (halfwords)
+    pkhbt       r10, r12, r10, lsl #16      ; r10 = lane3 result : lane1 result (halfwords)
+
+    sub         src, src, pstep
+    ldr         lr, c0x80808080
+
+    ldr         r11, [src]                  ; load p2
+
+    uxtb16      r6, r6                      ; keep only the low byte of each halfword
+    uxtb16      r10, r10
+
+    eor         r9, r9, lr                  ; qs2: q2 converted to a signed value
+    eor         r11, r11, lr                ; ps2: p2 converted to a signed value
+
+    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
+    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
+    eor         r8, r8, lr                  ; *op2 = s^0x80
+    str         r8, [src], pstep, lsl #2    ; store *op2, then advance 4 rows
+    add         src, src, pstep             ; one more row: src is now on the q2 row
+    eor         r10, r10, lr                ; *oq2 = s^0x80
+    str         r10, [src], pstep, lsl #1   ; store *oq2, then advance 2 rows (same position as on the skip path)
+
+|mbhskip_filter|
+    add         src, src, #4                ; move right to the next 4 pixels
+    sub         src, src, pstep, lsl #3     ; step back 8 rows to the p3 row
+    subs        count, count, #1
+
+    ldrne       r9, [src], pstep            ; p3
+    ldrne       r10, [src], pstep           ; p2
+    ldrne       r11, [src], pstep           ; p1
+
+    bne         MBHnext8
+
+    add         sp, sp, #16                 ; release the temp buffer
+    ldmia       sp!, {r4 - r11, pc}         ; restore registers and return
+    ENDP        ; |vp8_mbloop_filter_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}         ; push 9 registers (36 bytes)
+    ; Each pass handles 4 rows: the 4 bytes left and right of src are loaded per row and transposed.
+    sub         src, src, #4                ; step src back 4 bytes to the left-hand word of each row
+    ldr         count, [sp, #40]            ; count for 8-in-parallel (second word above the saved registers)
+    ldr         r12, [sp, #36]              ; load thresh address (first word above the saved registers)
+    sub         sp, sp, #16                 ; create temp buffer (4 words, reused for several spills below)
+
+    ldr         r6, [src], pstep            ; load source data
+    ldrb        r4, [r2]                    ; blimit
+    ldr         r7, [src], pstep
+    ldrb        r2, [r3]                    ; limit
+    ldr         r8, [src], pstep
+    orr         r4, r4, r4, lsl #8          ; replicate blimit byte (with the lsl #16 below) into all 4 lanes
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8          ; replicate limit byte the same way
+    ldr         lr, [src], pstep
+    mov         count, count, lsl #1        ; double count: each pass is 4-in-parallel
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8          ; replicate thresh byte the same way
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
+
+|Vnext8|
+
+    ; vp8_filter_mask() function
+    ; calculate breakout conditions
+    ; transpose the source data for 4-in-parallel operation (outputs r9..r12 are used as p3, p2, p1, p0)
+    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+    uqsub8      r7, r9, r10                 ; p3 - p2
+    uqsub8      r8, r10, r9                 ; p2 - p3
+    uqsub8      r9, r10, r11                ; p2 - p1
+    uqsub8      r10, r11, r10               ; p1 - p2
+    orr         r7, r7, r8                  ; abs (p3-p2)
+    orr         r10, r9, r10                ; abs (p2-p1)
+    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
+    uqsub8      r10, r10, r2                ; compare to limit
+
+    sub         src, src, pstep, lsl #2     ; step src back 4 rows to the first row of this pass
+
+    orr         lr, lr, r10
+
+    uqsub8      r6, r11, r12                ; p1 - p0
+    uqsub8      r7, r12, r11                ; p0 - p1
+    add         src, src, #4                ; step src 4 bytes right to the right-hand word of each row
+    orr         r6, r6, r7                  ; abs (p1-p0)
+    str         r11, [sp, #12]              ; save p1
+    uqsub8      r10, r6, r2                 ; compare to limit
+    uqsub8      r11, r6, r3                 ; compare to thresh
+    orr         lr, lr, r10
+
+    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
+    ; transpose the source data for 4-in-parallel operation (outputs r9..r12 are used as q0, q1, q2, q3)
+    ldr         r6, [src], pstep            ; load source data
+    str         r11, [sp]                   ; save the abs (p1-p0) vs thresh result
+    ldr         r7, [src], pstep
+    str         r12, [sp, #4]               ; save p0 before loading q0 - q3 data
+    ldr         r8, [src], pstep
+    str         lr, [sp, #8]                ; save the filter-mask accumulator
+    ldr         lr, [src], pstep
+
+    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+    ldr         lr, [sp, #8]                ; load back the filter-mask accumulator
+
+    uqsub8      r6, r12, r11                ; q3 - q2
+    uqsub8      r7, r11, r12                ; q2 - q3
+    uqsub8      r12, r11, r10               ; q2 - q1
+    uqsub8      r11, r10, r11               ; q1 - q2
+    orr         r6, r6, r7                  ; abs (q3-q2)
+    orr         r7, r12, r11                ; abs (q2-q1)
+    uqsub8      r6, r6, r2                  ; compare to limit
+    uqsub8      r7, r7, r2                  ; compare to limit
+    ldr         r11, [sp, #4]               ; load back p0
+    ldr         r12, [sp, #12]              ; load back p1
+    orr         lr, lr, r6
+    orr         lr, lr, r7
+
+    uqsub8      r6, r11, r9                 ; p0 - q0
+    uqsub8      r7, r9, r11                 ; q0 - p0
+    uqsub8      r8, r12, r10                ; p1 - q1
+    uqsub8      r11, r10, r12               ; q1 - p1
+    orr         r6, r6, r7                  ; abs (p0-q0)
+    ldr         r7, c0x7F7F7F7F             ; lane mask for the per-byte >> 1 below
+    orr         r8, r8, r11                 ; abs (p1-q1)
+    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
+    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
+    uqsub8      r11, r10, r9                ; q1 - q0
+    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
+    uqsub8      r12, r9, r10                ; q0 - q1
+    uqsub8      r6, r6, r4                  ; compare to blimit (r4)
+
+    orr         r9, r11, r12                ; abs (q1-q0)
+    uqsub8      r8, r9, r2                  ; compare to limit
+    uqsub8      r10, r9, r3                 ; compare to thresh
+    orr         lr, lr, r6
+    orr         lr, lr, r8
+
+    mvn         r11, #0                     ; r11 == -1
+    mov         r12, #0                     ; r12 == 0
+
+    usub8       lr, r12, lr                 ; 0 - lr per byte: GE flag set only in lanes where lr == 0
+    ldr         r9, [sp]                    ; load the saved abs (p1-p0) vs thresh result
+    sel         lr, r11, r12                ; filter mask: lr = 0xFF in lanes that stayed within every limit, else 0
+
+    cmp         lr, #0
+    beq         vskip_filter                 ; skip filtering
+
+    ;vp8_hevmask() function
+    ;calculate high edge variance
+
+    sub         src, src, pstep, lsl #2     ; step src back 4 rows to the first row of this pass
+
+    orr         r9, r9, r10                 ; combine the two thresh comparisons (p1-p0 and q1-q0)
+
+    ldrh        r7, [src, #-2]              ; row 0: the 2 bytes just before src
+    ldrh        r8, [src], pstep            ; row 0: the 2 bytes at src, then advance one row
+
+    usub8       r9, r12, r9                 ; GE flag set in lanes where neither difference exceeded thresh
+    sel         r6, r12, r11                ; hev mask: r6 = 0xFF in lanes that exceeded thresh, else 0
+
+    ;vp8_filter() function
+    ; load source data to r6, r11, r12, lr
+    ldrh        r9, [src, #-2]              ; row 1
+    ldrh        r10, [src], pstep
+
+    pkhbt       r12, r7, r8, lsl #16        ; r12 = row 0: the 4 bytes from src-2 to src+1
+
+    ldrh        r7, [src, #-2]              ; row 2
+    ldrh        r8, [src], pstep
+
+    pkhbt       r11, r9, r10, lsl #16       ; r11 = row 1
+
+    ldrh        r9, [src, #-2]              ; row 3
+    ldrh        r10, [src], pstep
+
+    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
+    str         r6, [sp]                    ; save hev mask
+    str         lr, [sp, #4]                ; save filter mask
+
+    pkhbt       r6, r7, r8, lsl #16         ; r6 = row 2
+    pkhbt       lr, r9, r10, lsl #16        ; lr = row 3
+
+    ;transpose r12, r11, r6, lr to r7, r8, r9, r10
+    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+    ;load back hev_mask r6 and filter_mask lr
+    ldr         r12, c0x80808080
+    ldr         r6, [sp]
+    ldr         lr, [sp, #4]
+
+    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
+    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
+    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
+    eor         r10, r10, r12               ; q1 offset to convert to a signed value
+
+    str         r9, [sp]                    ; store qs0 temporarily
+    str         r8, [sp, #4]                ; store ps0 temporarily
+    str         r10, [sp, #8]               ; store qs1 temporarily
+    str         r7, [sp, #12]               ; store ps1 temporarily
+
+    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
+    qsub8       r8, r9, r8                  ; r8 = vp8_signed_char_clamp(qs0 - ps0), added three times below
+
+    and         r7, r7, r6                  ;  vp8_filter (r7) &= hev (r7 : filter)
+
+    qadd8       r7, r7, r8                  ; + (qs0 - ps0)
+    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8
+
+    qadd8       r7, r7, r8                  ; + (qs0 - ps0)
+    ldr         r10, c0x04040404            ; r10 = 4
+
+    qadd8       r7, r7, r8                  ; + (qs0 - ps0): r7 = vp8_filter + 3 * (qs0 - ps0), saturating at each step
+    ;mvn         r11, #0                     ; r11 == -1
+
+    and         r7, r7, lr                  ; vp8_filter &= mask
+
+    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
+    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+    qadd8       r7 , r7 , r10               ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+
+    mov         r9, #0
+    shadd8      r8 , r8 , r9                ; Filter2 >>= 3 (three signed halvings: shadd8 with 0)
+    shadd8      r7 , r7 , r9                ; vp8_filter >>= 3 (three signed halvings: shadd8 with 0)
+    shadd8      r8 , r8 , r9
+    shadd8      r7 , r7 , r9
+    shadd8      lr , r8 , r9                ; lr: filter2
+    shadd8      r7 , r7 , r9                ; r7: filter
+
+    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
+    ;sel            lr, r11, r9
+    ;usub8      r8, r10, r8
+    ;sel            r8, r11, r9
+    ;and            r8, r8, lr                  ; -1 for each element that equals 4 -- r8: s
+
+    ;calculate output
+    ;qadd8      lr, r8, r7                  ; u = vp8_signed_char_clamp(s + vp8_filter)
+
+    ldr         r8, [sp]                    ; load qs0
+    ldr         r9, [sp, #4]                ; load ps0
+
+    ldr         r10, c0x01010101            ; r10 = 1
+
+    qsub8       r8, r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)
+    ;end of modification for vp8
+
+    eor         r8, r8, r12                 ; *oq0 = u^0x80
+    eor         r9, r9, r12                 ; *op0 = u^0x80
+
+    mov         lr, #0
+
+    sadd8       r7, r7, r10                 ; vp8_filter += 1
+    shadd8      r7, r7, lr                  ; vp8_filter >>= 1
+
+    ldr         r10, [sp, #8]               ; load qs1
+    ldr         r11, [sp, #12]              ; load ps1
+
+    bic         r7, r7, r6                  ; r7: vp8_filter &= ~hev
+
+    qsub8       r10 , r10, r7               ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+    eor         r10, r10, r12               ; *oq1 = u^0x80
+    eor         r11, r11, r12               ; *op1 = u^0x80
+
+    sub         src, src, pstep, lsl #2     ; step src back 4 rows to the first row of this pass, ready for the stores
+
+    ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
+    ;output is b0, b1, b2, b3
+    ;b0: 03 02 01 00
+    ;b1: 13 12 11 10
+    ;b2: 23 22 21 20
+    ;b3: 33 32 31 30
+    ;    p1 p0 q0 q1
+    ;   (a3 a2 a1 a0)
+    TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
+
+    strh        r6, [src, #-2]              ; store the result: low halfword goes to the 2 bytes before src
+    mov         r6, r6, lsr #16
+    strh        r6, [src], pstep            ; high halfword goes to the 2 bytes at src, then advance one row
+
+    strh        r7, [src, #-2]              ; row 1
+    mov         r7, r7, lsr #16
+    strh        r7, [src], pstep
+
+    strh        r12, [src, #-2]             ; row 2
+    mov         r12, r12, lsr #16
+    strh        r12, [src], pstep
+
+    strh        lr, [src, #-2]              ; row 3
+    mov         lr, lr, lsr #16
+    strh        lr, [src], pstep
+
+|vskip_filter|
+    sub         src, src, #4                ; back to the left-hand word, now on the next group of 4 rows
+    subs        count, count, #1
+
+    ldrne       r6, [src], pstep            ; load source data
+    ldrne       r7, [src], pstep
+    ldrne       r8, [src], pstep
+    ldrne       lr, [src], pstep
+
+    bne         Vnext8
+
+    add         sp, sp, #16                 ; release the temp buffer
+
+    ldmia       sp!, {r4 - r11, pc}         ; restore registers and return
+    ENDP        ; |vp8_loop_filter_vertical_edge_armv6|
+
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_mbloop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    sub         src, src, #4                ; move src pointer down by 4
+    ldr         count, [sp, #40]            ; count for 8-in-parallel
+    ldr         r12, [sp, #36]              ; load thresh address
+    pld         [src, #23]                  ; preload for next block
+    sub         sp, sp, #16                 ; create temp buffer
+
+    ldr         r6, [src], pstep            ; load source data
+    ldrb        r4, [r2]                    ; blimit
+    pld         [src, #23]
+    ldr         r7, [src], pstep
+    ldrb        r2, [r3]                    ; limit
+    pld         [src, #23]
+    ldr         r8, [src], pstep
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8
+    pld         [src, #23]
+    ldr         lr, [src], pstep
+    mov         count, count, lsl #1        ; 4-in-parallel
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
+
+|MBVnext8|
+    ; vp8_filter_mask() function
+    ; calculate breakout conditions
+    ; transpose the source data for 4-in-parallel operation
+    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+    uqsub8      r7, r9, r10                 ; p3 - p2
+    uqsub8      r8, r10, r9                 ; p2 - p3
+    uqsub8      r9, r10, r11                ; p2 - p1
+    uqsub8      r10, r11, r10               ; p1 - p2
+    orr         r7, r7, r8                  ; abs (p3-p2)
+    orr         r10, r9, r10                ; abs (p2-p1)
+    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
+    uqsub8      r10, r10, r2                ; compare to limit
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    orr         lr, lr, r10
+
+    uqsub8      r6, r11, r12                ; p1 - p0
+    uqsub8      r7, r12, r11                ; p0 - p1
+    add         src, src, #4                ; move src pointer up by 4
+    orr         r6, r6, r7                  ; abs (p1-p0)
+    str         r11, [sp, #12]              ; save p1
+    uqsub8      r10, r6, r2                 ; compare to limit
+    uqsub8      r11, r6, r3                 ; compare to thresh
+    orr         lr, lr, r10
+
+    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
+    ; transpose the source data for 4-in-parallel operation
+    ldr         r6, [src], pstep            ; load source data
+    str         r11, [sp]                   ; push r11 to stack
+    ldr         r7, [src], pstep
+    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
+    ldr         r8, [src], pstep
+    str         lr, [sp, #8]
+    ldr         lr, [src], pstep
+
+
+    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
+
+    uqsub8      r6, r12, r11                ; q3 - q2
+    uqsub8      r7, r11, r12                ; q2 - q3
+    uqsub8      r12, r11, r10               ; q2 - q1
+    uqsub8      r11, r10, r11               ; q1 - q2
+    orr         r6, r6, r7                  ; abs (q3-q2)
+    orr         r7, r12, r11                ; abs (q2-q1)
+    uqsub8      r6, r6, r2                  ; compare to limit
+    uqsub8      r7, r7, r2                  ; compare to limit
+    ldr         r11, [sp, #4]               ; load back p0
+    ldr         r12, [sp, #12]              ; load back p1
+    orr         lr, lr, r6
+    orr         lr, lr, r7
+
+    uqsub8      r6, r11, r9                 ; p0 - q0
+    uqsub8      r7, r9, r11                 ; q0 - p0
+    uqsub8      r8, r12, r10                ; p1 - q1
+    uqsub8      r11, r10, r12               ; q1 - p1
+    orr         r6, r6, r7                  ; abs (p0-q0)
+    ldr         r7, c0x7F7F7F7F
+    orr         r8, r8, r11                 ; abs (p1-q1)
+    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
+    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
+    uqsub8      r11, r10, r9                ; q1 - q0
+    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
+    uqsub8      r12, r9, r10                ; q0 - q1
+    uqsub8      r6, r6, r4                  ; compare to flimit
+
+    orr         r9, r11, r12                ; abs (q1-q0)
+    uqsub8      r8, r9, r2                  ; compare to limit
+    uqsub8      r10, r9, r3                 ; compare to thresh
+    orr         lr, lr, r6
+    orr         lr, lr, r8
+
+    mvn         r11, #0                     ; r11 == -1
+    mov         r12, #0
+
+    usub8       lr, r12, lr
+    ldr         r9, [sp]                    ; load the compared result
+    sel         lr, r11, r12                ; filter mask: lr
+
+    cmp         lr, #0
+    beq         mbvskip_filter               ; skip filtering
+
+
+
+    ;vp8_hevmask() function
+    ;calculate high edge variance
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    orr         r9, r9, r10
+
+    ldrh        r7, [src, #-2]
+    ldrh        r8, [src], pstep
+
+    usub8       r9, r12, r9
+    sel         r6, r12, r11                ; hev mask: r6
+
+
+    ; vp8_mbfilter() function
+    ; p2, q2 are only needed at the end. Don't need to load them in now.
+    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
+    ; load soure data to r6, r11, r12, lr
+    ldrh        r9, [src, #-2]
+    ldrh        r10, [src], pstep
+
+    pkhbt       r12, r7, r8, lsl #16
+
+    ldrh        r7, [src, #-2]
+    ldrh        r8, [src], pstep
+
+    pkhbt       r11, r9, r10, lsl #16
+
+    ldrh        r9, [src, #-2]
+    ldrh        r10, [src], pstep
+
+    str         r6, [sp]                    ; save r6
+    str         lr, [sp, #4]                ; save lr
+
+    pkhbt       r6, r7, r8, lsl #16
+    pkhbt       lr, r9, r10, lsl #16
+
+    ;transpose r12, r11, r6, lr to p1, p0, q0, q1
+    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+    ;load back hev_mask r6 and filter_mask lr
+    ldr         r12, c0x80808080
+    ldr         r6, [sp]
+    ldr         lr, [sp, #4]
+
+    eor         r7, r7, r12                 ; ps1
+    eor         r8, r8, r12                 ; ps0
+    eor         r9, r9, r12                 ; qs0
+    eor         r10, r10, r12               ; qs1
+
+    qsub8       r12, r9, r8                 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+    str         r7, [sp, #12]               ; store ps1 temporarily
+    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
+    str         r10, [sp, #8]               ; store qs1 temporarily
+    qadd8       r7, r7, r12
+    str         r9, [sp]                    ; store qs0 temporarily
+    qadd8       r7, r7, r12
+    str         r8, [sp, #4]                ; store ps0 temporarily
+    qadd8       r7, r7, r12                 ; vp8_filter: r7
+
+    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
+    ldr         r9, c0x04040404
+    ;mvn         r11, #0                     ; r11 == -1
+
+    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)
+
+    mov         r12, r7                     ; Filter2: r12
+    and         r12, r12, r6                ; Filter2 &= hev
+
+    ;modify code for vp8
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
+    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
+
+    mov         r10, #0
+    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
+    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
+    shadd8      r8 , r8 , r10
+    shadd8      r12 , r12 , r10
+    shadd8      r8 , r8 , r10               ; r8: Filter1
+    shadd8      r12 , r12 , r10             ; r12: Filter2
+
+    ldr         r9, [sp]                    ; load qs0
+    ldr         r11, [sp, #4]               ; load ps0
+
+    qsub8       r9 , r9, r8                 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
+    ;qadd8      r12 , r12 , r9              ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+    ;mov            r10, #0
+    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
+    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
+    ;sel            lr, r11, r10
+    ;shadd8     r12 , r12 , r10
+    ;usub8      r8, r9, r8
+    ;sel            r8, r11, r10
+    ;ldr            r9, [sp]                    ; load qs0
+    ;ldr            r11, [sp, #4]               ; load ps0
+    ;shadd8     r12 , r12 , r10
+    ;and            r8, r8, lr                  ; -1 for each element that equals 4
+    ;qadd8      r10, r8, r12                ; u = vp8_signed_char_clamp(s + Filter2)
+    ;qsub8      r9 , r9, r12                ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+    ;qadd8      r11, r11, r10               ; ps0 = vp8_signed_char_clamp(ps0 + u)
+
+    ;end of modification for vp8
+
+    bic         r12, r7, r6                 ;vp8_filter &= ~hev    ( r6 is free)
+    ;mov            r12, r7
+
+    ;roughly 3/7th difference across boundary
+    mov         lr, #0x1b                   ; 27
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r7, r10, lr, r7
+    smultb      r10, r10, lr
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    add         r10, r10, #63
+    ssat        r7, #8, r7, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    ldr         lr, c0x80808080
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r7, r10, lsl #16
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
+    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
+    eor         r8, r8, lr                  ; *oq0 = s^0x80
+    eor         r10, r10, lr                ; *op0 = s^0x80
+
+    strb        r10, [src, #-1]             ; store op0 result
+    strb        r8, [src], pstep            ; store oq0 result
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    strb        r10, [src, #-1]
+    strb        r8, [src], pstep
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    strb        r10, [src, #-1]
+    strb        r8, [src], pstep
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    strb        r10, [src, #-1]
+    strb        r8, [src], pstep
+
+    ;roughly 2/7th difference across boundary
+    mov         lr, #0x12                   ; 18
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r9, r10, lr, r7
+
+    smlatb      r10, r10, lr, r7
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    ssat        r9, #8, r9, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r9, r10, lsl #16
+
+    ldr         r9, [sp, #8]                ; load qs1
+    ldr         r11, [sp, #12]              ; load ps1
+    ldr         lr, c0x80808080
+
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    add         src, src, #2
+
+    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
+    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
+    eor         r8, r8, lr                  ; *oq1 = s^0x80
+    eor         r10, r10, lr                ; *op1 = s^0x80
+
+    ldrb        r11, [src, #-5]             ; load p2 for 1/7th difference across boundary
+    strb        r10, [src, #-4]             ; store op1
+    strb        r8, [src, #-1]              ; store oq1
+    ldrb        r9, [src], pstep            ; load q2 for 1/7th difference across boundary
+
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+
+    ldrb        r6, [src, #-5]
+    strb        r10, [src, #-4]
+    strb        r8, [src, #-1]
+    ldrb        r7, [src], pstep
+
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    orr         r11, r11, r6, lsl #8
+    orr         r9, r9, r7, lsl #8
+
+    ldrb        r6, [src, #-5]
+    strb        r10, [src, #-4]
+    strb        r8, [src, #-1]
+    ldrb        r7, [src], pstep
+
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    orr         r11, r11, r6, lsl #16
+    orr         r9, r9, r7, lsl #16
+
+    ldrb        r6, [src, #-5]
+    strb        r10, [src, #-4]
+    strb        r8, [src, #-1]
+    ldrb        r7, [src], pstep
+    orr         r11, r11, r6, lsl #24
+    orr         r9, r9, r7, lsl #24
+
+    ;roughly 1/7th difference across boundary
+    eor         r9, r9, lr
+    eor         r11, r11, lr
+
+    mov         lr, #0x9                    ; 9
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r12, r10, lr, r7
+    smlatb      r10, r10, lr, r7
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    ssat        r12, #8, r12, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    sub         src, src, pstep, lsl #2
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r12, r10, lsl #16
+
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    ldr         lr, c0x80808080
+
+    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
+    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
+    eor         r8, r8, lr                  ; *op2 = s^0x80
+    eor         r10, r10, lr                ; *oq2 = s^0x80
+
+    strb        r8, [src, #-5]              ; store *op2
+    strb        r10, [src], pstep           ; store *oq2
+    mov         r8, r8, lsr #8
+    mov         r10, r10, lsr #8
+    strb        r8, [src, #-5]
+    strb        r10, [src], pstep
+    mov         r8, r8, lsr #8
+    mov         r10, r10, lsr #8
+    strb        r8, [src, #-5]
+    strb        r10, [src], pstep
+    mov         r8, r8, lsr #8
+    mov         r10, r10, lsr #8
+    strb        r8, [src, #-5]
+    strb        r10, [src], pstep
+
+    ;adjust src pointer for next loop
+    sub         src, src, #2
+
+|mbvskip_filter|
+    sub         src, src, #4
+    subs        count, count, #1
+
+    pld         [src, #23]                  ; preload for next block
+    ldrne       r6, [src], pstep            ; load source data
+    pld         [src, #23]
+    ldrne       r7, [src], pstep
+    pld         [src, #23]
+    ldrne       r8, [src], pstep
+    pld         [src, #23]
+    ldrne       lr, [src], pstep
+
+    bne         MBVnext8
+
+    add         sp, sp, #16
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_mbloop_filter_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD     0x80808080
+c0x03030303 DCD     0x03030303
+c0x04040404 DCD     0x04040404
+c0x01010101 DCD     0x01010101
+c0x7F7F7F7F DCD     0x7F7F7F7F
+
+    END
diff --git a/libs/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/libs/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm
new file mode 100644
index 0000000000..5e00cf01bb
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -0,0 +1,286 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|
+    EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+    MACRO
+    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+    ; a0: 03 02 01 00
+    ; a1: 13 12 11 10
+    ; a2: 23 22 21 20
+    ; a3: 33 32 31 30
+    ;     b3 b2 b1 b0
+    ; 4x4 byte transpose; $a0-$a3 are overwritten (scratch for the odd columns)
+    uxtb16      $b1, $a1                    ; xx 12 xx 10
+    uxtb16      $b0, $a0                    ; xx 02 xx 00
+    uxtb16      $b3, $a3                    ; xx 32 xx 30
+    uxtb16      $b2, $a2                    ; xx 22 xx 20
+    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
+    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
+
+    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
+    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
+    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
+    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
+    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
+    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
+
+    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- column 2 (q0 for the caller in this file)
+    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- column 0 (p1 for the caller in this file)
+
+    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- column 3 (q1 for the caller in this file)
+    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- column 1 (p0 for the caller in this file)
+    MEND
+
+
+
+src         RN  r0
+pstep       RN  r1
+
+;r0     unsigned char *src_ptr,
+;r1     int src_pixel_step,
+;r2     const char *blimit
+; Filters 16 pixels along the edge, 4 per loop pass; only p0 and q0 are rewritten.
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldrb        r12, [r2]                   ; blimit (one byte)
+    ldr         r3, [src, -pstep, lsl #1]   ; p1
+    ldr         r4, [src, -pstep]           ; p0
+    ldr         r5, [src]                   ; q0
+    ldr         r6, [src, pstep]            ; q1
+    orr         r12, r12, r12, lsl #8       ; blimit
+    ldr         r2, c0x80808080             ; r2 = 0x80 per byte (blimit pointer no longer needed)
+    orr         r12, r12, r12, lsl #16      ; blimit replicated into all 4 bytes
+    mov         r9, #4                      ; loop count: 4 passes x 4 pixels = 16 pixels
+    mov         lr, #0                      ; need 0 in a couple places
+
+|simple_hnext8|
+    ; vp8_simple_filter_mask()
+
+    uqsub8      r7, r3, r6                  ; p1 - q1
+    uqsub8      r8, r6, r3                  ; q1 - p1
+    uqsub8      r10, r4, r5                 ; p0 - q0
+    uqsub8      r11, r5, r4                 ; q0 - p0
+    orr         r8, r8, r7                  ; abs(p1 - q1)
+    orr         r10, r10, r11               ; abs(p0 - q0)
+    uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2
+    uhadd8      r8, r8, lr                  ; abs(p1 - q1) >> 1
+    uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+    mvn         r8, #0                      ; r8 = 0xFFFFFFFF
+    usub8       r10, r12, r10               ; compare to blimit. usub8 sets GE flags
+    sel         r10, r8, lr                 ; filter mask: 0xFF where blimit >= sum, else 0
+    cmp         r10, #0
+    beq         simple_hskip_filter         ; skip filtering if all masks are 0x00
+
+    ;vp8_simple_filter()
+
+    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
+    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
+    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
+    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
+
+    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
+    qsub8       r6, r5, r4                  ; q0 - p0
+    qadd8       r3, r3, r6                  ; += q0 - p0
+    ldr         r7, c0x04040404             ; r7 = 4 in each byte
+    qadd8       r3, r3, r6                  ; += q0 - p0
+    ldr         r8, c0x03030303             ; r8 = 3 in each byte
+    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0))
+    ;STALL
+    and         r3, r3, r10                 ; vp8_filter &= mask
+
+    qadd8       r7 , r3 , r7                ; Filter1 = vp8_filter + 4
+    qadd8       r8 , r3 , r8                ; Filter2 = vp8_filter + 3
+
+    shadd8      r7 , r7 , lr                ; signed halving add with 0 = per-byte arithmetic >> 1
+    shadd8      r8 , r8 , lr
+    shadd8      r7 , r7 , lr
+    shadd8      r8 , r8 , lr
+    shadd8      r7 , r7 , lr                ; Filter1 >>= 3
+    shadd8      r8 , r8 , lr                ; Filter2 >>= 3
+
+    qsub8       r5 ,r5, r7                  ; u = q0 - Filter1
+    qadd8       r4, r4, r8                  ; u = p0 + Filter2
+    eor         r5, r5, r2                  ; *oq0 = u^0x80
+    str         r5, [src]                   ; store oq0 result
+    eor         r4, r4, r2                  ; *op0 = u^0x80
+    str         r4, [src, -pstep]           ; store op0 result
+
+|simple_hskip_filter|
+    subs        r9, r9, #1                  ; one pass done; Z is set after the last one
+    addne       src, src, #4                ; next 4 pixels along the edge
+
+    ldrne       r3, [src, -pstep, lsl #1]   ; p1
+    ldrne       r4, [src, -pstep]           ; p0
+    ldrne       r5, [src]                   ; q0
+    ldrne       r6, [src, pstep]            ; q1
+
+    bne         simple_hnext8
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+    ; Filters 16 rows across a vertical edge: each pass loads 4 rows of src[-2..1] and transposes them.
+    ldrb        r12, [r2]                   ; r12: blimit
+    ldr         r2, c0x80808080             ; r2 = 0x80 per byte (blimit pointer no longer needed)
+    orr         r12, r12, r12, lsl #8
+
+    ; load source data (4 rows x 4 pixels, two halfwords per row); packed into r7, r8, r9, r10
+    ldrh        r3, [src, #-2]              ; row 0: pixels -2, -1
+    pld         [src, #23]                  ; preload for next block
+    ldrh        r4, [src], pstep            ; row 0: pixels 0, 1; step to the next row
+    orr         r12, r12, r12, lsl #16      ; blimit replicated into all 4 bytes
+
+    ldrh        r5, [src, #-2]
+    pld         [src, #23]
+    ldrh        r6, [src], pstep
+
+    pkhbt       r7, r3, r4, lsl #16         ; r7 = row 0 (4 pixels)
+
+    ldrh        r3, [src, #-2]
+    pld         [src, #23]
+    ldrh        r4, [src], pstep
+
+    pkhbt       r8, r5, r6, lsl #16         ; r8 = row 1 (4 pixels)
+
+    ldrh        r5, [src, #-2]
+    pld         [src, #23]
+    ldrh        r6, [src], pstep
+    mov         r11, #4                     ; loop count: 4 passes x 4 rows = 16 rows
+
+|simple_vnext8|
+    ; vp8_simple_filter_mask() function
+    pkhbt       r9, r3, r4, lsl #16         ; r9 = row 2 (4 pixels)
+    pkhbt       r10, r5, r6, lsl #16        ; r10 = row 3 (4 pixels)
+
+    ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
+    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
+
+    uqsub8      r7, r3, r6                  ; p1 - q1
+    uqsub8      r8, r6, r3                  ; q1 - p1
+    uqsub8      r9, r4, r5                  ; p0 - q0
+    uqsub8      r10, r5, r4                 ; q0 - p0
+    orr         r7, r7, r8                  ; abs(p1 - q1)
+    orr         r9, r9, r10                 ; abs(p0 - q0)
+    mov         r8, #0
+    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
+    uhadd8      r7, r7, r8                  ; abs(p1 - q1) / 2
+    uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+    mvn         r10, #0                     ; r10 == -1
+
+    usub8       r7, r12, r7                 ; compare to blimit. usub8 sets GE flags
+    sel         lr, r10, r8                 ; filter mask: 0xFF where blimit >= sum, else 0
+
+    cmp         lr, #0
+    beq         simple_vskip_filter         ; skip filtering
+
+    ;vp8_simple_filter() function
+    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
+    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
+    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
+    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
+
+    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
+    qsub8       r6, r5, r4                  ; q0 - p0
+
+    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
+    ldr         r9, c0x03030303             ; r9 = 3
+
+    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
+    ldr         r7, c0x04040404             ; r7 = 4 in each byte
+
+    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0))
+    ;STALL
+    and         r3, r3, lr                  ; vp8_filter &= mask
+
+    qadd8       r9 , r3 , r9                ; Filter2 = vp8_filter + 3
+    qadd8       r3 , r3 , r7                ; Filter1 = vp8_filter + 4
+
+    shadd8      r9 , r9 , r8                ; signed halving add with 0 = per-byte arithmetic >> 1
+    shadd8      r3 , r3 , r8
+    shadd8      r9 , r9 , r8
+    shadd8      r3 , r3 , r8
+    shadd8      r9 , r9 , r8                ; Filter2 >>= 3
+    shadd8      r3 , r3 , r8                ; Filter1 >>= 3
+
+    ;calculate output
+    sub         src, src, pstep, lsl #2     ; rewind src to the first of the 4 rows just loaded
+
+    qadd8       r4, r4, r9                  ; u = p0 + Filter2
+    qsub8       r5, r5, r3                  ; u = q0 - Filter1
+    eor         r4, r4, r2                  ; *op0 = u^0x80
+    eor         r5, r5, r2                  ; *oq0 = u^0x80
+
+    strb        r4, [src, #-1]              ; store op0 for row 0
+    mov         r4, r4, lsr #8              ; next row's op0 into the low byte
+    strb        r5, [src], pstep            ; store oq0 for row 0, step to the next row
+    mov         r5, r5, lsr #8              ; next row's oq0 into the low byte
+
+    strb        r4, [src, #-1]
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    strb        r5, [src], pstep
+
+|simple_vskip_filter|
+    subs        r11, r11, #1                ; one pass done; Z is set after the last one
+
+    ; load source data for the next 4 rows (skipped after the last pass); packed into r7, r8, r9, r10
+    ldrneh      r3, [src, #-2]
+    pld         [src, #23]                  ; preload for next block
+    ldrneh      r4, [src], pstep
+
+    ldrneh      r5, [src, #-2]
+    pld         [src, #23]
+    ldrneh      r6, [src], pstep
+
+    pkhbt       r7, r3, r4, lsl #16         ; r7 = row 0 of the next pass (unconditional; unused after the last pass)
+
+    ldrneh      r3, [src, #-2]
+    pld         [src, #23]
+    ldrneh      r4, [src], pstep
+
+    pkhbt       r8, r5, r6, lsl #16         ; r8 = row 1 of the next pass (unconditional; unused after the last pass)
+
+    ldrneh      r5, [src, #-2]
+    pld         [src, #23]
+    ldrneh      r6, [src], pstep
+
+    bne         simple_vnext8
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_loop_filter_simple_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD     0x80808080
+c0x03030303 DCD     0x03030303
+c0x04040404 DCD     0x04040404
+
+    END
diff --git a/libs/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/libs/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
new file mode 100644
index 0000000000..e81aef53d5
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -0,0 +1,273 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict8x4_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------------
+; r0    unsigned char *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack unsigned char *dst_ptr,
+; stack int  dst_pitch
+;-------------------------------------
+;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184.
+;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack,
+;and the result is stored in transpose.
+|vp8_sixtap_predict8x4_armv6| PROC
+    stmdb       sp!, {r4 - r11, lr}
+    str         r3, [sp, #-184]!            ;reserve space on stack for temporary storage, store yoffset
+
+    cmp         r2, #0                      ;skip first_pass filter if xoffset=0
+    add         lr, sp, #4                  ;point to temporary buffer
+    beq         skip_firstpass_filter
+
+;first-pass filter
+    adr         r12, filter8_coeff
+    sub         r0, r0, r1, lsl #1          ; start two rows above src
+
+    add         r3, r1, #10                 ; preload next row
+    pld         [r0, r3]
+
+    add         r2, r12, r2, lsl #4         ;calculate filter location
+    add         r0, r0, #3                  ;adjust src only for loading convenience
+
+    ldr         r3, [r2]                    ; load up packed filter coefficients
+    ldr         r4, [r2, #4]
+    ldr         r5, [r2, #8]
+
+    mov         r2, #0x90000                ; height=9 is top part of counter
+
+    sub         r1, r1, #8                  ; stride minus the 8 bytes consumed per row by the post-incremented loads
+
+|first_pass_hloop_v6|
+    ldrb        r6, [r0, #-5]               ; load source data
+    ldrb        r7, [r0, #-4]
+    ldrb        r8, [r0, #-3]
+    ldrb        r9, [r0, #-2]
+    ldrb        r10, [r0, #-1]
+
+    orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2
+
+    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
+    pkhbt       r7, r7, r8, lsl #16         ; r8 | r7
+
+    pkhbt       r8, r8, r9, lsl #16         ; r9 | r8
+    pkhbt       r9, r9, r10, lsl #16        ; r10 | r9
+
+|first_pass_wloop_v6|
+    smuad       r11, r6, r3                 ; vp8_filter[0], vp8_filter[1]
+    smuad       r12, r7, r3
+
+    ldrb        r6, [r0], #1
+
+    smlad       r11, r8, r4, r11            ; vp8_filter[2], vp8_filter[3]
+    ldrb        r7, [r0], #1
+    smlad       r12, r9, r4, r12
+
+    pkhbt       r10, r10, r6, lsl #16       ; r10 | r9
+    pkhbt       r6, r6, r7, lsl #16         ; r11 | r10
+    smlad       r11, r10, r5, r11           ; vp8_filter[4], vp8_filter[5]
+    smlad       r12, r6, r5, r12
+
+    sub         r2, r2, #1                  ; decrement the width counter (low byte of r2)
+
+    add         r11, r11, #0x40             ; round_shift_and_clamp
+    tst         r2, #0xff                   ; test loop counter
+    usat        r11, #8, r11, asr #7
+    add         r12, r12, #0x40
+    strh        r11, [lr], #20              ; result is transposed and stored, which
+    usat        r12, #8, r12, asr #7
+
+    strh        r12, [lr], #20
+
+    movne       r11, r6
+    movne       r12, r7
+
+    movne       r6, r8
+    movne       r7, r9
+    movne       r8, r10
+    movne       r9, r11
+    movne       r10, r12
+
+    bne         first_pass_wloop_v6
+
+    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
+    ;;IF ARCHITECTURE=6
+    ;pld        [src, ppl]
+    ;;pld       [src, r9]
+    ;;ENDIF
+
+    subs        r2, r2, #0x10000            ; decrement the height counter (upper half of r2)
+
+    sub         lr, lr, #158                ; back to the first buffer line, one column (2 bytes) over: 8*20 - 2
+
+    add         r0, r0, r1                  ; move to next input line
+
+    add         r11, r1, #18                ; preload next row. adding back block width(=8), which is subtracted earlier
+    pld         [r0, r11]
+
+    bne         first_pass_hloop_v6
+
+;second pass filter
+secondpass_filter
+    ldr         r3, [sp], #4                ; load back yoffset
+    ldr         r0, [sp, #216]              ; load dst address from stack 180+36
+    ldr         r1, [sp, #220]              ; load dst stride from stack 180+40
+
+    cmp         r3, #0
+    beq         skip_secondpass_filter
+
+    adr         r12, filter8_coeff
+    add         lr, r12, r3, lsl #4         ;calculate filter location
+
+    mov         r2, #0x00080000             ; outer loop count = 8 (one per 20-byte buffer line), in the upper half of r2
+
+    ldr         r3, [lr]                    ; load up packed filter coefficients
+    ldr         r4, [lr, #4]
+    ldr         r5, [lr, #8]
+
+    pkhbt       r12, r4, r3                 ; pack the filter differently
+    pkhbt       r11, r5, r4
+
+second_pass_hloop_v6
+    ldr         r6, [sp]                    ; load the data
+    ldr         r7, [sp, #4]
+
+    orr         r2, r2, #2                  ; loop counter
+
+second_pass_wloop_v6
+    smuad       lr, r3, r6                  ; apply filter
+    smulbt      r10, r3, r6
+
+    ldr         r8, [sp, #8]
+
+    smlad       lr, r4, r7, lr
+    smladx      r10, r12, r7, r10
+
+    ldrh        r9, [sp, #12]
+
+    smlad       lr, r5, r8, lr
+    smladx      r10, r11, r8, r10
+
+    add         sp, sp, #4                  ; advance the read pointer by two 16-bit samples
+    smlatb      r10, r5, r9, r10
+
+    sub         r2, r2, #1                  ; decrement the inner counter (low byte of r2)
+
+    add         lr, lr, #0x40               ; round_shift_and_clamp
+    tst         r2, #0xff
+    usat        lr, #8, lr, asr #7
+    add         r10, r10, #0x40
+    strb        lr, [r0], r1                ; the result is transposed back and stored
+    usat        r10, #8, r10, asr #7
+
+    strb        r10, [r0],r1
+
+    movne       r6, r7
+    movne       r7, r8
+
+    bne         second_pass_wloop_v6
+
+    subs        r2, r2, #0x10000            ; decrement the outer counter (upper half of r2)
+    add         sp, sp, #12                 ; update src for next loop (20-8)
+    sub         r0, r0, r1, lsl #2          ; back up the 4 output rows just written
+    add         r0, r0, #1                  ; next output column
+
+    bne         second_pass_hloop_v6
+
+    add         sp, sp, #20                 ; release the rest of the 184-byte temp area: 184 - (4 + 8*20)
+    ldmia       sp!, {r4 - r11, pc}
+
+;--------------------
+skip_firstpass_filter
+    sub         r0, r0, r1, lsl #1          ; start two rows above src
+    sub         r1, r1, #8                  ; stride minus the 8 bytes read per row
+    mov         r2, #9                      ; 9 source rows to copy
+
+skip_firstpass_hloop
+    ldrb        r4, [r0], #1                ; load data
+    subs        r2, r2, #1
+    ldrb        r5, [r0], #1
+    strh        r4, [lr], #20               ; store it to immediate buffer
+    ldrb        r6, [r0], #1                ; load data
+    strh        r5, [lr], #20
+    ldrb        r7, [r0], #1
+    strh        r6, [lr], #20
+    ldrb        r8, [r0], #1
+    strh        r7, [lr], #20
+    ldrb        r9, [r0], #1
+    strh        r8, [lr], #20
+    ldrb        r10, [r0], #1
+    strh        r9, [lr], #20
+    ldrb        r11, [r0], #1
+    strh        r10, [lr], #20
+    add         r0, r0, r1                  ; move to next input line
+    strh        r11, [lr], #20
+
+    sub         lr, lr, #158                ; move over to next column
+    bne         skip_firstpass_hloop
+
+    b           secondpass_filter
+
+;--------------------
+skip_secondpass_filter
+    mov         r2, #8                      ; 8 buffer lines, one per output column
+    add         sp, sp, #4                  ;start from src[0] instead of src[-2]
+
+skip_secondpass_hloop
+    ldr         r6, [sp], #4
+    subs        r2, r2, #1
+    ldr         r8, [sp], #4
+
+    mov         r7, r6, lsr #16             ; unpack
+    strb        r6, [r0], r1
+    mov         r9, r8, lsr #16
+    strb        r7, [r0], r1
+    add         sp, sp, #12                 ; 20-8
+    strb        r8, [r0], r1
+    strb        r9, [r0], r1
+
+    sub         r0, r0, r1, lsl #2          ; back up the 4 output rows just written
+    add         r0, r0, #1                  ; next output column
+
+    bne         skip_secondpass_hloop
+
+    add         sp, sp, #16                 ; 180 - (160 +4)
+
+    ldmia       sp!, {r4 - r11, pc}
+
+    ENDP
+
+;-----------------
+;Packed 6-tap filters: each row is 4 words = three (tap[2k+1] << 16 | tap[2k]) halfword pairs plus one zero pad word.
+;Row n starts at filter8_coeff + 16*n; the code above indexes it with xoffset / yoffset shifted left by 4.
+filter8_coeff
+    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
+    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
+    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
+    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
+    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
+    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
+    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
+    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000
+    ; the same taps unpacked, one filter per row (reference only, not assembled):
+    ;DCD        0,  0,  128,    0,   0,  0
+    ;DCD        0, -6,  123,   12,  -1,  0
+    ;DCD        2, -11, 108,   36,  -8,  1
+    ;DCD        0, -9,   93,   50,  -6,  0
+    ;DCD        3, -16,  77,   77, -16,  3
+    ;DCD        0, -6,   50,   93,  -9,  0
+    ;DCD        1, -8,   36,  108, -11,  2
+    ;DCD        0, -1,   12,  123,  -6,  0
+
+    END
diff --git a/libs/libvpx/vp8/common/arm/bilinearfilter_arm.c b/libs/libvpx/vp8/common/arm/bilinearfilter_arm.c
new file mode 100644
index 0000000000..799c8bd964
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/bilinearfilter_arm.c
@@ -0,0 +1,113 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include <math.h>
+#include "vp8/common/filter.h"
+#include "bilinearfilter_arm.h"
+
+/*
+ * Two-pass bilinear filter: a horizontal pass over Height + 1 rows into a
+ * 16-bit scratch buffer, followed by a vertical pass that writes dst_ptr.
+ */
+void vp8_filter_block2d_bil_armv6(unsigned char *src_ptr,
+                                  unsigned char *dst_ptr,
+                                  unsigned int src_pitch,
+                                  unsigned int dst_pitch,
+                                  const short *HFilter,
+                                  const short *VFilter,
+                                  int Width, int Height)
+{
+    /* Scratch storage shared by the two passes. */
+    unsigned short intermediate[36 * 16];
+    const unsigned int first_pass_rows = Height + 1;
+
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, intermediate, src_pitch, first_pass_rows, Width, HFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(intermediate, dst_ptr, dst_pitch,
+                                             Height, Width, VFilter);
+}
+
+
+/* 4x4 bilinear prediction: picks the filter rows indexed by xoffset and
+ * yoffset and runs the two-pass ARMv6 bilinear filter on a 4x4 block. */
+void vp8_bilinear_predict4x4_armv6(unsigned char *src_ptr,
+                                   int src_pixels_per_line,
+                                   int xoffset,
+                                   int yoffset,
+                                   unsigned char *dst_ptr,
+                                   int dst_pitch)
+{
+    /* Coefficient rows from the shared bilinear filter table. */
+    const short *const horizontal_taps = vp8_bilinear_filters[xoffset];
+    const short *const vertical_taps = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr,
+                                 src_pixels_per_line, dst_pitch,
+                                 horizontal_taps, vertical_taps,
+                                 4, 4);
+}
+
+/* 8x8 bilinear prediction: picks the filter rows indexed by xoffset and
+ * yoffset and runs the two-pass ARMv6 bilinear filter on an 8x8 block. */
+void vp8_bilinear_predict8x8_armv6(unsigned char *src_ptr,
+                                   int src_pixels_per_line,
+                                   int xoffset,
+                                   int yoffset,
+                                   unsigned char *dst_ptr,
+                                   int dst_pitch)
+{
+    /* Coefficient rows from the shared bilinear filter table. */
+    const short *const horizontal_taps = vp8_bilinear_filters[xoffset];
+    const short *const vertical_taps = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr,
+                                 src_pixels_per_line, dst_pitch,
+                                 horizontal_taps, vertical_taps,
+                                 8, 8);
+}
+
+/* 8x4 bilinear prediction: picks the filter rows indexed by xoffset and
+ * yoffset and runs the two-pass ARMv6 bilinear filter (Width 8, Height 4). */
+void vp8_bilinear_predict8x4_armv6(unsigned char *src_ptr,
+                                   int src_pixels_per_line,
+                                   int xoffset,
+                                   int yoffset,
+                                   unsigned char *dst_ptr,
+                                   int dst_pitch)
+{
+    /* Coefficient rows from the shared bilinear filter table. */
+    const short *const horizontal_taps = vp8_bilinear_filters[xoffset];
+    const short *const vertical_taps = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr,
+                                 src_pixels_per_line, dst_pitch,
+                                 horizontal_taps, vertical_taps,
+                                 8, 4);
+}
+
+/* 16x16 bilinear prediction: picks the filter rows indexed by xoffset and
+ * yoffset and runs the two-pass ARMv6 bilinear filter on a 16x16 block. */
+void vp8_bilinear_predict16x16_armv6(unsigned char *src_ptr,
+                                     int src_pixels_per_line,
+                                     int xoffset,
+                                     int yoffset,
+                                     unsigned char *dst_ptr,
+                                     int dst_pitch)
+{
+    /* Coefficient rows from the shared bilinear filter table. */
+    const short *const horizontal_taps = vp8_bilinear_filters[xoffset];
+    const short *const vertical_taps = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr,
+                                 src_pixels_per_line, dst_pitch,
+                                 horizontal_taps, vertical_taps,
+                                 16, 16);
+}
diff --git a/libs/libvpx/vp8/common/arm/bilinearfilter_arm.h b/libs/libvpx/vp8/common/arm/bilinearfilter_arm.h
new file mode 100644
index 0000000000..6b84e6f3b5
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/bilinearfilter_arm.h
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
+#define VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* First (horizontal) bilinear pass; bilinearfilter_arm.c calls it with height = Height + 1 and a 16-bit scratch buffer as dst_ptr. */
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+    const unsigned char  *src_ptr,
+    unsigned short       *dst_ptr,
+    unsigned int          src_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short          *vp8_filter
+);
+/* Second (vertical) bilinear pass; bilinearfilter_arm.c feeds it the 16-bit first-pass buffer as src_ptr and the 8-bit destination as dst_ptr. */
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+    const unsigned short *src_ptr,
+    unsigned char        *dst_ptr,
+    int                   dst_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short         *vp8_filter
+);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
diff --git a/libs/libvpx/vp8/common/arm/dequantize_arm.c b/libs/libvpx/vp8/common/arm/dequantize_arm.c
new file mode 100644
index 0000000000..1f8157f0b1
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/dequantize_arm.c
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8/common/blockd.h"
+
+#if HAVE_MEDIA
+/* Worker defined outside this file; receives (Q, DQC, DQ). */
+extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+
+/* Hands the block's qcoeff and dqcoeff buffers, plus DQC, to the v6 worker. */
+void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
+{
+    short *const dequantized = d->dqcoeff;
+    vp8_dequantize_b_loop_v6(d->qcoeff, DQC, dequantized);
+}
+#endif
diff --git a/libs/libvpx/vp8/common/arm/filter_arm.c b/libs/libvpx/vp8/common/arm/filter_arm.c
new file mode 100644
index 0000000000..d6a6781d86
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/filter_arm.c
@@ -0,0 +1,221 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include <math.h>
+#include "vp8/common/filter.h"
+#include "vpx_ports/mem.h"
+
+extern void vp8_filter_block2d_first_pass_armv6   /* first pass used by vp8_sixtap_predict4x4_armv6 */
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_width,
+    unsigned int output_height,
+    const short *vp8_filter
+);
+
+// 8x8 variant of the first pass, used by vp8_sixtap_predict8x8_armv6
+extern void vp8_filter_block2d_first_pass_8x8_armv6
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_width,
+    unsigned int output_height,
+    const short *vp8_filter
+);
+
+// 16x16 variant of the first pass, used by vp8_sixtap_predict16x16_armv6
+extern void vp8_filter_block2d_first_pass_16x16_armv6
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_width,
+    unsigned int output_height,
+    const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_second_pass_armv6   /* second pass the predictors below use for even yoffset */
+(
+    short         *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int output_pitch,
+    unsigned int cnt,
+    const short *vp8_filter
+);
+
+extern void vp8_filter4_block2d_second_pass_armv6   /* second pass the predictors below use for odd yoffset */
+(
+    short         *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int output_pitch,
+    unsigned int cnt,
+    const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_first_pass_only_armv6   /* called when xoffset != 0 and yoffset == 0 */
+(
+    unsigned char *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int cnt,
+    unsigned int output_pitch,
+    const short *vp8_filter
+);
+
+
+extern void vp8_filter_block2d_second_pass_only_armv6   /* called when xoffset == 0 and yoffset != 0 */
+(
+    unsigned char *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int cnt,
+    unsigned int output_pitch,
+    const short *vp8_filter
+);
+
+#if HAVE_MEDIA
+void vp8_sixtap_predict4x4_armv6
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    /* Tap sets selected by the horizontal / vertical sub-pel offsets. */
+    const short *const h_taps = vp8_sub_pel_filters[xoffset];
+    const short *const v_taps = vp8_sub_pel_filters[yoffset];
+    /* Scratch buffer handed from the first pass to the second pass. */
+    DECLARE_ALIGNED(4, short, scratch[12*4]);
+    /* Horizontal offset only: a single first pass writes dst_ptr directly. */
+    if (xoffset && !yoffset)
+    {
+        vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line,
+                                                 4, dst_pitch, h_taps);
+        return;
+    }
+
+    /* Vertical offset only: a single second pass writes dst_ptr directly. */
+    if (!xoffset && yoffset)
+    {
+        vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line,
+                                                  4, dst_pitch, v_taps);
+        return;
+    }
+
+    /* Every other combination runs both passes through the scratch buffer. */
+    if (yoffset & 0x1)
+    {
+        /* Odd yoffset: 4-tap second pass; first pass starts one row above, output height 7. */
+        vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, scratch + 1,
+                                            src_pixels_per_line, 4, 7, h_taps);
+        vp8_filter4_block2d_second_pass_armv6(scratch + 2, dst_ptr, dst_pitch, 4, v_taps);
+    }
+    else
+    {
+        /* Even yoffset: 6-tap second pass; first pass starts two rows above, output height 9. */
+        vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), scratch,
+                                            src_pixels_per_line, 4, 9, h_taps);
+        vp8_filter_block2d_second_pass_armv6(scratch + 2, dst_ptr, dst_pitch, 4, v_taps);
+    }
+}
+
+void vp8_sixtap_predict8x8_armv6
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short *const h_taps = vp8_sub_pel_filters[xoffset];   /* horizontal taps */
+    const short *const v_taps = vp8_sub_pel_filters[yoffset];   /* vertical taps */
+    DECLARE_ALIGNED(4, short, scratch[16*8]);   /* first-pass output for the two-pass case */
+    /* Horizontal offset only: the first pass writes dst_ptr directly. */
+    if (xoffset && !yoffset)
+    {
+        vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, h_taps);
+        return;
+    }
+
+    /* Vertical offset only: the second pass writes dst_ptr directly. */
+    if (!xoffset && yoffset)
+    {
+        vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, v_taps);
+        return;
+    }
+
+    if (yoffset & 0x1)
+    {
+        /* Odd yoffset: filter4 second pass; first pass starts one row above, output height 11. */
+        vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, scratch + 1, src_pixels_per_line, 8, 11, h_taps);
+        vp8_filter4_block2d_second_pass_armv6(scratch + 2, dst_ptr, dst_pitch, 8, v_taps);
+    }
+    else
+    {
+        /* Even yoffset: full second pass; first pass starts two rows above, output height 13. */
+        vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), scratch, src_pixels_per_line, 8, 13, h_taps);
+        vp8_filter_block2d_second_pass_armv6(scratch + 2, dst_ptr, dst_pitch, 8, v_taps);
+    }
+}
+
+
+void vp8_sixtap_predict16x16_armv6
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short *const h_taps = vp8_sub_pel_filters[xoffset];   /* horizontal taps */
+    const short *const v_taps = vp8_sub_pel_filters[yoffset];   /* vertical taps */
+    DECLARE_ALIGNED(4, short, scratch[24*16]);   /* first-pass output for the two-pass case */
+
+    /* Horizontal offset only: the first pass writes dst_ptr directly. */
+    if (xoffset && !yoffset)
+    {
+        vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, h_taps);
+        return;
+    }
+
+    /* Vertical offset only: the second pass writes dst_ptr directly. */
+    if (!xoffset && yoffset)
+    {
+        vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, v_taps);
+        return;
+    }
+
+    if (yoffset & 0x1)
+    {
+        /* Odd yoffset: filter4 second pass; first pass starts one row above, output height 19. */
+        vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, scratch + 1, src_pixels_per_line, 16, 19, h_taps);
+        vp8_filter4_block2d_second_pass_armv6(scratch + 2, dst_ptr, dst_pitch, 16, v_taps);
+    }
+    else
+    {
+        /* Even yoffset: full second pass; first pass starts two rows above, output height 21. */
+        vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), scratch, src_pixels_per_line, 16, 21, h_taps);
+        vp8_filter_block2d_second_pass_armv6(scratch + 2, dst_ptr, dst_pitch, 16, v_taps);
+    }
+}
+#endif
diff --git a/libs/libvpx/vp8/common/arm/loopfilter_arm.c b/libs/libvpx/vp8/common/arm/loopfilter_arm.c
new file mode 100644
index 0000000000..5840c2bbaa
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/loopfilter_arm.c
@@ -0,0 +1,181 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"
+
+#define prototype_loopfilter(sym) \
+    void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+             const unsigned char *limit, const unsigned char *thresh, int count)
+/* ARMv6 edge filters, defined outside this file: limits are passed by pointer, plus a count. */
+#if HAVE_MEDIA
+extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
+#endif  /* HAVE_MEDIA */
+
+#if HAVE_NEON
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh,
+        unsigned char *v);
+/* NEON edge filters use the signatures above: limits by value; the uv form takes both u and v. */
+extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+
+extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+#endif  /* HAVE_NEON */
+
+#if HAVE_MEDIA
+/* ARMV6/MEDIA loopfilter functions*/
+/* Horizontal MB edge: luma with count 2, then each non-NULL chroma plane with count 1. */
+void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    const unsigned char *const mblim   = lfi->mblim;
+    const unsigned char *const lim     = lfi->lim;
+    const unsigned char *const hev_thr = lfi->hev_thr;
+
+    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, mblim, lim, hev_thr, 2);
+    if (u_ptr) vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, mblim, lim, hev_thr, 1);
+    if (v_ptr) vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, mblim, lim, hev_thr, 1);
+}
+
+/* Vertical MB edge: luma with count 2, then each non-NULL chroma plane with count 1. */
+void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    const unsigned char *const mblim   = lfi->mblim;
+    const unsigned char *const lim     = lfi->lim;
+    const unsigned char *const hev_thr = lfi->hev_thr;
+
+    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, mblim, lim, hev_thr, 2);
+    if (u_ptr) vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, mblim, lim, hev_thr, 1);
+    if (v_ptr) vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, mblim, lim, hev_thr, 1);
+}
+
+/* Horizontal B edges: luma rows 4, 8 and 12 (count 2); row 4 of each non-NULL chroma plane (count 1). */
+void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    int row;
+
+    for (row = 4; row <= 12; row += 4)
+        vp8_loop_filter_horizontal_edge_armv6(y_ptr + row * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+    if (v_ptr)
+        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
+{
+    int row;   /* simple horizontal-edge filter at luma rows 4, 8 and 12 */
+    for (row = 4; row <= 12; row += 4)
+        vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + row * y_stride, y_stride, blimit);
+}
+
+/* Vertical B edges: luma columns 4, 8 and 12 (count 2); column 4 of each non-NULL chroma plane (count 1). */
+void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    int col;
+
+    for (col = 4; col <= 12; col += 4)
+        vp8_loop_filter_vertical_edge_armv6(y_ptr + col, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+    if (v_ptr)
+        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
+{
+    int col;   /* simple vertical-edge filter at luma columns 4, 8 and 12 */
+    for (col = 4; col <= 12; col += 4)
+        vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + col, y_stride, blimit);
+}
+#endif
+
+#if HAVE_NEON
+/* NEON loopfilter functions */
+/* Horizontal MB edge (NEON): luma always; u and v in one call when u_ptr is non-NULL. */
+void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    /* The NEON kernels take the limits by value, so dereference them once up front. */
+    const unsigned char mblim_val = *lfi->mblim;
+    const unsigned char lim_val   = *lfi->lim;
+    const unsigned char hev_val   = *lfi->hev_thr;
+
+    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim_val, lim_val, hev_val);
+    if (u_ptr) vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim_val, lim_val, hev_val, v_ptr);
+}
+
+/* Vertical MB edge (NEON): luma always; u and v in one call when u_ptr is non-NULL. */
+void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    /* The NEON kernels take the limits by value, so dereference them once up front. */
+    const unsigned char mblim_val = *lfi->mblim;
+    const unsigned char lim_val   = *lfi->lim;
+    const unsigned char hev_val   = *lfi->hev_thr;
+
+    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim_val, lim_val, hev_val);
+    if (u_ptr)
+        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim_val, lim_val, hev_val, v_ptr);
+}
+
+/* Horizontal B edges (NEON): luma rows 4, 8 and 12; chroma row 4 for u and v in one call. */
+void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    const unsigned char blim_val = *lfi->blim;
+    const unsigned char lim_val  = *lfi->lim;
+    const unsigned char hev_val  = *lfi->hev_thr;
+    int row;
+
+    for (row = 4; row <= 12; row += 4)
+        vp8_loop_filter_horizontal_edge_y_neon(y_ptr + row * y_stride, y_stride, blim_val, lim_val, hev_val);
+
+    if (u_ptr)
+        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim_val, lim_val, hev_val, v_ptr + 4 * uv_stride);
+}
+
+/* Vertical B edges (NEON): luma columns 4, 8 and 12; chroma column 4 for u and v in one call. */
+void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    const unsigned char blim_val = *lfi->blim;
+    const unsigned char lim_val  = *lfi->lim;
+    const unsigned char hev_val  = *lfi->hev_thr;
+    int col;
+
+    for (col = 4; col <= 12; col += 4)
+        vp8_loop_filter_vertical_edge_y_neon(y_ptr + col, y_stride, blim_val, lim_val, hev_val);
+
+    if (u_ptr)
+        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim_val, lim_val, hev_val, v_ptr + 4);
+}
+#endif
diff --git a/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
new file mode 100644
index 0000000000..9824a31936
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -0,0 +1,699 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const uint8_t bifilter4_coeff[8][2] = {  // indexed by xoffset / yoffset; each pair sums to 128
+    {128,   0},
+    {112,  16},
+    { 96,  32},
+    { 80,  48},
+    { 64,  64},
+    { 48,  80},
+    { 32,  96},
+    { 16, 112}
+};
+
+void vp8_bilinear_predict4x4_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8;
+    uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8;
+    uint8x16_t q1u8, q2u8;
+    uint16x8_t q1u16, q2u16;
+    uint16x8_t q7u16, q8u16, q9u16;
+    uint64x2_t q4u64, q5u64;
+    uint64x1_t d12u64;
+    uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2;
+    // Pass 1 blends each pixel with the next byte in its row; pass 2 blends each row with the row below.
+    if (xoffset == 0) {  // skip_1stpass_filter
+        uint32x2_t d28u32 = vdup_n_u32(0);
+        uint32x2_t d29u32 = vdup_n_u32(0);
+        uint32x2_t d30u32 = vdup_n_u32(0);
+        // Gather 4 bytes from each of 5 rows. NOTE(review): src_ptr is cast to uint32_t* - confirm alignment/aliasing is acceptable.
+        d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0);
+        src_ptr += src_pixels_per_line;
+        d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1);
+        src_ptr += src_pixels_per_line;
+        d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0);
+        src_ptr += src_pixels_per_line;
+        d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1);
+        src_ptr += src_pixels_per_line;
+        d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0);
+        d28u8 = vreinterpret_u8_u32(d28u32);
+        d29u8 = vreinterpret_u8_u32(d29u32);
+        d30u8 = vreinterpret_u8_u32(d30u32);
+    } else {
+        d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d6u8 = vld1_u8(src_ptr);
+
+        q1u8 = vcombine_u8(d2u8, d3u8);
+        q2u8 = vcombine_u8(d4u8, d5u8);
+
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+        // Shift each 64-bit lane right by 8 bits to form the input for the second coefficient.
+        q4u64  = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
+        q5u64  = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
+        d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8);
+        // Zip the low 32 bits of two rows into one vector; only val[0] of each zip is used below.
+        d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)),
+                           vreinterpret_u32_u8(vget_high_u8(q1u8)));
+        d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)),
+                           vreinterpret_u32_u8(vget_high_u8(q2u8)));
+        d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)),
+                           vreinterpret_u32_u64(vget_high_u64(q4u64)));
+        d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),
+                           vreinterpret_u32_u64(vget_high_u64(q5u64)));
+
+        q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
+        q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
+        q9u16 = vmull_u8(d6u8, d0u8);
+
+        q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8);
+        q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8);
+        q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8);
+        // Rounding narrow by 7 bits back to 8-bit (the two coefficients sum to 128).
+        d28u8 = vqrshrn_n_u16(q7u16, 7);
+        d29u8 = vqrshrn_n_u16(q8u16, 7);
+        d30u8 = vqrshrn_n_u16(q9u16, 7);
+    }
+
+    // Second pass over d28..d30; with yoffset == 0 the four first-pass rows are stored unchanged.
+    if (yoffset == 0) {  // skip_2ndpass_filter
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 1);
+    } else {
+        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+        q1u16 = vmull_u8(d28u8, d0u8);
+        q2u16 = vmull_u8(d29u8, d0u8);
+        // d26/d27 hold the rows one below those in d28/d29 (4-byte extract across adjacent registers).
+        d26u8 = vext_u8(d28u8, d29u8, 4);
+        d27u8 = vext_u8(d29u8, d30u8, 4);
+
+        q1u16 = vmlal_u8(q1u16, d26u8, d1u8);
+        q2u16 = vmlal_u8(q2u16, d27u8, d1u8);
+
+        d2u8 = vqrshrn_n_u16(q1u16, 7);
+        d3u8 = vqrshrn_n_u16(q2u16, 7);
+
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
+    }
+    return;
+}
+
+void vp8_bilinear_predict8x4_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8;
+    uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8;
+    uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
+    uint16x8_t q1u16, q2u16, q3u16, q4u16;
+    uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
+    // Pass 1 blends each pixel with the next byte in its row; pass 2 blends each row with the row below.
+    if (xoffset == 0) {  // skip_1stpass_filter
+        d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d26u8 = vld1_u8(src_ptr);
+    } else {
+        q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q5u8 = vld1q_u8(src_ptr);
+
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+        q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+        q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+        q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+        q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+        q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+        // vext by 1 yields bytes 1..8 of each row, the input for the second coefficient.
+        d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+        d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+        d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+        d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+        d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+
+        q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+        q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+        q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+        q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+        q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
+        // Rounding narrow by 7 bits back to 8-bit (the two coefficients sum to 128).
+        d22u8 = vqrshrn_n_u16(q6u16, 7);
+        d23u8 = vqrshrn_n_u16(q7u16, 7);
+        d24u8 = vqrshrn_n_u16(q8u16, 7);
+        d25u8 = vqrshrn_n_u16(q9u16, 7);
+        d26u8 = vqrshrn_n_u16(q10u16, 7);
+    }
+
+    // Second pass over d22..d26 (5 rows in, 4 rows out); with yoffset == 0 rows d22..d25 are stored unchanged.
+    if (yoffset == 0) {  // skip_2ndpass_filter
+        vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d25u8);
+    } else {
+        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+        q1u16 = vmull_u8(d22u8, d0u8);
+        q2u16 = vmull_u8(d23u8, d0u8);
+        q3u16 = vmull_u8(d24u8, d0u8);
+        q4u16 = vmull_u8(d25u8, d0u8);
+
+        q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
+        q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
+        q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
+        q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
+
+        d2u8 = vqrshrn_n_u16(q1u16, 7);
+        d3u8 = vqrshrn_n_u16(q2u16, 7);
+        d4u8 = vqrshrn_n_u16(q3u16, 7);
+        d5u8 = vqrshrn_n_u16(q4u16, 7);
+
+        vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d5u8);
+    }
+    return;
+}
+
+void vp8_bilinear_predict8x8_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8;
+    uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8;
+    uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
+    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16;
+    uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
+    // Pass 1 blends each pixel with the next byte in its row; pass 2 blends each row with the row below.
+    if (xoffset == 0) {  // skip_1stpass_filter
+        d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d30u8 = vld1_u8(src_ptr);
+    } else {
+        q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+        q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+        q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+        q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+        q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+        // vext by 1 yields bytes 1..8 of each row, the input for the second coefficient.
+        d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+        d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+        d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+        d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+
+        q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+        q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+        q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+        q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+        // Rounding narrow by 7 bits back to 8-bit (the two coefficients sum to 128).
+        d22u8 = vqrshrn_n_u16(q6u16, 7);
+        d23u8 = vqrshrn_n_u16(q7u16, 7);
+        d24u8 = vqrshrn_n_u16(q8u16, 7);
+        d25u8 = vqrshrn_n_u16(q9u16, 7);
+
+        // first pass on the remaining 5 rows (9 input rows in total)
+        q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q5u8 = vld1q_u8(src_ptr);
+
+        q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+        q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+        q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+        q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+        q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+
+        d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+        d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+        d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+        d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+        d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+
+        q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+        q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+        q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+        q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+        q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
+
+        d26u8 = vqrshrn_n_u16(q6u16, 7);
+        d27u8 = vqrshrn_n_u16(q7u16, 7);
+        d28u8 = vqrshrn_n_u16(q8u16, 7);
+        d29u8 = vqrshrn_n_u16(q9u16, 7);
+        d30u8 = vqrshrn_n_u16(q10u16, 7);
+    }
+
+    // Second pass over d22..d30 (9 rows in, 8 rows out); with yoffset == 0 rows d22..d29 are stored unchanged.
+    if (yoffset == 0) {  // skip_2ndpass_filter
+        vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d29u8);
+    } else {
+        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+        q1u16 = vmull_u8(d22u8, d0u8);
+        q2u16 = vmull_u8(d23u8, d0u8);
+        q3u16 = vmull_u8(d24u8, d0u8);
+        q4u16 = vmull_u8(d25u8, d0u8);
+        q5u16 = vmull_u8(d26u8, d0u8);
+        q6u16 = vmull_u8(d27u8, d0u8);
+        q7u16 = vmull_u8(d28u8, d0u8);
+        q8u16 = vmull_u8(d29u8, d0u8);
+
+        q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
+        q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
+        q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
+        q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
+        q5u16 = vmlal_u8(q5u16, d27u8, d1u8);
+        q6u16 = vmlal_u8(q6u16, d28u8, d1u8);
+        q7u16 = vmlal_u8(q7u16, d29u8, d1u8);
+        q8u16 = vmlal_u8(q8u16, d30u8, d1u8);
+
+        d2u8 = vqrshrn_n_u16(q1u16, 7);
+        d3u8 = vqrshrn_n_u16(q2u16, 7);
+        d4u8 = vqrshrn_n_u16(q3u16, 7);
+        d5u8 = vqrshrn_n_u16(q4u16, 7);
+        d6u8 = vqrshrn_n_u16(q5u16, 7);
+        d7u8 = vqrshrn_n_u16(q6u16, 7);
+        d8u8 = vqrshrn_n_u16(q7u16, 7);
+        d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+        vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d9u8);
+    }
+    return;
+}
+
+void vp8_bilinear_predict16x16_neon(
+        unsigned char *src_ptr,  // start of the first source row
+        int src_pixels_per_line,  // byte step between source rows
+        int xoffset,  // picks the horizontal tap pair from bifilter4_coeff
+        int yoffset,  // picks the vertical tap pair from bifilter4_coeff
+        unsigned char *dst_ptr,  // receives 16 rows of 16 bytes
+        int dst_pitch) {  // byte step between destination rows
+    int i;
+    unsigned char tmp[272];  // 17 rows x 16 bytes of first-pass output
+    unsigned char *tmpp;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
+    uint8x8_t d19u8, d20u8, d21u8;
+    uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+    uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8;
+    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
+    uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
+    // Every output is (a * tap0 + b * tap1 + 64) >> 7, saturated to 8 bits.
+    if (xoffset == 0) {  // secondpass_bfilter16x16_only
+        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+        // q11u8 always holds the row above the next group of four rows
+        q11u8 = vld1q_u8(src_ptr);
+        src_ptr += src_pixels_per_line;
+        for (i = 4; i > 0; i--) {
+            q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+            q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+            q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+            q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+
+            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+            // accumulate the row below each one, weighted by the second tap
+            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+            d2u8 = vqrshrn_n_u16(q1u16, 7);
+            d3u8 = vqrshrn_n_u16(q2u16, 7);
+            d4u8 = vqrshrn_n_u16(q3u16, 7);
+            d5u8 = vqrshrn_n_u16(q4u16, 7);
+            d6u8 = vqrshrn_n_u16(q5u16, 7);
+            d7u8 = vqrshrn_n_u16(q6u16, 7);
+            d8u8 = vqrshrn_n_u16(q7u16, 7);
+            d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+            q1u8 = vcombine_u8(d2u8, d3u8);
+            q2u8 = vcombine_u8(d4u8, d5u8);
+            q3u8 = vcombine_u8(d6u8, d7u8);
+            q4u8 = vcombine_u8(d8u8, d9u8);
+            // the last row loaded becomes the top row of the next group
+            q11u8 = q15u8;
+
+            vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
+            vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
+            vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
+            vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
+        }
+        return;
+    }
+
+    if (yoffset == 0) {  // firstpass_bfilter16x16_only
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+        // 24 bytes are read per row: output column 15 also needs source byte 16
+        for (i = 4; i > 0 ; i--) {
+            d2u8 = vld1_u8(src_ptr);
+            d3u8 = vld1_u8(src_ptr + 8);
+            d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+            d5u8 = vld1_u8(src_ptr);
+            d6u8 = vld1_u8(src_ptr + 8);
+            d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+            d8u8 = vld1_u8(src_ptr);
+            d9u8 = vld1_u8(src_ptr + 8);
+            d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+            d11u8 = vld1_u8(src_ptr);
+            d12u8 = vld1_u8(src_ptr + 8);
+            d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+            q7u16  = vmull_u8(d2u8, d0u8);
+            q8u16  = vmull_u8(d3u8, d0u8);
+            q9u16  = vmull_u8(d5u8, d0u8);
+            q10u16 = vmull_u8(d6u8, d0u8);
+            q11u16 = vmull_u8(d8u8, d0u8);
+            q12u16 = vmull_u8(d9u8, d0u8);
+            q13u16 = vmull_u8(d11u8, d0u8);
+            q14u16 = vmull_u8(d12u8, d0u8);
+            // low halves shifted by one pixel (bytes 1..8 of each row)
+            d2u8  = vext_u8(d2u8, d3u8, 1);
+            d5u8  = vext_u8(d5u8, d6u8, 1);
+            d8u8  = vext_u8(d8u8, d9u8, 1);
+            d11u8 = vext_u8(d11u8, d12u8, 1);
+
+            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
+            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
+            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+            // high halves shifted by one pixel (bytes 9..16 of each row)
+            d3u8  = vext_u8(d3u8, d4u8, 1);
+            d6u8  = vext_u8(d6u8, d7u8, 1);
+            d9u8  = vext_u8(d9u8, d10u8, 1);
+            d12u8 = vext_u8(d12u8, d13u8, 1);
+
+            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
+            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+            d14u8 = vqrshrn_n_u16(q7u16, 7);
+            d15u8 = vqrshrn_n_u16(q8u16, 7);
+            d16u8 = vqrshrn_n_u16(q9u16, 7);
+            d17u8 = vqrshrn_n_u16(q10u16, 7);
+            d18u8 = vqrshrn_n_u16(q11u16, 7);
+            d19u8 = vqrshrn_n_u16(q12u16, 7);
+            d20u8 = vqrshrn_n_u16(q13u16, 7);
+            d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+            q7u8 = vcombine_u8(d14u8, d15u8);
+            q8u8 = vcombine_u8(d16u8, d17u8);
+            q9u8 = vcombine_u8(d18u8, d19u8);
+            q10u8 =vcombine_u8(d20u8, d21u8);
+
+            vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch;
+            vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch;
+            vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch;
+            vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch;
+        }
+        return;
+    }
+    // Both offsets are non-zero here: horizontal pass into tmp, then vertical.
+    d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+    d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+    // preload source rows 0-3 (24 bytes each)
+    d2u8 = vld1_u8(src_ptr);
+    d3u8 = vld1_u8(src_ptr + 8);
+    d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+    d5u8 = vld1_u8(src_ptr);
+    d6u8 = vld1_u8(src_ptr + 8);
+    d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+    d8u8 = vld1_u8(src_ptr);
+    d9u8 = vld1_u8(src_ptr + 8);
+    d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+    d11u8 = vld1_u8(src_ptr);
+    d12u8 = vld1_u8(src_ptr + 8);
+    d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+    // First Pass: output_height lines x output_width columns (17x16)
+    tmpp = tmp;
+    for (i = 3; i > 0; i--) {  // 3 x 4 = 12 rows; each pass also loads the next 4
+        q7u16  = vmull_u8(d2u8, d0u8);
+        q8u16  = vmull_u8(d3u8, d0u8);
+        q9u16  = vmull_u8(d5u8, d0u8);
+        q10u16 = vmull_u8(d6u8, d0u8);
+        q11u16 = vmull_u8(d8u8, d0u8);
+        q12u16 = vmull_u8(d9u8, d0u8);
+        q13u16 = vmull_u8(d11u8, d0u8);
+        q14u16 = vmull_u8(d12u8, d0u8);
+
+        d2u8  = vext_u8(d2u8, d3u8, 1);
+        d5u8  = vext_u8(d5u8, d6u8, 1);
+        d8u8  = vext_u8(d8u8, d9u8, 1);
+        d11u8 = vext_u8(d11u8, d12u8, 1);
+
+        q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
+        q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
+        q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+        q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+        d3u8  = vext_u8(d3u8, d4u8, 1);
+        d6u8  = vext_u8(d6u8, d7u8, 1);
+        d9u8  = vext_u8(d9u8, d10u8, 1);
+        d12u8 = vext_u8(d12u8, d13u8, 1);
+
+        q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
+        q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+        q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+        q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+        d14u8 = vqrshrn_n_u16(q7u16, 7);
+        d15u8 = vqrshrn_n_u16(q8u16, 7);
+        d16u8 = vqrshrn_n_u16(q9u16, 7);
+        d17u8 = vqrshrn_n_u16(q10u16, 7);
+        d18u8 = vqrshrn_n_u16(q11u16, 7);
+        d19u8 = vqrshrn_n_u16(q12u16, 7);
+        d20u8 = vqrshrn_n_u16(q13u16, 7);
+        d21u8 = vqrshrn_n_u16(q14u16, 7);
+        // fetch the next four source rows before storing this group
+        d2u8 = vld1_u8(src_ptr);
+        d3u8 = vld1_u8(src_ptr + 8);
+        d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+        d5u8 = vld1_u8(src_ptr);
+        d6u8 = vld1_u8(src_ptr + 8);
+        d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+        d8u8 = vld1_u8(src_ptr);
+        d9u8 = vld1_u8(src_ptr + 8);
+        d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+        d11u8 = vld1_u8(src_ptr);
+        d12u8 = vld1_u8(src_ptr + 8);
+        d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+        q7u8 = vcombine_u8(d14u8, d15u8);
+        q8u8 = vcombine_u8(d16u8, d17u8);
+        q9u8 = vcombine_u8(d18u8, d19u8);
+        q10u8 = vcombine_u8(d20u8, d21u8);
+
+        vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
+        vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
+        vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16;
+        vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16;
+    }
+
+    // First-pass filtering for the remaining 5 lines
+    d14u8 = vld1_u8(src_ptr);
+    d15u8 = vld1_u8(src_ptr + 8);
+    d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+    // rows 12-15 are already in d2..d13; d14..d16 now hold row 16
+    q9u16  = vmull_u8(d2u8, d0u8);
+    q10u16 = vmull_u8(d3u8, d0u8);
+    q11u16 = vmull_u8(d5u8, d0u8);
+    q12u16 = vmull_u8(d6u8, d0u8);
+    q13u16 = vmull_u8(d8u8, d0u8);
+    q14u16 = vmull_u8(d9u8, d0u8);
+
+    d2u8  = vext_u8(d2u8, d3u8, 1);
+    d5u8  = vext_u8(d5u8, d6u8, 1);
+    d8u8  = vext_u8(d8u8, d9u8, 1);
+
+    q9u16  = vmlal_u8(q9u16, d2u8, d1u8);
+    q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
+    q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
+
+    d3u8  = vext_u8(d3u8, d4u8, 1);
+    d6u8  = vext_u8(d6u8, d7u8, 1);
+    d9u8  = vext_u8(d9u8, d10u8, 1);
+
+    q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
+    q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
+    q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
+
+    q1u16 = vmull_u8(d11u8, d0u8);
+    q2u16 = vmull_u8(d12u8, d0u8);
+    q3u16 = vmull_u8(d14u8, d0u8);
+    q4u16 = vmull_u8(d15u8, d0u8);
+
+    d11u8 = vext_u8(d11u8, d12u8, 1);
+    d14u8 = vext_u8(d14u8, d15u8, 1);
+
+    q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
+    q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
+
+    d12u8 = vext_u8(d12u8, d13u8, 1);
+    d15u8 = vext_u8(d15u8, d16u8, 1);
+
+    q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
+    q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
+
+    d10u8 = vqrshrn_n_u16(q9u16, 7);
+    d11u8 = vqrshrn_n_u16(q10u16, 7);
+    d12u8 = vqrshrn_n_u16(q11u16, 7);
+    d13u8 = vqrshrn_n_u16(q12u16, 7);
+    d14u8 = vqrshrn_n_u16(q13u16, 7);
+    d15u8 = vqrshrn_n_u16(q14u16, 7);
+    d16u8 = vqrshrn_n_u16(q1u16, 7);
+    d17u8 = vqrshrn_n_u16(q2u16, 7);
+    d18u8 = vqrshrn_n_u16(q3u16, 7);
+    d19u8 = vqrshrn_n_u16(q4u16, 7);
+
+    q5u8 = vcombine_u8(d10u8, d11u8);
+    q6u8 = vcombine_u8(d12u8, d13u8);
+    q7u8 = vcombine_u8(d14u8, d15u8);
+    q8u8 = vcombine_u8(d16u8, d17u8);
+    q9u8 = vcombine_u8(d18u8, d19u8);
+
+    vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16;
+    vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16;
+    vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
+    vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
+    vst1q_u8((uint8_t *)tmpp, q9u8);
+
+    // secondpass_filter
+    d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+    d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+    // vertical pass over the 17 rows in tmp, four output rows per iteration
+    tmpp = tmp;
+    q11u8 = vld1q_u8(tmpp);
+    tmpp += 16;
+    for (i = 4; i > 0; i--) {
+        q12u8 = vld1q_u8(tmpp); tmpp += 16;
+        q13u8 = vld1q_u8(tmpp); tmpp += 16;
+        q14u8 = vld1q_u8(tmpp); tmpp += 16;
+        q15u8 = vld1q_u8(tmpp); tmpp += 16;
+
+        q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+        q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+        q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+        q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+        q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+        q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+        q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+        q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+        q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+        q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+        q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+        q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+        q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+        q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+        q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+        q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+        d2u8 = vqrshrn_n_u16(q1u16, 7);
+        d3u8 = vqrshrn_n_u16(q2u16, 7);
+        d4u8 = vqrshrn_n_u16(q3u16, 7);
+        d5u8 = vqrshrn_n_u16(q4u16, 7);
+        d6u8 = vqrshrn_n_u16(q5u16, 7);
+        d7u8 = vqrshrn_n_u16(q6u16, 7);
+        d8u8 = vqrshrn_n_u16(q7u16, 7);
+        d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+        q1u8 = vcombine_u8(d2u8, d3u8);
+        q2u8 = vcombine_u8(d4u8, d5u8);
+        q3u8 = vcombine_u8(d6u8, d7u8);
+        q4u8 = vcombine_u8(d8u8, d9u8);
+
+        q11u8 = q15u8;
+
+        vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
+        vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
+        vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
+        vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
+    }
+    return;
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/copymem_neon.c b/libs/libvpx/vp8/common/arm/neon/copymem_neon.c
new file mode 100644
index 0000000000..deced115c1
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/copymem_neon.c
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+/* Copy an 8-wide, 4-high block of bytes from src to dst, one row per step. */
+void vp8_copy_mem8x4_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    int rows_left = 4;
+    while (rows_left-- > 0) {
+        /* one 64-bit load/store moves a full 8-byte row */
+        const uint8x8_t row = vld1_u8(src);
+        vst1_u8(dst, row);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+/* Copy an 8-wide, 8-high block of bytes from src to dst, one row per step. */
+void vp8_copy_mem8x8_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    int rows_left = 8;
+    while (rows_left-- > 0) {
+        /* one 64-bit load/store moves a full 8-byte row */
+        const uint8x8_t row = vld1_u8(src);
+        vst1_u8(dst, row);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+/* Copy a 16-wide, 16-high block of bytes from src to dst, one row per step. */
+void vp8_copy_mem16x16_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    int rows_left = 16;
+    while (rows_left-- > 0) {
+        /* one 128-bit load/store moves a full 16-byte row */
+        const uint8x16_t row = vld1q_u8(src);
+        vst1q_u8(dst, row);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/libs/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c
new file mode 100644
index 0000000000..ad5f41d7de
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+/* Adds the rounded DC term, (input_dc + 4) >> 3, to a 4x4 block of predictor
+ * pixels and stores the sums, clamped to 0..255, at dst_ptr. */
+void vp8_dc_only_idct_add_neon(
+        int16_t input_dc,
+        unsigned char *pred_ptr,
+        int pred_stride,
+        unsigned char *dst_ptr,
+        int dst_stride) {
+    const uint16_t dc = ((input_dc + 4) >> 3);
+    const uint16x8_t dc_vec = vdupq_n_u16(dc);
+    uint32x2_t px = vdup_n_u32(0);
+    int pair;
+
+    /* each pass handles two rows, packed as 2 x 4 bytes in one vector */
+    for (pair = 2; pair > 0; pair--) {
+        uint16x8_t sum;
+        uint32x2_t out;
+        px = vld1_lane_u32((const uint32_t *)pred_ptr, px, 0);
+        px = vld1_lane_u32((const uint32_t *)(pred_ptr + pred_stride), px, 1);
+        pred_ptr = pred_ptr + pred_stride + pred_stride;
+
+        /* 16-bit wrap-around add, then signed -> unsigned 8-bit saturation */
+        sum = vaddw_u8(dc_vec, vreinterpret_u8_u32(px));
+        out = vreinterpret_u32_u8(vqmovun_s16(vreinterpretq_s16_u16(sum)));
+
+        vst1_lane_u32((uint32_t *)dst_ptr, out, 0);
+        vst1_lane_u32((uint32_t *)(dst_ptr + dst_stride), out, 1);
+        dst_ptr = dst_ptr + dst_stride + dst_stride;
+    }
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/libs/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
new file mode 100644
index 0000000000..58e11922c7
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
@@ -0,0 +1,142 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+// 35468 does not fit in int16_t. vqdmulh doubles its product, so storing the
+// constant pre-halved yields (x * 35468) >> 16 directly, with no fix-up.
+static const int16_t sinpi8sqrt2       = 35468 >> 1;
+
+// Dequantizes input[i] * dq[i], runs the 4x4 inverse DCT and adds the result
+// to the 4x4 pixel block at dst, saturating to 0..255. Zeroes input[0..15].
+void vp8_dequant_idct_add_neon(
+        int16_t *input,
+        int16_t *dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0;
+    int32x2_t d14, d15;
+    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+    int16x8_t q1, q2, q3, q4, q5, q6;
+    int16x8_t qEmpty = vdupq_n_s16(0);
+    int32x2x2_t d2tmp0, d2tmp1;
+    int16x4x2_t d2tmp2, d2tmp3;
+
+    d14 = d15 = vdup_n_s32(0);
+
+    // load input
+    q3 = vld1q_s16(input);
+    vst1q_s16(input, qEmpty);
+    input += 8;
+    q4 = vld1q_s16(input);
+    vst1q_s16(input, qEmpty);
+
+    // load dq
+    q5 = vld1q_s16(dq);
+    dq += 8;
+    q6 = vld1q_s16(dq);
+
+    // load src from dst
+    dst0 = dst;
+    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
+    dst0 += stride;
+    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
+    dst0 += stride;
+    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
+    dst0 += stride;
+    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
+
+    q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
+                                         vreinterpretq_u16_s16(q5)));
+    q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
+                                         vreinterpretq_u16_s16(q6)));
+
+    d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
+    d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
+
+    q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
+
+    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+    // q3 is already (x * 35468) >> 16; q4 is halved, then x is added back
+    q4 = vshrq_n_s16(q4, 1);
+
+    q4 = vqaddq_s16(q4, q2);
+
+    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+
+    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+                      vreinterpret_s16_s32(d2tmp1.val[0]));
+    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+                      vreinterpret_s16_s32(d2tmp1.val[1]));
+
+    // loop 2
+    q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
+
+    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+    d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
+    d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
+    // as in the first pass: only the cos product needs the >> 1 and + x
+    q4 = vshrq_n_s16(q4, 1);
+
+    q4 = vqaddq_s16(q4, q2);
+
+    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+    // final rounding: (x + 4) >> 3
+    d2 = vrshr_n_s16(d2, 3);
+    d3 = vrshr_n_s16(d3, 3);
+    d4 = vrshr_n_s16(d4, 3);
+    d5 = vrshr_n_s16(d5, 3);
+
+    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+                      vreinterpret_s16_s32(d2tmp1.val[0]));
+    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+                      vreinterpret_s16_s32(d2tmp1.val[1]));
+
+    q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
+    q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
+    // add the pixels loaded from dst (widening), then saturate to 8 bits
+    q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
+                                        vreinterpret_u8_s32(d14)));
+    q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
+                                        vreinterpret_u8_s32(d15)));
+
+    d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
+    d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
+
+    dst0 = dst;
+    vst1_lane_s32((int32_t *)dst0, d14, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d14, 1);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d15, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d15, 1);
+    return;
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/libs/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
new file mode 100644
index 0000000000..54e709dd3c
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/common/blockd.h"
+
+/* Dequantize block d: dqcoeff[i] = qcoeff[i] * DQC[i] for 16 coefficients. */
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
+    /* vld2q splits even/odd elements; vst2q re-interleaves them the same way */
+    const int16x8x2_t quantized = vld2q_s16(d->qcoeff);
+    const int16x8x2_t factors = vld2q_s16(DQC);
+    int16x8x2_t scaled;
+
+    scaled.val[1] = vmulq_s16(quantized.val[1], factors.val[1]);
+    scaled.val[0] = vmulq_s16(quantized.val[0], factors.val[0]);
+    vst2q_s16(d->dqcoeff, scaled);
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/libs/libvpx/vp8/common/arm/neon/idct_blk_neon.c
new file mode 100644
index 0000000000..fb327a7260
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/idct_blk_neon.c
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_full_2x_neon(short *q, short *dq,
+                               unsigned char *dst, int stride);
+void idct_dequant_0_2x_neon(short *q, short dq,
+                            unsigned char *dst, int stride);
+
+
+/* Per block row (4 in total): runs the idct/dequant kernels for the block
+ * pair at dst and the pair at dst+8; each pair owns 32 shorts of q. */
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
+                                       unsigned char *dst,
+                                       int stride, char *eobs)
+{
+    int row, pair;
+
+    for (row = 0; row < 4; ++row, q += 64, dst += 4*stride, eobs += 4)
+    {
+        /* eobs is read two bytes at a time, one short per block pair */
+        for (pair = 0; pair < 2; ++pair)
+        {
+            const short eob2 = ((short *)eobs)[pair];
+            short *coeffs = q + 32*pair;
+            unsigned char *out = dst + 8*pair;
+
+            if (!eob2)
+                continue;   /* both bytes zero: leave this pair untouched */
+
+            /* a set bit other than bit 0 in either byte -> full transform */
+            if (eob2 & 0xfefe)
+                idct_dequant_full_2x_neon (coeffs, dq, out, stride);
+            else
+                idct_dequant_0_2x_neon (coeffs, dq[0], out, stride);
+        }
+    }
+}
+
+/* Runs the kernel selected by eob_pair on one pair of adjacent 4x4 blocks.
+ * eob_pair packs two byte-sized values: 0 skips the pair entirely, any bit
+ * other than bit 0 set in either byte selects idct_dequant_full_2x_neon,
+ * and everything else takes idct_dequant_0_2x_neon with dq[0]. */
+static void dequant_idct_add_pair_neon(short eob_pair, short *q, short *dq,
+                                       unsigned char *dst, int stride)
+{
+    if (!eob_pair)
+        return;
+
+    if (eob_pair & 0xfefe)
+        idct_dequant_full_2x_neon (q, dq, dst, stride);
+    else
+        idct_dequant_0_2x_neon (q, dq[0], dst, stride);
+}
+
+/* Processes four block pairs in order: rows 0-3 of dstu, rows 4-7 of dstu,
+ * rows 0-3 of dstv, rows 4-7 of dstv. Each pair consumes 32 coefficients of
+ * q and one short of eobs; dq is shared by all of them.
+ * NOTE(review): eobs is a char * read through a short *, so it presumably
+ * has to be 2-byte aligned - confirm against the callers. */
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
+                                        unsigned char *dstu,
+                                        unsigned char *dstv,
+                                        int stride, char *eobs)
+{
+    const short *pair_eobs = (const short *)eobs;
+    const int four_rows = 4*stride;
+
+    /* dstu, rows 0-3 */
+    dequant_idct_add_pair_neon (pair_eobs[0], q, dq,
+                                dstu, stride);
+
+    /* dstu, rows 4-7 */
+    dequant_idct_add_pair_neon (pair_eobs[1], q + 32, dq,
+                                dstu + four_rows, stride);
+
+    /* dstv, rows 0-3 */
+    dequant_idct_add_pair_neon (pair_eobs[2], q + 64, dq,
+                                dstv, stride);
+
+    /* dstv, rows 4-7 */
+    dequant_idct_add_pair_neon (pair_eobs[3], q + 96, dq,
+                                dstv + four_rows, stride);
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/libs/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
new file mode 100644
index 0000000000..e6f862fa89
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+/* Uses only the first coefficient of each of two side-by-side 4x4 blocks.
+ * Block 0 uses q[0] and dst columns 0-3, block 1 uses q[16] and dst
+ * columns 4-7. Each block adds ((coeff * dq) + 4) >> 3 to its 16 pixels,
+ * saturating to 0..255; q[0] and q[16] are cleared. */
+void idct_dequant_0_2x_neon(
+        int16_t *q,
+        int16_t dq,
+        unsigned char *dst,
+        int stride) {
+    int16x8_t dc[2];
+    int blk;
+
+    dc[0] = vdupq_n_s16((int16_t)(((q[0] * dq) + 4) >> 3));
+    dc[1] = vdupq_n_s16((int16_t)(((q[16] * dq) + 4) >> 3));
+    q[16] = 0;
+    q[0] = 0;
+
+    for (blk = 0; blk < 2; blk++) {
+        /* the four rows of this block */
+        unsigned char *const row0 = dst + 4 * blk;
+        unsigned char *const row1 = row0 + stride;
+        unsigned char *const row2 = row1 + stride;
+        unsigned char *const row3 = row2 + stride;
+        const uint16x8_t offset = vreinterpretq_u16_s16(dc[blk]);
+        int32x2_t upper = vdup_n_s32(0);
+        int32x2_t lower = vdup_n_s32(0);
+        uint16x8_t upper_sum, lower_sum;
+
+        /* gather: rows 0-1 share one 64-bit vector, rows 2-3 the other */
+        upper = vld1_lane_s32((const int32_t *)row0, upper, 0);
+        upper = vld1_lane_s32((const int32_t *)row1, upper, 1);
+        lower = vld1_lane_s32((const int32_t *)row2, lower, 0);
+        lower = vld1_lane_s32((const int32_t *)row3, lower, 1);
+
+        /* widen the pixels to 16 bits while adding the DC offset */
+        upper_sum = vaddw_u8(offset, vreinterpret_u8_s32(upper));
+        lower_sum = vaddw_u8(offset, vreinterpret_u8_s32(lower));
+
+        /* treat the sums as signed and clamp them back to 0..255 */
+        upper = vreinterpret_s32_u8(
+                    vqmovun_s16(vreinterpretq_s16_u16(upper_sum)));
+        lower = vreinterpret_s32_u8(
+                    vqmovun_s16(vreinterpretq_s16_u16(lower_sum)));
+
+        /* scatter the four rows back */
+        vst1_lane_s32((int32_t *)row0, upper, 0);
+        vst1_lane_s32((int32_t *)row1, upper, 1);
+        vst1_lane_s32((int32_t *)row2, lower, 0);
+        vst1_lane_s32((int32_t *)row3, lower, 1);
+    }
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/libs/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
new file mode 100644
index 0000000000..a60ed46b76
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
@@ -0,0 +1,185 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2       = 17734;
+// 17734 is 0x8a8c (35468) >> 1: the lowest bit of 0x8a8c is 0, so it can be pre-shifted
+
+void idct_dequant_full_2x_neon(
+        int16_t *q,  // 32 coefficients, q[0..15] and q[16..31]; zeroed by this call
+        int16_t *dq,  // 16 dequantization factors, applied to both groups of 16
+        unsigned char *dst,  // 8-wide, 4-high pixel area: read, then overwritten
+        int stride) {  // byte step between dst rows
+    unsigned char *dst0, *dst1;
+    int32x2_t d28, d29, d30, d31;
+    int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+    int16x8_t qEmpty = vdupq_n_s16(0);
+    int32x4x2_t q2tmp0, q2tmp1;
+    int16x8x2_t q2tmp2, q2tmp3;
+    int16x4_t dLow0, dLow1, dHigh0, dHigh1;
+    // Dequant + 4x4 inverse transform of two blocks at once, added into dst.
+    d28 = d29 = d30 = d31 = vdup_n_s32(0);
+
+    // load dq
+    q0 = vld1q_s16(dq);
+    dq += 8;
+    q1 = vld1q_s16(dq);
+
+    // load q
+    q2 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q3 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q4 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q5 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+
+    // load src from dst
+    dst0 = dst;
+    dst1 = dst + 4;
+    d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
+    dst0 += stride;
+    d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
+    dst1 += stride;
+    d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
+    dst0 += stride;
+    d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
+    dst1 += stride;
+
+    d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
+    dst0 += stride;
+    d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
+    dst1 += stride;
+    d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
+    d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
+
+    q2 = vmulq_s16(q2, q0);
+    q3 = vmulq_s16(q3, q1);
+    q4 = vmulq_s16(q4, q0);
+    q5 = vmulq_s16(q5, q1);
+
+    // vswp: regroup so the low half comes from q[0..15], the high half from q[16..31]
+    dLow0 = vget_low_s16(q2);
+    dHigh0 = vget_high_s16(q2);
+    dLow1 = vget_low_s16(q4);
+    dHigh1 = vget_high_s16(q4);
+    q2 = vcombine_s16(dLow0, dLow1);
+    q4 = vcombine_s16(dHigh0, dHigh1);
+
+    dLow0 = vget_low_s16(q3);
+    dHigh0 = vget_high_s16(q3);
+    dLow1 = vget_low_s16(q5);
+    dHigh1 = vget_high_s16(q5);
+    q3 = vcombine_s16(dLow0, dLow1);
+    q5 = vcombine_s16(dHigh0, dHigh1);
+    // loop 1: sinpi8sqrt2 is pre-halved, so q6/q7 need no shift afterwards
+    q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
+    q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
+    q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
+    q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
+
+    q10 = vqaddq_s16(q2, q3);
+    q11 = vqsubq_s16(q2, q3);
+    // halve the cospi8sqrt2minus1 products to undo the doubling of vqdmulh
+    q8 = vshrq_n_s16(q8, 1);
+    q9 = vshrq_n_s16(q9, 1);
+    // add x back (the constant's name says 1 was subtracted from it)
+    q4 = vqaddq_s16(q4, q8);
+    q5 = vqaddq_s16(q5, q9);
+
+    q2 = vqsubq_s16(q6, q5);
+    q3 = vqaddq_s16(q7, q4);
+
+    q4 = vqaddq_s16(q10, q3);
+    q5 = vqaddq_s16(q11, q2);
+    q6 = vqsubq_s16(q11, q2);
+    q7 = vqsubq_s16(q10, q3);
+    // transpose the 4x4 block held in each 64-bit half
+    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                       vreinterpretq_s16_s32(q2tmp1.val[0]));
+    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                       vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+    // loop 2
+    q8  = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
+    q9  = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
+    q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
+    q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
+
+    q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+    q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+
+    q10 = vshrq_n_s16(q10, 1);
+    q11 = vshrq_n_s16(q11, 1);
+
+    q10 = vqaddq_s16(q2tmp2.val[1], q10);
+    q11 = vqaddq_s16(q2tmp3.val[1], q11);
+
+    q8 = vqsubq_s16(q8, q11);
+    q9 = vqaddq_s16(q9, q10);
+
+    q4 = vqaddq_s16(q2, q9);
+    q5 = vqaddq_s16(q3, q8);
+    q6 = vqsubq_s16(q3, q8);
+    q7 = vqsubq_s16(q2, q9);
+    // final rounding: (x + 4) >> 3
+    q4 = vrshrq_n_s16(q4, 3);
+    q5 = vrshrq_n_s16(q5, 3);
+    q6 = vrshrq_n_s16(q6, 3);
+    q7 = vrshrq_n_s16(q7, 3);
+
+    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                       vreinterpretq_s16_s32(q2tmp1.val[0]));
+    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                       vreinterpretq_s16_s32(q2tmp1.val[1]));
+    // add the pixels loaded from dst (widening add), then saturate to 8 bits
+    q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
+                                          vreinterpret_u8_s32(d28)));
+    q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
+                                          vreinterpret_u8_s32(d29)));
+    q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
+                                          vreinterpret_u8_s32(d30)));
+    q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
+                                          vreinterpret_u8_s32(d31)));
+
+    d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
+    d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
+    d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
+    d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
+    // lane 0 goes to columns 0-3 (dst0), lane 1 to columns 4-7 (dst1)
+    dst0 = dst;
+    dst1 = dst + 4;
+    vst1_lane_s32((int32_t *)dst0, d28, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d28, 1);
+    dst1 += stride;
+    vst1_lane_s32((int32_t *)dst0, d29, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d29, 1);
+    dst1 += stride;
+
+    vst1_lane_s32((int32_t *)dst0, d30, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d30, 1);
+    dst1 += stride;
+    vst1_lane_s32((int32_t *)dst0, d31, 0);
+    vst1_lane_s32((int32_t *)dst1, d31, 1);
+    return;
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/libs/libvpx/vp8/common/arm/neon/iwalsh_neon.c
new file mode 100644
index 0000000000..6ea9dd712a
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/iwalsh_neon.c
@@ -0,0 +1,102 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_inv_walsh4x4_neon(  // NEON 4x4 inverse Walsh transform; scatters 16 results with stride 16
+        int16_t *input,  // 16 values, loaded as input[0..7] and input[8..15]
+        int16_t *mb_dqcoeff) {  // output: elements 0, 16, 32, ..., 240 are written
+    int16x8_t q0s16, q1s16, q2s16, q3s16;
+    int16x4_t d4s16, d5s16, d6s16, d7s16;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+    int16x8_t qAdd3;
+
+    q0s16 = vld1q_s16(input);  // low half: input[0..3], high half: input[4..7]
+    q1s16 = vld1q_s16(input + 8);  // low half: input[8..11], high half: input[12..15]
+
+    // 1st pass (presumably the first loop of the scalar version): butterflies
+    d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));  // in[0..3] + in[12..15]
+    d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));  // in[4..7] + in[8..11]
+    d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));  // in[0..3] - in[12..15]
+    d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));  // in[4..7] - in[8..11]
+
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    q0s16 = vaddq_s16(q2s16, q3s16);  // (d4 + d6, d5 + d7)
+    q1s16 = vsubq_s16(q2s16, q3s16);  // (d4 - d6, d5 - d7)
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),  // 32-bit then 16-bit vtrn:
+                      vreinterpret_s32_s16(vget_low_s16(q1s16)));  // together they transpose the
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),  // 4x4 block held in the four
+                      vreinterpret_s32_s16(vget_high_s16(q1s16)));  // halves of q0s16/q1s16
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
+                      vreinterpret_s16_s32(v2tmp3.val[0]));
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
+                      vreinterpret_s16_s32(v2tmp3.val[1]));
+
+    // 2nd pass: the same butterflies, applied to the transposed data
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    qAdd3 = vdupq_n_s16(3);  // rounding term for the final >> 3
+
+    q0s16 = vaddq_s16(q2s16, q3s16);
+    q1s16 = vsubq_s16(q2s16, q3s16);
+
+    q0s16 = vaddq_s16(q0s16, qAdd3);
+    q1s16 = vaddq_s16(q1s16, qAdd3);
+
+    q0s16 = vshrq_n_s16(q0s16, 3);  // (x + 3) >> 3, arithmetic shift
+    q1s16 = vshrq_n_s16(q1s16, 3);
+
+    // store: lane k of the four half-vectors, in turn, to every 16th output element
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  0);  // lane 0 -> elements 0, 16, 32, 48
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  0);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
+    mb_dqcoeff += 16;
+
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  1);  // lane 1 -> elements 64, 80, 96, 112
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  1);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
+    mb_dqcoeff += 16;
+
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  2);  // lane 2 -> elements 128, 144, 160, 176
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  2);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
+    mb_dqcoeff += 16;
+
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  3);  // lane 3 -> elements 192, 208, 224, 240
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  3);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
+    mb_dqcoeff += 16;  // value is not used after this point
+    return;
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/libs/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
new file mode 100644
index 0000000000..b25686ffb8
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
@@ -0,0 +1,111 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(  // filters 16 pixels across one horizontal edge
+        unsigned char *s,  // rows s - 2p, s - p, s, s + p are read; rows s - p and s are rewritten
+        int p,  // byte offset between consecutive rows
+        const unsigned char *blimit) {  // only *blimit is read and broadcast to all lanes
+    uint8_t *sp;
+    uint8x16_t qblimit, q0u8;
+    uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
+    int16x8_t q2s16, q3s16, q13s16;
+    int8x8_t d8s8, d9s8;
+    int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
+
+    qblimit = vdupq_n_u8(*blimit);
+
+    sp = s - (p << 1);
+    q5u8 = vld1q_u8(sp);  // row s - 2p
+    sp += p;
+    q6u8 = vld1q_u8(sp);  // row s - p
+    sp += p;
+    q7u8 = vld1q_u8(sp);  // row s
+    sp += p;
+    q8u8 = vld1q_u8(sp);  // row s + p
+
+    q15u8 = vabdq_u8(q6u8, q7u8);  // |row(s-p) - row(s)|
+    q14u8 = vabdq_u8(q5u8, q8u8);  // |row(s-2p) - row(s+p)|
+
+    q15u8 = vqaddq_u8(q15u8, q15u8);  // doubled, saturating
+    q14u8 = vshrq_n_u8(q14u8, 1);  // halved
+    q0u8 = vdupq_n_u8(0x80);  // top-bit mask used to switch between unsigned and signed bytes
+    q13s16 = vdupq_n_s16(3);
+    q15u8 = vqaddq_u8(q15u8, q14u8);  // edge measure: 2*|inner diff| + |outer diff|/2
+
+    q5u8 = veorq_u8(q5u8, q0u8);  // flip top bit: bytes are treated as signed from here on
+    q6u8 = veorq_u8(q6u8, q0u8);
+    q7u8 = veorq_u8(q7u8, q0u8);
+    q8u8 = veorq_u8(q8u8, q0u8);
+
+    q15u8 = vcgeq_u8(qblimit, q15u8);  // lane mask: all ones where blimit >= edge measure
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),  // row(s) - row(s-p), widened to 16 bits
+                     vget_low_s8(vreinterpretq_s8_u8(q6u8)));
+    q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
+                     vget_high_s8(vreinterpretq_s8_u8(q6u8)));
+
+    q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),  // saturating row(s-2p) - row(s+p)
+                     vreinterpretq_s8_u8(q8u8));
+
+    q2s16 = vmulq_s16(q2s16, q13s16);  // 3 * (row(s) - row(s-p))
+    q3s16 = vmulq_s16(q3s16, q13s16);
+
+    q10u8 = vdupq_n_u8(3);
+    q9u8 = vdupq_n_u8(4);
+
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
+    q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
+
+    d8s8 = vqmovn_s16(q2s16);  // saturate the 16-bit sums back to signed 8 bits
+    d9s8 = vqmovn_s16(q3s16);
+    q4s8 = vcombine_s8(d8s8, d9s8);
+
+    q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));  // zero the filter value where the mask is clear
+
+    q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));  // filter + 3
+    q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));  // filter + 4
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q3s8 = vshrq_n_s8(q3s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);  // row(s-p) + ((filter + 3) >> 3)
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);  // row(s)   - ((filter + 4) >> 3)
+
+    q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);  // flip top bit again: back to unsigned
+    q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+    vst1q_u8(s, q7u8);  // rewrite row s
+    s -= p;
+    vst1q_u8(s, q6u8);  // rewrite row s - p
+    return;
+}
+
+void vp8_loop_filter_bhs_neon(  // simple horizontal-edge filter at rows 4, 8 and 12 below y_ptr
+        unsigned char *y_ptr,
+        int y_stride,  // byte offset between rows
+        const unsigned char *blimit) {  // passed through unchanged to the edge filter
+    y_ptr += y_stride * 4;  // row 4
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += y_stride * 4;  // row 8
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += y_stride * 4;  // row 12
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
+
+void vp8_loop_filter_mbhs_neon(  // simple horizontal-edge filter applied once, at y_ptr itself
+        unsigned char *y_ptr,
+        int y_stride,  // byte offset between rows
+        const unsigned char *blimit) {  // passed through unchanged to the edge filter
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/libs/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
new file mode 100644
index 0000000000..921bcad698
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -0,0 +1,283 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE void write_2x4(unsigned char *dst, int pitch,  // writes 2 bytes to each of 8 rows
+                             const uint8x8x2_t result) {  // row i gets val[0][i], val[1][i]
+    /*
+     * uint8x8x2_t result
+    00 01 02 03 | 04 05 06 07
+    10 11 12 13 | 14 15 16 17
+    ---
+    * after vtrn_u8
+    00 10 02 12 | 04 14 06 16
+    01 11 03 13 | 05 15 07 17
+    */
+    const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
+                                       result.val[1]);
+    const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);  // byte pairs for rows 0, 2, 4, 6
+    const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);  // byte pairs for rows 1, 3, 5, 7
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 0);  // row 0
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 0);  // row 1
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 1);  // row 2
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 1);  // row 3
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 2);  // row 4
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 2);  // row 5
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 3);  // row 6
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 3);  // row 7
+}
+
+static INLINE void write_2x8(unsigned char *dst, int pitch,  // writes 2 bytes to each of 16 rows
+                             const uint8x8x2_t result,  // rows 0..7
+                             const uint8x8x2_t result2) {  // rows 8..15
+  write_2x4(dst, pitch, result);  // write_2x4 covers 8 rows, so the next call starts 8 rows down
+  dst += pitch * 8;
+  write_2x4(dst, pitch, result2);
+}
+#else
+static INLINE void write_2x8(unsigned char *dst, int pitch,  // writes 2 bytes to each of 16 rows
+                             const uint8x8x2_t result,  // rows 0..7: row i gets val[0][i], val[1][i]
+                             const uint8x8x2_t result2) {  // rows 8..15, same layout
+  vst2_lane_u8(dst, result, 0);  // each vst2_lane_u8 stores lane k of val[0] and val[1] as 2 adjacent bytes
+  dst += pitch;
+  vst2_lane_u8(dst, result, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 7);
+  dst += pitch;
+
+  vst2_lane_u8(dst, result2, 0);  // row 8
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 7);  // row 15
+}
+#endif  // VPX_INCOMPATIBLE_GCC
+
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE  // returns the 4 leftmost byte columns of 8 rows: x.val[c][r] = byte c of row r
+uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
+    uint8x8x4_t x;
+    const uint8x8_t a = vld1_u8(src);  // each load reads 8 bytes; only the first 4 reach the result
+    const uint8x8_t b = vld1_u8(src + pitch * 1);
+    const uint8x8_t c = vld1_u8(src + pitch * 2);
+    const uint8x8_t d = vld1_u8(src + pitch * 3);
+    const uint8x8_t e = vld1_u8(src + pitch * 4);
+    const uint8x8_t f = vld1_u8(src + pitch * 5);
+    const uint8x8_t g = vld1_u8(src + pitch * 6);
+    const uint8x8_t h = vld1_u8(src + pitch * 7);
+    const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
+                                          vreinterpret_u32_u8(e));
+    const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
+                                          vreinterpret_u32_u8(f));
+    const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
+                                          vreinterpret_u32_u8(g));
+    const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
+                                          vreinterpret_u32_u8(h));
+    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),  // only val[0] of each
+                                          vreinterpret_u16_u32(r26_u32.val[0]));  // 32-bit transpose is
+    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),  // used, i.e. bytes 0..3
+                                          vreinterpret_u16_u32(r37_u32.val[0]));  // of every row
+    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+                                       vreinterpret_u8_u16(r13_u16.val[0]));
+    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+                                       vreinterpret_u8_u16(r13_u16.val[1]));
+    /*
+     * after vtrn_u32
+    00 01 02 03 | 40 41 42 43
+    10 11 12 13 | 50 51 52 53
+    20 21 22 23 | 60 61 62 63
+    30 31 32 33 | 70 71 72 73
+    ---
+    * after vtrn_u16
+    00 01 20 21 | 40 41 60 61
+    02 03 22 23 | 42 43 62 63
+    10 11 30 31 | 50 51 70 71
+    12 13 32 33 | 52 53 72 73
+
+    00 01 20 21 | 40 41 60 61
+    10 11 30 31 | 50 51 70 71
+    02 03 22 23 | 42 43 62 63
+    12 13 32 33 | 52 53 72 73
+    ---
+    * after vtrn_u8
+    00 10 20 30 | 40 50 60 70
+    01 11 21 31 | 41 51 61 71
+    02 12 22 32 | 42 52 62 72
+    03 13 23 33 | 43 53 63 73
+    */
+    x.val[0] = r01_u8.val[0];
+    x.val[1] = r01_u8.val[1];
+    x.val[2] = r23_u8.val[0];
+    x.val[3] = r23_u8.val[1];
+
+    return x;
+}
+#else
+static INLINE  // returns 4 byte columns of 8 rows: x.val[c][r] = byte c of row r
+uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
+    uint8x8x4_t x;
+    x.val[0] = x.val[1] = x.val[2] = x.val[3] = vdup_n_u8(0);  // define every lane before the lane loads
+    x = vld4_lane_u8(src, x, 0);  // 4 consecutive bytes of this row -> lane 0 of val[0..3]
+    src += pitch;
+    x = vld4_lane_u8(src, x, 1);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 2);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 3);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 4);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 5);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 6);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 7);  // last of the 8 rows
+    return x;
+}
+#endif  // VPX_INCOMPATIBLE_GCC
+
+static INLINE void vp8_loop_filter_simple_vertical_edge_neon(  // filters 16 rows across one vertical edge
+        unsigned char *s,  // columns s - 2 .. s + 1 are read; columns s - 1 and s are rewritten
+        int p,  // byte offset between consecutive rows
+        const unsigned char *blimit) {  // only *blimit is read and broadcast to all lanes
+    unsigned char *src1;
+    uint8x16_t qblimit, q0u8;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
+    int16x8_t q2s16, q13s16, q11s16;
+    int8x8_t d28s8, d29s8;
+    int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
+    uint8x8x4_t d0u8x4;  // d6, d7, d8, d9
+    uint8x8x4_t d1u8x4;  // d10, d11, d12, d13
+    uint8x8x2_t d2u8x2;  // d12, d14
+    uint8x8x2_t d3u8x2;  // d13, d15
+
+    qblimit = vdupq_n_u8(*blimit);
+
+    src1 = s - 2;
+    d0u8x4 = read_4x8(src1, p);  // rows 0..7, columns s-2 .. s+1
+    src1 += p * 8;
+    d1u8x4 = read_4x8(src1, p);  // rows 8..15
+
+    q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]);  // d6 d10
+    q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]);  // d8 d12
+    q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]);  // d7 d11
+    q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]);  // d9 d13
+
+    q15u8 = vabdq_u8(q5u8, q4u8);  // |col(s-1) - col(s)|
+    q14u8 = vabdq_u8(q3u8, q6u8);  // |col(s-2) - col(s+1)|
+
+    q15u8 = vqaddq_u8(q15u8, q15u8);  // doubled, saturating
+    q14u8 = vshrq_n_u8(q14u8, 1);  // halved
+    q0u8 = vdupq_n_u8(0x80);  // top-bit mask used to switch between unsigned and signed bytes
+    q11s16 = vdupq_n_s16(3);
+    q15u8 = vqaddq_u8(q15u8, q14u8);  // edge measure: 2*|inner diff| + |outer diff|/2
+
+    q3u8 = veorq_u8(q3u8, q0u8);  // flip top bit: bytes are treated as signed from here on
+    q4u8 = veorq_u8(q4u8, q0u8);
+    q5u8 = veorq_u8(q5u8, q0u8);
+    q6u8 = veorq_u8(q6u8, q0u8);
+
+    q15u8 = vcgeq_u8(qblimit, q15u8);  // lane mask: all ones where blimit >= edge measure
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),  // col(s) - col(s-1), widened to 16 bits
+                     vget_low_s8(vreinterpretq_s8_u8(q5u8)));
+    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
+                      vget_high_s8(vreinterpretq_s8_u8(q5u8)));
+
+    q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),  // saturating col(s-2) - col(s+1)
+                      vreinterpretq_s8_u8(q6u8));
+
+    q2s16 = vmulq_s16(q2s16, q11s16);  // 3 * (col(s) - col(s-1))
+    q13s16 = vmulq_s16(q13s16, q11s16);
+
+    q11u8 = vdupq_n_u8(3);
+    q12u8 = vdupq_n_u8(4);
+
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
+    q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
+
+    d28s8 = vqmovn_s16(q2s16);  // saturate the 16-bit sums back to signed 8 bits
+    d29s8 = vqmovn_s16(q13s16);
+    q14s8 = vcombine_s8(d28s8, d29s8);
+
+    q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));  // zero the filter value where the mask is clear
+
+    q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));  // filter + 3
+    q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));  // filter + 4
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q14s8 = vshrq_n_s8(q3s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);  // col(s-1) + ((filter + 3) >> 3)
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);  // col(s)   - ((filter + 4) >> 3)
+
+    q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);  // flip top bit again: back to unsigned
+    q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+    d2u8x2.val[0] = vget_low_u8(q6u8);   // d12
+    d2u8x2.val[1] = vget_low_u8(q7u8);   // d14
+    d3u8x2.val[0] = vget_high_u8(q6u8);  // d13
+    d3u8x2.val[1] = vget_high_u8(q7u8);  // d15
+
+    src1 = s - 1;  // the two rewritten columns start one byte left of s
+    write_2x8(src1, p, d2u8x2, d3u8x2);
+}
+
+void vp8_loop_filter_bvs_neon(  // simple vertical-edge filter at columns 4, 8 and 12 right of y_ptr
+        unsigned char *y_ptr,
+        int y_stride,  // byte offset between rows
+        const unsigned char *blimit) {  // passed through unchanged to the edge filter
+    y_ptr += 4;  // column 4
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += 4;  // column 8
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += 4;  // column 12
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
+
+void vp8_loop_filter_mbvs_neon(  // simple vertical-edge filter applied once, at y_ptr itself
+        unsigned char *y_ptr,
+        int y_stride,  // byte offset between rows
+        const unsigned char *blimit) {  // passed through unchanged to the edge filter
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/libs/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
new file mode 100644
index 0000000000..5351f4be66
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
@@ -0,0 +1,625 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_mbloop_filter_neon(  // filter core: 8 input lines (p3..q3), 6 filtered outputs (p2..q2)
+        uint8x16_t qblimit,  // mblimit
+        uint8x16_t qlimit,   // limit
+        uint8x16_t qthresh,  // thresh
+        uint8x16_t q3,       // p3
+        uint8x16_t q4,       // p2
+        uint8x16_t q5,       // p1
+        uint8x16_t q6,       // p0
+        uint8x16_t q7,       // q0
+        uint8x16_t q8,       // q1
+        uint8x16_t q9,       // q2
+        uint8x16_t q10,      // q3
+        uint8x16_t *q4r,     // p2
+        uint8x16_t *q5r,     // p1
+        uint8x16_t *q6r,     // p0
+        uint8x16_t *q7r,     // q0
+        uint8x16_t *q8r,     // q1
+        uint8x16_t *q9r) {   // q2
+    uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
+    uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
+    int8x16_t q0s8, q12s8, q14s8, q15s8;
+    int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
+
+    q11u8 = vabdq_u8(q3, q4);  // |p3 - p2|
+    q12u8 = vabdq_u8(q4, q5);  // |p2 - p1|
+    q13u8 = vabdq_u8(q5, q6);  // |p1 - p0|
+    q14u8 = vabdq_u8(q8, q7);  // |q1 - q0|
+    q1u8  = vabdq_u8(q9, q8);  // |q2 - q1|
+    q0u8  = vabdq_u8(q10, q9);  // |q3 - q2|
+
+    q11u8 = vmaxq_u8(q11u8, q12u8);
+    q12u8 = vmaxq_u8(q13u8, q14u8);
+    q1u8  = vmaxq_u8(q1u8, q0u8);
+    q15u8 = vmaxq_u8(q11u8, q12u8);
+
+    q12u8 = vabdq_u8(q6, q7);  // |p0 - q0|
+
+    // vp8_hevmask
+    q13u8 = vcgtq_u8(q13u8, qthresh);  // |p1 - p0| > thresh
+    q14u8 = vcgtq_u8(q14u8, qthresh);  // |q1 - q0| > thresh
+    q15u8 = vmaxq_u8(q15u8, q1u8);  // maximum of the six neighbour differences above
+
+    q15u8 = vcgeq_u8(qlimit, q15u8);  // all ones where limit >= that maximum
+
+    q1u8 = vabdq_u8(q5, q8);  // |p1 - q1|
+    q12u8 = vqaddq_u8(q12u8, q12u8);  // |p0 - q0| * 2, saturating
+
+    // vp8_filter() function
+    // convert to signed
+    q0u8 = vdupq_n_u8(0x80);
+    q9 = veorq_u8(q9, q0u8);
+    q8 = veorq_u8(q8, q0u8);
+    q7 = veorq_u8(q7, q0u8);
+    q6 = veorq_u8(q6, q0u8);
+    q5 = veorq_u8(q5, q0u8);
+    q4 = veorq_u8(q4, q0u8);
+
+    q1u8 = vshrq_n_u8(q1u8, 1);  // |p1 - q1| / 2
+    q12u8 = vqaddq_u8(q12u8, q1u8);
+
+    q14u8 = vorrq_u8(q13u8, q14u8);  // hev mask: either inner difference exceeds thresh
+    q12u8 = vcgeq_u8(qblimit, q12u8);  // all ones where blimit >= |p0-q0|*2 + |p1-q1|/2
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),  // q0 - p0, widened to 16 bits
+                     vget_low_s8(vreinterpretq_s8_u8(q6)));
+    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                      vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),  // saturating p1 - q1
+                     vreinterpretq_s8_u8(q8));
+
+    q11s16 = vdupq_n_s16(3);
+    q2s16  = vmulq_s16(q2s16, q11s16);
+    q13s16 = vmulq_s16(q13s16, q11s16);
+
+    q15u8 = vandq_u8(q15u8, q12u8);  // final filter mask: limit test AND blimit test
+
+    q2s16  = vaddw_s8(q2s16, vget_low_s8(q1s8));
+    q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));
+
+    q12u8 = vdupq_n_u8(3);
+    q11u8 = vdupq_n_u8(4);
+    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    d2 = vqmovn_s16(q2s16);
+    d3 = vqmovn_s16(q13s16);
+    q1s8 = vcombine_s8(d2, d3);
+    q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));  // apply the filter mask
+    q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));  // part of the filter value where hev is set
+
+    q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));  // + 4
+    q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));  // + 3
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q13s8 = vshrq_n_s8(q13s8, 3);
+
+    q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);  // q0 adjusted by the hev part
+    q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);  // p0 adjusted by the hev part
+
+    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));  // keep the filter value only where hev is clear
+
+    q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);  // rounding term for >> 7
+    d5 = vdup_n_s8(9);
+    d4 = vdup_n_s8(18);
+
+    q0s16  = vmlal_s8(vreinterpretq_s16_u16(q0u16),  vget_low_s8(q1s8),  d5);  // 63 + 9 * filter
+    q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
+    d5 = vdup_n_s8(27);
+    q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8),  d4);  // 63 + 18 * filter
+    q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
+    q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8),  d5);  // 63 + 27 * filter
+    q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);
+
+    d0  = vqshrn_n_s16(q0s16 , 7);  // >> 7 with saturating narrow to 8 bits
+    d1  = vqshrn_n_s16(q11s16, 7);
+    d24 = vqshrn_n_s16(q12s16, 7);
+    d25 = vqshrn_n_s16(q13s16, 7);
+    d28 = vqshrn_n_s16(q14s16, 7);
+    d29 = vqshrn_n_s16(q15s16, 7);
+
+    q0s8  = vcombine_s8(d0, d1);  // 9/128 tap, applied to p2 and q2
+    q12s8 = vcombine_s8(d24, d25);  // 18/128 tap, applied to p1 and q1
+    q14s8 = vcombine_s8(d28, d29);  // 27/128 tap, applied to p0 and q0
+
+    q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);  // q2
+    q0s8  = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);  // p2
+    q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);  // q1
+    q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);  // p1
+    q15s8 = vqsubq_s8((q7s8), q14s8);  // q0
+    q14s8 = vqaddq_s8((q6s8), q14s8);  // p0
+
+    q1u8 = vdupq_n_u8(0x80);  // flip the top bit again: signed -> unsigned
+    *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
+    *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
+    *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
+    *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
+    *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
+    *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
+    return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_y_neon(  // filters 16 pixels across the horizontal edge at src
+        unsigned char *src,  // rows src - 4*pitch .. src + 3*pitch are read
+        int pitch,  // byte offset between consecutive rows
+        unsigned char blimit,  // the three thresholds are broadcast to every lane
+        unsigned char limit,
+        unsigned char thresh) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    src -= (pitch << 2);  // start four rows before the edge
+
+    q3 = vld1q_u8(src);  // row -4
+    src += pitch;
+    q4 = vld1q_u8(src);  // row -3
+    src += pitch;
+    q5 = vld1q_u8(src);  // row -2
+    src += pitch;
+    q6 = vld1q_u8(src);  // row -1
+    src += pitch;
+    q7 = vld1q_u8(src);  // row 0
+    src += pitch;
+    q8 = vld1q_u8(src);  // row +1
+    src += pitch;
+    q9 = vld1q_u8(src);  // row +2
+    src += pitch;
+    q10 = vld1q_u8(src);  // row +3
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);  // q4..q9 are overwritten with the results
+
+    src -= (pitch * 6);  // from row +3 back to row -3; rows -4 and +3 are not rewritten
+    vst1q_u8(src, q4);
+    src += pitch;
+    vst1q_u8(src, q5);
+    src += pitch;
+    vst1q_u8(src, q6);
+    src += pitch;
+    vst1q_u8(src, q7);
+    src += pitch;
+    vst1q_u8(src, q8);
+    src += pitch;
+    vst1q_u8(src, q9);
+    return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_uv_neon(  // filters 8 pixels of u and 8 of v in a single pass
+        unsigned char *u,  // rows u - 4*pitch .. u + 3*pitch are read, 8 bytes each
+        int pitch,  // byte offset between rows, used for both u and v
+        unsigned char blimit,  // the three thresholds are broadcast to every lane
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {  // same row range as u
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    u -= (pitch << 2);  // start four rows before the edge
+    v -= (pitch << 2);
+
+    d6 = vld1_u8(u);  // even-numbered d registers hold u rows, odd-numbered hold v rows
+    u += pitch;
+    d7 = vld1_u8(v);
+    v += pitch;
+    d8 = vld1_u8(u);
+    u += pitch;
+    d9 = vld1_u8(v);
+    v += pitch;
+    d10 = vld1_u8(u);
+    u += pitch;
+    d11 = vld1_u8(v);
+    v += pitch;
+    d12 = vld1_u8(u);
+    u += pitch;
+    d13 = vld1_u8(v);
+    v += pitch;
+    d14 = vld1_u8(u);
+    u += pitch;
+    d15 = vld1_u8(v);
+    v += pitch;
+    d16 = vld1_u8(u);
+    u += pitch;
+    d17 = vld1_u8(v);
+    v += pitch;
+    d18 = vld1_u8(u);
+    u += pitch;
+    d19 = vld1_u8(v);
+    v += pitch;
+    d20 = vld1_u8(u);
+    d21 = vld1_u8(v);
+
+    q3 = vcombine_u8(d6, d7);  // each q vector: u row in the low half, v row in the high half
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);  // q4..q9 are overwritten with the results
+
+    u -= (pitch * 6);  // from row +3 back to row -3; rows -4 and +3 are not rewritten
+    v -= (pitch * 6);
+    vst1_u8(u, vget_low_u8(q4));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q4));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q5));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q5));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q6));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q6));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q7));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q7));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q8));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q8));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q9));
+    vst1_u8(v, vget_high_u8(q9));
+    return;
+}
+
+void vp8_mbloop_filter_vertical_edge_y_neon(  // filters 16 rows across the vertical edge at src
+        unsigned char *src,  // bytes src - 4 .. src + 3 of 16 consecutive rows are read and rewritten
+        int pitch,  // byte offset between consecutive rows
+        unsigned char blimit,  // the three thresholds are broadcast to every lane
+        unsigned char limit,
+        unsigned char thresh) {
+    unsigned char *s1, *s2;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    s1 = src - 4;  // s1 walks rows 0..7
+    s2 = s1 + 8 * pitch;  // s2 walks rows 8..15
+    d6  = vld1_u8(s1);
+    s1 += pitch;
+    d7  = vld1_u8(s2);
+    s2 += pitch;
+    d8  = vld1_u8(s1);
+    s1 += pitch;
+    d9  = vld1_u8(s2);
+    s2 += pitch;
+    d10 = vld1_u8(s1);
+    s1 += pitch;
+    d11 = vld1_u8(s2);
+    s2 += pitch;
+    d12 = vld1_u8(s1);
+    s1 += pitch;
+    d13 = vld1_u8(s2);
+    s2 += pitch;
+    d14 = vld1_u8(s1);
+    s1 += pitch;
+    d15 = vld1_u8(s2);
+    s2 += pitch;
+    d16 = vld1_u8(s1);
+    s1 += pitch;
+    d17 = vld1_u8(s2);
+    s2 += pitch;
+    d18 = vld1_u8(s1);
+    s1 += pitch;
+    d19 = vld1_u8(s2);
+    s2 += pitch;
+    d20 = vld1_u8(s1);
+    d21 = vld1_u8(s2);
+
+    q3 = vcombine_u8(d6, d7);  // row k in the low half, row k + 8 in the high half
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));  // 32/16/8-bit vtrn cascade:
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));  // transposes the 8x8 byte
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));  // block in each half, so
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));  // each vector becomes a column
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];  // q3..q10 now hold byte columns src - 4 .. src + 3 of the 16 rows
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);  // q4..q9 are overwritten; q3 and q10 are not
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));  // same cascade again:
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));  // transposes columns back
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));  // into rows
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];  // back to row layout: row k in the low half, row k + 8 in the high half
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    s1 -= 7 * pitch;  // rewind both pointers to their first row (each advanced 7 times while loading)
+    s2 -= 7 * pitch;
+
+    vst1_u8(s1, vget_low_u8(q3));  // all 8 bytes of every row are stored, including the unfiltered outer two
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q3));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q4));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q4));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q5));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q5));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q6));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q6));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q7));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q7));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q8));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q8));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q9));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q9));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q10));
+    vst1_u8(s2, vget_high_u8(q10));
+    return;
+}
+// Runs vp8_mbloop_filter_neon across one vertical edge of the u and v planes.
+void vp8_mbloop_filter_vertical_edge_uv_neon(
+        unsigned char *u,      // u-plane edge; 8 rows of u[-4..3] are rewritten
+        int pitch,             // byte step between rows, used for both planes
+        unsigned char blimit,  // filter parameter, splatted to 16 lanes
+        unsigned char limit,   // filter parameter, splatted to 16 lanes
+        unsigned char thresh,  // filter parameter, splatted to 16 lanes
+        unsigned char *v) {    // v-plane edge; 8 rows of v[-4..3] are rewritten
+    unsigned char *us, *ud;
+    unsigned char *vs, *vd;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+    // Splat each scalar filter parameter into a full 128-bit vector.
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+    // Load 8 bytes per row from 4 bytes left of the edge, alternating u and v.
+    us = u - 4;
+    vs = v - 4;
+    d6 = vld1_u8(us);
+    us += pitch;
+    d7 = vld1_u8(vs);
+    vs += pitch;
+    d8 = vld1_u8(us);
+    us += pitch;
+    d9 = vld1_u8(vs);
+    vs += pitch;
+    d10 = vld1_u8(us);
+    us += pitch;
+    d11 = vld1_u8(vs);
+    vs += pitch;
+    d12 = vld1_u8(us);
+    us += pitch;
+    d13 = vld1_u8(vs);
+    vs += pitch;
+    d14 = vld1_u8(us);
+    us += pitch;
+    d15 = vld1_u8(vs);
+    vs += pitch;
+    d16 = vld1_u8(us);
+    us += pitch;
+    d17 = vld1_u8(vs);
+    vs += pitch;
+    d18 = vld1_u8(us);
+    us += pitch;
+    d19 = vld1_u8(vs);
+    vs += pitch;
+    d20 = vld1_u8(us);
+    d21 = vld1_u8(vs);
+    // Pair row i of both planes: low 64 bits = u row, high 64 bits = v row.
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+    // Transpose the 8x8 byte tile in each half with 32-, 16- and 8-bit vtrn.
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+    // Each of q3..q10 now holds one pixel column (u in low half, v in high).
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+    // q3/q10 go in by value only; q4..q9 also by address, presumably as outputs.
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);
+    // Transpose back to row order using the same three vtrn stages.
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+    // Store the low halves back as the 8 u rows, again starting at u - 4.
+    ud = u - 4;
+    vst1_u8(ud, vget_low_u8(q3));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q4));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q5));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q6));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q7));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q8));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q9));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q10));
+    // Store the high halves back as the 8 v rows, again starting at v - 4.
+    vd = v - 4;
+    vst1_u8(vd, vget_high_u8(q3));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q4));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q5));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q6));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q7));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q8));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q9));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q10));
+    return;
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/libs/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
new file mode 100644
index 0000000000..373afa6ed3
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
@@ -0,0 +1,123 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+// Multipliers scaled by 2^16 (20091/65536 ~ 0.3066, 35468/65536 ~ 0.5412).
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2       = 35468;  // NOTE(review): exceeds INT16_MAX; stored value is implementation-defined (typically -30068)
+// Two-pass 4x4 inverse transform of `input`; adds the predictor, writes to dst.
+void vp8_short_idct4x4llm_neon(
+        int16_t *input,           // 16 coefficients, read as 4 vectors of 4
+        unsigned char *pred_ptr,  // predictor: 4 rows, 4 bytes read per row
+        int pred_stride,          // byte step between predictor rows
+        unsigned char *dst_ptr,   // output: 4 rows, 4 bytes written per row
+        int dst_stride) {         // byte step between output rows
+    int i;
+    uint32x2_t d6u32 = vdup_n_u32(0);
+    uint8x8_t d1u8;
+    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+    uint16x8_t q1u16;
+    int16x8_t q1s16, q2s16, q3s16, q4s16;
+    int32x2x2_t v2tmp0, v2tmp1;
+    int16x4x2_t v2tmp2, v2tmp3;
+    // Load the coefficients four at a time: input[0..3], [4..7], [8..11], [12..15].
+    d2 = vld1_s16(input);
+    d3 = vld1_s16(input + 4);
+    d4 = vld1_s16(input + 8);
+    d5 = vld1_s16(input + 12);
+
+    // 1st for loop: first 1-D pass; vectors 0/2 and 1/3 are paired up
+    q1s16 = vcombine_s16(d2, d4);  // Swap d3 d4 here
+    q2s16 = vcombine_s16(d3, d5);
+    // vqdmulh gives (2 * x * c) >> 16; the >> 1 further down makes it (x * c) >> 16.
+    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
+    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
+
+    q3s16 = vshrq_n_s16(q3s16, 1);
+    q4s16 = vshrq_n_s16(q4s16, 1);
+    // Add x back, so each product becomes x + ((x * c) >> 16).
+    q3s16 = vqaddq_s16(q3s16, q2s16);
+    q4s16 = vqaddq_s16(q4s16, q2s16);
+    // NOTE(review): q3s16 equals (x * 35468) >> 16 only if sinpi8sqrt2 is stored as 35468 - 65536 - confirm.
+    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
+    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
+    // Output butterfly of this pass.
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+    // Transpose the 4x4 block (32-bit, then 16-bit vtrn) ahead of the next pass.
+    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+                      vreinterpret_s16_s32(v2tmp1.val[0]));
+    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+                      vreinterpret_s16_s32(v2tmp1.val[1]));
+
+    // 2nd for loop: second 1-D pass, same arithmetic on the transposed data
+    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
+    q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
+
+    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
+    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
+
+    q3s16 = vshrq_n_s16(q3s16, 1);
+    q4s16 = vshrq_n_s16(q4s16, 1);
+
+    q3s16 = vqaddq_s16(q3s16, q2s16);
+    q4s16 = vqaddq_s16(q4s16, q2s16);
+
+    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
+    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+    // Rounding shift right by 3 after the second pass.
+    d2 = vrshr_n_s16(d2, 3);
+    d3 = vrshr_n_s16(d3, 3);
+    d4 = vrshr_n_s16(d4, 3);
+    d5 = vrshr_n_s16(d5, 3);
+    // Transpose back; q1s16 then holds output rows 0|1 and q2s16 rows 2|3.
+    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+                      vreinterpret_s16_s32(v2tmp1.val[0]));
+    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+                      vreinterpret_s16_s32(v2tmp1.val[1]));
+
+    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
+    q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
+
+    // Add predictor and store, two rows per iteration (label: dc_only_idct_add)
+    for (i = 0; i < 2; i++, q1s16 = q2s16) {  // 2nd iteration works on q2s16
+        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
+        pred_ptr += pred_stride;
+        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
+        pred_ptr += pred_stride;
+        // Widening add of the 8 predictor bytes, then saturating narrow to u8.
+        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
+                         vreinterpret_u8_u32(d6u32));
+        d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+        // NOTE(review): uint32_t casts (here and on the loads) assume 4-byte access is OK.
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
+        dst_ptr += dst_stride;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
+        dst_ptr += dst_stride;
+    }
+    return;
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
new file mode 100644
index 0000000000..4c2efc92b1
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
@@ -0,0 +1,1754 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vpx_ports/mem.h"
+// Taps per x/y offset 0..7: 6 taps padded to 8 bytes so one vld1_s8 loads a row.
+static const int8_t vp8_sub_pel_filters[8][8] = {
+    {0,  0,  128,   0,   0, 0, 0, 0},  /* note that 1/8 pel positions are */
+    {0, -6,  123,  12,  -1, 0, 0, 0},  /*    just as per alpha -0.5 bicubic */
+    {2, -11, 108,  36,  -8, 1, 0, 0},  /* New 1/4 pel 6 tap filter */
+    {0, -9,   93,  50,  -6, 0, 0, 0},
+    {3, -16,  77,  77, -16, 3, 0, 0},  /* New 1/2 pel 6 tap filter */
+    {0, -6,   50,  93,  -9, 0, 0, 0},
+    {1, -8,   36, 108, -11, 2, 0, 0},  /* New 1/4 pel 6 tap filter */
+    {0, -1,   12, 123,  -6, 0, 0, 0},
+};  // NOTE(review): 128 (row 0) exceeds INT8_MAX; users apply vabs_s8 and view the result as u8.
+// 4x4 six-tap sub-pixel prediction: horizontal pass (xoffset), then vertical (yoffset).
+void vp8_sixtap_predict4x4_neon(
+        unsigned char *src_ptr,   // top-left source pixel of the 4x4 block
+        int src_pixels_per_line,  // byte step between source rows
+        int xoffset,              // vp8_sub_pel_filters row for pass 1 (0..7, unchecked)
+        int yoffset,              // vp8_sub_pel_filters row for pass 2 (0..7, unchecked)
+        unsigned char *dst_ptr,   // output: 4 rows, 4 bytes written per row
+        int dst_pitch) {          // byte step between output rows
+    unsigned char *src;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8;
+    uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8;
+    uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64;
+    uint32x2x2_t d0u32x2, d1u32x2;
+    // xoffset == 0: no horizontal filtering, go straight to the vertical pass.
+    if (xoffset == 0) {  // secondpass_filter4x4_only
+        uint32x2_t d27u32 = vdup_n_u32(0);
+        uint32x2_t d28u32 = vdup_n_u32(0);
+        uint32x2_t d29u32 = vdup_n_u32(0);
+        uint32x2_t d30u32 = vdup_n_u32(0);
+        uint32x2_t d31u32 = vdup_n_u32(0);
+
+        // load second_pass filter
+        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+        d0s8 = vdup_lane_s8(dtmps8, 0);
+        d1s8 = vdup_lane_s8(dtmps8, 1);
+        d2s8 = vdup_lane_s8(dtmps8, 2);
+        d3s8 = vdup_lane_s8(dtmps8, 3);
+        d4s8 = vdup_lane_s8(dtmps8, 4);
+        d5s8 = vdup_lane_s8(dtmps8, 5);
+        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));  // magnitudes; sign = mlal vs mlsl
+        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+        // NOTE(review): tap 128 presumably sits as -128 in int8_t; vabs_s8 keeps it, u8 view = 128.
+        // load src data: 9 rows of 4 bytes from 2 rows above, two rows per d-reg
+        src = src_ptr - src_pixels_per_line * 2;
+        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0);
+        src += src_pixels_per_line;
+        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1);
+        src += src_pixels_per_line;
+        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0);
+        src += src_pixels_per_line;
+        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1);
+        src += src_pixels_per_line;
+        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0);
+        src += src_pixels_per_line;
+        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1);
+        src += src_pixels_per_line;
+        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0);
+        src += src_pixels_per_line;
+        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1);
+        src += src_pixels_per_line;
+        d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0);
+
+        d27u8 = vreinterpret_u8_u32(d27u32);
+        d28u8 = vreinterpret_u8_u32(d28u32);
+        d29u8 = vreinterpret_u8_u32(d29u32);
+        d30u8 = vreinterpret_u8_u32(d30u32);
+        d31u8 = vreinterpret_u8_u32(d31u32);
+        // Row pairs shifted by one row: d23 = rows 1|2, d24 = 3|4, d25 = 5|6, d26 = 7|8.
+        d23u8 = vext_u8(d27u8, d28u8, 4);
+        d24u8 = vext_u8(d28u8, d29u8, 4);
+        d25u8 = vext_u8(d29u8, d30u8, 4);
+        d26u8 = vext_u8(d30u8, d31u8, 4);
+        // q3/q4 accumulate taps 0, 4, 2; q5/q6 taps 5, 1, 3 (taps 1 and 4 are subtracted).
+        q3u16 = vmull_u8(d27u8, d0u8);
+        q4u16 = vmull_u8(d28u8, d0u8);
+        q5u16 = vmull_u8(d25u8, d5u8);
+        q6u16 = vmull_u8(d26u8, d5u8);
+
+        q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
+        q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
+        q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
+        q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
+
+        q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
+        q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
+        q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
+        q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
+
+        q3s16 = vreinterpretq_s16_u16(q3u16);
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        // Saturating signed add of the two partial sums.
+        q5s16 = vqaddq_s16(q5s16, q3s16);
+        q6s16 = vqaddq_s16(q6s16, q4s16);
+        // Round, shift right by 7 and saturate to unsigned 8 bits.
+        d3u8 = vqrshrun_n_s16(q5s16, 7);
+        d4u8 = vqrshrun_n_s16(q6s16, 7);
+        // Store 4 bytes per output row.
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
+        return;
+    }
+
+    // load first_pass filter
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    // First pass: output_height lines x output_width columns (9x4)
+    // Start 2 bytes left of src_ptr, and 2 rows up if a second pass follows.
+    if (yoffset == 0)  // firstpass_filter4x4_only
+        src = src_ptr - 2;
+    else
+        src = src_ptr - 2 - (src_pixels_per_line * 2);
+    // NOTE(review): 16 bytes are read per row although only bytes 0..8 are used.
+    q3u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q4u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q5u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q6u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    // d18..d21 = bytes 5..12 of each row (operand for tap 5).
+    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+    // vswp here: q3u8 = first 8 bytes of rows 0|1, q5u8 = of rows 2|3
+    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
+    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
+    // Each vzip .val[0] packs 4 bytes from two rows, so one multiply covers two rows.
+    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
+                       vreinterpret_u32_u8(d19u8));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
+                       vreinterpret_u32_u8(d21u8));
+    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
+    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
+
+    // keep original src data in q4 q6
+    q4u64 = vreinterpretq_u64_u8(q3u8);
+    q6u64 = vreinterpretq_u64_u8(q5u8);
+    // Tap 0 on bytes 0..3; the u64 shifts by 8*k bits line up bytes k..k+3 for later taps.
+    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
+                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
+                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
+    q9u64 = vshrq_n_u64(q4u64, 8);
+    q10u64 = vshrq_n_u64(q6u64, 8);
+    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
+    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
+    // Tap 1 (subtracted) on bytes 1..4.
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
+                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
+                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
+    q3u64 = vshrq_n_u64(q4u64, 32);
+    q5u64 = vshrq_n_u64(q6u64, 32);
+    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
+    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
+    // Tap 4 (subtracted) on bytes 4..7.
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
+                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
+                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
+    q9u64 = vshrq_n_u64(q4u64, 16);
+    q10u64 = vshrq_n_u64(q6u64, 16);
+    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
+    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
+    // Tap 2 (added) on bytes 2..5.
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
+                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
+                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
+    q3u64 = vshrq_n_u64(q4u64, 24);
+    q5u64 = vshrq_n_u64(q6u64, 24);
+    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
+    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
+    // Tap 3 on bytes 3..6, kept as a separate product in q9u16/q10u16.
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
+                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
+                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
+    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
+    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
+    // Saturating signed add of both partial sums, then round >> 7 and narrow to u8.
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+    q7s16 = vqaddq_s16(q7s16, q9s16);
+    q8s16 = vqaddq_s16(q8s16, q10s16);
+
+    d27u8 = vqrshrun_n_s16(q7s16, 7);
+    d28u8 = vqrshrun_n_s16(q8s16, 7);
+    // d27u8 = filtered rows 0|1, d28u8 = filtered rows 2|3 (4 bytes per row).
+    if (yoffset == 0) {  // firstpass_filter4x4_only
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
+        dst_ptr += dst_pitch;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
+        return;
+    }
+
+    // First Pass on rest 5-line data
+    q3u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q4u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q5u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q6u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q11u8 = vld1q_u8(src);
+    // Four rows are processed in pairs as above; the 9th row (q11u8) separately.
+    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+    // vswp here
+    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
+    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
+                       vreinterpret_u32_u8(d19u8));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
+                       vreinterpret_u32_u8(d21u8));
+    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5);  // 9th row, tap 5
+    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
+    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
+    q12u16 = vmull_u8(d31u8, d5u8);
+
+    q4u64 = vreinterpretq_u64_u8(q3u8);
+    q6u64 = vreinterpretq_u64_u8(q5u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
+                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
+                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
+    q9u64 = vshrq_n_u64(q4u64, 8);
+    q10u64 = vshrq_n_u64(q6u64, 8);
+    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
+    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
+    q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8);  // 9th row, tap 0
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
+                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
+                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
+    q3u64 = vshrq_n_u64(q4u64, 32);
+    q5u64 = vshrq_n_u64(q6u64, 32);
+    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1);  // 9th row, tap 1
+    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
+    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
+                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
+                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
+    q9u64 = vshrq_n_u64(q4u64, 16);
+    q10u64 = vshrq_n_u64(q6u64, 16);
+    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4);  // 9th row, tap 4
+    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
+    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
+                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
+                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
+    q3u64 = vshrq_n_u64(q4u64, 24);
+    q5u64 = vshrq_n_u64(q6u64, 24);
+    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2);  // 9th row, tap 2
+    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
+    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
+                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
+    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
+                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
+    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3);  // 9th row, tap 3
+    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
+    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
+    q11u16 = vmull_u8(d31u8, d3u8);
+
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+    q11s16 = vreinterpretq_s16_u16(q11u16);
+    q12s16 = vreinterpretq_s16_u16(q12u16);
+    q7s16 = vqaddq_s16(q7s16, q9s16);
+    q8s16 = vqaddq_s16(q8s16, q10s16);
+    q12s16 = vqaddq_s16(q12s16, q11s16);
+    // Round >> 7 and narrow: d29u8 = rows 4|5, d30u8 = rows 6|7, d31u8 = row 8 (low 4 bytes used).
+    d29u8 = vqrshrun_n_s16(q7s16, 7);
+    d30u8 = vqrshrun_n_s16(q8s16, 7);
+    d31u8 = vqrshrun_n_s16(q12s16, 7);
+
+    // Second pass: 4x4 (vertical filter over the 9 first-pass rows, yoffset taps)
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+    // Row pairs shifted by one row: d23 = rows 1|2, d24 = 3|4, d25 = 5|6, d26 = 7|8.
+    d23u8 = vext_u8(d27u8, d28u8, 4);
+    d24u8 = vext_u8(d28u8, d29u8, 4);
+    d25u8 = vext_u8(d29u8, d30u8, 4);
+    d26u8 = vext_u8(d30u8, d31u8, 4);
+    // q3/q4 accumulate taps 0, 4, 2; q5/q6 taps 5, 1, 3 (taps 1 and 4 are subtracted).
+    q3u16 = vmull_u8(d27u8, d0u8);
+    q4u16 = vmull_u8(d28u8, d0u8);
+    q5u16 = vmull_u8(d25u8, d5u8);
+    q6u16 = vmull_u8(d26u8, d5u8);
+
+    q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
+    q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
+    q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
+    q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
+
+    q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
+    q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
+    q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
+    q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+    // Saturating signed add of the two partial sums.
+    q5s16 = vqaddq_s16(q5s16, q3s16);
+    q6s16 = vqaddq_s16(q6s16, q4s16);
+    // Round, shift right by 7 and saturate to unsigned 8 bits.
+    d3u8 = vqrshrun_n_s16(q5s16, 7);
+    d4u8 = vqrshrun_n_s16(q6s16, 7);
+    // Store 4 bytes per output row.
+    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
+    dst_ptr += dst_pitch;
+    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
+    dst_ptr += dst_pitch;
+    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
+    dst_ptr += dst_pitch;
+    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
+    return;
+}
+
+void vp8_sixtap_predict8x4_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    unsigned char *src;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+    uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
+    uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
+    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
+
+    if (xoffset == 0) {  // secondpass_filter8x4_only
+        // load second_pass filter
+        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+        d0s8 = vdup_lane_s8(dtmps8, 0);
+        d1s8 = vdup_lane_s8(dtmps8, 1);
+        d2s8 = vdup_lane_s8(dtmps8, 2);
+        d3s8 = vdup_lane_s8(dtmps8, 3);
+        d4s8 = vdup_lane_s8(dtmps8, 4);
+        d5s8 = vdup_lane_s8(dtmps8, 5);
+        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+        // load src data
+        src = src_ptr - src_pixels_per_line * 2;
+        d22u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d23u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d24u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d25u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d26u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d27u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d28u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d29u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d30u8 = vld1_u8(src);
+
+        q3u16 = vmull_u8(d22u8, d0u8);
+        q4u16 = vmull_u8(d23u8, d0u8);
+        q5u16 = vmull_u8(d24u8, d0u8);
+        q6u16 = vmull_u8(d25u8, d0u8);
+
+        q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+        q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+        q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+        q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+
+        q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+        q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+        q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+        q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+
+        q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+        q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+        q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+        q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+
+        q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+        q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+        q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+        q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+
+        q7u16 = vmull_u8(d25u8, d3u8);
+        q8u16 = vmull_u8(d26u8, d3u8);
+        q9u16 = vmull_u8(d27u8, d3u8);
+        q10u16 = vmull_u8(d28u8, d3u8);
+
+        q3s16 = vreinterpretq_s16_u16(q3u16);
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q7s16 = vreinterpretq_s16_u16(q7u16);
+        q8s16 = vreinterpretq_s16_u16(q8u16);
+        q9s16 = vreinterpretq_s16_u16(q9u16);
+        q10s16 = vreinterpretq_s16_u16(q10u16);
+
+        q7s16 = vqaddq_s16(q7s16, q3s16);
+        q8s16 = vqaddq_s16(q8s16, q4s16);
+        q9s16 = vqaddq_s16(q9s16, q5s16);
+        q10s16 = vqaddq_s16(q10s16, q6s16);
+
+        d6u8 = vqrshrun_n_s16(q7s16, 7);
+        d7u8 = vqrshrun_n_s16(q8s16, 7);
+        d8u8 = vqrshrun_n_s16(q9s16, 7);
+        d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+        vst1_u8(dst_ptr, d6u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d7u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d8u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d9u8);
+        return;
+    }
+
+    // load first_pass filter
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    // First pass: output_height lines x output_width columns (9x4)
+    if (yoffset == 0)  // firstpass_filter4x4_only
+        src = src_ptr - 2;
+    else
+        src = src_ptr - 2 - (src_pixels_per_line * 2);
+    q3u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q4u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q5u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q6u8 = vld1q_u8(src);
+
+    q7u16  = vmull_u8(vget_low_u8(q3u8), d0u8);
+    q8u16  = vmull_u8(vget_low_u8(q4u8), d0u8);
+    q9u16  = vmull_u8(vget_low_u8(q5u8), d0u8);
+    q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+    q7u16  = vmlsl_u8(q7u16, d28u8, d1u8);
+    q8u16  = vmlsl_u8(q8u16, d29u8, d1u8);
+    q9u16  = vmlsl_u8(q9u16, d30u8, d1u8);
+    q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+    q7u16  = vmlsl_u8(q7u16, d28u8, d4u8);
+    q8u16  = vmlsl_u8(q8u16, d29u8, d4u8);
+    q9u16  = vmlsl_u8(q9u16, d30u8, d4u8);
+    q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+    q7u16  = vmlal_u8(q7u16, d28u8, d2u8);
+    q8u16  = vmlal_u8(q8u16, d29u8, d2u8);
+    q9u16  = vmlal_u8(q9u16, d30u8, d2u8);
+    q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+    q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
+    q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+    q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+    q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+
+    q3u16 = vmull_u8(d28u8, d3u8);
+    q4u16 = vmull_u8(d29u8, d3u8);
+    q5u16 = vmull_u8(d30u8, d3u8);
+    q6u16 = vmull_u8(d31u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+
+    q7s16 = vqaddq_s16(q7s16, q3s16);
+    q8s16 = vqaddq_s16(q8s16, q4s16);
+    q9s16 = vqaddq_s16(q9s16, q5s16);
+    q10s16 = vqaddq_s16(q10s16, q6s16);
+
+    d22u8 = vqrshrun_n_s16(q7s16, 7);
+    d23u8 = vqrshrun_n_s16(q8s16, 7);
+    d24u8 = vqrshrun_n_s16(q9s16, 7);
+    d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+    if (yoffset == 0) {  // firstpass_filter8x4_only
+        vst1_u8(dst_ptr, d22u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d23u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d24u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d25u8);
+        return;
+    }
+
+    // First Pass on rest 5-line data
+    src += src_pixels_per_line;
+    q3u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q4u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q5u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q6u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q7u8 = vld1q_u8(src);
+
+    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+    q8u16  = vmlsl_u8(q8u16, d27u8, d1u8);
+    q9u16  = vmlsl_u8(q9u16, d28u8, d1u8);
+    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+    q8u16  = vmlsl_u8(q8u16, d27u8, d4u8);
+    q9u16  = vmlsl_u8(q9u16, d28u8, d4u8);
+    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+    q8u16  = vmlal_u8(q8u16, d27u8, d2u8);
+    q9u16  = vmlal_u8(q9u16, d28u8, d2u8);
+    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+    q8u16  = vmlal_u8(q8u16, d27u8, d5u8);
+    q9u16  = vmlal_u8(q9u16, d28u8, d5u8);
+    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+    q3u16 = vmull_u8(d27u8, d3u8);
+    q4u16 = vmull_u8(d28u8, d3u8);
+    q5u16 = vmull_u8(d29u8, d3u8);
+    q6u16 = vmull_u8(d30u8, d3u8);
+    q7u16 = vmull_u8(d31u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+    q11s16 = vreinterpretq_s16_u16(q11u16);
+    q12s16 = vreinterpretq_s16_u16(q12u16);
+
+    q8s16 = vqaddq_s16(q8s16, q3s16);
+    q9s16 = vqaddq_s16(q9s16, q4s16);
+    q10s16 = vqaddq_s16(q10s16, q5s16);
+    q11s16 = vqaddq_s16(q11s16, q6s16);
+    q12s16 = vqaddq_s16(q12s16, q7s16);
+
+    d26u8 = vqrshrun_n_s16(q8s16, 7);
+    d27u8 = vqrshrun_n_s16(q9s16, 7);
+    d28u8 = vqrshrun_n_s16(q10s16, 7);
+    d29u8 = vqrshrun_n_s16(q11s16, 7);
+    d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+    // Second pass: 8x4
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    q3u16 = vmull_u8(d22u8, d0u8);
+    q4u16 = vmull_u8(d23u8, d0u8);
+    q5u16 = vmull_u8(d24u8, d0u8);
+    q6u16 = vmull_u8(d25u8, d0u8);
+
+    q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+    q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+    q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+    q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+
+    q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+    q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+    q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+    q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+
+    q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+    q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+    q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+    q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+
+    q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+    q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+    q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+    q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+
+    q7u16 = vmull_u8(d25u8, d3u8);
+    q8u16 = vmull_u8(d26u8, d3u8);
+    q9u16 = vmull_u8(d27u8, d3u8);
+    q10u16 = vmull_u8(d28u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+
+    q7s16 = vqaddq_s16(q7s16, q3s16);
+    q8s16 = vqaddq_s16(q8s16, q4s16);
+    q9s16 = vqaddq_s16(q9s16, q5s16);
+    q10s16 = vqaddq_s16(q10s16, q6s16);
+
+    d6u8 = vqrshrun_n_s16(q7s16, 7);
+    d7u8 = vqrshrun_n_s16(q8s16, 7);
+    d8u8 = vqrshrun_n_s16(q9s16, 7);
+    d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+    vst1_u8(dst_ptr, d6u8);
+    dst_ptr += dst_pitch;
+    vst1_u8(dst_ptr, d7u8);
+    dst_ptr += dst_pitch;
+    vst1_u8(dst_ptr, d8u8);
+    dst_ptr += dst_pitch;
+    vst1_u8(dst_ptr, d9u8);
+    return;
+}
+
+void vp8_sixtap_predict8x8_neon(  // 8x8 six-tap sub-pixel prediction written with NEON intrinsics
+        unsigned char *src_ptr,  // source pixel aligned with output (0,0); neighbours from row/column -2 onward are also read
+        int src_pixels_per_line,  // source stride, added to the pointer to step down one row
+        int xoffset,  // row of vp8_sub_pel_filters used horizontally; 0 selects the vertical-only path
+        int yoffset,  // row of vp8_sub_pel_filters used vertically; 0 (with xoffset != 0) means horizontal-only
+        unsigned char *dst_ptr,  // output: 8 rows of 8 bytes are stored here
+        int dst_pitch) {  // output stride, added to dst_ptr after each stored row
+    unsigned char *src, *tmpp;
+    unsigned char tmp[64];  // first-pass results for rows 0-7 of the two-pass path (8 rows x 8 bytes)
+    int i;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;  // dN names are 64-bit vectors, qN names 128-bit vectors
+    uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
+    uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
+
+    if (xoffset == 0) {  // secondpass_filter8x8_only: vertical filter applied directly to the source
+        // load second_pass filter: 8 bytes are read, lanes 0-5 are broadcast as the six taps
+        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+        d0s8 = vdup_lane_s8(dtmps8, 0);
+        d1s8 = vdup_lane_s8(dtmps8, 1);
+        d2s8 = vdup_lane_s8(dtmps8, 2);
+        d3s8 = vdup_lane_s8(dtmps8, 3);
+        d4s8 = vdup_lane_s8(dtmps8, 4);
+        d5s8 = vdup_lane_s8(dtmps8, 5);
+        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));  // tap magnitudes; the sign is applied by using vmlal vs vmlsl below
+        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));  // NOTE(review): presumably taps 1 and 4 are the negative table entries; table not visible here
+        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+        // load src data: 13 rows of 8 pixels, starting two rows above src_ptr
+        src = src_ptr - src_pixels_per_line * 2;
+        d18u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d19u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d20u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d21u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d22u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d23u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d24u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d25u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d26u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d27u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d28u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d29u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d30u8 = vld1_u8(src);
+
+        for (i = 2; i > 0; i--) {  // two iterations, four output rows each
+            q3u16 = vmull_u8(d18u8, d0u8);  // tap 0 on row r-2 (r = output row)
+            q4u16 = vmull_u8(d19u8, d0u8);
+            q5u16 = vmull_u8(d20u8, d0u8);
+            q6u16 = vmull_u8(d21u8, d0u8);
+
+            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);  // minus tap 1 on row r-1
+            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);  // minus tap 4 on row r+2
+            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);  // plus tap 2 on row r
+            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);  // plus tap 5 on row r+3
+            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+            q7u16 = vmull_u8(d21u8, d3u8);  // tap 3 on row r+1, kept in its own accumulator
+            q8u16 = vmull_u8(d22u8, d3u8);
+            q9u16 = vmull_u8(d23u8, d3u8);
+            q10u16 = vmull_u8(d24u8, d3u8);
+
+            q3s16 = vreinterpretq_s16_u16(q3u16);  // view the accumulators as signed for the saturating add
+            q4s16 = vreinterpretq_s16_u16(q4u16);
+            q5s16 = vreinterpretq_s16_u16(q5u16);
+            q6s16 = vreinterpretq_s16_u16(q6u16);
+            q7s16 = vreinterpretq_s16_u16(q7u16);
+            q8s16 = vreinterpretq_s16_u16(q8u16);
+            q9s16 = vreinterpretq_s16_u16(q9u16);
+            q10s16 = vreinterpretq_s16_u16(q10u16);
+
+            q7s16 = vqaddq_s16(q7s16, q3s16);  // saturating add of the tap-3 product to the other five terms
+            q8s16 = vqaddq_s16(q8s16, q4s16);
+            q9s16 = vqaddq_s16(q9s16, q5s16);
+            q10s16 = vqaddq_s16(q10s16, q6s16);
+
+            d6u8 = vqrshrun_n_s16(q7s16, 7);  // rounding shift right by 7, saturated to unsigned 8-bit
+            d7u8 = vqrshrun_n_s16(q8s16, 7);
+            d8u8 = vqrshrun_n_s16(q9s16, 7);
+            d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+            d18u8 = d22u8;  // slide the row window down four rows for the next iteration
+            d19u8 = d23u8;
+            d20u8 = d24u8;
+            d21u8 = d25u8;
+            d22u8 = d26u8;
+            d23u8 = d27u8;
+            d24u8 = d28u8;
+            d25u8 = d29u8;
+            d26u8 = d30u8;
+
+            vst1_u8(dst_ptr, d6u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d7u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d8u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d9u8);
+            dst_ptr += dst_pitch;
+        }
+        return;
+    }
+
+    // load first_pass filter (horizontal taps selected by xoffset)
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));  // tap magnitudes; the sign is applied by using vmlal vs vmlsl below
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    // First pass (horizontal): 8 rows when yoffset == 0, otherwise 13 rows (8 in the loop + 5 after it)
+    if (yoffset == 0)  // horizontal-only: begin on the block's first row, two pixels to the left
+        src = src_ptr - 2;
+    else  // two-pass: additionally begin two rows above, for the vertical taps
+        src = src_ptr - 2 - (src_pixels_per_line * 2);
+
+    tmpp = tmp;
+    for (i = 2; i > 0; i--) {  // two iterations, four rows each
+        q3u8 = vld1q_u8(src);  // 16 bytes are loaded per row; only bytes 0-12 are used below
+        src += src_pixels_per_line;
+        q4u8 = vld1q_u8(src);
+        src += src_pixels_per_line;
+        q5u8 = vld1q_u8(src);
+        src += src_pixels_per_line;
+        q6u8 = vld1q_u8(src);
+        src += src_pixels_per_line;
+
+        __builtin_prefetch(src);  // prefetch hints for the next three source rows
+        __builtin_prefetch(src + src_pixels_per_line);
+        __builtin_prefetch(src + src_pixels_per_line * 2);
+
+        q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);  // tap 0 on bytes 0-7 (pixels x-2)
+        q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+        q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+        q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);  // bytes 1-8 (pixels x-1)
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+        q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);  // minus tap 1
+        q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
+        q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
+        q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);  // bytes 4-11 (pixels x+2)
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+        q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);  // minus tap 4
+        q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
+        q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
+        q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);  // bytes 2-9 (pixels x)
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+        q7u16 = vmlal_u8(q7u16, d28u8, d2u8);  // plus tap 2
+        q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
+        q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
+        q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);  // bytes 5-12 (pixels x+3)
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+        q7u16 = vmlal_u8(q7u16, d28u8, d5u8);  // plus tap 5
+        q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+        q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+        q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);  // bytes 3-10 (pixels x+1)
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+
+        q3u16 = vmull_u8(d28u8, d3u8);  // tap 3 product, kept in its own accumulator
+        q4u16 = vmull_u8(d29u8, d3u8);
+        q5u16 = vmull_u8(d30u8, d3u8);
+        q6u16 = vmull_u8(d31u8, d3u8);
+
+        q3s16 = vreinterpretq_s16_u16(q3u16);  // view the accumulators as signed for the saturating add
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q7s16 = vreinterpretq_s16_u16(q7u16);
+        q8s16 = vreinterpretq_s16_u16(q8u16);
+        q9s16 = vreinterpretq_s16_u16(q9u16);
+        q10s16 = vreinterpretq_s16_u16(q10u16);
+
+        q7s16 = vqaddq_s16(q7s16, q3s16);  // saturating add of the tap-3 product to the other five terms
+        q8s16 = vqaddq_s16(q8s16, q4s16);
+        q9s16 = vqaddq_s16(q9s16, q5s16);
+        q10s16 = vqaddq_s16(q10s16, q6s16);
+
+        d22u8 = vqrshrun_n_s16(q7s16, 7);  // rounding shift right by 7, saturated to unsigned 8-bit
+        d23u8 = vqrshrun_n_s16(q8s16, 7);
+        d24u8 = vqrshrun_n_s16(q9s16, 7);
+        d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+        if (yoffset == 0) {  // horizontal-only: filtered rows go straight to dst
+            vst1_u8(dst_ptr, d22u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d23u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d24u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d25u8);
+            dst_ptr += dst_pitch;
+        } else {  // two-pass: stash the rows in tmp, packed 8 bytes apart
+            vst1_u8(tmpp, d22u8);
+            tmpp += 8;
+            vst1_u8(tmpp, d23u8);
+            tmpp += 8;
+            vst1_u8(tmpp, d24u8);
+            tmpp += 8;
+            vst1_u8(tmpp, d25u8);
+            tmpp += 8;
+        }
+    }
+    if (yoffset == 0)  // horizontal-only result is complete
+        return;
+
+    // First Pass on rest 5-line data (input rows 8-12; the loop above already advanced src past the first 8)
+    q3u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q4u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q5u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q6u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q7u8 = vld1q_u8(src);
+
+    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);  // tap 0 on bytes 0-7 (pixels x-2)
+    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);  // bytes 1-8 (pixels x-1)
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+    q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);  // minus tap 1
+    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);  // bytes 4-11 (pixels x+2)
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+    q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);  // minus tap 4
+    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);  // bytes 2-9 (pixels x)
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+    q8u16 = vmlal_u8(q8u16, d27u8, d2u8);  // plus tap 2
+    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);  // bytes 5-12 (pixels x+3)
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+    q8u16 = vmlal_u8(q8u16, d27u8, d5u8);  // plus tap 5
+    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);  // bytes 3-10 (pixels x+1)
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+    q3u16 = vmull_u8(d27u8, d3u8);  // tap 3 product, kept in its own accumulator
+    q4u16 = vmull_u8(d28u8, d3u8);
+    q5u16 = vmull_u8(d29u8, d3u8);
+    q6u16 = vmull_u8(d30u8, d3u8);
+    q7u16 = vmull_u8(d31u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);  // view the accumulators as signed for the saturating add
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+    q11s16 = vreinterpretq_s16_u16(q11u16);
+    q12s16 = vreinterpretq_s16_u16(q12u16);
+
+    q8s16 = vqaddq_s16(q8s16, q3s16);  // saturating add of the tap-3 product to the other five terms
+    q9s16 = vqaddq_s16(q9s16, q4s16);
+    q10s16 = vqaddq_s16(q10s16, q5s16);
+    q11s16 = vqaddq_s16(q11s16, q6s16);
+    q12s16 = vqaddq_s16(q12s16, q7s16);
+
+    d26u8 = vqrshrun_n_s16(q8s16, 7);  // first-pass rows 8-12 stay in d26-d30 for the second pass
+    d27u8 = vqrshrun_n_s16(q9s16, 7);
+    d28u8 = vqrshrun_n_s16(q10s16, 7);
+    d29u8 = vqrshrun_n_s16(q11s16, 7);
+    d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+    // Second pass: 8x8 (vertical taps selected by yoffset, applied to the first-pass rows)
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));  // tap magnitudes; the sign is applied by using vmlal vs vmlsl below
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    tmpp = tmp;  // reload first-pass rows 0-7, two 8-byte rows per 16-byte load
+    q9u8 = vld1q_u8(tmpp);
+    tmpp += 16;
+    q10u8 = vld1q_u8(tmpp);
+    tmpp += 16;
+    q11u8 = vld1q_u8(tmpp);
+    tmpp += 16;
+    q12u8 = vld1q_u8(tmpp);
+
+    d18u8 = vget_low_u8(q9u8);  // d18-d25 = first-pass rows 0-7
+    d19u8 = vget_high_u8(q9u8);
+    d20u8 = vget_low_u8(q10u8);
+    d21u8 = vget_high_u8(q10u8);
+    d22u8 = vget_low_u8(q11u8);
+    d23u8 = vget_high_u8(q11u8);
+    d24u8 = vget_low_u8(q12u8);
+    d25u8 = vget_high_u8(q12u8);
+
+    for (i = 2; i > 0; i--) {  // two iterations, four output rows each
+        q3u16 = vmull_u8(d18u8, d0u8);  // tap 0 on row r-2 (r = output row)
+        q4u16 = vmull_u8(d19u8, d0u8);
+        q5u16 = vmull_u8(d20u8, d0u8);
+        q6u16 = vmull_u8(d21u8, d0u8);
+
+        q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);  // minus tap 1 on row r-1
+        q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+        q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+        q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+        q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);  // minus tap 4 on row r+2
+        q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+        q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+        q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+        q3u16 = vmlal_u8(q3u16, d20u8, d2u8);  // plus tap 2 on row r
+        q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+        q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+        q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+        q3u16 = vmlal_u8(q3u16, d23u8, d5u8);  // plus tap 5 on row r+3
+        q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+        q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+        q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+        q7u16 = vmull_u8(d21u8, d3u8);  // tap 3 on row r+1, kept in its own accumulator
+        q8u16 = vmull_u8(d22u8, d3u8);
+        q9u16 = vmull_u8(d23u8, d3u8);
+        q10u16 = vmull_u8(d24u8, d3u8);
+
+        q3s16 = vreinterpretq_s16_u16(q3u16);  // view the accumulators as signed for the saturating add
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q7s16 = vreinterpretq_s16_u16(q7u16);
+        q8s16 = vreinterpretq_s16_u16(q8u16);
+        q9s16 = vreinterpretq_s16_u16(q9u16);
+        q10s16 = vreinterpretq_s16_u16(q10u16);
+
+        q7s16 = vqaddq_s16(q7s16, q3s16);  // saturating add of the tap-3 product to the other five terms
+        q8s16 = vqaddq_s16(q8s16, q4s16);
+        q9s16 = vqaddq_s16(q9s16, q5s16);
+        q10s16 = vqaddq_s16(q10s16, q6s16);
+
+        d6u8 = vqrshrun_n_s16(q7s16, 7);  // rounding shift right by 7, saturated to unsigned 8-bit
+        d7u8 = vqrshrun_n_s16(q8s16, 7);
+        d8u8 = vqrshrun_n_s16(q9s16, 7);
+        d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+        d18u8 = d22u8;  // slide the row window down four rows for the next iteration
+        d19u8 = d23u8;
+        d20u8 = d24u8;
+        d21u8 = d25u8;
+        d22u8 = d26u8;
+        d23u8 = d27u8;
+        d24u8 = d28u8;
+        d25u8 = d29u8;
+        d26u8 = d30u8;
+
+        vst1_u8(dst_ptr, d6u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d7u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d8u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d9u8);
+        dst_ptr += dst_pitch;
+    }
+    return;
+}
+
+void vp8_sixtap_predict16x16_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    unsigned char *src, *src_tmp, *dst, *tmpp;
+    unsigned char tmp[336];
+    int i, j;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
+    uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
+    uint8x8_t d28u8, d29u8, d30u8, d31u8;
+    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+    uint8x16_t q3u8, q4u8;
+    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
+    uint16x8_t q11u16, q12u16, q13u16, q15u16;
+    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
+    int16x8_t q11s16, q12s16, q13s16, q15s16;
+
+    if (xoffset == 0) {  // secondpass_filter8x8_only
+        // load second_pass filter
+        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+        d0s8 = vdup_lane_s8(dtmps8, 0);
+        d1s8 = vdup_lane_s8(dtmps8, 1);
+        d2s8 = vdup_lane_s8(dtmps8, 2);
+        d3s8 = vdup_lane_s8(dtmps8, 3);
+        d4s8 = vdup_lane_s8(dtmps8, 4);
+        d5s8 = vdup_lane_s8(dtmps8, 5);
+        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+        // load src data
+        src_tmp = src_ptr - src_pixels_per_line * 2;
+        for (i = 0; i < 2; i++) {
+            src = src_tmp + i * 8;
+            dst = dst_ptr + i * 8;
+            d18u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            d19u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            d20u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            d21u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            d22u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            for (j = 0; j < 4; j++) {
+                d23u8 = vld1_u8(src);
+                src += src_pixels_per_line;
+                d24u8 = vld1_u8(src);
+                src += src_pixels_per_line;
+                d25u8 = vld1_u8(src);
+                src += src_pixels_per_line;
+                d26u8 = vld1_u8(src);
+                src += src_pixels_per_line;
+
+                q3u16 = vmull_u8(d18u8, d0u8);
+                q4u16 = vmull_u8(d19u8, d0u8);
+                q5u16 = vmull_u8(d20u8, d0u8);
+                q6u16 = vmull_u8(d21u8, d0u8);
+
+                q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+                q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+                q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+                q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+                q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+                q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+                q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+                q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+                q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+                q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+                q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+                q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+                q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+                q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+                q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+                q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+                q7u16 = vmull_u8(d21u8, d3u8);
+                q8u16 = vmull_u8(d22u8, d3u8);
+                q9u16 = vmull_u8(d23u8, d3u8);
+                q10u16 = vmull_u8(d24u8, d3u8);
+
+                q3s16 = vreinterpretq_s16_u16(q3u16);
+                q4s16 = vreinterpretq_s16_u16(q4u16);
+                q5s16 = vreinterpretq_s16_u16(q5u16);
+                q6s16 = vreinterpretq_s16_u16(q6u16);
+                q7s16 = vreinterpretq_s16_u16(q7u16);
+                q8s16 = vreinterpretq_s16_u16(q8u16);
+                q9s16 = vreinterpretq_s16_u16(q9u16);
+                q10s16 = vreinterpretq_s16_u16(q10u16);
+
+                q7s16 = vqaddq_s16(q7s16, q3s16);
+                q8s16 = vqaddq_s16(q8s16, q4s16);
+                q9s16 = vqaddq_s16(q9s16, q5s16);
+                q10s16 = vqaddq_s16(q10s16, q6s16);
+
+                d6u8 = vqrshrun_n_s16(q7s16, 7);
+                d7u8 = vqrshrun_n_s16(q8s16, 7);
+                d8u8 = vqrshrun_n_s16(q9s16, 7);
+                d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+                d18u8 = d22u8;
+                d19u8 = d23u8;
+                d20u8 = d24u8;
+                d21u8 = d25u8;
+                d22u8 = d26u8;
+
+                vst1_u8(dst, d6u8);
+                dst += dst_pitch;
+                vst1_u8(dst, d7u8);
+                dst += dst_pitch;
+                vst1_u8(dst, d8u8);
+                dst += dst_pitch;
+                vst1_u8(dst, d9u8);
+                dst += dst_pitch;
+            }
+        }
+        return;
+    }
+
+    // load first_pass filter
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    // First pass: horizontal 6-tap filter over 16 columns (16 rows when yoffset == 0, else 21 rows into tmp)
+    if (yoffset == 0) {  // firstpass_filter4x4_only
+        src = src_ptr - 2;
+        dst = dst_ptr;
+        for (i = 0; i < 8; i++) {
+            d6u8 = vld1_u8(src);
+            d7u8 = vld1_u8(src + 8);
+            d8u8 = vld1_u8(src + 16);
+            src += src_pixels_per_line;
+            d9u8 = vld1_u8(src);
+            d10u8 = vld1_u8(src + 8);
+            d11u8 = vld1_u8(src + 16);
+            src += src_pixels_per_line;
+
+            __builtin_prefetch(src);
+            __builtin_prefetch(src + src_pixels_per_line);
+
+            q6u16 = vmull_u8(d6u8, d0u8);
+            q7u16 = vmull_u8(d7u8, d0u8);
+            q8u16 = vmull_u8(d9u8, d0u8);
+            q9u16 = vmull_u8(d10u8, d0u8);
+
+            d20u8 = vext_u8(d6u8, d7u8, 1);
+            d21u8 = vext_u8(d9u8, d10u8, 1);
+            d22u8 = vext_u8(d7u8, d8u8, 1);
+            d23u8 = vext_u8(d10u8, d11u8, 1);
+            d24u8 = vext_u8(d6u8, d7u8, 4);
+            d25u8 = vext_u8(d9u8, d10u8, 4);
+            d26u8 = vext_u8(d7u8, d8u8, 4);
+            d27u8 = vext_u8(d10u8, d11u8, 4);
+            d28u8 = vext_u8(d6u8, d7u8, 5);
+            d29u8 = vext_u8(d9u8, d10u8, 5);
+
+            q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
+            q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
+            q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
+            q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
+            q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
+            q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
+            q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
+            q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
+            q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
+            q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+
+            d20u8 = vext_u8(d7u8, d8u8, 5);
+            d21u8 = vext_u8(d10u8, d11u8, 5);
+            d22u8 = vext_u8(d6u8, d7u8, 2);
+            d23u8 = vext_u8(d9u8, d10u8, 2);
+            d24u8 = vext_u8(d7u8, d8u8, 2);
+            d25u8 = vext_u8(d10u8, d11u8, 2);
+            d26u8 = vext_u8(d6u8, d7u8, 3);
+            d27u8 = vext_u8(d9u8, d10u8, 3);
+            d28u8 = vext_u8(d7u8, d8u8, 3);
+            d29u8 = vext_u8(d10u8, d11u8, 3);
+
+            q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
+            q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
+            q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
+            q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
+            q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
+            q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
+
+            q10u16 = vmull_u8(d26u8, d3u8);
+            q11u16 = vmull_u8(d27u8, d3u8);
+            q12u16 = vmull_u8(d28u8, d3u8);
+            q15u16 = vmull_u8(d29u8, d3u8);
+
+            q6s16 = vreinterpretq_s16_u16(q6u16);
+            q7s16 = vreinterpretq_s16_u16(q7u16);
+            q8s16 = vreinterpretq_s16_u16(q8u16);
+            q9s16 = vreinterpretq_s16_u16(q9u16);
+            q10s16 = vreinterpretq_s16_u16(q10u16);
+            q11s16 = vreinterpretq_s16_u16(q11u16);
+            q12s16 = vreinterpretq_s16_u16(q12u16);
+            q15s16 = vreinterpretq_s16_u16(q15u16);
+
+            q6s16 = vqaddq_s16(q6s16, q10s16);
+            q8s16 = vqaddq_s16(q8s16, q11s16);
+            q7s16 = vqaddq_s16(q7s16, q12s16);
+            q9s16 = vqaddq_s16(q9s16, q15s16);
+
+            d6u8 = vqrshrun_n_s16(q6s16, 7);
+            d7u8 = vqrshrun_n_s16(q7s16, 7);
+            d8u8 = vqrshrun_n_s16(q8s16, 7);
+            d9u8 = vqrshrun_n_s16(q9s16, 7);
+
+            q3u8 = vcombine_u8(d6u8, d7u8);
+            q4u8 = vcombine_u8(d8u8, d9u8);
+            vst1q_u8(dst, q3u8);
+            dst += dst_pitch;
+            vst1q_u8(dst, q4u8);
+            dst += dst_pitch;
+        }
+        return;
+    }
+
+    src = src_ptr - 2 - src_pixels_per_line * 2;
+    tmpp = tmp;
+    for (i = 0; i < 7; i++) {
+        d6u8 = vld1_u8(src);
+        d7u8 = vld1_u8(src + 8);
+        d8u8 = vld1_u8(src + 16);
+        src += src_pixels_per_line;
+        d9u8 = vld1_u8(src);
+        d10u8 = vld1_u8(src + 8);
+        d11u8 = vld1_u8(src + 16);
+        src += src_pixels_per_line;
+        d12u8 = vld1_u8(src);
+        d13u8 = vld1_u8(src + 8);
+        d14u8 = vld1_u8(src + 16);
+        src += src_pixels_per_line;
+
+        __builtin_prefetch(src);
+        __builtin_prefetch(src + src_pixels_per_line);
+        __builtin_prefetch(src + src_pixels_per_line * 2);
+
+        q8u16 = vmull_u8(d6u8, d0u8);
+        q9u16 = vmull_u8(d7u8, d0u8);
+        q10u16 = vmull_u8(d9u8, d0u8);
+        q11u16 = vmull_u8(d10u8, d0u8);
+        q12u16 = vmull_u8(d12u8, d0u8);
+        q13u16 = vmull_u8(d13u8, d0u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 1);
+        d29u8 = vext_u8(d9u8, d10u8, 1);
+        d30u8 = vext_u8(d12u8, d13u8, 1);
+        q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
+        q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+        q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
+        d28u8 = vext_u8(d7u8, d8u8, 1);
+        d29u8 = vext_u8(d10u8, d11u8, 1);
+        d30u8 = vext_u8(d13u8, d14u8, 1);
+        q9u16  = vmlsl_u8(q9u16, d28u8, d1u8);
+        q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
+        q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 4);
+        d29u8 = vext_u8(d9u8, d10u8, 4);
+        d30u8 = vext_u8(d12u8, d13u8, 4);
+        q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
+        q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+        q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
+        d28u8 = vext_u8(d7u8, d8u8, 4);
+        d29u8 = vext_u8(d10u8, d11u8, 4);
+        d30u8 = vext_u8(d13u8, d14u8, 4);
+        q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+        q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
+        q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 5);
+        d29u8 = vext_u8(d9u8, d10u8, 5);
+        d30u8 = vext_u8(d12u8, d13u8, 5);
+        q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
+        q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+        q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
+        d28u8 = vext_u8(d7u8, d8u8, 5);
+        d29u8 = vext_u8(d10u8, d11u8, 5);
+        d30u8 = vext_u8(d13u8, d14u8, 5);
+        q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+        q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
+        q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 2);
+        d29u8 = vext_u8(d9u8, d10u8, 2);
+        d30u8 = vext_u8(d12u8, d13u8, 2);
+        q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
+        q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+        q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
+        d28u8 = vext_u8(d7u8, d8u8, 2);
+        d29u8 = vext_u8(d10u8, d11u8, 2);
+        d30u8 = vext_u8(d13u8, d14u8, 2);
+        q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+        q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
+        q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 3);
+        d29u8 = vext_u8(d9u8, d10u8, 3);
+        d30u8 = vext_u8(d12u8, d13u8, 3);
+        d15u8 = vext_u8(d7u8, d8u8, 3);
+        d31u8 = vext_u8(d10u8, d11u8, 3);
+        d6u8  = vext_u8(d13u8, d14u8, 3);
+        q4u16 = vmull_u8(d28u8, d3u8);
+        q5u16 = vmull_u8(d29u8, d3u8);
+        q6u16 = vmull_u8(d30u8, d3u8);
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q8s16 = vreinterpretq_s16_u16(q8u16);
+        q10s16 = vreinterpretq_s16_u16(q10u16);
+        q12s16 = vreinterpretq_s16_u16(q12u16);
+        q8s16 = vqaddq_s16(q8s16, q4s16);
+        q10s16 = vqaddq_s16(q10s16, q5s16);
+        q12s16 = vqaddq_s16(q12s16, q6s16);
+
+        q6u16 = vmull_u8(d15u8, d3u8);
+        q7u16 = vmull_u8(d31u8, d3u8);
+        q3u16 = vmull_u8(d6u8, d3u8);
+        q3s16 = vreinterpretq_s16_u16(q3u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q7s16 = vreinterpretq_s16_u16(q7u16);
+        q9s16 = vreinterpretq_s16_u16(q9u16);
+        q11s16 = vreinterpretq_s16_u16(q11u16);
+        q13s16 = vreinterpretq_s16_u16(q13u16);
+        q9s16 = vqaddq_s16(q9s16, q6s16);
+        q11s16 = vqaddq_s16(q11s16, q7s16);
+        q13s16 = vqaddq_s16(q13s16, q3s16);
+
+        d6u8 = vqrshrun_n_s16(q8s16, 7);
+        d7u8 = vqrshrun_n_s16(q9s16, 7);
+        d8u8 = vqrshrun_n_s16(q10s16, 7);
+        d9u8 = vqrshrun_n_s16(q11s16, 7);
+        d10u8 = vqrshrun_n_s16(q12s16, 7);
+        d11u8 = vqrshrun_n_s16(q13s16, 7);
+
+        vst1_u8(tmpp, d6u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d7u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d8u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d9u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d10u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d11u8);
+        tmpp += 8;
+    }
+
+    // Second pass: 16x16
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    for (i = 0; i < 2; i++) {
+        dst = dst_ptr + 8 * i;
+        tmpp = tmp + 8 * i;
+        d18u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        d19u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        d20u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        d21u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        d22u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        for (j = 0; j < 4; j++) {
+            d23u8 = vld1_u8(tmpp);
+            tmpp += 16;
+            d24u8 = vld1_u8(tmpp);
+            tmpp += 16;
+            d25u8 = vld1_u8(tmpp);
+            tmpp += 16;
+            d26u8 = vld1_u8(tmpp);
+            tmpp += 16;
+
+            q3u16 = vmull_u8(d18u8, d0u8);
+            q4u16 = vmull_u8(d19u8, d0u8);
+            q5u16 = vmull_u8(d20u8, d0u8);
+            q6u16 = vmull_u8(d21u8, d0u8);
+
+            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+            q7u16 = vmull_u8(d21u8, d3u8);
+            q8u16 = vmull_u8(d22u8, d3u8);
+            q9u16 = vmull_u8(d23u8, d3u8);
+            q10u16 = vmull_u8(d24u8, d3u8);
+
+            q3s16 = vreinterpretq_s16_u16(q3u16);
+            q4s16 = vreinterpretq_s16_u16(q4u16);
+            q5s16 = vreinterpretq_s16_u16(q5u16);
+            q6s16 = vreinterpretq_s16_u16(q6u16);
+            q7s16 = vreinterpretq_s16_u16(q7u16);
+            q8s16 = vreinterpretq_s16_u16(q8u16);
+            q9s16 = vreinterpretq_s16_u16(q9u16);
+            q10s16 = vreinterpretq_s16_u16(q10u16);
+
+            q7s16 = vqaddq_s16(q7s16, q3s16);
+            q8s16 = vqaddq_s16(q8s16, q4s16);
+            q9s16 = vqaddq_s16(q9s16, q5s16);
+            q10s16 = vqaddq_s16(q10s16, q6s16);
+
+            d6u8 = vqrshrun_n_s16(q7s16, 7);
+            d7u8 = vqrshrun_n_s16(q8s16, 7);
+            d8u8 = vqrshrun_n_s16(q9s16, 7);
+            d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+            d18u8 = d22u8;
+            d19u8 = d23u8;
+            d20u8 = d24u8;
+            d21u8 = d25u8;
+            d22u8 = d26u8;
+
+            vst1_u8(dst, d6u8);
+            dst += dst_pitch;
+            vst1_u8(dst, d7u8);
+            dst += dst_pitch;
+            vst1_u8(dst, d8u8);
+            dst += dst_pitch;
+            vst1_u8(dst, d9u8);
+            dst += dst_pitch;
+        }
+    }
+    return;
+}
diff --git a/libs/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c b/libs/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c
new file mode 100644
index 0000000000..9d6807af71
--- /dev/null
+++ b/libs/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c
@@ -0,0 +1,550 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
+
+static INLINE void vp8_loop_filter_neon(  // edge filter core: 16 lanes at once, rewrites p1/p0/q0/q1 only
+        uint8x16_t qblimit,  // flimit: upper bound for 2*|p0-q0| + |p1-q1|/2
+        uint8x16_t qlimit,   // limit: upper bound for each neighbouring-sample difference
+        uint8x16_t qthresh,  // thresh: |p1-p0| or |q1-q0| above this flags the lane (hev)
+        uint8x16_t q3,       // p3
+        uint8x16_t q4,       // p2
+        uint8x16_t q5,       // p1
+        uint8x16_t q6,       // p0
+        uint8x16_t q7,       // q0
+        uint8x16_t q8,       // q1
+        uint8x16_t q9,       // q2
+        uint8x16_t q10,      // q3
+        uint8x16_t *q5r,     // out: filtered p1
+        uint8x16_t *q6r,     // out: filtered p0
+        uint8x16_t *q7r,     // out: filtered q0
+        uint8x16_t *q8r) {   // out: filtered q1
+    uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int16x8_t q2s16, q11s16;
+    uint16x8_t q4u16;
+    int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
+    int8x8_t d2s8, d3s8;
+
+    q11u8 = vabdq_u8(q3, q4);  // |p3 - p2|
+    q12u8 = vabdq_u8(q4, q5);  // |p2 - p1|
+    q13u8 = vabdq_u8(q5, q6);  // |p1 - p0|
+    q14u8 = vabdq_u8(q8, q7);  // |q1 - q0|
+    q3    = vabdq_u8(q9, q8);  // |q2 - q1| (the q3 parameter is reused as scratch from here)
+    q4    = vabdq_u8(q10, q9);  // |q3 - q2| (the q4 parameter is reused as scratch from here)
+
+    q11u8 = vmaxq_u8(q11u8, q12u8);  // max(|p3-p2|, |p2-p1|)
+    q12u8 = vmaxq_u8(q13u8, q14u8);  // max(|p1-p0|, |q1-q0|)
+    q3    = vmaxq_u8(q3, q4);  // max(|q2-q1|, |q3-q2|)
+    q15u8 = vmaxq_u8(q11u8, q12u8);  // running maximum of the first four differences
+
+    q9 = vabdq_u8(q6, q7);  // |p0 - q0| (q9 reused as scratch)
+
+    // vp8_hevmask: lanes where |p1 - p0| or |q1 - q0| exceeds thresh
+    q13u8 = vcgtq_u8(q13u8, qthresh);  // |p1 - p0| > thresh
+    q14u8 = vcgtq_u8(q14u8, qthresh);  // |q1 - q0| > thresh
+    q15u8 = vmaxq_u8(q15u8, q3);  // maximum of all six neighbouring differences
+
+    q2u8 = vabdq_u8(q5, q8);  // |p1 - q1|
+    q9 = vqaddq_u8(q9, q9);  // 2 * |p0 - q0|, saturating
+
+    q15u8 = vcgeq_u8(qlimit, q15u8);  // lanes where every difference is <= limit
+
+    // vp8_filter() function
+    // convert to signed: XOR with 0x80 maps 0..255 onto -128..127
+    q10 = vdupq_n_u8(0x80);
+    q8 = veorq_u8(q8, q10);  // qs1
+    q7 = veorq_u8(q7, q10);  // qs0
+    q6 = veorq_u8(q6, q10);  // ps0
+    q5 = veorq_u8(q5, q10);  // ps1
+
+    q2u8 = vshrq_n_u8(q2u8, 1);  // |p1 - q1| / 2
+    q9 = vqaddq_u8(q9, q2u8);  // 2*|p0-q0| + |p1-q1|/2, saturating
+
+    q10 = vdupq_n_u8(3);  // constant 3 in every lane (q10 reused)
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6)));  // qs0 - ps0 widened to 16 bits, lanes 0-7
+    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                      vget_high_s8(vreinterpretq_s8_u8(q6)));  // qs0 - ps0 widened to 16 bits, lanes 8-15
+
+    q9 = vcgeq_u8(qblimit, q9);  // lanes where 2*|p0-q0| + |p1-q1|/2 <= blimit
+
+    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+                    vreinterpretq_s8_u8(q8));  // saturating ps1 - qs1
+
+    q14u8 = vorrq_u8(q13u8, q14u8);  // hev mask: either inner difference exceeded thresh
+
+    q4u16 = vmovl_u8(vget_low_u8(q10));  // the constant 3 widened to 16 bits
+    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));  // 3 * (qs0 - ps0), lanes 0-7
+    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));  // 3 * (qs0 - ps0), lanes 8-15
+
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);  // keep (ps1 - qs1) only in hev lanes
+    q15u8 = vandq_u8(q15u8, q9);  // final filter mask: limit test AND blimit test
+
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));  // + masked (ps1 - qs1), lanes 0-7
+    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));  // + masked (ps1 - qs1), lanes 8-15
+
+    q9 = vdupq_n_u8(4);  // constant 4 in every lane (q9 reused)
+    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    d2s8 = vqmovn_s16(q2s16);  // the saturating narrow to 8 bits performs the clamp
+    d3s8 = vqmovn_s16(q11s16);
+    q1s8 = vcombine_s8(d2s8, d3s8);
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);  // zero the filter value in lanes that failed the mask
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+
+    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));  // filter + 3, saturating
+    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));  // filter + 4, saturating
+    q2s8 = vshrq_n_s8(q2s8, 3);  // (filter + 3) >> 3, arithmetic shift
+    q1s8 = vshrq_n_s8(q1s8, 3);  // (filter + 4) >> 3, arithmetic shift
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);  // new ps0 = ps0 + ((filter + 3) >> 3)
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);  // new qs0 = qs0 - ((filter + 4) >> 3)
+
+    q1s8 = vrshrq_n_s8(q1s8, 1);  // rounding halve of (filter + 4) >> 3 for the outer taps
+    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));  // cleared in hev lanes, so p1/q1 stay unchanged there
+
+    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);  // new ps1
+    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);  // new qs1
+
+    q0u8 = vdupq_n_u8(0x80);  // XOR with 0x80 again converts back to unsigned
+    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);  // q1
+    *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);  // q0
+    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);  // p0
+    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);  // p1
+    return;
+}
+
+void vp8_loop_filter_horizontal_edge_y_neon(  // filter 16 pixels along one horizontal edge, in place
+        unsigned char *src,  // the q0 row: fifth of the eight rows loaded below
+        int pitch,  // byte step between consecutive rows
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+
+    qblimit = vdupq_n_u8(blimit);  // broadcast each scalar threshold to all 16 lanes
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+    src -= (pitch << 2);  // back up four rows to the p3 row
+
+    q3 = vld1q_u8(src);  // p3
+    src += pitch;
+    q4 = vld1q_u8(src);  // p2
+    src += pitch;
+    q5 = vld1q_u8(src);  // p1
+    src += pitch;
+    q6 = vld1q_u8(src);  // p0
+    src += pitch;
+    q7 = vld1q_u8(src);  // q0
+    src += pitch;
+    q8 = vld1q_u8(src);  // q1
+    src += pitch;
+    q9 = vld1q_u8(src);  // q2
+    src += pitch;
+    q10 = vld1q_u8(src);  // q3
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);  // results overwrite the local p1/p0/q0/q1 registers
+
+    src -= (pitch * 5);  // src sits on the q3 row; five rows back is the p1 row
+    vst1q_u8(src, q5);  // p1
+    src += pitch;
+    vst1q_u8(src, q6);  // p0
+    src += pitch;
+    vst1q_u8(src, q7);  // q0
+    src += pitch;
+    vst1q_u8(src, q8);  // q1
+    return;
+}
+
+void vp8_loop_filter_horizontal_edge_uv_neon(  // filter one horizontal edge in two 8-pixel-wide planes at once
+        unsigned char *u,  // q0 row of the first plane (fifth of the eight rows loaded)
+        int pitch,  // byte step between rows, used for both planes
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {  // q0 row of the second plane
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+    qblimit = vdupq_n_u8(blimit);  // broadcast each scalar threshold to all 16 lanes
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    u -= (pitch << 2);  // back up four rows to the p3 row in each plane
+    v -= (pitch << 2);
+
+    d6  = vld1_u8(u);  // p3, u
+    u += pitch;
+    d7  = vld1_u8(v);  // p3, v
+    v += pitch;
+    d8  = vld1_u8(u);  // p2, u
+    u += pitch;
+    d9  = vld1_u8(v);  // p2, v
+    v += pitch;
+    d10 = vld1_u8(u);  // p1, u
+    u += pitch;
+    d11 = vld1_u8(v);  // p1, v
+    v += pitch;
+    d12 = vld1_u8(u);  // p0, u
+    u += pitch;
+    d13 = vld1_u8(v);  // p0, v
+    v += pitch;
+    d14 = vld1_u8(u);  // q0, u
+    u += pitch;
+    d15 = vld1_u8(v);  // q0, v
+    v += pitch;
+    d16 = vld1_u8(u);  // q1, u
+    u += pitch;
+    d17 = vld1_u8(v);  // q1, v
+    v += pitch;
+    d18 = vld1_u8(u);  // q2, u
+    u += pitch;
+    d19 = vld1_u8(v);  // q2, v
+    v += pitch;
+    d20 = vld1_u8(u);  // q3, u
+    d21 = vld1_u8(v);  // q3, v
+
+    q3 = vcombine_u8(d6, d7);  // each 128-bit row: u in the low half, v in the high half
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);  // results overwrite the local p1/p0/q0/q1 registers
+
+    u -= (pitch * 5);  // u sits on the q3 row; five rows back is the p1 row
+    vst1_u8(u, vget_low_u8(q5));  // p1, u
+    u += pitch;
+    vst1_u8(u, vget_low_u8(q6));  // p0, u
+    u += pitch;
+    vst1_u8(u, vget_low_u8(q7));  // q0, u
+    u += pitch;
+    vst1_u8(u, vget_low_u8(q8));  // q1, u
+
+    v -= (pitch * 5);  // same rewind for the second plane
+    vst1_u8(v, vget_high_u8(q5));  // p1, v
+    v += pitch;
+    vst1_u8(v, vget_high_u8(q6));  // p0, v
+    v += pitch;
+    vst1_u8(v, vget_high_u8(q7));  // q0, v
+    v += pitch;
+    vst1_u8(v, vget_high_u8(q8));  // q1, v
+    return;
+}
+
+static INLINE void write_4x8(unsigned char *dst, int pitch,  // store 8 rows of 4 bytes; row i = lane i of val[0..3]
+                             const uint8x8x4_t result) {
+#ifdef VPX_INCOMPATIBLE_GCC
+    /* NOTE(review): presumably the fallback for compilers that mishandle vst4_lane_u8 -- confirm in vpx_ports/arm.h.
+     * uint8x8x4_t result (digits are val index, then lane index)
+    00 01 02 03 | 04 05 06 07
+    10 11 12 13 | 14 15 16 17
+    20 21 22 23 | 24 25 26 27
+    30 31 32 33 | 34 35 36 37
+    ---
+    * after vtrn_u16
+    00 01 20 21 | 04 05 24 25
+    02 03 22 23 | 06 07 26 27
+    10 11 30 31 | 14 15 34 35
+    12 13 32 33 | 16 17 36 37
+    ---
+    * after vtrn_u8 (each 32-bit half is now one complete output row)
+    00 10 20 30 | 04 14 24 34
+    01 11 21 31 | 05 15 25 35
+    02 12 22 32 | 06 16 26 36
+    03 13 23 33 | 07 17 27 37
+    */
+    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
+                                          vreinterpret_u16_u8(result.val[2]));
+    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
+                                          vreinterpret_u16_u8(result.val[3]));
+    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+                                       vreinterpret_u8_u16(r13_u16.val[0]));
+    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+                                       vreinterpret_u8_u16(r13_u16.val[1]));
+    const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);  // rows 0 and 4
+    const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);  // rows 1 and 5
+    const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);  // rows 2 and 6
+    const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);  // rows 3 and 7
+    vst1_lane_u32((uint32_t *)dst, x_0_4, 0);  // row 0
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_1_5, 0);  // row 1
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_2_6, 0);  // row 2
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_3_7, 0);  // row 3
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_0_4, 1);  // row 4
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_1_5, 1);  // row 5
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_2_6, 1);  // row 6
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_3_7, 1);  // row 7
+#else
+    vst4_lane_u8(dst, result, 0);  // row 0: lane 0 of val[0], val[1], val[2], val[3]
+    dst += pitch;
+    vst4_lane_u8(dst, result, 1);  // row 1
+    dst += pitch;
+    vst4_lane_u8(dst, result, 2);  // row 2
+    dst += pitch;
+    vst4_lane_u8(dst, result, 3);  // row 3
+    dst += pitch;
+    vst4_lane_u8(dst, result, 4);  // row 4
+    dst += pitch;
+    vst4_lane_u8(dst, result, 5);  // row 5
+    dst += pitch;
+    vst4_lane_u8(dst, result, 6);  // row 6
+    dst += pitch;
+    vst4_lane_u8(dst, result, 7);  // row 7
+#endif  // VPX_INCOMPATIBLE_GCC
+}
+
+void vp8_loop_filter_vertical_edge_y_neon(  // filter one vertical edge over 16 rows, in place
+        unsigned char *src,  // q0 pixel of the first row; src[-4..3] holds p3..q3
+        int pitch,  // byte step between consecutive rows
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    unsigned char *s, *d;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+    uint8x8x4_t q4ResultH, q4ResultL;
+
+    qblimit = vdupq_n_u8(blimit);  // broadcast each scalar threshold to all 16 lanes
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    s = src - 4;  // start four pixels before src, at the p3 column
+    d6  = vld1_u8(s);  // row 0 (8 pixels straddling the edge)
+    s += pitch;
+    d8  = vld1_u8(s);  // row 1
+    s += pitch;
+    d10 = vld1_u8(s);  // row 2
+    s += pitch;
+    d12 = vld1_u8(s);  // row 3
+    s += pitch;
+    d14 = vld1_u8(s);  // row 4
+    s += pitch;
+    d16 = vld1_u8(s);  // row 5
+    s += pitch;
+    d18 = vld1_u8(s);  // row 6
+    s += pitch;
+    d20 = vld1_u8(s);  // row 7
+    s += pitch;
+    d7  = vld1_u8(s);  // row 8
+    s += pitch;
+    d9  = vld1_u8(s);  // row 9
+    s += pitch;
+    d11 = vld1_u8(s);  // row 10
+    s += pitch;
+    d13 = vld1_u8(s);  // row 11
+    s += pitch;
+    d15 = vld1_u8(s);  // row 12
+    s += pitch;
+    d17 = vld1_u8(s);  // row 13
+    s += pitch;
+    d19 = vld1_u8(s);  // row 14
+    s += pitch;
+    d21 = vld1_u8(s);  // row 15
+
+    q3 = vcombine_u8(d6, d7);  // rows 0 | 8
+    q4 = vcombine_u8(d8, d9);  // rows 1 | 9
+    q5 = vcombine_u8(d10, d11);  // rows 2 | 10
+    q6 = vcombine_u8(d12, d13);  // rows 3 | 11
+    q7 = vcombine_u8(d14, d15);  // rows 4 | 12
+    q8 = vcombine_u8(d16, d17);  // rows 5 | 13
+    q9 = vcombine_u8(d18, d19);  // rows 6 | 14
+    q10 = vcombine_u8(d20, d21);  // rows 7 | 15
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));  // transpose step 1: 32-bit elements
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));  // transpose step 2: 16-bit elements
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));  // transpose step 3: bytes
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];  // column 0 of all 16 rows: p3
+    q4 = q2tmp8.val[1];  // column 1: p2
+    q5 = q2tmp9.val[0];  // column 2: p1
+    q6 = q2tmp9.val[1];  // column 3: p0
+    q7 = q2tmp10.val[0];  // column 4: q0
+    q8 = q2tmp10.val[1];  // column 5: q1
+    q9 = q2tmp11.val[0];  // column 6: q2
+    q10 = q2tmp11.val[1];  // column 7: q3
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);  // results overwrite the local p1/p0/q0/q1 registers
+
+    q4ResultL.val[0] = vget_low_u8(q5);   // d10
+    q4ResultL.val[1] = vget_low_u8(q6);   // d12
+    q4ResultL.val[2] = vget_low_u8(q7);   // d14
+    q4ResultL.val[3] = vget_low_u8(q8);   // d16
+    q4ResultH.val[0] = vget_high_u8(q5);  // d11
+    q4ResultH.val[1] = vget_high_u8(q6);  // d13
+    q4ResultH.val[2] = vget_high_u8(q7);  // d15
+    q4ResultH.val[3] = vget_high_u8(q8);  // d17
+
+    d = src - 2;  // only the p1..q1 columns (src[-2..1]) are written back
+    write_4x8(d, pitch, q4ResultL);  // rows 0-7
+    d += pitch * 8;
+    write_4x8(d, pitch, q4ResultH);  // rows 8-15
+}
+
+void vp8_loop_filter_vertical_edge_uv_neon(  // filter one vertical edge over 8 rows in each of two planes
+        unsigned char *u,  // q0 pixel of the first row in the first plane; u[-4..3] holds p3..q3
+        int pitch,  // byte step between rows, used for both planes
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {  // q0 pixel of the first row in the second plane
+    unsigned char *us, *ud;
+    unsigned char *vs, *vd;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+    uint8x8x4_t q4ResultH, q4ResultL;
+
+    qblimit = vdupq_n_u8(blimit);  // broadcast each scalar threshold to all 16 lanes
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    us = u - 4;  // start four pixels before u, at the p3 column
+    d6 = vld1_u8(us);  // u row 0
+    us += pitch;
+    d8 = vld1_u8(us);  // u row 1
+    us += pitch;
+    d10 = vld1_u8(us);  // u row 2
+    us += pitch;
+    d12 = vld1_u8(us);  // u row 3
+    us += pitch;
+    d14 = vld1_u8(us);  // u row 4
+    us += pitch;
+    d16 = vld1_u8(us);  // u row 5
+    us += pitch;
+    d18 = vld1_u8(us);  // u row 6
+    us += pitch;
+    d20 = vld1_u8(us);  // u row 7
+
+    vs = v - 4;  // same starting column in the second plane
+    d7 = vld1_u8(vs);  // v row 0
+    vs += pitch;
+    d9 = vld1_u8(vs);  // v row 1
+    vs += pitch;
+    d11 = vld1_u8(vs);  // v row 2
+    vs += pitch;
+    d13 = vld1_u8(vs);  // v row 3
+    vs += pitch;
+    d15 = vld1_u8(vs);  // v row 4
+    vs += pitch;
+    d17 = vld1_u8(vs);  // v row 5
+    vs += pitch;
+    d19 = vld1_u8(vs);  // v row 6
+    vs += pitch;
+    d21 = vld1_u8(vs);  // v row 7
+
+    q3 = vcombine_u8(d6, d7);  // row 0: u in the low half, v in the high half
+    q4 = vcombine_u8(d8, d9);  // row 1
+    q5 = vcombine_u8(d10, d11);  // row 2
+    q6 = vcombine_u8(d12, d13);  // row 3
+    q7 = vcombine_u8(d14, d15);  // row 4
+    q8 = vcombine_u8(d16, d17);  // row 5
+    q9 = vcombine_u8(d18, d19);  // row 6
+    q10 = vcombine_u8(d20, d21);  // row 7
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));  // transpose step 1: 32-bit elements
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));  // transpose step 2: 16-bit elements
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));  // transpose step 3: bytes
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];  // column 0 (p3): 8 u rows, then 8 v rows
+    q4 = q2tmp8.val[1];  // column 1: p2
+    q5 = q2tmp9.val[0];  // column 2: p1
+    q6 = q2tmp9.val[1];  // column 3: p0
+    q7 = q2tmp10.val[0];  // column 4: q0
+    q8 = q2tmp10.val[1];  // column 5: q1
+    q9 = q2tmp11.val[0];  // column 6: q2
+    q10 = q2tmp11.val[1];  // column 7: q3
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);  // results overwrite the local p1/p0/q0/q1 registers
+
+    q4ResultL.val[0] = vget_low_u8(q5);   // d10
+    q4ResultL.val[1] = vget_low_u8(q6);   // d12
+    q4ResultL.val[2] = vget_low_u8(q7);   // d14
+    q4ResultL.val[3] = vget_low_u8(q8);   // d16
+    ud = u - 2;  // only the p1..q1 columns (u[-2..1]) are written back
+    write_4x8(ud, pitch, q4ResultL);  // low halves hold the u rows
+
+    q4ResultH.val[0] = vget_high_u8(q5);  // d11
+    q4ResultH.val[1] = vget_high_u8(q6);  // d13
+    q4ResultH.val[2] = vget_high_u8(q7);  // d15
+    q4ResultH.val[3] = vget_high_u8(q8);  // d17
+    vd = v - 2;  // only the p1..q1 columns (v[-2..1]) are written back
+    write_4x8(vd, pitch, q4ResultH);  // high halves hold the v rows
+}
diff --git a/libs/libvpx/vp8/common/blockd.c b/libs/libvpx/vp8/common/blockd.c
new file mode 100644
index 0000000000..1fc3cd0ca7
--- /dev/null
+++ b/libs/libvpx/vp8/common/blockd.c
@@ -0,0 +1,22 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* Block number -> slot 0..8: Y (0-15) by index/4, then U 4-5, V 6-7, Y2 8. */
+const unsigned char vp8_block2left[25] = {
+    0, 0, 0, 0,  1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  4, 4, 5, 5,  6, 6, 7, 7,  8
+};
+/* Block number -> slot 0..8: Y (0-15) by index%4, then U 4-5, V 6-7, Y2 8. */
+const unsigned char vp8_block2above[25] = {
+    0, 1, 2, 3,  0, 1, 2, 3,  0, 1, 2, 3,  0, 1, 2, 3,  4, 5, 4, 5,  6, 7, 6, 7,  8
+};
diff --git a/libs/libvpx/vp8/common/blockd.h b/libs/libvpx/vp8/common/blockd.h
new file mode 100644
index 0000000000..192108a06d
--- /dev/null
+++ b/libs/libvpx/vp8/common/blockd.h
@@ -0,0 +1,312 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_BLOCKD_H_
+#define VP8_COMMON_BLOCKD_H_
+
+void vpx_log(const char *format, ...);
+
+#include "vpx_config.h"
+#include "vpx_scale/yv12config.h"
+#include "mv.h"
+#include "treecoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*#define DCPRED 1*/
+#define DCPREDSIMTHRESH 0
+#define DCPREDCNTTHRESH 3
+
+#define MB_FEATURE_TREE_PROBS   3
+#define MAX_MB_SEGMENTS         4
+
+#define MAX_REF_LF_DELTAS       4
+#define MAX_MODE_LF_DELTAS      4
+
+/* Segment feature data modes (delta vs. absolute values; these are not bit masks) */
+#define SEGMENT_DELTADATA   0
+#define SEGMENT_ABSDATA     1
+
+typedef struct
+{
+    int r, c;
+} POS;
+
+#define PLANE_TYPE_Y_NO_DC    0
+#define PLANE_TYPE_Y2         1
+#define PLANE_TYPE_UV         2
+#define PLANE_TYPE_Y_WITH_DC  3
+
+
+typedef char ENTROPY_CONTEXT;  /* a single context value, stored in one char */
+typedef struct
+{
+    ENTROPY_CONTEXT y1[4];  /* entries 0-3 */
+    ENTROPY_CONTEXT u[2];   /* entries 4-5 */
+    ENTROPY_CONTEXT v[2];   /* entries 6-7 */
+    ENTROPY_CONTEXT y2;     /* entry 8 */
+} ENTROPY_CONTEXT_PLANES;  /* 9 entries; NOTE(review): presumably addressed 0..8 via vp8_block2left/vp8_block2above -- confirm */
+
+extern const unsigned char vp8_block2left[25];
+extern const unsigned char vp8_block2above[25];
+
+#define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \
+    Dest = (A)+(B);  /* stores A + B in Dest; the expansion already ends with ';', so avoid it in an unbraced if/else */
+
+
+typedef enum
+{
+    KEY_FRAME = 0,
+    INTER_FRAME = 1
+} FRAME_TYPE;
+
+typedef enum
+{
+    DC_PRED,            /* average of above and left pixels */
+    V_PRED,             /* vertical prediction */
+    H_PRED,             /* horizontal prediction */
+    TM_PRED,            /* Truemotion prediction; last mode counted by VP8_UV_MODES */
+    B_PRED,             /* block based prediction, each block has its own prediction mode; last mode counted by VP8_YMODES */
+
+    NEARESTMV,          /* first of the VP8_MVREFS modes (NEARESTMV..SPLITMV) */
+    NEARMV,
+    ZEROMV,
+    NEWMV,
+    SPLITMV,            /* last of the VP8_MVREFS modes */
+
+    MB_MODE_COUNT       /* number of modes listed above */
+} MB_PREDICTION_MODE;
+
+/* Macroblock level features */
+typedef enum
+{
+    MB_LVL_ALT_Q = 0,               /* Use alternate Quantizer .... */
+    MB_LVL_ALT_LF = 1,              /* Use alternate loop filter value... */
+    MB_LVL_MAX = 2                  /* Number of MB level features supported */
+
+} MB_LVL_FEATURES;
+
+/* Segment Feature Masks */
+#define SEGMENT_ALTQ    0x01
+#define SEGMENT_ALT_LF  0x02
+
+#define VP8_YMODES  (B_PRED + 1)
+#define VP8_UV_MODES (TM_PRED + 1)
+
+#define VP8_MVREFS (1 + SPLITMV - NEARESTMV)
+
+typedef enum
+{
+    B_DC_PRED,          /* average of above and left pixels */
+    B_TM_PRED,          /* NOTE(review): presumably Truemotion, cf. TM_PRED -- confirm */
+
+    B_VE_PRED,           /* vertical prediction */
+    B_HE_PRED,           /* horizontal prediction */
+
+    B_LD_PRED,
+    B_RD_PRED,
+
+    B_VR_PRED,
+    B_VL_PRED,
+    B_HD_PRED,
+    B_HU_PRED,           /* last of the VP8_BINTRAMODES (B_DC_PRED..B_HU_PRED) */
+
+    LEFT4X4,             /* first of the VP8_SUBMVREFS modes (LEFT4X4..NEW4X4) */
+    ABOVE4X4,
+    ZERO4X4,
+    NEW4X4,              /* last of the VP8_SUBMVREFS modes */
+
+    B_MODE_COUNT         /* number of modes listed above */
+} B_PREDICTION_MODE;
+
+#define VP8_BINTRAMODES (B_HU_PRED + 1)  /* 10 */
+#define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+   modes for the Y blocks to the left and above us; for interframes, there
+   is a single probability table. */
+
+union b_mode_info  /* per-block info: both members share the same storage */
+{
+    B_PREDICTION_MODE as_mode;  /* the storage viewed as a block prediction mode */
+    int_mv mv;                  /* the storage viewed as a motion vector */
+};
+
+typedef enum
+{
+    INTRA_FRAME = 0,
+    LAST_FRAME = 1,
+    GOLDEN_FRAME = 2,
+    ALTREF_FRAME = 3,
+    MAX_REF_FRAMES = 4
+} MV_REFERENCE_FRAME;
+
+typedef struct
+{
+    uint8_t mode, uv_mode;               /* NOTE(review): presumably MB_PREDICTION_MODE values -- confirm */
+    uint8_t ref_frame;                   /* NOTE(review): presumably an MV_REFERENCE_FRAME value -- confirm */
+    uint8_t is_4x4;
+    int_mv mv;
+
+    uint8_t partitioning;
+    uint8_t mb_skip_coeff;                                /* 1 = this MB has no coefficients, 0 = its tokens need to be decoded */
+    uint8_t need_to_clamp_mvs;
+    uint8_t segment_id;                  /* Which set of segmentation parameters should be used for this MB */
+} MB_MODE_INFO;
+
+typedef struct modeinfo
+{
+    MB_MODE_INFO mbmi;           /* macroblock-level mode information */
+    union b_mode_info bmi[16];   /* 16 per-block entries; NOTE(review): presumably one per Y block -- confirm */
+} MODE_INFO;
+
+#if CONFIG_MULTI_RES_ENCODING
+/* The mb-level information needed to be stored for higher-resolution encoder */
+typedef struct
+{
+    MB_PREDICTION_MODE mode;
+    MV_REFERENCE_FRAME ref_frame;
+    int_mv mv;
+    int dissim;    /* dissimilarity level of the macroblock */
+} LOWER_RES_MB_INFO;
+
+/* The frame-level information needed to be stored for higher-resolution
+ *  encoder */
+typedef struct
+{
+    FRAME_TYPE frame_type;
+    int is_frame_dropped;
+    /* The frame rate for the lowest resolution. */
+    double low_res_framerate;
+    /* The frame number of each reference frame */
+    unsigned int low_res_ref_frames[MAX_REF_FRAMES];
+    /* The video frame counter value for the key frame, for lowest resolution. */
+    unsigned int key_frame_counter_value;
+    LOWER_RES_MB_INFO *mb_info;  /* NOTE(review): presumably one entry per macroblock -- confirm */
+} LOWER_RES_FRAME_INFO;
+#endif
+
+typedef struct blockd  /* per-block view; MACROBLOCKD holds 25 of these */
+{
+    short *qcoeff;               /* NOTE(review): presumably points into MACROBLOCKD.qcoeff -- confirm */
+    short *dqcoeff;              /* NOTE(review): presumably points into MACROBLOCKD.dqcoeff -- confirm */
+    unsigned char  *predictor;   /* NOTE(review): presumably points into MACROBLOCKD.predictor -- confirm */
+    short *dequant;              /* NOTE(review): presumably one of MACROBLOCKD.dequant_* -- confirm */
+
+    int offset;
+    char *eob;                   /* NOTE(review): presumably points into MACROBLOCKD.eobs -- confirm */
+
+    union b_mode_info bmi;       /* this block's prediction mode or motion vector */
+} BLOCKD;
+
+typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+typedef struct macroblockd  /* state for one macroblock: buffers, per-block views, contexts and filter settings */
+{
+    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
+    DECLARE_ALIGNED(16, short, qcoeff[400]);   /* 400 = 25 blocks x 16 entries */
+    DECLARE_ALIGNED(16, short, dqcoeff[400]);  /* 400 = 25 blocks x 16 entries */
+    DECLARE_ALIGNED(16, char,  eobs[25]);      /* 25 entries, same count as block[] */
+
+    DECLARE_ALIGNED(16, short,  dequant_y1[16]);
+    DECLARE_ALIGNED(16, short,  dequant_y1_dc[16]);
+    DECLARE_ALIGNED(16, short,  dequant_y2[16]);
+    DECLARE_ALIGNED(16, short,  dequant_uv[16]);
+
+    /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
+    BLOCKD block[25];
+    int fullpixel_mask;
+
+    YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
+    YV12_BUFFER_CONFIG dst;
+
+    MODE_INFO *mode_info_context;
+    int mode_info_stride;
+
+    FRAME_TYPE frame_type;
+
+    int up_available;
+    int left_available;
+
+    unsigned char *recon_above[3];
+    unsigned char *recon_left[3];
+    int recon_left_stride[2];
+
+    /* Y,U,V,Y2 */
+    ENTROPY_CONTEXT_PLANES *above_context;
+    ENTROPY_CONTEXT_PLANES *left_context;
+
+    /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
+    unsigned char segmentation_enabled;
+
+    /* 0 (do not update) 1 (update) the macroblock segmentation map. */
+    unsigned char update_mb_segmentation_map;
+
+    /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
+    unsigned char update_mb_segmentation_data;
+
+    /* NOTE(review): presumably SEGMENT_DELTADATA (0) or SEGMENT_ABSDATA (1) -- confirm against the code that sets it. */
+    unsigned char mb_segement_abs_delta;
+
+    /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */
+    /* are enabled and when enabled the probabilities used to decode the per MB flags in MB_MODE_INFO */
+    vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];         /* Probability Tree used to code Segment number */
+
+    signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];            /* Segment parameters */
+
+    /* mode_based Loop filter adjustment */
+    unsigned char mode_ref_lf_delta_enabled;
+    unsigned char mode_ref_lf_delta_update;
+
+    /* Delta values have the range +/- MAX_LOOP_FILTER */
+    signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];                /* 0 = Intra, Last, GF, ARF */
+    signed char ref_lf_deltas[MAX_REF_LF_DELTAS];                     /* 0 = Intra, Last, GF, ARF */
+    signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];                      /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+    signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];                           /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+
+    /* Distance of MB away from frame edges */
+    int mb_to_left_edge;
+    int mb_to_right_edge;
+    int mb_to_top_edge;
+    int mb_to_bottom_edge;
+
+
+
+    vp8_subpix_fn_t  subpixel_predict;
+    vp8_subpix_fn_t  subpixel_predict8x4;
+    vp8_subpix_fn_t  subpixel_predict8x8;
+    vp8_subpix_fn_t  subpixel_predict16x16;
+
+    void *current_bc;
+
+    int corrupted;
+
+#if ARCH_X86 || ARCH_X86_64
+    /* This is an intermediate buffer currently used in sub-pixel motion search
+     * to keep a copy of the reference area. This buffer can be used for other
+     * purposes.
+     */
+    DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
+#endif
+} MACROBLOCKD;
+
+
+extern void vp8_build_block_doffsets(MACROBLOCKD *x);
+extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_BLOCKD_H_
diff --git a/libs/libvpx/vp8/common/coefupdateprobs.h b/libs/libvpx/vp8/common/coefupdateprobs.h
new file mode 100644
index 0000000000..d96a19e747
--- /dev/null
+++ b/libs/libvpx/vp8/common/coefupdateprobs.h
@@ -0,0 +1,197 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_COEFUPDATEPROBS_H_
+#define VP8_COMMON_COEFUPDATEPROBS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Update probabilities for the nodes in the token entropy tree.
+   Generated file included by entropy.c */
+
+const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] =
+{
+    {
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+            {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+    },
+    {
+        {
+            {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+            {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+        },
+        {
+            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+    },
+    {
+        {
+            {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+            {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+    },
+    {
+        {
+            {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+            {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+    },
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_COEFUPDATEPROBS_H_
diff --git a/libs/libvpx/vp8/common/common.h b/libs/libvpx/vp8/common/common.h
new file mode 100644
index 0000000000..e58a9cc23b
--- /dev/null
+++ b/libs/libvpx/vp8/common/common.h
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_COMMON_H_
+#define VP8_COMMON_COMMON_H_
+
+#include <assert.h>
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include "vpx_mem/vpx_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Copy Src over Dest after asserting equal sizeof. Only needed for fixed-size arrays; for structs just assign. */
+
+#define vp8_copy( Dest, Src) { \
+        assert( sizeof( Dest) == sizeof( Src)); \
+        memcpy( Dest, Src, sizeof( Src)); \
+    }
+
+/* Copy N elements from Src to Dest (variably-sized arrays). Arguments are parenthesised, so N may be an expression. */
+
+#define vp8_copy_array( Dest, Src, N) { \
+        assert( sizeof( *(Dest)) == sizeof( *(Src))); \
+        memcpy( (Dest), (Src), (N) * sizeof( *(Src))); \
+    }
+
+#define vp8_zero( Dest)  memset( &Dest, 0, sizeof( Dest)); /* zero-fills the object Dest itself (its address is taken); expansion ends in ';' */
+
+#define vp8_zero_array( Dest, N)  memset( (Dest), 0, (N) * sizeof( *(Dest))); /* zero N elements at Dest; N may be an expression; expansion ends in ';' */
+
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_COMMON_H_
diff --git a/libs/libvpx/vp8/common/context.c b/libs/libvpx/vp8/common/context.c
new file mode 100644
index 0000000000..99e95d30ff
--- /dev/null
+++ b/libs/libvpx/vp8/common/context.c
@@ -0,0 +1,399 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+/* *** GENERATED FILE: DO NOT EDIT *** */
+
+#if 0
+int Contexts[vp8_coef_counter_dimen];
+
+const int default_contexts[vp8_coef_counter_dimen] =
+{
+    {
+        // Block Type ( 0 )
+        {
+            // Coeff Band ( 0 )
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+        },
+        {
+            // Coeff Band ( 1 )
+            {30190, 26544, 225,  24,   4,   0,   0,   0,   0,   0,   0, 4171593,},
+            {26846, 25157, 1241, 130,  26,   6,   1,   0,   0,   0,   0, 149987,},
+            {10484, 9538, 1006, 160,  36,  18,   0,   0,   0,   0,   0, 15104,},
+        },
+        {
+            // Coeff Band ( 2 )
+            {25842, 40456, 1126,  83,  11,   2,   0,   0,   0,   0,   0,   0,},
+            {9338, 8010, 512,  73,   7,   3,   2,   0,   0,   0,   0, 43294,},
+            {1047, 751, 149,  31,  13,   6,   1,   0,   0,   0,   0, 879,},
+        },
+        {
+            // Coeff Band ( 3 )
+            {26136, 9826, 252,  13,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {8134, 5574, 191,  14,   2,   0,   0,   0,   0,   0,   0, 35302,},
+            { 605, 677, 116,   9,   1,   0,   0,   0,   0,   0,   0, 611,},
+        },
+        {
+            // Coeff Band ( 4 )
+            {10263, 15463, 283,  17,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {2773, 2191, 128,   9,   2,   2,   0,   0,   0,   0,   0, 10073,},
+            { 134, 125,  32,   4,   0,   2,   0,   0,   0,   0,   0,  50,},
+        },
+        {
+            // Coeff Band ( 5 )
+            {10483, 2663,  23,   1,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {2137, 1251,  27,   1,   1,   0,   0,   0,   0,   0,   0, 14362,},
+            { 116, 156,  14,   2,   1,   0,   0,   0,   0,   0,   0, 190,},
+        },
+        {
+            // Coeff Band ( 6 )
+            {40977, 27614, 412,  28,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {6113, 5213, 261,  22,   3,   0,   0,   0,   0,   0,   0, 26164,},
+            { 382, 312,  50,  14,   2,   0,   0,   0,   0,   0,   0, 345,},
+        },
+        {
+            // Coeff Band ( 7 )
+            {   0,  26,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,  13,   0,   0,   0,   0,   0,   0,   0,   0,   0, 319,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8,},
+        },
+    },
+    {
+        // Block Type ( 1 )
+        {
+            // Coeff Band ( 0 )
+            {3268, 19382, 1043, 250,  93,  82,  49,  26,  17,   8,  25, 82289,},
+            {8758, 32110, 5436, 1832, 827, 668, 420, 153,  24,   0,   3, 52914,},
+            {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399,  59,   0,   0, 18620,},
+        },
+        {
+            // Coeff Band ( 1 )
+            {12419, 8420, 452,  62,   9,   1,   0,   0,   0,   0,   0,   0,},
+            {11715, 8705, 693,  92,  15,   7,   2,   0,   0,   0,   0, 53988,},
+            {7603, 8585, 2306, 778, 270, 145,  39,   5,   0,   0,   0, 9136,},
+        },
+        {
+            // Coeff Band ( 2 )
+            {15938, 14335, 1207, 184,  55,  13,   4,   1,   0,   0,   0,   0,},
+            {7415, 6829, 1138, 244,  71,  26,   7,   0,   0,   0,   0, 9980,},
+            {1580, 1824, 655, 241,  89,  46,  10,   2,   0,   0,   0, 429,},
+        },
+        {
+            // Coeff Band ( 3 )
+            {19453, 5260, 201,  19,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {9173, 3758, 213,  22,   1,   1,   0,   0,   0,   0,   0, 9820,},
+            {1689, 1277, 276,  51,  17,   4,   0,   0,   0,   0,   0, 679,},
+        },
+        {
+            // Coeff Band ( 4 )
+            {12076, 10667, 620,  85,  19,   9,   5,   0,   0,   0,   0,   0,},
+            {4665, 3625, 423,  55,  19,   9,   0,   0,   0,   0,   0, 5127,},
+            { 415, 440, 143,  34,  20,   7,   2,   0,   0,   0,   0, 101,},
+        },
+        {
+            // Coeff Band ( 5 )
+            {12183, 4846, 115,  11,   1,   0,   0,   0,   0,   0,   0,   0,},
+            {4226, 3149, 177,  21,   2,   0,   0,   0,   0,   0,   0, 7157,},
+            { 375, 621, 189,  51,  11,   4,   1,   0,   0,   0,   0, 198,},
+        },
+        {
+            // Coeff Band ( 6 )
+            {61658, 37743, 1203,  94,  10,   3,   0,   0,   0,   0,   0,   0,},
+            {15514, 11563, 903, 111,  14,   5,   0,   0,   0,   0,   0, 25195,},
+            { 929, 1077, 291,  78,  14,   7,   1,   0,   0,   0,   0, 507,},
+        },
+        {
+            // Coeff Band ( 7 )
+            {   0, 990,  15,   3,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0, 412,  13,   0,   0,   0,   0,   0,   0,   0,   0, 1641,},
+            {   0,  18,   7,   1,   0,   0,   0,   0,   0,   0,   0,  30,},
+        },
+    },
+    {
+        // Block Type ( 2 )
+        {
+            // Coeff Band ( 0 )
+            { 953, 24519, 628, 120,  28,  12,   4,   0,   0,   0,   0, 2248798,},
+            {1525, 25654, 2647, 617, 239, 143,  42,   5,   0,   0,   0, 66837,},
+            {1180, 11011, 3001, 1237, 532, 448, 239,  54,   5,   0,   0, 7122,},
+        },
+        {
+            // Coeff Band ( 1 )
+            {1356, 2220,  67,  10,   4,   1,   0,   0,   0,   0,   0,   0,},
+            {1450, 2544, 102,  18,   4,   3,   0,   0,   0,   0,   0, 57063,},
+            {1182, 2110, 470, 130,  41,  21,   0,   0,   0,   0,   0, 6047,},
+        },
+        {
+            // Coeff Band ( 2 )
+            { 370, 3378, 200,  30,   5,   4,   1,   0,   0,   0,   0,   0,},
+            { 293, 1006, 131,  29,  11,   0,   0,   0,   0,   0,   0, 5404,},
+            { 114, 387,  98,  23,   4,   8,   1,   0,   0,   0,   0, 236,},
+        },
+        {
+            // Coeff Band ( 3 )
+            { 579, 194,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            { 395, 213,   5,   1,   0,   0,   0,   0,   0,   0,   0, 4157,},
+            { 119, 122,   4,   0,   0,   0,   0,   0,   0,   0,   0, 300,},
+        },
+        {
+            // Coeff Band ( 4 )
+            {  38, 557,  19,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {  21, 114,  12,   1,   0,   0,   0,   0,   0,   0,   0, 427,},
+            {   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7,},
+        },
+        {
+            // Coeff Band ( 5 )
+            {  52,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {  18,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0, 652,},
+            {   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  30,},
+        },
+        {
+            // Coeff Band ( 6 )
+            { 640, 569,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {  25,  77,   2,   0,   0,   0,   0,   0,   0,   0,   0, 517,},
+            {   4,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,},
+        },
+        {
+            // Coeff Band ( 7 )
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+        },
+    },
+    {
+        // Block Type ( 3 )
+        {
+            // Coeff Band ( 0 )
+            {2506, 20161, 2707, 767, 261, 178, 107,  30,  14,   3,   0, 100694,},
+            {8806, 36478, 8817, 3268, 1280, 850, 401, 114,  42,   0,   0, 58572,},
+            {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175,  32,   0,   0, 19284,},
+        },
+        {
+            // Coeff Band ( 1 )
+            {9738, 11313, 959, 205,  70,  18,  11,   1,   0,   0,   0,   0,},
+            {12628, 15085, 1507, 273,  52,  19,   9,   0,   0,   0,   0, 54280,},
+            {10701, 15846, 5561, 1926, 813, 570, 249,  36,   0,   0,   0, 6460,},
+        },
+        {
+            // Coeff Band ( 2 )
+            {6781, 22539, 2784, 634, 182, 123,  20,   4,   0,   0,   0,   0,},
+            {6263, 11544, 2649, 790, 259, 168,  27,   5,   0,   0,   0, 20539,},
+            {3109, 4075, 2031, 896, 457, 386, 158,  29,   0,   0,   0, 1138,},
+        },
+        {
+            // Coeff Band ( 3 )
+            {11515, 4079, 465,  73,   5,  14,   2,   0,   0,   0,   0,   0,},
+            {9361, 5834, 650,  96,  24,   8,   4,   0,   0,   0,   0, 22181,},
+            {4343, 3974, 1360, 415, 132,  96,  14,   1,   0,   0,   0, 1267,},
+        },
+        {
+            // Coeff Band ( 4 )
+            {4787, 9297, 823, 168,  44,  12,   4,   0,   0,   0,   0,   0,},
+            {3619, 4472, 719, 198,  60,  31,   3,   0,   0,   0,   0, 8401,},
+            {1157, 1175, 483, 182,  88,  31,   8,   0,   0,   0,   0, 268,},
+        },
+        {
+            // Coeff Band ( 5 )
+            {8299, 1226,  32,   5,   1,   0,   0,   0,   0,   0,   0,   0,},
+            {3502, 1568,  57,   4,   1,   1,   0,   0,   0,   0,   0, 9811,},
+            {1055, 1070, 166,  29,   6,   1,   0,   0,   0,   0,   0, 527,},
+        },
+        {
+            // Coeff Band ( 6 )
+            {27414, 27927, 1989, 347,  69,  26,   0,   0,   0,   0,   0,   0,},
+            {5876, 10074, 1574, 341,  91,  24,   4,   0,   0,   0,   0, 21954,},
+            {1571, 2171, 778, 324, 124,  65,  16,   0,   0,   0,   0, 979,},
+        },
+        {
+            // Coeff Band ( 7 )
+            {   0,  29,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0, 459,},
+            {   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13,},
+        },
+    },
+};
+
+//Update probabilities for the nodes in the token entropy tree.
+const vp8_prob tree_update_probs[vp8_coef_tree_dimen] =
+{
+    {
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+            {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+    },
+    {
+        {
+            {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+            {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+        },
+        {
+            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+    },
+    {
+        {
+            {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+            {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+    },
+    {
+        {
+            {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+            {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+    },
+};
+#endif
diff --git a/libs/libvpx/vp8/common/copy_c.c b/libs/libvpx/vp8/common/copy_c.c
new file mode 100644
index 0000000000..e3392913f6
--- /dev/null
+++ b/libs/libvpx/vp8/common/copy_c.c
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+
+#include "./vp8_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+/* Copy 2 macroblocks to a buffer: `height` rows of 32 bytes each, with
+ * src_ptr and dst_ptr advanced by their own strides after every row. */
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride,
+                    unsigned char *dst_ptr, int dst_stride,
+                    int height)
+{
+    int rows_left = height;
+
+    while (rows_left > 0)
+    {
+        memcpy(dst_ptr, src_ptr, 32);
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+        rows_left--;
+    }
+}
diff --git a/libs/libvpx/vp8/common/debugmodes.c b/libs/libvpx/vp8/common/debugmodes.c
new file mode 100644
index 0000000000..159fddc6a7
--- /dev/null
+++ b/libs/libvpx/vp8/common/debugmodes.c
@@ -0,0 +1,155 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include "blockd.h"
+
+
+/* Debug helper: append a text dump of one frame's mode info to "mvs.stt"
+ * (opened relative to the current directory).  mi is walked as rows of
+ * cols entries with one extra entry skipped after each row (stride
+ * cols + 1).  If the file cannot be opened, nothing is written. */
+void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int frame)
+{
+    int mb_row;
+    int mb_col;
+    int mb_index = 0;
+    FILE *mvs = fopen("mvs.stt", "a");
+
+    /* fopen() returns NULL on failure (e.g. read-only directory); bail out
+     * instead of handing a null stream to every fprintf() below. */
+    if (mvs == NULL)
+        return;
+
+    /* print out the macroblock Y modes */
+    mb_index = 0;
+    fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+
+    for (mb_row = 0; mb_row < rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cols; mb_col++)
+        {
+            fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+            mb_index++;
+        }
+
+        fprintf(mvs, "\n");
+        mb_index++;
+    }
+
+    fprintf(mvs, "\n");
+
+    mb_index = 0;
+    fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
+
+    for (mb_row = 0; mb_row < rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cols; mb_col++)
+        {
+            fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
+            mb_index++;
+        }
+
+        fprintf(mvs, "\n");
+        mb_index++;
+    }
+
+    fprintf(mvs, "\n");
+
+    /* print out the macroblock UV modes */
+    mb_index = 0;
+    fprintf(mvs, "UV Modes for Frame %d\n", frame);
+
+    for (mb_row = 0; mb_row < rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cols; mb_col++)
+        {
+            fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
+            mb_index++;
+        }
+
+        mb_index++;
+        fprintf(mvs, "\n");
+    }
+
+    fprintf(mvs, "\n");
+
+    /* print out the block modes */
+    fprintf(mvs, "Mbs for Frame %d\n", frame);
+    {
+        int b_row;
+
+        for (b_row = 0; b_row < 4 * rows; b_row++)
+        {
+            int b_col;
+            int bindex;
+
+            for (b_col = 0; b_col < 4 * cols; b_col++)
+            {
+                mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+                bindex = (b_row & 3) * 4 + (b_col & 3);
+
+                if (mi[mb_index].mbmi.mode == B_PRED)
+                    fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode);
+                else
+                    fprintf(mvs, "xx ");
+            }
+
+            fprintf(mvs, "\n");
+        }
+    }
+    fprintf(mvs, "\n");
+
+    /* print out the macroblock mvs */
+    mb_index = 0;
+    fprintf(mvs, "MVs for Frame %d\n", frame);
+
+    for (mb_row = 0; mb_row < rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cols; mb_col++)
+        {
+            fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row / 2, mi[mb_index].mbmi.mv.as_mv.col / 2);
+
+            mb_index++;
+        }
+
+        mb_index++;
+        fprintf(mvs, "\n");
+    }
+
+    fprintf(mvs, "\n");
+
+    /* print out the block mvs */
+    fprintf(mvs, "MVs for Frame %d\n", frame);
+    {
+        int b_row;
+
+        for (b_row = 0; b_row < 4 * rows; b_row++)
+        {
+            int b_col;
+            int bindex;
+
+            for (b_col = 0; b_col < 4 * cols; b_col++)
+            {
+                mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+                bindex = (b_row & 3) * 4 + (b_col & 3);
+                fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row, mi[mb_index].bmi[bindex].mv.as_mv.col);
+
+            }
+
+            fprintf(mvs, "\n");
+        }
+    }
+    fprintf(mvs, "\n");
+
+
+    fclose(mvs);
+}
diff --git a/libs/libvpx/vp8/common/default_coef_probs.h b/libs/libvpx/vp8/common/default_coef_probs.h
new file mode 100644
index 0000000000..4d69e4be66
--- /dev/null
+++ b/libs/libvpx/vp8/common/default_coef_probs.h
@@ -0,0 +1,200 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+*/
+
+#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_
+#define VP8_COMMON_DEFAULT_COEF_PROBS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Generated file, included by entropy.c.  Block types (see entropy.h): 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC. */
+
+
+static const vp8_prob default_coef_probs [BLOCK_TYPES]
+                                         [COEF_BANDS]
+                                         [PREV_COEF_CONTEXTS]
+                                         [ENTROPY_NODES] =
+{
+    { /* Block Type ( 0 ) */
+        { /* Coeff Band ( 0 )*/
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 1 )*/
+            { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+            { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+            { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 2 )*/
+            {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+            { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+            {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 3 )*/
+            {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+            { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+            {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 4 )*/
+            {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+            { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+            {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 5 )*/
+            {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+            { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+            { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 6 )*/
+            {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+            { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+            {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 7 )*/
+            {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+        }
+    },
+    { /* Block Type ( 1 ) */
+        { /* Coeff Band ( 0 )*/
+            { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
+            { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
+            {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
+        },
+        { /* Coeff Band ( 1 )*/
+            {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+            { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+            {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 2 )*/
+            {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+            {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+            {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 3 )*/
+            {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+            { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+            {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 4 )*/
+            {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+            {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+            {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 5 )*/
+            {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+            { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+            {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 6 )*/
+            {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+            { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+            {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 7 )*/
+            {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+            { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+            { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
+        }
+    },
+    { /* Block Type ( 2 ) */
+        { /* Coeff Band ( 0 )*/
+            { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+            { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+            {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
+        },
+        { /* Coeff Band ( 1 )*/
+            {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+            { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+            { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 2 )*/
+            {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+            { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+            {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 3 )*/
+            {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+            { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+            { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 4 )*/
+            {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+            { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+            { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 5 )*/
+            {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 6 )*/
+            {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+            { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+            {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 7 )*/
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+        }
+    },
+    { /* Block Type ( 3 ) */
+        { /* Coeff Band ( 0 )*/
+            { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+            { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+            {  61,  46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
+        },
+        { /* Coeff Band ( 1 )*/
+            {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+            { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+            {  39,  77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 2 )*/
+            {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+            { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+            {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 3 )*/
+            {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+            { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+            {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 4 )*/
+            {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+            { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+            {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 5 )*/
+            {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+            { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+            {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 6 )*/
+            {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+            { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+            {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 7 )*/
+            {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+        }
+    }
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_DEFAULT_COEF_PROBS_H_
diff --git a/libs/libvpx/vp8/common/dequantize.c b/libs/libvpx/vp8/common/dequantize.c
new file mode 100644
index 0000000000..f8b04fa4ee
--- /dev/null
+++ b/libs/libvpx/vp8/common/dequantize.c
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* Dequantize a block: d->dqcoeff[n] = d->qcoeff[n] * DQC[n] for each of the
+ * 16 coefficients, with the product stored back as a short. */
+void vp8_dequantize_b_c(BLOCKD *d, short *DQC)
+{
+    short *const scaled = d->dqcoeff;
+    const short *const levels = d->qcoeff;
+    int n;
+
+    for (n = 0; n != 16; ++n)
+        scaled[n] = levels[n] * DQC[n];
+}
+
+/* Scale the 16 coefficients in input by dq in place, hand them to
+ * vp8_short_idct4x4llm_c() with dest as both of its pointer arguments,
+ * then zero the first 32 bytes of input. */
+void vp8_dequant_idct_add_c(short *input, short *dq,
+                            unsigned char *dest, int stride)
+{
+    short *coef = input;
+    const short *scale = dq;
+
+    for (; coef != input + 16; ++coef, ++scale)
+        *coef = *scale * *coef;
+
+    vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
+    memset(input, 0, 32);
+}
diff --git a/libs/libvpx/vp8/common/entropy.c b/libs/libvpx/vp8/common/entropy.c
new file mode 100644
index 0000000000..c00e565f06
--- /dev/null
+++ b/libs/libvpx/vp8/common/entropy.c
@@ -0,0 +1,188 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "entropy.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "coefupdateprobs.h"
+/* vp8_norm[v] = left shift that brings the top set bit of v (1..255) to bit 7; vp8_norm[0] is 0. */
+DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) =
+{
+    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+/* Coefficient band (0..7, COEF_BANDS of them) for each of the 16 coefficient positions. */
+DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) =
+{ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
+/* Indexed by token value: ZERO_TOKEN and DCT_EOB_TOKEN map to 0, ONE_TOKEN to 1, every other token to 2. */
+DECLARE_ALIGNED(16, const unsigned char,
+                vp8_prev_token_class[MAX_ENTROPY_TOKENS]) =
+{ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
+/* Zig-zag scan order: entry i is the coefficient index (0..15) of the i-th scan position. */
+DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
+{
+    0,  1,  4,  8,
+    5,  2,  3,  6,
+    9, 12, 13, 10,
+    7, 11, 14, 15,
+};
+/* Inverse of vp8_default_zig_zag1d, stored 1-based: entry vp8_default_zig_zag1d[i] holds i + 1. */
+DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
+{
+    1,  2,  6,  7,
+    3,  5,  8, 13,
+    4,  9, 12, 14,
+   10, 11, 15, 16
+};
+
+/* vp8_default_zig_zag_mask generated with:
+
+    void vp8_init_scan_order_mask()
+    {
+        int i;
+
+        for (i = 0; i < 16; i++)
+        {
+            vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i;
+        }
+
+    }
+*/
+DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]) =
+{
+     1,    2,    32,     64,
+     4,   16,   128,   4096,
+     8,  256,  2048,   8192,
+   512, 1024, 16384, -32768   /* last entry is 1 << 15 stored in a short */
+};
+/* One bit width per macroblock-level feature (MB_LVL_MAX entries) -- NOTE(review): confirm which feature each index denotes. */
+const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
+
+/* Token tree, one node per pair of entries: -TOKEN entries are leaves, positive entries are the array index of a child pair.
+   Array indices are identical to previously-existing CONTEXT_NODE indices. */
+const vp8_tree_index vp8_coef_tree[ 22] =     /* corresponding _CONTEXT_NODEs */
+{
+    -DCT_EOB_TOKEN, 2,                             /* 0 = EOB */
+    -ZERO_TOKEN, 4,                               /* 1 = ZERO */
+    -ONE_TOKEN, 6,                               /* 2 = ONE */
+    8, 12,                                      /* 3 = LOW_VAL */
+    -TWO_TOKEN, 10,                            /* 4 = TWO */
+    -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
+    14, 16,                                    /* 6 = HIGH_LOW */
+    -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
+    18, 20,                                   /* 8 = CAT_THREEFOUR */
+    -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,  /* 9 = CAT_THREE */
+    -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6   /* 10 = CAT_FIVE */
+};
+
+/* vp8_coef_encodings generated with:
+    vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree);
+   One {path bits through vp8_coef_tree, path length} pair per token, in token-value order. */
+vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] =
+{
+    {2, 2},
+    {6, 3},
+    {28, 5},
+    {58, 6},
+    {59, 6},
+    {60, 6},
+    {61, 6},
+    {124, 7},
+    {125, 7},
+    {126, 7},
+    {127, 7},
+    {0, 1}
+};
+
+/* Trees for extra bits.  Probabilities are constant and
+   do not depend on previously encoded bits */
+/* PcatN holds one probability per extra bit of DCT_VAL_CATEGORYN (lengths match Len in vp8_extra_bits). */
+static const vp8_prob Pcat1[] = { 159};
+static const vp8_prob Pcat2[] = { 165, 145};
+static const vp8_prob Pcat3[] = { 173, 148, 140};
+static const vp8_prob Pcat4[] = { 176, 155, 140, 135};
+static const vp8_prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const vp8_prob Pcat6[] =
+{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129};
+
+
+/* tree index tables generated with:
+
+    void init_bit_tree(vp8_tree_index *p, int n)
+    {
+        int i = 0;
+
+        while (++i < n)
+        {
+            p[0] = p[1] = i << 1;
+            p += 2;
+        }
+
+        p[0] = p[1] = 0;
+    }
+
+    void init_bit_trees()
+    {
+        init_bit_tree(cat1, 1);
+        init_bit_tree(cat2, 2);
+        init_bit_tree(cat3, 3);
+        init_bit_tree(cat4, 4);
+        init_bit_tree(cat5, 5);
+        init_bit_tree(cat6, 11);
+    }
+*/
+/* Each table is a chain: both entries of node k hold the index of node k + 1's pair; the last pair is 0, 0. */
+static const vp8_tree_index cat1[2] = { 0, 0 };
+static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 };
+static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 };
+static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 };
+static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 };
+static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
+                                        14, 14, 16, 16, 18, 18, 20, 20, 0, 0 };
+/* Per-token extra-bit description {tree, prob, Len, base_val}, indexed by token value. */
+const vp8_extra_bit_struct vp8_extra_bits[12] =
+{
+    { 0, 0, 0, 0},
+    { 0, 0, 0, 1},
+    { 0, 0, 0, 2},
+    { 0, 0, 0, 3},
+    { 0, 0, 0, 4},
+    { cat1, Pcat1, 1, 5},
+    { cat2, Pcat2, 2, 7},
+    { cat3, Pcat3, 3, 11},
+    { cat4, Pcat4, 4, 19},
+    { cat5, Pcat5, 5, 35},
+    { cat6, Pcat6, 11, 67},
+    { 0, 0, 0, 0}
+};
+
+#include "default_coef_probs.h"
+/* Overwrite pc->fc.coef_probs with the compiled-in default_coef_probs table. */
+void vp8_default_coef_probs(VP8_COMMON *pc)
+{
+    memcpy(pc->fc.coef_probs, default_coef_probs, sizeof(default_coef_probs));
+}
+
diff --git a/libs/libvpx/vp8/common/entropy.h b/libs/libvpx/vp8/common/entropy.h
new file mode 100644
index 0000000000..a90bab4bac
--- /dev/null
+++ b/libs/libvpx/vp8/common/entropy.h
@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ENTROPY_H_
+#define VP8_COMMON_ENTROPY_H_
+
+#include "treecoder.h"
+#include "blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Coefficient token alphabet */
+
+#define ZERO_TOKEN              0       /* 0         Extra Bits 0+0 */
+#define ONE_TOKEN               1       /* 1         Extra Bits 0+1 */
+#define TWO_TOKEN               2       /* 2         Extra Bits 0+1 */
+#define THREE_TOKEN             3       /* 3         Extra Bits 0+1 */
+#define FOUR_TOKEN              4       /* 4         Extra Bits 0+1 */
+#define DCT_VAL_CATEGORY1       5       /* 5-6       Extra Bits 1+1 */
+#define DCT_VAL_CATEGORY2       6       /* 7-10      Extra Bits 2+1 */
+#define DCT_VAL_CATEGORY3       7       /* 11-18     Extra Bits 3+1 */
+#define DCT_VAL_CATEGORY4       8       /* 19-34     Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY5       9       /* 35-66     Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6       10      /* 67+       Extra Bits 11+1 */
+#define DCT_EOB_TOKEN           11      /* EOB       Extra Bits 0+0 */
+
+#define MAX_ENTROPY_TOKENS 12
+#define ENTROPY_NODES 11
+
+extern const vp8_tree_index vp8_coef_tree[];
+
+extern const struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];
+
+typedef struct
+{
+    vp8_tree_p tree;        /* tree for the token's extra bits; 0 in vp8_extra_bits when there are none */
+    const vp8_prob *prob;   /* probabilities used with tree; 0 when there are none */
+    int Len;                /* number of extra bits (1, 2, 3, 4, 5, 11 for categories 1..6) */
+    int base_val;           /* first value of the token's range (e.g. 5 for 5-6, 67 for 67+) */
+} vp8_extra_bit_struct;
+
+extern const vp8_extra_bit_struct vp8_extra_bits[12];    /* indexed by token value */
+
+#define PROB_UPDATE_BASELINE_COST   7
+
+#define MAX_PROB                255
+#define DCT_MAX_VALUE           2048
+
+
+/* Coefficients are predicted via a 3-dimensional probability table. */
+
+/* Outside dimension.  0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
+
+#define BLOCK_TYPES 4
+
+/* Middle dimension is a coarsening of the coefficient's
+   position within the 4x4 DCT. */
+
+#define COEF_BANDS 8
+extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);
+
+/* Inside dimension is 3-valued measure of nearby complexity, that is,
+   the extent to which nearby coefficients are nonzero.  For the first
+   coefficient (DC, unless block type is 0), we look at the (already encoded)
+   blocks above and to the left of the current block.  The context index is
+   then the number (0,1,or 2) of these blocks having nonzero coefficients.
+   After decoding a coefficient, the measure is roughly the size of the
+   most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
+   Note that the intuitive meaning of this measure changes as coefficients
+   are decoded, e.g., prior to the first token, a zero means that my neighbors
+   are empty while, after the first token, because of the use of end-of-block,
+   a zero means we just decoded a zero and hence guarantees that a non-zero
+   coefficient will appear later in this block.  However, this shift
+   in meaning is perfectly OK because our context depends also on the
+   coefficient band (and since zigzag positions 0, 1, and 2 are in
+   distinct bands). */
+
+/*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
+#   define PREV_COEF_CONTEXTS       3
+
+extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]);
+
+extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+
+struct VP8Common;
+void vp8_default_coef_probs(struct VP8Common *);
+
+extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]);
+extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
+
+void vp8_coef_tree_initialize(void);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_ENTROPY_H_
diff --git a/libs/libvpx/vp8/common/entropymode.c b/libs/libvpx/vp8/common/entropymode.c
new file mode 100644
index 0000000000..8981a8d3c2
--- /dev/null
+++ b/libs/libvpx/vp8/common/entropymode.c
@@ -0,0 +1,171 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#define USE_PREBUILT_TABLES
+
+#include "entropymode.h"
+#include "entropy.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp8_entropymodedata.h"
+
+int vp8_mv_cont(const int_mv *l, const int_mv *a)
+{
+    int lez = (l->as_int == 0);            /* l (left) is zero */
+    int aez = (a->as_int == 0);            /* a (above) is zero */
+    int lea = (l->as_int == a->as_int);    /* l and a are identical */
+    /* Classify the l/a pair into one SUBMVREF_* value; the most specific case is tested first. */
+    if (lea && lez)
+        return SUBMVREF_LEFT_ABOVE_ZED;
+    /* Identical but not both zero (the both-zero case returned above). */
+    if (lea)
+        return SUBMVREF_LEFT_ABOVE_SAME;
+    /* From here on l != a, so at most one of them is zero. */
+    if (aez)
+        return SUBMVREF_ABOVE_ZED;
+
+    if (lez)
+        return SUBMVREF_LEFT_ZED;
+    /* Different and both non-zero. */
+    return SUBMVREF_NORMAL;
+}
+
+static const vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1] = { 180, 162, 25};
+
+const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1] =
+{   /* one row per context; NOTE(review): rows presumably follow sumvfref_t order (vp8_mv_cont result) -- confirm */
+    { 147, 136, 18 },
+    { 106, 145, 1  },
+    { 179, 121, 1  },
+    { 223, 1  , 34 },
+    { 208, 1  , 1  }
+};
+
+
+
+const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS] =
+{   /* each entry maps 16 positions (written below as a 4x4 grid) to a partition index */
+    {   /* 2 partitions: first two grid rows -> 0, last two grid rows -> 1 */
+        0,  0,  0,  0,
+        0,  0,  0,  0,
+        1,  1,  1,  1,
+        1,  1,  1,  1,
+    },
+    {   /* 2 partitions: first two grid columns -> 0, last two grid columns -> 1 */
+        0,  0,  1,  1,
+        0,  0,  1,  1,
+        0,  0,  1,  1,
+        0,  0,  1,  1,
+    },
+    {   /* 4 partitions: one per 2x2 quadrant of the grid */
+        0,  0,  1,  1,
+        0,  0,  1,  1,
+        2,  2,  3,  3,
+        2,  2,  3,  3,
+    },
+    {   /* 16 partitions: every position is its own partition */
+        0,  1,  2,  3,
+        4,  5,  6,  7,
+        8,  9,  10, 11,
+        12, 13, 14, 15,
+    }
+};
+
+const int vp8_mbsplit_count [VP8_NUMMBSPLITS] = { 2, 2, 4, 16};   /* number of distinct partition indices in each vp8_mbsplits[] entry */
+
+const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1] = { 110, 111, 150};
+
+
+/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+
+const vp8_tree_index vp8_bmode_tree[18] =     /* INTRAMODECONTEXTNODE value */
+{
+    -B_DC_PRED, 2,                             /* 0 = DC_NODE */
+    -B_TM_PRED, 4,                            /* 1 = TM_NODE */
+    -B_VE_PRED, 6,                           /* 2 = VE_NODE */
+    8, 12,                                  /* 3 = COM_NODE */
+    -B_HE_PRED, 10,                        /* 4 = HE_NODE */
+    -B_RD_PRED, -B_VR_PRED,               /* 5 = RD_NODE */
+    -B_LD_PRED, 14,                        /* 6 = LD_NODE */
+    -B_VL_PRED, 16,                      /* 7 = VL_NODE */
+    -B_HD_PRED, -B_HU_PRED             /* 8 = HD_NODE */
+};
+
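+/* Again, these trees use the same probability indices as their
+   explicitly-programmed predecessors.  (Entries appear to come in node pairs:
+   a positive entry indexes the next pair, an entry <= 0 is a negated leaf.) */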
+const vp8_tree_index vp8_ymode_tree[8] =
+{
+    -DC_PRED, 2,
+    4, 6,
+    -V_PRED, -H_PRED,
+    -TM_PRED, -B_PRED
+};
+
+const vp8_tree_index vp8_kf_ymode_tree[8] =
+{
+    -B_PRED, 2,
+    4, 6,
+    -DC_PRED, -V_PRED,
+    -H_PRED, -TM_PRED
+};
+
+const vp8_tree_index vp8_uv_mode_tree[6] =
+{
+    -DC_PRED, 2,
+    -V_PRED, 4,
+    -H_PRED, -TM_PRED
+};
+
+const vp8_tree_index vp8_mbsplit_tree[6] =
+{
+    -3, 2,
+    -2, 4,
+    -0, -1
+};
+
+const vp8_tree_index vp8_mv_ref_tree[8] =
+{
+    -ZEROMV, 2,
+    -NEARESTMV, 4,
+    -NEARMV, 6,
+    -NEWMV, -SPLITMV
+};
+
+const vp8_tree_index vp8_sub_mv_ref_tree[6] =
+{
+    -LEFT4X4, 2,
+    -ABOVE4X4, 4,
+    -ZERO4X4, -NEW4X4
+};
+
+const vp8_tree_index vp8_small_mvtree [14] =
+{
+    2, 8,
+    4, 6,
+    -0, -1,
+    -2, -3,
+    10, 12,
+    -4, -5,
+    -6, -7
+};
+
+void vp8_init_mbmode_probs(VP8_COMMON *x)
+{   /* Overwrite three probability arrays in x->fc with the built-in tables. */
+    memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
+    memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
+    memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));   /* file-local table defined above */
+}
+
+void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1])
+{   /* Copies sizeof(vp8_bmode_prob) bytes into p; the caller's array must be at least that large. */
+    memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
+}
+
diff --git a/libs/libvpx/vp8/common/entropymode.h b/libs/libvpx/vp8/common/entropymode.h
new file mode 100644
index 0000000000..81bdfc4b8b
--- /dev/null
+++ b/libs/libvpx/vp8/common/entropymode.h
@@ -0,0 +1,88 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ENTROPYMODE_H_
+#define VP8_COMMON_ENTROPYMODE_H_
+
+#include "onyxc_int.h"
+#include "treecoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum
+{
+    SUBMVREF_NORMAL,
+    SUBMVREF_LEFT_ZED,
+    SUBMVREF_ABOVE_ZED,
+    SUBMVREF_LEFT_ABOVE_SAME,
+    SUBMVREF_LEFT_ABOVE_ZED
+} sumvfref_t;
+
+typedef int vp8_mbsplit[16];
+
+#define VP8_NUMMBSPLITS 4
+
+extern const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS];
+
+extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS];    /* # of subsets */
+
+extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1];
+
+extern int vp8_mv_cont(const int_mv *l, const int_mv *a);
+#define SUBMVREF_COUNT 5
+extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1];
+
+
+extern const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES];
+
+
+extern const vp8_tree_index vp8_bmode_tree[];
+
+extern const vp8_tree_index  vp8_ymode_tree[];
+extern const vp8_tree_index  vp8_kf_ymode_tree[];
+extern const vp8_tree_index  vp8_uv_mode_tree[];
+
+extern const vp8_tree_index  vp8_mbsplit_tree[];
+extern const vp8_tree_index  vp8_mv_ref_tree[];
+extern const vp8_tree_index  vp8_sub_mv_ref_tree[];
+
+extern const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES];
+extern const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES];
+extern const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES];
+extern const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES];
+extern const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS];
+
+/* Inter mode values do not start at zero */
+
+extern const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS];
+extern const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS];
+
+extern const vp8_tree_index vp8_small_mvtree[];
+
+extern const struct vp8_token_struct vp8_small_mvencodings[8];
+
+/* Key frame default mode probs */
+extern const vp8_prob vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES]
+[VP8_BINTRAMODES-1];
+extern const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1];
+extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1];
+
+void vp8_init_mbmode_probs(VP8_COMMON *x);
+void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]);
+void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_ENTROPYMODE_H_
diff --git a/libs/libvpx/vp8/common/entropymv.c b/libs/libvpx/vp8/common/entropymv.c
new file mode 100644
index 0000000000..e5df1f0955
--- /dev/null
+++ b/libs/libvpx/vp8/common/entropymv.c
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropymv.h"
+
+const MV_CONTEXT vp8_mv_update_probs[2] =
+{   /* 19 (MVPcount) entries per context, in the order of the offsets enumerated in entropymv.h */
+    {{
+        237,                                        /* mvpis_short */
+        246,                                        /* MVPsign */
+        253, 253, 254, 254, 254, 254, 254,          /* MVPshort: 7 short-tree entries */
+        254, 254, 254, 254, 254, 250, 250, 252, 254, 254 /* MVPbits: 10 long-bit entries */
+    }},
+    {{
+        231,                                        /* mvpis_short */
+        243,                                        /* MVPsign */
+        245, 253, 254, 254, 254, 254, 254,          /* MVPshort: 7 short-tree entries */
+        254, 254, 254, 254, 254, 251, 251, 254, 254, 254 /* MVPbits: 10 long-bit entries */
+    }}
+};
+const MV_CONTEXT vp8_default_mv_context[2] =
+{
+    {{
+        /* row */
+        162,                                        /* is short */
+        128,                                        /* sign */
+        225, 146, 172, 147, 214,  39, 156,          /* short tree */
+        128, 129, 132,  75, 145, 178, 206, 239, 254, 254 /* long bits */
+    }},
+
+
+
+    {{
+        /* same for column */
+        164,                                        /* is short */
+        128,                                        /* sign */
+        204, 170, 119, 235, 140, 230, 228,          /* short tree */
+        128, 130, 130,  74, 148, 180, 203, 236, 254, 254 /* long bits */
+
+    }}
+};
diff --git a/libs/libvpx/vp8/common/entropymv.h b/libs/libvpx/vp8/common/entropymv.h
new file mode 100644
index 0000000000..42840d58ad
--- /dev/null
+++ b/libs/libvpx/vp8/common/entropymv.h
@@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ENTROPYMV_H_
+#define VP8_COMMON_ENTROPYMV_H_
+
+#include "treecoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum
+{
+    mv_max  = 1023,              /* max absolute value of a MV component */
+    MVvals = (2 * mv_max) + 1,   /* # possible values "" */
+    mvfp_max  = 255,              /* max absolute value of a full pixel MV component */
+    MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */
+
+    mvlong_width = 10,       /* # bits in a long MV magnitude (10 bits cover mv_max = 1023) */
+    mvnum_short = 8,         /* magnitudes 0 through 7 */
+
+    /* probability offsets for coding each MV component */
+
+    mvpis_short = 0,         /* short (<= 7) vs long (>= 8) */
+    MVPsign,                /* sign for non-zero */
+    MVPshort,               /* 8 short values = 7-position tree */
+
+    MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */
+    MVPcount = MVPbits + mvlong_width    /* (with independent probabilities) */
+};
+
+typedef struct mv_context
+{
+    vp8_prob prob[MVPcount];  /* often come in row, col pairs */
+} MV_CONTEXT;
+
+extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_ENTROPYMV_H_
diff --git a/libs/libvpx/vp8/common/extend.c b/libs/libvpx/vp8/common/extend.c
new file mode 100644
index 0000000000..2d938ad782
--- /dev/null
+++ b/libs/libvpx/vp8/common/extend.c
@@ -0,0 +1,188 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "extend.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+static void copy_and_extend_plane   /* copy an h x w plane from s to d and replicate its edges into d's borders */
+(
+    unsigned char *s, /* source */
+    int sp,           /* source pitch */
+    unsigned char *d, /* destination */
+    int dp,           /* destination pitch */
+    int h,            /* height */
+    int w,            /* width */
+    int et,           /* extend top border */
+    int el,           /* extend left border */
+    int eb,           /* extend bottom border */
+    int er            /* extend right border */
+)
+{
+    int i;
+    unsigned char *src_ptr1, *src_ptr2;
+    unsigned char *dest_ptr1, *dest_ptr2;
+    int linesize;
+    /* d is where the first copied pixel lands; writes reach el bytes before it and et rows above it */
+    /* copy the left and right most columns out */
+    src_ptr1 = s;
+    src_ptr2 = s + w - 1;
+    dest_ptr1 = d - el;
+    dest_ptr2 = d + w;
+
+    for (i = 0; i < h; i++)
+    {
+        memset(dest_ptr1, src_ptr1[0], el);      /* left border <- first pixel of this row */
+        memcpy(dest_ptr1 + el, src_ptr1, w);     /* the row itself */
+        memset(dest_ptr2, src_ptr2[0], er);      /* right border <- last pixel of this row */
+        src_ptr1  += sp;
+        src_ptr2  += sp;
+        dest_ptr1 += dp;
+        dest_ptr2 += dp;
+    }
+
+    /* Now copy the top and bottom lines into each line of the respective
+     * borders
+     */
+    src_ptr1 = d - el;                      /* first destination row, including its left/right extension */
+    src_ptr2 = d + dp * (h - 1) - el;       /* last destination row, including its left/right extension */
+    dest_ptr1 = d + dp * (-et) - el;
+    dest_ptr2 = d + dp * (h) - el;
+    linesize = el + er + w;
+
+    for (i = 0; i < et; i++)
+    {
+        memcpy(dest_ptr1, src_ptr1, linesize);
+        dest_ptr1 += dp;
+    }
+
+    for (i = 0; i < eb; i++)
+    {
+        memcpy(dest_ptr2, src_ptr2, linesize);
+        dest_ptr2 += dp;
+    }
+}
+
+
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst)
+{   /* Copy the Y, U and V planes of src into dst, filling dst's borders by edge replication. */
+    int et = dst->border;
+    int el = dst->border;
+    int eb = dst->border + dst->y_height - src->y_height;   /* border plus the rows by which dst is taller than src */
+    int er = dst->border + dst->y_width - src->y_width;     /* border plus the columns by which dst is wider than src */
+
+    copy_and_extend_plane(src->y_buffer, src->y_stride,
+                          dst->y_buffer, dst->y_stride,
+                          src->y_height, src->y_width,
+                          et, el, eb, er);
+
+    et = dst->border >> 1;      /* the U/V planes use half of dst->border */
+    el = dst->border >> 1;
+    eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
+    er = (dst->border >> 1) + dst->uv_width - src->uv_width;
+
+    copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                          dst->u_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
+
+    copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                          dst->v_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
+}
+
+
+void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw)
+{   /* Copy only the srch x srcw rectangle whose top-left is (srcx, srcy); extend just the sides on the frame edge. */
+    int et = dst->border;
+    int el = dst->border;
+    int eb = dst->border + dst->y_height - src->y_height;
+    int er = dst->border + dst->y_width - src->y_width;
+    int src_y_offset = srcy * src->y_stride + srcx;
+    int dst_y_offset = srcy * dst->y_stride + srcx;
+    int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+    int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+
+    /* If the side is not touching the boundary then don't extend. */
+    if (srcy)
+      et = 0;
+    if (srcx)
+      el = 0;
+    if (srcy + srch != src->y_height)
+      eb = 0;
+    if (srcx + srcw != src->y_width)
+      er = 0;
+
+    copy_and_extend_plane(src->y_buffer + src_y_offset,
+                          src->y_stride,
+                          dst->y_buffer + dst_y_offset,
+                          dst->y_stride,
+                          srch, srcw,
+                          et, el, eb, er);
+
+    et = (et + 1) >> 1;         /* halve, rounding up, for the U/V planes */
+    el = (el + 1) >> 1;
+    eb = (eb + 1) >> 1;
+    er = (er + 1) >> 1;
+    srch = (srch + 1) >> 1;
+    srcw = (srcw + 1) >> 1;
+
+    copy_and_extend_plane(src->u_buffer + src_uv_offset,
+                          src->uv_stride,
+                          dst->u_buffer + dst_uv_offset,
+                          dst->uv_stride,
+                          srch, srcw,
+                          et, el, eb, er);
+
+    copy_and_extend_plane(src->v_buffer + src_uv_offset,
+                          src->uv_stride,
+                          dst->v_buffer + dst_uv_offset,
+                          dst->uv_stride,
+                          srch, srcw,
+                          et, el, eb, er);
+}
+
+
+/* note the extension is only for the last row, for intra prediction purpose */
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf,
+                       unsigned char *YPtr,
+                       unsigned char *UPtr,
+                       unsigned char *VPtr)
+{
+    int i;
+    /* Move 14 luma rows / 6 chroma rows down from the pointers passed in. */
+    YPtr += ybf->y_stride * 14;
+    UPtr += ybf->uv_stride * 6;
+    VPtr += ybf->uv_stride * 6;
+    /* Replicate the byte just before each pointer into the 4 bytes starting at it. */
+    for (i = 0; i < 4; i++)
+    {
+        YPtr[i] = YPtr[-1];
+        UPtr[i] = UPtr[-1];
+        VPtr[i] = VPtr[-1];
+    }
+    /* One more row down in every plane, then the same 4-byte replication. */
+    YPtr += ybf->y_stride;
+    UPtr += ybf->uv_stride;
+    VPtr += ybf->uv_stride;
+
+    for (i = 0; i < 4; i++)
+    {
+        YPtr[i] = YPtr[-1];
+        UPtr[i] = UPtr[-1];
+        VPtr[i] = VPtr[-1];
+    }
+}
diff --git a/libs/libvpx/vp8/common/extend.h b/libs/libvpx/vp8/common/extend.h
new file mode 100644
index 0000000000..068f4ac523
--- /dev/null
+++ b/libs/libvpx/vp8/common/extend.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_EXTEND_H_
+#define VP8_COMMON_EXTEND_H_
+
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst);
+void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_EXTEND_H_
diff --git a/libs/libvpx/vp8/common/filter.c b/libs/libvpx/vp8/common/filter.c
new file mode 100644
index 0000000000..84c608effa
--- /dev/null
+++ b/libs/libvpx/vp8/common/filter.c
@@ -0,0 +1,493 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "filter.h"
+#include "./vp8_rtcd.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
+{   /* row = xoffset/yoffset passed to vp8_bilinear_predict*; each row's two taps sum to 128 */
+    { 128,   0 },
+    { 112,  16 },
+    {  96,  32 },
+    {  80,  48 },
+    {  64,  64 },
+    {  48,  80 },
+    {  32,  96 },
+    {  16, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
+{
+    /* row = xoffset/yoffset passed to vp8_sixtap_predict*; each row's six taps sum to 128 */
+    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
+    { 0, -6,  123,   12,  -1,  0 },
+    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */
+    { 0, -9,   93,   50,  -6,  0 },
+    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */
+    { 0, -6,   50,   93,  -9,  0 },
+    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
+    { 0, -1,   12,  123,  -6,  0 },
+};
+
+static void filter_block2d_first_pass   /* 6-tap filter over unsigned char input; clamped results are stored as int */
+(
+    unsigned char *src_ptr,
+    int *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const short *vp8_filter
+)
+{
+    unsigned int i, j;
+    int  Temp;
+    /* Each output reads src_ptr[-2*pixel_step] .. src_ptr[3*pixel_step]; the caller must make that range readable. */
+    for (i = 0; i < output_height; i++)
+    {
+        for (j = 0; j < output_width; j++)
+        {
+            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+                   ((int)src_ptr[0]                 * vp8_filter[2]) +
+                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +
+                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +
+                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +
+                   (VP8_FILTER_WEIGHT >> 1);      /* Rounding */
+
+            /* Normalize back to 0-255 */
+            Temp = Temp >> VP8_FILTER_SHIFT;
+
+            if (Temp < 0)
+                Temp = 0;
+            else if (Temp > 255)
+                Temp = 255;
+
+            output_ptr[j] = Temp;
+            src_ptr++;
+        }
+
+        /* Next row... */
+        src_ptr    += src_pixels_per_line - output_width;
+        output_ptr += output_width;     /* output rows are packed back to back */
+    }
+}
+
+static void filter_block2d_second_pass   /* 6-tap filter over the int buffer; clamped results are stored as unsigned char */
+(
+    int *src_ptr,
+    unsigned char *output_ptr,
+    int output_pitch,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const short *vp8_filter
+)
+{
+    unsigned int i, j;
+    int  Temp;
+    /* Each output reads src_ptr[-2*pixel_step] .. src_ptr[3*pixel_step]; the caller must make that range readable. */
+    for (i = 0; i < output_height; i++)
+    {
+        for (j = 0; j < output_width; j++)
+        {
+            /* Apply filter */
+            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+                   ((int)src_ptr[0]                 * vp8_filter[2]) +
+                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +
+                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +
+                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +
+                   (VP8_FILTER_WEIGHT >> 1);   /* Rounding */
+
+            /* Normalize back to 0-255 */
+            Temp = Temp >> VP8_FILTER_SHIFT;
+
+            if (Temp < 0)
+                Temp = 0;
+            else if (Temp > 255)
+                Temp = 255;
+
+            output_ptr[j] = (unsigned char)Temp;
+            src_ptr++;
+        }
+
+        /* Start next row */
+        src_ptr    += src_pixels_per_line - output_width;
+        output_ptr += output_pitch;     /* destination rows are output_pitch bytes apart */
+    }
+}
+
+
+static void filter_block2d   /* 4x4 six-tap prediction: horizontal pass into FData, then vertical pass into output_ptr */
+(
+    unsigned char  *src_ptr,
+    unsigned char  *output_ptr,
+    unsigned int src_pixels_per_line,
+    int output_pitch,
+    const short  *HFilter,
+    const short  *VFilter
+)
+{
+    int FData[9*4]; /* Temp data buffer used in filtering: 9 rows of 4 ints */
+
+    /* First filter 1-D horizontally... (9 rows, starting 2 source rows above src_ptr) */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
+
+    /* then filter vertically... (FData + 8 skips the 2 leading rows of 4 ints) */
+    filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+
+void vp8_sixtap_predict4x4_c
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+    /* xoffset/yoffset pick a row of the 8-row vp8_sub_pel_filters table; they are not range-checked here */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+}
+void vp8_sixtap_predict8x8_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+    int FData[13*16];   /* Temp data buffer used in filtering (the first pass fills 13 rows of 8 ints) */
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    /* First filter 1-D horizontally... (13 rows, starting 2 source rows above src_ptr) */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
+
+
+    /* then filter vertically... (FData + 16 skips the 2 leading rows of 8 ints) */
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict8x4_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+    int FData[13*16];   /* Temp data buffer used in filtering (the first pass fills 9 rows of 8 ints) */
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    /* First filter 1-D horizontally... (9 rows, starting 2 source rows above src_ptr) */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
+
+
+    /* then filter vertically... (FData + 16 skips the 2 leading rows of 8 ints) */
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict16x16_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+    int FData[21*24];   /* Temp data buffer used in filtering (the first pass fills 21 rows of 16 ints) */
+
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    /* First filter 1-D horizontally... (21 rows, starting 2 source rows above src_ptr) */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
+
+    /* then filter vertically... (FData + 32 skips the 2 leading rows of 16 ints) */
+    filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_first_pass
+ *
+ *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
+ *                  UINT32  src_stride : Stride of source block.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the horizontal direction to produce the filtered output
+ *                  block. Used to implement first-pass of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Produces 16-bit (unsigned short) output for the next pass.
+ *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_first_pass
+(
+    unsigned char  *src_ptr,
+    unsigned short *dst_ptr,
+    unsigned int    src_stride,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
+)
+{
+    unsigned int i, j;
+
+    for (i = 0; i < height; i++)
+    {
+        for (j = 0; j < width; j++)
+        {
+            /* Apply bilinear filter */
+            dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+                          ((int)src_ptr[1] * vp8_filter[1]) +     /* reads one pixel past the current column */
+                          (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+            src_ptr++;
+        }
+
+        /* Next row... */
+        src_ptr += src_stride - width;
+        dst_ptr += width;       /* output rows are packed back to back */
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : UINT16 *src_ptr    : Pointer to source block.
+ *                  UINT32  dst_pitch  : Destination block pitch.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT8  *dst_ptr    : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the vertical direction to produce the filtered output
+ *                  block. Used to implement second-pass of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
+ *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_second_pass
+(
+    unsigned short *src_ptr,
+    unsigned char  *dst_ptr,
+    int             dst_pitch,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
+)
+{
+    unsigned int  i, j;
+    int  Temp;
+
+    for (i = 0; i < height; i++)
+    {
+        for (j = 0; j < width; j++)
+        {
+            /* Apply filter */
+            Temp = ((int)src_ptr[0]     * vp8_filter[0]) +
+                   ((int)src_ptr[width] * vp8_filter[1]) +     /* same column, next packed row: input needs height + 1 rows */
+                   (VP8_FILTER_WEIGHT / 2);
+            dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+            src_ptr++;
+        }
+
+        /* Next row... */
+        dst_ptr += dst_pitch;
+    }
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil
+ *
+ *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
+ *                  UINT32  src_pitch        : Stride of source block.
+ *                  UINT32  dst_pitch        : Stride of destination block.
+ *                  INT16  *HFilter          : Array of 2 horizontal filter taps.
+ *                  INT16  *VFilter          : Array of 2 vertical filter taps.
+ *                  INT32  Width             : Block width
+ *                  INT32  Height            : Block height
+ *
+ *  OUTPUTS       : UINT8  *dst_ptr       : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 2-D filters an input block by applying a 2-tap
+ *                  bi-linear filter horizontally followed by a 2-tap
+ *                  bi-linear filter vertically on the result.
+ *
+ *  SPECIAL NOTES : The largest block size can be handled here is 16x16
+ *
+ ****************************************************************************/
+static void filter_block2d_bil
+(
+    unsigned char *src_ptr,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
+    unsigned int   dst_pitch,
+    const short   *HFilter,
+    const short   *VFilter,
+    int            Width,
+    int            Height
+)
+{
+
+    unsigned short FData[17*16];    /* Temp data buffer used in filtering: up to (16 + 1) rows of 16 */
+
+    /* First filter 1-D horizontally... (Height + 1 rows, because the vertical pass reads one row below) */
+    filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+    /* then 1-D vertically... */
+    filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+
+void vp8_bilinear_predict4x4_c
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+    /* xoffset/yoffset pick a row of the 8-row vp8_bilinear_filters table; they are not range-checked here */
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+#if 0   /* never compiled: compares the C result against bilinear_predict4x4_mmx */
+    {
+        int i;
+        unsigned char temp1[16];
+        unsigned char temp2[16];
+
+        bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+        filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+
+        for (i = 0; i < 16; i++)
+        {
+            if (temp1[i] != temp2[i])
+            {
+                bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+                filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+            }
+        }
+    }
+#endif
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+
+}
+
+void vp8_bilinear_predict8x8_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+    /* xoffset/yoffset pick a row of the 8-row vp8_bilinear_filters table; they are not range-checked here */
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);   /* Width 8, Height 8 */
+
+}
+
+void vp8_bilinear_predict8x4_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+    /* xoffset/yoffset pick a row of the 8-row vp8_bilinear_filters table; they are not range-checked here */
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);   /* Width 8, Height 4 */
+
+}
+
+void vp8_bilinear_predict16x16_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+    /* xoffset/yoffset pick a row of the 8-row vp8_bilinear_filters table; they are not range-checked here */
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);   /* Width 16, Height 16 */
+}
diff --git a/libs/libvpx/vp8/common/filter.h b/libs/libvpx/vp8/common/filter.h
new file mode 100644
index 0000000000..cfba775fce
--- /dev/null
+++ b/libs/libvpx/vp8/common/filter.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_FILTER_H_
+#define VP8_COMMON_FILTER_H_
+
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128  /* == 1 << VP8_FILTER_SHIFT */
+#define VP8_FILTER_SHIFT  7
+/* Tap tables, 8 rows each: 2 taps per row (bilinear), 6 per row (sub-pel). */
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]);
+extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_FILTER_H_
diff --git a/libs/libvpx/vp8/common/findnearmv.c b/libs/libvpx/vp8/common/findnearmv.c
new file mode 100644
index 0000000000..e8ee40f56c
--- /dev/null
+++ b/libs/libvpx/vp8/common/findnearmv.c
@@ -0,0 +1,193 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "findnearmv.h"
+
+const unsigned char vp8_mbsplit_offset[4][16] = {  /* 4 rows x 16 entries */
+    { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
+};  /* NOTE(review): rows 0-2 look zero-padded after 2, 2 and 4 entries -- confirm */
+
+/* Predict motion vectors using those from already-decoded nearby blocks.
+   Note that we only consider one 4x4 subblock from each candidate 16x16
+   macroblock.   */
+void vp8_find_near_mvs
+(
+    MACROBLOCKD *xd,                /* only mode_info_stride is read here */
+    const MODE_INFO *here,          /* entry whose neighbours are examined */
+    int_mv *nearest,                /* out: near_mvs[CNT_NEAREST] */
+    int_mv *nearby,                 /* out: near_mvs[CNT_NEAR] */
+    int_mv *best_mv,                /* out: near_mvs[0] */
+    int cnt[4],                     /* out: weights, indexed by CNT_* below */
+    int refframe,                   /* index into ref_frame_sign_bias */
+    int *ref_frame_sign_bias        /* read only, passed on to mv_bias() */
+)
+{
+    const MODE_INFO *above = here - xd->mode_info_stride;
+    const MODE_INFO *left = here - 1;
+    const MODE_INFO *aboveleft = above - 1;
+    int_mv            near_mvs[4];
+    int_mv           *mv = near_mvs;   /* last candidate stored so far */
+    int             *cntx = cnt;       /* starts at cnt[CNT_INTRA] */
+    enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
+
+    /* Zero accumulators */
+    mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;  /* near_mvs[3] not cleared */
+    cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+
+    /* Process above */
+    if (above->mbmi.ref_frame != INTRA_FRAME)
+    {
+        if (above->mbmi.mv.as_int)
+        {
+            (++mv)->as_int = above->mbmi.mv.as_int;
+            mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias);
+            ++cntx;
+        }
+
+        *cntx += 2;  /* cnt[CNT_INTRA] if the MV was zero, else cnt[CNT_NEAREST] */
+    }
+
+    /* Process left */
+    if (left->mbmi.ref_frame != INTRA_FRAME)
+    {
+        if (left->mbmi.mv.as_int)
+        {
+            int_mv this_mv;
+
+            this_mv.as_int = left->mbmi.mv.as_int;
+            mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
+
+            if (this_mv.as_int != mv->as_int)  /* differs from the last stored MV */
+            {
+                (++mv)->as_int = this_mv.as_int;
+                ++cntx;
+            }
+
+            *cntx += 2;  /* left carries the same weight as above */
+        }
+        else
+            cnt[CNT_INTRA] += 2;
+    }
+
+    /* Process above left */
+    if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+    {
+        if (aboveleft->mbmi.mv.as_int)
+        {
+            int_mv this_mv;
+
+            this_mv.as_int = aboveleft->mbmi.mv.as_int;
+            mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
+
+            if (this_mv.as_int != mv->as_int)  /* differs from the last stored MV */
+            {
+                (++mv)->as_int = this_mv.as_int;
+                ++cntx;
+            }
+
+            *cntx += 1;  /* above-left carries half the weight */
+        }
+        else
+            cnt[CNT_INTRA] += 1;
+    }
+
+    /* If we have three distinct MV's ... */
+    if (cnt[CNT_SPLITMV])
+    {
+        /* See if above-left MV can be merged with NEAREST */
+        if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
+            cnt[CNT_NEAREST] += 1;
+    }
+    /* cnt[CNT_SPLITMV] is overwritten: SPLITMV neighbours, weighted 2/2/1. */
+    cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+                        + (left->mbmi.mode == SPLITMV)) * 2
+                       + (aboveleft->mbmi.mode == SPLITMV);
+
+    /* Swap near and nearest if necessary */
+    if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
+    {
+        int tmp;
+        tmp = cnt[CNT_NEAREST];
+        cnt[CNT_NEAREST] = cnt[CNT_NEAR];
+        cnt[CNT_NEAR] = tmp;
+        tmp = near_mvs[CNT_NEAREST].as_int;
+        near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+        near_mvs[CNT_NEAR].as_int = tmp;
+    }
+
+    /* Use near_mvs[0] to store the "best" MV */
+    if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
+        near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
+
+    /* Set up return values */
+    best_mv->as_int = near_mvs[0].as_int;
+    nearest->as_int = near_mvs[CNT_NEAREST].as_int;
+    nearby->as_int = near_mvs[CNT_NEAR].as_int;
+}
+
+/* Write the negation of *src to *inv, then clamp both with vp8_clamp_mv2(). */
+static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd)
+{
+    inv->as_mv.row = -src->as_mv.row;
+    inv->as_mv.col = -src->as_mv.col;
+    vp8_clamp_mv2(inv, xd);
+    vp8_clamp_mv2(src, xd);
+}
+
+
+/* Run vp8_find_near_mvs() for refframe, store the results in the slot
+ * selected by that frame's sign bias, and fill the other slot with the
+ * negated vectors.  Returns the sign bias that was used. */
+int vp8_find_near_mvs_bias
+(
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    int_mv mode_mv_sb[2][MB_MODE_COUNT],
+    int_mv best_mv_sb[2],
+    int cnt[4],
+    int refframe,
+    int *ref_frame_sign_bias
+)
+{
+    /* Slot for refframe's own sign bias, and the opposite slot. */
+    const int bias = ref_frame_sign_bias[refframe];
+    const int other = !bias;
+    int_mv *const same = mode_mv_sb[bias];
+    int_mv *const flip = mode_mv_sb[other];
+
+    /* Candidates for the frame's own sign bias. */
+    vp8_find_near_mvs(xd, here, &same[NEARESTMV], &same[NEARMV],
+                      &best_mv_sb[bias], cnt, refframe, ref_frame_sign_bias);
+
+    /* Mirror each vector into the other slot; both copies get clamped. */
+    invert_and_clamp_mvs(&flip[NEARESTMV], &same[NEARESTMV], xd);
+    invert_and_clamp_mvs(&flip[NEARMV], &same[NEARMV], xd);
+    invert_and_clamp_mvs(&best_mv_sb[other], &best_mv_sb[bias], xd);
+
+    return bias;
+}
+
+
+vp8_prob *vp8_mv_ref_probs(
+    vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
+)
+{
+    int i;
+
+    /* p[i] = column i of the context row picked by near_mv_ref_ct[i]. */
+    for (i = 0; i < 4; i++)
+        p[i] = vp8_mode_contexts[near_mv_ref_ct[i]][i];
+    return p;
+}
+
diff --git a/libs/libvpx/vp8/common/findnearmv.h b/libs/libvpx/vp8/common/findnearmv.h
new file mode 100644
index 0000000000..155847ca24
--- /dev/null
+++ b/libs/libvpx/vp8/common/findnearmv.h
@@ -0,0 +1,194 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_FINDNEARMV_H_
+#define VP8_COMMON_FINDNEARMV_H_
+
+#include "./vpx_config.h"
+#include "mv.h"
+#include "blockd.h"
+#include "modecont.h"
+#include "treecoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+static INLINE void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
+                           int_mv *mvp, const int *ref_frame_sign_bias)
+{
+    /* Same sign bias as the target reference: leave the vector alone. */
+    if (refmb_ref_frame_sign_bias == ref_frame_sign_bias[refframe])
+        return;
+    mvp->as_mv.row = -mvp->as_mv.row;
+    mvp->as_mv.col = -mvp->as_mv.col;
+}
+
+#define LEFT_TOP_MARGIN (16 << 3)      /* 128: slack past the left/top edge */
+#define RIGHT_BOTTOM_MARGIN (16 << 3)  /* 128: slack past the right/bottom edge */
+static INLINE void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd)
+{   /* Clamp *mv in place to xd's mb_to_*_edge limits widened by the margins. */
+    if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+        mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+    else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+        mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+    /* Same clamp for the row, against the top and bottom limits. */
+    if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+        mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+    else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+        mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+}
+
+static INLINE void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge,
+                                int mb_to_right_edge, int mb_to_top_edge,
+                                int mb_to_bottom_edge)
+{
+    /* Clamp *mv in place.  Each bound is tested on its own, in the order
+     * left, right, top, bottom (no else), so the later bound of a pair
+     * wins if the two limits ever cross. */
+    if (mv->as_mv.col < mb_to_left_edge) mv->as_mv.col = mb_to_left_edge;
+    if (mv->as_mv.col > mb_to_right_edge) mv->as_mv.col = mb_to_right_edge;
+    if (mv->as_mv.row < mb_to_top_edge) mv->as_mv.row = mb_to_top_edge;
+    if (mv->as_mv.row > mb_to_bottom_edge) mv->as_mv.row = mb_to_bottom_edge;
+}
+
+static INLINE unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
+                                               int mb_to_right_edge,
+                                               int mb_to_top_edge,
+                                               int mb_to_bottom_edge)
+{
+    /* 1 when the vector lies outside any of the four bounds, else 0;
+     * nothing is modified. */
+    return (mv->as_mv.col < mb_to_left_edge) ||
+           (mv->as_mv.col > mb_to_right_edge) ||
+           (mv->as_mv.row < mb_to_top_edge) ||
+           (mv->as_mv.row > mb_to_bottom_edge);
+}
+
+void vp8_find_near_mvs
+(
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    int_mv *nearest, int_mv *nearby, int_mv *best,
+    int near_mv_ref_cts[4],
+    int refframe,
+    int *ref_frame_sign_bias
+);
+
+
+int vp8_find_near_mvs_bias
+(
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    int_mv mode_mv_sb[2][MB_MODE_COUNT],
+    int_mv best_mv_sb[2],
+    int cnt[4],
+    int refframe,
+    int *ref_frame_sign_bias
+);
+
+
+vp8_prob *vp8_mv_ref_probs(
+    vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
+);
+
+extern const unsigned char vp8_mbsplit_offset[4][16];
+
+
+static INLINE int left_block_mv(const MODE_INFO *cur_mb, int b)
+{
+    /* (b & 3) == 0: b is in the left column, so use the MB to the left. */
+    if ((b & 3) == 0)
+    {
+        const MODE_INFO *left_mb = cur_mb - 1;
+
+        if (left_mb->mbmi.mode != SPLITMV)
+            return left_mb->mbmi.mv.as_int;
+        return left_mb->bmi[b + 3].mv.as_int;
+    }
+
+    return cur_mb->bmi[b - 1].mv.as_int;
+}
+
+static INLINE int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
+{
+    /* (b >> 2) == 0: b is in the top row, so use the MB one stride back. */
+    if ((b >> 2) == 0)
+    {
+        const MODE_INFO *above_mb = cur_mb - mi_stride;
+
+        if (above_mb->mbmi.mode != SPLITMV)
+            return above_mb->mbmi.mv.as_int;
+        return above_mb->bmi[b + 12].mv.as_int;
+    }
+
+    return cur_mb->bmi[b - 4].mv.as_int;
+}
+static INLINE B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
+{
+    const MODE_INFO *left_mb;
+    /* Not in the left column: the neighbour is sub-block b - 1 of this MB. */
+    if ((b & 3) != 0)
+        return cur_mb->bmi[b - 1].as_mode;
+
+    /* Left column: consult the macroblock to the left. */
+    left_mb = cur_mb - 1;
+    switch (left_mb->mbmi.mode)
+    {
+        case B_PRED:
+            return left_mb->bmi[b + 3].as_mode;
+        case V_PRED:
+            return B_VE_PRED;
+        case H_PRED:
+            return B_HE_PRED;
+        case TM_PRED:
+            return B_TM_PRED;
+        case DC_PRED:
+        default:
+            /* DC_PRED and every other mode map to B_DC_PRED. */
+            return B_DC_PRED;
+    }
+}
+
+static INLINE B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b,
+                                                 int mi_stride)
+{
+    const MODE_INFO *above_mb;
+
+    /* Not in the top row: the neighbour is sub-block b - 4 of this MB. */
+    if ((b >> 2) != 0)
+        return cur_mb->bmi[b - 4].as_mode;
+
+    /* Top row: consult the macroblock one stride back. */
+    above_mb = cur_mb - mi_stride;
+    switch (above_mb->mbmi.mode)
+    {
+        case B_PRED:
+            return above_mb->bmi[b + 12].as_mode;
+        case V_PRED:
+            return B_VE_PRED;
+        case H_PRED:
+            return B_HE_PRED;
+        case TM_PRED:
+            return B_TM_PRED;
+        case DC_PRED:
+        default:
+            /* DC_PRED and every other mode map to B_DC_PRED. */
+            return B_DC_PRED;
+    }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_FINDNEARMV_H_
diff --git a/libs/libvpx/vp8/common/generic/systemdependent.c b/libs/libvpx/vp8/common/generic/systemdependent.c
new file mode 100644
index 0000000000..28dc262ae5
--- /dev/null
+++ b/libs/libvpx/vp8/common/generic/systemdependent.c
@@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#elif ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#endif
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/systemdependent.h"
+
+#if CONFIG_MULTITHREAD
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <windows.h>
+typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
+#elif defined(__OS2__)
+#define INCL_DOS
+#define INCL_DOSSPINLOCK
+#include <os2.h>
+#endif
+#endif
+
+#if CONFIG_MULTITHREAD
+static int get_cpu_count()
+{
+    int core_count = 16;   /* kept when no branch below assigns a value */
+    /* Probe the processor count with the API selected at build time. */
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#if defined(_SC_NPROCESSORS_ONLN)
+    core_count = sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+    core_count = sysconf(_SC_NPROC_ONLN);
+#endif
+#elif defined(_WIN32)
+    {
+#if _WIN32_WINNT >= 0x0501
+        SYSTEM_INFO sysinfo;
+        GetNativeSystemInfo(&sysinfo);
+#else
+        PGNSI pGNSI;
+        SYSTEM_INFO sysinfo;
+
+        /* Call GetNativeSystemInfo if supported or
+         * GetSystemInfo otherwise. */
+
+        pGNSI = (PGNSI) GetProcAddress(
+                GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");
+        if (pGNSI != NULL)
+            pGNSI(&sysinfo);
+        else
+            GetSystemInfo(&sysinfo);
+#endif
+
+        core_count = sysinfo.dwNumberOfProcessors;
+    }
+#elif defined(__OS2__)
+    {
+        ULONG proc_id;
+        ULONG status;
+
+        core_count = 0;
+        for (proc_id = 1; ; proc_id++)
+        {
+            if (DosGetProcessorStatus(proc_id, &status))  /* non-zero ends the scan */
+                break;
+
+            if (status == PROC_ONLINE)
+                core_count++;
+        }
+    }
+#else
+    /* other platforms */
+#endif
+    /* Never report fewer than one core (covers zero or negative results). */
+    return core_count > 0 ? core_count : 1;
+}
+#endif
+
+void vp8_clear_system_state_c() {}  /* no-op in this generic C build */
+
+void vp8_machine_specific_config(VP8_COMMON *ctx)
+{   /* Fill in the machine-dependent fields of *ctx (selected at build time). */
+#if CONFIG_MULTITHREAD
+    ctx->processor_core_count = get_cpu_count();
+#endif /* CONFIG_MULTITHREAD */
+    /* cpu_caps is only assigned on ARM and x86/x86-64 builds. */
+#if ARCH_ARM
+    ctx->cpu_caps = arm_cpu_caps();
+#elif ARCH_X86 || ARCH_X86_64
+    ctx->cpu_caps = x86_simd_caps();
+#endif
+}
diff --git a/libs/libvpx/vp8/common/header.h b/libs/libvpx/vp8/common/header.h
new file mode 100644
index 0000000000..e27bca16bd
--- /dev/null
+++ b/libs/libvpx/vp8/common/header.h
@@ -0,0 +1,51 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_HEADER_H_
+#define VP8_COMMON_HEADER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* 24 bits total (1 + 3 + 1 + 19) when PACKET_TESTING is not defined */
+typedef struct
+{
+    unsigned int type: 1;
+    unsigned int version: 3;
+    unsigned int show_frame: 1;
+
+    /* 19 bits: lengths up to 2^19 - 1 bytes (just under 4 megabits) */
+
+    unsigned int first_partition_length_in_bytes: 19;
+
+#ifdef PACKET_TESTING
+    unsigned int frame_number;
+    unsigned int update_gold: 1;
+    unsigned int uses_gold: 1;
+    unsigned int update_last: 1;
+    unsigned int uses_last: 1;
+#endif
+
+} VP8_HEADER;
+
+#ifdef PACKET_TESTING
+#define VP8_HEADER_SIZE 8
+#else
+#define VP8_HEADER_SIZE 3  /* 3 * 8 == the 24 bits counted above */
+#endif
+
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_HEADER_H_
diff --git a/libs/libvpx/vp8/common/idct_blk.c b/libs/libvpx/vp8/common/idct_blk.c
new file mode 100644
index 0000000000..8aa7d9bf0f
--- /dev/null
+++ b/libs/libvpx/vp8/common/idct_blk.c
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_dequant_idct_add_c(short *input, short *dq,
+                            unsigned char *dest, int stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
+                            int pred_stride, unsigned char *dst_ptr,
+                            int dst_stride);
+
+/* Reconstruct the 16 4x4 blocks of a 16x16 area at dst: a full dequant+IDCT
+ * when a block's eob exceeds 1, otherwise the DC-only shortcut. */
+void vp8_dequant_idct_add_y_block_c
+            (short *q, short *dq,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int blk;
+
+    /* Blocks are visited in raster order, 16 coefficients apiece. */
+    for (blk = 0; blk < 16; blk++)
+    {
+        short *coeffs = q + 16 * blk;
+        unsigned char *out = dst + (blk >> 2) * 4 * stride + (blk & 3) * 4;
+
+        if (eobs[blk] > 1)
+        {
+            vp8_dequant_idct_add_c (coeffs, dq, out, stride);
+            continue;
+        }
+
+        /* eob <= 1: DC-only path, then clear the first two coefficients. */
+        vp8_dc_only_idct_add_c (coeffs[0]*dq[0], out, stride, out, stride);
+        memset(coeffs, 0, 2 * sizeof(coeffs[0]));
+    }
+}
+
+/* U/V counterpart of the Y routine: the four 4x4 blocks of the U plane
+ * are reconstructed first, then the four of the V plane.  q and eobs are
+ * consumed continuously across both planes (8 blocks of 16 coefficients,
+ * 8 eob entries).  A block whose eob exceeds 1 gets the full dequant+IDCT,
+ * any other block takes the DC-only shortcut. */
+void vp8_dequant_idct_add_uv_block_c
+            (short *q, short *dq,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    unsigned char *plane[2];
+    int p, blk;
+
+    /* Destination planes in processing order. */
+    plane[0] = dstu;
+    plane[1] = dstv;
+
+    for (p = 0; p < 2; p++)
+    {
+        /* Blocks sit 2x2 in raster order: 4 pixels apart horizontally,
+         * 4 * stride apart vertically. */
+        for (blk = 0; blk < 4; blk++)
+        {
+            unsigned char *out = plane[p] + (blk >> 1) * 4 * stride
+                                 + (blk & 1) * 4;
+            const int eob = *eobs++;
+
+            if (eob > 1)
+            {
+                /* eob above 1: full dequant + IDCT. */
+                vp8_dequant_idct_add_c (q, dq, out, stride);
+            }
+            else
+            {
+                /* DC-only shortcut, applied in place on the destination;
+                 * afterwards the first two coefficients are cleared. */
+                vp8_dc_only_idct_add_c (q[0]*dq[0], out, stride, out, stride);
+                memset(q, 0, 2 * sizeof(q[0]));
+            }
+
+            /* Next block's 16 coefficients. */
+            q += 16;
+        }
+    }
+}
diff --git a/libs/libvpx/vp8/common/idctllm.c b/libs/libvpx/vp8/common/idctllm.c
new file mode 100644
index 0000000000..f5403c5aaf
--- /dev/null
+++ b/libs/libvpx/vp8/common/idctllm.c
@@ -0,0 +1,205 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+
+/****************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16 bit fixed point version of two multiply
+ * constants:
+ *         1.   sqrt(2) * cos (pi/8)
+ *         2.   sqrt(2) * sin (pi/8)
+ * Because the first constant is bigger than 1, to maintain the same 16 bit
+ * fixed point precision as the second one, we use a trick of
+ *         x * a = x + x*(a-1)
+ * so
+ *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+ **************************************************************************/
+static const int cospi8sqrt2minus1 = 20091;  /* (sqrt(2)*cos(pi/8) - 1) * 2^16, rounded */
+static const int sinpi8sqrt2      = 35468;  /* sqrt(2)*sin(pi/8) * 2^16, rounded */
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr,
+                            int pred_stride, unsigned char *dst_ptr,
+                            int dst_stride)
+{
+    int i;
+    int r, c;
+    int a1, b1, c1, d1;
+    short output[16];   /* 4x4 scratch; pass 2 rewrites it in place */
+    short *ip = input;
+    short *op = output;
+    int temp1, temp2;
+    int shortpitch = 4;
+    /* Pass 1: for each of 4 offsets, transform the values spaced 4 apart. */
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[8];
+        b1 = ip[0] - ip[8];
+
+        temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+        temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+        c1 = temp1 - temp2;
+
+        temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+        temp2 = (ip[12] * sinpi8sqrt2) >> 16;
+        d1 = temp1 + temp2;
+
+        op[shortpitch*0] = a1 + d1;
+        op[shortpitch*3] = a1 - d1;
+
+        op[shortpitch*1] = b1 + c1;
+        op[shortpitch*2] = b1 - c1;
+
+        ip++;
+        op++;
+    }
+
+    ip = output;
+    op = output;
+    /* Pass 2: the same butterfly on each run of 4 consecutive values. */
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[2];
+        b1 = ip[0] - ip[2];
+
+        temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+        temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+        c1 = temp1 - temp2;
+
+        temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+        temp2 = (ip[3] * sinpi8sqrt2) >> 16;
+        d1 = temp1 + temp2;
+
+        /* Round (+4) and shift right by 3 while storing. */
+        op[0] = (a1 + d1 + 4) >> 3;
+        op[3] = (a1 - d1 + 4) >> 3;
+
+        op[1] = (b1 + c1 + 4) >> 3;
+        op[2] = (b1 - c1 + 4) >> 3;
+
+        ip += shortpitch;
+        op += shortpitch;
+    }
+    /* Add the result to the prediction and saturate each pixel to 0..255. */
+    ip = output;
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            int a = ip[c] + pred_ptr[c] ;
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dst_ptr[c] = (unsigned char) a ;
+        }
+        ip += 4;
+        dst_ptr += dst_stride;
+        pred_ptr += pred_stride;
+    }
+}
+
+/* DC-only reconstruction: every pixel of the 4x4 block gets the same
+ * rounded residual, (input_dc + 4) >> 3, added to its prediction. */
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+                            int pred_stride, unsigned char *dst_ptr,
+                            int dst_stride)
+{
+    const int residual = (input_dc + 4) >> 3;
+    const unsigned char *pred_row = pred_ptr;
+    unsigned char *dst_row = dst_ptr;
+    int row;
+
+    for (row = 0; row < 4; row++)
+    {
+        int col;
+
+        for (col = 0; col < 4; col++)
+        {
+            const int v = residual + pred_row[col];
+
+            /* Saturate to the 8-bit pixel range. */
+            dst_row[col] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
+        }
+
+        pred_row += pred_stride;
+        dst_row += dst_stride;
+    }
+}
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)
+{
+    short output[16];   /* 4x4 scratch; pass 2 rewrites it in place */
+    int i;
+    int a1, b1, c1, d1;
+    int a2, b2, c2, d2;
+    short *ip = input;
+    short *op = output;
+    /* Pass 1: for each of 4 offsets, combine the values spaced 4 apart. */
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[12];
+        b1 = ip[4] + ip[8];
+        c1 = ip[4] - ip[8];
+        d1 = ip[0] - ip[12];
+
+        op[0] = a1 + b1;
+        op[4] = c1 + d1;
+        op[8] = a1 - b1;
+        op[12] = d1 - c1;
+        ip++;
+        op++;
+    }
+
+    ip = output;
+    op = output;
+    /* Pass 2: combine each run of 4 consecutive values; round (+3), >> 3. */
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[3];
+        b1 = ip[1] + ip[2];
+        c1 = ip[1] - ip[2];
+        d1 = ip[0] - ip[3];
+
+        a2 = a1 + b1;
+        b2 = c1 + d1;
+        c2 = a1 - b1;
+        d2 = d1 - c1;
+
+        op[0] = (a2 + 3) >> 3;
+        op[1] = (b2 + 3) >> 3;
+        op[2] = (c2 + 3) >> 3;
+        op[3] = (d2 + 3) >> 3;
+
+        ip += 4;
+        op += 4;
+    }
+    /* Scatter: result i is stored at mb_dqcoeff[i * 16]. */
+    for(i = 0; i < 16; i++)
+    {
+        mb_dqcoeff[i * 16] = output[i];
+    }
+}
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)
+{
+    /* Only input[0] is used: its rounded value, (input[0] + 3) >> 3, is
+     * written to every 16th entry of mb_dqcoeff, 16 entries in total
+     * (indices 0, 16, ..., 240). */
+    const int dc = (input[0] + 3) >> 3;
+    int blk;
+
+    for (blk = 0; blk < 16; ++blk)
+        mb_dqcoeff[blk << 4] = dc;
+}
diff --git a/libs/libvpx/vp8/common/invtrans.h b/libs/libvpx/vp8/common/invtrans.h
new file mode 100644
index 0000000000..9cfea8d513
--- /dev/null
+++ b/libs/libvpx/vp8/common/invtrans.h
@@ -0,0 +1,70 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_INVTRANS_H_
+#define VP8_COMMON_INVTRANS_H_
+
+#include "./vpx_config.h"
+#include "vp8_rtcd.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+#if CONFIG_MULTITHREAD
+#include "vpx_mem/vpx_mem.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static void eob_adjust(char *eobs, short *diff)
+{
+    /* The idct may be skipped only when both the eob and the DC value of
+     * a block are zero, so an eob of 0 with a non-zero DC is raised to 1.
+     * Block n's DC value is diff[16 * n]; 16 blocks are checked. */
+    int blk;
+
+    for (blk = 0; blk < 16; ++blk)
+        if (eobs[blk] == 0 && diff[blk << 4] != 0)
+            eobs[blk] = 1;
+}
+
+/* Inverse-transform the Y blocks of the current macroblock and add the
+ * result into xd->dst. */
+static INLINE void vp8_inverse_transform_mby(MACROBLOCKD *xd)
+{
+    /* Every mode except SPLITMV carries a second-order (DC) block, kept in
+     * block 24, and then dequantizes with the y1_dc table. */
+    const int second_order = (xd->mode_info_context->mbmi.mode != SPLITMV);
+    short *dequant = second_order ? xd->dequant_y1_dc : xd->dequant_y1;
+
+    if (second_order)
+    {
+        /* eob > 1: full inverse Walsh; otherwise the DC-only variant. */
+        if (xd->eobs[24] > 1)
+            vp8_short_inv_walsh4x4(&xd->block[24].dqcoeff[0], xd->qcoeff);
+        else
+            vp8_short_inv_walsh4x4_1(&xd->block[24].dqcoeff[0], xd->qcoeff);
+
+        /* The Walsh output may have made a DC non-zero where eob was 0. */
+        eob_adjust(xd->eobs, xd->qcoeff);
+    }
+
+    /* Dequantize and inverse-transform the Y blocks into dst. */
+    vp8_dequant_idct_add_y_block(xd->qcoeff, dequant,
+                                 xd->dst.y_buffer,
+                                 xd->dst.y_stride, xd->eobs);
+}
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_INVTRANS_H_
diff --git a/libs/libvpx/vp8/common/loopfilter.h b/libs/libvpx/vp8/common/loopfilter.h
new file mode 100644
index 0000000000..20a6bd375b
--- /dev/null
+++ b/libs/libvpx/vp8/common/loopfilter.h
@@ -0,0 +1,113 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_LOOPFILTER_H_
+#define VP8_COMMON_LOOPFILTER_H_
+
+#include "vpx_ports/mem.h"
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LOOP_FILTER             63
+/* fraction of total macroblock rows to be used in fast filter level picking */
+/* has to be > 2 */
+#define PARTIAL_FRAME_FRACTION      8
+
+typedef enum
+{
+    NORMAL_LOOPFILTER = 0,
+    SIMPLE_LOOPFILTER = 1
+} LOOPFILTERTYPE;
+
+#if ARCH_ARM
+#define SIMD_WIDTH 1
+#else
+#define SIMD_WIDTH 16
+#endif
+
+/* Need to align this structure so when it is declared and passed it can be
+ * loaded into vector registers.  The first four tables hold SIMD_WIDTH
+ * bytes per row; mblim/blim/lim have MAX_LOOP_FILTER + 1 (64) rows each. */
+typedef struct
+{
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]);
+    unsigned char lvl[4][4][4];
+    unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+    unsigned char mode_lf_lut[10];
+} loop_filter_info_n;
+
+typedef struct loop_filter_info
+{   /* NOTE(review): presumably rows of the like-named loop_filter_info_n tables -- confirm */
+    const unsigned char * mblim;
+    const unsigned char * blim;
+    const unsigned char * lim;
+    const unsigned char * hev_thr;
+} loop_filter_info;
+
+
+typedef void loop_filter_uvfunction
+(
+    unsigned char *u,   /* source pointer */
+    int p,              /* pitch */
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
+    unsigned char *v
+);
+
+/* assorted loopfilter functions which get used elsewhere */
+struct VP8Common;
+struct macroblockd;
+struct modeinfo;
+
+void vp8_loop_filter_init(struct VP8Common *cm);
+
+void vp8_loop_filter_frame_init(struct VP8Common *cm,
+                                struct macroblockd *mbd,
+                                int default_filt_lvl);
+
+void vp8_loop_filter_frame(struct VP8Common *cm, struct macroblockd *mbd,
+                           int frame_type);
+
+void vp8_loop_filter_partial_frame(struct VP8Common *cm,
+                                   struct macroblockd *mbd,
+                                   int default_filt_lvl);
+
+void vp8_loop_filter_frame_yonly(struct VP8Common *cm,
+                                 struct macroblockd *mbd,
+                                 int default_filt_lvl);
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl);
+
+void vp8_loop_filter_row_normal(struct VP8Common *cm,
+                                struct modeinfo *mode_info_context,
+                                int mb_row, int post_ystride, int post_uvstride,
+                                unsigned char *y_ptr, unsigned char *u_ptr,
+                                unsigned char *v_ptr);
+
+void vp8_loop_filter_row_simple(struct VP8Common *cm,
+                                struct modeinfo *mode_info_context,
+                                int mb_row, int post_ystride, int post_uvstride,
+                                unsigned char *y_ptr, unsigned char *u_ptr,
+                                unsigned char *v_ptr);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_LOOPFILTER_H_
diff --git a/libs/libvpx/vp8/common/loopfilter_filters.c b/libs/libvpx/vp8/common/loopfilter_filters.c
new file mode 100644
index 0000000000..1d51696ff7
--- /dev/null
+++ b/libs/libvpx/vp8/common/loopfilter_filters.c
@@ -0,0 +1,430 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef unsigned char uc;
+
+static signed char vp8_signed_char_clamp(int t)
+{   /* Saturate t to the signed 8-bit range [-128, 127]. */
+    if (t < -128) return -128;
+    if (t > 127) return 127;
+    return (signed char) t;
+}
+
+
+/* Edge-filter enable mask: -1 (all bits set) to filter, 0 to leave as is */
+static signed char vp8_filter_mask(uc limit, uc blimit,
+                            uc p3, uc p2, uc p1, uc p0,
+                            uc q0, uc q1, uc q2, uc q3)
+{
+    /* Filtering is vetoed when any neighbouring-pixel step on either side
+     * exceeds limit, or when the weighted step across the edge itself
+     * exceeds blimit.
+     */
+    const int vetoed =
+        abs(p3 - p2) > limit || abs(p2 - p1) > limit || abs(p1 - p0) > limit ||
+        abs(q1 - q0) > limit || abs(q2 - q1) > limit || abs(q3 - q2) > limit ||
+        abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
+    return (signed char)(vetoed ? 0 : -1);
+}
+
+/* High-edge-variance flag: -1 (all bits set) if |p1 - p0| or |q1 - q0| exceeds thresh, else 0 */
+static signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
+{
+    const int p_step = abs(p1 - p0);
+    const int q_step = abs(q1 - q0);
+
+    return (signed char)((p_step > thresh || q_step > thresh) ? -1 : 0);
+}
+
+static void vp8_filter(signed char mask, uc hev, uc *op1,
+        uc *op0, uc *oq0, uc *oq1)
+        /* Normal edge filter: rewrites *op1, *op0, *oq0 and *oq1 in place. */
+{
+    signed char ps0, qs0;
+    signed char ps1, qs1;
+    signed char filter_value, Filter1, Filter2;
+    signed char u;
+
+    ps1 = (signed char) * op1 ^ 0x80;
+    ps0 = (signed char) * op0 ^ 0x80;
+    qs0 = (signed char) * oq0 ^ 0x80;
+    qs1 = (signed char) * oq1 ^ 0x80;
+
+    /* add outer taps if we have high edge variance */
+    filter_value = vp8_signed_char_clamp(ps1 - qs1);
+    filter_value &= hev;
+
+    /* inner taps */
+    filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
+    filter_value &= mask;
+
+    /* Filter1 = (filter_value + 4) >> 3 is subtracted from q0 and
+     * Filter2 = (filter_value + 3) >> 3 is added to p0. Saturation aside,
+     * the two differ only when the low 3 bits of filter_value equal 4.
+     */
+    Filter1 = vp8_signed_char_clamp(filter_value + 4);
+    Filter2 = vp8_signed_char_clamp(filter_value + 3);
+    Filter1 >>= 3;
+    Filter2 >>= 3;
+    u = vp8_signed_char_clamp(qs0 - Filter1);
+    *oq0 = u ^ 0x80;
+    u = vp8_signed_char_clamp(ps0 + Filter2);
+    *op0 = u ^ 0x80;
+    filter_value = Filter1;
+
+    /* outer pixels move by (Filter1 + 1) >> 1, and only where hev is clear */
+    filter_value += 1;
+    filter_value >>= 1;
+    filter_value &= ~hev;
+
+    u = vp8_signed_char_clamp(qs1 - filter_value);
+    *oq1 = u ^ 0x80;
+    u = vp8_signed_char_clamp(ps1 + filter_value);
+    *op1 = u ^ 0x80;
+
+}
+void vp8_loop_filter_horizontal_edge_c
+(
+    unsigned char *s,
+    int p, /* pitch */
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
+    int count
+)
+{
+    /* Normal filter across a horizontal edge lying between rows s - p and s.
+     * Works column by column over count * 8 columns; the first column is
+     * always processed. Only element [0] of blimit/limit/thresh is used.
+     */
+    const int width = count * 8;
+    int col = 0;
+
+    do
+    {
+        unsigned char *const above = s - p;
+        const signed char flat_mask =
+            vp8_filter_mask(limit[0], blimit[0],
+                            s[-4*p], s[-3*p], above[-p], above[0],
+                            s[0], s[p], s[2*p], s[3*p]);
+        const signed char edge_var =
+            vp8_hevmask(thresh[0], above[-p], above[0], s[0], s[p]);
+        vp8_filter(flat_mask, edge_var, above - p, above, s, s + p);
+        ++s;
+    }
+    while (++col < width);
+}
+
+void vp8_loop_filter_vertical_edge_c
+(
+    unsigned char *s,
+    int p,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
+    int count
+)
+{
+    /* Normal filter across a vertical edge lying between s[-1] and s[0].
+     * One row is handled per iteration, stepping s by the pitch p, for
+     * count * 8 rows; the first row is always processed. Only element [0]
+     * of blimit/limit/thresh is used.
+     */
+    const int height = count * 8;
+    int row = 0;
+
+    do
+    {
+        const signed char flat_mask =
+            vp8_filter_mask(limit[0], blimit[0],
+                            s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
+        const signed char edge_var =
+            vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+        vp8_filter(flat_mask, edge_var, s - 2, s - 1, s, s + 1);
+        s += p;
+    }
+    while (++row < height);
+}
+
+static void vp8_mbfilter(signed char mask, uc hev,
+                           uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
+{   /* Wide edge filter: rewrites all six pixels *op2..*oq2 in place. */
+    signed char s, u;
+    signed char filter_value, Filter1, Filter2;
+    signed char ps2 = (signed char) * op2 ^ 0x80;
+    signed char ps1 = (signed char) * op1 ^ 0x80;
+    signed char ps0 = (signed char) * op0 ^ 0x80;
+    signed char qs0 = (signed char) * oq0 ^ 0x80;
+    signed char qs1 = (signed char) * oq1 ^ 0x80;
+    signed char qs2 = (signed char) * oq2 ^ 0x80;
+
+    /* outer taps are always included here; hev is applied further down */
+    filter_value = vp8_signed_char_clamp(ps1 - qs1);
+    filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
+    filter_value &= mask;
+
+    Filter2 = filter_value;
+    Filter2 &= hev;
+
+    /* where hev is set: q0 -= (f + 4) >> 3 and p0 += (f + 3) >> 3 */
+    Filter1 = vp8_signed_char_clamp(Filter2 + 4);
+    Filter2 = vp8_signed_char_clamp(Filter2 + 3);
+    Filter1 >>= 3;
+    Filter2 >>= 3;
+    qs0 = vp8_signed_char_clamp(qs0 - Filter1);
+    ps0 = vp8_signed_char_clamp(ps0 + Filter2);
+
+
+    /* only apply wider filter if not high edge variance */
+    filter_value &= ~hev;
+    Filter2 = filter_value;
+
+    /* roughly 3/7th difference across boundary */
+    u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
+    s = vp8_signed_char_clamp(qs0 - u);
+    *oq0 = s ^ 0x80;
+    s = vp8_signed_char_clamp(ps0 + u);
+    *op0 = s ^ 0x80;
+
+    /* roughly 2/7th difference across boundary */
+    u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
+    s = vp8_signed_char_clamp(qs1 - u);
+    *oq1 = s ^ 0x80;
+    s = vp8_signed_char_clamp(ps1 + u);
+    *op1 = s ^ 0x80;
+
+    /* roughly 1/7th difference across boundary */
+    u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
+    s = vp8_signed_char_clamp(qs2 - u);
+    *oq2 = s ^ 0x80;
+    s = vp8_signed_char_clamp(ps2 + u);
+    *op2 = s ^ 0x80;
+}
+
+void vp8_mbloop_filter_horizontal_edge_c
+(
+    unsigned char *s,
+    int p,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
+    int count
+)
+{
+    /* Wide (vp8_mbfilter) filtering across a horizontal edge lying between
+     * rows s - p and s; up to three pixels on each side may change.
+     * count * 8 columns are processed, the first one unconditionally.
+     */
+    const int width = count * 8;
+    int col = 0;
+
+    do
+    {
+        unsigned char *const p0 = s - p;
+        const signed char flat_mask =
+            vp8_filter_mask(limit[0], blimit[0],
+                            s[-4*p], s[-3*p], s[-2*p], p0[0],
+                            s[0], s[1*p], s[2*p], s[3*p]);
+        const signed char edge_var =
+            vp8_hevmask(thresh[0], s[-2*p], p0[0], s[0], s[1*p]);
+
+        vp8_mbfilter(flat_mask, edge_var,
+                     p0 - 2 * p, p0 - p, p0, s, s + p, s + 2 * p);
+        ++s;
+    }
+    while (++col < width);
+}
+
+
+void vp8_mbloop_filter_vertical_edge_c
+(
+    unsigned char *s,
+    int p,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
+    int count
+)
+{
+    /* Wide (vp8_mbfilter) filtering across a vertical edge lying between
+     * s[-1] and s[0]; count * 8 rows, the first one unconditionally.
+     */
+    const int height = count * 8;
+    int row = 0;
+
+    do
+    {
+        const signed char flat_mask =
+            vp8_filter_mask(limit[0], blimit[0],
+                            s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
+        const signed char edge_var =
+            vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+        vp8_mbfilter(flat_mask, edge_var, s - 3, s - 2, s - 1, s, s + 1, s + 2);
+        s += p;
+    }
+    while (++row < height);
+}
+
+/* Simple-filter enable mask: -1 (all bits set) to filter, 0 to skip */
+static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
+{
+    /* weighted step across the edge: twice the inner difference plus half
+     * the outer difference
+     */
+    const int edge_step = abs(p0 - q0) * 2 + abs(p1 - q1) / 2;
+
+    return (signed char)(edge_step <= blimit ? -1 : 0);
+}
+
+static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
+{
+    /* Simple edge filter: rewrites only *op0 and *oq0; *op1 and *oq1 are
+     * read as outer taps. Pixels are XORed with 0x80 on the way in and
+     * again on the way out.
+     */
+    const signed char outer_p = (signed char) * op1 ^ 0x80;
+    const signed char inner_p = (signed char) * op0 ^ 0x80;
+    const signed char inner_q = (signed char) * oq0 ^ 0x80;
+    const signed char outer_q = (signed char) * oq1 ^ 0x80;
+    signed char delta, step_q, step_p;
+
+    delta = vp8_signed_char_clamp(outer_p - outer_q);
+    delta = vp8_signed_char_clamp(delta + 3 * (inner_q - inner_p));
+    delta &= mask;
+
+    /* q side rounds with +4, p side with +3; both are then shifted down by 3 */
+    step_q = vp8_signed_char_clamp(delta + 4) >> 3;
+    step_p = vp8_signed_char_clamp(delta + 3) >> 3;
+
+    /* q0 is stored first, then p0 */
+    *oq0 = vp8_signed_char_clamp(inner_q - step_q) ^ 0x80;
+    *op0 = vp8_signed_char_clamp(inner_p + step_p) ^ 0x80;
+}
+
+void vp8_loop_filter_simple_horizontal_edge_c
+(
+    unsigned char *s,
+    int p,
+    const unsigned char *blimit
+)
+{
+    /* Simple filter over 16 columns of the horizontal edge that lies
+     * between rows s - p and s; only blimit[0] is used. */
+    int col;
+
+    for (col = 0; col < 16; ++col, ++s)
+    {
+        const signed char enable =
+            vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+        vp8_simple_filter(enable, s - 2 * p, s - 1 * p, s, s + 1 * p);
+    }
+}
+
+void vp8_loop_filter_simple_vertical_edge_c
+(
+    unsigned char *s,
+    int p,
+    const unsigned char *blimit
+)
+{
+    /* Simple filter over 16 rows of the vertical edge that lies between
+     * s[-1] and s[0]; s advances by the pitch p per row and only
+     * blimit[0] is used. */
+    int row;
+
+    for (row = 0; row < 16; ++row, s += p)
+    {
+        const signed char enable =
+            vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
+        vp8_simple_filter(enable, s - 2, s - 1, s, s + 1);
+    }
+}
+
+/* Horizontal MB filtering: luma edge at y_ptr, plus each chroma edge whose pointer is non-NULL */
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi)
+{
+    const unsigned char *const mblim = lfi->mblim;
+    const unsigned char *const lim = lfi->lim;
+    const unsigned char *const hev_thr = lfi->hev_thr;
+
+    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, mblim, lim, hev_thr, 2);
+    if (u_ptr) vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, mblim, lim, hev_thr, 1);
+    if (v_ptr) vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, mblim, lim, hev_thr, 1);
+}
+
+/* Vertical MB filtering: luma edge at y_ptr, plus each chroma edge whose pointer is non-NULL */
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi)
+{
+    const unsigned char *const mblim = lfi->mblim;
+    const unsigned char *const lim = lfi->lim;
+    const unsigned char *const hev_thr = lfi->hev_thr;
+
+    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, mblim, lim, hev_thr, 2);
+    if (u_ptr) vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, mblim, lim, hev_thr, 1);
+    if (v_ptr) vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, mblim, lim, hev_thr, 1);
+}
+
+/* Horizontal B filtering: luma edges at rows 4, 8 and 12, then the row-4 edge of each non-NULL chroma plane */
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi)
+{
+    int row;
+
+    for (row = 4; row <= 12; row += 4)
+        vp8_loop_filter_horizontal_edge_c(y_ptr + row * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+    if (v_ptr)
+        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit)
+{   /* simple filter on the luma edges at rows 4, 8 and 12 */
+    int row;
+    for (row = 4; row <= 12; row += 4)
+        vp8_loop_filter_simple_horizontal_edge_c(y_ptr + row * y_stride, y_stride, blimit);
+}
+
+/* Vertical B filtering: luma edges at columns 4, 8 and 12, then the column-4 edge of each non-NULL chroma plane */
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi)
+{
+    int col;
+
+    for (col = 4; col <= 12; col += 4)
+        vp8_loop_filter_vertical_edge_c(y_ptr + col, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+    if (v_ptr)
+        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit)
+{   /* simple filter on the luma edges at columns 4, 8 and 12 */
+    int col;
+    for (col = 4; col <= 12; col += 4)
+        vp8_loop_filter_simple_vertical_edge_c(y_ptr + col, y_stride, blimit);
+}
diff --git a/libs/libvpx/vp8/common/mbpitch.c b/libs/libvpx/vp8/common/mbpitch.c
new file mode 100644
index 0000000000..32e1b66409
--- /dev/null
+++ b/libs/libvpx/vp8/common/mbpitch.c
@@ -0,0 +1,68 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+
+void vp8_setup_block_dptrs(MACROBLOCKD *x)
+{
+    /* Points every entry of x->block at its own slice of the shared
+     * predictor, qcoeff, dqcoeff and eobs storage held in x.
+     */
+    int b;
+
+    /* blocks 0..15: 4x4 tiles laid out on a 16-wide grid that starts at
+     * the beginning of x->predictor
+     */
+    for (b = 0; b < 16; b++)
+    {
+        const int r = b >> 2;
+        const int c = b & 3;
+
+        x->block[b].predictor = x->predictor + r * 4 * 16 + c * 4;
+    }
+
+    /* blocks 16..19 and 20..23: 4x4 tiles on two 8-wide grids that start
+     * at predictor offsets 256 and 320 respectively
+     */
+    for (b = 0; b < 4; b++)
+    {
+        const int tile = (b >> 1) * 4 * 8 + (b & 1) * 4;
+
+        x->block[16 + b].predictor = x->predictor + 256 + tile;
+        x->block[20 + b].predictor = x->predictor + 320 + tile;
+    }
+
+    /* all 25 blocks: 16 coefficients and one eobs entry apiece */
+    for (b = 0; b < 25; b++)
+    {
+        x->block[b].qcoeff  = x->qcoeff  + b * 16;
+        x->block[b].dqcoeff = x->dqcoeff + b * 16;
+        x->block[b].eob     = x->eobs + b;
+    }
+}
+
+void vp8_build_block_doffsets(MACROBLOCKD *x)
+{
+    /* Stores in each block's .offset the position of its 4x4 tile, computed
+     * from x->dst.y_stride (blocks 0..15) or x->dst.uv_stride (16..23).
+     */
+    int b;
+
+    for (b = 0; b < 16; b++) /* y blocks */
+    {
+        x->block[b].offset =
+            (b >> 2) * 4 * x->dst.y_stride + (b & 3) * 4;
+    }
+
+    for (b = 0; b < 4; b++) /* U blocks 16..19 and V blocks 20..23 */
+        x->block[20 + b].offset = x->block[16 + b].offset =
+            (b >> 1) * 4 * x->dst.uv_stride + (b & 1) * 4;
+}
diff --git a/libs/libvpx/vp8/common/mfqe.c b/libs/libvpx/vp8/common/mfqe.c
new file mode 100644
index 0000000000..2bfefb126d
--- /dev/null
+++ b/libs/libvpx/vp8/common/mfqe.c
@@ -0,0 +1,386 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* MFQE: Multiframe Quality Enhancement
+ * In rate limited situations keyframes may cause significant visual artifacts
+ * commonly referred to as "popping." This file implements a postprocessing
+ * algorithm which blends data from the preceding frame when there is no
+ * motion and the q from the previous frame is lower which indicates that it is
+ * higher quality.
+ */
+
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp8/common/postproc.h"
+#include "vpx_dsp/variance.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+
+static void filter_by_weight(unsigned char *src, int src_stride,
+                             unsigned char *dst, int dst_stride,
+                             int block_size, int src_weight)
+{
+    /* In-place weighted blend of a block_size x block_size area of dst with
+     * src; the two weights sum to 1 << MFQE_PRECISION and `half` rounds.
+     */
+    const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+    const int half = 1 << (MFQE_PRECISION - 1);
+    int rows_left;
+
+    for (rows_left = block_size; rows_left > 0; --rows_left)
+    {
+        int x;
+        for (x = 0; x < block_size; x++)
+            dst[x] = (src[x] * src_weight + dst[x] * dst_weight + half) >> MFQE_PRECISION;
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride,
+                                 unsigned char *dst, int dst_stride, int src_weight)
+{
+    enum { kBlockEdge = 16 }; /* edge length of the square block, in pixels */
+    filter_by_weight(src, src_stride, dst, dst_stride, kBlockEdge, src_weight);
+}
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride,
+                               unsigned char *dst, int dst_stride, int src_weight)
+{
+    enum { kBlockEdge = 8 }; /* edge length of the square block, in pixels */
+    filter_by_weight(src, src_stride, dst, dst_stride, kBlockEdge, src_weight);
+}
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride,
+                               unsigned char *dst, int dst_stride, int src_weight)
+{
+    enum { kBlockEdge = 4 }; /* edge length of the square block, in pixels */
+    filter_by_weight(src, src_stride, dst, dst_stride, kBlockEdge, src_weight);
+}
+
+static void apply_ifactor(unsigned char *y_src,
+                          int y_src_stride,
+                          unsigned char *y_dst,
+                          int y_dst_stride,
+                          unsigned char *u_src,
+                          unsigned char *v_src,
+                          int uv_src_stride,
+                          unsigned char *u_dst,
+                          unsigned char *v_dst,
+                          int uv_dst_stride,
+                          int block_size,
+                          int src_weight)
+{
+    /* block_size 16 blends 16x16 luma with 8x8 chroma; every other value is
+     * handled as 8x8 luma with 4x4 chroma. Order is always Y, then U, then V. */
+    if (block_size != 16)
+    {
+        vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+        vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+        vp8_filter_by_weight4x4(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+        return;
+    }
+    vp8_filter_by_weight16x16(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+    vp8_filter_by_weight8x8(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+    vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+}
+
+static unsigned int int_sqrt(unsigned int x)
+{
+    /* Returns sqrt(x) rounded to the nearest integer. The floor root is
+     * built bit by bit from the top; each candidate is tested by division
+     * (trial <= x / trial) because trial * trial can wrap for large x.
+     */
+    unsigned int y = x;
+    unsigned int guess = 0;
+    int p = 1;
+    while (y >>= 1) p++;   /* p = bit length of x (1 when x is 0) */
+    p >>= 1;               /* highest bit the root can possibly have */
+    for (; p >= 0; p--)
+    {
+        const unsigned int trial = guess | (1u << p);
+        if (trial <= x / trial) guess = trial;   /* trial is never 0 */
+    }
+    /* round up when x >= guess^2 + guess + 1, i.e. x > guess * (guess + 1) */
+    return guess + (guess * (guess + 1) < x);
+}
+
+#define USE_SSD
+static void multiframe_quality_enhance_block
+(
+    int blksize, /* Currently only values supported are 16, 8 */
+    int qcurr,
+    int qprev,
+    unsigned char *y,     /* source planes (read only here) */
+    unsigned char *u,
+    unsigned char *v,
+    int y_stride,
+    int uv_stride,
+    unsigned char *yd,    /* destination planes (blended into or overwritten) */
+    unsigned char *ud,
+    unsigned char *vd,
+    int yd_stride,
+    int uvd_stride
+)
+{   /* Blends one block of (y,u,v) into (yd,ud,vd), or copies it unblended. */
+    static const unsigned char VP8_ZEROS[16]=
+    {
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+    };
+    int uvblksize = blksize >> 1;   /* chroma edge is half the luma edge */
+    int qdiff = qcurr - qprev;
+
+    int i;
+    unsigned char *up;
+    unsigned char *udp;
+    unsigned char *vp;
+    unsigned char *vdp;
+
+    unsigned int act, actd, sad, usad, vsad, sse, thr, thrsq, actrisk;
+
+    if (blksize == 16)
+    {   /* actd/act: presumably each block's own variance (all-zero reference, stride 0) - confirm */
+        actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+#ifdef USE_SSD
+        vpx_variance16x16(y, y_stride, yd, yd_stride, &sse);
+        sad = (sse + 128)>>8;
+        vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
+        usad = (sse + 32)>>6;
+        vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
+        vsad = (sse + 32)>>6;
+#else
+        sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+        usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6;
+        vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride)+ 32) >> 6;
+#endif
+    }
+    else /* if (blksize == 8) */
+    {
+        actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+#ifdef USE_SSD
+        vpx_variance8x8(y, y_stride, yd, yd_stride, &sse);
+        sad = (sse + 32)>>6;
+        vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
+        usad = (sse + 8)>>4;
+        vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
+        vsad = (sse + 8)>>4;
+#else
+        sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
+        usad = (vpx_sad4x4(u, uv_stride, ud, uvd_stride) + 8) >> 4;
+        vsad = (vpx_sad4x4(v, uv_stride, vd, uvd_stride) + 8) >> 4;
+#endif
+    }
+
+    actrisk = (actd > act * 5);
+
+    /* thr = qdiff/16 + log2(actd) + log4(qprev); the loops consume actd and qprev */
+    thr = (qdiff >> 4);
+    while (actd >>= 1) thr++;
+    while (qprev >>= 2) thr++;
+
+#ifdef USE_SSD
+    thrsq = thr * thr;
+    if (sad < thrsq &&
+        /* additional checks for color mismatch and excessive addition of
+         * high-frequencies */
+        4 * usad < thrsq && 4 * vsad < thrsq && !actrisk)
+#else
+    if (sad < thr &&
+        /* additional checks for color mismatch and excessive addition of
+         * high-frequencies */
+        2 * usad < thr && 2 * vsad < thr && !actrisk)
+#endif
+    {
+        int ifactor;
+#ifdef USE_SSD
+        /* TODO: optimize this later to not need sqr root */
+        sad = int_sqrt(sad);
+#endif
+        ifactor = (sad << MFQE_PRECISION) / thr;
+        ifactor >>= (qdiff >> 5);   /* NOTE(review): assumes qdiff >= 0 (negative shift count otherwise) - confirm callers */
+
+        if (ifactor)
+        {
+            apply_ifactor(y, y_stride, yd, yd_stride,
+                          u, v, uv_stride,
+                          ud, vd, uvd_stride,
+                          blksize, ifactor);
+        }
+    }
+    else  /* else copy the (y,u,v) block into (yd,ud,vd) without blending */
+    {
+        if (blksize == 16)
+        {
+            vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
+            vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
+            vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
+        }
+        else  /* if (blksize == 8) */
+        {
+            vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
+            for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride)
+                memcpy(udp, up, uvblksize);
+            for (vp = v, vdp = vd, i = 0; i < uvblksize; ++i, vp += uv_stride, vdp += uvd_stride)
+                memcpy(vdp, vp, uvblksize);
+        }
+    }
+}
+
+static int qualify_inter_mb(const MODE_INFO *mode_info_context, int *map)
+{   /* Sets map[0..3] (one flag per 8x8 quadrant, 1 = qualifies) and returns their sum. */
+    if (mode_info_context->mbmi.mb_skip_coeff)
+        map[0] = map[1] = map[2] = map[3] = 1;
+    else if (mode_info_context->mbmi.mode==SPLITMV)
+    {
+        static const int ndx[4][4] =   /* bmi[] indices of the four 4x4 blocks in each quadrant */
+        {
+            {0, 1, 4, 5},
+            {2, 3, 6, 7},
+            {8, 9, 12, 13},
+            {10, 11, 14, 15}
+        };
+        int i, j;
+        for (i=0; i<4; ++i)
+        {
+            map[i] = 1;   /* qualifies until one of its sub-blocks shows motion */
+            for (j=0; j<4 && map[i]; ++j)   /* test map[i]: map[j] belongs to another quadrant and may be unset */
+                map[i] &= (abs(mode_info_context->bmi[ndx[i][j]].mv.as_mv.row) <= 2 &&   /* magnitude, as in the */
+                           abs(mode_info_context->bmi[ndx[i][j]].mv.as_mv.col) <= 2);    /* non-split branch below */
+        }
+    }
+    else
+    {
+        map[0] = map[1] = map[2] = map[3] =
+            (mode_info_context->mbmi.mode > B_PRED &&
+             abs(mode_info_context->mbmi.mv.as_mv.row) <= 2 &&
+             abs(mode_info_context->mbmi.mv.as_mv.col) <= 2);
+    }
+    return (map[0]+map[1]+map[2]+map[3]);
+}
+
+void vp8_multiframe_quality_enhance
+(
+    VP8_COMMON *cm
+)
+{   /* Walks every macroblock of cm->frame_to_show and blends or copies it into cm->post_proc_buffer. */
+    YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+    YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+
+    FRAME_TYPE frame_type = cm->frame_type;
+    /* Point at base of Mb MODE_INFO list has motion vectors etc */
+    const MODE_INFO *mode_info_context = cm->show_frame_mi;
+    int mb_row;
+    int mb_col;
+    int totmap, map[4];   /* map[] is only written by qualify_inter_mb() */
+    int qcurr = cm->base_qindex;
+    int qprev = cm->postproc_state.last_base_qindex;
+
+    unsigned char *y_ptr, *u_ptr, *v_ptr;
+    unsigned char *yd_ptr, *ud_ptr, *vd_ptr;
+
+    /* Set up the buffer pointers */
+    y_ptr = show->y_buffer;
+    u_ptr = show->u_buffer;
+    v_ptr = show->v_buffer;
+    yd_ptr = dest->y_buffer;
+    ud_ptr = dest->u_buffer;
+    vd_ptr = dest->v_buffer;
+
+    /* postprocess each macro block */
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+        {
+            /* if motion is high there will likely be no benefit */
+            if (frame_type == INTER_FRAME) totmap = qualify_inter_mb(mode_info_context, map);
+            else totmap = (frame_type == KEY_FRAME ? 4 : 0);
+            if (totmap)
+            {
+                if (totmap < 4)   /* only reachable via qualify_inter_mb(), so map[] is set */
+                {
+                    int i, j;
+                    for (i=0; i<2; ++i)
+                        for (j=0; j<2; ++j)
+                        {
+                            if (map[i*2+j])
+                            {
+                                multiframe_quality_enhance_block(8, qcurr, qprev,
+                                                                 y_ptr + 8*(i*show->y_stride+j),
+                                                                 u_ptr + 4*(i*show->uv_stride+j),
+                                                                 v_ptr + 4*(i*show->uv_stride+j),
+                                                                 show->y_stride,
+                                                                 show->uv_stride,
+                                                                 yd_ptr + 8*(i*dest->y_stride+j),
+                                                                 ud_ptr + 4*(i*dest->uv_stride+j),
+                                                                 vd_ptr + 4*(i*dest->uv_stride+j),
+                                                                 dest->y_stride,
+                                                                 dest->uv_stride);
+                            }
+                            else
+                            {
+                                /* quadrant not qualified: copy its 8x8 luma and 4x4 chroma unblended */
+                                int k;
+                                unsigned char *up = u_ptr + 4*(i*show->uv_stride+j);
+                                unsigned char *udp = ud_ptr + 4*(i*dest->uv_stride+j);
+                                unsigned char *vp = v_ptr + 4*(i*show->uv_stride+j);
+                                unsigned char *vdp = vd_ptr + 4*(i*dest->uv_stride+j);
+                                vp8_copy_mem8x8(y_ptr + 8*(i*show->y_stride+j), show->y_stride,
+                                                yd_ptr + 8*(i*dest->y_stride+j), dest->y_stride);
+                                for (k = 0; k < 4; ++k, up += show->uv_stride, udp += dest->uv_stride,
+                                                        vp += show->uv_stride, vdp += dest->uv_stride)
+                                {
+                                    memcpy(udp, up, 4);
+                                    memcpy(vdp, vp, 4);
+                                }
+                            }
+                        }
+                }
+                else /* totmap = 4 */
+                {
+                    multiframe_quality_enhance_block(16, qcurr, qprev, y_ptr,
+                                                     u_ptr, v_ptr,
+                                                     show->y_stride,
+                                                     show->uv_stride,
+                                                     yd_ptr, ud_ptr, vd_ptr,
+                                                     dest->y_stride,
+                                                     dest->uv_stride);
+                }
+            }
+            else
+            {   /* nothing qualifies: copy the whole macroblock unblended */
+                vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
+                vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
+                vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
+            }
+            y_ptr += 16;
+            u_ptr += 8;
+            v_ptr += 8;
+            yd_ptr += 16;
+            ud_ptr += 8;
+            vd_ptr += 8;
+            mode_info_context++;     /* step to next MB */
+        }
+        /* rewind to the first column and step down one macroblock row */
+        y_ptr += show->y_stride  * 16 - 16 * cm->mb_cols;
+        u_ptr += show->uv_stride *  8 - 8 * cm->mb_cols;
+        v_ptr += show->uv_stride *  8 - 8 * cm->mb_cols;
+        yd_ptr += dest->y_stride  * 16 - 16 * cm->mb_cols;
+        ud_ptr += dest->uv_stride *  8 - 8 * cm->mb_cols;
+        vd_ptr += dest->uv_stride *  8 - 8 * cm->mb_cols;
+
+        mode_info_context++;         /* Skip border mb */
+    }
+}
diff --git a/libs/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
new file mode 100644
index 0000000000..fc3bb8ad9d
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
+void vp8_dequant_idct_add_dspr2(short *input, short *dq,
+                                unsigned char *dest, int stride)
+{
+    /* Scales the 16 values of input in place by dq, hands input to the
+     * DSPr2 inverse transform (dest/stride are passed for both pointer and
+     * stride pairs), then zeroes the first 32 bytes of input. */
+    short *coef = input;
+    const short *const coef_end = input + 16;
+
+    for (; coef != coef_end; ++coef, ++dq)
+        *coef *= *dq;
+
+    vp8_short_idct4x4llm_dspr2(input, dest, stride, dest, stride);
+    memset(input, 0, 32);
+}
+
+#endif
diff --git a/libs/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
new file mode 100644
index 0000000000..ace5d400cb
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
@@ -0,0 +1,2823 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 256  /* number of guard entries on each side of the 0..255 part of ff_cropTbl */
+unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];  /* clamp table, filled by dsputil_static_init() */
+/* One row per offset; looks like the magnitudes of the taps below packed one per byte (0x7b0c = 123, 12) - confirm. */
+static const unsigned short sub_pel_filterss[8][3] =
+{
+    {      0,      0,      0},
+    {      0, 0x0601, 0x7b0c},
+    { 0x0201, 0x0b08, 0x6c24},
+    {      0, 0x0906, 0x5d32},
+    { 0x0303, 0x1010, 0x4d4d},
+    {      0, 0x0609, 0x325d},
+    { 0x0102, 0x080b, 0x246c},
+    {      0, 0x0106, 0x0c7b},
+};
+/* Six signed 16-bit taps t0..t5 per offset, two per word with the earlier tap
+ * in the high half: (t0:t1), (t2:t3), (t4:t5).  Row 2 is 2, -11, 108, 36, -8, 1. */
+static const int sub_pel_filters_int[8][3] =
+{
+    {          0,          0,          0},
+    { 0x0000fffa, 0x007b000c, 0xffff0000},
+    { 0x0002fff5, 0x006c0024, 0xfff80001},
+    { 0x0000fff7, 0x005d0032, 0xfffa0000},
+    { 0x0003fff0, 0x004d004d, 0xfff00003},
+    { 0x0000fffa, 0x0032005d, 0xfff70000},
+    { 0x0001fff8, 0x0024006c, 0xfff50002},
+    { 0x0000ffff, 0x000c007b, 0xfffa0000},
+};
+/* Same taps as sub_pel_filters_int with the halves of every word swapped (earlier tap in the low half).
+ * Only rows 2, 4 and 6 have a non-zero high half in [2]; the "vector3b > 65536" tests below rely on that. */
+static const int sub_pel_filters_inv[8][3] =
+{
+    {          0,          0,          0},
+    { 0xfffa0000, 0x000c007b, 0x0000ffff},
+    { 0xfff50002, 0x0024006c, 0x0001fff8},
+    { 0xfff70000, 0x0032005d, 0x0000fffa},
+    { 0xfff00003, 0x004d004d, 0x0003fff0},
+    { 0xfffa0000, 0x005d0032, 0x0000fff7},
+    { 0xfff80001, 0x006c0024, 0x0002fff5},
+    { 0xffff0000, 0x007b000c, 0x0000fffa},
+};
+/* The four middle taps t1..t4 of the odd rows above, packed (t1:t2), (t3:t4)
+ * with the earlier tap in the high half; even rows are left zero. */
+static const int sub_pel_filters_int_tap_4[8][2] =
+{
+    {          0,          0},
+    { 0xfffa007b, 0x000cffff},
+    {          0,          0},
+    { 0xfff7005d, 0x0032fffa},
+    {          0,          0},
+    { 0xfffa0032, 0x005dfff7},
+    {          0,          0},
+    { 0xffff000c, 0x007bfffa},
+};
+/* Same values as sub_pel_filters_int_tap_4 with the halves of every word
+ * swapped (earlier tap in the low half). */
+static const int sub_pel_filters_inv_tap_4[8][2] =
+{
+    {          0,          0},
+    { 0x007bfffa, 0xffff000c},
+    {          0,          0},
+    { 0x005dfff7, 0xfffa0032},
+    {          0,          0},
+    { 0x0032fffa, 0xfff7005d},
+    {          0,          0},
+    { 0x000cffff, 0xfffa007b},
+};
+
+/* Hint the cache to fetch the line at src for reading (MIPS "pref", hint 0).
+ * Declared static: a plain C99 "inline" definition provides no out-of-line
+ * copy, so any call the compiler chose not to inline would fail to link. */
+static inline void prefetch_load(unsigned char *src)
+{
+    __asm__ __volatile__ ("pref   0,  0(%[src])   \n\t"
+                          : : [src] "r" (src));
+}
+
+
+/* Hint the cache to prepare the line at dst for writing (MIPS "pref", hint 1).
+ * Declared static: a plain C99 "inline" definition provides no out-of-line
+ * copy, so any call the compiler chose not to inline would fail to link. */
+static inline void prefetch_store(unsigned char *dst)
+{
+    __asm__ __volatile__ ("pref   1,  0(%[dst])   \n\t"
+                          : : [dst] "r" (dst));
+}
+
+/* Populate ff_cropTbl so that ff_cropTbl[CROP_WIDTH + v] is v clamped to 0..255. */
+void dsputil_static_init(void)
+{
+    unsigned char *const centre = ff_cropTbl + CROP_WIDTH;
+    int v;
+    for (v = 0; v < CROP_WIDTH; v++)
+    {
+        ff_cropTbl[v] = 0;      /* entries below the 0..255 range give 0 */
+        centre[256 + v] = 255;  /* entries above the 0..255 range give 255 */
+    }
+    for (v = 255; v >= 0; v--) centre[v] = v;  /* in-range values map to themselves */
+}
+
+void vp8_filter_block2d_first_pass_4  /* horizontal filter pass producing 4 pixels per row */
+(
+    unsigned char *RESTRICT src_ptr,   /* current source row; loads touch src_ptr[-2] .. src_ptr[6] */
+    unsigned char *RESTRICT dst_ptr,   /* output; bytes 0..3 are written for each row */
+    unsigned int src_pixels_per_line,  /* added to src_ptr after each row */
+    unsigned int output_height,        /* number of rows processed */
+    int xoffset,                       /* row of the sub_pel_filters_inv* tables to use */
+    int pitch                          /* added to dst_ptr after each filtered row (the copy path adds 4 instead) */
+)
+{
+    unsigned int i;
+    int Temp1, Temp2, Temp3, Temp4;
+
+    unsigned int vector4a = 64;  /* loaded into the accumulator before each dot product; presumably the rounding term - confirm */
+    int vector1b, vector2b, vector3b;
+    unsigned int tp1, tp2, tn1, tn2;
+    unsigned int p1, p2, p3;
+    unsigned int n1, n2, n3;
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;  /* clamp table, indexed with the raw filter result */
+
+    vector3b = sub_pel_filters_inv[xoffset][2];
+
+    /* only row 0 of the table has a zero here: xoffset == 0 needs no filtering, the row is copied */
+    if (vector3b == 0)
+    {
+        for (i = 0; i < output_height; i++)
+        {
+            /* prefetch the next source row */
+            prefetch_load(src_ptr + src_pixels_per_line);
+            dst_ptr[0] = src_ptr[0];
+            dst_ptr[1] = src_ptr[1];
+            dst_ptr[2] = src_ptr[2];
+            dst_ptr[3] = src_ptr[3];
+
+            /* next row... */
+            src_ptr += src_pixels_per_line;
+            dst_ptr += 4;
+        }
+    }
+    else
+    {
+        if (vector3b > 65536)  /* true for table rows 2, 4 and 6, whose last word has a non-zero high half */
+        {
+            /* 6 tap filter */
+
+            vector1b = sub_pel_filters_inv[xoffset][0];
+            vector2b = sub_pel_filters_inv[xoffset][1];
+
+            /* prefetch the next source row; issued once here, not inside the loop */
+            prefetch_load(src_ptr + src_pixels_per_line);
+
+            for (i = output_height; i--;)
+            {
+                /* apply filter with vectors pairs; extp takes its bit position from DSPControl, which is not set here - presumably by the caller (confirm) */
+                __asm__ __volatile__ (
+                    "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
+                    "ulw              %[tp2],      2(%[src_ptr])                  \n\t"
+
+                    /* even 1. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+                    "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+                    "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
+
+                    /* even 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
+                    "balign           %[tp2],      %[tp1],         3              \n\t"
+                    "extp             %[Temp1],    $ac3,           9              \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"
+
+                    /* odd 1. pixel */
+                    "ulw              %[tn2],      3(%[src_ptr])                  \n\t"
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
+                    "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
+                    "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
+                    "extp             %[Temp3],    $ac2,           9              \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
+
+                    /* odd 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
+                    "extp             %[Temp2],    $ac3,           9              \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
+                    "extp             %[Temp4],    $ac2,           9              \n\t"
+
+                    /* clamp: look each result up in cm */
+                    "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
+                    "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
+                    "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
+                    "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
+
+                    /* store bytes */
+                    "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
+                    "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
+                    "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
+                    "sb               %[n2],       3(%[dst_ptr])                  \n\t"
+
+                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
+                      [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
+                      [p3] "=&r" (p3), [n1] "=&r" (n1), [n2] "=&r" (n2),
+                      [n3] "=&r" (n3), [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
+                      [vector3b] "r" (vector3b), [src_ptr] "r" (src_ptr)
+                );  /* NOTE(review): stores through dst_ptr and uses $ac2/$ac3 without any clobber list - confirm this is safe */
+
+                /* Next row... */
+                src_ptr += src_pixels_per_line;
+                dst_ptr += pitch;
+            }
+        }
+        else
+        {
+            /* 4 tap filter */
+
+            vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+            vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+            for (i = output_height; i--;)
+            {
+                /* apply filter with vectors pairs; extp takes its bit position from DSPControl, which is not set here - presumably by the caller (confirm) */
+                __asm__ __volatile__ (
+                    "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"
+                    "ulw              %[tp2],      3(%[src_ptr])                  \n\t"
+
+                    /* even 1. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+                    "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+                    "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+
+                    /* even 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
+                    "extp             %[Temp1],    $ac3,           9              \n\t"
+
+                    /* odd 1. pixel */
+                    "srl              %[tn1],      %[tp2],         8              \n\t"
+                    "balign           %[tp2],      %[tp1],         3              \n\t"
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
+                    "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
+                    "preceu.ph.qbr    %[n3],       %[tn1]                         \n\t"
+                    "extp             %[Temp3],    $ac2,           9              \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
+
+                    /* odd 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "extp             %[Temp2],    $ac3,           9              \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
+                    "extp             %[Temp4],    $ac2,           9              \n\t"
+
+                    /* clamp (lookup in cm) and store results */
+                    "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
+                    "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
+                    "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
+                    "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
+                    "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
+                    "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
+                    "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
+                    "sb               %[n2],       3(%[dst_ptr])                  \n\t"
+
+                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
+                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
+                      [src_ptr] "r" (src_ptr)
+                );  /* NOTE(review): stores through dst_ptr and uses $ac2/$ac3 without any clobber list - confirm this is safe */
+                /*  Next row... */
+                src_ptr += src_pixels_per_line;
+                dst_ptr += pitch;
+            }
+        }
+    }
+}
+
+void vp8_filter_block2d_first_pass_8_all  /* horizontal filter pass producing 8 pixels per row */
+(
+    unsigned char *RESTRICT src_ptr,   /* current source row; loads touch src_ptr[-2] .. src_ptr[11] */
+    unsigned char *RESTRICT dst_ptr,   /* output; bytes 0..7 are written for each row */
+    unsigned int src_pixels_per_line,  /* added to src_ptr after each row */
+    unsigned int output_height,        /* number of rows processed */
+    int xoffset,                       /* 0 = plain copy, otherwise the row of the sub_pel_filters_inv* tables to use */
+    int pitch                          /* added to dst_ptr after each filtered row (the copy path adds 8 instead) */
+)
+{
+    unsigned int i;
+    int Temp1, Temp2, Temp3, Temp4;
+
+    unsigned int vector4a = 64;  /* loaded into the accumulator before each dot product; presumably the rounding term - confirm */
+    unsigned int vector1b, vector2b, vector3b;
+    unsigned int tp1, tp2, tn1, tn2;
+    unsigned int p1, p2, p3, p4;
+    unsigned int n1, n2, n3, n4;
+
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;  /* clamp table, indexed with the raw filter result */
+
+    /* if (xoffset == 0) we don't need any filtering */
+    if (xoffset == 0)
+    {
+        for (i = 0; i < output_height; i++)
+        {
+            /* prefetch the next source row */
+            prefetch_load(src_ptr + src_pixels_per_line);
+
+            dst_ptr[0] = src_ptr[0];
+            dst_ptr[1] = src_ptr[1];
+            dst_ptr[2] = src_ptr[2];
+            dst_ptr[3] = src_ptr[3];
+            dst_ptr[4] = src_ptr[4];
+            dst_ptr[5] = src_ptr[5];
+            dst_ptr[6] = src_ptr[6];
+            dst_ptr[7] = src_ptr[7];
+
+            /* next row... */
+            src_ptr += src_pixels_per_line;
+            dst_ptr += 8;
+        }
+    }
+    else
+    {
+        vector3b = sub_pel_filters_inv[xoffset][2];
+
+        if (vector3b > 65536)  /* true for table rows 2, 4 and 6, whose last word has a non-zero high half */
+        {
+            /* 6 tap filter */
+
+            vector1b = sub_pel_filters_inv[xoffset][0];
+            vector2b = sub_pel_filters_inv[xoffset][1];
+
+            for (i = output_height; i--;)
+            {
+                /* prefetch the next source row */
+                prefetch_load(src_ptr + src_pixels_per_line);
+
+                /* apply filter with vectors pairs; extp takes its bit position from DSPControl, which is not set here - presumably by the caller (confirm) */
+                __asm__ __volatile__ (
+                    "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
+                    "ulw              %[tp2],      2(%[src_ptr])                  \n\t"
+
+                    /* even 1. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+                    "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+                    "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
+
+                    /* even 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"
+
+                    "balign           %[tp2],      %[tp1],         3              \n\t"
+                    "extp             %[Temp1],    $ac3,           9              \n\t"
+                    "ulw              %[tn2],      3(%[src_ptr])                  \n\t"
+
+                    /* odd 1. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
+                    "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
+                    "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
+                    "extp             %[Temp3],    $ac2,           9              \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
+
+                    /* odd 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
+                    "ulw              %[tp1],      6(%[src_ptr])                  \n\t"
+                    "extp             %[Temp2],    $ac3,           9              \n\t"
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
+                    "extp             %[Temp4],    $ac2,           9              \n\t"
+
+                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
+                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                      [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
+                      [src_ptr] "r" (src_ptr)
+                );
+
+                /* clamp and store results */
+                dst_ptr[0] = cm[Temp1];
+                dst_ptr[1] = cm[Temp2];
+                dst_ptr[2] = cm[Temp3];
+                dst_ptr[3] = cm[Temp4];
+
+                /* next 4 pixels; $ac3 was loaded by the last mtlo of the previous asm statement, so this relies on $ac3 surviving the C stores in between (NOTE(review): confirm) */
+                __asm__ __volatile__ (
+                    /* even 3. pixel */
+                    "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],          %[vector3b]    \n\t"
+
+                    /* even 4. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbl    %[p4],       %[tp1]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[p1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
+
+                    "ulw              %[tn1],      7(%[src_ptr])                  \n\t"
+                    "extp             %[Temp1],    $ac3,           9              \n\t"
+
+                    /* odd 3. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
+                    "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],          %[vector3b]    \n\t"
+                    "extp             %[Temp3],    $ac2,           9              \n\t"
+
+                    /* odd 4. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbl    %[n4],       %[tn1]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[n1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
+                    "extp             %[Temp2],    $ac3,           9              \n\t"
+                    "extp             %[Temp4],    $ac2,           9              \n\t"
+
+                    : [tn1] "=&r" (tn1), [n2] "=&r" (n2),
+                      [p4] "=&r" (p4), [n4] "=&r" (n4),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+                    : [tp1] "r" (tp1), [vector1b] "r" (vector1b), [p2] "r" (p2),
+                      [vector2b] "r" (vector2b), [n1] "r" (n1), [p1] "r" (p1),
+                      [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
+                      [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
+                );
+
+                /* clamp and store results */
+                dst_ptr[4] = cm[Temp1];
+                dst_ptr[5] = cm[Temp2];
+                dst_ptr[6] = cm[Temp3];
+                dst_ptr[7] = cm[Temp4];
+
+                src_ptr += src_pixels_per_line;
+                dst_ptr += pitch;
+            }
+        }
+        else
+        {
+            /* 4 tap filter */
+
+            vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+            vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+            for (i = output_height; i--;)
+            {
+                /* prefetch the next source row */
+                prefetch_load(src_ptr + src_pixels_per_line);
+
+                /* apply filter with vectors pairs; extp takes its bit position from DSPControl, which is not set here - presumably by the caller (confirm) */
+                __asm__ __volatile__ (
+                    "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"
+
+                    /* even 1. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+                    "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+
+                    "ulw              %[tp2],      3(%[src_ptr])                  \n\t"
+
+                    /* even 2. pixel  */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+                    "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
+                    "extp             %[Temp1],    $ac3,           9              \n\t"
+
+                    "balign           %[tp2],      %[tp1],         3              \n\t"
+
+                    /* odd 1. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
+                    "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
+                    "extp             %[Temp3],    $ac2,           9              \n\t"
+
+                    "ulw              %[tn2],      4(%[src_ptr])                  \n\t"
+
+                    /* odd 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
+                    "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
+                    "ulw              %[tp1],      7(%[src_ptr])                  \n\t"
+                    "extp             %[Temp2],    $ac3,           9              \n\t"
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "extp             %[Temp4],    $ac2,           9              \n\t"
+
+                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+                      [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
+                      [p3] "=&r" (p3), [p4] "=&r" (p4), [n1] "=&r" (n1),
+                      [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+                );
+
+                /* clamp and store results */
+                dst_ptr[0] = cm[Temp1];
+                dst_ptr[1] = cm[Temp2];
+                dst_ptr[2] = cm[Temp3];
+                dst_ptr[3] = cm[Temp4];
+
+                /* next 4 pixels; $ac3 was loaded by the last mtlo of the previous asm statement, so this relies on $ac3 surviving the C stores in between (NOTE(review): confirm) */
+                __asm__ __volatile__ (
+                    /* even 3. pixel */
+                    "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
+
+                    /* even 4. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
+                    "extp             %[Temp1],    $ac3,           9              \n\t"
+
+                    /* odd 3. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n4],          %[vector2b]    \n\t"
+                    "ulw              %[tn1],      8(%[src_ptr])                  \n\t"
+                    "extp             %[Temp3],    $ac2,           9              \n\t"
+
+                    /* odd 4. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[n4],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
+                    "extp             %[Temp2],    $ac3,           9              \n\t"
+                    "extp             %[Temp4],    $ac2,           9              \n\t"
+
+                    : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+                    : [tp1] "r" (tp1), [p3] "r" (p3), [p4] "r" (p4),
+                      [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr),
+                      [n3] "r" (n3), [n4] "r" (n4)
+                );
+
+                /* clamp and store results */
+                dst_ptr[4] = cm[Temp1];
+                dst_ptr[5] = cm[Temp2];
+                dst_ptr[6] = cm[Temp3];
+                dst_ptr[7] = cm[Temp4];
+
+                /* next row... */
+                src_ptr += src_pixels_per_line;
+                dst_ptr += pitch;
+            }
+        }
+    }
+}
+
+
+void vp8_filter_block2d_first_pass16_6tap
+(
+    unsigned char *RESTRICT src_ptr,
+    unsigned char *RESTRICT dst_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_height,
+    int xoffset,
+    int pitch
+)
+{
+    unsigned int i;
+    int Temp1, Temp2, Temp3, Temp4;
+
+    unsigned int vector4a;
+    unsigned int vector1b, vector2b, vector3b;
+    unsigned int tp1, tp2, tn1, tn2;
+    unsigned int p1, p2, p3, p4;
+    unsigned int n1, n2, n3, n4;
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    vector1b = sub_pel_filters_inv[xoffset][0];
+    vector2b = sub_pel_filters_inv[xoffset][1];
+    vector3b = sub_pel_filters_inv[xoffset][2];
+    vector4a = 64;
+
+    for (i = output_height; i--;)
+    {
+        /* prefetch src_ptr data to cache memory */
+        prefetch_load(src_ptr + src_pixels_per_line);
+
+        /* apply filter with vectors pairs */
+        __asm__ __volatile__ (
+            "ulw                %[tp1],      -2(%[src_ptr])                 \n\t"
+            "ulw                %[tp2],      2(%[src_ptr])                  \n\t"
+
+            /* even 1. pixel */
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[p1],       %[tp1]                         \n\t"
+            "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
+            "preceu.ph.qbr      %[p3],       %[tp2]                         \n\t"
+            "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p2],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p3],           %[vector3b]   \n\t"
+
+            /* even 2. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[p1],       %[tp2]                         \n\t"
+            "dpa.w.ph           $ac2,        %[p2],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p3],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p1],           %[vector3b]   \n\t"
+
+            "balign             %[tp2],      %[tp1],          3             \n\t"
+            "ulw                %[tn2],      3(%[src_ptr])                  \n\t"
+            "extp               %[Temp1],    $ac3,            9             \n\t"
+
+            /* odd 1. pixel */
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[n1],       %[tp2]                         \n\t"
+            "preceu.ph.qbl      %[n2],       %[tp2]                         \n\t"
+            "preceu.ph.qbr      %[n3],       %[tn2]                         \n\t"
+            "extp               %[Temp3],    $ac2,            9             \n\t"
+            "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n2],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n3],           %[vector3b]   \n\t"
+
+            /* odd 2. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[n1],       %[tn2]                         \n\t"
+            "dpa.w.ph           $ac2,        %[n2],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n3],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n1],           %[vector3b]   \n\t"
+            "ulw                %[tp1],      6(%[src_ptr])                  \n\t"
+            "extp               %[Temp2],    $ac3,            9             \n\t"
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[p2],       %[tp1]                         \n\t"
+            "extp               %[Temp4],    $ac2,            9             \n\t"
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
+              [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+              [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
+              [src_ptr] "r" (src_ptr)
+        );
+
+        /* clamp and store results */
+        dst_ptr[0] = cm[Temp1];
+        dst_ptr[1] = cm[Temp2];
+        dst_ptr[2] = cm[Temp3];
+        dst_ptr[3] = cm[Temp4];
+
+        /* next 4 pixels */
+        __asm__ __volatile__ (
+            /* even 3. pixel */
+            "dpa.w.ph           $ac3,        %[p3],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p1],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p2],           %[vector3b]   \n\t"
+
+            /* even 4. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[p4],       %[tp1]                         \n\t"
+            "dpa.w.ph           $ac2,        %[p1],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p2],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p4],           %[vector3b]   \n\t"
+            "ulw                %[tn1],      7(%[src_ptr])                  \n\t"
+            "extp               %[Temp1],    $ac3,            9             \n\t"
+
+            /* odd 3. pixel */
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[n2],       %[tn1]                         \n\t"
+            "dpa.w.ph           $ac3,        %[n3],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n1],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n2],           %[vector3b]   \n\t"
+            "extp               %[Temp3],    $ac2,            9             \n\t"
+
+            /* odd 4. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[n4],       %[tn1]                         \n\t"
+            "dpa.w.ph           $ac2,        %[n1],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n2],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n4],           %[vector3b]   \n\t"
+            "ulw                %[tp2],      10(%[src_ptr])                 \n\t"
+            "extp               %[Temp2],    $ac3,            9             \n\t"
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[p1],       %[tp2]                         \n\t"
+            "extp               %[Temp4],    $ac2,            9             \n\t"
+
+            : [tn1] "=&r" (tn1), [tp2] "=&r" (tp2), [n2] "=&r" (n2),
+              [p4] "=&r" (p4), [n4] "=&r" (n4),
+              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+              [tp1] "r" (tp1), [n1] "r" (n1), [p1] "r" (p1),
+              [vector4a] "r" (vector4a), [p2] "r" (p2), [vector3b] "r" (vector3b),
+              [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
+        );
+
+        /* clamp and store results */
+        dst_ptr[4] = cm[Temp1];
+        dst_ptr[5] = cm[Temp2];
+        dst_ptr[6] = cm[Temp3];
+        dst_ptr[7] = cm[Temp4];
+
+        /* next 4 pixels */
+        __asm__ __volatile__ (
+            /* even 5. pixel */
+            "dpa.w.ph           $ac3,        %[p2],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p4],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p1],           %[vector3b]   \n\t"
+
+            /* even 6. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[p3],       %[tp2]                         \n\t"
+            "dpa.w.ph           $ac2,        %[p4],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p1],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p3],           %[vector3b]   \n\t"
+
+            "ulw                %[tn1],      11(%[src_ptr])                 \n\t"
+            "extp               %[Temp1],    $ac3,            9             \n\t"
+
+            /* odd 5. pixel */
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[n1],       %[tn1]                         \n\t"
+            "dpa.w.ph           $ac3,        %[n2],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n4],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n1],           %[vector3b]   \n\t"
+            "extp               %[Temp3],    $ac2,            9             \n\t"
+
+            /* odd 6. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[n3],       %[tn1]                         \n\t"
+            "dpa.w.ph           $ac2,        %[n4],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n1],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n3],           %[vector3b]   \n\t"
+            "ulw                %[tp1],      14(%[src_ptr])                 \n\t"
+            "extp               %[Temp2],    $ac3,            9             \n\t"
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[p4],       %[tp1]                         \n\t"
+            "extp               %[Temp4],    $ac2,            9             \n\t"
+
+            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
+              [n1] "=&r" (n1), [p3] "=&r" (p3), [n3] "=&r" (n3),
+              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+              [tp2] "r" (tp2), [p2] "r" (p2), [n2] "r" (n2),
+              [p4] "r" (p4), [n4] "r" (n4), [p1] "r" (p1), [src_ptr] "r" (src_ptr),
+              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b)
+        );
+
+        /* clamp and store results */
+        dst_ptr[8] = cm[Temp1];
+        dst_ptr[9] = cm[Temp2];
+        dst_ptr[10] = cm[Temp3];
+        dst_ptr[11] = cm[Temp4];
+
+        /* next 4 pixels */
+        __asm__ __volatile__ (
+            /* even 7. pixel */
+            "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p3],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p4],           %[vector3b]   \n\t"
+
+            /* even 8. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
+            "dpa.w.ph           $ac2,        %[p3],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p4],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p2],           %[vector3b]   \n\t"
+            "ulw                %[tn1],      15(%[src_ptr])                 \n\t"
+            "extp               %[Temp1],    $ac3,            9             \n\t"
+
+            /* odd 7. pixel */
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[n4],       %[tn1]                         \n\t"
+            "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n3],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n4],           %[vector3b]   \n\t"
+            "extp               %[Temp3],    $ac2,            9             \n\t"
+
+            /* odd 8. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[n2],       %[tn1]                         \n\t"
+            "dpa.w.ph           $ac2,        %[n3],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n4],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n2],           %[vector3b]   \n\t"
+            "extp               %[Temp2],    $ac3,            9             \n\t"
+            "extp               %[Temp4],    $ac2,            9             \n\t"
+
+            /* clamp and store results */
+            "lbux               %[tp1],      %[Temp1](%[cm])                \n\t"
+            "lbux               %[tn1],      %[Temp2](%[cm])                \n\t"
+            "lbux               %[p2],       %[Temp3](%[cm])                \n\t"
+            "sb                 %[tp1],      12(%[dst_ptr])                 \n\t"
+            "sb                 %[tn1],      13(%[dst_ptr])                 \n\t"
+            "lbux               %[n2],       %[Temp4](%[cm])                \n\t"
+            "sb                 %[p2],       14(%[dst_ptr])                 \n\t"
+            "sb                 %[n2],       15(%[dst_ptr])                 \n\t"
+
+            : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2), [n4] "=&r" (n4),
+              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+              [tp1] "r" (tp1), [p4] "r" (p4), [n1] "r" (n1), [p1] "r" (p1),
+              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b), [p3] "r" (p3),
+              [n3] "r" (n3), [src_ptr] "r" (src_ptr),
+              [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+        );
+
+        src_ptr += src_pixels_per_line;
+        dst_ptr += pitch;
+    }
+}
+
+
+void vp8_filter_block2d_first_pass16_0      /* unfiltered copy of 21 rows of 16 bytes from src_ptr to output_ptr */
+(
+    unsigned char *RESTRICT src_ptr,        /* first source row; read with unaligned loads (ulw) */
+    unsigned char *RESTRICT output_ptr,     /* destination, rows packed 16 bytes apart; written with sw, so presumably word aligned */
+    unsigned int src_pixels_per_line        /* source stride in bytes */
+)
+{
+    int Temp1, Temp2, Temp3, Temp4;
+    int i;
+
+    /* prefetch hint for the destination buffer (output_ptr + 32), not for src_ptr */
+    prefetch_store(output_ptr + 32);
+
+    /* copy 7 * 3 = 21 rows; each asm block moves one 16-byte row and steps src_ptr by one line */
+    for (i = 0; i < 7; i++)
+    {
+        __asm__ __volatile__ (              /* row 0 of the group -> output_ptr[0..15] */
+            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
+            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
+            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
+            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
+            "sw     %[Temp1],   0(%[output_ptr])                            \n\t"
+            "sw     %[Temp2],   4(%[output_ptr])                            \n\t"
+            "sw     %[Temp3],   8(%[output_ptr])                            \n\t"
+            "sw     %[Temp4],   12(%[output_ptr])                           \n\t"
+            "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
+
+            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
+            : [src_pixels_per_line] "r" (src_pixels_per_line), [output_ptr] "r" (output_ptr)
+            : "memory"                      /* the sw stores write memory the operand list does not describe */
+        );
+
+        __asm__ __volatile__ (              /* row 1 of the group -> output_ptr[16..31] */
+            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
+            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
+            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
+            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
+            "sw     %[Temp1],   16(%[output_ptr])                           \n\t"
+            "sw     %[Temp2],   20(%[output_ptr])                           \n\t"
+            "sw     %[Temp3],   24(%[output_ptr])                           \n\t"
+            "sw     %[Temp4],   28(%[output_ptr])                           \n\t"
+            "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
+
+            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
+            : [src_pixels_per_line] "r" (src_pixels_per_line), [output_ptr] "r" (output_ptr)
+            : "memory"                      /* the sw stores write memory the operand list does not describe */
+        );
+
+        __asm__ __volatile__ (              /* row 2 of the group -> output_ptr[32..47] */
+            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
+            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
+            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
+            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
+            "sw     %[Temp1],   32(%[output_ptr])                           \n\t"
+            "sw     %[Temp2],   36(%[output_ptr])                           \n\t"
+            "sw     %[Temp3],   40(%[output_ptr])                           \n\t"
+            "sw     %[Temp4],   44(%[output_ptr])                           \n\t"
+            "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
+
+            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
+            : [src_pixels_per_line] "r" (src_pixels_per_line), [output_ptr] "r" (output_ptr)
+            : "memory"                      /* the sw stores write memory the operand list does not describe */
+        );
+
+        output_ptr += 48;                   /* three 16-byte rows were written this iteration */
+    }
+}
+
+
+void vp8_filter_block2d_first_pass16_4tap   /* horizontal 4-tap filter over rows of 16 pixels */
+(
+    unsigned char *RESTRICT src_ptr,        /* first source row; read with unaligned loads (ulw) */
+    unsigned char *RESTRICT output_ptr,     /* intermediate buffer, written only when yoffset != 0 */
+    unsigned int src_pixels_per_line,       /* source stride in bytes */
+    unsigned int output_width,              /* row stride of output_ptr in bytes */
+    unsigned int output_height,             /* rows to filter (5 fewer are produced when yoffset == 0) */
+    int xoffset,                            /* row index into sub_pel_filters_inv_tap_4 */
+    int yoffset,                            /* 0 selects direct output to dst_ptr */
+    unsigned char *RESTRICT dst_ptr,        /* final destination, written only when yoffset == 0 */
+    int pitch                               /* row stride of dst_ptr in bytes */
+)
+{
+    unsigned int i, j;
+    int Temp1, Temp2, Temp3, Temp4;
+
+    unsigned int vector4a;
+    int vector1b, vector2b;
+    unsigned int tp1, tp2, tp3, tn1;
+    unsigned int p1, p2, p3;
+    unsigned int n1, n2, n3;
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+    /* rounding term loaded into each accumulator before the "extr.w ..., 7" shift */
+    vector4a = 64;
+    /* coefficient words for the selected xoffset, consumed by dpa.w.ph */
+    vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+    vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+    /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */
+    if (yoffset == 0)
+    {
+        output_height -= 5;                 /* presumably the extra rows only a vertical pass needs - TODO confirm */
+        src_ptr += (src_pixels_per_line + src_pixels_per_line);
+
+        for (i = output_height; i--;)
+        {
+            __asm__ __volatile__ (          /* preload bytes src_ptr[-1..2]; later words are carried in tp3 */
+                "ulw     %[tp3],   -1(%[src_ptr])               \n\t"
+                : [tp3] "=&r" (tp3)
+                : [src_ptr] "r" (src_ptr)
+            );
+
+            /* processing 4 adjacent pixels */
+            for (j = 0; j < 16; j += 4)
+            {
+                /* apply filter with vector pairs; NOTE(review): $ac2/$ac3 are overwritten but not declared as clobbers */
+                __asm__ __volatile__ (
+                    "ulw              %[tp2],      3(%[src_ptr])                    \n\t"
+                    "move             %[tp1],      %[tp3]                           \n\t"
+
+                    /* even 1. pixel */
+                    "mtlo             %[vector4a], $ac3                             \n\t"
+                    "mthi             $0,          $ac3                             \n\t"
+                    "move             %[tp3],      %[tp2]                           \n\t"
+                    "preceu.ph.qbr    %[p1],       %[tp1]                           \n\t"
+                    "preceu.ph.qbl    %[p2],       %[tp1]                           \n\t"
+                    "preceu.ph.qbr    %[p3],       %[tp2]                           \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],           %[vector2b]     \n\t"
+
+                    /* even 2. pixel */
+                    "mtlo             %[vector4a], $ac2                             \n\t"
+                    "mthi             $0,          $ac2                             \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac2,        %[p3],           %[vector2b]     \n\t"
+                    "extr.w           %[Temp1],    $ac3,            7               \n\t"
+
+                    /* odd 1. pixel */
+                    "ulw              %[tn1],      4(%[src_ptr])                    \n\t"
+                    "balign           %[tp2],      %[tp1],          3               \n\t"
+                    "mtlo             %[vector4a], $ac3                             \n\t"
+                    "mthi             $0,          $ac3                             \n\t"
+                    "preceu.ph.qbr    %[n1],       %[tp2]                           \n\t"
+                    "preceu.ph.qbl    %[n2],       %[tp2]                           \n\t"
+                    "preceu.ph.qbr    %[n3],       %[tn1]                           \n\t"
+                    "extr.w           %[Temp3],    $ac2,            7               \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],           %[vector2b]     \n\t"
+
+                    /* odd 2. pixel */
+                    "mtlo             %[vector4a], $ac2                             \n\t"
+                    "mthi             $0,          $ac2                             \n\t"
+                    "extr.w           %[Temp2],    $ac3,            7               \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac2,        %[n3],           %[vector2b]     \n\t"
+                    "extr.w           %[Temp4],    $ac2,            7               \n\t"
+
+                    /* clamp through cm[] and store the 4 results at columns j..j+3 of the row */
+                    "lbux             %[tp1],      %[Temp1](%[cm])                  \n\t"
+                    "lbux             %[tn1],      %[Temp2](%[cm])                  \n\t"
+                    "lbux             %[tp2],      %[Temp3](%[cm])                  \n\t"
+                    "sb               %[tp1],      0(%[dst_ptr])                    \n\t"
+                    "sb               %[tn1],      1(%[dst_ptr])                    \n\t"
+                    "lbux             %[n2],       %[Temp4](%[cm])                  \n\t"
+                    "sb               %[tp2],      2(%[dst_ptr])                    \n\t"
+                    "sb               %[n2],       3(%[dst_ptr])                    \n\t"
+
+                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "+&r" (tp3), /* tp3 is read before it is rewritten */
+                      [tn1] "=&r" (tn1), [p1] "=&r" (p1), [p2] "=&r" (p2),
+                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [p3] "=&r" (p3),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), [vector4a] "r" (vector4a),
+                      [cm] "r" (cm), [dst_ptr] "r" (dst_ptr + j), [src_ptr] "r" (src_ptr)
+                    : "memory"              /* the sb stores write memory the operand list does not describe */
+                );
+
+                src_ptr += 4;
+            }
+
+            /* Next row... (the inner loop already advanced src_ptr by 16) */
+            src_ptr += src_pixels_per_line - 16;
+            dst_ptr += pitch;
+        }
+    }
+    else
+    {
+        for (i = output_height; i--;)
+        {
+            /* processing 4 adjacent pixels */
+            for (j = 0; j < 16; j += 4)
+            {
+                /* apply filter with vector pairs; NOTE(review): $ac2/$ac3 are overwritten but not declared as clobbers */
+                __asm__ __volatile__ (
+                    "ulw              %[tp1],      -1(%[src_ptr])                   \n\t"
+                    "ulw              %[tp2],      3(%[src_ptr])                    \n\t"
+
+                    /* even 1. pixel */
+                    "mtlo             %[vector4a], $ac3                             \n\t"
+                    "mthi             $0,          $ac3                             \n\t"
+                    "preceu.ph.qbr    %[p1],       %[tp1]                           \n\t"
+                    "preceu.ph.qbl    %[p2],       %[tp1]                           \n\t"
+                    "preceu.ph.qbr    %[p3],       %[tp2]                           \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],           %[vector2b]     \n\t"
+
+                    /* even 2. pixel */
+                    "mtlo             %[vector4a], $ac2                             \n\t"
+                    "mthi             $0,          $ac2                             \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac2,        %[p3],           %[vector2b]     \n\t"
+                    "extr.w           %[Temp1],    $ac3,            7               \n\t"
+
+                    /* odd 1. pixel */
+                    "ulw              %[tn1],      4(%[src_ptr])                    \n\t"
+                    "balign           %[tp2],      %[tp1],          3               \n\t"
+                    "mtlo             %[vector4a], $ac3                             \n\t"
+                    "mthi             $0,          $ac3                             \n\t"
+                    "preceu.ph.qbr    %[n1],       %[tp2]                           \n\t"
+                    "preceu.ph.qbl    %[n2],       %[tp2]                           \n\t"
+                    "preceu.ph.qbr    %[n3],       %[tn1]                           \n\t"
+                    "extr.w           %[Temp3],    $ac2,            7               \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],           %[vector2b]     \n\t"
+
+                    /* odd 2. pixel */
+                    "mtlo             %[vector4a], $ac2                             \n\t"
+                    "mthi             $0,          $ac2                             \n\t"
+                    "extr.w           %[Temp2],    $ac3,            7               \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac2,        %[n3],           %[vector2b]     \n\t"
+                    "extr.w           %[Temp4],    $ac2,            7               \n\t"
+
+                    /* clamp through cm[] and store the 4 results at columns j..j+3 of the row */
+                    "lbux             %[tp1],      %[Temp1](%[cm])                  \n\t"
+                    "lbux             %[tn1],      %[Temp2](%[cm])                  \n\t"
+                    "lbux             %[tp2],      %[Temp3](%[cm])                  \n\t"
+                    "sb               %[tp1],      0(%[output_ptr])                 \n\t"
+                    "sb               %[tn1],      1(%[output_ptr])                 \n\t"
+                    "lbux             %[n2],       %[Temp4](%[cm])                  \n\t"
+                    "sb               %[tp2],      2(%[output_ptr])                 \n\t"
+                    "sb               %[n2],       3(%[output_ptr])                 \n\t"
+
+                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
+                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), [vector4a] "r" (vector4a),
+                      [cm] "r" (cm), [output_ptr] "r" (output_ptr + j), [src_ptr] "r" (src_ptr)
+                    : "memory"              /* the sb stores write memory the operand list does not describe */
+                );
+
+                src_ptr += 4;
+            }
+
+            /* next row... (the inner loop already advanced src_ptr by 16) */
+            src_ptr += src_pixels_per_line - 16;
+            output_ptr += output_width;
+        }
+    }
+}
+
+
+void vp8_filter_block2d_second_pass4
+(
+    unsigned char *RESTRICT src_ptr,
+    unsigned char *RESTRICT output_ptr,
+    int output_pitch,
+    int yoffset
+)
+{
+    unsigned int i;
+
+    int Temp1, Temp2, Temp3, Temp4;
+    unsigned int vector1b, vector2b, vector3b, vector4a;
+
+    unsigned char src_ptr_l2;
+    unsigned char src_ptr_l1;
+    unsigned char src_ptr_0;
+    unsigned char src_ptr_r1;
+    unsigned char src_ptr_r2;
+    unsigned char src_ptr_r3;
+
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    vector4a = 64;
+
+    /* load filter coefficients */
+    vector1b = sub_pel_filterss[yoffset][0];
+    vector2b = sub_pel_filterss[yoffset][2];
+    vector3b = sub_pel_filterss[yoffset][1];
+
+    if (vector1b)
+    {
+        /* 6 tap filter */
+
+        for (i = 2; i--;)
+        {
+            /* prefetch src_ptr data to cache memory */
+            prefetch_load(src_ptr);
+
+            /* do not allow compiler to reorder instructions */
+            __asm__ __volatile__ (
+                ".set noreorder                                                 \n\t"
+                :
+                :
+            );
+
+            /* apply filter with vectors pairs */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l2],  -8(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r3],  12(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -7(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r3],  13(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -6(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  14(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -5(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  15(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp3],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp4],       $ac1,           9               \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+                  [src_ptr] "r" (src_ptr)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+
+            output_ptr += output_pitch;
+
+            /* apply filter with vector pairs */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l2],  -4(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  16(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -3(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  17(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -2(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  18(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -1(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  19(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp3],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp4],       $ac1,           9               \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+                  [src_ptr] "r" (src_ptr)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+
+            src_ptr += 8;
+            output_ptr += output_pitch;
+        }
+    }
+    else
+    {
+        /* 4 tap filter */
+
+        /* prefetch src_ptr data to cache memory */
+        prefetch_load(src_ptr);
+
+        for (i = 2; i--;)
+        {
+            /* do not allow the assembler to reorder instructions */
+            __asm__ __volatile__ (
+                ".set noreorder                                                 \n\t"
+                :
+                :
+            );
+
+            /* apply filter with vector pairs */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp3],       $ac0,           9               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp4],       $ac1,           9               \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+
+            output_ptr += output_pitch;
+
+            /* apply filter with vector pairs */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp3],       $ac0,           9               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp4],       $ac1,           9               \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+
+            src_ptr += 8;
+            output_ptr += output_pitch;
+        }
+    }
+}
+
+
+void vp8_filter_block2d_second_pass_8
+(
+    unsigned char *RESTRICT src_ptr,
+    unsigned char *RESTRICT output_ptr,
+    int output_pitch,
+    unsigned int output_height,
+    unsigned int output_width,
+    unsigned int yoffset
+)
+{
+    unsigned int i;
+
+    int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
+    unsigned int vector1b, vector2b, vector3b, vector4a;
+
+    unsigned char src_ptr_l2;
+    unsigned char src_ptr_l1;
+    unsigned char src_ptr_0;
+    unsigned char src_ptr_r1;
+    unsigned char src_ptr_r2;
+    unsigned char src_ptr_r3;
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    vector4a = 64;
+
+    vector1b = sub_pel_filterss[yoffset][0];
+    vector2b = sub_pel_filterss[yoffset][2];
+    vector3b = sub_pel_filterss[yoffset][1];
+
+    if (vector1b)
+    {
+        /* 6 tap filter */
+
+        /* prefetch src_ptr data to cache memory */
+        prefetch_load(src_ptr);
+
+        for (i = output_height; i--;)
+        {
+            /* apply filter with vector pairs */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l2],  -16(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  24(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -15(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  25(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -14(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  18(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  26(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -13(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  19(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  27(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp3],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+                  [src_ptr] "r" (src_ptr)
+            );
+
+            /* apply filter with vector pairs */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l2],  -12(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  12(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  20(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  28(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp4],       $ac1,           9               \n\t"
+
+                "lbu            %[src_ptr_l2],  -11(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  13(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  21(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  29(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp5],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -10(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  14(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  22(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  30(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp6],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -9(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  15(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  23(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  31(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp7],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp8],       $ac1,           9               \n\t"
+
+                : [Temp4] "=&r" (Temp4), [Temp5] "=&r" (Temp5),
+                  [Temp6] "=&r" (Temp6), [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+                  [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
+                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+                  [src_ptr] "r" (src_ptr)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+            output_ptr[4] = cm[Temp5];
+            output_ptr[5] = cm[Temp6];
+            output_ptr[6] = cm[Temp7];
+            output_ptr[7] = cm[Temp8];
+
+            src_ptr += 8;
+            output_ptr += output_pitch;
+        }
+    }
+    else
+    {
+        /* 4 tap filter */
+
+        /* prefetch src_ptr data to cache memory */
+        prefetch_load(src_ptr);
+
+        for (i = output_height; i--;)
+        {
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                : [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+            );
+
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                : [Temp1] "=r" (Temp1),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+            );
+
+            src_ptr_l1 = src_ptr[-6];
+            src_ptr_0  = src_ptr[2];
+            src_ptr_r1 = src_ptr[10];
+            src_ptr_r2 = src_ptr[18];
+
+            __asm__ __volatile__ (
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                : [Temp2] "=r" (Temp2)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+                  [vector4a] "r" (vector4a)
+            );
+
+            src_ptr_l1 = src_ptr[-5];
+            src_ptr_0  = src_ptr[3];
+            src_ptr_r1 = src_ptr[11];
+            src_ptr_r2 = src_ptr[19];
+
+            __asm__ __volatile__ (
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp3],       $ac0,           9               \n\t"
+
+                : [Temp3] "=r" (Temp3)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+                  [vector4a] "r" (vector4a)
+            );
+
+            src_ptr_l1 = src_ptr[-4];
+            src_ptr_0  = src_ptr[4];
+            src_ptr_r1 = src_ptr[12];
+            src_ptr_r2 = src_ptr[20];
+
+            __asm__ __volatile__ (
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp4],       $ac1,           9               \n\t"
+
+                : [Temp4] "=r" (Temp4)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+                  [vector4a] "r" (vector4a)
+            );
+
+            src_ptr_l1 = src_ptr[-3];
+            src_ptr_0  = src_ptr[5];
+            src_ptr_r1 = src_ptr[13];
+            src_ptr_r2 = src_ptr[21];
+
+            __asm__ __volatile__ (
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp5],       $ac2,           9               \n\t"
+
+                : [Temp5] "=&r" (Temp5)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+                  [vector4a] "r" (vector4a)
+            );
+
+            src_ptr_l1 = src_ptr[-2];
+            src_ptr_0  = src_ptr[6];
+            src_ptr_r1 = src_ptr[14];
+            src_ptr_r2 = src_ptr[22];
+
+            __asm__ __volatile__ (
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp6],       $ac3,           9               \n\t"
+
+                : [Temp6] "=r" (Temp6)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+                  [vector4a] "r" (vector4a)
+            );
+
+            src_ptr_l1 = src_ptr[-1];
+            src_ptr_0  = src_ptr[7];
+            src_ptr_r1 = src_ptr[15];
+            src_ptr_r2 = src_ptr[23];
+
+            __asm__ __volatile__ (
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp7],       $ac0,           9               \n\t"
+                "extp           %[Temp8],       $ac1,           9               \n\t"
+
+                : [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+                  [vector4a] "r" (vector4a)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+            output_ptr[4] = cm[Temp5];
+            output_ptr[5] = cm[Temp6];
+            output_ptr[6] = cm[Temp7];
+            output_ptr[7] = cm[Temp8];
+
+            src_ptr += 8;
+            output_ptr += output_pitch;
+        }
+    }
+}
+
+
+void vp8_filter_block2d_second_pass161
+(
+    unsigned char *RESTRICT src_ptr,
+    unsigned char *RESTRICT output_ptr,
+    int output_pitch,
+    const unsigned short *vp8_filter
+)
+{
+    unsigned int i, j;
+
+    int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
+    unsigned int vector4a;
+    unsigned int vector1b, vector2b, vector3b;
+
+    unsigned char src_ptr_l2;
+    unsigned char src_ptr_l1;
+    unsigned char src_ptr_0;
+    unsigned char src_ptr_r1;
+    unsigned char src_ptr_r2;
+    unsigned char src_ptr_r3;
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    vector4a = 64;
+
+    vector1b = vp8_filter[0];
+    vector2b = vp8_filter[2];
+    vector3b = vp8_filter[1];
+
+    if (vector1b == 0)
+    {
+        /* 4 tap filter */
+
+        /* prefetch src_ptr data to cache memory */
+        prefetch_load(src_ptr + 16);
+
+        for (i = 16; i--;)
+        {
+            /* unrolling for loop */
+            for (j = 0; j < 16; j += 8)
+            {
+                /* apply filter with vectors pairs */
+                __asm__ __volatile__ (
+                    "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  16(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  32(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac2                            \n\t"
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  17(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  33(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac3                            \n\t"
+                    "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  18(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  34(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac1                            \n\t"
+                    "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  19(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  35(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac3                            \n\t"
+                    "extp           %[Temp3],       $ac1,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  20(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  36(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac2                            \n\t"
+                    "extp           %[Temp4],       $ac3,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  21(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  37(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac3                            \n\t"
+                    "extp           %[Temp5],       $ac2,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  22(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  38(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac1                            \n\t"
+                    "extp           %[Temp6],       $ac3,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  23(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  39(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac3                            \n\t"
+                    "extp           %[Temp7],       $ac1,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                    "extp           %[Temp8],       $ac3,           9               \n\t"
+
+                    : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+                      [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
+                      [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+                      [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                      [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+                    : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+                );
+
+                /* clamp and store results */
+                output_ptr[j] = cm[Temp1];
+                output_ptr[j + 1] = cm[Temp2];
+                output_ptr[j + 2] = cm[Temp3];
+                output_ptr[j + 3] = cm[Temp4];
+                output_ptr[j + 4] = cm[Temp5];
+                output_ptr[j + 5] = cm[Temp6];
+                output_ptr[j + 6] = cm[Temp7];
+                output_ptr[j + 7] = cm[Temp8];
+
+                src_ptr += 8;
+            }
+
+            output_ptr += output_pitch;
+        }
+    }
+    else
+    {
+        /* 6 tap filter */
+
+        /* prefetch src_ptr data to cache memory */
+        prefetch_load(src_ptr + 16);
+
+        /* unroll for loop */
+        for (i = 16; i--;)
+        {
+            /* apply filter with vectors pairs */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l2],  -32(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  16(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  32(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  48(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -31(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  17(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  33(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  49(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -30(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  18(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  34(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  50(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp2],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -29(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  19(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  35(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  51(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp3],       $ac1,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -28(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  20(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  36(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  52(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "extp           %[Temp4],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -27(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  21(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  37(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  53(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp5],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -26(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  22(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  38(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  54(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp6],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -25(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  23(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  39(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  55(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp7],       $ac1,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp8],       $ac3,           9               \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+                  [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
+                  [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+                  [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
+                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+                  [src_ptr] "r" (src_ptr)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+            output_ptr[4] = cm[Temp5];
+            output_ptr[5] = cm[Temp6];
+            output_ptr[6] = cm[Temp7];
+            output_ptr[7] = cm[Temp8];
+
+            /* apply filter with vectors pairs */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l2],  -24(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   8(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  24(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  40(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  56(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -23(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   9(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  25(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  41(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  57(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -22(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   10(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r1],  26(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  42(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  58(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp2],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -21(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   11(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r1],  27(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  43(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  59(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp3],       $ac1,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -20(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   12(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r1],  28(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  44(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  60(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "extp           %[Temp4],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -19(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   13(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r1],  29(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  45(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  61(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp5],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -18(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   14(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r1],  30(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  46(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  62(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp6],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -17(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   15(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r1],  31(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  47(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  63(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp7],       $ac1,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp8],       $ac3,           9               \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+                  [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
+                  [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+                  [src_ptr] "r" (src_ptr)
+            );
+
+            src_ptr += 16;
+            output_ptr[8] = cm[Temp1];
+            output_ptr[9] = cm[Temp2];
+            output_ptr[10] = cm[Temp3];
+            output_ptr[11] = cm[Temp4];
+            output_ptr[12] = cm[Temp5];
+            output_ptr[13] = cm[Temp6];
+            output_ptr[14] = cm[Temp7];
+            output_ptr[15] = cm[Temp8];
+
+            output_ptr += output_pitch;
+        }
+    }
+}
+
+
+/* 4x4 six-tap sub-pixel prediction (MIPS DSPr2): filters from src_ptr into
+ * dst_ptr, horizontally by xoffset and, when non-zero, vertically by yoffset. */
+void vp8_sixtap_predict4x4_dspr2(unsigned char *RESTRICT src_ptr,
+                                 int src_pixels_per_line,
+                                 int xoffset, int yoffset,
+                                 unsigned char *RESTRICT dst_ptr,
+                                 int dst_pitch)
+{
+    /* 9 rows x 4 columns of horizontally filtered samples */
+    unsigned char FData[9 * 4];
+    const unsigned int acc_pos = 16;
+
+    /* bit position for extract from acc */
+    __asm__ __volatile__ (
+        "wrdsp      %[pos],     1           \n\t"
+        :
+        : [pos] "r" (acc_pos)
+    );
+
+    if (!yoffset)
+    {
+        /* horizontal-only: the first pass writes straight to dst_ptr */
+        vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line,
+                                        4, xoffset, dst_pitch);
+        return;
+    }
+
+    /* horizontal pass over 9 rows, starting 2 rows above the block ... */
+    vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), FData,
+                                    src_pixels_per_line, 9, xoffset, 4);
+    /* ... then the vertical pass, reading from row 2 of the temp buffer */
+    vp8_filter_block2d_second_pass4(FData + 8, dst_ptr, dst_pitch, yoffset);
+}
+
+
+/*
+ * 8x8 six-tap sub-pixel prediction for MIPS DSPr2.
+ *
+ * src_ptr, src_pixels_per_line: source block and its row stride in bytes.
+ * xoffset, yoffset: horizontal/vertical filter selectors; a zero offset
+ *                   skips filtering in that direction.
+ * dst_ptr, dst_pitch: destination block and its row stride in bytes.
+ */
+void vp8_sixtap_predict8x8_dspr2(unsigned char *RESTRICT src_ptr,
+                                 int src_pixels_per_line, int xoffset, int yoffset,
+                                 unsigned char *RESTRICT dst_ptr, int dst_pitch)
+{
+    /* 13 rows x 8 bytes of temp data; word-aligned because "sw" fills it */
+    unsigned char FData[13 * 8] __attribute__((aligned(4)));
+    unsigned int pos = 16, Temp1, Temp2;
+
+    /* bit position for extract from acc */
+    __asm__ __volatile__ (
+        "wrdsp      %[pos],     1               \n\t"
+        :
+        : [pos] "r" (pos)
+    );
+
+    if (yoffset)
+    {
+        /* the temp buffer starts 2 rows above the block */
+        src_ptr = src_ptr - (2 * src_pixels_per_line);
+        if (xoffset)
+            /* filter 1-D horizontally... */
+            vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
+                                                13, xoffset, 8);
+        else
+        {
+            /* prefetch src_ptr data to cache memory */
+            prefetch_load(src_ptr + 2 * src_pixels_per_line);
+            /* xoffset == 0: copy 13 rows of 8 bytes into FData unfiltered.
+             * The asm advances src_ptr, so it is a read-write operand, and
+             * "memory" tells the compiler that FData is written here. */
+            __asm__ __volatile__ (
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   0(%[FData])                             \n\t"
+                "sw     %[Temp2],   4(%[FData])                             \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   8(%[FData])                             \n\t"
+                "sw     %[Temp2],   12(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   16(%[FData])                            \n\t"
+                "sw     %[Temp2],   20(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   24(%[FData])                            \n\t"
+                "sw     %[Temp2],   28(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   32(%[FData])                            \n\t"
+                "sw     %[Temp2],   36(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   40(%[FData])                            \n\t"
+                "sw     %[Temp2],   44(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   48(%[FData])                            \n\t"
+                "sw     %[Temp2],   52(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   56(%[FData])                            \n\t"
+                "sw     %[Temp2],   60(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   64(%[FData])                            \n\t"
+                "sw     %[Temp2],   68(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   72(%[FData])                            \n\t"
+                "sw     %[Temp2],   76(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   80(%[FData])                            \n\t"
+                "sw     %[Temp2],   84(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   88(%[FData])                            \n\t"
+                "sw     %[Temp2],   92(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   96(%[FData])                            \n\t"
+                "sw     %[Temp2],   100(%[FData])                           \n\t"
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [src_ptr] "+r" (src_ptr)
+                : [FData] "r" (FData),
+                  [src_pixels_per_line] "r" (src_pixels_per_line)
+                : "memory"
+            );
+        }
+        /* filter vertically, reading from row 2 of the temp buffer */
+        vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8, yoffset);
+    }
+    /* if (yoffset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
+    else
+    {
+        if (xoffset)
+            vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
+                                                8, xoffset, dst_pitch);
+        else
+        {
+            /* both offsets zero: copy 8 rows of 8 bytes, stepping dst_ptr by
+             * dst_pitch; "usw" does not assume word-aligned destination rows */
+            __asm__ __volatile__ (
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "usw    %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "usw    %[Temp2],   4(%[dst_ptr])                           \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+                "addu   %[dst_ptr], %[dst_ptr],    %[dst_pitch]             \n\t"
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "usw    %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "usw    %[Temp2],   4(%[dst_ptr])                           \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+                "addu   %[dst_ptr], %[dst_ptr],    %[dst_pitch]             \n\t"
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "usw    %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "usw    %[Temp2],   4(%[dst_ptr])                           \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+                "addu   %[dst_ptr], %[dst_ptr],    %[dst_pitch]             \n\t"
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "usw    %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "usw    %[Temp2],   4(%[dst_ptr])                           \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+                "addu   %[dst_ptr], %[dst_ptr],    %[dst_pitch]             \n\t"
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "usw    %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "usw    %[Temp2],   4(%[dst_ptr])                           \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+                "addu   %[dst_ptr], %[dst_ptr],    %[dst_pitch]             \n\t"
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "usw    %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "usw    %[Temp2],   4(%[dst_ptr])                           \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+                "addu   %[dst_ptr], %[dst_ptr],    %[dst_pitch]             \n\t"
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "usw    %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "usw    %[Temp2],   4(%[dst_ptr])                           \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+                "addu   %[dst_ptr], %[dst_ptr],    %[dst_pitch]             \n\t"
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "usw    %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "usw    %[Temp2],   4(%[dst_ptr])                           \n\t"
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [src_ptr] "+r" (src_ptr), [dst_ptr] "+r" (dst_ptr)
+                : [src_pixels_per_line] "r" (src_pixels_per_line),
+                  [dst_pitch] "r" (dst_pitch)
+                : "memory"
+            );
+        }
+    }
+}
+
+
+/*
+ * 8x4 (8 columns, 4 rows) six-tap sub-pixel prediction for MIPS DSPr2.
+ * src_ptr, src_pixels_per_line: source block and its row stride in bytes.
+ * xoffset, yoffset: horizontal/vertical filter selectors; a zero offset
+ *                   skips filtering in that direction.
+ * dst_ptr, dst_pitch: destination block and its row stride in bytes.
+ */
+void vp8_sixtap_predict8x4_dspr2(unsigned char *RESTRICT src_ptr,
+                                 int src_pixels_per_line, int xoffset, int yoffset,
+                                 unsigned char *RESTRICT dst_ptr, int dst_pitch)
+{
+    /* 9 rows x 8 bytes of temp data; word-aligned because "sw" fills it */
+    unsigned char FData[9 * 8] __attribute__((aligned(4)));
+    unsigned int pos = 16, Temp1, Temp2;
+
+    /* bit position for extract from acc */
+    __asm__ __volatile__ (
+        "wrdsp      %[pos],     1           \n\t"
+        :
+        : [pos] "r" (pos)
+    );
+
+    if (yoffset)
+    {
+        /* the temp buffer starts 2 rows above the block */
+        src_ptr = src_ptr - (2 * src_pixels_per_line);
+        if (xoffset)
+            /* filter 1-D horizontally... */
+            vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
+                                                9, xoffset, 8);
+        else
+        {
+            /* prefetch src_ptr data to cache memory */
+            prefetch_load(src_ptr + 2 * src_pixels_per_line);
+            /* xoffset == 0: copy 9 rows of 8 bytes into FData unfiltered.
+             * The asm advances src_ptr, so it is a read-write operand, and
+             * "memory" tells the compiler that FData is written here. */
+            __asm__ __volatile__ (
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   0(%[FData])                             \n\t"
+                "sw     %[Temp2],   4(%[FData])                             \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   8(%[FData])                             \n\t"
+                "sw     %[Temp2],   12(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   16(%[FData])                            \n\t"
+                "sw     %[Temp2],   20(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   24(%[FData])                            \n\t"
+                "sw     %[Temp2],   28(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   32(%[FData])                            \n\t"
+                "sw     %[Temp2],   36(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   40(%[FData])                            \n\t"
+                "sw     %[Temp2],   44(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   48(%[FData])                            \n\t"
+                "sw     %[Temp2],   52(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   56(%[FData])                            \n\t"
+                "sw     %[Temp2],   60(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   64(%[FData])                            \n\t"
+                "sw     %[Temp2],   68(%[FData])                            \n\t"
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [src_ptr] "+r" (src_ptr)
+                : [FData] "r" (FData),
+                  [src_pixels_per_line] "r" (src_pixels_per_line)
+                : "memory"
+            );
+        }
+        /* filter vertically, reading from row 2 of the temp buffer */
+        vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8, yoffset);
+    }
+    /* if (yoffset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
+    else
+    {
+        if (xoffset)
+            vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
+                                                4, xoffset, dst_pitch);
+        else
+        {
+            /* both offsets zero: copy 4 rows of 8 bytes, stepping dst_ptr by
+             * dst_pitch; "usw" does not assume word-aligned destination rows */
+            __asm__ __volatile__ (
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "usw    %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "usw    %[Temp2],   4(%[dst_ptr])                           \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+                "addu   %[dst_ptr], %[dst_ptr],    %[dst_pitch]             \n\t"
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "usw    %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "usw    %[Temp2],   4(%[dst_ptr])                           \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+                "addu   %[dst_ptr], %[dst_ptr],    %[dst_pitch]             \n\t"
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "usw    %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "usw    %[Temp2],   4(%[dst_ptr])                           \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+                "addu   %[dst_ptr], %[dst_ptr],    %[dst_pitch]             \n\t"
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "usw    %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "usw    %[Temp2],   4(%[dst_ptr])                           \n\t"
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [src_ptr] "+r" (src_ptr), [dst_ptr] "+r" (dst_ptr)
+                : [src_pixels_per_line] "r" (src_pixels_per_line),
+                  [dst_pitch] "r" (dst_pitch)
+                : "memory"
+            );
+        }
+    }
+}
+
+
+/*
+ * 16x16 six-tap sub-pixel prediction for MIPS DSPr2.
+ *
+ * src_ptr, src_pixels_per_line: source block and its row stride in bytes.
+ * xoffset, yoffset: horizontal/vertical filter selectors; xoffset values
+ *                   0..7 are dispatched, yoffset also indexes sub_pel_filterss.
+ * dst_ptr, dst_pitch: destination block and its row stride in bytes.
+ */
+void vp8_sixtap_predict16x16_dspr2(unsigned char *RESTRICT src_ptr,
+                                   int src_pixels_per_line, int xoffset, int yoffset,
+                                   unsigned char *RESTRICT dst_ptr, int dst_pitch)
+{
+    const unsigned short *VFilter;
+    unsigned char FData[21 * 16]; /* Temp data buffer used in filtering */
+    unsigned int pos;
+    int row, col;
+
+    VFilter = sub_pel_filterss[yoffset];
+    pos = 16;
+    /* bit position for extract from acc */
+    __asm__ __volatile__ (
+        "wrdsp      %[pos],     1           \n\t"
+        :
+        : [pos] "r" (pos)
+    );
+
+    if (yoffset)
+    {
+        /* the temp buffer starts 2 rows above the block */
+        src_ptr = src_ptr - (2 * src_pixels_per_line);
+        switch (xoffset)
+        {
+            /* filter 1-D horizontally... */
+        case 2: case 4: case 6:
+            /* 6 tap filter */
+            vp8_filter_block2d_first_pass16_6tap(src_ptr, FData, src_pixels_per_line,
+                                                 21, xoffset, 16);
+            break;
+
+        case 0:
+            /* only copy buffer */
+            vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line);
+            break;
+
+        case 1: case 3: case 5: case 7:
+            /* 4 tap filter */
+            vp8_filter_block2d_first_pass16_4tap(src_ptr, FData, src_pixels_per_line, 16,
+                                                 21, xoffset, yoffset, dst_ptr, dst_pitch);
+            break;
+        }
+        /* filter vertically, reading from row 2 of the temp buffer */
+        vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter);
+    }
+    else
+    {
+        /* if (yoffset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
+        switch (xoffset)
+        {
+        case 2: case 4: case 6:
+            /* 6 tap filter */
+            vp8_filter_block2d_first_pass16_6tap(src_ptr, dst_ptr, src_pixels_per_line,
+                                                 16, xoffset, dst_pitch);
+            break;
+
+        case 0:
+            /* both offsets zero: plain 16x16 copy so dst_ptr is always written */
+            for (row = 0; row < 16; row++)
+            {
+                for (col = 0; col < 16; col++)
+                    dst_ptr[col] = src_ptr[col];
+                src_ptr += src_pixels_per_line;
+                dst_ptr += dst_pitch;
+            }
+            break;
+
+        case 1: case 3: case 5: case 7:
+            /* 4 tap filter */
+            vp8_filter_block2d_first_pass16_4tap(src_ptr, dst_ptr, src_pixels_per_line, 16,
+                                                 21, xoffset, yoffset, dst_ptr, dst_pitch);
+            break;
+        }
+    }
+}
+
+#endif
diff --git a/libs/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c
new file mode 100644
index 0000000000..ab938cd6af
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c
@@ -0,0 +1,88 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+#if HAVE_DSPR2
+
+/* Dequantize + inverse-transform + add for a 4x4 grid of 4x4 blocks written
+ * to dst (row stride "stride"); q advances 16 coefficients and eobs one
+ * entry per block. Blocks with eob <= 1 take the DC-only path. */
+void vp8_dequant_idct_add_y_block_dspr2(short *q, short *dq,
+                                        unsigned char *dst, int stride, char *eobs)
+{
+    int i, j;
+
+    for (i = 0; i < 4; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_dspr2(q, dq, dst, stride);
+            else
+            {
+                vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dst, stride, dst, stride);
+                q[0] = q[1] = 0; /* clear two coefficients without type-punning q */
+            }
+            q   += 16;
+            dst += 4;
+        }
+        dst += 4 * stride - 16; /* next block row: back 16 columns, down 4 rows */
+    }
+}
+
+/* Dequantize + inverse-transform + add for two planes: a 2x2 grid of 4x4
+ * blocks into dstu, then another 2x2 grid into dstv. q advances 16
+ * coefficients and eobs one entry per block; eob <= 1 takes the DC-only path. */
+void vp8_dequant_idct_add_uv_block_dspr2(short *q, short *dq,
+                                         unsigned char *dstu, unsigned char *dstv,
+                                         int stride, char *eobs)
+{
+    int i, j;
+
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_dspr2(q, dq, dstu, stride);
+            else
+            {
+                vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dstu, stride, dstu, stride);
+                q[0] = q[1] = 0; /* clear two coefficients without type-punning q */
+            }
+            q    += 16;
+            dstu += 4;
+        }
+        dstu += 4 * stride - 8; /* next block row: back 8 columns, down 4 rows */
+    }
+
+    /* same walk for the second plane, continuing in q and eobs */
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_dspr2(q, dq, dstv, stride);
+            else
+            {
+                vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dstv, stride, dstv, stride);
+                q[0] = q[1] = 0; /* clear two coefficients without type-punning q */
+            }
+            q    += 16;
+            dstv += 4;
+        }
+        dstv += 4 * stride - 8; /* next block row: back 8 columns, down 4 rows */
+    }
+}
+
+#endif
+
diff --git a/libs/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c
new file mode 100644
index 0000000000..2eff71069d
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c
@@ -0,0 +1,369 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8_rtcd.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 256
+
+/******************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16 bit fixed point version of two multiply
+ * constants:
+ *         1.   sqrt(2) * cos (pi/8)
+ *         2.   sqrt(2) * sin (pi/8)
+ * Since the first constant is bigger than 1, to maintain the same 16 bit
+ * fixed point precision as the second one, we use a trick of
+ *         x * a = x + x*(a-1)
+ * so
+ *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+ ****************************************************************************/
+extern unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2      = 35468;
+/* Issue a MIPS "pref 0" (load hint) for src; static so a non-inlined call never needs an external definition. */
+static inline void prefetch_load_short(short *src)
+{
+    __asm__ __volatile__ (
+        "pref   0,  0(%[src])   \n\t"
+        :
+        : [src] "r" (src)
+    );
+}
+/* 4x4 inverse DCT of input; the result is added to the pred_ptr block and written to dst_ptr via the ff_cropTbl lookup. */
+void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred_ptr,
+                                int pred_stride, unsigned char *dst_ptr,
+                                int dst_stride)
+{
+    int r, c;
+    int a1, b1, c1, d1;
+    short output[16];
+    short *ip = input;
+    short *op = output;
+    int temp1, temp2;
+    int shortpitch = 4;
+
+    int c2, d2;
+    int temp3, temp4;
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    /* prepare data for load */
+    prefetch_load_short(ip + 8);
+
+    /* first loop is unrolled: vertical pass, each column reads ip[n], ip[n+4], ip[n+8], ip[n+12] */
+    a1 = ip[0] + ip[8];
+    b1 = ip[0] - ip[8];
+
+    temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+    temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+    temp2 = (ip[12] * sinpi8sqrt2) >> 16;
+    d1 = temp1 + temp2;
+
+    temp3 = (ip[5] * sinpi8sqrt2) >> 16;
+    temp4 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
+    c2 = temp3 - temp4;
+
+    temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
+    temp4 = (ip[13] * sinpi8sqrt2) >> 16;
+    d2 = temp3 + temp4;
+
+    op[0] = a1 + d1;
+    op[12] = a1 - d1;
+    op[4] = b1 + c1;
+    op[8] = b1 - c1;
+
+    a1 = ip[1] + ip[9];
+    b1 = ip[1] - ip[9];
+
+    op[1] = a1 + d2;
+    op[13] = a1 - d2;
+    op[5] = b1 + c2;
+    op[9] = b1 - c2;
+
+    a1 = ip[2] + ip[10];
+    b1 = ip[2] - ip[10];
+
+    temp1 = (ip[6] * sinpi8sqrt2) >> 16;
+    temp2 = ip[14] + ((ip[14] * cospi8sqrt2minus1) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[6] + ((ip[6] * cospi8sqrt2minus1) >> 16);
+    temp2 = (ip[14] * sinpi8sqrt2) >> 16;
+    d1 = temp1 + temp2;
+
+    temp3 = (ip[7] * sinpi8sqrt2) >> 16;
+    temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
+    c2 = temp3 - temp4;
+
+    temp3 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
+    temp4 = (ip[15] * sinpi8sqrt2) >> 16;
+    d2 = temp3 + temp4;
+
+    op[2] = a1 + d1;
+    op[14] = a1 - d1;
+    op[6] = b1 + c1;
+    op[10] = b1 - c1;
+
+    a1 = ip[3] + ip[11];
+    b1 = ip[3] - ip[11];
+
+    op[3] = a1 + d2;
+    op[15] = a1 - d2;
+    op[7] = b1 + c2;
+    op[11] = b1 - c2;
+
+    ip = output;
+
+    /* prepare data for load */
+    prefetch_load_short(ip + shortpitch);
+
+    /* second loop is unrolled: horizontal pass, in place on output[], rounding with (x + 4) >> 3 */
+    a1 = ip[0] + ip[2];
+    b1 = ip[0] - ip[2];
+
+    temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+    temp2 = (ip[3] * sinpi8sqrt2) >> 16;
+    d1 = temp1 + temp2;
+
+    temp3 = (ip[5] * sinpi8sqrt2) >> 16;
+    temp4 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
+    c2 = temp3 - temp4;
+
+    temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
+    temp4 = (ip[7] * sinpi8sqrt2) >> 16;
+    d2 = temp3 + temp4;
+
+    op[0] = (a1 + d1 + 4) >> 3;
+    op[3] = (a1 - d1 + 4) >> 3;
+    op[1] = (b1 + c1 + 4) >> 3;
+    op[2] = (b1 - c1 + 4) >> 3;
+
+    a1 = ip[4] + ip[6];
+    b1 = ip[4] - ip[6];
+
+    op[4] = (a1 + d2 + 4) >> 3;
+    op[7] = (a1 - d2 + 4) >> 3;
+    op[5] = (b1 + c2 + 4) >> 3;
+    op[6] = (b1 - c2 + 4) >> 3;
+
+    a1 = ip[8] + ip[10];
+    b1 = ip[8] - ip[10];
+
+    temp1 = (ip[9] * sinpi8sqrt2) >> 16;
+    temp2 = ip[11] + ((ip[11] * cospi8sqrt2minus1) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[9] + ((ip[9] * cospi8sqrt2minus1) >> 16);
+    temp2 = (ip[11] * sinpi8sqrt2) >> 16;
+    d1 = temp1 + temp2;
+
+    temp3 = (ip[13] * sinpi8sqrt2) >> 16;
+    temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
+    c2 = temp3 - temp4;
+
+    temp3 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
+    temp4 = (ip[15] * sinpi8sqrt2) >> 16;
+    d2 = temp3 + temp4;
+
+    op[8] = (a1 + d1 + 4) >> 3;
+    op[11] = (a1 - d1 + 4) >> 3;
+    op[9] = (b1 + c1 + 4) >> 3;
+    op[10] = (b1 - c1 + 4) >> 3;
+
+    a1 = ip[12] + ip[14];
+    b1 = ip[12] - ip[14];
+
+    op[12] = (a1 + d2 + 4) >> 3;
+    op[15] = (a1 - d2 + 4) >> 3;
+    op[13] = (b1 + c2 + 4) >> 3;
+    op[14] = (b1 - c2 + 4) >> 3;
+
+    ip = output;
+    /* add residual to prediction; cm[] is ff_cropTbl offset by CROP_WIDTH (presumably a clamp-to-byte table) */
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            short a = ip[c] + pred_ptr[c] ;
+            dst_ptr[c] = cm[a] ;
+        }
+
+        ip += 4;
+        dst_ptr += dst_stride;
+        pred_ptr += pred_stride;
+    }
+}
+/* Adds the rounded DC term (input_dc + 4) >> 3 to a 4x4 block: four rows, one 32-bit word (4 pixels) per row. */
+void vp8_dc_only_idct_add_dspr2(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride)
+{
+    int a1, absa1, i;
+    int t2, vector_a1, vector_a;
+
+    /* a1 = ((input_dc + 4) >> 3); */
+    __asm__ __volatile__ (
+        "addi  %[a1], %[input_dc], 4   \n\t"
+        "sra   %[a1], %[a1],       3   \n\t"
+        : [a1] "=r" (a1)
+        : [input_dc] "r" (input_dc)
+    );
+
+    if (a1 < 0)
+    {
+        /* use quad-byte; lw/sw need pred_ptr/dst_ptr rows four byte aligned.
+         * NOTE(review): replv.qb replicates a single byte, so this presumably
+         * relies on |a1| <= 255 - confirm callers guarantee it. */
+        __asm__ __volatile__ (
+            "abs        %[absa1],     %[a1]         \n\t"
+            "replv.qb   %[vector_a1], %[absa1]      \n\t"
+            : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+            : [a1] "r" (a1)
+        );
+
+        /* a1 is negative: compute predptr[c] - |a1| (saturating) instead of predptr[c] + a1 */
+        for (i = 4; i--;)
+        {
+            __asm__ __volatile__ (
+                "lw             %[t2],       0(%[pred_ptr])                     \n\t"
+                "add            %[pred_ptr], %[pred_ptr],    %[pred_stride]     \n\t"
+                "subu_s.qb      %[vector_a], %[t2],          %[vector_a1]       \n\t"
+                "sw             %[vector_a], 0(%[dst_ptr])                      \n\t"
+                "add            %[dst_ptr],  %[dst_ptr],     %[dst_stride]      \n\t"
+                : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+                  [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
+                : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1)
+                : "memory" /* lw/sw access *pred_ptr and *dst_ptr, which are not listed as operands */
+            );
+        }
+    }
+    else
+    {
+        /* use quad-byte
+         * lw/sw need pred_ptr/dst_ptr rows to be four byte aligned
+         */
+        __asm__ __volatile__ (
+            "replv.qb       %[vector_a1], %[a1]     \n\t"
+            : [vector_a1] "=r" (vector_a1)
+            : [a1] "r" (a1)
+        );
+
+        for (i = 4; i--;)
+        {
+            __asm__ __volatile__ (
+                "lw             %[t2],       0(%[pred_ptr])                 \n\t"
+                "add            %[pred_ptr], %[pred_ptr],    %[pred_stride] \n\t"
+                "addu_s.qb      %[vector_a], %[vector_a1],   %[t2]          \n\t"
+                "sw             %[vector_a], 0(%[dst_ptr])                  \n\t"
+                "add            %[dst_ptr],  %[dst_ptr],     %[dst_stride]  \n\t"
+                : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+                  [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
+                : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1)
+                : "memory" /* lw/sw access *pred_ptr and *dst_ptr, which are not listed as operands */
+            );
+        }
+    }
+}
+/* Inverse 4x4 Walsh-Hadamard transform.
+ *
+ * Reads 16 shorts from input, transforms the columns and then the rows, and
+ * stores result k to mb_dqcoeff[k * 16] (one value every 16 shorts).
+ */
+void vp8_short_inv_walsh4x4_dspr2(short *input, short *mb_dqcoeff)
+{
+    short tmp[16];
+    int a1, b1, c1, d1;
+    int a2, b2, c2, d2;
+    int col, row, k;
+
+    prefetch_load_short(input);
+
+    /* vertical pass: column col uses the elements at col, col + 4,
+     * col + 8 and col + 12
+     */
+    for (col = 0; col < 4; col++)
+    {
+        a1 = input[col] + input[col + 12];
+        b1 = input[col + 4] + input[col + 8];
+        c1 = input[col + 4] - input[col + 8];
+        d1 = input[col] - input[col + 12];
+
+        tmp[col] = a1 + b1;
+        tmp[col + 4] = c1 + d1;
+        tmp[col + 8] = a1 - b1;
+        tmp[col + 12] = d1 - c1;
+    }
+
+    prefetch_load_short(tmp);
+
+    /* horizontal pass, in place on tmp: each row of four gets a +3 bias
+     * and is scaled down with >> 3
+     */
+    for (row = 0; row < 16; row += 4)
+    {
+        a1 = tmp[row] + tmp[row + 3] + 3;
+        b1 = tmp[row + 1] + tmp[row + 2];
+        c1 = tmp[row + 1] - tmp[row + 2];
+        d1 = tmp[row] - tmp[row + 3] + 3;
+
+        a2 = a1 + b1;
+        b2 = d1 + c1;
+        c2 = a1 - b1;
+        d2 = d1 - c1;
+
+        tmp[row] = a2 >> 3;
+        tmp[row + 1] = b2 >> 3;
+        tmp[row + 2] = c2 >> 3;
+        tmp[row + 3] = d2 >> 3;
+    }
+
+    /* scatter: one output every 16 shorts */
+    for (k = 0; k < 16; k++)
+    {
+        mb_dqcoeff[k * 16] = tmp[k];
+    }
+}
+/* DC-only inverse Walsh: stores (input[0] + 3) >> 3 as a halfword at byte offsets 0, 32, ..., 480 of mb_dqcoeff. */
+void vp8_short_inv_walsh4x4_1_dspr2(short *input, short *mb_dqcoeff)
+{
+    int a1;
+
+    a1 = ((input[0] + 3) >> 3);
+
+    __asm__ __volatile__ (
+        "sh             %[a1], 0(%[mb_dqcoeff])                    \n\t"
+        "sh             %[a1], 32(%[mb_dqcoeff])                   \n\t"
+        "sh             %[a1], 64(%[mb_dqcoeff])                   \n\t"
+        "sh             %[a1], 96(%[mb_dqcoeff])                   \n\t"
+        "sh             %[a1], 128(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 160(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 192(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 224(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 256(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 288(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 320(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 352(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 384(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 416(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 448(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 480(%[mb_dqcoeff])                  \n\t"
+        :
+        : [a1] "r" (a1), [mb_dqcoeff] "r" (mb_dqcoeff)
+        : "memory" /* the sh stores write through mb_dqcoeff, which is only a register input */
+    );
+}
+
+#endif
diff --git a/libs/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c
new file mode 100644
index 0000000000..a14b397d8f
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c
@@ -0,0 +1,121 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#if HAVE_DSPR2
+/* Issue a MIPS "pref 0" (load hint) for src; static so a non-inlined call never needs an external definition. */
+static inline void prefetch_load_int(unsigned char *src)
+{
+    __asm__ __volatile__ (
+        "pref   0,  0(%[src])   \n\t"
+        :
+        : [src] "r" (src)
+    );
+}
+
+/* Copies 16 rows of 16 bytes from src to dst; rows advance by src_stride / dst_stride bytes. */
+__inline void vp8_copy_mem16x16_dspr2(
+    unsigned char *RESTRICT src,
+    int src_stride,
+    unsigned char *RESTRICT dst,
+    int dst_stride)
+{
+    int r;
+    unsigned int a0, a1, a2, a3;
+
+    for (r = 16; r--;)
+    {
+        /* load src data in cache memory */
+        prefetch_load_int(src + src_stride);
+        /* unaligned word loads (ulw) from src, plain word stores (sw) to dst, so dst must be word aligned */
+        __asm__ __volatile__ (
+            "ulw    %[a0], 0(%[src])            \n\t"
+            "ulw    %[a1], 4(%[src])            \n\t"
+            "ulw    %[a2], 8(%[src])            \n\t"
+            "ulw    %[a3], 12(%[src])           \n\t"
+            "sw     %[a0], 0(%[dst])            \n\t"
+            "sw     %[a1], 4(%[dst])            \n\t"
+            "sw     %[a2], 8(%[dst])            \n\t"
+            "sw     %[a3], 12(%[dst])           \n\t"
+            : [a0] "=&r" (a0), [a1] "=&r" (a1),
+              [a2] "=&r" (a2), [a3] "=&r" (a3)
+            : [src] "r" (src), [dst] "r" (dst)
+            : "memory" /* reads *src and writes *dst, neither of which is listed as an operand */
+        );
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+/* Copies 8 rows of 8 bytes from src to dst; rows advance by src_stride / dst_stride bytes. */
+__inline void vp8_copy_mem8x8_dspr2(
+    unsigned char *RESTRICT src,
+    int src_stride,
+    unsigned char *RESTRICT dst,
+    int dst_stride)
+{
+    int r;
+    unsigned int a0, a1;
+
+    /* load src data in cache memory (issued once, for the second row only) */
+    prefetch_load_int(src + src_stride);
+
+    for (r = 8; r--;)
+    {
+        /* unaligned word loads (ulw) from src, plain word stores (sw) to dst, so dst must be word aligned */
+        __asm__ __volatile__ (
+            "ulw    %[a0], 0(%[src])            \n\t"
+            "ulw    %[a1], 4(%[src])            \n\t"
+            "sw     %[a0], 0(%[dst])            \n\t"
+            "sw     %[a1], 4(%[dst])            \n\t"
+            : [a0] "=&r" (a0), [a1] "=&r" (a1)
+            : [src] "r" (src), [dst] "r" (dst)
+            : "memory" /* reads *src and writes *dst, neither of which is listed as an operand */
+        );
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+/* Copies 4 rows of 8 bytes from src to dst; rows advance by src_stride / dst_stride bytes. */
+__inline void vp8_copy_mem8x4_dspr2(
+    unsigned char *RESTRICT src,
+    int src_stride,
+    unsigned char *RESTRICT dst,
+    int dst_stride)
+{
+    int r;
+    unsigned int a0, a1;
+
+    /* load src data in cache memory (issued once, for the second row only) */
+    prefetch_load_int(src + src_stride);
+
+    for (r = 4; r--;)
+    {
+        /* unaligned word loads (ulw) from src, plain word stores (sw) to dst, so dst must be word aligned */
+        __asm__ __volatile__ (
+            "ulw    %[a0], 0(%[src])            \n\t"
+            "ulw    %[a1], 4(%[src])            \n\t"
+            "sw     %[a0], 0(%[dst])            \n\t"
+            "sw     %[a1], 4(%[dst])            \n\t"
+            : [a0] "=&r" (a0), [a1] "=&r" (a1)
+            : [src] "r" (src), [dst] "r" (dst)
+            : "memory" /* reads *src and writes *dst, neither of which is listed as an operand */
+        );
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+#endif
diff --git a/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
new file mode 100644
index 0000000000..9ae6bc8f92
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
@@ -0,0 +1,2622 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "vp8_rtcd.h"
+#include "vp8/common/onyxc_int.h"
+
+#if HAVE_DSPR2
+typedef unsigned char uc;
+
+/* prefetch data for load ("pref 0"); static so a non-inlined call never needs an external definition */
+static inline void prefetch_load_lf(unsigned char *src)
+{
+    __asm__ __volatile__ (
+        "pref   0,  0(%[src])   \n\t"
+        :
+        : [src] "r" (src)
+    );
+}
+
+
+/* prefetch data for store ("pref 1"); static so a non-inlined call never needs an external definition */
+static inline void prefetch_store_lf(unsigned char *dst)
+{
+    __asm__ __volatile__ (
+        "pref   1,  0(%[dst])   \n\t"
+        :
+        : [dst] "r" (dst)
+    );
+}
+
+/* processing 4 pixels at the same time (one byte lane per pixel);
+ * computes *hev and *mask together. Note the argument order: p1, p0, p3, p2, q0, q1, q2, q3.
+ */
+static __inline void vp8_filter_mask_vec_mips
+(
+    uint32_t limit,
+    uint32_t flimit,
+    uint32_t p1,
+    uint32_t p0,
+    uint32_t p3,
+    uint32_t p2,
+    uint32_t q0,
+    uint32_t q1,
+    uint32_t q2,
+    uint32_t q3,
+    uint32_t thresh,
+    uint32_t *hev,
+    uint32_t *mask
+)
+{
+    uint32_t c, r, r3, r_k;
+    uint32_t s1, s2, s3;
+    uint32_t ones = 0xFFFFFFFF;
+    uint32_t hev1;
+
+    __asm__ __volatile__ (
+        /* mask |= (abs(p3 - p2) > limit) */
+        "subu_s.qb      %[c],   %[p3],     %[p2]        \n\t"
+        "subu_s.qb      %[r_k], %[p2],     %[p3]        \n\t"
+        "or             %[r_k], %[r_k],    %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+        "or             %[r],   $0,        %[c]         \n\t"
+
+        /* mask |= (abs(p2 - p1) > limit) */
+        "subu_s.qb      %[c],   %[p2],     %[p1]        \n\t"
+        "subu_s.qb      %[r_k], %[p1],     %[p2]        \n\t"
+        "or             %[r_k], %[r_k],    %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+        "or             %[r],   %[r],      %[c]         \n\t"
+
+        /* mask |= (abs(p1 - p0) > limit)
+         * hev  |= (abs(p1 - p0) > thresh)
+         */
+        "subu_s.qb      %[c],   %[p1],     %[p0]        \n\t"
+        "subu_s.qb      %[r_k], %[p0],     %[p1]        \n\t"
+        "or             %[r_k], %[r_k],    %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
+        "or             %[r3],  $0,        %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+        "or             %[r],   %[r],      %[c]         \n\t"
+
+        /* mask |= (abs(q1 - q0) > limit)
+         * hev  |= (abs(q1 - q0) > thresh)
+         */
+        "subu_s.qb      %[c],   %[q1],     %[q0]        \n\t"
+        "subu_s.qb      %[r_k], %[q0],     %[q1]        \n\t"
+        "or             %[r_k], %[r_k],    %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
+        "or             %[r3],  %[r3],     %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+        "or             %[r],   %[r],      %[c]         \n\t"
+
+        /* mask |= (abs(q2 - q1) > limit) */
+        "subu_s.qb      %[c],   %[q2],     %[q1]        \n\t"
+        "subu_s.qb      %[r_k], %[q1],     %[q2]        \n\t"
+        "or             %[r_k], %[r_k],    %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+        "or             %[r],   %[r],      %[c]         \n\t"
+        "sll            %[r3],    %[r3],    24          \n\t"
+
+        /* mask |= (abs(q3 - q2) > limit) */
+        "subu_s.qb      %[c],   %[q3],     %[q2]        \n\t"
+        "subu_s.qb      %[r_k], %[q2],     %[q3]        \n\t"
+        "or             %[r_k], %[r_k],    %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+        "or             %[r],   %[r],      %[c]         \n\t"
+
+        : [c] "=&r" (c), [r_k] "=&r" (r_k),
+          [r] "=&r" (r), [r3] "=&r" (r3)
+        : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
+          [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
+          [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
+    );
+
+    __asm__ __volatile__ (
+        /* abs(p0 - q0) */
+        "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
+        "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
+        "wrdsp          %[r3]                           \n\t"
+        "or             %[s1],  %[r_k],    %[c]         \n\t"
+
+        /* abs(p1 - q1) */
+        "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
+        "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
+        "pick.qb        %[hev1], %[ones],  $0           \n\t"
+        "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
+        "or             %[s2],   %[r_k],   %[c]         \n\t"
+
+        /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit */
+        "shrl.qb        %[s2],   %[s2],     1           \n\t"
+        "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
+        "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
+        "or             %[r],    %[r],      %[c]        \n\t"
+        "sll            %[r],    %[r],      24          \n\t"
+
+        "wrdsp          %[r]                            \n\t"
+        "pick.qb        %[s2],  $0,         %[ones]     \n\t"
+
+        : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
+          [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
+        : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
+          [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
+    );
+
+    *hev = hev1;    /* pick.qb between ones and $0 after wrdsp of the accumulated thresh compares (r3) */
+    *mask = s2;     /* pick.qb between $0 and ones after wrdsp of the accumulated limit compares (r) */
+}
+
+
+/* inputs & outputs are quad-byte vectors; *ps1, *ps0, *qs0 and *qs1 are updated in place */
+static __inline void vp8_filter_mips
+(
+    uint32_t mask,
+    uint32_t hev,
+    uint32_t *ps1,
+    uint32_t *ps0,
+    uint32_t *qs0,
+    uint32_t *qs1
+)
+{
+    int32_t vp8_filter_l, vp8_filter_r;
+    int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+    int32_t subr_r, subr_l;
+    uint32_t t1, t2, HWM, t3;
+    uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+
+    int32_t vps1, vps0, vqs0, vqs1;
+    int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+    uint32_t N128;
+
+    N128 = 0x80808080;
+    t1  = 0x03000300;
+    t2  = 0x04000400;
+    t3  = 0x01000100;
+    HWM = 0xFF00FF00;
+
+    vps0 = (*ps0) ^ N128;
+    vps1 = (*ps1) ^ N128;
+    vqs0 = (*qs0) ^ N128;
+    vqs1 = (*qs1) ^ N128;
+
+    /* use halfword pairs instead quad-bytes because of accuracy */
+    vps0_l = vps0 & HWM;
+    vps0_r = vps0 << 8;
+    vps0_r = vps0_r & HWM;
+
+    vps1_l = vps1 & HWM;
+    vps1_r = vps1 << 8;
+    vps1_r = vps1_r & HWM;
+
+    vqs0_l = vqs0 & HWM;
+    vqs0_r = vqs0 << 8;
+    vqs0_r = vqs0_r & HWM;
+
+    vqs1_l = vqs1 & HWM;
+    vqs1_r = vqs1 << 8;
+    vqs1_r = vqs1_r & HWM;
+
+    mask_l = mask & HWM;
+    mask_r = mask << 8;
+    mask_r = mask_r & HWM;
+
+    hev_l = hev & HWM;
+    hev_r = hev << 8;
+    hev_r = hev_r & HWM;
+
+    __asm__ __volatile__ (
+        /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
+        "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+        "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+
+        /* qs0 - ps0 */
+        "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
+        "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
+
+        /* vp8_filter &= hev; */
+        "and          %[vp8_filter_l], %[vp8_filter_l], %[hev_l]        \n\t"
+        "and          %[vp8_filter_r], %[vp8_filter_r], %[hev_r]        \n\t"
+
+        /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
+        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
+        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
+        "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
+        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
+        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
+        "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
+        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
+        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
+
+        /* vp8_filter &= mask; */
+        "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
+        "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"
+
+        : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=&r" (vp8_filter_r),
+          [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
+          [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
+
+        : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+          [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+          [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
+          [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
+          [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
+          [HWM] "r" (HWM)
+    );
+
+    /* save bottom 3 bits so that we round one side +4 and the other +3 */
+    __asm__ __volatile__ (
+        /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >>= 3;  (t2 holds 4 in each halfword's high byte) */
+        "addq_s.ph    %[Filter1_l],    %[vp8_filter_l], %[t2]           \n\t"
+        "addq_s.ph    %[Filter1_r],    %[vp8_filter_r], %[t2]           \n\t"
+
+        /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >>= 3;  (t1 holds 3 in each halfword's high byte) */
+        "addq_s.ph    %[Filter2_l],    %[vp8_filter_l], %[t1]           \n\t"
+        "addq_s.ph    %[Filter2_r],    %[vp8_filter_r], %[t1]           \n\t"
+        "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
+        "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
+
+        "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
+        "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
+
+        "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
+        "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
+
+        /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+        "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
+        "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
+
+        /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+        "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
+        "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
+
+        : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+          [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
+          [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+          [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+
+        : [t1] "r" (t1), [t2] "r" (t2),
+          [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
+          [HWM] "r" (HWM)
+    );
+
+    __asm__ __volatile__ (
+        /* (vp8_filter += 1) >>= 1 */
+        "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
+        "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
+
+        /* vp8_filter &= ~hev; */
+        "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
+        "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
+
+        /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */
+        "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
+        "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
+
+        /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */
+        "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
+        "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
+
+        : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
+          [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+          [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+
+        : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+    );
+
+    /* Create quad-bytes from halfword pairs */
+    vqs0_l = vqs0_l & HWM;
+    vqs1_l = vqs1_l & HWM;
+    vps0_l = vps0_l & HWM;
+    vps1_l = vps1_l & HWM;
+
+    __asm__ __volatile__ (
+        "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"
+        "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"
+        "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"
+        "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"
+
+        : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
+          [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
+        :
+    );
+
+    vqs0 = vqs0_l | vqs0_r;
+    vqs1 = vqs1_l | vqs1_r;
+    vps0 = vps0_l | vps0_r;
+    vps1 = vps1_l | vps1_r;
+
+    *ps0 = vps0 ^ N128;
+    *ps1 = vps1 ^ N128;
+    *qs0 = vqs0 ^ N128;
+    *qs1 = vqs1 ^ N128;
+}
+/* Filters 16 pixels along the edge at s as four 4-pixel groups; rows are p bytes apart. count is never read. */
+void vp8_loop_filter_horizontal_edge_mips
+(
+    unsigned char *s,
+    int p,
+    unsigned int flimit,
+    unsigned int limit,
+    unsigned int thresh,
+    int count
+)
+{
+    uint32_t mask;
+    uint32_t hev;
+    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+    unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+
+    mask = 0;
+    hev = 0;
+    p1 = 0;
+    p2 = 0;
+    p3 = 0;
+    p4 = 0;
+
+    /* prefetch data for store */
+    prefetch_store_lf(s);
+
+    /* loop filter designed to work using chars so that we can make maximum use
+     * of 8 bit simd instructions.
+     */
+
+    sm1 = s - (p << 2);
+    s0 = s - p - p - p;
+    s1 = s - p - p ;
+    s2 = s - p;
+    s3 = s;
+    s4 = s + p;
+    s5 = s + p + p;
+    s6 = s + p + p + p;
+
+    /* load quad-byte vectors
+     * memory is 4 byte aligned
+     */
+    p1 = *((uint32_t *)(s1));
+    p2 = *((uint32_t *)(s2));
+    p3 = *((uint32_t *)(s3));
+    p4 = *((uint32_t *)(s4));
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        pm1 = *((uint32_t *)(sm1));
+        p0  = *((uint32_t *)(s0));
+        p5  = *((uint32_t *)(s5));
+        p6  = *((uint32_t *)(s6));
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+        if (mask)
+        {
+            /* filtering */
+            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+            /* unpack processed 4x4 neighborhood */
+            *((uint32_t *)s1) = p1;
+            *((uint32_t *)s2) = p2;
+            *((uint32_t *)s3) = p3;
+            *((uint32_t *)s4) = p4;
+        }
+    }
+
+    sm1 += 4;
+    s0  += 4;
+    s1  += 4;
+    s2  += 4;
+    s3  += 4;
+    s4  += 4;
+    s5  += 4;
+    s6  += 4;
+
+    /* load quad-byte vectors
+     * memory is 4 byte aligned
+     */
+    p1 = *((uint32_t *)(s1));
+    p2 = *((uint32_t *)(s2));
+    p3 = *((uint32_t *)(s3));
+    p4 = *((uint32_t *)(s4));
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        pm1 = *((uint32_t *)(sm1));
+        p0  = *((uint32_t *)(s0));
+        p5  = *((uint32_t *)(s5));
+        p6  = *((uint32_t *)(s6));
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+        if (mask)
+        {
+            /* filtering */
+            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+            /* unpack processed 4x4 neighborhood */
+            *((uint32_t *)s1) = p1;
+            *((uint32_t *)s2) = p2;
+            *((uint32_t *)s3) = p3;
+            *((uint32_t *)s4) = p4;
+        }
+    }
+
+    sm1 += 4;
+    s0  += 4;
+    s1  += 4;
+    s2  += 4;
+    s3  += 4;
+    s4  += 4;
+    s5  += 4;
+    s6  += 4;
+
+    /* load quad-byte vectors
+     * memory is 4 byte aligned
+     */
+    p1 = *((uint32_t *)(s1));
+    p2 = *((uint32_t *)(s2));
+    p3 = *((uint32_t *)(s3));
+    p4 = *((uint32_t *)(s4));
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        pm1 = *((uint32_t *)(sm1));
+        p0  = *((uint32_t *)(s0));
+        p5  = *((uint32_t *)(s5));
+        p6  = *((uint32_t *)(s6));
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+        if (mask)
+        {
+            /* filtering */
+            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+            /* unpack processed 4x4 neighborhood */
+            *((uint32_t *)s1) = p1;
+            *((uint32_t *)s2) = p2;
+            *((uint32_t *)s3) = p3;
+            *((uint32_t *)s4) = p4;
+        }
+    }
+
+    sm1 += 4;
+    s0  += 4;
+    s1  += 4;
+    s2  += 4;
+    s3  += 4;
+    s4  += 4;
+    s5  += 4;
+    s6  += 4;
+
+    /* load quad-byte vectors
+     * memory is 4 byte aligned
+     */
+    p1 = *((uint32_t *)(s1));
+    p2 = *((uint32_t *)(s2));
+    p3 = *((uint32_t *)(s3));
+    p4 = *((uint32_t *)(s4));
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        pm1 = *((uint32_t *)(sm1));
+        p0  = *((uint32_t *)(s0));
+        p5  = *((uint32_t *)(s5));
+        p6  = *((uint32_t *)(s6));
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+        if (mask)
+        {
+            /* filtering */
+            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+            /* unpack processed 4x4 neighborhood */
+            *((uint32_t *)s1) = p1;
+            *((uint32_t *)s2) = p2;
+            *((uint32_t *)s3) = p3;
+            *((uint32_t *)s4) = p4;
+        }
+    }
+}
+
+void vp8_loop_filter_uvhorizontal_edge_mips
+(
+    unsigned char *s,
+    int p,
+    unsigned int flimit,
+    unsigned int limit,
+    unsigned int thresh,
+    int count
+)
+{
+    uint32_t mask;
+    uint32_t hev;
+    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+    unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+
+    mask = 0;
+    hev = 0;
+    p1 = 0;
+    p2 = 0;
+    p3 = 0;
+    p4 = 0;
+
+    /* Byte-wise (8-bit SIMD friendly) filter across a horizontal edge: two
+     * 4-byte groups, each using lines s - 4*p .. s + 3*p; count is not read.
+     */
+
+    sm1 = s - (p << 2);
+    s0  = s - p - p - p;
+    s1  = s - p - p ;
+    s2  = s - p;
+    s3  = s;
+    s4  = s + p;
+    s5  = s + p + p;
+    s6  = s + p + p + p;
+
+    /* load 4 bytes from each of the four lines nearest the edge;
+     * the word-sized accesses rely on 4-byte aligned memory
+     */
+    p1 = *((uint32_t *)(s1));
+    p2 = *((uint32_t *)(s2));
+    p3 = *((uint32_t *)(s3));
+    p4 = *((uint32_t *)(s4));
+
+    /* early out: when p1 == p4 and p2 == p3 the mask would be zero,
+     * so the outer loads and the mask computation are skipped
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        pm1 = *((uint32_t *)(sm1));
+        p0  = *((uint32_t *)(s0));
+        p5  = *((uint32_t *)(s5));
+        p6  = *((uint32_t *)(s6));
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+        if (mask)
+        {
+            /* filter; p1..p4 are passed by address */
+            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+            /* write the four processed words back to lines s1..s4 */
+            *((uint32_t *)s1) = p1;
+            *((uint32_t *)s2) = p2;
+            *((uint32_t *)s3) = p3;
+            *((uint32_t *)s4) = p4;
+        }
+    }
+
+    sm1 += 4;
+    s0  += 4;
+    s1  += 4;
+    s2  += 4;
+    s3  += 4;
+    s4  += 4;
+    s5  += 4;
+    s6  += 4;
+
+    /* second group of 4 bytes: load the four lines nearest the edge again;
+     * the word-sized accesses rely on 4-byte aligned memory
+     */
+    p1 = *((uint32_t *)(s1));
+    p2 = *((uint32_t *)(s2));
+    p3 = *((uint32_t *)(s3));
+    p4 = *((uint32_t *)(s4));
+
+    /* early out: when p1 == p4 and p2 == p3 the mask would be zero,
+     * so the outer loads and the mask computation are skipped
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        pm1 = *((uint32_t *)(sm1));
+        p0  = *((uint32_t *)(s0));
+        p5  = *((uint32_t *)(s5));
+        p6  = *((uint32_t *)(s6));
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+        if (mask)
+        {
+            /* filter; p1..p4 are passed by address */
+            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+            /* write the four processed words back to lines s1..s4 */
+            *((uint32_t *)s1) = p1;
+            *((uint32_t *)s2) = p2;
+            *((uint32_t *)s3) = p3;
+            *((uint32_t *)s4) = p4;
+        }
+    }
+}
+
+void vp8_loop_filter_vertical_edge_mips
+(
+    unsigned char *s,
+    int p,
+    const unsigned int flimit,
+    const unsigned int limit,
+    const unsigned int thresh,
+    int count
+)
+{
+    int i;
+    uint32_t mask, hev;
+    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+    unsigned char *s1, *s2, *s3, *s4;
+    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+    hev = 0;
+    mask = 0;
+    i = 0;
+    pm1 = 0;
+    p0 = 0;
+    p1 = 0;
+    p2 = 0;
+    p3 = 0;
+    p4 = 0;
+    p5 = 0;
+    p6 = 0;
+
+    /* Byte-wise (8-bit SIMD friendly) filter across a vertical edge; each
+     * pass of the loop below handles 8 lines, until i reaches count.
+     */
+
+    /* apply filter on 4 pixels at the same time */
+    do
+    {
+
+        /* prefetch data for store */
+        prefetch_store_lf(s + p);
+
+        s1 = s;
+        s2 = s + p;
+        s3 = s2 + p;
+        s4 = s3 + p;
+        s  = s4 + p;
+
+        /* for four consecutive lines load the 4 bytes before s and the
+         * 4 bytes at s; word accesses rely on 4-byte aligned memory
+         */
+        p2  = *((uint32_t *)(s1 - 4));
+        p6  = *((uint32_t *)(s1));
+        p1  = *((uint32_t *)(s2 - 4));
+        p5  = *((uint32_t *)(s2));
+        p0  = *((uint32_t *)(s3 - 4));
+        p4  = *((uint32_t *)(s3));
+        pm1 = *((uint32_t *)(s4 - 4));
+        p3  = *((uint32_t *)(s4));
+
+        /* transpose pm1, p0, p1, p2 (the words loaded from s - 4) */
+        __asm__ __volatile__ (
+            "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+            "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+            "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+            "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+            "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+            "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+            "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+            "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+            "append         %[p1],      %[sec3],    16          \n\t"
+            "append         %[pm1],     %[sec4],    16          \n\t"
+
+            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+              [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+            :
+        );
+
+        /* transpose p3, p4, p5, p6 (the words loaded at s) */
+        __asm__ __volatile__ (
+            "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+            "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+            "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+            "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+            "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+            "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+            "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+            "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+            "append         %[p5],      %[sec3],    16          \n\t"
+            "append         %[p3],      %[sec4],    16          \n\t"
+
+            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+              [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+            :
+        );
+
+        /* early out: when p1 == p4 and p2 == p3 the mask would be zero,
+         * so the mask computation and the filter are skipped
+         */
+        if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+        {
+
+            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                     thresh, &hev, &mask);
+
+            /* if mask == 0, filtering is not needed */
+            if (mask)
+            {
+                /* filter; p1..p4 are passed by address */
+                vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+                /* store p1..p4 one byte at a time: low bytes go to line s4,
+                 * then each srl by 8 exposes the bytes for s3, s2 and s1;
+                 * no transposed word store because memory isn't aligned
+                 */
+                __asm__ __volatile__ (
+                    "sb         %[p4],  1(%[s4])    \n\t"
+                    "sb         %[p3],  0(%[s4])    \n\t"
+                    "sb         %[p2], -1(%[s4])    \n\t"
+                    "sb         %[p1], -2(%[s4])    \n\t"
+                    :
+                    : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+                      [p2] "r" (p2), [p1] "r" (p1)
+                );
+
+                __asm__ __volatile__ (
+                    "srl        %[p4], %[p4], 8     \n\t"
+                    "srl        %[p3], %[p3], 8     \n\t"
+                    "srl        %[p2], %[p2], 8     \n\t"
+                    "srl        %[p1], %[p1], 8     \n\t"
+                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                    :
+                );
+
+                __asm__ __volatile__ (
+                    "sb         %[p4],  1(%[s3])    \n\t"
+                    "sb         %[p3],  0(%[s3])    \n\t"
+                    "sb         %[p2], -1(%[s3])    \n\t"
+                    "sb         %[p1], -2(%[s3])    \n\t"
+                    : [p1] "+r" (p1)
+                    : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+                );
+
+                __asm__ __volatile__ (
+                    "srl        %[p4], %[p4], 8     \n\t"
+                    "srl        %[p3], %[p3], 8     \n\t"
+                    "srl        %[p2], %[p2], 8     \n\t"
+                    "srl        %[p1], %[p1], 8     \n\t"
+                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                    :
+                );
+
+                __asm__ __volatile__ (
+                    "sb         %[p4],  1(%[s2])    \n\t"
+                    "sb         %[p3],  0(%[s2])    \n\t"
+                    "sb         %[p2], -1(%[s2])    \n\t"
+                    "sb         %[p1], -2(%[s2])    \n\t"
+                    :
+                    : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+                      [p2] "r" (p2), [p1] "r" (p1)
+                );
+
+                __asm__ __volatile__ (
+                    "srl        %[p4], %[p4], 8     \n\t"
+                    "srl        %[p3], %[p3], 8     \n\t"
+                    "srl        %[p2], %[p2], 8     \n\t"
+                    "srl        %[p1], %[p1], 8     \n\t"
+                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                    :
+                );
+
+                __asm__ __volatile__ (
+                    "sb         %[p4],  1(%[s1])    \n\t"
+                    "sb         %[p3],  0(%[s1])    \n\t"
+                    "sb         %[p2], -1(%[s1])    \n\t"
+                    "sb         %[p1], -2(%[s1])    \n\t"
+                    :
+                    : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+                      [p2] "r" (p2), [p1] "r" (p1)
+                );
+            }
+        }
+
+        s1 = s;
+        s2 = s + p;
+        s3 = s2 + p;
+        s4 = s3 + p;
+        s  = s4 + p;
+
+        /* next four lines: load the 4 bytes before s and the 4 bytes
+         * at s again; word accesses rely on 4-byte aligned memory
+         */
+        p2  = *((uint32_t *)(s1 - 4));
+        p6  = *((uint32_t *)(s1));
+        p1  = *((uint32_t *)(s2 - 4));
+        p5  = *((uint32_t *)(s2));
+        p0  = *((uint32_t *)(s3 - 4));
+        p4  = *((uint32_t *)(s3));
+        pm1 = *((uint32_t *)(s4 - 4));
+        p3  = *((uint32_t *)(s4));
+
+        /* transpose pm1, p0, p1, p2 (the words loaded from s - 4) */
+        __asm__ __volatile__ (
+            "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+            "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+            "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+            "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+            "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+            "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+            "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+            "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+            "append         %[p1],      %[sec3],    16          \n\t"
+            "append         %[pm1],     %[sec4],    16          \n\t"
+
+            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+              [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+            :
+        );
+
+        /* transpose p3, p4, p5, p6 (the words loaded at s) */
+        __asm__ __volatile__ (
+            "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+            "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+            "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+            "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+            "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+            "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+            "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+            "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+            "append         %[p5],      %[sec3],    16          \n\t"
+            "append         %[p3],      %[sec4],    16          \n\t"
+
+            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+              [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+            :
+        );
+
+        /* early out: when p1 == p4 and p2 == p3 the mask would be zero,
+         * so the mask computation and the filter are skipped
+         */
+        if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+        {
+
+            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                     thresh, &hev, &mask);
+
+            /* if mask == 0, filtering is not needed */
+            if (mask)
+            {
+                /* filter; p1..p4 are passed by address */
+                vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+                /* store p1..p4 one byte at a time: low bytes go to line s4,
+                 * then each srl by 8 exposes the bytes for s3, s2 and s1;
+                 * no transposed word store because memory isn't aligned
+                 */
+                __asm__ __volatile__ (
+                    "sb         %[p4],  1(%[s4])    \n\t"
+                    "sb         %[p3],  0(%[s4])    \n\t"
+                    "sb         %[p2], -1(%[s4])    \n\t"
+                    "sb         %[p1], -2(%[s4])    \n\t"
+                    :
+                    : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+                      [p2] "r" (p2), [p1] "r" (p1)
+                );
+
+                __asm__ __volatile__ (
+                    "srl        %[p4], %[p4], 8     \n\t"
+                    "srl        %[p3], %[p3], 8     \n\t"
+                    "srl        %[p2], %[p2], 8     \n\t"
+                    "srl        %[p1], %[p1], 8     \n\t"
+                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                    :
+                );
+
+                __asm__ __volatile__ (
+                    "sb         %[p4],  1(%[s3])    \n\t"
+                    "sb         %[p3],  0(%[s3])    \n\t"
+                    "sb         %[p2], -1(%[s3])    \n\t"
+                    "sb         %[p1], -2(%[s3])    \n\t"
+                    : [p1] "+r" (p1)
+                    : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+                );
+
+                __asm__ __volatile__ (
+                    "srl        %[p4], %[p4], 8     \n\t"
+                    "srl        %[p3], %[p3], 8     \n\t"
+                    "srl        %[p2], %[p2], 8     \n\t"
+                    "srl        %[p1], %[p1], 8     \n\t"
+                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                    :
+                );
+
+                __asm__ __volatile__ (
+                    "sb         %[p4],  1(%[s2])    \n\t"
+                    "sb         %[p3],  0(%[s2])    \n\t"
+                    "sb         %[p2], -1(%[s2])    \n\t"
+                    "sb         %[p1], -2(%[s2])    \n\t"
+                    :
+                    : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+                      [p2] "r" (p2), [p1] "r" (p1)
+                );
+
+                __asm__ __volatile__ (
+                    "srl        %[p4], %[p4], 8     \n\t"
+                    "srl        %[p3], %[p3], 8     \n\t"
+                    "srl        %[p2], %[p2], 8     \n\t"
+                    "srl        %[p1], %[p1], 8     \n\t"
+                    : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                    :
+                );
+
+                __asm__ __volatile__ (
+                    "sb         %[p4],  1(%[s1])    \n\t"
+                    "sb         %[p3],  0(%[s1])    \n\t"
+                    "sb         %[p2], -1(%[s1])    \n\t"
+                    "sb         %[p1], -2(%[s1])    \n\t"
+                    :
+                    : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+                      [p2] "r" (p2), [p1] "r" (p1)
+                );
+            }
+        }
+
+        i += 8;
+    }
+
+    while (i < count);
+}
+
+void vp8_loop_filter_uvvertical_edge_mips
+(
+    unsigned char *s,
+    int p,
+    unsigned int flimit,
+    unsigned int limit,
+    unsigned int thresh,
+    int count
+)
+{
+    uint32_t mask, hev;
+    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+    unsigned char *s1, *s2, *s3, *s4;
+    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+    /* Byte-wise (8-bit SIMD friendly) filter across a vertical edge: two
+     * groups of four lines are handled in sequence; count is not read.
+     */
+
+    /* apply filter on 4 pixels at the same time */
+
+    s1 = s;
+    s2 = s + p;
+    s3 = s2 + p;
+    s4 = s3 + p;
+
+    /* for four consecutive lines load the 4 bytes before s and the
+     * 4 bytes at s; word accesses rely on 4-byte aligned memory
+     */
+    p2  = *((uint32_t *)(s1 - 4));
+    p6  = *((uint32_t *)(s1));
+    p1  = *((uint32_t *)(s2 - 4));
+    p5  = *((uint32_t *)(s2));
+    p0  = *((uint32_t *)(s3 - 4));
+    p4  = *((uint32_t *)(s3));
+    pm1 = *((uint32_t *)(s4 - 4));
+    p3  = *((uint32_t *)(s4));
+
+    /* transpose pm1, p0, p1, p2 (the words loaded from s - 4) */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[pm1],     %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose p3, p4, p5, p6 (the words loaded at s) */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+        "append         %[p5],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* early out: when p1 == p4 and p2 == p3 the mask would be zero,
+     * so the mask computation and the filter are skipped
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+        if (mask)
+        {
+            /* filter; p1..p4 are passed by address */
+            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+            /* store p1..p4 one byte at a time: low bytes go to line s4,
+             * then each srl by 8 exposes the bytes for s3, s2 and s1;
+             * no transposed word store because memory isn't aligned
+             */
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s4])    \n\t"
+                "sb         %[p3],  0(%[s4])    \n\t"
+                "sb         %[p2], -1(%[s4])    \n\t"
+                "sb         %[p1], -2(%[s4])    \n\t"
+                :
+                : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+                  [p2] "r" (p2), [p1] "r" (p1)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p4], %[p4], 8     \n\t"
+                "srl        %[p3], %[p3], 8     \n\t"
+                "srl        %[p2], %[p2], 8     \n\t"
+                "srl        %[p1], %[p1], 8     \n\t"
+                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s3])    \n\t"
+                "sb         %[p3],  0(%[s3])    \n\t"
+                "sb         %[p2], -1(%[s3])    \n\t"
+                "sb         %[p1], -2(%[s3])    \n\t"
+                : [p1] "+r" (p1)
+                : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p4], %[p4], 8     \n\t"
+                "srl        %[p3], %[p3], 8     \n\t"
+                "srl        %[p2], %[p2], 8     \n\t"
+                "srl        %[p1], %[p1], 8     \n\t"
+                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s2])    \n\t"
+                "sb         %[p3],  0(%[s2])    \n\t"
+                "sb         %[p2], -1(%[s2])    \n\t"
+                "sb         %[p1], -2(%[s2])    \n\t"
+                :
+                : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+                  [p2] "r" (p2), [p1] "r" (p1)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p4], %[p4], 8     \n\t"
+                "srl        %[p3], %[p3], 8     \n\t"
+                "srl        %[p2], %[p2], 8     \n\t"
+                "srl        %[p1], %[p1], 8     \n\t"
+                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s1])    \n\t"
+                "sb         %[p3],  0(%[s1])    \n\t"
+                "sb         %[p2], -1(%[s1])    \n\t"
+                "sb         %[p1], -2(%[s1])    \n\t"
+                :
+                : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1), [p2] "r" (p2), [p1] "r" (p1)
+            );
+        }
+    }
+
+    s1 = s4 + p;
+    s2 = s1 + p;
+    s3 = s2 + p;
+    s4 = s3 + p;
+
+    /* next four lines: load the 4 bytes before s and the 4 bytes
+     * at s again; word accesses rely on 4-byte aligned memory
+     */
+    p2  = *((uint32_t *)(s1 - 4));
+    p6  = *((uint32_t *)(s1));
+    p1  = *((uint32_t *)(s2 - 4));
+    p5  = *((uint32_t *)(s2));
+    p0  = *((uint32_t *)(s3 - 4));
+    p4  = *((uint32_t *)(s3));
+    pm1 = *((uint32_t *)(s4 - 4));
+    p3  = *((uint32_t *)(s4));
+
+    /* transpose pm1, p0, p1, p2 (the words loaded from s - 4) */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[pm1],     %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose p3, p4, p5, p6 (the words loaded at s) */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+        "append         %[p5],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* early out: when p1 == p4 and p2 == p3 the mask would be zero,
+     * so the mask computation and the filter are skipped
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+        if (mask)
+        {
+            /* filter; p1..p4 are passed by address */
+            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+            /* store p1..p4 one byte at a time: low bytes go to line s4,
+             * then each srl by 8 exposes the bytes for s3, s2 and s1;
+             * no transposed word store because memory isn't aligned
+             */
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s4])    \n\t"
+                "sb         %[p3],  0(%[s4])    \n\t"
+                "sb         %[p2], -1(%[s4])    \n\t"
+                "sb         %[p1], -2(%[s4])    \n\t"
+                :
+                : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+                  [p2] "r" (p2), [p1] "r" (p1)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p4], %[p4], 8     \n\t"
+                "srl        %[p3], %[p3], 8     \n\t"
+                "srl        %[p2], %[p2], 8     \n\t"
+                "srl        %[p1], %[p1], 8     \n\t"
+                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s3])    \n\t"
+                "sb         %[p3],  0(%[s3])    \n\t"
+                "sb         %[p2], -1(%[s3])    \n\t"
+                "sb         %[p1], -2(%[s3])    \n\t"
+                : [p1] "+r" (p1)
+                : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p4], %[p4], 8     \n\t"
+                "srl        %[p3], %[p3], 8     \n\t"
+                "srl        %[p2], %[p2], 8     \n\t"
+                "srl        %[p1], %[p1], 8     \n\t"
+                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s2])    \n\t"
+                "sb         %[p3],  0(%[s2])    \n\t"
+                "sb         %[p2], -1(%[s2])    \n\t"
+                "sb         %[p1], -2(%[s2])    \n\t"
+                :
+                : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+                  [p2] "r" (p2), [p1] "r" (p1)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p4], %[p4], 8     \n\t"
+                "srl        %[p3], %[p3], 8     \n\t"
+                "srl        %[p2], %[p2], 8     \n\t"
+                "srl        %[p1], %[p1], 8     \n\t"
+                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s1])    \n\t"
+                "sb         %[p3],  0(%[s1])    \n\t"
+                "sb         %[p2], -1(%[s1])    \n\t"
+                "sb         %[p1], -2(%[s1])    \n\t"
+                :
+                : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+                  [p2] "r" (p2), [p1] "r" (p1)
+            );
+        }
+    }
+}
+
+/* Macroblock-edge filter for 4 pixels at once: mask, hev and the in/out ps2..qs2 words are quad-byte vectors, updated in place. */
+static __inline void vp8_mbfilter_mips
+(
+    uint32_t mask,
+    uint32_t hev,
+    uint32_t *ps2,
+    uint32_t *ps1,
+    uint32_t *ps0,
+    uint32_t *qs0,
+    uint32_t *qs1,
+    uint32_t *qs2
+)
+{
+    int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2;
+    int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l;
+    int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r;
+    uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r, subr_r, subr_l;
+    uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l, invhev_r;
+    uint32_t N128, R63;
+    uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r;
+
+    R63  = 0x003F003F;
+    HWM  = 0xFF00FF00;
+    N128 = 0x80808080;
+    t1   = 0x03000300;
+    t2   = 0x04000400;
+
+    vps0 = (*ps0) ^ N128;
+    vps1 = (*ps1) ^ N128;
+    vps2 = (*ps2) ^ N128;
+    vqs0 = (*qs0) ^ N128;
+    vqs1 = (*qs1) ^ N128;
+    vqs2 = (*qs2) ^ N128;
+
+    /* use halfword pairs instead of quad-bytes because of accuracy: _l keeps bytes 3 and 1, _r keeps bytes 2 and 0, each in the high byte of its halfword */
+    vps0_l = vps0 & HWM;
+    vps0_r = vps0 << 8;
+    vps0_r = vps0_r & HWM;
+
+    vqs0_l = vqs0 & HWM;
+    vqs0_r = vqs0 << 8;
+    vqs0_r = vqs0_r & HWM;
+
+    vps1_l = vps1 & HWM;
+    vps1_r = vps1 << 8;
+    vps1_r = vps1_r & HWM;
+
+    vqs1_l = vqs1 & HWM;
+    vqs1_r = vqs1 << 8;
+    vqs1_r = vqs1_r & HWM;
+
+    vqs2_l = vqs2 & HWM;
+    vqs2_r = vqs2 << 8;
+    vqs2_r = vqs2_r & HWM;
+
+    __asm__ __volatile__ (
+        /* subr = qs0 - ps0 (saturating, per halfword) */
+        "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
+        "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
+
+        /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
+        "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+        "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+
+        : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=r" (vp8_filter_r),
+          [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r)
+        : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+          [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+          [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r)
+    );
+
+    vps2_l = vps2 & HWM;
+    vps2_r = vps2 << 8;
+    vps2_r = vps2_r & HWM;
+
+    /* add 3 * (qs0 - ps0) to the filter, split mask and hev into halfword pairs, then apply them */
+    __asm__ __volatile__ (
+        /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); done as three saturating adds */
+        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
+        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
+        "and          %[mask_l],       %[HWM],          %[mask]         \n\t"
+        "sll          %[mask_r],       %[mask],         8               \n\t"
+        "and          %[mask_r],       %[HWM],          %[mask_r]       \n\t"
+        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
+        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
+        "and          %[hev_l],        %[HWM],          %[hev]          \n\t"
+        "sll          %[hev_r],        %[hev],          8               \n\t"
+        "and          %[hev_r],        %[HWM],          %[hev_r]        \n\t"
+        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
+        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
+
+        /* vp8_filter &= mask; */
+        "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
+        "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"
+
+        /* Filter2 = vp8_filter & hev; */
+        "and          %[Filter2_l],    %[vp8_filter_l], %[hev_l]        \n\t"
+        "and          %[Filter2_r],    %[vp8_filter_r], %[hev_r]        \n\t"
+
+        : [vp8_filter_l] "+r" (vp8_filter_l), [vp8_filter_r] "+r" (vp8_filter_r),
+          [hev_l] "=&r" (hev_l), [hev_r] "=&r" (hev_r),
+          [mask_l] "=&r" (mask_l), [mask_r] "=&r" (mask_r),
+          [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
+        : [subr_l] "r" (subr_l), [subr_r] "r" (subr_r),
+          [HWM] "r" (HWM), [hev]  "r" (hev), [mask] "r" (mask)
+    );
+
+    /* save bottom 3 bits so that we round one side +4 and the other +3; invhev = hev ^ HWM is also formed here */
+    __asm__ __volatile__ (
+        /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >>= 3; */
+        "addq_s.ph    %[Filter1_l],    %[Filter2_l],    %[t2]           \n\t"
+        "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
+        "addq_s.ph    %[Filter1_r],    %[Filter2_r],    %[t2]           \n\t"
+
+        /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >>= 3; */
+        "addq_s.ph    %[Filter2_l],    %[Filter2_l],    %[t1]           \n\t"
+        "addq_s.ph    %[Filter2_r],    %[Filter2_r],    %[t1]           \n\t"
+
+        "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
+        "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
+
+        "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
+        "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
+        "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
+        "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
+        "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
+
+        /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+        "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
+        "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
+
+        /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+        "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
+        "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
+
+        : [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r),
+          [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+          [Filter2_l] "+r" (Filter2_l), [Filter2_r] "+r" (Filter2_r),
+          [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+          [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+        : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
+          [hev_l] "r" (hev_l), [hev_r] "r" (hev_r)
+    );
+
+    /* only apply wider filter if not high edge variance */
+    __asm__ __volatile__ (
+        /* Filter2 = vp8_filter & ~hev; then shifted (arithmetically) down into the low byte of each halfword */
+        "and          %[Filter2_l],    %[vp8_filter_l], %[invhev_l]     \n\t"
+        "and          %[Filter2_r],    %[vp8_filter_r], %[invhev_r]     \n\t"
+
+        "shra.ph      %[Filter2_l],    %[Filter2_l],    8               \n\t"
+        "shra.ph      %[Filter2_r],    %[Filter2_r],    8               \n\t"
+
+        : [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
+        : [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
+          [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+    );
+
+    /* roughly 3/7th difference across boundary; u3 = Filter2 * 9, u2 = Filter2 * 18, u1 = Filter2 * 27 */
+    __asm__ __volatile__ (
+        "shll.ph      %[u3_l],         %[Filter2_l],    3               \n\t"
+        "shll.ph      %[u3_r],         %[Filter2_r],    3               \n\t"
+
+        "addq.ph      %[u3_l],         %[u3_l],         %[Filter2_l]    \n\t"
+        "addq.ph      %[u3_r],         %[u3_r],         %[Filter2_r]    \n\t"
+
+        "shll.ph      %[u2_l],         %[u3_l],         1               \n\t"
+        "shll.ph      %[u2_r],         %[u3_r],         1               \n\t"
+
+        "addq.ph      %[u1_l],         %[u3_l],         %[u2_l]         \n\t"
+        "addq.ph      %[u1_r],         %[u3_r],         %[u2_r]         \n\t"
+
+        "addq.ph      %[u2_l],         %[u2_l],         %[R63]          \n\t"
+        "addq.ph      %[u2_r],         %[u2_r],         %[R63]          \n\t"
+
+        "addq.ph      %[u3_l],         %[u3_l],         %[R63]          \n\t"
+        "addq.ph      %[u3_r],         %[u3_r],         %[R63]          \n\t"
+
+        /* u1 = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7)
+         * u2 = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7)
+         * (both are shifted back into the high byte of each halfword below) */
+        "addq.ph      %[u1_l],         %[u1_l],         %[R63]          \n\t"
+        "addq.ph      %[u1_r],         %[u1_r],         %[R63]          \n\t"
+        "shra.ph      %[u1_l],         %[u1_l],         7               \n\t"
+        "shra.ph      %[u1_r],         %[u1_r],         7               \n\t"
+        "shra.ph      %[u2_l],         %[u2_l],         7               \n\t"
+        "shra.ph      %[u2_r],         %[u2_r],         7               \n\t"
+        "shll.ph      %[u1_l],         %[u1_l],         8               \n\t"
+        "shll.ph      %[u1_r],         %[u1_r],         8               \n\t"
+        "shll.ph      %[u2_l],         %[u2_l],         8               \n\t"
+        "shll.ph      %[u2_r],         %[u2_r],         8               \n\t"
+
+        /* vqs0 = vp8_signed_char_clamp(qs0 - u1); */
+        "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[u1_l]         \n\t"
+        "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[u1_r]         \n\t"
+
+        /* vps0 = vp8_signed_char_clamp(ps0 + u1); */
+        "addq_s.ph    %[vps0_l],       %[vps0_l],       %[u1_l]         \n\t"
+        "addq_s.ph    %[vps0_r],       %[vps0_r],       %[u1_r]         \n\t"
+
+        : [u1_l] "=&r" (u1_l), [u1_r] "=&r" (u1_r), [u2_l] "=&r" (u2_l),
+          [u2_r] "=&r" (u2_r), [u3_l] "=&r" (u3_l), [u3_r] "=&r" (u3_r),
+          [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+          [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+        : [R63]  "r" (R63),
+          [Filter2_l] "r" (Filter2_l), [Filter2_r] "r" (Filter2_r)
+    );
+
+    __asm__ __volatile__ (
+        /* left halves: vqs1 = vp8_signed_char_clamp(qs1 - u2); vps1 = vp8_signed_char_clamp(ps1 + u2); */
+        "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[u2_l]         \n\t"
+        "addq_s.ph    %[vps1_l],       %[vps1_l],       %[u2_l]         \n\t"
+
+        /* right halves: vps1 = vp8_signed_char_clamp(ps1 + u2); vqs1 = vp8_signed_char_clamp(qs1 - u2); */
+        "addq_s.ph    %[vps1_r],       %[vps1_r],       %[u2_r]         \n\t"
+        "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[u2_r]         \n\t"
+
+        : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+          [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+        : [u2_l] "r" (u2_l), [u2_r] "r" (u2_r)
+    );
+
+    /* roughly 1/7th difference across boundary */
+    __asm__ __volatile__ (
+        /* u3 = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); the 63 was already added above */
+        "shra.ph      %[u3_l],         %[u3_l],         7               \n\t"
+        "shra.ph      %[u3_r],         %[u3_r],         7               \n\t"
+        "shll.ph      %[u3_l],         %[u3_l],         8               \n\t"
+        "shll.ph      %[u3_r],         %[u3_r],         8               \n\t"
+
+        /* vqs2 = vp8_signed_char_clamp(qs2 - u3); */
+        "subq_s.ph    %[vqs2_l],       %[vqs2_l],       %[u3_l]         \n\t"
+        "subq_s.ph    %[vqs2_r],       %[vqs2_r],       %[u3_r]         \n\t"
+
+        /* vps2 = vp8_signed_char_clamp(ps2 + u3); */
+        "addq_s.ph    %[vps2_l],       %[vps2_l],       %[u3_l]         \n\t"
+        "addq_s.ph    %[vps2_r],       %[vps2_r],       %[u3_r]         \n\t"
+
+        : [u3_l] "+r" (u3_l), [u3_r] "+r" (u3_r), [vps2_l] "+r" (vps2_l),
+          [vps2_r] "+r" (vps2_r), [vqs2_l] "+r" (vqs2_l), [vqs2_r] "+r" (vqs2_r)
+        :
+    );
+
+    /* Create quad-bytes from halfword pairs; the merged words end up in the _r variables */
+    __asm__ __volatile__ (
+        "and          %[vqs0_l],       %[vqs0_l],       %[HWM]          \n\t"
+        "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"
+
+        "and          %[vps0_l],       %[vps0_l],       %[HWM]          \n\t"
+        "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"
+
+        "and          %[vqs1_l],       %[vqs1_l],       %[HWM]          \n\t"
+        "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"
+
+        "and          %[vps1_l],       %[vps1_l],       %[HWM]          \n\t"
+        "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"
+
+        "and          %[vqs2_l],       %[vqs2_l],       %[HWM]          \n\t"
+        "shrl.ph      %[vqs2_r],       %[vqs2_r],       8               \n\t"
+
+        "and          %[vps2_l],       %[vps2_l],       %[HWM]          \n\t"
+        "shrl.ph      %[vps2_r],       %[vps2_r],       8               \n\t"
+
+        "or           %[vqs0_r],       %[vqs0_l],       %[vqs0_r]       \n\t"
+        "or           %[vps0_r],       %[vps0_l],       %[vps0_r]       \n\t"
+        "or           %[vqs1_r],       %[vqs1_l],       %[vqs1_r]       \n\t"
+        "or           %[vps1_r],       %[vps1_l],       %[vps1_r]       \n\t"
+        "or           %[vqs2_r],       %[vqs2_l],       %[vqs2_r]       \n\t"
+        "or           %[vps2_r],       %[vps2_l],       %[vps2_r]       \n\t"
+
+        : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), [vqs1_l] "+r" (vqs1_l),
+          [vqs1_r] "+r" (vqs1_r), [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+          [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r), [vqs2_l] "+r" (vqs2_l),
+          [vqs2_r] "+r" (vqs2_r), [vps2_r] "+r" (vps2_r), [vps2_l] "+r" (vps2_l)
+        : [HWM] "r" (HWM)
+    );
+
+    *ps0 = vps0_r ^ N128;
+    *ps1 = vps1_r ^ N128;
+    *ps2 = vps2_r ^ N128;
+    *qs0 = vqs0_r ^ N128;
+    *qs1 = vqs1_r ^ N128;
+    *qs2 = vqs2_r ^ N128;
+}
+
+void vp8_mbloop_filter_horizontal_edge_mips
+(
+    unsigned char *s,
+    int p,
+    unsigned int flimit,
+    unsigned int limit,
+    unsigned int thresh,
+    int count
+)
+{
+    int i;
+    uint32_t mask, hev;
+    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+    unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+
+    mask = 0;
+    hev = 0;
+    i = 0;
+    p1 = 0;
+    p2 = 0;
+    p3 = 0;
+    p4 = 0;
+
+    /* loop filter designed to work using chars so that we can make maximum use
+     * of 8 bit simd instructions. sm1..s2 are the four rows above s (stride p),
+     * s3..s6 are the row at s and the three rows below it. */
+
+    sm1 = s - (p << 2);
+    s0  = s - p - p - p;
+    s1  = s - p - p;
+    s2  = s - p;
+    s3  = s;
+    s4  = s + p;
+    s5  = s + p + p;
+    s6  = s + p + p + p;
+
+    /* prefetch data for load */
+    prefetch_load_lf(s + p);
+
+    /* apply filter on 4 pixels at the same time; the body is unrolled twice, so each pass covers 8 pixels */
+    do
+    {
+        /* load quad-byte vectors
+         * memory is 4 byte aligned
+         */
+        p1 = *((uint32_t *)(s1));
+        p2 = *((uint32_t *)(s2));
+        p3 = *((uint32_t *)(s3));
+        p4 = *((uint32_t *)(s4));
+
+        /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+         * mask will be zero and filtering is not needed
+         */
+        if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+        {
+
+            pm1 = *((uint32_t *)(sm1));
+            p0  = *((uint32_t *)(s0));
+            p5  = *((uint32_t *)(s5));
+            p6  = *((uint32_t *)(s6));
+
+            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                     thresh, &hev, &mask);
+
+            /* if mask == 0 filtering is not needed */
+            if (mask)
+            {
+                /* filtering */
+                vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+                /* store the six filtered rows (4 pixels each)
+                 * memory is 4 byte aligned
+                 */
+                *((uint32_t *)s0) = p0;
+                *((uint32_t *)s1) = p1;
+                *((uint32_t *)s2) = p2;
+                *((uint32_t *)s3) = p3;
+                *((uint32_t *)s4) = p4;
+                *((uint32_t *)s5) = p5;
+            }
+        }
+
+        sm1 += 4;
+        s0  += 4;
+        s1  += 4;
+        s2  += 4;
+        s3  += 4;
+        s4  += 4;
+        s5  += 4;
+        s6  += 4;
+
+        /* second group of 4 pixels: load quad-byte vectors
+         * memory is 4 byte aligned
+         */
+        p1 = *((uint32_t *)(s1));
+        p2 = *((uint32_t *)(s2));
+        p3 = *((uint32_t *)(s3));
+        p4 = *((uint32_t *)(s4));
+
+        /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+         * mask will be zero and filtering is not needed
+         */
+        if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+        {
+
+            pm1 = *((uint32_t *)(sm1));
+            p0  = *((uint32_t *)(s0));
+            p5  = *((uint32_t *)(s5));
+            p6  = *((uint32_t *)(s6));
+
+            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                     thresh, &hev, &mask);
+
+            /* if mask == 0 filtering is not needed */
+            if (mask)
+            {
+                /* filtering */
+                vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+                /* store the six filtered rows (4 pixels each)
+                 * memory is 4 byte aligned
+                 */
+                *((uint32_t *)s0) = p0;
+                *((uint32_t *)s1) = p1;
+                *((uint32_t *)s2) = p2;
+                *((uint32_t *)s3) = p3;
+                *((uint32_t *)s4) = p4;
+                *((uint32_t *)s5) = p5;
+            }
+        }
+
+        sm1 += 4;
+        s0  += 4;
+        s1  += 4;
+        s2  += 4;
+        s3  += 4;
+        s4  += 4;
+        s5  += 4;
+        s6  += 4;
+
+        i += 8;
+    }
+
+    while (i < count);
+}
+
+void vp8_mbloop_filter_uvhorizontal_edge_mips
+(
+    unsigned char *s,
+    int p,
+    unsigned int flimit,
+    unsigned int limit,
+    unsigned int thresh,
+    int count
+)
+{
+    uint32_t mask, hev;
+    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+    unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+
+    mask = 0;
+    hev = 0;
+    p1 = 0;
+    p2 = 0;
+    p3 = 0;
+    p4 = 0;
+
+    /* loop filter designed to work using chars so that we can make maximum use
+     * of 8 bit simd instructions. 8 pixels are handled here as two groups of 4,
+     * without a loop; the count argument is not used by this function. */
+
+    sm1 = s - (p << 2);
+    s0  = s - p - p - p;
+    s1  = s - p - p;
+    s2  = s - p;
+    s3  = s;
+    s4  = s + p;
+    s5  = s + p + p;
+    s6  = s + p + p + p;
+
+    /* first group of 4 pixels: load quad-byte vectors
+     * memory is 4 byte aligned
+     */
+    p1 = *((uint32_t *)(s1));
+    p2 = *((uint32_t *)(s2));
+    p3 = *((uint32_t *)(s3));
+    p4 = *((uint32_t *)(s4));
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        pm1 = *((uint32_t *)(sm1));
+        p0  = *((uint32_t *)(s0));
+        p5  = *((uint32_t *)(s5));
+        p6  = *((uint32_t *)(s6));
+
+        /* compute hev and mask; if mask == 0 filtering is not needed */
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        if (mask)
+        {
+            /* filtering */
+            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+            /* store the six filtered rows (4 pixels each)
+             * memory is 4 byte aligned
+             */
+            *((uint32_t *)s0) = p0;
+            *((uint32_t *)s1) = p1;
+            *((uint32_t *)s2) = p2;
+            *((uint32_t *)s3) = p3;
+            *((uint32_t *)s4) = p4;
+            *((uint32_t *)s5) = p5;
+        }
+    }
+
+    sm1 += 4;
+    s0  += 4;
+    s1  += 4;
+    s2  += 4;
+    s3  += 4;
+    s4  += 4;
+    s5  += 4;
+    s6  += 4;
+
+    /* second group of 4 pixels: load quad-byte vectors
+     * memory is 4 byte aligned
+     */
+    p1 = *((uint32_t *)(s1));
+    p2 = *((uint32_t *)(s2));
+    p3 = *((uint32_t *)(s3));
+    p4 = *((uint32_t *)(s4));
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        pm1 = *((uint32_t *)(sm1));
+        p0  = *((uint32_t *)(s0));
+        p5  = *((uint32_t *)(s5));
+        p6  = *((uint32_t *)(s6));
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0 filtering is not needed */
+        if (mask)
+        {
+            /* filtering */
+            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+            /* store the six filtered rows (4 pixels each)
+             * memory is 4 byte aligned
+             */
+            *((uint32_t *)s0) = p0;
+            *((uint32_t *)s1) = p1;
+            *((uint32_t *)s2) = p2;
+            *((uint32_t *)s3) = p3;
+            *((uint32_t *)s4) = p4;
+            *((uint32_t *)s5) = p5;
+        }
+    }
+}
+
+
+void vp8_mbloop_filter_vertical_edge_mips
+(
+    unsigned char *s,
+    int p,
+    unsigned int flimit,
+    unsigned int limit,
+    unsigned int thresh,
+    int count
+)
+{
+
+    int i;
+    uint32_t mask, hev;
+    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+    unsigned char *s1, *s2, *s3, *s4;
+    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+    mask = 0;
+    hev = 0;
+    i = 0;
+    pm1 = 0;
+    p0 = 0;
+    p1 = 0;
+    p2 = 0;
+    p3 = 0;
+    p4 = 0;
+    p5 = 0;
+    p6 = 0;
+
+    /* loop filter designed to work using chars so that we can make maximum use
+     * of 8 bit simd instructions.
+     */
+
+    /* apply filter on 4 pixels at the same time: each iteration takes 4 rows (s1..s4) and advances s by 4 * p */
+    do
+    {
+        s1 = s;
+        s2 = s + p;
+        s3 = s2 + p;
+        s4 = s3 + p;
+        s  = s4 + p;
+
+        /* load quad-byte vectors: the 4 bytes before and the 4 bytes at each row pointer
+         * memory is 4 byte aligned
+         */
+        p2  = *((uint32_t *)(s1 - 4));
+        p6  = *((uint32_t *)(s1));
+        p1  = *((uint32_t *)(s2 - 4));
+        p5  = *((uint32_t *)(s2));
+        p0  = *((uint32_t *)(s3 - 4));
+        p4  = *((uint32_t *)(s3));
+        pm1 = *((uint32_t *)(s4 - 4));
+        p3  = *((uint32_t *)(s4));
+
+        /* transpose pm1, p0, p1, p2 */
+        __asm__ __volatile__ (
+            "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+            "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+            "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+            "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+            "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+            "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+            "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+            "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+            "append         %[p1],      %[sec3],    16          \n\t"
+            "append         %[pm1],     %[sec4],    16          \n\t"
+
+            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+              [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+            :
+        );
+
+        /* transpose p3, p4, p5, p6 */
+        __asm__ __volatile__ (
+            "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+            "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+            "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+            "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+            "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+            "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+            "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+            "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+            "append         %[p5],      %[sec3],    16          \n\t"
+            "append         %[p3],      %[sec4],    16          \n\t"
+
+            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+              [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+            :
+        );
+
+        /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+         * mask will be zero and filtering is not needed
+         */
+        if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+        {
+
+            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                     thresh, &hev, &mask);
+
+            /* if mask == 0 filtering is not needed */
+            if (mask)
+            {
+                /* filtering */
+                vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+                /* don't use transpose on output data
+                 * because memory isn't aligned; store the low byte of p0..p5 to one row
+                 * (s4 first, then s3, s2, s1), shifting each word right by 8 between rows */
+                __asm__ __volatile__ (
+                    "sb         %[p5],  2(%[s4])        \n\t"
+                    "sb         %[p4],  1(%[s4])        \n\t"
+                    "sb         %[p3],  0(%[s4])        \n\t"
+                    "sb         %[p2], -1(%[s4])        \n\t"
+                    "sb         %[p1], -2(%[s4])        \n\t"
+                    "sb         %[p0], -3(%[s4])        \n\t"
+                    :
+                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+                );
+
+                __asm__ __volatile__ (
+                    "srl        %[p5], %[p5], 8         \n\t"
+                    "srl        %[p4], %[p4], 8         \n\t"
+                    "srl        %[p3], %[p3], 8         \n\t"
+                    "srl        %[p2], %[p2], 8         \n\t"
+                    "srl        %[p1], %[p1], 8         \n\t"
+                    "srl        %[p0], %[p0], 8         \n\t"
+                    : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                      [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                    :
+                );
+
+                __asm__ __volatile__ (
+                    "sb         %[p5],  2(%[s3])        \n\t"
+                    "sb         %[p4],  1(%[s3])        \n\t"
+                    "sb         %[p3],  0(%[s3])        \n\t"
+                    "sb         %[p2], -1(%[s3])        \n\t"
+                    "sb         %[p1], -2(%[s3])        \n\t"
+                    "sb         %[p0], -3(%[s3])        \n\t"
+                    :
+                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
+                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+                );
+
+                __asm__ __volatile__ (
+                    "srl        %[p5], %[p5], 8         \n\t"
+                    "srl        %[p4], %[p4], 8         \n\t"
+                    "srl        %[p3], %[p3], 8         \n\t"
+                    "srl        %[p2], %[p2], 8         \n\t"
+                    "srl        %[p1], %[p1], 8         \n\t"
+                    "srl        %[p0], %[p0], 8         \n\t"
+                    : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                      [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                    :
+                );
+
+                __asm__ __volatile__ (
+                    "sb         %[p5],  2(%[s2])        \n\t"
+                    "sb         %[p4],  1(%[s2])        \n\t"
+                    "sb         %[p3],  0(%[s2])        \n\t"
+                    "sb         %[p2], -1(%[s2])        \n\t"
+                    "sb         %[p1], -2(%[s2])        \n\t"
+                    "sb         %[p0], -3(%[s2])        \n\t"
+                    :
+                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+                );
+
+                __asm__ __volatile__ (
+                    "srl        %[p5], %[p5], 8         \n\t"
+                    "srl        %[p4], %[p4], 8         \n\t"
+                    "srl        %[p3], %[p3], 8         \n\t"
+                    "srl        %[p2], %[p2], 8         \n\t"
+                    "srl        %[p1], %[p1], 8         \n\t"
+                    "srl        %[p0], %[p0], 8         \n\t"
+                    : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                      [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                    :
+                );
+
+                __asm__ __volatile__ (
+                    "sb         %[p5],  2(%[s1])        \n\t"
+                    "sb         %[p4],  1(%[s1])        \n\t"
+                    "sb         %[p3],  0(%[s1])        \n\t"
+                    "sb         %[p2], -1(%[s1])        \n\t"
+                    "sb         %[p1], -2(%[s1])        \n\t"
+                    "sb         %[p0], -3(%[s1])        \n\t"
+                    :
+                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+                );
+            }
+        }
+
+        i += 4;
+    }
+
+    while (i < count);
+}
+
+void vp8_mbloop_filter_uvvertical_edge_mips
+(
+    unsigned char *s,
+    int p,
+    unsigned int flimit,
+    unsigned int limit,
+    unsigned int thresh,
+    int count
+)
+{
+    uint32_t mask, hev;
+    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+    unsigned char *s1, *s2, *s3, *s4;
+    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+    mask = 0;
+    hev = 0;
+    pm1 = 0;
+    p0 = 0;
+    p1 = 0;
+    p2 = 0;
+    p3 = 0;
+    p4 = 0;
+    p5 = 0;
+    p6 = 0;
+
+    /* loop filter designed to work using chars so that we can make maximum use
+     * of 8 bit simd instructions.
+     */
+
+    /* apply filter on 4 pixesl at the same time */
+
+    s1 = s;
+    s2 = s + p;
+    s3 = s2 + p;
+    s4 = s3 + p;
+
+    /* prefetch data for load */
+    prefetch_load_lf(s + 2 * p);
+
+    /* load quad-byte vectors
+     * memory is 4 byte aligned
+     */
+    p2  = *((uint32_t *)(s1 - 4));
+    p6  = *((uint32_t *)(s1));
+    p1  = *((uint32_t *)(s2 - 4));
+    p5  = *((uint32_t *)(s2));
+    p0  = *((uint32_t *)(s3 - 4));
+    p4  = *((uint32_t *)(s3));
+    pm1 = *((uint32_t *)(s4 - 4));
+    p3  = *((uint32_t *)(s4));
+
+    /* transpose pm1, p0, p1, p2 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[pm1],     %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose p3, p4, p5, p6 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+        "append         %[p5],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0 do filtering is not needed */
+        if (mask)
+        {
+            /* filtering */
+            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+            /* don't use transpose on output data
+             * because memory isn't aligned
+             */
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s4])        \n\t"
+                "sb         %[p4],  1(%[s4])        \n\t"
+                "sb         %[p3],  0(%[s4])        \n\t"
+                "sb         %[p2], -1(%[s4])        \n\t"
+                "sb         %[p1], -2(%[s4])        \n\t"
+                "sb         %[p0], -3(%[s4])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p5], %[p5], 8         \n\t"
+                "srl        %[p4], %[p4], 8         \n\t"
+                "srl        %[p3], %[p3], 8         \n\t"
+                "srl        %[p2], %[p2], 8         \n\t"
+                "srl        %[p1], %[p1], 8         \n\t"
+                "srl        %[p0], %[p0], 8         \n\t"
+                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s3])        \n\t"
+                "sb         %[p4],  1(%[s3])        \n\t"
+                "sb         %[p3],  0(%[s3])        \n\t"
+                "sb         %[p2], -1(%[s3])        \n\t"
+                "sb         %[p1], -2(%[s3])        \n\t"
+                "sb         %[p0], -3(%[s3])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p5], %[p5], 8         \n\t"
+                "srl        %[p4], %[p4], 8         \n\t"
+                "srl        %[p3], %[p3], 8         \n\t"
+                "srl        %[p2], %[p2], 8         \n\t"
+                "srl        %[p1], %[p1], 8         \n\t"
+                "srl        %[p0], %[p0], 8         \n\t"
+                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s2])        \n\t"
+                "sb         %[p4],  1(%[s2])        \n\t"
+                "sb         %[p3],  0(%[s2])        \n\t"
+                "sb         %[p2], -1(%[s2])        \n\t"
+                "sb         %[p1], -2(%[s2])        \n\t"
+                "sb         %[p0], -3(%[s2])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p5], %[p5], 8         \n\t"
+                "srl        %[p4], %[p4], 8         \n\t"
+                "srl        %[p3], %[p3], 8         \n\t"
+                "srl        %[p2], %[p2], 8         \n\t"
+                "srl        %[p1], %[p1], 8         \n\t"
+                "srl        %[p0], %[p0], 8         \n\t"
+                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s1])        \n\t"
+                "sb         %[p4],  1(%[s1])        \n\t"
+                "sb         %[p3],  0(%[s1])        \n\t"
+                "sb         %[p2], -1(%[s1])        \n\t"
+                "sb         %[p1], -2(%[s1])        \n\t"
+                "sb         %[p0], -3(%[s1])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+        }
+    }
+
+    s1 = s4 + p;
+    s2 = s1 + p;
+    s3 = s2 + p;
+    s4 = s3 + p;
+
+    /* load quad-byte vectors
+    * memory is 4 byte aligned
+    */
+    p2  = *((uint32_t *)(s1 - 4));
+    p6  = *((uint32_t *)(s1));
+    p1  = *((uint32_t *)(s2 - 4));
+    p5  = *((uint32_t *)(s2));
+    p0  = *((uint32_t *)(s3 - 4));
+    p4  = *((uint32_t *)(s3));
+    pm1 = *((uint32_t *)(s4 - 4));
+    p3  = *((uint32_t *)(s4));
+
+    /* transpose pm1, p0, p1, p2 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[pm1],     %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose p3, p4, p5, p6 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+        "append         %[p5],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, thresh, &hev, &mask);
+
+        /* if mask == 0 do filtering is not needed */
+        if (mask)
+        {
+            /* filtering */
+            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+            /* don't use transpose on output data
+             * because memory isn't aligned
+             */
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s4])        \n\t"
+                "sb         %[p4],  1(%[s4])        \n\t"
+                "sb         %[p3],  0(%[s4])        \n\t"
+                "sb         %[p2], -1(%[s4])        \n\t"
+                "sb         %[p1], -2(%[s4])        \n\t"
+                "sb         %[p0], -3(%[s4])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p5], %[p5], 8         \n\t"
+                "srl        %[p4], %[p4], 8         \n\t"
+                "srl        %[p3], %[p3], 8         \n\t"
+                "srl        %[p2], %[p2], 8         \n\t"
+                "srl        %[p1], %[p1], 8         \n\t"
+                "srl        %[p0], %[p0], 8         \n\t"
+                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s3])        \n\t"
+                "sb         %[p4],  1(%[s3])        \n\t"
+                "sb         %[p3],  0(%[s3])        \n\t"
+                "sb         %[p2], -1(%[s3])        \n\t"
+                "sb         %[p1], -2(%[s3])        \n\t"
+                "sb         %[p0], -3(%[s3])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p5], %[p5], 8         \n\t"
+                "srl        %[p4], %[p4], 8         \n\t"
+                "srl        %[p3], %[p3], 8         \n\t"
+                "srl        %[p2], %[p2], 8         \n\t"
+                "srl        %[p1], %[p1], 8         \n\t"
+                "srl        %[p0], %[p0], 8         \n\t"
+                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s2])        \n\t"
+                "sb         %[p4],  1(%[s2])        \n\t"
+                "sb         %[p3],  0(%[s2])        \n\t"
+                "sb         %[p2], -1(%[s2])        \n\t"
+                "sb         %[p1], -2(%[s2])        \n\t"
+                "sb         %[p0], -3(%[s2])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p5], %[p5], 8         \n\t"
+                "srl        %[p4], %[p4], 8         \n\t"
+                "srl        %[p3], %[p3], 8         \n\t"
+                "srl        %[p2], %[p2], 8         \n\t"
+                "srl        %[p1], %[p1], 8         \n\t"
+                "srl        %[p0], %[p0], 8         \n\t"
+                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s1])        \n\t"
+                "sb         %[p4],  1(%[s1])        \n\t"
+                "sb         %[p3],  0(%[s1])        \n\t"
+                "sb         %[p2], -1(%[s1])        \n\t"
+                "sb         %[p1], -2(%[s1])        \n\t"
+                "sb         %[p0], -3(%[s1])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+        }
+    }
+}
+
+/* Horizontal MB filtering.
+ *
+ * Reads the byte-sized parameters lfi->lim, lfi->mblim and lfi->hev_thr,
+ * replicates each into a quad-byte word and hands those words to the
+ * macroblock horizontal-edge filters: always for the data at y_ptr, and
+ * for the data at u_ptr / v_ptr only when those pointers are non-NULL.
+ */
+void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    /* copy the parameters out of lfi by value */
+    const unsigned char limit = *(lfi->lim);
+    const unsigned char flimit = *(lfi->mblim);
+    const unsigned char thresh = *(lfi->hev_thr);
+    unsigned int thresh_vec, flimit_vec, limit_vec;
+
+    /* replv.qb: build one quad-byte word per parameter */
+    __asm__ __volatile__ (
+        "replv.qb       %[thresh_vec], %[thresh]    \n\t"
+        "replv.qb       %[flimit_vec], %[flimit]    \n\t"
+        "replv.qb       %[limit_vec],  %[limit]     \n\t"
+        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+    );
+
+    vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+    /* the u and v pointers are optional */
+    if (u_ptr)
+        vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+
+    if (v_ptr)
+        vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+}
+
+
+/* Vertical MB filtering.
+ * Replicates lfi->lim, lfi->mblim and lfi->hev_thr into quad-byte words and
+ * passes them to the macroblock vertical-edge filters: always for y_ptr,
+ * and for u_ptr / v_ptr only when those pointers are non-NULL. */
+void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    /* copy the parameters out of lfi by value */
+    const unsigned char limit = *(lfi->lim);
+    const unsigned char flimit = *(lfi->mblim);
+    const unsigned char thresh = *(lfi->hev_thr);
+    unsigned int thresh_vec, flimit_vec, limit_vec;
+
+    /* replv.qb: build one quad-byte word per parameter */
+    __asm__ __volatile__ (
+        "replv.qb       %[thresh_vec], %[thresh]    \n\t"
+        "replv.qb       %[flimit_vec], %[flimit]    \n\t"
+        "replv.qb       %[limit_vec],  %[limit]     \n\t"
+        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+    );
+
+    vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+    if (u_ptr)
+        vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+
+    if (v_ptr)
+        vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+}
+
+
+/* Horizontal B filtering.
+ * Replicates lfi->lim, lfi->blim and lfi->hev_thr into quad-byte words, then
+ * filters the edges at offsets of 4, 8 and 12 strides from y_ptr and, for
+ * each non-NULL u_ptr / v_ptr, the edge 4 strides from that pointer. */
+void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    /* copy the parameters out of lfi by value */
+    const unsigned char limit = *(lfi->lim);
+    const unsigned char flimit = *(lfi->blim);
+    const unsigned char thresh = *(lfi->hev_thr);
+    unsigned int thresh_vec, flimit_vec, limit_vec;
+
+    /* replv.qb: build one quad-byte word per parameter */
+    __asm__ __volatile__ (
+        "replv.qb       %[thresh_vec], %[thresh]    \n\t"
+        "replv.qb       %[flimit_vec], %[flimit]    \n\t"
+        "replv.qb       %[limit_vec],  %[limit]     \n\t"
+        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+    );
+
+    vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+    vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+    vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+    if (u_ptr)
+        vp8_loop_filter_uvhorizontal_edge_mips(u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+
+    if (v_ptr)
+        vp8_loop_filter_uvhorizontal_edge_mips(v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+}
+
+
+/* Vertical B filtering.
+ * Replicates lfi->lim, lfi->blim and lfi->hev_thr into quad-byte words, then
+ * filters the edges at byte offsets 4, 8 and 12 from y_ptr and, for each
+ * non-NULL u_ptr / v_ptr, the edge at byte offset 4 from that pointer. */
+void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    /* copy the parameters out of lfi by value */
+    const unsigned char limit = *(lfi->lim);
+    const unsigned char flimit = *(lfi->blim);
+    const unsigned char thresh = *(lfi->hev_thr);
+    unsigned int thresh_vec, flimit_vec, limit_vec;
+
+    /* replv.qb: build one quad-byte word per parameter */
+    __asm__ __volatile__ (
+        "replv.qb       %[thresh_vec], %[thresh]    \n\t"
+        "replv.qb       %[flimit_vec], %[flimit]    \n\t"
+        "replv.qb       %[limit_vec],  %[limit]     \n\t"
+        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+    );
+
+    vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+    vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+    vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+    if (u_ptr)
+        vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+
+    if (v_ptr)
+        vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+}
+
+#endif
diff --git a/libs/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c b/libs/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c
new file mode 100644
index 0000000000..1054ed3997
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/msa/bilinear_filter_msa.c
@@ -0,0 +1,911 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vp8/common/filter.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+DECLARE_ALIGNED(16, static const int8_t, vp8_bilinear_filters_msa[7][2]) =
+{   /* seven { tap0, tap1 } pairs; the two taps of every pair sum to 128 */
+    { 112, 16 },
+    { 96, 32 },
+    { 80, 48 },
+    { 64, 64 },   /* both taps equal */
+    { 48, 80 },
+    { 32, 96 },
+    { 16, 112 }
+};  /* NOTE(review): presumably indexed by sub-pixel offset minus one -- confirm at the call sites */
+
+static const uint8_t vp8_mc_filt_mask_arr[16 * 3] =
+{   /* three rows of 16 byte indices; each row lists index pairs (k, k + 1) */
+    /* 8 width cases: pairs (0,1) through (7,8) */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases: pairs (0,1)..(3,4), then (16,17)..(19,20) */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases: pairs (8,9)..(11,12), then (24,25)..(27,28) */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+/* Horizontal 2-tap filter of one 4x4 block; macro roles noted below are inferred from their names. */
+static void common_hz_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, mask;
+    v16u8 filt0, vec0, vec1, res0, res1;
+    v8u16 vec2, vec3, filt;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);   /* second mask row ("4 width cases") */
+    /* NOTE(review): LD_UH presumably loads a whole vector; only its halfword 0 is used -- confirm */
+    filt = LD_UH(filter);
+    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);   /* halfword 0 replicated to every lane */
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);              /* four input rows */
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);   /* rows 0+1 -> vec0, rows 2+3 -> vec1 */
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);            /* weight each byte pair with the taps */
+    SRARI_H2_UH(vec2, vec3, VP8_FILTER_SHIFT);                    /* rounding right shift */
+    PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);              /* halfwords back to bytes */
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);            /* four output rows */
+}
+/* Horizontal 2-tap filter of one 4x8 block; macro roles noted below are inferred from their names. */
+static void common_hz_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 vec0, vec1, vec2, vec3, filt0;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16i8 res0, res1, res2, res3;
+    v8u16 vec4, vec5, vec6, vec7, filt;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);   /* second mask row ("4 width cases") */
+    /* halfword 0 of the vector loaded from filter is replicated to every lane of filt0 */
+    filt = LD_UH(filter);
+    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);   /* two input rows per result vector */
+    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);   /* output rows 0-3 */
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);   /* output rows 4-7 */
+}
+
+/* 4-wide horizontal 2-tap filter: picks the kernel that matches height.
+ * Only heights 4 and 8 are handled; any other height does nothing. */
+static void common_hz_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    switch (height)
+    {
+        case 4: common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); break;
+        case 8: common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); break;
+        default: break;
+    }
+}
+/* Horizontal 2-tap filter of one 8x4 block; macro roles noted below are inferred from their names. */
+static void common_hz_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);   /* first mask row ("8 width cases") */
+    /* halfword 0 of the vector loaded from filter is replicated to every lane of filt0 */
+    filt = LD_UH(filter);
+    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);   /* each row is shuffled with itself */
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);                          /* results overwrite vec0..vec3 */
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);              /* src0 / src1 reused for the output */
+    ST8x4_UB(src0, src1, dst, dst_stride);
+}
+/* Horizontal 2-tap filter, 8 wide: writes 8 rows, or 16 rows when height == 16 (macro roles inferred from names). */
+static void common_hz_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    v16u8 filt0;
+    v16i8 src0, src1, src2, src3, mask, out0, out1;
+    v8u16 vec0, vec1, vec2, vec3, filt;
+
+    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);   /* first mask row ("8 width cases") */
+
+    filt = LD_UH(filter);
+    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);   /* halfword 0 replicated to every lane */
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    /* rows 0-3: shuffle each row with itself, dot product with the taps, rounding shift */
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+    /* rows 4-7 are loaded before the results for rows 0-3 are packed and stored */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* rows 4-7 */
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* any height other than 16 ends here, after 8 rows */
+    if (16 == height)
+    {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        src += (4 * src_stride);
+        /* rows 8-11 */
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+        LD_SB4(src, src_stride, src0, src1, src2, src3);   /* rows 12-15 */
+        src += (4 * src_stride);
+
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);   /* rows 8-11; dst is not advanced afterwards */
+
+        VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    vec0, vec1, vec2, vec3);
+        SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
+        PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+        ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);   /* rows 12-15 */
+    }
+}
+
+/* 8-wide horizontal 2-tap filter: a height of exactly 4 uses the 8x4
+ * kernel; every other height is forwarded to the 8x8mult kernel. */
+static void common_hz_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (height != 4)
+    {
+        common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+        return;
+    }
+    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+}
+
+/* Horizontal 2-tap filter for 16-pixel-wide blocks.
+ *
+ * src / src_stride and dst / dst_stride address the input and output rows.
+ * filter points at the taps: halfword 0 of the vector loaded from it is
+ * replicated to every lane of filt0.  Rows are handled in groups of four,
+ * so the remainder of height modulo 4 is not filtered.
+ *
+ * Every group, the first one included, goes through the single loop below,
+ * whose trip count is height >> 2.  The counter is unsigned, so the count
+ * must not be formed as (height >> 2) - 1 around a peeled first group: for
+ * height < 4 that expression wraps to UINT32_MAX and the loop runs far
+ * beyond src and dst.  For every height >= 4 both forms process the same
+ * groups in the same order; heights 0..3 write nothing.  height is
+ * expected to be non-negative.
+ *
+ * NOTE(review): the macro roles mentioned in the comments below are
+ * inferred from the macro names; their definitions are in
+ * vp8_macros_msa.h.
+ */
+static void common_hz_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+    /* first mask row ("8 width cases"): pairs byte k with byte k + 1 */
+    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+    /* replicate halfword 0 of the loaded taps across the whole vector */
+    filt = LD_UH(filter);
+    filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        /* four rows, each fetched as one vector at src and one at src + 8 */
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+
+        /* pair neighbouring bytes, weight them with the taps, round */
+        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+        VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+        VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    out0, out1, out2, out3);
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    out4, out5, out6, out7);
+        SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+        SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
+
+        /* one output row per call, built from the row's two result vectors */
+        PCKEV_ST_SB(out0, out1, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out2, out3, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out4, out5, dst);
+        dst += dst_stride;
+        PCKEV_ST_SB(out6, out7, dst);
+        dst += dst_stride;
+    }
+}
+/* Vertical 2-tap filter of one 4x4 block; macro roles noted below are inferred from their names. */
+static void common_vt_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4;
+    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+    v16u8 filt0;
+    v8i16 filt;
+    v8u16 tmp0, tmp1;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8)__msa_splati_h(filt, 0);   /* halfword 0 replicated to every lane */
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);   /* five input rows for four output rows */
+    src += (5 * src_stride);                                 /* src is not read again below */
+    /* row pairs (0,1), (1,2), (2,3), (3,4) are interleaved, then two pairs share one vector */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
+    src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);       /* src2110 reused for the packed output */
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+/* Vertical 2-tap filter of one 4x8 block; macro roles noted below are inferred from their names. */
+static void common_vt_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v16u8 filt0;
+    v8i16 filt;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8)__msa_splati_h(filt, 0);   /* halfword 0 replicated to every lane */
+
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    /* a ninth row is needed so that row 7 also has a row below it */
+    src8 = LD_SB(src);
+    src += src_stride;   /* src is not read again below */
+    /* interleave every row with the row below it: pairs (0,1) .. (7,8) */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+               src76_r, src87_r);
+    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+               src87_r, src76_r, src2110, src4332, src6554, src8776);
+    DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);   /* src2110 / src4332 reused for the output */
+    ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);                  /* output rows 0-3 */
+    ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); /* output rows 4-7 */
+}
+
+/* 4-wide vertical 2-tap filter.
+ * Forwards to the 4x4 kernel when height is 4 and to the 4x8 kernel when
+ * height is 8; for any other height nothing is called and dst is left
+ * as it was. */
+static void common_vt_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    if (height == 4)
+        common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    else if (height == 8)
+        common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+}
+/* Vertical 2-tap filter of one 8x4 block; macro roles noted below are inferred from their names. */
+static void common_vt_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{
+    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8)__msa_splati_h(filt, 0);   /* halfword 0 replicated to every lane */
+    /* five input rows give four output rows: each row is interleaved with the row below it */
+    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);   /* row pairs (0,1) and (1,2) */
+    ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);   /* row pairs (2,3) and (3,4) */
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+/* Vertical 2-tap filter, 8 wide: height >> 3 passes of 8 output rows each (macro roles inferred from names). */
+static void common_vt_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v16i8 out0, out1;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+
+    filt = LD_SH(filter);
+    filt0 = (v16u8)__msa_splati_h(filt, 0);   /* halfword 0 replicated to every lane */
+    /* the first row is loaded once up front; later passes take it from the previous pass */
+    src0 = LD_UB(src);
+    src += src_stride;
+    /* rows left over when height is not a multiple of 8 are not processed */
+    for (loop_cnt = (height >> 3); loop_cnt--;)
+    {
+        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+        src += (8 * src_stride);
+        /* interleave every row with the row below it: pairs (0,1) .. (7,8) */
+        ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+                   vec0, vec1, vec2, vec3);
+        ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
+                   vec4, vec5, vec6, vec7);
+        DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);   /* first four rows of this pass */
+        dst += (4 * dst_stride);
+
+        DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                    tmp0, tmp1, tmp2, tmp3);
+        SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+        PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);   /* last four rows of this pass */
+        dst += (4 * dst_stride);
+
+        src0 = src8;   /* the last row loaded becomes the first row of the next pass */
+    }
+}
+
+static void common_vt_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{   /* 8-wide vertical 2-tap: 4 rows get the fixed kernel, others the loop. */
+    if (4 != height)
+    {
+        common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
+                                 height);
+    }
+    else
+    {
+        common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}
+
+static void common_vt_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{   /* Vertical 2-tap filter, 16 wide; each loop pass emits 4 output rows. */
+    uint32_t loop_cnt;
+    v16u8 src0, src1, src2, src3, src4;
+    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+    v8u16 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+    /* Replicate halfword 0 of the filter (two int8 taps) across the vector. */
+    filt = LD_SH(filter);
+    filt0 = (v16u8)__msa_splati_h(filt, 0);
+    /* Prime src0 with the first row; every pass then loads 4 further rows. */
+    src0 = LD_UB(src);
+    src += src_stride;
+    /* height >> 2 passes: rows beyond a multiple of 4 are not produced. */
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_UB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        /* src0/src1 -> vec0,vec1 and src1/src2 -> vec2,vec3 (ILVR + ILVL). */
+        ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+        ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+        DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+        /* Pair src2/src3 and src3/src4 now; vec2,vec3 are consumed first. */
+        ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+        ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+        DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+        /* Third output row, from the src2/src3 pair. */
+        DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+        SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp0, tmp1, dst);
+        dst += dst_stride;
+        /* Fourth output row, from the src3/src4 pair. */
+        DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+        SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp2, tmp3, dst);
+        dst += dst_stride;
+        /* The last row loaded becomes the first row of the next pass. */
+        src0 = src4;
+    }
+}
+
+static void common_hv_2ht_2vt_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{   /* 2-tap horizontal then 2-tap vertical filter of a 4x4 block. */
+    v16i8 src0, src1, src2, src3, src4, mask;
+    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+    /* The 4-wide kernels read the shuffle mask at offset 16 of the table. */
+    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
+    /* Replicate halfword 0 (two int8 taps) of each filter across a vector. */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+    /* Five source rows; each horizontal call is handed two rows at once. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+    /* hz_out1/hz_out3 are built from 64-bit halves of their neighbours. */
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);  /* rounding shift */
+    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{   /* 2-tap horizontal then 2-tap vertical filter of a 4x8 block. */
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+    v16i8 res0, res1, res2, res3;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+    /* The 4-wide kernels read the shuffle mask at offset 16 of the table. */
+    mask = LD_SB(&vp8_mc_filt_mask_arr[16]);
+    /* Replicate halfword 0 (two int8 taps) of each filter across a vector. */
+    filt = LD_UH(filter_horiz);
+    filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+    filt = LD_UH(filter_vert);
+    filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+    /* Nine source rows in total: eight here plus one more loaded below. */
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+    src8 = LD_SB(src);
+    /* Each horizontal call is handed two rows; the last one gets src8 twice. */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, VP8_FILTER_SHIFT);
+    SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+               hz_out3, hz_out5, 8);
+    hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+    /* Odd hz_out vectors are built from halves of their even neighbours. */
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+                vec4, vec5, vec6, vec7);
+    SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);  /* rounding shift */
+    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
+                res0, res1, res2, res3);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);  /* output rows 0-3 */
+    dst += (4 * dst_stride);
+    ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);  /* output rows 4-7 */
+}
+
+static void common_hv_2ht_2vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{   /* 4-wide H+V 2-tap: pick the kernel for 4 or 8 rows, otherwise no-op. */
+    switch (height)
+    {
+        case 4:
+            common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
+                                      filter_horiz, filter_vert);
+            return;
+        case 8:
+            common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
+                                      filter_horiz, filter_vert);
+    }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert)
+{   /* 2-tap horizontal then 2-tap vertical filter of an 8x4 block. */
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt;
+    /* The 8- and 16-wide kernels read the shuffle mask at table offset 0. */
+    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+    /* Replicate halfword 0 (two int8 taps) of each filter across a vector. */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8)__msa_splati_h(filt, 0);
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8)__msa_splati_h(filt, 0);
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    /* hz_out0/hz_out1 alternate as "previous row" and "current row". */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+    /* Output row 1: filtered src1 (hz_out1) paired with filtered src2. */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
+    vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+    /* Output row 2: filtered src2 (hz_out0) paired with filtered src3. */
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
+    vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+    /* Output row 3: filtered src3 (hz_out1) paired with filtered src4. */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
+    vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+    /* Round all four rows together, then pack and store them. */
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *RESTRICT src,
+                                          int32_t src_stride,
+                                          uint8_t *RESTRICT dst,
+                                          int32_t dst_stride,
+                                          const int8_t *filter_horiz,
+                                          const int8_t *filter_vert,
+                                          int32_t height)
+{   /* H+V 2-tap filter, 8 wide; each loop pass emits 8 output rows. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+    v16u8 filt_hz, filt_vt, vec0;
+    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    v8i16 filt;
+    /* The 8- and 16-wide kernels read the shuffle mask at table offset 0. */
+    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+    /* Replicate halfword 0 (two int8 taps) of each filter across a vector. */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8)__msa_splati_h(filt, 0);
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8)__msa_splati_h(filt, 0);
+    /* Prime hz_out0 with the horizontally filtered first row. */
+    src0 = LD_SB(src);
+    src += src_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
+    /* height >> 3 passes: rows beyond a multiple of 8 are not produced. */
+    for (loop_cnt = (height >> 3); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        /* hz_out0/hz_out1 alternate as "previous row" and "current row". */
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+        tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+        tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+        tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+        /* src4 is consumed here, so src1..src4 can be reloaded right after. */
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        LD_SB4(src, src_stride, src1, src2, src3, src4);
+        src += (4 * src_stride);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+        tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+        /* First four output rows of this pass. */
+        SRARI_H2_UH(tmp3, tmp4, VP8_FILTER_SHIFT);
+        PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Same four steps for the second group of rows (tmp5..tmp8). */
+        hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+        tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+        tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+        hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+        tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+        /* hz_out0 left here is the "previous row" of the next pass. */
+        hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+        tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+        /* Last four output rows of this pass. */
+        SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, VP8_FILTER_SHIFT);
+        PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hv_2ht_2vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{   /* 8-wide H+V 2-tap: 4 rows get the fixed kernel, others the loop. */
+    if (4 != height)
+    {
+        common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                      filter_horiz, filter_vert, height);
+    }
+    else
+    {
+        common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert);
+    }
+}
+
+static void common_hv_2ht_2vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{   /* H+V 2-tap filter, 16 wide; each loop pass emits 4 output rows. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+    v16u8 filt_hz, filt_vt, vec0, vec1;
+    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 filt;
+    /* The 8- and 16-wide kernels read the shuffle mask at table offset 0. */
+    mask = LD_SB(&vp8_mc_filt_mask_arr[0]);
+
+    /* rearranging filter */
+    filt = LD_SH(filter_horiz);
+    filt_hz = (v16u8)__msa_splati_h(filt, 0);
+    filt = LD_SH(filter_vert);
+    filt_vt = (v16u8)__msa_splati_h(filt, 0);
+    /* Each row is read as two vectors, at src and at src + 8. */
+    LD_SB2(src, 8, src0, src1);
+    src += src_stride;
+    /* Prime hz_out0 (from src) and hz_out2 (from src + 8) with the first row. */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
+    /* height >> 2 passes: rows beyond a multiple of 4 are not produced. */
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        src += (4 * src_stride);
+        /* Even srcN come from src, odd srcN from src + 8, one pair per row. */
+        hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+        /* Roles swap: hz_out1/hz_out3 are now the previous row. */
+        hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+        /* Third output row of the pass. */
+        hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+        /* Fourth row; hz_out0/hz_out2 stay live for the next pass. */
+        hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz,
+                                     VP8_FILTER_SHIFT);
+        ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+        DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+        SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
+        PCKEV_ST_SB(tmp1, tmp2, dst);
+        dst += dst_stride;
+    }
+}
+
+void vp8_bilinear_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride)
+{   /* 4x4 bilinear predict; a zero offset skips filtering along that axis. */
+    /* Filter row is offset - 1; map offset 0 to row 0 (unused), never to -1. */
+    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - (xoffset != 0)];
+    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - (yoffset != 0)];
+    if (yoffset)
+    {
+        if (xoffset)
+        {   /* both axes filtered */
+            common_hv_2ht_2vt_4w_msa(src, src_stride, dst, dst_stride,
+                                     h_filter, v_filter, 4);
+        }
+        else
+        {   /* vertical only: h_filter is not used */
+            common_vt_2t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
+        }
+    }
+    else
+    {
+        if (xoffset)
+        {   /* horizontal only: v_filter is not used */
+            common_hz_2t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
+        }
+        else
+        {
+            uint32_t tp0, tp1, tp2, tp3;
+            /* Neither axis filtered: move four rows via 32-bit temporaries. */
+            LW4(src, src_stride, tp0, tp1, tp2, tp3);
+            SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
+        }
+    }
+}
+
+void vp8_bilinear_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride)
+{   /* 8x4 bilinear predict; a zero offset skips filtering along that axis. */
+    /* Filter row is offset - 1; map offset 0 to row 0 (unused), never to -1. */
+    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - (xoffset != 0)];
+    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - (yoffset != 0)];
+    if (yoffset)
+    {
+        if (xoffset)
+        {   /* both axes filtered */
+            common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride,
+                                     h_filter, v_filter, 4);
+        }
+        else
+        {   /* vertical only: h_filter is not used */
+            common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
+        }
+    }
+    else
+    {
+        if (xoffset)
+        {   /* horizontal only: v_filter is not used */
+            common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
+        }
+        else
+        {   /* neither axis filtered: plain block copy */
+            vp8_copy_mem8x4(src, src_stride, dst, dst_stride);
+        }
+    }
+}
+
+void vp8_bilinear_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride)
+{   /* 8x8 bilinear predict; a zero offset skips filtering along that axis. */
+    /* Filter row is offset - 1; map offset 0 to row 0 (unused), never to -1. */
+    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - (xoffset != 0)];
+    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - (yoffset != 0)];
+    if (yoffset)
+    {
+        if (xoffset)
+        {   /* both axes filtered */
+            common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride,
+                                     h_filter, v_filter, 8);
+        }
+        else
+        {   /* vertical only: h_filter is not used */
+            common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
+        }
+    }
+    else
+    {
+        if (xoffset)
+        {   /* horizontal only: v_filter is not used */
+            common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
+        }
+        else
+        {   /* neither axis filtered: plain block copy */
+            vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
+        }
+    }
+}
+
+void vp8_bilinear_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                   int32_t xoffset, int32_t yoffset,
+                                   uint8_t *RESTRICT dst, int32_t dst_stride)
+{   /* 16x16 bilinear predict; a zero offset skips filtering on that axis. */
+    /* Filter row is offset - 1; map offset 0 to row 0 (unused), never to -1. */
+    const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - (xoffset != 0)];
+    const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - (yoffset != 0)];
+    if (yoffset)
+    {
+        if (xoffset)
+        {   /* both axes filtered */
+            common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride,
+                                      h_filter, v_filter, 16);
+        }
+        else
+        {   /* vertical only: h_filter is not used */
+            common_vt_2t_16w_msa(src, src_stride, dst, dst_stride, v_filter,
+                                 16);
+        }
+    }
+    else
+    {
+        if (xoffset)
+        {   /* horizontal only: v_filter is not used */
+            common_hz_2t_16w_msa(src, src_stride, dst, dst_stride, h_filter,
+                                 16);
+        }
+        else
+        {   /* neither axis filtered: plain block copy */
+            vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
+        }
+    }
+}
diff --git a/libs/libvpx/vp8/common/mips/msa/copymem_msa.c b/libs/libvpx/vp8/common/mips/msa/copymem_msa.c
new file mode 100644
index 0000000000..002a5ed91d
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/msa/copymem_msa.c
@@ -0,0 +1,70 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+static void copy_8x4_msa(uint8_t *src, int32_t src_stride,
+                         uint8_t *dst, int32_t dst_stride)
+{
+    uint64_t row0, row1, row2, row3;
+    /* Four rows, each carried in one 64-bit temporary from src to dst. */
+    LD4(src, src_stride, row0, row1, row2, row3);
+    SD4(row0, row1, row2, row3, dst, dst_stride);
+}
+
+static void copy_8x8_msa(uint8_t *src, int32_t src_stride,
+                         uint8_t *dst, int32_t dst_stride)
+{
+    uint64_t row0, row1, row2, row3;
+
+    /* Upper four rows, each carried in one 64-bit temporary. */
+    LD4(src, src_stride, row0, row1, row2, row3);
+    SD4(row0, row1, row2, row3, dst, dst_stride);
+
+    /* Lower four rows, addressed relative to the unchanged base pointers. */
+    LD4((src + 4 * src_stride), src_stride, row0, row1, row2, row3);
+    SD4(row0, row1, row2, row3, (dst + 4 * dst_stride), dst_stride);
+}
+
+static void copy_16x16_msa(uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride)
+{
+    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
+    v16u8 bot0, bot1, bot2, bot3, bot4, bot5, bot6, bot7;
+
+    /* Load all sixteen rows first: eight upper rows, then eight lower. */
+    LD_UB8(src, src_stride, top0, top1, top2, top3, top4, top5, top6, top7);
+    LD_UB8((src + 8 * src_stride), src_stride, bot0, bot1, bot2, bot3, bot4,
+           bot5, bot6, bot7);
+
+    /* Only then store them, in the same upper/lower order. */
+    ST_UB8(top0, top1, top2, top3, top4, top5, top6, top7, dst, dst_stride);
+    ST_UB8(bot0, bot1, bot2, bot3, bot4, bot5, bot6, bot7,
+           (dst + 8 * dst_stride), dst_stride);
+}
+
+void vp8_copy_mem16x16_msa(uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride)
+{   /* Non-static entry point; forwards all arguments to copy_16x16_msa. */
+    copy_16x16_msa(src, src_stride, dst, dst_stride);
+}
+
+void vp8_copy_mem8x8_msa(uint8_t *src, int32_t src_stride,
+                         uint8_t *dst, int32_t dst_stride)
+{   /* Non-static entry point; forwards all arguments to copy_8x8_msa. */
+    copy_8x8_msa(src, src_stride, dst, dst_stride);
+}
+
+void vp8_copy_mem8x4_msa(uint8_t *src, int32_t src_stride,
+                         uint8_t *dst, int32_t dst_stride)
+{   /* Non-static entry point; forwards all arguments to copy_8x4_msa. */
+    copy_8x4_msa(src, src_stride, dst, dst_stride);
+}
diff --git a/libs/libvpx/vp8/common/mips/msa/idct_msa.c b/libs/libvpx/vp8/common/mips/msa/idct_msa.c
new file mode 100644
index 0000000000..e537a3ffc9
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/msa/idct_msa.c
@@ -0,0 +1,457 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+static const int32_t cospi8sqrt2minus1 = 20091;  /* (sqrt(2) * cos(pi / 8) - 1) * 65536, rounded */
+static const int32_t sinpi8sqrt2 = 35468;  /* sqrt(2) * sin(pi / 8) * 65536, rounded */
+
+#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                        \
+    v8i16 s4_m, s5_m, s6_m, s7_m;                                        \
+    /* Transpose via TRANSPOSE8X4_SH_SH, then regroup 64-bit halves: ILVR_D2 fills out0/out2, ilvl_d fills out1/out3. */ \
+    TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m);      \
+    ILVR_D2_SH(s6_m, s4_m, s7_m, s5_m, out0, out2);                      \
+    out1 = (v8i16)__msa_ilvl_d((v2i64)s6_m, (v2i64)s4_m);                \
+    out3 = (v8i16)__msa_ilvl_d((v2i64)s7_m, (v2i64)s5_m);                \
+}
+
+#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in)     \
+({                                                        \
+    v8i16 out_m;                                          \
+    v8i16 zero_m = { 0 };                                 \
+    v4i32 tmp1_m, tmp2_m;                                 \
+    v4i32 sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);     \
+    /* Interleave `in` with zero into 32-bit lanes, shift down by 16, scale by sinpi8sqrt2 (>> 16), repack. */ \
+    ILVRL_H2_SW(in, zero_m, tmp1_m, tmp2_m);              \
+    tmp1_m >>= 16;                                        \
+    tmp2_m >>= 16;                                        \
+    tmp1_m = (tmp1_m * sinpi8_sqrt2_m) >> 16;             \
+    tmp2_m = (tmp2_m * sinpi8_sqrt2_m) >> 16;             \
+    out_m = __msa_pckev_h((v8i16)tmp2_m, (v8i16)tmp1_m);  \
+    /* GNU statement expression: the macro evaluates to out_m. */ \
+    out_m;                                                \
+})
+
+#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                  \
+    v8i16 a1_m, b1_m, c1_m, d1_m;                                  \
+    v8i16 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                  \
+    v8i16 const_cospi8sqrt2minus1_m;                               \
+    /* One 1-D pass on 16-bit lanes: a1/b1 = in0 +/- in2, c1/d1 combine scaled in1 and in3, BUTTERFLY_4 forms the outputs. */ \
+    const_cospi8sqrt2minus1_m = __msa_fill_h(cospi8sqrt2minus1);   \
+    a1_m = in0 + in2;                                              \
+    b1_m = in0 - in2;                                              \
+    c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1);     \
+    c_tmp2_m = __msa_mul_q_h(in3, const_cospi8sqrt2minus1_m);      \
+    c_tmp2_m = c_tmp2_m >> 1;                                      \
+    c_tmp2_m = in3 + c_tmp2_m;                                     \
+    c1_m = c_tmp1_m - c_tmp2_m;                                    \
+    d_tmp1_m = __msa_mul_q_h(in1, const_cospi8sqrt2minus1_m);      \
+    d_tmp1_m = d_tmp1_m >> 1;                                      \
+    d_tmp1_m = in1 + d_tmp1_m;                                     \
+    d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3);     \
+    d1_m = d_tmp1_m + d_tmp2_m;                                    \
+    BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
+}
+
+#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                  \
+    v4i32 a1_m, b1_m, c1_m, d1_m;                                  \
+    v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                  \
+    v4i32 const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m;               \
+    /* The same 1-D pass on 32-bit lanes; each product with a constant is scaled back with >> 16. */ \
+    const_cospi8sqrt2minus1_m = __msa_fill_w(cospi8sqrt2minus1);   \
+    sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);                    \
+    a1_m = in0 + in2;                                              \
+    b1_m = in0 - in2;                                              \
+    c_tmp1_m = (in1 * sinpi8_sqrt2_m) >> 16;                       \
+    c_tmp2_m = in3 + ((in3 * const_cospi8sqrt2minus1_m) >> 16);    \
+    c1_m = c_tmp1_m - c_tmp2_m;                                    \
+    d_tmp1_m = in1 + ((in1 * const_cospi8sqrt2minus1_m) >> 16);    \
+    d_tmp2_m = (in3 * sinpi8_sqrt2_m) >> 16;                       \
+    d1_m = d_tmp1_m + d_tmp2_m;                                    \
+    BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
+}
+
+static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred,
+                               int32_t pred_stride,
+                               uint8_t *dest, int32_t dest_stride)
+{
+    v8i16 input0, input1;
+    v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+    v4i32 res0, res1, res2, res3;
+    v16i8 zero = { 0 };
+    v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
+    v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24,
+                   25, 26, 27, 28, 29, 30, 31 };
+    /* Inverse transforms the 16 coefficients at input, adds pred, clips to 0..255 and writes to dest. */
+    LD_SH2(input, 8, input0, input1);
+    UNPCK_SH_SW(input0, in0, in1);  /* v8i16 -> two v4i32; presumably sign-extending (macro not shown here) */
+    UNPCK_SH_SW(input1, in2, in3);
+    VP8_IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);  /* first 1-D pass */
+    TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    VP8_IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);  /* second 1-D pass */
+    SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);  /* presumably a rounding shift right by 3 - confirm in vp8_macros_msa.h */
+    TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+    LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);  /* loaded into full 16-byte vectors */
+    ILVR_B4_SW(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
+               res2, res3);
+    ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1,
+               res2, res3);
+    ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+    res0 = CLIP_SW_0_255(res0);
+    res1 = CLIP_SW_0_255(res1);
+    res2 = CLIP_SW_0_255(res2);
+    res3 = CLIP_SW_0_255(res3);
+    LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+    /* mask takes bytes 0/4/8/12 of one source and bytes 4..15 (indices 20..31) of the other; */
+    /* presumably four new pixels plus 12 untouched dest bytes per row - confirm VSHF_B2_SB operand order. */
+    VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
+    VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
+    ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);  /* stores whole 16-byte vectors back */
+}
+
+static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred,
+                                 int32_t pred_stride,
+                                 uint8_t *dest, int32_t dest_stride)
+{
+    v8i16 vec;
+    v8i16 res0, res1, res2, res3;
+    v16i8 zero = { 0 };
+    v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
+    v16i8 mask = { 0, 2, 4, 6, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
+    /* DC-only case: the single rounded value (in_dc + 4) >> 3 is added to the pred pixels. */
+    vec = __msa_fill_h(in_dc);
+    vec = __msa_srari_h(vec, 3);
+    LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);  /* loaded into full 16-byte vectors */
+    ILVR_B4_SH(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
+               res2, res3);
+    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+    VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);  /* mask: bytes 0/2/4/6 of one source, 4..15 of the other */
+    VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
+    ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);  /* stores whole 16-byte vectors back */
+}
+
+void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff)
+{
+    v8i16 input0, input1;
+    v4i32 in0, in1, in2, in3, a1, b1, c1, d1;
+    v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+    /* Two butterfly passes over 16 inputs; each result lands at mb_dq_coeff[16 * k], k = 0..15. */
+    LD_SH2(input, 8, input0, input1);
+    UNPCK_SH_SW(input0, in0, in1);
+    UNPCK_SH_SW(input1, in2, in3);
+    BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1);
+    BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2);  /* note the swapped hz3/hz2 output order */
+    TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1);
+    BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2);  /* same swap for vt3/vt2 */
+    ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3);  /* + 3, then >> 3 on the next line */
+    SRA_4V(vt0, vt1, vt2, vt3, 3);
+    mb_dq_coeff[0] = __msa_copy_s_h((v8i16)vt0, 0);  /* halfword lanes 0/2/4/6: presumably the low 16 bits of each 32-bit lane */
+    mb_dq_coeff[16] = __msa_copy_s_h((v8i16)vt1, 0);
+    mb_dq_coeff[32] = __msa_copy_s_h((v8i16)vt2, 0);
+    mb_dq_coeff[48] = __msa_copy_s_h((v8i16)vt3, 0);
+    mb_dq_coeff[64] = __msa_copy_s_h((v8i16)vt0, 2);
+    mb_dq_coeff[80] = __msa_copy_s_h((v8i16)vt1, 2);
+    mb_dq_coeff[96] = __msa_copy_s_h((v8i16)vt2, 2);
+    mb_dq_coeff[112] = __msa_copy_s_h((v8i16)vt3, 2);
+    mb_dq_coeff[128] = __msa_copy_s_h((v8i16)vt0, 4);
+    mb_dq_coeff[144] = __msa_copy_s_h((v8i16)vt1, 4);
+    mb_dq_coeff[160] = __msa_copy_s_h((v8i16)vt2, 4);
+    mb_dq_coeff[176] = __msa_copy_s_h((v8i16)vt3, 4);
+    mb_dq_coeff[192] = __msa_copy_s_h((v8i16)vt0, 6);
+    mb_dq_coeff[208] = __msa_copy_s_h((v8i16)vt1, 6);
+    mb_dq_coeff[224] = __msa_copy_s_h((v8i16)vt2, 6);
+    mb_dq_coeff[240] = __msa_copy_s_h((v8i16)vt3, 6);
+}
+
+static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input,
+                                       uint8_t *dest, int32_t dest_stride)
+{
+    v8i16 input0, input1, dequant_in0, dequant_in1, mul0, mul1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 hz0_h, hz1_h, hz2_h, hz3_h;
+    v16i8 dest0, dest1, dest2, dest3;
+    v4i32 hz0_w, hz1_w, hz2_w, hz3_w;
+    v4i32 vt0, vt1, vt2, vt3, res0, res1, res2, res3;
+    v2i64 zero = { 0 };
+    v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24,
+                   25, 26, 27, 28, 29, 30, 31 };
+    /* Multiplies input by dequant_input, inverse transforms, adds to dest in place; input is not cleared here. */
+    LD_SH2(input, 8, input0, input1);
+    LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
+    MUL2(input0, dequant_in0, input1, dequant_in1, mul0, mul1);  /* dequantization */
+    PCKEV_D2_SH(zero, mul0, zero, mul1, in0, in2);  /* split each product vector into its 64-bit halves */
+    PCKOD_D2_SH(zero, mul0, zero, mul1, in1, in3);
+    VP8_IDCT_1D_H(in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h);  /* first pass on 16-bit lanes */
+    PCKEV_D2_SH(hz1_h, hz0_h, hz3_h, hz2_h, mul0, mul1);
+    UNPCK_SH_SW(mul0, hz0_w, hz1_w);
+    UNPCK_SH_SW(mul1, hz2_w, hz3_w);
+    TRANSPOSE4x4_SW_SW(hz0_w, hz1_w, hz2_w, hz3_w, hz0_w, hz1_w, hz2_w, hz3_w);
+    VP8_IDCT_1D_W(hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3);  /* second pass on 32-bit lanes */
+    SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+    TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+    LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);  /* dest doubles as the prediction */
+    ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
+               res2, res3);
+    ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1,
+               res2, res3);
+    ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+    res0 = CLIP_SW_0_255(res0);
+    res1 = CLIP_SW_0_255(res1);
+    res2 = CLIP_SW_0_255(res2);
+    res3 = CLIP_SW_0_255(res3);
+    VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);  /* mask: bytes 0/4/8/12 of one source, 4..15 of the other */
+    VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
+    ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);  /* stores whole 16-byte vectors back */
+}
+
+static void dequant_idct4x4_addblk_2x_msa(int16_t *input,
+                                          int16_t *dequant_input,
+                                          uint8_t *dest, int32_t dest_stride)
+{
+    v16u8 dest0, dest1, dest2, dest3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
+    v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+    v8i16 res0, res1, res2, res3;
+    v4i32 hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r;
+    v4i32 vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r;
+    v16i8 zero = { 0 };
+    /* Dequantizes 32 coefficients (two blocks), inverse transforms them, adds to dest in place, then zeroes them. */
+    LD_SH4(input, 8, in0, in1, in2, in3);
+    LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
+    MUL4(in0, dequant_in0, in1, dequant_in1, in2, dequant_in0, in3, dequant_in1,
+         mul0, mul1, mul2, mul3);  /* both blocks use the same 16 dequant factors */
+    PCKEV_D2_SH(mul2, mul0, mul3, mul1, in0, in2);
+    PCKOD_D2_SH(mul2, mul0, mul3, mul1, in1, in3);
+    VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3);  /* first pass on 16-bit lanes */
+    TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    UNPCK_SH_SW(hz0, hz0r, hz0l);
+    UNPCK_SH_SW(hz1, hz1r, hz1l);
+    UNPCK_SH_SW(hz2, hz2r, hz2l);
+    UNPCK_SH_SW(hz3, hz3r, hz3l);
+    VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l);  /* second pass, 32-bit lanes, one half at a time */
+    SRARI_W4_SW(vt0l, vt1l, vt2l, vt3l, 3);
+    VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r);
+    SRARI_W4_SW(vt0r, vt1r, vt2r, vt3r, 3);
+    PCKEV_H4_SH(vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r, vt0, vt1, vt2,
+                vt3);
+    TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+    LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);  /* dest doubles as the prediction */
+    ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
+               res2, res3);
+    ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1,
+                res2, res3);
+    PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
+    PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
+    ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);  /* stores whole 16-byte vectors back */
+    /* Zero the consumed coefficients: 16 word stores cover 64 bytes, i.e. all 32 int16_t values. */
+    __asm__ __volatile__(
+        "sw   $zero,    0(%[input])  \n\t"
+        "sw   $zero,    4(%[input])  \n\t"
+        "sw   $zero,    8(%[input])  \n\t"
+        "sw   $zero,   12(%[input])  \n\t"
+        "sw   $zero,   16(%[input])  \n\t"
+        "sw   $zero,   20(%[input])  \n\t"
+        "sw   $zero,   24(%[input])  \n\t"
+        "sw   $zero,   28(%[input])  \n\t"
+        "sw   $zero,   32(%[input])  \n\t"
+        "sw   $zero,   36(%[input])  \n\t"
+        "sw   $zero,   40(%[input])  \n\t"
+        "sw   $zero,   44(%[input])  \n\t"
+        "sw   $zero,   48(%[input])  \n\t"
+        "sw   $zero,   52(%[input])  \n\t"
+        "sw   $zero,   56(%[input])  \n\t"
+        "sw   $zero,   60(%[input])  \n\t"
+        : /* no outputs */ : [input] "r"(input)
+        : "memory"  /* the stores write *input: keep the loads above before them and later reads after them */
+    );
+}
+
+static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input,
+                                         uint8_t *dest, int32_t dest_stride)
+{
+    v8i16 input_dc0, input_dc1, vec;
+    v16u8 dest0, dest1, dest2, dest3;
+    v16i8 zero = { 0 };
+    v8i16 res0, res1, res2, res3;
+    /* DC-only path for two blocks: DC terms are input[0] and input[16], both scaled by dequant_input[0]. */
+    input_dc0 = __msa_fill_h(input[0] * dequant_input[0]);
+    input_dc1 = __msa_fill_h(input[16] * dequant_input[0]);
+    SRARI_H2_SH(input_dc0, input_dc1, 3);
+    vec = (v8i16)__msa_pckev_d((v2i64)input_dc1, (v2i64)input_dc0);  /* one 64-bit half per block */
+    input[0] = 0;  /* only the two DC coefficients are cleared */
+    input[16] = 0;
+    LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);  /* dest doubles as the prediction */
+    ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0,
+               res1, res2, res3);
+    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1,
+                res2, res3);
+    PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
+    PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
+    ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);  /* stores whole 16-byte vectors back */
+}
+
+void vp8_short_idct4x4llm_msa(int16_t *input, uint8_t *pred_ptr,
+                              int32_t pred_stride, uint8_t *dst_ptr,
+                              int32_t dst_stride)
+{   /* Exported wrapper: forwards every argument unchanged to idct4x4_addblk_msa(). */
+    idct4x4_addblk_msa(input, pred_ptr, pred_stride, dst_ptr, dst_stride);
+}
+
+void vp8_dc_only_idct_add_msa(int16_t input_dc, uint8_t *pred_ptr,
+                              int32_t pred_stride, uint8_t *dst_ptr,
+                              int32_t dst_stride)
+{   /* Exported wrapper: forwards every argument unchanged to idct4x4_addconst_msa(). */
+    idct4x4_addconst_msa(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride);
+}
+
+void vp8_dequantize_b_msa(BLOCKD *d, int16_t *DQC)
+{
+    v8i16 dqc0, dqc1, q0, q1, dq0, dq1;
+    /* Writes DQC times d->qcoeff (two 8-lane vectors each, presumably multiplied element-wise) to d->dqcoeff. */
+    LD_SH2(DQC, 8, dqc0, dqc1);
+    LD_SH2(d->qcoeff, 8, q0, q1);
+    MUL2(dqc0, q0, dqc1, q1, dq0, dq1);
+    ST_SH2(dq0, dq1, d->dqcoeff, 8);
+}
+
+void vp8_dequant_idct_add_msa(int16_t *input, int16_t *dq,
+                              uint8_t *dest, int32_t stride)
+{   /* Dequantizes and inverse transforms one block into dest, then zeroes its coefficients. */
+    dequant_idct4x4_addblk_msa(input, dq, dest, stride);
+    /* Eight word stores clear 32 bytes, i.e. the 16 int16_t coefficients just consumed. */
+    __asm__ __volatile__ (
+        "sw     $zero,    0(%[input])     \n\t"
+        "sw     $zero,    4(%[input])     \n\t"
+        "sw     $zero,    8(%[input])     \n\t"
+        "sw     $zero,   12(%[input])     \n\t"
+        "sw     $zero,   16(%[input])     \n\t"
+        "sw     $zero,   20(%[input])     \n\t"
+        "sw     $zero,   24(%[input])     \n\t"
+        "sw     $zero,   28(%[input])     \n\t"
+        : /* no outputs */
+        : [input] "r" (input)
+        : "memory"  /* the stores write *input: keep the helper's loads before them and later reads after them */
+    );
+}
+
+void vp8_dequant_idct_add_y_block_msa(int16_t *q, int16_t *dq,
+                                      uint8_t *dst, int32_t stride,
+                                      char *eobs)
+{
+    /*
+     * Walks four rows of block pairs.  eobs is read as 16-bit values, one
+     * per pair of side-by-side 4x4 blocks (two eobs bytes each).  A value
+     * with any bit set other than bit 0 of either byte (value & 0xfefe)
+     * takes the full dequantize + IDCT path; any other non-zero value takes
+     * the DC-only path; zero skips the pair.  q advances by 32 coefficients
+     * per pair whether or not it was processed, dst by four lines per row;
+     * the second pair of a row is written at dst + 8.
+     * NOTE(review): the int16_t view assumes eobs is at least 2-byte
+     * aligned - confirm against the caller.
+     */
+    const int16_t *eob_pairs = (int16_t *)eobs;
+    int32_t row, half;
+
+    for (row = 0; row < 4; ++row)
+    {
+        for (half = 0; half < 2; ++half)
+        {
+            const int16_t pair = eob_pairs[(2 * row) + half];
+            uint8_t *out = dst + (8 * half);
+
+            if (pair & 0xfefe)
+            {
+                dequant_idct4x4_addblk_2x_msa(q, dq, out, stride);
+            }
+            else if (pair)
+            {
+                dequant_idct_addconst_2x_msa(q, dq, out, stride);
+            }
+
+            q += 32;
+        }
+
+        dst += (4 * stride);
+    }
+}
+
+/*
+ * Reconstructs one pair of side-by-side 4x4 blocks.
+ *
+ * eob_pair  two eobs bytes viewed as one 16-bit value
+ * q         the pair's 32 coefficients; the selected routine clears the
+ *           ones it consumed
+ * dq        dequantization factors
+ * dst       top-left destination pixel of the pair
+ * stride    distance in bytes between destination lines
+ *
+ * (eob_pair & 0xfefe) is non-zero when either byte has a bit other than
+ * bit 0 set; that selects the full dequantize + IDCT routine.  Any other
+ * non-zero value selects the DC-only routine, and zero does nothing.
+ */
+static void dequant_idct_add_pair_msa(int16_t eob_pair, int16_t *q,
+                                      int16_t *dq, uint8_t *dst,
+                                      int32_t stride)
+{
+    if (eob_pair & 0xfefe)
+    {
+        dequant_idct4x4_addblk_2x_msa(q, dq, dst, stride);
+    }
+    else if (eob_pair)
+    {
+        dequant_idct_addconst_2x_msa(q, dq, dst, stride);
+    }
+}
+
+/*
+ * Dequantizes, inverse transforms and adds four block pairs: two written
+ * through dstu followed by two written through dstv (by their names, the U
+ * and V planes).
+ *
+ * q       coefficients, 32 per pair, in the order the pairs are processed
+ * dq      dequantization factors shared by all four pairs
+ * dstu    destination of the first pair; the second lands four lines lower
+ * dstv    destination of the third pair; the fourth lands four lines lower
+ * stride  distance in bytes between destination lines, for both pointers
+ * eobs    eobs bytes, read two at a time as 16-bit values
+ *
+ * NOTE(review): reading eobs through an int16_t pointer assumes it is at
+ * least 2-byte aligned - confirm against the caller.
+ */
+void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq,
+                                       uint8_t *dstu, uint8_t *dstv,
+                                       int32_t stride, char *eobs)
+{
+    const int16_t *eob_pairs = (int16_t *)eobs;
+    const int32_t four_lines = stride * 4;
+
+    /* Pairs 0 and 1 go through dstu. */
+    dequant_idct_add_pair_msa(eob_pairs[0], q, dq, dstu, stride);
+    dequant_idct_add_pair_msa(eob_pairs[1], q + 32, dq, dstu + four_lines,
+                              stride);
+
+    /* Pairs 2 and 3 go through dstv. */
+    dequant_idct_add_pair_msa(eob_pairs[2], q + 64, dq, dstv, stride);
+    dequant_idct_add_pair_msa(eob_pairs[3], q + 96, dq, dstv + four_lines,
+                              stride);
+}
diff --git a/libs/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c b/libs/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c
new file mode 100644
index 0000000000..a40f378098
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c
@@ -0,0 +1,826 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask)         \
+{                                                              \
+    v16u8 p1_a_sub_q1, p0_a_sub_q0;                            \
+    /* mask lane is set where sat(2 * |p0 - q0|) + (|p1 - q1| >> 1), added with saturation, is <= b_limit. */ \
+    p0_a_sub_q0 = __msa_asub_u_b(p0, q0);                      \
+    p1_a_sub_q1 = __msa_asub_u_b(p1, q1);                      \
+    p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1);  \
+    p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0);    \
+    mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1);           \
+    mask = ((v16u8)mask <= b_limit);                           \
+}
+
+#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out,  \
+                           mask_in, hev_in)                             \
+{                                                                       \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                 \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                           \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;             \
+    /* XOR with 0x80 flips the top bit, re-biasing the unsigned pixels into signed bytes. */ \
+    p1_m = (v16i8)__msa_xori_b(p1_in_out, 0x80);                        \
+    p0_m = (v16i8)__msa_xori_b(p0_in_out, 0x80);                        \
+    q0_m = (v16i8)__msa_xori_b(q0_in_out, 0x80);                        \
+    q1_m = (v16i8)__msa_xori_b(q1_in_out, 0x80);                        \
+    /* Start from the saturated p1 - q1, kept only in the lanes selected by hev_in. */ \
+    filt = __msa_subs_s_b(p1_m, q1_m);                                  \
+                                                                        \
+    filt = filt & (v16i8)hev_in;                                        \
+                                                                        \
+    q0_sub_p0 = q0_m - p0_m;                                            \
+    filt_sign = __msa_clti_s_b(filt, 0);                                \
+    /* Right half: filt += 3 * (q0 - p0); the dot product pairs the duplicated bytes with the bytes of halfword 3. */ \
+    cnst3h = __msa_ldi_h(3);                                            \
+    q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0);            \
+    q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h);    \
+    filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                      \
+    filt_r += q0_sub_p0_r;                                              \
+    filt_r = __msa_sat_s_h(filt_r, 7);                                  \
+    /* Left half: same computation. */                                  \
+    q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0);            \
+    q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h);    \
+    filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                      \
+    filt_l += q0_sub_p0_l;                                              \
+    filt_l = __msa_sat_s_h(filt_l, 7);                                  \
+    /* Repack to bytes and zero the lanes where mask_in is clear. */    \
+    filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);                 \
+    filt = filt & (v16i8)mask_in;                                       \
+    /* filt1 = (filt + 4) >> 3 is subtracted from q0; filt2 = (filt + 3) >> 3 is added to p0. */ \
+    cnst4b = __msa_ldi_b(4);                                            \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                               \
+    filt1 >>= 3;                                                        \
+                                                                        \
+    cnst3b = __msa_ldi_b(3);                                            \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                               \
+    filt2 >>= 3;                                                        \
+                                                                        \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                                 \
+    q0_in_out = __msa_xori_b((v16u8)q0_m, 0x80);                        \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                                 \
+    p0_in_out = __msa_xori_b((v16u8)p0_m, 0x80);                        \
+    /* Outer taps use the rounded filt1 >> 1, only where hev is clear; hev_in itself is inverted in place. */ \
+    filt = __msa_srari_b(filt1, 1);                                     \
+    hev_in = __msa_xori_b((v16u8)hev_in, 0xff);                         \
+    filt = filt & (v16i8)hev_in;                                        \
+                                                                        \
+    q1_m = __msa_subs_s_b(q1_m, filt);                                  \
+    q1_in_out = __msa_xori_b((v16u8)q1_m, 0x80);                        \
+    p1_m = __msa_adds_s_b(p1_m, filt);                                  \
+    p1_in_out = __msa_xori_b((v16u8)p1_m, 0x80);                        \
+}
+
+#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask)          \
+{                                                                  \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign;       \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign;           \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;        \
+    /* XOR with 0x80 flips the top bit, re-biasing the unsigned pixels into signed bytes. */ \
+    p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                       \
+    p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                       \
+    q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                       \
+    q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                       \
+    /* filt = sat(p1 - q1) + 3 * (q0 - p0), built in 16-bit lanes from bytes interleaved with their sign masks. */ \
+    filt = __msa_subs_s_b(p1_m, q1_m);                             \
+                                                                   \
+    q0_sub_p0 = q0_m - p0_m;                                       \
+    filt_sign = __msa_clti_s_b(filt, 0);                           \
+                                                                   \
+    cnst3h = __msa_ldi_h(3);                                       \
+    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                 \
+    q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_r *= cnst3h;                                         \
+    filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                 \
+    filt_r += q0_sub_p0_r;                                         \
+    filt_r = __msa_sat_s_h(filt_r, 7);                             \
+    /* Left half: same computation. */                             \
+    q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_l *= cnst3h;                                         \
+    filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                 \
+    filt_l += q0_sub_p0_l;                                         \
+    filt_l = __msa_sat_s_h(filt_l, 7);                             \
+    /* Repack to bytes and zero the lanes where mask is clear. */  \
+    filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);            \
+    filt = filt & (v16i8)(mask);                                   \
+    /* filt1 = (filt + 4) >> 3 is subtracted from q0; filt2 = (filt + 3) >> 3 is added to p0. */ \
+    cnst4b = __msa_ldi_b(4);                                       \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                          \
+    filt1 >>= 3;                                                   \
+                                                                   \
+    cnst3b = __msa_ldi_b(3);                                       \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                          \
+    filt2 >>= 3;                                                   \
+    /* Only q0_in and p0_in are written back; p1_in and q1_in are read but never assigned. */ \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                            \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                            \
+    q0_in = __msa_xori_b((v16u8)q0_m, 0x80);                       \
+    p0_in = __msa_xori_b((v16u8)p0_m, 0x80);                       \
+}
+
+#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev)            \
+{                                                                  \
+    v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                      \
+    v16i8 filt, q0_sub_p0, cnst4b, cnst3b;                         \
+    v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign;              \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l;      \
+    v8i16 cnst3h, cnst27h, cnst18h, cnst63h;                       \
+                                                                   \
+    cnst3h = __msa_ldi_h(3);                                       \
+                                                                   \
+    p2_m = (v16i8)__msa_xori_b(p2, 0x80);                          \
+    p1_m = (v16i8)__msa_xori_b(p1, 0x80);                          \
+    p0_m = (v16i8)__msa_xori_b(p0, 0x80);                          \
+    q0_m = (v16i8)__msa_xori_b(q0, 0x80);                          \
+    q1_m = (v16i8)__msa_xori_b(q1, 0x80);                          \
+    q2_m = (v16i8)__msa_xori_b(q2, 0x80);                          \
+                                                                   \
+    filt = __msa_subs_s_b(p1_m, q1_m);                             \
+    q0_sub_p0 = q0_m - p0_m;                                       \
+    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                 \
+    filt_sign = __msa_clti_s_b(filt, 0);                           \
+                                                                   \
+    q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_r *= cnst3h;                                         \
+    filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                 \
+    filt_r = filt_r + q0_sub_p0_r;                                 \
+    filt_r = __msa_sat_s_h(filt_r, 7);                             \
+                                                                   \
+    q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_l *= cnst3h;                                         \
+    filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                 \
+    filt_l = filt_l + q0_sub_p0_l;                                 \
+    filt_l = __msa_sat_s_h(filt_l, 7);                             \
+                                                                   \
+    filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);            \
+    filt = filt & (v16i8)mask;                                     \
+    filt2 = filt & (v16i8)hev;                                     \
+                                                                   \
+    hev = __msa_xori_b(hev, 0xff);                                 \
+    filt = filt & (v16i8)hev;                                      \
+    cnst4b = __msa_ldi_b(4);                                       \
+    filt1 = __msa_adds_s_b(filt2, cnst4b);                         \
+    filt1 >>= 3;                                                   \
+    cnst3b = __msa_ldi_b(3);                                       \
+    filt2 = __msa_adds_s_b(filt2, cnst3b);                         \
+    filt2 >>= 3;                                                   \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                            \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                            \
+                                                                   \
+    filt_sign = __msa_clti_s_b(filt, 0);                           \
+    ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);                  \
+                                                                   \
+    cnst27h = __msa_ldi_h(27);                                     \
+    cnst63h = __msa_ldi_h(63);                                     \
+                                                                   \
+    u_r = filt_r * cnst27h;                                        \
+    u_r += cnst63h;                                                \
+    u_r >>= 7;                                                     \
+    u_r = __msa_sat_s_h(u_r, 7);                                   \
+    u_l = filt_l * cnst27h;                                        \
+    u_l += cnst63h;                                                \
+    u_l >>= 7;                                                     \
+    u_l = __msa_sat_s_h(u_l, 7);                                   \
+    u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);                     \
+    q0_m = __msa_subs_s_b(q0_m, u);                                \
+    q0 = __msa_xori_b((v16u8)q0_m, 0x80);                          \
+    p0_m = __msa_adds_s_b(p0_m, u);                                \
+    p0 = __msa_xori_b((v16u8)p0_m, 0x80);                          \
+    cnst18h = __msa_ldi_h(18);                                     \
+    u_r = filt_r * cnst18h;                                        \
+    u_r += cnst63h;                                                \
+    u_r >>= 7;                                                     \
+    u_r = __msa_sat_s_h(u_r, 7);                                   \
+                                                                   \
+    u_l = filt_l * cnst18h;                                        \
+    u_l += cnst63h;                                                \
+    u_l >>= 7;                                                     \
+    u_l = __msa_sat_s_h(u_l, 7);                                   \
+    u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);                     \
+    q1_m = __msa_subs_s_b(q1_m, u);                                \
+    q1 = __msa_xori_b((v16u8)q1_m, 0x80);                          \
+    p1_m = __msa_adds_s_b(p1_m, u);                                \
+    p1 = __msa_xori_b((v16u8)p1_m, 0x80);                          \
+    u_r = filt_r << 3;                                             \
+    u_r += filt_r + cnst63h;                                       \
+    u_r >>= 7;                                                     \
+    u_r = __msa_sat_s_h(u_r, 7);                                   \
+                                                                   \
+    u_l = filt_l << 3;                                             \
+    u_l += filt_l + cnst63h;                                       \
+    u_l >>= 7;                                                     \
+    u_l = __msa_sat_s_h(u_l, 7);                                   \
+    u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);                     \
+    q2_m = __msa_subs_s_b(q2_m, u);                                \
+    q2 = __msa_xori_b((v16u8)q2_m, 0x80);                          \
+    p2_m = __msa_adds_s_b(p2_m, u);                                \
+    p2 = __msa_xori_b((v16u8)p2_m, 0x80);                          \
+}
+/* LPF_MASK_HEV: per-lane edge tests for one edge; sets hev_out, mask_out, flat_out. */
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
+                     q0_in, q1_in, q2_in, q3_in,                   \
+                     limit_in, b_limit_in, thresh_in,              \
+                     hev_out, mask_out, flat_out)                  \
+{                                                                  \
+    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
+    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
+    /* hev: thresh < max(|p1-p0|,|q1-q0|); mask: ~(limit tests) */ \
+    p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in));               \
+    p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in));               \
+    p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in));               \
+    q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in));               \
+    q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in));               \
+    q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in));               \
+    p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in));               \
+    p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in));               \
+    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
+    hev_out = (thresh_in) < (v16u8)flat_out;                       \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
+    p1_asub_q1_m >>= 1;                                            \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
+    mask_out = (b_limit_in) < p0_asub_q0_m;                        \
+    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
+    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
+    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
+    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
+    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
+    mask_out = (limit_in) < (v16u8)mask_out;                       \
+    mask_out = __msa_xori_b(mask_out, 0xff);                       \
+}
+/* VP8_ST6x1_UB: 32-bit lane of in0 goes to pdst, 16-bit lane of in1 to pdst + stride. */
+#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride)  \
+{                                                               \
+    uint16_t tmp0_h;                                            \
+    uint32_t tmp0_w;                                            \
+    /* SW / SH are helpers defined elsewhere (presumably stores) */ \
+    tmp0_w = __msa_copy_u_w((v4i32)in0, in0_idx);               \
+    tmp0_h = __msa_copy_u_h((v8i16)in1, in1_idx);               \
+    SW(tmp0_w, pdst);                                           \
+    SH(tmp0_h, pdst + stride);                                  \
+}
+/* Loop filter across a horizontal edge using VP8_LPF_FILTER4_4W.  The *0 thresholds
+ * fill the low 8 lanes, the *1 thresholds the high 8; p1, p0, q0, q1 are stored. */
+static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
+                                              const uint8_t *b_limit0_ptr,
+                                              const uint8_t *limit0_ptr,
+                                              const uint8_t *thresh0_ptr,
+                                              const uint8_t *b_limit1_ptr,
+                                              const uint8_t *limit1_ptr,
+                                              const uint8_t *thresh1_ptr)
+{
+    v16u8 mask, hev, flat;
+    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    /* eight row vectors starting four rows above src: p3..p0, then q0..q3 */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+    thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+    thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+    b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+    b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+    b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+    limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+    limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+    limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    /* only the four rows nearest the edge are written back */
+    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+/* Loop filter across a vertical edge over 16 rows using VP8_LPF_FILTER4_4W (two threshold sets). */
+static void loop_filter_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
+                                            const uint8_t *b_limit0_ptr,
+                                            const uint8_t *limit0_ptr,
+                                            const uint8_t *thresh0_ptr,
+                                            const uint8_t *b_limit1_ptr,
+                                            const uint8_t *limit1_ptr,
+                                            const uint8_t *thresh1_ptr)
+{
+    v16u8 mask, hev, flat;
+    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    /* 16 rows are read from 4 bytes left of src, then transposed into p3..q3 */
+    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src - 4 + (8 * pitch), pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+    thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+    thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+    b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+    b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+    b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+    limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+    limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+    limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+    /* re-interleaved p1 p0 q0 q1 go back at src - 2, in two groups 8 rows apart */
+    src -= 2;
+    ST4x8_UB(tmp2, tmp3, src, pitch);
+    src += (8 * pitch);
+    ST4x8_UB(tmp4, tmp5, src, pitch);
+}
+/* VP8_MBFILTER across a horizontal edge of one plane; rows p2..q2 around src are rewritten. */
+static void mbloop_filter_horizontal_edge_y_msa(uint8_t *src, int32_t pitch,
+                                                const uint8_t b_limit_in,
+                                                const uint8_t limit_in,
+                                                const uint8_t thresh_in)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    /* broadcast each scalar threshold to every byte lane */
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+    temp_src = src - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    temp_src = src - 3 * pitch;  /* row of p2 */
+    ST_UB4(p2, p1, p0, q0, temp_src, pitch);
+    temp_src += (4 * pitch);  /* row of q1 */
+    ST_UB2(q1, q2, temp_src, pitch);
+}
+/* VP8_MBFILTER across a horizontal edge of the U and V planes, both handled in one pass. */
+static void mbloop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                                 int32_t pitch,
+                                                 const uint8_t b_limit_in,
+                                                 const uint8_t limit_in,
+                                                 const uint8_t thresh_in)
+{
+    uint8_t *temp_src;
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+
+    temp_src = src_u - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+    temp_src = src_v - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+    /* combine each U row (doubleword 0) with the matching V row (doubleword 1) */
+    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    /* doubleword 0 of every filtered row is written to the U plane */
+    p2_d = __msa_copy_u_d((v2i64)p2, 0);
+    p1_d = __msa_copy_u_d((v2i64)p1, 0);
+    p0_d = __msa_copy_u_d((v2i64)p0, 0);
+    q0_d = __msa_copy_u_d((v2i64)q0, 0);
+    q1_d = __msa_copy_u_d((v2i64)q1, 0);
+    q2_d = __msa_copy_u_d((v2i64)q2, 0);
+    src_u -= (pitch * 3);
+    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
+    src_u += 4 * pitch;
+    SD(q1_d, src_u);
+    src_u += pitch;
+    SD(q2_d, src_u);
+    /* doubleword 1 of every filtered row is written to the V plane */
+    p2_d = __msa_copy_u_d((v2i64)p2, 1);
+    p1_d = __msa_copy_u_d((v2i64)p1, 1);
+    p0_d = __msa_copy_u_d((v2i64)p0, 1);
+    q0_d = __msa_copy_u_d((v2i64)q0, 1);
+    q1_d = __msa_copy_u_d((v2i64)q1, 1);
+    q2_d = __msa_copy_u_d((v2i64)q2, 1);
+    src_v -= (pitch * 3);
+    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
+    src_v += 4 * pitch;
+    SD(q1_d, src_v);
+    src_v += pitch;
+    SD(q2_d, src_v);
+}
+/* VP8_MBFILTER across a vertical edge over 16 rows; 6 bytes per row are written back. */
+static void mbloop_filter_vertical_edge_y_msa(uint8_t *src, int32_t pitch,
+                                              const uint8_t b_limit_in,
+                                              const uint8_t limit_in,
+                                              const uint8_t thresh_in)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    /* broadcast thresholds; read 16 rows from src - 4 and transpose into p3..q3 */
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+    temp_src = src - 4;
+    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+    /* per row: word p2 p1 p0 q0 at src - 3, then halfword q1 q2 four bytes on */
+    temp_src = src - 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
+}
+/* VP8_MBFILTER across a vertical edge of the U and V planes: 8 rows of each in one pass. */
+static void mbloop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                               int32_t pitch,
+                                               const uint8_t b_limit_in,
+                                               const uint8_t limit_in,
+                                               const uint8_t thresh_in)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+    /* row0-7 are read from U and row8-15 from V, so one transpose serves both */
+    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src_v - 4, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+    /* U plane: tmp3 / tmp4 supply the words, tmp2 the trailing halfwords */
+    src_u -= 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4);
+    /* V plane: tmp6 / tmp7 supply the words, tmp5 the trailing halfwords */
+    src_v -= 3;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4);
+}
+/* Simple loop filter across a horizontal edge: loads rows p1..q1, stores p0 and q0. */
+void vp8_loop_filter_simple_horizontal_edge_msa(uint8_t *src, int32_t pitch,
+                                                const uint8_t *b_limit_ptr)
+{
+    v16u8 p1, p0, q1, q0;
+    v16u8 mask, b_limit;
+    /* *b_limit_ptr is broadcast to every lane; rows start two above src */
+    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+    LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1);
+    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+    ST_UB2(p0, q0, (src - pitch), pitch);
+}
+/* Simple loop filter across a vertical edge, 16 rows per call: each row is read
+ * from src - 2 (p1 p0 q0 q1) and the p0 / q0 pair is stored back at src - 1. */
+void vp8_loop_filter_simple_vertical_edge_msa(uint8_t *src, int32_t pitch,
+                                              const uint8_t *b_limit_ptr)
+{
+    uint8_t *temp_src;
+    v16u8 p1, p0, q1, q0;
+    v16u8 mask, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1;
+
+    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+    temp_src = src - 2;
+    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p1, p0, q0, q1);
+    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+    ILVRL_B2_SH(q0, p0, tmp1, tmp0);
+    /* four stores of four rows each; no pointer bump is needed after the last */
+    src -= 1;
+    ST2x4_UB(tmp1, 0, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp1, 4, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp0, 0, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp0, 4, src, pitch);
+}
+/* VP8_LPF_FILTER4_4W across a horizontal edge of the U and V planes, both in one pass. */
+static void loop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                               int32_t pitch,
+                                               const uint8_t b_limit_in,
+                                               const uint8_t limit_in,
+                                               const uint8_t thresh_in)
+{
+    uint64_t p1_d, p0_d, q0_d, q1_d;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    /* -4 rows for the load, then +5 rows: each pointer ends one row below the edge */
+    src_u = src_u - (pitch << 2);
+    LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+    src_u += (5 * pitch);
+    src_v = src_v - (pitch << 2);
+    LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+    src_v += (5 * pitch);
+
+    /* the right 8 elements of each combined vector hold U pixels and
+       the left 8 elements hold V pixels */
+    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    /* U plane: doubleword 0; q1, q0, p0, p1 are passed with a negative stride */
+    p1_d = __msa_copy_u_d((v2i64)p1, 0);
+    p0_d = __msa_copy_u_d((v2i64)p0, 0);
+    q0_d = __msa_copy_u_d((v2i64)q0, 0);
+    q1_d = __msa_copy_u_d((v2i64)q1, 0);
+    SD4(q1_d, q0_d, p0_d, p1_d, src_u, (- pitch));
+    /* V plane: doubleword 1, same order */
+    p1_d = __msa_copy_u_d((v2i64)p1, 1);
+    p0_d = __msa_copy_u_d((v2i64)p0, 1);
+    q0_d = __msa_copy_u_d((v2i64)q0, 1);
+    q1_d = __msa_copy_u_d((v2i64)q1, 1);
+    SD4(q1_d, q0_d, p0_d, p1_d, src_v, (- pitch));
+}
+/* VP8_LPF_FILTER4_4W across a vertical edge of the U and V planes: 8 rows of each. */
+static void loop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                             int32_t pitch,
+                                             const uint8_t b_limit_in,
+                                             const uint8_t limit_in,
+                                             const uint8_t thresh_in)
+{
+    uint8_t *temp_src_u, *temp_src_v;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    /* row0-7 are read from U and row8-15 from V, so one transpose serves both */
+    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src_v - 4, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
+    tmp0 = (v4i32)__msa_ilvl_b((v16i8)p0, (v16i8)p1);
+    tmp1 = (v4i32)__msa_ilvl_b((v16i8)q1, (v16i8)q0);
+    ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
+    /* U plane: words p1 p0 q0 q1 at src_u - 2, from tmp2 then tmp3 four rows on */
+    temp_src_u = src_u - 2;
+    ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
+    temp_src_u += 4 * pitch;
+    ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);
+    /* V plane: same layout, from tmp4 then tmp5 */
+    temp_src_v = src_v - 2;
+    ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
+    temp_src_v += 4 * pitch;
+    ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
+}
+/* Horizontal macroblock-edge filter: luma always, chroma only when src_u is non-NULL. */
+void vp8_loop_filter_mbh_msa(uint8_t *src_y, uint8_t *src_u,
+                             uint8_t *src_v, int32_t pitch_y,
+                             int32_t pitch_u_v,
+                             loop_filter_info *lpf_info_ptr)
+{
+    /* The same three thresholds drive the luma and the chroma pass. */
+    const uint8_t mb_limit = *lpf_info_ptr->mblim;
+    const uint8_t in_limit = *lpf_info_ptr->lim;
+    const uint8_t hev_thresh = *lpf_info_ptr->hev_thr;
+
+    mbloop_filter_horizontal_edge_y_msa(src_y, pitch_y,
+                                        mb_limit, in_limit, hev_thresh);
+    if (!src_u) return;   /* no chroma plane supplied */
+
+    mbloop_filter_horizontal_edge_uv_msa(src_u, src_v, pitch_u_v,
+                                         mb_limit, in_limit, hev_thresh);
+}
+/* Vertical macroblock-edge filter: luma always, chroma only when src_u is non-NULL. */
+void vp8_loop_filter_mbv_msa(uint8_t *src_y, uint8_t *src_u,
+                             uint8_t *src_v, int32_t pitch_y,
+                             int32_t pitch_u_v,
+                             loop_filter_info *lpf_info_ptr)
+{
+    /* The same three thresholds drive the luma and the chroma pass. */
+    const uint8_t mb_limit = *lpf_info_ptr->mblim;
+    const uint8_t in_limit = *lpf_info_ptr->lim;
+    const uint8_t hev_thresh = *lpf_info_ptr->hev_thr;
+
+    mbloop_filter_vertical_edge_y_msa(src_y, pitch_y,
+                                      mb_limit, in_limit, hev_thresh);
+    if (!src_u) return;   /* no chroma plane supplied */
+
+    mbloop_filter_vertical_edge_uv_msa(src_u, src_v, pitch_u_v,
+                                       mb_limit, in_limit, hev_thresh);
+}
+/*
+ * Filter the horizontal edges that lie inside one macroblock.
+ *
+ * src_y, src_u, src_v  luma / chroma plane pointers
+ * pitch_y, pitch_u_v   row strides of the luma / chroma planes
+ * lpf_info_ptr         supplies blim, lim and hev_thr
+ *
+ * Luma edges at rows 4, 8 and 12 are filtered; the single chroma edge at
+ * row 4 is filtered only when src_u is non-NULL.
+ */
+void vp8_loop_filter_bh_msa(uint8_t *src_y, uint8_t *src_u,
+                            uint8_t *src_v, int32_t pitch_y,
+                            int32_t pitch_u_v,
+                            loop_filter_info *lpf_info_ptr)
+{
+    int32_t edge_row;
+
+    /* The dual kernel takes two threshold sets (one per 8-lane half);
+     * both halves get the same set here. */
+    for (edge_row = 4; edge_row <= 12; edge_row += 4)
+    {
+        loop_filter_horizontal_4_dual_msa(src_y + edge_row * pitch_y, pitch_y,
+                                          lpf_info_ptr->blim,
+                                          lpf_info_ptr->lim,
+                                          lpf_info_ptr->hev_thr,
+                                          lpf_info_ptr->blim,
+                                          lpf_info_ptr->lim,
+                                          lpf_info_ptr->hev_thr);
+    }
+
+    if (!src_u) return;
+
+    loop_filter_horizontal_edge_uv_msa(src_u + (4 * pitch_u_v),
+                                       src_v + (4 * pitch_u_v), pitch_u_v,
+                                       *lpf_info_ptr->blim, *lpf_info_ptr->lim,
+                                       *lpf_info_ptr->hev_thr);
+}
+/*
+ * Filter the vertical edges that lie inside one macroblock.
+ *
+ * Luma edges at columns 4, 8 and 12 are filtered; the single chroma edge at
+ * column 4 is filtered only when src_u is non-NULL.  The thresholds (blim,
+ * lim, hev_thr) are taken from lpf_info_ptr.
+ */
+void vp8_loop_filter_bv_msa(uint8_t *src_y, uint8_t *src_u,
+                            uint8_t *src_v, int32_t pitch_y,
+                            int32_t pitch_u_v,
+                            loop_filter_info *lpf_info_ptr)
+{
+    int32_t edge_col;
+
+    /* Both 8-lane halves of the dual kernel get the same threshold set. */
+    for (edge_col = 4; edge_col <= 12; edge_col += 4)
+    {
+        loop_filter_vertical_4_dual_msa(src_y + edge_col, pitch_y,
+                                        lpf_info_ptr->blim,
+                                        lpf_info_ptr->lim,
+                                        lpf_info_ptr->hev_thr,
+                                        lpf_info_ptr->blim,
+                                        lpf_info_ptr->lim,
+                                        lpf_info_ptr->hev_thr);
+    }
+
+    if (!src_u) return;
+
+    /* chroma: one edge, U and V handled by a single call */
+    loop_filter_vertical_edge_uv_msa(src_u + 4, src_v + 4, pitch_u_v,
+                                     *lpf_info_ptr->blim,
+                                     *lpf_info_ptr->lim,
+                                     *lpf_info_ptr->hev_thr);
+}
+/* Simple filter on the horizontal luma edges at rows 4, 8 and 12 of src_y. */
+void vp8_loop_filter_bhs_msa(uint8_t *src_y, int32_t pitch_y,
+                             const uint8_t *b_limit_ptr)
+{
+    int32_t row;
+    for (row = 4; row <= 12; row += 4)
+    {
+        vp8_loop_filter_simple_horizontal_edge_msa(src_y + (row * pitch_y),
+                                                   pitch_y, b_limit_ptr);
+    }
+}
+/* Simple filter on the vertical luma edges at columns 4, 8 and 12 of src_y. */
+void vp8_loop_filter_bvs_msa(uint8_t *src_y, int32_t pitch_y,
+                             const uint8_t *b_limit_ptr)
+{
+    uint8_t *edge;
+    for (edge = src_y + 4; edge < src_y + 16; edge += 4)
+        vp8_loop_filter_simple_vertical_edge_msa(edge, pitch_y, b_limit_ptr);
+}
diff --git a/libs/libvpx/vp8/common/mips/msa/mfqe_msa.c b/libs/libvpx/vp8/common/mips/msa/mfqe_msa.c
new file mode 100644
index 0000000000..3e7629f3a1
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/msa/mfqe_msa.c
@@ -0,0 +1,146 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/postproc.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+static void filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
+                                    uint8_t *dst_ptr, int32_t dst_stride,
+                                    int32_t src_weight)
+{   /* Blends src into dst in place; src is weighted by src_weight. */
+    int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+    int32_t row;
+    uint64_t src0_d, src1_d, dst0_d, dst1_d;
+    v16i8 src0 = { 0 };
+    v16i8 src1 = { 0 };
+    v16i8 dst0 = { 0 };
+    v16i8 dst1 = { 0 };
+    v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+    /* Broadcast both weights; together they sum to 1 << MFQE_PRECISION. */
+    src_wt = __msa_fill_h(src_weight);
+    dst_wt = __msa_fill_h(dst_weight);
+    /* Two passes of four rows; two 64-bit rows presumably share a vector. */
+    for (row = 2; row--;)
+    {
+        LD2(src_ptr, src_stride, src0_d, src1_d);
+        src_ptr += (2 * src_stride);
+        LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
+        INSERT_D2_SB(src0_d, src1_d, src0);
+        INSERT_D2_SB(dst0_d, dst1_d, dst0);
+        /* Second row pair; dst_ptr has not advanced yet, hence the offset. */
+        LD2(src_ptr, src_stride, src0_d, src1_d);
+        src_ptr += (2 * src_stride);
+        LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
+        INSERT_D2_SB(src0_d, src1_d, src1);
+        INSERT_D2_SB(dst0_d, dst1_d, dst1);
+        /* First pair: src * src_wt + dst * dst_wt, SRARI shift, pack, store. */
+        UNPCK_UB_SH(src0, src_r, src_l);
+        UNPCK_UB_SH(dst0, dst_r, dst_l);
+        res_h_r = (src_r * src_wt);
+        res_h_r += (dst_r * dst_wt);
+        res_h_l = (src_l * src_wt);
+        res_h_l += (dst_l * dst_wt);
+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+        dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+        ST8x2_UB(dst0, dst_ptr, dst_stride);
+        dst_ptr += (2 * dst_stride);
+        /* Second pair: identical arithmetic on src1/dst1. */
+        UNPCK_UB_SH(src1, src_r, src_l);
+        UNPCK_UB_SH(dst1, dst_r, dst_l);
+        res_h_r = (src_r * src_wt);
+        res_h_r += (dst_r * dst_wt);
+        res_h_l = (src_l * src_wt);
+        res_h_l += (dst_l * dst_wt);
+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+        dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+        ST8x2_UB(dst1, dst_ptr, dst_stride);
+        dst_ptr += (2 * dst_stride);
+    }
+}
+
+static void filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
+                                      uint8_t *dst_ptr, int32_t dst_stride,
+                                      int32_t src_weight)
+{   /* Blends src into dst in place; src is weighted by src_weight. */
+    int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+    int32_t row;
+    v16i8 src0, src1, src2, src3;
+    v16i8 dst0, dst1, dst2, dst3;
+    v8i16 src_wt, dst_wt;
+    v8i16 res_h_r, res_h_l;
+    v8i16 src_r, src_l, dst_r, dst_l;
+    /* Broadcast both weights; together they sum to 1 << MFQE_PRECISION. */
+    src_wt = __msa_fill_h(src_weight);
+    dst_wt = __msa_fill_h(dst_weight);
+    /* Four passes of four rows; one vector per row for src and for dst. */
+    for (row = 4; row--;)
+    {
+        LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
+        src_ptr += (4 * src_stride);
+        LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
+        /* Row 0: src * src_wt + dst * dst_wt, SRARI shift, pack, store. */
+        UNPCK_UB_SH(src0, src_r, src_l);
+        UNPCK_UB_SH(dst0, dst_r, dst_l);
+        res_h_r = (src_r * src_wt);
+        res_h_r += (dst_r * dst_wt);
+        res_h_l = (src_l * src_wt);
+        res_h_l += (dst_l * dst_wt);
+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+        dst_ptr += dst_stride;
+        /* Row 1: same arithmetic. */
+        UNPCK_UB_SH(src1, src_r, src_l);
+        UNPCK_UB_SH(dst1, dst_r, dst_l);
+        res_h_r = (src_r * src_wt);
+        res_h_r += (dst_r * dst_wt);
+        res_h_l = (src_l * src_wt);
+        res_h_l += (dst_l * dst_wt);
+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+        dst_ptr += dst_stride;
+        /* Row 2: same arithmetic. */
+        UNPCK_UB_SH(src2, src_r, src_l);
+        UNPCK_UB_SH(dst2, dst_r, dst_l);
+        res_h_r = (src_r * src_wt);
+        res_h_r += (dst_r * dst_wt);
+        res_h_l = (src_l * src_wt);
+        res_h_l += (dst_l * dst_wt);
+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+        dst_ptr += dst_stride;
+        /* Row 3: same arithmetic. */
+        UNPCK_UB_SH(src3, src_r, src_l);
+        UNPCK_UB_SH(dst3, dst_r, dst_l);
+        res_h_r = (src_r * src_wt);
+        res_h_r += (dst_r * dst_wt);
+        res_h_l = (src_l * src_wt);
+        res_h_l += (dst_l * dst_wt);
+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+        dst_ptr += dst_stride;
+    }
+}
+
+void vp8_filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
+                                   uint8_t *dst_ptr, int32_t dst_stride,
+                                   int32_t src_weight)
+{   /* Non-static entry point; forwards every argument to the static kernel. */
+    filter_by_weight16x16_msa(src_ptr, src_stride, dst_ptr, dst_stride,
+                              src_weight);
+}
+
+void vp8_filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
+                                 uint8_t *dst_ptr, int32_t dst_stride,
+                                 int32_t src_weight)
+{   /* Non-static entry point; forwards every argument to the static kernel. */
+    filter_by_weight8x8_msa(src_ptr, src_stride, dst_ptr, dst_stride,
+                            src_weight);
+}
diff --git a/libs/libvpx/vp8/common/mips/msa/postproc_msa.c b/libs/libvpx/vp8/common/mips/msa/postproc_msa.c
new file mode 100644
index 0000000000..c88f30238b
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/msa/postproc_msa.c
@@ -0,0 +1,851 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+static const int16_t vp8_rv_msa[] =
+{   /* 440 entries, all in 0..14; the code reading it is outside this excerpt. */
+    8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
+    0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
+    10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
+    8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+    8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
+    1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
+    3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
+    11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
+    14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
+    4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
+    7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
+    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+    3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
+    11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
+    14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
+    5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
+    0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
+};
+
+#define VP8_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
+                                out0, out1, out2, out3,                  \
+                                out4, out5, out6, out7,                  \
+                                out8, out9, out10, out11,                \
+                                out12, out13, out14, out15)              \
+{   /* Eight input vectors -> sixteen outputs (named an 8x16 transpose). */ \
+    v8i16 temp0, temp1, temp2, temp3, temp4;                             \
+    v8i16 temp5, temp6, temp7, temp8, temp9;                             \
+    /* Byte, then halfword, then word interleaves of the inputs. */      \
+    ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                   \
+               temp0, temp1, temp2, temp3);                              \
+    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
+    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                             \
+    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
+    ILVRL_W2_SH(temp5, temp4, temp8, temp9);                             \
+    ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                   \
+               temp0, temp1, temp2, temp3);                              \
+    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
+    ILVRL_W2_UB(temp5, temp4, out8, out10);                              \
+    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                \
+    ILVRL_W2_UB(temp5, temp4, out12, out14);                             \
+    out0 = (v16u8)temp6;  /* Even outputs come from the interleaves; */  \
+    out2 = (v16u8)temp7;  /* each odd output below is the upper      */  \
+    out4 = (v16u8)temp8;  /* doubleword of the preceding even output */  \
+    out6 = (v16u8)temp9;  /* (ilvl_d of a vector with itself).       */  \
+    out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8);                \
+    out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10);             \
+    out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12);             \
+    out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14);             \
+    out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                \
+    out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2);                \
+    out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4);                \
+    out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6);                \
+}
+
+#define VP8_AVER_IF_RETAIN(above2_in, above1_in, src_in,    \
+                           below1_in, below2_in, ref, out)  \
+{                                                           \
+    v16u8 temp0, temp1;                                     \
+    /* out = aver(src, aver(aver(above), aver(below))) */   \
+    temp1 = __msa_aver_u_b(above2_in, above1_in);           \
+    temp0 = __msa_aver_u_b(below2_in, below1_in);           \
+    temp1 = __msa_aver_u_b(temp1, temp0);                   \
+    out = __msa_aver_u_b(src_in, temp1);                    \
+    temp0 = __msa_asub_u_b(src_in, above2_in);              \
+    temp1 = __msa_asub_u_b(src_in, above1_in);              \
+    temp0 = (temp0 < ref);  /* |src - nbr| < ref mask */    \
+    temp1 = (temp1 < ref);                                  \
+    temp0 = temp0 & temp1;  /* AND over all four taps */    \
+    temp1 = __msa_asub_u_b(src_in, below1_in);              \
+    temp1 = (temp1 < ref);                                  \
+    temp0 = temp0 & temp1;                                  \
+    temp1 = __msa_asub_u_b(src_in, below2_in);              \
+    temp1 = (temp1 < ref);                                  \
+    temp0 = temp0 & temp1;                                  \
+    out = __msa_bmz_v(out, src_in, temp0);  /* else src */  \
+}
+
+#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7,        \
+                         in8, in9, in10, in11, in12, in13, in14, in15)  \
+{                                                                       \
+    v8i16 temp0, temp1, temp2, temp3, temp4;                            \
+    v8i16 temp5, temp6, temp7, temp8, temp9;                            \
+    /* In place: reads in0..in15, writes results to in0..in11 only. */  \
+    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                       \
+    ILVRL_H2_SH(temp1, temp0, temp2, temp3);                            \
+    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                       \
+    ILVRL_H2_SH(temp1, temp0, temp4, temp5);                            \
+    ILVRL_W2_SH(temp4, temp2, temp0, temp1);                            \
+    ILVRL_W2_SH(temp5, temp3, temp2, temp3);                            \
+    /* Same interleave ladder for in8..in15, issued once. */            \
+    ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5);                     \
+    ILVRL_H2_SH(temp5, temp4, temp6, temp7);                            \
+    ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5);                   \
+    ILVRL_H2_SH(temp5, temp4, temp8, temp9);                            \
+    ILVRL_W2_SH(temp8, temp6, temp4, temp5);                            \
+    ILVRL_W2_SH(temp9, temp7, temp6, temp7);                            \
+    ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9);                       \
+    ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2);                   \
+    in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0);              \
+    in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1);              \
+    ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1);                       \
+    ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6);                   \
+    in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2);              \
+    in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3);              \
+    ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14,            \
+               temp2, temp3, temp4, temp5);                             \
+    ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4,  \
+               temp6, temp7, temp8, temp9);                             \
+    ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1);               \
+    in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0);              \
+    in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0);              \
+    ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3);               \
+    in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2);             \
+    in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2);             \
+}
+
+#define VP8_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5,    \
+                                in6, in7, in8, in9, in10, in11)  \
+{                                                                \
+    v8i16 temp0, temp1, temp2, temp3;                            \
+    v8i16 temp4, temp5, temp6, temp7;                            \
+    /* In place: reads in0..in7, writes all of in0..in11. */     \
+    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                \
+    ILVRL_H2_SH(temp1, temp0, temp2, temp3);                     \
+    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                \
+    ILVRL_H2_SH(temp1, temp0, temp4, temp5);                     \
+    ILVRL_W2_SH(temp4, temp2, temp0, temp1);                     \
+    ILVRL_W2_SH(temp5, temp3, temp2, temp3);                     \
+    ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5);                \
+    temp4 = __msa_ilvr_h(temp5, temp4);                          \
+    ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7);                \
+    temp5 = __msa_ilvr_h(temp7, temp6);                          \
+    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                     \
+    in0 = (v16u8)temp0;   /* Even outputs are the interleave  */ \
+    in2 = (v16u8)temp1;   /* results; each odd output below   */ \
+    in4 = (v16u8)temp2;   /* is the upper doubleword of the   */ \
+    in6 = (v16u8)temp3;   /* matching even one (ilvl_d of a   */ \
+    in8 = (v16u8)temp6;   /* vector with itself).             */ \
+    in10 = (v16u8)temp7;                                         \
+    in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0);       \
+    in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1);       \
+    in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2);       \
+    in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3);       \
+    in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6);       \
+    in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7);      \
+}
+
+static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+                                            int32_t src_stride,
+                                            int32_t dst_stride,
+                                            int32_t cols, uint8_t *f)
+{   /* 8-row deblock: vertical pass src -> dst, then horizontal pass on dst. */
+    uint8_t *p_src = src_ptr;
+    uint8_t *p_dst = dst_ptr;
+    uint8_t *f_orig = f; /* f supplies one threshold byte per column */
+    uint8_t *p_dst_st = dst_ptr;
+    uint16_t col;
+    uint64_t out0, out1, out2, out3;
+    v16u8 above2, above1, below2, below1, src, ref, ref_temp;
+    v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
+    v16u8 inter6, inter7, inter8, inter9, inter10, inter11;
+    /* Pass 1: 5-tap vertical filter, 16 columns x 8 rows per iteration. */
+    for (col = (cols / 16); col--;)
+    {
+        ref = LD_UB(f);
+        LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+        src = LD_UB(p_src);
+        LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+        above2 = LD_UB(p_src + 3 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+        above1 = LD_UB(p_src + 4 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+        src = LD_UB(p_src + 5 * src_stride);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+        below1 = LD_UB(p_src + 6 * src_stride);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+        below2 = LD_UB(p_src + 7 * src_stride);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+        above2 = LD_UB(p_src + 8 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+        above1 = LD_UB(p_src + 9 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+        ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+               p_dst, dst_stride);
+
+        p_dst += 16;
+        p_src += 16;
+        f += 16;
+    }
+    /* Tail: the final 8 columns when cols % 16 != 0 (assumes cols % 8 == 0). */
+    if (0 != (cols % 16))
+    {   /* same filter; only the low 8 bytes of each result are stored */
+        ref = LD_UB(f);
+        LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+        src = LD_UB(p_src);
+        LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+        above2 = LD_UB(p_src + 3 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+        above1 = LD_UB(p_src + 4 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+        src = LD_UB(p_src + 5 * src_stride);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+        below1 = LD_UB(p_src + 6 * src_stride);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+        below2 = LD_UB(p_src + 7 * src_stride);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+        above2 = LD_UB(p_src + 8 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+        above1 = LD_UB(p_src + 9 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+        out0 = __msa_copy_u_d((v2i64)inter0, 0);
+        out1 = __msa_copy_u_d((v2i64)inter1, 0);
+        out2 = __msa_copy_u_d((v2i64)inter2, 0);
+        out3 = __msa_copy_u_d((v2i64)inter3, 0);
+        SD4(out0, out1, out2, out3, p_dst, dst_stride);
+
+        out0 = __msa_copy_u_d((v2i64)inter4, 0);
+        out1 = __msa_copy_u_d((v2i64)inter5, 0);
+        out2 = __msa_copy_u_d((v2i64)inter6, 0);
+        out3 = __msa_copy_u_d((v2i64)inter7, 0);
+        SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
+    }
+    /* Pass 2: filter dst in place along rows, 8 columns per iteration. */
+    f = f_orig;
+    p_dst = dst_ptr - 2;
+    LD_UB8(p_dst, dst_stride,
+           inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7);
+    /* Transpose to column vectors, filter, transpose back, store 8 bytes/row. */
+    for (col = 0; col < (cols / 8); ++col)
+    {
+        ref = LD_UB(f);
+        f += 8;
+        VP8_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3,
+                                inter4, inter5, inter6, inter7,
+                                inter8, inter9, inter10, inter11);
+        if (0 == col) /* left edge: column 0 stands in for both left taps */
+        {
+            above2 = inter2;
+            above1 = inter2;
+        }
+        else
+        {
+            above2 = inter0;
+            above1 = inter1;
+        }
+        src = inter2;
+        below1 = inter3;
+        below2 = inter4;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
+                           ref_temp, inter2);
+        above2 = inter5;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
+                           ref_temp, inter3);
+        above1 = inter6;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
+                           ref_temp, inter4);
+        src = inter7;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src,
+                           ref_temp, inter5);
+        below1 = inter8;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1,
+                           ref_temp, inter6);
+        below2 = inter9;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
+                           ref_temp, inter7);
+        if (col == (cols / 8 - 1)) /* right edge: last column fills in */
+        {
+            above2 = inter9;
+        }
+        else
+        {
+            above2 = inter10;
+        }
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
+                           ref_temp, inter8);
+        if (col == (cols / 8 - 1))
+        {
+            above1 = inter9;
+        }
+        else
+        {
+            above1 = inter11;
+        }
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
+                           ref_temp, inter9);
+        TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
+                           inter8, inter9, inter2, inter3, inter4, inter5,
+                           inter6, inter7, inter8, inter9);
+        p_dst += 8; /* next block's rows are loaded before overlapping stores */
+        LD_UB2(p_dst, dst_stride, inter0, inter1);
+        ST8x1_UB(inter2, p_dst_st);
+        ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+        LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+        ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+        ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+        LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+        ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+        ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+        LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+        ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+        ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+        p_dst_st += 8;
+    }
+}
+
+static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+                                          int32_t src_stride,
+                                          int32_t dst_stride,
+                                          int32_t cols, uint8_t *f)
+{   /* 16-row deblock: vertical pass src -> dst, then horizontal pass on dst. */
+    uint8_t *p_src = src_ptr;
+    uint8_t *p_dst = dst_ptr;
+    uint8_t *p_dst_st = dst_ptr;
+    uint8_t *f_orig = f; /* f supplies one threshold byte per column */
+    uint16_t col;
+    v16u8 above2, above1, below2, below1;
+    v16u8 src, ref, ref_temp;
+    v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
+    v16u8 inter7, inter8, inter9, inter10, inter11;
+    v16u8 inter12, inter13, inter14, inter15;
+    /* Pass 1: 5-tap vertical filter, 16 columns x 16 rows per iteration. */
+    for (col = (cols / 16); col--;)
+    {
+        ref = LD_UB(f);
+        LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+        src = LD_UB(p_src);
+        LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+        above2 = LD_UB(p_src + 3 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+        above1 = LD_UB(p_src + 4 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+        src = LD_UB(p_src + 5 * src_stride);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+        below1 = LD_UB(p_src + 6 * src_stride);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+        below2 = LD_UB(p_src + 7 * src_stride);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+        above2 = LD_UB(p_src + 8 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+        above1 = LD_UB(p_src + 9 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+        src = LD_UB(p_src + 10 * src_stride);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
+        below1 = LD_UB(p_src + 11 * src_stride);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
+        below2 = LD_UB(p_src + 12 * src_stride);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
+        above2 = LD_UB(p_src + 13 * src_stride);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
+        above1 = LD_UB(p_src + 14 * src_stride);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
+        src = LD_UB(p_src + 15 * src_stride);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
+        below1 = LD_UB(p_src + 16 * src_stride);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
+        below2 = LD_UB(p_src + 17 * src_stride);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
+        ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+               p_dst, dst_stride);
+        ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13,
+               inter14, inter15, p_dst + 8 * dst_stride, dst_stride);
+        p_src += 16;
+        p_dst += 16;
+        f += 16;
+    }
+    /* Pass 2: filter dst in place along rows, 8 columns per iteration. */
+    f = f_orig;
+    p_dst = dst_ptr - 2;
+    LD_UB8(p_dst, dst_stride,
+           inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7);
+    LD_UB8(p_dst + 8 * dst_stride, dst_stride,
+           inter8, inter9, inter10, inter11, inter12, inter13,
+           inter14, inter15);
+    /* Transpose to column vectors, filter, transpose back, store 8 bytes/row. */
+    for (col = 0; col < cols / 8; ++col)
+    {
+        ref = LD_UB(f);
+        f += 8;
+        TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5,
+                         inter6, inter7, inter8, inter9, inter10, inter11,
+                         inter12, inter13, inter14, inter15);
+        if (0 == col) /* left edge: column 0 stands in for both left taps */
+        {
+            above2 = inter2;
+            above1 = inter2;
+        }
+        else
+        {
+            above2 = inter0;
+            above1 = inter1;
+        }
+        /* Byte k of ref, splatted, is the threshold for column k of the block. */
+        src = inter2;
+        below1 = inter3;
+        below2 = inter4;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
+                           ref_temp, inter2);
+        above2 = inter5;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
+                           ref_temp, inter3);
+        above1 = inter6;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
+                           ref_temp, inter4);
+        src = inter7;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
+        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src,
+                           ref_temp, inter5);
+        below1 = inter8;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
+        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1,
+                           ref_temp, inter6);
+        below2 = inter9;
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
+        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
+                           ref_temp, inter7);
+        if (col == (cols / 8 - 1)) /* right edge: last column fills in */
+        {
+            above2 = inter9;
+        }
+        else
+        {
+            above2 = inter10;
+        }
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
+        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
+                           ref_temp, inter8);
+        if (col == (cols / 8 - 1))
+        {
+            above1 = inter9;
+        }
+        else
+        {
+            above1 = inter11;
+        }
+        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
+        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
+                           ref_temp, inter9);
+        VP8_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5,
+                                inter6, inter7, inter8, inter9,
+                                inter2, inter3, inter4, inter5,
+                                inter6, inter7, inter8, inter9,
+                                inter10, inter11, inter12, inter13,
+                                inter14, inter15, above2, above1);
+        /* Each row pair of the next block is loaded before the store it overlaps. */
+        p_dst += 8;
+        LD_UB2(p_dst, dst_stride, inter0, inter1);
+        ST8x1_UB(inter2, p_dst_st);
+        ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+        LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+        ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+        ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+        LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+        ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+        ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+        LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+        ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+        ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+        LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
+        ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
+        ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
+        LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
+        ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
+        ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
+        LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
+        ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
+        ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
+        LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
+        ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
+        ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
+        p_dst_st += 8;
+    }
+}
+
+void vp8_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t src_stride,
+                                              int32_t dst_stride, int32_t cols,
+                                              uint8_t *f, int32_t size)
+{
+    switch (size) /* any other size is a no-op */
+    {
+        case 8: /* 8-row blocks go to the chroma kernel */
+            postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride,
+                                            cols, f);
+            break;
+        case 16: /* 16-row blocks go to the luma kernel */
+            postproc_down_across_luma_msa(src, dst, src_stride, dst_stride,
+                                          cols, f);
+            break;
+    }
+}
+
+void vp8_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
+                                   int32_t rows, int32_t cols, int32_t flimit)
+{   /* Filters `rows` rows in place along each row, 16 pixels per step; consecutive rows are `pitch` bytes apart. */
+    int32_t row, col, cnt;
+    uint8_t *src_dup = src_ptr;  /* start of the row currently being filtered */
+    v16u8 src0, src, tmp_orig;
+    v16u8 tmp = { 0 };  /* not reset per row: keeps whatever the previous row's last group left in it */
+    v16i8 zero = { 0 };
+    v8u16 sum_h, src_r_h, src_l_h;
+    v4u32 src_r_w, src_l_w;
+    v4i32 flimit_vec;
+    /* Broadcast flimit to every 32-bit lane for the vector compares below. */
+    flimit_vec = __msa_fill_w(flimit);
+    for (row = rows; row--;)
+    {
+        int32_t sum_sq = 0;
+        int32_t sum = 0;
+        src0 = (v16u8)__msa_fill_b(src_dup[0]);
+        ST8x1_UB(src0, (src_dup - 8));  /* left padding: copies of the first pixel written before the row start (8 bytes, going by the macro name) */
+        /* Right padding: 16 copies of the last pixel at cols..cols+15, plus one more byte at cols+16. */
+        src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]);
+        ST_UB(src0, src_dup + cols);
+        src_dup[cols + 16] = src_dup[cols - 1];
+        tmp_orig = (v16u8)__msa_ldi_b(0);
+        tmp_orig[15] = tmp[15];
+        src = LD_UB(src_dup - 8);
+        src[15] = 0;  /* zero the byte at offset +7 so only offsets -8..+6 (15 pixels) seed the sums */
+        ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+        src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);  /* squares of the seed pixels, added pairwise into 32-bit lanes */
+        src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
+        sum_sq = HADD_SW_S32(src_r_w);  /* presumably a horizontal add across lanes -- confirm in vp8_macros_msa.h */
+        sum_sq += HADD_SW_S32(src_l_w);
+        sum_h = __msa_hadd_u_h(src, src);
+        sum = HADD_UH_U32(sum_h);
+        {
+            v16u8 src7, src8, src_r, src_l;
+            v16i8 mask;
+            v8u16 add_r, add_l;
+            v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
+            v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
+            v4i32 sub0, sub1, sub2, sub3;
+            v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+            v4i32 mul0, mul1, mul2, mul3;
+            v4i32 total0, total1, total2, total3;
+            v8i16 const8 = __msa_fill_h(8);
+            /* src7 / src8: for each pixel of the group, the pixel 7 to its right and the pixel 8 to its left. */
+            src7 = LD_UB(src_dup + 7);
+            src8 = LD_UB(src_dup - 8);
+            for (col = 0; col < (cols >> 4); ++col)  /* whole groups of 16 only; a cols % 16 tail is not visited */
+            {
+                ILVRL_B2_UB(src7, src8, src_r, src_l);
+                HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);  /* presumably the per-pixel difference of each interleaved src7/src8 pair -- confirm */
+                /* Serial prefix sum: each lane becomes the running window sum for its pixel, continuing from `sum`. */
+                sum_r[0] = sum + sub_r[0];
+                for (cnt = 0; cnt < 7; ++cnt)
+                {
+                    sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
+                }
+                sum_l[0] = sum_r[7] + sub_l[0];  /* second half of the group continues from the first */
+                for (cnt = 0; cnt < 7; ++cnt)
+                {
+                    sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
+                }
+                sum = sum_l[7];  /* carried into the next group of 16 */
+                src = LD_UB(src_dup + 16 * col);
+                ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+                src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);  /* (8 + window sum + pixel) >> 4 per 16-bit lane; src7/src8 reused as scratch */
+                src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
+                tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);  /* keep the even (low) byte of every halfword: 16 candidate output pixels */
+                /* If HADD/HSUB yield a+b and a-b as their names suggest, add*sub = a*a - b*b: the change in the window's sum of squares. */
+                HADD_UB2_UH(src_r, src_l, add_r, add_l);
+                UNPCK_SH_SW(sub_r, sub0, sub1);
+                UNPCK_SH_SW(sub_l, sub2, sub3);
+                ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
+                ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
+                MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3,
+                     mul0, mul1, mul2, mul3);
+                sum_sq0[0] = sum_sq + mul0[0];  /* same serial prefix-sum scheme, now for the sum of squares, 4 lanes at a time */
+                for (cnt = 0; cnt < 3; ++cnt)
+                {
+                    sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
+                }
+                sum_sq1[0] = sum_sq0[3] + mul1[0];
+                for (cnt = 0; cnt < 3; ++cnt)
+                {
+                    sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
+                }
+                sum_sq2[0] = sum_sq1[3] + mul2[0];
+                for (cnt = 0; cnt < 3; ++cnt)
+                {
+                    sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
+                }
+                sum_sq3[0] = sum_sq2[3] + mul3[0];
+                for (cnt = 0; cnt < 3; ++cnt)
+                {
+                    sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
+                }
+                sum_sq = sum_sq3[3];  /* carried into the next group of 16 */
+                /* Per pixel: total = 15 * sum_sq - sum * sum; the filtered value is used only where total < flimit. */
+                UNPCK_SH_SW(sum_r, sum0_w, sum1_w);  /* sum*_w are reused here to hold the running sums as 32-bit lanes */
+                UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
+                total0 = sum_sq0 * __msa_ldi_w(15);
+                total0 -= sum0_w * sum0_w;
+                total1 = sum_sq1 * __msa_ldi_w(15);
+                total1 -= sum1_w * sum1_w;
+                total2 = sum_sq2 * __msa_ldi_w(15);
+                total2 -= sum2_w * sum2_w;
+                total3 = sum_sq3 * __msa_ldi_w(15);
+                total3 -= sum3_w * sum3_w;
+                total0 = (total0 < flimit_vec);  /* vector compare: each lane becomes a pass/fail mask */
+                total1 = (total1 < flimit_vec);
+                total2 = (total2 < flimit_vec);
+                total3 = (total3 < flimit_vec);
+                PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+                mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);  /* narrow the four word masks down to one byte per pixel */
+                tmp = __msa_bmz_v(tmp, src, (v16u8)mask);  /* where the mask is zero take the original pixel back from src */
+                /* First group only: store the upper doubleword of tmp_orig over the 8 bytes left of the row. */
+                if (col == 0)
+                {
+                    uint64_t src_d;
+
+                    src_d = __msa_copy_u_d((v2i64)tmp_orig, 1);
+                    SD(src_d, (src_dup - 8));
+                }
+                /* Load the next group's windows BEFORE the store below: src8 reads bytes 8..15 of the group about to be overwritten. */
+                src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
+                src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
+                ST_UB(tmp, (src_dup + (16 * col)));
+            }
+
+            src_dup += pitch;  /* advance to the next row */
+        }
+    }
+}
+
+void vp8_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
+                              int32_t cols, int32_t flimit)
+{   /* Filters dst_ptr in place down each column, one 16-pixel-wide strip at a time; rows are `pitch` bytes apart. */
+    int32_t row, col, cnt, i;
+    const int16_t *rv3 = &vp8_rv_msa[63 & rand()];  /* random start (0..63) into the vp8_rv_msa table, picked once per call */
+    v4i32 flimit_vec;
+    v16u8 dst7, dst8, dst_r_b, dst_l_b;
+    v16i8 mask;
+    v8u16 add_r, add_l;
+    v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
+    v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;
+    /* Broadcast flimit to every 32-bit lane for the vector compares below. */
+    flimit_vec = __msa_fill_w(flimit);
+    /* Whole strips of 16 columns only; a cols % 16 tail is not visited. */
+    for (col = 0; col < (cols >> 4); ++col)
+    {
+        uint8_t *dst_tmp = &dst_ptr[col << 4];  /* top of the current strip; walks down one row per iteration below */
+        v16u8 dst;
+        v16i8 zero = { 0 };
+        v16u8 tmp[16];  /* ring buffer of filtered rows awaiting write-back, indexed by row & 15 */
+        v8i16 mult0, mult1, rv2_0, rv2_1;
+        v8i16 sum0_h = { 0 };  /* running column sums, 16-bit lanes (columns 0-7 / 8-15, going by the _r/_l unpack) */
+        v8i16 sum1_h = { 0 };
+        v4i32 mul0 = { 0 };  /* running column sums of squares, 32-bit lanes */
+        v4i32 mul1 = { 0 };
+        v4i32 mul2 = { 0 };
+        v4i32 mul3 = { 0 };
+        v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+        v4i32 add0, add1, add2, add3;
+        const int16_t *rv2[16];  /* per-column base pointer into the table at rv3 */
+        /* Give each of the 16 columns its own table offset: (absolute column * 17) & 127. */
+        dst = LD_UB(dst_tmp);
+        for (cnt = (col << 4), i = 0; i < 16; ++cnt)
+        {
+            rv2[i] = rv3 + ((cnt * 17) & 127);
+            ++i;
+        }
+        for (cnt = -8; cnt < 0; ++cnt)  /* top padding: copy row 0 into the 8 rows above the strip (writes above dst_ptr) */
+        {
+            ST_UB(dst, dst_tmp + cnt * pitch);
+        }
+        /* Bottom padding: copy the last row into the 17 rows below it (rows .. rows+16). */
+        dst = LD_UB((dst_tmp + (rows - 1) * pitch));
+        for (cnt = rows; cnt < rows + 17; ++cnt)
+        {
+            ST_UB(dst, dst_tmp + cnt * pitch);
+        }
+        for (cnt = -8; cnt <= 6; ++cnt)  /* seed the sums with the 15 rows at offsets -8..+6 */
+        {
+            dst = LD_UB(dst_tmp + (cnt * pitch));
+            UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
+            MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
+            mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0);  /* interleave with zero: widen the 16-bit squares to 32 bits unsigned */
+            mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0);
+            mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1);
+            mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1);
+            ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
+        }
+        /* Runs 8 rows past the end so the delayed write-back below can flush the last 8 real rows. */
+        for (row = 0; row < (rows + 8); ++row)
+        {
+            for (i = 0; i < 8; ++i)  /* gather this row's per-column rounding terms from the table */
+            {
+                rv2_0[i] = *(rv2[i] + (row & 127));
+                rv2_1[i] = *(rv2[i + 8] + (row & 127));
+            }
+            dst7 = LD_UB(dst_tmp + (7 * pitch));  /* row entering the window (7 below the current row) */
+            dst8 = LD_UB(dst_tmp - (8 * pitch));  /* row leaving the window (8 above the current row) */
+            ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);
+            /* If HSUB/HADD yield a-b and a+b as their names suggest, add*sub below is a*a - b*b: the sum-of-squares update. */
+            HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
+            UNPCK_SH_SW(sub_r, sub0, sub1);
+            UNPCK_SH_SW(sub_l, sub2, sub3);
+            sum0_h += sub_r;
+            sum1_h += sub_l;
+
+            HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);
+
+            ILVRL_H2_SW(zero, add_r, add0, add1);
+            ILVRL_H2_SW(zero, add_l, add2, add3);
+            mul0 += add0 * sub0;
+            mul1 += add1 * sub1;
+            mul2 += add2 * sub2;
+            mul3 += add3 * sub3;
+            dst = LD_UB(dst_tmp);
+            ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
+            dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);  /* (table term + window sum + pixel) >> 4; dst7/dst8 reused as scratch */
+            dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
+            tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7);  /* keep the even (low) byte of every halfword */
+            /* Per column: total = 15 * sum_sq - sum * sum; the filtered value is used only where total < flimit. */
+            UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
+            UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
+            total0 = mul0 * __msa_ldi_w(15);
+            total0 -= sum0_w * sum0_w;
+            total1 = mul1 * __msa_ldi_w(15);
+            total1 -= sum1_w * sum1_w;
+            total2 = mul2 * __msa_ldi_w(15);
+            total2 -= sum2_w * sum2_w;
+            total3 = mul3 * __msa_ldi_w(15);
+            total3 -= sum3_w * sum3_w;
+            total0 = (total0 < flimit_vec);  /* vector compare: each lane becomes a pass/fail mask */
+            total1 = (total1 < flimit_vec);
+            total2 = (total2 < flimit_vec);
+            total3 = (total3 < flimit_vec);
+            PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+            mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);  /* narrow the four word masks down to one byte per column */
+            tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask);  /* where the mask is zero take the original pixel back */
+            /* Write-back lags 8 rows: row - 8 was read as dst8 above, so it may be overwritten only now. */
+            if (row >= 8)
+            {
+                ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
+            }
+
+            dst_tmp += pitch;
+        }
+    }
+}
+
+void vp8_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
+                             char blackclamp[16], char whiteclamp[16],  /* only element [0] of each array is read */
+                             char bothclamp[16],  /* not used by this implementation */
+                             uint32_t width, uint32_t height,
+                             int32_t pitch)
+{   /* Clamps each pixel of the plane at start_ptr, then adds a byte of `noise` to it, in place. */
+    uint32_t i, j;
+    /* Rows are handled in pairs; with an odd height the last row is left untouched. */
+    for (i = 0; i < height / 2; ++i)
+    {
+        uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
+        int8_t *ref0_ptr = (int8_t *) (noise + (rand() & 0xff));  /* each row starts at its own random offset (0..255) into noise */
+        uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
+        int8_t *ref1_ptr = (int8_t *) (noise + (rand() & 0xff));
+        for (j = width / 16; j--;)  /* 16 pixels per step; a width % 16 tail is not processed */
+        {
+            v16i8 temp00_s, temp01_s;
+            v16u8 temp00, temp01, black_clamp, white_clamp;
+            v16u8 pos0, ref0, pos1, ref1;
+            v16i8 const127 = __msa_ldi_b(127);
+            /* Load 16 pixels and 16 noise bytes for each of the two rows. */
+            pos0 = LD_UB(pos0_ptr);
+            ref0 = LD_UB(ref0_ptr);
+            pos1 = LD_UB(pos1_ptr);
+            ref1 = LD_UB(ref1_ptr);
+            black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
+            white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
+            temp00 = (pos0 < black_clamp);  /* lower clamp: pixels below black_clamp are raised to it */
+            pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
+            temp01 = (pos1 < black_clamp);
+            pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
+            XORI_B2_128_UB(pos0, pos1);  /* presumably XORs each byte with 128 (unsigned -> signed range) -- confirm in vp8_macros_msa.h */
+            temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);  /* upper limit: white_clamp + 127, signed saturating add */
+            temp00 = (v16u8)(temp00_s < pos0);
+            pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);  /* pixels above the limit are lowered to it */
+            temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+            temp01 = (temp01_s < pos1);
+            pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
+            XORI_B2_128_UB(pos0, pos1);  /* undo the XOR applied before the upper clamp */
+            pos0 += ref0;  /* add the noise bytes lane by lane */
+            ST_UB(pos0, pos0_ptr);
+            pos1 += ref1;
+            ST_UB(pos1, pos1_ptr);
+            pos0_ptr += 16;
+            pos1_ptr += 16;
+            ref0_ptr += 16;
+            ref1_ptr += 16;
+        }
+    }
+}
diff --git a/libs/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c b/libs/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c
new file mode 100644
index 0000000000..fb60fc1346
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c
@@ -0,0 +1,1850 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vp8/common/filter.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_msa[7][8]) =
+{   /* Seven filter rows: six signed taps that sum to 128, then two zero pad bytes. */
+    { 0, -6, 123, 12, -1, 0, 0, 0 },
+    { 2, -11, 108, 36, -8, 1, 0, 0 },  /* New 1/4 pel 6 tap filter */
+    { 0, -9, 93, 50, -6, 0, 0, 0 },
+    { 3, -16, 77, 77, -16, 3, 0, 0 },  /* New 1/2 pel 6 tap filter */
+    { 0, -6, 50, 93, -9, 0, 0, 0 },
+    { 1, -8, 36, 108, -11, 2, 0, 0 },  /* New 1/4 pel 6 tap filter */
+    { 0, -1, 12, 123, -6, 0, 0, 0 },
+};  /* NOTE(review): kernels in this file fetch a row with LD_SH(filter) into a v8i16; if that loads a full 16 bytes it reads past the 8-byte row (and past the table for the last row) -- confirm */
+
+static const uint8_t vp8_mc_filt_mask_arr[16 * 3] =
+{   /* Three 16-byte byte-shuffle masks; the kernels in this file derive further masks by adding 2 and 4 to every index. */
+    /* 8 width cases: adjacent byte pairs (0,1), (1,2) ... (7,8) */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases: pairs (0,1)..(3,4), then the same pairs at indices 16..20 (presumably the second shuffle source -- confirm) */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases: same layout as the previous row with every index advanced by 8 */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                 \
+                        filt_h0, filt_h1, filt_h2)                       \
+({                                                                       \
+    v16i8 vec0_m, vec1_m, vec2_m;                                        \
+    v8i16 hz_out_m;                                                      \
+    /* Shuffle src0/src1 with each mask, then combine with the filters. */ \
+    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,  \
+               vec0_m, vec1_m, vec2_m);                                  \
+    hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m,                      \
+                            filt_h0, filt_h1, filt_h2);                  \
+    /* Rounding right shift, then saturate to the signed 8-bit range. */ \
+    hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT);                \
+    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
+    /* Value of the statement expression: the filtered halfwords. */     \
+    hz_out_m;                                                            \
+})
+
+#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
+                                   mask0, mask1, mask2,                \
+                                   filt0, filt1, filt2,                \
+                                   out0, out1)                         \
+{                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;              \
+    /* (src0,src1) feed out0 and (src2,src3) feed out1: one shuffle plus dot product per mask/filter pair; no rounding or saturation here. */ \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);  \
+    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);            \
+}
+
+#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, mask2,                       \
+                                   filt0, filt1, filt2,                       \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+    /* Each source is shuffled against itself and feeds its own output (srcN -> outN); no rounding or saturation here. */ \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2,  \
+                 out0, out1, out2, out3);                                     \
+}
+
+#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)         \
+({                                                            \
+    v8i16 tmp0;                                               \
+    /* Signed byte dot product vec0.filt0, then add vec1.filt1. */ \
+    tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);         \
+    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1);  \
+    /* Value of the statement expression: the halfword sums. */ \
+    tmp0;                                                     \
+})
+
+#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)    \
+({                                                                     \
+    v16i8 vec0_m, vec1_m;                                              \
+    v8i16 hz_out_m;                                                    \
+    /* Shuffle src0/src1 with each mask, then apply both filter pairs. */ \
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m);  \
+    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1);  \
+    /* Rounding right shift, then saturate to the signed 8-bit range. */ \
+    hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT);              \
+    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                             \
+    /* Value of the statement expression: the filtered halfwords. */   \
+    hz_out_m;                                                          \
+})
+
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
+                                   mask0, mask1, filt0, filt1,         \
+                                   out0, out1)                         \
+{                                                                      \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
+    /* (src0,src1) feed out0 and (src2,src3) feed out1: one shuffle plus dot product per mask/filter pair; no rounding or saturation here. */ \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
+    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
+    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
+}
+
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
+                                   mask0, mask1, filt0, filt1,                \
+                                   out0, out1, out2, out3)                    \
+{                                                                             \
+    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                     \
+    /* Each source is shuffled against itself and feeds its own output (srcN -> outN); no rounding or saturation here. */ \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+                out0, out1, out2, out3);                                      \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
+    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
+                 out0, out1, out2, out3);                                     \
+}
+
+static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{   /* Horizontal 6-tap filter over four rows of src, written to dst as a 4-wide block (going by the ST4x4_UB store). */
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1;
+    /* Second 16-byte row of the mask table (the first "4 width cases" entry). */
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
+    src -= 2;  /* the masks reach bytes x..x+5, so start two pixels left: taps cover src[-2..+3] */
+    /* Halfword lanes 0, 1, 2 of the loaded filter each hold one pair of adjacent taps. */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* Masks for the second and third tap pairs: every index advanced by 2 and by 4. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    /* XORI_*_128 presumably flips bit 7 so unsigned pixels can go through signed dot products -- confirm in vp8_macros_msa.h. */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1);
+    SRARI_H2_SH(out0, out1, VP8_FILTER_SHIFT);  /* the filter macro leaves rounding and saturation to the caller */
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);  /* presumably packs to bytes and undoes the XOR with 128 -- confirm */
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_6t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{   /* Horizontal 6-tap filter over eight rows of src, handled as two batches of four rows. */
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1, out2, out3;
+    /* Second 16-byte row of the mask table (the first "4 width cases" entry). */
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
+    src -= 2;  /* the masks reach bytes x..x+5, so start two pixels left: taps cover src[-2..+3] */
+    /* Halfword lanes 0, 1, 2 of the loaded filter each hold one pair of adjacent taps. */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* Masks for the second and third tap pairs: every index advanced by 2 and by 4. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    /* Rows 0-3 are filtered into out0/out1. */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);  /* rows 4-7 reuse the same source registers and go to out2/out3 */
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);  /* the filter macro leaves rounding and saturation to the caller */
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);  /* first four output rows */
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);  /* last four output rows */
+}
+
+static void common_hz_6t_4w_msa(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter, int32_t height)
+{   /* 4-wide horizontal 6-tap entry point: forwards to the kernel for this height. */
+    if (height == 8)
+    {   /* eight-row kernel */
+        common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+    }
+    else if (height == 4)
+    {   /* four-row kernel */
+        common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    }
+}   /* any other height: nothing is filtered */
+
+static void common_hz_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{   /* Horizontal 6-tap filter, four rows per batch: one batch up front, then (height / 4) - 1 more in the loop. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+    /* First 16-byte row of the mask table (the "8 width cases" entry). */
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
+    src -= 2;  /* the masks reach bytes x..x+5, so start two pixels left: taps cover src[-2..+3] */
+    /* Halfword lanes 0, 1, 2 of the loaded filter each hold one pair of adjacent taps. */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* Masks for the second and third tap pairs: every index advanced by 2 and by 4. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    /* First batch of four rows, handled outside the loop. */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);  /* the filter macro leaves rounding and saturation to the caller */
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    tmp0 = PCKEV_XORI128_UB(out0, out1);
+    tmp1 = PCKEV_XORI128_UB(out2, out3);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* NOTE(review): loop_cnt is unsigned, so height < 4 makes (height >> 2) - 1 wrap to a huge count -- confirm callers never pass that. */
+    for (loop_cnt = (height >> 2) - 1; loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src0, src1, src2, src3);
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        src += (4 * src_stride);
+        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out0, out1, out2, out3);
+        SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{   /* Horizontal 6-tap filter for 16-wide rows, four rows per loop iteration (height / 4 iterations). */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+    /* First 16-byte row of the mask table (the "8 width cases" entry). */
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
+    src -= 2;  /* the masks reach bytes x..x+5, so start two pixels left: taps cover src[-2..+3] */
+    /* Halfword lanes 0, 1, 2 of the loaded filter each hold one pair of adjacent taps. */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* Masks for the second and third tap pairs: every index advanced by 2 and by 4. */
+    mask1 = mask0 + 2;
+    mask2 = mask0 + 4;
+    /* Each row is loaded twice, at src and src + 8: even registers hold the left half, odd ones the right half. */
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src0, src2, src4, src6);
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        src += (4 * src_stride);
+
+        HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out0, out1, out2, out3);
+        HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
+                                   filt0, filt1, filt2, out4, out5, out6, out7);
+        SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);  /* the filter macro leaves rounding and saturation to the caller */
+        SRARI_H4_SH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1);  /* the two halves of one row are packed into a single 16-byte store */
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{   /* Vertical 6-tap filter for a 4-wide column, four output rows per loop iteration (height / 4 iterations). */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+    v16u8 out;
+    v8i16 filt, out10, out32;
+    /* Start two rows above the block. */
+    src -= (2 * src_stride);
+    /* Halfword lanes 0, 1, 2 of the loaded filter each hold one pair of adjacent taps. */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* Prime the window with the first five rows. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    /* Interleave each row with the next one, then put two of those interleaved pairs into each vector. */
+    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+               src32_r, src43_r);
+    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    XORI_B2_128_SB(src2110, src4332);  /* presumably flips bit 7 (unsigned -> signed range) -- confirm in vp8_macros_msa.h */
+    /* Each iteration loads four new rows and stores four output rows. */
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
+        XORI_B2_128_SB(src6554, src8776);
+        out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
+        out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
+        SRARI_H2_SH(out10, out32, VP8_FILTER_SHIFT);
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Slide the window down four rows: the newest row pairs become the oldest for the next pass. */
+        src2110 = src6554;
+        src4332 = src8776;
+        src4 = src8;  /* last loaded row is interleaved with the first new row next time */
+    }
+}
+
+static void common_vt_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{   /* Vertical 6-tap filter for an 8-wide column, four output rows per loop iteration (height / 4 iterations). */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
+    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+    v16i8 src109_r, filt0, filt1, filt2;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+    /* Start two rows above the block. */
+    src -= (2 * src_stride);
+    /* Halfword lanes 0, 1, 2 of the loaded filter each hold one pair of adjacent taps. */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* Prime the window with the first five rows. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    /* Flip to the signed range (presumably XOR 128 -- confirm), then interleave each row with the next one. */
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3, src10_r, src32_r,
+               src21_r, src43_r);
+    /* The four rows loaded per iteration are named src7..src10; names 5 and 6 are not used in this function. */
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src7, src8, src9, src10);
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        src += (4 * src_stride);
+        /* src76_r pairs the last row kept from before (src4) with the first new row (src7). */
+        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
+                   src87_r, src98_r, src109_r);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Slide the window down four rows: the newest row pairs become the oldest for the next pass. */
+        src10_r = src76_r;
+        src32_r = src98_r;
+        src21_r = src87_r;
+        src43_r = src109_r;
+        src4 = src10;  /* last loaded row is interleaved with the first new row next time */
+    }
+}
+
+static void common_vt_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{   /* Vertical 6-tap filter for a 16-wide column, four output rows per loop iteration (height / 4 iterations). */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+    v16i8 src65_l, src87_l, filt0, filt1, filt2;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+    /* Start two rows above the block. */
+    src -= (2 * src_stride);
+    /* Halfword lanes 0, 1, 2 of the loaded filter each hold one pair of adjacent taps. */
+    filt = LD_SH(filter);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
+    /* Prime the window with the first five rows. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    /* Interleave each row with the next one; the _r and _l sets are kept separately (presumably the two halves of each 16-byte row -- confirm). */
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
+               src32_r, src43_r, src21_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
+               src32_l, src43_l, src21_l);
+    /* Each iteration loads four new rows and stores four 16-byte output rows. */
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
+                   src65_r, src76_r, src87_r);
+        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
+                   src65_l, src76_l, src87_l);
+        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3);  /* join the _r and _l results of each row into one 16-byte vector */
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);  /* presumably flips bit 7 back to the unsigned pixel range -- confirm */
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Slide the window down four rows: the newest row pairs become the oldest for the next pass. */
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src10_l = src54_l;
+        src32_l = src76_l;
+        src21_l = src65_l;
+        src43_l = src87_l;
+        src4 = src8;  /* last loaded row is interleaved with the first new row next time */
+    }
+}
+
+static void common_hv_6ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{ /* 6-tap horizontal then 6-tap vertical filter, 4-wide (per the name); one ST4x4_UB store per loop pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 mask0, mask1, mask2, out;
+    v8i16 tmp0, tmp1;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); /* shuffle mask read at table offset 16 */
+    src -= (2 + 2 * src_stride); /* back up two columns and two rows */
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); /* horizontal filter halfwords 0..2 */
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); /* vertical filter halfwords 0..2 */
+
+    mask1 = mask0 + 2; /* every mask0 lane plus 2 */
+    mask2 = mask0 + 4; /* every mask0 lane plus 4 */
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4); /* prime with five rows */
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2); /* two source rows go into each horizontal call */
+    hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); /* 8-byte slide across the hz_out0/hz_out2 pair */
+    hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) /* height / 4 passes; leftover rows (height % 4) are not produced */
+    {
+        LD_SB2(src, src_stride, src5, src6);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src5, src6);
+        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8); /* built from the carried hz_out3 and the new hz_out5 */
+
+        LD_SB2(src, src_stride, src7, src8);
+        src += (2 * src_stride);
+
+        XORI_B2_128_SB(src7, src8);
+        hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+
+        out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); /* vertical pass over three interleaved pairs */
+
+        out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H2_SH(tmp0, tmp1, VP8_FILTER_SHIFT); /* named filter shift, matching the 1-D filters in this file */
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out3 = hz_out7; /* carry the newest horizontal result and the two newest pairs forward */
+        out0 = out2;
+        out1 = out3;
+    }
+}
+
+static void common_hv_6ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{ /* 6-tap horizontal then 6-tap vertical filter, 8-wide (per the name); one ST8x4_UB store per loop pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 mask0, mask1, mask2, vec0, vec1;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4,  hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+    v8i16 tmp0, tmp1, tmp2, tmp3;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]); /* shuffle mask read at table offset 0 */
+    src -= (2 + 2 * src_stride); /* back up two columns and two rows */
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); /* horizontal filter halfwords 0..2 */
+
+    mask1 = mask0 + 2; /* every mask0 lane plus 2 */
+    mask2 = mask0 + 4; /* every mask0 lane plus 4 */
+
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4); /* prime with five rows */
+    src += (5 * src_stride);
+
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2); /* one row per call: the same vector is passed for both inputs */
+    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2); /* vertical filter halfwords 0..2 */
+
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); /* pairs (0,1) and (2,3): chain for output rows 0 and 2 */
+    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4); /* pairs (1,2) and (3,4): chain for output rows 1 and 3 */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) /* height / 4 passes; leftover rows (height % 4) are not produced */
+    {
+        LD_SB4(src, src_stride, src5, src6, src7, src8); /* four new rows per pass */
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5);
+        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+        tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+        tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT); /* named filter shift, matching the 1-D filters in this file */
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out4 = hz_out8; /* carry the newest horizontal row and the two newest pairs of each chain */
+        out0 = out2;
+        out1 = out7;
+        out3 = out5;
+        out4 = out6;
+    }
+}
+
+static void common_hv_6ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    /* A 16-wide block is filtered as two side-by-side 8-wide columns. */
+    int32_t col;
+
+    for (col = 0; col < 16; col += 8)
+    {
+        common_hv_6ht_6vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
+                                 filter_horiz, filter_vert, height);
+    }
+}
+
+static void common_hz_4t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{ /* Horizontal 4-tap filter over a single group of four rows, written out with one ST4x4_UB. */
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1;
+    v16u8 out;
+
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]); /* shuffle mask read at table offset 16 */
+    src -= 1; /* back up one column */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* filter halfwords 0 and 1 -> filt0, filt1 */
+
+    mask1 = mask0 + 2; /* every mask0 lane plus 2 */
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3); /* the four source rows */
+    XORI_B4_128_SB(src0, src1, src2, src3); /* presumably flips bit 7 so bytes can be handled as signed - confirm in macro header */
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1); /* four rows in, two result vectors out */
+    SRARI_H2_SH(out0, out1, VP8_FILTER_SHIFT); /* shift sums down by the filter precision */
+    SAT_SH2_SH(out0, out1, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_4t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter)
+{ /* Horizontal 4-tap filter over eight rows, handled as two groups of four with one ST4x4_UB each. */
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 out;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]); /* shuffle mask read at table offset 16 */
+    src -= 1; /* back up one column */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* filter halfwords 0 and 1 -> filt0, filt1 */
+
+    mask1 = mask0 + 2; /* every mask0 lane plus 2 */
+
+    LD_SB4(src, src_stride, src0, src1, src2, src3); /* first group of four rows */
+    src += (4 * src_stride);
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out0, out1);
+    LD_SB4(src, src_stride, src0, src1, src2, src3); /* second group reuses src0..src3 */
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
+                               filt0, filt1, out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); /* both groups are rounded and saturated together */
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); /* first group */
+    dst += (4 * dst_stride);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); /* second group */
+}
+
+static void common_hz_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{
+    /* Only 4- and 8-row blocks are handled; any other height does nothing. */
+    switch (height)
+    {
+        case 4: common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+                break;
+        case 8: common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+                break;
+    }
+}
+
+static void common_hz_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{ /* Horizontal 4-tap filter, 8-wide (per the name); one ST8x4_UB store per loop pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0, out1, out2, out3;
+
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]); /* shuffle mask read at table offset 0 */
+    src -= 1; /* back up one column */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* filter halfwords 0 and 1 -> filt0, filt1 */
+
+    mask1 = mask0 + 2; /* every mask0 lane plus 2 */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) /* height / 4 passes; leftover rows (height % 4) are not produced */
+    {
+        LD_SB4(src, src_stride, src0, src1, src2, src3); /* four rows per pass; no state is carried between passes */
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src0, src1, src2, src3);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3); /* one result vector per input row */
+        SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); /* shift sums down by the filter precision */
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        tmp0 = PCKEV_XORI128_UB(out0, out1);
+        tmp1 = PCKEV_XORI128_UB(out2, out3);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_hz_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{ /* Horizontal 4-tap filter, 16-wide (per the name); four full-vector rows are stored per loop pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16i8 filt0, filt1, mask0, mask1;
+    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 out;
+
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]); /* shuffle mask read at table offset 0 */
+    src -= 1; /* back up one column */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* filter halfwords 0 and 1 -> filt0, filt1 */
+
+    mask1 = mask0 + 2; /* every mask0 lane plus 2 */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) /* height / 4 passes; leftover rows (height % 4) are not produced */
+    {
+        LD_SB4(src, src_stride, src0, src2, src4, src6); /* even-numbered vectors: each row starting at src */
+        LD_SB4(src + 8, src_stride, src1, src3, src5, src7); /* odd-numbered vectors: the same rows starting 8 bytes later */
+        src += (4 * src_stride);
+
+        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
+        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                                   filt1, out0, out1, out2, out3); /* rows 0 and 1, two vectors each */
+        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
+                                   filt1, out4, out5, out6, out7); /* rows 2 and 3, two vectors each */
+        SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT); /* shift sums down by the filter precision */
+        SRARI_H4_SH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0, out1, out2, out3, 7);
+        SAT_SH4_SH(out4, out5, out6, out7, 7);
+        out = PCKEV_XORI128_UB(out0, out1); /* the two halves of a row are packed into one vector */
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out2, out3);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out4, out5);
+        ST_UB(out, dst);
+        dst += dst_stride;
+        out = PCKEV_XORI128_UB(out6, out7);
+        ST_UB(out, dst);
+        dst += dst_stride;
+    }
+}
+
+static void common_vt_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{ /* Vertical 4-tap filter, 4-wide (per the name); one ST4x4_UB store per loop pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+    v16i8 src2110, src4332, filt0, filt1;
+    v8i16 filt, out10, out32;
+    v16u8 out;
+
+    src -= src_stride; /* begin one row above the first output row */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* filter halfwords 0 and 1 -> filt0, filt1 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* prime with three rows */
+    src += (3 * src_stride);
+
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); /* row pairs (1,0) and (2,1) */
+
+    src2110 = (v16i8)__msa_ilvr_d((v2i64)src21_r, (v2i64)src10_r); /* both pairs share one vector: low doublewords of each */
+    src2110 = (v16i8)__msa_xori_b((v16u8)src2110, 128); /* the XOR with 128 is applied after interleaving here */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) /* height / 4 passes; leftover rows (height % 4) are not produced */
+    {
+        LD_SB3(src, src_stride, src3, src4, src5); /* three of this pass's four new rows */
+        src += (3 * src_stride);
+        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
+        src4332 = (v16i8)__msa_ilvr_d((v2i64)src43_r, (v2i64)src32_r);
+        src4332 = (v16i8)__msa_xori_b((v16u8)src4332, 128);
+        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
+
+        src2 = LD_SB(src); /* fourth new row goes into src2, which is also the row carried into the next pass */
+        src += (src_stride);
+        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
+        src2110 = (v16i8)__msa_ilvr_d((v2i64)src65_r, (v2i64)src54_r); /* src2110 is reused for pairs (5,4),(6,5) and carries over */
+        src2110 = (v16i8)__msa_xori_b((v16u8)src2110, 128);
+        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
+        SRARI_H2_SH(out10, out32, VP8_FILTER_SHIFT); /* shift sums down by the filter precision */
+        SAT_SH2_SH(out10, out32, 7);
+        out = PCKEV_XORI128_UB(out10, out32);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+static void common_vt_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height)
+{ /* Vertical 4-tap filter, 8-wide (per the name); one ST8x4_UB store per loop pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src7, src8, src9, src10;
+    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
+    v16u8 tmp0, tmp1;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+    src -= src_stride; /* begin one row above the first output row */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* filter halfwords 0 and 1 -> filt0, filt1 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* prime with three rows */
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2); /* presumably flips bit 7 so bytes can be handled as signed - confirm in macro header */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); /* row pairs (1,0) and (2,1) */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) /* height / 4 passes; leftover rows (height % 4) are not produced */
+    {
+        LD_SB4(src, src_stride, src7, src8, src9, src10); /* four new rows, named src7..src10 */
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src7, src8, src9, src10);
+        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
+                   src72_r, src87_r, src98_r, src109_r); /* src72_r pairs the first new row with the carried row src2 */
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1); /* each output row combines two row pairs, i.e. four rows */
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT); /* shift sums down by the filter precision */
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src98_r; /* newest two pairs become the oldest two for the next pass */
+        src21_r = src109_r;
+        src2 = src10; /* last loaded row pairs with the next pass's first row */
+    }
+}
+
+static void common_vt_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{ /* Vertical 4-tap filter, 16-wide (per the name); four full-vector rows are stored per loop pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
+    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
+    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= src_stride; /* begin one row above the first output row */
+
+    filt = LD_SH(filter);
+    SPLATI_H2_SB(filt, 0, 1, filt0, filt1); /* filter halfwords 0 and 1 -> filt0, filt1 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* prime with three rows */
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2); /* presumably flips bit 7 so bytes can be handled as signed - confirm in macro header */
+    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r); /* row pairs (1,0) and (2,1), "_r" set */
+    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l); /* same row pairs, "_l" set */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) /* height / 4 passes; leftover rows (height % 4) are not produced */
+    {
+        LD_SB4(src, src_stride, src3, src4, src5, src6); /* four new rows per pass */
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_r, src43_r, src54_r, src65_r); /* src2 is the last row carried from the previous pass */
+        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
+                   src32_l, src43_l, src54_l, src65_l);
+        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1); /* each output row combines two row pairs, i.e. four rows */
+        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
+        out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
+        out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
+        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
+        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
+        out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
+        out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
+        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT); /* shift sums down by the filter precision */
+        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, VP8_FILTER_SHIFT);
+        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                    out3_r, tmp0, tmp1, tmp2, tmp3); /* merge each row's "_l" and "_r" results into one vector */
+        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); /* presumably undoes the 128 bias applied to the inputs */
+        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        src10_r = src54_r; /* newest two pairs become the oldest two for the next pass */
+        src21_r = src65_r;
+        src10_l = src54_l;
+        src21_l = src65_l;
+        src2 = src6; /* last loaded row pairs with the next pass's first row */
+    }
+}
+
+static void common_hv_4ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{ /* 4-tap horizontal then 4-tap vertical filter, 4-wide (per the name); one ST4x4_UB store per loop pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+    v16u8 mask0, mask1, out;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); /* shuffle mask read at table offset 16 */
+    src -= (1 + 1 * src_stride); /* back up one column and one row */
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); /* horizontal filter halfwords 0 and 1 */
+
+    mask1 = mask0 + 2; /* every mask0 lane plus 2 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* prime with three rows */
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1); /* two source rows go into each horizontal call */
+    hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
+    vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); /* vertical filter halfwords 0 and 1 */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) /* height / 4 passes; leftover rows (height % 4) are not produced */
+    {
+        LD_SB4(src, src_stride, src3, src4, src5, src6); /* four new rows per pass */
+        src += (4 * src_stride);
+
+        XORI_B2_128_SB(src3, src4);
+        hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8); /* 8-byte slide across the carried hz_out1 and the new hz_out3 */
+        vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); /* vertical pass over two interleaved pairs */
+
+        XORI_B2_128_SB(src5, src6);
+        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
+        vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+        SRARI_H2_SH(tmp0, tmp1, VP8_FILTER_SHIFT); /* named filter shift, matching the 1-D filters in this file */
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        hz_out1 = hz_out5; /* carry the newest horizontal result and pair into the next pass */
+        vec0 = vec2;
+    }
+}
+
+static void common_hv_4ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{ /* 4-tap horizontal then 4-tap vertical filter, 8-wide (per the name); one ST8x4_UB store per loop pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+    v16u8 mask0, mask1, out0, out1;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 vec0, vec1, vec2, vec3, vec4;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]); /* shuffle mask read at table offset 0 */
+    src -= (1 + 1 * src_stride); /* back up one column and one row */
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1); /* horizontal filter halfwords 0 and 1 */
+
+    mask1 = mask0 + 2; /* every mask0 lane plus 2 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* prime with three rows */
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1); /* one row per call: the same vector is passed for both inputs */
+    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2); /* pairs (0,1) -> vec0 and (1,2) -> vec2 */
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); /* vertical filter halfwords 0 and 1 */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) /* height / 4 passes; leftover rows (height % 4) are not produced */
+    {
+        LD_SB4(src, src_stride, src3, src4, src5, src6); /* four new rows per pass */
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+        vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2); /* hz_out2 is the carried last row */
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1); /* hz_out0..hz_out2 are reused for the new rows */
+        vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
+
+        hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+        vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);
+
+        hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1); /* vec0 and vec1 are overwritten for the fourth output row */
+        tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT); /* named filter shift, matching the 1-D filters in this file */
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+
+        vec0 = vec4; /* carry the two newest pairs; hz_out2 already holds the last row */
+        vec2 = vec1;
+    }
+}
+
+static void common_hv_4ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    /* A 16-wide block is filtered as two side-by-side 8-wide columns. */
+    int32_t col;
+
+    for (col = 0; col < 16; col += 8)
+    {
+        common_hv_4ht_4vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
+                                 filter_horiz, filter_vert, height);
+    }
+}
+
+static void common_hv_6ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{ /* 6-tap horizontal then 4-tap vertical filter, 4-wide (per the name); one ST4x4_UB store per loop pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 filt_hz0, filt_hz1, filt_hz2;
+    v16u8 res0, res1, mask0, mask1, mask2;
+    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+
+    mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]); /* shuffle mask read at table offset 16 */
+    src -= (2 + 1 * src_stride); /* back up two columns and one row */
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); /* horizontal filter halfwords 0..2 */
+
+    mask1 = mask0 + 2; /* every mask0 lane plus 2 */
+    mask2 = mask0 + 4; /* every mask0 lane plus 4 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* prime with three rows */
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2); /* two source rows go into each horizontal call */
+    hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); /* vertical filter halfwords 0 and 1 */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) /* height / 4 passes; leftover rows (height % 4) are not produced */
+    {
+        LD_SB4(src, src_stride, src3, src4, src5, src6); /* four new rows per pass */
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+        hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8); /* 8-byte slide across the carried hz_out1 and the new hz_out3 */
+        vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1); /* vertical pass over two interleaved pairs */
+
+        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
+        vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+        SRARI_H2_SH(tmp0, tmp1, VP8_FILTER_SHIFT); /* named filter shift, matching the 1-D filters in this file */
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); /* tmp0 and tmp1 are packed into separate vectors here */
+        XORI_B2_128_UB(res0, res1);
+        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); /* elements 0 and 1 of res0, then 0 and 1 of res1 */
+        dst += (4 * dst_stride);
+
+        hz_out1 = hz_out5; /* carry the newest horizontal result and pair into the next pass */
+        vec0 = vec2;
+    }
+}
+
+static void common_hv_6ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{ /* 6-tap horizontal then 4-tap vertical filter, 8-wide (per the name); one ST8x4_UB store per loop pass. */
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6;
+    v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+    v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
+    v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
+    v16u8 out0, out1;
+
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]); /* shuffle mask read at table offset 0 */
+    src -= (2 + src_stride); /* back up two columns and one row */
+
+    filt = LD_SH(filter_horiz);
+    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2); /* horizontal filter halfwords 0..2 */
+
+    mask1 = mask0 + 2; /* every mask0 lane plus 2 */
+    mask2 = mask0 + 4; /* every mask0 lane plus 4 */
+
+    LD_SB3(src, src_stride, src0, src1, src2); /* prime with three rows */
+    src += (3 * src_stride);
+
+    XORI_B3_128_SB(src0, src1, src2);
+    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2); /* one row per call: the same vector is passed for both inputs */
+    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2); /* pairs (0,1) -> vec0 and (1,2) -> vec2 */
+
+    filt = LD_SH(filter_vert);
+    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1); /* vertical filter halfwords 0 and 1 */
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) /* height / 4 passes; leftover rows (height % 4) are not produced */
+    {
+        LD_SB4(src, src_stride, src3, src4, src5, src6); /* four new rows per pass */
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src3, src4, src5, src6);
+
+        hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2); /* hz_out2 is the carried last row */
+        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
+
+        hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2); /* hz_out0..hz_out2 are reused for the new rows */
+        vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3);
+        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
+
+        hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); /* vec0 written here is also the next pass's carry-in */
+        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);
+
+        hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+                                  filt_hz1, filt_hz2);
+        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2); /* vec2 written here carries over; no rotation step at the loop end */
+        tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
+
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT); /* named filter shift, matching the 1-D filters in this file */
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(out0, out1, dst, dst_stride);
+        dst += (4 * dst_stride);
+    }
+}
+
+/* 16-wide: runs common_hv_6ht_4vt_8w_msa on two adjacent 8-wide strips. */
+static void common_hv_6ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t col;
+
+    for (col = 0; col < 16; col += 8)
+    {
+        common_hv_6ht_4vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
+                                 filter_horiz, filter_vert, height);
+    }
+}
+/* 4-wide 2-D filter: 4-tap horizontal pass feeding a 6-tap vertical pass. */
+static void common_hv_4ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, mask0, mask1;
+    v16u8 out;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
+    /* Shuffle mask for this width (index 16; the 8-wide path uses index 0). */
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
+    /* Step back 1 column and 2 rows to where the filter taps begin. */
+    src -= (1 + 2 * src_stride);
+    /* Broadcast horizontal coefficient halfwords 0 and 1. */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+    /* Prime the vertical filter with the first 5 source rows. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    /* Filter rows in pairs; hz_out1 is spliced from hz_out0 and hz_out2. */
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    /* Broadcast vertical coefficient halfwords 0, 1 and 2. */
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+    /* Each pass loads 4 more rows and writes a 4x4 block of output. */
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        XORI_B4_128_SB(src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
+        out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
+        hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+        out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+        /* Round-shift by 7, saturate, pack to bytes, store 4 rows. */
+        SRARI_H2_SH(tmp0, tmp1, 7);
+        SAT_SH2_SH(tmp0, tmp1, 7);
+        out = PCKEV_XORI128_UB(tmp0, tmp1);
+        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Keep the newest intermediates as history for the next pass. */
+        hz_out3 = hz_out7;
+        out0 = out2;
+        out1 = out3;
+    }
+}
+/* 8-wide 2-D filter: 4-tap horizontal pass feeding a 6-tap vertical pass. */
+static void common_hv_4ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height)
+{
+    uint32_t loop_cnt;
+    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16i8 filt_hz0, filt_hz1, mask0, mask1;
+    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
+    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+    v16u8 vec0, vec1;
+    /* Shuffle mask for this width (index 0; the 4-wide path uses index 16). */
+    mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
+    src -= (1 + 2 * src_stride);
+    /* Broadcast horizontal coefficient halfwords 0 and 1. */
+    filt = LD_SH(filter_horiz);
+    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
+
+    mask1 = mask0 + 2;
+    /* Prime the vertical filter with the first 5 source rows. */
+    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+    src += (5 * src_stride);
+    /* Filter each row alone, then pair rows (0,1),(2,3) and (1,2),(3,4). */
+    XORI_B5_128_SB(src0, src1, src2, src3, src4);
+    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
+    /* Broadcast vertical coefficient halfwords 0, 1 and 2. */
+    filt = LD_SH(filter_vert);
+    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
+    /* Each pass loads 4 more rows and writes 4 output rows. */
+    for (loop_cnt = (height >> 2); loop_cnt--;)
+    {
+        LD_SB4(src, src_stride, src5, src6, src7, src8);
+        src += (4 * src_stride);
+
+        XORI_B4_128_SB(src5, src6, src7, src8);
+
+        hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+        out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+        out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5);
+        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
+        out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+        tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
+
+        hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
+        out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+        tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
+        /* Round-shift by 7, saturate, pack to bytes, store 4 rows. */
+        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+        ST8x4_UB(vec0, vec1, dst, dst_stride);
+        dst += (4 * dst_stride);
+        /* Keep the newest intermediates as history for the next pass. */
+        hz_out4 = hz_out8;
+        out0 = out2;
+        out1 = out6;
+        out3 = out5;
+        out4 = out7;
+    }
+}
+
+/* 16-wide: runs common_hv_4ht_6vt_8w_msa on two adjacent 8-wide strips. */
+static void common_hv_4ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height)
+{
+    int32_t col;
+
+    for (col = 0; col < 16; col += 8)
+    {
+        common_hv_4ht_6vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
+                                 filter_horiz, filter_vert, height);
+    }
+}
+/* VP8 4x4 sub-pixel prediction; handles xoffset/yoffset values 0..7. */
+void vp8_sixtap_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride)
+{
+    const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
+    const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];
+    /* 0: no filter (row above unused); even: 6-tap; odd: 4-tap at filter+1. */
+    if (yoffset)
+    {
+        if (xoffset)
+        {
+            switch (xoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_6ht_6vt_4w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter,
+                                                     v_filter, 4);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_6ht_4vt_4w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter,
+                                                     v_filter + 1, 4);
+                            break;
+                    }
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_4ht_6vt_4w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter + 1,
+                                                     v_filter, 4);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_4ht_4vt_4w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter + 1,
+                                                     v_filter + 1, 4);
+                            break;
+                    }
+                    break;
+            }
+        }
+        else
+        {
+            switch (yoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    common_vt_6t_4w_msa(src, src_stride, dst, dst_stride,
+                                        v_filter, 4);
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    common_vt_4t_4w_msa(src, src_stride, dst, dst_stride,
+                                        v_filter + 1, 4);
+                    break;
+            }
+        }
+    }
+    else
+    {
+        switch (xoffset)
+        {
+            case 0:
+                {
+                uint32_t tp0, tp1, tp2, tp3;
+
+                LW4(src, src_stride, tp0, tp1, tp2, tp3);
+                SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
+                break;
+                }
+            case 2:
+            case 4:
+            case 6:
+                common_hz_6t_4w_msa(src, src_stride, dst, dst_stride,
+                                    h_filter, 4);
+                break;
+
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+                common_hz_4t_4w_msa(src, src_stride, dst, dst_stride,
+                                    h_filter + 1, 4);
+                break;
+        }
+    }
+}
+/* VP8 8x4 sub-pixel prediction; handles xoffset/yoffset values 0..7. */
+void vp8_sixtap_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride)
+{
+    const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
+    const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];
+    /* 0: no filter (row above unused); even: 6-tap; odd: 4-tap at filter+1. */
+    if (yoffset)
+    {
+        if (xoffset)
+        {
+            switch (xoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_6ht_6vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter,
+                                                     v_filter, 4);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_6ht_4vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter,
+                                                     v_filter + 1, 4);
+                            break;
+                    }
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_4ht_6vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter + 1,
+                                                     v_filter, 4);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_4ht_4vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter + 1,
+                                                     v_filter + 1, 4);
+                            break;
+                    }
+                    break;
+            }
+        }
+        else
+        {
+            switch (yoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    common_vt_6t_8w_msa(src, src_stride, dst, dst_stride,
+                                        v_filter, 4);
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    common_vt_4t_8w_msa(src, src_stride, dst, dst_stride,
+                                        v_filter + 1, 4);
+                    break;
+            }
+        }
+    }
+    else
+    {
+        switch (xoffset)
+        {
+            case 0:
+                vp8_copy_mem8x4(src, src_stride, dst, dst_stride);
+                break;
+            case 2:
+            case 4:
+            case 6:
+                common_hz_6t_8w_msa(src, src_stride, dst, dst_stride,
+                                    h_filter, 4);
+                break;
+
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+                common_hz_4t_8w_msa(src, src_stride, dst, dst_stride,
+                                    h_filter + 1, 4);
+                break;
+        }
+    }
+}
+/* VP8 8x8 sub-pixel prediction; handles xoffset/yoffset values 0..7. */
+void vp8_sixtap_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride)
+{
+    const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
+    const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];
+    /* 0: no filter (row above unused); even: 6-tap; odd: 4-tap at filter+1. */
+    if (yoffset)
+    {
+        if (xoffset)
+        {
+            switch (xoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_6ht_6vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter,
+                                                     v_filter, 8);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_6ht_4vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter,
+                                                     v_filter + 1, 8);
+                            break;
+                    }
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_4ht_6vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter + 1,
+                                                     v_filter, 8);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_4ht_4vt_8w_msa(src, src_stride, dst,
+                                                     dst_stride, h_filter + 1,
+                                                     v_filter + 1, 8);
+                            break;
+                    }
+                    break;
+            }
+        }
+        else
+        {
+            switch (yoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    common_vt_6t_8w_msa(src, src_stride, dst, dst_stride,
+                                        v_filter, 8);
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    common_vt_4t_8w_msa(src, src_stride, dst, dst_stride,
+                                        v_filter + 1, 8);
+                    break;
+            }
+        }
+    }
+    else
+    {
+        switch (xoffset)
+        {
+            case 0:
+                vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
+                break;
+            case 2:
+            case 4:
+            case 6:
+                common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter,
+                                    8);
+                break;
+
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+                common_hz_4t_8w_msa(src, src_stride, dst, dst_stride,
+                                    h_filter + 1, 8);
+                break;
+        }
+    }
+}
+/* VP8 16x16 sub-pixel prediction; handles xoffset/yoffset values 0..7. */
+void vp8_sixtap_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride)
+{
+    const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
+    const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];
+    /* 0: no filter (row above unused); even: 6-tap; odd: 4-tap at filter+1. */
+    if (yoffset)
+    {
+        if (xoffset)
+        {
+            switch (xoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_6ht_6vt_16w_msa(src, src_stride, dst,
+                                                      dst_stride, h_filter,
+                                                      v_filter, 16);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_6ht_4vt_16w_msa(src, src_stride, dst,
+                                                      dst_stride, h_filter,
+                                                      v_filter + 1, 16);
+                            break;
+                    }
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    switch (yoffset)
+                    {
+                        case 2:
+                        case 4:
+                        case 6:
+                            common_hv_4ht_6vt_16w_msa(src, src_stride, dst,
+                                                      dst_stride, h_filter + 1,
+                                                      v_filter, 16);
+                            break;
+
+                        case 1:
+                        case 3:
+                        case 5:
+                        case 7:
+                            common_hv_4ht_4vt_16w_msa(src, src_stride, dst,
+                                                      dst_stride, h_filter + 1,
+                                                      v_filter + 1, 16);
+                            break;
+                    }
+                    break;
+            }
+        }
+        else
+        {
+            switch (yoffset)
+            {
+                case 2:
+                case 4:
+                case 6:
+                    common_vt_6t_16w_msa(src, src_stride, dst, dst_stride,
+                                         v_filter, 16);
+                    break;
+
+                case 1:
+                case 3:
+                case 5:
+                case 7:
+                    common_vt_4t_16w_msa(src, src_stride, dst, dst_stride,
+                                         v_filter + 1, 16);
+                    break;
+            }
+        }
+    }
+    else
+    {
+        switch (xoffset)
+        {
+            case 0:
+                vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
+                break;
+            case 2:
+            case 4:
+            case 6:
+                common_hz_6t_16w_msa(src, src_stride, dst, dst_stride,
+                                     h_filter, 16);
+                break;
+
+            case 1:
+            case 3:
+            case 5:
+            case 7:
+                common_hz_4t_16w_msa(src, src_stride, dst, dst_stride,
+                                     h_filter + 1, 16);
+                break;
+        }
+    }
+}
diff --git a/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
new file mode 100644
index 0000000000..27d5929956
--- /dev/null
+++ b/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
@@ -0,0 +1,1783 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
+#define VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
+
+#include <msa.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+/* Typed vector loads: LD_x(RTYPE, p) reads *(const RTYPE *)(p). */
+#define LD_B(RTYPE, psrc) (*((const RTYPE *)(psrc)))
+#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
+#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
+
+#define LD_H(RTYPE, psrc) (*((const RTYPE *)(psrc)))
+#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
+#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
+
+#define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc)))
+#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
+#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
+/* Typed vector stores: ST_x(RTYPE, in, p) writes 'in' to *(RTYPE *)(p). */
+#define ST_B(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
+
+#define ST_H(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
+#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
+#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
+
+#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
+#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+/* Scalar load/store helpers: R6 uses lw/ld/sh/sw/sd, else ulw/uld/ush/usw. */
+#if (__mips_isa_rev >= 6)
+#define LW(psrc)                                      \
+({                                                    \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+    uint32_t val_m;                                   \
+    /* Returns the 32-bit word read from psrc. */     \
+    asm volatile (                                    \
+        "lw  %[val_m],  %[psrc_m]  \n\t"              \
+                                                      \
+        : [val_m] "=r" (val_m)                        \
+        : [psrc_m] "m" (*psrc_m)                      \
+    );                                                \
+                                                      \
+    val_m;                                            \
+})
+/* LD: 64-bit load; a single 'ld' on 64-bit targets, else two LW halves. */
+#if (__mips == 64)
+#define LD(psrc)                                      \
+({                                                    \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+    uint64_t val_m = 0;                               \
+                                                      \
+    asm volatile (                                    \
+        "ld  %[val_m],  %[psrc_m]  \n\t"              \
+                                                      \
+        : [val_m] "=r" (val_m)                        \
+        : [psrc_m] "m" (*psrc_m)                      \
+    );                                                \
+                                                      \
+    val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc)                                             \
+({                                                           \
+    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);        \
+    uint32_t val0_m, val1_m;                                 \
+    uint64_t val_m = 0;                                      \
+    /* Named psrc_m1 so LW's own psrc_m cannot shadow it. */ \
+    val0_m = LW(psrc_m1);                                    \
+    val1_m = LW(psrc_m1 + 4);                                \
+    /* Assemble as (word at +4) << 32 | (word at +0). */     \
+    val_m = (uint64_t)(val1_m);                              \
+    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+    val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                             \
+    val_m;                                                   \
+})
+#endif  // (__mips == 64)
+/* SH: store the low 16 bits of 'val' at pdst with a single 'sh'. */
+#define SH(val, pdst)                     \
+{                                         \
+    uint8_t *pdst_m = (uint8_t *)(pdst);  \
+    const uint16_t val_m = (val);         \
+                                          \
+    asm volatile (                        \
+        "sh  %[val_m],  %[pdst_m]  \n\t"  \
+                                          \
+        : [pdst_m] "=m" (*pdst_m)         \
+        : [val_m] "r" (val_m)             \
+    );                                    \
+}
+/* SW: store 'val' as a 32-bit word at pdst with a single 'sw'. */
+#define SW(val, pdst)                     \
+{                                         \
+    uint8_t *pdst_m = (uint8_t *)(pdst);  \
+    const uint32_t val_m = (val);         \
+                                          \
+    asm volatile (                        \
+        "sw  %[val_m],  %[pdst_m]  \n\t"  \
+                                          \
+        : [pdst_m] "=m" (*pdst_m)         \
+        : [val_m] "r" (val_m)             \
+    );                                    \
+}
+/* SD: one 'sd' store. NOTE(review): no __mips == 64 guard here, unlike LD. */
+#define SD(val, pdst)                     \
+{                                         \
+    uint8_t *pdst_m = (uint8_t *)(pdst);  \
+    const uint64_t val_m = (val);         \
+                                          \
+    asm volatile (                        \
+        "sd  %[val_m],  %[pdst_m]  \n\t"  \
+                                          \
+        : [pdst_m] "=m" (*pdst_m)         \
+        : [val_m] "r" (val_m)             \
+    );                                    \
+}
+#else  // !(__mips_isa_rev >= 6)
+#define LW(psrc)                                      \
+({                                                    \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+    uint32_t val_m;                                   \
+    /* Same contract as the R6 LW, using 'ulw'. */    \
+    asm volatile (                                    \
+        "ulw  %[val_m],  %[psrc_m]  \n\t"             \
+                                                      \
+        : [val_m] "=r" (val_m)                        \
+        : [psrc_m] "m" (*psrc_m)                      \
+    );                                                \
+                                                      \
+    val_m;                                            \
+})
+/* LD: 64-bit load; a single 'uld' on 64-bit targets, else two LW halves. */
+#if (__mips == 64)
+#define LD(psrc)                                      \
+({                                                    \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+    uint64_t val_m = 0;                               \
+                                                      \
+    asm volatile (                                    \
+        "uld  %[val_m],  %[psrc_m]  \n\t"             \
+                                                      \
+        : [val_m] "=r" (val_m)                        \
+        : [psrc_m] "m" (*psrc_m)                      \
+    );                                                \
+                                                      \
+    val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc)                                             \
+({                                                           \
+    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);        \
+    uint32_t val0_m, val1_m;                                 \
+    uint64_t val_m = 0;                                      \
+    /* Two LW reads; psrc_m1 stays clear of LW's psrc_m. */  \
+    val0_m = LW(psrc_m1);                                    \
+    val1_m = LW(psrc_m1 + 4);                                \
+                                                             \
+    val_m = (uint64_t)(val1_m);                              \
+    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+    val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                             \
+    val_m;                                                   \
+})
+#endif  // (__mips == 64)
+#define SH(val, pdst)                      \
+{                                          \
+    uint8_t *pdst_m = (uint8_t *)(pdst);   \
+    const uint16_t val_m = (val);          \
+    /* 'ush' form of the 16-bit store. */  \
+    asm volatile (                         \
+        "ush  %[val_m],  %[pdst_m]  \n\t"  \
+                                           \
+        : [pdst_m] "=m" (*pdst_m)          \
+        : [val_m] "r" (val_m)              \
+    );                                     \
+}
+/* SW: store 'val' as a 32-bit word at pdst using 'usw'. */
+#define SW(val, pdst)                      \
+{                                          \
+    uint8_t *pdst_m = (uint8_t *)(pdst);   \
+    const uint32_t val_m = (val);          \
+                                           \
+    asm volatile (                         \
+        "usw  %[val_m],  %[pdst_m]  \n\t"  \
+                                           \
+        : [pdst_m] "=m" (*pdst_m)          \
+        : [val_m] "r" (val_m)              \
+    );                                     \
+}
+/* SD: 64-bit store emitted as two SW stores, low 32 bits at pdst, high at +4. */
+#define SD(val, pdst)                                         \
+{                                                             \
+    uint8_t *pdst_m1 = (uint8_t *)(pdst);                     \
+    uint32_t val0_m, val1_m;                                  \
+                                                              \
+    val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);          \
+    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF);  \
+                                                              \
+    SW(val0_m, pdst_m1);                                      \
+    SW(val1_m, pdst_m1 + 4);                                  \
+}
+#endif  // (__mips_isa_rev >= 6)
+
+/* Description : Load 4 words with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1, out2, out3
+   Details     : Load word in 'out0' from (psrc)
+                 Load word in 'out1' from (psrc + stride)
+                 Load word in 'out2' from (psrc + 2 * stride)
+                 Load word in 'out3' from (psrc + 3 * stride)
+*/
+#define LW4(psrc, stride, out0, out1, out2, out3)  \
+{                                                  \
+    out0 = LW((psrc));                             \
+    out1 = LW((psrc) + (stride));                  \
+    out2 = LW((psrc) + 2 * (stride));              \
+    out3 = LW((psrc) + 3 * (stride));              \
+}
+
+/* Description : Load double words with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+   Details     : Load double word in 'out0' from (psrc)
+                 Load double word in 'out1' from (psrc + stride)
+*/
+#define LD2(psrc, stride, out0, out1)  \
+{                                      \
+    out0 = LD((psrc));                 \
+    out1 = LD((psrc) + (stride));      \
+}
+#define LD4(psrc, stride, out0, out1, out2, out3)      \
+{                                                      \
+    LD2((psrc), (stride), out0, out1);                 \
+    LD2((psrc) + 2 * (stride), (stride), out2, out3);  \
+}
+
+/* Description : Store 4 words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store word from 'in0' to (pdst)
+                 Store word from 'in1' to (pdst + stride)
+                 Store word from 'in2' to (pdst + 2 * stride)
+                 Store word from 'in3' to (pdst + 3 * stride)
+*/
+#define SW4(in0, in1, in2, in3, pdst, stride)  \
+{                                              \
+    SW(in0, (pdst));                           \
+    SW(in1, (pdst) + (stride));                \
+    SW(in2, (pdst) + 2 * (stride));            \
+    SW(in3, (pdst) + 3 * (stride));            \
+}
+
+/* Description : Store 4 double words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store double word from 'in0' to (pdst)
+                 Store double word from 'in1' to (pdst + stride)
+                 Store double word from 'in2' to (pdst + 2 * stride)
+                 Store double word from 'in3' to (pdst + 3 * stride)
+*/
+#define SD4(in0, in1, in2, in3, pdst, stride)  \
+{                                              \
+    SD(in0, (pdst));                           \
+    SD(in1, (pdst) + (stride));                \
+    SD(in2, (pdst) + 2 * (stride));            \
+    SD(in3, (pdst) + 3 * (stride));            \
+}
+
+/* Description : Load vectors with 16 byte elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Load 16 byte elements in 'out0' from (psrc)
+                 Load 16 byte elements in 'out1' from (psrc + stride)
+*/
+#define LD_B2(RTYPE, psrc, stride, out0, out1)  \
+{                                               \
+    out0 = LD_B(RTYPE, (psrc));                 \
+    out1 = LD_B(RTYPE, (psrc) + (stride));      \
+}
+#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
+
+#define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
+{                                                     \
+    LD_B2(RTYPE, (psrc), (stride), out0, out1);       \
+    out2 = LD_B(RTYPE, (psrc) + 2 * (stride));        \
+}
+#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
+#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)      \
+{                                                               \
+    LD_B2(RTYPE, (psrc), (stride), out0, out1);                 \
+    LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3);  \
+}
+#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
+
+#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
+{                                                                 \
+    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3);       \
+    out4 = LD_B(RTYPE, (psrc) + 4 * (stride));                    \
+}
+#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
+#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
+
+#define LD_B8(RTYPE, psrc, stride,                                          \
+              out0, out1, out2, out3, out4, out5, out6, out7)               \
+{                                                                           \
+    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3);                 \
+    LD_B4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7);  \
+}
+#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
+
+/* Description : Load vectors with 8 halfword elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+   Details     : Load 8 halfword elements in 'out0' from (psrc)
+                 Load 8 halfword elements in 'out1' from (psrc + stride)
+*/
+#define LD_H2(RTYPE, psrc, stride, out0, out1)  \
+{                                               \
+    out0 = LD_H(RTYPE, (psrc));                 \
+    out1 = LD_H(RTYPE, (psrc) + (stride));      \
+}
+#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
+
+#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)      \
+{                                                               \
+    LD_H2(RTYPE, (psrc), (stride), out0, out1);                 \
+    LD_H2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3);  \
+}
+#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
+
+/* Description : Load 2 vectors of signed word elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1)  \
+{                                         \
+    out0 = LD_SW((psrc));                 \
+    out1 = LD_SW((psrc) + (stride));      \
+}
+
+/* Description : Store vectors of 16 byte elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 16 byte elements from 'in0' to (pdst)
+                 Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride)  \
+{                                             \
+    ST_B(RTYPE, in0, (pdst));                 \
+    ST_B(RTYPE, in1, (pdst) + (stride));      \
+}
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)        \
+{                                                             \
+    ST_B2(RTYPE, in0, in1, (pdst), (stride));                 \
+    ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride));  \
+}
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
+
+#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,            \
+              pdst, stride)                                             \
+{                                                                       \
+    ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                 \
+    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));  \
+}
+#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 8 halfword elements from 'in0' to (pdst)
+                 Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride)  \
+{                                             \
+    ST_H(RTYPE, in0, (pdst));                 \
+    ST_H(RTYPE, in1, (pdst) + (stride));      \
+}
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+/* Description : Store vectors of word elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 4 word elements from 'in0' to (pdst)
+                 Store 4 word elements from 'in1' to (pdst + stride)
+*/
+#define ST_SW2(in0, in1, pdst, stride)  \
+{                                       \
+    ST_SW(in0, (pdst));                 \
+    ST_SW(in1, (pdst) + (stride));      \
+}
+
+/* Description : Store 2x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in, stidx, pdst, stride
+   Details     : Index 'stidx' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst)
+                 Index 'stidx+1' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + stride)
+                 Index 'stidx+2' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 2 * stride)
+                 Index 'stidx+3' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 3 * stride)
+*/
+#define ST2x4_UB(in, stidx, pdst, stride)               \
+{                                                       \
+    uint16_t out0_m, out1_m, out2_m, out3_m;            \
+    uint8_t *pblk_2x4_m = (uint8_t *)(pdst);            \
+                                                        \
+    out0_m = __msa_copy_u_h((v8i16)(in), (stidx));      \
+    out1_m = __msa_copy_u_h((v8i16)(in), (stidx) + 1);  \
+    out2_m = __msa_copy_u_h((v8i16)(in), (stidx) + 2);  \
+    out3_m = __msa_copy_u_h((v8i16)(in), (stidx) + 3);  \
+                                                        \
+    SH(out0_m, pblk_2x4_m);                             \
+    SH(out1_m, pblk_2x4_m + (stride));                  \
+    SH(out2_m, pblk_2x4_m + 2 * (stride));              \
+    SH(out3_m, pblk_2x4_m + 3 * (stride));              \
+}
+
+/* Description : Store 4x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
+   Details     : 'idx0' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst)
+                 'idx1' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst + stride)
+                 'idx2' word element from input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 'idx3' word element from input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
+{                                                                 \
+    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
+    uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                      \
+                                                                  \
+    out0_m = __msa_copy_u_w((v4i32)(in0), (idx0));                \
+    out1_m = __msa_copy_u_w((v4i32)(in0), (idx1));                \
+    out2_m = __msa_copy_u_w((v4i32)(in1), (idx2));                \
+    out3_m = __msa_copy_u_w((v4i32)(in1), (idx3));                \
+                                                                  \
+    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride));    \
+}
+#define ST4x8_UB(in0, in1, pdst, stride)                                \
+{                                                                       \
+    uint8_t *pblk_4x8 = (uint8_t *)(pdst);                              \
+                                                                        \
+    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, (stride));                 \
+    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride));  \
+}
+
+/* Description : Store 8x1 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
+*/
+#define ST8x1_UB(in, pdst)                    \
+{                                             \
+    uint64_t out0_m;                          \
+                                              \
+    out0_m = __msa_copy_u_d((v2i64)(in), 0);  \
+    SD(out0_m, (pdst));                       \
+}
+
+/* Description : Store 8x2 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst, stride
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride)            \
+{                                             \
+    uint64_t out0_m, out1_m;                  \
+    uint8_t *pblk_8x2_m = (uint8_t *)(pdst);  \
+                                              \
+    out0_m = __msa_copy_u_d((v2i64)(in), 0);  \
+    out1_m = __msa_copy_u_d((v2i64)(in), 1);  \
+                                              \
+    SD(out0_m, pblk_8x2_m);                   \
+    SD(out1_m, pblk_8x2_m + (stride));        \
+}
+
+/* Description : Store 8x4 byte block to destination memory from input
+                 vectors
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Index 0 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst + stride)
+                 Index 0 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 Index 1 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST8x4_UB(in0, in1, pdst, stride)                        \
+{                                                               \
+    uint64_t out0_m, out1_m, out2_m, out3_m;                    \
+    uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                    \
+                                                                \
+    out0_m = __msa_copy_u_d((v2i64)(in0), 0);                   \
+    out1_m = __msa_copy_u_d((v2i64)(in0), 1);                   \
+    out2_m = __msa_copy_u_d((v2i64)(in1), 0);                   \
+    out3_m = __msa_copy_u_d((v2i64)(in1), 1);                   \
+                                                                \
+    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride));  \
+}
+
+/* Description : Immediate number of elements to slide with zero
+   Arguments   : Inputs  - in0, in1, slide_val
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
+                 value specified in the 'slide_val'
+*/
+#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                  \
+{                                                                          \
+    v16i8 zero_m = { 0 };                                                  \
+                                                                           \
+    out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in0), (slide_val));  \
+    out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in1), (slide_val));  \
+}
+#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
+
+/* Description : Immediate number of elements to slide
+   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
+                 value specified in the 'slide_val'
+*/
+#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)    \
+{                                                                            \
+    out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), (slide_val)); \
+    out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), (slide_val)); \
+}
+
+#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,             \
+                out0, out1, out2, slide_val)                                 \
+{                                                                            \
+    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, (slide_val));     \
+    out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), (slide_val)); \
+}
+#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
+
+/* Description : Shuffle byte vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)        \
+{                                                                           \
+    out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
+    out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
+}
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+
+#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,   \
+                out0, out1, out2)                                           \
+{                                                                           \
+    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);           \
+    out2 = (RTYPE)__msa_vshf_b((v16i8)(mask2), (v16i8)(in5), (v16i8)(in4)); \
+}
+#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
+
+/* Description : Shuffle halfword vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : halfword elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+*/
+#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)        \
+{                                                                           \
+    out0 = (RTYPE)__msa_vshf_h((v8i16)(mask0), (v8i16)(in1), (v8i16)(in0)); \
+    out1 = (RTYPE)__msa_vshf_h((v8i16)(mask1), (v8i16)(in3), (v8i16)(in2)); \
+}
+#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Unsigned byte elements from 'mult0' are multiplied with
+                 unsigned byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. unsigned halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)    \
+{                                                                  \
+    out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0));  \
+    out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1));  \
+}
+#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
+
+#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,           \
+                 cnst0, cnst1, cnst2, cnst3,                  \
+                 out0, out1, out2, out3)                      \
+{                                                             \
+    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
+    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
+}
+#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)    \
+{                                                                  \
+    out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0));  \
+    out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1));  \
+}
+#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
+
+#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
+                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
+{                                                                     \
+    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
+    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
+}
+#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of halfword vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed word.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)    \
+{                                                                  \
+    out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0));  \
+    out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1));  \
+}
+
+#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
+                 cnst0, cnst1, cnst2, cnst3,                  \
+                 out0, out1, out2, out3)                      \
+{                                                             \
+    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
+    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
+}
+#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product of word vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed word elements from 'mult0' are multiplied with
+                 signed word elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed double word.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)    \
+{                                                                  \
+    out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0));  \
+    out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1));  \
+}
+#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
+
+/* Description : Dot product & addition of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)        \
+{                                                                       \
+    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)(out0),                        \
+                                  (v16i8)(mult0), (v16i8)(cnst0));      \
+}
+#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
+
+#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
+                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
+{                                                                      \
+    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
+    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
+}
+#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product & addition of halfword vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed word.
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)                 \
+{                                                                                \
+    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0), (v8i16)(cnst0)); \
+    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1), (v8i16)(cnst1)); \
+}
+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
+
+#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
+                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
+{                                                                      \
+    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
+    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
+}
+#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product & addition of double word vector elements
+   Arguments   : Inputs  - mult0, mult1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each signed word element from 'mult0' is multiplied with itself
+                 producing an intermediate result twice the size of it
+                 i.e. signed double word
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)                                \
+{                                                                                 \
+    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)(out0), (v4i32)(mult0), (v4i32)(mult0)); \
+    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)(out1), (v4i32)(mult1), (v4i32)(mult1)); \
+}
+#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
+
+/* Description : Clips all signed halfword elements of input vector
+                 between 0 & 255
+   Arguments   : Input  - in
+                 Output - out_m
+                 Return Type - signed halfword
+*/
+#define CLIP_SH_0_255(in)                               \
+({                                                      \
+    v8i16 max_m = __msa_ldi_h(255);                     \
+    v8i16 out_m;                                        \
+    /* clamp below at 0, then above at 255 */           \
+    out_m = __msa_maxi_s_h((v8i16)(in), 0);             \
+    out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m);  \
+    out_m;                                              \
+})
+#define CLIP_SH2_0_255(in0, in1)  \
+{                                 \
+    in0 = CLIP_SH_0_255(in0);     \
+    in1 = CLIP_SH_0_255(in1);     \
+}
+#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
+{                                           \
+    CLIP_SH2_0_255(in0, in1);               \
+    CLIP_SH2_0_255(in2, in3);               \
+}
+
+/* Description : Clips all signed word elements of input vector
+                 between 0 & 255
+   Arguments   : Input  - in
+                 Output - out_m
+                 Return Type - signed word
+*/
+#define CLIP_SW_0_255(in)                               \
+({                                                      \
+    v4i32 max_m = __msa_ldi_w(255);                     \
+    v4i32 out_m;                                        \
+    /* clamp below at 0, then above at 255 */           \
+    out_m = __msa_maxi_s_w((v4i32)(in), 0);             \
+    out_m = __msa_min_s_w((v4i32)max_m, (v4i32)out_m);  \
+    out_m;                                              \
+})
+
+/* Description : Horizontal addition of 4 signed word elements of input vector
+   Arguments   : Input  - in       (signed word vector)
+                 Output - sum_m    (i32 sum)
+                 Return Type - signed word (GP)
+   Details     : 4 signed word elements of 'in' vector are added together and
+                 the resulting integer sum is returned
+*/
+#define HADD_SW_S32(in)                                 \
+({                                                      \
+    v2i64 res0_m, res1_m;                               \
+    int32_t sum_m;                                      \
+    /* pairwise add, then fold upper half onto lower */ \
+    res0_m = __msa_hadd_s_d((v4i32)(in), (v4i32)(in));  \
+    res1_m = __msa_splati_d(res0_m, 1);                 \
+    res0_m = res0_m + res1_m;                           \
+    sum_m = __msa_copy_s_w((v4i32)res0_m, 0);           \
+    sum_m;                                              \
+})
+
+/* Description : Horizontal addition of 8 unsigned halfword elements
+   Arguments   : Inputs  - in       (unsigned halfword vector)
+                 Outputs - sum_m    (u32 sum)
+                 Return Type - unsigned word
+   Details     : 8 unsigned halfword elements of input vector are added
+                 together and the resulting integer sum is returned
+*/
+#define HADD_UH_U32(in)                                  \
+({                                                       \
+    v4u32 res_m;                                         \
+    v2u64 res0_m, res1_m;                                \
+    uint32_t sum_m;                                      \
+    /* two pairwise adds, then fold upper onto lower */  \
+    res_m = __msa_hadd_u_w((v8u16)(in), (v8u16)(in));    \
+    res0_m = __msa_hadd_u_d(res_m, res_m);               \
+    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);    \
+    res0_m = res0_m + res1_m;                            \
+    sum_m = __msa_copy_u_w((v4i32)res0_m, 0);            \
+    sum_m;                                               \
+})
+
+/* Description : Horizontal addition of unsigned byte vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each unsigned odd byte element from 'in0' is added to
+                 even unsigned byte element from 'in0' (pairwise) and the
+                 halfword result is written to 'out0'
+*/
+#define HADD_UB2(RTYPE, in0, in1, out0, out1)                  \
+{                                                              \
+    out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0));  \
+    out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1));  \
+}
+#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
+
+/* Description : Horizontal subtraction of unsigned byte vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each unsigned odd byte element from 'in0' is subtracted from
+                 even unsigned byte element from 'in0' (pairwise) and the
+                 halfword result is written to 'out0'
+*/
+#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                  \
+{                                                              \
+    out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0));  \
+    out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1));  \
+}
+#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
+
+/* Description : Horizontal subtraction of signed halfword vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE (v4i32 for the _SW variant)
+   Details     : Despite the 'UH' in the name, inputs are cast to v8i16 and the
+                 signed hsub is used: adjacent odd/even halfwords of 'in0' are
+                 subtracted pairwise into words in 'out0'; 'in1' -> 'out1'
+*/
+#define HSUB_UH2(RTYPE, in0, in1, out0, out1)              \
+{                                                          \
+    out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0);  \
+    out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1);  \
+}
+#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
+
+/* Description : Set doubleword elements of a vector from GPR values
+   Arguments   : Inputs - in0, in1
+                 Output - out (updated in place)
+                 Return Type - as per RTYPE (v16i8 for the _SB variant)
+   Details     : Element 0 of 'out' is set to 'in0', then element 1 to 'in1'
+*/
+#define INSERT_D2(RTYPE, in0, in1, out)               \
+{                                                     \
+    out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
+    out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
+}
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even byte elements of 'in0' and 'in1' are interleaved into
+                 'out0' (intrinsic operands are (in1, in0)); in2/in3 -> 'out1'
+*/
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);  \
+    out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2);  \
+}
+#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
+#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
+#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even halfword elements of 'in0' and 'in1' are interleaved into
+                 'out0' (intrinsic operands are (in1, in0)); in2/in3 -> 'out1'
+*/
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);  \
+    out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2);  \
+}
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even word elements of 'in0' and 'in1' are interleaved into
+                 'out0' (intrinsic operands are (in1, in0)); in2/in3 -> 'out1'
+*/
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);  \
+    out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2);  \
+}
+#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even double word elements of 'in0' and 'in1' are interleaved
+                 into 'out0' (intrinsic operands are (in1, in0)); in2/in3 -> 'out1'
+*/
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);  \
+    out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2);  \
+}
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left-half bytes of 'in0' and 'in1' are interleaved into 'out0',
+                 in2/in3 into 'out1'; ILVL_B4 is two ILVL_B2 calls (out0..out3)
+*/
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);  \
+    out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3);  \
+}
+#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
+#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+
+#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3)                         \
+{                                                               \
+    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
+
+/* Description : Interleave left half of halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of halfword elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'; in2/in3 likewise -> 'out1'
+*/
+#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);  \
+    out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3);  \
+}
+#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave left half of word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'; in2/in3 likewise -> 'out1'
+*/
+#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
+    out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3);  \
+}
+#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right-half bytes of 'in0' and 'in1' are interleaved into 'out0',
+                 in2/in3 into 'out1'; ILVR_B4 is two ILVR_B2 calls (out0..out3)
+*/
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
+    out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3);  \
+}
+#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
+#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
+#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
+
+#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3)                         \
+{                                                               \
+    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
+#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
+#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
+#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
+#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
+
+/* Description : Interleave right half of halfword (or word) elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right-half halfwords of 'in0'/'in1' are interleaved into 'out0',
+                 in2/in3 into 'out1' (H4: out0..out3); ILVR_W2 does words
+*/
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
+    out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3);  \
+}
+#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
+
+#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3)                         \
+{                                                               \
+    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
+#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
+
+#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
+    out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3);  \
+}
+#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of double word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right-half double words of 'in0'/'in1' are interleaved into
+                 'out0', in2/in3 into 'out1'; ILVR_D4 is two ILVR_D2 calls
+*/
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)       \
+{                                                            \
+    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));  \
+    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));  \
+}
+#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
+#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
+#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
+
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3)                         \
+{                                                               \
+    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right-half elements of 'in0' and 'in1' are interleaved into
+                 'out0', left-half elements into 'out1' (_B2/_H2/_W2 widths)
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1)            \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
+    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);  \
+}
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1)            \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
+    out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);  \
+}
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1)            \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
+    out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
+}
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+
+/* Description : Maximum values between signed elements of vector and
+                 5-bit signed immediate value are copied to the output vector
+   Arguments   : Inputs  - in0, in1, max_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE (v8i16 for the _SH variant)
+   Details     : Maximum of signed halfword element values from 'in0' and
+                 'max_val' are written in place; 'in1' is handled the same way
+*/
+#define MAXI_SH2(RTYPE, in0, in1, max_val)               \
+{                                                        \
+    in0 = (RTYPE)__msa_maxi_s_h((v8i16)in0, (max_val));  \
+    in1 = (RTYPE)__msa_maxi_s_h((v8i16)in1, (max_val));  \
+}
+#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+                 unsigned value of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs  - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each unsigned halfword element from 'in0' and 'in1' is
+                 saturated to the value generated with (sat_val + 1) bit range.
+                 The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val)             \
+{                                                     \
+    in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val);  \
+    in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val);  \
+}
+#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the signed range
+                 representable in (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs  - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each signed halfword element from 'in0' and 'in1' is saturated
+                 with __msa_sat_s_h to the (sat_val + 1) bit range, in place.
+                 SAT_SH4 does the same for in0..in3
+*/
+#define SAT_SH2(RTYPE, in0, in1, sat_val)             \
+{                                                     \
+    in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val);  \
+    in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val);  \
+}
+#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
+
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
+{                                                    \
+    SAT_SH2(RTYPE, in0, in1, sat_val);               \
+    SAT_SH2(RTYPE, in2, in3, sat_val);               \
+}
+#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Indexed halfword element values are replicated to all
+                 elements in output vector
+   Arguments   : Inputs  - in, idx0, idx1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : 'idx0' element value from 'in' vector is replicated to all
+                 elements in 'out0' vector, 'idx1' to 'out1' (and 'idx2' to
+                 'out2' for SPLATI_H3). Valid halfword index range is 0-7
+*/
+#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
+{                                                     \
+    out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0);    \
+    out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1);    \
+}
+#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
+#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
+
+#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,      \
+                  out0, out1, out2)                 \
+{                                                   \
+    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);   \
+    out2 = (RTYPE)__msa_splati_h((v8i16)in, idx2);  \
+}
+#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
+#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
+
+/* Description : Indexed word element values are replicated to all
+                 elements in output vector
+   Arguments   : Inputs  - in, stidx
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : 'stidx' element value from 'in' vector is replicated to all
+                 elements in 'out0' vector, and element 'stidx + 1' to all
+                 elements in 'out1' vector
+                 Valid index range for word operation is 0-3, so 'stidx' <= 2
+                 'stidx' is expanded unparenthesized; pass a plain constant
+*/
+#define SPLATI_W2(RTYPE, in, stidx, out0, out1)          \
+{                                                        \
+    out0 = (RTYPE)__msa_splati_w((v4i32)in, stidx);      \
+    out1 = (RTYPE)__msa_splati_w((v4i32)in, (stidx+1));  \
+}
+#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
+
+/* Description : Pack even byte elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even byte elements of 'in0' are copied to the left half of
+                 'out0' & even byte elements of 'in1' are copied to the right
+                 half of 'out0'; 'in2'/'in3' fill 'out1' (B4: in4..in7 too).
+*/
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);  \
+    out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3);  \
+}
+#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
+#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3)                         \
+{                                                                \
+    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even halfword elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even halfword elements of 'in0' are copied to the left half of
+                 'out0' & even halfword elements of 'in1' are copied to the
+                 right half of 'out0'; in2/in3 fill 'out1' (H4: in4..in7 too).
+*/
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1);  \
+    out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3);  \
+}
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+
+#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3)                         \
+{                                                                \
+    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even double word elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even double elements of 'in0' are copied to the left half of
+                 'out0' & even double elements of 'in1' are copied to the right
+                 half of 'out0'; 'in2' and 'in3' fill 'out1' the same way.
+*/
+#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1);  \
+    out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3);  \
+}
+#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
+#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
+
+/* Description : Pack odd double word elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Odd double word elements of 'in0' are copied to the left half
+                 of 'out0' & odd double word elements of 'in1' are copied to
+                 the right half of 'out0'; 'in2' and 'in3' fill 'out1' likewise.
+*/
+#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_pckod_d((v2i64)in0, (v2i64)in1);  \
+    out1 = (RTYPE)__msa_pckod_d((v2i64)in2, (v2i64)in3);  \
+}
+#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
+#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
+
+/* Description : Each byte element is logically xor'ed with immediate 128
+   Arguments   : Inputs  - in0, in1 (up to in7 for the larger variants)
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each byte element of every listed input vector is xor'ed
+                 with 128 in place; B3/B4/B5/B8 are built from XORI_B2_128.
+*/
+#define XORI_B2_128(RTYPE, in0, in1)             \
+{                                                \
+    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);  \
+    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128);  \
+}
+#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
+#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
+
+#define XORI_B3_128(RTYPE, in0, in1, in2)        \
+{                                                \
+    XORI_B2_128(RTYPE, in0, in1);                \
+    in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128);  \
+}
+#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
+
+#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
+{                                               \
+    XORI_B2_128(RTYPE, in0, in1);               \
+    XORI_B2_128(RTYPE, in2, in3);               \
+}
+#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
+#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
+
+#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
+{                                                    \
+    XORI_B3_128(RTYPE, in0, in1, in2);               \
+    XORI_B2_128(RTYPE, in3, in4);                    \
+}
+#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
+
+#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
+{                                                                   \
+    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
+    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
+}
+#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
+
+/* Description : Shift left all elements of vector (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in place operation
+                 Return Type - as per input vector RTYPE
+   Details     : Each of in0..in3 is replaced by itself << 'shift'. Arguments
+                 are expanded unparenthesized, so pass simple operands.
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift)  \
+{                                           \
+    in0 = in0 << shift;                     \
+    in1 = in1 << shift;                     \
+    in2 = in2 << shift;                     \
+    in3 = in3 << shift;                     \
+}
+
+/* Description : Arithmetic shift right all elements of vector
+                 (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in place operation
+                 Return Type - as per input vector RTYPE
+   Details     : Each of in0..in3 is replaced by itself >> 'shift' (plain C
+                 '>>', operands unparenthesized). 'shift' is a GP variable.
+*/
+#define SRA_4V(in0, in1, in2, in3, shift)  \
+{                                          \
+    in0 = in0 >> shift;                    \
+    in1 = in1 >> shift;                    \
+    in2 = in2 >> shift;                    \
+    in3 = in3 >> shift;                    \
+}
+
+/* Description : Shift right arithmetic rounded words
+   Arguments   : Inputs  - in0, in1, shift
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is shifted right arithmetically by
+                 the number of bits in the corresponding element in the vector
+                 'shift'. The last discarded bit is added to shifted value for
+                 rounding and the result is written in-place.
+                 'shift' is a vector. 'in1' (W4: in0..in3) is handled alike.
+*/
+#define SRAR_W2(RTYPE, in0, in1, shift)                   \
+{                                                         \
+    in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift);  \
+    in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift);  \
+}
+
+#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
+{                                                  \
+    SRAR_W2(RTYPE, in0, in1, shift);               \
+    SRAR_W2(RTYPE, in2, in3, shift);               \
+}
+#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
+
+/* Description : Shift right arithmetic rounded (immediate)
+   Arguments   : Inputs  - in0, in1, shift (in0..in3 for the *4 variants)
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each element of every input vector is shifted right
+                 arithmetically by 'shift'. The last discarded bit is added to
+                 the shifted value for rounding and the result is written
+                 in-place. 'shift' is an immediate value.
+                 _H variants use __msa_srari_h, _W variants __msa_srari_w.
+*/
+#define SRARI_H2(RTYPE, in0, in1, shift)            \
+{                                                   \
+    in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift);  \
+    in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift);  \
+}
+#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
+#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
+
+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)  \
+{                                                   \
+    SRARI_H2(RTYPE, in0, in1, shift);               \
+    SRARI_H2(RTYPE, in2, in3, shift);               \
+}
+#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
+#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
+
+#define SRARI_W2(RTYPE, in0, in1, shift)            \
+{                                                   \
+    in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);  \
+    in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);  \
+}
+
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
+{                                                   \
+    SRARI_W2(RTYPE, in0, in1, shift);               \
+    SRARI_W2(RTYPE, in2, in3, shift);               \
+}
+#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
+
+/* Description : Multiplication of pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : out0 = in0 * in1, out1 = in2 * in3 (MUL4 adds in4..in7 ->
+                 out2, out3). Operands are unparenthesized: pass simple values
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1)  \
+{                                             \
+    out0 = in0 * in1;                         \
+    out1 = in2 * in3;                         \
+}
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3)                  \
+{                                                     \
+    MUL2(in0, in1, in2, in3, out0, out1);             \
+    MUL2(in4, in5, in6, in7, out2, out3);             \
+}
+
+/* Description : Addition of 2 pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : out0 = in0 + in1, out1 = in2 + in3 (ADD4 adds in4..in7 ->
+                 out2, out3). Operands are unparenthesized: pass simple values
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1)  \
+{                                             \
+    out0 = in0 + in1;                         \
+    out1 = in2 + in3;                         \
+}
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3)                  \
+{                                                     \
+    ADD2(in0, in1, in2, in3, out0, out1);             \
+    ADD2(in4, in5, in6, in7, out2, out3);             \
+}
+
+/* Description : Subtraction of 2 pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : out0 = in0 - in1, out1 = in2 - in3 (SUB4 adds in4..in7 ->
+                 out2, out3). Operands are unparenthesized: pass simple values
+*/
+#define SUB2(in0, in1, in2, in3, out0, out1)  \
+{                                             \
+    out0 = in0 - in1;                         \
+    out1 = in2 - in3;                         \
+}
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3)                  \
+{                                                     \
+    out0 = in0 - in1;                                 \
+    out1 = in2 - in3;                                 \
+    out2 = in4 - in5;                                 \
+    out3 = in6 - in7;                                 \
+}
+
+/* Description : Sign extend halfword elements from right half of the vector
+   Arguments   : Input  - in    (halfword vector)
+                 Output - out   (sign extended word vector)
+                 Return Type - signed word
+   Details     : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved with same vector 'in' to generate
+                 4 word elements keeping sign intact
+*/
+#define UNPCK_R_SH_SW(in, out)                     \
+{                                                  \
+    v8i16 sign_m;                                  \
+                                                   \
+    sign_m = __msa_clti_s_h((v8i16)in, 0);         \
+    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in);  \
+}
+
+/* Description : Zero extend unsigned byte elements to halfword elements
+   Arguments   : Input   - in          (unsigned byte vector)
+                 Outputs - out0, out1  (zero extended halfword vectors)
+                 Return Type - signed halfword (via ILVRL_B2_SH)
+   Details     : Zero extended right half of vector is returned in 'out0'
+                 Zero extended left half of vector is returned in 'out1'
+*/
+#define UNPCK_UB_SH(in, out0, out1)       \
+{                                         \
+    v16i8 zero_m = { 0 };                 \
+                                          \
+    ILVRL_B2_SH(zero_m, in, out0, out1);  \
+}
+
+/* Description : Sign extend halfword elements from input vector and return
+                 the result in pair of vectors
+   Arguments   : Input   - in            (halfword vector)
+                 Outputs - out0, out1   (sign extended word vectors)
+                 Return Type - signed word
+   Details     : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved right with same vector 'in' to
+                 generate 4 signed word elements in 'out0'
+                 Then interleaved left with same vector 'in' to
+                 generate 4 signed word elements in 'out1'
+*/
+#define UNPCK_SH_SW(in, out0, out1)        \
+{                                          \
+    v8i16 tmp_m;                           \
+                                           \
+    tmp_m = __msa_clti_s_h((v8i16)in, 0);  \
+    ILVRL_H2_SW(tmp_m, in, out0, out1);    \
+}
+
+/* Description : Butterfly of 4 input vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3 (assigned in this order)
+   Details     : out0=in0+in3, out1=in1+in2, out2=in1-in2, out3=in0-in3
+*/
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                \
+    out0 = in0 + in3;                                            \
+    out1 = in1 + in2;                                            \
+                                                                 \
+    out2 = in1 - in2;                                            \
+    out3 = in0 - in3;                                            \
+}
+
+/* Description : Transpose input 8x8 byte block
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - as per RTYPE (v16u8 for the _UB_UB variant)
+*/
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
+                        out0, out1, out2, out3, out4, out5, out6, out7)  \
+{                                                                        \
+    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
+    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
+                                                                         \
+    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
+               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
+    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
+    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
+    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
+    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
+    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                         \
+    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                         \
+}
+#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
+
+/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
+                           in8, in9, in10, in11, in12, in13, in14, in15
+                 Outputs - out0, out1, out2, out3
+                 Return Type - unsigned byte
+*/
+#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
+                            in8, in9, in10, in11, in12, in13, in14, in15,  \
+                            out0, out1, out2, out3)                        \
+{                                                                          \
+    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
+                                                                           \
+    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
+    out1 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m);                           \
+                                                                           \
+    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
+    out3 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m);                           \
+                                                                           \
+    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
+                                                                           \
+    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
+    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
+                                                                           \
+    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
+    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
+    out0 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+    out2 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+                                                                           \
+    tmp0_m = (v2i64)__msa_ilvod_b((v16i8)out3, (v16i8)out1);               \
+    tmp1_m = (v2i64)__msa_ilvod_b((v16i8)tmp3_m, (v16i8)tmp2_m);           \
+    out1 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+    out3 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+}
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
+                           in8, in9, in10, in11, in12, in13, in14, in15
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - unsigned byte
+*/
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
+                            in8, in9, in10, in11, in12, in13, in14, in15,    \
+                            out0, out1, out2, out3, out4, out5, out6, out7)  \
+{                                                                            \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
+    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
+                                                                             \
+    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
+    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
+    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
+    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
+                                                                             \
+    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                 \
+    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                 \
+    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                 \
+    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                 \
+    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                   \
+    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                 \
+    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                   \
+    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                 \
+                                                                             \
+    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
+    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+                                                                             \
+    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                 \
+    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+                                                                             \
+    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
+    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+                                                                             \
+    /* tmp4_m..tmp7_m are the odd-byte interleaves built above; their */     \
+    /* odd halfwords supply the remaining two outputs (out3, out7).   */     \
+    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);             \
+    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);             \
+    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+}
+
+/* Description : Transpose 4x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                       \
+    v8i16 s0_m, s1_m;                                                   \
+                                                                        \
+    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
+    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
+    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);               \
+    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);               \
+}
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                       \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
+                                                                        \
+    ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                     \
+    ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                     \
+    ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);             \
+    ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);             \
+}
+
+/* Description : Transpose 4x4 block with word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed word
+*/
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                       \
+    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
+                                                                        \
+    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
+    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
+                                                                        \
+    out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);               \
+    out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);               \
+    out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);               \
+    out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);               \
+}
+
+/* Description : Dot product and addition of 3 signed halfword input vectors
+   Arguments   : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
+                 Output - out0_m
+                 Return Type - signed halfword
+   Details     : Dot product of 'in0' with 'coeff0' (operands cast to v16i8)
+                 Dot product of 'in1' with 'coeff1', accumulated into the first
+                 Dot product of 'in2' with 'coeff2'
+                 Addition of the results (last add is saturating __msa_adds_s_h)
+                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
+*/
+#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)       \
+({                                                                \
+    v8i16 tmp1_m;                                                 \
+    v8i16 out0_m;                                                 \
+                                                                  \
+    out0_m = __msa_dotp_s_h((v16i8)in0, (v16i8)coeff0);           \
+    out0_m = __msa_dpadd_s_h(out0_m, (v16i8)in1, (v16i8)coeff1);  \
+    tmp1_m = __msa_dotp_s_h((v16i8)in2, (v16i8)coeff2);           \
+    out0_m = __msa_adds_s_h(out0_m, tmp1_m);                      \
+                                                                  \
+    out0_m;                                                       \
+})
+
+/* Description : Pack even elements of input vectors & xor with 128
+   Arguments   : Inputs - in0, in1
+                 Output - out_m
+                 Return Type - unsigned byte
+   Details     : Signed byte even elements from 'in0' and 'in1' are packed
+                 together in one vector and the resulting vector is xor'ed with
+                 128 to shift the range from signed to unsigned byte
+*/
+#define PCKEV_XORI128_UB(in0, in1)                         \
+({                                                         \
+    v16u8 out_m;                                           \
+    out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0);  \
+    out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);        \
+    out_m;                                                 \
+})
+
+/* Description : Pack even byte elements and store byte vector in destination
+                 memory
+   Arguments   : Inputs - in0, in1, pdst
+*/
+#define PCKEV_ST_SB(in0, in1, pdst)                 \
+{                                                   \
+    v16i8 tmp_m;                                    \
+    tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0);  \
+    ST_SB(tmp_m, (pdst));                           \
+}
+
+/* Description : Horizontal 2 tap filter kernel code
+   Arguments   : Inputs - in0, in1, mask, coeff, shift;  Result - v8u16 value
+*/
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)         \
+({                                                               \
+    v16i8 tmp0_m;                                                \
+    v8u16 tmp1_m;                                                \
+                                                                 \
+    tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0);  \
+    tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff);        \
+    tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);         \
+                                                                 \
+    tmp1_m;                                                      \
+})
+#endif  /* VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ */
diff --git a/libs/libvpx/vp8/common/modecont.c b/libs/libvpx/vp8/common/modecont.c
new file mode 100644
index 0000000000..86a74bc0ff
--- /dev/null
+++ b/libs/libvpx/vp8/common/modecont.c
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+const int vp8_mode_contexts[6][4] =
+{
+    {
+        /* 0 */
+        7,     1,     1,   143,
+    },
+    {
+        /* 1 */
+        14,    18,    14,   107,
+    },
+    {
+        /* 2 */
+        135,    64,    57,    68,
+    },
+    {
+        /* 3 */
+        60,    56,   128,    65,
+    },
+    {
+        /* 4 */
+        159,   134,   128,    34,
+    },
+    {
+        /* 5 */
+        234,   188,   128,    28,
+    },
+};
diff --git a/libs/libvpx/vp8/common/modecont.h b/libs/libvpx/vp8/common/modecont.h
new file mode 100644
index 0000000000..ff34c33c55
--- /dev/null
+++ b/libs/libvpx/vp8/common/modecont.h
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_MODECONT_H_
+#define VP8_COMMON_MODECONT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const int vp8_mode_contexts[6][4];
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_MODECONT_H_
diff --git a/libs/libvpx/vp8/common/mv.h b/libs/libvpx/vp8/common/mv.h
new file mode 100644
index 0000000000..111ccd63c7
--- /dev/null
+++ b/libs/libvpx/vp8/common/mv.h
@@ -0,0 +1,36 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_MV_H_
+#define VP8_COMMON_MV_H_
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct
+{
+    short row;
+    short col;
+} MV;
+
+typedef union int_mv
+{
+    uint32_t  as_int;
+    MV        as_mv;
+} int_mv;        /* facilitates faster equality tests and copies */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_MV_H_
diff --git a/libs/libvpx/vp8/common/onyx.h b/libs/libvpx/vp8/common/onyx.h
new file mode 100644
index 0000000000..febe81505a
--- /dev/null
+++ b/libs/libvpx/vp8/common/onyx.h
@@ -0,0 +1,282 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ONYX_H_
+#define VP8_COMMON_ONYX_H_
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+
+    struct VP8_COMP;
+
+    /* Create/destroy static data structures. */
+
+    typedef enum
+    {
+        NORMAL      = 0,   /* Scale2Ratio: *hr = 1, *hs = 1 */
+        FOURFIVE    = 1,   /* Scale2Ratio: *hr = 4, *hs = 5 */
+        THREEFIVE   = 2,   /* Scale2Ratio: *hr = 3, *hs = 5 */
+        ONETWO      = 3    /* Scale2Ratio: *hr = 1, *hs = 2 */
+
+    } VPX_SCALING;
+
+    typedef enum
+    {
+        USAGE_LOCAL_FILE_PLAYBACK   = 0x0,
+        USAGE_STREAM_FROM_SERVER    = 0x1,
+        USAGE_CONSTRAINED_QUALITY   = 0x2,
+        USAGE_CONSTANT_QUALITY      = 0x3
+    } END_USAGE;
+
+
+    typedef enum
+    {
+        MODE_REALTIME       = 0x0,
+        MODE_GOODQUALITY    = 0x1,
+        MODE_BESTQUALITY    = 0x2,
+        MODE_FIRSTPASS      = 0x3,
+        MODE_SECONDPASS     = 0x4,
+        MODE_SECONDPASS_BEST = 0x5
+    } MODE;
+
+    typedef enum
+    {
+        FRAMEFLAGS_KEY    = 1,
+        FRAMEFLAGS_GOLDEN = 2,
+        FRAMEFLAGS_ALTREF = 4
+    } FRAMETYPE_FLAGS;
+
+
+#include <assert.h>
+    static INLINE void Scale2Ratio(int mode, int *hr, int *hs)
+    {
+        /* Store the two values selected by a VPX_SCALING mode in *hr
+         * and *hs; an unrecognised mode stores 1 and 1, then asserts. */
+        int first = 1;
+        int second = 1;
+
+        if (mode == FOURFIVE)
+        {
+            first = 4;
+            second = 5;
+        }
+        else if (mode == THREEFIVE)
+        {
+            first = 3;
+            second = 5;
+        }
+        else if (mode == ONETWO)
+        {
+            second = 2;
+        }
+
+        *hr = first;
+        *hs = second;
+        if (mode < NORMAL || mode > ONETWO)
+            assert(0);
+    }
+
+    typedef struct
+    {
+        /* 4 versions of bitstream defined:
+         *   0 best quality/slowest decode, 3 lowest quality/fastest decode
+         */
+        int Version;
+        int Width;
+        int Height;
+        struct vpx_rational  timebase;
+        unsigned int target_bandwidth;    /* kilobits per second */
+
+        /* Parameter used for applying denoiser.
+         * For temporal denoiser: noise_sensitivity = 0 means off,
+         * noise_sensitivity = 1 means temporal denoiser on for Y channel only,
+         * noise_sensitivity = 2 means temporal denoiser on for all channels.
+         * noise_sensitivity = 3 means aggressive denoising mode.
+         * noise_sensitivity >= 4 means adaptive denoising mode.
+         * Temporal denoiser is enabled via the configuration option:
+         * CONFIG_TEMPORAL_DENOISING.
+         * For spatial denoiser: noise_sensitivity controls the amount of
+         * pre-processing blur: noise_sensitivity = 0 means off.
+         * Spatial denoiser invoked under !CONFIG_TEMPORAL_DENOISING.
+         */
+        int noise_sensitivity;
+
+        /* parameter used for sharpening output: recommendation 0: */
+        int Sharpness;
+        int cpu_used;
+        unsigned int rc_max_intra_bitrate_pct;
+        unsigned int screen_content_mode;
+
+        /* mode ->
+         *(0)=Realtime/Live Encoding. This mode is optimized for realtime
+         *    encoding (for example, capturing a television signal or feed
+         *    from a live camera). ( speed setting controls how fast )
+         *(1)=Good Quality Fast Encoding. The encoder balances quality with
+         *    the amount of time it takes to encode the output. ( speed
+         *    setting controls how fast )
+         *(2)=One Pass - Best Quality. The encoder places priority on the
+         *    quality of the output over encoding speed. The output is
+         *    compressed at the highest possible quality. This option takes
+         *    the longest amount of time to encode. ( speed setting ignored
+         *    )
+         *(3)=Two Pass - First Pass. The encoder generates a file of
+         *    statistics for use in the second encoding pass. ( speed
+         *    setting controls how fast )
+         *(4)=Two Pass - Second Pass. The encoder uses the statistics that
+         *    were generated in the first encoding pass to create the
+         *    compressed output. ( speed setting controls how fast )
+         *(5)=Two Pass - Second Pass Best.  The encoder uses the statistics
+         *    that were generated in the first encoding pass to create the
+         *    compressed output using the highest possible quality, and
+         *    taking a longer amount of time to encode. ( speed setting
+         *    ignored )
+         */
+        int Mode;
+
+        /* Key Framing Operations */
+        int auto_key;       /* automatically detect cut scenes */
+        int key_freq;       /* maximum distance to key frame. */
+
+        /* lagged compression (if allow_lag == 0 lag_in_frames is ignored) */
+        int allow_lag;
+        int lag_in_frames; /* how many frames lag before we start encoding */
+
+        /*
+         * DATARATE CONTROL OPTIONS
+         */
+
+        int end_usage; /* vbr or cbr */
+
+        /* buffer targeting aggressiveness */
+        int under_shoot_pct;
+        int over_shoot_pct;
+
+        /* buffering parameters */
+        int64_t starting_buffer_level;
+        int64_t optimal_buffer_level;
+        int64_t maximum_buffer_size;
+
+        int64_t starting_buffer_level_in_ms;
+        int64_t optimal_buffer_level_in_ms;
+        int64_t maximum_buffer_size_in_ms;
+
+        /* controlling quality */
+        int fixed_q;
+        int worst_allowed_q;
+        int best_allowed_q;
+        int cq_level;
+
+        /* allow internal resizing */
+        int allow_spatial_resampling;
+        int resample_down_water_mark;
+        int resample_up_water_mark;
+
+        /* allow internal frame rate alterations */
+        int allow_df;
+        int drop_frames_water_mark;
+
+        /* two pass datarate control */
+        int two_pass_vbrbias;
+        int two_pass_vbrmin_section;
+        int two_pass_vbrmax_section;
+
+        /*
+         * END DATARATE CONTROL OPTIONS
+         */
+
+        /* these parameters aren't to be used in final build don't use!!! */
+        int play_alternate;
+        int alt_freq;
+        int alt_q;
+        int key_q;
+        int gold_q;
+
+
+        int multi_threaded;   /* how many threads to run the encoder on */
+        int token_partitions; /* how many token partitions to create */
+
+        /* early breakout threshold: for video conf recommend 800 */
+        int encode_breakout;
+
+        /* Bitfield defining the error resiliency features to enable.
+         * Can provide decodable frames after losses in previous
+         * frames and decodable partitions after losses in the same frame.
+         */
+        unsigned int error_resilient_mode;
+
+        int arnr_max_frames;
+        int arnr_strength;
+        int arnr_type;
+
+        vpx_fixed_buf_t        two_pass_stats_in;
+        struct vpx_codec_pkt_list  *output_pkt_list;
+
+        vp8e_tuning tuning;
+
+        /* Temporal scaling parameters */
+        unsigned int number_of_layers;
+        unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY];
+        unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY];
+        unsigned int periodicity;
+        unsigned int layer_id[VPX_TS_MAX_PERIODICITY];
+
+#if CONFIG_MULTI_RES_ENCODING
+        /* Number of total resolutions encoded */
+        unsigned int mr_total_resolutions;
+
+        /* Current encoder ID */
+        unsigned int mr_encoder_id;
+
+        /* Down-sampling factor */
+        vpx_rational_t mr_down_sampling_factor;
+
+        /* Memory location to store low-resolution encoder's mode info */
+        void* mr_low_res_mode_info;
+#endif
+    } VP8_CONFIG;
+
+
+    void vp8_initialize(void);  /* takes no arguments */
+
+    struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf);
+    void vp8_remove_compressor(struct VP8_COMP* *comp);
+
+    void vp8_init_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);
+    void vp8_change_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);
+
+    int vp8_receive_raw_frame(struct VP8_COMP* comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp);
+    int vp8_get_compressed_data(struct VP8_COMP* comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush);
+    int vp8_get_preview_raw_frame(struct VP8_COMP* comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
+
+    int vp8_use_as_reference(struct VP8_COMP* comp, int ref_frame_flags);
+    int vp8_update_reference(struct VP8_COMP* comp, int ref_frame_flags);
+    int vp8_get_reference(struct VP8_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+    int vp8_set_reference(struct VP8_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+    int vp8_update_entropy(struct VP8_COMP* comp, int update);
+    int vp8_set_roimap(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]);
+    int vp8_set_active_map(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols);
+    int vp8_set_internal_size(struct VP8_COMP* comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
+    int vp8_get_quantizer(struct VP8_COMP* c);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VP8_COMMON_ONYX_H_
diff --git a/libs/libvpx/vp8/common/onyxc_int.h b/libs/libvpx/vp8/common/onyxc_int.h
new file mode 100644
index 0000000000..6d89865c60
--- /dev/null
+++ b/libs/libvpx/vp8/common/onyxc_int.h
@@ -0,0 +1,185 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ONYXC_INT_H_
+#define VP8_COMMON_ONYXC_INT_H_
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "loopfilter.h"
+#include "entropymv.h"
+#include "entropy.h"
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif
+
+/*#ifdef PACKET_TESTING*/
+#include "header.h"
+/*#endif*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0
+#define MAXQ 127
+#define QINDEX_RANGE (MAXQ + 1)
+
+#define NUM_YV12_BUFFERS 4
+
+#define MAX_PARTITIONS 9
+
+typedef struct frame_contexts
+{
+    vp8_prob bmode_prob [VP8_BINTRAMODES-1];
+    vp8_prob ymode_prob [VP8_YMODES-1];   /* interframe intra mode probs */
+    vp8_prob uv_mode_prob [VP8_UV_MODES-1];
+    vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1];
+    vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+    MV_CONTEXT mvc[2];
+} FRAME_CONTEXT;
+
+typedef enum
+{
+    ONE_PARTITION  = 0,
+    TWO_PARTITION  = 1,
+    FOUR_PARTITION = 2,
+    EIGHT_PARTITION = 3
+} TOKEN_PARTITION;
+
+typedef enum
+{
+    RECON_CLAMP_REQUIRED        = 0,
+    RECON_CLAMP_NOTREQUIRED     = 1
+} CLAMP_TYPE;
+
+typedef struct VP8Common
+
+{
+    struct vpx_internal_error_info  error;
+
+    DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][2]);
+    DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][2]);
+    DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][2]);
+
+    int Width;
+    int Height;
+    int horiz_scale;
+    int vert_scale;
+
+    CLAMP_TYPE  clamp_type;
+
+    YV12_BUFFER_CONFIG *frame_to_show;
+
+    YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
+    int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
+    int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
+
+    YV12_BUFFER_CONFIG temp_scale_frame;
+
+#if CONFIG_POSTPROC
+    YV12_BUFFER_CONFIG post_proc_buffer;
+    YV12_BUFFER_CONFIG post_proc_buffer_int;
+    int post_proc_buffer_int_used;
+    unsigned char *pp_limits_buffer;   /* post-processing filter coefficients */
+#endif
+
+    FRAME_TYPE last_frame_type;  /* Save last frame's frame type for motion search. */
+    FRAME_TYPE frame_type;
+
+    int show_frame;
+
+    int frame_flags;
+    int MBs;
+    int mb_rows;
+    int mb_cols;
+    int mode_info_stride;
+
+    /* profile settings */
+    int mb_no_coeff_skip;
+    int no_lpf;
+    int use_bilinear_mc_filter;
+    int full_pixel;
+
+    int base_qindex;
+
+    int y1dc_delta_q;
+    int y2dc_delta_q;
+    int y2ac_delta_q;
+    int uvdc_delta_q;
+    int uvac_delta_q;
+
+    /* We allocate a MODE_INFO struct for each macroblock, together with
+       an extra row on top and column on the left to simplify prediction. */
+
+    MODE_INFO *mip; /* Base of allocated array */
+    MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
+#if CONFIG_ERROR_CONCEALMENT
+    MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
+    MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
+#endif
+    MODE_INFO *show_frame_mi;  /* MODE_INFO for the last decoded frame
+                                  to show */
+    LOOPFILTERTYPE filter_type;
+
+    loop_filter_info_n lf_info;
+
+    int filter_level;
+    int last_sharpness_level;
+    int sharpness_level;
+
+    int refresh_last_frame;       /* Two state 0 = NO, 1 = YES */
+    int refresh_golden_frame;     /* Two state 0 = NO, 1 = YES */
+    int refresh_alt_ref_frame;     /* Two state 0 = NO, 1 = YES */
+
+    int copy_buffer_to_gf;         /* 0 none, 1 Last to GF, 2 ARF to GF */
+    int copy_buffer_to_arf;        /* 0 none, 1 Last to ARF, 2 GF to ARF */
+
+    int refresh_entropy_probs;    /* Two state 0 = NO, 1 = YES */
+
+    int ref_frame_sign_bias[MAX_REF_FRAMES];    /* Two state 0, 1 */
+
+    /* Y,U,V,Y2 */
+    ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
+    ENTROPY_CONTEXT_PLANES left_context;  /* (up to) 4 contexts "" */
+
+    FRAME_CONTEXT lfc; /* last frame entropy */
+    FRAME_CONTEXT fc;  /* this frame entropy */
+
+    unsigned int current_video_frame;
+
+    int version;
+
+    TOKEN_PARTITION multi_token_partition;
+
+#ifdef PACKET_TESTING
+    VP8_HEADER oh;
+#endif
+#if CONFIG_POSTPROC_VISUALIZER
+    double bitrate;
+    double framerate;
+#endif
+
+#if CONFIG_MULTITHREAD
+    int processor_core_count;
+#endif
+#if CONFIG_POSTPROC
+    struct postproc_state  postproc_state;
+#endif
+    int cpu_caps;
+} VP8_COMMON;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_ONYXC_INT_H_
diff --git a/libs/libvpx/vp8/common/onyxd.h b/libs/libvpx/vp8/common/onyxd.h
new file mode 100644
index 0000000000..e37b29f32c
--- /dev/null
+++ b/libs/libvpx/vp8/common/onyxd.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ONYXD_H_
+#define VP8_COMMON_ONYXD_H_
+
+
+/* Create/destroy static data structures. */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vp8.h"
+
+    struct VP8D_COMP;  /* only ever used through pointers in this header */
+
+    typedef struct  /* NOTE(review): presumably the decoder creation settings - confirm */
+    {
+        int     Width;
+        int     Height;
+        int     Version;
+        int     postprocess;
+        int     max_threads;
+        int     error_concealment;
+    } VP8D_CONFIG;
+
+    typedef enum  /* selector type of the get/set-setting calls below; one value so far */
+    {
+        VP8D_OK = 0
+    } VP8D_SETTING;
+
+    void vp8dx_initialize(void);
+
+    void vp8dx_set_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst, int x);
+
+    int vp8dx_get_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst);
+    /* 'dest' is const-qualified: despite its name it is not written through. */
+    int vp8dx_receive_compressed_data(struct VP8D_COMP* comp,
+                                      size_t size, const uint8_t *dest,
+                                      int64_t time_stamp);
+    int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);
+
+    vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+    vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif  // VP8_COMMON_ONYXD_H_
diff --git a/libs/libvpx/vp8/common/postproc.c b/libs/libvpx/vp8/common/postproc.c
new file mode 100644
index 0000000000..322b61383b
--- /dev/null
+++ b/libs/libvpx/vp8/common/postproc.c
@@ -0,0 +1,1208 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_scale_rtcd.h"
+#include "vpx_scale/yv12config.h"
+#include "postproc.h"
+#include "common.h"
+#include "vpx_scale/vpx_scale.h"
+#include "systemdependent.h"
+
+#include <limits.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+/* Expands to three comma-separated Y, U, V initialisers for a 0xRRGGBB value; t is parenthesised but evaluated several times. */
+#define RGB_TO_YUV(t)                                                                       \
+    ( (0.257*(float)((t)>>16)) + (0.504*(float)((t)>>8&0xff)) + (0.098*(float)((t)&0xff)) + 16),  \
+    (-(0.148*(float)((t)>>16)) - (0.291*(float)((t)>>8&0xff)) + (0.439*(float)((t)&0xff)) + 128), \
+    ( (0.439*(float)((t)>>16)) - (0.368*(float)((t)>>8&0xff)) - (0.071*(float)((t)&0xff)) + 128)
+
+/* global constants */
+#if CONFIG_POSTPROC_VISUALIZER
+static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = /* rows of {Y, U, V}, converted from the RGB values below */
+{
+    { RGB_TO_YUV(0x98FB98) },   /* PaleGreen */
+    { RGB_TO_YUV(0x00FF00) },   /* Green */
+    { RGB_TO_YUV(0xADFF2F) },   /* GreenYellow */
+    { RGB_TO_YUV(0x228B22) },   /* ForestGreen */
+    { RGB_TO_YUV(0x006400) },   /* DarkGreen */
+    { RGB_TO_YUV(0x98F5FF) },   /* Cadet Blue */
+    { RGB_TO_YUV(0x6CA6CD) },   /* Sky Blue */
+    { RGB_TO_YUV(0x00008B) },   /* Dark blue */
+    { RGB_TO_YUV(0x551A8B) },   /* Purple */
+    { RGB_TO_YUV(0xFF0000) }    /* Red */
+};
+
+static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = /* rows of {Y, U, V}, converted from the RGB values below */
+{
+    { RGB_TO_YUV(0x6633ff) },   /* Purple */
+    { RGB_TO_YUV(0xcc33ff) },   /* Magenta */
+    { RGB_TO_YUV(0xff33cc) },   /* Pink */
+    { RGB_TO_YUV(0xff3366) },   /* Coral */
+    { RGB_TO_YUV(0x3366ff) },   /* Blue */
+    { RGB_TO_YUV(0xed00f5) },   /* Dark Blue -- NOTE(review): 0xed00f5 is a magenta hue; confirm value or label */
+    { RGB_TO_YUV(0x2e00b8) },   /* Dark Purple */
+    { RGB_TO_YUV(0xff6633) },   /* Orange */
+    { RGB_TO_YUV(0x33ccff) },   /* Light Blue */
+    { RGB_TO_YUV(0x8ab800) },   /* Green */
+    { RGB_TO_YUV(0xffcc33) },   /* Light Orange */
+    { RGB_TO_YUV(0x33ffcc) },   /* Aqua */
+    { RGB_TO_YUV(0x66ff33) },   /* Light Green */
+    { RGB_TO_YUV(0xccff33) },   /* Yellow */
+};
+
+static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = /* rows of {Y, U, V}, converted from the RGB values below */
+{
+    { RGB_TO_YUV(0x00ff00) },   /* Green */
+    { RGB_TO_YUV(0x0000ff) },   /* Blue */
+    { RGB_TO_YUV(0xffff00) },   /* Yellow */
+    { RGB_TO_YUV(0xff0000) },   /* Red */
+};
+#endif
+
+const short vp8_rv[] =  /* read by vp8_mbpost_proc_down_c, which adds an entry as the rounding term before its >> 4 */
+{
+    8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
+    0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
+    10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
+    8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+    8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
+    1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
+    3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
+    11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
+    14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
+    4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
+    7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
+    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+    3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
+    11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
+    14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
+    5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
+    0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
+};
+
+extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
+extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
+/* Smooth `size` rows: a vertical pass from src to dst, then a horizontal pass done in place on the dst row.
+ * A pixel is only replaced when the four neighbours at distance 1 and 2 all differ from it by less than f[col]. */
+void vp8_post_proc_down_and_across_mb_row_c
+(
+    unsigned char *src_ptr,
+    unsigned char *dst_ptr,
+    int src_pixels_per_line,
+    int dst_pixels_per_line,
+    int cols,
+    unsigned char *f,
+    int size
+)
+{
+    unsigned char *p_src, *p_dst;
+    int row;
+    int col;
+    unsigned char v;
+    unsigned char d[4];  /* results of the across pass, held back two columns */
+
+    for (row = 0; row < size; row++)
+    {
+        /* post_proc_down for one row */
+        p_src = src_ptr;
+        p_dst = dst_ptr;
+        /* reads two source rows above and two below the current one */
+        for (col = 0; col < cols; col++)
+        {
+            unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
+            unsigned char p_above1 = p_src[col - src_pixels_per_line];
+            unsigned char p_below1 = p_src[col + src_pixels_per_line];
+            unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
+
+            v = p_src[col];
+
+            if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
+                && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col]))
+            {
+                unsigned char k1, k2, k3;
+                k1 = (p_above2 + p_above1 + 1) >> 1;
+                k2 = (p_below2 + p_below1 + 1) >> 1;
+                k3 = (k1 + k2 + 1) >> 1;
+                v = (k3 + v + 1) >> 1;  /* rounded mean of the neighbour average and the pixel itself */
+            }
+
+            p_dst[col] = v;
+        }
+
+        /* now post_proc_across */
+        p_src = dst_ptr;
+        p_dst = dst_ptr;
+        /* replicate the row ends outwards: writes dst[-2], dst[-1], dst[cols] and dst[cols + 1] */
+        p_src[-2] = p_src[-1] = p_src[0];
+        p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
+
+        for (col = 0; col < cols; col++)
+        {
+            v = p_src[col];
+
+            if ((abs(v - p_src[col - 2]) < f[col])
+                && (abs(v - p_src[col - 1]) < f[col])
+                && (abs(v - p_src[col + 1]) < f[col])
+                && (abs(v - p_src[col + 2]) < f[col]))
+            {
+                unsigned char k1, k2, k3;
+                k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
+                k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
+                k3 = (k1 + k2 + 1) >> 1;
+                v = (k3 + v + 1) >> 1;
+            }
+
+            d[col & 3] = v;
+            /* store two columns late, so the reads above never see a value this pass already replaced */
+            if (col >= 2)
+                p_dst[col - 2] = d[(col - 2) & 3];
+        }
+
+        /* handle the last two pixels */
+        p_dst[col - 2] = d[(col - 2) & 3];
+        p_dst[col - 1] = d[(col - 1) & 3];
+
+        /* next row */
+        src_ptr += src_pixels_per_line;
+        dst_ptr += dst_pixels_per_line;
+    }
+}
+
+/* Quantizer -> limit: floor x at 20, stretch it about 50 by 10/8, return a third of the square. */
+static int q2mbl(int x)
+{
+    const int floored = (x < 20) ? 20 : x;
+    const int stretched = 50 + (floored - 50) * 10 / 8;
+    return stretched * stretched / 3;
+}
+
+void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit)
+{
+    int r, c, i;
+    /* Per row: where 15*sumsq - sum*sum of the 15-pixel window is below flimit, pixel = (8 + sum + pixel) >> 4. */
+    unsigned char *s = src;
+    unsigned char d[16];
+    /* Each row is extended in place: the 8 bytes before s[0] and 17 bytes from s[cols] are overwritten. */
+    for (r = 0; r < rows; r++)
+    {
+        int sumsq = 0;
+        int sum   = 0;
+
+        for (i = -8; i < 0; i++)
+          s[i]=s[0];
+
+        /* 17 avoids valgrind warning - we buffer values in c in d
+         * and only write them when we've read 8 ahead...
+         */
+        for (i = 0; i < 17; i++)
+          s[i+cols]=s[cols-1];
+
+        for (i = -8; i <= 6; i++)
+        {
+            sumsq += s[i] * s[i];
+            sum   += s[i];
+            d[i+8] = 0;
+        }
+        d[15] = 0;  /* the loop above stops at d[14]; d[15] is stored to s[-1] at c == 7 before it is ever set */
+        for (c = 0; c < cols + 8; c++)
+        {
+            int x = s[c+7] - s[c-8];
+            int y = s[c+7] + s[c-8];
+
+            sum  += x;
+            sumsq += x * y;
+
+            d[c&15] = s[c];
+
+            if (sumsq * 15 - sum * sum < flimit)
+            {
+                d[c&15] = (8 + sum + s[c]) >> 4;
+            }
+
+            s[c-8] = d[(c-8)&15];
+        }
+
+        s += pitch;
+    }
+}
+
+void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit)
+{
+    int r, c, i;
+    const short *rv3 = &vp8_rv[63&rand()];  /* random start in vp8_rv: the output depends on the rand() state */
+    /* Column-by-column in-place smoothing over a 15-row window; the rounding term comes from vp8_rv. */
+    for (c = 0; c < cols; c++ )
+    {
+        unsigned char *s = &dst[c];
+        int sumsq = 0;
+        int sum   = 0;
+        unsigned char d[16];
+        const short *rv2 = rv3 + ((c * 17) & 127);
+        /* extend the column in place: the 8 rows above and 17 rows from row `rows` are overwritten */
+        for (i = -8; i < 0; i++)
+          s[i*pitch]=s[0];
+
+        /* 17 avoids valgrind warning - we buffer values in c in d
+         * and only write them when we've read 8 ahead...
+         */
+        for (i = 0; i < 17; i++)
+          s[(i+rows)*pitch]=s[(rows-1)*pitch];
+
+        for (i = -8; i <= 6; i++)
+        {
+            sumsq += s[i*pitch] * s[i*pitch];
+            sum   += s[i*pitch];
+        }
+        /* s walks down the column; each result is stored 8 rows late via d[], and only once r >= 8 */
+        for (r = 0; r < rows + 8; r++)
+        {
+            sumsq += s[7*pitch] * s[ 7*pitch] - s[-8*pitch] * s[-8*pitch];
+            sum  += s[7*pitch] - s[-8*pitch];
+            d[r&15] = s[0];
+
+            if (sumsq * 15 - sum * sum < flimit)
+            {
+                d[r&15] = (rv2[r&127] + sum + s[0]) >> 4;
+            }
+            if (r >= 8)
+              s[-8*pitch] = d[(r-8)&15];
+            s += pitch;
+        }
+    }
+}
+
+#if CONFIG_POSTPROC
+static void vp8_de_mblock(YV12_BUFFER_CONFIG *post, int q)
+{
+    const int flimit = q2mbl(q);  /* one limit, shared by both passes over the luma plane */
+    vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride,
+                              post->y_height, post->y_width, flimit);
+    vp8_mbpost_proc_down(post->y_buffer, post->y_stride,
+                         post->y_height, post->y_width, flimit);
+}
+
+void vp8_deblock(VP8_COMMON                 *cm,
+                 YV12_BUFFER_CONFIG         *source,
+                 YV12_BUFFER_CONFIG         *post,
+                 int                         q,
+                 int                         low_var_thresh,
+                 int                         flag)
+{
+    double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+    int ppl = (int)(level + .5);
+    /* ppl is a cubic in q, plus .5, truncated to int; ppl <= 0 means "copy source to post unfiltered" */
+    const MODE_INFO *mode_info_context = cm->show_frame_mi;
+    int mbr, mbc;
+
+    /* The pixel thresholds are adjusted according to if or not the macroblock
+     * is a skipped block.  */
+    unsigned char *ylimits = cm->pp_limits_buffer;
+    unsigned char *uvlimits = cm->pp_limits_buffer + 16 * cm->mb_cols;  /* chroma limits follow the 16 * mb_cols luma limits */
+    (void) low_var_thresh;
+    (void) flag;
+
+    if (ppl > 0)
+    {
+        for (mbr = 0; mbr < cm->mb_rows; mbr++)
+        {
+            unsigned char *ylptr = ylimits;
+            unsigned char *uvlptr = uvlimits;
+            for (mbc = 0; mbc < cm->mb_cols; mbc++)
+            {
+                unsigned char mb_ppl;
+
+                if (mode_info_context->mbmi.mb_skip_coeff)
+                    mb_ppl = (unsigned char)ppl >> 1;
+                else
+                    mb_ppl = (unsigned char)ppl;
+
+                memset(ylptr, mb_ppl, 16);
+                memset(uvlptr, mb_ppl, 8);
+
+                ylptr += 16;
+                uvlptr += 8;
+                mode_info_context++;
+            }
+            mode_info_context++;  /* NOTE(review): presumably skips a border entry ending each MODE_INFO row - confirm */
+            /* filter one macroblock row: 16 luma lines, then 8 lines of each chroma plane */
+            vp8_post_proc_down_and_across_mb_row(
+                source->y_buffer + 16 * mbr * source->y_stride,
+                post->y_buffer + 16 * mbr * post->y_stride, source->y_stride,
+                post->y_stride, source->y_width, ylimits, 16);
+
+            vp8_post_proc_down_and_across_mb_row(
+                source->u_buffer + 8 * mbr * source->uv_stride,
+                post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
+                post->uv_stride, source->uv_width, uvlimits, 8);
+            vp8_post_proc_down_and_across_mb_row(
+                source->v_buffer + 8 * mbr * source->uv_stride,
+                post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
+                post->uv_stride, source->uv_width, uvlimits, 8);
+        }
+    } else
+    {
+        vp8_yv12_copy_frame(source, post);
+    }
+}
+#endif
+
+void vp8_de_noise(VP8_COMMON                 *cm,
+                  YV12_BUFFER_CONFIG         *source,
+                  YV12_BUFFER_CONFIG         *post,
+                  int                         q,
+                  int                         low_var_thresh,
+                  int                         flag,
+                  int                         uvfilter)
+{
+    int mbr;
+    double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+    int ppl = (int)(level + .5);
+    int mb_rows = cm->mb_rows;
+    int mb_cols = cm->mb_cols;
+    unsigned char *limits = cm->pp_limits_buffer;
+    (void) post;
+    (void) low_var_thresh;
+    (void) flag;
+    /* source is filtered in place (post is unused) with one limit, ppl, for every column */
+    memset(limits, (unsigned char)ppl, 16 * mb_cols);
+
+    /* TODO: The original code doesn't filter the 2 outer rows and columns. */
+    for (mbr = 0; mbr < mb_rows; mbr++)
+    {
+        vp8_post_proc_down_and_across_mb_row(
+            source->y_buffer + 16 * mbr * source->y_stride,
+            source->y_buffer + 16 * mbr * source->y_stride,
+            source->y_stride, source->y_stride, source->y_width, limits, 16);
+        if (uvfilter == 1) {  /* chroma is only filtered on request; it reuses the luma limits */
+          vp8_post_proc_down_and_across_mb_row(
+              source->u_buffer + 8 * mbr * source->uv_stride,
+              source->u_buffer + 8 * mbr * source->uv_stride,
+              source->uv_stride, source->uv_stride, source->uv_width, limits,
+              8);
+          vp8_post_proc_down_and_across_mb_row(
+              source->v_buffer + 8 * mbr * source->uv_stride,
+              source->v_buffer + 8 * mbr * source->uv_stride,
+              source->uv_stride, source->uv_stride, source->uv_width, limits,
+              8);
+        }
+    }
+}
+
+static double gaussian(double sigma, double mu, double x)
+{
+    const double dev = x - mu;  /* normal density: exp(-dev^2 / (2 sigma^2)) / (sigma * sqrt(2 pi)) */
+    return 1 / (sigma * sqrt(2.0 * 3.14159265)) * exp(-dev * dev / (2 * sigma * sigma));
+}
+
+static void fillrd(struct postproc_state *state, int q, int a)
+{
+    char char_dist[300];  /* NOTE(review): filled without a bounds check; 300 is presumably headroom over 256 for rounding - confirm */
+
+    double sigma;
+    int i;
+
+    vp8_clear_system_state();
+
+    /* sigma grows with a and with (63 - q) */
+    sigma = a + .5 + .6 * (63 - q) / 63.0;
+
+    /* set up a lookup table of 256 entries that matches
+     * a gaussian distribution with sigma determined by q.
+     */
+    {
+        int next, j;
+
+        next = 0;
+
+        for (i = -32; i < 32; i++)
+        {
+            const int v = (int)(.5 + 256 * gaussian(sigma, 0, i));
+
+            if (v)
+            {
+                for (j = 0; j < v; j++)
+                {
+                    char_dist[next+j] = (char) i;
+                }
+
+                next = next + j;
+            }
+
+        }
+
+        for (; next < 256; next++)
+            char_dist[next] = 0;
+
+    }
+    /* draw 3072 noise samples; rand() & 0xff can only select one of the first 256 table entries */
+    for (i = 0; i < 3072; i++)
+    {
+        state->noise[i] = char_dist[rand() & 0xff];
+    }
+    /* char_dist[0] holds the lowest i that received an entry (or 0 if none did), so the clamps are its negation */
+    for (i = 0; i < 16; i++)
+    {
+        state->blackclamp[i] = -char_dist[0];
+        state->whiteclamp[i] = -char_dist[0];
+        state->bothclamp[i] = -2 * char_dist[0];
+    }
+    /* NOTE(review): the visible caller passes 63 - q here yet compares last_q with its own q - confirm the cache check works */
+    state->last_q = q;
+    state->last_noise = a;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_plane_add_noise_c
+ *
+ *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
+ *                                  noise to
+ *                  char *noise   noise samples; Width entries are read per row,
+ *                                  starting at a random offset of 0..255
+ *                  char blackclamp[16], whiteclamp[16]   only element [0] is read
+ *                  unsigned int Width, Height   size of plane; char bothclamp[16] is unused
+ *                  int  Pitch    distance between subsequent lines of frame
+ *
+ *  OUTPUTS       : the plane at Start is modified in place.
+ *
+ *  RETURNS       : void.
+ *
+ *  FUNCTION      : adds gaussian noise to a plane of pixels
+ *
+ *  SPECIAL NOTES : pixels are first clamped to [blackclamp[0], 255 - whiteclamp[0]]
+ *                  so that noise no larger than the clamps cannot wrap the sum.
+ ****************************************************************************/
+void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
+                           char blackclamp[16],
+                           char whiteclamp[16],
+                           char bothclamp[16],
+                           unsigned int Width, unsigned int Height, int Pitch)
+{
+    unsigned int i, j;
+    (void)bothclamp;
+
+    for (i = 0; i < Height; i++)
+    {
+        unsigned char *Pos = Start + i * Pitch;
+        char  *Ref = (char *)(noise + (rand() & 0xff));
+
+        for (j = 0; j < Width; j++)
+        {
+            if (Pos[j] < blackclamp[0])
+                Pos[j] = blackclamp[0];
+            /* whiteclamp is a non-negative margin (fillrd stores -char_dist[0]), so it is subtracted from 255 */
+            if (Pos[j] > 255 - whiteclamp[0])
+                Pos[j] = 255 - whiteclamp[0];
+
+            Pos[j] += Ref[j];
+        }
+    }
+}
+
+/* Blend the inside of a macroblock with a solid colour: the central 12x12
+ * luma pixels and 6x6 chroma pixels.  The outer ring is left untouched so
+ * that adjacent macroblocks filled with the same colour stay distinguishable.
+ */
+void vp8_blend_mb_inner_c (unsigned char *y, unsigned char *u, unsigned char *v,
+                        int y_1, int u_1, int v_1, int alpha, int stride)
+{
+    /* Every pixel keeps alpha/65536 of itself; the colour supplies the rest. */
+    const int colour_weight = (1<<16)-alpha;
+    const int y_term = y_1*colour_weight;
+    const int u_term = u_1*colour_weight;
+    const int v_term = v_1*colour_weight;
+    const int uv_stride = stride >> 1;
+    unsigned char *y_row = y + 2*stride + 2;    /* skip two rows, two columns */
+    unsigned char *u_row = u + uv_stride + 1;   /* skip one row, one column   */
+    unsigned char *v_row = v + uv_stride + 1;
+    int row, col;
+
+    for (row = 0; row < 12; row++)
+    {
+        for (col = 0; col < 12; col++)
+            y_row[col] = (y_row[col]*alpha + y_term)>>16;
+
+        y_row += stride;
+    }
+
+    for (row = 0; row < 6; row++)
+    {
+        for (col = 0; col < 6; col++)
+        {
+            u_row[col] = (u_row[col]*alpha + u_term)>>16;
+            v_row[col] = (v_row[col]*alpha + v_term)>>16;
+        }
+
+        u_row += uv_stride;
+        v_row += uv_stride;
+    }
+}
+
+/* Blend only the border of a macroblock with a solid colour: a ring two
+ * pixels wide in luma and one pixel wide in chroma.  The centre is left
+ * alone so that other visualizations can be layered on top of it.
+ * Pixels are updated in place.
+ *
+ *   y, u, v        top-left pixel of the macroblock in each plane
+ *   y_1, u_1, v_1  colour to blend towards
+ *   alpha          weight kept by the existing pixel, out of 1 << 16
+ *   stride         luma stride; chroma rows are stride >> 1 apart
+ */
+void vp8_blend_mb_outer_c (unsigned char *y, unsigned char *u, unsigned char *v,
+                        int y_1, int u_1, int v_1, int alpha, int stride)
+{
+    const int colour_weight = (1<<16)-alpha;
+    const int y_term = y_1*colour_weight;
+    const int u_term = u_1*colour_weight;
+    const int v_term = v_1*colour_weight;
+    const int uv_stride = stride >> 1;
+    int row, col;
+
+    /* Luma (16x16): the top two and bottom two rows are blended across
+     * their full width; the twelve rows in between only in their first
+     * two and last two columns.
+     */
+    for (row = 0; row < 16; row++)
+    {
+        const int whole_row = (row < 2) || (row >= 14);
+
+        for (col = 0; col < 16; col++)
+        {
+            const int edge_col = (col < 2) || (col >= 14);
+
+            if (whole_row || edge_col)
+                y[col] = (y[col]*alpha + y_term)>>16;
+        }
+
+        y += stride;
+    }
+
+    /* Chroma (8x8): the first and last rows are blended across their full
+     * width; the six rows in between only in columns 0 and 7.  U is
+     * written before V for every column.
+     */
+    for (row = 0; row < 8; row++)
+    {
+        const int whole_row = (row == 0) || (row == 7);
+
+        for (col = 0; col < 8; col++)
+        {
+            const int edge_col = (col == 0) || (col == 7);
+
+            if (whole_row || edge_col)
+            {
+                u[col] = (u[col]*alpha + u_term)>>16;
+                v[col] = (v[col]*alpha + v_term)>>16;
+            }
+        }
+
+        /* the pointers are not advanced past the last chroma row */
+        if (row < 7)
+        {
+            u += uv_stride;
+            v += uv_stride;
+        }
+    }
+}
+
+void vp8_blend_b_c (unsigned char *y, unsigned char *u, unsigned char *v,
+                        int y_1, int u_1, int v_1, int alpha, int stride)
+{
+    /* Blend a 4x4 luma block and the matching 2x2 chroma blocks towards the
+     * flat colour (y_1, u_1, v_1), in place.  Each pixel keeps alpha/65536
+     * of its own value and takes the remainder from the colour.  Chroma
+     * rows are stride >> 1 bytes apart.
+     */
+    const int colour_weight = (1<<16)-alpha;
+    const int y_term = y_1*colour_weight;
+    const int u_term = u_1*colour_weight;
+    const int v_term = v_1*colour_weight;
+    const int uv_stride = stride >> 1;
+    int row, col;
+
+    for (row = 0; row < 4; row++, y += stride)
+    {
+        for (col = 0; col < 4; col++)
+            y[col] = (y[col]*alpha + y_term)>>16;
+    }
+
+    for (row = 0; row < 2; row++, u += uv_stride, v += uv_stride)
+    {
+        for (col = 0; col < 2; col++)
+        {
+            u[col] = (u[col]*alpha + u_term)>>16;
+            v[col] = (v[col]*alpha + v_term)>>16;
+        }
+    }
+}
+
+#if CONFIG_POSTPROC_VISUALIZER
+static void constrain_line (int x_0, int *x_1, int y_0, int *y_1, int width, int height)
+{
+    /* Clip the end point (*x_1, *y_1) of a line from (x_0, y_0) against 0..width and 0..height.
+     * Limits are applied in turn, each starting from the point the previous one left;
+     * the other coordinate follows the slope unless that would divide by zero. */
+    int end_x = *x_1;
+    int end_y = *y_1;
+    int dx, dy;
+
+    if (end_x > width)
+    {
+        dx = end_x - x_0;
+        dy = end_y - y_0;
+        end_x = width;
+        end_y = dx ? ((width-x_0)*dy)/dx + y_0 : end_y;
+    }
+    if (end_x < 0)
+    {
+        dx = end_x - x_0;
+        dy = end_y - y_0;
+        end_x = 0;
+        end_y = dx ? ((0-x_0)*dy)/dx + y_0 : end_y;
+    }
+
+    if (end_y > height)
+    {
+        dx = end_x - x_0;
+        dy = end_y - y_0;
+        end_y = height;
+        end_x = dy ? ((height-y_0)*dx)/dy + x_0 : end_x;
+    }
+    if (end_y < 0)
+    {
+        dx = end_x - x_0;
+        dy = end_y - y_0;
+        end_y = 0;
+        end_x = dy ? ((0-y_0)*dx)/dy + x_0 : end_x;
+    }
+
+    *x_1 = end_x;
+    *y_1 = end_y;
+}
+#endif  // CONFIG_POSTPROC_VISUALIZER
+
+#if CONFIG_POSTPROC
+int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
+{
+    int q = oci->filter_level * 10 / 6;
+    int flags = ppflags->post_proc_flag;
+    int deblock_level = ppflags->deblocking_level;
+    int noise_level = ppflags->noise_level;
+
+    if (!oci->frame_to_show)
+        return -1;
+
+    if (q > 63)
+        q = 63;
+
+    if (!flags)
+    {
+        *dest = *oci->frame_to_show;
+
+        /* handle problem with extending borders */
+        dest->y_width = oci->Width;
+        dest->y_height = oci->Height;
+        dest->uv_height = dest->y_height / 2;
+        oci->postproc_state.last_base_qindex = oci->base_qindex;
+        oci->postproc_state.last_frame_valid = 1;
+        return 0;
+    }
+
+    /* Allocate post_proc_buffer_int if needed */
+    if ((flags & VP8D_MFQE) && !oci->post_proc_buffer_int_used)
+    {
+        if ((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK))
+        {
+            int width = (oci->Width + 15) & ~15;
+            int height = (oci->Height + 15) & ~15;
+
+            if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer_int,
+                                            width, height, VP8BORDERINPIXELS))
+                vpx_internal_error(&oci->error, VPX_CODEC_MEM_ERROR,
+                                   "Failed to allocate MFQE framebuffer");
+
+            oci->post_proc_buffer_int_used = 1;
+
+            /* insure that postproc is set to all 0's so that post proc
+             * doesn't pull random data in from edge
+             */
+            memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);
+
+        }
+    }
+
+    vp8_clear_system_state();
+
+    if ((flags & VP8D_MFQE) &&
+         oci->postproc_state.last_frame_valid &&
+         oci->current_video_frame >= 2 &&
+         oci->postproc_state.last_base_qindex < 60 &&
+         oci->base_qindex - oci->postproc_state.last_base_qindex >= 20)
+    {
+        vp8_multiframe_quality_enhance(oci);
+        if (((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK)) &&
+            oci->post_proc_buffer_int_used)
+        {
+            vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int);
+            if (flags & VP8D_DEMACROBLOCK)
+            {
+                vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
+                                               q + (deblock_level - 5) * 10, 1, 0);
+                vp8_de_mblock(&oci->post_proc_buffer,
+                              q + (deblock_level - 5) * 10);
+            }
+            else if (flags & VP8D_DEBLOCK)
+            {
+                vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
+                            q, 1, 0);
+            }
+        }
+        /* Move partially towards the base q of the previous frame */
+        oci->postproc_state.last_base_qindex = (3*oci->postproc_state.last_base_qindex + oci->base_qindex)>>2;
+    }
+    else if (flags & VP8D_DEMACROBLOCK)
+    {
+        vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
+                                     q + (deblock_level - 5) * 10, 1, 0);
+        vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10);
+
+        oci->postproc_state.last_base_qindex = oci->base_qindex;
+    }
+    else if (flags & VP8D_DEBLOCK)
+    {
+        vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
+                    q, 1, 0);
+        oci->postproc_state.last_base_qindex = oci->base_qindex;
+    }
+    else
+    {
+        vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
+        oci->postproc_state.last_base_qindex = oci->base_qindex;
+    }
+    oci->postproc_state.last_frame_valid = 1;
+
+    if (flags & VP8D_ADDNOISE)
+    {
+        if (oci->postproc_state.last_q != q
+            || oci->postproc_state.last_noise != noise_level)
+        {
+            fillrd(&oci->postproc_state, 63 - q, noise_level);
+        }
+
+        vp8_plane_add_noise
+        (oci->post_proc_buffer.y_buffer,
+         oci->postproc_state.noise,
+         oci->postproc_state.blackclamp,
+         oci->postproc_state.whiteclamp,
+         oci->postproc_state.bothclamp,
+         oci->post_proc_buffer.y_width, oci->post_proc_buffer.y_height,
+         oci->post_proc_buffer.y_stride);
+    }
+
+#if CONFIG_POSTPROC_VISUALIZER
+    if (flags & VP8D_DEBUG_TXT_FRAME_INFO)
+    {
+        char message[512];
+        sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
+                (oci->frame_type == KEY_FRAME),
+                oci->refresh_golden_frame,
+                oci->base_qindex,
+                oci->filter_level,
+                flags,
+                oci->mb_cols, oci->mb_rows);
+        vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
+    }
+
+    if (flags & VP8D_DEBUG_TXT_MBLK_MODES)
+    {
+        int i, j;
+        unsigned char *y_ptr;
+        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+        int mb_rows = post->y_height >> 4;
+        int mb_cols = post->y_width  >> 4;
+        int mb_index = 0;
+        MODE_INFO *mi = oci->mi;
+
+        y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+        /* vp8_filter each macro block */
+        for (i = 0; i < mb_rows; i++)
+        {
+            for (j = 0; j < mb_cols; j++)
+            {
+                char zz[4];
+
+                sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
+
+                vp8_blit_text(zz, y_ptr, post->y_stride);
+                mb_index ++;
+                y_ptr += 16;
+            }
+
+            mb_index ++; /* border */
+            y_ptr += post->y_stride  * 16 - post->y_width;
+
+        }
+    }
+
+    if (flags & VP8D_DEBUG_TXT_DC_DIFF)
+    {
+        int i, j;
+        unsigned char *y_ptr;
+        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+        int mb_rows = post->y_height >> 4;
+        int mb_cols = post->y_width  >> 4;
+        int mb_index = 0;
+        MODE_INFO *mi = oci->mi;
+
+        y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+        /* vp8_filter each macro block */
+        for (i = 0; i < mb_rows; i++)
+        {
+            for (j = 0; j < mb_cols; j++)
+            {
+                char zz[4];
+                int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
+                              mi[mb_index].mbmi.mode != SPLITMV &&
+                              mi[mb_index].mbmi.mb_skip_coeff);
+
+                if (oci->frame_type == KEY_FRAME)
+                    sprintf(zz, "a");
+                else
+                    sprintf(zz, "%c", dc_diff + '0');
+
+                vp8_blit_text(zz, y_ptr, post->y_stride);
+                mb_index ++;
+                y_ptr += 16;
+            }
+
+            mb_index ++; /* border */
+            y_ptr += post->y_stride  * 16 - post->y_width;
+
+        }
+    }
+
+    if (flags & VP8D_DEBUG_TXT_RATE_INFO)
+    {
+        char message[512];
+        sprintf(message, "Bitrate: %10.2f framerate: %10.2f ", oci->bitrate, oci->framerate);
+        vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
+    }
+
+    /* Draw motion vectors */
+    if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag)
+    {
+        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+        int width  = post->y_width;
+        int height = post->y_height;
+        unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
+        int y_stride = oci->post_proc_buffer.y_stride;
+        MODE_INFO *mi = oci->mi;
+        int x0, y0;
+
+        for (y0 = 0; y0 < height; y0 += 16)
+        {
+            for (x0 = 0; x0 < width; x0 += 16)
+            {
+                int x1, y1;
+
+                if (!(ppflags->display_mv_flag & (1<<mi->mbmi.mode)))
+                {
+                    mi++;
+                    continue;
+                }
+
+                if (mi->mbmi.mode == SPLITMV)
+                {
+                    switch (mi->mbmi.partitioning)
+                    {
+                        case 0 :    /* mv_top_bottom */
+                        {
+                            union b_mode_info *bmi = &mi->bmi[0];
+                            MV *mv = &bmi->mv.as_mv;
+
+                            x1 = x0 + 8 + (mv->col >> 3);
+                            y1 = y0 + 4 + (mv->row >> 3);
+
+                            constrain_line (x0+8, &x1, y0+4, &y1, width, height);
+                            vp8_blit_line  (x0+8,  x1, y0+4,  y1, y_buffer, y_stride);
+
+                            bmi = &mi->bmi[8];
+
+                            x1 = x0 + 8 + (mv->col >> 3);
+                            y1 = y0 +12 + (mv->row >> 3);
+
+                            constrain_line (x0+8, &x1, y0+12, &y1, width, height);
+                            vp8_blit_line  (x0+8,  x1, y0+12,  y1, y_buffer, y_stride);
+
+                            break;
+                        }
+                        case 1 :    /* mv_left_right */
+                        {
+                            union b_mode_info *bmi = &mi->bmi[0];
+                            MV *mv = &bmi->mv.as_mv;
+
+                            x1 = x0 + 4 + (mv->col >> 3);
+                            y1 = y0 + 8 + (mv->row >> 3);
+
+                            constrain_line (x0+4, &x1, y0+8, &y1, width, height);
+                            vp8_blit_line  (x0+4,  x1, y0+8,  y1, y_buffer, y_stride);
+
+                            bmi = &mi->bmi[2];
+
+                            x1 = x0 +12 + (mv->col >> 3);
+                            y1 = y0 + 8 + (mv->row >> 3);
+
+                            constrain_line (x0+12, &x1, y0+8, &y1, width, height);
+                            vp8_blit_line  (x0+12,  x1, y0+8,  y1, y_buffer, y_stride);
+
+                            break;
+                        }
+                        case 2 :    /* mv_quarters   */
+                        {
+                            union b_mode_info *bmi = &mi->bmi[0];
+                            MV *mv = &bmi->mv.as_mv;
+
+                            x1 = x0 + 4 + (mv->col >> 3);
+                            y1 = y0 + 4 + (mv->row >> 3);
+
+                            constrain_line (x0+4, &x1, y0+4, &y1, width, height);
+                            vp8_blit_line  (x0+4,  x1, y0+4,  y1, y_buffer, y_stride);
+
+                            bmi = &mi->bmi[2];
+
+                            x1 = x0 +12 + (mv->col >> 3);
+                            y1 = y0 + 4 + (mv->row >> 3);
+
+                            constrain_line (x0+12, &x1, y0+4, &y1, width, height);
+                            vp8_blit_line  (x0+12,  x1, y0+4,  y1, y_buffer, y_stride);
+
+                            bmi = &mi->bmi[8];
+
+                            x1 = x0 + 4 + (mv->col >> 3);
+                            y1 = y0 +12 + (mv->row >> 3);
+
+                            constrain_line (x0+4, &x1, y0+12, &y1, width, height);
+                            vp8_blit_line  (x0+4,  x1, y0+12,  y1, y_buffer, y_stride);
+
+                            bmi = &mi->bmi[10];
+
+                            x1 = x0 +12 + (mv->col >> 3);
+                            y1 = y0 +12 + (mv->row >> 3);
+
+                            constrain_line (x0+12, &x1, y0+12, &y1, width, height);
+                            vp8_blit_line  (x0+12,  x1, y0+12,  y1, y_buffer, y_stride);
+                            break;
+                        }
+                        default :
+                        {
+                            union b_mode_info *bmi = mi->bmi;
+                            int bx0, by0;
+
+                            for (by0 = y0; by0 < (y0+16); by0 += 4)
+                            {
+                                for (bx0 = x0; bx0 < (x0+16); bx0 += 4)
+                                {
+                                    MV *mv = &bmi->mv.as_mv;
+
+                                    x1 = bx0 + 2 + (mv->col >> 3);
+                                    y1 = by0 + 2 + (mv->row >> 3);
+
+                                    constrain_line (bx0+2, &x1, by0+2, &y1, width, height);
+                                    vp8_blit_line  (bx0+2,  x1, by0+2,  y1, y_buffer, y_stride);
+
+                                    bmi++;
+                                }
+                            }
+                        }
+                    }
+                }
+                else if (mi->mbmi.mode >= NEARESTMV)
+                {
+                    MV *mv = &mi->mbmi.mv.as_mv;
+                    const int lx0 = x0 + 8;
+                    const int ly0 = y0 + 8;
+
+                    x1 = lx0 + (mv->col >> 3);
+                    y1 = ly0 + (mv->row >> 3);
+
+                    if (x1 != lx0 && y1 != ly0)
+                    {
+                        constrain_line (lx0, &x1, ly0-1, &y1, width, height);
+                        vp8_blit_line  (lx0,  x1, ly0-1,  y1, y_buffer, y_stride);
+
+                        constrain_line (lx0, &x1, ly0+1, &y1, width, height);
+                        vp8_blit_line  (lx0,  x1, ly0+1,  y1, y_buffer, y_stride);
+                    }
+                    else
+                        vp8_blit_line  (lx0,  x1, ly0,  y1, y_buffer, y_stride);
+                }
+
+                mi++;
+            }
+            mi++;
+        }
+    }
+
+    /* Color in block modes */
+    if ((flags & VP8D_DEBUG_CLR_BLK_MODES)
+        && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag))
+    {
+        int y, x;
+        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+        int width  = post->y_width;
+        int height = post->y_height;
+        unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+        unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+        unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+        int y_stride = oci->post_proc_buffer.y_stride;
+        MODE_INFO *mi = oci->mi;
+
+        for (y = 0; y < height; y += 16)
+        {
+            for (x = 0; x < width; x += 16)
+            {
+                int Y = 0, U = 0, V = 0;
+
+                if (mi->mbmi.mode == B_PRED &&
+                    ((ppflags->display_mb_modes_flag & B_PRED) || ppflags->display_b_modes_flag))
+                {
+                    int by, bx;
+                    unsigned char *yl, *ul, *vl;
+                    union b_mode_info *bmi = mi->bmi;
+
+                    yl = y_ptr + x;
+                    ul = u_ptr + (x>>1);
+                    vl = v_ptr + (x>>1);
+
+                    for (by = 0; by < 16; by += 4)
+                    {
+                        for (bx = 0; bx < 16; bx += 4)
+                        {
+                            if ((ppflags->display_b_modes_flag & (1<<mi->mbmi.mode))
+                                || (ppflags->display_mb_modes_flag & B_PRED))
+                            {
+                                Y = B_PREDICTION_MODE_colors[bmi->as_mode][0];
+                                U = B_PREDICTION_MODE_colors[bmi->as_mode][1];
+                                V = B_PREDICTION_MODE_colors[bmi->as_mode][2];
+
+                                vp8_blend_b
+                                    (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);
+                            }
+                            bmi++;
+                        }
+
+                        yl += y_stride*4;
+                        ul += y_stride*1;
+                        vl += y_stride*1;
+                    }
+                }
+                else if (ppflags->display_mb_modes_flag & (1<<mi->mbmi.mode))
+                {
+                    Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
+                    U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
+                    V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
+
+                    vp8_blend_mb_inner
+                        (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+                }
+
+                mi++;
+            }
+            y_ptr += y_stride*16;
+            u_ptr += y_stride*4;
+            v_ptr += y_stride*4;
+
+            mi++;
+        }
+    }
+
+    /* Color in frame reference blocks */
+    if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) && ppflags->display_ref_frame_flag)
+    {
+        int y, x;
+        YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+        int width  = post->y_width;
+        int height = post->y_height;
+        unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+        unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+        unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+        int y_stride = oci->post_proc_buffer.y_stride;
+        MODE_INFO *mi = oci->mi;
+
+        for (y = 0; y < height; y += 16)
+        {
+            for (x = 0; x < width; x +=16)
+            {
+                int Y = 0, U = 0, V = 0;
+
+                if (ppflags->display_ref_frame_flag & (1<<mi->mbmi.ref_frame))
+                {
+                    Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
+                    U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
+                    V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
+
+                    vp8_blend_mb_outer
+                        (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+                }
+
+                mi++;
+            }
+            y_ptr += y_stride*16;
+            u_ptr += y_stride*4;
+            v_ptr += y_stride*4;
+
+            mi++;
+        }
+    }
+#endif
+
+    *dest = oci->post_proc_buffer;
+
+    /* handle problem with extending borders */
+    dest->y_width = oci->Width;
+    dest->y_height = oci->Height;
+    dest->uv_height = dest->y_height / 2;
+    return 0;
+}
+#endif
diff --git a/libs/libvpx/vp8/common/postproc.h b/libs/libvpx/vp8/common/postproc.h
new file mode 100644
index 0000000000..0fa12a7c67
--- /dev/null
+++ b/libs/libvpx/vp8/common/postproc.h
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_POSTPROC_H_
+#define VP8_COMMON_POSTPROC_H_
+
+#include "vpx_ports/mem.h"
+struct postproc_state  /* Post-processing bookkeeping; NOTE(review): the code using these fields is not in this header - confirm roles in postproc.c */
+{
+    int           last_q;            /* NOTE(review): "last_*" names suggest values remembered from an earlier call - confirm */
+    int           last_noise;
+    char          noise[3072];       /* fixed-size 3072-byte buffer */
+    int           last_base_qindex;
+    int           last_frame_valid;
+    DECLARE_ALIGNED(16, char, blackclamp[16]);  /* three 16-entry char arrays declared through DECLARE_ALIGNED(16, ...) from vpx_ports/mem.h */
+    DECLARE_ALIGNED(16, char, whiteclamp[16]);
+    DECLARE_ALIGNED(16, char, bothclamp[16]);
+};
+#include "onyxc_int.h"
+#include "ppflags.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
+                        vp8_ppflags_t *flags);  /* at its normal exit: *dest = oci->post_proc_buffer with y_width/y_height reset to oci->Width/oci->Height and uv_height = y_height / 2; returns 0 */
+
+/* NOTE(review): the bodies of vp8_de_noise/vp8_deblock are not visible from this header; parameter meanings are unverified - confirm in postproc.c. */
+void vp8_de_noise(struct VP8Common           *oci,
+                  YV12_BUFFER_CONFIG         *source,
+                  YV12_BUFFER_CONFIG         *post,
+                  int                         q,
+                  int                         low_var_thresh,
+                  int                         flag,
+                  int                         uvfilter);
+
+void vp8_deblock(struct VP8Common           *oci,
+                 YV12_BUFFER_CONFIG         *source,
+                 YV12_BUFFER_CONFIG         *post,
+                 int                         q,
+                 int                         low_var_thresh,
+                 int                         flag);
+
+#define MFQE_PRECISION 4
+
+void vp8_multiframe_quality_enhance(struct VP8Common *cm);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_POSTPROC_H_
diff --git a/libs/libvpx/vp8/common/ppflags.h b/libs/libvpx/vp8/common/ppflags.h
new file mode 100644
index 0000000000..768224aad5
--- /dev/null
+++ b/libs/libvpx/vp8/common/ppflags.h
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_PPFLAGS_H_
+#define VP8_COMMON_PPFLAGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+enum  /* Post-processing / debug-overlay selectors: VP8D_NOFILTERING is 0, every other entry is a distinct single bit. */
+{
+    VP8D_NOFILTERING            = 0,
+    VP8D_DEBLOCK                = 1<<0,
+    VP8D_DEMACROBLOCK           = 1<<1,
+    VP8D_ADDNOISE               = 1<<2,
+    VP8D_DEBUG_TXT_FRAME_INFO   = 1<<3,
+    VP8D_DEBUG_TXT_MBLK_MODES   = 1<<4,
+    VP8D_DEBUG_TXT_DC_DIFF      = 1<<5,
+    VP8D_DEBUG_TXT_RATE_INFO    = 1<<6,   /* blit "Bitrate: ... framerate: ..." text onto the post-proc buffer */
+    VP8D_DEBUG_DRAW_MV          = 1<<7,   /* draw motion vectors; also requires a non-zero display_mv_flag */
+    VP8D_DEBUG_CLR_BLK_MODES    = 1<<8,   /* colour in block modes; also requires display_mb_modes_flag or display_b_modes_flag */
+    VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9,   /* colour in frame reference blocks; also requires display_ref_frame_flag */
+    VP8D_MFQE                   = 1<<10   /* NOTE(review): presumably enables vp8_multiframe_quality_enhance - confirm */
+};
+
+typedef struct  /* Option block passed by pointer to vp8_post_proc_frame(). */
+{
+    int post_proc_flag;          /* NOTE(review): presumably an OR of the VP8D_* values declared above - confirm */
+    int deblocking_level;
+    int noise_level;
+    int display_ref_frame_flag;  /* bit mask, tested against (1 << ref_frame) of each macroblock */
+    int display_mb_modes_flag;   /* bit mask, tested against (1 << mode) of each macroblock */
+    int display_b_modes_flag;    /* non-zero enables per-sub-block colouring of B_PRED macroblocks */
+    int display_mv_flag;         /* bit mask, tested against (1 << mode) before a macroblock's vectors are drawn */
+} vp8_ppflags_t;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_PPFLAGS_H_
diff --git a/libs/libvpx/vp8/common/quant_common.c b/libs/libvpx/vp8/common/quant_common.c
new file mode 100644
index 0000000000..05f9210702
--- /dev/null
+++ b/libs/libvpx/vp8/common/quant_common.c
@@ -0,0 +1,135 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "quant_common.h"
+
+static const int dc_qlookup[QINDEX_RANGE] =  /* DC lookup table; read by vp8_dc_quant, vp8_dc2quant and vp8_dc_uv_quant with an index already clamped to 0..127 */
+{
+    4,    5,    6,    7,    8,    9,   10,   10,   11,   12,   13,   14,   15,   16,   17,   17,
+    18,   19,   20,   20,   21,   21,   22,   22,   23,   23,   24,   25,   25,   26,   27,   28,
+    29,   30,   31,   32,   33,   34,   35,   36,   37,   37,   38,   39,   40,   41,   42,   43,
+    44,   45,   46,   46,   47,   48,   49,   50,   51,   52,   53,   54,   55,   56,   57,   58,
+    59,   60,   61,   62,   63,   64,   65,   66,   67,   68,   69,   70,   71,   72,   73,   74,
+    75,   76,   76,   77,   78,   79,   80,   81,   82,   83,   84,   85,   86,   87,   88,   89,
+    91,   93,   95,   96,   98,  100,  101,  102,  104,  106,  108,  110,  112,  114,  116,  118,
+    122,  124,  126,  128,  130,  132,  134,  136,  138,  140,  143,  145,  148,  151,  154,  157,
+};
+
+static const int ac_qlookup[QINDEX_RANGE] =  /* AC lookup table; read by vp8_ac_yquant, vp8_ac2quant and vp8_ac_uv_quant with an index already clamped to 0..127 */
+{
+    4,    5,    6,    7,    8,    9,   10,   11,   12,   13,   14,   15,   16,   17,   18,   19,
+    20,   21,   22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,   33,   34,   35,
+    36,   37,   38,   39,   40,   41,   42,   43,   44,   45,   46,   47,   48,   49,   50,   51,
+    52,   53,   54,   55,   56,   57,   58,   60,   62,   64,   66,   68,   70,   72,   74,   76,
+    78,   80,   82,   84,   86,   88,   90,   92,   94,   96,   98,  100,  102,  104,  106,  108,
+    110,  112,  114,  116,  119,  122,  125,  128,  131,  134,  137,  140,  143,  146,  149,  152,
+    155,  158,  161,  164,  167,  170,  173,  177,  181,  185,  189,  193,  197,  201,  205,  209,
+    213,  217,  221,  225,  229,  234,  239,  245,  249,  254,  259,  264,  269,  274,  279,  284,
+};
+
+
+/* Return the dc_qlookup entry for QIndex + Delta, with the summed index
+ * clamped to the 0..127 range. */
+int vp8_dc_quant(int QIndex, int Delta)
+{
+    int idx = QIndex + Delta;
+
+    /* Keep the index inside the lookup table. */
+    if (idx < 0)
+        idx = 0;
+    else if (idx > 127)
+        idx = 127;
+
+    return dc_qlookup[idx];
+}
+
+/*
+ * Return twice the dc_qlookup entry for QIndex + Delta, with the summed
+ * index clamped to the 0..127 range.
+ */
+int vp8_dc2quant(int QIndex, int Delta)
+{
+    int idx = QIndex + Delta;
+
+    if (idx < 0)
+        idx = 0;
+    else if (idx > 127)
+        idx = 127;
+
+    return dc_qlookup[idx] * 2;
+}
+/* Return the dc_qlookup entry for QIndex + Delta (summed index clamped
+ * to 0..127), capped at 132. */
+int vp8_dc_uv_quant(int QIndex, int Delta)
+{
+    int idx = QIndex + Delta;
+    int q;
+
+    /* Keep the index inside the lookup table. */
+    if (idx < 0)
+        idx = 0;
+    else if (idx > 127)
+        idx = 127;
+
+    q = dc_qlookup[idx];
+
+    /* The result is never allowed to exceed 132. */
+    return (q > 132) ? 132 : q;
+}
+
+/* Return the ac_qlookup entry for QIndex, clamped to the 0..127 range. */
+int vp8_ac_yquant(int QIndex)
+{
+    int idx = QIndex;
+
+    if (idx < 0)
+        idx = 0;
+    else if (idx > 127)
+        idx = 127;
+
+    return ac_qlookup[idx];
+}
+
+/* Return the ac_qlookup entry for QIndex + Delta (summed index clamped
+ * to 0..127) scaled by 155/100, with a floor of 8. */
+int vp8_ac2quant(int QIndex, int Delta)
+{
+    int idx = QIndex + Delta;
+    int scaled;
+
+    /* Keep the index inside the lookup table. */
+    if (idx < 0)
+        idx = 0;
+    else if (idx > 127)
+        idx = 127;
+
+    /* Fixed-point form of x*155/100: for every x in [0..284] the value
+     * (x*101581) >> 16 is bitwise equal to it. '(x*6349) >> 12' is the
+     * smallest precision that works, but 16 is a good word size. */
+    scaled = (ac_qlookup[idx] * 101581) >> 16;
+
+    /* Never return less than 8. */
+    return (scaled < 8) ? 8 : scaled;
+}
+/* Return the ac_qlookup entry for QIndex + Delta, with the summed index
+ * clamped to the 0..127 range. */
+int vp8_ac_uv_quant(int QIndex, int Delta)
+{
+    int idx = QIndex + Delta;
+
+    /* Keep the index inside the lookup table. */
+    if (idx < 0)
+        idx = 0;
+    else if (idx > 127)
+        idx = 127;
+
+    return ac_qlookup[idx];
+}
diff --git a/libs/libvpx/vp8/common/quant_common.h b/libs/libvpx/vp8/common/quant_common.h
new file mode 100644
index 0000000000..700b5e6d72
--- /dev/null
+++ b/libs/libvpx/vp8/common/quant_common.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_QUANT_COMMON_H_
+#define VP8_COMMON_QUANT_COMMON_H_
+
+
+#include "string.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int vp8_ac_yquant(int QIndex);               /* AC table entry for QIndex, index clamped to 0..127 */
+extern int vp8_dc_quant(int QIndex, int Delta);     /* DC table entry for QIndex + Delta, index clamped to 0..127 */
+extern int vp8_dc2quant(int QIndex, int Delta);     /* twice the DC table entry for the clamped QIndex + Delta */
+extern int vp8_ac2quant(int QIndex, int Delta);     /* AC table entry for the clamped QIndex + Delta, scaled by 155/100, never below 8 */
+extern int vp8_dc_uv_quant(int QIndex, int Delta);  /* DC table entry for the clamped QIndex + Delta, capped at 132 */
+extern int vp8_ac_uv_quant(int QIndex, int Delta);  /* AC table entry for QIndex + Delta, index clamped to 0..127 */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_QUANT_COMMON_H_
diff --git a/libs/libvpx/vp8/common/reconinter.c b/libs/libvpx/vp8/common/reconinter.c
new file mode 100644
index 0000000000..e302595587
--- /dev/null
+++ b/libs/libvpx/vp8/common/reconinter.c
@@ -0,0 +1,544 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include <string.h>
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "blockd.h"
+#include "reconinter.h"
+#if CONFIG_RUNTIME_CPU_DETECT
+#include "onyxc_int.h"
+#endif
+
+/*
+ * Copy a 16x16 block of bytes from src to dst, one memcpy per row.
+ * src_stride / dst_stride are the byte distances between the starts of
+ * consecutive rows in the source and the destination.
+ */
+void vp8_copy_mem16x16_c(
+    unsigned char *src,
+    int src_stride,
+    unsigned char *dst,
+    int dst_stride)
+{
+    const unsigned char *from = src;
+    unsigned char *to = dst;
+    int rows_left;
+
+    /* Walk both buffers downwards, each by its own stride. */
+    for (rows_left = 16; rows_left > 0; rows_left--, from += src_stride, to += dst_stride)
+        memcpy(to, from, 16);
+}
+
+/* Copy an 8x8 block of bytes from src to dst, one memcpy per row; each
+ * pointer advances by its own stride after every row. */
+void vp8_copy_mem8x8_c(
+    unsigned char *src,
+    int src_stride,
+    unsigned char *dst,
+    int dst_stride)
+{
+    int rows_left = 8;
+
+    while (rows_left-- > 0)
+    {
+        memcpy(dst, src, 8);
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+/* Copy an 8-byte-wide, 4-row block from src to dst, one memcpy per row;
+ * each pointer advances by its own stride after every row. */
+void vp8_copy_mem8x4_c(
+    unsigned char *src,
+    int src_stride,
+    unsigned char *dst,
+    int dst_stride)
+{
+    int row;
+
+    for (row = 0; row != 4; ++row)
+    {
+        memcpy(dst, src, 8);
+        /* Step both pointers down to the next row. */
+        dst += dst_stride;
+        src += src_stride;
+    }
+}
+
+
+/*
+ * Build the 4x4 prediction for block d into d->predictor (row pitch `pitch`).
+ * The reference position is base_pre + d->offset moved by the whole-pixel
+ * part of the block's motion vector (mv >> 3); a non-zero fractional part
+ * (mv & 7) is handed to the sub-pixel filter sppf, otherwise pixels are copied.
+ */
+void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
+{
+    const int mv_row = d->bmi.mv.as_mv.row;
+    const int mv_col = d->bmi.mv.as_mv.col;
+    unsigned char *out = d->predictor;
+    unsigned char *ref = base_pre + d->offset + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+    int row, col;
+
+    if ((mv_row & 7) || (mv_col & 7))
+    {
+        sppf(ref, pre_stride, mv_col & 7, mv_row & 7, out, pitch);
+        return;
+    }
+
+    for (row = 0; row < 4; row++, out += pitch, ref += pre_stride)
+        for (col = 0; col < 4; col++)
+            out[col] = ref[col];
+}
+
+/* Predict an 8x8 area into dst from the reference at base_pre + d->offset,
+ * displaced by d's motion vector: sub-pixel filter when the vector has a
+ * fractional part (mv & 7), plain 8x8 copy otherwise. */
+static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride)
+{
+    const int mv_row = d->bmi.mv.as_mv.row;
+    const int mv_col = d->bmi.mv.as_mv.col;
+    unsigned char *ref = base_pre + d->offset + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+    if ((mv_row & 7) || (mv_col & 7))
+        x->subpixel_predict8x8(ref, pre_stride, mv_col & 7, mv_row & 7, dst, dst_stride);
+    else
+        vp8_copy_mem8x8(ref, pre_stride, dst, dst_stride);
+}
+
+/* Predict an 8x4 area into dst from the reference at base_pre + d->offset,
+ * displaced by d's motion vector: sub-pixel filter when the vector has a
+ * fractional part (mv & 7), plain 8x4 copy otherwise. */
+static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride)
+{
+    const int mv_row = d->bmi.mv.as_mv.row;
+    const int mv_col = d->bmi.mv.as_mv.col;
+    unsigned char *ref = base_pre + d->offset + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+    if ((mv_row & 7) || (mv_col & 7))
+        x->subpixel_predict8x4(ref, pre_stride, mv_col & 7, mv_row & 7, dst, dst_stride);
+    else
+        vp8_copy_mem8x4(ref, pre_stride, dst, dst_stride);
+}
+
+/*
+ * Build a 4x4 prediction directly into dst (row stride dst_stride).  The
+ * reference is base_pre + d->offset moved by the whole-pixel part of d's
+ * motion vector; a fractional part (mv & 7) selects the sub-pixel filter sppf.
+ */
+static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
+{
+    const int mv_row = d->bmi.mv.as_mv.row;
+    const int mv_col = d->bmi.mv.as_mv.col;
+    unsigned char *ref = base_pre + d->offset + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+    int row, col;
+
+    if ((mv_row & 7) || (mv_col & 7))
+    {
+        sppf(ref, pre_stride, mv_col & 7, mv_row & 7, dst, dst_stride);
+        return;
+    }
+
+    /* Whole-pixel vector: straight 4x4 copy, row by row. */
+    for (row = 0; row < 4; row++, dst += dst_stride, ref += pre_stride)
+        for (col = 0; col < 4; col++)
+            dst[col] = ref[col];
+}
+
+
+/* Encoder only: build the 8x8 U and V predictions of the macroblock into x->predictor[256] and x->predictor[320] (pitch 8), using the macroblock MV halved for chroma. */
+void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x)
+{
+    unsigned char *uptr, *vptr;
+    unsigned char *upred_ptr = &x->predictor[256];
+    unsigned char *vpred_ptr = &x->predictor[320];
+
+    int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+    int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+    int offset;
+    int pre_stride = x->pre.uv_stride;
+
+    /* calc uv motion vectors: add +1 or -1 by sign (1 | sign mask; relies on >> of a negative int being arithmetic), then halve, so halves round away from zero */
+    mv_row += 1 | (mv_row >> (sizeof(int) * CHAR_BIT - 1));
+    mv_col += 1 | (mv_col >> (sizeof(int) * CHAR_BIT - 1));
+    mv_row /= 2;
+    mv_col /= 2;
+    mv_row &= x->fullpixel_mask;  /* NOTE(review): presumably clears the fractional bits in full-pixel mode - confirm where fullpixel_mask is set */
+    mv_col &= x->fullpixel_mask;
+
+    offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+    uptr = x->pre.u_buffer + offset;
+    vptr = x->pre.v_buffer + offset;
+
+    if ((mv_row | mv_col) & 7)  /* any of the low 3 (fractional) bits set in either component */
+    {
+        x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
+        x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
+    }
+    else
+    {
+        vp8_copy_mem8x8(uptr, pre_stride, upred_ptr, 8);
+        vp8_copy_mem8x8(vptr, pre_stride, vpred_ptr, 8);
+    }
+}
+
+/* Encoder only: derive one MV per chroma block (16..19 for U, 20..23 for V) from the luma block MVs, then build the U and V predictions into each block's predictor buffer (pitch 8). */
+void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
+{
+    int i, j;
+    int pre_stride = x->pre.uv_stride;
+    unsigned char *base_pre;
+
+    /* build uv mvs: each chroma MV is the sum of four luma block MVs (yoffset, +1, +4, +5) divided by 8 with rounding */
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            int yoffset = i * 8 + j * 2;
+            int uoffset = 16 + i * 2 + j;
+            int voffset = 20 + i * 2 + j;
+
+            int temp;
+
+            temp = x->block[yoffset  ].bmi.mv.as_mv.row
+                   + x->block[yoffset+1].bmi.mv.as_mv.row
+                   + x->block[yoffset+4].bmi.mv.as_mv.row
+                   + x->block[yoffset+5].bmi.mv.as_mv.row;
+
+            /* rounding bias: +4 for a non-negative sum, 4 - 8 = -4 for a negative one (sign mask * 8; relies on arithmetic >>) */
+            temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+            x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
+
+            temp = x->block[yoffset  ].bmi.mv.as_mv.col
+                   + x->block[yoffset+1].bmi.mv.as_mv.col
+                   + x->block[yoffset+4].bmi.mv.as_mv.col
+                   + x->block[yoffset+5].bmi.mv.as_mv.col;
+
+            temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+            x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
+
+            /* the V block reuses the vector just computed for the matching U block */
+            x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
+        }
+    }
+
+    base_pre = x->pre.u_buffer;
+    for (i = 16; i < 20; i += 2)  /* U blocks, taken in pairs */
+    {
+        BLOCKD *d0 = &x->block[i];
+        BLOCKD *d1 = &x->block[i+1];
+
+        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)  /* same MV: one 8x4 prediction covers both blocks */
+            build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
+        else
+        {
+            vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
+            vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
+        }
+    }
+
+    base_pre = x->pre.v_buffer;
+    for (i = 20; i < 24; i += 2)  /* V blocks, same scheme as U */
+    {
+        BLOCKD *d0 = &x->block[i];
+        BLOCKD *d1 = &x->block[i+1];
+
+        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+            build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
+        else
+        {
+            vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
+            vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
+        }
+    }
+}
+
+
+/*
+ * Encoder only.  Build the 16x16 luma prediction for the macroblock's
+ * motion vector into dst_y (row stride dst_ystride), reading from
+ * x->pre.y_buffer.  A fractional vector (low 3 bits set) goes through the
+ * 16x16 sub-pixel filter; a whole-pixel vector is a plain copy.
+ */
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+                                         unsigned char *dst_y,
+                                         int dst_ystride)
+{
+    const int stride = x->pre.y_stride;
+    const int row = x->mode_info_context->mbmi.mv.as_mv.row;
+    const int col = x->mode_info_context->mbmi.mv.as_mv.col;
+    const int frac_row = row & 7;
+    const int frac_col = col & 7;
+    unsigned char *ref = x->pre.y_buffer + (row >> 3) * stride + (col >> 3);
+
+    if (frac_row == 0 && frac_col == 0)
+    {
+        vp8_copy_mem16x16(ref, stride, dst_y, dst_ystride);
+        return;
+    }
+
+    x->subpixel_predict16x16(ref, stride, frac_col, frac_row, dst_y, dst_ystride);
+}
+
+/*
+ * Clamp a luma motion vector that reaches too far into the UMV border.
+ * Once an MV points so deep into the border that no visible pixels take
+ * part in reconstruction, its sub-pixel part can be dropped and the MV
+ * limited to 16 pixels with equivalent results.  That happens past 19
+ * pixels on the top/left edges (16 plus the 3 filter taps right of the
+ * central pixel) and past 18 pixels on the bottom/right edges (16 plus
+ * the 2 taps left of the central pixel).
+ */
+static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
+{
+    /* Columns: left edge is tested first, then the right edge. */
+    mv->col = (mv->col < xd->mb_to_left_edge - (19 << 3))  ? xd->mb_to_left_edge - (16 << 3)
+            : (mv->col > xd->mb_to_right_edge + (18 << 3)) ? xd->mb_to_right_edge + (16 << 3)
+            : mv->col;
+
+    /* Rows: top edge is tested first, then the bottom edge. */
+    mv->row = (mv->row < xd->mb_to_top_edge - (19 << 3))    ? xd->mb_to_top_edge - (16 << 3)
+            : (mv->row > xd->mb_to_bottom_edge + (18 << 3)) ? xd->mb_to_bottom_edge + (16 << 3)
+            : mv->row;
+}
+
+/* Chroma counterpart of clamp_mv_to_umv_border(): the vector is doubled for the threshold tests and the luma limit is halved (>> 1) when it is applied. */
+static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
+{
+    if (2*mv->col < (xd->mb_to_left_edge - (19 << 3)))
+        mv->col = (xd->mb_to_left_edge - (16 << 3)) >> 1;
+    if (2*mv->col > xd->mb_to_right_edge + (18 << 3))
+        mv->col = (xd->mb_to_right_edge + (16 << 3)) >> 1;
+
+    if (2*mv->row < (xd->mb_to_top_edge - (19 << 3)))
+        mv->row = (xd->mb_to_top_edge - (16 << 3)) >> 1;
+    if (2*mv->row > xd->mb_to_bottom_edge + (18 << 3))
+        mv->row = (xd->mb_to_bottom_edge + (16 << 3)) >> 1;
+}
+
+void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+                                        unsigned char *dst_y,
+                                        unsigned char *dst_u,
+                                        unsigned char *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride)  /* builds the 16x16 Y and the 8x8 U and V predictions from x->pre into dst_y / dst_u / dst_v using the macroblock's MV */
+{
+    int offset;
+    unsigned char *ptr;
+    unsigned char *uptr, *vptr;
+
+    int_mv _16x16mv;
+
+    unsigned char *ptr_base = x->pre.y_buffer;
+    int pre_stride = x->pre.y_stride;
+
+    _16x16mv.as_int = x->mode_info_context->mbmi.mv.as_int;  /* local copy: clamping and chroma scaling below never modify the stored MV */
+
+    if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+    {
+        clamp_mv_to_umv_border(&_16x16mv.as_mv, x);
+    }
+
+    ptr = ptr_base + ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+
+    if ( _16x16mv.as_int & 0x00070007)  /* NOTE(review): tests the low 3 bits of both 16-bit halves at once; assumes as_int overlays the row/col pair - confirm against the int_mv definition */
+    {
+        x->subpixel_predict16x16(ptr, pre_stride, _16x16mv.as_mv.col & 7,  _16x16mv.as_mv.row & 7, dst_y, dst_ystride);
+    }
+    else
+    {
+        vp8_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+    }
+
+    /* calc uv motion vectors: add +1 or -1 by sign (1 | sign mask), halve so halves round away from zero, then apply fullpixel_mask */
+    _16x16mv.as_mv.row += 1 | (_16x16mv.as_mv.row >> (sizeof(int) * CHAR_BIT - 1));
+    _16x16mv.as_mv.col += 1 | (_16x16mv.as_mv.col >> (sizeof(int) * CHAR_BIT - 1));
+    _16x16mv.as_mv.row /= 2;
+    _16x16mv.as_mv.col /= 2;
+    _16x16mv.as_mv.row &= x->fullpixel_mask;
+    _16x16mv.as_mv.col &= x->fullpixel_mask;
+
+    pre_stride >>= 1;  /* chroma stride is taken as half the luma stride rather than read from x->pre.uv_stride */
+    offset = ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+    uptr = x->pre.u_buffer + offset;
+    vptr = x->pre.v_buffer + offset;
+
+    if ( _16x16mv.as_int & 0x00070007)  /* same fractional-bit test, now on the chroma vector */
+    {
+        x->subpixel_predict8x8(uptr, pre_stride, _16x16mv.as_mv.col & 7,  _16x16mv.as_mv.row & 7, dst_u, dst_uvstride);
+        x->subpixel_predict8x8(vptr, pre_stride, _16x16mv.as_mv.col & 7,  _16x16mv.as_mv.row & 7, dst_v, dst_uvstride);
+    }
+    else
+    {
+        vp8_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+        vp8_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
+    }
+}
+
+static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
+{
+    int i;
+    unsigned char *base_dst = x->dst.y_buffer;
+    unsigned char *base_pre = x->pre.y_buffer;
+    /* Predict a macroblock from per-block MVs: luma first, then U (blocks 16-19) and V (blocks 20-23). */
+    if (x->mode_info_context->mbmi.partitioning < 3)
+    {
+        BLOCKD *b;
+        int dst_stride = x->dst.y_stride;
+        /* Only blocks 0, 2, 8 and 10 are read in this branch; copy their bmi from mode_info_context. */
+        x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
+        x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
+        x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
+        x->block[10].bmi = x->mode_info_context->bmi[10];
+        if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+        {
+            clamp_mv_to_umv_border(&x->block[ 0].bmi.mv.as_mv, x);
+            clamp_mv_to_umv_border(&x->block[ 2].bmi.mv.as_mv, x);
+            clamp_mv_to_umv_border(&x->block[ 8].bmi.mv.as_mv, x);
+            clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x);
+        }
+        /* NOTE(review): dst_stride is passed twice, the second time after base_pre - presumably as the reference stride; confirm. */
+        b = &x->block[ 0];
+        build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+        b = &x->block[ 2];
+        build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+        b = &x->block[ 8];
+        build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+        b = &x->block[10];
+        build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+    }
+    else
+    {
+        for (i = 0; i < 16; i += 2)
+        {
+            BLOCKD *d0 = &x->block[i];
+            BLOCKD *d1 = &x->block[i+1];
+            int dst_stride = x->dst.y_stride;
+            /* Refresh this pair's bmi from mode_info_context; clamp when mbmi.need_to_clamp_mvs is set. */
+            x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
+            x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
+            if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+            {
+                clamp_mv_to_umv_border(&x->block[i+0].bmi.mv.as_mv, x);
+                clamp_mv_to_umv_border(&x->block[i+1].bmi.mv.as_mv, x);
+            }
+            /* A pair with identical MVs goes through one build_inter_predictors2b call instead of two single-block calls. */
+            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+                build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+            else
+            {
+                build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+                build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+            }
+
+        }
+
+    }
+    base_dst = x->dst.u_buffer;
+    base_pre = x->pre.u_buffer;
+    for (i = 16; i < 20; i += 2)
+    {
+        BLOCKD *d0 = &x->block[i];
+        BLOCKD *d1 = &x->block[i+1];
+        int dst_stride = x->dst.uv_stride;
+
+        /* Note: uv mvs already clamped in build_4x4uvmvs() */
+
+        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+            build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+        else
+        {
+            build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+            build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+        }
+    }
+
+    base_dst = x->dst.v_buffer;
+    base_pre = x->pre.v_buffer;
+    for (i = 20; i < 24; i += 2)
+    {
+        BLOCKD *d0 = &x->block[i];
+        BLOCKD *d1 = &x->block[i+1];
+        int dst_stride = x->dst.uv_stride;
+
+        /* Note: uv mvs already clamped in build_4x4uvmvs() */
+
+        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+            build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+        else
+        {
+            build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+            build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+        }
+    }
+}
+
+/* Derive the U/V block motion vectors that SPLITMV prediction reads from x->block[16..23]. */
+static void build_4x4uvmvs(MACROBLOCKD *x)
+{
+    int quad;
+
+    /* quad = 2 * i + j visits the 2x2 grid of chroma blocks in the same order as nested i/j loops. */
+    for (quad = 0; quad < 4; quad++)
+    {
+        /* First of the four luma blocks (+0, +1, +4, +5) feeding this chroma block. */
+        const int luma = (quad >> 1) * 8 + (quad & 1) * 2;
+        BLOCKD *u_blk = &x->block[16 + quad];
+        BLOCKD *v_blk = &x->block[20 + quad];
+        int sum;
+
+        /* Row component: add up the four luma rows. */
+        sum  = x->mode_info_context->bmi[luma + 0].mv.as_mv.row;
+        sum += x->mode_info_context->bmi[luma + 1].mv.as_mv.row;
+        sum += x->mode_info_context->bmi[luma + 4].mv.as_mv.row;
+        sum += x->mode_info_context->bmi[luma + 5].mv.as_mv.row;
+
+        /* Adds 4, less 8 when the shift of a negative sum yields -1, before dividing by 8. */
+        sum += 4 + ((sum >> (sizeof(sum) * CHAR_BIT - 1)) * 8);
+        u_blk->bmi.mv.as_mv.row = (sum / 8) & x->fullpixel_mask;
+
+        /* Column component: same computation. */
+        sum  = x->mode_info_context->bmi[luma + 0].mv.as_mv.col;
+        sum += x->mode_info_context->bmi[luma + 1].mv.as_mv.col;
+        sum += x->mode_info_context->bmi[luma + 4].mv.as_mv.col;
+        sum += x->mode_info_context->bmi[luma + 5].mv.as_mv.col;
+
+        sum += 4 + ((sum >> (sizeof(sum) * CHAR_BIT - 1)) * 8);
+        u_blk->bmi.mv.as_mv.col = (sum / 8) & x->fullpixel_mask;
+
+        if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+            clamp_uvmv_to_umv_border(&u_blk->bmi.mv.as_mv, x);
+
+        /* V shares the U vector. */
+        v_blk->bmi.mv.as_int = u_blk->bmi.mv.as_int;
+    }
+}
+
+/* Predict a whole inter macroblock into xd->dst, choosing the path by whether the mode is SPLITMV. */
+void vp8_build_inter_predictors_mb(MACROBLOCKD *xd)
+{
+    if (xd->mode_info_context->mbmi.mode == SPLITMV)
+    {
+        /* Per-block vectors: set up the chroma MVs, then predict block by block. */
+        build_4x4uvmvs(xd);
+        build_inter4x4_predictors_mb(xd);
+        return;
+    }
+
+    vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer,
+                                       xd->dst.y_stride, xd->dst.uv_stride);
+}
diff --git a/libs/libvpx/vp8/common/reconinter.h b/libs/libvpx/vp8/common/reconinter.h
new file mode 100644
index 0000000000..ba979b9664
--- /dev/null
+++ b/libs/libvpx/vp8/common/reconinter.h
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_RECONINTER_H_
+#define VP8_COMMON_RECONINTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
+extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+                                               unsigned char *dst_y,
+                                               unsigned char *dst_u,
+                                               unsigned char *dst_v,
+                                               int dst_ystride,
+                                               int dst_uvstride);
+
+/* NOTE(review): this header includes nothing, so MACROBLOCKD, BLOCKD and vp8_subpix_fn_t must already be declared by the includer. */
+extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+                                                unsigned char *dst_y,
+                                                int dst_ystride);
+extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
+                                         unsigned char *base_pre,
+                                         int pre_stride,
+                                         vp8_subpix_fn_t sppf);
+
+extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
+extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_RECONINTER_H_
diff --git a/libs/libvpx/vp8/common/reconintra.c b/libs/libvpx/vp8/common/reconintra.c
new file mode 100644
index 0000000000..356655dac7
--- /dev/null
+++ b/libs/libvpx/vp8/common/reconintra.c
@@ -0,0 +1,117 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
+#include "blockd.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
+
+enum {
+    SIZE_16,    /* table slot for the 16x16 predictors */
+    SIZE_8,     /* table slot for the 8x8 predictors */
+    NUM_SIZES,
+};
+/* Signature shared by every predictor function stored in the tables below. */
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);
+/* pred[mode][size]: only the V_PRED, H_PRED and TM_PRED rows get filled in. dc_pred[left_available][up_available][size]. */
+static intra_pred_fn pred[4][NUM_SIZES];
+static intra_pred_fn dc_pred[2][2][NUM_SIZES];
+/* Fill pred[] and dc_pred[] for the 16x16 and 8x8 sizes, then have the 4x4 module fill its own table. */
+static void vp8_init_intra_predictors_internal(void)
+{
+#define INIT_SIZE(sz) \
+    pred[V_PRED][SIZE_##sz] = vpx_v_predictor_##sz##x##sz; \
+    pred[H_PRED][SIZE_##sz] = vpx_h_predictor_##sz##x##sz; \
+    pred[TM_PRED][SIZE_##sz] = vpx_tm_predictor_##sz##x##sz; \
+ \
+    dc_pred[0][0][SIZE_##sz] = vpx_dc_128_predictor_##sz##x##sz; \
+    dc_pred[0][1][SIZE_##sz] = vpx_dc_top_predictor_##sz##x##sz; \
+    dc_pred[1][0][SIZE_##sz] = vpx_dc_left_predictor_##sz##x##sz; \
+    dc_pred[1][1][SIZE_##sz] = vpx_dc_predictor_##sz##x##sz
+    /* dc_pred is indexed [left][up]: neither -> dc_128, up only -> dc_top, left only -> dc_left, both -> dc. */
+    INIT_SIZE(16);
+    INIT_SIZE(8);
+    vp8_init_intra4x4_predictors_internal();
+}
+
+void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x,
+                                      unsigned char * yabove_row,
+                                      unsigned char * yleft,
+                                      int left_stride,
+                                      unsigned char * ypred_ptr,
+                                      int y_stride)
+{
+    /*
+     * Predict the 16x16 luma block at ypred_ptr using mbmi.mode.
+     * yabove_row is handed to the predictor as-is; yleft is a strided
+     * column, so its 16 samples are first packed into an aligned
+     * contiguous array.
+     */
+    const MB_PREDICTION_MODE luma_mode = x->mode_info_context->mbmi.mode;
+    DECLARE_ALIGNED(16, uint8_t, left_col[16]);
+    intra_pred_fn predictor;
+    int row;
+
+    for (row = 0; row < 16; row++)
+        left_col[row] = yleft[row * left_stride];
+
+    /* DC picks its variant from edge availability; other modes index pred[]. */
+    predictor = (luma_mode == DC_PRED)
+                    ? dc_pred[x->left_available][x->up_available][SIZE_16]
+                    : pred[luma_mode][SIZE_16];
+
+    predictor(ypred_ptr, y_stride, yabove_row, left_col);
+}
+
+void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x,
+                                       unsigned char * uabove_row,
+                                       unsigned char * vabove_row,
+                                       unsigned char * uleft,
+                                       unsigned char * vleft,
+                                       int left_stride,
+                                       unsigned char * upred_ptr,
+                                       unsigned char * vpred_ptr,
+                                       int pred_stride)
+{
+    /*
+     * Predict both 8x8 chroma blocks with the single mode in mbmi.uv_mode.
+     * uleft and vleft are strided columns, so each is first packed into
+     * an 8-byte array for the predictor function.
+     */
+    const MB_PREDICTION_MODE chroma_mode = x->mode_info_context->mbmi.uv_mode;
+    unsigned char u_col[8];
+    unsigned char v_col[8];
+    intra_pred_fn predictor;
+    int row;
+
+    for (row = 0; row < 8; row++)
+    {
+        u_col[row] = uleft[row * left_stride];
+        v_col[row] = vleft[row * left_stride];
+    }
+
+    predictor = (chroma_mode == DC_PRED)
+                    ? dc_pred[x->left_available][x->up_available][SIZE_8]
+                    : pred[chroma_mode][SIZE_8];
+
+    predictor(upred_ptr, pred_stride, uabove_row, u_col);
+    predictor(vpred_ptr, pred_stride, vabove_row, v_col);
+}
+/* Runs vp8_init_intra_predictors_internal through once(); presumably so the tables are filled a single time - confirm in vpx_ports/vpx_once.h. */
+void vp8_init_intra_predictors(void)
+{
+    once(vp8_init_intra_predictors_internal);
+}
diff --git a/libs/libvpx/vp8/common/reconintra.h b/libs/libvpx/vp8/common/reconintra.h
new file mode 100644
index 0000000000..b6225a6637
--- /dev/null
+++ b/libs/libvpx/vp8/common/reconintra.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_RECONINTRA_H_
+#define VP8_COMMON_RECONINTRA_H_
+
+#include "vp8/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x,
+                                      unsigned char *yabove_row,
+                                      unsigned char *yleft,
+                                      int left_stride,
+                                      unsigned char *ypred_ptr,
+                                      int y_stride);
+/* Chroma counterpart: separate above/left/pred pointers for U and V, sharing one left_stride and one pred_stride. */
+void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x,
+                                       unsigned char * uabove_row,
+                                       unsigned char * vabove_row,
+                                       unsigned char * uleft,
+                                       unsigned char * vleft,
+                                       int left_stride,
+                                       unsigned char * upred_ptr,
+                                       unsigned char * vpred_ptr,
+                                       int pred_stride);
+/* Fills the predictor tables that the two functions above dispatch through; the definition goes via once(). */
+void vp8_init_intra_predictors(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_RECONINTRA_H_
diff --git a/libs/libvpx/vp8/common/reconintra4x4.c b/libs/libvpx/vp8/common/reconintra4x4.c
new file mode 100644
index 0000000000..35ad891eff
--- /dev/null
+++ b/libs/libvpx/vp8/common/reconintra4x4.c
@@ -0,0 +1,54 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp8_rtcd.h"
+#include "blockd.h"
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[10];
+/* Fill the table that vp8_intra4x4_predict() indexes with its B_PREDICTION_MODE argument. */
+void vp8_init_intra4x4_predictors_internal(void)
+{
+    pred[B_DC_PRED] = vpx_dc_predictor_4x4;
+    pred[B_TM_PRED] = vpx_tm_predictor_4x4;
+    pred[B_VE_PRED] = vpx_ve_predictor_4x4;
+    pred[B_HE_PRED] = vpx_he_predictor_4x4;
+    pred[B_LD_PRED] = vpx_d45e_predictor_4x4;   /* note the 'e'-suffixed d45 name */
+    pred[B_RD_PRED] = vpx_d135_predictor_4x4;
+    pred[B_VR_PRED] = vpx_d117_predictor_4x4;
+    pred[B_VL_PRED] = vpx_d63f_predictor_4x4;   /* note the 'f'-suffixed d63 name */
+    pred[B_HD_PRED] = vpx_d153_predictor_4x4;
+    pred[B_HU_PRED] = vpx_d207_predictor_4x4;
+}
+
+void vp8_intra4x4_predict(unsigned char *above,
+                          unsigned char *yleft, int left_stride,
+                          B_PREDICTION_MODE b_mode,
+                          unsigned char *dst, int dst_stride,
+                          unsigned char top_left)
+{
+    unsigned char left_col[4];
+    unsigned char above_buf[12];
+    unsigned char *const above_row = above_buf + 4;
+    int r;
+
+    /* Pack the strided left column; above_row[-1] takes top_left, above_row[0..7] come from `above`. */
+    for (r = 0; r < 4; r++)
+        left_col[r] = yleft[r * left_stride];
+    above_row[-1] = top_left;
+    memcpy(above_row, above, 8);
+    pred[b_mode](dst, dst_stride, above_row, left_col);
+}
diff --git a/libs/libvpx/vp8/common/reconintra4x4.h b/libs/libvpx/vp8/common/reconintra4x4.h
new file mode 100644
index 0000000000..5dc5d13a5c
--- /dev/null
+++ b/libs/libvpx/vp8/common/reconintra4x4.h
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_RECONINTRA4X4_H_
+#define VP8_COMMON_RECONINTRA4X4_H_
+#include "vp8/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Copy one unsigned int read from above_right_src to byte offset 16 of rows 3, 7 and 11 relative to xd->dst.y_buffer. */
+static INLINE void intra_prediction_down_copy(MACROBLOCKD *xd,
+                                              unsigned char *above_right_src)
+{
+    int dst_stride = xd->dst.y_stride;
+    unsigned char *above_right_dst = xd->dst.y_buffer - dst_stride + 16;
+    /* NOTE(review): the unsigned int* casts below assume suitably aligned addresses - confirm for all callers. */
+    unsigned int *src_ptr = (unsigned int *)above_right_src;
+    unsigned int *dst_ptr0 = (unsigned int *)(above_right_dst + 4 * dst_stride);
+    unsigned int *dst_ptr1 = (unsigned int *)(above_right_dst + 8 * dst_stride);
+    unsigned int *dst_ptr2 = (unsigned int *)(above_right_dst + 12 * dst_stride);
+    /* The same source word is stored to all three destinations. */
+    *dst_ptr0 = *src_ptr;
+    *dst_ptr1 = *src_ptr;
+    *dst_ptr2 = *src_ptr;
+}
+
+void vp8_intra4x4_predict(unsigned char *Above,
+                          unsigned char *yleft, int left_stride,
+                          B_PREDICTION_MODE b_mode,
+                          unsigned char *dst, int dst_stride,
+                          unsigned char top_left);
+/* Fills the 4x4 predictor table; called from vp8_init_intra_predictors_internal() in reconintra.c. */
+void vp8_init_intra4x4_predictors_internal(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_RECONINTRA4X4_H_
diff --git a/libs/libvpx/vp8/common/rtcd.c b/libs/libvpx/vp8/common/rtcd.c
new file mode 100644
index 0000000000..ab0e9b47fe
--- /dev/null
+++ b/libs/libvpx/vp8/common/rtcd.c
@@ -0,0 +1,19 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vp8_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+/* Runs setup_rtcd_internal (from vp8_rtcd.h, compiled in via RTCD_C) through once(); takes no arguments. */
+void vp8_rtcd(void)
+{
+    once(setup_rtcd_internal);
+}
diff --git a/libs/libvpx/vp8/common/rtcd_defs.pl b/libs/libvpx/vp8/common/rtcd_defs.pl
new file mode 100644
index 0000000000..6799c2787a
--- /dev/null
+++ b/libs/libvpx/vp8/common/rtcd_defs.pl
@@ -0,0 +1,321 @@
+sub vp8_common_forward_decls() {  # prints the C forward declarations in the heredoc below, verbatim
+print <<EOF
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+forward_decls qw/vp8_common_forward_decls/;  # passes the sub's name to forward_decls, which is defined outside this file
+
+#
+# system state
+#
+add_proto qw/void vp8_clear_system_state/, "";
+specialize qw/vp8_clear_system_state mmx/;
+$vp8_clear_system_state_mmx=vpx_reset_mmx_state;
+
+#
+# Dequant
+#
+add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc";
+specialize qw/vp8_dequantize_b mmx media neon msa/;
+$vp8_dequantize_b_media=vp8_dequantize_b_v6;
+
+add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride";
+specialize qw/vp8_dequant_idct_add mmx media neon dspr2 msa/;
+$vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6;
+$vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2;
+
+add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
+specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2 msa/;
+$vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6;
+$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
+# U/V counterpart of vp8_dequant_idct_add_y_block (dst_u / dst_v); each name override below uses the uv_block prefix.
+add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
+specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2 msa/;
+$vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6;
+$vp8_dequant_idct_add_uv_block_dspr2=vp8_dequant_idct_add_uv_block_dspr2;
+
+#
+# Loopfilter
+#
+add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2 msa/;
+$vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6;
+$vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
+
+add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2 msa/;
+$vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6;
+$vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
+
+add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2 msa/;
+$vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6;
+$vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
+
+add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2 msa/;
+$vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6;
+$vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;
+
+
+add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon msa/;
+$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
+$vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
+$vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
+$vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6;
+$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
+$vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa;
+
+add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon msa/;
+$vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
+$vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx;
+$vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
+$vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6;
+$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
+$vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa;
+
+add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon msa/;
+$vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
+$vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
+$vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
+$vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6;
+$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
+$vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa;
+
+add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon msa/;
+$vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
+$vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx;
+$vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
+$vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6;
+$vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
+$vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa;
+
+#
+# IDCT
+#
+#idct16
+add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride";
+specialize qw/vp8_short_idct4x4llm mmx media neon dspr2 msa/;
+$vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual;
+$vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2;
+
+#iwalsh1
+add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *output";
+specialize qw/vp8_short_inv_walsh4x4_1 dspr2/;
+$vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2;
+# dspr2 is the only specialization listed; no other asm yet
+
+#iwalsh16
+add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output";
+specialize qw/vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2 msa/;
+$vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6;
+$vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2;
+
+#idct1_scalar_add
+add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride";
+specialize qw/vp8_dc_only_idct_add	mmx media neon dspr2 msa/;
+$vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6;
+$vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2;
+
+#
+# RECON
+#
+add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2 msa/;
+$vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6;
+$vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2;
+
+add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_copy_mem8x8 mmx media neon dspr2 msa/;
+$vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6;
+$vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2;
+
+add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_copy_mem8x4 mmx media neon dspr2 msa/;
+$vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6;
+$vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
+
+#
+# Postproc
+#
+if (vpx_config("CONFIG_POSTPROC") eq "yes") {
+    add_proto qw/void vp8_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
+    specialize qw/vp8_mbpost_proc_down mmx sse2 msa/;
+    $vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm;
+
+    add_proto qw/void vp8_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
+    specialize qw/vp8_mbpost_proc_across_ip sse2 msa/;
+    $vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm;
+
+    add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
+    specialize qw/vp8_post_proc_down_and_across_mb_row sse2 msa/;
+
+    add_proto qw/void vp8_plane_add_noise/, "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch";
+    specialize qw/vp8_plane_add_noise mmx sse2 msa/;
+    $vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt;
+
+    add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
+    # no asm yet
+
+    add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
+    # no asm yet
+
+    add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
+    # no asm yet
+
+    add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
+    specialize qw/vp8_filter_by_weight16x16 sse2 msa/;
+
+    add_proto qw/void vp8_filter_by_weight8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
+    specialize qw/vp8_filter_by_weight8x8 sse2 msa/;
+
+    add_proto qw/void vp8_filter_by_weight4x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
+    # no asm yet
+}
+
+#
+# Subpixel
+#
+add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2 msa/;
+$vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6;
+$vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2;
+
+add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2 msa/;
+$vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6;
+$vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2;
+
+add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2 msa/;
+$vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6;
+$vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;
+
+add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+#TODO(johannkoenig): fix the neon version https://code.google.com/p/webm/issues/detail?id=817
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media dspr2 msa/;
+$vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6;
+$vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
+
+add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon msa/;
+$vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6;
+
+add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon msa/;
+$vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6;
+
+add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_bilinear_predict8x4 mmx media neon msa/;
+$vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6;
+
+add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+#TODO(johannkoenig): fix the neon version https://code.google.com/p/webm/issues/detail?id=892
+specialize qw/vp8_bilinear_predict4x4 mmx media msa/;
+$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
+
+#
+# Encoder functions below this point.
+#
+if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
+
+#
+# Block copy
+#
+if ($opts{arch} =~ /x86/) {
+    add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n";
+    specialize qw/vp8_copy32xn sse2 sse3/;
+}
+
+#
+# Forward DCT
+#
+add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
+specialize qw/vp8_short_fdct4x4 mmx sse2 media neon msa/;
+$vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;
+
+add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
+specialize qw/vp8_short_fdct8x4 mmx sse2 media neon msa/;
+$vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;
+
+add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
+specialize qw/vp8_short_walsh4x4 sse2 media neon msa/;
+$vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6;
+
+#
+# Quantizer
+#
+add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
+specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa/;
+
+add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
+specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/;
+
+#
+# Block subtraction
+#
+add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff";
+specialize qw/vp8_block_error mmx sse2 msa/;
+$vp8_block_error_sse2=vp8_block_error_xmm;
+
+add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc";
+specialize qw/vp8_mbblock_error mmx sse2 msa/;
+$vp8_mbblock_error_sse2=vp8_mbblock_error_xmm;
+
+add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
+specialize qw/vp8_mbuverror mmx sse2 msa/;
+$vp8_mbuverror_sse2=vp8_mbuverror_xmm;
+
+#
+# Motion search
+#
+add_proto qw/int vp8_full_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
+specialize qw/vp8_full_search_sad sse3 sse4_1/;
+$vp8_full_search_sad_sse3=vp8_full_search_sadx3;
+$vp8_full_search_sad_sse4_1=vp8_full_search_sadx8;
+
+add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
+specialize qw/vp8_refining_search_sad sse3/;
+$vp8_refining_search_sad_sse3=vp8_refining_search_sadx4;
+# NOTE(review): unlike the two searches above, no specialize line accompanies the _sse3 name below - confirm this is intentional.
+add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
+$vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4;
+
+#
+# Alt-ref Noise Reduction (ARNR)
+#
+if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    add_proto qw/void vp8_temporal_filter_apply/, "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count";
+    specialize qw/vp8_temporal_filter_apply sse2 msa/;
+}
+
+#
+# Denoiser filter
+#
+if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
+    add_proto qw/int vp8_denoiser_filter/, "unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
+    specialize qw/vp8_denoiser_filter sse2 neon msa/;
+    add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
+    specialize qw/vp8_denoiser_filter_uv sse2 neon msa/;
+}
+
+# End of encoder only functions
+}
+1;
diff --git a/libs/libvpx/vp8/common/setupintrarecon.c b/libs/libvpx/vp8/common/setupintrarecon.c
new file mode 100644
index 0000000000..669564db42
--- /dev/null
+++ b/libs/libvpx/vp8/common/setupintrarecon.c
@@ -0,0 +1,39 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "setupintrarecon.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) {
+    int row;
+
+    /* Border row above each plane, starting one pixel to its left: 127. */
+    vp8_setup_intra_recon_top_line(ybf);
+
+    /* Border column directly left of each plane: 129. */
+    for (row = 0; row < ybf->y_height; ++row)
+        ybf->y_buffer[row * ybf->y_stride - 1] = (unsigned char) 129;
+
+    /* Both chroma planes share uv_height and uv_stride. */
+    for (row = 0; row < ybf->uv_height; ++row) {
+        const int left = row * ybf->uv_stride - 1;
+
+        ybf->u_buffer[left] = (unsigned char) 129;
+        ybf->v_buffer[left] = (unsigned char) 129;
+    }
+}
+
+void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf) {
+    /* Row above each plane, starting one pixel to its left, is set to 127. */
+    memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+    memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+    memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+}
diff --git a/libs/libvpx/vp8/common/setupintrarecon.h b/libs/libvpx/vp8/common/setupintrarecon.h
new file mode 100644
index 0000000000..1857c4e26a
--- /dev/null
+++ b/libs/libvpx/vp8/common/setupintrarecon.h
@@ -0,0 +1,45 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_SETUPINTRARECON_H_
+#define VP8_COMMON_SETUPINTRARECON_H_
+
+#include "./vpx_config.h"
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf);
+
+/* Writes 129 to the first byte of 16 consecutive luma rows and of 8
+ * consecutive rows of each chroma plane, starting at the given pointers. */
+static INLINE void setup_intra_recon_left(unsigned char *y_buffer,
+                                          unsigned char *u_buffer,
+                                          unsigned char *v_buffer,
+                                          int y_stride,
+                                          int uv_stride) {
+    int row;
+
+    for (row = 0; row < 16; ++row)
+        y_buffer[row * y_stride] = (unsigned char) 129;
+
+    for (row = 0; row < 8; ++row) {
+        u_buffer[row * uv_stride] = (unsigned char) 129;
+        v_buffer[row * uv_stride] = (unsigned char) 129;
+    }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_SETUPINTRARECON_H_
diff --git a/libs/libvpx/vp8/common/swapyv12buffer.c b/libs/libvpx/vp8/common/swapyv12buffer.c
new file mode 100644
index 0000000000..73656b3d72
--- /dev/null
+++ b/libs/libvpx/vp8/common/swapyv12buffer.c
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "swapyv12buffer.h"
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame) {
+    /* Exchange the allocation pointer and the three plane pointers of the two
+     * frames. Only these four pointers change hands: no pixel data is copied
+     * and no other field of either structure is modified. */
+    unsigned char *const saved_alloc = new_frame->buffer_alloc;
+    unsigned char *const saved_y = new_frame->y_buffer;
+    unsigned char *const saved_u = new_frame->u_buffer;
+    unsigned char *const saved_v = new_frame->v_buffer;
+
+    /* new_frame takes over the buffers of last_frame ... */
+    new_frame->buffer_alloc = last_frame->buffer_alloc;
+    new_frame->y_buffer = last_frame->y_buffer;
+    new_frame->u_buffer = last_frame->u_buffer;
+    new_frame->v_buffer = last_frame->v_buffer;
+
+    /* ... and last_frame receives the ones new_frame held on entry. */
+    last_frame->buffer_alloc = saved_alloc;
+    last_frame->y_buffer = saved_y;
+    last_frame->u_buffer = saved_u;
+    last_frame->v_buffer = saved_v;
+}
diff --git a/libs/libvpx/vp8/common/swapyv12buffer.h b/libs/libvpx/vp8/common/swapyv12buffer.h
new file mode 100644
index 0000000000..1d66cd3d62
--- /dev/null
+++ b/libs/libvpx/vp8/common/swapyv12buffer.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_SWAPYV12BUFFER_H_
+#define VP8_COMMON_SWAPYV12BUFFER_H_
+
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_SWAPYV12BUFFER_H_
diff --git a/libs/libvpx/vp8/common/systemdependent.h b/libs/libvpx/vp8/common/systemdependent.h
new file mode 100644
index 0000000000..3d44e37cf2
--- /dev/null
+++ b/libs/libvpx/vp8/common/systemdependent.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_
+#define VP8_COMMON_SYSTEMDEPENDENT_H_
+
+#include "vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8Common;
+void vp8_machine_specific_config(struct VP8Common *);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_SYSTEMDEPENDENT_H_
diff --git a/libs/libvpx/vp8/common/textblit.c b/libs/libvpx/vp8/common/textblit.c
new file mode 100644
index 0000000000..1756100a7e
--- /dev/null
+++ b/libs/libvpx/vp8/common/textblit.c
@@ -0,0 +1,130 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+
+void vp8_blit_text(const char *msg, unsigned char *address, const int pitch) {
+    /* One glyph per character from ' ' onwards, packed 5 bits per column:
+     * bit (col * 5 + row) set means the pixel at (col, row) is lit. Every
+     * character takes a 7 pixel wide, 5 pixel high cell; lit pixels are
+     * written as 255 and all others as 0. Lower-case letters use the
+     * upper-case glyphs; every other character uses the blank glyph. */
+    const int font[] =
+    {
+        0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
+        0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
+        0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
+        0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
+        0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
+        0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
+        0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
+        0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
+        0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
+    };
+    unsigned char *cell = address;
+    const char *cursor;
+
+    for (cursor = msg; *cursor != 0; cursor++, cell += 7) {
+        const char ch = *cursor;
+        int glyph, col, row;
+
+        if (ch >= ' ' && ch <= 'Z')
+            glyph = font[ch - ' '];
+        else if (ch >= 'a' && ch <= 'z')
+            glyph = font[ch - 'a' + 'A' - ' '];
+        else
+            glyph = font[0];
+
+        for (col = 6; col >= 0; col--) {
+            const int column_bits = glyph >> (col * 5);
+
+            for (row = 0; row < 5; row++)
+                cell[row * pitch + col] = (column_bits & (1 << row)) ? 255 : 0;
+        }
+    }
+}
+
+static void plot (const int x, const int y, unsigned char *image, const int pitch) {
+    unsigned char *const pixel = image + (x + y * pitch);
+    *pixel ^= 255;  /* flips every bit of the pixel at column x, row y */
+}
+
+/* Bresenham line algorithm.
+ *
+ * XOR-plots every pixel of the line from (x0, y0) to (x1, y1), both end
+ * points included. Note the argument order: the two x coordinates come
+ * first, followed by the two y coordinates.
+ *
+ *   x0, y0  first end point (column, row)
+ *   x1, y1  second end point (column, row)
+ *   image   pixel addressed by column 0, row 0
+ *   pitch   offset in bytes from one row to the next
+ *
+ * Plotting is an XOR with 255, so drawing the same line a second time
+ * restores the original pixels.
+ *
+ * Nothing is clipped: every pixel of the line must lie inside the image.
+ */
+void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch) {
+    /* A steep line spans more rows than columns and is walked along y. */
+    const int steep = abs(y1 - y0) > abs(x1 - x0);
+    int major_len, minor_len;
+    int err, minor_step;
+    int major, minor;
+    int swap_tmp;
+
+    /* Transpose steep lines: from here on x0/x1 run along the faster moving
+     * (major) axis and y0/y1 along the slower (minor) one. */
+    if (steep) {
+        swap_tmp = x0;
+        x0 = y0;
+        y0 = swap_tmp;
+
+        swap_tmp = x1;
+        x1 = y1;
+        y1 = swap_tmp;
+    }
+
+    /* Order the end points so that the walk goes towards larger values. */
+    if (x0 > x1) {
+        swap_tmp = x0;
+        x0 = x1;
+        x1 = swap_tmp;
+
+        swap_tmp = y0;
+        y0 = y1;
+        y1 = swap_tmp;
+    }
+
+    /* Extent along both axes and the direction of travel on the minor one. */
+    major_len = x1 - x0;
+    minor_len = abs(y1 - y0);
+    minor_step = (y0 < y1) ? 1 : -1;
+
+    /* The error term starts at half the major length (integer division). */
+    err = major_len / 2;
+    minor = y0;
+
+    for (major = x0; major <= x1; major++) {
+        /* Undo the transposition when addressing the image. */
+        if (steep)
+            plot(minor, major, image, pitch);
+        else
+            plot(major, minor, image, pitch);
+
+        /* Step along the minor axis once the error term goes negative. */
+        err -= minor_len;
+        if (err < 0) {
+            minor += minor_step;
+            err += major_len;
+        }
+    }
+}
diff --git a/libs/libvpx/vp8/common/threading.h b/libs/libvpx/vp8/common/threading.h
new file mode 100644
index 0000000000..c00e517a78
--- /dev/null
+++ b/libs/libvpx/vp8/common/threading.h
@@ -0,0 +1,232 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_THREADING_H_
+#define VP8_COMMON_THREADING_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
+
+/* Thread management macros: a pthread-style vocabulary for each platform */
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+/* Win32: threads are HANDLEs, thread-specific data uses the Tls*() calls */
+#include <process.h>
+#include <windows.h>
+#define THREAD_FUNCTION unsigned int __stdcall
+#define THREAD_FUNCTION_RETURN DWORD
+#define THREAD_SPECIFIC_INDEX DWORD
+#define pthread_t HANDLE
+#define pthread_attr_t DWORD
+#define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread)
+#define thread_sleep(nms) Sleep(nms)
+#define pthread_cancel(thread) TerminateThread(thread,0)
+#define ts_key_create(ts_key, destructor) {ts_key = TlsAlloc();};
+#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
+#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
+#define pthread_self() GetCurrentThreadId()
+
+#elif defined(__OS2__)
+/* OS/2: threads are TIDs, thread-specific data is a thread-local ULONG */
+#define INCL_DOS
+#include <os2.h>
+
+#include <stdlib.h>
+#define THREAD_FUNCTION void
+#define THREAD_FUNCTION_RETURN void
+#define THREAD_SPECIFIC_INDEX PULONG
+#define pthread_t TID
+#define pthread_attr_t ULONG
+#define pthread_detach(thread) 0
+#define thread_sleep(nms) DosSleep(nms)
+#define pthread_cancel(thread) DosKillThread(thread)
+#define ts_key_create(ts_key, destructor) \
+    DosAllocThreadLocalMemory(1, &(ts_key));
+#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
+#define pthread_setspecific(ts_key, value) (*(ts_key)=(ULONG)(value))
+#define pthread_self() _gettid()
+#else
+#ifdef __APPLE__
+#include <mach/mach_init.h>
+#include <mach/semaphore.h>
+#include <mach/task.h>
+#include <time.h>
+#include <unistd.h>
+
+#else
+#include <semaphore.h>
+#endif
+
+#include <pthread.h>
+/* pthreads */
+/* Nearly everything is already defined; only the helper names are added */
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
+#define THREAD_SPECIFIC_INDEX pthread_key_t
+#define ts_key_create(ts_key, destructor) pthread_key_create (&(ts_key), destructor);
+#endif
+
+/* Synchronization macros. Win32: sem_init() honours the initial count. */
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+#define sem_t HANDLE
+#define pause(voidpara) __asm PAUSE
+#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateSemaphore(NULL,(sem_init_value),32768,NULL))==NULL)
+#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,INFINITE))
+#define sem_post(sem) ReleaseSemaphore(*sem,1,NULL)
+#define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE)
+#define thread_sleep(nms) Sleep(nms)
+
+#elif defined(__OS2__)
+typedef struct
+{
+    HEV  event;        /* waited on by sem_wait(), posted by sem_post() */
+    HMTX wait_mutex;   /* held by sem_wait() for its whole duration */
+    HMTX count_mutex;  /* guards count */
+    int  count;        /* current count, capped at 32768 by sem_post() */
+} sem_t;
+
+static inline int sem_init(sem_t *sem, int pshared, unsigned int value) {
+    const ULONG event_attr = pshared ? DC_SEM_SHARED : 0;
+
+    /* The event starts out posted when the initial count is non-zero. */
+    sem->count = value;
+    DosCreateEventSem(NULL, &sem->event, event_attr, value > 0 ? TRUE : FALSE);
+    DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
+    DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
+
+    return 0;
+}
+
+/* sem_wait() replacement: waits for the event, then takes one unit off the
+ * count. Always returns 0; the results of the Dos*() calls are not
+ * checked. */
+static inline int sem_wait(sem_t * sem) {
+    /* wait_mutex is held for the whole call. */
+    DosRequestMutexSem(sem->wait_mutex, -1);
+
+    DosWaitEventSem(sem->event, -1);
+
+    /* The event is reset as soon as the count reaches zero. */
+    DosRequestMutexSem(sem->count_mutex, -1);
+    if (--sem->count == 0) {
+        ULONG post_count;
+
+        DosResetEventSem(sem->event, &post_count);
+    }
+    DosReleaseMutexSem(sem->count_mutex);
+
+    DosReleaseMutexSem(sem->wait_mutex);
+
+    return 0;
+}
+
+/* sem_post() replacement: raises the count (capped at 32768) and posts the
+ * event. Always returns 0. */
+static inline int sem_post(sem_t * sem) {
+    DosRequestMutexSem(sem->count_mutex, -1);
+
+    /* At the cap the post is dropped without any error indication. */
+    if (sem->count < 32768) {
+        ++sem->count;
+        DosPostEventSem(sem->event);
+    }
+
+    DosReleaseMutexSem(sem->count_mutex);
+    return 0;
+}
+
+/* sem_destroy() replacement: closes all three OS/2 handles; returns 0. */
+static inline int sem_destroy(sem_t * sem) {
+    DosCloseMutexSem(sem->count_mutex);
+    DosCloseMutexSem(sem->wait_mutex);
+    DosCloseEventSem(sem->event);
+
+    return 0;
+}
+
+#define thread_sleep(nms) DosSleep(nms)
+
+#else
+/* Remaining platforms, including Win32 builds that do have pthread.h. */
+#ifdef __APPLE__
+#define sem_t semaphore_t
+#define sem_init(X,Y,Z) semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
+#define sem_wait(sem) (semaphore_wait(*sem) )
+#define sem_post(sem) semaphore_signal(*sem)
+#define sem_destroy(sem) semaphore_destroy(mach_task_self(),*sem)
+#define thread_sleep(nms) /* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#else
+#include <unistd.h>
+#include <sched.h>
+#define thread_sleep(nms) sched_yield();/* {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#endif
+/* Not Windows. Assume pthreads */
+/* thread_sleep() ignores nms in this branch: empty on Apple, sched_yield() elsewhere. */
+#endif
+
+#if ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#else
+#define x86_pause_hint()
+#endif
+
+#include "vpx_util/vpx_thread.h"
+
+/* Acquires mutex, trying pthread_mutex_trylock() up to kMaxTryLocks times
+ * before falling back to a blocking pthread_mutex_lock(). */
+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
+    const int kMaxTryLocks = 4000;
+    int attempt;
+
+    for (attempt = 0; attempt < kMaxTryLocks; ++attempt) {
+        /* pthread_mutex_trylock() returns 0 once the lock has been taken. */
+        if (pthread_mutex_trylock(mutex) == 0)
+            return;
+    }
+
+    /* Still contended after every attempt: block until it is released. */
+    pthread_mutex_lock(mutex);
+}
+
+static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) {
+    int snapshot;  /* copy of *p taken while holding mutex */
+    mutex_lock(mutex);
+    snapshot = *p;
+    pthread_mutex_unlock(mutex);
+    return snapshot;
+}
+
+static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col,
+                             const int *last_row_current_mb_col,
+                             const int nsync) {
+    while ((protected_read(mutex, last_row_current_mb_col) - nsync) < mb_col) {
+        x86_pause_hint();
+        thread_sleep(0);
+    }  /* leaves once mb_col <= *last_row_current_mb_col - nsync */
+}
+
+static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) {
+    mutex_lock(mutex);
+    *p = v;  /* the store happens while mutex is held */
+    pthread_mutex_unlock(mutex);
+}
+
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_THREADING_H_
diff --git a/libs/libvpx/vp8/common/treecoder.c b/libs/libvpx/vp8/common/treecoder.c
new file mode 100644
index 0000000000..d80c64bdfa
--- /dev/null
+++ b/libs/libvpx/vp8/common/treecoder.c
@@ -0,0 +1,143 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* <assert.h> is included unconditionally: vpx_config.h is not included by
+ * this file, so CONFIG_DEBUG is not defined here and cannot guard it. */
+#include <assert.h>
+#include <stdio.h>
+
+#include "treecoder.h"
+
+/* Records, for every leaf below the node pair at t[i], the bit string that
+ * leads to it (value) and its length (Len); v/L are the bits seen so far. */
+static void tree2tok(
+    struct vp8_token_struct *const p,
+    vp8_tree t,
+    int i,
+    int v,
+    int L
+) {
+    const int depth = L + 1;
+    int branch;
+
+    /* Each node has two children: t[i] for bit 0 and t[i + 1] for bit 1. */
+    for (branch = 0; branch < 2; ++branch) {
+        const vp8_tree_index child = t[i + branch];
+        const int code = v + v + branch;
+
+        if (child > 0)
+            tree2tok(p, t, child, code, depth);   /* interior node */
+        else {
+            p[-child].value = code;               /* leaf: token -child */
+            p[-child].Len = depth;
+        }
+    }
+}
+
+void vp8_tokens_from_tree(struct vp8_token_struct *p, vp8_tree t) {
+    /* Start at the root (node 0) with an empty, zero-length bit string. */
+    tree2tok(p, t, 0, 0, 0);
+}
+
+void vp8_tokens_from_tree_offset(struct vp8_token_struct *p, vp8_tree t,
+                                 int offset) {
+    /* Shifting the base stores the token for leaf value k in p[k - offset]. */
+    tree2tok(p - offset, t, 0, 0, 0);
+}
+
+/* Counts, for every node of the tree, how often each of its two branches is
+ * taken when token t occurs num_events[t] times. tok[t] holds the code of
+ * token t in `tree`; branch_ct[j][b] receives the total for branch b of
+ * node j. n must be at least 2. */
+static void branch_counts(
+    int n,                      /* n = size of alphabet */
+    vp8_token tok               [ /* n */ ],
+    vp8_tree tree,
+    unsigned int branch_ct       [ /* n-1 */ ] [2],
+    const unsigned int num_events[ /* n */ ]
+)
+{
+    const int tree_len = n - 1;
+    int t = 0;
+
+    /* The do/while loops below always run at least once. */
+    assert(tree_len);
+
+    do
+    {
+        branch_ct[t][0] = branch_ct[t][1] = 0;
+    }
+    while (++t < tree_len);
+
+    t = 0;
+
+    do
+    {
+        int L = tok[t].Len;
+        const int enc = tok[t].value;
+        const unsigned int ct = num_events[t];
+
+        vp8_tree_index i = 0;
+
+        /* Replay the code from its most significant bit, walking the tree. */
+        do
+        {
+            const int b = (enc >> --L) & 1;
+            const int j = i >> 1;
+            assert(j < tree_len  &&  0 <= L);
+
+            branch_ct [j] [b] += ct;
+            i = tree[ i + b];
+        }
+        while (i > 0);
+
+        /* Every bit of the code must have been consumed at the leaf. */
+        assert(!L);
+    }
+    while (++t < n);
+}
+
+
+void vp8_tree_probs_from_distribution(
+    int n,                      /* n = size of alphabet */
+    vp8_token tok               [ /* n */ ],
+    vp8_tree tree,
+    vp8_prob probs          [ /* n-1 */ ],
+    unsigned int branch_ct       [ /* n-1 */ ] [2],
+    const unsigned int num_events[ /* n */ ],
+    unsigned int Pfac,
+    int rd
+) {
+    const int tree_len = n - 1;
+    int node = 0;
+
+    /* Fills branch_ct[] with per-node branch totals for the event counts. */
+    branch_counts(n, tok, tree, branch_ct, num_events);
+
+    do {
+        const unsigned int zeros = branch_ct[node][0];
+        const unsigned int total = zeros + branch_ct[node][1];
+
+#if CONFIG_DEBUG
+        assert(total < (1 << 24));        /* no overflow below */
+#endif
+
+        if (!total) {
+            /* A node that was never visited gets the neutral probability. */
+            probs[node] = vp8_prob_half;
+        } else {
+            /* Scaled share of branch 0, optionally rounded, kept in 1..255. */
+            const unsigned int round = rd ? total >> 1 : 0;
+            const unsigned int p = (zeros * Pfac + round) / total;
+            probs[node] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
+        }
+    } while (++node < tree_len);
+}
diff --git a/libs/libvpx/vp8/common/treecoder.h b/libs/libvpx/vp8/common/treecoder.h
new file mode 100644
index 0000000000..d22b7c570c
--- /dev/null
+++ b/libs/libvpx/vp8/common/treecoder.h
@@ -0,0 +1,98 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_TREECODER_H_
+#define VP8_COMMON_TREECODER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned char vp8bc_index_t; /* probability index */
+
+
+typedef unsigned char vp8_prob;
+
+#define vp8_prob_half ( (vp8_prob) 128)
+
+typedef signed char vp8_tree_index;
+struct bool_coder_spec;
+
+typedef struct bool_coder_spec bool_coder_spec;
+typedef struct bool_writer bool_writer;
+typedef struct bool_reader bool_reader;
+
+typedef const bool_coder_spec c_bool_coder_spec;
+typedef const bool_writer c_bool_writer;
+typedef const bool_reader c_bool_reader;
+
+
+/* 255 - x; the argument is parenthesised so any expression expands safely. */
+# define vp8_complement( x) (255 - (x))
+
+
+/* We build coding trees compactly in arrays.
+   Each node of the tree is a pair of vp8_tree_indices.
+   Array index often references a corresponding probability table.
+   Index <= 0 means done encoding/decoding and value = -Index,
+   Index > 0 means need another bit, specification at index.
+   Nonnegative indices are always even;  processing begins at node 0. */
+
+typedef const vp8_tree_index vp8_tree[], *vp8_tree_p;
+
+
+typedef const struct vp8_token_struct
+{
+    int value;  /* bits of the code; the first tree branch is the highest bit */
+    int Len;    /* number of bits in value */
+} vp8_token;
+
+/* Construct encoding array from tree. */
+
+void vp8_tokens_from_tree(struct vp8_token_struct *, vp8_tree);
+void vp8_tokens_from_tree_offset(struct vp8_token_struct *, vp8_tree,
+                                 int offset);
+
+
+/* Convert array of token occurrence counts into a table of probabilities
+   for the associated binary encoding tree.  Also writes count of branches
+   taken for each node on the tree; this facilitates decisions as to
+   probability updates. */
+
+void vp8_tree_probs_from_distribution(
+    int n,                      /* n = size of alphabet */
+    vp8_token tok               [ /* n */ ],
+    vp8_tree tree,
+    vp8_prob probs          [ /* n-1 */ ],
+    unsigned int branch_ct       [ /* n-1 */ ] [2],
+    const unsigned int num_events[ /* n */ ],
+    unsigned int Pfactor,
+    int Round
+);
+
+/* Variant of above using coder spec rather than hardwired 8-bit probs. */
+
+void vp8bc_tree_probs_from_distribution(
+    int n,                      /* n = size of alphabet */
+    vp8_token tok               [ /* n */ ],
+    vp8_tree tree,
+    vp8_prob probs          [ /* n-1 */ ],
+    unsigned int branch_ct       [ /* n-1 */ ] [2],
+    const unsigned int num_events[ /* n */ ],
+    c_bool_coder_spec *s
+);
+
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_TREECODER_H_
diff --git a/libs/libvpx/vp8/common/vp8_entropymodedata.h b/libs/libvpx/vp8/common/vp8_entropymodedata.h
new file mode 100644
index 0000000000..c4aed49897
--- /dev/null
+++ b/libs/libvpx/vp8/common/vp8_entropymodedata.h
@@ -0,0 +1,254 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+*/
+
+#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*Generated file, included by entropymode.c*/
+
+/* Token tables: each entry is { value, Len }, a code's bits and bit count. */
+const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES] =
+{
+    { 0, 1 },
+    { 2, 2 },
+    { 6, 3 },
+    { 28, 5 },
+    { 30, 5 },
+    { 58, 6 },
+    { 59, 6 },
+    { 62, 6 },
+    { 126, 7 },
+    { 127, 7 }
+};
+
+const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES] =
+{
+    { 0, 1 },
+    { 4, 3 },
+    { 5, 3 },
+    { 6, 3 },
+    { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES] =
+{
+    { 4, 3 },
+    { 5, 3 },
+    { 6, 3 },
+    { 7, 3 },
+    { 0, 1 }
+};
+
+const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES] =
+{
+    { 0, 1 },
+    { 2, 2 },
+    { 6, 3 },
+    { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS] =
+{
+    { 6, 3 },
+    { 7, 3 },
+    { 2, 2 },
+    { 0, 1 }
+};
+
+const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS] =
+{
+    { 2, 2 },
+    { 6, 3 },
+    { 0, 1 },
+    { 14, 4 },
+    { 15, 4 }
+};
+
+const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS] =
+{
+    { 0, 1 },
+    { 2, 2 },
+    { 6, 3 },
+    { 7, 3 }
+};
+/* Fixed-length code: the eight values 0..7, three bits each. */
+const struct vp8_token_struct vp8_small_mvencodings[8] =
+{
+    { 0, 3 },
+    { 1, 3 },
+    { 2, 3 },
+    { 3, 3 },
+    { 4, 3 },
+    { 5, 3 },
+    { 6, 3 },
+    { 7, 3 }
+};
+/* vp8_prob tables; each holds one entry fewer than its symbol count. */
+const vp8_prob vp8_ymode_prob[VP8_YMODES-1] =
+{
+    112, 86, 140, 37
+};
+
+const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1] =
+{
+    145, 156, 163, 128
+};
+
+const vp8_prob vp8_uv_mode_prob[VP8_UV_MODES-1] =
+{
+    162, 101, 204
+};
+
+const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1] =
+{
+    142, 114, 183
+};
+
+const vp8_prob vp8_bmode_prob[VP8_BINTRAMODES-1] =
+{
+    120, 90, 79, 133, 87, 85, 80, 111, 151
+};
+
+/* One set of VP8_BINTRAMODES-1 probabilities for every pair of leading indices.
+ * NOTE(review): the pair presumably names two neighbouring block modes - confirm. */
+const vp8_prob vp8_kf_bmode_prob
+[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1] =
+{
+    {
+        { 231, 120,  48,  89, 115, 113, 120, 152, 112 },
+        { 152, 179,  64, 126, 170, 118,  46,  70,  95 },
+        { 175,  69, 143,  80,  85,  82,  72, 155, 103 },
+        {  56,  58,  10, 171, 218, 189,  17,  13, 152 },
+        { 144,  71,  10,  38, 171, 213, 144,  34,  26 },
+        { 114,  26,  17, 163,  44, 195,  21,  10, 173 },
+        { 121,  24,  80, 195,  26,  62,  44,  64,  85 },
+        { 170,  46,  55,  19, 136, 160,  33, 206,  71 },
+        {  63,  20,   8, 114, 114, 208,  12,   9, 226 },
+        {  81,  40,  11,  96, 182,  84,  29,  16,  36 }
+    },
+    {
+        { 134, 183,  89, 137,  98, 101, 106, 165, 148 },
+        {  72, 187, 100, 130, 157, 111,  32,  75,  80 },
+        {  66, 102, 167,  99,  74,  62,  40, 234, 128 },
+        {  41,  53,   9, 178, 241, 141,  26,   8, 107 },
+        { 104,  79,  12,  27, 217, 255,  87,  17,   7 },
+        {  74,  43,  26, 146,  73, 166,  49,  23, 157 },
+        {  65,  38, 105, 160,  51,  52,  31, 115, 128 },
+        {  87,  68,  71,  44, 114,  51,  15, 186,  23 },
+        {  47,  41,  14, 110, 182, 183,  21,  17, 194 },
+        {  66,  45,  25, 102, 197, 189,  23,  18,  22 }
+    },
+    {
+        {  88,  88, 147, 150,  42,  46,  45, 196, 205 },
+        {  43,  97, 183, 117,  85,  38,  35, 179,  61 },
+        {  39,  53, 200,  87,  26,  21,  43, 232, 171 },
+        {  56,  34,  51, 104, 114, 102,  29,  93,  77 },
+        { 107,  54,  32,  26,  51,   1,  81,  43,  31 },
+        {  39,  28,  85, 171,  58, 165,  90,  98,  64 },
+        {  34,  22, 116, 206,  23,  34,  43, 166,  73 },
+        {  68,  25, 106,  22,  64, 171,  36, 225, 114 },
+        {  34,  19,  21, 102, 132, 188,  16,  76, 124 },
+        {  62,  18,  78,  95,  85,  57,  50,  48,  51 }
+    },
+    {
+        { 193, 101,  35, 159, 215, 111,  89,  46, 111 },
+        {  60, 148,  31, 172, 219, 228,  21,  18, 111 },
+        { 112, 113,  77,  85, 179, 255,  38, 120, 114 },
+        {  40,  42,   1, 196, 245, 209,  10,  25, 109 },
+        { 100,  80,   8,  43, 154,   1,  51,  26,  71 },
+        {  88,  43,  29, 140, 166, 213,  37,  43, 154 },
+        {  61,  63,  30, 155,  67,  45,  68,   1, 209 },
+        { 142,  78,  78,  16, 255, 128,  34, 197, 171 },
+        {  41,  40,   5, 102, 211, 183,   4,   1, 221 },
+        {  51,  50,  17, 168, 209, 192,  23,  25,  82 }
+    },
+    {
+        { 125,  98,  42,  88, 104,  85, 117, 175,  82 },
+        {  95,  84,  53,  89, 128, 100, 113, 101,  45 },
+        {  75,  79, 123,  47,  51, 128,  81, 171,   1 },
+        {  57,  17,   5,  71, 102,  57,  53,  41,  49 },
+        { 115,  21,   2,  10, 102, 255, 166,  23,   6 },
+        {  38,  33,  13, 121,  57,  73,  26,   1,  85 },
+        {  41,  10,  67, 138,  77, 110,  90,  47, 114 },
+        { 101,  29,  16,  10,  85, 128, 101, 196,  26 },
+        {  57,  18,  10, 102, 102, 213,  34,  20,  43 },
+        { 117,  20,  15,  36, 163, 128,  68,   1,  26 }
+    },
+    {
+        { 138,  31,  36, 171,  27, 166,  38,  44, 229 },
+        {  67,  87,  58, 169,  82, 115,  26,  59, 179 },
+        {  63,  59,  90, 180,  59, 166,  93,  73, 154 },
+        {  40,  40,  21, 116, 143, 209,  34,  39, 175 },
+        {  57,  46,  22,  24, 128,   1,  54,  17,  37 },
+        {  47,  15,  16, 183,  34, 223,  49,  45, 183 },
+        {  46,  17,  33, 183,   6,  98,  15,  32, 183 },
+        {  65,  32,  73, 115,  28, 128,  23, 128, 205 },
+        {  40,   3,   9, 115,  51, 192,  18,   6, 223 },
+        {  87,  37,   9, 115,  59,  77,  64,  21,  47 }
+    },
+    {
+        { 104,  55,  44, 218,   9,  54,  53, 130, 226 },
+        {  64,  90,  70, 205,  40,  41,  23,  26,  57 },
+        {  54,  57, 112, 184,   5,  41,  38, 166, 213 },
+        {  30,  34,  26, 133, 152, 116,  10,  32, 134 },
+        {  75,  32,  12,  51, 192, 255, 160,  43,  51 },
+        {  39,  19,  53, 221,  26, 114,  32,  73, 255 },
+        {  31,   9,  65, 234,   2,  15,   1, 118,  73 },
+        {  88,  31,  35,  67, 102,  85,  55, 186,  85 },
+        {  56,  21,  23, 111,  59, 205,  45,  37, 192 },
+        {  55,  38,  70, 124,  73, 102,   1,  34,  98 }
+    },
+    {
+        { 102,  61,  71,  37,  34,  53,  31, 243, 192 },
+        {  69,  60,  71,  38,  73, 119,  28, 222,  37 },
+        {  68,  45, 128,  34,   1,  47,  11, 245, 171 },
+        {  62,  17,  19,  70, 146,  85,  55,  62,  70 },
+        {  75,  15,   9,   9,  64, 255, 184, 119,  16 },
+        {  37,  43,  37, 154, 100, 163,  85, 160,   1 },
+        {  63,   9,  92, 136,  28,  64,  32, 201,  85 },
+        {  86,   6,  28,   5,  64, 255,  25, 248,   1 },
+        {  56,   8,  17, 132, 137, 255,  55, 116, 128 },
+        {  58,  15,  20,  82, 135,  57,  26, 121,  40 }
+    },
+    {
+        { 164,  50,  31, 137, 154, 133,  25,  35, 218 },
+        {  51, 103,  44, 131, 131, 123,  31,   6, 158 },
+        {  86,  40,  64, 135, 148, 224,  45, 183, 128 },
+        {  22,  26,  17, 131, 240, 154,  14,   1, 209 },
+        {  83,  12,  13,  54, 192, 255,  68,  47,  28 },
+        {  45,  16,  21,  91,  64, 222,   7,   1, 197 },
+        {  56,  21,  39, 155,  60, 138,  23, 102, 213 },
+        {  85,  26,  85,  85, 128, 128,  32, 146, 171 },
+        {  18,  11,   7,  63, 144, 171,   4,   4, 246 },
+        {  35,  27,  10, 146, 174, 171,  12,  26, 128 }
+    },
+    {
+        { 190,  80,  35,  99, 180,  80, 126,  54,  45 },
+        {  85, 126,  47,  87, 176,  51,  41,  20,  32 },
+        { 101,  75, 128, 139, 118, 146, 116, 128,  85 },
+        {  56,  41,  15, 176, 236,  85,  37,   9,  62 },
+        { 146,  36,  19,  30, 171, 255,  97,  27,  20 },
+        {  71,  30,  17, 119, 118, 255,  17,  18, 138 },
+        { 101,  38,  60, 138,  55,  70,  43,  26, 142 },
+        { 138,  45,  61,  62, 219,   1,  81, 188,  64 },
+        {  32,  41,  20, 117, 151, 142,  20,  21, 163 },
+        { 112,  19,  12,  61, 195, 128,  48,   4,  24 }
+    }
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_VP8_ENTROPYMODEDATA_H_
diff --git a/libs/libvpx/vp8/common/vp8_loopfilter.c b/libs/libvpx/vp8/common/vp8_loopfilter.c
new file mode 100644
index 0000000000..756ad488f9
--- /dev/null
+++ b/libs/libvpx/vp8/common/vp8_loopfilter.c
@@ -0,0 +1,661 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+/* Fill the frame-independent lookup tables in *lfi (hev_thr_lut, mode_lf_lut). */
+static void lf_init_lut(loop_filter_info_n *lfi)
+{
+    int level;
+
+    for (level = 0; level <= MAX_LOOP_FILTER; ++level)
+    {
+        /* entries for this level; levels below 15 keep 0 for both */
+        int key_thr = 0;
+        int inter_thr = 0;
+        if (level >= 40)
+        {
+            key_thr = 2;
+            inter_thr = 3;
+        }
+        else if (level >= 20)
+        {
+            key_thr = 1;
+            inter_thr = 2;
+        }
+        else if (level >= 15)
+            key_thr = inter_thr = 1;
+
+        lfi->hev_thr_lut[KEY_FRAME][level] = key_thr;
+        lfi->hev_thr_lut[INTER_FRAME][level] = inter_thr;
+    }
+
+    /* intra prediction modes: only B_PRED maps to index 0 */
+    lfi->mode_lf_lut[B_PRED] = 0;
+    lfi->mode_lf_lut[DC_PRED] = 1;
+    lfi->mode_lf_lut[V_PRED] = 1;
+    lfi->mode_lf_lut[H_PRED] = 1;
+    lfi->mode_lf_lut[TM_PRED] = 1;
+
+    /* inter prediction modes */
+    lfi->mode_lf_lut[ZEROMV] = 1;
+    lfi->mode_lf_lut[NEARESTMV] = 2;
+    lfi->mode_lf_lut[NEARMV] = 2;
+    lfi->mode_lf_lut[NEWMV] = 2;
+    lfi->mode_lf_lut[SPLITMV] = 3;
+}
+
+/* Rebuild the lim / blim / mblim vectors in *lfi for every filter level
+ * from 0 to MAX_LOOP_FILTER, using sharpness_lvl. Each vector is filled
+ * with SIMD_WIDTH bytes of a single value. */
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl)
+{
+    /* the level is shifted right once if sharpness > 0 and once more if
+     * sharpness > 4 */
+    const int shift = (sharpness_lvl > 0) + (sharpness_lvl > 4);
+    /* upper bound on the interior limit; only used when sharpness > 0 */
+    const int cap = 9 - sharpness_lvl;
+    int lvl;
+
+    for (lvl = 0; lvl <= MAX_LOOP_FILTER; ++lvl)
+    {
+        int inside = lvl >> shift;
+
+        if (sharpness_lvl > 0 && inside > cap)
+            inside = cap;
+
+        /* never let the interior limit drop below 1 */
+        if (inside < 1)
+            inside = 1;
+
+        /* broadcast the three limits for this level */
+        memset(lfi->lim[lvl], inside, SIMD_WIDTH);
+        memset(lfi->blim[lvl], 2 * lvl + inside, SIMD_WIDTH);
+        memset(lfi->mblim[lvl], 2 * (lvl + 2) + inside, SIMD_WIDTH);
+    }
+}
+
+/* One-time setup of cm->lf_info: sharpness limits, lookup tables and the
+ * hev threshold vectors. */
+void vp8_loop_filter_init(VP8_COMMON *cm)
+{
+    loop_filter_info_n *const info = &cm->lf_info;
+    int thr;
+
+    /* limit vectors for the current sharpness; record the value used */
+    vp8_loop_filter_update_sharpness(info, cm->sharpness_level);
+    cm->last_sharpness_level = cm->sharpness_level;
+
+    /* filter-level and hev-threshold lookup tables */
+    lf_init_lut(info);
+
+    /* hev_thr[k] becomes SIMD_WIDTH bytes of the value k, k = 0..3 */
+    for (thr = 0; thr < 4; ++thr)
+        memset(info->hev_thr[thr], thr, SIMD_WIDTH);
+}
+
+/* Per-frame setup: rebuild the sharpness-dependent limits if the sharpness
+ * changed, then fill lfi->lvl[seg][ref][mode] for every segment from
+ * default_filt_lvl plus the segment / reference / mode adjustments in *mbd. */
+void vp8_loop_filter_frame_init(VP8_COMMON *cm,
+                                MACROBLOCKD *mbd,
+                                int default_filt_lvl)
+{
+    loop_filter_info_n *lfi = &cm->lf_info;
+    int seg;
+
+    /* the limit vectors only depend on sharpness; redo them on a change */
+    if (cm->sharpness_level != cm->last_sharpness_level)
+    {
+        vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+        cm->last_sharpness_level = cm->sharpness_level;
+    }
+
+    for (seg = 0; seg < MAX_MB_SEGMENTS; ++seg)
+    {
+        int base = default_filt_lvl;  /* level for this segment */
+        int with_ref;                 /* base plus reference frame delta */
+        int level;                    /* clamped value written to the table */
+        int ref, mode;
+
+        if (mbd->segmentation_enabled)
+        {
+            const int seg_lf = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+
+            /* segment data either replaces the default or is added to it */
+            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+                base = seg_lf;
+            else
+                base += seg_lf;
+
+            /* keep the segment level within 0..63 */
+            if (base < 0)
+                base = 0;
+            else if (base > 63)
+                base = 63;
+        }
+
+        if (!mbd->mode_ref_lf_delta_enabled)
+        {
+            /* no ref/mode deltas: one level fills the 4 * 4 bytes that
+             * start at lvl[seg][0] */
+            memset(lfi->lvl[seg][0], base, 4 * 4);
+            continue;
+        }
+
+        /* INTRA_FRAME, mode index 0 (B_PRED): ref delta and mode delta */
+        with_ref = base + mbd->ref_lf_deltas[INTRA_FRAME];
+        level = with_ref + mbd->mode_lf_deltas[0];
+        if (level < 0)
+            level = 0;
+        else if (level > 63)
+            level = 63;
+        lfi->lvl[seg][INTRA_FRAME][0] = level;
+
+        /* INTRA_FRAME, mode index 1 (all other intra modes): ref delta only */
+        level = with_ref;
+        if (level < 0)
+            level = 0;
+        else if (level > 63)
+            level = 63;
+        lfi->lvl[seg][INTRA_FRAME][1] = level;
+
+        /* LAST, GOLDEN, ALT: mode indices 1..3 get ref delta + mode delta */
+        for (ref = 1; ref < MAX_REF_FRAMES; ++ref)
+        {
+            with_ref = base + mbd->ref_lf_deltas[ref];
+
+            for (mode = 1; mode < 4; ++mode)
+            {
+                level = with_ref + mbd->mode_lf_deltas[mode];
+                if (level < 0)
+                    level = 0;
+                else if (level > 63)
+                    level = 63;
+
+                lfi->lvl[seg][ref][mode] = level;
+            }
+        }
+    }
+}
+
+
+/* Run the normal loop filter over one macroblock row. mode_info_context,
+ * y_ptr, u_ptr and v_ptr start at the values passed in and move per block. */
+void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+                         int mb_row, int post_ystride, int post_uvstride,
+                         unsigned char *y_ptr, unsigned char *u_ptr,
+                         unsigned char *v_ptr)
+{
+    loop_filter_info_n *const tables = &cm->lf_info;
+    const FRAME_TYPE frame_type = cm->frame_type;
+    loop_filter_info lfi;
+    int mb_col;
+
+    /* the loop step moves to the next macroblock: one MODE_INFO entry,
+     * 16 bytes along y_ptr and 8 bytes along each of u_ptr and v_ptr */
+    for (mb_col = 0; mb_col < cm->mb_cols;
+         ++mb_col, ++mode_info_context, y_ptr += 16, u_ptr += 8, v_ptr += 8)
+    {
+        const int mode = mode_info_context->mbmi.mode;
+        const int seg = mode_info_context->mbmi.segment_id;
+        const int ref_frame = mode_info_context->mbmi.ref_frame;
+        const int level = tables->lvl[seg][ref_frame][tables->mode_lf_lut[mode]];
+
+        /* the bv / bh calls are made unless mb_skip_coeff is set and the
+         * mode is neither B_PRED nor SPLITMV */
+        const int inner_edges = (mode == B_PRED || mode == SPLITMV ||
+                                 !mode_info_context->mbmi.mb_skip_coeff);
+        int hev_index;
+
+        /* a level of 0 means no filter call at all for this macroblock */
+        if (!level)
+            continue;
+
+        hev_index = tables->hev_thr_lut[frame_type][level];
+        lfi.mblim = tables->mblim[level];
+        lfi.blim = tables->blim[level];
+        lfi.lim = tables->lim[level];
+        lfi.hev_thr = tables->hev_thr[hev_index];
+
+        /* mbv is skipped for the first column of the row */
+        if (mb_col > 0)
+            vp8_loop_filter_mbv(y_ptr, u_ptr, v_ptr,
+                                post_ystride, post_uvstride, &lfi);
+
+        if (inner_edges)
+            vp8_loop_filter_bv(y_ptr, u_ptr, v_ptr,
+                               post_ystride, post_uvstride, &lfi);
+
+        /* don't apply across umv border */
+        if (mb_row > 0)
+            vp8_loop_filter_mbh(y_ptr, u_ptr, v_ptr,
+                                post_ystride, post_uvstride, &lfi);
+
+        if (inner_edges)
+            vp8_loop_filter_bh(y_ptr, u_ptr, v_ptr,
+                               post_ystride, post_uvstride, &lfi);
+    }
+}
+
+/* Run the simple loop filter over one macroblock row. Only y_ptr is handed
+ * to the filter functions; u_ptr and v_ptr are just advanced per macroblock. */
+void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+                         int mb_row, int post_ystride, int post_uvstride,
+                         unsigned char *y_ptr, unsigned char *u_ptr,
+                         unsigned char *v_ptr)
+{
+    loop_filter_info_n *const tables = &cm->lf_info;
+    int mb_col;
+
+    (void)post_uvstride;
+
+    /* the loop step moves to the next macroblock: one MODE_INFO entry,
+     * 16 bytes along y_ptr and 8 bytes along each of u_ptr and v_ptr */
+    for (mb_col = 0; mb_col < cm->mb_cols;
+         ++mb_col, ++mode_info_context, y_ptr += 16, u_ptr += 8, v_ptr += 8)
+    {
+        const int mode = mode_info_context->mbmi.mode;
+        const int seg = mode_info_context->mbmi.segment_id;
+        const int ref_frame = mode_info_context->mbmi.ref_frame;
+        const int level = tables->lvl[seg][ref_frame][tables->mode_lf_lut[mode]];
+
+        /* the bv / bh calls are made unless mb_skip_coeff is set and the
+         * mode is neither B_PRED nor SPLITMV */
+        const int inner_edges = (mode == B_PRED || mode == SPLITMV ||
+                                 !mode_info_context->mbmi.mb_skip_coeff);
+
+        /* a level of 0 means no filter call at all for this macroblock */
+        if (!level)
+            continue;
+
+        /* mbv is skipped for the first column of the row */
+        if (mb_col > 0)
+            vp8_loop_filter_simple_mbv(y_ptr, post_ystride,
+                                       tables->mblim[level]);
+
+        if (inner_edges)
+            vp8_loop_filter_simple_bv(y_ptr, post_ystride,
+                                      tables->blim[level]);
+
+        /* don't apply across umv border */
+        if (mb_row > 0)
+            vp8_loop_filter_simple_mbh(y_ptr, post_ystride,
+                                       tables->mblim[level]);
+
+        if (inner_edges)
+            vp8_loop_filter_simple_bh(y_ptr, post_ystride,
+                                      tables->blim[level]);
+    }
+}
+void vp8_loop_filter_frame(VP8_COMMON *cm,     /* frame state and filter tables */
+                           MACROBLOCKD *mbd,   /* handed to vp8_loop_filter_frame_init */
+                           int frame_type)     /* first index into hev_thr_lut */
+{   /* Run the loop filter over every macroblock of cm->frame_to_show. */
+    YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+
+    int mb_row;
+    int mb_col;
+    int mb_rows = cm->mb_rows;
+    int mb_cols = cm->mb_cols;
+
+    int filter_level;
+
+    unsigned char *y_ptr, *u_ptr, *v_ptr;
+
+    /* Point at base of Mb MODE_INFO list */
+    const MODE_INFO *mode_info_context = cm->mi;
+    int post_y_stride = post->y_stride;
+    int post_uv_stride = post->uv_stride;
+
+    /* Initialize the loop filter for this frame. */
+    vp8_loop_filter_frame_init(cm, mbd, cm->filter_level);
+
+    /* Set up the buffer pointers */
+    y_ptr = post->y_buffer;
+    u_ptr = post->u_buffer;
+    v_ptr = post->v_buffer;
+
+    /* Filter each macroblock; cm->filter_type selects one of two loops */
+    if (cm->filter_type == NORMAL_LOOPFILTER)
+    {
+        for (mb_row = 0; mb_row < mb_rows; mb_row++)
+        {
+            for (mb_col = 0; mb_col < mb_cols; mb_col++)
+            {
+                int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                                mode_info_context->mbmi.mode != SPLITMV &&
+                                mode_info_context->mbmi.mb_skip_coeff);
+                /* skip_lf only suppresses the bv / bh calls below */
+                const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+                const int seg = mode_info_context->mbmi.segment_id;
+                const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+                filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+                /* a level of 0 means no filter call for this macroblock */
+                if (filter_level)
+                {
+                    const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                    lfi.mblim = lfi_n->mblim[filter_level];
+                    lfi.blim = lfi_n->blim[filter_level];
+                    lfi.lim = lfi_n->lim[filter_level];
+                    lfi.hev_thr = lfi_n->hev_thr[hev_index];
+                    /* calls are made in the order mbv, bv, mbh, bh */
+                    if (mb_col > 0)
+                        vp8_loop_filter_mbv
+                        (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_bv
+                        (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        vp8_loop_filter_mbh
+                        (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_bh
+                        (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+                }
+                /* advance one macroblock: 16 bytes in Y, 8 in U and V */
+                y_ptr += 16;
+                u_ptr += 8;
+                v_ptr += 8;
+
+                mode_info_context++;     /* step to next MB */
+            }
+            y_ptr += post_y_stride  * 16 - post->y_width;  /* back by the plane width, */
+            u_ptr += post_uv_stride *  8 - post->uv_width; /* down 16 (Y) or 8 (U, V) */
+            v_ptr += post_uv_stride *  8 - post->uv_width; /* strides                 */
+
+            mode_info_context++;         /* Skip border mb */
+
+        }
+    }
+    else /* SIMPLE_LOOPFILTER */
+    {
+        for (mb_row = 0; mb_row < mb_rows; mb_row++)
+        {
+            for (mb_col = 0; mb_col < mb_cols; mb_col++)
+            {
+                int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                                mode_info_context->mbmi.mode != SPLITMV &&
+                                mode_info_context->mbmi.mb_skip_coeff);
+                /* skip_lf only suppresses the bv / bh calls below */
+                const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+                const int seg = mode_info_context->mbmi.segment_id;
+                const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+                filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+                if (filter_level)
+                {
+                    const unsigned char * mblim = lfi_n->mblim[filter_level];
+                    const unsigned char * blim = lfi_n->blim[filter_level];
+                    /* the simple filter functions are given y_ptr only */
+                    if (mb_col > 0)
+                        vp8_loop_filter_simple_mbv
+                        (y_ptr, post_y_stride, mblim);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_simple_bv
+                        (y_ptr, post_y_stride, blim);
+
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        vp8_loop_filter_simple_mbh
+                        (y_ptr, post_y_stride, mblim);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_simple_bh
+                        (y_ptr, post_y_stride, blim);
+                }
+                /* u_ptr / v_ptr are advanced here but not otherwise used */
+                y_ptr += 16;
+                u_ptr += 8;
+                v_ptr += 8;
+
+                mode_info_context++;     /* step to next MB */
+            }
+            y_ptr += post_y_stride  * 16 - post->y_width;  /* same row step as in */
+            u_ptr += post_uv_stride *  8 - post->uv_width; /* the normal-filter   */
+            v_ptr += post_uv_stride *  8 - post->uv_width; /* loop above          */
+
+            mode_info_context++;         /* Skip border mb */
+
+        }
+    }
+}
+
+void vp8_loop_filter_frame_yonly  /* Loop-filter cm->frame_to_show using only its Y buffer. */
+(
+    VP8_COMMON *cm,               /* frame state and filter tables */
+    MACROBLOCKD *mbd,             /* handed to vp8_loop_filter_frame_init */
+    int default_filt_lvl          /* base level given to vp8_loop_filter_frame_init */
+)
+{
+    YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+    unsigned char *y_ptr;
+    int mb_row;
+    int mb_col;
+
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+
+    int filter_level;
+    FRAME_TYPE frame_type = cm->frame_type;
+
+    /* Point at base of Mb MODE_INFO list */
+    const MODE_INFO *mode_info_context = cm->mi;
+
+#if 0
+    if(default_filt_lvl == 0) /* no filter applied */
+        return;
+#endif
+
+    /* Initialize the loop filter for this frame. */
+    vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);
+
+    /* Set up the buffer pointers */
+    y_ptr = post->y_buffer;
+
+    /* Filter each macroblock, row by row */
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+        {
+            int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                            mode_info_context->mbmi.mode != SPLITMV &&
+                            mode_info_context->mbmi.mb_skip_coeff);
+            /* skip_lf only suppresses the bv / bh calls below */
+            const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+            const int seg = mode_info_context->mbmi.segment_id;
+            const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+            filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+            /* a level of 0 means no filter call for this macroblock */
+            if (filter_level)
+            {
+                if (cm->filter_type == NORMAL_LOOPFILTER)
+                {
+                    const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                    lfi.mblim = lfi_n->mblim[filter_level];
+                    lfi.blim = lfi_n->blim[filter_level];
+                    lfi.lim = lfi_n->lim[filter_level];
+                    lfi.hev_thr = lfi_n->hev_thr[hev_index];
+                    /* U/V pointers and uv stride are passed as 0; presumably the filters skip chroma then - confirm */
+                    if (mb_col > 0)
+                        vp8_loop_filter_mbv
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_bv
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        vp8_loop_filter_mbh
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_bh
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+                }
+                else
+                {
+                    if (mb_col > 0)
+                        vp8_loop_filter_simple_mbv
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_simple_bv
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        vp8_loop_filter_simple_mbh
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_simple_bh
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+                }
+            }
+
+            y_ptr += 16;                 /* 16 bytes per macroblock in Y */
+            mode_info_context ++;        /* step to next MB */
+
+        }
+        /* back by the plane width and down 16 strides to the next row */
+        y_ptr += post->y_stride  * 16 - post->y_width;
+        mode_info_context ++;            /* Skip border mb */
+    }
+
+}
+
+void vp8_loop_filter_partial_frame  /* Loop-filter a band of Y macroblock rows of cm->frame_to_show. */
+(
+    VP8_COMMON *cm,                 /* frame state and filter tables */
+    MACROBLOCKD *mbd,               /* handed to vp8_loop_filter_frame_init */
+    int default_filt_lvl            /* base level given to vp8_loop_filter_frame_init */
+)
+{
+    YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+    unsigned char *y_ptr;
+    int mb_row;
+    int mb_col;
+    int mb_cols = post->y_width >> 4;   /* Y width  / 16 */
+    int mb_rows = post->y_height >> 4;  /* Y height / 16 */
+
+    int linestocopy;
+
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+
+    int filter_level;
+    FRAME_TYPE frame_type = cm->frame_type;
+
+    const MODE_INFO *mode_info_context;
+
+#if 0
+    if(default_filt_lvl == 0) /* no filter applied */
+        return;
+#endif
+
+    /* Initialize the loop filter for this frame. */
+    vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);
+
+    /* number of MB rows to use in partial filtering (at least one row) */
+    linestocopy = mb_rows / PARTIAL_FRAME_FRACTION;
+    linestocopy = linestocopy ? linestocopy << 4 : 16;     /* 16 lines per MB */
+
+    /* Set up the buffer pointers; partial image starts at ~middle of frame */
+    y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride;
+    mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
+
+    /* Filter linestocopy / 16 macroblock rows from the starting row */
+    for (mb_row = 0; mb_row<(linestocopy >> 4); mb_row++)
+    {
+        for (mb_col = 0; mb_col < mb_cols; mb_col++)
+        {
+            int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                           mode_info_context->mbmi.mode != SPLITMV &&
+                           mode_info_context->mbmi.mb_skip_coeff);
+            /* skip_lf only suppresses the bv / bh calls below */
+            const int mode_index =
+                lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+            const int seg = mode_info_context->mbmi.segment_id;
+            const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+            filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+            /* a level of 0 means no filter call for this macroblock */
+            if (filter_level)
+            {
+                if (cm->filter_type == NORMAL_LOOPFILTER)
+                {
+                    const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                    lfi.mblim = lfi_n->mblim[filter_level];
+                    lfi.blim = lfi_n->blim[filter_level];
+                    lfi.lim = lfi_n->lim[filter_level];
+                    lfi.hev_thr = lfi_n->hev_thr[hev_index];
+                    /* U/V pointers and uv stride are passed as 0; presumably the filters skip chroma then - confirm */
+                    if (mb_col > 0)
+                        vp8_loop_filter_mbv
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_bv
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+                    /* unlike the full-frame loops, mbh is not guarded by mb_row > 0 */
+                    vp8_loop_filter_mbh
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_bh
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+                }
+                else
+                {
+                    if (mb_col > 0)
+                        vp8_loop_filter_simple_mbv
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_simple_bv
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+                    /* mbh is unconditional here as well */
+                    vp8_loop_filter_simple_mbh
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_simple_bh
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+                }
+            }
+
+            y_ptr += 16;                 /* 16 bytes per macroblock in Y */
+            mode_info_context += 1;      /* step to next MB */
+        }
+        /* back by the plane width and down 16 strides to the next row */
+        y_ptr += post->y_stride  * 16 - post->y_width;
+        mode_info_context += 1;          /* Skip border mb */
+    }
+}
diff --git a/libs/libvpx/vp8/common/x86/copy_sse2.asm b/libs/libvpx/vp8/common/x86/copy_sse2.asm
new file mode 100644
index 0000000000..86fae26956
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/copy_sse2.asm
@@ -0,0 +1,93 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_copy32xn_sse2(      ; copy height rows of 32 bytes from src_ptr to dst_ptr
+;    unsigned char *src_ptr,  ; read with movdqu (unaligned loads)
+;    int  src_stride,         ; added to the source pointer per row
+;    unsigned char *dst_ptr,  ; written with movdqa, which requires 16-byte alignment
+;    int  dst_stride,         ; added to the destination pointer per row
+;    int height);             ; number of rows; see NOTE(review) below
+global sym(vp8_copy32xn_sse2) PRIVATE
+sym(vp8_copy32xn_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; arg(n), SHADOW_ARGS_TO_STACK and SAVE_XMM are presumably from x86_abi_support.asm
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;dst_ptr
+        ; the three int arguments are sign-extended into 64-bit registers
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;dst_stride
+        movsxd          rcx,        dword ptr arg(4) ;height
+        ; NOTE(review): the 4-row loop runs once before rcx is tested; assumes height >= 4 - confirm
+.block_copy_sse2_loopx4:
+        movdqu          xmm0,       XMMWORD PTR [rsi]              ; row 0, bytes 0..15
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]         ; row 0, bytes 16..31
+        movdqu          xmm2,       XMMWORD PTR [rsi + rax]        ; row 1, bytes 0..15
+        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]   ; row 1, bytes 16..31
+
+        lea             rsi,        [rsi+rax*2]                    ; source: two rows down
+
+        movdqu          xmm4,       XMMWORD PTR [rsi]              ; row 2
+        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
+        movdqu          xmm6,       XMMWORD PTR [rsi + rax]        ; row 3
+        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]
+
+        lea             rsi,    [rsi+rax*2]                        ; source: two more rows down
+
+        movdqa          XMMWORD PTR [rdi], xmm0                    ; store rows 0 and 1
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        movdqa          XMMWORD PTR [rdi + rdx], xmm2
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3
+
+        lea             rdi,    [rdi+rdx*2]                        ; destination: two rows down
+
+        movdqa          XMMWORD PTR [rdi], xmm4                    ; store rows 2 and 3
+        movdqa          XMMWORD PTR [rdi + 16], xmm5
+        movdqa          XMMWORD PTR [rdi + rdx], xmm6
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7
+
+        lea             rdi,    [rdi+rdx*2]                        ; destination: two more rows down
+
+        sub             rcx,     4                                 ; four rows done
+        cmp             rcx,     4
+        jge             .block_copy_sse2_loopx4                    ; repeat while >= 4 rows remain
+
+        cmp             rcx, 0
+        je              .copy_is_done                              ; nothing left over
+        ; remaining rows are copied one at a time
+.block_copy_sse2_loop:
+        movdqu          xmm0,       XMMWORD PTR [rsi]
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
+        lea             rsi,    [rsi+rax]
+
+        movdqa          XMMWORD PTR [rdi], xmm0
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        lea             rdi,    [rdi+rdx]
+
+        sub             rcx,     1
+        jne             .block_copy_sse2_loop                      ; until rcx reaches 0
+
+.copy_is_done:
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vp8/common/x86/copy_sse3.asm b/libs/libvpx/vp8/common/x86/copy_sse3.asm
new file mode 100644
index 0000000000..d789a40ccf
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/copy_sse3.asm
@@ -0,0 +1,146 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE_X3 0 ; prologue: bind the symbolic operand names below for the current ABI
+%if ABI_IS_32BIT
+  %define     src_ptr       rsi
+  %define     src_stride    rax
+  %define     ref_ptr       rdi
+  %define     ref_stride    rdx
+  %define     end_ptr       rcx
+  %define     ret_var       rbx
+  %define     result_ptr    arg(4)
+  %define     max_sad       arg(4)
+  %define     height        dword ptr arg(4)
+    push        rbp
+    mov         rbp,        rsp
+    push        rsi
+    push        rdi
+    push        rbx
+    ; pointers and strides are loaded through arg(); height stays a memory operand
+    mov         rsi,        arg(0)              ; src_ptr
+    mov         rdi,        arg(2)              ; ref_ptr
+
+    movsxd      rax,        dword ptr arg(1)    ; src_stride
+    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
+%else                                           ; not ABI_IS_32BIT: no loads, names map to operands directly
+  %if LIBVPX_YASM_WIN64
+    SAVE_XMM 7, u
+    %define     src_ptr     rcx
+    %define     src_stride  rdx
+    %define     ref_ptr     r8
+    %define     ref_stride  r9
+    %define     end_ptr     r10
+    %define     ret_var     r11
+    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
+    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
+    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
+  %else                                         ; remaining case: every name is a register
+    %define     src_ptr     rdi
+    %define     src_stride  rsi
+    %define     ref_ptr     rdx
+    %define     ref_stride  rcx
+    %define     end_ptr     r9
+    %define     ret_var     r10
+    %define     result_ptr  r8
+    %define     max_sad     r8
+    %define     height      r8
+  %endif
+%endif
+  ; result_ptr, max_sad and ret_var are not referenced by vp8_copy32xn_sse3, the only user in this file
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0 ; epilogue: clear the operand names, restore saved state, return
+  %define     src_ptr
+  %define     src_stride
+  %define     ref_ptr
+  %define     ref_stride
+  %define     end_ptr
+  %define     ret_var
+  %define     result_ptr
+  %define     max_sad
+  %define     height
+  ; undo what STACK_FRAME_CREATE_X3 pushed (32-bit) or saved with SAVE_XMM (Win64)
+%if ABI_IS_32BIT
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    pop         rbp
+%else
+  %if LIBVPX_YASM_WIN64
+    RESTORE_XMM
+  %endif
+%endif
+    ret
+%endmacro
+
+
+;void vp8_copy32xn_sse3(      ; copy height rows of 32 bytes from src_ptr to dst_ptr
+;    unsigned char *src_ptr,  ; read with movdqu (unaligned loads)
+;    int  src_stride,         ; added to the source pointer per row
+;    unsigned char *dst_ptr,  ; named ref_ptr below; written with movdqa (needs 16-byte alignment)
+;    int  dst_stride,         ; named ref_stride below
+;    int height);             ; number of rows; see the NOTE(review) comments below
+global sym(vp8_copy32xn_sse3) PRIVATE
+sym(vp8_copy32xn_sse3):
+
+    STACK_FRAME_CREATE_X3
+        ; NOTE(review): the 4-row loop runs once before height is tested; assumes height >= 4 - confirm
+.block_copy_sse3_loopx4:
+        lea             end_ptr,    [src_ptr+src_stride*2]         ; end_ptr = source row 2
+
+        movdqu          xmm0,       XMMWORD PTR [src_ptr]                   ; row 0
+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
+        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]      ; row 1
+        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
+        movdqu          xmm4,       XMMWORD PTR [end_ptr]                   ; row 2
+        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
+        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]      ; row 3
+        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
+
+        lea             src_ptr,    [src_ptr+src_stride*4]         ; source: four rows down
+
+        lea             end_ptr,    [ref_ptr+ref_stride*2]         ; end_ptr = destination row 2
+
+        movdqa          XMMWORD PTR [ref_ptr], xmm0
+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
+        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
+        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+        movdqa          XMMWORD PTR [end_ptr], xmm4
+        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
+        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
+        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+        lea             ref_ptr,    [ref_ptr+ref_stride*4]         ; destination: four rows down
+        ; NOTE(review): on the non-Win64 64-bit path height is all of r8 for an int argument - confirm upper bits
+        sub             height,     4
+        cmp             height,     4
+        jge             .block_copy_sse3_loopx4                    ; repeat while >= 4 rows remain
+
+        ;Check whether any rows remain to be copied.
+        cmp             height, 0
+        je              .copy_is_done
+        ; remaining rows are copied one at a time
+.block_copy_sse3_loop:
+        movdqu          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
+        lea             src_ptr,    [src_ptr+src_stride]
+
+        movdqa          XMMWORD PTR [ref_ptr], xmm0
+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
+        lea             ref_ptr,    [ref_ptr+ref_stride]
+
+        sub             height,     1
+        jne             .block_copy_sse3_loop                      ; until height reaches 0
+
+.copy_is_done:
+    STACK_FRAME_DESTROY_X3
diff --git a/libs/libvpx/vp8/common/x86/dequantize_mmx.asm b/libs/libvpx/vp8/common/x86/dequantize_mmx.asm
new file mode 100644
index 0000000000..4e551f00aa
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/dequantize_mmx.asm
@@ -0,0 +1,258 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
+global sym(vp8_dequantize_b_impl_mmx) PRIVATE
+sym(vp8_dequantize_b_impl_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov       rsi, arg(0) ;sq
+        mov       rdi, arg(1) ;dq
+        mov       rax, arg(2) ;q
+        ; dq[i] = low 16 bits of sq[i] * q[i] for i = 0..15, four words per step
+        movq      mm1, [rsi]
+        pmullw    mm1, [rax+0]            ; mm1 = sq[0..3] * q[0..3]
+        movq      [rdi], mm1
+
+        movq      mm1, [rsi+8]
+        pmullw    mm1, [rax+8]            ; mm1 = sq[4..7] * q[4..7]
+        movq      [rdi+8], mm1
+
+        movq      mm1, [rsi+16]
+        pmullw    mm1, [rax+16]            ; mm1 = sq[8..11] * q[8..11]
+        movq      [rdi+16], mm1
+
+        movq      mm1, [rsi+24]
+        pmullw    mm1, [rax+24]            ; mm1 = sq[12..15] * q[12..15]
+        movq      [rdi+24], mm1
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_dequant_idct_add_mmx(
+;short *input,            0
+;short *dq,               1
+;unsigned char *dest,     2
+;int stride)              3
+global sym(vp8_dequant_idct_add_mmx) PRIVATE
+sym(vp8_dequant_idct_add_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    push        rdi
+    ; end prolog
+
+        mov         rax,    arg(0) ;input
+        mov         rdx,    arg(1) ;dq
+
+        ; dequantize: mm0..mm3 = input[0..3], [4..7], [8..11], [12..15] times the matching dq words
+        movq        mm0,    [rax   ]
+        pmullw      mm0,    [rdx]
+
+        movq        mm1,    [rax +8]
+        pmullw      mm1,    [rdx +8]
+
+        movq        mm2,    [rax+16]
+        pmullw      mm2,    [rdx+16]
+
+        movq        mm3,    [rax+24]
+        pmullw      mm3,    [rdx+24]
+
+        mov         rdx,    arg(2) ;dest
+
+        pxor        mm7,    mm7
+
+        ; clear all 16 input coefficients now that they are held in mm0..mm3
+        movq        [rax],   mm7
+        movq        [rax+8], mm7
+
+        movq        [rax+16],mm7
+        movq        [rax+24],mm7
+
+
+        movsxd      rdi,            dword ptr arg(3) ;stride
+        ; first 1-D pass
+        psubw       mm0,            mm2             ; b1= 0-2
+        paddw       mm2,            mm2             ;
+
+        movq        mm5,            mm1
+        paddw       mm2,            mm0             ; a1 =0+2
+
+        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
+        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
+
+        movq        mm7,            mm3             ;
+        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
+
+        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       mm7,            mm5             ; c1
+
+        movq        mm5,            mm1
+        movq        mm4,            mm3
+
+        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
+        paddw       mm5,            mm1
+
+        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
+        paddw       mm3,            mm4
+
+        paddw       mm3,            mm5             ; d1
+        movq        mm6,            mm2             ; a1
+
+        movq        mm4,            mm0             ; b1
+        paddw       mm2,            mm3             ; output 0 = a1 + d1
+
+        paddw       mm4,            mm7             ; output 1 = b1 + c1
+        psubw       mm0,            mm7             ; output 2 = b1 - c1
+
+        psubw       mm6,            mm3             ; output 3 = a1 - d1
+        ; transpose the 4x4 block of 16-bit results
+        movq        mm1,            mm2             ; 03 02 01 00
+        movq        mm3,            mm4             ; 23 22 21 20
+
+        punpcklwd   mm1,            mm0             ; 11 01 10 00
+        punpckhwd   mm2,            mm0             ; 13 03 12 02
+
+        punpcklwd   mm3,            mm6             ; 31 21 30 20
+        punpckhwd   mm4,            mm6             ; 33 23 32 22
+
+        movq        mm0,            mm1             ; 11 01 10 00
+        movq        mm5,            mm2             ; 13 03 12 02
+
+        punpckldq   mm0,            mm3             ; 30 20 10 00
+        punpckhdq   mm1,            mm3             ; 31 21 11 01
+
+        punpckldq   mm2,            mm4             ; 32 22 12 02
+        punpckhdq   mm5,            mm4             ; 33 23 13 03
+
+        movq        mm3,            mm5             ; 33 23 13 03
+        ; second 1-D pass, on the transposed data
+        psubw       mm0,            mm2             ; b1= 0-2
+        paddw       mm2,            mm2             ;
+
+        movq        mm5,            mm1
+        paddw       mm2,            mm0             ; a1 =0+2
+
+        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
+        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
+
+        movq        mm7,            mm3             ;
+        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
+
+        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       mm7,            mm5             ; c1
+
+        movq        mm5,            mm1
+        movq        mm4,            mm3
+
+        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
+        paddw       mm5,            mm1
+
+        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
+        paddw       mm3,            mm4
+
+        paddw       mm3,            mm5             ; d1
+        paddw       mm0,            [GLOBAL(fours)] ; b1 += 4 (rounding for the >> 3 below)
+
+        paddw       mm2,            [GLOBAL(fours)] ; a1 += 4 (rounding for the >> 3 below)
+        movq        mm6,            mm2             ; a1
+
+        movq        mm4,            mm0             ; b1
+        paddw       mm2,            mm3             ; output 0 = a1 + d1
+
+        paddw       mm4,            mm7             ; output 1 = b1 + c1
+        psubw       mm0,            mm7             ; output 2 = b1 - c1
+
+        psubw       mm6,            mm3             ; output 3 = a1 - d1
+        psraw       mm2,            3
+
+        psraw       mm0,            3
+        psraw       mm4,            3
+
+        psraw       mm6,            3
+        ; transpose again
+        movq        mm1,            mm2             ; 03 02 01 00
+        movq        mm3,            mm4             ; 23 22 21 20
+
+        punpcklwd   mm1,            mm0             ; 11 01 10 00
+        punpckhwd   mm2,            mm0             ; 13 03 12 02
+
+        punpcklwd   mm3,            mm6             ; 31 21 30 20
+        punpckhwd   mm4,            mm6             ; 33 23 32 22
+
+        movq        mm0,            mm1             ; 11 01 10 00
+        movq        mm5,            mm2             ; 13 03 12 02
+
+        punpckldq   mm0,            mm3             ; 30 20 10 00
+        punpckhdq   mm1,            mm3             ; 31 21 11 01
+
+        punpckldq   mm2,            mm4             ; 32 22 12 02
+        punpckhdq   mm5,            mm4             ; 33 23 13 03
+
+        pxor        mm7,            mm7
+        ; add each 4-word result to 4 dest bytes, packing with unsigned saturation
+        movd        mm4,            [rdx]
+        punpcklbw   mm4,            mm7
+        paddsw      mm0,            mm4
+        packuswb    mm0,            mm7
+        movd        [rdx],          mm0
+
+        movd        mm4,            [rdx+rdi]
+        punpcklbw   mm4,            mm7
+        paddsw      mm1,            mm4
+        packuswb    mm1,            mm7
+        movd        [rdx+rdi],      mm1
+
+        movd        mm4,            [rdx+2*rdi]
+        punpcklbw   mm4,            mm7
+        paddsw      mm2,            mm4
+        packuswb    mm2,            mm7
+        movd        [rdx+rdi*2],    mm2
+
+        add         rdx,            rdi             ; rdx = dest + stride, so rdx+2*rdi is dest + 3*stride
+
+        movd        mm4,            [rdx+2*rdi]
+        punpcklbw   mm4,            mm7
+        paddsw      mm5,            mm4
+        packuswb    mm5,            mm7
+        movd        [rdx+rdi*2],    mm5
+
+    ; begin epilog
+    pop rdi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+    times 4 dw 0x8A8C                   ; 35468 ~= sqrt(2) * sin(pi/8) * 65536
+align 16
+x_c1sqr2less1:
+    times 4 dw 0x4E7B                   ; 20091 ~= (sqrt(2) * cos(pi/8) - 1) * 65536
+align 16
+fours:
+    times 4 dw 0x0004                   ; rounding term added before the >> 3
diff --git a/libs/libvpx/vp8/common/x86/filter_x86.c b/libs/libvpx/vp8/common/x86/filter_x86.c
new file mode 100644
index 0000000000..7f496ed7db
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/filter_x86.c
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/common/x86/filter_x86.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) =
+{   /* row i: (128 - 16*i) four times, then (16*i) four times; each pair sums to 128 */
+    { 128, 128, 128, 128,   0,   0,   0,   0 },
+    { 112, 112, 112, 112,  16,  16,  16,  16 },
+    {  96,  96,  96,  96,  32,  32,  32,  32 },
+    {  80,  80,  80,  80,  48,  48,  48,  48 },
+    {  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  48,  48,  48,  48,  80,  80,  80,  80 },
+    {  32,  32,  32,  32,  96,  96,  96,  96 },
+    {  16,  16,  16,  16, 112, 112, 112, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) =
+{   /* row i: (128 - 16*i) eight times, then (16*i) eight times; each pair sums to 128 */
+    { 128, 128, 128, 128, 128, 128, 128, 128,   0,   0,   0,   0,   0,   0,   0,   0 },
+    { 112, 112, 112, 112, 112, 112, 112, 112,  16,  16,  16,  16,  16,  16,  16,  16 },
+    {  96,  96,  96,  96,  96,  96,  96,  96,  32,  32,  32,  32,  32,  32,  32,  32 },
+    {  80,  80,  80,  80,  80,  80,  80,  80,  48,  48,  48,  48,  48,  48,  48,  48 },
+    {  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  48,  48,  48,  48,  48,  48,  48,  48,  80,  80,  80,  80,  80,  80,  80,  80 },
+    {  32,  32,  32,  32,  32,  32,  32,  32,  96,  96,  96,  96,  96,  96,  96,  96 },
+    {  16,  16,  16,  16,  16,  16,  16,  16, 112, 112, 112, 112, 112, 112, 112, 112 }
+};
diff --git a/libs/libvpx/vp8/common/x86/filter_x86.h b/libs/libvpx/vp8/common/x86/filter_x86.h
new file mode 100644
index 0000000000..d282841bee
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/filter_x86.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_X86_FILTER_X86_H_
+#define VP8_COMMON_X86_FILTER_X86_H_
+
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Copies of vp8_bilinear_filters (vp8/common/filter.c) for the x86 assembly,
+ * with each filter value repeated several times in a row */
+
+/* duplicated 4x */
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
+
+/* duplicated 8x */
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_X86_FILTER_X86_H_
diff --git a/libs/libvpx/vp8/common/x86/idct_blk_mmx.c b/libs/libvpx/vp8/common/x86/idct_blk_mmx.c
new file mode 100644
index 0000000000..f2532b34da
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/idct_blk_mmx.c
@@ -0,0 +1,128 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
+
+void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC)
+{
+    /* Forward the block's qcoeff/dqcoeff buffers and DQC to the MMX helper. */
+    short *const quantized   = (short *) d->qcoeff;
+    short *const dequantized = (short *) d->dqcoeff;
+    vp8_dequantize_b_impl_mmx(quantized, dequantized, DQC);
+}
+
+/*
+ * Dequantize, inverse-transform and add the 16 4x4 blocks addressed by
+ * q/dst/eobs, visited as 4 rows of 4 blocks.
+ *
+ *   q      coefficients, 16 shorts per block, blocks stored back to back
+ *   dq     dequantization factors, shared by every block
+ *   dst    pixels updated in place; block (row, col) starts at
+ *          dst + 4*row*stride + 4*col
+ *   stride distance between consecutive pixel rows of dst
+ *   eobs   one entry per block, in the same order as q
+ *
+ * Per block: eob > 1 runs the full dequant/IDCT/add helper, eob == 1 takes the
+ * DC-only path and then clears the first two coefficients; any other value
+ * leaves both the coefficients and the pixels untouched.
+ */
+void vp8_dequant_idct_add_y_block_mmx
+            (short *q, short *dq,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int row;
+
+    for (row = 0; row < 4; row++)
+    {
+        int col;
+
+        for (col = 0; col < 4; col++)
+        {
+            short *coeffs = q + 16 * col;
+            unsigned char *pixels = dst + 4 * col;
+
+            if (eobs[col] > 1)
+            {
+                vp8_dequant_idct_add_mmx (coeffs, dq, pixels, stride);
+            }
+            else if (eobs[col] == 1)
+            {
+                vp8_dc_only_idct_add_mmx (coeffs[0]*dq[0], pixels, stride,
+                                          pixels, stride);
+                memset(coeffs, 0, 2 * sizeof(coeffs[0]));
+            }
+        }
+
+        /* advance to the next row of four blocks */
+        q    += 64;
+        dst  += 4*stride;
+        eobs += 4;
+    }
+}
+
+/*
+ * Dequantize, inverse-transform and add eight 4x4 blocks: the first four go
+ * to dstu and the next four to dstv, each destination as 2 rows of 2 blocks.
+ *
+ *   q      coefficients, 16 shorts per block, blocks stored back to back
+ *   dq     dequantization factors, shared by every block
+ *   dstu   pixels for blocks 0..3, updated in place
+ *   dstv   pixels for blocks 4..7, updated in place
+ *   stride distance between consecutive pixel rows, used for both
+ *   eobs   one entry per block, in the same order as q
+ *
+ * Per block: eob > 1 runs the full dequant/IDCT/add helper, eob == 1 takes the
+ * DC-only path and then clears the first two coefficients; any other value
+ * leaves both the coefficients and the pixels untouched.
+ */
+void vp8_dequant_idct_add_uv_block_mmx
+            (short *q, short *dq,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    int plane;
+
+    for (plane = 0; plane < 2; plane++)
+    {
+        unsigned char *dst = (plane == 0) ? dstu : dstv;
+        int row;
+
+        for (row = 0; row < 2; row++)
+        {
+            int col;
+
+            for (col = 0; col < 2; col++)
+            {
+                short *coeffs = q + 16 * col;
+                unsigned char *pixels = dst + 4 * col;
+
+                if (eobs[col] > 1)
+                {
+                    vp8_dequant_idct_add_mmx (coeffs, dq, pixels, stride);
+                }
+                else if (eobs[col] == 1)
+                {
+                    vp8_dc_only_idct_add_mmx (coeffs[0]*dq[0], pixels, stride,
+                                              pixels, stride);
+                    memset(coeffs, 0, 2 * sizeof(coeffs[0]));
+                }
+            }
+
+            /* advance to the next row of two blocks */
+            q    += 32;
+            dst  += 4*stride;
+            eobs += 2;
+        }
+    }
+}
diff --git a/libs/libvpx/vp8/common/x86/idct_blk_sse2.c b/libs/libvpx/vp8/common/x86/idct_blk_sse2.c
new file mode 100644
index 0000000000..ae96ec858c
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/idct_blk_sse2.c
@@ -0,0 +1,89 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+void vp8_idct_dequant_0_2x_sse2
+            (short *q, short *dq ,
+             unsigned char *dst, int dst_stride);
+void vp8_idct_dequant_full_2x_sse2
+            (short *q, short *dq ,
+             unsigned char *dst, int dst_stride);
+
+/*
+ * Handles 4 rows of 4 blocks, two consecutive blocks per helper call.
+ * Each pair of eobs bytes is read as one short: zero skips the pair, any bit
+ * set in 0xfefe selects the full 2x helper, otherwise the "0_2x" helper runs.
+ * NOTE(review): the short read of char eobs assumes suitable alignment - confirm.
+ */
+void vp8_dequant_idct_add_y_block_sse2
+            (short *q, short *dq,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int row, pair;
+
+    for (row = 0; row < 4; row++, q += 64, dst += stride*4, eobs += 4)
+    {
+        for (pair = 0; pair < 2; pair++)
+        {
+            const short eob_pair = ((short *)(eobs))[pair];
+
+            if (eob_pair == 0)
+                continue;
+            if (eob_pair & 0xfefe)
+                vp8_idct_dequant_full_2x_sse2 (q + 32*pair, dq, dst + 8*pair, stride);
+            else
+                vp8_idct_dequant_0_2x_sse2 (q + 32*pair, dq, dst + 8*pair, stride);
+        }
+    }
+}
+
+/*
+ * Processes four pairs of blocks: pairs 0 and 1 are written to dstu, pairs 2
+ * and 3 to dstv; the odd pair of each destination starts 4 pixel rows lower.
+ *
+ *   q      coefficients, 32 shorts per pair, pairs stored back to back
+ *   dq     dequantization factors, passed unchanged to every helper call
+ *   dstu   destination of pairs 0 and 1
+ *   dstv   destination of pairs 2 and 3
+ *   stride distance between consecutive pixel rows, used for both
+ *   eobs   two bytes per pair, read together as one short
+ *
+ * A zero short skips the pair, any bit set in 0xfefe selects the full 2x
+ * helper, and everything else goes to the "0_2x" helper.
+ * NOTE(review): the short read of char eobs assumes suitable alignment - confirm.
+ */
+void vp8_dequant_idct_add_uv_block_sse2
+            (short *q, short *dq,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    int pair;
+
+    for (pair = 0; pair < 4; pair++)
+    {
+        const short eob_pair = ((short *)(eobs))[pair];
+        short *coeffs = q + 32*pair;
+        unsigned char *pixels = (pair < 2) ? dstu : dstv;
+
+        /* second pair of each destination sits four pixel rows further down */
+        if (pair & 1)
+            pixels += stride*4;
+
+        /* a zero short means both eobs bytes of this pair are zero */
+        if (eob_pair == 0)
+            continue;
+
+        if (eob_pair & 0xfefe)
+            vp8_idct_dequant_full_2x_sse2 (coeffs, dq, pixels, stride);
+        else
+            vp8_idct_dequant_0_2x_sse2 (coeffs, dq, pixels, stride);
+    }
+}
diff --git a/libs/libvpx/vp8/common/x86/idctllm_mmx.asm b/libs/libvpx/vp8/common/x86/idctllm_mmx.asm
new file mode 100644
index 0000000000..96fa2c60d0
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/idctllm_mmx.asm
@@ -0,0 +1,295 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; /****************************************************************************
+; * Notes:
+; *
+; * This implementation uses 16-bit fixed-point versions of two multiply
+; * constants:
+; *        1.   sqrt(2) * cos (pi/8)
+; *        2.   sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16-bit
+; * fixed-point precision as the second one, we use the identity
+; *        x * a = x + x*(a-1)
+; * so
+; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
+; *
+; * The 16-bit version of the second constant is 35468, which is bigger than
+; * 32768, so a signed 16-bit multiply treats it as the negative number
+; * 35468 - 65536. Adding x back afterwards compensates:
+; *        ((x * (unsigned)35468) >> 16) = ((x * (signed)35468) >> 16) + x
+; *
+; **************************************************************************/
+
+
+;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
+;int pitch, unsigned char *dest,int stride)
+global sym(vp8_short_idct4x4llm_mmx) PRIVATE
+sym(vp8_short_idct4x4llm_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rax,    arg(0)              ;input
+    mov         rsi,    arg(1)              ;pred
+    ; mm0..mm3 = input[0..3], [4..7], [8..11], [12..15]
+    movq        mm0,    [rax   ]
+    movq        mm1,    [rax+ 8]
+    movq        mm2,    [rax+16]
+    movq        mm3,    [rax+24]
+    ; the block below is assembled out (%if 0): it would zero the input coefficients
+%if 0
+    pxor        mm7,    mm7
+    movq        [rax],   mm7
+    movq        [rax+8], mm7
+    movq        [rax+16],mm7
+    movq        [rax+24],mm7
+%endif
+    movsxd      rax,    dword ptr arg(2)    ;pitch
+    mov         rdx,    arg(3)              ;dest
+    movsxd      rdi,    dword ptr arg(4)    ;stride
+
+    ; first 1-D pass
+    psubw       mm0,            mm2             ; b1= 0-2
+    paddw       mm2,            mm2             ;
+
+    movq        mm5,            mm1
+    paddw       mm2,            mm0             ; a1 =0+2
+
+    pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
+    paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    movq        mm7,            mm3             ;
+    pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
+
+    paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw       mm7,            mm5             ; c1
+
+    movq        mm5,            mm1
+    movq        mm4,            mm3
+
+    pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
+    paddw       mm5,            mm1
+
+    pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
+    paddw       mm3,            mm4
+
+    paddw       mm3,            mm5             ; d1
+    movq        mm6,            mm2             ; a1
+
+    movq        mm4,            mm0             ; b1
+    paddw       mm2,            mm3             ; output 0 = a1 + d1
+
+    paddw       mm4,            mm7             ; output 1 = b1 + c1
+    psubw       mm0,            mm7             ; output 2 = b1 - c1
+
+    psubw       mm6,            mm3             ; output 3 = a1 - d1
+    ; transpose the 4x4 block of 16-bit results
+    movq        mm1,            mm2             ; 03 02 01 00
+    movq        mm3,            mm4             ; 23 22 21 20
+
+    punpcklwd   mm1,            mm0             ; 11 01 10 00
+    punpckhwd   mm2,            mm0             ; 13 03 12 02
+
+    punpcklwd   mm3,            mm6             ; 31 21 30 20
+    punpckhwd   mm4,            mm6             ; 33 23 32 22
+
+    movq        mm0,            mm1             ; 11 01 10 00
+    movq        mm5,            mm2             ; 13 03 12 02
+
+    punpckldq   mm0,            mm3             ; 30 20 10 00
+    punpckhdq   mm1,            mm3             ; 31 21 11 01
+
+    punpckldq   mm2,            mm4             ; 32 22 12 02
+    punpckhdq   mm5,            mm4             ; 33 23 13 03
+
+    movq        mm3,            mm5             ; 33 23 13 03
+    ; second 1-D pass, on the transposed data
+    psubw       mm0,            mm2             ; b1= 0-2
+    paddw       mm2,            mm2             ;
+
+    movq        mm5,            mm1
+    paddw       mm2,            mm0             ; a1 =0+2
+
+    pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
+    paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    movq        mm7,            mm3             ;
+    pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
+
+    paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw       mm7,            mm5             ; c1
+
+    movq        mm5,            mm1
+    movq        mm4,            mm3
+
+    pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
+    paddw       mm5,            mm1
+
+    pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
+    paddw       mm3,            mm4
+
+    paddw       mm3,            mm5             ; d1
+    paddw       mm0,            [GLOBAL(fours)] ; b1 += 4 (rounding for the >> 3 below)
+
+    paddw       mm2,            [GLOBAL(fours)] ; a1 += 4 (rounding for the >> 3 below)
+    movq        mm6,            mm2             ; a1
+
+    movq        mm4,            mm0             ; b1
+    paddw       mm2,            mm3             ; output 0 = a1 + d1
+
+    paddw       mm4,            mm7             ; output 1 = b1 + c1
+    psubw       mm0,            mm7             ; output 2 = b1 - c1
+
+    psubw       mm6,            mm3             ; output 3 = a1 - d1
+    psraw       mm2,            3
+
+    psraw       mm0,            3
+    psraw       mm4,            3
+
+    psraw       mm6,            3
+    ; transpose again
+    movq        mm1,            mm2             ; 03 02 01 00
+    movq        mm3,            mm4             ; 23 22 21 20
+
+    punpcklwd   mm1,            mm0             ; 11 01 10 00
+    punpckhwd   mm2,            mm0             ; 13 03 12 02
+
+    punpcklwd   mm3,            mm6             ; 31 21 30 20
+    punpckhwd   mm4,            mm6             ; 33 23 32 22
+
+    movq        mm0,            mm1             ; 11 01 10 00
+    movq        mm5,            mm2             ; 13 03 12 02
+
+    punpckldq   mm0,            mm3             ; 30 20 10 00
+    punpckhdq   mm1,            mm3             ; 31 21 11 01
+
+    punpckldq   mm2,            mm4             ; 32 22 12 02
+    punpckhdq   mm5,            mm4             ; 33 23 13 03
+
+    pxor        mm7,            mm7
+    ; each dest row = 4 pred bytes + 4 result words, packed with unsigned saturation
+    movd        mm4,            [rsi]
+    punpcklbw   mm4,            mm7
+    paddsw      mm0,            mm4
+    packuswb    mm0,            mm7
+    movd        [rdx],          mm0
+
+    movd        mm4,            [rsi+rax]
+    punpcklbw   mm4,            mm7
+    paddsw      mm1,            mm4
+    packuswb    mm1,            mm7
+    movd        [rdx+rdi],      mm1
+
+    movd        mm4,            [rsi+2*rax]
+    punpcklbw   mm4,            mm7
+    paddsw      mm2,            mm4
+    packuswb    mm2,            mm7
+    movd        [rdx+rdi*2],    mm2
+
+    add         rdx,            rdi             ; rdx = dest + stride, so rdx+2*rdi is dest + 3*stride
+    add         rsi,            rax             ; rsi = pred + pitch, so rsi+2*rax is pred + 3*pitch
+
+    movd        mm4,            [rsi+2*rax]
+    punpcklbw   mm4,            mm7
+    paddsw      mm5,            mm4
+    packuswb    mm5,            mm7
+    movd        [rdx+rdi*2],    mm5
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_dc_only_idct_add_mmx(
+;short input_dc,
+;unsigned char *pred_ptr,
+;int pred_stride,
+;unsigned char *dst_ptr,
+;int stride)
+global sym(vp8_dc_only_idct_add_mmx) PRIVATE
+sym(vp8_dc_only_idct_add_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    ; end prolog
+
+        movd        mm5,            arg(0) ;input_dc
+        mov         rax,            arg(1) ;pred_ptr
+        movsxd      rdx,            dword ptr arg(2) ;pred_stride
+
+        pxor        mm0,            mm0
+
+        paddw       mm5,            [GLOBAL(fours)]  ; input_dc + 4 (rounding)
+        lea         rcx,            [rdx + rdx*2]    ; rcx = 3 * pred_stride
+
+        psraw       mm5,            3                ; (input_dc + 4) >> 3
+
+        punpcklwd   mm5,            mm5              ; with the next step: copy word 0
+
+        punpckldq   mm5,            mm5              ; into all four words of mm5
+        ; load 4 predictor bytes from each of 4 rows
+        movd        mm1,            [rax]
+        movd        mm2,            [rax+rdx]
+        movd        mm3,            [rax+2*rdx]
+        movd        mm4,            [rax+rcx]
+
+        mov         rax,            arg(3) ;dst_ptr
+        movsxd      rdx,            dword ptr arg(4) ;stride
+
+        punpcklbw   mm1,            mm0
+        paddsw      mm1,            mm5
+        packuswb    mm1,            mm0              ; pack with unsigned saturation
+        lea         rcx,            [rdx + rdx*2]    ; rcx = 3 * stride
+
+        punpcklbw   mm2,            mm0
+        paddsw      mm2,            mm5
+        packuswb    mm2,            mm0              ; pack with unsigned saturation
+
+        punpcklbw   mm3,            mm0
+        paddsw      mm3,            mm5
+        packuswb    mm3,            mm0              ; pack with unsigned saturation
+
+        punpcklbw   mm4,            mm0
+        paddsw      mm4,            mm5
+        packuswb    mm4,            mm0              ; pack with unsigned saturation
+        ; store 4 result bytes to each of 4 destination rows
+        movd        [rax],          mm1
+        movd        [rax+rdx],      mm2
+        movd        [rax+2*rdx],    mm3
+        movd        [rax+rcx],      mm4
+
+    ; begin epilog
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+    times 4 dw 0x8A8C                   ; 35468 ~= sqrt(2) * sin(pi/8) * 65536
+align 16
+x_c1sqr2less1:
+    times 4 dw 0x4E7B                   ; 20091 ~= (sqrt(2) * cos(pi/8) - 1) * 65536
+align 16
+fours:
+    times 4 dw 0x0004                   ; rounding term added before the >> 3
diff --git a/libs/libvpx/vp8/common/x86/idctllm_sse2.asm b/libs/libvpx/vp8/common/x86/idctllm_sse2.asm
new file mode 100644
index 0000000000..bf8e2c4021
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/idctllm_sse2.asm
@@ -0,0 +1,708 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_idct_dequant_0_2x_sse2
+; (
+;   short *qcoeff       - 0   ; only the words at byte offsets 0 and 32 reach the result; 4 bytes at each offset are zeroed
+;   short *dequant      - 1   ; only dequant[0] reaches the result
+;   unsigned char *dst  - 2   ; 8 bytes in each of 4 rows are read, updated and written back
+;   int dst_stride      - 3   ; byte distance between consecutive dst rows
+; )
+; Computes ((coeff * dequant[0]) + 4) >> 3 for the two leading words and adds each to a 4x4 half of dst.
+global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_0_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    ; end prolog
+
+        mov         rdx,            arg(1) ; dequant
+        mov         rax,            arg(0) ; qcoeff
+
+        movd        xmm4,           [rax]               ; 4 bytes: words 0-1 at qcoeff
+        movd        xmm5,           [rdx]               ; 4 bytes: dequant[0..1]
+
+        pinsrw      xmm4,           [rax+32],   4       ; word at byte offset 32 -> word lane 4
+        pinsrw      xmm5,           [rdx],      4       ; dequant[0] -> word lane 4
+
+        pmullw      xmm4,           xmm5                ; lanes 0 and 4 = coeff * dequant[0]
+
+    ; Zero out xmm5, for use unpacking
+        pxor        xmm5,           xmm5
+
+    ; clear coeffs
+        movd        [rax],          xmm5                ; zero 4 bytes at offset 0
+        movd        [rax+32],       xmm5                ; zero 4 bytes at offset 32
+;pshufb  -- NOTE(review): stray note; presumably marks where a byte shuffle could replace the pshuflw/pshufhw pair below
+        mov         rax,            arg(2) ; dst
+        movsxd      rdx,            dword ptr arg(3) ; dst_stride
+
+        pshuflw     xmm4,           xmm4,       00000000b   ; broadcast lane 0 over lanes 0-3
+        pshufhw     xmm4,           xmm4,       00000000b   ; broadcast lane 4 over lanes 4-7
+
+        lea         rcx,            [rdx + rdx*2]       ; rcx = 3 * dst_stride (row 3 offset)
+        paddw       xmm4,           [GLOBAL(fours)]     ; +4 rounding term
+
+        psraw       xmm4,           3                   ; (x + 4) >> 3
+
+        movq        xmm0,           [rax]               ; dst row 0, 8 bytes
+        movq        xmm1,           [rax+rdx]           ; dst row 1
+        movq        xmm2,           [rax+2*rdx]         ; dst row 2
+        movq        xmm3,           [rax+rcx]           ; dst row 3
+
+        punpcklbw   xmm0,           xmm5                ; widen bytes to words (xmm5 is zero)
+        punpcklbw   xmm1,           xmm5
+        punpcklbw   xmm2,           xmm5
+        punpcklbw   xmm3,           xmm5
+
+
+    ; Add to predict buffer
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm4
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm4
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm5                ; words -> bytes with unsigned saturation
+        packuswb    xmm1,           xmm5
+        packuswb    xmm2,           xmm5
+        packuswb    xmm3,           xmm5
+
+    ; store blocks back out
+        movq        [rax],          xmm0
+        movq        [rax + rdx],    xmm1
+
+        lea         rax,            [rax + 2*rdx]       ; advance two rows instead of reusing rcx
+
+        movq        [rax],          xmm2
+        movq        [rax + rdx],    xmm3
+
+    ; begin epilog
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_idct_dequant_full_2x_sse2
+; (
+;   short *qcoeff       - 0   ; 64 bytes read with movdqa (16-byte aligned), then zeroed
+;   short *dequant      - 1   ; 32 bytes read as pmullw memory operands; same factors applied to both halves of qcoeff
+;   unsigned char *dst  - 2   ; 8 bytes in each of 4 rows are read, updated and written back
+;   int dst_stride      - 3   ; byte distance between consecutive dst rows
+; )
+global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_full_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi                 ; NOTE(review): rsi is not referenced by any instruction visible in this routine
+    push        rdi
+    ; end prolog
+
+    ; full path: every coefficient of both blocks is loaded, dequantized and
+    ; run through two butterfly passes before being added to dst
+        mov         rax,            arg(0) ; qcoeff
+        mov         rdx,            arg(1)  ; dequant
+        mov         rdi,            arg(2) ; dst
+
+
+    ; Zero out xmm7, for use unpacking
+        pxor        xmm7,           xmm7
+
+
+    ; note the transpose of xmm1 and xmm2, necessary for shuffle
+    ;   to spit out sensible data
+        movdqa      xmm0,           [rax]       ; bytes  0-15
+        movdqa      xmm2,           [rax+16]    ; bytes 16-31
+        movdqa      xmm1,           [rax+32]    ; bytes 32-47
+        movdqa      xmm3,           [rax+48]    ; bytes 48-63
+
+    ; Clear out coeffs
+        movdqa      [rax],          xmm7
+        movdqa      [rax+16],       xmm7
+        movdqa      [rax+32],       xmm7
+        movdqa      [rax+48],       xmm7
+
+    ; dequantize qcoeff buffer
+        pmullw      xmm0,           [rdx]
+        pmullw      xmm2,           [rdx+16]
+        pmullw      xmm1,           [rdx]
+        pmullw      xmm3,           [rdx+16]
+        movsxd      rdx,            dword ptr arg(3) ; dst_stride
+
+    ; repack so block 0 row x and block 1 row x are together
+        movdqa      xmm4,           xmm0
+        punpckldq   xmm0,           xmm1
+        punpckhdq   xmm4,           xmm1
+
+        pshufd      xmm0,           xmm0,       11011000b   ; swap the two middle dwords
+        pshufd      xmm1,           xmm4,       11011000b
+
+        movdqa      xmm4,           xmm2
+        punpckldq   xmm2,           xmm3
+        punpckhdq   xmm4,           xmm3
+
+        pshufd      xmm2,           xmm2,       11011000b
+        pshufd      xmm3,           xmm4,       11011000b
+
+    ; first pass
+        psubw       xmm0,           xmm2        ; b1 = 0-2
+        paddw       xmm2,           xmm2        ; doubled so the add below yields 0+2
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0        ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        lea         rcx,            [rdx + rdx*2]   ;dst_stride * 3
+        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5        ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1        ; ip1 * cos(pi/8) * sqrt(2)
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4        ; ip3 * sin(pi/8) * sqrt(2)
+
+        paddw       xmm3,           xmm5        ; d1
+        movdqa      xmm6,           xmm2        ; a1
+
+        movdqa      xmm4,           xmm0        ; b1
+        paddw       xmm2,           xmm3        ;0
+
+        paddw       xmm4,           xmm7        ;1
+        psubw       xmm0,           xmm7        ;2
+
+        psubw       xmm6,           xmm3        ;3
+
+    ; transpose for the second pass
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b   ; 112 108 104 100 012 008 004 000
+        pshufd      xmm2,           xmm1,       11011000b   ; 114 110 106 102 014 010 006 002
+
+        pshufd      xmm1,           xmm5,       11011000b   ; 113 109 105 101 013 009 005 001
+        pshufd      xmm3,           xmm7,       11011000b   ; 115 111 107 103 015 011 007 003
+
+    ; second pass
+        psubw       xmm0,           xmm2            ; b1 = 0-2
+        paddw       xmm2,           xmm2
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0            ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5            ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5            ; d1
+        paddw       xmm0,           [GLOBAL(fours)] ; +4 on b1, so outputs 1 and 2 carry the rounding term
+
+        paddw       xmm2,           [GLOBAL(fours)] ; +4 on a1, so outputs 0 and 3 carry the rounding term
+        movdqa      xmm6,           xmm2            ; a1
+
+        movdqa      xmm4,           xmm0            ; b1
+        paddw       xmm2,           xmm3            ;0
+
+        paddw       xmm4,           xmm7            ;1
+        psubw       xmm0,           xmm7            ;2
+
+        psubw       xmm6,           xmm3            ;3
+        psraw       xmm2,           3
+
+        psraw       xmm0,           3
+        psraw       xmm4,           3
+
+        psraw       xmm6,           3
+
+    ; transpose to save
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+        pxor        xmm7,           xmm7        ; xmm7 was used as scratch above; zero it again
+
+    ; Load up predict blocks
+        movq        xmm4,           [rdi]
+        movq        xmm5,           [rdi+rdx]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm5
+
+        movq        xmm4,           [rdi+2*rdx]
+        movq        xmm5,           [rdi+rcx]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm5
+
+.finish:                            ; no branch in this routine targets this label
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm7        ; words -> bytes with unsigned saturation
+        packuswb    xmm1,           xmm7
+        packuswb    xmm2,           xmm7
+        packuswb    xmm3,           xmm7
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+        movq        [rdi + rdx*2],  xmm2
+        movq        [rdi + rcx],    xmm3
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_idct_dequant_dc_0_2x_sse2
+; (
+;   short *qcoeff       - 0   ; loaded into rax but never dereferenced here
+;   short *dequant      - 1   ; not read by this routine
+;   unsigned char *dst  - 2   ; 8 bytes in each of 4 rows are read, updated and written back
+;   int dst_stride      - 3   ; byte distance between consecutive dst rows
+;   short *dc           - 4   ; two words read: dc[0] for the low half, dc[1] for the high half
+; )
+global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_dc_0_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rdi
+    ; end prolog
+
+    ; special case when 2 blocks have 0 or 1 coeffs
+    ; dc is set as first coeff, so no need to load qcoeff
+        mov         rax,            arg(0) ; qcoeff -- NOTE(review): rax is not used again below
+
+        mov         rdi,            arg(2) ; dst
+        mov         rdx,            arg(4) ; dc
+
+    ; Zero out xmm5, for use unpacking
+        pxor        xmm5,           xmm5
+
+    ; load 2 dc words (2 * 16 bits = one doubleword)
+        movd        xmm4,           [rdx]
+
+        movsxd      rdx,            dword ptr arg(3) ; dst_stride
+        lea         rcx, [rdx + rdx*2]                  ; rcx = 3 * dst_stride
+    ; Load up predict blocks
+        movq        xmm0,           [rdi]
+        movq        xmm1,           [rdi+rdx*1]
+        movq        xmm2,           [rdi+rdx*2]
+        movq        xmm3,           [rdi+rcx]
+
+    ; Duplicate and expand dc across
+        punpcklwd   xmm4,           xmm4                ; dc0 dc0 dc1 dc1 in the low four words
+        punpckldq   xmm4,           xmm4                ; dc0 in words 0-3, dc1 in words 4-7
+
+    ; Round (+4) and shift right by 3
+        paddw       xmm4,           [GLOBAL(fours)]
+        psraw       xmm4,           3
+
+    ; Predict buffer needs to be expanded from bytes to words
+        punpcklbw   xmm0,           xmm5
+        punpcklbw   xmm1,           xmm5
+        punpcklbw   xmm2,           xmm5
+        punpcklbw   xmm3,           xmm5
+
+    ; Add to predict buffer
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm4
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm4
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm5                ; words -> bytes with unsigned saturation
+        packuswb    xmm1,           xmm5
+        packuswb    xmm2,           xmm5
+        packuswb    xmm3,           xmm5
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+        movq        [rdi + rdx*2],  xmm2
+        movq        [rdi + rcx],    xmm3
+
+    ; begin epilog
+    pop         rdi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;void vp8_idct_dequant_dc_full_2x_sse2
+; (
+;   short *qcoeff       - 0   ; 64 bytes read with movdqa (16-byte aligned), then zeroed
+;   short *dequant      - 1   ; 32 bytes read as pmullw memory operands; same factors applied to both halves of qcoeff
+;   unsigned char *dst  - 2   ; 8 bytes in each of 4 rows are read, updated and written back
+;   int dst_stride      - 3   ; byte distance between consecutive dst rows
+;   short *dc           - 4   ; dc[0] and dc[1] overwrite word lanes 0 and 4 of the first repacked row
+; )
+global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_dc_full_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rdi
+    ; end prolog
+
+    ; full path: every coefficient of both blocks is loaded and dequantized,
+    ; then the two dc words replace the first coefficient of each block
+        mov         rax,            arg(0) ; qcoeff
+        mov         rdx,            arg(1)  ; dequant
+
+        mov         rdi,            arg(2) ; dst
+
+    ; Zero out xmm7, for use unpacking
+        pxor        xmm7,           xmm7
+
+
+    ; note the transpose of xmm1 and xmm2, necessary for shuffle
+    ;   to spit out sensible data
+        movdqa      xmm0,           [rax]       ; bytes  0-15
+        movdqa      xmm2,           [rax+16]    ; bytes 16-31
+        movdqa      xmm1,           [rax+32]    ; bytes 32-47
+        movdqa      xmm3,           [rax+48]    ; bytes 48-63
+
+    ; Clear out coeffs
+        movdqa      [rax],          xmm7
+        movdqa      [rax+16],       xmm7
+        movdqa      [rax+32],       xmm7
+        movdqa      [rax+48],       xmm7
+
+    ; dequantize qcoeff buffer
+        pmullw      xmm0,           [rdx]
+        pmullw      xmm2,           [rdx+16]
+        pmullw      xmm1,           [rdx]
+        pmullw      xmm3,           [rdx+16]
+
+    ; DC component
+        mov         rdx,            arg(4)      ; rdx = dc pointer (dequant no longer needed)
+
+    ; repack so block 0 row x and block 1 row x are together
+        movdqa      xmm4,           xmm0
+        punpckldq   xmm0,           xmm1
+        punpckhdq   xmm4,           xmm1
+
+        pshufd      xmm0,           xmm0,       11011000b   ; swap the two middle dwords
+        pshufd      xmm1,           xmm4,       11011000b
+
+        movdqa      xmm4,           xmm2
+        punpckldq   xmm2,           xmm3
+        punpckhdq   xmm4,           xmm3
+
+        pshufd      xmm2,           xmm2,       11011000b
+        pshufd      xmm3,           xmm4,       11011000b
+
+    ; insert DC component
+        pinsrw      xmm0,           [rdx],      0   ; dc[0] -> word lane 0
+        pinsrw      xmm0,           [rdx+2],    4   ; dc[1] -> word lane 4
+
+    ; first pass
+        psubw       xmm0,           xmm2        ; b1 = 0-2
+        paddw       xmm2,           xmm2        ; doubled so the add below yields 0+2
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0        ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5        ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1        ; ip1 * cos(pi/8) * sqrt(2)
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4        ; ip3 * sin(pi/8) * sqrt(2)
+
+        paddw       xmm3,           xmm5        ; d1
+        movdqa      xmm6,           xmm2        ; a1
+
+        movdqa      xmm4,           xmm0        ; b1
+        paddw       xmm2,           xmm3        ;0
+
+        paddw       xmm4,           xmm7        ;1
+        psubw       xmm0,           xmm7        ;2
+
+        psubw       xmm6,           xmm3        ;3
+
+    ; transpose for the second pass
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b   ; 112 108 104 100 012 008 004 000
+        pshufd      xmm2,           xmm1,       11011000b   ; 114 110 106 102 014 010 006 002
+
+        pshufd      xmm1,           xmm5,       11011000b   ; 113 109 105 101 013 009 005 001
+        pshufd      xmm3,           xmm7,       11011000b   ; 115 111 107 103 015 011 007 003
+
+    ; second pass
+        psubw       xmm0,           xmm2            ; b1 = 0-2
+        paddw       xmm2,           xmm2
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0            ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5            ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5            ; d1
+        paddw       xmm0,           [GLOBAL(fours)] ; +4 on b1, so outputs 1 and 2 carry the rounding term
+
+        paddw       xmm2,           [GLOBAL(fours)] ; +4 on a1, so outputs 0 and 3 carry the rounding term
+        movdqa      xmm6,           xmm2            ; a1
+
+        movdqa      xmm4,           xmm0            ; b1
+        paddw       xmm2,           xmm3            ;0
+
+        paddw       xmm4,           xmm7            ;1
+        psubw       xmm0,           xmm7            ;2
+
+        psubw       xmm6,           xmm3            ;3
+        psraw       xmm2,           3
+
+        psraw       xmm0,           3
+        psraw       xmm4,           3
+
+        psraw       xmm6,           3
+
+    ; transpose to save
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+        pxor        xmm7,           xmm7        ; xmm7 was used as scratch above; zero it again
+
+    ; Load up predict blocks
+        movsxd      rdx,            dword ptr arg(3) ; dst_stride
+        movq        xmm4,           [rdi]
+        movq        xmm5,           [rdi+rdx]
+        lea         rcx,            [rdx + rdx*2]       ; rcx = 3 * dst_stride
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm5
+
+        movq        xmm4,           [rdi+rdx*2]
+        movq        xmm5,           [rdi+rcx]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm5
+
+.finish:                            ; no branch in this routine targets this label
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm7        ; words -> bytes with unsigned saturation
+        packuswb    xmm1,           xmm7
+        packuswb    xmm2,           xmm7
+        packuswb    xmm3,           xmm7
+
+    ; Reload destination stride before writing out. NOTE(review): rdx already holds
+    ;   dst_stride from the load above and is not written in between, so this is redundant
+        movsxd      rdx,            dword ptr arg(3) ; dst_stride
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+
+        lea         rdi,            [rdi + 2*rdx]       ; advance two rows; rdi is restored by the pop below
+
+        movq        [rdi],          xmm2
+        movq        [rdi + rdx],    xmm3
+
+
+    ; begin epilog
+    pop         rdi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+fours:                      ; 8 words of 4; added before each psraw by 3 so the shift rounds
+    times 8 dw 0x0004
+align 16
+x_s1sqr2:                   ; 8 words of 0x8A8C = 35468; pmulhw by this, then adding x back, gives ~x*sqrt(2)*sin(pi/8)
+    times 8 dw 0x8A8C
+align 16
+x_c1sqr2less1:              ; 8 words of 0x4E7B = 20091; pmulhw by this, then adding x back, gives ~x*sqrt(2)*cos(pi/8)
+    times 8 dw 0x4E7B
diff --git a/libs/libvpx/vp8/common/x86/iwalsh_mmx.asm b/libs/libvpx/vp8/common/x86/iwalsh_mmx.asm
new file mode 100644
index 0000000000..158c3b7458
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/iwalsh_mmx.asm
@@ -0,0 +1,140 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)  -- reads 16 shorts from input; writes 16 shorts to output, one every 32 bytes
+global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
+sym(vp8_short_inv_walsh4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    ; end prolog
+
+    mov         rdx, arg(0)       ;input
+    mov         rax, 30003h       ;two words of 3; widened into mm7 below as the rounding term
+
+    movq        mm0, [rdx + 0]    ;ip[0]
+    movq        mm1, [rdx + 8]    ;ip[4]
+    movq        mm7, rax
+
+    movq        mm2, [rdx + 16]   ;ip[8]
+    movq        mm3, [rdx + 24]   ;ip[12]
+    punpcklwd   mm7, mm7          ;0003000300030003h
+    mov         rdx, arg(1)       ;output (all four input rows are already loaded)
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+
+    paddw       mm4, mm3          ;ip[0] + ip[12] aka a1
+    paddw       mm5, mm2          ;ip[4] + ip[8] aka b1
+
+    movq        mm6, mm4          ;temp a1
+    paddw       mm4, mm5          ;a1 + b1
+    psubw       mm6, mm5          ;a1 - b1
+
+    psubw       mm0, mm3          ;ip[0] - ip[12] aka d1
+    psubw       mm1, mm2          ;ip[4] - ip[8] aka c1
+
+    movq        mm5, mm0          ;temp d1
+    paddw       mm0, mm1          ;d1 + c1
+    psubw       mm5, mm1          ;d1 - c1
+
+    ; 03 02 01 00
+    ; 13 12 11 10
+    ; 23 22 21 20
+    ; 33 32 31 30
+
+    movq        mm3, mm4          ; 03 02 01 00
+    punpcklwd   mm4, mm0          ; 11 01 10 00
+    punpckhwd   mm3, mm0          ; 13 03 12 02
+
+    movq        mm1, mm6          ; 23 22 21 20
+    punpcklwd   mm6, mm5          ; 31 21 30 20
+    punpckhwd   mm1, mm5          ; 33 23 32 22
+
+    movq        mm0, mm4          ; 11 01 10 00
+    movq        mm2, mm3          ; 13 03 12 02
+
+    punpckldq   mm0, mm6          ; 30 20 10 00 aka ip[0]
+    punpckhdq   mm4, mm6          ; 31 21 11 01 aka ip[4]
+
+    punpckldq   mm2, mm1          ; 32 22 12 02 aka ip[8]
+    punpckhdq   mm3, mm1          ; 33 23 13 03 aka ip[12]
+;~~~~~ second pass: same butterflies on the transposed data, then (x + 3) >> 3 ~~~~~
+    movq        mm1, mm0
+    movq        mm5, mm4
+    paddw       mm1, mm3          ;ip[0] + ip[12] aka a1
+    paddw       mm5, mm2          ;ip[4] + ip[8] aka b1
+
+    movq        mm6, mm1          ;temp a1
+    paddw       mm1, mm5          ;a1 + b1
+    psubw       mm6, mm5          ;a1 - b1
+    paddw       mm1, mm7          ;+3 rounding term
+    paddw       mm6, mm7
+    psraw       mm1, 3
+    psraw       mm6, 3
+
+    psubw       mm0, mm3          ;ip[0] - ip[12] aka d1
+    psubw       mm4, mm2          ;ip[4] - ip[8] aka c1
+
+    movq        mm5, mm0          ;temp d1
+    paddw       mm0, mm4          ;d1 + c1
+    psubw       mm5, mm4          ;d1 - c1
+    paddw       mm0, mm7          ;+3 rounding term
+    paddw       mm5, mm7
+    psraw       mm0, 3
+    psraw       mm5, 3
+;~~~~~ scatter: result i is stored at byte offset 32*i from output ~~~~~
+
+    movd        eax, mm1          ;low two words of (a1 + b1)
+    movd        ecx, mm0          ;low two words of (d1 + c1)
+    psrlq       mm0, 32           ;bring the high two words down for the second movd
+    psrlq       mm1, 32
+    mov         word ptr[rdx+32*0], ax
+    mov         word ptr[rdx+32*1], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*4], ax
+    mov         word ptr[rdx+32*5], cx
+    movd        eax, mm1
+    movd        ecx, mm0
+    mov         word ptr[rdx+32*8], ax
+    mov         word ptr[rdx+32*9], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*12], ax
+    mov         word ptr[rdx+32*13], cx
+
+    movd        eax, mm6          ;low two words of (a1 - b1)
+    movd        ecx, mm5          ;low two words of (d1 - c1)
+    psrlq       mm5, 32
+    psrlq       mm6, 32
+    mov         word ptr[rdx+32*2], ax
+    mov         word ptr[rdx+32*3], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*6], ax
+    mov         word ptr[rdx+32*7], cx
+    movd        eax, mm6
+    movd        ecx, mm5
+    mov         word ptr[rdx+32*10], ax
+    mov         word ptr[rdx+32*11], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*14], ax
+    mov         word ptr[rdx+32*15], cx
+
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
diff --git a/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm b/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm
new file mode 100644
index 0000000000..06e86a80b6
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm
@@ -0,0 +1,121 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
+global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE
+sym(vp8_short_inv_walsh4x4_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    ; end prolog
+; Two identical butterfly passes with a 4x4 word transpose between them, then (x + 3) >> 3.
+    mov         rcx, arg(0)         ; rcx = input
+    mov         rdx, arg(1)         ; rdx = output
+    mov         rax, 30003h         ; 3 in each of the two low words (rounding term, broadcast below)
+
+    movdqa      xmm0, [rcx + 0]     ;ip[4] ip[0]
+    movdqa      xmm1, [rcx + 16]    ;ip[12] ip[8]
+
+
+    pshufd      xmm2, xmm1, 4eh     ;ip[8] ip[12]
+    movdqa      xmm3, xmm0          ;ip[4] ip[0]
+
+    paddw       xmm0, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+    psubw       xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+    movdqa      xmm4, xmm0
+    punpcklqdq  xmm0, xmm3          ;d1 a1
+    punpckhqdq  xmm4, xmm3          ;c1 b1
+
+    movdqa      xmm1, xmm4          ;c1 b1
+    paddw       xmm4, xmm0          ;d1+c1 a1+b1 aka op[4] op[0]
+    psubw       xmm0, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
+
+    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    ; 13 12 11 10 03 02 01 00
+    ;
+    ; 33 32 31 30 23 22 21 20
+    ;
+    movdqa      xmm3, xmm4          ; 13 12 11 10 03 02 01 00
+    punpcklwd   xmm4, xmm0          ; 23 03 22 02 21 01 20 00
+    punpckhwd   xmm3, xmm0          ; 33 13 32 12 31 11 30 10
+    movdqa      xmm1, xmm4          ; 23 03 22 02 21 01 20 00
+    punpcklwd   xmm4, xmm3          ; 31 21 11 01 30 20 10 00
+    punpckhwd   xmm1, xmm3          ; 33 23 13 03 32 22 12 02
+    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    movd        xmm0, eax           ; 0x00030003 into the low dword
+    pshufd      xmm2, xmm1, 4eh     ;ip[8] ip[12]
+    movdqa      xmm3, xmm4          ;ip[4] ip[0]
+
+    pshufd      xmm0, xmm0, 0       ;03 03 03 03 03 03 03 03
+
+    paddw       xmm4, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+    psubw       xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+    movdqa      xmm5, xmm4
+    punpcklqdq  xmm4, xmm3          ;d1 a1
+    punpckhqdq  xmm5, xmm3          ;c1 b1
+
+    movdqa      xmm1, xmm5          ;c1 b1
+    paddw       xmm5, xmm4          ;d1+c1 a1+b1 aka op[4] op[0]
+    psubw       xmm4, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
+
+    paddw       xmm5, xmm0          ; + 3 (rounding)
+    paddw       xmm4, xmm0          ; + 3 (rounding)
+    psraw       xmm5, 3             ; arithmetic >> 3
+    psraw       xmm4, 3             ; arithmetic >> 3
+
+    movd        eax, xmm5           ; two result words at a time go through eax / ecx
+    movd        ecx, xmm4
+    psrldq      xmm5, 4             ; advance to the next pair of words
+    psrldq      xmm4, 4
+    mov         word ptr[rdx+32*0], ax      ; each result word lands 32 bytes after the previous slot
+    mov         word ptr[rdx+32*2], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*4], ax
+    mov         word ptr[rdx+32*6], cx
+    movd        eax, xmm5
+    movd        ecx, xmm4
+    psrldq      xmm5, 4
+    psrldq      xmm4, 4
+    mov         word ptr[rdx+32*8], ax
+    mov         word ptr[rdx+32*10], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*12], ax
+    mov         word ptr[rdx+32*14], cx
+
+    movd        eax, xmm5
+    movd        ecx, xmm4
+    psrldq      xmm5, 4
+    psrldq      xmm4, 4
+    mov         word ptr[rdx+32*1], ax
+    mov         word ptr[rdx+32*3], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*5], ax
+    mov         word ptr[rdx+32*7], cx
+    movd        eax, xmm5
+    movd        ecx, xmm4
+    mov         word ptr[rdx+32*9], ax
+    mov         word ptr[rdx+32*11], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*13], ax
+    mov         word ptr[rdx+32*15], cx
+
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/libs/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
new file mode 100644
index 0000000000..6d5aaa19db
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
@@ -0,0 +1,815 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro LF_ABS 2
+        ; %1: v1 (overwritten)   %2: v2 (preserved)   clobbers scratch1
+        ; computes the per-byte unsigned absolute difference of %1 and %2
+        ; output in %1
+        movdqa      scratch1, %2            ; v2
+
+        psubusb     scratch1, %1            ; v2 - v1, saturating at 0
+        psubusb     %1, %2                  ; v1 - v2, saturating at 0
+        por         %1, scratch1            ; abs(v2 - v1): at most one of the two is nonzero
+%endmacro
+
+%macro LF_FILTER_HEV_MASK 8-9
+; %1-%8: p3 p2 p1 p0 q0 q1 q2 q3 (%1-%7 overwritten); optional %9: precomputed abs(p1 - p0)
+        LF_ABS      %1, %2                  ; abs(p3 - p2)
+        LF_ABS      %2, %3                  ; abs(p2 - p1)
+        pmaxub      %1, %2                  ; accumulate mask
+%if %0 == 8
+        movdqa      scratch2, %3            ; save p1
+        LF_ABS      scratch2, %4            ; abs(p1 - p0)
+%endif
+        LF_ABS      %4, %5                  ; abs(p0 - q0)
+        LF_ABS      %5, %6                  ; abs(q0 - q1)
+%if %0 == 8
+        pmaxub      %5, scratch2            ; accumulate hev
+%else
+        pmaxub      %5, %9                  ; accumulate hev using the caller-supplied abs(p1 - p0)
+%endif
+        pmaxub      %1, %5                  ; accumulate mask
+
+        LF_ABS      %3, %6                  ; abs(p1 - q1)
+        LF_ABS      %6, %7                  ; abs(q1 - q2)
+        pmaxub      %1, %6                  ; accumulate mask
+        LF_ABS      %7, %8                  ; abs(q2 - q3), left in %7 for the caller
+        pmaxub      %1, %7                  ; accumulate mask
+
+        paddusb     %4, %4                  ; 2 * abs(p0 - q0)
+        pand        %3, [GLOBAL(tfe)]       ; clear bit 0 of every byte so the word shift cannot leak across bytes
+        psrlw       %3, 1                   ; abs(p1 - q1) / 2
+        paddusb     %4, %3                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+        psubusb     %1, [limit]             ; 0 where the largest neighbour difference <= limit
+        psubusb     %4, [blimit]            ; 0 where the edge term <= blimit
+        por         %1, %4
+        pcmpeqb     %1, zero                ; mask: 0xff where both limit tests pass
+
+        psubusb     %5, [thresh]            ; 0 where max(abs(p1 - p0), abs(q0 - q1)) <= thresh
+        pcmpeqb     %5, zero                ; ~hev
+%endmacro
+
+%macro LF_FILTER 6
+        ; %1-%4: p1 p0 q0 q1 (in/out: replaced by the filtered values)
+        ; %5: mask (destroyed)   clobbers scratch1, scratch2
+        ; %6: ~hev, as consumed by the pandn / pand below (preserved)
+
+        movdqa      scratch2, %6            ; copy of ~hev
+
+        pxor        %1, [GLOBAL(t80)]       ; ps1
+        pxor        %4, [GLOBAL(t80)]       ; qs1
+        movdqa      scratch1, %1
+        psubsb      scratch1, %4            ; signed_char_clamp(ps1 - qs1)
+        pandn       scratch2, scratch1      ; vp8_filter &= hev  (pandn inverts ~hev)
+
+        pxor        %2, [GLOBAL(t80)]       ; ps0
+        pxor        %3, [GLOBAL(t80)]       ; qs0
+        movdqa      scratch1, %3
+        psubsb      scratch1, %2            ; qs0 - ps0
+        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
+        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
+        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
+        pand        %5, scratch2            ; &= mask
+
+        movdqa      scratch2, %5
+        paddsb      %5, [GLOBAL(t4)]        ; Filter1
+        paddsb      scratch2, [GLOBAL(t3)]  ; Filter2
+
+        ; Filter1 >> 3  (arithmetic byte shift emulated with a word shift plus masks)
+        movdqa      scratch1, zero
+        pcmpgtb     scratch1, %5            ; 0xff where Filter1 < 0
+        psrlw       %5, 3                   ; logical shift on 16-bit words
+        pand        scratch1, [GLOBAL(te0)] ; sign fill for the top 3 bits
+        pand        %5, [GLOBAL(t1f)]       ; drop bits shifted in from the neighbouring byte
+        por         %5, scratch1
+
+        psubsb      %3, %5                  ; qs0 - Filter1
+        pxor        %3, [GLOBAL(t80)]       ; back to unsigned
+
+        ; Filter2 >> 3  (same emulated arithmetic shift)
+        movdqa      scratch1, zero
+        pcmpgtb     scratch1, scratch2
+        psrlw       scratch2, 3
+        pand        scratch1, [GLOBAL(te0)]
+        pand        scratch2, [GLOBAL(t1f)]
+        por         scratch2, scratch1
+
+        paddsb      %2, scratch2            ; ps0 + Filter2
+        pxor        %2, [GLOBAL(t80)]       ; back to unsigned
+
+        ; outer tap adjustments: (Filter1 + 1) >> 1, again as an emulated arithmetic shift
+        paddsb      %5, [GLOBAL(t1)]
+        movdqa      scratch1, zero
+        pcmpgtb     scratch1, %5            ; 0xff where the value is negative
+        psrlw       %5, 1
+        pand        scratch1, [GLOBAL(t80)] ; sign fill for the top bit
+        pand        %5, [GLOBAL(t7f)]       ; drop the bit shifted in from the neighbouring byte
+        por         %5, scratch1
+        pand        %5, %6                  ; vp8_filter &= ~hev
+
+        psubsb      %4, %5                  ; qs1 - vp8_filter
+        pxor        %4, [GLOBAL(t80)]       ; back to unsigned
+
+        paddsb      %1, %5                  ; ps1 + vp8_filter
+        pxor        %1, [GLOBAL(t80)]       ; back to unsigned
+%endmacro
+
+;void vp8_loop_filter_bh_y_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh
+;)
+global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
+sym(vp8_loop_filter_bh_y_sse2):
+; Filters three horizontal edges (rows 3|4, 7|8, 11|12) of a 16-row, 16-byte-wide area at src, in place.
+%if LIBVPX_YASM_WIN64
+    %define src      rcx ; src_ptr
+    %define stride   rdx ; src_pixel_step
+    %define blimit   r8
+    %define limit    r9
+    %define thresh   r10
+
+    %define spp      rax
+    %define stride3  r11
+    %define stride5  r12
+    %define stride7  r13
+
+    push    rbp
+    mov     rbp, rsp
+    SAVE_XMM 11
+    push    r12
+    push    r13
+    mov     thresh, arg(4)       ; the fifth argument is fetched through arg() on this ABI
+%else
+    %define src      rdi ; src_ptr
+    %define stride   rsi ; src_pixel_step
+    %define blimit   rdx
+    %define limit    rcx
+    %define thresh   r8
+
+    %define spp      rax
+    %define stride3  r9
+    %define stride5  r10
+    %define stride7  r11
+%endif
+
+    %define scratch1 xmm5
+    %define scratch2 xmm6
+    %define zero     xmm7
+
+    %define i0       [src]
+    %define i1       [spp]
+    %define i2       [src + 2 * stride]
+    %define i3       [spp + 2 * stride]
+    %define i4       [src + 4 * stride]
+    %define i5       [spp + 4 * stride]
+    %define i6       [src + 2 * stride3]
+    %define i7       [spp + 2 * stride3]
+    %define i8       [src + 8 * stride]
+    %define i9       [spp + 8 * stride]
+    %define i10      [src + 2 * stride5]
+    %define i11      [spp + 2 * stride5]
+    %define i12      [src + 4 * stride3]
+    %define i13      [spp + 4 * stride3]
+    %define i14      [src + 2 * stride7]
+    %define i15      [spp + 2 * stride7]
+
+    ; prep work
+    lea         spp, [src + stride]             ; spp = src + stride (base for the odd rows)
+    lea         stride3, [stride + 2 * stride]  ; 3 * stride
+    lea         stride5, [stride3 + 2 * stride] ; 5 * stride
+    lea         stride7, [stride3 + 4 * stride] ; 7 * stride
+    pxor        zero, zero                      ; all-zero register used by the macros' compares
+
+        ; load the first set into registers (rows 0-7 = p3..q3 around edge 3|4)
+        movdqa       xmm0, i0   ; movdqa: every row must be 16-byte aligned
+        movdqa       xmm1, i1
+        movdqa       xmm2, i2
+        movdqa       xmm3, i3
+        movdqa       xmm4, i4
+        movdqa       xmm8, i5
+        movdqa       xmm9, i6   ; q2; the macro leaves abs(q2 - q3) here = abs(p1 - p0) of the second set
+        movdqa       xmm10, i7
+LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
+
+        movdqa       xmm1, i2   ; reload p1..q1: the mask macro overwrote these registers
+        movdqa       xmm2, i3
+        movdqa       xmm3, i4
+        movdqa       xmm8, i5
+LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
+        movdqa       i2, xmm1
+        movdqa       i3, xmm2
+
+; second set
+        movdqa       i4, xmm3   ; filtered rows 4/5 stay in xmm3/xmm8 as p3/p2 of this set
+        movdqa       i5, xmm8
+
+        movdqa       xmm0, i6
+        movdqa       xmm1, i7
+        movdqa       xmm2, i8
+        movdqa       xmm4, i9
+        movdqa       xmm10, i10   ; q2; the macro leaves abs(q2 - q3) here = abs(p1 - p0) of the last set
+        movdqa       xmm11, i11
+LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
+
+        movdqa       xmm0, i6   ; reload p1..q1 for the filter
+        movdqa       xmm1, i7
+        movdqa       xmm4, i8
+        movdqa       xmm8, i9
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+        movdqa       i6, xmm0
+        movdqa       i7, xmm1
+
+; last set
+        movdqa       i8, xmm4   ; filtered rows 8/9 stay in xmm4/xmm8 as p3/p2 of this set
+        movdqa       i9, xmm8
+
+        movdqa       xmm0, i10
+        movdqa       xmm1, i11
+        movdqa       xmm2, i12
+        movdqa       xmm3, i13
+        movdqa       xmm9, i14   ; q2 (the abs(q2 - q3) left here is not reused: this is the last set)
+        movdqa       xmm11, i15
+LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
+
+        movdqa       xmm0, i10   ; reload p1..q1 for the filter
+        movdqa       xmm1, i11
+        movdqa       xmm3, i12
+        movdqa       xmm8, i13
+LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
+        movdqa       i10, xmm0
+        movdqa       i11, xmm1
+        movdqa       i12, xmm3
+        movdqa       i13, xmm8
+
+%if LIBVPX_YASM_WIN64
+    pop    r13
+    pop    r12
+    RESTORE_XMM
+    pop    rbp
+%endif
+
+    ret
+
+
+;void vp8_loop_filter_bv_y_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh
+;)
+
+global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
+sym(vp8_loop_filter_bv_y_sse2):
+; Transposes the 16x16 bytes at src onto the stack, filters stack rows 3|4, 7|8, 11|12, transposes back to src.
+%if LIBVPX_YASM_WIN64
+    %define src      rcx ; src_ptr
+    %define stride   rdx ; src_pixel_step
+    %define blimit   r8
+    %define limit    r9
+    %define thresh   r10
+
+    %define spp      rax
+    %define stride3  r11
+    %define stride5  r12
+    %define stride7  r13
+
+    push    rbp
+    mov     rbp, rsp
+    SAVE_XMM 15
+    push    r12
+    push    r13
+    mov     thresh, arg(4)       ; the fifth argument is fetched through arg() on this ABI
+%else
+    %define src      rdi
+    %define stride   rsi
+    %define blimit   rdx
+    %define limit    rcx
+    %define thresh   r8
+
+    %define spp      rax
+    %define stride3  r9
+    %define stride5  r10
+    %define stride7  r11
+%endif
+
+    %define scratch1 xmm5
+    %define scratch2 xmm6
+    %define zero     xmm7
+
+    %define s0       [src]
+    %define s1       [spp]
+    %define s2       [src + 2 * stride]
+    %define s3       [spp + 2 * stride]
+    %define s4       [src + 4 * stride]
+    %define s5       [spp + 4 * stride]
+    %define s6       [src + 2 * stride3]
+    %define s7       [spp + 2 * stride3]
+    %define s8       [src + 8 * stride]
+    %define s9       [spp + 8 * stride]
+    %define s10      [src + 2 * stride5]
+    %define s11      [spp + 2 * stride5]
+    %define s12      [src + 4 * stride3]
+    %define s13      [spp + 4 * stride3]
+    %define s14      [src + 2 * stride7]
+    %define s15      [spp + 2 * stride7]
+
+    %define i0       [rsp]
+    %define i1       [rsp + 16]
+    %define i2       [rsp + 32]
+    %define i3       [rsp + 48]
+    %define i4       [rsp + 64]
+    %define i5       [rsp + 80]
+    %define i6       [rsp + 96]
+    %define i7       [rsp + 112]
+    %define i8       [rsp + 128]
+    %define i9       [rsp + 144]
+    %define i10      [rsp + 160]
+    %define i11      [rsp + 176]
+    %define i12      [rsp + 192]
+    %define i13      [rsp + 208]
+    %define i14      [rsp + 224]
+    %define i15      [rsp + 240]
+
+    ALIGN_STACK 16, rax          ; rax is free here: spp (= rax) is only set up below
+
+    ; reserve stack space
+    %define      temp_storage  0 ; size is 256 (16*16)
+    %define      stack_size 256
+    sub          rsp, stack_size
+
+    ; prep work
+    lea         spp, [src + stride]             ; spp = src + stride (base for the odd rows)
+    lea         stride3, [stride + 2 * stride]  ; 3 * stride
+    lea         stride5, [stride3 + 2 * stride] ; 5 * stride
+    lea         stride7, [stride3 + 4 * stride] ; 7 * stride
+
+        ; 8-f (source rows 8..15)
+        movdqa      xmm0, s8
+        movdqa      xmm1, xmm0
+        punpcklbw   xmm0, s9                ; 80 90
+        punpckhbw   xmm1, s9                ; 88 98
+
+        movdqa      xmm2, s10
+        movdqa      xmm3, xmm2
+        punpcklbw   xmm2, s11 ; a0 b0
+        punpckhbw   xmm3, s11 ; a8 b8
+
+        movdqa      xmm4, xmm0
+        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
+        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4
+
+        movdqa      xmm2, xmm1
+        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
+        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc
+
+        ; using xmm[0124]
+        ; work on next 4 rows
+
+        movdqa      xmm3, s12
+        movdqa      xmm5, xmm3
+        punpcklbw   xmm3, s13 ; c0 d0
+        punpckhbw   xmm5, s13 ; c8 d8
+
+        movdqa      xmm6, s14
+        movdqa      xmm7, xmm6
+        punpcklbw   xmm6, s15 ; e0 f0
+        punpckhbw   xmm7, s15 ; e8 f8
+
+        movdqa      xmm8, xmm3
+        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
+        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4
+
+        movdqa      xmm6, xmm5
+        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
+        punpckhwd   xmm6, xmm7              ; cc dc ec fc
+
+        ; pull the third and fourth sets together
+
+        movdqa      xmm7, xmm0
+        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
+        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2
+
+        movdqa      xmm3, xmm4
+        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
+        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6
+
+        movdqa      xmm8, xmm1
+        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
+        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa
+
+        movdqa      xmm5, xmm2
+        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
+        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe
+
+        ; save the calculations. we only have 16 xmm registers ...
+        movdqa      i0, xmm0
+        movdqa      i1, xmm7
+        movdqa      i2, xmm4
+        movdqa      i3, xmm3
+        movdqa      i4, xmm1
+        movdqa      i5, xmm8
+        movdqa      i6, xmm2
+        movdqa      i7, xmm5
+
+        ; 0-7 (source rows 0..7)
+        movdqa      xmm0, s0
+        movdqa      xmm1, xmm0
+        punpcklbw   xmm0, s1 ; 00 10
+        punpckhbw   xmm1, s1 ; 08 18
+
+        movdqa      xmm2, s2
+        movdqa      xmm3, xmm2
+        punpcklbw   xmm2, s3 ; 20 30
+        punpckhbw   xmm3, s3 ; 28 38
+
+        movdqa      xmm4, xmm0
+        punpcklwd   xmm0, xmm2              ; 00 10 20 30
+        punpckhwd   xmm4, xmm2              ; 04 14 24 34
+
+        movdqa      xmm2, xmm1
+        punpcklwd   xmm1, xmm3              ; 08 18 28 38
+        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c
+
+        ; using xmm[0124]
+        ; work on next 4 rows
+
+        movdqa      xmm3, s4
+        movdqa      xmm5, xmm3
+        punpcklbw   xmm3, s5 ; 40 50
+        punpckhbw   xmm5, s5 ; 48 58
+
+        movdqa      xmm6, s6
+        movdqa      xmm7, xmm6
+        punpcklbw   xmm6, s7   ; 60 70
+        punpckhbw   xmm7, s7   ; 68 78
+
+        movdqa      xmm8, xmm3
+        punpcklwd   xmm3, xmm6              ; 40 50 60 70
+        punpckhwd   xmm8, xmm6              ; 44 54 64 74
+
+        movdqa      xmm6, xmm5
+        punpcklwd   xmm5, xmm7              ; 48 58 68 78
+        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c
+
+        ; pull the first two sets together
+
+        movdqa      xmm7, xmm0
+        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
+        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72
+
+        movdqa      xmm3, xmm4
+        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
+        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76
+
+        movdqa      xmm8, xmm1
+        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
+        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+        movdqa      xmm5, xmm2
+        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
+        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
+        ; final combination (low halves from rows 0-7, high halves from the saved rows 8-f)
+
+        movdqa      xmm6, xmm0
+        punpcklqdq  xmm0, i0
+        punpckhqdq  xmm6, i0
+
+        movdqa      xmm9, xmm7
+        punpcklqdq  xmm7, i1
+        punpckhqdq  xmm9, i1
+
+        movdqa      xmm10, xmm4
+        punpcklqdq  xmm4, i2
+        punpckhqdq  xmm10, i2
+
+        movdqa      xmm11, xmm3
+        punpcklqdq  xmm3, i3
+        punpckhqdq  xmm11, i3
+
+        movdqa      xmm12, xmm1
+        punpcklqdq  xmm1, i4
+        punpckhqdq  xmm12, i4
+
+        movdqa      xmm13, xmm8
+        punpcklqdq  xmm8, i5
+        punpckhqdq  xmm13, i5
+
+        movdqa      xmm14, xmm2
+        punpcklqdq  xmm2, i6
+        punpckhqdq  xmm14, i6
+
+        movdqa      xmm15, xmm5
+        punpcklqdq  xmm5, i7
+        punpckhqdq  xmm15, i7
+
+        movdqa      i0, xmm0
+        movdqa      i1, xmm6
+        movdqa      i2, xmm7
+        movdqa      i3, xmm9
+        movdqa      i4, xmm4
+        movdqa      i5, xmm10
+        movdqa      i6, xmm3
+        movdqa      i7, xmm11
+        movdqa      i8, xmm1
+        movdqa      i9, xmm12
+        movdqa      i10, xmm8
+        movdqa      i11, xmm13
+        movdqa      i12, xmm2
+        movdqa      i13, xmm14
+        movdqa      i14, xmm5
+        movdqa      i15, xmm15
+
+; TRANSPOSED DATA AVAILABLE ON THE STACK
+
+        movdqa      xmm12, xmm6             ; xmm6 is scratch2: move stack row 1 out of the way
+        movdqa      xmm13, xmm7             ; xmm7 becomes zero: move stack row 2 out of the way
+
+        pxor        zero, zero
+
+LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
+
+        movdqa       xmm1, i2   ; reload p1..q1 from the stack copy for the filter
+        movdqa       xmm2, i3
+        movdqa       xmm8, i4
+        movdqa       xmm9, i5
+LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
+        movdqa       i2, xmm1
+        movdqa       i3, xmm2
+
+; second set
+        movdqa       i4, xmm8   ; filtered rows 4/5 stay in xmm8/xmm9 as p3/p2 of this set
+        movdqa       i5, xmm9
+
+        movdqa       xmm0, i6
+        movdqa       xmm1, i7
+        movdqa       xmm2, i8
+        movdqa       xmm4, i9
+        movdqa       xmm10, i10   ; q2; the macro leaves abs(q2 - q3) here = abs(p1 - p0) of the last set
+        movdqa       xmm11, i11
+LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
+
+        movdqa       xmm0, i6   ; reload p1..q1 for the filter
+        movdqa       xmm1, i7
+        movdqa       xmm3, i8
+        movdqa       xmm4, i9
+LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
+        movdqa       i6, xmm0
+        movdqa       i7, xmm1
+
+; last set
+        movdqa       i8, xmm3   ; filtered rows 8/9 stay in xmm3/xmm4 as p3/p2 of this set
+        movdqa       i9, xmm4
+
+        movdqa       xmm0, i10
+        movdqa       xmm1, i11
+        movdqa       xmm2, i12
+        movdqa       xmm8, i13
+        movdqa       xmm9, i14   ; q2 (the abs(q2 - q3) left here is not reused: this is the last set)
+        movdqa       xmm11, i15
+LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
+
+        movdqa       xmm0, i10   ; reload p1..q1 for the filter
+        movdqa       xmm1, i11
+        movdqa       xmm4, i12
+        movdqa       xmm8, i13
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+        movdqa       i10, xmm0
+        movdqa       i11, xmm1
+        movdqa       i12, xmm4
+        movdqa       i13, xmm8
+
+
+; RESHUFFLE AND WRITE OUT
+        ; 8-f (stack rows 8..15)
+        movdqa      xmm0, i8
+        movdqa      xmm1, xmm0
+        punpcklbw   xmm0, i9                ; 80 90
+        punpckhbw   xmm1, i9                ; 88 98
+
+        movdqa      xmm2, i10
+        movdqa      xmm3, xmm2
+        punpcklbw   xmm2, i11               ; a0 b0
+        punpckhbw   xmm3, i11               ; a8 b8
+
+        movdqa      xmm4, xmm0
+        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
+        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4
+
+        movdqa      xmm2, xmm1
+        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
+        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc
+
+        ; using xmm[0124]
+        ; work on next 4 rows
+
+        movdqa      xmm3, i12
+        movdqa      xmm5, xmm3
+        punpcklbw   xmm3, i13               ; c0 d0
+        punpckhbw   xmm5, i13               ; c8 d8
+
+        movdqa      xmm6, i14
+        movdqa      xmm7, xmm6
+        punpcklbw   xmm6, i15               ; e0 f0
+        punpckhbw   xmm7, i15               ; e8 f8
+
+        movdqa      xmm8, xmm3
+        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
+        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4
+
+        movdqa      xmm6, xmm5
+        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
+        punpckhwd   xmm6, xmm7              ; cc dc ec fc
+
+        ; pull the third and fourth sets together
+
+        movdqa      xmm7, xmm0
+        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
+        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2
+
+        movdqa      xmm3, xmm4
+        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
+        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6
+
+        movdqa      xmm8, xmm1
+        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
+        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa
+
+        movdqa      xmm5, xmm2
+        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
+        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe
+
+        ; save the calculations. we only have 16 xmm registers ...
+        movdqa      i8, xmm0
+        movdqa      i9, xmm7
+        movdqa      i10, xmm4
+        movdqa      i11, xmm3
+        movdqa      i12, xmm1
+        movdqa      i13, xmm8
+        movdqa      i14, xmm2
+        movdqa      i15, xmm5
+
+        ; 0-7 (stack rows 0..7)
+        movdqa      xmm0, i0
+        movdqa      xmm1, xmm0
+        punpcklbw   xmm0, i1                ; 00 10
+        punpckhbw   xmm1, i1                ; 08 18
+
+        movdqa      xmm2, i2
+        movdqa      xmm3, xmm2
+        punpcklbw   xmm2, i3                ; 20 30
+        punpckhbw   xmm3, i3                ; 28 38
+
+        movdqa      xmm4, xmm0
+        punpcklwd   xmm0, xmm2              ; 00 10 20 30
+        punpckhwd   xmm4, xmm2              ; 04 14 24 34
+
+        movdqa      xmm2, xmm1
+        punpcklwd   xmm1, xmm3              ; 08 18 28 38
+        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c
+
+        ; using xmm[0124]
+        ; work on next 4 rows
+
+        movdqa      xmm3, i4
+        movdqa      xmm5, xmm3
+        punpcklbw   xmm3, i5                ; 40 50
+        punpckhbw   xmm5, i5                ; 48 58
+
+        movdqa      xmm6, i6
+        movdqa      xmm7, xmm6
+        punpcklbw   xmm6, i7                ; 60 70
+        punpckhbw   xmm7, i7                ; 68 78
+
+        movdqa      xmm8, xmm3
+        punpcklwd   xmm3, xmm6              ; 40 50 60 70
+        punpckhwd   xmm8, xmm6              ; 44 54 64 74
+
+        movdqa      xmm6, xmm5
+        punpcklwd   xmm5, xmm7              ; 48 58 68 78
+        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c
+
+        ; pull the first two sets together
+
+        movdqa      xmm7, xmm0
+        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
+        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72
+
+        movdqa      xmm3, xmm4
+        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
+        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76
+
+        movdqa      xmm8, xmm1
+        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
+        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+        movdqa      xmm5, xmm2
+        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
+        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
+        ; final combination (low halves from rows 0-7, high halves from the saved rows 8-f)
+
+        movdqa      xmm6, xmm0
+        punpcklqdq  xmm0, i8
+        punpckhqdq  xmm6, i8
+
+        movdqa      xmm9, xmm7
+        punpcklqdq  xmm7, i9
+        punpckhqdq  xmm9, i9
+
+        movdqa      xmm10, xmm4
+        punpcklqdq  xmm4, i10
+        punpckhqdq  xmm10, i10
+
+        movdqa      xmm11, xmm3
+        punpcklqdq  xmm3, i11
+        punpckhqdq  xmm11, i11
+
+        movdqa      xmm12, xmm1
+        punpcklqdq  xmm1, i12
+        punpckhqdq  xmm12, i12
+
+        movdqa      xmm13, xmm8
+        punpcklqdq  xmm8, i13
+        punpckhqdq  xmm13, i13
+
+        movdqa      xmm14, xmm2
+        punpcklqdq  xmm2, i14
+        punpckhqdq  xmm14, i14
+
+        movdqa      xmm15, xmm5
+        punpcklqdq  xmm5, i15
+        punpckhqdq  xmm15, i15
+
+        movdqa      s0, xmm0
+        movdqa      s1, xmm6
+        movdqa      s2, xmm7
+        movdqa      s3, xmm9
+        movdqa      s4, xmm4
+        movdqa      s5, xmm10
+        movdqa      s6, xmm3
+        movdqa      s7, xmm11
+        movdqa      s8, xmm1
+        movdqa      s9, xmm12
+        movdqa      s10, xmm8
+        movdqa      s11, xmm13
+        movdqa      s12, xmm2
+        movdqa      s13, xmm14
+        movdqa      s14, xmm5
+        movdqa      s15, xmm15
+
+    ; free stack space
+    add          rsp, stack_size
+
+    ; un-ALIGN_STACK
+    pop          rsp
+
+%if LIBVPX_YASM_WIN64
+    pop    r13
+    pop    r12
+    RESTORE_XMM
+    pop    rbp
+%endif
+
+    ret
+
+SECTION_RODATA
+align 16
+te0:
+    times 16 db 0xe0        ; top 3 bits: sign fill for the emulated byte >> 3 in LF_FILTER
+align 16
+t7f:
+    times 16 db 0x7f        ; low 7 bits: kept after the emulated byte >> 1 in LF_FILTER
+align 16
+tfe:
+    times 16 db 0xfe        ; clears bit 0 before the word shift by 1 in LF_FILTER_HEV_MASK
+align 16
+t1f:
+    times 16 db 0x1f        ; low 5 bits: kept after the emulated byte >> 3 in LF_FILTER
+align 16
+t80:
+    times 16 db 0x80        ; sign bit: flips unsigned <-> signed, and sign fill for >> 1
+align 16
+t1:
+    times 16 db 0x01        ; added before the >> 1 of the outer tap adjustment
+align 16
+t3:
+    times 16 db 0x03        ; added to the masked filter value to form Filter2
+align 16
+t4:
+    times 16 db 0x04        ; added to the masked filter value to form Filter1
diff --git a/libs/libvpx/vp8/common/x86/loopfilter_sse2.asm b/libs/libvpx/vp8/common/x86/loopfilter_sse2.asm
new file mode 100644
index 0000000000..1913abc69b
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/loopfilter_sse2.asm
@@ -0,0 +1,1640 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+; Scratch layout: the _xx names defined below are byte offsets (used as [rsp+_xx]) of ten consecutive 16-byte slots; 10 * 16 = lf_var_size.
+%include "vpx_ports/x86_abi_support.asm"
+%define _t0 0
+%define _t1 _t0 + 16
+%define _p3 _t1 + 16
+%define _p2 _p3 + 16
+%define _p1 _p2 + 16
+%define _p0 _p1 + 16
+%define _q0 _p0 + 16
+%define _q1 _q0 + 16
+%define _q2 _q1 + 16
+%define _q3 _q2 + 16
+%define lf_var_size 160
+
+; LFH_FILTER_AND_HEV_MASK %1: builds the filter mask (xmm1) and the high-edge-variance mask (xmm4); also leaves q0 in xmm0 and p0 in xmm6.
+;   %1 != 0: 16-byte rows, rsi = q0 row, rdi = q1 row, rax = pitch (negated inside). %1 == 0: rsi/rdi give the low/high 8 bytes, rax = -pitch, rcx = pitch. rdx = limit on entry.
+; Use of pmaxub instead of psubusb to compute filter mask was seen in ffvp8
+%macro LFH_FILTER_AND_HEV_MASK 1
+%if %1
+        movdqa      xmm2,                   [rdi+2*rax]       ; q3
+        movdqa      xmm1,                   [rsi+2*rax]       ; q2
+        movdqa      xmm4,                   [rsi+rax]         ; q1
+        movdqa      xmm5,                   [rsi]             ; q0
+        neg         rax                     ; negate pitch to deal with above border
+%else
+        movlps      xmm2,                   [rsi + rcx*2]     ; q3 (low 8 bytes come from rsi)
+        movlps      xmm1,                   [rsi + rcx]       ; q2
+        movlps      xmm4,                   [rsi]             ; q1
+        movlps      xmm5,                   [rsi + rax]       ; q0
+
+        movhps      xmm2,                   [rdi + rcx*2]     ; q3 (high 8 bytes come from rdi)
+        movhps      xmm1,                   [rdi + rcx]       ; q2
+        movhps      xmm4,                   [rdi]             ; q1
+        movhps      xmm5,                   [rdi + rax]       ; q0
+
+        lea         rsi,                    [rsi + rax*4]     ; step both pointers by 4*rax for the p-row loads below
+        lea         rdi,                    [rdi + rax*4]
+
+        movdqa      [rsp+_q2],              xmm1              ; store q2
+        movdqa      [rsp+_q1],              xmm4              ; store q1
+%endif
+        movdqa      xmm7,                   [rdx]             ; limit
+
+        movdqa      xmm6,                   xmm1              ; q2
+        movdqa      xmm3,                   xmm4              ; q1
+
+        psubusb     xmm1,                   xmm2              ; q2-=q3
+        psubusb     xmm2,                   xmm6              ; q3-=q2
+
+        psubusb     xmm4,                   xmm6              ; q1-=q2
+        psubusb     xmm6,                   xmm3              ; q2-=q1
+
+        por         xmm4,                   xmm6              ; abs(q2-q1)
+        por         xmm1,                   xmm2              ; abs(q3-q2)
+
+        movdqa      xmm0,                   xmm5              ; q0
+        pmaxub      xmm1,                   xmm4              ; xmm1 = running max of the neighbour differences
+
+        psubusb     xmm5,                   xmm3              ; q0-=q1
+        psubusb     xmm3,                   xmm0              ; q1-=q0
+
+        por         xmm5,                   xmm3              ; abs(q0-q1)
+        movdqa      [rsp+_t0],              xmm5              ; save to t0
+
+        pmaxub      xmm1,                   xmm5              ; fold abs(q0-q1) into the running max
+
+%if %1
+        movdqa      xmm2,                   [rsi+4*rax]       ; p3
+        movdqa      xmm4,                   [rdi+4*rax]       ; p2
+        movdqa      xmm6,                   [rsi+2*rax]       ; p1
+%else
+        movlps      xmm2,                   [rsi + rax]       ; p3
+        movlps      xmm4,                   [rsi]             ; p2
+        movlps      xmm6,                   [rsi + rcx]       ; p1
+
+        movhps      xmm2,                   [rdi + rax]       ; p3 (high 8 bytes)
+        movhps      xmm4,                   [rdi]             ; p2 (high 8 bytes)
+        movhps      xmm6,                   [rdi + rcx]       ; p1 (high 8 bytes)
+
+        movdqa      [rsp+_p2],              xmm4              ; store p2
+        movdqa      [rsp+_p1],              xmm6              ; store p1
+%endif
+
+        movdqa      xmm5,                   xmm4              ; p2
+        movdqa      xmm3,                   xmm6              ; p1
+
+        psubusb     xmm4,                   xmm2              ; p2-=p3
+        psubusb     xmm2,                   xmm5              ; p3-=p2
+
+        psubusb     xmm3,                   xmm5              ; p1-=p2
+        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2): saturated halves go straight into the max, no por
+
+        psubusb     xmm5,                   xmm6              ; p2-=p1
+        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2), other half
+
+        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
+        movdqa      xmm2,                   xmm6              ; p1
+
+        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1), other half
+%if %1
+        movdqa      xmm4,                   [rsi+rax]         ; p0
+        movdqa      xmm3,                   [rdi]             ; q1
+%else
+        movlps      xmm4,                   [rsi + rcx*2]     ; p0
+        movhps      xmm4,                   [rdi + rcx*2]
+        movdqa      xmm3,                   [rsp+_q1]                ; q1
+%endif
+
+        movdqa      xmm5,                   xmm4              ; p0
+        psubusb     xmm4,                   xmm6              ; p0-=p1
+
+        psubusb     xmm6,                   xmm5              ; p1-=p0
+
+        por         xmm6,                   xmm4              ; abs(p1 - p0)
+        mov         rdx,                    arg(2)            ; get blimit
+
+        movdqa     [rsp+_t1],               xmm6              ; save to t1
+
+        movdqa      xmm4,                   xmm3              ; q1
+        pmaxub      xmm1,                   xmm6              ; fold abs(p1-p0) into the running max
+
+        psubusb     xmm3,                   xmm2              ; q1-=p1
+        psubusb     xmm2,                   xmm4              ; p1-=q1
+
+        psubusb     xmm1,                   xmm7              ; nonzero where the max difference exceeds limit
+        por         xmm2,                   xmm3              ; abs(p1-q1)
+
+        movdqa      xmm7,                   [rdx]             ; blimit
+        mov         rdx,                    arg(4)            ; hev get thresh
+
+        movdqa      xmm3,                   xmm0              ; q0
+        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero so the word shift below cannot carry into the next byte
+
+        movdqa      xmm6,                   xmm5              ; p0
+        psrlw       xmm2,                   1                 ; abs(p1-q1)/2
+
+        psubusb     xmm5,                   xmm3              ; p0-=q0
+        psubusb     xmm3,                   xmm6              ; q0-=p0
+        por         xmm5,                   xmm3              ; abs(p0 - q0)
+
+        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
+
+        movdqa      xmm4,                   [rsp+_t0]                ; hev get abs (q1 - q0)
+        movdqa      xmm3,                   [rsp+_t1]                ; get abs (p1 - p0)
+
+        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        movdqa      xmm2,                   [rdx]             ; hev thresh
+
+        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        psubusb     xmm4,                   xmm2              ; hev: nonzero where abs(q1 - q0) > thresh
+
+        psubusb     xmm3,                   xmm2              ; hev: nonzero where abs(p1 - p0) > thresh
+        por         xmm1,                   xmm5              ; combine limit and blimit violations
+
+        pxor        xmm7,                   xmm7              ; zero
+        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+        pcmpeqb     xmm4,                   xmm5              ; hev: compared with xmm5, which is 0 in every lane where the final mask (xmm1) is set
+        pcmpeqb     xmm3,                   xmm3              ; all ones
+
+        pcmpeqb     xmm1,                   xmm7              ; mask xmm1: 0xff where no violation was recorded
+        pxor        xmm4,                   xmm3              ; hev: invert the compare result
+%endmacro
+; B_FILTER %1: in xmm1 = filter mask, xmm4 = hev mask (%1 = 0/1 also xmm6 = p0, xmm0 = q0); out xmm6/xmm1/xmm3/xmm7 = new p0/p1/q0/q1, stored only for %1 = 0 (8-byte halves via rsi/rdi) or 1 (16-byte rows).
+%macro B_FILTER 1
+        movdqa      xmm3,                   [GLOBAL(t80)]
+%if %1 == 0
+        movdqa      xmm2,                   [rsp+_p1]                ; p1
+        movdqa      xmm7,                   [rsp+_q1]                ; q1
+%elif %1 == 1
+        movdqa      xmm2,                   [rsi+2*rax]       ; p1
+        movdqa      xmm7,                   [rdi]             ; q1
+%elif %1 == 2
+        movdqa      xmm2,                   [rsp+_p1]         ; p1
+        movdqa      xmm6,                   [rsp+_p0]         ; p0
+        movdqa      xmm0,                   [rsp+_q0]         ; q0
+        movdqa      xmm7,                   [rsp+_q1]         ; q1
+%endif
+
+        pxor        xmm2,                   xmm3              ; p1 offset to convert to signed values
+        pxor        xmm7,                   xmm3              ; q1 offset to convert to signed values
+
+        psubsb      xmm2,                   xmm7              ; p1 - q1
+        pxor        xmm6,                   xmm3              ; p0 offset to convert to signed values
+
+        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
+        pxor        xmm0,                   xmm3              ; q0 offset to convert to signed values
+
+        movdqa      xmm3,                   xmm0              ; q0
+        psubsb      xmm0,                   xmm6              ; q0 - p0
+        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand        xmm1,                   xmm2              ; mask filter values we don't care about
+
+        movdqa      xmm2,                   xmm1
+        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+        punpckhbw   xmm5,                   xmm2              ; axbxcxdx: each byte lands in the high half of a word (x = don't care)
+        punpcklbw   xmm2,                   xmm2              ; exfxgxhx
+
+        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
+        psraw       xmm5,                   11                ; sign extended shift right by 3 (8 drops the low byte, then 3)
+
+        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
+        psraw       xmm2,                   11                ; sign extended shift right by 3
+
+        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+        psraw       xmm0,                   11                ; sign extended shift right by 3
+
+        psraw       xmm1,                   11                ; sign extended shift right by 3
+        movdqa      xmm5,                   xmm0              ; save results (word form, low 8 lanes)
+
+        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+
+        paddsb      xmm6,                   xmm2              ; p0+= p0 add
+
+        movdqa      xmm2,                   [GLOBAL(ones)]    ; 'ones' is defined outside this chunk; added per word before the >> 1 below
+        paddsw      xmm5,                   xmm2
+        paddsw      xmm1,                   xmm2
+        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
+        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap
+        packsswb    xmm5,                   xmm1              ; (((3* (q0 - p0) + hvm(p1 - q1) + 4) >>3) + ones) >> 1
+        movdqa      xmm2,                   [GLOBAL(t80)]
+
+%if %1 == 0
+        movdqa      xmm1,                   [rsp+_p1]         ; p1
+        lea         rsi,                    [rsi + rcx*2]
+        lea         rdi,                    [rdi + rcx*2]
+%elif %1 == 1
+        movdqa      xmm1,                   [rsi+2*rax]       ; p1
+%elif %1 == 2
+        movdqa      xmm1,                   [rsp+_p1]         ; p1
+%endif
+
+        pandn       xmm4,                   xmm5              ; ~hev & xmm5: outer-tap adjustment kept only where hev is clear
+        pxor        xmm6,                   xmm2              ; unoffset
+
+        pxor        xmm1,                   xmm2              ; p1 offset to convert to signed values
+        psubsb      xmm3,                   xmm0              ; q0-= q0 add
+
+        paddsb      xmm1,                   xmm4              ; p1+= p1 add
+        pxor        xmm3,                   xmm2              ; unoffset
+
+        pxor        xmm1,                   xmm2              ; unoffset
+        psubsb      xmm7,                   xmm4              ; q1-= q1 add
+
+        pxor        xmm7,                   xmm2              ; unoffset
+%if %1 == 0
+        movq        [rsi],                  xmm6              ; p0
+        movhps      [rdi],                  xmm6
+        movq        [rsi + rax],            xmm1              ; p1
+        movhps      [rdi + rax],            xmm1
+        movq        [rsi + rcx],            xmm3              ; q0
+        movhps      [rdi + rcx],            xmm3
+        movq        [rsi + rcx*2],          xmm7              ; q1
+        movhps      [rdi + rcx*2],          xmm7
+%elif %1 == 1
+        movdqa      [rsi+rax],              xmm6              ; write back p0
+        movdqa      [rsi+2*rax],            xmm1              ; write back p1
+        movdqa      [rsi],                  xmm3              ; write back q0
+        movdqa      [rdi],                  xmm7              ; write back q1
+%endif
+
+%endmacro
+
+%if ABI_IS_32BIT
+; This function is assembled only when ABI_IS_32BIT is true.
+;void vp8_loop_filter_horizontal_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh
+;)
+global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi                         ; saved here, restored in the epilog
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, lf_var_size            ; reserve the scratch slots addressed as [rsp+_xx]
+
+        mov         rsi,                    arg(0)           ;src_ptr
+        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step
+
+        mov         rdx,                    arg(3)           ;limit
+
+        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing
+
+        ; calculate breakout conditions and high edge variance
+        LFH_FILTER_AND_HEV_MASK 1
+        ; filter and write back the result
+        B_FILTER 1
+
+    add rsp, lf_var_size                    ; free the scratch slots
+    pop rsp                                 ; un-ALIGN_STACK
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%endif
+
+;void vp8_loop_filter_horizontal_edge_uv_sse2
+;(
+;    unsigned char *u,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    unsigned char *v
+;)
+global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_uv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi                         ; saved here, restored in the epilog
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, lf_var_size            ; reserve the scratch slots addressed as [rsp+_xx]
+
+        mov         rsi,                    arg(0)             ; u
+        mov         rdi,                    arg(5)             ; v
+        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
+        mov         rcx,                    rax                ; rcx keeps the positive pitch
+        neg         rax                     ; negate pitch to deal with above border
+
+        mov         rdx,                    arg(3)             ;limit
+
+        lea         rsi,                    [rsi + rcx]        ; u + pitch
+        lea         rdi,                    [rdi + rcx]        ; v + pitch
+
+        ; calculate breakout conditions and high edge variance
+        LFH_FILTER_AND_HEV_MASK 0
+        ; filter and write back the result
+        B_FILTER 0
+
+    add rsp, lf_var_size                    ; free the scratch slots
+    pop rsp                                 ; un-ALIGN_STACK
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+; MB_FILTER_AND_WRITEBACK %1: in xmm1 = filter mask, xmm4 = hev mask (%1 = 0/1 also xmm6 = p0, xmm0 = q0). %1: 0 = store p2..q2 as 8-byte halves via rsi/rdi, 1 = store 16-byte rows, 2 = store p1/p0/q0/q1 to stack slots (p2 stays in xmm2, q2 in xmm5).
+%macro MB_FILTER_AND_WRITEBACK 1
+        movdqa      xmm3,                   [GLOBAL(t80)]
+%if %1 == 0
+        movdqa      xmm2,                   [rsp+_p1]              ; p1
+        movdqa      xmm7,                   [rsp+_q1]              ; q1
+%elif %1 == 1
+        movdqa      xmm2,                   [rsi+2*rax]     ; p1
+        movdqa      xmm7,                   [rdi]           ; q1
+
+        mov         rcx,                    rax
+        neg         rcx                                     ; rcx = -rax
+%elif %1 == 2
+        movdqa      xmm2,                   [rsp+_p1]       ; p1
+        movdqa      xmm6,                   [rsp+_p0]       ; p0
+        movdqa      xmm0,                   [rsp+_q0]       ; q0
+        movdqa      xmm7,                   [rsp+_q1]       ; q1
+%endif
+
+        pxor        xmm2,                   xmm3            ; p1 offset to convert to signed values
+        pxor        xmm7,                   xmm3            ; q1 offset to convert to signed values
+        pxor        xmm6,                   xmm3            ; p0 offset to convert to signed values
+        pxor        xmm0,                   xmm3            ; q0 offset to convert to signed values
+
+        psubsb      xmm2,                   xmm7            ; p1 - q1
+
+        movdqa      xmm3,                   xmm0            ; q0
+        psubsb      xmm0,                   xmm6            ; q0 - p0
+        paddsb      xmm2,                   xmm0            ; 1 * (q0 - p0) + (p1 - q1)
+        paddsb      xmm2,                   xmm0            ; 2 * (q0 - p0) + (p1 - q1)
+        paddsb      xmm2,                   xmm0            ; 3 * (q0 - p0) + (p1 - q1)
+        pand        xmm1,                   xmm2            ; mask filter values we don't care about
+
+        movdqa      xmm2,                   xmm1            ; vp8_filter
+
+        pand        xmm2,                   xmm4            ; Filter2 = vp8_filter & hev
+        pxor        xmm0,                   xmm0
+
+        pandn       xmm4,                   xmm1            ; vp8_filter&=~hev
+        pxor        xmm1,                   xmm1
+
+        punpcklbw   xmm0,                   xmm4            ; low 8 lanes of vp8_filter&~hev, byte in the top half of a zeroed word (labelled "hi" below)
+        punpckhbw   xmm1,                   xmm4            ; high 8 lanes, same layout (labelled "lo" below)
+
+        movdqa      xmm5,                   xmm2
+
+        movdqa      xmm4,                   [GLOBAL(s9)]    ; s9 is defined outside this chunk; multiplier for the "* 9" pmulhw below
+        paddsb      xmm5,                   [GLOBAL(t3)]    ; vp8_signed_char_clamp(Filter2 + 3)
+        paddsb      xmm2,                   [GLOBAL(t4)]    ; vp8_signed_char_clamp(Filter2 + 4)
+
+        pmulhw      xmm1,                   xmm4            ; Filter 2 (lo) * 9
+        pmulhw      xmm0,                   xmm4            ; Filter 2 (hi) * 9
+
+        punpckhbw   xmm7,                   xmm5            ; axbxcxdx: each byte lands in the high half of a word (x = don't care)
+        punpcklbw   xmm5,                   xmm5            ; exfxgxhx
+
+        psraw       xmm7,                   11              ; sign extended shift right by 3
+
+        psraw       xmm5,                   11              ; sign extended shift right by 3
+        punpckhbw   xmm4,                   xmm2            ; axbxcxdx
+
+        punpcklbw   xmm2,                   xmm2            ; exfxgxhx
+        psraw       xmm4,                   11              ; sign extended shift right by 3
+
+        packsswb    xmm5,                   xmm7            ; Filter2 >>=3;
+        psraw       xmm2,                   11              ; sign extended shift right by 3
+
+        packsswb    xmm2,                   xmm4            ; Filter1 >>=3;
+
+        paddsb      xmm6,                   xmm5            ; ps0 =ps0 + Filter2
+
+        psubsb      xmm3,                   xmm2            ; qs0 =qs0 - Filter1
+        movdqa      xmm7,                   xmm1
+
+        movdqa      xmm4,                   [GLOBAL(s63)]   ; s63 is defined outside this chunk; rounding term added before the >> 7 below
+        movdqa      xmm5,                   xmm0
+        movdqa      xmm2,                   xmm5
+        paddw       xmm0,                   xmm4            ; Filter 2 (hi) * 9 + 63
+        paddw       xmm1,                   xmm4            ; Filter 2 (lo) * 9 + 63
+        movdqa      xmm4,                   xmm7
+
+        paddw       xmm5,                   xmm5            ; Filter 2 (hi) * 18
+
+        paddw       xmm7,                   xmm7            ; Filter 2 (lo) * 18
+        paddw       xmm5,                   xmm0            ; Filter 2 (hi) * 27 + 63
+
+        paddw       xmm7,                   xmm1            ; Filter 2 (lo) * 27 + 63
+        paddw       xmm2,                   xmm0            ; Filter 2 (hi) * 18 + 63
+        psraw       xmm0,                   7               ; (Filter 2 (hi) * 9 + 63) >> 7
+
+        paddw       xmm4,                   xmm1            ; Filter 2 (lo) * 18 + 63
+        psraw       xmm1,                   7               ; (Filter 2 (lo) * 9 + 63) >> 7
+        psraw       xmm2,                   7               ; (Filter 2 (hi) * 18 + 63) >> 7
+
+        packsswb    xmm0,                   xmm1            ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+        psraw       xmm4,                   7               ; (Filter 2 (lo) * 18 + 63) >> 7
+        psraw       xmm5,                   7               ; (Filter 2 (hi) * 27 + 63) >> 7
+        psraw       xmm7,                   7               ; (Filter 2 (lo) * 27 + 63) >> 7
+
+        packsswb    xmm5,                   xmm7            ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+        packsswb    xmm2,                   xmm4            ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+        movdqa      xmm7,                   [GLOBAL(t80)]
+
+%if %1 == 0
+        movdqa      xmm1,                   [rsp+_q1]       ; q1
+        movdqa      xmm4,                   [rsp+_p1]       ; p1
+        lea         rsi,                    [rsi+rcx*2]
+        lea         rdi,                    [rdi+rcx*2]
+
+%elif %1 == 1
+        movdqa      xmm1,                   [rdi]           ; q1
+        movdqa      xmm4,                   [rsi+rax*2]     ; p1
+%elif %1 == 2
+        movdqa      xmm4,                   [rsp+_p1]       ; p1
+        movdqa      xmm1,                   [rsp+_q1]       ; q1
+%endif
+
+        pxor        xmm1,                   xmm7            ; q1 offset to convert to signed values
+        pxor        xmm4,                   xmm7            ; p1 offset to convert to signed values
+
+        psubsb      xmm3,                   xmm5            ; sq = vp8_signed_char_clamp(qs0 - u3)
+        paddsb      xmm6,                   xmm5            ; sp = vp8_signed_char_clamp(ps0 + u3)
+        psubsb      xmm1,                   xmm2            ; sq = vp8_signed_char_clamp(qs1 - u2)
+        paddsb      xmm4,                   xmm2            ; sp = vp8_signed_char_clamp(ps1 + u2)
+
+%if %1 == 1
+        movdqa      xmm2,                   [rdi+rax*4]     ; p2
+        movdqa      xmm5,                   [rdi+rcx]       ; q2
+%else
+        movdqa      xmm2,                   [rsp+_p2]       ; p2
+        movdqa      xmm5,                   [rsp+_q2]       ; q2
+%endif
+
+        pxor        xmm1,                   xmm7            ; *oq1 = sq^0x80;
+        pxor        xmm4,                   xmm7            ; *op1 = sp^0x80;
+        pxor        xmm2,                   xmm7            ; p2 offset to convert to signed values
+        pxor        xmm5,                   xmm7            ; q2 offset to convert to signed values
+        paddsb      xmm2,                   xmm0            ; sp = vp8_signed_char_clamp(ps2 + u)
+        psubsb      xmm5,                   xmm0            ; sq = vp8_signed_char_clamp(qs2 - u)
+        pxor        xmm2,                   xmm7            ; *op2 = sp^0x80;
+        pxor        xmm5,                   xmm7            ; *oq2 = sq^0x80;
+        pxor        xmm3,                   xmm7            ; *oq0 = sq^0x80
+        pxor        xmm6,                   xmm7            ; *op0 = sp^0x80
+%if %1 == 0
+        movq        [rsi],                  xmm6            ; p0
+        movhps      [rdi],                  xmm6
+        movq        [rsi + rcx],            xmm3            ; q0
+        movhps      [rdi + rcx],            xmm3
+        lea         rdx,                    [rcx + rcx*2]   ; rdx = 3 * rcx, used for the q2 stores
+        movq        [rsi+rcx*2],            xmm1            ; q1
+        movhps      [rdi+rcx*2],            xmm1
+
+        movq        [rsi + rax],            xmm4            ; p1
+        movhps      [rdi + rax],            xmm4
+
+        movq        [rsi+rax*2],            xmm2            ; p2
+        movhps      [rdi+rax*2],            xmm2
+
+        movq        [rsi+rdx],              xmm5            ; q2
+        movhps      [rdi+rdx],              xmm5
+%elif %1 == 1
+        movdqa      [rdi+rcx],              xmm5            ; q2
+        movdqa      [rdi],                  xmm1            ; q1
+        movdqa      [rsi],                  xmm3            ; q0
+        movdqa      [rsi+rax  ],            xmm6            ; p0
+        movdqa      [rsi+rax*2],            xmm4            ; p1
+        movdqa      [rdi+rax*4],            xmm2            ; p2
+%elif %1 == 2
+        movdqa      [rsp+_p1],              xmm4            ; p1
+        movdqa      [rsp+_p0],              xmm6            ; p0
+        movdqa      [rsp+_q0],              xmm3            ; q0
+        movdqa      [rsp+_q1],              xmm1            ; q1
+%endif
+
+%endmacro
+
+
+;void vp8_mbloop_filter_horizontal_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi                         ; saved here, restored in the epilog
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, lf_var_size            ; reserve the scratch slots addressed as [rsp+_xx]
+
+        mov         rsi,                    arg(0)            ;src_ptr
+        movsxd      rax,                    dword ptr arg(1)  ;src_pixel_step
+        mov         rdx,                    arg(3)            ;limit
+
+        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing
+
+        ; calculate breakout conditions and high edge variance
+        LFH_FILTER_AND_HEV_MASK 1
+        ; filter and write back the results
+        MB_FILTER_AND_WRITEBACK 1
+
+    add rsp, lf_var_size                    ; free the scratch slots
+    pop rsp                                 ; un-ALIGN_STACK
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_mbloop_filter_horizontal_edge_uv_sse2
+;(
+;    unsigned char *u,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    unsigned char *v
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi                         ; saved here, restored in the epilog
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, lf_var_size            ; reserve the scratch slots addressed as [rsp+_xx]
+
+        mov         rsi,                    arg(0)             ; u
+        mov         rdi,                    arg(5)             ; v
+        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
+        mov         rcx,                    rax                ; rcx keeps the positive pitch
+        neg         rax                     ; negate pitch to deal with above border
+        mov         rdx,                    arg(3)             ;limit
+
+        lea         rsi,                    [rsi + rcx]        ; u + pitch
+        lea         rdi,                    [rdi + rcx]        ; v + pitch
+
+        ; calculate breakout conditions and high edge variance
+        LFH_FILTER_AND_HEV_MASK 0
+        ; filter and write back the results
+        MB_FILTER_AND_WRITEBACK 0
+
+    add rsp, lf_var_size                    ; free the scratch slots
+    pop rsp                                 ; un-ALIGN_STACK
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+%macro TRANSPOSE_16X8 2
+        movq        xmm4,               [rsi]           ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+        movq        xmm1,               [rdi]           ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+        movq        xmm7,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+        movq        xmm5,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+        movq        xmm2,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+
+        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+
+        movq        xmm1,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+
+        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+        punpcklbw   xmm0,               xmm7            ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
+
+        movq        xmm7,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+
+        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+%if %1
+        lea         rsi,                [rsi+rax*8]
+        lea         rdi,                [rdi+rax*8]
+%else
+        mov         rsi,                arg(5)          ; v_ptr
+%endif
+
+        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+
+%if %1 == 0
+        lea         rdi,                [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
+        lea         rsi,                [rsi - 4]
+%endif
+
+        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+
+        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+        movdqa      [rsp+_t0],          xmm2            ; save to free XMM2
+
+        movq        xmm2,               [rsi]           ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+        movq        xmm6,               [rdi]           ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+        movq        xmm5,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+        movq        xmm1,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+
+        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+        movq        xmm6,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
+        punpcklbw   xmm0,               xmm5            ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+        movq        xmm5,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
+
+        movq        xmm6,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+
+        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+
+        movdqa      xmm6,               xmm1            ;
+        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+
+        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+
+        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+        movdqa      xmm0,               xmm5
+        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+
+        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+
+%if %2 == 0
+        movdqa      [rsp+_q3],          xmm7            ; save 7
+        movdqa      [rsp+_q2],          xmm6            ; save 6
+%endif
+        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        movdqa      [rsp+_p1],          xmm2            ; save 2
+
+        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        movdqa      [rsp+_p0],          xmm3            ; save 3
+
+        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+        movdqa      [rsp+_q0],          xmm4            ; save 4
+        movdqa      [rsp+_q1],          xmm5            ; save 5
+        movdqa      xmm1,               [rsp+_t0]
+
+        movdqa      xmm2,               xmm1            ;
+        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+%if %2 == 0
+        movdqa      [rsp+_p2],          xmm1
+        movdqa      [rsp+_p3],          xmm2
+%endif
+
+%endmacro
+; LFV_FILTER_MASK_HEV_MASK: from the transposed columns p3..q3, leaves the filter mask in xmm1 and the high-edge-variance (hev) mask in xmm4.
+%macro LFV_FILTER_MASK_HEV_MASK 0
+        movdqa      xmm0,               xmm6            ; q2
+        psubusb     xmm0,               xmm7            ; q2-q3
+        ; on entry: xmm1=p2 xmm2=p3 xmm3=p0 xmm5=q1 xmm6=q2 xmm7=q3; p1, q0 and q1 are read from the stack
+        psubusb     xmm7,               xmm6            ; q3-q2
+        movdqa      xmm4,               xmm5            ; q1
+        ; abs-difference idiom used throughout: (a -sat b) | (b -sat a), psubusb being an unsigned saturating subtract
+        por         xmm7,               xmm0            ; abs (q3-q2)
+        psubusb     xmm4,               xmm6            ; q1-q2
+
+        movdqa      xmm0,               xmm1            ; p2
+        psubusb     xmm6,               xmm5            ; q2-q1
+
+        por         xmm6,               xmm4            ; abs (q2-q1)
+        psubusb     xmm0,               xmm2            ; p2 - p3;
+
+        psubusb     xmm2,               xmm1            ; p3 - p2;
+        por         xmm0,               xmm2            ; abs(p2-p3)
+
+        movdqa      xmm5,               [rsp+_p1]       ; p1
+        pmaxub      xmm0,               xmm7            ; xmm0 accumulates the max of all the abs differences
+
+        movdqa      xmm2,               xmm5            ; p1
+        psubusb     xmm5,               xmm1            ; p1-p2
+        psubusb     xmm1,               xmm2            ; p2-p1
+
+        movdqa      xmm7,               xmm3            ; p0
+        psubusb     xmm7,               xmm2            ; p0-p1
+
+        por         xmm1,               xmm5            ; abs(p2-p1)
+        pmaxub      xmm0,               xmm6
+
+        pmaxub      xmm0,               xmm1
+        movdqa      xmm1,               xmm2            ; p1
+
+        psubusb     xmm2,               xmm3            ; p1-p0
+
+        por         xmm2,               xmm7            ; abs(p1-p0)
+        ; xmm2 keeps abs(p1-p0) for the thresh test further down
+        pmaxub      xmm0,               xmm2
+
+        movdqa      xmm5,               [rsp+_q0]       ; q0
+        movdqa      xmm7,               [rsp+_q1]       ; q1
+
+        mov         rdx,                arg(3)          ; limit
+
+        movdqa      xmm6,               xmm5            ; q0
+        movdqa      xmm4,               xmm7            ; q1
+
+        psubusb     xmm5,               xmm7            ; q0-q1
+        psubusb     xmm7,               xmm6            ; q1-q0
+
+        por         xmm7,               xmm5            ; abs(q1-q0)
+        ; xmm7 keeps abs(q1-q0) for the thresh test further down
+        pmaxub      xmm0,               xmm7
+
+        psubusb     xmm0,               [rdx]           ; max abs diff - limit: non-zero where the limit is exceeded
+
+        mov         rdx,                arg(2)          ; blimit
+        movdqa      xmm5,               xmm4            ; q1
+
+        psubusb     xmm5,               xmm1            ; q1-=p1
+        psubusb     xmm1,               xmm4            ; p1-=q1
+
+        por         xmm5,               xmm1            ; abs(p1-q1)
+        movdqa      xmm1,               xmm3            ; p0
+
+        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
+        psubusb     xmm1,               xmm6            ; p0-q0
+
+        movdqa      xmm4,               [rdx]           ; blimit
+        mov         rdx,                arg(4)          ; get thresh
+        ; the lsbs were cleared above so the 16-bit shift cannot carry a bit into the neighbouring byte
+        psrlw       xmm5,               1               ; abs(p1-q1)/2
+        psubusb     xmm6,               xmm3            ; q0-p0
+
+        por         xmm1,               xmm6            ; abs(q0-p0)
+        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
+        movdqa      xmm3,               [rdx]           ; thresh
+
+        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
+        psubusb     xmm2,               xmm3            ; abs(p1 - p0) > thresh
+
+        psubusb     xmm7,               xmm3            ; abs(q1 - q0) > thresh
+
+        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         xmm2,               xmm7            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+        ; xmm0 still holds the limit excess at this point; it is only zeroed two instructions later
+        por         xmm1,               xmm0            ; mask
+        pcmpeqb     xmm2,               xmm0            ; compared against xmm0 (limit excess), not against zero
+        ; NOTE(review): that compare equals a zero test only where xmm0 == 0, i.e. in lanes the mask (xmm1) keeps -- presumably hev is ignored elsewhere; confirm
+        pxor        xmm0,               xmm0            ; zero
+        pcmpeqb     xmm4,               xmm4            ; all ones
+
+        pcmpeqb     xmm1,               xmm0            ; mask: 0xff where neither limit nor blimit was exceeded
+        pxor        xmm4,               xmm2            ; hev: bitwise NOT of xmm2
+%endmacro
+; BV_TRANSPOSE: interleaves four 16-byte columns (xmm1, xmm6, xmm3, xmm7) into 4-byte row groups in xmm2, xmm6, xmm1, xmm5; xmm3 and xmm4 are overwritten.
+%macro BV_TRANSPOSE 0
+        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+
+        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+
+        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+
+        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+%endmacro
+; BV_WRITEBACK a, b: stores the four dwords of each operand at byte offset +2 of eight addresses built from rsi/rdi/rax/rcx; both operands are shifted out (destroyed).
+%macro BV_WRITEBACK 2
+        movd        [rsi+2],            %1              ; 1st operand, dword 0 (rows below assume rdi = rsi + rax and rcx = 3*rax, as the callers in this file set up)
+        movd        [rsi+4*rax+2],      %2              ; 2nd operand, dword 0 -> row +4
+        psrldq      %1,                 4               ; shift the next dword down (byte shift of the whole register)
+        psrldq      %2,                 4
+        movd        [rdi+2],            %1              ; row +1
+        movd        [rdi+4*rax+2],      %2              ; row +5
+        psrldq      %1,                 4
+        psrldq      %2,                 4
+        movd        [rsi+2*rax+2],      %1              ; row +2
+        movd        [rsi+2*rcx+2],      %2              ; row +6
+        psrldq      %1,                 4
+        psrldq      %2,                 4
+        movd        [rdi+2*rax+2],      %1              ; row +3
+        movd        [rdi+2*rcx+2],      %2              ; row +7
+%endmacro
+
+%if ABI_IS_32BIT
+; Loop filter for a vertical edge spanning 16 rows at src_ptr; only the four columns p1, p0, q0, q1 are written back. Assembled only when ABI_IS_32BIT is non-zero.
+;void vp8_loop_filter_vertical_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;)
+global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
+sym(vp8_loop_filter_vertical_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub             rsp, lf_var_size
+
+        mov         rsi,        arg(0)                  ; src_ptr
+        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
+
+        lea         rsi,        [rsi - 4]               ; start 4 bytes to the left of src_ptr
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+        lea         rcx,        [rax*2+rax]             ; rcx = 3 * src_pixel_step
+
+        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+        TRANSPOSE_16X8 1, 1
+
+        ; calculate filter mask and high edge variance
+        LFV_FILTER_MASK_HEV_MASK
+
+        ; start work on filters
+        B_FILTER 2
+
+        ; transpose and write back - only work on q1, q0, p0, p1
+        BV_TRANSPOSE
+        ; store 16-line result
+
+        lea         rdx,        [rax]                   ; rdx = src_pixel_step
+        neg         rdx                                 ; rdx = -src_pixel_step
+        ; rows 8-f first. NOTE(review): assumes TRANSPOSE_16X8 left rsi/rdi on the second group of 8 rows -- confirm
+        BV_WRITEBACK xmm1, xmm5
+        ; step rsi/rdi back 8 rows, then store rows 0-7
+        lea         rsi,        [rsi+rdx*8]
+        lea         rdi,        [rdi+rdx*8]
+        BV_WRITEBACK xmm2, xmm6
+
+    add rsp, lf_var_size
+    pop rsp                                             ; NOTE(review): presumably restores the rsp value saved by ALIGN_STACK -- confirm
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%endif
+; Two-plane (u, v) form of the vertical-edge loop filter: 16 rows are filtered in one pass and columns p1, p0, q0, q1 are written back.
+;void vp8_loop_filter_vertical_edge_uv_sse2
+;(
+;    unsigned char *u,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    unsigned char *v
+;)
+global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
+sym(vp8_loop_filter_vertical_edge_uv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub             rsp, lf_var_size
+
+        mov         rsi,        arg(0)                  ; u_ptr
+        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
+
+        lea         rsi,        [rsi - 4]               ; start 4 bytes to the left of u
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+        lea         rcx,        [rax+2*rax]             ; rcx = 3 * src_pixel_step
+
+        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+        TRANSPOSE_16X8 0, 1
+
+        ; calculate filter mask and high edge variance
+        LFV_FILTER_MASK_HEV_MASK
+
+        ; start work on filters
+        B_FILTER 2
+
+        ; transpose and write back - only work on q1, q0, p0, p1
+        BV_TRANSPOSE
+        ; rows 8-f go through rsi as TRANSPOSE_16X8 0 left it. NOTE(review): presumably v - 4, loaded inside that macro -- confirm
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+
+        ; store 16-line result
+        BV_WRITEBACK xmm1, xmm5
+        ; rows 0-7 go back to the u plane
+        mov         rsi,        arg(0)                  ; u_ptr
+        lea         rsi,        [rsi - 4]
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+        BV_WRITEBACK xmm2, xmm6
+
+    add rsp, lf_var_size
+    pop rsp                                             ; NOTE(review): presumably restores the rsp value saved by ALIGN_STACK -- confirm
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+; MBV_TRANSPOSE: first half of transposing the eight columns back to rows; expects column 1 in xmm2 and column 6 in xmm5, the other columns in the stack slots.
+%macro MBV_TRANSPOSE 0
+        movdqa      xmm0,               [rsp+_p3]           ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+        movdqa      xmm1,               xmm0                ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+        punpcklbw   xmm0,               xmm2                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+        punpckhbw   xmm1,               xmm2                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+        movdqa      xmm7,               [rsp+_p1]           ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        movdqa      xmm6,               xmm7                ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+        punpcklbw   xmm7,               [rsp+_p0]           ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+        punpckhbw   xmm6,               [rsp+_p0]           ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        movdqa      xmm3,               xmm0                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+        punpcklwd   xmm0,               xmm7                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+        punpckhwd   xmm3,               xmm7                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+        movdqa      xmm4,               xmm1                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+        punpcklwd   xmm1,               xmm6                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+        punpckhwd   xmm4,               xmm6                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        punpcklbw   xmm7,               [rsp+_q1]           ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+        movdqa      xmm6,               xmm5                ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+        punpcklbw   xmm6,               [rsp+_q3]           ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
+
+        movdqa      xmm2,               xmm7                ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+        punpcklwd   xmm7,               xmm6                ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
+
+        punpckhwd   xmm2,               xmm6                ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
+        movdqa      xmm6,               xmm0                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+        ; on exit: xmm0 = rows 0-1, xmm6 = rows 2-3 (complete); xmm3/xmm2 (rows 4-7), xmm1 (rows 8-b) and xmm4 (rows c-f) are finished by MBV_WRITEBACK_1/2
+        punpckldq   xmm0,               xmm7                ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
+        punpckhdq   xmm6,               xmm7                ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
+%endmacro
+; MBV_WRITEBACK_1: stores rows 0-7 (8 bytes each) from the MBV_TRANSPOSE results, then builds rows 8-b in xmm1/xmm5 and the upper half of rows c-f in xmm7 for MBV_WRITEBACK_2.
+%macro MBV_WRITEBACK_1 0
+        movq        [rsi],              xmm0                ; row 0 (row numbers assume rdi = rsi + rax and rcx = 3*rax, as the callers in this file set up)
+        movhps      [rdi],              xmm0                ; row 1
+
+        movq        [rsi+2*rax],        xmm6                ; row 2
+        movhps      [rdi+2*rax],        xmm6                ; row 3
+
+        movdqa      xmm0,               xmm3                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+        punpckldq   xmm0,               xmm2                ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
+        punpckhdq   xmm3,               xmm2                ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
+
+        movq        [rsi+4*rax],        xmm0                ; row 4
+        movhps      [rdi+4*rax],        xmm0                ; row 5
+
+        movq        [rsi+2*rcx],        xmm3                ; row 6
+        movhps      [rdi+2*rcx],        xmm3                ; row 7
+        ; from here on nothing is stored: the high halves of columns 4-7 are interleaved for rows 8-f
+        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        punpckhbw   xmm7,               [rsp+_q1]           ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+        punpckhbw   xmm5,               [rsp+_q3]           ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
+
+        movdqa      xmm0,               xmm7
+        punpcklwd   xmm0,               xmm5                ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
+        punpckhwd   xmm7,               xmm5                ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
+
+        movdqa      xmm5,               xmm1                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+        punpckldq   xmm1,               xmm0                ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
+        punpckhdq   xmm5,               xmm0                ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
+%endmacro
+; MBV_WRITEBACK_2: stores rows 8-f (8 bytes each) from xmm1, xmm5, xmm4 and xmm7 as left by MBV_TRANSPOSE and MBV_WRITEBACK_1; rsi/rdi must already address rows 8 and 9.
+%macro MBV_WRITEBACK_2 0
+        movq        [rsi],              xmm1                ; row 8
+        movhps      [rdi],              xmm1                ; row 9
+
+        movq        [rsi+2*rax],        xmm5                ; row a
+        movhps      [rdi+2*rax],        xmm5                ; row b
+
+        movdqa      xmm1,               xmm4                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+        punpckldq   xmm1,               xmm7                ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
+        punpckhdq   xmm4,               xmm7                ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
+
+        movq        [rsi+4*rax],        xmm1                ; row c
+        movhps      [rdi+4*rax],        xmm1                ; row d
+
+        movq        [rsi+2*rcx],        xmm4                ; row e
+        movhps      [rdi+2*rcx],        xmm4                ; row f
+%endmacro
+
+; mbloop form of the vertical-edge filter for 16 rows at src_ptr: after filtering, all eight columns are transposed back and every row is rewritten as 8 bytes.
+;void vp8_mbloop_filter_vertical_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;)
+global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, lf_var_size
+
+        mov         rsi,                arg(0)              ; src_ptr
+        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step
+
+        lea         rsi,                [rsi - 4]           ; start 4 bytes to the left of src_ptr
+        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
+        lea         rcx,                [rax*2+rax]         ; rcx = 3 * src_pixel_step
+
+        ; Transpose
+        TRANSPOSE_16X8 1, 0
+
+        ; calculate filter mask and high edge variance
+        LFV_FILTER_MASK_HEV_MASK
+
+        neg         rax                                     ; rax = -src_pixel_step from here until the second neg
+        ; start work on filters
+        MB_FILTER_AND_WRITEBACK 2
+        ; rax is negative, so this steps rsi/rdi back 8 rows. NOTE(review): assumes TRANSPOSE_16X8 1 left them on row 8 -- confirm
+        lea         rsi,                [rsi+rax*8]
+        lea         rdi,                [rdi+rax*8]
+
+        ; transpose and write back
+        MBV_TRANSPOSE
+
+        neg         rax                                     ; rax = +src_pixel_step again for the stores
+
+        MBV_WRITEBACK_1
+
+        ; advance 8 rows so MBV_WRITEBACK_2 stores rows 8-f
+        lea         rsi,                [rsi+rax*8]
+        lea         rdi,                [rdi+rax*8]
+        MBV_WRITEBACK_2
+
+    add rsp, lf_var_size
+    pop rsp                                                 ; NOTE(review): presumably restores the rsp value saved by ALIGN_STACK -- confirm
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+; Two-plane mbloop vertical-edge filter: rows 0-7 are written back through u (arg 0) and rows 8-f through v (arg 5), 8 bytes per row.
+;void vp8_mbloop_filter_vertical_edge_uv_sse2
+;(
+;    unsigned char *u,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    unsigned char *v
+;)
+global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, lf_var_size
+
+        mov         rsi,                arg(0)              ; u_ptr
+        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step
+
+        lea         rsi,                [rsi - 4]           ; start 4 bytes to the left of u
+        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
+        lea         rcx,                [rax+2*rax]         ; rcx = 3 * src_pixel_step
+
+        ; Transpose
+        TRANSPOSE_16X8 0, 0
+
+        ; calculate filter mask and high edge variance
+        LFV_FILTER_MASK_HEV_MASK
+
+        ; start work on filters
+        MB_FILTER_AND_WRITEBACK 2
+
+        ; transpose and write back
+        MBV_TRANSPOSE
+        ; rsi/rdi are rebuilt from the arguments for each half, so no pointer stepping is needed here
+        mov         rsi,                arg(0)             ;u_ptr
+        lea         rsi,                [rsi - 4]
+        lea         rdi,                [rsi + rax]
+        MBV_WRITEBACK_1
+        mov         rsi,                arg(5)             ;v_ptr
+        lea         rsi,                [rsi - 4]
+        lea         rdi,                [rsi + rax]
+        MBV_WRITEBACK_2
+
+    add rsp, lf_var_size
+    pop rsp                                                 ; NOTE(review): presumably restores the rsp value saved by ALIGN_STACK -- confirm
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+; Simple loop filter across a horizontal edge: reads the 16-byte rows p1, p0, q0, q1 around src_ptr with aligned loads and rewrites only p0 and q0.
+;void vp8_loop_filter_simple_horizontal_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit,
+;)
+global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
+sym(vp8_loop_filter_simple_horizontal_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 7
+    GET_GOT     rbx
+    ; end prolog
+
+        mov         rcx, arg(0)             ;src_ptr
+        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
+        movdqa      xmm6, [GLOBAL(tfe)]
+        lea         rdx, [rcx + rax]        ; rdx = row below src_ptr (q1)
+        neg         rax                     ; rax = -src_pixel_step, used to reach the rows above (p0, p1)
+
+        ; calculate mask
+        movdqa      xmm0, [rdx]             ; q1
+        mov         rdx, arg(2)             ;blimit
+        movdqa      xmm1, [rcx+2*rax]       ; p1
+
+        movdqa      xmm2, xmm1
+        movdqa      xmm3, xmm0
+
+        psubusb     xmm0, xmm1              ; q1-=p1
+        psubusb     xmm1, xmm3              ; p1-=q1
+        por         xmm1, xmm0              ; abs(p1-q1)
+        pand        xmm1, xmm6              ; set lsb of each byte to zero
+        psrlw       xmm1, 1                 ; abs(p1-q1)/2
+
+        movdqa      xmm7, XMMWORD PTR [rdx] ; blimit
+
+        movdqa      xmm5, [rcx+rax]         ; p0
+        movdqa      xmm4, [rcx]             ; q0
+        movdqa      xmm0, xmm4              ; q0
+        movdqa      xmm6, xmm5              ; p0
+        psubusb     xmm5, xmm4              ; p0-=q0
+        psubusb     xmm4, xmm6              ; q0-=p0
+        por         xmm5, xmm4              ; abs(p0 - q0)
+
+        movdqa      xmm4, [GLOBAL(t80)]
+
+        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
+        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+        psubusb     xmm5, xmm7              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        pxor        xmm7, xmm7
+        pcmpeqb     xmm5, xmm7              ; mask = 0xff where the blimit test passed
+
+
+        ; start work on filters
+        pxor        xmm2, xmm4     ; p1 offset to convert to signed values
+        pxor        xmm3, xmm4     ; q1 offset to convert to signed values
+        psubsb      xmm2, xmm3              ; p1 - q1
+
+        pxor        xmm6, xmm4     ; offset to convert to signed values
+        pxor        xmm0, xmm4     ; offset to convert to signed values
+        movdqa      xmm3, xmm0              ; q0
+        psubsb      xmm0, xmm6              ; q0 - p0
+        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
+        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
+        pand        xmm5, xmm2              ; mask filter values we don't care about
+
+        movdqa      xmm0, xmm5
+        paddsb      xmm5,        [GLOBAL(t3)]                  ; filter + t3 (presumably +3 per byte -- confirm); becomes the p0 adjustment
+        paddsb      xmm0,        [GLOBAL(t4)]                  ; filter + t4 (presumably +4 per byte -- confirm); becomes the q0 adjustment
+
+        movdqa      xmm1, [GLOBAL(te0)]
+        movdqa      xmm2, [GLOBAL(t1f)]
+        ; SSE2 has no per-byte arithmetic shift: >>3 is done as a word shift, then the top 3 bits of each byte are patched with the sign
+        ; xmm7 is still zero (cleared for the mask compare above), so it is not cleared again here
+        pcmpgtb     xmm7, xmm0              ;save sign
+        pand        xmm7, xmm1              ;preserve the upper 3 bits
+        psrlw       xmm0, 3
+        pand        xmm0, xmm2              ;clear out upper 3 bits
+        por         xmm0, xmm7              ;add sign
+        psubsb      xmm3, xmm0              ; q0 -= (filter + t4) >> 3
+
+        pxor        xmm7, xmm7
+        pcmpgtb     xmm7, xmm5              ;save sign
+        pand        xmm7, xmm1              ;preserve the upper 3 bits
+        psrlw       xmm5, 3
+        pand        xmm5, xmm2              ;clear out upper 3 bits
+        por         xmm5, xmm7              ;add sign
+        paddsb      xmm6, xmm5              ; p0 += (filter + t3) >> 3
+
+        pxor        xmm3, xmm4     ; unoffset
+        movdqa      [rcx], xmm3             ; write back
+
+        pxor        xmm6, xmm4     ; unoffset
+        movdqa      [rcx+rax], xmm6         ; write back
+
+    ; begin epilog
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit,
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_sse2):
+    push        rbp         ; save old base pointer value.
+    mov         rbp, rsp    ; set new base pointer value.
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 7
+    GET_GOT     rbx         ; save callee-saved reg
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 32                         ; reserve 32 bytes
+    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
+    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
+
+        lea         rsi,        [rsi - 2 ]
+        lea         rdi,        [rsi + rax]
+        lea         rdx,        [rsi + rax*4]
+        lea         rcx,        [rdx + rax]
+
+        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
+        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
+        movd        xmm2,       [rdi]                   ; 13 12 11 10
+        movd        xmm3,       [rcx]                   ; 53 52 51 50
+        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
+        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
+
+        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
+        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
+        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
+        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
+        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
+        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
+
+        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+
+        movdqa      xmm1,       xmm0
+        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+        movdqa      xmm2,       xmm0
+        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+        lea         rsi,        [rsi + rax*8]
+        lea         rdi,        [rsi + rax]
+        lea         rdx,        [rsi + rax*4]
+        lea         rcx,        [rdx + rax]
+
+        movd        xmm4,       [rsi]                   ; 83 82 81 80
+        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
+        movd        xmm6,       [rdi]                   ; 93 92 91 90
+        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
+        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
+        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
+
+        movd        xmm1,       [rsi + rax*2]           ; a3 a2 a1 a0
+        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
+        movd        xmm3,       [rdi + rax*2]           ; b3 b2 b1 b0
+        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
+        punpckldq   xmm1,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
+        punpckldq   xmm3,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
+
+        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
+        punpcklbw   xmm1,       xmm3                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+
+        movdqa      xmm7,       xmm4
+        punpcklwd   xmm4,       xmm1                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+        punpckhwd   xmm7,       xmm1                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+
+        movdqa      xmm6,       xmm4
+        punpckldq   xmm4,       xmm7                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+        punpckhdq   xmm6,       xmm7                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+
+        movdqa      xmm1,       xmm0
+        movdqa      xmm3,       xmm2
+
+        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+        mov         rdx,        arg(2)                          ;blimit
+
+        ; calculate mask
+        movdqa      xmm6,       xmm0                            ; p1
+        movdqa      xmm7,       xmm3                            ; q1
+        psubusb     xmm7,       xmm0                            ; q1-=p1
+        psubusb     xmm6,       xmm3                            ; p1-=q1
+        por         xmm6,       xmm7                            ; abs(p1-q1)
+        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
+        psrlw       xmm6,       1                               ; abs(p1-q1)/2
+
+        movdqa      xmm7, [rdx]
+
+        movdqa      xmm5,       xmm1                            ; p0
+        movdqa      xmm4,       xmm2                            ; q0
+        psubusb     xmm5,       xmm2                            ; p0-=q0
+        psubusb     xmm4,       xmm1                            ; q0-=p0
+        por         xmm5,       xmm4                            ; abs(p0 - q0)
+        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
+        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        movdqa      xmm4, [GLOBAL(t80)]
+
+        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        pxor        xmm7,        xmm7
+        pcmpeqb     xmm5,        xmm7                           ; mm5 = mask
+
+        ; start work on filters
+        movdqa        t0,        xmm0
+        movdqa        t1,        xmm3
+
+        pxor        xmm0,        xmm4                  ; p1 offset to convert to signed values
+        pxor        xmm3,        xmm4                  ; q1 offset to convert to signed values
+        psubsb      xmm0,        xmm3                           ; p1 - q1
+
+        pxor        xmm1,        xmm4                  ; offset to convert to signed values
+        pxor        xmm2,        xmm4                  ; offset to convert to signed values
+
+        movdqa      xmm3,        xmm2                           ; offseted ; q0
+        psubsb      xmm2,        xmm1                           ; q0 - p0
+        paddsb      xmm0,        xmm2                           ; p1 - q1 + 1 * (q0 - p0)
+        paddsb      xmm0,        xmm2                           ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      xmm0,        xmm2                           ; p1 - q1 + 3 * (q0 - p0)
+        pand        xmm5,        xmm0                           ; mask filter values we don't care about
+
+        movdqa      xmm0, xmm5
+        paddsb      xmm5,        [GLOBAL(t3)]                  ;  3* (q0 - p0) + (p1 - q1) + 4
+        paddsb      xmm0,        [GLOBAL(t4)]                  ; +3 instead of +4
+
+        movdqa  xmm6, [GLOBAL(te0)]
+        movdqa  xmm2, [GLOBAL(t1f)]
+
+;        pxor        xmm7, xmm7
+        pcmpgtb     xmm7, xmm0              ;save sign
+        pand        xmm7, xmm6              ;preserve the upper 3 bits
+        psrlw       xmm0, 3
+        pand        xmm0, xmm2              ;clear out upper 3 bits
+        por         xmm0, xmm7              ;add sign
+        psubsb      xmm3, xmm0              ; q0-= q0sz add
+
+        pxor        xmm7, xmm7
+        pcmpgtb     xmm7, xmm5              ;save sign
+        pand        xmm7, xmm6              ;preserve the upper 3 bits
+        psrlw       xmm5, 3
+        pand        xmm5, xmm2              ;clear out upper 3 bits
+        por         xmm5, xmm7              ;add sign
+        paddsb      xmm1, xmm5              ; p0+= p0 add
+
+        pxor        xmm3,        xmm4                  ; unoffset   q0
+        pxor        xmm1,        xmm4                  ; unoffset   p0
+
+        movdqa      xmm0,        t0                             ; p1
+        movdqa      xmm4,        t1                             ; q1
+
+        ; write out order: xmm0 xmm2 xmm1 xmm3
+        lea         rdx,        [rsi + rax*4]
+
+        ; transpose back to write out
+        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+        movdqa      xmm6,       xmm0
+        punpcklbw   xmm0,       xmm1                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+        punpckhbw   xmm6,       xmm1                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+        movdqa      xmm5,       xmm3
+        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        movdqa      xmm2,       xmm0
+        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+
+        movdqa      xmm3,       xmm6
+        punpcklwd   xmm6,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+        movd        [rsi],      xmm6                               ; write the second 8-line result
+        movd        [rdx],      xmm3
+        psrldq      xmm6,       4
+        psrldq      xmm3,       4
+        movd        [rdi],      xmm6
+        movd        [rcx],      xmm3
+        psrldq      xmm6,       4
+        psrldq      xmm3,       4
+        movd        [rsi + rax*2], xmm6
+        movd        [rdx + rax*2], xmm3
+        psrldq      xmm6,       4
+        psrldq      xmm3,       4
+        movd        [rdi + rax*2], xmm6
+        movd        [rcx + rax*2], xmm3
+
+        neg         rax
+        lea         rsi,        [rsi + rax*8]
+        neg         rax
+        lea         rdi,        [rsi + rax]
+        lea         rdx,        [rsi + rax*4]
+        lea         rcx,        [rdx + rax]
+
+        movd        [rsi],      xmm0                                ; write the first 8-line result
+        movd        [rdx],      xmm2
+        psrldq      xmm0,       4
+        psrldq      xmm2,       4
+        movd        [rdi],      xmm0
+        movd        [rcx],      xmm2
+        psrldq      xmm0,       4
+        psrldq      xmm2,       4
+        movd        [rsi + rax*2], xmm0
+        movd        [rdx + rax*2], xmm2
+        psrldq      xmm0,       4
+        psrldq      xmm2,       4
+        movd        [rdi + rax*2], xmm0
+        movd        [rcx + rax*2], xmm2
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA                 ; read-only constants; each vector below is 16-byte aligned
+align 16
+tfe:                           ; 0xfe in every byte: clears bit 0 so a word shift right by 1 halves each byte
+    times 16 db 0xfe
+align 16
+t80:                           ; 0x80 in every byte: XORed in to flip the sign bit (unsigned <-> signed offset)
+    times 16 db 0x80
+align 16
+t1s:                           ; 0x01 in every byte (no use visible in this part of the file)
+    times 16 db 0x01
+align 16
+t3:                            ; 0x03 in every byte: added to the masked filter value before the >> 3
+    times 16 db 0x03
+align 16
+t4:                            ; 0x04 in every byte: added to the masked filter value before the >> 3
+    times 16 db 0x04
+align 16
+ones:                          ; 0x0001 in every word (no use visible in this part of the file)
+    times 8 dw 0x0001
+align 16
+s9:                            ; 0x0900 in every word (no use visible in this part of the file)
+    times 8 dw 0x0900
+align 16
+s63:                           ; 0x003f in every word (no use visible in this part of the file)
+    times 8 dw 0x003f
+align 16
+te0:                           ; 0xe0 in every byte: keeps the top 3 bits of a sign mask (sign fill for the >> 3)
+    times 16 db 0xe0
+align 16
+t1f:                           ; 0x1f in every byte: keeps the low 5 bits after a word-wide psrlw by 3
+    times 16 db 0x1f
diff --git a/libs/libvpx/vp8/common/x86/loopfilter_x86.c b/libs/libvpx/vp8/common/x86/loopfilter_x86.c
new file mode 100644
index 0000000000..6586004600
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/loopfilter_x86.c
@@ -0,0 +1,198 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8/common/loopfilter.h"
+
+#define prototype_loopfilter(sym) \
+    void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+             const unsigned char *limit, const unsigned char *thresh, int count)
+/* Same signature as prototype_loopfilter, minus the trailing count argument. */
+#define prototype_loopfilter_nc(sym) \
+    void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+             const unsigned char *limit, const unsigned char *thresh)
+/* Simple-filter signature: plane pointer, stride and blimit only. */
+#define prototype_simple_loopfilter(sym) \
+    void sym(unsigned char *y, int ystride, const unsigned char *blimit)
+/* MMX edge filters; they are defined outside this file (nothing here implements them). */
+prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
+/* SSE2 luma inner-edge filters: x86-64 builds declare whole-block *_y_* routines, other builds per-edge ones. */
+#if HAVE_SSE2 && ARCH_X86_64
+prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
+prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
+#else
+prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2);
+#endif
+prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2);
+/* SSE2 chroma filters: the callers below pass the U pointer first and the V pointer as the last argument. */
+extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
+
+#if HAVE_MMX
+/* Horizontal macroblock-edge filtering (MMX): luma always, each chroma plane only if supplied. */
+void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    const unsigned char *const mblim = lfi->mblim;
+    const unsigned char *const lim = lfi->lim;
+    const unsigned char *const hev_thr = lfi->hev_thr;
+
+    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, mblim, lim, hev_thr, 2); /* luma: count 2 */
+    if (u_ptr) vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, mblim, lim, hev_thr, 1);
+    if (v_ptr) vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, mblim, lim, hev_thr, 1);
+}
+
+
+/* Vertical macroblock-edge filtering (MMX): luma always, each chroma plane only if supplied. */
+void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    const unsigned char *const mblim = lfi->mblim;
+    const unsigned char *const lim = lfi->lim;
+    const unsigned char *const hev_thr = lfi->hev_thr;
+
+    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, mblim, lim, hev_thr, 2); /* luma: count 2 */
+    if (u_ptr) vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, mblim, lim, hev_thr, 1);
+    if (v_ptr) vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, mblim, lim, hev_thr, 1);
+}
+
+
+/* Horizontal inner-block-edge filtering (MMX). */
+void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    int row;
+
+    /* Luma: the three edges at rows 4, 8 and 12. */
+    for (row = 4; row <= 12; row += 4)
+        vp8_loop_filter_horizontal_edge_mmx(y_ptr + row * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    /* Chroma: one edge at row 4, for each plane supplied. */
+    if (u_ptr) vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+    if (v_ptr) vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+    int row; /* simple filter on the horizontal edges at rows 4, 8 and 12 */
+    for (row = 4; row <= 12; row += 4)
+        vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + row * y_stride, y_stride, blimit);
+}
+
+
+/* Vertical inner-block-edge filtering (MMX). */
+void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    int col;
+
+    /* Luma: the three edges at columns 4, 8 and 12. */
+    for (col = 4; col <= 12; col += 4)
+        vp8_loop_filter_vertical_edge_mmx(y_ptr + col, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    /* Chroma: one edge at column 4, for each plane supplied. */
+    if (u_ptr) vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+    if (v_ptr) vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+    int col; /* simple filter on the vertical edges at columns 4, 8 and 12 */
+    for (col = 4; col <= 12; col += 4)
+        vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + col, y_stride, blimit);
+}
+#endif
+
+
+/* Horizontal MB filtering */
+#if HAVE_SSE2
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    const unsigned char *const mblim = lfi->mblim, *const lim = lfi->lim, *const hev_thr = lfi->hev_thr;
+
+    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, mblim, lim, hev_thr);
+    if (u_ptr) vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); /* U and V in one call */
+}
+
+
+/* Vertical macroblock-edge filtering (SSE2): luma, then both chroma pointers in a single call. */
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    const unsigned char *const mblim = lfi->mblim, *const lim = lfi->lim, *const hev_thr = lfi->hev_thr;
+
+    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, mblim, lim, hev_thr);
+    if (u_ptr) vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+
+/* Horizontal inner-block-edge filtering (SSE2). */
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+#if ARCH_X86_64
+    vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+#else
+    int row; /* luma edges at rows 4, 8 and 12 */
+    for (row = 4; row <= 12; row += 4)
+        vp8_loop_filter_horizontal_edge_sse2(y_ptr + row * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+#endif
+
+    /* Chroma row 4: gated on u_ptr only; v_ptr is passed through without a check of its own. */
+    if (u_ptr) vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride);
+}
+
+
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+    int row; /* simple filter on the horizontal edges at rows 4, 8 and 12 */
+    for (row = 4; row <= 12; row += 4)
+        vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + row * y_stride, y_stride, blimit);
+}
+
+
+/* Vertical inner-block-edge filtering (SSE2). */
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+#if ARCH_X86_64
+    vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+#else
+    int col; /* luma edges at columns 4, 8 and 12 */
+    for (col = 4; col <= 12; col += 4)
+        vp8_loop_filter_vertical_edge_sse2(y_ptr + col, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+#endif
+
+    /* Chroma column 4: gated on u_ptr only; v_ptr is passed through without a check of its own. */
+    if (u_ptr) vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4);
+}
+
+
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+    int col; /* simple filter on the vertical edges at columns 4, 8 and 12 */
+    for (col = 4; col <= 12; col += 4)
+        vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + col, y_stride, blimit);
+}
+
+#endif
diff --git a/libs/libvpx/vp8/common/x86/mfqe_sse2.asm b/libs/libvpx/vp8/common/x86/mfqe_sse2.asm
new file mode 100644
index 0000000000..a8a7f568dc
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/mfqe_sse2.asm
@@ -0,0 +1,287 @@
+;
+;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_filter_by_weight16x16_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight
+;)   dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4, per byte, 16 rows of 16 bytes
+global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
+sym(vp8_filter_by_weight16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]       ; 16 in every word
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    movsxd      rsi, dword ptr arg(1)       ; src_stride: int, sign-extended to pointer width
+    mov         rdx, arg(2)                 ; dst
+    movsxd      rdi, dword ptr arg(3)       ; dst_stride: int, sign-extended to pointer width
+
+    mov         rcx, 16                     ; loop count
+    pxor        xmm6, xmm6                  ; zero, used to widen bytes to words
+
+.combine:
+    movdqa      xmm2, [rax]                 ; src row (aligned load: needs a 16-byte aligned address)
+    movdqa      xmm4, [rdx]                 ; dst row (aligned load: needs a 16-byte aligned address)
+    add         rax, rsi
+
+    ; src * src_weight
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm6
+    punpckhbw   xmm3, xmm6
+    pmullw      xmm2, xmm0
+    pmullw      xmm3, xmm0
+
+    ; dst * dst_weight
+    movdqa      xmm5, xmm4
+    punpcklbw   xmm4, xmm6
+    punpckhbw   xmm5, xmm6
+    pmullw      xmm4, xmm1
+    pmullw      xmm5, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    paddw       xmm3, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+    psrlw       xmm3, 4
+
+    packuswb    xmm2, xmm3                  ; back to 16 bytes
+    movdqa      [rdx], xmm2                 ; overwrite the dst row
+    add         rdx, rdi
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp8_filter_by_weight8x8_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight
+;)   dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4, per byte, 8 rows of 8 bytes
+global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
+sym(vp8_filter_by_weight8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]       ; 16 in every word
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    movsxd      rsi, dword ptr arg(1)       ; src_stride: int, sign-extended to pointer width
+    mov         rdx, arg(2)                 ; dst
+    movsxd      rdi, dword ptr arg(3)       ; dst_stride: int, sign-extended to pointer width
+
+    mov         rcx, 8                      ; loop count
+    pxor        xmm4, xmm4                  ; zero, used to widen bytes to words
+
+.combine:
+    movq        xmm2, [rax]                 ; 8 src bytes
+    movq        xmm3, [rdx]                 ; 8 dst bytes
+    add         rax, rsi
+
+    ; src * src_weight
+    punpcklbw   xmm2, xmm4
+    pmullw      xmm2, xmm0
+
+    ; dst * dst_weight
+    punpcklbw   xmm3, xmm4
+    pmullw      xmm3, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm3
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+
+    packuswb    xmm2, xmm4                  ; back to bytes; only the low 8 are stored
+    movq        [rdx], xmm2
+    add         rdx, rdi
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp8_variance_and_sad_16x16_sse2 | arg
+;(
+;    unsigned char *src1,          0
+;    int            stride1,       1
+;    unsigned char *src2,          2
+;    int            stride2,       3
+;    unsigned int  *variance,      4
+;    unsigned int  *sad,           5
+;)   *sad = (SAD(src1,src2) + 128) >> 8;  *variance = (sum(src2^2) - (sum(src2)^2 >> 8) + 128) >> 8
+global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
+sym(vp8_variance_and_sad_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rax,        arg(0)          ; src1
+    movsxd      rcx,        dword ptr arg(1) ; stride1: int, sign-extended to pointer width
+    mov         rdx,        arg(2)          ; src2
+    movsxd      rdi,        dword ptr arg(3) ; stride2: int, sign-extended to pointer width
+
+    mov         rsi,        16              ; block height
+
+    ; Prep accumulator registers
+    pxor        xmm3, xmm3                  ; SAD
+    pxor        xmm4, xmm4                  ; sum of src2
+    pxor        xmm5, xmm5                  ; sum of src2^2
+
+    ; Because we're working with the actual output frames
+    ; we can't depend on any kind of data alignment.
+.accumulate:
+    movdqa      xmm0, [rax]                 ; src1 -- NOTE(review): movdqa needs 16-byte alignment,
+    movdqa      xmm1, [rdx]                 ; src2 -- which contradicts the note above; confirm callers
+    add         rax, rcx                    ; src1 + stride1
+    add         rdx, rdi                    ; src2 + stride2
+
+    ; SAD(src1, src2)
+    psadbw      xmm0, xmm1
+    paddusw     xmm3, xmm0
+
+    ; SUM(src2)
+    pxor        xmm2, xmm2
+    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
+    paddusw     xmm4, xmm2
+
+    ; pmaddubsw would be ideal if it took two unsigned values. instead,
+    ; it expects a signed and an unsigned value. so instead we zero extend
+    ; and operate on words.
+    pxor        xmm2, xmm2
+    movdqa      xmm0, xmm1
+    punpcklbw   xmm0, xmm2
+    punpckhbw   xmm1, xmm2
+    pmaddwd     xmm0, xmm0
+    pmaddwd     xmm1, xmm1
+    paddd       xmm5, xmm0
+    paddd       xmm5, xmm1
+
+    sub         rsi,        1
+    jnz         .accumulate
+
+    ; phaddd only operates on adjacent double words.
+    ; Finalize SAD and store
+    movdqa      xmm0, xmm3
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm3                  ; fold the two psadbw halves into the low lane
+    paddd       xmm0, [GLOBAL(t128)]        ; + 128 (rounding)
+    psrld       xmm0, 8
+
+    mov         rax,  arg(5)
+    movd        [rax], xmm0                 ; *sad
+
+    ; Accumulate sum of src2
+    movdqa      xmm0, xmm4
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm4
+    ; Square src2. Ignore high value
+    pmuludq     xmm0, xmm0
+    psrld       xmm0, 8
+
+    ; phaddw could be used to sum adjacent values but we want
+    ; all the values summed. promote to doubles, accumulate,
+    ; shift and sum
+    pxor        xmm2, xmm2
+    movdqa      xmm1, xmm5
+    punpckldq   xmm1, xmm2
+    punpckhdq   xmm5, xmm2
+    paddd       xmm1, xmm5
+    movdqa      xmm2, xmm1
+    psrldq      xmm1, 8
+    paddd       xmm1, xmm2
+
+    psubd       xmm1, xmm0                  ; sum of squares minus (sum^2 >> 8)
+
+    ; (variance + 128) >> 8
+    paddd       xmm1, [GLOBAL(t128)]
+    psrld       xmm1, 8
+    mov         rax,  arg(4)
+
+    movd        [rax], xmm1                 ; *variance
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+t128:                  ; the value 128 as one 128-bit constant; added as rounding before each >> 8 above
+%ifndef __NASM_VER__
+    ddq 128
+%elif CONFIG_BIG_ENDIAN
+    dq  0, 128         ; fallback when ddq is not used: two quadwords, high half first
+%else
+    dq  128, 0         ; fallback when ddq is not used: two quadwords, low half first
+%endif
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+    times 8 dw 0x10    ; 16 in every word: total weight, so dst_weight = 16 - src_weight
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+    times 8 dw 0x08    ; 8 in every word: rounding term added before the >> 4
+
diff --git a/libs/libvpx/vp8/common/x86/postproc_mmx.asm b/libs/libvpx/vp8/common/x86/postproc_mmx.asm
new file mode 100644
index 0000000000..a2b16327f0
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/postproc_mmx.asm
@@ -0,0 +1,315 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT  7
+
+;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
+;                             int pitch, int rows, int cols,int flimit)
+extern sym(vp8_rv)
+global sym(vp8_mbpost_proc_down_mmx) PRIVATE
+sym(vp8_mbpost_proc_down_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 136
+
+    ; unsigned char d[16][8] at [rsp]
+    ; create flimit2 at [rsp+128]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp+128], eax
+    mov         [rsp+128+4], eax
+%define flimit2 [rsp+128]
+
+%if ABI_IS_32BIT=0
+    lea         r8,       [GLOBAL(sym(vp8_rv))]
+%endif
+
+    ;rows +=8;
+    add         dword ptr arg(2), 8
+
+    ;for(c=0; c<cols; c+=4)
+.loop_col:
+            mov         rsi,        arg(0)  ;s
+            pxor        mm0,        mm0     ;
+
+            movsxd      rax,        dword ptr arg(1) ;pitch       ;
+
+            ; this copies the last row down into the border 8 rows
+            mov         rdi,        rsi
+            movsxd      rdx,        dword ptr arg(2)            ; rows (already +8): int, read at 32-bit width
+            sub         rdx,        9
+            imul        rdx,        rax
+            lea         rdi,        [rdi+rdx]
+            movq        mm1,        QWORD ptr[rdi]              ; last row
+            mov         rcx,        8
+.init_borderd:                                                   ; initialize borders
+            lea         rdi,        [rdi + rax]
+            movq        [rdi],      mm1
+
+            dec         rcx
+            jne         .init_borderd
+
+            neg         rax                                     ; rax = -pitch
+
+            ; this copies the first row up into the border 8 rows
+            mov         rdi,        rsi
+            movq        mm1,        QWORD ptr[rdi]              ; first row
+            mov         rcx,        8
+.init_border:                                                   ; initialize borders
+            lea         rdi,        [rdi + rax]
+            movq        [rdi],      mm1
+
+            dec         rcx
+            jne         .init_border
+
+
+            lea         rsi,        [rsi + rax*8];              ; rsi = s[-pitch*8]
+            neg         rax
+
+
+            pxor        mm5,        mm5     ; running sum (words)
+            pxor        mm6,        mm6     ; running sum of squares, low two columns
+
+            pxor        mm7,        mm7     ; running sum of squares, high two columns
+            mov         rdi,        rsi
+
+            mov         rcx,        15          ;
+
+.loop_initvar:
+            movd        mm1,        DWORD PTR [rdi];
+            punpcklbw   mm1,        mm0     ;
+
+            paddw       mm5,        mm1     ;
+            pmullw      mm1,        mm1     ;
+
+            movq        mm2,        mm1     ;
+            punpcklwd   mm1,        mm0     ;
+
+            punpckhwd   mm2,        mm0     ;
+            paddd       mm6,        mm1     ;
+
+            paddd       mm7,        mm2     ;
+            lea         rdi,        [rdi+rax]   ;
+
+            dec         rcx
+            jne         .loop_initvar
+            ;save the var and sum
+            xor         rdx,        rdx
+.loop_row:
+            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
+            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
+
+            punpcklbw   mm1,        mm0
+            punpcklbw   mm2,        mm0
+
+            paddw       mm5,        mm2
+            psubw       mm5,        mm1
+
+            pmullw      mm2,        mm2
+            movq        mm4,        mm2
+
+            punpcklwd   mm2,        mm0
+            punpckhwd   mm4,        mm0
+
+            paddd       mm6,        mm2
+            paddd       mm7,        mm4
+
+            pmullw      mm1,        mm1
+            movq        mm2,        mm1
+
+            punpcklwd   mm1,        mm0
+            psubd       mm6,        mm1
+
+            punpckhwd   mm2,        mm0
+            psubd       mm7,        mm2
+
+
+            movq        mm3,        mm6
+            pslld       mm3,        4
+
+            psubd       mm3,        mm6     ; 15 * sumsq (low columns)
+            movq        mm1,        mm5
+
+            movq        mm4,        mm5
+            pmullw      mm1,        mm1
+
+            pmulhw      mm4,        mm4
+            movq        mm2,        mm1
+
+            punpcklwd   mm1,        mm4     ; sum * sum as dwords (low columns)
+            punpckhwd   mm2,        mm4     ; sum * sum as dwords (high columns)
+
+            movq        mm4,        mm7
+            pslld       mm4,        4
+
+            psubd       mm4,        mm7     ; 15 * sumsq (high columns)
+
+            psubd       mm3,        mm1
+            psubd       mm4,        mm2
+
+            psubd       mm3,        flimit2
+            psubd       mm4,        flimit2
+
+            psrad       mm3,        31      ; all-ones where 15*sumsq - sum*sum < flimit
+            psrad       mm4,        31
+
+            packssdw    mm3,        mm4
+            packsswb    mm3,        mm0
+
+            movd        mm1,        DWORD PTR [rsi+rax*8]
+
+            movq        mm2,        mm1
+            punpcklbw   mm1,        mm0
+
+            paddw       mm1,        mm5
+            mov         rcx,        rdx
+
+            and         rcx,        127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+            push        rax
+            lea         rax,        [GLOBAL(sym(vp8_rv))]
+            movq        mm4,        [rax + rcx*2] ;vp8_rv[rcx*2]
+            pop         rax
+%elif ABI_IS_32BIT=0
+            movq        mm4,        [r8 + rcx*2] ;vp8_rv[rcx*2]
+%else
+            movq        mm4,        [sym(vp8_rv) + rcx*2]
+%endif
+            paddw       mm1,        mm4
+            psraw       mm1,        4
+
+            packuswb    mm1,        mm0
+            pand        mm1,        mm3     ; filtered value where the mask is set
+
+            pandn       mm3,        mm2     ; original pixels elsewhere
+            por         mm1,        mm3
+
+            and         rcx,        15
+            movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]
+
+            cmp         edx,        8
+            jl          .skip_assignment
+
+            mov         rcx,        rdx
+            sub         rcx,        8
+            and         rcx,        15
+            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
+            movd        [rsi],      mm1     ; delayed write-back of the result from 8 rows earlier
+
+.skip_assignment:
+            lea         rsi,        [rsi+rax]
+
+            lea         rdi,        [rdi+rax]
+            add         rdx,        1
+
+            cmp         edx,        dword arg(2) ;rows
+            jl          .loop_row
+
+        mov         rcx, 4          ; rcx is dead here; it is reloaded after .loop_col
+        add         arg(0), rcx     ; s += 4 at full pointer width (a dword add drops the carry on 64-bit)
+        sub         dword arg(3), 4 ; cols -= 4
+        cmp         dword arg(3), 0
+        jg          .loop_col
+
+    add         rsp, 136
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit2
+
+
+;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
+;                            unsigned char blackclamp[16],
+;                            unsigned char whiteclamp[16],
+;                            unsigned char bothclamp[16],
+;                            unsigned int Width, unsigned int Height, int Pitch)
+global sym(vp8_plane_add_noise_mmx) PRIVATE
+sym(vp8_plane_add_noise_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+.addnoise_loop:                 ; one iteration per row; Height counts down in its arg slot
+    call sym(LIBVPX_RAND) WRT_PLT
+    mov     rcx, arg(1) ;noise
+    and     rax, 0xff           ; keep the low 8 bits of the returned value: offset 0..255
+    add     rcx, rax            ; rcx = noise + offset for this row
+
+    ; we rely on the fact that the clamping vectors are stored contiguously
+    ; in black/white/both order. Note that we have to reload this here because
+    ; rdx could be trashed by rand()
+    mov     rdx, arg(2) ; blackclamp
+
+
+            mov     rdi, rcx              ; rdi = noise source for this row
+            movsxd  rcx, dword arg(5) ;[Width]
+            mov     rsi, arg(0) ;Pos
+            xor         rax,rax           ; byte offset within the row
+
+.addnoise_nextset:
+            movq        mm1,[rsi+rax]         ; get the source
+
+            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
+            paddusb     mm1, [rdx+32] ;bothclamp
+            psubusb     mm1, [rdx+16] ;whiteclamp
+
+            movq        mm2,[rdi+rax]         ; get the noise for this line
+            paddb       mm1,mm2              ; add it in
+            movq        [rsi+rax],mm1         ; store the result
+
+            add         rax,8                 ; move to the next 8 bytes of this row
+
+            cmp         rax, rcx
+            jl          .addnoise_nextset     ; whole 8-byte groups: may write past Width if it is not a multiple of 8
+
+    movsxd  rax, dword arg(7) ; Pitch
+    add     arg(0), rax ; Start += Pitch
+    sub     dword arg(6), 1   ; Height -= 1
+    jg      .addnoise_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+Blur:                  ; word table: 16 x 16, 8 x 64, 16 x 16, 8 x 0
+    times 16 dw 16
+    times  8 dw 64
+    times 16 dw 16
+    times  8 dw  0
+; NOTE(review): neither Blur nor rd is referenced by the two functions in this file -- confirm they are still needed
+rd:                    ; 0x40 in each of 4 words; no align directive precedes this label
+    times 4 dw 0x40
diff --git a/libs/libvpx/vp8/common/x86/postproc_sse2.asm b/libs/libvpx/vp8/common/x86/postproc_sse2.asm
new file mode 100644
index 0000000000..fed4ee5ccf
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/postproc_sse2.asm
@@ -0,0 +1,723 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; Deblock helper, first half. In: xmm0 = centre pixels, xmm1/xmm3 = two neighbours. Out: xmm5 = avg(xmm1, xmm3), xmm7 = 0xFF in every byte where |centre - neighbour| >= flimit for either neighbour. Clobbers xmm1-xmm4 and xmm6.
+%macro FIRST_2_ROWS 0
+        movdqa      xmm4,       xmm0
+        movdqa      xmm6,       xmm0
+        movdqa      xmm5,       xmm1
+        pavgb       xmm5,       xmm3
+
+        ;calculate absolute value: |a - b| = sat(a - b) + sat(b - a) for unsigned bytes
+        psubusb     xmm4,       xmm1
+        psubusb     xmm1,       xmm0
+        psubusb     xmm6,       xmm3
+        psubusb     xmm3,       xmm0
+        paddusb     xmm4,       xmm1
+        paddusb     xmm6,       xmm3
+
+        ;get threshold
+        movdqa      xmm2,       flimit
+        pxor        xmm1,       xmm1
+        movdqa      xmm7,       xmm2
+
+        ;get mask: sat(flimit - diff) == 0 exactly when diff >= flimit
+        psubusb     xmm2,       xmm4
+        psubusb     xmm7,       xmm6
+        pcmpeqb     xmm2,       xmm1
+        pcmpeqb     xmm7,       xmm1
+        por         xmm7,       xmm2
+%endmacro
+; Deblock helper, second half. In: xmm0 = centre, xmm1/xmm3 = the other two neighbours, xmm5/xmm7 = average and mask left by FIRST_2_ROWS. Out: xmm0 = centre where the mask is set, otherwise the averaged (filtered) value.
+%macro SECOND_2_ROWS 0
+        movdqa      xmm6,       xmm0
+        movdqa      xmm4,       xmm0
+        movdqa      xmm2,       xmm1
+        pavgb       xmm1,       xmm3
+
+        ;calculate absolute value
+        psubusb     xmm6,       xmm2
+        psubusb     xmm2,       xmm0
+        psubusb     xmm4,       xmm3
+        psubusb     xmm3,       xmm0
+        paddusb     xmm6,       xmm2
+        paddusb     xmm4,       xmm3
+        ; xmm5 = average of the two neighbour-pair averages
+        pavgb       xmm5,       xmm1
+
+        ;get threshold
+        movdqa      xmm2,       flimit
+        pxor        xmm1,       xmm1
+        movdqa      xmm3,       xmm2
+
+        ;get mask
+        psubusb     xmm2,       xmm6
+        psubusb     xmm3,       xmm4
+        pcmpeqb     xmm2,       xmm1
+        pcmpeqb     xmm3,       xmm1
+        ; fold both new comparisons into the mask accumulated so far
+        por         xmm7,       xmm2
+        por         xmm7,       xmm3
+        ; final filtered value = avg(neighbour average, centre)
+        pavgb       xmm5,       xmm0
+
+        ;decide if or not to use filtered value
+        pand        xmm0,       xmm7
+        pandn       xmm7,       xmm5
+        paddusb     xmm0,       xmm7
+%endmacro
+; Copy the next 16 bytes at [rbx] to [rsp] and advance rbx by 16; clobbers xmm2. Both movdqa accesses need 16-byte-aligned addresses.
+%macro UPDATE_FLIMIT 0
+        movdqa      xmm2,       XMMWORD PTR [rbx]
+        movdqa      [rsp],      xmm2
+        add         rbx,        16
+%endmacro
+; Deblocks `size` rows: a vertical ("down") pass filters src into dst, then a horizontal ("across") pass filters dst in place. A pixel is replaced only where all four neighbours differ from it by less than that column's flimit byte.
+;void vp8_post_proc_down_and_across_mb_row_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned char *dst_ptr,
+;    int src_pixels_per_line,
+;    int dst_pixels_per_line,
+;    int cols,
+;    int *flimits,
+;    int size
+;)
+global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
+sym(vp8_post_proc_down_and_across_mb_row_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ALIGN_STACK 16, rax
+    sub         rsp, 16
+
+        ; put flimit on stack
+        mov         rbx,        arg(5)           ;flimits ptr
+        UPDATE_FLIMIT
+
+%define flimit [rsp]
+
+        mov         rsi,        arg(0)           ;src_ptr
+        mov         rdi,        arg(1)           ;dst_ptr
+
+        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
+        movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
+.nextrow:
+        xor         rdx,        rdx              ;col
+.nextcol:
+        ;load current and next 2 rows
+        movdqu      xmm0,       XMMWORD PTR [rsi]
+        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
+        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]
+
+        FIRST_2_ROWS
+
+        ;load above 2 rows
+        neg         rax
+        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
+        movdqu      xmm3,       XMMWORD PTR [rsi + rax]
+
+        SECOND_2_ROWS
+
+        movdqu      XMMWORD PTR [rdi], xmm0
+
+        neg         rax                          ; positive stride
+        add         rsi,        16
+        add         rdi,        16
+
+        add         rdx,        16
+        cmp         edx,        dword arg(4)     ;cols
+        jge         .downdone
+        UPDATE_FLIMIT
+        jmp         .nextcol
+
+.downdone:
+        ; done with the all cols, start the across filtering in place
+        sub         rsi,        rdx
+        sub         rdi,        rdx
+
+        mov         rbx,        arg(5) ; flimits
+        UPDATE_FLIMIT
+
+        ; dup the first byte into the left border 8 times
+        movq        mm1,   [rdi]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1
+        mov         rdx,    -8
+        movq        [rdi+rdx], mm1
+
+        ; dup the last byte into the right border
+        movsxd      rdx,    dword arg(4)
+        movq        mm1,   [rdi + rdx + -1]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1
+        movq        [rdi+rdx], mm1
+
+        xor         rdx,        rdx
+        movq        mm0,        QWORD PTR [rdi-16];
+        movq        mm1,        QWORD PTR [rdi-8];
+        ; mm0/mm1 hold the previous 16 output bytes; they are stored one iteration late, after their unfiltered values were read as neighbours
+.acrossnextcol:
+        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
+        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
+        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]
+
+        FIRST_2_ROWS
+
+        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
+        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]
+
+        SECOND_2_ROWS
+
+        movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
+        movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
+        movdq2q     mm0,        xmm0
+        psrldq      xmm0,       8
+        movdq2q     mm1,        xmm0
+
+        add         rdx,        16
+        cmp         edx,        dword arg(4)     ;cols
+        jge         .acrossdone
+        UPDATE_FLIMIT
+        jmp         .acrossnextcol
+
+.acrossdone:
+        ; last 16 pixels
+        movq        QWORD PTR [rdi+rdx-16], mm0
+
+        cmp         edx,        dword arg(4)
+        jne         .throw_last_8
+        movq        QWORD PTR [rdi+rdx-8], mm1
+.throw_last_8:
+        ; done with this row
+        add         rsi,rax                      ;next src line
+        movsxd      rax, DWORD PTR arg(3)        ;dst_pixels_per_line, sign-extended so a negative stride still works
+        add         rdi,rax                      ;next destination
+        movsxd      rax, DWORD PTR arg(2)        ;src_pixels_per_line, sign-extended like the initial load
+
+        mov         rbx,        arg(5)           ;flimits
+        UPDATE_FLIMIT
+
+        dec         rcx                          ;decrement count
+        jnz         .nextrow                     ;next row
+
+    add rsp, 16
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit
+; Vertical post-filter, in place, one 8-column strip at a time: a pixel becomes (pixel + 15-row column sum + vp8_rv term) >> 4 where 15*sumsq - sum*sum < flimit, otherwise it is left unchanged.
+;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
+;                            int pitch, int rows, int cols,int flimit)
+extern sym(vp8_rv)
+global sym(vp8_mbpost_proc_down_xmm) PRIVATE
+sym(vp8_mbpost_proc_down_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 128+16
+
+    ; unsigned char d[16][8] at [rsp]
+    ; create flimit4 (flimit replicated into four dwords) at [rsp+128]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp+128], eax
+    mov         [rsp+128+4], eax
+    mov         [rsp+128+8], eax
+    mov         [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+    lea         r8,       [GLOBAL(sym(vp8_rv))]
+%endif
+
+    ;rows +=8;
+    add         dword arg(2), 8
+
+    ;for(c=0; c<cols; c+=8)
+.loop_col:
+            mov         rsi,        arg(0) ; s
+            pxor        xmm0,       xmm0        ;
+
+            movsxd      rax,        dword ptr arg(1) ;pitch       ;
+
+            ; this copies the last row down into the border 8 rows
+            mov         rdi,        rsi
+            movsxd      rdx,        dword ptr arg(2) ; rows+8; sign-extend the int slot instead of reading 64 bits of it
+            sub         rdx,        9
+            imul        rdx,        rax
+            lea         rdi,        [rdi+rdx]
+            movq        xmm1,       QWORD ptr[rdi]              ; last row
+            mov         rcx,        8
+.init_borderd:                                                   ; initialize borders
+            lea         rdi,        [rdi + rax]
+            movq        [rdi],      xmm1
+
+            dec         rcx
+            jne         .init_borderd
+
+            neg         rax                                     ; rax = -pitch
+
+            ; this copies the first row up into the border 8 rows
+            mov         rdi,        rsi
+            movq        xmm1,       QWORD ptr[rdi]              ; first row
+            mov         rcx,        8
+.init_border:                                                   ; initialize borders
+            lea         rdi,        [rdi + rax]
+            movq        [rdi],      xmm1
+
+            dec         rcx
+            jne         .init_border
+
+
+
+            lea         rsi,        [rsi + rax*8];              ; rsi = s - pitch*8
+            neg         rax
+
+            pxor        xmm5,       xmm5
+            pxor        xmm6,       xmm6        ;
+
+            pxor        xmm7,       xmm7        ;
+            mov         rdi,        rsi
+            ; accumulate sum (xmm5, words) and sumsq (xmm6/xmm7, dwords) over the first 15 rows of the window
+            mov         rcx,        15          ;
+
+.loop_initvar:
+            movq        xmm1,       QWORD PTR [rdi];
+            punpcklbw   xmm1,       xmm0        ;
+
+            paddw       xmm5,       xmm1        ;
+            pmullw      xmm1,       xmm1        ;
+
+            movdqa      xmm2,       xmm1        ;
+            punpcklwd   xmm1,       xmm0        ;
+
+            punpckhwd   xmm2,       xmm0        ;
+            paddd       xmm6,       xmm1        ;
+
+            paddd       xmm7,       xmm2        ;
+            lea         rdi,        [rdi+rax]   ;
+
+            dec         rcx
+            jne         .loop_initvar
+            ;save the var and sum
+            xor         rdx,        rdx
+.loop_row:
+            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
+            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
+
+            punpcklbw   xmm1,       xmm0
+            punpcklbw   xmm2,       xmm0
+            ; slide the window: add the entering row, drop the leaving row
+            paddw       xmm5,       xmm2
+            psubw       xmm5,       xmm1
+
+            pmullw      xmm2,       xmm2
+            movdqa      xmm4,       xmm2
+
+            punpcklwd   xmm2,       xmm0
+            punpckhwd   xmm4,       xmm0
+
+            paddd       xmm6,       xmm2
+            paddd       xmm7,       xmm4
+
+            pmullw      xmm1,       xmm1
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm0
+            psubd       xmm6,       xmm1
+
+            punpckhwd   xmm2,       xmm0
+            psubd       xmm7,       xmm2
+
+            ; xmm3/xmm4 = 15*sumsq - sum*sum - flimit for the low/high four columns
+            movdqa      xmm3,       xmm6
+            pslld       xmm3,       4
+
+            psubd       xmm3,       xmm6
+            movdqa      xmm1,       xmm5
+
+            movdqa      xmm4,       xmm5
+            pmullw      xmm1,       xmm1
+
+            pmulhw      xmm4,       xmm4
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm4
+            punpckhwd   xmm2,       xmm4
+
+            movdqa      xmm4,       xmm7
+            pslld       xmm4,       4
+
+            psubd       xmm4,       xmm7
+
+            psubd       xmm3,       xmm1
+            psubd       xmm4,       xmm2
+
+            psubd       xmm3,       flimit4
+            psubd       xmm4,       flimit4
+            ; arithmetic shift leaves all ones where the value is negative, i.e. 15*sumsq - sum*sum < flimit
+            psrad       xmm3,       31
+            psrad       xmm4,       31
+
+            packssdw    xmm3,       xmm4
+            packsswb    xmm3,       xmm0
+            ; xmm1 = current row (8 rows below rsi); xmm2 keeps its unfiltered bytes
+            movq        xmm1,       QWORD PTR [rsi+rax*8]
+
+            movq        xmm2,       xmm1
+            punpcklbw   xmm1,       xmm0
+
+            paddw       xmm1,       xmm5
+            mov         rcx,        rdx
+
+            and         rcx,        127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+            push        rax
+            lea         rax,        [GLOBAL(sym(vp8_rv))]
+            movdqu      xmm4,       [rax + rcx*2] ;vp8_rv[rcx*2]
+            pop         rax
+%elif ABI_IS_32BIT=0
+            movdqu      xmm4,       [r8 + rcx*2] ;vp8_rv[rcx*2]
+%else
+            movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
+%endif
+
+            paddw       xmm1,       xmm4
+            ;paddw     xmm1,       eight8s
+            psraw       xmm1,       4
+
+            packuswb    xmm1,       xmm0
+            pand        xmm1,       xmm3
+
+            pandn       xmm3,       xmm2
+            por         xmm1,       xmm3
+
+            and         rcx,        15
+            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
+            ; once 8 rows are buffered, write the row computed 8 iterations ago back to [rsi]
+            cmp         edx,        8
+            jl          .skip_assignment
+
+            mov         rcx,        rdx
+            sub         rcx,        8
+            and         rcx,        15
+            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
+            movq        [rsi],      mm0
+
+.skip_assignment:
+            lea         rsi,        [rsi+rax]
+
+            lea         rdi,        [rdi+rax]
+            add         rdx,        1
+
+            cmp         edx,        dword arg(2) ;rows
+            jl          .loop_row
+        mov         rax,        arg(0)  ; s += 8 at full pointer width (a dword add would drop the carry into the upper half)
+        add         rax,        8       ; rax is free here: it is reloaded with pitch at .loop_col
+        mov         arg(0),     rax
+        sub         dword arg(3), 8 ; cols -= 8
+        cmp         dword arg(3), 0
+        jg          .loop_col
+    add         rsp, 128+16
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
+; Horizontal post-filter, in place, four pixels per step: a pixel becomes (pixel + running sum + 8) >> 4 where 15*sumsq - sum*sum < flimit, otherwise it is left unchanged.
+;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
+;                                int pitch, int rows, int cols,int flimit)
+global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
+sym(vp8_mbpost_proc_across_ip_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16
+
+    ; create flimit4 at [rsp]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp], eax
+    mov         [rsp+4], eax
+    mov         [rsp+8], eax
+    mov         [rsp+12], eax
+%define flimit4 [rsp]
+
+
+    ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+        xor         rdx,    rdx ;sumsq=0;
+        xor         rcx,    rcx ;sum=0;
+        mov         rsi,    arg(0); s
+
+
+        ; dup the first byte into the left border 8 times
+        movq        mm1,   [rsi]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1
+
+        mov         rdi,    -8
+        movq        [rsi+rdi], mm1
+
+        ; dup the last byte into the right border
+        movsxd      rax,    dword arg(3)         ; cols goes in rax: rdx must stay 0 because it is sumsq for .ip_var_loop
+        movq        mm1,   [rsi + rax + -1]
+        punpcklbw   mm1,   mm1
+        punpcklwd   mm1,   mm1
+        punpckldq   mm1,   mm1
+        movq        [rsi+rax], mm1
+        ; here rdx (sumsq) and rcx (sum) are still zero and rdi = -8 from the left-border setup
+.ip_var_loop:
+        ;for(i=-8;i<=6;i++)
+        ;{
+        ;    sumsq += s[i]*s[i];
+        ;    sum   += s[i];
+        ;}
+        movzx       eax, byte [rsi+rdi]
+        add         ecx, eax
+        mul         al                  ; ax = al*al; the 8-bit mul does not touch edx
+        add         edx, eax
+        add         rdi, 1
+        cmp         rdi, 6
+        jle         .ip_var_loop
+
+
+            ;mov         rax,    sumsq
+            ;movd        xmm7,   rax
+            movd        xmm7,   edx
+
+            ;mov         rax,    sum
+            ;movd        xmm6,   rax
+            movd        xmm6,   ecx
+
+            mov         rsi,    arg(0) ;s
+            xor         rcx,    rcx
+
+            movsxd      rdx,    dword arg(3) ;cols
+            add         rdx,    8
+            pxor        mm0,    mm0
+            pxor        mm1,    mm1
+
+            pxor        xmm0,   xmm0
+.nextcol4:
+
+            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
+            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
+
+            punpcklbw   xmm1,   xmm0                    ; expanding
+            punpcklbw   xmm2,   xmm0                    ; expanding
+
+            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
+            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
+
+            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
+            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
+
+            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
+            pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5
+
+            paddd       xmm6,   xmm2
+            paddd       xmm7,   xmm1
+
+            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
+            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
+
+            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
+            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
+
+            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
+            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared
+
+            paddd       xmm6,   xmm4
+            paddd       xmm7,   xmm3
+
+            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
+            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared
+
+            paddd       xmm7,   xmm3
+            paddd       xmm6,   xmm4
+
+            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
+            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared
+
+            paddd       xmm7,   xmm3
+            paddd       xmm6,   xmm4
+            ; xmm5 = 15*sumsq - sum*sum - flimit; its sign becomes the per-pixel select mask
+            movdqa      xmm3,   xmm6
+            pmaddwd     xmm3,   xmm3
+
+            movdqa      xmm5,   xmm7
+            pslld       xmm5,   4
+
+            psubd       xmm5,   xmm7
+            psubd       xmm5,   xmm3
+
+            psubd       xmm5,   flimit4
+            psrad       xmm5,   31
+
+            packssdw    xmm5,   xmm0
+            packsswb    xmm5,   xmm0
+
+            movd        xmm1,   DWORD PTR [rsi+rcx]
+            movq        xmm2,   xmm1
+
+            punpcklbw   xmm1,   xmm0
+            punpcklwd   xmm1,   xmm0
+
+            paddd       xmm1,   xmm6
+            paddd       xmm1,   [GLOBAL(four8s)]
+
+            psrad       xmm1,   4
+            packssdw    xmm1,   xmm0
+
+            packuswb    xmm1,   xmm0
+            pand        xmm1,   xmm5
+
+            pandn       xmm5,   xmm2
+            por         xmm5,   xmm1
+            ; results pass through mm1 -> mm0 and are stored 8 columns late, so the window keeps reading unfiltered pixels
+            movd        [rsi+rcx-8],  mm0
+            movq        mm0,    mm1
+
+            movdq2q     mm1,    xmm5
+            psrldq      xmm7,   12
+
+            psrldq      xmm6,   12
+            add         rcx,    4
+
+            cmp         rcx,    rdx
+            jl          .nextcol4
+
+        ;s+=pitch;
+        movsxd rax, dword arg(1)
+        add    arg(0), rax
+
+        sub dword arg(2), 1 ;rows-=1
+        cmp dword arg(2), 0
+        jg .ip_row_loop
+
+    add         rsp, 16
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
+; Adds noise to a plane in place: each row is clamped with the black/both/white vectors, then 16 noise bytes at a time are added starting at a random offset (0..255) into the noise buffer.
+;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
+;                            unsigned char blackclamp[16],
+;                            unsigned char whiteclamp[16],
+;                            unsigned char bothclamp[16],
+;                            unsigned int Width, unsigned int Height, int Pitch)
+global sym(vp8_plane_add_noise_wmt) PRIVATE
+sym(vp8_plane_add_noise_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+.addnoise_loop:
+    call sym(LIBVPX_RAND) WRT_PLT
+    mov     rcx, arg(1) ;noise
+    and     rax, 0xff               ; keep the low 8 bits of the random value
+    add     rcx, rax                ; rcx = noise + random offset
+
+    ; we rely on the fact that the clamping vectors are stored contiguously
+    ; in black/white/both order. Note that we have to reload this here because
+    ; rdx could be trashed by rand()
+    mov     rdx, arg(2) ; blackclamp
+
+    ; rdi = noise row, rcx = Width, rsi = current row, rax = byte offset within the row
+            mov     rdi, rcx
+            movsxd  rcx, dword arg(5) ;[Width]
+            mov     rsi, arg(0) ;Pos
+            xor         rax,rax
+
+.addnoise_nextset:
+            movdqu      xmm1,[rsi+rax]         ; get the source
+            ; the clamp vectors are used as SSE memory operands, so they must be 16-byte aligned
+            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
+            paddusb     xmm1, [rdx+32] ;bothclamp
+            psubusb     xmm1, [rdx+16] ;whiteclamp
+
+            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
+            paddb       xmm1,xmm2              ; add it in
+            movdqu      [rsi+rax],xmm1         ; store the result
+
+            add         rax,16                 ; advance to the next 16 bytes of this row
+            ; the test is at the bottom, so at least 16 bytes are processed and Width is effectively rounded up to a multiple of 16
+            cmp         rax, rcx
+            jl          .addnoise_nextset
+
+    movsxd  rax, dword arg(7) ; Pitch
+    add     arg(0), rax ; Start += Pitch
+    sub     dword arg(6), 1   ; Height -= 1
+    jg      .addnoise_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+four8s:                     ; four dwords of 8: added in vp8_mbpost_proc_across_ip_xmm just before its >> 4
+    times 4 dd 8
diff --git a/libs/libvpx/vp8/common/x86/recon_mmx.asm b/libs/libvpx/vp8/common/x86/recon_mmx.asm
new file mode 100644
index 0000000000..15e98713c7
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/recon_mmx.asm
@@ -0,0 +1,274 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; Copies an 8-byte-wide, 8-row block from src to dst, one movq per row, using mm0-mm5.
+;void copy_mem8x8_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp8_copy_mem8x8_mmx) PRIVATE
+sym(vp8_copy_mem8x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; rows 0-2
+        mov         rsi,        arg(0) ;src;
+        movq        mm0,        [rsi]
+
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+        mov         rdi,        arg(2) ;dst;
+
+        movq        mm1,        [rsi+rax]
+        movq        mm2,        [rsi+rax*2]
+
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+        lea         rsi,        [rsi+rax*2]
+
+        movq        [rdi],      mm0
+        add         rsi,        rax
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx*2],    mm2
+
+    ; rows 3-5
+        lea         rdi,        [rdi+rcx*2]
+        movq        mm3,        [rsi]
+
+        add         rdi,        rcx
+        movq        mm4,        [rsi+rax]
+
+        movq        mm5,        [rsi+rax*2]
+        movq        [rdi],      mm3
+
+        lea         rsi,        [rsi+rax*2]
+        movq        [rdi+rcx],  mm4
+
+        movq        [rdi+rcx*2],    mm5
+        lea         rdi,        [rdi+rcx*2]
+    ; rows 6-7 (rsi and rdi now point at row 5)
+        movq        mm0,        [rsi+rax]
+        movq        mm1,        [rsi+rax*2]
+
+        movq        [rdi+rcx],  mm0
+        movq        [rdi+rcx*2],mm1
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+; Copies an 8-byte-wide, 4-row block from src to dst, one movq per row, using mm0-mm3.
+;void copy_mem8x4_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp8_copy_mem8x4_mmx) PRIVATE
+sym(vp8_copy_mem8x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; rows 0-2
+        mov         rsi,        arg(0) ;src;
+        movq        mm0,        [rsi]
+
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+        mov         rdi,        arg(2) ;dst;
+
+        movq        mm1,        [rsi+rax]
+        movq        mm2,        [rsi+rax*2]
+
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+        lea         rsi,        [rsi+rax*2]
+
+        movq        [rdi],      mm0
+        movq        [rdi+rcx],      mm1
+
+        movq        [rdi+rcx*2],    mm2
+        lea         rdi,        [rdi+rcx*2]
+    ; row 3 (rsi and rdi now point at row 2)
+        movq        mm3,        [rsi+rax]
+        movq        [rdi+rcx],      mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+; Copies a 16-byte-wide, 16-row block from src to dst: five groups of three rows plus a final row, each row as two 8-byte movq transfers.
+;void copy_mem16x16_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp8_copy_mem16x16_mmx) PRIVATE
+sym(vp8_copy_mem16x16_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src;
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+
+        mov         rdi,        arg(2) ;dst;
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+    ; rows 0-2
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+    ; rows 3-5
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+    ; rows 6-8
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+    ; rows 9-11
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+    ; rows 12-14
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+    ; row 15
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vp8/common/x86/recon_sse2.asm b/libs/libvpx/vp8/common/x86/recon_sse2.asm
new file mode 100644
index 0000000000..cb89537f76
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/recon_sse2.asm
@@ -0,0 +1,116 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+; Copies a 16-byte-wide, 16-row block from src to dst, one XMM register per row. Loads are unaligned (movdqu); stores use movdqa, so every destination row must be 16-byte aligned.
+;void copy_mem16x16_sse2(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp8_copy_mem16x16_sse2) PRIVATE
+sym(vp8_copy_mem16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; loads and stores are interleaved three rows at a time; rsi/rdi advance by three strides per group
+        mov         rsi,        arg(0) ;src;
+        movdqu      xmm0,       [rsi]
+
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+        mov         rdi,        arg(2) ;dst;
+
+        movdqu      xmm1,       [rsi+rax]
+        movdqu      xmm2,       [rsi+rax*2]
+
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+        lea         rsi,        [rsi+rax*2]
+
+        movdqa      [rdi],      xmm0
+        add         rsi,        rax
+
+        movdqa      [rdi+rcx],  xmm1
+        movdqa      [rdi+rcx*2],xmm2
+
+        lea         rdi,        [rdi+rcx*2]
+        movdqu      xmm3,       [rsi]
+
+        add         rdi,        rcx
+        movdqu      xmm4,       [rsi+rax]
+
+        movdqu      xmm5,       [rsi+rax*2]
+        lea         rsi,        [rsi+rax*2]
+
+        movdqa      [rdi],  xmm3
+        add         rsi,        rax
+
+        movdqa      [rdi+rcx],  xmm4
+        movdqa      [rdi+rcx*2],xmm5
+
+        lea         rdi,        [rdi+rcx*2]
+        movdqu      xmm0,       [rsi]
+
+        add         rdi,        rcx
+        movdqu      xmm1,       [rsi+rax]
+
+        movdqu      xmm2,       [rsi+rax*2]
+        lea         rsi,        [rsi+rax*2]
+
+        movdqa      [rdi],      xmm0
+        add         rsi,        rax
+
+        movdqa      [rdi+rcx],  xmm1
+
+        movdqa      [rdi+rcx*2],    xmm2
+        movdqu      xmm3,       [rsi]
+
+        movdqu      xmm4,       [rsi+rax]
+        lea         rdi,        [rdi+rcx*2]
+
+        add         rdi,        rcx
+        movdqu      xmm5,       [rsi+rax*2]
+
+        lea         rsi,        [rsi+rax*2]
+        movdqa      [rdi],  xmm3
+
+        add         rsi,        rax
+        movdqa      [rdi+rcx],  xmm4
+
+        movdqa      [rdi+rcx*2],xmm5
+        movdqu      xmm0,       [rsi]
+
+        lea         rdi,        [rdi+rcx*2]
+        movdqu      xmm1,       [rsi+rax]
+
+        add         rdi,        rcx
+        movdqu      xmm2,       [rsi+rax*2]
+
+        lea         rsi,        [rsi+rax*2]
+        movdqa      [rdi],      xmm0
+
+        movdqa      [rdi+rcx],  xmm1
+        movdqa      [rdi+rcx*2],xmm2
+    ; final row 15 (rsi and rdi point at row 14 once the lea below has run)
+        movdqu      xmm3,       [rsi+rax]
+        lea         rdi,        [rdi+rcx*2]
+
+        movdqa      [rdi+rcx],  xmm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vp8/common/x86/subpixel_mmx.asm b/libs/libvpx/vp8/common/x86/subpixel_mmx.asm
new file mode 100644
index 0000000000..47dd452297
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/subpixel_mmx.asm
@@ -0,0 +1,702 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
+
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define vp8_filter_weight 128
+%define VP8_FILTER_SHIFT  7
+
+; Horizontal 6-tap filter pass: 4 output pixels per row, stored as 16-bit words.
+;void vp8_filter_block1d_h6_mmx
+;(
+;    unsigned char   *src_ptr,             ; source bytes; each row is read from src_ptr-2 through src_ptr+6
+;    unsigned short  *output_ptr,          ; receives 4 words (8 bytes) per row
+;    unsigned int    src_pixels_per_line,  ; byte step between source rows
+;    unsigned int    pixel_step,           ; not read by this routine
+;    unsigned int    output_height,        ; row count; the loop tests after the first row, so 0 is not handled
+;    unsigned int    output_width,         ; added to output_ptr after each row, i.e. a byte step
+;    short           * vp8_filter          ; six taps at 16-byte spacing; the low 4 words of each are used
+;)
+global sym(vp8_filter_block1d_h6_mmx) PRIVATE
+sym(vp8_filter_block1d_h6_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,    arg(6) ;vp8_filter
+
+        movq        mm1,    [rdx + 16]             ; mm1 = tap 1; taps 1-4 stay in registers for the whole loop
+        movq        mm2,    [rdx + 32]         ; mm2 = tap 2
+        movq        mm6,    [rdx + 48]        ; mm6 = tap 3
+        movq        mm7,    [rdx + 64]        ; mm7 = tap 4
+
+        mov         rdi,    arg(1) ;output_ptr
+        mov         rsi,    arg(0) ;src_ptr
+        movsxd      rcx,    dword ptr arg(4) ;output_height
+        movsxd      rax,    dword ptr arg(5) ;output_width      ; used below as the destination step in bytes
+        pxor        mm0,    mm0              ; mm0 = 00000000, used to widen bytes to words
+
+.nextrow:
+        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
+        movq        mm4,    mm3              ; mm4 = p-2..p5
+        psrlq       mm3,    8                ; mm3 = p-1..p5
+        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2 as words
+        pmullw      mm3,    mm1              ; mm3 *= tap 1
+
+        movq        mm5,    mm4              ; mm5 = p-2..p5
+        punpckhbw   mm4,    mm0              ; mm4 = p2..p5 as words
+        pmullw      mm4,    mm7              ; mm4 *= tap 4
+        paddsw      mm3,    mm4              ; mm3 += mm4
+
+        movq        mm4,    mm5              ; mm4 = p-2..p5
+        psrlq       mm5,    16               ; mm5 = p0..p5
+        punpcklbw   mm5,    mm0              ; mm5 = p0..p3 as words
+        pmullw      mm5,    mm2              ; mm5 *= tap 2
+        paddsw      mm3,    mm5              ; mm3 += mm5
+
+        movq        mm5,    mm4              ; mm5 = p-2..p5
+        psrlq       mm4,    24               ; mm4 = p1..p5
+        punpcklbw   mm4,    mm0              ; mm4 = p1..p4 as words
+        pmullw      mm4,    mm6              ; mm4 *= tap 3
+        paddsw      mm3,    mm4              ; mm3 += mm4
+
+        ; outermost taps (5 and 0) are multiplied straight from memory
+        movd        mm4,    [rsi+3]
+        punpcklbw   mm4,    mm0              ; mm4 = p3..p6 as words
+        pmullw      mm4,    [rdx+80]         ; mm4 *= tap 5
+        paddsw      mm3,    mm4              ; mm3 += mm4
+
+        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1 as words
+        pmullw      mm5,    [rdx]            ; mm5 *= tap 0
+        paddsw      mm3,    mm5              ; mm3 += mm5
+
+        paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value
+        psraw       mm3,    VP8_FILTER_SHIFT     ; mm3 >>= 7 (arithmetic), i.e. divide by 128
+        packuswb    mm3,    mm0              ; pack to bytes with unsigned saturation (clamps to 0..255) ...
+        punpcklbw   mm3,    mm0              ; ... then widen back to 4 words
+
+        movq        [rdi],  mm3              ; store the 4 result words in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
+        add         rdi,    rax; next output row
+%else
+        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
+        add         rdi,    rax; next output row
+
+        add         rsi,    r8               ; next line
+%endif
+
+        dec         rcx                      ; decrement count
+        jnz         .nextrow                 ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+; Vertical 6-tap filter pass: 4 pixels per row, 16-bit input, clamped 8-bit output.
+;void vp8_filter_block1dc_v6_mmx
+;(
+;   short *src_ptr,                 ; 16-bit samples; output row 0 reads source rows -2..+3 relative to this
+;   unsigned char *output_ptr,      ; receives 4 bytes per row
+;    int output_pitch,              ; byte step between output rows
+;   unsigned int pixels_per_line,   ; used as the byte step between source rows
+;   unsigned int pixel_step,        ; not read by this routine
+;   unsigned int output_height,     ; row count; the loop tests after the first row, so 0 is not handled
+;   unsigned int output_width,      ; not read by this routine
+;   short * vp8_filter              ; six taps at 16-byte spacing; the low 4 words of each are used
+;)
+global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
+sym(vp8_filter_block1dc_v6_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        movq      mm5, [GLOBAL(rd)]           ; mm5 = round value, fetched before rbx is overwritten (GLOBAL() presumably needs the GOT register on PIC builds - confirm)
+        push        rbx                       ; rbx is reused as the filter pointer; restored by the pop before the epilog
+        mov         rbx, arg(7) ;vp8_filter
+        movq      mm1, [rbx + 16]             ; mm1 = tap 1; taps 1-4 stay in registers for the whole loop
+        movq      mm2, [rbx + 32]         ; mm2 = tap 2
+        movq      mm6, [rbx + 48]        ; mm6 = tap 3
+        movq      mm7, [rbx + 64]        ; mm7 = tap 4
+
+        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
+        mov         rdi, arg(1) ;output_ptr
+        mov         rsi, arg(0) ;src_ptr
+        sub         rsi, rdx              ; step back two source rows so that
+        sub         rsi, rdx              ; [rsi] addresses row -2
+        movsxd      rcx, DWORD PTR arg(5) ;output_height
+        movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination step in bytes
+        pxor        mm0, mm0              ; mm0 = 00000000
+
+
+.nextrow_cv:
+        movq        mm3, [rsi+rdx]        ; mm3 = p0..p3  = row -1
+        pmullw      mm3, mm1              ; mm3 *= tap 1
+
+
+        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
+        pmullw      mm4, mm7              ; mm4 *= tap 4
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
+        pmullw      mm4, mm2              ; mm4 *= tap 2
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
+        pmullw      mm4, [rbx]            ; mm4 *= tap 0
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+
+        add         rsi, rdx              ; advance one source row: avoids a 3*pitch address and is also the loop's source step
+        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
+        pmullw      mm4, mm6              ; mm4 *= tap 3
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
+        pmullw      mm4, [rbx +80]        ; mm4 *= tap 5
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+
+        paddsw      mm3, mm5               ; mm3 += round value
+        psraw       mm3, VP8_FILTER_SHIFT     ; mm3 >>= 7 (arithmetic), i.e. divide by 128
+        packuswb    mm3, mm0              ; pack to bytes with unsigned saturation
+
+        movd        [rdi],mm3             ; store 4 result bytes in the destination
+        ; Each iteration re-reads 5 of the 6 source rows the previous one read.  Since the
+        ; recon block should be in cache this shouldn't cost much.  It's obviously
+        ; avoidable.
+        lea         rdi,  [rdi+rax] ; next output row
+        dec         rcx                   ; decrement count
+        jnz         .nextrow_cv           ; next row
+
+        pop         rbx
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+; Bilinear sub-pixel prediction of an 8-wide, 8-row block: horizontal 2-tap pass, then vertical 2-tap pass.
+;void vp8_bilinear_predict8x8_mmx
+;(
+;    unsigned char  *src_ptr,           ; 9 bytes are read from each of 9 source rows (1 primed + 1 per output row)
+;    int   src_pixels_per_line,         ; byte step between source rows
+;    int  xoffset,                      ; picks the 32-byte horizontal entry of vp8_bilinear_filters_x86_8
+;    int  yoffset,                      ; picks the 32-byte vertical entry of vp8_bilinear_filters_x86_8
+;   unsigned char *dst_ptr,             ; receives 8 bytes per row
+;    int dst_pitch                      ; byte step between destination rows
+;)
+global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
+sym(vp8_bilinear_predict8x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        mov         rdi,        arg(4) ;dst_ptr
+
+        shl         rax,        5 ; offset * 32
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+
+        add         rax,        rcx ; HFilter
+        mov         rsi,        arg(0) ;src_ptr
+
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+        movq        mm1,        [rax]               ; mm1 = HFilter tap 0 (4 words)
+
+        movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1 (4 words)
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        pxor        mm0,        mm0                 ; mm0 = 0, used to widen bytes to words
+
+        shl         rax,        5 ; offset*32
+        add         rax,        rcx ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]          ; rcx = dst_ptr + 8*dst_pitch, the loop's end marker
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+
+
+        ; horizontal pass on the first source row; its result primes mm7 for the loop
+        movq        mm3,        [rsi]               ; 8 source bytes: 00 01 02 03 04 05 06 07
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; mm3 = pixels 00..03 as words
+        punpckhbw   mm4,        mm0                 ; mm4 = pixels 04..07 as words
+
+        pmullw      mm3,        mm1                 ; *= HFilter tap 0
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ; same row shifted by one pixel: 01..08
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ; mm5 = pixels 01..04 as words
+        punpckhbw   mm6,        mm0                 ; mm6 = pixels 05..08 as words
+
+        pmullw      mm5,        mm2                 ; *= HFilter tap 1
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ; sum of the two horizontal taps
+        paddw       mm4,        mm6                 ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
+
+        paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
+        psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
+
+        movq        mm7,        mm3                 ; mm7 = horizontally filtered row packed to 8 bytes,
+        packuswb    mm7,        mm4                 ; kept as the "previous row" for the vertical pass
+
+        add         rsi,        rdx                 ; next line
+.next_row_8x8:
+        movq        mm3,        [rsi]               ; 8 source bytes: 00 01 02 03 04 05 06 07
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; mm3 = pixels 00..03 as words
+        punpckhbw   mm4,        mm0                 ; mm4 = pixels 04..07 as words
+
+        pmullw      mm3,        mm1                 ; *= HFilter tap 0
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ; same row shifted by one pixel: 01..08
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ; mm5 = pixels 01..04 as words
+        punpckhbw   mm6,        mm0                 ; mm6 = pixels 05..08 as words
+
+        pmullw      mm5,        mm2                 ; *= HFilter tap 1
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ; sum of the two horizontal taps
+        paddw       mm4,        mm6                 ;
+
+        movq        mm5,        mm7                 ; previous filtered row (packed bytes)
+        movq        mm6,        mm7                 ;
+
+        punpcklbw   mm5,        mm0                 ; previous row, pixels 0..3 as words
+        punpckhbw   mm6,        mm0
+
+        pmullw      mm5,        [rax]               ; previous row *= VFilter tap 0
+        pmullw      mm6,        [rax]               ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
+
+        paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
+        psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
+
+        movq        mm7,        mm3                 ; save this filtered row (packed) as the next
+        packuswb    mm7,        mm4                 ; iteration's previous row
+
+
+        pmullw      mm3,        [rax+16]            ; current row *= VFilter tap 1
+        pmullw      mm4,        [rax+16]            ;
+
+        paddw       mm3,        mm5                 ; sum of the two vertical taps
+        paddw       mm4,        mm6                 ;
+
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
+
+        paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
+        psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
+
+        packuswb    mm3,        mm4
+
+        movq        [rdi],      mm3                 ; store 8 result bytes in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,        rdx                 ; next line
+        add         rdi,        dword ptr arg(5) ;dst_pitch
+%else
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch
+        add         rsi,        rdx                 ; next line
+        add         rdi,        r8                  ;dst_pitch
+%endif
+        cmp         rdi,        rcx                 ; stop once rdi reaches dst_ptr + 8*dst_pitch
+        jne         .next_row_8x8
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+; Bilinear sub-pixel prediction of an 8-wide, 4-row block: horizontal 2-tap pass, then vertical 2-tap pass.
+;void vp8_bilinear_predict8x4_mmx
+;(
+;    unsigned char  *src_ptr,           ; 9 bytes are read from each of 5 source rows (1 primed + 1 per output row)
+;    int   src_pixels_per_line,         ; byte step between source rows
+;    int  xoffset,                      ; picks the 32-byte horizontal entry of vp8_bilinear_filters_x86_8
+;    int  yoffset,                      ; picks the 32-byte vertical entry of vp8_bilinear_filters_x86_8
+;    unsigned char *dst_ptr,            ; receives 8 bytes per row
+;    int dst_pitch                      ; byte step between destination rows
+;)
+global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
+sym(vp8_bilinear_predict8x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        mov         rdi,        arg(4) ;dst_ptr
+
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+        shl         rax,        5
+
+        mov         rsi,        arg(0) ;src_ptr
+        add         rax,        rcx
+
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+        movq        mm1,        [rax]               ; mm1 = HFilter tap 0 (4 words)
+
+        movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1 (4 words)
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        pxor        mm0,        mm0                 ; mm0 = 0, used to widen bytes to words
+        shl         rax,        5
+
+        add         rax,        rcx
+        lea         rcx,        [rdi+rdx*4]          ; rcx = dst_ptr + 4*dst_pitch, the loop's end marker
+
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+        ; horizontal pass on the first source row; its result primes mm7 for the loop
+        movq        mm3,        [rsi]               ; 8 source bytes: 00 01 02 03 04 05 06 07
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; mm3 = pixels 00..03 as words
+        punpckhbw   mm4,        mm0                 ; mm4 = pixels 04..07 as words
+
+        pmullw      mm3,        mm1                 ; *= HFilter tap 0
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ; same row shifted by one pixel: 01..08
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ; mm5 = pixels 01..04 as words
+        punpckhbw   mm6,        mm0                 ; mm6 = pixels 05..08 as words
+
+        pmullw      mm5,        mm2                 ; *= HFilter tap 1
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ; sum of the two horizontal taps
+        paddw       mm4,        mm6                 ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
+
+        paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
+        psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
+
+        movq        mm7,        mm3                 ; mm7 = horizontally filtered row packed to 8 bytes,
+        packuswb    mm7,        mm4                 ; kept as the "previous row" for the vertical pass
+
+        add         rsi,        rdx                 ; next line
+.next_row_8x4:
+        movq        mm3,        [rsi]               ; 8 source bytes: 00 01 02 03 04 05 06 07
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; mm3 = pixels 00..03 as words
+        punpckhbw   mm4,        mm0                 ; mm4 = pixels 04..07 as words
+
+        pmullw      mm3,        mm1                 ; *= HFilter tap 0
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ; same row shifted by one pixel: 01..08
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ; mm5 = pixels 01..04 as words
+        punpckhbw   mm6,        mm0                 ; mm6 = pixels 05..08 as words
+
+        pmullw      mm5,        mm2                 ; *= HFilter tap 1
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ; sum of the two horizontal taps
+        paddw       mm4,        mm6                 ;
+
+        movq        mm5,        mm7                 ; previous filtered row (packed bytes)
+        movq        mm6,        mm7                 ;
+
+        punpcklbw   mm5,        mm0                 ; previous row, pixels 0..3 as words
+        punpckhbw   mm6,        mm0
+
+        pmullw      mm5,        [rax]               ; previous row *= VFilter tap 0
+        pmullw      mm6,        [rax]               ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
+
+        paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
+        psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
+
+        movq        mm7,        mm3                 ; save this filtered row (packed) as the next
+        packuswb    mm7,        mm4                 ; iteration's previous row
+
+
+        pmullw      mm3,        [rax+16]            ; current row *= VFilter tap 1
+        pmullw      mm4,        [rax+16]            ;
+
+        paddw       mm3,        mm5                 ; sum of the two vertical taps
+        paddw       mm4,        mm6                 ;
+
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
+
+        paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
+        psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
+
+        packuswb    mm3,        mm4
+
+        movq        [rdi],      mm3                 ; store 8 result bytes in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,        rdx                 ; next line
+        add         rdi,        dword ptr arg(5) ;dst_pitch
+%else
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch
+        add         rsi,        rdx                 ; next line
+        add         rdi,        r8
+%endif
+        cmp         rdi,        rcx                 ; stop once rdi reaches dst_ptr + 4*dst_pitch
+        jne         .next_row_8x4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+; Bilinear sub-pixel prediction of a 4-wide, 4-row block: horizontal 2-tap pass, then vertical 2-tap pass.
+;void vp8_bilinear_predict4x4_mmx
+;(
+;    unsigned char  *src_ptr,           ; 5 bytes are read from each of 5 source rows (1 primed + 1 per output row)
+;    int   src_pixels_per_line,         ; byte step between source rows
+;    int  xoffset,                      ; picks the 32-byte horizontal entry of vp8_bilinear_filters_x86_8
+;    int  yoffset,                      ; picks the 32-byte vertical entry of vp8_bilinear_filters_x86_8
+;    unsigned char *dst_ptr,            ; receives 4 bytes per row
+;    int dst_pitch                      ; byte step between destination rows
+;)
+global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
+sym(vp8_bilinear_predict4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        mov         rdi,        arg(4) ;dst_ptr
+
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+        shl         rax,        5
+
+        add         rax,        rcx ; HFilter
+        mov         rsi,        arg(0) ;src_ptr
+
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+        movq        mm1,        [rax]               ; mm1 = HFilter tap 0 (4 words)
+
+        movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1 (4 words)
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        pxor        mm0,        mm0                 ; mm0 = 0, used to widen bytes to words
+        shl         rax,        5
+
+        add         rax,        rcx
+        lea         rcx,        [rdi+rdx*4]          ; rcx = dst_ptr + 4*dst_pitch, the loop's end marker
+
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+        ; horizontal pass on the first source row; its result primes mm7 for the loop
+        movd        mm3,        [rsi]               ; 4 source bytes: 00 01 02 03
+        punpcklbw   mm3,        mm0                 ; mm3 = pixels 00..03 as words
+
+        pmullw      mm3,        mm1                 ; *= HFilter tap 0
+        movd        mm5,        [rsi+1]             ; same row shifted by one pixel: 01..04
+
+        punpcklbw   mm5,        mm0                 ; mm5 = pixels 01..04 as words
+        pmullw      mm5,        mm2                 ; *= HFilter tap 1
+
+        paddw       mm3,        mm5                 ; sum of the two horizontal taps
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
+
+        movq        mm7,        mm3                 ; mm7 = horizontally filtered row packed to bytes,
+        packuswb    mm7,        mm0                 ; kept as the "previous row" for the vertical pass
+
+        add         rsi,        rdx                 ; next line
+.next_row_4x4:
+        movd        mm3,        [rsi]               ; 4 source bytes: 00 01 02 03
+        punpcklbw   mm3,        mm0                 ; mm3 = pixels 00..03 as words
+
+        pmullw      mm3,        mm1                 ; *= HFilter tap 0
+        movd        mm5,        [rsi+1]             ; same row shifted by one pixel: 01..04
+
+        punpcklbw   mm5,        mm0                 ; mm5 = pixels 01..04 as words
+        pmullw      mm5,        mm2                 ; *= HFilter tap 1
+
+        paddw       mm3,        mm5                 ; sum of the two horizontal taps
+
+        movq        mm5,        mm7                 ; previous filtered row (packed bytes)
+        punpcklbw   mm5,        mm0                 ; previous row as words
+
+        pmullw      mm5,        [rax]               ; previous row *= VFilter tap 0
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
+        movq        mm7,        mm3                 ; save this filtered row (packed) as the next
+
+        packuswb    mm7,        mm0                 ; iteration's previous row
+
+        pmullw      mm3,        [rax+16]            ; current row *= VFilter tap 1
+        paddw       mm3,        mm5                 ; sum of the two vertical taps
+
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
+
+        packuswb    mm3,        mm0
+        movd        [rdi],      mm3                 ; store 4 result bytes in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,        rdx                 ; next line
+        add         rdi,        dword ptr arg(5) ;dst_pitch
+%else
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch
+        add         rsi,        rdx                 ; next line
+        add         rdi,        r8
+%endif
+
+        cmp         rdi,        rcx                 ; stop once rdi reaches dst_ptr + 4*dst_pitch
+        jne         .next_row_4x4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+SECTION_RODATA
+align 16
+rd:
+    times 4 dw 0x40     ; four words of 64 (half of 128): the round value added before each >> VP8_FILTER_SHIFT
+
+align 16
+global HIDDEN_DATA(sym(vp8_six_tap_mmx))
+sym(vp8_six_tap_mmx):
+    times 8 dw 0        ; entry 0 = {0, 0, 128, 0, 0, 0}. Layout: 8 entries x 6 taps, each tap repeated as 8 words (16 bytes)
+    times 8 dw 0        ; so tap k of an entry sits at byte offset 16*k, matching the [filter + 16*k] reads in the
+    times 8 dw 128      ; 6-tap routines of this file. Every entry's taps sum to 128.
+    times 8 dw 0        ; NOTE(review): presumably indexed by sub-pixel offset - confirm against the callers.
+    times 8 dw 0
+    times 8 dw 0
+
+    times 8 dw 0        ; entry 1 = {0, -6, 123, 12, -1, 0}
+    times 8 dw -6
+    times 8 dw 123
+    times 8 dw 12
+    times 8 dw -1
+    times 8 dw 0
+
+    times 8 dw 2        ; entry 2 = {2, -11, 108, 36, -8, 1}
+    times 8 dw -11
+    times 8 dw 108
+    times 8 dw 36
+    times 8 dw -8
+    times 8 dw 1
+
+    times 8 dw 0        ; entry 3 = {0, -9, 93, 50, -6, 0}
+    times 8 dw -9
+    times 8 dw 93
+    times 8 dw 50
+    times 8 dw -6
+    times 8 dw 0
+
+    times 8 dw 3        ; entry 4 = {3, -16, 77, 77, -16, 3}
+    times 8 dw -16
+    times 8 dw 77
+    times 8 dw 77
+    times 8 dw -16
+    times 8 dw 3
+
+    times 8 dw 0        ; entry 5 = {0, -6, 50, 93, -9, 0}
+    times 8 dw -6
+    times 8 dw 50
+    times 8 dw 93
+    times 8 dw -9
+    times 8 dw 0
+
+    times 8 dw 1        ; entry 6 = {1, -8, 36, 108, -11, 2}
+    times 8 dw -8
+    times 8 dw 36
+    times 8 dw 108
+    times 8 dw -11
+    times 8 dw 2
+
+    times 8 dw 0        ; entry 7 = {0, -1, 12, 123, -6, 0}
+    times 8 dw -1
+    times 8 dw 12
+    times 8 dw 123
+    times 8 dw -6
+    times 8 dw 0
+
+
diff --git a/libs/libvpx/vp8/common/x86/subpixel_sse2.asm b/libs/libvpx/vp8/common/x86/subpixel_sse2.asm
new file mode 100644
index 0000000000..69f8d103c1
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/subpixel_sse2.asm
@@ -0,0 +1,1372 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT  7
+
+
+;/************************************************************************************
+; Notes: vp8_filter_block1d8_h6_sse2 applies a 6-tap filter horizontally to the input
+; pixels. It produces 8 outputs per row as 16-bit words, ONE row per loop iteration, using
+; 128-bit operations. The row loop tests after the first row, so output_height must be
+; non-zero; nothing in this loop requires output_height to be even.
+;*************************************************************************************/
+;void vp8_filter_block1d8_h6_sse2
+;(
+;    unsigned char  *src_ptr,              ; source bytes; each row is read from src_ptr-2 through src_ptr+13
+;    unsigned short *output_ptr,           ; receives 8 words per row via movdqa, so it must be 16-byte aligned
+;    unsigned int    src_pixels_per_line,  ; byte step between source rows
+;    unsigned int    pixel_step,           ; not read by this routine
+;    unsigned int    output_height,        ; row count (must be non-zero)
+;    unsigned int    output_width,         ; added to output_ptr after each row, i.e. a byte step
+;    short           *vp8_filter           ; six taps at 16-byte spacing, each read as a 16-byte memory operand
+;)
+global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
+sym(vp8_filter_block1d8_h6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,        arg(6) ;vp8_filter
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(1) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5) ;output_width
+%endif
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d8_h6_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]        ; source bytes -2..5
+        movq        xmm1,       MMWORD PTR [rsi + 6]        ; source bytes 6..13
+
+        prefetcht2  [rsi+rax-2]                             ; prefetch the next source row
+
+        pslldq      xmm1,       8                           ; move bytes 6..13 into the high half
+        por         xmm1,       xmm3                        ; xmm1 = source bytes -2..13
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+
+        paddsw      xmm4,       xmm7                        ; accumulate the six products with signed saturation
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]                ; += round value
+
+        psraw       xmm4,       7                           ; >>= 7 (arithmetic), i.e. divide by 128
+
+        packuswb    xmm4,       xmm0                        ; pack to bytes with unsigned saturation (clamps to 0..255) ...
+        punpcklbw   xmm4,       xmm0                        ; ... then widen back to 8 words
+
+        movdqa      XMMWORD Ptr [rdi],         xmm4         ; store 8 result words (aligned store)
+        lea         rsi,        [rsi + rax]                 ; next source row
+
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(5) ;[output_width]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx
+
+        jnz         .filter_block1d8_h6_rowloop                ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_filter_block1d16_h6_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned short *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    pixel_step,
+;    unsigned int    output_height,
+;    unsigned int    output_width,
+;    short           *vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
+; row each iteration to take advantage of the 128-bit operations.
+;*************************************************************************************/
+global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
+sym(vp8_filter_block1d16_h6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; Horizontal 6-tap filter: each loop iteration turns one source row into 16 word results.
+        mov         rdx,        arg(6) ;vp8_filter: six 16-byte tap vectors read at +0, +16, ... +80
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(1) ;output_ptr (stored to with movdqa, so it must be 16-byte aligned)
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height = number of rows (loop count)
+        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5) ;output_width: added to rdi (a byte address) after each row
+%endif
+
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d16_h6_sse2_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]        ; src[-2..5]
+        movq        xmm1,       MMWORD PTR [rsi + 6]        ; src[6..13]
+
+        movq        xmm2,       MMWORD PTR [rsi +14]        ; src[14..21]
+        pslldq      xmm2,       8
+
+        por         xmm2,       xmm1                        ; xmm2 = src[6..21], input for outputs 8-15
+        prefetcht2  [rsi+rax-2]                             ; prefetch the start of the next source row
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3                        ; xmm1 = src[-2..13], input for outputs 0-7
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+        paddsw      xmm4,       xmm7                        ; accumulate the six products with signed saturation
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding constant
+
+        psraw       xmm4,       7                           ; arithmetic shift right by 7
+
+        packuswb    xmm4,       xmm0                        ; saturate each word to an unsigned byte (0..255)
+        punpcklbw   xmm4,       xmm0                        ; widen the 8 clamped bytes back to words
+
+        movdqa      XMMWORD Ptr [rdi],         xmm4         ; store output words 0-7
+
+        movdqa      xmm3,       xmm2
+        movdqa      xmm4,       xmm2
+
+        movdqa      xmm5,       xmm2
+        movdqa      xmm6,       xmm2
+
+        movdqa      xmm7,       xmm2
+        ; Outputs 8-15: same six taps applied to xmm2; lane labels below are relative to src + 8.
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm2
+        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding constant
+
+        psraw       xmm4,       7                           ; arithmetic shift right by 7
+
+        packuswb    xmm4,       xmm0                        ; saturate each word to an unsigned byte (0..255)
+        punpcklbw   xmm4,       xmm0                        ; widen the 8 clamped bytes back to words
+
+        movdqa      XMMWORD Ptr [rdi+16],      xmm4         ; store output words 8-15
+
+        lea         rsi,        [rsi + rax]                 ; next source row
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(5) ;[output_width]
+%else
+        add         rdi,        r8
+%endif
+
+        dec         rcx
+        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_filter_block1d8_v6_sse2
+;(
+;    short *src_ptr,
+;    unsigned char *output_ptr,
+;    int dst_pitch,
+;    unsigned int pixels_per_line,
+;    unsigned int pixel_step,          (not read by this routine)
+;    unsigned int output_height,
+;    unsigned int output_width,        (not read by this routine)
+;    short * vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. Each output
+; row reads 6 input rows starting at src_ptr - 2*pixels_per_line; output_height rows are written.
+;*************************************************************************************/
+global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
+sym(vp8_filter_block1d8_v6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rax,        arg(7) ;vp8_filter: six 16-byte tap vectors read at +0, +16, ... +80
+        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line (used below as a byte offset between rows)
+
+        mov         rdi,        arg(1) ;output_ptr
+        mov         rsi,        arg(0) ;src_ptr
+
+        sub         rsi,        rdx                         ; step back two rows so that [rsi] is
+        sub         rsi,        rdx                         ; the row multiplied by the first tap
+
+        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
+        pxor        xmm0,       xmm0                        ; clear xmm0
+
+        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding constant, kept in xmm7 for the whole loop
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp8_filter_block1d8_v6_sse2_loop:
+        movdqa      xmm1,       XMMWORD PTR [rsi]           ; row 0 (8 words; movdqa needs 16-byte alignment)
+        pmullw      xmm1,       [rax]                       ; * tap 1
+
+        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]     ; row 1
+        pmullw      xmm2,       [rax + 16]                  ; * tap 2
+
+        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2] ; row 2
+        pmullw      xmm3,       [rax + 32]                  ; * tap 3
+
+        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4] ; row 4
+        pmullw      xmm5,       [rax + 64]                  ; * tap 5
+
+        add         rsi,        rdx                         ; advance one row; the only source step in the loop
+        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2] ; row 3 (relative to rsi at loop entry)
+
+        pmullw      xmm4,       [rax + 48]                  ; * tap 4
+        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4] ; row 5 (relative to rsi at loop entry)
+
+        pmullw      xmm6,       [rax + 80]                  ; * tap 6
+
+        paddsw      xmm2,       xmm5                        ; accumulate the six products with signed saturation
+        paddsw      xmm2,       xmm3
+
+        paddsw      xmm2,       xmm1
+        paddsw      xmm2,       xmm4
+
+        paddsw      xmm2,       xmm6
+        paddsw      xmm2,       xmm7                        ; add the rounding constant
+
+        psraw       xmm2,       7                           ; arithmetic shift right by 7
+        packuswb    xmm2,       xmm0              ; pack and saturate
+
+        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx         ; decrement count
+        jnz         .vp8_filter_block1d8_v6_sse2_loop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_filter_block1d16_v6_sse2
+;(
+;    unsigned short *src_ptr,
+;    unsigned char *output_ptr,
+;    int dst_pitch,
+;    unsigned int pixels_per_line,
+;    unsigned int pixel_step,          (not read by this routine)
+;    unsigned int output_height,
+;    unsigned int output_width,        (not read by this routine)
+;    const short    *vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. Each output
+; row reads 6 input rows starting at src_ptr - 2*pixels_per_line; output_height rows are written.
+;*************************************************************************************/
+global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
+sym(vp8_filter_block1d16_v6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rax,        arg(7) ;vp8_filter: six 16-byte tap vectors read at +0, +16, ... +80
+        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line (used below as a byte offset between rows)
+
+        mov         rdi,        arg(1) ;output_ptr (stored to with movdqa, so it must be 16-byte aligned)
+        mov         rsi,        arg(0) ;src_ptr
+
+        sub         rsi,        rdx                         ; step back two rows so that [rsi] is
+        sub         rsi,        rdx                         ; "line 1", the row multiplied by the first tap
+
+        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp8_filter_block1d16_v6_sse2_loop:
+; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
+        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
+        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]  ; line 2, words 8-15
+        pmullw      xmm1,       [rax + 16]
+        pmullw      xmm2,       [rax + 16]
+
+        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
+        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
+        pmullw      xmm3,       [rax + 64]
+        pmullw      xmm4,       [rax + 64]
+
+        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
+        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
+        pmullw      xmm5,       [rax + 32]
+        pmullw      xmm6,       [rax + 32]
+
+        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
+        movdqa      xmm0,       XMMWORD PTR [rsi + 16]  ; xmm0 is used as a data register here
+        pmullw      xmm7,       [rax]
+        pmullw      xmm0,       [rax]
+
+        paddsw      xmm1,       xmm3                        ; xmm1 accumulates words 0-7,
+        paddsw      xmm2,       xmm4                        ; xmm2 accumulates words 8-15
+        paddsw      xmm1,       xmm5
+        paddsw      xmm2,       xmm6
+        paddsw      xmm1,       xmm7
+        paddsw      xmm2,       xmm0
+
+        add         rsi,        rdx                         ; advance one row; the only source step in the loop
+
+        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
+        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
+        pmullw      xmm3,       [rax + 48]
+        pmullw      xmm4,       [rax + 48]
+
+        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
+        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
+        pmullw      xmm5,       [rax + 80]
+        pmullw      xmm6,       [rax + 80]
+
+        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding constant (reloaded: xmm7 held line 1 above)
+        pxor        xmm0,       xmm0                        ; clear xmm0 -- NOTE(review): not read again before it is reloaded at the loop top
+
+        paddsw      xmm1,       xmm3
+        paddsw      xmm2,       xmm4
+        paddsw      xmm1,       xmm5
+        paddsw      xmm2,       xmm6
+
+        paddsw      xmm1,       xmm7                        ; add the rounding constant
+        paddsw      xmm2,       xmm7
+
+        psraw       xmm1,       7                           ; arithmetic shift right by 7
+        psraw       xmm2,       7
+
+        packuswb    xmm1,       xmm2              ; pack and saturate
+        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx         ; decrement count
+        jnz         .vp8_filter_block1d16_v6_sse2_loop              ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_filter_block1d8_h6_only_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    int dst_pitch,
+;    unsigned int    output_height,
+;    const short    *vp8_filter
+;)
+; First-pass filter only when yoffset==0
+global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
+sym(vp8_filter_block1d8_h6_only_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; Horizontal 6-tap filter: each loop iteration turns one source row into 8 output bytes.
+        mov         rdx,        arg(5) ;vp8_filter: six 16-byte tap vectors read at +0, +16, ... +80
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(2) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height = number of rows (loop count)
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(3) ;dst_pitch
+%endif
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d8_h6_only_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]        ; src[-2..5]
+        movq        xmm1,       MMWORD PTR [rsi + 6]        ; src[6..13]
+
+        prefetcht2  [rsi+rax-2]                             ; prefetch the start of the next source row
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3                        ; xmm1 = src[-2..13]
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+
+        paddsw      xmm4,       xmm7                        ; accumulate the six products with signed saturation
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding constant
+
+        psraw       xmm4,       7                           ; arithmetic shift right by 7
+
+        packuswb    xmm4,       xmm0                        ; saturate each word to an unsigned byte (0..255)
+
+        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
+        lea         rsi,        [rsi + rax]                 ; next source row
+
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx
+
+        jnz         .filter_block1d8_h6_only_rowloop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_filter_block1d16_h6_only_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    int dst_pitch,
+;    unsigned int    output_height,
+;    const short    *vp8_filter
+;)
+; First-pass filter only when yoffset==0
+global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
+sym(vp8_filter_block1d16_h6_only_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; Horizontal 6-tap filter: each loop iteration turns one source row into 16 output bytes.
+        mov         rdx,        arg(5) ;vp8_filter: six 16-byte tap vectors read at +0, +16, ... +80
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(2) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height = number of rows (loop count)
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(3) ;dst_pitch
+%endif
+
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d16_h6_only_sse2_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]        ; src[-2..5]
+        movq        xmm1,       MMWORD PTR [rsi + 6]        ; src[6..13]
+
+        movq        xmm2,       MMWORD PTR [rsi +14]        ; src[14..21]
+        pslldq      xmm2,       8
+
+        por         xmm2,       xmm1                        ; xmm2 = src[6..21], input for outputs 8-15
+        prefetcht2  [rsi+rax-2]                             ; prefetch the start of the next source row
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3                        ; xmm1 = src[-2..13], input for outputs 0-7
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+        paddsw      xmm4,       xmm7                        ; accumulate the six products with signed saturation
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding constant
+
+        psraw       xmm4,       7                           ; arithmetic shift right by 7
+
+        packuswb    xmm4,       xmm0                        ; lower 8 bytes
+
+        movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
+
+        movdqa      xmm3,       xmm2
+        movdqa      xmm4,       xmm2
+
+        movdqa      xmm5,       xmm2
+        movdqa      xmm6,       xmm2
+
+        movdqa      xmm7,       xmm2
+        ; Outputs 8-15: same six taps applied to xmm2; lane labels below are relative to src + 8.
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm2
+        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding constant
+
+        psraw       xmm4,       7                           ; arithmetic shift right by 7
+
+        packuswb    xmm4,       xmm0                        ; higher 8 bytes
+
+        movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
+
+        lea         rsi,        [rsi + rax]                 ; next source row
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
+%else
+        add         rdi,        r8
+%endif
+
+        dec         rcx
+        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_filter_block1d8_v6_only_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    int dst_pitch,
+;    unsigned int output_height,
+;    const short    *vp8_filter
+;)
+; Second-pass filter only when xoffset==0
+global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
+sym(vp8_filter_block1d8_v6_only_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; Vertical 6-tap filter on byte pixels: each loop iteration writes 8 output bytes.
+        mov         rsi,        arg(0) ;src_ptr: used as-is as the row for tap 1 (no adjustment is made here)
+        mov         rdi,        arg(2) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+        mov         rax,        arg(5) ;vp8_filter: six 16-byte tap vectors read at +0, +16, ... +80
+
+        pxor        xmm0,       xmm0                        ; clear xmm0
+
+        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding constant, kept in xmm7 for the whole loop
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(3) ; dst_pitch
+%endif
+
+.vp8_filter_block1d8_v6_only_sse2_loop:
+        movq        xmm1,       MMWORD PTR [rsi]            ; row 0
+        movq        xmm2,       MMWORD PTR [rsi + rdx]      ; row 1
+        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]  ; row 2
+        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]  ; row 4
+        add         rsi,        rdx                         ; advance one row; the only source step in the loop
+        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]  ; row 3 (relative to rsi at loop entry)
+        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]  ; row 5 (relative to rsi at loop entry)
+
+        punpcklbw   xmm1,       xmm0                        ; zero-extend the 8 bytes to words
+        pmullw      xmm1,       [rax]                       ; * tap 1
+
+        punpcklbw   xmm2,       xmm0
+        pmullw      xmm2,       [rax + 16]                  ; * tap 2
+
+        punpcklbw   xmm3,       xmm0
+        pmullw      xmm3,       [rax + 32]                  ; * tap 3
+
+        punpcklbw   xmm5,       xmm0
+        pmullw      xmm5,       [rax + 64]                  ; * tap 5
+
+        punpcklbw   xmm4,       xmm0
+        pmullw      xmm4,       [rax + 48]                  ; * tap 4
+
+        punpcklbw   xmm6,       xmm0
+        pmullw      xmm6,       [rax + 80]                  ; * tap 6
+
+        paddsw      xmm2,       xmm5                        ; accumulate the six products with signed saturation
+        paddsw      xmm2,       xmm3
+
+        paddsw      xmm2,       xmm1
+        paddsw      xmm2,       xmm4
+
+        paddsw      xmm2,       xmm6
+        paddsw      xmm2,       xmm7                        ; add the rounding constant
+
+        psraw       xmm2,       7                           ; arithmetic shift right by 7
+        packuswb    xmm2,       xmm0              ; pack and saturate
+
+        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(3) ;[dst_pitch]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx         ; decrement count
+        jnz         .vp8_filter_block1d8_v6_only_sse2_loop              ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_unpack_block1d16_h6_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned short *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    output_height,
+;    unsigned int    output_width
+;)
+global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
+sym(vp8_unpack_block1d16_h6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; No filtering: each loop iteration zero-extends 16 source bytes into 16 output words.
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(1) ;output_ptr (stored to with movdqa, so it must be 16-byte aligned)
+
+        movsxd      rcx,        dword ptr arg(3) ;output_height = number of rows (loop count)
+        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
+
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(4) ;output_width            ; added to rdi (a byte address) after each row
+%endif
+
+.unpack_block1d16_h6_sse2_rowloop:
+        movq        xmm1,       MMWORD PTR [rsi]            ; src[0..7]
+        movq        xmm3,       MMWORD PTR [rsi+8]          ; src[8..15]
+
+        punpcklbw   xmm3,       xmm0                        ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
+        punpcklbw   xmm1,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+
+        movdqa      XMMWORD Ptr [rdi],         xmm1         ; output words 0-7
+        movdqa      XMMWORD Ptr [rdi + 16],    xmm3         ; output words 8-15
+
+        lea         rsi,        [rsi + rax]                 ; next source row
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(4) ;[output_width]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx
+        jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_bilinear_predict16x16_sse2
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+extern sym(vp8_bilinear_filters_x86_8)
+global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
+sym(vp8_bilinear_predict16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
+
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+
+        cmp         rax,        0      ;skip first_pass filter if xoffset=0
+        je          .b16x16_sp_only
+
+        shl         rax,        5
+        add         rax,        rcx    ;HFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+
+        movdqa      xmm1,       [rax]
+        movdqa      xmm2,       [rax+16]
+
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        cmp         rax,        0      ;skip second_pass filter if yoffset=0
+        je          .b16x16_fp_only
+
+        shl         rax,        5
+        add         rax,        rcx    ;VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+        pxor        xmm0,       xmm0
+
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch
+%endif
+        ; get the first horizontal line done
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm1
+
+        movdqu      xmm5,       [rsi+1]
+        movdqa      xmm6,       xmm5
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       xmm2
+        pmullw      xmm6,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP8_FILTER_SHIFT
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm4
+
+        add         rsi,        rdx                 ; next line
+.next_row:
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm1
+
+        movdqu      xmm5,       [rsi+1]
+        movdqa      xmm6,       xmm5
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       xmm2
+        pmullw      xmm6,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        movdqa      xmm5,       xmm7
+        movdqa      xmm6,       xmm7
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       [rax]
+        pmullw      xmm6,       [rax]
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP8_FILTER_SHIFT
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm4
+
+        pmullw      xmm3,       [rax+16]
+        pmullw      xmm4,       [rax+16]
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP8_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm4
+        movdqa      [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsi,        rdx                 ; next line
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(5) ;dst_pitch
+%else
+        add         rdi,        r8
+%endif
+
+        cmp         rdi,        rcx
+        jne         .next_row
+
+        jmp         .done
+
+.b16x16_sp_only:
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+        shl         rax,        5
+        add         rax,        rcx    ;VFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+
+        movdqa      xmm1,       [rax]
+        movdqa      xmm2,       [rax+16]
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
+
+        pxor        xmm0,       xmm0
+
+        ; get the first horizontal line done
+        movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+        add         rsi,        rax                 ; next line
+.next_row_spo:
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+        movdqa      xmm5,       xmm7
+        movdqa      xmm6,       xmm7
+
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+        movdqa      xmm7,       xmm3
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm5,       xmm1
+        pmullw      xmm6,       xmm1
+        pmullw      xmm3,       xmm2
+        pmullw      xmm4,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP8_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm4
+        movdqa      [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsi,        rax                 ; next line
+        add         rdi,        rdx                 ;dst_pitch
+        cmp         rdi,        rcx
+        jne         .next_row_spo
+
+        jmp         .done
+
+.b16x16_fp_only:
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
+        pxor        xmm0,       xmm0
+
+.next_row_fpo:
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm1
+
+        movdqu      xmm5,       [rsi+1]
+        movdqa      xmm6,       xmm5
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       xmm2
+        pmullw      xmm6,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP8_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm4
+        movdqa      [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsi,        rax                 ; next line
+        add         rdi,        rdx                 ; dst_pitch
+        cmp         rdi,        rcx
+        jne         .next_row_fpo
+
+.done:
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_bilinear_predict8x8_sse2      -- two-pass (horizontal, then vertical) 2-tap filter producing 8 rows of 8 bytes
+;(
+;    unsigned char  *src_ptr,           source pixels; 9 rows of 16 bytes each are read
+;    int   src_pixels_per_line,         source row stride in bytes
+;    int  xoffset,                      selects the horizontal tap pair (table entries are 32 bytes apart)
+;    int  yoffset,                      selects the vertical tap pair (table entries are 32 bytes apart)
+;    unsigned char *dst_ptr,            destination; 8 bytes are stored per row
+;    int dst_pitch                      destination row stride in bytes
+;)
+global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
+sym(vp8_bilinear_predict8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; NOTE(review): the matching "pop rsp" below presumably relies on ALIGN_STACK pushing the old rsp -- confirm in x86_abi_support.asm
+    ALIGN_STACK 16, rax
+    sub         rsp, 144                         ; reserve 144 bytes = 9 rows x 16 bytes
+
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+    ;Read 9-line unaligned data in and put them on stack. This gives a big
+    ;performance boost.
+        movdqu      xmm0,       [rsi]                       ; row 0
+        lea         rax,        [rdx + rdx*2]               ; rax = 3 * stride
+        movdqu      xmm1,       [rsi+rdx]                   ; row 1
+        movdqu      xmm2,       [rsi+rdx*2]                 ; row 2
+        add         rsi,        rax                         ; rsi -> row 3
+        movdqu      xmm3,       [rsi]                       ; row 3
+        movdqu      xmm4,       [rsi+rdx]                   ; row 4
+        movdqu      xmm5,       [rsi+rdx*2]                 ; row 5
+        add         rsi,        rax                         ; rsi -> row 6
+        movdqu      xmm6,       [rsi]                       ; row 6
+        movdqu      xmm7,       [rsi+rdx]                   ; row 7
+
+        movdqa      XMMWORD PTR [rsp],            xmm0      ; spill row 0 so xmm0 can be reused for row 8
+
+        movdqu      xmm0,       [rsi+rdx*2]                 ; row 8
+
+        movdqa      XMMWORD PTR [rsp+16],         xmm1
+        movdqa      XMMWORD PTR [rsp+32],         xmm2
+        movdqa      XMMWORD PTR [rsp+48],         xmm3
+        movdqa      XMMWORD PTR [rsp+64],         xmm4
+        movdqa      XMMWORD PTR [rsp+80],         xmm5
+        movdqa      XMMWORD PTR [rsp+96],         xmm6
+        movdqa      XMMWORD PTR [rsp+112],        xmm7
+        movdqa      XMMWORD PTR [rsp+128],        xmm0
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        shl         rax,        5                           ; xoffset * 32 bytes per table entry
+        add         rax,        rcx    ;HFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+
+        movdqa      xmm1,       [rax]                       ; horizontal tap applied to pixel x
+        movdqa      xmm2,       [rax+16]                    ; horizontal tap applied to pixel x+1
+
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+        shl         rax,        5                           ; yoffset * 32 bytes per table entry
+        add         rax,        rcx    ;VFilter
+
+        lea         rcx,        [rdi+rdx*8]                 ; rcx = end pointer, 8 destination rows past dst_ptr
+
+        movdqa      xmm5,       [rax]                       ; vertical tap applied to the previous filtered row
+        movdqa      xmm6,       [rax+16]                    ; vertical tap applied to the current filtered row
+
+        pxor        xmm0,       xmm0                        ; zero, used to widen bytes to words and for packing
+
+        ; get the first horizontal line done
+        movdqa      xmm3,       XMMWORD PTR [rsp]
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+        psrldq      xmm4,       1                    ; shift right one byte so lane i holds pixel i+1
+
+        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
+        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm2
+
+        paddw       xmm3,       xmm4
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        movdqa      xmm7,       xmm3                 ; xmm7 = horizontally filtered row 0 (the "previous" row)
+        add         rsp,        16                 ; next line
+.next_row8x8:
+        movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+        psrldq      xmm4,       1
+
+        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
+        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm2
+
+        paddw       xmm3,       xmm4
+        pmullw      xmm7,       xmm5                 ; previous filtered row * first vertical tap
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        movdqa      xmm4,       xmm3                 ; keep the unscaled current row for the next iteration
+
+        pmullw      xmm3,       xmm6                 ; current filtered row * second vertical tap
+        paddw       xmm3,       xmm7
+
+        movdqa      xmm7,       xmm4                 ; current row becomes the previous row
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        packuswb    xmm3,       xmm0                 ; saturate words to unsigned bytes
+        movq        [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsp,        16                 ; next line
+        add         rdi,        rdx
+
+        cmp         rdi,        rcx
+        jne         .next_row8x8
+
+    ;add rsp, 144   (not needed: rsp has already advanced 16 bytes before the loop plus 16 per stored row, 9 x 16 = 144)
+    pop rsp                                          ; restore the stack pointer from the value stored at [rsp]
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16                 ; rd is referenced as a 128-bit memory operand (paddw xmm, [GLOBAL(rd)])
+rd:                      ; rounding term added before each psraw by VP8_FILTER_SHIFT
+    times 8 dw 0x40      ; eight 16-bit words, each 0x40 (64)
diff --git a/libs/libvpx/vp8/common/x86/subpixel_ssse3.asm b/libs/libvpx/vp8/common/x86/subpixel_ssse3.asm
new file mode 100644
index 0000000000..c06f24556e
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/subpixel_ssse3.asm
@@ -0,0 +1,1508 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT  7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in horizontal direction, calculating ONE
+; row each iteration to take advantage of the 128-bit operations.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8
+;
+;*************************************************************************************/
+;void vp8_filter_block1d8_h6_ssse3      -- horizontal 6-tap filter (4-tap shortcut), 8 output bytes per row
+;(
+;    unsigned char  *src_ptr,               source; each row is read from src_ptr-2 up to src_ptr+10
+;    unsigned int    src_pixels_per_line,   source row stride in bytes
+;    unsigned char *output_ptr,             destination; 8 bytes are stored per row
+;    unsigned int    output_pitch,          destination row stride in bytes
+;    unsigned int    output_height,         number of rows to produce (loop counter)
+;    unsigned int    vp8_filter_index       index into the 16-byte-per-entry coefficient tables
+;)
+global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d8_h6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi                ; rsi = 0, compared against the first coefficients below
+    shl         rdx, 4                  ; index * 16 bytes per table entry
+
+    movdqa      xmm7, [GLOBAL(rd)]      ; rounding term, kept in xmm7 for both the 6-tap and 4-tap loops
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+    mov         rdi, arg(2)             ;output_ptr
+
+    cmp         esi, DWORD PTR [rax]    ; first 4 bytes of the k0_k5 entry all zero?
+    je          vp8_filter_block1d8_h4_ssse3    ; yes: take the path that skips the k0_k5 multiply
+
+    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+    sub         rdi, rdx                ; pre-bias: the loop advances rdi by rdx before each store
+;xmm3 free
+.filter_block1d8_h6_rowloop_ssse3:
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm1,   xmm0
+    pmaddubsw   xmm0,   xmm4                    ; pairs (x-2, x+3) times k0_k5, summed per pair into words
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]   ; re-pair the bytes for the k2_k4 multiply (table not shown here)
+
+    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]   ; re-pair the bytes for the k1_k3 multiply (table not shown here)
+    pmaddubsw   xmm1,   xmm5
+
+    lea         rdi,    [rdi + rdx]             ; advance to this row's destination (lea leaves flags alone)
+    pmaddubsw   xmm2,   xmm6
+
+    lea         rsi,    [rsi + rax]             ; next source row
+    dec         rcx                             ; sets ZF for the jnz below; nothing in between writes flags
+
+    paddsw      xmm0,   xmm1
+    paddsw      xmm2,   xmm7                    ; add the rounding term
+
+    paddsw      xmm0,   xmm2
+
+    psraw       xmm0,   7
+
+    packuswb    xmm0,   xmm0                    ; saturate words to unsigned bytes
+
+    movq        MMWORD Ptr [rdi], xmm0          ; store 8 output bytes
+    jnz         .filter_block1d8_h6_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+    ; 4-tap path. The label below is non-local (no leading dot), so the local labels after it are scoped to it.
+vp8_filter_block1d8_h4_ssse3:
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]   ; shuffle masks held in registers for this loop
+    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+    sub         rdi, rdx                ; rdi was loaded before the branch; same pre-bias as the 6-tap loop
+
+.filter_block1d8_h4_rowloop_ssse3:
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm2,   xmm0
+    pshufb      xmm0,   xmm3
+
+    pshufb      xmm2,   xmm4
+    pmaddubsw   xmm0,   xmm5
+
+    lea         rdi,    [rdi + rdx]
+    pmaddubsw   xmm2,   xmm6
+
+    lea         rsi,    [rsi + rax]
+    dec         rcx                             ; sets ZF for the jnz below; nothing in between writes flags
+
+    paddsw      xmm0,   xmm7                    ; xmm7 still holds the rounding term loaded before the branch
+
+    paddsw      xmm0,   xmm2
+
+    psraw       xmm0,   7
+
+    packuswb    xmm0,   xmm0
+
+    movq        MMWORD Ptr [rdi], xmm0
+
+    jnz         .filter_block1d8_h4_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;void vp8_filter_block1d16_h6_ssse3     -- horizontal 6-tap filter, 16 output bytes per row (no 4-tap shortcut)
+;(
+;    unsigned char  *src_ptr,               source; each row is read from src_ptr-2 up to src_ptr+18
+;    unsigned int    src_pixels_per_line,   source row stride in bytes
+;    unsigned char  *output_ptr,            destination; written with movdqa, so it must be 16-byte aligned
+;    unsigned int    output_pitch,          destination row stride in bytes
+;    unsigned int    output_height,         number of rows to produce (loop counter)
+;    unsigned int    vp8_filter_index       index into the 16-byte-per-entry coefficient tables
+;)
+global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d16_h6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)           ;table index
+    xor         rsi, rsi                        ; NOTE(review): dead here, rsi is overwritten with src_ptr before any read
+    shl         rdx, 4      ; index * 16 bytes per table entry
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    mov         rdi, arg(2)                     ;output_ptr
+
+    mov         rsi, arg(0)                     ;src_ptr
+
+    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)           ;output_height
+    movsxd      rdx, dword ptr arg(3)           ;output_pitch
+    ; each iteration filters output bytes 0-7 into xmm0 and bytes 8-15 into xmm3, with the two computations interleaved
+.filter_block1d16_h6_rowloop_ssse3:
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm1,   xmm0
+    pmaddubsw   xmm0,   xmm4                    ; first half: pairs (x-2, x+3) times k0_k5
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
+
+    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
+    movq        xmm3,   MMWORD PTR [rsi +  6]   ; second half: same pattern, 8 bytes further along the row
+
+    pmaddubsw   xmm1,   xmm5
+    movq        xmm7,   MMWORD PTR [rsi + 11]   ; xmm7 is scratch here; the rounding term is read from memory instead
+
+    pmaddubsw   xmm2,   xmm6
+    punpcklbw   xmm3,   xmm7
+
+    paddsw      xmm0,   xmm1
+    movdqa      xmm1,   xmm3
+
+    pmaddubsw   xmm3,   xmm4                    ; second half times k0_k5
+    paddsw      xmm0,   xmm2
+
+    movdqa      xmm2,   xmm1
+    paddsw      xmm0,   [GLOBAL(rd)]            ; first half: add rounding term
+
+    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
+    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
+
+    psraw       xmm0,   7
+    pmaddubsw   xmm1,   xmm5
+
+    pmaddubsw   xmm2,   xmm6
+    packuswb    xmm0,   xmm0                    ; first half packed to 8 bytes (duplicated in both qwords)
+
+    lea         rsi,    [rsi + rax]             ; next source row
+    paddsw      xmm3,   xmm1
+
+    paddsw      xmm3,   xmm2
+
+    paddsw      xmm3,   [GLOBAL(rd)]            ; second half: add rounding term
+
+    psraw       xmm3,   7
+
+    packuswb    xmm3,   xmm3
+
+    punpcklqdq  xmm0,   xmm3                    ; low qword = bytes 0-7, high qword = bytes 8-15
+
+    movdqa      XMMWORD Ptr [rdi], xmm0         ; aligned 16-byte store
+
+    lea         rdi,    [rdi + rdx]
+    dec         rcx
+    jnz         .filter_block1d16_h6_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_filter_block1d4_h6_ssse3      -- horizontal 6-tap filter (4-tap shortcut), 4 output bytes per row
+;(
+;    unsigned char  *src_ptr,               source; 16 bytes are read per row starting at src_ptr-2
+;    unsigned int    src_pixels_per_line,   source row stride in bytes
+;    unsigned char  *output_ptr,            destination; 4 bytes are stored per row
+;    unsigned int    output_pitch,          destination row stride in bytes
+;    unsigned int    output_height,         number of rows to produce (loop counter)
+;    unsigned int    vp8_filter_index       index into the 16-byte-per-entry coefficient tables
+;)
+global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d4_h6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi                ; rsi = 0, compared against the first coefficients below
+    shl         rdx, 4      ; index * 16 bytes per table entry
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+    movdqa      xmm7, [GLOBAL(rd)]      ; rounding term, shared by both loops
+
+    cmp         esi, DWORD PTR [rax]    ; first 4 bytes of the k0_k5 entry all zero?
+    je          .vp8_filter_block1d4_h4_ssse3   ; yes: take the path that skips the k0_k5 multiply
+
+    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    mov         rdi, arg(2)             ;output_ptr
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+;xmm3 free
+.filter_block1d4_h6_rowloop_ssse3:
+    movdqu      xmm0,   XMMWORD PTR [rsi - 2]   ; one unaligned 16-byte load supplies every tap for this row
+
+    movdqa      xmm1, xmm0
+    pshufb      xmm0, [GLOBAL(shuf1b)]          ; byte arrangement for the k0_k5 multiply (table not shown here)
+
+    movdqa      xmm2, xmm1
+    pshufb      xmm1, [GLOBAL(shuf2b)]          ; byte arrangement for the k2_k4 multiply
+    pmaddubsw   xmm0, xmm4
+    pshufb      xmm2, [GLOBAL(shuf3b)]          ; byte arrangement for the k1_k3 multiply
+    pmaddubsw   xmm1, xmm5
+
+;--
+    pmaddubsw   xmm2, xmm6
+
+    lea         rsi,    [rsi + rax]             ; next source row
+;--
+    paddsw      xmm0, xmm1
+    paddsw      xmm0, xmm7                      ; add the rounding term
+    pxor        xmm1, xmm1                      ; NOTE(review): result is never read; xmm1 is reloaded at the top of the loop
+    paddsw      xmm0, xmm2
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    movd        DWORD PTR [rdi], xmm0           ; store 4 output bytes
+
+    add         rdi, rdx
+    dec         rcx
+    jnz         .filter_block1d4_h6_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+    ; 4-tap path: only the k2_k4 and k1_k3 products are accumulated
+.vp8_filter_block1d4_h4_ssse3:
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]   ; shuffle masks held in registers for this loop
+    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
+
+    mov         rsi, arg(0)             ;src_ptr
+    mov         rdi, arg(2)             ;output_ptr
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+.filter_block1d4_h4_rowloop_ssse3:
+    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
+
+    movdqa      xmm2, xmm1
+    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
+    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
+    pmaddubsw   xmm1, xmm5
+
+;--
+    pmaddubsw   xmm2, xmm6
+
+    lea         rsi,    [rsi + rax]
+;--
+    paddsw      xmm1, xmm7                      ; xmm7 still holds the rounding term loaded before the branch
+    paddsw      xmm1, xmm2
+    psraw       xmm1, 7
+    packuswb    xmm1, xmm1
+
+    movd        DWORD PTR [rdi], xmm1
+
+    add         rdi, rdx
+    dec         rcx
+    jnz         .filter_block1d4_h4_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;void vp8_filter_block1d16_v6_ssse3     -- vertical 6-tap filter (4-tap shortcut), 16 output bytes per row
+;(
+;    unsigned char *src_ptr,            first of the six source rows (A..F) combined into the first output row
+;    unsigned int   src_pitch,          source row stride in bytes
+;    unsigned char *output_ptr,         destination; the 4-tap path stores with movdqa (16-byte alignment required)
+;    unsigned int   out_pitch,          destination row stride in bytes
+;    unsigned int   output_height,      number of rows to produce (loop counter)
+;    unsigned int   vp8_filter_index    index into the 16-byte-per-entry coefficient tables
+;)
+global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d16_v6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi                ; rsi = 0, compared against the first coefficients below
+    shl         rdx, 4      ; index * 16 bytes per table entry
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    cmp         esi, DWORD PTR [rax]    ; first 4 bytes of the k0_k5 entry all zero?
+    je          .vp8_filter_block1d16_v4_ssse3  ; yes: take the path that skips rows A and F
+
+    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)   ;output_height
+    add         rax, rdx                ; rax = rsi + pitch, so [rax + rdx*2] / [rax + rdx*4] reach rows 3 and 5
+
+    ; per iteration: columns 0-7 first, then columns 8-15 (the "+ 8" loads), each stored with its own movq
+.vp8_filter_block1d16_v6_ssse3_loop:
+    movq        xmm1, MMWORD PTR [rsi]                  ;A
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
+
+    pmaddubsw   xmm3, xmm6                  ; (C, E) pairs times k2_k4
+    punpcklbw   xmm1, xmm0                  ;A F
+    pmaddubsw   xmm2, xmm7                  ; (B, D) pairs times k1_k3
+    pmaddubsw   xmm1, xmm5                  ; (A, F) pairs times k0_k5
+
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm1
+    paddsw      xmm2, [GLOBAL(rd)]          ; add the rounding term
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi], xmm2          ;store the results
+
+    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
+    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
+    pmaddubsw   xmm3, xmm6
+    punpcklbw   xmm1, xmm0                  ;A F
+    pmaddubsw   xmm2, xmm7
+    pmaddubsw   xmm1, xmm5
+
+    add         rsi,  rdx                   ; slide the six-row window down one source row
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm1
+    paddsw      xmm2, [GLOBAL(rd)]
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi+8], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;out_pitch
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp8_filter_block1d16_v6_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+    ; 4-tap path: rows B..E only; xmm5 is not holding k0_k5 here and is used as scratch for the second half
+.vp8_filter_block1d16_v4_ssse3:
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)   ;output_height
+    add         rax, rdx                ; rax = rsi + pitch, so [rax + rdx*2] reaches row 3
+
+.vp8_filter_block1d16_v4_ssse3_loop:
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    pmaddubsw   xmm3, xmm6
+    pmaddubsw   xmm2, xmm7
+    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
+    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
+
+    paddsw      xmm2, [GLOBAL(rd)]          ; columns 0-7: add the rounding term
+    paddsw      xmm2, xmm3
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    punpcklbw   xmm5, xmm4                  ;B D
+    punpcklbw   xmm1, xmm0                  ;C E
+
+    pmaddubsw   xmm1, xmm6
+    pmaddubsw   xmm5, xmm7
+
+    movdqa      xmm4, [GLOBAL(rd)]          ; rounding term for columns 8-15
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm5, xmm1
+    paddsw      xmm5, xmm4
+    psraw       xmm5, 7
+    packuswb    xmm5, xmm5
+
+    punpcklqdq  xmm2, xmm5                  ; low qword = columns 0-7, high qword = columns 8-15
+
+    movdqa       XMMWORD PTR [rdi], xmm2    ; aligned 16-byte store (the 6-tap loop uses two movq stores instead)
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;out_pitch
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp8_filter_block1d16_v4_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_filter_block1d8_v6_ssse3      -- vertical 6-tap filter (4-tap shortcut), 8 output bytes per row
+;(
+;    unsigned char *src_ptr,            first of the six source rows (A..F) combined into the first output row
+;    unsigned int   src_pitch,          source row stride in bytes
+;    unsigned char *output_ptr,         destination; 8 bytes are stored per row
+;    unsigned int   out_pitch,          destination row stride in bytes
+;    unsigned int   output_height,      number of rows to produce (loop counter)
+;    unsigned int   vp8_filter_index    index into the 16-byte-per-entry coefficient tables
+;)
+global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d8_v6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi                ; rsi = 0, compared against the first coefficients below
+    shl         rdx, 4      ; index * 16 bytes per table entry
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
+%endif
+    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
+
+    cmp         esi, DWORD PTR [rax]    ; first 4 bytes of the k0_k5 entry all zero?
+    je          .vp8_filter_block1d8_v4_ssse3   ; yes: take the path that skips rows A and F
+
+    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx                ; rax = rsi + pitch, so [rax + rdx*2] / [rax + rdx*4] reach rows 3 and 5
+
+.vp8_filter_block1d8_v6_ssse3_loop:
+    movq        xmm1, MMWORD PTR [rsi]                  ;A
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
+    movdqa      xmm4, [GLOBAL(rd)]          ; rounding term, reloaded each iteration because xmm4 doubles as row D
+
+    pmaddubsw   xmm3, xmm6                  ; (C, E) pairs times k2_k4
+    punpcklbw   xmm1, xmm0                  ;A F
+    pmaddubsw   xmm2, xmm7                  ; (B, D) pairs times k1_k3
+    pmaddubsw   xmm1, xmm5                  ; (A, F) pairs times k0_k5
+    add         rsi,  rdx                   ; slide the six-row window down one source row
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm1
+    paddsw      xmm2, xmm4
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi], xmm2      ; store 8 output bytes
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp8_filter_block1d8_v6_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+    ; 4-tap path: rdx, rdi, rcx (and r8 on 64-bit) were already loaded before the branch
+.vp8_filter_block1d8_v4_ssse3:
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+    movdqa      xmm5, [GLOBAL(rd)]              ; rounding term hoisted out of the loop (xmm5 holds no coefficients here)
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx                ; rax = rsi + pitch, so [rax + rdx*2] reaches row 3
+
+.vp8_filter_block1d8_v4_ssse3_loop:
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    pmaddubsw   xmm3, xmm6
+    pmaddubsw   xmm2, xmm7
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm5
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp8_filter_block1d8_v4_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;void vp8_filter_block1d4_v6_ssse3      -- vertical 6-tap filter (4-tap shortcut), 4 output bytes per row, MMX registers
+;(
+;    unsigned char *src_ptr,            first of the six source rows (A..F) combined into the first output row
+;    unsigned int   src_pitch,          source row stride in bytes
+;    unsigned char *output_ptr,         destination; 4 bytes are stored per row
+;    unsigned int   out_pitch,          destination row stride in bytes
+;    unsigned int   output_height,      number of rows to produce (loop counter)
+;    unsigned int   vp8_filter_index    index into the 16-byte-per-entry coefficient tables
+;)
+global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d4_v6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; NOTE(review): mm0-mm7 are used and no emms is issued in this routine; presumably callers clear MMX state -- confirm
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi                ; rsi = 0, compared against the first coefficients below
+    shl         rdx, 4      ; index * 16 bytes per table entry
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
+%endif
+    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
+
+    cmp         esi, DWORD PTR [rax]    ; first 4 bytes of the k0_k5 entry all zero?
+    je          .vp8_filter_block1d4_v4_ssse3   ; yes: take the path that skips rows A and F
+
+    movq        mm5, MMWORD PTR [rax]         ;k0_k5
+    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
+    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx                ; rax = rsi + pitch, so [rax + rdx*2] / [rax + rdx*4] reach rows 3 and 5
+
+.vp8_filter_block1d4_v6_ssse3_loop:
+    movd        mm1, DWORD PTR [rsi]                  ;A
+    movd        mm2, DWORD PTR [rsi + rdx]            ;B
+    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
+    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
+    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   mm2, mm4                  ;B D
+    punpcklbw   mm3, mm0                  ;C E
+
+    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
+
+    movq        mm4, [GLOBAL(rd)]         ; low 8 bytes of the rounding term, reloaded because mm4 doubles as row D
+
+    pmaddubsw   mm3, mm6                  ; (C, E) pairs times k2_k4
+    punpcklbw   mm1, mm0                  ;A F
+    pmaddubsw   mm2, mm7                  ; (B, D) pairs times k1_k3
+    pmaddubsw   mm1, mm5                  ; (A, F) pairs times k0_k5
+    add         rsi,  rdx                 ; slide the six-row window down one source row
+    add         rax,  rdx
+;--
+;--
+    paddsw      mm2, mm3
+    paddsw      mm2, mm1
+    paddsw      mm2, mm4
+    psraw       mm2, 7
+    packuswb    mm2, mm2
+
+    movd        DWORD PTR [rdi], mm2      ; store 4 output bytes
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp8_filter_block1d4_v6_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+    ; 4-tap path: rdx, rdi, rcx (and r8 on 64-bit) were already loaded before the branch
+.vp8_filter_block1d4_v4_ssse3:
+    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
+    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
+    movq        mm5, MMWORD PTR [GLOBAL(rd)]  ; rounding term hoisted out of the loop (mm5 holds no coefficients here)
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx                ; rax = rsi + pitch, so [rax + rdx*2] reaches row 3
+
+.vp8_filter_block1d4_v4_ssse3_loop:
+    movd        mm2, DWORD PTR [rsi + rdx]            ;B
+    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
+    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
+    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   mm2, mm4                  ;B D
+    punpcklbw   mm3, mm0                  ;C E
+
+    pmaddubsw   mm3, mm6
+    pmaddubsw   mm2, mm7
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      mm2, mm3
+    paddsw      mm2, mm5
+    psraw       mm2, 7
+    packuswb    mm2, mm2
+
+    movd        DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp8_filter_block1d4_v4_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_bilinear_predict16x16_ssse3
+;(
+;    unsigned char  *src_ptr,          source block; up to 17 rows x 17 bytes are read
+;    int   src_pixels_per_line,        source stride in bytes
+;    int  xoffset,                     horizontal filter index; 0 skips the horizontal pass
+;    int  yoffset,                     vertical filter index; 0 skips the vertical pass (tested only when xoffset != 0)
+;    unsigned char *dst_ptr,           16x16 output, stored with movdqa (needs 16-byte alignment)
+;    int dst_pitch                     output stride in bytes
+;)
+global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
+sym(vp8_bilinear_predict16x16_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+        ; rcx = filter table base; each entry is 16 bytes, hence the shl by 4 below
+        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
+        movsxd      rax,        dword ptr arg(2)    ; xoffset
+
+        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
+        je          .b16x16_sp_only
+
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; HFilter
+
+        mov         rdi,        arg(4)              ; dst_ptr
+        mov         rsi,        arg(0)              ; src_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm1,       [rax]
+
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+
+        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
+        je          .b16x16_fp_only
+
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+        ; rcx = dst_ptr + 16 * dst_pitch, the end marker for the row loop
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
+
+        movdqa      xmm2,       [rax]
+        ; rdx now holds the source stride, so dst_pitch is re-read (r8 on 64-bit, arg(5) on 32-bit)
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5)    ; dst_pitch
+%endif
+        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
+
+        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+
+        lea         rsi,        [rsi + rdx]         ; next line
+
+        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14
+
+        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
+        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
+        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        ; xmm7 carries the previous horizontally filtered row into each loop iteration
+.next_row:
+        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
+
+        punpcklbw   xmm6,       xmm5
+        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+        lea         rsi,        [rsi + rdx]         ; next line
+
+        pmaddubsw   xmm6,       xmm1
+
+        punpcklbw   xmm4,       xmm5
+        pmaddubsw   xmm4,       xmm1
+
+        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
+        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
+        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
+
+        packuswb    xmm6,       xmm4
+        movdqa      xmm5,       xmm7
+        ; interleave the previous row (xmm7) with the current row (xmm6) for the vertical taps
+        punpcklbw   xmm5,       xmm6
+        pmaddubsw   xmm5,       xmm2
+
+        punpckhbw   xmm7,       xmm6
+        pmaddubsw   xmm7,       xmm2
+
+        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
+        psraw       xmm5,       VP8_FILTER_SHIFT    ; xmm5 /= 128
+
+        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
+        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
+
+        packuswb    xmm5,       xmm7
+        movdqa      xmm7,       xmm6
+
+        movdqa      [rdi],      xmm5                ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(5)    ; dst_pitch
+%else
+        add         rdi,        r8
+%endif
+
+        cmp         rdi,        rcx
+        jne         .next_row
+
+        jmp         .done
+        ; xoffset == 0: vertical pass only, two output rows per loop iteration
+.b16x16_sp_only:
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        mov         rdi,        arg(4)              ; dst_ptr
+        mov         rsi,        arg(0)              ; src_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm1,       [rax]               ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
+
+        ; load row 0 (this path applies no horizontal filter)
+        movq        xmm4,       [rsi]               ; load row 0
+        movq        xmm2,       [rsi + 8]           ; load row 0
+
+        lea         rsi,        [rsi + rax]         ; next line
+.next_row_sp:
+        movq        xmm3,       [rsi]               ; load row + 1
+        movq        xmm5,       [rsi + 8]           ; load row + 1
+
+        punpcklbw   xmm4,       xmm3
+        punpcklbw   xmm2,       xmm5
+
+        pmaddubsw   xmm4,       xmm1
+        movq        xmm7,       [rsi + rax]         ; load row + 2
+
+        pmaddubsw   xmm2,       xmm1
+        movq        xmm6,       [rsi + rax + 8]     ; load row + 2
+
+        punpcklbw   xmm3,       xmm7
+        punpcklbw   xmm5,       xmm6
+
+        pmaddubsw   xmm3,       xmm1
+        paddw       xmm4,       [GLOBAL(rd)]
+
+        pmaddubsw   xmm5,       xmm1
+        paddw       xmm2,       [GLOBAL(rd)]
+
+        psraw       xmm4,       VP8_FILTER_SHIFT
+        psraw       xmm2,       VP8_FILTER_SHIFT
+
+        packuswb    xmm4,       xmm2
+        paddw       xmm3,       [GLOBAL(rd)]
+
+        movdqa      [rdi],      xmm4                ; store row 0
+        paddw       xmm5,       [GLOBAL(rd)]
+
+        psraw       xmm3,       VP8_FILTER_SHIFT
+        psraw       xmm5,       VP8_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm5
+        movdqa      xmm4,       xmm7
+
+        movdqa      [rdi + rdx],xmm3                ; store row 1
+        lea         rsi,        [rsi + 2*rax]
+
+        movdqa      xmm2,       xmm6
+        lea         rdi,        [rdi + 2*rdx]
+
+        cmp         rdi,        rcx
+        jne         .next_row_sp
+
+        jmp         .done
+        ; yoffset == 0: horizontal pass only; rdi, rsi, rdx and xmm1 (HFilter) were set up above
+.b16x16_fp_only:
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
+        ; each iteration filters and stores two rows
+.next_row_fp:
+        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
+
+        punpcklbw   xmm2,       xmm4
+        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        pmaddubsw   xmm2,       xmm1
+        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+
+        lea         rsi,        [rsi + rax]         ; next line
+        punpcklbw   xmm3,       xmm4
+
+        pmaddubsw   xmm3,       xmm1
+        movq        xmm5,       [rsi]
+
+        paddw       xmm2,       [GLOBAL(rd)]
+        movq        xmm7,       [rsi+1]
+
+        movq        xmm6,       [rsi+8]
+        psraw       xmm2,       VP8_FILTER_SHIFT
+
+        punpcklbw   xmm5,       xmm7
+        movq        xmm7,       [rsi+9]
+
+        paddw       xmm3,       [GLOBAL(rd)]
+        pmaddubsw   xmm5,       xmm1
+
+        psraw       xmm3,       VP8_FILTER_SHIFT
+        punpcklbw   xmm6,       xmm7
+
+        packuswb    xmm2,       xmm3
+        pmaddubsw   xmm6,       xmm1
+
+        movdqa      [rdi],      xmm2                ; store the results in the destination
+        paddw       xmm5,       [GLOBAL(rd)]
+
+        lea         rdi,        [rdi + rdx]         ; dst_pitch
+        psraw       xmm5,       VP8_FILTER_SHIFT
+
+        paddw       xmm6,       [GLOBAL(rd)]
+        psraw       xmm6,       VP8_FILTER_SHIFT
+
+        packuswb    xmm5,       xmm6
+        lea         rsi,        [rsi + rax]         ; next line
+
+        movdqa      [rdi],      xmm5                ; store the results in the destination
+        lea         rdi,        [rdi + rdx]         ; dst_pitch
+
+        cmp         rdi,        rcx
+
+        jne         .next_row_fp
+
+.done:
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_bilinear_predict8x8_ssse3
+;(
+;    unsigned char  *src_ptr,          source block; 16 bytes are read from each of 9 rows
+;    int   src_pixels_per_line,        source stride in bytes
+;    int  xoffset,                     horizontal filter index; 0 skips the horizontal pass
+;    int  yoffset,                     vertical filter index; 0 skips the vertical pass (tested only when xoffset != 0)
+;    unsigned char *dst_ptr,           8x8 output, stored 8 bytes at a time with movq
+;    int dst_pitch                     output stride in bytes
+;)
+global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
+sym(vp8_bilinear_predict8x8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; scratch area for the source rows: 9 rows x 16 bytes = 144 bytes, accessed with movdqa
+    ALIGN_STACK 16, rax
+    sub         rsp, 144                         ; reserve 144 bytes
+
+        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+    ;Read 9-line unaligned data in and put them on stack. This gives a big
+    ;performance boost.
+        movdqu      xmm0,       [rsi]
+        lea         rax,        [rdx + rdx*2]
+        movdqu      xmm1,       [rsi+rdx]
+        movdqu      xmm2,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm3,       [rsi]
+        movdqu      xmm4,       [rsi+rdx]
+        movdqu      xmm5,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm6,       [rsi]
+        movdqu      xmm7,       [rsi+rdx]
+
+        movdqa      XMMWORD PTR [rsp],            xmm0
+
+        movdqu      xmm0,       [rsi+rdx*2]
+
+        movdqa      XMMWORD PTR [rsp+16],         xmm1
+        movdqa      XMMWORD PTR [rsp+32],         xmm2
+        movdqa      XMMWORD PTR [rsp+48],         xmm3
+        movdqa      XMMWORD PTR [rsp+64],         xmm4
+        movdqa      XMMWORD PTR [rsp+80],         xmm5
+        movdqa      XMMWORD PTR [rsp+96],         xmm6
+        movdqa      XMMWORD PTR [rsp+112],        xmm7
+        movdqa      XMMWORD PTR [rsp+128],        xmm0
+
+        movsxd      rax,        dword ptr arg(2)    ; xoffset
+        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
+        je          .b8x8_sp_only
+
+        shl         rax,        4
+        add         rax,        rcx                 ; HFilter
+
+        mov         rdi,        arg(4)              ; dst_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm0,       [rax]
+
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
+        je          .b8x8_fp_only
+
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+
+        movdqa      xmm1,       [rax]
+
+        ; get the first horizontal line done
+        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        movdqa      xmm5,       xmm3                ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
+
+        psrldq      xmm5,       1
+        lea         rsp,        [rsp + 16]          ; next line
+
+        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+        pmaddubsw   xmm3,       xmm0                ; 00 02 04 06 08 10 12 14
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07 (same 8 bytes in both halves)
+        ; rsp is the row cursor: it advances 16 bytes per row, 144 bytes in total over this path
+.next_row:
+        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        lea         rsp,        [rsp + 16]          ; next line
+
+        movdqa      xmm5,       xmm6
+
+        psrldq      xmm5,       1
+
+        punpcklbw   xmm6,       xmm5
+        pmaddubsw   xmm6,       xmm0
+
+        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
+        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
+
+        packuswb    xmm6,       xmm6
+
+        punpcklbw   xmm7,       xmm6
+        pmaddubsw   xmm7,       xmm1
+
+        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
+        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
+
+        packuswb    xmm7,       xmm7
+
+        movq        [rdi],      xmm7                ; store the results in the destination
+        lea         rdi,        [rdi + rdx]
+
+        movdqa      xmm7,       xmm6
+
+        cmp         rdi,        rcx
+        jne         .next_row
+
+        jmp         .done8x8
+        ; xoffset == 0: vertical pass only, fully unrolled over the 9 buffered rows
+.b8x8_sp_only:
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm0,       [rax]               ; VFilter
+
+        movq        xmm1,       XMMWORD PTR [rsp]
+        movq        xmm2,       XMMWORD PTR [rsp+16]
+
+        movq        xmm3,       XMMWORD PTR [rsp+32]
+        punpcklbw   xmm1,       xmm2
+
+        movq        xmm4,       XMMWORD PTR [rsp+48]
+        punpcklbw   xmm2,       xmm3
+
+        movq        xmm5,       XMMWORD PTR [rsp+64]
+        punpcklbw   xmm3,       xmm4
+
+        movq        xmm6,       XMMWORD PTR [rsp+80]
+        punpcklbw   xmm4,       xmm5
+
+        movq        xmm7,       XMMWORD PTR [rsp+96]
+        punpcklbw   xmm5,       xmm6
+
+        pmaddubsw   xmm1,       xmm0
+        pmaddubsw   xmm2,       xmm0
+
+        pmaddubsw   xmm3,       xmm0
+        pmaddubsw   xmm4,       xmm0
+
+        pmaddubsw   xmm5,       xmm0
+        punpcklbw   xmm6,       xmm7
+
+        pmaddubsw   xmm6,       xmm0
+        paddw       xmm1,       [GLOBAL(rd)]
+
+        paddw       xmm2,       [GLOBAL(rd)]
+        psraw       xmm1,       VP8_FILTER_SHIFT
+
+        paddw       xmm3,       [GLOBAL(rd)]
+        psraw       xmm2,       VP8_FILTER_SHIFT
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm3,       VP8_FILTER_SHIFT
+
+        paddw       xmm5,       [GLOBAL(rd)]
+        psraw       xmm4,       VP8_FILTER_SHIFT
+
+        paddw       xmm6,       [GLOBAL(rd)]
+        psraw       xmm5,       VP8_FILTER_SHIFT
+
+        psraw       xmm6,       VP8_FILTER_SHIFT
+        packuswb    xmm1,       xmm1
+
+        packuswb    xmm2,       xmm2
+        movq        [rdi],      xmm1
+
+        packuswb    xmm3,       xmm3
+        movq        [rdi+rdx],  xmm2
+
+        packuswb    xmm4,       xmm4
+        movq        xmm1,       XMMWORD PTR [rsp+112]
+
+        lea         rdi,        [rdi + 2*rdx]
+        movq        xmm2,       XMMWORD PTR [rsp+128]
+
+        packuswb    xmm5,       xmm5
+        movq        [rdi],      xmm3
+
+        packuswb    xmm6,       xmm6
+        movq        [rdi+rdx],  xmm4
+
+        lea         rdi,        [rdi + 2*rdx]
+        punpcklbw   xmm7,       xmm1
+
+        movq        [rdi],      xmm5
+        pmaddubsw   xmm7,       xmm0
+
+        movq        [rdi+rdx],  xmm6
+        punpcklbw   xmm1,       xmm2
+
+        pmaddubsw   xmm1,       xmm0
+        paddw       xmm7,       [GLOBAL(rd)]
+
+        psraw       xmm7,       VP8_FILTER_SHIFT
+        paddw       xmm1,       [GLOBAL(rd)]
+
+        psraw       xmm1,       VP8_FILTER_SHIFT
+        packuswb    xmm7,       xmm7
+
+        packuswb    xmm1,       xmm1
+        lea         rdi,        [rdi + 2*rdx]
+
+        movq        [rdi],      xmm7
+
+        movq        [rdi+rdx],  xmm1
+        lea         rsp,        [rsp + 144]         ; step past the 144-byte row buffer
+
+        jmp         .done8x8
+        ; yoffset == 0: horizontal pass only, four rows per iteration (two iterations)
+.b8x8_fp_only:
+        lea         rcx,        [rdi+rdx*8]
+
+.next_row_fp:
+        movdqa      xmm1,       XMMWORD PTR [rsp]
+        movdqa      xmm3,       XMMWORD PTR [rsp+16]
+
+        movdqa      xmm2,       xmm1
+        movdqa      xmm5,       XMMWORD PTR [rsp+32]
+
+        psrldq      xmm2,       1
+        movdqa      xmm7,       XMMWORD PTR [rsp+48]
+
+        movdqa      xmm4,       xmm3
+        psrldq      xmm4,       1
+
+        movdqa      xmm6,       xmm5
+        psrldq      xmm6,       1
+
+        punpcklbw   xmm1,       xmm2
+        pmaddubsw   xmm1,       xmm0
+
+        punpcklbw   xmm3,       xmm4
+        pmaddubsw   xmm3,       xmm0
+
+        punpcklbw   xmm5,       xmm6
+        pmaddubsw   xmm5,       xmm0
+
+        movdqa      xmm2,       xmm7
+        psrldq      xmm2,       1
+
+        punpcklbw   xmm7,       xmm2
+        pmaddubsw   xmm7,       xmm0
+
+        paddw       xmm1,       [GLOBAL(rd)]
+        psraw       xmm1,       VP8_FILTER_SHIFT
+
+        paddw       xmm3,       [GLOBAL(rd)]
+        psraw       xmm3,       VP8_FILTER_SHIFT
+
+        paddw       xmm5,       [GLOBAL(rd)]
+        psraw       xmm5,       VP8_FILTER_SHIFT
+
+        paddw       xmm7,       [GLOBAL(rd)]
+        psraw       xmm7,       VP8_FILTER_SHIFT
+
+        packuswb    xmm1,       xmm1
+        packuswb    xmm3,       xmm3
+
+        packuswb    xmm5,       xmm5
+        movq        [rdi],      xmm1
+
+        packuswb    xmm7,       xmm7
+        movq        [rdi+rdx],  xmm3
+
+        lea         rdi,        [rdi + 2*rdx]
+        movq        [rdi],      xmm5
+
+        lea         rsp,        [rsp + 4*16]
+        movq        [rdi+rdx],  xmm7
+
+        lea         rdi,        [rdi + 2*rdx]
+        cmp         rdi,        rcx
+
+        jne         .next_row_fp
+
+        lea         rsp,        [rsp + 16]          ; skip the unused 9th row: 2 * 64 + 16 = 144
+
+.done8x8:
+    ; every path above has already advanced rsp by the full 144 bytes, so no "add rsp, 144" here
+    pop         rsp                                 ; presumably the pre-alignment rsp saved by ALIGN_STACK (macro not shown here)
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+shuf1b:                             ; byte-index tables; presumably pshufb masks for filters defined earlier in this file
+    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
+shuf2b:
+    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
+shuf3b:
+    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
+
+align 16
+shuf2bfrom1:
+    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
+align 16
+shuf3bfrom1:
+    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
+
+align 16
+rd:                                 ; rounding term: 0x40 (64) in each of 8 words, added before the shift right by 7
+    times 8 dw 0x40
+
+align 16
+k0_k5:                              ; byte pairs repeated 8 times: one 16-byte row per filter index, 8 rows = 128 bytes
+    times 8 db 0, 0             ;placeholder
+    times 8 db 0, 0
+    times 8 db 2, 1
+    times 8 db 0, 0
+    times 8 db 3, 3
+    times 8 db 0, 0
+    times 8 db 1, 2
+    times 8 db 0, 0
+k1_k3:                              ; must stay 128 bytes after k0_k5: the 4-wide vertical filter loads it at [base+128]
+    times 8 db  0,    0         ;placeholder
+    times 8 db  -6,  12
+    times 8 db -11,  36
+    times 8 db  -9,  50
+    times 8 db -16,  77
+    times 8 db  -6,  93
+    times 8 db  -8, 108
+    times 8 db  -1, 123
+k2_k4:                              ; must stay 256 bytes after k0_k5: loaded at [base+256]
+    times 8 db 128,    0        ;placeholder
+    times 8 db 123,   -1
+    times 8 db 108,   -8
+    times 8 db  93,   -6
+    times 8 db  77,  -16
+    times 8 db  50,   -9
+    times 8 db  36,  -11
+    times 8 db  12,   -6
+align 16
+vp8_bilinear_filters_ssse3:         ; tap pairs (128 - 16*i, 16*i) for i = 0..7; 16 bytes per entry, selected by offset << 4
+    times 8 db 128, 0
+    times 8 db 112, 16
+    times 8 db 96,  32
+    times 8 db 80,  48
+    times 8 db 64,  64
+    times 8 db 48,  80
+    times 8 db 32,  96
+    times 8 db 16,  112
+
diff --git a/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c b/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c
new file mode 100644
index 0000000000..fb0b57eb1c
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c
@@ -0,0 +1,625 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "filter_x86.h"
+
+extern const short vp8_six_tap_mmx[8][6*8];  /* rows are selected by xoffset / yoffset in the wrappers below */
+/* Prototypes for the filter kernels called by the MMX and SSE2 wrappers in this file. */
+extern void vp8_filter_block1d_h6_mmx
+(
+    unsigned char   *src_ptr,
+    unsigned short  *output_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned int    pixel_step,
+    unsigned int    output_height,
+    unsigned int    output_width,
+    const short      *vp8_filter
+);
+extern void vp8_filter_block1dc_v6_mmx
+(
+    unsigned short *src_ptr,
+    unsigned char  *output_ptr,
+    int             output_pitch,
+    unsigned int    pixels_per_line,
+    unsigned int    pixel_step,
+    unsigned int    output_height,
+    unsigned int    output_width,
+    const short    *vp8_filter
+);
+extern void vp8_filter_block1d8_h6_sse2
+(
+    unsigned char  *src_ptr,
+    unsigned short *output_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned int    pixel_step,
+    unsigned int    output_height,
+    unsigned int    output_width,
+    const short    *vp8_filter
+);
+extern void vp8_filter_block1d16_h6_sse2
+(
+    unsigned char  *src_ptr,
+    unsigned short *output_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned int    pixel_step,
+    unsigned int    output_height,
+    unsigned int    output_width,
+    const short    *vp8_filter
+);
+extern void vp8_filter_block1d8_v6_sse2
+(
+    unsigned short *src_ptr,
+    unsigned char *output_ptr,
+    int dst_pitch,
+    unsigned int pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const short    *vp8_filter
+);
+extern void vp8_filter_block1d16_v6_sse2
+(
+    unsigned short *src_ptr,
+    unsigned char *output_ptr,
+    int dst_pitch,
+    unsigned int pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const short    *vp8_filter
+);
+extern void vp8_unpack_block1d16_h6_sse2  /* takes no filter argument */
+(
+    unsigned char  *src_ptr,
+    unsigned short *output_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned int    output_height,
+    unsigned int    output_width
+);
+extern void vp8_filter_block1d8_h6_only_sse2  /* the *_only kernels write 8-bit output directly */
+(
+    unsigned char  *src_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned char  *output_ptr,
+    int dst_pitch,
+    unsigned int    output_height,
+    const short    *vp8_filter
+);
+extern void vp8_filter_block1d16_h6_only_sse2
+(
+    unsigned char  *src_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned char  *output_ptr,
+    int dst_pitch,
+    unsigned int    output_height,
+    const short    *vp8_filter
+);
+extern void vp8_filter_block1d8_v6_only_sse2
+(
+    unsigned char *src_ptr,
+    unsigned int   src_pixels_per_line,
+    unsigned char *output_ptr,
+    int dst_pitch,
+    unsigned int   output_height,
+    const short   *vp8_filter
+);
+
+
+#if HAVE_MMX
+void vp8_sixtap_predict4x4_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    DECLARE_ALIGNED(16, unsigned short, tmp[16*16]);  /* scratch rows passed between the two filter passes */
+    const short *h_taps = vp8_six_tap_mmx[xoffset];
+    const short *v_taps = vp8_six_tap_mmx[yoffset];
+
+    /* Horizontal pass starts two rows above the block; the vertical pass then reads from tmp + 8. */
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), tmp, src_pixels_per_line, 1, 9, 8, h_taps);
+    vp8_filter_block1dc_v6_mmx(tmp + 8, dst_ptr, dst_pitch, 8, 4, 4, 4, v_taps);
+}
+
+
+void vp8_sixtap_predict16x16_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    DECLARE_ALIGNED(16, unsigned short, tmp[24*24]);  /* scratch rows passed between the two filter passes */
+    const short *h_taps = vp8_six_tap_mmx[xoffset];
+    const short *v_taps = vp8_six_tap_mmx[yoffset];
+    unsigned char *first_row = src_ptr - (2 * src_pixels_per_line);
+    int col;
+
+    /* Horizontal pass: output_height 21, starting two rows above the block,
+     * issued once per column strip at offsets 0, 4, 8 and 12. */
+    for (col = 0; col < 16; col += 4)
+    {
+        vp8_filter_block1d_h6_mmx(first_row + col, tmp + col, src_pixels_per_line, 1, 21, 32, h_taps);
+    }
+
+    /* Vertical pass: reads from tmp + 32 onward and writes output_height 16
+     * to dst_ptr, again once per column strip in the same order. */
+    for (col = 0; col < 16; col += 4)
+    {
+        vp8_filter_block1dc_v6_mmx(tmp + 32 + col, dst_ptr + col, dst_pitch, 32, 16, 16, 16, v_taps);
+    }
+}
+
+
+void vp8_sixtap_predict8x8_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    DECLARE_ALIGNED(16, unsigned short, tmp[256]);    /* scratch rows passed between the two filter passes */
+    const short *h_taps = vp8_six_tap_mmx[xoffset];
+    const short *v_taps = vp8_six_tap_mmx[yoffset];
+    unsigned char *first_row = src_ptr - (2 * src_pixels_per_line);
+
+    /* Horizontal pass (output_height 13, output_width 16), once per column strip at offsets 0 and 4. */
+    vp8_filter_block1d_h6_mmx(first_row,     tmp,     src_pixels_per_line, 1, 13, 16, h_taps);
+    vp8_filter_block1d_h6_mmx(first_row + 4, tmp + 4, src_pixels_per_line, 1, 13, 16, h_taps);
+
+    /* Vertical pass (output_height 8) reading from tmp + 16, once for dst_ptr and once for dst_ptr + 4. */
+    vp8_filter_block1dc_v6_mmx(tmp + 16, dst_ptr,     dst_pitch, 16, 8, 8, 8, v_taps);
+    vp8_filter_block1dc_v6_mmx(tmp + 20, dst_ptr + 4, dst_pitch, 16, 8, 8, 8, v_taps);
+
+}
+
+
+void vp8_sixtap_predict8x4_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    DECLARE_ALIGNED(16, unsigned short, tmp[256]);    /* scratch rows passed between the two filter passes */
+    const short *h_taps = vp8_six_tap_mmx[xoffset];
+    const short *v_taps = vp8_six_tap_mmx[yoffset];
+    unsigned char *first_row = src_ptr - (2 * src_pixels_per_line);
+
+    /* Horizontal pass (output_height 9, output_width 16), once per column strip at offsets 0 and 4. */
+    vp8_filter_block1d_h6_mmx(first_row,     tmp,     src_pixels_per_line, 1, 9, 16, h_taps);
+    vp8_filter_block1d_h6_mmx(first_row + 4, tmp + 4, src_pixels_per_line, 1, 9, 16, h_taps);
+
+    /* Vertical pass (output_height 4) reading from tmp + 16, once for dst_ptr and once for dst_ptr + 4. */
+    vp8_filter_block1dc_v6_mmx(tmp + 16, dst_ptr,     dst_pitch, 16, 8, 4, 8, v_taps);
+    vp8_filter_block1dc_v6_mmx(tmp + 20, dst_ptr + 4, dst_pitch, 16, 8, 4, 8, v_taps);
+
+}
+
+
+
+void vp8_bilinear_predict16x16_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    int quad;  /* 8x8 quadrant: 0 = top-left, 1 = top-right, 2 = bottom-left, 3 = bottom-right */
+    for (quad = 0; quad < 4; quad++)
+        vp8_bilinear_predict8x8_mmx(src_ptr + 8 * (quad >> 1) * src_pixels_per_line + 8 * (quad & 1), src_pixels_per_line,
+                                    xoffset, yoffset, dst_ptr + dst_pitch * 8 * (quad >> 1) + 8 * (quad & 1), dst_pitch);
+}
+#endif
+
+
+#if HAVE_SSE2
+void vp8_sixtap_predict16x16_sse2
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+
+)
+{
+    /* Scratch rows used by every path that runs the vertical pass. */
+    DECLARE_ALIGNED(16, unsigned short, tmp[24*24]);
+    const short *taps;
+
+    if (xoffset && !yoffset)
+    {
+        /* First-pass only: filter horizontally straight into dst_ptr. */
+        taps = vp8_six_tap_mmx[xoffset];
+        vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, taps);
+        return;
+    }
+
+    if (xoffset)
+    {
+        /* Both offsets set: horizontal pass into tmp, starting two rows above the block. */
+        taps = vp8_six_tap_mmx[xoffset];
+        vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), tmp, src_pixels_per_line, 1, 21, 32, taps);
+    }
+    else
+    {
+        /* Second-pass only: unpack the source rows into tmp (no filter argument is passed). */
+        vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), tmp, src_pixels_per_line, 21, 32);
+    }
+
+    /* Vertical pass from tmp + 32 into dst_ptr, shared by the two remaining paths. */
+    taps = vp8_six_tap_mmx[yoffset];
+    vp8_filter_block1d16_v6_sse2(tmp + 32, dst_ptr, dst_pitch, 32, 16, 16, dst_pitch, taps);
+}
+
+
+void vp8_sixtap_predict8x8_sse2
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    /* Dispatch on which offsets are non-zero so that a zero offset skips its filter pass;
+     * only the two-pass case goes through the scratch buffer. */
+    DECLARE_ALIGNED(16, unsigned short, tmp[256]);
+    const short *h_taps = vp8_six_tap_mmx[xoffset];
+    const short *v_taps = vp8_six_tap_mmx[yoffset];
+
+    if (!xoffset)
+    {
+        /* Second-pass only */
+        vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 8, v_taps);
+    }
+    else if (!yoffset)
+    {
+        /* First-pass only */
+        vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, h_taps);
+    }
+    else
+    {
+        /* Both passes: horizontal into tmp, then vertical from tmp + 16 into dst_ptr. */
+        vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), tmp,
+                                    src_pixels_per_line, 1, 13, 16, h_taps);
+        vp8_filter_block1d8_v6_sse2(tmp + 16, dst_ptr, dst_pitch,
+                                    16, 8, 8, dst_pitch, v_taps);
+    }
+}
+
+
+void vp8_sixtap_predict8x4_sse2
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    /* Dispatch on which offsets are non-zero so that a zero offset skips its filter pass;
+     * only the two-pass case goes through the scratch buffer. */
+    DECLARE_ALIGNED(16, unsigned short, tmp[256]);
+    const short *h_taps = vp8_six_tap_mmx[xoffset];
+    const short *v_taps = vp8_six_tap_mmx[yoffset];
+
+    if (!xoffset)
+    {
+        /* Second-pass only */
+        vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 4, v_taps);
+    }
+    else if (!yoffset)
+    {
+        /* First-pass only */
+        vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, h_taps);
+    }
+    else
+    {
+        /* Both passes: horizontal into tmp, then vertical from tmp + 16 into dst_ptr. */
+        vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), tmp,
+                                    src_pixels_per_line, 1, 9, 16, h_taps);
+        vp8_filter_block1d8_v6_sse2(tmp + 16, dst_ptr, dst_pitch,
+                                    16, 8, 4, dst_pitch, v_taps);
+    }
+}
+
+#endif
+
+#if HAVE_SSSE3
+
+/* SSSE3 kernel, defined outside this file. Used below as the first
+ * (xoffset) pass of the 8x8 and 8x4 predictors. */
+extern void vp8_filter_block1d8_h6_ssse3(
+    unsigned char *src_ptr,
+    unsigned int   src_pixels_per_line,
+    unsigned char *output_ptr,
+    unsigned int   output_pitch,
+    unsigned int   output_height,
+    unsigned int   vp8_filter_index);
+
+/* SSSE3 kernel, defined outside this file. Used below as the first
+ * (xoffset) pass of the 16x16 predictor. */
+extern void vp8_filter_block1d16_h6_ssse3(
+    unsigned char *src_ptr,
+    unsigned int   src_pixels_per_line,
+    unsigned char *output_ptr,
+    unsigned int   output_pitch,
+    unsigned int   output_height,
+    unsigned int   vp8_filter_index);
+
+/* SSSE3 kernel, defined outside this file. Used below as the second
+ * (yoffset) pass of the 16x16 predictor. */
+extern void vp8_filter_block1d16_v6_ssse3(
+    unsigned char *src_ptr,
+    unsigned int   src_pitch,
+    unsigned char *output_ptr,
+    unsigned int   out_pitch,
+    unsigned int   output_height,
+    unsigned int   vp8_filter_index);
+
+/* SSSE3 kernel, defined outside this file. Used below as the second
+ * (yoffset) pass of the 8x8 and 8x4 predictors. */
+extern void vp8_filter_block1d8_v6_ssse3(
+    unsigned char *src_ptr,
+    unsigned int   src_pitch,
+    unsigned char *output_ptr,
+    unsigned int   out_pitch,
+    unsigned int   output_height,
+    unsigned int   vp8_filter_index);
+
+/* SSSE3 kernel, defined outside this file. Used below as the first
+ * (xoffset) pass of the 4x4 predictor. */
+extern void vp8_filter_block1d4_h6_ssse3(
+    unsigned char *src_ptr,
+    unsigned int   src_pixels_per_line,
+    unsigned char *output_ptr,
+    unsigned int   output_pitch,
+    unsigned int   output_height,
+    unsigned int   vp8_filter_index);
+
+/* SSSE3 kernel, defined outside this file. Used below as the second
+ * (yoffset) pass of the 4x4 predictor. */
+extern void vp8_filter_block1d4_v6_ssse3(
+    unsigned char *src_ptr,
+    unsigned int   src_pitch,
+    unsigned char *output_ptr,
+    unsigned int   out_pitch,
+    unsigned int   output_height,
+    unsigned int   vp8_filter_index);
+
+void vp8_sixtap_predict16x16_ssse3
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+
+)
+{
+    /* 16x16 six-tap prediction built from the SSSE3 kernels; xoffset and
+     * yoffset are handed to the kernels as filter indices. */
+    DECLARE_ALIGNED(16, unsigned char, FData2[24*24]);
+    const int filter_h = (xoffset != 0);
+    const int filter_v = (yoffset != 0);
+
+    if (filter_h && filter_v)
+    {
+        /* Both passes: the first pass writes FData2 (pitch 16), starting two
+         * lines above src_ptr; the second pass filters FData2 into dst_ptr. */
+        vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                      src_pixels_per_line, FData2, 16, 21, xoffset);
+        vp8_filter_block1d16_v6_ssse3(FData2, 16,
+                                      dst_ptr, dst_pitch, 16, yoffset);
+        return;
+    }
+
+    if (filter_h)
+    {
+        /* First-pass only */
+        vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
+                                      dst_ptr, dst_pitch, 16, xoffset);
+        return;
+    }
+
+    if (filter_v)
+    {
+        /* Second-pass only */
+        vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                      src_pixels_per_line, dst_ptr, dst_pitch, 16, yoffset);
+        return;
+    }
+
+    /* xoffset == 0 && yoffset == 0: the ssse3 second-pass-only kernel cannot
+     * handle this case correctly, so the block is copied instead. */
+    vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+}
+
+void vp8_sixtap_predict8x8_ssse3
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    /* 8x8 six-tap prediction built from the SSSE3 kernels; xoffset and
+     * yoffset are handed to the kernels as filter indices. */
+    DECLARE_ALIGNED(16, unsigned char, FData2[256]);
+    const int filter_h = (xoffset != 0);
+    const int filter_v = (yoffset != 0);
+
+    if (filter_h && filter_v)
+    {
+        /* Both passes, with FData2 (pitch 8) as the intermediate buffer */
+        vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                     src_pixels_per_line, FData2, 8, 13, xoffset);
+        vp8_filter_block1d8_v6_ssse3(FData2, 8,
+                                     dst_ptr, dst_pitch, 8, yoffset);
+        return;
+    }
+
+    if (filter_h)
+    {
+        /* First-pass only */
+        vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+                                     dst_ptr, dst_pitch, 8, xoffset);
+        return;
+    }
+
+    if (filter_v)
+    {
+        /* Second-pass only */
+        vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                     src_pixels_per_line, dst_ptr, dst_pitch, 8, yoffset);
+        return;
+    }
+
+    /* xoffset == 0 && yoffset == 0: the ssse3 second-pass-only kernel cannot
+     * handle this case correctly, so the block is copied instead. */
+    vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+}
+
+
+void vp8_sixtap_predict8x4_ssse3
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    /* 8x4 six-tap prediction built from the SSSE3 kernels; xoffset and
+     * yoffset are handed to the kernels as filter indices. */
+    DECLARE_ALIGNED(16, unsigned char, FData2[256]);
+    const int filter_h = (xoffset != 0);
+    const int filter_v = (yoffset != 0);
+
+    if (filter_h && filter_v)
+    {
+        /* Both passes: the first pass writes FData2 (pitch 8), starting two
+         * lines above src_ptr; the second pass filters FData2 into dst_ptr. */
+        vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                     src_pixels_per_line, FData2, 8, 9, xoffset);
+        vp8_filter_block1d8_v6_ssse3(FData2, 8,
+                                     dst_ptr, dst_pitch, 4, yoffset);
+        return;
+    }
+
+    if (filter_h)
+    {
+        /* First-pass only */
+        vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+                                     dst_ptr, dst_pitch, 4, xoffset);
+        return;
+    }
+
+    if (filter_v)
+    {
+        /* Second-pass only */
+        vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                     src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
+        return;
+    }
+
+    /* xoffset == 0 && yoffset == 0: the ssse3 second-pass-only kernel cannot
+     * handle this case correctly, so the block is copied instead. */
+    vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+}
+
+void vp8_sixtap_predict4x4_ssse3
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+  /* 4x4 six-tap prediction built from the SSSE3 kernels; xoffset and
+   * yoffset are handed to the kernels as filter indices. */
+  DECLARE_ALIGNED(16, unsigned char, FData2[4*9]);
+  const int filter_h = (xoffset != 0);
+  const int filter_v = (yoffset != 0);
+  int row, col;
+
+  if (filter_h && filter_v)
+  {
+      /* Both passes, with FData2 (pitch 4) as the intermediate buffer */
+      vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                   src_pixels_per_line, FData2, 4, 9, xoffset);
+      vp8_filter_block1d4_v6_ssse3(FData2, 4,
+                                   dst_ptr, dst_pitch, 4, yoffset);
+      return;
+  }
+
+  if (filter_h)
+  {
+      /* First-pass only */
+      vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
+                                   dst_ptr, dst_pitch, 4, xoffset);
+      return;
+  }
+
+  if (filter_v)
+  {
+      /* Second-pass only */
+      vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                   src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
+      return;
+  }
+
+  /* xoffset == 0 && yoffset == 0: the ssse3 second-pass-only kernel cannot
+   * handle this case correctly, so the 4x4 block is copied row by row to
+   * guarantee the six-tap entry point handles all possible offsets. */
+  for (row = 0; row < 4; row++)
+  {
+      for (col = 0; col < 4; col++)
+          dst_ptr[col] = src_ptr[col];
+
+      dst_ptr += dst_pitch;
+      src_ptr += src_pixels_per_line;
+  }
+}
+
+#endif
diff --git a/libs/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm b/libs/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm
new file mode 100644
index 0000000000..88a07b9f3f
--- /dev/null
+++ b/libs/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm
@@ -0,0 +1,1753 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_loop_filter_horizontal_edge_mmx
+;(
+;    unsigned char *src_ptr,      -- row holding q0; p0..p3 are read at -1..-4 pitches, q1..q3 at +1..+3
+;    int src_pixel_step,          -- row pitch, sign-extended into rax
+;    const char *blimit,          -- 8 bytes, loaded with movq
+;    const char *limit,           -- 8 bytes, loaded with movq
+;    const char *thresh,          -- 8 bytes, loaded with movq
+;    int  count                   -- loop iterations; each one filters 8 bytes along the edge
+;)
+global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 32                         ; reserve 32 bytes
+    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; used as the row pitch
+
+        movsxd      rcx, dword ptr arg(5) ;count
+.next8_h:
+        mov         rdx, arg(3) ;limit
+        movq        mm7, [rdx]
+        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+
+        ; calculate breakout conditions
+        movq        mm2, [rdi+2*rax]      ; q3
+        movq        mm1, [rsi+2*rax]      ; q2
+        movq        mm6, mm1              ; q2
+        psubusb     mm1, mm2              ; q2-=q3
+        psubusb     mm2, mm6              ; q3-=q2
+        por         mm1, mm2              ; abs(q3-q2)
+        psubusb     mm1, mm7              ; saturating minus limit: nonzero where abs(q3-q2) > limit
+
+
+        movq        mm4, [rsi+rax]        ; q1
+        movq        mm3, mm4              ; q1
+        psubusb     mm4, mm6              ; q1-=q2
+        psubusb     mm6, mm3              ; q2-=q1
+        por         mm4, mm6              ; abs(q2-q1)
+
+        psubusb     mm4, mm7              ; nonzero where abs(q2-q1) > limit
+        por        mm1, mm4               ; accumulate into the breakout mask
+
+        movq        mm4, [rsi]            ; q0
+        movq        mm0, mm4              ; q0
+        psubusb     mm4, mm3              ; q0-=q1
+        psubusb     mm3, mm0              ; q1-=q0
+        por         mm4, mm3              ; abs(q0-q1)
+        movq        t0, mm4               ; t0 = abs(q0-q1), reused for the high edge variance test
+        psubusb     mm4, mm7              ; nonzero where abs(q0-q1) > limit
+        por        mm1, mm4               ; accumulate into the breakout mask
+
+
+        neg         rax                   ; negate pitch to deal with above border
+
+        movq        mm2, [rsi+4*rax]      ; p3
+        movq        mm4, [rdi+4*rax]      ; p2
+        movq        mm5, mm4              ; p2
+        psubusb     mm4, mm2              ; p2-=p3
+        psubusb     mm2, mm5              ; p3-=p2
+        por         mm4, mm2              ; abs(p3 - p2)
+        psubusb     mm4, mm7              ; nonzero where abs(p3-p2) > limit
+        por        mm1, mm4               ; accumulate into the breakout mask
+
+
+        movq        mm4, [rsi+2*rax]      ; p1
+        movq        mm3, mm4              ; p1
+        psubusb     mm4, mm5              ; p1-=p2
+        psubusb     mm5, mm3              ; p2-=p1
+        por         mm4, mm5              ; abs(p2 - p1)
+        psubusb     mm4, mm7              ; nonzero where abs(p2-p1) > limit
+        por        mm1, mm4               ; accumulate into the breakout mask
+
+        movq        mm2, mm3              ; p1
+
+        movq        mm4, [rsi+rax]        ; p0
+        movq        mm5, mm4              ; p0
+        psubusb     mm4, mm3              ; p0-=p1
+        psubusb     mm3, mm5              ; p1-=p0
+        por         mm4, mm3              ; abs(p1 - p0)
+        movq        t1, mm4               ; t1 = abs(p1-p0), reused for the high edge variance test
+        psubusb     mm4, mm7              ; nonzero where abs(p1-p0) > limit
+        por        mm1, mm4               ; accumulate into the breakout mask
+
+        movq        mm3, [rdi]            ; q1
+        movq        mm4, mm3              ; q1
+        psubusb     mm3, mm2              ; q1-=p1
+        psubusb     mm2, mm4              ; p1-=q1
+        por         mm2, mm3              ; abs(p1-q1)
+        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
+        psrlw       mm2, 1                ; abs(p1-q1)/2
+
+        movq        mm6, mm5              ; p0
+        movq        mm3, [rsi]            ; q0
+        psubusb     mm5, mm3              ; p0-=q0
+        psubusb     mm3, mm6              ; q0-=p0
+        por         mm5, mm3              ; abs(p0 - q0)
+        paddusb     mm5, mm5              ; abs(p0-q0)*2
+        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm7, [rdx]            ; blimit
+
+        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,    mm5           ; mm1 is nonzero where any test above failed
+        pxor        mm5,    mm5           ; mm5 = 0
+        pcmpeqb     mm1,    mm5           ; mask mm1: 0xff in bytes where every test passed
+
+        ; calculate high edge variance
+        mov         rdx, arg(4) ;thresh           ; get thresh
+        movq        mm7, [rdx]            ;
+        movq        mm4, t0               ; get abs (q1 - q0)
+        psubusb     mm4, mm7
+        movq        mm3, t1               ; get abs (p1 - p0)
+        psubusb     mm3, mm7
+        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh   NOTE(review): wrapping byte add; the vertical-edge routine uses por here
+
+        pcmpeqb     mm4,        mm5       ; 0xff where the sum is zero (mm5 is still 0)
+
+        pcmpeqb     mm5,        mm5       ; mm5 = all ones
+        pxor        mm4,        mm5       ; invert: mm4 = high edge variance mask (hvm)
+
+
+        ; start work on filters
+        movq        mm2, [rsi+2*rax]      ; p1
+        movq        mm7, [rdi]            ; q1
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
+        pxor        mm6, [GLOBAL(t80)]    ; p0 (copied into mm6 above): offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; q0 (copied into mm0 above): offset to convert to signed values
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand        mm1, mm2                  ; mask filter values we don't care about
+        movq        mm2, mm1
+        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+        pxor        mm0, mm0             ; 0
+        pxor        mm5, mm5
+        punpcklbw   mm0, mm2            ; low 4 bytes of mm2 become the high byte of each word
+        punpckhbw   mm5, mm2            ; high 4 bytes of mm2 become the high byte of each word
+        psraw       mm0, 11             ; arithmetic shift by 8 + 3: sign-extended byte >> 3
+        psraw       mm5, 11
+        packsswb    mm0, mm5
+        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        pxor        mm0, mm0              ; 0
+        movq        mm5, mm1              ; abcdefgh
+        punpcklbw   mm0, mm1              ; e0f0g0h0
+        psraw       mm0, 11               ; sign extended shift right by 3
+        pxor        mm1, mm1              ; 0
+        punpckhbw   mm1, mm5              ; a0b0c0d0
+        psraw       mm1, 11               ; sign extended shift right by 3
+        movq        mm5, mm0              ; save results
+
+        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      mm5, [GLOBAL(ones)]
+        paddsw      mm1, [GLOBAL(ones)]
+        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
+        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
+        packsswb    mm5, mm1              ; (((3* (q0 - p0) + hvm(p1 - q1) + 4) >> 3) + 1) >> 1
+        pandn       mm4, mm5              ; high edge variance additive: kept only where hvm is clear
+
+        paddsb      mm6, mm2              ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+rax], mm6        ; write back
+
+        movq        mm6, [rsi+2*rax]      ; p1
+        pxor        mm6, [GLOBAL(t80)]    ; reoffset
+        paddsb      mm6, mm4              ; p1+= p1 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+2*rax], mm6      ; write back
+
+        psubsb      mm3, mm0              ; q0-= q0 add
+        pxor        mm3, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi], mm3            ; write back
+
+        psubsb      mm7, mm4              ; q1-= q1 add
+        pxor        mm7, [GLOBAL(t80)]    ; unoffset
+        movq        [rdi], mm7            ; write back
+
+        add         rsi,8                 ; advance 8 bytes along the edge
+        neg         rax                   ; pitch back to positive for the next iteration
+        dec         rcx
+        jnz         .next8_h
+
+    add rsp, 32                           ; release the t0/t1 scratch area
+    pop rsp                               ; presumably the rsp saved by ALIGN_STACK -- confirm in x86_abi_support.asm
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_loop_filter_vertical_edge_mmx
+;(
+;    unsigned char *src_ptr,      -- edge position in row 0; 8 rows x 8 bytes starting 4 bytes to its left are transposed
+;    int  src_pixel_step,         -- row pitch, sign-extended into rax
+;    const char *blimit,          -- 8 bytes, loaded with movq
+;    const char *limit,           -- 8 bytes, loaded with movq
+;    const char *thresh,          -- 8 bytes, loaded with movq
+;    int count                    -- loop iterations; each one filters 8 rows
+;)
+global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
+sym(vp8_loop_filter_vertical_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 64      ; reserve 64 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; used as the row pitch
+
+        lea         rsi,        [rsi + rax*4 - 4]                   ; rsi -> row 4, 4 bytes left of src_ptr
+
+        movsxd      rcx,        dword ptr arg(5) ;count
+.next8_v:
+        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
+        add         rdi,        rax
+
+
+        ;transpose
+        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
+        movq        mm7,        mm6                         ; 67 66 65 64 63 62 61 60
+
+        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
+        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
+
+        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
+        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
+
+        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
+        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40
+
+        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
+        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
+
+        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
+        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
+
+        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
+        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
+
+        neg         rax                                     ; negative pitch to reach rows 0..3
+        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
+
+        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
+        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 26 35 25 34 24
+
+        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
+        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
+
+        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
+        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
+
+        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
+        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
+
+        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
+        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
+
+        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
+
+        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
+        psubusb     mm5,        mm7                         ; q2-q3
+
+        psubusb     mm7,        mm6                         ; q3-q2
+        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
+
+        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
+        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
+
+        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
+        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
+
+        psubusb     mm3,        mm6                         ; q1-q2
+        psubusb     mm6,        mm5                         ; q2-q1
+
+        por         mm6,        mm3                         ; mm6=abs(q2-q1)
+        lea         rdx,        srct                        ; srct layout: +0 p1, +8 p0, +16 q0, +24 q1
+
+        movq        [rdx+24],   mm5                         ; save q1
+        movq        [rdx+16],   mm0                         ; save q0
+
+        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
+        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
+
+        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
+        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
+
+        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
+        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
+
+        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
+        psubusb     mm2,        mm0                         ; p2-p3
+
+        psubusb     mm0,        mm1                         ; p3-p2
+        por         mm0,        mm2                         ; mm0=abs(p3-p2)
+
+        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
+        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
+
+        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
+        movq        [rdx+8],    mm3                         ; save p0
+
+        movq        [rdx],      mm2                         ; save p1
+        movq        mm5,        mm2                         ; mm5 = p1
+
+        psubusb     mm2,        mm1                         ; p1-p2
+        psubusb     mm1,        mm5                         ; p2-p1
+
+        por         mm1,        mm2                         ; mm1=abs(p2-p1)
+        mov         rdx,        arg(3) ;limit
+
+        movq        mm4,        [rdx]                       ; mm4 = limit
+        psubusb     mm7,        mm4
+
+        psubusb     mm0,        mm4
+        psubusb     mm1,        mm4
+
+        psubusb     mm6,        mm4
+        por         mm7,        mm6
+
+        por         mm0,        mm1
+        por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+        movq        mm1,        mm5                         ; p1
+
+        movq        mm7,        mm3                         ; mm3=mm7=p0
+        psubusb     mm7,        mm5                         ; p0 - p1
+
+        psubusb     mm5,        mm3                         ; p1 - p0
+        por         mm5,        mm7                         ; abs(p1-p0)
+
+        movq        t0,         mm5                         ; save abs(p1-p0)
+        lea         rdx,        srct
+
+        psubusb     mm5,        mm4
+        por         mm0,        mm5                         ; mm0=mask
+
+        movq        mm5,        [rdx+16]                    ; mm5=q0
+        movq        mm7,        [rdx+24]                    ; mm7=q1
+
+        movq        mm6,        mm5                         ; mm6=q0
+        movq        mm2,        mm7                         ; q1
+        psubusb     mm5,        mm7                         ; q0-q1
+
+        psubusb     mm7,        mm6                         ; q1-q0
+        por         mm7,        mm5                         ; abs(q1-q0)
+
+        movq        t1,         mm7                         ; save abs(q1-q0)
+        psubusb     mm7,        mm4
+
+        por         mm0,        mm7                         ; mask
+
+        movq        mm5,        mm2                         ; q1
+        psubusb     mm5,        mm1                         ; q1-=p1
+        psubusb     mm1,        mm2                         ; p1-=q1
+        por         mm5,        mm1                         ; abs(p1-q1)
+        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
+        psrlw       mm5,        1                           ; abs(p1-q1)/2
+
+        mov         rdx,        arg(2) ;blimit                      ;
+
+        movq        mm4,        [rdx]                       ;blimit
+        movq        mm1,        mm3                         ; mm1=mm3=p0
+
+        movq        mm7,        mm6                         ; mm7=mm6=q0
+        psubusb     mm1,        mm7                         ; p0-q0
+
+        psubusb     mm7,        mm3                         ; q0-p0
+        por         mm1,        mm7                         ; abs(q0-p0)
+        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
+        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,        mm0;                        ; mask
+
+        pxor        mm0,        mm0                         ; mm0 = 0
+        pcmpeqb     mm1,        mm0                         ; mm1 = 0xff in bytes where every test passed
+
+        ; calculate high edge variance
+        mov         rdx,        arg(4) ;thresh            ; get thresh
+        movq        mm7,        [rdx]
+        ;
+        movq        mm4,        t0              ; get abs (p1 - p0) (stored in t0 above)
+        psubusb     mm4,        mm7
+
+        movq        mm3,        t1              ; get abs (q1 - q0) (stored in t1 above)
+        psubusb     mm3,        mm7
+
+        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+        pcmpeqb     mm4,        mm0             ; 0xff where neither difference exceeds thresh
+
+        pcmpeqb     mm0,        mm0             ; mm0 = all ones
+        pxor        mm4,        mm0             ; invert: mm4 = high edge variance mask (hvm)
+
+
+
+        ; start work on filters
+        lea         rdx,        srct
+
+        movq        mm2,        [rdx]           ; p1
+        movq        mm7,        [rdx+24]        ; q1
+
+        movq        mm6,        [rdx+8]         ; p0
+        movq        mm0,        [rdx+16]        ; q0
+
+        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
+        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values
+
+        psubsb      mm2,        mm7             ; p1 - q1
+        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)
+
+        pxor        mm6,        [GLOBAL(t80)]   ; p0: offset to convert to signed values
+        pxor        mm0,        [GLOBAL(t80)]   ; q0: offset to convert to signed values
+
+        movq        mm3,        mm0             ; q0
+        psubsb      mm0,        mm6             ; q0 - p0
+
+        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand       mm1,        mm2              ; mask filter values we don't care about
+
+        movq        mm2,        mm1
+        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+        pxor        mm0,        mm0          ; 0
+
+        pxor        mm5,        mm5
+        punpcklbw   mm0,        mm2         ; low 4 bytes of mm2 become the high byte of each word
+
+        punpckhbw   mm5,        mm2         ; high 4 bytes of mm2 become the high byte of each word
+        psraw       mm0,        11              ; arithmetic shift by 8 + 3: sign-extended byte >> 3
+
+        psraw       mm5,        11
+        packsswb    mm0,        mm5
+
+        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        pxor        mm0,        mm0           ; 0
+        movq        mm5,        mm1           ; abcdefgh
+
+        punpcklbw   mm0,        mm1           ; e0f0g0h0
+        psraw       mm0,        11                ; sign extended shift right by 3
+
+        pxor        mm1,        mm1           ; 0
+        punpckhbw   mm1,        mm5           ; a0b0c0d0
+
+        psraw       mm1,        11                ; sign extended shift right by 3
+        movq        mm5,        mm0              ; save results
+
+        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      mm5,        [GLOBAL(ones)]
+
+        paddsw      mm1,        [GLOBAL(ones)]
+        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap
+
+        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
+        packsswb    mm5,        mm1           ; (((3* (q0 - p0) + hvm(p1 - q1) + 4) >> 3) + 1) >> 1
+
+        pandn       mm4,        mm5             ; high edge variance additive: kept only where hvm is clear
+
+        paddsb      mm6,        mm2             ; p0+= p0 add
+        pxor        mm6,        [GLOBAL(t80)]   ; unoffset
+
+        ; mm6=p0                               ;
+        movq        mm1,        [rdx]           ; p1
+        pxor        mm1,        [GLOBAL(t80)]   ; reoffset
+
+        paddsb      mm1,        mm4                 ; p1+= p1 add
+        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
+        ; mm6 = p0 mm1 = p1
+
+        psubsb      mm3,        mm0                 ; q0-= q0 add
+        pxor        mm3,        [GLOBAL(t80)]       ; unoffset
+
+        ; mm3 = q0
+        psubsb      mm7,        mm4                 ; q1-= q1 add
+        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
+        ; mm7 = q1
+
+        ; transpose and write back
+        ; mm1 =    72 62 52 42 32 22 12 02
+        ; mm6 =    73 63 53 43 33 23 13 03
+        ; mm3 =    74 64 54 44 34 24 14 04
+        ; mm7 =    75 65 55 45 35 25 15 05
+
+        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
+        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02
+
+        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
+        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42
+
+        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
+        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44
+
+        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
+        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02
+
+        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
+        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42
+
+        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
+        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62
+
+
+        ; mm2 = 15 14 13 12 05 04 03 02
+        ; mm6 = 35 34 33 32 25 24 23 22
+        ; mm1 = 55 54 53 52 45 44 43 42
+        ; mm5 = 75 74 73 72 65 64 63 62
+
+
+
+        movd        [rsi+rax*4+2], mm2          ; row 0, columns 2..5 (rax is still the negated pitch)
+        psrlq       mm2,        32
+
+        movd        [rdi+rax*4+2], mm2          ; row 1
+        movd        [rsi+rax*2+2], mm6          ; row 2
+
+        psrlq       mm6,        32
+        movd        [rsi+rax+2],mm6             ; row 3
+
+        movd        [rsi+2],    mm1             ; row 4
+        psrlq       mm1,        32
+
+        movd        [rdi+2],    mm1             ; row 5
+        neg         rax                         ; pitch back to positive
+
+        movd        [rdi+rax+2],mm5             ; row 6
+        psrlq       mm5,        32
+
+        movd        [rdi+rax*2+2], mm5          ; row 7
+
+        lea         rsi,        [rsi+rax*8]     ; advance 8 rows for the next iteration
+        dec         rcx
+        jnz         .next8_v
+
+    add rsp, 64                                 ; release the t0/t1/srct scratch area
+    pop rsp                                     ; presumably the rsp saved by ALIGN_STACK -- confirm in x86_abi_support.asm
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_mbloop_filter_horizontal_edge_mmx   ; filters p2..q2 across a horizontal edge, 8 columns per pass
+;(
+;    unsigned char *src_ptr,      ; row q0 (first row below the edge); rows p3..q3 are read
+;    int  src_pixel_step,         ; byte stride between rows
+;    const char *blimit,          ; 8 bytes read: bound on abs(p0-q0)*2 + abs(p1-q1)/2
+;    const char *limit,           ; 8 bytes read: bound on neighbouring-pixel differences
+;    const char *thresh,          ; 8 bytes read: high edge variance threshold
+;    int count                    ; number of 8-column passes
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 32      ; reserve 32 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; row stride in bytes, sign-extended
+
+        movsxd      rcx, dword ptr arg(5) ;count
+.next8_mbh:
+        mov         rdx, arg(3) ;limit
+        movq        mm7, [rdx]
+        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+
+        ; calculate breakout conditions
+        movq        mm2, [rdi+2*rax]      ; q3
+
+        movq        mm1, [rsi+2*rax]      ; q2
+        movq        mm6, mm1              ; q2
+        psubusb     mm1, mm2              ; q2-=q3
+        psubusb     mm2, mm6              ; q3-=q2
+        por         mm1, mm2              ; abs(q3-q2)
+        psubusb     mm1, mm7              ; nonzero where abs(q3-q2) > limit
+
+
+        ; mm1 = mask so far (nonzero = over limit), mm6 = q2, mm7 = limit
+        movq        mm4, [rsi+rax]        ; q1
+        movq        mm3, mm4              ; q1
+        psubusb     mm4, mm6              ; q1-=q2
+        psubusb     mm6, mm3              ; q2-=q1
+        por         mm4, mm6              ; abs(q2-q1)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+
+        ; mm1 = mask,      mm3=q1, mm7 = limit
+
+        movq        mm4, [rsi]            ; q0
+        movq        mm0, mm4              ; q0
+        psubusb     mm4, mm3              ; q0-=q1
+        psubusb     mm3, mm0              ; q1-=q0
+        por         mm4, mm3              ; abs(q0-q1)
+        movq        t0, mm4               ; save to t0
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+
+        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
+
+        neg         rax                   ; negate pitch to deal with above border
+
+        movq        mm2, [rsi+4*rax]      ; p3
+        movq        mm4, [rdi+4*rax]      ; p2
+        movq        mm5, mm4              ; p2
+        psubusb     mm4, mm2              ; p2-=p3
+        psubusb     mm2, mm5              ; p3-=p2
+        por         mm4, mm2              ; abs(p3 - p2)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
+
+        movq        mm4, [rsi+2*rax]      ; p1
+        movq        mm3, mm4              ; p1
+        psubusb     mm4, mm5              ; p1-=p2
+        psubusb     mm5, mm3              ; p2-=p1
+        por         mm4, mm5              ; abs(p2 - p1)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm2, mm3              ; p1
+
+
+        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
+
+        movq        mm4, [rsi+rax]        ; p0
+        movq        mm5, mm4              ; p0
+        psubusb     mm4, mm3              ; p0-=p1
+        psubusb     mm3, mm5              ; p1-=p0
+        por         mm4, mm3              ; abs(p1 - p0)
+        movq        t1, mm4               ; save to t1
+        psubusb     mm4, mm7
+        por        mm1, mm4
+        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+        ; mm5 = p0
+        movq        mm3, [rdi]            ; q1
+        movq        mm4, mm3              ; q1
+        psubusb     mm3, mm2              ; q1-=p1
+        psubusb     mm2, mm4              ; p1-=q1
+        por         mm2, mm3              ; abs(p1-q1)
+        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero so the word shift cannot leak a bit into the byte below
+        psrlw       mm2, 1                ; abs(p1-q1)/2
+
+        movq        mm6, mm5              ; p0
+        movq        mm3, mm0              ; q0
+        psubusb     mm5, mm3              ; p0-=q0
+        psubusb     mm3, mm6              ; q0-=p0
+        por         mm5, mm3              ; abs(p0 - q0)
+        paddusb     mm5, mm5              ; abs(p0-q0)*2
+        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm7, [rdx]            ; blimit
+
+        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,    mm5
+        pxor        mm5,    mm5
+        pcmpeqb     mm1,    mm5           ; mask mm1: 0xff where every test passed, 0x00 elsewhere
+
+        ; mm1 = mask, mm0=q0,  mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+        ; mm6 = p0,
+
+        ; calculate high edge variance
+        mov         rdx, arg(4) ;thresh           ; get thresh
+        movq        mm7, [rdx]            ; mm7 = thresh
+        movq        mm4, t0               ; get abs (q1 - q0)
+        psubusb     mm4, mm7
+        movq        mm3, t1               ; get abs (p1 - p0)
+        psubusb     mm3, mm7
+        por         mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh (por, not paddb: a wrapping add of two nonzero bytes can yield 0)
+
+        pcmpeqb     mm4,        mm5       ; 0xff where neither difference exceeds thresh (mm5 is still zero)
+
+        pcmpeqb     mm5,        mm5       ; all ones
+        pxor        mm4,        mm5       ; invert: mm4 = hev
+
+
+
+        ; mm1 = mask, mm0=q0,  mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
+        ; mm6 = p0, mm4=hev
+        ; start work on filters
+        movq        mm2, [rsi+2*rax]      ; p1
+        movq        mm7, [rdi]            ; q1
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+
+        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
+        paddsb      mm2, mm0              ; 2 * (q0 - p0) + (p1 - q1)
+        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
+        pand        mm1, mm2              ; mask filter values we don't care about
+
+
+        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+        movq        mm2, mm1              ; vp8_filter
+        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
+
+        movq        mm5,        mm2       ; copy of Filter2
+        paddsb      mm5,        [GLOBAL(t3)]; saturating add of t3 (presumably Filter2 + 3 -- confirm against the t3 definition)
+
+        pxor        mm0, mm0              ; 0
+        pxor        mm7, mm7              ; 0
+
+        punpcklbw   mm0, mm5              ; e0f0g0h0 (each byte lands in the high half of a word)
+        psraw       mm0, 11               ; sign extended shift right by 3 (8 to undo the placement, plus 3)
+        punpckhbw   mm7, mm5              ; a0b0c0d0
+        psraw       mm7, 11               ; sign extended shift right by 3
+        packsswb    mm0, mm7              ; Filter2 = (Filter2 + t3) >> 3
+
+        movq        mm5, mm0              ; Filter2
+
+        paddsb      mm2, [GLOBAL(t4)]     ; vp8_signed_char_clamp(Filter2 + 4)
+        pxor        mm0, mm0              ; 0
+        pxor        mm7, mm7              ; 0
+
+        punpcklbw   mm0, mm2              ; e0f0g0h0
+        psraw       mm0, 11               ; sign extended shift right by 3
+        punpckhbw   mm7, mm2              ; a0b0c0d0
+        psraw       mm7, 11               ; sign extended shift right by 3
+        packsswb    mm0, mm7              ; Filter1 = vp8_signed_char_clamp(Filter2 + 4) >> 3
+
+        ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2, mm4 = hev, mm6 = ps0
+        psubsb      mm3, mm0              ; qs0 =qs0 - filter1
+        paddsb      mm6, mm5              ; ps0 =ps0 + Filter2
+
+        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+        ; vp8_filter &= ~hev;
+        ; Filter2 = vp8_filter;
+        pandn       mm4, mm1              ; vp8_filter&=~hev
+
+
+        ; mm3=qs0, mm4=filter2, mm6=ps0
+
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+        ; s = vp8_signed_char_clamp(qs0 - u);
+        ; *oq0 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps0 + u);
+        ; *op0 = s^0x80;
+        pxor        mm0, mm0              ; NOTE(review): mm0 is not read again before it is reloaded next pass
+
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4              ; Filter2 bytes in the high half of each word
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s27)]    ; keeps the high word of the product; presumably s27 = 27 << 8 -- confirm
+        pmulhw      mm2, [GLOBAL(s27)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2              ; u
+
+        psubsb      mm3, mm1              ; qs0 - u
+        paddsb      mm6, mm1              ; ps0 + u
+
+        pxor        mm3, [GLOBAL(t80)]
+        pxor        mm6, [GLOBAL(t80)]
+        movq        [rsi+rax], mm6        ; write p0 (rax is -stride here)
+        movq        [rsi],     mm3        ; write q0
+
+        ; roughly 2/7th difference across boundary
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+        ; s = vp8_signed_char_clamp(qs1 - u);
+        ; *oq1 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps1 + u);
+        ; *op1 = s^0x80;
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s18)]
+        pmulhw      mm2, [GLOBAL(s18)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2
+
+        movq        mm3, [rdi]             ; q1
+        movq        mm6, [rsi+rax*2]       ; p1
+
+        pxor        mm3, [GLOBAL(t80)]
+        pxor        mm6, [GLOBAL(t80)]
+
+        paddsb      mm6, mm1
+        psubsb      mm3, mm1
+
+        pxor        mm6, [GLOBAL(t80)]
+        pxor        mm3, [GLOBAL(t80)]
+        movq        [rdi], mm3             ; write q1
+        movq        [rsi+rax*2], mm6       ; write p1
+
+        ; roughly 1/7th difference across boundary
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+        ; s = vp8_signed_char_clamp(qs2 - u);
+        ; *oq2 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps2 + u);
+        ; *op2 = s^0x80;
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s9)]
+        pmulhw      mm2, [GLOBAL(s9)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2
+
+
+        movq        mm6, [rdi+rax*4]       ; p2 (rax is -stride here)
+        neg         rax
+        movq        mm3, [rdi+rax  ]       ; q2 (rax is +stride here)
+
+        pxor        mm6, [GLOBAL(t80)]
+        pxor        mm3, [GLOBAL(t80)]
+
+        paddsb      mm6, mm1
+        psubsb      mm3, mm1
+
+        pxor        mm6, [GLOBAL(t80)]
+        pxor        mm3, [GLOBAL(t80)]
+        movq        [rdi+rax  ], mm3       ; write q2
+        neg         rax
+        movq        [rdi+rax*4], mm6       ; write p2
+
+;EARLY_BREAK_OUT:
+        neg         rax                    ; back to +stride for the next pass
+        add         rsi,8                  ; next 8 columns
+        dec         rcx
+        jnz         .next8_mbh
+
+    add rsp, 32                            ; release t0/t1
+    pop rsp                                ; presumably restores the rsp saved by ALIGN_STACK -- confirm against the macro
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_mbloop_filter_vertical_edge_mmx   ; filters p2..q2 across a vertical edge, 8 rows per pass
+;(
+;    unsigned char *src_ptr,      ; column q0 of the first row; 4 bytes to the left (p3..p0) are also read
+;    int  src_pixel_step,         ; byte stride between rows
+;    const char *blimit,          ; 8 bytes read: bound on abs(p0-q0)*2 + abs(p1-q1)/2
+;    const char *limit,           ; 8 bytes read: bound on neighbouring-pixel differences
+;    const char *thresh,          ; 8 bytes read: high edge variance threshold
+;    int count                    ; number of 8-row passes
+;)
+global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 96      ; reserve 96 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[64]; transposed columns p3,p2,p1,p0,q0,q1,q2,q3 at +0,+8,...,+56
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; row stride in bytes, sign-extended
+
+        lea         rsi,        [rsi + rax*4 - 4]   ; rsi = row 4 of the 8-row group, 4 bytes left of src_ptr
+
+        movsxd      rcx,        dword ptr arg(5) ;count
+.next8_mbv:
+        lea         rdi,        [rsi + rax]  ; rdi points to row +1 for indirect addressing
+
+        ;transpose (comment digits are row,column of the 8x8 block)
+        movq        mm0,        [rdi+2*rax]                 ; 77 76 75 74 73 72 71 70
+        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
+
+        movq        mm7,        mm6                         ; 67 66 65 64 63 62 61 60
+        punpckhbw   mm7,        mm0                         ; 77 67 76 66 75 65 74 64
+
+        punpcklbw   mm6,        mm0                         ; 73 63 72 62 71 61 70 60
+        movq        mm0,        [rsi+rax]                   ; 57 56 55 54 53 52 51 50
+
+        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
+        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
+
+        punpckhbw   mm5,        mm0                         ; 57 47 56 46 55 45 54 44
+        punpcklbw   mm4,        mm0                         ; 53 43 52 42 51 41 50 40
+
+        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
+        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
+
+        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
+        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
+
+        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
+        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
+
+        neg         rax                                     ; rax = -stride to reach rows 0..3
+
+        movq        mm7,        [rsi+rax]                   ; 37 36 35 34 33 32 31 30
+        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
+
+        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
+        punpckhbw   mm6,        mm7                         ; 37 27 36 26 35 25 34 24
+
+        punpcklbw   mm1,        mm7                         ; 33 23 32 22 31 21 30 20
+
+        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
+        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
+
+        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
+        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
+
+        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
+        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
+
+        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
+        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
+
+        lea         rdx,        srct
+        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
+
+        movq        [rdx+56],   mm7                         ; save q3
+        psubusb     mm5,        mm7                         ; q2-q3
+
+
+        movq        [rdx+48],   mm6                         ; save q2
+        psubusb     mm7,        mm6                         ; q3-q2
+
+        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
+        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
+
+        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
+        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
+
+        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
+        psubusb     mm3,        mm6                         ; q1-q2
+
+        psubusb     mm6,        mm5                         ; q2-q1
+        por         mm6,        mm3                         ; mm6=abs(q2-q1)
+
+        movq        [rdx+40],   mm5                         ; save q1
+        movq        [rdx+32],   mm0                         ; save q0
+
+        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
+        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
+
+        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
+        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
+
+        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
+        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
+
+        movq        [rdx],      mm0                         ; save p3
+        movq        [rdx+8],    mm1                         ; save p2
+
+        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
+        psubusb     mm2,        mm0                         ; p2-p3
+
+        psubusb     mm0,        mm1                         ; p3-p2
+        por         mm0,        mm2                         ; mm0=abs(p3-p2)
+
+        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
+        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
+
+        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
+        movq        [rdx+24],   mm3                         ; save p0
+
+        movq        [rdx+16],   mm2                         ; save p1
+        movq        mm5,        mm2                         ; mm5 = p1
+
+        psubusb     mm2,        mm1                         ; p1-p2
+        psubusb     mm1,        mm5                         ; p2-p1
+
+        por         mm1,        mm2                         ; mm1=abs(p2-p1)
+        mov         rdx,        arg(3) ;limit
+
+        movq        mm4,        [rdx]                       ; mm4 = limit
+        psubusb     mm7,        mm4                         ; abs(q3-q2) > limit
+
+        psubusb     mm0,        mm4                         ; abs(p3-p2) > limit
+        psubusb     mm1,        mm4                         ; abs(p2-p1) > limit
+
+        psubusb     mm6,        mm4                         ; abs(q2-q1) > limit
+        por         mm7,        mm6                         ; or
+
+        por         mm0,        mm1                         ;
+        por         mm0,        mm7                         ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+        movq        mm1,        mm5                         ; p1
+
+        movq        mm7,        mm3                         ; mm3=mm7=p0
+        psubusb     mm7,        mm5                         ; p0 - p1
+
+        psubusb     mm5,        mm3                         ; p1 - p0
+        por         mm5,        mm7                         ; abs(p1-p0)
+
+        movq        t0,         mm5                         ; save abs(p1-p0)
+        lea         rdx,        srct
+
+        psubusb     mm5,        mm4                         ; mm5 = abs(p1-p0) > limit
+        por         mm0,        mm5                         ; mm0=mask
+
+        movq        mm5,        [rdx+32]                    ; mm5=q0
+        movq        mm7,        [rdx+40]                    ; mm7=q1
+
+        movq        mm6,        mm5                         ; mm6=q0
+        movq        mm2,        mm7                         ; q1
+        psubusb     mm5,        mm7                         ; q0-q1
+
+        psubusb     mm7,        mm6                         ; q1-q0
+        por         mm7,        mm5                         ; abs(q1-q0)
+
+        movq        t1,         mm7                         ; save abs(q1-q0)
+        psubusb     mm7,        mm4                         ; mm7=abs(q1-q0)> limit
+
+        por         mm0,        mm7                         ; mask
+
+        movq        mm5,        mm2                         ; q1
+        psubusb     mm5,        mm1                         ; q1-=p1
+        psubusb     mm1,        mm2                         ; p1-=q1
+        por         mm5,        mm1                         ; abs(p1-q1)
+        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero so the word shift cannot leak a bit into the byte below
+        psrlw       mm5,        1                           ; abs(p1-q1)/2
+
+        mov         rdx,        arg(2) ;blimit                      ;
+
+        movq        mm4,        [rdx]                       ;blimit
+        movq        mm1,        mm3                         ; mm1=mm3=p0
+
+        movq        mm7,        mm6                         ; mm7=mm6=q0
+        psubusb     mm1,        mm7                         ; p0-q0
+
+        psubusb     mm7,        mm3                         ; q0-p0
+        por         mm1,        mm7                         ; abs(q0-p0)
+        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
+        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,        mm0;                        ; mask
+
+        pxor        mm0,        mm0
+        pcmpeqb     mm1,        mm0                         ; mm1 = mask: 0xff where every test passed, 0x00 elsewhere
+
+        ; calculate high edge variance
+        mov         rdx,        arg(4) ;thresh            ; get thresh
+        movq        mm7,        [rdx]
+        ;
+        movq        mm4,        t0              ; get abs (p1 - p0)
+        psubusb     mm4,        mm7             ; abs(p1 - p0) > thresh
+
+        movq        mm3,        t1              ; get abs (q1 - q0)
+        psubusb     mm3,        mm7             ; abs(q1 - q0) > thresh
+
+        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+        pcmpeqb     mm4,        mm0             ; 0xff where neither difference exceeds thresh
+
+        pcmpeqb     mm0,        mm0             ; all ones
+        pxor        mm4,        mm0             ; invert: mm4 = hev
+
+
+
+
+        ; start work on filters
+        lea         rdx,        srct
+
+        ; vp8_filter = (3 * (qs0 - ps0) + (ps1 - qs1)) & mask, using saturating byte arithmetic
+        movq        mm2, [rdx+16]         ; p1
+        movq        mm7, [rdx+40]         ; q1
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+
+        movq        mm6, [rdx+24]         ; p0
+        movq        mm0, [rdx+32]         ; q0
+        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
+
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
+        paddsb      mm2, mm0              ; 2 * (q0 - p0) + (p1 - q1)
+        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
+        pand       mm1, mm2           ; mask filter values we don't care about
+
+        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+        movq        mm2, mm1              ; vp8_filter
+        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
+
+        movq        mm5,        mm2       ; copy of Filter2
+        paddsb      mm5,        [GLOBAL(t3)]; saturating add of t3 (presumably Filter2 + 3 -- confirm against the t3 definition)
+
+        pxor        mm0, mm0              ; 0
+        pxor        mm7, mm7              ; 0
+
+        punpcklbw   mm0, mm5              ; e0f0g0h0 (each byte lands in the high half of a word)
+        psraw       mm0, 11               ; sign extended shift right by 3 (8 to undo the placement, plus 3)
+        punpckhbw   mm7, mm5              ; a0b0c0d0
+        psraw       mm7, 11               ; sign extended shift right by 3
+        packsswb    mm0, mm7              ; Filter2 = (Filter2 + t3) >> 3
+
+        movq        mm5, mm0              ; Filter2
+
+        paddsb      mm2, [GLOBAL(t4)]     ; vp8_signed_char_clamp(Filter2 + 4)
+        pxor        mm0, mm0              ; 0
+        pxor        mm7, mm7              ; 0
+
+        punpcklbw   mm0, mm2              ; e0f0g0h0
+        psraw       mm0, 11               ; sign extended shift right by 3
+        punpckhbw   mm7, mm2              ; a0b0c0d0
+        psraw       mm7, 11               ; sign extended shift right by 3
+        packsswb    mm0, mm7              ; Filter1 = vp8_signed_char_clamp(Filter2 + 4) >> 3
+
+        ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2, mm4 = hev, mm6 = ps0
+        psubsb      mm3, mm0              ; qs0 =qs0 - filter1
+        paddsb      mm6, mm5              ; ps0 =ps0 + Filter2
+
+        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+        ; vp8_filter &= ~hev;
+        ; Filter2 = vp8_filter;
+        pandn       mm4, mm1              ; vp8_filter&=~hev
+
+
+        ; mm3=qs0, mm4=filter2, mm6=ps0
+
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+        ; s = vp8_signed_char_clamp(qs0 - u);
+        ; *oq0 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps0 + u);
+        ; *op0 = s^0x80;
+        pxor        mm0, mm0              ; NOTE(review): mm0 is not read again before it is reloaded from srct below
+
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4              ; Filter2 bytes in the high half of each word
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s27)]    ; keeps the high word of the product; presumably s27 = 27 << 8 -- confirm
+        pmulhw      mm2, [GLOBAL(s27)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2              ; u
+
+        psubsb      mm3, mm1              ; qs0 - u
+        paddsb      mm6, mm1              ; ps0 + u
+
+        pxor        mm3, [GLOBAL(t80)]
+        pxor        mm6, [GLOBAL(t80)]
+        movq        [rdx+24], mm6         ; filtered p0 back into srct
+        movq        [rdx+32], mm3         ; filtered q0 back into srct
+
+        ; roughly 2/7th difference across boundary
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+        ; s = vp8_signed_char_clamp(qs1 - u);
+        ; *oq1 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps1 + u);
+        ; *op1 = s^0x80;
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s18)]
+        pmulhw      mm2, [GLOBAL(s18)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2
+
+        movq        mm3, [rdx + 40]       ; q1
+        movq        mm6, [rdx + 16]       ; p1
+        pxor        mm3, [GLOBAL(t80)]
+        pxor        mm6, [GLOBAL(t80)]
+
+        paddsb      mm6, mm1
+        psubsb      mm3, mm1
+
+        pxor        mm6, [GLOBAL(t80)]
+        pxor        mm3, [GLOBAL(t80)]
+        movq        [rdx + 40], mm3       ; filtered q1 back into srct
+        movq        [rdx + 16], mm6       ; filtered p1 back into srct
+
+        ; roughly 1/7th difference across boundary
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+        ; s = vp8_signed_char_clamp(qs2 - u);
+        ; *oq2 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps2 + u);
+        ; *op2 = s^0x80;
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s9)]
+        pmulhw      mm2, [GLOBAL(s9)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2
+
+        movq        mm6, [rdx+ 8]         ; p2
+        movq        mm3, [rdx+48]         ; q2
+
+        pxor        mm6, [GLOBAL(t80)]
+        pxor        mm3, [GLOBAL(t80)]
+
+        paddsb      mm6, mm1
+        psubsb      mm3, mm1
+
+        pxor        mm6, [GLOBAL(t80)]          ; mm6 = 71 61 51 41 31 21 11 01
+        pxor        mm3, [GLOBAL(t80)]          ; mm3 = 76 66 56 46 36 26 16 06
+
+        ; transpose and write back
+        movq        mm0,    [rdx]               ; mm0 = 70 60 50 40 30 20 10 00
+        movq        mm1,    mm0                 ; mm1 = 70 60 50 40 30 20 10 00
+
+        punpcklbw   mm0,    mm6                 ; mm0 = 31 30 21 20 11 10 01 00
+        punpckhbw   mm1,    mm6                 ; mm1 = 71 70 61 60 51 50 41 40
+
+        movq        mm2,    [rdx+16]            ; mm2 = 72 62 52 42 32 22 12 02
+        movq        mm6,    mm2                 ; mm6 = 72 62 52 42 32 22 12 02
+
+        punpcklbw   mm2,    [rdx+24]            ; mm2 = 33 32 23 22 13 12 03 02
+        punpckhbw   mm6,    [rdx+24]            ; mm6 = 73 72 63 62 53 52 43 42
+
+        movq        mm5,    mm0                 ; mm5 = 31 30 21 20 11 10 01 00
+        punpcklwd   mm0,    mm2                 ; mm0 = 13 12 11 10 03 02 01 00
+
+        punpckhwd   mm5,    mm2                 ; mm5 = 33 32 31 30 23 22 21 20
+        movq        mm4,    mm1                 ; mm4 = 71 70 61 60 51 50 41 40
+
+        punpcklwd   mm1,    mm6                 ; mm1 = 53 52 51 50 43 42 41 40
+        punpckhwd   mm4,    mm6                 ; mm4 = 73 72 71 70 63 62 61 60
+
+        movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04
+        punpcklbw   mm2,    [rdx+40]            ; mm2 = 35 34 25 24 15 14 05 04
+
+        movq        mm6,    mm3                 ; mm6 = 76 66 56 46 36 26 16 06
+        punpcklbw   mm6,    [rdx+56]            ; mm6 = 37 36 27 26 17 16 07 06
+
+        movq        mm7,    mm2                 ; mm7 = 35 34 25 24 15 14 05 04
+        punpcklwd   mm2,    mm6                 ; mm2 = 17 16 15 14 07 06 05 04
+
+        punpckhwd   mm7,    mm6                 ; mm7 = 37 36 35 34 27 26 25 24
+        movq        mm6,    mm0                 ; mm6 = 13 12 11 10 03 02 01 00
+
+        punpckldq   mm0,    mm2                 ; mm0 = 07 06 05 04 03 02 01 00
+        punpckhdq   mm6,    mm2                 ; mm6 = 17 16 15 14 13 12 11 10
+
+        movq        [rsi+rax*4], mm0            ; write out row 0 (rax is -stride here)
+        movq        [rdi+rax*4], mm6            ; write out row 1
+
+        movq        mm0,    mm5                 ; mm0 = 33 32 31 30 23 22 21 20
+        punpckldq   mm0,    mm7                 ; mm0 = 27 26 25 24 23 22 21 20
+
+        punpckhdq   mm5,    mm7                 ; mm5 = 37 36 35 34 33 32 31 30
+        movq        [rsi+rax*2], mm0            ; write out row 2
+
+        movq        [rdi+rax*2], mm5            ; write out row 3
+        movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04
+
+        punpckhbw   mm2,    [rdx+40]            ; mm2 = 75 74 65 64 55 54 45 44
+        punpckhbw   mm3,    [rdx+56]            ; mm3 = 77 76 67 66 57 56 47 46
+
+        movq        mm5,    mm2                 ; mm5 = 75 74 65 64 55 54 45 44
+        punpcklwd   mm2,    mm3                 ; mm2 = 57 56 55 54 47 46 45 44
+
+        punpckhwd   mm5,    mm3                 ; mm5 = 77 76 75 74 67 66 65 64
+        movq        mm0,    mm1                 ; mm0=  53 52 51 50 43 42 41 40
+
+        movq        mm3,    mm4                 ; mm3 = 73 72 71 70 63 62 61 60
+        punpckldq   mm0,    mm2                 ; mm0 = 47 46 45 44 43 42 41 40
+
+        punpckhdq   mm1,    mm2                 ; mm1 = 57 56 55 54 53 52 51 50
+        movq        [rsi],  mm0                 ; write out row 4
+
+        movq        [rdi],  mm1                 ; write out row 5
+        neg         rax                         ; back to +stride
+
+        punpckldq   mm3,    mm5                 ; mm3 = 67 66 65 64 63 62 61 60
+        punpckhdq   mm4,    mm5                 ; mm4 = 77 76 75 74 73 72 71 70
+
+        movq        [rsi+rax*2], mm3            ; write out row 6
+        movq        [rdi+rax*2], mm4            ; write out row 7
+
+        lea         rsi,        [rsi+rax*8]     ; advance 8 rows for the next pass
+        dec         rcx
+
+        jnz         .next8_mbv
+
+    add rsp, 96                                 ; release t0/t1/srct
+    pop rsp                                     ; presumably restores the rsp saved by ALIGN_STACK -- confirm against the macro
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+; Simple loop filter across a horizontal edge: 2 passes (rcx) of 8 pixels; rows p1,p0 are above src_ptr, q0,q1 at/below it; only p0 and q0 are rewritten.
+;void vp8_loop_filter_simple_horizontal_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_horizontal_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; row pitch in bytes, sign-extended
+
+        mov         rcx, 2                ; count: 2 passes of 8 pixels
+.nexts8_h:
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm3, [rdx]            ; mm3 = 8 bytes of blimit
+
+        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+        neg         rax                   ; rax = -pitch: [rsi+rax] is now the row above
+
+        ; calculate mask
+        movq        mm1, [rsi+2*rax]      ; p1
+        movq        mm0, [rdi]            ; q1
+        movq        mm2, mm1
+        movq        mm7, mm0
+        movq        mm4, mm0
+        psubusb     mm0, mm1              ; q1-=p1
+        psubusb     mm1, mm4              ; p1-=q1
+        por         mm1, mm0              ; abs(p1-q1)
+        pand        mm1, [GLOBAL(tfe)]    ; set lsb of each byte to zero
+        psrlw       mm1, 1                ; abs(p1-q1)/2
+
+        movq        mm5, [rsi+rax]        ; p0
+        movq        mm4, [rsi]            ; q0
+        movq        mm0, mm4              ; q0
+        movq        mm6, mm5              ; p0
+        psubusb     mm5, mm4              ; p0-=q0
+        psubusb     mm4, mm6              ; q0-=p0
+        por         mm5, mm4              ; abs(p0 - q0)
+        paddusb     mm5, mm5              ; abs(p0-q0)*2
+        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm5, mm3              ; saturating subtract: 0 where abs(p0 - q0) *2 + abs(p1-q1)/2 <= blimit
+        pxor        mm3, mm3              ; mm3 = 0
+        pcmpeqb     mm5, mm3              ; mm5 = mask: 0xff in bytes that passed the blimit test
+
+        ; start work on filters
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+
+        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
+        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
+        pand        mm5, mm2              ; mask filter values we don't care about
+
+        ; do + 4 side
+        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4
+
+        movq        mm0, mm5              ; get a copy of filters
+        psllw       mm0, 8                ; shift left 8
+        psraw       mm0, 3                ; arithmetic shift right 3 (low bytes now sit in the high half of each word)
+        psrlw       mm0, 8                ; bring the shifted low bytes back down
+        movq        mm1, mm5              ; get a copy of filters
+        psraw       mm1, 11               ; arithmetic shift right 11 (high bytes >> 3, left in the low half)
+        psllw       mm1, 8                ; shift left 8 to put it back
+
+        por         mm0, mm1              ; put the two together to get result
+
+        psubsb      mm3, mm0              ; q0-= q0 add
+        pxor        mm3, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi], mm3            ; write back
+
+
+        ; now do +3 side
+        psubsb      mm5, [GLOBAL(t1s)]     ; +3 instead of +4
+
+        movq        mm0, mm5              ; get a copy of filters
+        psllw       mm0, 8                ; shift left 8
+        psraw       mm0, 3                ; arithmetic shift right 3 (low bytes now sit in the high half of each word)
+        psrlw       mm0, 8                ; bring the shifted low bytes back down
+        psraw       mm5, 11               ; arithmetic shift right 11 (high bytes >> 3, left in the low half)
+        psllw       mm5, 8                ; shift left 8 to put it back
+        por         mm0, mm5              ; put the two together to get result
+
+
+        paddsb      mm6, mm0              ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+rax], mm6        ; write back
+
+        add         rsi,8                 ; next 8 pixels along the edge
+        neg         rax                   ; back to +pitch for the next pass
+        dec         rcx
+        jnz         .nexts8_h
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+; Simple loop filter across a vertical edge: each of 2 passes (rcx) loads 4 bytes from 8 rows, transposes to p1/p0/q0/q1, filters p0/q0, transposes back and stores.
+;void vp8_loop_filter_simple_vertical_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 32      ; reserve 32 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; row pitch in bytes, sign-extended
+
+        lea         rsi, [rsi + rax*4- 2];  ; rsi = 4 rows below src_ptr, 2 bytes to the left
+        mov         rcx, 2                                      ; count: 2 passes of 8 rows
+.nexts8_v:
+
+        lea         rdi,        [rsi + rax];
+        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
+
+        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
+        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60
+
+        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
+        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40
+
+        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
+        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40
+
+        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
+        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42
+
+        neg         rax                                         ; rax = -pitch to reach rows 3..0
+
+        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
+        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20
+
+        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
+        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10
+
+        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
+        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00
+
+        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
+        movq        mm1,        mm0                             ; 31 21 11 01 30 20 10 00
+
+        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
+        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02
+
+        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
+        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0
+
+        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1
+
+
+        ; calculate mask
+        movq        mm6,        mm0                             ; p1
+        movq        mm7,        mm3                             ; q1
+        psubusb     mm7,        mm6                             ; q1-=p1
+        psubusb     mm6,        mm3                             ; p1-=q1
+        por         mm6,        mm7                             ; abs(p1-q1)
+        pand        mm6,        [GLOBAL(tfe)]                   ; set lsb of each byte to zero
+        psrlw       mm6,        1                               ; abs(p1-q1)/2
+
+        movq        mm5,        mm1                             ; p0
+        movq        mm4,        mm2                             ; q0
+
+        psubusb     mm5,        mm2                             ; p0-=q0
+        psubusb     mm4,        mm1                             ; q0-=p0
+
+        por         mm5,        mm4                             ; abs(p0 - q0)
+        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
+        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx,        arg(2) ;blimit                          ; get blimit
+        movq        mm7,        [rdx]
+
+        psubusb     mm5,        mm7                             ; saturating subtract: 0 where abs(p0 - q0) *2 + abs(p1-q1)/2 <= blimit
+        pxor        mm7,        mm7                             ; mm7 = 0
+        pcmpeqb     mm5,        mm7                             ; mm5 = mask
+
+        ; start work on filters
+        movq        t0,         mm0                             ; save unmodified p1 for the write-back transpose
+        movq        t1,         mm3                             ; save unmodified q1 for the write-back transpose
+
+        pxor        mm0,        [GLOBAL(t80)]                   ; p1 offset to convert to signed values
+        pxor        mm3,        [GLOBAL(t80)]                   ; q1 offset to convert to signed values
+
+        psubsb      mm0,        mm3                             ; p1 - q1
+        movq        mm6,        mm1                             ; p0
+
+        movq        mm7,        mm2                             ; q0
+        pxor        mm6,        [GLOBAL(t80)]                   ; offset to convert to signed values
+
+        pxor        mm7,        [GLOBAL(t80)]                   ; offset to convert to signed values
+        movq        mm3,        mm7                             ; offseted ; q0
+
+        psubsb      mm7,        mm6                             ; q0 - p0
+        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)
+
+        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)
+
+        pand        mm5,        mm0                             ; mask filter values we don't care about
+
+        paddsb      mm5,        [GLOBAL(t4)]                    ;  3* (q0 - p0) + (p1 - q1) + 4
+
+        movq        mm0,        mm5                             ; get a copy of filters
+        psllw       mm0,        8                               ; shift left 8
+        psraw       mm0,        3                               ; arithmetic shift right 3 (low bytes now sit in the high half of each word)
+        psrlw       mm0,        8                               ; bring the shifted low bytes back down
+
+        movq        mm7,        mm5                             ; get a copy of filters
+        psraw       mm7,        11                              ; arithmetic shift right 11
+        psllw       mm7,        8                               ; shift left 8 to put it back
+
+        por         mm0,        mm7                             ; put the two together to get result
+
+        psubsb      mm3,        mm0                             ; q0-= q0 add
+        pxor        mm3,        [GLOBAL(t80)]                   ; unoffset
+
+        ; now do +3 side
+        psubsb      mm5, [GLOBAL(t1s)]                          ; +3 instead of +4
+
+        movq        mm0, mm5                                    ; get a copy of filters
+        psllw       mm0, 8                                      ; shift left 8
+        psraw       mm0, 3                                      ; arithmetic shift right 3 (low bytes now sit in the high half of each word)
+        psrlw       mm0, 8                                      ; bring the shifted low bytes back down
+
+        psraw       mm5, 11                                     ; arithmetic shift right 11
+        psllw       mm5, 8                                      ; shift left 8 to put it back
+        por         mm0, mm5                                    ; put the two together to get result
+
+        paddsb      mm6, mm0                                    ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)]                          ; unoffset
+
+
+        movq        mm0,        t0                              ; reload original p1
+        movq        mm4,        t1                              ; reload original q1
+
+        ; mm0 = 70 60 50 40 30 20 10 00
+        ; mm6 = 71 61 51 41 31 21 11 01
+        ; mm3 = 72 62 52 42 32 22 12 02
+        ; mm4 = 73 63 53 43 33 23 13 03
+        ; transpose back to write out
+
+        movq        mm1,        mm0                         ;
+        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00
+
+        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
+        movq        mm2,        mm3                         ;
+
+        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
+        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40
+
+        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
+        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00
+
+        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
+        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20
+
+        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
+        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40
+
+        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
+        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60
+
+        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
+        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20
+
+        psrlq       mm6,        32                          ; 33 32 31 30
+        movd        [rsi],      mm1                         ; write 43 42 41 40
+
+        movd        [rsi + rax], mm6                        ; write 33 32 31 30
+        neg         rax                                     ; back to +pitch for rows 5..7 and the next pass
+
+        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
+        psrlq       mm1,        32                          ; 53 52 51 50
+
+        movd        [rdi],      mm1                         ; write out 53 52 51 50
+        psrlq       mm5,        32                          ; 73 72 71 70
+
+        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70
+
+        lea         rsi,        [rsi+rax*8]                 ; next 8
+
+        dec         rcx
+        jnz         .nexts8_v
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
+;                  int y_stride,
+;                  loop_filter_info *lfi)
+;{
+;
+;
+;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;}
+
+SECTION_RODATA
+align 16
+tfe:                        ; 0xfe per byte: clears each byte's lsb before a word-wise psrlw by 1
+    times 8 db 0xfe
+align 16
+t80:                        ; 0x80 per byte: xor converts between unsigned pixels and signed values
+    times 8 db 0x80
+align 16
+t1s:                        ; 0x01 per byte: subtracted to turn the "+4" filter value into "+3"
+    times 8 db 0x01
+align 16
+t3:
+    times 8 db 0x03
+align 16
+t4:                         ; 0x04 per byte: added to the masked filter value before the >> 3
+    times 8 db 0x04
+align 16
+ones:
+    times 4 dw 0x0001
+align 16
+s27:
+    times 4 dw 0x1b00
+align 16
+s18:
+    times 4 dw 0x1200
+align 16
+s9:
+    times 4 dw 0x0900
+align 16
+s63:
+    times 4 dw 0x003f
diff --git a/libs/libvpx/vp8/decoder/dboolhuff.c b/libs/libvpx/vp8/decoder/dboolhuff.c
new file mode 100644
index 0000000000..8a7e33205b
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/dboolhuff.c
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "dboolhuff.h"
+#include "vp8/common/common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+/* Attach br to source[0..source_sz), reset its state and pre-fill the value window; returns 1 if source is NULL while source_sz is non-zero, otherwise 0. */
+int vp8dx_start_decode(BOOL_DECODER *br,
+                       const unsigned char *source,
+                       unsigned int source_sz,
+                       vpx_decrypt_cb decrypt_cb,
+                       void *decrypt_state)
+{
+    br->user_buffer_end = source+source_sz;
+    br->user_buffer     = source;
+    br->value    = 0;
+    br->count    = -8;  /* count holds (buffered bits - 8), so -8 means nothing is buffered */
+    br->range    = 255;
+    br->decrypt_cb = decrypt_cb;
+    br->decrypt_state = decrypt_state;
+
+    if (source_sz && !source)
+        return 1;  /* br's fields are already initialised here, but no input has been read */
+
+    /* Populate the buffer */
+    vp8dx_bool_decoder_fill(br);
+
+    return 0;
+}
+/* Refill br->value: ORs input bytes in below the bits already buffered, advancing br->user_buffer and count; if all remaining input fits, count also gains VP8_LOTS_OF_BITS. */
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
+{
+    const unsigned char *bufptr = br->user_buffer;
+    VP8_BD_VALUE value = br->value;
+    int count = br->count;
+    int shift = VP8_BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);  /* bit position of the next byte to insert */
+    size_t bytes_left = br->user_buffer_end - bufptr;
+    size_t bits_left = bytes_left * CHAR_BIT;
+    int x = (int)(shift + CHAR_BIT - bits_left);  /* >= 0 when the free space covers all remaining input */
+    int loop_end = 0;
+    unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1];
+
+    if (br->decrypt_cb) {
+        size_t n = VPXMIN(sizeof(decrypted), bytes_left);  /* never ask for more than the input still holds */
+        br->decrypt_cb(br->decrypt_state, bufptr, decrypted, (int)n);
+        bufptr = decrypted;  /* read from the decrypted copy; br->user_buffer still advances below */
+    }
+
+    if(x >= 0)
+    {
+        count += VP8_LOTS_OF_BITS;  /* end-of-input marker tested by vp8dx_bool_error() */
+        loop_end = x;  /* stop after the last available byte instead of at shift 0 */
+    }
+
+    if (x < 0 || bits_left)  /* skips the loop when there is no input left at all */
+    {
+        while(shift >= loop_end)
+        {
+            count += CHAR_BIT;
+            value |= (VP8_BD_VALUE)*bufptr << shift;
+            ++bufptr;
+            ++br->user_buffer;
+            shift -= CHAR_BIT;
+        }
+    }
+
+    br->value = value;
+    br->count = count;
+}
diff --git a/libs/libvpx/vp8/decoder/dboolhuff.h b/libs/libvpx/vp8/decoder/dboolhuff.h
new file mode 100644
index 0000000000..cc9eaaf439
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/dboolhuff.h
@@ -0,0 +1,141 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_DECODER_DBOOLHUFF_H_
+#define VP8_DECODER_DBOOLHUFF_H_
+
+#include <stddef.h>
+#include <limits.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef size_t VP8_BD_VALUE;
+
+#define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT)
+
+/*This is meant to be a large, positive constant that can still be efficiently
+   loaded as an immediate (on platforms like ARM, for example).
+  Even relatively modest values like 100 would work fine.*/
+#define VP8_LOTS_OF_BITS (0x40000000)
+/* State of one VP8 boolean decoder instance; initialised by vp8dx_start_decode(). */
+typedef struct
+{
+    const unsigned char *user_buffer_end;  /* source + source_sz: one past the last input byte */
+    const unsigned char *user_buffer;      /* next unread input byte; advanced by vp8dx_bool_decoder_fill() */
+    VP8_BD_VALUE         value;            /* bit window; its top byte is compared against the split */
+    int                  count;            /* bits held in value minus 8; see vp8dx_bool_error() */
+    unsigned int         range;            /* current range; starts at 255 */
+    vpx_decrypt_cb       decrypt_cb;       /* when non-NULL, input is passed through it before use */
+    void                *decrypt_state;    /* first argument handed to decrypt_cb */
+} BOOL_DECODER;
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
+
+int vp8dx_start_decode(BOOL_DECODER *br,
+                       const unsigned char *source,
+                       unsigned int source_sz,
+                       vpx_decrypt_cb decrypt_cb,
+                       void *decrypt_state);
+
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
+
+/* Decode one boolean with the given probability: returns 1 when value >= split (scaled to the top byte), else 0, then rescales range/value and updates br. */
+static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
+    unsigned int bit = 0;
+    VP8_BD_VALUE value;
+    unsigned int split;
+    VP8_BD_VALUE bigsplit;
+    int count;
+    unsigned int range;
+
+    split = 1 + (((br->range - 1) * probability) >> 8);
+
+    if(br->count < 0)  /* refill the window before reading from it */
+        vp8dx_bool_decoder_fill(br);
+
+    value = br->value;
+    count = br->count;
+
+    bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);  /* split aligned with the top byte of value */
+
+    range = split;
+
+    if (value >= bigsplit)
+    {
+        range = br->range - split;
+        value = value - bigsplit;
+        bit = 1;
+    }
+
+    {
+        register unsigned int shift = vp8_norm[range];  /* shift amount looked up for the new range */
+        range <<= shift;
+        value <<= shift;
+        count -= shift;  /* the shifted-out bits are consumed from the window */
+    }
+    br->value = value;
+    br->count = count;
+    br->range = range;
+
+    return bit;
+}
+/* Read `bits` booleans at probability 0x80, most significant bit first, and return them assembled into an int. */
+static INLINE int vp8_decode_value(BOOL_DECODER *br, int bits)
+{
+    int z = 0;
+    int bit;
+
+    for (bit = bits - 1; bit >= 0; bit--)
+    {
+        z |= (vp8dx_decode_bool(br, 0x80) << bit);  /* each decoded bool lands at position `bit` */
+    }
+
+    return z;
+}
+/* Returns 1 when br->count lies strictly between VP8_BD_VALUE_SIZE and VP8_LOTS_OF_BITS, otherwise 0. */
+static INLINE int vp8dx_bool_error(BOOL_DECODER *br)
+{
+    /* Check if we have reached the end of the buffer.
+     *
+     * Variable 'count' stores the number of bits in the 'value' buffer, minus
+     * 8. The top byte is part of the algorithm, and the remainder is buffered
+     * to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+     * occupied, 8 for the algorithm and 8 in the buffer.
+     *
+     * When reading a byte from the user's buffer, count is filled with 8 and
+     * one byte is filled into the value buffer. When we reach the end of the
+     * data, count is additionally filled with VP8_LOTS_OF_BITS. So when
+     * count == VP8_LOTS_OF_BITS - 1, the user's data has been exhausted.
+     */
+    if ((br->count > VP8_BD_VALUE_SIZE) && (br->count < VP8_LOTS_OF_BITS))
+    {
+       /* We have tried to decode bits after the end of
+        * stream was encountered.
+        */
+        return 1;
+    }
+
+    /* No error. */
+    return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_DECODER_DBOOLHUFF_H_
diff --git a/libs/libvpx/vp8/decoder/decodeframe.c b/libs/libvpx/vp8/decoder/decodeframe.c
new file mode 100644
index 0000000000..4bc87eb134
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/decodeframe.c
@@ -0,0 +1,1399 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "onyxd_int.h"
+#include "vp8/common/header.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/reconinter.h"
+#include "detokenize.h"
+#include "vp8/common/common.h"
+#include "vp8/common/invtrans.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/quant_common.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/setupintrarecon.h"
+
+#include "decodemv.h"
+#include "vp8/common/extend.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/threading.h"
+#include "decoderthreading.h"
+#include "dboolhuff.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include <assert.h>
+#include <stdio.h>
+/* Fill pbi->common's Y1/Y2/UV dequant tables for every Q in [0, QINDEX_RANGE): entry [Q][0] comes from the *dc* helpers, entry [Q][1] from the *ac* helpers. */
+void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
+{
+    int Q;
+    VP8_COMMON *const pc = & pbi->common;
+
+    for (Q = 0; Q < QINDEX_RANGE; Q++)
+    {
+        pc->Y1dequant[Q][0] = (short)vp8_dc_quant(Q, pc->y1dc_delta_q);
+        pc->Y2dequant[Q][0] = (short)vp8_dc2quant(Q, pc->y2dc_delta_q);
+        pc->UVdequant[Q][0] = (short)vp8_dc_uv_quant(Q, pc->uvdc_delta_q);
+
+        pc->Y1dequant[Q][1] = (short)vp8_ac_yquant(Q);  /* the only helper here that takes no delta argument */
+        pc->Y2dequant[Q][1] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
+        pc->UVdequant[Q][1] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
+    }
+}
+/* Pick the quantizer index for xd's current macroblock (segment override or pc->base_qindex) and load xd's 16-entry dequant arrays from pc's tables. */
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+    int i;
+    int QIndex;
+    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+    VP8_COMMON *const pc = & pbi->common;
+
+    /* Decide whether to use the default or alternate baseline Q value. */
+    if (xd->segmentation_enabled)
+    {
+        /* Abs Value */
+        if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+            QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+
+        /* Delta Value */
+        else
+            QIndex = pc->base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+
+        QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    /* Clamp to valid range */
+    }
+    else
+        QIndex = pc->base_qindex;
+
+    /* Set up the macroblock dequant constants */
+    xd->dequant_y1_dc[0] = 1;  /* DC factor 1: decode_macroblock() uses this table to preserve DC terms */
+    xd->dequant_y1[0] = pc->Y1dequant[QIndex][0];
+    xd->dequant_y2[0] = pc->Y2dequant[QIndex][0];
+    xd->dequant_uv[0] = pc->UVdequant[QIndex][0];
+
+    for (i = 1; i < 16; i++)  /* entries 1..15 all take the [QIndex][1] factor */
+    {
+        xd->dequant_y1_dc[i] =
+        xd->dequant_y1[i] = pc->Y1dequant[QIndex][1];
+        xd->dequant_y2[i] = pc->Y2dequant[QIndex][1];
+        xd->dequant_uv[i] = pc->UVdequant[QIndex][1];
+    }
+}
+/* Decode one macroblock into xd->dst: read its tokens (unless skipped or the bool decoder is in error), build the intra/inter prediction, then add the dequantized inverse-transformed residual. */
+static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
+                              unsigned int mb_idx)
+{
+    MB_PREDICTION_MODE mode;
+    int i;
+#if CONFIG_ERROR_CONCEALMENT
+    int corruption_detected = 0;
+#else
+    (void)mb_idx;  /* mb_idx is only read by the error-concealment path */
+#endif
+
+    if (xd->mode_info_context->mbmi.mb_skip_coeff)
+    {
+        vp8_reset_mb_tokens_context(xd);
+    }
+    else if (!vp8dx_bool_error(xd->current_bc))
+    {
+        int eobtotal;
+        eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+        /* Special case:  Force the loopfilter to skip when eobtotal is zero */
+        xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
+    }
+
+    mode = xd->mode_info_context->mbmi.mode;
+
+    if (xd->segmentation_enabled)  /* the quantizer may differ per segment, so reload the dequant arrays */
+        vp8_mb_init_dequantizer(pbi, xd);
+
+
+#if CONFIG_ERROR_CONCEALMENT
+
+    if(pbi->ec_active)
+    {
+        int throw_residual;
+        /* When we have independent partitions we can apply residual even
+         * though other partitions within the frame are corrupt.
+         */
+        throw_residual = (!pbi->independent_partitions &&
+                          pbi->frame_corrupt_residual);
+        throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
+
+        if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
+        {
+            /* MB with corrupt residuals or corrupt mode/motion vectors.
+             * Better to use the predictor as reconstruction.
+             */
+            pbi->frame_corrupt_residual = 1;
+            memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+            vp8_conceal_corrupt_mb(xd);
+
+
+            corruption_detected = 1;
+
+            /* force idct to be skipped for B_PRED and use the
+             * prediction only for reconstruction
+             * */
+            memset(xd->eobs, 0, 25);  /* clears 25 bytes; presumably one byte per block (indices 0..24) - confirm eobs element type */
+        }
+    }
+#endif
+
+    /* do prediction */
+    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    {
+        vp8_build_intra_predictors_mbuv_s(xd,
+                                          xd->recon_above[1],
+                                          xd->recon_above[2],
+                                          xd->recon_left[1],
+                                          xd->recon_left[2],
+                                          xd->recon_left_stride[1],
+                                          xd->dst.u_buffer, xd->dst.v_buffer,
+                                          xd->dst.uv_stride);
+
+        if (mode != B_PRED)
+        {
+            vp8_build_intra_predictors_mby_s(xd,
+                                                 xd->recon_above[0],
+                                                 xd->recon_left[0],
+                                                 xd->recon_left_stride[0],
+                                                 xd->dst.y_buffer,
+                                                 xd->dst.y_stride);
+        }
+        else
+        {
+            short *DQC = xd->dequant_y1;
+            int dst_stride = xd->dst.y_stride;
+
+            /* clear out residual eob info */
+            if(xd->mode_info_context->mbmi.mb_skip_coeff)
+                memset(xd->eobs, 0, 25);
+
+            intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
+
+            for (i = 0; i < 16; i++)  /* B_PRED: predict and reconstruct the 16 luma blocks one at a time */
+            {
+                BLOCKD *b = &xd->block[i];
+                unsigned char *dst = xd->dst.y_buffer + b->offset;
+                B_PREDICTION_MODE b_mode =
+                    xd->mode_info_context->bmi[i].as_mode;
+                unsigned char *Above = dst - dst_stride;
+                unsigned char *yleft = dst - 1;
+                int left_stride = dst_stride;
+                unsigned char top_left = Above[-1];
+
+                vp8_intra4x4_predict(Above, yleft, left_stride, b_mode,
+                                     dst, dst_stride, top_left);
+
+                if (xd->eobs[i])
+                {
+                    if (xd->eobs[i] > 1)  /* more than one coefficient: full dequant + idct */
+                    {
+                    vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride);
+                    }
+                    else  /* eob == 1: DC-only path */
+                    {
+                        vp8_dc_only_idct_add
+                            (b->qcoeff[0] * DQC[0],
+                                dst, dst_stride,
+                                dst, dst_stride);
+                        memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));  /* only the first two coefficients are cleared here */
+                    }
+                }
+            }
+        }
+    }
+    else
+    {
+        vp8_build_inter_predictors_mb(xd);
+    }
+
+
+#if CONFIG_ERROR_CONCEALMENT
+    if (corruption_detected)
+    {
+        return;  /* concealed MB: keep the prediction, add no residual */
+    }
+#endif
+
+    if(!xd->mode_info_context->mbmi.mb_skip_coeff)
+    {
+        /* dequantization and idct */
+        if (mode != B_PRED)  /* B_PRED luma was already reconstructed block by block above */
+        {
+            short *DQC = xd->dequant_y1;
+
+            if (mode != SPLITMV)
+            {
+                BLOCKD *b = &xd->block[24];
+
+                /* do 2nd order transform on the dc block */
+                if (xd->eobs[24] > 1)
+                {
+                    vp8_dequantize_b(b, xd->dequant_y2);
+
+                    vp8_short_inv_walsh4x4(&b->dqcoeff[0],
+                        xd->qcoeff);
+                    memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
+                }
+                else
+                {
+                    b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
+                    vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
+                        xd->qcoeff);
+                    memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+                }
+
+                /* override the dc dequant constant in order to preserve the
+                 * dc components
+                 */
+                DQC = xd->dequant_y1_dc;
+            }
+
+            vp8_dequant_idct_add_y_block
+                            (xd->qcoeff, DQC,
+                             xd->dst.y_buffer,
+                             xd->dst.y_stride, xd->eobs);
+        }
+
+        vp8_dequant_idct_add_uv_block
+                        (xd->qcoeff+16*16, xd->dequant_uv,
+                         xd->dst.u_buffer, xd->dst.v_buffer,
+                         xd->dst.uv_stride, xd->eobs+16);  /* chroma data starts after 16 blocks of 16 luma coefficients / 16 eobs */
+    }
+}
+
+/* Read an optional signed delta-q: presence bit, 4-bit magnitude, sign bit.
+ * Returns 0 when absent; sets *q_update to 1 if the result differs from prev. */
+static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
+{
+    int delta = 0;
+
+    if (vp8_read_bit(bc))
+    {
+        const int magnitude = vp8_read_literal(bc, 4);
+        delta = vp8_read_bit(bc) ? -magnitude : magnitude;
+    }
+
+    /* A changed value is reported so the caller can refresh its quantizers. */
+    if (delta != prev)
+        *q_update = 1;
+
+    return delta;
+}
+
+#ifdef PACKET_TESTING /* compiled in only when PACKET_TESTING is defined */
+#include <stdio.h>
+FILE *vpxlog = 0; /* file handle, initially null; NOTE(review): presumably a debug log, confirm users */
+#endif
+
+/* Replicate one row into the `rows` rows immediately above it.
+ *
+ * row    - first byte to copy; the caller has already stepped left by the
+ *          border width
+ * stride - bytes between consecutive rows, also the bytes copied per row
+ * rows   - how many border rows to write
+ */
+static void extend_plane_top(unsigned char *row, int stride,
+                             unsigned int rows)
+{
+    unsigned char *out = row - (rows * stride);
+    int n = 0;
+
+    while (n < (int)rows)
+    {
+        memcpy(out, row, stride);
+        out += stride;
+        n++;
+    }
+}
+
+/* Extend the top border of every plane in ybf.
+ *
+ * For Y, U and V in turn, the plane's first row (starting `border` bytes
+ * left of the buffer pointer, one stride long) is copied into each border
+ * row above it. Y uses ybf->border and ybf->y_stride; U and V use half of
+ * ybf->border (integer division) and ybf->uv_stride.
+ */
+static void yv12_extend_frame_top_c(YV12_BUFFER_CONFIG *ybf)
+{
+    const unsigned int luma_border = ybf->border;
+    const unsigned int chroma_border = luma_border / 2;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    extend_plane_top(ybf->y_buffer - luma_border, ybf->y_stride,
+                     luma_border);
+
+    /***********/
+    /* U Plane */
+    /***********/
+    extend_plane_top(ybf->u_buffer - chroma_border, ybf->uv_stride,
+                     chroma_border);
+
+    /***********/
+    /* V Plane */
+    /***********/
+    extend_plane_top(ybf->v_buffer - chroma_border, ybf->uv_stride,
+                     chroma_border);
+}
+
+/* Replicate a plane's last row into the `rows` rows immediately below it.
+ *
+ * origin - where copying starts within the plane's first row; the caller
+ *          has already stepped left by the border width
+ * stride - bytes between consecutive rows, also the bytes copied per row
+ * height - number of rows in the plane
+ * rows   - how many border rows to write
+ */
+static void extend_plane_bottom(unsigned char *origin, int stride,
+                                int height, unsigned int rows)
+{
+    /* Step past the whole plane, then back one row to reach the last row. */
+    unsigned char *last_row = origin + (height * stride) - stride;
+
+    /* The border rows start directly beneath it. */
+    unsigned char *out = last_row + stride;
+    int remaining;
+
+    for (remaining = (int)rows; remaining > 0; remaining--)
+    {
+        memcpy(out, last_row, stride);
+        out += stride;
+    }
+}
+
+/* Extend the bottom border of every plane in ybf.
+ *
+ * For Y, U and V in turn, the plane's last row (starting `border` bytes
+ * left of the buffer pointer, one stride long) is copied into each border
+ * row below it. Y uses ybf->border, ybf->y_stride and ybf->y_height; U and
+ * V use half of ybf->border (integer division), ybf->uv_stride and
+ * ybf->uv_height.
+ *
+ * ybf - frame buffer whose planes are extended in place
+ */
+static void yv12_extend_frame_bottom_c(YV12_BUFFER_CONFIG *ybf)
+{
+    const unsigned int luma_border = ybf->border;
+    /* Both chroma planes use half the luma border. */
+    const unsigned int chroma_border = luma_border / 2;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    extend_plane_bottom(ybf->y_buffer - luma_border, ybf->y_stride,
+                        ybf->y_height, luma_border);
+
+    /***********/
+    /* U Plane */
+    /***********/
+    extend_plane_bottom(ybf->u_buffer - chroma_border, ybf->uv_stride,
+                        ybf->uv_height, chroma_border);
+
+    /***********/
+    /* V Plane */
+    /***********/
+    extend_plane_bottom(ybf->v_buffer - chroma_border, ybf->uv_stride,
+                        ybf->uv_height, chroma_border);
+}
+
+/* Extend `rows` consecutive rows of one plane to the left and right.
+ *
+ * For every row, the first pixel is written into the `border` bytes before
+ * it and the last pixel into the `border` bytes after it.
+ *
+ * first_px - first pixel of the first row to extend
+ * width    - pixels per row; a row's last pixel is at offset width - 1
+ * stride   - bytes between consecutive rows
+ * rows     - number of rows to process
+ * border   - bytes to fill on each side of a row
+ */
+static void extend_rows_left_right(unsigned char *first_px, int width,
+                                   int stride, int rows,
+                                   unsigned int border)
+{
+    unsigned char *left = first_px;
+    unsigned char *right = first_px + width - 1;
+    int remaining = rows;
+
+    while (remaining-- > 0)
+    {
+        /* left border: the `border` bytes ending just before `left` */
+        memset(left - border, left[0], border);
+
+        /* right border: the `border` bytes starting just after `right` */
+        memset(right + 1, right[0], border);
+
+        /* next row */
+        left += stride;
+        right += stride;
+    }
+}
+
+/* Extend the left and right borders of one 16-row band of the frame.
+ *
+ * Processes 16 luma rows starting at y_src and 8 chroma rows starting at
+ * each of u_src and v_src. Luma uses ybf->y_width, ybf->y_stride and
+ * ybf->border; chroma uses ybf->uv_width, ybf->uv_stride and half of
+ * ybf->border (integer division).
+ *
+ * Only the border bytes beside those rows are written.
+ *
+ * ybf   - frame buffer supplying the plane widths, strides and border
+ * y_src - first luma pixel of the band
+ * u_src - first U pixel of the band
+ * v_src - first V pixel of the band
+ */
+static void yv12_extend_frame_left_right_c(YV12_BUFFER_CONFIG *ybf,
+                                           unsigned char *y_src,
+                                           unsigned char *u_src,
+                                           unsigned char *v_src)
+{
+    /* Rows handled per call in each plane. */
+    const int luma_rows = 16;
+    const int chroma_rows = 8;
+
+    const unsigned int luma_border = ybf->border;
+    /* Both chroma planes use half the luma border. */
+    const unsigned int chroma_border = luma_border / 2;
+
+    /* U and V share the same width and stride. */
+    const int chroma_width = ybf->uv_width;
+    const int chroma_stride = ybf->uv_stride;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    extend_rows_left_right(y_src, ybf->y_width, ybf->y_stride,
+                           luma_rows, luma_border);
+
+    /***********/
+    /* U Plane */
+    /***********/
+    extend_rows_left_right(u_src, chroma_width, chroma_stride,
+                           chroma_rows, chroma_border);
+
+    /***********/
+    /* V Plane */
+    /***********/
+    extend_rows_left_right(v_src, chroma_width, chroma_stride,
+                           chroma_rows, chroma_border);
+}
+/* Decode every macroblock row of the frame into pbi->dec_fb_ref[INTRA_FRAME]. */
+static void decode_mb_rows(VP8D_COMP *pbi)
+{
+    VP8_COMMON *const pc = & pbi->common;
+    MACROBLOCKD *const xd  = & pbi->mb;
+    /* lf_mic: mode info handed to the loop filter; it advances one row per filtered row. */
+    MODE_INFO *lf_mic = xd->mode_info_context;
+
+    int ibc = 0; /* index into pbi->mbc[] of the bool decoder for the next row */
+    int num_part = 1 << pc->multi_token_partition;
+
+    int recon_yoffset, recon_uvoffset;
+    int mb_row, mb_col;
+    int mb_idx = 0; /* running macroblock count, passed to decode_macroblock() */
+
+    YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+
+    int recon_y_stride = yv12_fb_new->y_stride;
+    int recon_uv_stride = yv12_fb_new->uv_stride;
+    /* dst_buffer: decode target; lf_dst / eb_dst: next row to loop filter / border-extend. */
+    unsigned char *ref_buffer[MAX_REF_FRAMES][3]; /* [ref frame][plane], filled from index 1 up */
+    unsigned char *dst_buffer[3];
+    unsigned char *lf_dst[3];
+    unsigned char *eb_dst[3];
+    int i;
+    int ref_fb_corrupted[MAX_REF_FRAMES];
+
+    ref_fb_corrupted[INTRA_FRAME] = 0;
+
+    for(i = 1; i < MAX_REF_FRAMES; i++)
+    {
+        YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i];
+
+        ref_buffer[i][0] = this_fb->y_buffer;
+        ref_buffer[i][1] = this_fb->u_buffer;
+        ref_buffer[i][2] = this_fb->v_buffer;
+
+        ref_fb_corrupted[i] = this_fb->corrupted;
+    }
+
+    /* Set up the buffer pointers */
+    eb_dst[0] = lf_dst[0] = dst_buffer[0] = yv12_fb_new->y_buffer;
+    eb_dst[1] = lf_dst[1] = dst_buffer[1] = yv12_fb_new->u_buffer;
+    eb_dst[2] = lf_dst[2] = dst_buffer[2] = yv12_fb_new->v_buffer;
+
+    xd->up_available = 0;
+
+    /* Initialize the loop filter for this frame. */
+    if(pc->filter_level)
+        vp8_loop_filter_frame_init(pc, xd, pc->filter_level);
+
+    vp8_setup_intra_recon_top_line(yv12_fb_new);
+
+    /* Decode the individual macro block */
+    for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
+    {
+        if (num_part > 1)
+        {
+            xd->current_bc = & pbi->mbc[ibc]; /* rows take the token partitions in rotation */
+            ibc++;
+
+            if (ibc == num_part)
+                ibc = 0;
+        }
+
+        recon_yoffset = mb_row * recon_y_stride * 16;
+        recon_uvoffset = mb_row * recon_uv_stride * 8;
+
+        /* reset contexts */
+        xd->above_context = pc->above_context;
+        memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+
+        xd->left_available = 0;
+
+        xd->mb_to_top_edge = -((mb_row * 16) << 3);
+        xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+        xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
+        xd->recon_above[1] = dst_buffer[1] + recon_uvoffset;
+        xd->recon_above[2] = dst_buffer[2] + recon_uvoffset;
+
+        xd->recon_left[0] = xd->recon_above[0] - 1;
+        xd->recon_left[1] = xd->recon_above[1] - 1;
+        xd->recon_left[2] = xd->recon_above[2] - 1;
+        /* recon_above then moves up one line, to the line above this macroblock row. */
+        xd->recon_above[0] -= xd->dst.y_stride;
+        xd->recon_above[1] -= xd->dst.uv_stride;
+        xd->recon_above[2] -= xd->dst.uv_stride;
+
+        /* TODO: move to outside row loop */
+        xd->recon_left_stride[0] = xd->dst.y_stride;
+        xd->recon_left_stride[1] = xd->dst.uv_stride;
+
+        setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1],
+                               xd->recon_left[2], xd->dst.y_stride,
+                               xd->dst.uv_stride);
+
+        for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
+        {
+            /* Distance of Mb to the various image edges.
+             * These are specified to 8th pel as they are always compared to values
+             * that are in 1/8th pel units
+             */
+            xd->mb_to_left_edge = -((mb_col * 16) << 3);
+            xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+#if CONFIG_ERROR_CONCEALMENT
+            {
+                int corrupt_residual = (!pbi->independent_partitions &&
+                                       pbi->frame_corrupt_residual) ||
+                                       vp8dx_bool_error(xd->current_bc);
+                if (pbi->ec_active &&
+                    xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME &&
+                    corrupt_residual)
+                {
+                    /* We have an intra block with corrupt coefficients, better to
+                     * conceal with an inter block. Interpolate MVs from neighboring
+                     * MBs.
+                     *
+                     * Note that for the first mb with corrupt residual in a frame,
+                     * we might not discover that before decoding the residual. That
+                     * happens after this check, and therefore no inter concealment
+                     * will be done.
+                     */
+                    vp8_interpolate_motion(xd,
+                                           mb_row, mb_col,
+                                           pc->mb_rows, pc->mb_cols,
+                                           pc->mode_info_stride);
+                }
+            }
+#endif
+
+            xd->dst.y_buffer = dst_buffer[0] + recon_yoffset;
+            xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
+            xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
+
+            if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) {
+              const MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame;
+              xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset;
+              xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset;
+              xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset;
+            } else {
+              // ref_frame is INTRA_FRAME, pre buffer should not be used.
+              xd->pre.y_buffer = 0;
+              xd->pre.u_buffer = 0;
+              xd->pre.v_buffer = 0;
+            }
+
+            /* propagate errors from reference frames */
+            xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
+
+            decode_macroblock(pbi, xd, mb_idx);
+
+            mb_idx++;
+            xd->left_available = 1;
+
+            /* check if the boolean decoder has suffered an error */
+            xd->corrupted |= vp8dx_bool_error(xd->current_bc);
+
+            xd->recon_above[0] += 16;
+            xd->recon_above[1] += 8;
+            xd->recon_above[2] += 8;
+            xd->recon_left[0] += 16;
+            xd->recon_left[1] += 8;
+            xd->recon_left[2] += 8;
+
+            recon_yoffset += 16;
+            recon_uvoffset += 8;
+
+            ++xd->mode_info_context;  /* next mb */
+
+            xd->above_context++;
+        }
+
+        /* adjust to the next row of mbs */
+        vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16,
+                          xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+
+        ++xd->mode_info_context;      /* skip prediction column */
+        xd->up_available = 1;
+        /* Loop filter the previous row, then extend the borders of the row before that one. */
+        if(pc->filter_level)
+        {
+            if(mb_row > 0)
+            {
+                if (pc->filter_type == NORMAL_LOOPFILTER)
+                    vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1,
+                                               recon_y_stride, recon_uv_stride,
+                                               lf_dst[0], lf_dst[1], lf_dst[2]);
+                else
+                    vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1,
+                                               recon_y_stride, recon_uv_stride,
+                                               lf_dst[0], lf_dst[1], lf_dst[2]);
+                if(mb_row > 1)
+                {
+                    yv12_extend_frame_left_right_c(yv12_fb_new,
+                                                   eb_dst[0],
+                                                   eb_dst[1],
+                                                   eb_dst[2]);
+
+                    eb_dst[0] += recon_y_stride  * 16;
+                    eb_dst[1] += recon_uv_stride *  8;
+                    eb_dst[2] += recon_uv_stride *  8;
+                }
+
+                lf_dst[0] += recon_y_stride  * 16;
+                lf_dst[1] += recon_uv_stride *  8;
+                lf_dst[2] += recon_uv_stride *  8;
+                lf_mic += pc->mb_cols;
+                lf_mic++;         /* Skip border mb */
+            }
+        }
+        else
+        {
+            if(mb_row > 0)
+            {
+                /* No loop filter: extend the borders of the previous row right away. */
+                yv12_extend_frame_left_right_c(yv12_fb_new,
+                                               eb_dst[0],
+                                               eb_dst[1],
+                                               eb_dst[2]);
+                eb_dst[0] += recon_y_stride  * 16;
+                eb_dst[1] += recon_uv_stride *  8;
+                eb_dst[2] += recon_uv_stride *  8;
+            }
+        }
+    }
+    /* Finish the rows that loop filtering and border extension were still lagging behind on. */
+    if(pc->filter_level)
+    {
+        if (pc->filter_type == NORMAL_LOOPFILTER)
+            vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1, recon_y_stride,
+                                       recon_uv_stride, lf_dst[0], lf_dst[1],
+                                       lf_dst[2]);
+        else
+            vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1, recon_y_stride,
+                                       recon_uv_stride, lf_dst[0], lf_dst[1],
+                                       lf_dst[2]);
+
+        yv12_extend_frame_left_right_c(yv12_fb_new,
+                                       eb_dst[0],
+                                       eb_dst[1],
+                                       eb_dst[2]);
+        eb_dst[0] += recon_y_stride  * 16;
+        eb_dst[1] += recon_uv_stride *  8;
+        eb_dst[2] += recon_uv_stride *  8;
+    }
+    yv12_extend_frame_left_right_c(yv12_fb_new,
+                                   eb_dst[0],
+                                   eb_dst[1],
+                                   eb_dst[2]);
+    yv12_extend_frame_top_c(yv12_fb_new);
+    yv12_extend_frame_bottom_c(yv12_fb_new);
+
+}
+
+/* Read the 3-byte little-endian partition size stored at cx_size. When a
+ * decrypt callback is installed the bytes are decrypted into a scratch copy. */
+static unsigned int read_partition_size(VP8D_COMP *pbi,
+                                        const unsigned char *cx_size)
+{
+    unsigned char plain[3];
+    if (!pbi->decrypt_cb)
+        return cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);
+    pbi->decrypt_cb(pbi->decrypt_state, cx_size, plain, 3);
+    return plain[0] + (plain[1] << 8) + (plain[2] << 16);
+}
+
+/* True iff len > 0 bytes fit in [start, end); never forms start + len. */
+static int read_is_valid(const unsigned char *start, size_t len,
+                         const unsigned char *end)
+{
+    return len != 0 && end > start && len <= (size_t)(end - start);
+}
+/* Return the size in bytes of token partition i within [fragment_start, fragment_end). */
+static unsigned int read_available_partition_size(
+                                       VP8D_COMP *pbi,
+                                       const unsigned char *token_part_sizes,
+                                       const unsigned char *fragment_start,
+                                       const unsigned char *first_fragment_end,
+                                       const unsigned char *fragment_end,
+                                       int i,
+                                       int num_part)
+{
+    VP8_COMMON* pc = &pbi->common;
+    const unsigned char *partition_size_ptr = token_part_sizes + i * 3; /* 3 bytes per stored size */
+    unsigned int partition_size = 0;
+    ptrdiff_t bytes_left = fragment_end - fragment_start;
+    /* Calculate the length of this partition. The last partition
+     * size is implicit. If the partition size can't be read, then
+     * either use the remaining data in the buffer (for EC mode)
+     * or throw an error.
+     */
+    if (i < num_part - 1)
+    {
+        if (read_is_valid(partition_size_ptr, 3, first_fragment_end))
+            partition_size = read_partition_size(pbi, partition_size_ptr);
+        else if (pbi->ec_active)
+            partition_size = (unsigned int)bytes_left;
+        else
+            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                               "Truncated partition size data"); /* NOTE(review): presumably does not return; confirm */
+    }
+    else
+        partition_size = (unsigned int)bytes_left;
+
+    /* Validate the calculated partition length. If the buffer
+     * described by the partition can't be fully read, then restrict
+     * it to the portion that can be (for EC mode) or throw an error.
+     */
+    if (!read_is_valid(fragment_start, partition_size, fragment_end))
+    {
+        if (pbi->ec_active)
+            partition_size = (unsigned int)bytes_left;
+        else
+            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                               "Truncated packet or corrupt partition "
+                               "%d length", i + 1);
+    }
+    return partition_size;
+}
+
+/* Split the frame's fragments into token partitions and start a bool decoder for each one. */
+static void setup_token_decoder(VP8D_COMP *pbi,
+                                const unsigned char* token_part_sizes)
+{
+    vp8_reader *bool_decoder = &pbi->mbc[0]; /* fragment k is started on pbi->mbc[k - 1] */
+    unsigned int partition_idx;
+    unsigned int fragment_idx;
+    unsigned int num_token_partitions;
+    const unsigned char *first_fragment_end = pbi->fragments.ptrs[0] +
+                                          pbi->fragments.sizes[0];
+    /* The partition count is read from pbi->mbc[8]; a read error keeps the previous value. */
+    TOKEN_PARTITION multi_token_partition =
+            (TOKEN_PARTITION)vp8_read_literal(&pbi->mbc[8], 2);
+    if (!vp8dx_bool_error(&pbi->mbc[8]))
+        pbi->common.multi_token_partition = multi_token_partition;
+    num_token_partitions = 1 << pbi->common.multi_token_partition;
+
+    /* Check for partitions within the fragments and unpack the fragments
+     * so that each fragment pointer points to its corresponding partition. */
+    for (fragment_idx = 0; fragment_idx < pbi->fragments.count; ++fragment_idx)
+    {
+        unsigned int fragment_size = pbi->fragments.sizes[fragment_idx];
+        const unsigned char *fragment_end = pbi->fragments.ptrs[fragment_idx] +
+                                            fragment_size;
+        /* Special case for handling the first partition since we have already
+         * read its size. */
+        if (fragment_idx == 0)
+        {
+            /* Size of first partition + token partition sizes element */
+            ptrdiff_t ext_first_part_size = token_part_sizes -
+                pbi->fragments.ptrs[0] + 3 * (num_token_partitions - 1);
+            fragment_size -= (unsigned int)ext_first_part_size;
+            if (fragment_size > 0)
+            {
+                pbi->fragments.sizes[0] = (unsigned int)ext_first_part_size;
+                /* The fragment contains an additional partition. Move to
+                 * next. */
+                fragment_idx++;
+                pbi->fragments.ptrs[fragment_idx] = pbi->fragments.ptrs[0] +
+                  pbi->fragments.sizes[0];
+            }
+        }
+        /* Split the chunk into partitions read from the bitstream */
+        while (fragment_size > 0)
+        {
+            ptrdiff_t partition_size = read_available_partition_size(
+                                                 pbi,
+                                                 token_part_sizes,
+                                                 pbi->fragments.ptrs[fragment_idx],
+                                                 first_fragment_end,
+                                                 fragment_end,
+                                                 fragment_idx - 1, /* token partition index */
+                                                 num_token_partitions);
+            pbi->fragments.sizes[fragment_idx] = (unsigned int)partition_size;
+            fragment_size -= (unsigned int)partition_size;
+            assert(fragment_idx <= num_token_partitions);
+            if (fragment_size > 0)
+            {
+                /* The fragment contains an additional partition.
+                 * Move to next. */
+                fragment_idx++;
+                pbi->fragments.ptrs[fragment_idx] =
+                    pbi->fragments.ptrs[fragment_idx - 1] + partition_size;
+            }
+        }
+    }
+    /* Entry 0 plus one entry per token partition. */
+    pbi->fragments.count = num_token_partitions + 1;
+
+    for (partition_idx = 1; partition_idx < pbi->fragments.count; ++partition_idx)
+    {
+        if (vp8dx_start_decode(bool_decoder,
+                               pbi->fragments.ptrs[partition_idx],
+                               pbi->fragments.sizes[partition_idx],
+                               pbi->decrypt_cb, pbi->decrypt_state))
+            vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate bool decoder %d",
+                               partition_idx);
+
+        bool_decoder++;
+    }
+
+#if CONFIG_MULTITHREAD
+    /* Clamp number of decoder threads */
+    if (pbi->decoding_thread_count > num_token_partitions - 1)
+        pbi->decoding_thread_count = num_token_partitions - 1;
+#endif
+}
+
+
+/* Prepare decoder state for a new frame: key frames restore the default
+ * probabilities and clear segment/loop-filter deltas, other frames select
+ * the sub-pixel predictors and may activate error concealment, and every
+ * frame resets the per-frame fields of pbi->mb. */
+static void init_frame(VP8D_COMP *pbi)
+{
+    VP8_COMMON *const cm = &pbi->common;
+    MACROBLOCKD *const mbd = &pbi->mb;
+
+    if (cm->frame_type != KEY_FRAME)
+    {
+        /* Choose between the bilinear and six-tap interpolation filters. */
+        if (cm->use_bilinear_mc_filter)
+        {
+            mbd->subpixel_predict        = vp8_bilinear_predict4x4;
+            mbd->subpixel_predict8x4     = vp8_bilinear_predict8x4;
+            mbd->subpixel_predict8x8     = vp8_bilinear_predict8x8;
+            mbd->subpixel_predict16x16   = vp8_bilinear_predict16x16;
+        }
+        else
+        {
+            mbd->subpixel_predict        = vp8_sixtap_predict4x4;
+            mbd->subpixel_predict8x4     = vp8_sixtap_predict8x4;
+            mbd->subpixel_predict8x8     = vp8_sixtap_predict8x8;
+            mbd->subpixel_predict16x16   = vp8_sixtap_predict16x16;
+        }
+
+        if (pbi->decoded_key_frame && pbi->ec_enabled && !pbi->ec_active)
+            pbi->ec_active = 1;
+    }
+    else
+    {
+        /* Key frame: start again from the default probabilities. */
+        memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+        vp8_init_mbmode_probs(cm);
+        vp8_default_coef_probs(cm);
+
+        /* Segment feature data goes back to 0 with delta coding (default state). */
+        memset(mbd->segment_feature_data, 0, sizeof(mbd->segment_feature_data));
+        mbd->mb_segement_abs_delta = SEGMENT_DELTADATA;
+
+        /* Clear the reference-frame and mode deltas of the loop filter. */
+        memset(mbd->ref_lf_deltas, 0, sizeof(mbd->ref_lf_deltas));
+        memset(mbd->mode_lf_deltas, 0, sizeof(mbd->mode_lf_deltas));
+
+        /* All buffers are implicitly updated on key frames. */
+        cm->refresh_golden_frame = 1;
+        cm->refresh_alt_ref_frame = 1;
+        cm->copy_buffer_to_gf = 0;
+        cm->copy_buffer_to_arf = 0;
+
+        /* Golden and altref modes cannot be used on a key frame, so the
+         * sign biases are meaningless here; give them a defined value.
+         */
+        cm->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
+        cm->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+    }
+
+    mbd->left_context = &cm->left_context;
+    mbd->mode_info_context = cm->mi;
+    mbd->frame_type = cm->frame_type;
+    mbd->mode_info_context->mbmi.mode = DC_PRED;
+    mbd->mode_info_stride = cm->mode_info_stride;
+    mbd->corrupted = 0; /* init without corruption */
+
+    /* The mask has its low three bits cleared when full_pixel is set. */
+    mbd->fullpixel_mask = cm->full_pixel ? 0xfffffff8 : 0xffffffff;
+}
+
+int vp8_decode_frame(VP8D_COMP *pbi)
+{
+    vp8_reader *const bc = &pbi->mbc[8];
+    VP8_COMMON *const pc = &pbi->common;
+    MACROBLOCKD *const xd  = &pbi->mb;
+    const unsigned char *data = pbi->fragments.ptrs[0];
+    const unsigned char *data_end =  data + pbi->fragments.sizes[0];
+    ptrdiff_t first_partition_length_in_bytes;
+
+    int i, j, k, l;
+    const int *const mb_feature_data_bits = vp8_mb_feature_data_bits;
+    int corrupt_tokens = 0;
+    int prev_independent_partitions = pbi->independent_partitions;
+
+    YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+
+    /* start with no corruption of current frame */
+    xd->corrupted = 0;
+    yv12_fb_new->corrupted = 0;
+
+    if (data_end - data < 3)
+    {
+        if (!pbi->ec_active)
+        {
+            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                               "Truncated packet");
+        }
+
+        /* Declare the missing frame as an inter frame since it will
+           be handled as an inter frame when we have estimated its
+           motion vectors. */
+        pc->frame_type = INTER_FRAME;
+        pc->version = 0;
+        pc->show_frame = 1;
+        first_partition_length_in_bytes = 0;
+    }
+    else
+    {
+        unsigned char clear_buffer[10];
+        const unsigned char *clear = data;
+        if (pbi->decrypt_cb)
+        {
+            int n = (int)VPXMIN(sizeof(clear_buffer), data_end - data);
+            pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n);
+            clear = clear_buffer;
+        }
+
+        pc->frame_type = (FRAME_TYPE)(clear[0] & 1);
+        pc->version = (clear[0] >> 1) & 7;
+        pc->show_frame = (clear[0] >> 4) & 1;
+        first_partition_length_in_bytes =
+            (clear[0] | (clear[1] << 8) | (clear[2] << 16)) >> 5;
+
+        if (!pbi->ec_active &&
+            (data + first_partition_length_in_bytes > data_end
+            || data + first_partition_length_in_bytes < data))
+            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                               "Truncated packet or corrupt partition 0 length");
+
+        data += 3;
+        clear += 3;
+
+        vp8_setup_version(pc);
+
+
+        if (pc->frame_type == KEY_FRAME)
+        {
+            /* vet via sync code */
+            /* When error concealment is enabled we should only check the sync
+             * code if we have enough bits available
+             */
+            if (!pbi->ec_active || data + 3 < data_end)
+            {
+                if (clear[0] != 0x9d || clear[1] != 0x01 || clear[2] != 0x2a)
+                    vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
+                                   "Invalid frame sync code");
+            }
+
+            /* If error concealment is enabled we should only parse the new size
+             * if we have enough data. Otherwise we will end up with the wrong
+             * size.
+             */
+            if (!pbi->ec_active || data + 6 < data_end)
+            {
+                pc->Width = (clear[3] | (clear[4] << 8)) & 0x3fff;
+                pc->horiz_scale = clear[4] >> 6;
+                pc->Height = (clear[5] | (clear[6] << 8)) & 0x3fff;
+                pc->vert_scale = clear[6] >> 6;
+            }
+            data += 7;
+        }
+        else
+        {
+          memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+          memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+        }
+    }
+    if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME))
+    {
+        return -1;
+    }
+
+    init_frame(pbi);
+
+    if (vp8dx_start_decode(bc, data, (unsigned int)(data_end - data),
+                           pbi->decrypt_cb, pbi->decrypt_state))
+        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate bool decoder 0");
+    if (pc->frame_type == KEY_FRAME) {
+        (void)vp8_read_bit(bc);  // colorspace
+        pc->clamp_type  = (CLAMP_TYPE)vp8_read_bit(bc);
+    }
+
+    /* Is segmentation enabled */
+    xd->segmentation_enabled = (unsigned char)vp8_read_bit(bc);
+
+    if (xd->segmentation_enabled)
+    {
+        /* Signal whether or not the segmentation map is being explicitly updated this frame. */
+        xd->update_mb_segmentation_map = (unsigned char)vp8_read_bit(bc);
+        xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc);
+
+        if (xd->update_mb_segmentation_data)
+        {
+            xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc);
+
+            memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+
+            /* For each segmentation feature (Quant and loop filter level) */
+            for (i = 0; i < MB_LVL_MAX; i++)
+            {
+                for (j = 0; j < MAX_MB_SEGMENTS; j++)
+                {
+                    /* Frame level data */
+                    if (vp8_read_bit(bc))
+                    {
+                        xd->segment_feature_data[i][j] = (signed char)vp8_read_literal(bc, mb_feature_data_bits[i]);
+
+                        if (vp8_read_bit(bc))
+                            xd->segment_feature_data[i][j] = -xd->segment_feature_data[i][j];
+                    }
+                    else
+                        xd->segment_feature_data[i][j] = 0;
+                }
+            }
+        }
+
+        if (xd->update_mb_segmentation_map)
+        {
+            /* Which macro block level features are enabled */
+            memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+
+            /* Read the probs used to decode the segment id for each macro block. */
+            for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+            {
+                /* If not explicitly set value is defaulted to 255 by memset above */
+                if (vp8_read_bit(bc))
+                    xd->mb_segment_tree_probs[i] = (vp8_prob)vp8_read_literal(bc, 8);
+            }
+        }
+    }
+    else
+    {
+        /* No segmentation updates on this frame */
+        xd->update_mb_segmentation_map = 0;
+        xd->update_mb_segmentation_data = 0;
+    }
+
+    /* Read the loop filter level and type */
+    pc->filter_type = (LOOPFILTERTYPE) vp8_read_bit(bc);
+    pc->filter_level = vp8_read_literal(bc, 6);
+    pc->sharpness_level = vp8_read_literal(bc, 3);
+
+    /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
+    xd->mode_ref_lf_delta_update = 0;
+    xd->mode_ref_lf_delta_enabled = (unsigned char)vp8_read_bit(bc);
+
+    if (xd->mode_ref_lf_delta_enabled)
+    {
+        /* Do the deltas need to be updated */
+        xd->mode_ref_lf_delta_update = (unsigned char)vp8_read_bit(bc);
+
+        if (xd->mode_ref_lf_delta_update)
+        {
+            /* Send update */
+            for (i = 0; i < MAX_REF_LF_DELTAS; i++)
+            {
+                if (vp8_read_bit(bc))
+                {
+                    /*sign = vp8_read_bit( bc );*/
+                    xd->ref_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
+
+                    if (vp8_read_bit(bc))        /* Apply sign */
+                        xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
+                }
+            }
+
+            /* Send update */
+            for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+            {
+                if (vp8_read_bit(bc))
+                {
+                    /*sign = vp8_read_bit( bc );*/
+                    xd->mode_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
+
+                    if (vp8_read_bit(bc))        /* Apply sign */
+                        xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
+                }
+            }
+        }
+    }
+
+    setup_token_decoder(pbi, data + first_partition_length_in_bytes);
+
+    xd->current_bc = &pbi->mbc[0];
+
+    /* Read the default quantizers. */
+    {
+        int Q, q_update;
+
+        Q = vp8_read_literal(bc, 7);  /* AC 1st order Q = default */
+        pc->base_qindex = Q;
+        q_update = 0;
+        pc->y1dc_delta_q = get_delta_q(bc, pc->y1dc_delta_q, &q_update);
+        pc->y2dc_delta_q = get_delta_q(bc, pc->y2dc_delta_q, &q_update);
+        pc->y2ac_delta_q = get_delta_q(bc, pc->y2ac_delta_q, &q_update);
+        pc->uvdc_delta_q = get_delta_q(bc, pc->uvdc_delta_q, &q_update);
+        pc->uvac_delta_q = get_delta_q(bc, pc->uvac_delta_q, &q_update);
+
+        if (q_update)
+            vp8cx_init_de_quantizer(pbi);
+
+        /* MB level dequantizer setup */
+        vp8_mb_init_dequantizer(pbi, &pbi->mb);
+    }
+
+    /* Determine if the golden frame or ARF buffer should be updated and how.
+     * For all non key frames the GF and ARF refresh flags and sign bias
+     * flags must be set explicitly.
+     */
+    if (pc->frame_type != KEY_FRAME)
+    {
+        /* Should the GF or ARF be updated from the current frame */
+        pc->refresh_golden_frame = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+        /* Assume we shouldn't refresh golden if the bit is missing */
+        xd->corrupted |= vp8dx_bool_error(bc);
+        if (pbi->ec_active && xd->corrupted)
+            pc->refresh_golden_frame = 0;
+#endif
+
+        pc->refresh_alt_ref_frame = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+        /* Assume we shouldn't refresh altref if the bit is missing */
+        xd->corrupted |= vp8dx_bool_error(bc);
+        if (pbi->ec_active && xd->corrupted)
+            pc->refresh_alt_ref_frame = 0;
+#endif
+
+        /* Buffer to buffer copy flags. */
+        pc->copy_buffer_to_gf = 0;
+
+        if (!pc->refresh_golden_frame)
+            pc->copy_buffer_to_gf = vp8_read_literal(bc, 2);
+
+#if CONFIG_ERROR_CONCEALMENT
+        /* Assume we shouldn't copy to the golden if the bit is missing */
+        xd->corrupted |= vp8dx_bool_error(bc);
+        if (pbi->ec_active && xd->corrupted)
+            pc->copy_buffer_to_gf = 0;
+#endif
+
+        pc->copy_buffer_to_arf = 0;
+
+        if (!pc->refresh_alt_ref_frame)
+            pc->copy_buffer_to_arf = vp8_read_literal(bc, 2);
+
+#if CONFIG_ERROR_CONCEALMENT
+        /* Assume we shouldn't copy to the alt-ref if the bit is missing */
+        xd->corrupted |= vp8dx_bool_error(bc);
+        if (pbi->ec_active && xd->corrupted)
+            pc->copy_buffer_to_arf = 0;
+#endif
+
+
+        pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp8_read_bit(bc);
+        pc->ref_frame_sign_bias[ALTREF_FRAME] = vp8_read_bit(bc);
+    }
+
+    pc->refresh_entropy_probs = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+    /* Assume we shouldn't refresh the probabilities if the bit is
+     * missing */
+    xd->corrupted |= vp8dx_bool_error(bc);
+    if (pbi->ec_active && xd->corrupted)
+        pc->refresh_entropy_probs = 0;
+#endif
+    if (pc->refresh_entropy_probs == 0)
+    {
+        memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+    }
+
+    pc->refresh_last_frame = pc->frame_type == KEY_FRAME  ||  vp8_read_bit(bc);
+
+#if CONFIG_ERROR_CONCEALMENT
+    /* Assume we should refresh the last frame if the bit is missing */
+    xd->corrupted |= vp8dx_bool_error(bc);
+    if (pbi->ec_active && xd->corrupted)
+        pc->refresh_last_frame = 1;
+#endif
+
+    if (0)
+    {
+        FILE *z = fopen("decodestats.stt", "a");
+        fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
+                pc->current_video_frame,
+                pc->frame_type,
+                pc->refresh_golden_frame,
+                pc->refresh_alt_ref_frame,
+                pc->refresh_last_frame,
+                pc->base_qindex);
+        fclose(z);
+    }
+
+    {
+        pbi->independent_partitions = 1;
+
+        /* read coef probability tree */
+        for (i = 0; i < BLOCK_TYPES; i++)
+            for (j = 0; j < COEF_BANDS; j++)
+                for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+                    for (l = 0; l < ENTROPY_NODES; l++)
+                    {
+
+                        vp8_prob *const p = pc->fc.coef_probs [i][j][k] + l;
+
+                        if (vp8_read(bc, vp8_coef_update_probs [i][j][k][l]))
+                        {
+                            *p = (vp8_prob)vp8_read_literal(bc, 8);
+
+                        }
+                        if (k > 0 && *p != pc->fc.coef_probs[i][j][k-1][l])
+                            pbi->independent_partitions = 0;
+
+                    }
+    }
+
+    /* clear out the coeff buffer */
+    memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
+    vp8_decode_mode_mvs(pbi);
+
+#if CONFIG_ERROR_CONCEALMENT
+    if (pbi->ec_active &&
+            pbi->mvs_corrupt_from_mb < (unsigned int)pc->mb_cols * pc->mb_rows)
+    {
+        /* Motion vectors are missing in this frame. We will try to estimate
+         * them and then continue decoding the frame as usual */
+        vp8_estimate_missing_mvs(pbi);
+    }
+#endif
+
+    memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+    pbi->frame_corrupt_residual = 0;
+
+#if CONFIG_MULTITHREAD
+    if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
+    {
+        unsigned int thread;
+        vp8mt_decode_mb_rows(pbi, xd);
+        vp8_yv12_extend_frame_borders(yv12_fb_new);
+        for (thread = 0; thread < pbi->decoding_thread_count; ++thread)
+            corrupt_tokens |= pbi->mb_row_di[thread].mbd.corrupted;
+    }
+    else
+#endif
+    {
+        decode_mb_rows(pbi);
+        corrupt_tokens |= xd->corrupted;
+    }
+
+    /* Collect information about decoder corruption. */
+    /* 1. Check first boolean decoder for errors. */
+    yv12_fb_new->corrupted = vp8dx_bool_error(bc);
+    /* 2. Check the macroblock information */
+    yv12_fb_new->corrupted |= corrupt_tokens;
+
+    if (!pbi->decoded_key_frame)
+    {
+        if (pc->frame_type == KEY_FRAME &&
+            !yv12_fb_new->corrupted)
+            pbi->decoded_key_frame = 1;
+        else
+            vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+                               "A stream must start with a complete key frame");
+    }
+
+    /* vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes  \n",bc->pos+pbi->bc2.pos); */
+
+    if (pc->refresh_entropy_probs == 0)
+    {
+        memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
+        pbi->independent_partitions = prev_independent_partitions;
+    }
+
+#ifdef PACKET_TESTING
+    {
+        FILE *f = fopen("decompressor.VP8", "ab");
+        unsigned int size = pbi->bc2.pos + pbi->bc.pos + 8;
+        fwrite((void *) &size, 4, 1, f);
+        fwrite((void *) pbi->Source, size, 1, f);
+        fclose(f);
+    }
+#endif
+
+    return 0;
+}
diff --git a/libs/libvpx/vp8/decoder/decodemv.c b/libs/libvpx/vp8/decoder/decodemv.c
new file mode 100644
index 0000000000..1d155e7e16
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/decodemv.c
@@ -0,0 +1,670 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treereader.h"
+#include "vp8/common/entropymv.h"
+#include "vp8/common/entropymode.h"
+#include "onyxd_int.h"
+#include "vp8/common/findnearmv.h"
+
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+static B_PREDICTION_MODE read_bmode(vp8_reader *bc, const vp8_prob *p)
+{
+    /* Read one symbol from bc through vp8_bmode_tree using the node
+     * probabilities in p, and return it as a B_PREDICTION_MODE. */
+    return (B_PREDICTION_MODE)vp8_treed_read(bc, vp8_bmode_tree, p);
+}
+
+static MB_PREDICTION_MODE read_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+    /* Read one symbol from bc through vp8_ymode_tree using the node
+     * probabilities in p, and return it as an MB_PREDICTION_MODE. */
+    return (MB_PREDICTION_MODE)vp8_treed_read(bc, vp8_ymode_tree, p);
+}
+
+static MB_PREDICTION_MODE read_kf_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+    /* Same as read_ymode() but walks vp8_kf_ymode_tree; read_kf_modes()
+     * is the caller in this file. */
+    return (MB_PREDICTION_MODE)vp8_treed_read(bc, vp8_kf_ymode_tree, p);
+}
+
+static MB_PREDICTION_MODE read_uv_mode(vp8_reader *bc, const vp8_prob *p)
+{
+    /* Read one symbol from bc through vp8_uv_mode_tree using the node
+     * probabilities in p, and return it as an MB_PREDICTION_MODE. */
+    return (MB_PREDICTION_MODE)vp8_treed_read(bc, vp8_uv_mode_tree, p);
+}
+
+static void read_kf_modes(VP8D_COMP *pbi, MODE_INFO *mi)
+{
+    /* All mode symbols are read from the pbi->mbc[8] bool decoder. */
+    vp8_reader *const reader = &pbi->mbc[8];
+    const int stride = pbi->common.mode_info_stride;
+    int blk;
+
+    /* Macroblocks handled here are always marked as intra coded. */
+    mi->mbmi.ref_frame = INTRA_FRAME;
+    mi->mbmi.mode = read_kf_ymode(reader, vp8_kf_ymode_prob);
+
+    if (mi->mbmi.mode == B_PRED)
+    {
+        mi->mbmi.is_4x4 = 1;
+
+        /* B_PRED carries 16 sub-block modes; the probability row for each
+         * one is selected by the modes of the blocks above and to the left. */
+        for (blk = 0; blk < 16; blk++)
+        {
+            const B_PREDICTION_MODE above = above_block_mode(mi, blk, stride);
+            const B_PREDICTION_MODE left = left_block_mode(mi, blk);
+            mi->bmi[blk].as_mode = read_bmode(reader, vp8_kf_bmode_prob[above][left]);
+        }
+    }
+    mi->mbmi.uv_mode = read_uv_mode(reader, vp8_kf_uv_mode_prob);
+}
+
+static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc)
+{
+    const vp8_prob *const p = (const vp8_prob *) mvc;  /* the context is indexed as a flat probability array */
+    int x = 0;  /* magnitude while decoding; the sign is applied last */
+    /* Decode one signed motion vector component from r; returns the signed value (callers in this file double it). */
+    if (vp8_read(r, p [mvpis_short]))  /* Large: magnitude is coded one bit at a time */
+    {
+        int i = 0;
+
+        do  /* bits 0, 1 and 2, least significant first */
+        {
+            x += vp8_read(r, p [MVPbits + i]) << i;
+        }
+        while (++i < 3);
+
+        i = mvlong_width - 1;  /* Skip bit 3, which is sometimes implicit */
+
+        do  /* bits mvlong_width - 1 down to 4, most significant first */
+        {
+            x += vp8_read(r, p [MVPbits + i]) << i;
+        }
+        while (--i > 3);
+        /* Bit 3: added without reading when no bit above it is set, otherwise read from the stream. */
+        if (!(x & 0xFFF0)  ||  vp8_read(r, p [MVPbits + 3]))
+            x += 8;
+    }
+    else   /* small: the whole magnitude comes from one tree read */
+        x = vp8_treed_read(r, vp8_small_mvtree, p + MVPshort);
+
+    if (x  &&  vp8_read(r, p [MVPsign]))  /* no sign bit is read for a zero magnitude */
+        x = -x;
+
+    return x;
+}
+
+static void read_mv(vp8_reader *r, MV *mv, const MV_CONTEXT *mvc)
+{
+    mv->row = (short)(2 * read_mvcomponent(r, &mvc[0]));  /* row is read first, with context 0 */
+    mv->col = (short)(2 * read_mvcomponent(r, &mvc[1]));  /* column follows, with context 1 */
+}
+
+
+static void read_mvcontexts(vp8_reader *bc, MV_CONTEXT *mvc)
+{
+    int i = 0;  /* context index: mvc[0], then mvc[1] */
+    /* Selectively overwrite the probabilities of the two MV contexts in mvc with values read from bc. */
+    do
+    {
+        const vp8_prob *up = vp8_mv_update_probs[i].prob;  /* probability that each entry is updated */
+        vp8_prob *p = (vp8_prob *)(mvc + i);  /* context i viewed as MVPcount probabilities */
+        vp8_prob *const pstop = p + MVPcount;
+
+        do
+        {
+            if (vp8_read(bc, *up++))  /* update flag for this entry */
+            {
+                const vp8_prob x = (vp8_prob)vp8_read_literal(bc, 7);
+                /* The 7-bit value is doubled; a zero value is stored as 1. */
+                *p = x ? x << 1 : 1;
+            }
+        }
+        while (++p < pstop);
+    }
+    while (++i < 2);
+}
+
+static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};  /* bmi entries written per partition, indexed by split configuration s in decode_split_mv() */
+static const unsigned char mbsplit_fill_offset[4][16] = {  /* row s: bmi indices to fill; partition j starts at column j * mbsplit_fill_count[s] */
+    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},  /* s = 0: two runs of 8 (indices 0-7, then 8-15) */
+    { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15},  /* s = 1: two runs of 8 */
+    { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15},  /* s = 2: four runs of 4 */
+    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}   /* s = 3: sixteen single entries */
+};
+
+
+static void mb_mode_mv_init(VP8D_COMP *pbi)
+{
+    vp8_reader *const reader = &pbi->mbc[8];
+    int idx;
+
+#if CONFIG_ERROR_CONCEALMENT
+    /* Start from "no macroblock is corrupt": the field is later compared
+     * against macroblock numbers, and UINT_MAX is larger than any of
+     * them. */
+    pbi->mvs_corrupt_from_mb = UINT_MAX;
+#endif
+
+    /* The skip flag, then the skip probability, which is only read when
+     * the flag is set and is zero otherwise. */
+    pbi->common.mb_no_coeff_skip = (int)vp8_read_bit(reader);
+    pbi->prob_skip_false = pbi->common.mb_no_coeff_skip
+                           ? (vp8_prob)vp8_read_literal(reader, 8) : 0;
+
+    /* The remaining fields are not read for key frames, so there is
+     * nothing more to do for them. */
+    if (pbi->common.frame_type == KEY_FRAME)
+        return;
+
+    /* Probabilities used to pick each macroblock's reference frame. */
+    pbi->prob_intra = (vp8_prob)vp8_read_literal(reader, 8);
+    pbi->prob_last = (vp8_prob)vp8_read_literal(reader, 8);
+    pbi->prob_gf = (vp8_prob)vp8_read_literal(reader, 8);
+
+    /* A set bit means four new ymode probabilities follow. */
+    if (vp8_read_bit(reader))
+    {
+        for (idx = 0; idx < 4; idx++)
+        {
+            pbi->common.fc.ymode_prob[idx] =
+                (vp8_prob)vp8_read_literal(reader, 8);
+        }
+    }
+
+    /* A set bit means three new uv_mode probabilities follow. */
+    if (vp8_read_bit(reader))
+    {
+        for (idx = 0; idx < 3; idx++)
+        {
+            pbi->common.fc.uv_mode_prob[idx] =
+                (vp8_prob)vp8_read_literal(reader, 8);
+        }
+    }
+
+    /* Last come the updates to the two motion vector contexts. */
+    read_mvcontexts(reader, pbi->common.fc.mvc);
+}
+
+const vp8_prob vp8_sub_mv_ref_prob3 [8][VP8_SUBMVREFS-1] =  /* row chosen by get_sub_mv_ref_prob(): ((above == 0) << 2) | ((left == 0) << 1) | (left == above) */
+{   /* rows 3, 5 and 6 need flag combinations that cannot hold together, so get_sub_mv_ref_prob() never selects them */
+    { 147, 136, 18 },   /* 0: SUBMVREF_NORMAL          */
+    { 223, 1  , 34 },   /* 1: SUBMVREF_LEFT_ABOVE_SAME */
+    { 106, 145, 1  },   /* 2: SUBMVREF_LEFT_ZED        */
+    { 208, 1  , 1  },   /* 3: SUBMVREF_LEFT_ABOVE_ZED  */
+    { 179, 121, 1  },   /* 4: SUBMVREF_ABOVE_ZED       */
+    { 223, 1  , 34 },   /* 5: SUBMVREF_LEFT_ABOVE_SAME */
+    { 179, 121, 1  },   /* 6: SUBMVREF_ABOVE_ZED       */
+    { 208, 1  , 1  }    /* 7: SUBMVREF_LEFT_ABOVE_ZED  */
+};
+
+static
+const vp8_prob * get_sub_mv_ref_prob(const int left, const int above)
+{
+    /* Build a 3-bit row index into vp8_sub_mv_ref_prob3:
+     *   bit 2 - above is zero
+     *   bit 1 - left is zero
+     *   bit 0 - left and above are equal
+     */
+    int row = (left == above) ? 1 : 0;
+
+    if (left == 0)  row += 2;
+    if (above == 0) row += 4;
+    return vp8_sub_mv_ref_prob3[row];
+}
+
+static void decode_split_mv(vp8_reader *const bc, MODE_INFO *mi,
+                        const MODE_INFO *left_mb, const MODE_INFO *above_mb,
+                        MB_MODE_INFO *mbmi, int_mv best_mv,
+                        MV_CONTEXT *const mvc, int mb_to_left_edge,
+                        int mb_to_right_edge, int mb_to_top_edge,
+                        int mb_to_bottom_edge)
+{
+    int s;      /* split configuration (16x8, 8x16, 8x8, 4x4) */
+    int num_p;  /* number of partitions in the split configuration
+                  (see vp8_mbsplit_count) */
+    int j = 0;  /* partition currently being decoded */
+    /* Read the split configuration with fixed probabilities 110, 111, 150; s = 3 with 16 partitions is the starting value. */
+    s = 3;
+    num_p = 16;
+    if( vp8_read(bc, 110) )
+    {
+        s = 2;
+        num_p = 4;
+        if( vp8_read(bc, 111) )
+        {
+            s = vp8_read(bc, 150);  /* s becomes 0 or 1; both have two partitions */
+            num_p = 2;
+        }
+    }
+
+    do  /* for each subset j */
+    {
+        int_mv leftmv, abovemv;
+        int_mv blockmv;
+        int k;  /* first block in subset j */
+
+        const vp8_prob *prob;
+        k = vp8_mbsplit_offset[s][j];
+
+        if (!(k & 3))
+        {
+            /* On L edge, get from MB to left of us */
+            if(left_mb->mbmi.mode != SPLITMV)
+                leftmv.as_int =  left_mb->mbmi.mv.as_int;
+            else
+                leftmv.as_int =  (left_mb->bmi + k + 4 - 1)->mv.as_int;  /* entry k + 3 of the left MB */
+        }
+        else
+            leftmv.as_int =  (mi->bmi + k - 1)->mv.as_int;  /* entry k - 1 of this MB, filled by an earlier subset */
+
+        if (!(k >> 2))
+        {
+            /* On top edge, get from MB above us */
+            if(above_mb->mbmi.mode != SPLITMV)
+                abovemv.as_int =  above_mb->mbmi.mv.as_int;
+            else
+                abovemv.as_int =  (above_mb->bmi + k + 16 - 4)->mv.as_int;  /* entry k + 12 of the above MB */
+        }
+        else
+            abovemv.as_int = (mi->bmi + k - 4)->mv.as_int;  /* entry k - 4 of this MB */
+
+        prob = get_sub_mv_ref_prob(leftmv.as_int, abovemv.as_int);
+        /* Up to three reads pick the vector: left, above, zero, or best_mv plus a decoded delta. */
+        if( vp8_read(bc, prob[0]) )
+        {
+            if( vp8_read(bc, prob[1]) )
+            {
+                blockmv.as_int = 0;  /* stays zero unless the next read is set */
+                if( vp8_read(bc, prob[2]) )
+                {
+                    blockmv.as_mv.row = read_mvcomponent(bc, &mvc[0]) * 2;
+                    blockmv.as_mv.row += best_mv.as_mv.row;
+                    blockmv.as_mv.col = read_mvcomponent(bc, &mvc[1]) * 2;
+                    blockmv.as_mv.col += best_mv.as_mv.col;
+                }
+            }
+            else
+            {
+                blockmv.as_int = abovemv.as_int;
+            }
+        }
+        else
+        {
+            blockmv.as_int = leftmv.as_int;
+        }
+
+        mbmi->need_to_clamp_mvs |= vp8_check_mv_bounds(&blockmv,  /* accumulated over all partitions */
+                                                  mb_to_left_edge,
+                                                  mb_to_right_edge,
+                                                  mb_to_top_edge,
+                                                  mb_to_bottom_edge);
+
+        {
+            /* Fill (uniform) modes, mvs of jth subset.
+             Must do it here because ensuing subsets can
+             refer back to us via "left" or "above". */
+            const unsigned char *fill_offset;
+            unsigned int fill_count = mbsplit_fill_count[s];
+
+            fill_offset = &mbsplit_fill_offset[s]
+                             [(unsigned char)j * mbsplit_fill_count[s]];
+
+            do {
+                mi->bmi[ *fill_offset].mv.as_int = blockmv.as_int;
+                fill_offset++;
+            }while (--fill_count);
+        }
+
+    }
+    while (++j < num_p);
+
+    mbmi->partitioning = s;  /* record the chosen split configuration */
+}
+
+static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi)
+{
+    vp8_reader *const bc = & pbi->mbc[8];
+    mbmi->ref_frame = (MV_REFERENCE_FRAME) vp8_read(bc, pbi->prob_intra);  /* zero takes the intra branch at the bottom */
+    if (mbmi->ref_frame)    /* inter MB */
+    {
+        enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
+        int cnt[4];  /* weights, indexed by the CNT_* values above */
+        int *cntx = cnt;  /* advances together with nmv */
+        int_mv near_mvs[4];  /* slot 0 is initialised to zero; later slots collect distinct neighbour MVs */
+        int_mv *nmv = near_mvs;
+        const int mis = pbi->mb.mode_info_stride;
+        const MODE_INFO *above = mi - mis;
+        const MODE_INFO *left = mi - 1;
+        const MODE_INFO *aboveleft = above - 1;
+        int *ref_frame_sign_bias = pbi->common.ref_frame_sign_bias;
+
+        mbmi->need_to_clamp_mvs = 0;
+
+        if (vp8_read(bc, pbi->prob_last))
+        {
+            mbmi->ref_frame =
+                (MV_REFERENCE_FRAME)((int)(2 + vp8_read(bc, pbi->prob_gf)));  /* reference value 2 or 3 */
+        }
+
+        /* Zero accumulators */
+        nmv[0].as_int = nmv[1].as_int = nmv[2].as_int = 0;
+        cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+
+        /* Process above */
+        if (above->mbmi.ref_frame != INTRA_FRAME)
+        {
+            if (above->mbmi.mv.as_int)
+            {
+                (++nmv)->as_int = above->mbmi.mv.as_int;
+                mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame],
+                        mbmi->ref_frame, nmv, ref_frame_sign_bias);
+                ++cntx;
+            }
+
+            *cntx += 2;  /* lands in cnt[CNT_INTRA] when the above MV is zero */
+        }
+
+        /* Process left */
+        if (left->mbmi.ref_frame != INTRA_FRAME)
+        {
+            if (left->mbmi.mv.as_int)
+            {
+                int_mv this_mv;
+
+                this_mv.as_int = left->mbmi.mv.as_int;
+                mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame],
+                        mbmi->ref_frame, &this_mv, ref_frame_sign_bias);
+
+                if (this_mv.as_int != nmv->as_int)  /* a new slot only when it differs from the latest one */
+                {
+                    (++nmv)->as_int = this_mv.as_int;
+                    ++cntx;
+                }
+
+                *cntx += 2;
+            }
+            else
+                cnt[CNT_INTRA] += 2;
+        }
+
+        /* Process above left */
+        if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+        {
+            if (aboveleft->mbmi.mv.as_int)
+            {
+                int_mv this_mv;
+
+                this_mv.as_int = aboveleft->mbmi.mv.as_int;
+                mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame],
+                        mbmi->ref_frame, &this_mv, ref_frame_sign_bias);
+
+                if (this_mv.as_int != nmv->as_int)
+                {
+                    (++nmv)->as_int = this_mv.as_int;
+                    ++cntx;
+                }
+
+                *cntx += 1;  /* this neighbour adds 1 where above and left add 2 */
+            }
+            else
+                cnt[CNT_INTRA] += 1;
+        }
+
+        if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_INTRA]] [0]) )
+        {
+            /* Not ZEROMV: rank the candidate vectors before reading further mode bits. */
+            /* If we have three distinct MV's ... */
+            /* See if above-left MV can be merged with NEAREST */
+            cnt[CNT_NEAREST] += ( (cnt[CNT_SPLITMV] > 0) &
+                (nmv->as_int == near_mvs[CNT_NEAREST].as_int));
+
+            /* Swap near and nearest if necessary */
+            if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
+            {
+                int tmp;
+                tmp = cnt[CNT_NEAREST];
+                cnt[CNT_NEAREST] = cnt[CNT_NEAR];
+                cnt[CNT_NEAR] = tmp;
+                tmp = near_mvs[CNT_NEAREST].as_int;
+                near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+                near_mvs[CNT_NEAR].as_int = tmp;
+            }
+
+            if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_NEAREST]] [1]) )
+            {
+                /* Not NEARESTMV. */
+                if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_NEAR]] [2]) )
+                {
+                    int mb_to_top_edge;
+                    int mb_to_bottom_edge;
+                    int mb_to_left_edge;
+                    int mb_to_right_edge;
+                    MV_CONTEXT *const mvc = pbi->common.fc.mvc;
+                    int near_index;
+                    /* Not NEARMV either: widen the MB edge distances by the margins for the bounds checks below. */
+                    mb_to_top_edge = pbi->mb.mb_to_top_edge;
+                    mb_to_bottom_edge = pbi->mb.mb_to_bottom_edge;
+                    mb_to_top_edge -= LEFT_TOP_MARGIN;
+                    mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
+                    mb_to_right_edge = pbi->mb.mb_to_right_edge;
+                    mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
+                    mb_to_left_edge = pbi->mb.mb_to_left_edge;
+                    mb_to_left_edge -= LEFT_TOP_MARGIN;
+
+                    /* Use near_mvs[0] to store the "best" MV */
+                    near_index = CNT_INTRA +
+                        (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]);  /* index 0 or 1 */
+
+                    vp8_clamp_mv2(&near_mvs[near_index], &pbi->mb);
+
+                    cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+                                        + (left->mbmi.mode == SPLITMV)) * 2
+                                       + (aboveleft->mbmi.mode == SPLITMV);
+
+                    if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_SPLITMV]] [3]) )
+                    {
+                        decode_split_mv(bc, mi, left, above,
+                                                    mbmi,
+                                                    near_mvs[near_index],
+                                                    mvc, mb_to_left_edge,
+                                                    mb_to_right_edge,
+                                                    mb_to_top_edge,
+                                                    mb_to_bottom_edge);
+                        mbmi->mv.as_int = mi->bmi[15].mv.as_int;  /* the MB-level vector is taken from the last bmi entry */
+                        mbmi->mode =  SPLITMV;
+                        mbmi->is_4x4 = 1;
+                    }
+                    else
+                    {
+                        int_mv *const mbmi_mv = & mbmi->mv;
+                        read_mv(bc, &mbmi_mv->as_mv, (const MV_CONTEXT *) mvc);
+                        mbmi_mv->as_mv.row += near_mvs[near_index].as_mv.row;  /* decoded value is a delta on the best MV */
+                        mbmi_mv->as_mv.col += near_mvs[near_index].as_mv.col;
+
+                        /* Don't need to check this on NEARMV and NEARESTMV
+                         * modes since those modes clamp the MV. The NEWMV mode
+                         * does not, so signal to the prediction stage whether
+                         * special handling may be required.
+                         */
+                        mbmi->need_to_clamp_mvs =
+                            vp8_check_mv_bounds(mbmi_mv, mb_to_left_edge,
+                                                mb_to_right_edge,
+                                                mb_to_top_edge,
+                                                mb_to_bottom_edge);
+                        mbmi->mode =  NEWMV;
+                    }
+                }
+                else
+                {
+                    mbmi->mode =  NEARMV;
+                    mbmi->mv.as_int = near_mvs[CNT_NEAR].as_int;
+                    vp8_clamp_mv2(&mbmi->mv, &pbi->mb);
+                }
+            }
+            else
+            {
+                mbmi->mode =  NEARESTMV;
+                mbmi->mv.as_int = near_mvs[CNT_NEAREST].as_int;
+                vp8_clamp_mv2(&mbmi->mv, &pbi->mb);
+            }
+        }
+        else
+        {
+            mbmi->mode =  ZEROMV;
+            mbmi->mv.as_int = 0;
+        }
+
+#if CONFIG_ERROR_CONCEALMENT
+        if(pbi->ec_enabled && (mbmi->mode != SPLITMV))  /* copy the MB vector into all 16 bmi entries */
+        {
+            mi->bmi[ 0].mv.as_int =
+            mi->bmi[ 1].mv.as_int =
+            mi->bmi[ 2].mv.as_int =
+            mi->bmi[ 3].mv.as_int =
+            mi->bmi[ 4].mv.as_int =
+            mi->bmi[ 5].mv.as_int =
+            mi->bmi[ 6].mv.as_int =
+            mi->bmi[ 7].mv.as_int =
+            mi->bmi[ 8].mv.as_int =
+            mi->bmi[ 9].mv.as_int =
+            mi->bmi[10].mv.as_int =
+            mi->bmi[11].mv.as_int =
+            mi->bmi[12].mv.as_int =
+            mi->bmi[13].mv.as_int =
+            mi->bmi[14].mv.as_int =
+            mi->bmi[15].mv.as_int = mbmi->mv.as_int;
+        }
+#endif
+    }
+    else
+    {
+        /* required for left and above block mv */
+        mbmi->mv.as_int = 0;
+
+        /* MB is intra coded */
+        if ((mbmi->mode = read_ymode(bc, pbi->common.fc.ymode_prob)) == B_PRED)
+        {
+            int j = 0;
+            mbmi->is_4x4 = 1;
+            do  /* 16 sub-block modes, all read with the same bmode_prob table */
+            {
+                mi->bmi[j].as_mode = read_bmode(bc, pbi->common.fc.bmode_prob);
+            }
+            while (++j < 16);
+        }
+
+        mbmi->uv_mode = read_uv_mode(bc, pbi->common.fc.uv_mode_prob);
+    }
+
+}
+
+static void read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x)
+{
+    int high_bit;
+
+    /* Leave segment_id untouched unless segmentation is enabled and the
+     * segmentation map is being updated. */
+    if (!x->segmentation_enabled || !x->update_mb_segmentation_map)
+        return;
+
+    /* Two reads: probs[0] picks the half, then probs[1] or probs[2] the id. */
+    high_bit = vp8_read(r, x->mb_segment_tree_probs[0]);
+    mi->segment_id = (unsigned char)((high_bit ? 2 : 0) +
+        vp8_read(r, x->mb_segment_tree_probs[high_bit ? 2 : 1]));
+}
+
+static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi,
+                               MB_MODE_INFO *mbmi)
+{
+    vp8_reader *const reader = &pbi->mbc[8];
+    MB_MODE_INFO *const info = &mi->mbmi;
+    const int is_key_frame = (pbi->common.frame_type == KEY_FRAME);
+
+    (void)mbmi;  /* unused: everything is written through mi->mbmi */
+
+    /* Segment id: read it when the map is updated this frame; otherwise a
+     * key frame resets it to 0 and any other frame leaves it alone. */
+    if (pbi->mb.update_mb_segmentation_map)
+        read_mb_features(reader, info, &pbi->mb);
+    else if (is_key_frame)
+        info->segment_id = 0;
+
+    /* Coefficient skip flag, read only when mb_no_coeff_skip is set. */
+    info->mb_skip_coeff = pbi->common.mb_no_coeff_skip
+                          ? vp8_read(reader, pbi->prob_skip_false) : 0;
+
+    /* Cleared here; the mode readers set it for B_PRED and SPLITMV. */
+    info->is_4x4 = 0;
+
+    if (is_key_frame)
+        read_kf_modes(pbi, mi);
+    else
+        read_mb_modes_mv(pbi, mi, info);
+}
+
+void vp8_decode_mode_mvs(VP8D_COMP *pbi)
+{
+    MODE_INFO *mi = pbi->common.mi;
+    int mb_row = -1;
+    int mb_to_right_edge_start;
+    /* Decode modes and motion vectors for every macroblock, row by row, after the frame-level header. */
+    mb_mode_mv_init(pbi);
+
+    pbi->mb.mb_to_top_edge = 0;
+    pbi->mb.mb_to_bottom_edge = ((pbi->common.mb_rows - 1) * 16) << 3;
+    mb_to_right_edge_start = ((pbi->common.mb_cols - 1) * 16) << 3;
+
+    while (++mb_row < pbi->common.mb_rows)
+    {
+        int mb_col = -1;
+
+        pbi->mb.mb_to_left_edge =  0;
+        pbi->mb.mb_to_right_edge = mb_to_right_edge_start;
+
+        while (++mb_col < pbi->common.mb_cols)
+        {
+#if CONFIG_ERROR_CONCEALMENT
+            int mb_num = mb_row * pbi->common.mb_cols + mb_col;
+#endif
+
+            decode_mb_mode_mvs(pbi, mi, &mi->mbmi);
+
+#if CONFIG_ERROR_CONCEALMENT
+            /* Record the first corrupt macroblock. The comparison is unsigned
+             * because the UINT_MAX "none yet" sentinel does not fit in an int. */
+            if (vp8dx_bool_error(&pbi->mbc[8]) &&
+                (unsigned int)mb_num < pbi->mvs_corrupt_from_mb)
+            {
+                pbi->mvs_corrupt_from_mb = mb_num;
+                /* no need to continue since the partition is corrupt from
+                 * here on.
+                 */
+                return;
+            }
+#endif
+
+            pbi->mb.mb_to_left_edge -= (16 << 3);  /* edge distances move by one macroblock per column */
+            pbi->mb.mb_to_right_edge -= (16 << 3);
+            mi++;       /* next macroblock */
+        }
+        pbi->mb.mb_to_top_edge -= (16 << 3);
+        pbi->mb.mb_to_bottom_edge -= (16 << 3);
+
+        mi++;           /* skip left predictor each row */
+    }
+}
diff --git a/libs/libvpx/vp8/decoder/decodemv.h b/libs/libvpx/vp8/decoder/decodemv.h
new file mode 100644
index 0000000000..f33b07351d
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/decodemv.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_DECODER_DECODEMV_H_
+#define VP8_DECODER_DECODEMV_H_
+
+#include "onyxd_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Decodes the modes and motion vectors of every macroblock in the frame into the pbi->common.mi array. */
+void vp8_decode_mode_mvs(VP8D_COMP *);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_DECODER_DECODEMV_H_
diff --git a/libs/libvpx/vp8/decoder/decoderthreading.h b/libs/libvpx/vp8/decoder/decoderthreading.h
new file mode 100644
index 0000000000..c563cf6e93
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/decoderthreading.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_DECODER_DECODERTHREADING_H_
+#define VP8_DECODER_DECODERTHREADING_H_
+/* NOTE(review): nothing is included here, so VP8D_COMP and MACROBLOCKD must already be declared by the including file. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_MULTITHREAD  /* these prototypes are only declared in multithreaded builds */
+void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);  /* called from the frame decoder in place of decode_mb_rows() */
+void vp8_decoder_remove_threads(VP8D_COMP *pbi);
+void vp8_decoder_create_threads(VP8D_COMP *pbi);
+void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
+void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_DECODER_DECODERTHREADING_H_
diff --git a/libs/libvpx/vp8/decoder/detokenize.c b/libs/libvpx/vp8/decoder/detokenize.c
new file mode 100644
index 0000000000..fcc7533c50
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/detokenize.c
@@ -0,0 +1,245 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/blockd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "detokenize.h"
+
+void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
+{
+    ENTROPY_CONTEXT *const above = (ENTROPY_CONTEXT *)x->above_context;
+    ENTROPY_CONTEXT *const left = (ENTROPY_CONTEXT *)x->left_context;
+    const size_t all_but_last = sizeof(ENTROPY_CONTEXT_PLANES) - 1;
+
+    /* Zero all but the last byte of the above and left contexts. */
+    memset(above, 0, all_but_last);
+    memset(left, 0, all_but_last);
+
+    /* Entry 8 holds the Y2 context: reset it only when the MB is not 4x4. */
+    if (!x->mode_info_context->mbmi.is_4x4)
+        above[8] = left[8] = 0;
+}
+
+/*
+    Residual decoding (Paragraph 13.2 / 13.3)
+    kBands[n] selects the probability band used at coefficient position n.
+*/
+static const uint8_t kBands[16 + 1] = {
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+  0  /* extra entry as sentinel: GetCoeffs() can index kBands[16] */
+};
+/* Zero-terminated probability lists, one entry per extra bit to read. */
+static const uint8_t kCat3[] = { 173, 148, 140, 0 };
+static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
+static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
+static const uint8_t kCat6[] =
+  { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
+static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
+static const uint8_t kZigzag[16] = {
+  0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
+};  /* maps a coefficient position to its index in the output block */
+
+#define VP8GetBit vp8dx_decode_bool
+#define NUM_PROBAS  11
+#define NUM_CTX  3
+
+/* Pointer to [NUM_CTX][NUM_PROBAS] probability rows (for const-casting) */
+typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];
+
+static int GetSigned(BOOL_DECODER *br, int value_to_sign)
+{
+    /* Splits the range at its midpoint and decodes one bit there, used as
+     * the sign: lower half keeps value_to_sign, upper half negates it. */
+    const int half = (br->range + 1) >> 1;
+    const VP8_BD_VALUE half_aligned =
+        (VP8_BD_VALUE)half << (VP8_BD_VALUE_SIZE - 8);
+    int negate;
+
+    /* Top up the decoder once its bit count has gone negative. */
+    if (br->count < 0)
+        vp8dx_bool_decoder_fill(br);
+    negate = !(br->value < half_aligned);
+    if (negate)
+    {
+        br->range = br->range - half;
+        br->value = br->value - half_aligned;
+    }
+    else
+        br->range = half;
+    /* Shift out the consumed bit: double range and value, drop one count. */
+    br->range += br->range;
+    br->value += br->value;
+    br->count--;
+    return negate ? -value_to_sign : value_to_sign;
+}
+/* Decodes the tokens of one block from br, starting at position n with
+   context ctx, and stores each signed non-zero value at out[kZigzag[pos]].
+   Returns the position of the last non-zero coeff plus one, 16 when the scan
+   reaches the end of the block, and 0 if there's no coeff at all. */
+static int GetCoeffs(BOOL_DECODER *br, ProbaArray prob,
+                     int ctx, int n, int16_t* out)
+{
+    const uint8_t* p = prob[n][ctx];
+    if (!VP8GetBit(br, p[0]))
+    {   /* first EOB is more a 'CBP' bit. */
+        return 0;
+    }
+    while (1)
+    {
+        ++n;  /* n is now the current coefficient position plus one */
+        if (!VP8GetBit(br, p[1]))
+        {
+            p = prob[kBands[n]][0];  /* zero coeff: next context is 0 */
+        }
+        else
+        {  /* non zero coeff */
+            int v, j;
+            if (!VP8GetBit(br, p[2]))
+            {
+                p = prob[kBands[n]][1];  /* magnitude 1: next context is 1 */
+                v = 1;
+            }
+            else
+            {
+                if (!VP8GetBit(br, p[3]))
+                {
+                    if (!VP8GetBit(br, p[4]))
+                    {
+                        v = 2;
+                    }
+                    else
+                    {
+                        v = 3 + VP8GetBit(br, p[5]);
+                    }
+                }
+                else
+                {
+                    if (!VP8GetBit(br, p[6]))
+                    {
+                        if (!VP8GetBit(br, p[7]))
+                        {
+                            v = 5 + VP8GetBit(br, 159);  /* fixed probability */
+                        } else
+                        {
+                            v = 7 + 2 * VP8GetBit(br, 165);
+                            v += VP8GetBit(br, 145);
+                        }
+                    }
+                    else
+                    {
+                        const uint8_t* tab;
+                        const int bit1 = VP8GetBit(br, p[8]);
+                        const int bit0 = VP8GetBit(br, p[9 + bit1]);
+                        const int cat = 2 * bit1 + bit0;
+                        v = 0;
+                        for (tab = kCat3456[cat]; *tab; ++tab)
+                        {   /* shift in one extra bit per table entry */
+                            v += v + VP8GetBit(br, *tab);
+                        }
+                        v += 3 + (8 << cat);  /* add the category's offset */
+                    }
+                }
+                p = prob[kBands[n]][2];  /* magnitude > 1: next context is 2 */
+            }
+            j = kZigzag[n - 1];
+
+            out[j] = GetSigned(br, v);
+
+            if (n == 16 || !VP8GetBit(br, p[0]))
+            {   /* EOB */
+                return n;
+            }
+        }
+        if (n == 16)
+        {   /* ran off the end of the block while reading zero coeffs */
+            return 16;
+        }
+    }
+}
+
+int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
+{   /* Decodes all tokens of macroblock x; returns the eobtotal sum. */
+    BOOL_DECODER *bc = x->current_bc;
+    const FRAME_CONTEXT * const fc = &dx->common.fc;
+    char *eobs = x->eobs;
+
+    int i;
+    int nonzeros;
+    int eobtotal = 0;
+
+    short *qcoeff_ptr;
+    ProbaArray coef_probs;
+    ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context);
+    ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context);
+    ENTROPY_CONTEXT *a;
+    ENTROPY_CONTEXT *l;
+    int skip_dc = 0;
+
+    qcoeff_ptr = &x->qcoeff[0];
+
+    if (!x->mode_info_context->mbmi.is_4x4)
+    {   /* Block 24 is decoded first, using context entry 8. */
+        a = a_ctx + 8;
+        l = l_ctx + 8;
+
+        coef_probs = fc->coef_probs [1];
+
+        nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr + 24 * 16);
+        *a = *l = (nonzeros > 0);
+
+        eobs[24] = nonzeros;
+        eobtotal += nonzeros - 16;
+        /* The -16 offsets the skip_dc (1) added to each of blocks 0..15. */
+        coef_probs = fc->coef_probs [0];
+        skip_dc = 1;  /* blocks 0..15 then start at position 1 */
+    }
+    else
+    {
+        coef_probs = fc->coef_probs [3];
+        skip_dc = 0;
+    }
+    /* Blocks 0..15: above context by column (i&3), left context by row. */
+    for (i = 0; i < 16; ++i)
+    {
+        a = a_ctx + (i&3);
+        l = l_ctx + ((i&0xc)>>2);
+
+        nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), skip_dc, qcoeff_ptr);
+        *a = *l = (nonzeros > 0);
+
+        nonzeros += skip_dc;
+        eobs[i] = nonzeros;
+        eobtotal += nonzeros;
+        qcoeff_ptr += 16;
+    }
+    /* Blocks 16..23 use coef_probs[2] and the context entries from 4 on. */
+    coef_probs = fc->coef_probs [2];
+
+    a_ctx += 4;
+    l_ctx += 4;
+    for (i = 16; i < 24; ++i)
+    {
+        a = a_ctx + ((i > 19)<<1) + (i&1);
+        l = l_ctx + ((i > 19)<<1) + ((i&3)>1);
+
+        nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr);
+        *a = *l = (nonzeros > 0);
+
+        eobs[i] = nonzeros;
+        eobtotal += nonzeros;
+        qcoeff_ptr += 16;
+    }
+
+    return eobtotal;
+}
+
diff --git a/libs/libvpx/vp8/decoder/detokenize.h b/libs/libvpx/vp8/decoder/detokenize.h
new file mode 100644
index 0000000000..f0b125444f
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/detokenize.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_DECODER_DETOKENIZE_H_
+#define VP8_DECODER_DETOKENIZE_H_
+
+#include "onyxd_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
+int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_DECODER_DETOKENIZE_H_
diff --git a/libs/libvpx/vp8/decoder/ec_types.h b/libs/libvpx/vp8/decoder/ec_types.h
new file mode 100644
index 0000000000..3af5ca86b4
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/ec_types.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_DECODER_EC_TYPES_H_
+#define VP8_DECODER_EC_TYPES_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_OVERLAPS 16  /* overlap nodes stored per block (B_OVERLAP) */
+
+
+/* One overlap record: the area (pixel area in Q6) by which the block
+ * pointed to by bmi overlaps another block.
+ */
+typedef struct
+{
+    int overlap;
+    union b_mode_info *bmi;
+} OVERLAP_NODE;
+
+/* Structure to keep track of overlapping blocks on a block level. */
+typedef struct
+{
+    /* TODO(holmer): This array should be exchanged for a linked list */
+    OVERLAP_NODE overlaps[MAX_OVERLAPS];
+} B_OVERLAP;
+
+/* Structure used to hold all the overlaps of a macroblock. The overlaps of a
+ * macroblock are further divided into block overlaps, one entry per block.
+ */
+typedef struct
+{
+    B_OVERLAP overlaps[16];
+} MB_OVERLAP;
+
+/* Structure for keeping track of a motion vector and the reference frame it
+ * refers to. Used for motion vector interpolation.
+ */
+typedef struct
+{
+    MV mv;
+    MV_REFERENCE_FRAME ref_frame;
+} EC_BLOCK;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_DECODER_EC_TYPES_H_
diff --git a/libs/libvpx/vp8/decoder/error_concealment.c b/libs/libvpx/vp8/decoder/error_concealment.c
new file mode 100644
index 0000000000..0b846a08b4
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/error_concealment.c
@@ -0,0 +1,597 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "error_concealment.h"
+#include "onyxd_int.h"
+#include "decodemv.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/findnearmv.h"
+#include "vp8/common/common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#define FLOOR(x,q) ((x) & -(1 << (q)))  /* clears the low q bits of x */
+
+#define NUM_NEIGHBORS 20
+
+typedef struct ec_position
+{
+    int row;
+    int col;
+} EC_POS;
+
+/*
+ * Regenerate the 5x5 table in Matlab with:
+ * x = meshgrid((0:4), (0:4));
+ * y = meshgrid((0:4), (0:4))';
+ * W = round((1./(sqrt(x.^2 + y.^2))*2^7));
+ * W(1,1) = 0;
+ */
+static const int weights_q7[5][5] = {
+       {  0,   128,    64,    43,    32 },
+       {128,    91,    57,    40,    31 },
+       { 64,    57,    45,    36,    29 },
+       { 43,    40,    36,    30,    26 },
+       { 32,    31,    29,    26,    23 }
+};
+
+int vp8_alloc_overlap_lists(VP8D_COMP *pbi)
+{
+    /* (Re)allocates one MB_OVERLAP per macroblock of the frame.
+     * Returns 0 on success and -1 when the allocation fails. */
+    const VP8_COMMON *const cm = &pbi->common;
+
+    /* Release any list left over from an earlier call first. */
+    if (pbi->overlaps != NULL)
+        vpx_free(pbi->overlaps);
+    pbi->overlaps = NULL;
+
+    pbi->overlaps = vpx_calloc(cm->mb_rows * cm->mb_cols,
+                               sizeof(MB_OVERLAP));
+
+    return (pbi->overlaps == NULL) ? -1 : 0;
+}
+
+void vp8_de_alloc_overlap_lists(VP8D_COMP *pbi)
+{   /* Releases the overlap lists referenced by pbi. */
+    vpx_free(pbi->overlaps);
+    pbi->overlaps = NULL;  /* do not keep a pointer to the freed memory */
+}
+
+/* Records (bmi, overlap) in the first free node of a block's overlap list. */
+static void assign_overlap(OVERLAP_NODE* overlaps,
+                           union b_mode_info *bmi,
+                           int overlap)
+{
+    OVERLAP_NODE *slot = overlaps;
+    OVERLAP_NODE *const end = overlaps + MAX_OVERLAPS;
+    /* Non-positive areas are not recorded. */
+    if (overlap <= 0)
+        return;
+    /* A node is free while its bmi is NULL. Skip the occupied ones; when
+     * all MAX_OVERLAPS nodes are taken the new overlap is dropped. */
+    while (slot != end && slot->bmi != NULL)
+        ++slot;
+    if (slot != end)
+    {
+        slot->bmi = bmi;
+        slot->overlap = overlap;
+    }
+}
+
+/* Returns the area (Q6) of the intersection of two 4x4-pixel squares whose
+ * upper-left corners are given in Q3 as (b1_row, b1_col) and
+ * (b2_row, b2_col). The result is only meaningful when the squares
+ * actually intersect; disjoint squares are not handled properly.
+ */
+static int block_overlap(int b1_row, int b1_col, int b2_row, int b2_col)
+{
+    /* A block is 4 pixels wide and high, i.e. 4 << 3 in Q3 coordinates. */
+    const int blk_size_q3 = 4 << 3;
+    /* Intersection rectangle: the innermost of each pair of edges. */
+    const int top = VPXMAX(b1_row, b2_row);
+    const int left = VPXMAX(b1_col, b2_col);
+    const int right = VPXMIN(b1_col + blk_size_q3, b2_col + blk_size_q3);
+    const int bottom = VPXMIN(b1_row + blk_size_q3, b2_row + blk_size_q3);
+    return (bottom - top) * (right - left);
+}
+
+/* Calculates the overlap area for all blocks in a macroblock at position
+ * (mb_row, mb_col) in macroblocks, which are being overlapped by a given
+ * overlapping block at position (new_row, new_col) (in pixels, Q3). The
+ * first block being overlapped in the macroblock has position (first_blk_row,
+ * first_blk_col) in blocks relative to the upper-left corner of the image.
+ */
+static void calculate_overlaps_mb(B_OVERLAP *b_overlaps, union b_mode_info *bmi,
+                                  int new_row, int new_col,
+                                  int mb_row, int mb_col,
+                                  int first_blk_row, int first_blk_col)
+{
+    /* Find the blocks within this MB (defined by mb_row, mb_col) which are
+     * overlapped by bmi and calculate and assign overlap for each of those
+     * blocks. */
+
+    /* Block coordinates relative to this MB's upper-left block */
+    const int rel_ol_blk_row = first_blk_row - mb_row * 4;
+    const int rel_ol_blk_col = first_blk_col - mb_col * 4;
+    /* If the block partly overlaps any previous MB, these coordinates
+     * can be < 0. We don't want to access blocks in previous MBs.
+     */
+    const int blk_idx = VPXMAX(rel_ol_blk_row,0) * 4 + VPXMAX(rel_ol_blk_col,0);
+    /* Upper left overlapping block */
+    B_OVERLAP *b_ol_ul = &(b_overlaps[blk_idx]);
+
+    /* Calculate and assign overlaps for all blocks in this MB
+     * which the motion compensated block overlaps
+     */
+    /* Avoid calculating overlaps for blocks in later MBs */
+    int end_row = VPXMIN(4 + mb_row * 4 - first_blk_row, 2);
+    int end_col = VPXMIN(4 + mb_col * 4 - first_blk_col, 2);
+    int row, col;
+
+    /* A non-negative new_row/new_col that is a multiple of 32 (4 pixels in
+     * Q3) lies exactly on a block boundary, so only one block row/column
+     * is visited in that direction. */
+    if (new_row >= 0 && (new_row & 0x1F) == 0)
+        end_row = 1;
+    if (new_col >= 0 && (new_col & 0x1F) == 0)
+        end_col = 1;
+
+    /* Check if the overlapping block partly overlaps a previous MB
+     * and if so, we're overlapping fewer blocks in this MB.
+     */
+    if (new_row < (mb_row*16)<<3)
+        end_row = 1;
+    if (new_col < (mb_col*16)<<3)
+        end_col = 1;
+
+    for (row = 0; row < end_row; ++row)
+    {
+        for (col = 0; col < end_col; ++col)
+        {
+            /* input in Q3, result in Q6 */
+            const int overlap = block_overlap(new_row, new_col,
+                                                  (((first_blk_row + row) *
+                                                      4) << 3),
+                                                  (((first_blk_col + col) *
+                                                      4) << 3));
+            assign_overlap(b_ol_ul[row * 4 + col].overlaps, bmi, overlap);
+        }
+    }
+}
+
+void vp8_calculate_overlaps(MB_OVERLAP *overlap_ul,
+                            int mb_rows, int mb_cols,
+                            union b_mode_info *bmi,
+                            int b_row, int b_col)
+{
+    MB_OVERLAP *mb_overlap;
+    int row, col, rel_row, rel_col;
+    int new_row, new_col;
+    int end_row, end_col;
+    int overlap_b_row, overlap_b_col;
+    int overlap_mb_row, overlap_mb_col;
+    /* Registers bmi with the blocks lying at its position displaced by -mv. */
+    /* position of block (b_row, b_col) in pixels */
+    row = (4 * b_row) << 3; /* Q3 */
+    col = (4 * b_col) << 3; /* Q3 */
+
+    /* reverse compensate for motion */
+    new_row = row - bmi->mv.as_mv.row;
+    new_col = col - bmi->mv.as_mv.col;
+
+    if (new_row >= ((16*mb_rows) << 3) || new_col >= ((16*mb_cols) << 3))
+    {
+        /* the new block ended up outside the frame */
+        return;
+    }
+
+    if (new_row <= (-4 << 3) || new_col <= (-4 << 3))
+    {
+        /* outside the frame */
+        return;
+    }
+    /* overlapping block's position in blocks */
+    overlap_b_row = FLOOR(new_row / 4, 3) >> 3;
+    overlap_b_col = FLOOR(new_col / 4, 3) >> 3;
+
+    /* overlapping block's MB position in MBs
+     * operations are done in Q3
+     */
+    overlap_mb_row = FLOOR((overlap_b_row << 3) / 4, 3) >> 3;
+    overlap_mb_col = FLOOR((overlap_b_col << 3) / 4, 3) >> 3;
+
+    end_row = VPXMIN(mb_rows - overlap_mb_row, 2);
+    end_col = VPXMIN(mb_cols - overlap_mb_col, 2);
+
+    /* Don't calculate overlap for MBs we don't overlap */
+    /* Less than 3 block rows from the MB's top: the MB below is not visited */
+    if (abs(new_row - ((16*overlap_mb_row) << 3)) < ((3*4) << 3))
+        end_row = 1;
+    /* Less than 3 block cols from the MB's left: the next MB is not visited */
+    if (abs(new_col - ((16*overlap_mb_col) << 3)) < ((3*4) << 3))
+        end_col = 1;
+
+    /* find the MB(s) this block is overlapping */
+    for (rel_row = 0; rel_row < end_row; ++rel_row)
+    {
+        for (rel_col = 0; rel_col < end_col; ++rel_col)
+        {
+            if (overlap_mb_row + rel_row < 0 ||
+                overlap_mb_col + rel_col < 0)
+                continue;
+            mb_overlap = overlap_ul + (overlap_mb_row + rel_row) * mb_cols +
+                 overlap_mb_col + rel_col;
+
+            calculate_overlaps_mb(mb_overlap->overlaps, bmi,
+                                  new_row, new_col,
+                                  overlap_mb_row + rel_row,
+                                  overlap_mb_col + rel_col,
+                                  overlap_b_row + rel_row,
+                                  overlap_b_col + rel_col);
+        }
+    }
+}
+
+/* Estimates the motion vector of bmi as the overlap-weighted average of the
+ * motion vectors stored in the given overlap list. The list is read up to
+ * its first empty (bmi == NULL) node; an empty list yields a zero vector.
+ */
+static void estimate_mv(const OVERLAP_NODE *overlaps, union b_mode_info *bmi)
+{
+    const OVERLAP_NODE *node = overlaps;
+    const OVERLAP_NODE *const end = overlaps + MAX_OVERLAPS;
+    int weight_total = 0;
+    int weighted_row = 0;
+    int weighted_col = 0;
+
+    bmi->mv.as_int = 0;
+    /* Accumulate overlap-weighted MV components up to the first empty node. */
+    for (; node != end && node->bmi != NULL; ++node)
+    {
+        weighted_col += node->overlap * node->bmi->mv.as_mv.col;
+        weighted_row += node->overlap * node->bmi->mv.as_mv.row;
+        weight_total += node->overlap;
+    }
+
+    if (weight_total <= 0)
+    {
+        /* Nothing overlaps this block: leave a zero vector. */
+        bmi->mv.as_mv.col = 0;
+        bmi->mv.as_mv.row = 0;
+        return;
+    }
+    /* Q9 / Q6 = Q3 */
+    bmi->mv.as_mv.col = weighted_col / weight_total;
+    bmi->mv.as_mv.row = weighted_row / weight_total;
+}
+
+/* Estimates the 16 block MVs of mi from block_overlaps, flags whether any
+ * needs clamping, and stores the mean of the non-zero MVs in mi->mbmi.mv.
+ */
+static void estimate_mb_mvs(const B_OVERLAP *block_overlaps,
+                            MODE_INFO *mi,
+                            int mb_to_left_edge,
+                            int mb_to_right_edge,
+                            int mb_to_top_edge,
+                            int mb_to_bottom_edge)
+{
+    int row, col;
+    int non_zero_count = 0;
+    MV * const filtered_mv = &(mi->mbmi.mv.as_mv);
+    union b_mode_info * const bmi = mi->bmi;
+    filtered_mv->col = 0;
+    filtered_mv->row = 0;
+    mi->mbmi.need_to_clamp_mvs = 0;
+    for (row = 0; row < 4; ++row)
+    {
+        int this_b_to_top_edge = mb_to_top_edge + ((row*4)<<3);
+        int this_b_to_bottom_edge = mb_to_bottom_edge - ((row*4)<<3);
+        for (col = 0; col < 4; ++col)
+        {
+            int i = row * 4 + col;
+            int this_b_to_left_edge = mb_to_left_edge + ((col*4)<<3);
+            int this_b_to_right_edge = mb_to_right_edge - ((col*4)<<3);
+            /* Estimate this block's MV from the blocks overlapping it, */
+            /* then OR the bounds check result into need_to_clamp_mvs. */
+            estimate_mv(block_overlaps[i].overlaps, &(bmi[i]));
+            mi->mbmi.need_to_clamp_mvs |= vp8_check_mv_bounds(
+                                                         &bmi[i].mv,
+                                                         this_b_to_left_edge,
+                                                         this_b_to_right_edge,
+                                                         this_b_to_top_edge,
+                                                         this_b_to_bottom_edge);
+            if (bmi[i].mv.as_int != 0)
+            {   /* only non-zero vectors contribute to the MB-level mean */
+                ++non_zero_count;
+                filtered_mv->col += bmi[i].mv.as_mv.col;
+                filtered_mv->row += bmi[i].mv.as_mv.row;
+            }
+        }
+    }
+    if (non_zero_count > 0)
+    {   /* turn the accumulated sums into an average */
+        filtered_mv->col /= non_zero_count;
+        filtered_mv->row /= non_zero_count;
+    }
+}
+
+static void calc_prev_mb_overlaps(MB_OVERLAP *overlaps, MODE_INFO *prev_mi,
+                                    int mb_row, int mb_col,
+                                    int mb_rows, int mb_cols)
+{
+    /* Feed each of the 16 blocks of the previous frame's macroblock, in
+     * raster order, to vp8_calculate_overlaps(). Block coordinates are
+     * passed in block units: 4 * MB position plus the offset inside the MB. */
+    int blk;
+    for (blk = 0; blk < 16; ++blk)
+    {
+        const int sub_row = blk >> 2;
+        const int sub_col = blk & 3;
+        vp8_calculate_overlaps(overlaps, mb_rows, mb_cols,
+                               &(prev_mi->bmi[blk]),
+                               4 * mb_row + sub_row,
+                               4 * mb_col + sub_col);
+    }
+}
+
+/* Estimates the MVs of all macroblocks from first_corrupt onwards using the
+ * overlaps of the previous frame's (prev_mi) LAST_FRAME macroblocks. */
+static void estimate_missing_mvs(MB_OVERLAP *overlaps,
+                                 MODE_INFO *mi, MODE_INFO *prev_mi,
+                                 int mb_rows, int mb_cols,
+                                 unsigned int first_corrupt)
+{
+    int mb_row, mb_col;
+    memset(overlaps, 0, sizeof(MB_OVERLAP) * mb_rows * mb_cols);
+    /* First calculate the overlaps for all blocks */
+    for (mb_row = 0; mb_row < mb_rows; ++mb_row)
+    {
+        for (mb_col = 0; mb_col < mb_cols; ++mb_col)
+        {
+            /* We're only able to use blocks referring to the last frame
+             * when extrapolating new vectors.
+             */
+            if (prev_mi->mbmi.ref_frame == LAST_FRAME)
+            {
+                calc_prev_mb_overlaps(overlaps, prev_mi,
+                                      mb_row, mb_col,
+                                      mb_rows, mb_cols);
+            }
+            ++prev_mi;
+        }
+        ++prev_mi;  /* each mode info row holds mb_cols + 1 entries */
+    }
+
+    mb_row = first_corrupt / mb_cols;
+    mb_col = first_corrupt - mb_row * mb_cols;
+    mi += mb_row*(mb_cols + 1) + mb_col;
+    /* Go through all macroblocks in the current image with missing MVs
+     * and calculate new MVs using the overlaps. The edge distances are
+     * negated after shifting: left-shifting a negative int is undefined. */
+    for (; mb_row < mb_rows; ++mb_row)
+    {
+        int mb_to_top_edge = -((mb_row * 16) << 3);
+        int mb_to_bottom_edge = ((mb_rows - 1 - mb_row) * 16) << 3;
+        for (; mb_col < mb_cols; ++mb_col)
+        {
+            int mb_to_left_edge = -((mb_col * 16) << 3);
+            int mb_to_right_edge = ((mb_cols - 1 - mb_col) * 16) << 3;
+            const B_OVERLAP *block_overlaps =
+                    overlaps[mb_row*mb_cols + mb_col].overlaps;
+            mi->mbmi.ref_frame = LAST_FRAME;
+            mi->mbmi.mode = SPLITMV;
+            mi->mbmi.uv_mode = DC_PRED;
+            mi->mbmi.partitioning = 3;
+            mi->mbmi.segment_id = 0;
+            estimate_mb_mvs(block_overlaps,
+                            mi,
+                            mb_to_left_edge,
+                            mb_to_right_edge,
+                            mb_to_top_edge,
+                            mb_to_bottom_edge);
+            ++mi;
+        }
+        mb_col = 0;  /* only the first corrupt row starts mid-row */
+        ++mi;  /* skip the extra entry at the end of the mode info row */
+    }
+}
+
+void vp8_estimate_missing_mvs(VP8D_COMP *pbi)
+{   /* Unpacks the decoder state and runs the MV estimation on it. */
+    VP8_COMMON *const cm = &pbi->common;
+    const unsigned int first_corrupt_mb = pbi->mvs_corrupt_from_mb;
+
+    estimate_missing_mvs(pbi->overlaps, cm->mi, cm->prev_mi,
+                         cm->mb_rows, cm->mb_cols, first_corrupt_mb);
+}
+
+static void assign_neighbor(EC_BLOCK *neighbor, MODE_INFO *mi, int block_idx)
+{   /* Copies mi's ref frame and the MV of its block block_idx to neighbor. */
+    assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
+    neighbor->ref_frame = mi->mbmi.ref_frame;
+    neighbor->mv = mi->bmi[block_idx].mv.as_mv;
+}
+
+/* Fills neighbors[0..19] with the blocks bordering a macroblock, 5 slots per
+ * side: [0] upper-left, [1..4] above (left to right), [5] upper-right,
+ * [6..9] right (top to bottom), [10] lower-right, [11..14] below (left to
+ * right), [15] lower-left, [16..19] left (top to bottom). Slots that fall
+ * outside the frame are skipped and keep the caller's initial value.
+ * NOTE(review): neigh_pos in interpolate_mvs() walks below/left in the
+ * opposite direction, and [15] reads block 4 of its MB - confirm intent.
+ */
+static void find_neighboring_blocks(MODE_INFO *mi,
+                                    EC_BLOCK *neighbors,
+                                    int mb_row, int mb_col,
+                                    int mb_rows, int mb_cols,
+                                    int mi_stride)
+{
+    int i = 0;
+    int j;
+    if (mb_row > 0)
+    {
+        /* upper left */
+        if (mb_col > 0)
+            assign_neighbor(&neighbors[i], mi - mi_stride - 1, 15);
+        ++i;
+        /* above */
+        for (j = 12; j < 16; ++j, ++i)
+            assign_neighbor(&neighbors[i], mi - mi_stride, j);
+    }
+    else
+        i += 5;  /* no row above: skip the corner slot and the 4 above */
+    if (mb_col < mb_cols - 1)
+    {
+        /* upper right */
+        if (mb_row > 0)
+            assign_neighbor(&neighbors[i], mi - mi_stride + 1, 12);
+        ++i;
+        /* right */
+        for (j = 0; j <= 12; j += 4, ++i)
+            assign_neighbor(&neighbors[i], mi + 1, j);
+    }
+    else
+        i += 5;  /* no column to the right: skip its 5 slots */
+    if (mb_row < mb_rows - 1)
+    {
+        /* lower right */
+        if (mb_col < mb_cols - 1)
+            assign_neighbor(&neighbors[i], mi + mi_stride + 1, 0);
+        ++i;
+        /* below */
+        for (j = 0; j < 4; ++j, ++i)
+            assign_neighbor(&neighbors[i], mi + mi_stride, j);
+    }
+    else
+        i += 5;  /* no row below: skip its 5 slots */
+    if (mb_col > 0)
+    {
+        /* lower left */
+        if (mb_row < mb_rows - 1)
+            assign_neighbor(&neighbors[i], mi + mi_stride - 1, 4);
+        ++i;
+        /* left */
+        for (j = 3; j < 16; j += 4, ++i)
+        {
+            assign_neighbor(&neighbors[i], mi - 1, j);
+        }
+    }
+    else
+        i += 5;  /* no column to the left: skip its 5 slots */
+    assert(i == 20);
+}
+
+/* Sets each of the 16 block MVs of mb to the distance-weighted (weights_q7)
+ * average of the neighbors whose ref_frame equals dom_ref_frame.
+ */
+static void interpolate_mvs(MACROBLOCKD *mb,
+                         EC_BLOCK *neighbors,
+                         MV_REFERENCE_FRAME dom_ref_frame)
+{
+    int row, col, i;
+    MODE_INFO * const mi = mb->mode_info_context;
+    /* Table with the position of the neighboring blocks relative to the
+     * position of the upper left block of the current MB, as (row, col).
+     * Starts with the upper left neighbor and goes clockwise.
+     */
+    const EC_POS neigh_pos[NUM_NEIGHBORS] = {
+                                        {-1,-1}, {-1,0}, {-1,1}, {-1,2}, {-1,3},
+                                        {-1,4}, {0,4}, {1,4}, {2,4}, {3,4},
+                                        {4,4}, {4,3}, {4,2}, {4,1}, {4,0},
+                                        {4,-1}, {3,-1}, {2,-1}, {1,-1}, {0,-1}
+                                      };
+    mi->mbmi.need_to_clamp_mvs = 0;
+    for (row = 0; row < 4; ++row)
+    {
+        int mb_to_top_edge = mb->mb_to_top_edge + ((row*4)<<3);
+        int mb_to_bottom_edge = mb->mb_to_bottom_edge - ((row*4)<<3);
+        for (col = 0; col < 4; ++col)
+        {
+            int mb_to_left_edge = mb->mb_to_left_edge + ((col*4)<<3);
+            int mb_to_right_edge = mb->mb_to_right_edge - ((col*4)<<3);
+            int w_sum = 0;
+            int mv_row_sum = 0;
+            int mv_col_sum = 0;
+            int_mv * const mv = &(mi->bmi[row*4 + col].mv);
+            mv->as_int = 0;  /* stays zero when no neighbor qualifies */
+            for (i = 0; i < NUM_NEIGHBORS; ++i)
+            {
+                /* Calculate the weighted sum of neighboring MVs referring
+                 * to the dominant frame type; the weight is looked up by
+                 * the row and column distance between block and neighbor.
+                 */
+                const int w = weights_q7[abs(row - neigh_pos[i].row)]
+                                        [abs(col - neigh_pos[i].col)];
+                if (neighbors[i].ref_frame != dom_ref_frame)
+                    continue;
+                w_sum += w;
+                /* Q7 * Q3 = Q10 */
+                mv_row_sum += w*neighbors[i].mv.row;
+                mv_col_sum += w*neighbors[i].mv.col;
+            }
+            if (w_sum > 0)
+            {
+                /* Avoid division by zero.
+                 * Normalize with the sum of the weights
+                 * Q3 = Q10 / Q7
+                 */
+                mv->as_mv.row = mv_row_sum / w_sum;
+                mv->as_mv.col = mv_col_sum / w_sum;
+                mi->mbmi.need_to_clamp_mvs |= vp8_check_mv_bounds(
+                                                            mv,
+                                                            mb_to_left_edge,
+                                                            mb_to_right_edge,
+                                                            mb_to_top_edge,
+                                                            mb_to_bottom_edge);
+            }
+        }
+    }
+}
+
+void vp8_interpolate_motion(MACROBLOCKD *mb,
+                        int mb_row, int mb_col,
+                        int mb_rows, int mb_cols,
+                        int mi_stride)
+{
+    MODE_INFO *const mi = mb->mode_info_context;
+    EC_BLOCK neighbors[NUM_NEIGHBORS];
+    EC_BLOCK *nb;
+
+    /* Start with every slot marked "doesn't exist" (MAX_REF_FRAMES) and a
+     * zero vector, then fill in the neighbors that lie inside the frame. */
+    for (nb = neighbors; nb != neighbors + NUM_NEIGHBORS; ++nb)
+    {
+        nb->ref_frame = MAX_REF_FRAMES;
+        nb->mv.row = nb->mv.col = 0;
+    }
+    /* The stride is taken from mb; the mi_stride argument is not read. */
+    find_neighboring_blocks(mi, neighbors, mb_row, mb_col, mb_rows, mb_cols,
+                            mb->mode_info_stride);
+
+    /* Derive this macroblock's MVs from the neighbors referring to the last
+     * frame, then label the result accordingly. */
+    interpolate_mvs(mb, neighbors, LAST_FRAME);
+    mi->mbmi.ref_frame = LAST_FRAME;
+    mi->mbmi.mode = SPLITMV;
+    mi->mbmi.uv_mode = DC_PRED;
+    mi->mbmi.partitioning = 3;
+    mi->mbmi.segment_id = 0;
+}
+
+void vp8_conceal_corrupt_mb(MACROBLOCKD *xd)
+{
+    /* This macroblock has corrupt residual; the motion compensated image
+     * (predictor) is used for concealment. */
+
+    /* The build predictor functions now output directly into the dst buffer,
+     * so the copies are no longer necessary and this body is intentionally
+     * empty; xd is not read. */
+}
diff --git a/libs/libvpx/vp8/decoder/error_concealment.h b/libs/libvpx/vp8/decoder/error_concealment.h
new file mode 100644
index 0000000000..9a1e024865
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/error_concealment.h
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_DECODER_ERROR_CONCEALMENT_H_
+#define VP8_DECODER_ERROR_CONCEALMENT_H_
+
+#include "onyxd_int.h"
+#include "ec_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Allocate memory for the overlap lists */
+int vp8_alloc_overlap_lists(VP8D_COMP *pbi);
+
+/* Deallocate the overlap lists */
+void vp8_de_alloc_overlap_lists(VP8D_COMP *pbi);
+
+/* Estimate all missing motion vectors. */
+void vp8_estimate_missing_mvs(VP8D_COMP *pbi);
+
+/* Functions for spatial MV interpolation */
+
+/* Interpolates all motion vectors for a macroblock mb at position
+ * (mb_row, mb_col). */
+void vp8_interpolate_motion(MACROBLOCKD *mb,
+                            int mb_row, int mb_col,
+                            int mb_rows, int mb_cols,
+                            int mi_stride);
+
+/* Conceal a macroblock with corrupt residual.
+ * Currently a no-op: the predictors already write into the dst buffer.
+ */
+void vp8_conceal_corrupt_mb(MACROBLOCKD *xd);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_DECODER_ERROR_CONCEALMENT_H_
diff --git a/libs/libvpx/vp8/decoder/onyxd_if.c b/libs/libvpx/vp8/decoder/onyxd_if.c
new file mode 100644
index 0000000000..3468268a2a
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/onyxd_if.c
@@ -0,0 +1,521 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/onyxc_int.h"
+#if CONFIG_POSTPROC
+#include "vp8/common/postproc.h"
+#endif
+#include "vp8/common/onyxd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/threading.h"
+#include "decoderthreading.h"
+#include <stdio.h>
+#include <assert.h>
+
+#include "vp8/common/quant_common.h"
+#include "vp8/common/reconintra.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vp8/common/systemdependent.h"
+#include "vpx_ports/vpx_once.h"
+#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern void vp8_init_loop_filter(VP8_COMMON *cm);
+extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
+static int get_free_fb (VP8_COMMON *cm);
+static void ref_cnt_fb (int *buf, int *idx, int new_idx);
+/* Run the DSP RTCD setup and intra predictor init; later calls return early. */
+static void initialize_dec(void) {
+    static volatile int init_done = 0;
+
+    if (init_done)
+        return;
+
+    vpx_dsp_rtcd();
+    vp8_init_intra_predictors();
+    init_done = 1;
+}
+/* Free the EC overlap lists (when built in), the common state, then pbi itself. */
+static void remove_decompressor(VP8D_COMP *pbi)
+{
+#if CONFIG_ERROR_CONCEALMENT
+    vp8_de_alloc_overlap_lists(pbi);
+#endif
+    vp8_remove_common(&pbi->common);
+    vpx_free(pbi);
+}
+/* Allocate and initialise one decoder instance; returns a null pointer on failure. */
+static struct VP8D_COMP * create_decompressor(VP8D_CONFIG *oxcf)
+{
+    VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP));
+
+    if (!pbi)
+        return NULL;
+
+    memset(pbi, 0, sizeof(VP8D_COMP));
+    /* Landing pad: a longjmp to error.jmp tears the half-built decoder down. */
+    if (setjmp(pbi->common.error.jmp))
+    {
+        pbi->common.error.setjmp = 0;
+        remove_decompressor(pbi);
+        return 0;
+    }
+
+    pbi->common.error.setjmp = 1;
+
+    vp8_create_common(&pbi->common);
+
+    pbi->common.current_video_frame = 0;
+    pbi->ready_for_new_data = 1;
+
+    /* vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
+     *  unnecessary calling of vp8cx_init_de_quantizer() for every frame.
+     */
+    vp8cx_init_de_quantizer(pbi);
+
+    vp8_loop_filter_init(&pbi->common);
+    /* End of the section that runs with the jump target armed. */
+    pbi->common.error.setjmp = 0;
+
+#if CONFIG_ERROR_CONCEALMENT
+    pbi->ec_enabled = oxcf->error_concealment;
+    pbi->overlaps = NULL;
+#else
+    (void)oxcf;
+    pbi->ec_enabled = 0;
+#endif
+    /* Error concealment is activated after a key frame has been
+     * decoded without errors when error concealment is enabled.
+     */
+    pbi->ec_active = 0;
+
+    pbi->decoded_key_frame = 0;
+
+    /* Independent partitions is activated when a frame updates the
+     * token probability table to have equal probabilities over the
+     * PREV_COEF context.
+     */
+    pbi->independent_partitions = 0;
+
+    vp8_setup_block_dptrs(&pbi->mb);
+    /* initialize_dec() is routed through once(); it also guards itself with init_done. */
+    once(initialize_dec);
+
+    return pbi;
+}
+/* Copy the last/golden/altref frame selected by ref_frame_flag into sd. */
+vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+    VP8_COMMON *cm = &pbi->common;
+    YV12_BUFFER_CONFIG *ref;
+    int ref_fb_idx;
+
+    /* Map the public flag onto the matching frame buffer index. */
+    switch (ref_frame_flag)
+    {
+    case VP8_LAST_FRAME: ref_fb_idx = cm->lst_fb_idx; break;
+    case VP8_GOLD_FRAME: ref_fb_idx = cm->gld_fb_idx; break;
+    case VP8_ALTR_FRAME: ref_fb_idx = cm->alt_fb_idx; break;
+    default:
+        vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+            "Invalid reference frame");
+        return pbi->common.error.error_code;
+    }
+    ref = &cm->yv12_fb[ref_fb_idx];
+
+    /* Copy only when the luma and chroma dimensions of both buffers agree. */
+    if (ref->y_height == sd->y_height && ref->y_width == sd->y_width &&
+        ref->uv_height == sd->uv_height && ref->uv_width == sd->uv_width)
+        vp8_yv12_copy_frame(ref, sd);
+    else
+        vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+            "Incorrect buffer dimensions");
+
+    return pbi->common.error.error_code;
+}
+
+/* Replace the reference selected by ref_frame_flag with a copy of sd in a free buffer. */
+vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+    VP8_COMMON *cm = &pbi->common;
+    int *ref_fb_ptr = NULL;
+    int free_fb;
+    /* Select the index variable of the reference being replaced. */
+    if (ref_frame_flag == VP8_LAST_FRAME)
+        ref_fb_ptr = &cm->lst_fb_idx;
+    else if (ref_frame_flag == VP8_GOLD_FRAME)
+        ref_fb_ptr = &cm->gld_fb_idx;
+    else if (ref_frame_flag == VP8_ALTR_FRAME)
+        ref_fb_ptr = &cm->alt_fb_idx;
+    else{
+        vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+            "Invalid reference frame");
+        return pbi->common.error.error_code;
+    }
+    /* sd must have the same dimensions as the reference it replaces. */
+    if(cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height ||
+        cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width ||
+        cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height ||
+        cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width){
+        vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+            "Incorrect buffer dimensions");
+    }
+    else{
+        /* Find an empty frame buffer. */
+        free_fb = get_free_fb(cm);
+        /* Decrease fb_idx_ref_cnt since it will be increased again in
+         * ref_cnt_fb() below. */
+        cm->fb_idx_ref_cnt[free_fb]--;
+
+        /* Manage the reference counters and copy image. */
+        ref_cnt_fb (cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb);
+        vp8_yv12_copy_frame(sd, &cm->yv12_fb[*ref_fb_ptr]);
+    }
+
+   return pbi->common.error.error_code;
+}
+/* Claim the first buffer whose ref count is 0; exhaustion is only caught by assert. */
+static int get_free_fb (VP8_COMMON *cm)
+{
+    int slot = 0;
+
+    while (slot < NUM_YV12_BUFFERS && cm->fb_idx_ref_cnt[slot] != 0)
+        ++slot;
+
+    assert(slot < NUM_YV12_BUFFERS);
+    cm->fb_idx_ref_cnt[slot] = 1;
+    return slot;
+}
+/* Point *idx at new_idx: release the old buffer (never below 0), retain the new. */
+static void ref_cnt_fb (int *buf, int *idx, int new_idx)
+{
+    const int old_idx = *idx;
+
+    if (buf[old_idx] > 0)
+        --buf[old_idx];
+    *idx = new_idx;
+    ++buf[new_idx];
+}
+
+/* Apply the signalled buffer copies/refreshes; returns 0, or -1 on a bad copy flag. */
+static int swap_frame_buffers (VP8_COMMON *cm)
+{
+    int err = 0;
+
+    /* The alternate reference frame or golden frame can be updated
+     *  using the new, last, or golden/alt ref frame.  If it
+     *  is updated using the newly decoded frame it is a refresh.
+     *  An update using the last or golden/alt ref frame is a copy.
+     */
+    if (cm->copy_buffer_to_arf)
+    {
+        int new_fb = 0;
+        /* 1 copies from the last frame, 2 from the golden frame. */
+        if (cm->copy_buffer_to_arf == 1)
+            new_fb = cm->lst_fb_idx;
+        else if (cm->copy_buffer_to_arf == 2)
+            new_fb = cm->gld_fb_idx;
+        else
+            err = -1;
+        /* NOTE(review): on the error path new_fb is still 0, so alt_fb_idx moves to 0. */
+        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);
+    }
+
+    if (cm->copy_buffer_to_gf)
+    {
+        int new_fb = 0;
+        /* 1 copies from the last frame, 2 from the alt ref frame. */
+        if (cm->copy_buffer_to_gf == 1)
+            new_fb = cm->lst_fb_idx;
+        else if (cm->copy_buffer_to_gf == 2)
+            new_fb = cm->alt_fb_idx;
+        else
+            err = -1;
+        /* NOTE(review): on the error path new_fb is still 0, so gld_fb_idx moves to 0. */
+        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);
+    }
+    /* Refreshes re-point the reference at the newly decoded buffer. */
+    if (cm->refresh_golden_frame)
+        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);
+
+    if (cm->refresh_alt_ref_frame)
+        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);
+
+    if (cm->refresh_last_frame)
+    {
+        ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);
+
+        cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];
+    }
+    else
+        cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+    /* Give back the count new_fb_idx received when get_free_fb() claimed it. */
+    cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+    return err;
+}
+/* Returns 0 for empty input while EC is inactive (last ref marked corrupt), else 1. */
+static int check_fragments_for_errors(VP8D_COMP *pbi)
+{
+    if (!pbi->ec_active &&
+        pbi->fragments.count <= 1 && pbi->fragments.sizes[0] == 0)
+    {
+        VP8_COMMON *cm = &pbi->common;
+
+        /* If error concealment is disabled we won't signal missing frames
+         * to the decoder.
+         */
+        if (cm->fb_idx_ref_cnt[cm->lst_fb_idx] > 1)
+        {
+            /* The last reference shares buffer with another reference
+             * buffer. Move it to its own buffer before setting it as
+             * corrupt, otherwise we will make multiple buffers corrupt.
+             */
+            const int prev_idx = cm->lst_fb_idx;
+            cm->fb_idx_ref_cnt[prev_idx]--;
+            cm->lst_fb_idx = get_free_fb(cm);
+            vp8_yv12_copy_frame(&cm->yv12_fb[prev_idx],
+                                    &cm->yv12_fb[cm->lst_fb_idx]);
+        }
+        /* This is used to signal that we are missing frames.
+         * We do not know if the missing frame(s) was supposed to update
+         * any of the reference buffers, but we act conservative and
+         * mark only the last buffer as corrupted.
+         */
+        cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+        /* Signal that we have no frame to show. */
+        cm->show_frame = 0;
+
+        /* Nothing more to do. */
+        return 0;
+    }
+
+    return 1;
+}
+/* Decode one frame via vp8_decode_frame(pbi); size and source are not read here. */
+int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
+                                  const uint8_t *source,
+                                  int64_t time_stamp)
+{
+    VP8_COMMON *cm = &pbi->common;
+    int retcode = -1;
+    (void)size;
+    (void)source;
+
+    pbi->common.error.error_code = VPX_CODEC_OK;
+
+    retcode = check_fragments_for_errors(pbi);
+    if(retcode <= 0)
+        return retcode;
+    /* get_free_fb() returns the slot with its ref count already set to 1. */
+    cm->new_fb_idx = get_free_fb (cm);
+
+    /* setup reference frames for vp8_decode_frame */
+    pbi->dec_fb_ref[INTRA_FRAME]  = &cm->yv12_fb[cm->new_fb_idx];
+    pbi->dec_fb_ref[LAST_FRAME]   = &cm->yv12_fb[cm->lst_fb_idx];
+    pbi->dec_fb_ref[GOLDEN_FRAME] = &cm->yv12_fb[cm->gld_fb_idx];
+    pbi->dec_fb_ref[ALTREF_FRAME] = &cm->yv12_fb[cm->alt_fb_idx];
+    /* Landing pad for a longjmp to error.jmp (presumably from vpx_internal_error()). */
+    if (setjmp(pbi->common.error.jmp))
+    {
+       /* We do not know if the missing frame(s) was supposed to update
+        * any of the reference buffers, but we act conservative and
+        * mark only the last buffer as corrupted.
+        */
+        cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+        if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+          cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+        goto decode_exit;
+    }
+
+    pbi->common.error.setjmp = 1;
+
+    retcode = vp8_decode_frame(pbi);
+
+    if (retcode < 0)
+    {
+        if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+          cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+        pbi->common.error.error_code = VPX_CODEC_ERROR;
+        goto decode_exit;
+    }
+    /* NOTE(review): a swap failure sets error_code but leaves retcode >= 0. */
+    if (swap_frame_buffers (cm))
+    {
+        pbi->common.error.error_code = VPX_CODEC_ERROR;
+        goto decode_exit;
+    }
+
+    vp8_clear_system_state();
+
+    if (cm->show_frame)
+    {
+        cm->current_video_frame++;
+        cm->show_frame_mi = cm->mi;
+    }
+
+    #if CONFIG_ERROR_CONCEALMENT
+    /* swap the mode infos to storage for future error concealment */
+    if (pbi->ec_enabled && pbi->common.prev_mi)
+    {
+        MODE_INFO* tmp = pbi->common.prev_mi;
+        int row, col;
+        pbi->common.prev_mi = pbi->common.mi;
+        pbi->common.mi = tmp;
+
+        /* Propagate the segment_ids to the next frame */
+        for (row = 0; row < pbi->common.mb_rows; ++row)
+        {
+            for (col = 0; col < pbi->common.mb_cols; ++col)
+            {
+                const int i = row*pbi->common.mode_info_stride + col;
+                pbi->common.mi[i].mbmi.segment_id =
+                        pbi->common.prev_mi[i].mbmi.segment_id;
+            }
+        }
+    }
+#endif
+    /* A decoded frame is now pending for vp8dx_get_raw_frame(). */
+    pbi->ready_for_new_data = 0;
+    pbi->last_time_stamp = time_stamp;
+
+decode_exit:
+    pbi->common.error.setjmp = 0;
+    vp8_clear_system_state();
+    return retcode;
+}
+int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)
+{
+    int ret = -1;
+    /* Hands the pending frame out once; -1 if none is pending or it is not shown. */
+    if (pbi->ready_for_new_data == 1)
+        return ret;
+
+    /* ie no raw frame to show!!! */
+    if (pbi->common.show_frame == 0)
+        return ret;
+
+    pbi->ready_for_new_data = 1;
+    *time_stamp = pbi->last_time_stamp;
+    *time_end_stamp = 0;
+
+#if CONFIG_POSTPROC
+    ret = vp8_post_proc_frame(&pbi->common, sd, flags);
+#else
+    (void)flags;
+    /* NOTE(review): uv_width keeps the value copied from frame_to_show - confirm. */
+    if (pbi->common.frame_to_show)
+    {
+        *sd = *pbi->common.frame_to_show;
+        sd->y_width = pbi->common.Width;
+        sd->y_height = pbi->common.Height;
+        sd->uv_height = pbi->common.Height / 2;
+        ret = 0;
+    }
+    else
+    {
+        ret = -1;
+    }
+
+#endif /*!CONFIG_POSTPROC*/
+    vp8_clear_system_state();
+    return ret;
+}
+
+
+/* Report whether any macroblock reachable through oci->mi predicts from
+ * ref_frame: returns 1 on the first match and 0 when there is none. The
+ * scan is not decoder specific, but the encoder has much faster ways of
+ * computing this, so it is fine for it to live in a decode specific file. */
+int vp8dx_references_buffer( VP8_COMMON *oci, int ref_frame )
+{
+    const MODE_INFO *row_mi = oci->mi;
+    int row;
+
+    for (row = 0; row < oci->mb_rows; ++row)
+    {
+        int col;
+
+        for (col = 0; col < oci->mb_cols; ++col)
+            if (row_mi[col].mbmi.ref_frame == ref_frame)
+                return 1;
+        /* Each row is followed by one extra entry that is not inspected. */
+        row_mi += oci->mb_cols + 1;
+    }
+    return 0;
+}
+/* Create fb->pbi[0] (plus row threads if CONFIG_MULTITHREAD); frame threads are TODO. */
+int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf)
+{
+    if(!fb->use_frame_threads)
+    {
+        /* decoder instance for single thread mode */
+        fb->pbi[0] = create_decompressor(oxcf);
+        if(!fb->pbi[0])
+            return VPX_CODEC_ERROR;
+
+#if CONFIG_MULTITHREAD
+        /* enable row-based threading only when use_frame_threads
+         * is disabled */
+        fb->pbi[0]->max_threads = oxcf->max_threads;
+        vp8_decoder_create_threads(fb->pbi[0]);
+#endif
+    }
+    else
+    {
+        /* TODO : create frame threads and decoder instances for each
+         * thread here */
+    }
+
+    return VPX_CODEC_OK;
+}
+/* Tear down fb->pbi[0] and its threads; VPX_CODEC_ERROR if fb->pbi[0] is NULL. */
+int vp8_remove_decoder_instances(struct frame_buffers *fb)
+{
+    if(!fb->use_frame_threads)
+    {
+        VP8D_COMP *pbi = fb->pbi[0];
+
+        if (!pbi)
+            return VPX_CODEC_ERROR;
+#if CONFIG_MULTITHREAD
+        if (pbi->b_multithreaded_rd)
+            vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
+        vp8_decoder_remove_threads(pbi);
+#endif
+
+        /* decoder instance for single thread mode */
+        remove_decompressor(pbi);
+    }
+    else
+    {
+        /* TODO : remove frame threads and decoder instances for each
+         * thread here */
+    }
+
+    return VPX_CODEC_OK;
+}
diff --git a/libs/libvpx/vp8/decoder/onyxd_int.h b/libs/libvpx/vp8/decoder/onyxd_int.h
new file mode 100644
index 0000000000..313fe01c07
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/onyxd_int.h
@@ -0,0 +1,161 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_DECODER_ONYXD_INT_H_
+#define VP8_DECODER_ONYXD_INT_H_
+
+#include "vpx_config.h"
+#include "vp8/common/onyxd.h"
+#include "treereader.h"
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/threading.h"
+
+#if CONFIG_ERROR_CONCEALMENT
+#include "ec_types.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Thread index plus two untyped pointers; presumably a per-thread argument - confirm. */
+typedef struct
+{
+    int ithread;
+    void *ptr1;
+    void *ptr2;
+} DECODETHREAD_DATA;
+/* Wraps one MACROBLOCKD; setup_decoding_thread_data() fills an array of these. */
+typedef struct
+{
+    MACROBLOCKD  mbd;
+} MB_ROW_DEC;
+
+/* Input data as `count` fragments, each a pointer and size (at most MAX_PARTITIONS). */
+typedef struct
+{
+    int enabled;
+    unsigned int count;
+    const unsigned char *ptrs[MAX_PARTITIONS];
+    unsigned int sizes[MAX_PARTITIONS];
+} FRAGMENT_DATA;
+
+#define MAX_FB_MT_DEC 32
+/* Holder for decoder instances; only pbi[0] is used while frame threading is off. */
+struct frame_buffers
+{
+    /*
+     * this struct will be populated with frame buffer management
+     * info in future commits. */
+
+    /* enable/disable frame-based threading */
+    int     use_frame_threads;
+
+    /* decoder instances */
+    struct VP8D_COMP *pbi[MAX_FB_MT_DEC];
+
+};
+/* State of one VP8 decoder instance. */
+typedef struct VP8D_COMP
+{
+    DECLARE_ALIGNED(16, MACROBLOCKD, mb);
+    /* Buffers indexed by INTRA/LAST/GOLDEN/ALTREF_FRAME; re-pointed for every frame. */
+    YV12_BUFFER_CONFIG *dec_fb_ref[NUM_YV12_BUFFERS];
+
+    DECLARE_ALIGNED(16, VP8_COMMON, common);
+
+    /* the last partition will be used for the modes/mvs */
+    vp8_reader mbc[MAX_PARTITIONS];
+
+    VP8D_CONFIG oxcf;
+
+    FRAGMENT_DATA fragments;
+
+#if CONFIG_MULTITHREAD
+    /* variable for threading */
+
+    int b_multithreaded_rd;
+    int max_threads;
+    int current_mb_col_main;
+    unsigned int decoding_thread_count;
+    int allocated_decoding_thread_count;
+
+    int mt_baseline_filter_level[MAX_MB_SEGMENTS];
+    int sync_range;
+    int *mt_current_mb_col;                  /* Each row remembers its already decoded column. */
+    pthread_mutex_t *pmutex;
+    pthread_mutex_t mt_mutex;                /* mutex for b_multithreaded_rd */
+
+    unsigned char **mt_yabove_row;           /* mb_rows x width */
+    unsigned char **mt_uabove_row;
+    unsigned char **mt_vabove_row;
+    unsigned char **mt_yleft_col;            /* mb_rows x 16 */
+    unsigned char **mt_uleft_col;            /* mb_rows x 8 */
+    unsigned char **mt_vleft_col;            /* mb_rows x 8 */
+
+    MB_ROW_DEC           *mb_row_di;
+    DECODETHREAD_DATA    *de_thread_data;
+
+    pthread_t           *h_decoding_thread;
+    sem_t               *h_event_start_decoding;
+    sem_t                h_event_end_decoding;
+    /* end of threading data */
+#endif
+    /* Time stamp of the last decoded frame; ready_for_new_data is 0 while it is pending. */
+    int64_t last_time_stamp;
+    int   ready_for_new_data;
+
+    vp8_prob prob_intra;
+    vp8_prob prob_last;
+    vp8_prob prob_gf;
+    vp8_prob prob_skip_false;
+
+#if CONFIG_ERROR_CONCEALMENT
+    MB_OVERLAP *overlaps;
+    /* the mb num from which modes and mvs (first partition) are corrupt */
+    unsigned int mvs_corrupt_from_mb;
+#endif
+    int ec_enabled;
+    int ec_active;
+    int decoded_key_frame;
+    int independent_partitions;
+    int frame_corrupt_residual;
+
+    vpx_decrypt_cb decrypt_cb;
+    void *decrypt_state;
+} VP8D_COMP;
+
+int vp8_decode_frame(VP8D_COMP *cpi);
+
+int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf);
+int vp8_remove_decoder_instances(struct frame_buffers *fb);
+/* lval = expr; a null result is reported as VPX_CODEC_MEM_ERROR. Needs `pbi` in scope. */
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+        lval = (expr); \
+        if(!lval) \
+            vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+                               "Failed to allocate "#lval" at %s:%d", \
+                               __FILE__,__LINE__);\
+    } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+        lval = (expr); \
+        if(!lval) \
+            vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+                               "Failed to allocate "#lval);\
+    } while(0)
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_DECODER_ONYXD_INT_H_
diff --git a/libs/libvpx/vp8/decoder/threading.c b/libs/libvpx/vp8/decoder/threading.c
new file mode 100644
index 0000000000..97979e3b2f
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/threading.c
@@ -0,0 +1,931 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
+# include <unistd.h>
+#endif
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/threading.h"
+
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/extend.h"
+#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/setupintrarecon.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+/* Array allocators for n elements of *p; failures are reported by CHECK_MEM_ERROR. */
+#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n)))
+#define CALLOC_ARRAY_ALIGNED(p, n, algn) do {                      \
+  CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n)));  \
+  memset((p), 0, (n) * sizeof(*(p)));                              \
+} while (0)
+
+
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
+/* Seed `count` per-thread MACROBLOCKDs from xd and set every row's progress to -1. */
+static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
+{
+    VP8_COMMON *const pc = & pbi->common;
+    int i;
+
+    for (i = 0; i < count; i++)
+    {
+        MACROBLOCKD *mbd = &mbrd[i].mbd;
+        mbd->subpixel_predict        = xd->subpixel_predict;
+        mbd->subpixel_predict8x4     = xd->subpixel_predict8x4;
+        mbd->subpixel_predict8x8     = xd->subpixel_predict8x8;
+        mbd->subpixel_predict16x16   = xd->subpixel_predict16x16;
+
+        mbd->frame_type = pc->frame_type;
+        mbd->pre = xd->pre;
+        mbd->dst = xd->dst;
+
+        mbd->segmentation_enabled    = xd->segmentation_enabled;
+        mbd->mb_segement_abs_delta     = xd->mb_segement_abs_delta;
+        memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
+
+        /*signed char ref_lf_deltas[MAX_REF_LF_DELTAS];*/
+        memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
+        /*signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];*/
+        memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
+        /*unsigned char mode_ref_lf_delta_enabled;
+        unsigned char mode_ref_lf_delta_update;*/
+        mbd->mode_ref_lf_delta_enabled    = xd->mode_ref_lf_delta_enabled;
+        mbd->mode_ref_lf_delta_update    = xd->mode_ref_lf_delta_update;
+        /* Every thread starts on the first bool decoder, mbc[0]. */
+        mbd->current_bc = &pbi->mbc[0];
+
+        memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+        memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+        memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+        memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+        /* All ones, or with the low three bits cleared when pc->full_pixel is set. */
+        mbd->fullpixel_mask = 0xffffffff;
+
+        if (pc->full_pixel)
+            mbd->fullpixel_mask = 0xfffffff8;
+
+    }
+    /* -1 presumably means no column of that row has been decoded yet - confirm. */
+    for (i = 0; i < pc->mb_rows; i++)
+        pbi->mt_current_mb_col[i] = -1;
+}
+/* Decode one MB on the row-threaded path: tokens, prediction, then residual. */
+static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
+                                 unsigned int mb_idx)
+{
+    MB_PREDICTION_MODE mode;
+    int i;
+#if CONFIG_ERROR_CONCEALMENT
+    int corruption_detected = 0;
+#else
+    (void)mb_idx;
+#endif
+    /* Skipped MBs only reset the token context; tokens are not read after a reader error. */
+    if (xd->mode_info_context->mbmi.mb_skip_coeff)
+    {
+        vp8_reset_mb_tokens_context(xd);
+    }
+    else if (!vp8dx_bool_error(xd->current_bc))
+    {
+        int eobtotal;
+        eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+        /* Special case:  Force the loopfilter to skip when eobtotal is zero */
+        xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
+    }
+
+    mode = xd->mode_info_context->mbmi.mode;
+
+    if (xd->segmentation_enabled)
+        vp8_mb_init_dequantizer(pbi, xd);
+
+
+#if CONFIG_ERROR_CONCEALMENT
+    /* With EC active, discard the residual of MBs flagged as corrupt below. */
+    if(pbi->ec_active)
+    {
+        int throw_residual;
+        /* When we have independent partitions we can apply residual even
+         * though other partitions within the frame are corrupt.
+         */
+        throw_residual = (!pbi->independent_partitions &&
+                          pbi->frame_corrupt_residual);
+        throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
+
+        if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
+        {
+            /* MB with corrupt residuals or corrupt mode/motion vectors.
+             * Better to use the predictor as reconstruction.
+             */
+            pbi->frame_corrupt_residual = 1;
+            memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+            vp8_conceal_corrupt_mb(xd);
+
+
+            corruption_detected = 1;
+
+            /* force idct to be skipped for B_PRED and use the
+             * prediction only for reconstruction
+             * */
+            memset(xd->eobs, 0, 25);
+        }
+    }
+#endif
+
+    /* do prediction */
+    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    {
+        vp8_build_intra_predictors_mbuv_s(xd,
+                                          xd->recon_above[1],
+                                          xd->recon_above[2],
+                                          xd->recon_left[1],
+                                          xd->recon_left[2],
+                                          xd->recon_left_stride[1],
+                                          xd->dst.u_buffer, xd->dst.v_buffer,
+                                          xd->dst.uv_stride);
+
+        if (mode != B_PRED)
+        {
+            vp8_build_intra_predictors_mby_s(xd,
+                                                 xd->recon_above[0],
+                                                 xd->recon_left[0],
+                                                 xd->recon_left_stride[0],
+                                                 xd->dst.y_buffer,
+                                                 xd->dst.y_stride);
+        }
+        else
+        {
+            short *DQC = xd->dequant_y1;
+            int dst_stride = xd->dst.y_stride;
+
+            /* clear out residual eob info */
+            if(xd->mode_info_context->mbmi.mb_skip_coeff)
+                memset(xd->eobs, 0, 25);
+
+            intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
+
+            for (i = 0; i < 16; i++)
+            {
+                BLOCKD *b = &xd->block[i];
+                unsigned char *dst = xd->dst.y_buffer + b->offset;
+                B_PREDICTION_MODE b_mode =
+                    xd->mode_info_context->bmi[i].as_mode;
+                unsigned char *Above;
+                unsigned char *yleft;
+                int left_stride;
+                unsigned char top_left;
+
+                /*Caution: For some b_mode, it needs 8 pixels (4 above + 4 above-right).*/
+                if (i < 4 && pbi->common.filter_level)
+                    Above = xd->recon_above[0] + b->offset;
+                else
+                    Above = dst - dst_stride;
+
+                if (i%4==0 && pbi->common.filter_level)
+                {
+                    yleft = xd->recon_left[0] + i;
+                    left_stride = 1;
+                }
+                else
+                {
+                    yleft = dst - 1;
+                    left_stride = dst_stride;
+                }
+
+                if ((i==4 || i==8 || i==12) && pbi->common.filter_level)
+                    top_left = *(xd->recon_left[0] + i - 1);
+                else
+                    top_left = Above[-1];
+
+                vp8_intra4x4_predict(Above, yleft, left_stride,
+                                     b_mode, dst, dst_stride, top_left);
+
+                if (xd->eobs[i] )
+                {
+                    if (xd->eobs[i] > 1)
+                    {
+                        vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride);
+                    }
+                    else
+                    {
+                        vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0],
+                                             dst, dst_stride, dst, dst_stride);
+                        memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+                    }
+                }
+            }
+        }
+    }
+    else
+    {
+        vp8_build_inter_predictors_mb(xd);
+    }
+
+    /* Concealed MBs keep the prediction only; the residual stage is skipped. */
+#if CONFIG_ERROR_CONCEALMENT
+    if (corruption_detected)
+    {
+        return;
+    }
+#endif
+
+    if(!xd->mode_info_context->mbmi.mb_skip_coeff)
+    {
+        /* dequantization and idct */
+        if (mode != B_PRED)
+        {
+            short *DQC = xd->dequant_y1;
+
+            if (mode != SPLITMV)
+            {
+                BLOCKD *b = &xd->block[24];
+
+                /* do 2nd order transform on the dc block */
+                if (xd->eobs[24] > 1)
+                {
+                    vp8_dequantize_b(b, xd->dequant_y2);
+
+                    vp8_short_inv_walsh4x4(&b->dqcoeff[0],
+                        xd->qcoeff);
+                    memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
+                }
+                else
+                {
+                    b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
+                    vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
+                        xd->qcoeff);
+                    memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+                }
+
+                /* override the dc dequant constant in order to preserve the
+                 * dc components
+                 */
+                DQC = xd->dequant_y1_dc;
+            }
+
+            vp8_dequant_idct_add_y_block
+                            (xd->qcoeff, DQC,
+                             xd->dst.y_buffer,
+                             xd->dst.y_stride, xd->eobs);
+        }
+
+        vp8_dequant_idct_add_uv_block
+                        (xd->qcoeff+16*16, xd->dequant_uv,
+                         xd->dst.u_buffer, xd->dst.v_buffer,
+                         xd->dst.uv_stride, xd->eobs+16);
+    }
+}
+
+/* Decode MB rows start_mb_row, start_mb_row + (decoding_thread_count + 1), ...
+ * of the current frame. Row progress is published through mt_current_mb_col[]
+ * (guarded by pmutex[]); whoever decodes the last row posts h_event_end_decoding. */
+static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
+{
+    const int *last_row_current_mb_col;
+    int *current_mb_col;
+    int mb_row;
+    VP8_COMMON *pc = &pbi->common;
+    const int nsync = pbi->sync_range;
+    const int first_row_no_sync_above = pc->mb_cols + nsync; /* stand-in for row 0 */
+    int num_part = 1 << pbi->common.multi_token_partition;
+    int last_mb_row = start_mb_row;
+
+    YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+    YV12_BUFFER_CONFIG *yv12_fb_lst = pbi->dec_fb_ref[LAST_FRAME];
+
+    int recon_y_stride = yv12_fb_new->y_stride;
+    int recon_uv_stride = yv12_fb_new->uv_stride;
+
+    unsigned char *ref_buffer[MAX_REF_FRAMES][3];
+    unsigned char *dst_buffer[3];
+    int i;
+    int ref_fb_corrupted[MAX_REF_FRAMES];
+
+    ref_fb_corrupted[INTRA_FRAME] = 0;
+
+    for(i = 1; i < MAX_REF_FRAMES; i++)
+    {
+        YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i];
+
+        ref_buffer[i][0] = this_fb->y_buffer;
+        ref_buffer[i][1] = this_fb->u_buffer;
+        ref_buffer[i][2] = this_fb->v_buffer;
+
+        ref_fb_corrupted[i] = this_fb->corrupted;
+    }
+
+    dst_buffer[0] = yv12_fb_new->y_buffer;
+    dst_buffer[1] = yv12_fb_new->u_buffer;
+    dst_buffer[2] = yv12_fb_new->v_buffer;
+
+    xd->up_available = (start_mb_row != 0);
+
+    xd->mode_info_context = pc->mi + pc->mode_info_stride * start_mb_row;
+    xd->mode_info_stride = pc->mode_info_stride;
+
+    for (mb_row = start_mb_row; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
+    {
+       int recon_yoffset, recon_uvoffset;
+       int mb_col;
+       int filter_level;
+       loop_filter_info_n *lfi_n = &pc->lf_info;
+
+       /* save last row processed by this thread */
+       last_mb_row = mb_row;
+       /* select bool coder for current partition */
+       xd->current_bc =  &pbi->mbc[mb_row%num_part];
+
+       if (mb_row > 0)
+           last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row -1];
+       else
+           last_row_current_mb_col = &first_row_no_sync_above;
+
+       current_mb_col = &pbi->mt_current_mb_col[mb_row];
+
+       recon_yoffset = mb_row * recon_y_stride * 16;
+       recon_uvoffset = mb_row * recon_uv_stride * 8;
+
+       /* reset contexts */
+       xd->above_context = pc->above_context;
+       memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+
+       xd->left_available = 0;
+
+       /* negate after shifting: left-shifting a negative int is undefined */
+       xd->mb_to_top_edge = -((mb_row * 16) << 3);
+       xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+       if (pbi->common.filter_level)
+       {
+          xd->recon_above[0] = pbi->mt_yabove_row[mb_row] + 0*16 +32;
+          xd->recon_above[1] = pbi->mt_uabove_row[mb_row] + 0*8 +16;
+          xd->recon_above[2] = pbi->mt_vabove_row[mb_row] + 0*8 +16;
+
+          xd->recon_left[0] = pbi->mt_yleft_col[mb_row];
+          xd->recon_left[1] = pbi->mt_uleft_col[mb_row];
+          xd->recon_left[2] = pbi->mt_vleft_col[mb_row];
+
+          /* TODO: move to outside row loop */
+          xd->recon_left_stride[0] = 1;
+          xd->recon_left_stride[1] = 1;
+       }
+       else
+       {
+          xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
+          xd->recon_above[1] = dst_buffer[1] + recon_uvoffset;
+          xd->recon_above[2] = dst_buffer[2] + recon_uvoffset;
+
+          xd->recon_left[0] = xd->recon_above[0] - 1;
+          xd->recon_left[1] = xd->recon_above[1] - 1;
+          xd->recon_left[2] = xd->recon_above[2] - 1;
+
+          xd->recon_above[0] -= xd->dst.y_stride;
+          xd->recon_above[1] -= xd->dst.uv_stride;
+          xd->recon_above[2] -= xd->dst.uv_stride;
+
+          /* TODO: move to outside row loop */
+          xd->recon_left_stride[0] = xd->dst.y_stride;
+          xd->recon_left_stride[1] = xd->dst.uv_stride;
+
+          setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1],
+                                 xd->recon_left[2], xd->dst.y_stride,
+                                 xd->dst.uv_stride);
+       }
+
+       for (mb_col = 0; mb_col < pc->mb_cols; mb_col++) {
+           if (((mb_col - 1) % nsync) == 0) { /* publish this row's progress */
+               pthread_mutex_t *mutex = &pbi->pmutex[mb_row];
+               protected_write(mutex, current_mb_col, mb_col - 1);
+           }
+
+           if (mb_row && !(mb_col & (nsync - 1))) { /* sync with the row above */
+               pthread_mutex_t *mutex = &pbi->pmutex[mb_row-1];
+               sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
+           }
+
+           /* Distance of MB to the various image edges.
+            * These are specified to 8th pel as they are always
+            * compared to values that are in 1/8th pel units.
+            */
+           xd->mb_to_left_edge = -((mb_col * 16) << 3);
+           xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+    #if CONFIG_ERROR_CONCEALMENT
+           {
+               int corrupt_residual =
+                           (!pbi->independent_partitions &&
+                           pbi->frame_corrupt_residual) ||
+                           vp8dx_bool_error(xd->current_bc);
+               if (pbi->ec_active &&
+                   (xd->mode_info_context->mbmi.ref_frame ==
+                                                    INTRA_FRAME) &&
+                   corrupt_residual)
+               {
+                   /* We have an intra block with corrupt
+                    * coefficients, better to conceal with an inter
+                    * block.
+                    * Interpolate MVs from neighboring MBs
+                    *
+                    * Note that for the first mb with corrupt
+                    * residual in a frame, we might not discover
+                    * that before decoding the residual. That
+                    * happens after this check, and therefore no
+                    * inter concealment will be done.
+                    */
+                   vp8_interpolate_motion(xd,
+                                          mb_row, mb_col,
+                                          pc->mb_rows, pc->mb_cols,
+                                          pc->mode_info_stride);
+               }
+           }
+    #endif
+
+           xd->dst.y_buffer = dst_buffer[0] + recon_yoffset;
+           xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
+           xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
+
+           xd->pre.y_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
+           xd->pre.u_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
+           xd->pre.v_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
+
+           /* propagate errors from reference frames */
+           xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
+
+           mt_decode_macroblock(pbi, xd, 0);
+
+           xd->left_available = 1;
+
+           /* check if the boolean decoder has suffered an error */
+           xd->corrupted |= vp8dx_bool_error(xd->current_bc);
+
+           xd->recon_above[0] += 16;
+           xd->recon_above[1] += 8;
+           xd->recon_above[2] += 8;
+
+           if (!pbi->common.filter_level)
+           {
+              xd->recon_left[0] += 16;
+              xd->recon_left[1] += 8;
+              xd->recon_left[2] += 8;
+           }
+
+           if (pbi->common.filter_level)
+           {
+               int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
+                               xd->mode_info_context->mbmi.mode != SPLITMV &&
+                               xd->mode_info_context->mbmi.mb_skip_coeff);
+
+               const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode];
+               const int seg = xd->mode_info_context->mbmi.segment_id;
+               const int ref_frame = xd->mode_info_context->mbmi.ref_frame;
+
+               filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+               if( mb_row != pc->mb_rows-1 )
+               {
+                   /* Save decoded MB last row data for next-row decoding */
+                   memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
+                   memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
+                   memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
+               }
+
+               /* save left_col for next MB decoding */
+               if(mb_col != pc->mb_cols-1)
+               {
+                   MODE_INFO *next = xd->mode_info_context +1;
+
+                   if (next->mbmi.ref_frame == INTRA_FRAME)
+                   {
+                       for (i = 0; i < 16; i++)
+                           pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15];
+                       for (i = 0; i < 8; i++)
+                       {
+                           pbi->mt_uleft_col[mb_row][i] = xd->dst.u_buffer [i* recon_uv_stride + 7];
+                           pbi->mt_vleft_col[mb_row][i] = xd->dst.v_buffer [i* recon_uv_stride + 7];
+                       }
+                   }
+               }
+
+               /* loopfilter on this macroblock. */
+               if (filter_level)
+               {
+                   if(pc->filter_type == NORMAL_LOOPFILTER)
+                   {
+                       loop_filter_info lfi;
+                       FRAME_TYPE frame_type = pc->frame_type;
+                       const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                       lfi.mblim = lfi_n->mblim[filter_level];
+                       lfi.blim = lfi_n->blim[filter_level];
+                       lfi.lim = lfi_n->lim[filter_level];
+                       lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+                       if (mb_col > 0)
+                           vp8_loop_filter_mbv
+                           (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+                       if (!skip_lf)
+                           vp8_loop_filter_bv
+                           (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+                       /* don't apply across umv border */
+                       if (mb_row > 0)
+                           vp8_loop_filter_mbh
+                           (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+                       if (!skip_lf)
+                           vp8_loop_filter_bh
+                           (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer,  recon_y_stride, recon_uv_stride, &lfi);
+                   }
+                   else
+                   {
+                       if (mb_col > 0)
+                           vp8_loop_filter_simple_mbv
+                           (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+                       if (!skip_lf)
+                           vp8_loop_filter_simple_bv
+                           (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+
+                       /* don't apply across umv border */
+                       if (mb_row > 0)
+                           vp8_loop_filter_simple_mbh
+                           (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+                       if (!skip_lf)
+                           vp8_loop_filter_simple_bh
+                           (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+                   }
+               }
+           }
+
+           recon_yoffset += 16;
+           recon_uvoffset += 8;
+
+           ++xd->mode_info_context;  /* next mb */
+
+           xd->above_context++;
+       }
+
+       /* adjust to the next row of mbs */
+       if (pbi->common.filter_level)
+       {
+           if(mb_row != pc->mb_rows-1)
+           {
+               int lasty = yv12_fb_lst->y_width + VP8BORDERINPIXELS;
+               int lastuv = (yv12_fb_lst->y_width>>1) + (VP8BORDERINPIXELS>>1);
+
+               /* replicate the last saved pixel 4 bytes past the end of the row */
+               for (i = 0; i < 4; i++)
+               {
+                   pbi->mt_yabove_row[mb_row +1][lasty + i] = pbi->mt_yabove_row[mb_row +1][lasty -1];
+                   pbi->mt_uabove_row[mb_row +1][lastuv + i] = pbi->mt_uabove_row[mb_row +1][lastuv -1];
+                   pbi->mt_vabove_row[mb_row +1][lastuv + i] = pbi->mt_vabove_row[mb_row +1][lastuv -1];
+               }
+           }
+       }
+       else
+           vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16,
+                             xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+
+       /* last MB of row is ready just after extension is done */
+       protected_write(&pbi->pmutex[mb_row], current_mb_col, mb_col + nsync);
+
+       ++xd->mode_info_context;      /* skip prediction column */
+       xd->up_available = 1;
+
+       /* since we have multithread */
+       xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
+    }
+
+    /* signal end of frame decoding if this thread processed the last mb_row */
+    if (last_mb_row == (pc->mb_rows - 1))
+        sem_post(&pbi->h_event_end_decoding);
+}
+
+
+/* Worker entry point: sleep on the start semaphore, decode its rows, repeat. */
+static THREAD_FUNCTION thread_decoding_proc(void *p_data)
+{
+    DECODETHREAD_DATA *const args = (DECODETHREAD_DATA *)p_data;
+    const int ithread = args->ithread;
+    VP8D_COMP *pbi = (VP8D_COMP *)args->ptr1;
+    MB_ROW_DEC *mbrd = (MB_ROW_DEC *)args->ptr2;
+    ENTROPY_CONTEXT_PLANES mb_row_left_context;
+    MACROBLOCKD *const xd = &mbrd->mbd;
+
+    /* run until the decoder clears b_multithreaded_rd */
+    while (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) != 0)
+    {
+        /* a failed wait simply re-tests the flag */
+        if (sem_wait(&pbi->h_event_start_decoding[ithread]) != 0)
+            continue;
+
+        /* the flag may have been cleared while this thread was asleep */
+        if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0)
+            break;
+
+        /* the left entropy context is a local of this thread function */
+        xd->left_context = &mb_row_left_context;
+        mt_decode_mb_rows(pbi, xd, ithread+1);
+    }
+
+    return 0 ;
+}
+
+
+/* Set up the decoder's threading state and start the MB-row worker threads. */
+void vp8_decoder_create_threads(VP8D_COMP *pbi)
+{
+    int core_count;
+    unsigned int t;
+
+    pbi->b_multithreaded_rd = 0;
+    pbi->allocated_decoding_thread_count = 0;
+    pthread_mutex_init(&pbi->mt_mutex, NULL);
+
+    /* limit decoding threads to the max number of token partitions ... */
+    core_count = (pbi->max_threads <= 8) ? pbi->max_threads : 8;
+    /* ... and to the available cores */
+    if (pbi->common.processor_core_count < core_count)
+        core_count = pbi->common.processor_core_count;
+
+    /* fewer than two usable threads: leave multithreading switched off */
+    if (core_count <= 1)
+        return;
+
+    pbi->b_multithreaded_rd = 1;
+    pbi->decoding_thread_count = core_count - 1;
+
+    CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count);
+    CALLOC_ARRAY(pbi->h_event_start_decoding, pbi->decoding_thread_count);
+    CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
+    CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);
+
+    for (t = 0; t < pbi->decoding_thread_count; ++t)
+    {
+        sem_init(&pbi->h_event_start_decoding[t], 0, 0);
+        vp8_setup_block_dptrs(&pbi->mb_row_di[t].mbd);
+
+        /* argument block handed to thread_decoding_proc */
+        pbi->de_thread_data[t].ithread = t;
+        pbi->de_thread_data[t].ptr1 = (void *)pbi;
+        pbi->de_thread_data[t].ptr2 = (void *) &pbi->mb_row_di[t];
+        pthread_create(&pbi->h_decoding_thread[t], 0, thread_decoding_proc,
+                       &pbi->de_thread_data[t]);
+    }
+
+    sem_init(&pbi->h_event_end_decoding, 0, 0);
+    pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
+}
+
+
+/* Release everything vp8mt_alloc_temp_buffers() set up for multithreaded
+ * decoding: the per-row mutexes, the per-row progress counters and the
+ * above-row / left-column scratch buffers. mb_rows must be the row count
+ * those buffers were allocated for. */
+void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
+{
+    int row;
+
+    /* nothing to release unless multithreaded decoding is switched on */
+    if (!protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd))
+        return;
+
+    /* De-allocate mutex */
+    if (pbi->pmutex != NULL) {
+        for (row = 0; row < mb_rows; ++row)
+            pthread_mutex_destroy(&pbi->pmutex[row]);
+
+        vpx_free(pbi->pmutex);
+        pbi->pmutex = NULL;
+    }
+
+    /* per-row progress counters */
+    vpx_free(pbi->mt_current_mb_col);
+    pbi->mt_current_mb_col = NULL;
+
+    /* Free above_row buffers. */
+    if (pbi->mt_yabove_row != NULL) {
+        for (row = 0; row < mb_rows; ++row) {
+            vpx_free(pbi->mt_yabove_row[row]);
+            pbi->mt_yabove_row[row] = NULL;
+        }
+
+        vpx_free(pbi->mt_yabove_row);
+        pbi->mt_yabove_row = NULL;
+    }
+
+    if (pbi->mt_uabove_row != NULL) {
+        for (row = 0; row < mb_rows; ++row) {
+            vpx_free(pbi->mt_uabove_row[row]);
+            pbi->mt_uabove_row[row] = NULL;
+        }
+
+        vpx_free(pbi->mt_uabove_row);
+        pbi->mt_uabove_row = NULL;
+    }
+
+    if (pbi->mt_vabove_row != NULL) {
+        for (row = 0; row < mb_rows; ++row) {
+            vpx_free(pbi->mt_vabove_row[row]);
+            pbi->mt_vabove_row[row] = NULL;
+        }
+
+        vpx_free(pbi->mt_vabove_row);
+        pbi->mt_vabove_row = NULL;
+    }
+
+    /* Free left_col buffers. */
+    if (pbi->mt_yleft_col != NULL) {
+        for (row = 0; row < mb_rows; ++row) {
+            vpx_free(pbi->mt_yleft_col[row]);
+            pbi->mt_yleft_col[row] = NULL;
+        }
+
+        vpx_free(pbi->mt_yleft_col);
+        pbi->mt_yleft_col = NULL;
+    }
+
+    if (pbi->mt_uleft_col != NULL) {
+        for (row = 0; row < mb_rows; ++row) {
+            vpx_free(pbi->mt_uleft_col[row]);
+            pbi->mt_uleft_col[row] = NULL;
+        }
+
+        vpx_free(pbi->mt_uleft_col);
+        pbi->mt_uleft_col = NULL;
+    }
+
+    if (pbi->mt_vleft_col != NULL) {
+        for (row = 0; row < mb_rows; ++row) {
+            vpx_free(pbi->mt_vleft_col[row]);
+            pbi->mt_vleft_col[row] = NULL;
+        }
+
+        vpx_free(pbi->mt_vleft_col);
+        pbi->mt_vleft_col = NULL;
+    }
+}
+
+
+/* (Re)allocate the per-MB-row mutexes, counters and scratch buffers for width. */
+void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
+{
+    VP8_COMMON *const pc = & pbi->common;
+    int row;
+    int uv_width;
+
+    if (!protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd))
+        return;
+
+    vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows);
+
+    /* our internal buffers are always multiples of 16 */
+    if ((width & 0xf) != 0)
+        width += 16 - (width & 0xf);
+    uv_width = width >>1;
+
+    /* the sync interval with the row above grows with the frame width */
+    if (width > 2560) pbi->sync_range = 32;
+    else if (width > 1280) pbi->sync_range = 16;
+    else if (width >= 640) pbi->sync_range = 8;
+    else pbi->sync_range = 1;
+
+    /* Allocate mutex */
+    CHECK_MEM_ERROR(pbi->pmutex, vpx_malloc(sizeof(*pbi->pmutex) *
+                                            pc->mb_rows));
+    if (pbi->pmutex != NULL) {
+        for (row = 0; row < pc->mb_rows; ++row)
+            pthread_mutex_init(&pbi->pmutex[row], NULL);
+    }
+
+    /* Allocate an int for each mb row. */
+    CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows);
+
+    /* Allocate memory for above_row buffers. */
+    CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
+    for (row = 0; row < pc->mb_rows; ++row)
+        CHECK_MEM_ERROR(pbi->mt_yabove_row[row], vpx_memalign(16,sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1))));
+
+    CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
+    for (row = 0; row < pc->mb_rows; ++row)
+        CHECK_MEM_ERROR(pbi->mt_uabove_row[row], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
+
+    CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
+    for (row = 0; row < pc->mb_rows; ++row)
+        CHECK_MEM_ERROR(pbi->mt_vabove_row[row], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
+
+    /* Allocate memory for left_col buffers. */
+    CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
+    for (row = 0; row < pc->mb_rows; ++row)
+        CHECK_MEM_ERROR(pbi->mt_yleft_col[row], vpx_calloc(sizeof(unsigned char) * 16, 1));
+
+    CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows);
+    for (row = 0; row < pc->mb_rows; ++row)
+        CHECK_MEM_ERROR(pbi->mt_uleft_col[row], vpx_calloc(sizeof(unsigned char) * 8, 1));
+
+    CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows);
+    for (row = 0; row < pc->mb_rows; ++row)
+        CHECK_MEM_ERROR(pbi->mt_vleft_col[row], vpx_calloc(sizeof(unsigned char) * 8, 1));
+}
+
+
+/* Stop and join the worker threads, then free the decoder's threading state. */
+void vp8_decoder_remove_threads(VP8D_COMP *pbi)
+{
+    if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) != 0)
+    {
+        int t;
+
+        /* clear the run flag ... */
+        protected_write(&pbi->mt_mutex, &pbi->b_multithreaded_rd, 0);
+        /* ... then wake each worker so it can see the flag, and wait for it */
+        for (t = 0; t < pbi->allocated_decoding_thread_count; ++t)
+        {
+            sem_post(&pbi->h_event_start_decoding[t]);
+            pthread_join(pbi->h_decoding_thread[t], NULL);
+        }
+
+        /* every worker has been joined; the semaphores can go */
+        for (t = 0; t < pbi->allocated_decoding_thread_count; ++t)
+            sem_destroy(&pbi->h_event_start_decoding[t]);
+        sem_destroy(&pbi->h_event_end_decoding);
+
+        vpx_free(pbi->h_decoding_thread);
+        pbi->h_decoding_thread = NULL;
+
+        vpx_free(pbi->h_event_start_decoding);
+        pbi->h_event_start_decoding = NULL;
+
+        vpx_free(pbi->mb_row_di);
+        pbi->mb_row_di = NULL;
+
+        vpx_free(pbi->de_thread_data);
+        pbi->de_thread_data = NULL;
+    }
+
+    /* vp8_decoder_create_threads() initialises mt_mutex unconditionally */
+    pthread_mutex_destroy(&pbi->mt_mutex);
+}
+
+/* Decode one frame using the calling thread plus every worker thread. */
+void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+    VP8_COMMON *pc = &pbi->common;
+    YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+    const int filter_level = pc->filter_level;
+    unsigned int worker;
+    int row;
+
+    if (!filter_level)
+        vp8_setup_intra_recon_top_line(yv12_fb_new);
+    else
+    {
+        /* Set above_row buffer to 127 for decoding first MB row */
+        memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, yv12_fb_new->y_width + 5);
+        memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+        memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+
+        /* every other row: preset only the byte at the border edge, to 129 */
+        for (row = 1; row < pc->mb_rows; ++row)
+        {
+            pbi->mt_yabove_row[row][VP8BORDERINPIXELS-1] = (unsigned char)129;
+            pbi->mt_uabove_row[row][(VP8BORDERINPIXELS>>1)-1] = (unsigned char)129;
+            pbi->mt_vabove_row[row][(VP8BORDERINPIXELS>>1)-1] = (unsigned char)129;
+        }
+
+        /* Set left_col to 129 initially */
+        for (row = 0; row < pc->mb_rows; ++row)
+        {
+            memset(pbi->mt_yleft_col[row], (unsigned char)129, 16);
+            memset(pbi->mt_uleft_col[row], (unsigned char)129, 8);
+            memset(pbi->mt_vleft_col[row], (unsigned char)129, 8);
+        }
+
+        /* Initialize the loop filter for this frame. */
+        vp8_loop_filter_frame_init(pc, &pbi->mb, filter_level);
+    }
+
+    setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
+    /* wake the workers, then decode this thread's own rows (row 0 onwards) */
+    for (worker = 0; worker < pbi->decoding_thread_count; ++worker)
+        sem_post(&pbi->h_event_start_decoding[worker]);
+    mt_decode_mb_rows(pbi, xd, 0);
+
+    sem_wait(&pbi->h_event_end_decoding);   /* posted once the last MB row is done */
+}
diff --git a/libs/libvpx/vp8/decoder/treereader.h b/libs/libvpx/vp8/decoder/treereader.h
new file mode 100644
index 0000000000..f7d23c3698
--- /dev/null
+++ b/libs/libvpx/vp8/decoder/treereader.h
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_DECODER_TREEREADER_H_
+#define VP8_DECODER_TREEREADER_H_
+
+#include "./vpx_config.h"
+#include "vp8/common/treecoder.h"
+#include "dboolhuff.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef BOOL_DECODER vp8_reader;    /* the tree reader works directly on a BOOL_DECODER */
+
+#define vp8_read vp8dx_decode_bool    /* used below as vp8_read(reader, probability) */
+#define vp8_read_literal vp8_decode_value    /* plain alias of vp8_decode_value */
+#define vp8_read_bit(R) vp8_read(R, vp8_prob_half)    /* vp8_read at probability vp8_prob_half */
+
+
+/* Intent of tree data structure is to make decoding trivial. */
+
+static INLINE int vp8_treed_read(
+    vp8_reader *const r,        /* vp8_read() on it must return a 0 or 1 */
+    vp8_tree t,
+    const vp8_prob *const p
+)
+{
+    vp8_tree_index i = 0;
+    /* follow t[] from index 0 until a non-positive entry turns up */
+    do { i = t[i + vp8_read(r, p[i >> 1])]; } while (i > 0);
+    /* the terminating entry is stored negated */
+    return -i;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_DECODER_TREEREADER_H_
diff --git a/libs/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm b/libs/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
new file mode 100644
index 0000000000..8034c1db9a
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
@@ -0,0 +1,262 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_fdct4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+; void vp8_short_fdct4x4_armv6(short *input, short *output, int pitch)  -- r0, r1, r2; pitch is added to r0 as a byte offset
+|vp8_short_fdct4x4_armv6| PROC
+
+    stmfd       sp!, {r4 - r12, lr} ; save r4-r12 and the return address
+
+    ; PART 1: one input row (four coefficients) per step, results kept for PART 2
+
+    ; coeffs 0-3
+    ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]
+
+    ldr         r10, c7500
+    ldr         r11, c14500
+    ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4]
+    ldr         lr, c0x00080008
+    ror         r5, r5, #16         ; [i2 | i3]
+
+    qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8
+    smusd       r5, r6, lr          ; o2 = (i1+i2)*8 - (i0+i3)*8
+
+    smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6]
+
+    pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]
+
+    str         r6, [r1, #4]
+
+    ; coeffs 4-7
+    ror         r9, r9, #16         ; [i6 | i7]
+
+    qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift
+    qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8
+    smusd       r8, r6, lr          ; o6 = (i5+i6)*8 - (i4+i7)*8
+
+    smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10]
+
+    pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]
+
+    str         r6, [r1, #12]
+
+    ; coeffs 8-11
+    ror         r5, r5, #16         ; [i10 | i11]
+
+    qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8
+    smusd       r8, r6, lr          ; o10 = (i9+i10)*8 - (i8+i11)*8
+
+    smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14]
+
+    pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]
+
+    str         r6, [r1, #20]
+
+    ; coeffs 12-15
+    ror         r5, r5, #16         ; [i14 | i15]
+
+    qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift
+    qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8
+    smusd       r5, r6, lr          ; o14 = (i13+i14)*8 - (i12+i15)*8
+
+    smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]
+
+    str         r6, [r1, #28]
+
+
+    ; PART 2 (combines the PART 1 results across the four rows) -----------
+    ldr         r11, c12000
+    ldr         r10, c51000
+    ldr         lr, c0x00070007
+
+    qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
+    qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
+    qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
+    qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    add         r0, r11, #0x10000   ; r0 = 12000 + 0x10000, the term added when d1 != 0
+
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    ldr         r12, c0x08a914e8    ; [2217 | 5352]
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #0]        ; [     o1 |      o0]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #16]       ; [     o9 |      o8]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+    asrs        r6, r7, #16         ; top halfword of d1 != 0 ?
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    ldr         r3, [r1, #4]        ; [i3 | i2]
+
+    pkhtb       r5, r5, r4, asr #16 ; [o13|o12]
+
+    str         r9, [r1, #8]        ; [o5 | o4]
+
+    ldr         r9, [r1, #12]       ; [i7 | i6]
+    ldr         r8, [r1, #28]       ; [i15|i14]
+    ldr         r2, [r1, #20]       ; [i11|i10]
+    str         r5, [r1, #24]       ; [o13|o12]
+
+    qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
+    qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #4]        ; [     o3 |      o2]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #20]       ; [    o11 |     o10]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+
+    asrs        r6, r7, #16         ; top halfword of d1 != 0 ?
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    str         r9, [r1, #12]       ; [o7 | o6]
+    pkhtb       r5, r5, r4, asr #16 ; [o15|o14]
+
+    str         r5, [r1, #28]       ; [o15|o14]
+
+    ldmfd       sp!, {r4 - r12, pc} ; restore r4-r12 and return (saved lr -> pc)
+
+    ENDP
+
+; Used constants
+c7500
+    DCD     7500                    ; PART 1 accumulator for o3/o7/o11/o15
+c14500
+    DCD     14500                   ; PART 1 accumulator for o1/o5/o9/o13
+c0x22a453a0
+    DCD     0x22a453a0              ; [2217*4 | 5352*4]
+c0x00080008
+    DCD     0x00080008              ; [8 | 8], multiplier for smuad/smusd in PART 1
+c12000
+    DCD     12000                   ; PART 2 accumulator (0x10000 more when d1 != 0)
+c51000
+    DCD     51000                   ; PART 2 accumulator for the d1*2217 terms
+c0x00070007
+    DCD     0x00070007              ; [7 | 7], added to a1 in PART 2
+c0x08a914e8
+    DCD     0x08a914e8              ; [2217 | 5352]
+
+    END
diff --git a/libs/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm b/libs/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm
new file mode 100644
index 0000000000..5eaf3f25a9
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm
@@ -0,0 +1,212 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_walsh4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+; r0    short *input   (4 rows of 4 shorts; rows are 'pitch' bytes apart)
+; r1    short *output  (16 shorts, op[i] stored at byte offset 2*i)
+; r2    int pitch      (byte stride, used as post-index of the row loads)
+|vp8_short_walsh4x4_armv6| PROC
+
+    stmdb       sp!, {r4 - r11, lr} ; save work registers and return address
+
+    ldrd        r4, r5, [r0], r2    ; row 0 (4 shorts), then r0 += pitch
+    ldr         lr, c00040004       ; lr = 4 in both halfwords (the <<2 factor)
+    ldrd        r6, r7, [r0], r2    ; row 1 (4 shorts), then r0 += pitch
+
+    ; 0-3
+    qadd16      r3, r4, r5          ; [d1|a1] [1+3   |   0+2]
+    qsub16      r4, r4, r5          ; [c1|b1] [1-3   |   0-2]
+
+    ldrd        r8, r9, [r0], r2    ; row 2 (4 shorts), then r0 += pitch
+    ; 4-7
+    qadd16      r5, r6, r7          ; [d1|a1] [5+7   |   4+6]
+    qsub16      r6, r6, r7          ; [c1|b1] [5-7   |   4-6]
+
+    ldrd        r10, r11, [r0]      ; row 3 (4 shorts), no writeback
+    ; 8-11
+    qadd16      r7, r8, r9          ; [d1|a1] [9+11  |  8+10]
+    qsub16      r8, r8, r9          ; [c1|b1] [9-11  |  8-10]
+
+    ; 12-15
+    qadd16      r9, r10, r11        ; [d1|a1] [13+15 | 12+14]
+    qsub16      r10, r10, r11       ; [c1|b1] [13-15 | 12-14]
+
+
+    lsls        r2, r3, #16         ; Z = (row 0 a1 == 0); r2 is scratch now
+    smuad       r11, r3, lr         ; A0 = a1<<2 + d1<<2
+    addne       r11, r11, #1        ; A0 += (a1!=0), flags still from lsls
+
+    lsls        r2, r7, #16         ; Z = (row 2 a1 == 0)
+    smuad       r12, r7, lr         ; C0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; C0 += (a1!=0)
+
+    add         r0, r11, r12        ; a1_0 = A0 + C0
+    sub         r11, r11, r12       ; b1_0 = A0 - C0
+
+    lsls        r2, r5, #16         ; Z = (row 1 a1 == 0)
+    smuad       r12, r5, lr         ; B0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; B0 += (a1!=0)
+
+    lsls        r2, r9, #16         ; Z = (row 3 a1 == 0)
+    smuad       r2, r9, lr          ; D0 = a1<<2 + d1<<2
+    addne       r2, r2, #1          ; D0 += (a1!=0)
+
+    add         lr, r12, r2         ; d1_0 = B0 + D0 (lr constant is clobbered)
+    sub         r12, r12, r2        ; c1_0 = B0 - D0
+
+    ; op[0,4,8,12]
+    adds        r2, r0, lr          ; a2 = a1_0 + d1_0
+    addmi       r2, r2, #1          ; a2 += (a2 < 0)
+    add         r2, r2, #3          ; += 3
+    subs        r0, r0, lr          ; d2 = a1_0 - d1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1]            ; op[0]
+
+    addmi       r0, r0, #1          ; d2 += (d2 < 0), flags from subs above
+    add         r0, r0, #3          ; += 3
+    ldr         lr, c00040004       ; reload the 4|4 multiplier (lr held d1_0)
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #24]       ; op[12]
+
+    adds        r2, r11, r12        ; b2 = b1_0 + c1_0
+    addmi       r2, r2, #1          ; b2 += (b2 < 0)
+    add         r2, r2, #3          ; += 3
+    subs        r0, r11, r12        ; c2 = b1_0 - c1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #8]        ; op[4]
+
+    addmi       r0, r0, #1          ; c2 += (c2 < 0), flags from subs above
+    add         r0, r0, #3          ; += 3
+    smusd       r3, r3, lr          ; A3 = a1<<2 - d1<<2
+    smusd       r7, r7, lr          ; C3 = a1<<2 - d1<<2
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #16]       ; op[8]
+
+
+    ; op[3,7,11,15]
+    add         r0, r3, r7          ; a1_3 = A3 + C3
+    sub         r3, r3, r7          ; b1_3 = A3 - C3
+
+    smusd       r5, r5, lr          ; B3 = a1<<2 - d1<<2
+    smusd       r9, r9, lr          ; D3 = a1<<2 - d1<<2
+    add         r7, r5, r9          ; d1_3 = B3 + D3
+    sub         r5, r5, r9          ; c1_3 = B3 - D3
+
+    adds        r2, r0, r7          ; a2 = a1_3 + d1_3
+    addmi       r2, r2, #1          ; a2 += (a2 < 0)
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r5          ; b2 = b1_3 + c1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #6]        ; op[3]
+
+    addmi       r9, r9, #1          ; b2 += (b2 < 0)
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r5          ; c2 = b1_3 - c1_3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #14]       ; op[7]
+
+    addmi       r2, r2, #1          ; c2 += (c2 < 0)
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r7          ; d2 = a1_3 - d1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #22]       ; op[11]
+
+    addmi       r9, r9, #1          ; d2 += (d2 < 0)
+    add         r9, r9, #3          ; += 3
+    smuad       r3, r4, lr          ; A1 = b1<<2 + c1<<2
+    smuad       r5, r8, lr          ; C1 = b1<<2 + c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #30]       ; op[15]
+
+    ; op[1,5,9,13]
+    add         r0, r3, r5          ; a1_1 = A1 + C1
+    sub         r3, r3, r5          ; b1_1 = A1 - C1
+
+    smuad       r7, r6, lr          ; B1 = b1<<2 + c1<<2
+    smuad       r9, r10, lr         ; D1 = b1<<2 + c1<<2
+    add         r5, r7, r9          ; d1_1 = B1 + D1
+    sub         r7, r7, r9          ; c1_1 = B1 - D1
+
+    adds        r2, r0, r5          ; a2 = a1_1 + d1_1
+    addmi       r2, r2, #1          ; a2 += (a2 < 0)
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r7          ; b2 = b1_1 + c1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #2]        ; op[1]
+
+    addmi       r9, r9, #1          ; b2 += (b2 < 0)
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r7          ; c2 = b1_1 - c1_1
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #10]       ; op[5]
+
+    addmi       r2, r2, #1          ; c2 += (c2 < 0)
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r5          ; d2 = a1_1 - d1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #18]       ; op[9]
+
+    addmi       r9, r9, #1          ; d2 += (d2 < 0)
+    add         r9, r9, #3          ; += 3
+    smusd       r4, r4, lr          ; A2 = b1<<2 - c1<<2
+    smusd       r8, r8, lr          ; C2 = b1<<2 - c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #26]       ; op[13]
+
+
+    ; op[2,6,10,14]
+    add         r11, r4, r8         ; a1_2 = A2 + C2
+    sub         r12, r4, r8         ; b1_2 = A2 - C2
+
+    smusd       r6, r6, lr          ; B2 = b1<<2 - c1<<2
+    smusd       r10, r10, lr        ; D2 = b1<<2 - c1<<2
+    add         r4, r6, r10         ; d1_2 = B2 + D2
+    sub         r8, r6, r10         ; c1_2 = B2 - D2
+
+    adds        r2, r11, r4         ; a2 = a1_2 + d1_2
+    addmi       r2, r2, #1          ; a2 += (a2 < 0)
+    add         r2, r2, #3          ; += 3
+    adds        r9, r12, r8         ; b2 = b1_2 + c1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #4]        ; op[2]
+
+    addmi       r9, r9, #1          ; b2 += (b2 < 0)
+    add         r9, r9, #3          ; += 3
+    subs        r2, r12, r8         ; c2 = b1_2 - c1_2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #12]       ; op[6]
+
+    addmi       r2, r2, #1          ; c2 += (c2 < 0)
+    add         r2, r2, #3          ; += 3
+    subs        r9, r11, r4         ; d2 = a1_2 - d1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #20]       ; op[10]
+
+    addmi       r9, r9, #1          ; d2 += (d2 < 0)
+    add         r9, r9, #3          ; += 3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #28]       ; op[14]
+
+
+    ldmia       sp!, {r4 - r11, pc} ; restore registers and return
+    ENDP        ; |vp8_short_walsh4x4_armv6|
+
+c00040004
+    DCD         0x00040004          ; 4 in each halfword: smuad/smusd by it give 4*lo +/- 4*hi
+
+    END
diff --git a/libs/libvpx/vp8/encoder/arm/dct_arm.c b/libs/libvpx/vp8/encoder/arm/dct_arm.c
new file mode 100644
index 0000000000..f71300d2c6
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/arm/dct_arm.c
@@ -0,0 +1,22 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+#if HAVE_MEDIA
+
+void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch)
+{   /* 8x4 forward DCT built from two calls to the ARMv6 4x4 routine. */
+    vp8_short_fdct4x4_armv6(input,   output,    pitch);    /* block at input[0], results from output[0] */
+    vp8_short_fdct4x4_armv6(input + 4, output + 16, pitch); /* block 4 shorts to the right, results from output[16] (presumably 16 coefficients per 4x4 call) */
+}
+
+#endif /* HAVE_MEDIA */
diff --git a/libs/libvpx/vp8/encoder/arm/neon/denoising_neon.c b/libs/libvpx/vp8/encoder/arm/neon/denoising_neon.c
new file mode 100644
index 0000000000..08be76e433
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/arm/neon/denoising_neon.c
@@ -0,0 +1,478 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/encoder/denoising.h"
+#include "vpx_mem/vpx_mem.h"
+#include "./vp8_rtcd.h"
+
+/*
+ * The filter function was modified to reduce the computational complexity.
+ *
+ * Step 1:
+ *  Instead of applying tap coefficients for each pixel, we calculated the
+ *  pixel adjustments vs. pixel diff value ahead of time.
+ *     adjustment = filtered_value - current_raw
+ *                = (filter_coefficient * diff + 128) >> 8
+ *  where
+ *     filter_coefficient = (255 << 8) / (256 + ((abs_diff * 330) >> 3));
+ *     filter_coefficient += filter_coefficient /
+ *                           (3 + motion_magnitude_adjustment);
+ *     filter_coefficient is clamped to 0 ~ 255.
+ *
+ * Step 2:
+ *  The adjustment vs. diff curve becomes flat very quick when diff increases.
+ *  This allowed us to use only several levels to approximate the curve without
+ *  changing the filtering algorithm too much.
+ *  The adjustments were further corrected by checking the motion magnitude.
+ *  The levels used are:
+ *      diff          level       adjustment w/o       adjustment w/
+ *                               motion correction    motion correction
+ *      [-255, -16]     3              -6                   -7
+ *      [-15, -8]       2              -4                   -5
+ *      [-7, -4]        1              -3                   -4
+ *      [-3, 3]         0              diff                 diff
+ *      [4, 7]          1               3                    4
+ *      [8, 15]         2               4                    5
+ *      [16, 255]       3               6                    7
+ */
+
+int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y,
+                             int mc_running_avg_y_stride,
+                             unsigned char *running_avg_y,
+                             int running_avg_y_stride,
+                             unsigned char *sig, int sig_stride,
+                             unsigned int motion_magnitude,
+                             int increase_denoising) {
+    /* If motion_magnitude is small, make the denoiser more aggressive by
+     * raising the level-1 adjustment (and, with increase_denoising, also the
+     * level-1 threshold); the level-to-level deltas stay the same.
+     */
+    int shift_inc  = (increase_denoising &&
+        motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
+    const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+        (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+    const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+    const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+    const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
+    const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+    const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+    int64x2_t v_sum_diff_total = vdupq_n_s64(0);
+
+    /* First pass: 16 rows, one 16-pixel row per iteration. */
+    int r;
+    for (r = 0; r < 16; ++r) {
+        /* Load one row of the source and of the motion-compensated average. */
+        const uint8x16_t v_sig = vld1q_u8(sig);
+        const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+        /* |sig - mc|; pos mask: sig < mc, neg mask: sig > mc. */
+        const uint8x16_t v_abs_diff      = vabdq_u8(v_sig, v_mc_running_avg_y);
+        const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+        const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+
+        /* Per-lane masks: abs_diff >= the threshold of level 1, 2, 3. */
+        const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold,
+                                                  v_abs_diff);
+        const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold,
+                                                  v_abs_diff);
+        const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold,
+                                                  v_abs_diff);
+
+        /* Level 1 base value, plus 1 at level >= 2, plus 2 more at level 3. */
+        const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask,
+                                                        v_delta_level_1_and_2);
+        const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask,
+                                                        v_delta_level_2_and_3);
+        const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment,
+            v_level2_adjustment);
+        const uint8x16_t v_level1and2and3_adjustment = vaddq_u8(
+            v_level1and2_adjustment, v_level3_adjustment);
+
+        /* Figure adjustment absolute value by selecting between the absolute
+         * difference if in level0 or the value for level 1, 2 and 3.
+         */
+        const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask,
+            v_level1and2and3_adjustment, v_abs_diff);
+
+        /* Calculate positive and negative adjustments. Apply them to the signal
+         * and accumulate them. Adjustments are at most 4 + shift_inc + 1 + 2
+         * (i.e. <= 8), so each one fits in a signed char lane.
+         */
+        const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
+                                                     v_abs_adjustment);
+        const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
+                                                     v_abs_adjustment);
+
+        uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
+        v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
+
+        /* Store the filtered row. */
+        vst1q_u8(running_avg_y, v_running_avg_y);
+
+        /* Pairwise-widen the signed per-pixel adjustments (pos - neg) down to
+         * two 64-bit lanes and add them to the running total for the block.
+         */
+        {
+            const int8x16_t v_sum_diff =
+                vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+                          vreinterpretq_s8_u8(v_neg_adjustment));
+
+            const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
+
+            const int32x4_t fedc_ba98_7654_3210 =
+                vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+
+            const int64x2_t fedcba98_76543210 =
+                vpaddlq_s32(fedc_ba98_7654_3210);
+
+            v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
+        }
+
+        /* Update pointers for next iteration. */
+        sig += sig_stride;
+        mc_running_avg_y += mc_running_avg_y_stride;
+        running_avg_y += running_avg_y_stride;
+    }
+
+    /* Too many adjustments => try a weaker second pass, else copy block. */
+    {
+        int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+                                      vget_low_s64(v_sum_diff_total));
+        int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+        int sum_diff_thresh = SUM_DIFF_THRESHOLD;
+
+        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
+        if (sum_diff > sum_diff_thresh) {
+          // Before returning to copy the block (i.e., apply no denoising),
+          // check if we can still apply some (weaker) temporal filtering to
+          // this block, that would otherwise not be denoised at all. Simplest
+          // is to apply an additional adjustment to running_avg_y to bring it
+          // closer to sig. The adjustment is capped by a maximum delta, and
+          // chosen such that in most cases the resulting sum_diff will be
+          // within the acceptable range given by sum_diff_thresh.
+
+          // The delta is set by the excess of absolute pixel diff over the
+          // threshold.
+          int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
+          // Only apply the adjustment for max delta up to 3.
+          if (delta < 4) {
+            const uint8x16_t k_delta = vmovq_n_u8(delta);
+            sig -= sig_stride * 16;
+            mc_running_avg_y -= mc_running_avg_y_stride * 16;
+            running_avg_y -= running_avg_y_stride * 16;
+            for (r = 0; r < 16; ++r) {
+              uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
+              const uint8x16_t v_sig = vld1q_u8(sig);
+              const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+              /* |sig - mc|; pos mask: sig < mc, neg mask: sig > mc. */
+              const uint8x16_t v_abs_diff      = vabdq_u8(v_sig,
+                                                          v_mc_running_avg_y);
+              const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig,
+                                                          v_mc_running_avg_y);
+              const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig,
+                                                          v_mc_running_avg_y);
+              // Clamp absolute difference to delta to get the adjustment.
+              const uint8x16_t v_abs_adjustment =
+                  vminq_u8(v_abs_diff, (k_delta));
+
+              const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
+                                                           v_abs_adjustment);
+              const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
+                                                           v_abs_adjustment);
+
+              v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
+              v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
+
+              /* Store the re-adjusted row (signs reversed vs. first pass). */
+              vst1q_u8(running_avg_y, v_running_avg_y);
+
+              {
+                  const int8x16_t v_sum_diff =
+                      vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+                                vreinterpretq_s8_u8(v_pos_adjustment));
+
+                  const int16x8_t fe_dc_ba_98_76_54_32_10 =
+                      vpaddlq_s8(v_sum_diff);
+                  const int32x4_t fedc_ba98_7654_3210 =
+                      vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+                  const int64x2_t fedcba98_76543210 =
+                      vpaddlq_s32(fedc_ba98_7654_3210);
+
+                  v_sum_diff_total = vqaddq_s64(v_sum_diff_total,
+                                                fedcba98_76543210);
+              }
+              /* Update pointers for next iteration. */
+              sig += sig_stride;
+              mc_running_avg_y += mc_running_avg_y_stride;
+              running_avg_y += running_avg_y_stride;
+            }
+            {
+              // Update the sum of all pixel differences of this MB.
+              x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+                            vget_low_s64(v_sum_diff_total));
+              sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+
+              if (sum_diff > sum_diff_thresh) {
+                return COPY_BLOCK;
+              }
+            }
+          } else {
+            return COPY_BLOCK;
+          }
+        }
+    }
+
+    /* Block was filtered: rewind both pointers to row 0 for the 16x16 copy. */
+    running_avg_y -= running_avg_y_stride * 16;
+    sig -= sig_stride * 16;
+
+    vp8_copy_mem16x16(running_avg_y, running_avg_y_stride, sig, sig_stride);
+
+    return FILTER_BLOCK;
+}
+
+int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg,
+                             int mc_running_avg_stride,
+                             unsigned char *running_avg,
+                             int running_avg_stride,
+                             unsigned char *sig, int sig_stride,
+                             unsigned int motion_magnitude,
+                             int increase_denoising) {
+    /* If motion_magnitude is small, make the denoiser more aggressive by
+     * raising the level-1 adjustment (and, with increase_denoising, also the
+     * level-1 threshold); the level-to-level deltas stay the same.
+     */
+    int shift_inc  = (increase_denoising &&
+        motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0;
+    const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+        (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 4 + shift_inc : 3);
+
+    const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+    const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+    const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
+    const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+    const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+    int64x2_t v_sum_diff_total = vdupq_n_s64(0);
+    int r;
+
+    {
+      uint16x4_t v_sum_block = vdup_n_u16(0);
+
+      // Skip denoising (COPY_BLOCK) if the 8x8 sum of sig is close to 128 * 64.
+      for (r = 0; r < 8; ++r) {
+        const uint8x8_t v_sig = vld1_u8(sig);
+        const uint16x4_t _76_54_32_10 = vpaddl_u8(v_sig);
+        v_sum_block = vqadd_u16(v_sum_block, _76_54_32_10);
+        sig += sig_stride;
+      }
+      sig -= sig_stride * 8;
+      {
+        const uint32x2_t _7654_3210 = vpaddl_u16(v_sum_block);
+        const uint64x1_t _76543210 = vpaddl_u32(_7654_3210);
+        const int sum_block =
+            vget_lane_s32(vreinterpret_s32_u64(_76543210), 0);
+        if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+          return COPY_BLOCK;
+        }
+      }
+    }
+
+    /* First pass: 8 rows, two 8-pixel rows packed into each 16-lane vector. */
+    for (r = 0; r < 4; ++r) {
+        /* Load two rows of the source and of the motion-compensated average. */
+        const uint8x8_t v_sig_lo = vld1_u8(sig);
+        const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]);
+        const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi);
+        const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg);
+        const uint8x8_t v_mc_running_avg_hi =
+            vld1_u8(&mc_running_avg[mc_running_avg_stride]);
+        const uint8x16_t v_mc_running_avg =
+            vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi);
+        /* |sig - mc|; pos mask: sig < mc, neg mask: sig > mc. */
+        const uint8x16_t v_abs_diff      = vabdq_u8(v_sig, v_mc_running_avg);
+        const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg);
+        const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg);
+
+        /* Per-lane masks: abs_diff >= the threshold of level 1, 2, 3. */
+        const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold,
+                                                  v_abs_diff);
+        const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold,
+                                                  v_abs_diff);
+        const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold,
+                                                  v_abs_diff);
+
+        /* Level 1 base value, plus 1 at level >= 2, plus 2 more at level 3. */
+        const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask,
+                                                        v_delta_level_1_and_2);
+        const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask,
+                                                        v_delta_level_2_and_3);
+        const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment,
+            v_level2_adjustment);
+        const uint8x16_t v_level1and2and3_adjustment = vaddq_u8(
+            v_level1and2_adjustment, v_level3_adjustment);
+
+        /* Figure adjustment absolute value by selecting between the absolute
+         * difference if in level0 or the value for level 1, 2 and 3.
+         */
+        const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask,
+            v_level1and2and3_adjustment, v_abs_diff);
+
+        /* Calculate positive and negative adjustments. Apply them to the signal
+         * and accumulate them. Adjustments are at most 4 + shift_inc + 1 + 2
+         * (i.e. <= 8), so each one fits in a signed char lane.
+         */
+        const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
+                                                     v_abs_adjustment);
+        const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
+                                                     v_abs_adjustment);
+
+        uint8x16_t v_running_avg = vqaddq_u8(v_sig, v_pos_adjustment);
+        v_running_avg = vqsubq_u8(v_running_avg, v_neg_adjustment);
+
+        /* Store the two filtered rows (low half = first row). */
+        vst1_u8(running_avg, vget_low_u8(v_running_avg));
+        vst1_u8(&running_avg[running_avg_stride], vget_high_u8(v_running_avg));
+
+        /* Pairwise-widen the signed per-pixel adjustments (pos - neg) down to
+         * two 64-bit lanes and add them to the running total for the block.
+         */
+        {
+            const int8x16_t v_sum_diff =
+                vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+                          vreinterpretq_s8_u8(v_neg_adjustment));
+
+            const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
+
+            const int32x4_t fedc_ba98_7654_3210 =
+                vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+
+            const int64x2_t fedcba98_76543210 =
+                vpaddlq_s32(fedc_ba98_7654_3210);
+
+            v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
+        }
+
+        /* Update pointers for next iteration (two rows per step). */
+        sig += sig_stride * 2;
+        mc_running_avg += mc_running_avg_stride * 2;
+        running_avg += running_avg_stride * 2;
+    }
+
+
+    /* Too many adjustments => try a weaker second pass, else copy block. */
+    {
+        int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+                                      vget_low_s64(v_sum_diff_total));
+        int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+        int sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+        if (sum_diff > sum_diff_thresh) {
+          // Before returning to copy the block (i.e., apply no denoising),
+          // check if we can still apply some (weaker) temporal filtering to
+          // this block, that would otherwise not be denoised at all. Simplest
+          // is to apply an additional adjustment to running_avg to bring it
+          // closer to sig. The adjustment is capped by a maximum delta, and
+          // chosen such that in most cases the resulting sum_diff will be
+          // within the acceptable range given by sum_diff_thresh.
+
+          // The delta is set by the excess of absolute pixel diff over the
+          // threshold.
+          int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
+          // Only apply the adjustment for max delta up to 3.
+          if (delta < 4) {
+            const uint8x16_t k_delta = vmovq_n_u8(delta);
+            sig -= sig_stride * 8;
+            mc_running_avg -= mc_running_avg_stride * 8;
+            running_avg -= running_avg_stride * 8;
+            for (r = 0; r < 4; ++r) {
+              const uint8x8_t v_sig_lo = vld1_u8(sig);
+              const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]);
+              const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi);
+              const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg);
+              const uint8x8_t v_mc_running_avg_hi =
+                  vld1_u8(&mc_running_avg[mc_running_avg_stride]);
+              const uint8x16_t v_mc_running_avg =
+                  vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi);
+              /* |sig - mc|; pos mask: sig < mc, neg mask: sig > mc. */
+              const uint8x16_t v_abs_diff      = vabdq_u8(v_sig,
+                                                          v_mc_running_avg);
+              const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig,
+                                                          v_mc_running_avg);
+              const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig,
+                                                          v_mc_running_avg);
+              // Clamp absolute difference to delta to get the adjustment.
+              const uint8x16_t v_abs_adjustment =
+                  vminq_u8(v_abs_diff, (k_delta));
+
+              const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
+                                                           v_abs_adjustment);
+              const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
+                                                           v_abs_adjustment);
+              const uint8x8_t v_running_avg_lo = vld1_u8(running_avg);
+              const uint8x8_t v_running_avg_hi =
+                  vld1_u8(&running_avg[running_avg_stride]);
+              uint8x16_t v_running_avg =
+                  vcombine_u8(v_running_avg_lo, v_running_avg_hi);
+
+              v_running_avg = vqsubq_u8(v_running_avg, v_pos_adjustment);
+              v_running_avg = vqaddq_u8(v_running_avg, v_neg_adjustment);
+
+              /* Store the re-adjusted rows (signs reversed vs. first pass). */
+              vst1_u8(running_avg, vget_low_u8(v_running_avg));
+              vst1_u8(&running_avg[running_avg_stride],
+                      vget_high_u8(v_running_avg));
+
+              {
+                  const int8x16_t v_sum_diff =
+                      vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+                                vreinterpretq_s8_u8(v_pos_adjustment));
+
+                  const int16x8_t fe_dc_ba_98_76_54_32_10 =
+                      vpaddlq_s8(v_sum_diff);
+                  const int32x4_t fedc_ba98_7654_3210 =
+                      vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+                  const int64x2_t fedcba98_76543210 =
+                      vpaddlq_s32(fedc_ba98_7654_3210);
+
+                  v_sum_diff_total = vqaddq_s64(v_sum_diff_total,
+                                                fedcba98_76543210);
+              }
+              /* Update pointers for next iteration (two rows per step). */
+              sig += sig_stride * 2;
+              mc_running_avg += mc_running_avg_stride * 2;
+              running_avg += running_avg_stride * 2;
+            }
+            {
+              // Update the sum of all pixel differences of this block.
+              x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+                            vget_low_s64(v_sum_diff_total));
+              sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+
+              if (sum_diff > sum_diff_thresh) {
+                return COPY_BLOCK;
+              }
+            }
+          } else {
+            return COPY_BLOCK;
+          }
+        }
+    }
+
+    /* Block was filtered: rewind both pointers to row 0 for the 8x8 copy. */
+    running_avg -= running_avg_stride * 8;
+    sig -= sig_stride * 8;
+
+    vp8_copy_mem8x8(running_avg, running_avg_stride, sig, sig_stride);
+
+    return FILTER_BLOCK;
+}
diff --git a/libs/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c b/libs/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
new file mode 100644
index 0000000000..e5824bfb21
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -0,0 +1,89 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp8/encoder/block.h"
+/* Position values 1..16, one per coefficient; max-reduced by the quantizer. */
+static const uint16_t inv_zig_zag[16] = {
+    1,  2,  6,   7,
+    3,  5,  8,  13,
+    4,  9,  12, 14,
+    10, 11, 15, 16
+};
+/* Quantize 16 coefficients of b into d->qcoeff / d->dqcoeff and set *d->eob. */
+void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
+    const int16x8_t one_q = vdupq_n_s16(-1),  /* all bits set in every lane */
+                    z0 = vld1q_s16(b->coeff),
+                    z1 = vld1q_s16(b->coeff + 8),
+                    round0 = vld1q_s16(b->round),
+                    round1 = vld1q_s16(b->round + 8),
+                    quant0 = vld1q_s16(b->quant_fast),
+                    quant1 = vld1q_s16(b->quant_fast + 8),
+                    dequant0 = vld1q_s16(d->dequant),
+                    dequant1 = vld1q_s16(d->dequant + 8);
+    const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag),
+                     zig_zag1 = vld1q_u16(inv_zig_zag + 8);
+    int16x8_t x0, x1, sz0, sz1, y0, y1;
+    uint16x8_t eob0, eob1;
+    uint16x4_t eob_d16;
+    uint32x2_t eob_d32;
+    uint32x4_t eob_q32;
+
+    /* sign of z: z >> 15 (arithmetic shift: -1 if negative, else 0) */
+    sz0 = vshrq_n_s16(z0, 15);
+    sz1 = vshrq_n_s16(z1, 15);
+
+    /* x = abs(z) */
+    x0 = vabsq_s16(z0);
+    x1 = vabsq_s16(z1);
+
+    /* x += round */
+    x0 = vaddq_s16(x0, round0);
+    x1 = vaddq_s16(x1, round1);
+
+    /* y = 2 * (x * quant) >> 16 */
+    y0 = vqdmulhq_s16(x0, quant0);
+    y1 = vqdmulhq_s16(x1, quant1);
+
+    /* Compensate for doubling in vqdmulhq */
+    y0 = vshrq_n_s16(y0, 1);
+    y1 = vshrq_n_s16(y1, 1);
+
+    /* Restore sign: (y ^ sz) - sz negates y in lanes where sz is -1 */
+    y0 = veorq_s16(y0, sz0);
+    y1 = veorq_s16(y1, sz1);
+    x0 = vsubq_s16(y0, sz0);
+    x1 = vsubq_s16(y1, sz1);
+
+    /* find non-zero elements: (x & all-ones) != 0 yields an all-ones lane */
+    eob0 = vtstq_s16(x0, one_q);
+    eob1 = vtstq_s16(x1, one_q);
+
+    /* mask zig zag: keep the table value only in non-zero lanes */
+    eob0 = vandq_u16(eob0, zig_zag0);
+    eob1 = vandq_u16(eob1, zig_zag1);
+
+    /* select the largest value across all 16 lanes */
+    eob0 = vmaxq_u16(eob0, eob1);
+    eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
+    eob_q32 = vmovl_u16(eob_d16);
+    eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
+    eob_d32 = vpmax_u32(eob_d32, eob_d32);
+
+    /* qcoeff = x */
+    vst1q_s16(d->qcoeff, x0);
+    vst1q_s16(d->qcoeff + 8, x1);
+
+    /* dqcoeff = x * dequant */
+    vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
+    vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
+    /* store byte lane 0 of the maximum (low byte on little-endian) via d->eob */
+    vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
+}
diff --git a/libs/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c b/libs/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
new file mode 100644
index 0000000000..391e5f9907
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
@@ -0,0 +1,269 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+/* 4x4 two-pass forward transform: 4 rows of 4 int16 in, 16 int16 out. */
+void vp8_short_fdct4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d16s16, d17s16, d26s16, dEmptys16;
+    uint16x4_t d4u16;
+    int16x8_t q0s16, q1s16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+    // Multipliers, plus the values the 32-bit accumulators start from.
+    d16s16 = vdup_n_s16(5352);
+    d17s16 = vdup_n_s16(2217);
+    q9s32 = vdupq_n_s32(14500);
+    q10s32 = vdupq_n_s32(7500);
+    q11s32 = vdupq_n_s32(12000);
+    q12s32 = vdupq_n_s32(51000);
+
+    // Part one (pitch is halved, then used as the row stride in int16_t units)
+    pitch >>= 1;
+    d0s16 = vld1_s16(input);
+    input += pitch;
+    d1s16 = vld1_s16(input);
+    input += pitch;
+    d2s16 = vld1_s16(input);
+    input += pitch;
+    d3s16 = vld1_s16(input);
+    // vtrn_s32 followed by vtrn_s16 transposes the 4x4 block.
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+    // Butterfly: sums and differences of the outer pair and of the inner pair.
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    d4s16 = vshl_n_s16(d4s16, 3);
+    d5s16 = vshl_n_s16(d5s16, 3);
+    d6s16 = vshl_n_s16(d6s16, 3);
+    d7s16 = vshl_n_s16(d7s16, 3);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
+    q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);
+
+    d1s16 = vshrn_n_s32(q9s32, 12);
+    d3s16 = vshrn_n_s32(q10s32, 12);
+
+    // Part two: transpose the part-one results and run the butterfly again
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    d26s16 = vdup_n_s16(7);
+    d4s16 = vadd_s16(d4s16, d26s16);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d7s16, d17s16);
+    // Mask of lanes where d7s16 != 0 (vceq here, inverted by vmvn below).
+    dEmptys16 = vdup_n_s16(0);
+    d4u16 = vceq_s16(d7s16, dEmptys16);
+
+    d0s16 = vshr_n_s16(d0s16, 4);
+    d2s16 = vshr_n_s16(d2s16, 4);
+
+    q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);
+    // Subtracting the all-ones mask adds 1 to d1s16 where d7s16 was non-zero.
+    d4u16 = vmvn_u16(d4u16);
+    d1s16 = vshrn_n_s32(q11s32, 16);
+    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
+    d3s16 = vshrn_n_s32(q12s32, 16);
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    return;
+}
+/* Two-pass forward transform on 4 rows of 8 int16; writes 32 int16 to output. */
+void vp8_short_fdct8x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
+    uint16x4_t d28u16, d29u16;
+    uint16x8_t q14u16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16;
+    int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    int16x8x2_t v2tmp0, v2tmp1;
+    int32x4x2_t v2tmp2, v2tmp3;
+    // Multipliers, plus the values the part-one accumulators start from.
+    d16s16 = vdup_n_s16(5352);
+    d17s16 = vdup_n_s16(2217);
+    q9s32 = vdupq_n_s32(14500);
+    q10s32 = vdupq_n_s32(7500);
+
+    // Part one (pitch is halved, then used as the row stride in int16_t units)
+    pitch >>= 1;
+    q0s16 = vld1q_s16(input);
+    input += pitch;
+    q1s16 = vld1q_s16(input);
+    input += pitch;
+    q2s16 = vld1q_s16(input);
+    input += pitch;
+    q3s16 = vld1q_s16(input);
+    // Transposes the two side-by-side 4x4 blocks, one per 64-bit half.
+    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+                       vreinterpretq_s32_s16(q2s16));
+    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+                       vreinterpretq_s32_s16(q3s16));
+    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
+                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
+    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
+                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
+    // Butterfly: sums and differences of the outer pair and of the inner pair.
+    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    q11s16 = vshlq_n_s16(q11s16, 3);
+    q12s16 = vshlq_n_s16(q12s16, 3);
+    q13s16 = vshlq_n_s16(q13s16, 3);
+    q14s16 = vshlq_n_s16(q14s16, 3);
+
+    q0s16 = vaddq_s16(q11s16, q12s16);
+    q2s16 = vsubq_s16(q11s16, q12s16);
+    // The accumulators for the high halves start from the same values.
+    q11s32 = q9s32;
+    q12s32 = q10s32;
+
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+    q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+    d2s16 = vshrn_n_s32(q9s32, 12);
+    d6s16 = vshrn_n_s32(q10s32, 12);
+    d3s16 = vshrn_n_s32(q11s32, 12);
+    d7s16 = vshrn_n_s32(q12s32, 12);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    // Part two: new accumulator start values, transpose, butterfly again
+    q9s32 = vdupq_n_s32(12000);
+    q10s32 = vdupq_n_s32(51000);
+
+    v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+                       vreinterpretq_s32_s16(q2s16));
+    v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+                       vreinterpretq_s32_s16(q3s16));
+    v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
+                       vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
+    v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
+                       vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3
+
+    q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    q15s16 = vdupq_n_s16(7);
+    q11s16 = vaddq_s16(q11s16, q15s16);
+    q0s16 = vaddq_s16(q11s16, q12s16);
+    q1s16 = vsubq_s16(q11s16, q12s16);
+    // The accumulators for the high halves start from the same values.
+    q11s32 = q9s32;
+    q12s32 = q10s32;
+
+    d0s16 = vget_low_s16(q0s16);
+    d1s16 = vget_high_s16(q0s16);
+    d2s16 = vget_low_s16(q1s16);
+    d3s16 = vget_high_s16(q1s16);
+
+    d0s16 = vshr_n_s16(d0s16, 4);
+    d4s16 = vshr_n_s16(d1s16, 4);
+    d2s16 = vshr_n_s16(d2s16, 4);
+    d6s16 = vshr_n_s16(d3s16, 4);
+
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+    q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+    q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+    d1s16 = vshrn_n_s32(q9s32, 16);
+    d3s16 = vshrn_n_s32(q10s32, 16);
+    d5s16 = vshrn_n_s32(q11s32, 16);
+    d7s16 = vshrn_n_s32(q12s32, 16);
+    // Mask of lanes where q14s16 != 0; subtracting it below adds 1 there.
+    qEmptys16 = vdupq_n_s16(0);
+    q14u16 = vceqq_s16(q14s16, qEmptys16);
+    q14u16 = vmvnq_u16(q14u16);
+
+    d28u16 = vget_low_u16(q14u16);
+    d29u16 = vget_high_u16(q14u16);
+    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
+    d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    vst1q_s16(output + 16, q2s16);
+    vst1q_s16(output + 24, q3s16);
+    return;
+}
diff --git a/libs/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/libs/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
new file mode 100644
index 0000000000..5ad9465002
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
@@ -0,0 +1,129 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vpx_ports/arm.h"
+/* With VPX_INCOMPATIBLE_GCC the NEON entry point just calls the C version. */
+#ifdef VPX_INCOMPATIBLE_GCC
+#include "./vp8_rtcd.h"
+void vp8_short_walsh4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+  vp8_short_walsh4x4_c(input, output, pitch);
+}
+#else
+void vp8_short_walsh4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    uint16x4_t d16u16;
+    int16x8_t q0s16, q1s16;
+    int16x4_t dEmptys16, d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int32x4_t qEmptys32, q0s32, q1s32, q2s32, q3s32, q8s32;
+    int32x4_t q9s32, q10s32, q11s32, q15s32;
+    uint32x4_t q8u32, q9u32, q10u32, q11u32;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+    // Zero vectors for the comparisons below, and the 3 added before >> 3.
+    dEmptys16 = vdup_n_s16(0);
+    qEmptys32 = vdupq_n_s32(0);
+    q15s32 = vdupq_n_s32(3);
+    // Load four rows of four int16; pitch/2 is the stride in int16_t units.
+    d0s16 = vld1_s16(input);
+    input += pitch/2;
+    d1s16 = vld1_s16(input);
+    input += pitch/2;
+    d2s16 = vld1_s16(input);
+    input += pitch/2;
+    d3s16 = vld1_s16(input);
+    // vtrn_s32 followed by vtrn_s16 transposes the 4x4 block.
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[0]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[1]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[1]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[0]);
+
+    d4s16 = vshl_n_s16(d4s16, 2);
+    d5s16 = vshl_n_s16(d5s16, 2);
+    d6s16 = vshl_n_s16(d6s16, 2);
+    d7s16 = vshl_n_s16(d7s16, 2);
+    // Mask of lanes where d4s16 != 0 (vceq against zero, then inverted).
+    d16u16 = vceq_s16(d4s16, dEmptys16);
+    d16u16 = vmvn_u16(d16u16);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d3s16 = vsub_s16(d4s16, d5s16);
+    d1s16 = vadd_s16(d7s16, d6s16);
+    d2s16 = vsub_s16(d7s16, d6s16);
+    // Subtracting the all-ones mask adds 1 to d0s16 in those lanes.
+    d0s16 = vsub_s16(d0s16, vreinterpret_s16_u16(d16u16));
+
+    // Second for-loop: transpose, then widening adds/subtracts to 32 bits
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp2.val[1]));  // d3
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp2.val[0]));  // d1
+
+    q8s32  = vaddl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+    q9s32  = vaddl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+    q10s32 = vsubl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+    q11s32 = vsubl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+
+    q0s32 = vaddq_s32(q8s32, q9s32);
+    q1s32 = vaddq_s32(q11s32, q10s32);
+    q2s32 = vsubq_s32(q11s32, q10s32);
+    q3s32 = vsubq_s32(q8s32, q9s32);
+    // Masks of negative lanes; subtracting them below adds 1 to negatives.
+    q8u32  = vcltq_s32(q0s32, qEmptys32);
+    q9u32  = vcltq_s32(q1s32, qEmptys32);
+    q10u32 = vcltq_s32(q2s32, qEmptys32);
+    q11u32 = vcltq_s32(q3s32, qEmptys32);
+
+    q8s32  = vreinterpretq_s32_u32(q8u32);
+    q9s32  = vreinterpretq_s32_u32(q9u32);
+    q10s32 = vreinterpretq_s32_u32(q10u32);
+    q11s32 = vreinterpretq_s32_u32(q11u32);
+
+    q0s32 = vsubq_s32(q0s32, q8s32);
+    q1s32 = vsubq_s32(q1s32, q9s32);
+    q2s32 = vsubq_s32(q2s32, q10s32);
+    q3s32 = vsubq_s32(q3s32, q11s32);
+    // Add 3, then shift right by 3 while narrowing back to 16 bits.
+    q8s32  = vaddq_s32(q0s32, q15s32);
+    q9s32  = vaddq_s32(q1s32, q15s32);
+    q10s32 = vaddq_s32(q2s32, q15s32);
+    q11s32 = vaddq_s32(q3s32, q15s32);
+
+    d0s16 = vshrn_n_s32(q8s32, 3);
+    d1s16 = vshrn_n_s32(q9s32, 3);
+    d2s16 = vshrn_n_s32(q10s32, 3);
+    d3s16 = vshrn_n_s32(q11s32, 3);
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    return;
+}
+#endif  // VPX_INCOMPATIBLE_GCC
diff --git a/libs/libvpx/vp8/encoder/bitstream.c b/libs/libvpx/vp8/encoder/bitstream.c
new file mode 100644
index 0000000000..f3d91b5528
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/bitstream.c
@@ -0,0 +1,1739 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/header.h"
+#include "encodemv.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/findnearmv.h"
+#include "mcomp.h"
+#include "vp8/common/systemdependent.h"
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+#include "vpx/vpx_encoder.h"
+#include "vpx_mem/vpx_mem.h"
+#include "bitstream.h"
+
+#include "defaultcoefcounts.h"
+#include "vp8/common/common.h"
+/* 128 entries: 255 for the first 56, then falling to 16. Index: TODO confirm. */
+const int vp8cx_base_skip_false_prob[128] =
+{
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    251, 248, 244, 240, 236, 232, 229, 225,
+    221, 217, 213, 208, 204, 199, 194, 190,
+    187, 183, 179, 175, 172, 168, 164, 160,
+    157, 153, 149, 145, 142, 138, 134, 130,
+    127, 124, 120, 117, 114, 110, 107, 104,
+    101, 98,  95,  92,  89,  86,  83, 80,
+    77,  74,  71,  68,  65,  62,  59, 56,
+    53,  50,  47,  44,  41,  38,  35, 32,
+    30,  28,  26,  24,  22,  20,  18, 16,
+};
+/* Optional globals, each compiled only when its guarding macro is defined. */
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+#endif
+
+#ifdef VP8_ENTROPY_STATS
+int intra_mode_stats[10][10][10];
+static unsigned int tree_update_hist [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] [2];
+extern unsigned int active_section;
+#endif
+
+#ifdef MODE_STATS
+int count_mb_seg[4] = { 0, 0, 0, 0 };
+#endif
+
+/* Write one flag bit; if set, also write new probabilities and store them in Pcur. */
+static void update_mode(
+    vp8_writer *const w,
+    int n,
+    vp8_token tok               [/* n */],
+    vp8_tree tree,
+    vp8_prob Pnew               [/* n-1 */],
+    vp8_prob Pcur               [/* n-1 */],
+    unsigned int bct            [/* n-1 */] [2],
+    const unsigned int num_events[/* n */]
+)
+{
+    unsigned int new_b = 0, old_b = 0;
+    int i = 0;
+    /* n is passed as-is and then decremented, so below it counts probabilities. */
+    vp8_tree_probs_from_distribution(
+        n--, tok, tree,
+        Pnew, bct, num_events,
+        256, 1
+    );
+    /* Sum the branch costs of bct under the new and under the current probs. */
+    do
+    {
+        new_b += vp8_cost_branch(bct[i], Pnew[i]);
+        old_b += vp8_cost_branch(bct[i], Pcur[i]);
+    }
+    while (++i < n);
+    /* Update only if the new cost plus (n << 8) is still below the old cost. */
+    if (new_b + (n << 8) < old_b)
+    {
+        int j = 0;
+
+        vp8_write_bit(w, 1);
+
+        do
+        {
+            const vp8_prob p = Pnew[j];
+            /* A zero probability is sent and stored as 1. */
+            vp8_write_literal(w, Pcur[j] = p ? p : 1, 8);
+        }
+        while (++j < n);
+    }
+    else
+        vp8_write_bit(w, 0);
+}
+/* Run update_mode for the Y modes, then for the UV modes, on cpi->common.fc. */
+static void update_mbintra_mode_probs(VP8_COMP *cpi)
+{
+    VP8_COMMON *const x = & cpi->common;
+
+    vp8_writer *const w = cpi->bc;
+
+    {
+        vp8_prob Pnew   [VP8_YMODES-1];
+        unsigned int bct [VP8_YMODES-1] [2];
+
+        update_mode(
+            w, VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree,
+            Pnew, x->fc.ymode_prob, bct, (unsigned int *)cpi->mb.ymode_count
+        );
+    }
+    {
+        vp8_prob Pnew   [VP8_UV_MODES-1];
+        unsigned int bct [VP8_UV_MODES-1] [2];
+
+        update_mode(
+            w, VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree,
+            Pnew, x->fc.uv_mode_prob, bct, (unsigned int *)cpi->mb.uv_mode_count
+        );
+    }
+}
+/* Write mode m as a token of vp8_ymode_tree using probabilities p. */
+static void write_ymode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+    vp8_write_token(bc, vp8_ymode_tree, p, vp8_ymode_encodings + m);
+}
+/* Write mode m as a token of vp8_kf_ymode_tree using probabilities p. */
+static void kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+    vp8_write_token(bc, vp8_kf_ymode_tree, p, vp8_kf_ymode_encodings + m);
+}
+/* Write mode m as a token of vp8_uv_mode_tree using probabilities p. */
+static void write_uv_mode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+    vp8_write_token(bc, vp8_uv_mode_tree, p, vp8_uv_mode_encodings + m);
+}
+
+/* Write mode m as a token of vp8_bmode_tree using probabilities p. */
+static void write_bmode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+    vp8_write_token(bc, vp8_bmode_tree, p, vp8_bmode_encodings + m);
+}
+/* Write split type x as a vp8_mbsplit_tree token using vp8_mbsplit_probs. */
+static void write_split(vp8_writer *bc, int x)
+{
+    vp8_write_token(
+        bc, vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + x
+    );
+}
+/* Encode xcount tokens starting at p into w, keeping the coder state in locals. */
+void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount)
+{
+    const TOKENEXTRA *stop = p + xcount;
+    unsigned int split;
+    unsigned int shift;
+    int count = w->count;
+    unsigned int range = w->range;
+    unsigned int lowvalue = w->lowvalue;
+    /* count, range and lowvalue are written back to w after the loop. */
+    while (p < stop)
+    {
+        const int t = p->Token;
+        vp8_token *a = vp8_coef_encodings + t;
+        const vp8_extra_bit_struct *b = vp8_extra_bits + t;
+        int i = 0;
+        const unsigned char *pp = p->context_tree;
+        int v = a->value;
+        int n = a->Len;
+        /* With skip_eob_node set, drop one bit and start at tree index 2. */
+        if (p->skip_eob_node)
+        {
+            n--;
+            i = 2;
+        }
+        /* Code the remaining n bits of v, high bit first, along vp8_coef_tree. */
+        do
+        {
+            const int bb = (v >> --n) & 1;
+            split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+            i = vp8_coef_tree[i+bb];
+
+            if (bb)
+            {
+                lowvalue += split;
+                range = range - split;
+            }
+            else
+            {
+                range = split;
+            }
+            /* Renormalise: shift range left by vp8_norm[range]. */
+            shift = vp8_norm[range];
+            range <<= shift;
+            count += shift;
+            /* Once count reaches 0 or more, one byte is written to the buffer. */
+            if (count >= 0)
+            {
+                int offset = shift - count;
+                /* Carry: wrap trailing 0xff bytes to 0, then bump the byte before. */
+                if ((lowvalue << (offset - 1)) & 0x80000000)
+                {
+                    int x = w->pos - 1;
+
+                    while (x >= 0 && w->buffer[x] == 0xff)
+                    {
+                        w->buffer[x] = (unsigned char)0;
+                        x--;
+                    }
+                    /* NOTE(review): x is -1 here if every earlier byte was 0xff. */
+                    w->buffer[x] += 1;
+                }
+
+                validate_buffer(w->buffer + w->pos,
+                                1,
+                                w->buffer_end,
+                                w->error);
+
+                w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                lowvalue <<= offset;
+                shift = count;
+                lowvalue &= 0xffffff;
+                count -= 8 ;
+            }
+
+            lowvalue <<= shift;
+        }
+        while (n);
+
+        /* Tokens with a base_val also carry Extra: L bits of e >> 1, then e & 1. */
+        if (b->base_val)
+        {
+            const int e = p->Extra, L = b->Len;
+
+            if (L)
+            {
+                const unsigned char *proba = b->prob;
+                const int v2 = e >> 1;
+                int n2 = L;              /* number of bits in v2, assumed nonzero */
+                i = 0;
+
+                do
+                {
+                    const int bb = (v2 >> --n2) & 1;
+                    split = 1 + (((range - 1) * proba[i>>1]) >> 8);
+                    i = b->tree[i+bb];
+
+                    if (bb)
+                    {
+                        lowvalue += split;
+                        range = range - split;
+                    }
+                    else
+                    {
+                        range = split;
+                    }
+
+                    shift = vp8_norm[range];
+                    range <<= shift;
+                    count += shift;
+
+                    if (count >= 0)
+                    {
+                        int offset = shift - count;
+
+                        if ((lowvalue << (offset - 1)) & 0x80000000)
+                        {
+                            int x = w->pos - 1;
+
+                            while (x >= 0 && w->buffer[x] == 0xff)
+                            {
+                                w->buffer[x] = (unsigned char)0;
+                                x--;
+                            }
+
+                            w->buffer[x] += 1;
+                        }
+
+                        validate_buffer(w->buffer + w->pos,
+                                        1,
+                                        w->buffer_end,
+                                        w->error);
+
+                        w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                        lowvalue <<= offset;
+                        shift = count;
+                        lowvalue &= 0xffffff;
+                        count -= 8 ;
+                    }
+
+                    lowvalue <<= shift;
+                }
+                while (n2);
+            }
+
+
+            {
+                /* The final bit e & 1 is coded with the range split in half. */
+                split = (range + 1) >> 1;
+
+                if (e & 1)
+                {
+                    lowvalue += split;
+                    range = range - split;
+                }
+                else
+                {
+                    range = split;
+                }
+
+                range <<= 1;
+
+                if ((lowvalue & 0x80000000))
+                {
+                    int x = w->pos - 1;
+
+                    while (x >= 0 && w->buffer[x] == 0xff)
+                    {
+                        w->buffer[x] = (unsigned char)0;
+                        x--;
+                    }
+
+                    w->buffer[x] += 1;
+
+                }
+
+                lowvalue  <<= 1;
+
+                if (!++count)
+                {
+                    count = -8;
+
+                    validate_buffer(w->buffer + w->pos,
+                                    1,
+                                    w->buffer_end,
+                                    w->error);
+
+                    w->buffer[w->pos++] = (lowvalue >> 24);
+                    lowvalue &= 0xffffff;
+                }
+            }
+
+        }
+
+        ++p;
+    }
+    /* Store the coder state back into the writer. */
+    w->count = count;
+    w->lowvalue = lowvalue;
+    w->range = range;
+
+}
+/* Store the low 24 bits of size at cx_data[0..2], least significant byte first. */
+static void write_partition_size(unsigned char *cx_data, int size)
+{
+    signed char csize;
+
+    csize = size & 0xff;
+    *cx_data = csize;
+    csize = (size >> 8) & 0xff;
+    *(cx_data + 1) = csize;
+    csize = (size >> 16) & 0xff;
+    *(cx_data + 2) = csize;
+
+}
+/* Encode num_part token partitions back to back, starting at cx_data. */
+static void pack_tokens_into_partitions(VP8_COMP *cpi, unsigned char *cx_data,
+                                          unsigned char * cx_data_end,
+                                          int num_part)
+{
+    /* Partition i uses writer cpi->bc + i + 1 and rows i, i + num_part, ... */
+    int i;
+    unsigned char *ptr = cx_data;
+    unsigned char *ptr_end = cx_data_end;
+    vp8_writer * w;
+
+    for (i = 0; i < num_part; i++)
+    {
+        int mb_row;
+
+        w = cpi->bc + i + 1;
+
+        vp8_start_encode(w, ptr, ptr_end);
+
+        for (mb_row = i; mb_row < cpi->common.mb_rows; mb_row += num_part)
+        {
+            const TOKENEXTRA *p    = cpi->tplist[mb_row].start;
+            const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+            int tokens = (int)(stop - p);
+
+            vp8_pack_tokens(w, p, tokens);
+        }
+        /* Finish this partition; the next one starts w->pos bytes further on. */
+        vp8_stop_encode(w);
+        ptr += w->pos;
+    }
+}
+
+/* Pack the tokens of every macroblock row, in row order, into the writer w. */
+#if CONFIG_MULTITHREAD
+static void pack_mb_row_tokens(VP8_COMP *cpi, vp8_writer *w)
+{
+    int mb_row;
+
+    for (mb_row = 0; mb_row < cpi->common.mb_rows; mb_row++)
+    {
+        const TOKENEXTRA *p    = cpi->tplist[mb_row].start;
+        const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+        int tokens = (int)(stop - p);
+
+        vp8_pack_tokens(w, p, tokens);
+    }
+
+}
+#endif  // CONFIG_MULTITHREAD
+/* Write mode m as entry (m - NEARESTMV) of vp8_mv_ref_encoding_array. */
+static void write_mv_ref
+(
+    vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p
+)
+{
+#if CONFIG_DEBUG
+    assert(NEARESTMV <= m  &&  m <= SPLITMV);
+#endif
+    vp8_write_token(w, vp8_mv_ref_tree, p,
+                    vp8_mv_ref_encoding_array + (m - NEARESTMV));
+}
+/* Write mode m as entry (m - LEFT4X4) of vp8_sub_mv_ref_encoding_array. */
+static void write_sub_mv_ref
+(
+    vp8_writer *w, B_PREDICTION_MODE m, const vp8_prob *p
+)
+{
+#if CONFIG_DEBUG
+    assert(LEFT4X4 <= m  &&  m <= NEW4X4);
+#endif
+    vp8_write_token(w, vp8_sub_mv_ref_tree, p,
+                    vp8_sub_mv_ref_encoding_array + (m - LEFT4X4));
+}
+/* Encode the row/col difference between mv and ref using the mvc contexts. */
+static void write_mv
+(
+    vp8_writer *w, const MV *mv, const int_mv *ref, const MV_CONTEXT *mvc
+)
+{
+    MV e;
+    e.row = mv->row - ref->as_mv.row;
+    e.col = mv->col - ref->as_mv.col;
+
+    vp8_encode_motion_vector(w, &e, mvc);
+}
+/* Write mi->segment_id (0..3) as two bits when the segment map is being updated. */
+static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACROBLOCKD *x)
+{
+    /* Encode the MB segment id: first bit uses probs[0], second probs[1] or [2]. */
+    if (x->segmentation_enabled && x->update_mb_segmentation_map)
+    {
+        switch (mi->segment_id)
+        {
+        case 0:
+            vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+            break;
+        case 1:
+            vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 1, x->mb_segment_tree_probs[1]);
+            break;
+        case 2:
+            vp8_write(w, 1, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 0, x->mb_segment_tree_probs[2]);
+            break;
+        case 3:
+            vp8_write(w, 1, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 1, x->mb_segment_tree_probs[2]);
+            break;
+
+            /* TRAP.. This should not happen; it is written the same as segment 0 */
+        default:
+            vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+            break;
+        }
+    }
+}
+void vp8_convert_rfct_to_prob(VP8_COMP *const cpi)
+{
+    const int *const rfct = cpi->mb.count_mb_ref_frame_usage;
+    const int rf_intra = rfct[INTRA_FRAME];
+    const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+    /* NOTE(review): rf_intra + rf_inter is a divisor below with no zero check. */
+    /* Calculate the probabilities used to code the ref frame based on usage; a result of 0 is raised to 1 */
+    if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter)))
+        cpi->prob_intra_coded = 1;
+    /* 128 is used when the counts forming the denominator are all zero. */
+    cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+    if (!cpi->prob_last_coded)
+        cpi->prob_last_coded = 1;
+
+    cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+                  ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+    if (!cpi->prob_gf_coded)
+        cpi->prob_gf_coded = 1;
+
+}
+
+/* Write the mode and motion-vector information of every macroblock to
+ * cpi->bc.  vp8_pack_bitstream() calls this for frames that are not key
+ * frames.
+ *
+ * Emission order, as coded below:
+ *   1. if pc->mb_no_coeff_skip is set, an 8-bit skip probability computed
+ *      from cpi->mb.skip_true_count and clamped to 1..255;
+ *   2. the three 8-bit reference-frame probabilities set up by
+ *      vp8_convert_rfct_to_prob();
+ *   3. whatever update_mbintra_mode_probs() and vp8_write_mvprobs() emit;
+ *   4. row by row, for each macroblock: optional segment features, optional
+ *      skip flag, the intra/inter flag, then the modes and - for NEWMV and
+ *      SPLITMV - the motion vectors.
+ *
+ * Side effects: sets cpi->prob_skip_false (when step 1 runs), rewinds and
+ * then advances cpi->mb.partition_info, and overwrites the mb_to_*_edge
+ * fields of cpi->mb.e_mbd.
+ */
+static void pack_inter_mode_mvs(VP8_COMP *const cpi)
+{
+    VP8_COMMON *const pc = & cpi->common;
+    vp8_writer *const w = cpi->bc;
+    const MV_CONTEXT *mvc = pc->fc.mvc;
+
+
+    MODE_INFO *m = pc->mi;
+    const int mis = pc->mode_info_stride;
+    int mb_row = -1;    /* pre-incremented by the row loop, so rows start at 0 */
+
+    int prob_skip_false = 0;
+
+    /* Restart the partition-info cursor; it is advanced in step with m. */
+    cpi->mb.partition_info = cpi->mb.pi;
+
+    vp8_convert_rfct_to_prob(cpi);
+
+#ifdef VP8_ENTROPY_STATS
+    active_section = 1;
+#endif
+
+    if (pc->mb_no_coeff_skip)
+    {
+        int total_mbs = pc->mb_rows * pc->mb_cols;
+
+        /* Share of macroblocks not counted in skip_true_count, scaled by
+         * 256 and then clamped to the range 1..255.
+         */
+        prob_skip_false = (total_mbs - cpi->mb.skip_true_count ) * 256 / total_mbs;
+
+        if (prob_skip_false <= 1)
+            prob_skip_false = 1;
+
+        if (prob_skip_false > 255)
+            prob_skip_false = 255;
+
+        cpi->prob_skip_false = prob_skip_false;
+        vp8_write_literal(w, prob_skip_false, 8);
+    }
+
+    vp8_write_literal(w, cpi->prob_intra_coded, 8);
+    vp8_write_literal(w, cpi->prob_last_coded, 8);
+    vp8_write_literal(w, cpi->prob_gf_coded, 8);
+
+    update_mbintra_mode_probs(cpi);
+
+    vp8_write_mvprobs(cpi);
+
+    while (++mb_row < pc->mb_rows)
+    {
+        int mb_col = -1;    /* pre-incremented by the column loop */
+
+        while (++mb_col < pc->mb_cols)
+        {
+            const MB_MODE_INFO *const mi = & m->mbmi;
+            const MV_REFERENCE_FRAME rf = mi->ref_frame;
+            const MB_PREDICTION_MODE mode = mi->mode;
+
+            MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+            /* Distance of Mb to the various image edges.
+             * These specified to 8th pel as they are always compared to MV
+             * values that are in 1/8th pel units
+             */
+            xd->mb_to_left_edge = -((mb_col * 16) << 3);
+            xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+            xd->mb_to_top_edge = -((mb_row * 16) << 3);
+            xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+#ifdef VP8_ENTROPY_STATS
+            active_section = 9;
+#endif
+
+            if (cpi->mb.e_mbd.update_mb_segmentation_map)
+                write_mb_features(w, mi, &cpi->mb.e_mbd);
+
+            if (pc->mb_no_coeff_skip)
+                vp8_encode_bool(w, m->mbmi.mb_skip_coeff, prob_skip_false);
+
+            if (rf == INTRA_FRAME)
+            {
+                /* 0 against prob_intra_coded marks an intra macroblock */
+                vp8_write(w, 0, cpi->prob_intra_coded);
+#ifdef VP8_ENTROPY_STATS
+                active_section = 6;
+#endif
+                write_ymode(w, mode, pc->fc.ymode_prob);
+
+                if (mode == B_PRED)
+                {
+                    int j = 0;
+
+                    /* B_PRED carries one mode for each of the 16 bmi entries */
+                    do
+                        write_bmode(w, m->bmi[j].as_mode, pc->fc.bmode_prob);
+                    while (++j < 16);
+                }
+
+                write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob);
+            }
+            else    /* inter coded */
+            {
+                int_mv best_mv;
+                vp8_prob mv_ref_p [VP8_MVREFS-1];
+
+                vp8_write(w, 1, cpi->prob_intra_coded);
+
+                /* Reference frame: 0 for LAST_FRAME; otherwise 1 followed by
+                 * 0 for GOLDEN_FRAME or 1 for any other value of rf.
+                 */
+                if (rf == LAST_FRAME)
+                    vp8_write(w, 0, cpi->prob_last_coded);
+                else
+                {
+                    vp8_write(w, 1, cpi->prob_last_coded);
+                    vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, cpi->prob_gf_coded);
+                }
+
+                {
+                    int_mv n1, n2;
+                    int ct[4];
+
+                    /* best_mv (after clamping) is the reference vector handed
+                     * to write_mv() below; ct is turned into the mode
+                     * probabilities mv_ref_p.  n1 and n2 are not used further
+                     * in this function.
+                     */
+                    vp8_find_near_mvs(xd, m, &n1, &n2, &best_mv, ct, rf, cpi->common.ref_frame_sign_bias);
+                    vp8_clamp_mv2(&best_mv, xd);
+
+                    vp8_mv_ref_probs(mv_ref_p, ct);
+
+#ifdef VP8_ENTROPY_STATS
+                    accum_mv_refs(mode, ct);
+#endif
+
+                }
+
+#ifdef VP8_ENTROPY_STATS
+                active_section = 3;
+#endif
+
+                write_mv_ref(w, mode, mv_ref_p);
+
+                switch (mode)   /* new, split require MVs */
+                {
+                case NEWMV:
+
+#ifdef VP8_ENTROPY_STATS
+                    active_section = 5;
+#endif
+
+                    write_mv(w, &mi->mv.as_mv, &best_mv, mvc);
+                    break;
+
+                case SPLITMV:
+                {
+                    int j = 0;
+
+#ifdef MODE_STATS
+                    ++count_mb_seg [mi->partitioning];
+#endif
+
+                    write_split(w, mi->partitioning);
+
+                    /* One pass per partition, up to partition_info->count */
+                    do
+                    {
+                        B_PREDICTION_MODE blockmode;
+                        int_mv blockmv;
+                        const int *const  L = vp8_mbsplits [mi->partitioning];
+                        int k = -1;  /* first block in subset j */
+                        int mv_contz;
+                        int_mv leftmv, abovemv;
+
+                        blockmode =  cpi->mb.partition_info->bmi[j].mode;
+                        blockmv =  cpi->mb.partition_info->bmi[j].mv;
+                        /* Advance k to the first index whose L entry equals j.
+                         * NOTE(review): neither form bounds k before reading
+                         * L[++k]; the debug form only asserts after the read.
+                         */
+#if CONFIG_DEBUG
+                        while (j != L[++k])
+                            if (k >= 16)
+                                assert(0);
+#else
+                        while (j != L[++k]);
+#endif
+                        /* The sub-mv mode probabilities are selected by a
+                         * context built from the left and above block vectors.
+                         */
+                        leftmv.as_int = left_block_mv(m, k);
+                        abovemv.as_int = above_block_mv(m, k, mis);
+                        mv_contz = vp8_mv_cont(&leftmv, &abovemv);
+
+                        write_sub_mv_ref(w, blockmode, vp8_sub_mv_ref_prob2 [mv_contz]);
+
+                        if (blockmode == NEW4X4)
+                        {
+#ifdef VP8_ENTROPY_STATS
+                            active_section = 11;
+#endif
+                            write_mv(w, &blockmv.as_mv, &best_mv, (const MV_CONTEXT *) mvc);
+                        }
+                    }
+                    while (++j < cpi->mb.partition_info->count);
+                }
+                break;
+                default:
+                    break;
+                }
+            }
+
+            /* Move both cursors on to the next macroblock. */
+            ++m;
+            cpi->mb.partition_info++;
+        }
+
+        /* Both arrays hold one extra entry at the end of every row. */
+        ++m;  /* skip L prediction border */
+        cpi->mb.partition_info++;
+    }
+}
+
+
+/* Write the mode information of every macroblock to cpi->bc.
+ * vp8_pack_bitstream() calls this for key frames.
+ *
+ * When c->mb_no_coeff_skip is set, an 8-bit skip probability (clamped to
+ * 1..255 and stored in cpi->prob_skip_false) is written first.  Then, row
+ * by row, each macroblock contributes: optional segment features, optional
+ * skip flag, its luma mode, sixteen sub-block modes when the luma mode is
+ * B_PRED, and its chroma mode.
+ */
+static void write_kfmodes(VP8_COMP *cpi)
+{
+    vp8_writer *const bc = cpi->bc;
+    const VP8_COMMON *const c = & cpi->common;
+    MODE_INFO *mode_info = c->mi;
+    int skip_prob = 0;
+    int row;
+
+    if (c->mb_no_coeff_skip)
+    {
+        const int mb_count = c->mb_rows * c->mb_cols;
+
+        skip_prob = (mb_count - cpi->mb.skip_true_count) * 256 / mb_count;
+
+        /* keep the probability inside 1..255 */
+        if (skip_prob < 1)
+            skip_prob = 1;
+        else if (skip_prob > 255)
+            skip_prob = 255;
+
+        cpi->prob_skip_false = skip_prob;
+        vp8_write_literal(bc, skip_prob, 8);
+    }
+
+    for (row = 0; row < c->mb_rows; ++row)
+    {
+        int col;
+
+        for (col = 0; col < c->mb_cols; ++col)
+        {
+            const int luma_mode = mode_info->mbmi.mode;
+
+            if (cpi->mb.e_mbd.update_mb_segmentation_map)
+                write_mb_features(bc, &mode_info->mbmi, &cpi->mb.e_mbd);
+
+            if (c->mb_no_coeff_skip)
+                vp8_encode_bool(bc, mode_info->mbmi.mb_skip_coeff, skip_prob);
+
+            kfwrite_ymode(bc, luma_mode, vp8_kf_ymode_prob);
+
+            if (luma_mode == B_PRED)
+            {
+                const int stride = c->mode_info_stride;
+                int blk;
+
+                /* each sub-block mode is coded with probabilities chosen
+                 * by the modes of the blocks above and to the left
+                 */
+                for (blk = 0; blk < 16; ++blk)
+                {
+                    const B_PREDICTION_MODE above = above_block_mode(mode_info, blk, stride);
+                    const B_PREDICTION_MODE left = left_block_mode(mode_info, blk);
+                    const int blk_mode = mode_info->bmi[blk].as_mode;
+
+#ifdef VP8_ENTROPY_STATS
+                    ++intra_mode_stats [above] [left] [blk_mode];
+#endif
+
+                    write_bmode(bc, blk_mode, vp8_kf_bmode_prob [above] [left]);
+                }
+            }
+
+            write_uv_mode(bc, mode_info->mbmi.uv_mode, vp8_kf_uv_mode_prob);
+
+            /* on to the next macroblock */
+            ++mode_info;
+        }
+
+        /* skip L prediction border */
+        ++mode_info;
+    }
+}
+
+#if 0
+/* This function is used for debugging probability trees.
+ *
+ * Appends the whole coef_probs table to "enc_tree_probs.txt" as nested
+ * brace-delimited lists, one innermost list per previous-coefficient
+ * context.  Does nothing if the file cannot be opened.
+ */
+static void print_prob_tree(vp8_prob
+     coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES])
+{
+    /* print coef probability tree */
+    int i,j,k,l;
+    FILE* f = fopen("enc_tree_probs.txt", "a");
+
+    /* fopen returns NULL on failure; writing through it would crash */
+    if (!f)
+        return;
+
+    fprintf(f, "{\n");
+    for (i = 0; i < BLOCK_TYPES; i++)
+    {
+        fprintf(f, "  {\n");
+        for (j = 0; j < COEF_BANDS; j++)
+        {
+            fprintf(f, "    {\n");
+            for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+            {
+                fprintf(f, "      {");
+                for (l = 0; l < ENTROPY_NODES; l++)
+                {
+                    fprintf(f, "%3u, ",
+                            (unsigned int)(coef_probs [i][j][k][l]));
+                }
+                fprintf(f, " }\n");
+            }
+            fprintf(f, "    }\n");
+        }
+        fprintf(f, "  }\n");
+    }
+    fprintf(f, "}\n");
+    fclose(f);
+}
+#endif
+
+/* Add, for each token, the counts of all previous-coefficient contexts
+ * onto out[token].
+ *
+ * probs - counts indexed [context][token]
+ * out   - running totals, one per token; updated in place, so the caller
+ *         chooses the starting values
+ * A total that would exceed UINT_MAX is pinned at UINT_MAX.
+ */
+static void sum_probs_over_prev_coef_context(
+        const unsigned int probs[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
+        unsigned int* out)
+{
+    int token;
+
+    for (token = 0; token < MAX_ENTROPY_TOKENS; ++token)
+    {
+        int ctx;
+
+        for (ctx = 0; ctx < PREV_COEF_CONTEXTS; ++ctx)
+        {
+            const unsigned int add = probs[ctx][token];
+
+            /* saturate rather than wrap around */
+            if (out[token] > UINT_MAX - add)
+                out[token] = UINT_MAX;
+            else
+                out[token] += add;
+        }
+    }
+}
+
+/* Net gain from replacing probability oldp with newp for one tree node.
+ *
+ * ct   - branch counts for the node, as passed to vp8_cost_branch()
+ * oldp - probability currently in use
+ * newp - candidate replacement
+ * upd  - probability used to code the "update" flag itself
+ *
+ * Returns the branch cost under oldp, minus the branch cost under newp,
+ * minus the cost of sending the update (8 plus the shifted difference
+ * between coding a one and a zero with upd).  A positive result means the
+ * update is worth sending.
+ */
+static int prob_update_savings(const unsigned int *ct,
+                                   const vp8_prob oldp, const vp8_prob newp,
+                                   const vp8_prob upd)
+{
+    const int cost_of_update = 8 +
+                               ((vp8_cost_one(upd) - vp8_cost_zero(upd)) >> 8);
+    const int cost_with_new = vp8_cost_branch(ct, newp);
+    const int cost_with_old = vp8_cost_branch(ct, oldp);
+
+    return cost_with_old - cost_with_new - cost_of_update;
+}
+
+/* Estimate the savings from updating the coefficient probabilities when
+ * they are constrained to be equal across the previous-coefficient
+ * contexts.
+ *
+ * For each block type and band the token counts are summed over all
+ * contexts (the default counts are used on key frames), and the resulting
+ * distribution is used for every context.  As a side effect this fills
+ * cpi->frame_coef_probs and cpi->frame_branch_ct.
+ *
+ * Returns the accumulated savings.  Off key frames only nodes with a
+ * positive total contribute; on key frames every node contributes.
+ */
+static int independent_coef_context_savings(VP8_COMP *cpi)
+{
+    MACROBLOCK *const x = & cpi->mb;
+    const int is_key_frame = (cpi->common.frame_type == KEY_FRAME);
+    int total = 0;
+    int type;
+
+    for (type = 0; type < BLOCK_TYPES; ++type)
+    {
+        int band;
+
+        for (band = 0; band < COEF_BANDS; ++band)
+        {
+            unsigned int summed_counts[MAX_ENTROPY_TOKENS] = {0};
+            int node_savings[MAX_ENTROPY_TOKENS] = {0};
+            const unsigned int (*counts)[MAX_ENTROPY_TOKENS];
+            int ctx;
+            int node;
+
+            /* Reset to default probabilities at key frames; otherwise use
+             * the counts gathered for this frame.
+             */
+            if (is_key_frame)
+                counts = default_coef_counts[type][band];
+            else
+                counts = (const unsigned int (*)[MAX_ENTROPY_TOKENS])
+                    x->coef_counts[type][band];
+
+            /* One distribution shared by all contexts of this band */
+            sum_probs_over_prev_coef_context(counts, summed_counts);
+
+            for (ctx = 0; ctx < PREV_COEF_CONTEXTS; ++ctx)
+            {
+                /* calc probs and branch cts for this frame only */
+                vp8_tree_probs_from_distribution(
+                    MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+                    cpi->frame_coef_probs[type][band][ctx],
+                    cpi->frame_branch_ct [type][band][ctx],
+                    summed_counts,
+                    256, 1);
+
+                for (node = 0; node < ENTROPY_NODES; ++node)
+                {
+                    const unsigned int *ct =
+                        cpi->frame_branch_ct [type][band][ctx][node];
+                    const vp8_prob newp =
+                        cpi->frame_coef_probs [type][band][ctx][node];
+                    const vp8_prob oldp =
+                        cpi->common.fc.coef_probs [type][band][ctx][node];
+                    const vp8_prob upd =
+                        vp8_coef_update_probs [type][band][ctx][node];
+                    const int s = prob_update_savings(ct, oldp, newp, upd);
+
+                    /* On key frames an unchanged probability is not
+                     * counted.
+                     */
+                    if (!is_key_frame || newp != oldp)
+                        node_savings[node] += s;
+                }
+            }
+
+            /* We only update probabilities if we can save bits, except
+             * for key frames where we have to update all probabilities
+             * to get the equal probabilities across the prev coef
+             * contexts.
+             */
+            for (node = 0; node < ENTROPY_NODES; ++node)
+            {
+                if (is_key_frame || node_savings[node] > 0)
+                    total += node_savings[node];
+            }
+        }
+    }
+
+    return total;
+}
+
+/* Estimate the savings from updating the coefficient probabilities, with
+ * every previous-coefficient context treated on its own.
+ *
+ * For each [type][band][context] the probabilities and branch counts are
+ * rebuilt from this frame's token counts (filling cpi->frame_coef_probs
+ * and cpi->frame_branch_ct as a side effect).  Every node whose update
+ * would have a positive saving adds that saving to the result.
+ */
+static int default_coef_context_savings(VP8_COMP *cpi)
+{
+    MACROBLOCK *const x = & cpi->mb;
+    int total = 0;
+    int type;
+
+    for (type = 0; type < BLOCK_TYPES; ++type)
+    {
+        int band;
+
+        for (band = 0; band < COEF_BANDS; ++band)
+        {
+            int ctx;
+
+            for (ctx = 0; ctx < PREV_COEF_CONTEXTS; ++ctx)
+            {
+                int node;      /* token/prob index */
+
+                /* calc probs and branch cts for this frame only */
+                vp8_tree_probs_from_distribution(
+                    MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+                    cpi->frame_coef_probs [type][band][ctx],
+                    cpi->frame_branch_ct [type][band][ctx],
+                    x->coef_counts [type][band][ctx],
+                    256, 1
+                );
+
+                for (node = 0; node < ENTROPY_NODES; ++node)
+                {
+                    const unsigned int *ct =
+                        cpi->frame_branch_ct [type][band][ctx][node];
+                    const vp8_prob newp =
+                        cpi->frame_coef_probs [type][band][ctx][node];
+                    const vp8_prob oldp =
+                        cpi->common.fc.coef_probs [type][band][ctx][node];
+                    const vp8_prob upd =
+                        vp8_coef_update_probs [type][band][ctx][node];
+                    const int s = prob_update_savings(ct, oldp, newp, upd);
+
+                    /* only updates that pay for themselves count */
+                    if (s > 0)
+                        total += s;
+                }
+            }
+        }
+    }
+
+    return total;
+}
+
+/* Fill ref_frame_cost[] with the cost of signalling each reference frame.
+ *
+ * ref_frame_cost - output, indexed by INTRA_FRAME, LAST_FRAME,
+ *                  GOLDEN_FRAME and ALTREF_FRAME
+ * prob_intra     - probability used for the intra/inter decision
+ * prob_last      - probability used for the last/other decision
+ * prob_garf      - probability used for the golden/altref decision
+ * Each probability is asserted to lie in 0..255.
+ *
+ * The costs follow the decision chain: intra is a zero against
+ * prob_intra; every inter frame pays a one against prob_intra, then a
+ * zero (last) or a one (golden/altref) against prob_last, and golden or
+ * altref finally pay a zero or a one against prob_garf.
+ */
+void vp8_calc_ref_frame_costs(int *ref_frame_cost,
+                              int prob_intra,
+                              int prob_last,
+                              int prob_garf
+                             )
+{
+    int inter_cost;     /* cost shared by all inter reference frames */
+    int not_last_cost;  /* cost shared by golden and altref */
+
+    assert(prob_intra >= 0);
+    assert(prob_intra <= 255);
+    assert(prob_last >= 0);
+    assert(prob_last <= 255);
+    assert(prob_garf >= 0);
+    assert(prob_garf <= 255);
+
+    inter_cost = vp8_cost_one(prob_intra);
+    not_last_cost = inter_cost + vp8_cost_one(prob_last);
+
+    ref_frame_cost[INTRA_FRAME]  = vp8_cost_zero(prob_intra);
+    ref_frame_cost[LAST_FRAME]   = inter_cost + vp8_cost_zero(prob_last);
+    ref_frame_cost[GOLDEN_FRAME] = not_last_cost + vp8_cost_zero(prob_garf);
+    ref_frame_cost[ALTREF_FRAME] = not_last_cost + vp8_cost_one(prob_garf);
+}
+
+/* Estimate how much would be saved by sending updated entropy
+ * probabilities for the current frame.
+ *
+ * On frames other than key frames the first term compares the cost of
+ * coding this frame's reference-frame choices with probabilities derived
+ * from its own usage counts against the cost with the probabilities
+ * currently stored in cpi (prob_intra_coded, prob_last_coded,
+ * prob_gf_coded); the difference is divided by 256.
+ *
+ * The second term is the coefficient-probability saving, computed by
+ * independent_coef_context_savings() when VPX_ERROR_RESILIENT_PARTITIONS
+ * is set in cpi->oxcf.error_resilient_mode and by
+ * default_coef_context_savings() otherwise.
+ *
+ * Returns the sum of both terms.
+ */
+int vp8_estimate_entropy_savings(VP8_COMP *cpi)
+{
+    int savings = 0;
+
+    const int *const rfct = cpi->mb.count_mb_ref_frame_usage;
+    const int rf_intra = rfct[INTRA_FRAME];
+    const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+    const int rf_total = rf_intra + rf_inter;
+    int new_intra, new_last, new_garf, oldtotal, newtotal;
+    int ref_frame_cost[MAX_REF_FRAMES];
+
+    vp8_clear_system_state();
+
+    if (cpi->common.frame_type != KEY_FRAME)
+    {
+        /* rf_total is zero when no macroblock was counted; dividing by it
+         * would be undefined, so fall back to 128 like the ratios below.
+         */
+        new_intra = rf_total ? (rf_intra * 255) / rf_total : 128;
+
+        if (!new_intra)
+            new_intra = 1;
+
+        new_last = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+        new_garf = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+                  ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+
+        /* new costs: usage counts weighted by the re-derived probabilities */
+        vp8_calc_ref_frame_costs(ref_frame_cost,new_intra,new_last,new_garf);
+
+        newtotal =
+            rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
+            rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] +
+            rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] +
+            rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME];
+
+
+        /* old costs */
+        vp8_calc_ref_frame_costs(ref_frame_cost,cpi->prob_intra_coded,
+                                 cpi->prob_last_coded,cpi->prob_gf_coded);
+
+        oldtotal =
+            rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
+            rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] +
+            rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] +
+            rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME];
+
+        savings += (oldtotal - newtotal) / 256;
+    }
+
+
+    if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+        savings += independent_coef_context_savings(cpi);
+    else
+        savings += default_coef_context_savings(cpi);
+
+
+    return savings;
+}
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+/* Compute the coefficient-probability savings for the current frame.
+ *
+ * On key frames cpi->mb.coef_counts is first overwritten with
+ * default_coef_counts.  The result comes from
+ * independent_coef_context_savings() when VPX_ERROR_RESILIENT_PARTITIONS
+ * is set in cpi->oxcf.error_resilient_mode, and from
+ * default_coef_context_savings() otherwise.
+ */
+int vp8_update_coef_context(VP8_COMP *cpi)
+{
+    if (cpi->common.frame_type == KEY_FRAME)
+    {
+        /* Reset to default counts/probabilities at key frames */
+        vp8_copy(cpi->mb.coef_counts, default_coef_counts);
+    }
+
+    return (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+           ? independent_coef_context_savings(cpi)
+           : default_coef_context_savings(cpi);
+}
+#endif
+
+/* Decide, for every coefficient probability [i][j][k][t], whether
+ * cpi->common.fc.coef_probs should be replaced by cpi->frame_coef_probs,
+ * and apply the replacement when it is chosen.
+ *
+ * When CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING is non-zero each
+ * decision is stored in cpi->update_probs; otherwise the decision flag and,
+ * for an update, the new 8-bit probability are written to cpi->bc.
+ *
+ * cpi->frame_coef_probs and cpi->frame_branch_ct are read but not
+ * recomputed here (see the inline note about
+ * vp8_estimate_entropy_savings).
+ */
+void vp8_update_coef_probs(VP8_COMP *cpi)
+{
+    int i = 0;
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+    vp8_writer *const w = cpi->bc;
+#endif
+    /* NOTE(review): savings is accumulated below but never read in this
+     * function.
+     */
+    int savings = 0;
+
+    vp8_clear_system_state();
+
+    do
+    {
+        int j = 0;
+
+        do
+        {
+            int k = 0;
+            int prev_coef_savings[ENTROPY_NODES] = {0};
+            /* With error-resilient partitions, first total each node's
+             * saving over all contexts; the per-context loop below then
+             * uses that total instead of a per-context figure.
+             */
+            if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+            {
+                for (k = 0; k < PREV_COEF_CONTEXTS; ++k)
+                {
+                    int t;      /* token/prob index */
+                    for (t = 0; t < ENTROPY_NODES; ++t)
+                    {
+                        const unsigned int *ct = cpi->frame_branch_ct [i][j]
+                                                                      [k][t];
+                        const vp8_prob newp = cpi->frame_coef_probs[i][j][k][t];
+                        const vp8_prob oldp = cpi->common.fc.coef_probs[i][j]
+                                                                       [k][t];
+                        const vp8_prob upd = vp8_coef_update_probs[i][j][k][t];
+
+                        prev_coef_savings[t] +=
+                                prob_update_savings(ct, oldp, newp, upd);
+                    }
+                }
+                /* rewind k for the per-context loop that follows */
+                k = 0;
+            }
+            do
+            {
+                /* note: use result from vp8_estimate_entropy_savings, so no
+                 * need to call vp8_tree_probs_from_distribution here.
+                 */
+
+                /* at every context */
+
+                /* calc probs and branch cts for this frame only */
+                int t = 0;      /* token/prob index */
+
+                do
+                {
+                    const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
+
+                    vp8_prob *Pold = cpi->common.fc.coef_probs [i][j][k] + t;
+                    const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+
+                    /* s starts as the cross-context total (all zeros unless
+                     * the error-resilient pass above filled it in)
+                     */
+                    int s = prev_coef_savings[t];
+                    int u = 0;      /* 1 when this probability is updated */
+
+                    if (!(cpi->oxcf.error_resilient_mode &
+                            VPX_ERROR_RESILIENT_PARTITIONS))
+                    {
+                        s = prob_update_savings(
+                                cpi->frame_branch_ct [i][j][k][t],
+                                *Pold, newp, upd);
+                    }
+
+                    if (s > 0)
+                        u = 1;
+
+                    /* Force updates on key frames if the new is different,
+                     * so that we can be sure we end up with equal probabilities
+                     * over the prev coef contexts.
+                     */
+                    if ((cpi->oxcf.error_resilient_mode &
+                            VPX_ERROR_RESILIENT_PARTITIONS) &&
+                        cpi->common.frame_type == KEY_FRAME && newp != *Pold)
+                        u = 1;
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                    cpi->update_probs[i][j][k][t] = u;
+#else
+                    vp8_write(w, u, upd);
+#endif
+
+
+#ifdef VP8_ENTROPY_STATS
+                    ++ tree_update_hist [i][j][k][t] [u];
+#endif
+
+                    if (u)
+                    {
+                        /* send/use new probability */
+
+                        *Pold = newp;
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+                        vp8_write_literal(w, newp, 8);
+#endif
+
+                        savings += s;
+
+                    }
+
+                }
+                while (++t < ENTROPY_NODES);
+
+                /* Accum token counts for generation of default statistics */
+#ifdef VP8_ENTROPY_STATS
+                t = 0;
+
+                /* NOTE(review): this reads cpi->coef_counts, while the
+                 * savings functions in this file read cpi->mb.coef_counts -
+                 * confirm that the field exists when stats are enabled.
+                 */
+                do
+                {
+                    context_counters [i][j][k][t] += cpi->coef_counts [i][j][k][t];
+                }
+                while (++t < MAX_ENTROPY_TOKENS);
+
+#endif
+
+            }
+            while (++k < PREV_COEF_CONTEXTS);
+        }
+        while (++j < COEF_BANDS);
+    }
+    while (++i < BLOCK_TYPES);
+
+}
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+/* Write the coefficient-probability update decisions stored in
+ * cpi->update_probs to cpi->bc.
+ *
+ * For every [type][band][context][node] the stored flag is written
+ * against the matching vp8_coef_update_probs entry; when the flag is set
+ * the probability from cpi->common.fc.coef_probs follows as an 8-bit
+ * literal.
+ */
+static void pack_coef_probs(VP8_COMP *cpi)
+{
+    vp8_writer *const w = cpi->bc;
+    int type;
+
+    for (type = 0; type < BLOCK_TYPES; ++type)
+    {
+        int band;
+
+        for (band = 0; band < COEF_BANDS; ++band)
+        {
+            int ctx;
+
+            for (ctx = 0; ctx < PREV_COEF_CONTEXTS; ++ctx)
+            {
+                int node;      /* token/prob index */
+
+                for (node = 0; node < ENTROPY_NODES; ++node)
+                {
+                    const vp8_prob newp =
+                        cpi->common.fc.coef_probs [type][band][ctx][node];
+                    const vp8_prob upd =
+                        vp8_coef_update_probs [type][band][ctx][node];
+                    const char send = cpi->update_probs[type][band][ctx][node];
+
+                    vp8_write(w, send, upd);
+
+                    /* send/use new probability */
+                    if (send)
+                        vp8_write_literal(w, newp, 8);
+                }
+            }
+        }
+    }
+}
+#endif
+
+#ifdef PACKET_TESTING
+/* File handle that exists only in PACKET_TESTING builds; starts out null.
+ * NOTE(review): presumably a packet log - its use is not visible in this
+ * part of the file, confirm before relying on it.
+ */
+FILE *vpxlogc = 0;
+#endif
+
+/* Write one quantizer delta to bc.
+ *
+ * A zero delta is written as a single 0 bit.  Any other value is written
+ * as a 1 bit, the magnitude as a 4-bit literal, and then a sign bit
+ * (1 for negative, 0 for positive).
+ */
+static void put_delta_q(vp8_writer *bc, int delta_q)
+{
+    if (delta_q == 0)
+    {
+        /* nothing to send beyond the "no delta" flag */
+        vp8_write_bit(bc, 0);
+        return;
+    }
+
+    vp8_write_bit(bc, 1);
+    vp8_write_literal(bc, abs(delta_q), 4);
+    vp8_write_bit(bc, (delta_q < 0) ? 1 : 0);
+}
+
+void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest_end, unsigned long *size)
+{
+    int i, j;
+    VP8_HEADER oh;
+    VP8_COMMON *const pc = & cpi->common;
+    vp8_writer *const bc = cpi->bc;
+    MACROBLOCKD *const xd = & cpi->mb.e_mbd;
+    int extra_bytes_packed = 0;
+
+    unsigned char *cx_data = dest;
+    unsigned char *cx_data_end = dest_end;
+    const int *mb_feature_data_bits;
+
+    oh.show_frame = (int) pc->show_frame;
+    oh.type = (int)pc->frame_type;
+    oh.version = pc->version;
+    oh.first_partition_length_in_bytes = 0;
+
+    mb_feature_data_bits = vp8_mb_feature_data_bits;
+
+    bc[0].error = &pc->error;
+
+    validate_buffer(cx_data, 3, cx_data_end, &cpi->common.error);
+    cx_data += 3;
+
+#if defined(SECTIONBITS_OUTPUT)
+    Sectionbits[active_section = 1] += sizeof(VP8_HEADER) * 8 * 256;
+#endif
+
+    /* every keyframe send startcode, width, height, scale factor, clamp
+     * and color type
+     */
+    if (oh.type == KEY_FRAME)
+    {
+        int v;
+
+        validate_buffer(cx_data, 7, cx_data_end, &cpi->common.error);
+
+        /* Start / synch code */
+        cx_data[0] = 0x9D;
+        cx_data[1] = 0x01;
+        cx_data[2] = 0x2a;
+
+        v = (pc->horiz_scale << 14) | pc->Width;
+        cx_data[3] = v;
+        cx_data[4] = v >> 8;
+
+        v = (pc->vert_scale << 14) | pc->Height;
+        cx_data[5] = v;
+        cx_data[6] = v >> 8;
+
+
+        extra_bytes_packed = 7;
+        cx_data += extra_bytes_packed ;
+
+        vp8_start_encode(bc, cx_data, cx_data_end);
+
+        /* signal clr type */
+        vp8_write_bit(bc, 0);
+        vp8_write_bit(bc, pc->clamp_type);
+
+    }
+    else
+        vp8_start_encode(bc, cx_data, cx_data_end);
+
+
+    /* Signal whether or not Segmentation is enabled */
+    vp8_write_bit(bc, xd->segmentation_enabled);
+
+    /*  Indicate which features are enabled */
+    if (xd->segmentation_enabled)
+    {
+        /* Signal whether or not the segmentation map is being updated. */
+        vp8_write_bit(bc, xd->update_mb_segmentation_map);
+        vp8_write_bit(bc, xd->update_mb_segmentation_data);
+
+        if (xd->update_mb_segmentation_data)
+        {
+            signed char Data;
+
+            vp8_write_bit(bc, xd->mb_segement_abs_delta);
+
+            /* For each segmentation feature (Quant and loop filter level) */
+            for (i = 0; i < MB_LVL_MAX; i++)
+            {
+                /* For each of the segments */
+                for (j = 0; j < MAX_MB_SEGMENTS; j++)
+                {
+                    Data = xd->segment_feature_data[i][j];
+
+                    /* Frame level data */
+                    if (Data)
+                    {
+                        vp8_write_bit(bc, 1);
+
+                        if (Data < 0)
+                        {
+                            Data = - Data;
+                            vp8_write_literal(bc, Data, mb_feature_data_bits[i]);
+                            vp8_write_bit(bc, 1);
+                        }
+                        else
+                        {
+                            vp8_write_literal(bc, Data, mb_feature_data_bits[i]);
+                            vp8_write_bit(bc, 0);
+                        }
+                    }
+                    else
+                        vp8_write_bit(bc, 0);
+                }
+            }
+        }
+
+        if (xd->update_mb_segmentation_map)
+        {
+            /* Write the probs used to decode the segment id for each mb */
+            for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+            {
+                int Data = xd->mb_segment_tree_probs[i];
+
+                if (Data != 255)
+                {
+                    vp8_write_bit(bc, 1);
+                    vp8_write_literal(bc, Data, 8);
+                }
+                else
+                    vp8_write_bit(bc, 0);
+            }
+        }
+    }
+
+    vp8_write_bit(bc, pc->filter_type);
+    vp8_write_literal(bc, pc->filter_level, 6);
+    vp8_write_literal(bc, pc->sharpness_level, 3);
+
+    /* Write out loop filter deltas applied at the MB level based on mode
+     * or ref frame (if they are enabled).
+     */
+    vp8_write_bit(bc, xd->mode_ref_lf_delta_enabled);
+
+    if (xd->mode_ref_lf_delta_enabled)
+    {
+        /* Do the deltas need to be updated */
+        int send_update = xd->mode_ref_lf_delta_update
+                          || cpi->oxcf.error_resilient_mode;
+
+        vp8_write_bit(bc, send_update);
+        if (send_update)
+        {
+            int Data;
+
+            /* Send update */
+            for (i = 0; i < MAX_REF_LF_DELTAS; i++)
+            {
+                Data = xd->ref_lf_deltas[i];
+
+                /* Frame level data */
+                if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]
+                    || cpi->oxcf.error_resilient_mode)
+                {
+                    xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i];
+                    vp8_write_bit(bc, 1);
+
+                    if (Data > 0)
+                    {
+                        vp8_write_literal(bc, (Data & 0x3F), 6);
+                        vp8_write_bit(bc, 0);    /* sign */
+                    }
+                    else
+                    {
+                        Data = -Data;
+                        vp8_write_literal(bc, (Data & 0x3F), 6);
+                        vp8_write_bit(bc, 1);    /* sign */
+                    }
+                }
+                else
+                    vp8_write_bit(bc, 0);
+            }
+
+            /* Send update */
+            for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+            {
+                Data = xd->mode_lf_deltas[i];
+
+                if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i]
+                    || cpi->oxcf.error_resilient_mode)
+                {
+                    xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i];
+                    vp8_write_bit(bc, 1);
+
+                    if (Data > 0)
+                    {
+                        vp8_write_literal(bc, (Data & 0x3F), 6);
+                        vp8_write_bit(bc, 0);    /* sign */
+                    }
+                    else
+                    {
+                        Data = -Data;
+                        vp8_write_literal(bc, (Data & 0x3F), 6);
+                        vp8_write_bit(bc, 1);    /* sign */
+                    }
+                }
+                else
+                    vp8_write_bit(bc, 0);
+            }
+        }
+    }
+
+    /* signal here is multi token partition is enabled */
+    vp8_write_literal(bc, pc->multi_token_partition, 2);
+
+    /* Frame Qbaseline quantizer index */
+    vp8_write_literal(bc, pc->base_qindex, 7);
+
+    /* Transmit Dc, Second order and Uv quantizer delta information */
+    put_delta_q(bc, pc->y1dc_delta_q);
+    put_delta_q(bc, pc->y2dc_delta_q);
+    put_delta_q(bc, pc->y2ac_delta_q);
+    put_delta_q(bc, pc->uvdc_delta_q);
+    put_delta_q(bc, pc->uvac_delta_q);
+
+    /* When there is a key frame all reference buffers are updated using
+     * the new key frame
+     */
+    if (pc->frame_type != KEY_FRAME)
+    {
+        /* Should the GF or ARF be updated using the transmitted frame
+         * or buffer
+         */
+        vp8_write_bit(bc, pc->refresh_golden_frame);
+        vp8_write_bit(bc, pc->refresh_alt_ref_frame);
+
+        /* If not being updated from current frame should either GF or ARF
+         * be updated from another buffer
+         */
+        if (!pc->refresh_golden_frame)
+            vp8_write_literal(bc, pc->copy_buffer_to_gf, 2);
+
+        if (!pc->refresh_alt_ref_frame)
+            vp8_write_literal(bc, pc->copy_buffer_to_arf, 2);
+
+        /* Indicate reference frame sign bias for Golden and ARF frames
+         * (always 0 for last frame buffer)
+         */
+        vp8_write_bit(bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
+        vp8_write_bit(bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
+    }
+
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+    if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+    {
+        if (pc->frame_type == KEY_FRAME)
+            pc->refresh_entropy_probs = 1;
+        else
+            pc->refresh_entropy_probs = 0;
+    }
+#endif
+
+    vp8_write_bit(bc, pc->refresh_entropy_probs);
+
+    if (pc->frame_type != KEY_FRAME)
+        vp8_write_bit(bc, pc->refresh_last_frame);
+
+#ifdef VP8_ENTROPY_STATS
+
+    if (pc->frame_type == INTER_FRAME)
+        active_section = 0;
+    else
+        active_section = 7;
+
+#endif
+
+    vp8_clear_system_state();
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+    pack_coef_probs(cpi);
+#else
+    if (pc->refresh_entropy_probs == 0)
+    {
+        /* save a copy for later refresh */
+        memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
+    }
+
+    vp8_update_coef_probs(cpi);
+#endif
+
+#ifdef VP8_ENTROPY_STATS
+    active_section = 2;
+#endif
+
+    /* Write out the mb_no_coeff_skip flag */
+    vp8_write_bit(bc, pc->mb_no_coeff_skip);
+
+    if (pc->frame_type == KEY_FRAME)
+    {
+        write_kfmodes(cpi);
+
+#ifdef VP8_ENTROPY_STATS
+        active_section = 8;
+#endif
+    }
+    else
+    {
+        pack_inter_mode_mvs(cpi);
+
+#ifdef VP8_ENTROPY_STATS
+        active_section = 1;
+#endif
+    }
+
+    vp8_stop_encode(bc);
+
+    cx_data += bc->pos;
+
+    oh.first_partition_length_in_bytes = cpi->bc->pos;
+
+    /* update frame tag */
+    {
+        int v = (oh.first_partition_length_in_bytes << 5) |
+                (oh.show_frame << 4) |
+                (oh.version << 1) |
+                oh.type;
+
+        dest[0] = v;
+        dest[1] = v >> 8;
+        dest[2] = v >> 16;
+    }
+
+    *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc->pos;
+
+    cpi->partition_sz[0] = *size;
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+    {
+        const int num_part = (1 << pc->multi_token_partition);
+        unsigned char * dp = cpi->partition_d[0] + cpi->partition_sz[0];
+
+        if (num_part > 1)
+        {
+            /* write token part sizes (all but last) if more than 1 */
+            validate_buffer(dp, 3 * (num_part - 1), cpi->partition_d_end[0],
+                            &pc->error);
+
+            cpi->partition_sz[0] += 3*(num_part-1);
+
+            for(i = 1; i < num_part; i++)
+            {
+                write_partition_size(dp, cpi->partition_sz[i]);
+                dp += 3;
+            }
+        }
+
+        if (!cpi->output_partition)
+        {
+            /* concatenate partition buffers */
+            for(i = 0; i < num_part; i++)
+            {
+                memmove(dp, cpi->partition_d[i+1], cpi->partition_sz[i+1]);
+                cpi->partition_d[i+1] = dp;
+                dp += cpi->partition_sz[i+1];
+            }
+        }
+
+        /* update total size */
+        *size = 0;
+        for(i = 0; i < num_part+1; i++)
+        {
+            *size += cpi->partition_sz[i];
+        }
+    }
+#else
+    if (pc->multi_token_partition != ONE_PARTITION)
+    {
+        int num_part = 1 << pc->multi_token_partition;
+
+        /* partition size table at the end of first partition */
+        cpi->partition_sz[0] += 3 * (num_part - 1);
+        *size += 3 * (num_part - 1);
+
+        validate_buffer(cx_data, 3 * (num_part - 1), cx_data_end,
+                        &pc->error);
+
+        for(i = 1; i < num_part + 1; i++)
+        {
+            cpi->bc[i].error = &pc->error;
+        }
+
+        pack_tokens_into_partitions(cpi, cx_data + 3 * (num_part - 1),
+                                    cx_data_end, num_part);
+
+        for(i = 1; i < num_part; i++)
+        {
+            cpi->partition_sz[i] = cpi->bc[i].pos;
+            write_partition_size(cx_data, cpi->partition_sz[i]);
+            cx_data += 3;
+            *size += cpi->partition_sz[i]; /* add to total */
+        }
+
+        /* add last partition to total size */
+        cpi->partition_sz[i] = cpi->bc[i].pos;
+        *size += cpi->partition_sz[i];
+    }
+    else
+    {
+        bc[1].error = &pc->error;
+
+        vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end);
+
+#if CONFIG_MULTITHREAD
+        if (cpi->b_multi_threaded)
+            pack_mb_row_tokens(cpi, &cpi->bc[1]);
+        else
+#endif  // CONFIG_MULTITHREAD
+            vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
+
+        vp8_stop_encode(&cpi->bc[1]);
+
+        *size += cpi->bc[1].pos;
+        cpi->partition_sz[1] = cpi->bc[1].pos;
+    }
+#endif
+}
+
+#ifdef VP8_ENTROPY_STATS
+void print_tree_update_probs()
+{
+    /* Debug helper (VP8_ENTROPY_STATS builds): appends a C initializer named
+     * tree_update_probs, derived from tree_update_hist, to "context.c". */
+    int i, j, k, l;
+    FILE *f = fopen("context.c", "a");
+    if (!f) /* cannot open the dump file: skip instead of crashing in fprintf */
+        return;
+
+    fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
+    fprintf(f, "const vp8_prob tree_update_probs[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {\n");
+
+    for (i = 0; i < BLOCK_TYPES; i++)
+    {
+        fprintf(f, "  { \n");
+
+        for (j = 0; j < COEF_BANDS; j++)
+        {
+            fprintf(f, "    {\n");
+
+            for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+            {
+                fprintf(f, "      {");
+
+                for (l = 0; l < ENTROPY_NODES; l++)
+                {
+                    /* Work in long so every value handed to "%3ld" is a long. */
+                    long n0 = (long)tree_update_hist[i][j][k][l][0];
+                    long sum = n0 + (long)tree_update_hist[i][j][k][l][1];
+                    long prob = 128; /* printed when no samples were counted */
+                    if (sum > 0) /* n0 scaled by 255/sum; a zero result prints 1 */
+                        prob = ((n0 * 255) / sum > 0) ? (n0 * 255) / sum : 1;
+                    fprintf(f, "%3ld, ", prob);
+                }
+
+                fprintf(f, "},\n");
+            }
+
+            fprintf(f, "    },\n");
+        }
+
+        fprintf(f, "  },\n");
+    }
+
+    fprintf(f, "};\n");
+    fclose(f);
+}
+#endif
diff --git a/libs/libvpx/vp8/encoder/bitstream.h b/libs/libvpx/vp8/encoder/bitstream.h
new file mode 100644
index 0000000000..de69805513
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/bitstream.h
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_BITSTREAM_H_
+#define VP8_ENCODER_BITSTREAM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount); /* NOTE: this header includes nothing itself; vp8_writer and TOKENEXTRA must already be declared by the includer. bitstream.c calls this as (&cpi->bc[1], cpi->tok, cpi->tok_count). */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_BITSTREAM_H_
diff --git a/libs/libvpx/vp8/encoder/block.h b/libs/libvpx/vp8/encoder/block.h
new file mode 100644
index 0000000000..248e79549b
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/block.h
@@ -0,0 +1,175 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_BLOCK_H_
+#define VP8_ENCODER_BLOCK_H_
+
+#include "vp8/common/onyx.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/entropymv.h"
+#include "vp8/common/entropy.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_MODES 20
+#define MAX_ERROR_BINS 1024
+
+/* motion search site */
+typedef struct
+{
+    MV mv;      /* motion vector associated with this search site */
+    int offset; /* presumably the buffer offset that corresponds to mv - TODO confirm */
+} search_site;
+
+typedef struct block
+{   /* Encoder-side description of one block; MACROBLOCK holds 25 of these in block[]. */
+    /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
+    short *src_diff; /* presumably points into MACROBLOCK.src_diff - TODO confirm */
+    short *coeff;    /* presumably points into MACROBLOCK.coeff - TODO confirm */
+
+    /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
+    short *quant;
+    short *quant_fast;
+    short *quant_shift;
+    short *zbin;
+    short *zrun_zbin_boost;
+    short *round;
+
+    /* Zbin Over Quant value */
+    short zbin_extra;
+
+    unsigned char **base_src; /* NOTE(review): pointer to a source base pointer; src looks like an offset from it - confirm in users */
+    int src;
+    int src_stride;
+} BLOCK;
+
+typedef struct
+{   /* Holds up to 16 (mode, mv) pairs; MACROBLOCK keeps three pointers to this type. */
+    int count; /* presumably the number of bmi[] entries in use - TODO confirm */
+    struct
+    {
+        B_PREDICTION_MODE mode;
+        int_mv mv;
+    } bmi[16];
+} PARTITION_INFO;
+
+typedef struct macroblock
+{   /* Encoder working state for a macroblock: sample buffers, cost tables, counters and transform hooks. */
+    DECLARE_ALIGNED(16, short, src_diff[400]); /* 25 blocks Y,U,V,Y2 (25 * 16 = 400 entries) */
+    DECLARE_ALIGNED(16, short, coeff[400]); /* 25 blocks Y,U,V,Y2 (25 * 16 = 400 entries) */
+    DECLARE_ALIGNED(16, unsigned char, thismb[256]); /* 256 bytes; presumably a 16x16 copy of the current MB - TODO confirm */
+
+    unsigned char *thismb_ptr;
+    /* 16 Y, 4 U, 4 V, 1 DC 2nd order block */
+    BLOCK block[25];
+
+    YV12_BUFFER_CONFIG src;
+
+    MACROBLOCKD e_mbd;
+    PARTITION_INFO *partition_info; /* work pointer */
+    PARTITION_INFO *pi;   /* Corresponds to upper left visible macroblock */
+    PARTITION_INFO *pip;  /* Base of allocated array */
+
+    int ref_frame_cost[MAX_REF_FRAMES];
+
+    search_site *ss; /* motion search sites; ss_count is presumably the array length - TODO confirm */
+    int ss_count;
+    int searches_per_step;
+
+    int errorperbit;
+    int sadperbit16;
+    int sadperbit4;
+    int rddiv;
+    int rdmult;
+    unsigned int * mb_activity_ptr;
+    int * mb_norm_activity_ptr;
+    signed int act_zbin_adj;
+    signed int last_act_zbin_adj;
+
+    int *mvcost[2];
+    int *mvsadcost[2];
+    int (*mbmode_cost)[MB_MODE_COUNT]; /* pointer to rows of MB_MODE_COUNT ints */
+    int (*intra_uv_mode_cost)[MB_MODE_COUNT];
+    int (*bmode_costs)[10][10];
+    int *inter_bmode_costs;
+    int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS]
+    [MAX_ENTROPY_TOKENS];
+
+    /* These define limits to motion vector components to prevent
+     * them from extending outside the UMV borders.
+     */
+    int mv_col_min;
+    int mv_col_max;
+    int mv_row_min;
+    int mv_row_max;
+
+    int skip;
+
+    unsigned int encode_breakout;
+
+    signed char *gf_active_ptr;
+
+    unsigned char *active_ptr;
+    MV_CONTEXT *mvc;
+
+    int optimize;
+    int q_index;
+    int is_skin;
+    int denoise_zeromv;
+
+#if CONFIG_TEMPORAL_DENOISING
+    int increase_denoising; /* the fields in this #if exist only when CONFIG_TEMPORAL_DENOISING is non-zero */
+    MB_PREDICTION_MODE best_sse_inter_mode;
+    int_mv best_sse_mv;
+    MV_REFERENCE_FRAME best_reference_frame;
+    MV_REFERENCE_FRAME best_zeromv_reference_frame;
+    unsigned char need_to_clamp_best_mvs;
+#endif
+
+    int skip_true_count;
+    unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+    unsigned int MVcount [2] [MVvals];  /* (row,col) MV cts this frame */
+    int ymode_count [VP8_YMODES];        /* intra MB type cts this frame */
+    int uv_mode_count[VP8_UV_MODES];     /* intra MB type cts this frame */
+    int64_t prediction_error;
+    int64_t intra_error;
+    int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+
+    int rd_thresh_mult[MAX_MODES]; /* MAX_MODES is 20 (defined at the top of this header) */
+    int rd_threshes[MAX_MODES];
+    unsigned int mbs_tested_so_far;
+    unsigned int mode_test_hit_counts[MAX_MODES];
+    int zbin_mode_boost_enabled;
+    int zbin_mode_boost;
+    int last_zbin_mode_boost;
+
+    int last_zbin_over_quant;
+    int zbin_over_quant;
+    int error_bins[MAX_ERROR_BINS]; /* MAX_ERROR_BINS is 1024 (defined at the top of this header) */
+
+    void (*short_fdct4x4)(short *input, short *output, int pitch); /* same signature as vp8_short_fdct4x4_c in dct.c */
+    void (*short_fdct8x4)(short *input, short *output, int pitch); /* same signature as vp8_short_fdct8x4_c in dct.c */
+    void (*short_walsh4x4)(short *input, short *output, int pitch); /* same signature as vp8_short_walsh4x4_c in dct.c */
+    void (*quantize_b)(BLOCK *b, BLOCKD *d);
+
+    unsigned int mbs_zero_last_dot_suppress;
+    int zero_last_dot_suppress;
+} MACROBLOCK;
+
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_BLOCK_H_
diff --git a/libs/libvpx/vp8/encoder/boolhuff.c b/libs/libvpx/vp8/encoder/boolhuff.c
new file mode 100644
index 0000000000..3b0c03a142
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/boolhuff.c
@@ -0,0 +1,70 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "boolhuff.h"
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+
+#endif
+
+#ifdef VP8_ENTROPY_STATS
+unsigned int active_section = 0;
+#endif
+
+const unsigned int vp8_prob_cost[256] =
+{   /* Indexed by probability 0..255 (vp8_encode_bool adds [probability] or [255-probability] to Sectionbits); units presumably 1/256 bit - TODO confirm. */
+    2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+    1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
+    767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
+    617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
+    511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
+    428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
+    361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
+    304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
+    255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
+    211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
+    172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
+    137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
+    105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
+    75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
+    48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
+    22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
+};
+
+void vp8_start_encode(BOOL_CODER *br, unsigned char *source, unsigned char *source_end)
+{
+    /* Attach the caller's output buffer bounds, then reset the coder state. */
+    br->buffer     = source;
+    br->buffer_end = source_end;
+    br->pos        = 0;
+    br->lowvalue   = 0;
+    br->range      = 255;
+    br->count      = -24;
+}
+
+void vp8_stop_encode(BOOL_CODER *br)
+{
+    int remaining = 32;
+    /* Push 32 zero bits, each at probability 128, through the coder. */
+    while (remaining-- > 0)
+        vp8_encode_bool(br, 0, 128);
+}
+
+
+void vp8_encode_value(BOOL_CODER *br, int data, int bits)
+{
+    int shift = bits;
+
+    /* Emit the low `bits` bits of data, highest first, each at probability 0x80. */
+    while (--shift >= 0)
+        vp8_encode_bool(br, (data >> shift) & 1, 0x80);
+}
diff --git a/libs/libvpx/vp8/encoder/boolhuff.h b/libs/libvpx/vp8/encoder/boolhuff.h
new file mode 100644
index 0000000000..7c012a8296
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/boolhuff.h
@@ -0,0 +1,132 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     boolhuff.h
+*
+*   Description  :     Bool Coder header file.
+*
+****************************************************************************/
+#ifndef VP8_ENCODER_BOOLHUFF_H_
+#define VP8_ENCODER_BOOLHUFF_H_
+
+#include "vpx_ports/mem.h"
+#include "vpx/internal/vpx_codec_internal.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct
+{   /* State of the boolean entropy encoder driven by vp8_encode_bool. */
+    unsigned int lowvalue;      /* low end of the coding interval; vp8_start_encode resets it to 0 */
+    unsigned int range;         /* current interval width; vp8_start_encode sets it to 255 */
+    int count;                  /* starts at -24; vp8_encode_bool writes a byte once it reaches >= 0 */
+    unsigned int pos;           /* index into buffer[] of the next byte to be written */
+    unsigned char *buffer;      /* output buffer handed to vp8_start_encode */
+    unsigned char *buffer_end;  /* bound passed to validate_buffer before each byte is written */
+    struct vpx_internal_error_info *error; /* passed to validate_buffer; vp8_start_encode does not set it, the caller must */
+} BOOL_CODER;
+
+extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer, unsigned char *buffer_end);
+
+extern void vp8_encode_value(BOOL_CODER *br, int data, int bits);
+extern void vp8_stop_encode(BOOL_CODER *bc);
+extern const unsigned int vp8_prob_cost[256];
+
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
+
+static int validate_buffer(const unsigned char *start,
+                           size_t               len,
+                           const unsigned char *end,
+                           struct vpx_internal_error_info *error)
+{
+    /* Returns 1 only if start+len lies beyond start and strictly before end. */
+    const unsigned char *const stop = start + len;
+    const int fits = (stop > start) && (stop < end);
+    if (!fits)
+        vpx_internal_error(error, VPX_CODEC_CORRUPT_FRAME,
+            "Truncated packet or corrupt partition ");
+    return fits;
+}
+static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability)
+{   /* Encode one binary decision `bit` with the given probability (scaled by 1/256), updating *br and writing at most one byte. */
+    unsigned int split;
+    int count = br->count;
+    unsigned int range = br->range;
+    unsigned int lowvalue = br->lowvalue;
+    register unsigned int shift;
+
+#ifdef VP8_ENTROPY_STATS
+#if defined(SECTIONBITS_OUTPUT)
+
+    if (bit) /* statistics only: accumulate the table cost of this decision per section */
+        Sectionbits[active_section] += vp8_prob_cost[255-probability];
+    else
+        Sectionbits[active_section] += vp8_prob_cost[probability];
+
+#endif
+#endif
+
+    split = 1 + (((range - 1) * probability) >> 8); /* cut the interval in proportion to probability/256 */
+
+    range = split; /* bit == 0 keeps the lower part of the interval */
+
+    if (bit)
+    {
+        lowvalue += split; /* bit != 0 takes the upper part: move the base up, keep the remainder */
+        range = br->range - split;
+    }
+
+    shift = vp8_norm[range]; /* table lookup; presumably the left shift that renormalises range - TODO confirm against vp8_norm */
+
+    range <<= shift;
+    count += shift;
+
+    if (count >= 0) /* enough bits accumulated: flush one byte to the buffer */
+    {
+        int offset = shift - count;
+
+        if ((lowvalue << (offset - 1)) & 0x80000000) /* carry out of lowvalue: add 1 to the bytes already written */
+        {
+            int x = br->pos - 1;
+
+            while (x >= 0 && br->buffer[x] == 0xff) /* 0xff + 1 wraps to 0 and the carry moves one byte back */
+            {
+                br->buffer[x] = (unsigned char)0;
+                x--;
+            }
+
+            br->buffer[x] += 1; /* NOTE(review): not guarded by x >= 0; if the loop ran off the front this writes buffer[-1] - confirm it cannot happen */
+        }
+
+        validate_buffer(br->buffer + br->pos, 1, br->buffer_end, br->error); /* return value ignored; NOTE(review): relies on vpx_internal_error not returning - confirm */
+        br->buffer[br->pos++] = (lowvalue >> (24 - offset));
+
+        lowvalue <<= offset;
+        shift = count; /* only the leftover shift is applied after the flush */
+        lowvalue &= 0xffffff; /* keep the low 24 bits */
+        count -= 8 ;
+    }
+
+    lowvalue <<= shift;
+    br->count = count; /* write the updated local copies back to the coder */
+    br->lowvalue = lowvalue;
+    br->range = range;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_BOOLHUFF_H_
diff --git a/libs/libvpx/vp8/encoder/dct.c b/libs/libvpx/vp8/encoder/dct.c
new file mode 100644
index 0000000000..0c7198d5d3
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/dct.c
@@ -0,0 +1,118 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+
+#include "./vp8_rtcd.h"
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+{   /* Forward 4x4 transform in fixed point: reads 4 rows of 4 shorts from input, writes 16 coefficients to output[0..15]. */
+    int i;
+    int a1, b1, c1, d1;
+    short *ip = input;
+    short *op = output;
+
+    for (i = 0; i < 4; i++) /* pass 1: one input row -> one output row of 4 */
+    {
+        a1 = ((ip[0] + ip[3]) * 8); /* sums and differences of mirrored samples, scaled by 8 */
+        b1 = ((ip[1] + ip[2]) * 8);
+        c1 = ((ip[1] - ip[2]) * 8);
+        d1 = ((ip[0] - ip[3]) * 8);
+
+        op[0] = a1 + b1;
+        op[2] = a1 - b1;
+
+        op[1] = (c1 * 2217 + d1 * 5352 +  14500)>>12; /* integer multipliers with a bias term, then >> 12 */
+        op[3] = (d1 * 2217 - c1 * 5352 +   7500)>>12;
+
+        ip += pitch / 2; /* advance one row: pitch/2 shorts, so pitch is presumably in bytes - TODO confirm */
+        op += 4;
+
+    }
+    ip = output; /* pass 2 works in place on the pass-1 result */
+    op = output;
+    for (i = 0; i < 4; i++) /* pass 2: one column per iteration (elements 0, 4, 8, 12) */
+    {
+        a1 = ip[0] + ip[12]; /* all four inputs of the column are read before any is overwritten */
+        b1 = ip[4] + ip[8];
+        c1 = ip[4] - ip[8];
+        d1 = ip[0] - ip[12];
+
+        op[0]  = ( a1 + b1 + 7)>>4; /* bias of 7, then >> 4 */
+        op[8]  = ( a1 - b1 + 7)>>4;
+
+        op[4]  =((c1 * 2217 + d1 * 5352 +  12000)>>16) + (d1!=0); /* adds 1 whenever d1 is non-zero */
+        op[12] = (d1 * 2217 - c1 * 5352 +  51000)>>16;
+
+        ip++;
+        op++;
+    }
+}
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
+{
+    vp8_short_fdct4x4_c(&input[0], &output[0],  pitch); /* samples 0-3 of each row -> output[0..15]  */
+    vp8_short_fdct4x4_c(&input[4], &output[16], pitch); /* samples 4-7 of each row -> output[16..31] */
+}
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
+{   /* Forward 4x4 Walsh-type transform (adds/subtracts only): reads 4 rows of 4 shorts, writes output[0..15]. */
+    int i;
+    int a1, b1, c1, d1;
+    int a2, b2, c2, d2;
+    short *ip = input;
+    short *op = output;
+
+
+    for (i = 0; i < 4; i++) /* pass 1: one input row -> one output row of 4 */
+    {
+        a1 = ((ip[0] + ip[2]) * 4); /* pairwise sums and differences, scaled by 4 */
+        d1 = ((ip[1] + ip[3]) * 4);
+        c1 = ((ip[1] - ip[3]) * 4);
+        b1 = ((ip[0] - ip[2]) * 4);
+
+        op[0] = a1 + d1 + (a1!=0); /* adds 1 whenever a1 is non-zero */
+        op[1] = b1 + c1;
+        op[2] = b1 - c1;
+        op[3] = a1 - d1;
+        ip += pitch / 2; /* advance one row: pitch/2 shorts, so pitch is presumably in bytes - TODO confirm */
+        op += 4;
+    }
+
+    ip = output; /* pass 2 works in place on the pass-1 result */
+    op = output;
+
+    for (i = 0; i < 4; i++) /* pass 2: one column per iteration (elements 0, 4, 8, 12) */
+    {
+        a1 = ip[0] + ip[8]; /* all four inputs of the column are read before any is overwritten */
+        d1 = ip[4] + ip[12];
+        c1 = ip[4] - ip[12];
+        b1 = ip[0] - ip[8];
+
+        a2 = a1 + d1;
+        b2 = b1 + c1;
+        c2 = b1 - c1;
+        d2 = a1 - d1;
+
+        a2 += a2<0; /* add 1 to negative values before the biased shift below */
+        b2 += b2<0;
+        c2 += c2<0;
+        d2 += d2<0;
+
+        op[0] = (a2+3) >> 3; /* bias of 3, then >> 3 */
+        op[4] = (b2+3) >> 3;
+        op[8] = (c2+3) >> 3;
+        op[12]= (d2+3) >> 3;
+
+        ip++;
+        op++;
+    }
+}
diff --git a/libs/libvpx/vp8/encoder/dct_value_cost.h b/libs/libvpx/vp8/encoder/dct_value_cost.h
new file mode 100644
index 0000000000..1cd3eec84a
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/dct_value_cost.h
@@ -0,0 +1,371 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_DCT_VALUE_COST_H_
+#define VP8_ENCODER_DCT_VALUE_COST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Generated file, included by tokenize.c  */
+/* Values generated by fill_value_tokens() */
+
+static const short dct_value_cost[2048*2] =
+{
+    8285, 8277, 8267, 8259, 8253, 8245, 8226, 8218, 8212, 8204, 8194, 8186,
+    8180, 8172, 8150, 8142, 8136, 8128, 8118, 8110, 8104, 8096, 8077, 8069,
+    8063, 8055, 8045, 8037, 8031, 8023, 7997, 7989, 7983, 7975, 7965, 7957,
+    7951, 7943, 7924, 7916, 7910, 7902, 7892, 7884, 7878, 7870, 7848, 7840,
+    7834, 7826, 7816, 7808, 7802, 7794, 7775, 7767, 7761, 7753, 7743, 7735,
+    7729, 7721, 7923, 7915, 7909, 7901, 7891, 7883, 7877, 7869, 7850, 7842,
+    7836, 7828, 7818, 7810, 7804, 7796, 7774, 7766, 7760, 7752, 7742, 7734,
+    7728, 7720, 7701, 7693, 7687, 7679, 7669, 7661, 7655, 7647, 7621, 7613,
+    7607, 7599, 7589, 7581, 7575, 7567, 7548, 7540, 7534, 7526, 7516, 7508,
+    7502, 7494, 7472, 7464, 7458, 7450, 7440, 7432, 7426, 7418, 7399, 7391,
+    7385, 7377, 7367, 7359, 7353, 7345, 7479, 7471, 7465, 7457, 7447, 7439,
+    7433, 7425, 7406, 7398, 7392, 7384, 7374, 7366, 7360, 7352, 7330, 7322,
+    7316, 7308, 7298, 7290, 7284, 7276, 7257, 7249, 7243, 7235, 7225, 7217,
+    7211, 7203, 7177, 7169, 7163, 7155, 7145, 7137, 7131, 7123, 7104, 7096,
+    7090, 7082, 7072, 7064, 7058, 7050, 7028, 7020, 7014, 7006, 6996, 6988,
+    6982, 6974, 6955, 6947, 6941, 6933, 6923, 6915, 6909, 6901, 7632, 7624,
+    7618, 7610, 7600, 7592, 7586, 7578, 7559, 7551, 7545, 7537, 7527, 7519,
+    7513, 7505, 7483, 7475, 7469, 7461, 7451, 7443, 7437, 7429, 7410, 7402,
+    7396, 7388, 7378, 7370, 7364, 7356, 7330, 7322, 7316, 7308, 7298, 7290,
+    7284, 7276, 7257, 7249, 7243, 7235, 7225, 7217, 7211, 7203, 7181, 7173,
+    7167, 7159, 7149, 7141, 7135, 7127, 7108, 7100, 7094, 7086, 7076, 7068,
+    7062, 7054, 7188, 7180, 7174, 7166, 7156, 7148, 7142, 7134, 7115, 7107,
+    7101, 7093, 7083, 7075, 7069, 7061, 7039, 7031, 7025, 7017, 7007, 6999,
+    6993, 6985, 6966, 6958, 6952, 6944, 6934, 6926, 6920, 6912, 6886, 6878,
+    6872, 6864, 6854, 6846, 6840, 6832, 6813, 6805, 6799, 6791, 6781, 6773,
+    6767, 6759, 6737, 6729, 6723, 6715, 6705, 6697, 6691, 6683, 6664, 6656,
+    6650, 6642, 6632, 6624, 6618, 6610, 6812, 6804, 6798, 6790, 6780, 6772,
+    6766, 6758, 6739, 6731, 6725, 6717, 6707, 6699, 6693, 6685, 6663, 6655,
+    6649, 6641, 6631, 6623, 6617, 6609, 6590, 6582, 6576, 6568, 6558, 6550,
+    6544, 6536, 6510, 6502, 6496, 6488, 6478, 6470, 6464, 6456, 6437, 6429,
+    6423, 6415, 6405, 6397, 6391, 6383, 6361, 6353, 6347, 6339, 6329, 6321,
+    6315, 6307, 6288, 6280, 6274, 6266, 6256, 6248, 6242, 6234, 6368, 6360,
+    6354, 6346, 6336, 6328, 6322, 6314, 6295, 6287, 6281, 6273, 6263, 6255,
+    6249, 6241, 6219, 6211, 6205, 6197, 6187, 6179, 6173, 6165, 6146, 6138,
+    6132, 6124, 6114, 6106, 6100, 6092, 6066, 6058, 6052, 6044, 6034, 6026,
+    6020, 6012, 5993, 5985, 5979, 5971, 5961, 5953, 5947, 5939, 5917, 5909,
+    5903, 5895, 5885, 5877, 5871, 5863, 5844, 5836, 5830, 5822, 5812, 5804,
+    5798, 5790, 6697, 6689, 6683, 6675, 6665, 6657, 6651, 6643, 6624, 6616,
+    6610, 6602, 6592, 6584, 6578, 6570, 6548, 6540, 6534, 6526, 6516, 6508,
+    6502, 6494, 6475, 6467, 6461, 6453, 6443, 6435, 6429, 6421, 6395, 6387,
+    6381, 6373, 6363, 6355, 6349, 6341, 6322, 6314, 6308, 6300, 6290, 6282,
+    6276, 6268, 6246, 6238, 6232, 6224, 6214, 6206, 6200, 6192, 6173, 6165,
+    6159, 6151, 6141, 6133, 6127, 6119, 6253, 6245, 6239, 6231, 6221, 6213,
+    6207, 6199, 6180, 6172, 6166, 6158, 6148, 6140, 6134, 6126, 6104, 6096,
+    6090, 6082, 6072, 6064, 6058, 6050, 6031, 6023, 6017, 6009, 5999, 5991,
+    5985, 5977, 5951, 5943, 5937, 5929, 5919, 5911, 5905, 5897, 5878, 5870,
+    5864, 5856, 5846, 5838, 5832, 5824, 5802, 5794, 5788, 5780, 5770, 5762,
+    5756, 5748, 5729, 5721, 5715, 5707, 5697, 5689, 5683, 5675, 5877, 5869,
+    5863, 5855, 5845, 5837, 5831, 5823, 5804, 5796, 5790, 5782, 5772, 5764,
+    5758, 5750, 5728, 5720, 5714, 5706, 5696, 5688, 5682, 5674, 5655, 5647,
+    5641, 5633, 5623, 5615, 5609, 5601, 5575, 5567, 5561, 5553, 5543, 5535,
+    5529, 5521, 5502, 5494, 5488, 5480, 5470, 5462, 5456, 5448, 5426, 5418,
+    5412, 5404, 5394, 5386, 5380, 5372, 5353, 5345, 5339, 5331, 5321, 5313,
+    5307, 5299, 5433, 5425, 5419, 5411, 5401, 5393, 5387, 5379, 5360, 5352,
+    5346, 5338, 5328, 5320, 5314, 5306, 5284, 5276, 5270, 5262, 5252, 5244,
+    5238, 5230, 5211, 5203, 5197, 5189, 5179, 5171, 5165, 5157, 5131, 5123,
+    5117, 5109, 5099, 5091, 5085, 5077, 5058, 5050, 5044, 5036, 5026, 5018,
+    5012, 5004, 4982, 4974, 4968, 4960, 4950, 4942, 4936, 4928, 4909, 4901,
+    4895, 4887, 4877, 4869, 4863, 4855, 5586, 5578, 5572, 5564, 5554, 5546,
+    5540, 5532, 5513, 5505, 5499, 5491, 5481, 5473, 5467, 5459, 5437, 5429,
+    5423, 5415, 5405, 5397, 5391, 5383, 5364, 5356, 5350, 5342, 5332, 5324,
+    5318, 5310, 5284, 5276, 5270, 5262, 5252, 5244, 5238, 5230, 5211, 5203,
+    5197, 5189, 5179, 5171, 5165, 5157, 5135, 5127, 5121, 5113, 5103, 5095,
+    5089, 5081, 5062, 5054, 5048, 5040, 5030, 5022, 5016, 5008, 5142, 5134,
+    5128, 5120, 5110, 5102, 5096, 5088, 5069, 5061, 5055, 5047, 5037, 5029,
+    5023, 5015, 4993, 4985, 4979, 4971, 4961, 4953, 4947, 4939, 4920, 4912,
+    4906, 4898, 4888, 4880, 4874, 4866, 4840, 4832, 4826, 4818, 4808, 4800,
+    4794, 4786, 4767, 4759, 4753, 4745, 4735, 4727, 4721, 4713, 4691, 4683,
+    4677, 4669, 4659, 4651, 4645, 4637, 4618, 4610, 4604, 4596, 4586, 4578,
+    4572, 4564, 4766, 4758, 4752, 4744, 4734, 4726, 4720, 4712, 4693, 4685,
+    4679, 4671, 4661, 4653, 4647, 4639, 4617, 4609, 4603, 4595, 4585, 4577,
+    4571, 4563, 4544, 4536, 4530, 4522, 4512, 4504, 4498, 4490, 4464, 4456,
+    4450, 4442, 4432, 4424, 4418, 4410, 4391, 4383, 4377, 4369, 4359, 4351,
+    4345, 4337, 4315, 4307, 4301, 4293, 4283, 4275, 4269, 4261, 4242, 4234,
+    4228, 4220, 4210, 4202, 4196, 4188, 4322, 4314, 4308, 4300, 4290, 4282,
+    4276, 4268, 4249, 4241, 4235, 4227, 4217, 4209, 4203, 4195, 4173, 4165,
+    4159, 4151, 4141, 4133, 4127, 4119, 4100, 4092, 4086, 4078, 4068, 4060,
+    4054, 4046, 4020, 4012, 4006, 3998, 3988, 3980, 3974, 3966, 3947, 3939,
+    3933, 3925, 3915, 3907, 3901, 3893, 3871, 3863, 3857, 3849, 3839, 3831,
+    3825, 3817, 3798, 3790, 3784, 3776, 3766, 3758, 3752, 3744, 6697, 6689,
+    6683, 6675, 6665, 6657, 6651, 6643, 6624, 6616, 6610, 6602, 6592, 6584,
+    6578, 6570, 6548, 6540, 6534, 6526, 6516, 6508, 6502, 6494, 6475, 6467,
+    6461, 6453, 6443, 6435, 6429, 6421, 6395, 6387, 6381, 6373, 6363, 6355,
+    6349, 6341, 6322, 6314, 6308, 6300, 6290, 6282, 6276, 6268, 6246, 6238,
+    6232, 6224, 6214, 6206, 6200, 6192, 6173, 6165, 6159, 6151, 6141, 6133,
+    6127, 6119, 6253, 6245, 6239, 6231, 6221, 6213, 6207, 6199, 6180, 6172,
+    6166, 6158, 6148, 6140, 6134, 6126, 6104, 6096, 6090, 6082, 6072, 6064,
+    6058, 6050, 6031, 6023, 6017, 6009, 5999, 5991, 5985, 5977, 5951, 5943,
+    5937, 5929, 5919, 5911, 5905, 5897, 5878, 5870, 5864, 5856, 5846, 5838,
+    5832, 5824, 5802, 5794, 5788, 5780, 5770, 5762, 5756, 5748, 5729, 5721,
+    5715, 5707, 5697, 5689, 5683, 5675, 5877, 5869, 5863, 5855, 5845, 5837,
+    5831, 5823, 5804, 5796, 5790, 5782, 5772, 5764, 5758, 5750, 5728, 5720,
+    5714, 5706, 5696, 5688, 5682, 5674, 5655, 5647, 5641, 5633, 5623, 5615,
+    5609, 5601, 5575, 5567, 5561, 5553, 5543, 5535, 5529, 5521, 5502, 5494,
+    5488, 5480, 5470, 5462, 5456, 5448, 5426, 5418, 5412, 5404, 5394, 5386,
+    5380, 5372, 5353, 5345, 5339, 5331, 5321, 5313, 5307, 5299, 5433, 5425,
+    5419, 5411, 5401, 5393, 5387, 5379, 5360, 5352, 5346, 5338, 5328, 5320,
+    5314, 5306, 5284, 5276, 5270, 5262, 5252, 5244, 5238, 5230, 5211, 5203,
+    5197, 5189, 5179, 5171, 5165, 5157, 5131, 5123, 5117, 5109, 5099, 5091,
+    5085, 5077, 5058, 5050, 5044, 5036, 5026, 5018, 5012, 5004, 4982, 4974,
+    4968, 4960, 4950, 4942, 4936, 4928, 4909, 4901, 4895, 4887, 4877, 4869,
+    4863, 4855, 5586, 5578, 5572, 5564, 5554, 5546, 5540, 5532, 5513, 5505,
+    5499, 5491, 5481, 5473, 5467, 5459, 5437, 5429, 5423, 5415, 5405, 5397,
+    5391, 5383, 5364, 5356, 5350, 5342, 5332, 5324, 5318, 5310, 5284, 5276,
+    5270, 5262, 5252, 5244, 5238, 5230, 5211, 5203, 5197, 5189, 5179, 5171,
+    5165, 5157, 5135, 5127, 5121, 5113, 5103, 5095, 5089, 5081, 5062, 5054,
+    5048, 5040, 5030, 5022, 5016, 5008, 5142, 5134, 5128, 5120, 5110, 5102,
+    5096, 5088, 5069, 5061, 5055, 5047, 5037, 5029, 5023, 5015, 4993, 4985,
+    4979, 4971, 4961, 4953, 4947, 4939, 4920, 4912, 4906, 4898, 4888, 4880,
+    4874, 4866, 4840, 4832, 4826, 4818, 4808, 4800, 4794, 4786, 4767, 4759,
+    4753, 4745, 4735, 4727, 4721, 4713, 4691, 4683, 4677, 4669, 4659, 4651,
+    4645, 4637, 4618, 4610, 4604, 4596, 4586, 4578, 4572, 4564, 4766, 4758,
+    4752, 4744, 4734, 4726, 4720, 4712, 4693, 4685, 4679, 4671, 4661, 4653,
+    4647, 4639, 4617, 4609, 4603, 4595, 4585, 4577, 4571, 4563, 4544, 4536,
+    4530, 4522, 4512, 4504, 4498, 4490, 4464, 4456, 4450, 4442, 4432, 4424,
+    4418, 4410, 4391, 4383, 4377, 4369, 4359, 4351, 4345, 4337, 4315, 4307,
+    4301, 4293, 4283, 4275, 4269, 4261, 4242, 4234, 4228, 4220, 4210, 4202,
+    4196, 4188, 4322, 4314, 4308, 4300, 4290, 4282, 4276, 4268, 4249, 4241,
+    4235, 4227, 4217, 4209, 4203, 4195, 4173, 4165, 4159, 4151, 4141, 4133,
+    4127, 4119, 4100, 4092, 4086, 4078, 4068, 4060, 4054, 4046, 4020, 4012,
+    4006, 3998, 3988, 3980, 3974, 3966, 3947, 3939, 3933, 3925, 3915, 3907,
+    3901, 3893, 3871, 3863, 3857, 3849, 3839, 3831, 3825, 3817, 3798, 3790,
+    3784, 3776, 3766, 3758, 3752, 3744, 4651, 4643, 4637, 4629, 4619, 4611,
+    4605, 4597, 4578, 4570, 4564, 4556, 4546, 4538, 4532, 4524, 4502, 4494,
+    4488, 4480, 4470, 4462, 4456, 4448, 4429, 4421, 4415, 4407, 4397, 4389,
+    4383, 4375, 4349, 4341, 4335, 4327, 4317, 4309, 4303, 4295, 4276, 4268,
+    4262, 4254, 4244, 4236, 4230, 4222, 4200, 4192, 4186, 4178, 4168, 4160,
+    4154, 4146, 4127, 4119, 4113, 4105, 4095, 4087, 4081, 4073, 4207, 4199,
+    4193, 4185, 4175, 4167, 4161, 4153, 4134, 4126, 4120, 4112, 4102, 4094,
+    4088, 4080, 4058, 4050, 4044, 4036, 4026, 4018, 4012, 4004, 3985, 3977,
+    3971, 3963, 3953, 3945, 3939, 3931, 3905, 3897, 3891, 3883, 3873, 3865,
+    3859, 3851, 3832, 3824, 3818, 3810, 3800, 3792, 3786, 3778, 3756, 3748,
+    3742, 3734, 3724, 3716, 3710, 3702, 3683, 3675, 3669, 3661, 3651, 3643,
+    3637, 3629, 3831, 3823, 3817, 3809, 3799, 3791, 3785, 3777, 3758, 3750,
+    3744, 3736, 3726, 3718, 3712, 3704, 3682, 3674, 3668, 3660, 3650, 3642,
+    3636, 3628, 3609, 3601, 3595, 3587, 3577, 3569, 3563, 3555, 3529, 3521,
+    3515, 3507, 3497, 3489, 3483, 3475, 3456, 3448, 3442, 3434, 3424, 3416,
+    3410, 3402, 3380, 3372, 3366, 3358, 3348, 3340, 3334, 3326, 3307, 3299,
+    3293, 3285, 3275, 3267, 3261, 3253, 3387, 3379, 3373, 3365, 3355, 3347,
+    3341, 3333, 3314, 3306, 3300, 3292, 3282, 3274, 3268, 3260, 3238, 3230,
+    3224, 3216, 3206, 3198, 3192, 3184, 3165, 3157, 3151, 3143, 3133, 3125,
+    3119, 3111, 3085, 3077, 3071, 3063, 3053, 3045, 3039, 3031, 3012, 3004,
+    2998, 2990, 2980, 2972, 2966, 2958, 2936, 2928, 2922, 2914, 2904, 2896,
+    2890, 2882, 2863, 2855, 2849, 2841, 2831, 2823, 2817, 2809, 3540, 3532,
+    3526, 3518, 3508, 3500, 3494, 3486, 3467, 3459, 3453, 3445, 3435, 3427,
+    3421, 3413, 3391, 3383, 3377, 3369, 3359, 3351, 3345, 3337, 3318, 3310,
+    3304, 3296, 3286, 3278, 3272, 3264, 3238, 3230, 3224, 3216, 3206, 3198,
+    3192, 3184, 3165, 3157, 3151, 3143, 3133, 3125, 3119, 3111, 3089, 3081,
+    3075, 3067, 3057, 3049, 3043, 3035, 3016, 3008, 3002, 2994, 2984, 2976,
+    2970, 2962, 3096, 3088, 3082, 3074, 3064, 3056, 3050, 3042, 3023, 3015,
+    3009, 3001, 2991, 2983, 2977, 2969, 2947, 2939, 2933, 2925, 2915, 2907,
+    2901, 2893, 2874, 2866, 2860, 2852, 2842, 2834, 2828, 2820, 2794, 2786,
+    2780, 2772, 2762, 2754, 2748, 2740, 2721, 2713, 2707, 2699, 2689, 2681,
+    2675, 2667, 2645, 2637, 2631, 2623, 2613, 2605, 2599, 2591, 2572, 2564,
+    2558, 2550, 2540, 2532, 2526, 2518, 2720, 2712, 2706, 2698, 2688, 2680,
+    2674, 2666, 2647, 2639, 2633, 2625, 2615, 2607, 2601, 2593, 2571, 2563,
+    2557, 2549, 2539, 2531, 2525, 2517, 2498, 2490, 2484, 2476, 2466, 2458,
+    2452, 2444, 2418, 2410, 2404, 2396, 2386, 2378, 2372, 2364, 2345, 2337,
+    2331, 2323, 2313, 2305, 2299, 2291, 2269, 2261, 2255, 2247, 2237, 2229,
+    2223, 2215, 2196, 2188, 2182, 2174, 2164, 2156, 2150, 2142, 2276, 2268,
+    2262, 2254, 2244, 2236, 2230, 2222, 2203, 2195, 2189, 2181, 2171, 2163,
+    2157, 2149, 2127, 2119, 2113, 2105, 2095, 2087, 2081, 2073, 2054, 2046,
+    2040, 2032, 2022, 2014, 2008, 2000, 1974, 1966, 1960, 1952, 1942, 1934,
+    1928, 1920, 1901, 1893, 1887, 1879, 1869, 1861, 1855, 1847, 1825, 1817,
+    1811, 1803, 1793, 1785, 1779, 1771, 1752, 1744, 1738, 1730, 1720, 1712,
+    1706, 1698, 1897, 1883, 1860, 1846, 1819, 1805, 1782, 1768, 1723, 1709,
+    1686, 1672, 1645, 1631, 1608, 1594, 1574, 1560, 1537, 1523, 1496, 1482,
+    1459, 1445, 1400, 1386, 1363, 1349, 1322, 1308, 1285, 1271, 1608, 1565,
+    1535, 1492, 1446, 1403, 1373, 1330, 1312, 1269, 1239, 1196, 1150, 1107,
+    1077, 1034, 1291, 1218, 1171, 1098, 1015, 942, 895, 822, 953, 850,
+    729, 626, 618, 431, 257, 257, 257, 257, 0, 255, 255, 255,
+    255, 429, 616, 624, 727, 848, 951, 820, 893, 940, 1013, 1096,
+    1169, 1216, 1289, 1032, 1075, 1105, 1148, 1194, 1237, 1267, 1310, 1328,
+    1371, 1401, 1444, 1490, 1533, 1563, 1606, 1269, 1283, 1306, 1320, 1347,
+    1361, 1384, 1398, 1443, 1457, 1480, 1494, 1521, 1535, 1558, 1572, 1592,
+    1606, 1629, 1643, 1670, 1684, 1707, 1721, 1766, 1780, 1803, 1817, 1844,
+    1858, 1881, 1895, 1696, 1704, 1710, 1718, 1728, 1736, 1742, 1750, 1769,
+    1777, 1783, 1791, 1801, 1809, 1815, 1823, 1845, 1853, 1859, 1867, 1877,
+    1885, 1891, 1899, 1918, 1926, 1932, 1940, 1950, 1958, 1964, 1972, 1998,
+    2006, 2012, 2020, 2030, 2038, 2044, 2052, 2071, 2079, 2085, 2093, 2103,
+    2111, 2117, 2125, 2147, 2155, 2161, 2169, 2179, 2187, 2193, 2201, 2220,
+    2228, 2234, 2242, 2252, 2260, 2266, 2274, 2140, 2148, 2154, 2162, 2172,
+    2180, 2186, 2194, 2213, 2221, 2227, 2235, 2245, 2253, 2259, 2267, 2289,
+    2297, 2303, 2311, 2321, 2329, 2335, 2343, 2362, 2370, 2376, 2384, 2394,
+    2402, 2408, 2416, 2442, 2450, 2456, 2464, 2474, 2482, 2488, 2496, 2515,
+    2523, 2529, 2537, 2547, 2555, 2561, 2569, 2591, 2599, 2605, 2613, 2623,
+    2631, 2637, 2645, 2664, 2672, 2678, 2686, 2696, 2704, 2710, 2718, 2516,
+    2524, 2530, 2538, 2548, 2556, 2562, 2570, 2589, 2597, 2603, 2611, 2621,
+    2629, 2635, 2643, 2665, 2673, 2679, 2687, 2697, 2705, 2711, 2719, 2738,
+    2746, 2752, 2760, 2770, 2778, 2784, 2792, 2818, 2826, 2832, 2840, 2850,
+    2858, 2864, 2872, 2891, 2899, 2905, 2913, 2923, 2931, 2937, 2945, 2967,
+    2975, 2981, 2989, 2999, 3007, 3013, 3021, 3040, 3048, 3054, 3062, 3072,
+    3080, 3086, 3094, 2960, 2968, 2974, 2982, 2992, 3000, 3006, 3014, 3033,
+    3041, 3047, 3055, 3065, 3073, 3079, 3087, 3109, 3117, 3123, 3131, 3141,
+    3149, 3155, 3163, 3182, 3190, 3196, 3204, 3214, 3222, 3228, 3236, 3262,
+    3270, 3276, 3284, 3294, 3302, 3308, 3316, 3335, 3343, 3349, 3357, 3367,
+    3375, 3381, 3389, 3411, 3419, 3425, 3433, 3443, 3451, 3457, 3465, 3484,
+    3492, 3498, 3506, 3516, 3524, 3530, 3538, 2807, 2815, 2821, 2829, 2839,
+    2847, 2853, 2861, 2880, 2888, 2894, 2902, 2912, 2920, 2926, 2934, 2956,
+    2964, 2970, 2978, 2988, 2996, 3002, 3010, 3029, 3037, 3043, 3051, 3061,
+    3069, 3075, 3083, 3109, 3117, 3123, 3131, 3141, 3149, 3155, 3163, 3182,
+    3190, 3196, 3204, 3214, 3222, 3228, 3236, 3258, 3266, 3272, 3280, 3290,
+    3298, 3304, 3312, 3331, 3339, 3345, 3353, 3363, 3371, 3377, 3385, 3251,
+    3259, 3265, 3273, 3283, 3291, 3297, 3305, 3324, 3332, 3338, 3346, 3356,
+    3364, 3370, 3378, 3400, 3408, 3414, 3422, 3432, 3440, 3446, 3454, 3473,
+    3481, 3487, 3495, 3505, 3513, 3519, 3527, 3553, 3561, 3567, 3575, 3585,
+    3593, 3599, 3607, 3626, 3634, 3640, 3648, 3658, 3666, 3672, 3680, 3702,
+    3710, 3716, 3724, 3734, 3742, 3748, 3756, 3775, 3783, 3789, 3797, 3807,
+    3815, 3821, 3829, 3627, 3635, 3641, 3649, 3659, 3667, 3673, 3681, 3700,
+    3708, 3714, 3722, 3732, 3740, 3746, 3754, 3776, 3784, 3790, 3798, 3808,
+    3816, 3822, 3830, 3849, 3857, 3863, 3871, 3881, 3889, 3895, 3903, 3929,
+    3937, 3943, 3951, 3961, 3969, 3975, 3983, 4002, 4010, 4016, 4024, 4034,
+    4042, 4048, 4056, 4078, 4086, 4092, 4100, 4110, 4118, 4124, 4132, 4151,
+    4159, 4165, 4173, 4183, 4191, 4197, 4205, 4071, 4079, 4085, 4093, 4103,
+    4111, 4117, 4125, 4144, 4152, 4158, 4166, 4176, 4184, 4190, 4198, 4220,
+    4228, 4234, 4242, 4252, 4260, 4266, 4274, 4293, 4301, 4307, 4315, 4325,
+    4333, 4339, 4347, 4373, 4381, 4387, 4395, 4405, 4413, 4419, 4427, 4446,
+    4454, 4460, 4468, 4478, 4486, 4492, 4500, 4522, 4530, 4536, 4544, 4554,
+    4562, 4568, 4576, 4595, 4603, 4609, 4617, 4627, 4635, 4641, 4649, 3742,
+    3750, 3756, 3764, 3774, 3782, 3788, 3796, 3815, 3823, 3829, 3837, 3847,
+    3855, 3861, 3869, 3891, 3899, 3905, 3913, 3923, 3931, 3937, 3945, 3964,
+    3972, 3978, 3986, 3996, 4004, 4010, 4018, 4044, 4052, 4058, 4066, 4076,
+    4084, 4090, 4098, 4117, 4125, 4131, 4139, 4149, 4157, 4163, 4171, 4193,
+    4201, 4207, 4215, 4225, 4233, 4239, 4247, 4266, 4274, 4280, 4288, 4298,
+    4306, 4312, 4320, 4186, 4194, 4200, 4208, 4218, 4226, 4232, 4240, 4259,
+    4267, 4273, 4281, 4291, 4299, 4305, 4313, 4335, 4343, 4349, 4357, 4367,
+    4375, 4381, 4389, 4408, 4416, 4422, 4430, 4440, 4448, 4454, 4462, 4488,
+    4496, 4502, 4510, 4520, 4528, 4534, 4542, 4561, 4569, 4575, 4583, 4593,
+    4601, 4607, 4615, 4637, 4645, 4651, 4659, 4669, 4677, 4683, 4691, 4710,
+    4718, 4724, 4732, 4742, 4750, 4756, 4764, 4562, 4570, 4576, 4584, 4594,
+    4602, 4608, 4616, 4635, 4643, 4649, 4657, 4667, 4675, 4681, 4689, 4711,
+    4719, 4725, 4733, 4743, 4751, 4757, 4765, 4784, 4792, 4798, 4806, 4816,
+    4824, 4830, 4838, 4864, 4872, 4878, 4886, 4896, 4904, 4910, 4918, 4937,
+    4945, 4951, 4959, 4969, 4977, 4983, 4991, 5013, 5021, 5027, 5035, 5045,
+    5053, 5059, 5067, 5086, 5094, 5100, 5108, 5118, 5126, 5132, 5140, 5006,
+    5014, 5020, 5028, 5038, 5046, 5052, 5060, 5079, 5087, 5093, 5101, 5111,
+    5119, 5125, 5133, 5155, 5163, 5169, 5177, 5187, 5195, 5201, 5209, 5228,
+    5236, 5242, 5250, 5260, 5268, 5274, 5282, 5308, 5316, 5322, 5330, 5340,
+    5348, 5354, 5362, 5381, 5389, 5395, 5403, 5413, 5421, 5427, 5435, 5457,
+    5465, 5471, 5479, 5489, 5497, 5503, 5511, 5530, 5538, 5544, 5552, 5562,
+    5570, 5576, 5584, 4853, 4861, 4867, 4875, 4885, 4893, 4899, 4907, 4926,
+    4934, 4940, 4948, 4958, 4966, 4972, 4980, 5002, 5010, 5016, 5024, 5034,
+    5042, 5048, 5056, 5075, 5083, 5089, 5097, 5107, 5115, 5121, 5129, 5155,
+    5163, 5169, 5177, 5187, 5195, 5201, 5209, 5228, 5236, 5242, 5250, 5260,
+    5268, 5274, 5282, 5304, 5312, 5318, 5326, 5336, 5344, 5350, 5358, 5377,
+    5385, 5391, 5399, 5409, 5417, 5423, 5431, 5297, 5305, 5311, 5319, 5329,
+    5337, 5343, 5351, 5370, 5378, 5384, 5392, 5402, 5410, 5416, 5424, 5446,
+    5454, 5460, 5468, 5478, 5486, 5492, 5500, 5519, 5527, 5533, 5541, 5551,
+    5559, 5565, 5573, 5599, 5607, 5613, 5621, 5631, 5639, 5645, 5653, 5672,
+    5680, 5686, 5694, 5704, 5712, 5718, 5726, 5748, 5756, 5762, 5770, 5780,
+    5788, 5794, 5802, 5821, 5829, 5835, 5843, 5853, 5861, 5867, 5875, 5673,
+    5681, 5687, 5695, 5705, 5713, 5719, 5727, 5746, 5754, 5760, 5768, 5778,
+    5786, 5792, 5800, 5822, 5830, 5836, 5844, 5854, 5862, 5868, 5876, 5895,
+    5903, 5909, 5917, 5927, 5935, 5941, 5949, 5975, 5983, 5989, 5997, 6007,
+    6015, 6021, 6029, 6048, 6056, 6062, 6070, 6080, 6088, 6094, 6102, 6124,
+    6132, 6138, 6146, 6156, 6164, 6170, 6178, 6197, 6205, 6211, 6219, 6229,
+    6237, 6243, 6251, 6117, 6125, 6131, 6139, 6149, 6157, 6163, 6171, 6190,
+    6198, 6204, 6212, 6222, 6230, 6236, 6244, 6266, 6274, 6280, 6288, 6298,
+    6306, 6312, 6320, 6339, 6347, 6353, 6361, 6371, 6379, 6385, 6393, 6419,
+    6427, 6433, 6441, 6451, 6459, 6465, 6473, 6492, 6500, 6506, 6514, 6524,
+    6532, 6538, 6546, 6568, 6576, 6582, 6590, 6600, 6608, 6614, 6622, 6641,
+    6649, 6655, 6663, 6673, 6681, 6687, 6695, 3742, 3750, 3756, 3764, 3774,
+    3782, 3788, 3796, 3815, 3823, 3829, 3837, 3847, 3855, 3861, 3869, 3891,
+    3899, 3905, 3913, 3923, 3931, 3937, 3945, 3964, 3972, 3978, 3986, 3996,
+    4004, 4010, 4018, 4044, 4052, 4058, 4066, 4076, 4084, 4090, 4098, 4117,
+    4125, 4131, 4139, 4149, 4157, 4163, 4171, 4193, 4201, 4207, 4215, 4225,
+    4233, 4239, 4247, 4266, 4274, 4280, 4288, 4298, 4306, 4312, 4320, 4186,
+    4194, 4200, 4208, 4218, 4226, 4232, 4240, 4259, 4267, 4273, 4281, 4291,
+    4299, 4305, 4313, 4335, 4343, 4349, 4357, 4367, 4375, 4381, 4389, 4408,
+    4416, 4422, 4430, 4440, 4448, 4454, 4462, 4488, 4496, 4502, 4510, 4520,
+    4528, 4534, 4542, 4561, 4569, 4575, 4583, 4593, 4601, 4607, 4615, 4637,
+    4645, 4651, 4659, 4669, 4677, 4683, 4691, 4710, 4718, 4724, 4732, 4742,
+    4750, 4756, 4764, 4562, 4570, 4576, 4584, 4594, 4602, 4608, 4616, 4635,
+    4643, 4649, 4657, 4667, 4675, 4681, 4689, 4711, 4719, 4725, 4733, 4743,
+    4751, 4757, 4765, 4784, 4792, 4798, 4806, 4816, 4824, 4830, 4838, 4864,
+    4872, 4878, 4886, 4896, 4904, 4910, 4918, 4937, 4945, 4951, 4959, 4969,
+    4977, 4983, 4991, 5013, 5021, 5027, 5035, 5045, 5053, 5059, 5067, 5086,
+    5094, 5100, 5108, 5118, 5126, 5132, 5140, 5006, 5014, 5020, 5028, 5038,
+    5046, 5052, 5060, 5079, 5087, 5093, 5101, 5111, 5119, 5125, 5133, 5155,
+    5163, 5169, 5177, 5187, 5195, 5201, 5209, 5228, 5236, 5242, 5250, 5260,
+    5268, 5274, 5282, 5308, 5316, 5322, 5330, 5340, 5348, 5354, 5362, 5381,
+    5389, 5395, 5403, 5413, 5421, 5427, 5435, 5457, 5465, 5471, 5479, 5489,
+    5497, 5503, 5511, 5530, 5538, 5544, 5552, 5562, 5570, 5576, 5584, 4853,
+    4861, 4867, 4875, 4885, 4893, 4899, 4907, 4926, 4934, 4940, 4948, 4958,
+    4966, 4972, 4980, 5002, 5010, 5016, 5024, 5034, 5042, 5048, 5056, 5075,
+    5083, 5089, 5097, 5107, 5115, 5121, 5129, 5155, 5163, 5169, 5177, 5187,
+    5195, 5201, 5209, 5228, 5236, 5242, 5250, 5260, 5268, 5274, 5282, 5304,
+    5312, 5318, 5326, 5336, 5344, 5350, 5358, 5377, 5385, 5391, 5399, 5409,
+    5417, 5423, 5431, 5297, 5305, 5311, 5319, 5329, 5337, 5343, 5351, 5370,
+    5378, 5384, 5392, 5402, 5410, 5416, 5424, 5446, 5454, 5460, 5468, 5478,
+    5486, 5492, 5500, 5519, 5527, 5533, 5541, 5551, 5559, 5565, 5573, 5599,
+    5607, 5613, 5621, 5631, 5639, 5645, 5653, 5672, 5680, 5686, 5694, 5704,
+    5712, 5718, 5726, 5748, 5756, 5762, 5770, 5780, 5788, 5794, 5802, 5821,
+    5829, 5835, 5843, 5853, 5861, 5867, 5875, 5673, 5681, 5687, 5695, 5705,
+    5713, 5719, 5727, 5746, 5754, 5760, 5768, 5778, 5786, 5792, 5800, 5822,
+    5830, 5836, 5844, 5854, 5862, 5868, 5876, 5895, 5903, 5909, 5917, 5927,
+    5935, 5941, 5949, 5975, 5983, 5989, 5997, 6007, 6015, 6021, 6029, 6048,
+    6056, 6062, 6070, 6080, 6088, 6094, 6102, 6124, 6132, 6138, 6146, 6156,
+    6164, 6170, 6178, 6197, 6205, 6211, 6219, 6229, 6237, 6243, 6251, 6117,
+    6125, 6131, 6139, 6149, 6157, 6163, 6171, 6190, 6198, 6204, 6212, 6222,
+    6230, 6236, 6244, 6266, 6274, 6280, 6288, 6298, 6306, 6312, 6320, 6339,
+    6347, 6353, 6361, 6371, 6379, 6385, 6393, 6419, 6427, 6433, 6441, 6451,
+    6459, 6465, 6473, 6492, 6500, 6506, 6514, 6524, 6532, 6538, 6546, 6568,
+    6576, 6582, 6590, 6600, 6608, 6614, 6622, 6641, 6649, 6655, 6663, 6673,
+    6681, 6687, 6695, 5788, 5796, 5802, 5810, 5820, 5828, 5834, 5842, 5861,
+    5869, 5875, 5883, 5893, 5901, 5907, 5915, 5937, 5945, 5951, 5959, 5969,
+    5977, 5983, 5991, 6010, 6018, 6024, 6032, 6042, 6050, 6056, 6064, 6090,
+    6098, 6104, 6112, 6122, 6130, 6136, 6144, 6163, 6171, 6177, 6185, 6195,
+    6203, 6209, 6217, 6239, 6247, 6253, 6261, 6271, 6279, 6285, 6293, 6312,
+    6320, 6326, 6334, 6344, 6352, 6358, 6366, 6232, 6240, 6246, 6254, 6264,
+    6272, 6278, 6286, 6305, 6313, 6319, 6327, 6337, 6345, 6351, 6359, 6381,
+    6389, 6395, 6403, 6413, 6421, 6427, 6435, 6454, 6462, 6468, 6476, 6486,
+    6494, 6500, 6508, 6534, 6542, 6548, 6556, 6566, 6574, 6580, 6588, 6607,
+    6615, 6621, 6629, 6639, 6647, 6653, 6661, 6683, 6691, 6697, 6705, 6715,
+    6723, 6729, 6737, 6756, 6764, 6770, 6778, 6788, 6796, 6802, 6810, 6608,
+    6616, 6622, 6630, 6640, 6648, 6654, 6662, 6681, 6689, 6695, 6703, 6713,
+    6721, 6727, 6735, 6757, 6765, 6771, 6779, 6789, 6797, 6803, 6811, 6830,
+    6838, 6844, 6852, 6862, 6870, 6876, 6884, 6910, 6918, 6924, 6932, 6942,
+    6950, 6956, 6964, 6983, 6991, 6997, 7005, 7015, 7023, 7029, 7037, 7059,
+    7067, 7073, 7081, 7091, 7099, 7105, 7113, 7132, 7140, 7146, 7154, 7164,
+    7172, 7178, 7186, 7052, 7060, 7066, 7074, 7084, 7092, 7098, 7106, 7125,
+    7133, 7139, 7147, 7157, 7165, 7171, 7179, 7201, 7209, 7215, 7223, 7233,
+    7241, 7247, 7255, 7274, 7282, 7288, 7296, 7306, 7314, 7320, 7328, 7354,
+    7362, 7368, 7376, 7386, 7394, 7400, 7408, 7427, 7435, 7441, 7449, 7459,
+    7467, 7473, 7481, 7503, 7511, 7517, 7525, 7535, 7543, 7549, 7557, 7576,
+    7584, 7590, 7598, 7608, 7616, 7622, 7630, 6899, 6907, 6913, 6921, 6931,
+    6939, 6945, 6953, 6972, 6980, 6986, 6994, 7004, 7012, 7018, 7026, 7048,
+    7056, 7062, 7070, 7080, 7088, 7094, 7102, 7121, 7129, 7135, 7143, 7153,
+    7161, 7167, 7175, 7201, 7209, 7215, 7223, 7233, 7241, 7247, 7255, 7274,
+    7282, 7288, 7296, 7306, 7314, 7320, 7328, 7350, 7358, 7364, 7372, 7382,
+    7390, 7396, 7404, 7423, 7431, 7437, 7445, 7455, 7463, 7469, 7477, 7343,
+    7351, 7357, 7365, 7375, 7383, 7389, 7397, 7416, 7424, 7430, 7438, 7448,
+    7456, 7462, 7470, 7492, 7500, 7506, 7514, 7524, 7532, 7538, 7546, 7565,
+    7573, 7579, 7587, 7597, 7605, 7611, 7619, 7645, 7653, 7659, 7667, 7677,
+    7685, 7691, 7699, 7718, 7726, 7732, 7740, 7750, 7758, 7764, 7772, 7794,
+    7802, 7808, 7816, 7826, 7834, 7840, 7848, 7867, 7875, 7881, 7889, 7899,
+    7907, 7913, 7921, 7719, 7727, 7733, 7741, 7751, 7759, 7765, 7773, 7792,
+    7800, 7806, 7814, 7824, 7832, 7838, 7846, 7868, 7876, 7882, 7890, 7900,
+    7908, 7914, 7922, 7941, 7949, 7955, 7963, 7973, 7981, 7987, 7995, 8021,
+    8029, 8035, 8043, 8053, 8061, 8067, 8075, 8094, 8102, 8108, 8116, 8126,
+    8134, 8140, 8148, 8170, 8178, 8184, 8192, 8202, 8210, 8216, 8224, 8243,
+    8251, 8257, 8265, 8275
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_DCT_VALUE_COST_H_
diff --git a/libs/libvpx/vp8/encoder/dct_value_tokens.h b/libs/libvpx/vp8/encoder/dct_value_tokens.h
new file mode 100644
index 0000000000..c2aadefca7
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/dct_value_tokens.h
@@ -0,0 +1,712 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_DCT_VALUE_TOKENS_H_
+#define VP8_ENCODER_DCT_VALUE_TOKENS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Generated file, included by tokenize.c  */
+/* Values generated by fill_value_tokens() */
+
+static const TOKENVALUE dct_value_tokens[2048*2] =
+{
+    {10, 3963}, {10, 3961}, {10, 3959}, {10, 3957}, {10, 3955}, {10, 3953},
+    {10, 3951}, {10, 3949}, {10, 3947}, {10, 3945}, {10, 3943}, {10, 3941},
+    {10, 3939}, {10, 3937}, {10, 3935}, {10, 3933}, {10, 3931}, {10, 3929},
+    {10, 3927}, {10, 3925}, {10, 3923}, {10, 3921}, {10, 3919}, {10, 3917},
+    {10, 3915}, {10, 3913}, {10, 3911}, {10, 3909}, {10, 3907}, {10, 3905},
+    {10, 3903}, {10, 3901}, {10, 3899}, {10, 3897}, {10, 3895}, {10, 3893},
+    {10, 3891}, {10, 3889}, {10, 3887}, {10, 3885}, {10, 3883}, {10, 3881},
+    {10, 3879}, {10, 3877}, {10, 3875}, {10, 3873}, {10, 3871}, {10, 3869},
+    {10, 3867}, {10, 3865}, {10, 3863}, {10, 3861}, {10, 3859}, {10, 3857},
+    {10, 3855}, {10, 3853}, {10, 3851}, {10, 3849}, {10, 3847}, {10, 3845},
+    {10, 3843}, {10, 3841}, {10, 3839}, {10, 3837}, {10, 3835}, {10, 3833},
+    {10, 3831}, {10, 3829}, {10, 3827}, {10, 3825}, {10, 3823}, {10, 3821},
+    {10, 3819}, {10, 3817}, {10, 3815}, {10, 3813}, {10, 3811}, {10, 3809},
+    {10, 3807}, {10, 3805}, {10, 3803}, {10, 3801}, {10, 3799}, {10, 3797},
+    {10, 3795}, {10, 3793}, {10, 3791}, {10, 3789}, {10, 3787}, {10, 3785},
+    {10, 3783}, {10, 3781}, {10, 3779}, {10, 3777}, {10, 3775}, {10, 3773},
+    {10, 3771}, {10, 3769}, {10, 3767}, {10, 3765}, {10, 3763}, {10, 3761},
+    {10, 3759}, {10, 3757}, {10, 3755}, {10, 3753}, {10, 3751}, {10, 3749},
+    {10, 3747}, {10, 3745}, {10, 3743}, {10, 3741}, {10, 3739}, {10, 3737},
+    {10, 3735}, {10, 3733}, {10, 3731}, {10, 3729}, {10, 3727}, {10, 3725},
+    {10, 3723}, {10, 3721}, {10, 3719}, {10, 3717}, {10, 3715}, {10, 3713},
+    {10, 3711}, {10, 3709}, {10, 3707}, {10, 3705}, {10, 3703}, {10, 3701},
+    {10, 3699}, {10, 3697}, {10, 3695}, {10, 3693}, {10, 3691}, {10, 3689},
+    {10, 3687}, {10, 3685}, {10, 3683}, {10, 3681}, {10, 3679}, {10, 3677},
+    {10, 3675}, {10, 3673}, {10, 3671}, {10, 3669}, {10, 3667}, {10, 3665},
+    {10, 3663}, {10, 3661}, {10, 3659}, {10, 3657}, {10, 3655}, {10, 3653},
+    {10, 3651}, {10, 3649}, {10, 3647}, {10, 3645}, {10, 3643}, {10, 3641},
+    {10, 3639}, {10, 3637}, {10, 3635}, {10, 3633}, {10, 3631}, {10, 3629},
+    {10, 3627}, {10, 3625}, {10, 3623}, {10, 3621}, {10, 3619}, {10, 3617},
+    {10, 3615}, {10, 3613}, {10, 3611}, {10, 3609}, {10, 3607}, {10, 3605},
+    {10, 3603}, {10, 3601}, {10, 3599}, {10, 3597}, {10, 3595}, {10, 3593},
+    {10, 3591}, {10, 3589}, {10, 3587}, {10, 3585}, {10, 3583}, {10, 3581},
+    {10, 3579}, {10, 3577}, {10, 3575}, {10, 3573}, {10, 3571}, {10, 3569},
+    {10, 3567}, {10, 3565}, {10, 3563}, {10, 3561}, {10, 3559}, {10, 3557},
+    {10, 3555}, {10, 3553}, {10, 3551}, {10, 3549}, {10, 3547}, {10, 3545},
+    {10, 3543}, {10, 3541}, {10, 3539}, {10, 3537}, {10, 3535}, {10, 3533},
+    {10, 3531}, {10, 3529}, {10, 3527}, {10, 3525}, {10, 3523}, {10, 3521},
+    {10, 3519}, {10, 3517}, {10, 3515}, {10, 3513}, {10, 3511}, {10, 3509},
+    {10, 3507}, {10, 3505}, {10, 3503}, {10, 3501}, {10, 3499}, {10, 3497},
+    {10, 3495}, {10, 3493}, {10, 3491}, {10, 3489}, {10, 3487}, {10, 3485},
+    {10, 3483}, {10, 3481}, {10, 3479}, {10, 3477}, {10, 3475}, {10, 3473},
+    {10, 3471}, {10, 3469}, {10, 3467}, {10, 3465}, {10, 3463}, {10, 3461},
+    {10, 3459}, {10, 3457}, {10, 3455}, {10, 3453}, {10, 3451}, {10, 3449},
+    {10, 3447}, {10, 3445}, {10, 3443}, {10, 3441}, {10, 3439}, {10, 3437},
+    {10, 3435}, {10, 3433}, {10, 3431}, {10, 3429}, {10, 3427}, {10, 3425},
+    {10, 3423}, {10, 3421}, {10, 3419}, {10, 3417}, {10, 3415}, {10, 3413},
+    {10, 3411}, {10, 3409}, {10, 3407}, {10, 3405}, {10, 3403}, {10, 3401},
+    {10, 3399}, {10, 3397}, {10, 3395}, {10, 3393}, {10, 3391}, {10, 3389},
+    {10, 3387}, {10, 3385}, {10, 3383}, {10, 3381}, {10, 3379}, {10, 3377},
+    {10, 3375}, {10, 3373}, {10, 3371}, {10, 3369}, {10, 3367}, {10, 3365},
+    {10, 3363}, {10, 3361}, {10, 3359}, {10, 3357}, {10, 3355}, {10, 3353},
+    {10, 3351}, {10, 3349}, {10, 3347}, {10, 3345}, {10, 3343}, {10, 3341},
+    {10, 3339}, {10, 3337}, {10, 3335}, {10, 3333}, {10, 3331}, {10, 3329},
+    {10, 3327}, {10, 3325}, {10, 3323}, {10, 3321}, {10, 3319}, {10, 3317},
+    {10, 3315}, {10, 3313}, {10, 3311}, {10, 3309}, {10, 3307}, {10, 3305},
+    {10, 3303}, {10, 3301}, {10, 3299}, {10, 3297}, {10, 3295}, {10, 3293},
+    {10, 3291}, {10, 3289}, {10, 3287}, {10, 3285}, {10, 3283}, {10, 3281},
+    {10, 3279}, {10, 3277}, {10, 3275}, {10, 3273}, {10, 3271}, {10, 3269},
+    {10, 3267}, {10, 3265}, {10, 3263}, {10, 3261}, {10, 3259}, {10, 3257},
+    {10, 3255}, {10, 3253}, {10, 3251}, {10, 3249}, {10, 3247}, {10, 3245},
+    {10, 3243}, {10, 3241}, {10, 3239}, {10, 3237}, {10, 3235}, {10, 3233},
+    {10, 3231}, {10, 3229}, {10, 3227}, {10, 3225}, {10, 3223}, {10, 3221},
+    {10, 3219}, {10, 3217}, {10, 3215}, {10, 3213}, {10, 3211}, {10, 3209},
+    {10, 3207}, {10, 3205}, {10, 3203}, {10, 3201}, {10, 3199}, {10, 3197},
+    {10, 3195}, {10, 3193}, {10, 3191}, {10, 3189}, {10, 3187}, {10, 3185},
+    {10, 3183}, {10, 3181}, {10, 3179}, {10, 3177}, {10, 3175}, {10, 3173},
+    {10, 3171}, {10, 3169}, {10, 3167}, {10, 3165}, {10, 3163}, {10, 3161},
+    {10, 3159}, {10, 3157}, {10, 3155}, {10, 3153}, {10, 3151}, {10, 3149},
+    {10, 3147}, {10, 3145}, {10, 3143}, {10, 3141}, {10, 3139}, {10, 3137},
+    {10, 3135}, {10, 3133}, {10, 3131}, {10, 3129}, {10, 3127}, {10, 3125},
+    {10, 3123}, {10, 3121}, {10, 3119}, {10, 3117}, {10, 3115}, {10, 3113},
+    {10, 3111}, {10, 3109}, {10, 3107}, {10, 3105}, {10, 3103}, {10, 3101},
+    {10, 3099}, {10, 3097}, {10, 3095}, {10, 3093}, {10, 3091}, {10, 3089},
+    {10, 3087}, {10, 3085}, {10, 3083}, {10, 3081}, {10, 3079}, {10, 3077},
+    {10, 3075}, {10, 3073}, {10, 3071}, {10, 3069}, {10, 3067}, {10, 3065},
+    {10, 3063}, {10, 3061}, {10, 3059}, {10, 3057}, {10, 3055}, {10, 3053},
+    {10, 3051}, {10, 3049}, {10, 3047}, {10, 3045}, {10, 3043}, {10, 3041},
+    {10, 3039}, {10, 3037}, {10, 3035}, {10, 3033}, {10, 3031}, {10, 3029},
+    {10, 3027}, {10, 3025}, {10, 3023}, {10, 3021}, {10, 3019}, {10, 3017},
+    {10, 3015}, {10, 3013}, {10, 3011}, {10, 3009}, {10, 3007}, {10, 3005},
+    {10, 3003}, {10, 3001}, {10, 2999}, {10, 2997}, {10, 2995}, {10, 2993},
+    {10, 2991}, {10, 2989}, {10, 2987}, {10, 2985}, {10, 2983}, {10, 2981},
+    {10, 2979}, {10, 2977}, {10, 2975}, {10, 2973}, {10, 2971}, {10, 2969},
+    {10, 2967}, {10, 2965}, {10, 2963}, {10, 2961}, {10, 2959}, {10, 2957},
+    {10, 2955}, {10, 2953}, {10, 2951}, {10, 2949}, {10, 2947}, {10, 2945},
+    {10, 2943}, {10, 2941}, {10, 2939}, {10, 2937}, {10, 2935}, {10, 2933},
+    {10, 2931}, {10, 2929}, {10, 2927}, {10, 2925}, {10, 2923}, {10, 2921},
+    {10, 2919}, {10, 2917}, {10, 2915}, {10, 2913}, {10, 2911}, {10, 2909},
+    {10, 2907}, {10, 2905}, {10, 2903}, {10, 2901}, {10, 2899}, {10, 2897},
+    {10, 2895}, {10, 2893}, {10, 2891}, {10, 2889}, {10, 2887}, {10, 2885},
+    {10, 2883}, {10, 2881}, {10, 2879}, {10, 2877}, {10, 2875}, {10, 2873},
+    {10, 2871}, {10, 2869}, {10, 2867}, {10, 2865}, {10, 2863}, {10, 2861},
+    {10, 2859}, {10, 2857}, {10, 2855}, {10, 2853}, {10, 2851}, {10, 2849},
+    {10, 2847}, {10, 2845}, {10, 2843}, {10, 2841}, {10, 2839}, {10, 2837},
+    {10, 2835}, {10, 2833}, {10, 2831}, {10, 2829}, {10, 2827}, {10, 2825},
+    {10, 2823}, {10, 2821}, {10, 2819}, {10, 2817}, {10, 2815}, {10, 2813},
+    {10, 2811}, {10, 2809}, {10, 2807}, {10, 2805}, {10, 2803}, {10, 2801},
+    {10, 2799}, {10, 2797}, {10, 2795}, {10, 2793}, {10, 2791}, {10, 2789},
+    {10, 2787}, {10, 2785}, {10, 2783}, {10, 2781}, {10, 2779}, {10, 2777},
+    {10, 2775}, {10, 2773}, {10, 2771}, {10, 2769}, {10, 2767}, {10, 2765},
+    {10, 2763}, {10, 2761}, {10, 2759}, {10, 2757}, {10, 2755}, {10, 2753},
+    {10, 2751}, {10, 2749}, {10, 2747}, {10, 2745}, {10, 2743}, {10, 2741},
+    {10, 2739}, {10, 2737}, {10, 2735}, {10, 2733}, {10, 2731}, {10, 2729},
+    {10, 2727}, {10, 2725}, {10, 2723}, {10, 2721}, {10, 2719}, {10, 2717},
+    {10, 2715}, {10, 2713}, {10, 2711}, {10, 2709}, {10, 2707}, {10, 2705},
+    {10, 2703}, {10, 2701}, {10, 2699}, {10, 2697}, {10, 2695}, {10, 2693},
+    {10, 2691}, {10, 2689}, {10, 2687}, {10, 2685}, {10, 2683}, {10, 2681},
+    {10, 2679}, {10, 2677}, {10, 2675}, {10, 2673}, {10, 2671}, {10, 2669},
+    {10, 2667}, {10, 2665}, {10, 2663}, {10, 2661}, {10, 2659}, {10, 2657},
+    {10, 2655}, {10, 2653}, {10, 2651}, {10, 2649}, {10, 2647}, {10, 2645},
+    {10, 2643}, {10, 2641}, {10, 2639}, {10, 2637}, {10, 2635}, {10, 2633},
+    {10, 2631}, {10, 2629}, {10, 2627}, {10, 2625}, {10, 2623}, {10, 2621},
+    {10, 2619}, {10, 2617}, {10, 2615}, {10, 2613}, {10, 2611}, {10, 2609},
+    {10, 2607}, {10, 2605}, {10, 2603}, {10, 2601}, {10, 2599}, {10, 2597},
+    {10, 2595}, {10, 2593}, {10, 2591}, {10, 2589}, {10, 2587}, {10, 2585},
+    {10, 2583}, {10, 2581}, {10, 2579}, {10, 2577}, {10, 2575}, {10, 2573},
+    {10, 2571}, {10, 2569}, {10, 2567}, {10, 2565}, {10, 2563}, {10, 2561},
+    {10, 2559}, {10, 2557}, {10, 2555}, {10, 2553}, {10, 2551}, {10, 2549},
+    {10, 2547}, {10, 2545}, {10, 2543}, {10, 2541}, {10, 2539}, {10, 2537},
+    {10, 2535}, {10, 2533}, {10, 2531}, {10, 2529}, {10, 2527}, {10, 2525},
+    {10, 2523}, {10, 2521}, {10, 2519}, {10, 2517}, {10, 2515}, {10, 2513},
+    {10, 2511}, {10, 2509}, {10, 2507}, {10, 2505}, {10, 2503}, {10, 2501},
+    {10, 2499}, {10, 2497}, {10, 2495}, {10, 2493}, {10, 2491}, {10, 2489},
+    {10, 2487}, {10, 2485}, {10, 2483}, {10, 2481}, {10, 2479}, {10, 2477},
+    {10, 2475}, {10, 2473}, {10, 2471}, {10, 2469}, {10, 2467}, {10, 2465},
+    {10, 2463}, {10, 2461}, {10, 2459}, {10, 2457}, {10, 2455}, {10, 2453},
+    {10, 2451}, {10, 2449}, {10, 2447}, {10, 2445}, {10, 2443}, {10, 2441},
+    {10, 2439}, {10, 2437}, {10, 2435}, {10, 2433}, {10, 2431}, {10, 2429},
+    {10, 2427}, {10, 2425}, {10, 2423}, {10, 2421}, {10, 2419}, {10, 2417},
+    {10, 2415}, {10, 2413}, {10, 2411}, {10, 2409}, {10, 2407}, {10, 2405},
+    {10, 2403}, {10, 2401}, {10, 2399}, {10, 2397}, {10, 2395}, {10, 2393},
+    {10, 2391}, {10, 2389}, {10, 2387}, {10, 2385}, {10, 2383}, {10, 2381},
+    {10, 2379}, {10, 2377}, {10, 2375}, {10, 2373}, {10, 2371}, {10, 2369},
+    {10, 2367}, {10, 2365}, {10, 2363}, {10, 2361}, {10, 2359}, {10, 2357},
+    {10, 2355}, {10, 2353}, {10, 2351}, {10, 2349}, {10, 2347}, {10, 2345},
+    {10, 2343}, {10, 2341}, {10, 2339}, {10, 2337}, {10, 2335}, {10, 2333},
+    {10, 2331}, {10, 2329}, {10, 2327}, {10, 2325}, {10, 2323}, {10, 2321},
+    {10, 2319}, {10, 2317}, {10, 2315}, {10, 2313}, {10, 2311}, {10, 2309},
+    {10, 2307}, {10, 2305}, {10, 2303}, {10, 2301}, {10, 2299}, {10, 2297},
+    {10, 2295}, {10, 2293}, {10, 2291}, {10, 2289}, {10, 2287}, {10, 2285},
+    {10, 2283}, {10, 2281}, {10, 2279}, {10, 2277}, {10, 2275}, {10, 2273},
+    {10, 2271}, {10, 2269}, {10, 2267}, {10, 2265}, {10, 2263}, {10, 2261},
+    {10, 2259}, {10, 2257}, {10, 2255}, {10, 2253}, {10, 2251}, {10, 2249},
+    {10, 2247}, {10, 2245}, {10, 2243}, {10, 2241}, {10, 2239}, {10, 2237},
+    {10, 2235}, {10, 2233}, {10, 2231}, {10, 2229}, {10, 2227}, {10, 2225},
+    {10, 2223}, {10, 2221}, {10, 2219}, {10, 2217}, {10, 2215}, {10, 2213},
+    {10, 2211}, {10, 2209}, {10, 2207}, {10, 2205}, {10, 2203}, {10, 2201},
+    {10, 2199}, {10, 2197}, {10, 2195}, {10, 2193}, {10, 2191}, {10, 2189},
+    {10, 2187}, {10, 2185}, {10, 2183}, {10, 2181}, {10, 2179}, {10, 2177},
+    {10, 2175}, {10, 2173}, {10, 2171}, {10, 2169}, {10, 2167}, {10, 2165},
+    {10, 2163}, {10, 2161}, {10, 2159}, {10, 2157}, {10, 2155}, {10, 2153},
+    {10, 2151}, {10, 2149}, {10, 2147}, {10, 2145}, {10, 2143}, {10, 2141},
+    {10, 2139}, {10, 2137}, {10, 2135}, {10, 2133}, {10, 2131}, {10, 2129},
+    {10, 2127}, {10, 2125}, {10, 2123}, {10, 2121}, {10, 2119}, {10, 2117},
+    {10, 2115}, {10, 2113}, {10, 2111}, {10, 2109}, {10, 2107}, {10, 2105},
+    {10, 2103}, {10, 2101}, {10, 2099}, {10, 2097}, {10, 2095}, {10, 2093},
+    {10, 2091}, {10, 2089}, {10, 2087}, {10, 2085}, {10, 2083}, {10, 2081},
+    {10, 2079}, {10, 2077}, {10, 2075}, {10, 2073}, {10, 2071}, {10, 2069},
+    {10, 2067}, {10, 2065}, {10, 2063}, {10, 2061}, {10, 2059}, {10, 2057},
+    {10, 2055}, {10, 2053}, {10, 2051}, {10, 2049}, {10, 2047}, {10, 2045},
+    {10, 2043}, {10, 2041}, {10, 2039}, {10, 2037}, {10, 2035}, {10, 2033},
+    {10, 2031}, {10, 2029}, {10, 2027}, {10, 2025}, {10, 2023}, {10, 2021},
+    {10, 2019}, {10, 2017}, {10, 2015}, {10, 2013}, {10, 2011}, {10, 2009},
+    {10, 2007}, {10, 2005}, {10, 2003}, {10, 2001}, {10, 1999}, {10, 1997},
+    {10, 1995}, {10, 1993}, {10, 1991}, {10, 1989}, {10, 1987}, {10, 1985},
+    {10, 1983}, {10, 1981}, {10, 1979}, {10, 1977}, {10, 1975}, {10, 1973},
+    {10, 1971}, {10, 1969}, {10, 1967}, {10, 1965}, {10, 1963}, {10, 1961},
+    {10, 1959}, {10, 1957}, {10, 1955}, {10, 1953}, {10, 1951}, {10, 1949},
+    {10, 1947}, {10, 1945}, {10, 1943}, {10, 1941}, {10, 1939}, {10, 1937},
+    {10, 1935}, {10, 1933}, {10, 1931}, {10, 1929}, {10, 1927}, {10, 1925},
+    {10, 1923}, {10, 1921}, {10, 1919}, {10, 1917}, {10, 1915}, {10, 1913},
+    {10, 1911}, {10, 1909}, {10, 1907}, {10, 1905}, {10, 1903}, {10, 1901},
+    {10, 1899}, {10, 1897}, {10, 1895}, {10, 1893}, {10, 1891}, {10, 1889},
+    {10, 1887}, {10, 1885}, {10, 1883}, {10, 1881}, {10, 1879}, {10, 1877},
+    {10, 1875}, {10, 1873}, {10, 1871}, {10, 1869}, {10, 1867}, {10, 1865},
+    {10, 1863}, {10, 1861}, {10, 1859}, {10, 1857}, {10, 1855}, {10, 1853},
+    {10, 1851}, {10, 1849}, {10, 1847}, {10, 1845}, {10, 1843}, {10, 1841},
+    {10, 1839}, {10, 1837}, {10, 1835}, {10, 1833}, {10, 1831}, {10, 1829},
+    {10, 1827}, {10, 1825}, {10, 1823}, {10, 1821}, {10, 1819}, {10, 1817},
+    {10, 1815}, {10, 1813}, {10, 1811}, {10, 1809}, {10, 1807}, {10, 1805},
+    {10, 1803}, {10, 1801}, {10, 1799}, {10, 1797}, {10, 1795}, {10, 1793},
+    {10, 1791}, {10, 1789}, {10, 1787}, {10, 1785}, {10, 1783}, {10, 1781},
+    {10, 1779}, {10, 1777}, {10, 1775}, {10, 1773}, {10, 1771}, {10, 1769},
+    {10, 1767}, {10, 1765}, {10, 1763}, {10, 1761}, {10, 1759}, {10, 1757},
+    {10, 1755}, {10, 1753}, {10, 1751}, {10, 1749}, {10, 1747}, {10, 1745},
+    {10, 1743}, {10, 1741}, {10, 1739}, {10, 1737}, {10, 1735}, {10, 1733},
+    {10, 1731}, {10, 1729}, {10, 1727}, {10, 1725}, {10, 1723}, {10, 1721},
+    {10, 1719}, {10, 1717}, {10, 1715}, {10, 1713}, {10, 1711}, {10, 1709},
+    {10, 1707}, {10, 1705}, {10, 1703}, {10, 1701}, {10, 1699}, {10, 1697},
+    {10, 1695}, {10, 1693}, {10, 1691}, {10, 1689}, {10, 1687}, {10, 1685},
+    {10, 1683}, {10, 1681}, {10, 1679}, {10, 1677}, {10, 1675}, {10, 1673},
+    {10, 1671}, {10, 1669}, {10, 1667}, {10, 1665}, {10, 1663}, {10, 1661},
+    {10, 1659}, {10, 1657}, {10, 1655}, {10, 1653}, {10, 1651}, {10, 1649},
+    {10, 1647}, {10, 1645}, {10, 1643}, {10, 1641}, {10, 1639}, {10, 1637},
+    {10, 1635}, {10, 1633}, {10, 1631}, {10, 1629}, {10, 1627}, {10, 1625},
+    {10, 1623}, {10, 1621}, {10, 1619}, {10, 1617}, {10, 1615}, {10, 1613},
+    {10, 1611}, {10, 1609}, {10, 1607}, {10, 1605}, {10, 1603}, {10, 1601},
+    {10, 1599}, {10, 1597}, {10, 1595}, {10, 1593}, {10, 1591}, {10, 1589},
+    {10, 1587}, {10, 1585}, {10, 1583}, {10, 1581}, {10, 1579}, {10, 1577},
+    {10, 1575}, {10, 1573}, {10, 1571}, {10, 1569}, {10, 1567}, {10, 1565},
+    {10, 1563}, {10, 1561}, {10, 1559}, {10, 1557}, {10, 1555}, {10, 1553},
+    {10, 1551}, {10, 1549}, {10, 1547}, {10, 1545}, {10, 1543}, {10, 1541},
+    {10, 1539}, {10, 1537}, {10, 1535}, {10, 1533}, {10, 1531}, {10, 1529},
+    {10, 1527}, {10, 1525}, {10, 1523}, {10, 1521}, {10, 1519}, {10, 1517},
+    {10, 1515}, {10, 1513}, {10, 1511}, {10, 1509}, {10, 1507}, {10, 1505},
+    {10, 1503}, {10, 1501}, {10, 1499}, {10, 1497}, {10, 1495}, {10, 1493},
+    {10, 1491}, {10, 1489}, {10, 1487}, {10, 1485}, {10, 1483}, {10, 1481},
+    {10, 1479}, {10, 1477}, {10, 1475}, {10, 1473}, {10, 1471}, {10, 1469},
+    {10, 1467}, {10, 1465}, {10, 1463}, {10, 1461}, {10, 1459}, {10, 1457},
+    {10, 1455}, {10, 1453}, {10, 1451}, {10, 1449}, {10, 1447}, {10, 1445},
+    {10, 1443}, {10, 1441}, {10, 1439}, {10, 1437}, {10, 1435}, {10, 1433},
+    {10, 1431}, {10, 1429}, {10, 1427}, {10, 1425}, {10, 1423}, {10, 1421},
+    {10, 1419}, {10, 1417}, {10, 1415}, {10, 1413}, {10, 1411}, {10, 1409},
+    {10, 1407}, {10, 1405}, {10, 1403}, {10, 1401}, {10, 1399}, {10, 1397},
+    {10, 1395}, {10, 1393}, {10, 1391}, {10, 1389}, {10, 1387}, {10, 1385},
+    {10, 1383}, {10, 1381}, {10, 1379}, {10, 1377}, {10, 1375}, {10, 1373},
+    {10, 1371}, {10, 1369}, {10, 1367}, {10, 1365}, {10, 1363}, {10, 1361},
+    {10, 1359}, {10, 1357}, {10, 1355}, {10, 1353}, {10, 1351}, {10, 1349},
+    {10, 1347}, {10, 1345}, {10, 1343}, {10, 1341}, {10, 1339}, {10, 1337},
+    {10, 1335}, {10, 1333}, {10, 1331}, {10, 1329}, {10, 1327}, {10, 1325},
+    {10, 1323}, {10, 1321}, {10, 1319}, {10, 1317}, {10, 1315}, {10, 1313},
+    {10, 1311}, {10, 1309}, {10, 1307}, {10, 1305}, {10, 1303}, {10, 1301},
+    {10, 1299}, {10, 1297}, {10, 1295}, {10, 1293}, {10, 1291}, {10, 1289},
+    {10, 1287}, {10, 1285}, {10, 1283}, {10, 1281}, {10, 1279}, {10, 1277},
+    {10, 1275}, {10, 1273}, {10, 1271}, {10, 1269}, {10, 1267}, {10, 1265},
+    {10, 1263}, {10, 1261}, {10, 1259}, {10, 1257}, {10, 1255}, {10, 1253},
+    {10, 1251}, {10, 1249}, {10, 1247}, {10, 1245}, {10, 1243}, {10, 1241},
+    {10, 1239}, {10, 1237}, {10, 1235}, {10, 1233}, {10, 1231}, {10, 1229},
+    {10, 1227}, {10, 1225}, {10, 1223}, {10, 1221}, {10, 1219}, {10, 1217},
+    {10, 1215}, {10, 1213}, {10, 1211}, {10, 1209}, {10, 1207}, {10, 1205},
+    {10, 1203}, {10, 1201}, {10, 1199}, {10, 1197}, {10, 1195}, {10, 1193},
+    {10, 1191}, {10, 1189}, {10, 1187}, {10, 1185}, {10, 1183}, {10, 1181},
+    {10, 1179}, {10, 1177}, {10, 1175}, {10, 1173}, {10, 1171}, {10, 1169},
+    {10, 1167}, {10, 1165}, {10, 1163}, {10, 1161}, {10, 1159}, {10, 1157},
+    {10, 1155}, {10, 1153}, {10, 1151}, {10, 1149}, {10, 1147}, {10, 1145},
+    {10, 1143}, {10, 1141}, {10, 1139}, {10, 1137}, {10, 1135}, {10, 1133},
+    {10, 1131}, {10, 1129}, {10, 1127}, {10, 1125}, {10, 1123}, {10, 1121},
+    {10, 1119}, {10, 1117}, {10, 1115}, {10, 1113}, {10, 1111}, {10, 1109},
+    {10, 1107}, {10, 1105}, {10, 1103}, {10, 1101}, {10, 1099}, {10, 1097},
+    {10, 1095}, {10, 1093}, {10, 1091}, {10, 1089}, {10, 1087}, {10, 1085},
+    {10, 1083}, {10, 1081}, {10, 1079}, {10, 1077}, {10, 1075}, {10, 1073},
+    {10, 1071}, {10, 1069}, {10, 1067}, {10, 1065}, {10, 1063}, {10, 1061},
+    {10, 1059}, {10, 1057}, {10, 1055}, {10, 1053}, {10, 1051}, {10, 1049},
+    {10, 1047}, {10, 1045}, {10, 1043}, {10, 1041}, {10, 1039}, {10, 1037},
+    {10, 1035}, {10, 1033}, {10, 1031}, {10, 1029}, {10, 1027}, {10, 1025},
+    {10, 1023}, {10, 1021}, {10, 1019}, {10, 1017}, {10, 1015}, {10, 1013},
+    {10, 1011}, {10, 1009}, {10, 1007}, {10, 1005}, {10, 1003}, {10, 1001},
+    {10, 999}, {10, 997}, {10, 995}, {10, 993}, {10, 991}, {10, 989},
+    {10, 987}, {10, 985}, {10, 983}, {10, 981}, {10, 979}, {10, 977},
+    {10, 975}, {10, 973}, {10, 971}, {10, 969}, {10, 967}, {10, 965},
+    {10, 963}, {10, 961}, {10, 959}, {10, 957}, {10, 955}, {10, 953},
+    {10, 951}, {10, 949}, {10, 947}, {10, 945}, {10, 943}, {10, 941},
+    {10, 939}, {10, 937}, {10, 935}, {10, 933}, {10, 931}, {10, 929},
+    {10, 927}, {10, 925}, {10, 923}, {10, 921}, {10, 919}, {10, 917},
+    {10, 915}, {10, 913}, {10, 911}, {10, 909}, {10, 907}, {10, 905},
+    {10, 903}, {10, 901}, {10, 899}, {10, 897}, {10, 895}, {10, 893},
+    {10, 891}, {10, 889}, {10, 887}, {10, 885}, {10, 883}, {10, 881},
+    {10, 879}, {10, 877}, {10, 875}, {10, 873}, {10, 871}, {10, 869},
+    {10, 867}, {10, 865}, {10, 863}, {10, 861}, {10, 859}, {10, 857},
+    {10, 855}, {10, 853}, {10, 851}, {10, 849}, {10, 847}, {10, 845},
+    {10, 843}, {10, 841}, {10, 839}, {10, 837}, {10, 835}, {10, 833},
+    {10, 831}, {10, 829}, {10, 827}, {10, 825}, {10, 823}, {10, 821},
+    {10, 819}, {10, 817}, {10, 815}, {10, 813}, {10, 811}, {10, 809},
+    {10, 807}, {10, 805}, {10, 803}, {10, 801}, {10, 799}, {10, 797},
+    {10, 795}, {10, 793}, {10, 791}, {10, 789}, {10, 787}, {10, 785},
+    {10, 783}, {10, 781}, {10, 779}, {10, 777}, {10, 775}, {10, 773},
+    {10, 771}, {10, 769}, {10, 767}, {10, 765}, {10, 763}, {10, 761},
+    {10, 759}, {10, 757}, {10, 755}, {10, 753}, {10, 751}, {10, 749},
+    {10, 747}, {10, 745}, {10, 743}, {10, 741}, {10, 739}, {10, 737},
+    {10, 735}, {10, 733}, {10, 731}, {10, 729}, {10, 727}, {10, 725},
+    {10, 723}, {10, 721}, {10, 719}, {10, 717}, {10, 715}, {10, 713},
+    {10, 711}, {10, 709}, {10, 707}, {10, 705}, {10, 703}, {10, 701},
+    {10, 699}, {10, 697}, {10, 695}, {10, 693}, {10, 691}, {10, 689},
+    {10, 687}, {10, 685}, {10, 683}, {10, 681}, {10, 679}, {10, 677},
+    {10, 675}, {10, 673}, {10, 671}, {10, 669}, {10, 667}, {10, 665},
+    {10, 663}, {10, 661}, {10, 659}, {10, 657}, {10, 655}, {10, 653},
+    {10, 651}, {10, 649}, {10, 647}, {10, 645}, {10, 643}, {10, 641},
+    {10, 639}, {10, 637}, {10, 635}, {10, 633}, {10, 631}, {10, 629},
+    {10, 627}, {10, 625}, {10, 623}, {10, 621}, {10, 619}, {10, 617},
+    {10, 615}, {10, 613}, {10, 611}, {10, 609}, {10, 607}, {10, 605},
+    {10, 603}, {10, 601}, {10, 599}, {10, 597}, {10, 595}, {10, 593},
+    {10, 591}, {10, 589}, {10, 587}, {10, 585}, {10, 583}, {10, 581},
+    {10, 579}, {10, 577}, {10, 575}, {10, 573}, {10, 571}, {10, 569},
+    {10, 567}, {10, 565}, {10, 563}, {10, 561}, {10, 559}, {10, 557},
+    {10, 555}, {10, 553}, {10, 551}, {10, 549}, {10, 547}, {10, 545},
+    {10, 543}, {10, 541}, {10, 539}, {10, 537}, {10, 535}, {10, 533},
+    {10, 531}, {10, 529}, {10, 527}, {10, 525}, {10, 523}, {10, 521},
+    {10, 519}, {10, 517}, {10, 515}, {10, 513}, {10, 511}, {10, 509},
+    {10, 507}, {10, 505}, {10, 503}, {10, 501}, {10, 499}, {10, 497},
+    {10, 495}, {10, 493}, {10, 491}, {10, 489}, {10, 487}, {10, 485},
+    {10, 483}, {10, 481}, {10, 479}, {10, 477}, {10, 475}, {10, 473},
+    {10, 471}, {10, 469}, {10, 467}, {10, 465}, {10, 463}, {10, 461},
+    {10, 459}, {10, 457}, {10, 455}, {10, 453}, {10, 451}, {10, 449},
+    {10, 447}, {10, 445}, {10, 443}, {10, 441}, {10, 439}, {10, 437},
+    {10, 435}, {10, 433}, {10, 431}, {10, 429}, {10, 427}, {10, 425},
+    {10, 423}, {10, 421}, {10, 419}, {10, 417}, {10, 415}, {10, 413},
+    {10, 411}, {10, 409}, {10, 407}, {10, 405}, {10, 403}, {10, 401},
+    {10, 399}, {10, 397}, {10, 395}, {10, 393}, {10, 391}, {10, 389},
+    {10, 387}, {10, 385}, {10, 383}, {10, 381}, {10, 379}, {10, 377},
+    {10, 375}, {10, 373}, {10, 371}, {10, 369}, {10, 367}, {10, 365},
+    {10, 363}, {10, 361}, {10, 359}, {10, 357}, {10, 355}, {10, 353},
+    {10, 351}, {10, 349}, {10, 347}, {10, 345}, {10, 343}, {10, 341},
+    {10, 339}, {10, 337}, {10, 335}, {10, 333}, {10, 331}, {10, 329},
+    {10, 327}, {10, 325}, {10, 323}, {10, 321}, {10, 319}, {10, 317},
+    {10, 315}, {10, 313}, {10, 311}, {10, 309}, {10, 307}, {10, 305},
+    {10, 303}, {10, 301}, {10, 299}, {10, 297}, {10, 295}, {10, 293},
+    {10, 291}, {10, 289}, {10, 287}, {10, 285}, {10, 283}, {10, 281},
+    {10, 279}, {10, 277}, {10, 275}, {10, 273}, {10, 271}, {10, 269},
+    {10, 267}, {10, 265}, {10, 263}, {10, 261}, {10, 259}, {10, 257},
+    {10, 255}, {10, 253}, {10, 251}, {10, 249}, {10, 247}, {10, 245},
+    {10, 243}, {10, 241}, {10, 239}, {10, 237}, {10, 235}, {10, 233},
+    {10, 231}, {10, 229}, {10, 227}, {10, 225}, {10, 223}, {10, 221},
+    {10, 219}, {10, 217}, {10, 215}, {10, 213}, {10, 211}, {10, 209},
+    {10, 207}, {10, 205}, {10, 203}, {10, 201}, {10, 199}, {10, 197},
+    {10, 195}, {10, 193}, {10, 191}, {10, 189}, {10, 187}, {10, 185},
+    {10, 183}, {10, 181}, {10, 179}, {10, 177}, {10, 175}, {10, 173},
+    {10, 171}, {10, 169}, {10, 167}, {10, 165}, {10, 163}, {10, 161},
+    {10, 159}, {10, 157}, {10, 155}, {10, 153}, {10, 151}, {10, 149},
+    {10, 147}, {10, 145}, {10, 143}, {10, 141}, {10, 139}, {10, 137},
+    {10, 135}, {10, 133}, {10, 131}, {10, 129}, {10, 127}, {10, 125},
+    {10, 123}, {10, 121}, {10, 119}, {10, 117}, {10, 115}, {10, 113},
+    {10, 111}, {10, 109}, {10, 107}, {10, 105}, {10, 103}, {10, 101},
+    {10, 99}, {10, 97}, {10, 95}, {10, 93}, {10, 91}, {10, 89},
+    {10, 87}, {10, 85}, {10, 83}, {10, 81}, {10, 79}, {10, 77},
+    {10, 75}, {10, 73}, {10, 71}, {10, 69}, {10, 67}, {10, 65},
+    {10, 63}, {10, 61}, {10, 59}, {10, 57}, {10, 55}, {10, 53},
+    {10, 51}, {10, 49}, {10, 47}, {10, 45}, {10, 43}, {10, 41},
+    {10, 39}, {10, 37}, {10, 35}, {10, 33}, {10, 31}, {10, 29},
+    {10, 27}, {10, 25}, {10, 23}, {10, 21}, {10, 19}, {10, 17},
+    {10, 15}, {10, 13}, {10, 11}, {10, 9}, {10, 7}, {10, 5},
+    {10, 3}, {10, 1}, {9, 63}, {9, 61}, {9, 59}, {9, 57},
+    {9, 55}, {9, 53}, {9, 51}, {9, 49}, {9, 47}, {9, 45},
+    {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33},
+    {9, 31}, {9, 29}, {9, 27}, {9, 25}, {9, 23}, {9, 21},
+    {9, 19}, {9, 17}, {9, 15}, {9, 13}, {9, 11}, {9, 9},
+    {9, 7}, {9, 5}, {9, 3}, {9, 1}, {8, 31}, {8, 29},
+    {8, 27}, {8, 25}, {8, 23}, {8, 21}, {8, 19}, {8, 17},
+    {8, 15}, {8, 13}, {8, 11}, {8, 9}, {8, 7}, {8, 5},
+    {8, 3}, {8, 1}, {7, 15}, {7, 13}, {7, 11}, {7, 9},
+    {7, 7}, {7, 5}, {7, 3}, {7, 1}, {6, 7}, {6, 5},
+    {6, 3}, {6, 1}, {5, 3}, {5, 1}, {4, 1}, {3, 1},
+    {2, 1}, {1, 1}, {0, 0}, {1, 0}, {2, 0}, {3, 0},
+    {4, 0}, {5, 0}, {5, 2}, {6, 0}, {6, 2}, {6, 4},
+    {6, 6}, {7, 0}, {7, 2}, {7, 4}, {7, 6}, {7, 8},
+    {7, 10}, {7, 12}, {7, 14}, {8, 0}, {8, 2}, {8, 4},
+    {8, 6}, {8, 8}, {8, 10}, {8, 12}, {8, 14}, {8, 16},
+    {8, 18}, {8, 20}, {8, 22}, {8, 24}, {8, 26}, {8, 28},
+    {8, 30}, {9, 0}, {9, 2}, {9, 4}, {9, 6}, {9, 8},
+    {9, 10}, {9, 12}, {9, 14}, {9, 16}, {9, 18}, {9, 20},
+    {9, 22}, {9, 24}, {9, 26}, {9, 28}, {9, 30}, {9, 32},
+    {9, 34}, {9, 36}, {9, 38}, {9, 40}, {9, 42}, {9, 44},
+    {9, 46}, {9, 48}, {9, 50}, {9, 52}, {9, 54}, {9, 56},
+    {9, 58}, {9, 60}, {9, 62}, {10, 0}, {10, 2}, {10, 4},
+    {10, 6}, {10, 8}, {10, 10}, {10, 12}, {10, 14}, {10, 16},
+    {10, 18}, {10, 20}, {10, 22}, {10, 24}, {10, 26}, {10, 28},
+    {10, 30}, {10, 32}, {10, 34}, {10, 36}, {10, 38}, {10, 40},
+    {10, 42}, {10, 44}, {10, 46}, {10, 48}, {10, 50}, {10, 52},
+    {10, 54}, {10, 56}, {10, 58}, {10, 60}, {10, 62}, {10, 64},
+    {10, 66}, {10, 68}, {10, 70}, {10, 72}, {10, 74}, {10, 76},
+    {10, 78}, {10, 80}, {10, 82}, {10, 84}, {10, 86}, {10, 88},
+    {10, 90}, {10, 92}, {10, 94}, {10, 96}, {10, 98}, {10, 100},
+    {10, 102}, {10, 104}, {10, 106}, {10, 108}, {10, 110}, {10, 112},
+    {10, 114}, {10, 116}, {10, 118}, {10, 120}, {10, 122}, {10, 124},
+    {10, 126}, {10, 128}, {10, 130}, {10, 132}, {10, 134}, {10, 136},
+    {10, 138}, {10, 140}, {10, 142}, {10, 144}, {10, 146}, {10, 148},
+    {10, 150}, {10, 152}, {10, 154}, {10, 156}, {10, 158}, {10, 160},
+    {10, 162}, {10, 164}, {10, 166}, {10, 168}, {10, 170}, {10, 172},
+    {10, 174}, {10, 176}, {10, 178}, {10, 180}, {10, 182}, {10, 184},
+    {10, 186}, {10, 188}, {10, 190}, {10, 192}, {10, 194}, {10, 196},
+    {10, 198}, {10, 200}, {10, 202}, {10, 204}, {10, 206}, {10, 208},
+    {10, 210}, {10, 212}, {10, 214}, {10, 216}, {10, 218}, {10, 220},
+    {10, 222}, {10, 224}, {10, 226}, {10, 228}, {10, 230}, {10, 232},
+    {10, 234}, {10, 236}, {10, 238}, {10, 240}, {10, 242}, {10, 244},
+    {10, 246}, {10, 248}, {10, 250}, {10, 252}, {10, 254}, {10, 256},
+    {10, 258}, {10, 260}, {10, 262}, {10, 264}, {10, 266}, {10, 268},
+    {10, 270}, {10, 272}, {10, 274}, {10, 276}, {10, 278}, {10, 280},
+    {10, 282}, {10, 284}, {10, 286}, {10, 288}, {10, 290}, {10, 292},
+    {10, 294}, {10, 296}, {10, 298}, {10, 300}, {10, 302}, {10, 304},
+    {10, 306}, {10, 308}, {10, 310}, {10, 312}, {10, 314}, {10, 316},
+    {10, 318}, {10, 320}, {10, 322}, {10, 324}, {10, 326}, {10, 328},
+    {10, 330}, {10, 332}, {10, 334}, {10, 336}, {10, 338}, {10, 340},
+    {10, 342}, {10, 344}, {10, 346}, {10, 348}, {10, 350}, {10, 352},
+    {10, 354}, {10, 356}, {10, 358}, {10, 360}, {10, 362}, {10, 364},
+    {10, 366}, {10, 368}, {10, 370}, {10, 372}, {10, 374}, {10, 376},
+    {10, 378}, {10, 380}, {10, 382}, {10, 384}, {10, 386}, {10, 388},
+    {10, 390}, {10, 392}, {10, 394}, {10, 396}, {10, 398}, {10, 400},
+    {10, 402}, {10, 404}, {10, 406}, {10, 408}, {10, 410}, {10, 412},
+    {10, 414}, {10, 416}, {10, 418}, {10, 420}, {10, 422}, {10, 424},
+    {10, 426}, {10, 428}, {10, 430}, {10, 432}, {10, 434}, {10, 436},
+    {10, 438}, {10, 440}, {10, 442}, {10, 444}, {10, 446}, {10, 448},
+    {10, 450}, {10, 452}, {10, 454}, {10, 456}, {10, 458}, {10, 460},
+    {10, 462}, {10, 464}, {10, 466}, {10, 468}, {10, 470}, {10, 472},
+    {10, 474}, {10, 476}, {10, 478}, {10, 480}, {10, 482}, {10, 484},
+    {10, 486}, {10, 488}, {10, 490}, {10, 492}, {10, 494}, {10, 496},
+    {10, 498}, {10, 500}, {10, 502}, {10, 504}, {10, 506}, {10, 508},
+    {10, 510}, {10, 512}, {10, 514}, {10, 516}, {10, 518}, {10, 520},
+    {10, 522}, {10, 524}, {10, 526}, {10, 528}, {10, 530}, {10, 532},
+    {10, 534}, {10, 536}, {10, 538}, {10, 540}, {10, 542}, {10, 544},
+    {10, 546}, {10, 548}, {10, 550}, {10, 552}, {10, 554}, {10, 556},
+    {10, 558}, {10, 560}, {10, 562}, {10, 564}, {10, 566}, {10, 568},
+    {10, 570}, {10, 572}, {10, 574}, {10, 576}, {10, 578}, {10, 580},
+    {10, 582}, {10, 584}, {10, 586}, {10, 588}, {10, 590}, {10, 592},
+    {10, 594}, {10, 596}, {10, 598}, {10, 600}, {10, 602}, {10, 604},
+    {10, 606}, {10, 608}, {10, 610}, {10, 612}, {10, 614}, {10, 616},
+    {10, 618}, {10, 620}, {10, 622}, {10, 624}, {10, 626}, {10, 628},
+    {10, 630}, {10, 632}, {10, 634}, {10, 636}, {10, 638}, {10, 640},
+    {10, 642}, {10, 644}, {10, 646}, {10, 648}, {10, 650}, {10, 652},
+    {10, 654}, {10, 656}, {10, 658}, {10, 660}, {10, 662}, {10, 664},
+    {10, 666}, {10, 668}, {10, 670}, {10, 672}, {10, 674}, {10, 676},
+    {10, 678}, {10, 680}, {10, 682}, {10, 684}, {10, 686}, {10, 688},
+    {10, 690}, {10, 692}, {10, 694}, {10, 696}, {10, 698}, {10, 700},
+    {10, 702}, {10, 704}, {10, 706}, {10, 708}, {10, 710}, {10, 712},
+    {10, 714}, {10, 716}, {10, 718}, {10, 720}, {10, 722}, {10, 724},
+    {10, 726}, {10, 728}, {10, 730}, {10, 732}, {10, 734}, {10, 736},
+    {10, 738}, {10, 740}, {10, 742}, {10, 744}, {10, 746}, {10, 748},
+    {10, 750}, {10, 752}, {10, 754}, {10, 756}, {10, 758}, {10, 760},
+    {10, 762}, {10, 764}, {10, 766}, {10, 768}, {10, 770}, {10, 772},
+    {10, 774}, {10, 776}, {10, 778}, {10, 780}, {10, 782}, {10, 784},
+    {10, 786}, {10, 788}, {10, 790}, {10, 792}, {10, 794}, {10, 796},
+    {10, 798}, {10, 800}, {10, 802}, {10, 804}, {10, 806}, {10, 808},
+    {10, 810}, {10, 812}, {10, 814}, {10, 816}, {10, 818}, {10, 820},
+    {10, 822}, {10, 824}, {10, 826}, {10, 828}, {10, 830}, {10, 832},
+    {10, 834}, {10, 836}, {10, 838}, {10, 840}, {10, 842}, {10, 844},
+    {10, 846}, {10, 848}, {10, 850}, {10, 852}, {10, 854}, {10, 856},
+    {10, 858}, {10, 860}, {10, 862}, {10, 864}, {10, 866}, {10, 868},
+    {10, 870}, {10, 872}, {10, 874}, {10, 876}, {10, 878}, {10, 880},
+    {10, 882}, {10, 884}, {10, 886}, {10, 888}, {10, 890}, {10, 892},
+    {10, 894}, {10, 896}, {10, 898}, {10, 900}, {10, 902}, {10, 904},
+    {10, 906}, {10, 908}, {10, 910}, {10, 912}, {10, 914}, {10, 916},
+    {10, 918}, {10, 920}, {10, 922}, {10, 924}, {10, 926}, {10, 928},
+    {10, 930}, {10, 932}, {10, 934}, {10, 936}, {10, 938}, {10, 940},
+    {10, 942}, {10, 944}, {10, 946}, {10, 948}, {10, 950}, {10, 952},
+    {10, 954}, {10, 956}, {10, 958}, {10, 960}, {10, 962}, {10, 964},
+    {10, 966}, {10, 968}, {10, 970}, {10, 972}, {10, 974}, {10, 976},
+    {10, 978}, {10, 980}, {10, 982}, {10, 984}, {10, 986}, {10, 988},
+    {10, 990}, {10, 992}, {10, 994}, {10, 996}, {10, 998}, {10, 1000},
+    {10, 1002}, {10, 1004}, {10, 1006}, {10, 1008}, {10, 1010}, {10, 1012},
+    {10, 1014}, {10, 1016}, {10, 1018}, {10, 1020}, {10, 1022}, {10, 1024},
+    {10, 1026}, {10, 1028}, {10, 1030}, {10, 1032}, {10, 1034}, {10, 1036},
+    {10, 1038}, {10, 1040}, {10, 1042}, {10, 1044}, {10, 1046}, {10, 1048},
+    {10, 1050}, {10, 1052}, {10, 1054}, {10, 1056}, {10, 1058}, {10, 1060},
+    {10, 1062}, {10, 1064}, {10, 1066}, {10, 1068}, {10, 1070}, {10, 1072},
+    {10, 1074}, {10, 1076}, {10, 1078}, {10, 1080}, {10, 1082}, {10, 1084},
+    {10, 1086}, {10, 1088}, {10, 1090}, {10, 1092}, {10, 1094}, {10, 1096},
+    {10, 1098}, {10, 1100}, {10, 1102}, {10, 1104}, {10, 1106}, {10, 1108},
+    {10, 1110}, {10, 1112}, {10, 1114}, {10, 1116}, {10, 1118}, {10, 1120},
+    {10, 1122}, {10, 1124}, {10, 1126}, {10, 1128}, {10, 1130}, {10, 1132},
+    {10, 1134}, {10, 1136}, {10, 1138}, {10, 1140}, {10, 1142}, {10, 1144},
+    {10, 1146}, {10, 1148}, {10, 1150}, {10, 1152}, {10, 1154}, {10, 1156},
+    {10, 1158}, {10, 1160}, {10, 1162}, {10, 1164}, {10, 1166}, {10, 1168},
+    {10, 1170}, {10, 1172}, {10, 1174}, {10, 1176}, {10, 1178}, {10, 1180},
+    {10, 1182}, {10, 1184}, {10, 1186}, {10, 1188}, {10, 1190}, {10, 1192},
+    {10, 1194}, {10, 1196}, {10, 1198}, {10, 1200}, {10, 1202}, {10, 1204},
+    {10, 1206}, {10, 1208}, {10, 1210}, {10, 1212}, {10, 1214}, {10, 1216},
+    {10, 1218}, {10, 1220}, {10, 1222}, {10, 1224}, {10, 1226}, {10, 1228},
+    {10, 1230}, {10, 1232}, {10, 1234}, {10, 1236}, {10, 1238}, {10, 1240},
+    {10, 1242}, {10, 1244}, {10, 1246}, {10, 1248}, {10, 1250}, {10, 1252},
+    {10, 1254}, {10, 1256}, {10, 1258}, {10, 1260}, {10, 1262}, {10, 1264},
+    {10, 1266}, {10, 1268}, {10, 1270}, {10, 1272}, {10, 1274}, {10, 1276},
+    {10, 1278}, {10, 1280}, {10, 1282}, {10, 1284}, {10, 1286}, {10, 1288},
+    {10, 1290}, {10, 1292}, {10, 1294}, {10, 1296}, {10, 1298}, {10, 1300},
+    {10, 1302}, {10, 1304}, {10, 1306}, {10, 1308}, {10, 1310}, {10, 1312},
+    {10, 1314}, {10, 1316}, {10, 1318}, {10, 1320}, {10, 1322}, {10, 1324},
+    {10, 1326}, {10, 1328}, {10, 1330}, {10, 1332}, {10, 1334}, {10, 1336},
+    {10, 1338}, {10, 1340}, {10, 1342}, {10, 1344}, {10, 1346}, {10, 1348},
+    {10, 1350}, {10, 1352}, {10, 1354}, {10, 1356}, {10, 1358}, {10, 1360},
+    {10, 1362}, {10, 1364}, {10, 1366}, {10, 1368}, {10, 1370}, {10, 1372},
+    {10, 1374}, {10, 1376}, {10, 1378}, {10, 1380}, {10, 1382}, {10, 1384},
+    {10, 1386}, {10, 1388}, {10, 1390}, {10, 1392}, {10, 1394}, {10, 1396},
+    {10, 1398}, {10, 1400}, {10, 1402}, {10, 1404}, {10, 1406}, {10, 1408},
+    {10, 1410}, {10, 1412}, {10, 1414}, {10, 1416}, {10, 1418}, {10, 1420},
+    {10, 1422}, {10, 1424}, {10, 1426}, {10, 1428}, {10, 1430}, {10, 1432},
+    {10, 1434}, {10, 1436}, {10, 1438}, {10, 1440}, {10, 1442}, {10, 1444},
+    {10, 1446}, {10, 1448}, {10, 1450}, {10, 1452}, {10, 1454}, {10, 1456},
+    {10, 1458}, {10, 1460}, {10, 1462}, {10, 1464}, {10, 1466}, {10, 1468},
+    {10, 1470}, {10, 1472}, {10, 1474}, {10, 1476}, {10, 1478}, {10, 1480},
+    {10, 1482}, {10, 1484}, {10, 1486}, {10, 1488}, {10, 1490}, {10, 1492},
+    {10, 1494}, {10, 1496}, {10, 1498}, {10, 1500}, {10, 1502}, {10, 1504},
+    {10, 1506}, {10, 1508}, {10, 1510}, {10, 1512}, {10, 1514}, {10, 1516},
+    {10, 1518}, {10, 1520}, {10, 1522}, {10, 1524}, {10, 1526}, {10, 1528},
+    {10, 1530}, {10, 1532}, {10, 1534}, {10, 1536}, {10, 1538}, {10, 1540},
+    {10, 1542}, {10, 1544}, {10, 1546}, {10, 1548}, {10, 1550}, {10, 1552},
+    {10, 1554}, {10, 1556}, {10, 1558}, {10, 1560}, {10, 1562}, {10, 1564},
+    {10, 1566}, {10, 1568}, {10, 1570}, {10, 1572}, {10, 1574}, {10, 1576},
+    {10, 1578}, {10, 1580}, {10, 1582}, {10, 1584}, {10, 1586}, {10, 1588},
+    {10, 1590}, {10, 1592}, {10, 1594}, {10, 1596}, {10, 1598}, {10, 1600},
+    {10, 1602}, {10, 1604}, {10, 1606}, {10, 1608}, {10, 1610}, {10, 1612},
+    {10, 1614}, {10, 1616}, {10, 1618}, {10, 1620}, {10, 1622}, {10, 1624},
+    {10, 1626}, {10, 1628}, {10, 1630}, {10, 1632}, {10, 1634}, {10, 1636},
+    {10, 1638}, {10, 1640}, {10, 1642}, {10, 1644}, {10, 1646}, {10, 1648},
+    {10, 1650}, {10, 1652}, {10, 1654}, {10, 1656}, {10, 1658}, {10, 1660},
+    {10, 1662}, {10, 1664}, {10, 1666}, {10, 1668}, {10, 1670}, {10, 1672},
+    {10, 1674}, {10, 1676}, {10, 1678}, {10, 1680}, {10, 1682}, {10, 1684},
+    {10, 1686}, {10, 1688}, {10, 1690}, {10, 1692}, {10, 1694}, {10, 1696},
+    {10, 1698}, {10, 1700}, {10, 1702}, {10, 1704}, {10, 1706}, {10, 1708},
+    {10, 1710}, {10, 1712}, {10, 1714}, {10, 1716}, {10, 1718}, {10, 1720},
+    {10, 1722}, {10, 1724}, {10, 1726}, {10, 1728}, {10, 1730}, {10, 1732},
+    {10, 1734}, {10, 1736}, {10, 1738}, {10, 1740}, {10, 1742}, {10, 1744},
+    {10, 1746}, {10, 1748}, {10, 1750}, {10, 1752}, {10, 1754}, {10, 1756},
+    {10, 1758}, {10, 1760}, {10, 1762}, {10, 1764}, {10, 1766}, {10, 1768},
+    {10, 1770}, {10, 1772}, {10, 1774}, {10, 1776}, {10, 1778}, {10, 1780},
+    {10, 1782}, {10, 1784}, {10, 1786}, {10, 1788}, {10, 1790}, {10, 1792},
+    {10, 1794}, {10, 1796}, {10, 1798}, {10, 1800}, {10, 1802}, {10, 1804},
+    {10, 1806}, {10, 1808}, {10, 1810}, {10, 1812}, {10, 1814}, {10, 1816},
+    {10, 1818}, {10, 1820}, {10, 1822}, {10, 1824}, {10, 1826}, {10, 1828},
+    {10, 1830}, {10, 1832}, {10, 1834}, {10, 1836}, {10, 1838}, {10, 1840},
+    {10, 1842}, {10, 1844}, {10, 1846}, {10, 1848}, {10, 1850}, {10, 1852},
+    {10, 1854}, {10, 1856}, {10, 1858}, {10, 1860}, {10, 1862}, {10, 1864},
+    {10, 1866}, {10, 1868}, {10, 1870}, {10, 1872}, {10, 1874}, {10, 1876},
+    {10, 1878}, {10, 1880}, {10, 1882}, {10, 1884}, {10, 1886}, {10, 1888},
+    {10, 1890}, {10, 1892}, {10, 1894}, {10, 1896}, {10, 1898}, {10, 1900},
+    {10, 1902}, {10, 1904}, {10, 1906}, {10, 1908}, {10, 1910}, {10, 1912},
+    {10, 1914}, {10, 1916}, {10, 1918}, {10, 1920}, {10, 1922}, {10, 1924},
+    {10, 1926}, {10, 1928}, {10, 1930}, {10, 1932}, {10, 1934}, {10, 1936},
+    {10, 1938}, {10, 1940}, {10, 1942}, {10, 1944}, {10, 1946}, {10, 1948},
+    {10, 1950}, {10, 1952}, {10, 1954}, {10, 1956}, {10, 1958}, {10, 1960},
+    {10, 1962}, {10, 1964}, {10, 1966}, {10, 1968}, {10, 1970}, {10, 1972},
+    {10, 1974}, {10, 1976}, {10, 1978}, {10, 1980}, {10, 1982}, {10, 1984},
+    {10, 1986}, {10, 1988}, {10, 1990}, {10, 1992}, {10, 1994}, {10, 1996},
+    {10, 1998}, {10, 2000}, {10, 2002}, {10, 2004}, {10, 2006}, {10, 2008},
+    {10, 2010}, {10, 2012}, {10, 2014}, {10, 2016}, {10, 2018}, {10, 2020},
+    {10, 2022}, {10, 2024}, {10, 2026}, {10, 2028}, {10, 2030}, {10, 2032},
+    {10, 2034}, {10, 2036}, {10, 2038}, {10, 2040}, {10, 2042}, {10, 2044},
+    {10, 2046}, {10, 2048}, {10, 2050}, {10, 2052}, {10, 2054}, {10, 2056},
+    {10, 2058}, {10, 2060}, {10, 2062}, {10, 2064}, {10, 2066}, {10, 2068},
+    {10, 2070}, {10, 2072}, {10, 2074}, {10, 2076}, {10, 2078}, {10, 2080},
+    {10, 2082}, {10, 2084}, {10, 2086}, {10, 2088}, {10, 2090}, {10, 2092},
+    {10, 2094}, {10, 2096}, {10, 2098}, {10, 2100}, {10, 2102}, {10, 2104},
+    {10, 2106}, {10, 2108}, {10, 2110}, {10, 2112}, {10, 2114}, {10, 2116},
+    {10, 2118}, {10, 2120}, {10, 2122}, {10, 2124}, {10, 2126}, {10, 2128},
+    {10, 2130}, {10, 2132}, {10, 2134}, {10, 2136}, {10, 2138}, {10, 2140},
+    {10, 2142}, {10, 2144}, {10, 2146}, {10, 2148}, {10, 2150}, {10, 2152},
+    {10, 2154}, {10, 2156}, {10, 2158}, {10, 2160}, {10, 2162}, {10, 2164},
+    {10, 2166}, {10, 2168}, {10, 2170}, {10, 2172}, {10, 2174}, {10, 2176},
+    {10, 2178}, {10, 2180}, {10, 2182}, {10, 2184}, {10, 2186}, {10, 2188},
+    {10, 2190}, {10, 2192}, {10, 2194}, {10, 2196}, {10, 2198}, {10, 2200},
+    {10, 2202}, {10, 2204}, {10, 2206}, {10, 2208}, {10, 2210}, {10, 2212},
+    {10, 2214}, {10, 2216}, {10, 2218}, {10, 2220}, {10, 2222}, {10, 2224},
+    {10, 2226}, {10, 2228}, {10, 2230}, {10, 2232}, {10, 2234}, {10, 2236},
+    {10, 2238}, {10, 2240}, {10, 2242}, {10, 2244}, {10, 2246}, {10, 2248},
+    {10, 2250}, {10, 2252}, {10, 2254}, {10, 2256}, {10, 2258}, {10, 2260},
+    {10, 2262}, {10, 2264}, {10, 2266}, {10, 2268}, {10, 2270}, {10, 2272},
+    {10, 2274}, {10, 2276}, {10, 2278}, {10, 2280}, {10, 2282}, {10, 2284},
+    {10, 2286}, {10, 2288}, {10, 2290}, {10, 2292}, {10, 2294}, {10, 2296},
+    {10, 2298}, {10, 2300}, {10, 2302}, {10, 2304}, {10, 2306}, {10, 2308},
+    {10, 2310}, {10, 2312}, {10, 2314}, {10, 2316}, {10, 2318}, {10, 2320},
+    {10, 2322}, {10, 2324}, {10, 2326}, {10, 2328}, {10, 2330}, {10, 2332},
+    {10, 2334}, {10, 2336}, {10, 2338}, {10, 2340}, {10, 2342}, {10, 2344},
+    {10, 2346}, {10, 2348}, {10, 2350}, {10, 2352}, {10, 2354}, {10, 2356},
+    {10, 2358}, {10, 2360}, {10, 2362}, {10, 2364}, {10, 2366}, {10, 2368},
+    {10, 2370}, {10, 2372}, {10, 2374}, {10, 2376}, {10, 2378}, {10, 2380},
+    {10, 2382}, {10, 2384}, {10, 2386}, {10, 2388}, {10, 2390}, {10, 2392},
+    {10, 2394}, {10, 2396}, {10, 2398}, {10, 2400}, {10, 2402}, {10, 2404},
+    {10, 2406}, {10, 2408}, {10, 2410}, {10, 2412}, {10, 2414}, {10, 2416},
+    {10, 2418}, {10, 2420}, {10, 2422}, {10, 2424}, {10, 2426}, {10, 2428},
+    {10, 2430}, {10, 2432}, {10, 2434}, {10, 2436}, {10, 2438}, {10, 2440},
+    {10, 2442}, {10, 2444}, {10, 2446}, {10, 2448}, {10, 2450}, {10, 2452},
+    {10, 2454}, {10, 2456}, {10, 2458}, {10, 2460}, {10, 2462}, {10, 2464},
+    {10, 2466}, {10, 2468}, {10, 2470}, {10, 2472}, {10, 2474}, {10, 2476},
+    {10, 2478}, {10, 2480}, {10, 2482}, {10, 2484}, {10, 2486}, {10, 2488},
+    {10, 2490}, {10, 2492}, {10, 2494}, {10, 2496}, {10, 2498}, {10, 2500},
+    {10, 2502}, {10, 2504}, {10, 2506}, {10, 2508}, {10, 2510}, {10, 2512},
+    {10, 2514}, {10, 2516}, {10, 2518}, {10, 2520}, {10, 2522}, {10, 2524},
+    {10, 2526}, {10, 2528}, {10, 2530}, {10, 2532}, {10, 2534}, {10, 2536},
+    {10, 2538}, {10, 2540}, {10, 2542}, {10, 2544}, {10, 2546}, {10, 2548},
+    {10, 2550}, {10, 2552}, {10, 2554}, {10, 2556}, {10, 2558}, {10, 2560},
+    {10, 2562}, {10, 2564}, {10, 2566}, {10, 2568}, {10, 2570}, {10, 2572},
+    {10, 2574}, {10, 2576}, {10, 2578}, {10, 2580}, {10, 2582}, {10, 2584},
+    {10, 2586}, {10, 2588}, {10, 2590}, {10, 2592}, {10, 2594}, {10, 2596},
+    {10, 2598}, {10, 2600}, {10, 2602}, {10, 2604}, {10, 2606}, {10, 2608},
+    {10, 2610}, {10, 2612}, {10, 2614}, {10, 2616}, {10, 2618}, {10, 2620},
+    {10, 2622}, {10, 2624}, {10, 2626}, {10, 2628}, {10, 2630}, {10, 2632},
+    {10, 2634}, {10, 2636}, {10, 2638}, {10, 2640}, {10, 2642}, {10, 2644},
+    {10, 2646}, {10, 2648}, {10, 2650}, {10, 2652}, {10, 2654}, {10, 2656},
+    {10, 2658}, {10, 2660}, {10, 2662}, {10, 2664}, {10, 2666}, {10, 2668},
+    {10, 2670}, {10, 2672}, {10, 2674}, {10, 2676}, {10, 2678}, {10, 2680},
+    {10, 2682}, {10, 2684}, {10, 2686}, {10, 2688}, {10, 2690}, {10, 2692},
+    {10, 2694}, {10, 2696}, {10, 2698}, {10, 2700}, {10, 2702}, {10, 2704},
+    {10, 2706}, {10, 2708}, {10, 2710}, {10, 2712}, {10, 2714}, {10, 2716},
+    {10, 2718}, {10, 2720}, {10, 2722}, {10, 2724}, {10, 2726}, {10, 2728},
+    {10, 2730}, {10, 2732}, {10, 2734}, {10, 2736}, {10, 2738}, {10, 2740},
+    {10, 2742}, {10, 2744}, {10, 2746}, {10, 2748}, {10, 2750}, {10, 2752},
+    {10, 2754}, {10, 2756}, {10, 2758}, {10, 2760}, {10, 2762}, {10, 2764},
+    {10, 2766}, {10, 2768}, {10, 2770}, {10, 2772}, {10, 2774}, {10, 2776},
+    {10, 2778}, {10, 2780}, {10, 2782}, {10, 2784}, {10, 2786}, {10, 2788},
+    {10, 2790}, {10, 2792}, {10, 2794}, {10, 2796}, {10, 2798}, {10, 2800},
+    {10, 2802}, {10, 2804}, {10, 2806}, {10, 2808}, {10, 2810}, {10, 2812},
+    {10, 2814}, {10, 2816}, {10, 2818}, {10, 2820}, {10, 2822}, {10, 2824},
+    {10, 2826}, {10, 2828}, {10, 2830}, {10, 2832}, {10, 2834}, {10, 2836},
+    {10, 2838}, {10, 2840}, {10, 2842}, {10, 2844}, {10, 2846}, {10, 2848},
+    {10, 2850}, {10, 2852}, {10, 2854}, {10, 2856}, {10, 2858}, {10, 2860},
+    {10, 2862}, {10, 2864}, {10, 2866}, {10, 2868}, {10, 2870}, {10, 2872},
+    {10, 2874}, {10, 2876}, {10, 2878}, {10, 2880}, {10, 2882}, {10, 2884},
+    {10, 2886}, {10, 2888}, {10, 2890}, {10, 2892}, {10, 2894}, {10, 2896},
+    {10, 2898}, {10, 2900}, {10, 2902}, {10, 2904}, {10, 2906}, {10, 2908},
+    {10, 2910}, {10, 2912}, {10, 2914}, {10, 2916}, {10, 2918}, {10, 2920},
+    {10, 2922}, {10, 2924}, {10, 2926}, {10, 2928}, {10, 2930}, {10, 2932},
+    {10, 2934}, {10, 2936}, {10, 2938}, {10, 2940}, {10, 2942}, {10, 2944},
+    {10, 2946}, {10, 2948}, {10, 2950}, {10, 2952}, {10, 2954}, {10, 2956},
+    {10, 2958}, {10, 2960}, {10, 2962}, {10, 2964}, {10, 2966}, {10, 2968},
+    {10, 2970}, {10, 2972}, {10, 2974}, {10, 2976}, {10, 2978}, {10, 2980},
+    {10, 2982}, {10, 2984}, {10, 2986}, {10, 2988}, {10, 2990}, {10, 2992},
+    {10, 2994}, {10, 2996}, {10, 2998}, {10, 3000}, {10, 3002}, {10, 3004},
+    {10, 3006}, {10, 3008}, {10, 3010}, {10, 3012}, {10, 3014}, {10, 3016},
+    {10, 3018}, {10, 3020}, {10, 3022}, {10, 3024}, {10, 3026}, {10, 3028},
+    {10, 3030}, {10, 3032}, {10, 3034}, {10, 3036}, {10, 3038}, {10, 3040},
+    {10, 3042}, {10, 3044}, {10, 3046}, {10, 3048}, {10, 3050}, {10, 3052},
+    {10, 3054}, {10, 3056}, {10, 3058}, {10, 3060}, {10, 3062}, {10, 3064},
+    {10, 3066}, {10, 3068}, {10, 3070}, {10, 3072}, {10, 3074}, {10, 3076},
+    {10, 3078}, {10, 3080}, {10, 3082}, {10, 3084}, {10, 3086}, {10, 3088},
+    {10, 3090}, {10, 3092}, {10, 3094}, {10, 3096}, {10, 3098}, {10, 3100},
+    {10, 3102}, {10, 3104}, {10, 3106}, {10, 3108}, {10, 3110}, {10, 3112},
+    {10, 3114}, {10, 3116}, {10, 3118}, {10, 3120}, {10, 3122}, {10, 3124},
+    {10, 3126}, {10, 3128}, {10, 3130}, {10, 3132}, {10, 3134}, {10, 3136},
+    {10, 3138}, {10, 3140}, {10, 3142}, {10, 3144}, {10, 3146}, {10, 3148},
+    {10, 3150}, {10, 3152}, {10, 3154}, {10, 3156}, {10, 3158}, {10, 3160},
+    {10, 3162}, {10, 3164}, {10, 3166}, {10, 3168}, {10, 3170}, {10, 3172},
+    {10, 3174}, {10, 3176}, {10, 3178}, {10, 3180}, {10, 3182}, {10, 3184},
+    {10, 3186}, {10, 3188}, {10, 3190}, {10, 3192}, {10, 3194}, {10, 3196},
+    {10, 3198}, {10, 3200}, {10, 3202}, {10, 3204}, {10, 3206}, {10, 3208},
+    {10, 3210}, {10, 3212}, {10, 3214}, {10, 3216}, {10, 3218}, {10, 3220},
+    {10, 3222}, {10, 3224}, {10, 3226}, {10, 3228}, {10, 3230}, {10, 3232},
+    {10, 3234}, {10, 3236}, {10, 3238}, {10, 3240}, {10, 3242}, {10, 3244},
+    {10, 3246}, {10, 3248}, {10, 3250}, {10, 3252}, {10, 3254}, {10, 3256},
+    {10, 3258}, {10, 3260}, {10, 3262}, {10, 3264}, {10, 3266}, {10, 3268},
+    {10, 3270}, {10, 3272}, {10, 3274}, {10, 3276}, {10, 3278}, {10, 3280},
+    {10, 3282}, {10, 3284}, {10, 3286}, {10, 3288}, {10, 3290}, {10, 3292},
+    {10, 3294}, {10, 3296}, {10, 3298}, {10, 3300}, {10, 3302}, {10, 3304},
+    {10, 3306}, {10, 3308}, {10, 3310}, {10, 3312}, {10, 3314}, {10, 3316},
+    {10, 3318}, {10, 3320}, {10, 3322}, {10, 3324}, {10, 3326}, {10, 3328},
+    {10, 3330}, {10, 3332}, {10, 3334}, {10, 3336}, {10, 3338}, {10, 3340},
+    {10, 3342}, {10, 3344}, {10, 3346}, {10, 3348}, {10, 3350}, {10, 3352},
+    {10, 3354}, {10, 3356}, {10, 3358}, {10, 3360}, {10, 3362}, {10, 3364},
+    {10, 3366}, {10, 3368}, {10, 3370}, {10, 3372}, {10, 3374}, {10, 3376},
+    {10, 3378}, {10, 3380}, {10, 3382}, {10, 3384}, {10, 3386}, {10, 3388},
+    {10, 3390}, {10, 3392}, {10, 3394}, {10, 3396}, {10, 3398}, {10, 3400},
+    {10, 3402}, {10, 3404}, {10, 3406}, {10, 3408}, {10, 3410}, {10, 3412},
+    {10, 3414}, {10, 3416}, {10, 3418}, {10, 3420}, {10, 3422}, {10, 3424},
+    {10, 3426}, {10, 3428}, {10, 3430}, {10, 3432}, {10, 3434}, {10, 3436},
+    {10, 3438}, {10, 3440}, {10, 3442}, {10, 3444}, {10, 3446}, {10, 3448},
+    {10, 3450}, {10, 3452}, {10, 3454}, {10, 3456}, {10, 3458}, {10, 3460},
+    {10, 3462}, {10, 3464}, {10, 3466}, {10, 3468}, {10, 3470}, {10, 3472},
+    {10, 3474}, {10, 3476}, {10, 3478}, {10, 3480}, {10, 3482}, {10, 3484},
+    {10, 3486}, {10, 3488}, {10, 3490}, {10, 3492}, {10, 3494}, {10, 3496},
+    {10, 3498}, {10, 3500}, {10, 3502}, {10, 3504}, {10, 3506}, {10, 3508},
+    {10, 3510}, {10, 3512}, {10, 3514}, {10, 3516}, {10, 3518}, {10, 3520},
+    {10, 3522}, {10, 3524}, {10, 3526}, {10, 3528}, {10, 3530}, {10, 3532},
+    {10, 3534}, {10, 3536}, {10, 3538}, {10, 3540}, {10, 3542}, {10, 3544},
+    {10, 3546}, {10, 3548}, {10, 3550}, {10, 3552}, {10, 3554}, {10, 3556},
+    {10, 3558}, {10, 3560}, {10, 3562}, {10, 3564}, {10, 3566}, {10, 3568},
+    {10, 3570}, {10, 3572}, {10, 3574}, {10, 3576}, {10, 3578}, {10, 3580},
+    {10, 3582}, {10, 3584}, {10, 3586}, {10, 3588}, {10, 3590}, {10, 3592},
+    {10, 3594}, {10, 3596}, {10, 3598}, {10, 3600}, {10, 3602}, {10, 3604},
+    {10, 3606}, {10, 3608}, {10, 3610}, {10, 3612}, {10, 3614}, {10, 3616},
+    {10, 3618}, {10, 3620}, {10, 3622}, {10, 3624}, {10, 3626}, {10, 3628},
+    {10, 3630}, {10, 3632}, {10, 3634}, {10, 3636}, {10, 3638}, {10, 3640},
+    {10, 3642}, {10, 3644}, {10, 3646}, {10, 3648}, {10, 3650}, {10, 3652},
+    {10, 3654}, {10, 3656}, {10, 3658}, {10, 3660}, {10, 3662}, {10, 3664},
+    {10, 3666}, {10, 3668}, {10, 3670}, {10, 3672}, {10, 3674}, {10, 3676},
+    {10, 3678}, {10, 3680}, {10, 3682}, {10, 3684}, {10, 3686}, {10, 3688},
+    {10, 3690}, {10, 3692}, {10, 3694}, {10, 3696}, {10, 3698}, {10, 3700},
+    {10, 3702}, {10, 3704}, {10, 3706}, {10, 3708}, {10, 3710}, {10, 3712},
+    {10, 3714}, {10, 3716}, {10, 3718}, {10, 3720}, {10, 3722}, {10, 3724},
+    {10, 3726}, {10, 3728}, {10, 3730}, {10, 3732}, {10, 3734}, {10, 3736},
+    {10, 3738}, {10, 3740}, {10, 3742}, {10, 3744}, {10, 3746}, {10, 3748},
+    {10, 3750}, {10, 3752}, {10, 3754}, {10, 3756}, {10, 3758}, {10, 3760},
+    {10, 3762}, {10, 3764}, {10, 3766}, {10, 3768}, {10, 3770}, {10, 3772},
+    {10, 3774}, {10, 3776}, {10, 3778}, {10, 3780}, {10, 3782}, {10, 3784},
+    {10, 3786}, {10, 3788}, {10, 3790}, {10, 3792}, {10, 3794}, {10, 3796},
+    {10, 3798}, {10, 3800}, {10, 3802}, {10, 3804}, {10, 3806}, {10, 3808},
+    {10, 3810}, {10, 3812}, {10, 3814}, {10, 3816}, {10, 3818}, {10, 3820},
+    {10, 3822}, {10, 3824}, {10, 3826}, {10, 3828}, {10, 3830}, {10, 3832},
+    {10, 3834}, {10, 3836}, {10, 3838}, {10, 3840}, {10, 3842}, {10, 3844},
+    {10, 3846}, {10, 3848}, {10, 3850}, {10, 3852}, {10, 3854}, {10, 3856},
+    {10, 3858}, {10, 3860}, {10, 3862}, {10, 3864}, {10, 3866}, {10, 3868},
+    {10, 3870}, {10, 3872}, {10, 3874}, {10, 3876}, {10, 3878}, {10, 3880},
+    {10, 3882}, {10, 3884}, {10, 3886}, {10, 3888}, {10, 3890}, {10, 3892},
+    {10, 3894}, {10, 3896}, {10, 3898}, {10, 3900}, {10, 3902}, {10, 3904},
+    {10, 3906}, {10, 3908}, {10, 3910}, {10, 3912}, {10, 3914}, {10, 3916},
+    {10, 3918}, {10, 3920}, {10, 3922}, {10, 3924}, {10, 3926}, {10, 3928},
+    {10, 3930}, {10, 3932}, {10, 3934}, {10, 3936}, {10, 3938}, {10, 3940},
+    {10, 3942}, {10, 3944}, {10, 3946}, {10, 3948}, {10, 3950}, {10, 3952},
+    {10, 3954}, {10, 3956}, {10, 3958}, {10, 3960}
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_DCT_VALUE_TOKENS_H_
diff --git a/libs/libvpx/vp8/encoder/defaultcoefcounts.h b/libs/libvpx/vp8/encoder/defaultcoefcounts.h
new file mode 100644
index 0000000000..1e8e80484a
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/defaultcoefcounts.h
@@ -0,0 +1,236 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
+#define VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Generated file, included by entropy.c. This header includes nothing, so
+ * the four dimension macros below must already be defined by the includer. */
+static const unsigned int default_coef_counts[BLOCK_TYPES]
+                                             [COEF_BANDS]
+                                             [PREV_COEF_CONTEXTS]
+                                             [MAX_ENTROPY_TOKENS] =
+{
+    /* As initialized: 4 block types x 8 coeff bands x 3 rows x 12 values. */
+    {
+        /* Block Type ( 0 ) */
+        {
+            /* Coeff Band ( 0 ) */
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+        },
+        {
+            /* Coeff Band ( 1 ) */
+            {30190, 26544, 225,  24,   4,   0,   0,   0,   0,   0,   0, 4171593,},
+            {26846, 25157, 1241, 130,  26,   6,   1,   0,   0,   0,   0, 149987,},
+            {10484, 9538, 1006, 160,  36,  18,   0,   0,   0,   0,   0, 15104,},
+        },
+        {
+            /* Coeff Band ( 2 ) */
+            {25842, 40456, 1126,  83,  11,   2,   0,   0,   0,   0,   0,   0,},
+            {9338, 8010, 512,  73,   7,   3,   2,   0,   0,   0,   0, 43294,},
+            {1047, 751, 149,  31,  13,   6,   1,   0,   0,   0,   0, 879,},
+        },
+        {
+            /* Coeff Band ( 3 ) */
+            {26136, 9826, 252,  13,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {8134, 5574, 191,  14,   2,   0,   0,   0,   0,   0,   0, 35302,},
+            { 605, 677, 116,   9,   1,   0,   0,   0,   0,   0,   0, 611,},
+        },
+        {
+            /* Coeff Band ( 4 ) */
+            {10263, 15463, 283,  17,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {2773, 2191, 128,   9,   2,   2,   0,   0,   0,   0,   0, 10073,},
+            { 134, 125,  32,   4,   0,   2,   0,   0,   0,   0,   0,  50,},
+        },
+        {
+            /* Coeff Band ( 5 ) */
+            {10483, 2663,  23,   1,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {2137, 1251,  27,   1,   1,   0,   0,   0,   0,   0,   0, 14362,},
+            { 116, 156,  14,   2,   1,   0,   0,   0,   0,   0,   0, 190,},
+        },
+        {
+            /* Coeff Band ( 6 ) */
+            {40977, 27614, 412,  28,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {6113, 5213, 261,  22,   3,   0,   0,   0,   0,   0,   0, 26164,},
+            { 382, 312,  50,  14,   2,   0,   0,   0,   0,   0,   0, 345,},
+        },
+        {
+            /* Coeff Band ( 7 ) */
+            {   0,  26,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,  13,   0,   0,   0,   0,   0,   0,   0,   0,   0, 319,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8,},
+        },
+    },
+    {
+        /* Block Type ( 1 ) */
+        {
+            /* Coeff Band ( 0 ) */
+            {3268, 19382, 1043, 250,  93,  82,  49,  26,  17,   8,  25, 82289,},
+            {8758, 32110, 5436, 1832, 827, 668, 420, 153,  24,   0,   3, 52914,},
+            {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399,  59,   0,   0, 18620,},
+        },
+        {
+            /* Coeff Band ( 1 ) */
+            {12419, 8420, 452,  62,   9,   1,   0,   0,   0,   0,   0,   0,},
+            {11715, 8705, 693,  92,  15,   7,   2,   0,   0,   0,   0, 53988,},
+            {7603, 8585, 2306, 778, 270, 145,  39,   5,   0,   0,   0, 9136,},
+        },
+        {
+            /* Coeff Band ( 2 ) */
+            {15938, 14335, 1207, 184,  55,  13,   4,   1,   0,   0,   0,   0,},
+            {7415, 6829, 1138, 244,  71,  26,   7,   0,   0,   0,   0, 9980,},
+            {1580, 1824, 655, 241,  89,  46,  10,   2,   0,   0,   0, 429,},
+        },
+        {
+            /* Coeff Band ( 3 ) */
+            {19453, 5260, 201,  19,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {9173, 3758, 213,  22,   1,   1,   0,   0,   0,   0,   0, 9820,},
+            {1689, 1277, 276,  51,  17,   4,   0,   0,   0,   0,   0, 679,},
+        },
+        {
+            /* Coeff Band ( 4 ) */
+            {12076, 10667, 620,  85,  19,   9,   5,   0,   0,   0,   0,   0,},
+            {4665, 3625, 423,  55,  19,   9,   0,   0,   0,   0,   0, 5127,},
+            { 415, 440, 143,  34,  20,   7,   2,   0,   0,   0,   0, 101,},
+        },
+        {
+            /* Coeff Band ( 5 ) */
+            {12183, 4846, 115,  11,   1,   0,   0,   0,   0,   0,   0,   0,},
+            {4226, 3149, 177,  21,   2,   0,   0,   0,   0,   0,   0, 7157,},
+            { 375, 621, 189,  51,  11,   4,   1,   0,   0,   0,   0, 198,},
+        },
+        {
+            /* Coeff Band ( 6 ) */
+            {61658, 37743, 1203,  94,  10,   3,   0,   0,   0,   0,   0,   0,},
+            {15514, 11563, 903, 111,  14,   5,   0,   0,   0,   0,   0, 25195,},
+            { 929, 1077, 291,  78,  14,   7,   1,   0,   0,   0,   0, 507,},
+        },
+        {
+            /* Coeff Band ( 7 ) */
+            {   0, 990,  15,   3,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0, 412,  13,   0,   0,   0,   0,   0,   0,   0,   0, 1641,},
+            {   0,  18,   7,   1,   0,   0,   0,   0,   0,   0,   0,  30,},
+        },
+    },
+    {
+        /* Block Type ( 2 ) */
+        {
+            /* Coeff Band ( 0 ) */
+            { 953, 24519, 628, 120,  28,  12,   4,   0,   0,   0,   0, 2248798,},
+            {1525, 25654, 2647, 617, 239, 143,  42,   5,   0,   0,   0, 66837,},
+            {1180, 11011, 3001, 1237, 532, 448, 239,  54,   5,   0,   0, 7122,},
+        },
+        {
+            /* Coeff Band ( 1 ) */
+            {1356, 2220,  67,  10,   4,   1,   0,   0,   0,   0,   0,   0,},
+            {1450, 2544, 102,  18,   4,   3,   0,   0,   0,   0,   0, 57063,},
+            {1182, 2110, 470, 130,  41,  21,   0,   0,   0,   0,   0, 6047,},
+        },
+        {
+            /* Coeff Band ( 2 ) */
+            { 370, 3378, 200,  30,   5,   4,   1,   0,   0,   0,   0,   0,},
+            { 293, 1006, 131,  29,  11,   0,   0,   0,   0,   0,   0, 5404,},
+            { 114, 387,  98,  23,   4,   8,   1,   0,   0,   0,   0, 236,},
+        },
+        {
+            /* Coeff Band ( 3 ) */
+            { 579, 194,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            { 395, 213,   5,   1,   0,   0,   0,   0,   0,   0,   0, 4157,},
+            { 119, 122,   4,   0,   0,   0,   0,   0,   0,   0,   0, 300,},
+        },
+        {
+            /* Coeff Band ( 4 ) */
+            {  38, 557,  19,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {  21, 114,  12,   1,   0,   0,   0,   0,   0,   0,   0, 427,},
+            {   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7,},
+        },
+        {
+            /* Coeff Band ( 5 ) */
+            {  52,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {  18,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0, 652,},
+            {   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  30,},
+        },
+        {
+            /* Coeff Band ( 6 ) */
+            { 640, 569,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {  25,  77,   2,   0,   0,   0,   0,   0,   0,   0,   0, 517,},
+            {   4,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,},
+        },
+        {
+            /* Coeff Band ( 7 ) */
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+        },
+    },
+    {
+        /* Block Type ( 3 ) */
+        {
+            /* Coeff Band ( 0 ) */
+            {2506, 20161, 2707, 767, 261, 178, 107,  30,  14,   3,   0, 100694,},
+            {8806, 36478, 8817, 3268, 1280, 850, 401, 114,  42,   0,   0, 58572,},
+            {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175,  32,   0,   0, 19284,},
+        },
+        {
+            /* Coeff Band ( 1 ) */
+            {9738, 11313, 959, 205,  70,  18,  11,   1,   0,   0,   0,   0,},
+            {12628, 15085, 1507, 273,  52,  19,   9,   0,   0,   0,   0, 54280,},
+            {10701, 15846, 5561, 1926, 813, 570, 249,  36,   0,   0,   0, 6460,},
+        },
+        {
+            /* Coeff Band ( 2 ) */
+            {6781, 22539, 2784, 634, 182, 123,  20,   4,   0,   0,   0,   0,},
+            {6263, 11544, 2649, 790, 259, 168,  27,   5,   0,   0,   0, 20539,},
+            {3109, 4075, 2031, 896, 457, 386, 158,  29,   0,   0,   0, 1138,},
+        },
+        {
+            /* Coeff Band ( 3 ) */
+            {11515, 4079, 465,  73,   5,  14,   2,   0,   0,   0,   0,   0,},
+            {9361, 5834, 650,  96,  24,   8,   4,   0,   0,   0,   0, 22181,},
+            {4343, 3974, 1360, 415, 132,  96,  14,   1,   0,   0,   0, 1267,},
+        },
+        {
+            /* Coeff Band ( 4 ) */
+            {4787, 9297, 823, 168,  44,  12,   4,   0,   0,   0,   0,   0,},
+            {3619, 4472, 719, 198,  60,  31,   3,   0,   0,   0,   0, 8401,},
+            {1157, 1175, 483, 182,  88,  31,   8,   0,   0,   0,   0, 268,},
+        },
+        {
+            /* Coeff Band ( 5 ) */
+            {8299, 1226,  32,   5,   1,   0,   0,   0,   0,   0,   0,   0,},
+            {3502, 1568,  57,   4,   1,   1,   0,   0,   0,   0,   0, 9811,},
+            {1055, 1070, 166,  29,   6,   1,   0,   0,   0,   0,   0, 527,},
+        },
+        {
+            /* Coeff Band ( 6 ) */
+            {27414, 27927, 1989, 347,  69,  26,   0,   0,   0,   0,   0,   0,},
+            {5876, 10074, 1574, 341,  91,  24,   4,   0,   0,   0,   0, 21954,},
+            {1571, 2171, 778, 324, 124,  65,  16,   0,   0,   0,   0, 979,},
+        },
+        {
+            /* Coeff Band ( 7 ) */
+            {   0,  29,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0, 459,},
+            {   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13,},
+        },
+    },
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
diff --git a/libs/libvpx/vp8/encoder/denoising.c b/libs/libvpx/vp8/encoder/denoising.c
new file mode 100644
index 0000000000..2a21943fe1
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/denoising.c
@@ -0,0 +1,744 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+
+#include "denoising.h"
+
+#include "vp8/common/reconinter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8_rtcd.h"
+
+static const unsigned int NOISE_MOTION_THRESHOLD = 25 * 25;
+/* SSE_DIFF_THRESHOLD is selected as ~95% confidence assuming
+ * var(noise) ~= 100.
+ */
+static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
+static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
+static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 60;
+
+/*
+ * The filter function was modified to reduce the computational complexity.
+ * Step 1:
+ * Instead of applying tap coefficients for each pixel, we calculated the
+ * pixel adjustments vs. pixel diff value ahead of time.
+ *     adjustment = filtered_value - current_raw
+ *                = (filter_coefficient * diff + 128) >> 8
+ * where
+ *     filter_coefficient = (255 << 8) / (256 + ((absdiff * 330) >> 3));
+ *     filter_coefficient += filter_coefficient /
+ *                           (3 + motion_magnitude_adjustment);
+ *     filter_coefficient is clamped to 0 ~ 255.
+ *
+ * Step 2:
+ * The adjustment vs. diff curve becomes flat very quick when diff increases.
+ * This allowed us to use only several levels to approximate the curve without
+ * changing the filtering algorithm too much.
+ * The adjustments were further corrected by checking the motion magnitude.
+ * The levels used are:
+ * diff       adjustment w/o motion correction   adjustment w/ motion correction
+ * [-255, -16]           -6                                   -7
+ * [-15, -8]             -4                                   -5
+ * [-7, -4]              -3                                   -4
+ * [-3, 3]               diff                                 diff
+ * [4, 7]                 3                                    4
+ * [8, 15]                4                                    5
+ * [16, 255]              6                                    7
+ */
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
+                          unsigned char *running_avg_y, int avg_y_stride,
+                          unsigned char *sig, int sig_stride,
+                          unsigned int motion_magnitude,
+                          int increase_denoising)
+{   /* Denoises one 16x16 block of |sig|; returns FILTER_BLOCK or COPY_BLOCK. */
+    unsigned char *running_avg_y_start = running_avg_y;
+    unsigned char *sig_start = sig;
+    int sum_diff_thresh;
+    int r, c;
+    int sum_diff = 0;
+    int adj_val[3] = {3, 4, 6};  /* adjustment for each |diff| level */
+    int shift_inc1 = 0;  /* widens the range where the prediction is kept */
+    int shift_inc2 = 1;  /* added to every adj_val level for small motion */
+    int col_sum[16] = {0, 0, 0, 0,
+                       0, 0, 0, 0,
+                       0, 0, 0, 0,
+                       0, 0, 0, 0};
+    /* If motion_magnitude is small, making the denoiser more aggressive by
+     * increasing the adjustment for each level. Add another increment for
+     * blocks that are labeled for increase denoising. */
+    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+    {
+      if (increase_denoising) {
+        shift_inc1 = 1;
+        shift_inc2 = 2;
+      }
+      adj_val[0] += shift_inc2;
+      adj_val[1] += shift_inc2;
+      adj_val[2] += shift_inc2;
+    }
+    /* Pass 1: fill running_avg_y and accumulate the adjustments per column. */
+    for (r = 0; r < 16; ++r)
+    {
+        for (c = 0; c < 16; ++c)
+        {
+            int diff = 0;
+            int adjustment = 0;
+            int absdiff = 0;
+
+            diff = mc_running_avg_y[c] - sig[c];
+            absdiff = abs(diff);
+
+            // When |diff| <= |3 + shift_inc1|, use pixel value from
+            // last denoised raw.
+            if (absdiff <= 3 + shift_inc1)
+            {
+                running_avg_y[c] = mc_running_avg_y[c];
+                col_sum[c] += diff;
+            }
+            else
+            {
+                if (absdiff >= 4 + shift_inc1 && absdiff <= 7)
+                    adjustment = adj_val[0];
+                else if (absdiff >= 8 && absdiff <= 15)
+                    adjustment = adj_val[1];
+                else
+                    adjustment = adj_val[2];
+
+                if (diff > 0)
+                {
+                    if ((sig[c] + adjustment) > 255)
+                        running_avg_y[c] = 255;
+                    else
+                        running_avg_y[c] = sig[c] + adjustment;
+
+                    col_sum[c] += adjustment;
+                }
+                else
+                {
+                    if ((sig[c] - adjustment) < 0)
+                        running_avg_y[c] = 0;
+                    else
+                        running_avg_y[c] = sig[c] - adjustment;
+
+                    col_sum[c] -= adjustment;
+                }
+            }
+        }
+
+        /* Update pointers for next iteration. */
+        sig += sig_stride;
+        mc_running_avg_y += mc_avg_y_stride;
+        running_avg_y += avg_y_stride;
+    }
+
+    for (c = 0; c < 16; ++c) {
+      // Below we clip the value in the same way which SSE code use.
+      // When adopting aggressive denoiser, the adj_val for each pixel
+      // could be at most 8 (this is current max adjustment of the map).
+      // In SSE code, we calculate the sum of adj_val for
+      // the columns, so the sum could be upto 128(16 rows). However,
+      // the range of the value is -128 ~ 127 in SSE code, that's why
+      // we do this change in C code.
+      // We don't do this for UV denoiser, since there are only 8 rows,
+      // and max adjustments <= 8, so the sum of the columns will not
+      // exceed 64.
+      if (col_sum[c] >= 128) {
+        col_sum[c] = 127;
+      }
+      sum_diff += col_sum[c];
+    }
+
+    sum_diff_thresh= SUM_DIFF_THRESHOLD;
+    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising), check
+      // if we can still apply some (weaker) temporal filtering to this block,
+      // that would otherwise not be denoised at all. Simplest is to apply
+      // an additional adjustment to running_avg_y to bring it closer to sig.
+      // The adjustment is capped by a maximum delta, and chosen such that
+      // in most cases the resulting sum_diff will be within the
+      // acceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over threshold.
+      int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        sig -= sig_stride * 16;  // rewind all three pointers to the top row
+        mc_running_avg_y -= mc_avg_y_stride * 16;
+        running_avg_y -= avg_y_stride * 16;
+        for (r = 0; r < 16; ++r) {
+          for (c = 0; c < 16; ++c) {
+            int diff = mc_running_avg_y[c] - sig[c];
+            int adjustment = abs(diff);
+            if (adjustment > delta)
+              adjustment = delta;
+            if (diff > 0) {
+              // Bring denoised signal down.
+              if (running_avg_y[c] - adjustment < 0)
+                running_avg_y[c] = 0;
+              else
+                running_avg_y[c] = running_avg_y[c] - adjustment;
+              col_sum[c] -= adjustment;
+            } else if (diff < 0) {
+              // Bring denoised signal up.
+              if (running_avg_y[c] + adjustment > 255)
+                running_avg_y[c] = 255;
+              else
+                running_avg_y[c] = running_avg_y[c] + adjustment;
+              col_sum[c] += adjustment;
+            }
+          }
+          // TODO(marpan): Check here if abs(sum_diff) has gone below the
+          // threshold sum_diff_thresh, and if so, we can exit the row loop.
+          sig += sig_stride;
+          mc_running_avg_y += mc_avg_y_stride;
+          running_avg_y += avg_y_stride;
+        }
+
+        sum_diff = 0;  // recompute from the corrected column sums
+        for (c = 0; c < 16; ++c) {
+          if (col_sum[c] >= 128) {
+            col_sum[c] = 127;
+          }
+          sum_diff += col_sum[c];
+        }
+
+        if (abs(sum_diff) > sum_diff_thresh)
+          return COPY_BLOCK;
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+    /* Block accepted: copy the filtered result over the source block. */
+    vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
+    return FILTER_BLOCK;
+}
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
+                             int mc_avg_uv_stride,
+                             unsigned char *running_avg_uv,
+                             int avg_uv_stride,
+                             unsigned char *sig,
+                             int sig_stride,
+                             unsigned int motion_magnitude,
+                             int increase_denoising) {  // 8x8 block variant
+    unsigned char *running_avg_uv_start = running_avg_uv;
+    unsigned char *sig_start = sig;
+    int sum_diff_thresh;
+    int r, c;
+    int sum_diff = 0;
+    int sum_block = 0;  // sum of all 64 source samples
+    int adj_val[3] = {3, 4, 6};  // adjustment for each |diff| level
+    int shift_inc1 = 0;
+    int shift_inc2 = 1;
+    /* If motion_magnitude is small, making the denoiser more aggressive by
+     * increasing the adjustment for each level. Add another increment for
+     * blocks that are labeled for increase denoising. */
+    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) {
+      if (increase_denoising) {
+        shift_inc1 = 1;
+        shift_inc2 = 2;
+      }
+      adj_val[0] += shift_inc2;
+      adj_val[1] += shift_inc2;
+      adj_val[2] += shift_inc2;
+    }
+
+    // Avoid denoising color signal if its close to average level.
+    for (r = 0; r < 8; ++r) {
+      for (c = 0; c < 8; ++c) {
+        sum_block += sig[c];
+      }
+      sig += sig_stride;
+    }
+    if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+      return COPY_BLOCK;  // nothing has been written at this point
+    }
+
+    sig -= sig_stride * 8;  // back to the top row for the filtering pass
+    for (r = 0; r < 8; ++r) {
+      for (c = 0; c < 8; ++c) {
+        int diff = 0;
+        int adjustment = 0;
+        int absdiff = 0;
+
+        diff = mc_running_avg_uv[c] - sig[c];
+        absdiff = abs(diff);
+
+        // When |diff| <= |3 + shift_inc1|, use pixel value from
+        // last denoised raw.
+        if (absdiff <= 3 + shift_inc1) {
+          running_avg_uv[c] = mc_running_avg_uv[c];
+          sum_diff += diff;
+        } else {
+          if (absdiff >= 4 && absdiff <= 7)
+            adjustment = adj_val[0];
+          else if (absdiff >= 8 && absdiff <= 15)
+            adjustment = adj_val[1];
+          else
+            adjustment = adj_val[2];
+          if (diff > 0) {
+            if ((sig[c] + adjustment) > 255)
+              running_avg_uv[c] = 255;
+            else
+              running_avg_uv[c] = sig[c] + adjustment;
+            sum_diff += adjustment;
+          } else {
+            if ((sig[c] - adjustment) < 0)
+              running_avg_uv[c] = 0;
+            else
+              running_avg_uv[c] = sig[c] - adjustment;
+            sum_diff -= adjustment;
+          }
+        }
+      }
+      /* Update pointers for next iteration. */
+      sig += sig_stride;
+      mc_running_avg_uv += mc_avg_uv_stride;
+      running_avg_uv += avg_uv_stride;
+    }
+
+    sum_diff_thresh= SUM_DIFF_THRESHOLD_UV;
+    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising), check
+      // if we can still apply some (weaker) temporal filtering to this block,
+      // that would otherwise not be denoised at all. Simplest is to apply
+      // an additional adjustment to running_avg_uv to bring it closer to sig.
+      // The adjustment is capped by a maximum delta, and chosen such that
+      // in most cases the resulting sum_diff will be within the
+      // acceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over threshold.
+      int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        sig -= sig_stride * 8;  // rewind all three pointers to the top row
+        mc_running_avg_uv -= mc_avg_uv_stride * 8;
+        running_avg_uv -= avg_uv_stride * 8;
+        for (r = 0; r < 8; ++r) {
+          for (c = 0; c < 8; ++c) {
+            int diff = mc_running_avg_uv[c] - sig[c];
+            int adjustment = abs(diff);
+            if (adjustment > delta)
+              adjustment = delta;
+            if (diff > 0) {
+              // Bring denoised signal down.
+              if (running_avg_uv[c] - adjustment < 0)
+                running_avg_uv[c] = 0;
+              else
+                running_avg_uv[c] = running_avg_uv[c] - adjustment;
+              sum_diff -= adjustment;
+            } else if (diff < 0) {
+              // Bring denoised signal up.
+              if (running_avg_uv[c] + adjustment > 255)
+                running_avg_uv[c] = 255;
+              else
+                running_avg_uv[c] = running_avg_uv[c] + adjustment;
+              sum_diff += adjustment;
+            }
+          }
+          // TODO(marpan): Check here if abs(sum_diff) has gone below the
+          // threshold sum_diff_thresh, and if so, we can exit the row loop.
+          sig += sig_stride;
+          mc_running_avg_uv += mc_avg_uv_stride;
+          running_avg_uv += avg_uv_stride;
+        }
+        if (abs(sum_diff) > sum_diff_thresh)
+          return COPY_BLOCK;
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+    // Block accepted: copy the filtered result over the source block.
+    vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start,
+                    sig_stride);
+    return FILTER_BLOCK;
+}
+
+/* Translates the numeric |mode| into denoiser->denoiser_mode and loads the
+ * tuning parameters that belong to the selected mode. */
+void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode) {
+  denoise_params *const pars = &denoiser->denoise_pars;
+  assert(mode > 0);  // Denoiser is allocated only if mode > 0.
+  switch (mode) {
+    case 1: denoiser->denoiser_mode = kDenoiserOnYOnly; break;
+    case 3: denoiser->denoiser_mode = kDenoiserOnYUVAggressive; break;
+    default: denoiser->denoiser_mode = kDenoiserOnYUV; break;  // incl. mode 2
+  }
+  pars->spatial_blur = 0;  // Same value for every mode.
+  if (denoiser->denoiser_mode == kDenoiserOnYUVAggressive) {
+    // Aggressive tuning.
+    pars->scale_sse_thresh = 2;
+    pars->scale_motion_thresh = 16;
+    pars->scale_increase_filter = 1;
+    pars->denoise_mv_bias = 60;
+    pars->pickmode_mv_bias = 75;
+    pars->qp_thresh = 80;
+    pars->consec_zerolast = 15;
+  } else {
+    // Normal tuning, shared by the Y-only and YUV modes.
+    pars->scale_sse_thresh = 1;
+    pars->scale_motion_thresh = 8;
+    pars->scale_increase_filter = 0;
+    pars->denoise_mv_bias = 95;
+    pars->pickmode_mv_bias = 100;
+    pars->qp_thresh = 0;
+    pars->consec_zerolast = UINT_MAX;
+  }
+}
+
+/* Allocates and zeroes the denoiser's frame buffers and per-macroblock state,
+ * then loads the parameters for |mode|. Returns 0 on success; on any
+ * allocation failure it calls vp8_denoiser_free() and returns 1. */
+int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
+                          int num_mb_rows, int num_mb_cols, int mode)
+{
+    int i;
+    assert(denoiser);
+    denoiser->num_mb_cols = num_mb_cols;
+
+    for (i = 0; i < MAX_REF_FRAMES; i++) {
+        denoiser->yv12_running_avg[i].flags = 0;
+        if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_running_avg[i]), width,
+                                        height, VP8BORDERINPIXELS) < 0) {
+            vp8_denoiser_free(denoiser);
+            return 1;
+        }
+        memset(denoiser->yv12_running_avg[i].buffer_alloc, 0,
+               denoiser->yv12_running_avg[i].frame_size);
+    }
+    denoiser->yv12_mc_running_avg.flags = 0;
+    if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_mc_running_avg), width,
+                                   height, VP8BORDERINPIXELS) < 0) {
+        vp8_denoiser_free(denoiser);
+        return 1;
+    }
+    memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
+           denoiser->yv12_mc_running_avg.frame_size);
+
+    if (vp8_yv12_alloc_frame_buffer(&denoiser->yv12_last_source, width,
+                                    height, VP8BORDERINPIXELS) < 0) {
+      vp8_denoiser_free(denoiser);
+      return 1;
+    }
+    memset(denoiser->yv12_last_source.buffer_alloc, 0,
+           denoiser->yv12_last_source.frame_size);
+
+    /* One state byte per macroblock; fail cleanly instead of writing to NULL. */
+    denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
+    if (!denoiser->denoise_state) {
+      vp8_denoiser_free(denoiser);
+      return 1;
+    }
+    memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
+    vp8_denoiser_set_parameters(denoiser, mode);
+    denoiser->nmse_source_diff = 0;
+    denoiser->nmse_source_diff_count = 0;
+    denoiser->qp_avg = 0;
+    // QP threshold below which we can go up to aggressive mode.
+    denoiser->qp_threshold_up = 80;
+    // QP threshold above which we can go back down to normal mode.
+    // For now keep this second threshold high, so not used currently.
+    denoiser->qp_threshold_down = 128;
+    // Bitrate thresholds and noise metric (nmse) thresholds for switching to
+    // aggressive mode.
+    // TODO(marpan): Adjust thresholds, including effect on resolution.
+    denoiser->bitrate_threshold = 400000;  // (bits/sec).
+    denoiser->threshold_aggressive_mode = 80;
+    if (width * height > 1280 * 720) {
+      denoiser->bitrate_threshold = 3000000;
+      denoiser->threshold_aggressive_mode = 200;
+    } else if (width * height > 960 * 540) {
+      denoiser->bitrate_threshold = 1200000;
+      denoiser->threshold_aggressive_mode = 120;
+    } else if (width * height > 640 * 480) {
+      denoiser->bitrate_threshold = 600000;
+      denoiser->threshold_aggressive_mode = 100;
+    }
+    return 0;
+}
+
+
+/* Releases the frame buffers and the per-macroblock state of |denoiser|. */
+void vp8_denoiser_free(VP8_DENOISER *denoiser)
+{
+    int i;
+    assert(denoiser);
+    for (i = 0; i < MAX_REF_FRAMES ; i++)
+        vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
+    vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
+    vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_last_source);
+    vpx_free(denoiser->denoise_state);
+    /* Reset so a later call cannot hand the same pointer to vpx_free again. */
+    denoiser->denoise_state = NULL;
+}
+
+void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
+                             MACROBLOCK *x,
+                             unsigned int best_sse,
+                             unsigned int zero_mv_sse,
+                             int recon_yoffset,
+                             int recon_uvoffset,
+                             loop_filter_info_n *lfi_n,
+                             int mb_row,
+                             int mb_col,
+                             int block_index)
+/* Denoises one macroblock; output lands in yv12_running_avg[INTRA_FRAME]. */
+{
+    int mv_row;
+    int mv_col;
+    unsigned int motion_threshold;
+    unsigned int motion_magnitude2;
+    unsigned int sse_thresh;
+    int sse_diff_thresh = 0;
+    // Spatial loop filter: only applied selectively based on
+    // temporal filter state of block relative to top/left neighbors.
+    int apply_spatial_loop_filter = 1;
+    MV_REFERENCE_FRAME frame = x->best_reference_frame;
+    MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;
+
+    enum vp8_denoiser_decision decision = FILTER_BLOCK;
+    enum vp8_denoiser_decision decision_u = COPY_BLOCK;
+    enum vp8_denoiser_decision decision_v = COPY_BLOCK;
+
+    if (zero_frame)
+    {
+        YV12_BUFFER_CONFIG *src = &denoiser->yv12_running_avg[frame];
+        YV12_BUFFER_CONFIG *dst = &denoiser->yv12_mc_running_avg;
+        YV12_BUFFER_CONFIG saved_pre,saved_dst;
+        MB_MODE_INFO saved_mbmi;
+        MACROBLOCKD *filter_xd = &x->e_mbd;
+        MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
+        int sse_diff = 0;
+        // Bias on zero motion vector sse.
+        const int zero_bias = denoiser->denoise_pars.denoise_mv_bias;
+        zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
+        sse_diff = zero_mv_sse - best_sse;
+
+        saved_mbmi = *mbmi;  // restored after the prediction is built
+
+        /* Use the best MV for the compensation. */
+        mbmi->ref_frame = x->best_reference_frame;
+        mbmi->mode = x->best_sse_inter_mode;
+        mbmi->mv = x->best_sse_mv;
+        mbmi->need_to_clamp_mvs = x->need_to_clamp_best_mvs;
+        mv_col = x->best_sse_mv.as_mv.col;
+        mv_row = x->best_sse_mv.as_mv.row;
+        // Bias to zero_mv if small amount of motion.
+        // Note sse_diff_thresh is initialized to zero, so this ensures
+        // we will always choose zero_mv for denoising if
+        // zero_mv_sse <= best_sse (i.e., sse_diff <= 0).
+        if ((unsigned int)(mv_row * mv_row + mv_col * mv_col)
+            <= NOISE_MOTION_THRESHOLD)
+            sse_diff_thresh = (int)SSE_DIFF_THRESHOLD;
+
+        if (frame == INTRA_FRAME ||
+            sse_diff <= sse_diff_thresh)
+        {
+            /*
+             * Handle intra blocks as referring to last frame with zero motion
+             * and let the absolute pixel difference affect the filter factor.
+             * Also consider small amount of motion as being random walk due
+             * to noise, if it doesn't mean that we get a much bigger error.
+             * Note that any changes to the mode info only affects the
+             * denoising.
+             */
+            x->denoise_zeromv = 1;
+            mbmi->ref_frame =
+                    x->best_zeromv_reference_frame;
+
+            src = &denoiser->yv12_running_avg[zero_frame];
+
+            mbmi->mode = ZEROMV;
+            mbmi->mv.as_int = 0;
+            x->best_sse_inter_mode = ZEROMV;
+            x->best_sse_mv.as_int = 0;
+            best_sse = zero_mv_sse;
+        }
+
+        saved_pre = filter_xd->pre;
+        saved_dst = filter_xd->dst;
+
+        /* Compensate the running average. */
+        filter_xd->pre.y_buffer = src->y_buffer + recon_yoffset;
+        filter_xd->pre.u_buffer = src->u_buffer + recon_uvoffset;
+        filter_xd->pre.v_buffer = src->v_buffer + recon_uvoffset;
+        /* Write the compensated running average to the destination buffer. */
+        filter_xd->dst.y_buffer = dst->y_buffer + recon_yoffset;
+        filter_xd->dst.u_buffer = dst->u_buffer + recon_uvoffset;
+        filter_xd->dst.v_buffer = dst->v_buffer + recon_uvoffset;
+
+        if (!x->skip)
+        {
+            vp8_build_inter_predictors_mb(filter_xd);
+        }
+        else
+        {
+            vp8_build_inter16x16_predictors_mb(filter_xd,
+                                               filter_xd->dst.y_buffer,
+                                               filter_xd->dst.u_buffer,
+                                               filter_xd->dst.v_buffer,
+                                               filter_xd->dst.y_stride,
+                                               filter_xd->dst.uv_stride);
+        }
+        filter_xd->pre = saved_pre;
+        filter_xd->dst = saved_dst;
+        *mbmi = saved_mbmi;
+
+    }
+
+    mv_row = x->best_sse_mv.as_mv.row;
+    mv_col = x->best_sse_mv.as_mv.col;
+    motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;  // squared length
+    motion_threshold = denoiser->denoise_pars.scale_motion_thresh *
+        NOISE_MOTION_THRESHOLD;
+
+    // If block is considered to be skin area, lower the motion threshold.
+    // In current version set threshold = 0, so only denoise zero mv on skin.
+    if (x->is_skin)
+        motion_threshold = 0;
+
+    if (motion_magnitude2 <
+        denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD)
+      x->increase_denoising = 1;
+
+    sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD;
+    if (x->increase_denoising)
+      sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH;
+
+    if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold)
+      decision = COPY_BLOCK;
+
+    if (decision == FILTER_BLOCK)
+    {
+        unsigned char *mc_running_avg_y =
+            denoiser->yv12_mc_running_avg.y_buffer + recon_yoffset;
+        int mc_avg_y_stride = denoiser->yv12_mc_running_avg.y_stride;
+        unsigned char *running_avg_y =
+            denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset;
+        int avg_y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride;
+
+        /* Filter. */
+        decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride,
+                                       running_avg_y, avg_y_stride,
+                                       x->thismb, 16, motion_magnitude2,
+                                       x->increase_denoising);
+        denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ?
+            kFilterNonZeroMV : kFilterZeroMV;
+        // Only denoise UV for zero motion, and if y channel was denoised.
+        if (denoiser->denoiser_mode != kDenoiserOnYOnly &&
+            motion_magnitude2 == 0 &&
+            decision == FILTER_BLOCK) {
+          unsigned char *mc_running_avg_u =
+              denoiser->yv12_mc_running_avg.u_buffer + recon_uvoffset;
+          unsigned char *running_avg_u =
+              denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset;
+          unsigned char *mc_running_avg_v =
+              denoiser->yv12_mc_running_avg.v_buffer + recon_uvoffset;
+          unsigned char *running_avg_v =
+              denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset;
+          int mc_avg_uv_stride = denoiser->yv12_mc_running_avg.uv_stride;
+          int avg_uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
+          int signal_stride = x->block[16].src_stride;
+          decision_u =
+              vp8_denoiser_filter_uv(mc_running_avg_u, mc_avg_uv_stride,
+                                      running_avg_u, avg_uv_stride,
+                                      x->block[16].src + *x->block[16].base_src,
+                                      signal_stride, motion_magnitude2, 0);
+          decision_v =
+              vp8_denoiser_filter_uv(mc_running_avg_v, mc_avg_uv_stride,
+                                      running_avg_v, avg_uv_stride,
+                                      x->block[20].src + *x->block[20].base_src,
+                                      signal_stride, motion_magnitude2, 0);
+        }
+    }
+    if (decision == COPY_BLOCK)
+    {
+        /* No filtering of this block; it differs too much from the predictor,
+         * or the motion vector magnitude is considered too big.
+         */
+        x->denoise_zeromv = 0;
+        vp8_copy_mem16x16(
+                x->thismb, 16,
+                denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
+                denoiser->yv12_running_avg[INTRA_FRAME].y_stride);
+        denoiser->denoise_state[block_index] = kNoFilter;
+    }
+    if (denoiser->denoiser_mode != kDenoiserOnYOnly) {
+      if (decision_u == COPY_BLOCK) {
+        vp8_copy_mem8x8(
+            x->block[16].src + *x->block[16].base_src, x->block[16].src_stride,
+            denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset,
+            denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
+      }
+      if (decision_v == COPY_BLOCK) {  // NOTE(review): V copy passes block[16]'s stride; presumably equal to block[20]'s - confirm
+        vp8_copy_mem8x8(
+            x->block[20].src + *x->block[20].base_src, x->block[16].src_stride,
+            denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset,
+            denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
+      }
+    }
+    // Option to selectively deblock the denoised signal, for y channel only.
+    if (apply_spatial_loop_filter) {
+      loop_filter_info lfi;
+      int apply_filter_col = 0;
+      int apply_filter_row = 0;
+      int apply_filter = 0;
+      int y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride;
+      int uv_stride =denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
+
+      // Fix filter level to some nominal value for now.
+      int filter_level = 48;
+
+      int hev_index = lfi_n->hev_thr_lut[INTER_FRAME][filter_level];
+      lfi.mblim = lfi_n->mblim[filter_level];
+      lfi.blim = lfi_n->blim[filter_level];
+      lfi.lim = lfi_n->lim[filter_level];
+      lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+      // Apply filter if there is a difference in the denoiser filter state
+      // between the current and left/top block, or if non-zero motion vector
+      // is used for the motion-compensated filtering.
+      if (mb_col > 0) {
+        apply_filter_col = !((denoiser->denoise_state[block_index] ==
+            denoiser->denoise_state[block_index - 1]) &&
+            denoiser->denoise_state[block_index] != kFilterNonZeroMV);
+        if (apply_filter_col) {
+          // Filter left vertical edge.
+          apply_filter = 1;
+          vp8_loop_filter_mbv(
+              denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
+              NULL, NULL, y_stride, uv_stride, &lfi);
+        }
+      }
+      if (mb_row > 0) {
+        apply_filter_row = !((denoiser->denoise_state[block_index] ==
+            denoiser->denoise_state[block_index - denoiser->num_mb_cols]) &&
+            denoiser->denoise_state[block_index] != kFilterNonZeroMV);
+        if (apply_filter_row) {
+          // Filter top horizontal edge.
+          apply_filter = 1;
+          vp8_loop_filter_mbh(
+              denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
+              NULL, NULL, y_stride, uv_stride, &lfi);
+        }
+      }
+      if (apply_filter) {
+        // Update the signal block |x|. Pixel changes are only to top and/or
+        // left boundary pixels: can we avoid full block copy here.
+        vp8_copy_mem16x16(
+            denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
+            y_stride, x->thismb, 16);
+      }
+    }
+}
diff --git a/libs/libvpx/vp8/encoder/denoising.h b/libs/libvpx/vp8/encoder/denoising.h
new file mode 100644
index 0000000000..9a379a6a16
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/denoising.h
@@ -0,0 +1,117 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_DENOISING_H_
+#define VP8_ENCODER_DENOISING_H_
+
+#include "block.h"
+#include "vp8/common/loopfilter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SUM_DIFF_THRESHOLD (16 * 16 * 2)  // max |sum_diff| for a filtered Y block
+#define SUM_DIFF_THRESHOLD_HIGH (600)  // ~(16 * 16 * 2.3); used with increase_denoising
+#define MOTION_MAGNITUDE_THRESHOLD (8*3)  // at or below: larger adjustments
+
+#define SUM_DIFF_THRESHOLD_UV (96)   // (8 * 8 * 1.5)
+#define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2)  // used with increase_denoising
+#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 8)  // vs. a block sum of 128 * 8 * 8
+#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)  // at or below: larger adjustments
+
+#define MAX_GF_ARF_DENOISE_RANGE (8)
+
+enum vp8_denoiser_decision  // result of the per-block filter functions
+{
+  COPY_BLOCK,   // block is not denoised; the source block is used as is
+  FILTER_BLOCK  // denoised block was accepted
+};
+
+enum vp8_denoiser_filter_state {  // stored per macroblock in denoise_state
+  kNoFilter,        // macroblock was copied, not filtered
+  kFilterZeroMV,    // filter ran with a zero motion vector
+  kFilterNonZeroMV  // filter ran with a non-zero motion vector
+};
+
+enum vp8_denoiser_mode {  // see vp8_denoiser_set_parameters() for the mapping
+  kDenoiserOff,
+  kDenoiserOnYOnly,          // mode 1
+  kDenoiserOnYUV,            // mode 2, and any mode other than 1 or 3
+  kDenoiserOnYUVAggressive,  // mode 3
+  kDenoiserOnAdaptive        // never assigned by vp8_denoiser_set_parameters()
+};
+
+typedef struct {  // Tuning values; all are set by vp8_denoiser_set_parameters().
+  // Scale factor on sse threshold above which no denoising is done.
+  unsigned int scale_sse_thresh;
+  // Scale factor on motion magnitude threshold above which no
+  // denoising is done.
+  unsigned int scale_motion_thresh;
+  // Scale factor on motion magnitude below which we increase the strength of
+  // the temporal filter (in function vp8_denoiser_filter).
+  unsigned int scale_increase_filter;
+  // Scale factor to bias to ZEROMV for denoising.
+  unsigned int denoise_mv_bias;
+  // Scale factor to bias to ZEROMV for coding mode selection.
+  unsigned int pickmode_mv_bias;
+  // Quantizer threshold below which we use the segmentation map to switch off
+  // loop filter for blocks that have been coded as ZEROMV-LAST a certain number
+  // (consec_zerolast) of consecutive frames. Note that the delta-QP is set to
+  // 0 when segmentation map is used for shutting off loop filter.
+  unsigned int qp_thresh;
+  // Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST.
+  unsigned int consec_zerolast;
+  // Threshold for amount of spatial blur on Y channel. 0 means no spatial blur.
+  unsigned int spatial_blur;
+} denoise_params;
+
+typedef struct vp8_denoiser
+{
+    YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];  // [INTRA_FRAME] gets the filter output
+    YV12_BUFFER_CONFIG yv12_mc_running_avg;  // motion-compensated running average
+    // TODO(marpan): Should remove yv12_last_source and use vp8_lookahead_peak.
+    YV12_BUFFER_CONFIG yv12_last_source;
+    unsigned char* denoise_state;  // one vp8_denoiser_filter_state per macroblock
+    int num_mb_cols;  // row stride when indexing denoise_state
+    int denoiser_mode;  // an enum vp8_denoiser_mode value
+    int threshold_aggressive_mode;  // noise metric (nmse) threshold
+    int nmse_source_diff;
+    int nmse_source_diff_count;
+    int qp_avg;
+    int qp_threshold_up;  // QP below which aggressive mode may be entered
+    int qp_threshold_down;  // QP above which normal mode may be restored
+    int bitrate_threshold;  // bits/sec
+    denoise_params denoise_pars;
+} VP8_DENOISER;
+
+int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
+                          int num_mb_rows, int num_mb_cols, int mode);
+
+void vp8_denoiser_free(VP8_DENOISER *denoiser);
+
+void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode);
+
+void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
+                             MACROBLOCK *x,
+                             unsigned int best_sse,
+                             unsigned int zero_mv_sse,
+                             int recon_yoffset,
+                             int recon_uvoffset,
+                             loop_filter_info_n *lfi_n,
+                             int mb_row,
+                             int mb_col,
+                             int block_index);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_DENOISING_H_
diff --git a/libs/libvpx/vp8/encoder/encodeframe.c b/libs/libvpx/vp8/encoder/encodeframe.c
new file mode 100644
index 0000000000..9b05cd1fcd
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/encodeframe.c
@@ -0,0 +1,1424 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "encodemb.h"
+#include "encodemv.h"
+#include "vp8/common/common.h"
+#include "onyx_int.h"
+#include "vp8/common/extend.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/quant_common.h"
+#include "segmentation.h"
+#include "vp8/common/setupintrarecon.h"
+#include "encodeintra.h"
+#include "vp8/common/reconinter.h"
+#include "rdopt.h"
+#include "pickinter.h"
+#include "vp8/common/findnearmv.h"
+#include <stdio.h>
+#include <limits.h>
+#include "vp8/common/invtrans.h"
+#include "vpx_ports/vpx_timer.h"
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+#include "bitstream.h"
+#endif
+#include "encodeframe.h"
+
+extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) ;
+extern void vp8_calc_ref_frame_costs(int *ref_frame_cost,
+                                     int prob_intra,
+                                     int prob_last,
+                                     int prob_garf
+                                    );
+extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi);
+extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
+extern void vp8_auto_select_speed(VP8_COMP *cpi);
+extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
+                                      MACROBLOCK *x,
+                                      MB_ROW_COMP *mbr_ei,
+                                      int count);
+static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x );
+
+#ifdef MODE_STATS  /* mode usage counters, only built when MODE_STATS is defined */
+unsigned int inter_y_modes[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};  /* MB modes counted on non-key frames */
+unsigned int inter_uv_modes[4] = {0, 0, 0, 0};
+unsigned int inter_b_modes[15]  = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};  /* per-partition modes of SPLITMV MBs */
+unsigned int y_modes[5]   = {0, 0, 0, 0, 0};  /* MB modes counted on key frames */
+unsigned int uv_modes[4]  = {0, 0, 0, 0};
+unsigned int b_modes[14]  = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#endif
+
+
+/* activity_avg must be positive, or flat regions could get a zero weight
+ *  (infinite lambda), which confounds analysis.
+ * This also avoids the need for divide by zero checks in
+ *  vp8_activity_masking().
+ */
+#define VP8_ACTIVITY_AVG_MIN (64)
+
+/* This is used as a reference when computing the source variance for the
+ *  purposes of activity masking.
+ * Eventually this should be replaced by custom no-reference routines,
+ *  which will be faster.
+ */
+static const unsigned char VP8_VAR_OFFS[16]=
+{
+    128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+};
+
+
+/* Variance-based activity measure (the original one, from Tim T's code). */
+static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
+{
+    const unsigned int flat_limit = 8 << 12;
+    const unsigned int flat_clamp = 5 << 12;
+    unsigned int sse;
+    unsigned int activity;
+
+    (void)cpi;
+
+    /* TODO: this could be measured over 8x8 areas instead, but lambda is
+     *  assumed to be fixed over a whole MB in most of the code. Another
+     *  option is four 8x8 variances combined non-linearly into one lambda.
+     */
+    activity = vpx_variance16x16(x->src.y_buffer, x->src.y_stride,
+                                 VP8_VAR_OFFS, 0, &sse) << 4;
+
+    /* Flat regions (below 8<<12) are capped at 5<<12. */
+    if (activity < flat_limit && activity > flat_clamp)
+        activity = flat_clamp;
+
+    return activity;
+}
+
+/* Alternative (experimental) activity measure: the vp8_encode_intra() result. */
+static unsigned int alt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
+{
+    const unsigned int activity = vp8_encode_intra(cpi, x, use_dc_pred);
+    return activity;
+}
+
+
+/* Return the activity of the current macroblock, floored at
+ * VP8_ACTIVITY_AVG_MIN. The measure itself is selected by ALT_ACT_MEASURE.
+ */
+#define ALT_ACT_MEASURE 1
+static unsigned int mb_activity_measure( VP8_COMP *cpi, MACROBLOCK *x,
+                                  int mb_row, int mb_col)
+{
+    unsigned int activity;
+
+    if  ( !ALT_ACT_MEASURE )
+    {
+        /* Original activity measure from Tim T's code. */
+        activity = tt_activity_measure( cpi, x );
+    }
+    else
+    {
+        /* Alternative measure; DC prediction is requested only when exactly
+         * one of mb_row / mb_col is zero (first row or column, not corner).
+         */
+        const int on_top_row = (mb_row == 0);
+        const int on_left_col = (mb_col == 0);
+
+        activity = alt_activity_measure( cpi, x, on_top_row != on_left_col );
+    }
+
+    return (activity < VP8_ACTIVITY_AVG_MIN) ? VP8_ACTIVITY_AVG_MIN : activity;
+}
+
+/* Set cpi->activity_avg to the frame-level "average" MB activity */
+#define ACT_MEDIAN 0
+static void calc_av_activity( VP8_COMP *cpi, int64_t activity_sum )
+{
+#if ACT_MEDIAN
+    /* Find median: Simple n^2 algorithm for experimentation */
+    {
+        unsigned int median;
+        unsigned int i,j;
+        unsigned int * sortlist;
+        unsigned int tmp;
+
+        /* Create a list to sort to */
+        CHECK_MEM_ERROR(sortlist,
+                        vpx_calloc(sizeof(unsigned int),
+                        cpi->common.MBs));
+
+        /* Copy map to sort list */
+        memcpy( sortlist, cpi->mb_activity_map,
+                sizeof(unsigned int) * cpi->common.MBs );
+
+        /* Ripple each value down to its correct position */
+        for ( i = 1; i < cpi->common.MBs; i ++ )
+        {
+            for ( j = i; j > 0; j -- )
+            {
+                if ( sortlist[j] < sortlist[j-1] )
+                {
+                    /* Swap values */
+                    tmp = sortlist[j-1];
+                    sortlist[j-1] = sortlist[j];
+                    sortlist[j] = tmp;
+                }
+                else
+                    break;
+            }
+        }
+
+        /* Median = rounded mean of the two middle entries (the same entry
+         * when the MB count is odd); both indices stay inside the list. */
+        median = ( 1 + sortlist[(cpi->common.MBs - 1) >> 1] +
+                   sortlist[cpi->common.MBs >> 1] ) >> 1;
+
+        cpi->activity_avg = median;
+
+        vpx_free(sortlist);
+    }
+#else
+    /* Simple mean for now: activity_sum divided by the MB count */
+    cpi->activity_avg = (unsigned int)(activity_sum/cpi->common.MBs);
+#endif
+
+    if (cpi->activity_avg < VP8_ACTIVITY_AVG_MIN)
+        cpi->activity_avg = VP8_ACTIVITY_AVG_MIN;
+
+    /* Experimental code: return fixed value normalized for several clips */
+    if  ( ALT_ACT_MEASURE )
+        cpi->activity_avg = 100000;
+}
+
+#define USE_ACT_INDEX   0
+#define OUTPUT_NORM_ACT_STATS   0
+
+#if USE_ACT_INDEX
+/* Calculate an activity index for each mb */
+static void calc_activity_index( VP8_COMP *cpi, MACROBLOCK *x )
+{
+    VP8_COMMON *const cm = & cpi->common;
+    int mb_row, mb_col;
+
+    int64_t act;
+    int64_t a;
+    int64_t b;
+
+#if OUTPUT_NORM_ACT_STATS
+    FILE *f = fopen("norm_act.stt", "a");  /* may be NULL; every use below is guarded */
+    if (f) fprintf(f, "\n%12d\n", cpi->activity_avg );
+#endif
+
+    /* Reset pointers to start of activity map */
+    x->mb_activity_ptr = cpi->mb_activity_map;
+
+    /* Calculate normalized mb activity number. */
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+        /* for each macroblock col in image */
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+        {
+            /* Read activity from the map */
+            act = *(x->mb_activity_ptr);
+
+            /* Normalized activity: rounded ratio of b and a, signed by which is larger */
+            a = act + 4*cpi->activity_avg;
+            b = 4*act + cpi->activity_avg;
+
+            if ( b >= a )
+                *(x->activity_ptr) = (int)((b + (a>>1))/a) - 1;
+            else
+                *(x->activity_ptr) = 1 - (int)((a + (b>>1))/b);
+
+#if OUTPUT_NORM_ACT_STATS
+            if (f) fprintf(f, " %6d", *(x->mb_activity_ptr));
+#endif
+            /* Advance the map pointer. NOTE(review): x->activity_ptr is never advanced here - confirm */
+            x->mb_activity_ptr++;
+        }
+
+#if OUTPUT_NORM_ACT_STATS
+        if (f) fprintf(f, "\n");
+#endif
+
+    }
+
+#if OUTPUT_NORM_ACT_STATS
+    if (f) fclose(f);
+#endif
+
+}
+#endif
+
+/* Loop through all MBs: store each MB's activity in the activity map and
+ * derive the frame average. Starts from the current x->src / x->mb_activity_ptr
+ * positions, so those must already point at the first MB of the frame. */
+static void build_activity_map( VP8_COMP *cpi )
+{
+    MACROBLOCK *const x = & cpi->mb;
+    MACROBLOCKD *xd = &x->e_mbd;
+    VP8_COMMON *const cm = & cpi->common;
+
+#if ALT_ACT_MEASURE
+    YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+    int recon_yoffset;
+    int recon_y_stride = new_yv12->y_stride;
+#endif
+
+    int mb_row, mb_col;
+    unsigned int mb_activity;
+    int64_t activity_sum = 0;
+
+    /* for each macroblock row in image */
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+#if ALT_ACT_MEASURE
+        /* row setup: top-neighbour availability and luma offset of the row */
+        xd->up_available = (mb_row != 0);
+        recon_yoffset = (mb_row * recon_y_stride * 16);
+#endif
+        /* for each macroblock col in image */
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+        {
+#if ALT_ACT_MEASURE
+            xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+            xd->left_available = (mb_col != 0);
+            recon_yoffset += 16;
+#endif
+            /* Copy the current source MB (luma) into x->thismb */
+            vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+            /* measure activity */
+            mb_activity = mb_activity_measure( cpi, x, mb_row, mb_col );
+
+            /* Keep frame sum */
+            activity_sum += mb_activity;
+
+            /* Store MB level activity details. */
+            *x->mb_activity_ptr = mb_activity;
+
+            /* Increment activity map pointer */
+            x->mb_activity_ptr++;
+
+            /* adjust to the next column of source macroblocks */
+            x->src.y_buffer += 16;
+        }
+
+
+        /* back to column 0 and down 16 luma lines: the next row of mbs */
+        x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+
+#if ALT_ACT_MEASURE
+        /* extend the recon for intra prediction */
+        vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
+                          xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+#endif
+
+    }
+
+    /* Calculate an "average" MB activity */
+    calc_av_activity(cpi, activity_sum);
+
+#if USE_ACT_INDEX
+    /* Calculate an activity index number of each mb */
+    calc_activity_index( cpi, x );
+#endif
+
+}
+
+/* Macroblock activity masking: rescale the RD multiplier by MB activity */
+void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)
+{
+#if USE_ACT_INDEX
+    x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2);
+#else
+    const int64_t activity = *(x->mb_activity_ptr);
+
+    /* rdmult is scaled by (2*act + avg) / (act + 2*avg), with half the
+     * divisor added first so the division rounds to nearest. */
+    const int64_t denom = activity + (2*cpi->activity_avg);
+    const int64_t numer = (2*activity) + cpi->activity_avg;
+
+    x->rdmult = (unsigned int)(((int64_t)x->rdmult*numer + (denom>>1))/denom);
+#endif
+
+    /* Derive errorperbit from the updated rdmult; never let it be zero. */
+    x->errorperbit = x->rdmult * 100 /(110 * x->rddiv);
+    if (x->errorperbit == 0)
+        x->errorperbit = 1;
+
+    /* Activity based Zbin adjustment */
+    adjust_act_zbin(cpi, x);
+}
+
+static  /* Encode every macroblock of row mb_row, left to right. */
+void encode_mb_row(VP8_COMP *cpi,
+                   VP8_COMMON *cm,
+                   int mb_row,              /* index of the MB row to encode */
+                   MACROBLOCK  *x,
+                   MACROBLOCKD *xd,
+                   TOKENEXTRA **tp,         /* in/out: token position handed to the MB encoders */
+                   int *segment_counts,     /* incremented once per MB, indexed by segment_id */
+                   int *totalrate)          /* accumulates the values returned by the MB encoders */
+{
+    int recon_yoffset, recon_uvoffset;
+    int mb_col;
+    int ref_fb_idx = cm->lst_fb_idx;
+    int dst_fb_idx = cm->new_fb_idx;
+    int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+    int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+    int map_index = (mb_row * cpi->common.mb_cols);  /* index of this row's first MB in the per-MB maps */
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+    const int num_part = (1 << cm->multi_token_partition);
+    TOKENEXTRA * tp_start = cpi->tok;
+    vp8_writer *w;
+#endif
+
+#if CONFIG_MULTITHREAD
+    const int nsync = cpi->mt_sync_range;
+    const int rightmost_col = cm->mb_cols + nsync;
+    const int *last_row_current_mb_col;
+    int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+
+    if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
+        last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
+    else
+        last_row_current_mb_col = &rightmost_col;
+#endif
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+    if(num_part > 1)
+        w= &cpi->bc[1 + (mb_row % num_part)];
+    else
+        w = &cpi->bc[1];
+#endif
+
+    /* restart the above-context pointer at the beginning of the row */
+    xd->above_context = cm->above_context;
+
+    xd->up_available = (mb_row != 0);
+    recon_yoffset = (mb_row * recon_y_stride * 16);
+    recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+    cpi->tplist[mb_row].start = *tp;
+    /* printf("Main mb_row = %d\n", mb_row); */
+
+    /* Distance of Mb to the top & bottom edges, specified in 1/8th pel
+     * units as they are always compared to values that are in 1/8th pel
+     */
+    xd->mb_to_top_edge = -((mb_row * 16) << 3);
+    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+
+    /* Set up limit values for vertical motion vector components
+     * to prevent them extending beyond the UMV borders
+     */
+    x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
+                        + (VP8BORDERINPIXELS - 16);
+
+    /* Set the mb activity pointer to the start of the row. */
+    x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+    /* for each macroblock col in image */
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+    {
+
+#if  (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+        *tp = cpi->tok;
+#endif
+        /* Distance of Mb to the left & right edges, specified in
+         * 1/8th pel units as they are always compared to values
+         * that are in 1/8th pel units
+         */
+        xd->mb_to_left_edge = -((mb_col * 16) << 3);
+        xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+        /* Set up limit values for horizontal motion vector components
+         * to prevent them extending beyond the UMV borders
+         */
+        x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+        x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
+                            + (VP8BORDERINPIXELS - 16);
+
+        xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+        xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+        xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+        xd->left_available = (mb_col != 0);
+
+        x->rddiv = cpi->RDDIV;
+        x->rdmult = cpi->RDMULT;
+
+        /* Copy the current source MB (luma) into x->thismb */
+        vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+#if CONFIG_MULTITHREAD
+        if (cpi->b_multi_threaded != 0) {
+            if (((mb_col - 1) % nsync) == 0) {  /* store mb_col - 1 in this row's mt_current_mb_col slot */
+                pthread_mutex_t *mutex = &cpi->pmutex[mb_row];
+                protected_write(mutex, current_mb_col, mb_col - 1);
+            }
+
+            if (mb_row && !(mb_col & (nsync - 1))) {  /* presumably waits on the row above - confirm sync_read() */
+                pthread_mutex_t *mutex = &cpi->pmutex[mb_row-1];
+                sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
+            }
+        }
+#endif
+
+        if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+            vp8_activity_masking(cpi, x);
+
+        /* Is segmentation enabled */
+        /* MB level adjustment to quantizer */
+        if (xd->segmentation_enabled)
+        {
+            /* Code to set segment id in xd->mbmi.segment_id for current MB
+             * (with range checking)
+             */
+            if (cpi->segmentation_map[map_index+mb_col] <= 3)
+                xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[map_index+mb_col];
+            else
+                xd->mode_info_context->mbmi.segment_id = 0;
+
+            vp8cx_mb_init_quantizer(cpi, x, 1);
+        }
+        else
+            /* Set to Segment 0 by default */
+            xd->mode_info_context->mbmi.segment_id = 0;
+
+        x->active_ptr = cpi->active_map + map_index + mb_col;
+
+        if (cm->frame_type == KEY_FRAME)
+        {
+            *totalrate += vp8cx_encode_intra_macroblock(cpi, x, tp);
+#ifdef MODE_STATS
+            y_modes[xd->mbmi.mode] ++;
+#endif
+        }
+        else
+        {
+            *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col);
+
+#ifdef MODE_STATS
+            inter_y_modes[xd->mbmi.mode] ++;
+
+            if (xd->mbmi.mode == SPLITMV)
+            {
+                int b;
+
+                for (b = 0; b < xd->mbmi.partition_count; b++)
+                {
+                    inter_b_modes[x->partition->bmi[b].mode] ++;
+                }
+            }
+
+#endif
+
+            // Keep track of how many (consecutive) times a  block is coded
+            // as ZEROMV_LASTREF, for base layer frames.
+            // Reset to 0 if its coded as anything else.
+            if (cpi->current_layer == 0) {
+              if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+                  xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) {
+                // Increment, but saturate at 255 instead of wrapping around.
+                if (cpi->consec_zero_last[map_index+mb_col] < 255)
+                  cpi->consec_zero_last[map_index+mb_col] += 1;
+                if (cpi->consec_zero_last_mvbias[map_index+mb_col] < 255)
+                  cpi->consec_zero_last_mvbias[map_index+mb_col] += 1;
+              } else {
+                cpi->consec_zero_last[map_index+mb_col] = 0;
+                cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
+              }
+              if (x->zero_last_dot_suppress)
+                cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
+            }
+
+            /* Special case code for cyclic refresh
+             * If cyclic update enabled then copy xd->mbmi.segment_id; (which
+             * may have been updated based on mode during
+             * vp8cx_encode_inter_macroblock()) back into the global
+             * segmentation map
+             */
+            if ((cpi->current_layer == 0) &&
+                (cpi->cyclic_refresh_mode_enabled &&
+                 xd->segmentation_enabled))
+            {
+                cpi->segmentation_map[map_index+mb_col] = xd->mode_info_context->mbmi.segment_id;
+
+                /* If the block has been refreshed mark it as clean (the
+                 * magnitude of the -ve influences how long it will be before
+                 * we consider another refresh):
+                 * Else if it was coded (last frame 0,0) and has not already
+                 * been refreshed then mark it as a candidate for cleanup
+                 * next time (marked 0) else mark it as dirty (1).
+                 */
+                if (xd->mode_info_context->mbmi.segment_id)
+                    cpi->cyclic_refresh_map[map_index+mb_col] = -1;
+                else if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
+                {
+                    if (cpi->cyclic_refresh_map[map_index+mb_col] == 1)
+                        cpi->cyclic_refresh_map[map_index+mb_col] = 0;
+                }
+                else
+                    cpi->cyclic_refresh_map[map_index+mb_col] = 1;
+
+            }
+        }
+
+        cpi->tplist[mb_row].stop = *tp;
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+        /* pack tokens for this MB */
+        {
+            int tok_count = *tp - tp_start;
+            vp8_pack_tokens(w, tp_start, tok_count);
+        }
+#endif
+        /* Increment pointer into gf usage flags structure. */
+        x->gf_active_ptr++;
+
+        /* Increment the activity mask pointers. */
+        x->mb_activity_ptr++;
+
+        /* adjust to the next column of macroblocks */
+        x->src.y_buffer += 16;
+        x->src.u_buffer += 8;
+        x->src.v_buffer += 8;
+
+        recon_yoffset += 16;
+        recon_uvoffset += 8;
+
+        /* Keep track of segment usage */
+        segment_counts[xd->mode_info_context->mbmi.segment_id] ++;
+
+        /* skip to next mb */
+        xd->mode_info_context++;
+        x->partition_info++;
+        xd->above_context++;
+    }
+
+    /* extend the recon for intra prediction */
+    vp8_extend_mb_row( &cm->yv12_fb[dst_fb_idx],
+                        xd->dst.y_buffer + 16,
+                        xd->dst.u_buffer + 8,
+                        xd->dst.v_buffer + 8);
+
+#if CONFIG_MULTITHREAD
+    if (cpi->b_multi_threaded != 0)
+        protected_write(&cpi->pmutex[mb_row], current_mb_col, rightmost_col);
+#endif
+
+    /* step one further entry to account for the border */
+    xd->mode_info_context++;
+    x->partition_info++;
+}
+
+static void init_encode_frame_mb_context(VP8_COMP *cpi)
+{
+    VP8_COMMON *const cm = & cpi->common;
+    MACROBLOCK *const x = & cpi->mb;
+    MACROBLOCKD *const xd = & x->e_mbd;
+
+    /* Rewind the per-MB cursors (GF active flags, activity map, partition
+     * info, mode info) to the first macroblock of the frame. */
+    x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
+    x->mb_activity_ptr = cpi->mb_activity_map;
+    x->partition_info = x->pi;
+    xd->mode_info_context = cm->mi;
+    xd->mode_info_stride = cm->mode_info_stride;
+
+    x->act_zbin_adj = 0;
+    xd->frame_type = cm->frame_type;
+
+    /* Key frames restart from freshly initialised intra mode contexts. */
+    if (cm->frame_type == KEY_FRAME)
+        vp8_init_mbmode_probs(cm);
+
+    /* Source, prediction and reconstruction buffers for this frame. */
+    x->src = * cpi->Source;
+    xd->pre = cm->yv12_fb[cm->lst_fb_idx];
+    xd->dst = cm->yv12_fb[cm->new_fb_idx];
+
+    /* Set up the new frame buffer for intra coded blocks. */
+    vp8_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
+    vp8_build_block_offsets(x);
+
+    xd->mode_info_context->mbmi.mode = DC_PRED;
+    xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+    xd->left_context = &cm->left_context;
+    x->mvc = cm->fc.mvc;
+    memset(cm->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
+
+    /* Reference frame costs: fixed probabilities are used in the special
+     * cases where GF and ARF are not sensible options for reference,
+     * otherwise the probabilities tracked in cpi. */
+    {
+        const int layered = (cpi->oxcf.number_of_layers > 1);
+        int prob_last = cpi->prob_last_coded;
+        int prob_garf = cpi->prob_gf_coded;
+
+        if (cpi->ref_frame_flags == VP8_LAST_FRAME)
+        {
+            prob_last = 255;
+            prob_garf = 128;
+        }
+        else if (layered && (cpi->ref_frame_flags == VP8_GOLD_FRAME))
+        {
+            prob_last = 1;
+            prob_garf = 255;
+        }
+        else if (layered && (cpi->ref_frame_flags == VP8_ALTR_FRAME))
+        {
+            prob_last = 1;
+            prob_garf = 1;
+        }
+
+        vp8_calc_ref_frame_costs(x->ref_frame_cost, cpi->prob_intra_coded,
+                                 prob_last, prob_garf);
+    }
+
+    /* Full-pixel mode clears the three low bits of the mask. */
+    xd->fullpixel_mask = cm->full_pixel ? 0xfffffff8 : 0xffffffff;
+
+    /* Reset the counters and error accumulators. */
+    vp8_zero(x->coef_counts);
+    vp8_zero(x->ymode_count);
+    vp8_zero(x->uv_mode_count)
+    x->prediction_error = 0;
+    x->intra_error = 0;
+    vp8_zero(x->count_mb_ref_frame_usage);
+}
+
+#if CONFIG_MULTITHREAD
+static void sum_coef_counts(MACROBLOCK *x, MACROBLOCK *x_thread)
+{
+    /* Add one worker thread's coefficient counts into the totals held in
+     * x, element by element over all four dimensions of coef_counts
+     * (block type, coefficient band, previous-coefficient context, node).
+     */
+    int type;
+
+    for (type = 0; type < BLOCK_TYPES; ++type)
+    {
+        int band;
+
+        for (band = 0; band < COEF_BANDS; ++band)
+        {
+            int ctx;
+
+            for (ctx = 0; ctx < PREV_COEF_CONTEXTS; ++ctx)
+            {
+                int node;   /* token/prob index */
+
+                for (node = 0; node < ENTROPY_NODES; ++node)
+                {
+                    x->coef_counts [type][band][ctx][node] +=
+                        x_thread->coef_counts [type][band][ctx][node];
+                }
+            }
+        }
+    }
+}
+#endif  // CONFIG_MULTITHREAD
+
+void vp8_encode_frame(VP8_COMP *cpi)
+{
+    int mb_row;
+    MACROBLOCK *const x = & cpi->mb;
+    VP8_COMMON *const cm = & cpi->common;
+    MACROBLOCKD *const xd = & x->e_mbd;
+    TOKENEXTRA *tp = cpi->tok;
+    int segment_counts[MAX_MB_SEGMENTS];
+    int totalrate;
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+    BOOL_CODER * bc = &cpi->bc[1]; /* bc[0] is for control partition */
+    const int num_part = (1 << cm->multi_token_partition);
+#endif
+
+    memset(segment_counts, 0, sizeof(segment_counts));
+    totalrate = 0;
+
+    if (cpi->compressor_speed == 2)
+    {
+        if (cpi->oxcf.cpu_used < 0)
+            cpi->Speed = -(cpi->oxcf.cpu_used);
+        else
+            vp8_auto_select_speed(cpi);
+    }
+
+    /* Functions setup for all frame types so we can use MC in AltRef */
+    if(!cm->use_bilinear_mc_filter)
+    {
+        xd->subpixel_predict        = vp8_sixtap_predict4x4;
+        xd->subpixel_predict8x4     = vp8_sixtap_predict8x4;
+        xd->subpixel_predict8x8     = vp8_sixtap_predict8x8;
+        xd->subpixel_predict16x16   = vp8_sixtap_predict16x16;
+    }
+    else
+    {
+        xd->subpixel_predict        = vp8_bilinear_predict4x4;
+        xd->subpixel_predict8x4     = vp8_bilinear_predict8x4;
+        xd->subpixel_predict8x8     = vp8_bilinear_predict8x8;
+        xd->subpixel_predict16x16   = vp8_bilinear_predict16x16;
+    }
+
+    cpi->mb.skip_true_count = 0;
+    cpi->tok_count = 0;
+
+#if 0
+    /* Experimental code */
+    cpi->frame_distortion = 0;
+    cpi->last_mb_distortion = 0;
+#endif
+
+    xd->mode_info_context = cm->mi;
+
+    vp8_zero(cpi->mb.MVcount);
+
+    vp8cx_frame_init_quantizer(cpi);
+
+    vp8_initialize_rd_consts(cpi, x,
+                             vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
+
+    vp8cx_initialize_me_consts(cpi, cm->base_qindex);
+
+    if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+    {
+        /* Initialize encode frame context. */
+        init_encode_frame_mb_context(cpi);
+
+        /* Build a frame level activity map */
+        build_activity_map(cpi);
+    }
+
+    /* re-init encode frame context. */
+    init_encode_frame_mb_context(cpi);
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+    {
+        int i;
+        for(i = 0; i < num_part; i++)
+        {
+            vp8_start_encode(&bc[i], cpi->partition_d[i + 1],
+                    cpi->partition_d_end[i + 1]);
+            bc[i].error = &cm->error;
+        }
+    }
+
+#endif
+
+    {
+        struct vpx_usec_timer  emr_timer;
+        vpx_usec_timer_start(&emr_timer);
+
+#if CONFIG_MULTITHREAD
+        if (cpi->b_multi_threaded)
+        {
+            int i;
+
+            vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei,
+                                      cpi->encoding_thread_count);
+
+            for (i = 0; i < cm->mb_rows; i++)
+                cpi->mt_current_mb_col[i] = -1;
+
+            for (i = 0; i < cpi->encoding_thread_count; i++)
+            {
+                sem_post(&cpi->h_event_start_encoding[i]);
+            }
+
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
+            {
+                vp8_zero(cm->left_context)
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                tp = cpi->tok;
+#else
+                tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+#endif
+
+                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+                /* adjust to the next row of mbs */
+                x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+                x->src.u_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+                x->src.v_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+                xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
+                x->partition_info  += xd->mode_info_stride * cpi->encoding_thread_count;
+                x->gf_active_ptr   += cm->mb_cols * cpi->encoding_thread_count;
+
+                if(mb_row == cm->mb_rows - 1)
+                {
+                    sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
+                }
+            }
+
+            sem_wait(&cpi->h_event_end_encoding); /* wait for other threads to finish */
+
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+            {
+                cpi->tok_count += (unsigned int)
+                  (cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start);
+            }
+
+            if (xd->segmentation_enabled)
+            {
+                int j;
+
+                if (xd->segmentation_enabled)
+                {
+                    for (i = 0; i < cpi->encoding_thread_count; i++)
+                    {
+                        for (j = 0; j < 4; j++)
+                            segment_counts[j] += cpi->mb_row_ei[i].segment_counts[j];
+                    }
+                }
+            }
+
+            for (i = 0; i < cpi->encoding_thread_count; i++)
+            {
+                int mode_count;
+                int c_idx;
+                totalrate += cpi->mb_row_ei[i].totalrate;
+
+                cpi->mb.skip_true_count += cpi->mb_row_ei[i].mb.skip_true_count;
+
+                for(mode_count = 0; mode_count < VP8_YMODES; mode_count++)
+                    cpi->mb.ymode_count[mode_count] +=
+                        cpi->mb_row_ei[i].mb.ymode_count[mode_count];
+
+                for(mode_count = 0; mode_count < VP8_UV_MODES; mode_count++)
+                    cpi->mb.uv_mode_count[mode_count] +=
+                        cpi->mb_row_ei[i].mb.uv_mode_count[mode_count];
+
+                for(c_idx = 0; c_idx < MVvals; c_idx++)
+                {
+                    cpi->mb.MVcount[0][c_idx] +=
+                        cpi->mb_row_ei[i].mb.MVcount[0][c_idx];
+                    cpi->mb.MVcount[1][c_idx] +=
+                        cpi->mb_row_ei[i].mb.MVcount[1][c_idx];
+                }
+
+                cpi->mb.prediction_error +=
+                    cpi->mb_row_ei[i].mb.prediction_error;
+                cpi->mb.intra_error += cpi->mb_row_ei[i].mb.intra_error;
+
+                for(c_idx = 0; c_idx < MAX_REF_FRAMES; c_idx++)
+                    cpi->mb.count_mb_ref_frame_usage[c_idx] +=
+                        cpi->mb_row_ei[i].mb.count_mb_ref_frame_usage[c_idx];
+
+                for(c_idx = 0; c_idx < MAX_ERROR_BINS; c_idx++)
+                    cpi->mb.error_bins[c_idx] +=
+                        cpi->mb_row_ei[i].mb.error_bins[c_idx];
+
+                /* add up counts for each thread */
+                sum_coef_counts(x, &cpi->mb_row_ei[i].mb);
+            }
+
+        }
+        else
+#endif  // CONFIG_MULTITHREAD
+        {
+
+            /* for each macroblock row in image */
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+            {
+                vp8_zero(cm->left_context)
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                tp = cpi->tok;
+#endif
+
+                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+                /* adjust to the next row of mbs */
+                x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+                x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+                x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+            }
+
+            cpi->tok_count = (unsigned int)(tp - cpi->tok);
+        }
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+        {
+            int i;
+            for(i = 0; i < num_part; i++)
+            {
+                vp8_stop_encode(&bc[i]);
+                cpi->partition_sz[i+1] = bc[i].pos;
+            }
+        }
+#endif
+
+        vpx_usec_timer_mark(&emr_timer);
+        cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
+    }
+
+
+    // Work out the segment probabilities if segmentation is enabled
+    // and needs to be updated
+    if (xd->segmentation_enabled && xd->update_mb_segmentation_map)
+    {
+        int tot_count;
+        int i;
+
+        /* Set to defaults */
+        memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs));
+
+        tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + segment_counts[3];
+
+        if (tot_count)
+        {
+            xd->mb_segment_tree_probs[0] = ((segment_counts[0] + segment_counts[1]) * 255) / tot_count;
+
+            tot_count = segment_counts[0] + segment_counts[1];
+
+            if (tot_count > 0)
+            {
+                xd->mb_segment_tree_probs[1] = (segment_counts[0] * 255) / tot_count;
+            }
+
+            tot_count = segment_counts[2] + segment_counts[3];
+
+            if (tot_count > 0)
+                xd->mb_segment_tree_probs[2] = (segment_counts[2] * 255) / tot_count;
+
+            /* Zero probabilities not allowed */
+            for (i = 0; i < MB_FEATURE_TREE_PROBS; i ++)
+            {
+                if (xd->mb_segment_tree_probs[i] == 0)
+                    xd->mb_segment_tree_probs[i] = 1;
+            }
+        }
+    }
+
+    /* projected_frame_size in units of BYTES */
+    cpi->projected_frame_size = totalrate >> 8;
+
+    /* Make a note of the percentage MBs coded Intra. */
+    if (cm->frame_type == KEY_FRAME)
+    {
+        cpi->this_frame_percent_intra = 100;
+    }
+    else
+    {
+        int tot_modes;
+
+        tot_modes = cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME]
+                    + cpi->mb.count_mb_ref_frame_usage[LAST_FRAME]
+                    + cpi->mb.count_mb_ref_frame_usage[GOLDEN_FRAME]
+                    + cpi->mb.count_mb_ref_frame_usage[ALTREF_FRAME];
+
+        if (tot_modes)
+            cpi->this_frame_percent_intra =
+                cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME] * 100 / tot_modes;
+
+    }
+
+#if ! CONFIG_REALTIME_ONLY
+    /* Adjust the projected reference frame usage probability numbers to
+     * reflect what we have just seen. This may be useful when we make
+     * multiple iterations of the recode loop rather than continuing to use
+     * values from the previous frame.
+     */
+    if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
+        (!cm->refresh_alt_ref_frame && !cm->refresh_golden_frame)))
+    {
+      vp8_convert_rfct_to_prob(cpi);
+    }
+#endif
+}
+void vp8_setup_block_ptrs(MACROBLOCK *x)
+{
+    /* Bind every BLOCK of the macroblock to its slice of the shared
+     * residual (src_diff) and transform coefficient (coeff) arrays. */
+    int row, col;
+    int idx;
+
+    /* blocks 0..15: 4x4 grid, rows 4 * 16 shorts apart, from offset 0 */
+    for (row = 0; row < 4; row++)
+    {
+        short *row_base = x->src_diff + row * 4 * 16;
+
+        for (col = 0; col < 4; col++)
+            x->block[row * 4 + col].src_diff = row_base + col * 4;
+    }
+
+    /* blocks 16..19 and 20..23: two 2x2 grids with identical layout,
+     * starting at offsets 256 and 320 respectively */
+    for (row = 0; row < 2; row++)
+    {
+        for (col = 0; col < 2; col++)
+        {
+            const int grid_off = row * 4 * 8 + col * 4;
+
+            x->block[16 + row * 2 + col].src_diff = x->src_diff + 256 + grid_off;
+            x->block[20 + row * 2 + col].src_diff = x->src_diff + 320 + grid_off;
+        }
+    }
+
+    /* block 24 starts right after them, at offset 384 */
+    x->block[24].src_diff = x->src_diff + 384;
+
+    /* coefficients: 16 per block, laid out back to back for all 25 blocks */
+    for (idx = 0; idx < 25; idx++)
+    {
+        x->block[idx].coeff = x->coeff + idx * 16;
+    }
+}
+
+void vp8_build_block_offsets(MACROBLOCK *x)
+{
+    int blk = 0;
+    int row, col;
+
+    vp8_build_block_doffsets(&x->e_mbd);
+
+    /* y blocks (0..15): read through x->thismb_ptr with a fixed stride of 16 */
+    x->thismb_ptr = &x->thismb[0];
+    for (row = 0; row < 4; row++)
+    {
+        for (col = 0; col < 4; col++, blk++)
+        {
+            BLOCK *const cur = &x->block[blk];
+
+            cur->base_src = &x->thismb_ptr;
+            cur->src_stride = 16;
+            cur->src = 4 * row * 16 + 4 * col;
+        }
+    }
+
+    /* u blocks (16..19): addressed inside the source u buffer */
+    for (row = 0; row < 2; row++)
+    {
+        for (col = 0; col < 2; col++, blk++)
+        {
+            BLOCK *const cur = &x->block[blk];
+
+            cur->base_src = &x->src.u_buffer;
+            cur->src_stride = x->src.uv_stride;
+            cur->src = 4 * row * cur->src_stride + 4 * col;
+        }
+    }
+
+    /* v blocks (20..23): same layout, inside the source v buffer */
+    for (row = 0; row < 2; row++)
+    {
+        for (col = 0; col < 2; col++, blk++)
+        {
+            BLOCK *const cur = &x->block[blk];
+
+            cur->base_src = &x->src.v_buffer;
+            cur->src_stride = x->src.uv_stride;
+            cur->src = 4 * row * cur->src_stride + 4 * col;
+        }
+    }
+}
+
+static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)
+{   /* Count the intra y and uv modes chosen for the current macroblock. */
+    const MACROBLOCKD *xd = & x->e_mbd;
+    const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
+    const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;
+
+#ifdef MODE_STATS
+    const int is_key = cpi->common.frame_type == KEY_FRAME;
+
+    ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
+
+    if (m == B_PRED)
+    {
+        unsigned int *const bct = is_key ? b_modes : inter_b_modes;
+        int b = 0;
+
+        /* bmi.as_mode: the accessor the intra 4x4 encoder uses for BLOCKD.bmi */
+        do
+        {
+            ++ bct[xd->block[b].bmi.as_mode];
+        }
+        while (++b < 16);
+    }
+
+#else
+    (void)cpi;  /* cpi is only read by the MODE_STATS tallies above */
+#endif
+
+    /* per-MACROBLOCK tallies, maintained in every build */
+    ++x->ymode_count[m];
+    ++x->uv_mode_count[uvm];
+}
+
+/* Experimental stub: derive the per-macroblock zbin adjustment from the
+ * activity measure that x->mb_activity_ptr points at.
+ */
+static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x )
+{
+#if USE_ACT_INDEX
+    x->act_zbin_adj = *(x->mb_activity_ptr);
+#else
+    const int64_t act = *(x->mb_activity_ptr);
+    /* Two cross-weighted blends of this MB's activity and the average. */
+    const int64_t a = act + 4*cpi->activity_avg;
+    const int64_t b = 4*act + cpi->activity_avg;
+    int adj;
+
+    /* rounded b/a minus one above the average, else one minus rounded a/b */
+    if ( act > cpi->activity_avg )
+        adj = (int)((b + (a>>1))/a) - 1;
+    else
+        adj = 1 - (int)((a + (b>>1))/b);
+    x->act_zbin_adj = adj;
+#endif
+}
+/* Intra-encode one macroblock end to end; returns the mode picker's rate. */
+int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+                                  TOKENEXTRA **t)
+{
+    MACROBLOCKD *xd = &x->e_mbd;
+    int rate;   /* written by whichever mode picker runs below */
+
+    if (cpi->sf.RD && cpi->compressor_speed != 2)
+        vp8_rd_pick_intra_mode(x, &rate);
+    else
+        vp8_pick_intra_mode(x, &rate);
+
+    if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+    {
+        adjust_act_zbin( cpi, x );
+        vp8_update_zbin_extra(cpi, x);
+    }
+    /* y: sixteen 4x4 blocks for B_PRED, the 16x16 path for any other mode */
+    if (x->e_mbd.mode_info_context->mbmi.mode == B_PRED)
+        vp8_encode_intra4x4mby(x);
+    else
+        vp8_encode_intra16x16mby(x);
+
+    vp8_encode_intra16x16mbuv(x);
+
+    sum_intra_stats(cpi, x);
+
+    vp8_tokenize_mb(cpi, x, t);
+    /* B_PRED blocks were already rebuilt by vp8_encode_intra4x4block() */
+    if (xd->mode_info_context->mbmi.mode != B_PRED)
+        vp8_inverse_transform_mby(xd);
+    /* u/v data starts after the 16 y blocks: qcoeff + 16*16, eobs + 16 */
+    vp8_dequant_idct_add_uv_block
+                    (xd->qcoeff+16*16, xd->dequant_uv,
+                     xd->dst.u_buffer, xd->dst.v_buffer,
+                     xd->dst.uv_stride, xd->eobs+16);
+    return rate;
+}
+#ifdef SPEEDSTATS
+extern int cnt_pm;
+#endif
+
+extern void vp8_fix_contexts(MACROBLOCKD *x);
+/* Pick a mode for one inter-frame macroblock and encode it; returns rate. */
+int vp8cx_encode_inter_macroblock
+(
+    VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+    int recon_yoffset, int recon_uvoffset,
+    int mb_row, int mb_col
+)
+{
+    MACROBLOCKD *const xd = &x->e_mbd;
+    int intra_error = 0;
+    int rate;           /* written by the mode picker through &rate */
+    int distortion;     /* written by the mode picker through &distortion */
+
+    x->skip = 0;
+
+    if (xd->segmentation_enabled)
+        x->encode_breakout = cpi->segment_encode_breakout[xd->mode_info_context->mbmi.segment_id];
+    else
+        x->encode_breakout = cpi->oxcf.encode_breakout;
+
+#if CONFIG_TEMPORAL_DENOISING
+    /* Reset the best sse mode/mv for each macroblock. */
+    x->best_reference_frame = INTRA_FRAME;
+    x->best_zeromv_reference_frame = INTRA_FRAME;
+    x->best_sse_inter_mode = 0;
+    x->best_sse_mv.as_int = 0;
+    x->need_to_clamp_best_mvs = 0;
+#endif
+
+    if (cpi->sf.RD)
+    {
+        int zbin_mode_boost_enabled = x->zbin_mode_boost_enabled;
+
+        /* Are we using the fast quantizer for the mode selection? */
+        if(cpi->sf.use_fastquant_for_pick)
+        {
+            x->quantize_b      = vp8_fast_quantize_b;
+
+            /* the fast quantizer does not use zbin_extra, so
+             * do not recalculate */
+            x->zbin_mode_boost_enabled = 0;
+        }
+        vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
+                               &distortion, &intra_error, mb_row, mb_col);
+
+        /* switch back to the regular quantizer for the encode */
+        if (cpi->sf.improved_quant)
+        {
+            x->quantize_b      = vp8_regular_quantize_b;
+        }
+
+        /* restore x->zbin_mode_boost_enabled (saved before mode selection) */
+        x->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
+
+    }
+    else
+    {
+        vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
+                            &distortion, &intra_error, mb_row, mb_col);
+    }
+
+    x->prediction_error += distortion;
+    x->intra_error += intra_error;
+
+    if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+    {
+        /* Adjust the zbin based on this MB rate. */
+        adjust_act_zbin( cpi, x );
+    }
+
+#if 0
+    /* Experimental RD code */
+    cpi->frame_distortion += distortion;
+    cpi->last_mb_distortion = distortion;
+#endif
+
+    /* MB level adjustment to quantizer setup */
+    if (xd->segmentation_enabled)
+    {
+        /* If cyclic update enabled */
+        if (cpi->current_layer == 0 && cpi->cyclic_refresh_mode_enabled)
+        {
+            /* Clear segment_id back to 0 if not coded (last frame 0,0) */
+            if ((xd->mode_info_context->mbmi.segment_id == 1) &&
+                ((xd->mode_info_context->mbmi.ref_frame != LAST_FRAME) || (xd->mode_info_context->mbmi.mode != ZEROMV)))
+            {
+                xd->mode_info_context->mbmi.segment_id = 0;
+
+                /* segment_id changed, so update */
+                vp8cx_mb_init_quantizer(cpi, x, 1);
+            }
+        }
+    }
+
+    {
+        /* Experimental code.
+         * Special case for gf and arf zeromv modes, for 1 temporal layer.
+         * Increase zbin size to suppress noise.
+         */
+        x->zbin_mode_boost = 0;
+        if (x->zbin_mode_boost_enabled)
+        {
+            if ( xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME )
+            {
+                if (xd->mode_info_context->mbmi.mode == ZEROMV)
+                {
+                    if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME &&
+                        cpi->oxcf.number_of_layers == 1)
+                        x->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+                    else
+                        x->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+                }
+                else if (xd->mode_info_context->mbmi.mode == SPLITMV)
+                    x->zbin_mode_boost = 0;
+                else
+                    x->zbin_mode_boost = MV_ZBIN_BOOST;
+            }
+        }
+
+        /* The fast quantizer doesn't use zbin_extra, only do so with
+         * the regular quantizer. */
+        if (cpi->sf.improved_quant)
+            vp8_update_zbin_extra(cpi, x);
+    }
+
+    x->count_mb_ref_frame_usage[xd->mode_info_context->mbmi.ref_frame] ++;
+
+    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    {
+        vp8_encode_intra16x16mbuv(x);
+
+        if (xd->mode_info_context->mbmi.mode == B_PRED)
+        {
+            vp8_encode_intra4x4mby(x);
+        }
+        else
+        {
+            vp8_encode_intra16x16mby(x);
+        }
+
+        sum_intra_stats(cpi, x);
+    }
+    else
+    {
+        int ref_fb_idx;
+
+        if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+            ref_fb_idx = cpi->common.lst_fb_idx;
+        else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+            ref_fb_idx = cpi->common.gld_fb_idx;
+        else
+            ref_fb_idx = cpi->common.alt_fb_idx;
+
+        xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+        xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+        xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+        /* a skipped macroblock only needs its prediction written to dst */
+        if (!x->skip)
+        {
+            vp8_encode_inter16x16(x);
+        }
+        else
+            vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+                                           xd->dst.u_buffer, xd->dst.v_buffer,
+                                           xd->dst.y_stride, xd->dst.uv_stride);
+
+    }
+
+    if (!x->skip)
+    {
+        vp8_tokenize_mb(cpi, x, t);
+
+        if (xd->mode_info_context->mbmi.mode != B_PRED)
+            vp8_inverse_transform_mby(xd);
+
+        vp8_dequant_idct_add_uv_block
+                        (xd->qcoeff+16*16, xd->dequant_uv,
+                         xd->dst.u_buffer, xd->dst.v_buffer,
+                         xd->dst.uv_stride, xd->eobs+16);
+    }
+    else
+    {
+        /* always set mb_skip_coeff as it is needed by the loopfilter */
+        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+
+        if (cpi->common.mb_no_coeff_skip)
+        {
+            x->skip_true_count ++;
+            vp8_fix_contexts(xd);
+        }
+        else
+        {
+            vp8_stuff_mb(cpi, x, t);
+        }
+    }
+
+    return rate;
+}
diff --git a/libs/libvpx/vp8/encoder/encodeframe.h b/libs/libvpx/vp8/encoder/encodeframe.h
new file mode 100644
index 0000000000..e185c1035c
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/encodeframe.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP8_ENCODER_ENCODEFRAME_H_
+#define VP8_ENCODER_ENCODEFRAME_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x);
+
+extern void vp8_build_block_offsets(MACROBLOCK *x);
+
+extern void vp8_setup_block_ptrs(MACROBLOCK *x);
+
+extern void vp8_encode_frame(VP8_COMP *cpi);
+
+extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+        TOKENEXTRA **t,
+        int recon_yoffset, int recon_uvoffset,
+        int mb_row, int mb_col);
+
+extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+        TOKENEXTRA **t);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_ENCODEFRAME_H_
diff --git a/libs/libvpx/vp8/encoder/encodeintra.c b/libs/libvpx/vp8/encoder/encodeintra.c
new file mode 100644
index 0000000000..44be959c96
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/encodeintra.c
@@ -0,0 +1,140 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
+#include "encodemb.h"
+#include "vp8/common/invtrans.h"
+#include "encodeintra.h"
+
+
+int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
+{
+    /* Intra-code one macroblock with DC prediction (16x16 path or sixteen
+     * 4x4 blocks) and return vpx_get_mb_ss() of x->src_diff afterwards. */
+    MACROBLOCKD *const xd = &x->e_mbd;
+    int blk;
+    (void) cpi;
+
+    if (!use_dc_pred)
+    {
+        /* sixteen 4x4 blocks, each forced to B_DC_PRED before encoding */
+        for (blk = 0; blk < 16; blk++)
+        {
+            xd->block[blk].bmi.as_mode = B_DC_PRED;
+            vp8_encode_intra4x4block(x, blk);
+        }
+    }
+    else
+    {
+        /* whole-macroblock DC prediction from the intra frame */
+        xd->mode_info_context->mbmi.mode = DC_PRED;
+        xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+        xd->mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+        vp8_encode_intra16x16mby(x);
+        vp8_inverse_transform_mby(xd);
+    }
+
+    return vpx_get_mb_ss(x->src_diff);
+}
+
+void vp8_encode_intra4x4block(MACROBLOCK *x, int ib)
+{
+    /* Predict, transform, quantize and rebuild 4x4 block `ib` in dst. */
+
+    MACROBLOCKD *const xd = &x->e_mbd;
+    BLOCKD *const recon_blk = &xd->block[ib];
+    BLOCK *const enc_blk = &x->block[ib];
+    const int stride = xd->dst.y_stride;
+    unsigned char *const out = xd->dst.y_buffer + recon_blk->offset;
+    unsigned char *const above_row = out - stride;
+
+    /* neighbours: row above, column to the left, and the corner pixel */
+    vp8_intra4x4_predict(above_row, out - 1, stride, recon_blk->bmi.as_mode,
+                         recon_blk->predictor, 16, above_row[-1]);
+
+    /* residual, then x->short_fdct4x4, then x->quantize_b */
+    vp8_subtract_b(enc_blk, recon_blk, 16);
+    x->short_fdct4x4(enc_blk->src_diff, enc_blk->coeff, 32);
+    x->quantize_b(enc_blk, recon_blk);
+
+    /* rebuild the pixels; an eob of at most 1 takes the DC-only path */
+    if (*recon_blk->eob <= 1)
+        vp8_dc_only_idct_add(recon_blk->dqcoeff[0], recon_blk->predictor, 16,
+                             out, stride);
+    else
+        vp8_short_idct4x4llm(recon_blk->dqcoeff, recon_blk->predictor, 16,
+                             out, stride);
+}
+
+void vp8_encode_intra4x4mby(MACROBLOCK *mb)
+{
+    /* Encode all sixteen 4x4 blocks of the macroblock, in index order. */
+    MACROBLOCKD *const xd = &mb->e_mbd;
+    unsigned char *const above_right = xd->dst.y_buffer - xd->dst.y_stride + 16;
+    int blk = 0;
+
+    intra_prediction_down_copy(xd, above_right);
+    while (blk < 16)
+        vp8_encode_intra4x4block(mb, blk++);
+}
+
+void vp8_encode_intra16x16mby(MACROBLOCK *x)
+{
+    /* 16x16 intra pipeline for y: predict into dst, subtract, transform,
+     * quantize, then run the coefficient optimizer if x->optimize is set. */
+    MACROBLOCKD *const xd = &x->e_mbd;
+    const BLOCK *const first_blk = &x->block[0];
+    unsigned char *const recon = xd->dst.y_buffer;
+    const int recon_stride = xd->dst.y_stride;
+
+    /* prediction is built from the row above and the column left of recon */
+    vp8_build_intra_predictors_mby_s(xd, recon - recon_stride, recon - 1,
+                                     recon_stride, recon, recon_stride);
+
+    vp8_subtract_mby(x->src_diff, *(first_blk->base_src),
+                     first_blk->src_stride, recon, recon_stride);
+
+    vp8_transform_intra_mby(x);
+    vp8_quantize_mby(x);
+
+    if (x->optimize)
+        vp8_optimize_mby(x);
+}
+
+void vp8_encode_intra16x16mbuv(MACROBLOCK *x)
+{
+    /* U/V counterpart of the 16x16 intra pipeline: predict both planes
+     * into dst, subtract, transform, quantize, optionally optimize. */
+    MACROBLOCKD *const xd = &x->e_mbd;
+    unsigned char *const u_recon = xd->dst.u_buffer;
+    unsigned char *const v_recon = xd->dst.v_buffer;
+    const int recon_stride = xd->dst.uv_stride;
+
+    vp8_build_intra_predictors_mbuv_s(xd, u_recon - recon_stride,
+                                      v_recon - recon_stride,
+                                      u_recon - 1, v_recon - 1, recon_stride,
+                                      u_recon, v_recon, recon_stride);
+
+    vp8_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                      x->src.uv_stride, u_recon, v_recon, recon_stride);
+
+    vp8_transform_mbuv(x);
+    vp8_quantize_mbuv(x);
+
+    if (x->optimize)
+        vp8_optimize_mbuv(x);
+}
diff --git a/libs/libvpx/vp8/encoder/encodeintra.h b/libs/libvpx/vp8/encoder/encodeintra.h
new file mode 100644
index 0000000000..a8d0284d29
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/encodeintra.h
@@ -0,0 +1,29 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_ENCODEINTRA_H_
+#define VP8_ENCODER_ENCODEINTRA_H_
+#include "onyx_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred);
+void vp8_encode_intra16x16mby(MACROBLOCK *x);
+void vp8_encode_intra16x16mbuv(MACROBLOCK *x);
+void vp8_encode_intra4x4mby(MACROBLOCK *mb);
+void vp8_encode_intra4x4block(MACROBLOCK *x, int ib);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_ENCODEINTRA_H_
diff --git a/libs/libvpx/vp8/encoder/encodemb.c b/libs/libvpx/vp8/encoder/encodemb.c
new file mode 100644
index 0000000000..932a157ada
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/encodemb.c
@@ -0,0 +1,593 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "encodemb.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/encoder/quantize.h"
+#include "tokenize.h"
+#include "vp8/common/invtrans.h"
+#include "vpx_mem/vpx_mem.h"
+#include "rdopt.h"
+
+void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) {
+  /* 4x4 difference block via vpx_subtract_block(): written to be->src_diff,
+   * with `pitch` used as the stride of both the result and bd->predictor. */
+  unsigned char *const source = *(be->base_src) + be->src;
+
+  vpx_subtract_block(4, 4, be->src_diff, pitch,
+                     source, be->src_stride,
+                     bd->predictor, pitch);
+}
+
+void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
+                         int src_stride, unsigned char *upred,
+                         unsigned char *vpred, int pred_stride) {
+  /* 8x8 results, stride 8: U starts at diff + 256, V at diff + 320. */
+  vpx_subtract_block(8, 8, diff + 256, 8, usrc, src_stride,
+                     upred, pred_stride);
+  vpx_subtract_block(8, 8, diff + 320, 8, vsrc, src_stride,
+                     vpred, pred_stride);
+}
+/* 16x16 vpx_subtract_block() of src and pred into diff, diff stride 16. */
+void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
+                      unsigned char *pred, int pred_stride) {
+  vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride);
+}
+
+static void vp8_subtract_mb(MACROBLOCK *x)
+{
+    /* Fill x->src_diff for y, then u and v, against the dst buffers. */
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    vp8_subtract_mby(x->src_diff, *(x->block[0].base_src), x->block[0].src_stride,
+                     xd->dst.y_buffer, xd->dst.y_stride);
+    vp8_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                      x->src.uv_stride, xd->dst.u_buffer, xd->dst.v_buffer,
+                      xd->dst.uv_stride);
+}
+
+static void build_dcblock(MACROBLOCK *x)
+{
+    /* Copy coeff[0] of blocks 0..15 (16 apart) into src_diff[384..399]. */
+    const short *dc_src = x->coeff;
+    short *dc_dst = x->src_diff + 384;
+    short *const dc_end = dc_dst + 16;
+
+    for (; dc_dst != dc_end; dc_src += 16)
+        *dc_dst++ = *dc_src;
+}
+
+void vp8_transform_mbuv(MACROBLOCK *x)
+{
+    /* Blocks 16..23 go through x->short_fdct8x4 two at a time,
+     * with a pitch argument of 16. */
+    BLOCK *pair = &x->block[16];
+    BLOCK *const stop = &x->block[24];
+
+    for (; pair < stop; pair += 2)
+        x->short_fdct8x4(pair->src_diff, pair->coeff, 16);
+}
+
+
+void vp8_transform_intra_mby(MACROBLOCK *x)
+{
+    BLOCK *const y2 = &x->block[24];
+    int pair;
+
+    /* first order: the 16 y blocks, two per call, pitch argument 32 */
+    for (pair = 0; pair < 16; pair += 2)
+    {
+        BLOCK *const blk = &x->block[pair];
+        x->short_fdct8x4(blk->src_diff, blk->coeff, 32);
+    }
+
+    /* collect the 16 y dc values into block 24 ... */
+    build_dcblock(x);
+
+    /* ... and run the 2nd order (walsh) transform over them */
+    x->short_walsh4x4(y2->src_diff, y2->coeff, 8);
+}
+
+
+static void transform_mb(MACROBLOCK *x)
+{
+    /* the dc (2nd order) block is skipped when the mode is SPLITMV */
+    const int has_y2 = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
+    int pair;
+
+    /* y blocks 0..15: two per call, pitch argument 32 */
+    for (pair = 0; pair < 16; pair += 2)
+        x->short_fdct8x4(x->block[pair].src_diff, x->block[pair].coeff, 32);
+
+    /* build dc block from 16 y dc values */
+    if (has_y2)
+        build_dcblock(x);
+
+    /* u and v blocks 16..23: two per call, pitch argument 16 */
+    for (pair = 16; pair < 24; pair += 2)
+        x->short_fdct8x4(x->block[pair].src_diff, x->block[pair].coeff, 16);
+
+    /* do 2nd order transform on the dc block */
+    if (has_y2)
+    {
+        BLOCK *const y2 = &x->block[24];
+
+        x->short_walsh4x4(y2->src_diff, y2->coeff, 8);
+    }
+}
+
+
+static void transform_mby(MACROBLOCK *x)
+{
+    BLOCK *blk = &x->block[0];
+    BLOCK *const y2 = &x->block[24];
+
+    /* y blocks 0..15, two per call, pitch argument 32 */
+    for (; blk < &x->block[16]; blk += 2)
+        x->short_fdct8x4(blk->src_diff, blk->coeff, 32);
+
+    /* nothing more to do for SPLITMV: no dc block is built for it */
+    if (x->e_mbd.mode_info_context->mbmi.mode == SPLITMV)
+        return;
+
+    /* build dc block from the 16 y dc values, then run the 2nd order
+     * transform on it */
+    build_dcblock(x);
+    x->short_walsh4x4(y2->src_diff, y2->coeff, 8);
+}
+
+
+/* Tie-break value: low 8 bits of 128 + R*RM; DM and D are not used. */
+#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
+
+typedef struct vp8_token_state vp8_token_state;
+/* One trellis node of optimize_b(): a candidate value for one coefficient. */
+struct vp8_token_state{
+  int           rate;    /* rate accumulated from this node onwards */
+  int           error;   /* squared error accumulated from this node onwards */
+  signed char   next;    /* position of the following trellis node */
+  signed char   token;   /* token associated with this node */
+  short         qc;      /* quantized coefficient value for this node */
+};
+
+/* TODO: experiments to find the optimal multiplier values */
+#define Y1_RD_MULT 4
+#define UV_RD_MULT 2
+#define Y2_RD_MULT 16
+/* Factor applied to mb->rdmult in optimize_b(), indexed by its `type`. */
+static const int plane_rd_mult[4]=
+{
+    Y1_RD_MULT,
+    Y2_RD_MULT,
+    UV_RD_MULT,
+    Y1_RD_MULT
+};
+/* Trellis-optimize block ib: may rewrite qcoeff/dqcoeff/eob; sets *a, *l. */
+static void optimize_b(MACROBLOCK *mb, int ib, int type,
+                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
+{
+    BLOCK *b;
+    BLOCKD *d;
+    vp8_token_state tokens[17][2];  /* [position][path]; [eob] is the sentinel */
+    unsigned best_mask[2];          /* per path: bit i = successor path picked at i */
+    const short *dequant_ptr;
+    const short *coeff_ptr;
+    short *qcoeff_ptr;
+    short *dqcoeff_ptr;
+    int eob;
+    int i0;                         /* first position considered: !type */
+    int rc;
+    int x;
+    int sz = 0;
+    int next;
+    int rdmult;
+    int rddiv;
+    int final_eob;
+    int rd_cost0;
+    int rd_cost1;
+    int rate0;
+    int rate1;
+    int error0;
+    int error1;
+    int t0;
+    int t1;
+    int best;
+    int band;
+    int pt;
+    int i;
+    int err_mult = plane_rd_mult[type];
+
+    b = &mb->block[ib];
+    d = &mb->e_mbd.block[ib];
+
+    dequant_ptr = d->dequant;
+    coeff_ptr = b->coeff;
+    qcoeff_ptr = d->qcoeff;
+    dqcoeff_ptr = d->dqcoeff;
+    i0 = !type;
+    eob = *d->eob;
+
+    /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+    rdmult = mb->rdmult * err_mult;
+    if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME)
+        rdmult = (rdmult * 9)>>4;   /* intra macroblocks use 9/16 of it */
+
+    rddiv = mb->rddiv;
+    best_mask[0] = best_mask[1] = 0;
+    /* Initialize the sentinel node of the trellis. */
+    tokens[eob][0].rate = 0;
+    tokens[eob][0].error = 0;
+    tokens[eob][0].next = 16;
+    tokens[eob][0].token = DCT_EOB_TOKEN;
+    tokens[eob][0].qc = 0;
+    *(tokens[eob] + 1) = *(tokens[eob] + 0);
+    next = eob;
+    for (i = eob; i-- > i0;)    /* walk positions eob-1 down to i0 */
+    {
+        int base_bits;
+        int d2;
+        int dx;
+
+        rc = vp8_default_zig_zag1d[i];
+        x = qcoeff_ptr[rc];
+        /* Only add a trellis state for non-zero coefficients. */
+        if (x)
+        {
+            int shortcut=0;
+            error0 = tokens[next][0].error;
+            error1 = tokens[next][1].error;
+            /* Evaluate the first possibility for this state. */
+            rate0 = tokens[next][0].rate;
+            rate1 = tokens[next][1].rate;
+            t0 = (vp8_dct_value_tokens_ptr + x)->Token;
+            /* Consider both possible successor states. */
+            if (next < 16)
+            {
+                band = vp8_coef_bands[i + 1];
+                pt = vp8_prev_token_class[t0];
+                rate0 +=
+                    mb->token_costs[type][band][pt][tokens[next][0].token];
+                rate1 +=
+                    mb->token_costs[type][band][pt][tokens[next][1].token];
+            }
+            rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
+            rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
+            if (rd_cost0 == rd_cost1)
+            {
+                rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
+                rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
+            }
+            /* And pick the best. */
+            best = rd_cost1 < rd_cost0;
+            base_bits = *(vp8_dct_value_cost_ptr + x);
+            dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
+            d2 = dx*dx;
+            tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+            tokens[i][0].error = d2 + (best ? error1 : error0);
+            tokens[i][0].next = next;
+            tokens[i][0].token = t0;
+            tokens[i][0].qc = x;
+            best_mask[0] |= best << i;
+            /* Evaluate the second possibility for this state. */
+            rate0 = tokens[next][0].rate;
+            rate1 = tokens[next][1].rate;
+            /* shortcut: |x|*dequant overshoots |coeff| by less than one step */
+            if((abs(x)*dequant_ptr[rc]>abs(coeff_ptr[rc])) &&
+               (abs(x)*dequant_ptr[rc]<abs(coeff_ptr[rc])+dequant_ptr[rc]))
+                shortcut = 1;
+            else
+                shortcut = 0;
+
+            if(shortcut)
+            {
+                sz = -(x < 0);      /* 0 for positive x, -1 for negative x */
+                x -= 2*sz + 1;      /* move x one step toward zero */
+            }
+
+            /* Consider both possible successor states. */
+            if (!x)
+            {
+                /* If we reduced this coefficient to zero, check to see if
+                 *  we need to move the EOB back here.
+                 */
+                t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
+                    DCT_EOB_TOKEN : ZERO_TOKEN;
+                t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
+                    DCT_EOB_TOKEN : ZERO_TOKEN;
+            }
+            else
+            {
+                t0=t1 = (vp8_dct_value_tokens_ptr + x)->Token;
+            }
+            if (next < 16)
+            {
+                band = vp8_coef_bands[i + 1];
+                if(t0!=DCT_EOB_TOKEN)
+                {
+                    pt = vp8_prev_token_class[t0];
+                    rate0 += mb->token_costs[type][band][pt][
+                        tokens[next][0].token];
+                }
+                if(t1!=DCT_EOB_TOKEN)
+                {
+                    pt = vp8_prev_token_class[t1];
+                    rate1 += mb->token_costs[type][band][pt][
+                        tokens[next][1].token];
+                }
+            }
+
+            rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
+            rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
+            if (rd_cost0 == rd_cost1)
+            {
+                rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
+                rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
+            }
+            /* And pick the best. */
+            best = rd_cost1 < rd_cost0;
+            base_bits = *(vp8_dct_value_cost_ptr + x);
+
+            if(shortcut)
+            {
+                /* (v + sz) ^ sz is v when sz == 0 and -v when sz == -1 */
+                dx -= (dequant_ptr[rc] + sz) ^ sz;
+                d2 = dx*dx;
+            }
+            tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+            tokens[i][1].error = d2 + (best ? error1 : error0);
+            tokens[i][1].next = next;
+            tokens[i][1].token =best?t1:t0;
+            tokens[i][1].qc = x;
+            best_mask[1] |= best << i;
+            /* Finally, make this the new head of the trellis. */
+            next = i;
+        }
+        /* There's no choice to make for a zero coefficient, so we don't
+         *  add a new trellis node, but we do need to update the costs.
+         */
+        else
+        {
+            band = vp8_coef_bands[i + 1];
+            t0 = tokens[next][0].token;
+            t1 = tokens[next][1].token;
+            /* Update the cost of each path if we're past the EOB token. */
+            if (t0 != DCT_EOB_TOKEN)
+            {
+                tokens[next][0].rate += mb->token_costs[type][band][0][t0];
+                tokens[next][0].token = ZERO_TOKEN;
+            }
+            if (t1 != DCT_EOB_TOKEN)
+            {
+                tokens[next][1].rate += mb->token_costs[type][band][0][t1];
+                tokens[next][1].token = ZERO_TOKEN;
+            }
+            /* Don't update next, because we didn't add a new node. */
+        }
+    }
+
+    /* Now pick the best path through the whole trellis. */
+    band = vp8_coef_bands[i + 1];   /* the loop left i at i0 - 1 */
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+    rate0 = tokens[next][0].rate;
+    rate1 = tokens[next][1].rate;
+    error0 = tokens[next][0].error;
+    error1 = tokens[next][1].error;
+    t0 = tokens[next][0].token;
+    t1 = tokens[next][1].token;
+    rate0 += mb->token_costs[type][band][pt][t0];
+    rate1 += mb->token_costs[type][band][pt][t1];
+    rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
+    rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
+    if (rd_cost0 == rd_cost1)
+    {
+        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
+        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
+    }
+    best = rd_cost1 < rd_cost0;
+    final_eob = i0 - 1;
+    for (i = next; i < eob; i = next)   /* follow the chosen path forward */
+    {
+        x = tokens[i][best].qc;
+        if (x)
+            final_eob = i;
+        rc = vp8_default_zig_zag1d[i];
+        qcoeff_ptr[rc] = x;
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc];
+        next = tokens[i][best].next;
+        best = (best_mask[best] >> i) & 1;
+    }
+    final_eob++;
+    /* contexts become 1 iff a non-zero coefficient remains at or after i0 */
+    *a = *l = (final_eob != !type);
+    *d->eob = (char)final_eob;
+}
+static void check_reset_2nd_coeffs(MACROBLOCKD *x, int type,
+                                   ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
+{   /* Zero block 24 when the absolute sum of its dequantized coefficients is below 35. */
+    int sum=0;  /* running sum of |dqcoeff| over the first *eob coefficients */
+    int i;
+    BLOCKD *bd = &x->block[24];
+
+    if(bd->dequant[0]>=35 && bd->dequant[1]>=35)
+        return;  /* early out; presumably a non-zero coefficient would already reach 35 -- TODO confirm */
+
+    for(i=0;i<(*bd->eob);i++)
+    {
+        int coef = bd->dqcoeff[vp8_default_zig_zag1d[i]];
+        sum+= (coef>=0)?coef:-coef;
+        if(sum>=35)
+            return;  /* threshold reached: leave the block untouched */
+    }
+    /**************************************************************************
+    our inverse hadamard transform effectively is weighted sum of all 16 inputs
+    with weight either 1 or -1. It has a last stage scaling of (sum+3)>>3. And
+    dc only idct is (dc+4)>>3. So if all the sums are between -35 and 29, the
+    output after inverse wht and idct will be all zero. A sum of absolute value
+    smaller than 35 guarantees all 16 different (+1/-1) weighted sums in wht
+    fall between -35 and +35.
+    **************************************************************************/
+    if(sum < 35)  /* always true here: the loop above returns as soon as sum reaches 35 */
+    {
+        for(i=0;i<(*bd->eob);i++)
+        {
+            int rc = vp8_default_zig_zag1d[i];  /* zig-zag position i -> coefficient index */
+            bd->qcoeff[rc]=0;
+            bd->dqcoeff[rc]=0;
+        }
+        *bd->eob = 0;
+        *a = *l = (*bd->eob != !type);  /* refresh both entropy contexts from the new eob (0) */
+    }
+}
+
+static void optimize_mb(MACROBLOCK *x)
+{   /* Run optimize_b over blocks 0-23 of the macroblock, plus block 24 when has_2nd_order is set. */
+    int b;
+    int type;
+    int has_2nd_order;
+
+    ENTROPY_CONTEXT_PLANES t_above, t_left;  /* local copies of the above/left contexts */
+    ENTROPY_CONTEXT *ta;
+    ENTROPY_CONTEXT *tl;
+
+    memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    ta = (ENTROPY_CONTEXT *)&t_above;  /* the callees below are handed pointers into the copies */
+    tl = (ENTROPY_CONTEXT *)&t_left;
+
+    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);  /* every mode except B_PRED and SPLITMV */
+    type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
+
+    for (b = 0; b < 16; b++)  /* blocks 0-15 use the Y plane type chosen above */
+    {
+        optimize_b(x, b, type,
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);
+    }
+
+    for (b = 16; b < 24; b++)  /* blocks 16-23 are coded as PLANE_TYPE_UV */
+    {
+        optimize_b(x, b, PLANE_TYPE_UV,
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);
+    }
+
+    if (has_2nd_order)
+    {
+        b=24;  /* block 24 is coded as PLANE_TYPE_Y2 */
+        optimize_b(x, b, PLANE_TYPE_Y2,
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);
+        check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);  /* may zero block 24 entirely */
+    }
+}
+
+
+void vp8_optimize_mby(MACROBLOCK *x)
+{   /* Run optimize_b over blocks 0-15, plus block 24 when has_2nd_order is set; blocks 16-23 are not touched. */
+    int b;
+    int type;
+    int has_2nd_order;
+
+    ENTROPY_CONTEXT_PLANES t_above, t_left;  /* local copies of the above/left contexts */
+    ENTROPY_CONTEXT *ta;
+    ENTROPY_CONTEXT *tl;
+
+    if (!x->e_mbd.above_context)
+        return;  /* no above context to copy: do nothing */
+
+    if (!x->e_mbd.left_context)
+        return;  /* no left context to copy: do nothing */
+
+    memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    ta = (ENTROPY_CONTEXT *)&t_above;  /* the callees below are handed pointers into the copies */
+    tl = (ENTROPY_CONTEXT *)&t_left;
+
+    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);  /* every mode except B_PRED and SPLITMV */
+    type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
+
+    for (b = 0; b < 16; b++)  /* blocks 0-15 use the Y plane type chosen above */
+    {
+        optimize_b(x, b, type,
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);
+    }
+
+
+    if (has_2nd_order)
+    {
+        b=24;  /* block 24 is coded as PLANE_TYPE_Y2 */
+        optimize_b(x, b, PLANE_TYPE_Y2,
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);
+        check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);  /* may zero block 24 entirely */
+    }
+}
+
+void vp8_optimize_mbuv(MACROBLOCK *x)
+{   /* Run optimize_b over blocks 16-23 only, coded as PLANE_TYPE_UV. */
+    int b;
+    ENTROPY_CONTEXT_PLANES t_above, t_left;  /* local copies of the above/left contexts */
+    ENTROPY_CONTEXT *ta;
+    ENTROPY_CONTEXT *tl;
+
+    if (!x->e_mbd.above_context)
+        return;  /* no above context to copy: do nothing */
+
+    if (!x->e_mbd.left_context)
+        return;  /* no left context to copy: do nothing */
+
+    memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    ta = (ENTROPY_CONTEXT *)&t_above;  /* optimize_b is handed pointers into the copies */
+    tl = (ENTROPY_CONTEXT *)&t_left;
+
+    for (b = 16; b < 24; b++)
+    {
+        optimize_b(x, b, PLANE_TYPE_UV,
+            ta + vp8_block2above[b], tl + vp8_block2left[b]);
+    }
+}
+
+void vp8_encode_inter16x16(MACROBLOCK *x)
+{   /* Build the inter prediction, then subtract, transform and quantize the whole macroblock. */
+    vp8_build_inter_predictors_mb(&x->e_mbd);
+
+    vp8_subtract_mb(x);
+
+    transform_mb(x);
+
+    vp8_quantize_mb(x);
+
+    if (x->optimize)  /* coefficient optimization runs only when this flag is set */
+        optimize_mb(x);
+}
+
+/* this function is used by first pass only */
+void vp8_encode_inter16x16y(MACROBLOCK *x)
+{   /* Y-only variant: predict, subtract, transform, quantize, then inverse-transform the Y blocks. */
+    BLOCK *b = &x->block[0];  /* block 0 supplies the source pointer and stride */
+
+    vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
+                                        x->e_mbd.dst.y_stride);
+
+    vp8_subtract_mby(x->src_diff, *(b->base_src),
+        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);  /* prediction was written to dst.y_buffer above */
+
+    transform_mby(x);
+
+    vp8_quantize_mby(x);
+
+    vp8_inverse_transform_mby(&x->e_mbd);
+}
diff --git a/libs/libvpx/vp8/encoder/encodemb.h b/libs/libvpx/vp8/encoder/encodemb.h
new file mode 100644
index 0000000000..10b3d8651a
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/encodemb.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_ENCODEMB_H_
+#define VP8_ENCODER_ENCODEMB_H_
+
+#include "onyx_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void vp8_encode_inter16x16(MACROBLOCK *x);  /* predict, subtract, transform, quantize; optimize when x->optimize is set */
+
+void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch);
+void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
+                       int src_stride, unsigned char *upred,
+                       unsigned char *vpred, int pred_stride);
+void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
+                      unsigned char *pred, int pred_stride);
+
+void vp8_build_dcblock(MACROBLOCK *b);
+void vp8_transform_mb(MACROBLOCK *mb);
+void vp8_transform_mbuv(MACROBLOCK *x);
+void vp8_transform_intra_mby(MACROBLOCK *x);
+
+void vp8_optimize_mby(MACROBLOCK *x);  /* blocks 0-15, plus block 24 unless the mode is B_PRED or SPLITMV */
+void vp8_optimize_mbuv(MACROBLOCK *x);  /* blocks 16-23 */
+void vp8_encode_inter16x16y(MACROBLOCK *x);  /* Y-only path, used by the first pass */
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_ENCODEMB_H_
diff --git a/libs/libvpx/vp8/encoder/encodemv.c b/libs/libvpx/vp8/encoder/encodemv.c
new file mode 100644
index 0000000000..2a74ff4ae3
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/encodemv.c
@@ -0,0 +1,380 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/common.h"
+#include "encodemv.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/systemdependent.h"
+
+#include <math.h>
+
+#ifdef VP8_ENTROPY_STATS
+extern unsigned int active_section;
+#endif
+
+static void encode_mvcomponent(
+    vp8_writer *const w,
+    const int v,
+    const struct mv_context *mvc
+)
+{   /* Write one motion-vector component: short or long magnitude, then a sign bit unless the value is 0. */
+    const vp8_prob *p = mvc->prob;
+    const int x = v < 0 ? -v : v;  /* magnitude of v */
+
+    if (x < mvnum_short)     /* Small */
+    {
+        vp8_write(w, 0, p [mvpis_short]);  /* flag 0: short form */
+        vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3);
+
+        if (!x)
+            return;         /* no sign bit */
+    }
+    else                    /* Large */
+    {
+        int i = 0;
+
+        vp8_write(w, 1, p [mvpis_short]);  /* flag 1: long form */
+
+        do  /* bits 0, 1 and 2, lowest first */
+            vp8_write(w, (x >> i) & 1, p [MVPbits + i]);
+
+        while (++i < 3);
+
+        i = mvlong_width - 1;  /* Skip bit 3, which is sometimes implicit */
+
+        do  /* bits mvlong_width-1 down to 4, highest first */
+            vp8_write(w, (x >> i) & 1, p [MVPbits + i]);
+
+        while (--i > 3);
+
+        if (x & 0xFFF0)  /* bit 3 is written only when a bit covered by 0xFFF0 is set */
+            vp8_write(w, (x >> 3) & 1, p [MVPbits + 3]);
+    }
+
+    vp8_write(w, v < 0, p [MVPsign]);  /* sign bit: 1 when v is negative */
+}
+#if 0
+static int max_mv_r = 0;
+static int max_mv_c = 0;
+#endif
+void vp8_encode_motion_vector(vp8_writer *w, const MV *mv, const MV_CONTEXT *mvc)
+{   /* Write the row component, then the column component, each shifted right by one bit. */
+
+#if 0
+    {
+        if (abs(mv->row >> 1) > max_mv_r)
+        {
+            FILE *f = fopen("maxmv.stt", "a");
+            max_mv_r = abs(mv->row >> 1);
+            fprintf(f, "New Mv Row Max %6d\n", (mv->row >> 1));
+
+            if ((abs(mv->row) / 2) != max_mv_r)
+                fprintf(f, "MV Row conversion error %6d\n", abs(mv->row) / 2);
+
+            fclose(f);
+        }
+
+        if (abs(mv->col >> 1) > max_mv_c)
+        {
+            FILE *f = fopen("maxmv.stt", "a");
+            fprintf(f, "New Mv Col Max %6d\n", (mv->col >> 1));
+            max_mv_c = abs(mv->col >> 1);
+            fclose(f);
+        }
+    }
+#endif
+
+    encode_mvcomponent(w, mv->row >> 1, &mvc[0]);  /* mvc[0] is the row context */
+    encode_mvcomponent(w, mv->col >> 1, &mvc[1]);  /* mvc[1] is the column context */
+}
+
+
+static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc)
+{   /* Cost of coding magnitude v without its sign bit; follows the same branches as encode_mvcomponent. */
+    const vp8_prob *p = mvc->prob;
+    const int x = v;  /* no abs(): the caller in this file passes only values in 0..mv_max */
+    unsigned int cost;
+
+    if (x < mvnum_short)
+    {
+        cost = vp8_cost_zero(p [mvpis_short])
+               + vp8_treed_cost(vp8_small_mvtree, p + MVPshort, x, 3);
+
+        if (!x)
+            return cost;
+    }
+    else
+    {
+        int i = 0;
+        cost = vp8_cost_one(p [mvpis_short]);
+
+        do  /* bits 0, 1 and 2 */
+            cost += vp8_cost_bit(p [MVPbits + i], (x >> i) & 1);
+
+        while (++i < 3);
+
+        i = mvlong_width - 1;  /* Skip bit 3, which is sometimes implicit */
+
+        do  /* bits mvlong_width-1 down to 4 */
+            cost += vp8_cost_bit(p [MVPbits + i], (x >> i) & 1);
+
+        while (--i > 3);
+
+        if (x & 0xFFF0)  /* bit 3 is costed only when a bit covered by 0xFFF0 is set */
+            cost += vp8_cost_bit(p [MVPbits + 3], (x >> 3) & 1);
+    }
+
+    return cost;   /* + vp8_cost_bit( p [MVPsign], v < 0); */
+}
+
+void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2])
+{   /* For each component c with mvc_flag[c] set, fill mvcost[c][-mv_max..mv_max] with magnitude cost plus sign cost. */
+    int i = 1;
+    unsigned int cost0 = 0;
+    unsigned int cost1 = 0;
+
+    vp8_clear_system_state();
+
+    i = 1;
+
+    if (mvc_flag[0])
+    {
+        mvcost [0] [0] = cost_mvcomponent(0, &mvc[0]);  /* value 0 carries no sign cost */
+
+        do
+        {
+            cost0 = cost_mvcomponent(i, &mvc[0]);
+
+            mvcost [0] [i] = cost0 + vp8_cost_zero(mvc[0].prob[MVPsign]);
+            mvcost [0] [-i] = cost0 + vp8_cost_one(mvc[0].prob[MVPsign]);  /* negative index: mvcost[0] must have mv_max entries before it */
+        }
+        while (++i <= mv_max);
+    }
+
+    i = 1;
+
+    if (mvc_flag[1])
+    {
+        mvcost [1] [0] = cost_mvcomponent(0, &mvc[1]);  /* value 0 carries no sign cost */
+
+        do
+        {
+            cost1 = cost_mvcomponent(i, &mvc[1]);
+
+            mvcost [1] [i] = cost1 + vp8_cost_zero(mvc[1].prob[MVPsign]);
+            mvcost [1] [-i] = cost1 + vp8_cost_one(mvc[1].prob[MVPsign]);  /* negative index: mvcost[1] must have mv_max entries before it */
+        }
+        while (++i <= mv_max);
+    }
+}
+
+
+/* Motion vector probability table update depends on benefit.
+ * Small correction allows for the fact that an update to an MV probability
+ * may have benefit in subsequent frames as well as the current one.
+ */
+#define MV_PROB_UPDATE_CORRECTION   -1
+
+
+static void calc_prob(vp8_prob *p, const unsigned int ct[2])
+{   /* Set *p to ct[0]/(ct[0]+ct[1]) scaled to 255; *p is left unchanged when both counts are 0. */
+    const unsigned int tot = ct[0] + ct[1];
+
+    if (tot)
+    {
+        const vp8_prob x = ((ct[0] * 255) / tot) & -2;  /* low bit cleared: update() transmits only new_p >> 1 */
+        *p = x ? x : 1;  /* never store 0 */
+    }
+}
+
+static void update(
+    vp8_writer *const w,
+    const unsigned int ct[2],
+    vp8_prob *const cur_p,
+    const vp8_prob new_p,
+    const vp8_prob update_p,
+    int *updated
+)
+{   /* Write an update flag; when the saving exceeds the cost, also write new_p, store it in *cur_p and set *updated. */
+    const int cur_b = vp8_cost_branch(ct, *cur_p);  /* cost of the counts under the current probability */
+    const int new_b = vp8_cost_branch(ct, new_p);  /* cost of the counts under the candidate probability */
+    const int cost = 7 + MV_PROB_UPDATE_CORRECTION + ((vp8_cost_one(update_p) - vp8_cost_zero(update_p) + 128) >> 8);
+
+    if (cur_b - new_b > cost)
+    {
+        *cur_p = new_p;
+        vp8_write(w, 1, update_p);  /* flag 1: an update follows */
+        vp8_write_literal(w, new_p >> 1, 7);  /* only the top 7 bits of new_p are sent */
+        *updated = 1;  /* *updated is never cleared here, only set */
+
+    }
+    else
+        vp8_write(w, 0, update_p);  /* flag 0: keep the current probability */
+}
+
+static void write_component_probs(
+    vp8_writer *const w,
+    struct mv_context *cur_mvc,
+    const struct mv_context *default_mvc_,
+    const struct mv_context *update_mvc,
+    const unsigned int events [MVvals],
+    unsigned int rc,
+    int *updated
+)
+{   /* Gather branch counts from the events histogram, derive candidate probabilities, and write any updates. */
+    vp8_prob *Pcur = cur_mvc->prob;
+    const vp8_prob *default_mvc = default_mvc_->prob;
+    const vp8_prob *Pupdate = update_mvc->prob;  /* advanced once per update() call below */
+    unsigned int is_short_ct[2], sign_ct[2];
+
+    unsigned int bit_ct [mvlong_width] [2];
+
+    unsigned int short_ct  [mvnum_short];
+    unsigned int short_bct [mvnum_short-1] [2];
+
+    vp8_prob Pnew [MVPcount];
+
+    (void) rc;  /* rc is not used in this function */
+    vp8_copy_array(Pnew, default_mvc, MVPcount);  /* start from the defaults; calc_prob skips entries with no counts */
+
+    vp8_zero(is_short_ct)
+    vp8_zero(sign_ct)
+    vp8_zero(bit_ct)
+    vp8_zero(short_ct)
+    vp8_zero(short_bct)
+
+
+    /* j=0 */
+    {
+        const int c = events [mv_max];  /* events[mv_max] is the count for value 0 */
+
+        is_short_ct [0] += c;     /* Short vector */
+        short_ct [0] += c;       /* Magnitude distribution */
+    }
+
+    /* j: 1 ~ mv_max (1023) */
+    {
+        int j = 1;
+
+        do
+        {
+            const int c1 = events [mv_max + j];  /* positive */
+            const int c2 = events [mv_max - j];  /* negative */
+            const int c  = c1 + c2;
+            int a = j;  /* magnitude shared by both signs */
+
+            sign_ct [0] += c1;
+            sign_ct [1] += c2;
+
+            if (a < mvnum_short)
+            {
+                is_short_ct [0] += c;     /* Short vector */
+                short_ct [a] += c;       /* Magnitude distribution */
+            }
+            else
+            {
+                int k = mvlong_width - 1;
+                is_short_ct [1] += c;     /* Long vector */
+
+                /*  bit 3 not always encoded. */
+                do  /* every bit position is counted here, bit 3 included */
+                    bit_ct [k] [(a >> k) & 1] += c;
+
+                while (--k >= 0);
+            }
+        }
+        while (++j <= mv_max);
+    }
+
+    calc_prob(Pnew + mvpis_short, is_short_ct);
+
+    calc_prob(Pnew + MVPsign, sign_ct);
+
+    {
+        vp8_prob p [mvnum_short - 1];    /* actually only need branch ct */
+        int j = 0;
+
+        vp8_tree_probs_from_distribution(
+            8, vp8_small_mvencodings, vp8_small_mvtree,
+            p, short_bct, short_ct,
+            256, 1
+        );  /* presumably fills short_bct from short_ct -- TODO confirm; p is not read afterwards */
+
+        do
+            calc_prob(Pnew + MVPshort + j, short_bct[j]);
+
+        while (++j < mvnum_short - 1);
+    }
+
+    {
+        int j = 0;
+
+        do
+            calc_prob(Pnew + MVPbits + j, bit_ct[j]);
+
+        while (++j < mvlong_width);
+    }
+
+    update(w, is_short_ct, Pcur + mvpis_short, Pnew[mvpis_short], *Pupdate++, updated);  /* 1st: short/long flag */
+
+    update(w, sign_ct, Pcur + MVPsign, Pnew[MVPsign], *Pupdate++, updated);  /* 2nd: sign */
+
+    {
+        const vp8_prob *const new_p = Pnew + MVPshort;
+        vp8_prob *const cur_p = Pcur + MVPshort;
+
+        int j = 0;
+
+        do  /* 3rd: the mvnum_short-1 short-tree probabilities */
+
+            update(w, short_bct[j], cur_p + j, new_p[j], *Pupdate++, updated);
+
+        while (++j < mvnum_short - 1);
+    }
+
+    {
+        const vp8_prob *const new_p = Pnew + MVPbits;
+        vp8_prob *const cur_p = Pcur + MVPbits;
+
+        int j = 0;
+
+        do  /* 4th: the mvlong_width long-bit probabilities */
+
+            update(w, bit_ct[j], cur_p + j, new_p[j], *Pupdate++, updated);
+
+        while (++j < mvlong_width);
+    }
+}
+
+void vp8_write_mvprobs(VP8_COMP *cpi)
+{   /* Write probability updates for both MV components, then rebuild the cost table of any component that changed. */
+    vp8_writer *const w  = cpi->bc;
+    MV_CONTEXT *mvc = cpi->common.fc.mvc;
+    int flags[2] = {0, 0};  /* flags[c] becomes 1 when update() changes a probability of component c */
+#ifdef VP8_ENTROPY_STATS
+    active_section = 4;
+#endif
+    write_component_probs(
+        w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0],
+        cpi->mb.MVcount[0], 0, &flags[0]
+    );
+    write_component_probs(
+        w, &mvc[1], &vp8_default_mv_context[1], &vp8_mv_update_probs[1],
+        cpi->mb.MVcount[1], 1, &flags[1]
+    );
+
+    if (flags[0] || flags[1])  /* the callee rebuilds only the components whose flag is set */
+        vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags);
+
+#ifdef VP8_ENTROPY_STATS
+    active_section = 5;
+#endif
+}
diff --git a/libs/libvpx/vp8/encoder/encodemv.h b/libs/libvpx/vp8/encoder/encodemv.h
new file mode 100644
index 0000000000..722162ba21
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/encodemv.h
@@ -0,0 +1,29 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_ENCODEMV_H_
+#define VP8_ENCODER_ENCODEMV_H_
+
+#include "onyx_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_write_mvprobs(VP8_COMP *);  /* writes MV probability updates for both components */
+void vp8_encode_motion_vector(vp8_writer *, const MV *, const MV_CONTEXT *);  /* writes row, then column */
+void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]);  /* mvcost[c] is indexed from -mv_max to mv_max */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_ENCODEMV_H_
diff --git a/libs/libvpx/vp8/encoder/ethreading.c b/libs/libvpx/vp8/encoder/ethreading.c
new file mode 100644
index 0000000000..4f689c4bc7
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/ethreading.c
@@ -0,0 +1,680 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "onyx_int.h"
+#include "vp8/common/threading.h"
+#include "vp8/common/common.h"
+#include "vp8/common/extend.h"
+#include "bitstream.h"
+#include "encodeframe.h"
+
+#if CONFIG_MULTITHREAD
+
+extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip);
+
+static THREAD_FUNCTION thread_loopfilter(void *p_data)
+{   /* Worker loop: wait on h_event_start_lpf, run vp8_loopfilter_frame, post h_event_end_lpf. */
+    VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
+    VP8_COMMON *cm = &cpi->common;
+
+    while (1)
+    {
+        if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0)
+            break;  /* b_multi_threaded reads 0: leave the loop */
+
+        if (sem_wait(&cpi->h_event_start_lpf) == 0)  /* proceed only when the wait returned 0 */
+        {
+            /* we're shutting down */
+            if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0)
+                break;  /* checked again because the flag may have changed during the wait */
+
+            vp8_loopfilter_frame(cpi, cm);
+
+            sem_post(&cpi->h_event_end_lpf);  /* signal that the loop filter call has returned */
+        }
+    }
+
+    return 0;
+}
+
+static
+THREAD_FUNCTION thread_encoding_proc(void *p_data)
+{   /* Worker loop: on each start signal, encode macroblock rows ithread+1, then every (encoding_thread_count+1)-th row after it. */
+    int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
+    VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
+    MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
+    ENTROPY_CONTEXT_PLANES mb_row_left_context;  /* per-thread left context, zeroed at the start of each row */
+
+    while (1)
+    {
+        if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0)
+            break;  /* b_multi_threaded reads 0: leave the loop */
+
+        if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0)  /* proceed only when the wait returned 0 */
+        {
+            const int nsync = cpi->mt_sync_range;
+            VP8_COMMON *cm = &cpi->common;
+            int mb_row;
+            MACROBLOCK *x = &mbri->mb;
+            MACROBLOCKD *xd = &x->e_mbd;
+            TOKENEXTRA *tp ;
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+            TOKENEXTRA *tp_start = cpi->tok + (1 + ithread) * (16 * 24);
+            const int num_part = (1 << cm->multi_token_partition);
+#endif
+
+            int *segment_counts = mbri->segment_counts;
+            int *totalrate = &mbri->totalrate;
+
+            /* we're shutting down */
+            if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0)
+                break;
+
+            xd->mode_info_context = cm->mi + cm->mode_info_stride *
+                (ithread + 1);  /* mode info of the first row this thread handles */
+            xd->mode_info_stride = cm->mode_info_stride;
+
+            for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
+            {
+
+                int recon_yoffset, recon_uvoffset;
+                int mb_col;
+                int ref_fb_idx = cm->lst_fb_idx;
+                int dst_fb_idx = cm->new_fb_idx;
+                int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+                int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+                int map_index = (mb_row * cm->mb_cols);  /* index of this row's first macroblock in the per-MB maps */
+                const int *last_row_current_mb_col;
+                int *current_mb_col = &cpi->mt_current_mb_col[mb_row];  /* progress counter published for this row */
+
+#if  (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+                vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)];
+#else
+                tp = cpi->tok + (mb_row * (cm->mb_cols * 16 * 24));
+                cpi->tplist[mb_row].start = tp;
+#endif
+
+                last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];  /* progress counter of the row above */
+
+                /* reset above block coeffs */
+                xd->above_context = cm->above_context;
+                xd->left_context = &mb_row_left_context;
+
+                vp8_zero(mb_row_left_context);
+
+                xd->up_available = (mb_row != 0);
+                recon_yoffset = (mb_row * recon_y_stride * 16);
+                recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+                /* Set the mb activity pointer to the start of the row. */
+                x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+                /* for each macroblock col in image */
+                for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+                {
+                    if (((mb_col - 1) % nsync) == 0) {  /* publish this row's progress (mb_col - 1) */
+                        pthread_mutex_t *mutex = &cpi->pmutex[mb_row];
+                        protected_write(mutex, current_mb_col, mb_col - 1);
+                    }
+
+                    if (mb_row && !(mb_col & (nsync - 1))) {  /* presumably waits for the row above to be far enough ahead -- TODO confirm sync_read */
+                      pthread_mutex_t *mutex = &cpi->pmutex[mb_row-1];
+                      sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
+                    }
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                    tp = tp_start;
+#endif
+
+                    /* Distance of Mb to the various image edges.
+                     * These specified to 8th pel as they are always compared
+                     * to values that are in 1/8th pel units
+                     */
+                    xd->mb_to_left_edge = -((mb_col * 16) << 3);
+                    xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+                    xd->mb_to_top_edge = -((mb_row * 16) << 3);
+                    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+
+                    /* Set up limit values for motion vectors used to prevent
+                     * them extending outside the UMV borders
+                     */
+                    x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+                    x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+                    x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+                    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+                    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+                    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+                    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+                    xd->left_available = (mb_col != 0);
+
+                    x->rddiv = cpi->RDDIV;
+                    x->rdmult = cpi->RDMULT;
+
+                    /* Copy current mb to a buffer */
+                    vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+                    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+                        vp8_activity_masking(cpi, x);
+
+                    /* Is segmentation enabled */
+                    /* MB level adjustment to quantizer */
+                    if (xd->segmentation_enabled)
+                    {
+                        /* Code to set segment id in xd->mbmi.segment_id for
+                         * current MB (with range checking)
+                         */
+                        if (cpi->segmentation_map[map_index + mb_col] <= 3)
+                            xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[map_index + mb_col];
+                        else
+                            xd->mode_info_context->mbmi.segment_id = 0;  /* out-of-range map entry: fall back to segment 0 */
+
+                        vp8cx_mb_init_quantizer(cpi, x, 1);
+                    }
+                    else
+                        /* Set to Segment 0 by default */
+                        xd->mode_info_context->mbmi.segment_id = 0;
+
+                    x->active_ptr = cpi->active_map + map_index + mb_col;
+
+                    if (cm->frame_type == KEY_FRAME)
+                    {
+                        *totalrate += vp8cx_encode_intra_macroblock(cpi, x, &tp);
+#ifdef MODE_STATS
+                        y_modes[xd->mbmi.mode] ++;
+#endif
+                    }
+                    else
+                    {
+                        *totalrate += vp8cx_encode_inter_macroblock(cpi, x, &tp, recon_yoffset, recon_uvoffset, mb_row, mb_col);
+
+#ifdef MODE_STATS
+                        inter_y_modes[xd->mbmi.mode] ++;
+
+                        if (xd->mbmi.mode == SPLITMV)
+                        {
+                            int b;
+
+                            for (b = 0; b < xd->mbmi.partition_count; b++)
+                            {
+                                inter_b_modes[x->partition->bmi[b].mode] ++;
+                            }
+                        }
+
+#endif
+                        // Keep track of how many (consecutive) times a  block
+                        // is coded as ZEROMV_LASTREF, for base layer frames.
+                        // Reset to 0 if its coded as anything else.
+                        if (cpi->current_layer == 0) {
+                          if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+                              xd->mode_info_context->mbmi.ref_frame ==
+                                  LAST_FRAME) {
+                            // Increment, check for wrap-around.
+                            if (cpi->consec_zero_last[map_index+mb_col] < 255)
+                              cpi->consec_zero_last[map_index+mb_col] += 1;
+                            if (cpi->consec_zero_last_mvbias[map_index+mb_col] < 255)
+                              cpi->consec_zero_last_mvbias[map_index+mb_col] += 1;
+                          } else {
+                            cpi->consec_zero_last[map_index+mb_col] = 0;
+                            cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
+                          }
+                          if (x->zero_last_dot_suppress)
+                            cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
+                        }
+
+                        /* Special case code for cyclic refresh
+                         * If cyclic update enabled then copy
+                         * xd->mbmi.segment_id; (which may have been updated
+                         * based on mode during
+                         * vp8cx_encode_inter_macroblock()) back into the
+                         * global segmentation map
+                         */
+                        if ((cpi->current_layer == 0) &&
+                            (cpi->cyclic_refresh_mode_enabled &&
+                             xd->segmentation_enabled))
+                        {
+                            const MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+                            cpi->segmentation_map[map_index + mb_col] = mbmi->segment_id;
+
+                            /* If the block has been refreshed mark it as clean
+                             * (the magnitude of the -ve influences how long it
+                             * will be before we consider another refresh):
+                             * Else if it was coded (last frame 0,0) and has
+                             * not already been refreshed then mark it as a
+                             * candidate for cleanup next time (marked 0) else
+                             * mark it as dirty (1).
+                             */
+                            if (mbmi->segment_id)
+                                cpi->cyclic_refresh_map[map_index + mb_col] = -1;
+                            else if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
+                            {
+                                if (cpi->cyclic_refresh_map[map_index + mb_col] == 1)
+                                    cpi->cyclic_refresh_map[map_index + mb_col] = 0;
+                            }
+                            else
+                                cpi->cyclic_refresh_map[map_index + mb_col] = 1;
+
+                        }
+                    }
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                    /* pack tokens for this MB */
+                    {
+                        int tok_count = tp - tp_start;
+                        vp8_pack_tokens(w, tp_start, tok_count);
+                    }
+#else
+                    cpi->tplist[mb_row].stop = tp;
+#endif
+                    /* Increment pointer into gf usage flags structure. */
+                    x->gf_active_ptr++;
+
+                    /* Increment the activity mask pointers. */
+                    x->mb_activity_ptr++;
+
+                    /* adjust to the next column of macroblocks */
+                    x->src.y_buffer += 16;
+                    x->src.u_buffer += 8;
+                    x->src.v_buffer += 8;
+
+                    recon_yoffset += 16;
+                    recon_uvoffset += 8;
+
+                    /* Keep track of segment usage */
+                    segment_counts[xd->mode_info_context->mbmi.segment_id]++;
+
+                    /* skip to next mb */
+                    xd->mode_info_context++;
+                    x->partition_info++;
+                    xd->above_context++;
+                }
+
+                vp8_extend_mb_row( &cm->yv12_fb[dst_fb_idx],
+                                    xd->dst.y_buffer + 16,
+                                    xd->dst.u_buffer + 8,
+                                    xd->dst.v_buffer + 8);
+
+                protected_write(&cpi->pmutex[mb_row], current_mb_col,
+                                mb_col + nsync);  /* row done: mb_col equals cm->mb_cols here, so this is past the last column */
+
+                /* this is to account for the border */
+                xd->mode_info_context++;
+                x->partition_info++;
+
+                x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+                x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+                x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+                xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;  /* skip the rows handled by the other threads */
+                x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;
+                x->gf_active_ptr   += cm->mb_cols * cpi->encoding_thread_count;
+
+                if (mb_row == cm->mb_rows - 1)  /* only the thread that encoded the last row posts */
+                {
+                    sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
+                }
+            }
+        }
+    }
+
+    /* printf("exit thread %d\n", ithread); */
+    return 0;
+}
+
+static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
+{
+    /* Copy the fields below from mbsrc to mbdst, then clear mbdst->error_bins. */
+    MACROBLOCK *x = mbsrc;
+    MACROBLOCK *z = mbdst;
+    int i;
+
+    z->ss               = x->ss;
+    z->ss_count          = x->ss_count;
+    z->searches_per_step  = x->searches_per_step;
+    z->errorperbit      = x->errorperbit;
+
+    z->sadperbit16      = x->sadperbit16;
+    z->sadperbit4       = x->sadperbit4;
+
+    /*
+    z->mv_col_min    = x->mv_col_min;
+    z->mv_col_max    = x->mv_col_max;
+    z->mv_row_min    = x->mv_row_min;
+    z->mv_row_max    = x->mv_row_max;
+    */
+
+    z->short_fdct4x4     = x->short_fdct4x4;
+    z->short_fdct8x4     = x->short_fdct8x4;
+    z->short_walsh4x4    = x->short_walsh4x4;
+    z->quantize_b        = x->quantize_b;
+    z->optimize          = x->optimize;
+
+    /*
+    z->mvc              = x->mvc;
+    z->src.y_buffer      = x->src.y_buffer;
+    z->src.u_buffer      = x->src.u_buffer;
+    z->src.v_buffer      = x->src.v_buffer;
+    */
+
+    z->mvcost[0] =  x->mvcost[0];
+    z->mvcost[1] =  x->mvcost[1];
+    z->mvsadcost[0] =  x->mvsadcost[0];
+    z->mvsadcost[1] =  x->mvsadcost[1];
+
+    z->token_costs = x->token_costs;
+    z->inter_bmode_costs = x->inter_bmode_costs;
+    z->mbmode_cost = x->mbmode_cost;
+    z->intra_uv_mode_cost = x->intra_uv_mode_cost;
+    z->bmode_costs = x->bmode_costs;
+
+    for (i = 0; i < 25; i++)
+    {
+        z->block[i].quant           = x->block[i].quant;
+        z->block[i].quant_fast      = x->block[i].quant_fast;
+        z->block[i].quant_shift     = x->block[i].quant_shift;
+        z->block[i].zbin            = x->block[i].zbin;
+        z->block[i].zrun_zbin_boost = x->block[i].zrun_zbin_boost;
+        z->block[i].round           = x->block[i].round;
+        z->block[i].src_stride      = x->block[i].src_stride;
+    }
+
+    z->q_index           = x->q_index;
+    z->act_zbin_adj      = x->act_zbin_adj;
+    z->last_act_zbin_adj = x->last_act_zbin_adj;
+
+    {
+        MACROBLOCKD *xd = &x->e_mbd;
+        MACROBLOCKD *zd = &z->e_mbd;
+
+        /*
+        zd->mode_info_context = xd->mode_info_context;
+        zd->mode_info        = xd->mode_info;
+
+        zd->mode_info_stride  = xd->mode_info_stride;
+        zd->frame_type       = xd->frame_type;
+        zd->up_available     = xd->up_available   ;
+        zd->left_available   = xd->left_available;
+        zd->left_context     = xd->left_context;
+        zd->last_frame_dc     = xd->last_frame_dc;
+        zd->last_frame_dccons = xd->last_frame_dccons;
+        zd->gold_frame_dc     = xd->gold_frame_dc;
+        zd->gold_frame_dccons = xd->gold_frame_dccons;
+        zd->mb_to_left_edge    = xd->mb_to_left_edge;
+        zd->mb_to_right_edge   = xd->mb_to_right_edge;
+        zd->mb_to_top_edge     = xd->mb_to_top_edge   ;
+        zd->mb_to_bottom_edge  = xd->mb_to_bottom_edge;
+        zd->gf_active_ptr     = xd->gf_active_ptr;
+        zd->frames_since_golden       = xd->frames_since_golden;
+        zd->frames_till_alt_ref_frame   = xd->frames_till_alt_ref_frame;
+        */
+        zd->subpixel_predict         = xd->subpixel_predict;
+        zd->subpixel_predict8x4      = xd->subpixel_predict8x4;
+        zd->subpixel_predict8x8      = xd->subpixel_predict8x8;
+        zd->subpixel_predict16x16    = xd->subpixel_predict16x16;
+        zd->segmentation_enabled     = xd->segmentation_enabled;
+        zd->mb_segement_abs_delta      = xd->mb_segement_abs_delta;
+        memcpy(zd->segment_feature_data, xd->segment_feature_data,
+               sizeof(xd->segment_feature_data));
+
+        memcpy(zd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+        memcpy(zd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+        memcpy(zd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+        memcpy(zd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+
+#if 1
+        /*TODO:  Remove dequant from BLOCKD.  This is a temporary solution until
+         * the quantizer code uses a passed in pointer to the dequant constants.
+         * This will also require modifications to the x86 and neon assembly.
+         * */
+        for (i = 0; i < 16; i++)
+            zd->block[i].dequant = zd->dequant_y1;
+        for (i = 16; i < 24; i++)
+            zd->block[i].dequant = zd->dequant_uv;
+        zd->block[24].dequant = zd->dequant_y2;
+#endif
+
+
+        memcpy(z->rd_threshes, x->rd_threshes, sizeof(x->rd_threshes));
+        memcpy(z->rd_thresh_mult, x->rd_thresh_mult, sizeof(x->rd_thresh_mult));
+
+        z->zbin_over_quant = x->zbin_over_quant;
+        z->zbin_mode_boost_enabled = x->zbin_mode_boost_enabled;
+        z->zbin_mode_boost = x->zbin_mode_boost;
+
+        memset(z->error_bins, 0, sizeof(z->error_bins)); /* not copied: zeroed */
+    }
+}
+
+/* Initialise worker contexts mbr_ei[0..count-1] from the main context x:
+ * copy predictors and quantizer state, start worker i (i + 1) macroblock
+ * rows into the source frame, and zero its statistics counters. */
+void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x,
+                               MB_ROW_COMP *mbr_ei, int count)
+{
+    VP8_COMMON *const cm = & cpi->common;
+    MACROBLOCKD *const xd = & x->e_mbd;
+    int i;
+
+    for (i = 0; i < count; i++)
+    {
+        MACROBLOCK *mb = & mbr_ei[i].mb;
+        MACROBLOCKD *mbd = &mb->e_mbd;
+
+        mbd->subpixel_predict        = xd->subpixel_predict;
+        mbd->subpixel_predict8x4     = xd->subpixel_predict8x4;
+        mbd->subpixel_predict8x8     = xd->subpixel_predict8x8;
+        mbd->subpixel_predict16x16   = xd->subpixel_predict16x16;
+        mb->gf_active_ptr            = x->gf_active_ptr;
+
+        memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
+        mbr_ei[i].totalrate = 0;
+
+        mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1);
+        mbd->frame_type = cm->frame_type;
+
+        mb->src = * cpi->Source;
+        mbd->pre = cm->yv12_fb[cm->lst_fb_idx];
+        mbd->dst = cm->yv12_fb[cm->new_fb_idx];
+
+        mb->src.y_buffer += 16 * x->src.y_stride * (i + 1);
+        mb->src.u_buffer +=  8 * x->src.uv_stride * (i + 1);
+        mb->src.v_buffer +=  8 * x->src.uv_stride * (i + 1);
+
+        vp8_build_block_offsets(mb);
+
+        mbd->left_context = &cm->left_context;
+        mb->mvc = cm->fc.mvc;
+        setup_mbby_copy(&mbr_ei[i].mb, x);
+
+        mbd->fullpixel_mask = 0xffffffff;
+        if(cm->full_pixel)
+            mbd->fullpixel_mask = 0xfffffff8;
+
+        vp8_zero(mb->coef_counts);
+        /* x->ymode_count is cleared as before; the worker's own ymode_count
+         * must be cleared too or its counts carry over between calls. */
+        vp8_zero(x->ymode_count);
+        vp8_zero(mb->ymode_count);
+        mb->skip_true_count = 0;
+        vp8_zero(mb->MVcount);
+        mb->prediction_error = 0;
+        mb->intra_error = 0;
+        vp8_zero(mb->count_mb_ref_frame_usage);
+        mb->mbs_tested_so_far = 0;
+        mb->mbs_zero_last_dot_suppress = 0;
+    }
+}
+
+/* Create the encoder worker threads and the loop-filter thread.  Returns 0 on
+ * success or when threading is not used, -1 if a worker thread and -2 if the
+ * loop-filter thread cannot be created (all resources are then released). */
+int vp8cx_create_encoder_threads(VP8_COMP *cpi)
+{
+    const VP8_COMMON * cm = &cpi->common;
+
+    cpi->b_multi_threaded = 0;
+    cpi->encoding_thread_count = 0;
+    cpi->b_lpf_running = 0;
+
+    pthread_mutex_init(&cpi->mt_mutex, NULL);
+
+    if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
+    {
+        int ithread;
+        int th_count = cpi->oxcf.multi_threaded - 1;
+        int rc = 0;
+
+        /* don't allocate more threads than cores available */
+        if (cpi->oxcf.multi_threaded > cm->processor_core_count)
+            th_count = cm->processor_core_count - 1;
+
+        /* we have th_count + 1 (main) threads processing one row each */
+        /* no point to have more threads than the sync range allows */
+        if(th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1))
+        {
+            th_count = (cm->mb_cols / cpi->mt_sync_range) - 1;
+        }
+
+        if(th_count <= 0) /* also covers a negative result of the clamp above */
+            return 0;
+
+        CHECK_MEM_ERROR(cpi->h_encoding_thread,
+                        vpx_malloc(sizeof(pthread_t) * th_count));
+        CHECK_MEM_ERROR(cpi->h_event_start_encoding,
+                        vpx_malloc(sizeof(sem_t) * th_count));
+        CHECK_MEM_ERROR(cpi->mb_row_ei,
+                        vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
+        memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
+        CHECK_MEM_ERROR(cpi->en_thread_data,
+                        vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
+
+        sem_init(&cpi->h_event_end_encoding, 0, 0);
+
+        cpi->b_multi_threaded = 1;
+        cpi->encoding_thread_count = th_count;
+
+        for (ithread = 0; ithread < th_count; ithread++)
+        {
+            ENCODETHREAD_DATA *ethd = &cpi->en_thread_data[ithread];
+
+            /* Setup block ptrs and offsets */
+            vp8_setup_block_ptrs(&cpi->mb_row_ei[ithread].mb);
+            vp8_setup_block_dptrs(&cpi->mb_row_ei[ithread].mb.e_mbd);
+
+            sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
+
+            ethd->ithread = ithread;
+            ethd->ptr1 = (void *)cpi;
+            ethd->ptr2 = (void *)&cpi->mb_row_ei[ithread];
+
+            rc = pthread_create(&cpi->h_encoding_thread[ithread], 0,
+                                thread_encoding_proc, ethd);
+            if(rc)
+                break;
+        }
+
+        if(rc)
+        {
+            /* shutdown other threads */
+            protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
+            sem_destroy(&cpi->h_event_start_encoding[ithread]); /* unused */
+            for(--ithread; ithread >= 0; ithread--)
+            {
+                /* wake the worker first, as the other shutdown paths do */
+                sem_post(&cpi->h_event_start_encoding[ithread]);
+                pthread_join(cpi->h_encoding_thread[ithread], 0);
+                sem_destroy(&cpi->h_event_start_encoding[ithread]);
+            }
+            sem_destroy(&cpi->h_event_end_encoding);
+
+            /* free thread related resources */
+            vpx_free(cpi->h_event_start_encoding);
+            vpx_free(cpi->h_encoding_thread);
+            vpx_free(cpi->mb_row_ei);
+            vpx_free(cpi->en_thread_data);
+
+            pthread_mutex_destroy(&cpi->mt_mutex);
+
+            return -1;
+        }
+
+        {
+            LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data;
+
+            sem_init(&cpi->h_event_start_lpf, 0, 0);
+            sem_init(&cpi->h_event_end_lpf, 0, 0);
+
+            lpfthd->ptr1 = (void *)cpi;
+            rc = pthread_create(&cpi->h_filter_thread, 0, thread_loopfilter,
+                                lpfthd);
+
+            if(rc)
+            {
+                /* shutdown other threads */
+                protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
+                for(--ithread; ithread >= 0; ithread--)
+                {
+                    sem_post(&cpi->h_event_start_encoding[ithread]);
+                    pthread_join(cpi->h_encoding_thread[ithread], 0);
+                    sem_destroy(&cpi->h_event_start_encoding[ithread]);
+                }
+                sem_destroy(&cpi->h_event_end_encoding);
+                sem_destroy(&cpi->h_event_end_lpf);
+                sem_destroy(&cpi->h_event_start_lpf);
+
+                /* free thread related resources */
+                vpx_free(cpi->h_event_start_encoding);
+                vpx_free(cpi->h_encoding_thread);
+                vpx_free(cpi->mb_row_ei);
+                vpx_free(cpi->en_thread_data);
+
+                pthread_mutex_destroy(&cpi->mt_mutex);
+
+                return -2;
+            }
+        }
+    }
+    return 0;
+}
+
+/* Stop and join the encoder worker threads and the loop-filter thread, then
+ * free the threading resources.  The mutex is destroyed in every case. */
+void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
+{
+    if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded))
+    {
+        int t = 0;
+        /* clear the flag first, then wake and join every encoder thread */
+        protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
+
+        while (t < cpi->encoding_thread_count)
+        {
+            sem_post(&cpi->h_event_start_encoding[t]);
+            pthread_join(cpi->h_encoding_thread[t], 0);
+            sem_destroy(&cpi->h_event_start_encoding[t]);
+            t++;
+        }
+
+        /* same sequence for the loop-filter thread */
+        sem_post(&cpi->h_event_start_lpf);
+        pthread_join(cpi->h_filter_thread, 0);
+        sem_destroy(&cpi->h_event_end_encoding);
+        sem_destroy(&cpi->h_event_end_lpf);
+        sem_destroy(&cpi->h_event_start_lpf);
+
+        /* release the per-thread arrays */
+        vpx_free(cpi->h_event_start_encoding);
+        vpx_free(cpi->h_encoding_thread);
+        vpx_free(cpi->mb_row_ei);
+        vpx_free(cpi->en_thread_data);
+    }
+    pthread_mutex_destroy(&cpi->mt_mutex);
+}
+#endif
diff --git a/libs/libvpx/vp8/encoder/firstpass.c b/libs/libvpx/vp8/encoder/firstpass.c
new file mode 100644
index 0000000000..4c2acc7745
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/firstpass.c
@@ -0,0 +1,3368 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "block.h"
+#include "onyx_int.h"
+#include "vpx_dsp/variance.h"
+#include "encodeintra.h"
+#include "vp8/common/setupintrarecon.h"
+#include "vp8/common/systemdependent.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "vpx_scale/vpx_scale.h"
+#include "encodemb.h"
+#include "vp8/common/extend.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "rdopt.h"
+#include "vp8/common/quant_common.h"
+#include "encodemv.h"
+#include "encodeframe.h"
+
+/* #define OUTPUT_FPF 1 */
+
+extern void vp8cx_frame_init_quantizer(VP8_COMP *cpi);
+
+#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q]
+extern int vp8_kf_boost_qadjustment[QINDEX_RANGE];
+
+extern const int vp8_gf_boost_qadjustment[QINDEX_RANGE];
+
+#define IIFACTOR   1.5
+#define IIKFACTOR1 1.40
+#define IIKFACTOR2 1.5
+#define RMAX       14.0
+#define GF_RMAX    48.0
+
+#define KF_MB_INTRA_MIN 300
+#define GF_MB_INTRA_MIN 200
+
+#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+
+#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
+#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
+
+#define NEW_BOOST 1
+
+static int vscale_lookup[7] = {0, 1, 1, 2, 2, 3, 3};
+static int hscale_lookup[7] = {0, 0, 1, 1, 2, 2, 3};
+
+
+static const int cq_level[QINDEX_RANGE] =
+{
+    0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
+    9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
+    20,21,22,22,23,24,24,25,26,27,27,28,29,30,30,31,
+    32,33,33,34,35,36,36,37,38,39,39,40,41,42,42,43,
+    44,45,46,46,47,48,49,50,50,51,52,53,54,55,55,56,
+    57,58,59,60,60,61,62,63,64,65,66,67,67,68,69,70,
+    71,72,73,74,75,75,76,77,78,79,80,81,82,83,84,85,
+    86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
+};
+
+static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
+
+/* Repositions the first-pass stats reader: the next stats entry read through
+ * cpi->twopass.stats_in is the one that Position points to.
+ */
+static void reset_fpf_position(VP8_COMP *cpi, FIRSTPASS_STATS *Position)
+{
+    cpi->twopass.stats_in = Position;
+}
+
+static int lookup_next_frame_stats(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+{
+    /* Peek at the next stats entry without consuming it; EOF when none left. */
+    const int exhausted = cpi->twopass.stats_in >= cpi->twopass.stats_in_end;
+
+    if (!exhausted) *next_frame = *cpi->twopass.stats_in;
+    return exhausted ? EOF : 1;
+}
+
+/* Copy the stats entry `offset` entries away from the current read position
+ * into frame_stats without moving that position.  Returns 1 on success and
+ * EOF when the entry would lie outside the stats buffer. */
+static int read_frame_stats( VP8_COMP *cpi, FIRSTPASS_STATS *frame_stats,
+                             int offset )
+{
+    FIRSTPASS_STATS *const wanted = &cpi->twopass.stats_in[offset];
+    int out_of_range;
+
+    /* forward offsets are checked against the end of the buffer,
+     * backward offsets against its start */
+    if ( offset < 0 )
+        out_of_range = ( wanted < cpi->twopass.stats_in_start );
+    else
+        out_of_range = ( wanted >= cpi->twopass.stats_in_end );
+
+    if ( out_of_range )
+        return EOF;
+
+    *frame_stats = *wanted;
+    return 1;
+}
+
+static int input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
+{
+    /* Return the next stats entry in *fps and advance the read position. */
+    FIRSTPASS_STATS *const cur = cpi->twopass.stats_in;
+    if (cur >= cpi->twopass.stats_in_end)
+        return EOF;
+    *fps = *cur;
+    cpi->twopass.stats_in = cur + 1;
+    return 1;
+}
+
+/* Queue *stats on pktlist as a VPX_CODEC_STATS_PKT packet (cpi is unused). */
+static void output_stats(const VP8_COMP            *cpi,
+                         struct vpx_codec_pkt_list *pktlist,
+                         FIRSTPASS_STATS            *stats)
+{
+    struct vpx_codec_cx_pkt pkt;
+    (void)cpi;
+    pkt.kind = VPX_CODEC_STATS_PKT;
+    pkt.data.twopass_stats.buf = stats;
+    pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+    vpx_codec_pkt_list_add(pktlist, &pkt);
+
+#if OUTPUT_FPF /* TEMP debug code: append the stats to firstpass.stt */
+    {
+        FILE *fpfile = fopen("firstpass.stt", "a");
+        if (fpfile == NULL) /* dump file unavailable: skip the debug output */
+            return;
+
+        fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f"
+                " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
+                " %12.0f %12.0f %12.4f\n",
+                stats->frame,
+                stats->intra_error,
+                stats->coded_error,
+                stats->ssim_weighted_pred_err,
+                stats->pcnt_inter,
+                stats->pcnt_motion,
+                stats->pcnt_second_ref,
+                stats->pcnt_neutral,
+                stats->MVr,
+                stats->mvr_abs,
+                stats->MVc,
+                stats->mvc_abs,
+                stats->MVrv,
+                stats->MVcv,
+                stats->mv_in_out_count,
+                stats->new_mv_count,
+                stats->count,
+                stats->duration);
+        fclose(fpfile);
+    }
+#endif
+}
+
+static void zero_stats(FIRSTPASS_STATS *section)
+{ /* Reset the stats fields listed below to 0.0; duration is set to 1.0. */
+    section->frame      = 0.0;
+    section->intra_error = 0.0;
+    section->coded_error = 0.0;
+    section->ssim_weighted_pred_err = 0.0;
+    section->pcnt_inter  = 0.0;
+    section->pcnt_motion  = 0.0;
+    section->pcnt_second_ref = 0.0;
+    section->pcnt_neutral = 0.0;
+    section->MVr        = 0.0;
+    section->mvr_abs     = 0.0;
+    section->MVc        = 0.0;
+    section->mvc_abs     = 0.0;
+    section->MVrv       = 0.0;
+    section->MVcv       = 0.0;
+    section->mv_in_out_count  = 0.0;
+    section->new_mv_count = 0.0;
+    section->count      = 0.0;
+    section->duration   = 1.0; /* NOTE: the only field not reset to zero */
+}
+
+static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
+{ /* Add each stats field of *frame to the matching field of *section. */
+    section->frame += frame->frame;
+    section->intra_error += frame->intra_error;
+    section->coded_error += frame->coded_error;
+    section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err;
+    section->pcnt_inter  += frame->pcnt_inter;
+    section->pcnt_motion += frame->pcnt_motion;
+    section->pcnt_second_ref += frame->pcnt_second_ref;
+    section->pcnt_neutral += frame->pcnt_neutral;
+    section->MVr        += frame->MVr;
+    section->mvr_abs     += frame->mvr_abs;
+    section->MVc        += frame->MVc;
+    section->mvc_abs     += frame->mvc_abs;
+    section->MVrv       += frame->MVrv;
+    section->MVcv       += frame->MVcv;
+    section->mv_in_out_count  += frame->mv_in_out_count;
+    section->new_mv_count += frame->new_mv_count;
+    section->count      += frame->count;
+    section->duration   += frame->duration;
+}
+
+static void subtract_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
+{ /* Subtract *frame's stats fields from the matching *section fields. */
+    section->frame -= frame->frame;
+    section->intra_error -= frame->intra_error;
+    section->coded_error -= frame->coded_error;
+    section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err;
+    section->pcnt_inter  -= frame->pcnt_inter;
+    section->pcnt_motion -= frame->pcnt_motion;
+    section->pcnt_second_ref -= frame->pcnt_second_ref;
+    section->pcnt_neutral -= frame->pcnt_neutral;
+    section->MVr        -= frame->MVr;
+    section->mvr_abs     -= frame->mvr_abs;
+    section->MVc        -= frame->MVc;
+    section->mvc_abs     -= frame->mvc_abs;
+    section->MVrv       -= frame->MVrv;
+    section->MVcv       -= frame->MVcv;
+    section->mv_in_out_count  -= frame->mv_in_out_count;
+    section->new_mv_count -= frame->new_mv_count;
+    section->count      -= frame->count;
+    section->duration   -= frame->duration;
+}
+
+static void avg_stats(FIRSTPASS_STATS *section)
+{ /* Divide the accumulated fields of *section by section->count. */
+    if (section->count < 1.0) /* nothing to average (also avoids a tiny divisor) */
+        return;
+
+    section->intra_error /= section->count;
+    section->coded_error /= section->count;
+    section->ssim_weighted_pred_err /= section->count;
+    section->pcnt_inter  /= section->count;
+    section->pcnt_second_ref /= section->count;
+    section->pcnt_neutral /= section->count;
+    section->pcnt_motion /= section->count;
+    section->MVr        /= section->count;
+    section->mvr_abs     /= section->count;
+    section->MVc        /= section->count;
+    section->mvc_abs     /= section->count;
+    section->MVrv       /= section->count;
+    section->MVcv       /= section->count;
+    section->mv_in_out_count   /= section->count;
+    section->duration   /= section->count; /* frame, new_mv_count, count stay sums */
+}
+
+/* Calculate a modified Error used in distributing bits between easier
+ * and harder frames
+ */
+static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+    double av_err = ( cpi->twopass.total_stats.ssim_weighted_pred_err /
+                      cpi->twopass.total_stats.count );
+    double this_err = this_frame->ssim_weighted_pred_err;
+    double modified_err;
+
+    if (this_err > av_err)
+        modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
+    else
+        modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
+
+    return modified_err;
+}
+
+static const double weight_table[256] = {
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
+0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
+0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
+0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
+};
+
+/* Returns the mean weight_table[] value over all samples of the source
+ * Y plane. */
+static double simple_weight(YV12_BUFFER_CONFIG *source)
+{
+    const unsigned char *row = source->y_buffer;
+    double total = 0.0;
+    int rows_left = source->y_height;
+
+    /* Walk the raw Y plane one row at a time and add up the table weight
+     * of every sample; consecutive rows are y_stride bytes apart.
+     */
+    do
+    {
+        const unsigned char *px = row;
+        int cols_left = source->y_width;
+
+        do
+        {
+            total += weight_table[*px++];
+        }while(--cols_left);
+
+        row += source->y_stride;
+    }while(--rows_left);
+
+    /* average weight per sample */
+    return total / (source->y_height * source->y_width);
+}
+
+
+/* This function returns the current per frame maximum bitrate target */
+static int frame_max_bits(VP8_COMP *cpi)
+{
+    /* User supplied cap on a section's rate (a percentage) as a fraction */
+    const double max_section = (double)cpi->oxcf.two_pass_vbrmax_section / 100.0;
+    int max_bits;
+
+    if (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER)
+    {
+        /* VBR: spread the bits that are left over the frames that are left,
+         * scaled by the maximum section rate passed in by the user
+         */
+        const double frames_left = cpi->twopass.total_stats.count -
+                                   (double)cpi->common.current_video_frame;
+
+        max_bits = (int)(((double)cpi->twopass.bits_left / frames_left) *
+                         max_section);
+    }
+    else
+    {
+        /* CBR: start from the target average bits per frame scaled by the
+         * maximum section rate passed in by the user
+         */
+        const double buffer_fullness_ratio = (double)cpi->buffer_level /
+            DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.optimal_buffer_level);
+
+        max_bits = (int)(cpi->av_per_frame_bandwidth * max_section);
+
+        /* Buffer fullness matters as well: while we are running below the
+         * optimal level, gradually tighten up on max_bits.
+         */
+        if (buffer_fullness_ratio < 1.0)
+        {
+            /* Floor: the lower of max_bits / 4 and
+             * cpi->av_per_frame_bandwidth / 4, which should allow the
+             * buffer to refill.
+             */
+            const int min_max_bits =
+                ((cpi->av_per_frame_bandwidth >> 2) < (max_bits >> 2)) ?
+                cpi->av_per_frame_bandwidth >> 2 : max_bits >> 2;
+
+            max_bits = (int)(max_bits * buffer_fullness_ratio);
+
+            if (max_bits < min_max_bits)
+                max_bits = min_max_bits;
+        }
+    }
+
+    /* Trap case where we are out of bits */
+    return (max_bits < 0) ? 0 : max_bits;
+}
+
+void vp8_init_first_pass(VP8_COMP *cpi)
+{ /* Reset the accumulated two-pass total stats via zero_stats(). */
+    zero_stats(&cpi->twopass.total_stats);
+}
+
+void vp8_end_first_pass(VP8_COMP *cpi)
+{ /* Emit the accumulated total stats on cpi->output_pkt_list. */
+    output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
+}
+
+/* Measures block 0 of x at the given buffer offset: stores the 16x16 MSE of
+ * the source against raw_buffer in *raw_motion_err and against recon_buffer
+ * in *best_motion_err.  Also points xd->pre.y_buffer at the recon buffer
+ * position.  cpi is unused. */
+static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x,
+                              YV12_BUFFER_CONFIG * raw_buffer,
+                              int * raw_motion_err,
+                              YV12_BUFFER_CONFIG * recon_buffer,
+                              int * best_motion_err, int recon_yoffset)
+{
+    MACROBLOCKD * const xd = & x->e_mbd;
+    const BLOCK *const blk = &x->block[0];
+    const BLOCKD *const d = &xd->block[0];
+    const int ref_stride = xd->pre.y_stride;
+    unsigned char *const src = *(blk->base_src) + blk->src;
+    unsigned char *cmp;
+    (void)cpi;
+
+    /* error against the raw buffer */
+    cmp = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset + d->offset);
+    vpx_mse16x16(src, blk->src_stride, cmp, raw_buffer->y_stride,
+                 (unsigned int *)(raw_motion_err));
+
+    /* error against the recon buffer (recorded in xd->pre.y_buffer) */
+    xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+    cmp = (unsigned char *)(xd->pre.y_buffer + d->offset);
+
+    vpx_mse16x16(src, blk->src_stride, cmp, ref_stride,
+                 (unsigned int *)(best_motion_err));
+}
+
+static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
+                                     int_mv *ref_mv, MV *best_mv,
+                                     YV12_BUFFER_CONFIG *recon_buffer,
+                                     int *best_motion_err, int recon_yoffset )
+{ /* Diamond-search for a motion vector that beats *best_motion_err. */
+    MACROBLOCKD *const xd = & x->e_mbd;
+    BLOCK *b = &x->block[0];
+    BLOCKD *d = &x->e_mbd.block[0];
+    int num00;
+
+    int_mv tmp_mv;
+    int_mv ref_mv_full;
+
+    int tmp_err;
+    int step_param = 3; /* Don't search over full range for first pass */
+    int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+    int n;
+    vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+    int new_mv_mode_penalty = 256; /* added to every searched error below */
+
+    /* override the default variance function to use MSE */
+    v_fn_ptr.vf    = vpx_mse16x16;
+
+    /* Set up pointers for this macro block recon buffer */
+    xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+
+    /* Initial step/diamond search centred on best mv */
+    tmp_mv.as_int = 0;
+    ref_mv_full.as_mv.col = ref_mv->as_mv.col>>3;
+    ref_mv_full.as_mv.row = ref_mv->as_mv.row>>3;
+    tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv, step_param,
+                                      x->sadperbit16, &num00, &v_fn_ptr,
+                                      x->mvcost, ref_mv);
+    if ( tmp_err < INT_MAX-new_mv_mode_penalty ) /* guard against int overflow */
+        tmp_err += new_mv_mode_penalty;
+
+    if (tmp_err < *best_motion_err)
+    {
+        *best_motion_err = tmp_err;
+        best_mv->row = tmp_mv.as_mv.row;
+        best_mv->col = tmp_mv.as_mv.col;
+    }
+
+    /* Further step/diamond searches as necessary */
+    n = num00;
+    num00 = 0;
+
+    while (n < further_steps)
+    {
+        n++;
+
+        if (num00) /* skip as many steps as the last search reported in num00 */
+            num00--;
+        else
+        {
+            tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv,
+                                              step_param + n, x->sadperbit16,
+                                              &num00, &v_fn_ptr, x->mvcost,
+                                              ref_mv);
+            if ( tmp_err < INT_MAX-new_mv_mode_penalty )
+                tmp_err += new_mv_mode_penalty;
+
+            if (tmp_err < *best_motion_err)
+            {
+                *best_motion_err = tmp_err;
+                best_mv->row = tmp_mv.as_mv.row;
+                best_mv->col = tmp_mv.as_mv.col;
+            }
+        }
+    }
+}
+
+void vp8_first_pass(VP8_COMP *cpi)
+{
+    int mb_row, mb_col;
+    MACROBLOCK *const x = & cpi->mb;
+    VP8_COMMON *const cm = & cpi->common;
+    MACROBLOCKD *const xd = & x->e_mbd;
+    /* First pass over one frame: collect per-MB stats, then output_stats(). */
+    int recon_yoffset, recon_uvoffset;
+    YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
+    YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+    YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
+    int recon_y_stride = lst_yv12->y_stride;
+    int recon_uv_stride = lst_yv12->uv_stride;
+    int64_t intra_error = 0;
+    int64_t coded_error = 0;
+
+    int sum_mvr = 0, sum_mvc = 0;
+    int sum_mvr_abs = 0, sum_mvc_abs = 0;
+    int sum_mvrs = 0, sum_mvcs = 0;
+    int mvcount = 0;
+    int intercount = 0;
+    int second_ref_count = 0;
+    int intrapenalty = 256;
+    int neutral_count = 0;
+    int new_mv_count = 0;
+    int sum_in_vectors = 0;
+    uint32_t lastmv_as_int = 0;
+
+    int_mv zero_ref_mv;
+
+    zero_ref_mv.as_int = 0;
+
+    vp8_clear_system_state();
+
+    x->src = * cpi->Source;
+    xd->pre = *lst_yv12;
+    xd->dst = *new_yv12;
+
+    x->partition_info = x->pi;
+
+    xd->mode_info_context = cm->mi;
+
+    if(!cm->use_bilinear_mc_filter)
+    {
+         xd->subpixel_predict        = vp8_sixtap_predict4x4;
+         xd->subpixel_predict8x4     = vp8_sixtap_predict8x4;
+         xd->subpixel_predict8x8     = vp8_sixtap_predict8x8;
+         xd->subpixel_predict16x16   = vp8_sixtap_predict16x16;
+     }
+     else
+     {
+         xd->subpixel_predict        = vp8_bilinear_predict4x4;
+         xd->subpixel_predict8x4     = vp8_bilinear_predict8x4;
+         xd->subpixel_predict8x8     = vp8_bilinear_predict8x8;
+         xd->subpixel_predict16x16   = vp8_bilinear_predict16x16;
+     }
+
+    vp8_build_block_offsets(x);
+
+    /* set up the new frame for intra coded blocks */
+    vp8_setup_intra_recon(new_yv12);
+    vp8cx_frame_init_quantizer(cpi);
+
+    /* Initialise the MV cost table to the defaults */
+    {
+        int flag[2] = {1, 1};
+        vp8_initialize_rd_consts(cpi, x, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
+        memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+        vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
+    }
+
+    /* for each macroblock row in image */
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+        int_mv best_ref_mv;
+
+        best_ref_mv.as_int = 0;
+
+        /* per-row setup: above-row availability and recon buffer offsets */
+        xd->up_available = (mb_row != 0);
+        recon_yoffset = (mb_row * recon_y_stride * 16);
+        recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+        /* Set up limit values for motion vectors to prevent them extending
+         * outside the UMV borders
+         */
+        x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+        x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+
+        /* for each macroblock col in image */
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+        {
+            int this_error;
+            int gf_motion_error = INT_MAX;
+            int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); /* first row or first col, but not MB (0,0) */
+
+            xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+            xd->dst.u_buffer = new_yv12->u_buffer + recon_uvoffset;
+            xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
+            xd->left_available = (mb_col != 0);
+
+            /* Copy current mb to a buffer */
+            vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+            /* do intra 16x16 prediction */
+            this_error = vp8_encode_intra(cpi, x, use_dc_pred);
+
+            /* "intrapenalty" below deals with situations where the intra
+             * and inter error scores are very low (eg a plain black frame)
+             * We do not have special cases in first pass for 0,0 and
+             * nearest etc so all inter modes carry an overhead cost
+             * estimate for the mv. When the error score is very low this
+             * causes us to pick all or lots of INTRA modes and throw lots
+             * of key frames. This penalty adds a cost matching that of a
+             * 0,0 mv to the intra case.
+             */
+            this_error += intrapenalty;
+
+            /* Cumulative intra error total */
+            intra_error += (int64_t)this_error;
+
+            /* Set up limit values for motion vectors to prevent them
+             * extending outside the UMV borders
+             */
+            x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+            x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+
+            /* Other than for the first frame do a motion search */
+            if (cm->current_video_frame > 0)
+            {
+                BLOCKD *d = &x->e_mbd.block[0];
+                MV tmp_mv = {0, 0};
+                int tmp_err;
+                int motion_error = INT_MAX;
+                int raw_motion_error = INT_MAX;
+
+                /* Simple 0,0 motion with no mv overhead */
+                zz_motion_search( cpi, x, cpi->last_frame_unscaled_source,
+                                  &raw_motion_error, lst_yv12, &motion_error,
+                                  recon_yoffset );
+                d->bmi.mv.as_mv.row = 0;
+                d->bmi.mv.as_mv.col = 0;
+
+                if (raw_motion_error < cpi->oxcf.encode_breakout)
+                    goto skip_motion_search; /* raw 0,0 error is already below the breakout threshold */
+
+                /* Test last reference frame using the previous best mv as the
+                 * starting point (best reference) for the search
+                 */
+                first_pass_motion_search(cpi, x, &best_ref_mv,
+                                        &d->bmi.mv.as_mv, lst_yv12,
+                                        &motion_error, recon_yoffset);
+
+                /* If the current best reference mv is not centred on 0,0
+                 * then do a 0,0 based search as well
+                 */
+                if (best_ref_mv.as_int)
+                {
+                   tmp_err = INT_MAX;
+                   first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv,
+                                     lst_yv12, &tmp_err, recon_yoffset);
+
+                   if ( tmp_err < motion_error )
+                   {
+                        motion_error = tmp_err;
+                        d->bmi.mv.as_mv.row = tmp_mv.row;
+                        d->bmi.mv.as_mv.col = tmp_mv.col;
+                   }
+                }
+
+                /* Experimental search in a second reference frame ((0,0)
+                 * based only)
+                 */
+                if (cm->current_video_frame > 1)
+                {
+                    first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, gld_yv12, &gf_motion_error, recon_yoffset);
+
+                    if ((gf_motion_error < motion_error) && (gf_motion_error < this_error))
+                    {
+                        second_ref_count++;
+                    }
+
+                    /* Reset to last frame as reference buffer */
+                    xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
+                    xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
+                    xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
+                }
+
+skip_motion_search:
+                /* Intra assumed best */
+                best_ref_mv.as_int = 0;
+
+                if (motion_error <= this_error)
+                {
+                    /* Keep a count of cases where the inter and intra were
+                     * very close and very low. This helps with scene cut
+                     * detection for example in cropped clips with black bars
+                     * at the sides or top and bottom.
+                     */
+                    if( (((this_error-intrapenalty) * 9) <=
+                         (motion_error*10)) &&
+                        (this_error < (2*intrapenalty)) )
+                    {
+                        neutral_count++;
+                    }
+
+                    d->bmi.mv.as_mv.row *= 8; /* scale back up: the search shifts its reference mv right by 3 */
+                    d->bmi.mv.as_mv.col *= 8;
+                    this_error = motion_error;
+                    vp8_set_mbmode_and_mvs(x, NEWMV, &d->bmi.mv);
+                    vp8_encode_inter16x16y(x);
+                    sum_mvr += d->bmi.mv.as_mv.row;
+                    sum_mvr_abs += abs(d->bmi.mv.as_mv.row);
+                    sum_mvc += d->bmi.mv.as_mv.col;
+                    sum_mvc_abs += abs(d->bmi.mv.as_mv.col);
+                    sum_mvrs += d->bmi.mv.as_mv.row * d->bmi.mv.as_mv.row;
+                    sum_mvcs += d->bmi.mv.as_mv.col * d->bmi.mv.as_mv.col;
+                    intercount++;
+
+                    best_ref_mv.as_int = d->bmi.mv.as_int;
+
+                    /* Was the vector non-zero */
+                    if (d->bmi.mv.as_int)
+                    {
+                        mvcount++;
+
+                        /* Was it different from the last non zero vector */
+                        if ( d->bmi.mv.as_int != lastmv_as_int )
+                            new_mv_count++;
+                        lastmv_as_int = d->bmi.mv.as_int;
+
+                        /* Does the Row vector point inwards or outwards */
+                        if (mb_row < cm->mb_rows / 2)
+                        {
+                            if (d->bmi.mv.as_mv.row > 0)
+                                sum_in_vectors--;
+                            else if (d->bmi.mv.as_mv.row < 0)
+                                sum_in_vectors++;
+                        }
+                        else if (mb_row > cm->mb_rows / 2)
+                        {
+                            if (d->bmi.mv.as_mv.row > 0)
+                                sum_in_vectors++;
+                            else if (d->bmi.mv.as_mv.row < 0)
+                                sum_in_vectors--;
+                        }
+
+                        /* Does the Col vector point inwards or outwards */
+                        if (mb_col < cm->mb_cols / 2)
+                        {
+                            if (d->bmi.mv.as_mv.col > 0)
+                                sum_in_vectors--;
+                            else if (d->bmi.mv.as_mv.col < 0)
+                                sum_in_vectors++;
+                        }
+                        else if (mb_col > cm->mb_cols / 2)
+                        {
+                            if (d->bmi.mv.as_mv.col > 0)
+                                sum_in_vectors++;
+                            else if (d->bmi.mv.as_mv.col < 0)
+                                sum_in_vectors--;
+                        }
+                    }
+                }
+            }
+
+            coded_error += (int64_t)this_error;
+
+            /* adjust to the next column of macroblocks */
+            x->src.y_buffer += 16;
+            x->src.u_buffer += 8;
+            x->src.v_buffer += 8;
+
+            recon_yoffset += 16;
+            recon_uvoffset += 8;
+        }
+
+        /* adjust to the next row of mbs */
+        x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+        x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+        x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+
+        /* extend the recon for intra prediction */
+        vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+        vp8_clear_system_state();
+    }
+
+    vp8_clear_system_state();
+    {
+        double weight = 0.0;
+
+        FIRSTPASS_STATS fps;
+
+        fps.frame      = cm->current_video_frame ;
+        fps.intra_error = (double)(intra_error >> 8);
+        fps.coded_error = (double)(coded_error >> 8);
+        weight = simple_weight(cpi->Source);
+
+
+        if (weight < 0.1)
+            weight = 0.1;
+
+        fps.ssim_weighted_pred_err = fps.coded_error * weight;
+
+        fps.pcnt_inter  = 0.0;
+        fps.pcnt_motion = 0.0;
+        fps.MVr        = 0.0;
+        fps.mvr_abs     = 0.0;
+        fps.MVc        = 0.0;
+        fps.mvc_abs     = 0.0;
+        fps.MVrv       = 0.0;
+        fps.MVcv       = 0.0;
+        fps.mv_in_out_count  = 0.0;
+        fps.new_mv_count = 0.0;
+        fps.count      = 1.0;
+
+        fps.pcnt_inter   = 1.0 * (double)intercount / cm->MBs;
+        fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
+        fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
+
+        if (mvcount > 0)
+        {
+            fps.MVr = (double)sum_mvr / (double)mvcount;
+            fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
+            fps.MVc = (double)sum_mvc / (double)mvcount;
+            fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
+            fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
+            fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
+            fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
+            fps.new_mv_count = new_mv_count;
+
+            fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
+        }
+
+        /* TODO:  handle the case when duration is set to 0, or something less
+         * than the full time between subsequent cpi->source_time_stamps
+         */
+        fps.duration = (double)(cpi->source->ts_end
+                       - cpi->source->ts_start);
+
+        /* don't want to do output stats with a stack variable! */
+        memcpy(&cpi->twopass.this_frame_stats,
+               &fps,
+               sizeof(FIRSTPASS_STATS));
+        output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
+        accumulate_stats(&cpi->twopass.total_stats, &fps);
+    }
+
+    /* Copy the previous Last Frame into the GF buffer if specific
+     * conditions for doing so are met
+     */
+    if ((cm->current_video_frame > 0) &&
+        (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
+        ((cpi->twopass.this_frame_stats.intra_error /
+          DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) >
+         2.0))
+    {
+        vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+    }
+
+    /* swap frame pointers so last frame refers to the frame we just
+     * compressed
+     */
+    vp8_swap_yv12_buffer(lst_yv12, new_yv12);
+    vp8_yv12_extend_frame_borders(lst_yv12);
+
+    /* Special case for the first frame. Copy into the GF buffer as a
+     * second reference.
+     */
+    if (cm->current_video_frame == 0)
+    {
+        vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+    }
+
+
+    /* use this to see what the first pass reconstruction looks like */
+    if (0)
+    {
+        char filename[512];
+        FILE *recon_file;
+        sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+
+        if (cm->current_video_frame == 0)
+            recon_file = fopen(filename, "wb");
+        else
+            recon_file = fopen(filename, "ab");
+
+        (void) fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1,
+                      recon_file);
+        fclose(recon_file);
+    }
+
+    cm->current_video_frame++;
+
+}
+extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
+
+/* Estimate a cost per mb attributable to overheads such as the coding of
+ * modes and motion vectors.
+ * Currently simplistic in its assumptions for testing.
+ */
+
+static double bitcost( double prob )
+{
+  /* Cost in bits of an event of probability prob, i.e. -log2(prob).
+   * Probabilities at or below 0.000122 are charged a flat 13 bits.
+   */
+  return (prob > 0.000122) ? -log(prob) / log(2.0) : 13.0;
+}
+static int64_t estimate_modemvcost(VP8_COMP *cpi,
+                                     FIRSTPASS_STATS * fpstats)
+{
+    /* Per-frame averages over the section: fraction of MBs inter coded,
+     * fraction with a non-zero vector, and the intra / zero-vector remainders.
+     */
+    const double inter_frac = fpstats->pcnt_inter / fpstats->count;
+    const double moving_frac = fpstats->pcnt_motion / fpstats->count;
+    const double intra_frac = (1.0 - inter_frac);
+    const double zero_mv_frac = inter_frac - moving_frac;
+
+    /* Cost in bits of signalling each of the three cases */
+    const double zero_mv_bits = bitcost(zero_mv_frac);
+    const double moving_bits = bitcost(moving_frac);
+    const double intra_bits = bitcost(intra_frac);
+
+    /* Crude per-MB mode overhead: each case weighted by its frequency */
+    const double mode_bits_per_mb = (zero_mv_frac * zero_mv_bits) +
+                                    (moving_frac * moving_bits) +
+                                    (intra_frac * intra_bits);
+
+    /* Extra overhead for new vectors: average count per frame (truncated)
+     * times 8. Both << 9 and * 512 normalize to the (bits * 512) used in
+     * vp8_bits_per_mb
+     */
+    const int new_mvs_per_frame = (int)(fpstats->new_mv_count / fpstats->count);
+    const int mv_total = (new_mvs_per_frame * 8) << 9;
+    const int64_t mode_total =
+        (int64_t)(mode_bits_per_mb * cpi->common.MBs) * 512;
+
+    return mv_total + mode_total;
+}
+
+static double calc_correction_factor( double err_per_mb,
+                                      double err_devisor,
+                                      double pt_low,
+                                      double pt_high,
+                                      int Q )
+{
+    /* The exponent starts at pt_low, rises by 0.01 per unit of Q and is
+     * capped at pt_high.
+     */
+    double exponent = pt_low + (Q * 0.01);
+    double factor;
+
+    if (exponent > pt_high)
+        exponent = pt_high;
+
+    /* Raise the error, normalised by err_devisor, to that power */
+    factor = pow(err_per_mb / err_devisor, exponent);
+
+    /* Clip the result to the range 0.05 to 5.0 */
+    if (factor < 0.05)
+        return 0.05;
+
+    if (factor > 5.0)
+        return 5.0;
+
+    return factor;
+}
+
+static int estimate_max_q(VP8_COMP *cpi,
+                          FIRSTPASS_STATS * fpstats,
+                          int section_target_bandwitdh,
+                          int overhead_bits )
+{
+    int Q;
+    int num_mbs = cpi->common.MBs;
+    int target_norm_bits_per_mb;
+
+    double section_err = (fpstats->coded_error / fpstats->count);
+    double err_per_mb = section_err / num_mbs;
+    double err_correction_factor;
+    double speed_correction = 1.0;
+    int overhead_bits_per_mb;
+    /* Find the lowest Q within the maxq limits whose estimated bits per MB fit the section target */
+    if (section_target_bandwitdh <= 0)
+        return cpi->twopass.maxq_max_limit;       /* Highest value allowed */
+    /* Scale the target by 512 per MB; large targets divide first (presumably to avoid int overflow) */
+    target_norm_bits_per_mb =
+        (section_target_bandwitdh < (1 << 20))
+            ? (512 * section_target_bandwitdh) / num_mbs
+            : 512 * (section_target_bandwitdh / num_mbs);
+
+    /* Nudge the persistent est_max_qcorrection_factor by +/-0.005 based on
+     * the rolling ratio of bits spent vs target bits, clamped to 0.1..10.0
+     */
+    if ((cpi->rolling_target_bits > 0) &&
+        (cpi->active_worst_quality < cpi->worst_quality))
+    {
+        double rolling_ratio;
+
+        rolling_ratio = (double)cpi->rolling_actual_bits /
+                        (double)cpi->rolling_target_bits;
+
+        if (rolling_ratio < 0.95)
+            cpi->twopass.est_max_qcorrection_factor -= 0.005;
+        else if (rolling_ratio > 1.05)
+            cpi->twopass.est_max_qcorrection_factor += 0.005;
+
+        cpi->twopass.est_max_qcorrection_factor =
+            (cpi->twopass.est_max_qcorrection_factor < 0.1)
+                ? 0.1
+                : (cpi->twopass.est_max_qcorrection_factor > 10.0)
+                    ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
+    }
+
+    /* Corrections for higher compression speed settings
+     * (reduced compression expected)
+     */
+    if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+    {
+        if (cpi->oxcf.cpu_used <= 5)
+            speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+        else
+            speed_correction = 1.25;
+    }
+
+    /* Estimate of overhead bits per mb */
+    /* Correction to overhead bits for min allowed Q. */
+    overhead_bits_per_mb = overhead_bits / num_mbs;
+    overhead_bits_per_mb = (int)(overhead_bits_per_mb *
+                            pow( 0.98, (double)cpi->twopass.maxq_min_limit ));
+
+    /* Try and pick a max Q that will be high enough to encode the
+     * content at the given rate.
+     */
+    for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++)
+    {
+        int bits_per_mb_at_this_q;
+
+        /* Error per MB based correction factor */
+        err_correction_factor =
+            calc_correction_factor(err_per_mb, 150.0, 0.40, 0.90, Q);
+
+        bits_per_mb_at_this_q =
+            vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;
+
+        bits_per_mb_at_this_q = (int)(.5 + err_correction_factor
+            * speed_correction * cpi->twopass.est_max_qcorrection_factor
+            * cpi->twopass.section_max_qfactor
+            * (double)bits_per_mb_at_this_q);
+
+        /* Mode and motion overhead */
+        /* As Q rises in real encode loop rd code will force overhead down
+         * We make a crude adjustment for this here as *.98 per Q step.
+         */
+        overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+
+        if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+            break;
+    }
+
+    /* Restriction on active max q for constrained quality mode. */
+    if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+         (Q < cpi->cq_target_quality) )
+    {
+        Q = cpi->cq_target_quality;
+    }
+
+    /* Adjust maxq_min_limit and maxq_max_limit limits based on
+     * average q observed in clip for non kf/gf/arf frames
+     * Give average a chance to settle though.
+     */
+    if ( (cpi->ni_frames >
+                  ((int)cpi->twopass.total_stats.count >> 8)) &&
+         (cpi->ni_frames > 150) )
+    {
+        cpi->twopass.maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality)
+                                  ? (cpi->ni_av_qi + 32) : cpi->worst_quality;
+        cpi->twopass.maxq_min_limit = ((cpi->ni_av_qi - 32) > cpi->best_quality)
+                                  ? (cpi->ni_av_qi - 32) : cpi->best_quality;
+    }
+
+    return Q;
+}
+
+/* For cq mode, estimate a cq level that matches the observed complexity
+ * and data rate. Result is clipped to [best_quality, worst_quality - 1].
+ */
+static int estimate_cq( VP8_COMP *cpi,
+                        FIRSTPASS_STATS * fpstats,
+                        int section_target_bandwitdh,
+                        int overhead_bits )
+{
+    int Q;
+    int num_mbs = cpi->common.MBs;
+    int target_norm_bits_per_mb;
+
+    double section_err = (fpstats->coded_error / fpstats->count);
+    double err_per_mb = section_err / num_mbs;
+    double err_correction_factor;
+    double speed_correction = 1.0;
+    double clip_iiratio;
+    double clip_iifactor;
+    int overhead_bits_per_mb;
+
+    if (0) /* disabled debug dump of err_per_mb */
+    {
+        FILE *f = fopen("epmp.stt", "a");
+        fprintf(f, "%10.2f\n", err_per_mb );
+        fclose(f);
+    }
+    /* Scale the target by 512 per MB; large targets divide first (presumably to avoid int overflow) */
+    target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
+                              ? (512 * section_target_bandwitdh) / num_mbs
+                              : 512 * (section_target_bandwitdh / num_mbs);
+
+    /* Estimate of overhead bits per mb */
+    overhead_bits_per_mb = overhead_bits / num_mbs;
+
+    /* Corrections for higher compression speed settings
+     * (reduced compression expected)
+     */
+    if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+    {
+        if (cpi->oxcf.cpu_used <= 5)
+            speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+        else
+            speed_correction = 1.25;
+    }
+
+    /* II ratio correction factor for clip as a whole */
+    clip_iiratio = cpi->twopass.total_stats.intra_error /
+                   DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
+    clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
+    if (clip_iifactor < 0.80)
+        clip_iifactor = 0.80; /* floor only; ratios below 10 push the factor above 1.0 */
+
+    /* Try and pick a Q that can encode the content at the given rate. */
+    for (Q = 0; Q < MAXQ; Q++)
+    {
+        int bits_per_mb_at_this_q;
+
+        /* Error per MB based correction factor */
+        err_correction_factor =
+            calc_correction_factor(err_per_mb, 100.0, 0.40, 0.90, Q);
+
+        bits_per_mb_at_this_q =
+            vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;
+
+        bits_per_mb_at_this_q =
+            (int)( .5 + err_correction_factor *
+                        speed_correction *
+                        clip_iifactor *
+                        (double)bits_per_mb_at_this_q);
+
+        /* Mode and motion overhead */
+        /* As Q rises in real encode loop rd code will force overhead down
+         * We make a crude adjustment for this here as *.98 per Q step.
+         */
+        overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+
+        if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+            break;
+    }
+
+    /* Clip value to range "best allowed to (worst allowed - 1)" */
+    Q = cq_level[Q]; /* NOTE(review): Q == MAXQ if the loop never broke; assumes cq_level has that entry - confirm */
+    if ( Q >= cpi->worst_quality )
+        Q = cpi->worst_quality - 1;
+    if ( Q < cpi->best_quality )
+        Q = cpi->best_quality;
+
+    return Q;
+}
+
+static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)
+{
+    const int mb_count = cpi->common.MBs;
+    const double mb_err = section_err / mb_count;
+    double speed_scale = 1.0;
+    int bits_target;
+    int Q = 0;
+
+    /* Target expressed in the (bits * 512) per MB units of vp8_bits_per_mb.
+     * Small targets are multiplied before dividing; large ones are divided
+     * first.
+     */
+    if (section_target_bandwitdh < (1 << 20))
+        bits_target = (512 * section_target_bandwitdh) / mb_count;
+    else
+        bits_target = 512 * (section_target_bandwitdh / mb_count);
+
+    /* Higher compression speed settings (1 and 3) are expected to compress
+     * less well: scale by 1.04 plus 0.04 per cpu_used step, or by a flat
+     * 1.25 once cpu_used exceeds 5.
+     */
+    if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+        speed_scale = (cpi->oxcf.cpu_used <= 5)
+                          ? 1.04 + (cpi->oxcf.cpu_used * 0.04) : 1.25;
+
+    /* Walk Q upwards and stop at the first one that fits the target. */
+    while (Q < MAXQ)
+    {
+        /* Error per MB based correction factor */
+        const double err_scale =
+            calc_correction_factor(mb_err, 150.0, 0.40, 0.90, Q);
+        const int bits_at_q =
+            (int)( .5 + ( err_scale *
+                          speed_scale *
+                          cpi->twopass.est_max_qcorrection_factor *
+                          (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0 ) );
+
+        if (bits_at_q <= bits_target)
+            break;
+
+        Q++;
+    }
+    return Q;
+}
+
+/* Estimate a worst case Q for a KF group (up to MAXQ * 2 when even MAXQ is not enough) */
+static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, double group_iiratio)
+{
+    int Q;
+    int num_mbs = cpi->common.MBs;
+    /* 512 * bandwidth can overflow int, so form it in 64 bits and clamp */
+    int64_t wide_target = ((int64_t)512 * section_target_bandwitdh) / num_mbs;
+    int target_norm_bits_per_mb = (wide_target > INT_MAX) ? INT_MAX : (wide_target < INT_MIN) ? INT_MIN : (int)wide_target;
+    int bits_per_mb_at_this_q;
+
+    double err_per_mb = section_err / num_mbs;
+    double err_correction_factor;
+    double speed_correction = 1.0;
+    double current_spend_ratio = 1.0;
+
+    double pow_highq = (POW1 < 0.6) ? POW1 + 0.3 : 0.90;
+    double pow_lowq = (POW1 < 0.7) ? POW1 + 0.1 : 0.80;
+
+    double iiratio_correction_factor = 1.0;
+    double combined_correction_factor;
+
+    /* Trap special case where the target is <= 0 */
+    if (target_norm_bits_per_mb <= 0)
+        return MAXQ * 2;
+
+    /* Calculate a corrective factor based on a rolling ratio of bits spent
+     *  vs target bits
+     * This is clamped to the range 0.1 to 10.0
+     */
+    if (cpi->long_rolling_target_bits <= 0)
+        current_spend_ratio = 10.0;
+    else
+    {
+        current_spend_ratio = (double)cpi->long_rolling_actual_bits / (double)cpi->long_rolling_target_bits;
+        current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0 : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio;
+    }
+
+    /* Calculate a correction factor based on the quality of prediction in
+     * the sequence as indicated by intra_inter error score ratio (IIRatio)
+     * The idea here is to favour subsampling in the hardest sections vs
+     * the easiest.
+     */
+    iiratio_correction_factor = 1.0 - ((group_iiratio - 6.0) * 0.1);
+
+    if (iiratio_correction_factor < 0.5)
+        iiratio_correction_factor = 0.5;
+
+    /* Corrections for higher compression speed settings
+     * (reduced compression expected)
+     */
+    if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+    {
+        if (cpi->oxcf.cpu_used <= 5)
+            speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+        else
+            speed_correction = 1.25;
+    }
+
+    /* Combine the various factors calculated above */
+    combined_correction_factor = speed_correction * iiratio_correction_factor * current_spend_ratio;
+
+    /* Try and pick a Q that should be high enough to encode the content at
+     * the given rate.
+     */
+    for (Q = 0; Q < MAXQ; Q++)
+    {
+        /* Error per MB based correction factor */
+        err_correction_factor =
+            calc_correction_factor(err_per_mb, 150.0, pow_lowq, pow_highq, Q);
+
+        bits_per_mb_at_this_q =
+            (int)(.5 + ( err_correction_factor *
+                         combined_correction_factor *
+                         (double)vp8_bits_per_mb[INTER_FRAME][Q]) );
+
+        if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+            break;
+    }
+
+    /* If we could not hit the target even at Max Q then estimate what Q
+     * would have been required (4% fewer bits per extra step, up to MAXQ * 2)
+     */
+    while ((bits_per_mb_at_this_q > target_norm_bits_per_mb)  && (Q < (MAXQ * 2)))
+    {
+        bits_per_mb_at_this_q = (int)(0.96 * bits_per_mb_at_this_q);
+        Q++;
+    }
+
+    if (0)
+    {
+        FILE *f = fopen("estkf_q.stt", "a");
+        fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n", cpi->common.current_video_frame, bits_per_mb_at_this_q,
+                target_norm_bits_per_mb, err_per_mb, err_correction_factor,
+                current_spend_ratio, group_iiratio, iiratio_correction_factor,
+                (double)cpi->buffer_level / (double)cpi->oxcf.optimal_buffer_level, Q);
+        fclose(f);
+    }
+
+    return Q;
+}
+
+void vp8_init_second_pass(VP8_COMP *cpi)
+{
+    FIRSTPASS_STATS this_frame;
+    FIRSTPASS_STATS *start_pos;
+    /* Prime the second-pass state from the first-pass totals held at stats_in_end */
+    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+    /* NOTE(review): the product above appears to be formed in integer arithmetic before the cast - confirm it cannot overflow */
+    zero_stats(&cpi->twopass.total_stats);
+    zero_stats(&cpi->twopass.total_left_stats);
+
+    if (!cpi->twopass.stats_in_end)
+        return;
+
+    cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+    cpi->twopass.total_left_stats = cpi->twopass.total_stats;
+
+    /* each frame can have a different duration, as the frame rate in the
+     * source isn't guaranteed to be constant. The frame rate prior to the
+     * first frame encoded in the second pass is a guess. However the sum
+     * duration is not: it is calculated from the actual durations of all
+     * first-pass frames (presumably in 1/10,000,000 s ticks - confirm).
+     */
+    vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);
+
+    cpi->output_framerate = cpi->framerate;
+    cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
+    cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0);
+
+    /* Calculate a minimum intra value to be used in determining the IIratio
+     * scores used in the second pass. We have this minimum to make sure
+     * that clips that are static but "low complexity" in the intra domain
+     * are still boosted appropriately for KF/GF/ARF
+     */
+    cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
+    cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
+
+    /* Scan the first pass file and calculate an average Intra / Inter error
+     * score ratio for the sequence
+     */
+    {
+        double sum_iiratio = 0.0;
+        double IIRatio;
+
+        start_pos = cpi->twopass.stats_in; /* Note starting "file" position */
+
+        while (input_stats(cpi, &this_frame) != EOF)
+        {
+            IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
+            IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio; /* clamp each frame's ratio to 1..20 */
+            sum_iiratio += IIRatio;
+        }
+
+        cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
+
+        /* Reset file position */
+        reset_fpf_position(cpi, start_pos);
+    }
+
+    /* Scan the first pass file and calculate a modified total error based
+     * upon the bias/power function used to allocate bits
+     */
+    {
+        start_pos = cpi->twopass.stats_in;  /* Note starting "file" position */
+
+        cpi->twopass.modified_error_total = 0.0;
+        cpi->twopass.modified_error_used = 0.0;
+
+        while (input_stats(cpi, &this_frame) != EOF)
+        {
+            cpi->twopass.modified_error_total += calculate_modified_err(cpi, &this_frame);
+        }
+        cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
+
+        reset_fpf_position(cpi, start_pos);  /* Reset file position */
+
+    }
+}
+
+void vp8_end_second_pass(VP8_COMP *cpi)
+{
+    (void)cpi;  /* nothing to do here; the cast marks cpi as unused */
+}
+
+/* Estimate how badly we believe the prediction quality is decaying from
+ * frame to frame. The result is the smallest of three values: pcnt_inter,
+ * a term that falls as pcnt_motion rises, and a term that falls as the
+ * motion vector magnitudes rise.
+ */
+static double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+{
+    const double pct = next_frame->pcnt_motion;
+    double rate;
+    double limit;
+    double r_mag;
+    double c_mag;
+    (void)cpi;
+
+    /* Start from the % of mbs that are inter coded */
+    rate = next_frame->pcnt_inter;
+
+    /* A high % of motion gives a somewhat higher decay rate */
+    limit = 1.0 - (pct / 20.0);
+    if (limit < rate)
+        rate = limit;
+
+    /* Adjust for the speed of motion: the mvr_abs / mvc_abs magnitudes
+     * are weighted by the motion percentage and their combined length
+     * is scaled down by 250.
+     */
+    r_mag = fabs(next_frame->mvr_abs * pct);
+    c_mag = fabs(next_frame->mvc_abs * pct);
+    limit = sqrt((r_mag * r_mag) + (c_mag * c_mag)) / 250.0;
+
+    /* A scaled length above 1.0 maps to 0.0, anything else to 1 - length */
+    limit = (limit > 1.0) ? 0.0 : (1.0 - limit);
+    if (limit < rate)
+        rate = limit;
+
+    return rate;
+}
+
+/* Test for a condition where a complex transition is followed by a
+ * static section, for example a slide show with a fade between slides.
+ * This is to help with more optimal kf and gf positioning.
+ *
+ *   frame_interval     must be greater than MIN_GF_INTERVAL
+ *   still_interval     number of frames to look ahead
+ *   loop_decay_rate    must be at least 0.999
+ *   decay_accumulator  must be below 0.9
+ *
+ * Returns 1 if those hold and still_interval more frames can be read,
+ * none with a prediction decay rate below 0.999. Otherwise returns 0.
+ */
+static int detect_transition_to_still(
+    VP8_COMP *cpi,
+    int frame_interval,
+    int still_interval,
+    double loop_decay_rate,
+    double decay_accumulator )
+{
+    FIRSTPASS_STATS *resume_pos;
+    FIRSTPASS_STATS lookahead;
+    int n = 0;
+
+    /* Only look for a very still section after motion, e.g. a static
+     * image after a fade or other transition (not a clean scene cut).
+     */
+    if ( !((frame_interval > MIN_GF_INTERVAL) &&
+           (loop_decay_rate >= 0.999) &&
+           (decay_accumulator < 0.9)) )
+        return 0;
+
+    /* Look ahead a few frames to see if the static condition persists */
+    resume_pos = cpi->twopass.stats_in;
+    while ( n < still_interval )
+    {
+        if ( EOF == input_stats(cpi, &lookahead) )
+            break;
+        if ( get_prediction_decay_rate(cpi, &lookahead) < 0.999 )
+            break;
+        n++;
+    }
+    reset_fpf_position(cpi, resume_pos);  /* reset the file position */
+
+    /* Only if it does persist do we signal a transition to still */
+    return ( n == still_interval );
+}
+
+/* Detect a flash through the high relative pcnt_second_ref score in the
+ * frame following a flash frame. The offset passed in should reflect
+ * this.
+ *
+ * What we are looking for is a brief break in prediction (such as a
+ * flash) where subsequent frames are reasonably well predicted by an
+ * earlier (pre flash) frame. The recovery after a flash is indicated by
+ * a high pcnt_second_ref compared to pcnt_inter.
+ *
+ *   cpi     passed through to read_frame_stats()
+ *   offset  which frame's stats to read, as taken by read_frame_stats()
+ *
+ * Returns 1 if the frame read has a pcnt_second_ref that is greater
+ * than its pcnt_inter and also at least 0.5. Returns 0 otherwise, and
+ * also when there is no valid frame to read.
+ */
+static int detect_flash( VP8_COMP *cpi, int offset )
+{
+    FIRSTPASS_STATS probe;
+    int beats_inter;
+    int is_high;
+
+    /* Read the frame data; no valid frame means no flash detected */
+    if ( read_frame_stats(cpi, &probe, offset) == EOF )
+        return 0;
+
+    beats_inter = (probe.pcnt_second_ref > probe.pcnt_inter);
+    is_high = (probe.pcnt_second_ref >= 0.5);
+    if ( !(beats_inter && is_high) )
+        return 0;
+
+    /* Debug aid, disabled:
+     *   FILE *f = fopen("flash.stt", "a");
+     *   fprintf(f, "%8.0f %6.2f %6.2f\n", probe.frame,
+     *           probe.pcnt_inter, probe.pcnt_second_ref);
+     *   fclose(f);
+     */
+    return 1;
+}
+
+/* Update the motion related elements to the GF arf boost calculation
+ * using the stats of one frame.
+ *
+ *   cpi                        unused
+ *   this_frame_mv_in_out       set to mv_in_out_count * pcnt_motion
+ *   mv_in_out_accumulator      that same product is added
+ *   abs_mv_in_out_accumulator  the absolute value of it is added
+ *   mv_ratio_accumulator       updated only if pcnt_motion is above 0.05
+ */
+static void accumulate_frame_motion_stats(
+    VP8_COMP *cpi,
+    FIRSTPASS_STATS * this_frame,
+    double * this_frame_mv_in_out,
+    double * mv_in_out_accumulator,
+    double * abs_mv_in_out_accumulator,
+    double * mv_ratio_accumulator )
+{
+    const double pct = this_frame->pcnt_motion;
+    double ratio;
+    double lesser;
+    (void)cpi;
+
+    /* Accumulate the motion in/out of frame stats */
+    *this_frame_mv_in_out = this_frame->mv_in_out_count * pct;
+    *mv_in_out_accumulator += this_frame->mv_in_out_count * pct;
+    *abs_mv_in_out_accumulator += fabs(this_frame->mv_in_out_count * pct);
+
+    /* The motion field measure is skipped unless pcnt_motion > 0.05 */
+    if ( !(pct > 0.05) )
+        return;
+
+    /* Accumulate a measure of how uniform (or conversely how random)
+     * the motion field is: a ratio of absmv / mv for each of mvr and
+     * mvc. The lesser of that ratio and the abs value is what gets
+     * added, weighted by the motion percentage.
+     */
+    ratio = fabs(this_frame->mvr_abs) /
+            DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
+    lesser = (ratio < this_frame->mvr_abs) ? ratio : this_frame->mvr_abs;
+    *mv_ratio_accumulator += lesser * pct;
+
+    ratio = fabs(this_frame->mvc_abs) /
+            DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc));
+    lesser = (ratio < this_frame->mvc_abs) ? ratio : this_frame->mvc_abs;
+    *mv_ratio_accumulator += lesser * pct;
+}
+
+/* Calculate a baseline boost number for the current frame. The range
+ * for this_frame_mv_in_out is -1.0 to +1.0, and the result is clipped
+ * to a maximum of GF_RMAX.
+ */
+static double calc_frame_boost(
+    VP8_COMP *cpi,
+    FIRSTPASS_STATS * this_frame,
+    double this_frame_mv_in_out )
+{
+    double boost;
+    double mv_scale;
+
+    /* The underlying boost factor is based on the inter intra error
+     * ratio; gf_intra_err_min is used unless the intra error exceeds it.
+     */
+    boost = ((this_frame->intra_error > cpi->twopass.gf_intra_err_min)
+                ? (IIFACTOR * this_frame->intra_error)
+                : (IIFACTOR * cpi->twopass.gf_intra_err_min)) /
+            DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+
+    /* New data coming into the frame (eg zoom out) increases the boost;
+     * net motion out of the frame (zoom in) reduces it, at most by half.
+     */
+    mv_scale = (this_frame_mv_in_out > 0.0) ? (this_frame_mv_in_out * 2.0)
+                                            : (this_frame_mv_in_out / 2.0);
+    boost += boost * mv_scale;
+
+    /* Clip to maximum */
+    if (boost > GF_RMAX)
+        boost = GF_RMAX;
+
+    return boost;
+}
+
+#if NEW_BOOST
+/* Alternative boost calculation for the alt ref frame, built from the
+ * frames on either side of the position given by offset.
+ *
+ *   offset    stats position of the proposed arf / next gf
+ *   f_frames  maximum number of frames to scan forward from offset
+ *   b_frames  maximum number of frames to scan backward from offset - 1
+ *   f_boost   receives the forward boost
+ *   b_boost   receives the backward boost
+ *
+ * Either scan stops early when a frame cannot be read, or when no flash
+ * is detected and one of the motion accumulators has crossed its
+ * break out limit.
+ *
+ * Returns the sum of the forward and backward boosts.
+ */
+static int calc_arf_boost(
+    VP8_COMP *cpi,
+    int offset,
+    int f_frames,
+    int b_frames,
+    int *f_boost,
+    int *b_boost )
+{
+    FIRSTPASS_STATS stats;
+    int n;
+    int pos;
+    int is_flash;
+    double frame_boost;
+    double score;
+    double decay;
+    double in_out;
+    double in_out_sum;
+    double in_out_abs_sum;
+    double ratio_sum;
+
+    /* Starting state for the forward looking loop */
+    score = 0.0;
+    ratio_sum = 0.0;
+    decay = 1.0;
+    in_out = 0.0;
+    in_out_sum = 0.0;
+    in_out_abs_sum = 0.0;
+
+    /* Search forward from the proposed arf/next gf position */
+    for ( n = 0, pos = offset; n < f_frames; n++, pos++ )
+    {
+        if ( read_frame_stats(cpi, &stats, pos) == EOF )
+            break;
+
+        /* Update the motion related elements to the boost calculation */
+        accumulate_frame_motion_stats( cpi, &stats, &in_out, &in_out_sum,
+                                       &in_out_abs_sum, &ratio_sum );
+
+        /* Baseline boost number for this frame */
+        frame_boost = calc_frame_boost( cpi, &stats, in_out );
+
+        /* The flash frame itself and the recovery frame that follows
+         * are discounted, as both will have poor scores.
+         */
+        is_flash = detect_flash(cpi, pos) || detect_flash(cpi, pos + 1);
+
+        /* Cumulative effect of prediction quality decay, floored at 0.1 */
+        if ( !is_flash )
+        {
+            decay *= get_prediction_decay_rate(cpi, &stats);
+            if ( decay < 0.1 )
+                decay = 0.1;
+        }
+        score += (decay * frame_boost);
+
+        /* Break out conditions. */
+        if ( is_flash )
+            continue;
+        if ( (ratio_sum > 100.0) || (in_out_abs_sum > 3.0) ||
+             (in_out_sum < -2.0) )
+            break;
+    }
+    *f_boost = (int)(score * 100.0) >> 4;
+
+    /* Reset for the backward looking loop */
+    score = 0.0;
+    ratio_sum = 0.0;
+    decay = 1.0;
+    in_out = 0.0;
+    in_out_sum = 0.0;
+    in_out_abs_sum = 0.0;
+
+    /* Search backward, starting one position before the offset */
+    for ( n = 0, pos = offset - 1; n < b_frames; n++, pos-- )
+    {
+        if ( read_frame_stats(cpi, &stats, pos) == EOF )
+            break;
+
+        /* Same per frame steps as in the forward loop above */
+        accumulate_frame_motion_stats( cpi, &stats, &in_out, &in_out_sum,
+                                       &in_out_abs_sum, &ratio_sum );
+        frame_boost = calc_frame_boost( cpi, &stats, in_out );
+        is_flash = detect_flash(cpi, pos) || detect_flash(cpi, pos + 1);
+        if ( !is_flash )
+        {
+            decay *= get_prediction_decay_rate(cpi, &stats);
+            if ( decay < 0.1 )
+                decay = 0.1;
+        }
+        score += (decay * frame_boost);
+
+        if ( is_flash )
+            continue;
+        if ( (ratio_sum > 100.0) || (in_out_abs_sum > 3.0) ||
+             (in_out_sum < -2.0) )
+            break;
+    }
+    *b_boost = (int)(score * 100.0) >> 4;
+
+    return (*f_boost + *b_boost);
+}
+#endif
+
+/* Analyse and define a gf/arf group . */
+static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+    FIRSTPASS_STATS next_frame;
+    FIRSTPASS_STATS *start_pos;
+    int i;
+    double r;
+    double boost_score = 0.0;
+    double old_boost_score = 0.0;
+    double gf_group_err = 0.0;
+    double gf_first_frame_err = 0.0;
+    double mod_frame_err = 0.0;
+
+    double mv_ratio_accumulator = 0.0;
+    double decay_accumulator = 1.0;
+
+    double loop_decay_rate = 1.00;          /* Starting decay rate */
+
+    double this_frame_mv_in_out = 0.0;
+    double mv_in_out_accumulator = 0.0;
+    double abs_mv_in_out_accumulator = 0.0;
+    double mod_err_per_mb_accumulator = 0.0;
+
+    int max_bits = frame_max_bits(cpi);     /* Max for a single frame */
+
+    unsigned int allow_alt_ref =
+                    cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
+
+    int alt_boost = 0;
+    int f_boost = 0;
+    int b_boost = 0;
+    int flash_detected;
+
+    cpi->twopass.gf_group_bits = 0;
+    cpi->twopass.gf_decay_rate = 0;
+
+    vp8_clear_system_state();
+
+    start_pos = cpi->twopass.stats_in;
+
+    memset(&next_frame, 0, sizeof(next_frame)); /* assure clean */
+
+    /* Load stats for the current frame. */
+    mod_frame_err = calculate_modified_err(cpi, this_frame);
+
+    /* Note the error of the frame at the start of the group (this will be
+     * the GF frame error if we code a normal gf
+     */
+    gf_first_frame_err = mod_frame_err;
+
+    /* Special treatment if the current frame is a key frame (which is also
+     * a gf). If it is then its error score (and hence bit allocation) need
+     * to be subtracted out from the calculation for the GF group
+     */
+    if (cpi->common.frame_type == KEY_FRAME)
+        gf_group_err -= gf_first_frame_err;
+
+    /* Scan forward to try and work out how many frames the next gf group
+     * should contain and what level of boost is appropriate for the GF
+     * or ARF that will be coded with the group
+     */
+    i = 0;
+
+    while (((i < cpi->twopass.static_scene_max_gf_interval) ||
+            ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) &&
+           (i < cpi->twopass.frames_to_key))
+    {
+        i++;
+
+        /* Accumulate error score of frames in this gf group */
+        mod_frame_err = calculate_modified_err(cpi, this_frame);
+
+        gf_group_err += mod_frame_err;
+
+        mod_err_per_mb_accumulator +=
+            mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
+
+        if (EOF == input_stats(cpi, &next_frame))
+            break;
+
+        /* Test for the case where there is a brief flash but the prediction
+         * quality back to an earlier frame is then restored.
+         */
+        flash_detected = detect_flash(cpi, 0);
+
+        /* Update the motion related elements to the boost calculation */
+        accumulate_frame_motion_stats( cpi, &next_frame,
+            &this_frame_mv_in_out, &mv_in_out_accumulator,
+            &abs_mv_in_out_accumulator, &mv_ratio_accumulator );
+
+        /* Calculate a baseline boost number for this frame */
+        r = calc_frame_boost( cpi, &next_frame, this_frame_mv_in_out );
+
+        /* Cumulative effect of prediction quality decay */
+        if ( !flash_detected )
+        {
+            loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+            decay_accumulator = decay_accumulator * loop_decay_rate;
+            decay_accumulator =
+                decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+        }
+        boost_score += (decay_accumulator * r);
+
+        /* Break clause to detect very still sections after motion
+         * For example a static image after a fade or other transition.
+         */
+        if ( detect_transition_to_still( cpi, i, 5,
+                                         loop_decay_rate,
+                                         decay_accumulator ) )
+        {
+            allow_alt_ref = 0;
+            boost_score = old_boost_score;
+            break;
+        }
+
+        /* Break out conditions. */
+        if  (
+            /* Break at cpi->max_gf_interval unless almost totally static */
+            (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) ||
+            (
+                /* Dont break out with a very short interval */
+                (i > MIN_GF_INTERVAL) &&
+                /* Dont break out very close to a key frame */
+                ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
+                ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
+                (!flash_detected) &&
+                ((mv_ratio_accumulator > 100.0) ||
+                 (abs_mv_in_out_accumulator > 3.0) ||
+                 (mv_in_out_accumulator < -2.0) ||
+                 ((boost_score - old_boost_score) < 2.0))
+            ) )
+        {
+            boost_score = old_boost_score;
+            break;
+        }
+
+        memcpy(this_frame, &next_frame, sizeof(*this_frame));
+
+        old_boost_score = boost_score;
+    }
+
+    cpi->twopass.gf_decay_rate =
+        (i > 0) ? (int)(100.0 * (1.0 - decay_accumulator)) / i : 0;
+
+    /* When using CBR apply additional buffer related upper limits */
+    if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+    {
+        double max_boost;
+
+        /* For cbr apply buffer related limits */
+        if (cpi->drop_frames_allowed)
+        {
+            int64_t df_buffer_level = cpi->oxcf.drop_frames_water_mark *
+                                  (cpi->oxcf.optimal_buffer_level / 100);
+
+            if (cpi->buffer_level > df_buffer_level)
+                max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+            else
+                max_boost = 0.0;
+        }
+        else if (cpi->buffer_level > 0)
+        {
+            max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+        }
+        else
+        {
+            max_boost = 0.0;
+        }
+
+        if (boost_score > max_boost)
+            boost_score = max_boost;
+    }
+
+    /* Dont allow conventional gf too near the next kf */
+    if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)
+    {
+        while (i < cpi->twopass.frames_to_key)
+        {
+            i++;
+
+            if (EOF == input_stats(cpi, this_frame))
+                break;
+
+            if (i < cpi->twopass.frames_to_key)
+            {
+                mod_frame_err = calculate_modified_err(cpi, this_frame);
+                gf_group_err += mod_frame_err;
+            }
+        }
+    }
+
+    cpi->gfu_boost = (int)(boost_score * 100.0) >> 4;
+
+#if NEW_BOOST
+    /* Alternative boost calculation for alt ref */
+    alt_boost = calc_arf_boost( cpi, 0, (i-1), (i-1), &f_boost, &b_boost );
+#endif
+
+    /* Should we use the alternate reference frame */
+    if (allow_alt_ref &&
+        (i >= MIN_GF_INTERVAL) &&
+        /* dont use ARF very near next kf */
+        (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) &&
+#if NEW_BOOST
+        ((next_frame.pcnt_inter > 0.75) ||
+         (next_frame.pcnt_second_ref > 0.5)) &&
+        ((mv_in_out_accumulator / (double)i > -0.2) ||
+         (mv_in_out_accumulator > -2.0)) &&
+        (b_boost > 100) &&
+        (f_boost > 100) )
+#else
+        (next_frame.pcnt_inter > 0.75) &&
+        ((mv_in_out_accumulator / (double)i > -0.2) ||
+         (mv_in_out_accumulator > -2.0)) &&
+        (cpi->gfu_boost > 100) &&
+        (cpi->twopass.gf_decay_rate <=
+            (ARF_DECAY_THRESH + (cpi->gfu_boost / 200))) )
+#endif
+    {
+        int Boost;
+        int allocation_chunks;
+        int Q = (cpi->oxcf.fixed_q < 0)
+                ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+        int tmp_q;
+        int arf_frame_bits = 0;
+        int group_bits;
+
+#if NEW_BOOST
+        cpi->gfu_boost = alt_boost;
+#endif
+
+        /* Estimate the bits to be allocated to the group as a whole */
+        if ((cpi->twopass.kf_group_bits > 0) &&
+            (cpi->twopass.kf_group_error_left > 0))
+        {
+            group_bits = (int)((double)cpi->twopass.kf_group_bits *
+                (gf_group_err / (double)cpi->twopass.kf_group_error_left));
+        }
+        else
+            group_bits = 0;
+
+        /* Boost for arf frame */
+#if NEW_BOOST
+        Boost = (alt_boost * GFQ_ADJUSTMENT) / 100;
+#else
+        Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
+#endif
+        Boost += (i * 50);
+
+        /* Set max and minimum boost and hence minimum allocation */
+        if (Boost > ((cpi->baseline_gf_interval + 1) * 200))
+            Boost = ((cpi->baseline_gf_interval + 1) * 200);
+        else if (Boost < 125)
+            Boost = 125;
+
+        allocation_chunks = (i * 100) + Boost;
+
+        /* Normalize Altboost and allocation chunks down to prevent overflow */
+        while (Boost > 1000)
+        {
+            Boost /= 2;
+            allocation_chunks /= 2;
+        }
+
+        /* Calculate the number of bits to be spent on the arf based on the
+         * boost number
+         */
+        arf_frame_bits = (int)((double)Boost * (group_bits /
+                               (double)allocation_chunks));
+
+        /* Estimate if there are enough bits available to make worthwhile use
+         * of an arf.
+         */
+        tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits);
+
+        /* Only use an arf if it is likely we will be able to code
+         * it at a lower Q than the surrounding frames.
+         */
+        if (tmp_q < cpi->worst_quality)
+        {
+            int half_gf_int;
+            int frames_after_arf;
+            int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
+            int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
+
+            cpi->source_alt_ref_pending = 1;
+
+            /*
+             * For alt ref frames the error score for the end frame of the
+             * group (the alt ref frame) should not contribute to the group
+             * total and hence the number of bit allocated to the group.
+             * Rather it forms part of the next group (it is the GF at the
+             * start of the next group)
+             * gf_group_err -= mod_frame_err;
+             *
+             * For alt ref frames alt ref frame is technically part of the
+             * GF frame for the next group but we always base the error
+             * calculation and bit allocation on the current group of frames.
+             *
+             * Set the interval till the next gf or arf.
+             * For ARFs this is the number of frames to be coded before the
+             * future frame that is coded as an ARF.
+             * The future frame itself is part of the next group
+             */
+            cpi->baseline_gf_interval = i;
+
+            /*
+             * Define the arnr filter width for this group of frames:
+             * We only filter frames that lie within a distance of half
+             * the GF interval from the ARF frame. We also have to trap
+             * cases where the filter extends beyond the end of clip.
+             * Note: this_frame->frame has been updated in the loop
+             * so it now points at the ARF frame.
+             */
+            half_gf_int = cpi->baseline_gf_interval >> 1;
+            frames_after_arf = (int)(cpi->twopass.total_stats.count -
+                               this_frame->frame - 1);
+
+            switch (cpi->oxcf.arnr_type)
+            {
+            case 1: /* Backward filter */
+                frames_fwd = 0;
+                if (frames_bwd > half_gf_int)
+                    frames_bwd = half_gf_int;
+                break;
+
+            case 2: /* Forward filter */
+                if (frames_fwd > half_gf_int)
+                    frames_fwd = half_gf_int;
+                if (frames_fwd > frames_after_arf)
+                    frames_fwd = frames_after_arf;
+                frames_bwd = 0;
+                break;
+
+            case 3: /* Centered filter */
+            default:
+                frames_fwd >>= 1;
+                if (frames_fwd > frames_after_arf)
+                    frames_fwd = frames_after_arf;
+                if (frames_fwd > half_gf_int)
+                    frames_fwd = half_gf_int;
+
+                frames_bwd = frames_fwd;
+
+                /* For even length filter there is one more frame backward
+                 * than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+                 */
+                if (frames_bwd < half_gf_int)
+                    frames_bwd += (cpi->oxcf.arnr_max_frames+1) & 0x1;
+                break;
+            }
+
+            cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+        }
+        else
+        {
+            cpi->source_alt_ref_pending = 0;
+            cpi->baseline_gf_interval = i;
+        }
+    }
+    else
+    {
+        cpi->source_alt_ref_pending = 0;
+        cpi->baseline_gf_interval = i;
+    }
+
+    /*
+     * Now decide how many bits should be allocated to the GF group as  a
+     * proportion of those remaining in the kf group.
+     * The final key frame group in the clip is treated as a special case
+     * where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
+     * This is also important for short clips where there may only be one
+     * key frame.
+     */
+    if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
+                                            cpi->common.current_video_frame))
+    {
+        cpi->twopass.kf_group_bits =
+            (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
+    }
+
+    /* Calculate the bits to be allocated to the group as a whole */
+    if ((cpi->twopass.kf_group_bits > 0) &&
+        (cpi->twopass.kf_group_error_left > 0))
+    {
+        cpi->twopass.gf_group_bits =
+            (int64_t)(cpi->twopass.kf_group_bits *
+                      (gf_group_err / cpi->twopass.kf_group_error_left));
+    }
+    else
+        cpi->twopass.gf_group_bits = 0;
+
+    cpi->twopass.gf_group_bits =
+        (cpi->twopass.gf_group_bits < 0)
+            ? 0
+            : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
+                ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;
+
+    /* Clip cpi->twopass.gf_group_bits based on user supplied data rate
+     * variability limit (cpi->oxcf.two_pass_vbrmax_section)
+     */
+    if (cpi->twopass.gf_group_bits >
+        (int64_t)max_bits * cpi->baseline_gf_interval)
+        cpi->twopass.gf_group_bits =
+            (int64_t)max_bits * cpi->baseline_gf_interval;
+
+    /* Reset the file position */
+    reset_fpf_position(cpi, start_pos);
+
+    /* Update the record of error used so far (only done once per gf group) */
+    cpi->twopass.modified_error_used += gf_group_err;
+
+    /* Assign  bits to the arf or gf. */
+    for (i = 0; i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME); i++) {
+        int Boost;
+        int allocation_chunks;
+        int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+        int gf_bits;
+
+        /* For ARF frames */
+        if (cpi->source_alt_ref_pending && i == 0)
+        {
+#if NEW_BOOST
+            Boost = (alt_boost * GFQ_ADJUSTMENT) / 100;
+#else
+            Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
+#endif
+            Boost += (cpi->baseline_gf_interval * 50);
+
+            /* Set max and minimum boost and hence minimum allocation */
+            if (Boost > ((cpi->baseline_gf_interval + 1) * 200))
+                Boost = ((cpi->baseline_gf_interval + 1) * 200);
+            else if (Boost < 125)
+                Boost = 125;
+
+            allocation_chunks =
+                ((cpi->baseline_gf_interval + 1) * 100) + Boost;
+        }
+        /* Else for standard golden frames */
+        else
+        {
+            /* boost based on inter / intra ratio of subsequent frames */
+            Boost = (cpi->gfu_boost * GFQ_ADJUSTMENT) / 100;
+
+            /* Set max and minimum boost and hence minimum allocation */
+            if (Boost > (cpi->baseline_gf_interval * 150))
+                Boost = (cpi->baseline_gf_interval * 150);
+            else if (Boost < 125)
+                Boost = 125;
+
+            allocation_chunks =
+                (cpi->baseline_gf_interval * 100) + (Boost - 100);
+        }
+
+        /* Normalize Altboost and allocation chunks down to prevent overflow */
+        while (Boost > 1000)
+        {
+            Boost /= 2;
+            allocation_chunks /= 2;
+        }
+
+        /* Calculate the number of bits to be spent on the gf or arf based on
+         * the boost number
+         */
+        gf_bits = (int)((double)Boost *
+                        (cpi->twopass.gf_group_bits /
+                         (double)allocation_chunks));
+
+        /* If the frame that is to be boosted is simpler than the average for
+         * the gf/arf group then use an alternative calculation
+         * based on the error score of the frame itself
+         */
+        if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval)
+        {
+            double  alt_gf_grp_bits;
+            int     alt_gf_bits;
+
+            alt_gf_grp_bits =
+                (double)cpi->twopass.kf_group_bits  *
+                (mod_frame_err * (double)cpi->baseline_gf_interval) /
+                DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left);
+
+            alt_gf_bits = (int)((double)Boost * (alt_gf_grp_bits /
+                                                 (double)allocation_chunks));
+
+            if (gf_bits > alt_gf_bits)
+            {
+                gf_bits = alt_gf_bits;
+            }
+        }
+        /* Else if it is harder than other frames in the group make sure it at
+         * least receives an allocation in keeping with its relative error
+         * score, otherwise it may be worse off than an "un-boosted" frame
+         */
+        else
+        {
+            int alt_gf_bits =
+                (int)((double)cpi->twopass.kf_group_bits *
+                      mod_frame_err /
+                      DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left));
+
+            if (alt_gf_bits > gf_bits)
+            {
+                gf_bits = alt_gf_bits;
+            }
+        }
+
+        /* Apply an additional limit for CBR */
+        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+        {
+            if (cpi->twopass.gf_bits > (int)(cpi->buffer_level >> 1))
+                cpi->twopass.gf_bits = (int)(cpi->buffer_level >> 1);
+        }
+
+        /* Dont allow a negative value for gf_bits */
+        if (gf_bits < 0)
+            gf_bits = 0;
+
+        /* Add in minimum for a frame */
+        gf_bits += cpi->min_frame_bandwidth;
+
+        if (i == 0)
+        {
+            cpi->twopass.gf_bits = gf_bits;
+        }
+        if (i == 1 || (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)))
+        {
+            /* Per frame bit target for this frame */
+            cpi->per_frame_bandwidth = gf_bits;
+        }
+    }
+
+    {
+        /* Adjust KF group bits and error remaining */
+        cpi->twopass.kf_group_error_left -= (int64_t)gf_group_err;
+        cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
+
+        if (cpi->twopass.kf_group_bits < 0)
+            cpi->twopass.kf_group_bits = 0;
+
+        /* Note the error score left in the remaining frames of the group.
+         * For normal GFs we want to remove the error score for the first
+         * frame of the group (except in Key frame case where this has
+         * already happened)
+         */
+        if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME)
+            cpi->twopass.gf_group_error_left = (int)(gf_group_err -
+                                                     gf_first_frame_err);
+        else
+            cpi->twopass.gf_group_error_left = (int) gf_group_err;
+
+        cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - cpi->min_frame_bandwidth;
+
+        if (cpi->twopass.gf_group_bits < 0)
+            cpi->twopass.gf_group_bits = 0;
+
+        /* This condition could fail if there are two kfs very close together
+         * despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
+         * calculation of cpi->twopass.alt_extra_bits.
+         */
+        if ( cpi->baseline_gf_interval >= 3 )
+        {
+#if NEW_BOOST
+            int boost = (cpi->source_alt_ref_pending)
+                        ? b_boost : cpi->gfu_boost;
+#else
+            int boost = cpi->gfu_boost;
+#endif
+            if ( boost >= 150 )
+            {
+                int pct_extra;
+
+                pct_extra = (boost - 100) / 50;
+                pct_extra = (pct_extra > 20) ? 20 : pct_extra;
+
+                cpi->twopass.alt_extra_bits =
+                    (int)(cpi->twopass.gf_group_bits * pct_extra) / 100;
+                cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits;
+                cpi->twopass.alt_extra_bits /=
+                    ((cpi->baseline_gf_interval-1)>>1);
+            }
+            else
+                cpi->twopass.alt_extra_bits = 0;
+        }
+        else
+            cpi->twopass.alt_extra_bits = 0;
+    }
+
+    /* Adjustments based on a measure of complexity of the section */
+    if (cpi->common.frame_type != KEY_FRAME)
+    {
+        FIRSTPASS_STATS sectionstats;
+        double Ratio;
+
+        zero_stats(&sectionstats);
+        reset_fpf_position(cpi, start_pos);
+
+        for (i = 0 ; i < cpi->baseline_gf_interval ; i++)
+        {
+            input_stats(cpi, &next_frame);
+            accumulate_stats(&sectionstats, &next_frame);
+        }
+
+        avg_stats(&sectionstats);
+
+        cpi->twopass.section_intra_rating = (unsigned int)
+            (sectionstats.intra_error /
+            DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
+
+        Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+        cpi->twopass.section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025);
+
+        if (cpi->twopass.section_max_qfactor < 0.80)
+            cpi->twopass.section_max_qfactor = 0.80;
+
+        reset_fpf_position(cpi, start_pos);
+    }
+}
+
+/* Budget bits for an ordinary frame, i.e. not a gf, an arf or a key frame. */
+static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+    /* Upper bound on the size of any single frame */
+    const int frame_cap = frame_max_bits(cpi);
+
+    /* Modified prediction error that drives the bit allocation */
+    const double frame_err = calculate_modified_err(cpi, this_frame);
+
+    /* Fraction of the error still left in the GF group taken by this frame;
+     * zero once the group has no error left to distribute
+     */
+    const double share = (cpi->twopass.gf_group_error_left > 0)
+                         ? frame_err / cpi->twopass.gf_group_error_left
+                         : 0.0;
+
+    /* The same fraction of the bits still available to the GF group */
+    int bits = (int)((double)cpi->twopass.gf_group_bits * share);
+
+    /* Clamp to [0, frame_cap], and at the top end also to whatever the
+     * group still has (cpi->twopass.gf_group_bits)
+     */
+    if (bits >= 0)
+    {
+        bits = (bits > frame_cap) ? frame_cap : bits;
+
+        if (bits > cpi->twopass.gf_group_bits)
+            bits = (int)cpi->twopass.gf_group_bits;
+    }
+    else
+        bits = 0;
+
+    /* Charge this frame against the group's remaining bits ... */
+    cpi->twopass.gf_group_bits -= bits;
+
+    if (cpi->twopass.gf_group_bits < 0)
+        cpi->twopass.gf_group_bits = 0;
+
+    /* ... and against its remaining error */
+    cpi->twopass.gf_group_error_left -= (int)frame_err;
+
+    /* Every frame also gets the minimum allowance set aside for it */
+    bits += cpi->min_frame_bandwidth;
+
+    /* Odd values of frames_since_golden pick up a few extra bits while
+     * frames_till_gf_update_due is still positive
+     */
+    if ((cpi->frames_till_gf_update_due > 0) &&
+        (cpi->frames_since_golden & 0x01))
+    {
+        bits += cpi->twopass.alt_extra_bits;
+    }
+
+    /* Publish the per frame bit target */
+    cpi->per_frame_bandwidth = bits;
+}
+/* Per-frame second pass rate control: allocates bits and updates max Q. */
+void vp8_second_pass(VP8_COMP *cpi)
+{
+    int tmp_q;
+    int frames_left = (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);
+
+    FIRSTPASS_STATS this_frame = {0};
+    FIRSTPASS_STATS this_frame_copy;
+
+    double this_frame_intra_error;
+    double this_frame_coded_error;
+
+    int overhead_bits;
+    /* Nothing to do if no first pass stats are attached */
+    if (!cpi->twopass.stats_in)
+    {
+        return ;
+    }
+
+    vp8_clear_system_state();
+    /* Read the stats for this frame; give up at the end of the stats */
+    if (EOF == input_stats(cpi, &this_frame))
+        return;
+
+    this_frame_intra_error = this_frame.intra_error;
+    this_frame_coded_error = this_frame.coded_error;
+
+    /* Key frame and section processing: start a new KF group when due */
+    if (cpi->twopass.frames_to_key == 0)
+    {
+        /* Define next KF group and assign bits to it */
+        memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+        find_next_key_frame(cpi, &this_frame_copy);
+
+        /* Special case: error_resilient_mode does not make much sense for
+         * two pass, but with its current meaning this code is designed to
+         * stop outlandish behaviour if someone does set it when using two
+         * pass. It effectively disables GF groups by making one GF group
+         * span the whole KF group. This is temporary code until we decide
+         * what should really happen in this case.
+         */
+        if (cpi->oxcf.error_resilient_mode)
+        {
+            cpi->twopass.gf_group_bits = cpi->twopass.kf_group_bits;
+            cpi->twopass.gf_group_error_left =
+                                  (int)cpi->twopass.kf_group_error_left;
+            cpi->baseline_gf_interval = cpi->twopass.frames_to_key;
+            cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+            cpi->source_alt_ref_pending = 0;
+        }
+
+    }
+
+    /* Is this a GF / ARF (Note that a KF is always also a GF) */
+    if (cpi->frames_till_gf_update_due == 0)
+    {
+        /* Define next gf group and assign bits to it */
+        memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+        define_gf_group(cpi, &this_frame_copy);
+
+        /* If we are going to code an altref frame at the end of the group
+         * and the current frame is not a key frame.... If the previous
+         * group used an arf this frame has already benefited from that arf
+         * boost and it should not be given extra bits. If the previous
+         * group was NOT coded using arf we may want to apply some boost to
+         * this GF as well
+         */
+        if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))
+        {
+            /* Charge a standard frame's worth of bits to the GF group, but
+             * restore per_frame_bandwidth as set before the call
+             */
+            int bak = cpi->per_frame_bandwidth;
+            memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+            assign_std_frame_bits(cpi, &this_frame_copy);
+            cpi->per_frame_bandwidth = bak;
+        }
+    }
+
+    /* Otherwise this is an ordinary frame */
+    else
+    {
+        /* Special case: error_resilient_mode does not make much sense for
+         * two pass, but with its current meaning this code is designed to
+         * stop outlandish behaviour if someone does set it when using two
+         * pass. It effectively disables GF groups: the next GF update is
+         * held off until frames_to_key runs out. This is temporary code
+         * till we decide what should really happen in this case.
+         */
+        if (cpi->oxcf.error_resilient_mode)
+        {
+            cpi->frames_till_gf_update_due = cpi->twopass.frames_to_key;
+
+            if (cpi->common.frame_type != KEY_FRAME)
+            {
+                /* Assign bits from those allocated to the GF group */
+                memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+                assign_std_frame_bits(cpi, &this_frame_copy);
+            }
+        }
+        else
+        {
+            /* Assign bits from those allocated to the GF group */
+            memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+            assign_std_frame_bits(cpi, &this_frame_copy);
+        }
+    }
+
+    /* Keep a globally available copy of this and the next frame's iiratio. */
+    cpi->twopass.this_iiratio = (unsigned int)(this_frame_intra_error /
+                        DOUBLE_DIVIDE_CHECK(this_frame_coded_error));
+    {
+        FIRSTPASS_STATS next_frame;
+        if ( lookup_next_frame_stats(cpi, &next_frame) != EOF )
+        {
+            cpi->twopass.next_iiratio = (unsigned int)(next_frame.intra_error /
+                                DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+        }
+    }
+
+    /* Set nominal per second bandwidth for this frame */
+    cpi->target_bandwidth = (int)
+    (cpi->per_frame_bandwidth * cpi->output_framerate);
+    if (cpi->target_bandwidth < 0)
+        cpi->target_bandwidth = 0;
+
+
+    /* Account for mv, mode and other overheads. */
+    overhead_bits = (int)estimate_modemvcost(
+                        cpi, &cpi->twopass.total_left_stats );
+
+    /* Special case code for first frame. */
+    if (cpi->common.current_video_frame == 0)
+    {
+        cpi->twopass.est_max_qcorrection_factor = 1.0;
+
+        /* Set a cq_level in constrained quality mode. */
+        if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY )
+        {
+            int est_cq;
+
+            est_cq =
+                estimate_cq( cpi,
+                             &cpi->twopass.total_left_stats,
+                             (int)(cpi->twopass.bits_left / frames_left),
+                             overhead_bits );
+
+            cpi->cq_target_quality = cpi->oxcf.cq_level;
+            if ( est_cq > cpi->cq_target_quality )
+                cpi->cq_target_quality = est_cq;
+        }
+
+        /* guess at maxq needed in 2nd pass */
+        cpi->twopass.maxq_max_limit = cpi->worst_quality;
+        cpi->twopass.maxq_min_limit = cpi->best_quality;
+        /* NOTE(review): frames_left is not clamped on this path - confirm > 0 */
+        tmp_q = estimate_max_q(
+                    cpi,
+                    &cpi->twopass.total_left_stats,
+                    (int)(cpi->twopass.bits_left / frames_left),
+                    overhead_bits );
+
+        /* Limit the maxq value returned subsequently.
+         * This increases the risk of overspend or underspend if the initial
+         * estimate for the clip is bad, but helps prevent excessive
+         * variation in Q, especially near the end of a clip
+         * where for example a small overspend may cause Q to crash
+         */
+        cpi->twopass.maxq_max_limit = ((tmp_q + 32) < cpi->worst_quality)
+                                  ? (tmp_q + 32) : cpi->worst_quality;
+        cpi->twopass.maxq_min_limit = ((tmp_q - 32) > cpi->best_quality)
+                                  ? (tmp_q - 32) : cpi->best_quality;
+
+        cpi->active_worst_quality         = tmp_q;
+        cpi->ni_av_qi                     = tmp_q;
+    }
+
+    /* The last few frames of a clip almost always have too few or too many
+     * bits and for the sake of over exact rate control we don't want to make
+     * radical adjustments to the allowed quantizer range just to use up a
+     * few surplus bits or get beneath the target rate.
+     */
+    else if ( (cpi->common.current_video_frame <
+                 (((unsigned int)cpi->twopass.total_stats.count * 255)>>8)) &&
+              ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
+                 (unsigned int)cpi->twopass.total_stats.count) )
+    {
+        if (frames_left < 1)
+            frames_left = 1;
+        /* frames_left is now >= 1, so the division below is never by zero */
+        tmp_q = estimate_max_q(
+                    cpi,
+                    &cpi->twopass.total_left_stats,
+                    (int)(cpi->twopass.bits_left / frames_left),
+                    overhead_bits );
+
+        /* Move active_worst_quality but in a damped way */
+        if (tmp_q > cpi->active_worst_quality)
+            cpi->active_worst_quality ++;
+        else if (tmp_q < cpi->active_worst_quality)
+            cpi->active_worst_quality --;
+        /* ...then blend 3:1 with the new estimate (+2 rounds the /4) */
+        cpi->active_worst_quality =
+            ((cpi->active_worst_quality * 3) + tmp_q + 2) / 4;
+    }
+
+    cpi->twopass.frames_to_key --;
+
+    /* Update the total stats remaining structure */
+    subtract_stats(&cpi->twopass.total_left_stats, &this_frame );
+}
+
+/* Return 1 if this_frame is a viable key frame candidate, otherwise 0. */
+static int test_candidate_kf(VP8_COMP *cpi,  FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame)
+{
+    int is_viable_kf = 0;
+
+    /* Does the frame satisfy the primary criteria of a key frame?
+     * If so, then examine how well it predicts subsequent frames.
+     */
+    if ((this_frame->pcnt_second_ref < 0.10) &&
+        (next_frame->pcnt_second_ref < 0.10) &&
+        ((this_frame->pcnt_inter < 0.05) ||
+         (
+             ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .25) &&
+             ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+             ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
+              (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
+              ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
+             )
+         )
+        )
+       )
+    {
+        int i;
+        FIRSTPASS_STATS *start_pos;
+
+        FIRSTPASS_STATS local_next_frame;
+
+        double boost_score = 0.0;
+        double old_boost_score = 0.0;
+        double decay_accumulator = 1.0;
+        double next_iiratio;
+        /* Work on a private copy so the caller's next_frame is not modified */
+        memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+
+        /* Note the starting stats position so a rejection can reset to it */
+        start_pos = cpi->twopass.stats_in;
+
+        /* Examine how well the key frame predicts up to 16 later frames */
+        for (i = 0 ; i < 16; i++)
+        {
+            next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)) ;
+
+            if (next_iiratio > RMAX)
+                next_iiratio = RMAX;
+
+            /* Cumulative effect of decay in prediction quality */
+            if (local_next_frame.pcnt_inter > 0.85)
+                decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+            else
+                decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+
+            /* Keep a running total */
+            boost_score += (decay_accumulator * next_iiratio);
+
+            /* Test various breakout clauses */
+            if ((local_next_frame.pcnt_inter < 0.05) ||
+                (next_iiratio < 1.5) ||
+                (((local_next_frame.pcnt_inter -
+                   local_next_frame.pcnt_neutral) < 0.20) &&
+                 (next_iiratio < 3.0)) ||
+                ((boost_score - old_boost_score) < 0.5) ||
+                (local_next_frame.intra_error < 200)
+               )
+            {
+                break;
+            }
+
+            old_boost_score = boost_score;
+
+            /* Get the next frame details; stop at the end of the stats */
+            if (EOF == input_stats(cpi, &local_next_frame))
+                break;
+        }
+
+        /* Accept only when the look-ahead got beyond index 3 and the boost
+         * score exceeds 5.0; else discard this potential key frame
+         */
+        if (boost_score > 5.0 && (i > 3))
+            is_viable_kf = 1;
+        else
+        {
+            /* Reset the file position */
+            reset_fpf_position(cpi, start_pos);
+
+            is_viable_kf = 0;
+        }
+    }
+
+    return is_viable_kf;
+}
+static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+    int i,j;
+    FIRSTPASS_STATS last_frame;
+    FIRSTPASS_STATS first_frame;
+    FIRSTPASS_STATS next_frame;
+    FIRSTPASS_STATS *start_position;
+
+    double decay_accumulator = 1.0;
+    double boost_score = 0;
+    double old_boost_score = 0.0;
+    double loop_decay_rate;
+
+    double kf_mod_err = 0.0;
+    double kf_group_err = 0.0;
+    double kf_group_intra_err = 0.0;
+    double kf_group_coded_err = 0.0;
+    double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
+
+    memset(&next_frame, 0, sizeof(next_frame));
+
+    vp8_clear_system_state();
+    start_position = cpi->twopass.stats_in;
+
+    cpi->common.frame_type = KEY_FRAME;
+
+    /* is this a forced key frame by interval */
+    cpi->this_key_frame_forced = cpi->next_key_frame_forced;
+
+    /* Clear the alt ref active flag as this can never be active on a key
+     * frame
+     */
+    cpi->source_alt_ref_active = 0;
+
+    /* Kf is always a gf so clear frames till next gf counter */
+    cpi->frames_till_gf_update_due = 0;
+
+    cpi->twopass.frames_to_key = 1;
+
+    /* Take a copy of the initial frame details */
+    memcpy(&first_frame, this_frame, sizeof(*this_frame));
+
+    cpi->twopass.kf_group_bits = 0;
+    cpi->twopass.kf_group_error_left = 0;
+
+    kf_mod_err = calculate_modified_err(cpi, this_frame);
+
+    /* find the next keyframe */
+    i = 0;
+    while (cpi->twopass.stats_in < cpi->twopass.stats_in_end)
+    {
+        /* Accumulate kf group error */
+        kf_group_err += calculate_modified_err(cpi, this_frame);
+
+        /* These figures keep intra and coded error counts for all frames
+         * including key frames in the group. The effect of the key frame
+         * itself can be subtracted out using the first_frame data
+         * collected above
+         */
+        kf_group_intra_err += this_frame->intra_error;
+        kf_group_coded_err += this_frame->coded_error;
+
+        /* Load the next frame's stats. */
+        memcpy(&last_frame, this_frame, sizeof(*this_frame));
+        input_stats(cpi, this_frame);
+
+        /* Provided that we are not at the end of the file... */
+        if (cpi->oxcf.auto_key
+            && lookup_next_frame_stats(cpi, &next_frame) != EOF)
+        {
+            /* Normal scene cut check */
+            if ( ( i >= MIN_GF_INTERVAL ) &&
+                 test_candidate_kf(cpi, &last_frame, this_frame, &next_frame) )
+            {
+                break;
+            }
+
+            /* How fast is prediction quality decaying */
+            loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+            /* We want to know something about the recent past... rather than
+             * as used elsewhere where we are concerned with decay in prediction
+             * quality since the last GF or KF.
+             */
+            recent_loop_decay[i%8] = loop_decay_rate;
+            decay_accumulator = 1.0;
+            for (j = 0; j < 8; j++)
+            {
+                decay_accumulator = decay_accumulator * recent_loop_decay[j];
+            }
+
+            /* Special check for transition or high motion followed by a
+             * static scene.
+             */
+            if ( detect_transition_to_still( cpi, i,
+                                             (cpi->key_frame_frequency-i),
+                                             loop_decay_rate,
+                                             decay_accumulator ) )
+            {
+                break;
+            }
+
+
+            /* Step on to the next frame */
+            cpi->twopass.frames_to_key ++;
+
+            /* If we don't have a real key frame within the next two
+             * forcekeyframeevery intervals then break out of the loop.
+             */
+            if (cpi->twopass.frames_to_key >= 2 *(int)cpi->key_frame_frequency)
+                break;
+        } else
+            cpi->twopass.frames_to_key ++;
+
+        i++;
+    }
+
+    /* If there is a max kf interval set by the user we must obey it.
+     * We already breakout of the loop above at 2x max.
+     * This code centers the extra kf if the actual natural
+     * interval is between 1x and 2x
+     */
+    if (cpi->oxcf.auto_key
+        && cpi->twopass.frames_to_key > (int)cpi->key_frame_frequency )
+    {
+        FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in;
+        FIRSTPASS_STATS tmp_frame;
+
+        cpi->twopass.frames_to_key /= 2;
+
+        /* Copy first frame details */
+        memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+
+        /* Reset to the start of the group */
+        reset_fpf_position(cpi, start_position);
+
+        kf_group_err = 0;
+        kf_group_intra_err = 0;
+        kf_group_coded_err = 0;
+
+        /* Rescan to get the correct error data for the forced kf group */
+        for( i = 0; i < cpi->twopass.frames_to_key; i++ )
+        {
+            /* Accumulate kf group errors */
+            kf_group_err += calculate_modified_err(cpi, &tmp_frame);
+            kf_group_intra_err += tmp_frame.intra_error;
+            kf_group_coded_err += tmp_frame.coded_error;
+
+            /* Load the next frame's stats */
+            input_stats(cpi, &tmp_frame);
+        }
+
+        /* Reset to the start of the group */
+        reset_fpf_position(cpi, current_pos);
+
+        cpi->next_key_frame_forced = 1;
+    }
+    else
+        cpi->next_key_frame_forced = 0;
+
+    /* Special case for the last frame of the file */
+    if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+    {
+        /* Accumulate kf group error */
+        kf_group_err += calculate_modified_err(cpi, this_frame);
+
+        /* These figures keep intra and coded error counts for all frames
+         * including key frames in the group. The effect of the key frame
+         * itself can be subtracted out using the first_frame data
+         * collected above
+         */
+        kf_group_intra_err += this_frame->intra_error;
+        kf_group_coded_err += this_frame->coded_error;
+    }
+
+    /* Calculate the number of bits that should be assigned to the kf group. */
+    if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0))
+    {
+        /* Max for a single normal frame (not key frame) */
+        int max_bits = frame_max_bits(cpi);
+
+        /* Maximum bits for the kf group */
+        int64_t max_grp_bits;
+
+        /* Default allocation based on bits left and relative
+         * complexity of the section
+         */
+        cpi->twopass.kf_group_bits = (int64_t)( cpi->twopass.bits_left *
+                                          ( kf_group_err /
+                                            cpi->twopass.modified_error_left ));
+
+        /* Clip based on maximum per frame rate defined by the user. */
+        max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
+        if (cpi->twopass.kf_group_bits > max_grp_bits)
+            cpi->twopass.kf_group_bits = max_grp_bits;
+
+        /* Additional special case for CBR if buffer is getting full. */
+        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+        {
+            int64_t opt_buffer_lvl = cpi->oxcf.optimal_buffer_level;
+            int64_t buffer_lvl = cpi->buffer_level;
+
+            /* If the buffer is near or above the optimal and this kf group is
+             * not being allocated much then increase the allocation a bit.
+             */
+            if (buffer_lvl >= opt_buffer_lvl)
+            {
+                int64_t high_water_mark = (opt_buffer_lvl +
+                                       cpi->oxcf.maximum_buffer_size) >> 1;
+
+                int64_t av_group_bits;
+
+                /* Av bits per frame * number of frames */
+                av_group_bits = (int64_t)cpi->av_per_frame_bandwidth *
+                                (int64_t)cpi->twopass.frames_to_key;
+
+                /* We are at or above the maximum. */
+                if (cpi->buffer_level >= high_water_mark)
+                {
+                    int64_t min_group_bits;
+
+                    min_group_bits = av_group_bits +
+                                     (int64_t)(buffer_lvl -
+                                                 high_water_mark);
+
+                    if (cpi->twopass.kf_group_bits < min_group_bits)
+                        cpi->twopass.kf_group_bits = min_group_bits;
+                }
+                /* We are above optimal but below the maximum */
+                else if (cpi->twopass.kf_group_bits < av_group_bits)
+                {
+                    int64_t bits_below_av = av_group_bits -
+                                              cpi->twopass.kf_group_bits;
+
+                    cpi->twopass.kf_group_bits +=
+                       (int64_t)((double)bits_below_av *
+                                   (double)(buffer_lvl - opt_buffer_lvl) /
+                                   (double)(high_water_mark - opt_buffer_lvl));
+                }
+            }
+        }
+    }
+    else
+        cpi->twopass.kf_group_bits = 0;
+
+    /* Reset the first pass file position */
+    reset_fpf_position(cpi, start_position);
+
+    /* determine how big to make this keyframe based on how well the
+     * subsequent frames use inter blocks
+     */
+    decay_accumulator = 1.0;
+    boost_score = 0.0;
+
+    for (i = 0 ; i < cpi->twopass.frames_to_key ; i++)
+    {
+        double r;
+
+        if (EOF == input_stats(cpi, &next_frame))
+            break;
+
+        if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
+            r = (IIKFACTOR2 * next_frame.intra_error /
+                     DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+        else
+            r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
+                     DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+
+        if (r > RMAX)
+            r = RMAX;
+
+        /* How fast is prediction quality decaying */
+        loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+        decay_accumulator = decay_accumulator * loop_decay_rate;
+        decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+
+        boost_score += (decay_accumulator * r);
+
+        if ((i > MIN_GF_INTERVAL) &&
+            ((boost_score - old_boost_score) < 1.0))
+        {
+            break;
+        }
+
+        old_boost_score = boost_score;
+    }
+
+    if (1)
+    {
+        FIRSTPASS_STATS sectionstats;
+        double Ratio;
+
+        zero_stats(&sectionstats);
+        reset_fpf_position(cpi, start_position);
+
+        for (i = 0 ; i < cpi->twopass.frames_to_key ; i++)
+        {
+            input_stats(cpi, &next_frame);
+            accumulate_stats(&sectionstats, &next_frame);
+        }
+
+        avg_stats(&sectionstats);
+
+        cpi->twopass.section_intra_rating = (unsigned int)
+            (sectionstats.intra_error
+            / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
+
+        Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+        cpi->twopass.section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025);
+
+        if (cpi->twopass.section_max_qfactor < 0.80)
+            cpi->twopass.section_max_qfactor = 0.80;
+    }
+
+    /* When using CBR apply additional buffer fullness related upper limits */
+    if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+    {
+        double max_boost;
+
+        if (cpi->drop_frames_allowed)
+        {
+            int df_buffer_level = (int)(cpi->oxcf.drop_frames_water_mark
+                                  * (cpi->oxcf.optimal_buffer_level / 100));
+
+            if (cpi->buffer_level > df_buffer_level)
+                max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+            else
+                max_boost = 0.0;
+        }
+        else if (cpi->buffer_level > 0)
+        {
+            max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+        }
+        else
+        {
+            max_boost = 0.0;
+        }
+
+        if (boost_score > max_boost)
+            boost_score = max_boost;
+    }
+
+    /* Reset the first pass file position */
+    reset_fpf_position(cpi, start_position);
+
+    /* Work out how many bits to allocate for the key frame itself */
+    if (1)
+    {
+        int kf_boost = (int)boost_score;
+        int allocation_chunks;
+        int Counter = cpi->twopass.frames_to_key;
+        int alt_kf_bits;
+        YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
+        /* Min boost based on kf interval */
+#if 0
+
+        while ((kf_boost < 48) && (Counter > 0))
+        {
+            Counter -= 2;
+            kf_boost ++;
+        }
+
+#endif
+
+        if (kf_boost < 48)
+        {
+            kf_boost += ((Counter + 1) >> 1);
+
+            if (kf_boost > 48) kf_boost = 48;
+        }
+
+        /* bigger frame sizes need larger kf boosts, smaller frames smaller
+         * boosts...
+         */
+        if ((lst_yv12->y_width * lst_yv12->y_height) > (320 * 240))
+            kf_boost += 2 * (lst_yv12->y_width * lst_yv12->y_height) / (320 * 240);
+        else if ((lst_yv12->y_width * lst_yv12->y_height) < (320 * 240))
+            kf_boost -= 4 * (320 * 240) / (lst_yv12->y_width * lst_yv12->y_height);
+
+        /* Min KF boost */
+        kf_boost = (int)((double)kf_boost * 100.0) >> 4; /* Scale 16 to 100 */
+        if (kf_boost < 250)
+            kf_boost = 250;
+
+        /*
+         * We do three calculations for kf size.
+         * The first is based on the error score for the whole kf group.
+         * The second (optionally) on the key frame's own error if this is
+         * smaller than the average for the group.
+         * The final one insures that the frame receives at least the
+         * allocation it would have received based on its own error score vs
+         * the error score remaining
+         * Special case if the sequence appears almost totaly static
+         * as measured by the decay accumulator. In this case we want to
+         * spend almost all of the bits on the key frame.
+         * cpi->twopass.frames_to_key-1 because key frame itself is taken
+         * care of by kf_boost.
+         */
+        if ( decay_accumulator >= 0.99 )
+        {
+            allocation_chunks =
+                ((cpi->twopass.frames_to_key - 1) * 10) + kf_boost;
+        }
+        else
+        {
+            allocation_chunks =
+                ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost;
+        }
+
+        /* Normalize kf_boost and allocation chunks down to prevent overflow */
+        while (kf_boost > 1000)
+        {
+            kf_boost /= 2;
+            allocation_chunks /= 2;
+        }
+
+        cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
+
+        /* Calculate the number of bits to be spent on the key frame */
+        cpi->twopass.kf_bits  = (int)((double)kf_boost * ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
+
+        /* Apply an additional limit for CBR */
+        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+        {
+            if (cpi->twopass.kf_bits > (int)((3 * cpi->buffer_level) >> 2))
+                cpi->twopass.kf_bits = (int)((3 * cpi->buffer_level) >> 2);
+        }
+
+        /* If the key frame is actually easier than the average for the
+         * kf group (which does sometimes happen... eg a blank intro frame)
+         * Then use an alternate calculation based on the kf error score
+         * which should give a smaller key frame.
+         */
+        if (kf_mod_err < kf_group_err / cpi->twopass.frames_to_key)
+        {
+            double  alt_kf_grp_bits =
+                        ((double)cpi->twopass.bits_left *
+                         (kf_mod_err * (double)cpi->twopass.frames_to_key) /
+                         DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
+
+            alt_kf_bits = (int)((double)kf_boost *
+                                (alt_kf_grp_bits / (double)allocation_chunks));
+
+            if (cpi->twopass.kf_bits > alt_kf_bits)
+            {
+                cpi->twopass.kf_bits = alt_kf_bits;
+            }
+        }
+        /* Else if it is much harder than other frames in the group make sure
+         * it at least receives an allocation in keeping with its relative
+         * error score
+         */
+        else
+        {
+            alt_kf_bits =
+                (int)((double)cpi->twopass.bits_left *
+                      (kf_mod_err /
+                       DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
+
+            if (alt_kf_bits > cpi->twopass.kf_bits)
+            {
+                cpi->twopass.kf_bits = alt_kf_bits;
+            }
+        }
+
+        cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
+        /* Add in the minimum frame allowance */
+        cpi->twopass.kf_bits += cpi->min_frame_bandwidth;
+
+        /* Per frame bit target for this frame */
+        cpi->per_frame_bandwidth = cpi->twopass.kf_bits;
+
+        /* Convert to a per second bitrate */
+        cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
+                                      cpi->output_framerate);
+    }
+
+    /* Note the total error score of the kf group minus the key frame itself */
+    cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+    /* Adjust the count of total modified error left. The count of bits left
+     * is adjusted elsewhere based on real coded frame sizes
+     */
+    cpi->twopass.modified_error_left -= kf_group_err;
+
+    if (cpi->oxcf.allow_spatial_resampling)
+    {
+        int resample_trigger = 0;
+        int last_kf_resampled = 0;
+        int kf_q;
+        int scale_val = 0;
+        int hr, hs, vr, vs;
+        int new_width = cpi->oxcf.Width;
+        int new_height = cpi->oxcf.Height;
+
+        int projected_buffer_level;
+        int tmp_q;
+
+        double projected_bits_perframe;
+        double group_iiratio = (kf_group_intra_err - first_frame.intra_error) / (kf_group_coded_err - first_frame.coded_error);
+        double err_per_frame = kf_group_err / cpi->twopass.frames_to_key;
+        double bits_per_frame;
+        double av_bits_per_frame;
+        double effective_size_ratio;
+
+        if ((cpi->common.Width != cpi->oxcf.Width) || (cpi->common.Height != cpi->oxcf.Height))
+            last_kf_resampled = 1;
+
+        /* Set back to unscaled by defaults */
+        cpi->common.horiz_scale = NORMAL;
+        cpi->common.vert_scale = NORMAL;
+
+        /* Calculate Average bits per frame. */
+        av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate);
+
+        /* CBR... Use the clip average as the target for deciding resample */
+        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+        {
+            bits_per_frame = av_bits_per_frame;
+        }
+
+        /* In VBR we want to avoid downsampling in easy sections unless we
+         * are under extreme pressure, so use the larger of the target
+         * bitrate for this section or the average bitrate for the sequence.
+         */
+        else
+        {
+            /* This accounts for how hard the section is... */
+            bits_per_frame = (double)
+                (cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key);
+
+            /* Don't turn to resampling in easy sections just because they
+             * have been assigned a small number of bits
+             */
+            if (bits_per_frame < av_bits_per_frame)
+                bits_per_frame = av_bits_per_frame;
+        }
+
+        /* bits_per_frame should comply with our minimum */
+        if (bits_per_frame < (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100))
+            bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+        /* Work out if spatial resampling is necessary */
+        kf_q = estimate_kf_group_q(cpi, err_per_frame,
+                                  (int)bits_per_frame, group_iiratio);
+
+        /* If we project a required Q higher than the maximum allowed Q then
+         * make a guess at the actual size of frames in this section
+         */
+        projected_bits_perframe = bits_per_frame;
+        tmp_q = kf_q;
+
+        while (tmp_q > cpi->worst_quality)
+        {
+            projected_bits_perframe *= 1.04;
+            tmp_q--;
+        }
+
+        /* Guess at buffer level at the end of the section */
+        projected_buffer_level = (int)
+                    (cpi->buffer_level - (int)
+                    ((projected_bits_perframe - av_bits_per_frame) *
+                    cpi->twopass.frames_to_key));
+
+        if (0)
+        {
+            FILE *f = fopen("Subsamle.stt", "a");
+            fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n",  cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, cpi->common.vert_scale,  kf_group_err / cpi->twopass.frames_to_key, (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), new_height, new_width);
+            fclose(f);
+        }
+
+        /* The trigger for spatial resampling depends on the various
+         * parameters such as whether we are streaming (CBR) or VBR.
+         */
+        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+        {
+            /* Trigger resample if we are projected to fall below down
+             * sample level or resampled last time and are projected to
+             * remain below the up sample level
+             */
+            if ((projected_buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) ||
+                (last_kf_resampled && (projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))))
+                resample_trigger = 1;
+            else
+                resample_trigger = 0;
+        }
+        else
+        {
+            int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate));
+            int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
+
+            /* If triggered last time the threshold for triggering again is
+             * reduced:
+             *
+             * Projected Q higher than allowed and Overspend > 5% of total
+             * bits
+             */
+            if ((last_kf_resampled && (kf_q > cpi->worst_quality)) ||
+                ((kf_q > cpi->worst_quality) &&
+                 (over_spend > clip_bits / 20)))
+                resample_trigger = 1;
+            else
+                resample_trigger = 0;
+
+        }
+
+        if (resample_trigger)
+        {
+            while ((kf_q >= cpi->worst_quality) && (scale_val < 6))
+            {
+                scale_val ++;
+
+                cpi->common.vert_scale   = vscale_lookup[scale_val];
+                cpi->common.horiz_scale  = hscale_lookup[scale_val];
+
+                Scale2Ratio(cpi->common.horiz_scale, &hr, &hs);
+                Scale2Ratio(cpi->common.vert_scale, &vr, &vs);
+
+                new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
+                new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs;
+
+                /* Reducing the area to 1/4 does not reduce the complexity
+                 * (err_per_frame) to 1/4... effective_sizeratio attempts
+                 * to provide a crude correction for this
+                 */
+                effective_size_ratio = (double)(new_width * new_height) / (double)(cpi->oxcf.Width * cpi->oxcf.Height);
+                effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0;
+
+                /* Now try again and see what Q we get with the smaller
+                 * image size
+                 */
+                kf_q = estimate_kf_group_q(cpi,
+                                          err_per_frame * effective_size_ratio,
+                                          (int)bits_per_frame, group_iiratio);
+
+                if (0)
+                {
+                    FILE *f = fopen("Subsamle.stt", "a");
+                    fprintf(f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n",  kf_q, cpi->common.horiz_scale, cpi->common.vert_scale,  kf_group_err / cpi->twopass.frames_to_key, (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), new_height, new_width);
+                    fclose(f);
+                }
+            }
+        }
+
+        if ((cpi->common.Width != new_width) || (cpi->common.Height != new_height))
+        {
+            cpi->common.Width = new_width;
+            cpi->common.Height = new_height;
+            vp8_alloc_compressor_data(cpi);
+        }
+    }
+}
diff --git a/libs/libvpx/vp8/encoder/firstpass.h b/libs/libvpx/vp8/encoder/firstpass.h
new file mode 100644
index 0000000000..c409ebca8f
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/firstpass.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_FIRSTPASS_H_
+#define VP8_ENCODER_FIRSTPASS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void vp8_init_first_pass(VP8_COMP *cpi);
+extern void vp8_first_pass(VP8_COMP *cpi);
+extern void vp8_end_first_pass(VP8_COMP *cpi);
+
+extern void vp8_init_second_pass(VP8_COMP *cpi);
+extern void vp8_second_pass(VP8_COMP *cpi);
+extern void vp8_end_second_pass(VP8_COMP *cpi);
+
+extern size_t vp8_firstpass_stats_sz(unsigned int mb_count);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_FIRSTPASS_H_
diff --git a/libs/libvpx/vp8/encoder/lookahead.c b/libs/libvpx/vp8/encoder/lookahead.c
new file mode 100644
index 0000000000..6623385743
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/lookahead.c
@@ -0,0 +1,231 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "lookahead.h"
+#include "vp8/common/extend.h"
+
+#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY? 1 : 25)
+
+struct lookahead_ctx
+{
+    unsigned int max_sz;         /* Number of entries allocated in buf (queue capacity) */
+    unsigned int sz;             /* Number of buffers currently in the queue */
+    unsigned int read_idx;       /* Slot the next vp8_lookahead_pop() reads from */
+    unsigned int write_idx;      /* Slot the next vp8_lookahead_push() writes to */
+    struct lookahead_entry *buf; /* Array of max_sz entries, indexed circularly */
+};
+
+
+/* Fetch the entry stored at *idx, then advance *idx to the following slot */
+static struct lookahead_entry *
+pop(struct lookahead_ctx *ctx,
+    unsigned int         *idx)
+{
+    const unsigned int      slot = *idx;
+    struct lookahead_entry *entry = &ctx->buf[slot];
+
+    assert(slot < ctx->max_sz);
+
+    /* The queue is circular: stepping past the last slot wraps around. */
+    *idx = (slot + 1 >= ctx->max_sz) ? slot + 1 - ctx->max_sz : slot + 1;
+    return entry;
+}
+
+
+void
+vp8_lookahead_destroy(struct lookahead_ctx *ctx)
+{
+    unsigned int i;
+
+    /* Destroying a NULL context is a no-op. */
+    if(!ctx)
+        return;
+
+    /* Release every frame buffer; buf itself may never have been allocated. */
+    if(ctx->buf)
+        for(i = 0; i < ctx->max_sz; i++)
+            vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
+    free(ctx->buf);
+    free(ctx);
+}
+
+
+struct lookahead_ctx*
+vp8_lookahead_init(unsigned int width,
+                   unsigned int height,
+                   unsigned int depth)
+{
+    struct lookahead_ctx *ctx;
+    unsigned int n;
+
+    /* Bound the requested depth to the range [1, MAX_LAG_BUFFERS], then
+     * add one extra slot so that the last frame is kept in the queue.
+     */
+    depth = (depth < 1) ? 1 : (depth > MAX_LAG_BUFFERS) ? MAX_LAG_BUFFERS : depth;
+    depth++;
+
+    /* Round both dimensions up to a multiple of 16 */
+    width += 15;
+    width &= ~15;
+    height += 15;
+    height &= ~15;
+
+    /* Allocate the context; without it there is nothing to clean up */
+    ctx = calloc(1, sizeof(*ctx));
+    if(!ctx)
+        return NULL;
+
+    /* Allocate the entry array, then one frame buffer per entry */
+    ctx->max_sz = depth;
+    ctx->buf = calloc(depth, sizeof(*ctx->buf));
+    if(!ctx->buf)
+        goto bail;
+    for(n = 0; n < depth; n++)
+        if (vp8_yv12_alloc_frame_buffer(&ctx->buf[n].img,
+                                        width, height, VP8BORDERINPIXELS))
+            goto bail;
+    return ctx;
+bail:
+    vp8_lookahead_destroy(ctx);
+    return NULL;
+}
+
+
+int
+vp8_lookahead_push(struct lookahead_ctx *ctx,
+                   YV12_BUFFER_CONFIG   *src,
+                   int64_t               ts_start,
+                   int64_t               ts_end,
+                   unsigned int          flags,
+                   unsigned char        *active_map)
+{
+    struct lookahead_entry* buf;
+    int row, col, active_end;
+    int mb_rows = (src->y_height + 15) >> 4; /* height in 16-pixel units, rounded up */
+    int mb_cols = (src->y_width + 15) >> 4;  /* width in 16-pixel units, rounded up */
+    /* Fewer than two free slots: reject the frame and report it with 1. */
+    if(ctx->sz + 2 > ctx->max_sz)
+        return 1;
+    ctx->sz++;
+    buf = pop(ctx, &ctx->write_idx);
+    /* Only do this partial copy if the following conditions are all met:
+     * 1. Lookahead queue has a size of 1. NOTE(review): vp8_lookahead_init()
+     *    always yields max_sz >= 2, so this looks unreachable - confirm.
+     * 2. Active map is provided.
+     * 3. This is not a key frame, golden nor altref frame (flags == 0).
+     */
+    if (ctx->max_sz == 1 && active_map && !flags)
+    {
+        for (row = 0; row < mb_rows; ++row)
+        {
+            col = 0;
+
+            while (1)
+            {
+                /* Find the first active macroblock in this row. */
+                for (; col < mb_cols; ++col)
+                {
+                    if (active_map[col])
+                        break;
+                }
+
+                /* No more active macroblock in this row. */
+                if (col == mb_cols)
+                    break;
+
+                /* Find the end of active region in this row. */
+                active_end = col;
+
+                for (; active_end < mb_cols; ++active_end)
+                {
+                    if (!active_map[active_end])
+                        break;
+                }
+
+                /* Only copy this active region. */
+                vp8_copy_and_extend_frame_with_rect(src, &buf->img,
+                                                    row << 4,
+                                                    col << 4, 16,
+                                                    (active_end - col) << 4);
+
+                /* Start again from the end of this active region. */
+                col = active_end;
+            }
+
+            active_map += mb_cols; /* step to the next row of the map */
+        }
+    }
+    else
+    {
+        vp8_copy_and_extend_frame(src, &buf->img);
+    }
+    buf->ts_start = ts_start;
+    buf->ts_end = ts_end;
+    buf->flags = flags;
+    return 0; /* frame enqueued */
+}
+
+
+struct lookahead_entry*
+vp8_lookahead_pop(struct lookahead_ctx *ctx,
+                  int                   drain)
+{
+    assert(ctx != NULL);
+
+    /* Hand out nothing when the queue is empty, or when it does not yet
+     * hold max_sz - 1 entries and the caller did not ask to drain it.
+     */
+    if(!ctx->sz || (!drain && ctx->sz != ctx->max_sz - 1))
+        return NULL;
+    ctx->sz--;
+    return pop(ctx, &ctx->read_idx);
+}
+
+
+struct lookahead_entry*
+vp8_lookahead_peek(struct lookahead_ctx *ctx,
+                   unsigned int          index,
+                   int                   direction)
+{
+    unsigned int slot;
+
+    switch (direction)
+    {
+    case PEEK_FORWARD:
+        assert(index < ctx->max_sz - 1);
+
+        /* Only entries that are currently queued can be returned. */
+        if(index >= ctx->sz)
+            return NULL;
+
+        /* Count forward from the read position, wrapping at max_sz. */
+        slot = index + ctx->read_idx;
+        if(slot >= ctx->max_sz)
+            slot -= ctx->max_sz;
+        return ctx->buf + slot;
+
+    case PEEK_BACKWARD:
+        assert(index == 1);
+
+        /* Step back from the read position; slot 0 wraps to the last slot. */
+        slot = ctx->read_idx ? ctx->read_idx - index : ctx->max_sz - 1;
+        return ctx->buf + slot;
+    }
+    return NULL;
+}
+
+
+unsigned int
+vp8_lookahead_depth(struct lookahead_ctx *ctx)
+{
+    return ctx->sz; /* entries currently queued; ctx is dereferenced unchecked */
+}
diff --git a/libs/libvpx/vp8/encoder/lookahead.h b/libs/libvpx/vp8/encoder/lookahead.h
new file mode 100644
index 0000000000..cad68e639f
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/lookahead.h
@@ -0,0 +1,117 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP8_ENCODER_LOOKAHEAD_H_
+#define VP8_ENCODER_LOOKAHEAD_H_
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct lookahead_entry
+{
+    YV12_BUFFER_CONFIG  img;       /* Queue-owned copy of the source frame */
+    int64_t             ts_start;  /* Start timestamp given to vp8_lookahead_push() */
+    int64_t             ts_end;    /* End timestamp given to vp8_lookahead_push() */
+    unsigned int        flags;     /* Frame flags given to vp8_lookahead_push() */
+};
+
+
+struct lookahead_ctx;
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ *
+ * \return The new lookahead context, or NULL if any allocation fails
+ */
+struct lookahead_ctx* vp8_lookahead_init(unsigned int width,
+                                         unsigned int height,
+                                         unsigned int depth
+                                         );
+
+
+/**\brief Destroys the lookahead stage
+ *
+ */
+void vp8_lookahead_destroy(struct lookahead_ctx *ctx);
+
+
+/**\brief Enqueue a source buffer
+ *
+ * Copies the source image into the queue's next framebuffer, which has the
+ * expected stride/border. If active_map is non-NULL, flags is 0 and the
+ * queue was sized for a single buffer, only active macroblocks are copied.
+ *
+ * \param[in] ctx         Pointer to the lookahead context
+ * \param[in] src         Pointer to the image to enqueue
+ * \param[in] ts_start    Timestamp for the start of this frame
+ * \param[in] ts_end      Timestamp for the end of this frame
+ * \param[in] flags       Flags set on this frame
+ * \param[in] active_map  Map that specifies which macroblock is active
+ *
+ * \return 0 on success, 1 if the queue is full (nothing is enqueued)
+ */
+int
+vp8_lookahead_push(struct lookahead_ctx *ctx,
+                   YV12_BUFFER_CONFIG   *src,
+                   int64_t               ts_start,
+                   int64_t               ts_end,
+                   unsigned int          flags,
+                   unsigned char        *active_map);
+
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] drain     Flag indicating the buffer should be drained
+ *                      (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ *
+ */
+struct lookahead_entry*
+vp8_lookahead_pop(struct lookahead_ctx *ctx,
+                  int                   drain);
+
+
+#define PEEK_FORWARD   1
+#define PEEK_BACKWARD -1
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] index     Index of the frame to be returned, 0 == next frame
+ * \param[in] direction PEEK_FORWARD or PEEK_BACKWARD (index must then be 1)
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ */
+struct lookahead_entry*
+vp8_lookahead_peek(struct lookahead_ctx *ctx,
+                   unsigned int          index,
+                   int                   direction);
+
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ */
+unsigned int
+vp8_lookahead_depth(struct lookahead_ctx *ctx);
+
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_LOOKAHEAD_H_
diff --git a/libs/libvpx/vp8/encoder/mcomp.c b/libs/libvpx/vp8/encoder/mcomp.c
new file mode 100644
index 0000000000..768c764ceb
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/mcomp.c
@@ -0,0 +1,2033 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "onyx_int.h"
+#include "mcomp.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_config.h"
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
+#include "vp8/common/findnearmv.h"
+#include "vp8/common/common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#ifdef VP8_ENTROPY_STATS
+static int mv_ref_ct [31] [4] [2];
+static int mv_mode_cts [4] [2];
+#endif
+
+int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight)
+{
+    /* The cost tables reflect the vector distribution of the previous
+     * frame, so they tend to overstate the cost of a vector, and coding a
+     * new vector also affects later blocks. Scaling by "Weight" (in
+     * 1/128ths) lets the caller take some account of these factors.
+     */
+    const int row_cost = mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1];
+    const int col_cost = mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1];
+    return ((row_cost + col_cost) * Weight) >> 7;
+}
+
+static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int error_per_bit)
+{
+    /* Without a cost table the vector is not charged for at all. */
+    if (!mvcost)
+        return 0;
+    return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] +
+             mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1])
+             * error_per_bit + 128) >> 8;
+}
+
+static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvsadcost[2], int error_per_bit)
+{
+    /* Without a SAD cost table the vector is not charged for at all. */
+    if (!mvsadcost)
+        return 0;
+    /* SAD error cost, looked up on a full pixel basis (no shift applied). */
+    return ((mvsadcost[0][(mv->as_mv.row - ref->as_mv.row)] +
+             mvsadcost[1][(mv->as_mv.col - ref->as_mv.col)])
+            * error_per_bit + 128) >> 8;
+}
+
+void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride)
+{
+    int step;
+    int n = 0;
+
+    /* Site 0 is the search centre itself: zero displacement and a zero
+     * buffer offset.
+     */
+    x->ss[n].mv.col = 0;
+    x->ss[n].mv.row = 0;
+    x->ss[n].offset = 0;
+    n++;
+
+    /* Generate offsets for 4 search sites per step, starting at
+     * MAX_FIRST_STEP and halving the step until it reaches zero.
+     */
+    for (step = MAX_FIRST_STEP; step > 0; step /= 2)
+    {
+        /* Same column, row - step. */
+        x->ss[n].mv.col = 0;
+        x->ss[n].mv.row = -step;
+        x->ss[n].offset = -step * stride;
+        n++;
+
+        /* Same column, row + step. */
+        x->ss[n].mv.col = 0;
+        x->ss[n].mv.row = step;
+        x->ss[n].offset = step * stride;
+        n++;
+
+        /* Same row, column - step. */
+        x->ss[n].mv.col = -step;
+        x->ss[n].mv.row = 0;
+        x->ss[n].offset = -step;
+        n++;
+
+        /* Same row, column + step. */
+        x->ss[n].mv.col = step;
+        x->ss[n].mv.row = 0;
+        x->ss[n].offset = step;
+        n++;
+    }
+
+    /* Record how many sites were written and how many belong to a step. */
+    x->ss_count = n;
+    x->searches_per_step = 4;
+}
+
+void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
+{
+    int step;
+    int n = 0;
+
+    /* Site 0 is the search centre itself: zero displacement and a zero
+     * buffer offset.
+     */
+    x->ss[n].mv.col = 0;
+    x->ss[n].mv.row = 0;
+    x->ss[n].offset = 0;
+    n++;
+
+    /* Generate offsets for 8 search sites per step, starting at
+     * MAX_FIRST_STEP and halving the step until it reaches zero.
+     */
+    for (step = MAX_FIRST_STEP; step > 0; step /= 2)
+    {
+        /* Same column, row - step. */
+        x->ss[n].mv.col = 0;
+        x->ss[n].mv.row = -step;
+        x->ss[n].offset = -step * stride;
+        n++;
+
+        /* Same column, row + step. */
+        x->ss[n].mv.col = 0;
+        x->ss[n].mv.row = step;
+        x->ss[n].offset = step * stride;
+        n++;
+
+        /* Same row, column - step. */
+        x->ss[n].mv.col = -step;
+        x->ss[n].mv.row = 0;
+        x->ss[n].offset = -step;
+        n++;
+
+        /* Same row, column + step. */
+        x->ss[n].mv.col = step;
+        x->ss[n].mv.row = 0;
+        x->ss[n].offset = step;
+        n++;
+
+        /* Diagonal: row - step, column - step. */
+        x->ss[n].mv.col = -step;
+        x->ss[n].mv.row = -step;
+        x->ss[n].offset = -step * stride - step;
+        n++;
+
+        /* Diagonal: row - step, column + step. */
+        x->ss[n].mv.col = step;
+        x->ss[n].mv.row = -step;
+        x->ss[n].offset = -step * stride + step;
+        n++;
+
+        /* Diagonal: row + step, column - step. */
+        x->ss[n].mv.col = -step;
+        x->ss[n].mv.row = step;
+        x->ss[n].offset = step * stride - step;
+        n++;
+
+        /* Diagonal: row + step, column + step. */
+        x->ss[n].mv.col = step;
+        x->ss[n].mv.row = step;
+        x->ss[n].offset = step * stride + step;
+        n++;
+    }
+
+    /* Record how many sites were written and how many belong to a step. */
+    x->ss_count = n;
+    x->searches_per_step = 8;
+}
+
+/*
+ * To avoid the penalty for crossing cache-line read, preload the reference
+ * area in a small buffer, which is aligned to make sure there won't be crossing
+ * cache-line read while reading from this buffer. This reduced the cpu
+ * cycles spent on reading ref data in sub-pixel filter functions.
+ * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
+ * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
+ * could reduce the area.
+ */
+
+/* estimated cost of a motion vector (r,c); evaluates to 0 when mvcost is NULL */
+#define MVC(r,c) (mvcost ? ((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 : 0)
+/* pointer to predictor base of a motionvector */
+#define PRE(r,c) (y + (((r)>>2) * y_stride + ((c)>>2) -(offset)))
+/* convert motion vector component to offset for svf calc */
+#define SP(x) (((x)&3)<<1)
+/* calls the subpixel variance function at (r,c); also writes the local sse */
+#define DIST(r,c) vfp->svf( PRE(r,c), y_stride, SP(c),SP(r), z,b->src_stride,&sse)
+#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
+/* returns distortion + motion vector cost */
+#define ERR(r,c) (MVC(r,c)+DIST(r,c))
+/* updates the best point if (r,c) scores lower; out-of-range points set v to UINT_MAX */
+#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=UINT_MAX;)
+
+int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                                             int_mv *bestmv, int_mv *ref_mv,
+                                             int error_per_bit,
+                                             const vp8_variance_fn_ptr_t *vfp,
+                                             int *mvcost[2], int *distortion,
+                                             unsigned int *sse1)
+{
+    unsigned char *z = (*(b->base_src) + b->src);
+
+    int rr = ref_mv->as_mv.row >> 1, rc = ref_mv->as_mv.col >> 1;
+    int br = bestmv->as_mv.row * 4, bc = bestmv->as_mv.col * 4;
+    int tr = br, tc = bc;
+    unsigned int besterr;
+    unsigned int left, right, up, down, diag;
+    unsigned int sse;
+    unsigned int whichdir;
+    unsigned int halfiters = 4;
+    unsigned int quarteriters = 4;
+    int thismse;
+
+    int minc = VPXMAX(x->mv_col_min * 4,
+                      (ref_mv->as_mv.col >> 1) - ((1 << mvlong_width) - 1));
+    int maxc = VPXMIN(x->mv_col_max * 4,
+                      (ref_mv->as_mv.col >> 1) + ((1 << mvlong_width) - 1));
+    int minr = VPXMAX(x->mv_row_min * 4,
+                      (ref_mv->as_mv.row >> 1) - ((1 << mvlong_width) - 1));
+    int maxr = VPXMIN(x->mv_row_max * 4,
+                      (ref_mv->as_mv.row >> 1) + ((1 << mvlong_width) - 1));
+
+    int y_stride;
+    int offset;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+
+#if ARCH_X86 || ARCH_X86_64
+    MACROBLOCKD *xd = &x->e_mbd;
+    unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+    unsigned char *y;
+    int buf_r1, buf_r2, buf_c1;
+
+    /* Clamping to avoid out-of-range data access */
+    buf_r1 = ((bestmv->as_mv.row - 3) < x->mv_row_min)?(bestmv->as_mv.row - x->mv_row_min):3;
+    buf_r2 = ((bestmv->as_mv.row + 3) > x->mv_row_max)?(x->mv_row_max - bestmv->as_mv.row):3;
+    buf_c1 = ((bestmv->as_mv.col - 3) < x->mv_col_min)?(bestmv->as_mv.col - x->mv_col_min):3;
+    y_stride = 32;
+
+    /* Copy to intermediate buffer before searching. */
+    vfp->copymem(y_0 - buf_c1 - pre_stride*buf_r1, pre_stride, xd->y_buf, y_stride, 16+buf_r1+buf_r2);
+    y = xd->y_buf + y_stride*buf_r1 +buf_c1;
+#else
+    unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+    y_stride = pre_stride;
+#endif
+
+    offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+    /* central mv */
+    bestmv->as_mv.row *= 8;
+    bestmv->as_mv.col *= 8;
+
+    /* calculate central point error */
+    besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+    *distortion = besterr;
+    besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+    /* TODO: Each subsequent iteration checks at least one point in common
+     * with the last iteration could be 2 ( if diag selected)
+     */
+    while (--halfiters)
+    {
+        /* 1/2 pel */
+        CHECK_BETTER(left, tr, tc - 2);
+        CHECK_BETTER(right, tr, tc + 2);
+        CHECK_BETTER(up, tr - 2, tc);
+        CHECK_BETTER(down, tr + 2, tc);
+
+        whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+        switch (whichdir)
+        {
+        case 0:
+            CHECK_BETTER(diag, tr - 2, tc - 2);
+            break;
+        case 1:
+            CHECK_BETTER(diag, tr - 2, tc + 2);
+            break;
+        case 2:
+            CHECK_BETTER(diag, tr + 2, tc - 2);
+            break;
+        case 3:
+            CHECK_BETTER(diag, tr + 2, tc + 2);
+            break;
+        }
+
+        /* no reason to check the same one again. */
+        if (tr == br && tc == bc)
+            break;
+
+        tr = br;
+        tc = bc;
+    }
+
+    /* TODO: Each subsequent iteration checks at least one point in common
+     * with the last iteration could be 2 ( if diag selected)
+     */
+
+    /* 1/4 pel */
+    while (--quarteriters)
+    {
+        CHECK_BETTER(left, tr, tc - 1);
+        CHECK_BETTER(right, tr, tc + 1);
+        CHECK_BETTER(up, tr - 1, tc);
+        CHECK_BETTER(down, tr + 1, tc);
+
+        whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+        switch (whichdir)
+        {
+        case 0:
+            CHECK_BETTER(diag, tr - 1, tc - 1);
+            break;
+        case 1:
+            CHECK_BETTER(diag, tr - 1, tc + 1);
+            break;
+        case 2:
+            CHECK_BETTER(diag, tr + 1, tc - 1);
+            break;
+        case 3:
+            CHECK_BETTER(diag, tr + 1, tc + 1);
+            break;
+        }
+
+        /* no reason to check the same one again. */
+        if (tr == br && tc == bc)
+            break;
+
+        tr = br;
+        tc = bc;
+    }
+
+    bestmv->as_mv.row = br * 2;
+    bestmv->as_mv.col = bc * 2;
+
+    if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL<<3)) ||
+        (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL<<3)))
+        return INT_MAX;
+
+    return besterr;
+}
+#undef MVC
+#undef PRE
+#undef SP
+#undef DIST
+#undef IFMVCV
+#undef ERR
+#undef CHECK_BETTER /* dropped here because the name is defined again, without arguments, further down */
+
+/* Refine the full-pel vector in *bestmv to sub-pel precision: a half-pel pass
+ * followed by a quarter-pel pass, each testing left/right/up/down and then one
+ * diagonal.  *bestmv is rewritten in 1/8-pel units (full-pel * 8); *distortion and
+ * *sse1 follow the winner.  Returns the lowest (vfp result + mv_err_cost()) seen. */
+int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp8_variance_fn_ptr_t *vfp,
+                                 int *mvcost[2], int *distortion,
+                                 unsigned int *sse1)
+{
+    int bestmse = INT_MAX;
+    int_mv startmv;
+    int_mv this_mv;
+    unsigned char *z = (*(b->base_src) + b->src); /* source block being matched */
+    int left, right, up, down, diag;
+    unsigned int sse;
+    int whichdir;
+    int thismse;
+    int y_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+#if ARCH_X86 || ARCH_X86_64
+    MACROBLOCKD *xd = &x->e_mbd;
+    unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+    unsigned char *y;
+
+    y_stride = 32;
+    /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
+    vfp->copymem(y_0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18);
+    y = xd->y_buf + y_stride + 1; /* skip the extra row above and column to the left */
+#else
+    unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+    y_stride = pre_stride;
+#endif
+
+    /* central mv: switch to 1/8-pel units; this is also the half-pel origin */
+    bestmv->as_mv.row *= 8;
+    bestmv->as_mv.col *= 8;
+    startmv = *bestmv;
+
+    /* calculate central point error */
+    bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+    *distortion = bestmse;
+    bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+    /* go left then right and check error */
+    this_mv.as_mv.row = startmv.as_mv.row;
+    this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4); /* == col - 4, as col is a multiple of 8 */
+    thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+    left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (left < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = left;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    this_mv.as_mv.col += 8; /* col - 4 -> col + 4 */
+    thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+    right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (right < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = right;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    /* go up then down and check error */
+    this_mv.as_mv.col = startmv.as_mv.col;
+    this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4); /* == row - 4 */
+    thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+    up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (up < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = up;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    this_mv.as_mv.row += 8; /* row - 4 -> row + 4 */
+    thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+    down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (down < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = down;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    /* now check 1 more diagonal: bit 0 = right beat-or-tied left, bit 1 = down beat-or-tied up */
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+    this_mv = startmv;
+
+    switch (whichdir)
+    {
+    case 0: /* up-left */
+        this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+        this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+        thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+        break;
+    case 1: /* up-right */
+        this_mv.as_mv.col += 4;
+        this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+        thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+        break;
+    case 2: /* down-left */
+        this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+        this_mv.as_mv.row += 4;
+        thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+        break;
+    case 3: /* down-right */
+    default:
+        this_mv.as_mv.col += 4;
+        this_mv.as_mv.row += 4;
+        thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+        break;
+    }
+
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    /* time to check quarter pels. */
+    if (bestmv->as_mv.row < startmv.as_mv.row)
+        y -= y_stride; /* half-pel winner lies above the centre: rebase y one row up */
+
+    if (bestmv->as_mv.col < startmv.as_mv.col)
+        y--; /* likewise one column left when the winner lies to the left */
+
+    startmv = *bestmv; /* the half-pel winner becomes the quarter-pel origin */
+
+    /* go left then right and check error */
+    this_mv.as_mv.row = startmv.as_mv.row;
+
+    if (startmv.as_mv.col & 7) /* origin column already has a fractional part */
+    {
+        this_mv.as_mv.col = startmv.as_mv.col - 2;
+        thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+    }
+    else /* pel-aligned column: col - 2 is addressed as the previous pel plus 6/8 */
+    {
+        this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+        thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+    }
+
+    left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (left < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = left;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    this_mv.as_mv.col += 4; /* col - 2 -> col + 2 */
+    thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+    right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (right < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = right;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    /* go up then down and check error */
+    this_mv.as_mv.col = startmv.as_mv.col;
+
+    if (startmv.as_mv.row & 7) /* origin row already has a fractional part */
+    {
+        this_mv.as_mv.row = startmv.as_mv.row - 2;
+        thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+    }
+    else /* pel-aligned row: row - 2 is addressed as the previous row plus 6/8 */
+    {
+        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+        thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
+    }
+
+    up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (up < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = up;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    this_mv.as_mv.row += 4; /* row - 2 -> row + 2 */
+    thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+    down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (down < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = down;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    /* now check 1 more diagonal (same direction encoding as the half-pel pass) */
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    this_mv = startmv;
+
+    switch (whichdir)
+    {
+    case 0: /* up-left: either axis may need to borrow from the previous pel */
+
+        if (startmv.as_mv.row & 7)
+        {
+            this_mv.as_mv.row -= 2;
+
+            if (startmv.as_mv.col & 7)
+            {
+                this_mv.as_mv.col -= 2;
+                thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+            }
+            else
+            {
+                this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+                thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+            }
+        }
+        else
+        {
+            this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+
+            if (startmv.as_mv.col & 7)
+            {
+                this_mv.as_mv.col -= 2;
+                thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
+            }
+            else
+            {
+                this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+                thismse = vfp->svf(y - y_stride - 1, y_stride, 6, 6, z, b->src_stride, &sse);
+            }
+        }
+
+        break;
+    case 1: /* up-right */
+        this_mv.as_mv.col += 2;
+
+        if (startmv.as_mv.row & 7)
+        {
+            this_mv.as_mv.row -= 2;
+            thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+        }
+        else
+        {
+            this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+            thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
+        }
+
+        break;
+    case 2: /* down-left */
+        this_mv.as_mv.row += 2;
+
+        if (startmv.as_mv.col & 7)
+        {
+            this_mv.as_mv.col -= 2;
+            thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+        }
+        else
+        {
+            this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+            thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+        }
+
+        break;
+    case 3: /* down-right */
+    default: /* whichdir is always 0..3; the label keeps thismse provably assigned */
+        this_mv.as_mv.col += 2;
+        this_mv.as_mv.row += 2;
+        thismse = vfp->svf(y, y_stride,  this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+        break;
+    }
+
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    return bestmse;
+}
+
+int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                                  int_mv *bestmv, int_mv *ref_mv,
+                                  int error_per_bit,
+                                  const vp8_variance_fn_ptr_t *vfp,
+                                  int *mvcost[2], int *distortion,
+                                  unsigned int *sse1)
+{ /* Half-pel refinement of the full-pel *bestmv: centre, left/right/up/down, then one diagonal; returns the best vfp result + mv_err_cost(). */
+    int bestmse = INT_MAX;
+    int_mv startmv;
+    int_mv this_mv;
+    unsigned char *z = (*(b->base_src) + b->src);
+    int left, right, up, down, diag;
+    unsigned int sse;
+    int whichdir ;
+    int thismse;
+    int y_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+#if ARCH_X86 || ARCH_X86_64
+    MACROBLOCKD *xd = &x->e_mbd;
+    unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+    unsigned char *y;
+
+    y_stride = 32;
+    /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
+    vfp->copymem(y_0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18);
+    y = xd->y_buf + y_stride + 1; /* skip the extra row above and column to the left */
+#else
+    unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+    y_stride = pre_stride;
+#endif
+
+    /* central mv: switch *bestmv to 1/8-pel units; this is the search origin */
+    bestmv->as_mv.row *= 8;
+    bestmv->as_mv.col *= 8;
+    startmv = *bestmv;
+
+    /* calculate central point error */
+    bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+    *distortion = bestmse;
+    bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+    /* go left then right and check error */
+    this_mv.as_mv.row = startmv.as_mv.row;
+    this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4); /* == col - 4, as col is a multiple of 8 */
+    thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+    left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (left < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = left;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    this_mv.as_mv.col += 8; /* col - 4 -> col + 4 */
+    thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+    right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (right < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = right;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    /* go up then down and check error */
+    this_mv.as_mv.col = startmv.as_mv.col;
+    this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4); /* == row - 4 */
+    thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+    up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (up < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = up;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    this_mv.as_mv.row += 8; /* row - 4 -> row + 4 */
+    thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+    down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (down < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = down;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    /* now check 1 more diagonal: bit 0 = right beat-or-tied left, bit 1 = down beat-or-tied up */
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+    this_mv = startmv;
+
+    switch (whichdir)
+    {
+    case 0: /* up-left */
+        this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+        this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+        thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+        break;
+    case 1: /* up-right */
+        this_mv.as_mv.col += 4;
+        this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+        thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+        break;
+    case 2: /* down-left */
+        this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+        this_mv.as_mv.row += 4;
+        thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+        break;
+    case 3: /* down-right */
+    default:
+        this_mv.as_mv.col += 4;
+        this_mv.as_mv.row += 4;
+        thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+        break;
+    }
+
+    diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+        *distortion = thismse;
+        *sse1 = sse;
+    }
+
+    return bestmse;
+}
+
+#define CHECK_BOUNDS(range) \
+{\
+    /* all_in = 1 only if the whole +/-range window around (br, bc) is legal */\
+    all_in = (((br-range) >= x->mv_row_min) &&\
+              ((br+range) <= x->mv_row_max) &&\
+              ((bc-range) >= x->mv_col_min) &&\
+              ((bc+range) <= x->mv_col_max));\
+}
+
+#define CHECK_POINT \
+{\
+    /* skip (via continue) any candidate outside the legal MV window */\
+    if (this_mv.as_mv.col < x->mv_col_min || this_mv.as_mv.col > x->mv_col_max ||\
+        this_mv.as_mv.row < x->mv_row_min || this_mv.as_mv.row > x->mv_row_max)\
+        continue;\
+}
+
+#define CHECK_BETTER \
+{\
+    /* Cheap test first: the MV cost is added only once the raw SAD already\
+     * beats bestsad, and the candidate must still win with the cost included. */\
+    if (thissad < bestsad &&\
+        (thissad += mvsad_err_cost(&this_mv, &fcenter_mv,\
+                                   mvsadcost, sad_per_bit)) < bestsad)\
+    {\
+        bestsad = thissad;\
+        best_site = i;\
+    }\
+}
+
+static const MV next_chkpts[6][3] = /* row k: the three offsets vp8_hex_search tests after a move along its hex[k] */
+{
+    {{ -2, 0}, { -1, -2}, {1, -2}}, /* hex[5], hex[0], hex[1] */
+    {{ -1, -2}, {1, -2}, {2, 0}}, /* hex[0], hex[1], hex[2] */
+    {{1, -2}, {2, 0}, {1, 2}}, /* hex[1], hex[2], hex[3] */
+    {{2, 0}, {1, 2}, { -1, 2}}, /* hex[2], hex[3], hex[4] */
+    {{1, 2}, { -1, 2}, { -2, 0}}, /* hex[3], hex[4], hex[5] */
+    {{ -1, 2}, { -2, 0}, { -1, -2}} /* hex[4], hex[5], hex[0] */
+};
+
+int vp8_hex_search
+(
+    MACROBLOCK *x,
+    BLOCK *b,
+    BLOCKD *d,
+    int_mv *ref_mv,
+    int_mv *best_mv,
+    int search_param,
+    int sad_per_bit,
+    const vp8_variance_fn_ptr_t *vfp,
+    int *mvsadcost[2],
+    int *mvcost[2],
+    int_mv *center_mv
+)
+{ /* Hexagon-pattern SAD search from ref_mv (clamped in place); writes the best position to *best_mv and returns its SAD + mvsad_err_cost(). */
+    MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} } ;
+    MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}} ;
+    int i, j;
+
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+    int in_what_stride = pre_stride;
+    int br, bc; /* current best row / column */
+    int_mv this_mv;
+    unsigned int bestsad;
+    unsigned int thissad;
+    unsigned char *base_offset;
+    unsigned char *this_offset;
+    int k = -1; /* hex[] index of the last move; selects the next_chkpts[] row */
+    int all_in;
+    int best_site = -1; /* -1 means no candidate beat bestsad in the current round */
+    int hex_range = 127; /* upper bound on hex refinement rounds */
+    int dia_range = 8; /* upper bound on 1-away neighbour rounds */
+
+    int_mv fcenter_mv;
+    fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; /* presumably 1/8-pel -> full-pel; confirm center_mv units */
+    fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+    (void)mvcost; /* parameter is not used by this search */
+
+    /* adjust ref_mv to make sure it is within MV range */
+    vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+    br = ref_mv->as_mv.row;
+    bc = ref_mv->as_mv.col;
+
+    /* Work out the start point for the search */
+    base_offset = (unsigned char *)(base_pre + d->offset);
+    this_offset = base_offset + (br * (pre_stride)) + bc;
+    this_mv.as_mv.row = br;
+    this_mv.as_mv.col = bc;
+    bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride)
+            + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+#if CONFIG_MULTI_RES_ENCODING
+    /* Lower search range based on prediction info */
+    if (search_param >= 6) goto cal_neighbors; /* skip the hex stage entirely */
+    else if (search_param >= 5) hex_range = 4;
+    else if (search_param >= 4) hex_range = 6;
+    else if (search_param >= 3) hex_range = 15;
+    else if (search_param >= 2) hex_range = 31;
+    else if (search_param >= 1) hex_range = 63;
+
+    dia_range = 8; /* same value it was initialised with */
+#else
+    (void)search_param;
+#endif
+
+    /* hex search: first round tests all six hex[] points around the start */
+    CHECK_BOUNDS(2) /* sets all_in when everything within 2 of (br, bc) is inside the MV limits */
+
+    if(all_in)
+    {
+        for (i = 0; i < 6; i++)
+        {
+            this_mv.as_mv.row = br + hex[i].row;
+            this_mv.as_mv.col = bc + hex[i].col;
+            this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
+            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
+            CHECK_BETTER
+        }
+    }else
+    {
+        for (i = 0; i < 6; i++)
+        {
+            this_mv.as_mv.row = br + hex[i].row;
+            this_mv.as_mv.col = bc + hex[i].col;
+            CHECK_POINT /* continues past out-of-range candidates */
+            this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
+            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
+            CHECK_BETTER
+        }
+    }
+
+    if (best_site == -1) /* the start point is still the best: go straight to the neighbour stage */
+        goto cal_neighbors;
+    else
+    {
+        br += hex[best_site].row;
+        bc += hex[best_site].col;
+        k = best_site;
+    }
+
+    for (j = 1; j < hex_range; j++) /* later rounds test only the three points listed in next_chkpts[k] */
+    {
+        best_site = -1;
+        CHECK_BOUNDS(2)
+
+        if(all_in)
+        {
+            for (i = 0; i < 3; i++)
+            {
+                this_mv.as_mv.row = br + next_chkpts[k][i].row;
+                this_mv.as_mv.col = bc + next_chkpts[k][i].col;
+                this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
+                CHECK_BETTER
+            }
+        }else
+        {
+            for (i = 0; i < 3; i++)
+            {
+                this_mv.as_mv.row = br + next_chkpts[k][i].row;
+                this_mv.as_mv.col = bc + next_chkpts[k][i].col;
+                CHECK_POINT
+                this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
+                CHECK_BETTER
+            }
+        }
+
+        if (best_site == -1) /* no improvement this round: hex stage is done */
+            break;
+        else
+        {
+            br += next_chkpts[k][best_site].row;
+            bc += next_chkpts[k][best_site].col;
+            k += 5 + best_site; /* with the wrap below: k = (k - 1 + best_site) mod 6 */
+            if (k >= 12) k -= 12;
+            else if (k >= 6) k -= 6;
+        }
+    }
+
+    /* check 4 1-away neighbors */
+cal_neighbors:
+    for (j = 0; j < dia_range; j++)
+    {
+        best_site = -1;
+        CHECK_BOUNDS(1)
+
+        if(all_in)
+        {
+            for (i = 0; i < 4; i++)
+            {
+                this_mv.as_mv.row = br + neighbors[i].row;
+                this_mv.as_mv.col = bc + neighbors[i].col;
+                this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
+                CHECK_BETTER
+            }
+        }else
+        {
+            for (i = 0; i < 4; i++)
+            {
+                this_mv.as_mv.row = br + neighbors[i].row;
+                this_mv.as_mv.col = bc + neighbors[i].col;
+                CHECK_POINT
+                this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
+                CHECK_BETTER
+            }
+        }
+
+        if (best_site == -1) /* none of the four neighbours is better: stop */
+            break;
+        else
+        {
+            br += neighbors[best_site].row;
+            bc += neighbors[best_site].col;
+        }
+    }
+
+    best_mv->as_mv.row = br;
+    best_mv->as_mv.col = bc;
+
+    return bestsad; /* unsigned accumulator returned through the int return type */
+}
+#undef CHECK_BOUNDS /* the helper macros defined ahead of vp8_hex_search end here */
+#undef CHECK_POINT
+#undef CHECK_BETTER
+
+/* Diamond-pattern SAD search (one candidate at a time).  Starts at ref_mv, which is
+ * clamped in place, walks the x->ss[] site table from step search_param, writes the best
+ * position to *best_mv and counts in *num00 the steps that stayed on the start point. */
+int vp8_diamond_search_sad_c
+(
+    MACROBLOCK *x,
+    BLOCK *b,
+    BLOCKD *d,
+    int_mv *ref_mv,
+    int_mv *best_mv,
+    int search_param,
+    int sad_per_bit,
+    int *num00,
+    vp8_variance_fn_ptr_t *fn_ptr,
+    int *mvcost[2],
+    int_mv *center_mv
+)
+{
+    int i, j, step;
+
+    unsigned char *what = (*(b->base_src) + b->src); /* source block being matched */
+    int what_stride = b->src_stride;
+    unsigned char *in_what;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    int in_what_stride = pre_stride;
+    unsigned char *best_address;
+
+    int tot_steps;
+    int_mv this_mv;
+
+    unsigned int bestsad;
+    unsigned int thissad;
+    int best_site = 0;
+    int last_site = 0;
+
+    int ref_row, ref_col;
+    int this_row_offset, this_col_offset;
+    search_site *ss;
+
+    unsigned char *check_here;
+
+    int *mvsadcost[2];
+    int_mv fcenter_mv;
+
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
+    fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; /* scaled down to match the search's MV units */
+    fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+    vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+    ref_row = ref_mv->as_mv.row;
+    ref_col = ref_mv->as_mv.col;
+    *num00 = 0;
+    best_mv->as_mv.row = ref_row;
+    best_mv->as_mv.col = ref_col;
+
+    /* Work out the start point for the search */
+    in_what = (unsigned char *)(base_pre + d->offset + (ref_row * pre_stride) + ref_col);
+    best_address = in_what;
+
+    /* Check the starting position */
+    bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+            + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* search_param determines the length of the initial step and hence
+     * the number of iterations 0 = initial step (MAX_FIRST_STEP) pel :
+     * 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+     */
+    ss = &x->ss[search_param * x->searches_per_step];
+    tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+    i = 1; /* ss[0] is skipped; i keeps counting across all steps */
+
+    for (step = 0; step < tot_steps ; step++)
+    {
+        for (j = 0 ; j < x->searches_per_step ; j++)
+        {
+            /* Trap illegal vectors */
+            this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
+            this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+
+            if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+                (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+            {
+                check_here = ss[i].offset + best_address;
+                thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+
+                if (thissad < bestsad) /* add the MV cost only when the raw SAD already wins */
+                {
+                    this_mv.as_mv.row = this_row_offset;
+                    this_mv.as_mv.col = this_col_offset;
+                    thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                              mvsadcost, sad_per_bit);
+
+                    if (thissad < bestsad)
+                    {
+                        bestsad = thissad;
+                        best_site = i;
+                    }
+                }
+            }
+
+            i++;
+        }
+
+        if (best_site != last_site) /* this step found a better site: re-centre on it */
+        {
+            best_mv->as_mv.row += ss[best_site].mv.row;
+            best_mv->as_mv.col += ss[best_site].mv.col;
+            best_address += ss[best_site].offset;
+            last_site = best_site;
+        }
+        else if (best_address == in_what)
+            (*num00)++;
+    }
+
+    this_mv.as_mv.row = best_mv->as_mv.row * 8; /* multiply, not <<: the components can be negative */
+    this_mv.as_mv.col = best_mv->as_mv.col * 8;
+
+    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+int vp8_diamond_search_sadx4
+(
+    MACROBLOCK *x,
+    BLOCK *b,
+    BLOCKD *d,
+    int_mv *ref_mv,
+    int_mv *best_mv,
+    int search_param,
+    int sad_per_bit,
+    int *num00,
+    vp8_variance_fn_ptr_t *fn_ptr,
+    int *mvcost[2],
+    int_mv *center_mv
+)
+{ /* Diamond-pattern SAD search that evaluates four candidates per fn_ptr->sdx4df call whenever a whole step is in bounds. */
+    int i, j, step;
+
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    unsigned char *in_what;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    int in_what_stride = pre_stride;
+    unsigned char *best_address;
+
+    int tot_steps;
+    int_mv this_mv;
+
+    unsigned int bestsad;
+    unsigned int thissad;
+    int best_site = 0;
+    int last_site = 0;
+
+    int ref_row;
+    int ref_col;
+    int this_row_offset;
+    int this_col_offset;
+    search_site *ss;
+
+    unsigned char *check_here;
+
+    int *mvsadcost[2];
+    int_mv fcenter_mv;
+
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
+    fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; /* scaled down to match the search's MV units */
+    fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+    vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); /* modifies *ref_mv in place */
+    ref_row = ref_mv->as_mv.row;
+    ref_col = ref_mv->as_mv.col;
+    *num00 = 0;
+    best_mv->as_mv.row = ref_row;
+    best_mv->as_mv.col = ref_col;
+
+    /* Work out the start point for the search */
+    in_what = (unsigned char *)(base_pre + d->offset + (ref_row * pre_stride) + ref_col);
+    best_address = in_what;
+
+    /* Check the starting position */
+    bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+            + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* search_param determines the length of the initial step and hence the
+     * number of iterations 0 = initial step (MAX_FIRST_STEP) pel : 1 =
+     * (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+     */
+    ss = &x->ss[search_param * x->searches_per_step];
+    tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+    i = 1; /* ss[0] is skipped; i keeps counting across all steps */
+
+    for (step = 0; step < tot_steps ; step++)
+    {
+        int all_in = 1, t;
+
+        /* To know if all neighbor points are within the bounds, 4 bounds
+         * checks are enough instead of checking 4 bounds for each point.
+         * NOTE(review): relies on ss[i..i+3] being this step's extreme
+         * up/down/left/right sites - confirm against the site-table setup. */
+        all_in &= ((best_mv->as_mv.row + ss[i].mv.row)> x->mv_row_min);
+        all_in &= ((best_mv->as_mv.row + ss[i+1].mv.row) < x->mv_row_max);
+        all_in &= ((best_mv->as_mv.col + ss[i+2].mv.col) > x->mv_col_min);
+        all_in &= ((best_mv->as_mv.col + ss[i+3].mv.col) < x->mv_col_max);
+
+        if (all_in)
+        {
+            unsigned int sad_array[4];
+
+            for (j = 0 ; j < x->searches_per_step ; j += 4) /* four sites per batch */
+            {
+                const unsigned char *block_offset[4];
+
+                for (t = 0; t < 4; t++)
+                    block_offset[t] = ss[i+t].offset + best_address;
+
+                fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+
+                for (t = 0; t < 4; t++, i++) /* i advances with t, so ss[i] pairs with sad_array[t] */
+                {
+                    if (sad_array[t] < bestsad)
+                    {
+                        this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row;
+                        this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col;
+                        sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                                       mvsadcost, sad_per_bit);
+
+                        if (sad_array[t] < bestsad)
+                        {
+                            bestsad = sad_array[t];
+                            best_site = i;
+                        }
+                    }
+                }
+            }
+        }
+        else /* part of the step may be out of range: test each site on its own */
+        {
+            for (j = 0 ; j < x->searches_per_step ; j++)
+            {
+                /* Trap illegal vectors */
+                this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
+                this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+
+                if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+                (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+                {
+                    check_here = ss[i].offset + best_address;
+                    thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+
+                    if (thissad < bestsad)
+                    {
+                        this_mv.as_mv.row = this_row_offset;
+                        this_mv.as_mv.col = this_col_offset;
+                        thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                                  mvsadcost, sad_per_bit);
+
+                        if (thissad < bestsad)
+                        {
+                            bestsad = thissad;
+                            best_site = i;
+                        }
+                    }
+                }
+                i++;
+            }
+        }
+
+        if (best_site != last_site) /* this step found a better site: re-centre on it */
+        {
+            best_mv->as_mv.row += ss[best_site].mv.row;
+            best_mv->as_mv.col += ss[best_site].mv.col;
+            best_address += ss[best_site].offset;
+            last_site = best_site;
+        }
+        else if (best_address == in_what) /* no move this step and still on the start point */
+            (*num00)++;
+    }
+
+    this_mv.as_mv.row = best_mv->as_mv.row * 8; /* scaled up before being costed against center_mv */
+    this_mv.as_mv.col = best_mv->as_mv.col * 8;
+
+    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+/* Exhaustive search of the window ref_mv +/- distance (clamped to x->mv_*_min/max); the winner is stored in d->bmi.mv. */
+int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                        int sad_per_bit, int distance,
+                        vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+                        int_mv *center_mv)
+{
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    unsigned char *in_what;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    int in_what_stride = pre_stride;
+    int mv_stride = pre_stride;
+    unsigned char *bestaddress;
+    int_mv *best_mv = &d->bmi.mv;
+    int_mv this_mv;
+    unsigned int bestsad;
+    unsigned int thissad;
+    int r, c;
+
+    unsigned char *check_here;
+
+    int ref_row = ref_mv->as_mv.row;
+    int ref_col = ref_mv->as_mv.col;
+    /* Unclamped window; the loops below treat both upper bounds as exclusive. */
+    int row_min = ref_row - distance;
+    int row_max = ref_row + distance;
+    int col_min = ref_col - distance;
+    int col_max = ref_col + distance;
+
+    int *mvsadcost[2];
+    int_mv fcenter_mv;
+    /* Cost tables are taken from x; the cost centre is center_mv divided by 8 (>> 3). */
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
+    fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+    fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+    /* Work out the mid point for the search */
+    in_what = base_pre + d->offset;
+    bestaddress = in_what + (ref_row * pre_stride) + ref_col;
+
+    best_mv->as_mv.row = ref_row;
+    best_mv->as_mv.col = ref_col;
+
+    /* Baseline value at the centre */
+    bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
+            + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* Apply further limits to prevent us looking using vectors that
+     * stretch beyond the UMV border
+     */
+    if (col_min < x->mv_col_min)
+        col_min = x->mv_col_min;
+
+    if (col_max > x->mv_col_max)
+        col_max = x->mv_col_max;
+
+    if (row_min < x->mv_row_min)
+        row_min = x->mv_row_min;
+
+    if (row_max > x->mv_row_max)
+        row_max = x->mv_row_max;
+    /* Raster scan of the window: one fn_ptr->sdf call per candidate position. */
+    for (r = row_min; r < row_max ; r++)
+    {
+        this_mv.as_mv.row = r;
+        check_here = r * mv_stride + in_what + col_min;
+        /* check_here advances one pixel per column, in step with c. */
+        for (c = col_min; c < col_max; c++)
+        {
+            thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+
+            this_mv.as_mv.col = c;
+            thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                      mvsadcost, sad_per_bit);
+            /* Strict '<': on a tie the earlier candidate is kept. */
+            if (thissad < bestsad)
+            {
+                bestsad = thissad;
+                best_mv->as_mv.row = r;
+                best_mv->as_mv.col = c;
+                bestaddress = check_here;
+            }
+
+            check_here++;
+        }
+    }
+    /* Multiply, not "<< 3": left-shifting a negative component is undefined behaviour in C. */
+    this_mv.as_mv.row = best_mv->as_mv.row * 8;
+    this_mv.as_mv.col = best_mv->as_mv.col * 8;
+    /* Returns fn_ptr->vf at the best position (thissad receives its output argument) plus mv_err_cost. */
+    return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+/* Same exhaustive window search as vp8_full_search_sad_c, but scores three columns per fn_ptr->sdx3f call where possible. */
+int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                          int sad_per_bit, int distance,
+                          vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+                          int_mv *center_mv)
+{
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    unsigned char *in_what;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    int in_what_stride = pre_stride;
+    int mv_stride = pre_stride;
+    unsigned char *bestaddress;
+    int_mv *best_mv = &d->bmi.mv;
+    int_mv this_mv;
+    unsigned int bestsad;
+    unsigned int thissad;
+    int r, c;
+
+    unsigned char *check_here;
+
+    int ref_row = ref_mv->as_mv.row;
+    int ref_col = ref_mv->as_mv.col;
+    /* Unclamped window; the loops below treat both upper bounds as exclusive. */
+    int row_min = ref_row - distance;
+    int row_max = ref_row + distance;
+    int col_min = ref_col - distance;
+    int col_max = ref_col + distance;
+    /* Receives the three results written by each fn_ptr->sdx3f call. */
+    unsigned int sad_array[3];
+
+    int *mvsadcost[2];
+    int_mv fcenter_mv;
+    /* Cost tables are taken from x; the cost centre is center_mv divided by 8 (>> 3). */
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
+    fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+    fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+    /* Work out the mid point for the search */
+    in_what = base_pre + d->offset;
+    bestaddress = in_what + (ref_row * pre_stride) + ref_col;
+
+    best_mv->as_mv.row = ref_row;
+    best_mv->as_mv.col = ref_col;
+
+    /* Baseline value at the centre */
+    bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
+            + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* Apply further limits to prevent us looking using vectors that stretch
+     * beyond the UMV border
+     */
+    if (col_min < x->mv_col_min)
+        col_min = x->mv_col_min;
+
+    if (col_max > x->mv_col_max)
+        col_max = x->mv_col_max;
+
+    if (row_min < x->mv_row_min)
+        row_min = x->mv_row_min;
+
+    if (row_max > x->mv_row_max)
+        row_max = x->mv_row_max;
+
+    for (r = row_min; r < row_max ; r++)
+    {
+        this_mv.as_mv.row = r;
+        check_here = r * mv_stride + in_what + col_min;
+        c = col_min;
+        /* Batches of three columns while c, c + 1 and c + 2 are all below col_max. */
+        while ((c + 2) < col_max)
+        {
+            int i;
+
+            fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
+            /* sad_array[i] is consumed while check_here and c advance by one per entry. */
+            for (i = 0; i < 3; i++)
+            {
+                thissad = sad_array[i];
+                /* The vector cost is added only if the raw score already beats bestsad. */
+                if (thissad < bestsad)
+                {
+                    this_mv.as_mv.col = c;
+                    thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                              mvsadcost, sad_per_bit);
+
+                    if (thissad < bestsad)
+                    {
+                        bestsad = thissad;
+                        best_mv->as_mv.row = r;
+                        best_mv->as_mv.col = c;
+                        bestaddress = check_here;
+                    }
+                }
+
+                check_here++;
+                c++;
+            }
+        }
+        /* Leftover columns (fewer than three) are scored one at a time with fn_ptr->sdf. */
+        while (c < col_max)
+        {
+            thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+
+            if (thissad < bestsad)
+            {
+                this_mv.as_mv.col = c;
+                thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                          mvsadcost, sad_per_bit);
+
+                if (thissad < bestsad)
+                {
+                    bestsad = thissad;
+                    best_mv->as_mv.row = r;
+                    best_mv->as_mv.col = c;
+                    bestaddress = check_here;
+                }
+            }
+
+            check_here ++;
+            c ++;
+        }
+
+    }
+    /* Multiply, not "<< 3": left-shifting a negative component is undefined behaviour in C. */
+    this_mv.as_mv.row = best_mv->as_mv.row * 8;
+    this_mv.as_mv.col = best_mv->as_mv.col * 8;
+    /* Returns fn_ptr->vf at the best position (thissad receives its output argument) plus mv_err_cost. */
+    return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+/* Exhaustive window search like vp8_full_search_sadx3, with an extra first stage of eight columns per fn_ptr->sdx8f call. */
+int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                          int sad_per_bit, int distance,
+                          vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+                          int_mv *center_mv)
+{
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    unsigned char *in_what;
+    int in_what_stride = pre_stride;
+    int mv_stride = pre_stride;
+    unsigned char *bestaddress;
+    int_mv *best_mv = &d->bmi.mv;
+    int_mv this_mv;
+    unsigned int bestsad;
+    unsigned int thissad;
+    int r, c;
+
+    unsigned char *check_here;
+
+    int ref_row = ref_mv->as_mv.row;
+    int ref_col = ref_mv->as_mv.col;
+    /* Unclamped window; the loops below treat both upper bounds as exclusive. */
+    int row_min = ref_row - distance;
+    int row_max = ref_row + distance;
+    int col_min = ref_col - distance;
+    int col_max = ref_col + distance;
+
+    // TODO(johannkoenig): check if this alignment is necessary.
+    DECLARE_ALIGNED(16, unsigned int, sad_array8[8]);
+    unsigned int sad_array[3];
+
+    int *mvsadcost[2];
+    int_mv fcenter_mv;
+    /* Cost tables are taken from x; the cost centre is center_mv divided by 8 (>> 3). */
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
+    fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+    fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+    /* Work out the mid point for the search */
+    in_what = base_pre + d->offset;
+    bestaddress = in_what + (ref_row * pre_stride) + ref_col;
+
+    best_mv->as_mv.row = ref_row;
+    best_mv->as_mv.col = ref_col;
+
+    /* Baseline value at the centre */
+    bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
+            + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* Apply further limits to prevent us looking using vectors that stretch
+     * beyond the UMV border
+     */
+    if (col_min < x->mv_col_min)
+        col_min = x->mv_col_min;
+
+    if (col_max > x->mv_col_max)
+        col_max = x->mv_col_max;
+
+    if (row_min < x->mv_row_min)
+        row_min = x->mv_row_min;
+
+    if (row_max > x->mv_row_max)
+        row_max = x->mv_row_max;
+
+    for (r = row_min; r < row_max ; r++)
+    {
+        this_mv.as_mv.row = r;
+        check_here = r * mv_stride + in_what + col_min;
+        c = col_min;
+        /* Stage 1: batches of eight columns while c .. c + 7 are all below col_max. */
+        while ((c + 7) < col_max)
+        {
+            int i;
+
+            fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
+            /* sad_array8[i] is consumed while check_here and c advance by one per entry. */
+            for (i = 0; i < 8; i++)
+            {
+                thissad = sad_array8[i];
+                /* The vector cost is added only if the raw score already beats bestsad. */
+                if (thissad < bestsad)
+                {
+                    this_mv.as_mv.col = c;
+                    thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                              mvsadcost, sad_per_bit);
+
+                    if (thissad < bestsad)
+                    {
+                        bestsad = thissad;
+                        best_mv->as_mv.row = r;
+                        best_mv->as_mv.col = c;
+                        bestaddress = check_here;
+                    }
+                }
+
+                check_here++;
+                c++;
+            }
+        }
+        /* Stage 2: batches of three columns via fn_ptr->sdx3f. */
+        while ((c + 2) < col_max)
+        {
+            int i;
+
+            fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
+
+            for (i = 0; i < 3; i++)
+            {
+                thissad = sad_array[i];
+
+                if (thissad < bestsad)
+                {
+                    this_mv.as_mv.col = c;
+                    thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                        mvsadcost, sad_per_bit);
+
+                    if (thissad < bestsad)
+                    {
+                        bestsad = thissad;
+                        best_mv->as_mv.row = r;
+                        best_mv->as_mv.col = c;
+                        bestaddress = check_here;
+                    }
+                }
+
+                check_here++;
+                c++;
+            }
+        }
+        /* Stage 3: leftover columns (fewer than three) are scored one at a time with fn_ptr->sdf. */
+        while (c < col_max)
+        {
+            thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
+
+            if (thissad < bestsad)
+            {
+                this_mv.as_mv.col = c;
+                thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                    mvsadcost, sad_per_bit);
+
+                if (thissad < bestsad)
+                {
+                    bestsad = thissad;
+                    best_mv->as_mv.row = r;
+                    best_mv->as_mv.col = c;
+                    bestaddress = check_here;
+                }
+            }
+
+            check_here ++;
+            c ++;
+        }
+    }
+    /* The winning vector is scaled by 8 before mv_err_cost compares it with center_mv. */
+    this_mv.as_mv.row = best_mv->as_mv.row * 8;
+    this_mv.as_mv.col = best_mv->as_mv.col * 8;
+    /* Returns fn_ptr->vf at the best position (thissad receives its output argument) plus mv_err_cost. */
+    return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+/* Greedy refinement: up to search_range steps, each moving ref_mv (updated in place) to its best-scoring 4-neighbour. */
+int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                            int error_per_bit, int search_range,
+                            vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+                            int_mv *center_mv)
+{
+    MV neighbors[4] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}};
+    int i, j;
+    short this_row_offset, this_col_offset;
+
+    int what_stride = b->src_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    int in_what_stride = pre_stride;
+    unsigned char *what = (*(b->base_src) + b->src);
+    unsigned char *best_address = (unsigned char *)(base_pre + d->offset +
+        (ref_mv->as_mv.row * pre_stride) + ref_mv->as_mv.col);
+    unsigned char *check_here;
+    int_mv this_mv;
+    unsigned int bestsad;
+    unsigned int thissad;
+
+    int *mvsadcost[2];
+    int_mv fcenter_mv;
+    /* Cost tables are taken from x; the cost centre is center_mv divided by 8 (>> 3). */
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
+    fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+    fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+    /* Baseline: score of the starting vector, including its vector cost. */
+    bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride)
+            + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+    for (i=0; i<search_range; i++)
+    {
+        int best_site = -1;
+
+        for (j = 0 ; j < 4 ; j++)
+        {
+            this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+            this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+            /* Only candidates strictly inside the x->mv_* limits are evaluated. */
+            if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+            (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+            {
+                check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
+                thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
+                /* The vector cost is added only if the raw score already beats bestsad. */
+                if (thissad < bestsad)
+                {
+                    this_mv.as_mv.row = this_row_offset;
+                    this_mv.as_mv.col = this_col_offset;
+                    thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+                    if (thissad < bestsad)
+                    {
+                        bestsad = thissad;
+                        best_site = j;
+                    }
+                }
+            }
+        }
+        /* No neighbour improved on bestsad: stop early; otherwise step to the winner. */
+        if (best_site == -1)
+            break;
+        else
+        {
+            ref_mv->as_mv.row += neighbors[best_site].row;
+            ref_mv->as_mv.col += neighbors[best_site].col;
+            best_address += (neighbors[best_site].row)*in_what_stride + neighbors[best_site].col;
+        }
+    }
+    /* Multiply, not "<< 3": left-shifting a negative component is undefined behaviour in C. */
+    this_mv.as_mv.row = ref_mv->as_mv.row * 8;
+    this_mv.as_mv.col = ref_mv->as_mv.col * 8;
+    /* Returns fn_ptr->vf at the final position (thissad receives its output argument) plus mv_err_cost. */
+    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+/* Greedy refinement like vp8_refining_search_sad_c; scores all four neighbours with one fn_ptr->sdx4df call when all are in range. */
+int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                              int_mv *ref_mv, int error_per_bit,
+                              int search_range, vp8_variance_fn_ptr_t *fn_ptr,
+                              int *mvcost[2], int_mv *center_mv)
+{
+    MV neighbors[4] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}};
+    int i, j;
+    short this_row_offset, this_col_offset;
+
+    int what_stride = b->src_stride;
+    int pre_stride = x->e_mbd.pre.y_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    int in_what_stride = pre_stride;
+    unsigned char *what = (*(b->base_src) + b->src);
+    unsigned char *best_address = (unsigned char *)(base_pre + d->offset +
+        (ref_mv->as_mv.row * pre_stride) + ref_mv->as_mv.col);
+    unsigned char *check_here;
+    int_mv this_mv;
+    unsigned int bestsad;
+    unsigned int thissad;
+
+    int *mvsadcost[2];
+    int_mv fcenter_mv;
+    /* Cost tables are taken from x; the cost centre is center_mv divided by 8 (>> 3). */
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
+    fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+    fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+    /* Baseline: score of the starting vector, including its vector cost. */
+    bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride)
+            + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+    for (i=0; i<search_range; i++)
+    {
+        int best_site = -1;
+        int all_in = 1;
+        /* all_in stays 1 only if all four neighbours are strictly inside the x->mv_* limits. */
+        all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min);
+        all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max);
+        all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min);
+        all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max);
+
+        if(all_in)
+        {
+            unsigned int sad_array[4];
+            const unsigned char *block_offset[4];
+            block_offset[0] = best_address - in_what_stride;
+            block_offset[1] = best_address - 1;
+            block_offset[2] = best_address + 1;
+            block_offset[3] = best_address + in_what_stride;
+            /* block_offset[] is ordered like neighbors[], so index j addresses the same candidate in both. */
+            fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+
+            for (j = 0; j < 4; j++)
+            {
+                if (sad_array[j] < bestsad)
+                {
+                    this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row;
+                    this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col;
+                    sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+                    if (sad_array[j] < bestsad)
+                    {
+                        bestsad = sad_array[j];
+                        best_site = j;
+                    }
+                }
+            }
+        }
+        else
+        {
+            for (j = 0 ; j < 4 ; j++)
+            {
+                this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+                this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+                /* Near a limit: test each neighbour individually and skip those out of range. */
+                if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+                (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+                {
+                    check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
+                    thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
+
+                    if (thissad < bestsad)
+                    {
+                        this_mv.as_mv.row = this_row_offset;
+                        this_mv.as_mv.col = this_col_offset;
+                        thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+                        if (thissad < bestsad)
+                        {
+                            bestsad = thissad;
+                            best_site = j;
+                        }
+                    }
+                }
+            }
+        }
+        /* No neighbour improved on bestsad: stop early; otherwise step to the winner. */
+        if (best_site == -1)
+            break;
+        else
+        {
+            ref_mv->as_mv.row += neighbors[best_site].row;
+            ref_mv->as_mv.col += neighbors[best_site].col;
+            best_address += (neighbors[best_site].row)*in_what_stride + neighbors[best_site].col;
+        }
+    }
+    /* The final vector is scaled by 8 before mv_err_cost compares it with center_mv. */
+    this_mv.as_mv.row = ref_mv->as_mv.row * 8;
+    this_mv.as_mv.col = ref_mv->as_mv.col * 8;
+    /* Returns fn_ptr->vf at the final position (thissad receives its output argument) plus mv_err_cost. */
+    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+#ifdef VP8_ENTROPY_STATS
+/* Debug helper (compiled only under VP8_ENTROPY_STATS).
+ * Writes "modecont.c" via fopen(), containing C source for a table
+ * vp8_mode_contexts[6][4]. Entry [j][i] is 256 * n0 / (n0 + n1), where n0 and
+ * n1 are mv_ref_ct[j][i][0] and mv_ref_ct[j][i][1]; 128 is written when both
+ * are zero, and a computed 0 is raised to 1.
+ * Returns without writing anything if the file cannot be opened.
+ */
+void print_mode_context(void)
+{
+    FILE *f = fopen("modecont.c", "w");
+    int i, j;
+
+    /* fopen() returns NULL on failure; do not hand that to fprintf(). */
+    if (f == NULL)
+        return;
+
+    fprintf(f, "#include \"entropy.h\"\n");
+    fprintf(f, "const int vp8_mode_contexts[6][4] =\n");
+    fprintf(f, "{\n");
+
+    for (j = 0; j < 6; j++)
+    {
+        fprintf(f, "  { /* %d */\n", j);
+        fprintf(f, "    ");
+
+        for (i = 0; i < 4; i++)
+        {
+            int this_prob;
+            int count;
+
+            /* context probs (only mv_ref_ct is dumped; mv_mode_cts is not
+             * read by this function) */
+            count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
+
+            if (count)
+                this_prob = 256 * mv_ref_ct[j][i][0] / count;
+            else
+                this_prob = 128;
+
+            if (this_prob == 0)
+                this_prob = 1;
+
+            fprintf(f, "%5d, ", this_prob);
+        }
+
+        fprintf(f, "  },\n");
+    }
+
+    fprintf(f, "};\n");
+    fclose(f);
+}
+
+/* MV ref count VP8_ENTROPY_STATS stats code: init_mv_ref_counts() zeroes both tables, accum_mv_refs() updates them */
+#ifdef VP8_ENTROPY_STATS
+void init_mv_ref_counts()
+{
+    memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
+    memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
+}
+/* Tallies mode m along the chain ZEROMV, NEARESTMV, NEARMV, NEWMV: slot [0] at the level that matches, slot [1] at each level passed. */
+void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4])
+{
+    if (m == ZEROMV)
+    {
+        ++mv_ref_ct [ct[0]] [0] [0];
+        ++mv_mode_cts[0][0];
+    }
+    else
+    {
+        ++mv_ref_ct [ct[0]] [0] [1];
+        ++mv_mode_cts[0][1];
+        /* Level 1: ct[1] selects the first index of mv_ref_ct, as ct[k] does at every level k. */
+        if (m == NEARESTMV)
+        {
+            ++mv_ref_ct [ct[1]] [1] [0];
+            ++mv_mode_cts[1][0];
+        }
+        else
+        {
+            ++mv_ref_ct [ct[1]] [1] [1];
+            ++mv_mode_cts[1][1];
+            /* Level 2: NEARMV versus the remaining modes. */
+            if (m == NEARMV)
+            {
+                ++mv_ref_ct [ct[2]] [2] [0];
+                ++mv_mode_cts[2][0];
+            }
+            else
+            {
+                ++mv_ref_ct [ct[2]] [2] [1];
+                ++mv_mode_cts[2][1];
+                /* Level 3: NEWMV versus any other mode; the chain ends here. */
+                if (m == NEWMV)
+                {
+                    ++mv_ref_ct [ct[3]] [3] [0];
+                    ++mv_mode_cts[3][0];
+                }
+                else
+                {
+                    ++mv_ref_ct [ct[3]] [3] [1];
+                    ++mv_mode_cts[3][1];
+                }
+            }
+        }
+    }
+}
+
+#endif/* END MV ref count VP8_ENTROPY_STATS stats code */
+
+#endif
diff --git a/libs/libvpx/vp8/encoder/mcomp.h b/libs/libvpx/vp8/encoder/mcomp.h
new file mode 100644
index 0000000000..1694af819e
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/mcomp.h
@@ -0,0 +1,115 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_MCOMP_H_
+#define VP8_ENCODER_MCOMP_H_
+
+#include "block.h"
+#include "vpx_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Statistics hooks, declared only when VP8_ENTROPY_STATS is defined. */
+#ifdef VP8_ENTROPY_STATS
+extern void init_mv_ref_counts();
+extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
+#endif
+
+
+/* The maximum number of steps in a step search given the largest allowed
+ * initial step
+ */
+#define MAX_MVSEARCH_STEPS 8
+
+/* Max full pel mv specified in 1 pel units: (1 << 8) - 1 = 255 */
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
+
+/* Maximum size of the first step in full pel units: 1 << 7 = 128 */
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
+/* NOTE(review): print_mode_context is declared unconditionally here; confirm a definition exists without VP8_ENTROPY_STATS. */
+extern void print_mode_context(void);
+extern int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight);
+extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride);
+extern void vp8_init3smotion_compensation(MACROBLOCK *x,  int stride);
+
+/* Unlike the *_fn_t pointer types below, this takes a const function table and an explicit mvsadcost argument. */
+extern int vp8_hex_search
+(
+    MACROBLOCK *x,
+    BLOCK *b,
+    BLOCKD *d,
+    int_mv *ref_mv,
+    int_mv *best_mv,
+    int search_param,
+    int error_per_bit,
+    const vp8_variance_fn_ptr_t *vf,
+    int *mvsadcost[2],
+    int *mvcost[2],
+    int_mv *center_mv
+);
+/* Function type (not a pointer type) shared by the four fractional-step routines declared after it. */
+typedef int (fractional_mv_step_fp)
+    (MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv,
+     int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2],
+     int *distortion, unsigned int *sse);
+
+extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
+extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;
+extern fractional_mv_step_fp vp8_find_best_half_pixel_step;
+extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
+/* Pointer type whose parameter list matches the vp8_full_search_sad* functions. */
+typedef int (*vp8_full_search_fn_t)
+    (
+     MACROBLOCK *x,
+     BLOCK *b,
+     BLOCKD *d,
+     int_mv *ref_mv,
+     int sad_per_bit,
+     int distance,
+     vp8_variance_fn_ptr_t *fn_ptr,
+     int *mvcost[2],
+     int_mv *center_mv
+    );
+/* Same parameter types as vp8_full_search_fn_t; kept as a separate name for the refining searches. */
+typedef int (*vp8_refining_search_fn_t)
+    (
+     MACROBLOCK *x,
+     BLOCK *b,
+     BLOCKD *d,
+     int_mv *ref_mv,
+     int sad_per_bit,
+     int distance,
+     vp8_variance_fn_ptr_t *fn_ptr,
+     int *mvcost[2],
+     int_mv *center_mv
+    );
+/* Pointer type for the diamond searches: adds best_mv, search_param and num00 to the parameters used above. */
+typedef int (*vp8_diamond_search_fn_t)
+    (
+     MACROBLOCK *x,
+     BLOCK *b,
+     BLOCKD *d,
+     int_mv *ref_mv,
+     int_mv *best_mv,
+     int search_param,
+     int sad_per_bit,
+     int *num00,
+     vp8_variance_fn_ptr_t *fn_ptr,
+     int *mvcost[2],
+     int_mv *center_mv
+    );
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_MCOMP_H_
diff --git a/libs/libvpx/vp8/encoder/mips/msa/dct_msa.c b/libs/libvpx/vp8/encoder/mips/msa/dct_msa.c
new file mode 100644
index 0000000000..be61ffa0db
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/mips/msa/dct_msa.c
@@ -0,0 +1,199 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+/* NOTE(review): presumably transposes 4x4 halfword blocks held in in0..in3 into out0..out3; built on helpers from vp8_macros_msa.h - confirm. */
+#define TRANSPOSE4x4_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                   \
+    v8i16 s0_m, s1_m, tp0_m, tp1_m, tp2_m, tp3_m;                   \
+                                                                    \
+    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                     \
+    ILVRL_H2_SH(s1_m, s0_m, tp0_m, tp1_m);                          \
+    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                     \
+    ILVRL_H2_SH(s1_m, s0_m, tp2_m, tp3_m);                          \
+    PCKEV_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out0, out2);            \
+    PCKOD_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out1, out3);            \
+}
+/* Fills const1/const2 from lanes val0..val2 of coeff (SPLATI_H3_SH, then ILVEV_H2_SH); presumably operand pairs for the DPADD_* calls. */
+#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2)    \
+{                                                                   \
+    v8i16 tmp0_m;                                                   \
+                                                                    \
+    SPLATI_H3_SH(coeff, val0, val1, val2, tmp0_m, const1, const2);  \
+    ILVEV_H2_SH(tmp0_m, const1, const2, tmp0_m, const1, const2);    \
+}
+/* Statement expression; per 16-bit lane yields 1 where in0 != 0, else 0 (equal-to-zero mask, low byte flipped, ANDed with 1). */
+#define RET_1_IF_NZERO_H(in0)       \
+({                                  \
+    v8i16 tmp0_m;                   \
+    v8i16 one_m = __msa_ldi_h(1);   \
+                                    \
+    tmp0_m = __msa_ceqi_h(in0, 0);  \
+    tmp0_m = tmp0_m ^ 255;          \
+    tmp0_m = one_m & tmp0_m;        \
+                                    \
+    tmp0_m;                         \
+})
+/* 32-bit-lane counterpart of RET_1_IF_NZERO_H: yields 1 where in0 != 0, else 0. */
+#define RET_1_IF_NZERO_W(in0)       \
+({                                  \
+    v4i32 tmp0_m;                   \
+    v4i32 one_m = __msa_ldi_w(1);   \
+                                    \
+    tmp0_m = __msa_ceqi_w(in0, 0);  \
+    tmp0_m = tmp0_m ^ 255;          \
+    tmp0_m = one_m & tmp0_m;        \
+                                    \
+    tmp0_m;                         \
+})
+/* Statement expression; per 32-bit lane yields 1 where in0 < 0 (signed compare against 0), else 0. */
+#define RET_1_IF_NEG_W(in0)           \
+({                                    \
+    v4i32 tmp0_m;                     \
+                                      \
+    v4i32 one_m = __msa_ldi_w(1);     \
+    tmp0_m = __msa_clti_s_w(in0, 0);  \
+    tmp0_m = one_m & tmp0_m;          \
+                                      \
+    tmp0_m;                           \
+})
+/* MSA version of the 4x4 forward DCT (per its name): two transpose + butterfly passes over four rows read from input. */
+void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch)
+{
+    v8i16 in0, in1, in2, in3;
+    v8i16 temp0, temp1;
+    v8i16 const0, const1;
+    v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
+    v4i32 out0, out1, out2, out3;
+    v8i16 zero = { 0 };
+    /* Rows are loaded pitch / 2 elements apart (presumably pitch is given in bytes - confirm). */
+    LD_SH4(input, pitch / 2, in0, in1, in2, in3);
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* Pass 1: SLLI_4V is called with 3; in1/in3 are rebuilt from 32-bit accumulators shifted right by 12. */
+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+    SLLI_4V(temp0, temp1, in1, in3, 3);
+    in0 = temp0 + temp1;
+    in2 = temp0 - temp1;
+    SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
+    temp0 = __msa_ilvr_h(in3, in1);
+    in1 = __msa_splati_h(coeff, 3);
+    out0 = (v4i32)__msa_ilvev_h(zero, in1);
+    coeff = __msa_ilvl_h(zero, coeff);
+    out1 = __msa_splati_w((v4i32)coeff, 0);
+    DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
+    out0 >>= 12;
+    out1 >>= 12;
+    PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* Pass 2: in0/in2 get +7 then >> 4; out1/out3 are shifted right by 16 and out1 gains the RET_1_IF_NZERO_H(in3) term. */
+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+    in0 = temp0 + temp1 + 7;
+    in2 = temp0 - temp1 + 7;
+    in0 >>= 4;
+    in2 >>= 4;
+    ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
+    temp1 = RET_1_IF_NZERO_H(in3);
+    ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
+    SPLATI_W2_SW(coeff, 2, out3, out1);
+    out3 += out1;
+    out1 = __msa_splati_w((v4i32)coeff, 1);
+    DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
+    out1 >>= 16;
+    out3 >>= 16;
+    out1 += (v4i32)temp1;
+    PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
+    ST_SH2(in0, in2, output, 8);
+}
+/* MSA version of the 8x4 forward DCT (per its name): same two-pass shape as the 4x4 routine, with results stored at output and output + 16. */
+void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch)
+{
+    v8i16 in0, in1, in2, in3;
+    v8i16 temp0, temp1, tmp0, tmp1;
+    v8i16 const0, const1, const2;
+    v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
+    v8i16 zero = { 0 };
+    v4i32 vec0_w, vec1_w, vec2_w, vec3_w;
+    /* Rows are loaded pitch / 2 elements apart (presumably pitch is given in bytes - confirm). */
+    LD_SH4(input, pitch / 2, in0, in1, in2, in3);
+    TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* Pass 1: SLLI_4V is called with 3; in1/in3 are rebuilt from 32-bit accumulators after SRA_4V by 12. */
+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+    SLLI_4V(temp0, temp1, in1, in3, 3);
+    in0 = temp0 + temp1;
+    in2 = temp0 - temp1;
+    SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
+    temp0 = __msa_splati_h(coeff, 3);
+    vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);
+    coeff = __msa_ilvl_h(zero, coeff);
+    vec3_w = __msa_splati_w((v4i32)coeff, 0);
+    ILVRL_H2_SH(in3, in1, tmp1, tmp0);
+    vec0_w = vec1_w;
+    vec2_w = vec3_w;
+    DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
+                 vec0_w, vec1_w, vec2_w, vec3_w);
+    SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12);
+    PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
+    TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+    /* Pass 2: in0/in2 get +7 then >> 4; the accumulators go through SRA_4V by 16 and in1 gains RET_1_IF_NZERO_H(in3). */
+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
+    in0 = temp0 + temp1 + 7;
+    in2 = temp0 - temp1 + 7;
+    in0 >>= 4;
+    in2 >>= 4;
+    SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w);
+    vec3_w += vec1_w;
+    vec1_w = __msa_splati_w((v4i32)coeff, 1);
+    const0 = RET_1_IF_NZERO_H(in3);
+    ILVRL_H2_SH(in3, in1, tmp1, tmp0);
+    vec0_w = vec1_w;
+    vec2_w = vec3_w;
+    DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,
+                 vec0_w, vec1_w, vec2_w, vec3_w);
+    SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16);
+    PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
+    in1 += const0;
+    PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);
+    ST_SH2(temp0, temp1, output, 8);
+    /* The vectors packed by PCKOD_D2_SH are stored 16 elements further into output. */
+    PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);
+    ST_SH2(in0, in2, output + 16, 8);
+}
+/* MSA version of the 4x4 Walsh transform (per its name): computed in 32-bit lanes, packed back to 16-bit before the store. */
+void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch)
+{
+    v8i16 in0_h, in1_h, in2_h, in3_h;
+    v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3;
+    /* Rows are loaded pitch / 2 elements apart (presumably pitch is given in bytes - confirm). */
+    LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h);
+    TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h);
+    /* Pass 1: unpack to 32-bit, butterfly, SLLI_4V by 2, butterfly; in0_w then gains 1 where temp0 != 0. */
+    UNPCK_R_SH_SW(in0_h, in0_w);
+    UNPCK_R_SH_SW(in1_h, in1_w);
+    UNPCK_R_SH_SW(in2_h, in2_w);
+    UNPCK_R_SH_SW(in3_h, in3_w);
+    BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
+    SLLI_4V(temp0, temp1, temp2, temp3, 2);
+    BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
+    temp0 = RET_1_IF_NZERO_W(temp0);
+    in0_w += temp0;
+    TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w);
+    /* Pass 2: two butterflies, then each vector gets +1 where negative, +3 (ADD4), and SRA_4V by 3. */
+    BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
+    BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
+    in0_w += RET_1_IF_NEG_W(in0_w);
+    in1_w += RET_1_IF_NEG_W(in1_w);
+    in2_w += RET_1_IF_NEG_W(in2_w);
+    in3_w += RET_1_IF_NEG_W(in3_w);
+    ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w);
+    SRA_4V(in0_w, in1_w, in2_w, in3_w, 3);
+    PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h);
+    ST_SH2(in0_h, in1_h, output, 8);
+}
diff --git a/libs/libvpx/vp8/encoder/mips/msa/denoising_msa.c b/libs/libvpx/vp8/encoder/mips/msa/denoising_msa.c
new file mode 100644
index 0000000000..66965c6685
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/mips/msa/denoising_msa.c
@@ -0,0 +1,624 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+#include "vp8/encoder/denoising.h"
+
+int32_t vp8_denoiser_filter_msa(uint8_t *mc_running_avg_y_ptr,
+                                int32_t mc_avg_y_stride,
+                                uint8_t *running_avg_y_ptr,
+                                int32_t avg_y_stride,
+                                uint8_t *sig_ptr, int32_t sig_stride,
+                                uint32_t motion_magnitude,
+                                int32_t increase_denoising)
+{
+    uint8_t *running_avg_y_start = running_avg_y_ptr;
+    uint8_t *sig_start = sig_ptr;
+    int32_t cnt = 0;
+    int32_t sum_diff = 0;
+    int32_t shift_inc1 = 3;
+    int32_t delta = 0;
+    int32_t sum_diff_thresh = SUM_DIFF_THRESHOLD;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+    v16u8 mc_running_avg_y0, running_avg_y, sig0;
+    v16u8 mc_running_avg_y1, running_avg_y1, sig1;
+    v16u8 coeff0, coeff1;
+    v8i16 diff0, diff1, abs_diff0, abs_diff1, abs_diff_neg0, abs_diff_neg1;
+    v8i16 adjust0, adjust1, adjust2, adjust3;
+    v8i16 shift_inc1_vec = { 0 };
+    v8i16 col_sum0 = { 0 };
+    v8i16 col_sum1 = { 0 };
+    v8i16 col_sum2 = { 0 };
+    v8i16 col_sum3 = { 0 };
+    v8i16 temp0_h, temp1_h, temp2_h, temp3_h, cmp, delta_vec;
+    v4i32 temp0_w;
+    v2i64 temp0_d, temp1_d;
+    v8i16 zero = { 0 };
+    v8i16 one = __msa_ldi_h(1);
+    v8i16 four = __msa_ldi_h(4);
+    v8i16 val_127 = __msa_ldi_h(127);
+    v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 };
+    /* Filters a 16-row block; returns FILTER_BLOCK or COPY_BLOCK. */
+    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+    {
+        adj_val = __msa_add_a_h(adj_val, one);
+        if (increase_denoising)
+        {
+            adj_val = __msa_add_a_h(adj_val, one);
+            shift_inc1 = 4;
+        }
+
+        temp0_h = zero - adj_val;
+        adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val);
+    }
+    /* cnt is still 0 here, so lanes 3 and 7 of adj_val are set to zero. */
+    adj_val = __msa_insert_h(adj_val, 3, cnt);
+    adj_val = __msa_insert_h(adj_val, 7, cnt);
+    shift_inc1_vec = __msa_fill_h(shift_inc1);
+    /* First pass: 8 iterations, two rows per iteration. */
+    for (cnt = 8; cnt--;)
+    {
+        v8i16 mask0 = { 0 };
+        v8i16 mask1 = { 0 };
+
+        mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
+        sig0 = LD_UB(sig_ptr);
+        sig_ptr += sig_stride;
+        mc_running_avg_y_ptr += mc_avg_y_stride;
+
+        mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
+        sig1 = LD_UB(sig_ptr);
+
+        ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1);
+        HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
+        abs_diff0 = __msa_add_a_h(diff0, zero);
+        abs_diff1 = __msa_add_a_h(diff1, zero);
+        cmp = __msa_clei_s_h(abs_diff0, 15);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = __msa_clei_s_h(abs_diff0, 7);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = abs_diff0 < shift_inc1_vec;
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = __msa_clei_s_h(abs_diff1, 15);
+        cmp = cmp & one;
+        mask1 += cmp;
+        cmp = __msa_clei_s_h(abs_diff1, 7);
+        cmp = cmp & one;
+        mask1 += cmp;
+        cmp = abs_diff1 < shift_inc1_vec;
+        cmp = cmp & one;
+        mask1 += cmp;
+        temp0_h = __msa_clei_s_h(diff0, 0);
+        temp0_h = temp0_h & four;
+        mask0 += temp0_h;
+        temp1_h = __msa_clei_s_h(diff1, 0);
+        temp1_h = temp1_h & four;
+        mask1 += temp1_h;
+        VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0,
+                   adjust1);
+        temp2_h = __msa_ceqi_h(adjust0, 0);
+        temp3_h = __msa_ceqi_h(adjust1, 0);
+        adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0,
+                                     (v16u8)temp2_h);
+        adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1,
+                                     (v16u8)temp3_h);
+        ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1);
+        UNPCK_UB_SH(sig0, temp0_h, temp1_h);
+        ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h);
+        MAXI_SH2_SH(temp0_h, temp1_h, 0);
+        SAT_UH2_SH(temp0_h, temp1_h, 7);
+        temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h);
+        running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h);
+        running_avg_y = __msa_bmnz_v(running_avg_y, mc_running_avg_y0,
+                                     (v16u8)temp2_h);
+        ST_UB(running_avg_y, running_avg_y_ptr);
+        running_avg_y_ptr += avg_y_stride;
+        /* Second row of the pair: same steps on mc_running_avg_y1 / sig1. */
+        mask0 = zero;
+        mask1 = zero;
+        ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1);
+        HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
+        abs_diff0 = __msa_add_a_h(diff0, zero);
+        abs_diff1 = __msa_add_a_h(diff1, zero);
+        cmp = __msa_clei_s_h(abs_diff0, 15);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = __msa_clei_s_h(abs_diff0, 7);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = abs_diff0 < shift_inc1_vec;
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = __msa_clei_s_h(abs_diff1, 15);
+        cmp = cmp & one;
+        mask1 += cmp;
+        cmp = __msa_clei_s_h(abs_diff1, 7);
+        cmp = cmp & one;
+        mask1 += cmp;
+        cmp = abs_diff1 < shift_inc1_vec;
+        cmp = cmp & one;
+        mask1 += cmp;
+        temp0_h = __msa_clei_s_h(diff0, 0);
+        temp0_h = temp0_h & four;
+        mask0 += temp0_h;
+        temp1_h = __msa_clei_s_h(diff1, 0);
+        temp1_h = temp1_h & four;
+        mask1 += temp1_h;
+        VSHF_H2_SH(adj_val, adj_val, adj_val, adj_val, mask0, mask1, adjust0,
+                   adjust1);
+        temp2_h = __msa_ceqi_h(adjust0, 0);
+        temp3_h = __msa_ceqi_h(adjust1, 0);
+        adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0,
+                                      (v16u8)temp2_h);
+        adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)diff1,
+                                      (v16u8)temp3_h);
+        ADD2(col_sum0, adjust0, col_sum1, adjust1, col_sum0, col_sum1);
+        UNPCK_UB_SH(sig1, temp0_h, temp1_h);
+        ADD2(temp0_h, adjust0, temp1_h, adjust1, temp0_h, temp1_h);
+        MAXI_SH2_SH(temp0_h, temp1_h, 0);
+        SAT_UH2_SH(temp0_h, temp1_h, 7);
+        temp2_h = (v8i16)__msa_pckev_b((v16i8)temp3_h, (v16i8)temp2_h);
+        running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp1_h, (v16i8)temp0_h);
+        running_avg_y = __msa_bmnz_v(running_avg_y, mc_running_avg_y1,
+                                     (v16u8)temp2_h);
+        ST_UB(running_avg_y, running_avg_y_ptr);
+        sig_ptr += sig_stride;
+        mc_running_avg_y_ptr += mc_avg_y_stride;
+        running_avg_y_ptr += avg_y_stride;
+    }
+    /* Clamp the column sums at 127 and reduce them to the scalar sum_diff. */
+    col_sum0 = __msa_min_s_h(col_sum0, val_127);
+    col_sum1 = __msa_min_s_h(col_sum1, val_127);
+    temp0_h = col_sum0 + col_sum1;
+    temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+    temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+    temp1_d = __msa_splati_d(temp0_d, 1);
+    temp0_d += temp1_d;
+    sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
+    sig_ptr -= sig_stride * 16;
+    mc_running_avg_y_ptr -= mc_avg_y_stride * 16;
+    running_avg_y_ptr -= avg_y_stride * 16;
+    /* Pointers are back at row 0; sum_diff_thresh still holds its default. */
+    if (increase_denoising)
+    {
+        sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
+    }
+    /* Over threshold: second pass with the per-pixel change capped at delta. */
+    if (abs(sum_diff) > sum_diff_thresh)
+    {
+        delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+        delta_vec = __msa_fill_h(delta);
+        if (delta < 4)
+        {
+            for (cnt = 8; cnt--;)
+            {
+                running_avg_y = LD_UB(running_avg_y_ptr);
+                mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
+                sig0 = LD_UB(sig_ptr);
+                sig_ptr += sig_stride;
+                mc_running_avg_y_ptr += mc_avg_y_stride;
+                running_avg_y_ptr += avg_y_stride;
+                mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
+                sig1 = LD_UB(sig_ptr);
+                running_avg_y1 = LD_UB(running_avg_y_ptr);
+                ILVRL_B2_UB(mc_running_avg_y0, sig0, coeff0, coeff1);
+                HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
+                abs_diff0 = __msa_add_a_h(diff0, zero);
+                abs_diff1 = __msa_add_a_h(diff1, zero);
+                temp0_h = abs_diff0 < delta_vec;
+                temp1_h = abs_diff1 < delta_vec;
+                abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0,
+                                               (v16u8)delta_vec,
+                                               (v16u8)temp0_h);
+                abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1,
+                                               (v16u8)delta_vec,
+                                               (v16u8)temp1_h);
+                SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0,
+                     abs_diff_neg1);
+                abs_diff_neg0 = zero - abs_diff0;
+                abs_diff_neg1 = zero - abs_diff1;
+                temp0_h = __msa_clei_s_h(diff0, 0);
+                temp1_h = __msa_clei_s_h(diff1, 0);
+                adjust0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0,
+                                              (v16u8)abs_diff_neg0,
+                                              (v16u8)temp0_h);
+                adjust1 = (v8i16)__msa_bmnz_v((v16u8)abs_diff1,
+                                              (v16u8)abs_diff_neg1,
+                                              (v16u8)temp1_h);
+                ILVRL_B2_SH(zero, running_avg_y, temp2_h, temp3_h);
+                ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3);
+                MAXI_SH2_SH(adjust2, adjust3, 0);
+                SAT_UH2_SH(adjust2, adjust3, 7);
+                temp0_h = __msa_ceqi_h(diff0, 0);
+                temp1_h = __msa_ceqi_h(diff1, 0);
+                adjust2 = (v8i16)__msa_bmz_v((v16u8)adjust2, (v16u8)temp2_h,
+                                             (v16u8)temp0_h);
+                adjust3 = (v8i16)__msa_bmz_v((v16u8)adjust3, (v16u8)temp3_h,
+                                             (v16u8)temp1_h);
+                adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero,
+                                              (v16u8)temp0_h);
+                adjust1 = (v8i16)__msa_bmnz_v((v16u8)adjust1, (v16u8)zero,
+                                              (v16u8)temp1_h);
+                ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3);
+                running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3,
+                                                     (v16i8)adjust2);
+                ST_UB(running_avg_y, running_avg_y_ptr - avg_y_stride);
+                ILVRL_B2_UB(mc_running_avg_y1, sig1, coeff0, coeff1);
+                HSUB_UB2_SH(coeff0, coeff1, diff0, diff1);
+                abs_diff0 = __msa_add_a_h(diff0, zero);
+                abs_diff1 = __msa_add_a_h(diff1, zero);
+                temp0_h = abs_diff0 < delta_vec;
+                temp1_h = abs_diff1 < delta_vec;
+                abs_diff0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0,
+                                               (v16u8)delta_vec,
+                                               (v16u8)temp0_h);
+                abs_diff1 = (v8i16)__msa_bmz_v((v16u8)abs_diff1,
+                                               (v16u8)delta_vec,
+                                               (v16u8)temp1_h);
+                SUB2(zero, abs_diff0, zero, abs_diff1, abs_diff_neg0,
+                     abs_diff_neg1);
+                temp0_h = __msa_clei_s_h(diff0, 0);
+                temp1_h = __msa_clei_s_h(diff1, 0);
+                adjust0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0,
+                                              (v16u8)abs_diff_neg0,
+                                              (v16u8)temp0_h);
+                adjust1 = (v8i16)__msa_bmnz_v((v16u8)abs_diff1,
+                                              (v16u8)abs_diff_neg1,
+                                              (v16u8)temp1_h);
+                ILVRL_H2_SH(zero, running_avg_y1, temp2_h, temp3_h); /* NOTE(review): row 0 uses ILVRL_B2_SH -- confirm */
+                ADD2(temp2_h, adjust0, temp3_h, adjust1, adjust2, adjust3);
+                MAXI_SH2_SH(adjust2, adjust3, 0);
+                SAT_UH2_SH(adjust2, adjust3, 7);
+                temp0_h = __msa_ceqi_h(diff0, 0);
+                temp1_h = __msa_ceqi_h(diff1, 0);
+                adjust2 = (v8i16)__msa_bmz_v((v16u8)adjust2, (v16u8)temp2_h,
+                                             (v16u8)temp0_h);
+                adjust3 = (v8i16)__msa_bmz_v((v16u8)adjust3, (v16u8)temp3_h,
+                                             (v16u8)temp1_h);
+                adjust0 = (v8i16)__msa_bmz_v((v16u8)adjust0, (v16u8)zero,
+                                             (v16u8)temp0_h); /* NOTE(review): row 0 uses bmnz -- confirm */
+                adjust1 = (v8i16)__msa_bmz_v((v16u8)adjust1, (v16u8)zero,
+                                             (v16u8)temp1_h);
+                ADD2(col_sum2, adjust0, col_sum3, adjust1, col_sum2, col_sum3);
+                running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust3,
+                                                     (v16i8)adjust2);
+                ST_UB(running_avg_y, running_avg_y_ptr);
+                running_avg_y_ptr += avg_y_stride;
+            }
+            /* Clamp and reduce the second-pass column sums into sum_diff. */
+            col_sum2 = __msa_min_s_h(col_sum2, val_127);
+            col_sum3 = __msa_min_s_h(col_sum3, val_127);
+            temp0_h = col_sum2 + col_sum3;
+            temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+            temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+            temp1_d = __msa_splati_d(temp0_d, 1);
+            temp0_d += (v2i64)temp1_d;
+            sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
+            if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
+            {
+                return COPY_BLOCK;
+            }
+        }
+        else
+        {
+            return COPY_BLOCK;
+        }
+    }
+    /* NOTE(review): this copies sig into running_avg -- confirm direction. */
+    LD_UB8(sig_start, sig_stride, src0, src1, src2, src3, src4, src5, src6,
+           src7);
+    sig_start += (8 * sig_stride);
+    LD_UB8(sig_start, sig_stride, src8, src9, src10, src11, src12, src13,
+           src14, src15);
+
+    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, running_avg_y_start,
+           avg_y_stride);
+    running_avg_y_start += (8 * avg_y_stride);
+    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
+           running_avg_y_start, avg_y_stride);
+
+    return FILTER_BLOCK;
+}
+
+int32_t vp8_denoiser_filter_uv_msa(uint8_t *mc_running_avg_y_ptr,
+                                   int32_t mc_avg_y_stride,
+                                   uint8_t *running_avg_y_ptr,
+                                   int32_t avg_y_stride,
+                                   uint8_t *sig_ptr,
+                                   int32_t sig_stride,
+                                   uint32_t motion_magnitude,
+                                   int32_t increase_denoising)
+{
+    uint8_t *running_avg_y_start = running_avg_y_ptr;
+    uint8_t *sig_start = sig_ptr;
+    int32_t cnt = 0;
+    int32_t sum_diff = 0;
+    int32_t shift_inc1 = 3;
+    int32_t delta = 0;
+    int32_t sum_block = 0;
+    int32_t sum_diff_thresh;
+    int64_t dst0, dst1, src0, src1, src2, src3;
+    v16u8 mc_running_avg_y0, running_avg_y, sig0;
+    v16u8 mc_running_avg_y1, running_avg_y1, sig1;
+    v16u8 sig2, sig3, sig4, sig5, sig6, sig7;
+    v16u8 coeff0;
+    v8i16 diff0, abs_diff0, abs_diff_neg0;
+    v8i16 adjust0, adjust2;
+    v8i16 shift_inc1_vec = { 0 };
+    v8i16 col_sum0 = { 0 };
+    v8i16 temp0_h, temp2_h, cmp, delta_vec;
+    v4i32 temp0_w;
+    v2i64 temp0_d, temp1_d;
+    v16i8 zero = { 0 };
+    v8i16 one = __msa_ldi_h(1);
+    v8i16 four = __msa_ldi_h(4);
+    v8i16 adj_val = { 6, 4, 3, 0, -6, -4, -3, 0 };
+    /* Filters an 8-row block; returns FILTER_BLOCK or COPY_BLOCK. */
+    /* Sum the low 8 bytes of each of the 8 sig rows into sum_block. */
+    sig0 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0);
+    sig1 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig1);
+    sig2 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig2);
+    sig3 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig3);
+    sig4 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig4);
+    sig5 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig5);
+    sig6 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig6);
+    sig7 = LD_UB(sig_ptr);
+    sig_ptr += sig_stride;
+    temp0_h += (v8i16)__msa_ilvr_b(zero, (v16i8)sig7);
+    temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+    temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+    temp1_d = __msa_splati_d(temp0_d, 1);
+    temp0_d += temp1_d;
+    sum_block = __msa_copy_s_w((v4i32)temp0_d, 0);
+    sig_ptr -= sig_stride * 8;
+    /* Early out with COPY_BLOCK when the block sum is near 128 * 8 * 8. */
+    if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV)
+    {
+        return COPY_BLOCK;
+    }
+
+    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+    {
+        adj_val = __msa_add_a_h(adj_val, one);
+
+        if (increase_denoising)
+        {
+            adj_val = __msa_add_a_h(adj_val, one);
+            shift_inc1 = 4;
+        }
+
+        temp0_h = (v8i16)zero - adj_val;
+        adj_val = (v8i16)__msa_ilvev_d((v2i64)temp0_h, (v2i64)adj_val);
+    }
+    /* cnt is still 0 here, so lanes 3 and 7 of adj_val are set to zero. */
+    adj_val = __msa_insert_h(adj_val, 3, cnt);
+    adj_val = __msa_insert_h(adj_val, 7, cnt);
+    shift_inc1_vec = __msa_fill_h(shift_inc1);
+    for (cnt = 4; cnt--;)
+    {
+        v8i16 mask0 = { 0 };
+        mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
+        sig0 = LD_UB(sig_ptr);
+        sig_ptr += sig_stride;
+        mc_running_avg_y_ptr += mc_avg_y_stride;
+        mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
+        sig1 = LD_UB(sig_ptr);
+        coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0, (v16i8)sig0);
+        diff0 = __msa_hsub_u_h(coeff0, coeff0);
+        abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
+        cmp = __msa_clei_s_h(abs_diff0, 15);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = __msa_clei_s_h(abs_diff0, 7);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = abs_diff0 < shift_inc1_vec;
+        cmp = cmp & one;
+        mask0 += cmp;
+        temp0_h = __msa_clei_s_h(diff0, 0);
+        temp0_h = temp0_h & four;
+        mask0 += temp0_h;
+        adjust0 = __msa_vshf_h(mask0, adj_val, adj_val);
+        temp2_h = __msa_ceqi_h(adjust0, 0);
+        adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0,
+                                      (v16u8)temp2_h);
+        col_sum0 += adjust0;
+        temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig0);
+        temp0_h += adjust0;
+        temp0_h = __msa_maxi_s_h(temp0_h, 0);
+        temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7);
+        temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h);
+        running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h);
+        running_avg_y = __msa_bmnz_v(running_avg_y, mc_running_avg_y0,
+                                     (v16u8)temp2_h);
+        dst0 = __msa_copy_s_d((v2i64)running_avg_y,  0);
+        SD(dst0, running_avg_y_ptr);
+        running_avg_y_ptr += avg_y_stride;
+        /* Second row of the pair: same steps on mc_running_avg_y1 / sig1. */
+        mask0 = __msa_ldi_h(0);
+        coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1, (v16i8)sig1);
+        diff0 = __msa_hsub_u_h(coeff0, coeff0);
+        abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
+        cmp = __msa_clei_s_h(abs_diff0, 15);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = __msa_clei_s_h(abs_diff0, 7);
+        cmp = cmp & one;
+        mask0 += cmp;
+        cmp = abs_diff0 < shift_inc1_vec;
+        cmp = cmp & one;
+        mask0 += cmp;
+        temp0_h = __msa_clei_s_h(diff0, 0);
+        temp0_h = temp0_h & four;
+        mask0 += temp0_h;
+        adjust0 = __msa_vshf_h(mask0, adj_val, adj_val);
+        temp2_h = __msa_ceqi_h(adjust0, 0);
+        adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)diff0,
+                                      (v16u8)temp2_h);
+        col_sum0 += adjust0;
+        temp0_h = (v8i16)__msa_ilvr_b(zero, (v16i8)sig1);
+        temp0_h += adjust0;
+        temp0_h = __msa_maxi_s_h(temp0_h, 0);
+        temp0_h = (v8i16)__msa_sat_u_h((v8u16)temp0_h, 7);
+
+        temp2_h = (v8i16)__msa_pckev_b((v16i8)temp2_h, (v16i8)temp2_h);
+        running_avg_y = (v16u8)__msa_pckev_b((v16i8)temp0_h, (v16i8)temp0_h);
+        running_avg_y = __msa_bmnz_v(running_avg_y, mc_running_avg_y1,
+                                     (v16u8)temp2_h);
+        dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0);
+        SD(dst1, running_avg_y_ptr);
+
+        sig_ptr += sig_stride;
+        mc_running_avg_y_ptr += mc_avg_y_stride;
+        running_avg_y_ptr += avg_y_stride;
+    }
+    /* Reduce col_sum0 to the scalar sum_diff, then rewind the row pointers. */
+    temp0_h = col_sum0;
+    temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+    temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+    temp1_d = __msa_splati_d(temp0_d, 1);
+    temp0_d += temp1_d;
+    sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
+    sig_ptr -= sig_stride * 8;
+    mc_running_avg_y_ptr -= mc_avg_y_stride * 8;
+    running_avg_y_ptr -= avg_y_stride * 8;
+    sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+
+    if (increase_denoising)
+    {
+        sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+    }
+    /* Over threshold: second pass with the per-pixel change capped at delta. */
+    if (abs(sum_diff) > sum_diff_thresh)
+    {
+        delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+        delta_vec = __msa_fill_h(delta);
+        if (delta < 4)
+        {
+            for (cnt = 4; cnt--;)
+            {
+                running_avg_y = LD_UB(running_avg_y_ptr);
+                mc_running_avg_y0 = LD_UB(mc_running_avg_y_ptr);
+                sig0 = LD_UB(sig_ptr);
+                /* Update pointers for next iteration. */
+                sig_ptr += sig_stride;
+                mc_running_avg_y_ptr += mc_avg_y_stride;
+                running_avg_y_ptr += avg_y_stride;
+
+                mc_running_avg_y1 = LD_UB(mc_running_avg_y_ptr);
+                sig1 = LD_UB(sig_ptr);
+                running_avg_y1 = LD_UB(running_avg_y_ptr);
+
+                coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y0,
+                                             (v16i8)sig0);
+                diff0 = __msa_hsub_u_h(coeff0, coeff0);
+                abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
+                temp0_h = delta_vec < abs_diff0;
+                abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0,
+                                                (v16u8)delta_vec,
+                                                (v16u8)temp0_h);
+                abs_diff_neg0 = (v8i16)zero - abs_diff0;
+                temp0_h = __msa_clei_s_h(diff0, 0);
+                adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0,
+                                             (v16u8)abs_diff_neg0,
+                                             (v16u8)temp0_h);
+                temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y);
+                adjust2 = temp2_h + adjust0;
+                adjust2 = __msa_maxi_s_h(adjust2, 0);
+                adjust2 = (v8i16)__msa_sat_u_h((v8u16)adjust2, 7);
+                temp0_h = __msa_ceqi_h(diff0, 0);
+                adjust2 = (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h,
+                                              (v16u8)temp0_h);
+                adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero,
+                                              (v16u8)temp0_h);
+                col_sum0 += adjust0;
+                running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2,
+                                                     (v16i8)adjust2);
+                dst0 = __msa_copy_s_d((v2i64)running_avg_y, 0);
+                SD(dst0, running_avg_y_ptr - avg_y_stride);
+
+                coeff0 = (v16u8)__msa_ilvr_b((v16i8)mc_running_avg_y1,
+                                             (v16i8)sig1);
+                diff0 = __msa_hsub_u_h(coeff0, coeff0);
+                abs_diff0 = __msa_add_a_h(diff0, (v8i16)zero);
+                temp0_h = delta_vec < abs_diff0;
+                abs_diff0 = (v8i16)__msa_bmnz_v((v16u8)abs_diff0,
+                                                (v16u8)delta_vec,
+                                                (v16u8)temp0_h);
+                abs_diff_neg0 = (v8i16)zero - abs_diff0;
+                temp0_h = __msa_clei_s_h(diff0, 0);
+                adjust0 = (v8i16)__msa_bmz_v((v16u8)abs_diff0,
+                                             (v16u8)abs_diff_neg0,
+                                             (v16u8)temp0_h);
+                temp2_h = (v8i16)__msa_ilvr_b(zero, (v16i8)running_avg_y1);
+                adjust2 = temp2_h + adjust0;
+                adjust2 = __msa_maxi_s_h(adjust2, 0);
+                adjust2 = (v8i16)__msa_sat_u_h((v8u16)adjust2, 7);
+                temp0_h = __msa_ceqi_h(diff0, 0);
+                adjust2 = (v8i16)__msa_bmnz_v((v16u8)adjust2, (v16u8)temp2_h,
+                                              (v16u8)temp0_h);
+                adjust0 = (v8i16)__msa_bmnz_v((v16u8)adjust0, (v16u8)zero,
+                                              (v16u8)temp0_h);
+                col_sum0 += adjust0;
+                running_avg_y = (v16u8)__msa_pckev_b((v16i8)adjust2,
+                                                     (v16i8)adjust2);
+                dst1 = __msa_copy_s_d((v2i64)running_avg_y, 0);
+                SD(dst1, running_avg_y_ptr);
+                running_avg_y_ptr += avg_y_stride;
+            }
+            /* col_sum0 was not reset, so this total includes the first pass. */
+            temp0_h = col_sum0;
+            temp0_w = __msa_hadd_s_w(temp0_h, temp0_h);
+            temp0_d = __msa_hadd_s_d(temp0_w, temp0_w);
+            temp1_d = __msa_splati_d(temp0_d, 1);
+            temp0_d += temp1_d;
+            sum_diff = __msa_copy_s_w((v4i32)temp0_d, 0);
+
+            if (abs(sum_diff) > sum_diff_thresh)
+            {
+                return COPY_BLOCK;
+            }
+        }
+        else
+        {
+            return COPY_BLOCK;
+        }
+    }
+    /* NOTE(review): this copies sig into running_avg -- confirm direction. */
+    LD4(sig_start, sig_stride, src0, src1, src2, src3);
+    sig_start += (4 * sig_stride);
+    SD4(src0, src1, src2, src3, running_avg_y_start, avg_y_stride);
+    running_avg_y_start += (4 * avg_y_stride);
+
+    LD4(sig_start, sig_stride, src0, src1, src2, src3);
+    SD4(src0, src1, src2, src3, running_avg_y_start, avg_y_stride);
+
+    return FILTER_BLOCK;
+}
diff --git a/libs/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c b/libs/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c
new file mode 100644
index 0000000000..ea794a8a8e
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/mips/msa/encodeopt_msa.c
@@ -0,0 +1,174 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+#include "vp8/encoder/block.h"
+
+int32_t vp8_block_error_msa(int16_t *coeff_ptr, int16_t *dq_coeff_ptr)
+{
+    v2i64 acc0 = { 0 };
+    v2i64 acc1 = { 0 };
+    v8i16 src, deq, pair_r, pair_l;
+    v4i32 delta_r, delta_l;
+    uint32_t half;
+    int32_t total;
+    /* Two passes of 8 coefficients; macro behaviour inferred from names. */
+    for (half = 0; half < 2; ++half)
+    {
+        src = LD_SH(coeff_ptr);
+        coeff_ptr += 8;
+        deq = LD_SH(dq_coeff_ptr);
+        dq_coeff_ptr += 8;
+        ILVRL_H2_SH(src, deq, pair_r, pair_l);
+        HSUB_UH2_SW(pair_r, pair_l, delta_r, delta_l);
+        DPADD_SD2_SD(delta_r, delta_l, acc0, acc1);
+    }
+
+    acc0 += __msa_splati_d(acc0, 1);
+    acc1 += __msa_splati_d(acc1, 1);
+    total = __msa_copy_s_d(acc0, 0);
+    total += __msa_copy_s_d(acc1, 0);
+
+    return total;
+}
+
+int32_t vp8_mbblock_error_msa(MACROBLOCK *mb, int32_t dc)
+{
+    /*
+     * Accumulates the coefficient error over mb->block[0..15], pairing each
+     * entry with mb->e_mbd.block[] at the same index.
+     *
+     * mb: supplies block[i].coeff and e_mbd.block[i].dqcoeff; two 8-element
+     *     vectors are loaded from each.
+     * dc: when equal to 1, lane 0 of the first difference vector of every
+     *     block is zeroed before it is accumulated.
+     *
+     * Returns the running total as int32_t.
+     *
+     * Helper macro behaviour (LD_SH, ILVRL_H2_SH, HSUB_UH2_SW, DOTP_SW2_SD,
+     * DPADD_SD2_SD) is inferred from the macro names -- confirm against
+     * vp8_macros_msa.h.
+     */
+    v16u8 all_zero = { 0 };
+    v16u8 keep_mask = (v16u8)__msa_ldi_b(255);
+    v8i16 src_lo, src_hi;
+    v8i16 deq_lo, deq_hi;
+    v8i16 pair_r, pair_l;
+    v4i32 delta_r, delta_l;
+    v2i64 acc0, acc1;
+    int16_t *src_ptr;
+    int16_t *deq_ptr;
+    int32_t total = 0;
+    uint32_t blk;
+
+    /* dc == 1: clear word 0 of the selection mask. */
+    if (1 == dc)
+    {
+        keep_mask = (v16u8)__msa_insve_w((v4i32)keep_mask, 0, (v4i32)all_zero);
+    }
+
+    for (blk = 0; blk < 16; ++blk)
+    {
+        /* Load both 8-element halves of this block's coeff / dqcoeff. */
+        src_ptr = mb->block[blk].coeff;
+        deq_ptr = mb->e_mbd.block[blk].dqcoeff;
+        src_lo = LD_SH(src_ptr);
+        deq_lo = LD_SH(deq_ptr);
+        src_ptr += 8;
+        deq_ptr += 8;
+        src_hi = LD_SH(src_ptr);
+        deq_hi = LD_SH(deq_ptr);
+
+        /* First half: apply the mask, then let DOTP_* write acc0/acc1 so
+         * nothing carries over from the previous block. */
+        ILVRL_H2_SH(src_lo, deq_lo, pair_r, pair_l);
+        HSUB_UH2_SW(pair_r, pair_l, delta_r, delta_l);
+        delta_r = (v4i32)__msa_bmnz_v(all_zero, (v16u8)delta_r, keep_mask);
+        DOTP_SW2_SD(delta_r, delta_l, delta_r, delta_l, acc0, acc1);
+
+        /* Second half is added on top of the same accumulators. */
+        ILVRL_H2_SH(src_hi, deq_hi, pair_r, pair_l);
+        HSUB_UH2_SW(pair_r, pair_l, delta_r, delta_l);
+        DPADD_SD2_SD(delta_r, delta_l, acc0, acc1);
+
+        /* Fold lane 1 into lane 0 of each accumulator, then add to total. */
+        acc0 += __msa_splati_d(acc0, 1);
+        acc1 += __msa_splati_d(acc1, 1);
+        total += __msa_copy_s_d(acc0, 0);
+        total += __msa_copy_s_d(acc1, 0);
+    }
+
+    return total;
+}
+
+int32_t vp8_mbuverror_msa(MACROBLOCK *mb)
+{
+    /*
+     * Accumulates the coefficient error over mb->block[16..23], pairing each
+     * entry with mb->e_mbd.block[] at the same index.
+     *
+     * mb: supplies block[i].coeff and e_mbd.block[i].dqcoeff; two 8-element
+     *     vectors are loaded from each.
+     *
+     * Returns the running total as int32_t.
+     *
+     * Helper macro behaviour (LD_SH, ILVRL_H2_SH, HSUB_UH2_SW, DOTP_SW2_SD,
+     * DPADD_SD2_SD, ADD2) is inferred from the macro names -- confirm
+     * against vp8_macros_msa.h.
+     */
+    v8i16 src_lo, src_hi;
+    v8i16 deq_lo, deq_hi;
+    v8i16 pair_r, pair_l;
+    v4i32 delta_r, delta_l;
+    v2i64 acc0, acc1;
+    v2i64 fold0, fold1;
+    int16_t *src_ptr;
+    int16_t *deq_ptr;
+    int32_t total = 0;
+    uint32_t blk;
+
+    /* Blocks are independent: each iteration computes one block's error
+     * and adds it to total. */
+    for (blk = 16; blk < 24; ++blk)
+    {
+        /* Load both 8-element halves of this block's original (coeff) and
+         * dequantized (dqcoeff) coefficients. */
+        src_ptr = mb->block[blk].coeff;
+        deq_ptr = mb->e_mbd.block[blk].dqcoeff;
+        src_lo = LD_SH(src_ptr);
+        deq_lo = LD_SH(deq_ptr);
+        src_ptr += 8;
+        deq_ptr += 8;
+        src_hi = LD_SH(src_ptr);
+        deq_hi = LD_SH(deq_ptr);
+
+        /* First half: DOTP_* writes acc0/acc1, so nothing carries over from
+         * the previous block. */
+        ILVRL_H2_SH(src_lo, deq_lo, pair_r, pair_l);
+        HSUB_UH2_SW(pair_r, pair_l, delta_r, delta_l);
+        DOTP_SW2_SD(delta_r, delta_l, delta_r, delta_l, acc0, acc1);
+
+        /* Second half is added on top of the same accumulators. */
+        ILVRL_H2_SH(src_hi, deq_hi, pair_r, pair_l);
+        HSUB_UH2_SW(pair_r, pair_l, delta_r, delta_l);
+        DPADD_SD2_SD(delta_r, delta_l, acc0, acc1);
+
+        /* Fold lane 1 into lane 0 of each accumulator, then add both
+         * lane-0 values to the total. */
+        fold0 = __msa_splati_d(acc0, 1);
+        fold1 = __msa_splati_d(acc1, 1);
+        ADD2(acc0, fold0, acc1, fold1, acc0, acc1);
+        total += __msa_copy_s_d(acc0, 0);
+        total += __msa_copy_s_d(acc1, 0);
+    }
+
+    return total;
+}
diff --git a/libs/libvpx/vp8/encoder/mips/msa/quantize_msa.c b/libs/libvpx/vp8/encoder/mips/msa/quantize_msa.c
new file mode 100644
index 0000000000..0f97646b56
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/mips/msa/quantize_msa.c
@@ -0,0 +1,246 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+#include "vp8/encoder/block.h"
+
+static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *zbin,
+                                  int16_t *round, int16_t *quant,
+                                  int16_t *de_quant, int16_t *q_coeff,
+                                  int16_t *dq_coeff)
+{
+    int32_t cnt, eob;
+    v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12,
+                          3, 8, 11, 13, 9, 10, 14, 15 };
+    v8i16 round0, round1;
+    v8i16 sign_z0, sign_z1;
+    v8i16 q_coeff0, q_coeff1;
+    v8i16 x0, x1, de_quant0, de_quant1;
+    v8i16 coeff0, coeff1, z0, z1;
+    v8i16 quant0, quant1, quant2, quant3;
+    v8i16 zero = { 0 };
+    v8i16 inv_zig_zag0, inv_zig_zag1;
+    v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
+    v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
+    v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
+    v4i32 temp0_w, temp1_w, temp2_w, temp3_w;
+    /* Quantizes 16 coefficients; returns (last non-zero index + 1). zbin is not read here. */
+    ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
+    eob = -1;
+    LD_SH2(coeff_ptr, 8, coeff0, coeff1);
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               z0, z1);  /* coefficients reordered by the zigzag masks */
+    LD_SH2(round, 8, coeff0, coeff1);  /* coeff0/coeff1 reused as scratch */
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               round0, round1);
+    LD_SH2(quant, 8, coeff0, coeff1);
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               quant0, quant2);
+    sign_z0 = z0 >> 15;  /* per-lane sign mask of the input */
+    sign_z1 = z1 >> 15;
+    x0 = __msa_add_a_h(z0, zero);  /* |z0| (add-absolute with zero) */
+    x1 = __msa_add_a_h(z1, zero);
+    ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
+    ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
+    ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
+    ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
+    DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
+                quant3, temp0_w, temp1_w, temp2_w, temp3_w);
+    SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
+    PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
+    x0 = x0 ^ sign_z0;  /* (x ^ sign) - sign re-applies the input's sign */
+    x1 = x1 ^ sign_z1;
+    SUB2(x0, sign_z0, x1, sign_z1, x0, x1);
+    VSHF_H2_SH(x0, x1, x0, x1, inv_zig_zag0, inv_zig_zag1, q_coeff0, q_coeff1);
+    ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
+    LD_SH2(de_quant, 8, de_quant0, de_quant1);
+    q_coeff0 *= de_quant0;  /* dequantized output = q_coeff * de_quant */
+    q_coeff1 *= de_quant1;
+    ST_SH2(q_coeff0, q_coeff1, dq_coeff, 8);
+    /* Scan x1 lanes 7..0, then x0 lanes 7..0, for the last non-zero value. */
+    for (cnt = 0; cnt < 16; ++cnt)
+    {
+        if ((cnt <= 7) && (x1[7 - cnt] != 0))
+        {
+            eob = (15 - cnt);
+            break;
+        }
+
+        if ((cnt > 7) && (x0[7 - (cnt - 8)] != 0))
+        {
+            eob = (7 - (cnt - 8));
+            break;
+        }
+    }
+    /* eob is still -1 when every lane is zero, so 0 is returned in that case. */
+    return (int8_t)(eob + 1);
+}
+
+static int8_t exact_regular_quantize_b_msa(int16_t *zbin_boost,
+                                           int16_t *coeff_ptr,
+                                           int16_t *zbin,
+                                           int16_t *round,
+                                           int16_t *quant,
+                                           int16_t *quant_shift,
+                                           int16_t *de_quant,
+                                           int16_t zbin_oq_in,
+                                           int16_t *q_coeff,
+                                           int16_t *dq_coeff)
+{
+    int32_t cnt, eob;
+    int16_t *boost_temp = zbin_boost;
+    v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12,
+                          3, 8, 11, 13, 9, 10, 14, 15 };
+    v8i16 round0, round1;
+    v8i16 sign_z0, sign_z1;
+    v8i16 q_coeff0, q_coeff1;
+    v8i16 z_bin0, z_bin1, zbin_o_q;
+    v8i16 x0, x1, sign_x0, sign_x1, de_quant0, de_quant1;
+    v8i16 coeff0, coeff1, z0, z1;
+    v8i16 quant0, quant1, quant2, quant3;
+    v8i16 zero = { 0 };
+    v8i16 inv_zig_zag0, inv_zig_zag1;
+    v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
+    v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
+    v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
+    v4i32 temp0_w, temp1_w, temp2_w, temp3_w;
+    /* Vector part computes 16 candidates; the scalar loop applies the zbin_boost test. */
+    ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
+    zbin_o_q = __msa_fill_h(zbin_oq_in);
+    eob = -1;
+    LD_SH2(coeff_ptr, 8, coeff0, coeff1);
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               z0, z1);
+    LD_SH2(round, 8, coeff0, coeff1);  /* coeff0/coeff1 reused as scratch */
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               round0, round1);
+    LD_SH2(quant, 8, coeff0, coeff1);
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               quant0, quant2);
+    LD_SH2(zbin, 8, coeff0, coeff1);
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               z_bin0, z_bin1);
+    sign_z0 = z0 >> 15;  /* per-lane sign mask of the input */
+    sign_z1 = z1 >> 15;
+    x0 = __msa_add_a_h(z0, zero);  /* |z0| (add-absolute with zero) */
+    x1 = __msa_add_a_h(z1, zero);
+    SUB2(x0, z_bin0, x1, z_bin1, z_bin0, z_bin1);  /* z_bin = |z| - zbin */
+    SUB2(z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1);  /* ... - zbin_oq_in */
+    ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
+    ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
+    ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
+    ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
+    DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
+                quant3, temp0_w, temp1_w, temp2_w, temp3_w);
+    SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
+    PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, temp0_h, temp2_h);
+    LD_SH2(quant_shift, 8, coeff0, coeff1);  /* second pass uses quant_shift */
+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
+               quant0, quant2);
+    ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
+    ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
+    ADD2(x0, round0, x1, round1, x0, x1);
+    ILVL_H2_SH(temp0_h, x0, temp2_h, x1, temp1_h, temp3_h);
+    ILVR_H2_SH(temp0_h, x0, temp2_h, x1, temp0_h, temp2_h);
+    DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
+                quant3, temp0_w, temp1_w, temp2_w, temp3_w);
+    SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
+    PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
+    sign_x0 = x0 ^ sign_z0;  /* (x ^ sign) - sign re-applies the input's sign */
+    sign_x1 = x1 ^ sign_z1;
+    SUB2(sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1);
+    for (cnt = 0; cnt < 16; ++cnt)
+    {
+        if (cnt <= 7)  /* lanes 0..7 live in the *0 vectors */
+        {
+            if (boost_temp[0] <= z_bin0[cnt])
+            {
+                if (x0[cnt])
+                {
+                    eob = cnt;
+                    boost_temp = zbin_boost;  /* non-zero output: restart the boost table */
+                }
+                else
+                {
+                    boost_temp++;
+                }
+            }
+            else
+            {
+                sign_x0[cnt] = 0;  /* failed the boost test: force output to zero */
+                boost_temp++;
+            }
+        }
+        else  /* lanes 8..15 live in the *1 vectors */
+        {
+            if (boost_temp[0] <= z_bin1[cnt - 8])
+            {
+                if (x1[cnt - 8])
+                {
+                    eob = cnt;
+                    boost_temp = zbin_boost;
+                }
+                else
+                {
+                    boost_temp++;
+                }
+            }
+            else
+            {
+                sign_x1[cnt - 8] = 0;
+                boost_temp++;
+            }
+        }
+    }
+    /* Undo the zigzag shuffle, store q_coeff, then dq_coeff = q_coeff * de_quant. */
+    VSHF_H2_SH(sign_x0, sign_x1, sign_x0, sign_x1, inv_zig_zag0, inv_zig_zag1,
+               q_coeff0, q_coeff1);
+    ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
+    LD_SH2(de_quant, 8, de_quant0, de_quant1);
+    MUL2(de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, de_quant1);
+    ST_SH2(de_quant0, de_quant1, dq_coeff, 8);
+    /* eob is the last index accepted by the loop, or -1 (returns 0) if none. */
+    return (int8_t)(eob + 1);
+}
+
+void vp8_fast_quantize_b_msa(BLOCK *b, BLOCKD *d)
+{
+    /* Entry point for the fast quantizer: hands b's coeff, zbin, round
+     * and quant_fast tables together with d's dequant table and its
+     * qcoeff/dqcoeff buffers to the MSA kernel, then stores the value
+     * the kernel returns through d->eob. */
+    const int8_t end_of_block =
+        fast_quantize_b_msa(b->coeff, b->zbin, b->round, b->quant_fast,
+                            d->dequant, d->qcoeff, d->dqcoeff);
+
+    /* Publish the kernel's result for this block. */
+    *d->eob = end_of_block;
+}
+
+void vp8_regular_quantize_b_msa(BLOCK *b, BLOCKD *d)
+{
+    /* Entry point for the regular quantizer. From b it reads coeff,
+     * zbin, round, quant, quant_shift, zrun_zbin_boost and zbin_extra;
+     * from d it takes the dequant table and the qcoeff/dqcoeff buffers.
+     * The value returned by the MSA kernel is stored through d->eob. */
+    int16_t *const src = b->coeff;
+    int16_t *const q_out = d->qcoeff;
+    int16_t *const dq_out = d->dqcoeff;
+    const int16_t zbin_extra = b->zbin_extra;
+    const int8_t end_of_block =
+        exact_regular_quantize_b_msa(b->zrun_zbin_boost, src, b->zbin,
+                                     b->round, b->quant, b->quant_shift,
+                                     d->dequant, zbin_extra, q_out, dq_out);
+
+    /* Publish the kernel's result for this block. */
+    *d->eob = end_of_block;
+
+}
diff --git a/libs/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c b/libs/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c
new file mode 100644
index 0000000000..5cca5e0872
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c
@@ -0,0 +1,303 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr,
+                                             uint32_t stride,
+                                             uint8_t *frame2_ptr,
+                                             int32_t strength_in,
+                                             int32_t filter_wt_in,
+                                             uint32_t *acc, uint16_t *cnt)
+{
+    uint32_t row;
+    v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
+    v16u8 frame_l, frame_h;
+    v16i8 zero = { 0 };
+    v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
+    v8i16 diff0, diff1, cnt0, cnt1;
+    v4i32 const3, const16, filter_wt, strength;
+    v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+    v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+    v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
+    v4i32 acc0, acc1, acc2, acc3;
+    /* 16-wide kernel: updates cnt[] and acc[] from the squared frame1/frame2 pixel difference. */
+    filter_wt = __msa_fill_w(filter_wt_in);
+    strength = __msa_fill_w(strength_in);
+    const3 = __msa_ldi_w(3);
+    const16 = __msa_ldi_w(16);
+    /* Two 16-pixel rows per iteration, so the 8 iterations cover 16 rows. */
+    for (row = 8; row--;)
+    {
+        frame1_0_b = LD_SB(frame1_ptr);
+        frame2_0_b = LD_SB(frame2_ptr);
+        frame1_ptr += stride;  /* frame1 rows are stride bytes apart */
+        frame2_ptr += 16;      /* frame2 rows are read back to back */
+        frame1_1_b = LD_SB(frame1_ptr);
+        frame2_1_b = LD_SB(frame2_ptr);
+        LD_SW2(acc, 4, acc0, acc1);
+        LD_SW2(acc + 8, 4, acc2, acc3);
+        LD_SH2(cnt, 8, cnt0, cnt1);
+        ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
+        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);  /* diff * diff */
+        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
+             mod0_w, mod1_w, mod2_w, mod3_w);  /* ... * 3 */
+        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+        diff0_r = (mod0_w < const16);  /* lane mask: set where mod < 16 */
+        diff0_l = (mod1_w < const16);
+        diff1_r = (mod2_w < const16);
+        diff1_l = (mod3_w < const16);
+        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+             mod0_w, mod1_w, mod2_w, mod3_w);  /* 16 - mod */
+        mod0_w = diff0_r & mod0_w;  /* lanes with mod >= 16 become 0 */
+        mod1_w = diff0_l & mod1_w;
+        mod2_w = diff1_r & mod2_w;
+        mod3_w = diff1_l & mod3_w;
+        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);  /* cnt += modifier */
+        ST_SH2(mod0_h, mod1_h, cnt, 8);
+        cnt += 16;
+        ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
+        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
+             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
+        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+             mod0_w, mod1_w, mod2_w, mod3_w);  /* acc += modifier * frame2 pixel */
+        ST_SW2(mod0_w, mod1_w, acc, 4);
+        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+        acc += 16;
+        LD_SW2(acc, 4, acc0, acc1);  /* second row of this iteration starts here */
+        LD_SW2(acc + 8, 4, acc2, acc3);
+        LD_SH2(cnt, 8, cnt0, cnt1);
+        ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
+        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+        diff0_r = (mod0_w < const16);
+        diff0_l = (mod1_w < const16);
+        diff1_r = (mod2_w < const16);
+        diff1_l = (mod3_w < const16);
+        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        mod0_w = diff0_r & mod0_w;
+        mod1_w = diff0_l & mod1_w;
+        mod2_w = diff1_r & mod2_w;
+        mod3_w = diff1_l & mod3_w;
+        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+        ST_SH2(mod0_h, mod1_h, cnt, 8);
+        cnt += 16;
+        /* Widen the second frame2 row to words for the accumulator update. */
+        UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
+        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
+             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
+        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        ST_SW2(mod0_w, mod1_w, acc, 4);
+        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+        acc += 16;
+        frame1_ptr += stride;
+        frame2_ptr += 16;
+    }
+}
+
+static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr,
+                                            uint32_t stride,
+                                            uint8_t *frame2_ptr,
+                                            int32_t strength_in,
+                                            int32_t filter_wt_in,
+                                            uint32_t *acc, uint16_t *cnt)
+{
+    uint32_t row;
+    uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
+    v16i8 frame1 = { 0 };
+    v16i8 frame2 = { 0 };
+    v16i8 frame3 = { 0 };
+    v16i8 frame4 = { 0 };
+    v16u8 frame_l, frame_h;
+    v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
+    v8i16 diff0, diff1, cnt0, cnt1;
+    v4i32 const3, const16;
+    v4i32 filter_wt, strength;
+    v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+    v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+    v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
+    v4i32 acc0, acc1, acc2, acc3;
+    /* 8-wide kernel: updates cnt[] and acc[] from the squared frame1/frame2 pixel difference. */
+    filter_wt = __msa_fill_w(filter_wt_in);
+    strength = __msa_fill_w(strength_in);
+    const3 = __msa_ldi_w(3);
+    const16 = __msa_ldi_w(16);
+    /* Four 8-pixel rows per iteration, so the 2 iterations cover 8 rows. */
+    for (row = 2; row--;)
+    {
+        LD2(frame1_ptr, stride, f0, f1);  /* frame1 rows are stride bytes apart */
+        frame1_ptr += (2 * stride);
+        LD2(frame2_ptr, 8, f2, f3);       /* frame2 rows are read back to back */
+        frame2_ptr += 16;
+        LD2(frame1_ptr, stride, f4, f5);
+        frame1_ptr += (2 * stride);
+        LD2(frame2_ptr, 8, f6, f7);
+        frame2_ptr += 16;
+        /* Pack two 8-byte rows into each 16-byte vector, then process rows 0-1. */
+        LD_SW2(acc, 4, acc0, acc1);
+        LD_SW2(acc + 8, 4, acc2, acc3);
+        LD_SH2(cnt, 8, cnt0, cnt1);
+        INSERT_D2_SB(f0, f1, frame1);
+        INSERT_D2_SB(f2, f3, frame2);
+        INSERT_D2_SB(f4, f5, frame3);
+        INSERT_D2_SB(f6, f7, frame4);
+        ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
+        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);  /* diff * diff */
+        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
+             mod0_w, mod1_w, mod2_w, mod3_w);  /* ... * 3 */
+        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+        diff0_r = (mod0_w < const16);  /* lane mask: set where mod < 16 */
+        diff0_l = (mod1_w < const16);
+        diff1_r = (mod2_w < const16);
+        diff1_l = (mod3_w < const16);
+        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+             mod0_w, mod1_w, mod2_w, mod3_w);  /* 16 - mod */
+        mod0_w = diff0_r & mod0_w;  /* lanes with mod >= 16 become 0 */
+        mod1_w = diff0_l & mod1_w;
+        mod2_w = diff1_r & mod2_w;
+        mod3_w = diff1_l & mod3_w;
+        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);  /* cnt += modifier */
+        ST_SH2(mod0_h, mod1_h, cnt, 8);
+        cnt += 16;
+        /* acc += modifier * frame2 pixel for rows 0-1. */
+        UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
+        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
+             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
+        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        ST_SW2(mod0_w, mod1_w, acc, 4);
+        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+        acc += 16;
+        /* Same sequence again for rows 2-3 (frame3 / frame4). */
+        LD_SW2(acc, 4, acc0, acc1);
+        LD_SW2(acc + 8, 4, acc2, acc3);
+        LD_SH2(cnt, 8, cnt0, cnt1);
+        ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
+        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
+        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+        diff0_r = (mod0_w < const16);
+        diff0_l = (mod1_w < const16);
+        diff1_r = (mod2_w < const16);
+        diff1_l = (mod3_w < const16);
+        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        mod0_w = diff0_r & mod0_w;
+        mod1_w = diff0_l & mod1_w;
+        mod2_w = diff1_r & mod2_w;
+        mod3_w = diff1_l & mod3_w;
+        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
+             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
+        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+        ST_SH2(mod0_h, mod1_h, cnt, 8);
+        cnt += 16;
+        /* acc += modifier * frame2 pixel for rows 2-3. */
+        UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
+        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
+        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
+        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
+             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
+        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+             mod0_w, mod1_w, mod2_w, mod3_w);
+        ST_SW2(mod0_w, mod1_w, acc, 4);
+        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
+        acc += 16;
+    }
+}
+
+void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride,
+                                   uint8_t *frame2, uint32_t block_size,
+                                   int32_t strength,  int32_t filter_weight,
+                                   uint32_t *accumulator, uint16_t *count)
+{
+    /* Dispatch on block size: 8 and 16 use the dedicated MSA kernels,
+     * every other size is handled by the scalar loop at the bottom. */
+    if (8 == block_size)
+    {
+        temporal_filter_apply_8size_msa(frame1, stride, frame2, strength,
+                                        filter_weight, accumulator, count);
+        return;
+    }
+    if (16 == block_size)
+    {
+        temporal_filter_apply_16size_msa(frame1, stride, frame2, strength,
+                                         filter_weight, accumulator, count);
+        return;
+    }
+
+    {
+        /* Rounding term for the shift by strength; 0 if strength <= 0. */
+        const int32_t rounding = strength > 0 ? 1 << (strength - 1) : 0;
+        int32_t src_offset = 0;  /* index into frame1, which is strided */
+        uint32_t out_idx = 0;    /* index into count[] and accumulator[] */
+        uint32_t row, col;
+
+        for (row = 0; row < block_size; ++row)
+        {
+            for (col = 0; col < block_size; ++col)
+            {
+                const int src_byte = frame1[src_offset++];
+                const int pixel_value = *frame2++;
+                const int diff = src_byte - pixel_value;
+                int32_t modifier = (diff * diff * 3 + rounding) >> strength;
+
+                /* Clamp to 16, then invert so that a larger difference
+                 * yields a smaller weight. */
+                modifier = 16 - (modifier > 16 ? 16 : modifier);
+                modifier *= filter_weight;
+
+                count[out_idx] += modifier;
+                accumulator[out_idx] += modifier * pixel_value;
+                ++out_idx;
+            }
+
+            /* frame2 is read contiguously; only frame1 uses the stride. */
+            src_offset += stride - block_size;
+        }
+    }
+}
diff --git a/libs/libvpx/vp8/encoder/modecosts.c b/libs/libvpx/vp8/encoder/modecosts.c
new file mode 100644
index 0000000000..ad0e9308dc
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/modecosts.c
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/blockd.h"
+#include "modecosts.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "vp8/common/entropymode.h"
+
+
+void vp8_init_mode_costs(VP8_COMP *c)
+{
+    struct rd_costs_struct *const costs = &c->rd_costs;
+    VP8_COMMON *const cm = &c->common;
+    int ctx0, ctx1;
+
+    /* bmode_costs: one token-cost table per [ctx0][ctx1] pair, built from
+     * the matching vp8_kf_bmode_prob entry and the bmode tree. */
+    for (ctx0 = 0; ctx0 < VP8_BINTRAMODES; ++ctx0)
+    {
+        for (ctx1 = 0; ctx1 < VP8_BINTRAMODES; ++ctx1)
+        {
+            vp8_cost_tokens(costs->bmode_costs[ctx0][ctx1],
+                            vp8_kf_bmode_prob[ctx0][ctx1], vp8_bmode_tree);
+        }
+    }
+
+    /* NOTE(review): inter_bmode_costs is written twice in a row, first
+     * from bmode_prob and then from sub_mv_ref_prob. Both calls are kept
+     * exactly as they were; confirm whether the second destination is
+     * intended. */
+    vp8_cost_tokens(costs->inter_bmode_costs, cm->fc.bmode_prob,
+                    vp8_bmode_tree);
+    vp8_cost_tokens(costs->inter_bmode_costs, cm->fc.sub_mv_ref_prob,
+                    vp8_sub_mv_ref_tree);
+
+    /* For both tables below, index 1 is filled from the frame context
+     * (cm->fc) and index 0 from the vp8_kf_* probability tables. */
+    vp8_cost_tokens(costs->mbmode_cost[1], cm->fc.ymode_prob, vp8_ymode_tree);
+    vp8_cost_tokens(costs->mbmode_cost[0], vp8_kf_ymode_prob,
+                    vp8_kf_ymode_tree);
+
+    vp8_cost_tokens(costs->intra_uv_mode_cost[1], cm->fc.uv_mode_prob,
+                    vp8_uv_mode_tree);
+    vp8_cost_tokens(costs->intra_uv_mode_cost[0], vp8_kf_uv_mode_prob,
+                    vp8_uv_mode_tree);
+}
diff --git a/libs/libvpx/vp8/encoder/modecosts.h b/libs/libvpx/vp8/encoder/modecosts.h
new file mode 100644
index 0000000000..9871bfffdf
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/modecosts.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_MODECOSTS_H_
+#define VP8_ENCODER_MODECOSTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+
+void vp8_init_mode_costs(struct VP8_COMP *x);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_MODECOSTS_H_
diff --git a/libs/libvpx/vp8/encoder/mr_dissim.c b/libs/libvpx/vp8/encoder/mr_dissim.c
new file mode 100644
index 0000000000..886cba2fd5
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/mr_dissim.c
@@ -0,0 +1,240 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include "vpx_config.h"
+#include "onyx_int.h"
+#include "mr_dissim.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "rdopt.h"
+#include "vp8/common/common.h"
+
+void vp8_cal_low_res_mb_cols(VP8_COMP *cpi)
+{
+    /* Support arbitrary down-sampling factor: the low-res width is
+     * Width * den / num rounded up, then rounded up to whole 16-pixel MBs. */
+    const unsigned int padded_w =
+        cpi->oxcf.Width*cpi->oxcf.mr_down_sampling_factor.den
+        + cpi->oxcf.mr_down_sampling_factor.num - 1;
+    const int low_res_w = padded_w/cpi->oxcf.mr_down_sampling_factor.num;
+
+    cpi->mr_low_res_mb_cols = (low_res_w + 15) >> 4;
+}
+
+#define GET_MV(x)    /* append x's MV to mvx[]/mvy[] unless x is intra */ \
+if(x->mbmi.ref_frame !=INTRA_FRAME)   \
+{   \
+    mvx[cnt] = x->mbmi.mv.as_mv.row;  /* note: mvx takes the row part */ \
+    mvy[cnt] = x->mbmi.mv.as_mv.col;  /* and mvy the col part */ \
+    cnt++;    \
+}
+/* Both macros are bare statements that use mvx, mvy and cnt (and, below, cm and tmp) from the caller. */
+#define GET_MV_SIGN(x)    /* as GET_MV, negated when sign biases differ */ \
+if(x->mbmi.ref_frame !=INTRA_FRAME)   \
+{   \
+    mvx[cnt] = x->mbmi.mv.as_mv.row;  \
+    mvy[cnt] = x->mbmi.mv.as_mv.col;  \
+    if (cm->ref_frame_sign_bias[x->mbmi.ref_frame]  \
+        != cm->ref_frame_sign_bias[tmp->mbmi.ref_frame])  \
+    {  \
+        mvx[cnt] *= -1;   \
+        mvy[cnt] *= -1;   \
+    }  \
+    cnt++;  \
+}
+
+void vp8_cal_dissimilarity(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+    int i;
+    /* Stores this frame's per-MB mode, ref frame, MV and dissimilarity for the next resolution. */
+    /* Note: The first row & first column in mip are outside the frame, which
+     * were initialized to all 0.(ref_frame, mode, mv...)
+     * Their ref_frame = 0 means they won't be counted in the following
+     * calculation.
+     */
+    if (cpi->oxcf.mr_total_resolutions >1
+        && cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1))
+    {
+        /* Store info for show/no-show frames for supporting alt_ref.
+         * If parent frame is alt_ref, child has one too.
+         */
+        LOWER_RES_FRAME_INFO* store_info
+                      = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+
+        store_info->frame_type = cm->frame_type;
+
+        if(cm->frame_type != KEY_FRAME)
+        {
+            store_info->is_frame_dropped = 0;
+            for (i = 1; i < MAX_REF_FRAMES; i++)
+                store_info->low_res_ref_frames[i] = cpi->current_ref_frames[i];
+        }
+        /* Same test as above; this part walks every macroblock of the frame. */
+        if(cm->frame_type != KEY_FRAME)
+        {
+            int mb_row;
+            int mb_col;
+            /* Point to beginning of allocated MODE_INFO arrays. */
+            MODE_INFO *tmp = cm->mip + cm->mode_info_stride;
+            LOWER_RES_MB_INFO* store_mode_info = store_info->mb_info;
+
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+            {
+                tmp++;  /* presumably skips the out-of-frame first-column entry */
+                for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
+                {
+                    int dissim = INT_MAX;  /* kept for intra MBs and MBs with no inter neighbour */
+
+                    if(tmp->mbmi.ref_frame !=INTRA_FRAME)
+                    {
+                        int              mvx[8];  /* at most 8 neighbours are gathered */
+                        int              mvy[8];
+                        int              mmvx;
+                        int              mmvy;
+                        int              cnt=0;
+                        const MODE_INFO *here = tmp;
+                        const MODE_INFO *above = here - cm->mode_info_stride;
+                        const MODE_INFO *left = here - 1;
+                        const MODE_INFO *aboveleft = above - 1;
+                        const MODE_INFO *aboveright = NULL;
+                        const MODE_INFO *right = NULL;
+                        const MODE_INFO *belowleft = NULL;
+                        const MODE_INFO *below = NULL;
+                        const MODE_INFO *belowright = NULL;
+
+                        /* If alternate reference frame is used, we have to
+                         * check sign of MV. */
+                        if(cpi->oxcf.play_alternate)
+                        {
+                            /* Gather mv of neighboring MBs */
+                            GET_MV_SIGN(above)
+                            GET_MV_SIGN(left)
+                            GET_MV_SIGN(aboveleft)
+
+                            if(mb_col < (cm->mb_cols-1))
+                            {
+                                right = here + 1;
+                                aboveright = above + 1;
+                                GET_MV_SIGN(right)
+                                GET_MV_SIGN(aboveright)
+                            }
+
+                            if(mb_row < (cm->mb_rows-1))
+                            {
+                                below = here + cm->mode_info_stride;
+                                belowleft = below - 1;
+                                GET_MV_SIGN(below)
+                                GET_MV_SIGN(belowleft)
+                            }
+
+                            if(mb_col < (cm->mb_cols-1)
+                                && mb_row < (cm->mb_rows-1))
+                            {
+                                belowright = below + 1;
+                                GET_MV_SIGN(belowright)
+                            }
+                        }else
+                        {
+                            /* No alt_ref and gather mv of neighboring MBs */
+                            GET_MV(above)
+                            GET_MV(left)
+                            GET_MV(aboveleft)
+
+                            if(mb_col < (cm->mb_cols-1))
+                            {
+                                right = here + 1;
+                                aboveright = above + 1;
+                                GET_MV(right)
+                                GET_MV(aboveright)
+                            }
+
+                            if(mb_row < (cm->mb_rows-1))
+                            {
+                                below = here + cm->mode_info_stride;
+                                belowleft = below - 1;
+                                GET_MV(below)
+                                GET_MV(belowleft)
+                            }
+
+                            if(mb_col < (cm->mb_cols-1)
+                                && mb_row < (cm->mb_rows-1))
+                            {
+                                belowright = below + 1;
+                                GET_MV(belowright)
+                            }
+                        }
+                        /* dissim = largest gap between this MB's MV and the neighbours' min/max. */
+                        if (cnt > 0)
+                        {
+                            int max_mvx = mvx[0];
+                            int min_mvx = mvx[0];
+                            int max_mvy = mvy[0];
+                            int min_mvy = mvy[0];
+                            int i;  /* shadows the function-level i */
+
+                            if (cnt > 1)
+                            {
+                                for (i=1; i< cnt; i++)
+                                {
+                                    if (mvx[i] > max_mvx) max_mvx = mvx[i];
+                                    else if (mvx[i] < min_mvx) min_mvx = mvx[i];
+                                    if (mvy[i] > max_mvy) max_mvy = mvy[i];
+                                    else if (mvy[i] < min_mvy) min_mvy = mvy[i];
+                                }
+                            }
+
+                            mmvx = VPXMAX(
+                                abs(min_mvx - here->mbmi.mv.as_mv.row),
+                                abs(max_mvx - here->mbmi.mv.as_mv.row));
+                            mmvy = VPXMAX(
+                                abs(min_mvy - here->mbmi.mv.as_mv.col),
+                                abs(max_mvy - here->mbmi.mv.as_mv.col));
+                            dissim = VPXMAX(mmvx, mmvy);
+                        }
+                    }
+
+                    /* Store mode info for next resolution encoding */
+                    store_mode_info->mode = tmp->mbmi.mode;
+                    store_mode_info->ref_frame = tmp->mbmi.ref_frame;
+                    store_mode_info->mv.as_int = tmp->mbmi.mv.as_int;
+                    store_mode_info->dissim = dissim;
+                    tmp++;
+                    store_mode_info++;
+                }
+            }
+        }
+    }
+}
+
+/* This function is called only when this frame is dropped at current
+   resolution level. */
+void vp8_store_drop_frame_info(VP8_COMP *cpi)
+{
+    LOWER_RES_FRAME_INFO *store_info;
+
+    /* Nothing to record unless more than one resolution is being encoded
+     * and this encoder's id is below the last one, i.e. there is a higher
+     * resolution level to pass the information to. */
+    if (cpi->oxcf.mr_total_resolutions <= 1
+        || cpi->oxcf.mr_encoder_id >= (cpi->oxcf.mr_total_resolutions - 1))
+        return;
+
+    /* A dropped frame has no mode & motion info available, so flag it for
+     * the higher resolution level. Stored for show/no-show frames alike to
+     * support alt_ref: if the parent frame is alt_ref, the child has one. */
+    store_info = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+
+    /* Set frame_type to be INTER_FRAME since we won't drop key frame. */
+    store_info->frame_type = INTER_FRAME;
+    store_info->is_frame_dropped = 1;
+}
diff --git a/libs/libvpx/vp8/encoder/mr_dissim.h b/libs/libvpx/vp8/encoder/mr_dissim.h
new file mode 100644
index 0000000000..5a59ce62a6
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/mr_dissim.h
@@ -0,0 +1,28 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_MR_DISSIM_H_
+#define VP8_ENCODER_MR_DISSIM_H_
+#include "vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi);
+extern void vp8_cal_dissimilarity(VP8_COMP *cpi);  /* stores mode info for the next resolution */
+extern void vp8_store_drop_frame_info(VP8_COMP *cpi);  /* call only when the frame is dropped */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_MR_DISSIM_H_
diff --git a/libs/libvpx/vp8/encoder/onyx_if.c b/libs/libvpx/vp8/encoder/onyx_if.c
new file mode 100644
index 0000000000..5a4b37dcff
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/onyx_if.c
@@ -0,0 +1,6004 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
+#include "vp8/common/systemdependent.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "vpx/internal/vpx_psnr.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vp8/common/extend.h"
+#include "ratectrl.h"
+#include "vp8/common/quant_common.h"
+#include "segmentation.h"
+#if CONFIG_POSTPROC
+#include "vp8/common/postproc.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/threading.h"
+#include "vpx_ports/vpx_timer.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+#if CONFIG_MULTI_RES_ENCODING
+#include "mr_dissim.h"
+#endif
+#include "encodeframe.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <limits.h>
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+extern int vp8_update_coef_context(VP8_COMP *cpi);
+extern void vp8_update_coef_probs(VP8_COMP *cpi);
+#endif
+
+extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
+extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val);
+extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
+
+extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag);
+extern void print_parms(VP8_CONFIG *ocf, char *filenam);
+extern unsigned int vp8_get_processor_freq();
+extern void print_tree_update_probs();
+extern int vp8cx_create_encoder_threads(VP8_COMP *cpi);
+extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi);
+
+int vp8_estimate_entropy_savings(VP8_COMP *cpi);
+
+int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
+
+extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance);
+
+static void set_default_lf_deltas(VP8_COMP *cpi);
+
+extern const int vp8_gf_interval_table[101];
+
+#if CONFIG_INTERNAL_STATS
+#include "math.h"
+#include "vpx_dsp/ssim.h"
+#endif
+
+
+#ifdef OUTPUT_YUV_SRC
+FILE *yuv_file;
+#endif
+#ifdef OUTPUT_YUV_DENOISED
+FILE *yuv_denoised_file;
+#endif
+
+#if 0
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+#if 0
+extern int skip_true_count;
+extern int skip_false_count;
+#endif
+
+
+#ifdef VP8_ENTROPY_STATS
+extern int intra_mode_stats[10][10][10];
+#endif
+
+#ifdef SPEEDSTATS
+unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int tot_pm = 0;
+unsigned int cnt_pm = 0;
+unsigned int tot_ef = 0;
+unsigned int cnt_ef = 0;
+#endif
+
+#ifdef MODE_STATS
+extern unsigned __int64 Sectionbits[50];
+extern int y_modes[5]  ;
+extern int uv_modes[4] ;
+extern int b_modes[10]  ;
+
+extern int inter_y_modes[10] ;
+extern int inter_uv_modes[4] ;
+extern unsigned int inter_b_modes[15];
+#endif
+
+extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
+
+extern const int qrounding_factors[129];
+extern const int qzbin_factors[129];
+extern void vp8cx_init_quantizer(VP8_COMP *cpi);
+extern const int vp8cx_base_skip_false_prob[128];
+
+/* Tables relating active max Q to active min Q (QINDEX_RANGE entries each) */
+static const unsigned char kf_low_motion_minq[QINDEX_RANGE] =
+{   /* entries rise from 0 to 23 */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
+    3,3,3,3,3,3,4,4,4,5,5,5,5,5,6,6,
+    6,6,7,7,8,8,8,8,9,9,10,10,10,10,11,11,
+    11,11,12,12,13,13,13,13,14,14,15,15,15,15,16,16,
+    16,16,17,17,18,18,18,18,19,20,20,21,21,22,23,23
+};
+static const unsigned char kf_high_motion_minq[QINDEX_RANGE] =
+{   /* entries rise from 0 to 30 */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,2,2,2,2,3,3,3,3,
+    3,3,3,3,4,4,4,4,5,5,5,5,5,5,6,6,
+    6,6,7,7,8,8,8,8,9,9,10,10,10,10,11,11,
+    11,11,12,12,13,13,13,13,14,14,15,15,15,15,16,16,
+    16,16,17,17,18,18,18,18,19,19,20,20,20,20,21,21,
+    21,21,22,22,23,23,24,25,25,26,26,27,28,28,29,30
+};
+static const unsigned char gf_low_motion_minq[QINDEX_RANGE] =
+{   /* entries rise from 0 to 58 */
+    0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
+    3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,
+    7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,
+    11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,
+    19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,
+    27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,
+    35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,
+    43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
+};
+static const unsigned char gf_mid_motion_minq[QINDEX_RANGE] =
+{   /* entries rise from 0 to 64 */
+    0,0,0,0,1,1,1,1,1,1,2,2,3,3,3,4,
+    4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
+    9,10,10,10,10,11,11,11,12,12,12,12,13,13,13,14,
+    14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,
+    22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,
+    30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,
+    38,39,39,40,40,41,41,42,42,43,43,44,45,46,47,48,
+    49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64
+};
+static const unsigned char gf_high_motion_minq[QINDEX_RANGE] =
+{   /* entries rise from 0 to 80 */
+    0,0,0,0,1,1,1,1,1,2,2,2,3,3,3,4,
+    4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
+    9,10,10,10,11,11,12,12,13,13,14,14,15,15,16,16,
+    17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,
+    25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,
+    33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,
+    41,41,42,42,43,44,45,46,47,48,49,50,51,52,53,54,
+    55,56,57,58,59,60,62,64,66,68,70,72,74,76,78,80
+};
+static const unsigned char inter_minq[QINDEX_RANGE] =
+{   /* entries rise from 0 to 100 */
+    0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
+    9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
+    20,21,22,22,23,24,24,25,26,27,27,28,29,30,30,31,
+    32,33,33,34,35,36,36,37,38,39,39,40,41,42,42,43,
+    44,45,46,46,47,48,49,50,50,51,52,53,54,55,55,56,
+    57,58,59,60,60,61,62,63,64,65,66,67,67,68,69,70,
+    71,72,73,74,75,75,76,77,78,79,80,81,82,83,84,85,
+    86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
+};
+
+#ifdef PACKET_TESTING
+extern FILE *vpxlogc;
+#endif
+
+static void save_layer_context(VP8_COMP *cpi)
+{
+    LAYER_CONTEXT *const saved = &cpi->layer_context[cpi->current_layer];
+
+    /* Snapshot the layer dependent coding state of the current layer */
+    saved->target_bandwidth = cpi->target_bandwidth;
+    saved->starting_buffer_level = cpi->oxcf.starting_buffer_level;
+    saved->optimal_buffer_level = cpi->oxcf.optimal_buffer_level;
+    saved->maximum_buffer_size = cpi->oxcf.maximum_buffer_size;
+    saved->starting_buffer_level_in_ms = cpi->oxcf.starting_buffer_level_in_ms;
+    saved->optimal_buffer_level_in_ms = cpi->oxcf.optimal_buffer_level_in_ms;
+    saved->maximum_buffer_size_in_ms = cpi->oxcf.maximum_buffer_size_in_ms;
+    saved->buffer_level = cpi->buffer_level;
+    saved->bits_off_target = cpi->bits_off_target;
+    saved->total_actual_bits = cpi->total_actual_bits;
+    saved->worst_quality = cpi->worst_quality;
+    saved->active_worst_quality = cpi->active_worst_quality;
+    saved->best_quality = cpi->best_quality;
+    saved->active_best_quality = cpi->active_best_quality;
+    saved->ni_av_qi = cpi->ni_av_qi;
+    saved->ni_tot_qi = cpi->ni_tot_qi;
+    saved->ni_frames = cpi->ni_frames;
+    saved->avg_frame_qindex = cpi->avg_frame_qindex;
+    saved->rate_correction_factor = cpi->rate_correction_factor;
+    saved->key_frame_rate_correction_factor = cpi->key_frame_rate_correction_factor;
+    saved->gf_rate_correction_factor = cpi->gf_rate_correction_factor;
+    saved->zbin_over_quant = cpi->mb.zbin_over_quant;
+    saved->inter_frame_target = cpi->inter_frame_target;
+    saved->total_byte_count = cpi->total_byte_count;
+    saved->filter_level = cpi->common.filter_level;
+
+    saved->last_frame_percent_intra = cpi->last_frame_percent_intra;
+
+    memcpy(saved->count_mb_ref_frame_usage,
+           cpi->mb.count_mb_ref_frame_usage,
+           sizeof(cpi->mb.count_mb_ref_frame_usage));
+}
+
+static void restore_layer_context(VP8_COMP *cpi, const int layer)
+{
+    LAYER_CONTEXT *const saved = &cpi->layer_context[layer];
+
+    /* Make |layer| current and load its layer dependent coding state */
+    cpi->current_layer = layer;
+    cpi->target_bandwidth = saved->target_bandwidth;
+    cpi->oxcf.target_bandwidth = saved->target_bandwidth;
+    cpi->oxcf.starting_buffer_level = saved->starting_buffer_level;
+    cpi->oxcf.optimal_buffer_level = saved->optimal_buffer_level;
+    cpi->oxcf.maximum_buffer_size = saved->maximum_buffer_size;
+    cpi->oxcf.starting_buffer_level_in_ms = saved->starting_buffer_level_in_ms;
+    cpi->oxcf.optimal_buffer_level_in_ms = saved->optimal_buffer_level_in_ms;
+    cpi->oxcf.maximum_buffer_size_in_ms = saved->maximum_buffer_size_in_ms;
+    cpi->buffer_level = saved->buffer_level;
+    cpi->bits_off_target = saved->bits_off_target;
+    cpi->total_actual_bits = saved->total_actual_bits;
+    cpi->active_worst_quality = saved->active_worst_quality;
+    cpi->active_best_quality = saved->active_best_quality;
+    cpi->ni_av_qi = saved->ni_av_qi;
+    cpi->ni_tot_qi = saved->ni_tot_qi;
+    cpi->ni_frames = saved->ni_frames;
+    cpi->avg_frame_qindex = saved->avg_frame_qindex;
+    cpi->rate_correction_factor = saved->rate_correction_factor;
+    cpi->key_frame_rate_correction_factor = saved->key_frame_rate_correction_factor;
+    cpi->gf_rate_correction_factor = saved->gf_rate_correction_factor;
+    cpi->mb.zbin_over_quant = saved->zbin_over_quant;
+    cpi->inter_frame_target = saved->inter_frame_target;
+    cpi->total_byte_count = saved->total_byte_count;
+    cpi->common.filter_level = saved->filter_level;
+
+    cpi->last_frame_percent_intra = saved->last_frame_percent_intra;
+
+    memcpy(cpi->mb.count_mb_ref_frame_usage,
+           saved->count_mb_ref_frame_usage,
+           sizeof(cpi->mb.count_mb_ref_frame_usage));
+}
+
+static int rescale(int val, int num, int denom)
+{
+    /* Multiply in 64 bits, divide, then narrow back to int: val*num/denom */
+    const int64_t product = (int64_t)val * (int64_t)num;
+    const int64_t divisor = denom;
+
+    return (int)(product / divisor);
+}
+
+static void init_temporal_layer_context(VP8_COMP *cpi,
+                                        VP8_CONFIG *oxcf,
+                                        const int layer,
+                                        double prev_layer_framerate)
+{
+    LAYER_CONTEXT *const ctx = &cpi->layer_context[layer];
+
+    /* Per-layer frame rate and target bandwidth (target_bitrate * 1000) */
+    ctx->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer];
+    ctx->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;
+
+    /* Keep the configured buffer values in the "_in_ms" fields ... */
+    ctx->starting_buffer_level_in_ms = oxcf->starting_buffer_level;
+    ctx->optimal_buffer_level_in_ms = oxcf->optimal_buffer_level;
+    ctx->maximum_buffer_size_in_ms = oxcf->maximum_buffer_size;
+
+    /* ... and scale them by target_bandwidth / 1000 for the working copies;
+     * a zero optimal/maximum setting selects target_bandwidth / 8 instead. */
+    ctx->starting_buffer_level = rescale((int)(oxcf->starting_buffer_level),
+                                         ctx->target_bandwidth, 1000);
+
+    if (oxcf->optimal_buffer_level != 0)
+        ctx->optimal_buffer_level = rescale((int)(oxcf->optimal_buffer_level),
+                                            ctx->target_bandwidth, 1000);
+    else
+        ctx->optimal_buffer_level = ctx->target_bandwidth / 8;
+
+    if (oxcf->maximum_buffer_size != 0)
+        ctx->maximum_buffer_size = rescale((int)(oxcf->maximum_buffer_size),
+                                           ctx->target_bandwidth, 1000);
+    else
+        ctx->maximum_buffer_size = ctx->target_bandwidth / 8;
+
+    /* Work out the average size of a frame within this layer */
+    if (layer > 0)
+        ctx->avg_frame_size_for_layer =
+            (int)((cpi->oxcf.target_bitrate[layer] -
+                   cpi->oxcf.target_bitrate[layer-1]) * 1000 /
+                  (ctx->framerate - prev_layer_framerate));
+
+    /* Quality bounds, buffer state and rate control counters start fresh */
+    ctx->active_worst_quality = cpi->oxcf.worst_allowed_q;
+    ctx->active_best_quality = cpi->oxcf.best_allowed_q;
+    ctx->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
+    ctx->buffer_level = ctx->starting_buffer_level;
+    ctx->bits_off_target = ctx->starting_buffer_level;
+    ctx->total_actual_bits = 0;
+    ctx->ni_av_qi = 0;
+    ctx->ni_tot_qi = 0;
+    ctx->ni_frames = 0;
+    ctx->rate_correction_factor = 1.0;
+    ctx->key_frame_rate_correction_factor = 1.0;
+    ctx->gf_rate_correction_factor = 1.0;
+    ctx->inter_frame_target = 0;
+}
+
+// Upon a run-time change in temporal layers, reset the layer context parameters
+// for any "new" layers. For "existing" layers, let them inherit the parameters
+// from the previous layer state (at the same layer #). In future we may want
+// to better map the previous layer state(s) to the "new" ones.
+static void reset_temporal_layer_change(VP8_COMP *cpi,
+                                        VP8_CONFIG *oxcf,
+                                        const int prev_num_layers)
+{
+    int i;
+    double prev_layer_framerate = 0;
+    const int curr_num_layers = cpi->oxcf.number_of_layers;
+    // If the previous state was 1 layer, get current layer context from cpi.
+    // We need this to set the layer context for the new layers below.
+    if (prev_num_layers == 1)
+    {
+        cpi->current_layer = 0;  // save_layer_context() writes to this layer
+        save_layer_context(cpi);
+    }
+    for (i = 0; i < curr_num_layers; i++)
+    {
+        LAYER_CONTEXT *lc = &cpi->layer_context[i];
+        if (i >= prev_num_layers)  // layer did not exist before the change
+        {
+           init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
+        }
+        // The initial buffer levels are set based on their starting levels.
+        // We could set the buffer levels based on the previous state (normalized
+        // properly by the layer bandwidths) but we would need to keep track of
+        // the previous set of layer bandwidths (i.e., target_bitrate[i])
+        // before the layer change. For now, reset to the starting levels.
+        lc->buffer_level = cpi->oxcf.starting_buffer_level_in_ms *
+                           cpi->oxcf.target_bitrate[i];
+        lc->bits_off_target = lc->buffer_level;
+        // TODO(marpan): Should we set the rate_correction_factor and
+        // active_worst/best_quality to values derived from the previous layer
+        // state (to smooth-out quality dips/rate fluctuation at transition)?
+
+        // We need to treat the 1 layer case separately: oxcf.target_bitrate[i]
+        // is not set for 1 layer, and the restore_layer_context/save_context()
+        // are not called in the encoding loop, so we need to call it here to
+        // pass the layer context state to |cpi|.
+        if (curr_num_layers == 1)
+        {
+            lc->target_bandwidth = cpi->oxcf.target_bandwidth;
+            lc->buffer_level = cpi->oxcf.starting_buffer_level_in_ms *
+                               lc->target_bandwidth  / 1000;
+            lc->bits_off_target = lc->buffer_level;
+            restore_layer_context(cpi, 0);
+        }
+        prev_layer_framerate = cpi->output_framerate /
+                               cpi->oxcf.rate_decimator[i];  // input for the next layer's init
+    }
+}
+
+/* Reset the segmentation update flags and the loop filter delta state, then
+ * apply the default loop filter deltas via set_default_lf_deltas(). */
+static void setup_features(VP8_COMP *cpi)
+{
+    // If segmentation enabled set the update flags
+    if ( cpi->mb.e_mbd.segmentation_enabled )
+    {
+        cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+        cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+    }
+    else
+    {
+        cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+        cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+    }
+
+    cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 0;
+    cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
+    memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+    memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+    memset(cpi->mb.e_mbd.last_ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.last_ref_lf_deltas));  /* sized by destination */
+    memset(cpi->mb.e_mbd.last_mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.last_mode_lf_deltas));  /* sized by destination */
+    set_default_lf_deltas(cpi);  /* sets the delta enable/update flags back to 1 */
+}
+
+
+static void dealloc_raw_frame_buffers(VP8_COMP *cpi);
+
+void vp8_initialize_enc(void)
+{
+    /* Set once the setup below has run; later calls return immediately. */
+    static volatile int init_done = 0;
+    if (init_done)
+        return;
+
+    vpx_dsp_rtcd();
+    vp8_init_intra_predictors();
+    init_done = 1;
+}
+
+static void dealloc_compressor_data(VP8_COMP *cpi)
+{
+    vpx_free(cpi->tplist);
+    cpi->tplist = NULL;
+
+    /* Last frame MV storage buffers */
+    vpx_free(cpi->lfmv);
+    cpi->lfmv = NULL;
+
+    vpx_free(cpi->lf_ref_frame_sign_bias);
+    cpi->lf_ref_frame_sign_bias = NULL;
+
+    vpx_free(cpi->lf_ref_frame);
+    cpi->lf_ref_frame = NULL;
+
+    /* Segmentation map */
+    vpx_free(cpi->segmentation_map);
+    cpi->segmentation_map = NULL;
+
+    vpx_free(cpi->active_map);
+    cpi->active_map = NULL;
+
+    vp8_de_alloc_frame_buffers(&cpi->common);
+
+    vp8_yv12_de_alloc_frame_buffer(&cpi->pick_lf_lvl_frame);
+    vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
+    dealloc_raw_frame_buffers(cpi);
+
+    vpx_free(cpi->tok);
+    cpi->tok = NULL;
+
+    /* Structure used to monitor GF usage */
+    vpx_free(cpi->gf_active_flags);
+    cpi->gf_active_flags = NULL;
+
+    /* Activity mask based per mb zbin adjustments */
+    vpx_free(cpi->mb_activity_map);
+    cpi->mb_activity_map = NULL;
+
+    vpx_free(cpi->mb.pip);
+    cpi->mb.pip = NULL;
+
+#if CONFIG_MULTITHREAD
+    /* Destroy the mutexes (one per macroblock row), then free the array */
+    if (cpi->pmutex != NULL) {
+        VP8_COMMON *const cm = &cpi->common;
+        int row;
+
+        for (row = 0; row < cm->mb_rows; ++row) {
+            pthread_mutex_destroy(&cpi->pmutex[row]);
+        }
+        vpx_free(cpi->pmutex);
+        cpi->pmutex = NULL;
+    }
+
+    vpx_free(cpi->mt_current_mb_col);
+    cpi->mt_current_mb_col = NULL;
+#endif
+}
+
+static void enable_segmentation(VP8_COMP *cpi)
+{
+    /* Request a map and data update along with switching the feature on */
+    cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+    cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+    cpi->mb.e_mbd.segmentation_enabled = 1;
+}
+static void disable_segmentation(VP8_COMP *cpi)
+{
+    /* Clear the feature flag only; the update flags are left as they are */
+    cpi->mb.e_mbd.segmentation_enabled = 0;
+}
+
+/* Valid values for a segment are 0 to 3
+ * Segmentation map is arranged as [Rows][Columns]
+ */
+static void set_segmentation_map(VP8_COMP *cpi, unsigned char *segmentation_map)
+{
+    const int mbs_in_frame = cpi->common.mb_rows * cpi->common.mb_cols;
+
+    /* Take in the caller's map, then flag map and data for update. */
+    memcpy(cpi->segmentation_map, segmentation_map, mbs_in_frame);
+    cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+    cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+}
+
+/* Install per-segment feature values. They are either deltas (from the
+ * default value chosen for the frame) or absolute values:
+ *
+ *   abs_delta = SEGMENT_DELTADATA : values are deltas
+ *   abs_delta = SEGMENT_ABSDATA   : values are used as given
+ *
+ * Valid range for abs values is:
+ *    (0-127 for MB_LVL_ALT_Q), (0-63 for SEGMENT_ALT_LF)
+ * Valid range for delta values is:
+ *    (+/-127 for MB_LVL_ALT_Q), (+/-63 for SEGMENT_ALT_LF)
+ * sizeof(cpi->segment_feature_data) bytes are copied from feature_data.
+ */
+static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta)
+{
+    memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data));
+    cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta;
+}
+
+
+/* A simple function to cyclically refresh the background at a lower Q */
+static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
+{
+    unsigned char *seg_map = cpi->segmentation_map;
+    signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+    int i;
+    int block_count = cpi->cyclic_refresh_mode_max_mbs_perframe;
+    int mbs_in_frame = cpi->common.mb_rows * cpi->common.mb_cols;
+
+    cpi->cyclic_refresh_q = Q / 2;  /* segment 1 gets the delta (cyclic_refresh_q - Q) below */
+
+    if (cpi->oxcf.screen_content_mode) {
+      // Modify quality ramp-up based on Q. Above some Q level, increase the
+      // number of blocks to be refreshed, and reduce it below the threshold.
+      // Turn-off under certain conditions (i.e., away from key frame, and if
+      // we are at good quality (low Q) and most of the blocks were skipped-encoded
+      // in previous frame).
+      int qp_thresh = (cpi->oxcf.screen_content_mode == 2) ? 80 : 100;
+      if (Q >= qp_thresh) {
+        cpi->cyclic_refresh_mode_max_mbs_perframe =
+            (cpi->common.mb_rows * cpi->common.mb_cols) / 10;
+      } else if (cpi->frames_since_key > 250 &&
+                 Q < 20 &&
+                 cpi->mb.skip_true_count > (int)(0.95 * mbs_in_frame)) {
+        cpi->cyclic_refresh_mode_max_mbs_perframe = 0;
+      } else {
+        cpi->cyclic_refresh_mode_max_mbs_perframe =
+            (cpi->common.mb_rows * cpi->common.mb_cols) / 20;
+      }
+      block_count = cpi->cyclic_refresh_mode_max_mbs_perframe;
+    }
+
+    // Set every macroblock to be eligible for update.
+    // For key frame this will reset seg map to 0.
+    memset(cpi->segmentation_map, 0, mbs_in_frame);
+
+    if (cpi->common.frame_type != KEY_FRAME && block_count > 0)
+    {
+        /* Cycle through the macro_block rows */
+        /* MB loop to set local segmentation map */
+        i = cpi->cyclic_refresh_mode_index;
+        assert(i < mbs_in_frame);
+        do
+        {
+          /* If the MB is a candidate for clean up then mark it for
+           * possible boost/refresh (segment 1). The segment id may get
+           * reset to 0 later if the MB gets coded anything other than
+           * last frame 0,0 as only (last frame 0,0) MBs are eligible for
+           * refresh : that is to say Mbs likely to be background blocks.
+           */
+          if (cpi->cyclic_refresh_map[i] == 0)
+          {
+              seg_map[i] = 1;
+              block_count --;
+          }
+          else if (cpi->cyclic_refresh_map[i] < 0)
+              cpi->cyclic_refresh_map[i]++;
+
+          i++;
+          if (i == mbs_in_frame)
+              i = 0;
+
+        }
+        while(block_count && i != cpi->cyclic_refresh_mode_index);  /* quota used up, or one full lap */
+
+        cpi->cyclic_refresh_mode_index = i;  /* the next call resumes from here */
+
+#if CONFIG_TEMPORAL_DENOISING
+        if (cpi->oxcf.noise_sensitivity > 0) {
+          if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive &&
+              Q < (int)cpi->denoiser.denoise_pars.qp_thresh &&
+              (cpi->frames_since_key >
+               2 * cpi->denoiser.denoise_pars.consec_zerolast)) {
+            // Under aggressive denoising, use segmentation to turn off loop
+            // filter below some qp thresh. The filter is reduced for all
+            // blocks that have been encoded as ZEROMV LAST x frames in a row,
+            // where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
+            // This is to avoid "dot" artifacts that can occur from repeated
+            // loop filtering on noisy input source.
+            cpi->cyclic_refresh_q = Q;
+            // lf_adjustment = -MAX_LOOP_FILTER;
+            lf_adjustment = -40;
+            for (i = 0; i < mbs_in_frame; ++i) {
+              seg_map[i] = (cpi->consec_zero_last[i] >
+                            cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
+            }
+          }
+        }
+#endif
+    }
+
+    /* Activate segmentation. */
+    cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+    cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+    enable_segmentation(cpi);
+
+    /* Set up the quant segment data */
+    feature_data[MB_LVL_ALT_Q][0] = 0;
+    feature_data[MB_LVL_ALT_Q][1] = (cpi->cyclic_refresh_q - Q);
+    feature_data[MB_LVL_ALT_Q][2] = 0;
+    feature_data[MB_LVL_ALT_Q][3] = 0;
+
+    /* Set up the loop segment data */
+    feature_data[MB_LVL_ALT_LF][0] = 0;
+    feature_data[MB_LVL_ALT_LF][1] = lf_adjustment;
+    feature_data[MB_LVL_ALT_LF][2] = 0;
+    feature_data[MB_LVL_ALT_LF][3] = 0;
+
+    /* Initialise the feature data structure */
+    set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+}
+
+static void set_default_lf_deltas(VP8_COMP *cpi)
+{
+    /* Turn the mode/ref loop filter deltas on and flag them for update */
+    cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
+    cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
+
+    /* Clear both delta arrays before filling in the defaults */
+    memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+    memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+
+    /* Test of ref frame deltas */
+    cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
+    cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0;
+    cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
+    cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
+
+    /* Per-mode deltas; the Zero entry is -12 in real-time mode, else -2 */
+    cpi->mb.e_mbd.mode_lf_deltas[0] = 4;               /* BPRED */
+    cpi->mb.e_mbd.mode_lf_deltas[1] =                  /* Zero */
+        (cpi->oxcf.Mode == MODE_REALTIME) ? -12
+                                          : -2;
+    cpi->mb.e_mbd.mode_lf_deltas[2] = 2;               /* New mv */
+    cpi->mb.e_mbd.mode_lf_deltas[3] = 4;               /* Split mv */
+}
+
+/* Convenience macros for mapping speed and mode into a continuous
+ * range: GOOD(x) is x+1 and RT(x) is x+7 (argument parenthesised).
+ */
+#define GOOD(x) ((x) + 1)
+#define RT(x) ((x) + 7)
+
+static int speed_map(int speed, const int *map)
+{
+    /* map = {value0, limit0, value1, limit1, ..., valueN, limitN} */
+    int idx = 0;
+
+    /* Step to the next (value, limit) pair while speed has reached limit */
+    while (speed >= map[idx + 1])
+        idx += 2;
+    return map[idx];
+}
+
+static const int thresh_mult_map_znn[] = {  /* read by speed_map() */
+    /* map common to zero, nearest, and near: {value, speed limit, value, ...} */
+    0, GOOD(2), 1500, GOOD(3), 2000, RT(0), 1000, RT(2), 2000, INT_MAX
+};
+
+static const int thresh_mult_map_vhpred[] = {
+    1000, GOOD(2), 1500, GOOD(3), 2000, RT(0), 1000, RT(1), 2000,
+    RT(7), INT_MAX, INT_MAX
+};
+
+static const int thresh_mult_map_bpred[] = {
+    2000, GOOD(0), 2500, GOOD(2), 5000, GOOD(3), 7500, RT(0), 2500, RT(1), 5000,
+    RT(6), INT_MAX, INT_MAX
+};
+
+static const int thresh_mult_map_tm[] = {
+    1000, GOOD(2), 1500, GOOD(3), 2000, RT(0), 0, RT(1), 1000, RT(2), 2000,
+    RT(7), INT_MAX, INT_MAX
+};
+
+static const int thresh_mult_map_new1[] = {
+    1000, GOOD(2), 2000, RT(0), 2000, INT_MAX
+};
+
+static const int thresh_mult_map_new2[] = {
+    1000, GOOD(2), 2000, GOOD(3), 2500, GOOD(5), 4000, RT(0), 2000, RT(2), 2500,
+    RT(5), 4000, INT_MAX
+};
+
+static const int thresh_mult_map_split1[] = {
+    2500, GOOD(0), 1700, GOOD(2), 10000, GOOD(3), 25000, GOOD(4), INT_MAX,
+    RT(0), 5000, RT(1), 10000, RT(2), 25000, RT(3), INT_MAX, INT_MAX
+};
+
+static const int thresh_mult_map_split2[] = {
+    5000, GOOD(0), 4500, GOOD(2), 20000, GOOD(3), 50000, GOOD(4), INT_MAX,
+    RT(0), 10000, RT(1), 20000, RT(2), 50000, RT(3), INT_MAX, INT_MAX
+};
+
+static const int mode_check_freq_map_zn2[] = {  /* read by speed_map() */
+    /* {zero,nearest}{2,3}; laid out as {value, speed limit, value, ...} */
+    0, RT(10), 1<<1, RT(11), 1<<2, RT(12), 1<<3, INT_MAX
+};
+
+static const int mode_check_freq_map_vhbpred[] = {
+    0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(5), 4, INT_MAX
+};
+
+static const int mode_check_freq_map_near2[] = {
+    0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(10), 1<<2, RT(11), 1<<3, RT(12), 1<<4,
+    INT_MAX
+};
+
+static const int mode_check_freq_map_new1[] = {
+    0, RT(10), 1<<1, RT(11), 1<<2, RT(12), 1<<3, INT_MAX
+};
+
+static const int mode_check_freq_map_new2[] = {
+    0, GOOD(5), 4, RT(0), 0, RT(3), 4, RT(10), 1<<3, RT(11), 1<<4, RT(12), 1<<5,
+    INT_MAX
+};
+
+static const int mode_check_freq_map_split1[] = {
+    0, GOOD(2), 2, GOOD(3), 7, RT(1), 2, RT(2), 7, INT_MAX
+};
+
+static const int mode_check_freq_map_split2[] = {
+    0, GOOD(1), 2, GOOD(2), 4, GOOD(3), 15, RT(1), 4, RT(2), 15, INT_MAX
+};
+
+void vp8_set_speed_features(VP8_COMP *cpi)
+{
+    SPEED_FEATURES *sf = &cpi->sf;
+    int Mode = cpi->compressor_speed;
+    int Speed = cpi->Speed;
+    int i;
+    VP8_COMMON *cm = &cpi->common;
+    int last_improved_quant = sf->improved_quant;
+    int ref_frames;
+
+    /* Initialise default mode frequency sampling variables */
+    for (i = 0; i < MAX_MODES; i ++)
+    {
+        cpi->mode_check_freq[i] = 0;
+    }
+
+    cpi->mb.mbs_tested_so_far = 0;
+    cpi->mb.mbs_zero_last_dot_suppress = 0;
+
+    /* best quality defaults */
+    sf->RD = 1;
+    sf->search_method = NSTEP;
+    sf->improved_quant = 1;
+    sf->improved_dct = 1;
+    sf->auto_filter = 1;
+    sf->recode_loop = 1;
+    sf->quarter_pixel_search = 1;
+    sf->half_pixel_search = 1;
+    sf->iterative_sub_pixel = 1;
+    sf->optimize_coefficients = 1;
+    sf->use_fastquant_for_pick = 0;
+    sf->no_skip_block4x4_search = 1;
+
+    sf->first_step = 0;
+    sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+    sf->improved_mv_pred = 1;
+
+    /* default thresholds to 0 */
+    for (i = 0; i < MAX_MODES; i++)
+        sf->thresh_mult[i] = 0;
+
+    /* Count enabled references */
+    ref_frames = 1;
+    if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+        ref_frames++;
+    if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+        ref_frames++;
+    if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+        ref_frames++;
+
+    /* Convert speed to continuous range, with clamping */
+    if (Mode == 0)
+        Speed = 0;
+    else if (Mode == 2)
+        Speed = RT(Speed);
+    else
+    {
+        if (Speed > 5)
+            Speed = 5;
+        Speed = GOOD(Speed);
+    }
+
+    sf->thresh_mult[THR_ZERO1] =
+    sf->thresh_mult[THR_NEAREST1] =
+    sf->thresh_mult[THR_NEAR1] =
+    sf->thresh_mult[THR_DC] = 0; /* always */
+
+    sf->thresh_mult[THR_ZERO2] =
+    sf->thresh_mult[THR_ZERO3] =
+    sf->thresh_mult[THR_NEAREST2] =
+    sf->thresh_mult[THR_NEAREST3] =
+    sf->thresh_mult[THR_NEAR2]  =
+    sf->thresh_mult[THR_NEAR3]  = speed_map(Speed, thresh_mult_map_znn);
+
+    sf->thresh_mult[THR_V_PRED] =
+    sf->thresh_mult[THR_H_PRED] = speed_map(Speed, thresh_mult_map_vhpred);
+    sf->thresh_mult[THR_B_PRED] = speed_map(Speed, thresh_mult_map_bpred);
+    sf->thresh_mult[THR_TM]     = speed_map(Speed, thresh_mult_map_tm);
+    sf->thresh_mult[THR_NEW1]   = speed_map(Speed, thresh_mult_map_new1);
+    sf->thresh_mult[THR_NEW2]   =
+    sf->thresh_mult[THR_NEW3]   = speed_map(Speed, thresh_mult_map_new2);
+    sf->thresh_mult[THR_SPLIT1] = speed_map(Speed, thresh_mult_map_split1);
+    sf->thresh_mult[THR_SPLIT2] =
+    sf->thresh_mult[THR_SPLIT3] = speed_map(Speed, thresh_mult_map_split2);
+
+    // Special case for temporal layers.
+    // Reduce the thresholds for zero/nearest/near for GOLDEN, if GOLDEN is
+    // used as second reference. We don't modify thresholds for ALTREF case
+    // since ALTREF is usually used as long-term reference in temporal layers.
+    if ((cpi->Speed <= 6) &&
+        (cpi->oxcf.number_of_layers > 1) &&
+        (cpi->ref_frame_flags & VP8_LAST_FRAME) &&
+        (cpi->ref_frame_flags & VP8_GOLD_FRAME)) {
+      if (cpi->closest_reference_frame == GOLDEN_FRAME) {
+        sf->thresh_mult[THR_ZERO2] =  sf->thresh_mult[THR_ZERO2] >> 3;
+        sf->thresh_mult[THR_NEAREST2] = sf->thresh_mult[THR_NEAREST2] >> 3;
+        sf->thresh_mult[THR_NEAR2]  = sf->thresh_mult[THR_NEAR2] >> 3;
+      } else {
+        sf->thresh_mult[THR_ZERO2] =  sf->thresh_mult[THR_ZERO2] >> 1;
+        sf->thresh_mult[THR_NEAREST2] = sf->thresh_mult[THR_NEAREST2] >> 1;
+        sf->thresh_mult[THR_NEAR2]  = sf->thresh_mult[THR_NEAR2] >> 1;
+      }
+    }
+
+    cpi->mode_check_freq[THR_ZERO1] =
+    cpi->mode_check_freq[THR_NEAREST1] =
+    cpi->mode_check_freq[THR_NEAR1] =
+    cpi->mode_check_freq[THR_TM]     =
+    cpi->mode_check_freq[THR_DC] = 0; /* always */
+
+    cpi->mode_check_freq[THR_ZERO2] =
+    cpi->mode_check_freq[THR_ZERO3] =
+    cpi->mode_check_freq[THR_NEAREST2] =
+    cpi->mode_check_freq[THR_NEAREST3] = speed_map(Speed,
+                                                   mode_check_freq_map_zn2);
+
+    cpi->mode_check_freq[THR_NEAR2]  =
+    cpi->mode_check_freq[THR_NEAR3]  = speed_map(Speed,
+                                                 mode_check_freq_map_near2);
+
+    cpi->mode_check_freq[THR_V_PRED] =
+    cpi->mode_check_freq[THR_H_PRED] =
+    cpi->mode_check_freq[THR_B_PRED] = speed_map(Speed,
+                                                 mode_check_freq_map_vhbpred);
+    cpi->mode_check_freq[THR_NEW1]   = speed_map(Speed,
+                                                 mode_check_freq_map_new1);
+    cpi->mode_check_freq[THR_NEW2]   =
+    cpi->mode_check_freq[THR_NEW3]   = speed_map(Speed,
+                                                 mode_check_freq_map_new2);
+    cpi->mode_check_freq[THR_SPLIT1] = speed_map(Speed,
+                                                 mode_check_freq_map_split1);
+    cpi->mode_check_freq[THR_SPLIT2] =
+    cpi->mode_check_freq[THR_SPLIT3] = speed_map(Speed,
+                                                 mode_check_freq_map_split2);
+    Speed = cpi->Speed;
+    switch (Mode)
+    {
+#if !CONFIG_REALTIME_ONLY
+    case 0: /* best quality mode */
+        sf->first_step = 0;
+        sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+        break;
+    case 1:
+    case 3:
+        if (Speed > 0)
+        {
+            /* Disable coefficient optimization above speed 0 */
+            sf->optimize_coefficients = 0;
+            sf->use_fastquant_for_pick = 1;
+            sf->no_skip_block4x4_search = 0;
+
+            sf->first_step = 1;
+        }
+
+        if (Speed > 2)
+        {
+            sf->improved_quant = 0;
+            sf->improved_dct = 0;
+
+            /* Only do recode loop on key frames, golden frames and
+             * alt ref frames
+             */
+            sf->recode_loop = 2;
+
+        }
+
+        if (Speed > 3)
+        {
+            sf->auto_filter = 1;
+            sf->recode_loop = 0; /* recode loop off */
+            sf->RD = 0;         /* Turn rd off */
+
+        }
+
+        if (Speed > 4)
+        {
+            sf->auto_filter = 0;  /* Faster selection of loop filter */
+        }
+
+        break;
+#endif
+    case 2:
+        sf->optimize_coefficients = 0;
+        sf->recode_loop = 0;
+        sf->auto_filter = 1;
+        sf->iterative_sub_pixel = 1;
+        sf->search_method = NSTEP;
+
+        if (Speed > 0)
+        {
+            sf->improved_quant = 0;
+            sf->improved_dct = 0;
+
+            sf->use_fastquant_for_pick = 1;
+            sf->no_skip_block4x4_search = 0;
+            sf->first_step = 1;
+        }
+
+        if (Speed > 2)
+            sf->auto_filter = 0;  /* Faster selection of loop filter */
+
+        if (Speed > 3)
+        {
+            sf->RD = 0;
+            sf->auto_filter = 1;
+        }
+
+        if (Speed > 4)
+        {
+            sf->auto_filter = 0;  /* Faster selection of loop filter */
+            sf->search_method = HEX;
+            sf->iterative_sub_pixel = 0;
+        }
+
+        if (Speed > 6)
+        {
+            unsigned int sum = 0;
+            unsigned int total_mbs = cm->MBs;
+            int thresh;
+            unsigned int total_skip;
+
+            int min = 2000;
+
+            if (cpi->oxcf.encode_breakout > 2000)
+                min = cpi->oxcf.encode_breakout;
+
+            min >>= 7;
+
+            for (i = 0; i < min; i++)
+            {
+                sum += cpi->mb.error_bins[i];
+            }
+
+            total_skip = sum;
+            sum = 0;
+
+            /* i starts from 2 to make sure thresh started from 2048 */
+            for (; i < 1024; i++)
+            {
+                sum += cpi->mb.error_bins[i];
+
+                if (10 * sum >= (unsigned int)(cpi->Speed - 6)*(total_mbs - total_skip))
+                    break;
+            }
+
+            i--;
+            thresh = (i << 7);
+
+            if (thresh < 2000)
+                thresh = 2000;
+
+            if (ref_frames > 1)
+            {
+                sf->thresh_mult[THR_NEW1 ] = thresh;
+                sf->thresh_mult[THR_NEAREST1  ] = thresh >> 1;
+                sf->thresh_mult[THR_NEAR1     ] = thresh >> 1;
+            }
+
+            if (ref_frames > 2)
+            {
+                sf->thresh_mult[THR_NEW2] = thresh << 1;
+                sf->thresh_mult[THR_NEAREST2 ] = thresh;
+                sf->thresh_mult[THR_NEAR2    ] = thresh;
+            }
+
+            if (ref_frames > 3)
+            {
+                sf->thresh_mult[THR_NEW3] = thresh << 1;
+                sf->thresh_mult[THR_NEAREST3 ] = thresh;
+                sf->thresh_mult[THR_NEAR3    ] = thresh;
+            }
+
+            sf->improved_mv_pred = 0;
+        }
+
+        if (Speed > 8)
+            sf->quarter_pixel_search = 0;
+
+        if(cm->version == 0)
+        {
+            cm->filter_type = NORMAL_LOOPFILTER;
+
+            if (Speed >= 14)
+                cm->filter_type = SIMPLE_LOOPFILTER;
+        }
+        else
+        {
+            cm->filter_type = SIMPLE_LOOPFILTER;
+        }
+
+        /* This has a big hit on quality. Last resort */
+        if (Speed >= 15)
+            sf->half_pixel_search = 0;
+
+        memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins));
+
+    }; /* switch */
+
+    /* Slow quant, dct and trellis not worthwhile for first pass
+     * so make sure they are always turned off.
+     */
+    if ( cpi->pass == 1 )
+    {
+        sf->improved_quant = 0;
+        sf->optimize_coefficients = 0;
+        sf->improved_dct = 0;
+    }
+
+    if (cpi->sf.search_method == NSTEP)
+    {
+        vp8_init3smotion_compensation(&cpi->mb, cm->yv12_fb[cm->lst_fb_idx].y_stride);
+    }
+    else if (cpi->sf.search_method == DIAMOND)
+    {
+        vp8_init_dsmotion_compensation(&cpi->mb, cm->yv12_fb[cm->lst_fb_idx].y_stride);
+    }
+
+    if (cpi->sf.improved_dct)
+    {
+        cpi->mb.short_fdct8x4 = vp8_short_fdct8x4;
+        cpi->mb.short_fdct4x4 = vp8_short_fdct4x4;
+    }
+    else
+    {
+        /* No fast FDCT defined for any platform at this time. */
+        cpi->mb.short_fdct8x4 = vp8_short_fdct8x4;
+        cpi->mb.short_fdct4x4 = vp8_short_fdct4x4;
+    }
+
+    cpi->mb.short_walsh4x4 = vp8_short_walsh4x4;
+
+    if (cpi->sf.improved_quant)
+    {
+        cpi->mb.quantize_b      = vp8_regular_quantize_b;
+    }
+    else
+    {
+        cpi->mb.quantize_b      = vp8_fast_quantize_b;
+    }
+    if (cpi->sf.improved_quant != last_improved_quant)
+        vp8cx_init_quantizer(cpi);
+
+    if (cpi->sf.iterative_sub_pixel == 1)
+    {
+        cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step_iteratively;
+    }
+    else if (cpi->sf.quarter_pixel_search)
+    {
+        cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step;
+    }
+    else if (cpi->sf.half_pixel_search)
+    {
+        cpi->find_fractional_mv_step = vp8_find_best_half_pixel_step;
+    }
+    else
+    {
+        cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step;
+    }
+
+    if (cpi->sf.optimize_coefficients == 1 && cpi->pass!=1)
+        cpi->mb.optimize = 1;
+    else
+        cpi->mb.optimize = 0;
+
+    if (cpi->common.full_pixel)
+        cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step;
+
+#ifdef SPEEDSTATS
+    frames_at_speed[cpi->Speed]++;
+#endif
+}
+#undef GOOD
+#undef RT
+
+/* Allocate the lookahead (lag) queue and, when VP8_TEMPORAL_ALT_REF is
+ * compiled in, the alt-ref frame buffer.
+ *
+ * Allocation failures are reported through vpx_internal_error() with
+ * VPX_CODEC_MEM_ERROR.
+ */
+static void alloc_raw_frame_buffers(VP8_COMP *cpi)
+{
+    const VP8_CONFIG *const cfg = &cpi->oxcf;
+
+    cpi->lookahead = vp8_lookahead_init(cfg->Width, cfg->Height,
+                                        cfg->lag_in_frames);
+
+    if (cpi->lookahead == NULL)
+    {
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate lag buffers");
+    }
+
+#if VP8_TEMPORAL_ALT_REF
+    {
+        /* Round both dimensions up to the next multiple of 16. */
+        const int aligned_w = (cfg->Width + 15) & ~15;
+        const int aligned_h = (cfg->Height + 15) & ~15;
+
+        if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
+                                        aligned_w, aligned_h,
+                                        VP8BORDERINPIXELS) != 0)
+        {
+            vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate altref buffer");
+        }
+    }
+#endif
+}
+
+
+/* Release the buffers obtained by alloc_raw_frame_buffers(): the alt-ref
+ * frame buffer (only when VP8_TEMPORAL_ALT_REF is compiled in) and the
+ * lookahead queue.
+ */
+static void dealloc_raw_frame_buffers(VP8_COMP *cpi)
+{
+#if VP8_TEMPORAL_ALT_REF
+    vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
+#endif
+    vp8_lookahead_destroy(cpi->lookahead);
+
+    /* Do not leave a dangling pointer behind: a second call, or any code
+     * that runs before the queue is re-created, must not see freed memory.
+     */
+    cpi->lookahead = NULL;
+}
+
+
+/* (Re)allocate the zero-initialised PARTITION_INFO array for the current
+ * macroblock dimensions.
+ *
+ * Returns 0 on success and 1 when the allocation failed.
+ */
+static int vp8_alloc_partition_data(VP8_COMP *cpi)
+{
+    const VP8_COMMON *const cm = &cpi->common;
+
+    /* Drop whatever array a previous configuration left behind. */
+    vpx_free(cpi->mb.pip);
+
+    /* The array holds one entry more than the frame in each direction. */
+    cpi->mb.pip = vpx_calloc((cm->mb_cols + 1) * (cm->mb_rows + 1),
+                             sizeof(PARTITION_INFO));
+
+    if (cpi->mb.pip != NULL)
+    {
+        /* pi starts one full stride plus one entry into the array. */
+        cpi->mb.pi = cpi->mb.pip + cm->mode_info_stride + 1;
+        return 0;
+    }
+
+    return 1;
+}
+
+/* (Re)allocate the encoder buffers whose size depends on the frame
+ * dimensions currently stored in cpi->common (Width/Height, mb_rows,
+ * mb_cols): frame buffers, partition data, the loop-filter pick and scaled
+ * source frames, the token buffer, the per-macroblock maps, the last-frame
+ * MV arrays, the multithreading sync data and, when enabled, the denoiser.
+ *
+ * Every buffer is freed before being allocated again, so the function can
+ * be called repeatedly. Allocation failures are reported through
+ * vpx_internal_error() / CHECK_MEM_ERROR with VPX_CODEC_MEM_ERROR.
+ */
+void vp8_alloc_compressor_data(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = & cpi->common;
+
+    int width = cm->Width;
+    int height = cm->Height;
+#if CONFIG_MULTITHREAD
+    /* Row count in effect before the frame buffers are re-allocated; it is
+     * the number of mutexes that have to be destroyed further down.
+     * NOTE(review): presumably vp8_alloc_frame_buffers() updates
+     * cm->mb_rows - confirm.
+     */
+    int prev_mb_rows = cm->mb_rows;
+#endif
+
+    if (vp8_alloc_frame_buffers(cm, width, height))
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate frame buffers");
+
+    if (vp8_alloc_partition_data(cpi))
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate partition data");
+
+
+    /* Round both dimensions up to the next multiple of 16. */
+    if ((width & 0xf) != 0)
+        width += 16 - (width & 0xf);
+
+    if ((height & 0xf) != 0)
+        height += 16 - (height & 0xf);
+
+
+    if (vp8_yv12_alloc_frame_buffer(&cpi->pick_lf_lvl_frame,
+                                    width, height, VP8BORDERINPIXELS))
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate last frame buffer");
+
+    if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
+                                    width, height, VP8BORDERINPIXELS))
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate scaled source buffer");
+
+    vpx_free(cpi->tok);
+
+    {
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+        unsigned int tokens = 8 * 24 * 16; /* one MB for each thread */
+#else
+        unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
+#endif
+        CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+    }
+
+    /* Data used for real time vc mode to see if gf needs refreshing */
+    cpi->zeromv_count = 0;
+
+
+    /* Structures used to monitor GF usage */
+    vpx_free(cpi->gf_active_flags);
+    CHECK_MEM_ERROR(cpi->gf_active_flags,
+                    vpx_calloc(cm->mb_rows * cm->mb_cols,
+                    sizeof(*cpi->gf_active_flags)));
+    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+    vpx_free(cpi->mb_activity_map);
+    CHECK_MEM_ERROR(cpi->mb_activity_map,
+                    vpx_calloc(cm->mb_rows * cm->mb_cols,
+                    sizeof(*cpi->mb_activity_map)));
+
+    /* allocate memory for storing last frame's MVs for MV prediction. */
+    vpx_free(cpi->lfmv);
+    CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
+                    sizeof(*cpi->lfmv)));
+    vpx_free(cpi->lf_ref_frame_sign_bias);
+    CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias,
+                    vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
+                    sizeof(*cpi->lf_ref_frame_sign_bias)));
+    vpx_free(cpi->lf_ref_frame);
+    CHECK_MEM_ERROR(cpi->lf_ref_frame,
+                    vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
+                    sizeof(*cpi->lf_ref_frame)));
+
+    /* Create the encoder segmentation map and set all entries to 0 */
+    vpx_free(cpi->segmentation_map);
+    CHECK_MEM_ERROR(cpi->segmentation_map,
+                    vpx_calloc(cm->mb_rows * cm->mb_cols,
+                    sizeof(*cpi->segmentation_map)));
+    cpi->cyclic_refresh_mode_index = 0;
+
+    /* The active map starts with every entry set to 1. */
+    vpx_free(cpi->active_map);
+    CHECK_MEM_ERROR(cpi->active_map,
+                    vpx_calloc(cm->mb_rows * cm->mb_cols,
+                    sizeof(*cpi->active_map)));
+    memset(cpi->active_map , 1, (cm->mb_rows * cm->mb_cols));
+
+#if CONFIG_MULTITHREAD
+    /* Pick the sync range from the (16-aligned) frame width. */
+    if (width < 640)
+        cpi->mt_sync_range = 1;
+    else if (width <= 1280)
+        cpi->mt_sync_range = 4;
+    else if (width <= 2560)
+        cpi->mt_sync_range = 8;
+    else
+        cpi->mt_sync_range = 16;
+
+    if (cpi->oxcf.multi_threaded > 1)
+    {
+        int i;
+
+        /* De-allocate and re-allocate mutex */
+        if (cpi->pmutex != NULL) {
+            for (i = 0; i < prev_mb_rows; i++) {
+                pthread_mutex_destroy(&cpi->pmutex[i]);
+            }
+            vpx_free(cpi->pmutex);
+            cpi->pmutex = NULL;
+        }
+
+        /* One mutex per macroblock row. */
+        CHECK_MEM_ERROR(cpi->pmutex, vpx_malloc(sizeof(*cpi->pmutex) *
+                                                cm->mb_rows));
+        if (cpi->pmutex) {
+            for (i = 0; i < cm->mb_rows; i++) {
+                pthread_mutex_init(&cpi->pmutex[i], NULL);
+            }
+        }
+
+        vpx_free(cpi->mt_current_mb_col);
+        CHECK_MEM_ERROR(cpi->mt_current_mb_col,
+                    vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
+    }
+
+#endif
+
+    vpx_free(cpi->tplist);
+    CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
+
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0) {
+      vp8_denoiser_free(&cpi->denoiser);
+      /* Report a failed denoiser allocation instead of silently carrying on
+       * with a half-initialised denoiser.
+       */
+      if (vp8_denoiser_allocate(&cpi->denoiser, width, height,
+                                cm->mb_rows, cm->mb_cols,
+                                cpi->oxcf.noise_sensitivity))
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate denoiser");
+    }
+#endif
+}
+
+
+/* Quant MOD */
+/* 64-entry, strictly increasing translation table; vp8_change_config() uses
+ * it to translate the configured quantizer values (0..63 index -> 0..127).
+ */
+static const int q_trans[] =
+{
+    0,   1,  2,  3,  4,  5,  7,  8,
+    9,  10, 12, 13, 15, 17, 18, 19,
+    20,  21, 23, 24, 25, 26, 27, 28,
+    29,  30, 31, 33, 35, 37, 39, 41,
+    43,  45, 47, 49, 51, 53, 55, 57,
+    59,  61, 64, 67, 70, 73, 76, 79,
+    82,  85, 88, 91, 94, 97, 100, 103,
+    106, 109, 112, 115, 118, 121, 124, 127,
+};
+
+/* Inverse lookup into q_trans: return the index of the first entry that is
+ * greater than or equal to x, or 63 when x is larger than every entry.
+ */
+int vp8_reverse_trans(int x)
+{
+    int idx = 0;
+
+    /* Skip all entries that are still below the requested value. */
+    while (idx < 64 && q_trans[idx] < x)
+        ++idx;
+
+    return (idx < 64) ? idx : 63;
+}
+/* Install a new frame rate and recompute everything derived from it: the
+ * per-frame bandwidth figures and the maximum golden/alt-ref intervals.
+ *
+ * framerate: new rate; values below 0.1 are replaced by 30.
+ */
+void vp8_new_framerate(VP8_COMP *cpi, double framerate)
+{
+    const VP8_CONFIG *const cfg = &cpi->oxcf;
+
+    if (framerate < .1)
+    {
+        framerate = 30;
+    }
+
+    cpi->framerate        = framerate;
+    cpi->output_framerate = framerate;
+
+    /* Bandwidth budget per frame and the minimum share of it. */
+    cpi->per_frame_bandwidth =
+        (int)(cfg->target_bandwidth / cpi->output_framerate);
+    cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
+    cpi->min_frame_bandwidth =
+        (int)(cpi->av_per_frame_bandwidth *
+              cfg->two_pass_vbrmin_section / 100);
+
+    /* Maximum gf/arf interval: half the frame rate plus two, at least 12. */
+    cpi->max_gf_interval = ((int)(cpi->output_framerate / 2.0) + 2);
+
+    if (cpi->max_gf_interval < 12)
+    {
+        cpi->max_gf_interval = 12;
+    }
+
+    /* Extended interval for genuinely static scenes */
+    cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
+
+    /* With alt-ref frames in lagged compress mode neither interval may
+     * exceed lag_in_frames - 1.
+     */
+    if (cfg->play_alternate && cfg->lag_in_frames)
+    {
+        if (cpi->max_gf_interval > cfg->lag_in_frames - 1)
+        {
+            cpi->max_gf_interval = cfg->lag_in_frames - 1;
+        }
+
+        if (cpi->twopass.static_scene_max_gf_interval >
+            cfg->lag_in_frames - 1)
+        {
+            cpi->twopass.static_scene_max_gf_interval =
+                cfg->lag_in_frames - 1;
+        }
+    }
+
+    /* The regular interval never exceeds the static-scene interval. */
+    if (cpi->max_gf_interval > cpi->twopass.static_scene_max_gf_interval)
+    {
+        cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
+    }
+}
+
+
+/* One-time initialisation of a compressor instance from the caller's
+ * configuration.
+ *
+ * Stores the configuration, guesses an initial frame rate from the timebase,
+ * delegates everything shared with reconfiguration to vp8_change_config(),
+ * and then seeds the rate-control state, the temporal layer contexts and
+ * (with VP8_TEMPORAL_ALT_REF) the fixed_divide table.
+ */
+static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    cpi->oxcf = *oxcf;
+
+    cpi->auto_gold = 1;
+    cpi->auto_adjust_gold_quantizer = 1;
+
+    cm->version = oxcf->Version;
+    vp8_setup_version(cm);
+
+    /* Frame rate is not available on the first frame, as it's derived from
+     * the observed timestamps. The actual value used here doesn't matter
+     * too much, as it will adapt quickly.
+     *
+     * Use the reciprocal of the timebase as a guess when it is usable,
+     * otherwise 30.
+     */
+    cpi->framerate = (oxcf->timebase.num > 0)
+                         ? (double)(oxcf->timebase.den) /
+                           (double)(oxcf->timebase.num)
+                         : 30;
+
+    /* A guess above 180 is not considered a reasonable frame rate. */
+    if (cpi->framerate > 180)
+    {
+        cpi->framerate = 30;
+    }
+
+    cpi->ref_framerate = cpi->framerate;
+
+    cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
+
+    cm->refresh_golden_frame = 0;
+    cm->refresh_last_frame = 1;
+    cm->refresh_entropy_probs = 1;
+
+    /* change includes all joint functionality */
+    vp8_change_config(cpi, oxcf);
+
+    /* Initialize active best and worst q and average q values. */
+    cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+    cpi->active_best_quality  = cpi->oxcf.best_allowed_q;
+    cpi->avg_frame_qindex     = cpi->oxcf.worst_allowed_q;
+
+    /* Initialise the starting buffer levels */
+    cpi->buffer_level    = cpi->oxcf.starting_buffer_level;
+    cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
+
+    /* All rolling bit counters start at the average per-frame bandwidth. */
+    cpi->rolling_target_bits      = cpi->av_per_frame_bandwidth;
+    cpi->rolling_actual_bits      = cpi->av_per_frame_bandwidth;
+    cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
+    cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
+
+    cpi->total_actual_bits      = 0;
+    cpi->total_target_vs_actual = 0;
+
+    /* Temporal scalability */
+    if (cpi->oxcf.number_of_layers > 1)
+    {
+        unsigned int layer;
+        double prev_layer_framerate = 0;
+
+        for (layer = 0; layer < cpi->oxcf.number_of_layers; layer++)
+        {
+            init_temporal_layer_context(cpi, oxcf, layer,
+                                        prev_layer_framerate);
+            prev_layer_framerate = cpi->output_framerate /
+                                   cpi->oxcf.rate_decimator[layer];
+        }
+    }
+
+#if VP8_TEMPORAL_ALT_REF
+    {
+        int d;
+
+        /* fixed_divide[d] holds 0x80000 / d; entry 0 is defined as 0. */
+        for (d = 0; d < 512; d++)
+            cpi->fixed_divide[d] = d ? 0x80000 / d : 0;
+    }
+#endif
+}
+
+/* Refresh the per-layer context snapshots (frame rate, target bandwidth,
+ * buffer levels, average frame size) from the current encoder
+ * configuration. Does nothing unless more than one layer is configured.
+ */
+static void update_layer_contexts (VP8_COMP *cpi)
+{
+    VP8_CONFIG *oxcf = &cpi->oxcf;
+    unsigned int layer;
+    double prev_layer_framerate = 0;
+
+    if (oxcf->number_of_layers <= 1)
+        return;
+
+    assert(oxcf->number_of_layers <= VPX_TS_MAX_LAYERS);
+
+    for (layer = 0;
+         layer < oxcf->number_of_layers && layer < VPX_TS_MAX_LAYERS;
+         ++layer)
+    {
+        LAYER_CONTEXT *lc = &cpi->layer_context[layer];
+
+        lc->framerate = cpi->ref_framerate / oxcf->rate_decimator[layer];
+        lc->target_bandwidth = oxcf->target_bitrate[layer] * 1000;
+
+        lc->starting_buffer_level =
+            rescale((int)oxcf->starting_buffer_level_in_ms,
+                    lc->target_bandwidth, 1000);
+
+        /* A zero optimal level selects the default of bandwidth / 8. */
+        if (oxcf->optimal_buffer_level != 0)
+            lc->optimal_buffer_level =
+                rescale((int)oxcf->optimal_buffer_level_in_ms,
+                        lc->target_bandwidth, 1000);
+        else
+            lc->optimal_buffer_level = lc->target_bandwidth / 8;
+
+        /* Same default rule for the maximum buffer size. */
+        if (oxcf->maximum_buffer_size != 0)
+            lc->maximum_buffer_size =
+                rescale((int)oxcf->maximum_buffer_size_in_ms,
+                        lc->target_bandwidth, 1000);
+        else
+            lc->maximum_buffer_size = lc->target_bandwidth / 8;
+
+        /* Work out the average size of a frame within this layer: the
+         * extra bitrate over the previous layer divided by the extra
+         * frame rate. Layer 0 is left untouched.
+         */
+        if (layer > 0)
+        {
+            lc->avg_frame_size_for_layer =
+               (int)((oxcf->target_bitrate[layer] -
+                      oxcf->target_bitrate[layer-1]) * 1000 /
+                      (lc->framerate - prev_layer_framerate));
+        }
+
+        prev_layer_framerate = lc->framerate;
+    }
+}
+
+/* Apply a (new) configuration to an existing compressor instance.
+ *
+ * Copies *oxcf into cpi->oxcf, derives pass / compressor speed from the
+ * mode, translates the quantizer settings through q_trans, converts the
+ * bandwidth and buffer settings, refreshes the rate-control limits, handles
+ * a change in the number of temporal layers, and re-allocates the frame
+ * dependent buffers when the coded size no longer matches the allocated
+ * frame buffers.
+ *
+ * cpi:  compressor instance; the call is a no-op when NULL.
+ * oxcf: configuration to apply; the call is a no-op when NULL.
+ */
+void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
+{
+    VP8_COMMON *cm;
+    int last_w, last_h, prev_number_of_layers;
+
+    if (!cpi)
+        return;
+
+    if (!oxcf)
+        return;
+
+    /* Only form &cpi->common once cpi is known to be non-NULL. */
+    cm = &cpi->common;
+
+#if CONFIG_MULTITHREAD
+    /*  wait for the last picture loopfilter thread done */
+    if (cpi->b_lpf_running)
+    {
+        sem_wait(&cpi->h_event_end_lpf);
+        cpi->b_lpf_running = 0;
+    }
+#endif
+
+    if (cm->version != oxcf->Version)
+    {
+        cm->version = oxcf->Version;
+        vp8_setup_version(cm);
+    }
+
+    /* Remember the values that are compared against the new configuration
+     * further down, before cpi->oxcf is overwritten.
+     */
+    last_w = cpi->oxcf.Width;
+    last_h = cpi->oxcf.Height;
+    prev_number_of_layers = cpi->oxcf.number_of_layers;
+
+    cpi->oxcf = *oxcf;
+
+    /* Derive pass number and compressor speed from the mode and clamp
+     * cpu_used to the range allowed for that mode.
+     */
+    switch (cpi->oxcf.Mode)
+    {
+
+    case MODE_REALTIME:
+        cpi->pass = 0;
+        cpi->compressor_speed = 2;
+
+        if (cpi->oxcf.cpu_used < -16)
+        {
+            cpi->oxcf.cpu_used = -16;
+        }
+
+        if (cpi->oxcf.cpu_used > 16)
+            cpi->oxcf.cpu_used = 16;
+
+        break;
+
+    case MODE_GOODQUALITY:
+        cpi->pass = 0;
+        cpi->compressor_speed = 1;
+
+        if (cpi->oxcf.cpu_used < -5)
+        {
+            cpi->oxcf.cpu_used = -5;
+        }
+
+        if (cpi->oxcf.cpu_used > 5)
+            cpi->oxcf.cpu_used = 5;
+
+        break;
+
+    case MODE_BESTQUALITY:
+        cpi->pass = 0;
+        cpi->compressor_speed = 0;
+        break;
+
+    case MODE_FIRSTPASS:
+        cpi->pass = 1;
+        cpi->compressor_speed = 1;
+        break;
+    case MODE_SECONDPASS:
+        cpi->pass = 2;
+        cpi->compressor_speed = 1;
+
+        if (cpi->oxcf.cpu_used < -5)
+        {
+            cpi->oxcf.cpu_used = -5;
+        }
+
+        if (cpi->oxcf.cpu_used > 5)
+            cpi->oxcf.cpu_used = 5;
+
+        break;
+    case MODE_SECONDPASS_BEST:
+        cpi->pass = 2;
+        cpi->compressor_speed = 0;
+        break;
+    }
+
+    if (cpi->pass == 0)
+        cpi->auto_worst_q = 1;
+
+    /* Translate the configured quantizer values through q_trans. */
+    cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
+    cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
+    cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
+
+    if (oxcf->fixed_q >= 0)
+    {
+        /* NOTE(review): fixed_q is derived from worst_allowed_q here, not
+         * from oxcf->fixed_q - confirm this is intended.
+         */
+        if (oxcf->worst_allowed_q < 0)
+            cpi->oxcf.fixed_q = q_trans[0];
+        else
+            cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q];
+
+        if (oxcf->alt_q < 0)
+            cpi->oxcf.alt_q = q_trans[0];
+        else
+            cpi->oxcf.alt_q = q_trans[oxcf->alt_q];
+
+        if (oxcf->key_q < 0)
+            cpi->oxcf.key_q = q_trans[0];
+        else
+            cpi->oxcf.key_q = q_trans[oxcf->key_q];
+
+        if (oxcf->gold_q < 0)
+            cpi->oxcf.gold_q = q_trans[0];
+        else
+            cpi->oxcf.gold_q = q_trans[oxcf->gold_q];
+
+    }
+
+    cpi->baseline_gf_interval =
+        cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+    cpi->oxcf.token_partitions = 3;
+#endif
+
+    /* Values outside 0..3 leave the current partition setting untouched. */
+    if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
+        cm->multi_token_partition =
+            (TOKEN_PARTITION) cpi->oxcf.token_partitions;
+
+    setup_features(cpi);
+
+    {
+        int i;
+
+        for (i = 0; i < MAX_MB_SEGMENTS; i++)
+            cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
+    }
+
+    /* At the moment the first order values may not be > MAXQ */
+    if (cpi->oxcf.fixed_q > MAXQ)
+        cpi->oxcf.fixed_q = MAXQ;
+
+    /* local file playback mode == really big buffer */
+    if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
+    {
+        cpi->oxcf.starting_buffer_level       = 60000;
+        cpi->oxcf.optimal_buffer_level        = 60000;
+        cpi->oxcf.maximum_buffer_size         = 240000;
+        cpi->oxcf.starting_buffer_level_in_ms = 60000;
+        cpi->oxcf.optimal_buffer_level_in_ms  = 60000;
+        cpi->oxcf.maximum_buffer_size_in_ms   = 240000;
+    }
+
+    /* Convert target bandwidth from Kbit/s to Bit/s */
+    cpi->oxcf.target_bandwidth       *= 1000;
+
+    cpi->oxcf.starting_buffer_level =
+        rescale((int)cpi->oxcf.starting_buffer_level,
+                cpi->oxcf.target_bandwidth, 1000);
+
+    /* Set or reset optimal and maximum buffer levels. */
+    if (cpi->oxcf.optimal_buffer_level == 0)
+        cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+    else
+        cpi->oxcf.optimal_buffer_level =
+            rescale((int)cpi->oxcf.optimal_buffer_level,
+                    cpi->oxcf.target_bandwidth, 1000);
+
+    if (cpi->oxcf.maximum_buffer_size == 0)
+        cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+    else
+        cpi->oxcf.maximum_buffer_size =
+            rescale((int)cpi->oxcf.maximum_buffer_size,
+                    cpi->oxcf.target_bandwidth, 1000);
+    // Under a configuration change, where maximum_buffer_size may change,
+    // keep buffer level clipped to the maximum allowed buffer size.
+    if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) {
+      cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+      cpi->buffer_level = cpi->bits_off_target;
+    }
+
+    /* Set up frame rate and related parameters rate control values. */
+    vp8_new_framerate(cpi, cpi->framerate);
+
+    /* Set absolute upper and lower quality limits */
+    cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
+    cpi->best_quality                = cpi->oxcf.best_allowed_q;
+
+    /* active values should only be modified if out of new range */
+    if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q)
+    {
+      cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+    }
+    /* less likely */
+    else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q)
+    {
+      cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
+    }
+    if (cpi->active_best_quality < cpi->oxcf.best_allowed_q)
+    {
+      cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+    }
+    /* less likely */
+    else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q)
+    {
+      cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
+    }
+
+    cpi->buffered_mode = cpi->oxcf.optimal_buffer_level > 0;
+
+    cpi->cq_target_quality = cpi->oxcf.cq_level;
+
+    /* Only allow dropped frames in buffered mode */
+    cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
+
+    cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
+
+    // Check if the number of temporal layers has changed, and if so reset the
+    // pattern counter and set/initialize the temporal layer context for the
+    // new layer configuration.
+    if (cpi->oxcf.number_of_layers != prev_number_of_layers)
+    {
+        // If the number of temporal layers are changed we must start at the
+        // base of the pattern cycle, so set the layer id to 0 and reset
+        // the temporal pattern counter.
+        if (cpi->temporal_layer_id > 0) {
+          cpi->temporal_layer_id = 0;
+        }
+        cpi->temporal_pattern_counter = 0;
+        reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
+    }
+
+    /* The first configured size is remembered as the initial size. */
+    if (!cpi->initial_width)
+    {
+        cpi->initial_width = cpi->oxcf.Width;
+        cpi->initial_height = cpi->oxcf.Height;
+    }
+
+    cm->Width       = cpi->oxcf.Width;
+    cm->Height      = cpi->oxcf.Height;
+    assert(cm->Width <= cpi->initial_width);
+    assert(cm->Height <= cpi->initial_height);
+
+    /* TODO(jkoleszar): if an internal spatial resampling is active,
+     * and we downsize the input image, maybe we should clear the
+     * internal scale immediately rather than waiting for it to
+     * correct.
+     */
+
+    /* VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) */
+    if (cpi->oxcf.Sharpness > 7)
+        cpi->oxcf.Sharpness = 7;
+
+    cm->sharpness_level = cpi->oxcf.Sharpness;
+
+    /* With internal scaling active the coded size is the configured size
+     * multiplied by the scale ratio.
+     */
+    if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL)
+    {
+        int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+        int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+
+        Scale2Ratio(cm->horiz_scale, &hr, &hs);
+        Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+        /* always go to the next whole number */
+        cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
+        cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
+    }
+
+    if (last_w != cpi->oxcf.Width || last_h != cpi->oxcf.Height)
+        cpi->force_next_frame_intra = 1;
+
+    /* Re-allocate when the 16-aligned coded size differs from the allocated
+     * last-frame buffer, or when nothing has been allocated yet.
+     */
+    if (((cm->Width + 15) & 0xfffffff0) !=
+          cm->yv12_fb[cm->lst_fb_idx].y_width ||
+        ((cm->Height + 15) & 0xfffffff0) !=
+          cm->yv12_fb[cm->lst_fb_idx].y_height ||
+        cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
+    {
+        dealloc_raw_frame_buffers(cpi);
+        alloc_raw_frame_buffers(cpi);
+        vp8_alloc_compressor_data(cpi);
+    }
+
+    if (cpi->oxcf.fixed_q >= 0)
+    {
+        cpi->last_q[0] = cpi->oxcf.fixed_q;
+        cpi->last_q[1] = cpi->oxcf.fixed_q;
+    }
+
+    cpi->Speed = cpi->oxcf.cpu_used;
+
+    /* force to allowlag to 0 if lag_in_frames is 0; */
+    if (cpi->oxcf.lag_in_frames == 0)
+    {
+        cpi->oxcf.allow_lag = 0;
+    }
+    /* Limit on lag buffers as these are not currently dynamically allocated */
+    else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
+        cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
+
+    /* YX Temp */
+    cpi->alt_ref_source = NULL;
+    cpi->is_src_frame_alt_ref = 0;
+
+#if CONFIG_TEMPORAL_DENOISING
+    /* Allocate the denoiser on demand when noise sensitivity is switched on
+     * and no denoiser buffers exist yet.
+     */
+    if (cpi->oxcf.noise_sensitivity)
+    {
+      if (!cpi->denoiser.yv12_mc_running_avg.buffer_alloc)
+      {
+        int width = (cpi->oxcf.Width + 15) & ~15;
+        int height = (cpi->oxcf.Height + 15) & ~15;
+        /* Report a failed allocation instead of ignoring it. */
+        if (vp8_denoiser_allocate(&cpi->denoiser, width, height,
+                                  cm->mb_rows, cm->mb_cols,
+                                  cpi->oxcf.noise_sensitivity))
+          vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                             "Failed to allocate denoiser");
+      }
+    }
+#endif
+
+#if 0
+    /* Experimental RD Code */
+    cpi->frame_distortion = 0;
+    cpi->last_frame_distortion = 0;
+#endif
+
+}
+
+/* Despite its name, M_LOG2_E holds the natural logarithm of 2 (0.6931...);
+ * dividing a natural logarithm by it yields a base-2 logarithm.
+ */
+#ifndef M_LOG2_E
+#define M_LOG2_E 0.693147180559945309417
+#endif
+/* Base-2 logarithm built on log().
+ * NOTE(review): this macro takes over the name of the C99 log2f() function
+ * for the rest of the translation unit - confirm that is intended.
+ */
+#define log2f(x) (log (x) / (float) M_LOG2_E)
+
+/* Fill both SAD motion-vector cost tables.
+ *
+ * Index 0 gets the fixed cost 300; every magnitude m in 1..mvfp_max gets
+ * 256 * 2 * (log2(8 * m) + 0.6), truncated to int, stored at both +m and
+ * -m. The table pointers must therefore allow negative indexing.
+ */
+static void cal_mvsadcosts(int *mvsadcost[2])
+{
+    int mag = 1;
+
+    mvsadcost[0][0] = 300;
+    mvsadcost[1][0] = 300;
+
+    do
+    {
+        const double z = 256 * (2 * (log2f(8 * mag) + .6));
+        const int cost = (int) z;
+        int table;
+
+        /* Same cost for both tables and both signs of the magnitude. */
+        for (table = 0; table < 2; ++table)
+        {
+            mvsadcost[table][mag]  = cost;
+            mvsadcost[table][-mag] = cost;
+        }
+    }
+    while (++mag <= mvfp_max);
+}
+
+struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
+{
+    int i;
+    /* Builds a fully initialised encoder instance from oxcf; returns it, or 0 on failure. */
+    VP8_COMP *cpi;
+    VP8_COMMON *cm;
+
+    cpi = vpx_memalign(32, sizeof(VP8_COMP));
+    /* Check that the CPI instance is valid */
+    if (!cpi)
+        return 0;
+
+    cm = &cpi->common;
+
+    memset(cpi, 0, sizeof(VP8_COMP));
+    /* Error landing point: a longjmp to cm->error.jmp tears down the partly built instance and returns 0 */
+    if (setjmp(cm->error.jmp))
+    {
+        cpi->common.error.setjmp = 0;
+        vp8_remove_compressor(&cpi);
+        return 0;
+    }
+    /* presumably marks the jump buffer as armed for the error handler - TODO confirm */
+    cpi->common.error.setjmp = 1;
+
+    CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
+
+    vp8_create_common(&cpi->common);
+
+    init_config(cpi, oxcf);
+
+    memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob));
+    cpi->common.current_video_frame   = 0;
+    cpi->temporal_pattern_counter     = 0;
+    cpi->temporal_layer_id            = -1;
+    cpi->kf_overspend_bits            = 0;
+    cpi->kf_bitrate_adjustment        = 0;
+    cpi->frames_till_gf_update_due      = 0; /* NOTE: assigned 0 a second time further down */
+    cpi->gf_overspend_bits            = 0;
+    cpi->non_gf_bitrate_adjustment     = 0;
+    cpi->prob_last_coded              = 128;
+    cpi->prob_gf_coded                = 128;
+    cpi->prob_intra_coded             = 63;
+
+    /* Prime the recent reference frame usage counters.
+     * Hereafter they will be maintained as a sort of moving average
+     */
+    cpi->recent_ref_frame_usage[INTRA_FRAME]  = 1;
+    cpi->recent_ref_frame_usage[LAST_FRAME]   = 1;
+    cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+    cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+
+    /* Set reference frame sign bias for ALTREF frame to 1 (for now) */
+    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+
+    cpi->twopass.gf_decay_rate = 0;
+    cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+
+    cpi->gold_is_last = 0 ;
+    cpi->alt_is_last  = 0 ;
+    cpi->gold_is_alt  = 0 ;
+
+    cpi->active_map_enabled = 0;
+
+#if 0
+    /* Experimental code for lagged and one pass */
+    /* Initialise one_pass GF frames stats */
+    /* Update stats used for GF selection */
+    if (cpi->pass == 0)
+    {
+        cpi->one_pass_frame_index = 0;
+
+        for (i = 0; i < MAX_LAG_BUFFERS; i++)
+        {
+            cpi->one_pass_frame_stats[i].frames_so_far = 0;
+            cpi->one_pass_frame_stats[i].frame_intra_error = 0.0;
+            cpi->one_pass_frame_stats[i].frame_coded_error = 0.0;
+            cpi->one_pass_frame_stats[i].frame_pcnt_inter = 0.0;
+            cpi->one_pass_frame_stats[i].frame_pcnt_motion = 0.0;
+            cpi->one_pass_frame_stats[i].frame_mvr = 0.0;
+            cpi->one_pass_frame_stats[i].frame_mvr_abs = 0.0;
+            cpi->one_pass_frame_stats[i].frame_mvc = 0.0;
+            cpi->one_pass_frame_stats[i].frame_mvc_abs = 0.0;
+        }
+    }
+#endif
+
+    cpi->mse_source_denoised = 0;
+
+    /* Cyclic refresh is currently tied to error resilient mode.  The
+     * per-frame MB limit is 1/7 of the frame's MBs, overridden to 1/20 for
+     * one layer and to 1/10 for two layers. */
+    cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode;
+    cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 7;
+    if (cpi->oxcf.number_of_layers == 1) {
+        cpi->cyclic_refresh_mode_max_mbs_perframe =
+            (cpi->common.mb_rows * cpi->common.mb_cols) / 20;
+    } else if (cpi->oxcf.number_of_layers == 2) {
+        cpi->cyclic_refresh_mode_max_mbs_perframe =
+            (cpi->common.mb_rows * cpi->common.mb_cols) / 10;
+    }
+    cpi->cyclic_refresh_mode_index = 0;
+    cpi->cyclic_refresh_q = 32;
+    /* The refresh map (one byte per macroblock) is only allocated when the mode is enabled */
+    if (cpi->cyclic_refresh_mode_enabled)
+    {
+        CHECK_MEM_ERROR(cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+    }
+    else
+        cpi->cyclic_refresh_map = (signed char *) NULL;
+    /* Two more maps with one byte per macroblock, always allocated */
+    CHECK_MEM_ERROR(cpi->consec_zero_last,
+                    vpx_calloc(cm->mb_rows * cm->mb_cols, 1));
+    CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias,
+                    vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+
+#ifdef VP8_ENTROPY_STATS
+    init_context_counters();
+#endif
+
+    /*Initialize the feed-forward activity masking.*/
+    cpi->activity_avg = 90<<12;
+
+    /* Give a sensible default for the first frame. */
+    cpi->frames_since_key = 8;
+    cpi->key_frame_frequency = cpi->oxcf.key_freq;
+    cpi->this_key_frame_forced = 0;
+    cpi->next_key_frame_forced = 0;
+
+    cpi->source_alt_ref_pending = 0;
+    cpi->source_alt_ref_active = 0;
+    cpi->common.refresh_alt_ref_frame = 0;
+
+    cpi->force_maxqp = 0;
+    /* PSNR accumulation is switched on exactly when internal stats are compiled in */
+    cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+    cpi->b_calculate_ssimg = 0;
+
+    cpi->count = 0;
+    cpi->bytes = 0;
+
+    if (cpi->b_calculate_psnr)
+    {
+        cpi->total_sq_error = 0.0;
+        cpi->total_sq_error2 = 0.0;
+        cpi->total_y = 0.0;
+        cpi->total_u = 0.0;
+        cpi->total_v = 0.0;
+        cpi->total = 0.0;
+        cpi->totalp_y = 0.0;
+        cpi->totalp_u = 0.0;
+        cpi->totalp_v = 0.0;
+        cpi->totalp = 0.0;
+        cpi->tot_recode_hits = 0;
+        cpi->summed_quality = 0;
+        cpi->summed_weights = 0;
+    }
+
+    if (cpi->b_calculate_ssimg)
+    {
+        cpi->total_ssimg_y = 0;
+        cpi->total_ssimg_u = 0;
+        cpi->total_ssimg_v = 0;
+        cpi->total_ssimg_all = 0;
+    }
+
+#endif
+
+    cpi->first_time_stamp_ever = 0x7FFFFFFF;
+
+    cpi->frames_till_gf_update_due      = 0; /* NOTE: repeats the assignment made near the top */
+    cpi->key_frame_count              = 1;
+
+    cpi->ni_av_qi                     = cpi->oxcf.worst_allowed_q;
+    cpi->ni_tot_qi                    = 0;
+    cpi->ni_frames                   = 0;
+    cpi->total_byte_count             = 0;
+
+    cpi->drop_frame                  = 0;
+
+    cpi->rate_correction_factor         = 1.0;
+    cpi->key_frame_rate_correction_factor = 1.0;
+    cpi->gf_rate_correction_factor  = 1.0;
+    cpi->twopass.est_max_qcorrection_factor  = 1.0;
+    /* Seed every prior key frame distance with the (truncated) output frame rate */
+    for (i = 0; i < KEY_FRAME_CONTEXT; i++)
+    {
+        cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate;
+    }
+
+#ifdef OUTPUT_YUV_SRC
+    yuv_file = fopen("bd.yuv", "ab"); /* NOTE(review): result not checked */
+#endif
+#ifdef OUTPUT_YUV_DENOISED
+    yuv_denoised_file = fopen("denoised.yuv", "ab"); /* NOTE(review): result not checked */
+#endif
+
+#if 0
+    framepsnr = fopen("framepsnr.stt", "a");
+    kf_list = fopen("kf_list.stt", "w");
+#endif
+
+    cpi->output_pkt_list = oxcf->output_pkt_list;
+
+#if !CONFIG_REALTIME_ONLY
+    /* Two-pass setup; pass 2 takes its stats from the buffer in oxcf->two_pass_stats_in */
+    if (cpi->pass == 1)
+    {
+        vp8_init_first_pass(cpi);
+    }
+    else if (cpi->pass == 2)
+    {
+        size_t packet_sz = sizeof(FIRSTPASS_STATS);
+        int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+        /* stats_in_end addresses the last whole packet, not one past it */
+        cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+        cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+        cpi->twopass.stats_in_end = (void*)((char *)cpi->twopass.stats_in
+                            + (packets - 1) * packet_sz);
+        vp8_init_second_pass(cpi);
+    }
+
+#endif
+
+    if (cpi->compressor_speed == 2)
+    {
+        cpi->avg_encode_time      = 0;
+        cpi->avg_pick_mode_time    = 0;
+    }
+
+    vp8_set_speed_features(cpi);
+
+    /* Set starting values of RD threshold multipliers (128 = *1) */
+    for (i = 0; i < MAX_MODES; i++)
+    {
+        cpi->mb.rd_thresh_mult[i] = 128;
+    }
+
+#ifdef VP8_ENTROPY_STATS
+    init_mv_ref_counts();
+#endif
+
+#if CONFIG_MULTITHREAD
+    if(vp8cx_create_encoder_threads(cpi))
+    {
+        vp8_remove_compressor(&cpi); /* frees the instance and zeroes cpi */
+        return 0;
+    }
+#endif
+    /* Function tables for each block size: SAD, variance and sub-pixel variance entry points */
+    cpi->fn_ptr[BLOCK_16X16].sdf            = vpx_sad16x16;
+    cpi->fn_ptr[BLOCK_16X16].vf             = vpx_variance16x16;
+    cpi->fn_ptr[BLOCK_16X16].svf            = vpx_sub_pixel_variance16x16;
+    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h  = vpx_variance_halfpixvar16x16_h;
+    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = vpx_variance_halfpixvar16x16_v;
+    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vpx_variance_halfpixvar16x16_hv;
+    cpi->fn_ptr[BLOCK_16X16].sdx3f          = vpx_sad16x16x3;
+    cpi->fn_ptr[BLOCK_16X16].sdx8f          = vpx_sad16x16x8;
+    cpi->fn_ptr[BLOCK_16X16].sdx4df         = vpx_sad16x16x4d;
+    /* Only 16x16 gets svf_halfpix_* functions; the other block sizes leave them NULL */
+    cpi->fn_ptr[BLOCK_16X8].sdf            = vpx_sad16x8;
+    cpi->fn_ptr[BLOCK_16X8].vf             = vpx_variance16x8;
+    cpi->fn_ptr[BLOCK_16X8].svf            = vpx_sub_pixel_variance16x8;
+    cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h  = NULL;
+    cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v  = NULL;
+    cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
+    cpi->fn_ptr[BLOCK_16X8].sdx3f          = vpx_sad16x8x3;
+    cpi->fn_ptr[BLOCK_16X8].sdx8f          = vpx_sad16x8x8;
+    cpi->fn_ptr[BLOCK_16X8].sdx4df         = vpx_sad16x8x4d;
+
+    cpi->fn_ptr[BLOCK_8X16].sdf            = vpx_sad8x16;
+    cpi->fn_ptr[BLOCK_8X16].vf             = vpx_variance8x16;
+    cpi->fn_ptr[BLOCK_8X16].svf            = vpx_sub_pixel_variance8x16;
+    cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h  = NULL;
+    cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v  = NULL;
+    cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
+    cpi->fn_ptr[BLOCK_8X16].sdx3f          = vpx_sad8x16x3;
+    cpi->fn_ptr[BLOCK_8X16].sdx8f          = vpx_sad8x16x8;
+    cpi->fn_ptr[BLOCK_8X16].sdx4df         = vpx_sad8x16x4d;
+
+    cpi->fn_ptr[BLOCK_8X8].sdf            = vpx_sad8x8;
+    cpi->fn_ptr[BLOCK_8X8].vf             = vpx_variance8x8;
+    cpi->fn_ptr[BLOCK_8X8].svf            = vpx_sub_pixel_variance8x8;
+    cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h  = NULL;
+    cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v  = NULL;
+    cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
+    cpi->fn_ptr[BLOCK_8X8].sdx3f          = vpx_sad8x8x3;
+    cpi->fn_ptr[BLOCK_8X8].sdx8f          = vpx_sad8x8x8;
+    cpi->fn_ptr[BLOCK_8X8].sdx4df         = vpx_sad8x8x4d;
+
+    cpi->fn_ptr[BLOCK_4X4].sdf            = vpx_sad4x4;
+    cpi->fn_ptr[BLOCK_4X4].vf             = vpx_variance4x4;
+    cpi->fn_ptr[BLOCK_4X4].svf            = vpx_sub_pixel_variance4x4;
+    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h  = NULL;
+    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v  = NULL;
+    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
+    cpi->fn_ptr[BLOCK_4X4].sdx3f          = vpx_sad4x4x3;
+    cpi->fn_ptr[BLOCK_4X4].sdx8f          = vpx_sad4x4x8;
+    cpi->fn_ptr[BLOCK_4X4].sdx4df         = vpx_sad4x4x4d;
+
+#if ARCH_X86 || ARCH_X86_64
+    cpi->fn_ptr[BLOCK_16X16].copymem      = vp8_copy32xn;
+    cpi->fn_ptr[BLOCK_16X8].copymem       = vp8_copy32xn;
+    cpi->fn_ptr[BLOCK_8X16].copymem       = vp8_copy32xn;
+    cpi->fn_ptr[BLOCK_8X8].copymem        = vp8_copy32xn;
+    cpi->fn_ptr[BLOCK_4X4].copymem        = vp8_copy32xn;
+#endif
+
+    cpi->full_search_sad = vp8_full_search_sad;
+    cpi->diamond_search_sad = vp8_diamond_search_sad;
+    cpi->refining_search_sad = vp8_refining_search_sad;
+
+    /* make sure frame 1 is okay */
+    cpi->mb.error_bins[0] = cpi->common.MBs;
+
+    /* vp8cx_init_quantizer() is first called here. Add check in
+     * vp8cx_frame_init_quantizer() so that vp8cx_init_quantizer is only
+     * called later when needed. This will avoid unnecessary calls of
+     * vp8cx_init_quantizer() for every frame.
+     */
+    vp8cx_init_quantizer(cpi);
+
+    vp8_loop_filter_init(cm);
+    /* presumably disarms the error jump now that the failing calls are behind us - TODO confirm */
+    cpi->common.error.setjmp = 0;
+
+#if CONFIG_MULTI_RES_ENCODING
+
+    /* Calculate # of MBs in a row in lower-resolution level image. */
+    if (cpi->oxcf.mr_encoder_id > 0)
+        vp8_cal_low_res_mb_cols(cpi);
+
+#endif
+
+    /* setup RD costs to MACROBLOCK struct */
+    /* Each cost pointer is offset into its table so that negative indices are usable (cal_mvsadcosts writes [-i]) */
+    cpi->mb.mvcost[0] = &cpi->rd_costs.mvcosts[0][mv_max+1];
+    cpi->mb.mvcost[1] = &cpi->rd_costs.mvcosts[1][mv_max+1];
+    cpi->mb.mvsadcost[0] = &cpi->rd_costs.mvsadcosts[0][mvfp_max+1];
+    cpi->mb.mvsadcost[1] = &cpi->rd_costs.mvsadcosts[1][mvfp_max+1];
+
+    cal_mvsadcosts(cpi->mb.mvsadcost);
+
+    cpi->mb.mbmode_cost = cpi->rd_costs.mbmode_cost;
+    cpi->mb.intra_uv_mode_cost = cpi->rd_costs.intra_uv_mode_cost;
+    cpi->mb.bmode_costs = cpi->rd_costs.bmode_costs;
+    cpi->mb.inter_bmode_costs = cpi->rd_costs.inter_bmode_costs;
+    cpi->mb.token_costs = cpi->rd_costs.token_costs;
+
+    /* setup block ptrs & offsets */
+    vp8_setup_block_ptrs(&cpi->mb);
+    vp8_setup_block_dptrs(&cpi->mb.e_mbd);
+
+    return  cpi;
+}
+
+
+void vp8_remove_compressor(VP8_COMP **ptr)
+{
+    VP8_COMP *cpi = *ptr;
+    /* Frees the encoder *ptr and the buffers it owns, then sets *ptr to 0; a null *ptr is ignored. */
+    if (!cpi)
+        return;
+    /* Statistics are only dumped once at least one frame has been counted (cpi is already known to be non-null) */
+    if (cpi && (cpi->common.current_video_frame > 0))
+    {
+#if !CONFIG_REALTIME_ONLY
+
+        if (cpi->pass == 2)
+        {
+            vp8_end_second_pass(cpi);
+        }
+
+#endif
+
+#ifdef VP8_ENTROPY_STATS
+        print_context_counters();
+        print_tree_update_probs();
+        print_mode_context();
+#endif
+
+#if CONFIG_INTERNAL_STATS
+        /* Append quality and bitrate summary lines to opsnr.stt for every pass except the first */
+        if (cpi->pass != 1)
+        {
+            FILE *f = fopen("opsnr.stt", "a"); /* NOTE(review): f is used below without a NULL check */
+            double time_encoded = (cpi->last_end_time_stamp_seen
+                                   - cpi->first_time_stamp_ever) / 10000000.000;
+            double total_encode_time = (cpi->time_receive_data +
+                                            cpi->time_compress_data) / 1000.000;
+            double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; /* NOTE(review): no guard for time_encoded == 0 */
+
+            if (cpi->b_calculate_psnr)
+            {
+                if (cpi->oxcf.number_of_layers > 1)
+                {
+                    int i;
+
+                    fprintf(f, "Layer\tBitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t"
+                               "GLPsnrP\tVPXSSIM\t\n");
+                    for (i=0; i<(int)cpi->oxcf.number_of_layers; i++)
+                    {
+                        double dr = (double)cpi->bytes_in_layer[i] * /* shadows the outer dr: per-layer rate */
+                                              8.0 / 1000.0  / time_encoded;
+                        double samples = 3.0 / 2 * cpi->frames_in_layer[i] *
+                                         cpi->common.Width * cpi->common.Height;
+                        double total_psnr =
+                            vpx_sse_to_psnr(samples, 255.0,
+                                            cpi->total_error2[i]);
+                        double total_psnr2 =
+                            vpx_sse_to_psnr(samples, 255.0,
+                                            cpi->total_error2_p[i]);
+                        double total_ssim = 100 * pow(cpi->sum_ssim[i] /
+                                                      cpi->sum_weights[i], 8.0);
+
+                        fprintf(f, "%5d\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                                   "%7.3f\t%7.3f\n",
+                                   i, dr,
+                                   cpi->sum_psnr[i] / cpi->frames_in_layer[i],
+                                   total_psnr,
+                                   cpi->sum_psnr_p[i] / cpi->frames_in_layer[i],
+                                   total_psnr2, total_ssim);
+                    }
+                }
+                else
+                {
+                    double samples = 3.0 / 2 * cpi->count *
+                                     cpi->common.Width * cpi->common.Height;
+                    double total_psnr = vpx_sse_to_psnr(samples, 255.0,
+                                                        cpi->total_sq_error);
+                    double total_psnr2 = vpx_sse_to_psnr(samples, 255.0,
+                                                         cpi->total_sq_error2);
+                    double total_ssim = 100 * pow(cpi->summed_quality /
+                                                      cpi->summed_weights, 8.0);
+
+                    fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t"
+                               "GLPsnrP\tVPXSSIM\t  Time(us)\n");
+                    fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                               "%7.3f\t%8.0f\n",
+                               dr, cpi->total / cpi->count, total_psnr,
+                               cpi->totalp / cpi->count, total_psnr2,
+                               total_ssim, total_encode_time);
+                }
+            }
+
+            if (cpi->b_calculate_ssimg)
+            {
+                if (cpi->oxcf.number_of_layers > 1)
+                {
+                    int i;
+
+                    fprintf(f, "Layer\tBitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t"
+                               "Time(us)\n");
+                    for (i=0; i<(int)cpi->oxcf.number_of_layers; i++)
+                    {
+                        double dr = (double)cpi->bytes_in_layer[i] * /* shadows the outer dr: per-layer rate */
+                                    8.0 / 1000.0  / time_encoded;
+                        fprintf(f, "%5d\t%7.3f\t%6.4f\t"
+                                "%6.4f\t%6.4f\t%6.4f\t%8.0f\n",
+                                i, dr,
+                                cpi->total_ssimg_y_in_layer[i] /
+                                     cpi->frames_in_layer[i],
+                                cpi->total_ssimg_u_in_layer[i] /
+                                     cpi->frames_in_layer[i],
+                                cpi->total_ssimg_v_in_layer[i] /
+                                     cpi->frames_in_layer[i],
+                                cpi->total_ssimg_all_in_layer[i] /
+                                     cpi->frames_in_layer[i],
+                                total_encode_time);
+                    }
+                }
+                else
+                {
+                    fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t"
+                               "Time(us)\n");
+                    fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
+                            cpi->total_ssimg_y / cpi->count,
+                            cpi->total_ssimg_u / cpi->count,
+                            cpi->total_ssimg_v / cpi->count,
+                            cpi->total_ssimg_all / cpi->count, total_encode_time);
+                }
+            }
+
+            fclose(f);
+#if 0
+            f = fopen("qskip.stt", "a");
+            fprintf(f, "minq:%d -maxq:%d skiptrue:skipfalse = %d:%d\n", cpi->oxcf.best_allowed_q, cpi->oxcf.worst_allowed_q, skiptruecount, skipfalsecount);
+            fclose(f);
+#endif
+
+        }
+
+#endif
+
+
+#ifdef SPEEDSTATS
+
+        if (cpi->compressor_speed == 2)
+        {
+            int i;
+            FILE *f = fopen("cxspeed.stt", "a"); /* NOTE(review): result not checked */
+            cnt_pm /= cpi->common.MBs;
+
+            for (i = 0; i < 16; i++)
+                fprintf(f, "%5d", frames_at_speed[i]);
+
+            fprintf(f, "\n");
+            fclose(f);
+        }
+
+#endif
+
+
+#ifdef MODE_STATS
+        {
+            extern int count_mb_seg[4];
+            FILE *f = fopen("modes.stt", "a"); /* NOTE(review): result not checked */
+            double dr = (double)cpi->framerate * (double)bytes * (double)8 / (double)count / (double)1000 ;
+            fprintf(f, "intra_mode in Intra Frames:\n");
+            fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]);
+            fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]);
+            fprintf(f, "B: ");
+            {
+                int i;
+
+                for (i = 0; i < 10; i++)
+                    fprintf(f, "%8d, ", b_modes[i]);
+
+                fprintf(f, "\n");
+
+            }
+
+            fprintf(f, "Modes in Inter Frames:\n");
+            fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d\n",
+                    inter_y_modes[0], inter_y_modes[1], inter_y_modes[2], inter_y_modes[3], inter_y_modes[4],
+                    inter_y_modes[5], inter_y_modes[6], inter_y_modes[7], inter_y_modes[8], inter_y_modes[9]);
+            fprintf(f, "UV:%8d, %8d, %8d, %8d\n", inter_uv_modes[0], inter_uv_modes[1], inter_uv_modes[2], inter_uv_modes[3]);
+            fprintf(f, "B: ");
+            {
+                int i;
+
+                for (i = 0; i < 15; i++)
+                    fprintf(f, "%8d, ", inter_b_modes[i]);
+
+                fprintf(f, "\n");
+
+            }
+            fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
+            fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
+
+
+
+            fclose(f);
+        }
+#endif
+
+#ifdef VP8_ENTROPY_STATS
+        {
+            int i, j, k;
+            FILE *fmode = fopen("modecontext.c", "w"); /* NOTE(review): result not checked */
+            /* Emits intra_mode_stats as a C table, writing 1 in place of every zero count */
+            fprintf(fmode, "\n#include \"entropymode.h\"\n\n");
+            fprintf(fmode, "const unsigned int vp8_kf_default_bmode_counts ");
+            fprintf(fmode, "[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] =\n{\n");
+
+            for (i = 0; i < 10; i++)
+            {
+
+                fprintf(fmode, "    { /* Above Mode :  %d */\n", i);
+
+                for (j = 0; j < 10; j++)
+                {
+
+                    fprintf(fmode, "        {");
+
+                    for (k = 0; k < 10; k++)
+                    {
+                        if (!intra_mode_stats[i][j][k])
+                            fprintf(fmode, " %5d, ", 1);
+                        else
+                            fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
+                    }
+
+                    fprintf(fmode, "}, /* left_mode %d */\n", j);
+
+                }
+
+                fprintf(fmode, "    },\n");
+
+            }
+
+            fprintf(fmode, "};\n");
+            fclose(fmode);
+        }
+#endif
+
+
+#if defined(SECTIONBITS_OUTPUT)
+
+        if (0)
+        {
+            int i;
+            FILE *f = fopen("tokenbits.stt", "a");
+
+            for (i = 0; i < 28; i++)
+                fprintf(f, "%8d", (int)(Sectionbits[i] / 256));
+
+            fprintf(f, "\n");
+            fclose(f);
+        }
+
+#endif
+
+#if 0
+        {
+            printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+            printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
+            printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+        }
+#endif
+
+    }
+    /* Teardown proper: runs whether or not any statistics were written above */
+#if CONFIG_MULTITHREAD
+    vp8cx_remove_encoder_threads(cpi);
+#endif
+
+#if CONFIG_TEMPORAL_DENOISING
+    vp8_denoiser_free(&cpi->denoiser);
+#endif
+    dealloc_compressor_data(cpi);
+    vpx_free(cpi->mb.ss);
+    vpx_free(cpi->tok);
+    vpx_free(cpi->cyclic_refresh_map);
+    vpx_free(cpi->consec_zero_last);
+    vpx_free(cpi->consec_zero_last_mvbias);
+
+    vp8_remove_common(&cpi->common);
+    vpx_free(cpi);
+    *ptr = 0; /* the caller's handle is cleared so it cannot be reused */
+
+#ifdef OUTPUT_YUV_SRC
+    fclose(yuv_file); /* NOTE(review): no NULL check on the handle */
+#endif
+#ifdef OUTPUT_YUV_DENOISED
+    fclose(yuv_denoised_file); /* NOTE(review): no NULL check on the handle */
+#endif
+
+#if 0
+
+    if (keyfile)
+        fclose(keyfile);
+
+    if (framepsnr)
+        fclose(framepsnr);
+
+    if (kf_list)
+        fclose(kf_list);
+
+#endif
+
+}
+
+
+static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
+                                 unsigned char *recon, int recon_stride,
+                                 unsigned int cols, unsigned int rows)
+{
+    /*
+     * Sum of squared differences between two planes of cols x rows samples.
+     * Whole 16x16 tiles go through vpx_mse16x16(); the right-hand strip and
+     * the bottom rows that do not fill a tile are accumulated sample by sample.
+     */
+    uint64_t accum = 0;
+    unsigned int y = 0;
+    unsigned int x;
+
+    while (y + 16 <= rows)
+    {
+        /* Full tiles in this band of 16 rows. */
+        for (x = 0; x + 16 <= cols; x += 16)
+        {
+            unsigned int tile_sse;
+
+            vpx_mse16x16(orig + x, orig_stride, recon + x, recon_stride,
+                         &tile_sse);
+            accum += tile_sse;
+        }
+
+        /* Columns left over to the right of the last full tile. */
+        if (x < cols)
+        {
+            unsigned char *o = orig;
+            unsigned char *r = recon;
+            unsigned int by, bx;
+
+            for (by = 0; by < 16; by++, o += orig_stride, r += recon_stride)
+            {
+                for (bx = x; bx < cols; bx++)
+                {
+                    const int d = o[bx] - r[bx];
+                    accum += d * d;
+                }
+            }
+        }
+
+        orig += orig_stride * 16;
+        recon += recon_stride * 16;
+        y += 16;
+    }
+
+    /* Rows left over below the last full band, handled a row at a time. */
+    for (; y < rows; y++, orig += orig_stride, recon += recon_stride)
+    {
+        for (x = 0; x < cols; x++)
+        {
+            const int d = orig[x] - recon[x];
+            accum += d * d;
+        }
+    }
+
+    vp8_clear_system_state();
+    return accum;
+}
+
+
+static void generate_psnr_packet(VP8_COMP *cpi)
+{
+    /* Measures cpi->Source against the frame to show and queues the result
+     * as a VPX_CODEC_PSNR_PKT.  Entry 0 of each array covers all planes
+     * together; entries 1, 2 and 3 are Y, U and V on their own.
+     */
+    YV12_BUFFER_CONFIG      *src = cpi->Source;
+    YV12_BUFFER_CONFIG      *rec = cpi->common.frame_to_show;
+    struct vpx_codec_cx_pkt  pkt;
+    unsigned int             w = cpi->common.Width;
+    unsigned int             h = cpi->common.Height;
+    uint64_t                 plane_sse;
+    int                      k;
+
+    pkt.kind = VPX_CODEC_PSNR_PKT;
+    /* Luma at full resolution; it also seeds the combined totals. */
+    plane_sse = calc_plane_error(src->y_buffer, src->y_stride,
+                                 rec->y_buffer, rec->y_stride, w, h);
+    pkt.data.psnr.sse[0] = pkt.data.psnr.sse[1] = plane_sse;
+    pkt.data.psnr.samples[0] = pkt.data.psnr.samples[1] = w * h;
+
+    /* Both chroma planes are half size in each direction, rounded up. */
+    w = (w + 1) / 2;
+    h = (h + 1) / 2;
+    plane_sse = calc_plane_error(src->u_buffer, src->uv_stride,
+                                 rec->u_buffer, rec->uv_stride, w, h);
+    pkt.data.psnr.sse[2] = plane_sse;
+    pkt.data.psnr.sse[0] += plane_sse;
+    pkt.data.psnr.samples[2] = w * h;
+    pkt.data.psnr.samples[0] += w * h;
+
+    plane_sse = calc_plane_error(src->v_buffer, src->uv_stride,
+                                 rec->v_buffer, rec->uv_stride, w, h);
+    pkt.data.psnr.sse[3] = plane_sse;
+    pkt.data.psnr.sse[0] += plane_sse;
+    pkt.data.psnr.samples[3] = w * h;
+    pkt.data.psnr.samples[0] += w * h;
+
+    for (k = 0; k < 4; k++)
+        pkt.data.psnr.psnr[k] = vpx_sse_to_psnr(pkt.data.psnr.samples[k], 255.0,
+                                                (double)(pkt.data.psnr.sse[k]));
+
+    vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+
+
+int vp8_use_as_reference(VP8_COMP *cpi, int ref_frame_flags)
+{
+    /* Values above 7 are refused with -1; anything else becomes the new mask. */
+    const int accepted = !(ref_frame_flags > 7);
+    if (accepted)
+        cpi->ref_frame_flags = ref_frame_flags;
+    return accepted ? 0 : -1;
+}
+int vp8_update_reference(VP8_COMP *cpi, int ref_frame_flags)
+{
+    /*
+     * Chooses which reference buffers get refreshed: each of the last,
+     * golden and alt-ref refresh flags is set to 1 when its bit is present
+     * in ref_frame_flags and to 0 otherwise.
+     *
+     * Returns 0 on success, or -1 (changing nothing) when ref_frame_flags
+     * is greater than 7.
+     */
+    VP8_COMMON *const cm = &cpi->common;
+
+    if (ref_frame_flags > 7)
+        return -1;
+
+    cm->refresh_last_frame    = (ref_frame_flags & VP8_LAST_FRAME) ? 1 : 0;
+    cm->refresh_golden_frame  = (ref_frame_flags & VP8_GOLD_FRAME) ? 1 : 0;
+    cm->refresh_alt_ref_frame = (ref_frame_flags & VP8_ALTR_FRAME) ? 1 : 0;
+    return 0;
+}
+
+int vp8_get_reference(VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+    /* Copies the last, golden or alt-ref frame buffer selected by
+     * ref_frame_flag into sd.  Returns 0, or -1 for any other flag. */
+    VP8_COMMON *const common = &cpi->common;
+    int fb;
+
+    if (ref_frame_flag != VP8_LAST_FRAME &&
+        ref_frame_flag != VP8_GOLD_FRAME &&
+        ref_frame_flag != VP8_ALTR_FRAME)
+        return -1;
+
+    fb = (ref_frame_flag == VP8_LAST_FRAME) ? common->lst_fb_idx
+       : (ref_frame_flag == VP8_GOLD_FRAME) ? common->gld_fb_idx
+       : common->alt_fb_idx;
+    vp8_yv12_copy_frame(&common->yv12_fb[fb], sd);
+    return 0;
+}
+int vp8_set_reference(VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+    /* Overwrites the last, golden or alt-ref frame buffer selected by
+     * ref_frame_flag with a copy of sd.  Returns 0, or -1 for any other flag. */
+    VP8_COMMON *const common = &cpi->common;
+    int fb;
+
+    if (ref_frame_flag != VP8_LAST_FRAME &&
+        ref_frame_flag != VP8_GOLD_FRAME &&
+        ref_frame_flag != VP8_ALTR_FRAME)
+        return -1;
+
+    fb = (ref_frame_flag == VP8_LAST_FRAME) ? common->lst_fb_idx
+       : (ref_frame_flag == VP8_GOLD_FRAME) ? common->gld_fb_idx
+       : common->alt_fb_idx;
+
+    vp8_yv12_copy_frame(sd, &common->yv12_fb[fb]);
+    return 0;
+}
+int vp8_update_entropy(VP8_COMP *cpi, int update)
+{
+    /* Stores update as the refresh_entropy_probs setting of the common
+     * state; there is no failure path, so the result is always 0. */
+    cpi->common.refresh_entropy_probs = update;
+    return 0;
+}
+
+
+#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED)
+void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s)
+{
+    /*
+     * Dumps frame s to yuv_file as raw planar data: every Y row, then
+     * every U row, then every V row.  Only the first y_width / uv_width
+     * bytes of a row are written; the remainder of each stride is
+     * skipped.
+     *
+     * Rows are counted with for loops, so a plane whose height is zero or
+     * negative writes nothing.  (A do/while on --h would emit one row and
+     * then keep decrementing h past zero, reading beyond the buffer.)
+     * NOTE(review): fwrite() results are not checked; this void function
+     * has no way to report a short write to its caller.
+     */
+    unsigned char *row;
+    int left;
+
+    /* Y plane */
+    row = s->y_buffer;
+    for (left = s->y_height; left > 0; left--, row += s->y_stride)
+        fwrite(row, s->y_width, 1, yuv_file);
+
+    /* U plane */
+    row = s->u_buffer;
+    for (left = s->uv_height; left > 0; left--, row += s->uv_stride)
+        fwrite(row, s->uv_width, 1, yuv_file);
+
+    /* V plane */
+    row = s->v_buffer;
+    for (left = s->uv_height; left > 0; left--, row += s->uv_stride)
+        fwrite(row, s->uv_width, 1, yuv_file);
+}
+#endif
+
+static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    /* No scaling requested on either axis: encode straight from sd. */
+    if (cm->horiz_scale == 0 && cm->vert_scale == 0)
+    {
+        cpi->Source = sd;
+        return;
+    }
+
+#if CONFIG_SPATIAL_RESAMPLING
+    {
+        int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+        int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+        const int tmp_height = (cm->vert_scale == 3) ? 9 : 11;
+
+        Scale2Ratio(cm->horiz_scale, &hr, &hs);
+        Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+        /* Scale sd into cpi->scaled_source, extend that frame's borders,
+         * and make it the frame the encoder reads from. */
+        vpx_scale_frame(sd, &cpi->scaled_source, cm->temp_scale_frame.y_buffer,
+                        tmp_height, hs, hr, vs, vr, 0);
+
+        vp8_yv12_extend_frame_borders(&cpi->scaled_source);
+        cpi->Source = &cpi->scaled_source;
+    }
+#endif
+}
+
+
+static int resize_key_frame(VP8_COMP *cpi)
+{
+#if CONFIG_SPATIAL_RESAMPLING
+    VP8_COMMON *cm = &cpi->common;
+    int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+    int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+    int scaled_w, scaled_h;
+
+    /* Resampling is only considered for one pass cbr, where it is more
+     * limited than in two pass cbr.  The test and any change is only made
+     * once per key frame sequence.
+     */
+    if (!cpi->oxcf.allow_spatial_resampling ||
+        (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER))
+        return 0;
+
+    if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))
+    {
+        /* Below the resample DOWN watermark: scale down a notch, but no
+         * further than ONETWO. */
+        cm->horiz_scale = (cm->horiz_scale >= ONETWO) ? ONETWO : cm->horiz_scale + 1;
+        cm->vert_scale = (cm->vert_scale >= ONETWO) ? ONETWO : cm->vert_scale + 1;
+    }
+    else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))
+    {
+        /* Above the resample UP watermark: start scaling back up, but no
+         * further than NORMAL. */
+        cm->horiz_scale = (cm->horiz_scale <= NORMAL) ? NORMAL : cm->horiz_scale - 1;
+        cm->vert_scale = (cm->vert_scale <= NORMAL) ? NORMAL : cm->vert_scale - 1;
+    }
+
+    /* New width and height implied by the scale ratios, rounded up. */
+    Scale2Ratio(cm->horiz_scale, &hr, &hs);
+    Scale2Ratio(cm->vert_scale, &vr, &vs);
+    scaled_w = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
+    scaled_h = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs;
+
+    /* Unchanged size: nothing needs reallocating or resampling. */
+    if ((cm->Width == scaled_w) && (cm->Height == scaled_h))
+        return 0;
+
+    /* The image size has changed, so reallocate the buffers and resample
+     * the source image. */
+    cm->Width = scaled_w;
+    cm->Height = scaled_h;
+    vp8_alloc_compressor_data(cpi);
+    scale_and_extend_source(cpi->un_scaled_source, cpi);
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+
+/* Update the golden/alt-ref bookkeeping in |cpi|: the GF interval, the GF
+ * overspend recovery rate, the GF activity map and the alt-ref
+ * pending/active flags.
+ */
+static void update_alt_ref_frame_stats(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+    const int total_mbs = cm->mb_rows * cm->mb_cols;
+
+    /* Select an interval before next GF or altref */
+    if (!cpi->auto_gold)
+        cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
+
+    if ((cpi->pass != 2) && (cpi->frames_till_gf_update_due != 0))
+    {
+        cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+        /* Set the bits per frame that we should try and recover in
+         * subsequent inter frames to account for the extra GF spend.
+         * Note that this does not apply for GF updates that occur
+         * coincident with a key frame, as the extra cost of key frames is
+         * dealt with elsewhere.
+         */
+        cpi->gf_overspend_bits += cpi->projected_frame_size;
+        cpi->non_gf_bitrate_adjustment =
+            cpi->gf_overspend_bits / cpi->frames_till_gf_update_due;
+    }
+
+    /* Mark every macroblock as active in the structure that monitors the
+     * level of reference to the last GF.
+     */
+    memset(cpi->gf_active_flags, 1, total_mbs);
+    cpi->gf_active_count = total_mbs;
+
+    /* This frame refreshes, so the following frames don't unless
+     * specified by the user.
+     */
+    cpi->frames_since_golden = 0;
+
+    /* The alternate reference update is no longer pending; the alternate
+     * reference frame is now active.
+     */
+    cpi->source_alt_ref_pending = 0;
+    cpi->source_alt_ref_active = 1;
+}
+/* Update the golden frame usage counts and GF interval bookkeeping.
+ *
+ * When the frame refreshes the golden buffer the GF statistics are reset;
+ * when it refreshes neither golden nor alt-ref the countdowns are advanced
+ * and the reference usage totals are accumulated. A frame that refreshes
+ * only the alt-ref buffer leaves everything untouched here.
+ */
+static void update_golden_frame_stats(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    if (!cm->refresh_golden_frame)
+    {
+        if (cm->refresh_alt_ref_frame)
+            return;
+
+        /* Decrement count down till next gf */
+        if (cpi->frames_till_gf_update_due > 0)
+            cpi->frames_till_gf_update_due--;
+
+        if (cpi->frames_till_alt_ref_frame)
+            cpi->frames_till_alt_ref_frame--;
+
+        cpi->frames_since_golden++;
+
+        /* From the second frame after a golden frame onwards, accumulate
+         * the per-frame reference usage counts.
+         */
+        if (cpi->frames_since_golden > 1)
+        {
+            const int *const usage = cpi->mb.count_mb_ref_frame_usage;
+
+            cpi->recent_ref_frame_usage[INTRA_FRAME] += usage[INTRA_FRAME];
+            cpi->recent_ref_frame_usage[LAST_FRAME] += usage[LAST_FRAME];
+            cpi->recent_ref_frame_usage[GOLDEN_FRAME] += usage[GOLDEN_FRAME];
+            cpi->recent_ref_frame_usage[ALTREF_FRAME] += usage[ALTREF_FRAME];
+        }
+
+        return;
+    }
+
+    /* ---- The golden frame is being refreshed ---- */
+
+    /* Select an interval before next GF */
+    if (!cpi->auto_gold)
+        cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
+
+    if ((cpi->pass != 2) && (cpi->frames_till_gf_update_due > 0))
+    {
+        cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+        /* Set the bits per frame that we should try and recover in
+         * subsequent inter frames to account for the extra GF spend.
+         * Note that this does not apply for GF updates that occur
+         * coincident with a key frame, as the extra cost of key frames
+         * is dealt with elsewhere.
+         */
+        if ((cm->frame_type != KEY_FRAME) && !cpi->source_alt_ref_active)
+        {
+            /* GF bits to be recovered: projected size minus the average
+             * frame bits available for inter frames for the clip as a
+             * whole.
+             */
+            cpi->gf_overspend_bits +=
+                (cpi->projected_frame_size - cpi->inter_frame_target);
+        }
+
+        cpi->non_gf_bitrate_adjustment =
+            cpi->gf_overspend_bits / cpi->frames_till_gf_update_due;
+    }
+
+    /* Update data structure that monitors level of reference to last GF */
+    memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+    /* This frame refreshes, so the following frames don't unless
+     * specified by the user.
+     */
+    cm->refresh_golden_frame = 0;
+    cpi->frames_since_golden = 0;
+
+    /* Restart the recent reference usage totals from 1. */
+    cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
+    cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
+    cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+    cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+
+    /* ******** Fixed Q test code only ************ */
+    /* If we are going to use the ALT reference for the next group of
+     * frames set a flag to say so.
+     */
+    if ((cpi->oxcf.fixed_q >= 0) &&
+        cpi->oxcf.play_alternate &&
+        !cpi->common.refresh_alt_ref_frame)
+    {
+        cpi->source_alt_ref_pending = 1;
+        cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+    }
+
+    if (!cpi->source_alt_ref_pending)
+        cpi->source_alt_ref_active = 0;
+
+    /* Decrement count down till next gf */
+    if (cpi->frames_till_gf_update_due > 0)
+        cpi->frames_till_gf_update_due--;
+}
+
+/* Update the reference frame probability estimates (prob_intra_coded,
+ * prob_last_coded, prob_gf_coded) that will be used during mode selection.
+ */
+static void update_rd_ref_frame_probs(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    const int *const ref_counts = cpi->mb.count_mb_ref_frame_usage;
+    const int intra_count = ref_counts[INTRA_FRAME];
+    const int inter_count = ref_counts[LAST_FRAME] +
+                            ref_counts[GOLDEN_FRAME] +
+                            ref_counts[ALTREF_FRAME];
+    const int is_key_frame = (cm->frame_type == KEY_FRAME);
+
+    /* On a key frame, or when no reference usage was counted at all, fall
+     * back to fixed values; only the intra probability differs between the
+     * two cases.
+     */
+    if (is_key_frame || ((intra_count + inter_count) == 0))
+    {
+        cpi->prob_intra_coded = is_key_frame ? 255 : 63;
+        cpi->prob_last_coded  = 128;
+        cpi->prob_gf_coded    = 128;
+    }
+
+    /* The adjustments below are only applied for a single layer. */
+    if (cpi->oxcf.number_of_layers != 1)
+        return;
+
+    /* update reference frame costs since we can do better than what we got
+     * last frame.
+     */
+    if (cpi->common.refresh_alt_ref_frame)
+    {
+        cpi->prob_intra_coded += 40;
+        if (cpi->prob_intra_coded > 255)
+            cpi->prob_intra_coded = 255;
+        cpi->prob_last_coded = 200;
+        cpi->prob_gf_coded = 1;
+    }
+    else
+    {
+        switch (cpi->frames_since_golden)
+        {
+        case 0:
+            cpi->prob_last_coded = 214;
+            break;
+
+        case 1:
+            cpi->prob_last_coded = 192;
+            cpi->prob_gf_coded = 220;
+            break;
+
+        default:
+            if (cpi->source_alt_ref_active)
+            {
+                /* Lower the golden probability, with a floor of 10. */
+                cpi->prob_gf_coded -= 20;
+
+                if (cpi->prob_gf_coded < 10)
+                    cpi->prob_gf_coded = 10;
+            }
+            break;
+        }
+    }
+
+    /* With no active alt-ref the golden probability is forced to 255,
+     * overriding any value chosen above.
+     */
+    if (!cpi->source_alt_ref_active)
+        cpi->prob_gf_coded = 255;
+}
+
+
+#if !CONFIG_REALTIME_ONLY
+/* Decide whether the current frame should be coded as a key frame.
+ * Returns 1 = key, 0 = inter. Also resets cpi->kf_boost to 0.
+ */
+static int decide_key_frame(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    cpi->kf_boost = 0;
+
+    if (cpi->Speed > 11)
+        return 0;
+
+    /* Clear down mmx registers */
+    vp8_clear_system_state();
+
+    if ((cpi->compressor_speed == 2) && (cpi->Speed >= 5) && (cpi->sf.RD == 0))
+    {
+        /* Relative change of the intra and prediction errors against the
+         * values stored from the previous call.
+         */
+        double intra_change = 1.0 * abs((int)(cpi->mb.intra_error -
+            cpi->last_intra_error)) / (1 + cpi->last_intra_error);
+        double pred_change = 1.0 * abs((int)(cpi->mb.prediction_error -
+            cpi->last_prediction_error)) / (1 + cpi->last_prediction_error);
+        double min_pred_error = cm->MBs * 256;
+
+        /* Remember the current errors for the next call. */
+        cpi->last_intra_error = cpi->mb.intra_error;
+        cpi->last_prediction_error = cpi->mb.prediction_error;
+
+        /*(change > 1.4 || change < .75)&& cpi->this_frame_percent_intra > cpi->last_frame_percent_intra + 3*/
+        return ((10 * cpi->mb.intra_error / (1 + cpi->mb.prediction_error) < 15) &&
+                (cpi->mb.prediction_error > min_pred_error) &&
+                ((intra_change > .25) || (pred_change > .25))) ? 1 : 0;
+    }
+
+    /* If the following are true we might as well code a key frame */
+    if (((cpi->this_frame_percent_intra == 100) &&
+         (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 2))) ||
+        ((cpi->this_frame_percent_intra > 95) &&
+         (cpi->this_frame_percent_intra >= (cpi->last_frame_percent_intra + 5))))
+    {
+        return 1;
+    }
+
+    /* In addition, if the following are true and this is not a golden
+     * frame then code a key frame. Note that on golden frames there often
+     * seems to be a pop in intra usage anyway, hence this restriction is
+     * designed to prevent spurious key frames. The intra pop needs to be
+     * investigated.
+     */
+    if ((((cpi->this_frame_percent_intra > 60) &&
+          (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra * 2))) ||
+         ((cpi->this_frame_percent_intra > 75) &&
+          (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra * 3 / 2))) ||
+         ((cpi->this_frame_percent_intra > 90) &&
+          (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 10)))) &&
+        !cm->refresh_golden_frame)
+    {
+        return 1;
+    }
+
+    return 0;
+}
+
+/* Run the first pass on |cpi| with the quantizer fixed at 26.
+ * The size, dest and frame_flags arguments are not used.
+ */
+static void Pass1Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags)
+{
+    const int first_pass_q = 26;
+
+    /* Silence unused-parameter warnings. */
+    (void) frame_flags;
+    (void) dest;
+    (void) size;
+
+    vp8_set_quantizer(cpi, first_pass_q);
+    vp8_first_pass(cpi);
+}
+#endif
+
+#if 0
+/* Debug helper: write |rows| rows of |width| bytes each, starting at |plane|
+ * and advancing by |stride| bytes per row, to the file |filename|.
+ * Does nothing if the file cannot be opened.
+ */
+static void write_cx_plane_to_file(const char *filename,
+                                   const unsigned char *plane,
+                                   int stride, int width, int rows)
+{
+    FILE *fp = fopen(filename, "wb");
+    int row;
+
+    /* fopen() returns NULL on failure (e.g. the "cx" directory does not
+     * exist); passing that to fwrite()/fclose() would crash.
+     */
+    if (!fp)
+        return;
+
+    for (row = 0; row < rows; row++)
+        fwrite(plane + row * stride, width, 1, fp);
+
+    fclose(fp);
+}
+
+/* Debug helper: dump the Y, U and V planes of |frame| as raw files named
+ * cx\yNNNN.raw, cx\uNNNN.raw and cx\vNNNN.raw, where NNNN is |this_frame|.
+ * Planes whose output file cannot be opened are skipped.
+ */
+void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
+{
+    char filename[255];
+
+    /* write the frame */
+    sprintf(filename, "cx\\y%04d.raw", this_frame);
+    write_cx_plane_to_file(filename, frame->y_buffer, frame->y_stride,
+                           frame->y_width, frame->y_height);
+
+    sprintf(filename, "cx\\u%04d.raw", this_frame);
+    write_cx_plane_to_file(filename, frame->u_buffer, frame->uv_stride,
+                           frame->uv_width, frame->uv_height);
+
+    sprintf(filename, "cx\\v%04d.raw", this_frame);
+    write_cx_plane_to_file(filename, frame->v_buffer, frame->uv_stride,
+                           frame->uv_width, frame->uv_height);
+}
+#endif
+/* return of 0 means drop frame */
+
+#if !CONFIG_REALTIME_ONLY
+/* Test for conditions that indicate we should loop back and recode a
+ * frame.
+ *
+ * high_limit / low_limit bound the acceptable projected frame size, q is
+ * the quantizer just used and maxq / minq are its allowed range.
+ * Returns 1 if the frame should be recoded, 0 otherwise. In the
+ * constrained quality "severe undershoot" case cpi->active_best_quality is
+ * also lowered to cpi->oxcf.cq_level.
+ */
+static int recode_loop_test( VP8_COMP *cpi,
+                              int high_limit, int low_limit,
+                              int q, int maxq, int minq )
+{
+    VP8_COMMON *cm = &cpi->common;
+    const int refreshes_key_gf_or_arf = (cm->frame_type == KEY_FRAME) ||
+                                        cm->refresh_golden_frame ||
+                                        cm->refresh_alt_ref_frame;
+
+    /* Is frame recode allowed at all?
+     * Yes if either recode mode 1 is selected, or mode 2 is selected and
+     * the frame is a key frame, golden frame or alt_ref_frame.
+     */
+    if ( (cpi->sf.recode_loop != 1) &&
+         !((cpi->sf.recode_loop == 2) && refreshes_key_gf_or_arf) )
+    {
+        return 0;
+    }
+
+    /* General over and under shoot tests */
+    if ( ((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
+         ((cpi->projected_frame_size < low_limit) && (q > minq)) )
+    {
+        return 1;
+    }
+
+    /* The remaining tests are special to constrained quality mode. */
+    if (cpi->oxcf.end_usage != USAGE_CONSTRAINED_QUALITY)
+        return 0;
+
+    /* Undershoot and below auto cq level */
+    if ( (q > cpi->cq_target_quality) &&
+         (cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3)) )
+    {
+        return 1;
+    }
+
+    /* Severe undershoot and between auto and user cq level */
+    if ( (q > cpi->oxcf.cq_level) &&
+         (cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
+         (cpi->active_best_quality > cpi->oxcf.cq_level) )
+    {
+        cpi->active_best_quality = cpi->oxcf.cq_level;
+        return 1;
+    }
+
+    return 0;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+/* Update the reference buffer assignments after a frame has been encoded.
+ *
+ * Depending on the frame type and the refresh / copy flags in cpi->common,
+ * the golden, alt-ref and last buffer indices are pointed at the new frame
+ * buffer (or at another reference buffer), the VP8_*_FRAME flags on the
+ * affected buffers are moved accordingly, and cpi->current_ref_frames[] is
+ * updated with the frame number each reference now holds. When temporal
+ * denoising is compiled in and enabled, the denoiser running-average
+ * buffers are updated in the same way.
+ */
+static void update_reference_frames(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+    YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb;
+
+    /* At this point the new frame has been encoded.
+     * If any buffer copy / swapping is signaled it should be done here.
+     */
+
+    if (cm->frame_type == KEY_FRAME)
+    {
+        /* A key frame becomes both the golden and the alt-ref frame. */
+        yv12_fb[cm->new_fb_idx].flags |= VP8_GOLD_FRAME | VP8_ALTR_FRAME ;
+
+        yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
+        yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
+
+        cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
+
+        cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
+        cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
+    }
+    else    /* For non key frames */
+    {
+        if (cm->refresh_alt_ref_frame)
+        {
+            /* Refresh and copy are mutually exclusive. */
+            assert(!cm->copy_buffer_to_arf);
+
+            yv12_fb[cm->new_fb_idx].flags |= VP8_ALTR_FRAME;
+            yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
+            cm->alt_fb_idx = cm->new_fb_idx;
+
+            cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
+        }
+        else if (cm->copy_buffer_to_arf)
+        {
+            assert(!(cm->copy_buffer_to_arf & ~0x3));
+
+            if (cm->copy_buffer_to_arf == 1)
+            {
+                /* Alt-ref takes over the last frame buffer. */
+                if(cm->alt_fb_idx != cm->lst_fb_idx)
+                {
+                    yv12_fb[cm->lst_fb_idx].flags |= VP8_ALTR_FRAME;
+                    yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
+                    cm->alt_fb_idx = cm->lst_fb_idx;
+
+                    cpi->current_ref_frames[ALTREF_FRAME] =
+                        cpi->current_ref_frames[LAST_FRAME];
+                }
+            }
+            else /* if (cm->copy_buffer_to_arf == 2) */
+            {
+                /* Alt-ref takes over the golden frame buffer. */
+                if(cm->alt_fb_idx != cm->gld_fb_idx)
+                {
+                    yv12_fb[cm->gld_fb_idx].flags |= VP8_ALTR_FRAME;
+                    yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
+                    cm->alt_fb_idx = cm->gld_fb_idx;
+
+                    cpi->current_ref_frames[ALTREF_FRAME] =
+                        cpi->current_ref_frames[GOLDEN_FRAME];
+                }
+            }
+        }
+
+        if (cm->refresh_golden_frame)
+        {
+            /* Refresh and copy are mutually exclusive. */
+            assert(!cm->copy_buffer_to_gf);
+
+            yv12_fb[cm->new_fb_idx].flags |= VP8_GOLD_FRAME;
+            yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
+            cm->gld_fb_idx = cm->new_fb_idx;
+
+            cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
+        }
+        else if (cm->copy_buffer_to_gf)
+        {
+            /* Validate the golden copy flag (the flag being acted on in
+             * this branch), not the alt-ref one.
+             */
+            assert(!(cm->copy_buffer_to_gf & ~0x3));
+
+            if (cm->copy_buffer_to_gf == 1)
+            {
+                /* Golden takes over the last frame buffer. */
+                if(cm->gld_fb_idx != cm->lst_fb_idx)
+                {
+                    yv12_fb[cm->lst_fb_idx].flags |= VP8_GOLD_FRAME;
+                    yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
+                    cm->gld_fb_idx = cm->lst_fb_idx;
+
+                    cpi->current_ref_frames[GOLDEN_FRAME] =
+                        cpi->current_ref_frames[LAST_FRAME];
+                }
+            }
+            else /* if (cm->copy_buffer_to_gf == 2) */
+            {
+                /* Golden takes over the alt-ref frame buffer. */
+                if(cm->alt_fb_idx != cm->gld_fb_idx)
+                {
+                    yv12_fb[cm->alt_fb_idx].flags |= VP8_GOLD_FRAME;
+                    yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
+                    cm->gld_fb_idx = cm->alt_fb_idx;
+
+                    cpi->current_ref_frames[GOLDEN_FRAME] =
+                        cpi->current_ref_frames[ALTREF_FRAME];
+                }
+            }
+        }
+    }
+
+    if (cm->refresh_last_frame)
+    {
+        yv12_fb[cm->new_fb_idx].flags |= VP8_LAST_FRAME;
+        yv12_fb[cm->lst_fb_idx].flags &= ~VP8_LAST_FRAME;
+        cm->lst_fb_idx = cm->new_fb_idx;
+
+        cpi->current_ref_frames[LAST_FRAME] = cm->current_video_frame;
+    }
+
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity)
+    {
+        /* we shouldn't have to keep multiple copies as we know in advance which
+         * buffer we should start - for now to get something up and running
+         * I've chosen to copy the buffers
+         */
+        if (cm->frame_type == KEY_FRAME)
+        {
+            /* Seed every reference running average from the source. */
+            int i;
+            for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i)
+              vp8_yv12_copy_frame(cpi->Source,
+                                  &cpi->denoiser.yv12_running_avg[i]);
+        }
+        else /* For non key frames */
+        {
+            vp8_yv12_extend_frame_borders(
+                    &cpi->denoiser.yv12_running_avg[INTRA_FRAME]);
+
+            /* Copy the INTRA_FRAME running average into the slot of every
+             * reference that was refreshed or copied above.
+             */
+            if (cm->refresh_alt_ref_frame || cm->copy_buffer_to_arf)
+            {
+                vp8_yv12_copy_frame(
+                        &cpi->denoiser.yv12_running_avg[INTRA_FRAME],
+                        &cpi->denoiser.yv12_running_avg[ALTREF_FRAME]);
+            }
+            if (cm->refresh_golden_frame || cm->copy_buffer_to_gf)
+            {
+                vp8_yv12_copy_frame(
+                        &cpi->denoiser.yv12_running_avg[INTRA_FRAME],
+                        &cpi->denoiser.yv12_running_avg[GOLDEN_FRAME]);
+            }
+            if(cm->refresh_last_frame)
+            {
+                vp8_yv12_copy_frame(
+                        &cpi->denoiser.yv12_running_avg[INTRA_FRAME],
+                        &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+            }
+        }
+        /* At noise sensitivity 4, also keep a copy of the source frame. */
+        if (cpi->oxcf.noise_sensitivity == 4)
+          vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_last_source);
+
+    }
+#endif
+
+}
+
+/* Estimate the mean of vpx_mse16x16() between the Y planes of |source| and
+ * |dest| over a subsampled set of 16x16 blocks.
+ *
+ * Only every second block along rows and columns is visited, and only
+ * blocks whose cpi->consec_zero_last[] count is at least 10 contribute.
+ * Returns the average over the contributing blocks, or 0 when too few
+ * blocks (no more than ~1/16 of the total) contributed.
+ */
+static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source,
+                                       YV12_BUFFER_CONFIG *dest,
+                                       VP8_COMP *cpi)
+{
+    const int skip = 2;
+    const int min_consec_zero_last = 10;
+    const int step = 16 * skip;
+    const int tot_num_blocks = (source->y_height * source->y_width) >> 8;
+    int mse_sum = 0;
+    int num_blocks = 0;
+    int row, col;
+
+    /* Loop through the Y plane, every |skip| blocks along rows and columns,
+     * summing the square differences, and only for blocks that have been
+     * zero_last mode at least |min_consec_zero_last| frames in a row.
+     */
+    for (row = 0; row < source->y_height; row += step)
+    {
+        unsigned char *src_row = source->y_buffer + row * source->y_stride;
+        unsigned char *dst_row = dest->y_buffer + row * dest->y_stride;
+        const int block_index_row = (row >> 4) * cpi->common.mb_cols;
+
+        for (col = 0; col < source->y_width; col += step)
+        {
+            const int index = block_index_row + (col >> 4);
+            unsigned int sse;
+
+            if (cpi->consec_zero_last[index] < min_consec_zero_last)
+                continue;
+
+            mse_sum += vpx_mse16x16(src_row + col, source->y_stride,
+                                    dst_row + col, dest->y_stride,
+                                    &sse);
+            num_blocks++;
+        }
+    }
+
+    // Only return non-zero if we have at least ~1/16 samples for estimate.
+    if (num_blocks > (tot_num_blocks >> 4))
+        return (mse_sum / num_blocks);
+
+    return 0;
+}
+
+#if CONFIG_TEMPORAL_DENOISING
+static void process_denoiser_mode_change(VP8_COMP *cpi) {
+  const VP8_COMMON *const cm = &cpi->common;
+  int i, j;
+  int total = 0;
+  int num_blocks = 0;
+  // Number of blocks skipped along row/column in computing the
+  // nmse (normalized mean square error) of source.
+  int skip = 2;
+  // Only select blocks for computing nmse that have been encoded
+  // as ZERO LAST min_consec_zero_last frames in a row.
+  // Scale with number of temporal layers.
+  int min_consec_zero_last = 12 / cpi->oxcf.number_of_layers;
+  // Decision is tested for changing the denoising mode every
+  // num_mode_change times this function is called. Note that this
+  // function called every 8 frames, so (8 * num_mode_change) is number
+  // of frames where denoising mode change is tested for switch.
+  int num_mode_change = 20;
+  // Framerate factor, to compensate for larger mse at lower framerates.
+  // Use ref_framerate, which is full source framerate for temporal layers.
+  // TODO(marpan): Adjust this factor.
+  int fac_framerate = cpi->ref_framerate < 25.0f ? 80 : 100;
+  int tot_num_blocks = cm->mb_rows * cm->mb_cols;
+  int ystride = cpi->Source->y_stride;
+  unsigned char *src = cpi->Source->y_buffer;
+  unsigned char *dst = cpi->denoiser.yv12_last_source.y_buffer;
+  static const unsigned char const_source[16] = {
+      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+      128, 128, 128};
+  int bandwidth = (int)(cpi->target_bandwidth);
+  // For temporal layers, use full bandwidth (top layer).
+  if (cpi->oxcf.number_of_layers > 1) {
+    LAYER_CONTEXT *lc = &cpi->layer_context[cpi->oxcf.number_of_layers - 1];
+    bandwidth = (int)(lc->target_bandwidth);
+  }
+  // Loop through the Y plane, every skip blocks along rows and columns,
+  // summing the normalized mean square error, only for blocks that have
+  // been encoded as ZEROMV LAST at least min_consec_zero_last least frames in
+  // a row and have small sum difference between current and previous frame.
+  // Normalization here is by the contrast of the current frame block.
+  for (i = 0; i < cm->Height; i += 16 * skip) {
+    int block_index_row = (i >> 4) * cm->mb_cols;
+    for (j = 0; j < cm->Width; j += 16 * skip) {
+      int index = block_index_row + (j >> 4);
+      if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
+        unsigned int sse;
+        const unsigned int var = vpx_variance16x16(src + j,
+                                                   ystride,
+                                                   dst + j,
+                                                   ystride,
+                                                   &sse);
+        // Only consider this block as valid for noise measurement
+        // if the sum_diff average of the current and previous frame
+        // is small (to avoid effects from lighting change).
+        if ((sse - var) < 128) {
+          unsigned int sse2;
+          const unsigned int act = vpx_variance16x16(src + j,
+                                                     ystride,
+                                                     const_source,
+                                                     0,
+                                                     &sse2);
+          if (act > 0)
+            total += sse / act;
+          num_blocks++;
+        }
+      }
+    }
+    src += 16 * skip * ystride;
+    dst += 16 * skip * ystride;
+  }
+  total = total * fac_framerate / 100;
+
+  // Only consider this frame as valid sample if we have computed nmse over
+  // at least ~1/16 blocks, and Total > 0 (Total == 0 can happen if the
+  // application inputs duplicate frames, or contrast is all zero).
+  if (total > 0 &&
+      (num_blocks > (tot_num_blocks >> 4))) {
+    // Update the recursive mean square source_diff.
+    total = (total << 8) / num_blocks;
+    if (cpi->denoiser.nmse_source_diff_count == 0) {
+      // First sample in new interval.
+      cpi->denoiser.nmse_source_diff = total;
+      cpi->denoiser.qp_avg = cm->base_qindex;
+    } else {
+      // For subsequent samples, use average with weight ~1/4 for new sample.
+      cpi->denoiser.nmse_source_diff = (int)((total +
+          3 * cpi->denoiser.nmse_source_diff) >> 2);
+      cpi->denoiser.qp_avg = (int)((cm->base_qindex +
+          3 * cpi->denoiser.qp_avg) >> 2);
+    }
+    cpi->denoiser.nmse_source_diff_count++;
+  }
+  // Check for changing the denoiser mode, when we have obtained #samples =
+  // num_mode_change. Condition the change also on the bitrate and QP.
+  if (cpi->denoiser.nmse_source_diff_count == num_mode_change) {
+    // Check for going up: from normal to aggressive mode.
+    if ((cpi->denoiser.denoiser_mode == kDenoiserOnYUV) &&
+        (cpi->denoiser.nmse_source_diff >
+        cpi->denoiser.threshold_aggressive_mode) &&
+        (cpi->denoiser.qp_avg < cpi->denoiser.qp_threshold_up &&
+         bandwidth > cpi->denoiser.bitrate_threshold)) {
+      vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUVAggressive);
+    } else {
+      // Check for going down: from aggressive to normal mode.
+      if (((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) &&
+          (cpi->denoiser.nmse_source_diff <
+          cpi->denoiser.threshold_aggressive_mode)) ||
+          ((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) &&
+          (cpi->denoiser.qp_avg > cpi->denoiser.qp_threshold_down ||
+           bandwidth < cpi->denoiser.bitrate_threshold))) {
+        vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV);
+      }
+    }
+    // Reset metric and counter for next interval.
+    cpi->denoiser.nmse_source_diff = 0;
+    cpi->denoiser.qp_avg = 0;
+    cpi->denoiser.nmse_source_diff_count = 0;
+  }
+}
+#endif
+
+/* Pick the loop filter level for the current frame, apply the loop filter
+ * when it is needed, and extend the borders of cm->frame_to_show.
+ *
+ * The time spent picking the level is added to cpi->time_pick_lpf. With
+ * multithreading enabled, h_event_end_lpf is posted as soon as the filter
+ * level has been set.
+ */
+void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
+{
+    const FRAME_TYPE frame_type = cm->frame_type;
+
+    /* Non-zero when this frame refreshes at least one reference buffer. */
+    const int update_any_ref_buffers =
+        (cpi->common.refresh_last_frame != 0) ||
+        (cpi->common.refresh_golden_frame != 0) ||
+        (cpi->common.refresh_alt_ref_frame != 0);
+
+    if (!cm->no_lpf)
+    {
+        struct vpx_usec_timer timer;
+        YV12_BUFFER_CONFIG *pick_src;
+
+        vp8_clear_system_state();
+
+        vpx_usec_timer_start(&timer);
+
+        /* Buffer used for selecting the base loop filter level: the source
+         * by default.
+         */
+        pick_src = cpi->Source;
+#if CONFIG_TEMPORAL_DENOISING
+        // Use the denoised buffer for selecting base loop filter level.
+        // Denoised signal for current frame is stored in INTRA_FRAME.
+        // No denoising on key frames.
+        if (cpi->oxcf.noise_sensitivity && cm->frame_type != KEY_FRAME)
+            pick_src = &cpi->denoiser.yv12_running_avg[INTRA_FRAME];
+#endif
+
+        if (cpi->sf.auto_filter == 0)
+            vp8cx_pick_filter_level_fast(pick_src, cpi);
+        else
+            vp8cx_pick_filter_level(pick_src, cpi);
+
+        if (cm->filter_level > 0)
+            vp8cx_set_alt_lf_level(cpi, cm->filter_level);
+
+        vpx_usec_timer_mark(&timer);
+        cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+    }
+    else
+    {
+        cm->filter_level = 0;
+    }
+
+#if CONFIG_MULTITHREAD
+    if (cpi->b_multi_threaded)
+        sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
+#endif
+
+    // No need to apply loop-filter if the encoded frame does not update
+    // any reference buffers.
+    if (cm->filter_level > 0 && update_any_ref_buffers)
+        vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, frame_type);
+
+    vp8_yv12_extend_frame_borders(cm->frame_to_show);
+}
+
+static void encode_frame_to_data_rate
+(
+    VP8_COMP *cpi,
+    unsigned long *size,
+    unsigned char *dest,
+    unsigned char* dest_end,
+    unsigned int *frame_flags
+)
+{
+    int Q;
+    int frame_over_shoot_limit;
+    int frame_under_shoot_limit;
+
+    int Loop = 0;
+    int loop_count;
+
+    VP8_COMMON *cm = &cpi->common;
+    int active_worst_qchanged = 0;
+
+#if !CONFIG_REALTIME_ONLY
+    int q_low;
+    int q_high;
+    int zbin_oq_high;
+    int zbin_oq_low = 0;
+    int top_index;
+    int bottom_index;
+    int overshoot_seen = 0;
+    int undershoot_seen = 0;
+#endif
+
+    int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark *
+                          cpi->oxcf.optimal_buffer_level / 100);
+    int drop_mark75 = drop_mark * 2 / 3;
+    int drop_mark50 = drop_mark / 4;
+    int drop_mark25 = drop_mark / 8;
+
+
+    /* Clear down mmx registers to allow floating point in what follows */
+    vp8_clear_system_state();
+
+#if CONFIG_MULTITHREAD
+    /*  wait for the last picture loopfilter thread done */
+    if (cpi->b_lpf_running)
+    {
+        sem_wait(&cpi->h_event_end_lpf);
+        cpi->b_lpf_running = 0;
+    }
+#endif
+
+    if(cpi->force_next_frame_intra)
+    {
+        cm->frame_type = KEY_FRAME;  /* delayed intra frame */
+        cpi->force_next_frame_intra = 0;
+    }
+
+    /* For an alt ref frame in 2 pass we skip the call to the second pass
+     * function that sets the target bandwidth
+     */
+#if !CONFIG_REALTIME_ONLY
+
+    if (cpi->pass == 2)
+    {
+        if (cpi->common.refresh_alt_ref_frame)
+        {
+            /* Per frame bit target for the alt ref frame */
+            cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+            /* per second target bitrate */
+            cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
+                                          cpi->output_framerate);
+        }
+    }
+    else
+#endif
+        cpi->per_frame_bandwidth  = (int)(cpi->target_bandwidth / cpi->output_framerate);
+
+    /* Default turn off buffer to buffer copying */
+    cm->copy_buffer_to_gf = 0;
+    cm->copy_buffer_to_arf = 0;
+
+    /* Clear zbin over-quant value and mode boost values. */
+    cpi->mb.zbin_over_quant = 0;
+    cpi->mb.zbin_mode_boost = 0;
+
+    /* Enable or disable mode based tweaking of the zbin
+     * For 2 Pass Only used where GF/ARF prediction quality
+     * is above a threshold
+     */
+    cpi->mb.zbin_mode_boost_enabled = 1;
+    if (cpi->pass == 2)
+    {
+        if ( cpi->gfu_boost <= 400 )
+        {
+            cpi->mb.zbin_mode_boost_enabled = 0;
+        }
+    }
+
+    /* Current default encoder behaviour for the altref sign bias */
+    if (cpi->source_alt_ref_active)
+        cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+    else
+        cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
+
+    /* Check to see if a key frame is signaled
+     * For two pass with auto key frame enabled cm->frame_type may already
+     * be set, but not for one pass.
+     */
+    if ((cm->current_video_frame == 0) ||
+        (cm->frame_flags & FRAMEFLAGS_KEY) ||
+        (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0)))
+    {
+        /* Key frame from VFW/auto-keyframe/first frame */
+        cm->frame_type = KEY_FRAME;
+#if CONFIG_TEMPORAL_DENOISING
+        if (cpi->oxcf.noise_sensitivity == 4) {
+          // For adaptive mode, reset denoiser to normal mode on key frame.
+          vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV);
+        }
+#endif
+    }
+
+#if CONFIG_MULTI_RES_ENCODING
+    if (cpi->oxcf.mr_total_resolutions > 1) {
+      LOWER_RES_FRAME_INFO* low_res_frame_info
+         = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+
+      if (cpi->oxcf.mr_encoder_id) {
+
+        // TODO(marpan): This constraint shouldn't be needed, as we would like
+        // to allow for key frame setting (forced or periodic) defined per
+        // spatial layer. For now, keep this in.
+        cm->frame_type = low_res_frame_info->frame_type;
+
+        // Check if lower resolution is available for motion vector reuse.
+        if(cm->frame_type != KEY_FRAME)
+        {
+          cpi->mr_low_res_mv_avail = 1;
+          cpi->mr_low_res_mv_avail &= !(low_res_frame_info->is_frame_dropped);
+
+          if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+              cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[LAST_FRAME]
+                       == low_res_frame_info->low_res_ref_frames[LAST_FRAME]);
+
+          if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+              cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[GOLDEN_FRAME]
+                       == low_res_frame_info->low_res_ref_frames[GOLDEN_FRAME]);
+
+          // Don't use altref to determine whether low res is available.
+          // TODO (marpan): Should we make this type of condition on a
+          // per-reference frame basis?
+          /*
+          if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+              cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[ALTREF_FRAME]
+                       == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
+          */
+        }
+      }
+
+      // On a key frame: For the lowest resolution, keep track of the key frame
+      // counter value. For the higher resolutions, reset the current video
+      // frame counter to that of the lowest resolution.
+      // This is done to handle the case where we may stop/start encoding
+      // higher layer(s). The restart-encoding of higher layer is only signaled
+      // by a key frame for now.
+      // TODO (marpan): Add flag to indicate restart-encoding of higher layer.
+      if (cm->frame_type == KEY_FRAME) {
+        if (cpi->oxcf.mr_encoder_id) {
+          // If the initial starting value of the buffer level is zero (this can
+          // happen because we may have not started encoding this higher stream),
+          // then reset it to non-zero value based on |starting_buffer_level|.
+          if (cpi->common.current_video_frame == 0 && cpi->buffer_level == 0) {
+            unsigned int i;
+            cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
+            cpi->buffer_level = cpi->oxcf.starting_buffer_level;
+            for (i = 0; i < cpi->oxcf.number_of_layers; i++) {
+              LAYER_CONTEXT *lc = &cpi->layer_context[i];
+              lc->bits_off_target = lc->starting_buffer_level;
+              lc->buffer_level = lc->starting_buffer_level;
+            }
+          }
+          cpi->common.current_video_frame =
+              low_res_frame_info->key_frame_counter_value;
+        } else {
+          low_res_frame_info->key_frame_counter_value =
+              cpi->common.current_video_frame;
+        }
+      }
+
+    }
+#endif
+
+    // Find the reference frame closest to the current frame.
+    cpi->closest_reference_frame = LAST_FRAME;
+    if(cm->frame_type != KEY_FRAME) {
+      int i;
+      MV_REFERENCE_FRAME closest_ref = INTRA_FRAME;
+      if (cpi->ref_frame_flags & VP8_LAST_FRAME) {
+        closest_ref = LAST_FRAME;
+      } else if (cpi->ref_frame_flags & VP8_GOLD_FRAME) {
+        closest_ref = GOLDEN_FRAME;
+      } else if (cpi->ref_frame_flags & VP8_ALTR_FRAME) {
+        closest_ref = ALTREF_FRAME;
+      }
+      for(i = 1; i <= 3; i++) {
+        vpx_ref_frame_type_t ref_frame_type = (vpx_ref_frame_type_t)
+            ((i == 3) ? 4 : i);
+        if (cpi->ref_frame_flags & ref_frame_type) {
+          if ((cm->current_video_frame - cpi->current_ref_frames[i]) <
+              (cm->current_video_frame - cpi->current_ref_frames[closest_ref])) {
+            closest_ref = i;
+          }
+        }
+      }
+      cpi->closest_reference_frame = closest_ref;
+    }
+
+    /* Set various flags etc to special state if it is a key frame */
+    if (cm->frame_type == KEY_FRAME)
+    {
+        int i;
+
+        // Set the loop filter deltas and segmentation map update
+        setup_features(cpi);
+
+        /* The alternate reference frame cannot be active for a key frame */
+        cpi->source_alt_ref_active = 0;
+
+        /* Reset the RD threshold multipliers to default of * 1 (128) */
+        for (i = 0; i < MAX_MODES; i++)
+        {
+            cpi->mb.rd_thresh_mult[i] = 128;
+        }
+
+        // Reset the zero_last counter to 0 on key frame.
+        memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+        memset(cpi->consec_zero_last_mvbias, 0,
+               (cpi->common.mb_rows * cpi->common.mb_cols));
+    }
+
+#if 0
+    /* Experimental code for lagged compress and one pass
+     * Initialise one_pass GF frames stats
+     * Update stats used for GF selection
+     */
+    {
+        cpi->one_pass_frame_index = cm->current_video_frame % MAX_LAG_BUFFERS;
+
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frames_so_far = 0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_intra_error = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_coded_error = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_inter = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_motion = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr_abs = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc_abs = 0.0;
+    }
+#endif
+
+    update_rd_ref_frame_probs(cpi);
+
+    if (cpi->drop_frames_allowed)
+    {
+        /* The reset to decimation 0 is only done here for one pass.
+         * Once it is set two pass leaves decimation on till the next kf.
+         */
+        if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0))
+            cpi->decimation_factor --;
+
+        if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0)
+            cpi->decimation_factor = 1;
+
+        else if (cpi->buffer_level < drop_mark25 && (cpi->decimation_factor == 2 || cpi->decimation_factor == 3))
+        {
+            cpi->decimation_factor = 3;
+        }
+        else if (cpi->buffer_level < drop_mark50 && (cpi->decimation_factor == 1 || cpi->decimation_factor == 2))
+        {
+            cpi->decimation_factor = 2;
+        }
+        else if (cpi->buffer_level < drop_mark75 && (cpi->decimation_factor == 0 || cpi->decimation_factor == 1))
+        {
+            cpi->decimation_factor = 1;
+        }
+    }
+
+    /* The following decimates the frame rate according to a regular
+     * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help
+     * prevent buffer under-run in CBR mode. Alternatively it might be
+     * desirable in some situations to drop frame rate but throw more bits
+     * at each frame.
+     *
+     * Note that dropping a key frame can be problematic if spatial
+     * resampling is also active
+     */
+    if (cpi->decimation_factor > 0)
+    {
+        switch (cpi->decimation_factor)
+        {
+        case 1:
+            cpi->per_frame_bandwidth  = cpi->per_frame_bandwidth * 3 / 2;
+            break;
+        case 2:
+            cpi->per_frame_bandwidth  = cpi->per_frame_bandwidth * 5 / 4;
+            break;
+        case 3:
+            cpi->per_frame_bandwidth  = cpi->per_frame_bandwidth * 5 / 4;
+            break;
+        }
+
+        /* Note that we should not throw out a key frame (especially when
+         * spatial resampling is enabled).
+         */
+        if (cm->frame_type == KEY_FRAME)
+        {
+            cpi->decimation_count = cpi->decimation_factor;
+        }
+        else if (cpi->decimation_count > 0)
+        {
+            cpi->decimation_count --;
+
+            cpi->bits_off_target += cpi->av_per_frame_bandwidth;
+            if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
+                cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+
+#if CONFIG_MULTI_RES_ENCODING
+            vp8_store_drop_frame_info(cpi);
+#endif
+
+            cm->current_video_frame++;
+            cpi->frames_since_key++;
+            // We advance the temporal pattern for dropped frames.
+            cpi->temporal_pattern_counter++;
+
+#if CONFIG_INTERNAL_STATS
+            cpi->count ++;
+#endif
+
+            cpi->buffer_level = cpi->bits_off_target;
+
+            if (cpi->oxcf.number_of_layers > 1)
+            {
+                unsigned int i;
+
+                /* Propagate bits saved by dropping the frame to higher
+                 * layers
+                 */
+                for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
+                {
+                    LAYER_CONTEXT *lc = &cpi->layer_context[i];
+                    lc->bits_off_target += (int)(lc->target_bandwidth /
+                                                 lc->framerate);
+                    if (lc->bits_off_target > lc->maximum_buffer_size)
+                        lc->bits_off_target = lc->maximum_buffer_size;
+                    lc->buffer_level = lc->bits_off_target;
+                }
+            }
+
+            return;
+        }
+        else
+            cpi->decimation_count = cpi->decimation_factor;
+    }
+    else
+        cpi->decimation_count = 0;
+
+    /* Decide how big to make the frame */
+    if (!vp8_pick_frame_size(cpi))
+    {
+        /*TODO: 2 drop_frame and return code could be put together. */
+#if CONFIG_MULTI_RES_ENCODING
+        vp8_store_drop_frame_info(cpi);
+#endif
+        cm->current_video_frame++;
+        cpi->frames_since_key++;
+        // We advance the temporal pattern for dropped frames.
+        cpi->temporal_pattern_counter++;
+        return;
+    }
+
+    /* Reduce active_worst_allowed_q for CBR if our buffer is getting too full.
+     * This has a knock on effect on active best quality as well.
+     * For CBR if the buffer reaches its maximum level then we can no longer
+     * save up bits for later frames so we might as well use them up
+     * on the current frame.
+     */
+    if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+        (cpi->buffer_level >= cpi->oxcf.optimal_buffer_level) && cpi->buffered_mode)
+    {
+        /* Max adjustment is 1/4 */
+        int Adjustment = cpi->active_worst_quality / 4;
+
+        if (Adjustment)
+        {
+            int buff_lvl_step;
+
+            if (cpi->buffer_level < cpi->oxcf.maximum_buffer_size)
+            {
+                buff_lvl_step = (int)
+                                ((cpi->oxcf.maximum_buffer_size -
+                                  cpi->oxcf.optimal_buffer_level) /
+                                  Adjustment);
+
+                if (buff_lvl_step)
+                    Adjustment = (int)
+                                 ((cpi->buffer_level -
+                                 cpi->oxcf.optimal_buffer_level) /
+                                 buff_lvl_step);
+                else
+                    Adjustment = 0;
+            }
+
+            cpi->active_worst_quality -= Adjustment;
+
+            if(cpi->active_worst_quality < cpi->active_best_quality)
+                cpi->active_worst_quality = cpi->active_best_quality;
+        }
+    }
+
+    /* Set an active best quality and if necessary active worst quality
+     * There is some odd behavior for one pass here that needs attention.
+     */
+    if ( (cpi->pass == 2) || (cpi->ni_frames > 150))
+    {
+        vp8_clear_system_state();
+
+        Q = cpi->active_worst_quality;
+
+        if ( cm->frame_type == KEY_FRAME )
+        {
+            if ( cpi->pass == 2 )
+            {
+                if (cpi->gfu_boost > 600)
+                   cpi->active_best_quality = kf_low_motion_minq[Q];
+                else
+                   cpi->active_best_quality = kf_high_motion_minq[Q];
+
+                /* Special case for key frames forced because we have reached
+                 * the maximum key frame interval. Here force the Q to a range
+                 * based on the ambient Q to reduce the risk of popping
+                 */
+                if ( cpi->this_key_frame_forced )
+                {
+                    if ( cpi->active_best_quality > cpi->avg_frame_qindex * 7/8)
+                        cpi->active_best_quality = cpi->avg_frame_qindex * 7/8;
+                    else if ( cpi->active_best_quality < cpi->avg_frame_qindex >> 2 )
+                        cpi->active_best_quality = cpi->avg_frame_qindex >> 2;
+                }
+            }
+            /* One pass more conservative */
+            else
+               cpi->active_best_quality = kf_high_motion_minq[Q];
+        }
+
+        else if (cpi->oxcf.number_of_layers==1 &&
+                (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame))
+        {
+            /* Use the lower of cpi->active_worst_quality and recent
+             * average Q as basis for GF/ARF Q limit unless last frame was
+             * a key frame.
+             */
+            if ( (cpi->frames_since_key > 1) &&
+               (cpi->avg_frame_qindex < cpi->active_worst_quality) )
+            {
+                Q = cpi->avg_frame_qindex;
+            }
+
+            /* For constrained quality dont allow Q less than the cq level */
+            if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+                 (Q < cpi->cq_target_quality) )
+            {
+                Q = cpi->cq_target_quality;
+            }
+
+            if ( cpi->pass == 2 )
+            {
+                if ( cpi->gfu_boost > 1000 )
+                    cpi->active_best_quality = gf_low_motion_minq[Q];
+                else if ( cpi->gfu_boost < 400 )
+                    cpi->active_best_quality = gf_high_motion_minq[Q];
+                else
+                    cpi->active_best_quality = gf_mid_motion_minq[Q];
+
+                /* Constrained quality use slightly lower active best. */
+                if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY )
+                {
+                    cpi->active_best_quality =
+                        cpi->active_best_quality * 15/16;
+                }
+            }
+            /* One pass more conservative */
+            else
+                cpi->active_best_quality = gf_high_motion_minq[Q];
+        }
+        else
+        {
+            cpi->active_best_quality = inter_minq[Q];
+
+            /* For the constant/constrained quality mode we dont want
+             * q to fall below the cq level.
+             */
+            if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+                (cpi->active_best_quality < cpi->cq_target_quality) )
+            {
+                /* If we are strongly undershooting the target rate in the last
+                 * frames then use the user passed in cq value not the auto
+                 * cq value.
+                 */
+                if ( cpi->rolling_actual_bits < cpi->min_frame_bandwidth )
+                    cpi->active_best_quality = cpi->oxcf.cq_level;
+                else
+                    cpi->active_best_quality = cpi->cq_target_quality;
+            }
+        }
+
+        /* If CBR and the buffer is as full then it is reasonable to allow
+         * higher quality on the frames to prevent bits just going to waste.
+         */
+        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+        {
+            /* Note that the use of >= here eliminates the risk of a divide
+             * by 0 error in the else if clause
+             */
+            if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size)
+                cpi->active_best_quality = cpi->best_quality;
+
+            else if (cpi->buffer_level > cpi->oxcf.optimal_buffer_level)
+            {
+                int Fraction = (int)
+                  (((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) * 128)
+                  / (cpi->oxcf.maximum_buffer_size -
+                  cpi->oxcf.optimal_buffer_level));
+                int min_qadjustment = ((cpi->active_best_quality -
+                                        cpi->best_quality) * Fraction) / 128;
+
+                cpi->active_best_quality -= min_qadjustment;
+            }
+        }
+    }
+    /* Make sure constrained quality mode limits are adhered to for the first
+     * few frames of one pass encodes
+     */
+    else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+    {
+        if ( (cm->frame_type == KEY_FRAME) ||
+             cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame )
+        {
+             cpi->active_best_quality = cpi->best_quality;
+        }
+        else if (cpi->active_best_quality < cpi->cq_target_quality)
+        {
+            cpi->active_best_quality = cpi->cq_target_quality;
+        }
+    }
+
+    /* Clip the active best and worst quality values to limits */
+    if (cpi->active_worst_quality > cpi->worst_quality)
+        cpi->active_worst_quality = cpi->worst_quality;
+
+    if (cpi->active_best_quality < cpi->best_quality)
+        cpi->active_best_quality = cpi->best_quality;
+
+    if ( cpi->active_worst_quality < cpi->active_best_quality )
+        cpi->active_worst_quality = cpi->active_best_quality;
+
+    /* Determine initial Q to try */
+    Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+#if !CONFIG_REALTIME_ONLY
+
+    /* Set highest allowed value for Zbin over quant */
+    if (cm->frame_type == KEY_FRAME)
+        zbin_oq_high = 0;
+    else if ((cpi->oxcf.number_of_layers == 1) && ((cm->refresh_alt_ref_frame ||
+              (cm->refresh_golden_frame && !cpi->source_alt_ref_active))))
+    {
+          zbin_oq_high = 16;
+    }
+    else
+        zbin_oq_high = ZBIN_OQ_MAX;
+#endif
+
+    /* Setup background Q adjustment for error resilient mode.
+     * For multi-layer encodes only enable this for the base layer.
+    */
+    if (cpi->cyclic_refresh_mode_enabled)
+    {
+      // Special case for screen_content_mode with golden frame updates.
+      int disable_cr_gf = (cpi->oxcf.screen_content_mode == 2 &&
+                           cm->refresh_golden_frame);
+      if (cpi->current_layer == 0 && cpi->force_maxqp == 0 && !disable_cr_gf)
+        cyclic_background_refresh(cpi, Q, 0);
+      else
+        disable_segmentation(cpi);
+    }
+
+    vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
+
+#if !CONFIG_REALTIME_ONLY
+    /* Limit Q range for the adaptive loop. */
+    bottom_index = cpi->active_best_quality;
+    top_index    = cpi->active_worst_quality;
+    q_low  = cpi->active_best_quality;
+    q_high = cpi->active_worst_quality;
+#endif
+
+    vp8_save_coding_context(cpi);
+
+    loop_count = 0;
+
+    scale_and_extend_source(cpi->un_scaled_source, cpi);
+
+#if CONFIG_TEMPORAL_DENOISING && CONFIG_POSTPROC
+    // Option to apply spatial blur under the aggressive or adaptive
+    // (temporal denoising) mode.
+    if (cpi->oxcf.noise_sensitivity >= 3) {
+      if (cpi->denoiser.denoise_pars.spatial_blur != 0) {
+        vp8_de_noise(cm, cpi->Source, cpi->Source,
+            cpi->denoiser.denoise_pars.spatial_blur, 1, 0, 0);
+      }
+    }
+#endif
+
+#if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC && !(CONFIG_TEMPORAL_DENOISING)
+
+    if (cpi->oxcf.noise_sensitivity > 0)
+    {
+        unsigned char *src;
+        int l = 0;
+
+        switch (cpi->oxcf.noise_sensitivity)
+        {
+        case 1:
+            l = 20;
+            break;
+        case 2:
+            l = 40;
+            break;
+        case 3:
+            l = 60;
+            break;
+        case 4:
+            l = 80;
+            break;
+        case 5:
+            l = 100;
+            break;
+        case 6:
+            l = 150;
+            break;
+        }
+
+
+        if (cm->frame_type == KEY_FRAME)
+        {
+            vp8_de_noise(cm, cpi->Source, cpi->Source, l , 1,  0, 1);
+        }
+        else
+        {
+            vp8_de_noise(cm, cpi->Source, cpi->Source, l , 1,  0, 1);
+
+            src = cpi->Source->y_buffer;
+
+            if (cpi->Source->y_stride < 0)
+            {
+                src += cpi->Source->y_stride * (cpi->Source->y_height - 1);
+            }
+        }
+    }
+
+#endif
+
+
+#ifdef OUTPUT_YUV_SRC
+    vp8_write_yuv_frame(yuv_file, cpi->Source);
+#endif
+
+    do
+    {
+        vp8_clear_system_state();
+
+        vp8_set_quantizer(cpi, Q);
+
+        /* setup skip prob for costing in mode/mv decision */
+        if (cpi->common.mb_no_coeff_skip)
+        {
+            cpi->prob_skip_false = cpi->base_skip_false_prob[Q];
+
+            if (cm->frame_type != KEY_FRAME)
+            {
+                if (cpi->common.refresh_alt_ref_frame)
+                {
+                    if (cpi->last_skip_false_probs[2] != 0)
+                        cpi->prob_skip_false = cpi->last_skip_false_probs[2];
+
+                    /*
+                                        if(cpi->last_skip_false_probs[2]!=0 && abs(Q- cpi->last_skip_probs_q[2])<=16 )
+                       cpi->prob_skip_false = cpi->last_skip_false_probs[2];
+                                        else if (cpi->last_skip_false_probs[2]!=0)
+                       cpi->prob_skip_false = (cpi->last_skip_false_probs[2]  + cpi->prob_skip_false ) / 2;
+                       */
+                }
+                else if (cpi->common.refresh_golden_frame)
+                {
+                    if (cpi->last_skip_false_probs[1] != 0)
+                        cpi->prob_skip_false = cpi->last_skip_false_probs[1];
+
+                    /*
+                                        if(cpi->last_skip_false_probs[1]!=0 && abs(Q- cpi->last_skip_probs_q[1])<=16 )
+                       cpi->prob_skip_false = cpi->last_skip_false_probs[1];
+                                        else if (cpi->last_skip_false_probs[1]!=0)
+                       cpi->prob_skip_false = (cpi->last_skip_false_probs[1]  + cpi->prob_skip_false ) / 2;
+                       */
+                }
+                else
+                {
+                    if (cpi->last_skip_false_probs[0] != 0)
+                        cpi->prob_skip_false = cpi->last_skip_false_probs[0];
+
+                    /*
+                    if(cpi->last_skip_false_probs[0]!=0 && abs(Q- cpi->last_skip_probs_q[0])<=16 )
+                        cpi->prob_skip_false = cpi->last_skip_false_probs[0];
+                    else if(cpi->last_skip_false_probs[0]!=0)
+                        cpi->prob_skip_false = (cpi->last_skip_false_probs[0]  + cpi->prob_skip_false ) / 2;
+                        */
+                }
+
+                /* as this is for cost estimate, let's make sure it does not
+                 * go extreme either way
+                 */
+                if (cpi->prob_skip_false < 5)
+                    cpi->prob_skip_false = 5;
+
+                if (cpi->prob_skip_false > 250)
+                    cpi->prob_skip_false = 250;
+
+                if (cpi->oxcf.number_of_layers == 1 && cpi->is_src_frame_alt_ref)
+                    cpi->prob_skip_false = 1;
+            }
+
+#if 0
+
+            if (cpi->pass != 1)
+            {
+                FILE *f = fopen("skip.stt", "a");
+                fprintf(f, "%d, %d, %4d ", cpi->common.refresh_golden_frame, cpi->common.refresh_alt_ref_frame, cpi->prob_skip_false);
+                fclose(f);
+            }
+
+#endif
+
+        }
+
+        if (cm->frame_type == KEY_FRAME)
+        {
+            if(resize_key_frame(cpi))
+            {
+              /* If the frame size has changed, need to reset Q, quantizer,
+               * and background refresh.
+               */
+              Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+              if (cpi->cyclic_refresh_mode_enabled)
+              {
+                if (cpi->current_layer==0)
+                  cyclic_background_refresh(cpi, Q, 0);
+                else
+                  disable_segmentation(cpi);
+              }
+              // Reset the zero_last counter to 0 on key frame.
+              memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+              memset(cpi->consec_zero_last_mvbias, 0,
+                     (cpi->common.mb_rows * cpi->common.mb_cols));
+              vp8_set_quantizer(cpi, Q);
+            }
+
+            vp8_setup_key_frame(cpi);
+        }
+
+
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+        {
+            if(cpi->oxcf.error_resilient_mode)
+                cm->refresh_entropy_probs = 0;
+
+            if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+            {
+                if (cm->frame_type == KEY_FRAME)
+                    cm->refresh_entropy_probs = 1;
+            }
+
+            if (cm->refresh_entropy_probs == 0)
+            {
+                /* save a copy for later refresh */
+                memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
+            }
+
+            vp8_update_coef_context(cpi);
+
+            vp8_update_coef_probs(cpi);
+
+            /* transform / motion compensation build reconstruction frame
+             * +pack coef partitions
+             */
+            vp8_encode_frame(cpi);
+
+            /* cpi->projected_frame_size is not needed for RT mode */
+        }
+#else
+        /* transform / motion compensation build reconstruction frame */
+        vp8_encode_frame(cpi);
+
+        if (cpi->oxcf.screen_content_mode == 2) {
+          if (vp8_drop_encodedframe_overshoot(cpi, Q))
+            return;
+        }
+
+        cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
+        cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
+#endif
+        vp8_clear_system_state();
+
+        /* Test to see if the stats generated for this frame indicate that
+         * we should have coded a key frame (assuming that we didn't)!
+         */
+
+        if (cpi->pass != 2 && cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME
+            && cpi->compressor_speed != 2)
+        {
+#if !CONFIG_REALTIME_ONLY
+            if (decide_key_frame(cpi))
+            {
+                /* Reset all our sizing numbers and recode */
+                cm->frame_type = KEY_FRAME;
+
+                vp8_pick_frame_size(cpi);
+
+                /* Clear the Alt reference frame active flag when we have
+                 * a key frame
+                 */
+                cpi->source_alt_ref_active = 0;
+
+                // Set the loop filter deltas and segmentation map update
+                setup_features(cpi);
+
+                vp8_restore_coding_context(cpi);
+
+                Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+                vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
+
+                /* Limit Q range for the adaptive loop. */
+                bottom_index = cpi->active_best_quality;
+                top_index    = cpi->active_worst_quality;
+                q_low  = cpi->active_best_quality;
+                q_high = cpi->active_worst_quality;
+
+                loop_count++;
+                Loop = 1;
+
+                continue;
+            }
+#endif
+        }
+
+        vp8_clear_system_state();
+
+        if (frame_over_shoot_limit == 0)
+            frame_over_shoot_limit = 1;
+
+        /* Are we overshooting and up against the limit of active max Q? */
+        if (((cpi->pass != 2) || (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) &&
+            (Q == cpi->active_worst_quality)                     &&
+            (cpi->active_worst_quality < cpi->worst_quality)      &&
+            (cpi->projected_frame_size > frame_over_shoot_limit))
+        {
+            int over_size_percent = ((cpi->projected_frame_size - frame_over_shoot_limit) * 100) / frame_over_shoot_limit;
+
+            /* If so is there any scope for relaxing it */
+            while ((cpi->active_worst_quality < cpi->worst_quality) && (over_size_percent > 0))
+            {
+                cpi->active_worst_quality++;
+                /* Assume 1 qstep = about 4% on frame size. */
+                over_size_percent = (int)(over_size_percent * 0.96);
+            }
+#if !CONFIG_REALTIME_ONLY
+            top_index = cpi->active_worst_quality;
+#endif  // !CONFIG_REALTIME_ONLY
+            /* If we have updated the active max Q do not call
+             * vp8_update_rate_correction_factors() this loop.
+             */
+            active_worst_qchanged = 1;
+        }
+        else
+            active_worst_qchanged = 0;
+
+#if !CONFIG_REALTIME_ONLY
+        /* Special case handling for forced key frames */
+        if ( (cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced )
+        {
+            int last_q = Q;
+            int kf_err = vp8_calc_ss_err(cpi->Source,
+                                         &cm->yv12_fb[cm->new_fb_idx]);
+
+            /* The key frame is not good enough */
+            if ( kf_err > ((cpi->ambient_err * 7) >> 3) )
+            {
+                /* Lower q_high */
+                q_high = (Q > q_low) ? (Q - 1) : q_low;
+
+                /* Adjust Q */
+                Q = (q_high + q_low) >> 1;
+            }
+            /* The key frame is much better than the previous frame */
+            else if ( kf_err < (cpi->ambient_err >> 1) )
+            {
+                /* Raise q_low */
+                q_low = (Q < q_high) ? (Q + 1) : q_high;
+
+                /* Adjust Q */
+                Q = (q_high + q_low + 1) >> 1;
+            }
+
+            /* Clamp Q to upper and lower limits: */
+            if (Q > q_high)
+                Q = q_high;
+            else if (Q < q_low)
+                Q = q_low;
+
+            Loop = Q != last_q;
+        }
+
+        /* Is the projected frame size out of range and are we allowed
+         * to attempt to recode.
+         */
+        else if ( recode_loop_test( cpi,
+                               frame_over_shoot_limit, frame_under_shoot_limit,
+                               Q, top_index, bottom_index ) )
+        {
+            int last_q = Q;
+            int Retries = 0;
+
+            /* Frame size out of permitted range. Update correction factor
+             * & compute new Q to try...
+             */
+
+            /* Frame is too large */
+            if (cpi->projected_frame_size > cpi->this_frame_target)
+            {
+                /* Raise q_low to at least the current value */
+                q_low = (Q < q_high) ? (Q + 1) : q_high;
+
+                /* If we are using over quant do the same for zbin_oq_low */
+                if (cpi->mb.zbin_over_quant > 0)
+                    zbin_oq_low = (cpi->mb.zbin_over_quant < zbin_oq_high) ?
+                        (cpi->mb.zbin_over_quant + 1) : zbin_oq_high;
+
+                if (undershoot_seen)
+                {
+                    /* Update rate_correction_factor unless
+                     * cpi->active_worst_quality has changed.
+                     */
+                    if (!active_worst_qchanged)
+                        vp8_update_rate_correction_factors(cpi, 1);
+
+                    Q = (q_high + q_low + 1) / 2;
+
+                    /* Adjust cpi->zbin_over_quant (only allowed when Q
+                     * is max)
+                     */
+                    if (Q < MAXQ)
+                        cpi->mb.zbin_over_quant = 0;
+                    else
+                    {
+                        zbin_oq_low = (cpi->mb.zbin_over_quant < zbin_oq_high) ?
+                            (cpi->mb.zbin_over_quant + 1) : zbin_oq_high;
+                        cpi->mb.zbin_over_quant =
+                            (zbin_oq_high + zbin_oq_low) / 2;
+                    }
+                }
+                else
+                {
+                    /* Update rate_correction_factor unless
+                     * cpi->active_worst_quality has changed.
+                     */
+                    if (!active_worst_qchanged)
+                        vp8_update_rate_correction_factors(cpi, 0);
+
+                    Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+                    while (((Q < q_low) ||
+                        (cpi->mb.zbin_over_quant < zbin_oq_low)) &&
+                        (Retries < 10))
+                    {
+                        vp8_update_rate_correction_factors(cpi, 0);
+                        Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+                        Retries ++;
+                    }
+                }
+
+                overshoot_seen = 1;
+            }
+            /* Frame is too small */
+            else
+            {
+                if (cpi->mb.zbin_over_quant == 0)
+                    /* Lower q_high if not using over quant */
+                    q_high = (Q > q_low) ? (Q - 1) : q_low;
+                else
+                    /* else lower zbin_oq_high */
+                    zbin_oq_high = (cpi->mb.zbin_over_quant > zbin_oq_low) ?
+                        (cpi->mb.zbin_over_quant - 1) : zbin_oq_low;
+
+                if (overshoot_seen)
+                {
+                    /* Update rate_correction_factor unless
+                     * cpi->active_worst_quality has changed.
+                     */
+                    if (!active_worst_qchanged)
+                        vp8_update_rate_correction_factors(cpi, 1);
+
+                    Q = (q_high + q_low) / 2;
+
+                    /* Adjust cpi->zbin_over_quant (only allowed when Q
+                     * is max)
+                     */
+                    if (Q < MAXQ)
+                        cpi->mb.zbin_over_quant = 0;
+                    else
+                        cpi->mb.zbin_over_quant =
+                            (zbin_oq_high + zbin_oq_low) / 2;
+                }
+                else
+                {
+                    /* Update rate_correction_factor unless
+                     * cpi->active_worst_quality has changed.
+                     */
+                    if (!active_worst_qchanged)
+                        vp8_update_rate_correction_factors(cpi, 0);
+
+                    Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+                    /* Special case reset for qlow for constrained quality.
+                     * This should only trigger where there is very substantial
+                     * undershoot on a frame and the auto cq level is above
+                     * the user passed in value.
+                     */
+                    if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+                         (Q < q_low) )
+                    {
+                        q_low = Q;
+                    }
+
+                    while (((Q > q_high) ||
+                        (cpi->mb.zbin_over_quant > zbin_oq_high)) &&
+                        (Retries < 10))
+                    {
+                        vp8_update_rate_correction_factors(cpi, 0);
+                        Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+                        Retries ++;
+                    }
+                }
+
+                undershoot_seen = 1;
+            }
+
+            /* Clamp Q to upper and lower limits: */
+            if (Q > q_high)
+                Q = q_high;
+            else if (Q < q_low)
+                Q = q_low;
+
+            /* Clamp cpi->zbin_over_quant */
+            cpi->mb.zbin_over_quant = (cpi->mb.zbin_over_quant < zbin_oq_low) ?
+                zbin_oq_low : (cpi->mb.zbin_over_quant > zbin_oq_high) ?
+                    zbin_oq_high : cpi->mb.zbin_over_quant;
+
+            Loop = Q != last_q;
+        }
+        else
+#endif
+            Loop = 0;
+
+        if (cpi->is_src_frame_alt_ref)
+            Loop = 0;
+
+        if (Loop == 1)
+        {
+            vp8_restore_coding_context(cpi);
+            loop_count++;
+#if CONFIG_INTERNAL_STATS
+            cpi->tot_recode_hits++;
+#endif
+        }
+    }
+    while (Loop == 1);
+
+#if 0
+    /* Experimental code for lagged and one pass
+     * Update stats used for one pass GF selection
+     */
+    {
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_coded_error = (double)cpi->prediction_error;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_intra_error = (double)cpi->intra_error;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_pcnt_inter = (double)(100 - cpi->this_frame_percent_intra) / 100.0;
+    }
+#endif
+
+    /* Special case code to reduce pulsing when key frames are forced at a
+     * fixed interval. Note the reconstruction error if it is the frame before
+     * the force key frame
+     */
+    if ( cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0) )
+    {
+        cpi->ambient_err = vp8_calc_ss_err(cpi->Source,
+                                           &cm->yv12_fb[cm->new_fb_idx]);
+    }
+
+    /* This frame's MVs are saved and will be used in next frame's MV predictor.
+     * Last frame has one more line(add to bottom) and one more column(add to
+     * right) than cm->mip. The edge elements are initialized to 0.
+     */
+#if CONFIG_MULTI_RES_ENCODING
+    if(!cpi->oxcf.mr_encoder_id && cm->show_frame)
+#else
+    if(cm->show_frame)   /* do not save for altref frame */
+#endif
+    {
+        int mb_row;
+        int mb_col;
+        /* Point to beginning of allocated MODE_INFO arrays. */
+        MODE_INFO *tmp = cm->mip;
+
+        if(cm->frame_type != KEY_FRAME)
+        {
+            for (mb_row = 0; mb_row < cm->mb_rows+1; mb_row ++)
+            {
+                for (mb_col = 0; mb_col < cm->mb_cols+1; mb_col ++)
+                {
+                    if(tmp->mbmi.ref_frame != INTRA_FRAME)
+                        cpi->lfmv[mb_col + mb_row*(cm->mode_info_stride+1)].as_int = tmp->mbmi.mv.as_int;
+
+                    cpi->lf_ref_frame_sign_bias[mb_col + mb_row*(cm->mode_info_stride+1)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
+                    cpi->lf_ref_frame[mb_col + mb_row*(cm->mode_info_stride+1)] = tmp->mbmi.ref_frame;
+                    tmp++;
+                }
+            }
+        }
+    }
+
+    /* Count last ref frame 0,0 usage on current encoded frame. */
+    {
+        int mb_row;
+        int mb_col;
+        /* Point to beginning of MODE_INFO arrays. */
+        MODE_INFO *tmp = cm->mi;
+
+        cpi->zeromv_count = 0;
+
+        if(cm->frame_type != KEY_FRAME)
+        {
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+            {
+                for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
+                {
+                    if (tmp->mbmi.mode == ZEROMV &&
+                       tmp->mbmi.ref_frame == LAST_FRAME)
+                        cpi->zeromv_count++;
+                    tmp++;
+                }
+                tmp++;
+            }
+        }
+    }
+
+#if CONFIG_MULTI_RES_ENCODING
+    vp8_cal_dissimilarity(cpi);
+#endif
+
+    /* Update the GF useage maps.
+     * This is done after completing the compression of a frame when all
+     * modes etc. are finalized but before loop filter
+     */
+    if (cpi->oxcf.number_of_layers == 1)
+        vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
+
+    if (cm->frame_type == KEY_FRAME)
+        cm->refresh_last_frame = 1;
+
+#if 0
+    {
+        FILE *f = fopen("gfactive.stt", "a");
+        fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);
+        fclose(f);
+    }
+#endif
+
+    /* For inter frames the current default behavior is that when
+     * cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer
+     * This is purely an encoder decision at present.
+     */
+    if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame)
+        cm->copy_buffer_to_arf  = 2;
+    else
+        cm->copy_buffer_to_arf  = 0;
+
+    cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+
+#if CONFIG_TEMPORAL_DENOISING
+    // Get some measure of the amount of noise, by measuring the (partial) mse
+    // between source and denoised buffer, for y channel. Partial refers to
+    // computing the sse for a sub-sample of the frame (i.e., skip x blocks along row/column),
+    // and only for blocks in that set that are consecutive ZEROMV_LAST mode.
+    // Do this every ~8 frames, to further reduce complexity.
+    // TODO(marpan): Keep this for now for the case cpi->oxcf.noise_sensitivity < 4,
+    // should be removed in favor of the process_denoiser_mode_change() function below.
+    if (cpi->oxcf.noise_sensitivity > 0 &&
+       cpi->oxcf.noise_sensitivity < 4 &&
+       !cpi->oxcf.screen_content_mode &&
+       cpi->frames_since_key%8 == 0 &&
+       cm->frame_type != KEY_FRAME) {
+       cpi->mse_source_denoised = measure_square_diff_partial(
+           &cpi->denoiser.yv12_running_avg[INTRA_FRAME], cpi->Source, cpi);
+    }
+
+    // For the adaptive denoising mode (noise_sensitivity == 4), sample the mse
+    // of source diff (between current and previous frame), and determine if we
+    // should switch the denoiser mode. Sampling refers to computing the mse for
+    // a sub-sample of the frame (i.e., skip x blocks along row/column), and
+    // only for blocks in that set that have used ZEROMV LAST, along with some
+    // constraint on the sum diff between blocks. This process is called every
+    // ~8 frames, to further reduce complexity.
+    if (cpi->oxcf.noise_sensitivity == 4 &&
+        !cpi->oxcf.screen_content_mode &&
+        cpi->frames_since_key % 8 == 0 &&
+        cm->frame_type != KEY_FRAME) {
+      process_denoiser_mode_change(cpi);
+    }
+#endif
+
+#if CONFIG_MULTITHREAD
+    if (cpi->b_multi_threaded)
+    {
+        /* start loopfilter in separate thread */
+        sem_post(&cpi->h_event_start_lpf);
+        cpi->b_lpf_running = 1;
+    }
+    else
+#endif
+    {
+        vp8_loopfilter_frame(cpi, cm);
+    }
+
+    update_reference_frames(cpi);
+
+#ifdef OUTPUT_YUV_DENOISED
+    vp8_write_yuv_frame(yuv_denoised_file,
+                        &cpi->denoiser.yv12_running_avg[INTRA_FRAME]);
+#endif
+
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+    if (cpi->oxcf.error_resilient_mode)
+    {
+        cm->refresh_entropy_probs = 0;
+    }
+#endif
+
+#if CONFIG_MULTITHREAD
+    /* wait that filter_level is picked so that we can continue with stream packing */
+    if (cpi->b_multi_threaded)
+        sem_wait(&cpi->h_event_end_lpf);
+#endif
+
+    /* build the bitstream */
+    vp8_pack_bitstream(cpi, dest, dest_end, size);
+
+#if CONFIG_MULTITHREAD
+    /* if PSNR packets are generated we have to wait for the lpf */
+    if (cpi->b_lpf_running && cpi->b_calculate_psnr)
+    {
+        sem_wait(&cpi->h_event_end_lpf);
+        cpi->b_lpf_running = 0;
+    }
+#endif
+
+    /* Move storing frame_type out of the above loop since it is also
+     * needed in motion search besides loopfilter */
+    cm->last_frame_type = cm->frame_type;
+
+    /* Update rate control heuristics */
+    cpi->total_byte_count += (*size);
+    cpi->projected_frame_size = (*size) << 3;
+
+    if (cpi->oxcf.number_of_layers > 1)
+    {
+        unsigned int i;
+        for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
+          cpi->layer_context[i].total_byte_count += (*size);
+    }
+
+    if (!active_worst_qchanged)
+        vp8_update_rate_correction_factors(cpi, 2);
+
+    cpi->last_q[cm->frame_type] = cm->base_qindex;
+
+    if (cm->frame_type == KEY_FRAME)
+    {
+        vp8_adjust_key_frame_context(cpi);
+    }
+
+    /* Keep a record of ambient average Q. */
+    if (cm->frame_type != KEY_FRAME)
+        cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
+
+    /* Keep a record from which we can calculate the average Q excluding
+     * GF updates and key frames
+     */
+    if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
+        (!cm->refresh_golden_frame && !cm->refresh_alt_ref_frame)))
+    {
+        cpi->ni_frames++;
+
+        /* Calculate the average Q for normal inter frames (not key or GFU
+         * frames).
+         */
+        if ( cpi->pass == 2 )
+        {
+            cpi->ni_tot_qi += Q;
+            cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+        }
+        else
+        {
+            /* Damp value for first few frames */
+            if (cpi->ni_frames > 150 )
+            {
+                cpi->ni_tot_qi += Q;
+                cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+            }
+            /* For one pass, early in the clip ... average the current frame Q
+             * value with the worstq entered by the user as a dampening measure
+             */
+            else
+            {
+                cpi->ni_tot_qi += Q;
+                cpi->ni_av_qi = ((cpi->ni_tot_qi / cpi->ni_frames) + cpi->worst_quality + 1) / 2;
+            }
+
+            /* If the average Q is higher than what was used in the last
+             * frame (after going through the recode loop to keep the frame
+             * size within range) then use the last frame value - 1. The -1
+             * is designed to stop Q and hence the data rate, from
+             * progressively falling away during difficult sections, but at
+             * the same time reduce the number of iterations around the
+             * recode loop.
+             */
+            if (Q > cpi->ni_av_qi)
+                cpi->ni_av_qi = Q - 1;
+        }
+    }
+
+    /* Update the buffer level variable. */
+    /* Non-viewable frames are a special case and are treated as pure overhead. */
+    if ( !cm->show_frame )
+        cpi->bits_off_target -= cpi->projected_frame_size;
+    else
+        cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
+
+    /* Clip the buffer level to the maximum specified buffer size */
+    if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
+        cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+
+    // If the frame dropper is not enabled, don't let the buffer level go below
+    // some threshold, given here by -|maximum_buffer_size|. For now we only do
+    // this for screen content input.
+    if (cpi->drop_frames_allowed == 0 && cpi->oxcf.screen_content_mode &&
+        cpi->bits_off_target < -cpi->oxcf.maximum_buffer_size)
+        cpi->bits_off_target = -cpi->oxcf.maximum_buffer_size;
+
+    /* Rolling monitors of whether we are over or underspending used to
+     * help regulate min and Max Q in two pass.
+     */
+    cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
+    cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
+    cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
+    cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
+
+    /* Actual bits spent */
+    cpi->total_actual_bits += cpi->projected_frame_size;
+
+    /* Debug stats */
+    cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
+
+    cpi->buffer_level = cpi->bits_off_target;
+
+    /* Propagate values to higher temporal layers */
+    if (cpi->oxcf.number_of_layers > 1)
+    {
+        unsigned int i;
+
+        for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
+        {
+            LAYER_CONTEXT *lc = &cpi->layer_context[i];
+            int bits_off_for_this_layer =
+               (int)(lc->target_bandwidth / lc->framerate -
+                     cpi->projected_frame_size);
+
+            lc->bits_off_target += bits_off_for_this_layer;
+
+            /* Clip buffer level to maximum buffer size for the layer */
+            if (lc->bits_off_target > lc->maximum_buffer_size)
+                lc->bits_off_target = lc->maximum_buffer_size;
+
+            lc->total_actual_bits += cpi->projected_frame_size;
+            lc->total_target_vs_actual += bits_off_for_this_layer;
+            lc->buffer_level = lc->bits_off_target;
+        }
+    }
+
+    /* Update bits left to the kf and gf groups to account for overshoot
+     * or undershoot on these frames
+     */
+    if (cm->frame_type == KEY_FRAME)
+    {
+        cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+        if (cpi->twopass.kf_group_bits < 0)
+            cpi->twopass.kf_group_bits = 0 ;
+    }
+    else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+    {
+        cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+        if (cpi->twopass.gf_group_bits < 0)
+            cpi->twopass.gf_group_bits = 0 ;
+    }
+
+    if (cm->frame_type != KEY_FRAME)
+    {
+        if (cpi->common.refresh_alt_ref_frame)
+        {
+            cpi->last_skip_false_probs[2] = cpi->prob_skip_false;
+            cpi->last_skip_probs_q[2] = cm->base_qindex;
+        }
+        else if (cpi->common.refresh_golden_frame)
+        {
+            cpi->last_skip_false_probs[1] = cpi->prob_skip_false;
+            cpi->last_skip_probs_q[1] = cm->base_qindex;
+        }
+        else
+        {
+            cpi->last_skip_false_probs[0] = cpi->prob_skip_false;
+            cpi->last_skip_probs_q[0] = cm->base_qindex;
+
+            /* update the baseline */
+            cpi->base_skip_false_prob[cm->base_qindex] = cpi->prob_skip_false;
+
+        }
+    }
+
+#if 0 && CONFIG_INTERNAL_STATS
+    {
+        FILE *f = fopen("tmp.stt", "a");
+
+        vp8_clear_system_state();
+
+        if (cpi->twopass.total_left_stats.coded_error != 0.0)
+            fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64
+                       "%10"PRId64" %10d %6d %6d %6d %6d %5d %5d %5d %8d "
+                       "%8.2lf %"PRId64" %10.3lf %10"PRId64" %8d\n",
+                       cpi->common.current_video_frame, cpi->this_frame_target,
+                       cpi->projected_frame_size,
+                       (cpi->projected_frame_size - cpi->this_frame_target),
+                       cpi->total_target_vs_actual,
+                       cpi->buffer_level,
+                       (cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
+                       cpi->total_actual_bits, cm->base_qindex,
+                       cpi->active_best_quality, cpi->active_worst_quality,
+                       cpi->ni_av_qi, cpi->cq_target_quality,
+                       cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+                       cm->frame_type, cpi->gfu_boost,
+                       cpi->twopass.est_max_qcorrection_factor,
+                       cpi->twopass.bits_left,
+                       cpi->twopass.total_left_stats.coded_error,
+                       (double)cpi->twopass.bits_left /
+                           cpi->twopass.total_left_stats.coded_error,
+                       cpi->tot_recode_hits);
+        else
+            fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64
+                       "%10"PRId64" %10d %6d %6d %6d %6d %5d %5d %5d %8d "
+                       "%8.2lf %"PRId64" %10.3lf %8d\n",
+                       cpi->common.current_video_frame, cpi->this_frame_target,
+                       cpi->projected_frame_size,
+                       (cpi->projected_frame_size - cpi->this_frame_target),
+                       cpi->total_target_vs_actual,
+                       cpi->buffer_level,
+                       (cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
+                       cpi->total_actual_bits, cm->base_qindex,
+                       cpi->active_best_quality, cpi->active_worst_quality,
+                       cpi->ni_av_qi, cpi->cq_target_quality,
+                       cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+                       cm->frame_type, cpi->gfu_boost,
+                       cpi->twopass.est_max_qcorrection_factor,
+                       cpi->twopass.bits_left,
+                       cpi->twopass.total_left_stats.coded_error,
+                       cpi->tot_recode_hits);
+
+        fclose(f);
+
+        {
+            FILE *fmodes = fopen("Modes.stt", "a");
+
+            fprintf(fmodes, "%6d:%1d:%1d:%1d ",
+                        cpi->common.current_video_frame,
+                        cm->frame_type, cm->refresh_golden_frame,
+                        cm->refresh_alt_ref_frame);
+
+            fprintf(fmodes, "\n");
+
+            fclose(fmodes);
+        }
+    }
+
+#endif
+
+    if (cm->refresh_golden_frame == 1)
+        cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
+    else
+        cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN;
+
+    if (cm->refresh_alt_ref_frame == 1)
+        cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;
+    else
+        cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF;
+
+
+    if (cm->refresh_last_frame & cm->refresh_golden_frame)
+        /* both refreshed */
+        cpi->gold_is_last = 1;
+    else if (cm->refresh_last_frame ^ cm->refresh_golden_frame)
+        /* 1 refreshed but not the other */
+        cpi->gold_is_last = 0;
+
+    if (cm->refresh_last_frame & cm->refresh_alt_ref_frame)
+        /* both refreshed */
+        cpi->alt_is_last = 1;
+    else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame)
+        /* 1 refreshed but not the other */
+        cpi->alt_is_last = 0;
+
+    if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame)
+        /* both refreshed */
+        cpi->gold_is_alt = 1;
+    else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame)
+        /* 1 refreshed but not the other */
+        cpi->gold_is_alt = 0;
+
+    cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
+
+    if (cpi->gold_is_last)
+        cpi->ref_frame_flags &= ~VP8_GOLD_FRAME;
+
+    if (cpi->alt_is_last)
+        cpi->ref_frame_flags &= ~VP8_ALTR_FRAME;
+
+    if (cpi->gold_is_alt)
+        cpi->ref_frame_flags &= ~VP8_ALTR_FRAME;
+
+
+    if (!cpi->oxcf.error_resilient_mode)
+    {
+        if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME))
+            /* Update the alternate reference frame stats as appropriate. */
+            update_alt_ref_frame_stats(cpi);
+        else
+            /* Update the Golden frame stats as appropriate. */
+            update_golden_frame_stats(cpi);
+    }
+
+    if (cm->frame_type == KEY_FRAME)
+    {
+        /* Tell the caller that the frame was coded as a key frame */
+        *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
+
+        /* As this frame is a key frame  the next defaults to an inter frame. */
+        cm->frame_type = INTER_FRAME;
+
+        cpi->last_frame_percent_intra = 100;
+    }
+    else
+    {
+        *frame_flags = cm->frame_flags&~FRAMEFLAGS_KEY;
+
+        cpi->last_frame_percent_intra = cpi->this_frame_percent_intra;
+    }
+
+    /* Clear the one shot update flags for segmentation map and mode/ref
+     * loop filter deltas.
+     */
+    cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+    cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+    cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
+
+
+    /* Don't increment frame counters if this was an altref buffer update
+     * not a real frame
+     */
+    if (cm->show_frame)
+    {
+        cm->current_video_frame++;
+        cpi->frames_since_key++;
+        cpi->temporal_pattern_counter++;
+    }
+
+    /* reset to normal state now that we are done. */
+
+
+
+#if 0
+    {
+        char filename[512];
+        FILE *recon_file;
+        sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+        recon_file = fopen(filename, "wb");
+        fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc,
+               cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file);
+        fclose(recon_file);
+    }
+#endif
+
+    /* DEBUG */
+    /* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */
+
+
+}
+#if !CONFIG_REALTIME_ONLY
+/* Two-pass wrapper: runs vp8_second_pass() unless this is an alt-ref refresh, encodes, then updates twopass.bits_left. */
+static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned char * dest_end, unsigned int *frame_flags)
+{
+    if (cpi->common.refresh_alt_ref_frame == 0)
+        vp8_second_pass(cpi);
+
+    encode_frame_to_data_rate(cpi, size, dest, dest_end, frame_flags);
+    cpi->twopass.bits_left -= (*size) * 8; /* *size counts bytes; bits_left counts bits */
+
+    if (cpi->common.refresh_alt_ref_frame == 0) /* re-read after the encode call, deliberately not cached */
+    {
+        /* Credit (target_bandwidth * two_pass_vbrmin_section / 100) / framerate. NOTE(review): product is formed before the (double) cast - confirm it cannot overflow. */
+        const double min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+        cpi->twopass.bits_left += (int64_t)(min_rate / cpi->framerate);
+    }
+}
+#endif
+
+/* Pushes one raw source frame into cpi->lookahead and times the call; returns 0, or -1 when vp8_lookahead_push() returns nonzero. */
+int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time)
+{
+    struct vpx_usec_timer push_timer;
+    int status;
+
+    vpx_usec_timer_start(&push_timer);
+
+    /* Source y_width/y_height differ from the configured Width/Height: free and reallocate the raw frame buffers. */
+    if (!(sd->y_width == cpi->oxcf.Width && sd->y_height == cpi->oxcf.Height))
+    {
+        assert(cpi->oxcf.lag_in_frames < 2);
+        dealloc_raw_frame_buffers(cpi);
+        alloc_raw_frame_buffers(cpi);
+    }
+
+    /* The active map is handed to the lookahead only while active_map_enabled is set; otherwise NULL is passed. */
+    status = vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
+                                cpi->active_map_enabled ? cpi->active_map : NULL) ? -1 : 0;
+    vpx_usec_timer_mark(&push_timer);
+    cpi->time_receive_data += vpx_usec_timer_elapsed(&push_timer); /* accumulate elapsed time for this call */
+    return status;
+}
+
+
+/* Returns 1 for a key frame or when any refresh/copy/entropy/loop-filter-delta/segmentation update flag is set; otherwise 0. */
+static int frame_is_reference(const VP8_COMP *cpi)
+{
+    const VP8_COMMON *common = &cpi->common;
+    const MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+
+    if (common->frame_type == KEY_FRAME) return 1;
+    if (common->refresh_last_frame || common->refresh_golden_frame || common->refresh_alt_ref_frame) return 1;
+    if (common->copy_buffer_to_gf || common->copy_buffer_to_arf || common->refresh_entropy_probs) return 1;
+    if (mbd->mode_ref_lf_delta_update) return 1;
+    return (mbd->update_mb_segmentation_map || mbd->update_mb_segmentation_data) ? 1 : 0;
+}
+
+
+int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush)
+{
+    VP8_COMMON *cm;
+    struct vpx_usec_timer  tsctimer;
+    struct vpx_usec_timer  ticktimer;
+    struct vpx_usec_timer  cmptimer;
+    YV12_BUFFER_CONFIG    *force_src_buffer = NULL;
+
+    if (!cpi)
+        return -1;
+
+    cm = &cpi->common;
+
+    if (setjmp(cpi->common.error.jmp))
+    {
+        cpi->common.error.setjmp = 0;
+        vp8_clear_system_state();
+        return VPX_CODEC_CORRUPT_FRAME;
+    }
+
+    cpi->common.error.setjmp = 1;
+
+    vpx_usec_timer_start(&cmptimer);
+
+    cpi->source = NULL;
+
+#if !CONFIG_REALTIME_ONLY
+    /* Should we code an alternate reference frame */
+    if (cpi->oxcf.error_resilient_mode == 0 &&
+        cpi->oxcf.play_alternate &&
+        cpi->source_alt_ref_pending)
+    {
+        if ((cpi->source = vp8_lookahead_peek(cpi->lookahead,
+                                              cpi->frames_till_gf_update_due,
+                                              PEEK_FORWARD)))
+        {
+            cpi->alt_ref_source = cpi->source;
+            if (cpi->oxcf.arnr_max_frames > 0)
+            {
+                vp8_temporal_filter_prepare_c(cpi,
+                                              cpi->frames_till_gf_update_due);
+                force_src_buffer = &cpi->alt_ref_buffer;
+            }
+            cpi->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
+            cm->refresh_alt_ref_frame = 1;
+            cm->refresh_golden_frame = 0;
+            cm->refresh_last_frame = 0;
+            cm->show_frame = 0;
+            /* Clear Pending alt Ref flag. */
+            cpi->source_alt_ref_pending = 0;
+            cpi->is_src_frame_alt_ref = 0;
+        }
+    }
+#endif
+
+    if (!cpi->source)
+    {
+        /* Read last frame source if we are encoding first pass. */
+        if (cpi->pass == 1 && cm->current_video_frame > 0)
+        {
+            if((cpi->last_source = vp8_lookahead_peek(cpi->lookahead, 1,
+                                                      PEEK_BACKWARD)) == NULL)
+              return -1;
+        }
+
+
+        if ((cpi->source = vp8_lookahead_pop(cpi->lookahead, flush)))
+        {
+            cm->show_frame = 1;
+
+            cpi->is_src_frame_alt_ref = cpi->alt_ref_source
+                                        && (cpi->source == cpi->alt_ref_source);
+
+            if(cpi->is_src_frame_alt_ref)
+                cpi->alt_ref_source = NULL;
+        }
+    }
+
+    if (cpi->source)
+    {
+        cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img;
+        cpi->un_scaled_source = cpi->Source;
+        *time_stamp = cpi->source->ts_start;
+        *time_end = cpi->source->ts_end;
+        *frame_flags = cpi->source->flags;
+
+        if (cpi->pass == 1 && cm->current_video_frame > 0)
+        {
+            cpi->last_frame_unscaled_source = &cpi->last_source->img;
+        }
+    }
+    else
+    {
+        *size = 0;
+#if !CONFIG_REALTIME_ONLY
+
+        if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done)
+        {
+            vp8_end_first_pass(cpi);    /* get last stats packet */
+            cpi->twopass.first_pass_done = 1;
+        }
+
+#endif
+
+        return -1;
+    }
+
+    if (cpi->source->ts_start < cpi->first_time_stamp_ever)
+    {
+        cpi->first_time_stamp_ever = cpi->source->ts_start;
+        cpi->last_end_time_stamp_seen = cpi->source->ts_start;
+    }
+
+    /* adjust frame rates based on timestamps given */
+    if (cm->show_frame)
+    {
+        int64_t this_duration;
+        int step = 0;
+
+        if (cpi->source->ts_start == cpi->first_time_stamp_ever)
+        {
+            this_duration = cpi->source->ts_end - cpi->source->ts_start;
+            step = 1;
+        }
+        else
+        {
+            int64_t last_duration;
+
+            this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
+            last_duration = cpi->last_end_time_stamp_seen
+                            - cpi->last_time_stamp_seen;
+            /* do a step update if the duration changes by 10% */
+            if (last_duration)
+                step = (int)(((this_duration - last_duration) *
+                            10 / last_duration));
+        }
+
+        if (this_duration)
+        {
+            if (step)
+                cpi->ref_framerate = 10000000.0 / this_duration;
+            else
+            {
+                double avg_duration, interval;
+
+                /* Average this frame's rate into the last second's average
+                 * frame rate. If we haven't seen 1 second yet, then average
+                 * over the whole interval seen.
+                 */
+                interval = (double)(cpi->source->ts_end -
+                                    cpi->first_time_stamp_ever);
+                if(interval > 10000000.0)
+                    interval = 10000000;
+
+                avg_duration = 10000000.0 / cpi->ref_framerate;
+                avg_duration *= (interval - avg_duration + this_duration);
+                avg_duration /= interval;
+
+                cpi->ref_framerate = 10000000.0 / avg_duration;
+            }
+#if CONFIG_MULTI_RES_ENCODING
+            if (cpi->oxcf.mr_total_resolutions > 1) {
+              LOWER_RES_FRAME_INFO* low_res_frame_info = (LOWER_RES_FRAME_INFO*)
+                  cpi->oxcf.mr_low_res_mode_info;
+              // Frame rate should be the same for all spatial layers in
+              // multi-res-encoding (simulcast), so we constrain the frame for
+              // higher layers to be that of lowest resolution. This is needed
+              // as the application may decide to skip encoding a high layer and
+              // then start again, in which case a big jump in time-stamps will
+              // be received for that high layer, which will yield an incorrect
+              // frame rate (from time-stamp adjustment in above calculation).
+              if (cpi->oxcf.mr_encoder_id) {
+                 cpi->ref_framerate = low_res_frame_info->low_res_framerate;
+              }
+              else {
+                // Keep track of frame rate for lowest resolution.
+                low_res_frame_info->low_res_framerate = cpi->ref_framerate;
+              }
+            }
+#endif
+            if (cpi->oxcf.number_of_layers > 1)
+            {
+                unsigned int i;
+
+                /* Update frame rates for each layer */
+                assert(cpi->oxcf.number_of_layers <= VPX_TS_MAX_LAYERS);
+                for (i = 0; i < cpi->oxcf.number_of_layers &&
+                     i < VPX_TS_MAX_LAYERS; ++i)
+                {
+                    LAYER_CONTEXT *lc = &cpi->layer_context[i];
+                    lc->framerate = cpi->ref_framerate /
+                                    cpi->oxcf.rate_decimator[i];
+                }
+            }
+            else
+                vp8_new_framerate(cpi, cpi->ref_framerate);
+        }
+
+        cpi->last_time_stamp_seen = cpi->source->ts_start;
+        cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+    }
+
+    if (cpi->oxcf.number_of_layers > 1)
+    {
+        int layer;
+
+        update_layer_contexts (cpi);
+
+        /* Restore layer specific context & set frame rate */
+        if (cpi->temporal_layer_id >= 0) {
+          layer = cpi->temporal_layer_id;
+        } else {
+          layer = cpi->oxcf.layer_id[
+                  cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
+        }
+        restore_layer_context (cpi, layer);
+        vp8_new_framerate(cpi, cpi->layer_context[layer].framerate);
+    }
+
+    if (cpi->compressor_speed == 2)
+    {
+        vpx_usec_timer_start(&tsctimer);
+        vpx_usec_timer_start(&ticktimer);
+    }
+
+    cpi->lf_zeromv_pct = (cpi->zeromv_count * 100)/cm->MBs;
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+    {
+        int i;
+        const int num_part = (1 << cm->multi_token_partition);
+        /* the available bytes in dest */
+        const unsigned long dest_size = dest_end - dest;
+        const int tok_part_buff_size = (dest_size * 9) / (10 * num_part);
+
+        unsigned char *dp = dest;
+
+        cpi->partition_d[0] = dp;
+        dp += dest_size/10;         /* reserve 1/10 for control partition */
+        cpi->partition_d_end[0] = dp;
+
+        for(i = 0; i < num_part; i++)
+        {
+            cpi->partition_d[i + 1] = dp;
+            dp += tok_part_buff_size;
+            cpi->partition_d_end[i + 1] = dp;
+        }
+    }
+#endif
+
+    /* start with a 0 size frame */
+    *size = 0;
+
+    /* Clear down mmx registers */
+    vp8_clear_system_state();
+
+    cm->frame_type = INTER_FRAME;
+    cm->frame_flags = *frame_flags;
+
+#if 0
+
+    if (cm->refresh_alt_ref_frame)
+    {
+        cm->refresh_golden_frame = 0;
+        cm->refresh_last_frame = 0;
+    }
+    else
+    {
+        cm->refresh_golden_frame = 0;
+        cm->refresh_last_frame = 1;
+    }
+
+#endif
+    /* find a free buffer for the new frame */
+    {
+        int i = 0;
+        for(; i < NUM_YV12_BUFFERS; i++)
+        {
+            if(!cm->yv12_fb[i].flags)
+            {
+                cm->new_fb_idx = i;
+                break;
+            }
+        }
+
+        assert(i < NUM_YV12_BUFFERS );
+    }
+#if !CONFIG_REALTIME_ONLY
+
+    if (cpi->pass == 1)
+    {
+        Pass1Encode(cpi, size, dest, frame_flags);
+    }
+    else if (cpi->pass == 2)
+    {
+        Pass2Encode(cpi, size, dest, dest_end, frame_flags);
+    }
+    else
+#endif
+        encode_frame_to_data_rate(cpi, size, dest, dest_end, frame_flags);
+
+    if (cpi->compressor_speed == 2)
+    {
+        unsigned int duration, duration2;
+        vpx_usec_timer_mark(&tsctimer);
+        vpx_usec_timer_mark(&ticktimer);
+
+        duration = (int)(vpx_usec_timer_elapsed(&ticktimer));
+        duration2 = (unsigned int)((double)duration / 2);
+
+        if (cm->frame_type != KEY_FRAME)
+        {
+            if (cpi->avg_encode_time == 0)
+                cpi->avg_encode_time = duration;
+            else
+                cpi->avg_encode_time = (7 * cpi->avg_encode_time + duration) >> 3;
+        }
+
+        if (duration2)
+        {
+            {
+
+                if (cpi->avg_pick_mode_time == 0)
+                    cpi->avg_pick_mode_time = duration2;
+                else
+                    cpi->avg_pick_mode_time = (7 * cpi->avg_pick_mode_time + duration2) >> 3;
+            }
+        }
+
+    }
+
+    if (cm->refresh_entropy_probs == 0)
+    {
+        memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
+    }
+
+    /* Save the contexts separately for alt ref, gold and last. */
+    /* (TODO jbb -> Optimize this with pointers to avoid extra copies. ) */
+    if(cm->refresh_alt_ref_frame)
+        memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc));
+
+    if(cm->refresh_golden_frame)
+        memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc));
+
+    if(cm->refresh_last_frame)
+        memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
+
+    /* if it's a dropped frame, honor the requests on subsequent frames */
+    if (*size > 0)
+    {
+        cpi->droppable = !frame_is_reference(cpi);
+
+        /* return to normal state */
+        cm->refresh_entropy_probs = 1;
+        cm->refresh_alt_ref_frame = 0;
+        cm->refresh_golden_frame = 0;
+        cm->refresh_last_frame = 1;
+        cm->frame_type = INTER_FRAME;
+
+    }
+
+    /* Save layer specific state */
+    if (cpi->oxcf.number_of_layers > 1)
+        save_layer_context (cpi);
+
+    vpx_usec_timer_mark(&cmptimer);
+    cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+
+    if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)
+    {
+        generate_psnr_packet(cpi);
+    }
+
+#if CONFIG_INTERNAL_STATS
+
+    if (cpi->pass != 1)
+    {
+        cpi->bytes += *size;
+
+        if (cm->show_frame)
+        {
+            cpi->common.show_frame_mi = cpi->common.mi;
+            cpi->count ++;
+
+            if (cpi->b_calculate_psnr)
+            {
+                uint64_t ye,ue,ve;
+                double frame_psnr;
+                YV12_BUFFER_CONFIG      *orig = cpi->Source;
+                YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
+                unsigned int y_width = cpi->common.Width;
+                unsigned int y_height = cpi->common.Height;
+                unsigned int uv_width = (y_width + 1) / 2;
+                unsigned int uv_height = (y_height + 1) / 2;
+                int y_samples = y_height * y_width;
+                int uv_samples = uv_height * uv_width;
+                int t_samples = y_samples + 2 * uv_samples;
+                double sq_error;
+
+                ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+                  recon->y_buffer, recon->y_stride, y_width, y_height);
+
+                ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+                  recon->u_buffer, recon->uv_stride, uv_width, uv_height);
+
+                ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+                  recon->v_buffer, recon->uv_stride, uv_width, uv_height);
+
+                sq_error = (double)(ye + ue + ve);
+
+                frame_psnr = vpx_sse_to_psnr(t_samples, 255.0, sq_error);
+
+                cpi->total_y += vpx_sse_to_psnr(y_samples, 255.0, (double)ye);
+                cpi->total_u += vpx_sse_to_psnr(uv_samples, 255.0, (double)ue);
+                cpi->total_v += vpx_sse_to_psnr(uv_samples, 255.0, (double)ve);
+                cpi->total_sq_error += sq_error;
+                cpi->total  += frame_psnr;
+#if CONFIG_POSTPROC
+                {
+                    YV12_BUFFER_CONFIG      *pp = &cm->post_proc_buffer;
+                    double sq_error2;
+                    double frame_psnr2, frame_ssim2 = 0;
+                    double weight = 0;
+
+                    vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0);
+                    vp8_clear_system_state();
+
+                    ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+                      pp->y_buffer, pp->y_stride, y_width, y_height);
+
+                    ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+                      pp->u_buffer, pp->uv_stride, uv_width, uv_height);
+
+                    ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+                      pp->v_buffer, pp->uv_stride, uv_width, uv_height);
+
+                    sq_error2 = (double)(ye + ue + ve);
+
+                    frame_psnr2 = vpx_sse_to_psnr(t_samples, 255.0, sq_error2);
+
+                    cpi->totalp_y += vpx_sse_to_psnr(y_samples,
+                                                     255.0, (double)ye);
+                    cpi->totalp_u += vpx_sse_to_psnr(uv_samples,
+                                                     255.0, (double)ue);
+                    cpi->totalp_v += vpx_sse_to_psnr(uv_samples,
+                                                     255.0, (double)ve);
+                    cpi->total_sq_error2 += sq_error2;
+                    cpi->totalp  += frame_psnr2;
+
+                    frame_ssim2 = vpx_calc_ssim(cpi->Source,
+                      &cm->post_proc_buffer, &weight);
+
+                    cpi->summed_quality += frame_ssim2 * weight;
+                    cpi->summed_weights += weight;
+
+                    if (cpi->oxcf.number_of_layers > 1)
+                    {
+                         unsigned int i;
+
+                         for (i=cpi->current_layer;
+                                       i<cpi->oxcf.number_of_layers; i++)
+                         {
+                             cpi->frames_in_layer[i]++;
+
+                             cpi->bytes_in_layer[i] += *size;
+                             cpi->sum_psnr[i]       += frame_psnr;
+                             cpi->sum_psnr_p[i]     += frame_psnr2;
+                             cpi->total_error2[i]   += sq_error;
+                             cpi->total_error2_p[i] += sq_error2;
+                             cpi->sum_ssim[i]       += frame_ssim2 * weight;
+                             cpi->sum_weights[i]    += weight;
+                         }
+                    }
+                }
+#endif
+            }
+
+            if (cpi->b_calculate_ssimg)
+            {
+                double y, u, v, frame_all;
+                frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show,
+                    &y, &u, &v);
+
+                if (cpi->oxcf.number_of_layers > 1)
+                {
+                    unsigned int i;
+
+                    for (i=cpi->current_layer;
+                         i<cpi->oxcf.number_of_layers; i++)
+                    {
+                        if (!cpi->b_calculate_psnr)
+                            cpi->frames_in_layer[i]++;
+
+                        cpi->total_ssimg_y_in_layer[i] += y;
+                        cpi->total_ssimg_u_in_layer[i] += u;
+                        cpi->total_ssimg_v_in_layer[i] += v;
+                        cpi->total_ssimg_all_in_layer[i] += frame_all;
+                    }
+                }
+                else
+                {
+                    cpi->total_ssimg_y += y;
+                    cpi->total_ssimg_u += u;
+                    cpi->total_ssimg_v += v;
+                    cpi->total_ssimg_all += frame_all;
+                }
+            }
+
+        }
+    }
+
+#if 0
+
+    if (cpi->common.frame_type != 0 && cpi->common.base_qindex == cpi->oxcf.worst_allowed_q)
+    {
+        skiptruecount += cpi->skip_true_count;
+        skipfalsecount += cpi->skip_false_count;
+    }
+
+#endif
+#if 0
+
+    if (cpi->pass != 1)
+    {
+        FILE *f = fopen("skip.stt", "a");
+        fprintf(f, "frame:%4d flags:%4x Q:%4d P:%4d Size:%5d\n", cpi->common.current_video_frame, *frame_flags, cpi->common.base_qindex, cpi->prob_skip_false, *size);
+
+        if (cpi->is_src_frame_alt_ref == 1)
+            fprintf(f, "skipcount: %4d framesize: %d\n", cpi->skip_true_count , *size);
+
+        fclose(f);
+    }
+
+#endif
+#endif
+
+    cpi->common.error.setjmp = 0;
+
+    return 0;
+}
+
+/* Preview accessor: fills *dest from common.frame_to_show, or through
+ * vp8_post_proc_frame() under CONFIG_POSTPROC. Returns -1 when unavailable. */
+int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags)
+{
+    VP8_COMMON *const cm = &cpi->common;
+    int status = -1;
+
+    /* No preview is produced while an alt-ref refresh is flagged. */
+    if (cm->refresh_alt_ref_frame)
+        return status;
+
+#if CONFIG_MULTITHREAD
+    /* A loop-filter run is still flagged: wait on its end semaphore. */
+    if (cpi->b_lpf_running)
+    {
+        sem_wait(&cpi->h_event_end_lpf);
+        cpi->b_lpf_running = 0;
+    }
+#endif
+
+#if CONFIG_POSTPROC
+    cm->show_frame_mi = cm->mi;
+    status = vp8_post_proc_frame(cm, dest, flags);
+#else
+    (void)flags;
+
+    if (cm->frame_to_show)
+    {
+        /* Copy the buffer descriptor, then overwrite the reported sizes. */
+        *dest = *cm->frame_to_show;
+        dest->y_width = cm->Width;
+        dest->y_height = cm->Height;
+        dest->uv_height = cm->Height / 2;
+        status = 0;
+    }
+#endif
+
+    vp8_clear_system_state();
+    return status;
+}
+
+/* Install (or, with a NULL map, clear) a segment map together with
+ * per-segment quantizer deltas, loop-filter deltas and encode-breakout
+ * thresholds. Returns 0 on success, -1 if the request is rejected. */
+int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4])
+{
+    signed char seg_features[MB_LVL_MAX][MAX_MB_SEGMENTS];
+    int q_delta_internal[MAX_MB_SEGMENTS];
+    const int limit = 63;
+    int seg;
+
+    /* This call is refused while cyclic refresh mode is enabled. */
+    if (cpi->cyclic_refresh_mode_enabled)
+        return -1;
+
+    /* The supplied dimensions must equal the macroblock grid. */
+    if (rows != cpi->common.mb_rows || cols != cpi->common.mb_cols)
+        return -1;
+
+    /* Reject any quantizer delta whose magnitude exceeds the limit. */
+    for (seg = 0; seg < 4; seg++)
+    {
+        if (abs(delta_q[seg]) > limit)
+            return -1;
+    }
+
+    /* The same bound applies to the loop-filter deltas. */
+    for (seg = 0; seg < 4; seg++)
+    {
+        if (abs(delta_lf[seg]) > limit)
+            return -1;
+    }
+
+    /* A NULL map switches segmentation off and succeeds. */
+    if (!map)
+    {
+        disable_segmentation(cpi);
+        return 0;
+    }
+
+    /* Map external delta-q values through q_trans[], keeping the sign. */
+    for (seg = 0; seg < MAX_MB_SEGMENTS; seg++)
+    {
+        const int dq = delta_q[seg];
+
+        if (dq >= 0)
+            q_delta_internal[seg] = q_trans[dq];
+        else
+            q_delta_internal[seg] = -q_trans[-dq];
+    }
+
+    set_segmentation_map(cpi, map);
+    enable_segmentation(cpi);
+
+    /* Per-segment quantizer delta, loop-filter delta and breakout. */
+    for (seg = 0; seg < 4; seg++)
+    {
+        seg_features[MB_LVL_ALT_Q][seg] = q_delta_internal[seg];
+        seg_features[MB_LVL_ALT_LF][seg] = delta_lf[seg];
+        cpi->segment_encode_breakout[seg] = threshold[seg];
+    }
+
+    /* Publish the feature table, flagged as SEGMENT_DELTADATA. */
+    set_segment_data(cpi, &seg_features[0][0], SEGMENT_DELTADATA);
+    return 0;
+}
+
+/* Replace (map non-NULL) or disable (map NULL) the encoder's active map.
+ * Returns 0 on success, -1 when rows/cols differ from the macroblock grid. */
+int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols)
+{
+    /* Guard: dimensions must equal the frame's macroblock rows/columns. */
+    if (rows != cpi->common.mb_rows || cols != cpi->common.mb_cols)
+        return -1;
+
+    if (!map)
+    {
+        cpi->active_map_enabled = 0;
+        return 0;
+    }
+
+    /* rows * cols bytes are copied into cpi->active_map. */
+    memcpy(cpi->active_map, map, rows * cols);
+    cpi->active_map_enabled = 1;
+    return 0;
+}
+
+/* Select internal scaling modes; each must be <= ONETWO or -1 is returned.
+ * The horizontal mode is stored before the vertical mode is examined. */
+int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode)
+{
+    if (horiz_mode > ONETWO)
+        return -1;
+    cpi->common.horiz_scale = horiz_mode;
+
+    if (vert_mode > ONETWO)
+        return -1;
+    cpi->common.vert_scale = vert_mode;
+
+    return 0;
+}
+
+
+
+/* Accumulate the vpx_mse16x16() result of every 16x16 block of the
+ * source Y plane against the matching block of dest, in raster order. */
+int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest)
+{
+    unsigned char *src_row = source->y_buffer;
+    unsigned char *dst_row = dest->y_buffer;
+    int sum = 0;
+    int row, col;
+
+    for (row = 0; row < source->y_height; row += 16)
+    {
+        for (col = 0; col < source->y_width; col += 16)
+        {
+            unsigned int sse; /* out-parameter of the call; not read here */
+
+            sum += vpx_mse16x16(src_row + col, source->y_stride,
+                                dst_row + col, dest->y_stride, &sse);
+        }
+
+        /* Step both planes down by sixteen rows of pixels. */
+        src_row += 16 * source->y_stride;
+        dst_row += 16 * dest->y_stride;
+    }
+
+    return sum;
+}
+
+
+int vp8_get_quantizer(VP8_COMP *cpi)
+{
+    return cpi->common.base_qindex; /* current value of common.base_qindex */
+}
diff --git a/libs/libvpx/vp8/encoder/onyx_int.h b/libs/libvpx/vp8/encoder/onyx_int.h
new file mode 100644
index 0000000000..2b2f7a0a9a
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/onyx_int.h
@@ -0,0 +1,755 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_ONYX_INT_H_
+#define VP8_ENCODER_ONYX_INT_H_
+
+#include <stdio.h>
+#include "vpx_config.h"
+#include "vp8/common/onyx.h"
+#include "treewriter.h"
+#include "tokenize.h"
+#include "vp8/common/onyxc_int.h"
+#include "vpx_dsp/variance.h"
+#include "encodemb.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/entropy.h"
+#include "vp8/common/threading.h"
+#include "vpx_ports/mem.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8.h"
+#include "mcomp.h"
+#include "vp8/common/findnearmv.h"
+#include "lookahead.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "vp8/encoder/denoising.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MIN_GF_INTERVAL             4
+#define DEFAULT_GF_INTERVAL         7
+/* Array length of VP8_COMP.prior_key_frame_distance[]. */
+#define KEY_FRAME_CONTEXT 5
+/* Evaluates to 1 in CONFIG_REALTIME_ONLY builds, otherwise 25. */
+#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY? 1 : 25)
+
+#define AF_THRESH   25
+#define AF_THRESH2  100
+#define ARF_DECAY_THRESH 12
+
+/* NOTE(review): presumably bounds for mode threshold multipliers - confirm. */
+#define MIN_THRESHMULT  32
+#define MAX_THRESHMULT  512
+/* NOTE(review): presumably zero-bin boost amounts used by the quantizer - confirm. */
+#define GF_ZEROMV_ZBIN_BOOST 12
+#define LF_ZEROMV_ZBIN_BOOST 6
+#define MV_ZBIN_BOOST        4
+#define ZBIN_OQ_MAX 192
+/* VP8_TEMPORAL_ALT_REF is defined only when the build is not realtime-only. */
+#if !(CONFIG_REALTIME_ONLY)
+#define VP8_TEMPORAL_ALT_REF 1
+#endif
+
+typedef struct  /* Type of VP8_COMP.coding_context. NOTE(review): fields mirror VP8_COMP state, presumably a save/restore snapshot - confirm. */
+{
+    int kf_indicated;
+    unsigned int frames_since_key;
+    unsigned int frames_since_golden;
+    int filter_level;
+    int frames_till_gf_update_due;
+    int recent_ref_frame_usage[MAX_REF_FRAMES];
+    /* Motion-vector contexts and cost tables, two of each. */
+    MV_CONTEXT mvc[2];
+    int mvcosts[2][MVvals+1];
+    /* Mode histograms; present only when MODE_STATS is defined. */
+#ifdef MODE_STATS
+    int y_modes[5];
+    int uv_modes[4];
+    int b_modes[10];
+    int inter_y_modes[10];
+    int inter_uv_modes[4];
+    int inter_b_modes[10];
+#endif
+
+    vp8_prob ymode_prob[4], uv_mode_prob[3];   /* interframe intra mode probs */
+    vp8_prob kf_ymode_prob[4], kf_uv_mode_prob[3];   /* keyframe intra mode probs */
+
+    int ymode_count[5], uv_mode_count[4];  /* intra MB type counts this frame */
+
+    int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+
+    int this_frame_percent_intra;
+    int last_frame_percent_intra;
+
+
+} CODING_CONTEXT;
+
+typedef struct  /* Statistics record; struct twopass_rc in VP8_COMP holds several (total_stats, this_frame_stats, stats_in...). */
+{
+    double frame;
+    double intra_error;
+    double coded_error;
+    double ssim_weighted_pred_err;
+    double pcnt_inter;
+    double pcnt_motion;
+    double pcnt_second_ref;
+    double pcnt_neutral;
+    double MVr;  /* NOTE(review): r/c presumably row/column motion-vector components - confirm */
+    double mvr_abs;
+    double MVc;
+    double mvc_abs;
+    double MVrv; /* NOTE(review): trailing 'v' presumably a variance term - confirm */
+    double MVcv;
+    double mv_in_out_count;
+    double new_mv_count;
+    double duration;
+    double count;
+}
+FIRSTPASS_STATS;
+
+typedef struct  /* Per-frame stats record; in this header it is referenced only by a disabled (#if 0) VP8_COMP member. */
+{
+    int frames_so_far;
+    double frame_intra_error;
+    double frame_coded_error;
+    double frame_pcnt_inter;
+    double frame_pcnt_motion;
+    double frame_mvr;
+    double frame_mvr_abs;
+    double frame_mvc;
+    double frame_mvc_abs;
+
+} ONEPASS_FRAMESTATS;
+
+
+typedef enum  /* Twenty consecutive mode identifiers, 0 through 19. */
+{
+    THR_ZERO1          = 0,
+    THR_DC             = 1,
+    /* NOTE(review): suffixes 1/2/3 presumably select a reference frame - confirm. */
+    THR_NEAREST1       = 2,
+    THR_NEAR1          = 3,
+
+    THR_ZERO2          = 4,
+    THR_NEAREST2       = 5,
+
+    THR_ZERO3          = 6,
+    THR_NEAREST3       = 7,
+
+    THR_NEAR2          = 8,
+    THR_NEAR3          = 9,
+    /* Entries without a numeric suffix: V_PRED, H_PRED, TM (plus DC and B_PRED). */
+    THR_V_PRED         = 10,
+    THR_H_PRED         = 11,
+    THR_TM             = 12,
+
+    THR_NEW1           = 13,
+    THR_NEW2           = 14,
+    THR_NEW3           = 15,
+
+    THR_SPLIT1         = 16,
+    THR_SPLIT2         = 17,
+    THR_SPLIT3         = 18,
+    /* NOTE(review): presumably indexes the MAX_MODES-sized arrays (thresh_mult etc.) - confirm. */
+    THR_B_PRED         = 19
+}
+THR_MODES;
+
+typedef enum  /* Selector stored in SPEED_FEATURES.search_method. */
+{
+    DIAMOND = 0,
+    NSTEP = 1,
+    HEX = 2
+} SEARCH_METHODS;
+
+typedef struct  /* Set of encoder feature switches; VP8_COMP embeds one as its sf member. */
+{
+    int RD;
+    SEARCH_METHODS search_method;  /* one of DIAMOND, NSTEP, HEX */
+    int improved_quant;
+    int improved_dct;
+    int auto_filter;
+    int recode_loop;
+    int iterative_sub_pixel;
+    int half_pixel_search;
+    int quarter_pixel_search;
+    int thresh_mult[MAX_MODES];    /* one multiplier per mode */
+    int max_step_search_steps;
+    int first_step;
+    int optimize_coefficients;
+
+    int use_fastquant_for_pick;
+    int no_skip_block4x4_search;
+    int improved_mv_pred;
+
+} SPEED_FEATURES;
+
+typedef struct  /* Element type of VP8_COMP.mb_row_ei (declared under CONFIG_MULTITHREAD). */
+{
+    MACROBLOCK  mb;
+    int segment_counts[MAX_MB_SEGMENTS];  /* one counter per segment */
+    int totalrate;
+} MB_ROW_COMP;
+
+typedef struct  /* Pair of TOKENEXTRA pointers; VP8_COMP.tplist points at these. */
+{
+    TOKENEXTRA *start;
+    TOKENEXTRA *stop;  /* NOTE(review): presumably one past the last token - confirm */
+} TOKENLIST;
+
+typedef struct  /* Element type of VP8_COMP.en_thread_data (declared under CONFIG_MULTITHREAD). */
+{
+    int ithread;
+    void *ptr1;  /* untyped pointers; what they reference is not visible in this header */
+    void *ptr2;
+} ENCODETHREAD_DATA;
+typedef struct  /* Type of VP8_COMP.lpf_thread_data (declared under CONFIG_MULTITHREAD). */
+{
+    int ithread;
+    void *ptr1;  /* untyped pointer; what it references is not visible in this header */
+} LPFTHREAD_DATA;
+
+enum  /* Block-size identifiers, numbered from 0 in declaration order. */
+{
+    BLOCK_16X8,
+    BLOCK_8X16,
+    BLOCK_8X8,
+    BLOCK_4X4,
+    BLOCK_16X16,
+    BLOCK_MAX_SEGMENTS  /* count of the entries above; sizes VP8_COMP.fn_ptr[] */
+};
+
+typedef struct  /* Per-layer state; the encoder indexes cpi->layer_context[] by layer number. */
+{
+    /* Layer configuration */
+    double framerate;  /* set by the encoder to ref_framerate / rate_decimator[layer] */
+    int target_bandwidth;
+
+    /* Layer specific coding parameters */
+    int64_t starting_buffer_level;
+    int64_t optimal_buffer_level;
+    int64_t maximum_buffer_size;
+    int64_t starting_buffer_level_in_ms;  /* the same three quantities, in ms per the field names */
+    int64_t optimal_buffer_level_in_ms;
+    int64_t maximum_buffer_size_in_ms;
+
+    int avg_frame_size_for_layer;
+    /* The members from here on share names with VP8_COMP members. */
+    int64_t buffer_level;
+    int64_t bits_off_target;
+
+    int64_t total_actual_bits;
+    int total_target_vs_actual;  /* annotated as debug stats on the VP8_COMP member of the same name */
+
+    int worst_quality;
+    int active_worst_quality;
+    int best_quality;
+    int active_best_quality;
+
+    int ni_av_qi;
+    int ni_tot_qi;
+    int ni_frames;
+    int avg_frame_qindex;
+
+    double rate_correction_factor;
+    double key_frame_rate_correction_factor;
+    double gf_rate_correction_factor;
+
+    int zbin_over_quant;
+
+    int inter_frame_target;
+    int64_t total_byte_count;
+
+    int filter_level;
+
+    int last_frame_percent_intra;
+
+    int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+
+} LAYER_CONTEXT;
+
+typedef struct VP8_COMP
+{
+
+    DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
+
+    DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
+
+    DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
+
+    DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y1quant_fast[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, Y2quant_fast[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, UVquant_fast[QINDEX_RANGE][16]);
+
+
+    MACROBLOCK mb;
+    VP8_COMMON common;
+    vp8_writer bc[9]; /* one boolcoder for each partition */
+
+    VP8_CONFIG oxcf;
+
+    struct lookahead_ctx    *lookahead;
+    struct lookahead_entry  *source;
+    struct lookahead_entry  *alt_ref_source;
+    struct lookahead_entry  *last_source;
+
+    YV12_BUFFER_CONFIG *Source;
+    YV12_BUFFER_CONFIG *un_scaled_source;
+    YV12_BUFFER_CONFIG scaled_source;
+    YV12_BUFFER_CONFIG *last_frame_unscaled_source;
+
+    unsigned int frames_till_alt_ref_frame;
+    /* frame in src_buffers has been identified to be encoded as an alt ref */
+    int source_alt_ref_pending;
+    /* an alt ref frame has been encoded and is usable */
+    int source_alt_ref_active;
+    /* source of frame to encode is an exact copy of an alt ref frame */
+    int is_src_frame_alt_ref;
+
+    /* golden frame same as last frame ( short circuit gold searches) */
+    int gold_is_last;
+    /* Alt reference frame same as last ( short circuit altref search) */
+    int alt_is_last;
+    /* don't do both alt and gold search ( just do gold). */
+    int gold_is_alt;
+
+    YV12_BUFFER_CONFIG pick_lf_lvl_frame;
+
+    TOKENEXTRA *tok;
+    unsigned int tok_count;
+
+
+    unsigned int frames_since_key;
+    unsigned int key_frame_frequency;
+    unsigned int this_key_frame_forced;
+    unsigned int next_key_frame_forced;
+
+    /* Ambient reconstruction err target for force key frames */
+    int ambient_err;
+
+    unsigned int mode_check_freq[MAX_MODES];
+
+    int rd_baseline_thresh[MAX_MODES];
+
+    int RDMULT;
+    int RDDIV ;
+
+    CODING_CONTEXT coding_context;
+
+    /* Rate targeting variables */
+    int64_t last_prediction_error;
+    int64_t last_intra_error;
+
+    int this_frame_target;
+    int projected_frame_size;
+    int last_q[2];                   /* Separate values for Intra/Inter */
+
+    double rate_correction_factor;
+    double key_frame_rate_correction_factor;
+    double gf_rate_correction_factor;
+
+    unsigned int frames_since_golden;
+    /* Count down till next GF */
+    int frames_till_gf_update_due;
+
+    /* GF interval chosen when we coded the last GF */
+    int current_gf_interval;
+
+    /* Total bits overspent because of GF boost (cumulative) */
+    int gf_overspend_bits;
+
+    /* Used in the few frames following a GF to recover the extra bits
+     * spent in that GF
+     */
+    int non_gf_bitrate_adjustment;
+
+    /* Extra bits spent on key frames that need to be recovered */
+    int kf_overspend_bits;
+
+    /* Current number of bits to try and recover on each inter frame. */
+    int kf_bitrate_adjustment;
+    int max_gf_interval;
+    int baseline_gf_interval;
+    int active_arnr_frames;
+
+    int64_t key_frame_count;
+    int prior_key_frame_distance[KEY_FRAME_CONTEXT];
+    /* Current section per frame bandwidth target */
+    int per_frame_bandwidth;
+    /* Average frame size target for clip */
+    int av_per_frame_bandwidth;
+    /* Minimum allocation that should be used for any frame */
+    int min_frame_bandwidth;
+    int inter_frame_target;
+    double output_framerate;
+    int64_t last_time_stamp_seen;
+    int64_t last_end_time_stamp_seen;
+    int64_t first_time_stamp_ever;
+
+    int ni_av_qi;
+    int ni_tot_qi;
+    int ni_frames;
+    int avg_frame_qindex;
+
+    int64_t total_byte_count;
+
+    int buffered_mode;
+
+    double framerate;
+    double ref_framerate;
+    int64_t buffer_level;
+    int64_t bits_off_target;
+
+    int rolling_target_bits;
+    int rolling_actual_bits;
+
+    int long_rolling_target_bits;
+    int long_rolling_actual_bits;
+
+    int64_t total_actual_bits;
+    int total_target_vs_actual; /* debug stats */
+
+    int worst_quality;
+    int active_worst_quality;
+    int best_quality;
+    int active_best_quality;
+
+    int cq_target_quality;
+
+    int drop_frames_allowed; /* Are we permitted to drop frames? */
+    int drop_frame;          /* Drop this frame? */
+
+    vp8_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+    char update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+    unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+
+    int gfu_boost;
+    int kf_boost;
+    int last_boost;
+
+    int target_bandwidth;
+    struct vpx_codec_pkt_list  *output_pkt_list;
+
+#if 0
+    /* Experimental code for lagged and one pass */
+    ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
+    int one_pass_frame_index;
+#endif
+
+    int decimation_factor;
+    int decimation_count;
+
+    /* for real time encoding */
+    int avg_encode_time;     /* microsecond */
+    int avg_pick_mode_time;  /* microsecond */
+    int Speed;
+    int compressor_speed;
+
+    int auto_gold;
+    int auto_adjust_gold_quantizer;
+    int auto_worst_q;
+    int cpu_used;
+    int pass;
+
+
+    int prob_intra_coded;
+    int prob_last_coded;
+    int prob_gf_coded;
+    int prob_skip_false;
+    int last_skip_false_probs[3];
+    int last_skip_probs_q[3];
+    int recent_ref_frame_usage[MAX_REF_FRAMES];
+
+    int this_frame_percent_intra;
+    int last_frame_percent_intra;
+
+    int ref_frame_flags;
+
+    SPEED_FEATURES sf;
+
+    /* Count ZEROMV on all reference frames. */
+    int zeromv_count;
+    int lf_zeromv_pct;
+
+    unsigned char *segmentation_map;
+    signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+    int  segment_encode_breakout[MAX_MB_SEGMENTS];
+
+    unsigned char *active_map;
+    unsigned int active_map_enabled;
+
+    /* Video conferencing cyclic refresh mode flags. This is a mode
+     * designed to clean up the background over time in live encoding
+     * scenarios. It uses segmentation.
+     */
+    int cyclic_refresh_mode_enabled;
+    int cyclic_refresh_mode_max_mbs_perframe;
+    int cyclic_refresh_mode_index;
+    int cyclic_refresh_q;
+    signed char *cyclic_refresh_map;
+    // Count on how many (consecutive) times a macroblock uses ZEROMV_LAST.
+    unsigned char *consec_zero_last;
+    // Counter that is reset when a block is checked for a mode-bias against
+    // ZEROMV_LASTREF.
+    unsigned char *consec_zero_last_mvbias;
+
+    // Frame counter for the temporal pattern. Counter is reset when the temporal
+    // layers are changed dynamically (run-time change).
+    unsigned int temporal_pattern_counter;
+    // Temporal layer id.
+    int temporal_layer_id;
+
+    // Measure of average squared difference between source and denoised signal.
+    int mse_source_denoised;
+
+    int force_maxqp;
+
+#if CONFIG_MULTITHREAD
+    /* multithread data */
+    pthread_mutex_t *pmutex;
+    pthread_mutex_t mt_mutex;           /* mutex for b_multi_threaded */
+    int * mt_current_mb_col;
+    int mt_sync_range;
+    int b_multi_threaded;
+    int encoding_thread_count;
+    int b_lpf_running;
+
+    pthread_t *h_encoding_thread;
+    pthread_t h_filter_thread;
+
+    MB_ROW_COMP *mb_row_ei;
+    ENCODETHREAD_DATA *en_thread_data;
+    LPFTHREAD_DATA lpf_thread_data;
+
+    /* events */
+    sem_t *h_event_start_encoding;
+    sem_t h_event_end_encoding;
+    sem_t h_event_start_lpf;
+    sem_t h_event_end_lpf;
+#endif
+
+    TOKENLIST *tplist;
+    unsigned int partition_sz[MAX_PARTITIONS];
+    unsigned char *partition_d[MAX_PARTITIONS];
+    unsigned char *partition_d_end[MAX_PARTITIONS];
+
+
+    fractional_mv_step_fp *find_fractional_mv_step;
+    vp8_full_search_fn_t full_search_sad;
+    vp8_refining_search_fn_t refining_search_sad;
+    vp8_diamond_search_fn_t diamond_search_sad;
+    vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS];
+    uint64_t time_receive_data;
+    uint64_t time_compress_data;
+    uint64_t time_pick_lpf;
+    uint64_t time_encode_mb_row;
+
+    int base_skip_false_prob[128];
+
+    FRAME_CONTEXT lfc_n; /* last frame entropy */
+    FRAME_CONTEXT lfc_a; /* last alt ref entropy */
+    FRAME_CONTEXT lfc_g; /* last gold ref entropy */
+
+
+    struct twopass_rc
+    {
+        unsigned int section_intra_rating;
+        double section_max_qfactor;
+        unsigned int next_iiratio;
+        unsigned int this_iiratio;
+        FIRSTPASS_STATS total_stats;
+        FIRSTPASS_STATS this_frame_stats;
+        FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
+        FIRSTPASS_STATS total_left_stats;
+        int first_pass_done;
+        int64_t bits_left;
+        int64_t clip_bits_total;
+        double avg_iiratio;
+        double modified_error_total;
+        double modified_error_used;
+        double modified_error_left;
+        double kf_intra_err_min;
+        double gf_intra_err_min;
+        int frames_to_key;
+        int maxq_max_limit;
+        int maxq_min_limit;
+        int gf_decay_rate;
+        int static_scene_max_gf_interval;
+        int kf_bits;
+        /* Remaining error from uncoded frames in a gf group. */
+        int gf_group_error_left;
+        /* Projected total bits available for a key frame group of frames */
+        int64_t kf_group_bits;
+        /* Error score of frames still to be coded in kf group */
+        int64_t kf_group_error_left;
+        /* Projected Bits available for a group including 1 GF or ARF */
+        int64_t gf_group_bits;
+        /* Bits for the golden frame or ARF */
+        int gf_bits;
+        int alt_extra_bits;
+        double est_max_qcorrection_factor;
+    } twopass;
+
+#if VP8_TEMPORAL_ALT_REF
+    YV12_BUFFER_CONFIG alt_ref_buffer;
+    YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+    int fixed_divide[512];
+#endif
+
+#if CONFIG_INTERNAL_STATS
+    int    count;
+    double total_y;
+    double total_u;
+    double total_v;
+    double total ;
+    double total_sq_error;
+    double totalp_y;
+    double totalp_u;
+    double totalp_v;
+    double totalp;
+    double total_sq_error2;
+    int    bytes;
+    double summed_quality;
+    double summed_weights;
+    unsigned int tot_recode_hits;
+
+
+    double total_ssimg_y;
+    double total_ssimg_u;
+    double total_ssimg_v;
+    double total_ssimg_all;
+
+    int b_calculate_ssimg;
+#endif
+    int b_calculate_psnr;
+
+    /* Per MB activity measurement */
+    unsigned int activity_avg;
+    unsigned int * mb_activity_map;
+
+    /* Record of which MBs still refer to last golden frame either
+     * directly or through 0,0
+     */
+    unsigned char *gf_active_flags;
+    int gf_active_count;
+
+    int output_partition;
+
+    /* Store last frame's MV info for next frame MV prediction */
+    int_mv *lfmv;
+    int *lf_ref_frame_sign_bias;
+    int *lf_ref_frame;
+
+    /* force next frame to intra when kf_auto says so */
+    int force_next_frame_intra;
+
+    int droppable;
+
+    int initial_width;
+    int initial_height;
+
+#if CONFIG_TEMPORAL_DENOISING
+    VP8_DENOISER denoiser;
+#endif
+
+    /* Coding layer state variables */
+    unsigned int current_layer;
+    LAYER_CONTEXT layer_context[VPX_TS_MAX_LAYERS];
+
+    int64_t frames_in_layer[VPX_TS_MAX_LAYERS];
+    int64_t bytes_in_layer[VPX_TS_MAX_LAYERS];
+    double sum_psnr[VPX_TS_MAX_LAYERS];
+    double sum_psnr_p[VPX_TS_MAX_LAYERS];
+    double total_error2[VPX_TS_MAX_LAYERS];
+    double total_error2_p[VPX_TS_MAX_LAYERS];
+    double sum_ssim[VPX_TS_MAX_LAYERS];
+    double sum_weights[VPX_TS_MAX_LAYERS];
+
+    double total_ssimg_y_in_layer[VPX_TS_MAX_LAYERS];
+    double total_ssimg_u_in_layer[VPX_TS_MAX_LAYERS];
+    double total_ssimg_v_in_layer[VPX_TS_MAX_LAYERS];
+    double total_ssimg_all_in_layer[VPX_TS_MAX_LAYERS];
+
+#if CONFIG_MULTI_RES_ENCODING
+    /* Number of MBs per row at lower-resolution level */
+    int    mr_low_res_mb_cols;
+    /* Indicate if lower-res mv info is available */
+    unsigned char  mr_low_res_mv_avail;
+#endif
+    /* The frame number of each reference frames */
+    unsigned int current_ref_frames[MAX_REF_FRAMES];
+    // Closest reference frame to current frame.
+    MV_REFERENCE_FRAME closest_reference_frame;
+
+    struct rd_costs_struct
+    {
+        int mvcosts[2][MVvals+1];
+        int mvsadcosts[2][MVfpvals+1];
+        int mbmode_cost[2][MB_MODE_COUNT];
+        int intra_uv_mode_cost[2][MB_MODE_COUNT];
+        int bmode_costs[10][10][10];
+        int inter_bmode_costs[B_MODE_COUNT];
+        int token_costs[BLOCK_TYPES][COEF_BANDS]
+        [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+    } rd_costs;
+} VP8_COMP;
+
+void vp8_initialize_enc(void);
+
+void vp8_alloc_compressor_data(VP8_COMP *cpi);
+int vp8_reverse_trans(int x);
+void vp8_new_framerate(VP8_COMP *cpi, double framerate);
+void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
+
+void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
+                        unsigned char *dest_end, unsigned long *size);
+
+void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **);
+
+void vp8_set_speed_features(VP8_COMP *cpi);
+
+#if CONFIG_DEBUG  /* CHECK_MEM_ERROR: assign expr to lval, report VPX_CODEC_MEM_ERROR if lval is then zero */
+#define CHECK_MEM_ERROR(lval,expr) do {\
+        lval = (expr); \
+        if(!lval) \
+            vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+                               "Failed to allocate "#lval" at %s:%d", \
+                               __FILE__,__LINE__);\
+    } while(0)
+#else  /* same check, message without file and line; both forms need a |cpi| in the caller's scope */
+#define CHECK_MEM_ERROR(lval,expr) do {\
+        lval = (expr); \
+        if(!lval) \
+            vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+                               "Failed to allocate "#lval);\
+    } while(0)
+#endif  /* CONFIG_DEBUG */
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_ONYX_INT_H_
diff --git a/libs/libvpx/vp8/encoder/pickinter.c b/libs/libvpx/vp8/encoder/pickinter.c
new file mode 100644
index 0000000000..0ea0632918
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/pickinter.c
@@ -0,0 +1,1585 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include "vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "onyx_int.h"
+#include "modecosts.h"
+#include "encodeintra.h"
+#include "vp8/common/common.h"
+#include "vp8/common/entropymode.h"
+#include "pickinter.h"
+#include "vp8/common/findnearmv.h"
+#include "encodemb.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vpx_dsp/variance.h"
+#include "mcomp.h"
+#include "rdopt.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "denoising.h"
+#endif
+
+#ifdef SPEEDSTATS
+extern unsigned int cnt_pm;
+#endif
+
+#define MODEL_MODE 0
+
+extern const int vp8_ref_frame_order[MAX_MODES];
+extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
+
+// Fixed point implementation of a skin color classifier. Skin color
+// is modeled by a Gaussian distribution in the CbCr color space.
+// See ../../test/skin_color_detector_test.cc where the reference
+// skin color classifier is defined.
+
+// Fixed-point skin color model parameters. skin_threshold[0] is used when MODEL_MODE == 0, [1] otherwise.
+static const int skin_mean[5][2] =  // one {cb, cr} mean per model, compared against (sample << 6)
+    {{7463, 9614}, {6400, 10240}, {7040, 10240}, {8320, 9280}, {6800, 9614}};
+static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157};  // q16
+static const int skin_threshold[2] = {1570636, 800000};       // q18
+
+// Weighted quadratic (Mahalanobis-style) distance between the input (cb, cr)
+// pair and skin model |idx|; callers compare it against skin_threshold[].
+static int evaluate_skin_color_difference(int cb, int cr, int idx) {
+  // Deviation of each chroma sample from the model mean, both in q6.
+  const int dcb = (cb << 6) - skin_mean[idx][0];
+  const int dcr = (cr << 6) - skin_mean[idx][1];
+  // Second-order terms (q12), rounded and brought down to q2.
+  const int half = 1 << 9;
+  const int cb_sq = (dcb * dcb + half) >> 10;
+  const int cross = (dcb * dcr + half) >> 10;
+  const int cr_sq = (dcr * dcr + half) >> 10;
+  // Accumulate in the order in which the inverse-covariance entries are
+  // stored; the cross term is counted once per off-diagonal entry.
+  int dist = skin_inv_cov[0] * cb_sq;
+  dist += skin_inv_cov[1] * cross;
+  dist += skin_inv_cov[2] * cross;
+  dist += skin_inv_cov[3] * cr_sq;
+  return dist;
+}
+
+// Returns 1 when the given (y, cb, cr) sample is classified as skin, else 0.
+static int is_skin_color(int y, int cb, int cr)
+{
+  int model;
+
+  // A |y| value outside [40, 220] is never treated as skin.
+  if (y < 40 || y > 220)
+  {
+    return 0;
+  }
+
+  // MODEL_MODE 0: a single test against model 0 and its own threshold.
+  if (MODEL_MODE == 0)
+  {
+    return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
+  }
+
+  // Otherwise: accept as soon as any of the five models is close enough.
+  for (model = 0; model < 5; model++)
+  {
+    if (evaluate_skin_color_difference(cb, cr, model) < skin_threshold[1])
+    {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+static int macroblock_corner_grad(unsigned char* signal, int stride,
+                                  int offsetx, int offsety, int sgnx, int sgny)
+{
+  // Max abs difference between sample (row offsetx, col offsety) and its 3 neighbours.
+  const unsigned char *cur = signal + (offsetx * stride + offsety);
+  const unsigned char *adj = signal + ((offsetx + sgnx) * stride + offsety);
+  return VPXMAX(VPXMAX(abs(cur[0] - cur[sgny]), abs(cur[0] - adj[0])),
+                abs(cur[0] - adj[sgny]));
+}
+
+static int check_dot_artifact_candidate(VP8_COMP *cpi,
+                                        MACROBLOCK *x,
+                                        unsigned char *target_last,
+                                        int stride,
+                                        unsigned char* last_ref,
+                                        int mb_row,
+                                        int mb_col,
+                                        int channel)
+{
+  // Returns 1 when some corner of this block has a strong gradient in
+  // |last_ref| but only a weak one in |target_last|, and 0 otherwise.
+  // Side effects on |x|: zero_last_dot_suppress is cleared on entry and set
+  // once the block is actually examined; mbs_zero_last_dot_suppress is
+  // incremented for every positive result.
+  // NOTE(review): the same |stride| indexes both buffers - confirm callers
+  // always pass buffers that share a stride.
+
+  const int min_grad_last = 6;    // corner gradient on last ref: at least
+  const int max_grad_source = 3;  // corner gradient on source: at most
+  // No more than a tenth of the frame's macroblocks may be flagged.
+  const unsigned int max_flagged = (cpi->common.MBs) / 10;
+  const int mb_index = mb_row * cpi->common.mb_cols + mb_col;
+
+  // Offset of the far row/column: 15 for channel 0, 7 for channels > 0.
+  const int edge = (channel > 0) ? 7 : 15;
+  // Threshold for #consecutive (base layer) frames using zero_last mode;
+  // it is lowered when more than one layer is configured.
+  const int min_run = (cpi->oxcf.number_of_layers > 1) ? 20 : 30;
+  int corner;
+
+  x->zero_last_dot_suppress = 0;
+
+  // Only blocks on base layer frames that have used ZEROMV_LAST for more
+  // than |min_run| consecutive frames are candidates for increasing the rd
+  // adjustment for zero_last mode. At most |max_flagged| blocks per frame
+  // are allowed, and never for screen content input.
+  if (cpi->current_layer != 0 ||
+      cpi->consec_zero_last_mvbias[mb_index] <= min_run ||
+      x->mbs_zero_last_dot_suppress >= max_flagged ||
+      cpi->oxcf.screen_content_mode)
+  {
+    return 0;
+  }
+
+  // Label the block as checked so it is not checked again until roughly
+  // |min_run| frames later.
+  x->zero_last_dot_suppress = 1;
+
+  // Dot artifact is noticeable as strong gradient at corners of macroblock,
+  // for flat areas. As a simple detector, look for a high corner gradient
+  // on last ref together with a smaller gradient on source. The corners are
+  // visited as top-left, top-right, bottom-left, bottom-right, and the
+  // first one that satisfies the condition ends the search.
+  for (corner = 0; corner < 4; corner++)
+  {
+    // Bit 0 of |corner| picks the far column, bit 1 the far row; the step
+    // is +1 from offset 0 and -1 from offset |edge|.
+    const int on_right = corner & 1;
+    const int on_bottom = corner & 2;
+    const int row = on_bottom ? edge : 0;
+    const int col = on_right ? edge : 0;
+    const int row_step = on_bottom ? -1 : 1;
+    const int col_step = on_right ? -1 : 1;
+    const int grad_last =
+        macroblock_corner_grad(last_ref, stride, row, col,
+                               row_step, col_step);
+    const int grad_source =
+        macroblock_corner_grad(target_last, stride, row, col,
+                               row_step, col_step);
+
+    if (grad_last >= min_grad_last && grad_source <= max_grad_source)
+    {
+      x->mbs_zero_last_dot_suppress++;
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
+                                int_mv *bestmv, int_mv *ref_mv,
+                                int error_per_bit,
+                                const vp8_variance_fn_ptr_t *vfp,
+                                int *mvcost[2], int *distortion,
+                                unsigned int *sse)
+{
+    /* Stand-in for a fractional MV search: no refinement is performed, the
+     * vector in |bestmv| is only rescaled by a factor of 8. All other
+     * arguments are ignored. Always returns 0.
+     */
+    (void) mb; (void) b; (void) d;
+    (void) ref_mv; (void) error_per_bit; (void) vfp;
+    (void) mvcost; (void) distortion; (void) sse;
+    /* Multiply instead of "<<= 3": left-shifting a negative value is
+     * undefined behavior in C, and MV components can be negative. */
+    bestmv->as_mv.row *= 8;
+    bestmv->as_mv.col *= 8;
+    return 0;
+}
+
+
+int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
+                                  const vp8_variance_fn_ptr_t *vfp,
+                                  unsigned int *sse,
+                                  int_mv this_mv)
+{
+    /* Prediction error of the block-0 source against the |pre| y buffer
+     * displaced by |this_mv|. The low three bits of each vector component
+     * are handed to vfp->svf as x/y offsets, the remaining bits move the
+     * reference pointer. The return value is whatever the selected |vfp|
+     * callback returns; |sse| is passed through to it.
+     */
+    BLOCK *src_blk = &mb->block[0];
+    MACROBLOCKD *xd = &mb->e_mbd;
+    const int ref_stride = xd->pre.y_stride;
+    const int mv_row = this_mv.as_mv.row;
+    const int mv_col = this_mv.as_mv.col;
+    const int frac_x = mv_col & 7;
+    const int frac_y = mv_row & 7;
+    unsigned char *src = *(src_blk->base_src) + src_blk->src;
+    unsigned char *ref = xd->pre.y_buffer + xd->block[0].offset +
+                         ((mv_row >> 3) * ref_stride + (mv_col >> 3));
+
+    /* No fractional part: plain variance callback. */
+    if (frac_x == 0 && frac_y == 0)
+        return vfp->vf(src, src_blk->src_stride, ref, ref_stride, sse);
+
+    return vfp->svf(ref, ref_stride, frac_x, frac_y, src, src_blk->src_stride, sse);
+}
+
+static int get_prediction_error(BLOCK *be, BLOCKD *b)
+{
+    /* Value of vpx_get4x4sse_cs() for the source samples of |be| against
+     * the predictor stored in |b|. The predictor is read with a fixed
+     * stride of 16.
+     */
+    unsigned char *const src = *(be->base_src) + be->src;
+
+    return vpx_get4x4sse_cs(src, be->src_stride, b->predictor, 16);
+}
+
+static int pick_intra4x4block(
+    MACROBLOCK *x,
+    int ib,
+    B_PREDICTION_MODE *best_mode,
+    const int *mode_costs,
+
+    int *bestrate,
+    int *bestdistortion)
+{
+    /* Tries the 4x4 intra modes B_DC_PRED..B_HE_PRED for block |ib| and keeps
+     * the one with the lowest RDCOST (first one wins on ties). The winner is
+     * reported through the three output pointers and stored in the block's
+     * bmi, the block is encoded with it, and the winning cost is returned.
+     */
+    BLOCK *src_blk = &x->block[ib];
+    BLOCKD *blk = &x->e_mbd.block[ib];
+    const int stride = x->e_mbd.dst.y_stride;
+    unsigned char *cur = x->e_mbd.dst.y_buffer + blk->offset;
+    unsigned char *above_row = cur - stride;
+    unsigned char *left_col = cur - 1;
+    unsigned char corner = above_row[-1];
+    int lowest_rd = INT_MAX;
+    B_PREDICTION_MODE candidate;
+
+    for (candidate = B_DC_PRED; candidate <= B_HE_PRED; candidate++)
+    {
+        const int cost = mode_costs[candidate];
+        int err, rd;
+
+        vp8_intra4x4_predict(above_row, left_col, stride, candidate,
+                             blk->predictor, 16, corner);
+        err = get_prediction_error(src_blk, blk);
+        rd = RDCOST(x->rdmult, x->rddiv, cost, err);
+
+        if (rd >= lowest_rd)
+            continue;
+
+        lowest_rd = rd;
+        *best_mode = candidate;
+        *bestrate = cost;
+        *bestdistortion = err;
+    }
+
+    blk->bmi.as_mode = *best_mode;
+    vp8_encode_intra4x4block(x, ib);
+    return lowest_rd;
+}
+
+
+static int pick_intra4x4mby_modes
+(
+    MACROBLOCK *mb,
+    int *Rate,
+    int *best_dist
+)
+{
+    MACROBLOCKD *const xd = &mb->e_mbd;
+    int i;
+    int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
+    int error;
+    int distortion = 0;
+    const int *bmode_costs;
+    /* Picks a mode for each of the 16 4x4 blocks; INT_MAX is returned on early exit. */
+    intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16);
+    /* Default cost table; key frames use a table indexed by above/left modes. */
+    bmode_costs = mb->inter_bmode_costs;
+
+    for (i = 0; i < 16; i++)
+    {
+        MODE_INFO *const mic = xd->mode_info_context;
+        const int mis = xd->mode_info_stride;
+
+        B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+        int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d);
+
+        if (mb->e_mbd.frame_type == KEY_FRAME)
+        {
+            const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
+            const B_PREDICTION_MODE L = left_block_mode(mic, i);
+
+            bmode_costs  = mb->bmode_costs[A][L];
+        }
+
+        /* Pick the mode for block |i|; its rate and distortion come back in r and d. */
+        pick_intra4x4block(mb, i, &best_mode, bmode_costs, &r, &d);
+
+        cost += r;
+        distortion += d;
+        mic->bmi[i].as_mode = best_mode;
+
+        /* Break out case where we have already exceeded best so far value
+         * that was passed in
+         */
+        if (distortion > *best_dist)
+            break;
+    }
+
+    *Rate = cost;  /* written even when the loop above stopped early */
+    /* |i| only reaches 16 when the loop was not cut short by the distortion check. */
+    if (i == 16)
+    {
+        *best_dist = distortion;
+        error = RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
+    }
+    else
+    {
+        *best_dist = INT_MAX;
+        error = INT_MAX;
+    }
+
+    return error;
+}
+
+static void pick_intra_mbuv_mode(MACROBLOCK *mb)
+{
+    /* Chooses the uv intra mode with the smallest squared error over the 8x8 U and V source samples. */
+    MACROBLOCKD *x = &mb->e_mbd;
+    unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride;
+    unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride;
+    unsigned char *usrc_ptr = (mb->block[16].src + *mb->block[16].base_src);
+    unsigned char *vsrc_ptr = (mb->block[20].src + *mb->block[20].base_src);
+    int uvsrc_stride = mb->block[16].src_stride;
+    unsigned char uleft_col[8];
+    unsigned char vleft_col[8];
+    unsigned char utop_left = uabove_row[-1];
+    unsigned char vtop_left = vabove_row[-1];
+    int i, j;
+    int expected_udc;
+    int expected_vdc;
+    int shift;
+    int Uaverage = 0;
+    int Vaverage = 0;
+    int diff;
+    int pred_error[4] = {0, 0, 0, 0}, best_error = INT_MAX;
+    MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+
+    /* Copy the column just left of the U and V blocks (8 samples each). */
+    for (i = 0; i < 8; i++)
+    {
+        uleft_col[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
+        vleft_col[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
+    }
+    /* DC value: 128 with no neighbours, else the rounded mean of the available above/left samples. */
+    if (!x->up_available && !x->left_available)
+    {
+        expected_udc = 128;
+        expected_vdc = 128;
+    }
+    else
+    {
+        shift = 2;
+
+        if (x->up_available)
+        {
+
+            for (i = 0; i < 8; i++)
+            {
+                Uaverage += uabove_row[i];
+                Vaverage += vabove_row[i];
+            }
+
+            shift ++;
+
+        }
+
+        if (x->left_available)
+        {
+            for (i = 0; i < 8; i++)
+            {
+                Uaverage += uleft_col[i];
+                Vaverage += vleft_col[i];
+            }
+
+            shift ++;
+
+        }
+
+        expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
+        expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
+    }
+
+    /* Accumulate each mode's squared error, one source row per outer iteration. */
+    for (i = 0; i < 8; i++)
+    {
+        for (j = 0; j < 8; j++)
+        {
+            /* Prediction from left + above - top_left, clamped to [0, 255] below. */
+            int predu = uleft_col[i] + uabove_row[j] - utop_left;
+            int predv = vleft_col[i] + vabove_row[j] - vtop_left;
+            int u_p, v_p;
+
+            u_p = usrc_ptr[j];
+            v_p = vsrc_ptr[j];
+
+            if (predu < 0)
+                predu = 0;
+
+            if (predu > 255)
+                predu = 255;
+
+            if (predv < 0)
+                predv = 0;
+
+            if (predv > 255)
+                predv = 255;
+
+            /* Error against the DC value. */
+            diff = u_p - expected_udc;
+            pred_error[DC_PRED] += diff * diff;
+            diff = v_p - expected_vdc;
+            pred_error[DC_PRED] += diff * diff;
+
+            /* Error against the row above. */
+            diff = u_p - uabove_row[j];
+            pred_error[V_PRED] += diff * diff;
+            diff = v_p - vabove_row[j];
+            pred_error[V_PRED] += diff * diff;
+
+            /* Error against the left column. */
+            diff = u_p - uleft_col[i];
+            pred_error[H_PRED] += diff * diff;
+            diff = v_p - vleft_col[i];
+            pred_error[H_PRED] += diff * diff;
+
+            /* Error against the clamped left + above - top_left prediction. */
+            diff = u_p - predu;
+            pred_error[TM_PRED] += diff * diff;
+            diff = v_p - predv;
+            pred_error[TM_PRED] += diff * diff;
+
+
+        }
+
+        usrc_ptr += uvsrc_stride;
+        vsrc_ptr += uvsrc_stride;
+        /* After the fourth row, restart from the source pointers of blocks 18 and 22. */
+        if (i == 3)
+        {
+            usrc_ptr = (mb->block[18].src + *mb->block[18].base_src);
+            vsrc_ptr = (mb->block[22].src + *mb->block[22].base_src);
+        }
+
+
+
+    }
+
+    /* Lowest error wins; on a tie the earlier mode is kept. */
+    for (i = DC_PRED; i <= TM_PRED; i++)
+    {
+        if (best_error > pred_error[i])
+        {
+            best_error = pred_error[i];
+            best_mode = (MB_PREDICTION_MODE)i;
+        }
+    }
+
+    /* Record the chosen mode in the macroblock's mode info. */
+    mb->e_mbd.mode_info_context->mbmi.uv_mode = best_mode;
+
+}
+
+static void update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv)
+{
+    /* Split MV modes are currently not supported when RD is not enabled,
+     * so MVcount only needs updating for NEWMV. Each bin is mv_max plus the
+     * difference to |best_ref_mv| shifted right by one. */
+    MODE_INFO *mi = x->e_mbd.mode_info_context;
+    int bin;
+    if (mi->mbmi.mode != NEWMV) return;
+    bin = mv_max + ((mi->mbmi.mv.as_mv.row - best_ref_mv->as_mv.row) >> 1);
+    x->MVcount[0][bin]++;
+    bin = mv_max + ((mi->mbmi.mv.as_mv.col - best_ref_mv->as_mv.col) >> 1);
+    x->MVcount[1][bin]++;
+}
+
+
+#if CONFIG_MULTI_RES_ENCODING
+static
+void get_lower_res_motion_info(VP8_COMP *cpi, MACROBLOCKD *xd, int *dissim,
+                               int *parent_ref_frame,
+                               MB_PREDICTION_MODE *parent_mode,
+                               int_mv *parent_ref_mv, int mb_row, int mb_col)
+{
+    /* Finds the lower-resolution ("parent") macroblock that corresponds to
+     * (mb_row, mb_col) and copies its reference frame, mode and dissim value
+     * to the output arguments. For a non-intra parent the stored motion
+     * vector is rescaled, written to |parent_ref_mv| and clamped with
+     * vp8_clamp_mv2(); for an intra parent |parent_ref_mv| is not written.
+     */
+    LOWER_RES_FRAME_INFO *low_res_frame =
+        (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+    LOWER_RES_MB_INFO *parent;
+    unsigned int parent_mb_index;
+    int parent_mb_row, parent_mb_col;
+
+    /* Map the MB position onto the lower-resolution grid.
+     * TODO: Removed the loop that supports special down_sampling_factor
+     * such as 2, 4, 8. Will revisit it if needed. Should also try using a
+     * look-up table to see if it helps performance. */
+    parent_mb_row = mb_row*cpi->oxcf.mr_down_sampling_factor.den
+                /cpi->oxcf.mr_down_sampling_factor.num;
+    parent_mb_col = mb_col*cpi->oxcf.mr_down_sampling_factor.den
+                /cpi->oxcf.mr_down_sampling_factor.num;
+    parent_mb_index = parent_mb_row*cpi->mr_low_res_mb_cols + parent_mb_col;
+    parent = &low_res_frame->mb_info[parent_mb_index];
+
+    /* Mode & motion result stored for the lower resolution. */
+    *parent_ref_frame = parent->ref_frame;
+    *parent_mode = parent->mode;
+    *dissim = parent->dissim;
+
+    /* The highest-resolution encoder shifts dissim right by one. */
+    if (cpi->oxcf.mr_encoder_id == (cpi->oxcf.mr_total_resolutions - 1))
+        *dissim >>= 1;
+
+    if (*parent_ref_frame == INTRA_FRAME)
+        return;
+
+    /* Rescale by num/den; plain integer division, no rounding. */
+    parent_ref_mv->as_mv.row = parent->mv.as_mv.row
+                              *cpi->oxcf.mr_down_sampling_factor.num
+                              /cpi->oxcf.mr_down_sampling_factor.den;
+    parent_ref_mv->as_mv.col = parent->mv.as_mv.col
+                              *cpi->oxcf.mr_down_sampling_factor.num
+                              /cpi->oxcf.mr_down_sampling_factor.den;
+    vp8_clamp_mv2(parent_ref_mv, xd);
+}
+#endif
+
+static void check_for_encode_breakout(unsigned int sse, MACROBLOCK* x)
+{
+    /* May update x->skip, but only when |sse| is below a limit: the larger
+     * of (dequant[1] * dequant[1] >> 4) for block 0 and x->encode_breakout.
+     * At or above that limit x->skip is left as it was.
+     */
+    MACROBLOCKD *mbd = &x->e_mbd;
+    unsigned int limit = (mbd->block[0].dequant[1]
+        * mbd->block[0].dequant[1] >>4);
+    unsigned int uv_sse;
+
+    if (limit < x->encode_breakout)
+        limit = x->encode_breakout;
+
+    if (sse >= limit)
+        return;
+
+    /* Check u and v to make sure skip is ok: skip is set only while twice
+     * the VP8_UVSSE() value is still below the breakout value. */
+    uv_sse = VP8_UVSSE(x);
+
+    x->skip = (uv_sse * 2 < x->encode_breakout) ? 1 : 0;
+}
+/* RD cost of the mode currently stored in |x|'s mode info; |rd_adj| is a percentage applied for ZEROMV on LAST_FRAME. */
+static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2,
+                               VP8_COMP *cpi, MACROBLOCK *x, int rd_adj)
+{
+    MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+    int_mv mv = x->e_mbd.mode_info_context->mbmi.mv;
+    int this_rd;
+    int denoise_aggressive = 0;
+    /* Exit early and don't compute the distortion if this macroblock
+     * is marked inactive. */
+    if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+    {
+        *sse = 0;
+        *distortion2 = 0;
+        x->skip = 1;
+        return INT_MAX;
+    }
+    /* NEWMV with half_pixel_search set and full_pixel != 1 keeps the caller's *distortion2. */
+    if((this_mode != NEWMV) ||
+        !(cpi->sf.half_pixel_search) || cpi->common.full_pixel==1)
+        *distortion2 = vp8_get_inter_mbpred_error(x,
+                                              &cpi->fn_ptr[BLOCK_16X16],
+                                              sse, mv);
+
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate2, *distortion2);
+
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0) {
+      denoise_aggressive =
+        (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) ? 1 : 0;
+    }
+#endif
+
+    // Adjust rd for ZEROMV and LAST, if LAST is the closest reference frame.
+    // TODO: We should also add condition on distance of closest to current.
+    if(!cpi->oxcf.screen_content_mode &&
+       this_mode == ZEROMV &&
+       x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME &&
+       (denoise_aggressive || (cpi->closest_reference_frame == LAST_FRAME)))
+    {
+        // No adjustment if block is considered to be skin area.
+        if(x->is_skin)
+            rd_adj = 100;
+
+        this_rd = ((int64_t)this_rd) * rd_adj / 100;
+    }
+
+    check_for_encode_breakout(*sse, x);  /* may update x->skip */
+    return this_rd;
+}
+
+static void calculate_zeromv_rd_adjustment(VP8_COMP *cpi, MACROBLOCK *x,
+                                    int *rd_adjustment)
+{
+    /* Lowers |*rd_adjustment| when the left, above-left and above
+     * neighbours show little motion. Only active while cpi->lf_zeromv_pct
+     * is above 40; otherwise, or when no neighbour qualifies,
+     * |*rd_adjustment| is left unchanged.
+     * A neighbour qualifies when it is not intra coded and both components
+     * of its motion vector have an absolute value below 8.
+     */
+    MODE_INFO *neighbour[3];
+    int low_motion = 0;
+    int n;
+
+    if (cpi->lf_zeromv_pct <= 40)
+        return;
+
+    /* Order of the entries: left, above-left, above. */
+    neighbour[0] = x->e_mbd.mode_info_context - 1;
+    neighbour[1] = neighbour[0] - x->e_mbd.mode_info_stride;
+    neighbour[2] = neighbour[1] + 1;
+
+    for (n = 0; n < 3; n++)
+    {
+        const int_mv mv = neighbour[n]->mbmi.mv;
+
+        if (neighbour[n]->mbmi.ref_frame != INTRA_FRAME &&
+            abs(mv.as_mv.row) < 8 && abs(mv.as_mv.col) < 8)
+            low_motion++;
+    }
+
+    /* 80 when all three qualify, or when at least one qualifies and
+     * mb_to_top_edge or mb_to_left_edge is zero; 90 for any other hit. */
+    if (low_motion > 2 ||
+        (low_motion > 0 &&
+         (!x->e_mbd.mb_to_top_edge || !x->e_mbd.mb_to_left_edge)))
+        *rd_adjustment = 80;
+    else if (low_motion > 0)
+        *rd_adjustment = 90;
+}
+
+void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+                         int recon_uvoffset, int *returnrate,
+                         int *returndistortion, int *returnintra, int mb_row,
+                         int mb_col)
+{
+    BLOCK *b = &x->block[0];
+    BLOCKD *d = &x->e_mbd.block[0];
+    MACROBLOCKD *xd = &x->e_mbd;
+    MB_MODE_INFO best_mbmode;
+
+    int_mv best_ref_mv_sb[2];
+    int_mv mode_mv_sb[2][MB_MODE_COUNT];
+    int_mv best_ref_mv;
+    int_mv *mode_mv;
+    MB_PREDICTION_MODE this_mode;
+    int num00;
+    int mdcounts[4];
+    int best_rd = INT_MAX;
+    int rd_adjustment = 100;
+    int best_intra_rd = INT_MAX;
+    int mode_index;
+    int rate;
+    int rate2;
+    int distortion2;
+    int bestsme = INT_MAX;
+    int best_mode_index = 0;
+    unsigned int sse = UINT_MAX, best_rd_sse = UINT_MAX;
+#if CONFIG_TEMPORAL_DENOISING
+    unsigned int zero_mv_sse = UINT_MAX, best_sse = UINT_MAX;
+#endif
+
+    int sf_improved_mv_pred = cpi->sf.improved_mv_pred;
+
+#if CONFIG_MULTI_RES_ENCODING
+    int dissim = INT_MAX;
+    int parent_ref_frame = 0;
+    int_mv parent_ref_mv;
+    MB_PREDICTION_MODE parent_mode = 0;
+    int parent_ref_valid = 0;
+#endif
+
+    int_mv mvp;
+
+    int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    int saddone=0;
+    /* search range got from mv_pred(). It uses step_param levels. (0-7) */
+    int sr=0;
+
+    unsigned char *plane[4][3];
+    int ref_frame_map[4];
+    int sign_bias = 0;
+    int dot_artifact_candidate = 0;
+    get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
+
+    // If the current frame is using LAST as a reference, check for
+    // biasing the mode selection for dot artifacts.
+    if (cpi->ref_frame_flags & VP8_LAST_FRAME) {
+      unsigned char* target_y = x->src.y_buffer;
+      unsigned char* target_u = x->block[16].src + *x->block[16].base_src;
+      unsigned char* target_v = x->block[20].src + *x->block[20].base_src;
+      int stride = x->src.y_stride;
+      int stride_uv = x->block[16].src_stride;
+#if CONFIG_TEMPORAL_DENOISING
+      if (cpi->oxcf.noise_sensitivity) {
+        const int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0;
+        target_y =
+            cpi->denoiser.yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset;
+        stride = cpi->denoiser.yv12_running_avg[LAST_FRAME].y_stride;
+        if (uv_denoise) {
+          target_u =
+              cpi->denoiser.yv12_running_avg[LAST_FRAME].u_buffer +
+                  recon_uvoffset;
+          target_v =
+              cpi->denoiser.yv12_running_avg[LAST_FRAME].v_buffer +
+                  recon_uvoffset;
+          stride_uv = cpi->denoiser.yv12_running_avg[LAST_FRAME].uv_stride;
+        }
+      }
+#endif
+      dot_artifact_candidate =
+          check_dot_artifact_candidate(cpi, x, target_y, stride,
+              plane[LAST_FRAME][0], mb_row, mb_col, 0);
+      // If not found in Y channel, check UV channel.
+      if (!dot_artifact_candidate) {
+        dot_artifact_candidate =
+            check_dot_artifact_candidate(cpi, x, target_u, stride_uv,
+                plane[LAST_FRAME][1], mb_row, mb_col, 1);
+        if (!dot_artifact_candidate) {
+          dot_artifact_candidate =
+              check_dot_artifact_candidate(cpi, x, target_v, stride_uv,
+                  plane[LAST_FRAME][2], mb_row, mb_col, 2);
+        }
+      }
+    }
+
+#if CONFIG_MULTI_RES_ENCODING
+    // |parent_ref_valid| will be set here if potentially we can do mv reuse for
+    // this higher resol (|cpi->oxcf.mr_encoder_id| > 0) frame.
+    // |parent_ref_valid| may be reset depending on |parent_ref_frame| for
+    // the current macroblock below.
+    parent_ref_valid = cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail;
+    if (parent_ref_valid)
+    {
+        int parent_ref_flag;
+
+        get_lower_res_motion_info(cpi, xd, &dissim, &parent_ref_frame,
+                                  &parent_mode, &parent_ref_mv, mb_row, mb_col);
+
+        /* TODO(jkoleszar): The references available (ref_frame_flags) to the
+         * lower res encoder should match those available to this encoder, but
+         * there seems to be a situation where this mismatch can happen in the
+         * case of frame dropping and temporal layers. For example,
+         * GOLD being disallowed in ref_frame_flags, but being returned as
+         * parent_ref_frame.
+         *
+         * In this event, take the conservative approach of disabling the
+         * lower res info for this MB.
+         */
+
+        parent_ref_flag = 0;
+        // Note availability for mv reuse is only based on last and golden.
+        if (parent_ref_frame == LAST_FRAME)
+            parent_ref_flag = (cpi->ref_frame_flags & VP8_LAST_FRAME);
+        else if (parent_ref_frame == GOLDEN_FRAME)
+            parent_ref_flag = (cpi->ref_frame_flags & VP8_GOLD_FRAME);
+
+        //assert(!parent_ref_frame || parent_ref_flag);
+
+        // If |parent_ref_frame| did not match either last or golden then
+        // shut off mv reuse.
+        if (parent_ref_frame && !parent_ref_flag)
+            parent_ref_valid = 0;
+
+        // Don't do mv reuse since we want to allow for another mode besides
+        // ZEROMV_LAST to remove dot artifact.
+        if (dot_artifact_candidate)
+          parent_ref_valid = 0;
+    }
+#endif
+
+    // Check if current macroblock is in skin area.
+    {
+    const int y = (x->src.y_buffer[7 * x->src.y_stride + 7] +
+        x->src.y_buffer[7 * x->src.y_stride + 8] +
+        x->src.y_buffer[8 * x->src.y_stride + 7] +
+        x->src.y_buffer[8 * x->src.y_stride + 8]) >> 2;
+    const int cb = (x->src.u_buffer[3 * x->src.uv_stride + 3] +
+        x->src.u_buffer[3 * x->src.uv_stride + 4] +
+        x->src.u_buffer[4 * x->src.uv_stride + 3] +
+        x->src.u_buffer[4 * x->src.uv_stride + 4]) >> 2;
+    const int cr = (x->src.v_buffer[3 * x->src.uv_stride + 3] +
+        x->src.v_buffer[3 * x->src.uv_stride + 4] +
+        x->src.v_buffer[4 * x->src.uv_stride + 3] +
+        x->src.v_buffer[4 * x->src.uv_stride + 4]) >> 2;
+    x->is_skin = 0;
+    if (!cpi->oxcf.screen_content_mode)
+      x->is_skin = is_skin_color(y, cb, cr);
+    }
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity) {
+      // Under aggressive denoising mode, should we use skin map to reduce denoiser
+      // and ZEROMV bias? Will need to revisit the accuracy of this detection for
+      // very noisy input. For now keep this as is (i.e., don't turn it off).
+      // if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive)
+      //   x->is_skin = 0;
+    }
+#endif
+
+    mode_mv = mode_mv_sb[sign_bias];
+    best_ref_mv.as_int = 0;
+    memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
+    memset(&best_mbmode, 0, sizeof(best_mbmode));
+
+    /* Setup search priorities */
+#if CONFIG_MULTI_RES_ENCODING
+    if (parent_ref_valid && parent_ref_frame && dissim < 8)
+    {
+        ref_frame_map[0] = -1;
+        ref_frame_map[1] = parent_ref_frame;
+        ref_frame_map[2] = -1;
+        ref_frame_map[3] = -1;
+    } else
+#endif
+    get_reference_search_order(cpi, ref_frame_map);
+
+    /* Check to see if there is at least 1 valid reference frame that we need
+     * to calculate near_mvs.
+     */
+    if (ref_frame_map[1] > 0)
+    {
+        sign_bias = vp8_find_near_mvs_bias(&x->e_mbd,
+                                           x->e_mbd.mode_info_context,
+                                           mode_mv_sb,
+                                           best_ref_mv_sb,
+                                           mdcounts,
+                                           ref_frame_map[1],
+                                           cpi->common.ref_frame_sign_bias);
+
+        mode_mv = mode_mv_sb[sign_bias];
+        best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
+    }
+
+    /* Count of the number of MBs tested so far this frame */
+    x->mbs_tested_so_far++;
+
+    *returnintra = INT_MAX;
+    x->skip = 0;
+
+    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+    /* If the frame has big static background and current MB is in low
+    *  motion area, its mode decision is biased to ZEROMV mode.
+    *  No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12).
+    *  At such speed settings, ZEROMV is already heavily favored.
+    */
+    if (cpi->Speed < 12) {
+      calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
+    }
+
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity) {
+      rd_adjustment = (int)(rd_adjustment *
+          cpi->denoiser.denoise_pars.pickmode_mv_bias / 100);
+    }
+#endif
+
+    if (dot_artifact_candidate)
+    {
+        // Bias against ZEROMV_LAST mode.
+        rd_adjustment = 150;
+    }
+
+
+    /* if we encode a new mv this is important
+     * find the best new motion vector
+     */
+    for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
+    {
+        int frame_cost;
+        int this_rd = INT_MAX;
+        int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
+
+        if (best_rd <= x->rd_threshes[mode_index])
+            continue;
+
+        if (this_ref_frame < 0)
+            continue;
+
+        x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+
+        /* everything but intra */
+        if (x->e_mbd.mode_info_context->mbmi.ref_frame)
+        {
+            x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+            x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+            x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+            if (sign_bias != cpi->common.ref_frame_sign_bias[this_ref_frame])
+            {
+                sign_bias = cpi->common.ref_frame_sign_bias[this_ref_frame];
+                mode_mv = mode_mv_sb[sign_bias];
+                best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
+            }
+
+#if CONFIG_MULTI_RES_ENCODING
+            if (parent_ref_valid)
+            {
+                if (vp8_mode_order[mode_index] == NEARESTMV &&
+                    mode_mv[NEARESTMV].as_int ==0)
+                    continue;
+                if (vp8_mode_order[mode_index] == NEARMV &&
+                    mode_mv[NEARMV].as_int ==0)
+                    continue;
+
+                if (vp8_mode_order[mode_index] == NEWMV && parent_mode == ZEROMV
+                    && best_ref_mv.as_int==0)
+                    continue;
+                else if(vp8_mode_order[mode_index] == NEWMV && dissim==0
+                    && best_ref_mv.as_int==parent_ref_mv.as_int)
+                    continue;
+            }
+#endif
+        }
+
+        /* Check to see if the testing frequency for this mode is at its max
+         * If so then prevent it from being tested and increase the threshold
+         * for its testing */
+        if (x->mode_test_hit_counts[mode_index] &&
+                                         (cpi->mode_check_freq[mode_index] > 1))
+        {
+            if (x->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] *
+                                         x->mode_test_hit_counts[mode_index]))
+            {
+                /* Increase the threshold for coding this mode to make it less
+                 * likely to be chosen */
+                x->rd_thresh_mult[mode_index] += 4;
+
+                if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                    x->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+                x->rd_threshes[mode_index] =
+                                 (cpi->rd_baseline_thresh[mode_index] >> 7) *
+                                 x->rd_thresh_mult[mode_index];
+                continue;
+            }
+        }
+
+        /* We have now reached the point where we are going to test the current
+         * mode so increment the counter for the number of times it has been
+         * tested */
+        x->mode_test_hit_counts[mode_index] ++;
+
+        rate2 = 0;
+        distortion2 = 0;
+
+        this_mode = vp8_mode_order[mode_index];
+
+        x->e_mbd.mode_info_context->mbmi.mode = this_mode;
+        x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+
+        /* Work out the cost associated with selecting the reference frame */
+        frame_cost =
+            x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+        rate2 += frame_cost;
+
+        /* Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+         * unless ARNR filtering is enabled in which case we want
+         * an unfiltered alternative */
+        if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+        {
+            if (this_mode != ZEROMV ||
+                x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
+                continue;
+        }
+
+        switch (this_mode)
+        {
+        case B_PRED:
+            /* Pass best so far to pick_intra4x4mby_modes to use as breakout */
+            distortion2 = best_rd_sse;
+            pick_intra4x4mby_modes(x, &rate, &distortion2);
+
+            if (distortion2 == INT_MAX)
+            {
+                this_rd = INT_MAX;
+            }
+            else
+            {
+                rate2 += rate;
+                distortion2 = vpx_variance16x16(
+                                    *(b->base_src), b->src_stride,
+                                    x->e_mbd.predictor, 16, &sse);
+                this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+                if (this_rd < best_intra_rd)
+                {
+                    best_intra_rd = this_rd;
+                    *returnintra = distortion2;
+                }
+            }
+
+            break;
+
+        case SPLITMV:
+
+            /* Split MV modes currently not supported when RD is not enabled. */
+            break;
+
+        case DC_PRED:
+        case V_PRED:
+        case H_PRED:
+        case TM_PRED:
+            vp8_build_intra_predictors_mby_s(xd,
+                                             xd->dst.y_buffer - xd->dst.y_stride,
+                                             xd->dst.y_buffer - 1,
+                                             xd->dst.y_stride,
+                                             xd->predictor,
+                                             16);
+            distortion2 = vpx_variance16x16
+                                          (*(b->base_src), b->src_stride,
+                                          x->e_mbd.predictor, 16, &sse);
+            rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
+            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+            if (this_rd < best_intra_rd)
+            {
+                best_intra_rd = this_rd;
+                *returnintra = distortion2;
+            }
+            break;
+
+        case NEWMV:
+        {
+            int thissme;
+            int step_param;
+            int further_steps;
+            int n = 0;
+            int sadpb = x->sadperbit16;
+            int_mv mvp_full;
+
+            int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+            int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
+            int col_max = (best_ref_mv.as_mv.col>>3)
+                         + MAX_FULL_PEL_VAL;
+            int row_max = (best_ref_mv.as_mv.row>>3)
+                         + MAX_FULL_PEL_VAL;
+
+            int tmp_col_min = x->mv_col_min;
+            int tmp_col_max = x->mv_col_max;
+            int tmp_row_min = x->mv_row_min;
+            int tmp_row_max = x->mv_row_max;
+
+            int speed_adjust = (cpi->Speed > 5) ? ((cpi->Speed >= 8)? 3 : 2) : 1;
+
+            /* Further step/diamond searches as necessary */
+            step_param = cpi->sf.first_step + speed_adjust;
+
+#if CONFIG_MULTI_RES_ENCODING
+            /* If lower-res frame is not available for mv reuse (because of
+               frame dropping or different temporal layer pattern), then higher
+               resol encoder does motion search without any previous knowledge.
+               Also, since last frame motion info is not stored, then we can not
+               use improved_mv_pred. */
+            if (cpi->oxcf.mr_encoder_id)
+                sf_improved_mv_pred = 0;
+
+            // Only use parent MV as predictor if this candidate reference frame
+            // (|this_ref_frame|) is equal to |parent_ref_frame|.
+            if (parent_ref_valid && (parent_ref_frame == this_ref_frame))
+            {
+                /* Use parent MV as predictor. Adjust search range
+                 * accordingly.
+                 */
+                mvp.as_int = parent_ref_mv.as_int;
+                mvp_full.as_mv.col = parent_ref_mv.as_mv.col>>3;
+                mvp_full.as_mv.row = parent_ref_mv.as_mv.row>>3;
+
+                if(dissim <=32) step_param += 3;
+                else if(dissim <=128) step_param += 2;
+                else step_param += 1;
+            }else
+#endif
+            {
+                if(sf_improved_mv_pred)
+                {
+                    if(!saddone)
+                    {
+                        vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
+                        saddone = 1;
+                    }
+
+                    vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context,
+                                &mvp,x->e_mbd.mode_info_context->mbmi.ref_frame,
+                                cpi->common.ref_frame_sign_bias, &sr,
+                                &near_sadidx[0]);
+
+                    sr += speed_adjust;
+                    /* adjust search range according to sr from mv prediction */
+                    if(sr > step_param)
+                        step_param = sr;
+
+                    mvp_full.as_mv.col = mvp.as_mv.col>>3;
+                    mvp_full.as_mv.row = mvp.as_mv.row>>3;
+                }else
+                {
+                    mvp.as_int = best_ref_mv.as_int;
+                    mvp_full.as_mv.col = best_ref_mv.as_mv.col>>3;
+                    mvp_full.as_mv.row = best_ref_mv.as_mv.row>>3;
+                }
+            }
+
+#if CONFIG_MULTI_RES_ENCODING
+            if (parent_ref_valid && (parent_ref_frame == this_ref_frame) &&
+                dissim <= 2 &&
+                VPXMAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row),
+                       abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <=
+                    4)
+            {
+                d->bmi.mv.as_int = mvp_full.as_int;
+                mode_mv[NEWMV].as_int = mvp_full.as_int;
+
+                cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv,
+                                             x->errorperbit,
+                                             &cpi->fn_ptr[BLOCK_16X16],
+                                             cpi->mb.mvcost,
+                                             &distortion2,&sse);
+            }else
+#endif
+            {
+                /* Get intersection of UMV window and valid MV window to
+                 * reduce # of checks in diamond search. */
+                if (x->mv_col_min < col_min )
+                    x->mv_col_min = col_min;
+                if (x->mv_col_max > col_max )
+                    x->mv_col_max = col_max;
+                if (x->mv_row_min < row_min )
+                    x->mv_row_min = row_min;
+                if (x->mv_row_max > row_max )
+                    x->mv_row_max = row_max;
+
+                further_steps = (cpi->Speed >= 8)?
+                           0: (cpi->sf.max_step_search_steps - 1 - step_param);
+
+                if (cpi->sf.search_method == HEX)
+                {
+#if CONFIG_MULTI_RES_ENCODING
+                /* TODO: In higher-res pick_inter_mode, step_param is used to
+                 * modify hex search range. Here, set step_param to 0 not to
+                 * change the behavior in lowest-resolution encoder.
+                 * Will improve it later.
+                 */
+                /* Set step_param to 0 to ensure large-range motion search
+                 * when mv reuse is not valid (i.e. |parent_ref_valid| = 0),
+                 * or if this candidate reference frame (|this_ref_frame|) is
+                 * not equal to |parent_ref_frame|.
+                 */
+                if (!parent_ref_valid || (parent_ref_frame != this_ref_frame))
+                    step_param = 0;
+#endif
+                    bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv,
+                                          step_param, sadpb,
+                                          &cpi->fn_ptr[BLOCK_16X16],
+                                          x->mvsadcost, x->mvcost, &best_ref_mv);
+                    mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+                }
+                else
+                {
+                    bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full,
+                                          &d->bmi.mv, step_param, sadpb, &num00,
+                                          &cpi->fn_ptr[BLOCK_16X16],
+                                          x->mvcost, &best_ref_mv);
+                    mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+
+                    /* Further step/diamond searches as necessary */
+                    n = num00;
+                    num00 = 0;
+
+                    while (n < further_steps)
+                    {
+                        n++;
+
+                        if (num00)
+                            num00--;
+                        else
+                        {
+                            thissme =
+                            cpi->diamond_search_sad(x, b, d, &mvp_full,
+                                                    &d->bmi.mv,
+                                                    step_param + n,
+                                                    sadpb, &num00,
+                                                    &cpi->fn_ptr[BLOCK_16X16],
+                                                    x->mvcost, &best_ref_mv);
+                            if (thissme < bestsme)
+                            {
+                                bestsme = thissme;
+                                mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+                            }
+                            else
+                            {
+                                d->bmi.mv.as_int = mode_mv[NEWMV].as_int;
+                            }
+                        }
+                    }
+                }
+
+                x->mv_col_min = tmp_col_min;
+                x->mv_col_max = tmp_col_max;
+                x->mv_row_min = tmp_row_min;
+                x->mv_row_max = tmp_row_max;
+
+                if (bestsme < INT_MAX)
+                    cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv,
+                                             &best_ref_mv, x->errorperbit,
+                                             &cpi->fn_ptr[BLOCK_16X16],
+                                             cpi->mb.mvcost,
+                                             &distortion2,&sse);
+            }
+
+            mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+            // The clamp below is not necessary from the perspective
+            // of VP8 bitstream, but is added to improve ChromeCast
+            // mirroring's robustness. Please do not remove.
+            vp8_clamp_mv2(&mode_mv[this_mode], xd);
+            /* mv cost; */
+            rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv,
+                                     cpi->mb.mvcost, 128);
+        }
+
+        case NEARESTMV:
+        case NEARMV:
+            if (mode_mv[this_mode].as_int == 0)
+                continue;
+
+        case ZEROMV:
+
+            /* Trap vectors that reach beyond the UMV borders
+             * Note that ALL New MV, Nearest MV Near MV and Zero MV code drops
+             * through to this point because of the lack of break statements
+             * in the previous two cases.
+             */
+            if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
+                ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+                ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
+                ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
+                continue;
+
+            rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+            x->e_mbd.mode_info_context->mbmi.mv.as_int =
+                                                    mode_mv[this_mode].as_int;
+            this_rd = evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x,
+                                          rd_adjustment);
+
+            break;
+        default:
+            break;
+        }
+
+#if CONFIG_TEMPORAL_DENOISING
+        if (cpi->oxcf.noise_sensitivity)
+        {
+            /* Store for later use by denoiser. */
+            // Don't denoise with GOLDEN or ALTREF if they are old reference
+            // frames (greater than MAX_GF_ARF_DENOISE_RANGE frames in past).
+            int skip_old_reference = ((this_ref_frame != LAST_FRAME) &&
+                (cpi->common.current_video_frame -
+                 cpi->current_ref_frames[this_ref_frame] >
+                 MAX_GF_ARF_DENOISE_RANGE)) ? 1 : 0;
+            if (this_mode == ZEROMV && sse < zero_mv_sse &&
+                !skip_old_reference)
+            {
+                zero_mv_sse = sse;
+                x->best_zeromv_reference_frame =
+                        x->e_mbd.mode_info_context->mbmi.ref_frame;
+            }
+
+            // Store the best NEWMV in x for later use in the denoiser.
+            if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
+                sse < best_sse && !skip_old_reference)
+            {
+                best_sse = sse;
+                x->best_sse_inter_mode = NEWMV;
+                x->best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
+                x->need_to_clamp_best_mvs =
+                    x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
+                x->best_reference_frame =
+                    x->e_mbd.mode_info_context->mbmi.ref_frame;
+            }
+        }
+#endif
+
+        if (this_rd < best_rd || x->skip)
+        {
+            /* Note index of best mode */
+            best_mode_index = mode_index;
+
+            *returnrate = rate2;
+            *returndistortion = distortion2;
+            best_rd_sse = sse;
+            best_rd = this_rd;
+            memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+                   sizeof(MB_MODE_INFO));
+
+            /* Testing this mode gave rise to an improvement in best error
+             * score. Lower threshold a bit for next time
+             */
+            x->rd_thresh_mult[mode_index] =
+                     (x->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+                     x->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+            x->rd_threshes[mode_index] =
+                                   (cpi->rd_baseline_thresh[mode_index] >> 7) *
+                                   x->rd_thresh_mult[mode_index];
+        }
+
+        /* If the mode did not help improve the best error case then raise the
+         * threshold for testing that mode next time around.
+         */
+        else
+        {
+            x->rd_thresh_mult[mode_index] += 4;
+
+            if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                x->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+            x->rd_threshes[mode_index] =
+                         (cpi->rd_baseline_thresh[mode_index] >> 7) *
+                         x->rd_thresh_mult[mode_index];
+        }
+
+        if (x->skip)
+            break;
+    }
+
+    /* Reduce the activation RD thresholds for the best choice mode */
+    if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
+    {
+        int best_adjustment = (x->rd_thresh_mult[best_mode_index] >> 3);
+
+        x->rd_thresh_mult[best_mode_index] =
+                        (x->rd_thresh_mult[best_mode_index]
+                        >= (MIN_THRESHMULT + best_adjustment)) ?
+                        x->rd_thresh_mult[best_mode_index] - best_adjustment :
+                        MIN_THRESHMULT;
+        x->rd_threshes[best_mode_index] =
+                        (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
+                        x->rd_thresh_mult[best_mode_index];
+    }
+
+
+    {
+        int this_rdbin = (*returndistortion >> 7);
+
+        if (this_rdbin >= 1024)
+        {
+            this_rdbin = 1023;
+        }
+
+        x->error_bins[this_rdbin] ++;
+    }
+
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity)
+    {
+        int block_index = mb_row * cpi->common.mb_cols + mb_col;
+        int reevaluate = 0;
+        int is_noisy = 0;
+        if (x->best_sse_inter_mode == DC_PRED)
+        {
+            /* No best MV found. */
+            x->best_sse_inter_mode = best_mbmode.mode;
+            x->best_sse_mv = best_mbmode.mv;
+            x->need_to_clamp_best_mvs = best_mbmode.need_to_clamp_mvs;
+            x->best_reference_frame = best_mbmode.ref_frame;
+            best_sse = best_rd_sse;
+        }
+        // For non-skin blocks that have selected ZEROMV for this current frame,
+        // and have been selecting ZEROMV_LAST (on the base layer frame) at
+        // least |x~20| consecutive past frames in a row, label the block for
+        // possible increase in denoising strength. We also condition this
+        // labeling on there being significant denoising in the scene
+        if  (cpi->oxcf.noise_sensitivity == 4) {
+          if (cpi->denoiser.nmse_source_diff >
+              70 * cpi->denoiser.threshold_aggressive_mode / 100)
+            is_noisy = 1;
+        } else {
+          if (cpi->mse_source_denoised > 1000)
+            is_noisy = 1;
+        }
+        x->increase_denoising = 0;
+        if (!x->is_skin &&
+            x->best_sse_inter_mode == ZEROMV &&
+            (x->best_reference_frame == LAST_FRAME ||
+            x->best_reference_frame == cpi->closest_reference_frame) &&
+            cpi->consec_zero_last[block_index] >= 20 &&
+            is_noisy) {
+            x->increase_denoising = 1;
+        }
+        x->denoise_zeromv = 0;
+        vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
+                                recon_yoffset, recon_uvoffset,
+                                &cpi->common.lf_info, mb_row, mb_col,
+                                block_index);
+
+        // Reevaluate ZEROMV after denoising: for large noise content
+        // (i.e., cpi->mse_source_denoised is above threshold), do this for all
+        // blocks that did not pick ZEROMV as best mode but are using ZEROMV
+        // for denoising. Otherwise, always re-evaluate for blocks that picked
+        // INTRA mode as best mode.
+        // Avoid blocks that have been biased against ZERO_LAST
+        // (i.e., dot artifact candidate blocks).
+        reevaluate = (best_mbmode.ref_frame == INTRA_FRAME) ||
+                     (best_mbmode.mode != ZEROMV &&
+                      x->denoise_zeromv &&
+                      cpi->mse_source_denoised > 2000);
+        if (!dot_artifact_candidate &&
+            reevaluate &&
+            x->best_zeromv_reference_frame != INTRA_FRAME)
+        {
+            int this_rd = 0;
+            int this_ref_frame = x->best_zeromv_reference_frame;
+            rd_adjustment = 100;
+            rate2 = x->ref_frame_cost[this_ref_frame] +
+                    vp8_cost_mv_ref(ZEROMV, mdcounts);
+            distortion2 = 0;
+
+            /* set up the proper prediction buffers for the frame */
+            x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+            x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+            x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+            x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+            x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+            x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+            x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+            this_rd = evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x,
+                                          rd_adjustment);
+
+            if (this_rd < best_rd)
+            {
+                memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+                       sizeof(MB_MODE_INFO));
+            }
+        }
+
+    }
+#endif
+
+    if (cpi->is_src_frame_alt_ref &&
+        (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME))
+    {
+        x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+        x->e_mbd.mode_info_context->mbmi.ref_frame = ALTREF_FRAME;
+        x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+        x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+        x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
+                                        (cpi->common.mb_no_coeff_skip);
+        x->e_mbd.mode_info_context->mbmi.partitioning = 0;
+
+        return;
+    }
+
+    /* set to the best mb mode, this copy can be skipped if x->skip since it
+     * already has the right content */
+    if (!x->skip)
+        memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode,
+               sizeof(MB_MODE_INFO));
+
+    if (best_mbmode.mode <= B_PRED)
+    {
+        /* set mode_info_context->mbmi.uv_mode */
+        pick_intra_mbuv_mode(x);
+    }
+
+    if (sign_bias
+      != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
+        best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;
+
+    update_mvcount(x, &best_ref_mv);
+}
+
+/* Choose the luma intra mode for |x|: the cheapest 16x16 predictor by RD cost,
+ * or B_PRED when the 4x4 search scores lower. The winner's rate is written to
+ * |*rate_| and the mode is stored in xd->mode_info_context->mbmi.mode. */
+void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
+{
+    int error4x4, error16x16 = INT_MAX;
+    /* best_sse = INT_MAX: a defined fallback if the loop below never sets it. */
+    int rate, best_rate = 0, distortion, best_sse = INT_MAX;
+    MB_PREDICTION_MODE mode, best_mode = DC_PRED;
+    int this_rd;
+    unsigned int sse;
+    BLOCK *b = &x->block[0];
+    MACROBLOCKD *xd = &x->e_mbd;
+
+    xd->mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+    pick_intra_mbuv_mode(x);
+
+    for (mode = DC_PRED; mode <= TM_PRED; mode++)
+    {
+        xd->mode_info_context->mbmi.mode = mode;
+        vp8_build_intra_predictors_mby_s(
+            xd, xd->dst.y_buffer - xd->dst.y_stride, xd->dst.y_buffer - 1,
+            xd->dst.y_stride, xd->predictor, 16);
+        distortion = vpx_variance16x16(*(b->base_src), b->src_stride,
+                                       xd->predictor, 16, &sse);
+        rate = x->mbmode_cost[xd->frame_type][mode];
+        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+        if (error16x16 > this_rd)
+        {
+            error16x16 = this_rd;
+            best_mode = mode;
+            best_sse = sse;
+            best_rate = rate;
+        }
+    }
+    xd->mode_info_context->mbmi.mode = best_mode;
+
+    /* The best 16x16 SSE is handed to the 4x4 search as its breakout value. */
+    error4x4 = pick_intra4x4mby_modes(x, &rate, &best_sse);
+    if (error4x4 < error16x16)
+    {
+        xd->mode_info_context->mbmi.mode = B_PRED;
+        best_rate = rate;
+    }
+    *rate_ = best_rate;
+}
diff --git a/libs/libvpx/vp8/encoder/pickinter.h b/libs/libvpx/vp8/encoder/pickinter.h
new file mode 100644
index 0000000000..cf3b1f8d49
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/pickinter.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_PICKINTER_H_
+#define VP8_ENCODER_PICKINTER_H_
+#include "vpx_config.h"
+#include "vp8/common/onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+                                int recon_uvoffset, int *returnrate,
+                                int *returndistortion, int *returnintra,
+                                int mb_row, int mb_col);
+extern void vp8_pick_intra_mode(MACROBLOCK *x, int *rate);
+
+extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
+                                      const vp8_variance_fn_ptr_t *vfp,
+                                      unsigned int *sse,
+                                      int_mv this_mv);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_PICKINTER_H_
diff --git a/libs/libvpx/vp8/encoder/picklpf.c b/libs/libvpx/vp8/encoder/picklpf.c
new file mode 100644
index 0000000000..debd304130
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/picklpf.c
@@ -0,0 +1,407 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vp8/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "vp8/encoder/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/loopfilter.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
+/* Copy the luma rows used for partial loop filtering (plus 4 context rows above them) from src_ybc to dst_ybc. */
+static void yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
+                                    YV12_BUFFER_CONFIG *dst_ybc)
+{
+    unsigned char *src_y, *dst_y;
+    int yheight;
+    int ystride;
+    int yoffset;
+    int linestocopy;
+
+    yheight  = src_ybc->y_height;
+    ystride  = src_ybc->y_stride;
+    /* NOTE(review): the source stride is used for both buffers - assumes dst_ybc has the same y_stride. */
+    /* number of MB rows to use in partial filtering */
+    linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION;
+    linestocopy = linestocopy ? linestocopy << 4 : 16;     /* 16 lines per MB */
+
+    /* Copy extra 4 so that full filter context is available if filtering done
+     * on the copied partial frame and not original. Partial filter does mb
+     * filtering for top row also, which can modify 3 pixels above.
+     */
+    linestocopy += 4;
+    /* partial image starts at ~middle of frame (macroblock border), backed up by the 4 extra rows */
+    yoffset  = ystride * (((yheight >> 5) * 16) - 4);
+    src_y = src_ybc->y_buffer + yoffset;
+    dst_y = dst_ybc->y_buffer + yoffset;
+
+    memcpy(dst_y, src_y, ystride * linestocopy);
+}
+/* Sum vpx_mse16x16() over the luma macroblock rows used for partial filtering, starting at the middle MB row. */
+static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
+                                YV12_BUFFER_CONFIG *dest)
+{
+    int i, j;
+    int Total = 0;
+    int srcoffset, dstoffset;
+    unsigned char *src = source->y_buffer;
+    unsigned char *dst = dest->y_buffer;
+
+    int linestocopy;
+
+    /* number of MB rows to use in partial filtering */
+    linestocopy = (source->y_height >> 4) / PARTIAL_FRAME_FRACTION;
+    linestocopy = linestocopy ? linestocopy << 4 : 16;     /* 16 lines per MB */
+
+    /* NOTE(review): both offsets derive from dest->y_height - presumably source and dest share dimensions. */
+    /* partial image starts at ~middle of frame (macroblock border)*/
+    srcoffset = source->y_stride * ((dest->y_height >> 5) * 16);
+    dstoffset = dest->y_stride   * ((dest->y_height >> 5) * 16);
+
+    src += srcoffset;
+    dst += dstoffset;
+
+    /* Loop through the Y plane raw and reconstruction data summing
+     * (square differences)
+     */
+    for (i = 0; i < linestocopy; i += 16)
+    {
+        for (j = 0; j < source->y_width; j += 16)
+        {
+            unsigned int sse;
+            Total += vpx_mse16x16(src + j, source->y_stride,
+                                                     dst + j, dest->y_stride,
+                                                     &sse);
+        }
+
+        src += 16 * source->y_stride;
+        dst += 16 * dest->y_stride;
+    }
+
+    return Total;
+}
+
+/* Lower bound for the loop filter level, derived from the baseline Q index */
+static int get_min_filter_level(VP8_COMP *cpi, int base_qindex)
+{
+    /* A golden refresh without an alt-ref refresh, while an alt-ref source
+     * is active, gets no minimum at all.
+     */
+    const int golden_only_update = cpi->source_alt_ref_active &&
+                                   cpi->common.refresh_golden_frame &&
+                                   !cpi->common.refresh_alt_ref_frame;
+
+    if (golden_only_update || base_qindex <= 6)
+        return 0;
+
+    /* Low Q indices get a token minimum of 1 */
+    if (base_qindex <= 16)
+        return 1;
+
+    /* Otherwise the floor scales with the Q index */
+    return base_qindex / 8;
+}
+
+/* Upper bound for the loop filter level (base_qindex is currently unused) */
+static int get_max_filter_level(VP8_COMP *cpi, int base_qindex)
+{
+    /* History: PGW (August 2006) found the highest filter values were
+     * almost always a bad idea; jbb (2010-01-18) relaxed that because of
+     * over-quantization, so the full range is the default and the cap is
+     * only lowered to three quarters of it when section_intra_rating
+     * exceeds 8.
+     */
+    const int reduced_cap = MAX_LOOP_FILTER * 3 / 4;
+
+    (void)base_qindex;
+
+    return (cpi->twopass.section_intra_rating > 8) ? reduced_cap
+                                                   : MAX_LOOP_FILTER;
+}
+/* Fast loop-filter level search: filters only a partial frame and steps down, then up, from the previous level. */
+void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    int best_err = 0;
+    int filt_err = 0;
+    int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+    int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+    int filt_val;
+    int best_filt_val;
+    YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
+
+    /* Replace unfiltered frame buffer with a new one */
+    cm->frame_to_show = &cpi->pick_lf_lvl_frame;
+
+    if (cm->frame_type == KEY_FRAME)
+        cm->sharpness_level = 0;
+    else
+        cm->sharpness_level = cpi->oxcf.Sharpness;
+
+    if (cm->sharpness_level != cm->last_sharpness_level)
+    {
+        vp8_loop_filter_update_sharpness(&cm->lf_info, cm->sharpness_level);
+        cm->last_sharpness_level = cm->sharpness_level;
+    }
+
+    /* Start the search at the previous frame filter level unless it is
+     * now out of range.
+     */
+    if (cm->filter_level < min_filter_level)
+        cm->filter_level = min_filter_level;
+    else if (cm->filter_level > max_filter_level)
+        cm->filter_level = max_filter_level;
+
+    filt_val = cm->filter_level;
+    best_filt_val = filt_val;
+
+    /* Get the err using the previous frame's filter value. */
+
+    /* Copy the unfiltered / processed recon buffer to the new buffer */
+    yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+    vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+    best_err = calc_partial_ssl_err(sd, cm->frame_to_show);
+
+    filt_val -= 1 + (filt_val > 10);
+
+    /* Search lower filter levels */
+    while (filt_val >= min_filter_level)
+    {
+        /* Apply the loop filter */
+        yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+        vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+        /* Get the err for filtered frame */
+        filt_err = calc_partial_ssl_err(sd, cm->frame_to_show);
+
+        /* Update the best case record or exit loop. */
+        if (filt_err < best_err)
+        {
+            best_err = filt_err;
+            best_filt_val = filt_val;
+        }
+        else
+            break;
+
+        /* Adjust filter level */
+        filt_val -= 1 + (filt_val > 10);
+    }
+
+    /* Search up (note that we have already done filt_val = cm->filter_level) */
+    filt_val = cm->filter_level + 1 + (filt_val > 10);
+    /* NOTE(review): the step above tests the filt_val left over from the downward search, not cm->filter_level - confirm intended. */
+    if (best_filt_val == cm->filter_level)
+    {
+        /* Resist raising filter level for very small gains */
+        best_err -= (best_err >> 10);
+
+        while (filt_val < max_filter_level)
+        {
+            /* Apply the loop filter */
+            yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+
+            vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+            /* Get the err for filtered frame */
+            filt_err = calc_partial_ssl_err(sd, cm->frame_to_show);
+
+            /* Update the best case record or exit loop. */
+            if (filt_err < best_err)
+            {
+                /* Do not raise filter level further unless the improvement
+                 * exceeds 1 part in 1024 (the error is shifted right by 10)
+                 */
+                best_err = filt_err - (filt_err >> 10);
+
+                best_filt_val = filt_val;
+            }
+            else
+                break;
+
+            /* Adjust filter level */
+            filt_val += 1 + (filt_val > 10);
+        }
+    }
+
+    cm->filter_level = best_filt_val;
+
+    if (cm->filter_level < min_filter_level)
+        cm->filter_level = min_filter_level;
+
+    if (cm->filter_level > max_filter_level)
+        cm->filter_level = max_filter_level;
+
+    /* restore unfiltered frame pointer */
+    cm->frame_to_show = saved_frame;
+}
+
+/* Placeholder while alt LF is unused: copies the four MB_LVL_ALT_LF entries of segment_feature_data into the MB context; filt_val is ignored */
+void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val)
+{
+    MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+    int i;
+
+    (void) filt_val;
+
+    for (i = 0; i < 4; ++i)
+        xd->segment_feature_data[MB_LVL_ALT_LF][i] = cpi->segment_feature_data[MB_LVL_ALT_LF][i];
+}
+/* Full loop-filter level search: filters the whole Y plane and narrows in on the best level with a shrinking step. */
+void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    int best_err = 0;
+    int filt_err = 0;
+    int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+    int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+
+    int filter_step;
+    int filt_high = 0;
+    int filt_mid;
+    int filt_low = 0;
+    int filt_best;
+    int filt_direction = 0;
+
+    /* Bias against raising loop filter and in favor of lowering it */
+    int Bias = 0;
+    /* Error per filter level already tried; 0 marks a level not yet measured */
+    int ss_err[MAX_LOOP_FILTER + 1];
+
+    YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
+
+    memset(ss_err, 0, sizeof(ss_err));
+
+    /* Replace unfiltered frame buffer with a new one */
+    cm->frame_to_show = &cpi->pick_lf_lvl_frame;
+
+    if (cm->frame_type == KEY_FRAME)
+        cm->sharpness_level = 0;
+    else
+        cm->sharpness_level = cpi->oxcf.Sharpness;
+
+    /* Start the search at the previous frame filter level unless it is
+     * now out of range.
+     */
+    filt_mid = cm->filter_level;
+
+    if (filt_mid < min_filter_level)
+        filt_mid = min_filter_level;
+    else if (filt_mid > max_filter_level)
+        filt_mid = max_filter_level;
+
+    /* Define the initial step size */
+    filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
+
+    /* Get baseline error score */
+
+    /* Copy the unfiltered / processed recon buffer to the new buffer */
+    vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
+
+    vp8cx_set_alt_lf_level(cpi, filt_mid);
+    vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
+
+    best_err = vp8_calc_ss_err(sd, cm->frame_to_show);
+
+    ss_err[filt_mid] = best_err;
+
+    filt_best = filt_mid;
+
+    while (filter_step > 0)
+    {
+        Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+
+        if (cpi->twopass.section_intra_rating < 20)
+            Bias = Bias * cpi->twopass.section_intra_rating / 20;
+
+        filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
+        filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
+
+        if ((filt_direction <= 0) && (filt_low != filt_mid))
+        {
+            if(ss_err[filt_low] == 0)
+            {
+                /* Get Low filter error score */
+                vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
+                vp8cx_set_alt_lf_level(cpi, filt_low);
+                vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
+
+                filt_err = vp8_calc_ss_err(sd, cm->frame_to_show);
+                ss_err[filt_low] = filt_err;
+            }
+            else
+                filt_err = ss_err[filt_low];
+
+            /* If value is close to the best so far then bias towards a
+             * lower loop filter value.
+             */
+            if ((filt_err - Bias) < best_err)
+            {
+                /* Was it actually better than the previous best? */
+                if (filt_err < best_err)
+                    best_err = filt_err;
+
+                filt_best = filt_low;
+            }
+        }
+
+        /* Now look at filt_high */
+        if ((filt_direction >= 0) && (filt_high != filt_mid))
+        {
+            if(ss_err[filt_high] == 0)
+            {
+                vpx_yv12_copy_y(saved_frame, cm->frame_to_show);
+                vp8cx_set_alt_lf_level(cpi, filt_high);
+                vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
+
+                filt_err = vp8_calc_ss_err(sd, cm->frame_to_show);
+                ss_err[filt_high] = filt_err;
+            }
+            else
+                filt_err = ss_err[filt_high];
+
+            /* Was it better than the previous best? */
+            if (filt_err < (best_err - Bias))
+            {
+                best_err = filt_err;
+                filt_best = filt_high;
+            }
+        }
+
+        /* Halve the step distance if the best filter value was the same
+         * as last time; otherwise re-centre on it and keep the direction
+         */
+        if (filt_best == filt_mid)
+        {
+            filter_step = filter_step / 2;
+            filt_direction = 0;
+        }
+        else
+        {
+            filt_direction = (filt_best < filt_mid) ? -1 : 1;
+            filt_mid = filt_best;
+        }
+    }
+
+    cm->filter_level = filt_best;
+
+    /* restore unfiltered frame pointer */
+    cm->frame_to_show = saved_frame;
+}
diff --git a/libs/libvpx/vp8/encoder/quantize.h b/libs/libvpx/vp8/encoder/quantize.h
new file mode 100644
index 0000000000..7d36c2b45f
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/quantize.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_QUANTIZE_H_
+#define VP8_ENCODER_QUANTIZE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Forward declarations: the prototypes below use these structs only through pointers. */
+struct VP8_COMP;
+struct macroblock;
+extern void vp8_quantize_mb(struct macroblock *x);
+extern void vp8_quantize_mby(struct macroblock *x);
+extern void vp8_quantize_mbuv(struct macroblock *x);
+extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
+extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);
+extern void vp8_update_zbin_extra(struct VP8_COMP *cpi, struct macroblock *x);
+extern void vp8cx_mb_init_quantizer(struct VP8_COMP *cpi, struct macroblock *x, int ok_to_skip);
+extern void vp8cx_init_quantizer(struct VP8_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_QUANTIZE_H_
diff --git a/libs/libvpx/vp8/encoder/ratectrl.c b/libs/libvpx/vp8/encoder/ratectrl.c
new file mode 100644
index 0000000000..7da3d71adc
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/ratectrl.c
@@ -0,0 +1,1638 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "math.h"
+#include "vp8/common/common.h"
+#include "ratectrl.h"
+#include "vp8/common/entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/systemdependent.h"
+#include "encodemv.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+
+#define MIN_BPB_FACTOR          0.01
+#define MAX_BPB_FACTOR          50
+
+extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
+
+
+
+#ifdef MODE_STATS
+extern int y_modes[5];
+extern int uv_modes[4];
+extern int b_modes[10];
+
+extern int inter_y_modes[10];
+extern int inter_uv_modes[4];
+extern int inter_b_modes[10];
+#endif
+
+/* Bits Per MB at different Q (Multiplied by 512) */
+#define BPER_MB_NORMBITS    9
+
+/* Work in progress recalibration of baseline rate tables based on
+ * the assumption that bits per mb is inversely proportional to the
+ * quantizer value.
+ */
+const int vp8_bits_per_mb[2][QINDEX_RANGE] =
+{
+    /* Intra case 450000/Qintra */
+    {
+        1125000,900000, 750000, 642857, 562500, 500000, 450000, 450000,
+        409090, 375000, 346153, 321428, 300000, 281250, 264705, 264705,
+        250000, 236842, 225000, 225000, 214285, 214285, 204545, 204545,
+        195652, 195652, 187500, 180000, 180000, 173076, 166666, 160714,
+        155172, 150000, 145161, 140625, 136363, 132352, 128571, 125000,
+        121621, 121621, 118421, 115384, 112500, 109756, 107142, 104651,
+        102272, 100000, 97826,  97826,  95744,  93750,  91836,  90000,
+        88235,  86538,  84905,  83333,  81818,  80357,  78947,  77586,
+        76271,  75000,  73770,  72580,  71428,  70312,  69230,  68181,
+        67164,  66176,  65217,  64285,  63380,  62500,  61643,  60810,
+        60000,  59210,  59210,  58441,  57692,  56962,  56250,  55555,
+        54878,  54216,  53571,  52941,  52325,  51724,  51136,  50561,
+        49450,  48387,  47368,  46875,  45918,  45000,  44554,  44117,
+        43269,  42452,  41666,  40909,  40178,  39473,  38793,  38135,
+        36885,  36290,  35714,  35156,  34615,  34090,  33582,  33088,
+        32608,  32142,  31468,  31034,  30405,  29801,  29220,  28662,
+    },
+    /* Inter case 285000/Qinter */
+    {
+        712500, 570000, 475000, 407142, 356250, 316666, 285000, 259090,
+        237500, 219230, 203571, 190000, 178125, 167647, 158333, 150000,
+        142500, 135714, 129545, 123913, 118750, 114000, 109615, 105555,
+        101785, 98275,  95000,  91935,  89062,  86363,  83823,  81428,
+        79166,  77027,  75000,  73076,  71250,  69512,  67857,  66279,
+        64772,  63333,  61956,  60638,  59375,  58163,  57000,  55882,
+        54807,  53773,  52777,  51818,  50892,  50000,  49137,  47500,
+        45967,  44531,  43181,  41911,  40714,  39583,  38513,  37500,
+        36538,  35625,  34756,  33928,  33139,  32386,  31666,  30978,
+        30319,  29687,  29081,  28500,  27941,  27403,  26886,  26388,
+        25909,  25446,  25000,  24568,  23949,  23360,  22800,  22265,
+        21755,  21268,  20802,  20357,  19930,  19520,  19127,  18750,
+        18387,  18037,  17701,  17378,  17065,  16764,  16473,  16101,
+        15745,  15405,  15079,  14766,  14467,  14179,  13902,  13636,
+        13380,  13133,  12895,  12666,  12445,  12179,  11924,  11632,
+        11445,  11220,  11003,  10795,  10594,  10401,  10215,  10035,
+    }
+};
+
+static const int kf_boost_qadjustment[QINDEX_RANGE] =
+{
+    128, 129, 130, 131, 132, 133, 134, 135,
+    136, 137, 138, 139, 140, 141, 142, 143,
+    144, 145, 146, 147, 148, 149, 150, 151,
+    152, 153, 154, 155, 156, 157, 158, 159,
+    160, 161, 162, 163, 164, 165, 166, 167,
+    168, 169, 170, 171, 172, 173, 174, 175,
+    176, 177, 178, 179, 180, 181, 182, 183,
+    184, 185, 186, 187, 188, 189, 190, 191,
+    192, 193, 194, 195, 196, 197, 198, 199,
+    200, 200, 201, 201, 202, 203, 203, 203,
+    204, 204, 205, 205, 206, 206, 207, 207,
+    208, 208, 209, 209, 210, 210, 211, 211,
+    212, 212, 213, 213, 214, 214, 215, 215,
+    216, 216, 217, 217, 218, 218, 219, 219,
+    220, 220, 220, 220, 220, 220, 220, 220,
+    220, 220, 220, 220, 220, 220, 220, 220,
+};
+
+/* #define GFQ_ADJUSTMENT (Q+100) */
+#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q]
+const int vp8_gf_boost_qadjustment[QINDEX_RANGE] =
+{
+    80, 82, 84, 86, 88, 90, 92, 94,
+    96, 97, 98, 99, 100, 101, 102, 103,
+    104, 105, 106, 107, 108, 109, 110, 111,
+    112, 113, 114, 115, 116, 117, 118, 119,
+    120, 121, 122, 123, 124, 125, 126, 127,
+    128, 129, 130, 131, 132, 133, 134, 135,
+    136, 137, 138, 139, 140, 141, 142, 143,
+    144, 145, 146, 147, 148, 149, 150, 151,
+    152, 153, 154, 155, 156, 157, 158, 159,
+    160, 161, 162, 163, 164, 165, 166, 167,
+    168, 169, 170, 171, 172, 173, 174, 175,
+    176, 177, 178, 179, 180, 181, 182, 183,
+    184, 184, 185, 185, 186, 186, 187, 187,
+    188, 188, 189, 189, 190, 190, 191, 191,
+    192, 192, 193, 193, 194, 194, 194, 194,
+    195, 195, 196, 196, 197, 197, 198, 198
+};
+
+/*
+const int vp8_gf_boost_qadjustment[QINDEX_RANGE] =
+{
+    100,101,102,103,104,105,105,106,
+    106,107,107,108,109,109,110,111,
+    112,113,114,115,116,117,118,119,
+    120,121,122,123,124,125,126,127,
+    128,129,130,131,132,133,134,135,
+    136,137,138,139,140,141,142,143,
+    144,145,146,147,148,149,150,151,
+    152,153,154,155,156,157,158,159,
+    160,161,162,163,164,165,166,167,
+    168,169,170,170,171,171,172,172,
+    173,173,173,174,174,174,175,175,
+    175,176,176,176,177,177,177,177,
+    178,178,179,179,180,180,181,181,
+    182,182,183,183,184,184,185,185,
+    186,186,187,187,188,188,189,189,
+    190,190,191,191,192,192,193,193,
+};
+*/
+
+static const int kf_gf_boost_qlimits[QINDEX_RANGE] =
+{
+    150, 155, 160, 165, 170, 175, 180, 185,
+    190, 195, 200, 205, 210, 215, 220, 225,
+    230, 235, 240, 245, 250, 255, 260, 265,
+    270, 275, 280, 285, 290, 295, 300, 305,
+    310, 320, 330, 340, 350, 360, 370, 380,
+    390, 400, 410, 420, 430, 440, 450, 460,
+    470, 480, 490, 500, 510, 520, 530, 540,
+    550, 560, 570, 580, 590, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+};
+
+static const int gf_adjust_table[101] =
+{
+    100,
+    115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
+    240, 260, 270, 280, 290, 300, 310, 320, 330, 340,
+    350, 360, 370, 380, 390, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+};
+
+static const int gf_intra_usage_adjustment[20] =
+{
+    125, 120, 115, 110, 105, 100,  95,  85,  80,  75,
+    70,  65,  60,  55,  50,  50,  50,  50,  50,  50,
+};
+
+static const int gf_interval_table[101] =
+{
+    7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+};
+
+static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 };
+
+/* Snapshot into cpi->coding_context the encoder state that vp8_restore_coding_context() puts back. */
+void vp8_save_coding_context(VP8_COMP *cpi)
+{
+    CODING_CONTEXT *const cc = & cpi->coding_context;
+
+    /* Stores a snapshot of key state variables which can subsequently be
+     * restored with a call to vp8_restore_coding_context. These functions are
+     * intended for use in a re-code loop in vp8_compress_frame where the
+     * quantizer value is adjusted between loop iterations.
+     */
+
+    cc->frames_since_key          = cpi->frames_since_key;
+    cc->filter_level             = cpi->common.filter_level;
+    cc->frames_till_gf_update_due   = cpi->frames_till_gf_update_due;
+    cc->frames_since_golden       = cpi->frames_since_golden;
+
+    vp8_copy(cc->mvc,      cpi->common.fc.mvc);
+    vp8_copy(cc->mvcosts,  cpi->rd_costs.mvcosts);
+
+    vp8_copy(cc->ymode_prob,   cpi->common.fc.ymode_prob);
+    vp8_copy(cc->uv_mode_prob,  cpi->common.fc.uv_mode_prob);
+
+    vp8_copy(cc->ymode_count, cpi->mb.ymode_count);
+    vp8_copy(cc->uv_mode_count, cpi->mb.uv_mode_count);
+
+
+    /* Stats (only compiled when MODE_STATS is defined) */
+#ifdef MODE_STATS
+    vp8_copy(cc->y_modes,       y_modes);
+    vp8_copy(cc->uv_modes,      uv_modes);
+    vp8_copy(cc->b_modes,       b_modes);
+    vp8_copy(cc->inter_y_modes,  inter_y_modes);
+    vp8_copy(cc->inter_uv_modes, inter_uv_modes);
+    vp8_copy(cc->inter_b_modes,  inter_b_modes);
+#endif
+
+    cc->this_frame_percent_intra = cpi->this_frame_percent_intra;
+}
+
+/* Copy the state held in cpi->coding_context back into the live encoder fields. */
+void vp8_restore_coding_context(VP8_COMP *cpi)
+{
+    CODING_CONTEXT *const cc = & cpi->coding_context;
+
+    /* Restore key state variables to the snapshot state stored in the
+     * previous call to vp8_save_coding_context.
+     */
+
+    cpi->frames_since_key         =   cc->frames_since_key;
+    cpi->common.filter_level     =   cc->filter_level;
+    cpi->frames_till_gf_update_due  =   cc->frames_till_gf_update_due;
+    cpi->frames_since_golden       =   cc->frames_since_golden;
+
+    vp8_copy(cpi->common.fc.mvc, cc->mvc);
+
+    vp8_copy(cpi->rd_costs.mvcosts, cc->mvcosts);
+
+    vp8_copy(cpi->common.fc.ymode_prob,   cc->ymode_prob);
+    vp8_copy(cpi->common.fc.uv_mode_prob,  cc->uv_mode_prob);
+
+    vp8_copy(cpi->mb.ymode_count, cc->ymode_count);
+    vp8_copy(cpi->mb.uv_mode_count, cc->uv_mode_count);
+
+    /* Stats (only compiled when MODE_STATS is defined) */
+#ifdef MODE_STATS
+    vp8_copy(y_modes, cc->y_modes);
+    vp8_copy(uv_modes, cc->uv_modes);
+    vp8_copy(b_modes, cc->b_modes);
+    vp8_copy(inter_y_modes, cc->inter_y_modes);
+    vp8_copy(inter_uv_modes, cc->inter_uv_modes);
+    vp8_copy(inter_b_modes, cc->inter_b_modes);
+#endif
+
+
+    cpi->this_frame_percent_intra = cc->this_frame_percent_intra;
+}
+
+/* Reset probability contexts, the starting loop filter level and golden/alt-ref refresh state for a key frame. */
+void vp8_setup_key_frame(VP8_COMP *cpi)
+{
+    /* Setup for Key frame: */
+
+    vp8_default_coef_probs(& cpi->common);
+
+    memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+    {
+        int flag[2] = {1, 1};
+        vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag);
+    }
+
+    /* Make sure we initialize separate contexts for altref,gold, and normal.
+     * TODO shouldn't need 3 different copies of structure to do this!
+     */
+    memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
+    memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc));
+    memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc));
+    /* Starting loop filter level: 3/8 of the base Q index */
+    cpi->common.filter_level = cpi->common.base_qindex * 3 / 8 ;
+
+    /* Provisional interval before next GF */
+    if (cpi->auto_gold)
+        cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+    else
+        cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
+    /* Flag both the golden and the alt-ref frame for refresh */
+    cpi->common.refresh_golden_frame = 1;
+    cpi->common.refresh_alt_ref_frame = 1;
+}
+
+
+/* Estimated bit count for MBs macroblocks at quantizer index Q */
+static int estimate_bits_at_q(int frame_kind, int Q, int MBs,
+                              double correction_factor)
+{
+    const int scaled_bits_per_mb =
+        (int)(.5 + correction_factor * vp8_bits_per_mb[frame_kind][Q]);
+
+    /* Above 2^11 macroblocks, normalise before multiplying (trading a
+     * little precision for headroom); otherwise multiply first.
+     */
+    return (MBs > (1 << 11))
+               ? ((scaled_bits_per_mb >> BPER_MB_NORMBITS) * MBs)
+               : ((scaled_bits_per_mb * MBs) >> BPER_MB_NORMBITS);
+}
+
+/* Compute the key frame size target (cpi->this_frame_target); per_frame_bandwidth products use 64-bit math and the result saturates at INT_MAX. */
+static void calc_iframe_target_size(VP8_COMP *cpi)
+{
+    /* boost defaults to half second */
+    int kf_boost;
+    uint64_t target;
+
+    /* Clear down mmx registers to allow floating point in what follows */
+    vp8_clear_system_state();
+
+    if (cpi->oxcf.fixed_q >= 0)
+    {
+        int Q = cpi->oxcf.key_q;
+
+        target = estimate_bits_at_q(INTRA_FRAME, Q, cpi->common.MBs,
+                                    cpi->key_frame_rate_correction_factor);
+    }
+    else if (cpi->pass == 2)
+    {
+        /* New Two pass RC */
+        target = cpi->per_frame_bandwidth;
+    }
+    /* First Frame is a special case */
+    else if (cpi->common.current_video_frame == 0)
+    {
+        /* 1 Pass there is no information on which to base size so use
+         * bandwidth per second * fraction of the initial buffer
+         * level
+         */
+        target = cpi->oxcf.starting_buffer_level / 2;
+
+        if(target > cpi->oxcf.target_bandwidth * 3 / 2)
+            target = cpi->oxcf.target_bandwidth * 3 / 2;
+    }
+    else
+    {
+        /* if this keyframe was forced, use a more recent Q estimate */
+        int Q = (cpi->common.frame_flags & FRAMEFLAGS_KEY)
+                ? cpi->avg_frame_qindex : cpi->ni_av_qi;
+
+        int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */
+        /* Boost depends somewhat on frame rate: only used for 1 layer case. */
+        if (cpi->oxcf.number_of_layers == 1) {
+          kf_boost = VPXMAX(initial_boost,
+                            (int)(2 * cpi->output_framerate - 16));
+        }
+        else {
+          /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */
+          kf_boost = initial_boost;
+        }
+
+        /* adjustment up based on q: this factor ranges from ~1.2 to 2.2. */
+        kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100;
+
+        /* frame separation adjustment ( down) */
+        if (cpi->frames_since_key  < cpi->output_framerate / 2)
+            kf_boost = (int)(kf_boost
+                       * cpi->frames_since_key / (cpi->output_framerate / 2));
+
+        /* Minimal target size is |2* per_frame_bandwidth|. */
+        if (kf_boost < 16)
+            kf_boost = 16;
+        /* Widen before multiplying: the plain int product can overflow at high bitrates. */
+        target = ((uint64_t)(16 + kf_boost) * cpi->per_frame_bandwidth) >> 4;
+    }
+
+    /* Optional cap, a percentage of per_frame_bandwidth; computed in 64 bits so it cannot wrap. */
+    if (cpi->oxcf.rc_max_intra_bitrate_pct)
+    {
+        uint64_t max_rate = (uint64_t)cpi->per_frame_bandwidth
+                            * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
+
+        if (target > max_rate)
+            target = max_rate;
+    }
+    /* this_frame_target is assigned through an int cast: saturate instead of truncating. */
+    cpi->this_frame_target = (int)((target > INT_MAX) ? INT_MAX : target);
+
+    /* TODO: if we separate rate targeting from Q targetting, move this.
+     * Reset the active worst quality to the baseline value for key frames.
+     */
+    if (cpi->pass != 2)
+        cpi->active_worst_quality = cpi->worst_quality;
+
+#if 0
+    {
+        FILE *f;
+
+        f = fopen("kf_boost.stt", "a");
+        fprintf(f, " %8u %10d %10d %10d\n",
+                cpi->common.current_video_frame,  cpi->gfu_boost, cpi->baseline_gf_interval, cpi->source_alt_ref_pending);
+
+        fclose(f);
+    }
+#endif
+}
+
+
+/* Define the parameters for the next GF from data already held in cpi: always
+ * sets cpi->frames_till_gf_update_due; when cpi->pass != 2 it also sets
+ * cpi->last_boost and clears cpi->source_alt_ref_pending. */
+static void calc_gf_params(VP8_COMP *cpi)
+{
+    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; /* fixed Q if one is set, else the last inter-frame Q */
+    int Boost = 0;                /* only assigned on the non two-pass path below */
+
+    int gf_frame_useage = 0;      /* Golden/alt-ref frame usage since last GF, as a percentage */
+    int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME]  +
+                  cpi->recent_ref_frame_usage[LAST_FRAME]   +
+                  cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+                  cpi->recent_ref_frame_usage[ALTREF_FRAME];
+    /* cpi->gf_active_count expressed as a percentage of the frame's macroblock count. */
+    int pct_gf_active = (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols);
+    /* Percentage of recently counted MBs that used the golden or alt-ref frame (skipped when there are none, avoiding a divide by zero). */
+    if (tot_mbs)
+        gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs;
+    /* Use whichever of the two percentages is larger. */
+    if (pct_gf_active > gf_frame_useage)
+        gf_frame_useage = pct_gf_active;
+
+    /* Not two pass: this is the only path in this function that computes Boost */
+    if (cpi->pass != 2)
+    {
+        /* Single Pass lagged mode: TBD (placeholder branch, never taken) */
+        if (0)
+        {
+        }
+
+        /* Single Pass compression: Has to use current and historical data */
+        else
+        {
+#if 0
+            /* Experimental code: compiled out by the #if 0 above */
+            int index = cpi->one_pass_frame_index;
+            int frames_to_scan = (cpi->max_gf_interval <= MAX_LAG_BUFFERS) ? cpi->max_gf_interval : MAX_LAG_BUFFERS;
+
+            /* ************** Experimental code - incomplete */
+            /*
+            double decay_val = 1.0;
+            double IIAccumulator = 0.0;
+            double last_iiaccumulator = 0.0;
+            double IIRatio;
+
+            cpi->one_pass_frame_index = cpi->common.current_video_frame%MAX_LAG_BUFFERS;
+
+            for ( i = 0; i < (frames_to_scan - 1); i++ )
+            {
+                if ( index < 0 )
+                    index = MAX_LAG_BUFFERS;
+                index --;
+
+                if ( cpi->one_pass_frame_stats[index].frame_coded_error > 0.0 )
+                {
+                    IIRatio = cpi->one_pass_frame_stats[index].frame_intra_error / cpi->one_pass_frame_stats[index].frame_coded_error;
+
+                    if ( IIRatio > 30.0 )
+                        IIRatio = 30.0;
+                }
+                else
+                    IIRatio = 30.0;
+
+                IIAccumulator += IIRatio * decay_val;
+
+                decay_val = decay_val * cpi->one_pass_frame_stats[index].frame_pcnt_inter;
+
+                if (    (i > MIN_GF_INTERVAL) &&
+                        ((IIAccumulator - last_iiaccumulator) < 2.0) )
+                {
+                    break;
+                }
+                last_iiaccumulator = IIAccumulator;
+            }
+
+            Boost = IIAccumulator*100.0/16.0;
+            cpi->baseline_gf_interval = i;
+
+            */
+#else
+
+            /*************************************************************/
+            /* OLD code: this is the branch that is actually compiled */
+
+            /* Adjust boost based upon ambient Q. GFQ_ADJUSTMENT is a macro defined outside this function - presumably it reads the local Q; confirm before renaming locals. */
+            Boost = GFQ_ADJUSTMENT;
+
+            /* Adjust based upon most recently measured intra usage (table index clamped to 14, entry applied as value / 100) */
+            Boost = Boost * gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra : 14] / 100;
+
+            /* Adjust gf boost based upon GF usage since last GF (table entry applied as value / 100) */
+            Boost = Boost * gf_adjust_table[gf_frame_useage] / 100;
+#endif
+        }
+
+        /* Golden frame boost without recode loop often goes awry. Be
+         * safe by keeping numbers down (halved, and only when compressor_speed == 2).
+         */
+        if (!cpi->sf.recode_loop)
+        {
+            if (cpi->compressor_speed == 2)
+                Boost = Boost / 2;
+        }
+
+        /* Apply an upper limit based on Q for 1 pass encodes (cpi->pass == 0) */
+        if (Boost > kf_gf_boost_qlimits[Q] && (cpi->pass == 0))
+            Boost = kf_gf_boost_qlimits[Q];
+
+        /* Apply lower limit to boost. Being an else-if, this is skipped whenever the upper limit above was applied. */
+        else if (Boost < 110)
+            Boost = 110;
+
+        /* Note the boost used */
+        cpi->last_boost = Boost;
+
+    }
+
+    /* Estimate next interval (stretched beyond the baseline only when fixed_q == -1 and not two pass).
+     * This is updated once the real frame size/boost is known.
+     */
+    if (cpi->oxcf.fixed_q == -1)
+    {
+        if (cpi->pass == 2)         /* 2 Pass */
+        {
+            cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+        }
+        else                            /* 1 Pass */
+        {
+            cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+            /* Lengthen the interval by one frame for each boost threshold that last_boost reaches. */
+            if (cpi->last_boost > 750)
+                cpi->frames_till_gf_update_due++;
+
+            if (cpi->last_boost > 1000)
+                cpi->frames_till_gf_update_due++;
+
+            if (cpi->last_boost > 1250)
+                cpi->frames_till_gf_update_due++;
+
+            if (cpi->last_boost >= 1500)
+                cpi->frames_till_gf_update_due ++;
+            /* Raise to at least the table value for this usage level, then cap at max_gf_interval. */
+            if (gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due)
+                cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_useage];
+
+            if (cpi->frames_till_gf_update_due > cpi->max_gf_interval)
+                cpi->frames_till_gf_update_due = cpi->max_gf_interval;
+        }
+    }
+    else
+        cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; /* fixed_q is not -1: baseline interval, unmodified */
+
+    /* ARF on or off */
+    if (cpi->pass != 2)
+    {
+        /* For now Alt ref is not allowed except in 2 pass modes. */
+        cpi->source_alt_ref_pending = 0;
+
+        /*if ( cpi->oxcf.fixed_q == -1)
+        {
+            if ( cpi->oxcf.play_alternate && (cpi->last_boost > (100 + (AF_THRESH*cpi->frames_till_gf_update_due)) ) )
+                cpi->source_alt_ref_pending = 1;
+            else
+                cpi->source_alt_ref_pending = 0;
+        }*/
+    }
+}
+
+/* Work out cpi->this_frame_target for the current frame; also updates the active quality limits (one pass only), cpi->drop_frame and cpi->common.refresh_golden_frame. */
+static void calc_pframe_target_size(VP8_COMP *cpi)
+{
+    int min_frame_target;                                      /* lower bound applied to this_frame_target */
+    int old_per_frame_bandwidth = cpi->per_frame_bandwidth;    /* saved so it can be restored on exit */
+    /* For layers other than layer 0, temporarily use that layer's avg_frame_size_for_layer as the per-frame bandwidth. */
+    if ( cpi->current_layer > 0)
+        cpi->per_frame_bandwidth =
+            cpi->layer_context[cpi->current_layer].avg_frame_size_for_layer;
+
+    min_frame_target = 0;
+    /* Minimum target: two pass uses the larger of min_frame_bandwidth and av_per_frame_bandwidth >> 5; otherwise a quarter of per_frame_bandwidth. */
+    if (cpi->pass == 2)
+    {
+        min_frame_target = cpi->min_frame_bandwidth;
+
+        if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))
+            min_frame_target = cpi->av_per_frame_bandwidth >> 5;
+    }
+    else if (min_frame_target < cpi->per_frame_bandwidth / 4)
+        min_frame_target = cpi->per_frame_bandwidth / 4;
+
+
+    /* Special alt reference frame case (single-layer encodes only) */
+    if((cpi->common.refresh_alt_ref_frame) && (cpi->oxcf.number_of_layers == 1))
+    {
+        if (cpi->pass == 2)
+        {
+            /* Per frame bit target for the alt ref frame */
+            cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+            cpi->this_frame_target = cpi->per_frame_bandwidth;
+        }
+
+        /* One Pass ??? TBD: no target is assigned in this branch when cpi->pass != 2 */
+    }
+
+    /* Normal frames (gf and inter) */
+    else
+    {
+        /* 2 pass */
+        if (cpi->pass == 2)
+        {
+            cpi->this_frame_target = cpi->per_frame_bandwidth;
+        }
+        /* 1 pass */
+        else
+        {
+            int Adjustment;    /* in bits, except briefly a percentage in the small-boost step below */
+            /* Make rate adjustment to recover bits spent in key frame.
+             * Test to see if the key frame inter data rate correction
+             * should still be in force.
+             */
+            if (cpi->kf_overspend_bits > 0)
+            {
+                Adjustment = (cpi->kf_bitrate_adjustment <= cpi->kf_overspend_bits) ? cpi->kf_bitrate_adjustment : cpi->kf_overspend_bits;
+
+                if (Adjustment > (cpi->per_frame_bandwidth - min_frame_target))
+                    Adjustment = (cpi->per_frame_bandwidth - min_frame_target);
+
+                cpi->kf_overspend_bits -= Adjustment;
+
+                /* Calculate an inter frame bandwidth target for the next
+                 * few frames designed to recover any extra bits spent on
+                 * the key frame.
+                 */
+                cpi->this_frame_target = cpi->per_frame_bandwidth - Adjustment;
+
+                if (cpi->this_frame_target < min_frame_target)
+                    cpi->this_frame_target = min_frame_target;
+            }
+            else
+                cpi->this_frame_target = cpi->per_frame_bandwidth;
+
+            /* If appropriate make an adjustment to recover bits spent on a
+             * recent GF (only while the target is still above the minimum).
+             */
+            if ((cpi->gf_overspend_bits > 0) && (cpi->this_frame_target > min_frame_target))
+            {
+                Adjustment = (cpi->non_gf_bitrate_adjustment <= cpi->gf_overspend_bits) ? cpi->non_gf_bitrate_adjustment : cpi->gf_overspend_bits;
+
+                if (Adjustment > (cpi->this_frame_target - min_frame_target))
+                    Adjustment = (cpi->this_frame_target - min_frame_target);
+
+                cpi->gf_overspend_bits -= Adjustment;
+                cpi->this_frame_target -= Adjustment;
+            }
+
+            /* Apply small + and - boosts for non gf frames */
+            if ((cpi->last_boost > 150) && (cpi->frames_till_gf_update_due > 0) &&
+                (cpi->current_gf_interval >= (MIN_GF_INTERVAL << 1)))
+            {
+                /* % Adjustment limited to the range 1% to 10% */
+                Adjustment = (cpi->last_boost - 100) >> 5;
+
+                if (Adjustment < 1)
+                    Adjustment = 1;
+                else if (Adjustment > 10)
+                    Adjustment = 10;
+
+                /* Convert to bits */
+                Adjustment = (cpi->this_frame_target * Adjustment) / 100;
+
+                if (Adjustment > (cpi->this_frame_target - min_frame_target))
+                    Adjustment = (cpi->this_frame_target - min_frame_target);
+                /* The frame half way through the GF interval is raised by (interval - 1) adjustments (capped at 10%); every other frame is lowered by one adjustment. */
+                if (cpi->frames_since_golden == (cpi->current_gf_interval >> 1))
+                {
+                    Adjustment = (cpi->current_gf_interval - 1) * Adjustment;
+                    // Limit adjustment to 10% of current target.
+                    if (Adjustment > (10 * cpi->this_frame_target) / 100)
+                        Adjustment = (10 * cpi->this_frame_target) / 100;
+                    cpi->this_frame_target += Adjustment;
+                }
+                else
+                    cpi->this_frame_target -= Adjustment;
+            }
+        }
+    }
+
+    /* Sanity check that the total sum of adjustments is not above the
+     * maximum allowed. That is, having allowed for KF and GF penalties,
+     * we have not pushed the current interframe target too low. If the
+     * adjustment we apply here is not capable of recovering all the extra
+     * bits we have spent in the KF or GF then the remainder will have to
+     * be recovered over a longer time span via other buffer / rate control
+     * mechanisms.
+     */
+    if (cpi->this_frame_target < min_frame_target)
+        cpi->this_frame_target = min_frame_target;
+
+    if (!cpi->common.refresh_alt_ref_frame)
+        /* Note the baseline target data rate for this inter frame. */
+        cpi->inter_frame_target = cpi->this_frame_target;
+
+    /* One Pass specific code */
+    if (cpi->pass == 0)
+    {
+        /* Adapt target frame size with respect to any buffering constraints: */
+        if (cpi->buffered_mode)
+        {
+            int one_percent_bits = (int)
+                (1 + cpi->oxcf.optimal_buffer_level / 100);    /* the +1 presumably keeps this divisor non-zero */
+
+            if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) ||
+                (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
+            {
+                int percent_low = 0;
+
+                /* Decide whether or not we need to adjust the frame data
+                 * rate target.
+                 *
+                 * If we are below the optimal buffer fullness level
+                 * and adherence to buffering constraints is important to
+                 * the end usage then adjust the per frame target.
+                 */
+                if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+                    (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
+                {
+                    percent_low = (int)
+                        ((cpi->oxcf.optimal_buffer_level - cpi->buffer_level) /
+                        one_percent_bits);
+                }
+                /* Are we overshooting the long term clip data rate... */
+                else if (cpi->bits_off_target < 0)
+                {
+                    /* Adjust per frame data target downwards to compensate. */
+                    percent_low = (int)(100 * -cpi->bits_off_target /
+                                       (cpi->total_byte_count * 8));
+                }
+
+                if (percent_low > cpi->oxcf.under_shoot_pct)
+                    percent_low = cpi->oxcf.under_shoot_pct;
+                else if (percent_low < 0)
+                    percent_low = 0;
+
+                /* Lower the target bandwidth for this frame (by half of percent_low percent). */
+                cpi->this_frame_target -=
+                        (cpi->this_frame_target * percent_low) / 200;
+
+                /* Are we allowing control of active_worst_allowed_q
+                 * according to buffer level.
+                 */
+                if (cpi->auto_worst_q && cpi->ni_frames > 150)
+                {
+                    int64_t critical_buffer_level;
+
+                    /* For streaming applications the most important factor is
+                     * cpi->buffer_level as this takes into account the
+                     * specified short term buffering constraints. However,
+                     * hitting the long term clip data rate target is also
+                     * important.
+                     */
+                    if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+                    {
+                        /* Take the smaller of cpi->buffer_level and
+                         * cpi->bits_off_target
+                         */
+                        critical_buffer_level =
+                            (cpi->buffer_level < cpi->bits_off_target)
+                            ? cpi->buffer_level : cpi->bits_off_target;
+                    }
+                    /* For local file playback short term buffering constraints
+                     * are less of an issue
+                     */
+                    else
+                    {
+                        /* Consider only how we are doing for the clip as a
+                         * whole
+                         */
+                        critical_buffer_level = cpi->bits_off_target;
+                    }
+
+                    /* Set the active worst quality based upon the selected
+                     * buffer fullness number.
+                     */
+                    if (critical_buffer_level < cpi->oxcf.optimal_buffer_level)
+                    {
+                        if ( critical_buffer_level >
+                             (cpi->oxcf.optimal_buffer_level >> 2) )
+                        {
+                            int64_t qadjustment_range =
+                                      cpi->worst_quality - cpi->ni_av_qi;
+                            int64_t above_base =
+                                      (critical_buffer_level -
+                                       (cpi->oxcf.optimal_buffer_level >> 2));
+
+                            /* Step active worst quality down from
+                             * cpi->ni_av_qi when (critical_buffer_level ==
+                             * cpi->optimal_buffer_level) to
+                             * cpi->worst_quality when
+                             * (critical_buffer_level ==
+                             *     cpi->optimal_buffer_level >> 2)
+                             */
+                            cpi->active_worst_quality =
+                                cpi->worst_quality -
+                                (int)((qadjustment_range * above_base) /
+                                 (cpi->oxcf.optimal_buffer_level*3>>2));
+                        }
+                        else
+                        {
+                            cpi->active_worst_quality = cpi->worst_quality;
+                        }
+                    }
+                    else
+                    {
+                        cpi->active_worst_quality = cpi->ni_av_qi;
+                    }
+                }
+                else
+                {
+                    cpi->active_worst_quality = cpi->worst_quality;
+                }
+            }
+            else
+            {
+                int percent_high = 0;
+
+                if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+                     && (cpi->buffer_level > cpi->oxcf.optimal_buffer_level))
+                {
+                    percent_high = (int)((cpi->buffer_level
+                                    - cpi->oxcf.optimal_buffer_level)
+                                   / one_percent_bits);
+                }
+                else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level)
+                {
+                    percent_high = (int)((100 * cpi->bits_off_target)
+                                         / (cpi->total_byte_count * 8));
+                }
+
+                if (percent_high > cpi->oxcf.over_shoot_pct)
+                    percent_high = cpi->oxcf.over_shoot_pct;
+                else if (percent_high < 0)
+                    percent_high = 0;
+
+                cpi->this_frame_target += (cpi->this_frame_target *
+                                          percent_high) / 200;
+
+                /* Are we allowing control of active_worst_allowed_q according
+                 * to buffer level.
+                 */
+                if (cpi->auto_worst_q && cpi->ni_frames > 150)
+                {
+                    /* When using the relaxed buffer model stick to the
+                     * user specified value
+                     * NOTE(review): the code assigns cpi->ni_av_qi here - confirm this comment still applies. */
+                    cpi->active_worst_quality = cpi->ni_av_qi;
+                }
+                else
+                {
+                    cpi->active_worst_quality = cpi->worst_quality;
+                }
+            }
+
+            /* Set active_best_quality to prevent quality rising too high */
+            cpi->active_best_quality = cpi->best_quality;
+
+            /* Worst quality obviously must not be better than best quality */
+            if (cpi->active_worst_quality <= cpi->active_best_quality)
+                cpi->active_worst_quality = cpi->active_best_quality + 1;
+
+            if(cpi->active_worst_quality > 127)
+                cpi->active_worst_quality = 127;
+        }
+        /* Unbuffered mode (eg. video conferencing) */
+        else
+        {
+            /* Set the active worst quality */
+            cpi->active_worst_quality = cpi->worst_quality;
+        }
+
+        /* Special trap for constrained quality mode
+         * "active_worst_quality" may never drop below cq level
+         * for any frame type.
+         */
+        if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+             cpi->active_worst_quality < cpi->cq_target_quality)
+        {
+            cpi->active_worst_quality = cpi->cq_target_quality;
+        }
+    }
+
+    /* Test to see if we have to drop a frame.
+     * The auto-drop frame code is only used in buffered mode.
+     * In unbuffered mode (eg video conferencing) the decision to
+     * code or drop a frame is made outside the codec in response to real
+     * world comms or buffer considerations.
+     */
+    if (cpi->drop_frames_allowed &&
+        (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+        ((cpi->common.frame_type != KEY_FRAME)))
+    {
+        /* Check for a buffer underrun crisis in which case we have to drop
+         * a frame
+         */
+        if ((cpi->buffer_level < 0))
+        {
+#if 0
+            FILE *f = fopen("dec.stt", "a");
+            fprintf(f, "%10d %10d %10d %10d ***** BUFFER EMPTY\n",
+                    (int) cpi->common.current_video_frame,
+                    cpi->decimation_factor, cpi->common.horiz_scale,
+                    (cpi->buffer_level * 100) / cpi->oxcf.optimal_buffer_level);
+            fclose(f);
+#endif
+            cpi->drop_frame = 1;
+
+            /* Update the buffer level variable: add av_per_frame_bandwidth bits, capped at maximum_buffer_size. */
+            cpi->bits_off_target += cpi->av_per_frame_bandwidth;
+            if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
+              cpi->bits_off_target = (int)cpi->oxcf.maximum_buffer_size;
+            cpi->buffer_level = cpi->bits_off_target;
+
+            if (cpi->oxcf.number_of_layers > 1) {
+              unsigned int i;
+
+              // Propagate bits saved by dropping the frame to higher layers.
+              for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers;
+                  i++) {
+                LAYER_CONTEXT *lc = &cpi->layer_context[i];
+                lc->bits_off_target += (int)(lc->target_bandwidth /
+                                             lc->framerate);
+                if (lc->bits_off_target > lc->maximum_buffer_size)
+                  lc->bits_off_target = lc->maximum_buffer_size;
+                lc->buffer_level = lc->bits_off_target;
+              }
+            }
+        }
+    }
+
+    /* Adjust target frame size for Golden Frames (only when not error resilient, a GF update is due, and the frame is not being dropped): */
+    if (cpi->oxcf.error_resilient_mode == 0 &&
+        (cpi->frames_till_gf_update_due == 0) && !cpi->drop_frame)
+    {
+        int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+
+        int gf_frame_useage = 0;      /* Golden/alt-ref frame usage since last GF, as a percentage */
+        int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME]  +
+                      cpi->recent_ref_frame_usage[LAST_FRAME]   +
+                      cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+                      cpi->recent_ref_frame_usage[ALTREF_FRAME];
+
+        int pct_gf_active = (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols);
+
+        if (tot_mbs)
+            gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs;
+
+        if (pct_gf_active > gf_frame_useage)
+            gf_frame_useage = pct_gf_active;
+
+        /* Is a fixed manual GF frequency being used */
+        if (cpi->auto_gold)
+        {
+            /* For one pass throw a GF if recent frame intra usage is
+             * low or the GF usage is high
+             */
+            if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5))
+                cpi->common.refresh_golden_frame = 1;
+
+            /* Two pass GF decision */
+            else if (cpi->pass == 2)
+                cpi->common.refresh_golden_frame = 1;
+        }
+
+#if 0
+
+        /* Debug stats */
+        if (0)
+        {
+            FILE *f;
+
+            f = fopen("gf_useaget.stt", "a");
+            fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n",
+                    cpi->common.current_video_frame,  cpi->gfu_boost, GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage);
+            fclose(f);
+        }
+
+#endif
+
+        if (cpi->common.refresh_golden_frame == 1)
+        {
+#if 0
+
+            if (0)
+            {
+                FILE *f;
+
+                f = fopen("GFexit.stt", "a");
+                fprintf(f, "%8ld GF coded\n", cpi->common.current_video_frame);
+                fclose(f);
+            }
+
+#endif
+
+            if (cpi->auto_adjust_gold_quantizer)
+            {
+                calc_gf_params(cpi);
+            }
+
+            /* If we are using alternate ref instead of gf then do not apply the
+             * boost. It will instead be applied to the altref update. Jim's
+             * modified boost
+             */
+            if (!cpi->source_alt_ref_active)
+            {
+                if (cpi->oxcf.fixed_q < 0)
+                {
+                    if (cpi->pass == 2)
+                    {
+                        /* The spend on the GF is defined in the two pass
+                         * code for two pass encodes
+                         */
+                        cpi->this_frame_target = cpi->per_frame_bandwidth;
+                    }
+                    else
+                    {
+                        int Boost = cpi->last_boost;
+                        int frames_in_section = cpi->frames_till_gf_update_due + 1;
+                        int allocation_chunks = (frames_in_section * 100) + (Boost - 100);
+                        int bits_in_section = cpi->inter_frame_target * frames_in_section;
+
+                        /* Normalize Boost and allocation_chunks down to
+                         * prevent overflow
+                         */
+                        while (Boost > 1000)
+                        {
+                            Boost /= 2;
+                            allocation_chunks /= 2;
+                        }
+
+                        /* Avoid loss of precision but avoid overflow: divide first only when bits_in_section is large relative to allocation_chunks */
+                        if ((bits_in_section >> 7) > allocation_chunks)
+                            cpi->this_frame_target = Boost * (bits_in_section / allocation_chunks);
+                        else
+                            cpi->this_frame_target = (Boost * bits_in_section) / allocation_chunks;
+                    }
+                }
+                else    /* fixed Q: estimate_bits_at_q() result scaled by last_boost / 100 */
+                    cpi->this_frame_target =
+                        (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0)
+                         * cpi->last_boost) / 100;
+
+            }
+            /* If there is an active ARF at this location use the minimum
+             * bits on this frame even if it is a constructed arf.
+             * The active maximum quantizer ensures that an appropriate
+             * number of bits will be spent if needed for constructed ARFs.
+             */
+            else
+            {
+                cpi->this_frame_target = 0;
+            }
+
+            cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+        }
+    }
+
+    cpi->per_frame_bandwidth = old_per_frame_bandwidth;    /* undo the layer / alt-ref overrides made above */
+}
+
+/* Update the rate correction factor that matches the current frame type by comparing cpi->projected_frame_size with the size predicted at the frame's Q; damp_var (0, 1, anything else) selects progressively heavier damping. */
+void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var)
+{
+    int    Q = cpi->common.base_qindex;
+    int    correction_factor = 100;    /* percent; 100 leaves the factor unchanged */
+    double rate_correction_factor;
+    double adjustment_limit;           /* fraction of the measured error that is applied */
+
+    int    projected_size_based_on_q = 0;
+
+    /* Clear down mmx registers to allow floating point in what follows */
+    vp8_clear_system_state();
+
+    if (cpi->common.frame_type == KEY_FRAME)
+    {
+        rate_correction_factor = cpi->key_frame_rate_correction_factor;
+    }
+    else
+    {
+        if (cpi->oxcf.number_of_layers == 1 &&
+           (cpi->common.refresh_alt_ref_frame ||
+            cpi->common.refresh_golden_frame))
+            rate_correction_factor = cpi->gf_rate_correction_factor;
+        else
+            rate_correction_factor = cpi->rate_correction_factor;
+    }
+
+    /* Work out how big we would have expected the frame to be at this Q
+     * given the current correction factor. Stay in double to avoid int
+     * overflow when values are large
+     */
+    projected_size_based_on_q = (int)(((.5 + rate_correction_factor * vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
+
+    /* Make some allowance for cpi->zbin_over_quant */
+    if (cpi->mb.zbin_over_quant > 0)
+    {
+        int Z = cpi->mb.zbin_over_quant;
+        double Factor = 0.99;
+        double factor_adjustment = 0.01 / 256.0;
+        /* Shrink the projection once per zbin step, by a factor that starts at 0.99 and creeps up to at most 0.999. */
+        while (Z > 0)
+        {
+            Z --;
+            projected_size_based_on_q =
+                (int)(Factor * projected_size_based_on_q);
+            Factor += factor_adjustment;
+
+            if (Factor  >= 0.999)
+                Factor = 0.999;
+        }
+    }
+
+    /* Work out a size correction factor (percent). The product is formed in 64 bits so that 100 * size cannot overflow int for very large frames. */
+    if (projected_size_based_on_q > 0)
+        correction_factor = (int)((100 * (int64_t)cpi->projected_frame_size) / projected_size_based_on_q);
+
+    /* More heavily damped adjustment used if we have been oscillating
+     * either side of target
+     */
+    switch (damp_var)
+    {
+    case 0:
+        adjustment_limit = 0.75;
+        break;
+    case 1:
+        adjustment_limit = 0.375;
+        break;
+    case 2:
+    default:
+        adjustment_limit = 0.25;
+        break;
+    }
+    /* Only correct when the measured ratio is above 102% or below 99%; in between the factor is left alone. */
+    if (correction_factor > 102)
+    {
+        /* We are not already at the worst allowable quality */
+        correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
+        rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+        /* Keep rate_correction_factor within limits */
+        if (rate_correction_factor > MAX_BPB_FACTOR)
+            rate_correction_factor = MAX_BPB_FACTOR;
+    }
+    else if (correction_factor < 99)
+    {
+        /* We are not already at the best allowable quality */
+        correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
+        rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+        /* Keep rate_correction_factor within limits */
+        if (rate_correction_factor < MIN_BPB_FACTOR)
+            rate_correction_factor = MIN_BPB_FACTOR;
+    }
+    /* Store the result back into the same factor that was read at the top of the function. */
+    if (cpi->common.frame_type == KEY_FRAME)
+        cpi->key_frame_rate_correction_factor = rate_correction_factor;
+    else
+    {
+        if (cpi->oxcf.number_of_layers == 1 &&
+           (cpi->common.refresh_alt_ref_frame ||
+            cpi->common.refresh_golden_frame))
+            cpi->gf_rate_correction_factor = rate_correction_factor;
+        else
+            cpi->rate_correction_factor = rate_correction_factor;
+    }
+}
+
+
+int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
+{
+    int Q = cpi->active_worst_quality;
+
+    if (cpi->force_maxqp == 1) {
+      cpi->active_worst_quality = cpi->worst_quality;
+      return cpi->worst_quality;
+    }
+
+    /* Reset Zbin OQ value */
+    cpi->mb.zbin_over_quant = 0;
+
+    if (cpi->oxcf.fixed_q >= 0)
+    {
+        Q = cpi->oxcf.fixed_q;
+
+        if (cpi->common.frame_type == KEY_FRAME)
+        {
+            Q = cpi->oxcf.key_q;
+        }
+        else if (cpi->oxcf.number_of_layers == 1 &&
+            cpi->common.refresh_alt_ref_frame)
+        {
+            Q = cpi->oxcf.alt_q;
+        }
+        else if (cpi->oxcf.number_of_layers == 1  &&
+            cpi->common.refresh_golden_frame)
+        {
+            Q = cpi->oxcf.gold_q;
+        }
+    }
+    else
+    {
+        int i;
+        int last_error = INT_MAX;
+        int target_bits_per_mb;
+        int bits_per_mb_at_this_q;
+        double correction_factor;
+
+        /* Select the appropriate correction factor based upon type of frame. */
+        if (cpi->common.frame_type == KEY_FRAME)
+            correction_factor = cpi->key_frame_rate_correction_factor;
+        else
+        {
+            if (cpi->oxcf.number_of_layers == 1 &&
+               (cpi->common.refresh_alt_ref_frame ||
+                cpi->common.refresh_golden_frame))
+                correction_factor = cpi->gf_rate_correction_factor;
+            else
+                correction_factor = cpi->rate_correction_factor;
+        }
+
+        /* Calculate required scaling factor based on target frame size and
+         * size of frame produced using previous Q
+         */
+        if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
+            /* Case where we would overflow int */
+            target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS;
+        else
+            target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
+
+        i = cpi->active_best_quality;
+
+        do
+        {
+            bits_per_mb_at_this_q = (int)(.5 + correction_factor * vp8_bits_per_mb[cpi->common.frame_type][i]);
+
+            if (bits_per_mb_at_this_q <= target_bits_per_mb)
+            {
+                if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+                    Q = i;
+                else
+                    Q = i - 1;
+
+                break;
+            }
+            else
+                last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+        }
+        while (++i <= cpi->active_worst_quality);
+
+
+        /* If we are at MAXQ then enable Q over-run which seeks to claw
+         * back additional bits through things like the RD multiplier
+         * and zero bin size.
+         */
+        if (Q >= MAXQ)
+        {
+            int zbin_oqmax;
+
+            double Factor = 0.99;
+            double factor_adjustment = 0.01 / 256.0;
+
+            if (cpi->common.frame_type == KEY_FRAME)
+                zbin_oqmax = 0;
+            else if (cpi->oxcf.number_of_layers == 1 &&
+                (cpi->common.refresh_alt_ref_frame ||
+                (cpi->common.refresh_golden_frame &&
+                 !cpi->source_alt_ref_active)))
+                zbin_oqmax = 16;
+            else
+                zbin_oqmax = ZBIN_OQ_MAX;
+
+            /*{
+                double Factor = (double)target_bits_per_mb/(double)bits_per_mb_at_this_q;
+                double Oq;
+
+                Factor = Factor/1.2683;
+
+                Oq = pow( Factor, (1.0/-0.165) );
+
+                if ( Oq > zbin_oqmax )
+                    Oq = zbin_oqmax;
+
+                cpi->zbin_over_quant = (int)Oq;
+            }*/
+
+            /* Each increment in the zbin is assumed to have a fixed effect
+             * on bitrate. This is not of course true. The effect will be
+             * highly clip dependent and may well have sudden steps. The
+             * idea here is to achieve higher effective quantizers than the
+             * normal maximum by expanding the zero bin and hence
+             * decreasing the number of low magnitude non zero coefficients.
+             */
+            while (cpi->mb.zbin_over_quant < zbin_oqmax)
+            {
+                cpi->mb.zbin_over_quant ++;
+
+                if (cpi->mb.zbin_over_quant > zbin_oqmax)
+                    cpi->mb.zbin_over_quant = zbin_oqmax;
+
+                /* Adjust bits_per_mb_at_this_q estimate */
+                bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);
+                Factor += factor_adjustment;
+
+                if (Factor  >= 0.999)
+                    Factor = 0.999;
+
+                /* Break out if we get down to the target rate */
+                if (bits_per_mb_at_this_q <= target_bits_per_mb)
+                    break;
+            }
+
+        }
+    }
+
+    return Q;
+}
+
+
+static int estimate_keyframe_frequency(VP8_COMP *cpi)
+{
+    /* Estimated number of frames between key frames (the return value). */
+    int kf_interval_estimate = 0;
+    int idx;
+
+    if (cpi->key_frame_count != 1)
+    {
+        /* Normal case: push the interval that just ended onto the history
+         * of the last KEY_FRAME_CONTEXT intervals and return their weighted
+         * average.
+         */
+        unsigned int weight_sum = 0;
+        int newest_interval =
+                (cpi->frames_since_key > 0) ? cpi->frames_since_key : 1;
+
+        /* Slide the history down one slot, then append the newest value. */
+        for (idx = 0; idx < KEY_FRAME_CONTEXT - 1; idx++)
+            cpi->prior_key_frame_distance[idx]
+                = cpi->prior_key_frame_distance[idx+1];
+
+        cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1] = newest_interval;
+
+        for (idx = 0; idx < KEY_FRAME_CONTEXT; idx++)
+        {
+            weight_sum += prior_key_frame_weight[idx];
+            kf_interval_estimate += prior_key_frame_weight[idx]
+                                    * cpi->prior_key_frame_distance[idx];
+        }
+
+        kf_interval_estimate /= weight_sum;
+    }
+    else
+    {
+        /* Only one key frame so far, so there is no interval history yet.
+         * Start from one key frame every two seconds of output, capped at
+         * the configured key frame interval when auto_key is set, and seed
+         * the newest history slot with that value.
+         */
+        int max_interval = cpi->oxcf.key_freq>0 ? cpi->oxcf.key_freq : 1;
+
+        kf_interval_estimate = 1 + (int)cpi->output_framerate * 2;
+        if (cpi->oxcf.auto_key && kf_interval_estimate > max_interval)
+            kf_interval_estimate = max_interval;
+
+        cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
+            = kf_interval_estimate;
+    }
+
+    // TODO (marpan): Given the checks above, |kf_interval_estimate|
+    // should always be above 0. But for now we keep the sanity check in.
+    if (kf_interval_estimate == 0)
+        kf_interval_estimate = 1;
+    return kf_interval_estimate;
+}
+
+
+void vp8_adjust_key_frame_context(VP8_COMP *cpi)
+{
+    /* Reset the MMX register state so floating point can be used below. */
+    vp8_clear_system_state();
+
+    /* Key frame overspend is only booked here when pass != 2 (two-pass
+     * overspend is handled elsewhere) and the projected frame size went
+     * over the per-frame bandwidth.
+     */
+    if (cpi->pass != 2 &&
+        cpi->projected_frame_size > cpi->per_frame_bandwidth)
+    {
+        const int excess_bits =
+            cpi->projected_frame_size - cpi->per_frame_bandwidth;
+
+        if (cpi->oxcf.number_of_layers <= 1)
+        {
+            /* A key frame is also a golden frame, so 1/8 of the excess is
+             * booked as gf overspend (recovered more quickly); otherwise
+             * the frames right after a kf would get more bits than those
+             * after other gfs.
+             */
+            cpi->kf_overspend_bits += excess_bits * 7 / 8;
+            cpi->gf_overspend_bits += excess_bits * 1 / 8;
+        }
+        else
+            cpi->kf_overspend_bits += excess_bits;
+
+        /* Amount to recover per frame, spread over the estimated kf gap. */
+        cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits
+                                     / estimate_keyframe_frequency(cpi);
+    }
+
+    cpi->frames_since_key = 0;
+    cpi->key_frame_count++;
+}
+
+
+void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit)
+{
+    /* Set-up bounds on acceptable frame size (written to the two output pointers): */
+    if (cpi->oxcf.fixed_q >= 0)
+    {
+        /* Fixed Q scenario: frame size never outranges target
+         * (there is no target!)
+         */
+        *frame_under_shoot_limit = 0;
+        *frame_over_shoot_limit  = INT_MAX;
+    }
+    else
+    {
+        if (cpi->common.frame_type == KEY_FRAME)
+        {
+            *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
+            *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+        }
+        else
+        {
+            if (cpi->oxcf.number_of_layers > 1 ||
+                cpi->common.refresh_alt_ref_frame ||
+                cpi->common.refresh_golden_frame)
+            {
+                *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
+                *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+            }
+            else
+            {
+                /* For CBR take buffer fullness into account */
+                if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+                {
+                    if (cpi->buffer_level >= ((cpi->oxcf.optimal_buffer_level + cpi->oxcf.maximum_buffer_size) >> 1))
+                    {
+                        /* Buffer is too full so relax overshoot and tighten
+                         * undershoot
+                         */
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 12 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 6 / 8;
+                    }
+                    else if (cpi->buffer_level <= (cpi->oxcf.optimal_buffer_level >> 1))
+                    {
+                        /* Buffer is too low so relax undershoot and tighten
+                         * overshoot
+                         */
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 10 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 4 / 8;
+                    }
+                    else
+                    {
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+                    }
+                }
+                /* VBR and CQ mode */
+                /* Note that tighter restrictions here can help quality
+                 * but hurt encode speed
+                 */
+                else
+                {
+                    /* Constrained quality: same overshoot limit as below, looser (2/8) undershoot limit */
+                    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+                    {
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
+                    }
+                    else
+                    {
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+                    }
+                }
+            }
+        }
+
+        /* For very small rate targets where the fractional adjustment
+         * (eg * 7/8) may be tiny make sure there is at least a minimum
+         * range: widen both limits by 200 and clamp the lower one at 0.
+         */
+        *frame_over_shoot_limit += 200;
+        *frame_under_shoot_limit -= 200;
+        if ( *frame_under_shoot_limit < 0 )
+            *frame_under_shoot_limit = 0;
+
+    }
+}
+
+
+/* Run the target-size calculation for the current frame type.
+ * return of 0 means drop frame (inter frames only); 1 means encode it.
+ */
+int vp8_pick_frame_size(VP8_COMP *cpi)
+{
+    if (cpi->common.frame_type == KEY_FRAME)
+    {
+        /* Key frames are never reported as dropped here. */
+        calc_iframe_target_size(cpi);
+        return 1;
+    }
+
+    calc_pframe_target_size(cpi);
+    if (!cpi->drop_frame)
+        return 1;
+
+    /* Consume the drop request so it only applies to this frame. */
+    cpi->drop_frame = 0;
+    return 0;
+}
+// If this just encoded frame (mcomp/transform/quant, but before loopfilter and
+// pack_bitstream) has large overshoot, and was not being encoded close to the
+// max QP, then drop this frame and force next frame to be encoded at max QP.
+// Condition this on 1 pass CBR with screen content mode and frame dropper off.
+// TODO(marpan): Should do this exit condition during the encode_frame
+// (i.e., halfway during the encoding of the frame) to save cycles.
+// Returns 1 if the frame is dropped (force_maxqp is then set for the next
+// frame), 0 otherwise (force_maxqp is cleared).
+int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
+  if (cpi->pass == 0 &&
+      cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER &&
+      cpi->drop_frames_allowed == 0 &&
+      cpi->common.frame_type != KEY_FRAME) {
+    // Note: the "projected_frame_size" from encode_frame() only gives estimate
+    // of mode/motion vector rate (in non-rd mode): so below we only require
+    // that projected_frame_size is somewhat greater than per-frame-bandwidth,
+    // but add additional condition with high threshold on prediction residual.
+
+    // QP threshold: only allow dropping if we are not close to qp_max.
+    int thresh_qp = 3 * cpi->worst_quality >> 2;
+    // Rate threshold, in bytes.
+    int thresh_rate = 2 * (cpi->av_per_frame_bandwidth >> 3);
+    // Threshold for the average (over all macroblocks) of the pixel-sum
+    // residual error over 16x16 block. Should add QP dependence on threshold?
+    int thresh_pred_err_mb = (256 << 4);
+    int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs);
+    if (Q < thresh_qp &&
+        cpi->projected_frame_size > thresh_rate &&
+        pred_err_mb > thresh_pred_err_mb) {
+      double new_correction_factor = cpi->rate_correction_factor;
+      const int target_size = cpi->av_per_frame_bandwidth;
+      int target_bits_per_mb;
+      // Drop this frame: advance frame counters, and set force_maxqp flag.
+      cpi->common.current_video_frame++;
+      cpi->frames_since_key++;
+      // Flag to indicate we will force next frame to be encoded at max QP.
+      cpi->force_maxqp = 1;
+      // Reset the buffer levels.
+      cpi->buffer_level = cpi->oxcf.optimal_buffer_level;
+      cpi->bits_off_target = cpi->oxcf.optimal_buffer_level;
+      // Compute a new rate correction factor, corresponding to the current
+      // target frame size and max_QP, and adjust the rate correction factor
+      // upwards, if needed.
+      // This is to prevent a bad state where the re-encoded frame at max_QP
+      // undershoots significantly, and then we end up dropping every other
+      // frame because the QP/rate_correction_factor may have been too low
+      // before the drop and then takes too long to come up.
+      if (target_size >= (INT_MAX >> BPER_MB_NORMBITS))
+        target_bits_per_mb =
+            (target_size / cpi->common.MBs) << BPER_MB_NORMBITS;
+      else
+        target_bits_per_mb =
+            (target_size << BPER_MB_NORMBITS) / cpi->common.MBs;
+      // Rate correction factor based on target_size_per_mb and max_QP.
+      new_correction_factor = (double)target_bits_per_mb /
+          (double)vp8_bits_per_mb[INTER_FRAME][cpi->worst_quality];
+      if (new_correction_factor > cpi->rate_correction_factor)
+        cpi->rate_correction_factor =
+            VPXMIN(2.0 * cpi->rate_correction_factor, new_correction_factor);
+      if (cpi->rate_correction_factor > MAX_BPB_FACTOR)
+        cpi->rate_correction_factor = MAX_BPB_FACTOR;
+      return 1;
+    }
+  }
+  // Every non-drop path falls through to here: either the mode/frame-type
+  // conditions or the QP/rate/residual thresholds were not all met, so make
+  // sure the next frame is not forced to max QP.
+  cpi->force_maxqp = 0;
+  return 0;
+}
diff --git a/libs/libvpx/vp8/encoder/ratectrl.h b/libs/libvpx/vp8/encoder/ratectrl.h
new file mode 100644
index 0000000000..703de9ff55
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/ratectrl.h
@@ -0,0 +1,39 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_RATECTRL_H_
+#define VP8_ENCODER_RATECTRL_H_
+
+#include "onyx_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void vp8_save_coding_context(VP8_COMP *cpi);
+extern void vp8_restore_coding_context(VP8_COMP *cpi);
+
+extern void vp8_setup_key_frame(VP8_COMP *cpi);
+extern void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var);
+extern int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame);
+extern void vp8_adjust_key_frame_context(VP8_COMP *cpi);
+extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit);
+
+/* return of 0 means drop frame */
+extern int vp8_pick_frame_size(VP8_COMP *cpi);
+/* return of 1 means the just-encoded frame is dropped, 0 means it is kept */
+extern int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_RATECTRL_H_
diff --git a/libs/libvpx/vp8/encoder/rdopt.c b/libs/libvpx/vp8/encoder/rdopt.c
new file mode 100644
index 0000000000..ab0ad15990
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/rdopt.c
@@ -0,0 +1,2645 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include <assert.h>
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "tokenize.h"
+#include "treewriter.h"
+#include "onyx_int.h"
+#include "modecosts.h"
+#include "encodeintra.h"
+#include "pickinter.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/findnearmv.h"
+#include "vp8/common/quant_common.h"
+#include "encodemb.h"
+#include "vp8/encoder/quantize.h"
+#include "vpx_dsp/variance.h"
+#include "mcomp.h"
+#include "rdopt.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/systemdependent.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "denoising.h"
+#endif
+extern void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x);
+
+#define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
+
+typedef struct rate_distortion_struct
+{
+    int rate2;
+    int rate_y;
+    int rate_uv;
+    int distortion2;
+    int distortion_uv;
+} RATE_DISTORTION;
+
+typedef struct best_mode_struct
+{
+  int yrd;
+  int rd;
+  int intra_rd;
+  MB_MODE_INFO mbmode;
+  union b_mode_info bmodes[16];
+  PARTITION_INFO partition;
+} BEST_MODE;
+
+static const int auto_speed_thresh[17] =
+{
+    1000,
+    200,
+    150,
+    130,
+    150,
+    125,
+    120,
+    115,
+    115,
+    115,
+    115,
+    115,
+    115,
+    115,
+    115,
+    115,
+    105
+};
+
+const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES] =
+{
+    ZEROMV,
+    DC_PRED,
+
+    NEARESTMV,
+    NEARMV,
+
+    ZEROMV,
+    NEARESTMV,
+
+    ZEROMV,
+    NEARESTMV,
+
+    NEARMV,
+    NEARMV,
+
+    V_PRED,
+    H_PRED,
+    TM_PRED,
+
+    NEWMV,
+    NEWMV,
+    NEWMV,
+
+    SPLITMV,
+    SPLITMV,
+    SPLITMV,
+
+    B_PRED,
+};
+
+/* This table determines the search order in reference frame priority order,
+ * which may not necessarily match INTRA,LAST,GOLDEN,ARF
+ */
+const int vp8_ref_frame_order[MAX_MODES] =
+{
+    1,
+    0,
+
+    1,
+    1,
+
+    2,
+    2,
+
+    3,
+    3,
+
+    2,
+    3,
+
+    0,
+    0,
+    0,
+
+    1,
+    2,
+    3,
+
+    1,
+    2,
+    3,
+
+    0,
+};
+
+/* Fill the token cost table c from the coefficient probability table p. */
+static void fill_token_costs(
+    int c[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
+    const vp8_prob p[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]
+)
+{
+    int type, band, ctx;
+    for (type = 0; type < BLOCK_TYPES; ++type)
+    {
+        /* Lowest band that takes the vp8_cost_tokens2() path when the
+         * context is 0: band 2 for block type 0, band 1 for all others. */
+        const int min_band = (type == 0) ? 2 : 1;
+        for (band = 0; band < COEF_BANDS; ++band)
+            for (ctx = 0; ctx < PREV_COEF_CONTEXTS; ++ctx)
+                if (ctx == 0 && band >= min_band)
+                    vp8_cost_tokens2(c[type][band][ctx], p[type][band][ctx], vp8_coef_tree, 2);
+                else
+                    vp8_cost_tokens(c[type][band][ctx], p[type][band][ctx], vp8_coef_tree);
+    }
+}
+
+static const int rd_iifactor[32] =
+{
+    4, 4, 3, 2, 1, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* values are now correlated to quantizer */
+static const int sad_per_bit16lut[QINDEX_RANGE] =
+{
+    2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,
+    3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  4,  4,
+    4,  4,  4,  4,  4,  4,  4,  4,
+    4,  4,  5,  5,  5,  5,  5,  5,
+    5,  5,  5,  5,  5,  5,  6,  6,
+    6,  6,  6,  6,  6,  6,  6,  6,
+    6,  6,  7,  7,  7,  7,  7,  7,
+    7,  7,  7,  7,  7,  7,  8,  8,
+    8,  8,  8,  8,  8,  8,  8,  8,
+    8,  8,  9,  9,  9,  9,  9,  9,
+    9,  9,  9,  9,  9,  9,  10, 10,
+    10, 10, 10, 10, 10, 10, 11, 11,
+    11, 11, 11, 11, 12, 12, 12, 12,
+    12, 12, 13, 13, 13, 13, 14, 14
+};
+static const int sad_per_bit4lut[QINDEX_RANGE] =
+{
+    2,  2,  2,  2,  2,  2,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  4,  4,  4,  4,
+    4,  4,  4,  4,  4,  4,  5,  5,
+    5,  5,  5,  5,  6,  6,  6,  6,
+    6,  6,  6,  6,  6,  6,  6,  6,
+    7,  7,  7,  7,  7,  7,  7,  7,
+    7,  7,  7,  7,  7,  8,  8,  8,
+    8,  8,  9,  9,  9,  9,  9,  9,
+    10, 10, 10, 10, 10, 10, 10, 10,
+    11, 11, 11, 11, 11, 11, 11, 11,
+    12, 12, 12, 12, 12, 12, 12, 12,
+    13, 13, 13, 13, 13, 13, 13, 14,
+    14, 14, 14, 14, 15, 15, 15, 15,
+    16, 16, 16, 16, 17, 17, 17, 18,
+    18, 18, 19, 19, 19, 20, 20, 20,
+};
+
+void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex)
+{   /* Load the SAD-per-bit values for QIndex from the two lookup tables into cpi->mb. */
+    cpi->mb.sadperbit16 =  sad_per_bit16lut[QIndex];  /* QIndex is not range-checked here; tables hold QINDEX_RANGE entries */
+    cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex];
+}
+
+void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue)
+{
+    int q;
+    int i;
+    double capped_q = (Qvalue < 160) ? (double)Qvalue : 160.0;
+    double rdconst = 2.80;
+    /* Derives RDMULT/RDDIV, errorperbit, the per-mode rd thresholds and the token/mode cost tables from Qvalue. */
+    vp8_clear_system_state();
+
+    /* Further tests required to see if optimum is different
+     * for key frames, golden frames and arf frames.
+     */
+    cpi->RDMULT = (int)(rdconst * (capped_q * capped_q));
+
+    /* Extend rate multiplier along side quantizer zbin increases */
+    if (cpi->mb.zbin_over_quant  > 0)
+    {
+        double oq_factor;
+        double modq;
+
+        /* Experimental code using the same basic equation as used for Q above
+         * The units of cpi->mb.zbin_over_quant are 1/128 of Q bin size
+         */
+        oq_factor = 1.0 + ((double)0.0015625 * cpi->mb.zbin_over_quant);
+        modq = (int)((double)capped_q * oq_factor);
+        cpi->RDMULT = (int)(rdconst * (modq * modq));
+    }
+    /* Second pass, non-key frames: raise RDMULT by rd_iifactor[min(next_iiratio, 31)] / 16 of itself. */
+    if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME))
+    {
+        if (cpi->twopass.next_iiratio > 31)
+            cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
+        else
+            cpi->RDMULT +=
+                (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
+    }
+    /* errorperbit is RDMULT / 110, bumped to 1 if that comes out as 0. */
+    cpi->mb.errorperbit = (cpi->RDMULT / 110);
+    cpi->mb.errorperbit += (cpi->mb.errorperbit==0);
+
+    vp8_set_speed_features(cpi);
+    /* Reset the per-mode test hit counters. */
+    for (i = 0; i < MAX_MODES; i++)
+    {
+        x->mode_test_hit_counts[i] = 0;
+    }
+    /* Threshold scale factor: Qvalue^1.25 truncated to int, with a floor of 8. */
+    q = (int)pow(Qvalue, 1.25);
+
+    if (q < 8)
+        q = 8;
+    /* RDMULT > 1000: use RDDIV = 1, RDMULT / 100 and thresholds scaled by q / 100; otherwise RDDIV = 100 and thresholds scaled by q. */
+    if (cpi->RDMULT > 1000)
+    {
+        cpi->RDDIV = 1;
+        cpi->RDMULT /= 100;
+
+        for (i = 0; i < MAX_MODES; i++)
+        {
+            if (cpi->sf.thresh_mult[i] < INT_MAX)
+            {
+                x->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;  /* NOTE(review): this product is not guarded against int overflow, unlike the INT_MAX / q check in the branch below */
+            }
+            else
+            {
+                x->rd_threshes[i] = INT_MAX;
+            }
+
+            cpi->rd_baseline_thresh[i] = x->rd_threshes[i];
+        }
+    }
+    else
+    {
+        cpi->RDDIV = 100;
+
+        for (i = 0; i < MAX_MODES; i++)
+        {
+            if (cpi->sf.thresh_mult[i] < (INT_MAX / q))
+            {
+                x->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
+            }
+            else
+            {
+                x->rd_threshes[i] = INT_MAX;
+            }
+
+            cpi->rd_baseline_thresh[i] = x->rd_threshes[i];
+        }
+    }
+
+    {
+      /* build token cost array for the type of frame we have now:
+       * lfc_a when refreshing the alt ref, lfc_g when refreshing golden, else lfc_n */
+      FRAME_CONTEXT *l = &cpi->lfc_n;
+
+      if(cpi->common.refresh_alt_ref_frame)
+          l = &cpi->lfc_a;
+      else if(cpi->common.refresh_golden_frame)
+          l = &cpi->lfc_g;
+
+      fill_token_costs(
+          cpi->mb.token_costs,
+          (const vp8_prob( *)[8][3][11]) l->coef_probs
+      );
+      /*
+      fill_token_costs(
+          cpi->mb.token_costs,
+          (const vp8_prob( *)[8][3][11]) cpi->common.fc.coef_probs);
+      */
+
+
+      /* TODO make these mode costs depend on last,alt or gold too.  (jbb) */
+      vp8_init_mode_costs(cpi);
+    }
+
+}
+
+void vp8_auto_select_speed(VP8_COMP *cpi)
+{
+    int milliseconds_for_compress = (int)(1000000 / cpi->framerate);
+    /* NOTE(review): 1000000 / framerate is microseconds per frame if framerate is in frames per second, despite the name; presumably avg_*_time use the same unit - confirm. */
+    milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
+    /* Debug dump below is compiled out (#if 0). */
+#if 0
+
+    if (0)
+    {
+        FILE *f;
+
+        f = fopen("speed.stt", "a");
+        fprintf(f, " %8ld %10ld %10ld %10ld\n",
+                cpi->common.current_video_frame, cpi->Speed, milliseconds_for_compress, cpi->avg_pick_mode_time);
+        fclose(f);
+    }
+
+#endif
+    /* If both the mode-pick time and the remaining encode time fit the budget, fine-tune Speed; otherwise raise Speed by 4 (capped at 16). */
+    if (cpi->avg_pick_mode_time < milliseconds_for_compress && (cpi->avg_encode_time - cpi->avg_pick_mode_time) < milliseconds_for_compress)
+    {
+        if (cpi->avg_pick_mode_time == 0)
+        {
+            cpi->Speed = 4;
+        }
+        else
+        {
+            if (milliseconds_for_compress * 100 < cpi->avg_encode_time * 95)
+            {
+                cpi->Speed          += 2;
+                cpi->avg_pick_mode_time = 0;
+                cpi->avg_encode_time = 0;
+
+                if (cpi->Speed > 16)
+                {
+                    cpi->Speed = 16;
+                }
+            }
+            /* Budget exceeds the encode time by the percentage in auto_speed_thresh[Speed]: lower Speed by one (not below 4). */
+            if (milliseconds_for_compress * 100 > cpi->avg_encode_time * auto_speed_thresh[cpi->Speed])
+            {
+                cpi->Speed          -= 1;
+                cpi->avg_pick_mode_time = 0;
+                cpi->avg_encode_time = 0;
+
+                /* In real-time mode, cpi->speed is in [4, 16]. */
+                if (cpi->Speed < 4)
+                {
+                    cpi->Speed = 4;
+                }
+            }
+        }
+    }
+    else
+    {
+        cpi->Speed += 4;
+
+        if (cpi->Speed > 16)
+            cpi->Speed = 16;
+
+        /* Restart both timing averages after the jump. */
+        cpi->avg_pick_mode_time = 0;
+        cpi->avg_encode_time = 0;
+    }
+}
+
+int vp8_block_error_c(short *coeff, short *dqcoeff)
+{
+    /* Sum of squared differences over 16 coefficient pairs. */
+    const short *const coeff_end = coeff + 16;
+    int sse = 0;
+
+    while (coeff < coeff_end)
+    {
+        const int delta = *coeff++ - *dqcoeff++;
+        sse += delta * delta;
+    }
+    return sse;
+}
+
+int vp8_mbblock_error_c(MACROBLOCK *mb, int dc)
+{
+    /* Squared coefficient error summed over blocks 0..15, looking only at
+     * coefficients dc..15 of each block (so dc == 1 skips coefficient 0).
+     */
+    int total = 0;
+    int blk;
+
+    for (blk = 0; blk < 16; ++blk)
+    {
+        const short *src = mb->block[blk].coeff;
+        const short *deq = mb->e_mbd.block[blk].dqcoeff;
+        int block_sse = 0;
+        int pos;
+
+        for (pos = dc; pos < 16; ++pos)
+        {
+            const int delta = src[pos] - deq[pos];
+            block_sse += delta * delta;
+        }
+
+        total += block_sse;
+    }
+    return total;
+}
+
+int vp8_mbuverror_c(MACROBLOCK *mb)
+{
+    /* Blocks 16..23 of the macroblock: accumulate the per-block squared
+     * coefficient error reported by vp8_block_error_c() for each pair of
+     * coeff / dqcoeff buffers.
+     */
+    BLOCK  *src_blk = &mb->block[16];
+    BLOCKD *deq_blk = &mb->e_mbd.block[16];
+    int remaining = 8;
+    int total = 0;
+
+    while (remaining-- > 0)
+    {
+        total += vp8_block_error_c(src_blk->coeff, deq_blk->dqcoeff);
+        src_blk++;
+        deq_blk++;
+    }
+
+    return total;
+}
+
+int VP8_UVSSE(MACROBLOCK *x)
+{
+    /* Combined U + V 8x8 SSE between the source chroma data of blocks 16
+     * and 20 and the `pre` U/V buffers displaced by the macroblock motion
+     * vector.
+     */
+    MACROBLOCKD *const xd = &x->e_mbd;
+    unsigned char *const src_u = *(x->block[16].base_src) + x->block[16].src;
+    unsigned char *const src_v = *(x->block[20].base_src) + x->block[20].src;
+    const int src_stride = x->block[16].src_stride;
+    const int ref_stride = xd->pre.uv_stride;
+    int row = xd->mode_info_context->mbmi.mv.as_mv.row;
+    int col = xd->mode_info_context->mbmi.mv.as_mv.col;
+    unsigned char *ref_u, *ref_v;
+    unsigned int sse_u = 0;
+    unsigned int sse_v = 0;
+    int ref_offset;
+
+    /* Halve each component, rounding halves away from zero. */
+    row += (row < 0) ? -1 : 1;
+    col += (col < 0) ? -1 : 1;
+    row /= 2;
+    col /= 2;
+
+    /* Upper bits (>> 3) select the position in the pre buffers; the low
+     * three bits are passed on as the sub-pixel offsets.
+     */
+    ref_offset = (row >> 3) * ref_stride + (col >> 3);
+    ref_u = xd->pre.u_buffer + ref_offset;
+    ref_v = xd->pre.v_buffer + ref_offset;
+
+    if (((row | col) & 7) == 0)
+    {
+        /* No fractional part in either component. */
+        vpx_variance8x8(ref_u, ref_stride, src_u, src_stride, &sse_u);
+        vpx_variance8x8(ref_v, ref_stride, src_v, src_stride, &sse_v);
+    }
+    else
+    {
+        /* At least one component has a fractional part. */
+        vpx_sub_pixel_variance8x8(ref_u, ref_stride,
+            col & 7, row & 7, src_u, src_stride, &sse_u);
+        vpx_sub_pixel_variance8x8(ref_v, ref_stride,
+            col & 7, row & 7, src_v, src_stride, &sse_v);
+    }
+
+    /* U and V errors are reported as one combined value. */
+    sse_u += sse_v;
+    return sse_u;
+}
+
+static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
+{   /* Returns the accumulated token cost of block b's quantized coefficients; also rewrites *a and *l. */
+    int c = !type;              /* start at coef 0, unless Y with Y2 */
+    int eob = (int)(*b->eob);
+    int pt ;    /* surrounding block/prev coef predictor */
+    int cost = 0;
+    short *qcoeff_ptr = b->qcoeff;
+    /* Initial context pt is derived from the above (*a) and left (*l) entropy contexts. */
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+    assert(eob <= 16);
+    for (; c < eob; c++)
+    {
+        const int v = qcoeff_ptr[vp8_default_zig_zag1d[c]];
+        const int t = vp8_dct_value_tokens_ptr[v].Token;
+        cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [t];
+        cost += vp8_dct_value_cost_ptr[v];
+        pt = vp8_prev_token_class[t];
+    }
+    /* Add the end-of-block token cost unless all 16 positions were costed. */
+    if (c < 16)
+        cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [DCT_EOB_TOKEN];
+    /* Both contexts become 1 if c advanced past its starting position (!type), else 0. */
+    pt = (c != !type); /* is eob first coefficient; */
+    *a = *l = pt;
+
+    return cost;
+}
+
+static int vp8_rdcost_mby(MACROBLOCK *mb)
+{
+    /* Token cost of blocks 0..15 (PLANE_TYPE_Y_NO_DC) plus block 24
+     * (PLANE_TYPE_Y2), costed against local copies of the entropy contexts.
+     */
+    MACROBLOCKD *const xd = &mb->e_mbd;
+    ENTROPY_CONTEXT_PLANES above_copy, left_copy;
+    ENTROPY_CONTEXT *const above = (ENTROPY_CONTEXT *)&above_copy;
+    ENTROPY_CONTEXT *const left = (ENTROPY_CONTEXT *)&left_copy;
+    int total = 0;
+    int blk;
+
+    memcpy(&above_copy, xd->above_context, sizeof(above_copy));
+    memcpy(&left_copy, xd->left_context, sizeof(left_copy));
+
+    for (blk = 0; blk < 16; ++blk)
+        total += cost_coeffs(mb, xd->block + blk, PLANE_TYPE_Y_NO_DC,
+                     above + vp8_block2above[blk], left + vp8_block2left[blk]);
+
+    total += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,
+                 above + vp8_block2above[24], left + vp8_block2left[24]);
+
+    return total;
+}
+
+static void macro_block_yrd( MACROBLOCK *mb,
+                             int *Rate,
+                             int *Distortion)
+{
+    int b;
+    MACROBLOCKD *const x = &mb->e_mbd;
+    BLOCK   *const mb_y2 = mb->block + 24;
+    BLOCKD *const x_y2  = x->block + 24;
+    short *Y2DCPtr = mb_y2->src_diff;
+    BLOCK *beptr;
+    int d;
+    /* Writes the Y rate to *Rate and the Y distortion to *Distortion. First step: presumably source minus predictor into mb->src_diff - confirm. */
+    vp8_subtract_mby( mb->src_diff, *(mb->block[0].base_src),
+        mb->block[0].src_stride,  mb->e_mbd.predictor, 16);
+
+    /* Fdct and building the 2nd order block: each 8x4 transform covers two
+     * blocks, and coeff[0] / coeff[16] are copied into block 24's src_diff */
+    for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
+    {
+        mb->short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
+        *Y2DCPtr++ = beptr->coeff[0];
+        *Y2DCPtr++ = beptr->coeff[16];
+    }
+
+    /* 2nd order fdct */
+    mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
+
+    /* Quantization */
+    for (b = 0; b < 16; b++)
+    {
+        mb->quantize_b(&mb->block[b], &mb->e_mbd.block[b]);
+    }
+
+    /* Quantization of the 2nd order block (block 24) */
+    mb->quantize_b(mb_y2, x_y2);
+
+    /* Distortion: 4 x vp8_mbblock_error(mb, 1) plus the block 24 error, then >> 4 */
+    d = vp8_mbblock_error(mb, 1) << 2;
+    d += vp8_block_error(mb_y2->coeff, x_y2->dqcoeff);
+
+    *Distortion = (d >> 4);
+
+    /* rate */
+    *Rate = vp8_rdcost_mby(mb);
+}
+
+/* Copy the left 4x4 block (4 rows x 4 bytes, stride 16) from predictor to dst.
+ * Uses memcpy() instead of unsigned int loads/stores, so nothing is assumed
+ * about pointer alignment, aliasing or sizeof(int). */
+static void copy_predictor(unsigned char *dst, const unsigned char *predictor)
+{
+    int row;
+    for (row = 0; row < 4; row++)
+        memcpy(dst + 16 * row, predictor + 16 * row, 4);
+}
+static int rd_pick_intra4x4block(
+    MACROBLOCK *x,
+    BLOCK *be,
+    BLOCKD *b,
+    B_PREDICTION_MODE *best_mode,
+    const int *bmode_costs,
+    ENTROPY_CONTEXT *a,
+    ENTROPY_CONTEXT *l,
+    /* outputs below (and *best_mode, *a, *l) are written only when a mode improves on the best RD cost so far */
+    int *bestrate,
+    int *bestratey,
+    int *bestdistortion)
+{
+    B_PREDICTION_MODE mode;
+    int best_rd = INT_MAX;
+    int rate = 0;
+    int distortion;
+
+    ENTROPY_CONTEXT ta = *a, tempa = *a;
+    ENTROPY_CONTEXT tl = *l, templ = *l;
+    /*
+     * The predictor buffer is a 2d buffer with a stride of 16.  Create
+     * a temp buffer that meets the stride requirements, but we are only
+     * interested in the left 4x4 block
+     * */
+    DECLARE_ALIGNED(16, unsigned char,  best_predictor[16*4]);
+    DECLARE_ALIGNED(16, short, best_dqcoeff[16]);
+    int dst_stride = x->e_mbd.dst.y_stride;
+    unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset;
+
+    unsigned char *Above = dst - dst_stride;
+    unsigned char *yleft = dst - 1;
+    unsigned char top_left = Above[-1];
+    /* Try every mode from B_DC_PRED to B_HU_PRED and keep the one with the lowest RD cost. */
+    for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++)
+    {
+        int this_rd;
+        int ratey;
+
+        rate = bmode_costs[mode];
+
+        vp8_intra4x4_predict(Above, yleft, dst_stride, mode,
+                             b->predictor, 16, top_left);
+        vp8_subtract_b(be, b, 16);
+        x->short_fdct4x4(be->src_diff, be->coeff, 32);
+        x->quantize_b(be, b);
+        /* Each mode is costed from the entry contexts; cost_coeffs() updates the temporaries. */
+        tempa = ta;
+        templ = tl;
+
+        ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ);
+        rate += ratey;
+        distortion = vp8_block_error(be->coeff, b->dqcoeff) >> 2;
+
+        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+        if (this_rd < best_rd)
+        {
+            *bestrate = rate;
+            *bestratey = ratey;
+            *bestdistortion = distortion;
+            best_rd = this_rd;
+            *best_mode = mode;
+            *a = tempa;
+            *l = templ;
+            copy_predictor(best_predictor, b->predictor);
+            memcpy(best_dqcoeff, b->dqcoeff, 32);  /* 32 bytes: all 16 entries when sizeof(short) == 2 */
+        }
+    }
+    b->bmi.as_mode = *best_mode;  /* NOTE(review): reads the caller's value if no mode ever scored below INT_MAX */
+    /* Rebuild the chosen mode into dst from the saved predictor and dequantized coefficients. */
+    vp8_short_idct4x4llm(best_dqcoeff, best_predictor, 16, dst, dst_stride);
+
+    return best_rd;
+}
+
+/*
+ * Pick a 4x4 intra prediction mode for each of the 16 luma sub-blocks of
+ * a macroblock (B_PRED) and accumulate the resulting rate and distortion.
+ *
+ *   mb          encoder macroblock state
+ *   Rate        out: B_PRED mode cost plus the summed per-block rates
+ *   rate_y      out: summed coefficient rates
+ *   Distortion  out: summed distortions
+ *   best_rd     RD cost to beat; the search is abandoned as soon as the
+ *               accumulated per-block RD cost reaches it
+ *
+ * Returns INT_MAX (leaving the out parameters untouched) when the search
+ * is abandoned, otherwise the RDCOST of the accumulated rate/distortion.
+ */
+static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate,
+                                     int *rate_y, int *Distortion, int best_rd)
+{
+    MACROBLOCKD *const xd = &mb->e_mbd;
+    ENTROPY_CONTEXT_PLANES above_planes, left_planes;
+    ENTROPY_CONTEXT *above_ctx;
+    ENTROPY_CONTEXT *left_ctx;
+    const int *bmode_costs;
+    int64_t rd_accum = 0;
+    int rate_accum = mb->mbmode_cost [xd->frame_type] [B_PRED];
+    int rate_y_accum = 0;
+    int dist_accum = 0;
+    int blk;
+
+    /* Work on private copies of the entropy contexts. */
+    memcpy(&above_planes, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&left_planes, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    above_ctx = (ENTROPY_CONTEXT *)&above_planes;
+    left_ctx = (ENTROPY_CONTEXT *)&left_planes;
+
+    intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16);
+
+    /* Default cost table; key frames select a context-dependent one below. */
+    bmode_costs = mb->inter_bmode_costs;
+
+    for (blk = 0; blk < 16; blk++)
+    {
+        MODE_INFO *const mic = xd->mode_info_context;
+        const int mis = xd->mode_info_stride;
+        B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(picked_mode);
+        int UNINITIALIZED_IS_SAFE(blk_rate), UNINITIALIZED_IS_SAFE(blk_rate_y), UNINITIALIZED_IS_SAFE(blk_dist);
+
+        if (xd->frame_type == KEY_FRAME)
+        {
+            /* On key frames the mode cost depends on the above and left
+             * neighbours' modes.
+             */
+            const B_PREDICTION_MODE A = above_block_mode(mic, blk, mis);
+            const B_PREDICTION_MODE L = left_block_mode(mic, blk);
+
+            bmode_costs = mb->bmode_costs[A][L];
+        }
+
+        rd_accum += rd_pick_intra4x4block(
+            mb, mb->block + blk, xd->block + blk, &picked_mode, bmode_costs,
+            above_ctx + vp8_block2above[blk],
+            left_ctx + vp8_block2left[blk],
+            &blk_rate, &blk_rate_y, &blk_dist);
+
+        rate_accum += blk_rate;
+        dist_accum += blk_dist;
+        rate_y_accum += blk_rate_y;
+
+        mic->bmi[blk].as_mode = picked_mode;
+
+        /* Already no better than the best known cost: give up. */
+        if (rd_accum >= (int64_t)best_rd)
+            return INT_MAX;
+    }
+
+    *Rate = rate_accum;
+    *rate_y = rate_y_accum;
+    *Distortion = dist_accum;
+
+    return RDCOST(mb->rdmult, mb->rddiv, rate_accum, dist_accum);
+}
+
+
+/*
+ * Try every 16x16 luma intra prediction mode from DC_PRED through TM_PRED
+ * and keep the one with the lowest RD cost.
+ *
+ *   x           encoder macroblock state
+ *   Rate        out: coefficient rate plus mode cost of the winner
+ *   rate_y      out: coefficient rate of the winner
+ *   Distortion  out: distortion of the winner
+ *
+ * The winning mode is stored in the macroblock's mode info.  Returns the
+ * RD cost of the winner.
+ */
+static int rd_pick_intra16x16mby_mode(MACROBLOCK *x,
+                                      int *Rate,
+                                      int *rate_y,
+                                      int *Distortion)
+{
+    MACROBLOCKD *xd = &x->e_mbd;
+    MB_PREDICTION_MODE candidate;
+    MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(winner);
+    int lowest_rd = INT_MAX;
+
+    /* Y Search for 16x16 intra prediction mode */
+    for (candidate = DC_PRED; candidate <= TM_PRED; candidate++)
+    {
+        int coef_rate;
+        int total_rate;
+        int dist;
+        int cand_rd;
+
+        xd->mode_info_context->mbmi.mode = candidate;
+
+        vp8_build_intra_predictors_mby_s(xd,
+                                         xd->dst.y_buffer - xd->dst.y_stride,
+                                         xd->dst.y_buffer - 1,
+                                         xd->dst.y_stride,
+                                         xd->predictor,
+                                         16);
+
+        macro_block_yrd(x, &coef_rate, &dist);
+        total_rate = coef_rate + x->mbmode_cost[xd->frame_type]
+                                               [xd->mode_info_context->mbmi.mode];
+
+        cand_rd = RDCOST(x->rdmult, x->rddiv, total_rate, dist);
+
+        if (cand_rd >= lowest_rd)
+            continue;
+
+        winner = candidate;
+        lowest_rd = cand_rd;
+        *Rate = total_rate;
+        *rate_y = coef_rate;
+        *Distortion = dist;
+    }
+
+    xd->mode_info_context->mbmi.mode = winner;
+    return lowest_rd;
+}
+
+/*
+ * Sum the coefficient cost of the chroma blocks (block indices 16..23)
+ * of a macroblock.  The entropy contexts are copied first, so the
+ * macroblock's own above/left contexts are not modified.
+ */
+static int rd_cost_mbuv(MACROBLOCK *mb)
+{
+    MACROBLOCKD *const xd = &mb->e_mbd;
+    ENTROPY_CONTEXT_PLANES above_planes, left_planes;
+    ENTROPY_CONTEXT *above_ctx;
+    ENTROPY_CONTEXT *left_ctx;
+    int total = 0;
+    int blk = 16;
+
+    memcpy(&above_planes, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&left_planes, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    above_ctx = (ENTROPY_CONTEXT *)&above_planes;
+    left_ctx = (ENTROPY_CONTEXT *)&left_planes;
+
+    while (blk < 24)
+    {
+        total += cost_coeffs(mb, xd->block + blk, PLANE_TYPE_UV,
+                             above_ctx + vp8_block2above[blk],
+                             left_ctx + vp8_block2left[blk]);
+        blk++;
+    }
+
+    return total;
+}
+
+
+/*
+ * Rate and distortion of the chroma planes for a 16x16 inter-predicted
+ * macroblock: build the prediction, then subtract, transform and
+ * quantize the residual.
+ *
+ *   cpi, fullpixel  unused
+ *   rate            out: chroma coefficient cost
+ *   distortion      out: chroma error divided by 4
+ *
+ * Returns the RDCOST of the two outputs.
+ */
+static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
+                            int *distortion, int fullpixel)
+{
+    MACROBLOCKD *const xd = &x->e_mbd;
+
+    (void)cpi;
+    (void)fullpixel;
+
+    vp8_build_inter16x16_predictors_mbuv(xd);
+
+    /* U prediction starts at predictor[256], V at predictor[320]. */
+    vp8_subtract_mbuv(x->src_diff,
+                      x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+                      &xd->predictor[256], &xd->predictor[320], 8);
+    vp8_transform_mbuv(x);
+    vp8_quantize_mbuv(x);
+
+    *rate = rd_cost_mbuv(x);
+    *distortion = vp8_mbuverror(x) / 4;
+
+    return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+/*
+ * Rate and distortion of the chroma planes for a macroblock using 4x4
+ * (split) inter prediction: build the prediction, then subtract,
+ * transform and quantize the residual.
+ *
+ *   cpi, fullpixel  unused
+ *   rate            out: chroma coefficient cost
+ *   distortion      out: chroma error divided by 4
+ *
+ * Returns the RDCOST of the two outputs.
+ */
+static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
+                          int *distortion, int fullpixel)
+{
+    MACROBLOCKD *const xd = &x->e_mbd;
+
+    (void)cpi;
+    (void)fullpixel;
+
+    vp8_build_inter4x4_predictors_mbuv(xd);
+
+    /* U prediction starts at predictor[256], V at predictor[320]. */
+    vp8_subtract_mbuv(x->src_diff,
+                      x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+                      &xd->predictor[256], &xd->predictor[320], 8);
+    vp8_transform_mbuv(x);
+    vp8_quantize_mbuv(x);
+
+    *rate = rd_cost_mbuv(x);
+    *distortion = vp8_mbuverror(x) / 4;
+
+    return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+/*
+ * Try every chroma intra prediction mode from DC_PRED through TM_PRED and
+ * keep the one with the lowest RD cost.
+ *
+ *   x               encoder macroblock state
+ *   rate            out: coefficient cost plus mode cost of the winner
+ *   rate_tokenonly  out: coefficient cost of the winner
+ *   distortion      out: distortion of the winner
+ *
+ * The winning mode is stored as the macroblock's uv_mode.
+ */
+static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
+                                    int *rate_tokenonly, int *distortion)
+{
+    MACROBLOCKD *xd = &x->e_mbd;
+    MB_PREDICTION_MODE candidate;
+    MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(winner);
+    int UNINITIALIZED_IS_SAFE(winner_dist), UNINITIALIZED_IS_SAFE(winner_rate);
+    int lowest_rd = INT_MAX;
+    int token_rate;
+
+    for (candidate = DC_PRED; candidate <= TM_PRED; candidate++)
+    {
+        int cand_rate;
+        int cand_dist;
+        int cand_rd;
+
+        xd->mode_info_context->mbmi.uv_mode = candidate;
+
+        /* Predict U and V from the rows above and the columns to the left. */
+        vp8_build_intra_predictors_mbuv_s(xd,
+                                          xd->dst.u_buffer - xd->dst.uv_stride,
+                                          xd->dst.v_buffer - xd->dst.uv_stride,
+                                          xd->dst.u_buffer - 1,
+                                          xd->dst.v_buffer - 1,
+                                          xd->dst.uv_stride,
+                                          &xd->predictor[256], &xd->predictor[320],
+                                          8);
+
+        vp8_subtract_mbuv(x->src_diff,
+                          x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+                          &xd->predictor[256], &xd->predictor[320], 8);
+        vp8_transform_mbuv(x);
+        vp8_quantize_mbuv(x);
+
+        token_rate = rd_cost_mbuv(x);
+        cand_rate = token_rate
+                    + x->intra_uv_mode_cost[xd->frame_type][xd->mode_info_context->mbmi.uv_mode];
+        cand_dist = vp8_mbuverror(x) / 4;
+        cand_rd = RDCOST(x->rdmult, x->rddiv, cand_rate, cand_dist);
+
+        if (cand_rd >= lowest_rd)
+            continue;
+
+        lowest_rd = cand_rd;
+        winner_dist = cand_dist;
+        winner_rate = cand_rate;
+        *rate_tokenonly = token_rate;
+        winner = candidate;
+    }
+
+    *rate = winner_rate;
+    *distortion = winner_dist;
+
+    xd->mode_info_context->mbmi.uv_mode = winner;
+}
+
+/*
+ * Cost of signalling inter mode m, given the near-MV reference counts
+ * from which the mode probabilities are derived.  m must lie in
+ * [NEARESTMV, SPLITMV].
+ */
+int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4])
+{
+    vp8_prob mode_probs[VP8_MVREFS-1];
+
+    assert(NEARESTMV <= m  &&  m <= SPLITMV);
+
+    vp8_mv_ref_probs(mode_probs, near_mv_ref_ct);
+
+    /* The encoding table is indexed relative to NEARESTMV. */
+    return vp8_cost_token(vp8_mv_ref_tree, mode_probs,
+                          vp8_mv_ref_encoding_array + (m - NEARESTMV));
+}
+
+/* Store prediction mode mb and motion vector *mv in the mode info of the
+ * macroblock currently referenced by x.
+ */
+void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv)
+{
+    MODE_INFO *const mi = x->e_mbd.mode_info_context;
+
+    mi->mbmi.mode = mb;
+    mi->mbmi.mv.as_int = mv->as_int;
+}
+
+/*
+ * Apply a sub-macroblock mode to every 4x4 block carrying the given label
+ * and return the cost of signalling it.
+ *
+ *   x            encoder macroblock state
+ *   labelings    16-entry map from block index to label
+ *   which_label  label being processed
+ *   this_mode    mode to apply (LEFT4X4, ABOVE4X4, ZERO4X4 or NEW4X4)
+ *   this_mv      in/out: motion vector; an input for NEW4X4, otherwise
+ *                overwritten with the vector implied by this_mode
+ *   best_ref_mv  reference vector used when costing a NEW4X4 vector
+ *   mvcost       motion vector cost tables
+ *
+ * For every block with the label, the vector is written to the block's
+ * BLOCKD entry and to x->partition_info->bmi[], together with the mode.
+ *
+ * Returns the mode cost taken from x->inter_bmode_costs plus, for NEW4X4,
+ * the motion vector bit cost.
+ */
+static int labels2mode(
+    MACROBLOCK *x,
+    int const *labelings, int which_label,
+    B_PREDICTION_MODE this_mode,
+    int_mv *this_mv, int_mv *best_ref_mv,
+    int *mvcost[2]
+)
+{
+    MACROBLOCKD *const xd = & x->e_mbd;
+    MODE_INFO *const mic = xd->mode_info_context;
+    const int mis = xd->mode_info_stride;
+
+    int cost = 0;
+    int thismvcost = 0;
+
+    /* We have to be careful retrieving previously-encoded motion vectors.
+       Ones from this macroblock have to be pulled from the BLOCKD array
+       as they have not yet made it to the bmi array in our MB_MODE_INFO. */
+
+    int i = 0;
+
+    do
+    {
+        BLOCKD *const d = xd->block + i;
+        const int row = i >> 2,  col = i & 3;
+
+        B_PREDICTION_MODE m;
+
+        /* In a do/while, continue jumps to the ++i < 16 test below. */
+        if (labelings[i] != which_label)
+            continue;
+
+        /* A block whose left or above neighbour inside this macroblock
+         * has the same label simply inherits from that neighbour.
+         */
+        if (col  &&  labelings[i] == labelings[i-1])
+            m = LEFT4X4;
+        else if (row  &&  labelings[i] == labelings[i-4])
+            m = ABOVE4X4;
+        else
+        {
+            /* the only time we should do costing for new motion vector
+             * or mode is when we are on a new label  (jbb May 08, 2007)
+             */
+            switch (m = this_mode)
+            {
+            case NEW4X4 :
+                thismvcost  = vp8_mv_bit_cost(this_mv, best_ref_mv, mvcost, 102);
+                break;
+            case LEFT4X4:
+                /* Column 0 has no left block in this macroblock, so the
+                 * vector comes from the mode info instead.
+                 */
+                this_mv->as_int = col ? d[-1].bmi.mv.as_int : left_block_mv(mic, i);
+                break;
+            case ABOVE4X4:
+                /* Row 0 has no block above in this macroblock, so the
+                 * vector comes from the mode info instead.
+                 */
+                this_mv->as_int = row ? d[-4].bmi.mv.as_int : above_block_mv(mic, i, mis);
+                break;
+            case ZERO4X4:
+                this_mv->as_int = 0;
+                break;
+            default:
+                break;
+            }
+
+            if (m == ABOVE4X4)  /* replace above with left if same */
+            {
+                int_mv left_mv;
+
+                left_mv.as_int = col ? d[-1].bmi.mv.as_int :
+                                        left_block_mv(mic, i);
+
+                if (left_mv.as_int == this_mv->as_int)
+                    m = LEFT4X4;
+            }
+
+            cost = x->inter_bmode_costs[ m];
+        }
+
+        /* Record the vector in the BLOCKD array and the partition info. */
+        d->bmi.mv.as_int = this_mv->as_int;
+
+        x->partition_info->bmi[i].mode = m;
+        x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
+
+    }
+    while (++i < 16);
+
+    cost += thismvcost ;
+    return cost;
+}
+
+/*
+ * Sum the luma coefficient cost of every 4x4 block whose entry in labels
+ * equals which_label.  ta and tl are the above/left entropy context
+ * arrays handed to cost_coeffs() for each block.
+ */
+static int rdcost_mbsegment_y(MACROBLOCK *mb, const int *labels,
+                              int which_label, ENTROPY_CONTEXT *ta,
+                              ENTROPY_CONTEXT *tl)
+{
+    MACROBLOCKD *const xd = &mb->e_mbd;
+    int total = 0;
+    int blk;
+
+    for (blk = 0; blk < 16; blk++)
+    {
+        /* Blocks belonging to other labels do not contribute. */
+        if (labels[blk] != which_label)
+            continue;
+
+        total += cost_coeffs(mb, xd->block + blk, PLANE_TYPE_Y_WITH_DC,
+                             ta + vp8_block2above[blk],
+                             tl + vp8_block2left[blk]);
+    }
+
+    return total;
+}
+/*
+ * Inter-predict, transform and quantize every 4x4 luma block whose entry
+ * in labels equals which_label, and return the summed block error between
+ * the transform coefficients and their dequantized values.
+ */
+static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, int const *labels, int which_label)
+{
+    unsigned char *const ref_base = x->e_mbd.pre.y_buffer;
+    const int ref_stride = x->e_mbd.pre.y_stride;
+    unsigned int total_error = 0;
+    int blk;
+
+    for (blk = 0; blk < 16; blk++)
+    {
+        BLOCKD *dec_blk;
+        BLOCK *enc_blk;
+
+        if (labels[blk] != which_label)
+            continue;
+
+        dec_blk = &x->e_mbd.block[blk];
+        enc_blk = &x->block[blk];
+
+        vp8_build_inter_predictors_b(dec_blk, 16, ref_base, ref_stride, x->e_mbd.subpixel_predict);
+        vp8_subtract_b(enc_blk, dec_blk, 16);
+        x->short_fdct4x4(enc_blk->src_diff, enc_blk->coeff, 32);
+        x->quantize_b(enc_blk, dec_blk);
+
+        total_error += vp8_block_error(enc_blk->coeff, dec_blk->dqcoeff);
+    }
+
+    return total_error;
+}
+
+
+/* Right shift applied to the best motion-search score before it is
+ * compared against the full-search threshold in rd_check_segment();
+ * indexed by segmentation type.
+ * NOTE(review): entry order presumably follows the BLOCK_16X8, BLOCK_8X16,
+ * BLOCK_8X8, BLOCK_4X4 numbering - confirm against the enum.
+ */
+static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
+
+
+/* Running state of the split-mode (SPLITMV) segmentation search: the
+ * inputs shared by every rd_check_segment() call plus the best result
+ * found so far.
+ */
+typedef struct
+{
+  int_mv *ref_mv;   /* reference MV handed to the MV search/cost routines */
+  int_mv mvp;       /* MV predictor used as the NEW4X4 search start point */
+
+  int segment_rd;   /* best RD cost so far; also the cost to beat */
+  int segment_num;  /* segmentation type that produced segment_rd */
+  int r;            /* rate of the best segmentation */
+  int d;            /* distortion of the best segmentation */
+  int segment_yrate;            /* luma coefficient rate of the best segmentation */
+  B_PREDICTION_MODE modes[16];  /* per-4x4-block modes of the best segmentation */
+  int_mv mvs[16];               /* per-4x4-block MVs of the best segmentation */
+  unsigned char eobs[16];       /* per-4x4-block eob values of the best segmentation */
+
+  int mvthresh;     /* threshold from which the per-label new-MV-search cutoff is derived */
+  int *mdcounts;    /* counts passed to vp8_cost_mv_ref() when costing SPLITMV */
+
+  int_mv sv_mvp[4]; /* save 4 mvp from 8x8 */
+  int sv_istep[2];  /* save 2 initial step_param for 16x8/8x16 */
+
+} BEST_SEG_INFO;
+
+
+/*
+ * Evaluate one way of splitting the macroblock (segmentation) and update
+ * bsi if it beats the best segmentation found so far.
+ *
+ * For each label of the segmentation, the modes LEFT4X4 through NEW4X4
+ * are tried; NEW4X4 runs a diamond motion search (plus a full search when
+ * cpi->compressor_speed is 0 and the diamond result is poor) followed by
+ * a fractional refinement.  The cheapest mode per label is re-applied via
+ * labels2mode() and its rate/distortion accumulated.  The evaluation
+ * stops early once the accumulated RD cost reaches bsi->segment_rd.
+ *
+ *   cpi           encoder instance (search function pointers, speed)
+ *   x             encoder macroblock state
+ *   bsi           in/out: search inputs and best result so far
+ *   segmentation  segmentation type to evaluate
+ */
+static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
+                             BEST_SEG_INFO *bsi, unsigned int segmentation)
+{
+    int i;
+    int const *labels;
+    int br = 0;
+    int bd = 0;
+    B_PREDICTION_MODE this_mode;
+
+
+    int label_count;
+    int this_segment_rd = 0;
+    int label_mv_thresh;
+    int rate = 0;
+    int sbr = 0;
+    int sbd = 0;
+    int segmentyrate = 0;
+
+    vp8_variance_fn_ptr_t *v_fn_ptr;
+
+    /* t_above/t_left: contexts after the labels decided so far.
+     * t_above_b/t_left_b: contexts produced by the best mode of the label
+     * currently being searched.
+     */
+    ENTROPY_CONTEXT_PLANES t_above, t_left;
+    ENTROPY_CONTEXT *ta;
+    ENTROPY_CONTEXT *tl;
+    ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
+    ENTROPY_CONTEXT *ta_b;
+    ENTROPY_CONTEXT *tl_b;
+
+    memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+    ta_b = (ENTROPY_CONTEXT *)&t_above_b;
+    tl_b = (ENTROPY_CONTEXT *)&t_left_b;
+
+    br = 0;
+    bd = 0;
+
+    v_fn_ptr = &cpi->fn_ptr[segmentation];
+    labels = vp8_mbsplits[segmentation];
+    label_count = vp8_mbsplit_count[segmentation];
+
+    /* 64 makes this threshold really big effectively making it so that we
+     * very rarely check mvs on segments.   setting this to 1 would make mv
+     * thresh roughly equal to what it is for macroblocks
+     */
+    label_mv_thresh = 1 * bsi->mvthresh / label_count ;
+
+    /* Segmentation method overheads */
+    rate = vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + segmentation);
+    rate += vp8_cost_mv_ref(SPLITMV, bsi->mdcounts);
+    this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+    br += rate;
+
+    for (i = 0; i < label_count; i++)
+    {
+        int_mv mode_mv[B_MODE_COUNT];
+        int best_label_rd = INT_MAX;
+        B_PREDICTION_MODE mode_selected = ZERO4X4;
+        int bestlabelyrate = 0;
+
+        /* search for the best motion vector on this segment */
+        for (this_mode = LEFT4X4; this_mode <= NEW4X4 ; this_mode ++)
+        {
+            int this_rd;
+            int distortion;
+            int labelyrate;
+            /* Scratch contexts for this mode, seeded from the state after
+             * the previously decided labels.
+             */
+            ENTROPY_CONTEXT_PLANES t_above_s, t_left_s;
+            ENTROPY_CONTEXT *ta_s;
+            ENTROPY_CONTEXT *tl_s;
+
+            memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
+            memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
+
+            ta_s = (ENTROPY_CONTEXT *)&t_above_s;
+            tl_s = (ENTROPY_CONTEXT *)&t_left_s;
+
+            if (this_mode == NEW4X4)
+            {
+                int sseshift;
+                int num00;
+                int step_param = 0;
+                int further_steps;
+                int n;
+                int thissme;
+                int bestsme = INT_MAX;
+                int_mv  temp_mv;
+                BLOCK *c;
+                BLOCKD *e;
+
+                /* Is the best so far sufficiently good that we cant justify
+                 * doing a new motion search.
+                 */
+                if (best_label_rd < label_mv_thresh)
+                    break;
+
+                if(cpi->compressor_speed)
+                {
+                    /* Seed the search from the saved 8x8 results. */
+                    if (segmentation == BLOCK_8X16 || segmentation == BLOCK_16X8)
+                    {
+                        bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
+                        if (i==1 && segmentation == BLOCK_16X8)
+                          bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
+
+                        step_param = bsi->sv_istep[i];
+                    }
+
+                    /* use previous block's result as next block's MV
+                     * predictor.
+                     */
+                    if (segmentation == BLOCK_4X4 && i>0)
+                    {
+                        bsi->mvp.as_int = x->e_mbd.block[i-1].bmi.mv.as_int;
+                        /* First block of a row: use the block above. */
+                        if (i==4 || i==8 || i==12)
+                            bsi->mvp.as_int = x->e_mbd.block[i-4].bmi.mv.as_int;
+                        step_param = 2;
+                    }
+                }
+
+                further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+
+                {
+                    int sadpb = x->sadperbit4;
+                    int_mv mvp_full;
+
+                    /* The predictor is shifted right by 3 to form the
+                     * start point of the search.
+                     */
+                    mvp_full.as_mv.row = bsi->mvp.as_mv.row >>3;
+                    mvp_full.as_mv.col = bsi->mvp.as_mv.col >>3;
+
+                    /* find first label */
+                    n = vp8_mbsplit_offset[segmentation][i];
+
+                    c = &x->block[n];
+                    e = &x->e_mbd.block[n];
+
+                    {
+                        bestsme = cpi->diamond_search_sad(x, c, e, &mvp_full,
+                                                &mode_mv[NEW4X4], step_param,
+                                                sadpb, &num00, v_fn_ptr,
+                                                x->mvcost, bsi->ref_mv);
+
+                        /* n counts the search steps consumed so far; num00
+                         * is the number of following steps to skip.
+                         */
+                        n = num00;
+                        num00 = 0;
+
+                        while (n < further_steps)
+                        {
+                            n++;
+
+                            if (num00)
+                                num00--;
+                            else
+                            {
+                                thissme = cpi->diamond_search_sad(x, c, e,
+                                                    &mvp_full, &temp_mv,
+                                                    step_param + n, sadpb,
+                                                    &num00, v_fn_ptr,
+                                                    x->mvcost, bsi->ref_mv);
+
+                                if (thissme < bestsme)
+                                {
+                                    bestsme = thissme;
+                                    mode_mv[NEW4X4].as_int = temp_mv.as_int;
+                                }
+                            }
+                        }
+                    }
+
+                    sseshift = segmentation_to_sseshift[segmentation];
+
+                    /* Should we do a full search (best quality only) */
+                    if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000)
+                    {
+                        /* Check if mvp_full is within the range. */
+                        vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+
+                        thissme = cpi->full_search_sad(x, c, e, &mvp_full,
+                                                       sadpb, 16, v_fn_ptr,
+                                                       x->mvcost, bsi->ref_mv);
+
+                        if (thissme < bestsme)
+                        {
+                            bestsme = thissme;
+                            mode_mv[NEW4X4].as_int = e->bmi.mv.as_int;
+                        }
+                        else
+                        {
+                            /* The full search result is actually worse so
+                             * re-instate the previous best vector
+                             */
+                            e->bmi.mv.as_int = mode_mv[NEW4X4].as_int;
+                        }
+                    }
+                }
+
+                if (bestsme < INT_MAX)
+                {
+                    /* disto and sse are required outputs of the call but
+                     * are not used afterwards.
+                     */
+                    int disto;
+                    unsigned int sse;
+                    cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
+                        bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost,
+                        &disto, &sse);
+                }
+            } /* NEW4X4 */
+
+            rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
+                               bsi->ref_mv, x->mvcost);
+
+            /* Trap vectors that reach beyond the UMV borders */
+            if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+                ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
+            {
+                continue;
+            }
+
+            distortion = vp8_encode_inter_mb_segment(x, labels, i) / 4;
+
+            labelyrate = rdcost_mbsegment_y(x, labels, i, ta_s, tl_s);
+            rate += labelyrate;
+
+            this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+            if (this_rd < best_label_rd)
+            {
+                sbr = rate;
+                sbd = distortion;
+                bestlabelyrate = labelyrate;
+                mode_selected = this_mode;
+                best_label_rd = this_rd;
+
+                /* Remember the contexts produced by the best mode. */
+                memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
+                memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+
+            }
+        } /*for each 4x4 mode*/
+
+        /* Carry the best mode's contexts forward to the next label.
+         * NOTE(review): t_above_b/t_left_b are only written when some mode
+         * improved on best_label_rd; confirm that always happens.
+         */
+        memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
+        memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+
+        /* Re-apply the winning mode so the block and partition info hold
+         * its vectors rather than those of the last mode tried.
+         */
+        labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
+                    bsi->ref_mv, x->mvcost);
+
+        br += sbr;
+        bd += sbd;
+        segmentyrate += bestlabelyrate;
+        this_segment_rd += best_label_rd;
+
+        /* Already no better than the best segmentation: stop early. */
+        if (this_segment_rd >= bsi->segment_rd)
+            break;
+
+    } /* for each label */
+
+    if (this_segment_rd < bsi->segment_rd)
+    {
+        bsi->r = br;
+        bsi->d = bd;
+        bsi->segment_yrate = segmentyrate;
+        bsi->segment_rd = this_segment_rd;
+        bsi->segment_num = segmentation;
+
+        /* store everything needed to come back to this!! */
+        for (i = 0; i < 16; i++)
+        {
+            bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
+            bsi->modes[i] = x->partition_info->bmi[i].mode;
+            bsi->eobs[i] = x->e_mbd.eobs[i];
+        }
+    }
+}
+
+/*
+ * Convert a search range sr into an initial step parameter.
+ *
+ * sr is clamped to [1, MAX_FIRST_STEP]; *sp receives
+ * MAX_MVSEARCH_STEPS - 1 - floor(log2(sr)).
+ */
+static
+void vp8_cal_step_param(int sr, int *sp)
+{
+    int log2_range = 0;
+
+    if (sr > MAX_FIRST_STEP)
+        sr = MAX_FIRST_STEP;
+    else if (sr < 1)
+        sr = 1;
+
+    /* Count how many times sr can be halved before it reaches zero. */
+    for (sr >>= 1; sr != 0; sr >>= 1)
+        log2_range++;
+
+    *sp = MAX_MVSEARCH_STEPS - 1 - log2_range;
+}
+
+/*
+ * Search the split-macroblock segmentations and install the best one.
+ *
+ *   cpi               encoder instance
+ *   x                 encoder macroblock state
+ *   best_ref_mv       reference MV; also the initial MV predictor
+ *   best_rd           RD cost a segmentation must beat
+ *   mdcounts          counts used when costing the SPLITMV mode
+ *   returntotrate     out: rate of the best segmentation
+ *   returnyrate       out: luma coefficient rate of the best segmentation
+ *   returndistortion  out: distortion of the best segmentation
+ *   mvthresh          threshold controlling new-MV searches per label
+ *
+ * When cpi->compressor_speed is 0 all four segmentations are evaluated in
+ * a fixed order.  Otherwise 8x8 is tried first and, only if it beats
+ * best_rd, its vectors seed the 8x16 and 16x8 searches, with 4x4 tried
+ * conditionally.
+ *
+ * The winning vectors, eobs and partition info are written back to x.
+ * Returns the RD cost of the best segmentation, which is best_rd
+ * unchanged when no segmentation improved on it.
+ */
+static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
+                                           int_mv *best_ref_mv, int best_rd,
+                                           int *mdcounts, int *returntotrate,
+                                           int *returnyrate, int *returndistortion,
+                                           int mvthresh)
+{
+    int i;
+    BEST_SEG_INFO bsi;
+
+    memset(&bsi, 0, sizeof(bsi));
+
+    bsi.segment_rd = best_rd;
+    bsi.ref_mv = best_ref_mv;
+    bsi.mvp.as_int = best_ref_mv->as_int;
+    bsi.mvthresh = mvthresh;
+    bsi.mdcounts = mdcounts;
+
+    for(i = 0; i < 16; i++)
+    {
+        bsi.modes[i] = ZERO4X4;
+    }
+
+    if(cpi->compressor_speed == 0)
+    {
+        /* for now, we will keep the original segmentation order
+           when in best quality mode */
+        rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
+        rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
+        rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
+        rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
+    }
+    else
+    {
+        int sr;
+
+        rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
+
+        /* Only continue when 8x8 improved on the incoming best_rd. */
+        if (bsi.segment_rd < best_rd)
+        {
+            int col_min = ((best_ref_mv->as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+            int row_min = ((best_ref_mv->as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
+            int col_max = (best_ref_mv->as_mv.col>>3) + MAX_FULL_PEL_VAL;
+            int row_max = (best_ref_mv->as_mv.row>>3) + MAX_FULL_PEL_VAL;
+
+            /* Saved so the window can be restored after the searches. */
+            int tmp_col_min = x->mv_col_min;
+            int tmp_col_max = x->mv_col_max;
+            int tmp_row_min = x->mv_row_min;
+            int tmp_row_max = x->mv_row_max;
+
+            /* Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. */
+            if (x->mv_col_min < col_min )
+                x->mv_col_min = col_min;
+            if (x->mv_col_max > col_max )
+                x->mv_col_max = col_max;
+            if (x->mv_row_min < row_min )
+                x->mv_row_min = row_min;
+            if (x->mv_row_max > row_max )
+                x->mv_row_max = row_max;
+
+            /* Get 8x8 result */
+            bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
+            bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
+            bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
+            bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
+
+            /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range according to the closeness of 2 MV. */
+            /* block 8X16 */
+            {
+                sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row))>>3, (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col))>>3);
+                vp8_cal_step_param(sr, &bsi.sv_istep[0]);
+
+                sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row))>>3, (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col))>>3);
+                vp8_cal_step_param(sr, &bsi.sv_istep[1]);
+
+                rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
+            }
+
+            /* block 16X8 */
+            {
+                sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row))>>3, (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col))>>3);
+                vp8_cal_step_param(sr, &bsi.sv_istep[0]);
+
+                sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row))>>3, (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col))>>3);
+                vp8_cal_step_param(sr, &bsi.sv_istep[1]);
+
+                rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
+            }
+
+            /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
+            /* Not skip 4x4 if speed=0 (good quality) */
+            if (cpi->sf.no_skip_block4x4_search || bsi.segment_num == BLOCK_8X8)  /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
+            {
+                bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
+                rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
+            }
+
+            /* restore UMV window */
+            x->mv_col_min = tmp_col_min;
+            x->mv_col_max = tmp_col_max;
+            x->mv_row_min = tmp_row_min;
+            x->mv_row_max = tmp_row_max;
+        }
+    }
+
+    /* set it to the best */
+    for (i = 0; i < 16; i++)
+    {
+        BLOCKD *bd = &x->e_mbd.block[i];
+
+        bd->bmi.mv.as_int = bsi.mvs[i].as_int;
+        *bd->eob = bsi.eobs[i];
+    }
+
+    *returntotrate = bsi.r;
+    *returndistortion = bsi.d;
+    *returnyrate = bsi.segment_yrate;
+
+    /* save partitions */
+    x->e_mbd.mode_info_context->mbmi.partitioning = bsi.segment_num;
+    x->partition_info->count = vp8_mbsplit_count[bsi.segment_num];
+
+    /* Compact the per-block results into one entry per partition, taken
+     * from the first block of each partition.
+     */
+    for (i = 0; i < x->partition_info->count; i++)
+    {
+        int j;
+
+        j = vp8_mbsplit_offset[bsi.segment_num][i];
+
+        x->partition_info->bmi[i].mode = bsi.modes[j];
+        x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
+    }
+    /*
+     * used to set x->e_mbd.mode_info_context->mbmi.mv.as_int
+     */
+    x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
+
+    return bsi.segment_rd;
+}
+
+/* The improved MV prediction */
+void vp8_mv_pred
+(
+    VP8_COMP *cpi,
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    int_mv *mvp,
+    int refframe,
+    int *ref_frame_sign_bias,
+    int *sr,
+    int near_sadidx[]
+)
+{
+    const MODE_INFO *above = here - xd->mode_info_stride;
+    const MODE_INFO *left = here - 1;
+    const MODE_INFO *aboveleft = above - 1;
+    int_mv           near_mvs[8];
+    int              near_ref[8];
+    int_mv           mv;
+    int              vcnt=0;
+    int              find=0;
+    int              mb_offset;
+
+    int              mvx[8];
+    int              mvy[8];
+    int              i;
+
+    mv.as_int = 0;
+
+    if(here->mbmi.ref_frame != INTRA_FRAME)
+    {
+        near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int = near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int = near_mvs[6].as_int = near_mvs[7].as_int = 0;
+        near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] = near_ref[4] = near_ref[5] = near_ref[6] = near_ref[7] = 0;
+
+        /* read in 3 nearby block's MVs from current frame as prediction
+         * candidates.
+         */
+        if (above->mbmi.ref_frame != INTRA_FRAME)
+        {
+            near_mvs[vcnt].as_int = above->mbmi.mv.as_int;
+            mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+            near_ref[vcnt] =  above->mbmi.ref_frame;
+        }
+        vcnt++;
+        if (left->mbmi.ref_frame != INTRA_FRAME)
+        {
+            near_mvs[vcnt].as_int = left->mbmi.mv.as_int;
+            mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+            near_ref[vcnt] =  left->mbmi.ref_frame;
+        }
+        vcnt++;
+        if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+        {
+            near_mvs[vcnt].as_int = aboveleft->mbmi.mv.as_int;
+            mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+            near_ref[vcnt] =  aboveleft->mbmi.ref_frame;
+        }
+        vcnt++;
+
+        /* read in 5 nearby block's MVs from last frame. */
+        if(cpi->common.last_frame_type != KEY_FRAME)
+        {
+            mb_offset = (-xd->mb_to_top_edge/128 + 1) * (xd->mode_info_stride +1) + (-xd->mb_to_left_edge/128 +1) ;
+
+            /* current in last frame */
+            if (cpi->lf_ref_frame[mb_offset] != INTRA_FRAME)
+            {
+                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset].as_int;
+                mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset];
+            }
+            vcnt++;
+
+            /* above in last frame */
+            if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride-1] != INTRA_FRAME)
+            {
+                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - xd->mode_info_stride-1].as_int;
+                mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride-1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset - xd->mode_info_stride-1];
+            }
+            vcnt++;
+
+            /* left in last frame */
+            if (cpi->lf_ref_frame[mb_offset-1] != INTRA_FRAME)
+            {
+                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset -1].as_int;
+                mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset -1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset - 1];
+            }
+            vcnt++;
+
+            /* right in last frame */
+            if (cpi->lf_ref_frame[mb_offset +1] != INTRA_FRAME)
+            {
+                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset +1].as_int;
+                mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset +1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset +1];
+            }
+            vcnt++;
+
+            /* below in last frame */
+            if (cpi->lf_ref_frame[mb_offset + xd->mode_info_stride +1] != INTRA_FRAME)
+            {
+                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + xd->mode_info_stride +1].as_int;
+                mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + xd->mode_info_stride +1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset + xd->mode_info_stride +1];
+            }
+            vcnt++;
+        }
+
+        for(i=0; i< vcnt; i++)
+        {
+            if(near_ref[near_sadidx[i]] != INTRA_FRAME)
+            {
+                if(here->mbmi.ref_frame == near_ref[near_sadidx[i]])
+                {
+                    mv.as_int = near_mvs[near_sadidx[i]].as_int;
+                    find = 1;
+                    if (i < 3)
+                        *sr = 3;
+                    else
+                        *sr = 2;
+                    break;
+                }
+            }
+        }
+
+        if(!find)
+        {
+            for(i=0; i<vcnt; i++)
+            {
+                mvx[i] = near_mvs[i].as_mv.row;
+                mvy[i] = near_mvs[i].as_mv.col;
+            }
+
+            insertsortmv(mvx, vcnt);
+            insertsortmv(mvy, vcnt);
+            mv.as_mv.row = mvx[vcnt/2];
+            mv.as_mv.col = mvy[vcnt/2];
+
+            /* sr is set to 0 to allow calling function to decide the search
+             * range.
+             */
+            *sr = 0;
+        }
+    }
+
+    /* Set up return values */
+    mvp->as_int = mv.as_int;
+    vp8_clamp_mv2(mvp, xd);
+}
+
+void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[])
+{   /* Compute 16x16 SADs of the source MB against neighbouring MBs and pass them, with near_sadidx[], to insertsortsad(). */
+    /* near_sad indexes (cf = current frame, lf = last frame):
+     *   0-cf above, 1-cf left, 2-cf aboveleft,
+     *   3-lf current, 4-lf above, 5-lf left, 6-lf right, 7-lf below
+     */
+    int near_sad[8] = {0};
+    BLOCK *b = &x->block[0];
+    unsigned char *src_y_ptr = *(b->base_src);
+
+    /* Calculate SAD for the 3 nearby MBs of the current frame; entries ruled out by a zero edge distance get INT_MAX. */
+    if( xd->mb_to_top_edge==0 && xd->mb_to_left_edge ==0)
+    {
+        near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
+    }else if(xd->mb_to_top_edge==0)
+    {   /* only has left MB for sad calculation. */
+        near_sad[0] = near_sad[2] = INT_MAX;
+        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride);
+    }else if(xd->mb_to_left_edge ==0)
+    {   /* only has above MB for sad calculation. */
+        near_sad[1] = near_sad[2] = INT_MAX;
+        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride);
+    }else
+    {   /* above, left and aboveleft MBs are all used. */
+        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride);
+        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride);
+        near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride);
+    }
+
+    if(cpi->common.last_frame_type != KEY_FRAME)
+    {
+        /* Calculate SAD for the 5 nearby MBs of the last frame (skipped when the last frame was a key frame). */
+        unsigned char *pre_y_buffer = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
+        int pre_y_stride = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
+        /* Mark entries ruled out by a zero edge distance; near_sad[] starts zeroed, so the rest stay != INT_MAX. */
+        if(xd->mb_to_top_edge==0) near_sad[4] = INT_MAX;
+        if(xd->mb_to_left_edge ==0) near_sad[5] = INT_MAX;
+        if(xd->mb_to_right_edge ==0) near_sad[6] = INT_MAX;
+        if(xd->mb_to_bottom_edge==0) near_sad[7] = INT_MAX;
+
+        if(near_sad[4] != INT_MAX)
+            near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride);
+        if(near_sad[5] != INT_MAX)
+            near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride);
+        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride);
+        if(near_sad[6] != INT_MAX)
+            near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride);
+        if(near_sad[7] != INT_MAX)
+            near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride);
+    }
+    /* Pass all 8 entries when the last-frame SADs were computed, otherwise only the 3 current-frame entries. */
+    if(cpi->common.last_frame_type != KEY_FRAME)
+    {
+        insertsortsad(near_sad, near_sadidx, 8);
+    }else
+    {
+        insertsortsad(near_sad, near_sadidx, 3);
+    }
+}
+
+static void rd_update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv)
+{   /* Increment x->MVcount[0] (row) and [1] (col) at mv_max + ((mv - *best_ref_mv) >> 1) for each NEW4X4 / NEWMV vector. */
+    if (x->e_mbd.mode_info_context->mbmi.mode == SPLITMV)
+    {
+        int i;
+        /* SPLITMV: only partitions whose mode is NEW4X4 are counted. */
+        for (i = 0; i < x->partition_info->count; i++)
+        {
+            if (x->partition_info->bmi[i].mode == NEW4X4)
+            {
+                x->MVcount[0][mv_max+((x->partition_info->bmi[i].mv.as_mv.row
+                                          - best_ref_mv->as_mv.row) >> 1)]++;
+                x->MVcount[1][mv_max+((x->partition_info->bmi[i].mv.as_mv.col
+                                          - best_ref_mv->as_mv.col) >> 1)]++;
+            }
+        }
+    }
+    else if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
+    {   /* NEWMV: count the macroblock's single MV; any other mode changes nothing. */
+        x->MVcount[0][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row
+                                          - best_ref_mv->as_mv.row) >> 1)]++;
+        x->MVcount[1][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col
+                                          - best_ref_mv->as_mv.col) >> 1)]++;
+    }
+}
+
+static int evaluate_inter_mode_rd(int mdcounts[4],
+                                  RATE_DISTORTION* rd,
+                                  int* disable_skip,
+                                  VP8_COMP *cpi, MACROBLOCK *x)
+{   /* Build the 16x16 luma inter predictor, try the encode_breakout early skip, else add mode, Y and UV costs to *rd. */
+    MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+    BLOCK *b = &x->block[0];
+    MACROBLOCKD *xd = &x->e_mbd;
+    int distortion;
+    vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16);
+    /* Active map enabled and this MB's active flag is 0: set skip, then still run the cost accumulation below. */
+    if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
+        x->skip = 1;
+    }
+    else if (x->encode_breakout)
+    {
+        unsigned int sse;
+        unsigned int var;
+        unsigned int threshold = (xd->block[0].dequant[1]
+                    * xd->block[0].dequant[1] >>4);
+        /* Use the larger of the dequant-derived threshold and encode_breakout. */
+        if(threshold < x->encode_breakout)
+            threshold = x->encode_breakout;
+
+        var = vpx_variance16x16
+                (*(b->base_src), b->src_stride,
+                x->e_mbd.predictor, 16, &sse);
+
+        if (sse < threshold)
+        {
+             unsigned int q2dc = xd->block[24].dequant[0];
+            /* If there is no codeable 2nd order dc
+               or a very small uniform pixel change */
+            if ((sse - var < q2dc * q2dc >>4) ||
+                (sse /2 > var && sse-var < 64))
+            {
+                /* Check u and v to make sure skip is ok */
+                unsigned int sse2 = VP8_UVSSE(x);
+                if (sse2 * 2 < threshold)
+                {
+                    x->skip = 1;
+                    rd->distortion2 = sse + sse2;
+                    rd->rate2 = 500; /* NOTE(review): hard-coded rate for this skip path; rationale not visible here */
+
+                    /* for best_yrd calculation */
+                    rd->rate_uv = 0;
+                    rd->distortion_uv = sse2;
+
+                    *disable_skip = 1; /* early-skip path: the RD cost is returned directly from here */
+                    return RDCOST(x->rdmult, x->rddiv, rd->rate2,
+                                  rd->distortion2);
+                }
+            }
+        }
+    }
+
+
+    /* Add in the Mv/mode cost */
+    rd->rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+
+    /* Y cost and distortion */
+    macro_block_yrd(x, &rd->rate_y, &distortion);
+    rd->rate2 += rd->rate_y;
+    rd->distortion2 += distortion;
+
+    /* UV cost and distortion */
+    rd_inter16x16_uv(cpi, x, &rd->rate_uv, &rd->distortion_uv,
+                     cpi->common.full_pixel);
+    rd->rate2 += rd->rate_uv;
+    rd->distortion2 += rd->distortion_uv;
+    return INT_MAX; /* no early skip: only *rd was updated, no RD cost is computed here */
+}
+
+static int calculate_final_rd_costs(int this_rd,
+                                    RATE_DISTORTION* rd,
+                                    int* other_cost,
+                                    int disable_skip,
+                                    int uv_intra_tteob,
+                                    int intra_rd_penalty,
+                                    VP8_COMP *cpi, MACROBLOCK *x)
+{   /* Add skip-flag and reference-frame costs to rd->rate2; unless disable_skip is set, recompute and return this_rd. */
+    MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+
+    /* Where skip is allowable add in the default per mb cost for the no
+     * skip case. Where we then decide to skip we have to delete this and
+     * replace it with the cost of signalling a skip
+     */
+    if (cpi->common.mb_no_coeff_skip)
+    {
+        *other_cost += vp8_cost_bit(cpi->prob_skip_false, 0);
+        rd->rate2 += *other_cost;
+    }
+
+    /* Estimate the reference frame signaling cost and add it
+     * to the rolling cost variable.
+     */
+    rd->rate2 +=
+        x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+
+    if (!disable_skip)
+    {
+        /* Test for the condition where skip block will be activated
+         * because there are no non zero coefficients and make any
+         * necessary adjustment for rate
+         */
+        if (cpi->common.mb_no_coeff_skip)
+        {
+            int i;
+            int tteob;
+            int has_y2_block = (this_mode!=SPLITMV && this_mode!=B_PRED);
+            /* has_y2_block doubles as the eob threshold below: with a Y2 block a Y block counts only if its eob > 1. */
+            tteob = 0;
+            if(has_y2_block)
+                tteob += x->e_mbd.eobs[24];
+
+            for (i = 0; i < 16; i++)
+                tteob += (x->e_mbd.eobs[i] > has_y2_block);
+
+            if (x->e_mbd.mode_info_context->mbmi.ref_frame)
+            {   /* ref_frame != 0: use the UV eobs currently held in x->e_mbd */
+                for (i = 16; i < 24; i++)
+                    tteob += x->e_mbd.eobs[i];
+            }
+            else /* ref_frame == 0: use the caller-supplied uv_intra_tteob instead */
+                tteob += uv_intra_tteob;
+
+            if (tteob == 0)
+            {   /* nothing counted: remove the Y and UV rate that was added to rate2 */
+                rd->rate2 -= (rd->rate_y + rd->rate_uv);
+                /* for best_yrd calculation */
+                rd->rate_uv = 0;
+
+                /* Back out no skip flag costing and add in skip flag costing */
+                if (cpi->prob_skip_false)
+                {
+                    int prob_skip_cost;
+
+                    prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 1);
+                    prob_skip_cost -= vp8_cost_bit(cpi->prob_skip_false, 0);
+                    rd->rate2 += prob_skip_cost;
+                    *other_cost += prob_skip_cost;
+                }
+            }
+        }
+        /* Calculate the final RD estimate for this mode */
+        this_rd = RDCOST(x->rdmult, x->rddiv, rd->rate2, rd->distortion2);
+        if (this_rd < INT_MAX && x->e_mbd.mode_info_context->mbmi.ref_frame
+                                 == INTRA_FRAME)
+            this_rd += intra_rd_penalty;
+    }
+    return this_rd; /* equals the this_rd argument when disable_skip is set */
+}
+
+static void update_best_mode(BEST_MODE* best_mode, int this_rd,
+                             RATE_DISTORTION* rd, int other_cost, MACROBLOCK *x)
+{   /* Store the mode currently set in x into *best_mode: its RD scores, mbmi, partition info and block modes. */
+    MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+    /* other_cost is a by-value copy; add the reference-frame cost so both are excluded from the Y estimate below. */
+    other_cost +=
+    x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+
+    /* Calculate the final y RD estimate for this mode */
+    best_mode->yrd = RDCOST(x->rdmult, x->rddiv, (rd->rate2-rd->rate_uv-other_cost),
+                      (rd->distortion2-rd->distortion_uv));
+
+    best_mode->rd = this_rd;
+    memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
+    memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO));
+    /* The 16 per-block bmi entries are saved only for B_PRED and SPLITMV. */
+    if ((this_mode == B_PRED) || (this_mode == SPLITMV))
+    {
+        int i;
+        for (i = 0; i < 16; i++)
+        {
+            best_mode->bmodes[i] = x->e_mbd.block[i].bmi;
+        }
+    }
+}
+
+void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+                            int recon_uvoffset, int *returnrate,
+                            int *returndistortion, int *returnintra,
+                            int mb_row, int mb_col)
+{
+    BLOCK *b = &x->block[0];
+    BLOCKD *d = &x->e_mbd.block[0];
+    MACROBLOCKD *xd = &x->e_mbd;
+    int_mv best_ref_mv_sb[2];
+    int_mv mode_mv_sb[2][MB_MODE_COUNT];
+    int_mv best_ref_mv;
+    int_mv *mode_mv;
+    MB_PREDICTION_MODE this_mode;
+    int num00;
+    int best_mode_index = 0;
+    BEST_MODE best_mode;
+
+    int i;
+    int mode_index;
+    int mdcounts[4];
+    int rate;
+    RATE_DISTORTION rd;
+    int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
+    int uv_intra_tteob = 0;
+    int uv_intra_done = 0;
+
+    MB_PREDICTION_MODE uv_intra_mode = 0;
+    int_mv mvp;
+    int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    int saddone=0;
+    /* search range got from mv_pred(). It uses step_param levels. (0-7) */
+    int sr=0;
+
+    unsigned char *plane[4][3];
+    int ref_frame_map[4];
+    int sign_bias = 0;
+
+    int intra_rd_penalty =  10* vp8_dc_quant(cpi->common.base_qindex,
+                                             cpi->common.y1dc_delta_q);
+
+#if CONFIG_TEMPORAL_DENOISING
+    unsigned int zero_mv_sse = UINT_MAX, best_sse = UINT_MAX,
+            best_rd_sse = UINT_MAX;
+#endif
+
+    mode_mv = mode_mv_sb[sign_bias];
+    best_ref_mv.as_int = 0;
+    best_mode.rd = INT_MAX;
+    best_mode.yrd = INT_MAX;
+    best_mode.intra_rd = INT_MAX;
+    memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
+    memset(&best_mode.mbmode, 0, sizeof(best_mode.mbmode));
+    memset(&best_mode.bmodes, 0, sizeof(best_mode.bmodes));
+
+    /* Setup search priorities */
+    get_reference_search_order(cpi, ref_frame_map);
+
+    /* Check to see if there is at least 1 valid reference frame that we need
+     * to calculate near_mvs.
+     */
+    if (ref_frame_map[1] > 0)
+    {
+        sign_bias = vp8_find_near_mvs_bias(&x->e_mbd,
+                                           x->e_mbd.mode_info_context,
+                                           mode_mv_sb,
+                                           best_ref_mv_sb,
+                                           mdcounts,
+                                           ref_frame_map[1],
+                                           cpi->common.ref_frame_sign_bias);
+
+        mode_mv = mode_mv_sb[sign_bias];
+        best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
+    }
+
+    get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
+
+    *returnintra = INT_MAX;
+    /* Count of the number of MBs tested so far this frame */
+    x->mbs_tested_so_far++;
+
+    x->skip = 0;
+
+    for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
+    {
+        int this_rd = INT_MAX;
+        int disable_skip = 0;
+        int other_cost = 0;
+        int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
+
+        /* Test best rd so far against threshold for trying this mode. */
+        if (best_mode.rd <= x->rd_threshes[mode_index])
+            continue;
+
+        if (this_ref_frame < 0)
+            continue;
+
+        /* These variables hold are rolling total cost and distortion for
+         * this mode
+         */
+        rd.rate2 = 0;
+        rd.distortion2 = 0;
+
+        this_mode = vp8_mode_order[mode_index];
+
+        x->e_mbd.mode_info_context->mbmi.mode = this_mode;
+        x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+
+        /* Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+         * unless ARNR filtering is enabled in which case we want
+         * an unfiltered alternative
+         */
+        if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+        {
+            if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
+                continue;
+        }
+
+        /* everything but intra */
+        if (x->e_mbd.mode_info_context->mbmi.ref_frame)
+        {
+            x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+            x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+            x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+            if (sign_bias != cpi->common.ref_frame_sign_bias[this_ref_frame])
+            {
+                sign_bias = cpi->common.ref_frame_sign_bias[this_ref_frame];
+                mode_mv = mode_mv_sb[sign_bias];
+                best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
+            }
+        }
+
+        /* Check to see if the testing frequency for this mode is at its
+         * max If so then prevent it from being tested and increase the
+         * threshold for its testing
+         */
+        if (x->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
+        {
+            if (x->mbs_tested_so_far  <= cpi->mode_check_freq[mode_index] * x->mode_test_hit_counts[mode_index])
+            {
+                /* Increase the threshold for coding this mode to make it
+                 * less likely to be chosen
+                 */
+                x->rd_thresh_mult[mode_index] += 4;
+
+                if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                    x->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+                x->rd_threshes[mode_index] =
+                    (cpi->rd_baseline_thresh[mode_index] >> 7) *
+                    x->rd_thresh_mult[mode_index];
+
+                continue;
+            }
+        }
+
+        /* We have now reached the point where we are going to test the
+         * current mode so increment the counter for the number of times
+         * it has been tested
+         */
+        x->mode_test_hit_counts[mode_index] ++;
+
+        /* Experimental code. Special case for gf and arf zeromv modes.
+         * Increase zbin size to supress noise
+         */
+        if (x->zbin_mode_boost_enabled)
+        {
+            if ( this_ref_frame == INTRA_FRAME )
+                x->zbin_mode_boost = 0;
+            else
+            {
+                if (vp8_mode_order[mode_index] == ZEROMV)
+                {
+                    if (this_ref_frame != LAST_FRAME)
+                        x->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+                    else
+                        x->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+                }
+                else if (vp8_mode_order[mode_index] == SPLITMV)
+                    x->zbin_mode_boost = 0;
+                else
+                    x->zbin_mode_boost = MV_ZBIN_BOOST;
+            }
+
+            vp8_update_zbin_extra(cpi, x);
+        }
+
+        if(!uv_intra_done && this_ref_frame == INTRA_FRAME)
+        {
+            rd_pick_intra_mbuv_mode(x, &uv_intra_rate,
+                                    &uv_intra_rate_tokenonly,
+                                    &uv_intra_distortion);
+            uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode;
+
+            /*
+             * Total of the eobs is used later to further adjust rate2. Since uv
+             * block's intra eobs will be overwritten when we check inter modes,
+             * we need to save uv_intra_tteob here.
+             */
+            for (i = 16; i < 24; i++)
+                uv_intra_tteob += x->e_mbd.eobs[i];
+
+            uv_intra_done = 1;
+        }
+
+        switch (this_mode)
+        {
+        case B_PRED:
+        {
+            int tmp_rd;
+
+            /* Note the rate value returned here includes the cost of
+             * coding the BPRED mode: x->mbmode_cost[x->e_mbd.frame_type][BPRED]
+             */
+            int distortion;
+            tmp_rd = rd_pick_intra4x4mby_modes(x, &rate, &rd.rate_y, &distortion, best_mode.yrd);
+            rd.rate2 += rate;
+            rd.distortion2 += distortion;
+
+            if(tmp_rd < best_mode.yrd)
+            {
+                rd.rate2 += uv_intra_rate;
+                rd.rate_uv = uv_intra_rate_tokenonly;
+                rd.distortion2 += uv_intra_distortion;
+                rd.distortion_uv = uv_intra_distortion;
+            }
+            else
+            {
+                this_rd = INT_MAX;
+                disable_skip = 1;
+            }
+        }
+        break;
+
+        case SPLITMV:
+        {
+            int tmp_rd;
+            int this_rd_thresh;
+            int distortion;
+
+            this_rd_thresh = (vp8_ref_frame_order[mode_index] == 1) ?
+                x->rd_threshes[THR_NEW1] : x->rd_threshes[THR_NEW3];
+            this_rd_thresh = (vp8_ref_frame_order[mode_index] == 2) ?
+                x->rd_threshes[THR_NEW2] : this_rd_thresh;
+
+            tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
+                                                     best_mode.yrd, mdcounts,
+                                                     &rate, &rd.rate_y, &distortion, this_rd_thresh) ;
+
+            rd.rate2 += rate;
+            rd.distortion2 += distortion;
+
+            /* If even the 'Y' rd value of split is higher than best so far
+             * then dont bother looking at UV
+             */
+            if (tmp_rd < best_mode.yrd)
+            {
+                /* Now work out UV cost and add it in */
+                rd_inter4x4_uv(cpi, x, &rd.rate_uv, &rd.distortion_uv, cpi->common.full_pixel);
+                rd.rate2 += rd.rate_uv;
+                rd.distortion2 += rd.distortion_uv;
+            }
+            else
+            {
+                this_rd = INT_MAX;
+                disable_skip = 1;
+            }
+        }
+        break;
+        case DC_PRED:
+        case V_PRED:
+        case H_PRED:
+        case TM_PRED:
+        {
+            int distortion;
+            x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+            vp8_build_intra_predictors_mby_s(xd,
+                                             xd->dst.y_buffer - xd->dst.y_stride,
+                                             xd->dst.y_buffer - 1,
+                                             xd->dst.y_stride,
+                                             xd->predictor,
+                                             16);
+            macro_block_yrd(x, &rd.rate_y, &distortion) ;
+            rd.rate2 += rd.rate_y;
+            rd.distortion2 += distortion;
+            rd.rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
+            rd.rate2 += uv_intra_rate;
+            rd.rate_uv = uv_intra_rate_tokenonly;
+            rd.distortion2 += uv_intra_distortion;
+            rd.distortion_uv = uv_intra_distortion;
+        }
+        break;
+
+        case NEWMV:
+        {
+            int thissme;
+            int bestsme = INT_MAX;
+            int step_param = cpi->sf.first_step;
+            int further_steps;
+            int n;
+            int do_refine=1;   /* If last step (1-away) of n-step search doesn't pick the center point as the best match,
+                                  we will do a final 1-away diamond refining search  */
+
+            int sadpb = x->sadperbit16;
+            int_mv mvp_full;
+
+            int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+            int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
+            int col_max = (best_ref_mv.as_mv.col>>3) + MAX_FULL_PEL_VAL;
+            int row_max = (best_ref_mv.as_mv.row>>3) + MAX_FULL_PEL_VAL;
+
+            int tmp_col_min = x->mv_col_min;
+            int tmp_col_max = x->mv_col_max;
+            int tmp_row_min = x->mv_row_min;
+            int tmp_row_max = x->mv_row_max;
+
+            if(!saddone)
+            {
+                vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
+                saddone = 1;
+            }
+
+            vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
+                        x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
+
+            mvp_full.as_mv.col = mvp.as_mv.col>>3;
+            mvp_full.as_mv.row = mvp.as_mv.row>>3;
+
+            /* Get intersection of UMV window and valid MV window to
+             * reduce # of checks in diamond search.
+             */
+            if (x->mv_col_min < col_min )
+                x->mv_col_min = col_min;
+            if (x->mv_col_max > col_max )
+                x->mv_col_max = col_max;
+            if (x->mv_row_min < row_min )
+                x->mv_row_min = row_min;
+            if (x->mv_row_max > row_max )
+                x->mv_row_max = row_max;
+
+            /* adjust search range according to sr from mv prediction */
+            if(sr > step_param)
+                step_param = sr;
+
+            /* Initial step/diamond search */
+            {
+                bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.mv,
+                                        step_param, sadpb, &num00,
+                                        &cpi->fn_ptr[BLOCK_16X16],
+                                        x->mvcost, &best_ref_mv);
+                mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+
+                /* Further step/diamond searches as necessary */
+                further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+                n = num00;
+                num00 = 0;
+
+                /* If there won't be more n-step search, check to see if refining search is needed. */
+                if (n > further_steps)
+                    do_refine = 0;
+
+                while (n < further_steps)
+                {
+                    n++;
+
+                    if (num00)
+                        num00--;
+                    else
+                    {
+                        thissme = cpi->diamond_search_sad(x, b, d, &mvp_full,
+                                    &d->bmi.mv, step_param + n, sadpb, &num00,
+                                    &cpi->fn_ptr[BLOCK_16X16], x->mvcost,
+                                    &best_ref_mv);
+
+                        /* check to see if refining search is needed. */
+                        if (num00 > (further_steps-n))
+                            do_refine = 0;
+
+                        if (thissme < bestsme)
+                        {
+                            bestsme = thissme;
+                            mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+                        }
+                        else
+                        {
+                            d->bmi.mv.as_int = mode_mv[NEWMV].as_int;
+                        }
+                    }
+                }
+            }
+
+            /* final 1-away diamond refining search */
+            if (do_refine == 1)
+            {
+                int search_range;
+
+                search_range = 8;
+
+                thissme = cpi->refining_search_sad(x, b, d, &d->bmi.mv, sadpb,
+                                       search_range, &cpi->fn_ptr[BLOCK_16X16],
+                                       x->mvcost, &best_ref_mv);
+
+                if (thissme < bestsme)
+                {
+                    bestsme = thissme;
+                    mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+                }
+                else
+                {
+                    d->bmi.mv.as_int = mode_mv[NEWMV].as_int;
+                }
+            }
+
+            x->mv_col_min = tmp_col_min;
+            x->mv_col_max = tmp_col_max;
+            x->mv_row_min = tmp_row_min;
+            x->mv_row_max = tmp_row_max;
+
+            if (bestsme < INT_MAX)
+            {
+                int dis; /* TODO: use dis in distortion calculation later. */
+                unsigned int sse;
+                cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv,
+                                             x->errorperbit,
+                                             &cpi->fn_ptr[BLOCK_16X16],
+                                             x->mvcost, &dis, &sse);
+            }
+
+            mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+
+            /* Add the new motion vector cost to our rolling cost variable */
+            rd.rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
+        }
+
+        case NEARESTMV:
+        case NEARMV:
+            /* Clip "next_nearest" so that it does not extend to far out
+             * of image
+             */
+            vp8_clamp_mv2(&mode_mv[this_mode], xd);
+
+            /* Do not bother proceeding if the vector (from newmv, nearest
+             * or near) is 0,0 as this should then be coded using the zeromv
+             * mode.
+             */
+            if (((this_mode == NEARMV) || (this_mode == NEARESTMV)) && (mode_mv[this_mode].as_int == 0))
+                continue;
+
+        case ZEROMV:
+
+            /* Trap vectors that reach beyond the UMV borders
+             * Note that ALL New MV, Nearest MV Near MV and Zero MV code
+             * drops through to this point because of the lack of break
+             * statements in the previous two cases.
+             */
+            if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+                ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
+                continue;
+
+            vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
+            this_rd = evaluate_inter_mode_rd(mdcounts, &rd,
+                                             &disable_skip, cpi, x);
+            break;
+
+        default:
+            break;
+        }
+
+        this_rd = calculate_final_rd_costs(this_rd, &rd, &other_cost,
+                                           disable_skip, uv_intra_tteob,
+                                           intra_rd_penalty, cpi, x);
+
+        /* Keep record of best intra distortion */
+        if ((x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
+            (this_rd < best_mode.intra_rd) )
+        {
+          best_mode.intra_rd = this_rd;
+            *returnintra = rd.distortion2 ;
+        }
+#if CONFIG_TEMPORAL_DENOISING
+        if (cpi->oxcf.noise_sensitivity)
+        {
+            unsigned int sse;
+            vp8_get_inter_mbpred_error(x,&cpi->fn_ptr[BLOCK_16X16],&sse,
+                                   mode_mv[this_mode]);
+
+            if (sse < best_rd_sse)
+                best_rd_sse = sse;
+
+            /* Store for later use by denoiser. */
+            if (this_mode == ZEROMV && sse < zero_mv_sse )
+            {
+                zero_mv_sse = sse;
+                x->best_zeromv_reference_frame =
+                        x->e_mbd.mode_info_context->mbmi.ref_frame;
+            }
+
+            /* Store the best NEWMV in x for later use in the denoiser. */
+            if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
+                    sse < best_sse)
+            {
+                best_sse = sse;
+                vp8_get_inter_mbpred_error(x,&cpi->fn_ptr[BLOCK_16X16],&best_sse,
+                                       mode_mv[this_mode]);
+                x->best_sse_inter_mode = NEWMV;
+                x->best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
+                x->need_to_clamp_best_mvs =
+                    x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
+                x->best_reference_frame =
+                    x->e_mbd.mode_info_context->mbmi.ref_frame;
+            }
+        }
+#endif
+
+        /* Did this mode help.. i.i is it the new best mode */
+        if (this_rd < best_mode.rd || x->skip)
+        {
+            /* Note index of best mode so far */
+            best_mode_index = mode_index;
+            *returnrate = rd.rate2;
+            *returndistortion = rd.distortion2;
+            if (this_mode <= B_PRED)
+            {
+                x->e_mbd.mode_info_context->mbmi.uv_mode = uv_intra_mode;
+                /* required for left and above block mv */
+                x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+            }
+            update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
+
+
+            /* Testing this mode gave rise to an improvement in best error
+             * score. Lower threshold a bit for next time
+             */
+            x->rd_thresh_mult[mode_index] =
+                (x->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+                    x->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+        }
+
+        /* If the mode did not help improve the best error case then raise
+         * the threshold for testing that mode next time around.
+         */
+        else
+        {
+            x->rd_thresh_mult[mode_index] += 4;
+
+            if (x->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                x->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+        }
+        x->rd_threshes[mode_index] =
+            (cpi->rd_baseline_thresh[mode_index] >> 7) *
+                x->rd_thresh_mult[mode_index];
+
+        if (x->skip)
+            break;
+
+    }
+
+    /* Reduce the activation RD thresholds for the best choice mode */
+    if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
+    {
+        int best_adjustment = (x->rd_thresh_mult[best_mode_index] >> 2);
+
+        x->rd_thresh_mult[best_mode_index] =
+            (x->rd_thresh_mult[best_mode_index] >=
+                (MIN_THRESHMULT + best_adjustment)) ?
+                    x->rd_thresh_mult[best_mode_index] - best_adjustment :
+                    MIN_THRESHMULT;
+        x->rd_threshes[best_mode_index] =
+            (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
+                x->rd_thresh_mult[best_mode_index];
+    }
+
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity)
+    {
+        int block_index = mb_row * cpi->common.mb_cols + mb_col;
+        if (x->best_sse_inter_mode == DC_PRED)
+        {
+            /* No best MV found. */
+            x->best_sse_inter_mode = best_mode.mbmode.mode;
+            x->best_sse_mv = best_mode.mbmode.mv;
+            x->need_to_clamp_best_mvs = best_mode.mbmode.need_to_clamp_mvs;
+            x->best_reference_frame = best_mode.mbmode.ref_frame;
+            best_sse = best_rd_sse;
+        }
+        vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
+                                recon_yoffset, recon_uvoffset,
+                                &cpi->common.lf_info, mb_row, mb_col,
+                                block_index);
+
+        /* Reevaluate ZEROMV after denoising. */
+        if (best_mode.mbmode.ref_frame == INTRA_FRAME &&
+            x->best_zeromv_reference_frame != INTRA_FRAME)
+        {
+            int this_rd = INT_MAX;
+            int disable_skip = 0;
+            int other_cost = 0;
+            int this_ref_frame = x->best_zeromv_reference_frame;
+            rd.rate2 = x->ref_frame_cost[this_ref_frame] +
+                    vp8_cost_mv_ref(ZEROMV, mdcounts);
+            rd.distortion2 = 0;
+
+            /* set up the proper prediction buffers for the frame */
+            x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+            x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+            x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+            x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+            x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+            x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+            x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+
+            this_rd = evaluate_inter_mode_rd(mdcounts, &rd, &disable_skip, cpi, x);
+            this_rd = calculate_final_rd_costs(this_rd, &rd, &other_cost,
+                                               disable_skip, uv_intra_tteob,
+                                               intra_rd_penalty, cpi, x);
+            if (this_rd < best_mode.rd || x->skip)
+            {
+                *returnrate = rd.rate2;
+                *returndistortion = rd.distortion2;
+                update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
+            }
+        }
+
+    }
+#endif
+
+    if (cpi->is_src_frame_alt_ref &&
+        (best_mode.mbmode.mode != ZEROMV || best_mode.mbmode.ref_frame != ALTREF_FRAME))
+    {
+        x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+        x->e_mbd.mode_info_context->mbmi.ref_frame = ALTREF_FRAME;
+        x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+        x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+        x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
+                                        (cpi->common.mb_no_coeff_skip);
+        x->e_mbd.mode_info_context->mbmi.partitioning = 0;
+        return;
+    }
+
+
+    /* macroblock modes */
+    memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, sizeof(MB_MODE_INFO));
+
+    if (best_mode.mbmode.mode == B_PRED)
+    {
+        for (i = 0; i < 16; i++)
+            xd->mode_info_context->bmi[i].as_mode = best_mode.bmodes[i].as_mode;
+    }
+
+    if (best_mode.mbmode.mode == SPLITMV)
+    {
+        for (i = 0; i < 16; i++)
+            xd->mode_info_context->bmi[i].mv.as_int = best_mode.bmodes[i].mv.as_int;
+
+        memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO));
+
+        x->e_mbd.mode_info_context->mbmi.mv.as_int =
+                                      x->partition_info->bmi[15].mv.as_int;
+    }
+
+    if (sign_bias
+        != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
+        best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;
+
+    rd_update_mvcount(x, &best_ref_mv);
+}
+
+void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_)
+{
+    int error4x4, error16x16;
+    int rate4x4, rate16x16 = 0, rateuv;
+    int dist4x4, dist16x16, distuv;
+    int rate;
+    int rate4x4_tokenonly = 0;
+    int rate16x16_tokenonly = 0;
+    int rateuv_tokenonly = 0;
+    /* Intra-only mode decision: the total rate of the chosen modes is returned through *rate_. */
+    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+    rd_pick_intra_mbuv_mode(x, &rateuv, &rateuv_tokenonly, &distuv);
+    rate = rateuv;    /* the UV rate seeds the running total */
+
+    error16x16 = rd_pick_intra16x16mby_mode(x, &rate16x16, &rate16x16_tokenonly,
+                                            &dist16x16);
+    /* error16x16 is handed to the 4x4 search - presumably as the score to beat; confirm in helper */
+    error4x4 = rd_pick_intra4x4mby_modes(x, &rate4x4, &rate4x4_tokenonly,
+                                         &dist4x4, error16x16);
+
+    if (error4x4 < error16x16)
+    {
+        x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
+        rate += rate4x4;
+    }
+    else
+    {
+        rate += rate16x16;    /* mbmi.mode is not written here; it keeps the value left by the helpers above */
+    }
+
+    *rate_ = rate;
+}
diff --git a/libs/libvpx/vp8/encoder/rdopt.h b/libs/libvpx/vp8/encoder/rdopt.h
new file mode 100644
index 0000000000..1cb1a07266
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/rdopt.h
@@ -0,0 +1,149 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_RDOPT_H_
+#define VP8_ENCODER_RDOPT_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )  /* rate R scaled by RM/256 (rounded via the +128), plus distortion D weighted by DM */
+
+static INLINE void insertsortmv(int arr[], int len)
+{
+    int i, j, k;
+    /* In-place insertion sort: leaves arr[0..len-1] in ascending order. */
+    for ( i = 1 ; i <= len-1 ; i++ )
+    {
+        for ( j = 0 ; j < i ; j++ )    /* scan the already-sorted prefix arr[0..i-1] */
+        {
+            if ( arr[j] > arr[i] )     /* first strictly larger element marks the insertion slot */
+            {
+                int temp;
+
+                temp = arr[i];
+                /* shift arr[j..i-1] up by one to open slot j */
+                for ( k = i; k >j; k--)
+                    arr[k] = arr[k - 1] ;
+
+                arr[j] = temp ;
+            }
+        }
+    }
+}
+
+static INLINE void insertsortsad(int arr[],int idx[], int len)
+{
+    int i, j, k;
+    /* In-place insertion sort of arr[0..len-1] (ascending); idx[] is permuted identically. */
+    for ( i = 1 ; i <= len-1 ; i++ )
+    {
+        for ( j = 0 ; j < i ; j++ )    /* scan the already-sorted prefix arr[0..i-1] */
+        {
+            if ( arr[j] > arr[i] )     /* first strictly larger element marks the insertion slot */
+            {
+                int temp, tempi;
+
+                temp = arr[i];
+                tempi = idx[i];
+                /* shift both arrays up by one over [j..i-1] so each idx entry follows its arr entry */
+                for ( k = i; k >j; k--)
+                {
+                    arr[k] = arr[k - 1] ;
+                    idx[k] = idx[k - 1];
+                }
+
+                arr[j] = temp ;
+                idx[j] = tempi;
+            }
+        }
+    }
+}
+
+extern void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue);
+extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x,
+                                   int recon_yoffset, int recon_uvoffset,
+                                   int *returnrate, int *returndistortion,
+                                   int *returnintra, int mb_row, int mb_col);
+extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate);
+
+/* Fill plane[0..2] with fb's Y, U and V buffer pointers advanced by the given offsets. */
+static INLINE void get_plane_pointers(const YV12_BUFFER_CONFIG *fb,
+                                      unsigned char *plane[3],
+                                      unsigned int recon_yoffset,
+                                      unsigned int recon_uvoffset)
+{
+    plane[0] = fb->y_buffer + recon_yoffset;     /* Y uses its own offset */
+    plane[1] = fb->u_buffer + recon_uvoffset;    /* U and V share one offset */
+    plane[2] = fb->v_buffer + recon_uvoffset;
+}
+
+/* Fill plane[ref][0..2] for each reference enabled in cpi->ref_frame_flags; other rows are left untouched. */
+static INLINE void get_predictor_pointers(const VP8_COMP *cpi,
+                                          unsigned char *plane[4][3],
+                                          unsigned int recon_yoffset,
+                                          unsigned int recon_uvoffset)
+{
+    if (cpi->ref_frame_flags & VP8_LAST_FRAME)    /* last frame: buffer index lst_fb_idx */
+        get_plane_pointers(&cpi->common.yv12_fb[cpi->common.lst_fb_idx],
+                           plane[LAST_FRAME], recon_yoffset, recon_uvoffset);
+
+    if (cpi->ref_frame_flags & VP8_GOLD_FRAME)    /* golden frame: buffer index gld_fb_idx */
+        get_plane_pointers(&cpi->common.yv12_fb[cpi->common.gld_fb_idx],
+                           plane[GOLDEN_FRAME], recon_yoffset, recon_uvoffset);
+
+    if (cpi->ref_frame_flags & VP8_ALTR_FRAME)    /* alt-ref frame: buffer index alt_fb_idx */
+        get_plane_pointers(&cpi->common.yv12_fb[cpi->common.alt_fb_idx],
+                           plane[ALTREF_FRAME], recon_yoffset, recon_uvoffset);
+}
+
+/* Build the reference list: INTRA first, then each enabled LAST/GOLDEN/ALTREF in that order, padded with -1. */
+static INLINE void get_reference_search_order(const VP8_COMP *cpi,
+                                              int ref_frame_map[4])
+{
+    int i=0;    /* next free slot in ref_frame_map */
+
+    ref_frame_map[i++] = INTRA_FRAME;    /* always present, always slot 0 */
+    if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+        ref_frame_map[i++] = LAST_FRAME;
+    if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+        ref_frame_map[i++] = GOLDEN_FRAME;
+    if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+        ref_frame_map[i++] = ALTREF_FRAME;
+    for(; i<4; i++)                      /* mark the unused trailing slots */
+        ref_frame_map[i] = -1;
+}
+
+
+extern void vp8_mv_pred
+(
+    VP8_COMP *cpi,
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    int_mv *mvp,
+    int refframe,
+    int *ref_frame_sign_bias,
+    int *sr,
+    int near_sadidx[]
+);
+void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]);
+int VP8_UVSSE(MACROBLOCK *x);
+int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
+void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_RDOPT_H_
diff --git a/libs/libvpx/vp8/encoder/segmentation.c b/libs/libvpx/vp8/encoder/segmentation.c
new file mode 100644
index 0000000000..fdd22fceb6
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/segmentation.c
@@ -0,0 +1,66 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "segmentation.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x)
+{
+    int mb_row, mb_col;
+    /* Refresh the per-macroblock flags in cpi->gf_active_flags and the matching cpi->gf_active_count. */
+    MODE_INFO *this_mb_mode_info = cm->mi;
+
+    x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
+
+    if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame))
+    {
+        /* Reset GF usage monitors: every macroblock is flagged active */
+        memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+        cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+    }
+    else
+    {
+        /* for each macroblock row in image */
+        for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+        {
+            /* for each macroblock col in image */
+            for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+            {
+
+                /* If the reference is golden or alt-ref, set the GF active
+                 * flag if it is not already set. Otherwise a ZEROMV
+                 * macroblock leaves the flag as it is, and any other mode
+                 * clears the flag if it is currently set.
+                 */
+                if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) || (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME))
+                {
+                    if (*(x->gf_active_ptr) == 0)
+                    {
+                        *(x->gf_active_ptr) = 1;
+                        cpi->gf_active_count ++;
+                    }
+                }
+                else if ((this_mb_mode_info->mbmi.mode != ZEROMV) && *(x->gf_active_ptr))
+                {
+                    *(x->gf_active_ptr) = 0;
+                    cpi->gf_active_count--;
+                }
+
+                x->gf_active_ptr++;          /* Step onto next entry */
+                this_mb_mode_info++;         /* skip to next mb */
+
+            }
+
+            /* step over one extra MODE_INFO entry per row (the border) */
+            this_mb_mode_info++;
+        }
+    }
+}
diff --git a/libs/libvpx/vp8/encoder/segmentation.h b/libs/libvpx/vp8/encoder/segmentation.h
new file mode 100644
index 0000000000..6b5500594e
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/segmentation.h
@@ -0,0 +1,28 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_SEGMENTATION_H_
+#define VP8_ENCODER_SEGMENTATION_H_
+
+#include "string.h"
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_SEGMENTATION_H_
diff --git a/libs/libvpx/vp8/encoder/temporal_filter.c b/libs/libvpx/vp8/encoder/temporal_filter.c
new file mode 100644
index 0000000000..85d26c20f4
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/temporal_filter.c
@@ -0,0 +1,521 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "vp8/common/systemdependent.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vp8/common/extend.h"
+#include "ratectrl.h"
+#include "vp8/common/quant_common.h"
+#include "segmentation.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/threading.h"
+#include "vpx_ports/vpx_timer.h"
+
+#include <math.h>
+#include <limits.h>
+
+#define ALT_REF_MC_ENABLED 1    /* dis/enable MC in AltRef filtering */
+#define ALT_REF_SUBPEL_ENABLED 1 /* dis/enable subpel in MC AltRef filtering */
+
+#if VP8_TEMPORAL_ALT_REF
+
+static void vp8_temporal_filter_predictors_mb_c
+(
+    MACROBLOCKD *x,             /* provides the subpixel_predict16x16/8x8 callbacks */
+    unsigned char *y_mb_ptr,    /* Y plane position the motion vector is applied to */
+    unsigned char *u_mb_ptr,    /* U plane position for the same macroblock */
+    unsigned char *v_mb_ptr,    /* V plane position for the same macroblock */
+    int stride,                 /* Y stride; the U/V stride is derived from it below */
+    int mv_row,                 /* vertical MV; the low 3 bits are the sub-pixel fraction */
+    int mv_col,                 /* horizontal MV; the low 3 bits are the sub-pixel fraction */
+    unsigned char *pred         /* out: Y at [0..255], U at [256..319], V at [320..383] */
+)
+{
+    int offset;
+    unsigned char *yptr, *uptr, *vptr;
+    /* Build the motion-compensated 16x16 Y and 8x8 U/V predictors into the packed pred buffer. */
+    /* Y: whole-pixel part of the MV moves the pointer, fractional part selects the filter */
+    yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
+
+    if ((mv_row | mv_col) & 7)
+    {
+        x->subpixel_predict16x16(yptr, stride,
+                                    mv_col & 7, mv_row & 7, &pred[0], 16);
+    }
+    else
+    {
+        vp8_copy_mem16x16(yptr, stride, &pred[0], 16);    /* no fractional part: plain copy */
+    }
+
+    /* U & V: MV and stride are halved (stride rounded up) before the same split is applied */
+    mv_row >>= 1;
+    mv_col >>= 1;
+    stride = (stride + 1) >> 1;
+    offset = (mv_row >> 3) * stride + (mv_col >> 3);
+    uptr = u_mb_ptr + offset;
+    vptr = v_mb_ptr + offset;
+
+    if ((mv_row | mv_col) & 7)
+    {
+        x->subpixel_predict8x8(uptr, stride,
+                            mv_col & 7, mv_row & 7, &pred[256], 8);
+        x->subpixel_predict8x8(vptr, stride,
+                            mv_col & 7, mv_row & 7, &pred[320], 8);
+    }
+    else
+    {
+        vp8_copy_mem8x8(uptr, stride, &pred[256], 8);
+        vp8_copy_mem8x8(vptr, stride, &pred[320], 8);
+    }
+}
+void vp8_temporal_filter_apply_c
+(
+    unsigned char *frame1,      /* reference samples, read row by row using stride */
+    unsigned int stride,        /* row pitch of frame1 */
+    unsigned char *frame2,      /* candidate samples, read contiguously (block_size per row) */
+    unsigned int block_size,    /* width and height of the square block */
+    int strength,               /* right shift applied to the scaled squared difference */
+    int filter_weight,          /* multiplier applied to every per-pixel weight */
+    unsigned int *accumulator,  /* in/out: running sum of weight * frame2 sample */
+    unsigned short *count       /* in/out: running sum of weights */
+)
+{
+    unsigned int i, j, k;
+    int modifier;
+    int byte = 0;    /* index into frame1 */
+    const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+    /* Per pixel: weight frame2's sample by its closeness to frame1 and add it to the running sums. */
+    for (i = 0,k = 0; i < block_size; i++)
+    {
+        for (j = 0; j < block_size; j++, k++)
+        {
+
+            int src_byte = frame1[byte];
+            int pixel_value = *frame2++;
+
+            modifier   = src_byte - pixel_value;
+            /* This is an integer approximation of:
+             * float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+             * modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
+             */
+            modifier  *= modifier;
+            modifier  *= 3;
+            modifier  += rounding;
+            modifier >>= strength;
+
+            if (modifier > 16)
+                modifier = 16;
+
+            modifier = 16 - modifier;       /* identical samples get 16, large differences get 0 */
+            modifier *= filter_weight;
+
+            count[k] += modifier;
+            accumulator[k] += modifier * pixel_value;
+
+            byte++;
+        }
+
+        byte += stride - block_size;    /* move frame1 index to the start of the next row */
+    }
+}
+
+#if ALT_REF_MC_ENABLED
+
+static int vp8_temporal_filter_find_matching_mb_c
+(
+    VP8_COMP *cpi,
+    YV12_BUFFER_CONFIG *arf_frame,   /* its Y plane becomes the search source block */
+    YV12_BUFFER_CONFIG *frame_ptr,   /* its Y plane becomes the prediction buffer that is searched */
+    int mb_offset,                   /* macroblock offset, applied to both Y planes */
+    int error_thresh                 /* unused (explicitly voided below) */
+)
+{
+    MACROBLOCK *x = &cpi->mb;
+    int step_param;
+    int sadpb = x->sadperbit16;
+    int bestsme = INT_MAX;
+    /* Motion search for block 0; the MV is written to d->bmi.mv and the last search's score is returned. */
+    BLOCK *b = &x->block[0];
+    BLOCKD *d = &x->e_mbd.block[0];
+    int_mv best_ref_mv1;
+    int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+    /* Save input state */
+    unsigned char **base_src = b->base_src;
+    int src = b->src;
+    int src_stride = b->src_stride;
+    unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+    int pre = d->offset;
+    int pre_stride = x->e_mbd.pre.y_stride;
+
+    (void)error_thresh;
+
+    best_ref_mv1.as_int = 0;    /* the search is centred on the zero vector */
+    best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >>3;
+    best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >>3;
+
+    /* Setup frame pointers */
+    b->base_src = &arf_frame->y_buffer;
+    b->src_stride = arf_frame->y_stride;
+    b->src = mb_offset;
+
+    x->e_mbd.pre.y_buffer = frame_ptr->y_buffer;
+    x->e_mbd.pre.y_stride = frame_ptr->y_stride;
+    d->offset = mb_offset;
+
+    /* Pick the initial search step: first_step plus 0, 1 or 2 depending on cpi->Speed */
+    if (cpi->Speed < 8)
+    {
+        step_param = cpi->sf.first_step + (cpi->Speed > 5);
+    }
+    else
+    {
+        step_param = cpi->sf.first_step + 2;
+    }
+
+    /* TODO Check that the 16x16 vf & sdf are selected here */
+    /* Ignore mv costing by sending NULL cost arrays */
+    bestsme = vp8_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.mv,
+                             step_param, sadpb,
+                             &cpi->fn_ptr[BLOCK_16X16],
+                             NULL, NULL, &best_ref_mv1);
+
+#if ALT_REF_SUBPEL_ENABLED
+    /* Try sub-pixel MC? (its result replaces the hex-search score in bestsme) */
+    {
+        int distortion;
+        unsigned int sse;
+        /* Ignore mv costing by sending NULL cost array */
+        bestsme = cpi->find_fractional_mv_step(x, b, d,
+                                               &d->bmi.mv,
+                                               &best_ref_mv1,
+                                               x->errorperbit,
+                                               &cpi->fn_ptr[BLOCK_16X16],
+                                               NULL, &distortion, &sse);
+    }
+#endif
+
+    /* Restore input state */
+    b->base_src = base_src;
+    b->src = src;
+    b->src_stride = src_stride;
+    x->e_mbd.pre.y_buffer = base_pre;
+    d->offset = pre;
+    x->e_mbd.pre.y_stride = pre_stride;
+
+    return bestsme;
+}
+#endif
+
+static void vp8_temporal_filter_iterate_c
+(
+    VP8_COMP *cpi,
+    int frame_count,        /* number of entries of cpi->frames[] to consider */
+    int alt_ref_index,      /* index in cpi->frames[] of the frame being filtered */
+    int strength            /* passed through to vp8_temporal_filter_apply */
+)
+{
+    int byte;
+    int frame;
+    int mb_col, mb_row;
+    unsigned int filter_weight;
+    int mb_cols = cpi->common.mb_cols;
+    int mb_rows = cpi->common.mb_rows;
+    int mb_y_offset = 0;
+    int mb_uv_offset = 0;
+    DECLARE_ALIGNED(16, unsigned int, accumulator[16*16 + 8*8 + 8*8]);
+    DECLARE_ALIGNED(16, unsigned short, count[16*16 + 8*8 + 8*8]);
+    MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+    YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
+    unsigned char *dst1, *dst2;
+    DECLARE_ALIGNED(16, unsigned char,  predictor[16*16 + 8*8 + 8*8]);
+    /* Per macroblock: sum weighted predictors from every frame, then normalize into cpi->alt_ref_buffer. */
+    /* Save input state */
+    unsigned char *y_buffer = mbd->pre.y_buffer;
+    unsigned char *u_buffer = mbd->pre.u_buffer;
+    unsigned char *v_buffer = mbd->pre.v_buffer;
+
+    for (mb_row = 0; mb_row < mb_rows; mb_row++)
+    {
+#if ALT_REF_MC_ENABLED
+        /* Source frames are extended to 16 pixels.  This is different than
+         *  L/A/G reference frames that have a border of 32 (VP8BORDERINPIXELS)
+         * A 6 tap filter is used for motion search.  This requires 2 pixels
+         *  before and 3 pixels after.  So the largest Y mv on a border would
+         *  then be 16 - 3.  The UV blocks are half the size of the Y and
+         *  therefore only extended by 8.  The largest mv that a UV block
+         *  can support is 8 - 3.  A UV mv is half of a Y mv.
+         *  (16 - 3) >> 1 == 6 which is greater than 8 - 3.
+         * To keep the mv in play for both Y and UV planes the max that it
+         *  can be on a border is therefore 16 - 5.
+         */
+        cpi->mb.mv_row_min = -((mb_row * 16) + (16 - 5));
+        cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
+                                + (16 - 5);
+#endif
+
+        for (mb_col = 0; mb_col < mb_cols; mb_col++)
+        {
+            int i, j, k;
+            int stride;
+            /* 384 == 16*16 + 8*8 + 8*8, the element count of both arrays */
+            memset(accumulator, 0, 384*sizeof(unsigned int));
+            memset(count, 0, 384*sizeof(unsigned short));
+
+#if ALT_REF_MC_ENABLED
+            cpi->mb.mv_col_min = -((mb_col * 16) + (16 - 5));
+            cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
+                                    + (16 - 5);
+#endif
+
+            for (frame = 0; frame < frame_count; frame++)
+            {
+                if (cpi->frames[frame] == NULL)    /* NULL entry: frame not part of the filter */
+                    continue;
+
+                mbd->block[0].bmi.mv.as_mv.row = 0;
+                mbd->block[0].bmi.mv.as_mv.col = 0;
+
+                if (frame == alt_ref_index)
+                {
+                    filter_weight = 2;    /* the frame being filtered: zero MV, top weight */
+                }
+                else
+                {
+                    int err = 0;
+#if ALT_REF_MC_ENABLED
+#define THRESH_LOW   10000
+#define THRESH_HIGH  20000
+                    /* Find best match in this frame by MC */
+                    err = vp8_temporal_filter_find_matching_mb_c
+                              (cpi,
+                               cpi->frames[alt_ref_index],
+                               cpi->frames[frame],
+                               mb_y_offset,
+                               THRESH_LOW);
+#endif
+                    /* Assign higher weight to matching MB if its error
+                     * score is lower: 2 below THRESH_LOW, 1 below THRESH_HIGH,
+                     * else 0. NOTE(review): both macros are only #defined
+                     * inside the ALT_REF_MC_ENABLED block above. */
+                    filter_weight = err<THRESH_LOW
+                                       ? 2 : err<THRESH_HIGH ? 1 : 0;
+                }
+
+                if (filter_weight != 0)
+                {
+                    /* Construct the predictors */
+                    vp8_temporal_filter_predictors_mb_c
+                        (mbd,
+                         cpi->frames[frame]->y_buffer + mb_y_offset,
+                         cpi->frames[frame]->u_buffer + mb_uv_offset,
+                         cpi->frames[frame]->v_buffer + mb_uv_offset,
+                         cpi->frames[frame]->y_stride,
+                         mbd->block[0].bmi.mv.as_mv.row,
+                         mbd->block[0].bmi.mv.as_mv.col,
+                         predictor);
+
+                    /* Apply the filter (YUV) */
+                    vp8_temporal_filter_apply
+                        (f->y_buffer + mb_y_offset,
+                         f->y_stride,
+                         predictor,
+                         16,
+                         strength,
+                         filter_weight,
+                         accumulator,
+                         count);
+
+                    vp8_temporal_filter_apply
+                        (f->u_buffer + mb_uv_offset,
+                         f->uv_stride,
+                         predictor + 256,
+                         8,
+                         strength,
+                         filter_weight,
+                         accumulator + 256,
+                         count + 256);
+
+                    vp8_temporal_filter_apply
+                        (f->v_buffer + mb_uv_offset,
+                         f->uv_stride,
+                         predictor + 320,
+                         8,
+                         strength,
+                         filter_weight,
+                         accumulator + 320,
+                         count + 320);
+                }
+            }
+
+            /* Normalize filter output to produce AltRef frame */
+            dst1 = cpi->alt_ref_buffer.y_buffer;
+            stride = cpi->alt_ref_buffer.y_stride;
+            byte = mb_y_offset;
+            for (i = 0,k = 0; i < 16; i++)
+            {
+                for (j = 0; j < 16; j++, k++)
+                {
+                    unsigned int pval = accumulator[k] + (count[k] >> 1);
+                    pval *= cpi->fixed_divide[count[k]];    /* presumably a 2^19-scaled reciprocal table - confirm */
+                    pval >>= 19;
+
+                    dst1[byte] = (unsigned char)pval;
+
+                    /* move to next pixel */
+                    byte++;
+                }
+
+                byte += stride - 16;    /* start of the next output row */
+            }
+
+            dst1 = cpi->alt_ref_buffer.u_buffer;
+            dst2 = cpi->alt_ref_buffer.v_buffer;
+            stride = cpi->alt_ref_buffer.uv_stride;
+            byte = mb_uv_offset;
+            for (i = 0,k = 256; i < 8; i++)
+            {
+                for (j = 0; j < 8; j++, k++)
+                {
+                    int m=k+64;    /* V entry matching U entry k (V starts at 320 = 256 + 64) */
+
+                    /* U */
+                    unsigned int pval = accumulator[k] + (count[k] >> 1);
+                    pval *= cpi->fixed_divide[count[k]];
+                    pval >>= 19;
+                    dst1[byte] = (unsigned char)pval;
+
+                    /* V */
+                    pval = accumulator[m] + (count[m] >> 1);
+                    pval *= cpi->fixed_divide[count[m]];
+                    pval >>= 19;
+                    dst2[byte] = (unsigned char)pval;
+
+                    /* move to next pixel */
+                    byte++;
+                }
+
+                byte += stride - 8;
+            }
+
+            mb_y_offset += 16;
+            mb_uv_offset += 8;
+        }
+        /* the columns advanced 16*mb_cols (8*mb_cols for UV); top up to 16 (8) full rows */
+        mb_y_offset += 16*(f->y_stride-mb_cols);
+        mb_uv_offset += 8*(f->uv_stride-mb_cols);
+    }
+
+    /* Restore input state */
+    mbd->pre.y_buffer = y_buffer;
+    mbd->pre.u_buffer = u_buffer;
+    mbd->pre.v_buffer = v_buffer;
+}
+
+void vp8_temporal_filter_prepare_c
+(
+    VP8_COMP *cpi,
+    int distance    /* lookahead index of the frame to filter; also the backward frame count */
+)
+{
+    int frame = 0;
+    /* Choose which lookahead frames to blur (per arnr_type), fill cpi->frames[], then run the filter. */
+    int num_frames_backward = 0;
+    int num_frames_forward = 0;
+    int frames_to_blur_backward = 0;
+    int frames_to_blur_forward = 0;
+    int frames_to_blur = 0;
+    int start_frame = 0;
+
+    int strength = cpi->oxcf.arnr_strength;
+
+    int blur_type = cpi->oxcf.arnr_type;
+
+    int max_frames = cpi->active_arnr_frames;
+
+    num_frames_backward = distance;
+    num_frames_forward = vp8_lookahead_depth(cpi->lookahead)
+                         - (num_frames_backward + 1);
+
+    switch (blur_type)
+    {
+    case 1:
+        /* Backward Blur */
+
+        frames_to_blur_backward = num_frames_backward;
+
+        if (frames_to_blur_backward >= max_frames)
+            frames_to_blur_backward = max_frames - 1;
+
+        frames_to_blur = frames_to_blur_backward + 1;
+        break;
+
+    case 2:
+        /* Forward Blur */
+
+        frames_to_blur_forward = num_frames_forward;
+
+        if (frames_to_blur_forward >= max_frames)
+            frames_to_blur_forward = max_frames - 1;
+
+        frames_to_blur = frames_to_blur_forward + 1;
+        break;
+
+    case 3:
+    default:
+        /* Center Blur */
+        frames_to_blur_forward = num_frames_forward;
+        frames_to_blur_backward = num_frames_backward;
+        /* the next two tests reduce both counts to the smaller of the two */
+        if (frames_to_blur_forward > frames_to_blur_backward)
+            frames_to_blur_forward = frames_to_blur_backward;
+
+        if (frames_to_blur_backward > frames_to_blur_forward)
+            frames_to_blur_backward = frames_to_blur_forward;
+
+        /* When max_frames is even we have 1 more frame backward than forward */
+        if (frames_to_blur_forward > (max_frames - 1) / 2)
+            frames_to_blur_forward = ((max_frames - 1) / 2);
+
+        if (frames_to_blur_backward > (max_frames / 2))
+            frames_to_blur_backward = (max_frames / 2);
+
+        frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
+        break;
+    }
+
+    start_frame = distance + frames_to_blur_forward;
+    /* Setup frame pointers, NULL indicates frame not included in filter.
+     * NOTE(review): vp8_lookahead_peek()'s result is used unchecked below. */
+    memset(cpi->frames, 0, max_frames*sizeof(YV12_BUFFER_CONFIG *));
+    for (frame = 0; frame < frames_to_blur; frame++)
+    {
+        int which_buffer =  start_frame - frame;
+        struct lookahead_entry* buf = vp8_lookahead_peek(cpi->lookahead,
+                                                         which_buffer,
+                                                         PEEK_FORWARD);
+        cpi->frames[frames_to_blur-1-frame] = &buf->img;    /* stored in reverse of peek order */
+    }
+    /* frames_to_blur_backward is passed as alt_ref_index, i.e. the slot that received lookahead entry 'distance' */
+    vp8_temporal_filter_iterate_c (
+        cpi,
+        frames_to_blur,
+        frames_to_blur_backward,
+        strength );
+}
+#endif
diff --git a/libs/libvpx/vp8/encoder/tokenize.c b/libs/libvpx/vp8/encoder/tokenize.c
new file mode 100644
index 0000000000..afd46fb219
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/tokenize.c
@@ -0,0 +1,608 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "onyx_int.h"
+#include "tokenize.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* Global event counters used for accumulating statistics across several
+   compressions, then generating context.c = initial stats. */
+
+#ifdef VP8_ENTROPY_STATS
+_int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+#endif
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) ;
+void vp8_fix_contexts(MACROBLOCKD *x);
+
+#include "dct_value_tokens.h"
+#include "dct_value_cost.h"
+
+const TOKENVALUE *const vp8_dct_value_tokens_ptr = dct_value_tokens +
+        DCT_MAX_VALUE;
+const short *const vp8_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
+
+#if 0
+int skip_true_count = 0;
+int skip_false_count = 0;
+#endif
+
+/* function used to generate dct_value_tokens and dct_value_cost tables */
+/*
+static void fill_value_tokens()
+{
+
+    TOKENVALUE *t = dct_value_tokens + DCT_MAX_VALUE;
+    const vp8_extra_bit_struct *e = vp8_extra_bits;
+
+    int i = -DCT_MAX_VALUE;
+    int sign = 1;
+
+    do
+    {
+        if (!i)
+            sign = 0;
+
+        {
+            const int a = sign ? -i : i;
+            int eb = sign;
+
+            if (a > 4)
+            {
+                int j = 4;
+
+                while (++j < 11  &&  e[j].base_val <= a) {}
+
+                t[i].Token = --j;
+                eb |= (a - e[j].base_val) << 1;
+            }
+            else
+                t[i].Token = a;
+
+            t[i].Extra = eb;
+        }
+
+        // initialize the cost for extra bits for all possible coefficient value.
+        {
+            int cost = 0;
+            const vp8_extra_bit_struct *p = vp8_extra_bits + t[i].Token;
+
+            if (p->base_val)
+            {
+                const int extra = t[i].Extra;
+                const int Length = p->Len;
+
+                if (Length)
+                    cost += vp8_treed_cost(p->tree, p->prob, extra >> 1, Length);
+
+                cost += vp8_cost_bit(vp8_prob_half, extra & 1); // sign
+                dct_value_cost[i + DCT_MAX_VALUE] = cost;
+            }
+
+        }
+
+    }
+    while (++i < DCT_MAX_VALUE);
+
+    vp8_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
+    vp8_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;
+}
+*/
+
+static void tokenize2nd_order_b /* tokenize xd->block[24] using block type 1 */
+(
+    MACROBLOCK *x,      /* x->coef_counts[1] is incremented once per emitted token */
+    TOKENEXTRA **tp,    /* in/out token cursor; left pointing past the last token written */
+    VP8_COMP *cpi       /* source of cpi->common.fc.coef_probs[1] */
+)
+{
+    MACROBLOCKD *xd = &x->e_mbd;
+    int pt;             /* near block/prev token context index */
+    int c;              /* start at DC */
+    TOKENEXTRA *t = *tp;/* store tokens starting here */
+    const BLOCKD *b;
+    const short *qcoeff_ptr;
+    ENTROPY_CONTEXT * a;
+    ENTROPY_CONTEXT * l;
+    int band, rc, v, token;
+    int eob;
+
+    b = xd->block + 24;
+    qcoeff_ptr = b->qcoeff;
+    a = (ENTROPY_CONTEXT *)xd->above_context + 8;
+    l = (ENTROPY_CONTEXT *)xd->left_context + 8;
+    eob = xd->eobs[24];
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+    if(!eob)
+    {
+        /* empty block: emit a single EOB in band 0 and clear both contexts */
+        t->Token = DCT_EOB_TOKEN;
+        t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
+        t->skip_eob_node = 0;
+
+        ++x->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
+        t++;
+        *tp = t;
+        *a = *l = 0;
+        return;
+    }
+    /* first coefficient (position 0): band 0, EOB check never skipped */
+    v = qcoeff_ptr[0];
+    t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+    token    = vp8_dct_value_tokens_ptr[v].Token;
+    t->Token = token;
+
+    t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
+    t->skip_eob_node = 0;
+    ++x->coef_counts       [1] [0] [pt] [token];
+    pt = vp8_prev_token_class[token];
+    t++;
+    c = 1;
+    /* remaining coefficients in zig-zag order; pt is now the previous token's class */
+    for (; c < eob; c++)
+    {
+        rc = vp8_default_zig_zag1d[c];
+        band = vp8_coef_bands[c];
+        v = qcoeff_ptr[rc];
+
+        t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+        token    = vp8_dct_value_tokens_ptr[v].Token;
+
+        t->Token = token;
+        t->context_tree = cpi->common.fc.coef_probs [1] [band] [pt];
+
+        t->skip_eob_node = ((pt == 0));
+
+        ++x->coef_counts       [1] [band] [pt] [token];
+
+        pt = vp8_prev_token_class[token];
+        t++;
+    }
+    if (c < 16) /* fewer than 16 coefficients coded: terminate with an explicit EOB */
+    {
+        band = vp8_coef_bands[c];
+        t->Token = DCT_EOB_TOKEN;
+        t->context_tree = cpi->common.fc.coef_probs [1] [band] [pt];
+
+        t->skip_eob_node = 0;
+
+        ++x->coef_counts       [1] [band] [pt] [DCT_EOB_TOKEN];
+
+        t++;
+    }
+
+    *tp = t;
+    *a = *l = 1; /* both neighbour contexts now record a non-empty block */
+
+}
+
+static void tokenize1st_order_b /* tokenize blocks 0..15 (luma) then 16..23 (chroma) */
+(
+    MACROBLOCK *x,
+    TOKENEXTRA **tp,    /* in/out token cursor; updated after every block */
+    int type,           /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC (luma loop only; chroma always uses 2) */
+    VP8_COMP *cpi
+)
+{
+    MACROBLOCKD *xd = &x->e_mbd;
+    unsigned int block;
+    const BLOCKD *b;
+    int pt;             /* near block/prev token context index */
+    int c;              /* coefficient position in scan order */
+    int token;
+    TOKENEXTRA *t = *tp;/* store tokens starting here */
+    const short *qcoeff_ptr;
+    ENTROPY_CONTEXT * a;
+    ENTROPY_CONTEXT * l;
+    int band, rc, v;
+    int tmp1, tmp2;     /* above/left context offsets of the current block */
+
+    b = xd->block;
+    /* Luma */
+    for (block = 0; block < 16; block++, b++)
+    {
+        const int eob = *b->eob;
+        tmp1 = vp8_block2above[block];
+        tmp2 = vp8_block2left[block];
+        qcoeff_ptr = b->qcoeff;
+        a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
+        l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;
+
+        VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+        c = type ? 0 : 1; /* type 0 has no DC, so its first coded position is 1 */
+
+        if(c >= eob)
+        {
+            /* nothing to code: c (0 or 1) doubles as the band index here */
+            t->Token = DCT_EOB_TOKEN;
+            t->context_tree = cpi->common.fc.coef_probs [type] [c] [pt];
+            t->skip_eob_node = 0;
+
+            ++x->coef_counts       [type] [c] [pt] [DCT_EOB_TOKEN];
+            t++;
+            *tp = t;
+            *a = *l = 0;
+            continue;
+        }
+
+        v = qcoeff_ptr[c];
+
+        t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+        token    = vp8_dct_value_tokens_ptr[v].Token;
+        t->Token = token;
+
+        t->context_tree = cpi->common.fc.coef_probs [type] [c] [pt];
+        t->skip_eob_node = 0;
+        ++x->coef_counts       [type] [c] [pt] [token];
+        pt = vp8_prev_token_class[token];
+        t++;
+        c++;
+
+        assert(eob <= 16);
+        for (; c < eob; c++)
+        {
+            rc = vp8_default_zig_zag1d[c];
+            band = vp8_coef_bands[c];
+            v = qcoeff_ptr[rc];
+
+            t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+            token    = vp8_dct_value_tokens_ptr[v].Token;
+
+            t->Token = token;
+            t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+
+            t->skip_eob_node = (pt == 0);
+            ++x->coef_counts       [type] [band] [pt] [token];
+
+            pt = vp8_prev_token_class[token];
+            t++;
+        }
+        if (c < 16) /* block ended before position 16: terminate with EOB */
+        {
+            band = vp8_coef_bands[c];
+            t->Token = DCT_EOB_TOKEN;
+            t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+
+            t->skip_eob_node = 0;
+            ++x->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
+
+            t++;
+        }
+        *tp = t;
+        *a = *l = 1;
+    }
+
+    /* Chroma */
+    for (block = 16; block < 24; block++, b++)
+    {
+        const int eob = *b->eob;
+        tmp1 = vp8_block2above[block];
+        tmp2 = vp8_block2left[block];
+        qcoeff_ptr = b->qcoeff;
+        a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
+        l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;
+
+        VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+        if(!eob)
+        {
+            /* empty block: a single EOB in band 0, contexts cleared */
+            t->Token = DCT_EOB_TOKEN;
+            t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
+            t->skip_eob_node = 0;
+
+            ++x->coef_counts       [2] [0] [pt] [DCT_EOB_TOKEN];
+            t++;
+            *tp = t;
+            *a = *l = 0;
+            continue;
+        }
+
+        v = qcoeff_ptr[0];
+
+        t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+        token    = vp8_dct_value_tokens_ptr[v].Token;
+        t->Token = token;
+
+        t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
+        t->skip_eob_node = 0;
+        ++x->coef_counts       [2] [0] [pt] [token];
+        pt = vp8_prev_token_class[token];
+        t++;
+        c = 1;
+
+        assert(eob <= 16);
+        for (; c < eob; c++)
+        {
+            rc = vp8_default_zig_zag1d[c];
+            band = vp8_coef_bands[c];
+            v = qcoeff_ptr[rc];
+
+            t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+            token    = vp8_dct_value_tokens_ptr[v].Token;
+
+            t->Token = token;
+            t->context_tree = cpi->common.fc.coef_probs [2] [band] [pt];
+
+            t->skip_eob_node = (pt == 0);
+
+            ++x->coef_counts       [2] [band] [pt] [token];
+
+            pt = vp8_prev_token_class[token];
+            t++;
+        }
+        if (c < 16) /* block ended before position 16: terminate with EOB */
+        {
+            band = vp8_coef_bands[c];
+            t->Token = DCT_EOB_TOKEN;
+            t->context_tree = cpi->common.fc.coef_probs [2] [band] [pt];
+
+            t->skip_eob_node = 0;
+
+            ++x->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
+
+            t++;
+        }
+        *tp = t;
+        *a = *l = 1;
+    }
+}
+
+
+static int mb_is_skippable(MACROBLOCKD *x, int has_y2_block)
+{
+    /* Returns 1 when no block holds data that must be coded.  With a Y2
+     * block, blocks 0..15 may have eob < 2; all others need eob == 0. */
+    const int last_block = 24 + has_y2_block;
+    int blk = 0;
+
+    if (has_y2_block)
+        for (; blk < 16; blk++)
+            if (x->eobs[blk] >= 2) return 0;
+
+    for (; blk < last_block; blk++)
+        if (x->eobs[blk]) return 0;
+
+    return 1;
+}
+
+
+void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
+{
+    MACROBLOCKD *const mbd = &x->e_mbd;
+    const int mb_mode = mbd->mode_info_context->mbmi.mode;
+    /* Every mode except B_PRED and SPLITMV has a second-order block. */
+    const int with_y2 = (mb_mode != B_PRED) && (mb_mode != SPLITMV);
+
+    /* Record the skip decision in the mode info, then act on it. */
+    mbd->mode_info_context->mbmi.mb_skip_coeff =
+        mb_is_skippable(mbd, with_y2);
+
+    if (!mbd->mode_info_context->mbmi.mb_skip_coeff)
+    {
+        /* Coded macroblock: second-order block first (if present), then
+         * the first-order blocks as plane type 0 (with Y2) or 3. */
+        if (with_y2)
+            tokenize2nd_order_b(x, t, cpi);
+
+        tokenize1st_order_b(x, t, with_y2 ? 0 : 3, cpi);
+        return;
+    }
+
+    if (cpi->common.mb_no_coeff_skip)
+    {
+        /* mb_no_coeff_skip set: emit no tokens, only reset the entropy
+         * contexts and count the skipped macroblock. */
+        vp8_fix_contexts(mbd);
+        x->skip_true_count++;
+    }
+    else
+    {
+        /* mb_no_coeff_skip clear: stuff an EOB token for every block. */
+        vp8_stuff_mb(cpi, x, t);
+    }
+}
+
+
+#ifdef VP8_ENTROPY_STATS
+
+void init_context_counters(void)
+{
+    memset(context_counters, 0, sizeof(context_counters)); /* zero every [type][band][context][token] tally */
+}
+
+void print_context_counters()
+{
+    /* Dump context_counters to ./context.c as a C array initializer. */
+    int type, band, pt, t;
+
+    FILE *const f = fopen("context.c", "w");
+    if (!f) { perror("context.c"); return; } /* cannot create the file: report and skip the dump */
+    fprintf(f, "#include \"entropy.h\"\n");
+
+    fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
+
+    fprintf(f, "int Contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];\n\n");
+
+    fprintf(f, "const int default_contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
+
+# define Comma( X) (X? ",":"")
+    /* Comma(i) yields "," for every element but the first of a list */
+    type = 0;
+
+    do
+    {
+        fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+
+        band = 0;
+
+        do
+        {
+            fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+
+            pt = 0;
+
+            do
+            {
+                fprintf(f, "%s\n      {", Comma(pt));
+
+                t = 0;
+
+                do
+                {
+                    const _int64 x = context_counters [type] [band] [pt] [t];
+                    const int y = (int) x;
+
+                    assert(x == (_int64) y);  /* no overflow handling yet */
+                    fprintf(f, "%s %d", Comma(t), y);
+
+                }
+                while (++t < MAX_ENTROPY_TOKENS);
+
+                fprintf(f, "}");
+            }
+            while (++pt < PREV_COEF_CONTEXTS);
+
+            fprintf(f, "\n    }");
+
+        }
+        while (++band < COEF_BANDS);
+
+        fprintf(f, "\n  }");
+    }
+    while (++type < BLOCK_TYPES);
+
+    fprintf(f, "\n};\n");
+    fclose(f);
+}
+#endif
+
+
+static void stuff2nd_order_b(TOKENEXTRA **tp,
+                             ENTROPY_CONTEXT *a,
+                             ENTROPY_CONTEXT *l,
+                             VP8_COMP *cpi,
+                             MACROBLOCK *x)
+{
+    /* Stuff a single EOB token for block type 1, band 0.  The above
+     * and left contexts (a, l) select the probability set and are both
+     * cleared afterwards; *tp advances by one token. */
+    TOKENEXTRA *const tok = *tp;
+    int ctx;
+
+    VP8_COMBINEENTROPYCONTEXTS(ctx, *a, *l);
+
+    tok->Token = DCT_EOB_TOKEN;
+    tok->context_tree = cpi->common.fc.coef_probs[1][0][ctx];
+    tok->skip_eob_node = 0;
+    ++x->coef_counts[1][0][ctx][DCT_EOB_TOKEN];
+
+    /* hand the advanced cursor back and mark the block as empty */
+    *tp = tok + 1;
+    *a = *l = 0;
+}
+
+static void stuff1st_order_b(TOKENEXTRA **tp,
+                             ENTROPY_CONTEXT *a,
+                             ENTROPY_CONTEXT *l,
+                             int type,
+                             VP8_COMP *cpi,
+                             MACROBLOCK *x)
+{
+    /* Stuff a single EOB token for one block of plane `type`.
+     * Type 0 uses band 1, every other type band 0.  The contexts a
+     * and l pick the probability set and are cleared afterwards. */
+    TOKENEXTRA *const tok = *tp;
+    const int first_band = (type == 0) ? 1 : 0;
+    int ctx;
+
+    VP8_COMBINEENTROPYCONTEXTS(ctx, *a, *l);
+
+    tok->Token = DCT_EOB_TOKEN;
+    tok->context_tree = cpi->common.fc.coef_probs[type][first_band][ctx];
+    tok->skip_eob_node = 0;
+    ++x->coef_counts[type][first_band][ctx][DCT_EOB_TOKEN];
+
+    *tp = tok + 1;
+    *a = *l = 0;    /* 0 <-> all coeff data is zero */
+}
+
+static void stuff1st_order_buv(TOKENEXTRA **tp,
+                               ENTROPY_CONTEXT *a,
+                               ENTROPY_CONTEXT *l,
+                               VP8_COMP *cpi,
+                               MACROBLOCK *x)
+{
+    /* Stuff a single EOB token for block type 2, band 0.  The above
+     * and left contexts (a, l) select the probability set and are both
+     * cleared afterwards; *tp advances by one token. */
+    TOKENEXTRA *const tok = *tp;
+    int ctx;
+
+    VP8_COMBINEENTROPYCONTEXTS(ctx, *a, *l);
+
+    tok->Token = DCT_EOB_TOKEN;
+    tok->context_tree = cpi->common.fc.coef_probs[2][0][ctx];
+    tok->skip_eob_node = 0;
+    ++x->coef_counts[2][0][ctx][DCT_EOB_TOKEN];
+
+    /* hand the advanced cursor back */
+    *tp = tok + 1;
+    *a = *l = 0;    /* 0 <-> all coeff data is zero */
+}
+
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
+{
+    /* Emit one EOB token per block of a macroblock: block 24 when the
+     * mode is neither B_PRED nor SPLITMV, then 0..15, then 16..23. */
+    MACROBLOCKD *const mbd = &x->e_mbd;
+    ENTROPY_CONTEXT *const above = (ENTROPY_CONTEXT *)mbd->above_context;
+    ENTROPY_CONTEXT *const left = (ENTROPY_CONTEXT *)mbd->left_context;
+    const int mb_mode = mbd->mode_info_context->mbmi.mode;
+    const int with_y2 = (mb_mode != B_PRED) && (mb_mode != SPLITMV);
+    const int luma_type = with_y2 ? 0 : 3;
+    int blk;
+
+    if (with_y2)
+        stuff2nd_order_b(t, above + vp8_block2above[24],
+                         left + vp8_block2left[24], cpi, x);
+
+    blk = 0;
+    while (blk < 16)
+    {
+        stuff1st_order_b(t, above + vp8_block2above[blk],
+                         left + vp8_block2left[blk], luma_type, cpi, x);
+        blk++;
+    }
+    for (; blk < 24; blk++)
+        stuff1st_order_buv(t, above + vp8_block2above[blk],
+                           left + vp8_block2left[blk], cpi, x);
+}
+void vp8_fix_contexts(MACROBLOCKD *x)
+{
+    /* Zero the above/left entropy contexts of the current macroblock.
+     * For B_PRED and SPLITMV the final byte of each context struct is
+     * left untouched (presumably the Y2 entry, which those modes do
+     * not use -- confirm against ENTROPY_CONTEXT_PLANES). */
+    const int mb_mode = x->mode_info_context->mbmi.mode;
+    size_t clear_bytes = sizeof(ENTROPY_CONTEXT_PLANES);
+
+    if (mb_mode == B_PRED || mb_mode == SPLITMV)
+        clear_bytes -= 1;
+
+    memset(x->above_context, 0, clear_bytes);
+    memset(x->left_context, 0, clear_bytes);
+}
diff --git a/libs/libvpx/vp8/encoder/tokenize.h b/libs/libvpx/vp8/encoder/tokenize.h
new file mode 100644
index 0000000000..b73a9ee1c8
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/tokenize.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_TOKENIZE_H_
+#define VP8_ENCODER_TOKENIZE_H_
+
+#include "vp8/common/entropy.h"
+#include "block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_tokenize_initialize();
+
+typedef struct /* one entry of the coefficient-value lookup table (vp8_dct_value_tokens_ptr) */
+{
+    short Token;    /* token assigned to the coefficient value */
+    short Extra;    /* extra-bits payload that accompanies Token */
+} TOKENVALUE;
+
+typedef struct /* one emitted token plus what is needed to code it later */
+{
+    const vp8_prob *context_tree;   /* probability set picked by (block type, band, context) */
+    short           Extra;          /* extra bits copied from the TOKENVALUE table */
+    unsigned char   Token;          /* token id, e.g. DCT_EOB_TOKEN */
+    unsigned char   skip_eob_node;  /* set by the tokenizer when the previous token's class was 0 */
+} TOKENEXTRA;
+
+int rd_cost_mby(MACROBLOCKD *);
+
+#ifdef VP8_ENTROPY_STATS
+void init_context_counters();
+void print_context_counters();
+
+extern _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+#endif
+
+extern const short *const vp8_dct_value_cost_ptr;
+/* TODO: The Token field should be broken out into a separate char array to
+ *  improve cache locality, since it's needed for costing when the rest of the
+ *  fields are not.
+ */
+extern const TOKENVALUE *const vp8_dct_value_tokens_ptr;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_TOKENIZE_H_
diff --git a/libs/libvpx/vp8/encoder/treewriter.c b/libs/libvpx/vp8/encoder/treewriter.c
new file mode 100644
index 0000000000..ef25f670b3
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/treewriter.c
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treewriter.h"
+
+static void cost( /* recursively fill C[] with the cost of every leaf reachable from entry pair i */
+    int *const C,               /* output, written at index -T[k] for each terminal entry k */
+    vp8_tree T,                 /* tree array; entries <= 0 are treated as terminals */
+    const vp8_prob *const P,    /* one probability per entry pair, indexed by i >> 1 */
+    int i,                      /* index of the first entry of the pair to visit */
+    int c                       /* cost accumulated on the path so far */
+)
+{
+    const vp8_prob p = P [i>>1];
+
+    do
+    {
+        const vp8_tree_index j = T[i];
+        const int d = c + vp8_cost_bit(p, i & 1); /* add the cost of taking branch (i & 1) */
+
+        if (j <= 0)
+            C[-j] = d;
+        else
+            cost(C, T, P, j, d);
+    }
+    while (++i & 1); /* an even start index visits i and i+1, then stops */
+}
+void vp8_cost_tokens(int *c, const vp8_prob *p, vp8_tree t)
+{
+    cost(c, t, p, 0, 0); /* fill c[] by walking tree t from index 0 with zero starting cost */
+}
+void vp8_cost_tokens2(int *c, const vp8_prob *p, vp8_tree t,int start)
+{
+    cost(c, t, p, start, 0); /* fill c[] by walking tree t from index `start` with zero starting cost */
+}
diff --git a/libs/libvpx/vp8/encoder/treewriter.h b/libs/libvpx/vp8/encoder/treewriter.h
new file mode 100644
index 0000000000..2debf9276c
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/treewriter.h
@@ -0,0 +1,135 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_ENCODER_TREEWRITER_H_
+#define VP8_ENCODER_TREEWRITER_H_
+
+/* Trees map alphabets into huffman-like codes suitable for an arithmetic
+   bit coder.  Timothy S Murphy  11 October 2004 */
+
+#include "./vpx_config.h"
+#include "vp8/common/treecoder.h"
+
+#include "boolhuff.h"       /* for now */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef BOOL_CODER vp8_writer;
+
+#define vp8_write vp8_encode_bool
+#define vp8_write_literal vp8_encode_value
+#define vp8_write_bit( W, V) vp8_write( W, V, vp8_prob_half)
+
+#define vp8bc_write vp8bc_write_bool
+#define vp8bc_write_literal vp8bc_write_bits
+#define vp8bc_write_bit( W, V) vp8bc_write_bits( W, V, 1)
+
+
+/* Approximate length of an encoded bool in 256ths of a bit at given prob */
+
+#define vp8_cost_zero( x) ( vp8_prob_cost[x])
+#define vp8_cost_one( x)  vp8_cost_zero( vp8_complement(x))
+
+#define vp8_cost_bit( x, b) vp8_cost_zero( (b)?  vp8_complement(x) : (x) )
+
+/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
+
+
+/* Both of these return bits, not scaled bits. */
+
+static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2], vp8_prob p)
+{
+    /* Cost of ct[0] zeros plus ct[1] ones at probability p, scaled down by 2^8. */
+    const unsigned int zeros = ct[0] * vp8_cost_zero(p);
+    const unsigned int ones = ct[1] * vp8_cost_one(p);
+
+    return (zeros + ones) >> 8;
+}
+
+/* Small functions to write explicit values and tokens, as well as
+   estimate their lengths. */
+
+static void vp8_treed_write(vp8_writer *const w, vp8_tree t,
+                            const vp8_prob *const p, int v,
+                            int n /* number of bits in v, assumed nonzero */)
+{
+    /* Write the n low bits of v, most significant first, walking tree
+     * t from index 0; each bit is coded with the probability stored
+     * for the entry pair it is read at (p[node >> 1]). */
+    vp8_tree_index node = 0;
+    int bits_left = n;
+
+    for (;;)
+    {
+        const int bit = (v >> (bits_left - 1)) & 1;
+
+        vp8_write(w, bit, p[node >> 1]);
+        node = t[node + bit];
+        if (--bits_left == 0) break;
+    }
+}
+static INLINE void vp8_write_token /* write token x through tree t */
+(
+    vp8_writer *const w,        /* destination bool coder */
+    vp8_tree t,                 /* tree the token's bits are walked through */
+    const vp8_prob *const p,    /* probabilities indexed by tree entry pair */
+    vp8_token *const x          /* supplies the bit pattern (value) and its length (Len) */
+)
+{
+    vp8_treed_write(w, t, p, x->value, x->Len);
+}
+
+static int vp8_treed_cost(vp8_tree t, const vp8_prob *const p,
+                          int v, int n /* number of bits in v, assumed nonzero */)
+{
+    /* Sum the cost of coding the n low bits of v, most significant
+     * bit first, along tree t starting at index 0; each bit is priced
+     * with vp8_cost_bit() at the probability p[node >> 1]. */
+    vp8_tree_index node = 0;
+    int total = 0;
+    int bits_left = n;
+
+    for (;;)
+    {
+        const int bit = (v >> (bits_left - 1)) & 1;
+
+        total += vp8_cost_bit(p[node >> 1], bit);
+        node = t[node + bit];
+        if (--bits_left == 0) break;
+    }
+    return total;
+}
+static INLINE int vp8_cost_token /* cost of coding token x through tree t */
+(
+    vp8_tree t,                 /* tree the token's bits are walked through */
+    const vp8_prob *const p,    /* probabilities indexed by tree entry pair */
+    vp8_token *const x          /* supplies the bit pattern (value) and its length (Len) */
+)
+{
+    return vp8_treed_cost(t, p, x->value, x->Len);
+}
+
+/* Fill array of costs for all possible token values. */
+
+void vp8_cost_tokens(
+    int *Costs, const vp8_prob *, vp8_tree
+);
+
+void vp8_cost_tokens2(
+    int *Costs, const vp8_prob *, vp8_tree, int
+);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_TREEWRITER_H_
diff --git a/libs/libvpx/vp8/encoder/vp8_quantize.c b/libs/libvpx/vp8/encoder/vp8_quantize.c
new file mode 100644
index 0000000000..ee922c9d69
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/vp8_quantize.c
@@ -0,0 +1,583 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "onyx_int.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/quant_common.h"
+
+void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) /* quantize 16 coefficients of b into d using b->quant_fast */
+{
+    int i, rc, eob;
+    int x, y, z, sz;
+    short *coeff_ptr   = b->coeff;
+    short *round_ptr   = b->round;
+    short *quant_ptr   = b->quant_fast;
+    short *qcoeff_ptr  = d->qcoeff;
+    short *dqcoeff_ptr = d->dqcoeff;
+    short *dequant_ptr = d->dequant;
+
+    eob = -1; /* -1 means no nonzero coefficient seen yet */
+    for (i = 0; i < 16; i++)
+    {
+        rc   = vp8_default_zig_zag1d[i];
+        z    = coeff_ptr[rc];
+
+        sz = (z >> 31);                              /* 0 for z >= 0, -1 for z < 0 (assumes 32-bit int) */
+        x  = (z ^ sz) - sz;                          /* x = abs(z) */
+
+        y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; /* quantize (x) */
+        x  = (y ^ sz) - sz;                          /* get the sign back */
+        qcoeff_ptr[rc] = x;                          /* write to destination */
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc];       /* dequantized value */
+
+        if (y)
+        {
+            eob = i;                                 /* last nonzero coeffs */
+        }
+    }
+    *d->eob = (char)(eob + 1); /* scan position just past the last nonzero coefficient; 0 if none */
+}
+
+void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d) /* quantize 16 coefficients of b into d with a zero-bin threshold */
+{
+    int i, rc, eob;
+    int zbin;
+    int x, y, z, sz;
+    short *zbin_boost_ptr  = b->zrun_zbin_boost;
+    short *coeff_ptr       = b->coeff;
+    short *zbin_ptr        = b->zbin;
+    short *round_ptr       = b->round;
+    short *quant_ptr       = b->quant;
+    short *quant_shift_ptr = b->quant_shift;
+    short *qcoeff_ptr      = d->qcoeff;
+    short *dqcoeff_ptr     = d->dqcoeff;
+    short *dequant_ptr     = d->dequant;
+    short zbin_oq_value    = b->zbin_extra;
+
+    memset(qcoeff_ptr, 0, 32);  /* 32 bytes; assumes 16 two-byte shorts */
+    memset(dqcoeff_ptr, 0, 32); /* coefficients below the threshold keep this zero */
+
+    eob = -1;
+
+    for (i = 0; i < 16; i++)
+    {
+        rc   = vp8_default_zig_zag1d[i];
+        z    = coeff_ptr[rc];
+        /* threshold = per-position zbin + boost for the current zero run + block-wide extra */
+        zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+
+        zbin_boost_ptr ++;                           /* advance the boost table with each coefficient */
+        sz = (z >> 31);                              /* 0 for z >= 0, -1 for z < 0 (assumes 32-bit int) */
+        x  = (z ^ sz) - sz;                          /* x = abs(z) */
+
+        if (x >= zbin)
+        {
+            x += round_ptr[rc];
+            y  = ((((x * quant_ptr[rc]) >> 16) + x)
+                 * quant_shift_ptr[rc]) >> 16;       /* quantize (x) */
+            x  = (y ^ sz) - sz;                      /* get the sign back */
+            qcoeff_ptr[rc]  = x;                     /* write to destination */
+            dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
+
+            if (y)
+            {
+                eob = i;                             /* last nonzero coeffs */
+                zbin_boost_ptr = b->zrun_zbin_boost; /* reset zero runlength */
+            }
+        }
+    }
+
+    *d->eob = (char)(eob + 1); /* scan position just past the last nonzero coefficient; 0 if none */
+}
+
+void vp8_quantize_mby(MACROBLOCK *x)
+{
+    /* Blocks 0..15 always; block 24 too unless the mode is B_PRED/SPLITMV. */
+    const int mb_mode = x->e_mbd.mode_info_context->mbmi.mode;
+    BLOCK *src = x->block;
+    BLOCKD *dst = x->e_mbd.block;
+
+    for (; src < x->block + 16; src++, dst++)
+        x->quantize_b(src, dst);
+    if (mb_mode != B_PRED && mb_mode != SPLITMV)
+        x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp8_quantize_mb(MACROBLOCK *x)
+{
+    /* Blocks 0..23 always; block 24 too unless the mode is B_PRED/SPLITMV. */
+    const int mb_mode = x->e_mbd.mode_info_context->mbmi.mode;
+    const int block_count = (mb_mode == B_PRED || mb_mode == SPLITMV) ? 24 : 25;
+    int blk;
+    for (blk = 0; blk < block_count; blk++)
+        x->quantize_b(x->block + blk, x->e_mbd.block + blk);
+}
+
+
+void vp8_quantize_mbuv(MACROBLOCK *x)
+{
+    /* Quantize blocks 16..23 only. */
+    int blk = 16;
+
+    do x->quantize_b(x->block + blk, x->e_mbd.block + blk);
+    while (++blk < 24);
+}
+
+static const int qrounding_factors[129] =
+{
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48
+};
+
+
+static const int qzbin_factors[129] =
+{
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80
+};
+
+
+static const int qrounding_factors_y2[129] =
+{
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48
+};
+
+
+static const int qzbin_factors_y2[129] =
+{
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    84, 84, 84, 84, 84, 84, 84, 84,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80
+};
+
+
+static void invert_quant(int improved_quant, short *quant,
+                         short *shift, short d) /* derive multiplier *quant and scale *shift from divisor d */
+{
+    if(improved_quant)
+    {
+        unsigned t;
+        int l;
+        t = d;
+        for(l = 0; t > 1; l++)     /* l = floor(log2(d)) for d >= 1 */
+            t>>=1;
+        t = 1 + (1<<(16+l))/d;     /* 2^(16+l) / d, plus one */
+        *quant = (short)(t - (1<<16)); /* store only the part above 2^16 */
+        *shift = l;
+        /* use multiplication and constant shift by 16 */
+        *shift = 1 << (16 - *shift);
+    }
+    else
+    {
+        *quant = (1 << 16) / d;
+        *shift = 0;
+        /* use multiplication and constant shift by 16 */
+        *shift = 1 << (16 - *shift); /* NOTE(review): 1 << 16 does not fit a 16-bit short -- confirm this path's shift is never read */
+    }
+}
+
+
+void vp8cx_init_quantizer(VP8_COMP *cpi)
+{
+    int i;
+    int quant_val;
+    int Q;
+
+    int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44,
+                          44, 44};
+
+    for (Q = 0; Q < QINDEX_RANGE; Q++)
+    {
+        /* dc values */
+        quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
+        cpi->Y1quant_fast[Q][0] = (1 << 16) / quant_val;
+        invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0,
+                     cpi->Y1quant_shift[Q] + 0, quant_val);
+        cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+        cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+        cpi->common.Y1dequant[Q][0] = quant_val;
+        cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+        quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
+        cpi->Y2quant_fast[Q][0] = (1 << 16) / quant_val;
+        invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 0,
+                     cpi->Y2quant_shift[Q] + 0, quant_val);
+        cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+        cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+        cpi->common.Y2dequant[Q][0] = quant_val;
+        cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+        quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
+        cpi->UVquant_fast[Q][0] = (1 << 16) / quant_val;
+        invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 0,
+                     cpi->UVquant_shift[Q] + 0, quant_val);
+        cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
+        cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+        cpi->common.UVdequant[Q][0] = quant_val;
+        cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+        /* all the ac values = ; */
+        quant_val = vp8_ac_yquant(Q);
+        cpi->Y1quant_fast[Q][1] = (1 << 16) / quant_val;
+        invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 1,
+                     cpi->Y1quant_shift[Q] + 1, quant_val);
+        cpi->Y1zbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+        cpi->Y1round[Q][1] = (qrounding_factors[Q] * quant_val) >> 7;
+        cpi->common.Y1dequant[Q][1] = quant_val;
+        cpi->zrun_zbin_boost_y1[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+        quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
+        cpi->Y2quant_fast[Q][1] = (1 << 16) / quant_val;
+        invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 1,
+                     cpi->Y2quant_shift[Q] + 1, quant_val);
+        cpi->Y2zbin[Q][1] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+        cpi->Y2round[Q][1] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+        cpi->common.Y2dequant[Q][1] = quant_val;
+        cpi->zrun_zbin_boost_y2[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+        quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+        cpi->UVquant_fast[Q][1] = (1 << 16) / quant_val;
+        invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 1,
+                     cpi->UVquant_shift[Q] + 1, quant_val);
+        cpi->UVzbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+        cpi->UVround[Q][1] = (qrounding_factors[Q] * quant_val) >> 7;
+        cpi->common.UVdequant[Q][1] = quant_val;
+        cpi->zrun_zbin_boost_uv[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+        for (i = 2; i < 16; i++)
+        {
+            cpi->Y1quant_fast[Q][i] = cpi->Y1quant_fast[Q][1];
+            cpi->Y1quant[Q][i] = cpi->Y1quant[Q][1];
+            cpi->Y1quant_shift[Q][i] = cpi->Y1quant_shift[Q][1];
+            cpi->Y1zbin[Q][i] = cpi->Y1zbin[Q][1];
+            cpi->Y1round[Q][i] = cpi->Y1round[Q][1];
+            cpi->zrun_zbin_boost_y1[Q][i] = (cpi->common.Y1dequant[Q][1] *
+                                             zbin_boost[i]) >> 7;
+
+            cpi->Y2quant_fast[Q][i] = cpi->Y2quant_fast[Q][1];
+            cpi->Y2quant[Q][i] = cpi->Y2quant[Q][1];
+            cpi->Y2quant_shift[Q][i] = cpi->Y2quant_shift[Q][1];
+            cpi->Y2zbin[Q][i] = cpi->Y2zbin[Q][1];
+            cpi->Y2round[Q][i] = cpi->Y2round[Q][1];
+            cpi->zrun_zbin_boost_y2[Q][i] = (cpi->common.Y2dequant[Q][1] *
+                                             zbin_boost[i]) >> 7;
+
+            cpi->UVquant_fast[Q][i] = cpi->UVquant_fast[Q][1];
+            cpi->UVquant[Q][i] = cpi->UVquant[Q][1];
+            cpi->UVquant_shift[Q][i] = cpi->UVquant_shift[Q][1];
+            cpi->UVzbin[Q][i] = cpi->UVzbin[Q][1];
+            cpi->UVround[Q][i] = cpi->UVround[Q][1];
+            cpi->zrun_zbin_boost_uv[Q][i] = (cpi->common.UVdequant[Q][1] *
+                                             zbin_boost[i]) >> 7;
+        }
+    }
+}
+/* ZBIN_EXTRA_*: index-1 (AC) dequant value for QIndex times the summed zbin adjustments of x, >> 7. Expect cpi, x and QIndex in scope. */
+#define ZBIN_EXTRA_Y \
+    (( cpi->common.Y1dequant[QIndex][1] *  \
+    ( x->zbin_over_quant +  \
+      x->zbin_mode_boost +  \
+      x->act_zbin_adj ) ) >> 7)
+
+#define ZBIN_EXTRA_UV \
+    (( cpi->common.UVdequant[QIndex][1] *  \
+    ( x->zbin_over_quant +  \
+      x->zbin_mode_boost +  \
+      x->act_zbin_adj ) ) >> 7)
+/* The Y2 variant counts only half of zbin_over_quant. */
+#define ZBIN_EXTRA_Y2 \
+    (( cpi->common.Y2dequant[QIndex][1] *  \
+    ( (x->zbin_over_quant / 2) +  \
+       x->zbin_mode_boost +  \
+       x->act_zbin_adj ) ) >> 7)
+/* Select the Q index for macroblock x and point its 25 blocks at the matching per-Q quantizer/dequantizer tables of cpi. */
+void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
+{
+    int i;
+    int QIndex;
+    MACROBLOCKD *xd = &x->e_mbd;
+    int zbin_extra;
+
+    /* Select the baseline MB Q index. */
+    if (xd->segmentation_enabled)
+    {
+        /* Absolute value: the segment's feature data is used as the Q index as-is (no clamp here) */
+        if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+            QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][xd->mode_info_context->mbmi.segment_id];
+        /* Delta value: the segment's feature data is added to the frame's base_qindex */
+        else
+        {
+            QIndex = cpi->common.base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][xd->mode_info_context->mbmi.segment_id];
+            /* Clamp to valid range [0, MAXQ] */
+            QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
+        }
+    }
+    else
+        QIndex = cpi->common.base_qindex;
+
+    /* This initialization should be called at least once. Use ok_to_skip to
+     * decide if it is ok to skip.
+     * Before encoding a frame, this function is always called with ok_to_skip
+     * =0, which means no skipping of calculations. The "last" values are
+     * initialized at that time.
+     */
+    if (!ok_to_skip || QIndex != x->q_index)
+    {
+        /* Full setup: rebuild the MB dequant tables and re-point every block. */
+        xd->dequant_y1_dc[0] = 1; /* entry 0 of the y1_dc table is fixed at 1 */
+        xd->dequant_y1[0] = cpi->common.Y1dequant[QIndex][0];
+        xd->dequant_y2[0] = cpi->common.Y2dequant[QIndex][0];
+        xd->dequant_uv[0] = cpi->common.UVdequant[QIndex][0];
+
+        for (i = 1; i < 16; i++)
+        {
+            xd->dequant_y1_dc[i] = /* chained assignment: receives the same value as dequant_y1[i] */
+            xd->dequant_y1[i] = cpi->common.Y1dequant[QIndex][1];
+            xd->dequant_y2[i] = cpi->common.Y2dequant[QIndex][1];
+            xd->dequant_uv[i] = cpi->common.UVdequant[QIndex][1];
+        }
+#if 1
+        /*TODO:  Remove dequant from BLOCKD.  This is a temporary solution until
+         * the quantizer code uses a passed in pointer to the dequant constants.
+         * This will also require modifications to the x86 and neon assembly.
+         * */
+        for (i = 0; i < 16; i++)
+            x->e_mbd.block[i].dequant = xd->dequant_y1;
+        for (i = 16; i < 24; i++)
+            x->e_mbd.block[i].dequant = xd->dequant_uv;
+        x->e_mbd.block[24].dequant = xd->dequant_y2;
+#endif
+
+        /* Y: blocks 0..15 use the Y1 tables */
+        zbin_extra = ZBIN_EXTRA_Y;
+
+        for (i = 0; i < 16; i++)
+        {
+            x->block[i].quant = cpi->Y1quant[QIndex];
+            x->block[i].quant_fast = cpi->Y1quant_fast[QIndex];
+            x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
+            x->block[i].zbin = cpi->Y1zbin[QIndex];
+            x->block[i].round = cpi->Y1round[QIndex];
+            x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
+            x->block[i].zbin_extra = (short)zbin_extra;
+        }
+
+        /* UV: blocks 16..23 use the UV tables */
+        zbin_extra = ZBIN_EXTRA_UV;
+
+        for (i = 16; i < 24; i++)
+        {
+            x->block[i].quant = cpi->UVquant[QIndex];
+            x->block[i].quant_fast = cpi->UVquant_fast[QIndex];
+            x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
+            x->block[i].zbin = cpi->UVzbin[QIndex];
+            x->block[i].round = cpi->UVround[QIndex];
+            x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
+            x->block[i].zbin_extra = (short)zbin_extra;
+        }
+
+        /* Y2: block 24 uses the Y2 tables */
+        zbin_extra = ZBIN_EXTRA_Y2;
+
+        x->block[24].quant_fast = cpi->Y2quant_fast[QIndex];
+        x->block[24].quant = cpi->Y2quant[QIndex];
+        x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
+        x->block[24].zbin = cpi->Y2zbin[QIndex];
+        x->block[24].round = cpi->Y2round[QIndex];
+        x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
+        x->block[24].zbin_extra = (short)zbin_extra;
+
+        /* save this macroblock QIndex for vp8_update_zbin_extra() */
+        x->q_index = QIndex;
+
+        x->last_zbin_over_quant = x->zbin_over_quant;
+        x->last_zbin_mode_boost = x->zbin_mode_boost;
+        x->last_act_zbin_adj = x->act_zbin_adj;
+
+
+
+    }
+    else if(x->last_zbin_over_quant != x->zbin_over_quant
+            || x->last_zbin_mode_boost != x->zbin_mode_boost
+            || x->last_act_zbin_adj != x->act_zbin_adj)
+    {
+        /* Q index unchanged but a zbin input changed: only zbin_extra is refreshed. Y: */
+        zbin_extra = ZBIN_EXTRA_Y;
+
+        for (i = 0; i < 16; i++)
+            x->block[i].zbin_extra = (short)zbin_extra;
+
+        /* UV */
+        zbin_extra = ZBIN_EXTRA_UV;
+
+        for (i = 16; i < 24; i++)
+            x->block[i].zbin_extra = (short)zbin_extra;
+
+        /* Y2 */
+        zbin_extra = ZBIN_EXTRA_Y2;
+        x->block[24].zbin_extra = (short)zbin_extra;
+
+        x->last_zbin_over_quant = x->zbin_over_quant; /* remember the inputs just used */
+        x->last_zbin_mode_boost = x->zbin_mode_boost;
+        x->last_act_zbin_adj = x->act_zbin_adj;
+    }
+}
+/* Recompute zbin_extra for all 25 blocks of x using the Q index saved in x->q_index; the "last_*" values are not touched. */
+void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x)
+{
+    int i;
+    int QIndex = x->q_index; /* the ZBIN_EXTRA_* macros read this local by name */
+    int zbin_extra;
+
+    /* Y: blocks 0..15 */
+    zbin_extra = ZBIN_EXTRA_Y;
+
+    for (i = 0; i < 16; i++)
+        x->block[i].zbin_extra = (short)zbin_extra;
+
+    /* UV: blocks 16..23 */
+    zbin_extra = ZBIN_EXTRA_UV;
+
+    for (i = 16; i < 24; i++)
+        x->block[i].zbin_extra = (short)zbin_extra;
+
+    /* Y2: block 24 */
+    zbin_extra = ZBIN_EXTRA_Y2;
+    x->block[24].zbin_extra = (short)zbin_extra;
+}
+#undef ZBIN_EXTRA_Y
+#undef ZBIN_EXTRA_UV
+#undef ZBIN_EXTRA_Y2
+/* Frame-level setup: reset zbin_mode_boost and run the full (ok_to_skip = 0) MB quantizer init on cpi->mb. */
+void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
+{
+    /* Clear Zbin mode boost for default case */
+    cpi->mb.zbin_mode_boost = 0;
+
+    /* MB level quantizer setup; ok_to_skip = 0 forces the full initialization path */
+    vp8cx_mb_init_quantizer(cpi, &cpi->mb, 0);
+}
+
+/* Set the frame base Q index and the delta_q values; rebuild the quantizer tables when a compared delta changed. */
+void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
+{
+    VP8_COMMON *cm = &cpi->common;
+    MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+    int update = 0;
+    int new_delta_q;
+    int new_uv_delta_q;
+    cm->base_qindex = Q;
+
+    /* if any of the delta_q values are changing update flag has to be set */
+    /* here that is y2dc_delta_q and the uv delta (compared through uvdc_delta_q) */
+
+    cm->y1dc_delta_q = 0;
+    cm->y2ac_delta_q = 0;
+
+    if (Q < 4)
+    {
+        new_delta_q = 4-Q; /* so that Q + new_delta_q == 4 */
+    }
+    else
+        new_delta_q = 0;
+
+    update |= cm->y2dc_delta_q != new_delta_q;
+    cm->y2dc_delta_q = new_delta_q;
+
+    new_uv_delta_q = 0;
+    // For screen content, lower the q value for UV channel. For now, select
+    // conservative delta; same delta for dc and ac, and decrease it with lower
+    // Q, and set to 0 below some threshold. May want to condition this in
+    // future on the variance/energy in UV channel.
+    if (cpi->oxcf.screen_content_mode && Q > 40) {
+      new_uv_delta_q = -(int)(0.15 * Q);
+      // Check range: magnitude of delta is 4 bits.
+      if (new_uv_delta_q < -15) {
+        new_uv_delta_q = -15;
+      }
+    }
+    update |= cm->uvdc_delta_q != new_uv_delta_q; /* only the dc value is compared; dc and ac get the same value below */
+    cm->uvdc_delta_q = new_uv_delta_q;
+    cm->uvac_delta_q = new_uv_delta_q;
+
+    /* Set Segment specific quantizers */
+    mbd->segment_feature_data[MB_LVL_ALT_Q][0] = cpi->segment_feature_data[MB_LVL_ALT_Q][0];
+    mbd->segment_feature_data[MB_LVL_ALT_Q][1] = cpi->segment_feature_data[MB_LVL_ALT_Q][1];
+    mbd->segment_feature_data[MB_LVL_ALT_Q][2] = cpi->segment_feature_data[MB_LVL_ALT_Q][2];
+    mbd->segment_feature_data[MB_LVL_ALT_Q][3] = cpi->segment_feature_data[MB_LVL_ALT_Q][3];
+
+    /* quantizer has to be reinitialized for any delta_q changes */
+    if(update)
+        vp8cx_init_quantizer(cpi);
+
+}
diff --git a/libs/libvpx/vp8/encoder/x86/dct_mmx.asm b/libs/libvpx/vp8/encoder/x86/dct_mmx.asm
new file mode 100644
index 0000000000..6f188cb94a
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/x86/dct_mmx.asm
@@ -0,0 +1,241 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+; 4x4 forward DCT (MMX): reads four rows of 4 shorts spaced pitch bytes apart, writes 16 shorts to output.
+;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_mmx) PRIVATE
+sym(vp8_short_fdct4x4_mmx):
+    push        rbp
+    mov         rbp,        rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0)      ; input
+        mov         rdi,        arg(1)      ; output
+
+        movsxd      rax,        dword ptr arg(2) ;pitch
+
+        lea         rcx,        [rsi + rax*2]
+        ; read the input data
+        movq        mm0,        [rsi]
+        movq        mm1,        [rsi + rax]
+
+        movq        mm2,        [rcx]
+        movq        mm4,        [rcx + rax]
+
+        ; transpose for the first stage
+        movq        mm3,        mm0         ; 00 01 02 03
+        movq        mm5,        mm2         ; 20 21 22 23
+
+        punpcklwd   mm0,        mm1         ; 00 10 01 11
+        punpckhwd   mm3,        mm1         ; 02 12 03 13
+
+        punpcklwd   mm2,        mm4         ; 20 30 21 31
+        punpckhwd   mm5,        mm4         ; 22 32 23 33
+
+        movq        mm1,        mm0         ; 00 10 01 11
+        punpckldq   mm0,        mm2         ; 00 10 20 30
+
+        punpckhdq   mm1,        mm2         ; 01 11 21 31
+
+        movq        mm2,        mm3         ; 02 12 03 13
+        punpckldq   mm2,        mm5         ; 02 12 22 32
+
+        punpckhdq   mm3,        mm5         ; 03 13 23 33
+
+        ; mm0 0
+        ; mm1 1
+        ; mm2 2
+        ; mm3 3
+
+        ; first stage
+        movq        mm5,        mm0
+        movq        mm4,        mm1
+
+        paddw       mm0,        mm3         ; a1 = 0 + 3
+        paddw       mm1,        mm2         ; b1 = 1 + 2
+
+        psubw       mm4,        mm2         ; c1 = 1 - 2
+        psubw       mm5,        mm3         ; d1 = 0 - 3
+
+        psllw       mm5,        3
+        psllw       mm4,        3
+
+        psllw       mm0,        3
+        psllw       mm1,        3
+
+        ; output 0 and 2
+        movq        mm2,        mm0         ; a1
+
+        paddw       mm0,        mm1         ; op[0] = a1 + b1
+        psubw       mm2,        mm1         ; op[2] = a1 - b1
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movq        mm1,        mm5         ; d1
+        punpcklwd   mm1,        mm4         ; c1 d1
+        punpckhwd   mm5,        mm4         ; c1 d1
+
+        movq        mm3,        mm1
+        movq        mm4,        mm5
+
+        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
+        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
+        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
+        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]
+
+        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+
+        packssdw    mm1,        mm4         ; op[1]
+        packssdw    mm3,        mm5         ; op[3]
+
+        ; done with vertical
+        ; transpose for the second stage
+        movq        mm4,        mm0         ; 00 10 20 30
+        movq        mm5,        mm2         ; 02 12 22 32
+
+        punpcklwd   mm0,        mm1         ; 00 01 10 11
+        punpckhwd   mm4,        mm1         ; 20 21 30 31
+
+        punpcklwd   mm2,        mm3         ; 02 03 12 13
+        punpckhwd   mm5,        mm3         ; 22 23 32 33
+
+        movq        mm1,        mm0         ; 00 01 10 11
+        punpckldq   mm0,        mm2         ; 00 01 02 03
+
+        punpckhdq   mm1,        mm2         ; 10 11 12 13
+
+        movq        mm2,        mm4         ; 20 21 30 31
+        punpckldq   mm2,        mm5         ; 20 21 22 23
+
+        punpckhdq   mm4,        mm5         ; 30 31 32 33
+
+        ; mm0 0
+        ; mm1 1
+        ; mm2 2
+        ; mm4 3
+
+        movq        mm5,        mm0
+        movq        mm3,        mm1
+
+        paddw       mm0,        mm4         ; a1 = 0 + 3
+        paddw       mm1,        mm2         ; b1 = 1 + 2
+
+        psubw       mm3,        mm2         ; c1 = 1 - 2
+        psubw       mm5,        mm4         ; d1 = 0 - 3
+
+        pxor        mm6,        mm6         ; zero out for compare
+
+        pcmpeqw     mm6,        mm5         ; d1 != 0
+
+        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; clear upper,
+                                                                ; and keep bit 0 of lower
+
+        ; output 0 and 2
+        movq        mm2,        mm0         ; a1
+
+        paddw       mm0,        mm1         ; a1 + b1
+        psubw       mm2,        mm1         ; a1 - b1
+
+        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
+        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]
+
+        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
+        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4
+
+        movq        MMWORD PTR[rdi + 0 ],  mm0
+        movq        MMWORD PTR[rdi + 16],  mm2
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movq        mm1,        mm5         ; d1
+        punpcklwd   mm1,        mm3         ; c1 d1
+        punpckhwd   mm5,        mm3         ; c1 d1
+
+        movq        mm3,        mm1
+        movq        mm4,        mm5
+
+        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
+        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
+        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
+        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]
+
+        psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+        psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+
+        packssdw    mm1,        mm4         ; op[4]
+        packssdw    mm3,        mm5         ; op[12]
+
+        paddw       mm1,        mm6         ; op[4] += (d1!=0)
+
+        movq        MMWORD PTR[rdi + 8 ],  mm1
+        movq        MMWORD PTR[rdi + 24],  mm3
+
+     ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+; Constants for vp8_short_fdct4x4_mmx: pmaddwd multiplier pairs, compare mask and rounding terms.
+SECTION_RODATA
+align 8
+_5352_2217:
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+align 8
+_2217_neg5352:
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+align 8
+_cmp_mask:
+    times 4 dw 1
+align 8
+_7w:
+    times 4 dw 7
+align 8
+_14500:
+    times 2 dd 14500
+align 8
+_7500:
+    times 2 dd 7500
+align 8
+_12000:
+    times 2 dd 12000
+align 8
+_51000:
+    times 2 dd 51000
diff --git a/libs/libvpx/vp8/encoder/x86/dct_sse2.asm b/libs/libvpx/vp8/encoder/x86/dct_sse2.asm
new file mode 100644
index 0000000000..d06bca5927
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/x86/dct_sse2.asm
@@ -0,0 +1,432 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+; STACK_FRAME_CREATE: bind input/output/pitch to registers; the 32-bit ABI path also builds a frame and loads them from arg(0..2).
+%macro STACK_FRAME_CREATE 0
+%if ABI_IS_32BIT
+  %define       input       rsi
+  %define       output      rdi
+  %define       pitch       rax
+    push        rbp
+    mov         rbp, rsp
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rsi, arg(0)
+    mov         rdi, arg(1)
+
+    movsxd      rax, dword ptr arg(2)
+    lea         rcx, [rsi + rax*2]
+%else
+  %if LIBVPX_YASM_WIN64
+    %define     input       rcx
+    %define     output      rdx
+    %define     pitch       r8
+    SAVE_XMM 7, u
+  %else
+    %define     input       rdi
+    %define     output      rsi
+    %define     pitch       rdx
+  %endif
+%endif
+%endmacro
+; STACK_FRAME_DESTROY: clear the aliases, undo what STACK_FRAME_CREATE pushed/saved for the active ABI, then ret.
+%macro STACK_FRAME_DESTROY 0
+  %define     input
+  %define     output
+  %define     pitch
+
+%if ABI_IS_32BIT
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    pop         rbp
+%else
+  %if LIBVPX_YASM_WIN64
+    RESTORE_XMM
+  %endif
+%endif
+    ret
+%endmacro
+; 4x4 forward DCT (SSE2): rows are read with 8-byte movq loads; output is written with two movdqa stores (16-byte aligned).
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2) PRIVATE
+sym(vp8_short_fdct4x4_sse2):
+
+    STACK_FRAME_CREATE
+
+    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
+    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
+    lea         input,          [input+2*pitch]
+    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
+    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
+
+    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
+    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
+
+    movdqa      xmm2, xmm0
+    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
+    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
+    movdqa      xmm1, xmm0
+    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
+    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
+    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx
+
+    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
+    movdqa      xmm3, xmm0
+    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
+    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
+    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
+    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
+
+    movdqa      xmm1, xmm0
+    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
+    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
+    movdqa      xmm4, xmm3
+    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
+    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
+
+    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
+    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
+    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
+    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12
+
+    packssdw    xmm0, xmm1                      ;op[2] op[0]
+    packssdw    xmm3, xmm4                      ;op[3] op[1]
+    ; 23 22 21 20 03 02 01 00
+    ;
+    ; 33 32 31 30 13 12 11 10
+    ;
+    movdqa      xmm2, xmm0
+    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
+    punpckhqdq  xmm2, xmm3                      ;33 32 31 30 23 22 21 20
+
+    movdqa      xmm3, xmm0
+    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
+    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
+    movdqa      xmm2, xmm0
+    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
+    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20
+
+    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
+    pshufd      xmm2, xmm2, 04eh
+    movdqa      xmm3, xmm0
+    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
+    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1
+
+    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
+    movdqa      xmm2, xmm3                      ;save d1 for compare
+    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
+    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
+    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
+    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
+    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
+    movdqa      xmm1, xmm0
+    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
+
+    pxor        xmm4, xmm4                      ;zero out for compare
+    paddd       xmm0, xmm5
+    paddd       xmm1, xmm5
+    pcmpeqw     xmm2, xmm4
+    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
+    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
+    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
+                                                     ;and keep bit 0 of lower
+
+    movdqa      xmm4, xmm3
+    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
+    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
+    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
+    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
+    packssdw    xmm0, xmm1                      ;op[8] op[0]
+    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
+    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
+
+    packssdw    xmm3, xmm4                      ;op[12] op[4]
+    movdqa      xmm1, xmm0
+    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
+    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
+    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
+
+    movdqa      XMMWORD PTR[output +  0], xmm0
+    movdqa      XMMWORD PTR[output + 16], xmm1
+
+    STACK_FRAME_DESTROY
+; 8x4 forward DCT (SSE2): columns 0-3 and 4-7 are transformed as two 4x4 blocks; input rows and output use movdqa (16-byte aligned).
+;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct8x4_sse2) PRIVATE
+sym(vp8_short_fdct8x4_sse2):
+
+    STACK_FRAME_CREATE
+
+        ; read the input data
+        movdqa      xmm0,       [input        ]
+        movdqa      xmm2,       [input+  pitch]
+        lea         input,      [input+2*pitch]
+        movdqa      xmm4,       [input        ]
+        movdqa      xmm3,       [input+  pitch]
+
+        ; transpose for the first stage
+        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
+        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
+
+        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
+        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
+
+        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
+        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
+
+        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
+        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
+
+        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
+
+        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
+        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
+
+        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
+        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
+
+        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
+        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
+
+        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
+        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35
+
+        ; xmm0 0
+        ; xmm1 1
+        ; xmm2 2
+        ; xmm3 3
+
+        ; first stage
+        movdqa      xmm5,       xmm0
+        movdqa      xmm4,       xmm1
+
+        paddw       xmm0,       xmm3        ; a1 = 0 + 3
+        paddw       xmm1,       xmm2        ; b1 = 1 + 2
+
+        psubw       xmm4,       xmm2        ; c1 = 1 - 2
+        psubw       xmm5,       xmm3        ; d1 = 0 - 3
+
+        psllw       xmm5,        3
+        psllw       xmm4,        3
+
+        psllw       xmm0,        3
+        psllw       xmm1,        3
+
+        ; output 0 and 2
+        movdqa      xmm2,       xmm0        ; a1
+
+        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
+        psubw       xmm2,       xmm1        ; op[2] = a1 - b1
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movdqa      xmm1,       xmm5        ; d1
+        punpcklwd   xmm1,       xmm4        ; c1 d1
+        punpckhwd   xmm5,       xmm4        ; c1 d1
+
+        movdqa      xmm3,       xmm1
+        movdqa      xmm4,       xmm5
+
+        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
+        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
+        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
+        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
+
+        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+
+        packssdw    xmm1,       xmm4        ; op[1]
+        packssdw    xmm3,       xmm5        ; op[3]
+
+        ; done with vertical
+        ; transpose for the second stage
+        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
+        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
+
+        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
+        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
+
+        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
+        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
+
+        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
+        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
+
+        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
+
+        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
+        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
+
+        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
+        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
+
+        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
+        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
+
+        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
+        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
+
+        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
+
+        ; xmm0 0
+        ; xmm4 1
+        ; xmm1 2
+        ; xmm3 3
+
+        movdqa      xmm5,       xmm0
+        movdqa      xmm2,       xmm1
+
+        paddw       xmm0,       xmm3        ; a1 = 0 + 3
+        paddw       xmm1,       xmm4        ; b1 = 1 + 2
+
+        psubw       xmm4,       xmm2        ; c1 = 1 - 2
+        psubw       xmm5,       xmm3        ; d1 = 0 - 3
+
+        pxor        xmm6,       xmm6        ; zero out for compare
+
+        pcmpeqw     xmm6,       xmm5        ; d1 != 0
+
+        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; clear upper,
+                                                                    ; and keep bit 0 of lower
+
+        ; output 0 and 2
+        movdqa      xmm2,       xmm0        ; a1
+
+        paddw       xmm0,       xmm1        ; a1 + b1
+        psubw       xmm2,       xmm1        ; a1 - b1
+
+        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
+        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
+
+        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
+        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movdqa      xmm1,       xmm5        ; d1
+        punpcklwd   xmm1,       xmm4        ; c1 d1
+        punpckhwd   xmm5,       xmm4        ; c1 d1
+
+        movdqa      xmm3,       xmm1
+        movdqa      xmm4,       xmm5
+
+        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
+        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
+        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
+        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
+
+        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+
+        packssdw    xmm1,       xmm4        ; op[4]
+        packssdw    xmm3,       xmm5        ; op[12]
+
+        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
+
+        movdqa      xmm4,       xmm0
+        movdqa      xmm5,       xmm2
+
+        punpcklqdq  xmm0,       xmm1
+        punpckhqdq  xmm4,       xmm1
+
+        punpcklqdq  xmm2,       xmm3
+        punpckhqdq  xmm5,       xmm3
+
+        movdqa      XMMWORD PTR[output + 0 ],  xmm0
+        movdqa      XMMWORD PTR[output + 16],  xmm2
+        movdqa      XMMWORD PTR[output + 32],  xmm4
+        movdqa      XMMWORD PTR[output + 48],  xmm5
+
+    STACK_FRAME_DESTROY
+; Constants shared by the two SSE2 fdct routines: pmaddwd multipliers, compare masks and rounding terms.
+SECTION_RODATA
+align 16
+_5352_2217:
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+align 16
+_2217_neg5352:
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+align 16
+_mult_add:
+    times 8 dw 1
+align 16
+_cmp_mask:
+    times 4 dw 1
+    times 4 dw 0
+align 16
+_cmp_mask8x4:
+    times 8 dw 1
+align 16
+_mult_sub:
+    dw 1
+    dw -1
+    dw 1
+    dw -1
+    dw 1
+    dw -1
+    dw 1
+    dw -1
+align 16
+_7:
+    times 4 dd 7
+align 16
+_7w:
+    times 8 dw 7
+align 16
+_14500:
+    times 4 dd 14500
+align 16
+_7500:
+    times 4 dd 7500
+align 16
+_12000:
+    times 4 dd 12000
+align 16
+_51000:
+    times 4 dd 51000
diff --git a/libs/libvpx/vp8/encoder/x86/denoising_sse2.c b/libs/libvpx/vp8/encoder/x86/denoising_sse2.c
new file mode 100644
index 0000000000..101d646ef4
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/x86/denoising_sse2.c
@@ -0,0 +1,379 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/encoder/denoising.h"
+#include "vp8/common/reconinter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8_rtcd.h"
+
+#include <emmintrin.h>
+#include "vpx_ports/emmintrin_compat.h"
+
+/*
+ * Horizontally add the sixteen signed 8-bit lanes of acc_diff and return the
+ * absolute value of that total.
+ */
+static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
+  /* Interleaving a register with itself puts each byte in both halves of a
+   * 16-bit lane; the arithmetic shift by 8 then leaves the byte sign-extended
+   * to 16 bits. */
+  const __m128i lo_words =
+      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+  const __m128i hi_words =
+      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+  /* Multiply-add against all-ones sums adjacent 16-bit lanes into four
+   * 32-bit partial sums. */
+  const __m128i ones = _mm_set1_epi16(1);
+  const __m128i sums_x4 =
+      _mm_madd_epi16(_mm_add_epi16(lo_words, hi_words), ones);
+  /* Fold four partial sums down to two, then to one in the low dword. */
+  const __m128i sums_x2 = _mm_add_epi32(sums_x4, _mm_srli_si128(sums_x4, 8));
+  const __m128i total = _mm_add_epi32(sums_x2, _mm_srli_si128(sums_x2, 4));
+
+  return abs(_mm_cvtsi128_si32(total));
+}
+
+/*
+ * SSE2 temporal denoising filter for one 16x16 block.
+ *
+ * For each of the 16 rows, the 16 bytes of sig are nudged toward the
+ * corresponding bytes of mc_running_avg_y by a per-pixel adjustment that
+ * depends on the absolute difference, and the result is stored to
+ * running_avg_y. The signed adjustments are accumulated; if the absolute
+ * value of their total exceeds a threshold, a second pass pulls
+ * running_avg_y back toward sig by at most `delta` per pixel and the total
+ * is re-checked.
+ *
+ * mc_running_avg_y / mc_avg_y_stride : block that sig is compared against
+ *                                      (read only) and its row stride.
+ * running_avg_y / avg_y_stride       : block that receives the filtered
+ *                                      rows, and its row stride.
+ * sig / sig_stride                   : input block and its row stride.
+ * motion_magnitude                   : compared against
+ *                                      MOTION_MAGNITUDE_THRESHOLD to select
+ *                                      the largest adjustment level.
+ * increase_denoising                 : non-zero selects
+ *                                      SUM_DIFF_THRESHOLD_HIGH and, for low
+ *                                      motion, raises the small-difference
+ *                                      cutoff and top adjustment by one.
+ *
+ * Returns COPY_BLOCK when the accumulated adjustment is too large to accept,
+ * otherwise FILTER_BLOCK after calling vp8_copy_mem16x16() on the filtered
+ * block and sig.
+ */
+int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
+                             int mc_avg_y_stride,
+                             unsigned char *running_avg_y, int avg_y_stride,
+                             unsigned char *sig, int sig_stride,
+                             unsigned int motion_magnitude,
+                             int increase_denoising)
+{
+    /* Remember the block origins: the loops below advance the pointers. */
+    unsigned char *running_avg_y_start = running_avg_y;
+    unsigned char *sig_start = sig;
+    unsigned int sum_diff_thresh;
+    int r;
+    int shift_inc  = (increase_denoising &&
+        motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
+    /* Per-lane signed running total of the adjustments applied so far. */
+    __m128i acc_diff = _mm_setzero_si128();
+    const __m128i k_0 = _mm_setzero_si128();
+    /* Note: k_4 holds 4 or 5 depending on shift_inc. */
+    const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+    const __m128i k_8 = _mm_set1_epi8(8);
+    const __m128i k_16 = _mm_set1_epi8(16);
+    /* Modify each level's adjustment according to motion_magnitude. */
+    const __m128i l3 = _mm_set1_epi8(
+                       (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+                        7 + shift_inc : 6);
+    /* Difference between level 3 and level 2 is 2. */
+    const __m128i l32 = _mm_set1_epi8(2);
+    /* Difference between level 2 and level 1 is 1. */
+    const __m128i l21 = _mm_set1_epi8(1);
+
+    for (r = 0; r < 16; ++r)
+    {
+        /* Calculate differences */
+        const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
+        const __m128i v_mc_running_avg_y = _mm_loadu_si128(
+                                           (__m128i *)(&mc_running_avg_y[0]));
+        __m128i v_running_avg_y;
+        /* Unsigned saturating subtracts: for each lane at most one of
+         * pdiff/ndiff is non-zero, so (pdiff | ndiff) is |mc - sig|. */
+        const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+        const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+        /* Obtain the sign. FF if diff is negative (or zero, in which case
+         * the adjustment is zero anyway). */
+        const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+        /* Clamp absolute difference to 16 to be used to get mask. Doing this
+         * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
+        const __m128i clamped_absdiff = _mm_min_epu8(
+                                        _mm_or_si128(pdiff, ndiff), k_16);
+        /* Get masks for l2 l1 and l0 adjustments */
+        const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
+        const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
+        const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
+        /* Get adjustments for l2, l1, and l0 */
+        __m128i adj2 = _mm_and_si128(mask2, l32);
+        const __m128i adj1 = _mm_and_si128(mask1, l21);
+        const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+        __m128i adj,  padj, nadj;
+
+        /* Combine the adjustments and get absolute adjustments.
+         * Resulting adj per lane:
+         *   absdiff >= 16        -> l3
+         *   8 <= absdiff < 16    -> l3 - 2
+         *   k_4 <= absdiff < 8   -> l3 - 3
+         *   absdiff < k_4        -> absdiff itself
+         */
+        adj2 = _mm_add_epi8(adj2, adj1);
+        adj = _mm_sub_epi8(l3, adj2);
+        adj = _mm_andnot_si128(mask0, adj);
+        adj = _mm_or_si128(adj, adj0);
+
+        /* Restore the sign and get positive and negative adjustments. */
+        padj = _mm_andnot_si128(diff_sign, adj);
+        nadj = _mm_and_si128(diff_sign, adj);
+
+        /* Calculate filtered value. */
+        v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+        v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+        _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+        /* Adjustments <=7, and each element in acc_diff can fit in signed
+         * char.
+         * NOTE(review): when shift_inc is 1, l3 is 8, so 16 rows can reach
+         * 128 in one lane; the saturating add below would then clamp at 127.
+         */
+        acc_diff = _mm_adds_epi8(acc_diff, padj);
+        acc_diff = _mm_subs_epi8(acc_diff, nadj);
+
+        /* Update pointers for next iteration. */
+        sig += sig_stride;
+        mc_running_avg_y += mc_avg_y_stride;
+        running_avg_y += avg_y_stride;
+    }
+
+    {
+        /* Compute the sum of all pixel differences of this MB. */
+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+        sum_diff_thresh = SUM_DIFF_THRESHOLD;
+        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
+        if (abs_sum_diff > sum_diff_thresh) {
+          // Before returning to copy the block (i.e., apply no denoising),
+          // check if we can still apply some (weaker) temporal filtering to
+          // this block, that would otherwise not be denoised at all. Simplest
+          // is to apply an additional adjustment to running_avg_y to bring it
+          // closer to sig. The adjustment is capped by a maximum delta, and
+          // chosen such that in most cases the resulting sum_diff will be
+          // within the acceptable range given by sum_diff_thresh.
+
+          // The delta is set by the excess of absolute pixel diff over the
+          // threshold.
+          int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
+          // Only apply the adjustment for max delta up to 3.
+          if (delta < 4) {
+            const __m128i k_delta = _mm_set1_epi8(delta);
+            // Rewind the three pointers to the first row of the block.
+            sig -= sig_stride * 16;
+            mc_running_avg_y -= mc_avg_y_stride * 16;
+            running_avg_y -= avg_y_stride * 16;
+            for (r = 0; r < 16; ++r) {
+              __m128i v_running_avg_y =
+                  _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
+              // Calculate differences.
+              const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
+              const __m128i v_mc_running_avg_y =
+                  _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
+              const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+              const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+              // Obtain the sign. FF if diff is negative.
+              const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+              // Clamp absolute difference to delta to get the adjustment.
+              const __m128i adj =
+                  _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+              // Restore the sign and get positive and negative adjustments.
+              __m128i padj, nadj;
+              padj = _mm_andnot_si128(diff_sign, adj);
+              nadj = _mm_and_si128(diff_sign, adj);
+              // Calculate filtered value. Signs are the reverse of the first
+              // pass: this moves running_avg_y back toward sig.
+              v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
+              v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
+             _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+             // Accumulate the adjustments.
+             acc_diff = _mm_subs_epi8(acc_diff, padj);
+             acc_diff = _mm_adds_epi8(acc_diff, nadj);
+
+             // Update pointers for next iteration.
+             sig += sig_stride;
+             mc_running_avg_y += mc_avg_y_stride;
+             running_avg_y += avg_y_stride;
+            }
+            // Re-check the total after the corrective pass.
+            abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+            if (abs_sum_diff > sum_diff_thresh) {
+              return COPY_BLOCK;
+            }
+          } else {
+            return COPY_BLOCK;
+          }
+        }
+    }
+
+    /* NOTE(review): presumably copies the filtered block over sig; confirm
+     * the (src, src_stride, dst, dst_stride) order of vp8_copy_mem16x16. */
+    vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
+    return FILTER_BLOCK;
+}
+
+/*
+ * SSE2 temporal denoising filter for one 8x8 block (the UV thresholds are
+ * used throughout).
+ *
+ * First sums the 64 bytes of sig; if that sum is within
+ * SUM_DIFF_FROM_AVG_THRESH_UV of 128 * 8 * 8 the function returns COPY_BLOCK
+ * without filtering. Otherwise it applies the same level-based adjustment as
+ * the 16x16 filter, handling two 8-byte rows per iteration (row r in the low
+ * half of the register, row r + 1 in the high half), followed by the same
+ * threshold check and optional corrective second pass.
+ *
+ * mc_running_avg / mc_avg_stride : block that sig is compared against (read
+ *                                  only) and its row stride.
+ * running_avg / avg_stride       : block that receives the filtered rows,
+ *                                  and its row stride.
+ * sig / sig_stride               : input block and its row stride.
+ * motion_magnitude               : compared against
+ *                                  MOTION_MAGNITUDE_THRESHOLD_UV.
+ * increase_denoising             : non-zero selects
+ *                                  SUM_DIFF_THRESHOLD_HIGH_UV and, for low
+ *                                  motion, raises the small-difference cutoff
+ *                                  and top adjustment by one.
+ *
+ * Returns COPY_BLOCK or FILTER_BLOCK.
+ */
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,
+                             int mc_avg_stride,
+                             unsigned char *running_avg, int avg_stride,
+                             unsigned char *sig, int sig_stride,
+                             unsigned int motion_magnitude,
+                             int increase_denoising) {
+    /* Remember the block origins: the loops below advance the pointers. */
+    unsigned char *running_avg_start = running_avg;
+    unsigned char *sig_start = sig;
+    unsigned int sum_diff_thresh;
+    int r;
+    int shift_inc  = (increase_denoising &&
+        motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0;
+    /* Per-lane signed running total of the adjustments applied so far. */
+    __m128i acc_diff = _mm_setzero_si128();
+    const __m128i k_0 = _mm_setzero_si128();
+    /* Note: k_4 holds 4 or 5 depending on shift_inc. */
+    const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+    const __m128i k_8 = _mm_set1_epi8(8);
+    const __m128i k_16 = _mm_set1_epi8(16);
+    /* Modify each level's adjustment according to motion_magnitude. */
+    const __m128i l3 = _mm_set1_epi8(
+                       (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ?
+                        7 + shift_inc : 6);
+    /* Difference between level 3 and level 2 is 2. */
+    const __m128i l32 = _mm_set1_epi8(2);
+    /* Difference between level 2 and level 1 is 1. */
+    const __m128i l21 = _mm_set1_epi8(1);
+
+    {
+      const __m128i k_1 = _mm_set1_epi16(1);
+      __m128i vec_sum_block = _mm_setzero_si128();
+
+      // Avoid denoising color signal if its close to average level.
+      for (r = 0; r < 8; ++r) {
+        // Load 8 bytes of the row and zero-extend them to 16-bit lanes.
+        const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0]));
+        const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0);
+        vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack);
+        sig += sig_stride;
+      }
+      // Rewind sig to the first row for the filtering loop below.
+      sig -= sig_stride * 8;
+      {
+        // Horizontal add of the eight 16-bit column sums into one int.
+        const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1);
+        const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
+                                                _mm_srli_si128(hg_fe_dc_ba, 8));
+        const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
+                                               _mm_srli_si128(hgfe_dcba, 4));
+        const int sum_block = _mm_cvtsi128_si32(hgfedcba);
+        if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+          return COPY_BLOCK;
+        }
+      }
+    }
+
+    for (r = 0; r < 4; ++r) {
+        /* Calculate differences */
+        /* The double-precision load/store intrinsics are used only to move
+         * 8 bytes into/out of each half of the register; no floating-point
+         * arithmetic is performed. */
+        const __m128i v_sig_low = _mm_castpd_si128(
+            _mm_load_sd((double *)(&sig[0])));
+        const __m128i v_sig = _mm_castpd_si128(
+            _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
+                         (double *)(&sig[sig_stride])));
+        const __m128i v_mc_running_avg_low = _mm_castpd_si128(
+            _mm_load_sd((double *)(&mc_running_avg[0])));
+        const __m128i v_mc_running_avg = _mm_castpd_si128(
+            _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
+                         (double *)(&mc_running_avg[mc_avg_stride])));
+        /* Unsigned saturating subtracts: for each lane at most one of
+         * pdiff/ndiff is non-zero, so (pdiff | ndiff) is |mc - sig|. */
+        const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
+        const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
+        /* Obtain the sign. FF if diff is negative (or zero, in which case
+         * the adjustment is zero anyway). */
+        const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+        /* Clamp absolute difference to 16 to be used to get mask. Doing this
+         * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
+        const __m128i clamped_absdiff = _mm_min_epu8(
+                                        _mm_or_si128(pdiff, ndiff), k_16);
+        /* Get masks for l2 l1 and l0 adjustments */
+        const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
+        const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
+        const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
+        /* Get adjustments for l2, l1, and l0 */
+        __m128i adj2 = _mm_and_si128(mask2, l32);
+        const __m128i adj1 = _mm_and_si128(mask1, l21);
+        const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+        __m128i adj,  padj, nadj;
+        __m128i v_running_avg;
+
+        /* Combine the adjustments and get absolute adjustments.
+         * Resulting adj per lane:
+         *   absdiff >= 16        -> l3
+         *   8 <= absdiff < 16    -> l3 - 2
+         *   k_4 <= absdiff < 8   -> l3 - 3
+         *   absdiff < k_4        -> absdiff itself
+         */
+        adj2 = _mm_add_epi8(adj2, adj1);
+        adj = _mm_sub_epi8(l3, adj2);
+        adj = _mm_andnot_si128(mask0, adj);
+        adj = _mm_or_si128(adj, adj0);
+
+        /* Restore the sign and get positive and negative adjustments. */
+        padj = _mm_andnot_si128(diff_sign, adj);
+        nadj = _mm_and_si128(diff_sign, adj);
+
+        /* Calculate filtered value. */
+        v_running_avg = _mm_adds_epu8(v_sig, padj);
+        v_running_avg = _mm_subs_epu8(v_running_avg, nadj);
+
+        /* Low 8 bytes go to the current row, high 8 bytes to the next. */
+        _mm_storel_pd((double *)&running_avg[0],
+                      _mm_castsi128_pd(v_running_avg));
+        _mm_storeh_pd((double *)&running_avg[avg_stride],
+                      _mm_castsi128_pd(v_running_avg));
+
+        /* Adjustments <=7, and each element in acc_diff can fit in signed
+         * char.
+         */
+        acc_diff = _mm_adds_epi8(acc_diff, padj);
+        acc_diff = _mm_subs_epi8(acc_diff, nadj);
+
+        /* Update pointers for next iteration. */
+        sig += sig_stride * 2;
+        mc_running_avg += mc_avg_stride * 2;
+        running_avg += avg_stride * 2;
+    }
+
+    {
+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+        sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+        if (abs_sum_diff > sum_diff_thresh) {
+          // Before returning to copy the block (i.e., apply no denoising),
+          // check if we can still apply some (weaker) temporal filtering to
+          // this block, that would otherwise not be denoised at all. Simplest
+          // is to apply an additional adjustment to running_avg to bring it
+          // closer to sig. The adjustment is capped by a maximum delta, and
+          // chosen such that in most cases the resulting sum_diff will be
+          // within the acceptable range given by sum_diff_thresh.
+
+          // The delta is set by the excess of absolute pixel diff over the
+          // threshold.
+          int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
+          // Only apply the adjustment for max delta up to 3.
+          if (delta < 4) {
+            const __m128i k_delta = _mm_set1_epi8(delta);
+            // Rewind the three pointers to the first row of the block
+            // (4 iterations x 2 rows were consumed above).
+            sig -= sig_stride * 8;
+            mc_running_avg -= mc_avg_stride * 8;
+            running_avg -= avg_stride * 8;
+            for (r = 0; r < 4; ++r) {
+              // Calculate differences.
+              const __m128i v_sig_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&sig[0])));
+              const __m128i v_sig = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
+                               (double *)(&sig[sig_stride])));
+              const __m128i v_mc_running_avg_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&mc_running_avg[0])));
+              const __m128i v_mc_running_avg = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
+                               (double *)(&mc_running_avg[mc_avg_stride])));
+              const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
+              const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
+              // Obtain the sign. FF if diff is negative.
+              const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+              // Clamp absolute difference to delta to get the adjustment.
+              const __m128i adj =
+                  _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+              // Restore the sign and get positive and negative adjustments.
+              __m128i padj, nadj;
+              // Reload the two rows written by the first pass.
+              const __m128i v_running_avg_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&running_avg[0])));
+              __m128i v_running_avg = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low),
+                               (double *)(&running_avg[avg_stride])));
+              padj = _mm_andnot_si128(diff_sign, adj);
+              nadj = _mm_and_si128(diff_sign, adj);
+              // Calculate filtered value. Signs are the reverse of the first
+              // pass: this moves running_avg back toward sig.
+              v_running_avg = _mm_subs_epu8(v_running_avg, padj);
+              v_running_avg = _mm_adds_epu8(v_running_avg, nadj);
+
+              _mm_storel_pd((double *)&running_avg[0],
+                            _mm_castsi128_pd(v_running_avg));
+              _mm_storeh_pd((double *)&running_avg[avg_stride],
+                            _mm_castsi128_pd(v_running_avg));
+
+             // Accumulate the adjustments.
+             acc_diff = _mm_subs_epi8(acc_diff, padj);
+             acc_diff = _mm_adds_epi8(acc_diff, nadj);
+
+             // Update pointers for next iteration.
+             sig += sig_stride * 2;
+             mc_running_avg += mc_avg_stride * 2;
+             running_avg += avg_stride * 2;
+            }
+            // Re-check the total after the corrective pass.
+            abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+            if (abs_sum_diff > sum_diff_thresh) {
+              return COPY_BLOCK;
+            }
+          } else {
+            return COPY_BLOCK;
+          }
+        }
+    }
+
+    /* NOTE(review): presumably copies the filtered block over sig; confirm
+     * the (src, src_stride, dst, dst_stride) order of vp8_copy_mem8x8. */
+    vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride);
+    return FILTER_BLOCK;
+}
diff --git a/libs/libvpx/vp8/encoder/x86/encodeopt.asm b/libs/libvpx/vp8/encoder/x86/encodeopt.asm
new file mode 100644
index 0000000000..fe26b18e56
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/x86/encodeopt.asm
@@ -0,0 +1,386 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp8_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
+;
+; Returns the sum of squared differences between the 16 words at coeff_ptr
+; and the 16 words at dcoef_ptr. Both buffers are read with movdqa, so they
+; must be 16-byte aligned.
+global sym(vp8_block_error_xmm) PRIVATE
+sym(vp8_block_error_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prologue
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        mov         rdi,        arg(1) ;dcoef_ptr
+
+        ; words 0-7 of each input
+        movdqa      xmm0,       [rsi]
+        movdqa      xmm1,       [rdi]
+
+        ; words 8-15 of each input
+        movdqa      xmm2,       [rsi+16]
+        movdqa      xmm3,       [rdi+16]
+
+        ; per-word difference coeff - dcoef
+        psubw       xmm0,       xmm1
+        psubw       xmm2,       xmm3
+
+        ; square every difference and add adjacent pairs -> 4 dwords each
+        pmaddwd     xmm0,       xmm0
+        pmaddwd     xmm2,       xmm2
+
+        paddd       xmm0,       xmm2
+
+        ; horizontal add of the four dword partial sums: interleave with
+        ; zero so each dword sits in the low half of a qword, add the two
+        ; halves, then fold the high qword onto the low one
+        pxor        xmm5,       xmm5
+        movdqa      xmm1,       xmm0
+
+        punpckldq   xmm0,       xmm5
+        punpckhdq   xmm1,       xmm5
+
+        paddd       xmm0,       xmm1
+        movdqa      xmm1,       xmm0
+
+        psrldq      xmm0,       8
+        paddd       xmm0,       xmm1
+
+        ; return value
+        movq        rax,        xmm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;int vp8_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
+;
+; MMX version: returns the sum of squared differences between the 16 words
+; at coeff_ptr and the 16 words at dcoef_ptr, processed as four 8-byte
+; groups. The instruction pairs are interleaved; the grouping by blank lines
+; does not follow the data flow.
+global sym(vp8_block_error_mmx) PRIVATE
+sym(vp8_block_error_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        pxor        mm7,        mm7
+
+        mov         rdi,        arg(1) ;dcoef_ptr
+        movq        mm3,        [rsi]
+
+        movq        mm4,        [rdi]
+        movq        mm5,        [rsi+8]
+
+        movq        mm6,        [rdi+8]
+        pxor        mm1,        mm1 ; from movd mm1, dc ; dc =0
+
+        movq        mm2,        mm7
+        psubw       mm5,        mm6
+
+        por         mm1,        mm2
+        pmaddwd     mm5,        mm5
+
+        ; mm1 and mm7 are both zero here, so the compare yields all ones
+        ; and the pand below keeps every lane of mm3 (words 0-3 difference)
+        pcmpeqw     mm1,        mm7
+        psubw       mm3,        mm4
+
+        pand        mm1,        mm3
+        pmaddwd     mm1,        mm1
+
+        ; mm1 = squared-difference partial sums for words 0-7
+        paddd       mm1,        mm5
+        movq        mm3,        [rsi+16]
+
+        movq        mm4,        [rdi+16]
+        movq        mm5,        [rsi+24]
+
+        movq        mm6,        [rdi+24]
+        psubw       mm5,        mm6
+
+        pmaddwd     mm5,        mm5
+        psubw       mm3,        mm4
+
+        pmaddwd     mm3,        mm3
+        paddd       mm3,        mm5
+
+        ; add in the partial sums for words 8-15
+        paddd       mm1,        mm3
+        movq        mm0,        mm1
+
+        ; fold the high dword onto the low dword; the total is in the low
+        ; 32 bits of mm0
+        psrlq       mm1,        32
+        paddd       mm0,        mm1
+
+        movq        rax,        mm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+;
+; Accumulates the sum of squared differences over 16 consecutive groups of
+; 16 words (32 bytes per group) from coeff_ptr and dcoef_ptr. A mask derived
+; from dc is applied to the first four differences of every group: each
+; 16-bit lane of dc that is non-zero zeroes the matching lane, so a non-zero
+; low word of dc removes word 0 of each group from the sum.
+global sym(vp8_mbblock_error_mmx_impl) PRIVATE
+sym(vp8_mbblock_error_mmx_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        pxor        mm7,        mm7
+
+        mov         rdi,        arg(1) ;dcoef_ptr
+        pxor        mm2,        mm2         ; mm2 = running sum (two dwords)
+
+        movd        mm1,        dword ptr arg(2) ;dc
+        por         mm1,        mm2
+
+        ; mm1 = per-word mask: 0xffff where the dc lane is zero, else 0
+        pcmpeqw     mm1,        mm7
+        mov         rcx,        16          ; number of 16-word groups
+
+.mberror_loop_mmx:
+        movq        mm3,       [rsi]
+        movq        mm4,       [rdi]
+
+        movq        mm5,       [rsi+8]
+        movq        mm6,       [rdi+8]
+
+
+        psubw       mm5,        mm6
+        pmaddwd     mm5,        mm5
+
+        ; words 0-3 of the group: difference, then apply the dc mask
+        psubw       mm3,        mm4
+        pand        mm3,        mm1
+
+        pmaddwd     mm3,        mm3
+        paddd       mm2,        mm5
+
+        paddd       mm2,        mm3
+        movq        mm3,       [rsi+16]
+
+        movq        mm4,       [rdi+16]
+        movq        mm5,       [rsi+24]
+
+        movq        mm6,       [rdi+24]
+        psubw       mm5,        mm6
+
+        pmaddwd     mm5,        mm5
+        psubw       mm3,        mm4
+
+        pmaddwd     mm3,        mm3
+        paddd       mm2,        mm5
+
+        paddd       mm2,        mm3
+        add         rsi,        32
+
+        add         rdi,        32
+        sub         rcx,        1
+
+        jnz         .mberror_loop_mmx
+
+        ; fold the high dword of the accumulator onto the low dword
+        movq        mm0,        mm2
+        psrlq       mm2,        32
+
+        paddd       mm0,        mm2
+        movq        rax,        mm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+;
+; SSE2 version: accumulates the sum of squared differences over 16
+; consecutive groups of 16 words (32 bytes per group). A mask derived from dc
+; is applied to the first eight differences of every group: each 16-bit lane
+; of the zero-extended dc that is non-zero zeroes the matching lane, so a
+; non-zero low word of dc removes word 0 of each group from the sum.
+; Both buffers are read with movdqa and must be 16-byte aligned.
+global sym(vp8_mbblock_error_xmm_impl) PRIVATE
+sym(vp8_mbblock_error_xmm_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 6
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        pxor        xmm6,       xmm6
+
+        mov         rdi,        arg(1) ;dcoef_ptr
+        pxor        xmm4,       xmm4        ; xmm4 = running sum (four dwords)
+
+        movd        xmm5,       dword ptr arg(2) ;dc
+        por         xmm5,       xmm4
+
+        ; xmm5 = per-word mask: 0xffff where the dc lane is zero, else 0
+        pcmpeqw     xmm5,       xmm6
+        mov         rcx,        16          ; number of 16-word groups
+
+.mberror_loop:
+        movdqa      xmm0,       [rsi]
+        movdqa      xmm1,       [rdi]
+
+        movdqa      xmm2,       [rsi+16]
+        movdqa      xmm3,       [rdi+16]
+
+
+        psubw       xmm2,       xmm3
+        pmaddwd     xmm2,       xmm2
+
+        ; words 0-7 of the group: difference, then apply the dc mask
+        psubw       xmm0,       xmm1
+        pand        xmm0,       xmm5
+
+        pmaddwd     xmm0,       xmm0
+        add         rsi,        32
+
+        add         rdi,        32
+
+        ; the flags set by this sub are consumed by the jnz below; the
+        ; intervening paddd instructions do not modify flags
+        sub         rcx,        1
+        paddd       xmm4,       xmm2
+
+        paddd       xmm4,       xmm0
+        jnz         .mberror_loop
+
+        ; horizontal add of the four dword partial sums (xmm6 is zero)
+        movdqa      xmm0,       xmm4
+        punpckldq   xmm0,       xmm6
+
+        punpckhdq   xmm4,       xmm6
+        paddd       xmm0,       xmm4
+
+        movdqa      xmm1,       xmm0
+        psrldq      xmm0,       8
+
+        paddd       xmm0,       xmm1
+        movq        rax,        xmm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+;
+; Returns the sum of squared differences between the words at s_ptr and
+; d_ptr, over 16 iterations of 8 words each (128 words, 256 bytes per
+; buffer).
+global sym(vp8_mbuverror_mmx_impl) PRIVATE
+sym(vp8_mbuverror_mmx_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;s_ptr
+        mov             rdi,        arg(1) ;d_ptr
+
+        mov             rcx,        16      ; iteration count
+        pxor            mm7,        mm7     ; mm7 = running sum (two dwords)
+
+.mbuverror_loop_mmx:
+
+        ; words 0-3 of this iteration: difference, square, pairwise add
+        movq            mm1,        [rsi]
+        movq            mm2,        [rdi]
+
+        psubw           mm1,        mm2
+        pmaddwd         mm1,        mm1
+
+
+        ; words 4-7 of this iteration
+        movq            mm3,        [rsi+8]
+        movq            mm4,        [rdi+8]
+
+        psubw           mm3,        mm4
+        pmaddwd         mm3,        mm3
+
+
+        paddd           mm7,        mm1
+        paddd           mm7,        mm3
+
+
+        add             rsi,        16
+        add             rdi,        16
+
+        dec             rcx
+        jnz             .mbuverror_loop_mmx
+
+        ; fold the high dword of the accumulator onto the low dword
+        movq            mm0,        mm7
+        psrlq           mm7,        32
+
+        paddd           mm0,        mm7
+        movq            rax,        mm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+;
+; SSE2 version: returns the sum of squared differences between the words at
+; s_ptr and d_ptr, over 16 iterations of 8 words each (128 words, 256 bytes
+; per buffer). Both buffers are read with movdqa and must be 16-byte aligned.
+global sym(vp8_mbuverror_xmm_impl) PRIVATE
+sym(vp8_mbuverror_xmm_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;s_ptr
+        mov             rdi,        arg(1) ;d_ptr
+
+        mov             rcx,        16      ; iteration count
+        pxor            xmm3,       xmm3    ; xmm3 = running sum (four dwords)
+
+.mbuverror_loop:
+
+        movdqa          xmm1,       [rsi]
+        movdqa          xmm2,       [rdi]
+
+        ; difference, then square and add adjacent pairs -> 4 dwords
+        psubw           xmm1,       xmm2
+        pmaddwd         xmm1,       xmm1
+
+        paddd           xmm3,       xmm1
+
+        add             rsi,        16
+        add             rdi,        16
+
+        dec             rcx
+        jnz             .mbuverror_loop
+
+        ; horizontal add of the four dword partial sums: interleave with
+        ; zero, add the two halves, then fold the high qword onto the low one
+        pxor        xmm0,           xmm0
+        movdqa      xmm1,           xmm3
+
+        movdqa      xmm2,           xmm1
+        punpckldq   xmm1,           xmm0
+
+        punpckhdq   xmm2,           xmm0
+        paddd       xmm1,           xmm2
+
+        movdqa      xmm2,           xmm1
+
+        psrldq      xmm1,           8
+        paddd       xmm1,           xmm2
+
+        movq            rax,            xmm1
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vp8/encoder/x86/fwalsh_sse2.asm b/libs/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
new file mode 100644
index 0000000000..f4989279f4
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
@@ -0,0 +1,164 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
+;
+; Reads four rows of four words from input (rows are pitch bytes apart; the
+; pitch is added directly to the byte pointer), runs the two butterfly
+; passes annotated below, and writes 16 words to output with two movdqa
+; stores, so output must be 16-byte aligned.
+global sym(vp8_short_walsh4x4_sse2) PRIVATE
+sym(vp8_short_walsh4x4_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov     rsi, arg(0)           ; input
+    mov     rdi, arg(1)           ; output
+    movsxd  rdx, dword ptr arg(2) ; pitch
+
+    ; first for loop
+    movq    xmm0, MMWORD PTR [rsi]           ; load input
+    movq    xmm1, MMWORD PTR [rsi + rdx]
+    lea     rsi,  [rsi + rdx*2]
+    movq    xmm2, MMWORD PTR [rsi]
+    movq    xmm3, MMWORD PTR [rsi + rdx]
+
+    ; interleave the four rows (word, then dword granularity)
+    punpcklwd xmm0,  xmm1
+    punpcklwd xmm2,  xmm3
+
+    movdqa    xmm1, xmm0
+    punpckldq xmm0, xmm2           ; ip[1] ip[0]
+    punpckhdq xmm1, xmm2           ; ip[3] ip[2]
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+
+    psllw     xmm0, 2              ; d1  a1
+    psllw     xmm2, 2              ; c1  b1
+
+    movdqa    xmm1, xmm0
+    punpcklqdq xmm0, xmm2          ; b1  a1
+    punpckhqdq xmm1, xmm2          ; c1  d1
+
+    ; build (a1 != 0) per word: movq copies a1 into the low half of xmm6 and
+    ; clears the high half; pcmpeqw gives 0xffff where a lane is zero, and
+    ; adding 1 turns that into 0 (lane was zero) or 1 (lane was non-zero).
+    ; The cleared high half therefore contributes 0.
+    pxor      xmm6, xmm6
+    movq      xmm6, xmm0
+    pxor      xmm7, xmm7
+    pcmpeqw   xmm7, xmm6
+    paddw     xmm7, [GLOBAL(c1)]
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1           ; b1+c1  a1+d1
+    psubw     xmm2, xmm1           ; b1-c1  a1-d1
+    paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)
+
+    ; second for loop
+    ; input: 13  9  5  1 12  8  4  0 (xmm0)
+    ;        14 10  6  2 15 11  7  3 (xmm2)
+    ; after shuffle:
+    ;        13  5  9  1 12  4  8  0 (xmm0)
+    ;        14  6 10  2 15  7 11  3 (xmm1)
+    pshuflw   xmm3, xmm0, 0xd8
+    pshufhw   xmm0, xmm3, 0xd8
+    pshuflw   xmm3, xmm2, 0xd8
+    pshufhw   xmm1, xmm3, 0xd8
+
+    ; pmaddwd against c1 (all +1) sums each word pair; against cn1
+    ; (+1, -1) it takes their difference. Results are 32-bit.
+    movdqa    xmm2, xmm0
+    pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
+    pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
+    movdqa    xmm3, xmm1
+    pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
+    pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13
+
+    pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
+    pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
+    pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
+    pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12
+
+    movdqa    xmm0, xmm4
+    punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
+    punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
+    movdqa    xmm1, xmm6
+    punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
+    punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12
+
+    movdqa    xmm2, xmm0
+    paddd     xmm0, xmm4            ; b21 b20 a21 a20
+    psubd     xmm2, xmm4            ; c21 c20 d21 d20
+    movdqa    xmm3, xmm1
+    paddd     xmm1, xmm6            ; b23 b22 a23 a22
+    psubd     xmm3, xmm6            ; c23 c22 d23 d22
+
+    ; rounding: compute (x + (x < 0) + 3) >> 3 for every dword.
+    ; pcmpgtd against zero gives all ones where x < 0; the pand with cd1
+    ; reduces that to 1.
+    pxor      xmm4, xmm4
+    movdqa    xmm5, xmm4
+    pcmpgtd   xmm4, xmm0
+    pcmpgtd   xmm5, xmm2
+    pand      xmm4, [GLOBAL(cd1)]
+    pand      xmm5, [GLOBAL(cd1)]
+
+    pxor      xmm6, xmm6
+    movdqa    xmm7, xmm6
+    pcmpgtd   xmm6, xmm1
+    pcmpgtd   xmm7, xmm3
+    pand      xmm6, [GLOBAL(cd1)]
+    pand      xmm7, [GLOBAL(cd1)]
+
+    paddd     xmm0, xmm4
+    paddd     xmm2, xmm5
+    paddd     xmm0, [GLOBAL(cd3)]
+    paddd     xmm2, [GLOBAL(cd3)]
+    paddd     xmm1, xmm6
+    paddd     xmm3, xmm7
+    paddd     xmm1, [GLOBAL(cd3)]
+    paddd     xmm3, [GLOBAL(cd3)]
+
+    psrad     xmm0, 3
+    psrad     xmm1, 3
+    psrad     xmm2, 3
+    psrad     xmm3, 3
+    movdqa    xmm4, xmm0
+    punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
+    punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
+    movdqa    xmm5, xmm2
+    punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
+    punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20
+
+    ; narrow the dwords back to words with signed saturation
+    packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
+    packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20
+
+    movdqa  XMMWORD PTR [rdi], xmm0
+    movdqa  XMMWORD PTR [rdi + 16], xmm2
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+; Constants for vp8_short_walsh4x4_sse2; each table is 16-byte aligned and
+; 16 bytes long.
+align 16
+; Eight words of +1: added with paddw, and used as the pmaddwd multiplier
+; that sums adjacent word pairs.
+c1:
+    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
+align 16
+; Alternating +1 / -1 words (0xffff is -1): pmaddwd multiplier that takes
+; the difference of adjacent word pairs.
+cn1:
+    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
+align 16
+; Four dwords of 1: masks the all-ones "negative" compare result down to 1.
+cd1:
+    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+align 16
+; Four dwords of 3: rounding term added before the arithmetic shift by 3.
+cd3:
+    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
diff --git a/libs/libvpx/vp8/encoder/x86/quantize_mmx.asm b/libs/libvpx/vp8/encoder/x86/quantize_mmx.asm
new file mode 100644
index 0000000000..2864ce16d9
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/x86/quantize_mmx.asm
@@ -0,0 +1,286 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+;                           short *qcoeff_ptr,short *dequant_ptr,
+;                           short *scan_mask, short *round_ptr,
+;                           short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE
+sym(vp8_fast_quantize_b_impl_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    push rsi
+    push rdi
+    ; end prolog
+        ; Quantizes 32 bytes of coefficients in four 8-byte passes (4 words per
+        ; movq), then computes the value returned in rax using scan_mask.
+        mov             rsi,        arg(0) ;coeff_ptr
+        movq            mm0,        [rsi]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm1,        [rax]
+
+        movq            mm3,        mm0
+        psraw           mm0,        15          ; sign mask: 0xffff in negative lanes, else 0
+
+        pxor            mm3,        mm0
+        psubw           mm3,        mm0         ; abs
+
+        movq            mm2,        mm3
+        pcmpgtw         mm1,        mm2         ; 0xffff in lanes where zbin > abs
+
+        pandn           mm1,        mm2         ; keep abs only where zbin <= abs, else 0
+        movq            mm3,        mm1
+
+        mov             rdx,        arg(6) ;quant_ptr
+        movq            mm1,        [rdx]
+
+        mov             rcx,        arg(5) ;round_ptr
+        movq            mm2,        [rcx]
+
+        paddw           mm3,        mm2
+        pmulhuw         mm3,        mm1         ; high 16 bits of unsigned (abs + round) * quant
+
+        pxor            mm3,        mm0
+        psubw           mm3,        mm0     ;gain the sign back
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+        movq            mm0,        mm3         ; copy is not read before mm0 is reloaded for the eob code
+
+        movq            [rdi],      mm3
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm2,        [rax]
+
+        pmullw          mm3,        mm2         ; dqcoeff = qcoeff * dequant (low 16 bits)
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax],      mm3
+
+        ; next 8 bytes (4 coefficients), same steps using mm4-mm7
+        movq            mm4,        [rsi+8]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm5,        [rax+8]
+
+        movq            mm7,        mm4
+        psraw           mm4,        15
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4         ; abs
+
+        movq            mm6,        mm7
+        pcmpgtw         mm5,        mm6
+
+        pandn           mm5,        mm6
+        movq            mm7,        mm5
+
+        movq            mm5,        [rdx+8]
+        movq            mm6,        [rcx+8]
+
+        paddw           mm7,        mm6
+        pmulhuw         mm7,        mm5
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4;gain the sign back
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+
+        movq            mm1,        mm7         ; copy is not read before mm1 is overwritten again
+        movq            [rdi+8],    mm7
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm6,        [rax+8]
+
+        pmullw          mm7,        mm6
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax+8],    mm7
+
+
+                ; next 8 bytes (4 coefficients)
+        movq            mm4,        [rsi+16]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm5,        [rax+16]
+
+        movq            mm7,        mm4
+        psraw           mm4,        15
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4         ; abs
+
+        movq            mm6,        mm7
+        pcmpgtw         mm5,        mm6
+
+        pandn           mm5,        mm6
+        movq            mm7,        mm5
+
+        movq            mm5,        [rdx+16]
+        movq            mm6,        [rcx+16]
+
+        paddw           mm7,        mm6
+        pmulhuw         mm7,        mm5
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4;gain the sign back
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+
+        movq            mm1,        mm7
+        movq            [rdi+16],   mm7
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm6,        [rax+16]
+
+        pmullw          mm7,        mm6
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax+16],   mm7
+
+
+                ; next 8 bytes (last 4 coefficients)
+        movq            mm4,        [rsi+24]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm5,        [rax+24]
+
+        movq            mm7,        mm4
+        psraw           mm4,        15
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4         ; abs
+
+        movq            mm6,        mm7
+        pcmpgtw         mm5,        mm6
+
+        pandn           mm5,        mm6
+        movq            mm7,        mm5
+
+        movq            mm5,        [rdx+24]
+        movq            mm6,        [rcx+24]
+
+        paddw           mm7,        mm6
+        pmulhuw         mm7,        mm5
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4;gain the sign back
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+
+        movq            mm1,        mm7
+        movq            [rdi+24],   mm7
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm6,        [rax+24]
+
+        pmullw          mm7,        mm6
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax+24],   mm7
+
+
+        ; Re-read the stored qcoeff words and sum scan_mask over the nonzero ones.
+        mov             rdi,        arg(4) ;scan_mask
+        mov             rsi,        arg(2) ;qcoeff_ptr
+
+        pxor            mm5,        mm5
+        pxor            mm7,        mm7
+
+        movq            mm0,        [rsi]
+        movq            mm1,        [rsi+8]
+
+        movq            mm2,        [rdi]
+        movq            mm3,        [rdi+8];
+
+        pcmpeqw         mm0,        mm7         ; 0xffff where qcoeff == 0
+        pcmpeqw         mm1,        mm7
+
+        pcmpeqw         mm6,        mm6         ; mm6 = all ones
+        pxor            mm0,        mm6         ; invert: 0xffff where qcoeff != 0
+
+        pxor            mm1,        mm6
+        psrlw           mm0,        15          ; 1 where qcoeff != 0, else 0
+
+        psrlw           mm1,        15
+        pmaddwd         mm0,        mm2         ; pairwise sums of scan_mask words of nonzero lanes
+
+        pmaddwd         mm1,        mm3
+        movq            mm5,        mm0
+
+        paddd           mm5,        mm1
+
+        movq            mm0,        [rsi+16]
+        movq            mm1,        [rsi+24]
+
+        movq            mm2,        [rdi+16]
+        movq            mm3,        [rdi+24];
+
+        pcmpeqw         mm0,        mm7
+        pcmpeqw         mm1,        mm7
+
+        pcmpeqw         mm6,        mm6
+        pxor            mm0,        mm6
+
+        pxor            mm1,        mm6
+        psrlw           mm0,        15
+
+        psrlw           mm1,        15
+        pmaddwd         mm0,        mm2
+
+        pmaddwd         mm1,        mm3
+        paddd           mm5,        mm0
+
+        paddd           mm5,        mm1
+        movq            mm0,        mm5
+
+        psrlq           mm5,        32
+        paddd           mm0,        mm5         ; low dword = total over all 16 lanes
+
+        ; eob adjustment begins here
+        movq            rcx,        mm0
+        and             rcx,        0xffff      ; keep the low 16 bits of the total
+
+        xor             rdx,        rdx
+        sub             rdx,        rcx ; rdx=-rcx
+
+        bsr             rax,        rcx         ; index of highest set bit (undefined if rcx == 0)
+        inc             rax
+
+        sar             rdx,        31          ; all ones if rcx != 0, zero otherwise
+        and             rax,        rdx         ; return 0 when no bit was set
+        ; Substitute the sse assembly for the old mmx mixed assembly/C. The
+        ; following is kept as reference
+        ;    movq            rcx,        mm0
+        ;    bsr             rax,        rcx
+        ;
+        ;    mov             eob,        rax
+        ;    mov             eee,        rcx
+        ;
+        ;if(eee==0)
+        ;{
+        ;    eob=-1;
+        ;}
+        ;else if(eee<0)
+        ;{
+        ;    eob=15;
+        ;}
+        ;d->eob = eob+1;
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vp8/encoder/x86/quantize_sse4.c b/libs/libvpx/vp8/encoder/x86/quantize_sse4.c
new file mode 100644
index 0000000000..601dd23a2f
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/x86/quantize_sse4.c
@@ -0,0 +1,128 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./vp8_rtcd.h"
+#include "vp8/encoder/block.h"
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+
+#define SELECT_EOB(i, z, x, y, q) \
+    do { \
+        const short boost = *zbin_boost_ptr++; \
+        const short x_z = _mm_extract_epi16(x, z); \
+        const short y_z = _mm_extract_epi16(y, z); \
+        /* Lane z survives only if x clears the current boost and y is */ \
+        /* nonzero; then record eob = i and restart the boost table. */ \
+        if (x_z >= boost && y_z != 0) { \
+            q = _mm_insert_epi16(q, y_z, z); \
+            eob = i; \
+            zbin_boost_ptr = b->zrun_zbin_boost; \
+        } \
+    } while (0)
+
+/* Quantizes the 16 coefficients in b->coeff with SSE4.1: writes the results
+ * to d->qcoeff, their dequantized values to d->dqcoeff, and the end-of-block
+ * count to *d->eob. */
+void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
+    /* SELECT_EOB reads and updates these two by name. */
+    short *zbin_boost_ptr = b->zrun_zbin_boost;
+    char eob = 0;
+
+    /* Inputs: "lo" is coefficients 0..7, "hi" is coefficients 8..15. */
+    const __m128i coeff_lo = _mm_load_si128((__m128i *)(b->coeff));
+    const __m128i coeff_hi = _mm_load_si128((__m128i *)(b->coeff + 8));
+    const __m128i round_lo = _mm_load_si128((__m128i *)(b->round));
+    const __m128i round_hi = _mm_load_si128((__m128i *)(b->round + 8));
+    const __m128i quant_lo = _mm_load_si128((__m128i *)(b->quant));
+    const __m128i quant_hi = _mm_load_si128((__m128i *)(b->quant + 8));
+    const __m128i shift_lo = _mm_load_si128((__m128i *)(b->quant_shift));
+    const __m128i shift_hi = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+    const __m128i dequant_lo = _mm_load_si128((__m128i *)(d->dequant));
+    const __m128i dequant_hi = _mm_load_si128((__m128i *)(d->dequant + 8));
+    __m128i zbin_lo = _mm_load_si128((__m128i *)(b->zbin));
+    __m128i zbin_hi = _mm_load_si128((__m128i *)(b->zbin + 8));
+    __m128i extra = _mm_cvtsi32_si128(b->zbin_extra);
+
+    /* Quantized output starts at zero; SELECT_EOB inserts surviving lanes. */
+    __m128i out_lo = _mm_setzero_si128();
+    __m128i out_hi = _mm_setzero_si128();
+    __m128i sign_lo, sign_hi, mag_lo, mag_hi, margin_lo, margin_hi;
+    __m128i quot_lo, quot_hi;
+
+    /* Copy the low word of zbin_extra into all eight lanes and fold it into
+     * the per-coefficient zbin values. */
+    extra = _mm_shufflelo_epi16(extra, 0);
+    extra = _mm_unpacklo_epi16(extra, extra);
+    zbin_lo = _mm_add_epi16(zbin_lo, extra);
+    zbin_hi = _mm_add_epi16(zbin_hi, extra);
+
+    /* sign = coeff >> 15 (0 or -1 per lane); mag = (coeff ^ sign) - sign,
+     * i.e. the absolute value. */
+    sign_lo = _mm_srai_epi16(coeff_lo, 15);
+    sign_hi = _mm_srai_epi16(coeff_hi, 15);
+    mag_lo = _mm_sub_epi16(_mm_xor_si128(coeff_lo, sign_lo), sign_lo);
+    mag_hi = _mm_sub_epi16(_mm_xor_si128(coeff_hi, sign_hi), sign_hi);
+
+    /* In C the magnitude is compared to zbin[] + extra + boost. Boost is the
+     * only term that changes during the scan, so keep
+     * margin = mag - (zbin[] + extra) and let SELECT_EOB test margin >= boost. */
+    margin_lo = _mm_sub_epi16(mag_lo, zbin_lo);
+    margin_hi = _mm_sub_epi16(mag_hi, zbin_hi);
+
+    /* The remaining arithmetic gives the same result whether done for all
+     * lanes now or one lane at a time inside the scan. */
+    mag_lo = _mm_add_epi16(mag_lo, round_lo);
+    mag_hi = _mm_add_epi16(mag_hi, round_hi);
+
+    /* quot = mulhi(mulhi(x, quant) + x, quant_shift) with x = mag + round.
+     * The scaling factor is stored as 1 << (16 - shift), so the per-lane
+     * shift becomes a multiply that returns the high half. */
+    quot_lo = _mm_add_epi16(_mm_mulhi_epi16(mag_lo, quant_lo), mag_lo);
+    quot_hi = _mm_add_epi16(_mm_mulhi_epi16(mag_hi, quant_hi), mag_hi);
+    quot_lo = _mm_mulhi_epi16(quot_lo, shift_lo);
+    quot_hi = _mm_mulhi_epi16(quot_hi, shift_hi);
+
+    /* Give the sign back: (quot ^ sign) - sign */
+    quot_lo = _mm_sub_epi16(_mm_xor_si128(quot_lo, sign_lo), sign_lo);
+    quot_hi = _mm_sub_epi16(_mm_xor_si128(quot_hi, sign_hi), sign_hi);
+
+    /* Unrolled scan: (eob value, lane, margin, quotient, output). The lanes
+     * are listed in scan order, which avoids the vp8_default_zig_zag1d
+     * lookup. */
+    SELECT_EOB(1, 0, margin_lo, quot_lo, out_lo);
+    SELECT_EOB(2, 1, margin_lo, quot_lo, out_lo);
+    SELECT_EOB(3, 4, margin_lo, quot_lo, out_lo);
+    SELECT_EOB(4, 0, margin_hi, quot_hi, out_hi);
+    SELECT_EOB(5, 5, margin_lo, quot_lo, out_lo);
+    SELECT_EOB(6, 2, margin_lo, quot_lo, out_lo);
+    SELECT_EOB(7, 3, margin_lo, quot_lo, out_lo);
+    SELECT_EOB(8, 6, margin_lo, quot_lo, out_lo);
+    SELECT_EOB(9, 1, margin_hi, quot_hi, out_hi);
+    SELECT_EOB(10, 4, margin_hi, quot_hi, out_hi);
+    SELECT_EOB(11, 5, margin_hi, quot_hi, out_hi);
+    SELECT_EOB(12, 2, margin_hi, quot_hi, out_hi);
+    SELECT_EOB(13, 7, margin_lo, quot_lo, out_lo);
+    SELECT_EOB(14, 3, margin_hi, quot_hi, out_hi);
+    SELECT_EOB(15, 6, margin_hi, quot_hi, out_hi);
+    SELECT_EOB(16, 7, margin_hi, quot_hi, out_hi);
+
+    _mm_store_si128((__m128i *)(d->qcoeff), out_lo);
+    _mm_store_si128((__m128i *)(d->qcoeff + 8), out_hi);
+
+    /* dqcoeff = qcoeff * dequant, low 16 bits of each product */
+    _mm_store_si128((__m128i *)(d->dqcoeff),
+                    _mm_mullo_epi16(out_lo, dequant_lo));
+    _mm_store_si128((__m128i *)(d->dqcoeff + 8),
+                    _mm_mullo_epi16(out_hi, dequant_hi));
+
+    *d->eob = eob;
+}
diff --git a/libs/libvpx/vp8/encoder/x86/quantize_ssse3.c b/libs/libvpx/vp8/encoder/x86/quantize_ssse3.c
new file mode 100644
index 0000000000..14282db801
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/x86/quantize_ssse3.c
@@ -0,0 +1,114 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h> /* SSSE3 */
+
+#include "vp8/encoder/block.h"
+
+/* bsr(): 1-based position of the highest set bit of mask; 0 when mask is 0. */
+#if defined(_MSC_VER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+static int bsr(int mask) {
+  unsigned long eob;
+  /* _BitScanReverse returns 0 and leaves eob undefined for a zero mask. */
+  if (!_BitScanReverse(&eob, mask))
+    return 0;
+  return (int)eob + 1;
+}
+#else
+static int bsr(int mask) {
+  int eob = -1;
+  if (mask == 0) /* the bsr instruction's destination is undefined for 0 */
+    return 0;
+#if defined(__GNUC__) && __GNUC__
+  __asm__ __volatile__("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags");
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+  asm volatile("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags");
+#else /* portable fallback so eob is never read uninitialized */
+  { unsigned int m = (unsigned int)mask; while (m) { eob++; m >>= 1; } }
+#endif
+  return eob + 1;
+}
+#endif
+
+/* Fast quantizer for one block of 16 coefficients using SSSE3. Reads
+ * b->coeff, b->round, b->quant_fast and d->dequant; writes d->qcoeff,
+ * d->dqcoeff and the end-of-block count *d->eob. */
+void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
+  /* pshufb control bytes: result byte i is source byte table[i], so the
+   * flags built below come out in the order listed here. */
+  DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) =
+    { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };
+  const __m128i scan_order =
+      _mm_load_si128((const __m128i *)pshufb_zig_zag_mask);
+  const __m128i all_zero = _mm_setzero_si128();
+
+  /* Inputs: "lo" is coefficients 0..7, "hi" is coefficients 8..15. */
+  const __m128i coeff_lo = _mm_load_si128((__m128i *)(b->coeff));
+  const __m128i coeff_hi = _mm_load_si128((__m128i *)(b->coeff + 8));
+  const __m128i round_lo = _mm_load_si128((__m128i *)(b->round));
+  const __m128i round_hi = _mm_load_si128((__m128i *)(b->round + 8));
+  const __m128i quant_lo = _mm_load_si128((__m128i *)(b->quant_fast));
+  const __m128i quant_hi = _mm_load_si128((__m128i *)(b->quant_fast + 8));
+  const __m128i dequant_lo = _mm_load_si128((__m128i *)(d->dequant));
+  const __m128i dequant_hi = _mm_load_si128((__m128i *)(d->dequant + 8));
+
+  __m128i sign_lo, sign_hi;
+  __m128i mag_lo, mag_hi, q_lo, q_hi;
+  __m128i nonzero_lo, nonzero_hi, nonzero_bytes;
+  int eob, nonzero_bits;
+
+  /* sign = coeff >> 15: 0 in non-negative lanes, -1 in negative lanes */
+  sign_lo = _mm_srai_epi16(coeff_lo, 15);
+  sign_hi = _mm_srai_epi16(coeff_hi, 15);
+
+  /* mag = abs(coeff) + round */
+  mag_lo = _mm_abs_epi16(coeff_lo);
+  mag_hi = _mm_abs_epi16(coeff_hi);
+  mag_lo = _mm_add_epi16(mag_lo, round_lo);
+  mag_hi = _mm_add_epi16(mag_hi, round_hi);
+
+  /* mag = (mag * quant_fast) >> 16 */
+  mag_lo = _mm_mulhi_epi16(mag_lo, quant_lo);
+  mag_hi = _mm_mulhi_epi16(mag_hi, quant_hi);
+
+  /* q = mag with the sign restored: (mag ^ sign) - sign */
+  q_lo = _mm_sub_epi16(_mm_xor_si128(mag_lo, sign_lo), sign_lo);
+  q_hi = _mm_sub_epi16(_mm_xor_si128(mag_hi, sign_hi), sign_hi);
+
+  /* qcoeff = q */
+  _mm_store_si128((__m128i *)(d->qcoeff), q_lo);
+  _mm_store_si128((__m128i *)(d->qcoeff + 8), q_hi);
+
+  /* dqcoeff = q * dequant (low 16 bits of each product) */
+  _mm_store_si128((__m128i *)(d->dqcoeff),
+                  _mm_mullo_epi16(q_lo, dequant_lo));
+  _mm_store_si128((__m128i *)(d->dqcoeff + 8),
+                  _mm_mullo_epi16(q_hi, dequant_hi));
+
+  /* End of block. The unsigned magnitude is tested instead of q:
+   * restoring the sign never turns a zero lane into a nonzero one or the
+   * other way round. Each lane becomes all-ones where mag > 0 and zero
+   * elsewhere. */
+  nonzero_lo = _mm_cmpgt_epi16(mag_lo, all_zero);
+  nonzero_hi = _mm_cmpgt_epi16(mag_hi, all_zero);
+
+  /* Narrow the 16 word flags to bytes, put them in the table's order and
+   * take one bit from each byte. */
+  nonzero_bytes = _mm_packs_epi16(nonzero_lo, nonzero_hi);
+  nonzero_bytes = _mm_shuffle_epi8(nonzero_bytes, scan_order);
+  nonzero_bits = _mm_movemask_epi8(nonzero_bytes);
+
+  /* bsr() gives 1 + the index of the highest set bit, or 0 for no bits. */
+  eob = bsr(nonzero_bits);
+
+  *d->eob = 0xFF & eob;
+}
diff --git a/libs/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/libs/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
new file mode 100644
index 0000000000..bd92b398a0
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,207 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp8_temporal_filter_apply_sse2 | arg
+;  (unsigned char  *frame1,           |  0
+;   unsigned int    stride,           |  1
+;   unsigned char  *frame2,           |  2
+;   unsigned int    block_size,       |  3
+;   int             strength,         |  4
+;   int             filter_weight,    |  5
+;   unsigned int   *accumulator,      |  6
+;   unsigned short *count)            |  7
+global sym(vp8_temporal_filter_apply_sse2) PRIVATE
+sym(vp8_temporal_filter_apply_sse2):
+    ; Per pixel: adds a weight to count and weight*pred to accumulator (steps commented below).
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ALIGN_STACK 16, rax
+    %define block_size    0
+    %define strength      16
+    %define filter_weight 32
+    %define rounding_bit  48
+    %define rbp_backup    64
+    %define stack_size    80
+    sub         rsp,           stack_size
+    mov         [rsp + rbp_backup], rbp
+    ; end prolog
+        ; the names defined above are byte offsets of the local slots from rsp
+        mov         rdx,            arg(3)
+        mov         [rsp + block_size], rdx
+        movd        xmm6,            arg(4)
+        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+        ; calculate the rounding bit outside the loop
+        ; 0x8000 >> (16 - strength)
+        mov         rdx,            16
+        sub         rdx,            arg(4) ; 16 - strength
+        movq        xmm4,           rdx    ; can't use rdx w/ shift
+        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
+        psrlw       xmm5,           xmm4
+        movdqa      [rsp + rounding_bit], xmm5
+
+        mov         rsi,            arg(0) ; src/frame1
+        mov         rdx,            arg(2) ; predictor frame
+        mov         rdi,            arg(6) ; accumulator
+        mov         rax,            arg(7) ; count
+
+        ; dup the filter weight and store for later
+        movd        xmm0,           arg(5) ; filter_weight
+        pshuflw     xmm0,           xmm0, 0
+        punpcklwd   xmm0,           xmm0
+        movdqa      [rsp + filter_weight], xmm0
+
+        mov         rbp,            arg(1) ; stride (rbp is reused; the original is in [rsp + rbp_backup])
+        pxor        xmm7,           xmm7   ; zero for extraction
+
+        lea         rcx,            [rdx + 16*16*1] ; rcx = end of predictor for a 16x16 block
+        cmp         dword ptr [rsp + block_size], 8
+        jne         .temporal_filter_apply_load_16
+        lea         rcx,            [rdx + 8*8*1]   ; rcx = end of predictor for an 8x8 block
+
+.temporal_filter_apply_load_8:
+        movq        xmm0,           [rsi]  ; first row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        movq        xmm1,           [rsi]  ; second row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
+        jmp         .temporal_filter_apply_load_finished
+
+.temporal_filter_apply_load_16:
+        movdqa      xmm0,           [rsi]  ; src (frame1)
+        lea         rsi,            [rsi + rbp] ; += stride
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
+
+.temporal_filter_apply_load_finished:
+        movdqa      xmm2,           [rdx]  ; predictor (frame2)
+        movdqa      xmm3,           xmm2
+        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
+
+        ; modifier = src_byte - pixel_value
+        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
+        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
+
+        ; modifier *= modifier
+        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
+        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2
+
+        ; modifier *= 3
+        pmullw      xmm0,           [GLOBAL(_const_3w)]
+        pmullw      xmm1,           [GLOBAL(_const_3w)]
+
+        ; modifier += 0x8000 >> (16 - strength)
+        paddw       xmm0,           [rsp + rounding_bit]
+        paddw       xmm1,           [rsp + rounding_bit]
+
+        ; modifier >>= strength
+        psrlw       xmm0,           [rsp + strength]
+        psrlw       xmm1,           [rsp + strength]
+
+        ; modifier = 16 - modifier
+        ; saturation takes care of modifier > 16
+        movdqa      xmm3,           [GLOBAL(_const_16w)]
+        movdqa      xmm2,           [GLOBAL(_const_16w)]
+        psubusw     xmm3,           xmm1
+        psubusw     xmm2,           xmm0
+
+        ; modifier *= filter_weight
+        pmullw      xmm2,           [rsp + filter_weight]
+        pmullw      xmm3,           [rsp + filter_weight]
+
+        ; count
+        movdqa      xmm4,           [rax]
+        movdqa      xmm5,           [rax+16]
+        ; += modifier
+        paddw       xmm4,           xmm2
+        paddw       xmm5,           xmm3
+        ; write back
+        movdqa      [rax],          xmm4
+        movdqa      [rax+16],       xmm5
+        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
+
+        ; load and extract the predictor up to shorts
+        pxor        xmm7,           xmm7
+        movdqa      xmm0,           [rdx]
+        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
+
+        ; modifier *= pixel_value
+        pmullw      xmm0,           xmm2
+        pmullw      xmm1,           xmm3
+
+        ; expand to double words
+        movdqa      xmm2,           xmm0
+        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
+        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
+        movdqa      xmm3,           xmm1
+        punpcklwd   xmm1,           xmm7   ; [ 8-11]
+        punpckhwd   xmm3,           xmm7   ; [12-15]
+
+        ; accumulator
+        movdqa      xmm4,           [rdi]
+        movdqa      xmm5,           [rdi+16]
+        movdqa      xmm6,           [rdi+32]
+        movdqa      xmm7,           [rdi+48]
+        ; += modifier
+        paddd       xmm4,           xmm0
+        paddd       xmm5,           xmm2
+        paddd       xmm6,           xmm1
+        paddd       xmm7,           xmm3
+        ; write back
+        movdqa      [rdi],          xmm4
+        movdqa      [rdi+16],       xmm5
+        movdqa      [rdi+32],       xmm6
+        movdqa      [rdi+48],       xmm7
+        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+        cmp         rdx,            rcx    ; whole predictor block consumed?
+        je          .temporal_filter_apply_epilog
+        pxor        xmm7,           xmm7   ; zero for extraction
+        cmp         dword ptr [rsp + block_size], 16
+        je          .temporal_filter_apply_load_16
+        jmp         .temporal_filter_apply_load_8
+
+.temporal_filter_apply_epilog:
+    ; begin epilog
+    mov         rbp,            [rsp + rbp_backup]
+    add         rsp,            stack_size
+    pop         rsp             ; NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+_const_3w:                  ; eight words of 3 (the "modifier *= 3" step)
+    times 8 dw 3
+align 16
+_const_top_bit:             ; eight words of 0x8000, shifted right to form the rounding bit
+    times 8 dw 1<<15
+align 16
+_const_16w:                 ; eight words of 16 (the "modifier = 16 - modifier" step)
+    times 8 dw 16
diff --git a/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
new file mode 100644
index 0000000000..7bf5155c95
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
@@ -0,0 +1,67 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_ports/x86.h"
+#include "vp8/encoder/block.h"
+
+void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
+{   /* Two 4x4 fdcts: the second starts 4 input / 16 output shorts later. */
+    vp8_short_fdct4x4_mmx(&input[0], &output[0],  pitch);
+    vp8_short_fdct4x4_mmx(&input[4], &output[16], pitch);
+}
+
+int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+                                 short *qcoeff_ptr, short *dequant_ptr,
+                                 const short *scan_mask, short *round_ptr,
+                                 short *quant_ptr, short *dqcoeff_ptr);
+
+/*
+ * C entry point for the MMX fast quantizer: hands the arrays held by the
+ * block structures to the assembly routine and stores its return value,
+ * narrowed to char, in *d->eob.
+ *
+ *   b  supplies coeff, zbin, round and quant_fast
+ *   d  supplies qcoeff, dequant and dqcoeff, and receives the eob value
+ */
+void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
+{
+    /* Arguments follow the prototype above, in order. */
+    const int end_of_block =
+        vp8_fast_quantize_b_impl_mmx(b->coeff,
+                                     b->zbin,
+                                     d->qcoeff,
+                                     d->dequant,
+                                     vp8_default_zig_zag_mask,
+                                     b->round,
+                                     b->quant_fast, /* not b->quant */
+                                     d->dqcoeff);
+    *d->eob = (char)end_of_block;
+}
+
+int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+/* Forwards block 0's coeff / dqcoeff pointers and dc to the MMX routine. */
+int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
+{
+    return vp8_mbblock_error_mmx_impl(mb->block[0].coeff,
+                                      mb->e_mbd.block[0].dqcoeff, dc);
+}
+
+int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+/* Forwards coeff / dqcoeff starting at element 256 to the MMX routine.
+ * NOTE(review): 256 presumably skips the luma coefficients - confirm. */
+int vp8_mbuverror_mmx(MACROBLOCK *mb)
+{
+    return vp8_mbuverror_mmx_impl(mb->coeff + 256, mb->e_mbd.dqcoeff + 256);
+}
+
diff --git a/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c b/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c
new file mode 100644
index 0000000000..be9aaf3c96
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_ports/x86.h"
+#include "vp8/encoder/block.h"
+
+int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+/* Forwards block 0's coeff / dqcoeff pointers and dc to the XMM routine. */
+int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
+{
+    return vp8_mbblock_error_xmm_impl(mb->block[0].coeff,
+                                      mb->e_mbd.block[0].dqcoeff, dc);
+}
+
+int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+/* Forwards coeff / dqcoeff starting at element 256 to the XMM routine.
+ * NOTE(review): 256 presumably skips the luma coefficients - confirm. */
+int vp8_mbuverror_xmm(MACROBLOCK *mb)
+{
+    return vp8_mbuverror_xmm_impl(mb->coeff + 256, mb->e_mbd.dqcoeff + 256);
+}
+
diff --git a/libs/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c b/libs/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c
new file mode 100644
index 0000000000..b4e92e04b2
--- /dev/null
+++ b/libs/libvpx/vp8/encoder/x86/vp8_quantize_sse2.c
@@ -0,0 +1,228 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_ports/x86.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/encoder/block.h"
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+
+#include <mmintrin.h> /* MMX */
+#include <xmmintrin.h> /* SSE */
+#include <emmintrin.h> /* SSE2 */
+
+#define SELECT_EOB(i, z) /* i: value stored in eob; z: index into x[], y[], qcoeff */ \
+    do { \
+        short boost = *zbin_boost_ptr; \
+        int cmp = (x[z] < boost) | (y[z] == 0); /* under the boost or quantized to 0 */ \
+        zbin_boost_ptr++; /* next boost entry for the following coefficient */ \
+        if (cmp) \
+            break; /* skipped: qcoeff_ptr[z] and eob are left alone */ \
+        qcoeff_ptr[z] = y[z]; \
+        eob = i; \
+        zbin_boost_ptr = b->zrun_zbin_boost; /* kept: rewind to the first boost entry */ \
+    } while (0)
+/* Quantize b->coeff (16 shorts) into d->qcoeff / d->dqcoeff and set *d->eob. */
+void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+    char eob = 0;
+    short *zbin_boost_ptr;
+    short *qcoeff_ptr      = d->qcoeff;
+    DECLARE_ALIGNED(16, short, x[16]);
+    DECLARE_ALIGNED(16, short, y[16]);
+    /* x[]: abs(coeff) - (zbin + zbin_extra); y[]: signed quantized values. */
+    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
+    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
+    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
+    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
+    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
+    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
+    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
+    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
+    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+    /* 32 bytes: the 16 shorts of qcoeff; SELECT_EOB only writes the kept ones. */
+    memset(qcoeff_ptr, 0, 32);
+
+    /* Duplicate to all lanes. */
+    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
+    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
+
+    /* Sign of z: z >> 15 */
+    sz0 = _mm_srai_epi16(z0, 15);
+    sz1 = _mm_srai_epi16(z1, 15);
+
+    /* x = abs(z): (z ^ sz) - sz */
+    x0 = _mm_xor_si128(z0, sz0);
+    x1 = _mm_xor_si128(z1, sz1);
+    x0 = _mm_sub_epi16(x0, sz0);
+    x1 = _mm_sub_epi16(x1, sz1);
+
+    /* zbin[] + zbin_extra */
+    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
+    zbin1 = _mm_add_epi16(zbin1, zbin_extra);
+
+    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
+     * the equation because boost is the only value which can change:
+     * x - (zbin[] + extra) >= boost */
+    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
+    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
+
+    _mm_store_si128((__m128i *)(x), x_minus_zbin0);
+    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);
+
+    /* All the remaining calculations are valid whether they are done now with
+     * simd or later inside the loop one at a time. */
+    x0 = _mm_add_epi16(x0, round0);
+    x1 = _mm_add_epi16(x1, round1);
+
+    y0 = _mm_mulhi_epi16(x0, quant0);
+    y1 = _mm_mulhi_epi16(x1, quant1);
+
+    y0 = _mm_add_epi16(y0, x0);
+    y1 = _mm_add_epi16(y1, x1);
+
+    /* Instead of shifting each value independently we convert the scaling
+     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
+    y0 = _mm_mulhi_epi16(y0, quant_shift0);
+    y1 = _mm_mulhi_epi16(y1, quant_shift1);
+
+    /* Return the sign: (y ^ sz) - sz */
+    y0 = _mm_xor_si128(y0, sz0);
+    y1 = _mm_xor_si128(y1, sz1);
+    y0 = _mm_sub_epi16(y0, sz0);
+    y1 = _mm_sub_epi16(y1, sz1);
+
+    _mm_store_si128((__m128i *)(y), y0);
+    _mm_store_si128((__m128i *)(y + 8), y1);
+
+    zbin_boost_ptr = b->zrun_zbin_boost;
+
+    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
+    SELECT_EOB(1, 0);
+    SELECT_EOB(2, 1);
+    SELECT_EOB(3, 4);
+    SELECT_EOB(4, 8);
+    SELECT_EOB(5, 5);
+    SELECT_EOB(6, 2);
+    SELECT_EOB(7, 3);
+    SELECT_EOB(8, 6);
+    SELECT_EOB(9, 9);
+    SELECT_EOB(10, 12);
+    SELECT_EOB(11, 13);
+    SELECT_EOB(12, 10);
+    SELECT_EOB(13, 7);
+    SELECT_EOB(14, 11);
+    SELECT_EOB(15, 14);
+    SELECT_EOB(16, 15);
+    /* Reload qcoeff, which now holds only the coefficients SELECT_EOB kept. */
+    y0 = _mm_load_si128((__m128i *)(d->qcoeff));
+    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));
+
+    /* dqcoeff = qcoeff * dequant */
+    y0 = _mm_mullo_epi16(y0, dequant0);
+    y1 = _mm_mullo_epi16(y1, dequant1);
+
+    _mm_store_si128((__m128i *)(d->dqcoeff), y0);
+    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);
+
+    *d->eob = eob;
+}
+/* Fast path: quantize all 16 coefficients with quant_fast, no zero-bin test. */
+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
+  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
+  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
+  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+  __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
+  __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));
+
+  __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;
+
+  /* sign of z: z >> 15 */
+  sz0 = _mm_srai_epi16(z0, 15);
+  sz1 = _mm_srai_epi16(z1, 15);
+
+  /* x = abs(z): (z ^ sz) - sz */
+  x0 = _mm_xor_si128(z0, sz0);
+  x1 = _mm_xor_si128(z1, sz1);
+  x0 = _mm_sub_epi16(x0, sz0);
+  x1 = _mm_sub_epi16(x1, sz1);
+
+  /* x += round */
+  x0 = _mm_add_epi16(x0, round0);
+  x1 = _mm_add_epi16(x1, round1);
+
+  /* y = (x * quant) >> 16 */
+  y0 = _mm_mulhi_epi16(x0, quant_fast0);
+  y1 = _mm_mulhi_epi16(x1, quant_fast1);
+
+  /* x = y with the sign of z restored: (y ^ sz) - sz */
+  y0 = _mm_xor_si128(y0, sz0);
+  y1 = _mm_xor_si128(y1, sz1);
+  x0 = _mm_sub_epi16(y0, sz0);
+  x1 = _mm_sub_epi16(y1, sz1);
+
+  /* qcoeff = x */
+  _mm_store_si128((__m128i *)(d->qcoeff), x0);
+  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
+
+  /* x * dequant */
+  xdq0 = _mm_mullo_epi16(x0, dequant0);
+  xdq1 = _mm_mullo_epi16(x1, dequant1);
+
+  /* dqcoeff = x * dequant */
+  _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
+  _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);
+
+  /* build a mask for the zig zag: all-ones lanes where qcoeff != 0 */
+  zeros = _mm_setzero_si128();
+
+  x0 = _mm_cmpeq_epi16(x0, zeros);
+  x1 = _mm_cmpeq_epi16(x1, zeros);
+
+  ones = _mm_cmpeq_epi16(zeros, zeros);
+
+  x0 = _mm_xor_si128(x0, ones);
+  x1 = _mm_xor_si128(x1, ones);
+
+  x0 = _mm_and_si128(x0, inv_zig_zag0);
+  x1 = _mm_and_si128(x1, inv_zig_zag1);
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* now down to 8 */
+  x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* only 4 left */
+  x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* okay, just 2! */
+  x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001
+
+  x0 = _mm_max_epi16(x0, x1);
+  /* lane 0 now holds the largest masked inv_zig_zag entry; its low byte is eob */
+  *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
+}
diff --git a/libs/libvpx/vp8/exports_dec b/libs/libvpx/vp8/exports_dec
new file mode 100644
index 0000000000..100ac5c27d
--- /dev/null
+++ b/libs/libvpx/vp8/exports_dec
@@ -0,0 +1,2 @@
+data vpx_codec_vp8_dx_algo
+text vpx_codec_vp8_dx
diff --git a/libs/libvpx/vp8/exports_enc b/libs/libvpx/vp8/exports_enc
new file mode 100644
index 0000000000..29ff35ef7b
--- /dev/null
+++ b/libs/libvpx/vp8/exports_enc
@@ -0,0 +1,2 @@
+data vpx_codec_vp8_cx_algo
+text vpx_codec_vp8_cx
diff --git a/libs/libvpx/vp8/vp8_common.mk b/libs/libvpx/vp8/vp8_common.mk
new file mode 100644
index 0000000000..4c4e856272
--- /dev/null
+++ b/libs/libvpx/vp8/vp8_common.mk
@@ -0,0 +1,169 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+VP8_COMMON_SRCS-yes += vp8_common.mk
+VP8_COMMON_SRCS-yes += common/ppflags.h
+VP8_COMMON_SRCS-yes += common/onyx.h
+VP8_COMMON_SRCS-yes += common/onyxd.h
+VP8_COMMON_SRCS-yes += common/alloccommon.c
+VP8_COMMON_SRCS-yes += common/blockd.c
+VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
+VP8_COMMON_SRCS-yes += common/copy_c.c
+VP8_COMMON_SRCS-yes += common/debugmodes.c
+VP8_COMMON_SRCS-yes += common/default_coef_probs.h
+VP8_COMMON_SRCS-yes += common/dequantize.c
+VP8_COMMON_SRCS-yes += common/entropy.c
+VP8_COMMON_SRCS-yes += common/entropymode.c
+VP8_COMMON_SRCS-yes += common/entropymv.c
+VP8_COMMON_SRCS-yes += common/extend.c
+VP8_COMMON_SRCS-yes += common/filter.c
+VP8_COMMON_SRCS-yes += common/filter.h
+VP8_COMMON_SRCS-yes += common/findnearmv.c
+VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
+VP8_COMMON_SRCS-yes += common/idct_blk.c
+VP8_COMMON_SRCS-yes += common/idctllm.c
+VP8_COMMON_SRCS-yes += common/alloccommon.h
+VP8_COMMON_SRCS-yes += common/blockd.h
+VP8_COMMON_SRCS-yes += common/common.h
+VP8_COMMON_SRCS-yes += common/entropy.h
+VP8_COMMON_SRCS-yes += common/entropymode.h
+VP8_COMMON_SRCS-yes += common/entropymv.h
+VP8_COMMON_SRCS-yes += common/extend.h
+VP8_COMMON_SRCS-yes += common/findnearmv.h
+VP8_COMMON_SRCS-yes += common/header.h
+VP8_COMMON_SRCS-yes += common/invtrans.h
+VP8_COMMON_SRCS-yes += common/loopfilter.h
+VP8_COMMON_SRCS-yes += common/modecont.h
+VP8_COMMON_SRCS-yes += common/mv.h
+VP8_COMMON_SRCS-yes += common/onyxc_int.h
+VP8_COMMON_SRCS-yes += common/quant_common.h
+VP8_COMMON_SRCS-yes += common/reconinter.h
+VP8_COMMON_SRCS-yes += common/reconintra.h
+VP8_COMMON_SRCS-yes += common/reconintra4x4.h
+VP8_COMMON_SRCS-yes += common/rtcd.c
+VP8_COMMON_SRCS-yes += common/rtcd_defs.pl
+VP8_COMMON_SRCS-yes += common/setupintrarecon.h
+VP8_COMMON_SRCS-yes += common/swapyv12buffer.h
+VP8_COMMON_SRCS-yes += common/systemdependent.h
+VP8_COMMON_SRCS-yes += common/threading.h
+VP8_COMMON_SRCS-yes += common/treecoder.h
+VP8_COMMON_SRCS-yes += common/vp8_loopfilter.c
+VP8_COMMON_SRCS-yes += common/loopfilter_filters.c
+VP8_COMMON_SRCS-yes += common/mbpitch.c
+VP8_COMMON_SRCS-yes += common/modecont.c
+VP8_COMMON_SRCS-yes += common/quant_common.c
+VP8_COMMON_SRCS-yes += common/reconinter.c
+VP8_COMMON_SRCS-yes += common/reconintra.c
+VP8_COMMON_SRCS-yes += common/reconintra4x4.c
+VP8_COMMON_SRCS-yes += common/setupintrarecon.c
+VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
+VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
+
+# Everything above sits in the "-yes" list; textblit.c instead lands in the
+# list whose suffix is the value of CONFIG_POSTPROC_VISUALIZER.
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
+VP8_COMMON_SRCS-yes += common/treecoder.c
+# x86 glue: the list suffix is $(ARCH_X86) and $(ARCH_X86_64) concatenated.
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_loopfilter_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
+
+ifeq ($(CONFIG_POSTPROC),yes)
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
+endif
+
+ifeq ($(ARCH_X86_64),yes)
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2_x86_64.asm
+endif
+
+# common (mips dspr2, C sources keyed on HAVE_DSPR2)
+VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/idctllm_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/filter_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/reconinter_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/idct_blk_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/dequantize_dspr2.c
+
+# common (mips msa, C sources keyed on HAVE_MSA)
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/bilinear_filter_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/copymem_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
+
+ifeq ($(CONFIG_POSTPROC),yes)
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/postproc_msa.c
+endif
+
+# common (arm, C sources keyed on ARCH_ARM)
+VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/filter_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/dequantize_arm.c
+
+# common (media: armv6 $(ASM) sources plus C glue, keyed on HAVE_MEDIA)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/bilinearfilter_arm.c
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/bilinearfilter_arm.h
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/bilinearfilter_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/copymem8x4_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/copymem8x8_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/copymem16x16_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dc_only_idct_add_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/iwalsh_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/filter_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/idct_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/loopfilter_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/simpleloopfilter_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/sixtappredict8x4_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dequant_idct_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dequantize_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/idct_blk_v6.c
+
+# common (neon intrinsics)
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/bilinearpredict_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dc_only_idct_add_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequant_idct_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequantizeb_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_blk_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_0_2x_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_full_2x_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/iwalsh_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_loopfilter_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimpleverticaledge_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/mbloopfilter_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/shortidct4x4llm_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict_neon.c
+# Expand rtcd_h_template (not defined in this file) for vp8_rtcd and its .pl.
+$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
diff --git a/libs/libvpx/vp8/vp8_cx_iface.c b/libs/libvpx/vp8/vp8_cx_iface.c
new file mode 100644
index 0000000000..257d2a0c4e
--- /dev/null
+++ b/libs/libvpx/vp8/vp8_cx_iface.c
@@ -0,0 +1,1371 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vpx/vp8cx.h"
+#include "vp8/encoder/firstpass.h"
+#include "vp8/common/onyx.h"
+#include <stdlib.h>
+#include <string.h>
+
+/* VP8-only encoder settings; validate_config() range-checks most of them. */
+struct vp8_extracfg
+{
+    struct vpx_codec_pkt_list *pkt_list;                     /* copied to oxcf->output_pkt_list */
+    int                         cpu_used;                    /** available cpu percentage in 1/16; validated to [-16, 16] */
+    unsigned int                enable_auto_alt_ref;           /** if encoder decides to use alternate reference frame; must be 0 or 1 */
+    unsigned int                noise_sensitivity;           /* validated to at most 6 */
+    unsigned int                Sharpness;                   /* validated to at most 7 */
+    unsigned int                static_thresh;               /* copied to oxcf->encode_breakout */
+    unsigned int                token_partitions;            /* VP8_ONE_TOKENPARTITION..VP8_EIGHT_TOKENPARTITION */
+    unsigned int                arnr_max_frames;    /* alt_ref Noise Reduction Max Frame Count, 0..15 */
+    unsigned int                arnr_strength;    /* alt_ref Noise Reduction Strength, at most 6 */
+    unsigned int                arnr_type;        /* alt_ref filter type, 1..3 */
+    vp8e_tuning                 tuning;
+    unsigned int                cq_level;         /* constrained quality level, 0..63 */
+    unsigned int                rc_max_intra_bitrate_pct;
+    unsigned int                screen_content_mode;         /* validated to at most 2 */
+};
+
+static struct vp8_extracfg default_extracfg = { /* positional: keep in struct vp8_extracfg field order */
+  NULL,                       /* pkt_list      */
+#if !(CONFIG_REALTIME_ONLY)
+  0,                          /* cpu_used      */
+#else
+  4,                          /* cpu_used      */
+#endif
+  0,                          /* enable_auto_alt_ref */
+  0,                          /* noise_sensitivity */
+  0,                          /* Sharpness */
+  0,                          /* static_thresh */
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+  VP8_EIGHT_TOKENPARTITION,   /* token_partitions */
+#else
+  VP8_ONE_TOKENPARTITION,     /* token_partitions */
+#endif
+  0,                          /* arnr_max_frames */
+  3,                          /* arnr_strength */
+  3,                          /* arnr_type*/
+  0,                          /* tuning*/
+  10,                         /* cq_level */
+  0,                          /* rc_max_intra_bitrate_pct */
+  0,                          /* screen_content_mode */
+};
+
+struct vpx_codec_alg_priv   /* encoder context; presumably what vpx_codec_alg_priv_t names -- confirm */
+{
+    vpx_codec_priv_t        base;   /* base.err_detail receives error messages */
+    vpx_codec_enc_cfg_t     cfg;    /* validate_img() compares frames with cfg.g_w / cfg.g_h */
+    struct vp8_extracfg     vp8_cfg;
+    VP8_CONFIG              oxcf;   /* same type set_vp8e_config() fills in */
+    struct VP8_COMP        *cpi;
+    unsigned char          *cx_data;
+    unsigned int            cx_data_sz;   /* presumably the size of cx_data -- confirm */
+    vpx_image_t             preview_img;
+    unsigned int            next_frame_flag;
+    vp8_postproc_cfg_t      preview_ppcfg;
+    /* pkt_list size depends on the maximum number of lagged frames allowed. */
+    vpx_codec_pkt_list_decl(64) pkt_list;
+    unsigned int                fixed_kf_cntr;
+    vpx_enc_frame_flags_t   control_frame_flags;
+};
+
+
+static vpx_codec_err_t
+update_error_state(vpx_codec_alg_priv_t                 *ctx,
+                   const struct vpx_internal_error_info *error)
+{
+    /* Return the internal error code; when it is nonzero, also publish the
+     * matching detail string (or NULL if none was recorded) on the context. */
+    const vpx_codec_err_t code = error->error_code;
+
+    if (code)
+        ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+
+    return code;
+}
+
+/* Store str in ctx->base.err_detail and return VPX_CODEC_INVALID_PARAM. */
+#undef ERROR
+#define ERROR(str) do {\
+        ctx->base.err_detail = str;\
+        return VPX_CODEC_INVALID_PARAM;\
+    } while(0)
+/* ERROR unless lo <= (p)->memb <= hi; "== lo || > lo" stands in for ">= lo". */
+#define RANGE_CHECK(p,memb,lo,hi) do {\
+        if(!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \
+            ERROR(#memb " out of range ["#lo".."#hi"]");\
+    } while(0)
+/* ERROR unless (p)->memb <= hi. */
+#define RANGE_CHECK_HI(p,memb,hi) do {\
+        if(!((p)->memb <= (hi))) \
+            ERROR(#memb " out of range [.."#hi"]");\
+    } while(0)
+/* ERROR unless (p)->memb >= lo. */
+#define RANGE_CHECK_LO(p,memb,lo) do {\
+        if(!((p)->memb >= (lo))) \
+            ERROR(#memb " out of range ["#lo"..]");\
+    } while(0)
+/* ERROR unless (p)->memb is exactly 0 or 1. */
+#define RANGE_CHECK_BOOL(p,memb) do {\
+        if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+    } while(0)
+
+/* Range-check cfg and the VP8 extras.  The first failing check returns
+ * VPX_CODEC_INVALID_PARAM via ERROR() with a message in ctx->base.err_detail;
+ * finalize != 0 additionally bounds cq_level by the min/max quantizers. */
+static vpx_codec_err_t
+validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg,
+                const struct vp8_extracfg *vp8_cfg, int finalize)
+{
+    RANGE_CHECK(cfg, g_w,                   1, 16383); /* 14 bits available */
+    RANGE_CHECK(cfg, g_h,                   1, 16383); /* 14 bits available */
+    RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
+    RANGE_CHECK(cfg, g_timebase.num,        1, 1000000000);
+    RANGE_CHECK_HI(cfg, g_profile,          3);
+    RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
+    RANGE_CHECK_HI(cfg, rc_min_quantizer,   cfg->rc_max_quantizer);
+    RANGE_CHECK_HI(cfg, g_threads,          64);
+#if CONFIG_REALTIME_ONLY
+    RANGE_CHECK_HI(cfg, g_lag_in_frames,    0);
+#elif CONFIG_MULTI_RES_ENCODING
+    if (ctx->base.enc.total_encoders > 1)
+        RANGE_CHECK_HI(cfg, g_lag_in_frames,    0);
+#else
+    RANGE_CHECK_HI(cfg, g_lag_in_frames,    25);
+#endif
+    RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_Q);
+    RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
+    RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
+    RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+    RANGE_CHECK(cfg, kf_mode,               VPX_KF_DISABLED, VPX_KF_AUTO);
+
+/* TODO: add spatial re-sampling support and frame dropping in
+ * multi-res-encoder.*/
+#if CONFIG_MULTI_RES_ENCODING
+    if (ctx->base.enc.total_encoders > 1)
+        RANGE_CHECK_HI(cfg, rc_resize_allowed,     0);
+#else
+    RANGE_CHECK_BOOL(cfg, rc_resize_allowed);
+#endif
+    RANGE_CHECK_HI(cfg, rc_dropframe_thresh,   100);
+    RANGE_CHECK_HI(cfg, rc_resize_up_thresh,   100);
+    RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
+
+#if CONFIG_REALTIME_ONLY
+    RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_ONE_PASS);
+#elif CONFIG_MULTI_RES_ENCODING
+    if (ctx->base.enc.total_encoders > 1)
+        RANGE_CHECK(cfg,    g_pass,         VPX_RC_ONE_PASS, VPX_RC_ONE_PASS);
+#else
+    RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
+#endif
+
+    /* VP8 does not support a lower bound on the keyframe interval in
+     * automatic keyframe placement mode. */
+    if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist
+        && cfg->kf_min_dist > 0)
+        ERROR("kf_min_dist not supported in auto mode, use 0 "
+              "or kf_max_dist instead.");
+
+    RANGE_CHECK_BOOL(vp8_cfg,               enable_auto_alt_ref);
+    RANGE_CHECK(vp8_cfg, cpu_used,           -16, 16);
+
+#if CONFIG_REALTIME_ONLY && !CONFIG_TEMPORAL_DENOISING
+    RANGE_CHECK(vp8_cfg, noise_sensitivity,  0, 0);
+#else
+    RANGE_CHECK_HI(vp8_cfg, noise_sensitivity,  6);
+#endif
+
+    RANGE_CHECK(vp8_cfg, token_partitions,   VP8_ONE_TOKENPARTITION,
+                VP8_EIGHT_TOKENPARTITION);
+    RANGE_CHECK_HI(vp8_cfg, Sharpness,       7);
+    RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
+    RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
+    RANGE_CHECK(vp8_cfg, arnr_type,       1, 3);
+    RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
+    RANGE_CHECK_HI(vp8_cfg, screen_content_mode, 2);
+    if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q))
+        RANGE_CHECK(vp8_cfg, cq_level,
+                    cfg->rc_min_quantizer, cfg->rc_max_quantizer);
+
+#if !(CONFIG_REALTIME_ONLY)
+    if (cfg->g_pass == VPX_RC_LAST_PASS)
+    {
+        size_t           packet_sz = sizeof(FIRSTPASS_STATS);
+        int              n_packets = (int)(cfg->rc_twopass_stats_in.sz /
+                                          packet_sz);
+        FIRSTPASS_STATS *stats;
+
+        if (!cfg->rc_twopass_stats_in.buf)
+            ERROR("rc_twopass_stats_in.buf not set.");
+        if (cfg->rc_twopass_stats_in.sz % packet_sz)
+            ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
+        if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
+            ERROR("rc_twopass_stats_in requires at least two packets.");
+
+        stats = (void*)((char *)cfg->rc_twopass_stats_in.buf
+                + (n_packets - 1) * packet_sz);
+
+        if ((int)(stats->count + 0.5) != n_packets - 1)
+            ERROR("rc_twopass_stats_in missing EOS stats packet");
+    }
+#endif
+
+    RANGE_CHECK(cfg, ts_number_layers, 1, 5);
+    if (cfg->ts_number_layers > 1)
+    {
+        unsigned int i;
+        RANGE_CHECK_HI(cfg, ts_periodicity, 16);
+
+        for (i=1; i<cfg->ts_number_layers; i++)
+            if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i-1] &&
+                cfg->rc_target_bitrate > 0)
+                ERROR("ts_target_bitrate entries are not strictly increasing");
+
+        RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers-1], 1, 1);
+        for (i=cfg->ts_number_layers-2; i>0; i--)
+            if (cfg->ts_rate_decimator[i-1] != 2*cfg->ts_rate_decimator[i])
+                ERROR("ts_rate_decimator factors are not powers of 2");
+
+        /* Each of the ts_periodicity layer-id slots must name a real layer. */
+        for (i=0; i<cfg->ts_periodicity; i++)
+            RANGE_CHECK_HI(cfg, ts_layer_id[i], cfg->ts_number_layers-1);
+    }
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+    if(cfg->g_threads > (1 << vp8_cfg->token_partitions))
+        ERROR("g_threads cannot be bigger than number of token partitions");
+#endif
+
+    return VPX_CODEC_OK;
+}
+
+
+/* Reject input frames whose format or size does not fit this encoder. */
+static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
+                                    const vpx_image_t    *img)
+{
+    /* Only these four image formats are accepted. */
+    const int fmt_ok = img->fmt == VPX_IMG_FMT_YV12 ||
+                       img->fmt == VPX_IMG_FMT_I420 ||
+                       img->fmt == VPX_IMG_FMT_VPXI420 ||
+                       img->fmt == VPX_IMG_FMT_VPXYV12;
+
+    if (!fmt_ok)
+        ERROR("Invalid image format. Only YV12 and I420 images are supported");
+
+    /* The frame's d_w x d_h has to equal the configured g_w x g_h. */
+    if (!(img->d_w == ctx->cfg.g_w && img->d_h == ctx->cfg.g_h))
+        ERROR("Image size must match encoder init configuration size");
+
+    return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
+                                       vpx_codec_enc_cfg_t cfg,
+                                       struct vp8_extracfg vp8_cfg,
+                                       vpx_codec_priv_enc_mr_cfg_t *mr_cfg)
+{
+    oxcf->multi_threaded         = cfg.g_threads;
+    oxcf->Version               = cfg.g_profile;
+
+    oxcf->Width                 = cfg.g_w;
+    oxcf->Height                = cfg.g_h;
+    oxcf->timebase              = cfg.g_timebase;
+
+    oxcf->error_resilient_mode = cfg.g_error_resilient;
+
+    switch (cfg.g_pass)
+    {
+    case VPX_RC_ONE_PASS:
+        oxcf->Mode = MODE_BESTQUALITY;
+        break;
+    case VPX_RC_FIRST_PASS:
+        oxcf->Mode = MODE_FIRSTPASS;
+        break;
+    case VPX_RC_LAST_PASS:
+        oxcf->Mode = MODE_SECONDPASS_BEST;
+        break;
+    }
+
+    if (cfg.g_pass == VPX_RC_FIRST_PASS || cfg.g_pass == VPX_RC_ONE_PASS)
+    {
+        oxcf->allow_lag     = 0;
+        oxcf->lag_in_frames = 0;
+    }
+    else
+    {
+        oxcf->allow_lag     = (cfg.g_lag_in_frames) > 0;
+        oxcf->lag_in_frames = cfg.g_lag_in_frames;
+    }
+
+    oxcf->allow_df               = (cfg.rc_dropframe_thresh > 0);
+    oxcf->drop_frames_water_mark   = cfg.rc_dropframe_thresh;
+
+    oxcf->allow_spatial_resampling = cfg.rc_resize_allowed;
+    oxcf->resample_up_water_mark   = cfg.rc_resize_up_thresh;
+    oxcf->resample_down_water_mark = cfg.rc_resize_down_thresh;
+
+    if (cfg.rc_end_usage == VPX_VBR) {
+      oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
+    } else if (cfg.rc_end_usage == VPX_CBR) {
+      oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
+    } else if (cfg.rc_end_usage == VPX_CQ) {
+      oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
+    } else if (cfg.rc_end_usage == VPX_Q) {
+      oxcf->end_usage = USAGE_CONSTANT_QUALITY;
+    }
+
+    oxcf->target_bandwidth         = cfg.rc_target_bitrate;
+    oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
+
+    oxcf->best_allowed_q           = cfg.rc_min_quantizer;
+    oxcf->worst_allowed_q          = cfg.rc_max_quantizer;
+    oxcf->cq_level                 = vp8_cfg.cq_level;
+    oxcf->fixed_q = -1;
+
+    oxcf->under_shoot_pct          = cfg.rc_undershoot_pct;
+    oxcf->over_shoot_pct           = cfg.rc_overshoot_pct;
+
+    oxcf->maximum_buffer_size_in_ms   = cfg.rc_buf_sz;
+    oxcf->starting_buffer_level_in_ms = cfg.rc_buf_initial_sz;
+    oxcf->optimal_buffer_level_in_ms  = cfg.rc_buf_optimal_sz;
+
+    oxcf->maximum_buffer_size      = cfg.rc_buf_sz;
+    oxcf->starting_buffer_level    = cfg.rc_buf_initial_sz;
+    oxcf->optimal_buffer_level     = cfg.rc_buf_optimal_sz;
+
+    oxcf->two_pass_vbrbias         = cfg.rc_2pass_vbr_bias_pct;
+    oxcf->two_pass_vbrmin_section  = cfg.rc_2pass_vbr_minsection_pct;
+    oxcf->two_pass_vbrmax_section  = cfg.rc_2pass_vbr_maxsection_pct;
+
+    oxcf->auto_key                 = cfg.kf_mode == VPX_KF_AUTO
+                                       && cfg.kf_min_dist != cfg.kf_max_dist;
+    oxcf->key_freq                 = cfg.kf_max_dist;
+
+    oxcf->number_of_layers         = cfg.ts_number_layers;
+    oxcf->periodicity              = cfg.ts_periodicity;
+
+    if (oxcf->number_of_layers > 1)
+    {
+        memcpy (oxcf->target_bitrate, cfg.ts_target_bitrate,
+                sizeof(cfg.ts_target_bitrate));
+        memcpy (oxcf->rate_decimator, cfg.ts_rate_decimator,
+                sizeof(cfg.ts_rate_decimator));
+        memcpy (oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id));
+    }
+
+#if CONFIG_MULTI_RES_ENCODING
+    /* When mr_cfg is NULL, oxcf->mr_total_resolutions and oxcf->mr_encoder_id
+     * are both memset to 0, which ensures the correct logic under this
+     * situation.
+     */
+    if(mr_cfg)
+    {
+        oxcf->mr_total_resolutions        = mr_cfg->mr_total_resolutions;
+        oxcf->mr_encoder_id               = mr_cfg->mr_encoder_id;
+        oxcf->mr_down_sampling_factor.num = mr_cfg->mr_down_sampling_factor.num;
+        oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den;
+        oxcf->mr_low_res_mode_info        = mr_cfg->mr_low_res_mode_info;
+    }
+#else
+    (void)mr_cfg;
+#endif
+
+    oxcf->cpu_used               = vp8_cfg.cpu_used;
+    oxcf->encode_breakout        = vp8_cfg.static_thresh;
+    oxcf->play_alternate         = vp8_cfg.enable_auto_alt_ref;
+    oxcf->noise_sensitivity      = vp8_cfg.noise_sensitivity;
+    oxcf->Sharpness              = vp8_cfg.Sharpness;
+    oxcf->token_partitions       = vp8_cfg.token_partitions;
+
+    oxcf->two_pass_stats_in      = cfg.rc_twopass_stats_in;
+    oxcf->output_pkt_list        = vp8_cfg.pkt_list;
+
+    oxcf->arnr_max_frames        = vp8_cfg.arnr_max_frames;
+    oxcf->arnr_strength          = vp8_cfg.arnr_strength;
+    oxcf->arnr_type              = vp8_cfg.arnr_type;
+
+    oxcf->tuning                 = vp8_cfg.tuning;
+
+    oxcf->screen_content_mode    = vp8_cfg.screen_content_mode;
+
+    /*
+        printf("Current VP8 Settings: \n");
+        printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
+        printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
+        printf("Sharpness: %d\n",    oxcf->Sharpness);
+        printf("cpu_used: %d\n",  oxcf->cpu_used);
+        printf("Mode: %d\n",     oxcf->Mode);
+        printf("auto_key: %d\n",  oxcf->auto_key);
+        printf("key_freq: %d\n", oxcf->key_freq);
+        printf("end_usage: %d\n", oxcf->end_usage);
+        printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
+        printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
+        printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
+        printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);
+        printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
+        printf("fixed_q: %d\n",  oxcf->fixed_q);
+        printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
+        printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
+        printf("allow_spatial_resampling: %d\n",  oxcf->allow_spatial_resampling);
+        printf("resample_down_water_mark: %d\n", oxcf->resample_down_water_mark);
+        printf("resample_up_water_mark: %d\n", oxcf->resample_up_water_mark);
+        printf("allow_df: %d\n", oxcf->allow_df);
+        printf("drop_frames_water_mark: %d\n", oxcf->drop_frames_water_mark);
+        printf("two_pass_vbrbias: %d\n",  oxcf->two_pass_vbrbias);
+        printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
+        printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
+        printf("allow_lag: %d\n", oxcf->allow_lag);
+        printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
+        printf("play_alternate: %d\n", oxcf->play_alternate);
+        printf("Version: %d\n", oxcf->Version);
+        printf("multi_threaded: %d\n",   oxcf->multi_threaded);
+        printf("encode_breakout: %d\n", oxcf->encode_breakout);
+    */
+    return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t       *ctx,
+                                       const vpx_codec_enc_cfg_t  *cfg)
+{
+    vpx_codec_err_t res;
+    /* Re-configure a live encoder: vet cfg, then store it and push it to cpi. */
+    if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h)
+    {
+        if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
+            ERROR("Cannot change width or height after initialization");
+        if ((ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+            (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+            ERROR("Cannot increase width or height larger than their initial values");
+    }
+
+    /* Prevent increasing lag_in_frames. This check is stricter than it needs
+     * to be -- the limit is not increasing past the first lag_in_frames
+     * value, but we don't track the initial config, only the last successful
+     * config.
+     */
+    if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames))
+        ERROR("Cannot increase lag_in_frames");
+    /* NOTE(review): ERROR() is defined elsewhere; presumably it returns early. */
+    res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0);
+    /* Only a config that validated cleanly is stored and applied. */
+    if (!res)
+    {
+        ctx->cfg = *cfg;
+        set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
+        vp8_change_config(ctx->cpi, &ctx->oxcf);
+    }
+
+    return res;
+}
+
+/* Report vp8_get_quantizer()'s value through the caller-supplied int pointer. */
+static vpx_codec_err_t get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args)
+{
+  int *const out = va_arg(args, int *);
+  if (!out) return VPX_CODEC_INVALID_PARAM;
+  *out = vp8_get_quantizer(ctx->cpi);
+  return VPX_CODEC_OK;
+}
+
+/* Report vp8_reverse_trans(vp8_get_quantizer()) through the caller's int pointer. */
+static vpx_codec_err_t get_quantizer64(vpx_codec_alg_priv_t *ctx, va_list args)
+{
+  int *const out = va_arg(args, int *);
+  if (!out) return VPX_CODEC_INVALID_PARAM;
+  *out = vp8_reverse_trans(vp8_get_quantizer(ctx->cpi));
+  return VPX_CODEC_OK;
+}
+
+/* Validate a candidate extra config; commit and apply it only when it is valid. */
+static vpx_codec_err_t update_extracfg(vpx_codec_alg_priv_t *ctx,
+                                       const struct vp8_extracfg *extra_cfg)
+{
+  const vpx_codec_err_t status = validate_config(ctx, &ctx->cfg, extra_cfg, 0);
+  if (status != VPX_CODEC_OK) return status;  /* nothing is committed here */
+  ctx->vp8_cfg = *extra_cfg;
+  set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
+  vp8_change_config(ctx->cpi, &ctx->oxcf);
+  return status;
+}
+
+/* Control handler: stage cpu_used from the control argument and apply it. */
+static vpx_codec_err_t set_cpu_used(vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp8_extracfg staged = ctx->vp8_cfg;
+  staged.cpu_used = CAST(VP8E_SET_CPUUSED, args);
+  return update_extracfg(ctx, &staged);
+}
+
+/* Control handler: stage enable_auto_alt_ref from the argument and apply it. */
+static vpx_codec_err_t set_enable_auto_alt_ref(vpx_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  struct vp8_extracfg staged = ctx->vp8_cfg;
+  staged.enable_auto_alt_ref = CAST(VP8E_SET_ENABLEAUTOALTREF, args);
+  return update_extracfg(ctx, &staged);
+}
+
+/* Control handler: stage noise_sensitivity from the argument and apply it. */
+static vpx_codec_err_t set_noise_sensitivity(vpx_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct vp8_extracfg staged = ctx->vp8_cfg;
+  staged.noise_sensitivity = CAST(VP8E_SET_NOISE_SENSITIVITY, args);
+  return update_extracfg(ctx, &staged);
+}
+
+/* Control handler: stage Sharpness from the control argument and apply it. */
+static vpx_codec_err_t set_sharpness(vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp8_extracfg staged = ctx->vp8_cfg;
+  staged.Sharpness = CAST(VP8E_SET_SHARPNESS, args);
+  return update_extracfg(ctx, &staged);
+}
+
+/* Control handler: stage static_thresh from the argument and apply it. */
+static vpx_codec_err_t set_static_thresh(vpx_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  struct vp8_extracfg staged = ctx->vp8_cfg;
+  staged.static_thresh = CAST(VP8E_SET_STATIC_THRESHOLD, args);
+  return update_extracfg(ctx, &staged);
+}
+
+/* Control handler: stage token_partitions from the argument and apply it. */
+static vpx_codec_err_t set_token_partitions(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct vp8_extracfg staged = ctx->vp8_cfg;
+  staged.token_partitions = CAST(VP8E_SET_TOKEN_PARTITIONS, args);
+  return update_extracfg(ctx, &staged);
+}
+
+/* Control handler: stage arnr_max_frames from the argument and apply it. */
+static vpx_codec_err_t set_arnr_max_frames(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  struct vp8_extracfg staged = ctx->vp8_cfg;
+  staged.arnr_max_frames = CAST(VP8E_SET_ARNR_MAXFRAMES, args);
+  return update_extracfg(ctx, &staged);
+}
+
+/* Control handler: stage arnr_strength from the argument and apply it. */
+static vpx_codec_err_t set_arnr_strength(vpx_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  struct vp8_extracfg staged = ctx->vp8_cfg;
+  staged.arnr_strength = CAST(VP8E_SET_ARNR_STRENGTH, args);
+  return update_extracfg(ctx, &staged);
+}
+
+/* Control handler: stage arnr_type from the control argument and apply it. */
+static vpx_codec_err_t set_arnr_type(vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp8_extracfg staged = ctx->vp8_cfg;
+  staged.arnr_type = CAST(VP8E_SET_ARNR_TYPE, args);
+  return update_extracfg(ctx, &staged);
+}
+
+/* Control handler: stage tuning from the control argument and apply it. */
+static vpx_codec_err_t set_tuning(vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp8_extracfg staged = ctx->vp8_cfg;
+  staged.tuning = CAST(VP8E_SET_TUNING, args);
+  return update_extracfg(ctx, &staged);
+}
+
+/* Control handler: stage cq_level from the control argument and apply it. */
+static vpx_codec_err_t set_cq_level(vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp8_extracfg staged = ctx->vp8_cfg;
+  staged.cq_level = CAST(VP8E_SET_CQ_LEVEL, args);
+  return update_extracfg(ctx, &staged);
+}
+
+/* Control handler: stage rc_max_intra_bitrate_pct from the control argument
+ * and submit the staged copy through update_extracfg(). */
+static vpx_codec_err_t set_rc_max_intra_bitrate_pct(vpx_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct vp8_extracfg staged = ctx->vp8_cfg;
+  staged.rc_max_intra_bitrate_pct = CAST(VP8E_SET_MAX_INTRA_BITRATE_PCT, args);
+  return update_extracfg(ctx, &staged);
+}
+
+/* Control handler: stage screen_content_mode from the control argument
+ * and submit the staged copy through update_extracfg(). */
+static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  struct vp8_extracfg staged = ctx->vp8_cfg;
+  staged.screen_content_mode = CAST(VP8E_SET_SCREEN_CONTENT_MODE, args);
+  return update_extracfg(ctx, &staged);
+}
+
+/* Allocate the multi-resolution shared frame info and return it via *mem_loc.
+ * Gives VPX_CODEC_MEM_ERROR, leaving nothing allocated, if a calloc fails. */
+static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
+                                        void **mem_loc)
+{
+    vpx_codec_err_t res = 0;
+#if CONFIG_MULTI_RES_ENCODING
+    LOWER_RES_FRAME_INFO *shared_mem_loc;
+    int mb_cols = ((cfg->g_w + 15) >>4); /* width in 16-pixel units */
+    int mb_rows = ((cfg->g_h + 15) >>4); /* height in 16-pixel units */
+
+    shared_mem_loc = calloc(1, sizeof(LOWER_RES_FRAME_INFO));
+    if(!shared_mem_loc)
+        return VPX_CODEC_MEM_ERROR; /* bail out instead of dereferencing NULL */
+
+    shared_mem_loc->mb_info = calloc(mb_rows*mb_cols, sizeof(LOWER_RES_MB_INFO));
+    if(!(shared_mem_loc->mb_info))
+    {
+        free(shared_mem_loc); /* do not leak the header allocation */
+        res = VPX_CODEC_MEM_ERROR;
+    }
+    else
+    {
+        *mem_loc = (void *)shared_mem_loc;
+        res = VPX_CODEC_OK;
+    }
+#else
+    (void)cfg;
+    (void)mem_loc;
+#endif
+    return res;
+}
+
+static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
+                                 vpx_codec_priv_enc_mr_cfg_t *mr_cfg)
+{
+    vpx_codec_err_t        res = VPX_CODEC_OK;
+    /* Encoder init hook: on first use, allocate the private state, size the
+     * output buffer, validate the config and create the compressor. */
+    vp8_rtcd();
+    vpx_dsp_rtcd();
+    vpx_scale_rtcd();
+    /* Private state is built only once; a ctx that already has priv is kept. */
+    if (!ctx->priv)
+    {
+        struct vpx_codec_alg_priv *priv =
+            (struct vpx_codec_alg_priv *)vpx_calloc(1, sizeof(*priv));
+
+        if (!priv)
+        {
+            return VPX_CODEC_MEM_ERROR;
+        }
+
+        ctx->priv = (vpx_codec_priv_t *)priv;
+        ctx->priv->init_flags = ctx->init_flags;
+
+        if (ctx->config.enc)
+        {
+            /* Update the reference to the config structure to an
+             * internal copy.
+             */
+            priv->cfg = *ctx->config.enc;
+            ctx->config.enc = &priv->cfg;
+        }
+
+        priv->vp8_cfg = default_extracfg;
+        priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
+        /* Output buffer is sized from the frame dimensions, minimum 32768 bytes. */
+        priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
+
+        if (priv->cx_data_sz < 32768) priv->cx_data_sz = 32768;
+
+        priv->cx_data = malloc(priv->cx_data_sz);
+        /* NOTE(review): on failure priv stays attached to ctx->priv; presumably the caller's destroy path frees it -- confirm. */
+        if (!priv->cx_data)
+        {
+            return VPX_CODEC_MEM_ERROR;
+        }
+
+        if(mr_cfg)
+            ctx->priv->enc.total_encoders   = mr_cfg->mr_total_resolutions;
+        else
+            ctx->priv->enc.total_encoders   = 1;
+
+        once(vp8_initialize_enc);
+
+        res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0);
+        /* The compressor is created only for a config that validated cleanly. */
+        if (!res)
+        {
+            set_vp8e_config(&priv->oxcf, priv->cfg, priv->vp8_cfg, mr_cfg);
+            priv->cpi = vp8_create_compressor(&priv->oxcf);
+            if (!priv->cpi)
+                res = VPX_CODEC_MEM_ERROR;
+        }
+    }
+
+    return res;
+}
+
+/* Free shared multi-encoder memory (last encoder only), cx_data, cpi and ctx. */
+static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx)
+{
+#if CONFIG_MULTI_RES_ENCODING
+    if (ctx->oxcf.mr_total_resolutions > 0 &&
+        ctx->oxcf.mr_encoder_id == ctx->oxcf.mr_total_resolutions - 1)
+    {
+        LOWER_RES_FRAME_INFO *shared = (LOWER_RES_FRAME_INFO *)ctx->oxcf.mr_low_res_mode_info;
+        free(shared->mb_info);
+        free(shared);
+    }
+#endif
+    free(ctx->cx_data);
+    vp8_remove_compressor(&ctx->cpi);
+    vpx_free(ctx);
+    return VPX_CODEC_OK;
+}
+
+/* Describe img's planes, dimensions and strides in yv12 without copying pixel
+ * data; chroma dimensions are the display dimensions halved, rounding up. */
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
+                                       YV12_BUFFER_CONFIG  *yv12)
+{
+    const int luma_w = img->d_w;
+    const int luma_h = img->d_h;
+    const int chroma_w = (img->d_w + 1) / 2;
+    const int chroma_h = (img->d_h + 1) / 2;
+
+    yv12->y_buffer = img->planes[VPX_PLANE_Y];
+    yv12->y_stride = img->stride[VPX_PLANE_Y];
+    yv12->y_width  = luma_w;
+    yv12->y_height = luma_h;
+    yv12->y_crop_width  = luma_w;
+    yv12->y_crop_height = luma_h;
+
+    yv12->u_buffer = img->planes[VPX_PLANE_U];
+    yv12->v_buffer = img->planes[VPX_PLANE_V];
+    yv12->uv_stride = img->stride[VPX_PLANE_U];
+    yv12->uv_width = chroma_w;
+    yv12->uv_height = chroma_h;
+    yv12->uv_crop_width = chroma_w;
+    yv12->uv_crop_height = chroma_h;
+    yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+    return VPX_CODEC_OK;
+}
+
+/* Choose the compressor mode for the next frame from the deadline and the
+ * configured pass, and reconfigure the compressor when the mode changes. */
+static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
+                                    unsigned long          duration,
+                                    unsigned long          deadline)
+{
+    unsigned int mode;
+
+#if !(CONFIG_REALTIME_ONLY)
+    if (!deadline)
+    {
+        /* No deadline given: use best quality mode. */
+        mode = MODE_BESTQUALITY;
+    }
+    else
+    {
+        /* Frame duration converted from stream timebase to microseconds. */
+        const uint64_t duration_us = (uint64_t)duration * 1000000
+                                     * (uint64_t)ctx->cfg.g_timebase.num
+                                     / (uint64_t)ctx->cfg.g_timebase.den;
+        /* A deadline longer than the time this frame is shown selects good
+         * quality mode; anything else selects realtime mode. */
+        if (deadline > duration_us)
+            mode = MODE_GOODQUALITY;
+        else
+            mode = MODE_REALTIME;
+    }
+#else
+    mode = MODE_REALTIME;
+#endif
+
+    if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
+        mode = MODE_FIRSTPASS;
+    else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS && mode == MODE_BESTQUALITY)
+        mode = MODE_SECONDPASS_BEST;
+    else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
+        mode = MODE_SECONDPASS;
+
+    if (ctx->oxcf.Mode == mode) return;
+    ctx->oxcf.Mode = mode;
+    vp8_change_config(ctx->cpi, &ctx->oxcf);
+}
+
+/* Translate per-frame VP8_EFLAG_* bits into reference-usage, reference-update
+ * and entropy-update requests on the compressor. Contradictory GF/ARF flags
+ * are rejected with VPX_CODEC_INVALID_PARAM before anything is applied. */
+static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx,
+                                                int flags)
+{
+    const int no_ref_mask = VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF
+                            | VP8_EFLAG_NO_REF_ARF;
+    const int upd_mask = VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF
+                         | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF
+                         | VP8_EFLAG_FORCE_ARF;
+    const int gf_conflict = (flags & VP8_EFLAG_NO_UPD_GF)
+                            && (flags & VP8_EFLAG_FORCE_GF);
+    const int arf_conflict = (flags & VP8_EFLAG_NO_UPD_ARF)
+                             && (flags & VP8_EFLAG_FORCE_ARF);
+
+    /* A buffer cannot be both forced to update and barred from updating. */
+    if (gf_conflict || arf_conflict)
+    {
+        ctx->base.err_detail = "Conflicting flags.";
+        return VPX_CODEC_INVALID_PARAM;
+    }
+
+    if (flags & no_ref_mask)
+    {
+        /* Start from 7 and XOR out each reference the flags exclude. */
+        int ref = 7;
+
+        if (flags & VP8_EFLAG_NO_REF_LAST) ref ^= VP8_LAST_FRAME;
+        if (flags & VP8_EFLAG_NO_REF_GF)   ref ^= VP8_GOLD_FRAME;
+        if (flags & VP8_EFLAG_NO_REF_ARF)  ref ^= VP8_ALTR_FRAME;
+
+        vp8_use_as_reference(ctx->cpi, ref);
+    }
+
+    if (flags & upd_mask)
+    {
+        /* Same scheme for the value passed to vp8_update_reference(). */
+        int upd = 7;
+
+        if (flags & VP8_EFLAG_NO_UPD_LAST) upd ^= VP8_LAST_FRAME;
+        if (flags & VP8_EFLAG_NO_UPD_GF)   upd ^= VP8_GOLD_FRAME;
+        if (flags & VP8_EFLAG_NO_UPD_ARF)  upd ^= VP8_ALTR_FRAME;
+
+        vp8_update_reference(ctx->cpi, upd);
+    }
+
+    if (flags & VP8_EFLAG_NO_UPD_ENTROPY)
+    {
+        vp8_update_entropy(ctx->cpi, 0);
+    }
+
+    return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,
+                                   const vpx_image_t     *img,
+                                   vpx_codec_pts_t        pts,
+                                   unsigned long          duration,
+                                   vpx_enc_frame_flags_t  flags,
+                                   unsigned long          deadline)
+{
+    vpx_codec_err_t res = VPX_CODEC_OK;
+    /* Submit img (may be NULL: nothing new is submitted) and queue output packets. */
+    if (!ctx->cfg.rc_target_bitrate)
+        return res;
+    /* NOTE: a zero target bitrate makes the call above return without encoding. */
+    if (img)
+        res = validate_img(ctx, img);
+
+    if (!res)
+        res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
+    /* The mode pick and packet-list reset run even when validation failed. */
+    pick_quickcompress_mode(ctx, duration, deadline);
+    vpx_codec_pkt_list_init(&ctx->pkt_list);
+
+    // If no flags are set in the encode call, then use the frame flags as
+    // defined via the control function: vp8e_set_frame_flags.
+    if (!flags) {
+        flags = ctx->control_frame_flags;
+    }
+    ctx->control_frame_flags = 0;
+
+    if (!res)
+        res = set_reference_and_update(ctx, flags);
+
+    /* Handle fixed keyframe intervals */
+    if (ctx->cfg.kf_mode == VPX_KF_AUTO
+        && ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist)
+    {
+        if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist)
+        {
+            flags |= VPX_EFLAG_FORCE_KF;
+            ctx->fixed_kf_cntr = 1;
+        }
+    }
+
+    /* Submit the frame (if any) and drain compressed data; skipped on error */
+    if (!res && ctx->cpi)
+    {
+        unsigned int lib_flags;
+        YV12_BUFFER_CONFIG sd;
+        int64_t dst_time_stamp, dst_end_time_stamp;
+        unsigned long size, cx_data_sz;
+        unsigned char *cx_data;
+        unsigned char *cx_data_end;
+        int comp_data_state = 0;
+
+        /* Set up internal flags */
+        if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
+            ((VP8_COMP *)ctx->cpi)->b_calculate_psnr = 1;
+
+        if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION)
+            ((VP8_COMP *)ctx->cpi)->output_partition = 1;
+
+        /* Convert API flags to internal codec lib flags */
+        lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+        /* vp8 use 10,000,000 ticks/second as time stamp */
+        dst_time_stamp    = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+        dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+
+        if (img != NULL)
+        {
+            res = image2yuvconfig(img, &sd);
+
+            if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
+                                      &sd, dst_time_stamp, dst_end_time_stamp))
+            {
+                VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
+                res = update_error_state(ctx, &cpi->common.error);
+            }
+
+            /* reset for next frame */
+            ctx->next_frame_flag = 0;
+        }
+
+        cx_data = ctx->cx_data;
+        cx_data_sz = ctx->cx_data_sz;
+        cx_data_end = ctx->cx_data + cx_data_sz;
+        lib_flags = 0;
+        /* Keep pulling data while at least half of the output buffer is free. */
+        while (cx_data_sz >= ctx->cx_data_sz / 2)
+        {
+            comp_data_state = vp8_get_compressed_data(ctx->cpi,
+                                                  &lib_flags,
+                                                  &size,
+                                                  cx_data,
+                                                  cx_data_end,
+                                                  &dst_time_stamp,
+                                                  &dst_end_time_stamp,
+                                                  !img);
+
+            if(comp_data_state == VPX_CODEC_CORRUPT_FRAME)
+                return VPX_CODEC_CORRUPT_FRAME;
+            else if(comp_data_state == -1)
+                break;
+
+            if (size)
+            {
+                vpx_codec_pts_t    round, delta;
+                vpx_codec_cx_pkt_t pkt;
+                VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
+
+                /* Add the frame packet to the list of returned packets. */
+                round = (vpx_codec_pts_t)10000000
+                        * ctx->cfg.g_timebase.num / 2 - 1;
+                delta = (dst_end_time_stamp - dst_time_stamp);
+                pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+                pkt.data.frame.pts =
+                    (dst_time_stamp * ctx->cfg.g_timebase.den + round)
+                    / ctx->cfg.g_timebase.num / 10000000;
+                pkt.data.frame.duration = (unsigned long)
+                    ((delta * ctx->cfg.g_timebase.den + round)
+                    / ctx->cfg.g_timebase.num / 10000000);
+                pkt.data.frame.flags = lib_flags << 16;
+
+                if (lib_flags & FRAMEFLAGS_KEY)
+                    pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
+
+                if (!cpi->common.show_frame)
+                {
+                    pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
+
+                    /* This timestamp should be as close as possible to the
+                     * prior PTS so that if a decoder uses pts to schedule when
+                     * to do this, we start right after last frame was decoded.
+                     * Invisible frames have no duration.
+                     */
+                    pkt.data.frame.pts = ((cpi->last_time_stamp_seen
+                        * ctx->cfg.g_timebase.den + round)
+                        / ctx->cfg.g_timebase.num / 10000000) + 1;
+                    pkt.data.frame.duration = 0;
+                }
+
+                if (cpi->droppable)
+                    pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
+
+                if (cpi->output_partition)
+                {
+                    int i;
+                    const int num_partitions =
+                            (1 << cpi->common.multi_token_partition) + 1;
+
+                    pkt.data.frame.flags |= VPX_FRAME_IS_FRAGMENT;
+
+                    for (i = 0; i < num_partitions; ++i)
+                    {
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                        pkt.data.frame.buf = cpi->partition_d[i];
+#else
+                        pkt.data.frame.buf = cx_data;
+                        cx_data += cpi->partition_sz[i];
+                        cx_data_sz -= cpi->partition_sz[i];
+#endif
+                        pkt.data.frame.sz = cpi->partition_sz[i];
+                        pkt.data.frame.partition_id = i;
+                        /* don't set the fragment bit for the last partition */
+                        if (i == (num_partitions - 1))
+                            pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT;
+                        vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+                    }
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+                    /* In lagged mode the encoder can buffer multiple frames.
+                     * We don't want this in partitioned output because
+                     * partitions are spread all over the output buffer.
+                     * So, force an exit!
+                     */
+                    cx_data_sz -= ctx->cx_data_sz / 2;
+#endif
+                }
+                else
+                {
+                    pkt.data.frame.buf = cx_data;
+                    pkt.data.frame.sz  = size;
+                    pkt.data.frame.partition_id = -1;
+                    vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+                    cx_data += size;
+                    cx_data_sz -= size;
+                }
+            }
+        }
+    }
+
+    return res;
+}
+
+
+/* Return the next queued output packet for iter, via vpx_codec_pkt_list_get(). */
+static const vpx_codec_cx_pkt_t *vp8e_get_cxdata(vpx_codec_alg_priv_t *ctx,
+                                                 vpx_codec_iter_t *iter) {
+    return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);
+}
+
+/* VP8_SET_REFERENCE: describe the caller's image as a YV12 buffer and pass it
+ * to vp8_set_reference() for the given frame type. A NULL argument is
+ * rejected with VPX_CODEC_INVALID_PARAM. */
+static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx,
+                                          va_list args)
+{
+    vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *);
+    YV12_BUFFER_CONFIG sd;
+
+    if (!frame)
+        return VPX_CODEC_INVALID_PARAM;
+
+    /* image2yuvconfig() always returns VPX_CODEC_OK, so its status is dropped. */
+    image2yuvconfig(&frame->img, &sd);
+    vp8_set_reference(ctx->cpi, frame->frame_type, &sd);
+
+    return VPX_CODEC_OK;
+}
+
+/* VP8_COPY_REFERENCE: describe the caller's image as a YV12 buffer and pass it
+ * to vp8_get_reference() for the given frame type. A NULL argument is
+ * rejected with VPX_CODEC_INVALID_PARAM. */
+static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx,
+                                          va_list args)
+{
+    vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *);
+    YV12_BUFFER_CONFIG sd;
+
+    if (!frame)
+        return VPX_CODEC_INVALID_PARAM;
+
+    /* image2yuvconfig() always returns VPX_CODEC_OK, so its status is dropped. */
+    image2yuvconfig(&frame->img, &sd);
+    vp8_get_reference(ctx->cpi, frame->frame_type, &sd);
+
+    return VPX_CODEC_OK;
+}
+
+/* VP8_SET_POSTPROC: remember the post-processing settings used for preview
+ * frames. Reports VPX_CODEC_INCAPABLE when built without CONFIG_POSTPROC. */
+static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx,
+                                          va_list args)
+{
+#if CONFIG_POSTPROC
+    const vp8_postproc_cfg_t *const cfg = va_arg(args, vp8_postproc_cfg_t *);
+
+    if (!cfg)
+        return VPX_CODEC_INVALID_PARAM;
+
+    ctx->preview_ppcfg = *cfg;
+    return VPX_CODEC_OK;
+#else
+    (void)ctx;
+    (void)args;
+    return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+
+/* Build a vpx_image_t view of the compressor's current preview frame.
+ *
+ * Post-processing flags are copied from ctx->preview_ppcfg when its
+ * post_proc_flag is set; otherwise they stay zeroed. Returns
+ * &ctx->preview_img filled in from the YV12 description that
+ * vp8_get_preview_raw_frame() produced, or NULL when that call fails.
+ */
+static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
+{
+    YV12_BUFFER_CONFIG sd;
+    vp8_ppflags_t flags = {0};
+    vpx_image_t *const preview = &ctx->preview_img;
+
+    if (ctx->preview_ppcfg.post_proc_flag)
+    {
+        flags.post_proc_flag        = ctx->preview_ppcfg.post_proc_flag;
+        flags.deblocking_level      = ctx->preview_ppcfg.deblocking_level;
+        flags.noise_level           = ctx->preview_ppcfg.noise_level;
+    }
+
+    if (vp8_get_preview_raw_frame(ctx->cpi, &sd, &flags) != 0)
+    {
+        return NULL;
+    }
+
+    /* Pixel format: I420, 12 bits per pixel, chroma shift of 1 on both axes. */
+    preview->fmt = VPX_IMG_FMT_I420;
+    preview->bps = 12;
+    preview->x_chroma_shift = 1;
+    preview->y_chroma_shift = 1;
+
+    /* w/h and d_w/d_h are all taken from the luma plane size. */
+    preview->w   = sd.y_width;
+    preview->h   = sd.y_height;
+    preview->d_w = sd.y_width;
+    preview->d_h = sd.y_height;
+
+    /* Plane pointers alias sd's buffers; no pixel data is copied. */
+    preview->planes[VPX_PLANE_Y] = sd.y_buffer;
+    preview->planes[VPX_PLANE_U] = sd.u_buffer;
+    preview->planes[VPX_PLANE_V] = sd.v_buffer;
+
+    /* Both chroma planes share the single uv stride. */
+    preview->stride[VPX_PLANE_Y] = sd.y_stride;
+    preview->stride[VPX_PLANE_U] = sd.uv_stride;
+    preview->stride[VPX_PLANE_V] = sd.uv_stride;
+
+    return preview;
+}
+
+/* VP8E_SET_FRAME_FLAGS: latch flags for the next encode call and apply them now. */
+static vpx_codec_err_t vp8e_set_frame_flags(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+    const int requested = va_arg(args, int);
+    ctx->control_frame_flags = requested;
+    return set_reference_and_update(ctx, requested);
+}
+
+/* Set cpi->temporal_layer_id; ids outside [0, ts_number_layers) are invalid. */
+static vpx_codec_err_t vp8e_set_temporal_layer_id(vpx_codec_alg_priv_t *ctx,
+                                                  va_list args)
+{
+    const int id = va_arg(args, int);
+    const int layers = (int)ctx->cfg.ts_number_layers;
+    if (id < 0 || id >= layers) return VPX_CODEC_INVALID_PARAM;
+    ctx->cpi->temporal_layer_id = id;
+    return VPX_CODEC_OK;
+}
+
+/* VP8E_SET_ROI_MAP: forward the caller's region-of-interest map to
+ * vp8_set_roimap(). NULL or a nonzero result yields VPX_CODEC_INVALID_PARAM. */
+static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
+                                        va_list args)
+{
+    vpx_roi_map_t *const roi = va_arg(args, vpx_roi_map_t *);
+
+    if (!roi)
+        return VPX_CODEC_INVALID_PARAM;
+
+    /* Any nonzero result from the compressor is reported as an invalid parameter. */
+    if (vp8_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
+                       roi->delta_q, roi->delta_lf, roi->static_threshold))
+        return VPX_CODEC_INVALID_PARAM;
+
+    return VPX_CODEC_OK;
+}
+
+
+/* VP8E_SET_ACTIVEMAP: forward the caller's active map to vp8_set_active_map().
+ * A NULL argument or a nonzero result yields VPX_CODEC_INVALID_PARAM. */
+static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx,
+                                          va_list args)
+{
+    vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *);
+
+    if (!map)
+    {
+        return VPX_CODEC_INVALID_PARAM;
+    }
+
+    /* Any nonzero result from the compressor is reported as an invalid parameter. */
+    if (vp8_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
+        return VPX_CODEC_INVALID_PARAM;
+
+    return VPX_CODEC_OK;
+}
+
+/* VP8E_SET_SCALEMODE: apply the requested horizontal/vertical scaling modes
+ * through vp8_set_internal_size(). On success the next frame is flagged as a
+ * key frame; NULL or a compressor failure gives VPX_CODEC_INVALID_PARAM. */
+static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
+                                          va_list args)
+{
+    const vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
+    VPX_SCALING horiz, vert;
+
+    if (!mode)
+    {
+        return VPX_CODEC_INVALID_PARAM;
+    }
+
+    horiz = (VPX_SCALING)mode->h_scaling_mode;
+    vert = (VPX_SCALING)mode->v_scaling_mode;
+
+    if (vp8_set_internal_size(ctx->cpi, horiz, vert))
+    {
+        return VPX_CODEC_INVALID_PARAM;
+    }
+
+    /* Force the next frame to be a key frame so the scaling mode takes effect. */
+    ctx->next_frame_flag |= FRAMEFLAGS_KEY;
+    return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] =
+{   /* Each entry pairs a control id with the handler that services it. */
+    {VP8_SET_REFERENCE,                 vp8e_set_reference},
+    {VP8_COPY_REFERENCE,                vp8e_get_reference},
+    {VP8_SET_POSTPROC,                  vp8e_set_previewpp},
+    {VP8E_SET_FRAME_FLAGS,              vp8e_set_frame_flags},
+    {VP8E_SET_TEMPORAL_LAYER_ID,        vp8e_set_temporal_layer_id},
+    {VP8E_SET_ROI_MAP,                  vp8e_set_roi_map},
+    {VP8E_SET_ACTIVEMAP,                vp8e_set_activemap},
+    {VP8E_SET_SCALEMODE,                vp8e_set_scalemode},
+    {VP8E_SET_CPUUSED,                  set_cpu_used},
+    {VP8E_SET_NOISE_SENSITIVITY,        set_noise_sensitivity},
+    {VP8E_SET_ENABLEAUTOALTREF,         set_enable_auto_alt_ref},
+    {VP8E_SET_SHARPNESS,                set_sharpness},
+    {VP8E_SET_STATIC_THRESHOLD,         set_static_thresh},
+    {VP8E_SET_TOKEN_PARTITIONS,         set_token_partitions},
+    {VP8E_GET_LAST_QUANTIZER,           get_quantizer},
+    {VP8E_GET_LAST_QUANTIZER_64,        get_quantizer64},
+    {VP8E_SET_ARNR_MAXFRAMES,           set_arnr_max_frames},
+    {VP8E_SET_ARNR_STRENGTH ,           set_arnr_strength},
+    {VP8E_SET_ARNR_TYPE     ,           set_arnr_type},
+    {VP8E_SET_TUNING,                   set_tuning},
+    {VP8E_SET_CQ_LEVEL,                 set_cq_level},
+    {VP8E_SET_MAX_INTRA_BITRATE_PCT,    set_rc_max_intra_bitrate_pct},
+    {VP8E_SET_SCREEN_CONTENT_MODE,      set_screen_content_mode},
+    { -1, NULL},    /* presumably the end-of-table marker -- confirm in the dispatcher */
+};
+
+static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
+{   /* Default encoder configuration; initializers are positional, in struct order. */
+    {
+    0,
+    {
+        0,                  /* g_usage */
+        0,                  /* g_threads */
+        0,                  /* g_profile */
+
+        320,                /* g_w */
+        240,                /* g_h */
+        VPX_BITS_8,         /* g_bit_depth */
+        8,                  /* g_input_bit_depth */
+
+        {1, 30},            /* g_timebase */
+
+        0,                  /* g_error_resilient */
+
+        VPX_RC_ONE_PASS,    /* g_pass */
+
+        0,                  /* g_lag_in_frames */
+
+        0,                  /* rc_dropframe_thresh */
+        0,                  /* rc_resize_allowed */
+        1,                  /* rc_scaled_width */
+        1,                  /* rc_scaled_height */
+        60,                 /* rc_resize_down_threshold */
+        30,                 /* rc_resize_up_threshold */
+
+        VPX_VBR,            /* rc_end_usage */
+        {0},                /* rc_twopass_stats_in */
+        {0},                /* rc_firstpass_mb_stats_in */
+        256,                /* rc_target_bitrate */
+        4,                  /* rc_min_quantizer */
+        63,                 /* rc_max_quantizer */
+        100,                /* rc_undershoot_pct */
+        100,                /* rc_overshoot_pct */
+
+        6000,               /* rc_max_buffer_size */
+        4000,               /* rc_buffer_initial_size; */
+        5000,               /* rc_buffer_optimal_size; */
+
+        50,                 /* two-pass VBR bias -- exact field name not visible here */
+        0,                  /* rc_2pass_vbr_minsection_pct */
+        400,                /* rc_2pass_vbr_maxsection_pct */
+
+        /* keyframing settings (kf) */
+        VPX_KF_AUTO,        /* kf_mode */
+        0,                  /* kf_min_dist */
+        128,                /* kf_max_dist */
+
+        VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
+        {0},                /* NOTE(review): unlabeled slot -- confirm field in vpx_encoder.h */
+        {0},                /* ss_target_bitrate */
+        1,                  /* ts_number_layers */
+        {0},                /* ts_target_bitrate */
+        {0},                /* ts_rate_decimator */
+        0,                  /* ts_periodicity */
+        {0},                /* ts_layer_id */
+    }},
+};
+
+
+#ifndef VERSION_STRING
+#define VERSION_STRING      /* empty fallback so the name literal below still concatenates */
+#endif
+CODEC_INTERFACE(vpx_codec_vp8_cx) =
+{
+    "WebM Project VP8 Encoder" VERSION_STRING,
+    VPX_CODEC_INTERNAL_ABI_VERSION,
+    VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR |
+    VPX_CODEC_CAP_OUTPUT_PARTITION,
+    /* vpx_codec_caps_t          caps; */
+    vp8e_init,          /* vpx_codec_init_fn_t       init; */
+    vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
+    vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+    {   /* decoder hooks: all NULL in this encoder-side table */
+        NULL,    /* vpx_codec_peek_si_fn_t    peek_si; */
+        NULL,    /* vpx_codec_get_si_fn_t     get_si; */
+        NULL,    /* vpx_codec_decode_fn_t     decode; */
+        NULL,    /* vpx_codec_frame_get_fn_t  frame_get; */
+        NULL,    /* vpx_codec_set_fb_fn_t     set_fb_fn; */
+    },
+    {
+        1,                  /* 1 cfg map */
+        vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    cfg_maps; */
+        vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
+        vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   get_cx_data; */
+        vp8e_set_config,    /* config-change hook (field name not visible here) */
+        NULL,               /* NOTE(review): unused slot -- confirm field in vpx_codec_internal.h */
+        vp8e_get_preview,   /* preview-frame hook (field name not visible here) */
+        vp8e_mr_alloc_mem,  /* multi-resolution shared-memory allocator hook */
+    } /* encoder functions */
+};
diff --git a/libs/libvpx/vp8/vp8_dx_iface.c b/libs/libvpx/vp8/vp8_dx_iface.c
new file mode 100644
index 0000000000..a12a2ad0e1
--- /dev/null
+++ b/libs/libvpx/vp8/vp8_dx_iface.c
@@ -0,0 +1,820 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <string.h>
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx/vpx_decoder.h"
+#include "vpx/vp8dx.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+#include "common/alloccommon.h"
+#include "common/common.h"
+#include "common/onyxd.h"
+#include "decoder/onyxd_int.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "decoder/error_concealment.h"
+#endif
+#include "decoder/decoderthreading.h"
+
+#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) /* 0 when CONFIG_POSTPROC is 0 */
+#define VP8_CAP_ERROR_CONCEALMENT (CONFIG_ERROR_CONCEALMENT ? \
+                                    VPX_CODEC_CAP_ERROR_CONCEALMENT : 0) /* 0 when CONFIG_ERROR_CONCEALMENT is 0 */
+
+typedef vpx_codec_stream_info_t  vp8_stream_info_t; /* plain alias: no VP8-specific fields are added */
+
+/* Structures for handling memory allocations */
+typedef enum
+{
+    VP8_SEG_ALG_PRIV     = 256,
+    VP8_SEG_MAX
+} mem_seg_id_t;
+#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0]))) /* element count of a true array, as int */
+
+struct vpx_codec_alg_priv
+{
+    vpx_codec_priv_t        base;             /* common header; init_flags and err_detail are reached through it */
+    vpx_codec_dec_cfg_t     cfg;              /* internal copy of the caller's decoder config (made in vp8_init_ctx) */
+    vp8_stream_info_t       si;               /* stream info filled in by the header peek in vp8_decode */
+    int                     decoder_init;     /* set to 1 by vp8_decode after it creates the decoder instances */
+    int                     postproc_cfg_set; /* set to 1 once vp8_set_postproc stored a config */
+    vp8_postproc_cfg_t      postproc_cfg;     /* post-processing settings read by vp8_get_frame */
+#if CONFIG_POSTPROC_VISUALIZER
+    unsigned int            dbg_postproc_flag;
+    int                     dbg_color_ref_frame_flag; /* the four flags below are written by the vp8_set_dbg_* controls */
+    int                     dbg_color_mb_modes_flag;
+    int                     dbg_color_b_modes_flag;
+    int                     dbg_display_mv_flag;
+#endif
+    vpx_decrypt_cb          decrypt_cb;       /* installed by vp8_set_decryptor; NULL skips decryption in the header peek */
+    void                    *decrypt_state;   /* opaque argument handed to decrypt_cb */
+    vpx_image_t             img;              /* image descriptor returned by vp8_get_frame */
+    int                     img_setup;
+    struct frame_buffers    yv12_frame_buffers; /* decoder instance(s); the functions in this file use pbi[0] */
+    void                    *user_priv;       /* value passed to the last vp8_decode, attached to output images */
+    FRAGMENT_DATA           fragments;        /* input fragments gathered by update_fragments */
+};
+
+/* Allocate the zeroed private decoder context and attach it to ctx.
+ * On allocation failure ctx->priv is left NULL for the caller to detect. */
+static void vp8_init_ctx(vpx_codec_ctx_t *ctx)
+{
+    vpx_codec_alg_priv_t *priv =
+        (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv));
+
+    ctx->priv = (vpx_codec_priv_t *)priv;
+    if (!priv) return;
+
+    ctx->priv->init_flags = ctx->init_flags;
+    priv->si.sz = sizeof(priv->si);
+    priv->decrypt_cb = NULL;
+    priv->decrypt_state = NULL;
+
+    if (ctx->config.dec)
+    {
+        /* Update the reference to the config structure to an internal copy. */
+        priv->cfg = *ctx->config.dec;
+        ctx->config.dec = &priv->cfg;
+    }
+}
+
+/* Decoder init hook: runs the RTCD setup calls and, if ctx has no private
+ * state yet, allocates it. Returns VPX_CODEC_MEM_ERROR when that fails. */
+static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
+                                vpx_codec_priv_enc_mr_cfg_t *data)
+{
+    vpx_codec_err_t res = VPX_CODEC_OK;
+    vpx_codec_alg_priv_t *priv = NULL;
+    (void) data;
+
+    vp8_rtcd();
+    vpx_dsp_rtcd();
+    vpx_scale_rtcd();
+
+    /* Only the vpx_codec_alg_priv_t structure is allocated here. More
+     * memory may be required once the stream information becomes known. */
+    if (!ctx->priv) {
+      vp8_init_ctx(ctx);
+      if (!ctx->priv) return VPX_CODEC_MEM_ERROR;
+      priv = (vpx_codec_alg_priv_t *)ctx->priv;
+
+      /* start with zero fragments; fragment input follows the init flag */
+      priv->fragments.count = 0;
+      priv->fragments.enabled =
+          (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
+    } else {
+      priv = (vpx_codec_alg_priv_t *)ctx->priv;
+    }
+
+    priv->yv12_frame_buffers.use_frame_threads =
+        (ctx->priv->init_flags & VPX_CODEC_USE_FRAME_THREADING);
+
+    /* for now, disable frame threading */
+    priv->yv12_frame_buffers.use_frame_threads = 0;
+
+    if (priv->yv12_frame_buffers.use_frame_threads &&
+        ((ctx->priv->init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT) ||
+         (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS))) {
+      /* row-based threading, error concealment, and input fragments will
+       * not be supported when using frame-based threading */
+      res = VPX_CODEC_INVALID_PARAM;
+    }
+    return res;
+}
+
+/* Release the decoder instances held by ctx, then the context itself. */
+static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) {
+    struct frame_buffers *const fb = &ctx->yv12_frame_buffers;
+
+    vp8_remove_decoder_instances(fb);
+    vpx_free(ctx);
+    return VPX_CODEC_OK;
+}
+
+/* Parse the uncompressed start of a VP8 frame into si.
+ *
+ * Key frame header layout read here:
+ *   3 bytes - version, frame type and an offset
+ *   3 bytes - sync code (0x9d, 0x01, 0x2a)
+ *   4 bytes - image width and height in the lowest 14 bits of each
+ *             2-byte value
+ *
+ * When decrypt_cb is set, up to 10 leading bytes are decrypted into a
+ * local buffer and parsed from there. Returns VPX_CODEC_INVALID_PARAM when
+ * data + data_sz does not lie above data, VPX_CODEC_UNSUP_BITSTREAM for
+ * anything that is not an accepted key frame header, VPX_CODEC_OK otherwise.
+ */
+static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
+                                            unsigned int data_sz,
+                                            vpx_codec_stream_info_t *si,
+                                            vpx_decrypt_cb decrypt_cb,
+                                            void *decrypt_state)
+{
+    uint8_t plain[10];
+    const uint8_t *hdr = data;
+
+    if (data + data_sz <= data)
+        return VPX_CODEC_INVALID_PARAM;
+
+    if (decrypt_cb)
+    {
+        int n = VPXMIN(sizeof(plain), data_sz);
+        decrypt_cb(decrypt_state, data, plain, n);
+        hdr = plain;
+    }
+
+    si->is_kf = 0;
+
+    /* bit 0 of the first byte is clear for an I-Frame */
+    if (data_sz < 10 || (hdr[0] & 0x01))
+        return VPX_CODEC_UNSUP_BITSTREAM;
+
+    si->is_kf = 1;
+
+    /* vet via sync code */
+    if (hdr[3] != 0x9d || hdr[4] != 0x01 || hdr[5] != 0x2a)
+        return VPX_CODEC_UNSUP_BITSTREAM;
+
+    si->w = (hdr[6] | (hdr[7] << 8)) & 0x3fff;
+    si->h = (hdr[8] | (hdr[9] << 8)) & 0x3fff;
+
+    /* a header with both dimensions zero is rejected */
+    if (!(si->h | si->w))
+        return VPX_CODEC_UNSUP_BITSTREAM;
+
+    return VPX_CODEC_OK;
+}
+
+/* peek_si hook: parse the header with no decryption callback installed. */
+static vpx_codec_err_t vp8_peek_si(const uint8_t *data, unsigned int data_sz,
+                                   vpx_codec_stream_info_t *si)
+{
+    return vp8_peek_si_internal(data, data_sz, si, NULL, NULL);
+}
+
+/* Copy the cached stream info into *si and store the copied size in si->sz. */
+static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t    *ctx,
+                                  vpx_codec_stream_info_t *si)
+{
+    /* vp8_stream_info_t is a typedef of vpx_codec_stream_info_t in this
+     * file, so both candidate sizes are the same value. */
+    const unsigned int copy_sz =
+        (si->sz >= sizeof(vp8_stream_info_t))
+            ? sizeof(vp8_stream_info_t)
+            : sizeof(vpx_codec_stream_info_t);
+
+    memcpy(si, &ctx->si, copy_sz);
+    si->sz = copy_sz;
+
+    return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t
+update_error_state(vpx_codec_alg_priv_t                 *ctx,
+                   const struct vpx_internal_error_info *error)
+{
+    vpx_codec_err_t res;
+
+    if ((res = error->error_code))
+        ctx->base.err_detail = error->has_detail
+                               ? error->detail
+                               : NULL;
+
+    return res;
+}
+
+/* Fill img by hand as an I420 view of yv12: vpx_img_wrap() cannot express
+ * independent Y/U/V strides or a YV12_BUFFER_CONFIG's other adjustments. */
+static void yuvconfig2image(vpx_image_t               *img,
+                            const YV12_BUFFER_CONFIG  *yv12,
+                            void                      *user_priv)
+{
+    img->fmt = VPX_IMG_FMT_I420;
+    img->bit_depth = 8;
+    img->bps = 12;
+    img->x_chroma_shift = 1;
+    img->y_chroma_shift = 1;
+    img->w = yv12->y_stride;
+    img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
+    img->r_w = yv12->y_width;
+    img->d_w = img->r_w;
+    img->r_h = yv12->y_height;
+    img->d_h = img->r_h;
+    img->planes[VPX_PLANE_Y] = yv12->y_buffer;
+    img->planes[VPX_PLANE_U] = yv12->u_buffer;
+    img->planes[VPX_PLANE_V] = yv12->v_buffer;
+    img->planes[VPX_PLANE_ALPHA] = NULL;
+    img->stride[VPX_PLANE_Y] = yv12->y_stride;
+    img->stride[VPX_PLANE_U] = yv12->uv_stride;
+    img->stride[VPX_PLANE_V] = yv12->uv_stride;
+    img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+    img->img_data = yv12->buffer_alloc;
+    img->img_data_owner = 0;
+    img->self_allocd = 0;
+    img->user_priv = user_priv;
+}
+
+/* Record one chunk of input for the frame being assembled. Returns 1 when
+ * the caller should decode now, 0 when there is nothing to decode yet, and
+ * -1 (with *res = VPX_CODEC_INVALID_PARAM) once a frame already holds
+ * (1 << EIGHT_PARTITION) + 1 fragments and another one is offered. */
+static int
+update_fragments(vpx_codec_alg_priv_t  *ctx,
+                 const uint8_t         *data,
+                 unsigned int           data_sz,
+                 vpx_codec_err_t       *res)
+{
+    *res = VPX_CODEC_OK;
+    if (ctx->fragments.count == 0)
+    {
+        /* New frame, reset fragment pointers and sizes */
+        memset((void*)ctx->fragments.ptrs, 0, sizeof(ctx->fragments.ptrs));
+        memset(ctx->fragments.sizes, 0, sizeof(ctx->fragments.sizes));
+    }
+    if (ctx->fragments.enabled && !(data == NULL && data_sz == 0))
+    {
+        /* Enforce the limit before storing, so that an excess fragment is
+         * rejected without first being written into ptrs[]/sizes[]. */
+        if (ctx->fragments.count >= (1 << EIGHT_PARTITION) + 1)
+        {
+            ctx->fragments.count = 0;
+            *res = VPX_CODEC_INVALID_PARAM;
+            return -1;
+        }
+        /* Store a pointer to this fragment and return. We haven't
+         * received the complete frame yet, so we will wait with decoding. */
+        ctx->fragments.ptrs[ctx->fragments.count] = data;
+        ctx->fragments.sizes[ctx->fragments.count] = data_sz;
+        ctx->fragments.count++;
+        return 0;
+    }
+
+    if (!ctx->fragments.enabled)
+    {
+        if (data == NULL && data_sz == 0)
+            return 0;
+        ctx->fragments.ptrs[0] = data;
+        ctx->fragments.sizes[0] = data_sz;
+        ctx->fragments.count = 1;
+    }
+    return 1;
+}
+
+/* Decode one buffer of compressed data (or queue one fragment of a frame).
+ * Creates the decoder instance on the first key frame, reallocates frame
+ * buffers when the peeked header size differs from the cached one, then
+ * passes the data on. Returns -1 if that reallocation raises an error. */
+static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
+                                  const uint8_t         *data,
+                                  unsigned int            data_sz,
+                                  void                    *user_priv,
+                                  long                    deadline)
+{
+    vpx_codec_err_t res = VPX_CODEC_OK;
+    unsigned int resolution_change = 0;
+    unsigned int w, h;
+
+    if (!ctx->fragments.enabled && (data == NULL && data_sz == 0))
+        return 0;
+
+    /* Update the input fragment data */
+    if(update_fragments(ctx, data, data_sz, &res) <= 0)
+        return res;
+
+    /* Determine the stream parameters. Note that we rely on peek_si to
+     * validate that we have a buffer that does not wrap around the top
+     * of the heap.
+     */
+    w = ctx->si.w;
+    h = ctx->si.h;
+
+    res = vp8_peek_si_internal(ctx->fragments.ptrs[0], ctx->fragments.sizes[0],
+                               &ctx->si, ctx->decrypt_cb, ctx->decrypt_state);
+
+    if((res == VPX_CODEC_UNSUP_BITSTREAM) && !ctx->si.is_kf) {
+        /* the peek function returns an error for non keyframes, however for
+         * this case, it is not an error */
+        res = VPX_CODEC_OK;
+    }
+
+    if(!ctx->decoder_init && !ctx->si.is_kf)
+        res = VPX_CODEC_UNSUP_BITSTREAM;
+
+    if ((ctx->si.h != h) || (ctx->si.w != w))
+        resolution_change = 1;
+
+    /* Initialize the decoder instance on the first frame*/
+    if (!res && !ctx->decoder_init)
+    {
+      VP8D_CONFIG oxcf;
+
+      oxcf.Width = ctx->si.w;
+      oxcf.Height = ctx->si.h;
+      oxcf.Version = 9;
+      oxcf.postprocess = 0;
+      oxcf.max_threads = ctx->cfg.threads;
+      oxcf.error_concealment =
+          (ctx->base.init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT);
+
+      /* If postprocessing was enabled by the application and a
+       * configuration has not been provided, default it.
+       */
+       if (!ctx->postproc_cfg_set
+           && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) {
+         ctx->postproc_cfg.post_proc_flag =
+             VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE;
+         ctx->postproc_cfg.deblocking_level = 4;
+         ctx->postproc_cfg.noise_level = 0;
+       }
+
+       res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf);
+       if (!res) ctx->decoder_init = 1; /* mark ready only on success, so pbi[0] is never used after a failed create */
+    }
+
+    /* Set these even if already initialized.  The caller may have changed the
+     * decrypt config between frames.
+     */
+    if (ctx->decoder_init) {
+      ctx->yv12_frame_buffers.pbi[0]->decrypt_cb = ctx->decrypt_cb;
+      ctx->yv12_frame_buffers.pbi[0]->decrypt_state = ctx->decrypt_state;
+    }
+
+    if (!res) {
+        VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
+        if (resolution_change)
+        {
+            VP8_COMMON *const pc = & pbi->common;
+            MACROBLOCKD *const xd  = & pbi->mb;
+#if CONFIG_MULTITHREAD
+            int i;
+#endif
+            pc->Width = ctx->si.w;
+            pc->Height = ctx->si.h;
+            {
+                int prev_mb_rows = pc->mb_rows;
+
+                if (setjmp(pbi->common.error.jmp))
+                {
+                    pbi->common.error.setjmp = 0;
+                    vp8_clear_system_state();
+                    /* same return value as used in vp8dx_receive_compressed_data */
+                    return -1;
+                }
+
+                pbi->common.error.setjmp = 1;
+
+                if (pc->Width <= 0)
+                {
+                    pc->Width = w;
+                    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                                       "Invalid frame width");
+                }
+
+                if (pc->Height <= 0)
+                {
+                    pc->Height = h;
+                    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                                       "Invalid frame height");
+                }
+
+                if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height))
+                    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                                       "Failed to allocate frame buffers");
+
+                xd->pre = pc->yv12_fb[pc->lst_fb_idx];
+                xd->dst = pc->yv12_fb[pc->new_fb_idx];
+
+#if CONFIG_MULTITHREAD
+                for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+                {
+                    pbi->mb_row_di[i].mbd.dst = pc->yv12_fb[pc->new_fb_idx];
+                    vp8_build_block_doffsets(&pbi->mb_row_di[i].mbd);
+                }
+#endif
+                vp8_build_block_doffsets(&pbi->mb);
+
+                /* allocate memory for last frame MODE_INFO array */
+#if CONFIG_ERROR_CONCEALMENT
+
+                if (pbi->ec_enabled)
+                {
+                    /* old prev_mip was released by vp8_de_alloc_frame_buffers()
+                     * called in vp8_alloc_frame_buffers() */
+                    pc->prev_mip = vpx_calloc(
+                                       (pc->mb_cols + 1) * (pc->mb_rows + 1),
+                                       sizeof(MODE_INFO));
+
+                    if (!pc->prev_mip)
+                    {
+                        vp8_de_alloc_frame_buffers(pc);
+                        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                                           "Failed to allocate "
+                                           "last frame MODE_INFO array");
+                    }
+
+                    pc->prev_mi = pc->prev_mip + pc->mode_info_stride + 1;
+
+                    if (vp8_alloc_overlap_lists(pbi))
+                        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                                           "Failed to allocate overlap lists "
+                                           "for error concealment");
+                }
+
+#endif
+
+#if CONFIG_MULTITHREAD
+                if (pbi->b_multithreaded_rd)
+                    vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows);
+#else
+                (void)prev_mb_rows;
+#endif
+            }
+
+            pbi->common.error.setjmp = 0;
+
+            /* required to get past the first get_free_fb() call */
+            pbi->common.fb_idx_ref_cnt[0] = 0;
+        }
+
+        /* update the pbi fragment data */
+        pbi->fragments = ctx->fragments;
+
+        ctx->user_priv = user_priv;
+        if (vp8dx_receive_compressed_data(pbi, data_sz, data, deadline))
+        {
+            res = update_error_state(ctx, &pbi->common.error);
+        }
+
+        /* get ready for the next series of fragments */
+        ctx->fragments.count = 0;
+    }
+
+    return res;
+}
+
+/* Fetch the decoded frame, applying the configured post-processing flags.
+ * iter works as a flip-flop: an image is handed out only while *iter is
+ * still NULL, and *iter is then set to it, so the next call with the same
+ * iterator returns NULL. */
+static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t  *ctx,
+                                  vpx_codec_iter_t      *iter)
+{
+    VP8D_COMP *const pbi = ctx->yv12_frame_buffers.pbi[0];
+    YV12_BUFFER_CONFIG sd;
+    int64_t ts_start = 0, ts_end = 0;
+    vp8_ppflags_t pp = {0};
+
+    if (*iter || !pbi)
+        return NULL;
+
+    /* post-processing options are forwarded only when it was requested */
+    if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
+        pp.post_proc_flag = ctx->postproc_cfg.post_proc_flag
+#if CONFIG_POSTPROC_VISUALIZER
+            /* OR in one debug bit per enabled visualizer option */
+            | ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS : 0)
+            | ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+            | ((ctx->dbg_color_b_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+            | ((ctx->dbg_display_mv_flag != 0) ? VP8D_DEBUG_DRAW_MV : 0)
+#endif
+            ;
+        pp.deblocking_level = ctx->postproc_cfg.deblocking_level;
+        pp.noise_level = ctx->postproc_cfg.noise_level;
+#if CONFIG_POSTPROC_VISUALIZER
+        /* the raw debug values are passed along as well */
+        pp.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
+        pp.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
+        pp.display_b_modes_flag = ctx->dbg_color_b_modes_flag;
+        pp.display_mv_flag = ctx->dbg_display_mv_flag;
+#endif
+    }
+
+    /* a nonzero result means there is no frame to hand out */
+    if (vp8dx_get_raw_frame(pbi, &sd, &ts_start, &ts_end, &pp) != 0)
+        return NULL;
+
+    /* describe the buffer in ctx->img and mark the iterator as consumed */
+    yuvconfig2image(&ctx->img, &sd, ctx->user_priv);
+    *iter = &ctx->img;
+
+    return &ctx->img;
+}
+
+/* Populate yv12 from a caller-supplied image (chroma sizes are the display
+ * size halved, rounding up). Always returns VPX_CODEC_OK. */
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
+                                       YV12_BUFFER_CONFIG  *yv12)
+{
+    const int luma_w = img->d_w;
+    const int luma_h = img->d_h;
+    const int chroma_w = (img->d_w + 1) / 2;
+    const int chroma_h = (img->d_h + 1) / 2;
+
+    yv12->y_buffer = img->planes[VPX_PLANE_Y];
+    yv12->u_buffer = img->planes[VPX_PLANE_U];
+    yv12->v_buffer = img->planes[VPX_PLANE_V];
+    yv12->y_stride = img->stride[VPX_PLANE_Y];
+    yv12->uv_stride = img->stride[VPX_PLANE_U];
+
+    yv12->y_width = luma_w;
+    yv12->y_height = luma_h;
+    yv12->y_crop_width = luma_w;
+    yv12->y_crop_height = luma_h;
+    yv12->uv_width = chroma_w;
+    yv12->uv_height = chroma_h;
+    yv12->uv_crop_width = chroma_w;
+    yv12->uv_crop_height = chroma_h;
+    yv12->border = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
+    return VPX_CODEC_OK;
+}
+
+
+/* VP8_SET_REFERENCE control: the vpx_ref_frame_t* argument is converted to
+ * a YV12 config and passed to vp8dx_set_reference(). A NULL argument, or
+ * frame threading being active, yields VPX_CODEC_INVALID_PARAM. */
+static vpx_codec_err_t vp8_set_reference(vpx_codec_alg_priv_t *ctx,
+                                         va_list args)
+{
+    vpx_ref_frame_t *const ref = va_arg(args, vpx_ref_frame_t *);
+    YV12_BUFFER_CONFIG sd;
+
+    if (!ref || ctx->yv12_frame_buffers.use_frame_threads)
+    {
+        return VPX_CODEC_INVALID_PARAM;
+    }
+
+    /* sd only points at the caller's planes; nothing is copied here */
+    image2yuvconfig(&ref->img, &sd);
+
+    return vp8dx_set_reference(ctx->yv12_frame_buffers.pbi[0],
+                               ref->frame_type, &sd);
+}
+
+/* VP8_COPY_REFERENCE control: the vpx_ref_frame_t* argument is converted to
+ * a YV12 config and passed to vp8dx_get_reference(). A NULL argument, or
+ * frame threading being active, yields VPX_CODEC_INVALID_PARAM. */
+static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx,
+                                         va_list args)
+{
+    vpx_ref_frame_t *const ref = va_arg(args, vpx_ref_frame_t *);
+    YV12_BUFFER_CONFIG sd;
+
+    if (!ref || ctx->yv12_frame_buffers.use_frame_threads)
+    {
+        return VPX_CODEC_INVALID_PARAM;
+    }
+
+    /* sd only points at the caller's planes; nothing is copied here */
+    image2yuvconfig(&ref->img, &sd);
+
+    return vp8dx_get_reference(ctx->yv12_frame_buffers.pbi[0],
+                               ref->frame_type, &sd);
+}
+
+/* VP8_SET_POSTPROC control: store the caller's post-processing settings.
+ * Without CONFIG_POSTPROC the control reports VPX_CODEC_INCAPABLE. */
+static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
+                                        va_list args)
+{
+#if CONFIG_POSTPROC
+    const vp8_postproc_cfg_t *const cfg = va_arg(args, vp8_postproc_cfg_t *);
+
+    if (!cfg)
+        return VPX_CODEC_INVALID_PARAM;
+
+    /* remember that an explicit configuration was supplied */
+    ctx->postproc_cfg_set = 1;
+    ctx->postproc_cfg = *cfg;
+    return VPX_CODEC_OK;
+#else
+    (void)ctx;
+    (void)args;
+    return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+
+/* VP8_SET_DBG_COLOR_REF_FRAME control: store the int argument as a flag. */
+static vpx_codec_err_t
+vp8_set_dbg_color_ref_frame(vpx_codec_alg_priv_t *ctx, va_list args) {
+#if !(CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC)
+  (void)ctx; (void)args;
+  return VPX_CODEC_INCAPABLE;
+#else
+  ctx->dbg_color_ref_frame_flag = va_arg(args, int);
+  return VPX_CODEC_OK;
+#endif
+}
+
+/* VP8_SET_DBG_COLOR_MB_MODES control: store the int argument as a flag. */
+static vpx_codec_err_t
+vp8_set_dbg_color_mb_modes(vpx_codec_alg_priv_t *ctx, va_list args) {
+#if !(CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC)
+  (void)ctx; (void)args;
+  return VPX_CODEC_INCAPABLE;
+#else
+  ctx->dbg_color_mb_modes_flag = va_arg(args, int);
+  return VPX_CODEC_OK;
+#endif
+}
+
+/* VP8_SET_DBG_COLOR_B_MODES control: store the int argument as a flag. */
+static vpx_codec_err_t
+vp8_set_dbg_color_b_modes(vpx_codec_alg_priv_t *ctx, va_list args) {
+#if !(CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC)
+  (void)ctx; (void)args;
+  return VPX_CODEC_INCAPABLE;
+#else
+  ctx->dbg_color_b_modes_flag = va_arg(args, int);
+  return VPX_CODEC_OK;
+#endif
+}
+
+/* VP8_SET_DBG_DISPLAY_MV control: store the int argument as a flag. */
+static vpx_codec_err_t
+vp8_set_dbg_display_mv(vpx_codec_alg_priv_t *ctx, va_list args) {
+#if !(CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC)
+  (void)ctx; (void)args;
+  return VPX_CODEC_INCAPABLE;
+#else
+  ctx->dbg_display_mv_flag = va_arg(args, int);
+  return VPX_CODEC_OK;
+#endif
+}
+
+/* VP8D_GET_LAST_REF_UPDATES control: write into *update_info each refresh
+ * flag of the decoder's common state multiplied by its VP8_*_FRAME value,
+ * summed. Fails with VPX_CODEC_INVALID_PARAM on a NULL argument, when frame
+ * threading is active, or when no decoder instance exists yet. */
+static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
+                                                va_list args)
+{
+    int *update_info = va_arg(args, int *);
+    VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0];
+
+    if (!update_info || !pbi || ctx->yv12_frame_buffers.use_frame_threads)
+        return VPX_CODEC_INVALID_PARAM;
+
+    *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
+        + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
+        + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
+    return VPX_CODEC_OK;
+}
+
+extern int vp8dx_references_buffer( VP8_COMMON *oci, int ref_frame );
+/* VP8D_GET_LAST_REF_USED control: set *ref_info to the OR of the
+ * VP8_*_FRAME flags for which vp8dx_references_buffer() is nonzero. Fails
+ * with VPX_CODEC_INVALID_PARAM on a NULL argument, when frame threading is
+ * active, or when no decoder instance exists yet. */
+static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx,
+                                              va_list args)
+{
+    int *ref_info = va_arg(args, int *);
+    VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0];
+    VP8_COMMON *oci = pbi ? &pbi->common : NULL;
+
+    if (!ref_info || !pbi || ctx->yv12_frame_buffers.use_frame_threads)
+        return VPX_CODEC_INVALID_PARAM;
+    *ref_info =
+        (vp8dx_references_buffer( oci, ALTREF_FRAME )?VP8_ALTR_FRAME:0) |
+        (vp8dx_references_buffer( oci, GOLDEN_FRAME )?VP8_GOLD_FRAME:0) |
+        (vp8dx_references_buffer( oci, LAST_FRAME )?VP8_LAST_FRAME:0);
+    return VPX_CODEC_OK;
+}
+
+/* VP8D_GET_FRAME_CORRUPTED control: copy the 'corrupted' field of the
+ * decoder's frame_to_show into the caller's int. */
+static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
+                                               va_list args)
+{
+    int *const out = va_arg(args, int *);
+    VP8D_COMP *const dec = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0];
+
+    /* a NULL output pointer or a missing decoder instance is rejected */
+    if (!out || !dec)
+        return VPX_CODEC_INVALID_PARAM;
+
+    if (dec->common.frame_to_show == NULL)
+        return VPX_CODEC_ERROR;
+
+    *out = dec->common.frame_to_show->corrupted;
+    return VPX_CODEC_OK;
+}
+
+/* VPXD_SET_DECRYPTOR control: install the decrypt callback and its state
+ * from the vpx_decrypt_init argument; a NULL argument clears both. */
+static vpx_codec_err_t vp8_set_decryptor(vpx_codec_alg_priv_t *ctx,
+                                         va_list args)
+{
+    const vpx_decrypt_init *const cfg = va_arg(args, vpx_decrypt_init *);
+
+    if (!cfg) {
+        /* no configuration given: remove any installed callback */
+        ctx->decrypt_cb = NULL;
+        ctx->decrypt_state = NULL;
+    } else {
+        ctx->decrypt_cb = cfg->decrypt_cb;
+        ctx->decrypt_state = cfg->decrypt_state;
+    }
+    return VPX_CODEC_OK;
+}
+
+vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = /* control ID -> handler pairs for the VP8 decoder */
+{
+    {VP8_SET_REFERENCE,             vp8_set_reference},
+    {VP8_COPY_REFERENCE,            vp8_get_reference}, /* the "copy" control is served by the get handler */
+    {VP8_SET_POSTPROC,              vp8_set_postproc},
+    {VP8_SET_DBG_COLOR_REF_FRAME,   vp8_set_dbg_color_ref_frame},
+    {VP8_SET_DBG_COLOR_MB_MODES,    vp8_set_dbg_color_mb_modes},
+    {VP8_SET_DBG_COLOR_B_MODES,     vp8_set_dbg_color_b_modes},
+    {VP8_SET_DBG_DISPLAY_MV,        vp8_set_dbg_display_mv},
+    {VP8D_GET_LAST_REF_UPDATES,     vp8_get_last_ref_updates},
+    {VP8D_GET_FRAME_CORRUPTED,      vp8_get_frame_corrupted},
+    {VP8D_GET_LAST_REF_USED,        vp8_get_last_ref_frame},
+    {VPXD_SET_DECRYPTOR,            vp8_set_decryptor},
+    { -1, NULL}, /* last entry; presumably the end-of-table marker -- confirm against the dispatcher */
+};
+
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif /* VERSION_STRING falls back to empty so the name literal below still concatenates */
+CODEC_INTERFACE(vpx_codec_vp8_dx) =
+{
+    "WebM Project VP8 Decoder" VERSION_STRING, /* interface name string */
+    VPX_CODEC_INTERNAL_ABI_VERSION,
+    VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC | VP8_CAP_ERROR_CONCEALMENT |
+    VPX_CODEC_CAP_INPUT_FRAGMENTS,
+    /* vpx_codec_caps_t          caps; */
+    vp8_init,         /* vpx_codec_init_fn_t       init; */
+    vp8_destroy,      /* vpx_codec_destroy_fn_t    destroy; */
+    vp8_ctf_maps,     /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+    { /* decoder functions */
+        vp8_peek_si,      /* vpx_codec_peek_si_fn_t    peek_si; */
+        vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
+        vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
+        vp8_get_frame,    /* vpx_codec_frame_get_fn_t  frame_get; */
+        NULL,             /* set_fb_fn slot (named so in the encoder interface table of this patch); unused here */
+    },
+    { /* encoder functions */
+        0,                /* no cfg maps: this interface does not encode */
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL
+    }
+};
diff --git a/libs/libvpx/vp8/vp8cx.mk b/libs/libvpx/vp8/vp8cx.mk
new file mode 100644
index 0000000000..857a631bff
--- /dev/null
+++ b/libs/libvpx/vp8/vp8cx.mk
@@ -0,0 +1,117 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+VP8_CX_EXPORTS += exports_enc
+# Seed the encoder lists with the VP8_COMMON_* source and removal lists.
+VP8_CX_SRCS-yes += $(VP8_COMMON_SRCS-yes)
+VP8_CX_SRCS-no  += $(VP8_COMMON_SRCS-no)
+VP8_CX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
+VP8_CX_SRCS_REMOVE-no  += $(VP8_COMMON_SRCS_REMOVE-no)
+# ARM builds pull their extra encoder sources from a separate fragment.
+ifeq ($(ARCH_ARM),yes)
+  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx_arm.mk
+endif
+
+VP8_CX_SRCS-yes += vp8cx.mk
+
+VP8_CX_SRCS-yes += vp8_cx_iface.c
+
+VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h
+VP8_CX_SRCS-yes += encoder/bitstream.c
+VP8_CX_SRCS-yes += encoder/boolhuff.c
+VP8_CX_SRCS-yes += encoder/dct.c
+VP8_CX_SRCS-yes += encoder/encodeframe.c
+VP8_CX_SRCS-yes += encoder/encodeframe.h
+VP8_CX_SRCS-yes += encoder/encodeintra.c
+VP8_CX_SRCS-yes += encoder/encodemb.c
+VP8_CX_SRCS-yes += encoder/encodemv.c
+VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c
+VP8_CX_SRCS-yes += encoder/firstpass.c
+VP8_CX_SRCS-yes += encoder/block.h
+VP8_CX_SRCS-yes += encoder/boolhuff.h
+VP8_CX_SRCS-yes += encoder/bitstream.h
+VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.h
+VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.c
+VP8_CX_SRCS-yes += encoder/encodeintra.h
+VP8_CX_SRCS-yes += encoder/encodemb.h
+VP8_CX_SRCS-yes += encoder/encodemv.h
+VP8_CX_SRCS-yes += encoder/firstpass.h
+VP8_CX_SRCS-yes += encoder/lookahead.c
+VP8_CX_SRCS-yes += encoder/lookahead.h
+VP8_CX_SRCS-yes += encoder/mcomp.h
+VP8_CX_SRCS-yes += encoder/modecosts.h
+VP8_CX_SRCS-yes += encoder/onyx_int.h
+VP8_CX_SRCS-yes += encoder/pickinter.h
+VP8_CX_SRCS-yes += encoder/quantize.h
+VP8_CX_SRCS-yes += encoder/ratectrl.h
+VP8_CX_SRCS-yes += encoder/rdopt.h
+VP8_CX_SRCS-yes += encoder/tokenize.h
+VP8_CX_SRCS-yes += encoder/treewriter.h
+VP8_CX_SRCS-yes += encoder/mcomp.c
+VP8_CX_SRCS-yes += encoder/modecosts.c
+VP8_CX_SRCS-yes += encoder/onyx_if.c
+VP8_CX_SRCS-yes += encoder/pickinter.c
+VP8_CX_SRCS-yes += encoder/picklpf.c
+VP8_CX_SRCS-yes += encoder/vp8_quantize.c
+VP8_CX_SRCS-yes += encoder/ratectrl.c
+VP8_CX_SRCS-yes += encoder/rdopt.c
+VP8_CX_SRCS-yes += encoder/segmentation.c
+VP8_CX_SRCS-yes += encoder/segmentation.h
+VP8_CX_SRCS-yes += encoder/tokenize.c
+VP8_CX_SRCS-yes += encoder/dct_value_cost.h
+VP8_CX_SRCS-yes += encoder/dct_value_tokens.h
+VP8_CX_SRCS-yes += encoder/treewriter.c
+VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
+VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
+VP8_CX_SRCS-yes += encoder/temporal_filter.c
+VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c
+VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h
+# Realtime-only builds drop the first-pass and temporal-filter sources.
+ifeq ($(CONFIG_REALTIME_ONLY),yes)
+VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
+VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
+endif
+# x86 sources, each appended to the list selected by its HAVE_* / ARCH_* value.
+VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
+VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
+VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c
+
+ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
+endif
+
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
+VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
+VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
+
+ifeq ($(CONFIG_REALTIME_ONLY),yes)
+VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
+endif
+
+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c
+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c
+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
+
+ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c
+endif
+
+ifeq ($(CONFIG_REALTIME_ONLY),yes)
+VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
+endif
+# Apply the accumulated removals to the final "yes" source list.
+VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
diff --git a/libs/libvpx/vp8/vp8cx_arm.mk b/libs/libvpx/vp8/vp8cx_arm.mk
new file mode 100644
index 0000000000..838b53d84c
--- /dev/null
+++ b/libs/libvpx/vp8/vp8cx_arm.mk
@@ -0,0 +1,28 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+VP8_CX_SRCS-$(ARCH_ARM)  += vp8cx_arm.mk
+
+# Encoder sources for every ARM build (list chosen by the value of ARCH_ARM).
+# encoder
+VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.c
+
+# Encoder sources under encoder/arm/armv6, enabled by HAVE_MEDIA.
+# encoder
+VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/walsh_v6$(ASM)
+
+# Encoder sources under encoder/arm/neon, enabled by HAVE_NEON.
+# NOTE(review): denoising_neon.c is not gated on CONFIG_TEMPORAL_DENOISING, unlike the x86/MSA denoisers in vp8cx.mk -- confirm.
+VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/denoising_neon.c
+VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/fastquantizeb_neon.c
+VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/shortfdct_neon.c
+VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
diff --git a/libs/libvpx/vp8/vp8dx.mk b/libs/libvpx/vp8/vp8dx.mk
new file mode 100644
index 0000000000..892ed70f52
--- /dev/null
+++ b/libs/libvpx/vp8/vp8dx.mk
@@ -0,0 +1,39 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+VP8_DX_EXPORTS += exports_dec
+# Start from the VP8_COMMON_SRCS-* and VP8_COMMON_SRCS_REMOVE-* lists.
+VP8_DX_SRCS-yes += $(VP8_COMMON_SRCS-yes)
+VP8_DX_SRCS-no  += $(VP8_COMMON_SRCS-no)
+VP8_DX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
+VP8_DX_SRCS_REMOVE-no  += $(VP8_COMMON_SRCS_REMOVE-no)
+
+VP8_DX_SRCS-yes += vp8dx.mk
+
+VP8_DX_SRCS-yes += vp8_dx_iface.c
+# Decoder sources; a -$(FLAG) suffix picks the list named by FLAG's value.
+VP8_DX_SRCS-yes += decoder/dboolhuff.c
+VP8_DX_SRCS-yes += decoder/decodemv.c
+VP8_DX_SRCS-yes += decoder/decodeframe.c
+VP8_DX_SRCS-yes += decoder/detokenize.c
+VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/ec_types.h
+VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.h
+VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.c
+VP8_DX_SRCS-yes += decoder/dboolhuff.h
+VP8_DX_SRCS-yes += decoder/decodemv.h
+VP8_DX_SRCS-yes += decoder/decoderthreading.h
+VP8_DX_SRCS-yes += decoder/detokenize.h
+VP8_DX_SRCS-yes += decoder/onyxd_int.h
+VP8_DX_SRCS-yes += decoder/treereader.h
+VP8_DX_SRCS-yes += decoder/onyxd_if.c
+VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
+# Remove every file listed in VP8_DX_SRCS_REMOVE-yes from the final list.
+VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))
diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
new file mode 100644
index 0000000000..1761fada2f
--- /dev/null
+++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -0,0 +1,248 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+static int16_t sinpi_1_9 = 0x14a3;  // 5283
+static int16_t sinpi_2_9 = 0x26c9;  // 9929
+static int16_t sinpi_3_9 = 0x3441;  // 13377
+static int16_t sinpi_4_9 = 0x3b6c;  // 15212
+static int16_t cospi_8_64 = 0x3b21;  // 15137
+static int16_t cospi_16_64 = 0x2d41;  // 11585
+static int16_t cospi_24_64 = 0x187e;  // 6270
+
+static INLINE void TRANSPOSE4X4(  // in-place transpose of a 4x4 int16 block
+        int16x8_t *q8s16,  // in/out: rows 0-1, one row per 4-lane half
+        int16x8_t *q9s16) {  // in/out: rows 2-3, one row per 4-lane half
+    int32x4_t q8s32, q9s32;
+    int16x4x2_t d0x2s16, d1x2s16;
+    int32x4x2_t q0x2s32;
+    // Step 1: interleave the 16-bit lanes of the two rows in each register.
+    d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
+    d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
+    // Step 2: transpose the resulting 32-bit pairs across both registers.
+    q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
+    q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
+    q0x2s32 = vtrnq_s32(q8s32, q9s32);
+
+    *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
+    *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
+    return;
+}
+
+static INLINE void GENERATE_COSINE_CONSTANTS(  // fill the three cosine vectors
+        int16x4_t *d0s16,  // out: cospi_8_64 in all 4 lanes
+        int16x4_t *d1s16,  // out: cospi_16_64 in all 4 lanes
+        int16x4_t *d2s16) {  // out: cospi_24_64 in all 4 lanes
+    *d0s16 = vdup_n_s16(cospi_8_64);
+    *d1s16 = vdup_n_s16(cospi_16_64);
+    *d2s16 = vdup_n_s16(cospi_24_64);
+    return;
+}
+
+static INLINE void GENERATE_SINE_CONSTANTS(  // fill the four sine vectors
+        int16x4_t *d3s16,  // out: sinpi_1_9 in all 4 lanes
+        int16x4_t *d4s16,  // out: sinpi_2_9 in all 4 lanes
+        int16x4_t *d5s16,  // out: sinpi_4_9 in all 4 lanes
+        int16x8_t *q3s16) {  // out: sinpi_3_9 in all 8 lanes
+    *d3s16 = vdup_n_s16(sinpi_1_9);
+    *d4s16 = vdup_n_s16(sinpi_2_9);
+    *q3s16 = vdupq_n_s16(sinpi_3_9);
+    *d5s16 = vdup_n_s16(sinpi_4_9);
+    return;
+}
+
+static INLINE void IDCT4x4_1D(  // one 1-D pass of the 4-point inverse DCT
+        int16x4_t *d0s16,  // in: callers pass the cospi_8_64 vector
+        int16x4_t *d1s16,  // in: callers pass the cospi_16_64 vector
+        int16x4_t *d2s16,  // in: callers pass the cospi_24_64 vector
+        int16x8_t *q8s16,  // in/out: first two 4-lane vectors
+        int16x8_t *q9s16) {  // in/out: last two 4-lane vectors
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
+    int16x4_t d26s16, d27s16, d28s16, d29s16;
+    int32x4_t q10s32, q13s32, q14s32, q15s32;
+    int16x8_t q13s16, q14s16;
+    // Split the two registers into four 4-lane vectors d16..d19.
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    // Sum and difference of d16 and d18.
+    d23s16 = vadd_s16(d16s16, d18s16);
+    d24s16 = vsub_s16(d16s16, d18s16);
+    // 32-bit products: d17/d19 combine via d0/d2, sum/diff scale by d1.
+    q15s32 = vmull_s16(d17s16, *d2s16);
+    q10s32 = vmull_s16(d17s16, *d0s16);
+    q13s32 = vmull_s16(d23s16, *d1s16);
+    q14s32 = vmull_s16(d24s16, *d1s16);
+    q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
+    q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
+    // Narrow back to 16 bits: rounding, saturating shift right by 14.
+    d26s16 = vqrshrn_n_s32(q13s32, 14);
+    d27s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d28s16 = vqrshrn_n_s32(q10s32, 14);
+    // Final butterfly; the halves of *q9s16 are swapped afterwards.
+    q13s16 = vcombine_s16(d26s16, d27s16);
+    q14s16 = vcombine_s16(d28s16, d29s16);
+    *q8s16 = vaddq_s16(q13s16, q14s16);
+    *q9s16 = vsubq_s16(q13s16, q14s16);
+    *q9s16 = vcombine_s16(vget_high_s16(*q9s16),
+                          vget_low_s16(*q9s16));  // vswp
+    return;
+}
+
+static INLINE void IADST4x4_1D(  // one 1-D pass of the 4-point inverse ADST
+        int16x4_t *d3s16,  // in: callers pass the sinpi_1_9 vector
+        int16x4_t *d4s16,  // in: callers pass the sinpi_2_9 vector
+        int16x4_t *d5s16,  // in: callers pass the sinpi_4_9 vector
+        int16x8_t *q3s16,  // in: callers pass the sinpi_3_9 vector (8 lanes)
+        int16x8_t *q8s16,  // in/out: first two 4-lane vectors
+        int16x8_t *q9s16) {  // in/out: last two 4-lane vectors
+    int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
+    int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+    // Only the low half of *q3s16 is used as a multiplier.
+    d6s16 = vget_low_s16(*q3s16);
+    // Split the two registers into four 4-lane vectors d16..d19.
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    // Widening multiplies; q15s32 accumulates d16 + d19 - d18.
+    q10s32 = vmull_s16(*d3s16, d16s16);
+    q11s32 = vmull_s16(*d4s16, d16s16);
+    q12s32 = vmull_s16(d6s16, d17s16);
+    q13s32 = vmull_s16(*d5s16, d18s16);
+    q14s32 = vmull_s16(*d3s16, d18s16);
+    q15s32 = vmovl_s16(d16s16);
+    q15s32 = vaddw_s16(q15s32, d19s16);
+    q8s32  = vmull_s16(*d4s16, d19s16);
+    q15s32 = vsubw_s16(q15s32, d18s16);
+    q9s32  = vmull_s16(*d5s16, d19s16);
+    // Combine partial sums; q15s32 is scaled by the file-level sinpi_3_9.
+    q10s32 = vaddq_s32(q10s32, q13s32);
+    q10s32 = vaddq_s32(q10s32, q8s32);
+    q11s32 = vsubq_s32(q11s32, q14s32);
+    q8s32  = vdupq_n_s32(sinpi_3_9);
+    q11s32 = vsubq_s32(q11s32, q9s32);
+    q15s32 = vmulq_s32(q15s32, q8s32);
+
+    q13s32 = vaddq_s32(q10s32, q12s32);
+    q10s32 = vaddq_s32(q10s32, q11s32);
+    q14s32 = vaddq_s32(q11s32, q12s32);
+    q10s32 = vsubq_s32(q10s32, q12s32);
+    // Narrow back to 16 bits: rounding, saturating shift right by 14.
+    d16s16 = vqrshrn_n_s32(q13s32, 14);
+    d17s16 = vqrshrn_n_s32(q14s32, 14);
+    d18s16 = vqrshrn_n_s32(q15s32, 14);
+    d19s16 = vqrshrn_n_s32(q10s32, 14);
+
+    *q8s16 = vcombine_s16(d16s16, d17s16);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+    return;
+}
+
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
+                            int dest_stride, int tx_type) {
+    uint8x8_t d26u8, d27u8;
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
+    uint32x2_t d26u32, d27u32;
+    int16x8_t q3s16, q8s16, q9s16;
+    uint16x8_t q8u16, q9u16;
+    // Inverse-transforms 16 coefficients and adds them into a 4x4 dest block.
+    d26u32 = d27u32 = vdup_n_u32(0);
+    // Load all 16 coefficients and transpose before dispatching on tx_type.
+    q8s16 = vld1q_s16(input);
+    q9s16 = vld1q_s16(input + 8);
+
+    TRANSPOSE4X4(&q8s16, &q9s16);
+    // NOTE: the load/transpose above goes unused when tx_type == 0.
+    switch (tx_type) {
+      case 0:  // idct_idct is not supported. Fall back to C
+        vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
+        return;
+        break;  // unreachable after the return above
+      case 1:  // iadst_idct
+        // generate constants
+        GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+        break;
+      case 2:  // idct_iadst
+        // generate constants
+        GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+        break;
+      case 3:  // iadst_iadst
+        // generate constants
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+        break;
+      default:  // unexpected tx_type
+        assert(0);  // compiled out under NDEBUG; control then reaches the add
+        break;
+    }
+    // Final rounding shift right by 4.
+    q8s16 = vrshrq_n_s16(q8s16, 4);
+    q9s16 = vrshrq_n_s16(q9s16, 4);
+    // Load four rows of 4 dest bytes; dest is left pointing at the last row.
+    d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
+    dest += dest_stride;
+    d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
+    dest += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
+    dest += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
+    // Widen the dest pixels and add them to the residual.
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+    // Saturate the sums to unsigned 8 bits.
+    d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    // Store the rows last-to-first, stepping dest back by dest_stride.
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+    return;
+}
diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
new file mode 100644
index 0000000000..04b342c3d3
--- /dev/null
+++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
@@ -0,0 +1,624 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+static int16_t cospi_2_64 = 16305;  // products with these are narrowed by >> 14
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_30_64 = 1606;
+
+static INLINE void TRANSPOSE8X8(  // in-place transpose of an 8x8 int16 block
+        int16x8_t *q8s16,  // in/out: row 0 (rows 1..7 follow in order)
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+    // Split each row register into its low and high 4-lane halves.
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+    // Regroup: q8..q11 take the low halves, q12..q15 take the high halves.
+    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
+    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
+    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+    *q12s16 = vcombine_s16(d17s16, d25s16);
+    *q13s16 = vcombine_s16(d19s16, d27s16);
+    *q14s16 = vcombine_s16(d21s16, d29s16);
+    *q15s16 = vcombine_s16(d23s16, d31s16);
+    // Transpose at 32-bit granularity.
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+                        vreinterpretq_s32_s16(*q10s16));
+    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+                        vreinterpretq_s32_s16(*q11s16));
+    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+                        vreinterpretq_s32_s16(*q14s16));
+    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+                        vreinterpretq_s32_s16(*q15s16));
+    // Transpose at 16-bit granularity to finish.
+    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+    *q8s16  = q0x2s16.val[0];
+    *q9s16  = q0x2s16.val[1];
+    *q10s16 = q1x2s16.val[0];
+    *q11s16 = q1x2s16.val[1];
+    *q12s16 = q2x2s16.val[0];
+    *q13s16 = q2x2s16.val[1];
+    *q14s16 = q3x2s16.val[0];
+    *q15s16 = q3x2s16.val[1];
+    return;
+}
+
+static INLINE void IDCT8x8_1D(  // one 1-D 8-point inverse DCT pass, in place
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+    // Multipliers applied to inputs q9/q15 and q13/q11 below.
+    d0s16 = vdup_n_s16(cospi_28_64);
+    d1s16 = vdup_n_s16(cospi_4_64);
+    d2s16 = vdup_n_s16(cospi_12_64);
+    d3s16 = vdup_n_s16(cospi_20_64);
+    // Split every input register into 4-lane halves.
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    q2s32 = vmull_s16(d18s16, d0s16);
+    q3s32 = vmull_s16(d19s16, d0s16);
+    q5s32 = vmull_s16(d26s16, d2s16);
+    q6s32 = vmull_s16(d27s16, d2s16);
+
+    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+    q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+    q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+    // Narrow to 16 bits: rounding, saturating shift right by 14.
+    d8s16  = vqrshrn_n_s32(q2s32, 14);
+    d9s16  = vqrshrn_n_s32(q3s32, 14);
+    d10s16 = vqrshrn_n_s32(q5s32, 14);
+    d11s16 = vqrshrn_n_s32(q6s32, 14);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    q2s32 = vmull_s16(d18s16, d1s16);
+    q3s32 = vmull_s16(d19s16, d1s16);
+    q9s32 = vmull_s16(d26s16, d3s16);
+    q13s32 = vmull_s16(d27s16, d3s16);
+
+    q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+    q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+    d14s16 = vqrshrn_n_s32(q2s32, 14);
+    d15s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q13s32, 14);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+    // Inputs q8 and q12: sum and difference, both scaled by cospi_16_64.
+    d0s16 = vdup_n_s16(cospi_16_64);
+
+    q2s32 = vmull_s16(d16s16, d0s16);
+    q3s32 = vmull_s16(d17s16, d0s16);
+    q13s32 = vmull_s16(d16s16, d0s16);
+    q15s32 = vmull_s16(d17s16, d0s16);
+
+    q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+    q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+    q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+    // Multipliers applied to inputs q10/q14 below.
+    d0s16 = vdup_n_s16(cospi_24_64);
+    d1s16 = vdup_n_s16(cospi_8_64);
+
+    d18s16 = vqrshrn_n_s32(q2s32, 14);
+    d19s16 = vqrshrn_n_s32(q3s32, 14);
+    d22s16 = vqrshrn_n_s32(q13s32, 14);
+    d23s16 = vqrshrn_n_s32(q15s32, 14);
+    *q9s16  = vcombine_s16(d18s16, d19s16);
+    *q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q2s32 = vmull_s16(d20s16, d0s16);
+    q3s32 = vmull_s16(d21s16, d0s16);
+    q8s32 = vmull_s16(d20s16, d1s16);
+    q12s32 = vmull_s16(d21s16, d1s16);
+
+    q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+    q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+    d26s16 = vqrshrn_n_s32(q2s32, 14);
+    d27s16 = vqrshrn_n_s32(q3s32, 14);
+    d30s16 = vqrshrn_n_s32(q8s32, 14);
+    d31s16 = vqrshrn_n_s32(q12s32, 14);
+    *q13s16 = vcombine_s16(d26s16, d27s16);
+    *q15s16 = vcombine_s16(d30s16, d31s16);
+    // Butterflies on the four intermediate results just stored.
+    q0s16 = vaddq_s16(*q9s16, *q15s16);
+    q1s16 = vaddq_s16(*q11s16, *q13s16);
+    q2s16 = vsubq_s16(*q11s16, *q13s16);
+    q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+    *q13s16 = vsubq_s16(q4s16, q5s16);
+    q4s16   = vaddq_s16(q4s16, q5s16);
+    *q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16   = vaddq_s16(q7s16, q6s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    // q14 -/+ q13, both scaled by cospi_16_64, give q5s16 and q6s16.
+    d16s16 = vdup_n_s16(cospi_16_64);
+
+    q9s32  = vmull_s16(d28s16, d16s16);
+    q10s32 = vmull_s16(d29s16, d16s16);
+    q11s32 = vmull_s16(d28s16, d16s16);
+    q12s32 = vmull_s16(d29s16, d16s16);
+
+    q9s32  = vmlsl_s16(q9s32,  d26s16, d16s16);
+    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+    d10s16 = vqrshrn_n_s32(q9s32, 14);
+    d11s16 = vqrshrn_n_s32(q10s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q12s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+    // Output butterfly: sums go to q8..q11, differences to q12..q15.
+    *q8s16  = vaddq_s16(q0s16, q7s16);
+    *q9s16  = vaddq_s16(q1s16, q6s16);
+    *q10s16 = vaddq_s16(q2s16, q5s16);
+    *q11s16 = vaddq_s16(q3s16, q4s16);
+    *q12s16 = vsubq_s16(q3s16, q4s16);
+    *q13s16 = vsubq_s16(q2s16, q5s16);
+    *q14s16 = vsubq_s16(q1s16, q6s16);
+    *q15s16 = vsubq_s16(q0s16, q7s16);
+    return;
+}
+
+static INLINE void IADST8X8_1D(  // one 1-D 8-point inverse ADST pass, in place
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q2s16, q4s16, q5s16, q6s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
+    int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+    // Split every input register into 4-lane halves.
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+    // Inputs q15 and q8 are combined using cospi_2_64 and cospi_30_64.
+    d14s16 = vdup_n_s16(cospi_2_64);
+    d15s16 = vdup_n_s16(cospi_30_64);
+
+    q1s32 = vmull_s16(d30s16, d14s16);
+    q2s32 = vmull_s16(d31s16, d14s16);
+    q3s32 = vmull_s16(d30s16, d15s16);
+    q4s32 = vmull_s16(d31s16, d15s16);
+    // d30/d31 are reused as constants now that input q15 has been consumed.
+    d30s16 = vdup_n_s16(cospi_18_64);
+    d31s16 = vdup_n_s16(cospi_14_64);
+
+    q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
+    q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
+    q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
+    q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
+
+    q5s32 = vmull_s16(d22s16, d30s16);
+    q6s32 = vmull_s16(d23s16, d30s16);
+    q7s32 = vmull_s16(d22s16, d31s16);
+    q8s32 = vmull_s16(d23s16, d31s16);
+
+    q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
+    q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
+    q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
+    q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
+
+    q11s32 = vaddq_s32(q1s32, q5s32);
+    q12s32 = vaddq_s32(q2s32, q6s32);
+    q1s32 = vsubq_s32(q1s32, q5s32);
+    q2s32 = vsubq_s32(q2s32, q6s32);
+    // Narrowing throughout: rounding, saturating shift right by 14.
+    d22s16 = vqrshrn_n_s32(q11s32, 14);
+    d23s16 = vqrshrn_n_s32(q12s32, 14);
+    *q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q12s32 = vaddq_s32(q3s32, q7s32);
+    q15s32 = vaddq_s32(q4s32, q8s32);
+    q3s32 = vsubq_s32(q3s32, q7s32);
+    q4s32 = vsubq_s32(q4s32, q8s32);
+
+    d2s16  = vqrshrn_n_s32(q1s32, 14);
+    d3s16  = vqrshrn_n_s32(q2s32, 14);
+    d24s16 = vqrshrn_n_s32(q12s32, 14);
+    d25s16 = vqrshrn_n_s32(q15s32, 14);
+    d6s16  = vqrshrn_n_s32(q3s32, 14);
+    d7s16  = vqrshrn_n_s32(q4s32, 14);
+    *q12s16 = vcombine_s16(d24s16, d25s16);
+    // Inputs q13 and q10 are combined using cospi_10_64 and cospi_22_64.
+    d0s16 = vdup_n_s16(cospi_10_64);
+    d1s16 = vdup_n_s16(cospi_22_64);
+    q4s32 = vmull_s16(d26s16, d0s16);
+    q5s32 = vmull_s16(d27s16, d0s16);
+    q2s32 = vmull_s16(d26s16, d1s16);
+    q6s32 = vmull_s16(d27s16, d1s16);
+    // Inputs q9 and q14 are combined using cospi_26_64 and cospi_6_64.
+    d30s16 = vdup_n_s16(cospi_26_64);
+    d31s16 = vdup_n_s16(cospi_6_64);
+
+    q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
+    q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
+    q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
+    q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
+
+    q0s32 = vmull_s16(d18s16, d30s16);
+    q13s32 = vmull_s16(d19s16, d30s16);
+
+    q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
+    q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
+
+    q10s32 = vmull_s16(d18s16, d31s16);
+    q9s32 = vmull_s16(d19s16, d31s16);
+
+    q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
+    q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
+
+    q14s32 = vaddq_s32(q2s32, q10s32);
+    q15s32 = vaddq_s32(q6s32, q9s32);
+    q2s32 = vsubq_s32(q2s32, q10s32);
+    q6s32 = vsubq_s32(q6s32, q9s32);
+
+    d28s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d4s16 = vqrshrn_n_s32(q2s32, 14);
+    d5s16 = vqrshrn_n_s32(q6s32, 14);
+    *q14s16 = vcombine_s16(d28s16, d29s16);
+
+    q9s32 = vaddq_s32(q4s32, q0s32);
+    q10s32 = vaddq_s32(q5s32, q13s32);
+    q4s32 = vsubq_s32(q4s32, q0s32);
+    q5s32 = vsubq_s32(q5s32, q13s32);
+    // Next stage combines the narrowed differences using cospi_8/24_64.
+    d30s16 = vdup_n_s16(cospi_8_64);
+    d31s16 = vdup_n_s16(cospi_24_64);
+
+    d18s16 = vqrshrn_n_s32(q9s32, 14);
+    d19s16 = vqrshrn_n_s32(q10s32, 14);
+    d8s16 = vqrshrn_n_s32(q4s32, 14);
+    d9s16 = vqrshrn_n_s32(q5s32, 14);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+
+    q5s32 = vmull_s16(d2s16, d30s16);
+    q6s32 = vmull_s16(d3s16, d30s16);
+    q7s32 = vmull_s16(d2s16, d31s16);
+    q0s32 = vmull_s16(d3s16, d31s16);
+
+    q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
+    q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
+    q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
+    q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
+
+    q1s32 = vmull_s16(d4s16, d30s16);
+    q3s32 = vmull_s16(d5s16, d30s16);
+    q10s32 = vmull_s16(d4s16, d31s16);
+    q2s32 = vmull_s16(d5s16, d31s16);
+
+    q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
+    q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
+    q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
+    q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
+
+    *q8s16 = vaddq_s16(*q11s16, *q9s16);
+    *q11s16 = vsubq_s16(*q11s16, *q9s16);
+    q4s16 = vaddq_s16(*q12s16, *q14s16);
+    *q12s16 = vsubq_s16(*q12s16, *q14s16);
+
+    q14s32 = vaddq_s32(q5s32, q1s32);
+    q15s32 = vaddq_s32(q6s32, q3s32);
+    q5s32 = vsubq_s32(q5s32, q1s32);
+    q6s32 = vsubq_s32(q6s32, q3s32);
+
+    d18s16 = vqrshrn_n_s32(q14s32, 14);
+    d19s16 = vqrshrn_n_s32(q15s32, 14);
+    d10s16 = vqrshrn_n_s32(q5s32, 14);
+    d11s16 = vqrshrn_n_s32(q6s32, 14);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+
+    q1s32 = vaddq_s32(q7s32, q10s32);
+    q3s32 = vaddq_s32(q0s32, q2s32);
+    q7s32 = vsubq_s32(q7s32, q10s32);
+    q0s32 = vsubq_s32(q0s32, q2s32);
+
+    d28s16 = vqrshrn_n_s32(q1s32, 14);
+    d29s16 = vqrshrn_n_s32(q3s32, 14);
+    d14s16 = vqrshrn_n_s32(q7s32, 14);
+    d15s16 = vqrshrn_n_s32(q0s32, 14);
+    *q14s16 = vcombine_s16(d28s16, d29s16);
+    // Last stage: sums and differences scaled by cospi_16_64.
+    d30s16 = vdup_n_s16(cospi_16_64);
+
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    q2s32 = vmull_s16(d22s16, d30s16);
+    q3s32 = vmull_s16(d23s16, d30s16);
+    q13s32 = vmull_s16(d22s16, d30s16);
+    q1s32 = vmull_s16(d23s16, d30s16);
+
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
+    q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
+    q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
+    q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
+
+    d4s16 = vqrshrn_n_s32(q2s32, 14);
+    d5s16 = vqrshrn_n_s32(q3s32, 14);
+    d24s16 = vqrshrn_n_s32(q13s32, 14);
+    d25s16 = vqrshrn_n_s32(q1s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    *q12s16 = vcombine_s16(d24s16, d25s16);
+
+    q13s32 = vmull_s16(d10s16, d30s16);
+    q1s32 = vmull_s16(d11s16, d30s16);
+    q11s32 = vmull_s16(d10s16, d30s16);
+    q0s32 = vmull_s16(d11s16, d30s16);
+
+    q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
+    q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
+    q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
+    q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
+
+    d20s16 = vqrshrn_n_s32(q13s32, 14);
+    d21s16 = vqrshrn_n_s32(q1s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q0s32, 14);
+    *q10s16 = vcombine_s16(d20s16, d21s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+    // Outputs q9, q11, q13 and q15 are negated by subtracting from zero.
+    q5s16 = vdupq_n_s16(0);
+
+    *q9s16  = vsubq_s16(q5s16, *q9s16);
+    *q11s16 = vsubq_s16(q5s16, q2s16);
+    *q13s16 = vsubq_s16(q5s16, q6s16);
+    *q15s16 = vsubq_s16(q5s16, q4s16);
+    return;
+}
+
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
+                            int dest_stride, int tx_type) {
+    int i;
+    uint8_t *d1, *d2;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8;
+    uint64x1_t d0u64, d1u64, d2u64, d3u64;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+    // Inverse-transforms 64 coefficients and adds them into an 8x8 dest block.
+    q8s16  = vld1q_s16(input);
+    q9s16  = vld1q_s16(input + 8);
+    q10s16 = vld1q_s16(input + 8 * 2);
+    q11s16 = vld1q_s16(input + 8 * 3);
+    q12s16 = vld1q_s16(input + 8 * 4);
+    q13s16 = vld1q_s16(input + 8 * 5);
+    q14s16 = vld1q_s16(input + 8 * 6);
+    q15s16 = vld1q_s16(input + 8 * 7);
+    // NOTE: the loads and this transpose go unused when tx_type == 0.
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    switch (tx_type) {
+      case 0:  // idct_idct is not supported. Fall back to C
+        vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
+        return;
+        break;  // unreachable after the return above
+      case 1:  // iadst_idct
+        // generate IDCT constants
+        // GENERATE_IDCT_CONSTANTS
+
+        // first transform rows
+        IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                   &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // generate IADST constants
+        // GENERATE_IADST_CONSTANTS
+
+        // then transform columns
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      case 2:  // idct_iadst
+        // generate IADST constants
+        // GENERATE_IADST_CONSTANTS
+
+        // first transform rows
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // generate IDCT constants
+        // GENERATE_IDCT_CONSTANTS
+
+        // then transform columns
+        IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                   &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      case 3:  // iadst_iadst
+        // generate IADST constants
+        // GENERATE_IADST_CONSTANTS
+
+        // first transform rows
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // then transform columns
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      default:  // unexpected tx_type
+        assert(0);  // compiled out under NDEBUG; control then reaches the add
+        break;
+    }
+    // Final rounding shift right by 5.
+    q8s16 = vrshrq_n_s16(q8s16, 5);
+    q9s16 = vrshrq_n_s16(q9s16, 5);
+    q10s16 = vrshrq_n_s16(q10s16, 5);
+    q11s16 = vrshrq_n_s16(q11s16, 5);
+    q12s16 = vrshrq_n_s16(q12s16, 5);
+    q13s16 = vrshrq_n_s16(q13s16, 5);
+    q14s16 = vrshrq_n_s16(q14s16, 5);
+    q15s16 = vrshrq_n_s16(q15s16, 5);
+    // Two passes of four rows each; d1 is the read cursor, d2 the write cursor.
+    for (d1 = d2 = dest, i = 0; i < 2; i++) {
+        if (i != 0) {
+            q8s16 = q12s16;
+            q9s16 = q13s16;
+            q10s16 = q14s16;
+            q11s16 = q15s16;
+        }
+        // Load four rows of 8 dest bytes each.
+        d0u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d1u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d2u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d3u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        // Widen the dest pixels and add them to the residual.
+        q8u16  = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                          vreinterpret_u8_u64(d0u64));
+        q9u16  = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                          vreinterpret_u8_u64(d1u64));
+        q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                          vreinterpret_u8_u64(d2u64));
+        q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                          vreinterpret_u8_u64(d3u64));
+        // Saturate the sums to unsigned 8 bits.
+        d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+        d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+        d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+        // Write the four rows back over the rows just read.
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+        d2 += dest_stride;
+    }
+    return;
+}
diff --git a/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
new file mode 100644
index 0000000000..6ca83a00c5
--- /dev/null
+++ b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
@@ -0,0 +1,108 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+                                int pitch, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);
+  int16_t *outptr = out;
+  int16_t temp_out[16];
+  uint32_t pos = 45;
+  /* Inverse 16x16 hybrid transform chosen by tx_type, accumulated into dest. */
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:     // DCT in both horizontal and vertical
+      idct16_rows_dspr2(input, outptr, 16);
+      idct16_cols_add_blk_dspr2(out, dest, pitch);
+      break;
+    case ADST_DCT:    // ADST in vertical, DCT in horizontal
+      idct16_rows_dspr2(input, outptr, 16);
+
+      outptr = out;
+      /* each 16-entry group of out is ADST'd, rounded, added to column i */
+      for (i = 0; i < 16; ++i) {
+        iadst16_dspr2(outptr, temp_out);
+
+        for (j = 0; j < 16; ++j)
+          dest[j * pitch + i] =
+                    clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                      + dest[j * pitch + i]);
+        outptr += 16;
+      }
+      break;
+    case DCT_ADST:    // DCT in vertical, ADST in horizontal
+    {
+      int16_t temp_in[16 * 16];
+
+      for (i = 0; i < 16; ++i) {
+        /* prefetch row */
+        prefetch_load((const uint8_t *)(input + 16));
+
+        iadst16_dspr2(input, outptr);
+        input += 16;
+        outptr += 16;
+      }
+      /* transpose the row-pass output into temp_in for the column pass */
+      for (i = 0; i < 16; ++i)
+        for (j = 0; j < 16; ++j)
+            temp_in[j * 16 + i] = out[i * 16 + j];
+
+      idct16_cols_add_blk_dspr2(temp_in, dest, pitch);
+    }
+    break;
+    case ADST_ADST:   // ADST in both directions
+    {
+      int16_t temp_in[16];
+
+      for (i = 0; i < 16; ++i) {
+        /* prefetch row */
+        prefetch_load((const uint8_t *)(input + 16));
+
+        iadst16_dspr2(input, outptr);
+        input += 16;
+        outptr += 16;
+      }
+      /* column i of out is gathered, ADST'd, rounded and added to dest */
+      for (i = 0; i < 16; ++i) {
+        for (j = 0; j < 16; ++j)
+          temp_in[j] = out[j * 16 + i];
+        iadst16_dspr2(temp_in, temp_out);
+        for (j = 0; j < 16; ++j)
+          dest[j * pitch + i] =
+                    clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                      + dest[j * pitch + i]);
+      }
+    }
+    break;
+    default:
+      printf("vp9_iht16x16_256_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
new file mode 100644
index 0000000000..c10979b645
--- /dev/null
+++ b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
@@ -0,0 +1,97 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+  int16_t *outptr = out;
+  int16_t temp_in[4 * 4], temp_out[4];
+  uint32_t pos = 45;
+  /* Inverse 4x4 hybrid transform chosen by tx_type, accumulated into dest. */
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:   // DCT in both horizontal and vertical
+      vpx_idct4_rows_dspr2(input, outptr);
+      vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      break;
+    case ADST_DCT:  // ADST in vertical, DCT in horizontal
+      vpx_idct4_rows_dspr2(input, outptr);
+
+      outptr = out;
+      /* each 4-entry group of out is ADST'd, rounded, added to column i */
+      for (i = 0; i < 4; ++i) {
+        iadst4_dspr2(outptr, temp_out);
+
+        for (j = 0; j < 4; ++j)
+          dest[j * dest_stride + i] =
+                    clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                      + dest[j * dest_stride + i]);
+
+        outptr += 4;
+      }
+      break;
+    case DCT_ADST:  // DCT in vertical, ADST in horizontal
+      for (i = 0; i < 4; ++i) {
+        iadst4_dspr2(input, outptr);
+        input  += 4;
+        outptr += 4;
+      }
+      /* transpose the row-pass output into temp_in for the column pass */
+      for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 4; ++j) {
+          temp_in[i * 4 + j] = out[j * 4 + i];
+        }
+      }
+      vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      break;
+    case ADST_ADST:  // ADST in both directions
+      for (i = 0; i < 4; ++i) {
+        iadst4_dspr2(input, outptr);
+        input  += 4;
+        outptr += 4;
+      }
+      /* column i of out is gathered, ADST'd, rounded and added to dest */
+      for (i = 0; i < 4; ++i) {
+        for (j = 0; j < 4; ++j)
+          temp_in[j] = out[j * 4 + i];
+        iadst4_dspr2(temp_in, temp_out);
+
+        for (j = 0; j < 4; ++j)
+          dest[j * dest_stride + i] =
+                  clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                      + dest[j * dest_stride + i]);
+      }
+      break;
+    default:
+      printf("vp9_iht4x4_16_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
new file mode 100644
index 0000000000..37f3ca9fcb
--- /dev/null
+++ b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -0,0 +1,93 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride, int tx_type) {
+  int i, j;
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  int16_t *outptr = out;
+  int16_t temp_in[8 * 8], temp_out[8];
+  uint32_t pos = 45;
+  /* Inverse 8x8 hybrid transform chosen by tx_type, accumulated into dest. */
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  switch (tx_type) {
+    case DCT_DCT:     // DCT in both horizontal and vertical
+      idct8_rows_dspr2(input, outptr, 8);
+      idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+      break;
+    case ADST_DCT:    // ADST in vertical, DCT in horizontal
+      idct8_rows_dspr2(input, outptr, 8);
+      /* each 8-entry group of out is ADST'd, rounded, added to column i */
+      for (i = 0; i < 8; ++i) {
+        iadst8_dspr2(&out[i * 8], temp_out);
+
+        for (j = 0; j < 8; ++j)
+          dest[j * dest_stride + i] =
+                    clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                      + dest[j * dest_stride + i]);
+      }
+      break;
+    case DCT_ADST:    // DCT in vertical, ADST in horizontal
+      for (i = 0; i < 8; ++i) {
+        iadst8_dspr2(input, outptr);
+        input += 8;
+        outptr += 8;
+      }
+      /* transpose the row-pass output into temp_in for the column pass */
+      for (i = 0; i < 8; ++i) {
+        for (j = 0; j < 8; ++j) {
+          temp_in[i * 8 + j] = out[j * 8 + i];
+        }
+      }
+      idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+      break;
+    case ADST_ADST:   // ADST in both directions
+      for (i = 0; i < 8; ++i) {
+        iadst8_dspr2(input, outptr);
+        input += 8;
+        outptr += 8;
+      }
+      /* column i of out is gathered, ADST'd, rounded and added to dest */
+      for (i = 0; i < 8; ++i) {
+        for (j = 0; j < 8; ++j)
+          temp_in[j] = out[j * 8 + i];
+
+        iadst8_dspr2(temp_in, temp_out);
+
+        for (j = 0; j < 8; ++j)
+          dest[j * dest_stride + i] =
+                clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                      + dest[j * dest_stride + i]);
+      }
+      break;
+    default:
+      printf("vp9_iht8x8_64_add_dspr2 : Invalid tx_type\n");
+      break;
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c b/libs/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c
new file mode 100644
index 0000000000..5adf0aaac1
--- /dev/null
+++ b/libs/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c
@@ -0,0 +1,81 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_enums.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vp9_iht16x16_256_add_msa(const int16_t *input, uint8_t *dst,
+                              int32_t dst_stride, int32_t tx_type) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+  int16_t *out_ptr = &out[0];
+  /* Two passes per tx_type: rows of input into out, then columns into dst. */
+  switch (tx_type) {
+    case DCT_DCT:
+      /* transform rows */
+      for (i = 0; i < 2; ++i) {
+        /* process 16 * 8 block: (i << 7) skips 8 * 16 coefficients */
+        vpx_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+      }
+
+      /* transform columns */
+      for (i = 0; i < 2; ++i) {
+        /* process 8 * 16 block: (i << 3) moves 8 columns over */
+        vpx_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+                                         dst_stride);
+      }
+      break;
+    case ADST_DCT:
+      /* transform rows */
+      for (i = 0; i < 2; ++i) {
+        /* process 16 * 8 block */
+        vpx_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+      }
+
+      /* transform columns */
+      for (i = 0; i < 2; ++i) {
+        vpx_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                          (dst + (i << 3)), dst_stride);
+      }
+      break;
+    case DCT_ADST:
+      /* transform rows */
+      for (i = 0; i < 2; ++i) {
+        /* process 16 * 8 block */
+        vpx_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+      }
+
+      /* transform columns */
+      for (i = 0; i < 2; ++i) {
+        /* process 8 * 16 block */
+        vpx_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+                                         dst_stride);
+      }
+      break;
+    case ADST_ADST:
+      /* transform rows */
+      for (i = 0; i < 2; ++i) {
+        /* process 16 * 8 block */
+        vpx_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+      }
+
+      /* transform columns */
+      for (i = 0; i < 2; ++i) {
+        vpx_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                          (dst + (i << 3)), dst_stride);
+      }
+      break;
+    default:
+      assert(0);  /* tx_type is none of the four handled values */
+      break;
+  }
+}
diff --git a/libs/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c b/libs/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c
new file mode 100644
index 0000000000..75977b11fa
--- /dev/null
+++ b/libs/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_enums.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vp9_iht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
+                           int32_t dst_stride, int32_t tx_type) {
+  v8i16 in0, in1, in2, in3;
+  /* 4x4 inverse hybrid transform in in0..in3, then rounded and added to dst. */
+  /* load vector elements of 4x4 block */
+  LD4x4_SH(input, in0, in1, in2, in3);
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      /* DCT in horizontal */
+      VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      /* DCT in vertical */
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case ADST_DCT:
+      /* DCT in horizontal */
+      VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      /* ADST in vertical */
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case DCT_ADST:
+      /* ADST in horizontal */
+      VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      /* DCT in vertical */
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case ADST_ADST:
+      /* ADST in horizontal */
+      VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      /* ADST in vertical */
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    default:
+      assert(0);  /* invalid tx_type; without asserts execution continues below */
+      break;
+  }
+
+  /* final rounding (add 2^3, divide by 2^4) and shift */
+  SRARI_H4_SH(in0, in1, in2, in3, 4);
+  /* add block and store 4x4 */
+  ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
+}
diff --git a/libs/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c b/libs/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c
new file mode 100644
index 0000000000..65d2993e8a
--- /dev/null
+++ b/libs/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c
@@ -0,0 +1,80 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_enums.h"
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+void vp9_iht8x8_64_add_msa(const int16_t *input, uint8_t *dst,
+                           int32_t dst_stride, int32_t tx_type) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  /* 8x8 inverse hybrid transform in in0..in7, then rounded and added to dst. */
+  /* load vector elements of 8x8 block */
+  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      /* DCT in horizontal */
+      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+      /* DCT in vertical */
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case ADST_DCT:
+      /* DCT in horizontal */
+      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+      /* ADST in vertical */
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case DCT_ADST:
+      /* ADST in horizontal */
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      /* DCT in vertical */
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case ADST_ADST:
+      /* ADST in horizontal */
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      /* ADST in vertical */
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    default:
+      assert(0);  /* invalid tx_type; without asserts execution continues below */
+      break;
+  }
+
+  /* final rounding (add 2^4, divide by 2^5) and shift */
+  SRARI_H4_SH(in0, in1, in2, in3, 5);
+  SRARI_H4_SH(in4, in5, in6, in7, 5);
+
+  /* add block and store 8x8 as two 8x4 halves, top half first */
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+  dst += (4 * dst_stride);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
diff --git a/libs/libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c b/libs/libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c
new file mode 100644
index 0000000000..7257cd629d
--- /dev/null
+++ b/libs/libvpx/vp9/common/mips/msa/vp9_mfqe_msa.c
@@ -0,0 +1,137 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                    uint8_t *dst_ptr, int32_t dst_stride,
+                                    int32_t src_weight) {
+  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  int32_t row;
+  uint64_t src0_d, src1_d, dst0_d, dst1_d;
+  v16i8 src0 = { 0 };
+  v16i8 src1 = { 0 };
+  v16i8 dst0 = { 0 };
+  v16i8 dst1 = { 0 };
+  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+  /* Blend 8x8 src into dst in place; weights sum to 1 << MFQE_PRECISION. */
+  src_wt = __msa_fill_h(src_weight);
+  dst_wt = __msa_fill_h(dst_weight);
+  /* each pass handles four 8-byte rows, so two passes cover the block */
+  for (row = 2; row--;) {
+    LD2(src_ptr, src_stride, src0_d, src1_d);
+    src_ptr += (2 * src_stride);
+    LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
+    INSERT_D2_SB(src0_d, src1_d, src0);
+    INSERT_D2_SB(dst0_d, dst1_d, dst0);
+    /* next two rows; dst_ptr itself only advances after each store below */
+    LD2(src_ptr, src_stride, src0_d, src1_d);
+    src_ptr += (2 * src_stride);
+    LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
+    INSERT_D2_SB(src0_d, src1_d, src1);
+    INSERT_D2_SB(dst0_d, dst1_d, dst1);
+    /* first row pair: src * src_wt + dst * dst_wt, SRARI by MFQE_PRECISION */
+    UNPCK_UB_SH(src0, src_r, src_l);
+    UNPCK_UB_SH(dst0, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+    ST8x2_UB(dst0, dst_ptr, dst_stride);
+    dst_ptr += (2 * dst_stride);
+    /* second row pair, same blend */
+    UNPCK_UB_SH(src1, src_r, src_l);
+    UNPCK_UB_SH(dst1, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+    ST8x2_UB(dst1, dst_ptr, dst_stride);
+    dst_ptr += (2 * dst_stride);
+  }
+}
+
+static void filter_by_weight16x16_msa(const uint8_t *src_ptr,
+                                      int32_t src_stride,
+                                      uint8_t *dst_ptr,
+                                      int32_t dst_stride,
+                                      int32_t src_weight) {
+  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  int32_t row;
+  v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+  /* Blend 16x16 src into dst in place; weights sum to 1 << MFQE_PRECISION. */
+  src_wt = __msa_fill_h(src_weight);
+  dst_wt = __msa_fill_h(dst_weight);
+  /* four 16-byte rows are loaded per pass, so four passes cover the block */
+  for (row = 4; row--;) {
+    LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
+    /* row 0: src * src_wt + dst * dst_wt, SRARI by MFQE_PRECISION, store */
+    UNPCK_UB_SH(src0, src_r, src_l);
+    UNPCK_UB_SH(dst0, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+    /* row 1, same blend */
+    UNPCK_UB_SH(src1, src_r, src_l);
+    UNPCK_UB_SH(dst1, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+    /* row 2, same blend */
+    UNPCK_UB_SH(src2, src_r, src_l);
+    UNPCK_UB_SH(dst2, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+    /* row 3, same blend */
+    UNPCK_UB_SH(src3, src_r, src_l);
+    UNPCK_UB_SH(dst3, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+  }
+}
+/* Public entry point; forwards to the static filter_by_weight8x8_msa(). */
+void vp9_filter_by_weight8x8_msa(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 int src_weight) {
+  filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight);
+}
+/* Public entry point; forwards to the static filter_by_weight16x16_msa(). */
+void vp9_filter_by_weight16x16_msa(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride,
+                                   int src_weight) {
+  filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight);
+}
diff --git a/libs/libvpx/vp9/common/vp9_alloccommon.c b/libs/libvpx/vp9/common/vp9_alloccommon.c
new file mode 100644
index 0000000000..7dd1005d3f
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_alloccommon.c
@@ -0,0 +1,201 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+void lock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&pool->pool_mutex);  // take the pool-wide mutex
+#else
+  (void)pool;  // nothing to lock in this build; the cast only references pool
+#endif
+}
+
+void unlock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&pool->pool_mutex);  // release the pool-wide mutex
+#else
+  (void)pool;  // nothing to unlock in this build; the cast only references pool
+#endif
+}
+
+void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) {
+  // Mode-info grid: align each dimension with ALIGN_POWER_OF_TWO, then shift
+  // it down by MI_SIZE_LOG2; the stride comes from calc_mi_size().
+  cm->mi_cols = (ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2)) >> MI_SIZE_LOG2;
+  cm->mi_rows = (ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2)) >> MI_SIZE_LOG2;
+  cm->mi_stride = calc_mi_size(cm->mi_cols);
+
+  // Macroblock grid: half the mode-info count per dimension, rounded up.
+  cm->mb_rows = (cm->mi_rows + 1) >> 1;
+  cm->mb_cols = (cm->mi_cols + 1) >> 1;
+  cm->MBs = cm->mb_rows * cm->mb_cols;
+}
+
+static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
+  int buf;
+
+  // Allocate every zero-filled ping-pong map; stop at the first failure and
+  // leave cleanup of the earlier ones to the caller.
+  for (buf = 0; buf < NUM_PING_PONG_BUFFERS; ++buf) {
+    uint8_t *const map = (uint8_t *)vpx_calloc(seg_map_size, 1);
+    cm->seg_map_array[buf] = map;
+    if (!map) return 1;
+  }
+  cm->seg_map_alloc_size = seg_map_size;
+  // Buffer 0 starts as the current map and buffer 1 as the previous one.
+  cm->seg_map_idx = 0;
+  cm->prev_seg_map_idx = 1;
+  cm->current_frame_seg_map = cm->seg_map_array[0];
+  // last_frame_seg_map is left untouched when frame_parallel_decode is set.
+  if (!cm->frame_parallel_decode)
+    cm->last_frame_seg_map = cm->seg_map_array[1];
+  return 0;
+}
+
+static void free_seg_map(VP9_COMMON *cm) {
+  int i;
+  // Release both ping-pong maps and clear the aliases that pointed into them.
+  for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+    vpx_free(cm->seg_map_array[i]);
+    cm->seg_map_array[i] = NULL;
+  }
+  cm->seg_map_alloc_size = 0;  // so the next size check re-allocates the maps
+  cm->current_frame_seg_map = NULL;
+
+  if (!cm->frame_parallel_decode) {
+    cm->last_frame_seg_map = NULL;
+  }
+}
+
+void vp9_free_ref_frame_buffers(BufferPool *pool) {
+  int i;
+  // Release every pool slot: raw buffer (if still referenced), mvs, frame buf.
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
+    if (pool->frame_bufs[i].ref_count > 0 &&
+        pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
+      pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
+      pool->frame_bufs[i].ref_count = 0;  // slot no longer holds a reference
+    }
+    vpx_free(pool->frame_bufs[i].mvs);
+    pool->frame_bufs[i].mvs = NULL;
+    vpx_free_frame_buffer(&pool->frame_bufs[i].buf);
+  }
+}
+
+void vp9_free_postproc_buffers(VP9_COMMON *cm) {
+#if CONFIG_VP9_POSTPROC
+  vpx_free_frame_buffer(&cm->post_proc_buffer);
+  vpx_free_frame_buffer(&cm->post_proc_buffer_int);
+#else
+  (void)cm;  // nothing is freed in this build; the cast only references cm
+#endif  // CONFIG_VP9_POSTPROC
+}
+
+void vp9_free_context_buffers(VP9_COMMON *cm) {
+  cm->free_mi(cm);  // mode-info storage, through the free_mi callback on cm
+  free_seg_map(cm);
+  vpx_free(cm->above_context);
+  cm->above_context = NULL;
+  vpx_free(cm->above_seg_context);
+  cm->above_seg_context = NULL;
+  cm->above_context_alloc_cols = 0;  // so the next size check re-allocates
+  vpx_free(cm->lf.lfm);
+  cm->lf.lfm = NULL;
+}
+
+int vp9_alloc_loop_filter(VP9_COMMON *cm) {
+  // An lfm entry carries the bit masks for every 8x8 block of one 64x64 area,
+  // so columns and rows are both converted to counts of 8 (the >> 3).
+  int lfm_rows;
+  vpx_free(cm->lf.lfm);
+  cm->lf.lfm_stride = (cm->mi_cols + (MI_BLOCK_SIZE - 1)) >> 3;
+  lfm_rows = (cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3;
+  cm->lf.lfm = (LOOP_FILTER_MASK *)vpx_calloc(lfm_rows * cm->lf.lfm_stride,
+                                              sizeof(*cm->lf.lfm));
+  // Report failure as 1, success as 0.
+  return cm->lf.lfm == NULL;
+}
+
+int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
+  int new_mi_size;
+  // Sizes cm's context buffers for width x height; returns 0 on success, else 1.
+  vp9_set_mb_mi(cm, width, height);
+  new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+  if (cm->mi_alloc_size < new_mi_size) {
+    cm->free_mi(cm);
+    if (cm->alloc_mi(cm, new_mi_size))
+      goto fail;
+  }
+  // Like the mi storage above, the maps below are only replaced when too small.
+  if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) {
+    // Create the segmentation map structure and set to 0.
+    free_seg_map(cm);
+    if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols))
+      goto fail;
+  }
+  // above_context / above_seg_context grow only when mi_cols exceeds the record.
+  if (cm->above_context_alloc_cols < cm->mi_cols) {
+    vpx_free(cm->above_context);
+    cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
+        2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
+        sizeof(*cm->above_context));
+    if (!cm->above_context) goto fail;
+
+    vpx_free(cm->above_seg_context);
+    cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
+        mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context));
+    if (!cm->above_seg_context) goto fail;
+    cm->above_context_alloc_cols = cm->mi_cols;
+  }
+  // The loop-filter mask array is freed and re-allocated on every call.
+  if (vp9_alloc_loop_filter(cm))
+    goto fail;
+
+  return 0;
+
+ fail:
+  vp9_free_context_buffers(cm);  // frees all context buffers, old and new
+  return 1;
+}
+
+void vp9_remove_common(VP9_COMMON *cm) {
+  vp9_free_context_buffers(cm);
+  // Then free the fc and frame_contexts allocations and clear both pointers.
+  vpx_free(cm->fc);
+  cm->fc = NULL;
+  vpx_free(cm->frame_contexts);
+  cm->frame_contexts = NULL;
+}
+
+void vp9_init_context_buffers(VP9_COMMON *cm) {
+  cm->setup_mi(cm);  // run the setup_mi callback stored on cm
+  if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+    memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);  // zero map
+}
+
+void vp9_swap_current_and_last_seg_map(VP9_COMMON *cm) {
+  // Exchange the two ping-pong indices, then re-point both map aliases at the
+  // buffers the new indices select.
+  const int new_current = cm->prev_seg_map_idx;
+  const int new_last = cm->seg_map_idx;
+  cm->seg_map_idx = new_current;
+  cm->prev_seg_map_idx = new_last;
+  cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+  cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
+}
diff --git a/libs/libvpx/vp9/common/vp9_alloccommon.h b/libs/libvpx/vp9/common/vp9_alloccommon.h
new file mode 100644
index 0000000000..e53955b998
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_alloccommon.h
@@ -0,0 +1,45 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_
+#define VP9_COMMON_VP9_ALLOCCOMMON_H_
+
+#define INVALID_IDX -1  // Invalid buffer index.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9Common;
+struct BufferPool;
+// Forward declarations suffice: every prototype below takes these by pointer.
+void vp9_remove_common(struct VP9Common *cm);
+// The two allocators below return 0 on success and 1 on failure.
+int vp9_alloc_loop_filter(struct VP9Common *cm);
+int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height);
+void vp9_init_context_buffers(struct VP9Common *cm);
+void vp9_free_context_buffers(struct VP9Common *cm);
+
+void vp9_free_ref_frame_buffers(struct BufferPool *pool);
+void vp9_free_postproc_buffers(struct VP9Common *cm);
+// NOTE(review): vp9_alloccommon.c in this patch defines neither of the next two -- confirm they exist elsewhere.
+int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height);
+void vp9_free_state_buffers(struct VP9Common *cm);
+
+void vp9_set_mb_mi(struct VP9Common *cm, int width, int height);
+
+void vp9_swap_current_and_last_seg_map(struct VP9Common *cm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_ALLOCCOMMON_H_
diff --git a/libs/libvpx/vp9/common/vp9_blockd.c b/libs/libvpx/vp9/common/vp9_blockd.c
new file mode 100644
index 0000000000..7bab27d4fd
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_blockd.c
@@ -0,0 +1,135 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_blockd.h"
+
+PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi,
+                                    const MODE_INFO *left_mi, int b) {
+  // Sub-blocks 1 and 3 take the mode stored in cur_mi one entry earlier.
+  if (b != 0 && b != 2) {
+    assert(b == 1 || b == 3);
+    return cur_mi->bmi[b - 1].as_mode;
+  }
+  // Sub-blocks 0 and 2 consult left_mi (entry b + 1); a missing or
+  // inter-coded left_mi yields DC_PRED.
+  return (!left_mi || is_inter_block(left_mi)) ? DC_PRED
+                                               : get_y_mode(left_mi, b + 1);
+}
+
+PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi,
+                                     const MODE_INFO *above_mi, int b) {
+  // Sub-blocks 2 and 3 take the mode stored in cur_mi two entries earlier.
+  if (b != 0 && b != 1) {
+    assert(b == 2 || b == 3);
+    return cur_mi->bmi[b - 2].as_mode;
+  }
+  // Sub-blocks 0 and 1 consult above_mi (entry b + 2); a missing or
+  // inter-coded above_mi yields DC_PRED.
+  return (!above_mi || is_inter_block(above_mi)) ? DC_PRED
+                                                 : get_y_mode(above_mi, b + 2);
+}
+
+// Invokes |visit| for each transform block of |plane| inside a |bsize| block.
+void vp9_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MODE_INFO *const mi = xd->mi[0];
+  // Plane 0 uses the block's own transform size; other planes derive theirs
+  // from it and the plane's subsampling.
+  const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size;
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  const int tx_edge = 1 << tx_size;         // transform edge in 4x4 blocks
+  const int tx_area = 1 << (tx_size << 1);  // 4x4 blocks per transform
+  // A negative mb_to_{right,bottom}_edge means the block extends past the
+  // frame edge into the UMV border; shrink the visited area so transforms
+  // lying wholly inside that border are skipped.
+  const int cut_w =
+      xd->mb_to_right_edge < 0
+          ? xd->mb_to_right_edge >> (5 + pd->subsampling_x) : 0;
+  const int cut_h =
+      xd->mb_to_bottom_edge < 0
+          ? xd->mb_to_bottom_edge >> (5 + pd->subsampling_y) : 0;
+  const int visible_w = blocks_wide + cut_w;
+  const int visible_h = blocks_high + cut_h;
+  // Index advance that covers the transforms skipped at the end of each
+  // row, so |block| keeps numbering the full (unclipped) grid.
+  const int row_skip = ((blocks_wide - visible_w) >> tx_size) * tx_area;
+  int block = 0, row, col;
+
+  for (row = 0; row < visible_h; row += tx_edge) {
+    for (col = 0; col < visible_w; col += tx_edge, block += tx_area)
+      visit(plane, block, plane_bsize, tx_size, arg);
+    block += row_skip;
+  }
+}
+
+// Visits the transform blocks of every plane in turn, from plane 0 up to
+// MAX_MB_PLANE - 1, forwarding |visit| and |arg| unchanged.
+void vp9_foreach_transformed_block(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
+    foreach_transformed_block_visitor visit, void *arg) {
+  int p = 0;
+  while (p < MAX_MB_PLANE)
+    vp9_foreach_transformed_block_in_plane(xd, bsize, p++, visit, arg);
+}
+
+void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
+                      int aoff, int loff) {
+  ENTROPY_CONTEXT *const above = pd->above_context + aoff;
+  ENTROPY_CONTEXT *const left = pd->left_context + loff;
+  const int tx_blocks = 1 << tx_size;
+  int k;
+
+  // Above context. Normally every one of the tx_blocks entries receives
+  // has_eob. When has_eob is set and mb_to_right_edge is negative, only the
+  // entries before blocks_wide receive it and the remaining ones are
+  // cleared.
+  if (!has_eob || xd->mb_to_right_edge >= 0) {
+    memset(above, has_eob, sizeof(ENTROPY_CONTEXT) * tx_blocks);
+  } else {
+    const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] +
+                            (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+    const int inside =
+        tx_blocks + aoff > blocks_wide ? blocks_wide - aoff : tx_blocks;
+
+    for (k = 0; k < inside; ++k)
+      above[k] = has_eob;
+    for (k = inside; k < tx_blocks; ++k)
+      above[k] = 0;
+  }
+
+  // Left context: the same rule, applied against mb_to_bottom_edge.
+  if (!has_eob || xd->mb_to_bottom_edge >= 0) {
+    memset(left, has_eob, sizeof(ENTROPY_CONTEXT) * tx_blocks);
+  } else {
+    const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] +
+                            (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+    const int inside =
+        tx_blocks + loff > blocks_high ? blocks_high - loff : tx_blocks;
+
+    for (k = 0; k < inside; ++k)
+      left[k] = has_eob;
+    for (k = inside; k < tx_blocks; ++k)
+      left[k] = 0;
+  }
+}
+
+// Plane 0 is never subsampled; every other plane takes |ss_x| and |ss_y|.
+void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) {
+  int plane = MAX_MB_PLANE;
+  while (plane-- > 0) {
+    xd->plane[plane].subsampling_x = plane == 0 ? 0 : ss_x;
+    xd->plane[plane].subsampling_y = plane == 0 ? 0 : ss_y;
+  }
+}
diff --git a/libs/libvpx/vp9/common/vp9_blockd.h b/libs/libvpx/vp9/common/vp9_blockd.h
new file mode 100644
index 0000000000..ae2f66a4a7
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_blockd.h
@@ -0,0 +1,305 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_COMMON_VP9_BLOCKD_H_
+#define VP9_COMMON_VP9_BLOCKD_H_
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+#include "vpx_scale/yv12config.h"
+
+#include "vp9/common/vp9_common_data.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_mv.h"
+#include "vp9/common/vp9_scale.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_tile_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_MB_PLANE 3
+
+typedef enum {
+  KEY_FRAME = 0,
+  INTER_FRAME = 1,
+  FRAME_TYPES,
+} FRAME_TYPE;
+
+static INLINE int is_inter_mode(PREDICTION_MODE mode) {
+  return !(mode < NEARESTMV || mode > NEWMV);  // 1 iff in [NEARESTMV, NEWMV]
+}
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+   modes for the Y blocks to the left and above us; for interframes, there
+   is a single probability table. */
+
+typedef struct {
+  PREDICTION_MODE as_mode;
+  int_mv as_mv[2];  // first, second inter predictor motion vectors
+} b_mode_info;
+
+// Note that the rate-distortion optimization loop, bit-stream writer, and
+// decoder implementation modules critically rely on the defined entry values
+// specified herein. They should be refactored concurrently.
+
+#define NONE           (-1)  // parenthesized so it is safe in any expression
+#define INTRA_FRAME     0
+#define LAST_FRAME      1
+#define GOLDEN_FRAME    2
+#define ALTREF_FRAME    3
+#define MAX_REF_FRAMES  4
+typedef int8_t MV_REFERENCE_FRAME;
+
+// This structure now relates to 8x8 block regions.
+typedef struct MODE_INFO {
+  // Common for both INTER and INTRA blocks
+  BLOCK_SIZE sb_type;
+  PREDICTION_MODE mode;
+  TX_SIZE tx_size;
+  int8_t skip;
+  int8_t segment_id;
+  int8_t seg_id_predicted;  // valid only when temporal_update is enabled
+
+  // Only for INTRA blocks
+  PREDICTION_MODE uv_mode;
+
+  // Only for INTER blocks
+  INTERP_FILTER interp_filter;
+  MV_REFERENCE_FRAME ref_frame[2];
+
+  // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead.
+  int_mv mv[2];
+
+  b_mode_info bmi[4];
+} MODE_INFO;
+
+static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) {
+  if (mi->sb_type >= BLOCK_8X8) return mi->mode;  // one mode for the block
+  return mi->bmi[block].as_mode;  // below 8x8: per-entry mode from bmi[]
+}
+
+static INLINE int is_inter_block(const MODE_INFO *mi) {
+  return INTRA_FRAME < mi->ref_frame[0];  // first reference beyond INTRA_FRAME
+}
+
+static INLINE int has_second_ref(const MODE_INFO *mi) {
+  return INTRA_FRAME < mi->ref_frame[1];  // second reference beyond INTRA_FRAME
+}
+
+PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi,
+                                    const MODE_INFO *left_mi, int b);
+
+PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi,
+                                     const MODE_INFO *above_mi, int b);
+
+enum mv_precision {
+  MV_PRECISION_Q3,
+  MV_PRECISION_Q4
+};
+
+struct buf_2d {
+  uint8_t *buf;
+  int stride;
+};
+
+struct macroblockd_plane {
+  tran_low_t *dqcoeff;
+  int subsampling_x;
+  int subsampling_y;
+  struct buf_2d dst;
+  struct buf_2d pre[2];
+  ENTROPY_CONTEXT *above_context;
+  ENTROPY_CONTEXT *left_context;
+  int16_t seg_dequant[MAX_SEGMENTS][2];
+
+  // number of 4x4s in current block
+  uint16_t n4_w, n4_h;
+  // log2 of n4_w, n4_h
+  uint8_t n4_wl, n4_hl;
+
+  // encoder
+  const int16_t *dequant;
+};
+
+#define BLOCK_OFFSET(x, i) ((x) + (i) * 16)
+
+typedef struct RefBuffer {
+  // TODO(dkovalev): idx is not really required and should be removed, now it
+  // is used in vp9_onyxd_if.c
+  int idx;
+  YV12_BUFFER_CONFIG *buf;
+  struct scale_factors sf;
+} RefBuffer;
+
+typedef struct macroblockd {
+  struct macroblockd_plane plane[MAX_MB_PLANE];
+  uint8_t bmode_blocks_wl;
+  uint8_t bmode_blocks_hl;
+
+  FRAME_COUNTS *counts;
+  TileInfo tile;
+
+  int mi_stride;
+
+  MODE_INFO **mi;
+  MODE_INFO *left_mi;
+  MODE_INFO *above_mi;
+
+  int up_available;
+  int left_available;
+
+  const vpx_prob (*partition_probs)[PARTITION_TYPES - 1];
+
+  /* Distance of MB away from frame edges */
+  int mb_to_left_edge;
+  int mb_to_right_edge;
+  int mb_to_top_edge;
+  int mb_to_bottom_edge;
+
+  FRAME_CONTEXT *fc;
+
+  /* pointers to reference frames */
+  RefBuffer *block_refs[2];
+
+  /* pointer to current frame */
+  const YV12_BUFFER_CONFIG *cur_buf;
+
+  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+
+  PARTITION_CONTEXT *above_seg_context;
+  PARTITION_CONTEXT left_seg_context[8];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  /* Bit depth: 8, 10, 12 */
+  int bd;
+#endif
+
+  int lossless;
+  int corrupted;
+
+  struct vpx_internal_error_info *error_info;
+} MACROBLOCKD;
+
+static INLINE PLANE_TYPE get_plane_type(int plane) {
+  return plane > 0 ? (PLANE_TYPE)1 : (PLANE_TYPE)0;  // 0 only for plane <= 0
+}
+
+static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
+                                     PARTITION_TYPE partition) {
+  return subsize_lookup[partition][bsize];  // may be BLOCK_INVALID
+}
+
+extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES];
+
+// Transform type for the current block: an intra-coded, non-lossless Y plane
+// maps mi->mode through the lookup table; every other case is DCT_DCT.
+static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
+                                  const MACROBLOCKD *xd) {
+  const MODE_INFO *const mi = xd->mi[0];
+  const int mode_dependent =
+      plane_type == PLANE_TYPE_Y && !xd->lossless && !is_inter_block(mi);
+  return mode_dependent ? intra_mode_to_tx_type_lookup[mi->mode] : DCT_DCT;
+}
+
+// DCT_DCT unless intra, non-lossless Y; then looked up by get_y_mode(mi, ib).
+static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
+                                      const MACROBLOCKD *xd, int ib) {
+  const MODE_INFO *const mi = xd->mi[0];
+  const int mode_dependent =
+      plane_type == PLANE_TYPE_Y && !xd->lossless && !is_inter_block(mi);
+  if (!mode_dependent) return DCT_DCT;
+  return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)];
+}
+
+void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);
+
+// TX_4X4 for blocks below BLOCK_8X8; otherwise |y_tx_size| capped at the
+// largest transform listed for the block size after (xss, yss) subsampling.
+static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize,
+                                          int xss, int yss) {
+  TX_SIZE uv_max;
+  if (bsize < BLOCK_8X8) return TX_4X4;
+  uv_max = max_txsize_lookup[ss_size_lookup[bsize][xss][yss]];
+  return VPXMIN(y_tx_size, uv_max);
+}
+
+static INLINE TX_SIZE get_uv_tx_size(const MODE_INFO *mi,
+                                     const struct macroblockd_plane *pd) {
+  const int ss_x = pd->subsampling_x, ss_y = pd->subsampling_y;
+  return get_uv_tx_size_impl(mi->tx_size, mi->sb_type, ss_x, ss_y);
+}
+
+static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
+    const struct macroblockd_plane *pd) {  // result may be BLOCK_INVALID
+  return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
+}
+
+// Clears each plane's above/left entropy contexts over the span of |bsize|.
+static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
+  struct macroblockd_plane *pd = xd->plane;
+  for (; pd < xd->plane + MAX_MB_PLANE; ++pd) {
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const int n_above = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int n_left = num_4x4_blocks_high_lookup[plane_bsize];
+    memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * n_above);
+    memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * n_left);
+  }
+}
+
+// Row of vp9_kf_y_mode_prob selected by the modes above and left of |block|.
+static INLINE const vpx_prob *get_y_mode_probs(const MODE_INFO *mi,
+                                               const MODE_INFO *above_mi,
+                                               const MODE_INFO *left_mi,
+                                               int block) {
+  return vp9_kf_y_mode_prob[vp9_above_block_mode(mi, above_mi, block)]
+                           [vp9_left_block_mode(mi, left_mi, block)];
+}
+
+typedef void (*foreach_transformed_block_visitor)(int plane, int block,
+                                                  BLOCK_SIZE plane_bsize,
+                                                  TX_SIZE tx_size,
+                                                  void *arg);
+
+void vp9_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg);
+
+
+void vp9_foreach_transformed_block(
+    const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
+    foreach_transformed_block_visitor visit, void *arg);
+
+// Maps transform-block index |block| to its (*x, *y) position inside the
+// plane block; both outputs are multiples of 1 << tx_size.
+static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
+                                            TX_SIZE tx_size, int block,
+                                            int *x, int *y) {
+  const int cols_log2 = b_width_log2_lookup[plane_bsize] - tx_size;
+  const int tx_index = block >> (tx_size << 1);
+  *x = (tx_index & ((1 << cols_log2) - 1)) << tx_size;
+  *y = (tx_index >> cols_log2) << tx_size;
+}
+
+void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
+                      int aoff, int loff);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_BLOCKD_H_
diff --git a/libs/libvpx/vp9/common/vp9_common.h b/libs/libvpx/vp9/common/vp9_common.h
new file mode 100644
index 0000000000..76e7cd440b
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_common.h
@@ -0,0 +1,75 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_COMMON_H_
+#define VP9_COMMON_VP9_COMMON_H_
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/bitops.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Only need this for fixed-size arrays, for structs just assign.
+#define vp9_copy(dest, src) {              \
+    assert(sizeof(dest) == sizeof(src));   \
+    memcpy((dest), (src), sizeof(src));    \
+  }
+
+// Use this for variably-sized arrays; |n| is the element count (any expr).
+#define vp9_copy_array(dest, src, n) {            \
+    assert(sizeof(*(dest)) == sizeof(*(src)));    \
+    memcpy((dest), (src), (n) * sizeof(*(src)));  \
+  }
+
+#define vp9_zero(dest) memset(&(dest), 0, sizeof(dest))
+#define vp9_zero_array(dest, n) memset((dest), 0, (n) * sizeof(*(dest)))
+
+static INLINE int get_unsigned_bits(unsigned int num_values) {
+  return num_values == 0 ? 0 : get_msb(num_values) + 1;  // 0 when no values
+}
+
+#if CONFIG_DEBUG  /* this variant also reports the allocation site */
+#define CHECK_MEM_ERROR(cm, lval, expr) do { \
+  (lval) = (expr); \
+  if (!(lval)) \
+    vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
+                       "Failed to allocate "#lval" at %s:%d", \
+                       __FILE__, __LINE__); \
+  } while (0)
+#else  /* assigns expr to lval; on a null result raises a memory error */
+#define CHECK_MEM_ERROR(cm, lval, expr) do { \
+  (lval) = (expr); \
+  if (!(lval)) \
+    vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
+                       "Failed to allocate "#lval); \
+  } while (0)
+#endif
+
+#define VP9_SYNC_CODE_0 0x49
+#define VP9_SYNC_CODE_1 0x83
+#define VP9_SYNC_CODE_2 0x42
+
+#define VP9_FRAME_MARKER 0x2
+
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_COMMON_H_
diff --git a/libs/libvpx/vp9/common/vp9_common_data.c b/libs/libvpx/vp9/common/vp9_common_data.c
new file mode 100644
index 0000000000..a6dae6a1c8
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_common_data.c
@@ -0,0 +1,161 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_common_data.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Log 2 conversion lookup tables for block width and height
+const uint8_t b_width_log2_lookup[BLOCK_SIZES] =
+  {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};
+const uint8_t b_height_log2_lookup[BLOCK_SIZES] =
+  {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
+const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
+  {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16};
+const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] =
+  {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16};
+// Log 2 conversion lookup tables for modeinfo width and height
+const uint8_t mi_width_log2_lookup[BLOCK_SIZES] =
+  {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
+const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
+  {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
+const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] =
+  {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
+
+// VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize)))
+const uint8_t size_group_lookup[BLOCK_SIZES] =
+  {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};
+
+const uint8_t num_pels_log2_lookup[BLOCK_SIZES] =
+  {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12};
+
+const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
+  {  // 4X4
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID
+  }, {  // 8X8
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID
+  }, {  // 16X16
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+    PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID
+  }, {  // 32X32
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT,
+    PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
+    PARTITION_INVALID, PARTITION_INVALID
+  }, {  // 64X64
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ,
+    PARTITION_NONE
+  }
+};
+
+const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
+  {     // PARTITION_NONE
+    BLOCK_4X4,   BLOCK_4X8,   BLOCK_8X4,
+    BLOCK_8X8,   BLOCK_8X16,  BLOCK_16X8,
+    BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
+    BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+    BLOCK_64X64,
+  }, {  // PARTITION_HORZ
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X4,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X8,    BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X16,   BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_64X32,
+  }, {  // PARTITION_VERT
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_4X8,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X16,    BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X32,   BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X64,
+  }, {  // PARTITION_SPLIT
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_4X4,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X8,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X16,   BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X32,
+  }
+};
+
+const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
+  TX_4X4,   TX_4X4,   TX_4X4,
+  TX_8X8,   TX_8X8,   TX_8X8,
+  TX_16X16, TX_16X16, TX_16X16,
+  TX_32X32, TX_32X32, TX_32X32, TX_32X32
+};
+
+const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = {
+    BLOCK_4X4,  // TX_4X4
+    BLOCK_8X8,  // TX_8X8
+    BLOCK_16X16,  // TX_16X16
+    BLOCK_32X32,  // TX_32X32
+};
+
+const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
+  TX_4X4,  // ONLY_4X4
+  TX_8X8,  // ALLOW_8X8
+  TX_16X16,  // ALLOW_16X16
+  TX_32X32,  // ALLOW_32X32
+  TX_32X32,  // TX_MODE_SELECT
+};
+
+const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
+//  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
+//  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
+  {{BLOCK_4X4,   BLOCK_INVALID}, {BLOCK_INVALID, BLOCK_INVALID}},
+  {{BLOCK_4X8,   BLOCK_4X4},     {BLOCK_INVALID, BLOCK_INVALID}},
+  {{BLOCK_8X4,   BLOCK_INVALID}, {BLOCK_4X4,     BLOCK_INVALID}},
+  {{BLOCK_8X8,   BLOCK_8X4},     {BLOCK_4X8,     BLOCK_4X4}},
+  {{BLOCK_8X16,  BLOCK_8X8},     {BLOCK_INVALID, BLOCK_4X8}},
+  {{BLOCK_16X8,  BLOCK_INVALID}, {BLOCK_8X8,     BLOCK_8X4}},
+  {{BLOCK_16X16, BLOCK_16X8},    {BLOCK_8X16,    BLOCK_8X8}},
+  {{BLOCK_16X32, BLOCK_16X16},   {BLOCK_INVALID, BLOCK_8X16}},
+  {{BLOCK_32X16, BLOCK_INVALID}, {BLOCK_16X16,   BLOCK_16X8}},
+  {{BLOCK_32X32, BLOCK_32X16},   {BLOCK_16X32,   BLOCK_16X16}},
+  {{BLOCK_32X64, BLOCK_32X32},   {BLOCK_INVALID, BLOCK_16X32}},
+  {{BLOCK_64X32, BLOCK_INVALID}, {BLOCK_32X32,   BLOCK_32X16}},
+  {{BLOCK_64X64, BLOCK_64X32},   {BLOCK_32X64,   BLOCK_32X32}},
+};
+
+// Generates a 4-bit field in which each bit set to 1 represents
+// a blocksize partition: 1111 means we split 64x64, 32x32, 16x16
+// and 8x8; 1000 means we just split the 64x64 to 32x32.
+const struct {
+  PARTITION_CONTEXT above;
+  PARTITION_CONTEXT left;
+} partition_context_lookup[BLOCK_SIZES]= {
+  {15, 15},  // 4X4   - {0b1111, 0b1111}
+  {15, 14},  // 4X8   - {0b1111, 0b1110}
+  {14, 15},  // 8X4   - {0b1110, 0b1111}
+  {14, 14},  // 8X8   - {0b1110, 0b1110}
+  {14, 12},  // 8X16  - {0b1110, 0b1100}
+  {12, 14},  // 16X8  - {0b1100, 0b1110}
+  {12, 12},  // 16X16 - {0b1100, 0b1100}
+  {12, 8 },  // 16X32 - {0b1100, 0b1000}
+  {8,  12},  // 32X16 - {0b1000, 0b1100}
+  {8,  8 },  // 32X32 - {0b1000, 0b1000}
+  {8,  0 },  // 32X64 - {0b1000, 0b0000}
+  {0,  8 },  // 64X32 - {0b0000, 0b1000}
+  {0,  0 },  // 64X64 - {0b0000, 0b0000}
+};
diff --git a/libs/libvpx/vp9/common/vp9_common_data.h b/libs/libvpx/vp9/common/vp9_common_data.h
new file mode 100644
index 0000000000..95a1179617
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_common_data.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_COMMON_DATA_H_
+#define VP9_COMMON_VP9_COMMON_DATA_H_
+
+#include "vp9/common/vp9_enums.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const uint8_t b_width_log2_lookup[BLOCK_SIZES];
+extern const uint8_t b_height_log2_lookup[BLOCK_SIZES];
+extern const uint8_t mi_width_log2_lookup[BLOCK_SIZES];
+extern const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES];
+extern const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES];
+extern const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES];
+extern const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES];
+extern const uint8_t size_group_lookup[BLOCK_SIZES];
+extern const uint8_t num_pels_log2_lookup[BLOCK_SIZES];
+extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES];
+extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
+extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
+extern const BLOCK_SIZE txsize_to_bsize[TX_SIZES];
+extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES];
+extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_COMMON_DATA_H_
diff --git a/libs/libvpx/vp9/common/vp9_debugmodes.c b/libs/libvpx/vp9/common/vp9_debugmodes.c
new file mode 100644
index 0000000000..d9c1fd9686
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_debugmodes.c
@@ -0,0 +1,91 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) {
+  fprintf(f, "%s", str);  // caller-supplied label, printed verbatim
+  fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame,
+          cm->show_frame, cm->base_qindex);
+}
+/* Prints one integer per visible MODE_INFO entry: the value of the member
+ * that lies |member_offset| bytes into the struct and is |member_size| bytes
+ * wide. One-byte members (e.g. the int8_t ref_frame[] entries) are read as a
+ * single signed byte; int-sized members are read as an int. */
+static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor,
+                          size_t member_offset, size_t member_size) {
+  int mi_row, mi_col;
+  MODE_INFO **mi = cm->mi_grid_visible;
+  int rows = cm->mi_rows;
+  int cols = cm->mi_cols;
+  char prefix = descriptor[0];
+
+  log_frame_info(cm, descriptor, file);
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(file, "%c ", prefix);
+    for (mi_col = 0; mi_col < cols; mi_col++, mi++) {
+      const char *const field = (const char *)mi[0] + member_offset;
+      int value = *(const int8_t *)field;
+      if (member_size == sizeof(int)) value = *(const int *)field;
+      fprintf(file, "%2d ", value);
+    }
+    fprintf(file, "\n");
+    mi += 8;  // NOTE(review): assumes the grid stride is cols + 8 - confirm
+  }
+  fprintf(file, "\n");
+}
+
+// (member_offset, member_size) arguments of print_mi_data() for member |m|.
+#define MI_MEMBER(m) offsetof(MODE_INFO, m), sizeof(((MODE_INFO *)0)->m)
+
+// Appends a text dump of the frame's MODE_INFO fields (see the labels below)
+// to |file|; returns without output when |file| cannot be opened.
+void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) {
+  int mi_row, mi_col;
+  FILE *mvs = fopen(file, "a");
+  MODE_INFO **mi = cm->mi_grid_visible;
+
+  if (mvs == NULL) return;  // fopen failed: no stream to print to
+  print_mi_data(cm, mvs, "Partitions:", MI_MEMBER(sb_type));
+  print_mi_data(cm, mvs, "Modes:", MI_MEMBER(mode));
+  print_mi_data(cm, mvs, "Ref frame:", MI_MEMBER(ref_frame[0]));
+  print_mi_data(cm, mvs, "Transform:", MI_MEMBER(tx_size));
+  print_mi_data(cm, mvs, "UV Modes:", MI_MEMBER(uv_mode));
+
+  // output skip information.
+  log_frame_info(cm, "Skips:", mvs);
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+    fprintf(mvs, "S ");
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col++, mi++) {
+      fprintf(mvs, "%2d ", mi[0]->skip);
+    }
+    fprintf(mvs, "\n");
+    mi += 8;
+  }
+  fprintf(mvs, "\n");
+
+  // output motion vectors.
+  log_frame_info(cm, "Vectors ", mvs);
+  mi = cm->mi_grid_visible;
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+    fprintf(mvs, "V ");
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col++, mi++) {
+      fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row,
+                               mi[0]->mv[0].as_mv.col);
+    }
+    fprintf(mvs, "\n");
+    mi += 8;
+  }
+  fprintf(mvs, "\n");
+  fclose(mvs);
+}
diff --git a/libs/libvpx/vp9/common/vp9_entropy.c b/libs/libvpx/vp9/common/vp9_entropy.c
new file mode 100644
index 0000000000..fc022093c7
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_entropy.c
@@ -0,0 +1,804 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx/vpx_integer.h"
+
+// Unconstrained Node Tree
+const vpx_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+  2, 6,                                // 0 = LOW_VAL
+  -TWO_TOKEN, 4,                       // 1 = TWO
+  -THREE_TOKEN, -FOUR_TOKEN,           // 2 = THREE
+  8, 10,                               // 3 = HIGH_LOW
+  -CATEGORY1_TOKEN, -CATEGORY2_TOKEN,  // 4 = CAT_ONE
+  12, 14,                              // 5 = CAT_THREEFOUR
+  -CATEGORY3_TOKEN, -CATEGORY4_TOKEN,  // 6 = CAT_THREE
+  -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 7 = CAT_FIVE
+};
+
+const vpx_prob vp9_cat1_prob[] = { 159 };
+const vpx_prob vp9_cat2_prob[] = { 165, 145 };
+const vpx_prob vp9_cat3_prob[] = { 173, 148, 140 };
+const vpx_prob vp9_cat4_prob[] = { 176, 155, 140, 135 };
+const vpx_prob vp9_cat5_prob[] = { 180, 157, 141, 134, 130 };
+const vpx_prob vp9_cat6_prob[] = {
+    254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
+};
+#if CONFIG_VP9_HIGHBITDEPTH
+const vpx_prob vp9_cat6_prob_high12[] = {
+    255, 255, 255, 255, 254, 254, 254, 252, 249,
+    243, 230, 196, 177, 153, 140, 133, 130, 129
+};
+#endif
+
+const uint8_t vp9_coefband_trans_8x8plus[1024] = {
+  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+  4, 4, 4, 4, 4, 5,
+  // beyond MAXBAND_INDEX+1 all values are filled as 5
+                    5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+};
+
+const uint8_t vp9_coefband_trans_4x4[16] = {
+  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
+};
+
+const uint8_t vp9_pt_energy_class[ENTROPY_TOKENS] = {
+  0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
+};
+
+// Model obtained from a 2-sided zero-centered distribution derived
+// from a Pareto distribution. The cdf of the distribution is:
+// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+//
+// For a given beta and a given probability of the 1-node, the alpha
+// is first solved, and then the {alpha, beta} pair is used to generate
+// the probabilities for the rest of the nodes.
+
+// beta = 8
+
+// Every odd line in this table can be generated from the even lines
+// by averaging :
+// vp9_pareto8_full[l][node] = (vp9_pareto8_full[l-1][node] +
+//                              vp9_pareto8_full[l+1][node] ) >> 1;
+const vpx_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = {
+  {  3,  86, 128,   6,  86,  23,  88,  29},
+  {  6,  86, 128,  11,  87,  42,  91,  52},
+  {  9,  86, 129,  17,  88,  61,  94,  76},
+  { 12,  86, 129,  22,  88,  77,  97,  93},
+  { 15,  87, 129,  28,  89,  93, 100, 110},
+  { 17,  87, 129,  33,  90, 105, 103, 123},
+  { 20,  88, 130,  38,  91, 118, 106, 136},
+  { 23,  88, 130,  43,  91, 128, 108, 146},
+  { 26,  89, 131,  48,  92, 139, 111, 156},
+  { 28,  89, 131,  53,  93, 147, 114, 163},
+  { 31,  90, 131,  58,  94, 156, 117, 171},
+  { 34,  90, 131,  62,  94, 163, 119, 177},
+  { 37,  90, 132,  66,  95, 171, 122, 184},
+  { 39,  90, 132,  70,  96, 177, 124, 189},
+  { 42,  91, 132,  75,  97, 183, 127, 194},
+  { 44,  91, 132,  79,  97, 188, 129, 198},
+  { 47,  92, 133,  83,  98, 193, 132, 202},
+  { 49,  92, 133,  86,  99, 197, 134, 205},
+  { 52,  93, 133,  90, 100, 201, 137, 208},
+  { 54,  93, 133,  94, 100, 204, 139, 211},
+  { 57,  94, 134,  98, 101, 208, 142, 214},
+  { 59,  94, 134, 101, 102, 211, 144, 216},
+  { 62,  94, 135, 105, 103, 214, 146, 218},
+  { 64,  94, 135, 108, 103, 216, 148, 220},
+  { 66,  95, 135, 111, 104, 219, 151, 222},
+  { 68,  95, 135, 114, 105, 221, 153, 223},
+  { 71,  96, 136, 117, 106, 224, 155, 225},
+  { 73,  96, 136, 120, 106, 225, 157, 226},
+  { 76,  97, 136, 123, 107, 227, 159, 228},
+  { 78,  97, 136, 126, 108, 229, 160, 229},
+  { 80,  98, 137, 129, 109, 231, 162, 231},
+  { 82,  98, 137, 131, 109, 232, 164, 232},
+  { 84,  98, 138, 134, 110, 234, 166, 233},
+  { 86,  98, 138, 137, 111, 235, 168, 234},
+  { 89,  99, 138, 140, 112, 236, 170, 235},
+  { 91,  99, 138, 142, 112, 237, 171, 235},
+  { 93, 100, 139, 145, 113, 238, 173, 236},
+  { 95, 100, 139, 147, 114, 239, 174, 237},
+  { 97, 101, 140, 149, 115, 240, 176, 238},
+  { 99, 101, 140, 151, 115, 241, 177, 238},
+  {101, 102, 140, 154, 116, 242, 179, 239},
+  {103, 102, 140, 156, 117, 242, 180, 239},
+  {105, 103, 141, 158, 118, 243, 182, 240},
+  {107, 103, 141, 160, 118, 243, 183, 240},
+  {109, 104, 141, 162, 119, 244, 185, 241},
+  {111, 104, 141, 164, 119, 244, 186, 241},
+  {113, 104, 142, 166, 120, 245, 187, 242},
+  {114, 104, 142, 168, 121, 245, 188, 242},
+  {116, 105, 143, 170, 122, 246, 190, 243},
+  {118, 105, 143, 171, 122, 246, 191, 243},
+  {120, 106, 143, 173, 123, 247, 192, 244},
+  {121, 106, 143, 175, 124, 247, 193, 244},
+  {123, 107, 144, 177, 125, 248, 195, 244},
+  {125, 107, 144, 178, 125, 248, 196, 244},
+  {127, 108, 145, 180, 126, 249, 197, 245},
+  {128, 108, 145, 181, 127, 249, 198, 245},
+  {130, 109, 145, 183, 128, 249, 199, 245},
+  {132, 109, 145, 184, 128, 249, 200, 245},
+  {134, 110, 146, 186, 129, 250, 201, 246},
+  {135, 110, 146, 187, 130, 250, 202, 246},
+  {137, 111, 147, 189, 131, 251, 203, 246},
+  {138, 111, 147, 190, 131, 251, 204, 246},
+  {140, 112, 147, 192, 132, 251, 205, 247},
+  {141, 112, 147, 193, 132, 251, 206, 247},
+  {143, 113, 148, 194, 133, 251, 207, 247},
+  {144, 113, 148, 195, 134, 251, 207, 247},
+  {146, 114, 149, 197, 135, 252, 208, 248},
+  {147, 114, 149, 198, 135, 252, 209, 248},
+  {149, 115, 149, 199, 136, 252, 210, 248},
+  {150, 115, 149, 200, 137, 252, 210, 248},
+  {152, 115, 150, 201, 138, 252, 211, 248},
+  {153, 115, 150, 202, 138, 252, 212, 248},
+  {155, 116, 151, 204, 139, 253, 213, 249},
+  {156, 116, 151, 205, 139, 253, 213, 249},
+  {158, 117, 151, 206, 140, 253, 214, 249},
+  {159, 117, 151, 207, 141, 253, 215, 249},
+  {161, 118, 152, 208, 142, 253, 216, 249},
+  {162, 118, 152, 209, 142, 253, 216, 249},
+  {163, 119, 153, 210, 143, 253, 217, 249},
+  {164, 119, 153, 211, 143, 253, 217, 249},
+  {166, 120, 153, 212, 144, 254, 218, 250},
+  {167, 120, 153, 212, 145, 254, 219, 250},
+  {168, 121, 154, 213, 146, 254, 220, 250},
+  {169, 121, 154, 214, 146, 254, 220, 250},
+  {171, 122, 155, 215, 147, 254, 221, 250},
+  {172, 122, 155, 216, 147, 254, 221, 250},
+  {173, 123, 155, 217, 148, 254, 222, 250},
+  {174, 123, 155, 217, 149, 254, 222, 250},
+  {176, 124, 156, 218, 150, 254, 223, 250},
+  {177, 124, 156, 219, 150, 254, 223, 250},
+  {178, 125, 157, 220, 151, 254, 224, 251},
+  {179, 125, 157, 220, 151, 254, 224, 251},
+  {180, 126, 157, 221, 152, 254, 225, 251},
+  {181, 126, 157, 221, 152, 254, 225, 251},
+  {183, 127, 158, 222, 153, 254, 226, 251},
+  {184, 127, 158, 223, 154, 254, 226, 251},
+  {185, 128, 159, 224, 155, 255, 227, 251},
+  {186, 128, 159, 224, 155, 255, 227, 251},
+  {187, 129, 160, 225, 156, 255, 228, 251},
+  {188, 130, 160, 225, 156, 255, 228, 251},
+  {189, 131, 160, 226, 157, 255, 228, 251},
+  {190, 131, 160, 226, 158, 255, 228, 251},
+  {191, 132, 161, 227, 159, 255, 229, 251},
+  {192, 132, 161, 227, 159, 255, 229, 251},
+  {193, 133, 162, 228, 160, 255, 230, 252},
+  {194, 133, 162, 229, 160, 255, 230, 252},
+  {195, 134, 163, 230, 161, 255, 231, 252},
+  {196, 134, 163, 230, 161, 255, 231, 252},
+  {197, 135, 163, 231, 162, 255, 231, 252},
+  {198, 135, 163, 231, 162, 255, 231, 252},
+  {199, 136, 164, 232, 163, 255, 232, 252},
+  {200, 136, 164, 232, 164, 255, 232, 252},
+  {201, 137, 165, 233, 165, 255, 233, 252},
+  {201, 137, 165, 233, 165, 255, 233, 252},
+  {202, 138, 166, 233, 166, 255, 233, 252},
+  {203, 138, 166, 233, 166, 255, 233, 252},
+  {204, 139, 166, 234, 167, 255, 234, 252},
+  {205, 139, 166, 234, 167, 255, 234, 252},
+  {206, 140, 167, 235, 168, 255, 235, 252},
+  {206, 140, 167, 235, 168, 255, 235, 252},
+  {207, 141, 168, 236, 169, 255, 235, 252},
+  {208, 141, 168, 236, 170, 255, 235, 252},
+  {209, 142, 169, 237, 171, 255, 236, 252},
+  {209, 143, 169, 237, 171, 255, 236, 252},
+  {210, 144, 169, 237, 172, 255, 236, 252},
+  {211, 144, 169, 237, 172, 255, 236, 252},
+  {212, 145, 170, 238, 173, 255, 237, 252},
+  {213, 145, 170, 238, 173, 255, 237, 252},
+  {214, 146, 171, 239, 174, 255, 237, 253},
+  {214, 146, 171, 239, 174, 255, 237, 253},
+  {215, 147, 172, 240, 175, 255, 238, 253},
+  {215, 147, 172, 240, 175, 255, 238, 253},
+  {216, 148, 173, 240, 176, 255, 238, 253},
+  {217, 148, 173, 240, 176, 255, 238, 253},
+  {218, 149, 173, 241, 177, 255, 239, 253},
+  {218, 149, 173, 241, 178, 255, 239, 253},
+  {219, 150, 174, 241, 179, 255, 239, 253},
+  {219, 151, 174, 241, 179, 255, 239, 253},
+  {220, 152, 175, 242, 180, 255, 240, 253},
+  {221, 152, 175, 242, 180, 255, 240, 253},
+  {222, 153, 176, 242, 181, 255, 240, 253},
+  {222, 153, 176, 242, 181, 255, 240, 253},
+  {223, 154, 177, 243, 182, 255, 240, 253},
+  {223, 154, 177, 243, 182, 255, 240, 253},
+  {224, 155, 178, 244, 183, 255, 241, 253},
+  {224, 155, 178, 244, 183, 255, 241, 253},
+  {225, 156, 178, 244, 184, 255, 241, 253},
+  {225, 157, 178, 244, 184, 255, 241, 253},
+  {226, 158, 179, 244, 185, 255, 242, 253},
+  {227, 158, 179, 244, 185, 255, 242, 253},
+  {228, 159, 180, 245, 186, 255, 242, 253},
+  {228, 159, 180, 245, 186, 255, 242, 253},
+  {229, 160, 181, 245, 187, 255, 242, 253},
+  {229, 160, 181, 245, 187, 255, 242, 253},
+  {230, 161, 182, 246, 188, 255, 243, 253},
+  {230, 162, 182, 246, 188, 255, 243, 253},
+  {231, 163, 183, 246, 189, 255, 243, 253},
+  {231, 163, 183, 246, 189, 255, 243, 253},
+  {232, 164, 184, 247, 190, 255, 243, 253},
+  {232, 164, 184, 247, 190, 255, 243, 253},
+  {233, 165, 185, 247, 191, 255, 244, 253},
+  {233, 165, 185, 247, 191, 255, 244, 253},
+  {234, 166, 185, 247, 192, 255, 244, 253},
+  {234, 167, 185, 247, 192, 255, 244, 253},
+  {235, 168, 186, 248, 193, 255, 244, 253},
+  {235, 168, 186, 248, 193, 255, 244, 253},
+  {236, 169, 187, 248, 194, 255, 244, 253},
+  {236, 169, 187, 248, 194, 255, 244, 253},
+  {236, 170, 188, 248, 195, 255, 245, 253},
+  {236, 170, 188, 248, 195, 255, 245, 253},
+  {237, 171, 189, 249, 196, 255, 245, 254},
+  {237, 172, 189, 249, 196, 255, 245, 254},
+  {238, 173, 190, 249, 197, 255, 245, 254},
+  {238, 173, 190, 249, 197, 255, 245, 254},
+  {239, 174, 191, 249, 198, 255, 245, 254},
+  {239, 174, 191, 249, 198, 255, 245, 254},
+  {240, 175, 192, 249, 199, 255, 246, 254},
+  {240, 176, 192, 249, 199, 255, 246, 254},
+  {240, 177, 193, 250, 200, 255, 246, 254},
+  {240, 177, 193, 250, 200, 255, 246, 254},
+  {241, 178, 194, 250, 201, 255, 246, 254},
+  {241, 178, 194, 250, 201, 255, 246, 254},
+  {242, 179, 195, 250, 202, 255, 246, 254},
+  {242, 180, 195, 250, 202, 255, 246, 254},
+  {242, 181, 196, 250, 203, 255, 247, 254},
+  {242, 181, 196, 250, 203, 255, 247, 254},
+  {243, 182, 197, 251, 204, 255, 247, 254},
+  {243, 183, 197, 251, 204, 255, 247, 254},
+  {244, 184, 198, 251, 205, 255, 247, 254},
+  {244, 184, 198, 251, 205, 255, 247, 254},
+  {244, 185, 199, 251, 206, 255, 247, 254},
+  {244, 185, 199, 251, 206, 255, 247, 254},
+  {245, 186, 200, 251, 207, 255, 247, 254},
+  {245, 187, 200, 251, 207, 255, 247, 254},
+  {246, 188, 201, 252, 207, 255, 248, 254},
+  {246, 188, 201, 252, 207, 255, 248, 254},
+  {246, 189, 202, 252, 208, 255, 248, 254},
+  {246, 190, 202, 252, 208, 255, 248, 254},
+  {247, 191, 203, 252, 209, 255, 248, 254},
+  {247, 191, 203, 252, 209, 255, 248, 254},
+  {247, 192, 204, 252, 210, 255, 248, 254},
+  {247, 193, 204, 252, 210, 255, 248, 254},
+  {248, 194, 205, 252, 211, 255, 248, 254},
+  {248, 194, 205, 252, 211, 255, 248, 254},
+  {248, 195, 206, 252, 212, 255, 249, 254},
+  {248, 196, 206, 252, 212, 255, 249, 254},
+  {249, 197, 207, 253, 213, 255, 249, 254},
+  {249, 197, 207, 253, 213, 255, 249, 254},
+  {249, 198, 208, 253, 214, 255, 249, 254},
+  {249, 199, 209, 253, 214, 255, 249, 254},
+  {250, 200, 210, 253, 215, 255, 249, 254},
+  {250, 200, 210, 253, 215, 255, 249, 254},
+  {250, 201, 211, 253, 215, 255, 249, 254},
+  {250, 202, 211, 253, 215, 255, 249, 254},
+  {250, 203, 212, 253, 216, 255, 249, 254},
+  {250, 203, 212, 253, 216, 255, 249, 254},
+  {251, 204, 213, 253, 217, 255, 250, 254},
+  {251, 205, 213, 253, 217, 255, 250, 254},
+  {251, 206, 214, 254, 218, 255, 250, 254},
+  {251, 206, 215, 254, 218, 255, 250, 254},
+  {252, 207, 216, 254, 219, 255, 250, 254},
+  {252, 208, 216, 254, 219, 255, 250, 254},
+  {252, 209, 217, 254, 220, 255, 250, 254},
+  {252, 210, 217, 254, 220, 255, 250, 254},
+  {252, 211, 218, 254, 221, 255, 250, 254},
+  {252, 212, 218, 254, 221, 255, 250, 254},
+  {253, 213, 219, 254, 222, 255, 250, 254},
+  {253, 213, 220, 254, 222, 255, 250, 254},
+  {253, 214, 221, 254, 223, 255, 250, 254},
+  {253, 215, 221, 254, 223, 255, 250, 254},
+  {253, 216, 222, 254, 224, 255, 251, 254},
+  {253, 217, 223, 254, 224, 255, 251, 254},
+  {253, 218, 224, 254, 225, 255, 251, 254},
+  {253, 219, 224, 254, 225, 255, 251, 254},
+  {254, 220, 225, 254, 225, 255, 251, 254},
+  {254, 221, 226, 254, 225, 255, 251, 254},
+  {254, 222, 227, 255, 226, 255, 251, 254},
+  {254, 223, 227, 255, 226, 255, 251, 254},
+  {254, 224, 228, 255, 227, 255, 251, 254},
+  {254, 225, 229, 255, 227, 255, 251, 254},
+  {254, 226, 230, 255, 228, 255, 251, 254},
+  {254, 227, 230, 255, 229, 255, 251, 254},
+  {255, 228, 231, 255, 230, 255, 251, 254},
+  {255, 229, 232, 255, 230, 255, 251, 254},
+  {255, 230, 233, 255, 231, 255, 252, 254},
+  {255, 231, 234, 255, 231, 255, 252, 254},
+  {255, 232, 235, 255, 232, 255, 252, 254},
+  {255, 233, 236, 255, 232, 255, 252, 254},
+  {255, 235, 237, 255, 233, 255, 252, 254},
+  {255, 236, 238, 255, 234, 255, 252, 254},
+  {255, 238, 240, 255, 235, 255, 252, 255},
+  {255, 239, 241, 255, 235, 255, 252, 254},
+  {255, 241, 243, 255, 236, 255, 252, 254},
+  {255, 243, 245, 255, 237, 255, 252, 254},
+  {255, 246, 247, 255, 239, 255, 253, 255},
+};
+
+static const vp9_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = {
+  {  // Y plane
+    {  // Intra
+      {  // Band 0
+        { 195,  29, 183 }, {  84,  49, 136 }, {   8,  42,  71 }
+      }, {  // Band 1
+        {  31, 107, 169 }, {  35,  99, 159 }, {  17,  82, 140 },
+        {   8,  66, 114 }, {   2,  44,  76 }, {   1,  19,  32 }
+      }, {  // Band 2
+        {  40, 132, 201 }, {  29, 114, 187 }, {  13,  91, 157 },
+        {   7,  75, 127 }, {   3,  58,  95 }, {   1,  28,  47 }
+      }, {  // Band 3
+        {  69, 142, 221 }, {  42, 122, 201 }, {  15,  91, 159 },
+        {   6,  67, 121 }, {   1,  42,  77 }, {   1,  17,  31 }
+      }, {  // Band 4
+        { 102, 148, 228 }, {  67, 117, 204 }, {  17,  82, 154 },
+        {   6,  59, 114 }, {   2,  39,  75 }, {   1,  15,  29 }
+      }, {  // Band 5
+        { 156,  57, 233 }, { 119,  57, 212 }, {  58,  48, 163 },
+        {  29,  40, 124 }, {  12,  30,  81 }, {   3,  12,  31 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        { 191, 107, 226 }, { 124, 117, 204 }, {  25,  99, 155 }
+      }, {  // Band 1
+        {  29, 148, 210 }, {  37, 126, 194 }, {   8,  93, 157 },
+        {   2,  68, 118 }, {   1,  39,  69 }, {   1,  17,  33 }
+      }, {  // Band 2
+        {  41, 151, 213 }, {  27, 123, 193 }, {   3,  82, 144 },
+        {   1,  58, 105 }, {   1,  32,  60 }, {   1,  13,  26 }
+      }, {  // Band 3
+        {  59, 159, 220 }, {  23, 126, 198 }, {   4,  88, 151 },
+        {   1,  66, 114 }, {   1,  38,  71 }, {   1,  18,  34 }
+      }, {  // Band 4
+        { 114, 136, 232 }, {  51, 114, 207 }, {  11,  83, 155 },
+        {   3,  56, 105 }, {   1,  33,  65 }, {   1,  17,  34 }
+      }, {  // Band 5
+        { 149,  65, 234 }, { 121,  57, 215 }, {  61,  49, 166 },
+        {  28,  36, 114 }, {  12,  25,  76 }, {   3,  16,  42 }
+      }
+    }
+  }, {  // UV plane
+    {  // Intra
+      {  // Band 0
+        { 214,  49, 220 }, { 132,  63, 188 }, {  42,  65, 137 }
+      }, {  // Band 1
+        {  85, 137, 221 }, { 104, 131, 216 }, {  49, 111, 192 },
+        {  21,  87, 155 }, {   2,  49,  87 }, {   1,  16,  28 }
+      }, {  // Band 2
+        {  89, 163, 230 }, {  90, 137, 220 }, {  29, 100, 183 },
+        {  10,  70, 135 }, {   2,  42,  81 }, {   1,  17,  33 }
+      }, {  // Band 3
+        { 108, 167, 237 }, {  55, 133, 222 }, {  15,  97, 179 },
+        {   4,  72, 135 }, {   1,  45,  85 }, {   1,  19,  38 }
+      }, {  // Band 4
+        { 124, 146, 240 }, {  66, 124, 224 }, {  17,  88, 175 },
+        {   4,  58, 122 }, {   1,  36,  75 }, {   1,  18,  37 }
+      }, {  //  Band 5
+        { 141,  79, 241 }, { 126,  70, 227 }, {  66,  58, 182 },
+        {  30,  44, 136 }, {  12,  34,  96 }, {   2,  20,  47 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        { 229,  99, 249 }, { 143, 111, 235 }, {  46, 109, 192 }
+      }, {  // Band 1
+        {  82, 158, 236 }, {  94, 146, 224 }, {  25, 117, 191 },
+        {   9,  87, 149 }, {   3,  56,  99 }, {   1,  33,  57 }
+      }, {  // Band 2
+        {  83, 167, 237 }, {  68, 145, 222 }, {  10, 103, 177 },
+        {   2,  72, 131 }, {   1,  41,  79 }, {   1,  20,  39 }
+      }, {  // Band 3
+        {  99, 167, 239 }, {  47, 141, 224 }, {  10, 104, 178 },
+        {   2,  73, 133 }, {   1,  44,  85 }, {   1,  22,  47 }
+      }, {  // Band 4
+        { 127, 145, 243 }, {  71, 129, 228 }, {  17,  93, 177 },
+        {   3,  61, 124 }, {   1,  41,  84 }, {   1,  21,  52 }
+      }, {  // Band 5
+        { 157,  78, 244 }, { 140,  72, 231 }, {  69,  58, 184 },
+        {  31,  44, 137 }, {  14,  38, 105 }, {   8,  23,  61 }
+      }
+    }
+  }
+};
+
+static const vp9_coeff_probs_model default_coef_probs_8x8[PLANE_TYPES] = {
+  {  // Y plane
+    {  // Intra
+      {  // Band 0
+        { 125,  34, 187 }, {  52,  41, 133 }, {   6,  31,  56 }
+      }, {  // Band 1
+        {  37, 109, 153 }, {  51, 102, 147 }, {  23,  87, 128 },
+        {   8,  67, 101 }, {   1,  41,  63 }, {   1,  19,  29 }
+      }, {  // Band 2
+        {  31, 154, 185 }, {  17, 127, 175 }, {   6,  96, 145 },
+        {   2,  73, 114 }, {   1,  51,  82 }, {   1,  28,  45 }
+      }, {  // Band 3
+        {  23, 163, 200 }, {  10, 131, 185 }, {   2,  93, 148 },
+        {   1,  67, 111 }, {   1,  41,  69 }, {   1,  14,  24 }
+      }, {  // Band 4
+        {  29, 176, 217 }, {  12, 145, 201 }, {   3, 101, 156 },
+        {   1,  69, 111 }, {   1,  39,  63 }, {   1,  14,  23 }
+      }, {  // Band 5
+        {  57, 192, 233 }, {  25, 154, 215 }, {   6, 109, 167 },
+        {   3,  78, 118 }, {   1,  48,  69 }, {   1,  21,  29 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        { 202, 105, 245 }, { 108, 106, 216 }, {  18,  90, 144 }
+      }, {  // Band 1
+        {  33, 172, 219 }, {  64, 149, 206 }, {  14, 117, 177 },
+        {   5,  90, 141 }, {   2,  61,  95 }, {   1,  37,  57 }
+      }, {  // Band 2
+        {  33, 179, 220 }, {  11, 140, 198 }, {   1,  89, 148 },
+        {   1,  60, 104 }, {   1,  33,  57 }, {   1,  12,  21 }
+      }, {  // Band 3
+        {  30, 181, 221 }, {   8, 141, 198 }, {   1,  87, 145 },
+        {   1,  58, 100 }, {   1,  31,  55 }, {   1,  12,  20 }
+      }, {  // Band 4
+        {  32, 186, 224 }, {   7, 142, 198 }, {   1,  86, 143 },
+        {   1,  58, 100 }, {   1,  31,  55 }, {   1,  12,  22 }
+      }, {  // Band 5
+        {  57, 192, 227 }, {  20, 143, 204 }, {   3,  96, 154 },
+        {   1,  68, 112 }, {   1,  42,  69 }, {   1,  19,  32 }
+      }
+    }
+  }, {  // UV plane
+    {  // Intra
+      {  // Band 0
+        { 212,  35, 215 }, { 113,  47, 169 }, {  29,  48, 105 }
+      }, {  // Band 1
+        {  74, 129, 203 }, { 106, 120, 203 }, {  49, 107, 178 },
+        {  19,  84, 144 }, {   4,  50,  84 }, {   1,  15,  25 }
+      }, {  // Band 2
+        {  71, 172, 217 }, {  44, 141, 209 }, {  15, 102, 173 },
+        {   6,  76, 133 }, {   2,  51,  89 }, {   1,  24,  42 }
+      }, {  // Band 3
+        {  64, 185, 231 }, {  31, 148, 216 }, {   8, 103, 175 },
+        {   3,  74, 131 }, {   1,  46,  81 }, {   1,  18,  30 }
+      }, {  // Band 4
+        {  65, 196, 235 }, {  25, 157, 221 }, {   5, 105, 174 },
+        {   1,  67, 120 }, {   1,  38,  69 }, {   1,  15,  30 }
+      }, {  // Band 5
+        {  65, 204, 238 }, {  30, 156, 224 }, {   7, 107, 177 },
+        {   2,  70, 124 }, {   1,  42,  73 }, {   1,  18,  34 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        { 225,  86, 251 }, { 144, 104, 235 }, {  42,  99, 181 }
+      }, {  // Band 1
+        {  85, 175, 239 }, { 112, 165, 229 }, {  29, 136, 200 },
+        {  12, 103, 162 }, {   6,  77, 123 }, {   2,  53,  84 }
+      }, {  // Band 2
+        {  75, 183, 239 }, {  30, 155, 221 }, {   3, 106, 171 },
+        {   1,  74, 128 }, {   1,  44,  76 }, {   1,  17,  28 }
+      }, {  // Band 3
+        {  73, 185, 240 }, {  27, 159, 222 }, {   2, 107, 172 },
+        {   1,  75, 127 }, {   1,  42,  73 }, {   1,  17,  29 }
+      }, {  // Band 4
+        {  62, 190, 238 }, {  21, 159, 222 }, {   2, 107, 172 },
+        {   1,  72, 122 }, {   1,  40,  71 }, {   1,  18,  32 }
+      }, {  // Band 5
+        {  61, 199, 240 }, {  27, 161, 226 }, {   4, 113, 180 },
+        {   1,  76, 129 }, {   1,  46,  80 }, {   1,  23,  41 }
+      }
+    }
+  }
+};
+
+static const vp9_coeff_probs_model default_coef_probs_16x16[PLANE_TYPES] = {
+  {  // Y plane
+    {  // Intra
+      {  // Band 0
+        {   7,  27, 153 }, {   5,  30,  95 }, {   1,  16,  30 }
+      }, {  // Band 1
+        {  50,  75, 127 }, {  57,  75, 124 }, {  27,  67, 108 },
+        {  10,  54,  86 }, {   1,  33,  52 }, {   1,  12,  18 }
+      }, {  // Band 2
+        {  43, 125, 151 }, {  26, 108, 148 }, {   7,  83, 122 },
+        {   2,  59,  89 }, {   1,  38,  60 }, {   1,  17,  27 }
+      }, {  // Band 3
+        {  23, 144, 163 }, {  13, 112, 154 }, {   2,  75, 117 },
+        {   1,  50,  81 }, {   1,  31,  51 }, {   1,  14,  23 }
+      }, {  // Band 4
+        {  18, 162, 185 }, {   6, 123, 171 }, {   1,  78, 125 },
+        {   1,  51,  86 }, {   1,  31,  54 }, {   1,  14,  23 }
+      }, {  // Band 5
+        {  15, 199, 227 }, {   3, 150, 204 }, {   1,  91, 146 },
+        {   1,  55,  95 }, {   1,  30,  53 }, {   1,  11,  20 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        {  19,  55, 240 }, {  19,  59, 196 }, {   3,  52, 105 }
+      }, {  // Band 1
+        {  41, 166, 207 }, { 104, 153, 199 }, {  31, 123, 181 },
+        {  14, 101, 152 }, {   5,  72, 106 }, {   1,  36,  52 }
+      }, {  // Band 2
+        {  35, 176, 211 }, {  12, 131, 190 }, {   2,  88, 144 },
+        {   1,  60, 101 }, {   1,  36,  60 }, {   1,  16,  28 }
+      }, {  // Band 3
+        {  28, 183, 213 }, {   8, 134, 191 }, {   1,  86, 142 },
+        {   1,  56,  96 }, {   1,  30,  53 }, {   1,  12,  20 }
+      }, {  // Band 4
+        {  20, 190, 215 }, {   4, 135, 192 }, {   1,  84, 139 },
+        {   1,  53,  91 }, {   1,  28,  49 }, {   1,  11,  20 }
+      }, {  // Band 5
+        {  13, 196, 216 }, {   2, 137, 192 }, {   1,  86, 143 },
+        {   1,  57,  99 }, {   1,  32,  56 }, {   1,  13,  24 }
+      }
+    }
+  }, {  // UV plane
+    {  // Intra
+      {  // Band 0
+        { 211,  29, 217 }, {  96,  47, 156 }, {  22,  43,  87 }
+      }, {  // Band 1
+        {  78, 120, 193 }, { 111, 116, 186 }, {  46, 102, 164 },
+        {  15,  80, 128 }, {   2,  49,  76 }, {   1,  18,  28 }
+      }, {  // Band 2
+        {  71, 161, 203 }, {  42, 132, 192 }, {  10,  98, 150 },
+        {   3,  69, 109 }, {   1,  44,  70 }, {   1,  18,  29 }
+      }, {  // Band 3
+        {  57, 186, 211 }, {  30, 140, 196 }, {   4,  93, 146 },
+        {   1,  62, 102 }, {   1,  38,  65 }, {   1,  16,  27 }
+      }, {  // Band 4
+        {  47, 199, 217 }, {  14, 145, 196 }, {   1,  88, 142 },
+        {   1,  57,  98 }, {   1,  36,  62 }, {   1,  15,  26 }
+      }, {  // Band 5
+        {  26, 219, 229 }, {   5, 155, 207 }, {   1,  94, 151 },
+        {   1,  60, 104 }, {   1,  36,  62 }, {   1,  16,  28 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        { 233,  29, 248 }, { 146,  47, 220 }, {  43,  52, 140 }
+      }, {  // Band 1
+        { 100, 163, 232 }, { 179, 161, 222 }, {  63, 142, 204 },
+        {  37, 113, 174 }, {  26,  89, 137 }, {  18,  68,  97 }
+      }, {  // Band 2
+        {  85, 181, 230 }, {  32, 146, 209 }, {   7, 100, 164 },
+        {   3,  71, 121 }, {   1,  45,  77 }, {   1,  18,  30 }
+      }, {  // Band 3
+        {  65, 187, 230 }, {  20, 148, 207 }, {   2,  97, 159 },
+        {   1,  68, 116 }, {   1,  40,  70 }, {   1,  14,  29 }
+      }, {  // Band 4
+        {  40, 194, 227 }, {   8, 147, 204 }, {   1,  94, 155 },
+        {   1,  65, 112 }, {   1,  39,  66 }, {   1,  14,  26 }
+      }, {  // Band 5
+        {  16, 208, 228 }, {   3, 151, 207 }, {   1,  98, 160 },
+        {   1,  67, 117 }, {   1,  41,  74 }, {   1,  17,  31 }
+      }
+    }
+  }
+};
+
+static const vp9_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = {
+  {  // Y plane
+    {  // Intra
+      {  // Band 0
+        {  17,  38, 140 }, {   7,  34,  80 }, {   1,  17,  29 }
+      }, {  // Band 1
+        {  37,  75, 128 }, {  41,  76, 128 }, {  26,  66, 116 },
+        {  12,  52,  94 }, {   2,  32,  55 }, {   1,  10,  16 }
+      }, {  // Band 2
+        {  50, 127, 154 }, {  37, 109, 152 }, {  16,  82, 121 },
+        {   5,  59,  85 }, {   1,  35,  54 }, {   1,  13,  20 }
+      }, {  // Band 3
+        {  40, 142, 167 }, {  17, 110, 157 }, {   2,  71, 112 },
+        {   1,  44,  72 }, {   1,  27,  45 }, {   1,  11,  17 }
+      }, {  // Band 4
+        {  30, 175, 188 }, {   9, 124, 169 }, {   1,  74, 116 },
+        {   1,  48,  78 }, {   1,  30,  49 }, {   1,  11,  18 }
+      }, {  // Band 5
+        {  10, 222, 223 }, {   2, 150, 194 }, {   1,  83, 128 },
+        {   1,  48,  79 }, {   1,  27,  45 }, {   1,  11,  17 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        {  36,  41, 235 }, {  29,  36, 193 }, {  10,  27, 111 }
+      }, {  // Band 1
+        {  85, 165, 222 }, { 177, 162, 215 }, { 110, 135, 195 },
+        {  57, 113, 168 }, {  23,  83, 120 }, {  10,  49,  61 }
+      }, {  // Band 2
+        {  85, 190, 223 }, {  36, 139, 200 }, {   5,  90, 146 },
+        {   1,  60, 103 }, {   1,  38,  65 }, {   1,  18,  30 }
+      }, {  // Band 3
+        {  72, 202, 223 }, {  23, 141, 199 }, {   2,  86, 140 },
+        {   1,  56,  97 }, {   1,  36,  61 }, {   1,  16,  27 }
+      }, {  // Band 4
+        {  55, 218, 225 }, {  13, 145, 200 }, {   1,  86, 141 },
+        {   1,  57,  99 }, {   1,  35,  61 }, {   1,  13,  22 }
+      }, {  // Band 5
+        {  15, 235, 212 }, {   1, 132, 184 }, {   1,  84, 139 },
+        {   1,  57,  97 }, {   1,  34,  56 }, {   1,  14,  23 }
+      }
+    }
+  }, {  // UV plane
+    {  // Intra
+      {  // Band 0
+        { 181,  21, 201 }, {  61,  37, 123 }, {  10,  38,  71 }
+      }, {  // Band 1
+        {  47, 106, 172 }, {  95, 104, 173 }, {  42,  93, 159 },
+        {  18,  77, 131 }, {   4,  50,  81 }, {   1,  17,  23 }
+      }, {  // Band 2
+        {  62, 147, 199 }, {  44, 130, 189 }, {  28, 102, 154 },
+        {  18,  75, 115 }, {   2,  44,  65 }, {   1,  12,  19 }
+      }, {  // Band 3
+        {  55, 153, 210 }, {  24, 130, 194 }, {   3,  93, 146 },
+        {   1,  61,  97 }, {   1,  31,  50 }, {   1,  10,  16 }
+      }, {  // Band 4
+        {  49, 186, 223 }, {  17, 148, 204 }, {   1,  96, 142 },
+        {   1,  53,  83 }, {   1,  26,  44 }, {   1,  11,  17 }
+      }, {  // Band 5
+        {  13, 217, 212 }, {   2, 136, 180 }, {   1,  78, 124 },
+        {   1,  50,  83 }, {   1,  29,  49 }, {   1,  14,  23 }
+      }
+    }, {  // Inter
+      {  // Band 0
+        { 197,  13, 247 }, {  82,  17, 222 }, {  25,  17, 162 }
+      }, {  // Band 1
+        { 126, 186, 247 }, { 234, 191, 243 }, { 176, 177, 234 },
+        { 104, 158, 220 }, {  66, 128, 186 }, {  55,  90, 137 }
+      }, {  // Band 2
+        { 111, 197, 242 }, {  46, 158, 219 }, {   9, 104, 171 },
+        {   2,  65, 125 }, {   1,  44,  80 }, {   1,  17,  91 }
+      }, {  // Band 3
+        { 104, 208, 245 }, {  39, 168, 224 }, {   3, 109, 162 },
+        {   1,  79, 124 }, {   1,  50, 102 }, {   1,  43, 102 }
+      }, {  // Band 4
+        {  84, 220, 246 }, {  31, 177, 231 }, {   2, 115, 180 },
+        {   1,  79, 134 }, {   1,  55,  77 }, {   1,  60,  79 }
+      }, {  // Band 5
+        {  43, 243, 240 }, {   8, 180, 217 }, {   1, 115, 166 },
+        {   1,  84, 121 }, {   1,  51,  67 }, {   1,  16,   6 }
+      }
+    }
+  }
+};
+
+static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) {
+  // TODO(aconverse): model[PIVOT_NODE] should never be zero; see
+  // https://code.google.com/p/webm/issues/detail?id=1089
+  const int row = (p == 0) ? 254 : p - 1;  // zero pivot falls back to row 254
+  memcpy(probs, vp9_pareto8_full[row], MODEL_NODES * sizeof(vpx_prob));
+}
+
+void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full) {
+  // Seed the unconstrained head from the model unless converting in place.
+  if (model != full) memcpy(full, model, UNCONSTRAINED_NODES * sizeof(*full));
+  extend_to_full_distribution(full + UNCONSTRAINED_NODES, model[PIVOT_NODE]);
+}
+
+void vp9_default_coef_probs(VP9_COMMON *cm) {  // Load defaults into cm->fc.
+  vp9_copy(cm->fc->coef_probs[TX_4X4], default_coef_probs_4x4);
+  vp9_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8);
+  vp9_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16);
+  vp9_copy(cm->fc->coef_probs[TX_32X32], default_coef_probs_32x32);
+}
+
+#define COEF_COUNT_SAT 24
+#define COEF_MAX_UPDATE_FACTOR 112
+#define COEF_COUNT_SAT_KEY 24
+#define COEF_MAX_UPDATE_FACTOR_KEY 112
+#define COEF_COUNT_SAT_AFTER_KEY 24
+#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
+
+// Recomputes cm->fc->coef_probs[tx_size]: each node is merge_probs() of the
+// saved frame context's probability and branch counts built from cm->counts.
+static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
+    unsigned int count_sat, unsigned int update_factor) {
+  const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+  const vp9_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size];
+  vp9_coeff_probs_model *const probs = cm->fc->coef_probs[tx_size];
+  vp9_coeff_count_model *counts = cm->counts.coef[tx_size];
+  unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
+      cm->counts.eob_branch[tx_size];
+  int plane, ref, band, ctx, node;
+
+  for (plane = 0; plane < PLANE_TYPES; ++plane)
+    for (ref = 0; ref < REF_TYPES; ++ref)
+      for (band = 0; band < COEF_BANDS; ++band)
+        for (ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx) {
+          const int neob = counts[plane][ref][band][ctx][EOB_MODEL_TOKEN];
+          const int n0 = counts[plane][ref][band][ctx][ZERO_TOKEN];
+          const int n1 = counts[plane][ref][band][ctx][ONE_TOKEN];
+          const int n2 = counts[plane][ref][band][ctx][TWO_TOKEN];
+          const unsigned int branch_ct[UNCONSTRAINED_NODES][2] = {
+            { neob, eob_counts[plane][ref][band][ctx] - neob },
+            { n0, n1 + n2 }, { n1, n2 }
+          };
+          for (node = 0; node < UNCONSTRAINED_NODES; ++node)
+            probs[plane][ref][band][ctx][node] =
+                merge_probs(pre_probs[plane][ref][band][ctx][node],
+                            branch_ct[node], count_sat, update_factor);
+        }
+}
+
+// Adapts the coefficient probabilities of every transform size.
+void vp9_adapt_coef_probs(VP9_COMMON *cm) {
+  // Defaults, overridden below for intra-only and post-key-frame cases.
+  unsigned int update_factor = COEF_MAX_UPDATE_FACTOR;
+  unsigned int count_sat = COEF_COUNT_SAT;
+  TX_SIZE tx_size;
+
+  if (frame_is_intra_only(cm)) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
+    count_sat = COEF_COUNT_SAT_KEY;
+  } else if (cm->last_frame_type == KEY_FRAME) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY;  /* adapt quickly */
+    count_sat = COEF_COUNT_SAT_AFTER_KEY;
+  }
+  for (tx_size = TX_4X4; tx_size <= TX_32X32; tx_size++)
+    adapt_coef_probs(cm, tx_size, count_sat, update_factor);
+}
diff --git a/libs/libvpx/vp9/common/vp9_entropy.h b/libs/libvpx/vp9/common/vp9_entropy.h
new file mode 100644
index 0000000000..63b3bff5d9
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_entropy.h
@@ -0,0 +1,200 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_ENTROPY_H_
+#define VP9_COMMON_VP9_ENTROPY_H_
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/prob.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DIFF_UPDATE_PROB 252
+
+// Coefficient token alphabet
+#define ZERO_TOKEN      0   // 0     Extra Bits 0+0
+#define ONE_TOKEN       1   // 1     Extra Bits 0+1
+#define TWO_TOKEN       2   // 2     Extra Bits 0+1
+#define THREE_TOKEN     3   // 3     Extra Bits 0+1
+#define FOUR_TOKEN      4   // 4     Extra Bits 0+1
+#define CATEGORY1_TOKEN 5   // 5-6   Extra Bits 1+1
+#define CATEGORY2_TOKEN 6   // 7-10  Extra Bits 2+1
+#define CATEGORY3_TOKEN 7   // 11-18 Extra Bits 3+1
+#define CATEGORY4_TOKEN 8   // 19-34 Extra Bits 4+1
+#define CATEGORY5_TOKEN 9   // 35-66 Extra Bits 5+1
+#define CATEGORY6_TOKEN 10  // 67+   Extra Bits 14+1
+#define EOB_TOKEN       11  // EOB   Extra Bits 0+0
+
+#define ENTROPY_TOKENS 12
+
+#define ENTROPY_NODES 11
+
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]);
+
+#define CAT1_MIN_VAL    5
+#define CAT2_MIN_VAL    7
+#define CAT3_MIN_VAL   11
+#define CAT4_MIN_VAL   19
+#define CAT5_MIN_VAL   35
+#define CAT6_MIN_VAL   67
+
+// Extra bit probabilities.
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob[1]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob[2]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob[3]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob[4]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob[5]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob[14]);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob_high10[1]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob_high10[2]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob_high10[3]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob_high10[4]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob_high10[5]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob_high10[16]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob_high12[1]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob_high12[2]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob_high12[3]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob_high12[4]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob_high12[5]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob_high12[18]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#define EOB_MODEL_TOKEN 3
+
+#define DCT_MAX_VALUE           16384
+#if CONFIG_VP9_HIGHBITDEPTH
+#define DCT_MAX_VALUE_HIGH10    65536
+#define DCT_MAX_VALUE_HIGH12   262144
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+/* Coefficients are predicted via a 3-dimensional probability table. */
+
+#define REF_TYPES 2  // intra=0, inter=1
+
+/* Middle dimension reflects the coefficient position within the transform. */
+#define COEF_BANDS 6
+
+/* Inside dimension is a measure of nearby complexity that reflects the energy
+   of nearby nonzero coefficients.  For the first coefficient (DC, unless
+   block type is 0), we look at the (already encoded) blocks above and to the
+   left of the current block.  The context index is then the number (0,1,or 2)
+   of these blocks having nonzero coefficients.
+   After decoding a coefficient, the measure is determined by the size of the
+   most recently decoded coefficient.
+   Note that the intuitive meaning of this measure changes as coefficients
+   are decoded, e.g., prior to the first token, a zero means that my neighbors
+   are empty while, after the first token, because of the use of end-of-block,
+   a zero means we just decoded a zero and hence guarantees that a non-zero
+   coefficient will appear later in this block.  However, this shift
+   in meaning is perfectly OK because our context depends also on the
+   coefficient band (and since zigzag positions 0, 1, and 2 are in
+   distinct bands). */
+
+#define COEFF_CONTEXTS 6
+#define BAND_COEFF_CONTEXTS(band) ((band) == 0 ? 3 : COEFF_CONTEXTS)
+
+// #define ENTROPY_STATS
+
+typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
+                                    [ENTROPY_TOKENS];
+typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
+                                    [ENTROPY_NODES][2];
+
+#define SUBEXP_PARAM                4   /* Subexponential code parameter */
+#define MODULUS_PARAM               13  /* Modulus parameter */
+
+struct VP9Common;
+void vp9_default_coef_probs(struct VP9Common *cm);
+void vp9_adapt_coef_probs(struct VP9Common *cm);
+
+// This is the index in the scan order beyond which all coefficients for
+// 8x8 transform and above are in the top band.
+// This macro is currently unused but may be used by certain implementations
+#define MAXBAND_INDEX 21
+
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_8x8plus[1024]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_4x4[16]);
+
+static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
+  if (tx_size != TX_4X4) return vp9_coefband_trans_8x8plus;  // 8x8 and up
+  return vp9_coefband_trans_4x4;
+}
+
+// 128 lists of probabilities are stored for the following ONE node probs:
+// 1, 3, 5, 7, ..., 253, 255
+// In between probabilities are interpolated linearly
+
+#define COEFF_PROB_MODELS 255
+
+#define UNCONSTRAINED_NODES         3
+
+#define PIVOT_NODE                  2   // which node is pivot
+
+#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
+extern const vpx_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
+extern const vpx_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
+
+typedef vpx_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
+                                      [COEFF_CONTEXTS][UNCONSTRAINED_NODES];
+
+typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
+                                          [COEFF_CONTEXTS]
+                                          [UNCONSTRAINED_NODES + 1];
+
+void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full);
+
+typedef char ENTROPY_CONTEXT;
+
+static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
+                                           ENTROPY_CONTEXT b) {
+  return (a ? 1 : 0) + (b ? 1 : 0);  // how many of a, b are nonzero: 0, 1 or 2
+}
+
+static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
+                                      const ENTROPY_CONTEXT *l) {
+  // Each flag becomes 1 when any context byte spanned by the transform is set.
+  ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
+  switch (tx_size) {
+    case TX_4X4:  // one context byte per side
+      above_ec = a[0] != 0;
+      left_ec = l[0] != 0;
+      break;
+    case TX_8X8:  // two context bytes, tested with one 16-bit load
+      above_ec = *(const uint16_t *)a != 0;
+      left_ec = *(const uint16_t *)l != 0;
+      break;
+    case TX_16X16:  // four context bytes, one 32-bit load
+      above_ec = *(const uint32_t *)a != 0;
+      left_ec = *(const uint32_t *)l != 0;
+      break;
+    case TX_32X32:  // eight context bytes, one 64-bit load
+      above_ec = *(const uint64_t *)a != 0;
+      left_ec = *(const uint64_t *)l != 0;
+      break;
+    default:
+      assert(0 && "Invalid transform size.");
+      break;
+  }
+  // NOTE(review): the wide loads assume a/l are aligned for them - confirm.
+  return combine_entropy_contexts(above_ec, left_ec);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_ENTROPY_H_
diff --git a/libs/libvpx/vp9/common/vp9_entropymode.c b/libs/libvpx/vp9/common/vp9_entropymode.c
new file mode 100644
index 0000000000..670348bafd
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_entropymode.c
@@ -0,0 +1,469 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_seg_common.h"
+
+const vpx_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
+  {  // above = dc
+    { 137,  30,  42, 148, 151, 207,  70,  52,  91 },  // left = dc
+    {  92,  45, 102, 136, 116, 180,  74,  90, 100 },  // left = v
+    {  73,  32,  19, 187, 222, 215,  46,  34, 100 },  // left = h
+    {  91,  30,  32, 116, 121, 186,  93,  86,  94 },  // left = d45
+    {  72,  35,  36, 149,  68, 206,  68,  63, 105 },  // left = d135
+    {  73,  31,  28, 138,  57, 124,  55, 122, 151 },  // left = d117
+    {  67,  23,  21, 140, 126, 197,  40,  37, 171 },  // left = d153
+    {  86,  27,  28, 128, 154, 212,  45,  43,  53 },  // left = d207
+    {  74,  32,  27, 107,  86, 160,  63, 134, 102 },  // left = d63
+    {  59,  67,  44, 140, 161, 202,  78,  67, 119 }   // left = tm
+  }, {  // above = v
+    {  63,  36, 126, 146, 123, 158,  60,  90,  96 },  // left = dc
+    {  43,  46, 168, 134, 107, 128,  69, 142,  92 },  // left = v
+    {  44,  29,  68, 159, 201, 177,  50,  57,  77 },  // left = h
+    {  58,  38,  76, 114,  97, 172,  78, 133,  92 },  // left = d45
+    {  46,  41,  76, 140,  63, 184,  69, 112,  57 },  // left = d135
+    {  38,  32,  85, 140,  46, 112,  54, 151, 133 },  // left = d117
+    {  39,  27,  61, 131, 110, 175,  44,  75, 136 },  // left = d153
+    {  52,  30,  74, 113, 130, 175,  51,  64,  58 },  // left = d207
+    {  47,  35,  80, 100,  74, 143,  64, 163,  74 },  // left = d63
+    {  36,  61, 116, 114, 128, 162,  80, 125,  82 }   // left = tm
+  }, {  // above = h
+    {  82,  26,  26, 171, 208, 204,  44,  32, 105 },  // left = dc
+    {  55,  44,  68, 166, 179, 192,  57,  57, 108 },  // left = v
+    {  42,  26,  11, 199, 241, 228,  23,  15,  85 },  // left = h
+    {  68,  42,  19, 131, 160, 199,  55,  52,  83 },  // left = d45
+    {  58,  50,  25, 139, 115, 232,  39,  52, 118 },  // left = d135
+    {  50,  35,  33, 153, 104, 162,  64,  59, 131 },  // left = d117
+    {  44,  24,  16, 150, 177, 202,  33,  19, 156 },  // left = d153
+    {  55,  27,  12, 153, 203, 218,  26,  27,  49 },  // left = d207
+    {  53,  49,  21, 110, 116, 168,  59,  80,  76 },  // left = d63
+    {  38,  72,  19, 168, 203, 212,  50,  50, 107 }   // left = tm
+  }, {  // above = d45
+    { 103,  26,  36, 129, 132, 201,  83,  80,  93 },  // left = dc
+    {  59,  38,  83, 112, 103, 162,  98, 136,  90 },  // left = v
+    {  62,  30,  23, 158, 200, 207,  59,  57,  50 },  // left = h
+    {  67,  30,  29,  84,  86, 191, 102,  91,  59 },  // left = d45
+    {  60,  32,  33, 112,  71, 220,  64,  89, 104 },  // left = d135
+    {  53,  26,  34, 130,  56, 149,  84, 120, 103 },  // left = d117
+    {  53,  21,  23, 133, 109, 210,  56,  77, 172 },  // left = d153
+    {  77,  19,  29, 112, 142, 228,  55,  66,  36 },  // left = d207
+    {  61,  29,  29,  93,  97, 165,  83, 175, 162 },  // left = d63
+    {  47,  47,  43, 114, 137, 181, 100,  99,  95 }   // left = tm
+  }, {  // above = d135
+    {  69,  23,  29, 128,  83, 199,  46,  44, 101 },  // left = dc
+    {  53,  40,  55, 139,  69, 183,  61,  80, 110 },  // left = v
+    {  40,  29,  19, 161, 180, 207,  43,  24,  91 },  // left = h
+    {  60,  34,  19, 105,  61, 198,  53,  64,  89 },  // left = d45
+    {  52,  31,  22, 158,  40, 209,  58,  62,  89 },  // left = d135
+    {  44,  31,  29, 147,  46, 158,  56, 102, 198 },  // left = d117
+    {  35,  19,  12, 135,  87, 209,  41,  45, 167 },  // left = d153
+    {  55,  25,  21, 118,  95, 215,  38,  39,  66 },  // left = d207
+    {  51,  38,  25, 113,  58, 164,  70,  93,  97 },  // left = d63
+    {  47,  54,  34, 146, 108, 203,  72, 103, 151 }   // left = tm
+  }, {  // above = d117
+    {  64,  19,  37, 156,  66, 138,  49,  95, 133 },  // left = dc
+    {  46,  27,  80, 150,  55, 124,  55, 121, 135 },  // left = v
+    {  36,  23,  27, 165, 149, 166,  54,  64, 118 },  // left = h
+    {  53,  21,  36, 131,  63, 163,  60, 109,  81 },  // left = d45
+    {  40,  26,  35, 154,  40, 185,  51,  97, 123 },  // left = d135
+    {  35,  19,  34, 179,  19,  97,  48, 129, 124 },  // left = d117
+    {  36,  20,  26, 136,  62, 164,  33,  77, 154 },  // left = d153
+    {  45,  18,  32, 130,  90, 157,  40,  79,  91 },  // left = d207
+    {  45,  26,  28, 129,  45, 129,  49, 147, 123 },  // left = d63
+    {  38,  44,  51, 136,  74, 162,  57,  97, 121 }   // left = tm
+  }, {  // above = d153
+    {  75,  17,  22, 136, 138, 185,  32,  34, 166 },  // left = dc
+    {  56,  39,  58, 133, 117, 173,  48,  53, 187 },  // left = v
+    {  35,  21,  12, 161, 212, 207,  20,  23, 145 },  // left = h
+    {  56,  29,  19, 117, 109, 181,  55,  68, 112 },  // left = d45
+    {  47,  29,  17, 153,  64, 220,  59,  51, 114 },  // left = d135
+    {  46,  16,  24, 136,  76, 147,  41,  64, 172 },  // left = d117
+    {  34,  17,  11, 108, 152, 187,  13,  15, 209 },  // left = d153
+    {  51,  24,  14, 115, 133, 209,  32,  26, 104 },  // left = d207
+    {  55,  30,  18, 122,  79, 179,  44,  88, 116 },  // left = d63
+    {  37,  49,  25, 129, 168, 164,  41,  54, 148 }   // left = tm
+  }, {  // above = d207
+    {  82,  22,  32, 127, 143, 213,  39,  41,  70 },  // left = dc
+    {  62,  44,  61, 123, 105, 189,  48,  57,  64 },  // left = v
+    {  47,  25,  17, 175, 222, 220,  24,  30,  86 },  // left = h
+    {  68,  36,  17, 106, 102, 206,  59,  74,  74 },  // left = d45
+    {  57,  39,  23, 151,  68, 216,  55,  63,  58 },  // left = d135
+    {  49,  30,  35, 141,  70, 168,  82,  40, 115 },  // left = d117
+    {  51,  25,  15, 136, 129, 202,  38,  35, 139 },  // left = d153
+    {  68,  26,  16, 111, 141, 215,  29,  28,  28 },  // left = d207
+    {  59,  39,  19, 114,  75, 180,  77, 104,  42 },  // left = d63
+    {  40,  61,  26, 126, 152, 206,  61,  59,  93 }   // left = tm
+  }, {  // above = d63
+    {  78,  23,  39, 111, 117, 170,  74, 124,  94 },  // left = dc
+    {  48,  34,  86, 101,  92, 146,  78, 179, 134 },  // left = v
+    {  47,  22,  24, 138, 187, 178,  68,  69,  59 },  // left = h
+    {  56,  25,  33, 105, 112, 187,  95, 177, 129 },  // left = d45
+    {  48,  31,  27, 114,  63, 183,  82, 116,  56 },  // left = d135
+    {  43,  28,  37, 121,  63, 123,  61, 192, 169 },  // left = d117
+    {  42,  17,  24, 109,  97, 177,  56,  76, 122 },  // left = d153
+    {  58,  18,  28, 105, 139, 182,  70,  92,  63 },  // left = d207
+    {  46,  23,  32,  74,  86, 150,  67, 183,  88 },  // left = d63
+    {  36,  38,  48,  92, 122, 165,  88, 137,  91 }   // left = tm
+  }, {  // above = tm
+    {  65,  70,  60, 155, 159, 199,  61,  60,  81 },  // left = dc
+    {  44,  78, 115, 132, 119, 173,  71, 112,  93 },  // left = v
+    {  39,  38,  21, 184, 227, 206,  42,  32,  64 },  // left = h
+    {  58,  47,  36, 124, 137, 193,  80,  82,  78 },  // left = d45
+    {  49,  50,  35, 144,  95, 205,  63,  78,  59 },  // left = d135
+    {  41,  53,  52, 148,  71, 142,  65, 128,  51 },  // left = d117
+    {  40,  36,  28, 143, 143, 202,  40,  55, 137 },  // left = d153
+    {  52,  34,  29, 129, 183, 227,  42,  35,  43 },  // left = d207
+    {  42,  44,  44, 104, 105, 164,  64, 130,  80 },  // left = d63
+    {  43,  81,  53, 140, 169, 204,  68,  84,  72 }   // left = tm
+  }
+};  // [above mode][left mode][node]; node axis presumably vp9_intra_mode_tree
+
+const vpx_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
+  { 144,  11,  54, 157, 195, 130,  46,  58, 108 },  // y = dc
+  { 118,  15, 123, 148, 131, 101,  44,  93, 131 },  // y = v
+  { 113,  12,  23, 188, 226, 142,  26,  32, 125 },  // y = h
+  { 120,  11,  50, 123, 163, 135,  64,  77, 103 },  // y = d45
+  { 113,   9,  36, 155, 111, 157,  32,  44, 161 },  // y = d135
+  { 116,   9,  55, 176,  76,  96,  37,  61, 149 },  // y = d117
+  { 115,   9,  28, 141, 161, 167,  21,  25, 193 },  // y = d153
+  { 120,  12,  32, 145, 195, 142,  32,  38,  86 },  // y = d207
+  { 116,  12,  64, 120, 140, 125,  49, 115, 121 },  // y = d63
+  { 102,  19,  66, 162, 182, 122,  35,  59, 128 }   // y = tm
+};  // [y mode][node]; node axis presumably follows vp9_intra_mode_tree
+
+static const vpx_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
+  {  65,  32,  18, 144, 162, 194,  41,  51,  98 },  // block_size < 8x8
+  { 132,  68,  18, 165, 217, 196,  45,  40,  78 },  // block_size < 16x16
+  { 173,  80,  19, 176, 240, 193,  64,  35,  46 },  // block_size < 32x32
+  { 221, 135,  38, 194, 248, 121,  96,  85,  29 }   // block_size >= 32x32
+};  // copied into fc->y_mode_prob by init_mode_probs()
+
+static const vpx_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
+  { 120,   7,  76, 176, 208, 126,  28,  54, 103 },  // y = dc
+  {  48,  12, 154, 155, 139,  90,  34, 117, 119 },  // y = v
+  {  67,   6,  25, 204, 243, 158,  13,  21,  96 },  // y = h
+  {  97,   5,  44, 131, 176, 139,  48,  68,  97 },  // y = d45
+  {  83,   5,  42, 156, 111, 152,  26,  49, 152 },  // y = d135
+  {  80,   5,  58, 178,  74,  83,  33,  62, 145 },  // y = d117
+  {  86,   5,  32, 154, 192, 168,  14,  22, 163 },  // y = d153
+  {  85,   5,  32, 156, 216, 148,  19,  29,  73 },  // y = d207
+  {  77,   7,  64, 116, 132, 122,  37, 126, 120 },  // y = d63
+  { 101,  21, 107, 181, 192, 103,  19,  67, 125 }   // y = tm
+};  // copied into fc->uv_mode_prob by init_mode_probs()
+
+const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+                                     [PARTITION_TYPES - 1] = {
+  // 8x8 -> 4x4
+  { 158,  97,  94 },  // a/l both not split
+  {  93,  24,  99 },  // a split, l not split
+  {  85, 119,  44 },  // l split, a not split
+  {  62,  59,  67 },  // a/l both split
+  // 16x16 -> 8x8
+  { 149,  53,  53 },  // a/l both not split
+  {  94,  20,  48 },  // a split, l not split
+  {  83,  53,  24 },  // l split, a not split
+  {  52,  18,  18 },  // a/l both split
+  // 32x32 -> 16x16
+  { 150,  40,  39 },  // a/l both not split
+  {  78,  12,  26 },  // a split, l not split
+  {  67,  33,  11 },  // l split, a not split
+  {  24,   7,   5 },  // a/l both split
+  // 64x64 -> 32x32
+  { 174,  35,  49 },  // a/l both not split
+  {  68,  11,  27 },  // a split, l not split
+  {  57,  15,   9 },  // l split, a not split
+  {  12,   3,   3 },  // a/l both split
+};  // [partition context][node]; node axis presumably vp9_partition_tree
+
+static const vpx_prob default_partition_probs[PARTITION_CONTEXTS]
+                                             [PARTITION_TYPES - 1] = {
+  // 8x8 -> 4x4
+  { 199, 122, 141 },  // a/l both not split
+  { 147,  63, 159 },  // a split, l not split
+  { 148, 133, 118 },  // l split, a not split
+  { 121, 104, 114 },  // a/l both split
+  // 16x16 -> 8x8
+  { 174,  73,  87 },  // a/l both not split
+  {  92,  41,  83 },  // a split, l not split
+  {  82,  99,  50 },  // l split, a not split
+  {  53,  39,  39 },  // a/l both split
+  // 32x32 -> 16x16
+  { 177,  58,  59 },  // a/l both not split
+  {  68,  26,  63 },  // a split, l not split
+  {  52,  79,  25 },  // l split, a not split
+  {  17,  14,  12 },  // a/l both split
+  // 64x64 -> 32x32
+  { 222,  34,  30 },  // a/l both not split
+  {  72,  16,  44 },  // a split, l not split
+  {  58,  32,  12 },  // l split, a not split
+  {  10,   7,   6 },  // a/l both split
+};  // copied into fc->partition_prob by init_mode_probs()
+
+static const vpx_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
+                                              [INTER_MODES - 1] = {
+  {2,       173,   34},  // 0 = both zero mv
+  {7,       145,   85},  // 1 = one zero mv + one predicted mv
+  {7,       166,   63},  // 2 = two predicted mvs
+  {7,       94,    66},  // 3 = one predicted/zero and one new mv
+  {8,       64,    46},  // 4 = two new mvs
+  {17,      81,    31},  // 5 = one intra neighbour + x
+  {25,      29,    30},  // 6 = two intra neighbours
+};  // copied into fc->inter_mode_probs by init_mode_probs()
+
+/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+const vpx_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
+  -DC_PRED, 2,                      /* 0 = DC_NODE */
+  -TM_PRED, 4,                      /* 1 = TM_NODE */
+  -V_PRED, 6,                       /* 2 = V_NODE */
+  8, 12,                            /* 3 = COM_NODE */
+  -H_PRED, 10,                      /* 4 = H_NODE */
+  -D135_PRED, -D117_PRED,           /* 5 = D135_NODE */
+  -D45_PRED, 14,                    /* 6 = D45_NODE */
+  -D63_PRED, 16,                    /* 7 = D63_NODE */
+  -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
+};  // presumably: negative = leaf mode, positive = next pair index (confirm)
+
+const vpx_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
+  -INTER_OFFSET(ZEROMV), 2,
+  -INTER_OFFSET(NEARESTMV), 4,
+  -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
+};  // leaves are mode offsets from NEARESTMV (see INTER_OFFSET)
+
+const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
+  -PARTITION_NONE, 2,
+  -PARTITION_HORZ, 4,
+  -PARTITION_VERT, -PARTITION_SPLIT
+};  // merged against counts->partition[] in vp9_adapt_mode_probs()
+
+static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
+  9, 102, 187, 225
+};  // copied into fc->intra_inter_prob by init_mode_probs()
+
+static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
+  239, 183, 119,  96,  41
+};  // copied into fc->comp_inter_prob by init_mode_probs()
+
+static const vpx_prob default_comp_ref_p[REF_CONTEXTS] = {
+  50, 126, 123, 221, 226
+};  // copied into fc->comp_ref_prob by init_mode_probs()
+
+static const vpx_prob default_single_ref_p[REF_CONTEXTS][2] = {
+  {  33,  16 },
+  {  77,  74 },
+  { 142, 142 },
+  { 172, 170 },
+  { 238, 247 }
+};  // copied into fc->single_ref_prob by init_mode_probs()
+
+static const struct tx_probs default_tx_probs = {
+  { { 3, 136, 37 },  // p32x32, one row per TX_SIZE_CONTEXTS entry
+    { 5, 52,  13 } },
+
+  { { 20, 152 },  // p16x16
+    { 15, 101 } },
+
+  { { 100 },  // p8x8
+    { 66  } }
+};  // assigned to fc->tx_probs by init_mode_probs()
+
+void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
+                                      unsigned int (*ct_32x32p)[2]) {
+  // Row k of the output is one binary decision: column 0 is the count for
+  // transform size k, column 1 is the summed count of every larger size.
+  // Rows are filled last-to-first so each sum can reuse the row below it.
+  ct_32x32p[2][0] = tx_count_32x32p[TX_16X16];
+  ct_32x32p[2][1] = tx_count_32x32p[TX_32X32];
+  ct_32x32p[1][0] = tx_count_32x32p[TX_8X8];
+  ct_32x32p[1][1] = ct_32x32p[2][0] + ct_32x32p[2][1];
+  ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
+  ct_32x32p[0][1] = ct_32x32p[1][0] + ct_32x32p[1][1];
+}
+
+void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
+                                      unsigned int (*ct_16x16p)[2]) {
+  ct_16x16p[1][1] = tx_count_16x16p[TX_16X16];  // last row first, then reuse
+  ct_16x16p[1][0] = tx_count_16x16p[TX_8X8];
+  ct_16x16p[0][1] = ct_16x16p[1][0] + ct_16x16p[1][1];  // 8x8 + 16x16
+  ct_16x16p[0][0] = tx_count_16x16p[TX_4X4];
+}
+
+void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
+                                    unsigned int (*ct_8x8p)[2]) {
+  ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];  // branch 1: the TX_8X8 count
+  ct_8x8p[0][0] = tx_count_8x8p[TX_4X4];  // branch 0: the TX_4X4 count
+}
+
+static const vpx_prob default_skip_probs[SKIP_CONTEXTS] = {
+  192, 128, 64
+};  // copied into fc->skip_probs by init_mode_probs()
+
+static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+                                                    [SWITCHABLE_FILTERS - 1] = {
+  { 235, 162, },
+  { 36, 255, },
+  { 34, 3, },
+  { 149, 144, },
+};  // copied into fc->switchable_interp_prob by init_mode_probs()
+
+static void init_mode_probs(FRAME_CONTEXT *fc) {  // install default mode probs
+  vp9_copy(fc->y_mode_prob, default_if_y_probs);
+  vp9_copy(fc->uv_mode_prob, default_if_uv_probs);
+  vp9_copy(fc->partition_prob, default_partition_probs);
+  vp9_copy(fc->switchable_interp_prob, default_switchable_interp_prob);
+  vp9_copy(fc->inter_mode_probs, default_inter_mode_probs);
+  vp9_copy(fc->intra_inter_prob, default_intra_inter_p);
+  vp9_copy(fc->comp_inter_prob, default_comp_inter_p);
+  vp9_copy(fc->single_ref_prob, default_single_ref_p);
+  vp9_copy(fc->comp_ref_prob, default_comp_ref_p);
+  vp9_copy(fc->skip_probs, default_skip_probs);
+  fc->tx_probs = default_tx_probs;  // a struct, so plain assignment is used
+}
+
+const vpx_tree_index vp9_switchable_interp_tree
+                         [TREE_SIZE(SWITCHABLE_FILTERS)] = {
+  -EIGHTTAP, 2,
+  -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
+};  // merged against counts->switchable_interp[] in vp9_adapt_mode_probs()
+
+void vp9_adapt_mode_probs(VP9_COMMON *cm) {
+  int i, j;
+  FRAME_CONTEXT *fc = cm->fc;
+  const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+  const FRAME_COUNTS *counts = &cm->counts;
+  // Every fc entry below = the matching pre_fc entry merged with cm->counts.
+  for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+    fc->intra_inter_prob[i] = mode_mv_merge_probs(pre_fc->intra_inter_prob[i],
+                                                  counts->intra_inter[i]);
+  for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+    fc->comp_inter_prob[i] = mode_mv_merge_probs(pre_fc->comp_inter_prob[i],
+                                                 counts->comp_inter[i]);
+  for (i = 0; i < REF_CONTEXTS; i++)
+    fc->comp_ref_prob[i] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i],
+                                               counts->comp_ref[i]);
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      fc->single_ref_prob[i][j] = mode_mv_merge_probs(
+          pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
+
+  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+    vpx_tree_merge_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
+                         counts->inter_mode[i], fc->inter_mode_probs[i]);
+
+  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
+    vpx_tree_merge_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
+                         counts->y_mode[i], fc->y_mode_prob[i]);
+
+  for (i = 0; i < INTRA_MODES; ++i)
+    vpx_tree_merge_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
+                         counts->uv_mode[i], fc->uv_mode_prob[i]);
+
+  for (i = 0; i < PARTITION_CONTEXTS; i++)
+    vpx_tree_merge_probs(vp9_partition_tree, pre_fc->partition_prob[i],
+                         counts->partition[i], fc->partition_prob[i]);
+
+  if (cm->interp_filter == SWITCHABLE) {
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+      vpx_tree_merge_probs(vp9_switchable_interp_tree,
+                           pre_fc->switchable_interp_prob[i],
+                           counts->switchable_interp[i],
+                           fc->switchable_interp_prob[i]);
+  }
+
+  if (cm->tx_mode == TX_MODE_SELECT) {
+    // Per-size counts are first folded into binary branch counts, then merged.
+    unsigned int branch_ct_8x8p[TX_SIZES - 3][2];
+    unsigned int branch_ct_16x16p[TX_SIZES - 2][2];
+    unsigned int branch_ct_32x32p[TX_SIZES - 1][2];
+
+    for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+      tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
+      for (j = 0; j < TX_SIZES - 3; ++j)
+        fc->tx_probs.p8x8[i][j] = mode_mv_merge_probs(
+            pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]);
+
+      tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
+      for (j = 0; j < TX_SIZES - 2; ++j)
+        fc->tx_probs.p16x16[i][j] = mode_mv_merge_probs(
+            pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]);
+
+      tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
+      for (j = 0; j < TX_SIZES - 1; ++j)
+        fc->tx_probs.p32x32[i][j] = mode_mv_merge_probs(
+            pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]);
+    }
+  }
+
+  for (i = 0; i < SKIP_CONTEXTS; ++i)
+    fc->skip_probs[i] = mode_mv_merge_probs(
+        pre_fc->skip_probs[i], counts->skip[i]);
+}
+
+static void set_default_lf_deltas(struct loopfilter *lf) {
+  // Both mode deltas default to zero.
+  lf->mode_deltas[0] = lf->mode_deltas[1] = 0;
+
+  // Reference-frame deltas: +1 intra, 0 last, -1 for golden and altref.
+  lf->ref_deltas[INTRA_FRAME] = 1;
+  lf->ref_deltas[LAST_FRAME] = 0;
+  lf->ref_deltas[GOLDEN_FRAME] = -1;
+  lf->ref_deltas[ALTREF_FRAME] = -1;
+
+  lf->mode_ref_delta_enabled = lf->mode_ref_delta_update = 1;
+}
+
+void vp9_setup_past_independence(VP9_COMMON *cm) {
+  // Reset the segment feature data to the default stats:
+  // Features disabled, 0, with delta coding (Default state).
+  struct loopfilter *const lf = &cm->lf;
+
+  int i;
+  vp9_clearall_segfeatures(&cm->seg);
+  cm->seg.abs_delta = SEGMENT_DELTADATA;
+  // Zero mi_rows * mi_cols bytes of each segmentation map that is present.
+  if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+    memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+  if (cm->current_frame_seg_map)
+    memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+  // Reset the mode ref deltas for loop filter
+  vp9_zero(lf->last_ref_deltas);
+  vp9_zero(lf->last_mode_deltas);
+  set_default_lf_deltas(lf);
+
+  // To force update of the sharpness
+  lf->last_sharpness_level = -1;
+  // Re-seed cm->fc; init_mode_probs() copies the default mode tables into it.
+  vp9_default_coef_probs(cm);
+  init_mode_probs(cm->fc);
+  vp9_init_mv_probs(cm);
+  cm->fc->initialized = 1;
+  // Copy the re-seeded cm->fc over the saved contexts selected below.
+  if (cm->frame_type == KEY_FRAME ||
+      cm->error_resilient_mode || cm->reset_frame_context == 3) {
+    // Reset all frame contexts.
+    for (i = 0; i < FRAME_CONTEXTS; ++i)
+      cm->frame_contexts[i] = *cm->fc;
+  } else if (cm->reset_frame_context == 2) {
+    // Reset only the frame context specified in the frame header.
+    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+  }
+
+  // prev_mip will only be allocated in encoder.
+  if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode)
+    memset(cm->prev_mip, 0,
+           cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip));
+
+  vp9_zero(cm->ref_frame_sign_bias);
+  // Must stay after the reset_frame_context == 2 copy, which reads this index.
+  cm->frame_context_idx = 0;
+}
diff --git a/libs/libvpx/vp9/common/vp9_entropymode.h b/libs/libvpx/vp9/common/vp9_entropymode.h
new file mode 100644
index 0000000000..0285be1557
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_entropymode.h
@@ -0,0 +1,107 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_ENTROPYMODE_H_
+#define VP9_COMMON_VP9_ENTROPYMODE_H_
+
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/vpx_filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLOCK_SIZE_GROUPS 4
+
+#define TX_SIZE_CONTEXTS 2
+
+#define INTER_OFFSET(mode) ((mode) - NEARESTMV)
+
+struct VP9Common;
+
+struct tx_probs {  // one probability per branch node, per tx-size context
+  vpx_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];  // [context][branch node]
+  vpx_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];  // [context][branch node]
+  vpx_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];    // [context][branch node]
+};
+
+struct tx_counts {  // inputs to the tx_counts_to_branch_counts_*() helpers
+  unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES];      // [context][TX_SIZE]
+  unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1];  // [context][TX_SIZE]
+  unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];    // [context][TX_SIZE]
+  unsigned int tx_totals[TX_SIZES];
+};
+
+typedef struct frame_contexts {  // probability tables carried between frames
+  vpx_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
+  vpx_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+  vpx_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+  vp9_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
+  vpx_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+                                 [SWITCHABLE_FILTERS - 1];
+  vpx_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
+  vpx_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
+  vpx_prob comp_inter_prob[COMP_INTER_CONTEXTS];
+  vpx_prob single_ref_prob[REF_CONTEXTS][2];
+  vpx_prob comp_ref_prob[REF_CONTEXTS];
+  struct tx_probs tx_probs;
+  vpx_prob skip_probs[SKIP_CONTEXTS];
+  nmv_context nmvc;
+  int initialized;  // set to 1 by vp9_setup_past_independence()
+} FRAME_CONTEXT;
+
+typedef struct FRAME_COUNTS {  // counters read by the vp9_adapt_*_probs() code
+  unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
+  unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
+  unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
+  vp9_coeff_count_model coef[TX_SIZES][PLANE_TYPES];
+  unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES]
+                         [COEF_BANDS][COEFF_CONTEXTS];
+  unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
+                                [SWITCHABLE_FILTERS];
+  unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
+  unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];  // pair per context
+  unsigned int comp_inter[COMP_INTER_CONTEXTS][2];    // pair per context
+  unsigned int single_ref[REF_CONTEXTS][2][2];
+  unsigned int comp_ref[REF_CONTEXTS][2];             // pair per context
+  struct tx_counts tx;
+  unsigned int skip[SKIP_CONTEXTS][2];                // pair per context
+  nmv_context_counts mv;
+} FRAME_COUNTS;
+
+extern const vpx_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+extern const vpx_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
+                                        [INTRA_MODES - 1];
+extern const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+                                            [PARTITION_TYPES - 1];
+extern const vpx_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
+extern const vpx_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
+extern const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
+extern const vpx_tree_index vp9_switchable_interp_tree
+                                [TREE_SIZE(SWITCHABLE_FILTERS)];
+
+void vp9_setup_past_independence(struct VP9Common *cm);
+
+void vp9_adapt_mode_probs(struct VP9Common *cm);
+
+void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
+                                      unsigned int (*ct_32x32p)[2]);
+void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
+                                      unsigned int (*ct_16x16p)[2]);
+void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
+                                    unsigned int (*ct_8x8p)[2]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_ENTROPYMODE_H_
diff --git a/libs/libvpx/vp9/common/vp9_entropymv.c b/libs/libvpx/vp9/common/vp9_entropymv.c
new file mode 100644
index 0000000000..566ae91cf7
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_entropymv.c
@@ -0,0 +1,210 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_entropymv.h"
+
+// Token tree over the four MV_JOINT_* symbols.
+// NOTE(review): entries appear to follow the vpx_tree_index convention
+// (a negated value is a leaf symbol, a positive value is the index of the
+// next pair of entries) -- see vpx_dsp/prob.h to confirm.
+const vpx_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
+  -MV_JOINT_ZERO, 2,
+  -MV_JOINT_HNZVZ, 4,
+  -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
+};
+
+// Token tree over the MV_CLASSES (11) magnitude classes MV_CLASS_0 ..
+// MV_CLASS_10. The lower classes sit closest to the root of the tree.
+const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
+  -MV_CLASS_0, 2,
+  -MV_CLASS_1, 4,
+  6, 8,
+  -MV_CLASS_2, -MV_CLASS_3,
+  10, 12,
+  -MV_CLASS_4, -MV_CLASS_5,
+  -MV_CLASS_6, 14,
+  16, 18,
+  -MV_CLASS_7, -MV_CLASS_8,
+  -MV_CLASS_9, -MV_CLASS_10,
+};
+
+// Token tree over the CLASS0_SIZE (2) integer offsets available in class 0:
+// a single binary decision between the values 0 and 1.
+const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
+  -0, -1,
+};
+
+// Token tree over the MV_FP_SIZE (4) fractional-pel values 0..3. Used for
+// both the class0_fp and the fp probabilities in vp9_adapt_mv_probs().
+const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = {
+  -0, 2,
+  -1, 4,
+  -2, -3
+};
+
+// Default motion-vector probabilities; vp9_init_mv_probs() copies this into
+// cm->fc->nmvc. The first initializer fills nmv_context.joints, the second
+// fills comps[0] and comps[1], with fields in nmv_component declaration
+// order.
+static const nmv_context default_nmv_context = {
+  {32, 64, 96},
+  {
+    { // Vertical component
+      128,                                                  // sign
+      {224, 144, 192, 168, 192, 176, 192, 198, 198, 245},   // class
+      {216},                                                // class0
+      {136, 140, 148, 160, 176, 192, 224, 234, 234, 240},   // bits
+      {{128, 128, 64}, {96, 112, 64}},                      // class0_fp
+      {64, 96, 64},                                         // fp
+      160,                                                  // class0_hp bit
+      128,                                                  // hp
+    },
+    { // Horizontal component
+      128,                                                  // sign
+      {216, 128, 176, 160, 176, 176, 192, 198, 198, 208},   // class
+      {208},                                                // class0
+      {136, 140, 148, 160, 176, 192, 224, 234, 234, 240},   // bits
+      {{128, 128, 64}, {96, 112, 64}},                      // class0_fp
+      {64, 96, 64},                                         // fp
+      160,                                                  // class0_hp bit
+      128,                                                  // hp
+    }
+  },
+};
+
+// Lookup table of floor(log2(i)) for i = 1..1024; entry 0 is stored as 0.
+// vp9_get_mv_class() indexes it with (z >> 3) after clamping large z to
+// MV_CLASS_10, so only indices below 1024 are reached from that caller.
+static const uint8_t log_in_base_2[] = {
+  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
+};
+
+// Returns the smallest "magnitude - 1" value belonging to class |c|:
+// 0 for MV_CLASS_0, otherwise CLASS0_SIZE << (c + 2).
+static INLINE int mv_class_base(MV_CLASS_TYPE c) {
+  if (c == MV_CLASS_0)
+    return 0;
+  return CLASS0_SIZE << (c + 2);
+}
+
+// Maps |z| (a component magnitude minus one, see inc_mv_component()) to its
+// magnitude class. Values at or above CLASS0_SIZE * 4096 are clamped to
+// MV_CLASS_10; everything else is looked up in log_in_base_2[].
+// If |offset| is non-NULL it receives z minus the base value of the class.
+MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
+  MV_CLASS_TYPE mv_class;
+
+  if (z >= CLASS0_SIZE * 4096) {
+    mv_class = MV_CLASS_10;
+  } else {
+    mv_class = (MV_CLASS_TYPE)log_in_base_2[z >> 3];
+  }
+
+  if (offset != NULL)
+    *offset = z - mv_class_base(mv_class);
+
+  return mv_class;
+}
+
+// Adds |incr| to every counter in |comp_counts| that is touched when coding
+// the non-zero motion-vector component |v|: sign, class, and then either the
+// class-0 counters or the per-bit/fp/hp counters of the higher classes.
+// The high-precision counters are scaled by |usehp|, so passing 0 leaves
+// them unchanged.
+static void inc_mv_component(int v, nmv_component_counts *comp_counts,
+                             int incr, int usehp) {
+  const int negative = v < 0;
+  const int mag_minus1 = (negative ? -v : v) - 1;
+  int mv_class, offset, int_part, frac_part, hp_bit;
+
+  assert(v != 0);  // vp9_inc_mv() only passes non-zero components
+
+  comp_counts->sign[negative] += incr;
+
+  mv_class = vp9_get_mv_class(mag_minus1, &offset);
+  comp_counts->classes[mv_class] += incr;
+
+  // Split the in-class offset into its three fields.
+  int_part = offset >> 3;         // integer mv data
+  frac_part = (offset >> 1) & 3;  // fractional pel mv data
+  hp_bit = offset & 1;            // high precision mv data
+
+  if (mv_class != MV_CLASS_0) {
+    const int num_bits = mv_class + CLASS0_BITS - 1;  // integer bits coded
+    int bit;
+    for (bit = 0; bit < num_bits; ++bit)
+      comp_counts->bits[bit][(int_part >> bit) & 1] += incr;
+    comp_counts->fp[frac_part] += incr;
+    comp_counts->hp[hp_bit] += usehp * incr;
+  } else {
+    comp_counts->class0[int_part] += incr;
+    comp_counts->class0_fp[int_part][frac_part] += incr;
+    comp_counts->class0_hp[hp_bit] += usehp * incr;
+  }
+}
+
+// Records one occurrence of |mv| in |counts|: bumps the joint-type counter
+// and, for each non-zero component, the matching per-component counters
+// (comps[0] for the row, comps[1] for the column). Does nothing when
+// |counts| is NULL.
+void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
+  MV_JOINT_TYPE joint;
+
+  if (counts == NULL)
+    return;
+
+  joint = vp9_get_mv_joint(mv);
+  counts->joints[joint] += 1;
+
+  if (mv_joint_vertical(joint))
+    inc_mv_component(mv->row, &counts->comps[0], 1, 1);
+
+  if (mv_joint_horizontal(joint))
+    inc_mv_component(mv->col, &counts->comps[1], 1, 1);
+}
+
+// Adapts the motion-vector probabilities in cm->fc->nmvc by merging the
+// probabilities of the frame context selected by cm->frame_context_idx with
+// the symbol counts gathered in cm->counts.mv. The two high-precision
+// probabilities (class0_hp and hp) are only updated when |allow_hp| is
+// non-zero.
+void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
+  int axis, k;
+  nmv_context *const cur_ctx = &cm->fc->nmvc;
+  const nmv_context *const prev_ctx =
+      &cm->frame_contexts[cm->frame_context_idx].nmvc;
+  const nmv_context_counts *const mv_counts = &cm->counts.mv;
+
+  vpx_tree_merge_probs(vp9_mv_joint_tree, prev_ctx->joints, mv_counts->joints,
+                       cur_ctx->joints);
+
+  // Both components (comps[0] and comps[1]) are adapted the same way.
+  for (axis = 0; axis < 2; ++axis) {
+    nmv_component *const dst = &cur_ctx->comps[axis];
+    const nmv_component *const src = &prev_ctx->comps[axis];
+    const nmv_component_counts *const cnt = &mv_counts->comps[axis];
+
+    dst->sign = mode_mv_merge_probs(src->sign, cnt->sign);
+    vpx_tree_merge_probs(vp9_mv_class_tree, src->classes, cnt->classes,
+                         dst->classes);
+    vpx_tree_merge_probs(vp9_mv_class0_tree, src->class0, cnt->class0,
+                         dst->class0);
+
+    for (k = 0; k < MV_OFFSET_BITS; ++k) {
+      dst->bits[k] = mode_mv_merge_probs(src->bits[k], cnt->bits[k]);
+    }
+
+    for (k = 0; k < CLASS0_SIZE; ++k) {
+      vpx_tree_merge_probs(vp9_mv_fp_tree, src->class0_fp[k],
+                           cnt->class0_fp[k], dst->class0_fp[k]);
+    }
+
+    vpx_tree_merge_probs(vp9_mv_fp_tree, src->fp, cnt->fp, dst->fp);
+
+    if (allow_hp) {
+      dst->class0_hp = mode_mv_merge_probs(src->class0_hp, cnt->class0_hp);
+      dst->hp = mode_mv_merge_probs(src->hp, cnt->hp);
+    }
+  }
+}
+
+// Resets the motion-vector probabilities of the current frame context
+// (cm->fc->nmvc) to the built-in defaults in default_nmv_context.
+void vp9_init_mv_probs(VP9_COMMON *cm) {
+  cm->fc->nmvc = default_nmv_context;  // whole-struct copy
+}
diff --git a/libs/libvpx/vp9/common/vp9_entropymv.h b/libs/libvpx/vp9/common/vp9_entropymv.h
new file mode 100644
index 0000000000..2f05ad44b6
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_entropymv.h
@@ -0,0 +1,140 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_COMMON_VP9_ENTROPYMV_H_
+#define VP9_COMMON_VP9_ENTROPYMV_H_
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/prob.h"
+
+#include "vp9/common/vp9_mv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward declaration so this header does not need the full VP9Common
+// definition.
+struct VP9Common;
+
+// Resets the motion-vector probabilities of |cm| to their defaults.
+void vp9_init_mv_probs(struct VP9Common *cm);
+
+// Adapts the motion-vector probabilities of |cm| from the collected counts.
+// The high-precision probabilities are only adapted when |usehp| is non-zero
+// (the definition names this parameter allow_hp).
+void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp);
+
+// Integer pel reference mv threshold for use of high-precision 1/8 mv
+#define COMPANDED_MVREF_THRESH 8
+
+// Returns non-zero when both components of |ref|, taken as absolute values
+// and shifted right by 3, are below COMPANDED_MVREF_THRESH.
+static INLINE int use_mv_hp(const MV *ref) {
+  const int row_pel = abs(ref->row) >> 3;
+  const int col_pel = abs(ref->col) >> 3;
+  return row_pel < COMPANDED_MVREF_THRESH && col_pel < COMPANDED_MVREF_THRESH;
+}
+
+// NOTE(review): presumably the probability used when signalling whether a
+// motion-vector probability is updated -- its use is not visible here.
+#define MV_UPDATE_PROB 252
+
+/* Symbols for coding which components are zero jointly */
+#define MV_JOINTS     4  /* number of MV_JOINT_TYPE symbols */
+typedef enum {
+  MV_JOINT_ZERO = 0,             /* Zero vector */
+  MV_JOINT_HNZVZ = 1,            /* Vert zero, hor nonzero */
+  MV_JOINT_HZVNZ = 2,            /* Hor zero, vert nonzero */
+  MV_JOINT_HNZVNZ = 3,           /* Both components nonzero */
+} MV_JOINT_TYPE;
+
+/* Returns 1 when |type| says the vertical (row) component is nonzero,
+ * 0 otherwise. */
+static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) {
+  switch (type) {
+    case MV_JOINT_HZVNZ:
+    case MV_JOINT_HNZVNZ:
+      return 1;
+    default:
+      return 0;
+  }
+}
+
+/* Returns 1 when |type| says the horizontal (col) component is nonzero,
+ * 0 otherwise. */
+static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {
+  switch (type) {
+    case MV_JOINT_HNZVZ:
+    case MV_JOINT_HNZVNZ:
+      return 1;
+    default:
+      return 0;
+  }
+}
+
+/* Symbols for coding magnitude class of nonzero components */
+#define MV_CLASSES     11  /* number of MV_CLASS_TYPE symbols */
+/* Each class covers twice the integer-pel range of the previous one. */
+typedef enum {
+  MV_CLASS_0 = 0,      /* (0, 2]     integer pel */
+  MV_CLASS_1 = 1,      /* (2, 4]     integer pel */
+  MV_CLASS_2 = 2,      /* (4, 8]     integer pel */
+  MV_CLASS_3 = 3,      /* (8, 16]    integer pel */
+  MV_CLASS_4 = 4,      /* (16, 32]   integer pel */
+  MV_CLASS_5 = 5,      /* (32, 64]   integer pel */
+  MV_CLASS_6 = 6,      /* (64, 128]  integer pel */
+  MV_CLASS_7 = 7,      /* (128, 256] integer pel */
+  MV_CLASS_8 = 8,      /* (256, 512] integer pel */
+  MV_CLASS_9 = 9,      /* (512, 1024] integer pel */
+  MV_CLASS_10 = 10,    /* (1024,2048] integer pel */
+} MV_CLASS_TYPE;
+
+#define CLASS0_BITS    1  /* bits at integer precision for class 0 */
+#define CLASS0_SIZE    (1 << CLASS0_BITS)  /* = 2 integer offsets in class 0 */
+/* = 10: length of the per-bit probability/count arrays (bits[]) */
+#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
+#define MV_FP_SIZE 4  /* number of fractional-pel symbols */
+
+#define MV_MAX_BITS    (MV_CLASSES + CLASS0_BITS + 2)  /* = 14 */
+#define MV_MAX         ((1 << MV_MAX_BITS) - 1)        /* = 16383 */
+#define MV_VALS        ((MV_MAX << 1) + 1)             /* = 32767 */
+
+#define MV_IN_USE_BITS 14
+#define MV_UPP   ((1 << MV_IN_USE_BITS) - 1)  /* =  16383 */
+#define MV_LOW   (-(1 << MV_IN_USE_BITS))     /* = -16384 */
+
+/* Token trees defined in vp9_entropymv.c. The array sizes are omitted
+ * here; the definitions use TREE_SIZE() of the matching alphabet size. */
+extern const vpx_tree_index vp9_mv_joint_tree[];
+extern const vpx_tree_index vp9_mv_class_tree[];
+extern const vpx_tree_index vp9_mv_class0_tree[];
+extern const vpx_tree_index vp9_mv_fp_tree[];
+
+/* Probabilities for coding one motion-vector component. Each array holds
+ * one entry fewer than the alphabet it codes, except bits[], which has one
+ * probability per offset bit. */
+typedef struct {
+  vpx_prob sign;
+  vpx_prob classes[MV_CLASSES - 1];
+  vpx_prob class0[CLASS0_SIZE - 1];
+  vpx_prob bits[MV_OFFSET_BITS];
+  vpx_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
+  vpx_prob fp[MV_FP_SIZE - 1];
+  vpx_prob class0_hp;
+  vpx_prob hp;
+} nmv_component;
+
+/* Complete motion-vector probability context: the joint-type probabilities
+ * plus one nmv_component per component (vp9_inc_mv() uses index 0 for the
+ * row and index 1 for the column). */
+typedef struct {
+  vpx_prob joints[MV_JOINTS - 1];
+  nmv_component comps[2];
+} nmv_context;
+
+/* Classifies |mv| by which of its two components are nonzero. */
+static INLINE MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) {
+  const int row_nonzero = mv->row != 0;
+  const int col_nonzero = mv->col != 0;
+
+  if (row_nonzero)
+    return col_nonzero ? MV_JOINT_HNZVNZ : MV_JOINT_HZVNZ;
+  return col_nonzero ? MV_JOINT_HNZVZ : MV_JOINT_ZERO;
+}
+
+/* Returns the magnitude class of |z|. When |offset| is non-NULL it receives
+ * the position of |z| inside that class. */
+MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
+
+/* Symbol counts for one motion-vector component. Mirrors nmv_component, but
+ * holds one counter per symbol rather than one probability per decision. */
+typedef struct {
+  unsigned int sign[2];
+  unsigned int classes[MV_CLASSES];
+  unsigned int class0[CLASS0_SIZE];
+  unsigned int bits[MV_OFFSET_BITS][2];
+  unsigned int class0_fp[CLASS0_SIZE][MV_FP_SIZE];
+  unsigned int fp[MV_FP_SIZE];
+  unsigned int class0_hp[2];
+  unsigned int hp[2];
+} nmv_component_counts;
+
+/* Symbol counts matching nmv_context: joint types plus both components. */
+typedef struct {
+  unsigned int joints[MV_JOINTS];
+  nmv_component_counts comps[2];
+} nmv_context_counts;
+
+/* Records one occurrence of |mv| in |mvctx|; a NULL |mvctx| is ignored. */
+void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_ENTROPYMV_H_
diff --git a/libs/libvpx/vp9/common/vp9_enums.h b/libs/libvpx/vp9/common/vp9_enums.h
new file mode 100644
index 0000000000..d089f23f97
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_enums.h
@@ -0,0 +1,147 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_ENUMS_H_
+#define VP9_COMMON_VP9_ENUMS_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Geometry of the "mi" (mode info) unit: 8 pixels per mi-unit and 8 mi-units
+// per 64-pixel maximum block.
+#define MI_SIZE_LOG2 3
+#define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2)  // 64 = 2^6
+
+#define MI_SIZE (1 << MI_SIZE_LOG2)  // pixels per mi-unit
+#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2)  // mi-units per max block
+
+#define MI_MASK (MI_BLOCK_SIZE - 1)  // = 7, low bits of an mi index
+
+// Bitstream profiles indicated by 2-3 bits in the uncompressed header.
+// 00: Profile 0.  8-bit 4:2:0 only.
+// 10: Profile 1.  8-bit 4:4:4, 4:2:2, and 4:4:0.
+// 01: Profile 2.  10-bit and 12-bit color only, with 4:2:0 sampling.
+// 110: Profile 3. 10-bit and 12-bit color only, with 4:2:2/4:4:4/4:4:0
+//                 sampling.
+// 111: Undefined profile.
+typedef enum BITSTREAM_PROFILE {
+  PROFILE_0,
+  PROFILE_1,
+  PROFILE_2,
+  PROFILE_3,
+  MAX_PROFILES  // number of defined profiles (= 4), not a profile itself
+} BITSTREAM_PROFILE;
+
+// Block sizes, named BLOCK_<width>X<height> in pixels and ordered from
+// smallest to largest. They are plain macros rather than an enum so that
+// BLOCK_SIZE can be stored in a single byte (see the typedef below).
+#define BLOCK_4X4     0
+#define BLOCK_4X8     1
+#define BLOCK_8X4     2
+#define BLOCK_8X8     3
+#define BLOCK_8X16    4
+#define BLOCK_16X8    5
+#define BLOCK_16X16   6
+#define BLOCK_16X32   7
+#define BLOCK_32X16   8
+#define BLOCK_32X32   9
+#define BLOCK_32X64  10
+#define BLOCK_64X32  11
+#define BLOCK_64X64  12
+#define BLOCK_SIZES  13  // number of valid block sizes
+#define BLOCK_INVALID BLOCK_SIZES  // sentinel: one past the last valid size
+typedef uint8_t BLOCK_SIZE;
+
+// Ways a block can be partitioned. PARTITION_TYPES is the number of valid
+// types and doubles as the PARTITION_INVALID sentinel.
+typedef enum PARTITION_TYPE {
+  PARTITION_NONE,
+  PARTITION_HORZ,
+  PARTITION_VERT,
+  PARTITION_SPLIT,
+  PARTITION_TYPES,
+  PARTITION_INVALID = PARTITION_TYPES
+} PARTITION_TYPE;
+
+typedef char PARTITION_CONTEXT;
+#define PARTITION_PLOFFSET   4  // number of probability models per block size
+#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)  // = 16
+
+// block transform size
+// The values are macros cast to TX_SIZE so that the type stays one byte.
+typedef uint8_t TX_SIZE;
+#define TX_4X4   ((TX_SIZE)0)   // 4x4 transform
+#define TX_8X8   ((TX_SIZE)1)   // 8x8 transform
+#define TX_16X16 ((TX_SIZE)2)   // 16x16 transform
+#define TX_32X32 ((TX_SIZE)3)   // 32x32 transform
+#define TX_SIZES ((TX_SIZE)4)   // number of transform sizes
+
+// frame transform mode
+typedef enum {
+  ONLY_4X4            = 0,        // only 4x4 transform used
+  ALLOW_8X8           = 1,        // allow block transform size up to 8x8
+  ALLOW_16X16         = 2,        // allow block transform size up to 16x16
+  ALLOW_32X32         = 3,        // allow block transform size up to 32x32
+  TX_MODE_SELECT      = 4,        // transform specified for each block
+  TX_MODES            = 5,        // number of transform modes
+} TX_MODE;
+
+// Combination of 1-D transforms applied vertically and horizontally.
+typedef enum {
+  DCT_DCT   = 0,                      // DCT  in both horizontal and vertical
+  ADST_DCT  = 1,                      // ADST in vertical, DCT in horizontal
+  DCT_ADST  = 2,                      // DCT  in vertical, ADST in horizontal
+  ADST_ADST = 3,                      // ADST in both directions
+  TX_TYPES = 4                        // number of transform types
+} TX_TYPE;
+
+// Reference-frame flags. Each value is a distinct bit, so the flags can be
+// OR-ed together into a mask.
+typedef enum {
+  VP9_LAST_FLAG = 1 << 0,
+  VP9_GOLD_FLAG = 1 << 1,
+  VP9_ALT_FLAG = 1 << 2,
+} VP9_REFFRAME;
+
+// Plane categories; PLANE_TYPES is the number of categories (= 2).
+typedef enum {
+  PLANE_TYPE_Y  = 0,
+  PLANE_TYPE_UV = 1,
+  PLANE_TYPES
+} PLANE_TYPE;
+
+// Prediction modes. Values 0..TM_PRED are the intra modes and
+// NEARESTMV..NEWMV are the inter modes; INTRA_MODES and INTER_MODES below
+// are derived from that layout, so the order must not change.
+#define DC_PRED    0       // Average of above and left pixels
+#define V_PRED     1       // Vertical
+#define H_PRED     2       // Horizontal
+#define D45_PRED   3       // Directional 45  deg = round(arctan(1/1) * 180/pi)
+#define D135_PRED  4       // Directional 135 deg = 180 - 45
+#define D117_PRED  5       // Directional 117 deg = 180 - 63
+#define D153_PRED  6       // Directional 153 deg = 180 - 27
+#define D207_PRED  7       // Directional 207 deg = 180 + 27
+#define D63_PRED   8       // Directional 63  deg = round(arctan(2/1) * 180/pi)
+#define TM_PRED    9       // True-motion
+#define NEARESTMV 10
+#define NEARMV    11
+#define ZEROMV    12
+#define NEWMV     13
+#define MB_MODE_COUNT 14   // total number of prediction modes
+typedef uint8_t PREDICTION_MODE;
+
+#define INTRA_MODES (TM_PRED + 1)  // = 10
+
+#define INTER_MODES (1 + NEWMV - NEARESTMV)  // = 4
+
+// Number of contexts used for the skip flag and the inter mode. They size
+// the matching arrays in FRAME_COUNTS.
+#define SKIP_CONTEXTS 3
+#define INTER_MODE_CONTEXTS 7
+
+/* NOTE(review): by its name, the maximum number of reference motion-vector
+ * candidates. The "Segment Feature Masks" label that used to sit here did
+ * not describe this macro. */
+#define MAX_MV_REF_CANDIDATES 2
+
+// Context counts for the intra/inter, compound/single and reference-frame
+// decisions; they size the matching arrays in FRAME_COUNTS.
+#define INTRA_INTER_CONTEXTS 4
+#define COMP_INTER_CONTEXTS 5
+#define REF_CONTEXTS 5
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_ENUMS_H_
diff --git a/libs/libvpx/vp9/common/vp9_filter.c b/libs/libvpx/vp9/common/vp9_filter.c
new file mode 100644
index 0000000000..4b2198fc40
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_filter.c
@@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_filter.h"
+
+// Bilinear interpolation kernels, one row per sub-pixel position
+// (SUBPEL_SHIFTS rows of 8 taps). Only the two centre taps are non-zero and
+// they always sum to 128.
+DECLARE_ALIGNED(256, static const InterpKernel,
+                bilinear_filters[SUBPEL_SHIFTS]) = {
+  { 0, 0, 0, 128,   0, 0, 0, 0 },
+  { 0, 0, 0, 120,   8, 0, 0, 0 },
+  { 0, 0, 0, 112,  16, 0, 0, 0 },
+  { 0, 0, 0, 104,  24, 0, 0, 0 },
+  { 0, 0, 0,  96,  32, 0, 0, 0 },
+  { 0, 0, 0,  88,  40, 0, 0, 0 },
+  { 0, 0, 0,  80,  48, 0, 0, 0 },
+  { 0, 0, 0,  72,  56, 0, 0, 0 },
+  { 0, 0, 0,  64,  64, 0, 0, 0 },
+  { 0, 0, 0,  56,  72, 0, 0, 0 },
+  { 0, 0, 0,  48,  80, 0, 0, 0 },
+  { 0, 0, 0,  40,  88, 0, 0, 0 },
+  { 0, 0, 0,  32,  96, 0, 0, 0 },
+  { 0, 0, 0,  24, 104, 0, 0, 0 },
+  { 0, 0, 0,  16, 112, 0, 0, 0 },
+  { 0, 0, 0,   8, 120, 0, 0, 0 }
+};
+
+// Lagrangian interpolation filter
+// Regular 8-tap kernels, one row per sub-pixel position. Exposed as
+// vp9_filter_kernels[0], which lines up with EIGHTTAP in vp9_filter.h.
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+  { 0,   0,   0, 128,   0,   0,   0,  0},
+  { 0,   1,  -5, 126,   8,  -3,   1,  0},
+  { -1,   3, -10, 122,  18,  -6,   2,  0},
+  { -1,   4, -13, 118,  27,  -9,   3, -1},
+  { -1,   4, -16, 112,  37, -11,   4, -1},
+  { -1,   5, -18, 105,  48, -14,   4, -1},
+  { -1,   5, -19,  97,  58, -16,   5, -1},
+  { -1,   6, -19,  88,  68, -18,   5, -1},
+  { -1,   6, -19,  78,  78, -19,   6, -1},
+  { -1,   5, -18,  68,  88, -19,   6, -1},
+  { -1,   5, -16,  58,  97, -19,   5, -1},
+  { -1,   4, -14,  48, 105, -18,   5, -1},
+  { -1,   4, -11,  37, 112, -16,   4, -1},
+  { -1,   3,  -9,  27, 118, -13,   4, -1},
+  { 0,   2,  -6,  18, 122, -10,   3, -1},
+  { 0,   1,  -3,   8, 126,  -5,   1,  0}
+};
+
+// DCT based filter
+// Sharp 8-tap kernels, one row per sub-pixel position. Exposed as
+// vp9_filter_kernels[2], which lines up with EIGHTTAP_SHARP in vp9_filter.h.
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {-1,   3,  -7, 127,   8,  -3,   1, 0},
+  {-2,   5, -13, 125,  17,  -6,   3, -1},
+  {-3,   7, -17, 121,  27, -10,   5, -2},
+  {-4,   9, -20, 115,  37, -13,   6, -2},
+  {-4,  10, -23, 108,  48, -16,   8, -3},
+  {-4,  10, -24, 100,  59, -19,   9, -3},
+  {-4,  11, -24,  90,  70, -21,  10, -4},
+  {-4,  11, -23,  80,  80, -23,  11, -4},
+  {-4,  10, -21,  70,  90, -24,  11, -4},
+  {-3,   9, -19,  59, 100, -24,  10, -4},
+  {-3,   8, -16,  48, 108, -23,  10, -4},
+  {-2,   6, -13,  37, 115, -20,   9, -4},
+  {-2,   5, -10,  27, 121, -17,   7, -3},
+  {-1,   3,  -6,  17, 125, -13,   5, -2},
+  {0,   1,  -3,   8, 127,  -7,   3, -1}
+};
+
+// freqmultiplier = 0.5
+// Smooth (low-pass) 8-tap kernels, one row per sub-pixel position. Exposed
+// as vp9_filter_kernels[1], which lines up with EIGHTTAP_SMOOTH in
+// vp9_filter.h.
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
+  { 0,  0,  0, 128,  0,  0,  0,  0},
+  {-3, -1, 32,  64, 38,  1, -3,  0},
+  {-2, -2, 29,  63, 41,  2, -3,  0},
+  {-2, -2, 26,  63, 43,  4, -4,  0},
+  {-2, -3, 24,  62, 46,  5, -4,  0},
+  {-2, -3, 21,  60, 49,  7, -4,  0},
+  {-1, -4, 18,  59, 51,  9, -4,  0},
+  {-1, -4, 16,  57, 53, 12, -4, -1},
+  {-1, -4, 14,  55, 55, 14, -4, -1},
+  {-1, -4, 12,  53, 57, 16, -4, -1},
+  { 0, -4,  9,  51, 59, 18, -4, -1},
+  { 0, -4,  7,  49, 60, 21, -3, -2},
+  { 0, -4,  5,  46, 62, 24, -3, -2},
+  { 0, -4,  4,  43, 63, 26, -2, -2},
+  { 0, -3,  2,  41, 63, 29, -2, -2},
+  { 0, -3,  1,  38, 64, 32, -1, -3}
+};
+
+
+// Kernel tables in the order of the filter constants in vp9_filter.h:
+// [EIGHTTAP] regular, [EIGHTTAP_SMOOTH] low-pass, [EIGHTTAP_SHARP] sharp,
+// [BILINEAR] bilinear.
+const InterpKernel *vp9_filter_kernels[4] = {
+  sub_pel_filters_8,
+  sub_pel_filters_8lp,
+  sub_pel_filters_8s,
+  bilinear_filters
+};
diff --git a/libs/libvpx/vp9/common/vp9_filter.h b/libs/libvpx/vp9/common/vp9_filter.h
new file mode 100644
index 0000000000..efa24bc67b
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_filter.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_FILTER_H_
+#define VP9_COMMON_VP9_FILTER_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Interpolation filter identifiers. The values 0..3 are also the indices
+// into vp9_filter_kernels[].
+#define EIGHTTAP            0
+#define EIGHTTAP_SMOOTH     1
+#define EIGHTTAP_SHARP      2
+#define SWITCHABLE_FILTERS  3 /* Number of switchable filters */
+#define BILINEAR            3
+// The codec can operate in four possible inter prediction filter modes:
+// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
+#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
+#define SWITCHABLE 4 /* should be the last one */
+
+typedef uint8_t INTERP_FILTER;
+
+// Kernel tables indexed by EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP and
+// BILINEAR (SWITCHABLE has no table of its own).
+extern const InterpKernel *vp9_filter_kernels[4];
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_FILTER_H_
diff --git a/libs/libvpx/vp9/common/vp9_frame_buffers.c b/libs/libvpx/vp9/common/vp9_frame_buffers.c
new file mode 100644
index 0000000000..0f41d66985
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_frame_buffers.c
@@ -0,0 +1,86 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_frame_buffers.h"
+#include "vpx_mem/vpx_mem.h"
+
+// Allocates the zero-initialised array of internal frame buffers for |list|,
+// releasing anything the list already held first.
+//
+// Returns 0 on success and 1 if the allocation failed. On failure the list
+// is left empty (no array, buffer count 0) so that a later call to
+// vp9_free_internal_frame_buffers() has nothing to walk.
+int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+
+  assert(list != NULL);
+  vp9_free_internal_frame_buffers(list);
+
+  list->int_fb =
+      (InternalFrameBuffer *)vpx_calloc(num_buffers, sizeof(*list->int_fb));
+  if (list->int_fb == NULL) {
+    // Never advertise buffers that do not exist.
+    list->num_internal_frame_buffers = 0;
+    return 1;
+  }
+
+  list->num_internal_frame_buffers = num_buffers;
+  return 0;
+}
+
+// Releases every buffer owned by |list| and the buffer array itself, then
+// marks the list as empty. Safe to call on a zero-initialised list, after a
+// failed allocation, and more than once in a row.
+void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) {
+  int i;
+
+  assert(list != NULL);
+
+  // The array can be missing while the count is still non-zero (e.g. after a
+  // failed allocation), so never index it without checking.
+  if (list->int_fb != NULL) {
+    for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+      vpx_free(list->int_fb[i].data);
+      list->int_fb[i].data = NULL;
+    }
+    vpx_free(list->int_fb);
+    list->int_fb = NULL;
+  }
+
+  // Keep the count in sync with the (now absent) array.
+  list->num_internal_frame_buffers = 0;
+}
+
+// Frame-buffer "get" callback. |cb_priv| must point to an
+// InternalFrameBufferList. Picks the first buffer that is not in use, makes
+// sure it holds at least |min_size| bytes, marks it as in use and describes
+// it in |fb| (data, size, and priv pointing at the internal buffer).
+//
+// Returns 0 on success, or -1 if |cb_priv| is NULL, every buffer is in use,
+// or memory could not be allocated.
+int vp9_get_frame_buffer(void *cb_priv, size_t min_size,
+                         vpx_codec_frame_buffer_t *fb) {
+  int i;
+  InternalFrameBufferList *const int_fb_list =
+      (InternalFrameBufferList *)cb_priv;
+  if (int_fb_list == NULL)
+    return -1;
+
+  // Find a free frame buffer.
+  for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) {
+    if (!int_fb_list->int_fb[i].in_use)
+      break;
+  }
+
+  if (i == int_fb_list->num_internal_frame_buffers)
+    return -1;
+
+  if (int_fb_list->int_fb[i].size < min_size) {
+    // The old contents are not needed, so release the old block and allocate
+    // a fresh zeroed one rather than reallocating in place. Zeroing is needed
+    // to fix a valgrind error from the C loop filter, which reads
+    // uninitialized memory in the frame border; it could be dropped if the
+    // border were removed entirely.
+    vpx_free(int_fb_list->int_fb[i].data);
+    int_fb_list->int_fb[i].data = (uint8_t *)vpx_calloc(1, min_size);
+    if (!int_fb_list->int_fb[i].data) {
+      // Keep size consistent with the missing block so that a later, smaller
+      // request cannot be handed a NULL buffer with a stale size.
+      int_fb_list->int_fb[i].size = 0;
+      return -1;
+    }
+    int_fb_list->int_fb[i].size = min_size;
+  }
+
+  fb->data = int_fb_list->int_fb[i].data;
+  fb->size = int_fb_list->int_fb[i].size;
+  int_fb_list->int_fb[i].in_use = 1;
+
+  // Set the frame buffer's private data to point at the internal frame buffer.
+  fb->priv = &int_fb_list->int_fb[i];
+  return 0;
+}
+
+// Frame-buffer "release" callback. Marks the internal buffer referenced by
+// fb->priv as free again; a NULL priv pointer is ignored. |cb_priv| is
+// unused. Always returns 0.
+int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) {
+  InternalFrameBuffer *const owner = (InternalFrameBuffer *)fb->priv;
+
+  (void)cb_priv;
+
+  if (owner != NULL) {
+    owner->in_use = 0;
+  }
+  return 0;
+}
diff --git a/libs/libvpx/vp9/common/vp9_frame_buffers.h b/libs/libvpx/vp9/common/vp9_frame_buffers.h
new file mode 100644
index 0000000000..e2cfe61b66
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_frame_buffers.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_
+#define VP9_COMMON_VP9_FRAME_BUFFERS_H_
+
+#include "vpx/vpx_frame_buffer.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// One internally managed frame buffer.
+typedef struct InternalFrameBuffer {
+  uint8_t *data;  // start of the allocation; NULL until first handed out
+  size_t size;    // size of |data| in bytes
+  int in_use;     // non-zero between the get and release callbacks
+} InternalFrameBuffer;
+
+// Array of internal frame buffers.
+typedef struct InternalFrameBufferList {
+  int num_internal_frame_buffers;  // number of entries in |int_fb|
+  InternalFrameBuffer *int_fb;     // buffer array
+} InternalFrameBufferList;
+
+// Initializes |list|. Returns 0 on success.
+int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Free any data allocated to the frame buffers.
+void vp9_free_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Callback used by libvpx to request an external frame buffer.
+// |cb_priv| is the callback private data, which points to an
+// InternalFrameBufferList.
+// |min_size| is the minimum size in bytes needed to decode the next frame.
+// |fb| is a pointer to the frame buffer to fill in.
+// Returns 0 on success and -1 on failure.
+int vp9_get_frame_buffer(void *cb_priv, size_t min_size,
+                         vpx_codec_frame_buffer_t *fb);
+
+// Callback used by libvpx when there are no references to the frame buffer.
+// |cb_priv| is not used. |fb| is a pointer to the frame buffer.
+// Always returns 0.
+int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_FRAME_BUFFERS_H_
diff --git a/libs/libvpx/vp9/common/vp9_idct.c b/libs/libvpx/vp9/common/vp9_idct.c
new file mode 100644
index 0000000000..1b420143bb
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_idct.c
@@ -0,0 +1,405 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+// Inverse 4x4 hybrid transform (C reference): runs the 1-D row transform
+// selected by |tx_type| over |input|, then the column transform, and adds the
+// rounded residual (>> 4) to the pixels at |dest| with clipping.
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                         int tx_type) {
+  // Immutable dispatch table; static so it is not rebuilt on every call,
+  // matching IHT_8 and IHT_16 below.
+  static const transform_2d IHT_4[] = {
+    { idct4_c,  idct4_c  },  // DCT_DCT   = 0
+    { iadst4_c, idct4_c  },  // ADST_DCT  = 1
+    { idct4_c,  iadst4_c },  // DCT_ADST  = 2
+    { iadst4_c, iadst4_c }   // ADST_ADST = 3
+  };
+  int i, j;
+  tran_low_t out[4 * 4];
+  tran_low_t temp_in[4], temp_out[4];
+
+  // Pass 1: inverse transform each row of coefficients into |out|.
+  for (i = 0; i < 4; ++i)
+    IHT_4[tx_type].rows(input + 4 * i, out + 4 * i);
+
+  // Pass 2: inverse transform each column, then round and accumulate.
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    IHT_4[tx_type].cols(temp_in, temp_out);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
+    }
+  }
+}
+
+// 8-point 1-D inverse transform pairs ({ cols, rows }), indexed by tx_type.
+static const transform_2d IHT_8[] = {
+  { idct8_c,  idct8_c  },  // 0: DCT_DCT
+  { iadst8_c, idct8_c  },  // 1: ADST_DCT
+  { idct8_c,  iadst8_c },  // 2: DCT_ADST
+  { iadst8_c, iadst8_c }   // 3: ADST_ADST
+};
+
+// Inverse 8x8 hybrid transform: row pass into a scratch block, column pass,
+// then each result is rounded (>> 5) and added to |dest| with clipping.
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                         int tx_type) {
+  const transform_1d row_txfm = IHT_8[tx_type].rows;
+  const transform_1d col_txfm = IHT_8[tx_type].cols;
+  tran_low_t scratch[8 * 8];
+  tran_low_t col_in[8], col_out[8];
+  int r, c;
+
+  // Horizontal pass, one row of eight coefficients at a time.
+  for (r = 0; r < 8; ++r)
+    row_txfm(input + r * 8, scratch + r * 8);
+
+  // Vertical pass: gather a column, transform it, accumulate into |dest|.
+  for (c = 0; c < 8; ++c) {
+    for (r = 0; r < 8; ++r)
+      col_in[r] = scratch[r * 8 + c];
+    col_txfm(col_in, col_out);
+    for (r = 0; r < 8; ++r) {
+      uint8_t *const px = &dest[r * stride + c];
+      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 5));
+    }
+  }
+}
+
+// 16-point 1-D inverse transform pairs ({ cols, rows }), indexed by tx_type.
+static const transform_2d IHT_16[] = {
+  { idct16_c,  idct16_c  },  // 0: DCT_DCT
+  { iadst16_c, idct16_c  },  // 1: ADST_DCT
+  { idct16_c,  iadst16_c },  // 2: DCT_ADST
+  { iadst16_c, iadst16_c }   // 3: ADST_ADST
+};
+
+// Inverse 16x16 hybrid transform: row pass into a scratch block, column pass,
+// then each result is rounded (>> 6) and added to |dest| with clipping.
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                            int tx_type) {
+  const transform_1d row_txfm = IHT_16[tx_type].rows;
+  const transform_1d col_txfm = IHT_16[tx_type].cols;
+  tran_low_t scratch[16 * 16];
+  tran_low_t col_in[16], col_out[16];
+  int r, c;
+
+  // Rows: sixteen coefficients per call.
+  for (r = 0; r < 16; ++r)
+    row_txfm(input + r * 16, scratch + r * 16);
+
+  // Columns: gather, transform, then accumulate into |dest|.
+  for (c = 0; c < 16; ++c) {
+    for (r = 0; r < 16; ++r)
+      col_in[r] = scratch[r * 16 + c];
+    col_txfm(col_in, col_out);
+    for (r = 0; r < 16; ++r) {
+      uint8_t *const px = &dest[r * stride + c];
+      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6));
+    }
+  }
+}
+
+// idct: picks the single-coefficient adder when |eob| <= 1, else the full one.
+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
+  if (eob <= 1)
+    vpx_idct4x4_1_add(input, dest, stride);
+  else
+    vpx_idct4x4_16_add(input, dest, stride);
+}
+
+// Uses vpx_iwht4x4_1_add when |eob| <= 1 and vpx_iwht4x4_16_add otherwise.
+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
+  if (eob <= 1)
+    vpx_iwht4x4_1_add(input, dest, stride);
+  else
+    vpx_iwht4x4_16_add(input, dest, stride);
+}
+
+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
+  // The calculation can be simplified when few dct coefficients are non-zero,
+  // so |eob| decides which variant runs.
+  // If dc is 1, input[0] is the reconstructed value and needs no
+  // dequantization; dc is then counted in eobs, namely eobs >= 1.
+  //
+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+  // Combine that with code here.
+  if (eob > 12)
+    vpx_idct8x8_64_add(input, dest, stride);
+  else if (eob != 1)
+    vpx_idct8x8_12_add(input, dest, stride);
+  else
+    // DC only DCT coefficient
+    vpx_idct8x8_1_add(input, dest, stride);
+}
+
+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+                       int eob) {
+  // Sparse blocks get a reduced transform; |eob| selects which one applies.
+  if (eob > 10) {
+    vpx_idct16x16_256_add(input, dest, stride);
+  } else if (eob != 1) {
+    vpx_idct16x16_10_add(input, dest, stride);
+  } else {
+    // DC only DCT coefficient.
+    vpx_idct16x16_1_add(input, dest, stride);
+  }
+}
+
+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+                       int eob) {
+  // |eob| selects which of the four vpx_idct32x32_*_add variants is called.
+  if (eob == 1) {
+    vpx_idct32x32_1_add(input, dest, stride);
+  } else if (eob <= 34) {  // non-zero coeff only in upper-left 8x8
+    vpx_idct32x32_34_add(input, dest, stride);
+  } else if (eob <= 135) {  // non-zero coeff only in upper-left 16x16
+    vpx_idct32x32_135_add(input, dest, stride);
+  } else {
+    vpx_idct32x32_1024_add(input, dest, stride);
+  }
+}
+
+// iht: DCT_DCT is routed to the eob-aware idct; other types use the 2-D iht.
+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
+                    int stride, int eob) {
+  if (tx_type != DCT_DCT)
+    vp9_iht4x4_16_add(input, dest, stride, tx_type);
+  else
+    vp9_idct4x4_add(input, dest, stride, eob);
+}
+
+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
+                    int stride, int eob) {
+  // Only the DCT_DCT path consumes |eob|; every other type takes the 2-D iht.
+  if (tx_type != DCT_DCT)
+    vp9_iht8x8_64_add(input, dest, stride, tx_type);
+  else
+    vp9_idct8x8_add(input, dest, stride, eob);
+}
+
+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
+                      int stride, int eob) {
+  // Only the DCT_DCT path consumes |eob|; every other type takes the 2-D iht.
+  if (tx_type != DCT_DCT)
+    vp9_iht16x16_256_add(input, dest, stride, tx_type);
+  else
+    vp9_idct16x16_add(input, dest, stride, eob);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// High-bitdepth inverse 4x4 hybrid transform (C reference). |dest8| is turned
+// into a uint16_t pointer via CONVERT_TO_SHORTPTR; the residual is rounded
+// (>> 4) and added to the destination through highbd_clip_pixel_add with |bd|.
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int tx_type, int bd) {
+  // Immutable dispatch table; static so it is not rebuilt on every call,
+  // matching HIGH_IHT_8 and HIGH_IHT_16 below.
+  static const highbd_transform_2d IHT_4[] = {
+    { vpx_highbd_idct4_c,  vpx_highbd_idct4_c  },  // DCT_DCT   = 0
+    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c  },  // ADST_DCT  = 1
+    { vpx_highbd_idct4_c,  vpx_highbd_iadst4_c },  // DCT_ADST  = 2
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c }   // ADST_ADST = 3
+  };
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  int i, j;
+  tran_low_t out[4 * 4];
+  tran_low_t temp_in[4], temp_out[4];
+
+  // Inverse transform row vectors.
+  for (i = 0; i < 4; ++i)
+    IHT_4[tx_type].rows(input + 4 * i, out + 4 * i, bd);
+
+  // Inverse transform column vectors.
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    IHT_4[tx_type].cols(temp_in, temp_out, bd);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+    }
+  }
+}
+
+// High-bitdepth 8-point 1-D inverse pairs ({ cols, rows }), by tx_type.
+static const highbd_transform_2d HIGH_IHT_8[] = {
+  { vpx_highbd_idct8_c,  vpx_highbd_idct8_c  },  // 0: DCT_DCT
+  { vpx_highbd_iadst8_c, vpx_highbd_idct8_c  },  // 1: ADST_DCT
+  { vpx_highbd_idct8_c,  vpx_highbd_iadst8_c },  // 2: DCT_ADST
+  { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c }   // 3: ADST_ADST
+};
+
+// High-bitdepth inverse 8x8 hybrid transform: row pass, column pass, then
+// each result is rounded (>> 5) and added to the 16-bit destination.
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int tx_type, int bd) {
+  const highbd_transform_1d row_txfm = HIGH_IHT_8[tx_type].rows;
+  const highbd_transform_1d col_txfm = HIGH_IHT_8[tx_type].cols;
+  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t scratch[8 * 8];
+  tran_low_t col_in[8], col_out[8];
+  int r, c;
+
+  // Horizontal pass, one row of eight coefficients at a time.
+  for (r = 0; r < 8; ++r)
+    row_txfm(input + r * 8, scratch + r * 8, bd);
+
+  // Vertical pass: gather a column, transform it, accumulate the result.
+  for (c = 0; c < 8; ++c) {
+    for (r = 0; r < 8; ++r)
+      col_in[r] = scratch[r * 8 + c];
+    col_txfm(col_in, col_out, bd);
+    for (r = 0; r < 8; ++r) {
+      uint16_t *const px = &dst16[r * stride + c];
+      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 5), bd);
+    }
+  }
+}
+
+// High-bitdepth 16-point 1-D inverse pairs ({ cols, rows }), by tx_type.
+static const highbd_transform_2d HIGH_IHT_16[] = {
+  { vpx_highbd_idct16_c,  vpx_highbd_idct16_c  },  // 0: DCT_DCT
+  { vpx_highbd_iadst16_c, vpx_highbd_idct16_c  },  // 1: ADST_DCT
+  { vpx_highbd_idct16_c,  vpx_highbd_iadst16_c },  // 2: DCT_ADST
+  { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c }   // 3: ADST_ADST
+};
+
+// High-bitdepth inverse 16x16 hybrid transform: row pass, column pass, then
+// each result is rounded (>> 6) and added to the 16-bit destination.
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int tx_type, int bd) {
+  const highbd_transform_1d row_txfm = HIGH_IHT_16[tx_type].rows;
+  const highbd_transform_1d col_txfm = HIGH_IHT_16[tx_type].cols;
+  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t scratch[16 * 16];
+  tran_low_t col_in[16], col_out[16];
+  int r, c;
+
+  // Rows: sixteen coefficients per call.
+  for (r = 0; r < 16; ++r)
+    row_txfm(input + r * 16, scratch + r * 16, bd);
+
+  // Columns: gather, transform, then accumulate the result.
+  for (c = 0; c < 16; ++c) {
+    for (r = 0; r < 16; ++r)
+      col_in[r] = scratch[r * 16 + c];
+    col_txfm(col_in, col_out, bd);
+    for (r = 0; r < 16; ++r) {
+      uint16_t *const px = &dst16[r * stride + c];
+      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6), bd);
+    }
+  }
+}
+
+// idct: picks the single-coefficient adder when |eob| <= 1, else the full one.
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  if (eob <= 1)
+    vpx_highbd_idct4x4_1_add(input, dest, stride, bd);
+  else
+    vpx_highbd_idct4x4_16_add(input, dest, stride, bd);
+}
+
+// Uses the vpx_highbd_iwht4x4 _1 adder when |eob| <= 1, else the _16 adder.
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  if (eob <= 1)
+    vpx_highbd_iwht4x4_1_add(input, dest, stride, bd);
+  else
+    vpx_highbd_iwht4x4_16_add(input, dest, stride, bd);
+}
+
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  // The calculation can be simplified when few dct coefficients are non-zero,
+  // so |eob| decides which variant runs.
+  // If dc is 1, input[0] is the reconstructed value and needs no
+  // dequantization; dc is then counted in eobs, namely eobs >= 1.
+  //
+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+  // Combine that with code here.
+  if (eob > 10) {
+    vpx_highbd_idct8x8_64_add(input, dest, stride, bd);
+  } else if (eob != 1) {
+    vpx_highbd_idct8x8_10_add(input, dest, stride, bd);
+  } else {
+    // DC only DCT coefficient
+    vpx_highbd_idct8x8_1_add(input, dest, stride, bd);
+  }
+}
+
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd) {
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients, so |eob| separates the different cases.
+  if (eob > 10) {
+    vpx_highbd_idct16x16_256_add(input, dest, stride, bd);
+  } else if (eob != 1) {
+    vpx_highbd_idct16x16_10_add(input, dest, stride, bd);
+  } else {
+    // DC only DCT coefficient.
+    vpx_highbd_idct16x16_1_add(input, dest, stride, bd);
+  }
+}
+
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd) {
+  if (eob > 34) {
+    vpx_highbd_idct32x32_1024_add(input, dest, stride, bd);
+  } else if (eob != 1) {
+    // Non-zero coeff only in upper-left 8x8
+    vpx_highbd_idct32x32_34_add(input, dest, stride, bd);
+  } else {
+    vpx_highbd_idct32x32_1_add(input, dest, stride, bd);
+  }
+}
+
+// iht: DCT_DCT is routed to the eob-aware idct; other types use the 2-D iht.
+void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd) {
+  if (tx_type != DCT_DCT)
+    vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
+  else
+    vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);
+}
+
+void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd) {
+  // Only the DCT_DCT path consumes |eob|; every other type takes the 2-D iht.
+  if (tx_type != DCT_DCT)
+    vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
+  else
+    vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);
+}
+
+void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+                             uint8_t *dest, int stride, int eob, int bd) {
+  // Only the DCT_DCT path consumes |eob|; every other type takes the 2-D iht.
+  if (tx_type != DCT_DCT)
+    vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
+  else
+    vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vp9/common/vp9_idct.h b/libs/libvpx/vp9/common/vp9_idct.h
new file mode 100644
index 0000000000..b5a3fbf362
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_idct.h
@@ -0,0 +1,81 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_IDCT_H_
+#define VP9_COMMON_VP9_IDCT_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
+// Pair of 1-D transforms used as the column and row passes of a 2-D one.
+typedef struct {
+  transform_1d cols, rows;  // vertical and horizontal
+} transform_2d;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*highbd_transform_1d)(const tran_low_t*, tran_low_t*, int bd);
+// High-bitdepth pair of 1-D transforms; each call also receives |bd|.
+typedef struct {
+  highbd_transform_1d cols, rows;  // vertical and horizontal
+} highbd_transform_2d;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+                       int eob);
+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+                       int eob);
+
+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
+                    int stride, int eob);
+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
+                    int stride, int eob);
+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
+                      int stride, int eob);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd);
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+                              int stride, int eob, int bd);
+void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd);
+void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd);
+void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+                             uint8_t *dest, int stride, int eob, int bd);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_IDCT_H_
diff --git a/libs/libvpx/vp9/common/vp9_loopfilter.c b/libs/libvpx/vp9/common/vp9_loopfilter.c
new file mode 100644
index 0000000000..79c3c4820d
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_loopfilter.c
@@ -0,0 +1,1747 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+// 64 bit masks for left transform size. Each 1 represents a position where
+// we should apply a loop filter across the left border of an 8x8 block
+// boundary.
+//
+// In the case of TX_16X16 (low order byte first) we end up with
+// a mask that looks like this:
+//
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//    10101010
+//
+// A loopfilter should be applied to every other 8x8 horizontally.
+static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
+  0xffffffffffffffffULL,  // TX_4X4
+  0xffffffffffffffffULL,  // TX_8x8
+  0x5555555555555555ULL,  // TX_16x16
+  0x1111111111111111ULL,  // TX_32x32
+};
+
+// 64 bit masks for above transform size. Each 1 represents a position where
+// we should apply a loop filter across the top border of an 8x8 block
+// boundary.
+//
+// In the case of TX_32x32 (low order byte first) we end up with
+// a mask that looks like this:
+//
+//    11111111
+//    00000000
+//    00000000
+//    00000000
+//    11111111
+//    00000000
+//    00000000
+//    00000000
+//
+// A loopfilter should be applied to every fourth row vertically.
+static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
+  0xffffffffffffffffULL,  // TX_4X4
+  0xffffffffffffffffULL,  // TX_8x8
+  0x00ff00ff00ff00ffULL,  // TX_16x16
+  0x000000ff000000ffULL,  // TX_32x32
+};
+
+// 64 bit masks for prediction sizes (left). Each 1 represents a position
+// on the left border of an 8x8 block. These are aligned to the right most
+// appropriate bit, and then shifted into place.
+//
+// In the case of BLOCK_16X32 (low order byte first) we end up with
+// a mask that looks like this:
+//
+//  10000000
+//  10000000
+//  10000000
+//  10000000
+//  00000000
+//  00000000
+//  00000000
+//  00000000
+static const uint64_t left_prediction_mask[BLOCK_SIZES] = {
+  0x0000000000000001ULL,  // BLOCK_4X4
+  0x0000000000000001ULL,  // BLOCK_4X8
+  0x0000000000000001ULL,  // BLOCK_8X4
+  0x0000000000000001ULL,  // BLOCK_8X8
+  0x0000000000000101ULL,  // BLOCK_8X16
+  0x0000000000000001ULL,  // BLOCK_16X8
+  0x0000000000000101ULL,  // BLOCK_16X16
+  0x0000000001010101ULL,  // BLOCK_16X32
+  0x0000000000000101ULL,  // BLOCK_32X16
+  0x0000000001010101ULL,  // BLOCK_32X32
+  0x0101010101010101ULL,  // BLOCK_32X64
+  0x0000000001010101ULL,  // BLOCK_64X32
+  0x0101010101010101ULL,  // BLOCK_64X64
+};
+
+// 64 bit above mask to shift and set for each prediction size (top row bits).
+static const uint64_t above_prediction_mask[BLOCK_SIZES] = {
+  0x0000000000000001ULL,  // BLOCK_4X4
+  0x0000000000000001ULL,  // BLOCK_4X8
+  0x0000000000000001ULL,  // BLOCK_8X4
+  0x0000000000000001ULL,  // BLOCK_8X8
+  0x0000000000000001ULL,  // BLOCK_8X16
+  0x0000000000000003ULL,  // BLOCK_16X8
+  0x0000000000000003ULL,  // BLOCK_16X16
+  0x0000000000000003ULL,  // BLOCK_16X32
+  0x000000000000000fULL,  // BLOCK_32X16
+  0x000000000000000fULL,  // BLOCK_32X32
+  0x000000000000000fULL,  // BLOCK_32X64
+  0x00000000000000ffULL,  // BLOCK_64X32
+  0x00000000000000ffULL,  // BLOCK_64X64
+};
+// 64 bit mask to shift and set for each prediction size. A bit is set for
+// each 8x8 block covered by the given block size when that block sits at the
+// top-left corner of the 64x64 block (one byte per row of 8x8 blocks).
+static const uint64_t size_mask[BLOCK_SIZES] = {
+  0x0000000000000001ULL,  // BLOCK_4X4
+  0x0000000000000001ULL,  // BLOCK_4X8
+  0x0000000000000001ULL,  // BLOCK_8X4
+  0x0000000000000001ULL,  // BLOCK_8X8
+  0x0000000000000101ULL,  // BLOCK_8X16
+  0x0000000000000003ULL,  // BLOCK_16X8
+  0x0000000000000303ULL,  // BLOCK_16X16
+  0x0000000003030303ULL,  // BLOCK_16X32
+  0x0000000000000f0fULL,  // BLOCK_32X16
+  0x000000000f0f0f0fULL,  // BLOCK_32X32
+  0x0f0f0f0f0f0f0f0fULL,  // BLOCK_32X64
+  0x00000000ffffffffULL,  // BLOCK_64X32
+  0xffffffffffffffffULL,  // BLOCK_64X64
+};
+
+// Masks for the left and above borders (equal to the TX_32x32 txform masks).
+static const uint64_t left_border =  0x1111111111111111ULL;
+static const uint64_t above_border = 0x000000ff000000ffULL;
+
+// 16 bit masks for uv transform sizes (left table first, then above table).
+static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= {
+  0xffff,  // TX_4X4
+  0xffff,  // TX_8x8
+  0x5555,  // TX_16x16: every other bit of each 4-bit group
+  0x1111,  // TX_32x32: lowest bit of each 4-bit group
+};
+
+static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= {
+  0xffff,  // TX_4X4
+  0xffff,  // TX_8x8
+  0x0f0f,  // TX_16x16: every other 4-bit group
+  0x000f,  // TX_32x32: lowest 4-bit group only
+};
+
+// 16 bit left mask to shift and set for each uv prediction size.
+static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = {
+  0x0001,  // BLOCK_4X4
+  0x0001,  // BLOCK_4X8
+  0x0001,  // BLOCK_8X4
+  0x0001,  // BLOCK_8X8
+  0x0001,  // BLOCK_8X16
+  0x0001,  // BLOCK_16X8
+  0x0001,  // BLOCK_16X16
+  0x0011,  // BLOCK_16X32
+  0x0001,  // BLOCK_32X16
+  0x0011,  // BLOCK_32X32
+  0x1111,  // BLOCK_32X64
+  0x0011,  // BLOCK_64X32
+  0x1111,  // BLOCK_64X64
+};
+// 16 bit above mask to shift and set for each uv prediction size.
+static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = {
+  0x0001,  // BLOCK_4X4
+  0x0001,  // BLOCK_4X8
+  0x0001,  // BLOCK_8X4
+  0x0001,  // BLOCK_8X8
+  0x0001,  // BLOCK_8X16
+  0x0001,  // BLOCK_16X8
+  0x0001,  // BLOCK_16X16
+  0x0001,  // BLOCK_16X32
+  0x0003,  // BLOCK_32X16
+  0x0003,  // BLOCK_32X32
+  0x0003,  // BLOCK_32X64
+  0x000f,  // BLOCK_64X32
+  0x000f,  // BLOCK_64X64
+};
+
+// 16 bit mask to shift and set for each uv prediction size
+static const uint16_t size_mask_uv[BLOCK_SIZES] = {
+  0x0001,  // BLOCK_4X4
+  0x0001,  // BLOCK_4X8
+  0x0001,  // BLOCK_8X4
+  0x0001,  // BLOCK_8X8
+  0x0001,  // BLOCK_8X16
+  0x0001,  // BLOCK_16X8
+  0x0001,  // BLOCK_16X16
+  0x0011,  // BLOCK_16X32
+  0x0003,  // BLOCK_32X16
+  0x0033,  // BLOCK_32X32
+  0x3333,  // BLOCK_32X64
+  0x00ff,  // BLOCK_64X32
+  0xffff,  // BLOCK_64X64
+};
+static const uint16_t left_border_uv =  0x1111;  // uv form of left_border
+static const uint16_t above_border_uv = 0x000f;  // uv form of above_border
+
+static const int mode_lf_lut[MB_MODE_COUNT] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // INTRA_MODES: every entry is 0
+  1, 1, 0, 1                     // INTER_MODES: 1, except ZEROMV == 0
+};
+
+static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
+  // Sharpness 1..4 halves the inside limit, 5 and above quarters it.
+  const int shift = (sharpness_lvl > 0) + (sharpness_lvl > 4);
+  // A non-zero sharpness additionally caps the inside limit at 9 - sharpness.
+  const int cap = 9 - sharpness_lvl;
+  int level;
+
+  // Fill out the limits for every possible loop filter level.
+  for (level = 0; level <= MAX_LOOP_FILTER; ++level) {
+    loop_filter_thresh *const thr = &lfi->lfthr[level];
+    int inside = level >> shift;
+    if (sharpness_lvl > 0 && inside > cap)
+      inside = cap;
+    if (inside < 1)
+      inside = 1;
+
+    // Every byte of the SIMD_WIDTH-long vectors carries the same value.
+    memset(thr->lim, inside, SIMD_WIDTH);
+    memset(thr->mblim, 2 * (level + 2) + inside, SIMD_WIDTH);
+  }
+}
+
+static uint8_t get_filter_level(const loop_filter_info_n *lfi_n,
+                                const MODE_INFO *mi) {
+  const int mode_class = mode_lf_lut[mi->mode];  // 0 or 1, see mode_lf_lut
+  return lfi_n->lvl[mi->segment_id][mi->ref_frame[0]][mode_class];
+}
+
+void vp9_loop_filter_init(VP9_COMMON *cm) {
+  struct loopfilter *const filter = &cm->lf;
+  loop_filter_info_n *const info = &cm->lf_info;
+  int level;
+
+  // Build the limit tables and record the sharpness they were built for.
+  update_sharpness(info, filter->sharpness_level);
+  filter->last_sharpness_level = filter->sharpness_level;
+
+  // Each hev_thr vector is filled with level >> 4 in every byte.
+  for (level = 0; level <= MAX_LOOP_FILTER; ++level)
+    memset(info->lfthr[level].hev_thr, level >> 4, SIMD_WIDTH);
+}
+
+void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
+  int seg_id;
+  // scale is the multiplier for lf_deltas:
+  // the multiplier is 1 when default_filt_lvl is between 0 and 31;
+  // 2 when default_filt_lvl is between 32 and 63
+  const int scale = 1 << (default_filt_lvl >> 5);
+  loop_filter_info_n *const lfi = &cm->lf_info;
+  struct loopfilter *const lf = &cm->lf;
+  const struct segmentation *const seg = &cm->seg;
+
+  // update limits if sharpness has changed since they were last built
+  if (lf->last_sharpness_level != lf->sharpness_level) {
+    update_sharpness(lfi, lf->sharpness_level);
+    lf->last_sharpness_level = lf->sharpness_level;
+  }
+
+  for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
+    int lvl_seg = default_filt_lvl;  // per-segment base level
+    if (segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
+      const int data = get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
+      lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ?
+                      data : default_filt_lvl + data,
+                      0, MAX_LOOP_FILTER);
+    }
+
+    if (!lf->mode_ref_delta_enabled) {
+      // we could get rid of this if we assume that deltas are set to
+      // zero when not in use; encoder always uses deltas
+      memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
+    } else {
+      int ref, mode;
+      const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
+      lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
+
+      for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) {
+        for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
+          const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale
+                                        + lf->mode_deltas[mode] * scale;
+          lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+        }
+      }
+    }
+  }
+}
+
+static void filter_selectively_vert_row2(int subsampling_factor,
+                                         uint8_t *s, int pitch,
+                                         unsigned int mask_16x16_l,
+                                         unsigned int mask_8x8_l,
+                                         unsigned int mask_4x4_l,
+                                         unsigned int mask_4x4_int_l,
+                                         const loop_filter_info_n *lfi_n,
+                                         const uint8_t *lfl) {
+  const int mask_shift = subsampling_factor ? 4 : 8;  // bits per mask row
+  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;  // one-row mask
+  const int lfl_forward = subsampling_factor ? 4 : 8;  // lfl step to row 1
+  // Split each combined mask into its first-row (_0) and second-row (_1) bits.
+  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
+  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
+  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
+  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
+  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  unsigned int mask;
+
+  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
+              mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
+       mask; mask >>= 1) {
+    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+
+    // TODO(yunqingwang): count in loopfilter functions should be removed.
+    if (mask & 1) {
+      if ((mask_16x16_0 | mask_16x16_1) & 1) {
+        if ((mask_16x16_0 & mask_16x16_1) & 1) {
+          vpx_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                   lfi0->hev_thr);
+        } else if (mask_16x16_0 & 1) {
+          vpx_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+                              lfi0->hev_thr);
+        } else {
+          vpx_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
+                              lfi1->lim, lfi1->hev_thr);
+        }
+      }
+
+      if ((mask_8x8_0 | mask_8x8_1) & 1) {
+        if ((mask_8x8_0 & mask_8x8_1) & 1) {
+          vpx_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                  lfi1->hev_thr);
+        } else if (mask_8x8_0 & 1) {
+          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+                             1);
+        } else {
+          vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                             lfi1->hev_thr, 1);
+        }
+      }
+
+      if ((mask_4x4_0 | mask_4x4_1) & 1) {
+        if ((mask_4x4_0 & mask_4x4_1) & 1) {
+          vpx_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                  lfi1->hev_thr);
+        } else if (mask_4x4_0 & 1) {
+          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+                             1);
+        } else {
+          vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+                             lfi1->hev_thr, 1);
+        }
+      }
+
+      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
+        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
+          vpx_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                  lfi1->hev_thr);
+        } else if (mask_4x4_int_0 & 1) {
+          vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                             lfi0->hev_thr, 1);
+        } else {
+          vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
+                             lfi1->hev_thr, 1);
+        }
+      }
+    }
+
+    s += 8;  // advance to the position that the next mask bit describes
+    lfl += 1;  // one filter-level entry per mask bit
+    mask_16x16_0 >>= 1;
+    mask_8x8_0 >>= 1;
+    mask_4x4_0 >>= 1;
+    mask_4x4_int_0 >>= 1;
+    mask_16x16_1 >>= 1;
+    mask_8x8_1 >>= 1;
+    mask_4x4_1 >>= 1;
+    mask_4x4_int_1 >>= 1;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_filter_selectively_vert_row2(int subsampling_factor,
+                                                uint16_t *s, int pitch,
+                                                unsigned int mask_16x16_l,
+                                                unsigned int mask_8x8_l,
+                                                unsigned int mask_4x4_l,
+                                                unsigned int mask_4x4_int_l,
+                                                const loop_filter_info_n *lfi_n,
+                                                const uint8_t *lfl, int bd) {
+  const int mask_shift = subsampling_factor ? 4 : 8;  // mask bits per row
+  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int lfl_forward = subsampling_factor ? 4 : 8;  // lfl offset of row 1
+  // High-bitdepth vertical-edge filtering of two rows (s and s + 8 * pitch).
+  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
+  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
+  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
+  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
+  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
+  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  unsigned int mask;
+  // _0 masks select filters for the first row, _1 masks for the second row.
+  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
+       mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
+       mask; mask >>= 1) {
+    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;  // row 0 thresholds
+    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+
+    // TODO(yunqingwang): count in loopfilter functions should be removed.
+    if (mask & 1) {
+      if ((mask_16x16_0 | mask_16x16_1) & 1) {
+        if ((mask_16x16_0 & mask_16x16_1) & 1) {  // both rows flagged
+          vpx_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                          lfi0->hev_thr, bd);  // lfi0 only
+        } else if (mask_16x16_0 & 1) {  // first row only
+          vpx_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+                                     lfi0->hev_thr, bd);
+        } else {  // second row only
+          vpx_highbd_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
+                                     lfi1->lim, lfi1->hev_thr, bd);
+        }
+      }
+
+      if ((mask_8x8_0 | mask_8x8_1) & 1) {
+        if ((mask_8x8_0 & mask_8x8_1) & 1) {  // both rows flagged
+          vpx_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_8x8_0 & 1) {  // first row only
+          vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {  // second row only
+          vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+
+      if ((mask_4x4_0 | mask_4x4_1) & 1) {
+        if ((mask_4x4_0 & mask_4x4_1) & 1) {  // both rows flagged
+          vpx_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_4x4_0 & 1) {  // first row only
+          vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {  // second row only
+          vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+
+      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {  // internal edge at s + 4
+        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {  // both rows flagged
+          vpx_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+                                         lfi1->hev_thr, bd);
+        } else if (mask_4x4_int_0 & 1) {  // first row only
+          vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+                                    lfi0->hev_thr, 1, bd);
+        } else {  // second row only
+          vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
+                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+        }
+      }
+    }
+
+    s += 8;  // advance 8 samples to the next mask bit's position
+    lfl += 1;
+    mask_16x16_0 >>= 1;
+    mask_8x8_0 >>= 1;
+    mask_4x4_0 >>= 1;
+    mask_4x4_int_0 >>= 1;
+    mask_16x16_1 >>= 1;
+    mask_8x8_1 >>= 1;
+    mask_4x4_1 >>= 1;
+    mask_4x4_int_1 >>= 1;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static void filter_selectively_horiz(uint8_t *s, int pitch,
+                                     unsigned int mask_16x16,
+                                     unsigned int mask_8x8,
+                                     unsigned int mask_4x4,
+                                     unsigned int mask_4x4_int,
+                                     const loop_filter_info_n *lfi_n,
+                                     const uint8_t *lfl) {
+  unsigned int mask;
+  int count;  // mask bits (8-pixel steps) consumed by the current iteration
+  // Applies horizontal-edge filters along one row, one mask bit per 8 pixels.
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+       mask; mask >>= count) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+    count = 1;
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        if ((mask_16x16 & 3) == 3) {  // next position is flagged as well
+          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 2);  // last argument is 2 here
+          count = 2;
+        } else {
+          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 1);
+        }
+      } else if (mask_8x8 & 1) {
+        if ((mask_8x8 & 3) == 3) {  // pair this position with the next one
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vpx_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, lfin->mblim, lfin->lim,
+                                    lfin->hev_thr);
+          // Internal 4x4 edges sit 4 rows below (s + 4 * pitch).
+          if ((mask_4x4_int & 3) == 3) {
+            vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                      lfi->lim, lfi->hev_thr, lfin->mblim,
+                                      lfin->lim, lfin->hev_thr);
+          } else {  // at most one of the two internal bits is set here
+            if (mask_4x4_int & 1)
+              vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                                   lfi->hev_thr, 1);
+            else if (mask_4x4_int & 2)  // next position: 8 pixels to the right
+              vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                   lfin->lim, lfin->hev_thr, 1);
+          }
+          count = 2;
+        } else {
+          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+          if (mask_4x4_int & 1)
+            vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                                 lfi->hev_thr, 1);
+        }
+      } else if (mask_4x4 & 1) {
+        if ((mask_4x4 & 3) == 3) {  // pair this position with the next one
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vpx_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, lfin->mblim, lfin->lim,
+                                    lfin->hev_thr);
+          if ((mask_4x4_int & 3) == 3) {
+            vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                      lfi->lim, lfi->hev_thr, lfin->mblim,
+                                      lfin->lim, lfin->hev_thr);
+          } else {  // at most one of the two internal bits is set here
+            if (mask_4x4_int & 1)
+              vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                                   lfi->hev_thr, 1);
+            else if (mask_4x4_int & 2)  // next position: 8 pixels to the right
+              vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                   lfin->lim, lfin->hev_thr, 1);
+          }
+          count = 2;
+        } else {
+          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+          if (mask_4x4_int & 1)
+            vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                                 lfi->hev_thr, 1);
+        }
+      } else if (mask_4x4_int & 1) {  // only the internal 4x4 edge is flagged
+        vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                             lfi->hev_thr, 1);
+      }
+    }
+    s += 8 * count;  // skip every position handled in this iteration
+    lfl += count;
+    mask_16x16 >>= count;
+    mask_8x8 >>= count;
+    mask_4x4 >>= count;
+    mask_4x4_int >>= count;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
+                                            unsigned int mask_16x16,
+                                            unsigned int mask_8x8,
+                                            unsigned int mask_4x4,
+                                            unsigned int mask_4x4_int,
+                                            const loop_filter_info_n *lfi_n,
+                                            const uint8_t *lfl, int bd) {
+  unsigned int mask;
+  int count;  // mask bits (8-sample steps) consumed by the current iteration
+  // High-bitdepth horizontal-edge filtering, one mask bit per 8 samples.
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+       mask; mask >>= count) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+    count = 1;
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        if ((mask_16x16 & 3) == 3) {  // next position is flagged as well
+          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 2, bd);
+          count = 2;
+        } else {
+          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 1, bd);
+        }
+      } else if (mask_8x8 & 1) {
+        if ((mask_8x8 & 3) == 3) {  // pair this position with the next one
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vpx_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+                                           lfi->hev_thr, lfin->mblim, lfin->lim,
+                                           lfin->hev_thr, bd);
+          // Internal 4x4 edges sit 4 rows below (s + 4 * pitch).
+          if ((mask_4x4_int & 3) == 3) {
+            vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr,
+                                             lfin->mblim, lfin->lim,
+                                             lfin->hev_thr, bd);
+          } else {  // at most one of the two internal bits is set here
+            if (mask_4x4_int & 1) {
+              vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, 1, bd);
+            } else if (mask_4x4_int & 2) {  // next position: 8 samples right
+              vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                          lfin->lim, lfin->hev_thr, 1, bd);
+            }
+          }
+          count = 2;
+        } else {
+          vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1, bd);
+
+          if (mask_4x4_int & 1) {
+            vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                        lfi->lim, lfi->hev_thr, 1, bd);
+          }
+        }
+      } else if (mask_4x4 & 1) {
+        if ((mask_4x4 & 3) == 3) {  // pair this position with the next one
+          // Next block's thresholds.
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          vpx_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+                                           lfi->hev_thr, lfin->mblim, lfin->lim,
+                                           lfin->hev_thr, bd);
+          if ((mask_4x4_int & 3) == 3) {
+            vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr,
+                                             lfin->mblim, lfin->lim,
+                                             lfin->hev_thr, bd);
+          } else {  // at most one of the two internal bits is set here
+            if (mask_4x4_int & 1) {
+              vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, 1, bd);
+            } else if (mask_4x4_int & 2) {  // next position: 8 samples right
+              vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+                                          lfin->lim, lfin->hev_thr, 1, bd);
+            }
+          }
+          count = 2;
+        } else {
+          vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1, bd);
+
+          if (mask_4x4_int & 1) {
+            vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+                                        lfi->lim, lfi->hev_thr, 1, bd);
+          }
+        }
+      } else if (mask_4x4_int & 1) {  // only the internal 4x4 edge is flagged
+        vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, 1, bd);
+      }
+    }
+    s += 8 * count;  // skip every position handled in this iteration
+    lfl += count;
+    mask_16x16 >>= count;
+    mask_8x8 >>= count;
+    mask_4x4 >>= count;
+    mask_4x4_int >>= count;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// ORs into *lfm the bits that mark where loop filters are needed for the
+// given mi, and records its filter level in lfm->lfl_y. Inputs used: the
+// block size (mi->sb_type), the luma transform size (mi->tx_size), the
+// skip/inter status of the block, and its filter level. shift_y and
+// shift_uv position the generated bits within the luma and chroma masks.
+// Returns early, changing nothing, when the filter level is 0.
+// TODO(JBB) Need another function for different resolution color..
+static void build_masks(const loop_filter_info_n *const lfi_n,
+                        const MODE_INFO *mi, const int shift_y,
+                        const int shift_uv,
+                        LOOP_FILTER_MASK *lfm) {
+  const BLOCK_SIZE block_size = mi->sb_type;
+  const TX_SIZE tx_size_y = mi->tx_size;
+  const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1);
+  const int filter_level = get_filter_level(lfi_n, mi);
+  uint64_t *const left_y = &lfm->left_y[tx_size_y];
+  uint64_t *const above_y = &lfm->above_y[tx_size_y];
+  uint64_t *const int_4x4_y = &lfm->int_4x4_y;
+  uint16_t *const left_uv = &lfm->left_uv[tx_size_uv];
+  uint16_t *const above_uv = &lfm->above_uv[tx_size_uv];
+  uint16_t *const int_4x4_uv = &lfm->int_4x4_uv;
+  int i;
+
+  // If filter level is 0 we don't loop filter.
+  if (!filter_level) {
+    return;
+  } else {
+    const int w = num_8x8_blocks_wide_lookup[block_size];
+    const int h = num_8x8_blocks_high_lookup[block_size];
+    int index = shift_y;
+    for (i = 0; i < h; i++) {  // store the level for each covered lfl_y entry
+      memset(&lfm->lfl_y[index], filter_level, w);
+      index += 8;  // step to the next row of lfl_y (8 entries per row)
+    }
+  }
+
+  // These set 1 in the current block size for the block size edges.
+  // For instance if the block size is 32x16, we'll set:
+  //    above =   1111
+  //              0000
+  //    and
+  //    left  =   1000
+  //          =   1000
+  // NOTE : In this example the low bit is left most ( 1000 ) is stored as
+  //        1,  not 8...
+  //
+  // U and V set things on a 16 bit scale.
+  //
+  *above_y |= above_prediction_mask[block_size] << shift_y;
+  *above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
+  *left_y |= left_prediction_mask[block_size] << shift_y;
+  *left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
+
+  // If the block has no coefficients and is not intra we skip applying
+  // the loop filter on block edges.
+  if (mi->skip && is_inter_block(mi))
+    return;
+
+  // Here we are adding a mask for the transform size. The transform
+  // size mask is set to be correct for a 64x64 prediction block size. We
+  // mask to match the size of the block we are working on and then shift it
+  // into place..
+  *above_y |= (size_mask[block_size] &
+               above_64x64_txform_mask[tx_size_y]) << shift_y;
+  *above_uv |= (size_mask_uv[block_size] &
+                above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
+
+  *left_y |= (size_mask[block_size] &
+              left_64x64_txform_mask[tx_size_y]) << shift_y;
+  *left_uv |= (size_mask_uv[block_size] &
+               left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
+
+  // Here we are trying to determine what to do with the internal 4x4 block
+  // boundaries.  These differ from the 4x4 boundaries on the outside edge of
+  // an 8x8 in that the internal ones can be skipped and don't depend on
+  // the prediction block size.
+  if (tx_size_y == TX_4X4)
+    *int_4x4_y |= size_mask[block_size] << shift_y;
+
+  if (tx_size_uv == TX_4X4)
+    *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
+}
+
+// Same as build_masks(), except that only the y (luma) masks and lfl_y are
+// updated. It exists because for blocks < 16x16 in size, we only update the
+// u and v masks on the first block.
+static void build_y_mask(const loop_filter_info_n *const lfi_n,
+                         const MODE_INFO *mi, const int shift_y,
+                         LOOP_FILTER_MASK *lfm) {
+  const BLOCK_SIZE block_size = mi->sb_type;
+  const TX_SIZE tx_size_y = mi->tx_size;
+  const int filter_level = get_filter_level(lfi_n, mi);
+  uint64_t *const left_y = &lfm->left_y[tx_size_y];
+  uint64_t *const above_y = &lfm->above_y[tx_size_y];
+  uint64_t *const int_4x4_y = &lfm->int_4x4_y;
+  int i;
+
+  if (!filter_level) {  // level 0: nothing to filter, leave lfm untouched
+    return;
+  } else {
+    const int w = num_8x8_blocks_wide_lookup[block_size];
+    const int h = num_8x8_blocks_high_lookup[block_size];
+    int index = shift_y;
+    for (i = 0; i < h; i++) {  // store the level for each covered lfl_y entry
+      memset(&lfm->lfl_y[index], filter_level, w);
+      index += 8;  // step to the next row of lfl_y (8 entries per row)
+    }
+  }
+  // Prediction-block edges are always marked.
+  *above_y |= above_prediction_mask[block_size] << shift_y;
+  *left_y |= left_prediction_mask[block_size] << shift_y;
+  // Skipped inter blocks get no transform-edge or internal 4x4 bits.
+  if (mi->skip && is_inter_block(mi))
+    return;
+  // Transform-size edges, limited to this block's area and shifted in place.
+  *above_y |= (size_mask[block_size] &
+               above_64x64_txform_mask[tx_size_y]) << shift_y;
+
+  *left_y |= (size_mask[block_size] &
+              left_64x64_txform_mask[tx_size_y]) << shift_y;
+
+  if (tx_size_y == TX_4X4)
+    *int_4x4_y |= size_mask[block_size] << shift_y;
+}
+
+void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row,
+                     const int mi_col, LOOP_FILTER_MASK *lfm) {
+  int i;
+  // Post-processes the masks in *lfm in place (merge, border, frame-edge fixes).
+  // The largest loopfilter we have is 16x16 so we use the 16x16 mask
+  // for 32x32 transforms also.
+  lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
+  lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
+  lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
+  lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
+
+  // We do at least 8 tap filter on every 32x32 even if the transform size
+  // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and
+  // remove it from the 4x4.
+  lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
+  lfm->left_y[TX_4X4] &= ~left_border;
+  lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
+  lfm->above_y[TX_4X4] &= ~above_border;
+  lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
+  lfm->left_uv[TX_4X4] &= ~left_border_uv;
+  lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
+  lfm->above_uv[TX_4X4] &= ~above_border_uv;
+
+  // We do some special edge handling.
+  if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {  // region crosses bottom edge
+    const uint64_t rows = cm->mi_rows - mi_row;  // mi rows inside the frame
+
+    // Each pixel inside the border gets a 1.
+    const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);  // 8 bits/row
+    const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);
+
+    // Remove values completely outside our border.
+    for (i = 0; i < TX_32X32; i++) {  // stops before the TX_32X32 entry
+      lfm->left_y[i] &= mask_y;
+      lfm->above_y[i] &= mask_y;
+      lfm->left_uv[i] &= mask_uv;
+      lfm->above_uv[i] &= mask_uv;
+    }
+    lfm->int_4x4_y &= mask_y;
+    lfm->int_4x4_uv &= mask_uv;
+
+    // We don't apply a wide loop filter on the last uv block row. If set
+    // apply the shorter one instead.
+    if (rows == 1) {
+      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
+      lfm->above_uv[TX_16X16] = 0;
+    }
+    if (rows == 5) {  // only bits 8-15 (0xff00) are demoted to 8x8
+      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
+      lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
+    }
+  }
+
+  if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {  // region crosses right edge
+    const uint64_t columns = cm->mi_cols - mi_col;  // mi columns inside frame
+
+    // Each pixel inside the border gets a 1, the multiply copies the border
+    // to where we need it.
+    const uint64_t mask_y  = (((1 << columns) - 1)) * 0x0101010101010101ULL;
+    const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
+
+    // Internal edges are not applied on the last column of the image so
+    // we mask 1 more for the internal edges
+    const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
+
+    // Remove the bits outside the image edge.
+    for (i = 0; i < TX_32X32; i++) {  // stops before the TX_32X32 entry
+      lfm->left_y[i] &= mask_y;
+      lfm->above_y[i] &= mask_y;
+      lfm->left_uv[i] &= mask_uv;
+      lfm->above_uv[i] &= mask_uv;
+    }
+    lfm->int_4x4_y &= mask_y;
+    lfm->int_4x4_uv &= mask_uv_int;
+
+    // We don't apply a wide loop filter on the last uv column. If set
+    // apply the shorter one instead.
+    if (columns == 1) {
+      lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
+      lfm->left_uv[TX_16X16] = 0;
+    }
+    if (columns == 5) {  // 0xcccc: bits 2 and 3 of every 4-bit group
+      lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
+      lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
+    }
+  }
+  // We don't apply a loop filter on the first column in the image, mask that
+  // out.
+  if (mi_col == 0) {
+    for (i = 0; i < TX_32X32; i++) {
+      lfm->left_y[i] &= 0xfefefefefefefefeULL;  // clear bit 0 of every byte
+      lfm->left_uv[i] &= 0xeeee;  // clear bit 0 of every 4-bit group
+    }
+  }
+
+  // Assert if we try to apply 2 different loop filters at the same position.
+  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8]));
+  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4]));
+  assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4]));
+  assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16]));
+  assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8]));
+  assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
+  assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
+  assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16]));
+  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
+  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
+  assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
+  assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16]));
+  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
+  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
+  assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
+  assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
+}
+
+// Sets up the loop filter bit masks in *lfm for the entire 64x64 region at
+// (mi_row, mi_col): zeroes lfm, walks the partition tree, then adjusts edges.
+// TODO(JBB): This function only works for yv12.
+void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
+                    MODE_INFO **mi, const int mode_info_stride,
+                    LOOP_FILTER_MASK *lfm) {
+  int idx_32, idx_16, idx_8;
+  const loop_filter_info_n *const lfi_n = &cm->lf_info;
+  MODE_INFO **mip = mi;
+  MODE_INFO **mip2 = mi;
+
+  // These are offsets to the next mi in the 64x64 block. It is what gets
+  // added to the mi ptr as we go through each loop. It helps us to avoid
+  // setting up special row and column counters for each index. The last step
+  // brings us out back to the starting position.
+  const int offset_32[] = {4, (mode_info_stride << 2) - 4, 4,
+                           -(mode_info_stride << 2) - 4};
+  const int offset_16[] = {2, (mode_info_stride << 1) - 2, 2,
+                           -(mode_info_stride << 1) - 2};
+  const int offset[] = {1, mode_info_stride - 1, 1, -mode_info_stride - 1};
+
+  // Following variables represent shifts to position the current block
+  // mask over the appropriate block. A shift of 36 (4 rows of 8 bits plus
+  // 4 columns) moves the bits for the final 32 by 32 block in the 64x64
+  // to the appropriate spot.
+  const int shift_32_y[] = {0, 4, 32, 36};
+  const int shift_16_y[] = {0, 2, 16, 18};
+  const int shift_8_y[] = {0, 1, 8, 9};
+  const int shift_32_uv[] = {0, 2, 8, 10};
+  const int shift_16_uv[] = {0, 1, 4, 5};
+  const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ?
+                        cm->mi_rows - mi_row : MI_BLOCK_SIZE);
+  const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
+                        cm->mi_cols - mi_col : MI_BLOCK_SIZE);
+  // max_rows / max_cols: number of mi rows / columns of this region in frame.
+  vp9_zero(*lfm);
+  assert(mip[0] != NULL);
+
+  // TODO(jimbankoski): Try moving most of the following code into decode
+  // loop and storing lfm in the mbmi structure so that we don't have to go
+  // through the recursive loop structure multiple times.
+  switch (mip[0]->sb_type) {
+    case BLOCK_64X64:
+      build_masks(lfi_n, mip[0] , 0, 0, lfm);
+      break;
+    case BLOCK_64X32:
+      build_masks(lfi_n, mip[0], 0, 0, lfm);
+      mip2 = mip + mode_info_stride * 4;  // block 4 mi rows below
+      if (4 >= max_rows)  // lower half lies outside the frame
+        break;
+      build_masks(lfi_n, mip2[0], 32, 8, lfm);
+      break;
+    case BLOCK_32X64:
+      build_masks(lfi_n, mip[0], 0, 0, lfm);
+      mip2 = mip + 4;  // block 4 mi columns to the right
+      if (4 >= max_cols)  // right half lies outside the frame
+        break;
+      build_masks(lfi_n, mip2[0], 4, 2, lfm);
+      break;
+    default:
+      for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
+        const int shift_y = shift_32_y[idx_32];
+        const int shift_uv = shift_32_uv[idx_32];
+        const int mi_32_col_offset = ((idx_32 & 1) << 2);
+        const int mi_32_row_offset = ((idx_32 >> 1) << 2);
+        if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
+          continue;  // this 32x32 quadrant starts outside the frame
+        switch (mip[0]->sb_type) {
+          case BLOCK_32X32:
+            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+            break;
+          case BLOCK_32X16:
+            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+            if (mi_32_row_offset + 2 >= max_rows)
+              continue;  // next idx_32; the for-loop increment still runs
+            mip2 = mip + mode_info_stride * 2;
+            build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm);
+            break;
+          case BLOCK_16X32:
+            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+            if (mi_32_col_offset + 2 >= max_cols)
+              continue;  // next idx_32; the for-loop increment still runs
+            mip2 = mip + 2;
+            build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
+            break;
+          default:
+            for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
+              const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
+              const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
+              const int mi_16_col_offset = mi_32_col_offset +
+                  ((idx_16 & 1) << 1);
+              const int mi_16_row_offset = mi_32_row_offset +
+                  ((idx_16 >> 1) << 1);
+
+              if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows)
+                continue;  // this 16x16 quadrant starts outside the frame
+
+              switch (mip[0]->sb_type) {
+                case BLOCK_16X16:
+                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  break;
+                case BLOCK_16X8:
+                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  if (mi_16_row_offset + 1 >= max_rows)
+                    continue;  // next idx_16
+                  mip2 = mip + mode_info_stride;
+                  build_y_mask(lfi_n, mip2[0], shift_y+8, lfm);  // y only
+                  break;
+                case BLOCK_8X16:
+                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  if (mi_16_col_offset +1 >= max_cols)
+                    continue;  // next idx_16
+                  mip2 = mip + 1;
+                  build_y_mask(lfi_n, mip2[0], shift_y+1, lfm);  // y only
+                  break;
+                default: {
+                  const int shift_y = shift_32_y[idx_32] +
+                                      shift_16_y[idx_16] +
+                                      shift_8_y[0];
+                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  mip += offset[0];  // first 8x8 done (y and uv); move on
+                  for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
+                    const int shift_y = shift_32_y[idx_32] +
+                                        shift_16_y[idx_16] +
+                                        shift_8_y[idx_8];
+                    const int mi_8_col_offset = mi_16_col_offset +
+                        ((idx_8 & 1));
+                    const int mi_8_row_offset = mi_16_row_offset +
+                        ((idx_8 >> 1));
+
+                    if (mi_8_col_offset >= max_cols ||
+                        mi_8_row_offset >= max_rows)
+                      continue;
+                    build_y_mask(lfi_n, mip[0], shift_y, lfm);  // y only
+                  }
+                  break;
+                }
+              }
+            }
+            break;
+        }
+      }
+      break;
+  }
+
+  vp9_adjust_mask(cm, mi_row, mi_col, lfm);
+}
+
+static void filter_selectively_vert(uint8_t *s, int pitch,
+                                    unsigned int mask_16x16,
+                                    unsigned int mask_8x8,
+                                    unsigned int mask_4x4,
+                                    unsigned int mask_4x4_int,
+                                    const loop_filter_info_n *lfi_n,
+                                    const uint8_t *lfl) {
+  unsigned int mask;
+  // Applies vertical-edge filters along one row, one mask bit per 8 pixels.
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+       mask; mask >>= 1) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+    if (mask & 1) {  // first matching mask wins: 16x16, then 8x8, then 4x4
+      if (mask_16x16 & 1) {
+        vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+      } else if (mask_8x8 & 1) {
+        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+      } else if (mask_4x4 & 1) {
+        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+      }
+    }
+    if (mask_4x4_int & 1)  // internal edge at s + 4, checked independently
+      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+    s += 8;  // advance 8 pixels to the next mask bit's position
+    lfl += 1;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
+                                           unsigned int mask_16x16,
+                                           unsigned int mask_8x8,
+                                           unsigned int mask_4x4,
+                                           unsigned int mask_4x4_int,
+                                           const loop_filter_info_n *lfi_n,
+                                           const uint8_t *lfl, int bd) {
+  unsigned int mask;
+  // High-bitdepth vertical-edge filtering, one mask bit per 8 samples.
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+       mask; mask >>= 1) {
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+    if (mask & 1) {  // first matching mask wins: 16x16, then 8x8, then 4x4
+      if (mask_16x16 & 1) {
+        vpx_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim,
+                                   lfi->hev_thr, bd);
+      } else if (mask_8x8 & 1) {
+        vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
+                                  lfi->hev_thr, 1, bd);
+      } else if (mask_4x4 & 1) {
+        vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 1, bd);
+      }
+    }
+    if (mask_4x4_int & 1)  // internal edge at s + 4, checked independently
+      vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 1, bd);
+    s += 8;  // advance 8 samples to the next mask bit's position
+    lfl += 1;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+// Loop-filters one plane of one superblock, whatever its subsampling.
+void vp9_filter_block_plane_non420(VP9_COMMON *cm,
+                                   struct macroblockd_plane *plane,
+                                   MODE_INFO **mi_8x8,  // at (mi_row, mi_col)
+                                   int mi_row, int mi_col) {
+  const int ss_x = plane->subsampling_x;
+  const int ss_y = plane->subsampling_y;
+  const int row_step = 1 << ss_y;  // mi rows covered per loop step
+  const int col_step = 1 << ss_x;  // mi columns covered per mask bit
+  const int row_step_stride = cm->mi_stride * row_step;
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t* const dst0 = dst->buf;  // restored before the horizontal pass
+  unsigned int mask_16x16[MI_BLOCK_SIZE] = {0};  // horizontal-pass masks,
+  unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};    // indexed by mi row r,
+  unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};    // one bit per column
+  unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};  // interior 4x4 edges
+  uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE];  // filter levels, 8 per row
+  int r, c;
+
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+    unsigned int mask_16x16_c = 0;  // vertical-pass masks of this row
+    unsigned int mask_8x8_c = 0;
+    unsigned int mask_4x4_c = 0;
+    unsigned int border_mask;
+
+    // Determine the vertical edges that need filtering
+    for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
+      const MODE_INFO *mi = mi_8x8[c];
+      const BLOCK_SIZE sb_type = mi[0].sb_type;
+      const int skip_this = mi[0].skip && is_inter_block(mi);
+      // left edge of current unit is block/partition edge -> no skip
+      const int block_edge_left = (num_4x4_blocks_wide_lookup[sb_type] > 1) ?
+          !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) : 1;
+      const int skip_this_c = skip_this && !block_edge_left;
+      // top edge of current unit is block/partition edge -> no skip
+      const int block_edge_above = (num_4x4_blocks_high_lookup[sb_type] > 1) ?
+          !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1;
+      const int skip_this_r = skip_this && !block_edge_above;
+      const TX_SIZE tx_size = get_uv_tx_size(mi, plane);
+      const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
+      const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
+
+      // Filter level can vary per MI; level 0 leaves this MI out of the masks
+      if (!(lfl[(r << 3) + (c >> ss_x)] =
+            get_filter_level(&cm->lf_info, mi)))
+        continue;
+
+      // Build masks based on the transform size of each block
+      if (tx_size == TX_32X32) {
+        if (!skip_this_c && ((c >> ss_x) & 3) == 0) {  // every 4th column
+          if (!skip_border_4x4_c)
+            mask_16x16_c |= 1 << (c >> ss_x);
+          else  // last column of a subsampled plane: fall back to 8x8
+            mask_8x8_c |= 1 << (c >> ss_x);
+        }
+        if (!skip_this_r && ((r >> ss_y) & 3) == 0) {  // every 4th row
+          if (!skip_border_4x4_r)
+            mask_16x16[r] |= 1 << (c >> ss_x);
+          else  // last row of a subsampled plane: fall back to 8x8
+            mask_8x8[r] |= 1 << (c >> ss_x);
+        }
+      } else if (tx_size == TX_16X16) {
+        if (!skip_this_c && ((c >> ss_x) & 1) == 0) {  // every 2nd column
+          if (!skip_border_4x4_c)
+            mask_16x16_c |= 1 << (c >> ss_x);
+          else
+            mask_8x8_c |= 1 << (c >> ss_x);
+        }
+        if (!skip_this_r && ((r >> ss_y) & 1) == 0) {  // every 2nd row
+          if (!skip_border_4x4_r)
+            mask_16x16[r] |= 1 << (c >> ss_x);
+          else
+            mask_8x8[r] |= 1 << (c >> ss_x);
+        }
+      } else {
+        // force 8x8 filtering on 32x32 boundaries
+        if (!skip_this_c) {
+          if (tx_size == TX_8X8 || ((c >> ss_x) & 3) == 0)
+            mask_8x8_c |= 1 << (c >> ss_x);
+          else
+            mask_4x4_c |= 1 << (c >> ss_x);
+        }
+
+        if (!skip_this_r) {
+          if (tx_size == TX_8X8 || ((r >> ss_y) & 3) == 0)
+            mask_8x8[r] |= 1 << (c >> ss_x);
+          else
+            mask_4x4[r] |= 1 << (c >> ss_x);
+        }
+
+        if (!skip_this && tx_size < TX_8X8 && !skip_border_4x4_c)
+          mask_4x4_int[r] |= 1 << (c >> ss_x);
+      }
+    }
+
+    // Disable filtering on the leftmost column (bit 0) when mi_col == 0
+    border_mask = ~(mi_col == 0);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_vert(CONVERT_TO_SHORTPTR(dst->buf),
+                                     dst->stride,
+                                     mask_16x16_c & border_mask,
+                                     mask_8x8_c & border_mask,
+                                     mask_4x4_c & border_mask,
+                                     mask_4x4_int[r],
+                                     &cm->lf_info, &lfl[r << 3],
+                                     (int)cm->bit_depth);
+    } else {
+      filter_selectively_vert(dst->buf, dst->stride,
+                              mask_16x16_c & border_mask,
+                              mask_8x8_c & border_mask,
+                              mask_4x4_c & border_mask,
+                              mask_4x4_int[r],
+                              &cm->lf_info, &lfl[r << 3]);
+    }
+#else
+    filter_selectively_vert(dst->buf, dst->stride,
+                            mask_16x16_c & border_mask,
+                            mask_8x8_c & border_mask,
+                            mask_4x4_c & border_mask,
+                            mask_4x4_int[r],
+                            &cm->lf_info, &lfl[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    dst->buf += 8 * dst->stride;  // step the destination down 8 strides
+    mi_8x8 += row_step_stride;    // step the mode info down row_step rows
+  }
+
+  // Now do horizontal pass, reusing the per-row masks and levels from above
+  dst->buf = dst0;
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+    const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
+    const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
+
+    unsigned int mask_16x16_r;
+    unsigned int mask_8x8_r;
+    unsigned int mask_4x4_r;
+
+    if (mi_row + r == 0) {  // first row of the frame: no above-edge filtering
+      mask_16x16_r = 0;
+      mask_8x8_r = 0;
+      mask_4x4_r = 0;
+    } else {
+      mask_16x16_r = mask_16x16[r];
+      mask_8x8_r = mask_8x8[r];
+      mask_4x4_r = mask_4x4[r];
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                      dst->stride,
+                                      mask_16x16_r,
+                                      mask_8x8_r,
+                                      mask_4x4_r,
+                                      mask_4x4_int_r,
+                                      &cm->lf_info, &lfl[r << 3],
+                                      (int)cm->bit_depth);
+    } else {
+      filter_selectively_horiz(dst->buf, dst->stride,
+                               mask_16x16_r,
+                               mask_8x8_r,
+                               mask_4x4_r,
+                               mask_4x4_int_r,
+                               &cm->lf_info, &lfl[r << 3]);
+    }
+#else
+    filter_selectively_horiz(dst->buf, dst->stride,
+                             mask_16x16_r,
+                             mask_8x8_r,
+                             mask_4x4_r,
+                             mask_4x4_int_r,
+                             &cm->lf_info, &lfl[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    dst->buf += 8 * dst->stride;  // left advanced on return, not reset to dst0
+  }
+}
+
+// Loop-filters one plane that is not subsampled in either direction, using
+// the luma masks (left_y / above_y / int_4x4_y) and levels (lfl_y) of lfm.
+// All vertical edges are filtered first, then all horizontal edges.
+//   cm      frame-level state: row count, filter thresholds, bit depth.
+//   plane   plane to filter; plane->dst.buf is advanced and not restored.
+//   mi_row  mode-info row at which this block of rows starts.
+//   lfm     masks and filter levels to apply.
+void vp9_filter_block_plane_ss00(VP9_COMMON *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm) {
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t *const first_row = dst->buf;
+  uint64_t edge_16x16 = lfm->left_y[TX_16X16];
+  uint64_t edge_8x8 = lfm->left_y[TX_8X8];
+  uint64_t edge_4x4 = lfm->left_y[TX_4X4];
+  uint64_t inner_4x4 = lfm->int_4x4_y;
+  int row;
+
+  assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
+
+  // Vertical pass.  Every call consumes two mask rows: the low 16 bits of
+  // each mask and 16 strides of the destination.
+  for (row = 0; row < MI_BLOCK_SIZE && mi_row + row < cm->mi_rows; row += 2) {
+    const unsigned int pair_16x16 = edge_16x16 & 0xffff;
+    const unsigned int pair_8x8 = edge_8x8 & 0xffff;
+    const unsigned int pair_4x4 = edge_4x4 & 0xffff;
+    const unsigned int pair_inner = inner_4x4 & 0xffff;
+    uint8_t *const levels = &lfm->lfl_y[row << 3];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_vert_row2(
+          plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+          pair_16x16, pair_8x8, pair_4x4, pair_inner, &cm->lf_info, levels,
+          (int)cm->bit_depth);
+    } else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    {
+      filter_selectively_vert_row2(
+          plane->subsampling_x, dst->buf, dst->stride, pair_16x16, pair_8x8,
+          pair_4x4, pair_inner, &cm->lf_info, levels);
+    }
+
+    // Move on to the next pair of mask rows.
+    dst->buf += 16 * dst->stride;
+    edge_16x16 >>= 16;
+    edge_8x8 >>= 16;
+    edge_4x4 >>= 16;
+    inner_4x4 >>= 16;
+  }
+
+  // Horizontal pass: rewind the destination and switch to the above-edge
+  // masks.  One mask row (8 bits, 8 strides) per iteration.
+  dst->buf = first_row;
+  edge_16x16 = lfm->above_y[TX_16X16];
+  edge_8x8 = lfm->above_y[TX_8X8];
+  edge_4x4 = lfm->above_y[TX_4X4];
+  inner_4x4 = lfm->int_4x4_y;
+
+  for (row = 0; row < MI_BLOCK_SIZE && mi_row + row < cm->mi_rows; row++) {
+    // On mode-info row 0 the above-edge masks are cleared, so only the
+    // interior 4x4 edges are passed on.
+    const int frame_top = (mi_row + row == 0);
+    const unsigned int row_16x16 = frame_top ? 0 : (edge_16x16 & 0xff);
+    const unsigned int row_8x8 = frame_top ? 0 : (edge_8x8 & 0xff);
+    const unsigned int row_4x4 = frame_top ? 0 : (edge_4x4 & 0xff);
+    uint8_t *const levels = &lfm->lfl_y[row << 3];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_horiz(
+          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, row_16x16, row_8x8,
+          row_4x4, inner_4x4 & 0xff, &cm->lf_info, levels,
+          (int)cm->bit_depth);
+    } else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    {
+      filter_selectively_horiz(dst->buf, dst->stride, row_16x16, row_8x8,
+                               row_4x4, inner_4x4 & 0xff, &cm->lf_info,
+                               levels);
+    }
+
+    // Move on to the next mask row.
+    dst->buf += 8 * dst->stride;
+    edge_16x16 >>= 8;
+    edge_8x8 >>= 8;
+    edge_4x4 >>= 8;
+    inner_4x4 >>= 8;
+  }
+}
+// Filters a plane subsampled by 2 in both directions, using the uv masks.
+void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm) {
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t *const dst0 = dst->buf;  // restored before the horizontal pass
+  int r, c;
+  uint8_t lfl_uv[16];  // filter levels, 4 per chroma row; filled lazily below
+
+  uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
+  uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
+  uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
+  uint16_t mask_4x4_int = lfm->int_4x4_uv;
+
+  assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+
+  // Vertical pass: do 2 chroma rows (4 mi rows, 8 mask bits) at one time
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
+    for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {  // subsample luma levels
+      lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
+      lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
+    }
+
+    {
+      unsigned int mask_16x16_l = mask_16x16 & 0xff;
+      unsigned int mask_8x8_l = mask_8x8 & 0xff;
+      unsigned int mask_4x4_l = mask_4x4 & 0xff;
+      unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
+
+// NOTE(review): leftmost column is not masked here; see vp9_adjust_mask()?
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        highbd_filter_selectively_vert_row2(
+            plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+            &lfl_uv[r << 1], (int)cm->bit_depth);
+      } else {
+        filter_selectively_vert_row2(
+            plane->subsampling_x, dst->buf, dst->stride,
+            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+            &lfl_uv[r << 1]);
+      }
+#else
+      filter_selectively_vert_row2(
+          plane->subsampling_x, dst->buf, dst->stride,
+          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+          &lfl_uv[r << 1]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      dst->buf += 16 * dst->stride;  // two chroma rows down
+      mask_16x16 >>= 8;
+      mask_8x8 >>= 8;
+      mask_4x4 >>= 8;
+      mask_4x4_int >>= 8;
+    }
+  }
+
+  // Horizontal pass: one chroma row (2 mi rows, 4 mask bits) per iteration
+  dst->buf = dst0;
+  mask_16x16 = lfm->above_uv[TX_16X16];
+  mask_8x8 = lfm->above_uv[TX_8X8];
+  mask_4x4 = lfm->above_uv[TX_4X4];
+  mask_4x4_int = lfm->int_4x4_uv;
+
+  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+    const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
+    const unsigned int mask_4x4_int_r =
+        skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf);
+    unsigned int mask_16x16_r;
+    unsigned int mask_8x8_r;
+    unsigned int mask_4x4_r;
+
+    if (mi_row + r == 0) {  // first row of the frame: no above-edge filtering
+      mask_16x16_r = 0;
+      mask_8x8_r = 0;
+      mask_4x4_r = 0;
+    } else {
+      mask_16x16_r = mask_16x16 & 0xf;
+      mask_8x8_r = mask_8x8 & 0xf;
+      mask_4x4_r = mask_4x4 & 0xf;
+    }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                      dst->stride, mask_16x16_r, mask_8x8_r,
+                                      mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+                                      &lfl_uv[r << 1], (int)cm->bit_depth);
+    } else {
+      filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+                               mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+                               &lfl_uv[r << 1]);
+    }
+#else
+    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+                             mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+                             &lfl_uv[r << 1]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    dst->buf += 8 * dst->stride;  // left advanced on return, not reset to dst0
+    mask_16x16 >>= 4;
+    mask_8x8 >>= 4;
+    mask_4x4 >>= 4;
+    mask_4x4_int >>= 4;
+  }
+}
+
+// Loop-filters mode-info rows [start, stop) of frame_buffer, one band of
+// MI_BLOCK_SIZE rows at a time: plane 0 always, the others unless y_only.
+static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, VP9_COMMON *cm,
+                             struct macroblockd_plane planes[MAX_MB_PLANE],
+                             int start, int stop, int y_only) {
+  const int plane_count = y_only ? 1 : MAX_MB_PLANE;
+  enum lf_path path = LF_PATH_SLOW;
+  int mi_row;
+
+  // The chroma layout selects the routine used for planes 1 and up.  Plane 1
+  // is only inspected when chroma is filtered at all.
+  if (y_only ||
+      (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)) {
+    path = LF_PATH_444;
+  } else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1) {
+    path = LF_PATH_420;
+  }
+
+  for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
+    MODE_INFO **const mi_line = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0);
+    int mi_col;
+
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) {
+      int p;
+
+      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+      // TODO(JBB): Make setup_mask work for non 420.
+      vp9_adjust_mask(cm, mi_row, mi_col, lfm);
+
+      // Plane 0 always takes the unsubsampled path.
+      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm);
+      for (p = 1; p < plane_count; ++p) {
+        if (path == LF_PATH_420) {
+          vp9_filter_block_plane_ss11(cm, &planes[p], mi_row, lfm);
+        } else if (path == LF_PATH_444) {
+          vp9_filter_block_plane_ss00(cm, &planes[p], mi_row, lfm);
+        } else {
+          vp9_filter_block_plane_non420(cm, &planes[p], mi_line + mi_col,
+                                        mi_row, mi_col);
+        }
+      }
+    }
+  }
+}
+
+// Loop-filters the frame, or with partial_frame (and more than 8 mi rows)
+// only a band beginning at the 8-aligned middle row.  No-op at level 0.
+void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
+                           VP9_COMMON *cm, MACROBLOCKD *xd,
+                           int frame_filter_level,
+                           int y_only, int partial_frame) {
+  int first = 0;
+  int count;
+  if (!frame_filter_level) return;
+  count = cm->mi_rows;
+  if (partial_frame && cm->mi_rows > 8) {
+    first = (cm->mi_rows >> 1) & 0xfffffff8;
+    count = VPXMAX(cm->mi_rows / 8, 8);
+  }
+  loop_filter_rows(frame, cm, xd->plane, first, first + count, y_only);
+}
+
+// Used by the encoder to build the loopfilter masks.  The row range is
+// computed exactly as in vp9_loop_filter_frame().
+void vp9_build_mask_frame(VP9_COMMON *cm, int frame_filter_level,
+                          int partial_frame) {
+  int first = 0;
+  int count, last, mi_row;
+  if (!frame_filter_level) return;
+  count = cm->mi_rows;
+  if (partial_frame && cm->mi_rows > 8) {
+    first = (cm->mi_rows >> 1) & 0xfffffff8;
+    count = VPXMAX(cm->mi_rows / 8, 8);
+  }
+  last = first + count;  // one past the final mi row to process
+
+  vp9_loop_filter_frame_init(cm, frame_filter_level);
+
+  for (mi_row = first; mi_row < last; mi_row += MI_BLOCK_SIZE) {
+    MODE_INFO **const mi_line = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    int mi_col;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+      // vp9_setup_mask() zeros lfm
+      LOOP_FILTER_MASK *const lfm = get_lfm(&cm->lf, mi_row, mi_col);
+      vp9_setup_mask(cm, mi_row, mi_col, mi_line + mi_col, cm->mi_stride, lfm);
+    }
+  }
+}
+
+// Indexed [row][col] by 8x8-block position in a superblock.  A "1" marks the
+// first block (even row, even column) of each 16x16 or greater area.
+static const uint8_t first_block_in_16x16[8][8] = {
+  {1, 0, 1, 0, 1, 0, 1, 0},
+  {0, 0, 0, 0, 0, 0, 0, 0},
+  {1, 0, 1, 0, 1, 0, 1, 0},
+  {0, 0, 0, 0, 0, 0, 0, 0},
+  {1, 0, 1, 0, 1, 0, 1, 0},
+  {0, 0, 0, 0, 0, 0, 0, 0},
+  {1, 0, 1, 0, 1, 0, 1, 0},
+  {0, 0, 0, 0, 0, 0, 0, 0}
+};
+
+// Adds the mask bits and filter levels of the block mi to the LOOP_FILTER_MASK
+// of the 64x64 region holding (mi_row, mi_col); bw x bh lfl_y entries are set.
+// TODO(SJL): This function only works for yv12.
+void vp9_build_mask(VP9_COMMON *cm, const MODE_INFO *mi, int mi_row,
+                    int mi_col, int bw, int bh) {
+  const BLOCK_SIZE block_size = mi->sb_type;
+  const TX_SIZE tx_size_y = mi->tx_size;
+  const loop_filter_info_n *const lfi_n = &cm->lf_info;
+  const int filter_level = get_filter_level(lfi_n, mi);
+  const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1);
+  LOOP_FILTER_MASK *const lfm = get_lfm(&cm->lf, mi_row, mi_col);
+  uint64_t *const left_y = &lfm->left_y[tx_size_y];
+  uint64_t *const above_y = &lfm->above_y[tx_size_y];
+  uint64_t *const int_4x4_y = &lfm->int_4x4_y;
+  uint16_t *const left_uv = &lfm->left_uv[tx_size_uv];
+  uint16_t *const above_uv = &lfm->above_uv[tx_size_uv];
+  uint16_t *const int_4x4_uv = &lfm->int_4x4_uv;
+  const int row_in_sb = (mi_row & 7);  // position inside the superblock
+  const int col_in_sb = (mi_col & 7);
+  const int shift_y = col_in_sb + (row_in_sb << 3);  // bit in the y masks
+  const int shift_uv = (col_in_sb >> 1) + ((row_in_sb >> 1) << 2);  // uv bit
+  const int build_uv = first_block_in_16x16[row_in_sb][col_in_sb];
+
+  if (!filter_level) {  // level 0: nothing is recorded for this block
+    return;
+  } else {
+    int index = shift_y;
+    int i;
+    for (i = 0; i < bh; i++) {
+      memset(&lfm->lfl_y[index], filter_level, bw);  // bw entries per row
+      index += 8;  // next row of lfl_y
+    }
+  }
+
+  // These set 1 bits along the prediction-block edges for the block size.
+  // For instance if the block size is 32x16, we'll set:
+  //    above =   1111
+  //              0000
+  //    and
+  //    left  =   1000
+  //              1000
+  // NOTE: In this example the low bit is leftmost, so ( 1000 ) is stored as
+  //       1, not 8.
+  //
+  // U and V set things on a 16 bit scale.
+  //
+  *above_y |= above_prediction_mask[block_size] << shift_y;
+  *left_y |= left_prediction_mask[block_size] << shift_y;
+
+  if (build_uv) {
+    *above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
+    *left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
+  }
+
+  // If the block has no coefficients and is not intra we skip applying
+  // the loop filter on the transform edges inside the block.
+  if (mi->skip && is_inter_block(mi))
+    return;
+
+  // Add a mask for the transform size. The transform size mask is set to
+  // be correct for a 64x64 prediction block size. Mask to match the size of
+  // the block we are working on and then shift it into place.
+  *above_y |= (size_mask[block_size] &
+               above_64x64_txform_mask[tx_size_y]) << shift_y;
+  *left_y |= (size_mask[block_size] &
+              left_64x64_txform_mask[tx_size_y]) << shift_y;
+
+  if (build_uv) {
+    *above_uv |= (size_mask_uv[block_size] &
+                  above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
+
+    *left_uv |= (size_mask_uv[block_size] &
+                 left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
+  }
+
+  // Try to determine what to do with the internal 4x4 block boundaries.  These
+  // differ from the 4x4 boundaries on the outside edge of an 8x8 in that the
+  // internal ones can be skipped and don't depend on the prediction block size.
+  if (tx_size_y == TX_4X4)
+    *int_4x4_y |= size_mask[block_size] << shift_y;
+
+  if (build_uv && tx_size_uv == TX_4X4)
+    *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
+}
+
+// Rebinds lf_data to frame_buffer and cm, snapshots planes and clears the
+// start/stop/y_only fields.
+void vp9_loop_filter_data_reset(
+    LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+    struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) {
+  memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
+  lf_data->frame_buffer = frame_buffer;
+  lf_data->cm = cm;
+  lf_data->start = lf_data->stop = lf_data->y_only = 0;
+}
+
+// Zeroes the LOOP_FILTER_MASK array; skipped when filter_level is 0.
+void vp9_reset_lfm(VP9_COMMON *const cm) {
+  struct loopfilter *const lf = &cm->lf;
+  if (!lf->filter_level) return;
+  memset(lf->lfm, 0, ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) *
+                         lf->lfm_stride * sizeof(*lf->lfm));
+}
+// Filters the rows described by lf_data via loop_filter_rows(); returns 1.
+int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
+  (void)unused;  // the second argument is ignored
+  loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                   lf_data->start, lf_data->stop, lf_data->y_only);
+  return 1;
+}
diff --git a/libs/libvpx/vp9/common/vp9_loopfilter.h b/libs/libvpx/vp9/common/vp9_loopfilter.h
new file mode 100644
index 0000000000..fca8830fa1
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_loopfilter.h
@@ -0,0 +1,166 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_LOOPFILTER_H_
+#define VP9_COMMON_VP9_LOOPFILTER_H_
+
+#include "vpx_ports/mem.h"
+#include "./vpx_config.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LOOP_FILTER 63  // lfthr[] holds MAX_LOOP_FILTER + 1 entries
+#define MAX_SHARPNESS 7  // presumably the largest sharpness_level -- confirm
+
+#define SIMD_WIDTH 16  // alignment and length of each threshold array
+
+#define MAX_REF_LF_DELTAS       4  // length of ref_deltas[]
+#define MAX_MODE_LF_DELTAS      2  // length of mode_deltas[]
+
+enum lf_path {  // routine loop_filter_rows() uses for planes 1 and up
+  LF_PATH_420,   // subsampled by 2 both ways: vp9_filter_block_plane_ss11()
+  LF_PATH_444,   // not subsampled, or y_only: vp9_filter_block_plane_ss00()
+  LF_PATH_SLOW,  // anything else: vp9_filter_block_plane_non420()
+};
+
+// Loop-filter thresholds (mblim, lim, hev_thr) for one filter level.  Aligned
+// so that it can be loaded into vector registers when declared and passed.
+typedef struct {
+  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
+} loop_filter_thresh;
+// Loop-filter lookup tables (held in VP9_COMMON as lf_info).
+typedef struct {
+  loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];  // indexed by filter level
+  uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
+} loop_filter_info_n;
+
+// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
+// Each 1 bit represents a position in which we want to apply the loop filter.
+// Left_ entries refer to whether we apply a filter on the border to the
+// left of the block.   Above_ entries refer to whether or not to apply a
+// filter on the above border.   Int_ entries refer to whether or not to
+// apply a filter on the 4x4 edges within the 8x8 block that each bit
+// represents.
+// Since each transform is accompanied by a potentially different type of
+// loop filter there is a different entry in the array for each transform size.
+typedef struct {
+  uint64_t left_y[TX_SIZES];    // bit = col + 8 * row
+  uint64_t above_y[TX_SIZES];
+  uint64_t int_4x4_y;
+  uint16_t left_uv[TX_SIZES];   // bit = (col >> 1) + 4 * (row >> 1)
+  uint16_t above_uv[TX_SIZES];
+  uint16_t int_4x4_uv;
+  uint8_t lfl_y[64];            // filter level per 8x8 block, col + 8 * row
+} LOOP_FILTER_MASK;
+// Loop filter parameters plus the array of per-superblock masks (lfm).
+struct loopfilter {
+  int filter_level;  // vp9_reset_lfm() does nothing while this is 0
+  int last_filt_level;
+
+  int sharpness_level;
+  int last_sharpness_level;
+
+  uint8_t mode_ref_delta_enabled;
+  uint8_t mode_ref_delta_update;
+
+  // 0 = Intra, Last, GF, ARF
+  signed char ref_deltas[MAX_REF_LF_DELTAS];
+  signed char last_ref_deltas[MAX_REF_LF_DELTAS];
+
+  // 0 = ZERO_MV, MV
+  signed char mode_deltas[MAX_MODE_LF_DELTAS];
+  signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
+
+  LOOP_FILTER_MASK *lfm;  // one entry per superblock; index via get_lfm()
+  int lfm_stride;         // entries per superblock row of lfm
+};
+
+/* assorted loopfilter functions which get used elsewhere */
+struct VP9Common;
+struct macroblockd;
+struct VP9LfSyncData;
+
+// This function sets up the bit masks for the entire 64x64 region represented
+// by mi_row, mi_col.
+void vp9_setup_mask(struct VP9Common *const cm,
+                    const int mi_row, const int mi_col,
+                    MODE_INFO **mi_8x8, const int mode_info_stride,
+                    LOOP_FILTER_MASK *lfm);
+// Filters a plane without subsampling, using the y masks and levels of lfm.
+void vp9_filter_block_plane_ss00(struct VP9Common *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm);
+// Filters a plane subsampled by 2 in both directions, using the uv masks.
+void vp9_filter_block_plane_ss11(struct VP9Common *const cm,
+                                 struct macroblockd_plane *const plane,
+                                 int mi_row,
+                                 LOOP_FILTER_MASK *lfm);
+// Filters a plane with any subsampling; builds its own masks from mi_8x8.
+void vp9_filter_block_plane_non420(struct VP9Common *cm,
+                                   struct macroblockd_plane *plane,
+                                   MODE_INFO **mi_8x8,
+                                   int mi_row, int mi_col);
+
+void vp9_loop_filter_init(struct VP9Common *cm);
+
+// Update the loop filter for the current frame.
+// This should be called before vp9_loop_filter_frame(); vp9_build_mask_frame()
+// calls this function directly.
+void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl);
+// Filters the frame (a band of rows if partial_frame); no-op at level 0.
+void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
+                           struct VP9Common *cm,
+                           struct macroblockd *mbd,
+                           int filter_level,
+                           int y_only, int partial_frame);
+
+// Returns the mask entry of the superblock that contains (mi_row, mi_col).
+static INLINE LOOP_FILTER_MASK *get_lfm(const struct loopfilter *lf,
+                                        const int mi_row, const int mi_col) {
+  return lf->lfm + ((mi_row >> 3) * lf->lfm_stride + (mi_col >> 3));
+}
+// Mask construction helpers; vp9_build_mask_frame() is used by the encoder.
+void vp9_build_mask(struct VP9Common *cm, const MODE_INFO *mi, int mi_row,
+                    int mi_col, int bw, int bh);  // one block's mask bits
+void vp9_adjust_mask(struct VP9Common *const cm, const int mi_row,
+                     const int mi_col, LOOP_FILTER_MASK *lfm);
+void vp9_build_mask_frame(struct VP9Common *cm, int frame_filter_level,
+                          int partial_frame);
+void vp9_reset_lfm(struct VP9Common *const cm);  // zeroes cm->lf.lfm
+// Arguments consumed by one vp9_loop_filter_worker() call.
+typedef struct LoopFilterWorkerData {
+  YV12_BUFFER_CONFIG *frame_buffer;
+  struct VP9Common *cm;
+  struct macroblockd_plane planes[MAX_MB_PLANE];  // a copy, not a pointer
+
+  int start;   // first mi row to filter
+  int stop;    // filtering stops before this mi row
+  int y_only;  // non-zero: filter plane 0 only
+} LFWorkerData;
+// Stores frame_buffer and cm, copies planes, zeroes start/stop/y_only.
+void vp9_loop_filter_data_reset(
+    LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+    struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);
+
+// Operates on the rows described by 'lf_data'; 'unused' is ignored.
+int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_LOOPFILTER_H_
diff --git a/libs/libvpx/vp9/common/vp9_mfqe.c b/libs/libvpx/vp9/common/vp9_mfqe.c
new file mode 100644
index 0000000000..f5264665bd
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_mfqe.c
@@ -0,0 +1,394 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_postproc.h"
+
+// Blends src into dst in place over a block_size x block_size area:
+//   dst = (src * src_weight + dst * (one - src_weight) + one / 2) >> P,
+// with P = MFQE_PRECISION, one = 1 << P. TODO(jackychen): share vp8's SSE2.
+static void filter_by_weight(const uint8_t *src, int src_stride,
+                             uint8_t *dst, int dst_stride,
+                             int block_size, int src_weight) {
+  const int half = 1 << (MFQE_PRECISION - 1);
+  const int keep = (1 << MFQE_PRECISION) - src_weight;  // Share left to dst.
+  int rows_left = block_size;
+  while (rows_left-- > 0) {
+    int col;
+    for (col = 0; col < block_size; ++col) {
+      const int blend = src[col] * src_weight + dst[col] * keep + half;
+      dst[col] = blend >> MFQE_PRECISION;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride, int src_weight) {
+  filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);  // 8x8
+}
+
+// C implementation of the 16x16 weighted blend; see filter_by_weight().
+void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride, int src_weight) {
+  filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
+// Blends a 32x32 area as four 16x16 tiles via vp9_filter_by_weight16x16().
+static void filter_by_weight32x32(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int weight) {
+  int tile;
+  for (tile = 0; tile < 4; ++tile) {  // Bit 0: right half; bit 1: lower half.
+    const int dx = (tile & 1) * 16, dy = (tile >> 1) * 16;
+    vp9_filter_by_weight16x16(src + dy * src_stride + dx, src_stride,
+                              dst + dy * dst_stride + dx, dst_stride, weight);
+  }
+}
+
+// Blends a 64x64 area as four 32x32 tiles via filter_by_weight32x32().
+static void filter_by_weight64x64(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int weight) {
+  int tile;
+  for (tile = 0; tile < 4; ++tile) {  // Bit 0: right half; bit 1: lower half.
+    const int dx = (tile & 1) * 32, dy = (tile >> 1) * 32;
+    filter_by_weight32x32(src + dy * src_stride + dx, src_stride,
+                          dst + dy * dst_stride + dx, dst_stride, weight);
+  }
+}
+
+// Blends the y block and both half-size u/v blocks using a single weight.
+static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
+                          int yd_stride, const uint8_t *u, const uint8_t *v,
+                          int uv_stride, uint8_t *ud, uint8_t *vd,
+                          int uvd_stride, BLOCK_SIZE block_size, int weight) {
+  if (block_size == BLOCK_64X64) {
+    filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
+    filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
+    filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
+  } else if (block_size == BLOCK_32X32) {
+    filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
+    vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
+    vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
+  } else if (block_size == BLOCK_16X16) {
+    vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
+    vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
+    vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
+  }  // Any other block size: nothing is blended.
+}
+
+// TODO(jackychen): Determine whether to replace it with assembly code.
+// Copies 8 rows of 8 bytes; src and dst rows are src_stride/dst_stride apart.
+static void copy_mem8x8(const uint8_t *src, int src_stride,
+                        uint8_t *dst, int dst_stride) {
+  int row = 0;
+  while (row < 8) {
+    memcpy(dst + row * dst_stride, src + row * src_stride, 8);
+    ++row;
+  }
+}
+
+// Copies 16 rows of 16 bytes each; src rows are src_stride bytes apart and
+// dst rows are dst_stride bytes apart.
+static void copy_mem16x16(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  int row;
+  for (row = 0; row < 16; ++row) {
+    memcpy(dst + row * dst_stride, src + row * src_stride, 16);
+  }
+}
+
+static void copy_mem32x32(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  int tile;
+  for (tile = 0; tile < 4; ++tile) {  // Bit 0: right half; bit 1: lower half.
+    const int dx = (tile & 1) * 16, dy = (tile >> 1) * 16;
+    copy_mem16x16(src + dy * src_stride + dx, src_stride,
+                  dst + dy * dst_stride + dx, dst_stride);
+  }
+}
+
+static void copy_mem64x64(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  // Copy 64 rows of 64 bytes. Each buffer is stepped by its own stride; the
+  // lower half of dst must not be offset by src_stride when strides differ.
+  int row;
+  for (row = 0; row < 64; ++row) {
+    memcpy(dst + row * dst_stride, src + row * src_stride, 64);
+  }
+}
+
+// Copies the y block and its two half-size u/v blocks; 64x64 is the fallback.
+static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                       int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
+                       uint8_t *vd, int yd_stride, int uvd_stride,
+                       BLOCK_SIZE bs) {
+  void (*copy_y)(const uint8_t *, int, uint8_t *, int) = copy_mem64x64;
+  void (*copy_uv)(const uint8_t *, int, uint8_t *, int) = copy_mem32x32;
+  if (bs == BLOCK_16X16) {
+    copy_y = copy_mem16x16;
+    copy_uv = copy_mem8x8;
+  } else if (bs == BLOCK_32X32) {
+    copy_y = copy_mem32x32;
+    copy_uv = copy_mem16x16;
+  }
+  copy_y(y, y_stride, yd, yd_stride);
+  copy_uv(u, uv_stride, ud, uvd_stride);
+  copy_uv(v, uv_stride, vd, uvd_stride);
+}
+
+// Computes the thresholds used by mfqe_block() for one block size.
+//   *sad_thr:   7, 6 or 5 for 16x16, 32x32 or any other size (64x64), plus
+//               qdiff >> MFQE_PRECISION.
+//   *vdiff_thr: 125 + qdiff.
+static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
+  const int adj = qdiff >> MFQE_PRECISION;
+  const int base = (bs == BLOCK_16X16) ? 7 : (bs == BLOCK_32X32) ? 6 : 5;
+
+  *sad_thr = base + adj;
+  *vdiff_thr = 125 + qdiff;
+}
+
+// Blends one square block of the current frame into the destination with a
+// weight derived from SAD and variance, or plainly copies it.
+static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
+                       const uint8_t *v, int y_stride, int uv_stride,
+                       uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
+                       int uvd_stride, int qdiff) {
+  const int full_weight = 1 << MFQE_PRECISION;
+  int sad, sad_thr, vdiff, vdiff_thr, ifactor;
+  uint32_t sse;
+
+  get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
+  // Scale both metrics down by the block area: the shift is log2 of the
+  // pixel count and the added constant is half of it (round to nearest).
+  if (bs == BLOCK_16X16) {
+    vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+    sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+  } else if (bs == BLOCK_32X32) {
+    vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+    sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
+  } else {  // BLOCK_64X64
+    vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+    sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
+  }
+
+  // A tiny SAD, or a variance that is small relative to the SAD (possibly a
+  // lighting change in a smooth area): keep the current frame's block as is.
+  if (sad <= 1 || vdiff <= sad * 3) {
+    copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+               yd_stride, uvd_stride, bs);
+    return;
+  }
+
+  // A factor equal to full_weight means no MFQE is done.
+  ifactor = full_weight * sad * vdiff / (sad_thr * vdiff_thr);
+  if (ifactor > full_weight) ifactor = full_weight;
+  apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
+                uvd_stride, bs, ifactor);
+}
+
+// Returns nonzero when MFQE may be applied to a block: it is not intra
+// coded, it is at least 16x16, and its first motion vector is short. The
+// caller passes the current frame's mode info, or the last frame's for
+// intra-only frames.
+static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
+  const int mv_threshold = 100;  // Upper bound on the squared mv length.
+  const int row = mi->mv[0].as_mv.row;
+  const int col = mi->mv[0].as_mv.col;
+  if (mi->mode < NEARESTMV) return 0;  // Intra block.
+  if (cur_bs < BLOCK_16X16) return 0;
+  return row * row + col * col <= mv_threshold;
+}
+
+// Process each partition in a super block, recursively.
+static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
+                           const uint8_t *y, const uint8_t *u,
+                           const uint8_t *v, int y_stride, int uv_stride,
+                           uint8_t *yd, uint8_t *ud, uint8_t *vd,
+                           int yd_stride, int uvd_stride) {
+  int mi_offset, y_offset, uv_offset;
+  const BLOCK_SIZE cur_bs = mi->sb_type;
+  const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
+  const int bsl = b_width_log2_lookup[bs];
+  PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
+  const BLOCK_SIZE subsize = get_subsize(bs, partition);
+
+  if (cur_bs < BLOCK_8X8) {
+    // If there are blocks smaller than 8x8, it must be on the boundary.
+    return;
+  }
+  // No MFQE on blocks smaller than 16x16
+  if (bs == BLOCK_16X16) {
+    partition = PARTITION_NONE;
+  }
+  if (bs == BLOCK_64X64) {  // Steps that reach the second half of the block.
+    mi_offset = 4;
+    y_offset = 32;
+    uv_offset = 16;
+  } else {
+    mi_offset = 2;
+    y_offset = 16;
+    uv_offset = 8;
+  }
+  switch (partition) {
+    BLOCK_SIZE mfqe_bs, bs_tmp;
+    case PARTITION_HORZ:
+      if (bs == BLOCK_64X64) {
+        mfqe_bs = BLOCK_64X32;
+        bs_tmp = BLOCK_32X32;
+      } else {
+        mfqe_bs = BLOCK_32X16;
+        bs_tmp = BLOCK_16X16;
+      }
+      if (mfqe_decision(mi, mfqe_bs)) {
+        // Top half: do mfqe on the left square.
+        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
+        // Top half: do mfqe on the right square.
+        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
+      }
+      if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
+        // Bottom half: do mfqe on the left square.
+        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+                   v + uv_offset * uv_stride, y_stride, uv_stride,
+                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
+        // Bottom half: do mfqe on the right square.
+        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+                   u + uv_offset * uv_stride + uv_offset,
+                   v + uv_offset * uv_stride + uv_offset, y_stride,
+                   uv_stride, yd + y_offset * yd_stride + y_offset,
+                   ud + uv_offset * uvd_stride + uv_offset,
+                   vd + uv_offset * uvd_stride + uv_offset,
+                   yd_stride, uvd_stride, qdiff);
+      }
+      break;  // NOTE(review): unlike PARTITION_NONE, no copy_block() fallback.
+    case PARTITION_VERT:
+      if (bs == BLOCK_64X64) {
+        mfqe_bs = BLOCK_32X64;
+        bs_tmp = BLOCK_32X32;
+      } else {
+        mfqe_bs = BLOCK_16X32;
+        bs_tmp = BLOCK_16X16;
+      }
+      if (mfqe_decision(mi, mfqe_bs)) {
+        // Left half: do mfqe on the top square.
+        mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
+        // Left half: do mfqe on the bottom square.
+        mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+                   v + uv_offset * uv_stride, y_stride, uv_stride,
+                   yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+                   vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
+      }
+      if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
+        // Right half: do mfqe on the top square.
+        mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+                   y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+                   vd + uv_offset, yd_stride, uvd_stride, qdiff);
+        // Right half: do mfqe on the bottom square.
+        mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+                   u + uv_offset * uv_stride + uv_offset,
+                   v + uv_offset * uv_stride + uv_offset, y_stride,
+                   uv_stride, yd + y_offset * yd_stride + y_offset,
+                   ud + uv_offset * uvd_stride + uv_offset,
+                   vd + uv_offset * uvd_stride + uv_offset,
+                   yd_stride, uvd_stride, qdiff);
+      }
+      break;  // NOTE(review): unlike PARTITION_NONE, no copy_block() fallback.
+    case PARTITION_NONE:
+      if (mfqe_decision(mi, cur_bs)) {
+        // Do mfqe on this partition.
+        mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
+                   yd, ud, vd, yd_stride, uvd_stride, qdiff);
+      } else {
+        // Copy the block from current frame(i.e., no mfqe is done).
+        copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+                   yd_stride, uvd_stride, bs);
+      }
+      break;
+    case PARTITION_SPLIT:
+      // Recursion on four square partitions, e.g. if bs is 64X64,
+      // then look into four 32X32 blocks in it.
+      mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
+                     yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
+                     v + uv_offset, y_stride, uv_stride, yd + y_offset,
+                     ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
+                     y + y_offset * y_stride, u + uv_offset * uv_stride,
+                     v + uv_offset * uv_stride, y_stride, uv_stride,
+                     yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+                     vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
+                     subsize, y + y_offset * y_stride + y_offset,
+                     u + uv_offset * uv_stride + uv_offset,
+                     v + uv_offset * uv_stride + uv_offset, y_stride,
+                     uv_stride, yd + y_offset * yd_stride + y_offset,
+                     ud + uv_offset * uvd_stride + uv_offset,
+                     vd + uv_offset * uvd_stride + uv_offset,
+                     yd_stride, uvd_stride);
+      break;
+    default:
+      assert(0);
+  }
+}
+
+// Applies MFQE to the whole frame, one 64x64 super block at a time.
+// Source pixels come from cm->frame_to_show; cm->post_proc_buffer holds the
+// last decoded frame on entry and receives the result. Intra-only frames are
+// judged with the mode info kept from the last frame
+// (cm->postproc_state.prev_mi); all other frames use cm->mi.
+void vp9_mfqe(VP9_COMMON *cm) {
+  // Source: the frame that was just decoded.
+  const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+  // Destination: previous output, overwritten with the MFQE result.
+  YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+  const uint32_t y_stride = show->y_stride;
+  const uint32_t uv_stride = show->uv_stride;
+  const uint32_t yd_stride = dest->y_stride;
+  const uint32_t uvd_stride = dest->uv_stride;
+  int mi_row, mi_col;
+
+  // Loop through each super block.
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
+    // Offsets into the planes: mi index << 3 for y, << 2 for u and v.
+    const uint32_t row_offset_y = mi_row << 3;
+    const uint32_t row_offset_uv = mi_row << 2;
+
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+      const uint32_t col_offset_y = mi_col << 3;
+      const uint32_t col_offset_uv = mi_col << 2;
+      // Motion info comes from the last frame when this one is intra only.
+      MODE_INFO *const mi_base =
+          frame_is_intra_only(cm) ? cm->postproc_state.prev_mi : cm->mi;
+      MODE_INFO *mi = mi_base + (mi_row * cm->mi_stride + mi_col);
+
+      mfqe_partition(
+          cm, mi, BLOCK_64X64,
+          show->y_buffer + row_offset_y * y_stride + col_offset_y,
+          show->u_buffer + row_offset_uv * uv_stride + col_offset_uv,
+          show->v_buffer + row_offset_uv * uv_stride + col_offset_uv,
+          y_stride, uv_stride,
+          dest->y_buffer + row_offset_y * yd_stride + col_offset_y,
+          dest->u_buffer + row_offset_uv * uvd_stride + col_offset_uv,
+          dest->v_buffer + row_offset_uv * uvd_stride + col_offset_uv,
+          yd_stride, uvd_stride);
+    }
+  }
+}
diff --git a/libs/libvpx/vp9/common/vp9_mfqe.h b/libs/libvpx/vp9/common/vp9_mfqe.h
new file mode 100644
index 0000000000..dfff8c23d6
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_mfqe.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_MFQE_H_
+#define VP9_COMMON_VP9_MFQE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Multiframe Quality Enhancement.
+// The aim for MFQE is to replace pixel blocks in the current frame with
+// the correlated pixel blocks (with higher quality) in the last frame.
+// The replacement can only be taken in stationary blocks by checking
+// the motion of the blocks and other conditions such as the SAD of
+// the current block and correlated block, the variance of the block
+// difference, etc.
+void vp9_mfqe(struct VP9Common *cm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_MFQE_H_
diff --git a/libs/libvpx/vp9/common/vp9_mv.h b/libs/libvpx/vp9/common/vp9_mv.h
new file mode 100644
index 0000000000..5d89da8c25
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_mv.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_MV_H_
+#define VP9_COMMON_VP9_MV_H_
+
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct mv {
+  int16_t row;  // presumably the vertical component -- TODO confirm units
+  int16_t col;  // presumably the horizontal component -- TODO confirm units
+} MV;
+
+typedef union int_mv {
+  uint32_t as_int;  // Both 16-bit fields of as_mv viewed as one 32-bit word.
+  MV as_mv;
+} int_mv; /* facilitates faster equality tests and copies */
+
+typedef struct mv32 {
+  int32_t row;  // Same field names as MV, with 32-bit components.
+  int32_t col;
+} MV32;
+
+static INLINE int is_zero_mv(const MV *mv) {
+  return mv->row == 0 && mv->col == 0;  // Field test: no uint32_t aliasing.
+}
+
+static INLINE int is_equal_mv(const MV *a, const MV *b) {
+  return a->row == b->row && a->col == b->col;  // No uint32_t aliasing.
+}
+
+static INLINE void clamp_mv(MV *mv, int min_col, int max_col,
+                            int min_row, int max_row) {
+  mv->row = clamp(mv->row, min_row, max_row);  // Each component is limited
+  mv->col = clamp(mv->col, min_col, max_col);  // to its own [min, max].
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_MV_H_
diff --git a/libs/libvpx/vp9/common/vp9_mvref_common.c b/libs/libvpx/vp9/common/vp9_mvref_common.c
new file mode 100644
index 0000000000..0eb01a51ba
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_mvref_common.c
@@ -0,0 +1,201 @@
+
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_mvref_common.h"
+
+// This function searches the neighborhood of a given MB/SB to find
+// candidate reference vectors; it also sets mode_context[ref_frame].
+static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                             MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                             int_mv *mv_ref_list,
+                             int block, int mi_row, int mi_col,
+                             uint8_t *mode_context) {
+  const int *ref_sign_bias = cm->ref_frame_sign_bias;
+  int i, refmv_count = 0;
+  const POSITION *const mv_ref_search = mv_ref_blocks[mi->sb_type];
+  int different_ref_found = 0;
+  int context_counter = 0;
+  const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ?
+      cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
+  const TileInfo *const tile = &xd->tile;
+
+  // Blank the reference vector list
+  memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+  // The nearest 2 blocks are treated differently
+  // if the size < 8x8 we get the mv from the bmi substructure,
+  // and we also need to keep a mode count.
+  for (i = 0; i < 2; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
+                                                   xd->mi_stride];
+      // Keep counts for entropy encoding.
+      context_counter += mode_2_counter[candidate_mi->mode];
+      different_ref_found = 1;  // Set for any in-bounds neighbor.
+
+      if (candidate_mi->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
+                        refmv_count, mv_ref_list, Done);
+      else if (candidate_mi->ref_frame[1] == ref_frame)
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
+                        refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  // Check the rest of the neighbors in much the same way
+  // as before except we don't need to keep track of sub blocks or
+  // mode counts.
+  for (; i < MVREF_NEIGHBOURS; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MODE_INFO *const candidate_mi =
+          xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+      different_ref_found = 1;
+
+      if (candidate_mi->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(candidate_mi->mv[0], refmv_count, mv_ref_list, Done);
+      else if (candidate_mi->ref_frame[1] == ref_frame)
+        ADD_MV_REF_LIST(candidate_mi->mv[1], refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  // Check the last frame's mode and mv info.
+  if (cm->use_prev_frame_mvs) {
+    if (prev_frame_mvs->ref_frame[0] == ref_frame) {
+      ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done);
+    } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
+      ADD_MV_REF_LIST(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  // Since we couldn't find 2 mvs from the same reference frame
+  // go back through the neighbors and find motion vectors from
+  // different reference frames.
+  if (different_ref_found) {
+    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+      const POSITION *mv_ref = &mv_ref_search[i];
+      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+        const MODE_INFO *const candidate_mi =
+            xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+
+        // If the candidate is INTRA we don't want to consider its mv.
+        IF_DIFF_REF_FRAME_ADD_MV(candidate_mi, ref_frame, ref_sign_bias,
+                                 refmv_count, mv_ref_list, Done);
+      }
+    }
+  }
+
+  // Since we still don't have a candidate we'll try the last frame.
+  if (cm->use_prev_frame_mvs) {
+    if (prev_frame_mvs->ref_frame[0] != ref_frame &&
+        prev_frame_mvs->ref_frame[0] > INTRA_FRAME) {
+      int_mv mv = prev_frame_mvs->mv[0];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;  // Opposite sign bias: negate the vector.
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done);
+    }
+
+    if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
+        prev_frame_mvs->ref_frame[1] != ref_frame &&
+        prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) {
+      int_mv mv = prev_frame_mvs->mv[1];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;  // Opposite sign bias: negate the vector.
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done);
+    }
+  }
+
+ Done:
+
+  mode_context[ref_frame] = counter_to_context[context_counter];
+
+  // Clamp vectors
+  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
+    clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
+}
+
+// Candidate search for a whole block: passes -1 as the sub-block index.
+void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list, int mi_row, int mi_col,
+                      uint8_t *mode_context) {
+  find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, /* block */ -1,
+                   mi_row, mi_col, mode_context);
+}
+
+// Lowers the precision of, then clamps, every candidate in mvlist (in place)
+// and reports the first two as the nearest and near vectors.
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist,
+                           int_mv *nearest_mv, int_mv *near_mv) {
+  int_mv *cand;
+  for (cand = mvlist; cand < mvlist + MAX_MV_REF_CANDIDATES; ++cand) {
+    lower_mv_precision(&cand->as_mv, allow_hp);
+    clamp_mv2(&cand->as_mv, xd);
+  }
+  *nearest_mv = mvlist[0];
+  *near_mv = mvlist[1];
+}
+
+// Picks the nearest/near mv pair for sub-block `block` (0..3) of a sub8x8
+// block. Block 0 takes the first two entries of the candidate list built by
+// find_mv_refs_idx(); later blocks take the mv of an earlier sub-block as
+// nearest and the first differing candidate as near (zero if none differs).
+void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                   int block, int ref, int mi_row, int mi_col,
+                                   int_mv *nearest_mv, int_mv *near_mv,
+                                   uint8_t *mode_context) {
+  int_mv mv_list[MAX_MV_REF_CANDIDATES];
+  int_mv pool[2 + MAX_MV_REF_CANDIDATES];  // Near candidates, in order.
+  int pool_size = 0;
+  int idx;
+  MODE_INFO *const mi = xd->mi[0];
+  b_mode_info *bmi = mi->bmi;
+
+  assert(MAX_MV_REF_CANDIDATES == 2);
+
+  find_mv_refs_idx(cm, xd, mi, mi->ref_frame[ref], mv_list, block,
+                   mi_row, mi_col, mode_context);
+
+  near_mv->as_int = 0;
+  switch (block) {
+    case 0:
+      nearest_mv->as_int = mv_list[0].as_int;
+      near_mv->as_int = mv_list[1].as_int;
+      return;
+    case 1:
+    case 2:
+      nearest_mv->as_int = bmi[0].as_mv[ref].as_int;
+      break;
+    case 3:
+      nearest_mv->as_int = bmi[2].as_mv[ref].as_int;
+      pool[pool_size++] = bmi[1].as_mv[ref];
+      pool[pool_size++] = bmi[0].as_mv[ref];
+      break;
+    default:
+      assert(0 && "Invalid block index.");
+      return;
+  }
+  pool[pool_size++] = mv_list[0];
+  pool[pool_size++] = mv_list[1];
+  for (idx = 0; idx < pool_size; ++idx) {
+    if (nearest_mv->as_int != pool[idx].as_int) {
+      near_mv->as_int = pool[idx].as_int;
+      break;
+    }
+  }
+}
diff --git a/libs/libvpx/vp9/common/vp9_mvref_common.h b/libs/libvpx/vp9/common/vp9_mvref_common.h
new file mode 100644
index 0000000000..4380843e24
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_mvref_common.h
@@ -0,0 +1,241 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_
+#define VP9_COMMON_VP9_MVREF_COMMON_H_
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS - VP9_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\
+                                VP9_INTERP_EXTEND) << 3)
+// The margins feed clamp_mv2(); MVREF_NEIGHBOURS sizes each mv_ref_blocks row.
+#define MVREF_NEIGHBOURS 8
+
+typedef struct position {
+  int row;  // row offset, added to mi_row by is_inside()
+  int col;  // column offset, added to mi_col by is_inside()
+} POSITION;
+
+typedef enum {  // contexts stored in the counter_to_context table
+  BOTH_ZERO = 0,
+  ZERO_PLUS_PREDICTED = 1,
+  BOTH_PREDICTED = 2,
+  NEW_PLUS_NON_INTRA = 3,
+  BOTH_NEW = 4,
+  INTRA_PLUS_NON_INTRA = 5,
+  BOTH_INTRA = 6,
+  INVALID_CASE = 9  // marks counter_to_context slots that should never be hit
+} motion_vector_context;
+
+// Per-mode weights used to derive a context from the neighbouring ref blocks.
+// Each intra mode contributes 9, ZEROMV 3, NEWMV 1 and NEARESTMV/NEARMV 0, so
+// the counts of the three kinds (each 0, 1 or 2) flatten into one number.
+// That number is then converted into a motion_vector_context with a single
+// lookup in counter_to_context.
+static const int mode_2_counter[MB_MODE_COUNT] = {
+  9,  // DC_PRED
+  9,  // V_PRED
+  9,  // H_PRED
+  9,  // D45_PRED
+  9,  // D135_PRED
+  9,  // D117_PRED
+  9,  // D153_PRED
+  9,  // D207_PRED
+  9,  // D63_PRED
+  9,  // TM_PRED
+  0,  // NEARESTMV
+  0,  // NEARMV
+  3,  // ZEROMV
+  1,  // NEWMV
+};
+
+// Indexed by the flattened counter built from mode_2_counter. Three counts of
+// 0..2 would give 3^3 combinations, but the counts never total more than 2, so
+// 18 is the largest index needed. Unreachable slots hold INVALID_CASE (== 9).
+static const int counter_to_context[19] = {
+  BOTH_PREDICTED,  // 0
+  NEW_PLUS_NON_INTRA,  // 1
+  BOTH_NEW,  // 2
+  ZERO_PLUS_PREDICTED,  // 3
+  NEW_PLUS_NON_INTRA,  // 4
+  INVALID_CASE,  // 5
+  BOTH_ZERO,  // 6
+  INVALID_CASE,  // 7
+  INVALID_CASE,  // 8
+  INTRA_PLUS_NON_INTRA,  // 9
+  INTRA_PLUS_NON_INTRA,  // 10
+  INVALID_CASE,  // 11
+  INTRA_PLUS_NON_INTRA,  // 12
+  INVALID_CASE,  // 13
+  INVALID_CASE,  // 14
+  INVALID_CASE,  // 15
+  INVALID_CASE,  // 16
+  INVALID_CASE,  // 17
+  BOTH_INTRA  // 18
+};
+
+static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
+  // 4X4 (one row per block size; each entry is a POSITION {row, col} offset)
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 4X8
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X4
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X8
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X16
+  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
+  // 16X8
+  {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
+  // 16X16
+  {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 16X32
+  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
+  // 32X16
+  {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 32X32
+  {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 32X64
+  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
+  // 64X32
+  {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
+  // 64X64
+  {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
+};
+
+static const int idx_n_column_to_subblock[4][2] = {
+  {1, 2},  // read as [block_idx][search_col == 0] by get_sub_block_mv()
+  {1, 3},  // each value is an index into candidate->bmi[]
+  {3, 2},
+  {3, 3}
+};
+
+// Clamps *mv to the block-to-edge distances in xd, widened by MV_BORDER.
+#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
+static INLINE void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
+  const int left = xd->mb_to_left_edge - MV_BORDER;
+  const int right = xd->mb_to_right_edge + MV_BORDER;
+  const int top = xd->mb_to_top_edge - MV_BORDER;
+  const int bottom = xd->mb_to_bottom_edge + MV_BORDER;
+  clamp_mv(mv, left, right, top, bottom);
+}
+
+// Returns the candidate's block-level mv for `which_mv`, or, when block_idx is
+// non-negative and the candidate is smaller than 8x8, the matching bmi[] mv.
+static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
+                                      int search_col, int block_idx) {
+  if (block_idx < 0 || candidate->sb_type >= BLOCK_8X8)
+    return candidate->mv[which_mv];
+  return candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
+      .as_mv[which_mv];
+}
+
+
+// Negates the candidate's mv when its reference frame has the other sign bias.
+static INLINE int_mv scale_mv(const MODE_INFO *mi, int ref,
+                              const MV_REFERENCE_FRAME this_ref_frame,
+                              const int *ref_sign_bias) {
+  int_mv scaled = mi->mv[ref];
+  if (ref_sign_bias[this_ref_frame] != ref_sign_bias[mi->ref_frame[ref]]) {
+    scaled.as_mv.row = -scaled.as_mv.row;
+    scaled.as_mv.col = -scaled.as_mv.col;
+  }
+  return scaled;
+}
+
+// Adds mv to mv_ref_list unless it duplicates entry 0. The first vector is
+// stored and counted; a second, distinct vector is stored at index
+// refmv_count and control jumps to the label Done, skipping further work.
+#define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done) \
+  do { \
+    if (refmv_count) { \
+      if ((mv).as_int != (mv_ref_list)[0].as_int) { \
+        (mv_ref_list)[(refmv_count)] = (mv); \
+        goto Done; \
+      } \
+    } else { \
+      (mv_ref_list)[(refmv_count)++] = (mv); \
+    } \
+  } while (0)
+
+// For an inter block, adds (via scale_mv) each of its mvs whose reference
+// frame differs from ref_frame; the second only if it differs from the first.
+#define IF_DIFF_REF_FRAME_ADD_MV(mbmi, ref_frame, ref_sign_bias, refmv_count, \
+                                 mv_ref_list, Done) \
+  do { \
+    if (is_inter_block(mbmi)) { \
+      if ((mbmi)->ref_frame[0] != ref_frame) \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \
+                        refmv_count, mv_ref_list, Done); \
+      if (has_second_ref(mbmi) && \
+          (mbmi)->ref_frame[1] != ref_frame && \
+          (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \
+                        refmv_count, mv_ref_list, Done); \
+    } \
+  } while (0)
+
+
+// Returns nonzero when (mi_row, mi_col) displaced by *mi_pos stays within
+// rows [0, mi_rows) and the tile's columns [mi_col_start, mi_col_end).
+static INLINE int is_inside(const TileInfo *const tile,
+                            int mi_col, int mi_row, int mi_rows,
+                            const POSITION *mi_pos) {
+  const int row = mi_row + mi_pos->row;
+  const int col = mi_col + mi_pos->col;
+  return row >= 0 && col >= tile->mi_col_start &&
+         row < mi_rows && col < tile->mi_col_end;
+}
+
+// TODO(jingning): this mv clamping function should be block size dependent.
+static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
+  const int left = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+  const int right = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+  const int top = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+  clamp_mv(mv, left, right, top, xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+}
+
+static INLINE void lower_mv_precision(MV *mv, int allow_hp) {
+  // Odd components are moved one step towards zero (made even) unless high
+  // precision is allowed and use_mv_hp() accepts this vector.
+  if (allow_hp && use_mv_hp(mv)) return;
+  if (mv->row & 1)
+    mv->row = (mv->row > 0) ? mv->row - 1 : mv->row + 1;
+  if (mv->col & 1)
+    mv->col = (mv->col > 0) ? mv->col - 1 : mv->col + 1;
+}
+
+typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
+void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list, int mi_row, int mi_col,
+                      uint8_t *mode_context);
+
+// Picks nearest_mv / near_mv from mvlist. The implementation ends by lowering
+// precision (per allow_hp), clamping each candidate and returning mvlist[0]
+// and mvlist[1]. NOTE(review): earlier text mentioned SAD scoring - confirm.
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
+                           int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv);
+// Same outputs for one 4x4 sub-block (`block`) of the current mi and `ref`.
+void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                   int block, int ref, int mi_row, int mi_col,
+                                   int_mv *nearest_mv, int_mv *near_mv,
+                                   uint8_t *mode_context);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_MVREF_COMMON_H_
diff --git a/libs/libvpx/vp9/common/vp9_onyxc_int.h b/libs/libvpx/vp9/common/vp9_onyxc_int.h
new file mode 100644
index 0000000000..fd674cbc6f
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_onyxc_int.h
@@ -0,0 +1,458 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_ONYXC_INT_H_
+#define VP9_COMMON_VP9_ONYXC_INT_H_
+
+#include "./vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_util/vpx_thread.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_frame_buffers.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_tile_common.h"
+
+#if CONFIG_VP9_POSTPROC
+#include "vp9/common/vp9_postproc.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define REFS_PER_FRAME 3  // length of VP9Common::frame_refs[]
+// REF_FRAMES is the number of slots in VP9Common::ref_frame_map[].
+#define REF_FRAMES_LOG2 3
+#define REF_FRAMES (1 << REF_FRAMES_LOG2)
+
+// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
+// in parallel, 3 for scaled references on the encoder.
+// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
+// of framebuffers.
+// TODO(jkoleszar): These 3 extra references could probably come from the
+// normal reference pool.
+#define FRAME_BUFFERS (REF_FRAMES + 7)
+// NOTE(review): presumably the length of VP9Common::frame_contexts - confirm.
+#define FRAME_CONTEXTS_LOG2 2
+#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
+// Number of entries in VP9Common::seg_map_array[].
+#define NUM_PING_PONG_BUFFERS 2
+
+extern const struct {
+  PARTITION_CONTEXT above;  // value memset into above_seg_context entries
+  PARTITION_CONTEXT left;   // value memset into left_seg_context entries
+} partition_context_lookup[BLOCK_SIZES];
+
+
+typedef enum {
+  SINGLE_REFERENCE      = 0,
+  COMPOUND_REFERENCE    = 1,
+  REFERENCE_MODE_SELECT = 2,
+  REFERENCE_MODES       = 3,  // presumably the count of modes above - confirm
+} REFERENCE_MODE;
+
+typedef struct {
+  int_mv mv[2];  // presumably mv[i] goes with ref_frame[i] - confirm
+  MV_REFERENCE_FRAME ref_frame[2];
+} MV_REF;
+
+typedef struct {
+  int ref_count;  // 0 marks the buffer as free (see get_free_fb())
+  MV_REF *mvs;
+  int mi_rows;
+  int mi_cols;
+  vpx_codec_frame_buffer_t raw_frame_buffer;
+  YV12_BUFFER_CONFIG buf;  // returned by get_ref_frame()/get_frame_new_buffer()
+
+  // The following variables will only be used in frame parallel decode.
+
+  // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
+  // that no FrameWorker owns, or is decoding, this buffer.
+  VPxWorker *frame_worker_owner;
+
+  // row and col indicate which position frame has been decoded to in real
+  // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX
+  // when the frame is fully decoded.
+  int row;
+  int col;
+} RefCntBuffer;
+
+typedef struct BufferPool {
+  // Protect BufferPool from being accessed by several FrameWorkers at
+  // the same time during frame parallel decode.
+  // TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t pool_mutex;
+#endif
+
+  // Private data associated with the frame buffer callbacks.
+  void *cb_priv;
+  // The frame buffer callbacks themselves.
+  vpx_get_frame_buffer_cb_fn_t get_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_fb_cb;
+  // Reference-counted buffers; ref_count == 0 marks a free entry.
+  RefCntBuffer frame_bufs[FRAME_BUFFERS];
+
+  // Frame buffers allocated internally by the codec.
+  InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+
+typedef struct VP9Common {
+  struct vpx_internal_error_info  error;
+  vpx_color_space_t color_space;
+  vpx_color_range_t color_range;
+  int width;
+  int height;
+  int render_width;
+  int render_height;
+  int last_width;
+  int last_height;
+
+  // TODO(jkoleszar): this implies chroma ss right now, but could vary per
+  // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to
+  // support additional planes.
+  int subsampling_x;
+  int subsampling_y;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  int use_highbitdepth;  // Marks if we need to use 16bit frame buffers.
+#endif
+
+  YV12_BUFFER_CONFIG *frame_to_show;
+  RefCntBuffer *prev_frame;
+
+  // TODO(hkuang): Combine this with cur_buf in macroblockd.
+  RefCntBuffer *cur_frame;
+
+  int ref_frame_map[REF_FRAMES]; /* maps reference slot to fb_idx (< 0: none) */
+
+  // Prepare ref_frame_map for the next frame.
+  // Only used in frame parallel decode.
+  int next_ref_frame_map[REF_FRAMES];
+
+  // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
+  // roll new_fb_idx into it.
+
+  // Each frame can reference REFS_PER_FRAME buffers
+  RefBuffer frame_refs[REFS_PER_FRAME];
+
+  int new_fb_idx;  // index into buffer_pool->frame_bufs[] of the new frame
+
+#if CONFIG_VP9_POSTPROC
+  YV12_BUFFER_CONFIG post_proc_buffer;
+  YV12_BUFFER_CONFIG post_proc_buffer_int;
+#endif
+
+  FRAME_TYPE last_frame_type;  /* last frame's frame type for motion search.*/
+  FRAME_TYPE frame_type;
+
+  int show_frame;
+  int last_show_frame;
+  int show_existing_frame;
+
+  // Flag signaling that the frame is encoded using only INTRA modes.
+  uint8_t intra_only;
+  uint8_t last_intra_only;
+
+  int allow_high_precision_mv;
+
+  // Flag signaling that the frame context should be reset to default values.
+  // 0 or 1 implies don't reset, 2 reset just the context specified in the
+  // frame header, 3 reset all contexts.
+  int reset_frame_context;
+
+  // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
+  // MODE_INFO (8-pixel) units.
+  int MBs;
+  int mb_rows, mi_rows;
+  int mb_cols, mi_cols;
+  int mi_stride;
+
+  /* profile settings */
+  TX_MODE tx_mode;
+
+  int base_qindex;
+  int y_dc_delta_q;
+  int uv_dc_delta_q;
+  int uv_ac_delta_q;
+  int16_t y_dequant[MAX_SEGMENTS][2];
+  int16_t uv_dequant[MAX_SEGMENTS][2];
+
+  /* We allocate a MODE_INFO struct for each macroblock, together with
+     an extra row on top and column on the left to simplify prediction. */
+  int mi_alloc_size;
+  MODE_INFO *mip; /* Base of allocated array */
+  MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
+
+  // TODO(agrange): Move prev_mi into encoder structure.
+  // prev_mip and prev_mi will only be allocated in VP9 encoder.
+  MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
+  MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
+
+  // Separate mi functions between encoder and decoder.
+  int (*alloc_mi)(struct VP9Common *cm, int mi_size);
+  void (*free_mi)(struct VP9Common *cm);
+  void (*setup_mi)(struct VP9Common *cm);
+
+  // Grid of pointers to 8x8 MODE_INFO structs.  Any 8x8 not in the visible
+  // area will be NULL.
+  MODE_INFO **mi_grid_base;
+  MODE_INFO **mi_grid_visible;
+  MODE_INFO **prev_mi_grid_base;
+  MODE_INFO **prev_mi_grid_visible;
+
+  // Whether to use previous frame's motion vectors for prediction.
+  int use_prev_frame_mvs;
+
+  // Persistent mb segment id map used in prediction.
+  int seg_map_idx;
+  int prev_seg_map_idx;
+
+  uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS];
+  uint8_t *last_frame_seg_map;
+  uint8_t *current_frame_seg_map;
+  int seg_map_alloc_size;
+
+  INTERP_FILTER interp_filter;
+
+  loop_filter_info_n lf_info;
+
+  int refresh_frame_context;    /* Two state 0 = NO, 1 = YES */
+
+  int ref_frame_sign_bias[MAX_REF_FRAMES];    /* Two state 0, 1 */
+
+  struct loopfilter lf;
+  struct segmentation seg;
+
+  // TODO(hkuang): Remove this as it is the same as frame_parallel_decode
+  // in pbi.
+  int frame_parallel_decode;  // frame-based threading.
+
+  // Context probabilities for reference frame prediction
+  MV_REFERENCE_FRAME comp_fixed_ref;
+  MV_REFERENCE_FRAME comp_var_ref[2];
+  REFERENCE_MODE reference_mode;
+
+  FRAME_CONTEXT *fc;  /* this frame entropy */
+  FRAME_CONTEXT *frame_contexts;   // FRAME_CONTEXTS
+  unsigned int  frame_context_idx; /* Context to use/update */
+  FRAME_COUNTS counts;
+
+  unsigned int current_video_frame;
+  BITSTREAM_PROFILE profile;
+
+  // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3.
+  vpx_bit_depth_t bit_depth;
+  vpx_bit_depth_t dequant_bit_depth;  // bit_depth of current dequantizer
+
+#if CONFIG_VP9_POSTPROC
+  struct postproc_state  postproc_state;
+#endif
+
+  int error_resilient_mode;
+  int frame_parallel_decoding_mode;
+
+  int log2_tile_cols, log2_tile_rows;
+  int byte_alignment;
+  int skip_loop_filter;
+
+  // Private data associated with the frame buffer callbacks.
+  void *cb_priv;
+  vpx_get_frame_buffer_cb_fn_t get_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_fb_cb;
+
+  // Handles memory for the codec.
+  InternalFrameBufferList int_frame_buffers;
+
+  // External BufferPool passed from outside.
+  BufferPool *buffer_pool;
+
+  PARTITION_CONTEXT *above_seg_context;
+  ENTROPY_CONTEXT *above_context;
+  int above_context_alloc_cols;
+} VP9_COMMON;
+
+// Lock / unlock the whole BufferPool; get_free_fb() holds it while claiming.
+// TODO(hkuang): not needed once the frame reference count is atomic.
+void lock_buffer_pool(BufferPool *const pool);
+void unlock_buffer_pool(BufferPool *const pool);
+
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) {
+  // Yields NULL for an out-of-range slot or a slot mapped to a negative index.
+  const int in_range = index >= 0 && index < REF_FRAMES;
+  const int fb_idx = in_range ? cm->ref_frame_map[index] : -1;
+  if (fb_idx < 0) return NULL;
+  assert(fb_idx < FRAME_BUFFERS);
+  return &cm->buffer_pool->frame_bufs[fb_idx].buf;
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
+  return &(cm->buffer_pool->frame_bufs + cm->new_fb_idx)->buf;  // new frame
+}
+
+static INLINE int get_free_fb(VP9_COMMON *cm) {
+  // Claims the first pool buffer whose ref_count is zero (setting it to 1)
+  // while holding the pool lock. Returns its index, or INVALID_IDX if none.
+  BufferPool *const pool = cm->buffer_pool;
+  int found = INVALID_IDX;
+  int slot;
+
+  lock_buffer_pool(pool);
+  for (slot = 0; slot < FRAME_BUFFERS; ++slot) {
+    if (pool->frame_bufs[slot].ref_count != 0)
+      continue;
+    pool->frame_bufs[slot].ref_count = 1;
+    found = slot;
+    break;
+  }
+  unlock_buffer_pool(pool);
+
+  return found;
+}
+
+static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) {
+  // Moves the reference held in *idx over to buffer new_idx: the old buffer
+  // (if any, and if still referenced) loses a count, the new one gains one.
+  const int old_idx = *idx;
+  RefCntBuffer *const released = (old_idx >= 0) ? &bufs[old_idx] : NULL;
+  if (released != NULL && released->ref_count > 0)
+    released->ref_count -= 1;
+  *idx = new_idx;
+  bufs[new_idx].ref_count += 1;
+}
+
+static INLINE int mi_cols_aligned_to_sb(int n_mis) {
+  return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);  // presumably rounds up
+}
+
+static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
+  return cm->intra_only || cm->frame_type == KEY_FRAME;  // either one suffices
+}
+
+static INLINE void set_partition_probs(const VP9_COMMON *const cm,
+                                       MACROBLOCKD *const xd) {
+  // Intra-only frames use the fixed key-frame table, others the frame context.
+  xd->partition_probs = frame_is_intra_only(cm)
+      ? &vp9_kf_partition_probs[0]
+      : (const vpx_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
+}
+
+static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                        tran_low_t *dqcoeff) {
+  // Wires xd to cm: per-plane dqcoeff / above context / dequant tables, then
+  // the frame-level context pointers. Each plane owns 2 * aligned-mi-cols
+  // entries of cm->above_context.
+  const int plane_ctx_len = 2 * mi_cols_aligned_to_sb(cm->mi_cols);
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    xd->plane[i].dqcoeff = dqcoeff;
+    // Pointer arithmetic already counts elements: no sizeof() scaling here.
+    xd->above_context[i] = cm->above_context + i * plane_ctx_len;
+    if (get_plane_type(i) == PLANE_TYPE_Y) {
+      memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant));
+    } else {
+      memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant));
+    }
+  }
+  xd->fc = cm->fc;  // loop-invariant, set once
+  xd->above_seg_context = cm->above_seg_context;
+  xd->mi_stride = cm->mi_stride;
+  xd->error_info = &cm->error;
+  set_partition_probs(cm, xd);
+}
+
+static INLINE const vpx_prob* get_partition_probs(const MACROBLOCKD *xd,
+                                                  int ctx) {
+  return *(xd->partition_probs + ctx);  // row `ctx` of the active table
+}
+
+static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
+  const int above = mi_col * 2;        // two context entries per mi column
+  const int left = (mi_row * 2) & 15;  // left context index wraps at 16
+  struct macroblockd_plane *pd = xd->plane;
+  int p;
+  for (p = 0; p < MAX_MB_PLANE; ++p, ++pd) {
+    pd->above_context = xd->above_context[p] + (above >> pd->subsampling_x);
+    pd->left_context = xd->left_context[p] + (left >> pd->subsampling_y);
+  }
+}
+
+static INLINE int calc_mi_size(int len) {
+  // `len` counts mi units; the result adds MI_BLOCK_SIZE extra units.
+  return MI_BLOCK_SIZE + len;
+}
+
+static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
+                                  int mi_row, int bh,
+                                  int mi_col, int bw,
+                                  int mi_rows, int mi_cols) {
+  // Number of mi units between the far side of the block and the bottom /
+  // right frame boundary.
+  const int rows_below = mi_rows - bh - mi_row;
+  const int cols_right = mi_cols - bw - mi_col;
+
+  // Edge distances, each scaled by MI_SIZE and by 8; the top and left
+  // distances are stored negated.
+  xd->mb_to_top_edge    = -((mi_row * MI_SIZE) * 8);
+  xd->mb_to_bottom_edge = (rows_below * MI_SIZE) * 8;
+  xd->mb_to_left_edge   = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_right_edge  = (cols_right * MI_SIZE) * 8;
+
+  // Are edges available for intra prediction?
+  xd->up_available   = (mi_row != 0);
+  xd->left_available = (mi_col > tile->mi_col_start);
+
+  // TODO(slavarnway): eliminate up/left available ???
+  // Neighbour pointers are NULL when the matching edge is unavailable.
+  xd->above_mi = xd->up_available ? xd->mi[-xd->mi_stride] : NULL;
+  xd->left_mi  = xd->left_available ? xd->mi[-1] : NULL;
+}
+
+static INLINE void update_partition_context(MACROBLOCKD *xd,
+                                            int mi_row, int mi_col,
+                                            BLOCK_SIZE subsize,
+                                            BLOCK_SIZE bsize) {
+  // Stamps the partition context of `subsize` over the `bsize` footprint:
+  // num_8x8_blocks_wide_lookup[bsize] entries in both the above array (from
+  // mi_col) and the left array (from mi_row & MI_MASK). In the stored value,
+  // partition bits of block sizes larger than the current one are one, and
+  // bits of smaller block sizes are zero.
+  const int span = num_8x8_blocks_wide_lookup[bsize];
+  const int above_val = partition_context_lookup[subsize].above;
+  const int left_val = partition_context_lookup[subsize].left;
+
+  memset(xd->above_seg_context + mi_col, above_val, span);
+  memset(xd->left_seg_context + (mi_row & MI_MASK), left_val, span);
+}
+
+static INLINE int partition_plane_context(const MACROBLOCKD *xd,
+                                          int mi_row, int mi_col,
+                                          BLOCK_SIZE bsize) {
+  // Combines bit `bsl` of the above and left partition contexts into a 0..3
+  // code (left is the high bit) and offsets it by bsl * PARTITION_PLOFFSET.
+  const int bsl = mi_width_log2_lookup[bsize];
+  const int above_bit = (xd->above_seg_context[mi_col] >> bsl) & 1;
+  const int left_bit = (xd->left_seg_context[mi_row & MI_MASK] >> bsl) & 1;
+
+  assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
+  assert(bsl >= 0);
+  return bsl * PARTITION_PLOFFSET + (left_bit << 1) + above_bit;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_ONYXC_INT_H_
diff --git a/libs/libvpx/vp9/common/vp9_postproc.c b/libs/libvpx/vp9/common/vp9_postproc.c
new file mode 100644
index 0000000000..b685d813b7
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_postproc.c
@@ -0,0 +1,746 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_scale/yv12config.h"
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_postproc.h"
+#include "vp9/common/vp9_textblit.h"
+
+#if CONFIG_VP9_POSTPROC
+static const int16_t kernel5[] = {  // 5-tap weights; they sum to 8 (>> 3)
+  1, 1, 4, 1, 1
+};
+
+const int16_t vp9_rv[] = {  // offsets added before the >> 4 in mbpost_proc_down
+  8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
+  0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
+  10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
+  8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+  8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
+  1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
+  3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
+  11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
+  14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
+  4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
+  7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
+  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+  3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
+  11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
+  14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
+  5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
+  0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
+};
+
+static const uint8_t q_diff_thresh = 20;   // NOTE(review): users not in view
+static const uint8_t last_q_thresh = 170;  // NOTE(review): users not in view
+
+void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
+                                     uint8_t *dst_ptr,
+                                     int src_pixels_per_line,
+                                     int dst_pixels_per_line,
+                                     int rows,
+                                     int cols,
+                                     int flimit) {
+  uint8_t const *p_src;
+  uint8_t *p_dst;
+  int row, col, i, v, kernel;
+  int pitch = src_pixels_per_line;
+  uint8_t d[8];
+  // Row by row: filter src vertically into dst with kernel5, then filter that
+  // dst row horizontally in place. A pixel stays unfiltered when any tap
+  // differs from it by more than flimit.
+  for (row = 0; row < rows; row++) {
+    /* post_proc_down for one row */
+    p_src = src_ptr;
+    p_dst = dst_ptr;
+
+    for (col = 0; col < cols; col++) {
+      kernel = 4;
+      v = p_src[col];
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i * pitch]) > flimit)
+          goto down_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i * pitch];
+      }
+
+      v = (kernel >> 3);
+    down_skip_convolve:
+      p_dst[col] = v;
+    }
+
+    /* now post_proc_across */
+    p_src = dst_ptr;
+    p_dst = dst_ptr;
+
+    for (i = 0; i < 8; i++)
+      d[i] = p_src[i];
+
+    for (col = 0; col < cols; col++) {
+      kernel = 4;
+      v = p_src[col];
+
+      d[col & 7] = v;
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i]) > flimit)
+          goto across_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i];
+      }
+
+      d[col & 7] = (kernel >> 3);
+    across_skip_convolve:
+
+      if (col >= 2)
+        p_dst[col - 2] = d[(col - 2) & 7];
+    }
+
+    /* handle the last two pixels */
+    p_dst[col - 2] = d[(col - 2) & 7];
+    p_dst[col - 1] = d[(col - 1) & 7];
+
+    /* next row: each buffer advances by its own stride */
+    src_ptr += pitch;
+    dst_ptr += dst_pixels_per_line;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
+                                            uint16_t *dst_ptr,
+                                            int src_pixels_per_line,
+                                            int dst_pixels_per_line,
+                                            int rows,
+                                            int cols,
+                                            int flimit) {
+  uint16_t const *p_src;
+  uint16_t *p_dst;
+  int row, col, i, v, kernel;
+  int pitch = src_pixels_per_line;
+  uint16_t d[8];
+  // 16-bit sample version: vertical 5-tap pass src -> dst, then in-place
+  for (row = 0; row < rows; row++) {
+    // post_proc_down for one row.
+    p_src = src_ptr;
+    p_dst = dst_ptr;
+
+    for (col = 0; col < cols; col++) {
+      kernel = 4;
+      v = p_src[col];
+      // Any tap further than flimit from v leaves the pixel unfiltered.
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i * pitch]) > flimit)
+          goto down_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i * pitch];
+      }
+
+      v = (kernel >> 3);
+
+    down_skip_convolve:
+      p_dst[col] = v;
+    }
+
+    /* now post_proc_across */
+    p_src = dst_ptr;
+    p_dst = dst_ptr;
+
+    for (i = 0; i < 8; i++)
+      d[i] = p_src[i];
+
+    for (col = 0; col < cols; col++) {
+      kernel = 4;
+      v = p_src[col];
+      // d[] delays the write-back by two pixels so taps read unfiltered data.
+      d[col & 7] = v;
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i]) > flimit)
+          goto across_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i];
+      }
+
+      d[col & 7] = (kernel >> 3);
+
+    across_skip_convolve:
+      if (col >= 2)
+        p_dst[col - 2] = d[(col - 2) & 7];
+    }
+
+    /* handle the last two pixels */
+    p_dst[col - 2] = d[(col - 2) & 7];
+    p_dst[col - 1] = d[(col - 1) & 7];
+
+    // Source and destination advance by their own strides.
+    /* next row */
+    src_ptr += pitch;
+    dst_ptr += dst_pixels_per_line;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static int q2mbl(int x) {
+  // Floor the input at 20, stretch it around 50 by 10/8, then square / 3.
+  const int floored = (x < 20) ? 20 : x;
+  const int stretched = 50 + (floored - 50) * 10 / 8;
+  return stretched * stretched / 3;
+}
+
+void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
+                                 int rows, int cols, int flimit) {
+  int r, c, i;
+  uint8_t *s = src;
+  uint8_t d[16];
+  // In-place horizontal smoothing of each row over a 15-sample window.
+  for (r = 0; r < rows; r++) {
+    int sumsq = 0;
+    int sum = 0;
+    // Prime the sums with s[-8..6]; d[0..14] are cleared (d[15] is not).
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i] * s[i];
+      sum += s[i];
+      d[i + 8] = 0;
+    }
+    // Slide the window: add s[c + 7], drop s[c - 8].
+    for (c = 0; c < cols + 8; c++) {
+      int x = s[c + 7] - s[c - 8];
+      int y = s[c + 7] + s[c - 8];
+      // x * y == s[c + 7]^2 - s[c - 8]^2, which keeps sumsq in step with sum.
+      sum += x;
+      sumsq += x * y;
+      // Results go through d[] and are written back 8 samples late.
+      d[c & 15] = s[c];
+      // Flat window: use the rounded mean of the window plus s[c] (16 terms).
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[c & 15] = (8 + sum + s[c]) >> 4;
+      }
+      // NOTE(review): c == 7 reads d[15], never set on the first row - confirm.
+      s[c - 8] = d[(c - 8) & 15];
+    }
+    s += pitch;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch,
+                                        int rows, int cols, int flimit) {
+  int r, c, i;
+
+  uint16_t *s = src;
+  uint16_t d[16];
+
+  // 16-bit version: in-place horizontal smoothing over a 15-sample window.
+  for (r = 0; r < rows; r++) {
+    int sumsq = 0;
+    int sum   = 0;
+    // Prime the sums with s[-8..6]; d[0..14] are cleared (d[15] is not).
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i] * s[i];
+      sum   += s[i];
+      d[i + 8] = 0;
+    }
+    // Slide the window: add s[c + 7], drop s[c - 8].
+    for (c = 0; c < cols + 8; c++) {
+      int x = s[c + 7] - s[c - 8];
+      int y = s[c + 7] + s[c - 8];
+      // x * y == s[c + 7]^2 - s[c - 8]^2, which keeps sumsq in step with sum.
+      sum  += x;
+      sumsq += x * y;
+      // Results go through d[] and are written back 8 samples late.
+      d[c & 15] = s[c];
+      // Flat window: use the rounded mean of the window plus s[c] (16 terms).
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[c & 15] = (8 + sum + s[c]) >> 4;
+      }
+      // NOTE(review): c == 7 reads d[15], never set on the first row - confirm.
+      s[c - 8] = d[(c - 8) & 15];
+    }
+
+    s += pitch;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
+                            int rows, int cols, int flimit) {
+  int r, c, i;
+  const int16_t *rv3 = &vp9_rv[63 & rand()];  // NOLINT
+  // In-place vertical smoothing of each column over a 15-sample window.
+  for (c = 0; c < cols; c++) {
+    uint8_t *s = &dst[c];
+    int sumsq = 0;
+    int sum   = 0;
+    uint8_t d[16];
+    const int16_t *rv2 = rv3 + ((c * 17) & 127);
+    // Prime the sums with rows -8..6 of this column.
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i * pitch] * s[i * pitch];
+      sum   += s[i * pitch];
+    }
+    // Slide the window down: add row +7, drop row -8 (relative to s).
+    for (r = 0; r < rows + 8; r++) {
+      sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+      sum  += s[7 * pitch] - s[-8 * pitch];
+      d[r & 15] = s[0];
+      // Flat window: use the mean of window + s[0], offset by a vp9_rv entry.
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+      }
+      // Write back 8 rows late; for r < 8 that d[] slot was never filled.
+      if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
+      s += pitch;
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch,
+                                   int rows, int cols, int flimit) {
+  int r, c, i;
+  const int16_t *rv3 = &vp9_rv[63 & rand()];  // NOLINT
+  // 16-bit sample variant of vp9_mbpost_proc_down_c(): same arithmetic.
+  for (c = 0; c < cols; c++) {
+    uint16_t *s = &dst[c];
+    int sumsq = 0;
+    int sum = 0;
+    uint16_t d[16];
+    const int16_t *rv2 = rv3 + ((c * 17) & 127);
+    // Prime the running sums with rows -8..6 of this column.
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i * pitch] * s[i * pitch];
+      sum += s[i * pitch];
+    }
+    // s walks down the column; each step adds row +7 and drops row -8.
+    for (r = 0; r < rows + 8; r++) {
+      sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+      sum += s[7 * pitch] - s[-8 * pitch];
+      d[r & 15] = s[0];
+      // Flat window: average with an rv2[] entry as the rounding term.
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+      }
+      // NOTE(review): for r < 8 the entry read here was never written.
+      s[-8 * pitch] = d[(r - 8) & 15];
+      s += pitch;
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
+                                       YV12_BUFFER_CONFIG *post, int q,
+                                       int low_var_thresh, int flag) {
+  // Filters all three planes of source into post; the luma plane of post is
+  // then run through both mbpost passes. low_var_thresh and flag are unused.
+  const double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+  const int ppl = (int)(level + .5);
+  const int mb_limit = q2mbl(q);
+  (void) low_var_thresh;
+  (void) flag;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (!(source->flags & YV12_FLAG_HIGHBITDEPTH)) {
+    vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
+                                  source->y_stride, post->y_stride,
+                                  source->y_height, source->y_width, ppl);
+
+    vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+                              post->y_width, mb_limit);
+
+    vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+                         post->y_width, mb_limit);
+
+    vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
+                                  source->uv_stride, post->uv_stride,
+                                  source->uv_height, source->uv_width, ppl);
+    vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
+                                  source->uv_stride, post->uv_stride,
+                                  source->uv_height, source->uv_width, ppl);
+  } else {
+    vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->y_buffer),
+                                         CONVERT_TO_SHORTPTR(post->y_buffer),
+                                         source->y_stride, post->y_stride,
+                                         source->y_height, source->y_width,
+                                         ppl);
+
+    vp9_highbd_mbpost_proc_across_ip(CONVERT_TO_SHORTPTR(post->y_buffer),
+                                     post->y_stride, post->y_height,
+                                     post->y_width, mb_limit);
+
+    vp9_highbd_mbpost_proc_down(CONVERT_TO_SHORTPTR(post->y_buffer),
+                                post->y_stride, post->y_height,
+                                post->y_width, mb_limit);
+
+    vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->u_buffer),
+                                         CONVERT_TO_SHORTPTR(post->u_buffer),
+                                         source->uv_stride, post->uv_stride,
+                                         source->uv_height, source->uv_width,
+                                         ppl);
+    vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->v_buffer),
+                                         CONVERT_TO_SHORTPTR(post->v_buffer),
+                                         source->uv_stride, post->uv_stride,
+                                         source->uv_height, source->uv_width,
+                                         ppl);
+  }
+#else
+  vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
+                                source->y_stride, post->y_stride,
+                                source->y_height, source->y_width, ppl);
+
+  vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+                            post->y_width, mb_limit);
+
+  vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+                       post->y_width, mb_limit);
+
+  vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
+                                source->uv_stride, post->uv_stride,
+                                source->uv_height, source->uv_width, ppl);
+  vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
+                                source->uv_stride, post->uv_stride,
+                                source->uv_height, source->uv_width, ppl);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                 int q) {
+  // Filter level: a cubic in q, with 0.5 added before truncating to int.
+  const int level = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+                          + 0.0065 + 0.5);
+  const uint8_t *const from[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+  const int from_stride[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+  const int from_width[3] = {src->y_width, src->uv_width, src->uv_width};
+  const int from_height[3] = {src->y_height, src->uv_height, src->uv_height};
+  uint8_t *const into[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+  const int into_stride[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
+  int plane = 0;
+
+  while (plane < MAX_MB_PLANE) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
+           (dst->flags & YV12_FLAG_HIGHBITDEPTH));
+    if (!(src->flags & YV12_FLAG_HIGHBITDEPTH)) {
+      vp9_post_proc_down_and_across(from[plane], into[plane],
+                                    from_stride[plane], into_stride[plane],
+                                    from_height[plane], from_width[plane], level);
+    } else {
+      vp9_highbd_post_proc_down_and_across(
+          CONVERT_TO_SHORTPTR(from[plane]), CONVERT_TO_SHORTPTR(into[plane]),
+          from_stride[plane], into_stride[plane],
+          from_height[plane], from_width[plane], level);
+    }
+#else
+    vp9_post_proc_down_and_across(from[plane], into[plane],
+                                  from_stride[plane], into_stride[plane],
+                                  from_height[plane], from_width[plane], level);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    ++plane;
+  }
+}
+
+void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                 int q) {
+  // Filter level: a cubic in q, with 0.5 added before truncating to int.
+  const int level = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+                          + 0.0065 + 0.5);
+
+  // Per-plane views of both frames in Y, U, V order.
+  const uint8_t *const from[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+  const int from_stride[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+  const int from_width[3] = {src->y_width, src->uv_width, src->uv_width};
+  const int from_height[3] = {src->y_height, src->uv_height, src->uv_height};
+
+  uint8_t *const into[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+  const int into_stride[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
+  int plane;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    // Skip a two-pixel margin on every side: start two rows down and two
+    // columns in, and shrink both dimensions by four.
+    const int stride_in = from_stride[plane];
+    const int stride_out = into_stride[plane];
+    const int cols = from_width[plane] - 4;
+    const int rows = from_height[plane] - 4;
+    const uint8_t *const first_in = from[plane] + 2 * stride_in + 2;
+    uint8_t *const first_out = into[plane] + 2 * stride_out + 2;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    // Both frames must agree on whether they hold high bit depth samples.
+    assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
+           (dst->flags & YV12_FLAG_HIGHBITDEPTH));
+
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(first_in),
+                                           CONVERT_TO_SHORTPTR(first_out),
+                                           stride_in, stride_out,
+                                           rows, cols, level);
+    } else {
+      vp9_post_proc_down_and_across(first_in, first_out, stride_in,
+                                    stride_out, rows, cols, level);
+    }
+#else
+    vp9_post_proc_down_and_across(first_in, first_out, stride_in, stride_out,
+                                  rows, cols, level);
+#endif
+  }
+}
+
+static double gaussian(double sigma, double mu, double x) {
+  const double norm = 1 / (sigma * sqrt(2.0 * 3.14159265));  // value at x == mu
+  return norm * (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
+
+static void fillrd(struct postproc_state *state, int q, int a) {
+  char char_dist[300];
+  // Rebuilds state->noise and the three clamp arrays for the given q and a.
+  double sigma;
+  int ai = a, qi = q, i;
+
+  vpx_clear_system_state();
+  // sigma grows with a and with (63 - q).
+  sigma = ai + .5 + .6 * (63 - qi) / 63.0;
+
+  /* Build a lookup table of 256 entries that follows a gaussian with the
+   * sigma above: each i in [-32, 32) is repeated (int)(0.5 + 256 * pdf(i))
+   * times, in increasing order of i. */
+  {
+    int next, j;
+
+    next = 0;
+
+    for (i = -32; i < 32; i++) {
+      int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
+
+      if (a_i) {
+        for (j = 0; j < a_i; j++) {
+          char_dist[next + j] = (char) i;
+        }
+        // j == a_i here, so this advances past the run just written.
+        next = next + j;
+      }
+    }
+    // Pad with zeros if rounding left fewer than 256 entries.
+    for (; next < 256; next++)
+      char_dist[next] = 0;
+  }
+  // Draw 3072 samples by indexing the table with random bytes.
+  for (i = 0; i < 3072; i++) {
+    state->noise[i] = char_dist[rand() & 0xff];  // NOLINT
+  }
+  // All 16 entries get the same clamp, derived from the first table entry.
+  for (i = 0; i < 16; i++) {
+    state->blackclamp[i] = -char_dist[0];
+    state->whiteclamp[i] = -char_dist[0];
+    state->bothclamp[i] = -2 * char_dist[0];
+  }
+  // Record the arguments, exactly as passed in, that this table was built for.
+  state->last_q = q;
+  state->last_noise = a;
+}
+
+void vp9_plane_add_noise_c(uint8_t *start, char *noise,
+                           char blackclamp[16],
+                           char whiteclamp[16],
+                           char bothclamp[16],
+                           unsigned int width, unsigned int height, int pitch) {
+  unsigned int i, j;
+  // Adds noise to a width x height plane in place, one row at a time.
+  // TODO(jbb): why does simd code use both but c doesn't,  normalize and
+  // fix..
+  (void) bothclamp;
+  for (i = 0; i < height; i++) {
+    uint8_t *pos = start + i * pitch;
+    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
+    // Each row reads the noise table from a random offset in [0, 255].
+    for (j = 0; j < width; j++) {
+      if (pos[j] < blackclamp[0])
+        pos[j] = blackclamp[0];
+      // Leave whiteclamp[0] of headroom so adding noise cannot wrap past 255.
+      if (pos[j] > 255 - whiteclamp[0])
+        pos[j] = 255 - whiteclamp[0];
+
+      pos[j] += ref[j];
+    }
+  }
+}
+
+static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
+  struct postproc_state *const pp = &cm->postproc_state;
+  MODE_INFO *const shown = cm->mip;
+  // Trade the arrays: this frame's mip serves as prev_mip for the next one.
+  cm->mip = pp->prev_mip;
+  pp->prev_mip = shown;
+  // Re-derive the upper left visible entry of each array.
+  pp->prev_mi = pp->prev_mip + cm->mi_stride + 1;
+  cm->mi = cm->mip + cm->mi_stride + 1;
+}
+
+int vp9_post_proc_frame(struct VP9Common *cm,
+                        YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
+  const int q = VPXMIN(105, cm->lf.filter_level * 2);
+  const int flags = ppflags->post_proc_flag;
+  YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer;
+  struct postproc_state *const ppstate = &cm->postproc_state;
+  // Runs the filters selected by ppflags; *dest receives the output frame.
+  if (!cm->frame_to_show)
+    return -1;  // nothing to post-process
+  // With no flags set the frame to show is handed back unfiltered.
+  if (!flags) {
+    *dest = *cm->frame_to_show;
+    return 0;
+  }
+
+  vpx_clear_system_state();
+
+  // Alloc memory for prev_mip in the first frame.
+  if (cm->current_video_frame == 1) {
+    cm->postproc_state.last_base_qindex = cm->base_qindex;
+    cm->postproc_state.last_frame_valid = 1;
+    ppstate->prev_mip = vpx_calloc(cm->mi_alloc_size, sizeof(*cm->mip));
+    if (!ppstate->prev_mip) {
+      return 1;  // allocation failed
+    }
+    ppstate->prev_mi = ppstate->prev_mip + cm->mi_stride + 1;
+    memset(ppstate->prev_mip, 0,
+           cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+  }
+
+  // Allocate post_proc_buffer_int if needed.
+  if ((flags & VP9D_MFQE) && !cm->post_proc_buffer_int.buffer_alloc) {
+    if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+      const int width = ALIGN_POWER_OF_TWO(cm->width, 4);
+      const int height = ALIGN_POWER_OF_TWO(cm->height, 4);
+
+      if (vpx_alloc_frame_buffer(&cm->post_proc_buffer_int, width, height,
+                                 cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 cm->use_highbitdepth,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 cm->byte_alignment) < 0) {
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate MFQE framebuffer");
+      }
+
+      // Fill the new buffer, sized by its own frame_size, with the value 128
+      // so that post proc doesn't pull random data in from edge.
+      memset(cm->post_proc_buffer_int.buffer_alloc, 128,
+             cm->post_proc_buffer_int.frame_size);
+    }
+  }
+
+  if (vpx_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL) < 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate post-processing buffer");
+  // MFQE needs 8-bit content and a base_qindex rise of at least q_diff_thresh.
+  if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
+      cm->postproc_state.last_frame_valid && cm->bit_depth == 8 &&
+      cm->postproc_state.last_base_qindex <= last_q_thresh &&
+      cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) {
+    vp9_mfqe(cm);
+    // TODO(jackychen): Consider whether enable deblocking by default
+    // if mfqe is enabled. Need to take both the quality and the speed
+    // into consideration.
+    if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+      vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
+    }
+    if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
+      deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
+                                 q + (ppflags->deblocking_level - 5) * 10,
+                                 1, 0);
+    } else if (flags & VP9D_DEBLOCK) {
+      vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q);
+    } else {
+      vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
+    }
+  } else if (flags & VP9D_DEMACROBLOCK) {
+    deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
+                               q + (ppflags->deblocking_level - 5) * 10, 1, 0);
+  } else if (flags & VP9D_DEBLOCK) {
+    vp9_deblock(cm->frame_to_show, ppbuf, q);
+  } else {
+    vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
+  }
+  // Remember this frame's base_qindex for the next MFQE decision.
+  cm->postproc_state.last_base_qindex = cm->base_qindex;
+  cm->postproc_state.last_frame_valid = 1;
+
+  if (flags & VP9D_ADDNOISE) {
+    const int noise_level = ppflags->noise_level;
+    // fillrd() records the q it is given, so compare against that same value.
+    if (ppstate->last_q != 63 - q || ppstate->last_noise != noise_level) {
+      fillrd(ppstate, 63 - q, noise_level);
+    }
+
+    vp9_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
+                        ppstate->whiteclamp, ppstate->bothclamp,
+                        ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
+  }
+
+  *dest = *ppbuf;
+
+  /* handle problem with extending borders */
+  dest->y_width = cm->width;
+  dest->y_height = cm->height;
+  dest->uv_width = dest->y_width >> cm->subsampling_x;
+  dest->uv_height = dest->y_height >> cm->subsampling_y;
+
+  swap_mi_and_prev_mi(cm);
+  return 0;
+}
+#endif  // CONFIG_VP9_POSTPROC
diff --git a/libs/libvpx/vp9/common/vp9_postproc.h b/libs/libvpx/vp9/common/vp9_postproc.h
new file mode 100644
index 0000000000..035c9cdf84
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_postproc.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_COMMON_VP9_POSTPROC_H_
+#define VP9_COMMON_VP9_POSTPROC_H_
+
+#include "vpx_ports/mem.h"
+#include "vpx_scale/yv12config.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_mfqe.h"
+#include "vp9/common/vp9_ppflags.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct postproc_state {
+  int last_q;            // q argument of the most recent fillrd() call
+  int last_noise;        // noise level argument of that same call
+  char noise[3072];      // noise samples drawn by fillrd()
+  int last_base_qindex;  // base_qindex recorded by vp9_post_proc_frame()
+  int last_frame_valid;  // set to 1 by vp9_post_proc_frame()
+  MODE_INFO *prev_mip;   // previous frame's mode info array, border included
+  MODE_INFO *prev_mi;    // prev_mip + mi_stride + 1
+  DECLARE_ALIGNED(16, char, blackclamp[16]);  // clamps filled by fillrd()
+  DECLARE_ALIGNED(16, char, whiteclamp[16]);
+  DECLARE_ALIGNED(16, char, bothclamp[16]);
+};
+
+struct VP9Common;
+
+#define MFQE_PRECISION 4
+// Writes the post-processed frame to *dest; returns 0 on success.
+int vp9_post_proc_frame(struct VP9Common *cm,
+                        YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
+// As vp9_deblock(), but skips a two-pixel margin around every plane.
+void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
+// Runs the down-and-across filter over all three planes of src into dst.
+void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_POSTPROC_H_
diff --git a/libs/libvpx/vp9/common/vp9_ppflags.h b/libs/libvpx/vp9/common/vp9_ppflags.h
new file mode 100644
index 0000000000..12b989f43a
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_ppflags.h
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_PPFLAGS_H_
+#define VP9_COMMON_VP9_PPFLAGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {  // bit flags tested against vp9_ppflags_t.post_proc_flag
+  VP9D_NOFILTERING            = 0,
+  VP9D_DEBLOCK                = 1 << 0,  // selects vp9_deblock()
+  VP9D_DEMACROBLOCK           = 1 << 1,  // deblock plus the mbpost passes
+  VP9D_ADDNOISE               = 1 << 2,  // add noise to the luma plane
+  VP9D_DEBUG_TXT_FRAME_INFO   = 1 << 3,
+  VP9D_DEBUG_TXT_MBLK_MODES   = 1 << 4,
+  VP9D_DEBUG_TXT_DC_DIFF      = 1 << 5,
+  VP9D_DEBUG_TXT_RATE_INFO    = 1 << 6,
+  VP9D_DEBUG_DRAW_MV          = 1 << 7,
+  VP9D_DEBUG_CLR_BLK_MODES    = 1 << 8,
+  VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9,
+  VP9D_MFQE                   = 1 << 10  // allows vp9_mfqe() to run
+};
+
+typedef struct {
+  int post_proc_flag;    // OR of the VP9D_* flags above
+  int deblocking_level;  // demacroblock path adds (level - 5) * 10 to q
+  int noise_level;       // noise strength used with VP9D_ADDNOISE
+} vp9_ppflags_t;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_PPFLAGS_H_
diff --git a/libs/libvpx/vp9/common/vp9_pred_common.c b/libs/libvpx/vp9/common/vp9_pred_common.c
new file mode 100644
index 0000000000..c201890a8b
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_pred_common.c
@@ -0,0 +1,339 @@
+
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_seg_common.h"
+
+// Returns a context number for the given MB prediction signal
+int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
+  // A neighbor that is unavailable or not inter coded contributes
+  // SWITCHABLE_FILTERS in place of a real filter type.
+  int left_type = SWITCHABLE_FILTERS;
+  int above_type = SWITCHABLE_FILTERS;
+
+  if (xd->left_available && is_inter_block(xd->left_mi))
+    left_type = xd->left_mi->interp_filter;
+  if (xd->up_available && is_inter_block(xd->above_mi))
+    above_type = xd->above_mi->interp_filter;
+
+  // Agreement wins outright; otherwise a lone real filter type is used, and
+  // two different real types fall back to SWITCHABLE_FILTERS.
+  if (left_type == above_type)
+    return left_type;
+  if (left_type == SWITCHABLE_FILTERS)
+    return above_type;
+  if (above_type == SWITCHABLE_FILTERS)
+    return left_type;
+  return SWITCHABLE_FILTERS;
+}
+
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real macroblocks.
+// The prediction flags in these dummy entries are initialized to 0.
+// 0 - inter/inter, inter/--, --/inter, --/--
+// 1 - intra/inter, inter/intra
+// 2 - intra/--, --/intra
+// 3 - intra/intra
+int vp9_get_intra_inter_context(const MACROBLOCKD *xd) {
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  if (!has_above && !has_left)
+    return 0;
+
+  if (has_above != 0 && has_left != 0) {
+    // Two neighbors: count the intra ones, mapping a count of two to 3.
+    const int intra_count =
+        !is_inter_block(xd->above_mi) + !is_inter_block(xd->left_mi);
+    return intra_count == 2 ? 3 : intra_count;
+  }
+
+  // Exactly one neighbor: 2 when it is intra coded, 0 otherwise.
+  return 2 * !is_inter_block(has_above ? xd->above_mi : xd->left_mi);
+}
+
+int vp9_get_reference_mode_context(const VP9_COMMON *cm,
+                                   const MACROBLOCKD *xd) {
+  int ctx;  // context derived from the above/left neighbors' references
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  if (has_above && has_left) {  // both edges available
+    if (!has_second_ref(above_mi) && !has_second_ref(left_mi))
+      // neither edge uses comp pred (0/1): XOR of "first ref is fixed ref"
+      ctx = (above_mi->ref_frame[0] == cm->comp_fixed_ref) ^
+            (left_mi->ref_frame[0] == cm->comp_fixed_ref);
+    else if (!has_second_ref(above_mi))
+      // left uses comp pred, above does not (2/3)
+      ctx = 2 + (above_mi->ref_frame[0] == cm->comp_fixed_ref ||
+                 !is_inter_block(above_mi));
+    else if (!has_second_ref(left_mi))
+      // above uses comp pred, left does not (2/3)
+      ctx = 2 + (left_mi->ref_frame[0] == cm->comp_fixed_ref ||
+                 !is_inter_block(left_mi));
+    else  // both edges use comp pred (4)
+      ctx = 4;
+  } else if (has_above || has_left) {  // one edge available
+    const MODE_INFO *edge_mi = has_above ? above_mi : left_mi;
+    // Only the available neighbor is examined.
+    if (!has_second_ref(edge_mi))
+      // edge does not use comp pred (0/1)
+      ctx = edge_mi->ref_frame[0] == cm->comp_fixed_ref;
+    else
+      // edge uses comp pred (3)
+      ctx = 3;
+  } else {  // no edges available (1)
+    ctx = 1;
+  }
+  assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS);
+  return ctx;
+}
+
+// Returns a context number derived from the neighbors' reference frames
+int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
+                                    const MACROBLOCKD *xd) {
+  int pred_context;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+  const int var_ref_idx = !fix_ref_idx;
+  // var_ref_idx: the ref_frame[] slot compared for compound neighbors.
+  if (above_in_image && left_in_image) {  // both edges available
+    const int above_intra = !is_inter_block(above_mi);
+    const int left_intra = !is_inter_block(left_mi);
+
+    if (above_intra && left_intra) {  // intra/intra (2)
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MODE_INFO *edge_mi = above_intra ? left_mi : above_mi;
+      // edge_mi is the one inter coded neighbor.
+      if (!has_second_ref(edge_mi))  // single pred (1/3)
+        pred_context = 1 + 2 * (edge_mi->ref_frame[0] != cm->comp_var_ref[1]);
+      else  // comp pred (1/3)
+        pred_context = 1 + 2 * (edge_mi->ref_frame[var_ref_idx]
+                                    != cm->comp_var_ref[1]);
+    } else {  // inter/inter
+      const int l_sg = !has_second_ref(left_mi);
+      const int a_sg = !has_second_ref(above_mi);
+      const MV_REFERENCE_FRAME vrfa = a_sg ? above_mi->ref_frame[0]
+                                           : above_mi->ref_frame[var_ref_idx];
+      const MV_REFERENCE_FRAME vrfl = l_sg ? left_mi->ref_frame[0]
+                                           : left_mi->ref_frame[var_ref_idx];
+      // vrfa / vrfl: the reference compared for the above / left neighbor.
+      if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) {
+        pred_context = 0;
+      } else if (l_sg && a_sg) {  // single/single
+        if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) ||
+            (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0]))
+          pred_context = 4;
+        else if (vrfa == vrfl)
+          pred_context = 3;
+        else
+          pred_context = 1;
+      } else if (l_sg || a_sg) {  // single/comp
+        const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
+        const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
+        if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1])
+          pred_context = 1;
+        else if (rfs == cm->comp_var_ref[1] && vrfc != cm->comp_var_ref[1])
+          pred_context = 2;
+        else
+          pred_context = 4;
+      } else if (vrfa == vrfl) {  // comp/comp
+        pred_context = 4;
+      } else {
+        pred_context = 2;
+      }
+    }
+  } else if (above_in_image || left_in_image) {  // one edge available
+    const MODE_INFO *edge_mi = above_in_image ? above_mi : left_mi;
+
+    if (!is_inter_block(edge_mi)) {
+      pred_context = 2;
+    } else {
+      if (has_second_ref(edge_mi))
+        pred_context = 4 * (edge_mi->ref_frame[var_ref_idx]
+                              != cm->comp_var_ref[1]);
+      else
+        pred_context = 3 * (edge_mi->ref_frame[0] != cm->comp_var_ref[1]);
+    }
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+  return pred_context;
+}
+
+int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
+  int pred_context;  // result, checked against REF_CONTEXTS below
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mi);
+    const int left_intra = !is_inter_block(left_mi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MODE_INFO *edge_mi = above_intra ? left_mi : above_mi;
+      if (!has_second_ref(edge_mi))
+        pred_context = 4 * (edge_mi->ref_frame[0] == LAST_FRAME);
+      else
+        pred_context = 1 + (edge_mi->ref_frame[0] == LAST_FRAME ||
+                            edge_mi->ref_frame[1] == LAST_FRAME);
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mi);
+      const int left_has_second = has_second_ref(left_mi);
+      const MV_REFERENCE_FRAME above0 = above_mi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mi->ref_frame[1];
+      // Each case below asks which neighbor refs equal LAST_FRAME.
+      if (above_has_second && left_has_second) {
+        pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME ||
+                            left0 == LAST_FRAME || left1 == LAST_FRAME);
+      } else if (above_has_second || left_has_second) {
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+        // rfs: ref of the single-ref neighbor; crf1/crf2: the other's refs.
+        if (rfs == LAST_FRAME)
+          pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+        else
+          pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+      } else {
+        pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME);
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MODE_INFO *edge_mi = has_above ? above_mi : left_mi;
+    if (!is_inter_block(edge_mi)) {  // intra
+      pred_context = 2;
+    } else {  // inter
+      if (!has_second_ref(edge_mi))
+        pred_context = 4 * (edge_mi->ref_frame[0] == LAST_FRAME);
+      else
+        pred_context = 1 + (edge_mi->ref_frame[0] == LAST_FRAME ||
+                            edge_mi->ref_frame[1] == LAST_FRAME);
+    }
+  } else {  // no edges available
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
+  int pred_context;  // result, checked against REF_CONTEXTS below
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mi);
+    const int left_intra = !is_inter_block(left_mi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MODE_INFO *edge_mi = above_intra ? left_mi : above_mi;
+      if (!has_second_ref(edge_mi)) {
+        if (edge_mi->ref_frame[0] == LAST_FRAME)
+          pred_context = 3;
+        else
+          pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME);
+      } else {
+        pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME ||
+                                edge_mi->ref_frame[1] == GOLDEN_FRAME);
+      }
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mi);
+      const int left_has_second = has_second_ref(left_mi);
+      const MV_REFERENCE_FRAME above0 = above_mi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mi->ref_frame[1];
+      // Cases below key on GOLDEN_FRAME, with LAST/ALTREF special cases.
+      if (above_has_second && left_has_second) {
+        if (above0 == left0 && above1 == left1)
+          pred_context = 3 * (above0 == GOLDEN_FRAME ||
+                              above1 == GOLDEN_FRAME ||
+                              left0 == GOLDEN_FRAME ||
+                              left1 == GOLDEN_FRAME);
+        else
+          pred_context = 2;
+      } else if (above_has_second || left_has_second) {
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+        // rfs: ref of the single-ref neighbor; crf1/crf2: the other's refs.
+        if (rfs == GOLDEN_FRAME)
+          pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+        else if (rfs == ALTREF_FRAME)
+          pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
+        else
+          pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+      } else {
+        if (above0 == LAST_FRAME && left0 == LAST_FRAME) {
+          pred_context = 3;
+        } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) {
+          const MV_REFERENCE_FRAME edge0 = (above0 == LAST_FRAME) ? left0
+                                                                  : above0;
+          pred_context = 4 * (edge0 == GOLDEN_FRAME);
+        } else {
+          pred_context = 2 * (above0 == GOLDEN_FRAME) +
+                             2 * (left0 == GOLDEN_FRAME);
+        }
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MODE_INFO *edge_mi = has_above ? above_mi : left_mi;
+
+    if (!is_inter_block(edge_mi) ||
+        (edge_mi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mi)))
+      pred_context = 2;
+    else if (!has_second_ref(edge_mi))
+      pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME);
+    else
+      pred_context = 3 * (edge_mi->ref_frame[0] == GOLDEN_FRAME ||
+                          edge_mi->ref_frame[1] == GOLDEN_FRAME);
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
diff --git a/libs/libvpx/vp9/common/vp9_pred_common.h b/libs/libvpx/vp9/common/vp9_pred_common.h
new file mode 100644
index 0000000000..254cb8b749
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_pred_common.h
@@ -0,0 +1,171 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_PRED_COMMON_H_
+#define VP9_COMMON_VP9_PRED_COMMON_H_
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Smallest segment id within the block at (mi_row, mi_col), clipped to frame.
+static INLINE int get_segment_id(const VP9_COMMON *cm,
+                                 const uint8_t *segment_ids,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  const int cols = VPXMIN(cm->mi_cols - mi_col,
+                          num_8x8_blocks_wide_lookup[bsize]);
+  const int rows = VPXMIN(cm->mi_rows - mi_row,
+                          num_8x8_blocks_high_lookup[bsize]);
+  const int base = mi_row * cm->mi_cols + mi_col;
+  int r, c, min_id = MAX_SEGMENTS;
+  for (r = 0; r < rows; ++r) {
+    const int row_base = base + r * cm->mi_cols;
+    for (c = 0; c < cols; ++c)
+      min_id = VPXMIN(min_id, segment_ids[row_base + c]);
+  }
+  assert(min_id >= 0 && min_id < MAX_SEGMENTS);
+  return min_id;
+}
+
+// Context for the segment-id-predicted flag: the sum of seg_id_predicted
+// over the above and left neighbours, where a NULL neighbour adds 0.
+static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
+  int ctx = 0;
+  if (xd->above_mi != NULL) ctx += xd->above_mi->seg_id_predicted;
+  if (xd->left_mi != NULL) ctx += xd->left_mi->seg_id_predicted;
+
+  return ctx;
+}
+// Looks up seg->pred_probs at the context from vp9_get_pred_context_seg_id().
+static INLINE vpx_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg,
+                                                const MACROBLOCKD *xd) {
+  return seg->pred_probs[vp9_get_pred_context_seg_id(xd)];
+}
+
+// Sums the skip flags of the above and left neighbours (0 when NULL).
+static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) {
+  int ctx = 0;
+  if (xd->above_mi != NULL) ctx += xd->above_mi->skip;
+  if (xd->left_mi != NULL) ctx += xd->left_mi->skip;
+  return ctx;
+}
+// Looks up cm->fc->skip_probs at the context from vp9_get_skip_context().
+static INLINE vpx_prob vp9_get_skip_prob(const VP9_COMMON *cm,
+                                         const MACROBLOCKD *xd) {
+  return cm->fc->skip_probs[vp9_get_skip_context(xd)];
+}
+
+int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
+
+int vp9_get_intra_inter_context(const MACROBLOCKD *xd);
+// cm->fc->intra_inter_prob[ctx], ctx from vp9_get_intra_inter_context().
+static INLINE vpx_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm,
+                                                const MACROBLOCKD *xd) {
+  return cm->fc->intra_inter_prob[vp9_get_intra_inter_context(xd)];
+}
+
+int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd);
+// cm->fc->comp_inter_prob[ctx], ctx from vp9_get_reference_mode_context().
+static INLINE vpx_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm,
+                                                   const MACROBLOCKD *xd) {
+  return cm->fc->comp_inter_prob[vp9_get_reference_mode_context(cm, xd)];
+}
+
+int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
+                                    const MACROBLOCKD *xd);
+
+// cm->fc->comp_ref_prob[ctx], ctx from vp9_get_pred_context_comp_ref_p().
+static INLINE vpx_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
+                                                    const MACROBLOCKD *xd) {
+  return cm->fc->comp_ref_prob[vp9_get_pred_context_comp_ref_p(cm, xd)];
+}
+
+int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
+// single_ref_prob[ctx][0], ctx from vp9_get_pred_context_single_ref_p1().
+static INLINE vpx_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm,
+                                                       const MACROBLOCKD *xd) {
+  return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0];
+}
+
+int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
+// single_ref_prob[ctx][1], ctx from vp9_get_pred_context_single_ref_p2().
+static INLINE vpx_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
+                                                       const MACROBLOCKD *xd) {
+  return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
+}
+
+// Returns the tx-size context (0 or 1): 1 when the summed tx sizes of the
+// above and left neighbours exceed max_txsize_lookup[] for this block's size.
+// A skipped or unavailable neighbour counts as that maximum, except that a
+// single unavailable side takes the other side's value instead.
+static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
+  const int max_tx_size = max_txsize_lookup[xd->mi[0]->sb_type];
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+  int above_ctx = max_tx_size;
+  int left_ctx = max_tx_size;
+
+  if (has_above && !xd->above_mi->skip)
+    above_ctx = (int)xd->above_mi->tx_size;
+  if (has_left && !xd->left_mi->skip)
+    left_ctx = (int)xd->left_mi->tx_size;
+  // Mirror the available side onto a missing one (left first, then above).
+  if (!has_left)
+    left_ctx = above_ctx;
+  if (!has_above)
+    above_ctx = left_ctx;
+  return (above_ctx + left_ctx) > max_tx_size;
+}
+
+// Selects the probability row for ctx from the table matching max_tx_size;
+// any size other than 8x8, 16x16 or 32x32 asserts and yields NULL.
+static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
+                                           const struct tx_probs *tx_probs) {
+  if (max_tx_size == TX_8X8)
+    return tx_probs->p8x8[ctx];
+  if (max_tx_size == TX_16X16)
+    return tx_probs->p16x16[ctx];
+  if (max_tx_size == TX_32X32)
+    return tx_probs->p32x32[ctx];
+
+  assert(0 && "Invalid max_tx_size.");
+  return NULL;
+}
+// get_tx_probs() with the context computed by get_tx_size_context(xd).
+static INLINE const vpx_prob *get_tx_probs2(TX_SIZE max_tx_size,
+                                            const MACROBLOCKD *xd,
+                                            const struct tx_probs *tx_probs) {
+  return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs);
+}
+
+// Selects the counter row for ctx from the table matching max_tx_size;
+// any size other than 8x8, 16x16 or 32x32 asserts and yields NULL.
+static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
+                                          struct tx_counts *tx_counts) {
+  if (max_tx_size == TX_8X8)
+    return tx_counts->p8x8[ctx];
+  if (max_tx_size == TX_16X16)
+    return tx_counts->p16x16[ctx];
+  if (max_tx_size == TX_32X32)
+    return tx_counts->p32x32[ctx];
+
+  assert(0 && "Invalid max_tx_size.");
+  return NULL;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_PRED_COMMON_H_
diff --git a/libs/libvpx/vp9/common/vp9_quant_common.c b/libs/libvpx/vp9/common/vp9_quant_common.c
new file mode 100644
index 0000000000..d83f3c1a2f
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_quant_common.c
@@ -0,0 +1,278 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
+
+static const int16_t dc_qlookup[QINDEX_RANGE] = {
+  4,       8,    8,    9,   10,   11,   12,   12,
+  13,     14,   15,   16,   17,   18,   19,   19,
+  20,     21,   22,   23,   24,   25,   26,   26,
+  27,     28,   29,   30,   31,   32,   32,   33,
+  34,     35,   36,   37,   38,   38,   39,   40,
+  41,     42,   43,   43,   44,   45,   46,   47,
+  48,     48,   49,   50,   51,   52,   53,   53,
+  54,     55,   56,   57,   57,   58,   59,   60,
+  61,     62,   62,   63,   64,   65,   66,   66,
+  67,     68,   69,   70,   70,   71,   72,   73,
+  74,     74,   75,   76,   77,   78,   78,   79,
+  80,     81,   81,   82,   83,   84,   85,   85,
+  87,     88,   90,   92,   93,   95,   96,   98,
+  99,    101,  102,  104,  105,  107,  108,  110,
+  111,   113,  114,  116,  117,  118,  120,  121,
+  123,   125,  127,  129,  131,  134,  136,  138,
+  140,   142,  144,  146,  148,  150,  152,  154,
+  156,   158,  161,  164,  166,  169,  172,  174,
+  177,   180,  182,  185,  187,  190,  192,  195,
+  199,   202,  205,  208,  211,  214,  217,  220,
+  223,   226,  230,  233,  237,  240,  243,  247,
+  250,   253,  257,  261,  265,  269,  272,  276,
+  280,   284,  288,  292,  296,  300,  304,  309,
+  313,   317,  322,  326,  330,  335,  340,  344,
+  349,   354,  359,  364,  369,  374,  379,  384,
+  389,   395,  400,  406,  411,  417,  423,  429,
+  435,   441,  447,  454,  461,  467,  475,  482,
+  489,   497,  505,  513,  522,  530,  539,  549,
+  559,   569,  579,  590,  602,  614,  626,  640,
+  654,   668,  684,  700,  717,  736,  755,  775,
+  796,   819,  843,  869,  896,  925,  955,  988,
+  1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static const int16_t dc_qlookup_10[QINDEX_RANGE] = {
+  4,     9,    10,    13,    15,    17,    20,    22,
+  25,    28,    31,    34,    37,    40,    43,    47,
+  50,    53,    57,    60,    64,    68,    71,    75,
+  78,    82,    86,    90,    93,    97,   101,   105,
+  109,   113,   116,   120,   124,   128,   132,   136,
+  140,   143,   147,   151,   155,   159,   163,   166,
+  170,   174,   178,   182,   185,   189,   193,   197,
+  200,   204,   208,   212,   215,   219,   223,   226,
+  230,   233,   237,   241,   244,   248,   251,   255,
+  259,   262,   266,   269,   273,   276,   280,   283,
+  287,   290,   293,   297,   300,   304,   307,   310,
+  314,   317,   321,   324,   327,   331,   334,   337,
+  343,   350,   356,   362,   369,   375,   381,   387,
+  394,   400,   406,   412,   418,   424,   430,   436,
+  442,   448,   454,   460,   466,   472,   478,   484,
+  490,   499,   507,   516,   525,   533,   542,   550,
+  559,   567,   576,   584,   592,   601,   609,   617,
+  625,   634,   644,   655,   666,   676,   687,   698,
+  708,   718,   729,   739,   749,   759,   770,   782,
+  795,   807,   819,   831,   844,   856,   868,   880,
+  891,   906,   920,   933,   947,   961,   975,   988,
+  1001,  1015,  1030,  1045,  1061,  1076,  1090,  1105,
+  1120,  1137,  1153,  1170,  1186,  1202,  1218,  1236,
+  1253,  1271,  1288,  1306,  1323,  1342,  1361,  1379,
+  1398,  1416,  1436,  1456,  1476,  1496,  1516,  1537,
+  1559,  1580,  1601,  1624,  1647,  1670,  1692,  1717,
+  1741,  1766,  1791,  1817,  1844,  1871,  1900,  1929,
+  1958,  1990,  2021,  2054,  2088,  2123,  2159,  2197,
+  2236,  2276,  2319,  2363,  2410,  2458,  2508,  2561,
+  2616,  2675,  2737,  2802,  2871,  2944,  3020,  3102,
+  3188,  3280,  3375,  3478,  3586,  3702,  3823,  3953,
+  4089,  4236,  4394,  4559,  4737,  4929,  5130,  5347,
+};
+
+static const int16_t dc_qlookup_12[QINDEX_RANGE] = {
+  4,    12,    18,    25,    33,    41,    50,    60,
+  70,    80,    91,   103,   115,   127,   140,   153,
+  166,   180,   194,   208,   222,   237,   251,   266,
+  281,   296,   312,   327,   343,   358,   374,   390,
+  405,   421,   437,   453,   469,   484,   500,   516,
+  532,   548,   564,   580,   596,   611,   627,   643,
+  659,   674,   690,   706,   721,   737,   752,   768,
+  783,   798,   814,   829,   844,   859,   874,   889,
+  904,   919,   934,   949,   964,   978,   993,  1008,
+  1022,  1037,  1051,  1065,  1080,  1094,  1108,  1122,
+  1136,  1151,  1165,  1179,  1192,  1206,  1220,  1234,
+  1248,  1261,  1275,  1288,  1302,  1315,  1329,  1342,
+  1368,  1393,  1419,  1444,  1469,  1494,  1519,  1544,
+  1569,  1594,  1618,  1643,  1668,  1692,  1717,  1741,
+  1765,  1789,  1814,  1838,  1862,  1885,  1909,  1933,
+  1957,  1992,  2027,  2061,  2096,  2130,  2165,  2199,
+  2233,  2267,  2300,  2334,  2367,  2400,  2434,  2467,
+  2499,  2532,  2575,  2618,  2661,  2704,  2746,  2788,
+  2830,  2872,  2913,  2954,  2995,  3036,  3076,  3127,
+  3177,  3226,  3275,  3324,  3373,  3421,  3469,  3517,
+  3565,  3621,  3677,  3733,  3788,  3843,  3897,  3951,
+  4005,  4058,  4119,  4181,  4241,  4301,  4361,  4420,
+  4479,  4546,  4612,  4677,  4742,  4807,  4871,  4942,
+  5013,  5083,  5153,  5222,  5291,  5367,  5442,  5517,
+  5591,  5665,  5745,  5825,  5905,  5984,  6063,  6149,
+  6234,  6319,  6404,  6495,  6587,  6678,  6769,  6867,
+  6966,  7064,  7163,  7269,  7376,  7483,  7599,  7715,
+  7832,  7958,  8085,  8214,  8352,  8492,  8635,  8788,
+  8945,  9104,  9275,  9450,  9639,  9832, 10031, 10245,
+  10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409,
+  12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812,
+  16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387,
+};
+#endif
+
+static const int16_t ac_qlookup[QINDEX_RANGE] = {
+  4,       8,    9,   10,   11,   12,   13,   14,
+  15,     16,   17,   18,   19,   20,   21,   22,
+  23,     24,   25,   26,   27,   28,   29,   30,
+  31,     32,   33,   34,   35,   36,   37,   38,
+  39,     40,   41,   42,   43,   44,   45,   46,
+  47,     48,   49,   50,   51,   52,   53,   54,
+  55,     56,   57,   58,   59,   60,   61,   62,
+  63,     64,   65,   66,   67,   68,   69,   70,
+  71,     72,   73,   74,   75,   76,   77,   78,
+  79,     80,   81,   82,   83,   84,   85,   86,
+  87,     88,   89,   90,   91,   92,   93,   94,
+  95,     96,   97,   98,   99,  100,  101,  102,
+  104,   106,  108,  110,  112,  114,  116,  118,
+  120,   122,  124,  126,  128,  130,  132,  134,
+  136,   138,  140,  142,  144,  146,  148,  150,
+  152,   155,  158,  161,  164,  167,  170,  173,
+  176,   179,  182,  185,  188,  191,  194,  197,
+  200,   203,  207,  211,  215,  219,  223,  227,
+  231,   235,  239,  243,  247,  251,  255,  260,
+  265,   270,  275,  280,  285,  290,  295,  300,
+  305,   311,  317,  323,  329,  335,  341,  347,
+  353,   359,  366,  373,  380,  387,  394,  401,
+  408,   416,  424,  432,  440,  448,  456,  465,
+  474,   483,  492,  501,  510,  520,  530,  540,
+  550,   560,  571,  582,  593,  604,  615,  627,
+  639,   651,  663,  676,  689,  702,  715,  729,
+  743,   757,  771,  786,  801,  816,  832,  848,
+  864,   881,  898,  915,  933,  951,  969,  988,
+  1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151,
+  1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
+  1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567,
+  1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static const int16_t ac_qlookup_10[QINDEX_RANGE] = {
+  4,     9,    11,    13,    16,    18,    21,    24,
+  27,    30,    33,    37,    40,    44,    48,    51,
+  55,    59,    63,    67,    71,    75,    79,    83,
+  88,    92,    96,   100,   105,   109,   114,   118,
+  122,   127,   131,   136,   140,   145,   149,   154,
+  158,   163,   168,   172,   177,   181,   186,   190,
+  195,   199,   204,   208,   213,   217,   222,   226,
+  231,   235,   240,   244,   249,   253,   258,   262,
+  267,   271,   275,   280,   284,   289,   293,   297,
+  302,   306,   311,   315,   319,   324,   328,   332,
+  337,   341,   345,   349,   354,   358,   362,   367,
+  371,   375,   379,   384,   388,   392,   396,   401,
+  409,   417,   425,   433,   441,   449,   458,   466,
+  474,   482,   490,   498,   506,   514,   523,   531,
+  539,   547,   555,   563,   571,   579,   588,   596,
+  604,   616,   628,   640,   652,   664,   676,   688,
+  700,   713,   725,   737,   749,   761,   773,   785,
+  797,   809,   825,   841,   857,   873,   889,   905,
+  922,   938,   954,   970,   986,  1002,  1018,  1038,
+  1058,  1078,  1098,  1118,  1138,  1158,  1178,  1198,
+  1218,  1242,  1266,  1290,  1314,  1338,  1362,  1386,
+  1411,  1435,  1463,  1491,  1519,  1547,  1575,  1603,
+  1631,  1663,  1695,  1727,  1759,  1791,  1823,  1859,
+  1895,  1931,  1967,  2003,  2039,  2079,  2119,  2159,
+  2199,  2239,  2283,  2327,  2371,  2415,  2459,  2507,
+  2555,  2603,  2651,  2703,  2755,  2807,  2859,  2915,
+  2971,  3027,  3083,  3143,  3203,  3263,  3327,  3391,
+  3455,  3523,  3591,  3659,  3731,  3803,  3876,  3952,
+  4028,  4104,  4184,  4264,  4348,  4432,  4516,  4604,
+  4692,  4784,  4876,  4972,  5068,  5168,  5268,  5372,
+  5476,  5584,  5692,  5804,  5916,  6032,  6148,  6268,
+  6388,  6512,  6640,  6768,  6900,  7036,  7172,  7312,
+};
+
+static const int16_t ac_qlookup_12[QINDEX_RANGE] = {
+  4,    13,    19,    27,    35,    44,    54,    64,
+  75,    87,    99,   112,   126,   139,   154,   168,
+  183,   199,   214,   230,   247,   263,   280,   297,
+  314,   331,   349,   366,   384,   402,   420,   438,
+  456,   475,   493,   511,   530,   548,   567,   586,
+  604,   623,   642,   660,   679,   698,   716,   735,
+  753,   772,   791,   809,   828,   846,   865,   884,
+  902,   920,   939,   957,   976,   994,  1012,  1030,
+  1049,  1067,  1085,  1103,  1121,  1139,  1157,  1175,
+  1193,  1211,  1229,  1246,  1264,  1282,  1299,  1317,
+  1335,  1352,  1370,  1387,  1405,  1422,  1440,  1457,
+  1474,  1491,  1509,  1526,  1543,  1560,  1577,  1595,
+  1627,  1660,  1693,  1725,  1758,  1791,  1824,  1856,
+  1889,  1922,  1954,  1987,  2020,  2052,  2085,  2118,
+  2150,  2183,  2216,  2248,  2281,  2313,  2346,  2378,
+  2411,  2459,  2508,  2556,  2605,  2653,  2701,  2750,
+  2798,  2847,  2895,  2943,  2992,  3040,  3088,  3137,
+  3185,  3234,  3298,  3362,  3426,  3491,  3555,  3619,
+  3684,  3748,  3812,  3876,  3941,  4005,  4069,  4149,
+  4230,  4310,  4390,  4470,  4550,  4631,  4711,  4791,
+  4871,  4967,  5064,  5160,  5256,  5352,  5448,  5544,
+  5641,  5737,  5849,  5961,  6073,  6185,  6297,  6410,
+  6522,  6650,  6778,  6906,  7034,  7162,  7290,  7435,
+  7579,  7723,  7867,  8011,  8155,  8315,  8475,  8635,
+  8795,  8956,  9132,  9308,  9484,  9660,  9836, 10028,
+  10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661,
+  11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565,
+  13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806,
+  16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414,
+  18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486,
+  21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070,
+  25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247,
+};
+#endif
+
+// Returns the DC quantizer for (qindex + delta) clamped to [0, MAXQ], read
+// from the table for bit_depth (always the 8-bit table in non-HBD builds).
+int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) {
+  const int q = clamp(qindex + delta, 0, MAXQ);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (bit_depth == VPX_BITS_8)
+    return dc_qlookup[q];
+  if (bit_depth == VPX_BITS_10)
+    return dc_qlookup_10[q];
+  if (bit_depth == VPX_BITS_12)
+    return dc_qlookup_12[q];
+  assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+  return -1;
+#else
+  (void) bit_depth;
+  return dc_qlookup[q];
+#endif
+}
+
+// Returns the AC quantizer for (qindex + delta) clamped to [0, MAXQ], read
+// from the table for bit_depth (always the 8-bit table in non-HBD builds).
+int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) {
+  const int q = clamp(qindex + delta, 0, MAXQ);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (bit_depth == VPX_BITS_8)
+    return ac_qlookup[q];
+  if (bit_depth == VPX_BITS_10)
+    return ac_qlookup_10[q];
+  if (bit_depth == VPX_BITS_12)
+    return ac_qlookup_12[q];
+  assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+  return -1;
+#else
+  (void) bit_depth;
+  return ac_qlookup[q];
+#endif
+}
+
+// Returns the q index for a segment: base_qindex unless the ALT_Q feature is
+// active, in which case the segment data is taken as an absolute value or as
+// a delta on base_qindex (per seg->abs_delta) and clamped to [0, MAXQ].
+int vp9_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex) {
+  int q;
+  if (!segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) return base_qindex;
+  q = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+  if (seg->abs_delta != SEGMENT_ABSDATA) q += base_qindex;
+  return clamp(q, 0, MAXQ);
+}
+
diff --git a/libs/libvpx/vp9/common/vp9_quant_common.h b/libs/libvpx/vp9/common/vp9_quant_common.h
new file mode 100644
index 0000000000..4bae4a8967
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_quant_common.h
@@ -0,0 +1,36 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_QUANT_COMMON_H_
+#define VP9_COMMON_VP9_QUANT_COMMON_H_
+
+#include "vpx/vpx_codec.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0
+#define MAXQ 255
+#define QINDEX_RANGE (MAXQ - MINQ + 1)
+#define QINDEX_BITS 8
+
+int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
+int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
+
+int vp9_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_QUANT_COMMON_H_
diff --git a/libs/libvpx/vp9/common/vp9_reconinter.c b/libs/libvpx/vp9/common/vp9_reconinter.c
new file mode 100644
index 0000000000..74bc1d23eb
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_reconinter.c
@@ -0,0 +1,321 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_scale_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Dispatches to the high-bitdepth predictor chosen by whether each subpel
+// offset is non-zero and by ref, passing the kernel for each offset.
+void high_inter_predictor(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride,
+                          const int subpel_x, const int subpel_y,
+                          const struct scale_factors *sf,
+                          int w, int h, int ref,
+                          const InterpKernel *kernel, int xs, int ys, int bd) {
+  const int has_x = subpel_x != 0, has_y = subpel_y != 0;
+  sf->highbd_predict[has_x][has_y][ref](src, src_stride, dst, dst_stride,
+      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
+}
+
+// High-bitdepth single-reference prediction. src_mv is brought to q4
+// precision (doubled unless precision is already MV_PRECISION_Q4), scaled
+// through sf, then split into a whole-pel source offset and subpel phases.
+void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
+                                      uint8_t *dst, int dst_stride,
+                                      const MV *src_mv,
+                                      const struct scale_factors *sf,
+                                      int w, int h, int ref,
+                                      const InterpKernel *kernel,
+                                      enum mv_precision precision,
+                                      int x, int y, int bd) {
+  const int to_q4 = (precision == MV_PRECISION_Q4) ? 1 : 2;
+  const MV mv_q4 = { src_mv->row * to_q4, src_mv->col * to_q4 };
+  const MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf);
+  const uint8_t *const ref_px =
+      src + ((mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS));
+  high_inter_predictor(ref_px, src_stride, dst, dst_stride,
+                       mv.col & SUBPEL_MASK, mv.row & SUBPEL_MASK, sf, w, h,
+                       ref, kernel, sf->x_step_q4, sf->y_step_q4, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Builds a single-reference prediction block. src_mv is brought to q4
+// precision (doubled unless precision is already MV_PRECISION_Q4), scaled
+// through sf, then split into a whole-pel source offset and subpel phases.
+void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride,
+                               const MV *src_mv,
+                               const struct scale_factors *sf,
+                               int w, int h, int ref,
+                               const InterpKernel *kernel,
+                               enum mv_precision precision,
+                               int x, int y) {
+  const int to_q4 = (precision == MV_PRECISION_Q4) ? 1 : 2;
+  const MV mv_q4 = { src_mv->row * to_q4, src_mv->col * to_q4 };
+  const MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf);
+  const uint8_t *const ref_px =
+      src + ((mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS));
+  inter_predictor(ref_px, src_stride, dst, dst_stride, mv.col & SUBPEL_MASK,
+                  mv.row & SUBPEL_MASK, sf, w, h, ref, kernel, sf->x_step_q4,
+                  sf->y_step_q4);
+}
+// Divides by 4, rounding to nearest with halves away from zero.
+static INLINE int round_mv_comp_q4(int value) {
+  return (value + (value < 0 ? -2 : 2)) / 4;
+}
+// Rounded average of reference idx's MVs over the four sub-blocks of mi.
+static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
+  int row_sum = 0, col_sum = 0, b;
+  MV res;
+  for (b = 0; b < 4; ++b) {
+    row_sum += mi->bmi[b].as_mv[idx].as_mv.row;
+    col_sum += mi->bmi[b].as_mv[idx].as_mv.col;
+  }
+  res.row = round_mv_comp_q4(row_sum);
+  res.col = round_mv_comp_q4(col_sum);
+  return res;
+}
+// Divides by 2, rounding to nearest with halves away from zero.
+static INLINE int round_mv_comp_q2(int value) {
+  return (value + (value < 0 ? -1 : 1)) / 2;
+}
+// Rounded average of reference idx's MVs over sub-blocks block0 and block1.
+static MV mi_mv_pred_q2(const MODE_INFO *mi, int idx, int block0, int block1) {
+  const MV *const a = &mi->bmi[block0].as_mv[idx].as_mv;
+  const MV *const b = &mi->bmi[block1].as_mv[idx].as_mv;
+  MV res = { round_mv_comp_q2(a->row + b->row),
+             round_mv_comp_q2(a->col + b->col) };
+  return res;
+}
+// Returns src_mv scaled for (ss_x, ss_y) and clamped to the extended edges.
+// TODO(jkoleszar): yet another mv clamping function :-(
+MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
+                             int bw, int bh, int ss_x, int ss_y) {
+  // If the MV points so far into the UMV border that no visible pixels
+  // are used for reconstruction, the subpel part of the MV can be
+  // discarded and the MV limited to 16 pixels with equivalent results.
+  const int spel_left = (VP9_INTERP_EXTEND + bw) << SUBPEL_BITS;
+  const int spel_right = spel_left - SUBPEL_SHIFTS;
+  const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS;
+  const int spel_bottom = spel_top - SUBPEL_SHIFTS;
+  MV clamped_mv = {  // x2 when the axis is not subsampled, x1 when ss == 1
+    src_mv->row * (1 << (1 - ss_y)),
+    src_mv->col * (1 << (1 - ss_x))
+  };
+  assert(ss_x <= 1);  // a larger value would make the shift count negative
+  assert(ss_y <= 1);
+
+  clamp_mv(&clamped_mv,  // edge limits get the same per-axis scaling as the MV
+           xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
+           xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
+           xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
+           xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom);
+
+  return clamped_mv;
+}
+
+// Returns the MV to use for sub-block `block` of mi in plane pd: the
+// sub-block's own MV when the plane is not subsampled, otherwise a rounded
+// average over the sub-blocks selected by the subsampled axes.
+MV average_split_mvs(const struct macroblockd_plane *pd,
+                     const MODE_INFO *mi, int ref, int block) {
+  const int sub_x = pd->subsampling_x > 0;
+  const int sub_y = pd->subsampling_y > 0;
+
+  // Both axes subsampled: average all four sub-blocks.
+  if (sub_x && sub_y)
+    return mi_mv_pred_q4(mi, ref);
+
+  // Only x subsampled: average block with block + 1.
+  if (sub_x)
+    return mi_mv_pred_q2(mi, ref, block, block + 1);
+
+  // Only y subsampled: average block with block + 2.
+  if (sub_y)
+    return mi_mv_pred_q2(mi, ref, block, block + 2);
+
+  return mi->bmi[block].as_mv[ref].as_mv;
+}
+// Predicts the w x h region at (x, y) of one plane from each active reference.
+static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+                                   int bw, int bh,
+                                   int x, int y, int w, int h,
+                                   int mi_x, int mi_y) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MODE_INFO *mi = xd->mi[0];
+  const int is_compound = has_second_ref(mi);
+  const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
+  int ref;
+
+  for (ref = 0; ref < 1 + is_compound; ++ref) {  // one pass per reference
+    const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+    struct buf_2d *const pre_buf = &pd->pre[ref];
+    struct buf_2d *const dst_buf = &pd->dst;
+    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+    const MV mv = mi->sb_type < BLOCK_8X8
+               ? average_split_mvs(pd, mi, ref, block)
+               : mi->mv[ref].as_mv;
+
+    // TODO(jkoleszar): This clamping is done in the incorrect place for the
+    // scaling case. It needs to be done on the scaled MV, not the pre-scaling
+    // MV. Note however that it performs the subsampling aware scaling so
+    // that the result is always q4.
+    // mv_precision precision is MV_PRECISION_Q4.
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+                                               pd->subsampling_x,
+                                               pd->subsampling_y);
+
+    uint8_t *pre;
+    MV32 scaled_mv;
+    int xs, ys, subpel_x, subpel_y;
+    const int is_scaled = vp9_is_scaled(sf);
+
+    if (is_scaled) {  // scaled reference: restart from the plane buffer base
+      // Co-ordinate of containing block to pixel precision.
+      const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+      const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+#if CONFIG_BETTER_HW_COMPATIBILITY
+      assert(xd->mi[0]->sb_type != BLOCK_4X8 &&
+             xd->mi[0]->sb_type != BLOCK_8X4);
+      assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) &&
+             mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x)));
+#endif
+      if (plane == 0)
+        pre_buf->buf = xd->block_refs[ref]->buf->y_buffer;
+      else if (plane == 1)
+        pre_buf->buf = xd->block_refs[ref]->buf->u_buffer;
+      else
+        pre_buf->buf = xd->block_refs[ref]->buf->v_buffer;
+
+      pre_buf->buf += scaled_buffer_offset(x_start + x, y_start + y,
+                                           pre_buf->stride, sf);  // NB: updates pd->pre[ref].buf
+      pre = pre_buf->buf;
+      scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+      xs = sf->x_step_q4;
+      ys = sf->y_step_q4;
+    } else {
+      pre = pre_buf->buf + (y * pre_buf->stride + x);
+      scaled_mv.row = mv_q4.row;
+      scaled_mv.col = mv_q4.col;
+      xs = ys = 16;  // fixed step on the unscaled path
+    }
+    subpel_x = scaled_mv.col & SUBPEL_MASK;  // fractional bits index the kernel
+    subpel_y = scaled_mv.row & SUBPEL_MASK;
+    pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride
+           + (scaled_mv.col >> SUBPEL_BITS);  // whole-pel part moves the pointer
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                           subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
+                           xd->bd);
+    } else {
+      inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                      subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
+    }
+#else
+    inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                    subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+}
+
+// Builds inter predictions for planes plane_from..plane_to (inclusive).
+// Sub-8x8 blocks go one 4x4 unit at a time in raster order; others in one call.
+static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                              int mi_row, int mi_col,
+                                              int plane_from, int plane_to) {
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+  int plane;
+
+  for (plane = plane_from; plane <= plane_to; ++plane) {
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, &xd->plane[plane]);
+    const int cols4 = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int rows4 = num_4x4_blocks_high_lookup[plane_bsize];
+    const int bw = 4 * cols4;
+    const int bh = 4 * rows4;
+    int blk;
+    if (xd->mi[0]->sb_type >= BLOCK_8X8) {
+      build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh, mi_x, mi_y);
+      continue;
+    }
+    assert(bsize == BLOCK_8X8);
+    for (blk = 0; blk < rows4 * cols4; ++blk)
+      build_inter_predictors(xd, plane, blk, bw, bh, 4 * (blk % cols4),
+                             4 * (blk / cols4), 4, 4, mi_x, mi_y);
+  }
+}
+// Builds the inter prediction for plane 0 only.
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
+}
+// Builds the inter prediction for the single plane given by `plane`.
+void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize, int plane) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, plane, plane);
+}
+// Builds the inter prediction for planes 1 through MAX_MB_PLANE - 1.
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
+                                    MAX_MB_PLANE - 1);
+}
+// Builds the inter prediction for planes 0 through MAX_MB_PLANE - 1.
+void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
+                                    MAX_MB_PLANE - 1);
+}
+
+// Points each plane's dst at (mi_row, mi_col) inside src: Y buffer/stride for
+// the first plane, then U and V with the shared UV stride. No scaling (NULL).
+void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col) {
+  uint8_t *const plane_buf[MAX_MB_PLANE] = {
+    src->y_buffer, src->u_buffer, src->v_buffer };
+  const int plane_stride[MAX_MB_PLANE] = {
+    src->y_stride, src->uv_stride, src->uv_stride };
+  int p;
+  for (p = 0; p < MAX_MB_PLANE; ++p)
+    setup_pred_plane(&planes[p].dst, plane_buf[p], plane_stride[p], mi_row,
+                     mi_col, NULL, planes[p].subsampling_x,
+                     planes[p].subsampling_y);
+}
+
+// Points every plane's pre[idx] into src using sf; no-op when src is NULL.
+void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col,
+                          const struct scale_factors *sf) {
+  if (src != NULL) {
+    uint8_t *const plane_buf[MAX_MB_PLANE] = {
+      src->y_buffer, src->u_buffer, src->v_buffer };
+    const int plane_stride[MAX_MB_PLANE] = {
+      src->y_stride, src->uv_stride, src->uv_stride };
+    int p;
+    for (p = 0; p < MAX_MB_PLANE; ++p)
+      setup_pred_plane(&xd->plane[p].pre[idx], plane_buf[p], plane_stride[p],
+                       mi_row, mi_col, sf, xd->plane[p].subsampling_x,
+                       xd->plane[p].subsampling_y);
+  }
+}
diff --git a/libs/libvpx/vp9/common/vp9_reconinter.h b/libs/libvpx/vp9/common/vp9_reconinter.h
new file mode 100644
index 0000000000..7d907748e6
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_reconinter.h
@@ -0,0 +1,115 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_RECONINTER_H_
+#define VP9_COMMON_VP9_RECONINTER_H_
+
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void inter_predictor(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride,
+                                   const int subpel_x, const int subpel_y,
+                                   const struct scale_factors *sf,
+                                   int w, int h, int ref,
+                                   const InterpKernel *kernel, int xs, int ys) {
+  const int frac_x = subpel_x != 0;  // 1 if there is a subpel x offset
+  const int frac_y = subpel_y != 0;  // 1 if there is a subpel y offset
+  sf->predict[frac_x][frac_y][ref](
+      src, src_stride, dst, dst_stride,
+      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void high_inter_predictor(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 const int subpel_x,
+                                 const int subpel_y,
+                                 const struct scale_factors *sf,
+                                 int w, int h, int ref,
+                                 const InterpKernel *kernel,
+                                 int xs, int ys, int bd);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+MV average_split_mvs(const struct macroblockd_plane *pd, const MODE_INFO *mi,
+                     int ref, int block);
+
+MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
+                             int bw, int bh, int ss_x, int ss_y);
+
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize);
+
+void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize, int plane);
+
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize);
+
+void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize);
+
+void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride,
+                               const MV *mv_q3,
+                               const struct scale_factors *sf,
+                               int w, int h, int do_avg,
+                               const InterpKernel *kernel,
+                               enum mv_precision precision,
+                               int x, int y);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
+                                      uint8_t *dst, int dst_stride,
+                                      const MV *mv_q3,
+                                      const struct scale_factors *sf,
+                                      int w, int h, int do_avg,
+                                      const InterpKernel *kernel,
+                                      enum mv_precision precision,
+                                      int x, int y, int bd);
+#endif
+
+static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
+                                       const struct scale_factors *sf) {
+  if (!sf) return y_offset * stride + x_offset;  // no scaling requested
+  x_offset = sf->scale_value_x(x_offset, sf);
+  return sf->scale_value_y(y_offset, sf) * stride + x_offset;
+}
+
+static INLINE void setup_pred_plane(struct buf_2d *dst, uint8_t *src,
+                                    int stride, int mi_row, int mi_col,
+                                    const struct scale_factors *scale,
+                                    int subsampling_x, int subsampling_y) {
+  // Scale the mi position by MI_SIZE, then shift by the plane's subsampling.
+  const int col = (MI_SIZE * mi_col) >> subsampling_x;
+  const int row = (MI_SIZE * mi_row) >> subsampling_y;
+  dst->stride = stride;
+  dst->buf = src + scaled_buffer_offset(col, row, stride, scale);
+}
+
+void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col);
+
+void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
+                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+                          const struct scale_factors *sf);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_RECONINTER_H_
diff --git a/libs/libvpx/vp9/common/vp9_reconintra.c b/libs/libvpx/vp9/common/vp9_reconintra.c
new file mode 100644
index 0000000000..13a95ae8f0
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_reconintra.c
@@ -0,0 +1,448 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_once.h"
+
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = {
+  DCT_DCT,    // DC
+  ADST_DCT,   // V
+  DCT_ADST,   // H
+  DCT_DCT,    // D45
+  ADST_ADST,  // D135
+  ADST_DCT,   // D117
+  DCT_ADST,   // D153
+  DCT_ADST,   // D207
+  ADST_DCT,   // D63
+  ADST_ADST,  // TM
+};
+
+enum {
+  NEED_LEFT = 1 << 1,
+  NEED_ABOVE = 1 << 2,
+  NEED_ABOVERIGHT = 1 << 3,
+};
+
+static const uint8_t extend_modes[INTRA_MODES] = {
+  NEED_ABOVE | NEED_LEFT,       // DC
+  NEED_ABOVE,                   // V
+  NEED_LEFT,                    // H
+  NEED_ABOVERIGHT,              // D45
+  NEED_LEFT | NEED_ABOVE,       // D135
+  NEED_LEFT | NEED_ABOVE,       // D117
+  NEED_LEFT | NEED_ABOVE,       // D153
+  NEED_LEFT,                    // D207
+  NEED_ABOVERIGHT,              // D63
+  NEED_LEFT | NEED_ABOVE,       // TM
+};
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES];
+static intra_pred_fn dc_pred[2][2][TX_SIZES];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
+                                   const uint16_t *above, const uint16_t *left,
+                                   int bd);
+static intra_high_pred_fn pred_high[INTRA_MODES][4];
+static intra_high_pred_fn dc_pred_high[2][2][4];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Fills the intra predictor tables with the vpx_*_predictor_NxN functions.
+static void vp9_init_intra_predictors_internal(void) {
+#define INIT_ALL_SIZES(p, type) \
+  p[TX_4X4] = vpx_##type##_predictor_4x4; \
+  p[TX_8X8] = vpx_##type##_predictor_8x8; \
+  p[TX_16X16] = vpx_##type##_predictor_16x16; \
+  p[TX_32X32] = vpx_##type##_predictor_32x32
+
+  INIT_ALL_SIZES(pred[V_PRED], v);
+  INIT_ALL_SIZES(pred[H_PRED], h);
+  INIT_ALL_SIZES(pred[D207_PRED], d207);
+  INIT_ALL_SIZES(pred[D45_PRED], d45);
+  INIT_ALL_SIZES(pred[D63_PRED], d63);
+  INIT_ALL_SIZES(pred[D117_PRED], d117);
+  INIT_ALL_SIZES(pred[D135_PRED], d135);
+  INIT_ALL_SIZES(pred[D153_PRED], d153);
+  INIT_ALL_SIZES(pred[TM_PRED], tm);
+  // pred[DC_PRED] is not set: DC_PRED is dispatched through dc_pred instead.
+  INIT_ALL_SIZES(dc_pred[0][0], dc_128);
+  INIT_ALL_SIZES(dc_pred[0][1], dc_top);
+  INIT_ALL_SIZES(dc_pred[1][0], dc_left);
+  INIT_ALL_SIZES(dc_pred[1][1], dc);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
+  INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
+  INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207);
+  INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45);
+  INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63);
+  INIT_ALL_SIZES(pred_high[D117_PRED], highbd_d117);
+  INIT_ALL_SIZES(pred_high[D135_PRED], highbd_d135);
+  INIT_ALL_SIZES(pred_high[D153_PRED], highbd_d153);
+  INIT_ALL_SIZES(pred_high[TM_PRED], highbd_tm);
+
+  INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128);
+  INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top);
+  INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left);
+  INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#undef INIT_ALL_SIZES
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// High-bitdepth variant: builds the left/above edges, then runs the predictor.
+static void build_intra_predictors_high(const MACROBLOCKD *xd,
+                                        const uint8_t *ref8,
+                                        int ref_stride,
+                                        uint8_t *dst8,
+                                        int dst_stride,
+                                        PREDICTION_MODE mode,
+                                        TX_SIZE tx_size,
+                                        int up_available,
+                                        int left_available,
+                                        int right_available,
+                                        int x, int y,
+                                        int plane, int bd) {
+  int i;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  DECLARE_ALIGNED(16, uint16_t, left_col[32]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[64 + 16]);
+  uint16_t *above_row = above_data + 16;  // leaves room for above_row[-1]
+  const uint16_t *const_above_row = above_row;
+  const int bs = 4 << tx_size;  // block edge length in samples
+  int frame_width, frame_height;
+  int x0, y0;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int need_left = extend_modes[mode] & NEED_LEFT;
+  const int need_above = extend_modes[mode] & NEED_ABOVE;
+  const int need_aboveright = extend_modes[mode] & NEED_ABOVERIGHT;
+  int base = 128 << (bd - 8);  // 128 scaled to bit depth bd
+  // 127 127 127 .. 127 127 127 127 127 127
+  // 129  A   B  ..  Y   Z
+  // 129  C   D  ..  W   X
+  // 129  E   F  ..  U   V
+  // 129  G   H  ..  S   T   T   T   T   T
+  // (8-bit values shown; this function uses base - 1 and base + 1 instead.)
+  // Get current frame pointer, width and height.
+  if (plane == 0) {
+    frame_width = xd->cur_buf->y_width;
+    frame_height = xd->cur_buf->y_height;
+  } else {
+    frame_width = xd->cur_buf->uv_width;
+    frame_height = xd->cur_buf->uv_height;
+  }
+  // Get block position in current frame.
+  x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+  y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+  // NEED_LEFT: copy the column just left of the block into left_col.
+  if (need_left) {
+    if (left_available) {
+      if (xd->mb_to_bottom_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (y0 + bs <= frame_height) {
+          for (i = 0; i < bs; ++i)
+            left_col[i] = ref[i * ref_stride - 1];
+        } else {
+          const int extend_bottom = frame_height - y0;
+          for (i = 0; i < extend_bottom; ++i)
+            left_col[i] = ref[i * ref_stride - 1];
+          for (; i < bs; ++i)  // repeat the last in-frame row's sample
+            left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        for (i = 0; i < bs; ++i)
+          left_col[i] = ref[i * ref_stride - 1];
+      }
+    } else {
+      // TODO(Peter): this value should probably change for high bitdepth
+      vpx_memset16(left_col, base + 1, bs);
+    }
+  }
+
+  // NEED_ABOVE: fill above_row with the bs samples directly above the block.
+  if (need_above) {
+    if (up_available) {
+      const uint16_t *above_ref = ref - ref_stride;
+      if (xd->mb_to_right_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (x0 + bs <= frame_width) {
+          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+        } else if (x0 <= frame_width) {
+          const int r = frame_width - x0;  // in-frame samples left in the row
+          memcpy(above_row, above_ref, r * sizeof(above_row[0]));
+          vpx_memset16(above_row + r, above_row[r - 1], x0 + bs - frame_width);
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        if (bs == 4 && right_available && left_available) {
+          const_above_row = above_ref;  // read the frame row in place
+        } else {
+          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+        }
+      }
+      above_row[-1] = left_available ? above_ref[-1] : (base + 1);
+    } else {
+      vpx_memset16(above_row, base - 1, bs);
+      above_row[-1] = base - 1;
+    }
+  }
+
+  // NEED_ABOVERIGHT: fill above_row with 2 * bs samples (above + above-right).
+  if (need_aboveright) {
+    if (up_available) {
+      const uint16_t *above_ref = ref - ref_stride;
+      if (xd->mb_to_right_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (x0 + 2 * bs <= frame_width) {
+          if (right_available && bs == 4) {
+            memcpy(above_row, above_ref, 2 * bs * sizeof(above_row[0]));
+          } else {
+            memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+          }
+        } else if (x0 + bs <= frame_width) {
+          const int r = frame_width - x0;
+          if (right_available && bs == 4) {
+            memcpy(above_row, above_ref, r * sizeof(above_row[0]));
+            vpx_memset16(above_row + r, above_row[r - 1],
+                         x0 + 2 * bs - frame_width);
+          } else {
+            memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+          }
+        } else if (x0 <= frame_width) {
+          const int r = frame_width - x0;
+          memcpy(above_row, above_ref, r * sizeof(above_row[0]));
+          vpx_memset16(above_row + r, above_row[r - 1],
+                       x0 + 2 * bs - frame_width);
+        }
+        // TODO(Peter) this value should probably change for high bitdepth
+        above_row[-1] = left_available ? above_ref[-1] : (base + 1);
+      } else {
+        /* faster path if the block does not need extension */
+        if (bs == 4 && right_available && left_available) {
+          const_above_row = above_ref;  // read the frame row in place
+        } else {
+          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+          if (bs == 4 && right_available)
+            memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0]));
+          else
+            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+          // TODO(Peter): this value should probably change for high bitdepth
+          above_row[-1] = left_available ? above_ref[-1] : (base + 1);
+        }
+      }
+    } else {
+      vpx_memset16(above_row, base - 1, bs * 2);
+      // TODO(Peter): this value should probably change for high bitdepth
+      above_row[-1] = base - 1;
+    }
+  }
+
+  // Predict: DC_PRED picks its variant from the left/up availability flags.
+  if (mode == DC_PRED) {
+    dc_pred_high[left_available][up_available][tx_size](dst, dst_stride,
+                                                        const_above_row,
+                                                        left_col, xd->bd);
+  } else {
+    pred_high[mode][tx_size](dst, dst_stride, const_above_row, left_col,
+                             xd->bd);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// 8-bit variant: builds the left/above edges, then runs the predictor.
+static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
+                                   int ref_stride, uint8_t *dst, int dst_stride,
+                                   PREDICTION_MODE mode, TX_SIZE tx_size,
+                                   int up_available, int left_available,
+                                   int right_available, int x, int y,
+                                   int plane) {
+  int i;
+  DECLARE_ALIGNED(16, uint8_t, left_col[32]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[64 + 16]);
+  uint8_t *above_row = above_data + 16;  // leaves room for above_row[-1]
+  const uint8_t *const_above_row = above_row;
+  const int bs = 4 << tx_size;  // block edge length in samples
+  int frame_width, frame_height;
+  int x0, y0;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+  // 127 127 127 .. 127 127 127 127 127 127
+  // 129  A   B  ..  Y   Z
+  // 129  C   D  ..  W   X
+  // 129  E   F  ..  U   V
+  // 129  G   H  ..  S   T   T   T   T   T
+  // ..
+
+  // Get current frame pointer, width and height.
+  if (plane == 0) {
+    frame_width = xd->cur_buf->y_width;
+    frame_height = xd->cur_buf->y_height;
+  } else {
+    frame_width = xd->cur_buf->uv_width;
+    frame_height = xd->cur_buf->uv_height;
+  }
+  // Get block position in current frame.
+  x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+  y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+  // NEED_LEFT: copy the column just left of the block into left_col.
+  if (extend_modes[mode] & NEED_LEFT) {
+    if (left_available) {
+      if (xd->mb_to_bottom_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (y0 + bs <= frame_height) {
+          for (i = 0; i < bs; ++i)
+            left_col[i] = ref[i * ref_stride - 1];
+        } else {
+          const int extend_bottom = frame_height - y0;
+          for (i = 0; i < extend_bottom; ++i)
+            left_col[i] = ref[i * ref_stride - 1];
+          for (; i < bs; ++i)  // repeat the last in-frame row's sample
+            left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        for (i = 0; i < bs; ++i)
+          left_col[i] = ref[i * ref_stride - 1];
+      }
+    } else {
+      memset(left_col, 129, bs);  // no left neighbour: constant 129 column
+    }
+  }
+
+  // NEED_ABOVE: fill above_row with the bs samples directly above the block.
+  if (extend_modes[mode] & NEED_ABOVE) {
+    if (up_available) {
+      const uint8_t *above_ref = ref - ref_stride;
+      if (xd->mb_to_right_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (x0 + bs <= frame_width) {
+          memcpy(above_row, above_ref, bs);
+        } else if (x0 <= frame_width) {
+          const int r = frame_width - x0;  // in-frame samples left in the row
+          memcpy(above_row, above_ref, r);
+          memset(above_row + r, above_row[r - 1], x0 + bs - frame_width);
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        if (bs == 4 && right_available && left_available) {
+          const_above_row = above_ref;  // read the frame row in place
+        } else {
+          memcpy(above_row, above_ref, bs);
+        }
+      }
+      above_row[-1] = left_available ? above_ref[-1] : 129;
+    } else {
+      memset(above_row, 127, bs);  // no row above: constant 127 row
+      above_row[-1] = 127;
+    }
+  }
+
+  // NEED_ABOVERIGHT: fill above_row with 2 * bs samples (above + above-right).
+  if (extend_modes[mode] & NEED_ABOVERIGHT) {
+    if (up_available) {
+      const uint8_t *above_ref = ref - ref_stride;
+      if (xd->mb_to_right_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (x0 + 2 * bs <= frame_width) {
+          if (right_available && bs == 4) {
+            memcpy(above_row, above_ref, 2 * bs);
+          } else {
+            memcpy(above_row, above_ref, bs);
+            memset(above_row + bs, above_row[bs - 1], bs);
+          }
+        } else if (x0 + bs <= frame_width) {
+          const int r = frame_width - x0;
+          if (right_available && bs == 4) {
+            memcpy(above_row, above_ref, r);
+            memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
+          } else {
+            memcpy(above_row, above_ref, bs);
+            memset(above_row + bs, above_row[bs - 1], bs);
+          }
+        } else if (x0 <= frame_width) {
+          const int r = frame_width - x0;
+          memcpy(above_row, above_ref, r);
+          memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        if (bs == 4 && right_available && left_available) {
+          const_above_row = above_ref;  // read the frame row in place
+        } else {
+          memcpy(above_row, above_ref, bs);
+          if (bs == 4 && right_available)
+            memcpy(above_row + bs, above_ref + bs, bs);
+          else
+            memset(above_row + bs, above_row[bs - 1], bs);
+        }
+      }
+      above_row[-1] = left_available ? above_ref[-1] : 129;
+    } else {
+      memset(above_row, 127, bs * 2);
+      above_row[-1] = 127;
+    }
+  }
+
+  // Predict: DC_PRED picks its variant from the left/up availability flags.
+  if (mode == DC_PRED) {
+    dc_pred[left_available][up_available][tx_size](dst, dst_stride,
+                                                   const_above_row, left_col);
+  } else {
+    pred[mode][tx_size](dst, dst_stride, const_above_row, left_col);
+  }
+}
+
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in,
+                             TX_SIZE tx_size, PREDICTION_MODE mode,
+                             const uint8_t *ref, int ref_stride,
+                             uint8_t *dst, int dst_stride,
+                             int aoff, int loff, int plane) {
+  const int px_x = 4 * aoff;  // offsets are multiplied by 4 for the builders
+  const int px_y = 4 * loff;
+  const int block_w = 1 << bwl_in;
+  const int tx_w = 1 << tx_size;
+  // A non-zero offset marks that edge as available regardless of xd's flags.
+  const int top_ok = loff != 0 || xd->up_available;
+  const int left_ok = aoff != 0 || xd->left_available;
+  const int right_ok = aoff + tx_w < block_w;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,
+                                tx_size, top_ok, left_ok, right_ok,
+                                px_x, px_y, plane, xd->bd);
+    return;
+  }
+#endif
+  build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
+                         top_ok, left_ok, right_ok, px_x, px_y, plane);
+}
+
+void vp9_init_intra_predictors(void) {
+  once(vp9_init_intra_predictors_internal);  // table setup, guarded by once()
+}
diff --git a/libs/libvpx/vp9/common/vp9_reconintra.h b/libs/libvpx/vp9/common/vp9_reconintra.h
new file mode 100644
index 0000000000..de453808b7
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_reconintra.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_RECONINTRA_H_
+#define VP9_COMMON_VP9_RECONINTRA_H_
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_init_intra_predictors(void);
+
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in,
+                             TX_SIZE tx_size, PREDICTION_MODE mode,
+                             const uint8_t *ref, int ref_stride,
+                             uint8_t *dst, int dst_stride,
+                             int aoff, int loff, int plane);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/libs/libvpx/vp9/common/vp9_rtcd.c b/libs/libvpx/vp9/common/vp9_rtcd.c
new file mode 100644
index 0000000000..2dfa09f50e
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_rtcd.c
@@ -0,0 +1,19 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vp9_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+void vp9_rtcd(void) {
+    // TODO(JBB): Remove this once, by ensuring that both the encoder and
+    // decoder setup functions are protected by once();
+    once(setup_rtcd_internal);  // run the RTCD setup through once()
+}
diff --git a/libs/libvpx/vp9/common/vp9_rtcd_defs.pl b/libs/libvpx/vp9/common/vp9_rtcd_defs.pl
new file mode 100644
index 0000000000..d6a0ce96d4
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -0,0 +1,324 @@
+sub vp9_common_forward_decls() {
+print <<EOF
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+forward_decls qw/vp9_common_forward_decls/;
+
+# x86inc.asm had specific constraints. break it out so it's easy to disable.
+# zero all the variables to avoid tricky else conditions.
+$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =
+  $avx2_x86inc = '';
+$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =
+  $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';
+if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
+  $mmx_x86inc = 'mmx';
+  $sse_x86inc = 'sse';
+  $sse2_x86inc = 'sse2';
+  $ssse3_x86inc = 'ssse3';
+  $avx_x86inc = 'avx';
+  $avx2_x86inc = 'avx2';
+  if ($opts{arch} eq "x86_64") {
+    $mmx_x86_64_x86inc = 'mmx';
+    $sse_x86_64_x86inc = 'sse';
+    $sse2_x86_64_x86inc = 'sse2';
+    $ssse3_x86_64_x86inc = 'ssse3';
+    $avx_x86_64_x86inc = 'avx';
+    $avx2_x86_64_x86inc = 'avx2';
+  }
+}
+
+# functions that are 64 bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+  $mmx_x86_64 = 'mmx';
+  $sse2_x86_64 = 'sse2';
+  $ssse3_x86_64 = 'ssse3';
+  $avx_x86_64 = 'avx';
+  $avx2_x86_64 = 'avx2';
+}
+
+#
+# post proc
+#
+if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit";
+specialize qw/vp9_mbpost_proc_down sse2/;
+$vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm;
+
+add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit";
+specialize qw/vp9_mbpost_proc_across_ip sse2/;
+$vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm;
+
+add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
+specialize qw/vp9_post_proc_down_and_across sse2/;
+$vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
+
+add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+specialize qw/vp9_plane_add_noise sse2/;
+$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
+
+add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight16x16 sse2 msa/;
+
+add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
+}
+
+#
+# dct
+#
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht4x4_16_add/;
+
+    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht8x8_64_add/;
+
+    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp9_iht16x16_256_add/;
+  } else {
+    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht4x4_16_add sse2/;
+
+    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht8x8_64_add sse2/;
+
+    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp9_iht16x16_256_add sse2/;
+  }
+} else {
+  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht4x4_16_add/;
+
+    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht8x8_64_add/;
+
+    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp9_iht16x16_256_add/;
+  } else {
+    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+    specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/;
+  }
+}
+
+# High bitdepth functions
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #
+  # Sub Pixel Filters
+  #
+  add_proto qw/void vp9_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve_copy/;
+
+  add_proto qw/void vp9_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve_avg/;
+
+  add_proto qw/void vp9_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8/, "$sse2_x86_64";
+
+  add_proto qw/void vp9_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_horiz/, "$sse2_x86_64";
+
+  add_proto qw/void vp9_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_vert/, "$sse2_x86_64";
+
+  add_proto qw/void vp9_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_avg/, "$sse2_x86_64";
+
+  add_proto qw/void vp9_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
+
+  add_proto qw/void vp9_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vp9_highbd_convolve8_avg_vert/, "$sse2_x86_64";
+
+  #
+  # post proc
+  #
+  if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+    add_proto qw/void vp9_highbd_mbpost_proc_down/, "uint16_t *dst, int pitch, int rows, int cols, int flimit";
+    specialize qw/vp9_highbd_mbpost_proc_down/;
+
+    add_proto qw/void vp9_highbd_mbpost_proc_across_ip/, "uint16_t *src, int pitch, int rows, int cols, int flimit";
+    specialize qw/vp9_highbd_mbpost_proc_across_ip/;
+
+    add_proto qw/void vp9_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
+    specialize qw/vp9_highbd_post_proc_down_and_across/;
+
+    add_proto qw/void vp9_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+    specialize qw/vp9_highbd_plane_add_noise/;
+  }
+
+  #
+  # dct
+  #
+  # Note as optimized versions of these functions are added we need to add a check to ensure
+  # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
+  add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  specialize qw/vp9_highbd_iht4x4_16_add/;
+
+  add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  specialize qw/vp9_highbd_iht8x8_64_add/;
+
+  add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+  specialize qw/vp9_highbd_iht16x16_256_add/;
+}
+
+#
+# Encoder functions below this point.
+#
+if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
+
+# ENCODEMB INVOKE
+
+#
+# Denoiser
+#
+if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
+  add_proto qw/int vp9_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude";
+  specialize qw/vp9_denoiser_filter sse2/;
+}
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp9_block_error/;
+
+  add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+  specialize qw/vp9_highbd_block_error/, "$sse2_x86inc";
+
+  add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc", "$avx_x86inc";
+
+  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp/;
+
+  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp_32x32/;
+
+  add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_fdct8x8_quant/;
+} else {
+  add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp9_block_error avx2 msa/, "$sse2_x86inc";
+
+  add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+  specialize qw/vp9_block_error_fp neon/, "$sse2_x86inc";
+
+  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
+}
+
+# fdct functions
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht4x4 sse2/;
+
+  add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht8x8 sse2/;
+
+  add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht16x16 sse2/;
+
+  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+} else {
+  add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht4x4 sse2 msa/;
+
+  add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht8x8 sse2 msa/;
+
+  add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht16x16 sse2 msa/;
+
+  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fwht4x4 msa/, "$mmx_x86inc";
+}
+
+#
+# Motion search
+#
+add_proto qw/int vp9_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv";
+specialize qw/vp9_full_search_sad sse3 sse4_1/;
+$vp9_full_search_sad_sse3=vp9_full_search_sadx3;
+$vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
+
+add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
+specialize qw/vp9_diamond_search_sad avx/;
+
+add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+specialize qw/vp9_temporal_filter_apply sse2 msa/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+
+  # ENCODEMB INVOKE
+
+  add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_fp/;
+
+  add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_fp_32x32/;
+
+  # fdct functions
+  add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht4x4/;
+
+  add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht8x8/;
+
+  add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht16x16/;
+
+  add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fwht4x4/;
+
+  add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+  specialize qw/vp9_highbd_temporal_filter_apply/;
+
+}
+# End vp9_high encoder functions
+
+#
+# frame based scale
+#
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+} else {
+  add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst";
+  specialize qw/vp9_scale_and_extend_frame ssse3/;
+}
+
+}
+# end encoder functions
+1;
diff --git a/libs/libvpx/vp9/common/vp9_scale.c b/libs/libvpx/vp9/common/vp9_scale.c
new file mode 100644
index 0000000000..b763b925b3
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_scale.c
@@ -0,0 +1,175 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_scale.h"
+#include "vpx_dsp/vpx_filter.h"
+
+static INLINE int scaled_x(int val, const struct scale_factors *sf) {
+  return (int)((int64_t)val * sf->x_scale_fp >> REF_SCALE_SHIFT);
+}
+
+static INLINE int scaled_y(int val, const struct scale_factors *sf) {
+  return (int)((int64_t)val * sf->y_scale_fp >> REF_SCALE_SHIFT);
+}
+
+static int unscaled_value(int val, const struct scale_factors *sf) {
+  (void) sf;
+  return val;
+}
+
+static int get_fixed_point_scale_factor(int other_size, int this_size) {
+  // Calculate scaling factor once for each reference frame
+  // and use fixed point scaling factors in decoding and encoding routines.
+  // Hardware implementations can calculate scale factor in device driver
+  // and use multiplication and shifting on hardware instead of division.
+  return (int)(((int64_t)other_size << REF_SCALE_SHIFT) / this_size);
+}
+
+MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) {
+  const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf) & SUBPEL_MASK;
+  const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf) & SUBPEL_MASK;
+  const MV32 res = {
+    scaled_y(mv->row, sf) + y_off_q4,
+    scaled_x(mv->col, sf) + x_off_q4
+  };
+  return res;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
+                                       int other_w, int other_h,
+                                       int this_w, int this_h,
+                                       int use_highbd) {
+#else
+void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
+                                       int other_w, int other_h,
+                                       int this_w, int this_h) {
+#endif
+  if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
+    sf->x_scale_fp = REF_INVALID_SCALE;
+    sf->y_scale_fp = REF_INVALID_SCALE;
+    return;
+  }
+
+  sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
+  sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
+  sf->x_step_q4 = scaled_x(16, sf);
+  sf->y_step_q4 = scaled_y(16, sf);
+
+  if (vp9_is_scaled(sf)) {
+    sf->scale_value_x = scaled_x;
+    sf->scale_value_y = scaled_y;
+  } else {
+    sf->scale_value_x = unscaled_value;
+    sf->scale_value_y = unscaled_value;
+  }
+
+  // TODO(agrange): Investigate the best choice of functions to use here
+  // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
+  // to do at full-pel offsets. The current selection, where the filter is
+  // applied in one direction only, and not at all for 0,0, seems to give the
+  // best quality, but it may be worth trying an additional mode that does
+  // do the filtering on full-pel.
+
+  if (sf->x_step_q4 == 16) {
+    if (sf->y_step_q4 == 16) {
+      // No scaling in either direction.
+      sf->predict[0][0][0] = vpx_convolve_copy;
+      sf->predict[0][0][1] = vpx_convolve_avg;
+      sf->predict[0][1][0] = vpx_convolve8_vert;
+      sf->predict[0][1][1] = vpx_convolve8_avg_vert;
+      sf->predict[1][0][0] = vpx_convolve8_horiz;
+      sf->predict[1][0][1] = vpx_convolve8_avg_horiz;
+    } else {
+      // No scaling in x direction. Must always scale in the y direction.
+      sf->predict[0][0][0] = vpx_scaled_vert;
+      sf->predict[0][0][1] = vpx_scaled_avg_vert;
+      sf->predict[0][1][0] = vpx_scaled_vert;
+      sf->predict[0][1][1] = vpx_scaled_avg_vert;
+      sf->predict[1][0][0] = vpx_scaled_2d;
+      sf->predict[1][0][1] = vpx_scaled_avg_2d;
+    }
+  } else {
+    if (sf->y_step_q4 == 16) {
+      // No scaling in the y direction. Must always scale in the x direction.
+      sf->predict[0][0][0] = vpx_scaled_horiz;
+      sf->predict[0][0][1] = vpx_scaled_avg_horiz;
+      sf->predict[0][1][0] = vpx_scaled_2d;
+      sf->predict[0][1][1] = vpx_scaled_avg_2d;
+      sf->predict[1][0][0] = vpx_scaled_horiz;
+      sf->predict[1][0][1] = vpx_scaled_avg_horiz;
+    } else {
+      // Must always scale in both directions.
+      sf->predict[0][0][0] = vpx_scaled_2d;
+      sf->predict[0][0][1] = vpx_scaled_avg_2d;
+      sf->predict[0][1][0] = vpx_scaled_2d;
+      sf->predict[0][1][1] = vpx_scaled_avg_2d;
+      sf->predict[1][0][0] = vpx_scaled_2d;
+      sf->predict[1][0][1] = vpx_scaled_avg_2d;
+    }
+  }
+
+  // 2D subpel motion always gets filtered in both directions
+
+  if ((sf->x_step_q4 != 16) || (sf->y_step_q4 != 16)) {
+    sf->predict[1][1][0] = vpx_scaled_2d;
+    sf->predict[1][1][1] = vpx_scaled_avg_2d;
+  } else {
+    sf->predict[1][1][0] = vpx_convolve8;
+    sf->predict[1][1][1] = vpx_convolve8_avg;
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (use_highbd) {
+    if (sf->x_step_q4 == 16) {
+      if (sf->y_step_q4 == 16) {
+        // No scaling in either direction.
+        sf->highbd_predict[0][0][0] = vpx_highbd_convolve_copy;
+        sf->highbd_predict[0][0][1] = vpx_highbd_convolve_avg;
+        sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert;
+        sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert;
+        sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz;
+        sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz;
+      } else {
+        // No scaling in x direction. Must always scale in the y direction.
+        sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_vert;
+        sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_vert;
+        sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert;
+        sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert;
+        sf->highbd_predict[1][0][0] = vpx_highbd_convolve8;
+        sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg;
+      }
+    } else {
+      if (sf->y_step_q4 == 16) {
+        // No scaling in the y direction. Must always scale in the x direction.
+        sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_horiz;
+        sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_horiz;
+        sf->highbd_predict[0][1][0] = vpx_highbd_convolve8;
+        sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg;
+        sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz;
+        sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz;
+      } else {
+        // Must always scale in both directions.
+        sf->highbd_predict[0][0][0] = vpx_highbd_convolve8;
+        sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg;
+        sf->highbd_predict[0][1][0] = vpx_highbd_convolve8;
+        sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg;
+        sf->highbd_predict[1][0][0] = vpx_highbd_convolve8;
+        sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg;
+      }
+    }
+    // 2D subpel motion always gets filtered in both directions.
+    sf->highbd_predict[1][1][0] = vpx_highbd_convolve8;
+    sf->highbd_predict[1][1][1] = vpx_highbd_convolve8_avg;
+  }
+#endif
+}
diff --git a/libs/libvpx/vp9/common/vp9_scale.h b/libs/libvpx/vp9/common/vp9_scale.h
new file mode 100644
index 0000000000..5e91041079
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_scale.h
@@ -0,0 +1,75 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_SCALE_H_
+#define VP9_COMMON_VP9_SCALE_H_
+
+#include "vp9/common/vp9_mv.h"
+#include "vpx_dsp/vpx_convolve.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define REF_SCALE_SHIFT 14  // fractional bits of the fixed-point scale
+#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)  // scale of exactly 1.0
+#define REF_INVALID_SCALE (-1)  // stored when the reference size is unusable
+
+struct scale_factors {
+  int x_scale_fp;   // horizontal fixed point scale factor
+  int y_scale_fp;   // vertical fixed point scale factor
+  int x_step_q4;
+  int y_step_q4;
+
+  int (*scale_value_x)(int val, const struct scale_factors *sf);
+  int (*scale_value_y)(int val, const struct scale_factors *sf);
+
+  convolve_fn_t predict[2][2][2];  // horiz, vert, avg
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_convolve_fn_t highbd_predict[2][2][2];  // horiz, vert, avg
+#endif
+};
+
+MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
+
+#if CONFIG_VP9_HIGHBITDEPTH  // use_highbd != 0 also fills sf->highbd_predict
+void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
+                                       int other_w, int other_h,
+                                       int this_w, int this_h,
+                                       int use_highbd);
+#else
+void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
+                                       int other_w, int other_h,
+                                       int this_w, int this_h);
+#endif
+
+static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) {
+  return sf->x_scale_fp != REF_INVALID_SCALE &&
+         sf->y_scale_fp != REF_INVALID_SCALE;
+}
+
+static INLINE int vp9_is_scaled(const struct scale_factors *sf) {
+  return vp9_is_valid_scale(sf) &&
+         (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
+}
+
+static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
+                                      int this_width, int this_height) {
+  return 2 * this_width >= ref_width &&
+         2 * this_height >= ref_height &&
+         this_width <= 16 * ref_width &&
+         this_height <= 16 * ref_height;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_SCALE_H_
diff --git a/libs/libvpx/vp9/common/vp9_scan.c b/libs/libvpx/vp9/common/vp9_scan.c
new file mode 100644
index 0000000000..d6fb8b2d7b
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_scan.c
@@ -0,0 +1,727 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_scan.h"
+// Default 4x4 scan: entry i is the block position of the i-th coefficient.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = {
+  0,  4,  1,  5,
+  8,  2, 12,  9,
+  3,  6, 13, 10,
+  7, 14, 11, 15,
+};
+// 4x4 column-biased scan: positions 0, 4, 8, 12 are visited early.
+DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = {
+  0,  4,  8,  1,
+  12,  5,  9,  2,
+  13,  6, 10,  3,
+  7, 14, 11, 15,
+};
+// 4x4 row-biased scan: positions 0, 1, 2, 3 are visited early.
+DECLARE_ALIGNED(16, static const int16_t, row_scan_4x4[16]) = {
+  0,  1,  4,  2,
+  5,  3,  6,  8,
+  9,  7, 12, 10,
+  13, 11, 14, 15,
+};
+// Default 8x8 scan: entry i is the block position of the i-th coefficient.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
+  0,  8,  1, 16,  9,  2, 17, 24,
+  10,  3, 18, 25, 32, 11,  4, 26,
+  33, 19, 40, 12, 34, 27,  5, 41,
+  20, 48, 13, 35, 42, 28, 21,  6,
+  49, 56, 36, 43, 29,  7, 14, 50,
+  57, 44, 22, 37, 15, 51, 58, 30,
+  45, 23, 52, 59, 38, 31, 60, 53,
+  46, 39, 61, 54, 47, 62, 55, 63,
+};
+// 8x8 column-biased scan: positions 0, 8, 16, 24 are visited early.
+DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = {
+  0,  8, 16,  1, 24,  9, 32, 17,
+  2, 40, 25, 10, 33, 18, 48,  3,
+  26, 41, 11, 56, 19, 34,  4, 49,
+  27, 42, 12, 35, 20, 57, 50, 28,
+  5, 43, 13, 36, 58, 51, 21, 44,
+  6, 29, 59, 37, 14, 52, 22,  7,
+  45, 60, 30, 15, 38, 53, 23, 46,
+  31, 61, 39, 54, 47, 62, 55, 63,
+};
+// 8x8 row-biased scan: positions 0, 1, 2, 3 are visited early.
+DECLARE_ALIGNED(16, static const int16_t, row_scan_8x8[64]) = {
+  0,  1,  2,  8,  9,  3, 16, 10,
+  4, 17, 11, 24,  5, 18, 25, 12,
+  19, 26, 32,  6, 13, 20, 33, 27,
+  7, 34, 40, 21, 28, 41, 14, 35,
+  48, 42, 29, 36, 49, 22, 43, 15,
+  56, 37, 50, 44, 30, 57, 23, 51,
+  58, 45, 38, 52, 31, 59, 53, 46,
+  60, 39, 61, 47, 54, 55, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = {
+  0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
+  50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
+  98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
+  100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146,
+  55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25,
+  133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119,
+  26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194,
+  180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59,
+  12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13,
+  226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169,
+  242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108,
+  77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140,
+  230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141,
+  63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142,
+  219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159,
+  251,
+  190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239,
+  255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = {
+  0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
+  34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
+  67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
+  146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85,
+  22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179,
+  225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24,
+  87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227,
+  88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167,
+  213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229,
+  74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59,
+  200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170,
+  60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202,
+  233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125,
+  62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79,
+  126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205,
+  236,
+  159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239,
+  255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, row_scan_16x16[256]) = {
+  0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
+  49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
+  23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
+  25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100,
+  13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102,
+  144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160,
+  89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176,
+  75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136,
+  165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166,
+  167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108,
+  197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170,
+  124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186,
+  156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110,
+  157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111,
+  158,
+  188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220,
+  175,
+  190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254,
+  255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
+  0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160,
+  129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193,
+  68, 131, 37, 100,
+  225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38,
+  258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321,
+  102, 352, 8, 197,
+  71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292,
+  135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293,
+  41, 417, 199, 136,
+  262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105,
+  419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169,
+  295, 420, 106, 451,
+  481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421,
+  75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391,
+  453, 139, 44, 234,
+  484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108,
+  546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577,
+  486, 77, 204, 362,
+  608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173,
+  610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17,
+  111, 238, 48, 143,
+  80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51,
+  83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424,
+  393, 300, 269, 176, 145,
+  52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301,
+  270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581,
+  550, 519, 488, 457, 426, 395,
+  364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737,
+  706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241,
+  210, 179, 117, 86, 55, 738, 707,
+  614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491,
+  367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676,
+  645, 552, 521, 428, 397, 304,
+  273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553,
+  522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26,
+  864, 833, 802, 771, 740, 709,
+  678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306,
+  275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741,
+  710, 679, 617, 586, 555, 493,
+  462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835,
+  742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867,
+  743, 619, 495, 371, 247, 123,
+  896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680,
+  649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929,
+  898, 836, 805, 774, 712, 681,
+  650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154,
+  92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682,
+  651, 620, 589, 558, 527,
+  496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124,
+  93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590,
+  559, 497, 466, 435, 373,
+  342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715,
+  622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623,
+  499, 375, 251, 127,
+  900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560,
+  529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716,
+  685, 654, 592, 561,
+  530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903,
+  872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469,
+  438, 407, 376, 345,
+  314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718,
+  687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998,
+  967, 874, 843, 750,
+  719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503,
+  379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657,
+  564, 533, 440, 409,
+  316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534,
+  472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783,
+  752, 721, 690, 659,
+  628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970,
+  939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381,
+  350, 319, 1002, 971,
+  878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631,
+  507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568,
+  537, 444, 413, 972,
+  941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414,
+  1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601,
+  570, 539, 508, 477,
+  446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571,
+  509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479,
+  1007, 883, 759, 635, 511,
+  912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945,
+  914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915,
+  884, 853, 822, 791,
+  760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823,
+  761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607,
+  1011, 887, 763, 639,
+  916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825,
+  794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733,
+  702, 671, 1013, 982,
+  951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015,
+  891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798,
+  1016, 985, 954, 923,
+  892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863,
+  1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021,
+  990, 959, 1022, 991, 1023,
+};
+
+// Neighbor tables for the scans above. Each holds MAX_NEIGHBORS values per
+// entry (two, per the initializers), in scan order: entry i belongs to the
+// i-th scanned coefficient, and one extra zeroed entry ends each table.
+// Missing neighbors are stored as 0; no -1 marker is used in these tables.
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 1, 4, 4, 4, 1, 1, 8, 8, 5, 8, 2, 2, 2, 5, 9, 12, 6, 9,
+  3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 4, 4, 0, 0, 8, 8, 1, 1, 5, 5, 1, 1, 9, 9, 2, 2, 6, 6, 2, 2, 3,
+  3, 10, 10, 7, 7, 11, 11, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 1, 1, 4, 4, 2, 2, 5, 5, 4, 4, 8, 8, 6, 6, 8, 8, 9, 9, 12,
+  12, 10, 10, 13, 13, 14, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 8, 8, 0, 0, 16, 16, 1, 1, 24, 24, 9, 9, 1, 1, 32, 32, 17, 17, 2,
+  2, 25, 25, 10, 10, 40, 40, 2, 2, 18, 18, 33, 33, 3, 3, 48, 48, 11, 11, 26,
+  26, 3, 3, 41, 41, 19, 19, 34, 34, 4, 4, 27, 27, 12, 12, 49, 49, 42, 42, 20,
+  20, 4, 4, 35, 35, 5, 5, 28, 28, 50, 50, 43, 43, 13, 13, 36, 36, 5, 5, 21, 21,
+  51, 51, 29, 29, 6, 6, 44, 44, 14, 14, 6, 6, 37, 37, 52, 52, 22, 22, 7, 7, 30,
+  30, 45, 45, 15, 15, 38, 38, 23, 23, 53, 53, 31, 31, 46, 46, 39, 39, 54, 54,
+  47, 47, 55, 55, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 0, 0, 8, 8, 2, 2, 8, 8, 9, 9, 3, 3, 16, 16, 10, 10, 16, 16,
+  4, 4, 17, 17, 24, 24, 11, 11, 18, 18, 25, 25, 24, 24, 5, 5, 12, 12, 19, 19,
+  32, 32, 26, 26, 6, 6, 33, 33, 32, 32, 20, 20, 27, 27, 40, 40, 13, 13, 34, 34,
+  40, 40, 41, 41, 28, 28, 35, 35, 48, 48, 21, 21, 42, 42, 14, 14, 48, 48, 36,
+  36, 49, 49, 43, 43, 29, 29, 56, 56, 22, 22, 50, 50, 57, 57, 44, 44, 37, 37,
+  51, 51, 30, 30, 58, 58, 52, 52, 45, 45, 59, 59, 38, 38, 60, 60, 46, 46, 53,
+  53, 54, 54, 61, 61, 62, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 9, 16, 16, 16, 2, 9, 2, 2, 10, 17, 17,
+  24, 24, 24, 3, 10, 3, 3, 18, 25, 25, 32, 11, 18, 32, 32, 4, 11, 26, 33, 19,
+  26, 4, 4, 33, 40, 12, 19, 40, 40, 5, 12, 27, 34, 34, 41, 20, 27, 13, 20, 5,
+  5, 41, 48, 48, 48, 28, 35, 35, 42, 21, 28, 6, 6, 6, 13, 42, 49, 49, 56, 36,
+  43, 14, 21, 29, 36, 7, 14, 43, 50, 50, 57, 22, 29, 37, 44, 15, 22, 44, 51,
+  51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45, 31, 38, 53, 60, 46, 53, 39,
+  46, 54, 61, 47, 54, 55, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 48, 48, 1, 1, 64, 64,
+  17, 17, 80, 80, 33, 33, 1, 1, 49, 49, 96, 96, 2, 2, 65, 65,
+  18, 18, 112, 112, 34, 34, 81, 81, 2, 2, 50, 50, 128, 128, 3, 3,
+  97, 97, 19, 19, 66, 66, 144, 144, 82, 82, 35, 35, 113, 113, 3, 3,
+  51, 51, 160, 160, 4, 4, 98, 98, 129, 129, 67, 67, 20, 20, 83, 83,
+  114, 114, 36, 36, 176, 176, 4, 4, 145, 145, 52, 52, 99, 99, 5, 5,
+  130, 130, 68, 68, 192, 192, 161, 161, 21, 21, 115, 115, 84, 84, 37, 37,
+  146, 146, 208, 208, 53, 53, 5, 5, 100, 100, 177, 177, 131, 131, 69, 69,
+  6, 6, 224, 224, 116, 116, 22, 22, 162, 162, 85, 85, 147, 147, 38, 38,
+  193, 193, 101, 101, 54, 54, 6, 6, 132, 132, 178, 178, 70, 70, 163, 163,
+  209, 209, 7, 7, 117, 117, 23, 23, 148, 148, 7, 7, 86, 86, 194, 194,
+  225, 225, 39, 39, 179, 179, 102, 102, 133, 133, 55, 55, 164, 164, 8, 8,
+  71, 71, 210, 210, 118, 118, 149, 149, 195, 195, 24, 24, 87, 87, 40, 40,
+  56, 56, 134, 134, 180, 180, 226, 226, 103, 103, 8, 8, 165, 165, 211, 211,
+  72, 72, 150, 150, 9, 9, 119, 119, 25, 25, 88, 88, 196, 196, 41, 41,
+  135, 135, 181, 181, 104, 104, 57, 57, 227, 227, 166, 166, 120, 120, 151, 151,
+  197, 197, 73, 73, 9, 9, 212, 212, 89, 89, 136, 136, 182, 182, 10, 10,
+  26, 26, 105, 105, 167, 167, 228, 228, 152, 152, 42, 42, 121, 121, 213, 213,
+  58, 58, 198, 198, 74, 74, 137, 137, 183, 183, 168, 168, 10, 10, 90, 90,
+  229, 229, 11, 11, 106, 106, 214, 214, 153, 153, 27, 27, 199, 199, 43, 43,
+  184, 184, 122, 122, 169, 169, 230, 230, 59, 59, 11, 11, 75, 75, 138, 138,
+  200, 200, 215, 215, 91, 91, 12, 12, 28, 28, 185, 185, 107, 107, 154, 154,
+  44, 44, 231, 231, 216, 216, 60, 60, 123, 123, 12, 12, 76, 76, 201, 201,
+  170, 170, 232, 232, 139, 139, 92, 92, 13, 13, 108, 108, 29, 29, 186, 186,
+  217, 217, 155, 155, 45, 45, 13, 13, 61, 61, 124, 124, 14, 14, 233, 233,
+  77, 77, 14, 14, 171, 171, 140, 140, 202, 202, 30, 30, 93, 93, 109, 109,
+  46, 46, 156, 156, 62, 62, 187, 187, 15, 15, 125, 125, 218, 218, 78, 78,
+  31, 31, 172, 172, 47, 47, 141, 141, 94, 94, 234, 234, 203, 203, 63, 63,
+  110, 110, 188, 188, 157, 157, 126, 126, 79, 79, 173, 173, 95, 95, 219, 219,
+  142, 142, 204, 204, 235, 235, 111, 111, 158, 158, 127, 127, 189, 189, 220,
+  220, 143, 143, 174, 174, 205, 205, 236, 236, 159, 159, 190, 190, 221, 221,
+  175, 175, 237, 237, 206, 206, 222, 222, 191, 191, 238, 238, 207, 207, 223,
+  223, 239, 239, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 16, 16, 3, 3, 17, 17,
+  16, 16, 4, 4, 32, 32, 18, 18, 5, 5, 33, 33, 32, 32, 19, 19,
+  48, 48, 6, 6, 34, 34, 20, 20, 49, 49, 48, 48, 7, 7, 35, 35,
+  64, 64, 21, 21, 50, 50, 36, 36, 64, 64, 8, 8, 65, 65, 51, 51,
+  22, 22, 37, 37, 80, 80, 66, 66, 9, 9, 52, 52, 23, 23, 81, 81,
+  67, 67, 80, 80, 38, 38, 10, 10, 53, 53, 82, 82, 96, 96, 68, 68,
+  24, 24, 97, 97, 83, 83, 39, 39, 96, 96, 54, 54, 11, 11, 69, 69,
+  98, 98, 112, 112, 84, 84, 25, 25, 40, 40, 55, 55, 113, 113, 99, 99,
+  12, 12, 70, 70, 112, 112, 85, 85, 26, 26, 114, 114, 100, 100, 128, 128,
+  41, 41, 56, 56, 71, 71, 115, 115, 13, 13, 86, 86, 129, 129, 101, 101,
+  128, 128, 72, 72, 130, 130, 116, 116, 27, 27, 57, 57, 14, 14, 87, 87,
+  42, 42, 144, 144, 102, 102, 131, 131, 145, 145, 117, 117, 73, 73, 144, 144,
+  88, 88, 132, 132, 103, 103, 28, 28, 58, 58, 146, 146, 118, 118, 43, 43,
+  160, 160, 147, 147, 89, 89, 104, 104, 133, 133, 161, 161, 119, 119, 160, 160,
+  74, 74, 134, 134, 148, 148, 29, 29, 59, 59, 162, 162, 176, 176, 44, 44,
+  120, 120, 90, 90, 105, 105, 163, 163, 177, 177, 149, 149, 176, 176, 135, 135,
+  164, 164, 178, 178, 30, 30, 150, 150, 192, 192, 75, 75, 121, 121, 60, 60,
+  136, 136, 193, 193, 106, 106, 151, 151, 179, 179, 192, 192, 45, 45, 165, 165,
+  166, 166, 194, 194, 91, 91, 180, 180, 137, 137, 208, 208, 122, 122, 152, 152,
+  208, 208, 195, 195, 76, 76, 167, 167, 209, 209, 181, 181, 224, 224, 107, 107,
+  196, 196, 61, 61, 153, 153, 224, 224, 182, 182, 168, 168, 210, 210, 46, 46,
+  138, 138, 92, 92, 183, 183, 225, 225, 211, 211, 240, 240, 197, 197, 169, 169,
+  123, 123, 154, 154, 198, 198, 77, 77, 212, 212, 184, 184, 108, 108, 226, 226,
+  199, 199, 62, 62, 227, 227, 241, 241, 139, 139, 213, 213, 170, 170, 185, 185,
+  155, 155, 228, 228, 242, 242, 124, 124, 93, 93, 200, 200, 243, 243, 214, 214,
+  215, 215, 229, 229, 140, 140, 186, 186, 201, 201, 78, 78, 171, 171, 109, 109,
+  156, 156, 244, 244, 216, 216, 230, 230, 94, 94, 245, 245, 231, 231, 125, 125,
+  202, 202, 246, 246, 232, 232, 172, 172, 217, 217, 141, 141, 110, 110, 157,
+  157, 187, 187, 247, 247, 126, 126, 233, 233, 218, 218, 248, 248, 188, 188,
+  203, 203, 142, 142, 173, 173, 158, 158, 249, 249, 234, 234, 204, 204, 219,
+  219, 174, 174, 189, 189, 250, 250, 220, 220, 190, 190, 205, 205, 235, 235,
+  206, 206, 236, 236, 251, 251, 221, 221, 252, 252, 222, 222, 237, 237, 238,
+  238, 253, 253, 254, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 32, 32, 17, 32,
+  2, 17, 2, 2, 48, 48, 18, 33, 33, 48, 3, 18, 49, 64, 64, 64,
+  34, 49, 3, 3, 19, 34, 50, 65, 4, 19, 65, 80, 80, 80, 35, 50,
+  4, 4, 20, 35, 66, 81, 81, 96, 51, 66, 96, 96, 5, 20, 36, 51,
+  82, 97, 21, 36, 67, 82, 97, 112, 5, 5, 52, 67, 112, 112, 37, 52,
+  6, 21, 83, 98, 98, 113, 68, 83, 6, 6, 113, 128, 22, 37, 53, 68,
+  84, 99, 99, 114, 128, 128, 114, 129, 69, 84, 38, 53, 7, 22, 7, 7,
+  129, 144, 23, 38, 54, 69, 100, 115, 85, 100, 115, 130, 144, 144, 130, 145,
+  39, 54, 70, 85, 8, 23, 55, 70, 116, 131, 101, 116, 145, 160, 24, 39,
+  8, 8, 86, 101, 131, 146, 160, 160, 146, 161, 71, 86, 40, 55, 9, 24,
+  117, 132, 102, 117, 161, 176, 132, 147, 56, 71, 87, 102, 25, 40, 147, 162,
+  9, 9, 176, 176, 162, 177, 72, 87, 41, 56, 118, 133, 133, 148, 103, 118,
+  10, 25, 148, 163, 57, 72, 88, 103, 177, 192, 26, 41, 163, 178, 192, 192,
+  10, 10, 119, 134, 73, 88, 149, 164, 104, 119, 134, 149, 42, 57, 178, 193,
+  164, 179, 11, 26, 58, 73, 193, 208, 89, 104, 135, 150, 120, 135, 27, 42,
+  74, 89, 208, 208, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43, 58,
+  11, 11, 136, 151, 90, 105, 151, 166, 180, 195, 59, 74, 121, 136, 209, 224,
+  195, 210, 224, 224, 166, 181, 106, 121, 75, 90, 12, 27, 181, 196, 12, 12,
+  210, 225, 152, 167, 167, 182, 137, 152, 28, 43, 196, 211, 122, 137, 91, 106,
+  225, 240, 44, 59, 13, 28, 107, 122, 182, 197, 168, 183, 211, 226, 153, 168,
+  226, 241, 60, 75, 197, 212, 138, 153, 29, 44, 76, 91, 13, 13, 183, 198,
+  123, 138, 45, 60, 212, 227, 198, 213, 154, 169, 169, 184, 227, 242, 92, 107,
+  61, 76, 139, 154, 14, 29, 14, 14, 184, 199, 213, 228, 108, 123, 199, 214,
+  228, 243, 77, 92, 30, 45, 170, 185, 155, 170, 185, 200, 93, 108, 124, 139,
+  214, 229, 46, 61, 200, 215, 229, 244, 15, 30, 109, 124, 62, 77, 140, 155,
+  215, 230, 31, 46, 171, 186, 186, 201, 201, 216, 78, 93, 230, 245, 125, 140,
+  47, 62, 216, 231, 156, 171, 94, 109, 231, 246, 141, 156, 63, 78, 202, 217,
+  187, 202, 110, 125, 217, 232, 172, 187, 232, 247, 79, 94, 157, 172, 126, 141,
+  203, 218, 95, 110, 233, 248, 218, 233, 142, 157, 111, 126, 173, 188, 188, 203,
+  234, 249, 219, 234, 127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235,
+  250, 174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190, 190, 205,
+  236, 251, 206, 221, 237, 252, 191, 206, 222, 237, 207, 222, 238, 253, 223,
+  238, 239, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 64, 64, 33, 64,
+  2, 33, 96, 96, 2, 2, 65, 96, 34, 65, 128, 128, 97, 128, 3, 34,
+  66, 97, 3, 3, 35, 66, 98, 129, 129, 160, 160, 160, 4, 35, 67, 98,
+  192, 192, 4, 4, 130, 161, 161, 192, 36, 67, 99, 130, 5, 36, 68, 99,
+  193, 224, 162, 193, 224, 224, 131, 162, 37, 68, 100, 131, 5, 5, 194, 225,
+  225, 256, 256, 256, 163, 194, 69, 100, 132, 163, 6, 37, 226, 257, 6, 6,
+  195, 226, 257, 288, 101, 132, 288, 288, 38, 69, 164, 195, 133, 164, 258, 289,
+  227, 258, 196, 227, 7, 38, 289, 320, 70, 101, 320, 320, 7, 7, 165, 196,
+  39, 70, 102, 133, 290, 321, 259, 290, 228, 259, 321, 352, 352, 352, 197, 228,
+  134, 165, 71, 102, 8, 39, 322, 353, 291, 322, 260, 291, 103, 134, 353, 384,
+  166, 197, 229, 260, 40, 71, 8, 8, 384, 384, 135, 166, 354, 385, 323, 354,
+  198, 229, 292, 323, 72, 103, 261, 292, 9, 40, 385, 416, 167, 198, 104, 135,
+  230, 261, 355, 386, 416, 416, 293, 324, 324, 355, 9, 9, 41, 72, 386, 417,
+  199, 230, 136, 167, 417, 448, 262, 293, 356, 387, 73, 104, 387, 418, 231, 262,
+  10, 41, 168, 199, 325, 356, 418, 449, 105, 136, 448, 448, 42, 73, 294, 325,
+  200, 231, 10, 10, 357, 388, 137, 168, 263, 294, 388, 419, 74, 105, 419, 450,
+  449, 480, 326, 357, 232, 263, 295, 326, 169, 200, 11, 42, 106, 137, 480, 480,
+  450, 481, 358, 389, 264, 295, 201, 232, 138, 169, 389, 420, 43, 74, 420, 451,
+  327, 358, 11, 11, 481, 512, 233, 264, 451, 482, 296, 327, 75, 106, 170, 201,
+  482, 513, 512, 512, 390, 421, 359, 390, 421, 452, 107, 138, 12, 43, 202, 233,
+  452, 483, 265, 296, 328, 359, 139, 170, 44, 75, 483, 514, 513, 544, 234, 265,
+  297, 328, 422, 453, 12, 12, 391, 422, 171, 202, 76, 107, 514, 545, 453, 484,
+  544, 544, 266, 297, 203, 234, 108, 139, 329, 360, 298, 329, 140, 171, 515,
+  546, 13, 44, 423, 454, 235, 266, 545, 576, 454, 485, 45, 76, 172, 203, 330,
+  361, 576, 576, 13, 13, 267, 298, 546, 577, 77, 108, 204, 235, 455, 486, 577,
+  608, 299, 330, 109, 140, 547, 578, 14, 45, 14, 14, 141, 172, 578, 609, 331,
+  362, 46, 77, 173, 204, 15, 15, 78, 109, 205, 236, 579, 610, 110, 141, 15, 46,
+  142, 173, 47, 78, 174, 205, 16, 16, 79, 110, 206, 237, 16, 47, 111, 142,
+  48, 79, 143, 174, 80, 111, 175, 206, 17, 48, 17, 17, 207, 238, 49, 80,
+  81, 112, 18, 18, 18, 49, 50, 81, 82, 113, 19, 50, 51, 82, 83, 114, 608, 608,
+  484, 515, 360, 391, 236, 267, 112, 143, 19, 19, 640, 640, 609, 640, 516, 547,
+  485, 516, 392, 423, 361, 392, 268, 299, 237, 268, 144, 175, 113, 144, 20, 51,
+  20, 20, 672, 672, 641, 672, 610, 641, 548, 579, 517, 548, 486, 517, 424, 455,
+  393, 424, 362, 393, 300, 331, 269, 300, 238, 269, 176, 207, 145, 176, 114,
+  145, 52, 83, 21, 52, 21, 21, 704, 704, 673, 704, 642, 673, 611, 642, 580,
+  611, 549, 580, 518, 549, 487, 518, 456, 487, 425, 456, 394, 425, 363, 394,
+  332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208, 146, 177, 115,
+  146, 84, 115, 53, 84, 22, 53, 22, 22, 705, 736, 674, 705, 643, 674, 581, 612,
+  550, 581, 519, 550, 457, 488, 426, 457, 395, 426, 333, 364, 302, 333, 271,
+  302, 209, 240, 178, 209, 147, 178, 85, 116, 54, 85, 23, 54, 706, 737, 675,
+  706, 582, 613, 551, 582, 458, 489, 427, 458, 334, 365, 303, 334, 210, 241,
+  179, 210, 86, 117, 55, 86, 707, 738, 583, 614, 459, 490, 335, 366, 211, 242,
+  87, 118, 736, 736, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147, 23, 23,
+  768, 768, 737, 768, 644, 675, 613, 644, 520, 551, 489, 520, 396, 427, 365,
+  396, 272, 303, 241, 272, 148, 179, 117, 148, 24, 55, 24, 24, 800, 800, 769,
+  800, 738, 769, 676, 707, 645, 676, 614, 645, 552, 583, 521, 552, 490, 521,
+  428, 459, 397, 428, 366, 397, 304, 335, 273, 304, 242, 273, 180, 211, 149,
+  180, 118, 149, 56, 87, 25, 56, 25, 25, 832, 832, 801, 832, 770, 801, 739,
+  770, 708, 739, 677, 708, 646, 677, 615, 646, 584, 615, 553, 584, 522, 553,
+  491, 522, 460, 491, 429, 460, 398, 429, 367, 398, 336, 367, 305, 336, 274,
+  305, 243, 274, 212, 243, 181, 212, 150, 181, 119, 150, 88, 119, 57, 88, 26,
+  57, 26, 26, 833, 864, 802, 833, 771, 802, 709, 740, 678, 709, 647, 678, 585,
+  616, 554, 585, 523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337,
+  275, 306, 213, 244, 182, 213, 151, 182, 89, 120, 58, 89, 27, 58, 834, 865,
+  803, 834, 710, 741, 679, 710, 586, 617, 555, 586, 462, 493, 431, 462, 338,
+  369, 307, 338, 214, 245, 183, 214, 90, 121, 59, 90, 835, 866, 711, 742, 587,
+  618, 463, 494, 339, 370, 215, 246, 91, 122, 864, 864, 740, 771, 616, 647,
+  492, 523, 368, 399, 244, 275, 120, 151, 27, 27, 896, 896, 865, 896, 772, 803,
+  741, 772, 648, 679, 617, 648, 524, 555, 493, 524, 400, 431, 369, 400, 276,
+  307, 245, 276, 152, 183, 121, 152, 28, 59, 28, 28, 928, 928, 897, 928, 866,
+  897, 804, 835, 773, 804, 742, 773, 680, 711, 649, 680, 618, 649, 556, 587,
+  525, 556, 494, 525, 432, 463, 401, 432, 370, 401, 308, 339, 277, 308, 246,
+  277, 184, 215, 153, 184, 122, 153, 60, 91, 29, 60, 29, 29, 960, 960, 929,
+  960, 898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774, 712, 743,
+  681, 712, 650, 681, 619, 650, 588, 619, 557, 588, 526, 557, 495, 526, 464,
+  495, 433, 464, 402, 433, 371, 402, 340, 371, 309, 340, 278, 309, 247, 278,
+  216, 247, 185, 216, 154, 185, 123, 154, 92, 123, 61, 92, 30, 61, 30, 30,
+  961, 992, 930, 961, 899, 930, 837, 868, 806, 837, 775, 806, 713, 744, 682,
+  713, 651, 682, 589, 620, 558, 589, 527, 558, 465, 496, 434, 465, 403, 434,
+  341, 372, 310, 341, 279, 310, 217, 248, 186, 217, 155, 186, 93, 124, 62, 93,
+  31, 62, 962, 993, 931, 962, 838, 869, 807, 838, 714, 745, 683, 714, 590, 621,
+  559, 590, 466, 497, 435, 466, 342, 373, 311, 342, 218, 249, 187, 218, 94,
+  125, 63, 94, 963, 994, 839, 870, 715, 746, 591, 622, 467, 498, 343, 374, 219,
+  250, 95, 126, 868, 899, 744, 775, 620, 651, 496, 527, 372, 403, 248, 279,
+  124, 155, 900, 931, 869, 900, 776, 807, 745, 776, 652, 683, 621, 652, 528,
+  559, 497, 528, 404, 435, 373, 404, 280, 311, 249, 280, 156, 187, 125, 156,
+  932, 963, 901, 932, 870, 901, 808, 839, 777, 808, 746, 777, 684, 715, 653,
+  684, 622, 653, 560, 591, 529, 560, 498, 529, 436, 467, 405, 436, 374, 405,
+  312, 343, 281, 312, 250, 281, 188, 219, 157, 188, 126, 157, 964, 995, 933,
+  964, 902, 933, 871, 902, 840, 871, 809, 840, 778, 809, 747, 778, 716, 747,
+  685, 716, 654, 685, 623, 654, 592, 623, 561, 592, 530, 561, 499, 530, 468,
+  499, 437, 468, 406, 437, 375, 406, 344, 375, 313, 344, 282, 313, 251, 282,
+  220, 251, 189, 220, 158, 189, 127, 158, 965, 996, 934, 965, 903, 934, 841,
+  872, 810, 841, 779, 810, 717, 748, 686, 717, 655, 686, 593, 624, 562, 593,
+  531, 562, 469, 500, 438, 469, 407, 438, 345, 376, 314, 345, 283, 314, 221,
+  252, 190, 221, 159, 190, 966, 997, 935, 966, 842, 873, 811, 842, 718, 749,
+  687, 718, 594, 625, 563, 594, 470, 501, 439, 470, 346, 377, 315, 346, 222,
+  253, 191, 222, 967, 998, 843, 874, 719, 750, 595, 626, 471, 502, 347, 378,
+  223, 254, 872, 903, 748, 779, 624, 655, 500, 531, 376, 407, 252, 283, 904,
+  935, 873, 904, 780, 811, 749, 780, 656, 687, 625, 656, 532, 563, 501, 532,
+  408, 439, 377, 408, 284, 315, 253, 284, 936, 967, 905, 936, 874, 905, 812,
+  843, 781, 812, 750, 781, 688, 719, 657, 688, 626, 657, 564, 595, 533, 564,
+  502, 533, 440, 471, 409, 440, 378, 409, 316, 347, 285, 316, 254, 285, 968,
+  999, 937, 968, 906, 937, 875, 906, 844, 875, 813, 844, 782, 813, 751, 782,
+  720, 751, 689, 720, 658, 689, 627, 658, 596, 627, 565, 596, 534, 565, 503,
+  534, 472, 503, 441, 472, 410, 441, 379, 410, 348, 379, 317, 348, 286, 317,
+  255, 286, 969, 1000, 938, 969, 907, 938, 845, 876, 814, 845, 783, 814, 721,
+  752, 690, 721, 659, 690, 597, 628, 566, 597, 535, 566, 473, 504, 442, 473,
+  411, 442, 349, 380, 318, 349, 287, 318, 970, 1001, 939, 970, 846, 877, 815,
+  846, 722, 753, 691, 722, 598, 629, 567, 598, 474, 505, 443, 474, 350, 381,
+  319, 350, 971, 1002, 847, 878, 723, 754, 599, 630, 475, 506, 351, 382, 876,
+  907, 752, 783, 628, 659, 504, 535, 380, 411, 908, 939, 877, 908, 784, 815,
+  753, 784, 660, 691, 629, 660, 536, 567, 505, 536, 412, 443, 381, 412, 940,
+  971, 909, 940, 878, 909, 816, 847, 785, 816, 754, 785, 692, 723, 661, 692,
+  630, 661, 568, 599, 537, 568, 506, 537, 444, 475, 413, 444, 382, 413, 972,
+  1003, 941, 972, 910, 941, 879, 910, 848, 879, 817, 848, 786, 817, 755, 786,
+  724, 755, 693, 724, 662, 693, 631, 662, 600, 631, 569, 600, 538, 569, 507,
+  538, 476, 507, 445, 476, 414, 445, 383, 414, 973, 1004, 942, 973, 911, 942,
+  849, 880, 818, 849, 787, 818, 725, 756, 694, 725, 663, 694, 601, 632, 570,
+  601, 539, 570, 477, 508, 446, 477, 415, 446, 974, 1005, 943, 974, 850, 881,
+  819, 850, 726, 757, 695, 726, 602, 633, 571, 602, 478, 509, 447, 478, 975,
+  1006, 851, 882, 727, 758, 603, 634, 479, 510, 880, 911, 756, 787, 632, 663,
+  508, 539, 912, 943, 881, 912, 788, 819, 757, 788, 664, 695, 633, 664, 540,
+  571, 509, 540, 944, 975, 913, 944, 882, 913, 820, 851, 789, 820, 758, 789,
+  696, 727, 665, 696, 634, 665, 572, 603, 541, 572, 510, 541, 976, 1007, 945,
+  976, 914, 945, 883, 914, 852, 883, 821, 852, 790, 821, 759, 790, 728, 759,
+  697, 728, 666, 697, 635, 666, 604, 635, 573, 604, 542, 573, 511, 542, 977,
+  1008, 946, 977, 915, 946, 853, 884, 822, 853, 791, 822, 729, 760, 698, 729,
+  667, 698, 605, 636, 574, 605, 543, 574, 978, 1009, 947, 978, 854, 885, 823,
+  854, 730, 761, 699, 730, 606, 637, 575, 606, 979, 1010, 855, 886, 731, 762,
+  607, 638, 884, 915, 760, 791, 636, 667, 916, 947, 885, 916, 792, 823, 761,
+  792, 668, 699, 637, 668, 948, 979, 917, 948, 886, 917, 824, 855, 793, 824,
+  762, 793, 700, 731, 669, 700, 638, 669, 980, 1011, 949, 980, 918, 949, 887,
+  918, 856, 887, 825, 856, 794, 825, 763, 794, 732, 763, 701, 732, 670, 701,
+  639, 670, 981, 1012, 950, 981, 919, 950, 857, 888, 826, 857, 795, 826, 733,
+  764, 702, 733, 671, 702, 982, 1013, 951, 982, 858, 889, 827, 858, 734, 765,
+  703, 734, 983, 1014, 859, 890, 735, 766, 888, 919, 764, 795, 920, 951, 889,
+  920, 796, 827, 765, 796, 952, 983, 921, 952, 890, 921, 828, 859, 797, 828,
+  766, 797, 984, 1015, 953, 984, 922, 953, 891, 922, 860, 891, 829, 860, 798,
+  829, 767, 798, 985, 1016, 954, 985, 923, 954, 861, 892, 830, 861, 799, 830,
+  986, 1017, 955, 986, 862, 893, 831, 862, 987, 1018, 863, 894, 892, 923, 924,
+  955, 893, 924, 956, 987, 925, 956, 894, 925, 988, 1019, 957, 988, 926, 957,
+  895, 926, 989, 1020, 958, 989, 927, 958, 990, 1021, 959, 990, 991, 1022, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_4x4[16]) = {
+  0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
+};  // iscan for default_scan_4x4 (presumably its inverse map -- confirm).
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_4x4[16]) = {
+  0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
+};  // iscan for col_scan_4x4 (presumably its inverse map -- confirm).
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_4x4[16]) = {
+  0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
+};  // iscan for row_scan_4x4 (presumably its inverse map -- confirm).
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_8x8[64]) = {
+  0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51,
+  2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56,
+  6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60,
+  14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63,
+};  // iscan for col_scan_8x8 (presumably its inverse map -- confirm).
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_8x8[64]) = {
+  0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39,
+  6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52,
+  18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59,
+  32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63,
+};  // iscan for row_scan_8x8 (presumably its inverse map -- confirm).
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_8x8[64]) = {
+  0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44,
+  3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53,
+  12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60,
+  25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
+};  // iscan for default_scan_8x8 (presumably its inverse map -- confirm).
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_16x16[256]) = {
+  0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198,
+  1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212,
+  2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216,
+  3, 10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218,
+  5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223,
+  7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 166, 182, 200, 215, 228,
+  9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230,
+  13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235,
+  17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237,
+  22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240,
+  27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244,
+  33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247,
+  42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251,
+  50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253,
+  57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254,
+  65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255,
+};  // iscan for col_scan_16x16 (presumably its inverse map -- confirm).
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_16x16[256]) = {
+  0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76, 86,
+  3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99, 115, 130,
+  8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103, 119, 142, 167,
+  14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100, 116, 135, 161, 185,
+  21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94, 112, 133, 154, 179, 205,
+  28, 34, 39, 45, 50, 58, 67, 77, 87, 96, 106, 121, 146, 169, 196, 212,
+  41, 46, 49, 56, 63, 70, 79, 90, 98, 107, 122, 138, 159, 182, 207, 222,
+  52, 57, 62, 69, 75, 83, 93, 102, 110, 120, 134, 150, 176, 195, 215, 226,
+  66, 71, 78, 82, 91, 97, 108, 113, 127, 136, 148, 168, 188, 202, 221, 232,
+  80, 89, 92, 101, 105, 114, 125, 131, 139, 151, 162, 177, 192, 208, 223, 234,
+  95, 104, 109, 117, 123, 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239,
+  111, 118, 124, 129, 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240,
+  243, 126, 132, 137, 145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237,
+  244, 246, 141, 149, 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238,
+  242, 249, 251, 152, 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236,
+  245, 247, 252, 253, 158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235,
+  241, 248, 250, 254, 255,
+};  // iscan for row_scan_16x16 (presumably its inverse map -- confirm).
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_16x16[256]) = {
+  0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166, 179,
+  1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154, 178, 196,
+  3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148, 164, 186, 201,
+  6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127, 153, 169, 193, 208,
+  10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114, 133, 161, 176, 198, 214,
+  15, 21, 26, 34, 43, 52, 65, 77, 91, 106, 120, 140, 165, 185, 205, 221,
+  22, 27, 32, 41, 48, 60, 73, 85, 99, 116, 130, 151, 175, 190, 211, 225,
+  29, 35, 42, 49, 59, 69, 81, 95, 108, 125, 139, 155, 182, 197, 217, 229,
+  38, 45, 51, 61, 68, 80, 93, 105, 118, 134, 150, 168, 191, 207, 223, 234,
+  50, 56, 63, 74, 83, 94, 109, 117, 129, 147, 163, 177, 199, 213, 228, 238,
+  62, 70, 76, 87, 97, 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242,
+  75, 82, 90, 102, 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245,
+  89, 100, 111, 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250,
+  103, 115, 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248,
+  252, 121, 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244,
+  251, 254, 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247,
+  249, 253, 255,
+};  // iscan for default_scan_16x16 (presumably its inverse map -- confirm).
+
+DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = {
+  0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, 170, 193, 204,
+  210, 219, 229, 233, 245, 257, 275, 299, 342, 356, 377, 405, 455, 471, 495,
+  527, 1, 4, 8, 15, 22, 30, 45, 58, 74, 92, 112, 133, 158, 184, 203, 215, 222,
+  228, 234, 237, 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551,
+  3, 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189, 208, 217, 224,
+  231, 235, 238, 273, 297, 316, 329, 375, 403, 425, 440, 493, 525, 550, 567,
+  6, 11, 16, 23, 31, 43, 60, 73, 90, 109, 126, 150, 173, 196, 211, 220, 226,
+  232, 236, 239, 296, 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575,
+  9, 14, 19, 29, 37, 50, 65, 78, 95, 116, 134, 157, 179, 201, 214, 223, 244,
+  255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523, 582, 596, 617, 645,
+  13, 20, 26, 35, 44, 54, 72, 85, 105, 123, 140, 163, 182, 205, 216, 225,
+  254, 271, 294, 314, 353, 373, 400, 423, 468, 491, 522, 548, 595, 616, 644,
+  666, 21, 27, 33, 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227,
+  270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615, 643, 665,
+  680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139, 159, 178, 197, 212, 221, 230,
+  292, 312, 326, 334, 398, 421, 437, 446, 520, 546, 564, 574, 642, 664, 679,
+  687, 34, 40, 46, 56, 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291,
+  340, 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705, 723,
+  747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177, 194, 252, 268, 290,
+  311, 351, 370, 396, 420, 466, 488, 518, 545, 593, 613, 640, 663, 704, 722,
+  746, 765, 51, 59, 66, 76, 89, 99, 119, 131, 149, 168, 181, 200, 267, 289,
+  310, 325, 369, 395, 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721,
+  745, 764, 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207, 288,
+  309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638, 661, 677, 686,
+  744, 763, 776, 783, 70, 79, 86, 97, 108, 122, 137, 155, 242, 251, 266, 287,
+  339, 350, 368, 393, 452, 465, 486, 515, 580, 592, 611, 637, 692, 703, 720,
+  743, 788, 798, 813, 833, 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286,
+  308, 349, 367, 392, 417, 464, 485, 514, 542, 591, 610, 636, 660, 702, 719,
+  742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169, 185, 264,
+  285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561, 609, 635, 659, 676,
+  718, 741, 761, 775, 811, 831, 847, 858, 117, 128, 136, 148, 160, 175, 188,
+  198, 284, 306, 322, 332, 390, 415, 433, 444, 512, 540, 560, 572, 634, 658,
+  675, 685, 740, 760, 774, 782, 830, 846, 857, 863, 135, 146, 152, 165, 241,
+  249, 263, 283, 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633,
+  691, 701, 717, 739, 787, 796, 810, 829, 867, 875, 887, 903, 153, 166, 174,
+  183, 248, 262, 282, 305, 347, 364, 388, 414, 462, 482, 510, 539, 589, 607,
+  632, 657, 700, 716, 738, 759, 795, 809, 828, 845, 874, 886, 902, 915, 176,
+  187, 195, 202, 261, 281, 304, 321, 363, 387, 413, 432, 481, 509, 538, 559,
+  606, 631, 656, 674, 715, 737, 758, 773, 808, 827, 844, 856, 885, 901, 914,
+  923, 192, 199, 206, 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537,
+  558, 571, 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900,
+  913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461, 480, 507,
+  578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807, 825, 866, 873, 884,
+  899, 930, 936, 945, 957, 246, 259, 278, 302, 345, 361, 384, 411, 460, 479,
+  506, 536, 587, 604, 628, 654, 698, 713, 734, 756, 793, 806, 824, 842, 872,
+  883, 898, 912, 935, 944, 956, 966, 258, 277, 301, 319, 360, 383, 410, 430,
+  478, 505, 535, 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841,
+  854, 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382, 409,
+  429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754, 770, 780, 822,
+  840, 853, 861, 896, 910, 920, 926, 954, 964, 971, 975, 336, 344, 359, 381,
+  449, 459, 477, 503, 577, 586, 602, 625, 689, 697, 711, 731, 785, 792, 804,
+  821, 865, 871, 881, 895, 929, 934, 942, 953, 977, 981, 987, 995, 343, 358,
+  380, 408, 458, 476, 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791,
+  803, 820, 839, 870, 880, 894, 909, 933, 941, 952, 963, 980, 986, 994, 1001,
+  357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709, 729, 752,
+  769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951, 962, 970, 985, 993,
+  1000, 1005, 378, 406, 427, 441, 500, 531, 554, 569, 622, 649, 669, 682, 728,
+  751, 768, 779, 818, 837, 851, 860, 892, 907, 918, 925, 950, 961, 969, 974,
+  992, 999, 1004, 1007, 448, 457, 474, 499, 576, 584, 599, 621, 688, 695, 708,
+  727, 784, 790, 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979,
+  984, 991, 1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648,
+  694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931, 938, 948,
+  960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497, 529, 553, 597,
+  619, 647, 668, 706, 725, 749, 767, 799, 815, 835, 850, 876, 889, 905, 917,
+  937, 947, 959, 968, 982, 989, 997, 1003, 1011, 1015, 1019, 1022, 496, 528,
+  552, 568, 618, 646, 667, 681, 724, 748, 766, 778, 814, 834, 849, 859, 888,
+  904, 916, 924, 946, 958, 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021,
+  1023,
+};  // iscan for default_scan_32x32 (presumably its inverse map -- confirm).
+
+const scan_order vp9_default_scan_orders[TX_SIZES] = {
+  {default_scan_4x4,   vp9_default_iscan_4x4,   default_scan_4x4_neighbors},
+  {default_scan_8x8,   vp9_default_iscan_8x8,   default_scan_8x8_neighbors},
+  {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors},
+  {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+};  // Indexed by TX_SIZE; each entry is {scan, iscan, neighbors}.
+
+const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES] = {
+  {  // TX_4X4
+    {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors},
+    {row_scan_4x4,     vp9_row_iscan_4x4,     row_scan_4x4_neighbors},
+    {col_scan_4x4,     vp9_col_iscan_4x4,     col_scan_4x4_neighbors},
+    {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors}
+  }, {  // TX_8X8
+    {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors},
+    {row_scan_8x8,     vp9_row_iscan_8x8,     row_scan_8x8_neighbors},
+    {col_scan_8x8,     vp9_col_iscan_8x8,     col_scan_8x8_neighbors},
+    {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors}
+  }, {  // TX_16X16
+    {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors},
+    {row_scan_16x16,     vp9_row_iscan_16x16,     row_scan_16x16_neighbors},
+    {col_scan_16x16,     vp9_col_iscan_16x16,     col_scan_16x16_neighbors},
+    {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors}
+  }, {  // TX_32X32
+    {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+    {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+    {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+    {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+  }
+};  // [tx_size][tx_type]; TX_32X32 uses the default scan for every tx type.
diff --git a/libs/libvpx/vp9/common/vp9_scan.h b/libs/libvpx/vp9/common/vp9_scan.h
new file mode 100644
index 0000000000..4c1ee8107c
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_scan.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_SCAN_H_
+#define VP9_COMMON_VP9_SCAN_H_
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_NEIGHBORS 2
+
+typedef struct {
+  const int16_t *scan;       // scan table, e.g. default_scan_4x4
+  const int16_t *iscan;      // paired iscan table, e.g. vp9_default_iscan_4x4
+  const int16_t *neighbors;  // MAX_NEIGHBORS entries per coefficient index
+} scan_order;
+
+extern const scan_order vp9_default_scan_orders[TX_SIZES];
+extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES];
+
+static INLINE int get_coef_context(const int16_t *neighbors,
+                                   const uint8_t *token_cache, int c) {
+  const int16_t *const nb = &neighbors[MAX_NEIGHBORS * c];  // c's neighbor pair
+  return (1 + token_cache[nb[0]] + token_cache[nb[1]]) >> 1;  // rounded mean
+}
+
+static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+                                         PLANE_TYPE type, int block_idx) {
+  const MODE_INFO *const mode_info = xd->mi[0];
+  PREDICTION_MODE y_mode;
+  // Inter blocks, non-Y planes and lossless coding use the per-size default
+  // order; the remaining intra Y blocks pick a scan from their prediction mode.
+  if (is_inter_block(mode_info) || type != PLANE_TYPE_Y || xd->lossless)
+    return &vp9_default_scan_orders[tx_size];
+  y_mode = get_y_mode(mode_info, block_idx);
+  return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[y_mode]];
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_SCAN_H_
diff --git a/libs/libvpx/vp9/common/vp9_seg_common.c b/libs/libvpx/vp9/common/vp9_seg_common.c
new file mode 100644
index 0000000000..c8ef618b77
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_seg_common.c
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_quant_common.h"
+
+static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 };
+// Both indexed by SEG_LVL_FEATURES: 1 = may be negative / largest magnitude.
+static const int seg_feature_data_max[SEG_LVL_MAX] = {
+  MAXQ, MAX_LOOP_FILTER, 3, 0 };
+
+// These functions provide access to new segment level features.
+// Eventually these functions may be "optimized out" but for the moment,
+// the coding mechanism is still subject to change so these provide a
+// convenient single point of change.
+
+void vp9_clearall_segfeatures(struct segmentation *seg) {
+  vp9_zero(seg->feature_mask);  // clear the per-segment enable bitmasks
+  vp9_zero(seg->feature_data);  // clear every stored per-feature value
+}
+
+void vp9_enable_segfeature(struct segmentation *seg, int segment_id,
+                           SEG_LVL_FEATURES feature_id) {
+  seg->feature_mask[segment_id] |= 1 << feature_id;  // set the feature's bit
+}
+
+int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
+  return seg_feature_data_max[feature_id];  // table lookup, no range check
+}
+
+int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
+  return seg_feature_data_signed[feature_id];  // 1 if negatives are allowed
+}
+
+void vp9_set_segdata(struct segmentation *seg, int segment_id,
+                     SEG_LVL_FEATURES feature_id, int seg_data) {
+  assert(seg_data <= seg_feature_data_max[feature_id]);
+  if (seg_data < 0) {
+    assert(seg_feature_data_signed[feature_id]);  // feature must be signed
+    assert(-seg_data <= seg_feature_data_max[feature_id]);  // |value| <= max
+  }
+  // The range checks above are asserts only; the store is unconditional.
+  seg->feature_data[segment_id][feature_id] = seg_data;
+}
+
+const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
+  2,  4,  6,  8, 10, 12,
+  0, -1, -2, -3, -4, -5, -6, -7
+};  // NOTE(review): presumably >0 = index of next node, <=0 = -segment id.
+
+
+// TBD? Functions to read and write segment data with range / validity checking
diff --git a/libs/libvpx/vp9/common/vp9_seg_common.h b/libs/libvpx/vp9/common/vp9_seg_common.h
new file mode 100644
index 0000000000..5b75d8d4ee
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_seg_common.h
@@ -0,0 +1,86 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_SEG_COMMON_H_
+#define VP9_COMMON_VP9_SEG_COMMON_H_
+
+#include "vpx_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SEGMENT_DELTADATA   0  // NOTE(review): presumably abs_delta values --
+#define SEGMENT_ABSDATA     1  // confirm against the code that sets abs_delta.
+
+#define MAX_SEGMENTS     8                  // size of the per-segment arrays
+#define SEG_TREE_PROBS   (MAX_SEGMENTS-1)   // length of tree_probs
+
+#define PREDICTION_PROBS 3                  // length of pred_probs
+
+// Segment level features (also the index into the per-feature tables).
+typedef enum {
+  SEG_LVL_ALT_Q = 0,               // Alternate quantizer (signed, max MAXQ)
+  SEG_LVL_ALT_LF = 1,              // Alternate loop filter value (signed)
+  SEG_LVL_REF_FRAME = 2,           // Optional Segment reference frame (0..3)
+  SEG_LVL_SKIP = 3,                // Optional Segment (0,0) + skip mode
+  SEG_LVL_MAX = 4                  // Number of features supported
+} SEG_LVL_FEATURES;
+
+
+struct segmentation {
+  uint8_t enabled;          // 0 makes segfeature_active() report 0 for all
+  uint8_t update_map;
+  uint8_t update_data;
+  uint8_t abs_delta;        // NOTE(review): presumably SEGMENT_DELTADATA/ABSDATA
+  uint8_t temporal_update;
+
+  vpx_prob tree_probs[SEG_TREE_PROBS];
+  vpx_prob pred_probs[PREDICTION_PROBS];
+
+  int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];  // [segment][feature]
+  unsigned int feature_mask[MAX_SEGMENTS];  // bit f set = feature f enabled
+};
+
+static INLINE int segfeature_active(const struct segmentation *seg,
+                                    int segment_id,
+                                    SEG_LVL_FEATURES feature_id) {
+  if (!seg->enabled) return 0;  // segmentation off: no feature is active
+  return (seg->feature_mask[segment_id] & (1 << feature_id)) != 0;
+}
+
+void vp9_clearall_segfeatures(struct segmentation *seg);  // zero data + masks
+// Sets feature_id's bit in seg->feature_mask[segment_id].
+void vp9_enable_segfeature(struct segmentation *seg,
+                           int segment_id,
+                           SEG_LVL_FEATURES feature_id);
+// Largest magnitude vp9_set_segdata() accepts for feature_id.
+int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id);
+// Nonzero if feature_id may hold negative data.
+int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
+// Stores seg_data for (segment_id, feature_id); range is checked by assert.
+void vp9_set_segdata(struct segmentation *seg,
+                     int segment_id,
+                     SEG_LVL_FEATURES feature_id,
+                     int seg_data);
+
+static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
+                              SEG_LVL_FEATURES feature_id) {
+  return seg->feature_data[segment_id][feature_id];  // int16_t widened to int
+}
+
+extern const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_SEG_COMMON_H_
+
diff --git a/libs/libvpx/vp9/common/vp9_textblit.c b/libs/libvpx/vp9/common/vp9_textblit.c
new file mode 100644
index 0000000000..60e95e08f5
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_textblit.c
@@ -0,0 +1,120 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "vp9/common/vp9_textblit.h"
+
+static const int font[] = {
+  0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
+  0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
+  0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
+  0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
+  0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
+  0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
+  0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
+  0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
+  0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
+};  // 65 glyphs indexed by (ch - ' '); bit (5 * col + row) set = pixel on.
+
+static void plot(int x, int y, unsigned char *image, int pitch) {
+  image[y * pitch + x] ^= 0xFF;  // XOR inverts the pixel; a repeat restores it
+}
+
+void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) {
+  // Renders the NUL-terminated msg left to right, starting at address.
+  // Each character overwrites a cell 7 bytes wide and 5 rows high (rows are
+  // pitch bytes apart) with 255 for set glyph bits and 0 for clear ones.
+  const char *cursor;
+  unsigned char *cell = address;
+
+  for (cursor = msg; *cursor != 0; ++cursor, cell += 7) {
+    const char ch = *cursor;
+    int glyph = font[0];  // fallback for characters without a glyph
+    int col, row;
+
+    if (ch >= ' ' && ch <= 'Z')
+      glyph = font[ch - ' '];
+    else if (ch >= 'a' && ch <= 'z')
+      glyph = font[ch - 'a' + 'A' - ' '];  // lower case shares upper case
+
+    // Column col of the glyph lives in bits [5 * col, 5 * col + 4].
+    for (col = 6; col >= 0; col--) {
+      const int column_bits = glyph >> (col * 5);
+      for (row = 0; row < 5; row++)
+        cell[row * pitch + col] = (column_bits & (1 << row)) ? 255 : 0;
+    }
+  }
+}
+
+
+
+/* Bresenham line algorithm */
+void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
+                   int pitch) {
+  // Draws the segment from (x0, y0) to (x1, y1), end points included, by
+  // passing every visited pixel to plot(). Note the parameter order: both
+  // x values come before both y values. Coordinates are not clipped here.
+  //
+  // The line is walked one pixel at a time along its "major" axis (the axis
+  // with the larger extent). An integer error term decides when the "minor"
+  // coordinate has to move by one.
+  // plot() inverts pixels, so drawing the same line twice restores the image.
+  const int steep = abs(y1 - y0) > abs(x1 - x0);
+  int major, major_end;  // walk range along the major axis
+  int minor, minor_end;  // matching coordinates on the minor axis
+  int major_span, minor_span, minor_step, error;
+
+  // A steep line has y as its major axis; otherwise x is the major axis.
+  if (steep) {
+    major = y0;
+    major_end = y1;
+    minor = x0;
+    minor_end = x1;
+  } else {
+    major = x0;
+    major_end = x1;
+    minor = y0;
+    minor_end = y1;
+  }
+
+  // Always walk towards increasing major coordinates: exchange the two end
+  // points when they are the wrong way round.
+  if (major > major_end) {
+    const int old_major = major;
+    const int old_minor = minor;
+
+    major = major_end;
+    major_end = old_major;
+    minor = minor_end;
+    minor_end = old_minor;
+  }
+
+  major_span = major_end - major;
+  minor_span = abs(minor_end - minor);
+  minor_step = (minor < minor_end) ? 1 : -1;
+  error = major_span / 2;
+
+  for (; major <= major_end; major++) {
+    // plot() takes (x, y), so undo the axis exchange for steep lines.
+    if (steep)
+      plot(minor, major, image, pitch);
+    else
+      plot(major, minor, image, pitch);
+
+    // Accumulate the minor-axis travel; once it outweighs half a major
+    // span, move one pixel along the minor axis.
+    error -= minor_span;
+    if (error < 0) {
+      minor += minor_step;
+      error += major_span;
+    }
+  }
+}
diff --git a/libs/libvpx/vp9/common/vp9_textblit.h b/libs/libvpx/vp9/common/vp9_textblit.h
new file mode 100644
index 0000000000..158ec1b37e
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_textblit.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_TEXTBLIT_H_
+#define VP9_COMMON_VP9_TEXTBLIT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_blit_text(const char *msg, unsigned char *address, int pitch);
+// Draws a line by inverting pixels; NB the argument order x0, x1, y0, y1.
+void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,
+                   int pitch);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_TEXTBLIT_H_
diff --git a/libs/libvpx/vp9/common/vp9_thread_common.c b/libs/libvpx/vp9/common/vp9_thread_common.c
new file mode 100644
index 0000000000..db78d6be89
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_thread_common.c
@@ -0,0 +1,435 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_thread_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_loopfilter.h"
+
+#if CONFIG_MULTITHREAD
+// Acquires mutex. Spins on pthread_mutex_trylock() for a bounded number of
+// attempts first and only then falls back to a blocking pthread_mutex_lock().
+// The return value of that final blocking call is not checked.
+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
+  const int kMaxTryLocks = 4000;
+  int attempts_left = kMaxTryLocks;
+  while (attempts_left-- > 0) {
+    // pthread_mutex_trylock() returns 0 once the lock has been obtained.
+    if (pthread_mutex_trylock(mutex) == 0)
+      return;
+  }
+
+  // Still contended after spinning: wait for the lock the ordinary way.
+  pthread_mutex_lock(mutex);
+}
+#endif  // CONFIG_MULTITHREAD
+
+static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+  // Blocks until row r - 1 has published cur_sb_col - nsync >= c. Row 0 never
+  // waits, and other rows only check when (c & (nsync - 1)) == 0.
+  const int nsync = lf_sync->sync_range;
+  const int above = r - 1;
+
+  if (r != 0 && (c & (nsync - 1)) == 0) {
+    mutex_lock(&lf_sync->mutex_[above]);
+    while (lf_sync->cur_sb_col[above] - nsync < c)
+      pthread_cond_wait(&lf_sync->cond_[above], &lf_sync->mutex_[above]);
+    pthread_mutex_unlock(&lf_sync->mutex_[above]);
+  }
+#else
+  (void)lf_sync;
+  (void)r;
+  (void)c;
+#endif  // CONFIG_MULTITHREAD
+}
+
+static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
+                              const int sb_cols) {
+#if CONFIG_MULTITHREAD
+  // Publishes how far row r has got in cur_sb_col[r] and signals cond_[r].
+  //  - Columns before the last one (c < sb_cols - 1) are published as c,
+  //    and only when c is a multiple of nsync.
+  //  - The last column is always published, as sb_cols + nsync, which is
+  //    large enough to end the wait in sync_read() for any c < sb_cols.
+  // Without CONFIG_MULTITHREAD this function does nothing.
+  const int nsync = lf_sync->sync_range;
+  const int is_last_col = !(c < sb_cols - 1);
+  int published;
+
+  // Only signal when there are enough filtered SB for next row to run.
+  if (!is_last_col && (c % nsync) != 0)
+    return;
+
+  published = is_last_col ? sb_cols + nsync : c;
+
+  // The update and the signal both happen with the row mutex held.
+  mutex_lock(&lf_sync->mutex_[r]);
+  lf_sync->cur_sb_col[r] = published;
+  pthread_cond_signal(&lf_sync->cond_[r]);
+  pthread_mutex_unlock(&lf_sync->mutex_[r]);
+#else
+  (void)lf_sync;
+  (void)r;
+  (void)c;
+  (void)sb_cols;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Implement row loopfiltering for each thread.
+static INLINE
+void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,
+                             VP9_COMMON *const cm,
+                             struct macroblockd_plane planes[MAX_MB_PLANE],
+                             int start, int stop, int y_only,
+                             VP9LfSync *const lf_sync) {
+  // Filters mi rows start, start + num_workers * MI_BLOCK_SIZE, ... below
+  // stop, walking each row left to right. sync_read() and sync_write() keep
+  // each superblock behind the progress of the row above.
+  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+  // Routine used for planes 1 and up, picked from plane 1's subsampling.
+  // With y_only those planes are never visited, so the choice is moot.
+  enum lf_path path = LF_PATH_SLOW;
+  int mi_row, mi_col;
+
+  if (y_only || (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0))
+    path = LF_PATH_444;
+  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+    path = LF_PATH_420;
+
+  for (mi_row = start; mi_row < stop;
+       mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
+    const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+    MODE_INFO **const row_mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0);
+
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) {
+      const int sb_col = mi_col >> MI_BLOCK_SIZE_LOG2;
+      int plane;
+
+      // Wait for the row above before touching this superblock.
+      sync_read(lf_sync, sb_row, sb_col);
+
+      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+      vp9_adjust_mask(cm, mi_row, mi_col, lfm);
+
+      // Plane 0 always goes through vp9_filter_block_plane_ss00().
+      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm);
+      for (plane = 1; plane < num_planes; ++plane) {
+        if (path == LF_PATH_420) {
+          vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, lfm);
+        } else if (path == LF_PATH_444) {
+          vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, lfm);
+        } else if (path == LF_PATH_SLOW) {
+          vp9_filter_block_plane_non420(cm, &planes[plane], row_mi + mi_col,
+                                        mi_row, mi_col);
+        }
+      }
+
+      // Publish this superblock so the row below may proceed.
+      sync_write(lf_sync, sb_row, sb_col, sb_cols);
+    }
+  }
+}
+
+// Row-based multi-threaded loopfilter hook (arguments unpacked from lf_data).
+static int loop_filter_row_worker(VP9LfSync *const lf_sync,
+                                  LFWorkerData *const lf_data) {
+  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                          lf_data->start, lf_data->stop, lf_data->y_only,
+                          lf_sync);
+  return 1;  // unconditional; NOTE(review): presumably "success" -- confirm
+}
+
+static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,
+                                VP9_COMMON *cm,
+                                struct macroblockd_plane planes[MAX_MB_PLANE],
+                                int start, int stop, int y_only,
+                                VPxWorker *workers, int nworkers,
+                                VP9LfSync *lf_sync) {
+  const VPxWorkerInterface *const iface = vpx_get_worker_interface();
+  // Number of superblock rows
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  // Decoder may allocate more threads than number of tiles based on user's
+  // input.
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int num_workers = VPXMIN(nworkers, tile_cols);
+  // Existing sync state is kept only if set up, same rows, enough workers.
+  const int reusable = lf_sync->sync_range && sb_rows == lf_sync->rows &&
+                       num_workers <= lf_sync->num_workers;
+  int w;
+
+  if (!reusable) {
+    vp9_loop_filter_dealloc(lf_sync);
+    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+  }
+
+  // Initialize cur_sb_col to -1 for all SB rows.
+  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
+  // Set up loopfilter thread data.
+  // The decoder is capping num_workers because it has been observed that using
+  // more threads on the loopfilter than there are cores will hurt performance
+  // on Android. This is because the system will only schedule the tile decode
+  // workers on cores equal to the number of tile columns. Then if the decoder
+  // tries to use more threads for the loopfilter, it will hurt performance
+  // because of contention. If the multithreading code changes in the future
+  // then the number of workers used by the loopfilter should be revisited.
+  for (w = 0; w < num_workers; ++w) {
+    VPxWorker *const worker = &workers[w];
+    LFWorkerData *const lf_data = &lf_sync->lfdata[w];
+
+    worker->hook = (VPxWorkerHook)loop_filter_row_worker;
+    worker->data1 = lf_sync;
+    worker->data2 = lf_data;
+
+    // Loopfilter data: worker w starts w superblock rows after start.
+    vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
+    lf_data->start = start + w * MI_BLOCK_SIZE;
+    lf_data->stop = stop;
+    lf_data->y_only = y_only;
+
+    // Start loopfiltering: launch() for all but the last, execute() for it.
+    if (w < num_workers - 1)
+      iface->launch(worker);
+    else
+      iface->execute(worker);
+  }
+
+  // Wait till all rows are finished
+  for (w = 0; w < num_workers; ++w)
+    iface->sync(&workers[w]);
+}
+
+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+                              VP9_COMMON *cm,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
+                              int frame_filter_level,
+                              int y_only, int partial_frame,
+                              VPxWorker *workers, int num_workers,
+                              VP9LfSync *lf_sync) {
+  int first_mi_row, mi_row_count;
+
+  if (frame_filter_level == 0) return;  // level 0: nothing is filtered
+  if (partial_frame && cm->mi_rows > 8) {
+    // Middle of the frame, rounded down to a multiple of 8 mi rows.
+    first_mi_row = (cm->mi_rows >> 1) & 0xfffffff8;
+    mi_row_count = VPXMAX(cm->mi_rows / 8, 8);
+  } else {
+    first_mi_row = 0;
+    mi_row_count = cm->mi_rows;
+  }
+
+  vp9_loop_filter_frame_init(cm, frame_filter_level);
+  loop_filter_rows_mt(frame, cm, planes, first_mi_row,
+                      first_mi_row + mi_row_count, y_only, workers,
+                      num_workers, lf_sync);
+}
+
+// Pick the loopfilter row-sync granularity (nsync) from the frame width.
+static INLINE int get_sync_range(int width) {
+  // The nsync values were chosen by testing; for 4k video, for example,
+  // a sync range of 4 gives the best performance.
+  int nsync = 8;  // Anything wider than 4096.
+
+  // Narrower widths override wider ones, so test from widest to narrowest.
+  if (width <= 4096) nsync = 4;
+  if (width <= 1280) nsync = 2;
+  if (width < 640) nsync = 1;
+
+  return nsync;
+}
+
+// Allocate the per-row synchronization state and per-worker data in lf_sync.
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
+                           int width, int num_workers) {
+  lf_sync->rows = rows;  // vp9_loop_filter_dealloc() loops over this count.
+#if CONFIG_MULTITHREAD
+  {
+    int i;
+
+    CHECK_MEM_ERROR(cm, lf_sync->mutex_,
+                    vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
+    if (lf_sync->mutex_) {  // One mutex per row.
+      for (i = 0; i < rows; ++i) {
+        pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+      }
+    }
+
+    CHECK_MEM_ERROR(cm, lf_sync->cond_,
+                    vpx_malloc(sizeof(*lf_sync->cond_) * rows));
+    if (lf_sync->cond_) {  // One condition variable per row.
+      for (i = 0; i < rows; ++i) {
+        pthread_cond_init(&lf_sync->cond_[i], NULL);
+      }
+    }
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+                  vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
+  lf_sync->num_workers = num_workers;  // Entry count of lfdata.
+
+  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
+                  vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
+
+  // Set up nsync from the frame width.
+  lf_sync->sync_range = get_sync_range(width);
+}
+
+// Destroy and free everything held by lf_sync; a NULL lf_sync is a no-op.
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
+  if (lf_sync != NULL) {
+#if CONFIG_MULTITHREAD
+    int i;
+
+    if (lf_sync->mutex_ != NULL) {  // Destroy every row mutex, then the array.
+      for (i = 0; i < lf_sync->rows; ++i) {
+        pthread_mutex_destroy(&lf_sync->mutex_[i]);
+      }
+      vpx_free(lf_sync->mutex_);
+    }
+    if (lf_sync->cond_ != NULL) {  // Likewise for the condition variables.
+      for (i = 0; i < lf_sync->rows; ++i) {
+        pthread_cond_destroy(&lf_sync->cond_[i]);
+      }
+      vpx_free(lf_sync->cond_);
+    }
+#endif  // CONFIG_MULTITHREAD
+    vpx_free(lf_sync->lfdata);
+    vpx_free(lf_sync->cur_sb_col);
+    // Clear the structure: this call may come from a resize, in which case it
+    // will be followed by an _alloc() which may fail.
+    vp9_zero(*lf_sync);
+  }
+}
+
+// Accumulate frame counts: add every counter in 'counts' into 'accum'.
+void vp9_accumulate_frame_counts(FRAME_COUNTS *accum,
+                                 const FRAME_COUNTS *counts, int is_dec) {
+  int i, j, k, l, m;
+
+  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
+    for (j = 0; j < INTRA_MODES; j++)
+      accum->y_mode[i][j] += counts->y_mode[i][j];
+
+  for (i = 0; i < INTRA_MODES; i++)
+    for (j = 0; j < INTRA_MODES; j++)
+      accum->uv_mode[i][j] += counts->uv_mode[i][j];
+
+  for (i = 0; i < PARTITION_CONTEXTS; i++)
+    for (j = 0; j < PARTITION_TYPES; j++)
+      accum->partition[i][j] += counts->partition[i][j];
+
+  if (is_dec) {  // Decoder: accumulate both eob_branch and coef.
+    int n;
+    for (i = 0; i < TX_SIZES; i++)
+      for (j = 0; j < PLANE_TYPES; j++)
+        for (k = 0; k < REF_TYPES; k++)
+          for (l = 0; l < COEF_BANDS; l++)
+            for (m = 0; m < COEFF_CONTEXTS; m++) {
+              accum->eob_branch[i][j][k][l][m] +=
+                  counts->eob_branch[i][j][k][l][m];
+              for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
+                accum->coef[i][j][k][l][m][n] +=
+                    counts->coef[i][j][k][l][m][n];
+            }
+  } else {  // Encoder: accumulate eob_branch only.
+    for (i = 0; i < TX_SIZES; i++)
+      for (j = 0; j < PLANE_TYPES; j++)
+        for (k = 0; k < REF_TYPES; k++)
+          for (l = 0; l < COEF_BANDS; l++)
+            for (m = 0; m < COEFF_CONTEXTS; m++)
+              accum->eob_branch[i][j][k][l][m] +=
+                  counts->eob_branch[i][j][k][l][m];
+                // In the encoder, coef is only updated at frame
+                // level, so there is no need to accumulate it here.
+                // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
+                //   accum->coef[i][j][k][l][m][n] +=
+                //       counts->coef[i][j][k][l][m][n];
+  }
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+    for (j = 0; j < SWITCHABLE_FILTERS; j++)
+      accum->switchable_interp[i][j] += counts->switchable_interp[i][j];
+
+  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+    for (j = 0; j < INTER_MODES; j++)
+      accum->inter_mode[i][j] += counts->inter_mode[i][j];
+
+  for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      accum->intra_inter[i][j] += counts->intra_inter[i][j];
+
+  for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      accum->comp_inter[i][j] += counts->comp_inter[i][j];
+
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      for (k = 0; k < 2; k++)
+      accum->single_ref[i][j][k] += counts->single_ref[i][j][k];
+
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      accum->comp_ref[i][j] += counts->comp_ref[i][j];
+
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {  // Transform-size counts.
+    for (j = 0; j < TX_SIZES; j++)
+      accum->tx.p32x32[i][j] += counts->tx.p32x32[i][j];
+
+    for (j = 0; j < TX_SIZES - 1; j++)
+      accum->tx.p16x16[i][j] += counts->tx.p16x16[i][j];
+
+    for (j = 0; j < TX_SIZES - 2; j++)
+      accum->tx.p8x8[i][j] += counts->tx.p8x8[i][j];
+  }
+
+  for (i = 0; i < TX_SIZES; i++)
+    accum->tx.tx_totals[i] += counts->tx.tx_totals[i];
+
+  for (i = 0; i < SKIP_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      accum->skip[i][j] += counts->skip[i][j];
+
+  for (i = 0; i < MV_JOINTS; i++)
+    accum->mv.joints[i] += counts->mv.joints[i];
+
+  for (k = 0; k < 2; k++) {  // Both entries of mv.comps[].
+    nmv_component_counts *const comps = &accum->mv.comps[k];
+    const nmv_component_counts *const comps_t = &counts->mv.comps[k];
+
+    for (i = 0; i < 2; i++) {
+      comps->sign[i] += comps_t->sign[i];
+      comps->class0_hp[i] += comps_t->class0_hp[i];
+      comps->hp[i] += comps_t->hp[i];
+    }
+
+    for (i = 0; i < MV_CLASSES; i++)
+      comps->classes[i] += comps_t->classes[i];
+
+    for (i = 0; i < CLASS0_SIZE; i++) {
+      comps->class0[i] += comps_t->class0[i];
+      for (j = 0; j < MV_FP_SIZE; j++)
+        comps->class0_fp[i][j] += comps_t->class0_fp[i][j];
+    }
+
+    for (i = 0; i < MV_OFFSET_BITS; i++)
+      for (j = 0; j < 2; j++)
+        comps->bits[i][j] += comps_t->bits[i][j];
+
+    for (i = 0; i < MV_FP_SIZE; i++)
+      comps->fp[i] += comps_t->fp[i];
+  }
+}
diff --git a/libs/libvpx/vp9/common/vp9_thread_common.h b/libs/libvpx/vp9/common/vp9_thread_common.h
new file mode 100644
index 0000000000..b3b60c253f
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_thread_common.h
@@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_THREAD_COMMON_H_
+#define VP9_COMMON_VP9_THREAD_COMMON_H_
+#include "./vpx_config.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vpx_util/vpx_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9Common;
+struct FRAME_COUNTS;
+
+// Loopfilter row synchronization
+typedef struct VP9LfSyncData {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex_;  // One mutex per row ('rows' entries).
+  pthread_cond_t *cond_;    // One condition variable per row.
+#endif
+  // Per-row array storing the loop-filtered superblock index in each row.
+  int *cur_sb_col;
+  // The optimal sync_range for different resolution and platform should be
+  // determined by testing. Currently, it is chosen to be a power-of-2 number.
+  int sync_range;
+  int rows;  // Number of rows the per-row arrays above were allocated for.
+
+  // Row-based parallel loopfilter data
+  LFWorkerData *lfdata;  // One entry per worker ('num_workers' entries).
+  int num_workers;
+} VP9LfSync;
+
+// Allocate the row-synchronization state and per-worker data in lf_sync.
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows,
+                           int width, int num_workers);
+
+// Destroy the mutexes/condition variables and free all data in lf_sync.
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
+
+// Multi-threaded loopfilter that uses the tile threads.
+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+                              struct VP9Common *cm,
+                              struct macroblockd_plane planes[MAX_MB_PLANE],
+                              int frame_filter_level,
+                              int y_only, int partial_frame,
+                              VPxWorker *workers, int num_workers,
+                              VP9LfSync *lf_sync);
+
+void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum,
+                                 const struct FRAME_COUNTS *counts, int is_dec);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_THREAD_COMMON_H_
diff --git a/libs/libvpx/vp9/common/vp9_tile_common.c b/libs/libvpx/vp9/common/vp9_tile_common.c
new file mode 100644
index 0000000000..9fcb97c854
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_tile_common.c
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_tile_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#define MIN_TILE_WIDTH_B64 4  // Lower bound tested against sb64_cols >> log2.
+#define MAX_TILE_WIDTH_B64 64  // Upper bound tested against sb64_cols.
+
+static int get_tile_offset(int idx, int mis, int log2) {
+  const int n_sb = mi_cols_aligned_to_sb(mis) >> MI_BLOCK_SIZE_LOG2;
+  const int sb_start = (idx * n_sb) >> log2;  // Boundary in superblock units.
+  return VPXMIN(sb_start << MI_BLOCK_SIZE_LOG2, mis);  // Clamp to 'mis'.
+}
+// Set tile->mi_row_start and tile->mi_row_end for tile row 'row'.
+void vp9_tile_set_row(TileInfo *tile, const VP9_COMMON *cm, int row) {
+  tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows);
+  tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows);
+}
+// Set tile->mi_col_start and tile->mi_col_end for tile column 'col'.
+void vp9_tile_set_col(TileInfo *tile, const VP9_COMMON *cm, int col) {
+  tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols);
+  tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols);
+}
+// Set both the row and the column mi bounds of 'tile' for tile (row, col).
+void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
+  vp9_tile_set_row(tile, cm, row);
+  vp9_tile_set_col(tile, cm, col);
+}
+
+static int get_min_log2_tile_cols(const int sb64_cols) {
+  // Smallest shift for which MAX_TILE_WIDTH_B64 << shift reaches sb64_cols.
+  int shift;
+  for (shift = 0; (MAX_TILE_WIDTH_B64 << shift) < sb64_cols; ++shift) {}
+  return shift;
+}
+
+static int get_max_log2_tile_cols(const int sb64_cols) {
+  // Find the first shift (from 1) that makes tiles too narrow, then back off.
+  int shift;
+  for (shift = 1; (sb64_cols >> shift) >= MIN_TILE_WIDTH_B64; ++shift) {}
+  return shift - 1;
+}
+
+void vp9_get_tile_n_bits(int mi_cols,
+                         int *min_log2_tile_cols, int *max_log2_tile_cols) {
+  const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols);
+  *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols);
+  assert(*min_log2_tile_cols <= *max_log2_tile_cols);  // Range must be sane.
+}
diff --git a/libs/libvpx/vp9/common/vp9_tile_common.h b/libs/libvpx/vp9/common/vp9_tile_common.h
new file mode 100644
index 0000000000..ae58805de1
--- /dev/null
+++ b/libs/libvpx/vp9/common/vp9_tile_common.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_TILE_COMMON_H_
+#define VP9_COMMON_VP9_TILE_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9Common;
+
+typedef struct TileInfo {
+  int mi_row_start, mi_row_end;  // Row bounds, derived from cm->mi_rows.
+  int mi_col_start, mi_col_end;  // Column bounds, derived from cm->mi_cols.
+} TileInfo;
+
+// Initializes 'tile->mi_(row|col)_(start|end)' for tile (row, col) based on
+// 'cm->log2_tile_(rows|cols)' and 'cm->mi_(rows|cols)'.
+void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm,
+                   int row, int col);
+
+void vp9_tile_set_row(TileInfo *tile, const struct VP9Common *cm, int row);
+void vp9_tile_set_col(TileInfo *tile, const struct VP9Common *cm, int col);
+
+void vp9_get_tile_n_bits(int mi_cols,
+                         int *min_log2_tile_cols, int *max_log2_tile_cols);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_TILE_COMMON_H_
diff --git a/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
new file mode 100644
index 0000000000..8d312d03f9
--- /dev/null
+++ b/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -0,0 +1,180 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
+
+void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+                            int tx_type) {
+  __m128i in[2];
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i eight = _mm_set1_epi16(8);  // Rounding term for the >> 4 below.
+
+  in[0] = load_input_data(input);
+  in[1] = load_input_data(input + 8);
+
+  switch (tx_type) {  // Two 1-D passes; tx_type picks DCT or ADST for each.
+    case 0:  // DCT_DCT
+      idct4_sse2(in);
+      idct4_sse2(in);
+      break;
+    case 1:  // ADST_DCT
+      idct4_sse2(in);
+      iadst4_sse2(in);
+      break;
+    case 2:  // DCT_ADST
+      iadst4_sse2(in);
+      idct4_sse2(in);
+      break;
+    case 3:  // ADST_ADST
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  // Final round and shift: (x + 8) >> 4 on every 16-bit lane.
+  in[0] = _mm_add_epi16(in[0], eight);
+  in[1] = _mm_add_epi16(in[1], eight);
+
+  in[0] = _mm_srai_epi16(in[0], 4);
+  in[1] = _mm_srai_epi16(in[1], 4);
+
+  // Reconstruction and store: add the residual to four 4-byte rows of dest.
+  {
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+    d0 = _mm_unpacklo_epi32(d0,
+                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+    d2 = _mm_unpacklo_epi32(
+        d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
+    d0 = _mm_unpacklo_epi8(d0, zero);  // Widen rows 0-1 to 16 bits.
+    d2 = _mm_unpacklo_epi8(d2, zero);  // Widen rows 2-3 to 16 bits.
+    d0 = _mm_add_epi16(d0, in[0]);
+    d2 = _mm_add_epi16(d2, in[1]);
+    d0 = _mm_packus_epi16(d0, d2);  // Saturate back to unsigned 8 bits.
+    // store result[0]
+    *(int *)dest = _mm_cvtsi128_si32(d0);
+    // store result[1]
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+    // store result[2]
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+    // store result[3]
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+  }
+}
+
+void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+                            int tx_type) {
+  __m128i in[8];
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);  // Half of 1 << 5.
+
+  // load input data, 8 coefficients at a time
+  in[0] = load_input_data(input);
+  in[1] = load_input_data(input + 8 * 1);
+  in[2] = load_input_data(input + 8 * 2);
+  in[3] = load_input_data(input + 8 * 3);
+  in[4] = load_input_data(input + 8 * 4);
+  in[5] = load_input_data(input + 8 * 5);
+  in[6] = load_input_data(input + 8 * 6);
+  in[7] = load_input_data(input + 8 * 7);
+
+  switch (tx_type) {  // Two 1-D passes; tx_type picks DCT or ADST for each.
+    case 0:  // DCT_DCT
+      idct8_sse2(in);
+      idct8_sse2(in);
+      break;
+    case 1:  // ADST_DCT
+      idct8_sse2(in);
+      iadst8_sse2(in);
+      break;
+    case 2:  // DCT_ADST
+      iadst8_sse2(in);
+      idct8_sse2(in);
+      break;
+    case 3:  // ADST_ADST
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  // Final rounding and shift: saturating add of 16, then >> 5.
+  in[0] = _mm_adds_epi16(in[0], final_rounding);
+  in[1] = _mm_adds_epi16(in[1], final_rounding);
+  in[2] = _mm_adds_epi16(in[2], final_rounding);
+  in[3] = _mm_adds_epi16(in[3], final_rounding);
+  in[4] = _mm_adds_epi16(in[4], final_rounding);
+  in[5] = _mm_adds_epi16(in[5], final_rounding);
+  in[6] = _mm_adds_epi16(in[6], final_rounding);
+  in[7] = _mm_adds_epi16(in[7], final_rounding);
+
+  in[0] = _mm_srai_epi16(in[0], 5);
+  in[1] = _mm_srai_epi16(in[1], 5);
+  in[2] = _mm_srai_epi16(in[2], 5);
+  in[3] = _mm_srai_epi16(in[3], 5);
+  in[4] = _mm_srai_epi16(in[4], 5);
+  in[5] = _mm_srai_epi16(in[5], 5);
+  in[6] = _mm_srai_epi16(in[6], 5);
+  in[7] = _mm_srai_epi16(in[7], 5);
+
+  RECON_AND_STORE(dest + 0 * stride, in[0]);  // One dest row per register.
+  RECON_AND_STORE(dest + 1 * stride, in[1]);
+  RECON_AND_STORE(dest + 2 * stride, in[2]);
+  RECON_AND_STORE(dest + 3 * stride, in[3]);
+  RECON_AND_STORE(dest + 4 * stride, in[4]);
+  RECON_AND_STORE(dest + 5 * stride, in[5]);
+  RECON_AND_STORE(dest + 6 * stride, in[6]);
+  RECON_AND_STORE(dest + 7 * stride, in[7]);
+}
+
+void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+                               int stride, int tx_type) {
+  __m128i in0[16], in1[16];  // The block is processed as two 8x16 halves.
+
+  load_buffer_8x16(input, in0);
+  input += 8;  // Advance 8 coefficients to reach the second half.
+  load_buffer_8x16(input, in1);
+
+  switch (tx_type) {  // Two 1-D passes; tx_type picks DCT or ADST for each.
+    case 0:  // DCT_DCT
+      idct16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      break;
+    case 1:  // ADST_DCT
+      idct16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      break;
+    case 2:  // DCT_ADST
+      iadst16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      break;
+    case 3:  // ADST_ADST
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  write_buffer_8x16(dest, in0, stride);
+  dest += 8;  // Second half starts 8 bytes further along each dest row.
+  write_buffer_8x16(dest, in1, stride);
+}
diff --git a/libs/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/libs/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
new file mode 100644
index 0000000000..6029420d11
--- /dev/null
+++ b/libs/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
@@ -0,0 +1,287 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+;  This file is a duplicate of mfqe_sse2.asm in VP8.
+;  TODO(jackychen): Find a way to fix the duplicate.
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_filter_by_weight16x16_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight     (dst_weight = 16 - src_weight)
+;)
+global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
+sym(vp9_filter_by_weight16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+
+    mov         rcx, 16                     ; loop count
+    pxor        xmm6, xmm6                  ; zero, for byte -> word unpacking
+
+.combine:                                   ; one 16-byte row per iteration
+    movdqa      xmm2, [rax]                 ; src row (aligned load)
+    movdqa      xmm4, [rdx]                 ; dst row (aligned load)
+    add         rax, rsi
+
+    ; src * src_weight
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm6
+    punpckhbw   xmm3, xmm6
+    pmullw      xmm2, xmm0
+    pmullw      xmm3, xmm0
+
+    ; dst * dst_weight
+    movdqa      xmm5, xmm4
+    punpcklbw   xmm4, xmm6
+    punpckhbw   xmm5, xmm6
+    pmullw      xmm4, xmm1
+    pmullw      xmm5, xmm1
+
+    ; sum, round and shift: (src * sw + dst * dw + 8) >> 4
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    paddw       xmm3, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+    psrlw       xmm3, 4
+
+    packuswb    xmm2, xmm3                  ; back to 16 unsigned bytes
+    movdqa      [rdx], xmm2                 ; result overwrites the dst row
+    add         rdx, rdi
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp9_filter_by_weight8x8_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight     (dst_weight = 16 - src_weight)
+;)
+global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
+sym(vp9_filter_by_weight8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+
+    mov         rcx, 8                      ; loop count
+    pxor        xmm4, xmm4                  ; zero, for byte -> word unpacking
+
+.combine:                                   ; one 8-byte row per iteration
+    movq        xmm2, [rax]                 ; src row
+    movq        xmm3, [rdx]                 ; dst row
+    add         rax, rsi
+
+    ; src * src_weight
+    punpcklbw   xmm2, xmm4
+    pmullw      xmm2, xmm0
+
+    ; dst * dst_weight
+    punpcklbw   xmm3, xmm4
+    pmullw      xmm3, xmm1
+
+    ; sum, round and shift: (src * sw + dst * dw + 8) >> 4
+    paddw       xmm2, xmm3
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+
+    packuswb    xmm2, xmm4                  ; back to 8 unsigned bytes
+    movq        [rdx], xmm2                 ; result overwrites the dst row
+    add         rdx, rdi
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp9_variance_and_sad_16x16_sse2 | arg
+;(
+;    unsigned char *src1,          0
+;    int            stride1,       1
+;    unsigned char *src2,          2
+;    int            stride2,       3
+;    unsigned int  *variance,      4
+;    unsigned int  *sad,           5
+;)
+global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
+sym(vp9_variance_and_sad_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rax,        arg(0)          ; src1
+    mov         rcx,        arg(1)          ; stride1
+    mov         rdx,        arg(2)          ; src2
+    mov         rdi,        arg(3)          ; stride2
+
+    mov         rsi,        16              ; block height
+
+    ; Prep accumulator registers
+    pxor        xmm3, xmm3                  ; SAD
+    pxor        xmm4, xmm4                  ; sum of src2
+    pxor        xmm5, xmm5                  ; sum of src2^2
+
+    ; NOTE(review): both loads below are movdqa, so every src1/src2 row must
+    ; be 16-byte aligned; confirm that callers guarantee this.
+.accumulate:
+    movdqa      xmm0, [rax]                 ; src1
+    movdqa      xmm1, [rdx]                 ; src2
+    add         rax, rcx                    ; src1 + stride1
+    add         rdx, rdi                    ; src2 + stride2
+
+    ; SAD(src1, src2)
+    psadbw      xmm0, xmm1
+    paddusw     xmm3, xmm0
+
+    ; SUM(src2)
+    pxor        xmm2, xmm2
+    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
+    paddusw     xmm4, xmm2
+
+    ; pmaddubsw would be ideal if it took two unsigned values. instead,
+    ; it expects a signed and an unsigned value. so instead we zero extend
+    ; and operate on words.
+    pxor        xmm2, xmm2
+    movdqa      xmm0, xmm1
+    punpcklbw   xmm0, xmm2
+    punpckhbw   xmm1, xmm2
+    pmaddwd     xmm0, xmm0
+    pmaddwd     xmm1, xmm1
+    paddd       xmm5, xmm0
+    paddd       xmm5, xmm1
+
+    sub         rsi,        1
+    jnz         .accumulate
+
+    ; phaddd only operates on adjacent double words.
+    ; Finalize SAD and store: *sad = (SAD + 128) >> 8
+    movdqa      xmm0, xmm3
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm3
+    paddd       xmm0, [GLOBAL(t128)]
+    psrld       xmm0, 8
+
+    mov         rax,  arg(5)
+    movd        [rax], xmm0
+
+    ; Accumulate sum of src2
+    movdqa      xmm0, xmm4
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm4
+    ; Square src2. Ignore high value
+    pmuludq     xmm0, xmm0
+    psrld       xmm0, 8                     ; sum(src2)^2 >> 8
+
+    ; phaddw could be used to sum adjacent values but we want
+    ; all the values summed. zero-extend the dwords to qwords,
+    ; accumulate, shift and sum
+    pxor        xmm2, xmm2
+    movdqa      xmm1, xmm5
+    punpckldq   xmm1, xmm2
+    punpckhdq   xmm5, xmm2
+    paddd       xmm1, xmm5
+    movdqa      xmm2, xmm1
+    psrldq      xmm1, 8
+    paddd       xmm1, xmm2
+
+    psubd       xmm1, xmm0                  ; sum(src2^2) - (sum(src2)^2 >> 8)
+
+    ; (variance + 128) >> 8
+    paddd       xmm1, [GLOBAL(t128)]
+    psrld       xmm1, 8
+    mov         rax,  arg(4)
+
+    movd        [rax], xmm1
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+t128:  ; rounding term added before the ">> 8" shifts above
+%ifndef __NASM_VER__
+    ddq 128
+%elif CONFIG_BIG_ENDIAN
+    dq  0, 128
+%else
+    dq  128, 0
+%endif
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+    times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+    times 8 dw 0x08
diff --git a/libs/libvpx/vp9/common/x86/vp9_postproc_sse2.asm b/libs/libvpx/vp9/common/x86/vp9_postproc_sse2.asm
new file mode 100644
index 0000000000..ec8bfdb18f
--- /dev/null
+++ b/libs/libvpx/vp9/common/x86/vp9_postproc_sse2.asm
@@ -0,0 +1,694 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_post_proc_down_and_across_xmm
+;(
+;    unsigned char *src_ptr,          source; rows -2..+2 around each row are read
+;    unsigned char *dst_ptr,          destination; the 8 bytes before each row are read and written back
+;    int src_pixels_per_line,         source pitch in bytes
+;    int dst_pixels_per_line,         destination pitch in bytes
+;    int rows,                        number of rows to filter
+;    int cols,                        row width; processed 8 pixels at a time
+;    int flimit                       a pixel keeps its value if any tap differs from it by more than this
+;)
+global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
+sym(vp9_post_proc_down_and_across_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    ALIGN_STACK 16, rax
+    ; move the global rd onto the stack, since we don't have enough registers
+    ; to do PIC addressing
+    movdqa      xmm0, [GLOBAL(rd42)]
+    sub         rsp, 16
+    movdqa      [rsp], xmm0
+%define RD42 [rsp]
+%else
+%define RD42 [GLOBAL(rd42)]
+%endif
+    ; Per row: the "down" pass writes a vertically smoothed copy of the source
+    ; row to dst, then the "across" pass smooths that dst row in place.
+        movd        xmm2,       dword ptr arg(6) ;flimit
+        punpcklwd   xmm2,       xmm2
+        punpckldq   xmm2,       xmm2
+        punpcklqdq  xmm2,       xmm2              ; xmm2 = low word of flimit in all 8 words
+
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(1) ;dst_ptr
+
+        movsxd      rcx,        DWORD PTR arg(4) ;rows
+        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line (source pitch)
+        pxor        xmm0,       xmm0              ; xmm0 = 0, used to widen bytes to words
+
+.nextrow:
+
+        xor         rdx,        rdx       ; clear out rdx for use as loop counter
+.nextcol:
+        movq        xmm3,       QWORD PTR [rsi]         ; xmm3 = r0 p0..p7 (bytes)
+        punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p7 (words)
+        movdqa      xmm1,       xmm3                    ; xmm1 = r0 p0..p7, kept unfiltered
+        psllw       xmm3,       2                       ; xmm3 = 4 * r0
+
+        movq        xmm5,       QWORD PTR [rsi + rax]   ; xmm5 = r1 p0..p7 (bytes)
+        punpcklbw   xmm5,       xmm0                    ; xmm5 = r1 p0..p7 (words)
+        paddusw     xmm3,       xmm5                    ; xmm3 += r1
+
+        ; thresholding
+        movdqa      xmm7,       xmm1                    ; xmm7 = r0
+        psubusw     xmm7,       xmm5                    ; xmm7 = sat(r0 - r1)
+        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r1 - r0)
+        paddusw     xmm7,       xmm5                    ; xmm7 = abs(r0 - r1)
+        pcmpgtw     xmm7,       xmm2                    ; xmm7 = mask of words with abs > flimit
+
+        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; xmm5 = r2 p0..p7 (bytes)
+        punpcklbw   xmm5,       xmm0                    ; xmm5 = r2 p0..p7 (words)
+        paddusw     xmm3,       xmm5                    ; xmm3 += r2
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; xmm6 = r0
+        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r2)
+        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r2 - r0)
+        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r2)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+
+        neg         rax                                 ; rax = -pitch, to reach the rows above
+        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; xmm5 = r-2 p0..p7 (bytes)
+        punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 p0..p7 (words)
+        paddusw     xmm3,       xmm5                    ; xmm3 += r-2
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; xmm6 = r0
+        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r-2)
+        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r-2 - r0)
+        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r-2)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+        movq        xmm4,       QWORD PTR [rsi+rax]     ; xmm4 = r-1 p0..p7 (bytes)
+        punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 p0..p7 (words)
+        paddusw     xmm3,       xmm4                    ; xmm3 += r-1
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; xmm6 = r0
+        psubusw     xmm6,       xmm4                    ; xmm6 = sat(r0 - r-1)
+        psubusw     xmm4,       xmm1                    ; xmm4 = sat(r-1 - r0)
+        paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 - r-1)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+
+        paddusw     xmm3,       RD42                    ; xmm3 += 4 (rounding)
+        psraw       xmm3,       3                       ; xmm3 = (4*r0 + r-2 + r-1 + r1 + r2 + 4) >> 3
+
+        pand        xmm1,       xmm7                    ; xmm1 = source where some tap exceeded flimit
+        pandn       xmm7,       xmm3                    ; xmm7 = blurred result where no tap exceeded flimit
+        paddusw     xmm1,       xmm7                    ; combination
+
+        packuswb    xmm1,       xmm0                    ; pack to bytes
+        movq        QWORD PTR [rdi], xmm1             ; store 8 output pixels to the dst row
+
+        neg         rax                   ; pitch is positive
+        add         rsi,        8
+        add         rdi,        8
+
+        add         rdx,        8
+        cmp         edx,        dword arg(5) ;cols
+
+        jl          .nextcol
+
+        ; done with the all cols, start the across filtering in place
+        sub         rsi,        rdx
+        sub         rdi,        rdx
+
+        xor         rdx,        rdx
+        movq        mm0,        QWORD PTR [rdi-8]; mm0 = 8 bytes left of the row; the first delayed store writes them back
+        ; NOTE(review): mm0 is an MMX register and no emms is visible in this function - confirm callers handle it
+.acrossnextcol:
+        movq        xmm7,       QWORD PTR [rdi +rdx -2] ; p-2..p5
+        movd        xmm4,       DWORD PTR [rdi +rdx +6] ; p6..p9
+
+        pslldq      xmm4,       8
+        por         xmm4,       xmm7              ; xmm4 = p-2..p9 (12 bytes)
+
+        movdqa      xmm3,       xmm4
+        psrldq      xmm3,       2
+        punpcklbw   xmm3,       xmm0              ; xmm3 = p0..p7 (words)
+        movdqa      xmm1,       xmm3              ; xmm1 = p0..p7, kept unfiltered
+        psllw       xmm3,       2                 ; xmm3 = 4 * p0..p7
+
+
+        movdqa      xmm5,       xmm4
+        psrldq      xmm5,       3
+        punpcklbw   xmm5,       xmm0              ; xmm5 = p1..p8
+        paddusw     xmm3,       xmm5              ; xmm3 += p1..p8
+
+        ; thresholding
+        movdqa      xmm7,       xmm1              ; xmm7 = p0..p7
+        psubusw     xmm7,       xmm5              ; xmm7 = sat(p0..p7 - p1..p8)
+        psubusw     xmm5,       xmm1              ; xmm5 = sat(p1..p8 - p0..p7)
+        paddusw     xmm7,       xmm5              ; xmm7 = abs(p0..p7 - p1..p8)
+        pcmpgtw     xmm7,       xmm2
+
+        movdqa      xmm5,       xmm4
+        psrldq      xmm5,       4
+        punpcklbw   xmm5,       xmm0              ; xmm5 = p2..p9
+        paddusw     xmm3,       xmm5              ; xmm3 += p2..p9
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
+        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p2..p9)
+        psubusw     xmm5,       xmm1              ; xmm5 = sat(p2..p9 - p0..p7)
+        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p2..p9)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+
+        movdqa      xmm5,       xmm4              ; xmm5 = p-2..p9 (bytes)
+        punpcklbw   xmm5,       xmm0              ; xmm5 = p-2..p5 (words)
+        paddusw     xmm3,       xmm5              ; xmm3 += p-2..p5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
+        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p-2..p5)
+        psubusw     xmm5,       xmm1              ; xmm5 = sat(p-2..p5 - p0..p7)
+        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p-2..p5)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+        psrldq      xmm4,       1                   ; xmm4 = p-1..p9 (bytes)
+        punpcklbw   xmm4,       xmm0              ; xmm4 = p-1..p6 (words)
+        paddusw     xmm3,       xmm4              ; xmm3 += p-1..p6
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
+        psubusw     xmm6,       xmm4              ; xmm6 = sat(p0..p7 - p-1..p6)
+        psubusw     xmm4,       xmm1              ; xmm4 = sat(p-1..p6 - p0..p7)
+        paddusw     xmm6,       xmm4              ; xmm6 = abs(p0..p7 - p-1..p6)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+        paddusw     xmm3,       RD42              ; xmm3 += 4 (rounding)
+        psraw       xmm3,       3                 ; xmm3 = (4*p0 + p-2 + p-1 + p1 + p2 + 4) >> 3
+
+        pand        xmm1,       xmm7              ; xmm1 = source where some tap exceeded flimit
+        pandn       xmm7,       xmm3              ; xmm7 = blurred result where no tap exceeded flimit
+        paddusw     xmm1,       xmm7              ; combination
+
+        packuswb    xmm1,       xmm0              ; pack to bytes
+        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store the previous iteration's 8 bytes
+        movdq2q     mm0,        xmm1              ; hold this result until the next iteration has read its neighbours
+
+        add         rdx,        8
+        cmp         edx,        dword arg(5) ;cols
+        jl          .acrossnextcol;
+
+        ; last 8 pixels
+        movq        QWORD PTR [rdi+rdx-8],  mm0
+
+        ; done with this row
+        add         rsi,rax               ; next line
+        mov         eax, dword arg(3) ;dst_pixels_per_line (destination pitch)
+        add         rdi,rax               ; next destination
+        mov         eax, dword arg(2) ;src_pixels_per_line (source pitch), restored for .nextrow
+        ; NOTE(review): these two loads zero-extend on x86-64, unlike the movsxd before .nextrow - confirm pitches are never negative
+        dec         rcx                   ; decrement count
+        jnz         .nextrow              ; next row
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    add rsp,16      ; drop the stack copy of rd42
+    pop rsp         ; presumably restores the rsp saved by ALIGN_STACK - macro body not visible here
+%endif
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef RD42
+
+
+;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
+;                            int pitch, int rows, int cols,int flimit)
+extern sym(vp9_rv)
+global sym(vp9_mbpost_proc_down_xmm) PRIVATE
+sym(vp9_mbpost_proc_down_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; Per 8-column strip: keep a 15-row running sum and sum of squares per column; where 15*sumsq - sum*sum < flimit the pixel becomes (pixel + sum + vp9_rv[]) >> 4.
+    ALIGN_STACK 16, rax
+    sub         rsp, 128+16
+
+    ; unsigned char d[16][8] at [rsp]
+    ; create flimit4 (flimit in all 4 dwords) at [rsp+128]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp+128], eax
+    mov         [rsp+128+4], eax
+    mov         [rsp+128+8], eax
+    mov         [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+    lea         r8,       [GLOBAL(sym(vp9_rv))]
+%endif
+
+    ;rows +=8;  (8 extra iterations flush the delayed row stores below)
+    add         dword arg(2), 8
+
+    ;for(c=0; c<cols; c+=8)
+.loop_col:
+            mov         rsi,        arg(0) ; s
+            pxor        xmm0,       xmm0        ; xmm0 = 0, used for unpacking
+
+            movsxd      rax,        dword ptr arg(1) ;pitch       ;
+            neg         rax                                     ; rax = -pitch
+
+            lea         rsi,        [rsi + rax*8];              ; rsi = s - 8*pitch
+            neg         rax                                     ; rax = +pitch
+
+
+            pxor        xmm5,       xmm5        ; xmm5 = per-column sum (8 words)
+            pxor        xmm6,       xmm6        ; xmm6 = sum of squares, columns 0..3 (dwords)
+
+            pxor        xmm7,       xmm7        ; xmm7 = sum of squares, columns 4..7 (dwords)
+            mov         rdi,        rsi
+
+            mov         rcx,        15          ; 15 rows: s-8*pitch .. s+6*pitch
+
+.loop_initvar:
+            movq        xmm1,       QWORD PTR [rdi]; 8 pixels of the row at rdi
+            punpcklbw   xmm1,       xmm0        ; widen to words
+
+            paddw       xmm5,       xmm1        ; sum += pixel
+            pmullw      xmm1,       xmm1        ; pixel * pixel
+
+            movdqa      xmm2,       xmm1        ;
+            punpcklwd   xmm1,       xmm0        ; squares of columns 0..3 as dwords
+
+            punpckhwd   xmm2,       xmm0        ; squares of columns 4..7 as dwords
+            paddd       xmm6,       xmm1        ;
+
+            paddd       xmm7,       xmm2        ;
+            lea         rdi,        [rdi+rax]   ; next row
+
+            dec         rcx
+            jne         .loop_initvar
+            ;xmm5 = sum, xmm6/xmm7 = sum of squares over the 15 rows; rdi = s + 7*pitch
+            xor         rdx,        rdx         ; r = 0
+.loop_row:
+            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
+            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
+
+            punpcklbw   xmm1,       xmm0
+            punpcklbw   xmm2,       xmm0
+
+            paddw       xmm5,       xmm2                ; sum += entering row
+            psubw       xmm5,       xmm1                ; sum -= leaving row
+
+            pmullw      xmm2,       xmm2
+            movdqa      xmm4,       xmm2
+
+            punpcklwd   xmm2,       xmm0
+            punpckhwd   xmm4,       xmm0
+
+            paddd       xmm6,       xmm2                ; sumsq += entering row squared
+            paddd       xmm7,       xmm4
+
+            pmullw      xmm1,       xmm1
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm0
+            psubd       xmm6,       xmm1                ; sumsq -= leaving row squared
+
+            punpckhwd   xmm2,       xmm0
+            psubd       xmm7,       xmm2
+
+
+            movdqa      xmm3,       xmm6
+            pslld       xmm3,       4
+
+            psubd       xmm3,       xmm6                ; xmm3 = 15 * sumsq (columns 0..3)
+            movdqa      xmm1,       xmm5
+
+            movdqa      xmm4,       xmm5
+            pmullw      xmm1,       xmm1                ; low 16 bits of sum * sum
+
+            pmulhw      xmm4,       xmm4                ; high 16 bits of sum * sum
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm4                ; sum * sum as dwords, columns 0..3
+            punpckhwd   xmm2,       xmm4                ; sum * sum as dwords, columns 4..7
+
+            movdqa      xmm4,       xmm7
+            pslld       xmm4,       4
+
+            psubd       xmm4,       xmm7                ; xmm4 = 15 * sumsq (columns 4..7)
+
+            psubd       xmm3,       xmm1                ; 15*sumsq - sum*sum
+            psubd       xmm4,       xmm2
+
+            psubd       xmm3,       flimit4
+            psubd       xmm4,       flimit4
+
+            psrad       xmm3,       31                  ; all ones where 15*sumsq - sum*sum < flimit
+            psrad       xmm4,       31
+
+            packssdw    xmm3,       xmm4
+            packsswb    xmm3,       xmm0                ; xmm3 = per-pixel byte mask
+
+            movq        xmm1,       QWORD PTR [rsi+rax*8] ; current row (rsi is 8 rows above it)
+
+            movq        xmm2,       xmm1                ; xmm2 = unfiltered pixels
+            punpcklbw   xmm1,       xmm0
+
+            paddw       xmm1,       xmm5                ; pixel + sum
+            mov         rcx,        rdx
+
+            and         rcx,        127                 ; r & 127 indexes vp9_rv (2 bytes per entry)
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+            push        rax
+            lea         rax,        [GLOBAL(sym(vp9_rv))]
+            movdqu      xmm4,       [rax + rcx*2] ;vp9_rv[rcx*2]
+            pop         rax
+%elif ABI_IS_32BIT=0
+            movdqu      xmm4,       [r8 + rcx*2] ;vp9_rv[rcx*2]
+%else
+            movdqu      xmm4,       [sym(vp9_rv) + rcx*2]
+%endif
+
+            paddw       xmm1,       xmm4                ; + vp9_rv words
+            ;paddw     xmm1,       eight8s
+            psraw       xmm1,       4                   ; (pixel + sum + rv) >> 4
+
+            packuswb    xmm1,       xmm0
+            pand        xmm1,       xmm3                ; filtered value where the mask is set
+
+            pandn       xmm3,       xmm2                ; original pixel elsewhere
+            por         xmm1,       xmm3
+
+            and         rcx,        15
+            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
+            ; results are written back 8 iterations late, to the row at rsi (row r-8)
+            mov         rcx,        rdx
+            sub         rcx,        8
+
+            and         rcx,        15
+            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
+            ; NOTE(review): while r < 8 slot (r-8)&15 has not been written for this strip, so rows s-8..s-1 get stale stack bytes - confirm they are padding
+            movq        [rsi],      mm0
+            lea         rsi,        [rsi+rax]
+
+            lea         rdi,        [rdi+rax]
+            add         rdx,        1
+
+            cmp         edx,        dword arg(2) ;rows
+            jl          .loop_row
+
+        add         dword arg(0), 8 ; s += 8  NOTE(review): on 64-bit builds this adds to the low 32 bits of the pointer only
+        sub         dword arg(3), 8 ; cols -= 8
+        cmp         dword arg(3), 0
+        jg          .loop_col
+
+    add         rsp, 128+16
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
+
+;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
+;                                int pitch, int rows, int cols,int flimit)
+global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
+sym(vp9_mbpost_proc_across_ip_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; Per row, in place: 15-pixel running sum / sum of squares; where 15*sumsq - sum*sum < flimit the pixel becomes (pixel + sum + 8) >> 4.
+    ALIGN_STACK 16, rax
+    sub         rsp, 16
+
+    ; create flimit4 at [rsp]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp], eax
+    mov         [rsp+4], eax
+    mov         [rsp+8], eax
+    mov         [rsp+12], eax
+%define flimit4 [rsp]
+
+
+    ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+        xor         rdx,    rdx ;sumsq=0;
+        xor         rcx,    rcx ;sum=0;
+        mov         rsi,    arg(0); s
+        mov         rdi,    -8
+.ip_var_loop:
+        ;for(i=-8;i<=6;i++)
+        ;{
+        ;    sumsq += s[i]*s[i];
+        ;    sum   += s[i];
+        ;}
+        movzx       eax, byte [rsi+rdi]
+        add         ecx, eax
+        mul         al                      ; ax = al * al, so eax = s[i]*s[i]
+        add         edx, eax
+        add         rdi, 1
+        cmp         rdi, 6
+        jle         .ip_var_loop
+
+
+            ;mov         rax,    sumsq
+            ;movd        xmm7,   rax
+            movd        xmm7,   edx         ; xmm7 lane 0 = sumsq, other lanes 0
+
+            ;mov         rax,    sum
+            ;movd        xmm6,   rax
+            movd        xmm6,   ecx         ; xmm6 lane 0 = sum, other lanes 0
+
+            mov         rsi,    arg(0) ;s
+            xor         rcx,    rcx         ; c = 0
+
+            movsxd      rdx,    dword arg(3) ;cols
+            add         rdx,    8           ; 8 extra columns flush the delayed stores
+            pxor        mm0,    mm0
+            pxor        mm1,    mm1
+            ; NOTE(review): mm0/mm1 start at 0, so the first two stores below zero s[-8..-1] - confirm that is padding
+            pxor        xmm0,   xmm0
+.nextcol4:
+
+            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
+            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
+
+            punpcklbw   xmm1,   xmm0                    ; expanding
+            punpcklbw   xmm2,   xmm0                    ; expanding
+
+            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
+            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
+
+            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
+            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
+
+            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
+            pmaddwd     xmm1,   xmm2                    ; (new + old) * (new - old) = new^2 - old^2 per lane
+
+            paddd       xmm6,   xmm2                    ; lane 0: sum for pixel 0
+            paddd       xmm7,   xmm1                    ; lane 0: sumsq for pixel 0
+
+            pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 to all four lanes
+            pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 to all four lanes
+
+            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
+            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
+
+            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
+            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7
+
+            paddd       xmm6,   xmm4
+            paddd       xmm7,   xmm3
+
+            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
+            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6
+
+            paddd       xmm7,   xmm3
+            paddd       xmm6,   xmm4
+
+            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   0000  10--5 squared
+            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   0000  10--5
+
+            paddd       xmm7,   xmm3                    ; xmm7 = running sumsq for pixels 0..3
+            paddd       xmm6,   xmm4                    ; xmm6 = running sum for pixels 0..3
+
+            movdqa      xmm3,   xmm6
+            pmaddwd     xmm3,   xmm3                    ; sum * sum
+
+            movdqa      xmm5,   xmm7
+            pslld       xmm5,   4
+
+            psubd       xmm5,   xmm7                    ; 15 * sumsq
+            psubd       xmm5,   xmm3                    ; 15*sumsq - sum*sum
+
+            psubd       xmm5,   flimit4
+            psrad       xmm5,   31                      ; all ones where 15*sumsq - sum*sum < flimit
+
+            packssdw    xmm5,   xmm0
+            packsswb    xmm5,   xmm0                    ; xmm5 = per-pixel byte mask
+
+            movd        xmm1,   DWORD PTR [rsi+rcx]     ; current 4 pixels
+            movq        xmm2,   xmm1                    ; xmm2 = unfiltered pixels
+
+            punpcklbw   xmm1,   xmm0
+            punpcklwd   xmm1,   xmm0
+
+            paddd       xmm1,   xmm6                    ; pixel + sum
+            paddd       xmm1,   [GLOBAL(four8s)]        ; + 8 (rounding)
+
+            psrad       xmm1,   4                       ; (pixel + sum + 8) >> 4
+            packssdw    xmm1,   xmm0
+
+            packuswb    xmm1,   xmm0
+            pand        xmm1,   xmm5                    ; filtered value where the mask is set
+
+            pandn       xmm5,   xmm2                    ; original pixel elsewhere
+            por         xmm5,   xmm1                    ; xmm5 = 4 output pixels
+
+            movd        [rsi+rcx-8],  mm0               ; store the result from two iterations ago
+            movq        mm0,    mm1
+
+            movdq2q     mm1,    xmm5
+            psrldq      xmm7,   12                      ; lane 0 = sumsq after pixel 3, other lanes 0
+
+            psrldq      xmm6,   12                      ; lane 0 = sum after pixel 3, other lanes 0
+            add         rcx,    4
+
+            cmp         rcx,    rdx
+            jl          .nextcol4
+
+        ;s+=pitch;
+        movsxd rax, dword arg(1)
+        add    arg(0), rax
+
+        sub dword arg(2), 1 ;rows-=1
+        cmp dword arg(2), 0
+        jg .ip_row_loop
+
+    add         rsp, 16
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
+
+;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
+;                            unsigned char blackclamp[16],
+;                            unsigned char whiteclamp[16],
+;                            unsigned char bothclamp[16],
+;                            unsigned int width, unsigned int height, int pitch)
+global sym(vp9_plane_add_noise_wmt) PRIVATE
+sym(vp9_plane_add_noise_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+    ; Per row: pick a random offset (rand & 0xff) into noise, clamp the pixels, then add the noise bytes.
+.addnoise_loop:
+    call sym(LIBVPX_RAND) WRT_PLT
+    mov     rcx, arg(1) ;noise
+    and     rax, 0xff
+    add     rcx, rax    ; rcx = noise + (rand & 0xff)
+
+    ; we rely on the fact that the clamping vectors are stored contiguously
+    ; in black/white/both order. Note that we have to reload this here because
+    ; rdx could be trashed by rand()
+    mov     rdx, arg(2) ; blackclamp
+
+
+            mov     rdi, rcx
+            movsxd  rcx, dword arg(5) ;[Width]
+            mov     rsi, arg(0) ;Pos
+            xor         rax,rax
+
+.addnoise_nextset:
+            movdqu      xmm1,[rsi+rax]         ; get the source
+
+            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
+            paddusb     xmm1, [rdx+32] ;bothclamp
+            psubusb     xmm1, [rdx+16] ;whiteclamp
+
+            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
+            paddb       xmm1,xmm2              ; add it in
+            movdqu      [rsi+rax],xmm1         ; store the result
+
+            add         rax,16                 ; move to the next 16 pixels
+
+            cmp         rax, rcx
+            jl          .addnoise_nextset
+
+    movsxd  rax, dword arg(7) ; Pitch
+    add     arg(0), rax ; Start += Pitch
+    sub     dword arg(6), 1   ; Height -= 1
+    jg      .addnoise_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+rd42: ; 4 in each word: rounding term for the ">> 3" in vp9_post_proc_down_and_across_xmm
+    times 8 dw 0x04
+four8s: ; 8 in each dword: rounding term for the ">> 4" in vp9_mbpost_proc_across_ip_xmm
+    times 4 dd 8
diff --git a/libs/libvpx/vp9/decoder/vp9_decodeframe.c b/libs/libvpx/vp9/decoder/vp9_decodeframe.c
new file mode 100644
index 0000000000..8a492d5623
--- /dev/null
+++ b/libs/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -0,0 +1,2259 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdlib.h>  // qsort()
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_dsp/bitreader_buffer.h"
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/mem_ops.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_thread.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_thread_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/decoder/vp9_decodeframe.h"
+#include "vp9/decoder/vp9_detokenize.h"
+#include "vp9/decoder/vp9_decodemv.h"
+#include "vp9/decoder/vp9_decoder.h"
+#include "vp9/decoder/vp9_dsubexp.h"
+
+#define MAX_VP9_HEADER_SIZE 80
+
+static int is_compound_reference_allowed(const VP9_COMMON *cm) {
+  // 1 iff some reference in 2..REFS_PER_FRAME differs in sign bias from ref 1.
+  int ref;
+  for (ref = 2; ref <= REFS_PER_FRAME; ++ref)
+    if (cm->ref_frame_sign_bias[1] != cm->ref_frame_sign_bias[ref])
+      return 1;
+  return 0;
+}
+
+static void setup_compound_reference_mode(VP9_COMMON *cm) {
+  // LAST pairs with the first of GOLDEN/ALTREF that shares its sign bias and
+  // the leftover frame becomes the fixed reference; when neither shares it,
+  // LAST is the fixed reference and GOLDEN/ALTREF are the variable pair.
+  const int pairs_with_golden = cm->ref_frame_sign_bias[LAST_FRAME] ==
+                                cm->ref_frame_sign_bias[GOLDEN_FRAME];
+  const int pairs_with_altref = cm->ref_frame_sign_bias[LAST_FRAME] ==
+                                cm->ref_frame_sign_bias[ALTREF_FRAME];
+
+  if (pairs_with_golden)
+    cm->comp_fixed_ref = ALTREF_FRAME;
+  else
+    cm->comp_fixed_ref = pairs_with_altref ? GOLDEN_FRAME : LAST_FRAME;
+  cm->comp_var_ref[0] =
+      (pairs_with_golden || pairs_with_altref) ? LAST_FRAME : GOLDEN_FRAME;
+  cm->comp_var_ref[1] = pairs_with_golden ? GOLDEN_FRAME : ALTREF_FRAME;
+}
+
+static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
+  return !(len == 0 || len > (size_t)(end - start));  // 0 < len <= end - start
+}
+
+static int decode_unsigned_max(struct vpx_read_bit_buffer *rb, int max) {
+  const int value = vpx_rb_read_literal(rb, get_unsigned_bits(max));
+  return value <= max ? value : max;  // clamp the literal to max
+}
+
+static TX_MODE read_tx_mode(vpx_reader *r) {
+  // A 2-bit literal selects the mode; only ALLOW_32X32 is followed by one
+  // more bit, which is added to the value.
+  const TX_MODE base = vpx_read_literal(r, 2);
+  return base == ALLOW_32X32 ? base + vpx_read_bit(r) : base;
+}
+
+static void read_tx_mode_probs(struct tx_probs *tx_probs, vpx_reader *r) {
+  int ctx, k;
+  // One pass per table, read in this order: p8x8, then p16x16, then p32x32.
+  for (ctx = 0; ctx < TX_SIZE_CONTEXTS; ++ctx)
+    for (k = 0; k < TX_SIZES - 3; ++k)
+      vp9_diff_update_prob(r, &tx_probs->p8x8[ctx][k]);
+  // Each later table reads one more entry per context than the one before.
+  for (ctx = 0; ctx < TX_SIZE_CONTEXTS; ++ctx)
+    for (k = 0; k < TX_SIZES - 2; ++k)
+      vp9_diff_update_prob(r, &tx_probs->p16x16[ctx][k]);
+
+  for (ctx = 0; ctx < TX_SIZE_CONTEXTS; ++ctx)
+    for (k = 0; k < TX_SIZES - 1; ++k)
+      vp9_diff_update_prob(r, &tx_probs->p32x32[ctx][k]);
+}
+
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+  int ctx, filter;  // outer index: context; inner: all filters but the last
+  for (ctx = 0; ctx < SWITCHABLE_FILTER_CONTEXTS; ++ctx)
+    for (filter = 0; filter < SWITCHABLE_FILTERS - 1; ++filter)
+      vp9_diff_update_prob(r, &fc->switchable_interp_prob[ctx][filter]);
+}
+
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+  int ctx, mode;  // outer index: context; inner: all modes but the last
+  for (ctx = 0; ctx < INTER_MODE_CONTEXTS; ++ctx)
+    for (mode = 0; mode < INTER_MODES - 1; ++mode)
+      vp9_diff_update_prob(r, &fc->inter_mode_probs[ctx][mode]);
+}
+
+static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm,
+                                                vpx_reader *r) {
+  // No bits are consumed when compound references are not allowed.
+  if (!is_compound_reference_allowed(cm))
+    return SINGLE_REFERENCE;
+  // First bit clear => single; otherwise a second bit picks SELECT/COMPOUND.
+  if (!vpx_read_bit(r))
+    return SINGLE_REFERENCE;
+  return vpx_read_bit(r) ? REFERENCE_MODE_SELECT : COMPOUND_REFERENCE;
+}
+
+static void read_frame_reference_mode_probs(VP9_COMMON *cm, vpx_reader *r) {
+  FRAME_CONTEXT *const fc = cm->fc;
+  const int has_single = cm->reference_mode != COMPOUND_REFERENCE;
+  const int has_compound = cm->reference_mode != SINGLE_REFERENCE;
+  int ctx;
+  // comp_inter_prob is only updated under REFERENCE_MODE_SELECT.
+  if (cm->reference_mode == REFERENCE_MODE_SELECT)
+    for (ctx = 0; ctx < COMP_INTER_CONTEXTS; ++ctx)
+      vp9_diff_update_prob(r, &fc->comp_inter_prob[ctx]);
+
+  for (ctx = 0; has_single && ctx < REF_CONTEXTS; ++ctx) {
+    vp9_diff_update_prob(r, &fc->single_ref_prob[ctx][0]);
+    vp9_diff_update_prob(r, &fc->single_ref_prob[ctx][1]);
+  }
+
+  for (ctx = 0; has_compound && ctx < REF_CONTEXTS; ++ctx)
+    vp9_diff_update_prob(r, &fc->comp_ref_prob[ctx]);
+}
+
+static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) {
+  int idx = -1;
+  while (++idx < n)  // per entry: an update flag, then 7 bits if it is set
+    if (vpx_read(r, MV_UPDATE_PROB))
+      p[idx] = (vpx_read_literal(r, 7) << 1) | 1;  // forced odd => nonzero
+}
+
+static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) {
+  nmv_component *const comps = ctx->comps;
+  int c, k;
+
+  update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
+
+  // Pass 1, both components: sign, classes, class0 and bits.
+  for (c = 0; c < 2; ++c) {
+    update_mv_probs(&comps[c].sign, 1, r);
+    update_mv_probs(comps[c].classes, MV_CLASSES - 1, r);
+    update_mv_probs(comps[c].class0, CLASS0_SIZE - 1, r);
+    update_mv_probs(comps[c].bits, MV_OFFSET_BITS, r);
+  }
+
+  // Pass 2, both components: every class0_fp[] row, then fp.
+  for (c = 0; c < 2; ++c) {
+    for (k = 0; k < CLASS0_SIZE; ++k)
+      update_mv_probs(comps[c].class0_fp[k], MV_FP_SIZE - 1, r);
+    update_mv_probs(comps[c].fp, 3, r);
+  }
+
+  // Pass 3 runs only when allow_hp is set; it reads class0_hp and hp for
+  // both components and consumes nothing otherwise.
+  for (c = 0; allow_hp && c < 2; ++c) {
+    update_mv_probs(&comps[c].class0_hp, 1, r);
+    update_mv_probs(&comps[c].hp, 1, r);
+  }
+}
+
+static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
+                                          const TX_SIZE tx_size,
+                                          uint8_t *dst, int stride,
+                                          int eob) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  if (eob > 0) {  // eob <= 0: no transform is run and dqcoeff is left untouched
+    tran_low_t *const dqcoeff = pd->dqcoeff;  // passed to *_add(), then zeroed
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (xd->lossless) {  // lossless: 4x4 WHT call, tx_size is not consulted
+        vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      } else {
+        switch (tx_size) {
+          case TX_4X4:
+            vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_8X8:
+            vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_16X16:
+            vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_32X32:
+            vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          default:
+            assert(0 && "Invalid transform size");  // NOTE(review): no return here, unlike the two default cases below
+        }
+      }
+    } else {
+      if (xd->lossless) {  // lossless: 4x4 WHT call, tx_size is not consulted
+        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+      } else {
+        switch (tx_size) {
+          case TX_4X4:
+            vp9_idct4x4_add(dqcoeff, dst, stride, eob);
+            break;
+          case TX_8X8:
+            vp9_idct8x8_add(dqcoeff, dst, stride, eob);
+            break;
+          case TX_16X16:
+            vp9_idct16x16_add(dqcoeff, dst, stride, eob);
+            break;
+          case TX_32X32:
+            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+            return;  // skips the dqcoeff clearing below
+        }
+      }
+    }
+#else
+    if (xd->lossless) {  // lossless: 4x4 WHT call, tx_size is not consulted
+      vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+    } else {
+      switch (tx_size) {
+        case TX_4X4:
+          vp9_idct4x4_add(dqcoeff, dst, stride, eob);
+          break;
+        case TX_8X8:
+          vp9_idct8x8_add(dqcoeff, dst, stride, eob);
+          break;
+        case TX_16X16:
+          vp9_idct16x16_add(dqcoeff, dst, stride, eob);
+          break;
+        case TX_32X32:
+          vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
+          return;  // skips the dqcoeff clearing below
+      }
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    // Zero dqcoeff again; how much is cleared depends on eob and tx_size.
+    if (eob == 1) {
+      dqcoeff[0] = 0;  // only the first entry is cleared
+    } else {
+      if (tx_size <= TX_16X16 && eob <= 10)  // presumably 4 rows of 4 << tx_size entries - TODO confirm layout
+        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+      else if (tx_size == TX_32X32 && eob <= 34)  // first 256 entries only
+        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+      else  // 16 << (2 * tx_size) entries; presumably the whole block
+        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+    }
+  }
+}
+
+// Runs the inverse transform for one intra-coded transform block.
+//
+// Same flow as the inter variant, except that 4x4, 8x8 and 16x16 blocks use
+// the hybrid (iht) routines, which take tx_type; 32x32 blocks always use the
+// idct32x32 routine and lossless blocks the 4x4 WHT routine. Nothing happens
+// when eob is not positive. After the transform the coefficient buffer is
+// zeroed, fully or partially depending on eob, tx_type and tx_size.
+//
+//   xd       decoder macroblock state (dqcoeff, lossless flag, bit depth)
+//   plane    index into xd->plane
+//   tx_type  transform type forwarded to the iht routines
+//   tx_size  transform size, TX_4X4 .. TX_32X32
+//   dst      destination samples passed to the transform routine
+//   stride   stride of dst
+//   eob      end-of-block value produced by the token decoder
+static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
+                                          const TX_TYPE tx_type,
+                                          const TX_SIZE tx_size,
+                                          uint8_t *dst, int stride,
+                                          int eob) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  if (eob > 0) {
+    tran_low_t *const dqcoeff = pd->dqcoeff;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (xd->lossless) {
+        vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      } else {
+        switch (tx_size) {
+          case TX_4X4:
+            vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_8X8:
+            vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_16X16:
+            vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_32X32:
+            vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+            // With NDEBUG the assert vanishes; bail out like the other
+            // switches do instead of sizing the memset below from a bad
+            // tx_size.
+            return;
+        }
+      }
+    } else {
+      if (xd->lossless) {
+        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+      } else {
+        switch (tx_size) {
+          case TX_4X4:
+            vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_8X8:
+            vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_16X16:
+            vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_32X32:
+            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+            return;
+        }
+      }
+    }
+#else
+    if (xd->lossless) {
+      vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+    } else {
+      switch (tx_size) {
+        case TX_4X4:
+          vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_8X8:
+          vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_16X16:
+          vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_32X32:
+          vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
+          return;
+      }
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    // Reset the coefficients for the next block. The short clear for sizes
+    // up to 16x16 is only taken for DCT_DCT; other transform types fall
+    // through to the full-block clear.
+    if (eob == 1) {
+      dqcoeff[0] = 0;
+    } else {
+      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
+        // 4 * (4 << tx_size) entries.
+        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+      else if (tx_size == TX_32X32 && eob <= 34)
+        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+      else
+        // 16 << (2 * tx_size) entries, i.e. (4 << tx_size) squared.
+        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+    }
+  }
+}
+
+// Predicts one intra transform block in place and, unless the block is
+// flagged as skipped, decodes its tokens and runs the inverse transform.
+// row and col are given in 4x4 units within the plane.
+static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
+                                                vpx_reader *r,
+                                                MODE_INFO *const mi,
+                                                int plane,
+                                                int row, int col,
+                                                TX_SIZE tx_size) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  uint8_t *const dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
+  PREDICTION_MODE mode;
+
+  // Chroma planes use uv_mode. Luma uses the per-sub-block mode for blocks
+  // smaller than 8x8 and the block-level mode otherwise.
+  if (plane != 0) {
+    mode = mi->uv_mode;
+  } else if (mi->sb_type < BLOCK_8X8) {
+    mode = xd->mi[0]->bmi[(row << 1) + col].as_mode;
+  } else {
+    mode = mi->mode;
+  }
+
+  // dst serves as both the reference and the output of the predictor.
+  vp9_predict_intra_block(xd, pd->n4_wl, tx_size, mode,
+                          dst, pd->dst.stride, dst, pd->dst.stride,
+                          col, row, plane);
+
+  if (!mi->skip) {
+    // Chroma and lossless blocks always use DCT_DCT with the default scan.
+    const int use_default = plane || xd->lossless;
+    const TX_TYPE tx_type =
+        use_default ? DCT_DCT : intra_mode_to_tx_type_lookup[mode];
+    const scan_order *const sc =
+        use_default ? &vp9_default_scan_orders[tx_size]
+                    : &vp9_scan_orders[tx_size][tx_type];
+    const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size,
+                                            r, mi->segment_id);
+
+    inverse_transform_block_intra(xd, plane, tx_type, tx_size,
+                                  dst, pd->dst.stride, eob);
+  }
+}
+
+// Decodes the tokens of one inter transform block using the default scan
+// order, runs the inverse transform on them and returns the block's eob.
+// row and col are given in 4x4 units within the plane.
+static int reconstruct_inter_block(MACROBLOCKD *const xd, vpx_reader *r,
+                                   MODE_INFO *const mi, int plane,
+                                   int row, int col, TX_SIZE tx_size) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int eob = vp9_decode_block_tokens(xd, plane,
+                                          &vp9_default_scan_orders[tx_size],
+                                          col, row, tx_size, r,
+                                          mi->segment_id);
+  // Destination of this transform block inside the plane buffer.
+  uint8_t *const dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
+
+  inverse_transform_block_inter(xd, plane, tx_size, dst, pd->dst.stride, eob);
+  return eob;
+}
+/* Builds a b_w x b_h block in dst from a reference plane of size w x h,
+ * replicating edge samples wherever the requested block lies outside it.
+ *
+ * src points at block position (x, y) of the reference plane, with
+ * src_stride between rows; x and y may be negative or reach past w / h.
+ * Each output row consists of `left` copies of the row's first sample,
+ * `copy` samples taken from the plane, and `right` copies of the row's
+ * last sample (index w - 1). Rows above the plane reuse row 0 and rows
+ * below it reuse row h - 1.
+ * b_h must be at least 1: the loop body runs before b_h is tested.
+ */
+static void build_mc_border(const uint8_t *src, int src_stride,
+                            uint8_t *dst, int dst_stride,
+                            int x, int y, int b_w, int b_h, int w, int h) {
+  // Rewind src by (x, y) so that ref_row points at sample (0, 0).
+  const uint8_t *ref_row = src - x - y * src_stride;
+
+  if (y >= h)
+    ref_row += (h - 1) * src_stride;  // block starts below: use last row
+  else if (y > 0)
+    ref_row += y * src_stride;  // block starts inside: use row y
+
+  do {
+    int right = 0, copy;
+    int left = x < 0 ? -x : 0;  // columns left of the plane
+
+    if (left > b_w)
+      left = b_w;
+
+    if (x + b_w > w)
+      right = x + b_w - w;  // columns right of the plane
+
+    if (right > b_w)
+      right = b_w;
+
+    copy = b_w - left - right;  // columns overlapping the plane
+
+    if (left)
+      memset(dst, ref_row[0], left);
+
+    if (copy)
+      memcpy(dst + left, ref_row + x + left, copy);
+
+    if (right)
+      memset(dst + left + copy, ref_row[w - 1], right);
+
+    dst += dst_stride;
+    ++y;
+
+    if (y > 0 && y < h)
+      ref_row += src_stride;  // advance only while y is inside the plane
+  } while (--b_h);
+}
+/* 16-bit sample counterpart of build_mc_border().
+ *
+ * src8 is converted with CONVERT_TO_SHORTPTR before use and dst holds
+ * uint16_t samples; strides and coordinates are applied to the uint16_t
+ * pointers. Edge replication uses vpx_memset16 and the copy length is
+ * scaled by sizeof(uint16_t). The clamping of left / copy / right and the
+ * row stepping are otherwise the same.
+ * b_h must be at least 1: the loop body runs before b_h is tested.
+ */
+#if CONFIG_VP9_HIGHBITDEPTH
+static void high_build_mc_border(const uint8_t *src8, int src_stride,
+                                 uint16_t *dst, int dst_stride,
+                                 int x, int y, int b_w, int b_h,
+                                 int w, int h) {
+  // Rewind src by (x, y) so that ref_row points at sample (0, 0).
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *ref_row = src - x - y * src_stride;
+
+  if (y >= h)
+    ref_row += (h - 1) * src_stride;  // block starts below: use last row
+  else if (y > 0)
+    ref_row += y * src_stride;  // block starts inside: use row y
+
+  do {
+    int right = 0, copy;
+    int left = x < 0 ? -x : 0;  // columns left of the plane
+
+    if (left > b_w)
+      left = b_w;
+
+    if (x + b_w > w)
+      right = x + b_w - w;  // columns right of the plane
+
+    if (right > b_w)
+      right = b_w;
+
+    copy = b_w - left - right;  // columns overlapping the plane
+
+    if (left)
+      vpx_memset16(dst, ref_row[0], left);
+
+    if (copy)
+      memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
+
+    if (right)
+      vpx_memset16(dst + left + copy, ref_row[w - 1], right);
+
+    dst += dst_stride;
+    ++y;
+
+    if (y > 0 && y < h)
+      ref_row += src_stride;  // advance only while y is inside the plane
+  } while (--b_h);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Copies the reference block into a local buffer with its borders extended
+// and then runs the inter predictor on that buffer. The local buffer uses
+// b_w as its stride and prediction starts border_offset samples into it.
+#if CONFIG_VP9_HIGHBITDEPTH
+static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
+                               int x0, int y0, int b_w, int b_h,
+                               int frame_width, int frame_height,
+                               int border_offset,
+                               uint8_t *const dst, int dst_buf_stride,
+                               int subpel_x, int subpel_y,
+                               const InterpKernel *kernel,
+                               const struct scale_factors *sf,
+                               MACROBLOCKD *xd,
+                               int w, int h, int ref, int xs, int ys) {
+  // One scratch buffer serves both sample widths.
+  DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w,
+                         x0, y0, b_w, b_h, frame_width, frame_height);
+    high_inter_predictor(CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset,
+                         b_w, dst, dst_buf_stride, subpel_x, subpel_y,
+                         sf, w, h, ref, kernel, xs, ys, xd->bd);
+  } else {
+    uint8_t *const mc_buf = (uint8_t *)mc_buf_high;
+
+    build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w,
+                    x0, y0, b_w, b_h, frame_width, frame_height);
+    inter_predictor(mc_buf + border_offset, b_w, dst, dst_buf_stride,
+                    subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
+  }
+}
+#else
+static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
+                               int x0, int y0, int b_w, int b_h,
+                               int frame_width, int frame_height,
+                               int border_offset,
+                               uint8_t *const dst, int dst_buf_stride,
+                               int subpel_x, int subpel_y,
+                               const InterpKernel *kernel,
+                               const struct scale_factors *sf,
+                               int w, int h, int ref, int xs, int ys) {
+  DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
+
+  build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w,
+                  x0, y0, b_w, b_h, frame_width, frame_height);
+  inter_predictor(mc_buf + border_offset, b_w, dst, dst_buf_stride,
+                  subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+/* Builds the inter prediction of one w x h region of a plane.
+ *
+ * The region sits at offset (x, y) inside dst_buf and is predicted from
+ * ref_frame_buf with motion vector mv. When is_scaled is set, the motion
+ * vector is clamped and scaled and the block position is mapped into the
+ * reference frame through sf; otherwise the vector is only adjusted for
+ * the plane's subsampling. If worker is not NULL the function first waits
+ * (vp9_frameworker_wait) until the reference rows it needs are available.
+ * If the referenced block reaches outside the reference frame, prediction
+ * goes through extend_and_predict(); otherwise it reads the reference
+ * buffer directly.
+ *
+ *   bw, bh      block size, used only for motion vector clamping
+ *   mi_x, mi_y  block position, used only for motion vector scaling
+ *   pre_buf     supplies the reference stride
+ *   kernel, ref forwarded to the predictor
+ */
+static void dec_build_inter_predictors(VPxWorker *const worker, MACROBLOCKD *xd,
+                                       int plane, int bw, int bh, int x,
+                                       int y, int w, int h, int mi_x, int mi_y,
+                                       const InterpKernel *kernel,
+                                       const struct scale_factors *sf,
+                                       struct buf_2d *pre_buf,
+                                       struct buf_2d *dst_buf, const MV* mv,
+                                       RefCntBuffer *ref_frame_buf,
+                                       int is_scaled, int ref) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+  MV32 scaled_mv;
+  int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
+      buf_stride, subpel_x, subpel_y;
+  uint8_t *ref_frame, *buf_ptr;
+
+  // Get reference frame pointer, width and height.
+  if (plane == 0) {
+    frame_width = ref_frame_buf->buf.y_crop_width;
+    frame_height = ref_frame_buf->buf.y_crop_height;
+    ref_frame = ref_frame_buf->buf.y_buffer;
+  } else {
+    frame_width = ref_frame_buf->buf.uv_crop_width;
+    frame_height = ref_frame_buf->buf.uv_crop_height;
+    ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
+                         : ref_frame_buf->buf.v_buffer;
+  }
+
+  if (is_scaled) {
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh,
+                                               pd->subsampling_x,
+                                               pd->subsampling_y);
+    // Co-ordinate of containing block to pixel precision.
+    int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+    int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+#if CONFIG_BETTER_HW_COMPATIBILITY
+    assert(xd->mi[0]->sb_type != BLOCK_4X8 &&
+           xd->mi[0]->sb_type != BLOCK_8X4);
+    assert(mv_q4.row == mv->row * (1 << (1 - pd->subsampling_y)) &&
+           mv_q4.col == mv->col * (1 << (1 - pd->subsampling_x)));
+#endif
+    // Co-ordinate of the block to 1/16th pixel precision.
+    x0_16 = (x_start + x) << SUBPEL_BITS;
+    y0_16 = (y_start + y) << SUBPEL_BITS;
+
+    // Co-ordinate of current block in reference frame
+    // to 1/16th pixel precision.
+    x0_16 = sf->scale_value_x(x0_16, sf);
+    y0_16 = sf->scale_value_y(y0_16, sf);
+
+    // Map the top left corner of the block into the reference frame.
+    x0 = sf->scale_value_x(x_start + x, sf);
+    y0 = sf->scale_value_y(y_start + y, sf);
+
+    // Scale the MV and incorporate the sub-pixel offset of the block
+    // in the reference frame.
+    scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+    xs = sf->x_step_q4;
+    ys = sf->y_step_q4;
+  } else {
+    // Co-ordinate of containing block to pixel precision.
+    x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+    y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+    // Co-ordinate of the block to 1/16th pixel precision.
+    x0_16 = x0 << SUBPEL_BITS;
+    y0_16 = y0 << SUBPEL_BITS;
+
+    // Doubled for non-subsampled planes, unchanged for subsampled ones.
+    scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y));
+    scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x));
+    xs = ys = 16;  // unscaled step
+  }
+  subpel_x = scaled_mv.col & SUBPEL_MASK;
+  subpel_y = scaled_mv.row & SUBPEL_MASK;
+
+  // Calculate the top left corner of the best matching block in the
+  // reference frame.
+  x0 += scaled_mv.col >> SUBPEL_BITS;
+  y0 += scaled_mv.row >> SUBPEL_BITS;
+  x0_16 += scaled_mv.col;
+  y0_16 += scaled_mv.row;
+
+  // Get reference block pointer.
+  buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+  buf_stride = pre_buf->stride;
+
+  // Do border extension if there is motion or the
+  // width/height is not a multiple of 8 pixels.
+  if (is_scaled || scaled_mv.col || scaled_mv.row ||
+      (frame_width & 0x7) || (frame_height & 0x7)) {
+    // Get reference block bottom right vertical coordinate.
+    int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
+
+    // Get reference block bottom right horizontal coordinate.
+    int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
+    int x_pad = 0, y_pad = 0;
+
+    // Widen the block by the filter taps when it is filtered horizontally.
+    if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+      x0 -= VP9_INTERP_EXTEND - 1;
+      x1 += VP9_INTERP_EXTEND;
+      x_pad = 1;
+    }
+
+    // Likewise when it is filtered vertically.
+    if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+      y0 -= VP9_INTERP_EXTEND - 1;
+      y1 += VP9_INTERP_EXTEND;
+      y_pad = 1;
+    }
+
+    // Wait until reference block is ready. Pad 7 more pixels as last 7
+    // pixels of each superblock row can be changed by next superblock row.
+    if (worker != NULL)
+      vp9_frameworker_wait(worker, ref_frame_buf,
+                           VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+
+    // Skip border extension if block is inside the frame.
+    if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
+        y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
+      // Extend the border.
+      const uint8_t *const buf_ptr1 = ref_frame + y0 * buf_stride + x0;
+      const int b_w = x1 - x0 + 1;
+      const int b_h = y1 - y0 + 1;
+      // Undo the (VP9_INTERP_EXTEND - 1) shift applied to x0 / y0 above;
+      // the literal 3 presumably equals VP9_INTERP_EXTEND - 1 -- confirm.
+      const int border_offset = y_pad * 3 * b_w + x_pad * 3;
+
+      extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h,
+                         frame_width, frame_height, border_offset,
+                         dst, dst_buf->stride,
+                         subpel_x, subpel_y,
+                         kernel, sf,
+#if CONFIG_VP9_HIGHBITDEPTH
+                         xd,
+#endif
+                         w, h, ref, xs, ys);
+      return;
+    }
+  } else {
+    // Wait until reference block is ready. Pad 7 more pixels as last 7
+    // pixels of each superblock row can be changed by next superblock row.
+    if (worker != NULL) {
+      const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
+      vp9_frameworker_wait(worker, ref_frame_buf,
+                           VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+    }
+  }
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+  } else {
+    inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                    subpel_y, sf, w, h, ref, kernel, xs, ys);
+  }
+#else
+  inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                  subpel_y, sf, w, h, ref, kernel, xs, ys);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+// Builds the inter prediction of every plane of the current block, once per
+// reference frame in use (twice for compound prediction). Blocks smaller
+// than 8x8 are predicted one 4x4 unit at a time with a per-unit motion
+// vector; larger blocks are predicted in a single call per plane.
+static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
+                                          MACROBLOCKD *xd,
+                                          int mi_row, int mi_col) {
+  const MODE_INFO *mi = xd->mi[0];
+  const BLOCK_SIZE sb_type = mi->sb_type;
+  const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+  const int num_refs = 1 + has_second_ref(mi);
+  // Only frame-parallel decoding has a worker to synchronise with.
+  VPxWorker *const fwo =
+      pbi->frame_parallel_decode ? pbi->frame_worker_owner : NULL;
+  int ref;
+
+  for (ref = 0; ref < num_refs; ++ref) {
+    RefBuffer *ref_buf =
+        &pbi->common.frame_refs[mi->ref_frame[ref] - LAST_FRAME];
+    const struct scale_factors *const sf = &ref_buf->sf;
+    RefCntBuffer *const ref_frame_buf =
+        &pbi->common.buffer_pool->frame_bufs[ref_buf->idx];
+    int is_scaled;
+    int plane;
+
+    if (!vp9_is_valid_scale(sf)) {
+      vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Reference frame has invalid dimensions");
+    }
+
+    is_scaled = vp9_is_scaled(sf);
+    vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
+                         is_scaled ? sf : NULL);
+    xd->block_refs[ref] = ref_buf;
+
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      struct macroblockd_plane *const pd = &xd->plane[plane];
+      struct buf_2d *const dst_buf = &pd->dst;
+      struct buf_2d *const pre_buf = &pd->pre[ref];
+      const int blocks_w = pd->n4_w;
+      const int blocks_h = pd->n4_h;
+      const int plane_w = 4 * blocks_w;
+      const int plane_h = 4 * blocks_h;
+
+      if (sb_type < BLOCK_8X8) {
+        int unit = 0;
+        int bx, by;
+
+        for (by = 0; by < blocks_h; ++by) {
+          for (bx = 0; bx < blocks_w; ++bx) {
+            const MV mv = average_split_mvs(pd, mi, ref, unit++);
+
+            dec_build_inter_predictors(fwo, xd, plane, plane_w, plane_h,
+                                       4 * bx, 4 * by, 4, 4, mi_x, mi_y,
+                                       kernel, sf, pre_buf, dst_buf, &mv,
+                                       ref_frame_buf, is_scaled, ref);
+          }
+        }
+      } else {
+        const MV mv = mi->mv[ref].as_mv;
+
+        dec_build_inter_predictors(fwo, xd, plane, plane_w, plane_h,
+                                   0, 0, plane_w, plane_h, mi_x, mi_y,
+                                   kernel, sf, pre_buf, dst_buf, &mv,
+                                   ref_frame_buf, is_scaled, ref);
+      }
+    }
+  }
+}
+
+// Returns the chroma transform size: the block's tx_size, capped by the
+// smaller of the plane's two log2 dimensions (n4_wl, n4_hl).
+static INLINE TX_SIZE dec_get_uv_tx_size(const MODE_INFO *mi,
+                                         int n4_wl, int n4_hl) {
+  const int smallest_dim_log2 = VPXMIN(n4_wl, n4_hl);
+
+  return VPXMIN(mi->tx_size, smallest_dim_log2);
+}
+
+// Zeroes the above and left entropy contexts of every plane over the extent
+// of the current block (n4_w entries above, n4_h entries to the left).
+static INLINE void dec_reset_skip_context(MACROBLOCKD *xd) {
+  int plane;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+
+    memset(pd->above_context, 0, pd->n4_w * sizeof(ENTROPY_CONTEXT));
+    memset(pd->left_context, 0, pd->n4_h * sizeof(ENTROPY_CONTEXT));
+  }
+}
+
+// Stores, for every plane, the block's size in 4x4 units (n4_w, n4_h) and
+// its log2 (n4_wl, n4_hl), both adjusted for the plane's subsampling.
+static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl,
+                         int bhl) {
+  int plane;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+
+    pd->n4_w = (bw << 1) >> pd->subsampling_x;
+    pd->n4_h = (bh << 1) >> pd->subsampling_y;
+    pd->n4_wl = bwl - pd->subsampling_x;
+    pd->n4_hl = bhl - pd->subsampling_y;
+  }
+}
+
+// Points xd at the mode info of the block at (mi_row, mi_col), makes every
+// visible grid cell the block covers refer to that mode info, and sets up
+// the per-plane sizes, skip contexts, edge distances and destination
+// buffers. Returns the block's mode info.
+static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+                              BLOCK_SIZE bsize, int mi_row, int mi_col,
+                              int bw, int bh, int x_mis, int y_mis,
+                              int bwl, int bhl) {
+  const TileInfo *const tile = &xd->tile;
+  const int grid_pos = mi_row * cm->mi_stride + mi_col;
+  int gx, gy;
+
+  xd->mi = cm->mi_grid_visible + grid_pos;
+  xd->mi[0] = &cm->mi[grid_pos];
+  // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
+  // passing bsize from decode_partition().
+  xd->mi[0]->sb_type = bsize;
+
+  for (gy = 0; gy < y_mis; ++gy) {
+    // Cell (0, 0) is already set, so the first row starts at column 1.
+    for (gx = (gy == 0); gx < x_mis; ++gx) {
+      xd->mi[gy * cm->mi_stride + gx] = xd->mi[0];
+    }
+  }
+
+  set_plane_n4(xd, bw, bh, bwl, bhl);
+
+  set_skip_context(xd, mi_row, mi_col);
+
+  // Distances from the block to the image edges, kept in 1/8th pel units
+  // because they are always compared against values in those units.
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+  vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+  return xd->mi[0];
+}
+/* Decodes and reconstructs one block at (mi_row, mi_col).
+ *
+ * Steps: set up the block offsets, reject an invalid chroma block size,
+ * read the mode info, then either predict and reconstruct every intra
+ * transform block, or build the inter prediction and (unless the block is
+ * skipped) decode and add its residual. Reader errors are accumulated into
+ * xd->corrupted, and the loop filter mask is built when the frame's filter
+ * level is non-zero.
+ *
+ *   bsize     block size stored into the mode info
+ *   bwl, bhl  log2 of the block width / height in 4x4 units; the size in
+ *             mode-info units is 1 << (bwl - 1) and 1 << (bhl - 1)
+ */
+static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
+                         int mi_row, int mi_col,
+                         vpx_reader *r, BLOCK_SIZE bsize,
+                         int bwl, int bhl) {
+  VP9_COMMON *const cm = &pbi->common;
+  const int less8x8 = bsize < BLOCK_8X8;
+  const int bw = 1 << (bwl - 1);
+  const int bh = 1 << (bhl - 1);
+  const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);  // clipped to frame
+  const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);  // clipped to frame
+
+  MODE_INFO *mi = set_offsets(cm, xd, bsize, mi_row, mi_col,
+                              bw, bh, x_mis, y_mis, bwl, bhl);
+
+  if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
+    const BLOCK_SIZE uv_subsize =
+        ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y];
+    if (uv_subsize == BLOCK_INVALID)
+      vpx_internal_error(xd->error_info,
+                         VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
+  }
+
+  vpx_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+
+  if (mi->skip) {
+    dec_reset_skip_context(xd);
+  }
+
+  if (!is_inter_block(mi)) {
+    int plane;
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const TX_SIZE tx_size =
+          plane ? dec_get_uv_tx_size(mi, pd->n4_wl, pd->n4_hl)
+                  : mi->tx_size;
+      const int num_4x4_w = pd->n4_w;
+      const int num_4x4_h = pd->n4_h;
+      const int step = (1 << tx_size);  // transform size in 4x4 units
+      int row, col;
+      // A negative edge distance shrinks the 4x4 count that is visited.
+      const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
+          0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+      const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
+          0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+      for (row = 0; row < max_blocks_high; row += step)
+        for (col = 0; col < max_blocks_wide; col += step)
+          predict_and_reconstruct_intra_block(xd, r, mi, plane,
+                                              row, col, tx_size);
+    }
+  } else {
+    // Prediction
+    dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
+
+    // Reconstruction
+    if (!mi->skip) {
+      int eobtotal = 0;
+      int plane;
+
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        const struct macroblockd_plane *const pd = &xd->plane[plane];
+        const TX_SIZE tx_size =
+            plane ? dec_get_uv_tx_size(mi, pd->n4_wl, pd->n4_hl)
+                    : mi->tx_size;
+        const int num_4x4_w = pd->n4_w;
+        const int num_4x4_h = pd->n4_h;
+        const int step = (1 << tx_size);  // transform size in 4x4 units
+        int row, col;
+        // A negative edge distance shrinks the 4x4 count that is visited.
+        const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
+            0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+        const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
+            0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+        for (row = 0; row < max_blocks_high; row += step)
+          for (col = 0; col < max_blocks_wide; col += step)
+            eobtotal += reconstruct_inter_block(xd, r, mi, plane, row, col,
+                                                tx_size);
+      }
+
+      // No coefficients were decoded at all: treat the block as skipped.
+      // Not applied to blocks smaller than 8x8.
+      if (!less8x8 && eobtotal == 0)
+        mi->skip = 1;  // skip loopfilter
+    }
+  }
+
+  xd->corrupted |= vpx_reader_has_error(r);
+
+  if (cm->lf.filter_level) {
+    vp9_build_mask(cm, mi, mi_row, mi_col, bw, bh);
+  }
+}
+
+// Derives the partition probability context from bit `bsl` of the above
+// and left partition contexts at this position.
+static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd,
+                                              int mi_row, int mi_col,
+                                              int bsl) {
+  const PARTITION_CONTEXT above_ctx = xd->above_seg_context[mi_col];
+  const PARTITION_CONTEXT left_ctx =
+      xd->left_seg_context[mi_row & MI_MASK];
+  const int above = (above_ctx >> bsl) & 1;
+  const int left = (left_ctx >> bsl) & 1;
+
+  // bsl is expected to be non-negative; the assert that checked this is
+  // disabled:
+  //  assert(bsl >= 0);
+
+  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
+// Writes the partition context of `subsize` into bw entries of the above
+// context (starting at mi_col) and bw entries of the left context
+// (starting at mi_row within the superblock).
+static INLINE void dec_update_partition_context(MACROBLOCKD *xd,
+                                                int mi_row, int mi_col,
+                                                BLOCK_SIZE subsize,
+                                                int bw) {
+  // Update the partition context at the end nodes: partition bits of block
+  // sizes larger than the current one become one, and partition bits of
+  // smaller block sizes become zero.
+  memset(xd->above_seg_context + mi_col,
+         partition_context_lookup[subsize].above, bw);
+  memset(xd->left_seg_context + (mi_row & MI_MASK),
+         partition_context_lookup[subsize].left, bw);
+}
+
+// Reads the partition type of a block. has_rows / has_cols tell whether the
+// lower / right half of the block lies inside the frame: with both set the
+// full partition tree is read, with only one set a single bit chooses
+// between a split and the one remaining option, and with neither set the
+// result is PARTITION_SPLIT without reading anything. The choice is counted
+// when xd->counts is set.
+static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     vpx_reader *r,
+                                     int has_rows, int has_cols, int bsl) {
+  const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl);
+  const vpx_prob *const probs = get_partition_probs(xd, ctx);
+  FRAME_COUNTS *counts = xd->counts;
+  PARTITION_TYPE p = PARTITION_SPLIT;
+
+  if (has_rows) {
+    if (has_cols) {
+      p = (PARTITION_TYPE)vpx_read_tree(r, vp9_partition_tree, probs);
+    } else if (!vpx_read(r, probs[2])) {
+      p = PARTITION_VERT;
+    }
+  } else if (has_cols) {
+    if (!vpx_read(r, probs[1])) {
+      p = PARTITION_HORZ;
+    }
+  }
+
+  if (counts) {
+    ++counts->partition[ctx][p];
+  }
+
+  return p;
+}
+
+/* Recursively decodes the partition tree of the square block of size bsize
+ * at (mi_row, mi_col).
+ *
+ * Returns immediately when the block starts outside the frame. Otherwise
+ * the partition type is read and the block is decoded whole, as two halves
+ * (the second half only if it lies inside the frame), or as four quadrants
+ * through recursive calls. n4x4_l2 is the log2 of the block size in 4x4
+ * units. An 8x8 block (hbs == 0) is always decoded by a single
+ * decode_block() call, with the sub-block layout recorded in
+ * xd->bmode_blocks_wl / xd->bmode_blocks_hl.
+ *
+ * TODO(slavarnway): eliminate bsize and subsize in future commits
+ */
+static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
+                             int mi_row, int mi_col,
+                             vpx_reader* r, BLOCK_SIZE bsize, int n4x4_l2) {
+  VP9_COMMON *const cm = &pbi->common;
+  const int n8x8_l2 = n4x4_l2 - 1;  // log2 of the size in 8x8 units
+  const int num_8x8_wh = 1 << n8x8_l2;
+  const int hbs = num_8x8_wh >> 1;  // half the block size, in 8x8 units
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+  const int has_rows = (mi_row + hbs) < cm->mi_rows;
+  const int has_cols = (mi_col + hbs) < cm->mi_cols;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  partition = read_partition(xd, mi_row, mi_col, r, has_rows, has_cols,
+                             n8x8_l2);
+  subsize = subsize_lookup[partition][bsize];  // get_subsize(bsize, partition);
+  if (!hbs) {
+    // calculate bmode block dimensions (log 2)
+    xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
+    xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
+    decode_block(pbi, xd, mi_row, mi_col, r, subsize, 1, 1);
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2);
+        break;
+      case PARTITION_HORZ:
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2);
+        if (has_rows)
+          decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize, n4x4_l2,
+                       n8x8_l2);
+        break;
+      case PARTITION_VERT:
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2);
+        if (has_cols)
+          decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2,
+                       n4x4_l2);
+        break;
+      case PARTITION_SPLIT:
+        decode_partition(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize,
+                         n8x8_l2);
+        break;
+      default:
+        assert(0 && "Invalid partition type");
+    }
+  }
+
+  // update partition context (a split above 8x8 leaves it to the recursion)
+  if (bsize >= BLOCK_8X8 &&
+      (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
+    dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
+}
+
+// Initialises the bool decoder r over read_size bytes starting at data.
+// Reports VPX_CODEC_CORRUPT_FRAME when those bytes do not fit before
+// data_end, and VPX_CODEC_MEM_ERROR when vpx_reader_init() fails.
+static void setup_token_decoder(const uint8_t *data,
+                                const uint8_t *data_end,
+                                size_t read_size,
+                                struct vpx_internal_error_info *error_info,
+                                vpx_reader *r,
+                                vpx_decrypt_cb decrypt_cb,
+                                void *decrypt_state) {
+  // Validate the calculated partition length: the buffer it describes must
+  // be fully readable, otherwise an error is raised.
+  if (!read_is_valid(data, read_size, data_end)) {
+    vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt tile length");
+  }
+
+  if (vpx_reader_init(r, data, read_size, decrypt_cb, decrypt_state)) {
+    vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate bool decoder %d", 1);
+  }
+}
+
+// Reads one update flag; when it is set, applies a differential update to
+// every unconstrained-node probability of the given coefficient model.
+static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs,
+                                   vpx_reader *r) {
+  int i, j, k, l, m;
+
+  // Flag clear: this transform size keeps its current probabilities.
+  if (!vpx_read_bit(r))
+    return;
+
+  for (i = 0; i < PLANE_TYPES; ++i) {
+    for (j = 0; j < REF_TYPES; ++j) {
+      for (k = 0; k < COEF_BANDS; ++k) {
+        for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+          for (m = 0; m < UNCONSTRAINED_NODES; ++m) {
+            vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Reads the coefficient probability updates of every transform size from
+// TX_4X4 up to the largest size allowed by tx_mode.
+static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
+                            vpx_reader *r) {
+  const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+  TX_SIZE tx_size = TX_4X4;
+
+  while (tx_size <= max_tx_size) {
+    read_coef_probs_common(fc->coef_probs[tx_size], r);
+    ++tx_size;
+  }
+}
+
+// Parses the segmentation syntax from 'rb' into 'seg': the enable flag, then
+// the optional map-update probabilities, then the optional per-segment
+// feature data. Nothing more is read once the enable flag comes back clear.
+static void setup_segmentation(struct segmentation *seg,
+                               struct vpx_read_bit_buffer *rb) {
+  int idx, segment, feature;
+
+  seg->update_map = 0;
+  seg->update_data = 0;
+  seg->enabled = vpx_rb_read_bit(rb);
+  if (!seg->enabled) return;
+
+  // Map update: each probability is an explicit 8-bit literal when its flag
+  // bit is set, and MAX_PROB otherwise.
+  seg->update_map = vpx_rb_read_bit(rb);
+  if (seg->update_map) {
+    for (idx = 0; idx < SEG_TREE_PROBS; idx++) {
+      if (vpx_rb_read_bit(rb))
+        seg->tree_probs[idx] = vpx_rb_read_literal(rb, 8);
+      else
+        seg->tree_probs[idx] = MAX_PROB;
+    }
+    // Without temporal update the && short-circuits, so no bits are consumed.
+    seg->temporal_update = vpx_rb_read_bit(rb);
+    for (idx = 0; idx < PREDICTION_PROBS; idx++) {
+      if (seg->temporal_update && vpx_rb_read_bit(rb))
+        seg->pred_probs[idx] = vpx_rb_read_literal(rb, 8);
+      else
+        seg->pred_probs[idx] = MAX_PROB;
+    }
+  }
+
+  // Data update: features are cleared, then every (segment, feature) pair is
+  // stored, with value 0 for pairs whose enable bit is clear.
+  seg->update_data = vpx_rb_read_bit(rb);
+  if (!seg->update_data) return;
+  seg->abs_delta = vpx_rb_read_bit(rb);
+  vp9_clearall_segfeatures(seg);
+  for (segment = 0; segment < MAX_SEGMENTS; segment++) {
+    for (feature = 0; feature < SEG_LVL_MAX; feature++) {
+      int value = 0;
+      if (vpx_rb_read_bit(rb)) {
+        vp9_enable_segfeature(seg, segment, feature);
+        value = decode_unsigned_max(rb, vp9_seg_feature_data_max(feature));
+        if (vp9_is_segfeature_signed(feature) && vpx_rb_read_bit(rb))
+          value = -value;
+      }
+      vp9_set_segdata(seg, segment, feature, value);
+    }
+  }
+}
+
+// Reads the loop filter level, sharpness and, when signalled, the per-ref and
+// per-mode deltas from 'rb'. A delta whose update bit is clear is left as is.
+static void setup_loopfilter(struct loopfilter *lf,
+                             struct vpx_read_bit_buffer *rb) {
+  int idx;
+  lf->filter_level = vpx_rb_read_literal(rb, 6);
+  lf->sharpness_level = vpx_rb_read_literal(rb, 3);
+
+  // The update flag is cleared up front and only read when deltas are enabled.
+  lf->mode_ref_delta_update = 0;
+  lf->mode_ref_delta_enabled = vpx_rb_read_bit(rb);
+  if (!lf->mode_ref_delta_enabled) return;
+
+  lf->mode_ref_delta_update = vpx_rb_read_bit(rb);
+  if (!lf->mode_ref_delta_update) return;
+
+  for (idx = 0; idx < MAX_REF_LF_DELTAS; idx++) {
+    if (vpx_rb_read_bit(rb))
+      lf->ref_deltas[idx] = vpx_rb_read_signed_literal(rb, 6);
+  }
+  for (idx = 0; idx < MAX_MODE_LF_DELTAS; idx++) {
+    if (vpx_rb_read_bit(rb))
+      lf->mode_deltas[idx] = vpx_rb_read_signed_literal(rb, 6);
+  }
+}
+
+static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {  // 0 if absent
+  return !vpx_rb_read_bit(rb) ? 0 : vpx_rb_read_signed_literal(rb, 4);
+}
+
+// Reads the base quantizer index and the three delta values from 'rb' into
+// 'cm', and marks 'xd' lossless only when the index and every delta are zero.
+static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+                               struct vpx_read_bit_buffer *rb) {
+  cm->base_qindex = vpx_rb_read_literal(rb, QINDEX_BITS);
+  cm->y_dc_delta_q = read_delta_q(rb);
+  cm->uv_dc_delta_q = read_delta_q(rb);
+  cm->uv_ac_delta_q = read_delta_q(rb);
+  cm->dequant_bit_depth = cm->bit_depth;
+
+  xd->lossless = !(cm->base_qindex != 0 || cm->y_dc_delta_q != 0 ||
+                   cm->uv_dc_delta_q != 0 || cm->uv_ac_delta_q != 0);
+#if CONFIG_VP9_HIGHBITDEPTH
+  xd->bd = (int)cm->bit_depth;
+#endif
+}
+
+// Builds the y/uv dequant tables in 'cm' (index 0 holds DC, index 1 holds AC).
+// With segmentation enabled every segment gets values derived from its own
+// qindex; otherwise only entry 0 is written, from the base qindex.
+static void setup_segmentation_dequant(VP9_COMMON *const cm) {
+  // When segmentation is disabled, only the first value is used.  The
+  // remaining are don't cares.
+  const int num_entries = cm->seg.enabled ? MAX_SEGMENTS : 1;
+  int seg_id;
+
+  for (seg_id = 0; seg_id < num_entries; ++seg_id) {
+    const int qindex = cm->seg.enabled
+                           ? vp9_get_qindex(&cm->seg, seg_id, cm->base_qindex)
+                           : cm->base_qindex;
+
+    // Luma AC always uses a delta of 0; the other three entries apply the
+    // matching delta stored in 'cm'.
+    cm->y_dequant[seg_id][0] =
+        vp9_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth);
+    cm->y_dequant[seg_id][1] =
+        vp9_ac_quant(qindex, 0, cm->bit_depth);
+    cm->uv_dequant[seg_id][0] =
+        vp9_dc_quant(qindex, cm->uv_dc_delta_q, cm->bit_depth);
+    cm->uv_dequant[seg_id][1] =
+        vp9_ac_quant(qindex, cm->uv_ac_delta_q, cm->bit_depth);
+  }
+}
+
+// Reads the interpolation filter: a set flag bit selects SWITCHABLE, otherwise
+// a 2-bit literal indexes the table (static const: built once, not per call).
+static INTERP_FILTER read_interp_filter(struct vpx_read_bit_buffer *rb) {
+  static const INTERP_FILTER literal_to_filter[] = {
+    EIGHTTAP_SMOOTH, EIGHTTAP, EIGHTTAP_SHARP, BILINEAR };
+  return vpx_rb_read_bit(rb) ? SWITCHABLE
+                             : literal_to_filter[vpx_rb_read_literal(rb, 2)];
+}
+
+static void setup_render_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
+  cm->render_height = cm->height;  // default: render size == frame size
+  cm->render_width = cm->width;
+  if (vpx_rb_read_bit(rb))  // flag set: an explicit render size follows
+    vp9_read_frame_size(rb, &cm->render_width, &cm->render_height);
+}
+
+static void resize_mv_buffer(VP9_COMMON *cm) {  // re-sizes cur_frame->mvs
+  vpx_free(cm->cur_frame->mvs);  // old contents are dropped, not copied over
+  cm->cur_frame->mi_rows = cm->mi_rows;
+  cm->cur_frame->mi_cols = cm->mi_cols;
+  CHECK_MEM_ERROR(cm, cm->cur_frame->mvs,  // report failure, don't keep NULL
+                  vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(MV_REF)));
+}
+
+// Brings cm's context buffers, cm->width/height and cur_frame's MV buffer in
+// line with a width x height frame; buffers are reallocated only to grow.
+static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
+#if CONFIG_SIZE_LIMIT
+  if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Dimensions of %dx%d beyond allowed size of %dx%d.",
+                       width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
+#endif
+  if (width != cm->width || height != cm->height) {
+    const int rows = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+    const int cols = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+    // Rows and columns are checked separately because allocations in
+    // vp9_alloc_context_buffers() depend on each dimension, not just the area.
+    const int must_grow = cols > cm->mi_cols || rows > cm->mi_rows;
+
+    if (!must_grow) {
+      vp9_set_mb_mi(cm, width, height);
+    } else if (vp9_alloc_context_buffers(cm, width, height)) {
+      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                         "Failed to allocate context buffers");
+    }
+    vp9_init_context_buffers(cm);
+    cm->width = width;
+    cm->height = height;
+  }
+  // The MV buffer is (re)allocated when missing or too small in either axis.
+  if (!cm->cur_frame->mvs || cm->cur_frame->mi_rows < cm->mi_rows ||
+      cm->cur_frame->mi_cols < cm->mi_cols)
+    resize_mv_buffer(cm);
+}
+
+// Reads the frame size and render size from 'rb', resizes cm's context
+// buffers, (re)allocates the new frame buffer and copies cm's format into it.
+static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
+  int width, height, realloc_failed;
+  BufferPool *const pool = cm->buffer_pool;
+  vp9_read_frame_size(rb, &width, &height);
+  resize_context_buffers(cm, width, height);
+  setup_render_size(cm, rb);
+
+  lock_buffer_pool(pool);
+  realloc_failed = vpx_realloc_frame_buffer(
+      get_frame_new_buffer(cm), cm->width, cm->height,
+      cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+      cm->use_highbitdepth,
+#endif
+      VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
+      &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+      pool->cb_priv);
+  unlock_buffer_pool(pool);  // the only unlock, so it can never run twice
+  if (realloc_failed)
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate frame buffer");
+
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
+  pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
+  pool->frame_bufs[cm->new_fb_idx].buf.render_width  = cm->render_width;
+  pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
+}
+
+// True when a reference matches this frame's bit depth and subsampling.
+static INLINE int valid_ref_frame_img_fmt(
+    vpx_bit_depth_t ref_bit_depth, int ref_xss, int ref_yss,
+    vpx_bit_depth_t this_bit_depth, int this_xss, int this_yss) {
+  if (ref_bit_depth != this_bit_depth) return 0;
+  return ref_xss == this_xss && ref_yss == this_yss;
+}
+
+// Variant of setup_frame_size() for frames with references: the size is taken
+// from the first reference whose flag bit is set, or read explicitly if none
+// is. Size and colour format are checked against the references before any
+// buffer is resized or allocated.
+static void setup_frame_size_with_refs(VP9_COMMON *cm,
+                                       struct vpx_read_bit_buffer *rb) {
+  int width, height, realloc_failed;
+  int found = 0, has_valid_ref_frame = 0, i;
+  BufferPool *const pool = cm->buffer_pool;
+  for (i = 0; i < REFS_PER_FRAME; ++i) {
+    if (vpx_rb_read_bit(rb)) {
+      YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
+      width = buf->y_crop_width;
+      height = buf->y_crop_height;
+      found = 1;
+      break;
+    }
+  }
+
+  if (!found) vp9_read_frame_size(rb, &width, &height);
+
+  if (width <= 0 || height <= 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid frame size");
+
+  // Check to make sure at least one of the frames that this frame references
+  // has valid dimensions.
+  for (i = 0; i < REFS_PER_FRAME; ++i) {
+    RefBuffer *const ref_frame = &cm->frame_refs[i];
+    has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_crop_width,
+                                                ref_frame->buf->y_crop_height,
+                                                width, height);
+  }
+  if (!has_valid_ref_frame)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Referenced frame has invalid size");
+  for (i = 0; i < REFS_PER_FRAME; ++i) {
+    RefBuffer *const ref_frame = &cm->frame_refs[i];
+    if (!valid_ref_frame_img_fmt(
+            ref_frame->buf->bit_depth,
+            ref_frame->buf->subsampling_x,
+            ref_frame->buf->subsampling_y,
+            cm->bit_depth,
+            cm->subsampling_x,
+            cm->subsampling_y))
+      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Referenced frame has incompatible color format");
+  }
+
+  resize_context_buffers(cm, width, height);
+  setup_render_size(cm, rb);
+
+  lock_buffer_pool(pool);
+  realloc_failed = vpx_realloc_frame_buffer(
+      get_frame_new_buffer(cm), cm->width, cm->height,
+      cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+      cm->use_highbitdepth,
+#endif
+      VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
+      &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+      pool->cb_priv);
+  unlock_buffer_pool(pool);  // the only unlock, so it can never run twice
+  if (realloc_failed)
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate frame buffer");
+
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
+  pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
+  pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
+  pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range;
+  pool->frame_bufs[cm->new_fb_idx].buf.render_width  = cm->render_width;
+  pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
+}
+
+// Reads log2 of the tile column and row counts from 'rb'. Columns start at the
+// minimum for cm->mi_cols and grow by one per set bit up to the maximum; rows
+// are coded as 0, 1 or 2 using at most two bits.
+static void setup_tile_info(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
+  int min_log2, max_log2, ones_left;
+
+  vp9_get_tile_n_bits(cm->mi_cols, &min_log2, &max_log2);
+  cm->log2_tile_cols = min_log2;
+  for (ones_left = max_log2 - min_log2;
+       ones_left != 0 && vpx_rb_read_bit(rb); --ones_left)
+    cm->log2_tile_cols++;
+
+  if (cm->log2_tile_cols > 6)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid number of tile columns");
+
+  cm->log2_tile_rows = vpx_rb_read_bit(rb);
+  if (cm->log2_tile_rows) cm->log2_tile_rows += vpx_rb_read_bit(rb);
+}
+
+// Describes the next tile in 'buf' and advances '*data' past it. The last tile
+// takes every byte up to 'data_end'; any other tile is preceded by a 4-byte
+// big-endian length, which is decrypted first when 'decrypt_cb' is set.
+static void get_tile_buffer(const uint8_t *const data_end, int is_last,
+                            struct vpx_internal_error_info *error_info,
+                            const uint8_t **data, vpx_decrypt_cb decrypt_cb,
+                            void *decrypt_state, TileBuffer *buf) {
+  size_t tile_size;
+
+  if (is_last) {
+    tile_size = data_end - *data;
+  } else {
+    uint8_t decrypted[4];
+    const uint8_t *length_bytes = *data;
+
+    if (!read_is_valid(*data, 4, data_end))
+      vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+                         "Truncated packet or corrupt tile length");
+
+    if (decrypt_cb) {
+      decrypt_cb(decrypt_state, *data, decrypted, 4);
+      length_bytes = decrypted;
+    }
+    tile_size = mem_get_be32(length_bytes);
+    *data += 4;
+
+    // The declared length must fit in what remains after the length field.
+    if (tile_size > (size_t)(data_end - *data))
+      vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+                         "Truncated packet or corrupt tile size");
+  }
+
+  buf->data = *data;
+  buf->size = tile_size;
+  *data += tile_size;
+}
+
+// Fills tile_buffers[row][col] for every tile in raster order, advancing
+// through the input; only the bottom-right tile is flagged as the last one.
+static void get_tile_buffers(
+    VP9Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
+    int tile_cols, int tile_rows, TileBuffer (*tile_buffers)[1 << 6]) {
+  int row, col;
+  for (row = 0; row < tile_rows; ++row) {
+    for (col = 0; col < tile_cols; ++col) {
+      TileBuffer *const slot = &tile_buffers[row][col];
+      const int is_last = row + 1 == tile_rows && col + 1 == tile_cols;
+      slot->col = col;
+      get_tile_buffer(data_end, is_last, &pbi->common.error, &data,
+                      pbi->decrypt_cb, pbi->decrypt_state, slot);
+    }
+  }
+}
+
+static const uint8_t *decode_tiles(VP9Decoder *pbi,  // no tile workers used
+                                   const uint8_t *data,
+                                   const uint8_t *data_end) {  // -> reader end
+  VP9_COMMON *const cm = &pbi->common;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  TileBuffer tile_buffers[4][1 << 6];  // indexed [tile_row][tile_col]
+  int tile_row, tile_col;
+  int mi_row, mi_col;
+  TileData *tile_data = NULL;
+
+  if (cm->lf.filter_level && !cm->skip_loop_filter &&
+      pbi->lf_worker.data1 == NULL) {  // first use: set up the LF worker
+    CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
+                    vpx_memalign(32, sizeof(LFWorkerData)));
+    pbi->lf_worker.hook = (VPxWorkerHook)vp9_loop_filter_worker;
+    if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
+      vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                         "Loop filter thread creation failed");
+    }
+  }
+
+  if (cm->lf.filter_level && !cm->skip_loop_filter) {
+    LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+    // Be sure to sync as we might be resuming after a failed frame decode.
+    winterface->sync(&pbi->lf_worker);
+    vp9_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm,
+                               pbi->mb.plane);
+  }
+
+  assert(tile_rows <= 4);  // both counts must fit the local tile_buffers
+  assert(tile_cols <= (1 << 6));
+
+  // Note: this memset assumes above_context[0], [1] and [2]
+  // are allocated as part of the same buffer.
+  memset(cm->above_context, 0,
+         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
+
+  memset(cm->above_seg_context, 0,
+         sizeof(*cm->above_seg_context) * aligned_cols);
+
+  vp9_reset_lfm(cm);
+
+  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
+
+  if (pbi->tile_data == NULL ||
+      (tile_cols * tile_rows) != pbi->total_tiles) {  // tile count changed
+    vpx_free(pbi->tile_data);
+    CHECK_MEM_ERROR(
+        cm,
+        pbi->tile_data,
+        vpx_memalign(32, tile_cols * tile_rows * (sizeof(*pbi->tile_data))));
+    pbi->total_tiles = tile_rows * tile_cols;
+  }
+
+  // Load all tile information into tile_data.
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      const TileBuffer *const buf = &tile_buffers[tile_row][tile_col];
+      tile_data = pbi->tile_data + tile_cols * tile_row + tile_col;
+      tile_data->cm = cm;
+      tile_data->xd = pbi->mb;  // each tile starts from a copy of pbi->mb
+      tile_data->xd.corrupted = 0;
+      tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
+                             NULL : &cm->counts;
+      vp9_zero(tile_data->dqcoeff);
+      vp9_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col);
+      setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
+                          &tile_data->bit_reader, pbi->decrypt_cb,
+                          pbi->decrypt_state);
+      vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
+    }
+  }
+
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    TileInfo tile;
+    vp9_tile_set_row(&tile, cm, tile_row);
+    for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
+         mi_row += MI_BLOCK_SIZE) {
+      for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+        const int col = pbi->inv_tile_order ?  // optionally right-to-left
+                        tile_cols - tile_col - 1 : tile_col;
+        tile_data = pbi->tile_data + tile_cols * tile_row + col;
+        vp9_tile_set_col(&tile, tile_data->cm, col);
+        vp9_zero(tile_data->xd.left_context);
+        vp9_zero(tile_data->xd.left_seg_context);
+        for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
+             mi_col += MI_BLOCK_SIZE) {
+          decode_partition(pbi, &tile_data->xd, mi_row,
+                           mi_col, &tile_data->bit_reader, BLOCK_64X64, 4);
+        }
+        pbi->mb.corrupted |= tile_data->xd.corrupted;
+        if (pbi->mb.corrupted)
+            vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                               "Failed to decode tile data");
+      }
+      // Loopfilter one row.
+      if (cm->lf.filter_level && !cm->skip_loop_filter) {
+        const int lf_start = mi_row - MI_BLOCK_SIZE;
+        LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+
+        // delay the loopfilter by 1 macroblock row.
+        if (lf_start < 0) continue;  // NOTE: also skips the broadcast below
+
+        // decoding has completed: finish up the loop filter in this thread.
+        if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue;  // same skip
+
+        winterface->sync(&pbi->lf_worker);
+        lf_data->start = lf_start;
+        lf_data->stop = mi_row;
+        if (pbi->max_threads > 1) {
+          winterface->launch(&pbi->lf_worker);
+        } else {
+          winterface->execute(&pbi->lf_worker);
+        }
+      }
+      // After loopfiltering, the last 7 row pixels in each superblock row may
+      // still be changed by the longest loopfilter of the next superblock
+      // row.
+      if (pbi->frame_parallel_decode)
+        vp9_frameworker_broadcast(pbi->cur_buf,
+                                  mi_row << MI_BLOCK_SIZE_LOG2);
+    }
+  }
+
+  // Loopfilter remaining rows in the frame.
+  if (cm->lf.filter_level && !cm->skip_loop_filter) {
+    LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+    winterface->sync(&pbi->lf_worker);
+    lf_data->start = lf_data->stop;  // resume where the last pass stopped
+    lf_data->stop = cm->mi_rows;
+    winterface->execute(&pbi->lf_worker);
+  }
+
+  // Get last tile data.
+  tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
+
+  if (pbi->frame_parallel_decode)
+    vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);
+  return vpx_reader_find_end(&tile_data->bit_reader);  // end of the last tile
+}
+
+// Decodes tile buffers buf_start..buf_end of pbi->tile_buffers. Returns 1, or
+// 0 if corrupted. 'tile_data->data_end' is the end of the input on entry; on
+// exit it is the final tile column's bit-reader end, or NULL if not reached.
+static int tile_worker_hook(TileWorkerData *const tile_data,
+                            VP9Decoder *const pbi) {
+  TileInfo *volatile tile = &tile_data->xd.tile;
+  const int final_col = (1 << pbi->common.log2_tile_cols) - 1;
+  const uint8_t *volatile bit_reader_end = NULL;
+  volatile int n = tile_data->buf_start;  // index into pbi->tile_buffers
+  tile_data->error_info.setjmp = 1;
+
+  if (setjmp(tile_data->error_info.jmp)) {  // non-zero: arrived via longjmp
+    tile_data->error_info.setjmp = 0;
+    tile_data->xd.corrupted = 1;
+    tile_data->data_end = NULL;
+    return 0;
+  }
+
+  tile_data->xd.error_info = &tile_data->error_info;
+  tile_data->xd.corrupted = 0;
+
+  do {
+    int mi_row, mi_col;
+    const TileBuffer *const buf = pbi->tile_buffers + n;
+    vp9_zero(tile_data->dqcoeff);
+    vp9_tile_init(tile, &pbi->common, 0, buf->col);  // tile row is always 0
+    setup_token_decoder(buf->data, tile_data->data_end, buf->size,
+                        &tile_data->error_info, &tile_data->bit_reader,
+                        pbi->decrypt_cb, pbi->decrypt_state);
+    vp9_init_macroblockd(&pbi->common, &tile_data->xd, tile_data->dqcoeff);
+
+    for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+         mi_row += MI_BLOCK_SIZE) {
+      vp9_zero(tile_data->xd.left_context);
+      vp9_zero(tile_data->xd.left_seg_context);
+      for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+           mi_col += MI_BLOCK_SIZE) {
+        decode_partition(pbi, &tile_data->xd, mi_row, mi_col,
+                         &tile_data->bit_reader, BLOCK_64X64, 4);
+      }
+    }
+
+    if (buf->col == final_col) {  // only the final column sets the end pointer
+      bit_reader_end = vpx_reader_find_end(&tile_data->bit_reader);
+    }
+  } while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end);
+
+  tile_data->data_end = bit_reader_end;
+  return !tile_data->xd.corrupted;
+}
+
+// Sorts in descending order of size. Sizes are compared rather than
+// subtracted: narrowing the difference to int could report the wrong sign.
+static int compare_tile_buffers(const void *a, const void *b) {
+  const TileBuffer *const lhs = a, *const rhs = b;
+  return (lhs->size < rhs->size) - (lhs->size > rhs->size);
+}
+
+static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,  // uses tile workers
+                                      const uint8_t *data,
+                                      const uint8_t *data_end) {
+  VP9_COMMON *const cm = &pbi->common;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  const uint8_t *bit_reader_end = NULL;
+  const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int num_workers = VPXMIN(pbi->max_threads, tile_cols);
+  int n;
+
+  assert(tile_cols <= (1 << 6));
+  assert(tile_rows == 1);  // this path only handles a single tile row
+  (void)tile_rows;
+
+  if (pbi->num_tile_workers == 0) {  // workers are created on first use only
+    const int num_threads = pbi->max_threads;
+    CHECK_MEM_ERROR(cm, pbi->tile_workers,
+                    vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
+    // Ensure tile data offsets will be properly aligned. This may fail on
+    // platforms without DECLARE_ALIGNED().
+    assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
+    CHECK_MEM_ERROR(cm, pbi->tile_worker_data,
+                    vpx_memalign(32, num_threads *
+                                 sizeof(*pbi->tile_worker_data)));
+    for (n = 0; n < num_threads; ++n) {
+      VPxWorker *const worker = &pbi->tile_workers[n];
+      ++pbi->num_tile_workers;
+
+      winterface->init(worker);
+      if (n < num_threads - 1 && !winterface->reset(worker)) {  // not the last
+        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                           "Tile decoder thread creation failed");
+      }
+    }
+  }
+
+  // Reset tile decoding hook
+  for (n = 0; n < num_workers; ++n) {
+    VPxWorker *const worker = &pbi->tile_workers[n];
+    TileWorkerData *const tile_data = &pbi->tile_worker_data[n];
+    winterface->sync(worker);
+    tile_data->xd = pbi->mb;
+    tile_data->xd.counts =
+        cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts;
+    worker->hook = (VPxWorkerHook)tile_worker_hook;
+    worker->data1 = tile_data;
+    worker->data2 = pbi;
+  }
+
+  // Note: this memset assumes above_context[0], [1] and [2]
+  // are allocated as part of the same buffer.
+  memset(cm->above_context, 0,
+         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
+  memset(cm->above_seg_context, 0,
+         sizeof(*cm->above_seg_context) * aligned_mi_cols);
+
+  vp9_reset_lfm(cm);
+
+  // Load tile data into tile_buffers
+  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows,
+                   &pbi->tile_buffers);
+
+  // Sort the buffers based on size in descending order.
+  qsort(pbi->tile_buffers, tile_cols, sizeof(pbi->tile_buffers[0]),
+        compare_tile_buffers);
+
+  if (num_workers == tile_cols) {
+    // Rearrange the tile buffers such that the largest, and
+    // presumably the most difficult, tile will be decoded in the main thread.
+    // This should help minimize the number of instances where the main thread
+    // is waiting for a worker to complete.
+    const TileBuffer largest = pbi->tile_buffers[0];
+    memmove(pbi->tile_buffers, pbi->tile_buffers + 1,
+            (tile_cols - 1) * sizeof(pbi->tile_buffers[0]));
+    pbi->tile_buffers[tile_cols - 1] = largest;  // rotate largest to the end
+  } else {
+    int start = 0, end = tile_cols - 2;
+    TileBuffer tmp;
+
+    // Interleave the tiles to distribute the load between threads, assuming a
+    // larger tile implies it is more difficult to decode.
+    while (start < end) {
+      tmp = pbi->tile_buffers[start];
+      pbi->tile_buffers[start] = pbi->tile_buffers[end];
+      pbi->tile_buffers[end] = tmp;
+      start += 2;
+      end -= 2;
+    }
+  }
+
+  // Initialize thread frame counts.
+  if (!cm->frame_parallel_decoding_mode) {
+    for (n = 0; n < num_workers; ++n) {
+      TileWorkerData *const tile_data =
+          (TileWorkerData*)pbi->tile_workers[n].data1;
+      vp9_zero(tile_data->counts);
+    }
+  }
+
+  {
+    const int base = tile_cols / num_workers;
+    const int remain = tile_cols % num_workers;
+    int buf_start = 0;
+
+    for (n = 0; n < num_workers; ++n) {
+      const int count = base + (remain + n) / num_workers;  // last ones get +1
+      VPxWorker *const worker = &pbi->tile_workers[n];
+      TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
+
+      tile_data->buf_start = buf_start;
+      tile_data->buf_end = buf_start + count - 1;  // inclusive end index
+      tile_data->data_end = data_end;
+      buf_start += count;
+
+      worker->had_error = 0;
+      if (n == num_workers - 1) {
+        assert(tile_data->buf_end == tile_cols - 1);
+        winterface->execute(worker);
+      } else {
+        winterface->launch(worker);
+      }
+    }
+
+    for (; n > 0; --n) {  // n == num_workers here; workers sync in reverse
+      VPxWorker *const worker = &pbi->tile_workers[n - 1];
+      TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
+      // TODO(jzern): The tile may have specific error data associated with
+      // its vpx_internal_error_info which could be propagated to the main info
+      // in cm. Additionally once the threads have been synced and an error is
+      // detected, there's no point in continuing to decode tiles.
+      pbi->mb.corrupted |= !winterface->sync(worker);
+      if (!bit_reader_end) bit_reader_end = tile_data->data_end;  // first set
+    }
+  }
+
+  // Accumulate thread frame counts.
+  if (!cm->frame_parallel_decoding_mode) {
+    for (n = 0; n < num_workers; ++n) {
+      TileWorkerData *const tile_data =
+          (TileWorkerData*)pbi->tile_workers[n].data1;
+      vp9_accumulate_frame_counts(&cm->counts, &tile_data->counts, 1);
+    }
+  }
+
+  assert(bit_reader_end || pbi->mb.corrupted);
+  return bit_reader_end;  // NULL unless some worker decoded the final column
+}
+
+static void error_handler(void *data) {  // 'data' must be a VP9_COMMON *
+  struct vpx_internal_error_info *const info = &((VP9_COMMON *)data)->error;
+  vpx_internal_error(info, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
+}
+
+// Reads bit depth, colour space, colour range and chroma subsampling from 'rb'
+// into 'cm', raising an error for combinations the profile does not allow.
+static void read_bitdepth_colorspace_sampling(
+    VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
+  const int high_profile = cm->profile >= PROFILE_2;
+  const int odd_profile = cm->profile == PROFILE_1 || cm->profile == PROFILE_3;
+
+  // Profiles 2 and above code a bit choosing 12 or 10 bits; others are 8-bit.
+  cm->bit_depth = !high_profile ? VPX_BITS_8
+                  : vpx_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;
+#if CONFIG_VP9_HIGHBITDEPTH
+  cm->use_highbitdepth = high_profile;
+#endif
+  cm->color_space = vpx_rb_read_literal(rb, 3);
+  if (cm->color_space == VPX_CS_SRGB) {
+    // sRGB implies 4:4:4 sampling; 4:2:2 and 4:4:0 are not allowed with it.
+    cm->color_range = VPX_CR_FULL_RANGE;
+    if (!odd_profile) {
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "4:4:4 color not supported in profile 0 or 2");
+    } else {
+      cm->subsampling_y = cm->subsampling_x = 0;
+      if (vpx_rb_read_bit(rb))
+        vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Reserved bit set");
+    }
+    return;
+  }
+
+  cm->color_range = (vpx_color_range_t)vpx_rb_read_bit(rb);
+  if (!odd_profile) {
+    cm->subsampling_y = cm->subsampling_x = 1;
+    return;
+  }
+  cm->subsampling_x = vpx_rb_read_bit(rb);
+  cm->subsampling_y = vpx_rb_read_bit(rb);
+  if (cm->subsampling_x == 1 && cm->subsampling_y == 1)
+    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                       "4:2:0 color not supported in profile 1 or 3");
+  if (vpx_rb_read_bit(rb))
+    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                       "Reserved bit set");
+}
+// Parses the uncompressed frame header from rb into pbi and pbi->common.
+static size_t read_uncompressed_header(VP9Decoder *pbi,
+                                       struct vpx_read_bit_buffer *rb) {
+  VP9_COMMON *const cm = &pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = pool->frame_bufs;
+  int i, mask, ref_index = 0;
+  size_t sz;  // return value: the 16-bit literal read at the end of the header
+  // Save the previous frame's type and intra-only flag before overwriting them.
+  cm->last_frame_type = cm->frame_type;
+  cm->last_intra_only = cm->intra_only;
+
+  if (vpx_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Invalid frame marker");
+
+  cm->profile = vp9_read_profile(rb);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->profile >= MAX_PROFILES)
+    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                       "Unsupported bitstream profile");
+#else
+  if (cm->profile >= PROFILE_2)  // this build configuration refuses PROFILE_2+
+    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                       "Unsupported bitstream profile");
+#endif
+
+  cm->show_existing_frame = vpx_rb_read_bit(rb);
+  if (cm->show_existing_frame) {
+    // Show an existing frame directly; a 3-bit literal indexes ref_frame_map.
+    const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)];
+    lock_buffer_pool(pool);
+    if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+      unlock_buffer_pool(pool);  // drop the lock before reporting the error
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Buffer %d does not contain a decoded frame",
+                         frame_to_show);
+    }
+    // NOTE(review): presumably vpx_internal_error() does not return - confirm.
+    ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+    unlock_buffer_pool(pool);
+    pbi->refresh_frame_flags = 0;
+    cm->lf.filter_level = 0;
+    cm->show_frame = 1;
+
+    if (pbi->frame_parallel_decode) {
+      for (i = 0; i < REF_FRAMES; ++i)
+        cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
+    }
+    return 0;  // vp9_decode_frame() treats 0 as "frame shown directly"
+  }
+
+  cm->frame_type = (FRAME_TYPE) vpx_rb_read_bit(rb);
+  cm->show_frame = vpx_rb_read_bit(rb);
+  cm->error_resilient_mode = vpx_rb_read_bit(rb);
+
+  if (cm->frame_type == KEY_FRAME) {
+    if (!vp9_read_sync_code(rb))
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Invalid frame sync code");
+
+    read_bitdepth_colorspace_sampling(cm, rb);
+    pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;  // low REF_FRAMES bits set
+
+    for (i = 0; i < REFS_PER_FRAME; ++i) {
+      cm->frame_refs[i].idx = INVALID_IDX;
+      cm->frame_refs[i].buf = NULL;
+    }
+
+    setup_frame_size(cm, rb);
+    if (pbi->need_resync) {
+      memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));  // 0xFF fill
+      pbi->need_resync = 0;
+    }
+  } else {
+    cm->intra_only = cm->show_frame ? 0 : vpx_rb_read_bit(rb);
+
+    cm->reset_frame_context = cm->error_resilient_mode ?
+        0 : vpx_rb_read_literal(rb, 2);
+
+    if (cm->intra_only) {
+      if (!vp9_read_sync_code(rb))
+        vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Invalid frame sync code");
+      if (cm->profile > PROFILE_0) {
+        read_bitdepth_colorspace_sampling(cm, rb);
+      } else {
+        // NOTE: The intra-only frame header does not include the specification
+        // of either the color format or color sub-sampling in profile 0. VP9
+        // specifies that the default color format should be YUV 4:2:0 in this
+        // case (normative).
+        cm->color_space = VPX_CS_BT_601;
+        cm->color_range = VPX_CR_STUDIO_RANGE;
+        cm->subsampling_y = cm->subsampling_x = 1;
+        cm->bit_depth = VPX_BITS_8;
+#if CONFIG_VP9_HIGHBITDEPTH
+        cm->use_highbitdepth = 0;
+#endif
+      }
+
+      pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
+      setup_frame_size(cm, rb);
+      if (pbi->need_resync) {
+        memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+        pbi->need_resync = 0;
+      }
+    } else if (pbi->need_resync != 1) {  /* Skip if need resync */
+      pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
+      for (i = 0; i < REFS_PER_FRAME; ++i) {
+        const int ref = vpx_rb_read_literal(rb, REF_FRAMES_LOG2);
+        const int idx = cm->ref_frame_map[ref];
+        RefBuffer *const ref_frame = &cm->frame_refs[i];
+        ref_frame->idx = idx;
+        ref_frame->buf = &frame_bufs[idx].buf;  // NOTE(review): idx may be < 0?
+        cm->ref_frame_sign_bias[LAST_FRAME + i] = vpx_rb_read_bit(rb);
+      }
+
+      setup_frame_size_with_refs(cm, rb);
+
+      cm->allow_high_precision_mv = vpx_rb_read_bit(rb);
+      cm->interp_filter = read_interp_filter(rb);
+
+      for (i = 0; i < REFS_PER_FRAME; ++i) {
+        RefBuffer *const ref_buf = &cm->frame_refs[i];
+#if CONFIG_VP9_HIGHBITDEPTH
+        vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+                                          ref_buf->buf->y_crop_width,
+                                          ref_buf->buf->y_crop_height,
+                                          cm->width, cm->height,
+                                          cm->use_highbitdepth);
+#else
+        vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+                                          ref_buf->buf->y_crop_width,
+                                          ref_buf->buf->y_crop_height,
+                                          cm->width, cm->height);
+#endif
+      }
+    }
+  }
+#if CONFIG_VP9_HIGHBITDEPTH
+  get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
+#endif
+  get_frame_new_buffer(cm)->color_space = cm->color_space;
+  get_frame_new_buffer(cm)->color_range = cm->color_range;
+  get_frame_new_buffer(cm)->render_width  = cm->render_width;
+  get_frame_new_buffer(cm)->render_height = cm->render_height;
+
+  if (pbi->need_resync) {
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Keyframe / intra-only frame required to reset decoder"
+                       " state");
+  }
+
+  if (!cm->error_resilient_mode) {
+    cm->refresh_frame_context = vpx_rb_read_bit(rb);
+    cm->frame_parallel_decoding_mode = vpx_rb_read_bit(rb);
+    if (!cm->frame_parallel_decoding_mode)
+      vp9_zero(cm->counts);
+  } else {
+    cm->refresh_frame_context = 0;
+    cm->frame_parallel_decoding_mode = 1;
+  }
+
+  // This flag will be overridden by the call to vp9_setup_past_independence
+  // below, forcing the use of context 0 for those frame types.
+  cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
+
+  // Generate next_ref_frame_map.
+  lock_buffer_pool(pool);
+  for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {  // bit n = slot n
+    if (mask & 1) {
+      cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+      ++frame_bufs[cm->new_fb_idx].ref_count;
+    } else {
+      cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+    }
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0)
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+    ++ref_index;
+  }
+  // Slots above the highest set refresh bit keep their current mapping.
+  for (; ref_index < REF_FRAMES; ++ref_index) {
+    cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0)
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+  }
+  unlock_buffer_pool(pool);
+  pbi->hold_ref_buf = 1;  // set after the ref_count increments above
+
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode)
+    vp9_setup_past_independence(cm);
+
+  setup_loopfilter(&cm->lf, rb);
+  setup_quantization(cm, &pbi->mb, rb);
+  setup_segmentation(&cm->seg, rb);
+  setup_segmentation_dequant(cm);
+
+  setup_tile_info(cm, rb);
+  sz = vpx_rb_read_literal(rb, 16);
+
+  if (sz == 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid header size");
+
+  return sz;
+}
+// Reads the compressed header from data[0, partition_size) into cm and cm->fc.
+static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data,
+                                  size_t partition_size) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  FRAME_CONTEXT *const fc = cm->fc;
+  vpx_reader r;
+  int k;
+  // Set up the reader over the partition with pbi's decrypt callback and state.
+  if (vpx_reader_init(&r, data, partition_size, pbi->decrypt_cb,
+                      pbi->decrypt_state))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate bool decoder 0");
+
+  cm->tx_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(&r);  // lossless: 4x4
+  if (cm->tx_mode == TX_MODE_SELECT)
+    read_tx_mode_probs(&fc->tx_probs, &r);
+  read_coef_probs(fc, cm->tx_mode, &r);
+
+  for (k = 0; k < SKIP_CONTEXTS; ++k)
+    vp9_diff_update_prob(&r, &fc->skip_probs[k]);
+  // The remaining reads happen only when frame_is_intra_only(cm) is false.
+  if (!frame_is_intra_only(cm)) {
+    nmv_context *const nmvc = &fc->nmvc;
+    int i, j;
+
+    read_inter_mode_probs(fc, &r);
+
+    if (cm->interp_filter == SWITCHABLE)
+      read_switchable_interp_probs(fc, &r);
+
+    for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+      vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]);
+
+    cm->reference_mode = read_frame_reference_mode(cm, &r);
+    if (cm->reference_mode != SINGLE_REFERENCE)
+      setup_compound_reference_mode(cm);
+    read_frame_reference_mode_probs(cm, &r);
+
+    for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
+      for (i = 0; i < INTRA_MODES - 1; ++i)
+        vp9_diff_update_prob(&r, &fc->y_mode_prob[j][i]);
+
+    for (j = 0; j < PARTITION_CONTEXTS; ++j)
+      for (i = 0; i < PARTITION_TYPES - 1; ++i)
+        vp9_diff_update_prob(&r, &fc->partition_prob[j][i]);
+
+    read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
+  }
+
+  return vpx_reader_has_error(&r);  // caller stores this in new_fb->corrupted
+}
+
+// Initializes rb over the header bytes (decrypted into clear_data if needed).
+static struct vpx_read_bit_buffer *init_read_bit_buffer(
+    VP9Decoder *pbi, struct vpx_read_bit_buffer *rb, const uint8_t *data,
+    const uint8_t *data_end, uint8_t clear_data[MAX_VP9_HEADER_SIZE]) {
+  const uint8_t *first = data;
+  const uint8_t *last = data_end;
+  rb->bit_offset = 0;
+  rb->error_handler = error_handler;
+  rb->error_handler_data = &pbi->common;
+  if (pbi->decrypt_cb) {
+    // Decrypt at most MAX_VP9_HEADER_SIZE bytes into the caller's scratch area.
+    const int n = (int)VPXMIN(MAX_VP9_HEADER_SIZE, data_end - data);
+    pbi->decrypt_cb(pbi->decrypt_state, data, clear_data, n);
+    first = clear_data;
+    last = clear_data + n;
+  }
+  rb->bit_buffer = first;
+  rb->bit_buffer_end = last;
+  return rb;  // returned so the call can be nested in another call's arguments
+}
+
+//------------------------------------------------------------------------------
+
+int vp9_read_sync_code(struct vpx_read_bit_buffer *const rb) {
+  if (vpx_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0) return 0;  // stop early
+  if (vpx_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1) return 0;
+  return vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2;  // 1 iff all 3 match
+}
+
+void vp9_read_frame_size(struct vpx_read_bit_buffer *rb, int *width,
+                         int *height) {
+  *width = 1 + vpx_rb_read_literal(rb, 16);   // 16-bit field holds width - 1
+  *height = 1 + vpx_rb_read_literal(rb, 16);  // 16-bit field holds height - 1
+}
+
+BITSTREAM_PROFILE vp9_read_profile(struct vpx_read_bit_buffer *rb) {
+  const int bit0 = vpx_rb_read_bit(rb);  // least significant bit is read first
+  const int bit1 = vpx_rb_read_bit(rb);
+  const int base = bit0 | (bit1 << 1);
+  // Only a base value of 3 is followed by one more bit, which is added to it.
+  return (BITSTREAM_PROFILE)(base > 2 ? base + vpx_rb_read_bit(rb) : base);
+}
+// Decodes one frame from [data, data_end); sets *p_data_end to an end pointer.
+void vp9_decode_frame(VP9Decoder *pbi,
+                      const uint8_t *data, const uint8_t *data_end,
+                      const uint8_t **p_data_end) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  struct vpx_read_bit_buffer rb;
+  int context_updated = 0;
+  uint8_t clear_data[MAX_VP9_HEADER_SIZE];  // scratch for a decrypted header
+  const size_t first_partition_size = read_uncompressed_header(pbi,
+      init_read_bit_buffer(pbi, &rb, data, data_end, clear_data));
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
+  xd->cur_buf = new_fb;
+
+  if (!first_partition_size) {
+    // showing a frame directly: read_uncompressed_header() returned 0
+    *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2);
+    return;
+  }
+
+  data += vpx_rb_bytes_read(&rb);  // step past the uncompressed header bytes
+  if (!read_is_valid(data, first_partition_size, data_end))
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt header length");
+
+  cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+                           cm->width == cm->last_width &&
+                           cm->height == cm->last_height &&
+                           !cm->last_intra_only &&
+                           cm->last_show_frame &&
+                           (cm->last_frame_type != KEY_FRAME);
+
+  vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
+
+  *cm->fc = cm->frame_contexts[cm->frame_context_idx];  // start from saved ctx
+  if (!cm->fc->initialized)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Uninitialized entropy context.");
+
+  xd->corrupted = 0;
+  new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
+  if (new_fb->corrupted)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Decode failed. Frame data header is corrupted.");
+
+  if (cm->lf.filter_level && !cm->skip_loop_filter) {
+    vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
+  }
+
+  // If encoded in frame parallel mode, frame context is ready after decoding
+  // the frame header.
+  if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) {
+    VPxWorker *const worker = pbi->frame_worker_owner;
+    FrameWorkerData *const frame_worker_data = worker->data1;
+    if (cm->refresh_frame_context) {
+      context_updated = 1;  // suppresses the second save at the end
+      cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+    }
+    vp9_frameworker_lock_stats(worker);
+    pbi->cur_buf->row = -1;
+    pbi->cur_buf->col = -1;
+    frame_worker_data->frame_context_ready = 1;
+    // Signal the main thread that context is ready.
+    vp9_frameworker_signal_stats(worker);
+    vp9_frameworker_unlock_stats(worker);
+  }
+
+  if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
+    // Multi-threaded tile decoder
+    *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
+    if (!xd->corrupted) {
+      if (!cm->skip_loop_filter) {
+        // If multiple threads are used to decode tiles, then we use those
+        // threads to do parallel loopfiltering.
+        vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane,
+                                 cm->lf.filter_level, 0, 0, pbi->tile_workers,
+                                 pbi->num_tile_workers, &pbi->lf_row_sync);
+      }
+    } else {
+      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Decode failed. Frame data is corrupted.");
+    }
+  } else {
+    *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
+  }
+  // Adaptation calls are skipped in error-resilient or frame-parallel mode.
+  if (!xd->corrupted) {
+    if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
+      vp9_adapt_coef_probs(cm);
+
+      if (!frame_is_intra_only(cm)) {
+        vp9_adapt_mode_probs(cm);
+        vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv);
+      }
+    }
+  } else {
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Decode failed. Frame data is corrupted.");
+  }
+
+  // Non frame parallel update frame context here.
+  if (cm->refresh_frame_context && !context_updated)
+    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+}
diff --git a/libs/libvpx/vp9/decoder/vp9_decodeframe.h b/libs/libvpx/vp9/decoder/vp9_decodeframe.h
new file mode 100644
index 0000000000..ce33cbdbd9
--- /dev/null
+++ b/libs/libvpx/vp9/decoder/vp9_decodeframe.h
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_DECODER_VP9_DECODEFRAME_H_
+#define VP9_DECODER_VP9_DECODEFRAME_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp9/common/vp9_enums.h"
+
+struct VP9Decoder;
+struct vpx_read_bit_buffer;
+
+int vp9_read_sync_code(struct vpx_read_bit_buffer *const rb);
+void vp9_read_frame_size(struct vpx_read_bit_buffer *rb,
+                         int *width, int *height);
+BITSTREAM_PROFILE vp9_read_profile(struct vpx_read_bit_buffer *rb);
+
+void vp9_decode_frame(struct VP9Decoder *pbi,
+                      const uint8_t *data, const uint8_t *data_end,
+                      const uint8_t **p_data_end);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_DECODER_VP9_DECODEFRAME_H_
diff --git a/libs/libvpx/vp9/decoder/vp9_decodemv.c b/libs/libvpx/vp9/decoder/vp9_decodemv.c
new file mode 100644
index 0000000000..86044207c2
--- /dev/null
+++ b/libs/libvpx/vp9/decoder/vp9_decodemv.c
@@ -0,0 +1,885 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/decoder/vp9_decodemv.h"
+#include "vp9/decoder/vp9_decodeframe.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+// Reads a symbol from r with vp9_intra_mode_tree and probabilities p.
+static PREDICTION_MODE read_intra_mode(vpx_reader *r, const vpx_prob *p) {
+  return (PREDICTION_MODE)vpx_read_tree(r, vp9_intra_mode_tree, p);
+}
+
+// Reads a luma intra mode with the size_group probabilities and, when frame
+// counts are attached to xd, bumps the matching y_mode counter.
+static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                         vpx_reader *r, int size_group) {
+  const vpx_prob *const probs = cm->fc->y_mode_prob[size_group];
+  const PREDICTION_MODE mode = read_intra_mode(r, probs);
+  if (xd->counts) ++xd->counts->y_mode[size_group][mode];
+  return mode;
+}
+
+// Reads a chroma intra mode conditioned on y_mode and, when frame counts are
+// attached to xd, bumps the matching uv_mode counter.
+static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                          vpx_reader *r,
+                                          PREDICTION_MODE y_mode) {
+  const vpx_prob *const probs = cm->fc->uv_mode_prob[y_mode];
+  const PREDICTION_MODE mode = read_intra_mode(r, probs);
+  if (xd->counts) ++xd->counts->uv_mode[y_mode][mode];
+  return mode;
+}
+
+// Reads an inter mode tree index for context ctx, counts it when frame counts
+// are attached to xd, and returns it as an offset from NEARESTMV.
+static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                       vpx_reader *r, int ctx) {
+  const vpx_prob *const probs = cm->fc->inter_mode_probs[ctx];
+  const int offset = vpx_read_tree(r, vp9_inter_mode_tree, probs);
+  FRAME_COUNTS *const frame_counts = xd->counts;
+  if (frame_counts) ++frame_counts->inter_mode[ctx][offset];
+  return NEARESTMV + offset;
+}
+// Reads a segment id from r using vp9_segment_tree and seg->tree_probs.
+static int read_segment_id(vpx_reader *r, const struct segmentation *seg) {
+  return vpx_read_tree(r, vp9_segment_tree, seg->tree_probs);
+}
+// Reads a transform size from r, one bit per step, bounded by max_tx_size.
+static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                     TX_SIZE max_tx_size, vpx_reader *r) {
+  FRAME_COUNTS *counts = xd->counts;
+  const int ctx = get_tx_size_context(xd);
+  const vpx_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
+  int tx_size = vpx_read(r, tx_probs[0]);
+  if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
+    tx_size += vpx_read(r, tx_probs[1]);  // second step, if the size cap allows
+    if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
+      tx_size += vpx_read(r, tx_probs[2]);  // third step
+  }
+  // Count the result only when frame counts are attached to xd.
+  if (counts)
+    ++get_tx_counts(max_tx_size, ctx, &counts->tx)[tx_size];
+  return (TX_SIZE)tx_size;
+}
+
+// Reads the tx size when selection applies, else min(block max, tx_mode max).
+static INLINE TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                   int allow_select, vpx_reader *r) {
+  const TX_MODE mode = cm->tx_mode;
+  const BLOCK_SIZE block = xd->mi[0]->sb_type;
+  const TX_SIZE largest = max_txsize_lookup[block];
+  const int sel = allow_select && mode == TX_MODE_SELECT && block >= BLOCK_8X8;
+  return sel ? read_selected_tx_size(cm, xd, largest, r)
+             : VPXMIN(largest, tx_mode_to_biggest_tx_size[mode]);
+}
+
+// Returns the smallest id in an x_mis-by-y_mis area of segment_ids at mi_offset.
+static int dec_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
+                              int mi_offset, int x_mis, int y_mis) {
+  int row, col, segment_id = INT_MAX;
+  for (row = 0; row < y_mis; ++row) {
+    const int base = mi_offset + row * cm->mi_cols;  // rows are mi_cols apart
+    for (col = 0; col < x_mis; ++col)
+      segment_id = VPXMIN(segment_id, segment_ids[base + col]);
+  }
+  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+  return segment_id;
+}
+// Stores segment_id over an x_mis-by-y_mis area of cm->current_frame_seg_map.
+static void set_segment_id(VP9_COMMON *cm, int mi_offset,
+                           int x_mis, int y_mis, int segment_id) {
+  int row, col;
+  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+  for (row = 0; row < y_mis; ++row) {
+    const int base = mi_offset + row * cm->mi_cols;  // rows are mi_cols apart
+    for (col = 0; col < x_mis; ++col)
+      cm->current_frame_seg_map[base + col] = segment_id;
+  }
+}
+// Copies an x_mis-by-y_mis area from last to current ids (0 if last is NULL).
+static void copy_segment_id(const VP9_COMMON *cm,
+                           const uint8_t *last_segment_ids,
+                           uint8_t *current_segment_ids,
+                           int mi_offset, int x_mis, int y_mis) {
+  int row, col;
+  for (row = 0; row < y_mis; ++row)
+    for (col = 0; col < x_mis; ++col) {
+      const int idx = mi_offset + row * cm->mi_cols + col;
+      current_segment_ids[idx] = last_segment_ids ? last_segment_ids[idx] : 0;
+    }
+}
+
+// Segment id of a block: 0 if segmentation is off; 0 after copying the last
+// map if the map is not updated; otherwise read from r and stored in the map.
+static int read_intra_segment_id(VP9_COMMON *const cm, int mi_offset,
+                                 int x_mis, int y_mis,
+                                 vpx_reader *r) {
+  struct segmentation *const seg = &cm->seg;
+  int segment_id = 0;  // Default for disabled segmentation
+
+  if (seg->enabled) {
+    if (seg->update_map) {
+      segment_id = read_segment_id(r, seg);
+      set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+    } else {
+      copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+                      mi_offset, x_mis, y_mis);
+    }
+  }
+  return segment_id;
+}
+// Returns the block's segment id, predicted from the last frame's segment map.
+static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+                                 int mi_row, int mi_col, vpx_reader *r) {
+  struct segmentation *const seg = &cm->seg;
+  MODE_INFO *const mi = xd->mi[0];
+  int predicted_segment_id, segment_id;
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = xd->plane[0].n4_w >> 1;
+  const int bh = xd->plane[0].n4_h >> 1;
+  // x_mis/y_mis: bw/bh limited to the columns/rows left from mi_col/mi_row.
+  // TODO(slavarnway): move x_mis, y_mis into xd ?????
+  const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
+
+  if (!seg->enabled)
+    return 0;  // Default for disabled segmentation
+  // Prediction: smallest id in the same area of the last frame's map, or 0.
+  predicted_segment_id = cm->last_frame_seg_map ?
+      dec_get_segment_id(cm, cm->last_frame_seg_map, mi_offset, x_mis, y_mis) :
+      0;
+
+  if (!seg->update_map) {
+    copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+                    mi_offset, x_mis, y_mis);
+    return predicted_segment_id;
+  }
+
+  if (seg->temporal_update) {
+    const vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
+    mi->seg_id_predicted = vpx_read(r, pred_prob);  // 1: keep the prediction
+    segment_id = mi->seg_id_predicted ? predicted_segment_id
+                                      : read_segment_id(r, seg);
+  } else {
+    segment_id = read_segment_id(r, seg);
+  }
+  set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+  return segment_id;
+}
+
+// Returns the skip flag: 1 when the segment has SEG_LVL_SKIP active, otherwise
+// a bit read from r (counted when frame counts are attached to xd).
+static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
+                     int segment_id, vpx_reader *r) {
+  int ctx, skip;
+  FRAME_COUNTS *const frame_counts = xd->counts;
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP))
+    return 1;
+  ctx = vp9_get_skip_context(xd);
+  skip = vpx_read(r, cm->fc->skip_probs[ctx]);
+  if (frame_counts) ++frame_counts->skip[ctx][skip];
+  return skip;
+}
+// Fills mi = xd->mi[0] for an intra block: segment id, skip, tx size and modes.
+static void read_intra_frame_mode_info(VP9_COMMON *const cm,
+                                       MACROBLOCKD *const xd,
+                                       int mi_row, int mi_col, vpx_reader *r) {
+  MODE_INFO *const mi = xd->mi[0];
+  const MODE_INFO *above_mi = xd->above_mi;
+  const MODE_INFO *left_mi  = xd->left_mi;
+  const BLOCK_SIZE bsize = mi->sb_type;
+  int i;
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = xd->plane[0].n4_w >> 1;
+  const int bh = xd->plane[0].n4_h >> 1;
+  // x_mis/y_mis: bw/bh limited to the columns/rows left from mi_col/mi_row.
+  // TODO(slavarnway): move x_mis, y_mis into xd ?????
+  const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
+
+  mi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r);
+  mi->skip = read_skip(cm, xd, mi->segment_id, r);
+  mi->tx_size = read_tx_size(cm, xd, 1, r);
+  mi->ref_frame[0] = INTRA_FRAME;
+  mi->ref_frame[1] = NONE;
+  // Sub-8x8 sizes store a mode per bmi[] entry; mi->mode always equals bmi[3].
+  switch (bsize) {
+    case BLOCK_4X4:
+      for (i = 0; i < 4; ++i)
+        mi->bmi[i].as_mode =
+            read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, i));
+      mi->mode = mi->bmi[3].as_mode;
+      break;
+    case BLOCK_4X8:  // two reads: bmi[0]/bmi[2] share one, bmi[1]/bmi[3] another
+      mi->bmi[0].as_mode = mi->bmi[2].as_mode =
+          read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0));
+      mi->bmi[1].as_mode = mi->bmi[3].as_mode = mi->mode =
+          read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 1));
+      break;
+    case BLOCK_8X4:  // two reads: bmi[0]/bmi[1] share one, bmi[2]/bmi[3] another
+      mi->bmi[0].as_mode = mi->bmi[1].as_mode =
+          read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0));
+      mi->bmi[2].as_mode = mi->bmi[3].as_mode = mi->mode =
+          read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 2));
+      break;
+    default:
+      mi->mode = read_intra_mode(r,
+                                 get_y_mode_probs(mi, above_mi, left_mi, 0));
+  }
+  // The chroma mode uses the vp9_kf_uv_mode_prob row selected by mi->mode.
+  mi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mi->mode]);
+}
+// Reads one signed motion vector difference component from r using mvcomp.
+static int read_mv_component(vpx_reader *r,
+                             const nmv_component *mvcomp, int usehp) {
+  int mag, d, fr, hp;
+  const int sign = vpx_read(r, mvcomp->sign);  // nonzero: result is negated
+  const int mv_class = vpx_read_tree(r, vp9_mv_class_tree, mvcomp->classes);
+  const int class0 = mv_class == MV_CLASS_0;
+
+  // Integer part
+  if (class0) {
+    d = vpx_read_tree(r, vp9_mv_class0_tree, mvcomp->class0);
+    mag = 0;
+  } else {
+    int i;
+    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
+
+    d = 0;
+    for (i = 0; i < n; ++i)
+      d |= vpx_read(r, mvcomp->bits[i]) << i;  // bit i is read i-th (LSB first)
+    mag = CLASS0_SIZE << (mv_class + 2);  // starting magnitude for this class
+  }
+
+  // Fractional part
+  fr = vpx_read_tree(r, vp9_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
+                                               : mvcomp->fp);
+
+  // High precision part (if hp is not used, the default value of the hp is 1)
+  hp = usehp ? vpx_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
+             : 1;
+
+  // Result
+  mag += ((d << 3) | (fr << 1) | hp) + 1;
+  return sign ? -mag : mag;
+}
+// Reads an mv difference from r and stores ref plus that difference in *mv.
+static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref,
+                           const nmv_context *ctx,
+                           nmv_context_counts *counts, int allow_hp) {
+  const MV_JOINT_TYPE joint_type =
+      (MV_JOINT_TYPE)vpx_read_tree(r, vp9_mv_joint_tree, ctx->joints);
+  const int use_hp = allow_hp && use_mv_hp(ref);
+  MV diff = {0, 0};
+  // comps[0] codes the row difference, comps[1] the column difference.
+  if (mv_joint_vertical(joint_type))
+    diff.row = read_mv_component(r, &ctx->comps[0], use_hp);
+
+  if (mv_joint_horizontal(joint_type))
+    diff.col = read_mv_component(r, &ctx->comps[1], use_hp);
+
+  vp9_inc_mv(&diff, counts);  // assign_mv() passes NULL when xd->counts is NULL
+
+  mv->row = ref->row + diff.row;
+  mv->col = ref->col + diff.col;
+}
+
+// Returns the block's reference mode: read from r (and counted) when the frame
+// uses REFERENCE_MODE_SELECT, otherwise the frame-level cm->reference_mode.
+static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
+                                                const MACROBLOCKD *xd,
+                                                vpx_reader *r) {
+  int ctx;
+  REFERENCE_MODE mode;
+  FRAME_COUNTS *const frame_counts = xd->counts;
+  if (cm->reference_mode != REFERENCE_MODE_SELECT)
+    return cm->reference_mode;
+  ctx = vp9_get_reference_mode_context(cm, xd);
+  mode = (REFERENCE_MODE)vpx_read(r, cm->fc->comp_inter_prob[ctx]);
+  if (frame_counts) ++frame_counts->comp_inter[ctx][mode];
+  return mode;  // SINGLE_REFERENCE or COMPOUND_REFERENCE
+}
+
+// Reads the reference frame(s) of a block into ref_frame[0] and ref_frame[1].
+static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+                            vpx_reader *r,
+                            int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
+  FRAME_CONTEXT *const fc = cm->fc;
+  FRAME_COUNTS *counts = xd->counts;
+  // An active segment reference-frame feature replaces the coded choice.
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
+                                                   SEG_LVL_REF_FRAME);
+    ref_frame[1] = NONE;
+  } else {
+    const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
+    // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
+    if (mode == COMPOUND_REFERENCE) {
+      const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+      const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
+      const int bit = vpx_read(r, fc->comp_ref_prob[ctx]);
+      if (counts)
+        ++counts->comp_ref[ctx][bit];
+      ref_frame[idx] = cm->comp_fixed_ref;  // fixed ref goes in slot idx
+      ref_frame[!idx] = cm->comp_var_ref[bit];  // coded bit picks the other ref
+    } else if (mode == SINGLE_REFERENCE) {
+      const int ctx0 = vp9_get_pred_context_single_ref_p1(xd);
+      const int bit0 = vpx_read(r, fc->single_ref_prob[ctx0][0]);
+      if (counts)
+        ++counts->single_ref[ctx0][0][bit0];
+      if (bit0) {  // a second bit chooses between GOLDEN_FRAME and ALTREF_FRAME
+        const int ctx1 = vp9_get_pred_context_single_ref_p2(xd);
+        const int bit1 = vpx_read(r, fc->single_ref_prob[ctx1][1]);
+        if (counts)
+          ++counts->single_ref[ctx1][1][bit1];
+        ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
+      } else {
+        ref_frame[0] = LAST_FRAME;
+      }
+
+      ref_frame[1] = NONE;
+    } else {
+      assert(0 && "Invalid prediction mode.");
+    }
+  }
+}
+
+
+// Reads a switchable interpolation filter type for the block's context and
+// counts it when frame counts are attached to xd.
+static INLINE INTERP_FILTER read_switchable_interp_filter(
+    VP9_COMMON *const cm, MACROBLOCKD *const xd, vpx_reader *r) {
+  const int ctx = vp9_get_pred_context_switchable_interp(xd);
+  const vpx_prob *const probs = cm->fc->switchable_interp_prob[ctx];
+  const int symbol = vpx_read_tree(r, vp9_switchable_interp_tree, probs);
+  const INTERP_FILTER filter = (INTERP_FILTER)symbol;
+  if (xd->counts)
+    ++xd->counts->switchable_interp[ctx][filter];
+  return filter;
+}
+// Sets mi's reference frames to intra and reads its luma and chroma modes.
+static void read_intra_block_mode_info(VP9_COMMON *const cm,
+                                       MACROBLOCKD *const xd, MODE_INFO *mi,
+                                       vpx_reader *r) {
+  const BLOCK_SIZE bsize = mi->sb_type;
+  int i;
+
+  mi->ref_frame[0] = INTRA_FRAME;
+  mi->ref_frame[1] = NONE;
+  // Sub-8x8 sizes read modes with size group 0; mi->mode always equals bmi[3].
+  switch (bsize) {
+    case BLOCK_4X4:
+      for (i = 0; i < 4; ++i)
+        mi->bmi[i].as_mode = read_intra_mode_y(cm, xd, r, 0);
+      mi->mode = mi->bmi[3].as_mode;
+      break;
+    case BLOCK_4X8:  // two reads: bmi[0]/bmi[2] share one, bmi[1]/bmi[3] another
+      mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, xd,
+                                                                  r, 0);
+      mi->bmi[1].as_mode = mi->bmi[3].as_mode = mi->mode =
+          read_intra_mode_y(cm, xd, r, 0);
+      break;
+    case BLOCK_8X4:  // two reads: bmi[0]/bmi[1] share one, bmi[2]/bmi[3] another
+      mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, xd,
+                                                                  r, 0);
+      mi->bmi[2].as_mode = mi->bmi[3].as_mode = mi->mode =
+          read_intra_mode_y(cm, xd, r, 0);
+      break;
+    default:
+      mi->mode = read_intra_mode_y(cm, xd, r, size_group_lookup[bsize]);
+  }
+
+  mi->uv_mode = read_intra_mode_uv(cm, xd, r, mi->mode);
+}
+// Nonzero iff both row and col lie strictly between MV_LOW and MV_UPP.
+static INLINE int is_mv_valid(const MV *mv) {
+  return mv->row > MV_LOW && mv->row < MV_UPP &&
+         mv->col > MV_LOW && mv->col < MV_UPP;
+}
+// Copies two consecutive int_mv entries from src to dst.
+static INLINE void copy_mv_pair(int_mv *dst, const int_mv *src) {
+  memcpy(dst, src, sizeof(*dst) * 2);
+}
+// Zeroes two consecutive int_mv entries starting at dst.
+static INLINE void zero_mv_pair(int_mv *dst) {
+  memset(dst, 0, sizeof(*dst) * 2);
+}
+// Fills mv[] for the given mode; returns 0 on an invalid NEWMV or unknown mode.
+static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd,
+                            PREDICTION_MODE mode,
+                            int_mv mv[2], int_mv ref_mv[2],
+                            int_mv near_nearest_mv[2],
+                            int is_compound, int allow_hp, vpx_reader *r) {
+  int i;
+  int ret = 1;  // stays 1 unless a NEWMV vector fails is_mv_valid()
+
+  switch (mode) {
+    case NEWMV: {
+      FRAME_COUNTS *counts = xd->counts;
+      nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+      for (i = 0; i < 1 + is_compound; ++i) {  // one vector, or two if compound
+        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc->nmvc, mv_counts,
+                allow_hp);
+        ret = ret && is_mv_valid(&mv[i].as_mv);
+      }
+      break;
+    }
+    case NEARMV:
+    case NEARESTMV: {
+      copy_mv_pair(mv, near_nearest_mv);  // always copies both entries
+      break;
+    }
+    case ZEROMV: {
+      zero_mv_pair(mv);  // always clears both entries
+      break;
+    }
+    default: {
+      return 0;
+    }
+  }
+  return ret;
+}
+
+// Returns nonzero for an inter block: taken from the segment's reference-frame
+// feature when active, otherwise read from r (and counted when counts exist).
+static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+                               int segment_id, vpx_reader *r) {
+  int ctx, is_inter;
+  FRAME_COUNTS *const frame_counts = xd->counts;
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME))
+    return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
+  ctx = vp9_get_intra_inter_context(xd);
+  is_inter = vpx_read(r, cm->fc->intra_inter_prob[ctx]);
+  if (frame_counts) ++frame_counts->intra_inter[ctx][is_inter];
+  return is_inter;
+}
+// Normalizes the first refmv_count entries of mvlist and sets *best_mv.
+static void dec_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist,
+                                  int_mv *best_mv, int refmv_count) {
+  int i;
+
+  // Lower the precision of each candidate and clamp it. *best_mv ends up as
+  // the last processed entry and is left untouched when refmv_count is 0.
+  for (i = 0; i < refmv_count; ++i) {
+    lower_mv_precision(&mvlist[i].as_mv, allow_hp);
+    clamp_mv2(&mvlist[i].as_mv, xd);
+    *best_mv = mvlist[i];
+  }
+}
+
+// Sync callback handed to dec_find_mv_refs(): data is the VP9Decoder, and the
+// call waits (via vp9_frameworker_wait) on the decoder's previous frame for
+// the row mi_row << MI_BLOCK_SIZE_LOG2.
+static void fpm_sync(void *const data, int mi_row) {
+  VP9Decoder *const decoder = (VP9Decoder *)data;
+  const int wait_row = mi_row << MI_BLOCK_SIZE_LOG2;
+
+  vp9_frameworker_wait(decoder->frame_worker_owner,
+                       decoder->common.prev_frame, wait_row);
+}
+
+// Appends mv to mv_ref_list unless it duplicates the entry already stored.
+//  - List empty (refmv_count == 0): mv is stored as the first candidate and
+//    control jumps to Done only if `early_break` is nonzero.
+//  - List non-empty: mv is stored as the next candidate only when it differs
+//    from mv_ref_list[0]; after storing, control always jumps to Done.
+// Note that the macro reads a variable named `early_break` from the scope
+// where it is expanded, updates refmv_count in place, and may evaluate its
+// mv argument more than once.
+#define ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done) \
+  do { \
+    if (refmv_count) { \
+      if ((mv).as_int != (mv_ref_list)[0].as_int) { \
+        (mv_ref_list)[(refmv_count)] = (mv); \
+        refmv_count++; \
+        goto Done; \
+      } \
+    } else { \
+      (mv_ref_list)[(refmv_count)++] = (mv); \
+      if (early_break) \
+        goto Done; \
+    } \
+  } while (0)
+
+// For an inter-coded candidate mbmi, offers to the list (through
+// ADD_MV_REF_LIST_EB) each of its motion vectors whose reference frame is
+// not ref_frame, after passing it through scale_mv() with ref_sign_bias.
+// The second vector is only offered when the candidate has a second
+// reference, that reference differs from ref_frame, and its mv differs from
+// the candidate's first mv. Candidates that are not inter blocks are
+// ignored. Like ADD_MV_REF_LIST_EB, this may jump to Done.
+#define IF_DIFF_REF_FRAME_ADD_MV_EB(mbmi, ref_frame, ref_sign_bias, \
+                                    refmv_count, mv_ref_list, Done) \
+  do { \
+    if (is_inter_block(mbmi)) { \
+      if ((mbmi)->ref_frame[0] != ref_frame) \
+        ADD_MV_REF_LIST_EB(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \
+                           refmv_count, mv_ref_list, Done); \
+      if (has_second_ref(mbmi) && \
+          (mbmi)->ref_frame[1] != ref_frame && \
+          (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
+        ADD_MV_REF_LIST_EB(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \
+                           refmv_count, mv_ref_list, Done); \
+    } \
+  } while (0)
+
+// This function searches the neighborhood of a given MB/SB
+// to try and find candidate reference vectors.
+//
+// Candidates for ref_frame are written to mv_ref_list, which must have room
+// for MAX_MV_REF_CANDIDATES entries; it is zero-filled first, so slots that
+// are never filled hold the zero vector. The search visits, in order: the
+// neighbors listed in mv_ref_search that use ref_frame, the co-located
+// previous-frame entry (when cm->use_prev_frame_mvs is set), the neighbors
+// again for vectors that use a different reference frame, and finally the
+// previous-frame entry for a different reference frame.
+//
+// When is_sub8x8 is set, the first two neighbor positions are read through
+// get_sub_block_mv() for sub-block `block`. If sync is non-NULL and
+// cm->frame_parallel_decode is set, sync(data, mi_row) is called before the
+// previous frame's mvs are read.
+//
+// Returns the number of leading mv_ref_list entries that were clamped with
+// clamp_mv_ref(): the running count when the search leaves early through the
+// Done label, otherwise MAX_MV_REF_CANDIDATES for NEARMV and 1 for any other
+// mode.
+static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                            PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame,
+                            const POSITION *const mv_ref_search,
+                            int_mv *mv_ref_list,
+                            int mi_row, int mi_col, int block, int is_sub8x8,
+                            find_mv_refs_sync sync, void *const data) {
+  const int *ref_sign_bias = cm->ref_frame_sign_bias;
+  int i, refmv_count = 0;
+  int different_ref_found = 0;
+  const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ?
+      cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
+  const TileInfo *const tile = &xd->tile;
+  // If mode is nearestmv or newmv (uses nearestmv as a reference) then stop
+  // searching after the first mv is found.
+  // (early_break is consumed implicitly by the ADD_MV_REF_LIST_EB macro.)
+  const int early_break = (mode != NEARMV);
+
+  // Blank the reference vector list
+  memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+  // i carries over into the second neighbor loop, so the sub8x8 path below
+  // does not revisit the first two positions there.
+  i = 0;
+  if (is_sub8x8) {
+    // If the size < 8x8 we get the mv from the bmi substructure for the
+    // nearest two blocks.
+    for (i = 0; i < 2; ++i) {
+      const POSITION *const mv_ref = &mv_ref_search[i];
+      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+        const MODE_INFO *const candidate_mi =
+            xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+        // Despite its name, this flag is set as soon as any neighbor lies
+        // inside the tile; it only gates the different-reference pass below.
+        different_ref_found = 1;
+
+        if (candidate_mi->ref_frame[0] == ref_frame)
+          ADD_MV_REF_LIST_EB(
+              get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
+                  refmv_count, mv_ref_list, Done);
+        else if (candidate_mi->ref_frame[1] == ref_frame)
+          ADD_MV_REF_LIST_EB(
+              get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
+                  refmv_count, mv_ref_list, Done);
+      }
+    }
+  }
+
+  // Check the rest of the neighbors in much the same way
+  // as before except we don't need to keep track of sub blocks or
+  // mode counts.
+  for (; i < MVREF_NEIGHBOURS; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MODE_INFO *const candidate =
+          xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+      different_ref_found = 1;
+
+      if (candidate->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST_EB(candidate->mv[0], refmv_count, mv_ref_list, Done);
+      else if (candidate->ref_frame[1] == ref_frame)
+        ADD_MV_REF_LIST_EB(candidate->mv[1], refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  // TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast
+  // on windows platform. The sync here is unnecessary if use_prev_frame_mvs
+  // is 0. But after removing it, there will be hang in the unit test on windows
+  // due to several threads waiting for a thread's signal.
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+    if (cm->frame_parallel_decode && sync != NULL) {
+      sync(data, mi_row);
+    }
+#endif
+
+  // Check the last frame's mode and mv info.
+  if (prev_frame_mvs) {
+    // Synchronize here for frame parallel decode if sync function is provided.
+    if (cm->frame_parallel_decode && sync != NULL) {
+      sync(data, mi_row);
+    }
+
+    if (prev_frame_mvs->ref_frame[0] == ref_frame) {
+      ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done);
+    } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
+      ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  // Since we couldn't find 2 mvs from the same reference frame
+  // go back through the neighbors and find motion vectors from
+  // different reference frames.
+  if (different_ref_found) {
+    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+      const POSITION *mv_ref = &mv_ref_search[i];
+      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+        const MODE_INFO *const candidate =
+            xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+
+        // If the candidate is INTRA we don't want to consider its mv.
+        IF_DIFF_REF_FRAME_ADD_MV_EB(candidate, ref_frame, ref_sign_bias,
+                                    refmv_count, mv_ref_list, Done);
+      }
+    }
+  }
+
+  // Since we still don't have a candidate we'll try the last frame.
+  if (prev_frame_mvs) {
+    if (prev_frame_mvs->ref_frame[0] != ref_frame &&
+        prev_frame_mvs->ref_frame[0] > INTRA_FRAME) {
+      int_mv mv = prev_frame_mvs->mv[0];
+      // Flip the vector when the two reference frames have opposite sign
+      // bias.
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done);
+    }
+
+    if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
+        prev_frame_mvs->ref_frame[1] != ref_frame &&
+        prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) {
+      int_mv mv = prev_frame_mvs->mv[1];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  // Reaching this point means no macro jumped to Done. The count is forced
+  // to a fixed value, so zero-filled slots may be reported as candidates.
+  if (mode == NEARMV)
+    refmv_count = MAX_MV_REF_CANDIDATES;
+  else
+    // we only care about the nearestmv for the remaining modes
+    refmv_count = 1;
+
+  // Done is reached by falling through or by a goto issued inside
+  // ADD_MV_REF_LIST_EB / IF_DIFF_REF_FRAME_ADD_MV_EB.
+ Done:
+  // Clamp vectors
+  for (i = 0; i < refmv_count; ++i)
+    clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
+
+  return refmv_count;
+}
+
+// Picks the NEARESTMV / NEARMV vector for sub-block `block` (0..3) of the
+// current sub-8x8 block and reference slot `ref`, storing it in *best_sub8x8.
+//   block 0:    the last candidate returned by dec_find_mv_refs().
+//   block 1, 2: NEARESTMV reuses sub-block 0's vector; otherwise the first
+//               found candidate that differs from it, or zero if none does.
+//   block 3:    NEARESTMV reuses sub-block 2's vector; otherwise the first of
+//               {sub-block 1, sub-block 0, candidate 0, candidate 1} that
+//               differs from sub-block 2's vector, or zero if none does.
+static void append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                      const POSITION *const mv_ref_search,
+                                      PREDICTION_MODE b_mode, int block,
+                                      int ref, int mi_row, int mi_col,
+                                      int_mv *best_sub8x8) {
+  int_mv found[MAX_MV_REF_CANDIDATES];
+  MODE_INFO *const cur_mi = xd->mi[0];
+  b_mode_info *const sub = cur_mi->bmi;
+  int found_count;
+  int k;
+
+  assert(MAX_MV_REF_CANDIDATES == 2);
+
+  found_count = dec_find_mv_refs(cm, xd, b_mode, cur_mi->ref_frame[ref],
+                                 mv_ref_search, found, mi_row, mi_col, block,
+                                 1, NULL, NULL);
+
+  if (block == 0) {
+    best_sub8x8->as_int = found[found_count - 1].as_int;
+  } else if (block == 1 || block == 2) {
+    const int_mv first = sub[0].as_mv[ref];
+    if (b_mode == NEARESTMV) {
+      best_sub8x8->as_int = first.as_int;
+    } else {
+      best_sub8x8->as_int = 0;
+      for (k = 0; k < found_count; ++k) {
+        if (found[k].as_int != first.as_int) {
+          best_sub8x8->as_int = found[k].as_int;
+          break;
+        }
+      }
+    }
+  } else if (block == 3) {
+    const int_mv third = sub[2].as_mv[ref];
+    if (b_mode == NEARESTMV) {
+      best_sub8x8->as_int = third.as_int;
+    } else {
+      // Candidate order matters: the first entry differing from sub-block 2
+      // wins.
+      int_mv order[2 + MAX_MV_REF_CANDIDATES];
+      order[0] = sub[1].as_mv[ref];
+      order[1] = sub[0].as_mv[ref];
+      order[2] = found[0];
+      order[3] = found[1];
+      best_sub8x8->as_int = 0;
+      for (k = 0; k < 2 + MAX_MV_REF_CANDIDATES; ++k) {
+        if (order[k].as_int != third.as_int) {
+          best_sub8x8->as_int = order[k].as_int;
+          break;
+        }
+      }
+    }
+  } else {
+    assert(0 && "Invalid block index.");
+  }
+}
+
+// Derives the inter-mode context from the first two neighbor positions in
+// mv_ref_search: the mode_2_counter[] value of each neighbor that lies inside
+// the tile is summed, and the sum is mapped through counter_to_context[].
+static uint8_t get_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                                const POSITION *const mv_ref_search,
+                                int mi_row, int mi_col) {
+  const TileInfo *const tile = &xd->tile;
+  int counter = 0;
+  int n;
+
+  for (n = 0; n < 2; ++n) {
+    const POSITION *const pos = &mv_ref_search[n];
+    const MODE_INFO *neighbor;
+
+    if (!is_inside(tile, mi_col, mi_row, cm->mi_rows, pos))
+      continue;
+
+    neighbor = xd->mi[pos->col + pos->row * xd->mi_stride];
+    // Accumulate the per-mode weight used for entropy-coding context.
+    counter += mode_2_counter[neighbor->mode];
+  }
+
+  return counter_to_context[counter];
+}
+
+// Reads the mode information of an inter-coded block into mi: reference
+// frame(s), prediction mode, interpolation filter and motion vector(s).
+// For blocks smaller than 8x8, a mode and an mv pair are read per sub-block
+// into mi->bmi[]; mi->mode then takes the last sub-block mode read and
+// mi->mv takes the vectors of mi->bmi[3].
+// Sets xd->corrupted when assign_mv() reports failure. A sub-8x8 block in a
+// segment with SEG_LVL_SKIP active is reported through vpx_internal_error().
+static void read_inter_block_mode_info(VP9Decoder *const pbi,
+                                       MACROBLOCKD *const xd,
+                                       MODE_INFO *const mi,
+                                       int mi_row, int mi_col, vpx_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
+  const BLOCK_SIZE bsize = mi->sb_type;
+  const int allow_hp = cm->allow_high_precision_mv;
+  // Only filled when mi->mode != ZEROMV (see the loop below).
+  int_mv best_ref_mvs[2];
+  int ref, is_compound;
+  uint8_t inter_mode_ctx;
+  const POSITION *const mv_ref_search = mv_ref_blocks[bsize];
+
+  read_ref_frames(cm, xd, r, mi->segment_id, mi->ref_frame);
+  is_compound = has_second_ref(mi);
+  inter_mode_ctx = get_mode_context(cm, xd, mv_ref_search, mi_row, mi_col);
+
+  if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) {
+    mi->mode = ZEROMV;
+    if (bsize < BLOCK_8X8) {
+        vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Invalid usage of segement feature on small blocks");
+        return;
+    }
+  } else {
+    if (bsize >= BLOCK_8X8)
+      mi->mode = read_inter_mode(cm, xd, r, inter_mode_ctx);
+    else
+      // Sub 8x8 blocks use the nearestmv as a ref_mv if the b_mode is NEWMV.
+      // Setting mode to NEARESTMV forces the search to stop after the nearestmv
+      // has been found. After b_modes have been read, mode will be overwritten
+      // by the last b_mode.
+      mi->mode = NEARESTMV;
+
+    if (mi->mode != ZEROMV) {
+      // One candidate search per reference frame in use; the last processed
+      // candidate becomes best_ref_mvs[ref].
+      for (ref = 0; ref < 1 + is_compound; ++ref) {
+        int_mv tmp_mvs[MAX_MV_REF_CANDIDATES];
+        const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
+        int refmv_count;
+
+        refmv_count = dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search,
+                                       tmp_mvs, mi_row, mi_col, -1, 0,
+                                       fpm_sync, (void *)pbi);
+
+        dec_find_best_ref_mvs(xd, allow_hp, tmp_mvs, &best_ref_mvs[ref],
+                              refmv_count);
+      }
+    }
+  }
+
+  // Use the frame-level filter unless the frame allows per-block selection.
+  mi->interp_filter = (cm->interp_filter == SWITCHABLE)
+                      ? read_switchable_interp_filter(cm, xd, r)
+                      : cm->interp_filter;
+
+  if (bsize < BLOCK_8X8) {
+    const int num_4x4_w = 1 << xd->bmode_blocks_wl;
+    const int num_4x4_h = 1 << xd->bmode_blocks_hl;
+    int idx, idy;
+    PREDICTION_MODE b_mode;
+    // Only written for NEARESTMV / NEARMV sub-blocks; assign_mv() reads it
+    // for those modes only.
+    int_mv best_sub8x8[2];
+    // Step by the sub-block dimensions so each distinct sub-block is read
+    // once; the copies at the end of the loop body replicate it across the
+    // 2x2 bmi grid.
+    for (idy = 0; idy < 2; idy += num_4x4_h) {
+      for (idx = 0; idx < 2; idx += num_4x4_w) {
+        const int j = idy * 2 + idx;
+        b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx);
+
+        if (b_mode == NEARESTMV || b_mode == NEARMV) {
+          for (ref = 0; ref < 1 + is_compound; ++ref)
+            append_sub8x8_mvs_for_idx(cm, xd, mv_ref_search, b_mode, j, ref,
+                                      mi_row, mi_col, &best_sub8x8[ref]);
+        }
+
+        if (!assign_mv(cm, xd, b_mode, mi->bmi[j].as_mv, best_ref_mvs,
+                       best_sub8x8, is_compound, allow_hp, r)) {
+          xd->corrupted |= 1;
+          // Leaves the inner (idx) loop only; the idy loop continues.
+          break;
+        }
+
+        if (num_4x4_h == 2)
+          mi->bmi[j + 2] = mi->bmi[j];
+        if (num_4x4_w == 2)
+          mi->bmi[j + 1] = mi->bmi[j];
+      }
+    }
+
+    mi->mode = b_mode;
+
+    copy_mv_pair(mi->mv, mi->bmi[3].as_mv);
+  } else {
+    // Whole-block case: best_ref_mvs serves both as the NEWMV reference and
+    // as the NEAR/NEAREST source.
+    xd->corrupted |= !assign_mv(cm, xd, mi->mode, mi->mv, best_ref_mvs,
+                                best_ref_mvs, is_compound, allow_hp, r);
+  }
+}
+
+// Reads the per-block header for a block in an inter frame, in bitstream
+// order: segment id, skip flag, intra/inter flag and transform size. It then
+// hands off to the intra or inter block mode reader.
+static void read_inter_frame_mode_info(VP9Decoder *const pbi,
+                                       MACROBLOCKD *const xd,
+                                       int mi_row, int mi_col, vpx_reader *r) {
+  VP9_COMMON *const cm = &pbi->common;
+  MODE_INFO *const cur = xd->mi[0];
+  int is_inter;
+  int allow_select;
+
+  cur->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
+  cur->skip = read_skip(cm, xd, cur->segment_id, r);
+  is_inter = read_is_inter_block(cm, xd, cur->segment_id, r);
+
+  // Passed as 0 only for skipped inter blocks.
+  allow_select = !(cur->skip && is_inter);
+  cur->tx_size = read_tx_size(cm, xd, allow_select, r);
+
+  if (!is_inter) {
+    read_intra_block_mode_info(cm, xd, cur, r);
+    return;
+  }
+  read_inter_block_mode_info(pbi, xd, cur, mi_row, mi_col, r);
+}
+
+// Copies two consecutive MV_REFERENCE_FRAME entries from src to dst.
+static INLINE void copy_ref_frame_pair(MV_REFERENCE_FRAME *dst,
+                                       const MV_REFERENCE_FRAME *src) {
+  const size_t pair_bytes = 2 * sizeof(dst[0]);
+  memcpy(dst, src, pair_bytes);
+}
+
+// Reads the mode info for the block at (mi_row, mi_col).
+// Intra-only frames go through read_intra_frame_mode_info() and nothing else.
+// For other frames the block is read with read_inter_frame_mode_info(), and
+// its reference frames and motion vectors are then replicated into the
+// current frame's MV buffer over an x_mis by y_mis region starting at the
+// block position.
+void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col, vpx_reader *r,
+                        int x_mis, int y_mis) {
+  VP9_COMMON *const cm = &pbi->common;
+  MODE_INFO *const mi = xd->mi[0];
+  MV_REF *row_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int row, col;
+
+  if (frame_is_intra_only(cm)) {
+    read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
+    return;
+  }
+
+  read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
+
+  // Advance one MV-buffer row (cm->mi_cols entries) per iteration.
+  for (row = 0; row < y_mis; ++row, row_mvs += cm->mi_cols) {
+    for (col = 0; col < x_mis; ++col) {
+      MV_REF *const dst = &row_mvs[col];
+      copy_ref_frame_pair(dst->ref_frame, mi->ref_frame);
+      copy_mv_pair(dst->mv, mi->mv);
+    }
+  }
+}
diff --git a/libs/libvpx/vp9/decoder/vp9_decodemv.h b/libs/libvpx/vp9/decoder/vp9_decodemv.h
new file mode 100644
index 0000000000..75f568cf1f
--- /dev/null
+++ b/libs/libvpx/vp9/decoder/vp9_decodemv.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_DECODER_VP9_DECODEMV_H_
+#define VP9_DECODER_VP9_DECODEMV_H_
+
+#include "vpx_dsp/bitreader.h"
+
+#include "vp9/decoder/vp9_decoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col, vpx_reader *r,
+                        int x_mis, int y_mis);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_DECODER_VP9_DECODEMV_H_
diff --git a/libs/libvpx/vp9/decoder/vp9_decoder.c b/libs/libvpx/vp9/decoder/vp9_decoder.c
new file mode 100644
index 0000000000..f5da07ea02
--- /dev/null
+++ b/libs/libvpx/vp9/decoder/vp9_decoder.c
@@ -0,0 +1,511 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_ports/vpx_once.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_thread.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#if CONFIG_VP9_POSTPROC
+#include "vp9/common/vp9_postproc.h"
+#endif
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconintra.h"
+
+#include "vp9/decoder/vp9_decodeframe.h"
+#include "vp9/decoder/vp9_decoder.h"
+#include "vp9/decoder/vp9_detokenize.h"
+
+// One-time decoder setup: runs the RTCD initializers and builds the intra
+// predictor tables. A static flag makes later calls return immediately.
+static void initialize_dec(void) {
+  static volatile int init_done = 0;
+
+  if (init_done)
+    return;
+
+  vp9_rtcd();
+  vpx_dsp_rtcd();
+  vpx_scale_rtcd();
+  vp9_init_intra_predictors();
+  init_done = 1;
+}
+
+// Points cm->mi and cm->mi_grid_visible one row and one column into their
+// backing arrays, then clears (mi_rows + 1) rows of the grid pointer array.
+static void vp9_dec_setup_mi(VP9_COMMON *cm) {
+  const int border_offset = cm->mi_stride + 1;
+
+  cm->mi = cm->mip + border_offset;
+  cm->mi_grid_visible = cm->mi_grid_base + border_offset;
+  memset(cm->mi_grid_base, 0,
+         cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+}
+
+// Allocates zeroed storage for mi_size MODE_INFO entries (cm->mip) and the
+// same number of MODE_INFO pointers (cm->mi_grid_base).
+// Returns 0 on success and 1 if either allocation fails. If only the second
+// allocation fails, cm->mip stays allocated.
+static int vp9_dec_alloc_mi(VP9_COMMON *cm, int mi_size) {
+  cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip));
+  if (cm->mip == NULL)
+    return 1;
+
+  cm->mi_alloc_size = mi_size;
+  cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+  return cm->mi_grid_base == NULL;
+}
+
+// Frees the arrays allocated by vp9_dec_alloc_mi() and resets both pointers
+// to NULL.
+static void vp9_dec_free_mi(VP9_COMMON *cm) {
+  vpx_free(cm->mi_grid_base);
+  vpx_free(cm->mip);
+  cm->mi_grid_base = NULL;
+  cm->mip = NULL;
+}
+
+// Allocates and initializes a VP9 decoder that uses the given buffer pool.
+// Returns the new decoder, or NULL if the decoder itself cannot be allocated
+// or if an error is raised through cm->error.jmp during setup; in the latter
+// case the partially built decoder is handed to vp9_decoder_remove().
+VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
+  // Both pointers are volatile-qualified because they are used after a
+  // possible longjmp back to the setjmp() below.
+  VP9Decoder *volatile const pbi = vpx_memalign(32, sizeof(*pbi));
+  VP9_COMMON *volatile const cm = pbi ? &pbi->common : NULL;
+
+  if (!cm)
+    return NULL;
+
+  vp9_zero(*pbi);
+
+  // Error landing pad; presumably reached when CHECK_MEM_ERROR below fails
+  // (TODO confirm against the macro definition).
+  if (setjmp(cm->error.jmp)) {
+    cm->error.setjmp = 0;
+    vp9_decoder_remove(pbi);
+    return NULL;
+  }
+
+  cm->error.setjmp = 1;
+
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(cm, cm->frame_contexts,
+                  (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
+                  sizeof(*cm->frame_contexts)));
+
+  pbi->need_resync = 1;
+  once(initialize_dec);
+
+  // Initialize the references to not point to any frame buffers.
+  memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+  memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+
+  cm->current_video_frame = 0;
+  pbi->ready_for_new_data = 1;
+  pbi->common.buffer_pool = pool;
+
+  cm->bit_depth = VPX_BITS_8;
+  cm->dequant_bit_depth = VPX_BITS_8;
+
+  // Install the decoder-specific mode-info allocation callbacks.
+  cm->alloc_mi = vp9_dec_alloc_mi;
+  cm->free_mi = vp9_dec_free_mi;
+  cm->setup_mi = vp9_dec_setup_mi;
+
+  vp9_loop_filter_init(cm);
+
+  // Setup is complete: the jump buffer must not be used after return.
+  cm->error.setjmp = 0;
+
+  vpx_get_worker_interface()->init(&pbi->lf_worker);
+
+  return pbi;
+}
+
+// Tears down a decoder created by vp9_decoder_create(): ends the loop-filter
+// worker and every tile worker, frees the worker and tile data, releases the
+// loop-filter row sync when tile workers exist, and frees the decoder itself.
+// A NULL pbi is ignored.
+void vp9_decoder_remove(VP9Decoder *pbi) {
+  const VPxWorkerInterface *winterface;
+  int n;
+
+  if (pbi == NULL)
+    return;
+
+  winterface = vpx_get_worker_interface();
+
+  // Workers are ended before the memory they use is freed.
+  winterface->end(&pbi->lf_worker);
+  vpx_free(pbi->lf_worker.data1);
+  vpx_free(pbi->tile_data);
+  for (n = 0; n < pbi->num_tile_workers; ++n)
+    winterface->end(&pbi->tile_workers[n]);
+  vpx_free(pbi->tile_worker_data);
+  vpx_free(pbi->tile_workers);
+
+  if (pbi->num_tile_workers > 0)
+    vp9_loop_filter_dealloc(&pbi->lf_row_sync);
+
+  vpx_free(pbi);
+}
+
+// Returns 1 when both buffers have the same luma and chroma width and
+// height, 0 otherwise.
+static int equal_dimensions(const YV12_BUFFER_CONFIG *a,
+                            const YV12_BUFFER_CONFIG *b) {
+  if (a->y_height != b->y_height || a->y_width != b->y_width)
+    return 0;
+  if (a->uv_height != b->uv_height || a->uv_width != b->uv_width)
+    return 0;
+  return 1;
+}
+
+// Copies the decoder's 'last' reference frame into sd.
+// Only VP9_LAST_FLAG is supported; any other flag, a missing reference or a
+// size mismatch between the reference and sd is reported through
+// vpx_internal_error(). Returns VPX_CODEC_ERROR when there is no 'last'
+// reference, otherwise cm->error.error_code.
+vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi,
+                                       VP9_REFFRAME ref_frame_flag,
+                                       YV12_BUFFER_CONFIG *sd) {
+  VP9_COMMON *const cm = &pbi->common;
+  const YV12_BUFFER_CONFIG *last_ref;
+
+  /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
+   * encoder is using the frame buffers for. This is just a stub to keep the
+   * vpxenc --test-decode functionality working, and will be replaced in a
+   * later commit that adds VP9-specific controls for this functionality.
+   */
+  if (ref_frame_flag != VP9_LAST_FLAG) {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "Invalid reference frame");
+    return cm->error.error_code;
+  }
+
+  last_ref = get_ref_frame(cm, 0);
+  if (last_ref == NULL) {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "No 'last' reference frame");
+    return VPX_CODEC_ERROR;
+  }
+
+  if (equal_dimensions(last_ref, sd))
+    vp8_yv12_copy_frame(last_ref, sd);
+  else
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+
+  return cm->error.error_code;
+}
+
+
+// Replaces one of the active reference frames (LAST, GOLDEN or ALTREF, chosen
+// by ref_frame_flag) with a copy of the caller's frame sd.
+//
+// An unknown flag or a size mismatch between sd and the current reference is
+// reported through vpx_internal_error(). Otherwise a free frame buffer is
+// obtained, installed as the reference via ref_cnt_fb(), and sd is copied
+// into it.
+//
+// Returns VPX_CODEC_MEM_ERROR when no free frame buffer is available,
+// otherwise cm->error.error_code.
+vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
+                                      VP9_REFFRAME ref_frame_flag,
+                                      YV12_BUFFER_CONFIG *sd) {
+  RefBuffer *ref_buf = NULL;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+
+  // TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
+  // encoder is using the frame buffers for. This is just a stub to keep the
+  // vpxenc --test-decode functionality working, and will be replaced in a
+  // later commit that adds VP9-specific controls for this functionality.
+  if (ref_frame_flag == VP9_LAST_FLAG) {
+    ref_buf = &cm->frame_refs[0];
+  } else if (ref_frame_flag == VP9_GOLD_FLAG) {
+    ref_buf = &cm->frame_refs[1];
+  } else if (ref_frame_flag == VP9_ALT_FLAG) {
+    ref_buf = &cm->frame_refs[2];
+  } else {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "Invalid reference frame");
+    return cm->error.error_code;
+  }
+
+  if (!equal_dimensions(ref_buf->buf, sd)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  } else {
+    int *ref_fb_ptr = &ref_buf->idx;
+
+    // Find an empty frame buffer.
+    const int free_fb = get_free_fb(cm);
+    // Check the index that was just requested. free_fb is used to index
+    // frame_bufs[] below, so an INVALID_IDX result must stop here. (The
+    // previous code tested cm->new_fb_idx, which this function never sets.)
+    if (free_fb == INVALID_IDX)
+      return VPX_CODEC_MEM_ERROR;
+
+    // Decrease ref_count since it will be increased again in
+    // ref_cnt_fb() below.
+    --frame_bufs[free_fb].ref_count;
+
+    // Manage the reference counters and copy image.
+    ref_cnt_fb(frame_bufs, ref_fb_ptr, free_fb);
+    ref_buf->buf = &frame_bufs[*ref_fb_ptr].buf;
+    vp8_yv12_copy_frame(sd, ref_buf->buf);
+  }
+
+  return cm->error.error_code;
+}
+
+/* If any buffer updating is signaled it should be done here. */
+// Commits cm->next_ref_frame_map into cm->ref_frame_map after a frame has
+// been decoded, dropping the reference counts held on the old entries, and
+// records the newly decoded buffer as cm->frame_to_show. Finally invalidates
+// the three active frame_refs indices until the next frame starts.
+static void swap_frame_buffers(VP9Decoder *pbi) {
+  int ref_index = 0, mask;
+  VP9_COMMON *const cm = &pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+
+  lock_buffer_pool(pool);
+  // Walk the slots covered by the refresh mask, stopping once no set bits
+  // remain above the current slot.
+  for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+    const int old_idx = cm->ref_frame_map[ref_index];
+    // Current thread releases the holding of reference frame.
+    decrease_ref_count(old_idx, frame_bufs, pool);
+
+    // Release the reference frame in reference map.
+    if (mask & 1) {
+      decrease_ref_count(old_idx, frame_bufs, pool);
+    }
+    cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+    ++ref_index;
+  }
+
+  // Current thread releases the holding of reference frame.
+  // (Remaining slots are skipped entirely when showing an existing frame.)
+  for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+    const int old_idx = cm->ref_frame_map[ref_index];
+    decrease_ref_count(old_idx, frame_bufs, pool);
+    cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+  }
+  unlock_buffer_pool(pool);
+  pbi->hold_ref_buf = 0;
+  cm->frame_to_show = get_frame_new_buffer(cm);
+
+  // Drop this decoder's own count on the new buffer unless it is a shown
+  // frame in frame-parallel mode.
+  if (!pbi->frame_parallel_decode || !cm->show_frame) {
+    lock_buffer_pool(pool);
+    --frame_bufs[cm->new_fb_idx].ref_count;
+    unlock_buffer_pool(pool);
+  }
+
+  // Invalidate these references until the next frame starts.
+  for (ref_index = 0; ref_index < 3; ref_index++)
+    cm->frame_refs[ref_index].idx = -1;
+}
+
+// Decodes one compressed frame of `size` bytes starting at *psource.
+//
+// A free frame buffer is selected for the new frame, vp9_decode_frame() is
+// run (it also receives psource), and the reference map, previous-frame
+// pointer and frame counters are then updated. A size of 0 signals missing
+// frames and marks the 'last' reference as corrupted.
+//
+// Returns 0 on success, VPX_CODEC_MEM_ERROR when no free frame buffer is
+// available, and -1 when an error is raised through cm->error.jmp during
+// decoding (after worker threads are synced and held references released).
+int vp9_receive_compressed_data(VP9Decoder *pbi,
+                                size_t size, const uint8_t **psource) {
+  // volatile: these locals are read after a possible longjmp to the
+  // setjmp() below.
+  VP9_COMMON *volatile const cm = &pbi->common;
+  BufferPool *volatile const pool = cm->buffer_pool;
+  RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs;
+  const uint8_t *source = *psource;
+  int retcode = 0;
+  cm->error.error_code = VPX_CODEC_OK;
+
+  if (size == 0) {
+    // This is used to signal that we are missing frames.
+    // We do not know if the missing frame(s) was supposed to update
+    // any of the reference buffers, but we act conservative and
+    // mark only the last buffer as corrupted.
+    //
+    // TODO(jkoleszar): Error concealment is undefined and non-normative
+    // at this point, but if it becomes so, [0] may not always be the correct
+    // thing to do here.
+    // NOTE(review): the test excludes index 0 as well as negative indices;
+    // confirm whether skipping buffer 0 is intentional.
+    if (cm->frame_refs[0].idx > 0) {
+      assert(cm->frame_refs[0].buf != NULL);
+      cm->frame_refs[0].buf->corrupted = 1;
+    }
+  }
+
+  pbi->ready_for_new_data = 0;
+
+  // Check if the previous frame was a frame without any references to it.
+  // Release frame buffer if not decoding in frame parallel mode.
+  if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0
+      && frame_bufs[cm->new_fb_idx].ref_count == 0)
+    pool->release_fb_cb(pool->cb_priv,
+                        &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+  // Find a free frame buffer. Return error if can not find any.
+  cm->new_fb_idx = get_free_fb(cm);
+  if (cm->new_fb_idx == INVALID_IDX)
+    return VPX_CODEC_MEM_ERROR;
+
+  // Assign a MV array to the frame buffer.
+  cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+
+  pbi->hold_ref_buf = 0;
+  if (pbi->frame_parallel_decode) {
+    VPxWorker *const worker = pbi->frame_worker_owner;
+    vp9_frameworker_lock_stats(worker);
+    frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
+    // Reset decoding progress.
+    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+    pbi->cur_buf->row = -1;
+    pbi->cur_buf->col = -1;
+    vp9_frameworker_unlock_stats(worker);
+  } else {
+    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+  }
+
+
+  // Error landing pad for errors raised while cm->error.setjmp is 1.
+  if (setjmp(cm->error.jmp)) {
+    const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+    int i;
+
+    cm->error.setjmp = 0;
+    pbi->ready_for_new_data = 1;
+
+    // Synchronize all threads immediately as a subsequent decode call may
+    // cause a resize invalidating some allocations.
+    winterface->sync(&pbi->lf_worker);
+    for (i = 0; i < pbi->num_tile_workers; ++i) {
+      winterface->sync(&pbi->tile_workers[i]);
+    }
+
+    lock_buffer_pool(pool);
+    // Release all the reference buffers if worker thread is holding them.
+    // (Same walk as swap_frame_buffers(), but ref_frame_map is not updated.)
+    if (pbi->hold_ref_buf == 1) {
+      int ref_index = 0, mask;
+      for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+        const int old_idx = cm->ref_frame_map[ref_index];
+        // Current thread releases the holding of reference frame.
+        decrease_ref_count(old_idx, frame_bufs, pool);
+
+        // Release the reference frame in reference map.
+        if (mask & 1) {
+          decrease_ref_count(old_idx, frame_bufs, pool);
+        }
+        ++ref_index;
+      }
+
+      // Current thread releases the holding of reference frame.
+      for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+        const int old_idx = cm->ref_frame_map[ref_index];
+        decrease_ref_count(old_idx, frame_bufs, pool);
+      }
+      pbi->hold_ref_buf = 0;
+    }
+    // Release current frame.
+    decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+    unlock_buffer_pool(pool);
+
+    vpx_clear_system_state();
+    return -1;
+  }
+
+  cm->error.setjmp = 1;
+  vp9_decode_frame(pbi, source, source + size, psource);
+
+  swap_frame_buffers(pbi);
+
+  vpx_clear_system_state();
+
+  if (!cm->show_existing_frame) {
+    cm->last_show_frame = cm->show_frame;
+    cm->prev_frame = cm->cur_frame;
+    if (cm->seg.enabled && !pbi->frame_parallel_decode)
+      vp9_swap_current_and_last_seg_map(cm);
+  }
+
+  // Update progress in frame parallel decode.
+  if (pbi->frame_parallel_decode) {
+    // Need to lock the mutex here as another thread may
+    // be accessing this buffer.
+    VPxWorker *const worker = pbi->frame_worker_owner;
+    FrameWorkerData *const frame_worker_data = worker->data1;
+    vp9_frameworker_lock_stats(worker);
+
+    if (cm->show_frame) {
+      cm->current_video_frame++;
+    }
+    frame_worker_data->frame_decoded = 1;
+    frame_worker_data->frame_context_ready = 1;
+    vp9_frameworker_signal_stats(worker);
+    vp9_frameworker_unlock_stats(worker);
+  } else {
+    cm->last_width = cm->width;
+    cm->last_height = cm->height;
+    if (cm->show_frame) {
+      cm->current_video_frame++;
+    }
+  }
+
+  // Decoding finished: the jump buffer must not be used after return.
+  cm->error.setjmp = 0;
+  // retcode is never modified above, so this returns 0.
+  return retcode;
+}
+
+// Hands the most recently decoded frame to the caller through *sd.
+//
+// Returns -1 when no new frame is pending (ready_for_new_data is already 1)
+// or when the decoded frame is not meant to be shown. Otherwise *sd is
+// filled and the result is 0, or the return value of vp9_post_proc_frame()
+// when post-processing is compiled in and the frame is not a repeat of an
+// existing frame. In every case the decoder is marked ready for new data.
+//
+// flags is only used when CONFIG_VP9_POSTPROC is enabled.
+int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd,
+                      vp9_ppflags_t *flags) {
+  VP9_COMMON *const cm = &pbi->common;
+  int ret = -1;
+#if !CONFIG_VP9_POSTPROC
+  // Silence the unused-parameter warning without dereferencing the pointer.
+  (void)flags;
+#endif
+
+  if (pbi->ready_for_new_data == 1)
+    return ret;
+
+  pbi->ready_for_new_data = 1;
+
+  /* no raw frame to show!!! */
+  if (!cm->show_frame)
+    return ret;
+
+#if CONFIG_VP9_POSTPROC
+  if (!cm->show_existing_frame) {
+    ret = vp9_post_proc_frame(cm, sd, flags);
+  } else {
+    *sd = *cm->frame_to_show;
+    ret = 0;
+  }
+#else
+  *sd = *cm->frame_to_show;
+  ret = 0;
+#endif  /* CONFIG_VP9_POSTPROC */
+  vpx_clear_system_state();
+  return ret;
+}
+
+// Parses the optional superframe index at the end of a data chunk.
+//
+// data/data_sz:  the chunk; data_sz must be non-zero (asserted).
+// sizes[8]:      receives the size of each frame listed in the index.
+// count:         receives the number of frames in the index, or 0 when the
+//                chunk carries no index.
+// decrypt_cb /
+// decrypt_state: optional decryption callback used to read the marker bytes
+//                and the index payload.
+//
+// Returns VPX_CODEC_OK when there is no index or a valid one was parsed, and
+// VPX_CODEC_CORRUPT_FRAME when the trailing marker announces an index that
+// is larger than the chunk or whose leading marker byte does not match.
+vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
+                                           size_t data_sz,
+                                           uint32_t sizes[8], int *count,
+                                           vpx_decrypt_cb decrypt_cb,
+                                           void *decrypt_state) {
+  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
+  // it is a super frame index. If the last byte of real video compression
+  // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
+  // not the associated matching marker byte at the front of the index we have
+  // an invalid bitstream and need to return an error.
+
+  uint8_t marker;
+
+  assert(data_sz);
+  marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
+  *count = 0;
+
+  if ((marker & 0xe0) == 0xc0) {
+    // Low 3 bits: frame count - 1. Next 2 bits: bytes per size field - 1.
+    const uint32_t frames = (marker & 0x7) + 1;
+    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+    // Two marker bytes bracket the mag * frames bytes of size data.
+    const size_t index_sz = 2 + mag * frames;
+
+    // This chunk is marked as having a superframe index but doesn't have
+    // enough data for it, thus it's an invalid superframe index.
+    if (data_sz < index_sz)
+      return VPX_CODEC_CORRUPT_FRAME;
+
+    {
+      const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
+                                          data + data_sz - index_sz);
+
+      // This chunk is marked as having a superframe index but doesn't have
+      // the matching marker byte at the front of the index therefore it's an
+      // invalid chunk.
+      if (marker != marker2)
+        return VPX_CODEC_CORRUPT_FRAME;
+    }
+
+    {
+      // Found a valid superframe index.
+      uint32_t i, j;
+      const uint8_t *x = &data[data_sz - index_sz + 1];
+
+      // Frames has a maximum of 8 and mag has a maximum of 4.
+      uint8_t clear_buffer[32];
+      assert(sizeof(clear_buffer) >= frames * mag);
+      if (decrypt_cb) {
+        decrypt_cb(decrypt_state, x, clear_buffer, frames * mag);
+        x = clear_buffer;
+      }
+
+      for (i = 0; i < frames; ++i) {
+        uint32_t this_sz = 0;
+
+        // Sizes are stored little-endian. Widen each byte to uint32_t before
+        // shifting: a byte >= 0x80 promoted to int and shifted left by 24
+        // would overflow a signed int, which is undefined behaviour.
+        for (j = 0; j < mag; ++j)
+          this_sz |= ((uint32_t)(*x++)) << (j * 8);
+        sizes[i] = this_sz;
+      }
+      *count = frames;
+    }
+  }
+  return VPX_CODEC_OK;
+}
diff --git a/libs/libvpx/vp9/decoder/vp9_decoder.h b/libs/libvpx/vp9/decoder/vp9_decoder.h
new file mode 100644
index 0000000000..afa400941d
--- /dev/null
+++ b/libs/libvpx/vp9/decoder/vp9_decoder.h
@@ -0,0 +1,148 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_DECODER_VP9_DECODER_H_
+#define VP9_DECODER_VP9_DECODER_H_
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_codec.h"
+#include "vpx_dsp/bitreader.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx_util/vpx_thread.h"
+
+#include "vp9/common/vp9_thread_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_ppflags.h"
+#include "vp9/decoder/vp9_dthread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(hkuang): combine this with TileWorkerData.
+typedef struct TileData {
+  VP9_COMMON *cm;  // pointer only; the VP9_COMMON itself is stored elsewhere
+  vpx_reader bit_reader;
+  DECLARE_ALIGNED(16, MACROBLOCKD, xd);  // 16 is the requested alignment
+  /* dqcoeff is shared by all the planes, so planes must be decoded serially. */
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);  // 1024 coefficients
+} TileData;
+
+typedef struct TileBuffer {
+  const uint8_t *data;  // read-only bytes; NOTE(review): presumably one tile
+  size_t size;  // NOTE(review): presumably the byte length of data -- confirm
+  int col;  // only used with multi-threaded decoding
+} TileBuffer;
+
+typedef struct TileWorkerData {
+  const uint8_t *data_end;
+  int buf_start, buf_end;  // pbi->tile_buffers to decode, inclusive
+  vpx_reader bit_reader;
+  FRAME_COUNTS counts;  // stored by value in each worker's data
+  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  /* dqcoeff is shared by all the planes, so planes must be decoded serially. */
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);  // 1024 coefficients
+  struct vpx_internal_error_info error_info;  // stored by value per worker
+} TileWorkerData;
+
+typedef struct VP9Decoder {  // decoder instance; called "pbi" by its users
+  DECLARE_ALIGNED(16, MACROBLOCKD, mb);
+
+  DECLARE_ALIGNED(16, VP9_COMMON, common);  // embedded by value
+
+  int ready_for_new_data;
+
+  int refresh_frame_flags;
+
+  int frame_parallel_decode;  // frame-based threading.
+
+  // TODO(hkuang): Combine this with cur_buf in macroblockd as they are
+  // the same.
+  RefCntBuffer *cur_buf;   //  Current decoding frame buffer.
+
+  VPxWorker *frame_worker_owner;   // frame_worker that owns this pbi.
+  VPxWorker lf_worker;  // NOTE(review): presumably the loop-filter worker
+  VPxWorker *tile_workers;  // NOTE(review): presumably num_tile_workers long
+  TileWorkerData *tile_worker_data;
+  TileBuffer tile_buffers[64];  // fixed capacity: 64 entries
+  int num_tile_workers;
+
+  TileData *tile_data;  // NOTE(review): presumably total_tiles entries
+  int total_tiles;
+
+  VP9LfSync lf_row_sync;
+
+  vpx_decrypt_cb decrypt_cb;  // NOTE(review): presumably NULL when unused
+  void *decrypt_state;  // NOTE(review): presumably passed back to decrypt_cb
+
+  int max_threads;
+  int inv_tile_order;
+  int need_resync;  // wait for key/intra-only frame.
+  int hold_ref_buf;  // hold the reference buffer.
+} VP9Decoder;
+
+int vp9_receive_compressed_data(struct VP9Decoder *pbi,
+                                size_t size, const uint8_t **dest);
+
+int vp9_get_raw_frame(struct VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd,
+                      vp9_ppflags_t *flags);
+
+vpx_codec_err_t vp9_copy_reference_dec(struct VP9Decoder *pbi,
+                                       VP9_REFFRAME ref_frame_flag,
+                                       YV12_BUFFER_CONFIG *sd);
+
+vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
+                                      VP9_REFFRAME ref_frame_flag,
+                                      YV12_BUFFER_CONFIG *sd);
+
+// Returns the byte at data, passing it through decrypt_cb first when one is
+// supplied; with no callback the byte is returned unchanged.
+static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb,
+                                  void *decrypt_state,
+                                  const uint8_t *data) {
+  uint8_t byte;
+  if (!decrypt_cb) return *data;
+  decrypt_cb(decrypt_state, data, &byte, 1);
+  return byte;
+}
+
+// This function is exposed for use in tests, as well as the inlined function
+// "read_marker".
+vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
+                                           size_t data_sz,
+                                           uint32_t sizes[8], int *count,
+                                           vpx_decrypt_cb decrypt_cb,
+                                           void *decrypt_state);
+
+struct VP9Decoder *vp9_decoder_create(BufferPool *const pool);
+
+void vp9_decoder_remove(struct VP9Decoder *pbi);
+
+// Drops one reference from frame_bufs[idx]; when the count reaches zero and a
+// private buffer is attached, hands the raw buffer back via release_fb_cb.
+static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
+                                      BufferPool *const pool) {
+  RefCntBuffer *buf;
+  if (idx < 0) return;  // negative index: nothing to release
+  buf = &frame_bufs[idx];
+  if (buf->ref_count <= 0) return;
+  // A worker may hold a free index from get_free_fb before the header is
+  // decoded; on a header error priv is still unset, so it must be checked.
+  if (--buf->ref_count == 0 && buf->raw_frame_buffer.priv) {
+    pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer);
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_DECODER_VP9_DECODER_H_
diff --git a/libs/libvpx/vp9/decoder/vp9_detokenize.c b/libs/libvpx/vp9/decoder/vp9_detokenize.c
new file mode 100644
index 0000000000..dcc75b9d2d
--- /dev/null
+++ b/libs/libvpx/vp9/decoder/vp9_detokenize.c
@@ -0,0 +1,216 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#include "vp9/common/vp9_idct.h"
+#endif
+
+#include "vp9/decoder/vp9_detokenize.h"
+
+#define EOB_CONTEXT_NODE            0
+#define ZERO_CONTEXT_NODE           1
+#define ONE_CONTEXT_NODE            2
+
+#define INCREMENT_COUNT(token)                              \
+  do {                                                      \
+     if (counts)                                            \
+       ++coef_counts[band][ctx][token];                     \
+  } while (0)
+
+// Reads n bits from r, MSB first; bit k is decoded with probability probs[k].
+static INLINE int read_coeff(const vpx_prob *probs, int n, vpx_reader *r) {
+  int bits = 0;
+  for (; n > 0; --n, ++probs) bits = (bits << 1) | vpx_read(r, *probs);
+  return bits;
+}
+
+static int decode_coefs(const MACROBLOCKD *xd,  // returns c, used as the eob
+                        PLANE_TYPE type,
+                        tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
+                        int ctx, const int16_t *scan, const int16_t *nb,
+                        vpx_reader *r) {
+  FRAME_COUNTS *counts = xd->counts;  // may be NULL: counting is then skipped
+  const int max_eob = 16 << (tx_size << 1);  // 16 at tx_size 0, x4 per step
+  const FRAME_CONTEXT *const fc = xd->fc;
+  const int ref = is_inter_block(xd->mi[0]);
+  int band, c = 0;  // c is the running position in scan order
+  const vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+      fc->coef_probs[tx_size][type][ref];
+  const vpx_prob *prob;
+  unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
+  unsigned int (*eob_branch_count)[COEFF_CONTEXTS];
+  uint8_t token_cache[32 * 32];
+  const uint8_t *band_translate = get_band_translate(tx_size);
+  const int dq_shift = (tx_size == TX_32X32);  // 1 only for TX_32X32
+  int v, token;
+  int16_t dqv = dq[0];  // dq[0] for the first token, dq[1] thereafter
+  const uint8_t *const cat6_prob =
+#if CONFIG_VP9_HIGHBITDEPTH
+      (xd->bd == VPX_BITS_12) ? vp9_cat6_prob_high12 :
+      (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 :
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      vp9_cat6_prob;
+  const int cat6_bits =
+#if CONFIG_VP9_HIGHBITDEPTH
+      (xd->bd == VPX_BITS_12) ? 18 :
+      (xd->bd == VPX_BITS_10) ? 16 :
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      14;
+  // coef_counts / eob_branch_count stay unset (and unused) when counts is NULL.
+  if (counts) {
+    coef_counts = counts->coef[tx_size][type][ref];
+    eob_branch_count = counts->eob_branch[tx_size][type][ref];
+  }
+
+  while (c < max_eob) {
+    int val = -1;
+    band = *band_translate++;
+    prob = coef_probs[band][ctx];
+    if (counts)
+      ++eob_branch_count[band][ctx];
+    if (!vpx_read(r, prob[EOB_CONTEXT_NODE])) {
+      INCREMENT_COUNT(EOB_MODEL_TOKEN);
+      break;
+    }
+    // Skip over a run of ZERO tokens, marking each position in token_cache.
+    while (!vpx_read(r, prob[ZERO_CONTEXT_NODE])) {
+      INCREMENT_COUNT(ZERO_TOKEN);
+      dqv = dq[1];
+      token_cache[scan[c]] = 0;
+      ++c;
+      if (c >= max_eob)
+        return c;  // zero tokens at the end (no eob token)
+      ctx = get_coef_context(nb, token_cache, c);
+      band = *band_translate++;
+      prob = coef_probs[band][ctx];
+    }
+    // Non-zero token: either ONE, or a tree-coded token with value 2 or more.
+    if (!vpx_read(r, prob[ONE_CONTEXT_NODE])) {
+      INCREMENT_COUNT(ONE_TOKEN);
+      token = ONE_TOKEN;
+      val = 1;
+    } else {
+      INCREMENT_COUNT(TWO_TOKEN);
+      token = vpx_read_tree(r, vp9_coef_con_tree,
+                            vp9_pareto8_full[prob[PIVOT_NODE] - 1]);
+      switch (token) {
+        case TWO_TOKEN:
+        case THREE_TOKEN:
+        case FOUR_TOKEN:
+          val = token;  // for these tokens the token id is used as the value
+          break;
+        case CATEGORY1_TOKEN:
+          val = CAT1_MIN_VAL + read_coeff(vp9_cat1_prob, 1, r);
+          break;
+        case CATEGORY2_TOKEN:
+          val = CAT2_MIN_VAL + read_coeff(vp9_cat2_prob, 2, r);
+          break;
+        case CATEGORY3_TOKEN:
+          val = CAT3_MIN_VAL + read_coeff(vp9_cat3_prob, 3, r);
+          break;
+        case CATEGORY4_TOKEN:
+          val = CAT4_MIN_VAL + read_coeff(vp9_cat4_prob, 4, r);
+          break;
+        case CATEGORY5_TOKEN:
+          val = CAT5_MIN_VAL + read_coeff(vp9_cat5_prob, 5, r);
+          break;
+        case CATEGORY6_TOKEN:
+          val = CAT6_MIN_VAL + read_coeff(cat6_prob, cat6_bits, r);
+          break;
+      }
+    }
+    v = (val * dqv) >> dq_shift;  // dequantized magnitude; sign is read next
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_VP9_HIGHBITDEPTH
+    dqcoeff[scan[c]] = highbd_check_range((vpx_read_bit(r) ? -v : v),
+                                          xd->bd);
+#else
+    dqcoeff[scan[c]] = check_range(vpx_read_bit(r) ? -v : v);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#else
+    dqcoeff[scan[c]] = vpx_read_bit(r) ? -v : v;  // a set bit negates v
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+    token_cache[scan[c]] = vp9_pt_energy_class[token];
+    ++c;
+    ctx = get_coef_context(nb, token_cache, c);
+    dqv = dq[1];
+  }
+
+  return c;  // scan positions consumed before the EOB token or max_eob
+}
+
+// Decoder-side counterpart of vp9_set_context (TODO(slavarnway): fold back
+// into vp9_set_context once testing is complete, then delete this copy).
+static
+void dec_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+                      TX_SIZE tx_size, int has_eob,
+                      int aoff, int loff) {
+  ENTROPY_CONTEXT *const above = pd->above_context + aoff;
+  ENTROPY_CONTEXT *const left = pd->left_context + loff;
+  const int tx_blocks = 1 << tx_size;  // context entries covered per side
+
+  // Above row: entries beyond the computed visible width are cleared.
+  if (!has_eob || xd->mb_to_right_edge >= 0) {
+    memset(above, has_eob, sizeof(ENTROPY_CONTEXT) * tx_blocks);
+  } else {
+    const int visible = pd->n4_w +
+                        (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+    int flagged = tx_blocks;
+    int k;
+    if (flagged + aoff > visible)
+      flagged = visible - aoff;
+
+    for (k = 0; k < flagged; ++k)
+      above[k] = has_eob;
+    for (k = flagged; k < tx_blocks; ++k)
+      above[k] = 0;
+  }
+
+  // Left column: entries beyond the computed visible height are cleared.
+  if (!has_eob || xd->mb_to_bottom_edge >= 0) {
+    memset(left, has_eob, sizeof(ENTROPY_CONTEXT) * tx_blocks);
+  } else {
+    const int visible = pd->n4_h +
+                        (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+    int flagged = tx_blocks;
+    int k;
+    if (flagged + loff > visible)
+      flagged = visible - loff;
+
+    for (k = 0; k < flagged; ++k)
+      left[k] = has_eob;
+    for (k = flagged; k < tx_blocks; ++k)
+      left[k] = 0;
+  }
+}
+
+// Decodes one transform block's tokens, updates contexts, returns its eob.
+int vp9_decode_block_tokens(MACROBLOCKD *xd,
+                            int plane, const scan_order *sc,
+                            int x, int y,
+                            TX_SIZE tx_size, vpx_reader *r,
+                            int seg_id) {
+  struct macroblockd_plane *const mbp = &xd->plane[plane];
+  const int entropy_ctx = get_entropy_context(
+      tx_size, mbp->above_context + x, mbp->left_context + y);
+  const int n_coeffs = decode_coefs(
+      xd, get_plane_type(plane), mbp->dqcoeff, tx_size,
+      mbp->seg_dequant[seg_id], entropy_ctx, sc->scan, sc->neighbors, r);
+  dec_set_contexts(xd, mbp, tx_size, n_coeffs > 0, x, y);
+  return n_coeffs;
+}
+
+
diff --git a/libs/libvpx/vp9/decoder/vp9_detokenize.h b/libs/libvpx/vp9/decoder/vp9_detokenize.h
new file mode 100644
index 0000000000..d242d4466e
--- /dev/null
+++ b/libs/libvpx/vp9/decoder/vp9_detokenize.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_DECODER_VP9_DETOKENIZE_H_
+#define VP9_DECODER_VP9_DETOKENIZE_H_
+
+#include "vpx_dsp/bitreader.h"
+#include "vp9/decoder/vp9_decoder.h"
+#include "vp9/common/vp9_scan.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int vp9_decode_block_tokens(MACROBLOCKD *xd,
+                            int plane, const scan_order *sc,
+                            int x, int y,
+                            TX_SIZE tx_size, vpx_reader *r,
+                            int seg_id);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/libs/libvpx/vp9/decoder/vp9_dsubexp.c b/libs/libvpx/vp9/decoder/vp9_dsubexp.c
new file mode 100644
index 0000000000..05b38538ae
--- /dev/null
+++ b/libs/libvpx/vp9/decoder/vp9_dsubexp.c
@@ -0,0 +1,76 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_entropy.h"
+
+#include "vp9/decoder/vp9_dsubexp.h"
+
+// Undoes recentering around m: v > 2m maps to itself, odd v below m, even above.
+static int inv_recenter_nonneg(int v, int m) {
+  if (v > 2 * m) return v;
+  if (v & 1) return m - ((v + 1) >> 1);  // odd codes land below m
+  return m + (v >> 1);                   // even codes land at or above m
+}
+
+// Reads a 7-bit literal; values >= 65 take one extra bit to extend the range.
+static int decode_uniform(vpx_reader *r) {
+  const int threshold = (1 << 8) - 191;  // 65
+  const int base = vpx_read_literal(r, 8 - 1);
+  if (base < threshold) return base;
+  return (base << 1) - threshold + vpx_read_bit(r);
+}
+
+// Maps the decoded delta index v through inv_map_table and recenters it on the
+// current probability m, yielding the updated probability.
+static int inv_remap_prob(int v, int m) {
+  static const uint8_t inv_map_table[MAX_PROB] = {  // read-only lookup
+      7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176, 189,
+    202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,  10,  11,
+     12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,
+     28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
+     44,  45,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  60,
+     61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  73,  74,  75,  76,
+     77,  78,  79,  80,  81,  82,  83,  84,  86,  87,  88,  89,  90,  91,  92,
+     93,  94,  95,  96,  97,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
+    109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125,
+    126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141,
+    142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157,
+    158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173,
+    174, 175, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190,
+    191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
+    207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222,
+    223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
+    239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 253
+  };
+  assert(v >= 0 && v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0])));
+  v = inv_map_table[v];
+  m--;
+  // Pick the recentering form by whether 2 * (m - 1) exceeds MAX_PROB.
+  if ((m << 1) <= MAX_PROB) return 1 + inv_recenter_nonneg(v, m);
+  return MAX_PROB - inv_recenter_nonneg(v, MAX_PROB - 1 - m);
+}
+
+// Decodes a terminated sub-exponential code: each leading 1 bit moves on to
+// the next range, whose offset is 0, 16, 32 and finally 64.
+static int decode_term_subexp(vpx_reader *r) {
+  if (vpx_read_bit(r) == 0) return vpx_read_literal(r, 4);
+  if (vpx_read_bit(r) == 0) return vpx_read_literal(r, 4) + 16;
+  if (vpx_read_bit(r) == 0) return vpx_read_literal(r, 5) + 32;
+  // Three 1 bits: the remainder is coded by decode_uniform.
+  return decode_uniform(r) + 64;
+}
+
+// Reads an update flag; when set, decodes a delta and remaps *p in place.
+void vp9_diff_update_prob(vpx_reader *r, vpx_prob* p) {
+  const int update = vpx_read(r, DIFF_UPDATE_PROB);
+  if (!update) return;  // flag clear: *p is left unchanged
+  *p = (vpx_prob)inv_remap_prob(decode_term_subexp(r), *p);
+}
diff --git a/libs/libvpx/vp9/decoder/vp9_dsubexp.h b/libs/libvpx/vp9/decoder/vp9_dsubexp.h
new file mode 100644
index 0000000000..a8bcc70be9
--- /dev/null
+++ b/libs/libvpx/vp9/decoder/vp9_dsubexp.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_DECODER_VP9_DSUBEXP_H_
+#define VP9_DECODER_VP9_DSUBEXP_H_
+
+#include "vpx_dsp/bitreader.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_diff_update_prob(vpx_reader *r, vpx_prob* p);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_DECODER_VP9_DSUBEXP_H_
diff --git a/libs/libvpx/vp9/decoder/vp9_dthread.c b/libs/libvpx/vp9/decoder/vp9_dthread.c
new file mode 100644
index 0000000000..14a71448fe
--- /dev/null
+++ b/libs/libvpx/vp9/decoder/vp9_dthread.c
@@ -0,0 +1,189 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/decoder/vp9_dthread.h"
+#include "vp9/decoder/vp9_decoder.h"
+
+// #define DEBUG_THREAD
+
+// TODO(hkuang): Clean up all the #ifdef in this file.
+// Locks the worker's stats mutex; a no-op without CONFIG_MULTITHREAD.
+void vp9_frameworker_lock_stats(VPxWorker *const worker) {
+#if !CONFIG_MULTITHREAD
+  (void)worker;  // nothing to lock in this configuration
+#else
+  pthread_mutex_lock(&((FrameWorkerData *)worker->data1)->stats_mutex);
+#endif
+}
+
+// Unlocks the worker's stats mutex; a no-op without CONFIG_MULTITHREAD.
+void vp9_frameworker_unlock_stats(VPxWorker *const worker) {
+#if !CONFIG_MULTITHREAD
+  (void)worker;  // nothing to unlock in this configuration
+#else
+  pthread_mutex_unlock(&((FrameWorkerData *)worker->data1)->stats_mutex);
+#endif
+}
+
+// Wakes threads waiting on the worker's stats condition variable.
+void vp9_frameworker_signal_stats(VPxWorker *const worker) {
+#if !CONFIG_MULTITHREAD
+  (void)worker;  // no condition variable without CONFIG_MULTITHREAD
+#else
+  FrameWorkerData *const data = (FrameWorkerData *)worker->data1;
+
+// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper.
+#if !defined(_WIN32) || HAVE_PTHREAD_H
+  pthread_cond_broadcast(&data->stats_cond);
+#else
+  pthread_cond_signal(&data->stats_cond);  // _WIN32 wrapper path, see TODO
+#endif
+#endif  // CONFIG_MULTITHREAD
+}
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define BUILDING_WITH_TSAN
+#endif
+#endif
+
+// TODO(hkuang): Remove worker parameter as it is only used in debug code.
+// Blocks until ref_buf has been decoded up to row, or raises a corrupt-frame
+// error on the calling worker if ref_buf is marked corrupted.
+void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf,
+                          int row) {
+#if CONFIG_MULTITHREAD
+  if (!ref_buf) return;
+
+#ifndef BUILDING_WITH_TSAN
+  // The following line of code will get harmless tsan error but it is the key
+  // to get best performance.
+  if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return;
+#endif
+
+  {
+    // Find the worker thread that owns the reference frame. If the reference
+    // frame has been fully decoded, it may not have owner.
+    VPxWorker *const ref_worker = ref_buf->frame_worker_owner;
+    FrameWorkerData *const ref_worker_data =
+        (FrameWorkerData *)ref_worker->data1;
+    const VP9Decoder *const pbi = ref_worker_data->pbi;
+    int corrupted;
+#ifdef DEBUG_THREAD
+    {
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      printf("%d %p worker is waiting for %d %p worker (%d)  ref %d \r\n",
+             worker_data->worker_id, worker, ref_worker_data->worker_id,
+             ref_buf->frame_worker_owner, row, ref_buf->row);
+    }
+#endif
+
+    vp9_frameworker_lock_stats(ref_worker);
+    while (ref_buf->row < row && pbi->cur_buf == ref_buf &&
+           ref_buf->buf.corrupted != 1) {
+      pthread_cond_wait(&ref_worker_data->stats_cond,
+                        &ref_worker_data->stats_mutex);
+    }
+    corrupted = (ref_buf->buf.corrupted == 1);  // sampled under the lock
+    // Unlock exactly once: vpx_internal_error() may return instead of jumping.
+    vp9_frameworker_unlock_stats(ref_worker);
+    if (corrupted) {
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      vpx_internal_error(&worker_data->pbi->common.error,
+                         VPX_CODEC_CORRUPT_FRAME,
+                         "Worker %p failed to decode frame", worker);
+    }
+  }
+#else
+  (void)worker;
+  (void)ref_buf;
+  (void)row;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Publishes row as buf's decode progress and wakes any waiters, all under the
+// owning worker's stats lock.
+void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) {
+#if !CONFIG_MULTITHREAD
+  (void)buf;
+  (void)row;
+#else
+  VPxWorker *const owner = buf->frame_worker_owner;
+#ifdef DEBUG_THREAD
+  {
+    FrameWorkerData *const owner_data = (FrameWorkerData *)owner->data1;
+    printf("%d %p worker decode to (%d) \r\n", owner_data->worker_id,
+           buf->frame_worker_owner, row);
+  }
+#endif
+  vp9_frameworker_lock_stats(owner);
+  buf->row = row;
+  vp9_frameworker_signal_stats(owner);
+  vp9_frameworker_unlock_stats(owner);
+#endif  // CONFIG_MULTITHREAD
+}
+
+void vp9_frameworker_copy_context(VPxWorker *const dst_worker,
+                                  VPxWorker *const src_worker) {
+#if CONFIG_MULTITHREAD
+  FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
+  FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
+  VP9_COMMON *const src_cm = &src_worker_data->pbi->common;
+  VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common;
+  int i;
+  // Seeds dst's VP9_COMMON from src once src's frame context is ready.
+  // Wait until source frame's context is ready.
+  vp9_frameworker_lock_stats(src_worker);
+  while (!src_worker_data->frame_context_ready) {
+    pthread_cond_wait(&src_worker_data->stats_cond,
+        &src_worker_data->stats_mutex);
+  }
+  // These two fields are copied while src's stats lock is still held.
+  dst_cm->last_frame_seg_map = src_cm->seg.enabled ?
+      src_cm->current_frame_seg_map : src_cm->last_frame_seg_map;
+  dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
+  vp9_frameworker_unlock_stats(src_worker);
+  // Everything below is copied after the lock has been released.
+  dst_cm->bit_depth = src_cm->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+  dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
+#endif
+  dst_cm->prev_frame = src_cm->show_existing_frame ?
+                       src_cm->prev_frame : src_cm->cur_frame;
+  dst_cm->last_width = !src_cm->show_existing_frame ?
+                       src_cm->width : src_cm->last_width;
+  dst_cm->last_height = !src_cm->show_existing_frame ?
+                        src_cm->height : src_cm->last_height;
+  dst_cm->subsampling_x = src_cm->subsampling_x;
+  dst_cm->subsampling_y = src_cm->subsampling_y;
+  dst_cm->frame_type = src_cm->frame_type;
+  dst_cm->last_show_frame = !src_cm->show_existing_frame ?
+                            src_cm->show_frame : src_cm->last_show_frame;
+  for (i = 0; i < REF_FRAMES; ++i)  // src's *next* map becomes dst's map
+    dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
+  // Loop-filter state: src's sharpness_level becomes dst's "last" level.
+  memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
+         (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
+  dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
+  dst_cm->lf.filter_level = src_cm->lf.filter_level;
+  memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS);
+  memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+  dst_cm->seg = src_cm->seg;  // whole-struct copy
+  memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
+         FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0]));
+#else
+  (void) dst_worker;
+  (void) src_worker;
+#endif  // CONFIG_MULTITHREAD
+}
diff --git a/libs/libvpx/vp9/decoder/vp9_dthread.h b/libs/libvpx/vp9/decoder/vp9_dthread.h
new file mode 100644
index 0000000000..ba7c38a511
--- /dev/null
+++ b/libs/libvpx/vp9/decoder/vp9_dthread.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_DECODER_VP9_DTHREAD_H_
+#define VP9_DECODER_VP9_DTHREAD_H_
+
+#include "./vpx_config.h"
+#include "vpx_util/vpx_thread.h"
+#include "vpx/internal/vpx_codec_internal.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9Common;
+struct VP9Decoder;
+
+// WorkerData for the FrameWorker thread. It contains all the information of
+// the worker and decode structures for decoding a frame.
+typedef struct FrameWorkerData {
+  struct VP9Decoder *pbi;  // decoder instance reached through this worker
+  const uint8_t *data;  // NOTE(review): presumably start of compressed input
+  const uint8_t *data_end;  // NOTE(review): presumably end of that input
+  size_t data_size;
+  void *user_priv;
+  int result;
+  int worker_id;  // printed by the DEBUG_THREAD traces
+  int received_frame;
+
+  // scratch_buffer is used in frame parallel mode only.
+  // It is used to make a copy of the compressed data.
+  uint8_t *scratch_buffer;
+  size_t scratch_buffer_size;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t stats_mutex;  // taken by vp9_frameworker_lock_stats()
+  pthread_cond_t stats_cond;    // waited on together with stats_mutex
+#endif
+
+  int frame_context_ready;  // Current frame's context is ready to read.
+  int frame_decoded;        // Finished decoding current frame.
+} FrameWorkerData;
+
+void vp9_frameworker_lock_stats(VPxWorker *const worker);
+void vp9_frameworker_unlock_stats(VPxWorker *const worker);
+void vp9_frameworker_signal_stats(VPxWorker *const worker);
+
+// Wait until ref_buf has been decoded to row in real pixel unit.
+// Note: worker may already finish decoding ref_buf and release it in order to
+// start decoding next frame. So need to check whether worker is still decoding
+// ref_buf.
+void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf,
+                          int row);
+
+// FrameWorker broadcasts its decoding progress so other workers that are
+// waiting on it can resume decoding.
+void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row);
+
+// Copy necessary decoding context from src worker to dst worker.
+void vp9_frameworker_copy_context(VPxWorker *const dst_worker,
+                                  VPxWorker *const src_worker);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // VP9_DECODER_VP9_DTHREAD_H_
diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
new file mode 100644
index 0000000000..11e8773060
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -0,0 +1,36 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Runs the 8x8 forward DCT into a local buffer, then quantizes that buffer.
+void vp9_fdct8x8_quant_neon(const int16_t *input, int stride,
+                            int16_t* coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t* zbin_ptr,
+                            const int16_t* round_ptr, const int16_t* quant_ptr,
+                            const int16_t* quant_shift_ptr,
+                            int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr,
+                            const int16_t* dequant_ptr, uint16_t* eob_ptr,
+                            const int16_t* scan_ptr, const int16_t* iscan_ptr) {
+  int16_t dct_out[64];  // 8x8 transform output; coeff_ptr is not written
+  (void)coeff_ptr;
+
+  vpx_fdct8x8_neon(input, dct_out, stride);
+  vp9_quantize_fp_neon(dct_out, n_coeffs, skip_block, zbin_ptr, round_ptr,
+                       quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+                       dequant_ptr, eob_ptr, scan_ptr, iscan_ptr);
+}
diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c
new file mode 100644
index 0000000000..1c7503139e
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+
+// Sum of squared differences between coeff and dqcoeff over block_size
+// entries, which must be a positive multiple of 8.
+int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff,
+                                int block_size) {
+  int64x2_t sse = vdupq_n_s64(0);
+  int offset = 0;
+
+  assert(block_size >= 8);
+  assert((block_size % 8) == 0);
+
+  do {
+    const int16x8_t delta = vsubq_s16(vld1q_s16(coeff + offset),
+                                      vld1q_s16(dqcoeff + offset));
+    const int16x4_t lo = vget_low_s16(delta);
+    const int16x4_t hi = vget_high_s16(delta);
+    // diff is 15-bits, the squares 30, so two squares fit in 31 bits before
+    // being widened and accumulated in 64 bits.
+    const int32x4_t sq = vmlal_s16(vmull_s16(lo, lo), hi, hi);
+    const int64x2_t wide = vaddl_s32(vget_low_s32(sq), vget_high_s32(sq));
+    sse = vaddq_s64(sse, wide);
+    offset += 8;
+  } while (offset != block_size);
+
+  // Fold the two 64-bit lanes into the scalar result.
+  return vgetq_lane_s64(sse, 0) + vgetq_lane_s64(sse, 1);
+}
diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
new file mode 100644
index 0000000000..47363c75ba
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -0,0 +1,118 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rd.h"
+
+void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
+                          int skip_block, const int16_t *zbin_ptr,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+                          int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                          uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)scan;
+  // A nonzero skip_block zeroes both outputs and sets *eob_ptr to 0 (below).
+  if (!skip_block) {
+    // Quantization pass, eight coefficients per step: writes qcoeff_ptr and
+    // dqcoeff_ptr, and tracks the largest iscan[] + 1 of any nonzero result.
+    int i;
+    const int16x8_t v_zero = vdupq_n_s16(0);
+    const int16x8_t v_one = vdupq_n_s16(1);
+    int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+    int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
+    int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
+    int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
+    // adjust for dc: lane 0 takes the [0] entries, lanes 1-7 keep the [1] ones
+    v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
+    v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+    v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+    // process dc and the first seven ac coeffs
+    {  // q = ((|c| + round) * quant) >> 16 with c's sign; dq = q * dequant
+      const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
+      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);  // 0 or -1
+      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+      const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
+                                           vget_low_s16(v_quant));
+      const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
+                                           vget_high_s16(v_quant));
+      const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16),
+                                            vshrn_n_s32(v_tmp_hi, 16));
+      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);  // set where 0
+      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+      vst1q_s16(&qcoeff_ptr[0], v_qcoeff);
+      vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff);
+      v_round = vmovq_n_s16(round_ptr[1]);
+      v_quant = vmovq_n_s16(quant_ptr[1]);
+      v_dequant = vmovq_n_s16(dequant_ptr[1]);
+    }
+    // now process the rest of the ac coeffs with the [1] constants in all lanes
+    for (i = 8; i < count; i += 8) {
+      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);  // 0 or -1
+      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+      const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
+                                           vget_low_s16(v_quant));
+      const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
+                                           vget_high_s16(v_quant));
+      const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16),
+                                            vshrn_n_s32(v_tmp_hi, 16));
+      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);  // set where 0
+      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+      vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+      vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+    }
+    {  // reduce the eight lanes to their maximum, then store it as *eob_ptr
+      const int16x4_t v_eobmax_3210 =
+          vmax_s16(vget_low_s16(v_eobmax_76543210),
+                   vget_high_s16(v_eobmax_76543210));
+      const int64x1_t v_eobmax_xx32 =
+          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+      const int16x4_t v_eobmax_tmp =
+          vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+      const int64x1_t v_eobmax_xxx3 =
+          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+      const int16x4_t v_eobmax_final =
+          vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+      *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+    }
+  } else {
+    memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+    memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+    *eob_ptr = 0;
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c
new file mode 100644
index 0000000000..1dc70bd82f
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c
@@ -0,0 +1,114 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize)                                   \
+static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr,     \
+                                             const int16_t *dq_coeff_ptr,  \
+                                             int64_t *ssz) {               \
+  int64_t err = 0;                                                         \
+  uint32_t loop_cnt;                                                       \
+  v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h;                             \
+  v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w;                              \
+  v2i64 sq_coeff_r, sq_coeff_l;                                            \
+  v2i64 err0, err_dup0, err1, err_dup1;                                    \
+  /* Coefficients 0..7; DOTP_* presumably initialises the sums. */         \
+  coeff = LD_SH(coeff_ptr);                                                \
+  dq_coeff = LD_SH(dq_coeff_ptr);                                          \
+  UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                                \
+  ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                      \
+  HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                       \
+  DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w,                  \
+              sq_coeff_r, sq_coeff_l);                                     \
+  DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1);                 \
+  /* Coefficients 8..15; DPADD_* presumably adds to the sums. */           \
+  coeff = LD_SH(coeff_ptr + 8);                                            \
+  dq_coeff = LD_SH(dq_coeff_ptr + 8);                                      \
+  UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                                \
+  ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                      \
+  HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                       \
+  DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);              \
+  DPADD_SD2_SD(diff_r, diff_l, err0, err1);                                \
+                                                                           \
+  coeff_ptr += 16;                                                         \
+  dq_coeff_ptr += 16;                                                      \
+  /* Remaining coefficients, 16 per pass: (BSize / 16) - 1 passes. */      \
+  for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) {                       \
+    coeff = LD_SH(coeff_ptr);                                              \
+    dq_coeff = LD_SH(dq_coeff_ptr);                                        \
+    UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                              \
+    ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                    \
+    HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                     \
+    DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);            \
+    DPADD_SD2_SD(diff_r, diff_l, err0, err1);                              \
+                                                                           \
+    coeff = LD_SH(coeff_ptr + 8);                                          \
+    dq_coeff = LD_SH(dq_coeff_ptr + 8);                                    \
+    UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w);                              \
+    ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h);                    \
+    HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l);                     \
+    DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l);            \
+    DPADD_SD2_SD(diff_r, diff_l, err0, err1);                              \
+                                                                           \
+    coeff_ptr += 16;                                                       \
+    dq_coeff_ptr += 16;                                                    \
+  }                                                                        \
+  /* Add both 64-bit lanes of sq_coeff_r and sq_coeff_l into *ssz. */      \
+  err_dup0 = __msa_splati_d(sq_coeff_r, 1);                                \
+  err_dup1 = __msa_splati_d(sq_coeff_l, 1);                                \
+  sq_coeff_r += err_dup0;                                                  \
+  sq_coeff_l += err_dup1;                                                  \
+  *ssz = __msa_copy_s_d(sq_coeff_r, 0);                                    \
+  *ssz += __msa_copy_s_d(sq_coeff_l, 0);                                   \
+  /* Same lane fold for err0/err1; that total is the return value. */      \
+  err_dup0 = __msa_splati_d(err0, 1);                                      \
+  err_dup1 = __msa_splati_d(err1, 1);                                      \
+  err0 += err_dup0;                                                        \
+  err1 += err_dup1;                                                        \
+  err = __msa_copy_s_d(err0, 0);                                           \
+  err += __msa_copy_s_d(err1, 0);                                          \
+                                                                           \
+  return err;                                                              \
+}
+
+BLOCK_ERROR_BLOCKSIZE_MSA(16)   /* No ';' here: each line expands to a  */
+BLOCK_ERROR_BLOCKSIZE_MSA(64)   /* complete function definition named   */
+BLOCK_ERROR_BLOCKSIZE_MSA(256)  /* block_error_<N>size_msa(), and ISO C  */
+BLOCK_ERROR_BLOCKSIZE_MSA(1024) /* forbids an empty file-scope ';'.      */
+
+int64_t vp9_block_error_msa(const tran_low_t *coeff_ptr,
+                            const tran_low_t *dq_coeff_ptr,
+                            intptr_t blk_size, int64_t *ssz) {
+  /* Returns the block error and stores a second sum through ssz, using a
+   * size-specific MSA kernel when one exists and the C version otherwise.
+   * The kernels take int16_t pointers, so the inputs are cast for them. */
+  const int16_t *const src = (const int16_t *)coeff_ptr;
+  const int16_t *const deq = (const int16_t *)dq_coeff_ptr;
+
+  /* Block sizes that have a kernel from BLOCK_ERROR_BLOCKSIZE_MSA. */
+  if (blk_size == 16) {
+    return block_error_16size_msa(src, deq, ssz);
+  }
+  if (blk_size == 64) {
+    return block_error_64size_msa(src, deq, ssz);
+  }
+  if (blk_size == 256) {
+    return block_error_256size_msa(src, deq, ssz);
+  }
+  if (blk_size == 1024) {
+    return block_error_1024size_msa(src, deq, ssz);
+  }
+
+  /* No dedicated kernel for this size: defer to the C implementation,
+   * which receives the original tran_low_t pointers. */
+  return vp9_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);
+}
diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
new file mode 100644
index 0000000000..6dabb58900
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
@@ -0,0 +1,507 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_enums.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+
+static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,
+                                   const int32_t *const0, int16_t *int_buf) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+  v4i32 k0, k1, k2, k3;
+  /* Step 1 of the ADST column pass: reads 16 input rows, fills int_buf. */
+  /* load input rows 0, 15, 7 and 8 (row pitch is stride) */
+  r0 = LD_SH(input);
+  r15 = LD_SH(input + 15 * stride);
+  r7 = LD_SH(input + 7 * stride);
+  r8 = LD_SH(input + 8 * stride);
+  SLLI_4V(r0, r15, r7, r8, 2);  /* presumably << 2 - TODO confirm */
+
+  /* stage 1: k0..k3 come from the constant table passed in as const0 */
+  LD_SW2(const0, 4, k0, k1);
+  LD_SW2(const0 + 8, 4, k2, k3);
+  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+  /* input rows 3, 4, 11 and 12 */
+  r3 = LD_SH(input + 3 * stride);
+  r4 = LD_SH(input + 4 * stride);
+  r11 = LD_SH(input + 11 * stride);
+  r12 = LD_SH(input + 12 * stride);
+  SLLI_4V(r3, r4, r11, r12, 2);
+
+  LD_SW2(const0 + 4 * 4, 4, k0, k1);
+  LD_SW2(const0 + 4 * 6, 4, k2, k3);
+  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+  /* stage 2: results are parked in int_buf in units of 8 int16_t */
+  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+  ST_SH2(tp0, tp2, int_buf, 8);
+  ST_SH2(tp1, tp3, int_buf + 4 * 8, 8);
+
+  LD_SW2(const0 + 4 * 8, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 10);  /* only 3 constants here; k0 is passed twice */
+  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+  ST_SH2(h0, h1, int_buf + 8 * 8, 8);
+  ST_SH2(h3, h2, int_buf + 12 * 8, 8);
+  /* input rows 9, 6, 1 and 14 */
+  r9 = LD_SH(input + 9 * stride);
+  r6 = LD_SH(input + 6 * stride);
+  r1 = LD_SH(input + stride);
+  r14 = LD_SH(input + 14 * stride);
+  SLLI_4V(r9, r6, r1, r14, 2);
+
+  LD_SW2(const0 + 4 * 11, 4, k0, k1);
+  LD_SW2(const0 + 4 * 13, 4, k2, k3);
+  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+
+  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+  /* input rows 13, 2, 5 and 10 */
+  r13 = LD_SH(input + 13 * stride);
+  r2 = LD_SH(input + 2 * stride);
+  r5 = LD_SH(input + 5 * stride);
+  r10 = LD_SH(input + 10 * stride);
+  SLLI_4V(r13, r2, r5, r10, 2);
+
+  LD_SW2(const0 + 4 * 15, 4, k0, k1);
+  LD_SW2(const0 + 4 * 17, 4, k2, k3);
+  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+
+  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+
+  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0,
+                                   int16_t *out) {
+  int16_t *out_ptr = out + 128;  /* i.e. out + 8 * 16 */
+  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+  v4i32 k0, k1, k2, k3;
+  /* Step 2 of the ADST column pass: consumes int_buf, stores 16 rows to out. */
+  LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15);
+  LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7);
+  LD_SW2(const0 + 4 * 19, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 21);  /* only 3 constants here; k0 is passed twice */
+  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+  tp0 = LD_SH(int_buf + 4 * 8);
+  tp1 = LD_SH(int_buf + 5 * 8);
+  tp3 = LD_SH(int_buf + 10 * 8);
+  tp2 = LD_SH(int_buf + 14 * 8);
+  LD_SW2(const0 + 4 * 22, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 24);
+  MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+  out4 = -out4;
+  ST_SH(out4, (out + 3 * 16));
+  ST_SH(out5, (out_ptr + 4 * 16));
+  /* same k0..k2 as the previous MADD_BF */
+  h1 = LD_SH(int_buf + 9 * 8);
+  h3 = LD_SH(int_buf + 12 * 8);
+  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+  out13 = -out13;
+  ST_SH(out12, (out + 2 * 16));
+  ST_SH(out13, (out_ptr + 5 * 16));
+
+  tp0 = LD_SH(int_buf);
+  tp1 = LD_SH(int_buf + 8);
+  tp2 = LD_SH(int_buf + 2 * 8);
+  tp3 = LD_SH(int_buf + 6 * 8);
+
+  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+  out1 = -out1;
+  ST_SH(out0, (out));
+  ST_SH(out1, (out_ptr + 7 * 16));
+
+  h0 = LD_SH(int_buf + 8 * 8);
+  h2 = LD_SH(int_buf + 13 * 8);
+
+  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+  out8 = -out8;
+  ST_SH(out8, (out + 16));
+  ST_SH(out9, (out_ptr + 6 * 16));
+
+  /* stage 4: the remaining eight rows go through MADD_SHORT before storing */
+  LD_SW2(const0 + 4 * 25, 4, k0, k1);
+  LD_SW2(const0 + 4 * 27, 4, k2, k3);
+  MADD_SHORT(h10, h11, k1, k2, out2, out3);
+  ST_SH(out2, (out + 7 * 16));
+  ST_SH(out3, (out_ptr));
+
+  MADD_SHORT(out6, out7, k0, k3, out6, out7);
+  ST_SH(out6, (out + 4 * 16));
+  ST_SH(out7, (out_ptr + 3 * 16));
+
+  MADD_SHORT(out10, out11, k0, k3, out10, out11);
+  ST_SH(out10, (out + 6 * 16));
+  ST_SH(out11, (out_ptr + 16));
+
+  MADD_SHORT(out14, out15, k1, k2, out14, out15);
+  ST_SH(out14, (out + 5 * 16));
+  ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+  /* Four 8x8 tiles: transpose, FDCT_POSTPROC_2V_NEG_H, store 64 values each. */
+  /* tile 1: eight vectors from input at a pitch of 16 */
+  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  FDCT_POSTPROC_2V_NEG_H(r0, r1);
+  FDCT_POSTPROC_2V_NEG_H(r2, r3);
+  FDCT_POSTPROC_2V_NEG_H(r4, r5);
+  FDCT_POSTPROC_2V_NEG_H(r6, r7);
+  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+  out += 64;
+  /* tile 2: same rows, starting 8 values further along each row */
+  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  FDCT_POSTPROC_2V_NEG_H(r8, r9);
+  FDCT_POSTPROC_2V_NEG_H(r10, r11);
+  FDCT_POSTPROC_2V_NEG_H(r12, r13);
+  FDCT_POSTPROC_2V_NEG_H(r14, r15);
+  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+  out += 64;
+
+  /* tiles 3 and 4: repeat for the next 128 values (8 * 16) of input */
+  input += 128;
+  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  FDCT_POSTPROC_2V_NEG_H(r0, r1);
+  FDCT_POSTPROC_2V_NEG_H(r2, r3);
+  FDCT_POSTPROC_2V_NEG_H(r4, r5);
+  FDCT_POSTPROC_2V_NEG_H(r6, r7);
+  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+  out += 64;
+
+  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  FDCT_POSTPROC_2V_NEG_H(r8, r9);
+  FDCT_POSTPROC_2V_NEG_H(r10, r11);
+  FDCT_POSTPROC_2V_NEG_H(r12, r13);
+  FDCT_POSTPROC_2V_NEG_H(r14, r15);
+  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+}
+
+static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
+                                   int16_t *int_buf) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+  v4i32 k0, k1, k2, k3;
+  /* Step 1 of the ADST row pass: input is read at a pitch of 8, unshifted. */
+  /* load input vectors 0, 7, 8 and 15 */
+  r0 = LD_SH(input);
+  r7 = LD_SH(input + 7 * 8);
+  r8 = LD_SH(input + 8 * 8);
+  r15 = LD_SH(input + 15 * 8);
+
+  /* stage 1: k0..k3 come from the constant table passed in as const0 */
+  LD_SW2(const0, 4, k0, k1);
+  LD_SW2(const0 + 4 * 2, 4, k2, k3);
+  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+  /* input vectors 3, 4, 11 and 12 */
+  r3 = LD_SH(input + 3 * 8);
+  r4 = LD_SH(input + 4 * 8);
+  r11 = LD_SH(input + 11 * 8);
+  r12 = LD_SH(input + 12 * 8);
+
+  LD_SW2(const0 + 4 * 4, 4, k0, k1);
+  LD_SW2(const0 + 4 * 6, 4, k2, k3);
+  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+  /* stage 2: results are parked in int_buf in units of 8 int16_t */
+  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+  ST_SH2(tp0, tp1, int_buf, 4 * 8);
+  ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);
+
+  LD_SW2(const0 + 4 * 8, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 10);  /* only 3 constants here; k0 is passed twice */
+  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+  ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
+  ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);
+  /* input vectors 1, 6, 9 and 14 */
+  r1 = LD_SH(input + 8);
+  r6 = LD_SH(input + 6 * 8);
+  r9 = LD_SH(input + 9 * 8);
+  r14 = LD_SH(input + 14 * 8);
+
+  LD_SW2(const0 + 4 * 11, 4, k0, k1);
+  LD_SW2(const0 + 4 * 13, 4, k2, k3);
+  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+  /* input vectors 2, 5, 10 and 13 */
+  r2 = LD_SH(input + 2 * 8);
+  r5 = LD_SH(input + 5 * 8);
+  r10 = LD_SH(input + 10 * 8);
+  r13 = LD_SH(input + 13 * 8);
+
+  LD_SW2(const0 + 4 * 15, 4, k0, k1);
+  LD_SW2(const0 + 4 * 17, 4, k2, k3);
+  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0,
+                                   int16_t *out) {
+  int16_t *out_ptr = out + 8;  /* second group of 8 within each 16-wide row */
+  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+  v4i32 k0, k1, k2, k3;
+  /* Step 2 of the ADST row pass: consumes int_buf, stores 16 vectors to out. */
+  g13 = LD_SH(int_buf + 3 * 8);
+  g15 = LD_SH(int_buf + 7 * 8);
+  g5 = LD_SH(int_buf + 11 * 8);
+  g7 = LD_SH(int_buf + 15 * 8);
+
+  LD_SW2(const0 + 4 * 19, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 21);  /* only 3 constants here; k0 is passed twice */
+  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+  tp0 = LD_SH(int_buf + 4 * 8);
+  tp1 = LD_SH(int_buf + 5 * 8);
+  tp3 = LD_SH(int_buf + 10 * 8);
+  tp2 = LD_SH(int_buf + 14 * 8);
+
+  LD_SW2(const0 + 4 * 22, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 24);
+  MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+  out4 = -out4;
+  ST_SH(out4, (out + 3 * 16));
+  ST_SH(out5, (out_ptr + 4 * 16));
+  /* same k0..k2 as the previous MADD_BF */
+  h1 = LD_SH(int_buf + 9 * 8);
+  h3 = LD_SH(int_buf + 12 * 8);
+  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+  out13 = -out13;
+  ST_SH(out12, (out + 2 * 16));
+  ST_SH(out13, (out_ptr + 5 * 16));
+
+  tp0 = LD_SH(int_buf);
+  tp1 = LD_SH(int_buf + 8);
+  tp2 = LD_SH(int_buf + 2 * 8);
+  tp3 = LD_SH(int_buf + 6 * 8);
+
+  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+  out1 = -out1;
+  ST_SH(out0, (out));
+  ST_SH(out1, (out_ptr + 7 * 16));
+
+  h0 = LD_SH(int_buf + 8 * 8);
+  h2 = LD_SH(int_buf + 13 * 8);
+  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+  out8 = -out8;
+  ST_SH(out8, (out + 16));
+  ST_SH(out9, (out_ptr + 6 * 16));
+
+  /* stage 4: the remaining eight vectors go through MADD_SHORT first */
+  LD_SW2(const0 + 4 * 25, 4, k0, k1);
+  LD_SW2(const0 + 4 * 27, 4, k2, k3);
+  MADD_SHORT(h10, h11, k1, k2, out2, out3);
+  ST_SH(out2, (out + 7 * 16));
+  ST_SH(out3, (out_ptr));
+
+  MADD_SHORT(out6, out7, k0, k3, out6, out7);
+  ST_SH(out6, (out + 4 * 16));
+  ST_SH(out7, (out_ptr + 3 * 16));
+
+  MADD_SHORT(out10, out11, k0, k3, out10, out11);
+  ST_SH(out10, (out + 6 * 16));
+  ST_SH(out11, (out_ptr + 16));
+
+  MADD_SHORT(out14, out15, k1, k2, out14, out15);
+  ST_SH(out14, (out + 5 * 16));
+  ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_transpose_msa(int16_t *input, int16_t *out) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+  /* Two rounds of 128 values: a pair of 8x8 transposes, then re-interleave. */
+  /* load 16 vectors at a pitch of 8, alternating between l0-l7 and l8-l15 */
+  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11,
+          l4, l12, l5, l13, l6, l14, l7, l15);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+  out += 16 * 8;
+
+  /* second round: the next 128 values of input, same pattern */
+  input += 128;
+  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11,
+          l4, l12, l5, l13, l6, l14, l7, l15);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+}
+
+static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {
+  int16_t *temp = intermediate;
+  int16_t *out = output;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11;
+  v8i16 in12, in13, in14, in15;
+  /* Transpose + FDCT_POSTPROC_2V_NEG_H, then the FDCT8x16 even/odd macros. */
+  LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+  temp = intermediate + 8;
+  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                     in8, in9, in10, in11, in12, in13, in14, in15);
+  FDCT_POSTPROC_2V_NEG_H(in0, in1);
+  FDCT_POSTPROC_2V_NEG_H(in2, in3);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  FDCT_POSTPROC_2V_NEG_H(in6, in7);
+  FDCT_POSTPROC_2V_NEG_H(in8, in9);
+  FDCT_POSTPROC_2V_NEG_H(in10, in11);
+  FDCT_POSTPROC_2V_NEG_H(in12, in13);
+  FDCT_POSTPROC_2V_NEG_H(in14, in15);
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  temp = intermediate;  /* in8..in15 are spilled here, overwriting the input */
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16);
+  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  temp = intermediate;  /* reload the spilled in8..in15 for the odd half */
+  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15,
+               in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3,
+                     tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3);
+  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16);
+  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7,
+                     tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7);
+  out = output + 8;  /* second group of 8 within each 16-wide output row */
+  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16);
+}
+
+void vp9_fht16x16_msa(const int16_t *input, int16_t *output,
+                      int32_t stride, int32_t tx_type) {
+  DECLARE_ALIGNED(32, int16_t, tmp[256]);
+  DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
+  int32_t i;
+  int16_t *ptmpbuf = &tmp_buf[0];
+  int16_t *trans = &trans_buf[0];
+  const int32_t const_arr[29 * 4] = {  /* 29 constants, each stored 4 times */
+    52707308, 52707308, 52707308, 52707308,
+    -1072430300, -1072430300, -1072430300, -1072430300,
+    795618043, 795618043, 795618043, 795618043,
+    -721080468, -721080468, -721080468, -721080468,
+    459094491, 459094491, 459094491, 459094491,
+    -970646691, -970646691, -970646691, -970646691,
+    1010963856, 1010963856, 1010963856, 1010963856,
+    -361743294, -361743294, -361743294, -361743294,
+    209469125, 209469125, 209469125, 209469125,
+    -1053094788, -1053094788, -1053094788, -1053094788,
+    1053160324, 1053160324, 1053160324, 1053160324,
+    639644520, 639644520, 639644520, 639644520,
+    -862444000, -862444000, -862444000, -862444000,
+    1062144356, 1062144356, 1062144356, 1062144356,
+    -157532337, -157532337, -157532337, -157532337,
+    260914709, 260914709, 260914709, 260914709,
+    -1041559667, -1041559667, -1041559667, -1041559667,
+    920985831, 920985831, 920985831, 920985831,
+    -551995675, -551995675, -551995675, -551995675,
+    596522295, 596522295, 596522295, 596522295,
+    892853362, 892853362, 892853362, 892853362,
+    -892787826, -892787826, -892787826, -892787826,
+    410925857, 410925857, 410925857, 410925857,
+    -992012162, -992012162, -992012162, -992012162,
+    992077698, 992077698, 992077698, 992077698,
+    759246145, 759246145, 759246145, 759246145,
+    -759180609, -759180609, -759180609, -759180609,
+    -759222975, -759222975, -759222975, -759222975,
+    759288511, 759288511, 759288511, 759288511 };
+  /* Each case runs a column pass and then a row pass, in two halves each. */
+  switch (tx_type) {
+    case DCT_DCT:  /* fdct helpers for both passes */
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+      }
+
+      /* row transform */
+      for (i = 0; i < 2; ++i) {
+        fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+      }
+      break;
+    case ADST_DCT:  /* fadst16 columns, fdct-style rows */
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+      }
+
+      /* row transform */
+      for (i = 0; i < 2; ++i) {
+        postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+      }
+      break;
+    case DCT_ADST:  /* fdct columns, fadst16 rows */
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+      }
+
+      fadst16_transpose_postproc_msa(tmp, trans);
+
+      /* row transform; tmp is reused as the output of the row steps */
+      for (i = 0; i < 2; ++i) {
+        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+      }
+
+      fadst16_transpose_msa(tmp, output);
+      break;
+    case ADST_ADST:  /* fadst16 helpers for both passes */
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+      }
+
+      fadst16_transpose_postproc_msa(tmp, trans);
+
+      /* row transform; tmp is reused as the output of the row steps */
+      for (i = 0; i < 2; ++i) {
+        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+      }
+
+      fadst16_transpose_msa(tmp, output);
+      break;
+    default:  /* any other tx_type is treated as a programming error */
+      assert(0);
+      break;
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
new file mode 100644
index 0000000000..574016f155
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
@@ -0,0 +1,99 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_enums.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+
+void vp9_fwht4x4_msa(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  v8i16 in0, in1, in2, in3, in4;
+  /* 4x4 forward transform built only from adds, subtracts and shifts. */
+  LD_SH4(input, src_stride, in0, in1, in2, in3);
+  /* First pass; in4 = (in0 - in3) >> 1 feeds both SUB2 subtractions. */
+  in0 += in1;
+  in3 -= in2;
+  in4 = (in0 - in3) >> 1;
+  SUB2(in4, in1, in4, in2, in1, in2);
+  in0 -= in2;
+  in3 += in1;
+  /* Transpose; note the register order in0, in2, in3, in1 from here on. */
+  TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+  /* Second pass: same steps as the first, on the permuted registers. */
+  in0 += in2;
+  in1 -= in3;
+  in4 = (in0 - in1) >> 1;
+  SUB2(in4, in2, in4, in3, in2, in3);
+  in0 -= in3;
+  in1 += in2;
+  /* Shift all four vectors left by 2 (scale by 4). */
+  SLLI_4V(in0, in1, in2, in3, 2);
+  /* Transpose again, now in the permuted order in0, in3, in1, in2. */
+  TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);
+  /* Store in that same order at output + 0, 4, 8 and 12. */
+  ST4x2_UB(in0, output, 4);
+  ST4x2_UB(in3, output + 4, 4);
+  ST4x2_UB(in1, output + 8, 4);
+  ST4x2_UB(in2, output + 12, 4);
+}
+
+void vp9_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride,
+                    int32_t tx_type) {
+  v8i16 in0, in1, in2, in3;
+  /* 4x4 forward hybrid transform: tx_type picks FDCT4/FADST4 per pass. */
+  LD_SH4(input, stride, in0, in1, in2, in3);
+
+  /* fdct4 pre-process: inputs << 4, then in0 += mask where in0 is non-zero */
+  {
+    v8i16 temp, mask;
+    v16i8 zero = { 0 };
+    v16i8 one = __msa_ldi_b(1);
+    /* NOTE(review): mask presumably keeps only lane 0 - confirm sldi_b. */
+    mask = (v8i16)__msa_sldi_b(zero, one, 15);
+    SLLI_4V(in0, in1, in2, in3, 4);
+    temp = __msa_ceqi_h(in0, 0);  /* lanes where in0 == 0 */
+    temp = (v8i16)__msa_xori_b((v16u8)temp, 255);  /* inverted: in0 != 0 */
+    temp = mask & temp;
+    in0 += temp;
+  }
+  /* Each case runs pass 1, a 4x4 transpose, then pass 2. */
+  switch (tx_type) {
+    case DCT_DCT:
+      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case ADST_DCT:
+      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case DCT_ADST:
+      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    case ADST_ADST:
+      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+  /* Transpose back, add 1 and shift right by 2, then store the result. */
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  SRA_4V(in0, in1, in2, in3, 2);
+  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
+  ST_SH2(in0, in2, output, 8);
+}
diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
new file mode 100644
index 0000000000..7c3c635f8d
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
@@ -0,0 +1,66 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_enums.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+
+void vp9_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride,
+                    int32_t tx_type) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  /* 8x8 forward hybrid transform; inputs are first shifted left by 2. */
+  LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+  SLLI_4V(in0, in1, in2, in3, 2);
+  SLLI_4V(in4, in5, in6, in7, 2);
+  /* Each case runs pass 1, an 8x8 transpose, then pass 2, all in place. */
+  switch (tx_type) {
+    case DCT_DCT:
+      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case ADST_DCT:
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case DCT_ADST:
+      VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    case ADST_ADST:
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                         in0, in1, in2, in3, in4, in5, in6, in7);
+      VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,
+                in0, in1, in2, in3, in4, in5, in6, in7);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+  /* Transpose back, rescale via SRLI_AVE_S_4V_H, store 8 vectors to output. */
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
+}
diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h
new file mode 100644
index 0000000000..d7d40cb72c
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h
@@ -0,0 +1,117 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+#define VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+#include "vpx_dsp/mips/txfm_macros_msa.h"
+#include "vpx_ports/mem.h"
+
+#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,                   \
+                  out0, out1, out2, out3, out4, out5, out6, out7) {         \
+  v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                        \
+  v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                         \
+  v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64,      \
+                     cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 };  \
+  v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64,    \
+                     cospi_24_64, -cospi_24_64, 0, 0 };                     \
+  /* 8-point ADST on eight v8i16 vectors; in0..in7 are used as scratch. */  \
+  SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                           \
+  cnst2_m = -cnst0_m;                                                       \
+  ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);        \
+  SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                           \
+  cnst4_m = -cnst2_m;                                                       \
+  ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);        \
+  /* coeff0_m lanes 0/7 and 4/3 drive the in0/in7 and in4/in3 pairs. */     \
+  ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                    \
+  ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                    \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,            \
+                        cnst1_m, cnst2_m, cnst3_m, in7, in0,                \
+                        in4, in3);                                          \
+  /* Same again with coeff0_m lanes 2/5 and 6/1 on in2/in5 and in6/in1. */  \
+  SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                           \
+  cnst2_m = -cnst0_m;                                                       \
+  ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);        \
+  SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                           \
+  cnst4_m = -cnst2_m;                                                       \
+  ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);        \
+                                                                            \
+  ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                    \
+  ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                    \
+                                                                            \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,            \
+                        cnst1_m, cnst2_m, cnst3_m, in5, in2,                \
+                        in6, in1);                                          \
+  BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                    \
+  out7 = -s0_m;                                                             \
+  out0 = s1_m;                                                              \
+  /* coeff1_m supplies the constants for the remaining six outputs. */      \
+  SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m);   \
+                                                                            \
+  ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);        \
+  cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                \
+  cnst1_m = cnst0_m;                                                        \
+                                                                            \
+  ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                    \
+  ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                    \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,            \
+                        cnst2_m, cnst3_m, cnst1_m, out1, out6,              \
+                        s0_m, s1_m);                                        \
+                                                                            \
+  SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                           \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                \
+                                                                            \
+  ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                    \
+  ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                                  \
+  out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                    \
+  out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                    \
+  out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);                    \
+  out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);                    \
+  /* Odd-numbered outputs change sign (out7 was negated above). */          \
+  out1 = -out1;                                                             \
+  out3 = -out3;                                                             \
+  out5 = -out5;                                                             \
+}
+
+#define VP9_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v4i32 s0_m, s1_m, s2_m, s3_m, constant_m;                       \
+  v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m;                       \
+  /* 4-point forward ADST on four lanes of each input. */         \
+  UNPCK_R_SH_SW(in0, in0_r_m);                                    \
+  UNPCK_R_SH_SW(in1, in1_r_m);                                    \
+  UNPCK_R_SH_SW(in2, in2_r_m);                                    \
+  UNPCK_R_SH_SW(in3, in3_r_m);                                    \
+                                                                  \
+  constant_m = __msa_fill_w(sinpi_4_9);                           \
+  MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m);     \
+                                                                  \
+  constant_m = __msa_fill_w(sinpi_1_9);                           \
+  s0_m += in0_r_m * constant_m;                                   \
+  s1_m -= in1_r_m * constant_m;                                   \
+                                                                  \
+  constant_m = __msa_fill_w(sinpi_2_9);                           \
+  s0_m += in1_r_m * constant_m;                                   \
+  s1_m += in3_r_m * constant_m;                                   \
+                                                                  \
+  s2_m = in0_r_m + in1_r_m - in3_r_m;                             \
+                                                                  \
+  constant_m = __msa_fill_w(sinpi_3_9);                           \
+  MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m);     \
+  /* in1_r_m now holds (in0 + in1 - in3) * sinpi_3_9 (out1). */   \
+  in0_r_m = s0_m + s3_m;                                          \
+  s2_m = s1_m - s3_m;                                             \
+  s3_m = s1_m - s0_m + s3_m;                                      \
+  /* Round by DCT_CONST_BITS and narrow back to 16 bits. */       \
+  SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS);      \
+  PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m,     \
+              s3_m, s3_m, out0, out1, out2, out3);                \
+}
+#endif  /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */
diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
new file mode 100644
index 0000000000..363aabb7cb
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
@@ -0,0 +1,289 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
+                                            uint32_t stride,
+                                            uint8_t *frm2_ptr,
+                                            int32_t filt_sth,
+                                            int32_t filt_wgt,
+                                            uint32_t *acc,
+                                            uint16_t *cnt) {
+  uint32_t row;
+  uint64_t f0, f1, f2, f3;
+  v16i8 frm2, frm1 = { 0 };
+  v16i8 frm4, frm3 = { 0 };
+  v16u8 frm_r, frm_l;
+  v8i16 frm2_r, frm2_l;
+  v8i16 diff0, diff1, mod0_h, mod1_h;
+  v4i32 cnst3, cnst16, filt_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+  v4i32 acc0, acc1, acc2, acc3;
+  v8i16 cnt0, cnt1;
+  /* 8x8 block: two passes, each covering four 8-pixel rows. */
+  filt_wt = __msa_fill_w(filt_wgt);
+  strength = __msa_fill_w(filt_sth);
+  cnst3 = __msa_ldi_w(3);
+  cnst16 = __msa_ldi_w(16);
+
+  for (row = 2; row--;) {
+    LD4(frm1_ptr, stride, f0, f1, f2, f3);
+    frm1_ptr += (4 * stride);
+
+    LD_SB2(frm2_ptr, 16, frm2, frm4);
+    frm2_ptr += 32;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+    /* Rows one and two (frm1 vs frm2): mod = 3 * diff^2 >> strength. */
+    INSERT_D2_SB(f0, f1, frm1);
+    INSERT_D2_SB(f2, f3, frm3);
+    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+    /* Lane mask for mod < 16; the AND below zeroes every other lane. */
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+    /* Weight by filt_wgt; the packed 16-bit values are added to cnt. */
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+    /* acc += weighted modifier * frm2 pixel (frm2 widened to 32 bits). */
+    UNPCK_UB_SH(frm2, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+    /* Rows three and four (frm3 vs frm4): the same steps again. */
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
+         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+  }
+}
+
+/* 16x16 block, two 16-pixel rows per pass: mod = 3 * diff^2 >> filt_sth,
+ * mod = (mod < 16 ? 16 - mod : 0) * filt_wgt, then cnt += mod and
+ * acc += mod * frm2 pixel.  acc and cnt advance 16 entries per row. */
+static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride,
+                                             uint8_t *frm2_ptr,
+                                             int32_t filt_sth, int32_t filt_wgt,
+                                             uint32_t *acc, uint16_t *cnt) {
+  uint32_t row;
+  v16i8 frm1, frm2, frm3, frm4;
+  v16u8 frm_r, frm_l;
+  v16i8 zero = { 0 };
+  v8u16 frm2_r, frm2_l;
+  v8i16 diff0, diff1, mod0_h, mod1_h;
+  v4i32 cnst3, cnst16, filt_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+  v4i32 acc0, acc1, acc2, acc3;
+  v8i16 cnt0, cnt1;
+
+  filt_wt = __msa_fill_w(filt_wgt);
+  strength = __msa_fill_w(filt_sth);
+  cnst3 = __msa_ldi_w(3);
+  cnst16 = __msa_ldi_w(16);
+
+  for (row = 8; row--;) {
+    LD_SB2(frm1_ptr, stride, frm1, frm3);
+    frm1_ptr += stride;
+
+    LD_SB2(frm2_ptr, 16, frm2, frm4);
+    frm2_ptr += 16;
+    /* Current accumulators and counts for the first row (16 entries). */
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);  /* entries 8..15, stored back at acc + 8 */
+    LD_SH2(cnt, 8, cnt0, cnt1);
+    /* mod = 3 * diff^2 (frm1 vs frm2), shifted right by filt_sth (SRAR). */
+    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+    /* Keep 16 - mod where mod < 16; every other lane becomes 0. */
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+    /* Weight by filt_wgt; the packed 16-bit values are added to cnt. */
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+    /* acc += mod * frm2 pixel; frm2 bytes are interleaved with zero first. */
+    ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+    /* Second row: the same steps with frm3 against frm4. */
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+    /* Both pointers were stepped once above; finish the two-row advance. */
+    frm1_ptr += stride;
+    frm2_ptr += 16;
+  }
+}
+
+void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
+                                   uint8_t *frame2_ptr, uint32_t blk_w,
+                                   uint32_t blk_h, int32_t strength,
+                                   int32_t filt_wgt, uint32_t *accu,
+                                   uint16_t *cnt) {  /* picks a kernel by size */
+  if (8 == (blk_w * blk_h)) {  /* NOTE(review): area test; 8x8 gives 64 */
+    temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr,
+                                    strength, filt_wgt, accu, cnt);
+  } else if (16 == (blk_w * blk_h)) {  /* NOTE(review): 16x16 gives 256 */
+    temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr,
+                                     strength, filt_wgt, accu, cnt);
+  } else {  /* every other size goes to the C implementation */
+    vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
+                                strength, filt_wgt, accu, cnt);
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_aq_360.c b/libs/libvpx/vp9/encoder/vp9_aq_360.c
new file mode 100644
index 0000000000..f8c187cc5d
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_aq_360.c
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/encoder/vp9_aq_variance.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_segmentation.h"
+
+static const double rate_ratio[MAX_SEGMENTS] =  // rate ratio per segment id
+  {1.0, 0.75, 0.6, 0.5, 0.4, 0.3, 0.25};  // NOTE(review): 7 values; any further entries default to 0.0
+
+// Maps a row to its band by distance from the top/bottom edge: 2 within
+// mi_rows / 8 (polar), 1 within mi_rows / 4 (temperate), else 0 (equatorial).
+unsigned int vp9_360aq_segment_id(int mi_row, int mi_rows) {
+  const int polar_band = mi_rows / 8;
+  const int temperate_band = mi_rows / 4;
+
+  if (mi_row < polar_band || mi_row > mi_rows - polar_band) return 2;
+  if (mi_row < temperate_band || mi_row > mi_rows - temperate_band) return 1;
+  return 0;
+}
+
+// Builds the per-segment SEG_LVL_ALT_Q deltas from rate_ratio.  Only acts on
+// intra-only frames or when error resilient mode is on.
+void vp9_360aq_frame_setup(VP9_COMP *cpi) {
+  VP9_COMMON *const common = &cpi->common;
+  struct segmentation *const segs = &common->seg;
+  int seg_id;
+
+  if (!frame_is_intra_only(common) && !common->error_resilient_mode) return;
+
+  vp9_enable_segmentation(segs);
+  vp9_clearall_segfeatures(segs);
+  segs->abs_delta = SEGMENT_DELTADATA;
+
+  // System state is cleared before the double-precision ratios are used.
+  vpx_clear_system_state();
+
+  for (seg_id = 0; seg_id < MAX_SEGMENTS; ++seg_id) {
+    const double ratio = rate_ratio[seg_id];
+    int q_delta = vp9_compute_qdelta_by_rate(
+        &cpi->rc, common->frame_type, common->base_qindex, ratio,
+        common->bit_depth);
+
+    // A non-zero base qindex must not land on 0: qindex 0 means lossless
+    // (4x4 only), and in AQ mode a segment delta can be applied without
+    // another trip round the rd loop, which could yield an illegal mix of
+    // partition size and q.
+    if (common->base_qindex != 0 && common->base_qindex + q_delta == 0)
+      q_delta = 1 - common->base_qindex;
+
+    // A ratio of exactly 1.0 needs no SEG_LVL_ALT_Q entry.
+    if (ratio != 1.0) {
+      vp9_set_segdata(segs, seg_id, SEG_LVL_ALT_Q, q_delta);
+      vp9_enable_segfeature(segs, seg_id, SEG_LVL_ALT_Q);
+    }
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_aq_360.h b/libs/libvpx/vp9/encoder/vp9_aq_360.h
new file mode 100644
index 0000000000..fb861cb052
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_aq_360.h
@@ -0,0 +1,28 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_AQ_360_H_
+#define VP9_ENCODER_VP9_AQ_360_H_
+
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Segment id for mi_row out of mi_rows: 0 equatorial, 1 temperate, 2 polar.
+unsigned int vp9_360aq_segment_id(int mi_row, int mi_rows);
+void vp9_360aq_frame_setup(VP9_COMP *cpi);  // sets per-segment Q deltas
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_AQ_360_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_aq_complexity.c b/libs/libvpx/vp9/encoder/vp9_aq_complexity.c
new file mode 100644
index 0000000000..2d979ec70b
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_aq_complexity.c
@@ -0,0 +1,161 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/encoder/vp9_aq_complexity.h"
+#include "vp9/encoder/vp9_aq_variance.h"
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/encoder/vp9_segmentation.h"
+
+#define AQ_C_SEGMENTS  5    // Columns in each table below
+#define DEFAULT_AQ2_SEG 3   // Neutral Q segment
+#define AQ_C_STRENGTHS 3    // Rows in each table below (see get_aq_c_strength)
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+  { {1.75, 1.25, 1.05, 1.00, 0.90},
+    {2.00, 1.50, 1.15, 1.00, 0.85},
+    {2.50, 1.75, 1.25, 1.00, 0.80} };  // Per-segment rate ratio for the Q delta
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+  { {0.15, 0.30, 0.55, 2.00, 100.0},
+    {0.20, 0.40, 0.65, 2.00, 100.0},
+    {0.25, 0.50, 0.75, 2.00, 100.0} };  // Multiples of the block target rate
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+  { {-4.0, -3.0, -2.0, 100.00, 100.0},
+    {-3.5, -2.5, -1.5, 100.00, 100.0},
+    {-3.0, -2.0, -1.0, 100.00, 100.0} };  // Offsets added to low_var_thresh
+
+static int get_aq_c_strength(int q_index, vpx_bit_depth_t bit_depth) {
+  // Approximate base quantizer (truncated to int)
+  const int base_quant = vp9_ac_quant(q_index, 0, bit_depth) / 4;
+  return (base_quant > 10) + (base_quant > 25);  // 0, 1 or 2: table row index
+}
+
+void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {  // Complexity-AQ segment setup.
+  VP9_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
+
+  // Make SURE use of floating point in this function is safe.
+  vpx_clear_system_state();
+
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+      cpi->refresh_alt_ref_frame ||
+      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+    int segment;
+    const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+
+    // Reset every entry of the segment map to the neutral segment.
+    memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols);
+
+    vp9_clearall_segfeatures(seg);
+
+    // Segmentation only makes sense if the target bits per SB is above a
+    // threshold. Below this the overheads will usually outweigh any benefit.
+    if (cpi->rc.sb64_target_rate < 256) {
+      vp9_disable_segmentation(seg);
+      return;
+    }
+
+    vp9_enable_segmentation(seg);
+
+    // Select delta coding method.
+    seg->abs_delta = SEGMENT_DELTADATA;
+
+    // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+    vp9_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
+
+    // Use some of the segments for in frame Q adjustment.
+    for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+      int qindex_delta;
+
+      if (segment == DEFAULT_AQ2_SEG)
+        continue;
+
+      qindex_delta =
+        vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+                                   aq_c_q_adj_factor[aq_strength][segment],
+                                   cm->bit_depth);
+
+
+      // For AQ complexity mode, we don't allow Q0 in a segment if the base
+      // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
+      // Q delta is sometimes applied without going back around the rd loop.
+      // This could lead to an illegal combination of partition size and q.
+      if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -cm->base_qindex + 1;  // Resulting segment qindex is 1.
+      }
+      if ((cm->base_qindex + qindex_delta) > 0) {
+        vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+        vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+      }
+    }
+  }
+}
+
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+// Select a segment for the current block.
+// The choice of segment for a block depends on the ratio of the projected
+// bits for the block vs a target average and its spatial complexity.
+void vp9_caq_select_segment(VP9_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+                            int mi_row, int mi_col, int projected_rate) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+  const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+  const int xmis = VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
+  const int ymis = VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
+  int x, y;
+  int i;
+  unsigned char segment;
+
+  if (0) {  // Dead branch: the condition is the constant 0.
+    segment = DEFAULT_AQ2_SEG;
+  } else {
+    // Rate depends on fraction of a SB64 in frame: (xmis * ymis) / (bw * bh).
+    // It is converted to bits * 256 units.
+    const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
+                            (bw * bh);
+    double logvar;
+    double low_var_thresh;
+    const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+
+    vpx_clear_system_state();
+    low_var_thresh = (cpi->oxcf.pass == 2)
+      ? VPXMAX(cpi->twopass.mb_av_energy, MIN_DEFAULT_LV_THRESH)
+      : DEFAULT_LV_THRESH;
+
+    vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
+    logvar = vp9_log_block_var(cpi, mb, bs);
+
+    segment = AQ_C_SEGMENTS - 1;    // Just in case no break out below.
+    for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+      // Test rate against a threshold value and variance against a threshold.
+      // Increasing segment number (higher variance and complexity) = higher Q.
+      if ((projected_rate <
+           target_rate * aq_c_transitions[aq_strength][i]) &&
+          (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+        segment = i;  // First segment whose two thresholds both hold wins.
+        break;
+      }
+    }
+  }
+
+  // Fill in the entries in the segment map covered by this block (in frame).
+  for (y = 0; y < ymis; y++) {
+    for (x = 0; x < xmis; x++) {
+      cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
+    }
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_aq_complexity.h b/libs/libvpx/vp9/encoder/vp9_aq_complexity.h
new file mode 100644
index 0000000000..e9acb1ca50
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_aq_complexity.h
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
+#define VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp9/common/vp9_enums.h"
+
+struct VP9_COMP;
+struct macroblock;
+
+// Select a segment for the current block and store it in the segment map.
+void vp9_caq_select_segment(struct VP9_COMP *cpi, struct macroblock *,
+                            BLOCK_SIZE bs,
+                            int mi_row, int mi_col, int projected_rate);
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+void vp9_setup_in_frame_q_adj(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
new file mode 100644
index 0000000000..b7cfdf6bf4
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -0,0 +1,612 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_segmentation.h"
+
+// Allocates a zeroed CYCLIC_REFRESH plus its three mi_rows * mi_cols maps.
+// Returns NULL on any allocation failure, releasing whatever was allocated.
+CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+  size_t last_coded_q_map_size, consec_zero_mv_size;
+  CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
+  if (cr == NULL) return NULL;
+
+  cr->map = vpx_calloc(mi_rows * mi_cols, sizeof(*cr->map));
+  if (cr->map == NULL) {
+    vp9_cyclic_refresh_free(cr);  // cr is zeroed: unset members are NULL.
+    return NULL;
+  }
+  last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map);
+  cr->last_coded_q_map = vpx_malloc(last_coded_q_map_size);
+  if (cr->last_coded_q_map == NULL) {
+    vp9_cyclic_refresh_free(cr);  // Also releases cr->map (was leaked).
+    return NULL;
+  }
+  assert(MAXQ <= 255);  // MAXQ must fit in the byte value memset() writes.
+  memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
+
+  consec_zero_mv_size = mi_rows * mi_cols * sizeof(*cr->consec_zero_mv);
+  cr->consec_zero_mv = vpx_malloc(consec_zero_mv_size);
+  if (cr->consec_zero_mv == NULL) {
+    vp9_cyclic_refresh_free(cr);  // Also releases both maps (were leaked).
+    return NULL;
+  }
+  memset(cr->consec_zero_mv, 0, consec_zero_mv_size);
+  return cr;
+}
+
+void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) {  // cr must not be NULL.
+  vpx_free(cr->map);
+  vpx_free(cr->last_coded_q_map);
+  vpx_free(cr->consec_zero_mv);
+  vpx_free(cr);  // Members first, then the struct that owns them.
+}
+
+// Check if this coding block, of size bsize, should be considered for refresh
+// (lower-qp coding). Decision can be based on various factors, such as
+// size of the coding block (i.e., below min_block size rejected), coding
+// mode, and rate/distortion. Returns one of the CR_SEGMENT_ID_* values.
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
+                                const MODE_INFO *mi,
+                                int64_t rate,
+                                int64_t dist,
+                                int bsize) {
+  MV mv = mi->mv[0].as_mv;
+  // Reject the block for lower-qp coding if projected distortion
+  // is above the threshold, and any of the following is true:
+  // 1) mode uses large mv
+  // 2) mode is an intra-mode
+  // Otherwise accept for refresh.
+  if (dist > cr->thresh_dist_sb &&
+      (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+       mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+       !is_inter_block(mi)))
+    return CR_SEGMENT_ID_BASE;
+  else  if (bsize >= BLOCK_16X16 &&
+            rate < cr->thresh_rate_sb &&
+            is_inter_block(mi) &&
+            mi->mv[0].as_int == 0 &&
+            cr->rate_boost_fac > 10)
+    // More aggressive delta-q for bigger blocks with zero motion.
+    return CR_SEGMENT_ID_BOOST2;
+  else
+    return CR_SEGMENT_ID_BOOST1;
+}
+
+// Compute the segment delta-q at qindex q, capped at max_qdelta_perc% of q.
+static int compute_deltaq(const VP9_COMP *cpi, int q, double rate_factor) {
+  const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int deltaq = vp9_compute_qdelta_by_rate(rc, cpi->common.frame_type,
+                                          q, rate_factor,
+                                          cpi->common.bit_depth);
+  if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
+    deltaq = -cr->max_qdelta_perc * q / 100;  // Limit the q reduction.
+  }
+  return deltaq;
+}
+
+// For the just encoded frame, estimate the bits, incorporating the delta-q
+// from the non-base segments, weighted by the fraction of blocks in each.
+// Note this function is called in the postencode
+// (called from rc_update_rate_correction_factors()).
+int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi,
+                                          double correction_factor) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  int estimated_bits;
+  int mbs = cm->MBs;
+  int num8x8bl = mbs << 2;  // Four 8x8 blocks per macroblock.
+  // Weight for non-base segments: use actual number of blocks refreshed in
+  // previous/just encoded frame. Note number of blocks here is in 8x8 units.
+  double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl;
+  double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl;
+  // Take segment weighted average for estimated bits.
+  estimated_bits = (int)((1.0 - weight_segment1 - weight_segment2) *
+      vp9_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
+                             correction_factor, cm->bit_depth) +
+                             weight_segment1 *
+      vp9_estimate_bits_at_q(cm->frame_type,
+                             cm->base_qindex + cr->qindex_delta[1], mbs,
+                             correction_factor, cm->bit_depth) +
+                             weight_segment2 *
+      vp9_estimate_bits_at_q(cm->frame_type,
+                             cm->base_qindex + cr->qindex_delta[2], mbs,
+                             correction_factor, cm->bit_depth));
+  return estimated_bits;
+}
+
+// Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+// a corresponding delta-q (for segment 1). This function is called in the
+// rc_regulate_q() to set the base qp index.
+// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or
+// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding.
+int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i,
+                                      double correction_factor) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  int bits_per_mb;
+  int num8x8bl = cm->MBs << 2;  // Four 8x8 blocks per macroblock.
+  // Weight for segment prior to encoding: take the average of the target
+  // number for the frame to be encoded and the actual from the previous frame.
+  int target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+  double weight_segment = (double)((target_refresh +
+      cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) >> 1) /
+      num8x8bl;
+  // Compute delta-q corresponding to qindex i.
+  int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+  // Take segment weighted average for bits per mb.
+  bits_per_mb = (int)((1.0 - weight_segment) *
+      vp9_rc_bits_per_mb(cm->frame_type, i, correction_factor, cm->bit_depth) +
+      weight_segment *
+      vp9_rc_bits_per_mb(cm->frame_type, i + deltaq, correction_factor,
+                         cm->bit_depth));
+  return bits_per_mb;
+}
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
+                                       MODE_INFO *const mi,
+                                       int mi_row, int mi_col,
+                                       BLOCK_SIZE bsize,
+                                       int64_t rate,
+                                       int64_t dist,
+                                       int skip,
+                                       struct macroblock_plane *const p) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+  const int block_index = mi_row * cm->mi_cols + mi_col;
+  int refresh_this_block = candidate_refresh_aq(cr, mi, rate, dist, bsize);
+  // Default is to not update the refresh map.
+  int new_map_value = cr->map[block_index];
+  int x = 0; int y = 0;
+
+  int is_skin = 0;
+  if (refresh_this_block == 0 &&  // i.e. CR_SEGMENT_ID_BASE
+      bsize <= BLOCK_16X16 &&
+      cpi->use_skin_detection) {
+    is_skin = vp9_compute_skin_block(p[0].src.buf,
+                                     p[1].src.buf,
+                                     p[2].src.buf,
+                                     p[0].src.stride,
+                                     p[1].src.stride,
+                                     bsize);
+    if (is_skin)
+      refresh_this_block = 1;  // i.e. CR_SEGMENT_ID_BOOST1
+  }
+
+  // If this block is labeled for refresh, check if we should reset the
+  // segment_id.
+  if (cyclic_refresh_segment_id_boosted(mi->segment_id)) {
+    mi->segment_id = refresh_this_block;
+    // Reset segment_id if it will be skipped.
+    if (skip)
+      mi->segment_id = CR_SEGMENT_ID_BASE;
+  }
+
+  // Update the cyclic refresh map, to be used for setting segmentation map
+  // for the next frame. If the block will be refreshed this frame, mark it
+  // as clean. The magnitude of the -ve influences how long before we consider
+  // it for refresh again.
+  if (cyclic_refresh_segment_id_boosted(mi->segment_id)) {
+    new_map_value = -cr->time_for_refresh;
+  } else if (refresh_this_block) {
+    // Else if it is accepted as candidate for refresh, and has not already
+    // been refreshed (marked as 1) then mark it as a candidate for cleanup
+    // for future time (marked as 0), otherwise don't update it.
+    if (cr->map[block_index] == 1)
+      new_map_value = 0;
+  } else {
+    // Leave it marked as block that is not candidate for refresh.
+    new_map_value = 1;
+  }
+
+  // Update entries in the cyclic refresh map with new_map_value, and
+  // copy mi->segment_id into global segmentation map.
+  for (y = 0; y < ymis; y++)
+    for (x = 0; x < xmis; x++) {
+      int map_offset = block_index + y * cm->mi_cols + x;
+      cr->map[map_offset] = new_map_value;
+      cpi->segmentation_map[map_offset] = mi->segment_id;
+    }
+}
+
+// After a block is encoded: record the q it was coded at and its low-mv run.
+void vp9_cyclic_refresh_update_sb_postencode(
+    VP9_COMP *const cpi, const MODE_INFO *const mi, int mi_row, int mi_col,
+    BLOCK_SIZE bsize) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  MV mv = mi->mv[0].as_mv;
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+  const int block_index = mi_row * cm->mi_cols + mi_col;
+  int x, y;
+  for (y = 0; y < ymis; y++)
+    for (x = 0; x < xmis; x++) {
+      int map_offset = block_index + y * cm->mi_cols + x;
+      // Inter skip blocks were clearly not coded at the current qindex, so
+      // only lower (never raise) the map value for them. For cases where
+      // motion is non-zero or the reference frame isn't the previous frame,
+      // the previous value for this spatial location is not entirely correct.
+      if ((!is_inter_block(mi) || !mi->skip) &&
+          mi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+        cr->last_coded_q_map[map_offset] = clamp(
+            cm->base_qindex + cr->qindex_delta[mi->segment_id], 0, MAXQ);
+      } else if (is_inter_block(mi) && mi->skip &&
+                 mi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+        cr->last_coded_q_map[map_offset] = VPXMIN(
+            clamp(cm->base_qindex + cr->qindex_delta[mi->segment_id],
+                  0, MAXQ),
+            cr->last_coded_q_map[map_offset]);
+      }
+      // Update the consecutive zero/low_mv count (for every block).
+      if (is_inter_block(mi) && (abs(mv.row) < 8 && abs(mv.col) < 8)) {
+        if (cr->consec_zero_mv[map_offset] < 255)
+          cr->consec_zero_mv[map_offset]++;
+      } else {
+        cr->consec_zero_mv[map_offset] = 0;
+      }
+    }
+}
+
+// Update the actual number of blocks to which each segment delta-q was applied.
+void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  unsigned char *const seg_map = cpi->segmentation_map;
+  int mi_row, mi_col;
+  cr->actual_num_seg1_blocks = 0;  // Both counters are rebuilt from scratch.
+  cr->actual_num_seg2_blocks = 0;
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row++)
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+      if (cyclic_refresh_segment_id(
+          seg_map[mi_row * cm->mi_cols + mi_col]) == CR_SEGMENT_ID_BOOST1)
+        cr->actual_num_seg1_blocks++;
+      else if (cyclic_refresh_segment_id(
+          seg_map[mi_row * cm->mi_cols + mi_col]) == CR_SEGMENT_ID_BOOST2)
+        cr->actual_num_seg2_blocks++;
+    }
+}
+
+// Set golden frame update interval, for non-svc 1 pass CBR mode.
+void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  // Set minimum gf_interval for GF update to a multiple of the refresh period,
+  // with some max limit. Depending on past encoding stats, GF flag may be
+  // reset and update may not occur until next baseline_gf_interval.
+  if (cr->percent_refresh > 0)
+    rc->baseline_gf_interval = VPXMIN(4 * (100 / cr->percent_refresh), 40);
+  else
+    rc->baseline_gf_interval = 40;  // Same value as the cap used above.
+}
+
+// Update some encoding stats (from the just encoded frame). If this frame's
+// background has high motion, refresh the golden frame. Otherwise, if the
+// golden reference is to be updated check if we should NOT update the golden
+// ref.
+void vp9_cyclic_refresh_check_golden_update(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  int mi_row, mi_col;
+  double fraction_low = 0.0;
+  int low_content_frame = 0;
+
+  MODE_INFO **mi = cm->mi_grid_visible;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int rows = cm->mi_rows, cols = cm->mi_cols;
+  int cnt1 = 0, cnt2 = 0;  // cnt1: |mv| <= 16 blocks; cnt2: zero-mv blocks.
+  int force_gf_refresh = 0;
+
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0 ?
+          mi[0]->mv[0].as_mv.row : -1 * mi[0]->mv[0].as_mv.row;
+      int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0 ?
+          mi[0]->mv[0].as_mv.col : -1 * mi[0]->mv[0].as_mv.col;
+
+      // Calculate the motion of the background.
+      if (abs_mvr <= 16 && abs_mvc <= 16) {
+        cnt1++;
+        if (abs_mvr == 0 && abs_mvc == 0)
+          cnt2++;
+      }
+      mi++;
+
+      // Accumulate low_content_frame.
+      if (cr->map[mi_row * cols + mi_col] < 1)
+        low_content_frame++;
+    }
+    mi += 8;  // NOTE(review): presumably skips row padding of the mi grid.
+  }
+
+  // For video conference clips, if the background has high motion in current
+  // frame because of the camera movement, set this frame as the golden frame.
+  // Use 70% and 5% as the thresholds for golden frame refreshing.
+  // Also, force this frame as a golden update frame if this frame will change
+  // the resolution (resize_pending != 0).
+  if (cpi->resize_pending != 0 ||
+     (cnt1 * 100 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
+    vp9_cyclic_refresh_set_golden_update(cpi);
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+    if (rc->frames_till_gf_update_due > rc->frames_to_key)
+      rc->frames_till_gf_update_due = rc->frames_to_key;
+    cpi->refresh_golden_frame = 1;
+    force_gf_refresh = 1;
+  }
+
+  fraction_low =
+      (double)low_content_frame / (rows * cols);
+  // Update average.
+  cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
+  if (!force_gf_refresh && cpi->refresh_golden_frame == 1) {
+    // Don't update golden reference if the amount of low_content for the
+    // current encoded frame is small, or if the recursive average of the
+    // low_content over the update interval window falls below threshold.
+    if (fraction_low < 0.8 || cr->low_content_avg < 0.7)
+      cpi->refresh_golden_frame = 0;
+    // Reset for next interval.
+    cr->low_content_avg = fraction_low;
+  }
+}
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
+static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  unsigned char *const seg_map = cpi->segmentation_map;
+  int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+  int xmis, ymis, x, y;
+  int consec_zero_mv_thresh = 0;
+  int qindex_thresh = 0;
+  int count_sel = 0;
+  int count_tot = 0;
+  memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
+  sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+  sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+  sbs_in_frame = sb_cols * sb_rows;
+  // Number of target blocks to get the q delta (segment 1).
+  block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+  // Set the segmentation map: cycle through the superblocks, starting at
+  // cr->sb_index, and stopping when either block_count blocks have been found
+  // to be refreshed, or we have passed through whole frame.
+  assert(cr->sb_index < sbs_in_frame);
+  i = cr->sb_index;
+  cr->target_num_seg_blocks = 0;
+  if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
+    consec_zero_mv_thresh = 100;
+   if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium)
+     consec_zero_mv_thresh = 80;
+  }
+  qindex_thresh =
+      cpi->oxcf.content == VP9E_CONTENT_SCREEN
+      ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
+      : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex);
+  do {
+    int sum_map = 0;
+    // Get the mi_row/mi_col corresponding to superblock index i.
+    int sb_row_index = (i / sb_cols);
+    int sb_col_index = i - sb_row_index * sb_cols;
+    int mi_row = sb_row_index * MI_BLOCK_SIZE;
+    int mi_col = sb_col_index * MI_BLOCK_SIZE;
+    assert(mi_row >= 0 && mi_row < cm->mi_rows);
+    assert(mi_col >= 0 && mi_col < cm->mi_cols);
+    bl_index = mi_row * cm->mi_cols + mi_col;
+    // Loop through all 8x8 blocks in superblock and update map.
+    xmis =
+        VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]);
+    ymis =
+        VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]);
+    for (y = 0; y < ymis; y++) {
+      for (x = 0; x < xmis; x++) {
+        const int bl_index2 = bl_index + y * cm->mi_cols + x;
+        // If the block is a candidate for clean up then mark it
+        // for possible boost/refresh (segment 1). The segment id may get
+        // reset to 0 later if block gets coded anything other than ZEROMV.
+        if (cr->map[bl_index2] == 0) {
+          count_tot++;
+          if (cr->last_coded_q_map[bl_index2] > qindex_thresh ||
+              cr->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) {
+            sum_map++;
+            count_sel++;
+          }
+        } else if (cr->map[bl_index2] < 0) {
+          cr->map[bl_index2]++;  // Negative entries count up toward 0.
+        }
+      }
+    }
+    // Enforce constant segment over superblock.
+    // If segment is at least half of superblock, set to 1.
+    if (sum_map >= xmis * ymis / 2) {
+      for (y = 0; y < ymis; y++)
+        for (x = 0; x < xmis; x++) {
+          seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
+        }
+      cr->target_num_seg_blocks += xmis * ymis;
+    }
+    i++;
+    if (i == sbs_in_frame) {
+      i = 0;  // Wrap around to the first superblock.
+    }
+  } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+  cr->sb_index = i;  // The next call resumes from here.
+  cr->reduce_refresh = 0;
+  if (count_sel < (3 * count_tot) >> 2)  // Fewer than 3/4 were selected.
+    cr->reduce_refresh = 1;
+}
+
+// Set cyclic refresh parameters.
+void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  cr->percent_refresh = 10;
+  if (cr->reduce_refresh)
+    cr->percent_refresh = 5;
+  cr->max_qdelta_perc = 50;
+  cr->time_for_refresh = 0;
+  // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
+  // periods of the refresh cycle, after a key frame.
+  // Account for larger interval on base layer for temporal layers.
+  if (cr->percent_refresh > 0 &&
+      rc->frames_since_key <  (4 * cpi->svc.number_temporal_layers) *
+      (100 / cr->percent_refresh)) {
+    cr->rate_ratio_qdelta = 3.0;
+  } else {
+    cr->rate_ratio_qdelta = 2.0;  // The noise check below is inside this else.
+  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium)
+    // Reduce the delta-qp if the estimated source noise is above threshold.
+    cr->rate_ratio_qdelta = 1.5;
+  }
+  // Adjust some parameters for low resolutions at low bitrates.
+  if (cm->width <= 352 &&
+      cm->height <= 288 &&
+      rc->avg_frame_bandwidth < 3400) {
+    cr->motion_thresh = 4;
+    cr->rate_boost_fac = 10;
+  } else {
+    cr->motion_thresh = 32;
+    cr->rate_boost_fac = 15;
+  }
+  if (cpi->svc.spatial_layer_id > 0) {  // Overrides the choice made above.
+    cr->motion_thresh = 4;
+    cr->rate_boost_fac = 12;
+  }
+}
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  struct segmentation *const seg = &cm->seg;
+  // TODO(marpan): Look into whether we should reduce the amount/delta-qp
+  // instead of completely shutting off at low bitrates. For now keep it on.
+  // const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
+  const int apply_cyclic_refresh = 1;
+  if (cm->current_video_frame == 0)
+    cr->low_content_avg = 0.0;
+  // Don't apply refresh on key frame or temporal enhancement layer frames.
+  if (!apply_cyclic_refresh ||
+      (cm->frame_type == KEY_FRAME) ||
+      (cpi->svc.temporal_layer_id > 0)) {
+    // Set segmentation map to 0 and disable.
+    unsigned char *const seg_map = cpi->segmentation_map;
+    memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+    vp9_disable_segmentation(&cm->seg);
+    if (cm->frame_type == KEY_FRAME) {
+      memset(cr->last_coded_q_map, MAXQ,
+             cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
+      memset(cr->consec_zero_mv, 0,
+             cm->mi_rows * cm->mi_cols * sizeof(*cr->consec_zero_mv));
+      cr->sb_index = 0;
+    }
+    return;
+  } else {
+    int qindex_delta = 0;
+    int qindex2;
+    const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
+    vpx_clear_system_state();
+    // Set rate threshold to some multiple (currently 4, via << 2) of the target
+    // rate (target is given by sb64_target_rate and scaled by 256).
+    cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
+    // Distortion threshold, quadratic in Q, scale factor to be adjusted.
+    // q will not exceed 457, so (q * q) is within 32bit; see:
+    // vp9_convert_qindex_to_q(), vp9_ac_quant(), ac_qlookup*[].
+    cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
+
+    // Set up segmentation.
+    // Enable segmentation and clear all segment features.
+    vp9_enable_segmentation(&cm->seg);
+    vp9_clearall_segfeatures(seg);
+    // Select delta coding method.
+    seg->abs_delta = SEGMENT_DELTADATA;
+
+    // Note: setting temporal_update has no effect, as the seg-map coding method
+    // (temporal or spatial) is determined in vp9_choose_segmap_coding_method(),
+    // based on the coding cost of each method. For error_resilient mode on the
+    // last_frame_seg_map is set to 0, so if temporal coding is used, it is
+    // relative to 0 previous map.
+    // seg->temporal_update = 0;
+
+    // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+    vp9_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+    // Use segment BOOST1 for in-frame Q adjustment.
+    vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+    // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+    vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
+
+    // Set the q delta for segment BOOST1.
+    qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
+    cr->qindex_delta[1] = qindex_delta;
+
+    // Compute rd-mult for segment BOOST1.
+    qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
+
+    cr->rdmult = vp9_compute_rd_mult(cpi, qindex2);
+
+    vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
+
+    // Set a more aggressive (higher) q delta for segment BOOST2.
+    qindex_delta = compute_deltaq(
+        cpi, cm->base_qindex,
+        VPXMIN(CR_MAX_RATE_TARGET_RATIO,
+               0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
+    cr->qindex_delta[2] = qindex_delta;
+    vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+    // Reset if resolution change has occurred.
+    if (cpi->resize_pending != 0)
+      vp9_cyclic_refresh_reset_resize(cpi);
+
+    // Update the segmentation and refresh map.
+    cyclic_refresh_update_map(cpi);
+  }
+}
+
+int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+  return cr->rdmult;  // Value computed in vp9_cyclic_refresh_setup().
+}
+
+void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
+  memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols);
+  memset(cr->consec_zero_mv, 0, cm->mi_rows * cm->mi_cols);  // assumes 1-byte entries - TODO confirm
+  cr->sb_index = 0;  // Restart the refresh cycle at the first superblock.
+  cpi->refresh_golden_frame = 1;
+  cpi->refresh_alt_ref_frame = 1;
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
new file mode 100644
index 0000000000..095b9283f9
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -0,0 +1,147 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
+#define VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE    0
+#define CR_SEGMENT_ID_BOOST1  1
+#define CR_SEGMENT_ID_BOOST2  2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+struct CYCLIC_REFRESH {
+  // Percentage of blocks per frame that are targeted as candidates
+  // for cyclic refresh.
+  int percent_refresh;
+  // Maximum q-delta as percentage of base q.
+  int max_qdelta_perc;
+  // Superblock starting index for cycling through the frame.
+  int sb_index;
+  // Controls how long block will need to wait to be refreshed again, in
+  // excess of the cycle time, i.e., in the case of all zero motion, block
+  // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+  int time_for_refresh;
+  // Target number of (8x8) blocks that are set for delta-q.
+  int target_num_seg_blocks;
+  // Actual number of (8x8) blocks that were applied delta-q.
+  int actual_num_seg1_blocks;
+  int actual_num_seg2_blocks;
+  // RD mult. parameters for segment 1.
+  int rdmult;
+  // Cyclic refresh map.
+  signed char *map;
+  // Map of the last q a block was coded at.
+  uint8_t *last_coded_q_map;
+  // Count on how many consecutive times a block uses ZEROMV for encoding.
+  uint8_t *consec_zero_mv;
+  // Thresholds applied to the projected rate/distortion of the coding block,
+  // when deciding whether block should be refreshed.
+  int64_t thresh_rate_sb;
+  int64_t thresh_dist_sb;
+  // Threshold applied to the motion vector (in units of 1/8 pel) of the
+  // coding block, when deciding whether block should be refreshed.
+  int16_t motion_thresh;
+  // Rate target ratio to set q delta.
+  double rate_ratio_qdelta;
+  // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+  int rate_boost_fac;
+  double low_content_avg;
+  int qindex_delta[3];  // [1] and [2]: q-index deltas set for BOOST1 / BOOST2.
+  int reduce_refresh;
+};
+
+struct VP9_COMP;
+
+typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
+
+CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols);
+
+void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+
+// Estimate the bits, incorporating the delta-q from segment 1, after encoding
+// the frame.
+int vp9_cyclic_refresh_estimate_bits_at_q(const struct VP9_COMP *cpi,
+                                          double correction_factor);
+
+// Estimate the bits per mb, for a given q = i and a corresponding delta-q
+// (for segment 1), prior to encoding the frame.
+int vp9_cyclic_refresh_rc_bits_per_mb(const struct VP9_COMP *cpi, int i,
+                                      double correction_factor);
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi,
+                                       MODE_INFO *const mi,
+                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                       int64_t rate, int64_t dist, int skip,
+                                       struct macroblock_plane *const p);
+
+void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi,
+                                             const MODE_INFO *const mi,
+                                             int mi_row, int mi_col,
+                                             BLOCK_SIZE bsize);
+
+// Update the segmentation map, cyclic refresh map, refresh sb_index and target
+// block count. NOTE(review): "__" in name; caller uses cyclic_refresh_update_map.
+void vp9_cyclic_refresh_update__map(struct VP9_COMP *const cpi);
+
+// Update the actual number of blocks that were applied the segment delta q.
+void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi);
+
+// Set golden frame update interval, for non-svc 1 pass CBR mode.
+void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi);
+
+// Check if we should not update golden reference, based on past refresh stats.
+void vp9_cyclic_refresh_check_golden_update(struct VP9_COMP *const cpi);
+
+// Set/update global/frame level refresh parameters.
+void vp9_cyclic_refresh_update_parameters(struct VP9_COMP *const cpi);
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void vp9_cyclic_refresh_setup(struct VP9_COMP *const cpi);
+
+int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+void vp9_cyclic_refresh_reset_resize(struct VP9_COMP *const cpi);
+
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+  const int is_boost1 = (segment_id == CR_SEGMENT_ID_BOOST1);
+  return is_boost1 || segment_id == CR_SEGMENT_ID_BOOST2;  // 1 iff boosted
+}
+
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+  // Any id other than BOOST1/BOOST2 maps to the base (no boost) segment.
+  switch (segment_id) {
+    case CR_SEGMENT_ID_BOOST1: return CR_SEGMENT_ID_BOOST1;
+    case CR_SEGMENT_ID_BOOST2: return CR_SEGMENT_ID_BOOST2;
+    default: return CR_SEGMENT_ID_BASE;
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_aq_variance.c b/libs/libvpx/vp9/encoder/vp9_aq_variance.c
new file mode 100644
index 0000000000..d8f7d07213
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_aq_variance.c
@@ -0,0 +1,207 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/encoder/vp9_aq_variance.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_segmentation.h"
+
+#define ENERGY_MIN (-4)
+#define ENERGY_MAX (1)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN +  1)
+#define ENERGY_IN_BOUNDS(energy)\
+  assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+static const double rate_ratio[MAX_SEGMENTS] =
+  {2.5, 2.0, 1.5, 1.0, 0.75, 1.0, 1.0, 1.0};
+static const int segment_id[ENERGY_SPAN] = {0, 1, 1, 2, 3, 4};
+
+#define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
+
+DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0};
+#if CONFIG_VP9_HIGHBITDEPTH
+DECLARE_ALIGNED(16, static const uint16_t, vp9_highbd_64_zeros[64]) = {0};
+#endif
+
+unsigned int vp9_vaq_segment_id(int energy) {
+  ENERGY_IN_BOUNDS(energy);  // asserts ENERGY_MIN <= energy <= ENERGY_MAX
+  return segment_id[energy - ENERGY_MIN];
+}
+
+void vp9_vaq_frame_setup(VP9_COMP *cpi) {
+  VP9_COMMON *const common = &cpi->common;
+  struct segmentation *const seg = &common->seg;
+  int seg_idx;
+
+  // Segment data is only (re)built on intra-only, error-resilient, alt-ref
+  // refresh, or golden refresh frames whose source is not an alt-ref.
+  if (!(frame_is_intra_only(common) || common->error_resilient_mode ||
+        cpi->refresh_alt_ref_frame ||
+        (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)))
+    return;
+
+  vp9_enable_segmentation(seg);
+  vp9_clearall_segfeatures(seg);
+
+  seg->abs_delta = SEGMENT_DELTADATA;
+
+  vpx_clear_system_state();
+
+  for (seg_idx = 0; seg_idx < MAX_SEGMENTS; ++seg_idx) {
+    const double ratio = rate_ratio[seg_idx];
+    int delta_q = vp9_compute_qdelta_by_rate(
+        &cpi->rc, common->frame_type, common->base_qindex, ratio,
+        common->bit_depth);
+
+    // A segment must not land on qindex 0 (lossless, 4x4 only) unless the
+    // base qindex is itself 0: the delta may be applied without redoing the
+    // rd loop, which could pair an illegal partition size with that q.
+    if (common->base_qindex != 0 && common->base_qindex + delta_q == 0)
+      delta_q = 1 - common->base_qindex;
+
+    // A rate ratio of exactly 1.0 needs no SEG_LVL_ALT_Q feature.
+    if (ratio != 1.0) {
+      vp9_set_segdata(seg, seg_idx, SEG_LVL_ALT_Q, delta_q);
+      vp9_enable_segfeature(seg, seg_idx, SEG_LVL_ALT_Q);
+    }
+  }
+}
+
+/* TODO(agrange, paulwilkins): The block_variance calls the unoptimized versions
+ * of variance() and highbd_8_variance(). It should not.
+ */
+static void aq_variance(const uint8_t *a, int  a_stride,
+                        const uint8_t *b, int  b_stride,
+                        int  w, int  h, unsigned int *sse, int *sum) {
+  // Accumulates sum(a - b) and sum((a - b)^2) over a w x h pixel window.
+  int row, col;
+  unsigned int sq_acc = 0;
+  int diff_acc = 0;
+
+  for (row = 0; row < h; ++row, a += a_stride, b += b_stride) {
+    for (col = 0; col < w; ++col) {
+      const int d = a[col] - b[col];
+      diff_acc += d;
+      sq_acc += d * d;
+    }
+  }
+
+  *sum = diff_acc;
+  *sse = sq_acc;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void aq_highbd_variance64(const uint8_t *a8, int  a_stride,
+                                 const uint8_t *b8, int  b_stride,
+                                 int w, int h, uint64_t *sse, uint64_t *sum) {
+  // 16-bit-sample counterpart of aq_variance() with 64-bit accumulators.
+  const uint16_t *row_a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *row_b = CONVERT_TO_SHORTPTR(b8);
+  uint64_t sq_acc = 0, diff_acc = 0;
+  int row, col;
+  for (row = 0; row < h; ++row) {
+    for (col = 0; col < w; ++col) {
+      const int d = row_a[col] - row_b[col];
+      diff_acc += d;
+      sq_acc += d * d;
+    }
+    row_a += a_stride;
+    row_b += b_stride;
+  }
+  *sum = diff_acc;
+  *sse = sq_acc;
+}
+
+static void aq_highbd_8_variance(const uint8_t *a8, int  a_stride,
+                                 const uint8_t *b8, int  b_stride,
+                                 int w, int h, unsigned int *sse, int *sum) {
+  // Run the 64-bit accumulation, then narrow both results by plain casts.
+  uint64_t wide_sse = 0, wide_sum = 0;
+  aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &wide_sse, &wide_sum);
+  *sse = (unsigned int)wide_sse;
+  *sum = (int)wide_sum;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Variance measure of the plane-0 source block against an all-zero
+// reference, scaled by 256 and normalised by the number of pixels measured.
+// The 256x scaling is done in 64 bits: 256 * var can exceed UINT_MAX.
+static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
+                                   BLOCK_SIZE bs) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int var, sse;
+  int right_overflow = (xd->mb_to_right_edge < 0) ?
+      ((-xd->mb_to_right_edge) >> 3) : 0;
+  int bottom_overflow = (xd->mb_to_bottom_edge < 0) ?
+      ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+  if (right_overflow || bottom_overflow) {
+    // Negative edge distances shrink the measured window to bw x bh pixels.
+    const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
+    const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
+    int avg;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                           CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
+                           &sse, &avg);
+      sse >>= 2 * (xd->bd - 8);
+      avg >>= (xd->bd - 8);
+    } else {
+      aq_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                  vp9_64_zeros, 0, bw, bh, &sse, &avg);
+    }
+#else
+    aq_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                vp9_64_zeros, 0, bw, bh, &sse, &avg);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    var = sse - (((int64_t)avg * avg) / (bw * bh));
+    return (unsigned int)(((uint64_t)256 * var) / (bw * bh));
+  } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+                               CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, &sse);
+    } else {
+      var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+                               vp9_64_zeros, 0, &sse);
+    }
+#else
+    var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+                             vp9_64_zeros, 0, &sse);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    return (unsigned int)(((uint64_t)256 * var) >> num_pels_log2_lookup[bs]);
+  }
+}
+
+double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  unsigned int var = block_variance(cpi, x, bs);
+  vpx_clear_system_state();  // kept ahead of the floating-point math below
+  return log(var + 1.0);  // +1.0 keeps the argument of log() at 1 or above
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
+int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  // Log-variance relative to a midpoint, rounded and clamped to ENERGY_*.
+  double midpoint, rel_energy;
+  vpx_clear_system_state();
+  midpoint = DEFAULT_E_MIDPOINT;
+  if (cpi->oxcf.pass == 2) midpoint = cpi->twopass.mb_av_energy;
+  rel_energy = vp9_log_block_var(cpi, x, bs) - midpoint;
+  return clamp((int)round(rel_energy), ENERGY_MIN, ENERGY_MAX);
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_aq_variance.h b/libs/libvpx/vp9/encoder/vp9_aq_variance.h
new file mode 100644
index 0000000000..a0effa3116
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_aq_variance.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_AQ_VARIANCE_H_
+#define VP9_ENCODER_VP9_AQ_VARIANCE_H_
+
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vp9_vaq_segment_id(int energy);
+void vp9_vaq_frame_setup(VP9_COMP *cpi);
+
+int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_AQ_VARIANCE_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_bitstream.c b/libs/libvpx/vp9/encoder/vp9_bitstream.c
new file mode 100644
index 0000000000..5600ed4585
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_bitstream.c
@@ -0,0 +1,1245 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "vpx/vpx_encoder.h"
+#include "vpx_dsp/bitwriter_buffer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem_ops.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_bitstream.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_subexp.h"
+#include "vp9/encoder/vp9_tokenize.h"
+
+static const struct vp9_token intra_mode_encodings[INTRA_MODES] = {
+  {0, 1}, {6, 3}, {28, 5}, {30, 5}, {58, 6}, {59, 6}, {126, 7}, {127, 7},
+  {62, 6}, {2, 2}};
+static const struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS] =
+  {{0, 1}, {2, 2}, {3, 2}};
+static const struct vp9_token partition_encodings[PARTITION_TYPES] =
+  {{0, 1}, {2, 2}, {6, 3}, {7, 3}};
+static const struct vp9_token inter_mode_encodings[INTER_MODES] =
+  {{2, 2}, {6, 3}, {0, 1}, {7, 3}};
+
+static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode,
+                             const vpx_prob *probs) {  // intra-tree token
+  vp9_write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]);
+}
+
+static void write_inter_mode(vpx_writer *w, PREDICTION_MODE mode,
+                             const vpx_prob *probs) {
+  assert(is_inter_mode(mode));  // table below covers inter modes only
+  vp9_write_token(w, vp9_inter_mode_tree, probs,
+                  &inter_mode_encodings[INTER_OFFSET(mode)]);
+}
+
+static void encode_unsigned_max(struct vpx_write_bit_buffer *wb,
+                                int data, int max) {
+  vpx_wb_write_literal(wb, data, get_unsigned_bits(max));  // width set by max
+}
+
+static void prob_diff_update(const vpx_tree_index *tree,
+                             vpx_prob probs[/*n - 1*/],
+                             const unsigned int counts[/*n - 1*/],
+                             int n, vpx_writer *w) {
+  // Scratch branch counts; the fixed size is why n is capped at 32.
+  unsigned int node_counts[32][2];
+  int node = 0;
+
+  assert(n <= 32);
+  vp9_tree_probs_from_distribution(tree, node_counts, counts);
+  // Call vp9_cond_prob_diff_update() for each of the first n - 1 probs.
+  for (; node < n - 1; ++node)
+    vp9_cond_prob_diff_update(w, probs + node, node_counts[node]);
+}
+
+static void write_selected_tx_size(const VP9_COMMON *cm,
+                                   const MACROBLOCKD *xd, vpx_writer *w) {
+  const MODE_INFO *const mi = xd->mi[0];
+  const TX_SIZE chosen = mi->tx_size;
+  const TX_SIZE largest = max_txsize_lookup[mi->sb_type];
+  const vpx_prob *const p = get_tx_probs2(largest, xd, &cm->fc->tx_probs);
+  // Up to three binary decisions, stopping once the chosen size is
+  // identified or the maximum size looked up for this block is reached.
+  vpx_write(w, chosen != TX_4X4, p[0]);
+  if (chosen == TX_4X4 || largest < TX_16X16) return;
+  vpx_write(w, chosen != TX_8X8, p[1]);
+  if (chosen == TX_8X8 || largest < TX_32X32) return;
+  vpx_write(w, chosen != TX_16X16, p[2]);
+}
+
+static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                      int segment_id, const MODE_INFO *mi, vpx_writer *w) {
+  int skip_flag;
+  // An active SEG_LVL_SKIP feature means skip; nothing is written then.
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) return 1;
+
+  skip_flag = mi->skip;
+  vpx_write(w, skip_flag, vp9_get_skip_prob(cm, xd));
+  return skip_flag;
+}
+
+static void update_skip_probs(VP9_COMMON *cm, vpx_writer *w,
+                              FRAME_COUNTS *counts) {
+  vpx_prob *const probs = cm->fc->skip_probs;
+  int ctx;
+  for (ctx = 0; ctx < SKIP_CONTEXTS; ++ctx)  // one update per skip context
+    vp9_cond_prob_diff_update(w, probs + ctx, counts->skip[ctx]);
+}
+
+static void update_switchable_interp_probs(VP9_COMMON *cm, vpx_writer *w,
+                                           FRAME_COUNTS *counts) {
+  int ctx;  // switchable-filter context whose probabilities are updated
+  for (ctx = 0; ctx < SWITCHABLE_FILTER_CONTEXTS; ++ctx)
+    prob_diff_update(vp9_switchable_interp_tree,
+                     cm->fc->switchable_interp_prob[ctx],
+                     counts->switchable_interp[ctx], SWITCHABLE_FILTERS, w);
+}
+
+static void pack_mb_tokens(vpx_writer *w,
+                           TOKENEXTRA **tp, const TOKENEXTRA *const stop,
+                           vpx_bit_depth_t bit_depth) {
+  const TOKENEXTRA *p;  // read cursor; *tp is advanced from it on exit
+  const vp9_extra_bit *const extra_bits =
+#if CONFIG_VP9_HIGHBITDEPTH
+    (bit_depth == VPX_BITS_12) ? vp9_extra_bits_high12 :
+    (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 :
+    vp9_extra_bits;
+#else
+    vp9_extra_bits;
+    (void) bit_depth;  // only consulted when CONFIG_VP9_HIGHBITDEPTH is set
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  for (p = *tp; p < stop && p->token != EOSB_TOKEN; ++p) {
+    if (p->token == EOB_TOKEN) {
+      vpx_write(w, 0, p->context_tree[0]);
+      continue;  // EOB costs a single 0 bit; move on to the next token
+    }
+    vpx_write(w, 1, p->context_tree[0]);
+    while (p->token == ZERO_TOKEN) {
+      vpx_write(w, 0, p->context_tree[1]);
+      ++p;  // each ZERO_TOKEN costs one 0 bit against context_tree[1]
+      if (p == stop || p->token == EOSB_TOKEN) {
+        *tp = (TOKENEXTRA*)(uintptr_t)p + (p->token == EOSB_TOKEN);
+        return;  // ran out of tokens in the middle of a run of zeros
+      }
+    }
+
+    {
+      const int t = p->token;
+      const vpx_prob *const context_tree = p->context_tree;
+      assert(t != ZERO_TOKEN);
+      assert(t != EOB_TOKEN);
+      assert(t != EOSB_TOKEN);
+      vpx_write(w, 1, context_tree[1]);
+      if (t == ONE_TOKEN) {
+        vpx_write(w, 0, context_tree[2]);
+        vpx_write_bit(w, p->extra & 1);
+      } else {  // t >= TWO_TOKEN && t < EOB_TOKEN
+        const struct vp9_token *const a = &vp9_coef_encodings[t];
+        const int v = a->value;
+        const int n = a->len;
+        const int e = p->extra;
+        vpx_write(w, 1, context_tree[2]);
+        vp9_write_tree(w, vp9_coef_con_tree,
+                       vp9_pareto8_full[context_tree[PIVOT_NODE] - 1], v,
+                       n - UNCONSTRAINED_NODES, 0);
+        if (t >= CATEGORY1_TOKEN) {
+          const vp9_extra_bit *const b = &extra_bits[t];
+          const unsigned char *pb = b->prob;
+          int v = e >> 1;  // shadows the outer v: extra value without low bit
+          int n = b->len;  // number of bits in v, assumed nonzero
+          do {
+            const int bb = (v >> --n) & 1;
+            vpx_write(w, bb, *pb++);
+          } while (n);
+        }
+        vpx_write_bit(w, e & 1);  // low bit of extra is written last
+      }
+    }
+  }
+  *tp = (TOKENEXTRA*)(uintptr_t)p + (p->token == EOSB_TOKEN);  // skip EOSB
+}
+
+static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
+                             int segment_id) {
+  if (!seg->enabled || !seg->update_map) return;  // nothing to write
+  vp9_write_tree(w, vp9_segment_tree, seg->tree_probs, segment_id, 3, 0);
+}
+
+// Writes the reference-frame selection bits for the block at xd->mi[0].
+static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                             vpx_writer *w) {
+  const MODE_INFO *const mi = xd->mi[0];
+  const int is_compound = has_second_ref(mi);
+  const int segment_id = mi->segment_id;
+
+  // When SEG_LVL_REF_FRAME is active for the segment nothing is written here
+  // (the asserts only cross-check); otherwise the choice is coded below.
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    assert(!is_compound);
+    assert(mi->ref_frame[0] ==
+               get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+  } else {
+    // does the feature use compound prediction or not
+    // (if not specified at the frame/segment level)
+    if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+      vpx_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd));
+    } else {
+      assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE));
+    }
+
+    if (is_compound) {
+      vpx_write(w, mi->ref_frame[0] == GOLDEN_FRAME,
+                vp9_get_pred_prob_comp_ref_p(cm, xd));
+    } else {
+      const int bit0 = mi->ref_frame[0] != LAST_FRAME;
+      vpx_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd));
+      if (bit0) {  // not LAST_FRAME: a second bit separates GOLDEN from the rest
+        const int bit1 = mi->ref_frame[0] != GOLDEN_FRAME;
+        vpx_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd));
+      }
+    }
+  }
+}
+
+static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
+                                vpx_writer *w) {
+  VP9_COMMON *const cm = &cpi->common;
+  const nmv_context *nmvc = &cm->fc->nmvc;
+  const MACROBLOCK *const x = &cpi->td.mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct segmentation *const seg = &cm->seg;
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const PREDICTION_MODE mode = mi->mode;
+  const int segment_id = mi->segment_id;
+  const BLOCK_SIZE bsize = mi->sb_type;
+  const int allow_hp = cm->allow_high_precision_mv;
+  const int is_inter = is_inter_block(mi);
+  const int is_compound = has_second_ref(mi);
+  int skip, ref;
+
+  if (seg->update_map) {  // segment id: predicted flag and/or explicit id
+    if (seg->temporal_update) {
+      const int pred_flag = mi->seg_id_predicted;
+      vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
+      vpx_write(w, pred_flag, pred_prob);
+      if (!pred_flag)
+        write_segment_id(w, seg, segment_id);
+    } else {
+      write_segment_id(w, seg, segment_id);
+    }
+  }
+
+  skip = write_skip(cm, xd, segment_id, mi, w);
+
+  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+    vpx_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
+
+  if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
+      !(is_inter && skip)) {
+    write_selected_tx_size(cm, xd, w);
+  }
+
+  if (!is_inter) {  // intra-coded block: y mode(s) first, then the uv mode
+    if (bsize >= BLOCK_8X8) {
+      write_intra_mode(w, mode, cm->fc->y_mode_prob[size_group_lookup[bsize]]);
+    } else {
+      int idx, idy;
+      const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+      const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+      for (idy = 0; idy < 2; idy += num_4x4_h) {
+        for (idx = 0; idx < 2; idx += num_4x4_w) {
+          const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
+          write_intra_mode(w, b_mode, cm->fc->y_mode_prob[0]);
+        }
+      }
+    }
+    write_intra_mode(w, mi->uv_mode, cm->fc->uv_mode_prob[mode]);
+  } else {
+    const int mode_ctx = mbmi_ext->mode_context[mi->ref_frame[0]];
+    const vpx_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
+    write_ref_frames(cm, xd, w);
+
+    // If segment skip is not enabled code the mode.
+    if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+      if (bsize >= BLOCK_8X8) {
+        write_inter_mode(w, mode, inter_probs);
+      }
+    }
+
+    if (cm->interp_filter == SWITCHABLE) {
+      const int ctx = vp9_get_pred_context_switchable_interp(xd);
+      vp9_write_token(w, vp9_switchable_interp_tree,
+                      cm->fc->switchable_interp_prob[ctx],
+                      &switchable_interp_encodings[mi->interp_filter]);
+      ++cpi->interp_filter_selected[0][mi->interp_filter];  // usage count
+    } else {
+      assert(mi->interp_filter == cm->interp_filter);
+    }
+
+    if (bsize < BLOCK_8X8) {  // per-sub-block modes and motion vectors
+      const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+      const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+      int idx, idy;
+      for (idy = 0; idy < 2; idy += num_4x4_h) {
+        for (idx = 0; idx < 2; idx += num_4x4_w) {
+          const int j = idy * 2 + idx;
+          const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+          write_inter_mode(w, b_mode, inter_probs);
+          if (b_mode == NEWMV) {
+            for (ref = 0; ref < 1 + is_compound; ++ref)
+              vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
+                            &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv,
+                            nmvc, allow_hp);
+          }
+        }
+      }
+    } else {
+      if (mode == NEWMV) {  // motion vectors are only coded for NEWMV
+        for (ref = 0; ref < 1 + is_compound; ++ref)
+          vp9_encode_mv(cpi, w, &mi->mv[ref].as_mv,
+                        &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv, nmvc,
+                        allow_hp);
+      }
+    }
+  }
+}
+
+static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                              MODE_INFO **mi_8x8, vpx_writer *w) {
+  // Written in order: segment id (if the map is being updated), skip flag,
+  // tx size (if selectable), y mode(s), uv mode.
+  const struct segmentation *const seg = &cm->seg;
+  const MODE_INFO *const mi = mi_8x8[0];
+  const MODE_INFO *const above = xd->above_mi;
+  const MODE_INFO *const left = xd->left_mi;
+  const int sub8x8 = mi->sb_type < BLOCK_8X8;
+
+  if (seg->update_map) write_segment_id(w, seg, mi->segment_id);
+  write_skip(cm, xd, mi->segment_id, mi, w);
+  if (!sub8x8 && cm->tx_mode == TX_MODE_SELECT)
+    write_selected_tx_size(cm, xd, w);
+
+  if (sub8x8) {
+    // One y mode per step across the 2x2 grid of bmi entries.
+    const int step_w = num_4x4_blocks_wide_lookup[mi->sb_type];
+    const int step_h = num_4x4_blocks_high_lookup[mi->sb_type];
+    int row, col;
+
+    for (row = 0; row < 2; row += step_h) {
+      for (col = 0; col < 2; col += step_w) {
+        const int blk = row * 2 + col;
+        write_intra_mode(w, mi->bmi[blk].as_mode,
+                         get_y_mode_probs(mi, above, left, blk));
+      }
+    }
+  } else {
+    write_intra_mode(w, mi->mode, get_y_mode_probs(mi, above, left, 0));
+  }
+
+  write_intra_mode(w, mi->uv_mode, vp9_kf_uv_mode_prob[mi->mode]);
+}
+
+static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
+                          vpx_writer *w, TOKENEXTRA **tok,
+                          const TOKENEXTRA *const tok_end,
+                          int mi_row, int mi_col) {
+  const VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  MODE_INFO *cur;
+
+  // Point xd and the mbmi_ext cursor at this block's entries.
+  xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+  cur = xd->mi[0];
+  mb->mbmi_ext = mb->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+  set_mi_row_col(xd, tile,
+                 mi_row, num_8x8_blocks_high_lookup[cur->sb_type],
+                 mi_col, num_8x8_blocks_wide_lookup[cur->sb_type],
+                 cm->mi_rows, cm->mi_cols);
+
+  // Mode info first, then the block's tokens.
+  if (!frame_is_intra_only(cm))
+    pack_inter_mode_mvs(cpi, cur, w);
+  else
+    write_mb_modes_kf(cm, xd, xd->mi, w);
+
+  assert(*tok < tok_end);
+  pack_mb_tokens(w, tok, tok_end, cm->bit_depth);
+}
+
+static void write_partition(const VP9_COMMON *const cm,
+                            const MACROBLOCKD *const xd,
+                            int hbs, int mi_row, int mi_col,
+                            PARTITION_TYPE p, BLOCK_SIZE bsize, vpx_writer *w) {
+  // What is coded depends on whether mi_row + hbs / mi_col + hbs are in frame.
+  const vpx_prob *const probs =
+      xd->partition_probs[partition_plane_context(xd, mi_row, mi_col, bsize)];
+  const int rows_fit = (mi_row + hbs) < cm->mi_rows;
+  const int cols_fit = (mi_col + hbs) < cm->mi_cols;
+  if (rows_fit && cols_fit) {
+    vp9_write_token(w, vp9_partition_tree, probs, &partition_encodings[p]);
+  } else if (cols_fit) {
+    assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+    vpx_write(w, p == PARTITION_SPLIT, probs[1]);
+  } else if (rows_fit) {
+    assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+    vpx_write(w, p == PARTITION_SPLIT, probs[2]);
+  } else {
+    assert(p == PARTITION_SPLIT);
+  }
+}
+
+static void write_modes_sb(VP9_COMP *cpi,
+                           const TileInfo *const tile, vpx_writer *w,
+                           TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+                           int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  const VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+  const int bsl = b_width_log2_lookup[bsize];
+  const int bs = (1 << bsl) / 4;  // mi offset to the second half / quadrant
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+  const MODE_INFO *m = NULL;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;  // block origin lies outside the mi grid: nothing to write
+
+  m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+
+  partition = partition_lookup[bsl][m->sb_type];
+  write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
+  subsize = get_subsize(bsize, partition);
+  if (subsize < BLOCK_8X8) {  // sub-8x8 partitions are written by one call
+    write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        break;
+      case PARTITION_HORZ:
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        if (mi_row + bs < cm->mi_rows)  // second half only if inside grid
+          write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col);
+        break;
+      case PARTITION_VERT:
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        if (mi_col + bs < cm->mi_cols)  // second half only if inside grid
+          write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs);
+        break;
+      case PARTITION_SPLIT:
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs,
+                       subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col,
+                       subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
+                       subsize);
+        break;
+      default:
+        assert(0);
+    }
+  }
+
+  // update partition context (skipped for splits above 8x8: recursion did it)
+  if (bsize >= BLOCK_8X8 &&
+      (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+}
+// Writes all superblocks of one tile: steps rows then columns by MI_BLOCK_SIZE.
+static void write_modes(VP9_COMP *cpi,
+                        const TileInfo *const tile, vpx_writer *w,
+                        TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) {
+  const VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  int mi_row, mi_col;
+
+  set_partition_probs(cm, xd);
+
+  for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+       mi_row += MI_BLOCK_SIZE) {
+    vp9_zero(xd->left_seg_context);  // reset left context at the start of each row
+    for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+         mi_col += MI_BLOCK_SIZE)
+      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
+                     BLOCK_64X64);
+  }
+}
+// Fills coef_branch_ct and coef_probs for tx_size from the coefficient counts.
+static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size,
+                                    vp9_coeff_stats *coef_branch_ct,
+                                    vp9_coeff_probs_model *coef_probs) {
+  vp9_coeff_count *coef_counts = cpi->td.rd_counts.coef_counts[tx_size];
+  unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
+      cpi->common.counts.eob_branch[tx_size];
+  int i, j, k, l, m;
+
+  for (i = 0; i < PLANE_TYPES; ++i) {
+    for (j = 0; j < REF_TYPES; ++j) {
+      for (k = 0; k < COEF_BANDS; ++k) {
+        for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+          vp9_tree_probs_from_distribution(vp9_coef_tree,
+                                           coef_branch_ct[i][j][k][l],
+                                           coef_counts[i][j][k][l]);
+          coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
+                                             coef_branch_ct[i][j][k][l][0][0];  // node 0's [1] count is rebuilt from eob_branch_ct
+          for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+            coef_probs[i][j][k][l][m] = get_binary_prob(
+                                            coef_branch_ct[i][j][k][l][m][0],
+                                            coef_branch_ct[i][j][k][l][m][1]);
+        }
+      }
+    }
+  }
+}
+// Writes coef prob updates for tx_size; strategy chosen by sf.use_fast_coef_updates.
+static void update_coef_probs_common(vpx_writer* const bc, VP9_COMP *cpi,
+                                     TX_SIZE tx_size,
+                                     vp9_coeff_stats *frame_branch_ct,
+                                     vp9_coeff_probs_model *new_coef_probs) {
+  vp9_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
+  const vpx_prob upd = DIFF_UPDATE_PROB;
+  const int entropy_nodes_update = UNCONSTRAINED_NODES;
+  int i, j, k, l, t;
+  int stepsize = cpi->sf.coeff_prob_appx_step;  // passed to the PIVOT_NODE model search
+
+  switch (cpi->sf.use_fast_coef_updates) {
+    case TWO_LOOP: {
+      /* dry run to see if there is any update at all needed */
+      int savings = 0;
+      int update[2] = {0, 0};  // update[u]: number of nodes with flag u
+      for (i = 0; i < PLANE_TYPES; ++i) {
+        for (j = 0; j < REF_TYPES; ++j) {
+          for (k = 0; k < COEF_BANDS; ++k) {
+            for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+              for (t = 0; t < entropy_nodes_update; ++t) {
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                const vpx_prob oldp = old_coef_probs[i][j][k][l][t];
+                int s;
+                int u = 0;
+                if (t == PIVOT_NODE)
+                  s = vp9_prob_diff_update_savings_search_model(
+                      frame_branch_ct[i][j][k][l][0],
+                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                else
+                  s = vp9_prob_diff_update_savings_search(
+                      frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
+                if (s > 0 && newp != oldp)
+                  u = 1;
+                if (u)
+                  savings += s - (int)(vp9_cost_zero(upd));
+                else
+                  savings -= (int)(vp9_cost_zero(upd));
+                update[u]++;
+              }
+            }
+          }
+        }
+      }
+
+      // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
+      /* Is coef updated at all */
+      if (update[1] == 0 || savings < 0) {
+        vpx_write_bit(bc, 0);  // single 0 bit: no updates for this tx size
+        return;
+      }
+      vpx_write_bit(bc, 1);
+      for (i = 0; i < PLANE_TYPES; ++i) {
+        for (j = 0; j < REF_TYPES; ++j) {
+          for (k = 0; k < COEF_BANDS; ++k) {
+            for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+              // calc probs and branch cts for this frame only
+              for (t = 0; t < entropy_nodes_update; ++t) {
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
+                const vpx_prob upd = DIFF_UPDATE_PROB;  // shadows the function-level upd (same value)
+                int s;
+                int u = 0;
+                if (t == PIVOT_NODE)
+                  s = vp9_prob_diff_update_savings_search_model(
+                      frame_branch_ct[i][j][k][l][0],
+                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                else
+                  s = vp9_prob_diff_update_savings_search(
+                      frame_branch_ct[i][j][k][l][t],
+                      *oldp, &newp, upd);
+                if (s > 0 && newp != *oldp)
+                  u = 1;
+                vpx_write(bc, u, upd);
+                if (u) {
+                  /* send/use new probability */
+                  vp9_write_prob_diff_update(bc, newp, *oldp);
+                  *oldp = newp;  // stored back into cm->fc->coef_probs
+                }
+              }
+            }
+          }
+        }
+      }
+      return;
+    }
+
+    case ONE_LOOP_REDUCED: {
+      int updates = 0;
+      int noupdates_before_first = 0;  // no-update flags held back until the first update
+      for (i = 0; i < PLANE_TYPES; ++i) {
+        for (j = 0; j < REF_TYPES; ++j) {
+          for (k = 0; k < COEF_BANDS; ++k) {
+            for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+              // calc probs and branch cts for this frame only
+              for (t = 0; t < entropy_nodes_update; ++t) {
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
+                int s;
+                int u = 0;
+
+                if (t == PIVOT_NODE) {
+                  s = vp9_prob_diff_update_savings_search_model(
+                      frame_branch_ct[i][j][k][l][0],
+                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                } else {
+                  s = vp9_prob_diff_update_savings_search(
+                      frame_branch_ct[i][j][k][l][t],
+                      *oldp, &newp, upd);
+                }
+
+                if (s > 0 && newp != *oldp)
+                  u = 1;
+                updates += u;
+                if (u == 0 && updates == 0) {
+                  noupdates_before_first++;
+                  continue;
+                }
+                if (u == 1 && updates == 1) {
+                  int v;
+                  // first update
+                  vpx_write_bit(bc, 1);
+                  for (v = 0; v < noupdates_before_first; ++v)
+                    vpx_write(bc, 0, upd);  // emit the held-back no-update flags
+                }
+                vpx_write(bc, u, upd);
+                if (u) {
+                  /* send/use new probability */
+                  vp9_write_prob_diff_update(bc, newp, *oldp);
+                  *oldp = newp;  // stored back into cm->fc->coef_probs
+                }
+              }
+            }
+          }
+        }
+      }
+      if (updates == 0) {
+        vpx_write_bit(bc, 0);  // no updates
+      }
+      return;
+    }
+    default:
+      assert(0);
+  }
+}
+// Writes coef prob updates for each tx size from TX_4X4 up to the tx_mode maximum.
+static void update_coef_probs(VP9_COMP *cpi, vpx_writer* w) {
+  const TX_MODE tx_mode = cpi->common.tx_mode;
+  const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+  TX_SIZE tx_size;
+  for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) {
+    vp9_coeff_stats frame_branch_ct[PLANE_TYPES];
+    vp9_coeff_probs_model frame_coef_probs[PLANE_TYPES];
+    if (cpi->td.counts->tx.tx_totals[tx_size] <= 20 ||
+        (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) {
+      vpx_write_bit(w, 0);  // one 0 bit: skip updates for this tx size
+    } else {
+      build_tree_distribution(cpi, tx_size, frame_branch_ct,
+                              frame_coef_probs);
+      update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
+                               frame_coef_probs);
+    }
+  }
+}
+// Writes loop filter level, sharpness and any changed ref/mode deltas to wb.
+static void encode_loopfilter(struct loopfilter *lf,
+                              struct vpx_write_bit_buffer *wb) {
+  int i;
+
+  // Encode the loop filter level and type
+  vpx_wb_write_literal(wb, lf->filter_level, 6);
+  vpx_wb_write_literal(wb, lf->sharpness_level, 3);
+
+  // Write out loop filter deltas applied at the MB level based on mode or
+  // ref frame (if they are enabled).
+  vpx_wb_write_bit(wb, lf->mode_ref_delta_enabled);
+
+  if (lf->mode_ref_delta_enabled) {
+    vpx_wb_write_bit(wb, lf->mode_ref_delta_update);
+    if (lf->mode_ref_delta_update) {
+      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+        const int delta = lf->ref_deltas[i];
+        const int changed = delta != lf->last_ref_deltas[i];
+        vpx_wb_write_bit(wb, changed);
+        if (changed) {
+          lf->last_ref_deltas[i] = delta;  // remember the value being sent
+          vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6);  // 6-bit magnitude
+          vpx_wb_write_bit(wb, delta < 0);  // sign bit
+        }
+      }
+
+      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+        const int delta = lf->mode_deltas[i];
+        const int changed = delta != lf->last_mode_deltas[i];
+        vpx_wb_write_bit(wb, changed);
+        if (changed) {
+          lf->last_mode_deltas[i] = delta;  // remember the value being sent
+          vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6);  // 6-bit magnitude
+          vpx_wb_write_bit(wb, delta < 0);  // sign bit
+        }
+      }
+    }
+  }
+}
+// Writes delta_q as a presence bit, then 4-bit magnitude and sign bit if nonzero.
+static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
+  if (delta_q != 0) {
+    vpx_wb_write_bit(wb, 1);
+    vpx_wb_write_literal(wb, abs(delta_q), 4);
+    vpx_wb_write_bit(wb, delta_q < 0);
+  } else {
+    vpx_wb_write_bit(wb, 0);
+  }
+}
+// Writes base_qindex followed by the y_dc, uv_dc and uv_ac delta-q values.
+static void encode_quantization(const VP9_COMMON *const cm,
+                                struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
+  write_delta_q(wb, cm->y_dc_delta_q);
+  write_delta_q(wb, cm->uv_dc_delta_q);
+  write_delta_q(wb, cm->uv_ac_delta_q);
+}
+// Writes the segmentation enable bit and, if enabled, the map and feature data.
+static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                struct vpx_write_bit_buffer *wb) {
+  int i, j;
+
+  const struct segmentation *seg = &cm->seg;
+
+  vpx_wb_write_bit(wb, seg->enabled);
+  if (!seg->enabled)
+    return;
+
+  // Segmentation map
+  vpx_wb_write_bit(wb, seg->update_map);
+  if (seg->update_map) {
+    // Select the coding strategy (temporal or spatial)
+    vp9_choose_segmap_coding_method(cm, xd);
+    // Write out probabilities used to decode unpredicted macro-block segments
+    for (i = 0; i < SEG_TREE_PROBS; i++) {
+      const int prob = seg->tree_probs[i];
+      const int update = prob != MAX_PROB;  // only non-MAX_PROB values are sent
+      vpx_wb_write_bit(wb, update);
+      if (update)
+        vpx_wb_write_literal(wb, prob, 8);
+    }
+
+    // Write out the chosen coding method.
+    vpx_wb_write_bit(wb, seg->temporal_update);
+    if (seg->temporal_update) {
+      for (i = 0; i < PREDICTION_PROBS; i++) {
+        const int prob = seg->pred_probs[i];
+        const int update = prob != MAX_PROB;  // only non-MAX_PROB values are sent
+        vpx_wb_write_bit(wb, update);
+        if (update)
+          vpx_wb_write_literal(wb, prob, 8);
+      }
+    }
+  }
+
+  // Segmentation data
+  vpx_wb_write_bit(wb, seg->update_data);
+  if (seg->update_data) {
+    vpx_wb_write_bit(wb, seg->abs_delta);
+
+    for (i = 0; i < MAX_SEGMENTS; i++) {
+      for (j = 0; j < SEG_LVL_MAX; j++) {
+        const int active = segfeature_active(seg, i, j);
+        vpx_wb_write_bit(wb, active);
+        if (active) {
+          const int data = get_segdata(seg, i, j);
+          const int data_max = vp9_seg_feature_data_max(j);
+
+          if (vp9_is_segfeature_signed(j)) {
+            encode_unsigned_max(wb, abs(data), data_max);  // magnitude first
+            vpx_wb_write_bit(wb, data < 0);  // then sign bit
+          } else {
+            encode_unsigned_max(wb, data, data_max);
+          }
+        }
+      }
+    }
+  }
+}
+// Writes the frame tx_mode and, for TX_MODE_SELECT, the tx size prob updates.
+static void encode_txfm_probs(VP9_COMMON *cm, vpx_writer *w,
+                              FRAME_COUNTS *counts) {
+  // Mode
+  vpx_write_literal(w, VPXMIN(cm->tx_mode, ALLOW_32X32), 2);  // capped to fit 2 bits
+  if (cm->tx_mode >= ALLOW_32X32)
+    vpx_write_bit(w, cm->tx_mode == TX_MODE_SELECT);  // extra bit flags TX_MODE_SELECT
+
+  // Probabilities
+  if (cm->tx_mode == TX_MODE_SELECT) {
+    int i, j;
+    unsigned int ct_8x8p[TX_SIZES - 3][2];
+    unsigned int ct_16x16p[TX_SIZES - 2][2];
+    unsigned int ct_32x32p[TX_SIZES - 1][2];
+
+
+    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+      tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], ct_8x8p);
+      for (j = 0; j < TX_SIZES - 3; j++)
+        vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p8x8[i][j], ct_8x8p[j]);
+    }
+
+    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+      tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], ct_16x16p);
+      for (j = 0; j < TX_SIZES - 2; j++)
+        vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p16x16[i][j],
+                                  ct_16x16p[j]);
+    }
+
+    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+      tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], ct_32x32p);
+      for (j = 0; j < TX_SIZES - 1; j++)
+        vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p32x32[i][j],
+                                  ct_32x32p[j]);
+    }
+  }
+}
+// Writes a SWITCHABLE flag bit, then a 2-bit mapped literal for a fixed filter.
+static void write_interp_filter(INTERP_FILTER filter,
+                                struct vpx_write_bit_buffer *wb) {
+  const int filter_to_literal[] = { 1, 0, 2, 3 };  // indexed by the filter value
+
+  vpx_wb_write_bit(wb, filter == SWITCHABLE);
+  if (filter != SWITCHABLE)
+    vpx_wb_write_literal(wb, filter_to_literal[filter], 2);
+}
+// If SWITCHABLE but the counts show a single filter in use, sets it frame-wide.
+static void fix_interp_filter(VP9_COMMON *cm, FRAME_COUNTS *counts) {
+  if (cm->interp_filter == SWITCHABLE) {
+    // Check to see if only one of the filters is actually used
+    int count[SWITCHABLE_FILTERS];
+    int i, j, c = 0;
+    for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+      count[i] = 0;
+      for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+        count[i] += counts->switchable_interp[j][i];
+      c += (count[i] > 0);  // c = number of filters with a nonzero count
+    }
+    if (c == 1) {
+      // Only one filter is used. So set the filter at frame level
+      for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+        if (count[i]) {
+          cm->interp_filter = i;
+          break;
+        }
+      }
+    }
+  }
+}
+// Writes tile columns as 1-bits plus an optional 0 stop bit, then the tile rows.
+static void write_tile_info(const VP9_COMMON *const cm,
+                            struct vpx_write_bit_buffer *wb) {
+  int min_log2_tile_cols, max_log2_tile_cols, ones;
+  vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+
+  // columns
+  ones = cm->log2_tile_cols - min_log2_tile_cols;  // number of 1-bits to emit
+  while (ones--)
+    vpx_wb_write_bit(wb, 1);
+
+  if (cm->log2_tile_cols < max_log2_tile_cols)
+    vpx_wb_write_bit(wb, 0);  // stop bit, omitted when already at the maximum
+
+  // rows
+  vpx_wb_write_bit(wb, cm->log2_tile_rows != 0);
+  if (cm->log2_tile_rows != 0)
+    vpx_wb_write_bit(wb, cm->log2_tile_rows != 1);  // separates 1 from other nonzero values
+}
+// Builds the refresh bitmask: each refresh flag shifted by its frame buffer index.
+static int get_refresh_mask(VP9_COMP *cpi) {
+  if (vp9_preserve_existing_gf(cpi)) {
+    // We have decided to preserve the previously existing golden frame as our
+    // new ARF frame. However, in the short term we leave it in the GF slot and,
+    // if we're updating the GF with the current decoded frame, we save it
+    // instead to the ARF slot.
+    // Later, in the function vp9_encoder.c:vp9_update_reference_frames() we
+    // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it
+    // there so that it can be done outside of the recode loop.
+    // Note: This is highly specific to the use of ARF as a forward reference,
+    // and this needs to be generalized as other uses are implemented
+    // (like RTC/temporal scalability).
+    return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+           (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+  } else {
+    int arf_idx = cpi->alt_fb_idx;
+    if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
+      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+      arf_idx = gf_group->arf_update_idx[gf_group->index];  // pass 2 + multi-ARF: index from GF group
+    }
+    return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+           (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
+           (cpi->refresh_alt_ref_frame << arf_idx);
+  }
+}
+// Encodes every tile into data_ptr and returns the total number of bytes written.
+static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
+  VP9_COMMON *const cm = &cpi->common;
+  vpx_writer residual_bc;
+  int tile_row, tile_col;
+  TOKENEXTRA *tok_end;
+  size_t total_size = 0;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+
+  memset(cm->above_seg_context, 0,
+         sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
+
+  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+      int tile_idx = tile_row * tile_cols + tile_col;
+      TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
+
+      tok_end = cpi->tile_tok[tile_row][tile_col] +
+          cpi->tok_count[tile_row][tile_col];
+
+      if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
+        vpx_start_encode(&residual_bc, data_ptr + total_size + 4);  // leave 4 bytes for the size prefix
+      else
+        vpx_start_encode(&residual_bc, data_ptr + total_size);  // last tile carries no size prefix
+
+      write_modes(cpi, &cpi->tile_data[tile_idx].tile_info,
+                  &residual_bc, &tok, tok_end);
+      assert(tok == tok_end);  // every token of the tile must have been consumed
+      vpx_stop_encode(&residual_bc);
+      if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
+        // size of this tile
+        mem_put_be32(data_ptr + total_size, residual_bc.pos);
+        total_size += 4;
+      }
+
+      total_size += residual_bc.pos;
+    }
+  }
+
+  return total_size;
+}
+// Writes a flag and, when render size differs from frame size, both sizes minus 1.
+static void write_render_size(const VP9_COMMON *cm,
+                              struct vpx_write_bit_buffer *wb) {
+  const int scaling_active = cm->width != cm->render_width ||
+                             cm->height != cm->render_height;
+  vpx_wb_write_bit(wb, scaling_active);
+  if (scaling_active) {
+    vpx_wb_write_literal(wb, cm->render_width - 1, 16);
+    vpx_wb_write_literal(wb, cm->render_height - 1, 16);
+  }
+}
+// Writes width-1 and height-1 as 16-bit literals, then the render size info.
+static void write_frame_size(const VP9_COMMON *cm,
+                             struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, cm->width - 1, 16);
+  vpx_wb_write_literal(wb, cm->height - 1, 16);
+
+  write_render_size(cm, wb);
+}
+// Signals the frame size via a size-matching reference, else writes it explicitly.
+static void write_frame_size_with_refs(VP9_COMP *cpi,
+                                       struct vpx_write_bit_buffer *wb) {
+  VP9_COMMON *const cm = &cpi->common;
+  int found = 0;
+
+  MV_REFERENCE_FRAME ref_frame;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame);
+
+    // Set "found" to 0 for temporal svc and for spatial svc key frame
+    if (cpi->use_svc &&
+        ((cpi->svc.number_temporal_layers > 1 &&
+         cpi->oxcf.rc_mode == VPX_CBR) ||
+        (cpi->svc.number_spatial_layers > 1 &&
+         cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame) ||
+        (is_two_pass_svc(cpi) &&
+         cpi->svc.encode_empty_frame_state == ENCODING &&
+         cpi->svc.layer_context[0].frames_from_key_frame <
+         cpi->svc.number_temporal_layers + 1))) {
+      found = 0;
+    } else if (cfg != NULL) {
+      found = cm->width == cfg->y_crop_width &&
+              cm->height == cfg->y_crop_height;
+    }
+    vpx_wb_write_bit(wb, found);  // one flag per reference until a match is found
+    if (found) {
+      break;
+    }
+  }
+
+  if (!found) {  // no reference matched: write the size explicitly
+    vpx_wb_write_literal(wb, cm->width - 1, 16);
+    vpx_wb_write_literal(wb, cm->height - 1, 16);
+  }
+
+  write_render_size(cm, wb);
+}
+// Writes the three VP9_SYNC_CODE_* values as consecutive 8-bit literals.
+static void write_sync_code(struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, VP9_SYNC_CODE_0, 8);
+  vpx_wb_write_literal(wb, VP9_SYNC_CODE_1, 8);
+  vpx_wb_write_literal(wb, VP9_SYNC_CODE_2, 8);
+}
+// Writes the profile as a fixed 2-bit code (3 bits for PROFILE_3).
+static void write_profile(BITSTREAM_PROFILE profile,
+                          struct vpx_write_bit_buffer *wb) {
+  switch (profile) {
+    case PROFILE_0:
+      vpx_wb_write_literal(wb, 0, 2);
+      break;
+    case PROFILE_1:
+      vpx_wb_write_literal(wb, 2, 2);
+      break;
+    case PROFILE_2:
+      vpx_wb_write_literal(wb, 1, 2);
+      break;
+    case PROFILE_3:
+      vpx_wb_write_literal(wb, 6, 3);
+      break;
+    default:
+      assert(0);
+  }
+}
+// Writes bit depth (profile >= 2), color space, color range and subsampling bits.
+static void write_bitdepth_colorspace_sampling(
+    VP9_COMMON *const cm, struct vpx_write_bit_buffer *wb) {
+  if (cm->profile >= PROFILE_2) {
+    assert(cm->bit_depth > VPX_BITS_8);
+    vpx_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
+  }
+  vpx_wb_write_literal(wb, cm->color_space, 3);
+  if (cm->color_space != VPX_CS_SRGB) {
+    // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+    vpx_wb_write_bit(wb, cm->color_range);
+    if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
+      assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
+      vpx_wb_write_bit(wb, cm->subsampling_x);
+      vpx_wb_write_bit(wb, cm->subsampling_y);
+      vpx_wb_write_bit(wb, 0);  // unused
+    } else {
+      assert(cm->subsampling_x == 1 && cm->subsampling_y == 1);  // nothing written in this case
+    }
+  } else {
+    assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3);
+    vpx_wb_write_bit(wb, 0);  // unused
+  }
+}
+// Writes the uncompressed frame header (marker through tile info) to wb.
+static void write_uncompressed_header(VP9_COMP *cpi,
+                                      struct vpx_write_bit_buffer *wb) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+  vpx_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
+
+  write_profile(cm->profile, wb);
+
+  vpx_wb_write_bit(wb, 0);  // show_existing_frame
+  vpx_wb_write_bit(wb, cm->frame_type);
+  vpx_wb_write_bit(wb, cm->show_frame);
+  vpx_wb_write_bit(wb, cm->error_resilient_mode);
+
+  if (cm->frame_type == KEY_FRAME) {
+    write_sync_code(wb);
+    write_bitdepth_colorspace_sampling(cm, wb);
+    write_frame_size(cm, wb);
+  } else {
+    // In spatial svc if it's not error_resilient_mode then we need to code all
+    // visible frames as invisible. But we need to keep the show_frame flag so
+    // that the publisher could know whether it is supposed to be visible.
+    // So we will code the show_frame flag as it is. Then code the intra_only
+    // bit here. This will make the bitstream incompatible. In the player we
+    // will change the show_frame flag to 0, then add a one-byte frame with
+    // show_existing_frame flag which tells the decoder which frame we want to
+    // show.
+    if (!cm->show_frame)
+      vpx_wb_write_bit(wb, cm->intra_only);
+
+    if (!cm->error_resilient_mode)
+      vpx_wb_write_literal(wb, cm->reset_frame_context, 2);
+
+    if (cm->intra_only) {
+      write_sync_code(wb);
+
+      // Note for profile 0, 420 8bpp is assumed.
+      if (cm->profile > PROFILE_0) {
+        write_bitdepth_colorspace_sampling(cm, wb);
+      }
+
+      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      write_frame_size(cm, wb);
+    } else {
+      MV_REFERENCE_FRAME ref_frame;
+      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+        assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+        vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+                             REF_FRAMES_LOG2);
+        vpx_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
+      }
+
+      write_frame_size_with_refs(cpi, wb);
+
+      vpx_wb_write_bit(wb, cm->allow_high_precision_mv);
+
+      fix_interp_filter(cm, cpi->td.counts);  // may replace SWITCHABLE before it is written
+      write_interp_filter(cm->interp_filter, wb);
+    }
+  }
+
+  if (!cm->error_resilient_mode) {
+    vpx_wb_write_bit(wb, cm->refresh_frame_context);
+    vpx_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
+  }
+
+  vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
+
+  encode_loopfilter(&cm->lf, wb);
+  encode_quantization(cm, wb);
+  encode_segmentation(cm, xd, wb);
+
+  write_tile_info(cm, wb);
+}
+// Writes the compressed header (probability updates) to data; returns its size.
+static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  FRAME_CONTEXT *const fc = cm->fc;
+  FRAME_COUNTS *counts = cpi->td.counts;
+  vpx_writer header_bc;
+
+  vpx_start_encode(&header_bc, data);
+
+  if (xd->lossless)
+    cm->tx_mode = ONLY_4X4;  // lossless: tx mode is forced and nothing is written
+  else
+    encode_txfm_probs(cm, &header_bc, counts);
+
+  update_coef_probs(cpi, &header_bc);
+  update_skip_probs(cm, &header_bc, counts);
+
+  if (!frame_is_intra_only(cm)) {
+    int i;
+
+    for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+      prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i],
+                       counts->inter_mode[i], INTER_MODES, &header_bc);
+
+    if (cm->interp_filter == SWITCHABLE)
+      update_switchable_interp_probs(cm, &header_bc, counts);
+
+    for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+      vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
+                                counts->intra_inter[i]);
+
+    if (cpi->allow_comp_inter_inter) {
+      const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
+      const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
+
+      vpx_write_bit(&header_bc, use_compound_pred);
+      if (use_compound_pred) {
+        vpx_write_bit(&header_bc, use_hybrid_pred);
+        if (use_hybrid_pred)
+          for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+            vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
+                                      counts->comp_inter[i]);
+      }
+    }
+
+    if (cm->reference_mode != COMPOUND_REFERENCE) {
+      for (i = 0; i < REF_CONTEXTS; i++) {
+        vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
+                                  counts->single_ref[i][0]);
+        vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
+                                  counts->single_ref[i][1]);
+      }
+    }
+
+    if (cm->reference_mode != SINGLE_REFERENCE)
+      for (i = 0; i < REF_CONTEXTS; i++)
+        vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
+                                  counts->comp_ref[i]);
+
+    for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+      prob_diff_update(vp9_intra_mode_tree, cm->fc->y_mode_prob[i],
+                       counts->y_mode[i], INTRA_MODES, &header_bc);
+
+    for (i = 0; i < PARTITION_CONTEXTS; ++i)
+      prob_diff_update(vp9_partition_tree, fc->partition_prob[i],
+                       counts->partition[i], PARTITION_TYPES, &header_bc);
+
+    vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc,
+                        &counts->mv);
+  }
+
+  vpx_stop_encode(&header_bc);
+  assert(header_bc.pos <= 0xffff);  // vp9_pack_bitstream stores this size in 16 bits
+
+  return header_bc.pos;
+}
+// Packs both headers and all tiles into dest; stores the byte count in *size.
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
+  uint8_t *data = dest;
+  size_t first_part_size, uncompressed_hdr_size;
+  struct vpx_write_bit_buffer wb = {data, 0};
+  struct vpx_write_bit_buffer saved_wb;
+
+  write_uncompressed_header(cpi, &wb);
+  saved_wb = wb;  // remember where the 16-bit size field starts so it can be filled in later
+  vpx_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
+
+  uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
+  data += uncompressed_hdr_size;
+
+  vpx_clear_system_state();
+
+  first_part_size = write_compressed_header(cpi, data);
+  data += first_part_size;
+  // TODO(jbb): Figure out what to do if first_part_size > 16 bits.
+  vpx_wb_write_literal(&saved_wb, (int)first_part_size, 16);  // back-patch the placeholder
+
+  data += encode_tiles(cpi, data);
+
+  *size = data - dest;
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_bitstream.h b/libs/libvpx/vp9/encoder/vp9_bitstream.h
new file mode 100644
index 0000000000..da6b414642
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_bitstream.h
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_BITSTREAM_H_
+#define VP9_ENCODER_VP9_BITSTREAM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp9/encoder/vp9_encoder.h"
+
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
+// Returns nonzero when the existing golden frame is to be preserved as the new ARF.
+static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
+  return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
+         cpi->rc.is_src_frame_alt_ref &&
+         (!cpi->use_svc ||      // Add spatial svc base layer case here
+          (is_two_pass_svc(cpi) &&
+           cpi->svc.spatial_layer_id == 0 &&
+           cpi->svc.layer_context[0].gold_ref_idx >=0 &&
+           cpi->oxcf.ss_enable_auto_arf[0]));
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_BITSTREAM_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_block.h b/libs/libvpx/vp9/encoder/vp9_block.h
new file mode 100644
index 0000000000..147743e8d8
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_block.h
@@ -0,0 +1,160 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_BLOCK_H_
+#define VP9_ENCODER_VP9_BLOCK_H_
+
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_entropy.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Small stats record (sse, sum, var); its users are outside this header.
+typedef struct {
+  unsigned int sse;
+  int sum;
+  unsigned int var;
+} diff;
+// Per-plane encoder data: source diff, coefficient buffers and quantizer tables.
+struct macroblock_plane {
+  DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
+  tran_low_t *qcoeff;
+  tran_low_t *coeff;
+  uint16_t *eobs;
+  struct buf_2d src;
+
+  // Quantizer settings
+  int16_t *quant_fp;
+  int16_t *round_fp;
+  int16_t *quant;
+  int16_t *quant_shift;
+  int16_t *zbin;
+  int16_t *round;
+
+  int64_t quant_thred[2];
+};
+// Token cost table, indexed [plane][ref][band][skip-EOB flag][context][token].
+/* The [2] dimension is for whether we skip the EOB node (i.e. if previous
+ * coefficient in this block was zero) or not. */
+typedef unsigned int vp9_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][2]
+                                   [COEFF_CONTEXTS][ENTROPY_TOKENS];
+// Candidate reference MVs and a mode context byte, each indexed by ref frame.
+typedef struct {
+  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+  uint8_t mode_context[MAX_REF_FRAMES];
+} MB_MODE_INFO_EXT;
+// Encoder-side macroblock state: per-plane data, RD parameters, costs, MV limits.
+typedef struct macroblock MACROBLOCK;
+struct macroblock {
+  struct macroblock_plane plane[MAX_MB_PLANE];
+
+  MACROBLOCKD e_mbd;
+  MB_MODE_INFO_EXT *mbmi_ext;
+  MB_MODE_INFO_EXT *mbmi_ext_base;
+  int skip_block;
+  int select_tx_size;
+  int skip_recode;
+  int skip_optimize;
+  int q_index;
+
+  // The equivalent error at the current rdmult of one whole bit (not one
+  // bitcost unit).
+  int errorperbit;
+  // The equivalent SAD error of one (whole) bit at the current quantizer
+  // for large blocks.
+  int sadperbit16;
+  // The equivalent SAD error of one (whole) bit at the current quantizer
+  // for sub-8x8 blocks.
+  int sadperbit4;
+  int rddiv;
+  int rdmult;
+  int mb_energy;
+  int * m_search_count_ptr;
+  int * ex_search_count_ptr;
+
+  // These are set to their default values at the beginning, and then adjusted
+  // further in the encoding process.
+  BLOCK_SIZE min_partition_size;
+  BLOCK_SIZE max_partition_size;
+
+  int mv_best_ref_index[MAX_REF_FRAMES];
+  unsigned int max_mv_context[MAX_REF_FRAMES];
+  unsigned int source_variance;
+  unsigned int pred_sse[MAX_REF_FRAMES];
+  int pred_mv_sad[MAX_REF_FRAMES];
+
+  int nmvjointcost[MV_JOINTS];
+  int *nmvcost[2];
+  int *nmvcost_hp[2];
+  int **mvcost;
+
+  int nmvjointsadcost[MV_JOINTS];
+  int *nmvsadcost[2];
+  int *nmvsadcost_hp[2];
+  int **mvsadcost;
+
+  // These define limits to motion vector components to prevent them
+  // from extending outside the UMV borders
+  int mv_col_min;
+  int mv_col_max;
+  int mv_row_min;
+  int mv_row_max;
+
+  // Notes transform blocks where no coefficients are coded.
+  // Set during mode selection. Read during block encoding.
+  uint8_t zcoeff_blk[TX_SIZES][256];
+
+  int skip;
+
+  int encode_breakout;
+
+  // note that token_costs is the cost when eob node is skipped
+  vp9_coeff_cost token_costs[TX_SIZES];
+
+  int optimize;
+
+  // indicate if it is in the rd search loop or encoding process
+  int use_lp32x32fdct;
+  int skip_encode;
+
+  // use fast quantization process
+  int quant_fp;
+
+  // skip forward transform and quantization
+  uint8_t skip_txfm[MAX_MB_PLANE << 2];
+  #define SKIP_TXFM_NONE 0
+  #define SKIP_TXFM_AC_DC 1
+  #define SKIP_TXFM_AC_ONLY 2
+
+  int64_t bsse[MAX_MB_PLANE << 2];
+
+  // Used to store sub partition's choices.
+  MV pred_mv[MAX_REF_FRAMES];
+
+  // Strong color activity detection. Used in RTC coding mode to enhance
+  // the visual quality at the boundary of moving color objects.
+  uint8_t color_sensitivity[2];
+
+  uint8_t sb_is_skin;
+
+  void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
+  void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
+#if CONFIG_VP9_HIGHBITDEPTH
+  void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd);
+#endif
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_BLOCK_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_blockiness.c b/libs/libvpx/vp9/encoder/vp9_blockiness.c
new file mode 100644
index 0000000000..1a89ce4f0d
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_blockiness.c
@@ -0,0 +1,134 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/system_state.h"
+
+static int horizontal_filter(const uint8_t *s) {  // taps: -2, 6, -6, 2
+  return 6 * (s[-1] - s[0]) + 2 * (s[1] - s[-2]);
+}
+
+static int vertical_filter(const uint8_t *s, int p) {  // same taps, pitch p
+  return 6 * (s[-p] - s[0]) + 2 * (s[p] - s[-2 * p]);
+}
+
+static int variance(int sum, int sum_squared, int size) {  // E[x^2] - E[x]^2
+  return sum_squared / size - (sum / size) * (sum / size);  // int division
+}
+// Calculate a blockiness level for a vertical block edge.
+// This function returns a new blockiness metric that's defined as
+
+//              p0 p1 p2 p3
+//              q0 q1 q2 q3
+// block edge ->
+//              r0 r1 r2 r3
+//              s0 s1 s2 s3
+
+// blockiness =  p0*-2+q0*6+r0*-6+s0*2 +
+//               p1*-2+q1*6+r1*-6+s1*2 +
+//               p2*-2+q2*6+r2*-6+s2*2 +
+//               p3*-2+q3*6+r3*-6+s3*2 ;
+
+// reconstructed_blockiness = max(abs(blockiness from reconstructed buffer) -
+//                                abs(blockiness from source buffer), 0)
+//
+// I make the assumption that flat blocks are much more visible than high
+// contrast blocks. As such, I scale the result of the blockiness calc
+// by dividing the blockiness by the variance of the pixels on either side
+// of the edge as follows:
+// var_0 = (q0^2+q1^2+q2^2+q3^2) / 4 - ((q0 + q1 + q2 + q3) / 4 )^2
+// var_1 = (r0^2+r1^2+r2^2+r3^2) / 4 - ((r0 + r1 + r2 + r3) / 4 )^2
+// The returned blockiness is the scaled value
+// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ;
+static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r,
+                               int rp, int size) {
+  // Summed edge-filter response of the source (s) and reconstruction (r).
+  int src_edge = 0;
+  int rec_edge = 0;
+  // Sum and sum of squares of the source column at the edge (s[0]) and of
+  // the column just before it (s[-1]).
+  int cur_sum = 0, cur_sq = 0;
+  int prev_sum = 0, prev_sq = 0;
+  int row, excess, divisor;
+  for (row = 0; row < size; ++row) {
+    const int cur = s[0];
+    const int prev = s[-1];
+    src_edge += horizontal_filter(s);
+    rec_edge += horizontal_filter(r);
+    cur_sum += cur;
+    cur_sq += cur * cur;
+    prev_sum += prev;
+    prev_sq += prev * prev;
+    s += sp;
+    r += rp;
+  }
+  divisor = 1 + variance(cur_sum, cur_sq, size) +
+            variance(prev_sum, prev_sq, size);
+  // Only edge energy that the reconstruction adds over the source counts.
+  excess = abs(rec_edge) - abs(src_edge);
+  if (excess <= 0) return 0;
+  return excess / divisor;
+}
+
+// Calculate a blockiness level for a horizontal block edge
+// same as above.
+static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r,
+                                 int rp, int size) {
+  // Summed edge-filter response of the source (s) and reconstruction (r).
+  int src_edge = 0;
+  int rec_edge = 0;
+  // Sum and sum of squares of the source row at the edge (s[0]) and of the
+  // row one pitch before it (s[-sp]).
+  int cur_sum = 0, cur_sq = 0;
+  int prev_sum = 0, prev_sq = 0;
+  int col, excess, divisor;
+
+  for (col = 0; col < size; ++col) {
+    const int cur = s[col];
+    const int prev = s[col - sp];
+    src_edge += vertical_filter(s + col, sp);
+    rec_edge += vertical_filter(r + col, rp);
+    cur_sum += cur;
+    cur_sq += cur * cur;
+    prev_sum += prev;
+    prev_sq += prev * prev;
+  }
+
+  divisor = 1 + variance(cur_sum, cur_sq, size) +
+            variance(prev_sum, prev_sq, size);
+  // Only edge energy that the reconstruction adds over the source counts.
+  excess = abs(rec_edge) - abs(src_edge);
+  if (excess <= 0) return 0;
+  return excess / divisor;
+}
+
+// This function returns the blockiness for the entire frame currently by
+// looking at all borders in steps of 4.
+double vp9_get_blockiness(const uint8_t *img1, int img1_pitch,
+                          const uint8_t *img2, int img2_pitch,
+                          int width, int height) {
+  double blockiness = 0;
+  int i, j;
+  vpx_clear_system_state();
+  // No complete 4x4 block: avoid the 0.0 / 0 (NaN) average below.
+  if (width * height / 16 <= 0) return 0;
+  for (i = 0; i < height; i += 4, img1 += img1_pitch * 4,
+       img2 += img2_pitch * 4) {
+    if (i == 0) continue;  // row 0: the filters read rows before the edge
+    for (j = 4; j < width; j += 4) {  // column 0 is skipped likewise
+      blockiness += blockiness_vertical(img1 + j, img1_pitch,
+                                        img2 + j, img2_pitch, 4);
+      blockiness += blockiness_horizontal(img1 + j, img1_pitch,
+                                          img2 + j, img2_pitch, 4);
+    }
+  }
+  return blockiness / (width * height / 16);  // mean per 4x4 block
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_context_tree.c b/libs/libvpx/vp9/encoder/vp9_context_tree.c
new file mode 100644
index 0000000000..396ed3fe73
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_context_tree.c
@@ -0,0 +1,161 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_encoder.h"
+
+// Square block sizes by partition-tree level; index 0 is the leaf level and
+// the last entry is the root.
+static const BLOCK_SIZE square[] = {
+  BLOCK_8X8,   BLOCK_16X16,
+  BLOCK_32X32, BLOCK_64X64,
+};
+
+static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
+                               PICK_MODE_CONTEXT *ctx) {
+  const int num_blk = (num_4x4_blk < 4 ? 4 : num_4x4_blk);  // at least 4
+  const int num_pix = num_blk << 4;  // 16 samples per 4x4 block
+  int i, k;
+  ctx->num_4x4_blk = num_blk;
+  // Allocates ctx's per-block flags and its per-plane coefficient/eob buffers.
+  CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+                  vpx_calloc(num_blk, sizeof(uint8_t)));
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (k = 0; k < 3; ++k) {  // three buffer sets per plane
+      CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
+                      vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
+      CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
+                      vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
+      CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
+                      vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
+      CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
+                      vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
+      ctx->coeff_pbuf[i][k]   = ctx->coeff[i][k];  // each *_pbuf starts out
+      ctx->qcoeff_pbuf[i][k]  = ctx->qcoeff[i][k];  // pointing at the buffer
+      ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];  // allocated just above
+      ctx->eobs_pbuf[i][k]    = ctx->eobs[i][k];
+    }
+  }
+}
+
+static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
+  int plane, set;  // every pointer is nulled once its buffer is released
+  vpx_free(ctx->zcoeff_blk);
+  ctx->zcoeff_blk = NULL;
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    for (set = 0; set < 3; ++set) {
+      vpx_free(ctx->coeff[plane][set]);
+      vpx_free(ctx->qcoeff[plane][set]);
+      vpx_free(ctx->dqcoeff[plane][set]);
+      vpx_free(ctx->eobs[plane][set]);
+      ctx->coeff[plane][set] = NULL;
+      ctx->qcoeff[plane][set] = NULL;
+      ctx->dqcoeff[plane][set] = NULL;
+      ctx->eobs[plane][set] = NULL;
+    }
+  }
+}
+
+static void alloc_tree_contexts(VP9_COMMON *cm, PC_TREE *tree,
+                                int num_4x4_blk) {
+  const int half_blk = num_4x4_blk / 2;  // size requested for each half
+  alloc_mode_context(cm, num_4x4_blk, &tree->none);
+  alloc_mode_context(cm, half_blk, &tree->horizontal[0]);
+  alloc_mode_context(cm, half_blk, &tree->vertical[0]);
+  if (num_4x4_blk <= 4) {  // second halves are zeroed, nothing is allocated
+    memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1]));
+    memset(&tree->vertical[1], 0, sizeof(tree->vertical[1]));
+    return;
+  }
+  alloc_mode_context(cm, half_blk, &tree->horizontal[1]);
+  alloc_mode_context(cm, half_blk, &tree->vertical[1]);
+}
+
+static void free_tree_contexts(PC_TREE *tree) {
+  int half;
+  // Order: none, then both horizontal halves, then both vertical halves.
+  free_mode_context(&tree->none);
+  for (half = 0; half < 2; ++half) free_mode_context(&tree->horizontal[half]);
+  for (half = 0; half < 2; ++half) free_mode_context(&tree->vertical[half]);
+}
+
+// This function sets up a tree of contexts such that at each square
+// partition level there are contexts for none, horizontal, vertical, and
+// split, along with a block_size value and a selected block_size which
+// represents the state of our search.
+void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
+  int i, j;
+  const int leaf_nodes = 64;  // number of 8x8 (square[0]) nodes
+  const int tree_nodes = 64 + 16 + 4 + 1;  // nodes at the four square levels
+  int pc_tree_index = 0;
+  PC_TREE *this_pc;  // next node to hand out as a split[] child
+  PICK_MODE_CONTEXT *this_leaf;  // next leaf_tree entry to hand out
+  int square_index = 1;  // index into square[]; 0 is used by the leaf pass
+  int nodes;
+  // Release any previous arrays (array storage only) and allocate new ones.
+  vpx_free(td->leaf_tree);
+  CHECK_MEM_ERROR(cm, td->leaf_tree, vpx_calloc(leaf_nodes,
+                                                sizeof(*td->leaf_tree)));
+  vpx_free(td->pc_tree);
+  CHECK_MEM_ERROR(cm, td->pc_tree, vpx_calloc(tree_nodes,
+                                              sizeof(*td->pc_tree)));
+
+  this_pc = &td->pc_tree[0];
+  this_leaf = &td->leaf_tree[0];
+
+  // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same
+  // context so we only need to allocate 1 for each 8x8 block.
+  for (i = 0; i < leaf_nodes; ++i)
+    alloc_mode_context(cm, 1, &td->leaf_tree[i]);
+
+  // Sets up all the leaf nodes in the tree.
+  for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
+    PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+    tree->block_size = square[0];
+    alloc_tree_contexts(cm, tree, 4);
+    tree->leaf_split[0] = this_leaf++;  // one leaf_tree entry per leaf node
+    for (j = 1; j < 4; j++)
+      tree->leaf_split[j] = tree->leaf_split[0];  // all four share it
+  }
+
+  // Each node has 4 leaf nodes, fill each block_size level of the tree
+  // from leafs to the root.
+  for (nodes = 16; nodes > 0; nodes >>= 2) {  // 16, 4, then 1 node per level
+    for (i = 0; i < nodes; ++i) {
+      PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+      alloc_tree_contexts(cm, tree, 4 << (2 * square_index));  // 16, 64, 256
+      tree->block_size = square[square_index];
+      for (j = 0; j < 4; j++)
+        tree->split[j] = this_pc++;  // children are consecutive earlier nodes
+      ++pc_tree_index;
+    }
+    ++square_index;
+  }
+  td->pc_root = &td->pc_tree[tree_nodes - 1];  // the last node filled in
+  td->pc_root[0].none.best_mode_index = 2;  // NOTE(review): why 2? confirm
+}
+
+// Frees td's mode contexts and both trees; a tree that is NULL is skipped.
+void vp9_free_pc_tree(ThreadData *td) {
+  const int tree_nodes = 64 + 16 + 4 + 1;
+  int i;
+  if (td->leaf_tree != NULL) {  // never allocated => nothing to walk
+    for (i = 0; i < 64; ++i)  // the 64 shared 4x4 mode contexts
+      free_mode_context(&td->leaf_tree[i]);
+    vpx_free(td->leaf_tree);
+    td->leaf_tree = NULL;
+  }
+  if (td->pc_tree != NULL) {
+    for (i = 0; i < tree_nodes; ++i)  // contexts of every tree node
+      free_tree_contexts(&td->pc_tree[i]);
+    vpx_free(td->pc_tree);
+    td->pc_tree = NULL;
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_context_tree.h b/libs/libvpx/vp9/encoder/vp9_context_tree.h
new file mode 100644
index 0000000000..86ba03d69f
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_context_tree.h
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_CONTEXT_TREE_H_
+#define VP9_ENCODER_VP9_CONTEXT_TREE_H_
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+struct VP9Common;
+struct ThreadData;
+
+// Structure to hold snapshot of coding context during the mode picking process
+typedef struct {
+  MODE_INFO mic;
+  MB_MODE_INFO_EXT mbmi_ext;
+  uint8_t *zcoeff_blk;  // one byte per 4x4 block (num_4x4_blk entries)
+  tran_low_t *coeff[MAX_MB_PLANE][3];  // indexed [plane][buffer set]
+  tran_low_t *qcoeff[MAX_MB_PLANE][3];
+  tran_low_t *dqcoeff[MAX_MB_PLANE][3];
+  uint16_t *eobs[MAX_MB_PLANE][3];
+
+  // dual buffer pointers, 0: in use, 1: best in store
+  tran_low_t *coeff_pbuf[MAX_MB_PLANE][3];
+  tran_low_t *qcoeff_pbuf[MAX_MB_PLANE][3];
+  tran_low_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
+  uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
+
+  int is_coded;
+  int num_4x4_blk;  // number of 4x4 blocks the buffers above were sized for
+  int skip;
+  int pred_pixel_ready;
+  // For current partition, only if all Y, U, and V transform blocks'
+  // coefficients are quantized to 0, skippable is set to 0.
+  int skippable;  // NOTE(review): confirm the polarity stated above
+  uint8_t skip_txfm[MAX_MB_PLANE << 2];
+  int best_mode_index;
+  int hybrid_pred_diff;
+  int comp_pred_diff;
+  int single_pred_diff;
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+
+  // TODO(jingning) Use RD_COST struct here instead. This involves a broader
+  // scope of refactoring.
+  int rate;
+  int64_t dist;
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  unsigned int newmv_sse;
+  unsigned int zeromv_sse;
+  unsigned int zeromv_lastref_sse;
+  PREDICTION_MODE best_sse_inter_mode;
+  int_mv best_sse_mv;
+  MV_REFERENCE_FRAME best_reference_frame;
+  MV_REFERENCE_FRAME best_zeromv_reference_frame;
+#endif
+
+  // motion vector cache for adaptive motion search control in partition
+  // search loop
+  MV pred_mv[MAX_REF_FRAMES];
+  INTERP_FILTER pred_interp_filter;
+} PICK_MODE_CONTEXT;
+
+typedef struct PC_TREE {
+  int index;
+  PARTITION_TYPE partitioning;
+  BLOCK_SIZE block_size;  // square block size of this node
+  PICK_MODE_CONTEXT none;  // context for the unsplit block
+  PICK_MODE_CONTEXT horizontal[2];  // contexts for the horizontal halves
+  PICK_MODE_CONTEXT vertical[2];  // contexts for the vertical halves
+  union {
+    struct PC_TREE *split[4];  // children, for nodes above the leaf level
+    PICK_MODE_CONTEXT *leaf_split[4];  // shared context, for leaf-level nodes
+  };
+} PC_TREE;
+
+void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td);
+void vp9_free_pc_tree(struct ThreadData *td);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */
diff --git a/libs/libvpx/vp9/encoder/vp9_cost.c b/libs/libvpx/vp9/encoder/vp9_cost.c
new file mode 100644
index 0000000000..c85f763221
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_cost.c
@@ -0,0 +1,66 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+
+#include "vp9/encoder/vp9_cost.h"
+
+/* Entry i (1..255) is round(-log2(i/256.) * (1 << VP9_PROB_COST_SHIFT)).
+   Entries 0 and 256 are bogus copies of their neighbours, kept for prob=0 in
+   the firstpass: https://code.google.com/p/webm/issues/detail?id=1089 */
+const uint16_t vp9_prob_cost[257] = {
+    4096, 4096, 3584, 3284, 3072, 2907, 2772, 2659, 2560, 2473, 2395, 2325,
+    2260, 2201, 2147, 2096, 2048, 2003, 1961, 1921, 1883, 1847, 1813, 1780,
+    1748, 1718, 1689, 1661, 1635, 1609, 1584, 1559, 1536, 1513, 1491, 1470,
+    1449, 1429, 1409, 1390, 1371, 1353, 1335, 1318, 1301, 1284, 1268, 1252,
+    1236, 1221, 1206, 1192, 1177, 1163, 1149, 1136, 1123, 1110, 1097, 1084,
+    1072, 1059, 1047, 1036, 1024, 1013, 1001, 990,  979,  968,  958,  947,
+    937,  927,  917,  907,  897,  887,  878,  868,  859,  850,  841,  832,
+    823,  814,  806,  797,  789,  780,  772,  764,  756,  748,  740,  732,
+    724,  717,  709,  702,  694,  687,  680,  673,  665,  658,  651,  644,
+    637,  631,  624,  617,  611,  604,  598,  591,  585,  578,  572,  566,
+    560,  554,  547,  541,  535,  530,  524,  518,  512,  506,  501,  495,
+    489,  484,  478,  473,  467,  462,  456,  451,  446,  441,  435,  430,
+    425,  420,  415,  410,  405,  400,  395,  390,  385,  380,  375,  371,
+    366,  361,  356,  352,  347,  343,  338,  333,  329,  324,  320,  316,
+    311,  307,  302,  298,  294,  289,  285,  281,  277,  273,  268,  264,
+    260,  256,  252,  248,  244,  240,  236,  232,  228,  224,  220,  216,
+    212,  209,  205,  201,  197,  194,  190,  186,  182,  179,  175,  171,
+    168,  164,  161,  157,  153,  150,  146,  143,  139,  136,  132,  129,
+    125,  122,  119,  115,  112,  109,  105,  102,  99,   95,   92,   89,
+    86,   82,   79,   76,   73,   70,   66,   63,   60,   57,   54,   51,
+    48,   45,   42,   38,   35,   32,   29,   26,   23,   20,   18,   15,
+    12,   9,    6,    3,     3};
+
+static void cost(int *costs, vpx_tree tree, const vpx_prob *probs,
+                 int i, int c) {
+  // Node i owns tree slots i (bit 0) and i + 1 (bit 1); both branches are
+  // priced with probs[i / 2] and added to the cost c accumulated so far.
+  const vpx_prob prob = probs[i / 2];
+  int bit = 0;
+  do {
+    const vpx_tree_index next = tree[i + bit];
+    const int total = c + vp9_cost_bit(prob, bit);
+    if (next > 0)
+      cost(costs, tree, probs, next, total);  // interior node: keep descending
+    else
+      costs[-next] = total;  // leaf: its index is stored negated
+  } while (++bit <= 1);
+}
+
+void vp9_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree) {
+  cost(costs, tree, probs, 0, 0);  // walk from node 0 with zero cost so far
+}
+
+void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree) {
+  // Node 0 must be a leaf on bit 0 and lead to an interior node on bit 1.
+  assert(tree[0] <= 0 && tree[1] > 0);
+  costs[-tree[0]] = vp9_cost_zero(probs[0]);
+  cost(costs, tree, probs, 2, 0);
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_cost.h b/libs/libvpx/vp9/encoder/vp9_cost.h
new file mode 100644
index 0000000000..9831013b18
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_cost.h
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_COST_H_
+#define VP9_ENCODER_VP9_COST_H_
+
+#include "vpx_dsp/prob.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const uint16_t vp9_prob_cost[257];
+
+// The factor to scale from cost in bits to cost in vp9_prob_cost units.
+#define VP9_PROB_COST_SHIFT 9
+
+#define vp9_cost_zero(prob) (vp9_prob_cost[prob])  // table lookup at prob
+// Looks up the complementary index, 256 - prob.
+#define vp9_cost_one(prob) vp9_cost_zero(256 - (prob))
+// Selects the vp9_cost_one lookup when bit is non-zero, else vp9_cost_zero.
+#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? 256 - (prob) \
+                                                    : (prob))
+
+static INLINE unsigned int cost_branch256(const unsigned int ct[2],
+                                          vpx_prob p) {  // sum of count * cost
+  return vp9_cost_one(p) * ct[1] + vp9_cost_zero(p) * ct[0];
+}
+
+static INLINE int treed_cost(vpx_tree tree, const vpx_prob *probs,
+                             int bits, int len) {
+  // Walk from node 0, consuming `bits` from bit len - 1 down to bit 0.
+  vpx_tree_index node = 0;
+  int total = 0;
+  for (;;) {
+    const int bit = (bits >> (len - 1)) & 1;
+    total += vp9_cost_bit(probs[node >> 1], bit);
+    node = tree[node + bit];
+    if (--len == 0) break;
+  }
+  return total;
+}
+
+void vp9_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree);
+void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_COST_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_dct.c b/libs/libvpx/vp9/encoder/vp9_dct.c
new file mode 100644
index 0000000000..f94540baa0
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_dct.c
@@ -0,0 +1,810 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/fwd_txfm.h"
+#include "vpx_ports/mem.h"
+
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
+  // Butterfly: the sums feed outputs 0 and 2, the differences 1 and 3.
+  const tran_high_t sum03 = input[0] + input[3];
+  const tran_high_t sum12 = input[1] + input[2];
+  const tran_high_t diff12 = input[1] - input[2];
+  const tran_high_t diff03 = input[0] - input[3];
+
+  const tran_high_t even0 = (sum03 + sum12) * cospi_16_64;
+  const tran_high_t even2 = (sum03 - sum12) * cospi_16_64;
+  const tran_high_t odd1 = diff12 * cospi_24_64 + diff03 * cospi_8_64;
+  const tran_high_t odd3 = -diff12 * cospi_8_64 + diff03 * cospi_24_64;
+
+  // Each product goes through fdct_round_shift before it is stored.
+  output[0] = (tran_low_t)fdct_round_shift(even0);
+  output[1] = (tran_low_t)fdct_round_shift(odd1);
+  output[2] = (tran_low_t)fdct_round_shift(even2);
+  output[3] = (tran_low_t)fdct_round_shift(odd3);
+}
+
+static void fdct8(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+  tran_high_t t0, t1, t2, t3;                  // needs32
+  tran_high_t x0, x1, x2, x3;                  // canbe16
+
+  // stage 1: sums (s0..s3) and differences (s4..s7) of mirrored input pairs
+  s0 = input[0] + input[7];
+  s1 = input[1] + input[6];
+  s2 = input[2] + input[5];
+  s3 = input[3] + input[4];
+  s4 = input[3] - input[4];
+  s5 = input[2] - input[5];
+  s6 = input[1] - input[6];
+  s7 = input[0] - input[7];
+
+  // Inlined 4-point DCT of the sums s0..s3; it yields the even outputs.
+  x0 = s0 + s3;
+  x1 = s1 + s2;
+  x2 = s1 - s2;
+  x3 = s0 - s3;
+  t0 = (x0 + x1) * cospi_16_64;
+  t1 = (x0 - x1) * cospi_16_64;
+  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+  output[0] = (tran_low_t)fdct_round_shift(t0);
+  output[2] = (tran_low_t)fdct_round_shift(t2);
+  output[4] = (tran_low_t)fdct_round_shift(t1);
+  output[6] = (tran_low_t)fdct_round_shift(t3);
+
+  // Stage 2: rotate the middle differences s5, s6 (t0..t3 are reused here)
+  t0 = (s6 - s5) * cospi_16_64;
+  t1 = (s6 + s5) * cospi_16_64;
+  t2 = (tran_low_t)fdct_round_shift(t0);
+  t3 = (tran_low_t)fdct_round_shift(t1);
+
+  // Stage 3: combine the outer differences s4, s7 with the rotated pair
+  x0 = s4 + t2;
+  x1 = s4 - t2;
+  x2 = s7 - t3;
+  x3 = s7 + t3;
+
+  // Stage 4: final rotations producing the odd outputs
+  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+  output[1] = (tran_low_t)fdct_round_shift(t0);
+  output[3] = (tran_low_t)fdct_round_shift(t2);
+  output[5] = (tran_low_t)fdct_round_shift(t1);
+  output[7] = (tran_low_t)fdct_round_shift(t3);
+}
+
+static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
+  tran_high_t step1[8];      // canbe16
+  tran_high_t step2[8];      // canbe16
+  tran_high_t step3[8];      // canbe16
+  tran_high_t input[8];      // canbe16
+  tran_high_t temp1, temp2;  // needs32
+
+  // step 1: input[] holds the sums of mirrored pairs, step1[] the differences
+  input[0] = in[0] + in[15];
+  input[1] = in[1] + in[14];
+  input[2] = in[2] + in[13];
+  input[3] = in[3] + in[12];
+  input[4] = in[4] + in[11];
+  input[5] = in[5] + in[10];
+  input[6] = in[6] + in[ 9];
+  input[7] = in[7] + in[ 8];
+
+  step1[0] = in[7] - in[ 8];
+  step1[1] = in[6] - in[ 9];
+  step1[2] = in[5] - in[10];
+  step1[3] = in[4] - in[11];
+  step1[4] = in[3] - in[12];
+  step1[5] = in[2] - in[13];
+  step1[6] = in[1] - in[14];
+  step1[7] = in[0] - in[15];
+
+  // Inlined 8-point DCT of input[]; it yields the even-indexed outputs.
+  {
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
+
+    // stage 1
+    s0 = input[0] + input[7];
+    s1 = input[1] + input[6];
+    s2 = input[2] + input[5];
+    s3 = input[3] + input[4];
+    s4 = input[3] - input[4];
+    s5 = input[2] - input[5];
+    s6 = input[1] - input[6];
+    s7 = input[0] - input[7];
+
+    // Inlined 4-point DCT of s0..s3; it yields out[0], out[4], out[8], out[12].
+    x0 = s0 + s3;
+    x1 = s1 + s2;
+    x2 = s1 - s2;
+    x3 = s0 - s3;
+    t0 = (x0 + x1) * cospi_16_64;
+    t1 = (x0 - x1) * cospi_16_64;
+    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
+    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+    out[0] = (tran_low_t)fdct_round_shift(t0);
+    out[4] = (tran_low_t)fdct_round_shift(t2);
+    out[8] = (tran_low_t)fdct_round_shift(t1);
+    out[12] = (tran_low_t)fdct_round_shift(t3);
+
+    // Stage 2
+    t0 = (s6 - s5) * cospi_16_64;
+    t1 = (s6 + s5) * cospi_16_64;
+    t2 = fdct_round_shift(t0);
+    t3 = fdct_round_shift(t1);
+
+    // Stage 3
+    x0 = s4 + t2;
+    x1 = s4 - t2;
+    x2 = s7 - t3;
+    x3 = s7 + t3;
+
+    // Stage 4
+    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+    out[2] = (tran_low_t)fdct_round_shift(t0);
+    out[6] = (tran_low_t)fdct_round_shift(t2);
+    out[10] = (tran_low_t)fdct_round_shift(t1);
+    out[14] = (tran_low_t)fdct_round_shift(t3);
+  }
+
+  // step 2: the remaining steps turn step1[] into the odd-indexed outputs
+  temp1 = (step1[5] - step1[2]) * cospi_16_64;
+  temp2 = (step1[4] - step1[3]) * cospi_16_64;
+  step2[2] = fdct_round_shift(temp1);
+  step2[3] = fdct_round_shift(temp2);
+  temp1 = (step1[4] + step1[3]) * cospi_16_64;
+  temp2 = (step1[5] + step1[2]) * cospi_16_64;
+  step2[4] = fdct_round_shift(temp1);
+  step2[5] = fdct_round_shift(temp2);
+
+  // step 3
+  step3[0] = step1[0] + step2[3];
+  step3[1] = step1[1] + step2[2];
+  step3[2] = step1[1] - step2[2];
+  step3[3] = step1[0] - step2[3];
+  step3[4] = step1[7] - step2[4];
+  step3[5] = step1[6] - step2[5];
+  step3[6] = step1[6] + step2[5];
+  step3[7] = step1[7] + step2[4];
+
+  // step 4: step2[1], [2], [5], [6] are overwritten with new rotations
+  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
+  temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
+  step2[1] = fdct_round_shift(temp1);
+  step2[2] = fdct_round_shift(temp2);
+  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
+  step2[5] = fdct_round_shift(temp1);
+  step2[6] = fdct_round_shift(temp2);
+
+  // step 5: step1[] is reused for the final butterfly
+  step1[0] = step3[0] + step2[1];
+  step1[1] = step3[0] - step2[1];
+  step1[2] = step3[3] + step2[2];
+  step1[3] = step3[3] - step2[2];
+  step1[4] = step3[4] - step2[5];
+  step1[5] = step3[4] + step2[5];
+  step1[6] = step3[7] - step2[6];
+  step1[7] = step3[7] + step2[6];
+
+  // step 6: final rotations, written to the odd-indexed outputs
+  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
+  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+  out[1] = (tran_low_t)fdct_round_shift(temp1);
+  out[9] = (tran_low_t)fdct_round_shift(temp2);
+
+  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
+  out[5] = (tran_low_t)fdct_round_shift(temp1);
+  out[13] = (tran_low_t)fdct_round_shift(temp2);
+
+  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
+  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+  out[3] = (tran_low_t)fdct_round_shift(temp1);
+  out[11] = (tran_low_t)fdct_round_shift(temp2);
+
+  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
+  out[7] = (tran_low_t)fdct_round_shift(temp1);
+  out[15] = (tran_low_t)fdct_round_shift(temp2);
+}
+
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t x0, x1, x2, x3;
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+  // Copy the four inputs into tran_high_t temporaries.
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+  // All inputs zero: write zeros and skip the arithmetic.
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+  // Products of the inputs with the sinpi_k_9 constants; s7 is a plain sum.
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_4_9 * x0;
+  s2 = sinpi_2_9 * x1;
+  s3 = sinpi_1_9 * x1;
+  s4 = sinpi_3_9 * x2;
+  s5 = sinpi_4_9 * x3;
+  s6 = sinpi_2_9 * x3;
+  s7 = x0 + x1 - x3;
+  // x0..x3 are reused for the combined terms.
+  x0 = s0 + s2 + s5;
+  x1 = sinpi_3_9 * s7;
+  x2 = s1 - s3 + s6;
+  x3 = s4;
+  // s0..s3 are reused for the four unscaled outputs.
+  s0 = x0 + x3;
+  s1 = x1;
+  s2 = x2 - x3;
+  s3 = x2 - x0 + x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  output[0] = (tran_low_t)fdct_round_shift(s0);
+  output[1] = (tran_low_t)fdct_round_shift(s1);
+  output[2] = (tran_low_t)fdct_round_shift(s2);
+  output[3] = (tran_low_t)fdct_round_shift(s3);
+}
+
+static void fadst8(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+  // Inputs are loaded in the permuted order 7, 0, 5, 2, 3, 4, 1, 6.
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
+
+  // stage 1: rotate the pairs (x0,x1), (x2,x3), (x4,x5), (x6,x7)
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+  // Butterfly s0..s3 against s4..s7, with fdct_round_shift on each result.
+  x0 = fdct_round_shift(s0 + s4);
+  x1 = fdct_round_shift(s1 + s5);
+  x2 = fdct_round_shift(s2 + s6);
+  x3 = fdct_round_shift(s3 + s7);
+  x4 = fdct_round_shift(s0 - s4);
+  x5 = fdct_round_shift(s1 - s5);
+  x6 = fdct_round_shift(s2 - s6);
+  x7 = fdct_round_shift(s3 - s7);
+
+  // stage 2: x0..x3 pass through unchanged, x4..x7 are rotated
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
+  // Only the rotated half (x4..x7) needs fdct_round_shift here.
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = fdct_round_shift(s4 + s6);
+  x5 = fdct_round_shift(s5 + s7);
+  x6 = fdct_round_shift(s4 - s6);
+  x7 = fdct_round_shift(s5 - s7);
+
+  // stage 3: scale the sums/differences of (x2,x3) and (x6,x7) by cospi_16_64
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = fdct_round_shift(s2);
+  x3 = fdct_round_shift(s3);
+  x6 = fdct_round_shift(s6);
+  x7 = fdct_round_shift(s7);
+  // Outputs are written in permuted order; odd-indexed ones are negated.
+  output[0] = (tran_low_t)x0;
+  output[1] = (tran_low_t)-x4;
+  output[2] = (tran_low_t)x6;
+  output[3] = (tran_low_t)-x2;
+  output[4] = (tran_low_t)x3;
+  output[5] = (tran_low_t)-x7;
+  output[6] = (tran_low_t)x5;
+  output[7] = (tran_low_t)-x1;
+}
+
+static void fadst16(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
+
+  // stage 1
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = fdct_round_shift(s0 + s8);
+  x1 = fdct_round_shift(s1 + s9);
+  x2 = fdct_round_shift(s2 + s10);
+  x3 = fdct_round_shift(s3 + s11);
+  x4 = fdct_round_shift(s4 + s12);
+  x5 = fdct_round_shift(s5 + s13);
+  x6 = fdct_round_shift(s6 + s14);
+  x7 = fdct_round_shift(s7 + s15);
+  x8  = fdct_round_shift(s0 - s8);
+  x9  = fdct_round_shift(s1 - s9);
+  x10 = fdct_round_shift(s2 - s10);
+  x11 = fdct_round_shift(s3 - s11);
+  x12 = fdct_round_shift(s4 - s12);
+  x13 = fdct_round_shift(s5 - s13);
+  x14 = fdct_round_shift(s6 - s14);
+  x15 = fdct_round_shift(s7 - s15);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = s0 + s4;
+  x1 = s1 + s5;
+  x2 = s2 + s6;
+  x3 = s3 + s7;
+  x4 = s0 - s4;
+  x5 = s1 - s5;
+  x6 = s2 - s6;
+  x7 = s3 - s7;
+  x8 = fdct_round_shift(s8 + s12);
+  x9 = fdct_round_shift(s9 + s13);
+  x10 = fdct_round_shift(s10 + s14);
+  x11 = fdct_round_shift(s11 + s15);
+  x12 = fdct_round_shift(s8 - s12);
+  x13 = fdct_round_shift(s9 - s13);
+  x14 = fdct_round_shift(s10 - s14);
+  x15 = fdct_round_shift(s11 - s15);
+
+  // stage 3
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = fdct_round_shift(s4 + s6);
+  x5 = fdct_round_shift(s5 + s7);
+  x6 = fdct_round_shift(s4 - s6);
+  x7 = fdct_round_shift(s5 - s7);
+  x8 = s8 + s10;
+  x9 = s9 + s11;
+  x10 = s8 - s10;
+  x11 = s9 - s11;
+  x12 = fdct_round_shift(s12 + s14);
+  x13 = fdct_round_shift(s13 + s15);
+  x14 = fdct_round_shift(s12 - s14);
+  x15 = fdct_round_shift(s13 - s15);
+
+  // stage 4
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = fdct_round_shift(s2);
+  x3 = fdct_round_shift(s3);
+  x6 = fdct_round_shift(s6);
+  x7 = fdct_round_shift(s7);
+  x10 = fdct_round_shift(s10);
+  x11 = fdct_round_shift(s11);
+  x14 = fdct_round_shift(s14);
+  x15 = fdct_round_shift(s15);
+
+  output[0] = (tran_low_t)x0;
+  output[1] = (tran_low_t)-x8;
+  output[2] = (tran_low_t)x12;
+  output[3] = (tran_low_t)-x4;
+  output[4] = (tran_low_t)x6;
+  output[5] = (tran_low_t)x14;
+  output[6] = (tran_low_t)x10;
+  output[7] = (tran_low_t)x2;
+  output[8] = (tran_low_t)x3;
+  output[9] = (tran_low_t)x11;
+  output[10] = (tran_low_t)x15;
+  output[11] = (tran_low_t)x7;
+  output[12] = (tran_low_t)x5;
+  output[13] = (tran_low_t)-x13;
+  output[14] = (tran_low_t)x9;
+  output[15] = (tran_low_t)-x1;
+}
+
+// 4-point 1-D transform pairs for the hybrid transform, indexed by tx_type.
+// vp9_fht4x4_c() applies the entry's .cols kernel to every column and then
+// its .rows kernel to every row.
+static const transform_2d FHT_4[] = {
+  { fdct4,  fdct4  },  // DCT_DCT  = 0
+  { fadst4, fdct4  },  // ADST_DCT = 1
+  { fdct4,  fadst4 },  // DCT_ADST = 2
+  { fadst4, fadst4 }   // ADST_ADST = 3
+};
+
+// 8-point 1-D transform pairs for the hybrid transform, indexed by tx_type.
+// vp9_fht8x8_c() applies the entry's .cols kernel to every column and then
+// its .rows kernel to every row.
+static const transform_2d FHT_8[] = {
+  { fdct8,  fdct8  },  // DCT_DCT  = 0
+  { fadst8, fdct8  },  // ADST_DCT = 1
+  { fdct8,  fadst8 },  // DCT_ADST = 2
+  { fadst8, fadst8 }   // ADST_ADST = 3
+};
+
+// 16-point 1-D transform pairs for the hybrid transform, indexed by tx_type.
+// vp9_fht16x16_c() applies the entry's .cols kernel to every column and then
+// its .rows kernel to every row.
+static const transform_2d FHT_16[] = {
+  { fdct16,  fdct16  },  // DCT_DCT  = 0
+  { fadst16, fdct16  },  // ADST_DCT = 1
+  { fdct16,  fadst16 },  // DCT_ADST = 2
+  { fadst16, fadst16 }   // ADST_ADST = 3
+};
+
+// Forward 4x4 hybrid transform (C reference).
+//   input   residual block, read as input[row * stride + col]
+//   output  16 coefficients, written row-major
+//   tx_type selects the column/row kernel pair from FHT_4; DCT_DCT is
+//           delegated to vpx_fdct4x4_c().
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
+                  int stride, int tx_type) {
+  tran_low_t col_pass[4 * 4];
+  tran_low_t vec_in[4], vec_out[4];
+  const transform_2d *kernels;
+  int r, c;
+
+  if (tx_type == DCT_DCT) {
+    vpx_fdct4x4_c(input, output, stride);
+    return;
+  }
+  kernels = &FHT_4[tx_type];
+
+  // Pass 1: transform each column of the (x16 scaled) input.
+  for (c = 0; c < 4; ++c) {
+    for (r = 0; r < 4; ++r)
+      vec_in[r] = input[r * stride + c] * 16;
+    // A non-zero top-left sample gets a +1 bias before the transform.
+    if (c == 0 && vec_in[0] != 0)
+      vec_in[0] += 1;
+    kernels->cols(vec_in, vec_out);
+    for (r = 0; r < 4; ++r)
+      col_pass[r * 4 + c] = vec_out[r];
+  }
+
+  // Pass 2: transform each row, then round and drop two bits.
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < 4; ++c)
+      vec_in[c] = col_pass[r * 4 + c];
+    kernels->rows(vec_in, vec_out);
+    for (c = 0; c < 4; ++c)
+      output[r * 4 + c] = (vec_out[c] + 1) >> 2;
+  }
+}
+
+// Forward 8x8 DCT fused with quantization (C reference).
+//
+// The transform result is always written to coeff_ptr. qcoeff_ptr and
+// dqcoeff_ptr are zeroed for n_coeffs entries and only filled in when
+// skip_block is zero. *eob_ptr receives the scan position of the last
+// non-zero quantized coefficient plus one (0 when there is none).
+// zbin_ptr, quant_shift_ptr and iscan are accepted but not used.
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
+                         tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         int skip_block,
+                         const int16_t *zbin_ptr, const int16_t *round_ptr,
+                         const int16_t *quant_ptr,
+                         const int16_t *quant_shift_ptr,
+                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                         const int16_t *dequant_ptr,
+                         uint16_t *eob_ptr,
+                         const int16_t *scan, const int16_t *iscan) {
+  // -1 so that "eob + 1" below yields 0 when nothing is non-zero.
+  int eob = -1;
+
+  int i, j;
+  tran_low_t intermediate[64];
+
+  // Transform columns
+  {
+    tran_low_t *output = intermediate;
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
+
+    // NOTE: this declaration shadows the function-level 'i' inside this scope.
+    int i;
+    for (i = 0; i < 8; i++) {
+      // stage 1
+      // Butterfly of mirrored samples in the column; inputs are scaled by 4.
+      s0 = (input[0 * stride] + input[7 * stride]) * 4;
+      s1 = (input[1 * stride] + input[6 * stride]) * 4;
+      s2 = (input[2 * stride] + input[5 * stride]) * 4;
+      s3 = (input[3 * stride] + input[4 * stride]) * 4;
+      s4 = (input[3 * stride] - input[4 * stride]) * 4;
+      s5 = (input[2 * stride] - input[5 * stride]) * 4;
+      s6 = (input[1 * stride] - input[6 * stride]) * 4;
+      s7 = (input[0 * stride] - input[7 * stride]) * 4;
+
+      // fdct4(step, step);
+      // Even-indexed outputs (rows 0, 2, 4, 6 of the intermediate block).
+      x0 = s0 + s3;
+      x1 = s1 + s2;
+      x2 = s1 - s2;
+      x3 = s0 - s3;
+      t0 = (x0 + x1) * cospi_16_64;
+      t1 = (x0 - x1) * cospi_16_64;
+      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+      output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
+      output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
+      output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
+      output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
+
+      // Stage 2
+      t0 = (s6 - s5) * cospi_16_64;
+      t1 = (s6 + s5) * cospi_16_64;
+      t2 = fdct_round_shift(t0);
+      t3 = fdct_round_shift(t1);
+
+      // Stage 3
+      x0 = s4 + t2;
+      x1 = s4 - t2;
+      x2 = s7 - t3;
+      x3 = s7 + t3;
+
+      // Stage 4
+      // Odd-indexed outputs (rows 1, 3, 5, 7 of the intermediate block).
+      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+      output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
+      output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
+      output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
+      output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
+      // Advance to the next input column / intermediate column.
+      input++;
+      output++;
+    }
+  }
+
+  // Rows
+  // Each intermediate row goes through fdct8(); the result is then halved.
+  for (i = 0; i < 8; ++i) {
+    fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
+    for (j = 0; j < 8; ++j)
+      coeff_ptr[j + i * 8] /= 2;
+  }
+
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Quantization pass: visit every coefficient in scan order. Entry 0 of
+    // round_ptr/quant_ptr/dequant_ptr is used for position rc == 0 and
+    // entry 1 for every other position.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      // Sign mask (all ones for negative values); presumably relies on a
+      // 32-bit int with arithmetic right shift -- TODO confirm.
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+
+      // Re-apply the sign to the quantized magnitude.
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+      // Remember the last scan position holding a non-zero level.
+      if (tmp)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+// Forward 8x8 hybrid transform (C reference).
+//   input   residual block, read as input[row * stride + col]
+//   output  64 coefficients, written row-major
+//   tx_type selects the column/row kernel pair from FHT_8; DCT_DCT is
+//           delegated to vpx_fdct8x8_c().
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
+                  int stride, int tx_type) {
+  tran_low_t col_pass[64];
+  tran_low_t vec_in[8], vec_out[8];
+  const transform_2d *kernels;
+  int r, c;
+
+  if (tx_type == DCT_DCT) {
+    vpx_fdct8x8_c(input, output, stride);
+    return;
+  }
+  kernels = &FHT_8[tx_type];
+
+  // Pass 1: transform each column of the (x4 scaled) input.
+  for (c = 0; c < 8; ++c) {
+    for (r = 0; r < 8; ++r)
+      vec_in[r] = input[r * stride + c] * 4;
+    kernels->cols(vec_in, vec_out);
+    for (r = 0; r < 8; ++r)
+      col_pass[r * 8 + c] = vec_out[r];
+  }
+
+  // Pass 2: transform each row, then halve with a +1 bias on negatives.
+  for (r = 0; r < 8; ++r) {
+    for (c = 0; c < 8; ++c)
+      vec_in[c] = col_pass[r * 8 + c];
+    kernels->rows(vec_in, vec_out);
+    for (c = 0; c < 8; ++c) {
+      const tran_low_t coeff = vec_out[c];
+      output[r * 8 + c] = (coeff + (coeff < 0)) >> 1;
+    }
+  }
+}
+
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+   pixel.
+
+   Forward 4x4 Walsh-Hadamard transform (C reference).
+     input   residual block, read as input[row * stride + col]
+     output  16 coefficients; also used as scratch between the two passes
+     stride  distance, in samples, between vertically adjacent input samples */
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  int i;
+  tran_high_t a1, b1, c1, d1, e1;
+  const int16_t *ip_pass0 = input;
+  const tran_low_t *ip = NULL;
+  tran_low_t *op = output;
+
+  // Pass 1: transform each input column and store it as a column of output.
+  for (i = 0; i < 4; i++) {
+    a1 = ip_pass0[0 * stride];
+    b1 = ip_pass0[1 * stride];
+    c1 = ip_pass0[2 * stride];
+    d1 = ip_pass0[3 * stride];
+
+    // Each step below consumes the values produced by the steps before it,
+    // so the statement order matters.
+    a1 += b1;
+    d1 = d1 - c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    // Results are stored in the order a, c, d, b.
+    op[0] = (tran_low_t)a1;
+    op[4] = (tran_low_t)c1;
+    op[8] = (tran_low_t)d1;
+    op[12] = (tran_low_t)b1;
+
+    ip_pass0++;
+    op++;
+  }
+  // Pass 2 runs in place: it reads and writes the same row of output.
+  ip = output;
+  op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0];
+    b1 = ip[1];
+    c1 = ip[2];
+    d1 = ip[3];
+
+    a1 += b1;
+    d1 -= c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    // Final coefficients are scaled by UNIT_QUANT_FACTOR.
+    op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+    op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+    op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+    op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
+
+    ip += 4;
+    op += 4;
+  }
+}
+
+// Forward 16x16 hybrid transform (C reference).
+//   input   residual block, read as input[row * stride + col]
+//   output  256 coefficients, written row-major
+//   tx_type selects the column/row kernel pair from FHT_16; DCT_DCT is
+//           delegated to vpx_fdct16x16_c().
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
+                    int stride, int tx_type) {
+  tran_low_t col_pass[256];
+  tran_low_t vec_in[16], vec_out[16];
+  const transform_2d *kernels;
+  int r, c;
+
+  if (tx_type == DCT_DCT) {
+    vpx_fdct16x16_c(input, output, stride);
+    return;
+  }
+  kernels = &FHT_16[tx_type];
+
+  // Pass 1: transform each column of the (x4 scaled) input; the result is
+  // rounded down by two bits before it is stored.
+  for (c = 0; c < 16; ++c) {
+    for (r = 0; r < 16; ++r)
+      vec_in[r] = input[r * stride + c] * 4;
+    kernels->cols(vec_in, vec_out);
+    for (r = 0; r < 16; ++r) {
+      const tran_low_t coeff = vec_out[r];
+      col_pass[r * 16 + c] = (coeff + 1 + (coeff < 0)) >> 2;
+    }
+  }
+
+  // Pass 2: transform each row; the output is stored unscaled.
+  for (r = 0; r < 16; ++r) {
+    for (c = 0; c < 16; ++c)
+      vec_in[c] = col_pass[r * 16 + c];
+    kernels->rows(vec_in, vec_out);
+    for (c = 0; c < 16; ++c)
+      output[r * 16 + c] = vec_out[c];
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// High-bitdepth entry points. Each one simply forwards its arguments to the
+// corresponding non-highbd C function defined above.
+
+// Forwards to vp9_fht4x4_c().
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output,
+                         int stride, int tx_type) {
+  vp9_fht4x4_c(input, output, stride, tx_type);
+}
+
+// Forwards to vp9_fht8x8_c().
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output,
+                         int stride, int tx_type) {
+  vp9_fht8x8_c(input, output, stride, tx_type);
+}
+
+// Forwards to vp9_fwht4x4_c().
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  vp9_fwht4x4_c(input, output, stride);
+}
+
+// Forwards to vp9_fht16x16_c().
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output,
+                           int stride, int tx_type) {
+  vp9_fht16x16_c(input, output, stride, tx_type);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vp9/encoder/vp9_denoiser.c b/libs/libvpx/vp9/encoder/vp9_denoiser.c
new file mode 100644
index 0000000000..e419cffd8f
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_denoiser.c
@@ -0,0 +1,574 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+#include "vp9/encoder/vp9_encoder.h"
+
+/* The VP9 denoiser is similar to the VP8 denoiser. While choosing the
+ * motion vectors / reference frames, the denoiser is run, and if it did
+ * not modify the signal too much, the denoised block is copied to the
+ * signal.
+ */
+
+#ifdef OUTPUT_YUV_DENOISED
+// Debug helper (only built with OUTPUT_YUV_DENOISED); defined at the end of
+// this file.
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
+#endif
+
+// Largest per-pixel |mc_avg - sig| for which the strong filter takes the
+// motion-compensated value as is: 3, or 4 when increase_denoising is set.
+// bs is currently ignored.
+static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  (void)bs;
+  if (increase_denoising)
+    return 4;
+  return 3;
+}
+
+// Delta at or above which vp9_denoiser_filter_c() gives up on the weaker
+// filter and returns COPY_BLOCK. Currently a constant; both arguments are
+// ignored.
+static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  (void)bs;
+  (void)increase_denoising;
+  return 4;
+}
+
+// Motion-magnitude threshold used by sse_diff_thresh() and, shifted left by
+// 3, by perform_motion_compensation(). Currently a constant; both arguments
+// are ignored.
+static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  (void)bs;
+  (void)increase_denoising;
+  return 625;
+}
+
+// SSE above which perform_motion_compensation() refuses to filter the block:
+// (1 << num_pels_log2_lookup[bs]) times 40, or times 60 when
+// increase_denoising is set.
+static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  const int pel_count = 1 << num_pels_log2_lookup[bs];
+  if (increase_denoising)
+    return pel_count * 60;
+  return pel_count * 40;
+}
+
+// Minimum (zeromv_sse - newmv_sse) needed before the non-zero motion vector
+// is preferred. Returns 0 once motion_magnitude exceeds
+// noise_motion_thresh(); otherwise (1 << num_pels_log2_lookup[bs]) * 20.
+static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
+                           int motion_magnitude) {
+  const int motion_limit = noise_motion_thresh(bs, increase_denoising);
+  if (motion_magnitude <= motion_limit)
+    return (1 << num_pels_log2_lookup[bs]) * 20;
+  return 0;
+}
+
+// Largest |total_adj| accepted after the dampened (second) filter pass:
+// (1 << num_pels_log2_lookup[bs]) times 2, or times 3 when
+// increase_denoising is set.
+static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
+  const int per_pel = increase_denoising ? 3 : 2;
+  const int pel_count = 1 << num_pels_log2_lookup[bs];
+  return pel_count * per_pel;
+}
+
+// TODO(jackychen): If increase_denoising is enabled in the future,
+// we might need to update the code for calculating 'total_adj' in
+// case the C code is not bit-exact with corresponding sse2 code.
+//
+// Temporal denoising filter for one block (C reference).
+//   sig / sig_stride        source block (read only)
+//   mc_avg / mc_avg_stride  motion-compensated running average (read only)
+//   avg / avg_stride        destination for the filtered block
+//   increase_denoising      non-zero selects the more aggressive thresholds
+//   bs                      block size; the loops cover
+//                           (4 << b_height_log2_lookup[bs]) rows and
+//                           (4 << b_width_log2_lookup[bs]) columns
+//   motion_magnitude        compared against MOTION_MAGNITUDE_THRESHOLD
+// Returns FILTER_BLOCK when avg holds an acceptable result, COPY_BLOCK
+// otherwise (avg may already have been written in that case).
+int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride,
+                          const uint8_t *mc_avg,
+                          int mc_avg_stride,
+                          uint8_t *avg, int avg_stride,
+                          int increase_denoising,
+                          BLOCK_SIZE bs,
+                          int motion_magnitude) {
+  int r, c;
+  // Start pointers are kept so the second pass can rewind to the block top.
+  const uint8_t *sig_start = sig;
+  const uint8_t *mc_avg_start = mc_avg;
+  uint8_t *avg_start = avg;
+  int diff, adj, absdiff, delta;
+  // Per-pixel adjustment for |diff| in 4..7, 8..15 and 16+ respectively.
+  int adj_val[] = {3, 4, 6};
+  // Signed sum of all per-pixel adjustments applied so far.
+  int total_adj = 0;
+  int shift_inc = 1;
+
+  // If motion_magnitude is small, making the denoiser more aggressive by
+  // increasing the adjustment for each level. Add another increment for
+  // blocks that are labeled for increase denoising.
+  if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+    if (increase_denoising) {
+      shift_inc = 2;
+    }
+    adj_val[0] += shift_inc;
+    adj_val[1] += shift_inc;
+    adj_val[2] += shift_inc;
+  }
+
+  // First attempt to apply a strong temporal denoising filter.
+  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+    for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) {
+      diff = mc_avg[c] - sig[c];
+      absdiff = abs(diff);
+
+      if (absdiff <= absdiff_thresh(bs, increase_denoising)) {
+        // Small difference: take the motion-compensated value unchanged.
+        avg[c] = mc_avg[c];
+        total_adj += diff;
+      } else {
+        // absdiff is above the threshold (3 or 4) here, so when the
+        // threshold is 4 the "case 4" label is never taken.
+        switch (absdiff) {
+          case 4: case 5: case 6: case 7:
+            adj = adj_val[0];
+            break;
+          case 8: case 9: case 10: case 11:
+          case 12: case 13: case 14: case 15:
+            adj = adj_val[1];
+            break;
+          default:
+            adj = adj_val[2];
+        }
+        // Move the source pixel towards mc_avg by adj, saturating to 0..255.
+        if (diff > 0) {
+          avg[c] = VPXMIN(UINT8_MAX, sig[c] + adj);
+          total_adj += adj;
+        } else {
+          avg[c] = VPXMAX(0, sig[c] - adj);
+          total_adj -= adj;
+        }
+      }
+    }
+    sig += sig_stride;
+    avg += avg_stride;
+    mc_avg += mc_avg_stride;
+  }
+
+  // If the strong filter did not modify the signal too much, we're all set.
+  if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) {
+    return FILTER_BLOCK;
+  }
+
+  // Otherwise, we try to dampen the filter if the delta is not too high.
+  delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising))
+           >> num_pels_log2_lookup[bs]) + 1;
+
+  if (delta >= delta_thresh(bs, increase_denoising)) {
+    return COPY_BLOCK;
+  }
+
+  // Second pass: rewind to the top of the block and pull every pixel of avg
+  // back by at most delta, in the direction opposite to the first pass.
+  mc_avg =  mc_avg_start;
+  avg = avg_start;
+  sig = sig_start;
+  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+    for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) {
+      diff = mc_avg[c] - sig[c];
+      adj = abs(diff);
+      if (adj > delta) {
+        adj = delta;
+      }
+      if (diff > 0) {
+        // Diff positive means we made positive adjustment above
+        // (in first try/attempt), so now make negative adjustment to bring
+        // denoised signal down.
+        avg[c] = VPXMAX(0, avg[c] - adj);
+        total_adj -= adj;
+      } else {
+        // Diff negative means we made negative adjustment above
+        // (in first try/attempt), so now make positive adjustment to bring
+        // denoised signal up.
+        avg[c] = VPXMIN(UINT8_MAX, avg[c] + adj);
+        total_adj += adj;
+      }
+    }
+    sig += sig_stride;
+    avg += avg_stride;
+    mc_avg += mc_avg_stride;
+  }
+
+  // We can use the filter if it has been sufficiently dampened
+  if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) {
+    return FILTER_BLOCK;
+  }
+  return COPY_BLOCK;
+}
+
+// Returns the address inside framebuf of the sample at row mi_row * 8 and
+// column mi_col * 8, for a plane with the given stride.
+static uint8_t *block_start(uint8_t *framebuf, int stride,
+                            int mi_row, int mi_col) {
+  const int row_offset = stride * mi_row * 8;
+  const int col_offset = mi_col * 8;
+  return framebuf + row_offset + col_offset;
+}
+
+// Builds the motion-compensated prediction for one block from the denoiser's
+// running-average buffers into denoiser->mc_running_avg_y.
+//
+// Chooses between the best non-zero motion vector recorded in ctx and a
+// zero-motion reference, temporarily redirects the MACROBLOCKD pre/dst
+// buffers of mb to the denoiser's buffers, runs the inter predictor and then
+// restores the mode info and buffers.
+//
+// Side effects on ctx: on the zero-motion path ctx->newmv_sse,
+// ctx->best_sse_inter_mode and ctx->best_sse_mv are overwritten, and
+// *zeromv_filter is set to 1.
+//
+// Returns COPY_BLOCK when the block should not be filtered (skin block with
+// motion, SSE too high, or motion too large), FILTER_BLOCK otherwise.
+static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
+                                                         MACROBLOCK *mb,
+                                                         BLOCK_SIZE bs,
+                                                         int increase_denoising,
+                                                         int mi_row,
+                                                         int mi_col,
+                                                         PICK_MODE_CONTEXT *ctx,
+                                                         int motion_magnitude,
+                                                         int is_skin,
+                                                         int *zeromv_filter) {
+  int mv_col, mv_row;
+  int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
+  MV_REFERENCE_FRAME frame;
+  MACROBLOCKD *filter_mbd = &mb->e_mbd;
+  MODE_INFO *mi = filter_mbd->mi[0];
+  MODE_INFO saved_mi;
+  int i, j;
+  struct buf_2d saved_dst[MAX_MB_PLANE];
+  struct buf_2d saved_pre[MAX_MB_PLANE][2];  // 2 pre buffers
+
+  mv_col = ctx->best_sse_mv.as_mv.col;
+  mv_row = ctx->best_sse_mv.as_mv.row;
+  frame = ctx->best_reference_frame;
+
+  // Snapshot of the mode info so every exit path can restore it.
+  saved_mi = *mi;
+
+  // Nothing has been modified yet, so no restore is needed on this path.
+  if (is_skin && motion_magnitude > 0)
+    return COPY_BLOCK;
+
+  // If the best reference frame uses inter-prediction and there is enough of a
+  // difference in sum-squared-error, use it.
+  if (frame != INTRA_FRAME &&
+      sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
+    mi->ref_frame[0] = ctx->best_reference_frame;
+    mi->mode = ctx->best_sse_inter_mode;
+    mi->mv[0] = ctx->best_sse_mv;
+  } else {
+    // Otherwise, use the zero reference frame.
+    frame = ctx->best_zeromv_reference_frame;
+    ctx->newmv_sse = ctx->zeromv_sse;
+    // Bias to last reference.
+    if (frame != LAST_FRAME &&
+        ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) ||
+         denoiser->denoising_level >= kDenHigh)) {
+      frame = LAST_FRAME;
+      ctx->newmv_sse = ctx->zeromv_lastref_sse;
+    }
+    mi->ref_frame[0] = frame;
+    mi->mode = ZEROMV;
+    mi->mv[0].as_int = 0;
+    ctx->best_sse_inter_mode = ZEROMV;
+    ctx->best_sse_mv.as_int = 0;
+    *zeromv_filter = 1;
+  }
+
+  if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
+    // Restore everything to its original state
+    *mi = saved_mi;
+    return COPY_BLOCK;
+  }
+  if (motion_magnitude >
+     (noise_motion_thresh(bs, increase_denoising) << 3)) {
+    // Restore everything to its original state
+    *mi = saved_mi;
+    return COPY_BLOCK;
+  }
+
+  // We will restore these after motion compensation.
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (j = 0; j < 2; ++j) {
+      saved_pre[i][j] = filter_mbd->plane[i].pre[j];
+    }
+    saved_dst[i] = filter_mbd->plane[i].dst;
+  }
+
+  // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
+  // struct.
+  // NOTE(review): the same block_start() offset (luma stride units, mi * 8)
+  // is applied to the u/v planes with uv_stride -- confirm this is intended
+  // for subsampled chroma.
+  for (j = 0; j < 2; ++j) {
+    filter_mbd->plane[0].pre[j].buf =
+        block_start(denoiser->running_avg_y[frame].y_buffer,
+                    denoiser->running_avg_y[frame].y_stride,
+                    mi_row, mi_col);
+    filter_mbd->plane[0].pre[j].stride =
+        denoiser->running_avg_y[frame].y_stride;
+    filter_mbd->plane[1].pre[j].buf =
+        block_start(denoiser->running_avg_y[frame].u_buffer,
+                    denoiser->running_avg_y[frame].uv_stride,
+                    mi_row, mi_col);
+    filter_mbd->plane[1].pre[j].stride =
+        denoiser->running_avg_y[frame].uv_stride;
+    filter_mbd->plane[2].pre[j].buf =
+        block_start(denoiser->running_avg_y[frame].v_buffer,
+                    denoiser->running_avg_y[frame].uv_stride,
+                    mi_row, mi_col);
+    filter_mbd->plane[2].pre[j].stride =
+        denoiser->running_avg_y[frame].uv_stride;
+  }
+  filter_mbd->plane[0].dst.buf =
+      block_start(denoiser->mc_running_avg_y.y_buffer,
+                  denoiser->mc_running_avg_y.y_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride;
+  filter_mbd->plane[1].dst.buf =
+      block_start(denoiser->mc_running_avg_y.u_buffer,
+                  denoiser->mc_running_avg_y.uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride;
+  filter_mbd->plane[2].dst.buf =
+      block_start(denoiser->mc_running_avg_y.v_buffer,
+                  denoiser->mc_running_avg_y.uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride;
+
+  // NOTE(review): mv_row/mv_col (motion-vector components captured at the top
+  // of this function) are passed here; the callee presumably expects the
+  // block position (mi_row, mi_col) -- verify against its declaration.
+  vp9_build_inter_predictors_sby(filter_mbd, mv_row, mv_col, bs);
+
+  // Restore everything to its original state
+  *mi = saved_mi;
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (j = 0; j < 2; ++j) {
+      filter_mbd->plane[i].pre[j] = saved_pre[i][j];
+    }
+    filter_mbd->plane[i].dst = saved_dst[i];
+  }
+
+  // NOTE(review): these two locals are not read again before the return, so
+  // the assignments below have no effect.
+  mv_row = ctx->best_sse_mv.as_mv.row;
+  mv_col = ctx->best_sse_mv.as_mv.col;
+
+  return FILTER_BLOCK;
+}
+
+// Denoises one block of the luma plane.
+//   denoiser           denoiser state; increase_denoising is updated here
+//   mb                 macroblock whose plane[0].src is the block to denoise
+//   mi_row, mi_col     block position, in units of 8 pixels (see block_start)
+//   bs                 block size
+//   ctx                per-block statistics gathered during mode selection
+//   denoiser_decision  out: COPY_BLOCK, FILTER_BLOCK or FILTER_ZEROMV_BLOCK
+void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
+                          int mi_row, int mi_col, BLOCK_SIZE bs,
+                          PICK_MODE_CONTEXT *ctx,
+                          VP9_DENOISER_DECISION *denoiser_decision) {
+  int mv_col, mv_row;
+  int motion_magnitude = 0;
+  int zeromv_filter = 0;
+  VP9_DENOISER_DECISION decision = COPY_BLOCK;
+  // Local by-value copies of the buffer descriptors (pointers and strides).
+  YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
+  YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
+  uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
+  uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride,
+                                          mi_row, mi_col);
+  struct buf_2d src = mb->plane[0].src;
+  int is_skin = 0;
+
+  // Skin detection only runs for blocks up to 32x32 and level >= kDenLow.
+  if (bs <= BLOCK_32X32 && denoiser->denoising_level >= kDenLow) {
+    is_skin = vp9_compute_skin_block(mb->plane[0].src.buf,
+                                     mb->plane[1].src.buf,
+                                     mb->plane[2].src.buf,
+                                     mb->plane[0].src.stride,
+                                     mb->plane[1].src.stride,
+                                     bs);
+  }
+
+  mv_col = ctx->best_sse_mv.as_mv.col;
+  mv_row = ctx->best_sse_mv.as_mv.row;
+  // Squared length of the best motion vector.
+  motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+  if (!is_skin &&
+      denoiser->denoising_level == kDenHigh &&
+      motion_magnitude < 16) {
+    denoiser->increase_denoising = 1;
+  } else {
+    denoiser->increase_denoising = 0;
+  }
+
+  // Below kDenLow no motion compensation is done and decision stays
+  // COPY_BLOCK.
+  if (denoiser->denoising_level >= kDenLow)
+    decision = perform_motion_compensation(denoiser, mb, bs,
+                                           denoiser->increase_denoising,
+                                           mi_row, mi_col, ctx,
+                                           motion_magnitude,
+                                           is_skin,
+                                           &zeromv_filter);
+
+  if (decision == FILTER_BLOCK) {
+    decision = vp9_denoiser_filter(src.buf, src.stride,
+                                 mc_avg_start, mc_avg.y_stride,
+                                 avg_start, avg.y_stride,
+                                 denoiser->increase_denoising,
+                                 bs, motion_magnitude);
+  }
+
+  // vpx_convolve_copy presumably takes (src, src_stride, dst, dst_stride,
+  // ...) -- TODO confirm. Under that reading, FILTER_BLOCK writes the
+  // denoised block back into the source, and COPY_BLOCK stores the unmodified
+  // source in the running average.
+  if (decision == FILTER_BLOCK) {
+    vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+                      NULL, 0, NULL, 0,
+                      num_4x4_blocks_wide_lookup[bs] << 2,
+                      num_4x4_blocks_high_lookup[bs] << 2);
+  } else {  // COPY_BLOCK
+    vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+                      NULL, 0, NULL, 0,
+                      num_4x4_blocks_wide_lookup[bs] << 2,
+                      num_4x4_blocks_high_lookup[bs] << 2);
+  }
+  *denoiser_decision = decision;
+  // Report separately when the filtered block used the zero-motion path.
+  if (decision == FILTER_BLOCK && zeromv_filter == 1)
+    *denoiser_decision = FILTER_ZEROMV_BLOCK;
+}
+
+// Copies the luma plane of src into dest, one row at a time. Both buffers
+// must have the same luma width and height (asserted). Chroma planes are
+// not touched.
+static void copy_frame(YV12_BUFFER_CONFIG * const dest,
+                       const YV12_BUFFER_CONFIG * const src) {
+  const uint8_t *src_row = src->y_buffer;
+  uint8_t *dest_row = dest->y_buffer;
+  int row = 0;
+
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+
+  while (row < dest->y_height) {
+    memcpy(dest_row, src_row, dest->y_width);
+    dest_row += dest->y_stride;
+    src_row += src->y_stride;
+    ++row;
+  }
+}
+
+// Exchanges the luma buffer pointers of dest and src; no pixel data is
+// copied. Both buffers must have the same luma width and height (asserted).
+static void swap_frame_buffer(YV12_BUFFER_CONFIG * const dest,
+                              YV12_BUFFER_CONFIG * const src) {
+  uint8_t *const dest_plane = dest->y_buffer;
+  uint8_t *const src_plane = src->y_buffer;
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+  dest->y_buffer = src_plane;
+  src->y_buffer = dest_plane;
+}
+
+// Updates the denoiser's per-reference running averages after a frame has
+// been encoded.
+//   src                    source frame; used only on key frames / resizes
+//   frame_type             KEY_FRAME forces a reset of all reference slots
+//   refresh_*_frame        which reference slots the encoded frame refreshed
+//   resized                non-zero forces the same reset as a key frame
+// Only luma data is moved (see copy_frame / swap_frame_buffer).
+void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
+                                    YV12_BUFFER_CONFIG src,
+                                    FRAME_TYPE frame_type,
+                                    int refresh_alt_ref_frame,
+                                    int refresh_golden_frame,
+                                    int refresh_last_frame,
+                                    int resized) {
+  // Copy source into denoised reference buffers on KEY_FRAME or
+  // if the just encoded frame was resized.
+  if (frame_type == KEY_FRAME || resized != 0) {
+    int i;
+    // Start at 1 so as not to overwrite the INTRA_FRAME
+    for (i = 1; i < MAX_REF_FRAMES; ++i)
+      copy_frame(&denoiser->running_avg_y[i], &src);
+    return;
+  }
+
+  // If more than one refresh occurs, must copy frame buffer.
+  // (A swap would leave the INTRA_FRAME slot holding a stale buffer for the
+  // second and later refreshed references.)
+  if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame)
+      > 1) {
+    if (refresh_alt_ref_frame) {
+      copy_frame(&denoiser->running_avg_y[ALTREF_FRAME],
+                 &denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_golden_frame) {
+      copy_frame(&denoiser->running_avg_y[GOLDEN_FRAME],
+                 &denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_last_frame) {
+      copy_frame(&denoiser->running_avg_y[LAST_FRAME],
+                 &denoiser->running_avg_y[INTRA_FRAME]);
+    }
+  } else {
+    // At most one reference is refreshed: exchanging the luma pointers with
+    // the INTRA_FRAME slot avoids a copy.
+    if (refresh_alt_ref_frame) {
+      swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME],
+                        &denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_golden_frame) {
+      swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
+                        &denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_last_frame) {
+      swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
+                        &denoiser->running_avg_y[INTRA_FRAME]);
+    }
+  }
+}
+
+// Resets the SSE statistics in ctx to UINT_MAX so that the first candidate
+// passed to vp9_denoiser_update_frame_stats() with a smaller SSE is recorded.
+void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
+  ctx->zeromv_sse = UINT_MAX;
+  ctx->newmv_sse = UINT_MAX;
+  ctx->zeromv_lastref_sse = UINT_MAX;
+}
+
+// Records a mode-search candidate in ctx if it beats the best one seen so
+// far. Candidates with a zero first motion vector update the zero-motion
+// statistics; all others update the best non-zero-motion statistics.
+//   mi    mode info of the candidate (mv[0] and ref_frame[0] are read)
+//   sse   sum of squared errors of the candidate
+//   mode  prediction mode of the candidate
+void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
+                                     PREDICTION_MODE mode,
+                                     PICK_MODE_CONTEXT *ctx) {
+  // TODO(tkopp): Use both MVs if possible
+  const int is_zero_mv = (mi->mv[0].as_int == 0);
+
+  if (is_zero_mv) {
+    if (sse < ctx->zeromv_sse) {
+      ctx->zeromv_sse = sse;
+      ctx->best_zeromv_reference_frame = mi->ref_frame[0];
+      // Also track the zero-motion SSE against LAST_FRAME specifically.
+      if (mi->ref_frame[0] == LAST_FRAME)
+        ctx->zeromv_lastref_sse = sse;
+    }
+    return;
+  }
+
+  if (sse < ctx->newmv_sse) {
+    ctx->newmv_sse = sse;
+    ctx->best_sse_inter_mode = mode;
+    ctx->best_sse_mv = mi->mv[0];
+    ctx->best_reference_frame = mi->ref_frame[0];
+  }
+}
+
+// Allocates every frame buffer owned by the denoiser and initializes its
+// state (increase_denoising = 0, denoising_level = kDenLow).
+//   denoiser          must not be NULL (asserted)
+//   width, height     frame dimensions passed to vpx_alloc_frame_buffer()
+//   ssx, ssy          chroma subsampling passed to vpx_alloc_frame_buffer()
+//   use_highbitdepth  only present when CONFIG_VP9_HIGHBITDEPTH is set
+//   border            border size passed to vpx_alloc_frame_buffer()
+// Returns 0 on success. On any allocation failure everything allocated so
+// far is released through vp9_denoiser_free() and 1 is returned.
+int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
+                       int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       int border) {
+  int i, fail;
+  const int legacy_byte_alignment = 0;
+  assert(denoiser != NULL);
+
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    fail = vpx_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
+                                  ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                  use_highbitdepth,
+#endif
+                                  border, legacy_byte_alignment);
+    if (fail) {
+      vp9_denoiser_free(denoiser);
+      return 1;
+    }
+#ifdef OUTPUT_YUV_DENOISED
+    make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+  }
+
+  fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height,
+                                ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                use_highbitdepth,
+#endif
+                                border, legacy_byte_alignment);
+  if (fail) {
+    vp9_denoiser_free(denoiser);
+    return 1;
+  }
+
+  fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height,
+                                ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                use_highbitdepth,
+#endif
+                                border, legacy_byte_alignment);
+  if (fail) {
+    vp9_denoiser_free(denoiser);
+    return 1;
+  }
+#ifdef OUTPUT_YUV_DENOISED
+  // The loop above has finished, so 'i' equals MAX_REF_FRAMES here and
+  // running_avg_y[i] would index one past the end of the array. Name the
+  // motion-compensated buffer, which is the member declared right after
+  // running_avg_y, explicitly instead.
+  make_grayscale(&denoiser->mc_running_avg_y);
+#endif
+  denoiser->increase_denoising = 0;
+  denoiser->frame_buffer_initialized = 1;
+  denoiser->denoising_level = kDenLow;
+  return 0;
+}
+
+// Releases every frame buffer owned by the denoiser and marks it as
+// uninitialized. Passing NULL is a no-op.
+void vp9_denoiser_free(VP9_DENOISER *denoiser) {
+  int i;
+  // The NULL check must come before any access through the pointer.
+  if (denoiser == NULL) {
+    return;
+  }
+  denoiser->frame_buffer_initialized = 0;
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    vpx_free_frame_buffer(&denoiser->running_avg_y[i]);
+  }
+  vpx_free_frame_buffer(&denoiser->mc_running_avg_y);
+  vpx_free_frame_buffer(&denoiser->last_source);
+}
+
+// Stores noise_level as the denoiser's denoising_level. The value is not
+// range-checked; callers presumably pass a VP9_DENOISER_LEVEL value.
+void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser,
+                                  int noise_level) {
+  denoiser->denoising_level = noise_level;
+}
+
+#ifdef OUTPUT_YUV_DENOISED
+// Debug helper: fills both chroma planes of yuv with UINT8_MAX / 2 so that
+// only the luma plane carries picture content.
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
+  uint8_t *u_row = yuv->u_buffer;
+  uint8_t *v_row = yuv->v_buffer;
+  int row, col;
+
+  for (row = 0; row < yuv->uv_height; ++row) {
+    for (col = 0; col < yuv->uv_width; ++col) {
+      u_row[col] = UINT8_MAX / 2;
+      v_row[col] = UINT8_MAX / 2;
+    }
+    u_row += yuv->uv_stride;
+    v_row += yuv->uv_stride;
+  }
+}
+#endif
diff --git a/libs/libvpx/vp9/encoder/vp9_denoiser.h b/libs/libvpx/vp9/encoder/vp9_denoiser.h
new file mode 100644
index 0000000000..9f13bd533e
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_denoiser.h
@@ -0,0 +1,93 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_DENOISER_H_
+#define VP9_ENCODER_DENOISER_H_
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
+
+typedef enum vp9_denoiser_decision {
+  COPY_BLOCK,
+  FILTER_BLOCK,
+  FILTER_ZEROMV_BLOCK
+} VP9_DENOISER_DECISION;  // Type of vp9_denoiser_denoise()'s decision arg.
+
+typedef enum vp9_denoiser_level {
+  kDenLowLow,
+  kDenLow,  // Level assigned by vp9_denoiser_alloc().
+  kDenMedium,
+  kDenHigh
+} VP9_DENOISER_LEVEL;
+
+typedef struct vp9_denoiser {
+  YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
+  YV12_BUFFER_CONFIG mc_running_avg_y;
+  YV12_BUFFER_CONFIG last_source;
+  int increase_denoising;  // Cleared to 0 by vp9_denoiser_alloc().
+  int frame_buffer_initialized;  // 1 after a successful alloc, 0 after free.
+  VP9_DENOISER_LEVEL denoising_level;  // See vp9_denoiser_set_noise_level().
+} VP9_DENOISER;
+
+struct VP9_COMP;
+
+void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
+                                    YV12_BUFFER_CONFIG src,
+                                    FRAME_TYPE frame_type,
+                                    int refresh_alt_ref_frame,
+                                    int refresh_golden_frame,
+                                    int refresh_last_frame,
+                                    int resized);
+
+void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
+                          int mi_row, int mi_col, BLOCK_SIZE bs,
+                          PICK_MODE_CONTEXT *ctx ,
+                          VP9_DENOISER_DECISION *denoiser_decision);
+
+void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
+
+void vp9_denoiser_update_frame_stats(MODE_INFO *mi,
+                                     unsigned int sse, PREDICTION_MODE mode,
+                                     PICK_MODE_CONTEXT *ctx);
+
+int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
+                       int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       int border);
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// Shared by the C and SSE2 denoisers, hence static inline in this header.
+// Scales (1 << num_pels_log2_lookup[bs]) by 3 if increase_denoising, else 2.
+static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs,
+                                          int increase_denoising) {
+  const int scale = increase_denoising ? 3 : 2;
+  return (1 << num_pels_log2_lookup[bs]) * scale;
+}
+#endif
+
+void vp9_denoiser_free(VP9_DENOISER *denoiser);
+
+void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser,
+                                  int noise_level);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_DENOISER_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_encodeframe.c b/libs/libvpx/vp9/encoder/vp9_encodeframe.c
new file mode 100644
index 0000000000..4109c19a9c
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -0,0 +1,4356 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/encoder/vp9_aq_360.h"
+#include "vp9/encoder/vp9_aq_complexity.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_aq_variance.h"
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_pickmode.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_tokenize.h"
+
+static void encode_superblock(VP9_COMP *cpi, ThreadData * td,
+                              TOKENEXTRA **t, int output_enabled,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              PICK_MODE_CONTEXT *ctx);
+
+// Flat reference row of 64 samples, all 128. It is handed to the variance
+// functions with a stride of 0 when computing the source variance for the
+// purposes of activity masking. Eventually this should be replaced by custom
+// no-reference routines, which will be faster.
+static const uint8_t VP9_VAR_OFFS[64] = {
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static const uint16_t VP9_HIGH_VAR_OFFS_8[64] = {  // bd == 8 and the default.
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_10[64] = {  // Selected when bd == 10.
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = {  // Selected when bd == 12.
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16
+};
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Per-pixel variance of `ref` against the constant VP9_VAR_OFFS reference.
+unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi,
+                                           const struct buf_2d *ref,
+                                           BLOCK_SIZE bs) {
+  unsigned int sse, block_var;
+  block_var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, VP9_VAR_OFFS, 0, &sse);
+  return ROUND_POWER_OF_TWO(block_var, num_pels_log2_lookup[bs]);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// High-bitdepth counterpart of vp9_get_sby_perpixel_variance(). The flat
+// reference table is chosen from bd: 10 and 12 have their own tables, and
+// every other value (including 8) uses the 8-bit one. The result is the
+// variance rounded down by num_pels_log2_lookup[bs] bits.
+unsigned int vp9_high_get_sby_perpixel_variance(
+    VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd) {
+  const uint16_t *flat_ref;
+  unsigned int block_var, sse;
+
+  if (bd == 10) {
+    flat_ref = VP9_HIGH_VAR_OFFS_10;
+  } else if (bd == 12) {
+    flat_ref = VP9_HIGH_VAR_OFFS_12;
+  } else {
+    flat_ref = VP9_HIGH_VAR_OFFS_8;
+  }
+
+  // The 16-bit table is passed through CONVERT_TO_BYTEPTR, with a stride of
+  // 0 so that every row is compared against the same 64 samples.
+  block_var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                                 CONVERT_TO_BYTEPTR(flat_ref), 0, &sse);
+  return ROUND_POWER_OF_TWO(block_var, num_pels_log2_lookup[bs]);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
+                                                   const struct buf_2d *ref,
+                                                   int mi_row, int mi_col,
+                                                   BLOCK_SIZE bs) {
+  const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
+  unsigned int sse, diff_var;
+  int y_offset;  // Offset of (mi_row, mi_col) in the LAST_FRAME luma plane.
+  assert(last != NULL);
+  y_offset = mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE;
+  diff_var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                                last->y_buffer + y_offset, last->y_stride,
+                                &sse);
+  return ROUND_POWER_OF_TWO(diff_var, num_pels_log2_lookup[bs]);
+}
+
+// Picks a fixed partition size from the 64x64 per-pixel variance against the
+// last frame: the lower the variance, the larger the block.
+static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, MACROBLOCK *x,
+                                                   int mi_row,
+                                                   int mi_col) {
+  const unsigned int diff_var = get_sby_perpixel_diff_variance(
+      cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64);
+  if (diff_var < 8)
+    return BLOCK_64X64;
+  if (diff_var < 128)
+    return BLOCK_32X32;
+  if (diff_var < 2048)
+    return BLOCK_16X16;
+  return BLOCK_8X8;
+}
+
+// Reduced form of set_offsets(): points xd->mi, xd->mi[0] and x->mbmi_ext at
+// the entries for (mi_row, mi_col) and touches nothing else.
+static INLINE void set_mode_info_offsets(VP9_COMMON *const cm,
+                                         MACROBLOCK *const x,
+                                         MACROBLOCKD *const xd,
+                                         int mi_row, int mi_col) {
+  const int grid_pos = mi_row * xd->mi_stride + mi_col;
+  const int ext_pos = mi_row * cm->mi_cols + mi_col;
+  xd->mi = &cm->mi_grid_visible[grid_pos];
+  *xd->mi = &cm->mi[grid_pos];
+  x->mbmi_ext = &x->mbmi_ext_base[ext_pos];
+}
+
+static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
+                        MACROBLOCK *const x, int mi_row, int mi_col,
+                        BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi;
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const struct segmentation *const seg = &cm->seg;
+  // Points x and xd at the bsize block located at (mi_row, mi_col).
+  set_skip_context(xd, mi_row, mi_col);
+
+  set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
+
+  mi = xd->mi[0];  // Entry just installed by set_mode_info_offsets().
+
+  // Set up destination pointers.
+  vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+
+  // Set up limit values for MV components.
+  // MVs beyond this range do not produce a new/different prediction block.
+  x->mv_row_min = -(((mi_row + mi_height) * MI_SIZE) + VP9_INTERP_EXTEND);
+  x->mv_col_min = -(((mi_col + mi_width) * MI_SIZE) + VP9_INTERP_EXTEND);
+  x->mv_row_max = (cm->mi_rows - mi_row) * MI_SIZE + VP9_INTERP_EXTEND;
+  x->mv_col_max = (cm->mi_cols - mi_col) * MI_SIZE + VP9_INTERP_EXTEND;
+
+  // Set up distance of MB to edge of frame in 1/8th pel units.
+  assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+  set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+                 cm->mi_rows, cm->mi_cols);
+
+  // Set up source buffers.
+  vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+
+  // R/D setup.
+  x->rddiv = cpi->rd.RDDIV;
+  x->rdmult = cpi->rd.RDMULT;
+
+  // Set up the segment ID; it is left as-is for VARIANCE_AQ / EQUATOR360_AQ.
+  if (seg->enabled) {
+    if (cpi->oxcf.aq_mode != VARIANCE_AQ &&
+        cpi->oxcf.aq_mode != EQUATOR360_AQ) {
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+    }
+    vp9_init_plane_quantizers(cpi, x);
+
+    x->encode_breakout = cpi->segment_encode_breakout[mi->segment_id];
+  } else {
+    mi->segment_id = 0;
+    x->encode_breakout = cpi->encode_breakout;
+  }
+
+  // required by vp9_append_sub8x8_mvs_for_idx() and vp9_find_best_ref_mvs()
+  xd->tile = *tile;
+}
+
+// Copies xd->mi[0] into every in-frame mode-info slot covered by bsize.
+static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                      int mi_row, int mi_col,
+                                      BLOCK_SIZE bsize) {
+  const int rows = num_8x8_blocks_high_lookup[bsize];
+  const int cols = num_8x8_blocks_wide_lookup[bsize];
+  int r, c;
+
+  for (r = 0; r < rows && mi_row + r < cm->mi_rows; ++r)
+    for (c = 0; c < cols && mi_col + c < cm->mi_cols; ++c)
+      xd->mi[r * xd->mi_stride + c] = xd->mi[0];
+}
+
+// Records bsize as the sb_type of the mode info at (mi_row, mi_col).
+// Positions at or beyond cm->mi_cols / cm->mi_rows are ignored.
+static void set_block_size(VP9_COMP *const cpi, MACROBLOCK *const x,
+                           MACROBLOCKD *const xd, int mi_row, int mi_col,
+                           BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  if (mi_col >= cm->mi_cols || mi_row >= cm->mi_rows) return;
+  set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
+  xd->mi[0]->sb_type = bsize;
+}
+
+typedef struct {
+  int64_t sum_square_error;
+  int64_t sum_error;
+  int log2_count;  // Shift used by get_variance(); sum_2_variances() adds 1.
+  int variance;  // Written only by get_variance(), not by fill_variance().
+} var;
+
+typedef struct {
+  var none;  // fill_variance_tree(): vert[0] + vert[1].
+  var horz[2];  // fill_variance_tree(): splits 0+1 and splits 2+3.
+  var vert[2];  // fill_variance_tree(): splits 0+2 and splits 1+3.
+} partition_variance;
+
+typedef struct {
+  partition_variance part_variances;
+  var split[4];  // Leaf level: children are plain var entries.
+} v4x4;
+
+typedef struct {
+  partition_variance part_variances;
+  v4x4 split[4];
+} v8x8;
+
+typedef struct {
+  partition_variance part_variances;
+  v8x8 split[4];
+} v16x16;
+
+typedef struct {
+  partition_variance part_variances;
+  v16x16 split[4];
+} v32x32;
+
+typedef struct {
+  partition_variance part_variances;
+  v32x32 split[4];
+} v64x64;
+
+typedef struct {  // Pointer view of one tree level, built by tree_to_node().
+  partition_variance *part_variances;
+  var *split[4];  // Each child's part_variances.none (or the var for v4x4).
+} variance_node;
+
+typedef enum {  // NOTE(review): presumably names tree levels; confirm at uses.
+  V16X16,
+  V32X32,
+  V64X64,
+} TREE_LEVEL;
+
+// Points node at the part_variances and the four children of the tree `data`.
+static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
+  int k;
+  node->part_variances = NULL;
+  switch (bsize) {
+    case BLOCK_64X64: {
+      v64x64 *const tree64 = (v64x64 *)data;
+      node->part_variances = &tree64->part_variances;
+      for (k = 0; k < 4; ++k)
+        node->split[k] = &tree64->split[k].part_variances.none;
+      break;
+    }
+    case BLOCK_32X32: {
+      v32x32 *const tree32 = (v32x32 *)data;
+      node->part_variances = &tree32->part_variances;
+      for (k = 0; k < 4; ++k)
+        node->split[k] = &tree32->split[k].part_variances.none;
+      break;
+    }
+    case BLOCK_16X16: {
+      v16x16 *const tree16 = (v16x16 *)data;
+      node->part_variances = &tree16->part_variances;
+      for (k = 0; k < 4; ++k)
+        node->split[k] = &tree16->split[k].part_variances.none;
+      break;
+    }
+    case BLOCK_8X8: {
+      v8x8 *const tree8 = (v8x8 *)data;
+      node->part_variances = &tree8->part_variances;
+      for (k = 0; k < 4; ++k)
+        node->split[k] = &tree8->split[k].part_variances.none;
+      break;
+    }
+    case BLOCK_4X4: {
+      v4x4 *const tree4 = (v4x4 *)data;
+      node->part_variances = &tree4->part_variances;
+      for (k = 0; k < 4; ++k)
+        node->split[k] = &tree4->split[k];
+      break;
+    }
+    default:
+      assert(0);  // Only the five square sizes above have a tree type.
+      break;
+  }
+}
+
+// Stores s2, s and c (a log2 count) in *v; v->variance is left untouched.
+static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
+  v->log2_count = c;
+  v->sum_error = s;
+  v->sum_square_error = s2;
+}
+
+static void get_variance(var *v) {  // variance = 256 * (s2 - s*s/n) / n
+  const int64_t adj = (v->sum_error * v->sum_error) >> v->log2_count;
+  v->variance = (int)((256 * (v->sum_square_error - adj)) >> v->log2_count);
+}
+
+static void sum_2_variances(const var *a, const var *b, var *r) {
+  const int64_t s2 = a->sum_square_error + b->sum_square_error;
+  assert(a->log2_count == b->log2_count);
+  fill_variance(s2, a->sum_error + b->sum_error, a->log2_count + 1, r);
+}
+
+static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
+  variance_node node;  // Pointer view of this tree level and its children.
+  memset(&node, 0, sizeof(node));
+  tree_to_node(data, bsize, &node);
+  sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]);
+  sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]);
+  sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1],
+                  &node.part_variances->none);
+  sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
+  sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
+}
+
+static int set_vt_partitioning(VP9_COMP *cpi,
+                               MACROBLOCK *const x,
+                               MACROBLOCKD *const xd,
+                               void *data,
+                               BLOCK_SIZE bsize,
+                               int mi_row,
+                               int mi_col,
+                               int64_t threshold,
+                               BLOCK_SIZE bsize_min,
+                               int force_split) {
+  VP9_COMMON * const cm = &cpi->common;
+  variance_node vt;  // Pointer view of `data`, filled by tree_to_node().
+  const int block_width = num_8x8_blocks_wide_lookup[bsize];
+  const int block_height = num_8x8_blocks_high_lookup[bsize];
+  // Returns 1 after writing block size(s) for this block, 0 otherwise.
+  assert(block_height == block_width);
+  tree_to_node(data, bsize, &vt);
+
+  if (force_split == 1)
+    return 0;  // No block size is written when a split is forced.
+
+  // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
+  // variance is below threshold, otherwise split will be selected.
+  // No check for vert/horiz split as too few samples for variance.
+  if (bsize == bsize_min) {
+    // Non-key frames: variance already computed to set the force_split.
+    if (cm->frame_type == KEY_FRAME)
+      get_variance(&vt.part_variances->none);
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
+    return 0;
+  } else if (bsize > bsize_min) {
+    // Non-key frames: variance already computed to set the force_split.
+    if (cm->frame_type == KEY_FRAME)
+      get_variance(&vt.part_variances->none);
+    // For key frame: take split for bsize above 32X32 or very high variance.
+    if (cm->frame_type == KEY_FRAME &&
+        (bsize > BLOCK_32X32 ||
+        vt.part_variances->none.variance > (threshold << 4))) {
+      return 0;
+    }
+    // If variance is low, take the bsize (no split).
+    if (mi_col + block_width / 2 < cm->mi_cols &&
+        mi_row + block_height / 2 < cm->mi_rows &&
+        vt.part_variances->none.variance < threshold) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+      return 1;
+    }
+
+    // Check vertical split: both halves must be below threshold.
+    if (mi_row + block_height / 2 < cm->mi_rows) {
+      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
+      get_variance(&vt.part_variances->vert[0]);
+      get_variance(&vt.part_variances->vert[1]);
+      if (vt.part_variances->vert[0].variance < threshold &&
+          vt.part_variances->vert[1].variance < threshold &&
+          get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
+        return 1;
+      }
+    }
+    // Check horizontal split: both halves must be below threshold.
+    if (mi_col + block_width / 2 < cm->mi_cols) {
+      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
+      get_variance(&vt.part_variances->horz[0]);
+      get_variance(&vt.part_variances->horz[1]);
+      if (vt.part_variances->horz[0].variance < threshold &&
+          vt.part_variances->horz[1].variance < threshold &&
+          get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
+        return 1;
+      }
+    }
+
+    return 0;
+  }
+  return 0;  // bsize < bsize_min: nothing is selected here.
+}
+
+// Fills thresholds[] with the variance split thresholds for the block sizes:
+// [0] 64x64, [1] 32x32, [2] 16x16, [3] 8x8 (to split to 4x4 partition).
+// Entry [3] is only written on key frames, the only place it is currently
+// used; on other frames it is left untouched.
+static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
+  VP9_COMMON *const cm = &cpi->common;
+  int64_t base;
+
+  if (cm->frame_type == KEY_FRAME) {
+    base = (int64_t)(20 * cpi->y_dequant[q][1]);
+    thresholds[0] = base;
+    thresholds[1] = base >> 2;
+    thresholds[2] = base >> 2;
+    thresholds[3] = base << 2;
+    return;
+  }
+
+  base = (int64_t)cpi->y_dequant[q][1];
+  // Adjust the base variance threshold based on the estimated noise level.
+  if (cpi->noise_estimate.enabled) {
+    const NOISE_LEVEL noise_level =
+        vp9_noise_estimate_extract_level(&cpi->noise_estimate);
+    if (noise_level == kHigh)
+      base *= 3;
+    else if (noise_level == kMedium)
+      base <<= 1;
+    else if (noise_level < kLow)
+      base = (7 * base) >> 3;
+  }
+  if (cm->width <= 352 && cm->height <= 288) {
+    thresholds[0] = base >> 3;
+    thresholds[1] = base >> 1;
+    thresholds[2] = base << 3;
+    return;
+  }
+  thresholds[0] = base;
+  thresholds[1] = (cm->width >= 1920 && cm->height >= 1080) ? (7 * base) >> 2
+                                                            : (5 * base) >> 2;
+  thresholds[2] = base << cpi->oxcf.speed;
+}
+
+// Refreshes cpi's variance-partition thresholds for q. Does nothing unless
+// the partition search type is VAR_BASED_PARTITION or REFERENCE_PARTITION.
+void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) {
+  VP9_COMMON *const cm = &cpi->common;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  if (sf->partition_search_type != VAR_BASED_PARTITION &&
+      sf->partition_search_type != REFERENCE_PARTITION)
+    return;
+
+  set_vbp_thresholds(cpi, cpi->vbp_thresholds, q);
+  // The thresholds below are not changed locally.
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->vbp_threshold_sad = 0;
+    cpi->vbp_bsize_min = BLOCK_8X8;
+  } else if (cm->width <= 352 && cm->height <= 288) {
+    cpi->vbp_threshold_sad = 10;
+    cpi->vbp_bsize_min = BLOCK_16X16;
+  } else {
+    const int dequant_x2 = cpi->y_dequant[q][1] << 1;
+    cpi->vbp_threshold_sad = dequant_x2 > 1000 ? dequant_x2 : 1000;
+    cpi->vbp_bsize_min = BLOCK_16X16;
+  }
+  cpi->vbp_threshold_minmax = 15 + (q >> 3);
+}
+
+// Compute the minmax over the 8x8 subblocks.
+// Visits the four 8x8 subblocks at offsets 0/8 from (x16_idx, y16_idx),
+// skipping any that start at or beyond pixels_wide / pixels_high, and takes
+// (max - min) from the minmax routine for each. Returns the largest of those
+// ranges minus the smallest.
+static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
+                              int dp, int x16_idx, int y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+                              int highbd_flag,
+#endif
+                              int pixels_wide,
+                              int pixels_high) {
+  int largest_range = 0;
+  int smallest_range = 255;
+  int k;
+
+  // Loop over the 4 8x8 subblocks.
+  for (k = 0; k < 4; k++) {
+    const int x8_idx = x16_idx + ((k & 1) << 3);
+    const int y8_idx = y16_idx + ((k >> 1) << 3);
+    int min = 0;
+    int max = 0;
+    int range;
+    if (x8_idx >= pixels_wide || y8_idx >= pixels_high) continue;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+      vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+                            d + y8_idx * dp + x8_idx, dp, &min, &max);
+    } else
+#endif
+    {
+      vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+                     d + y8_idx * dp + x8_idx, dp, &min, &max);
+    }
+    range = max - min;
+    if (range > largest_range) largest_range = range;
+    if (range < smallest_range) smallest_range = range;
+  }
+
+  return largest_range - smallest_range;
+}
+
+// Seeds the four 4x4 children of *vst for the 8x8 block at (x8_idx, y8_idx).
+// Each child stores the 4x4 average of s minus that of d (a fixed 128 on key
+// frames); children starting outside pixels_wide/pixels_high store zeros.
+static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
+                                 int dp, int x8_idx, int y8_idx, v8x8 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 int highbd_flag,
+#endif
+                                 int pixels_wide,
+                                 int pixels_high,
+                                 int is_key_frame) {
+  int k;
+  for (k = 0; k < 4; k++) {
+    const int x4_idx = x8_idx + ((k & 1) << 2);
+    const int y4_idx = y8_idx + ((k >> 1) << 2);
+    unsigned int sse = 0;
+    int sum = 0;
+    if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+      int s_avg;
+      int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+        s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        if (!is_key_frame)
+          d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+      } else
+#endif
+      {
+        s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        if (!is_key_frame)
+          d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+      }
+      sum = s_avg - d_avg;
+      sse = sum * sum;
+    }
+    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+  }
+}
+
+// Seeds the four 8x8 children of *vst for the 16x16 block at (x16_idx,
+// y16_idx). Each child stores the 8x8 average of s minus that of d (a fixed
+// 128 on key frames); children starting outside the frame store zeros.
+static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
+                                 int dp, int x16_idx, int y16_idx, v16x16 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 int highbd_flag,
+#endif
+                                 int pixels_wide,
+                                 int pixels_high,
+                                 int is_key_frame) {
+  int k;
+  for (k = 0; k < 4; k++) {
+    const int x8_idx = x16_idx + ((k & 1) << 3);
+    const int y8_idx = y16_idx + ((k >> 1) << 3);
+    unsigned int sse = 0;
+    int sum = 0;
+    if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+      int s_avg;
+      int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+        s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        if (!is_key_frame)
+          d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+      } else
+#endif
+      {
+        s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        if (!is_key_frame)
+          d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+      }
+      sum = s_avg - d_avg;
+      sse = sum * sum;
+    }
+    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+  }
+}
+
+// This function chooses partitioning based on the variance between source and
+// reconstructed last, where variance is computed for down-sampled inputs.
+static int choose_partitioning(VP9_COMP *cpi,
+                                const TileInfo *const tile,
+                                MACROBLOCK *x,
+                                int mi_row, int mi_col) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int i, j, k, m;
+  v64x64 vt;
+  v16x16 vt2[16];
+  int force_split[21];
+  int avg_32x32;
+  int avg_16x16[4];
+  uint8_t *s;
+  const uint8_t *d;
+  int sp;
+  int dp;
+  int pixels_wide = 64, pixels_high = 64;
+  int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
+      cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
+
+  // For the variance computation under SVC mode, we treat the frame as key if
+  // the reference (base layer frame) is key frame (i.e., is_key_frame == 1).
+  const int is_key_frame = (cm->frame_type == KEY_FRAME ||
+      (is_one_pass_cbr_svc(cpi) &&
+      cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
+  // Always use 4x4 partition for key frame.
+  const int use_4x4_partition = cm->frame_type == KEY_FRAME;
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  int variance4x4downsample[16];
+
+  int segment_id = CR_SEGMENT_ID_BASE;
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+    const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
+                                                    cm->last_frame_seg_map;
+    segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+
+    if (cyclic_refresh_segment_id_boosted(segment_id)) {
+      int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+      set_vbp_thresholds(cpi, thresholds, q);
+    }
+  }
+
+  set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+
+  if (xd->mb_to_right_edge < 0)
+    pixels_wide += (xd->mb_to_right_edge >> 3);
+  if (xd->mb_to_bottom_edge < 0)
+    pixels_high += (xd->mb_to_bottom_edge >> 3);
+
+  s = x->plane[0].src.buf;
+  sp = x->plane[0].src.stride;
+
+  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
+  // 5-20 for the 16x16 blocks.
+  force_split[0] = 0;
+
+  if (!is_key_frame) {
+    // In the case of spatial/temporal scalable coding, the assumption here is
+    // that the temporal reference frame will always be of type LAST_FRAME.
+    // TODO(marpan): If that assumption is broken, we need to revisit this code.
+    MODE_INFO *mi = xd->mi[0];
+    unsigned int uv_sad;
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+
+    const YV12_BUFFER_CONFIG *yv12_g = NULL;
+    unsigned int y_sad, y_sad_g;
+    const BLOCK_SIZE bsize = BLOCK_32X32
+        + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows);
+
+    assert(yv12 != NULL);
+
+    if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id)) {
+      // For now, GOLDEN will not be used for non-zero spatial layers, since
+      // it may not be a temporal reference.
+      yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+    }
+
+    if (yv12_g && yv12_g != yv12 &&
+       (cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
+      vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                           &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+      y_sad_g = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
+                                       x->plane[0].src.stride,
+                                       xd->plane[0].pre[0].buf,
+                                       xd->plane[0].pre[0].stride);
+    } else {
+      y_sad_g = UINT_MAX;
+    }
+
+    vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                         &cm->frame_refs[LAST_FRAME - 1].sf);
+    mi->ref_frame[0] = LAST_FRAME;
+    mi->ref_frame[1] = NONE;
+    mi->sb_type = BLOCK_64X64;
+    mi->mv[0].as_int = 0;
+    mi->interp_filter = BILINEAR;
+
+    y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+    if (y_sad_g < y_sad) {
+      vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                           &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+      mi->ref_frame[0] = GOLDEN_FRAME;
+      mi->mv[0].as_int = 0;
+      y_sad = y_sad_g;
+    } else {
+      x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
+    }
+
+    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
+
+    // Check if most of the superblock is skin content, and if so, force split
+    // to 32x32. Avoid checking superblocks on/near boundary and avoid low
+    // resolutions for now.
+    // Note superblock may still pick 64X64 if y_sad is very small
+    // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
+    x->sb_is_skin = 0;
+#if !CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->use_skin_detection && !low_res && (mi_col >= 8 &&
+        mi_col + 8 < cm->mi_cols && mi_row >= 8 && mi_row + 8 < cm->mi_rows)) {
+      int num_16x16_skin = 0;
+      int num_16x16_nonskin = 0;
+      uint8_t *ysignal = x->plane[0].src.buf;
+      uint8_t *usignal = x->plane[1].src.buf;
+      uint8_t *vsignal = x->plane[2].src.buf;
+      int spuv = x->plane[1].src.stride;
+      for (i = 0; i < 4; i++) {
+        for (j = 0; j < 4; j++) {
+          int is_skin = vp9_compute_skin_block(ysignal,
+                                               usignal,
+                                               vsignal,
+                                               sp,
+                                               spuv,
+                                               BLOCK_16X16);
+          num_16x16_skin += is_skin;
+          num_16x16_nonskin += (1 - is_skin);
+          if (num_16x16_nonskin > 3) {
+            // Exit loop if at least 4 of the 16x16 blocks are not skin.
+            i = 4;
+            j = 4;
+          }
+          ysignal += 16;
+          usignal += 8;
+          vsignal += 8;
+        }
+        ysignal += (sp << 4) - 64;
+        usignal += (spuv << 3) - 32;
+        vsignal += (spuv << 3) - 32;
+      }
+      if (num_16x16_skin > 12) {
+        x->sb_is_skin = 1;
+        force_split[0] = 1;
+      }
+    }
+#endif
+    for (i = 1; i <= 2; ++i) {
+      struct macroblock_plane  *p = &x->plane[i];
+      struct macroblockd_plane *pd = &xd->plane[i];
+      const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+
+      if (bs == BLOCK_INVALID)
+        uv_sad = UINT_MAX;
+      else
+        uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
+                                     pd->dst.buf, pd->dst.stride);
+
+        // TODO(marpan): Investigate if we should lower this threshold if
+        // superblock is detected as skin.
+        x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
+    }
+
+    d = xd->plane[0].dst.buf;
+    dp = xd->plane[0].dst.stride;
+
+    // If the y_sad is very small, take 64x64 as partition and exit.
+    // Don't check on boosted segment for now, as 64x64 is suppressed there.
+    if (segment_id == CR_SEGMENT_ID_BASE &&
+        y_sad < cpi->vbp_threshold_sad) {
+      const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+      const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64];
+      if (mi_col + block_width / 2 < cm->mi_cols &&
+          mi_row + block_height / 2 < cm->mi_rows) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64);
+        return 0;
+      }
+    }
+  } else {
+    d = VP9_VAR_OFFS;
+    dp = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      switch (xd->bd) {
+        case 10:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10);
+          break;
+        case 12:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12);
+          break;
+        case 8:
+        default:
+          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8);
+          break;
+      }
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+
+  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
+  // for splits.
+  for (i = 0; i < 4; i++) {
+    const int x32_idx = ((i & 1) << 5);
+    const int y32_idx = ((i >> 1) << 5);
+    const int i2 = i << 2;
+    force_split[i + 1] = 0;
+    avg_16x16[i] = 0;
+    for (j = 0; j < 4; j++) {
+      const int x16_idx = x32_idx + ((j & 1) << 4);
+      const int y16_idx = y32_idx + ((j >> 1) << 4);
+      const int split_index = 5 + i2 + j;
+      v16x16 *vst = &vt.split[i].split[j];
+      force_split[split_index] = 0;
+      variance4x4downsample[i2 + j] = 0;
+      if (!is_key_frame) {
+        fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+                            xd->cur_buf->flags,
+#endif
+                            pixels_wide,
+                            pixels_high,
+                            is_key_frame);
+        fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
+        get_variance(&vt.split[i].split[j].part_variances.none);
+        avg_16x16[i] += vt.split[i].split[j].part_variances.none.variance;
+        if (vt.split[i].split[j].part_variances.none.variance >
+            thresholds[2]) {
+          // 16X16 variance is above threshold for split, so force split to 8x8
+          // for this 16x16 block (this also forces splits for upper levels).
+          force_split[split_index] = 1;
+          force_split[i + 1] = 1;
+          force_split[0] = 1;
+        } else if (cpi->oxcf.speed < 8 &&
+                   vt.split[i].split[j].part_variances.none.variance >
+                   thresholds[1] &&
+                   !cyclic_refresh_segment_id_boosted(segment_id)) {
+          // We have some nominal amount of 16x16 variance (based on average),
+          // compute the minmax over the 8x8 sub-blocks, and if above threshold,
+          // force split to 8x8 block for this 16x16 block.
+          int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                          xd->cur_buf->flags,
+#endif
+                                          pixels_wide, pixels_high);
+          if (minmax > cpi->vbp_threshold_minmax) {
+            force_split[split_index] = 1;
+            force_split[i + 1] = 1;
+            force_split[0] = 1;
+          }
+        }
+      }
+      if (is_key_frame || (low_res &&
+          vt.split[i].split[j].part_variances.none.variance >
+          (thresholds[1] << 1))) {
+        force_split[split_index] = 0;
+        // Go down to 4x4 down-sampling for variance.
+        variance4x4downsample[i2 + j] = 1;
+        for (k = 0; k < 4; k++) {
+          int x8_idx = x16_idx + ((k & 1) << 3);
+          int y8_idx = y16_idx + ((k >> 1) << 3);
+          v8x8 *vst2 = is_key_frame ? &vst->split[k] :
+              &vt2[i2 + j].split[k];
+          fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               xd->cur_buf->flags,
+#endif
+                               pixels_wide,
+                               pixels_high,
+                               is_key_frame);
+        }
+      }
+    }
+  }
+  // Fill the rest of the variance tree by summing split partition values.
+  avg_32x32 = 0;
+  for (i = 0; i < 4; i++) {
+    const int i2 = i << 2;
+    for (j = 0; j < 4; j++) {
+      if (variance4x4downsample[i2 + j] == 1) {
+        v16x16 *vtemp = (!is_key_frame) ? &vt2[i2 + j] :
+            &vt.split[i].split[j];
+        for (m = 0; m < 4; m++)
+          fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
+        fill_variance_tree(vtemp, BLOCK_16X16);
+        // If variance of this 16x16 block is above the threshold, force block
+        // to split. This also forces a split on the upper levels.
+        get_variance(&vtemp->part_variances.none);
+        if (vtemp->part_variances.none.variance > thresholds[2]) {
+          force_split[5 + i2 + j] = 1;
+          force_split[i + 1] = 1;
+          force_split[0] = 1;
+        }
+      }
+    }
+    fill_variance_tree(&vt.split[i], BLOCK_32X32);
+    // If variance of this 32x32 block is above the threshold, or if its above
+    // (some threshold of) the average variance over the sub-16x16 blocks, then
+    // force this block to split. This also forces a split on the upper
+    // (64x64) level.
+    if (!force_split[i + 1]) {
+      get_variance(&vt.split[i].part_variances.none);
+      if (vt.split[i].part_variances.none.variance > thresholds[1] ||
+          (!is_key_frame &&
+          vt.split[i].part_variances.none.variance > (thresholds[1] >> 1) &&
+          vt.split[i].part_variances.none.variance > (avg_16x16[i] >> 1))) {
+        force_split[i + 1] = 1;
+        force_split[0] = 1;
+      }
+      avg_32x32 += vt.split[i].part_variances.none.variance;
+    }
+  }
+  if (!force_split[0]) {
+    fill_variance_tree(&vt, BLOCK_64X64);
+    get_variance(&vt.part_variances.none);
+    // If variance of this 64x64 block is above (some threshold of) the average
+    // variance over the sub-32x32 blocks, then force this block to split.
+    if (!is_key_frame &&
+        vt.part_variances.none.variance > (5 * avg_32x32) >> 4)
+      force_split[0] = 1;
+  }
+
+  // Now go through the entire structure, splitting every block size until
+  // we get to one that's got a variance lower than our threshold.
+  if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
+      !set_vt_partitioning(cpi, x, xd, &vt, BLOCK_64X64, mi_row, mi_col,
+                           thresholds[0], BLOCK_16X16, force_split[0])) {
+    for (i = 0; i < 4; ++i) {
+      const int x32_idx = ((i & 1) << 2);
+      const int y32_idx = ((i >> 1) << 2);
+      const int i2 = i << 2;
+      if (!set_vt_partitioning(cpi, x, xd, &vt.split[i], BLOCK_32X32,
+                               (mi_row + y32_idx), (mi_col + x32_idx),
+                               thresholds[1], BLOCK_16X16,
+                               force_split[i + 1])) {
+        for (j = 0; j < 4; ++j) {
+          const int x16_idx = ((j & 1) << 1);
+          const int y16_idx = ((j >> 1) << 1);
+          // For inter frames: if variance4x4downsample[] == 1 for this 16x16
+          // block, then the variance is based on 4x4 down-sampling, so use vt2
+          // in set_vt_partitioning(), otherwise use vt.
+          v16x16 *vtemp = (!is_key_frame &&
+                           variance4x4downsample[i2 + j] == 1) ?
+                           &vt2[i2 + j] : &vt.split[i].split[j];
+          if (!set_vt_partitioning(cpi, x, xd, vtemp, BLOCK_16X16,
+                                   mi_row + y32_idx + y16_idx,
+                                   mi_col + x32_idx + x16_idx,
+                                   thresholds[2],
+                                   cpi->vbp_bsize_min,
+                                   force_split[5 + i2  + j])) {
+            for (k = 0; k < 4; ++k) {
+              const int x8_idx = (k & 1);
+              const int y8_idx = (k >> 1);
+              if (use_4x4_partition) {
+                if (!set_vt_partitioning(cpi, x, xd, &vtemp->split[k],
+                                         BLOCK_8X8,
+                                         mi_row + y32_idx + y16_idx + y8_idx,
+                                         mi_col + x32_idx + x16_idx + x8_idx,
+                                         thresholds[3], BLOCK_8X8, 0)) {
+                  set_block_size(cpi, x, xd,
+                                 (mi_row + y32_idx + y16_idx + y8_idx),
+                                 (mi_col + x32_idx + x16_idx + x8_idx),
+                                 BLOCK_4X4);
+                }
+              } else {
+                set_block_size(cpi, x, xd,
+                               (mi_row + y32_idx + y16_idx + y8_idx),
+                               (mi_col + x32_idx + x16_idx + x8_idx),
+                               BLOCK_8X8);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+static void update_state(VP9_COMP *cpi, ThreadData *td,
+                         PICK_MODE_CONTEXT *ctx,
+                         int mi_row, int mi_col, BLOCK_SIZE bsize,
+                         int output_enabled) {  // Applies ctx to td->mb.
+  int i, x_idx, y;
+  VP9_COMMON *const cm = &cpi->common;
+  RD_COUNTS *const rdc = &td->rd_counts;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  MODE_INFO *mi = &ctx->mic;  // mode info held by the context
+  MODE_INFO *const xdmi = xd->mi[0];
+  MODE_INFO *mi_addr = xd->mi[0];  // same entry as xdmi; receives *mi below
+  const struct segmentation *const seg = &cm->seg;
+  const int bw = num_8x8_blocks_wide_lookup[mi->sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi->sb_type];
+  const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
+  MV_REF *const frame_mvs =
+      cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int w, h;
+
+  const int mis = cm->mi_stride;
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  int max_plane;
+
+  assert(mi->sb_type == bsize);
+  // Copy the stored mode info and its extension data into the current block.
+  *mi_addr = *mi;
+  *x->mbmi_ext = ctx->mbmi_ext;
+
+  // If segmentation in use
+  if (seg->enabled) {
+    // For in frame complexity AQ copy the segment id from the segment map.
+    if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      mi_addr->segment_id =
+        get_segment_id(cm, map, bsize, mi_row, mi_col);
+    }
+    // Else for cyclic refresh mode update the segment map, set the segment id
+    // and then update the quantizer.
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+      vp9_cyclic_refresh_update_segment(cpi, xd->mi[0], mi_row,
+                                        mi_col, bsize, ctx->rate, ctx->dist,
+                                        x->skip, p);
+    }
+  }
+  // Planes below max_plane use ctx buffer set 1; the remaining ones use set 2.
+  max_plane = is_inter_block(xdmi) ? MAX_MB_PLANE : 1;
+  for (i = 0; i < max_plane; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][1];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+    p[i].eobs = ctx->eobs_pbuf[i][1];
+  }
+
+  for (i = max_plane; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][2];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
+    p[i].eobs = ctx->eobs_pbuf[i][2];
+  }
+
+  // Point every mode-info entry covered by this block at mi_addr, skipping
+  // positions rejected by the mb_to_right_edge / mb_to_bottom_edge checks.
+  for (y = 0; y < mi_height; y++)
+    for (x_idx = 0; x_idx < mi_width; x_idx++)
+      if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx
+        && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+        xd->mi[x_idx + y * mis] = mi_addr;
+      }
+
+  if (cpi->oxcf.aq_mode)
+    vp9_init_plane_quantizers(cpi, x);
+  // Sub-8x8 inter blocks take their block-level MVs from bmi[3].
+  if (is_inter_block(xdmi) && xdmi->sb_type < BLOCK_8X8) {
+    xdmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+    xdmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+  }
+
+  x->skip = ctx->skip;
+  memcpy(x->zcoeff_blk[xdmi->tx_size], ctx->zcoeff_blk,
+         sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+  // What follows only updates counters and cm->cur_frame->mvs.
+  if (!output_enabled)
+    return;
+
+#if CONFIG_INTERNAL_STATS
+  if (frame_is_intra_only(cm)) {
+    static const int kf_mode_index[] = {
+      THR_DC        /*DC_PRED*/,
+      THR_V_PRED    /*V_PRED*/,
+      THR_H_PRED    /*H_PRED*/,
+      THR_D45_PRED  /*D45_PRED*/,
+      THR_D135_PRED /*D135_PRED*/,
+      THR_D117_PRED /*D117_PRED*/,
+      THR_D153_PRED /*D153_PRED*/,
+      THR_D207_PRED /*D207_PRED*/,
+      THR_D63_PRED  /*D63_PRED*/,
+      THR_TM        /*TM_PRED*/,
+    };
+    ++cpi->mode_chosen_counts[kf_mode_index[xdmi->mode]];
+  } else {
+    // Note how often each mode chosen as best
+    ++cpi->mode_chosen_counts[ctx->best_mode_index];
+  }
+#endif
+  if (!frame_is_intra_only(cm)) {
+    if (is_inter_block(xdmi)) {
+      vp9_update_mv_count(td);
+
+      if (cm->interp_filter == SWITCHABLE) {
+        const int ctx = vp9_get_pred_context_switchable_interp(xd);
+        ++td->counts->switchable_interp[ctx][xdmi->interp_filter];
+      }
+    }
+
+    rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+    rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+    rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+      rdc->filter_diff[i] += ctx->best_filter_diff[i];
+  }
+  // Store this block's reference frames and MVs in the frame MV buffer.
+  for (h = 0; h < y_mis; ++h) {
+    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+    for (w = 0; w < x_mis; ++w) {
+      MV_REF *const mv = frame_mv + w;
+      mv->ref_frame[0] = mi->ref_frame[0];
+      mv->ref_frame[1] = mi->ref_frame[1];
+      mv->mv[0].as_int = mi->mv[0].as_int;
+      mv->mv[1].as_int = mi->mv[1].as_int;
+    }
+  }
+}
+
+void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col) {  // Binds x to src.
+  uint8_t *const buffers[3] = {src->y_buffer, src->u_buffer, src->v_buffer };
+  const int strides[3] = {src->y_stride, src->uv_stride, src->uv_stride };
+  int i;
+
+  // Set current frame pointer (x->e_mbd.cur_buf) to src.
+  x->e_mbd.cur_buf = src;
+  // Set up x->plane[i].src for each plane; U and V both use uv_stride.
+  for (i = 0; i < MAX_MB_PLANE; i++)
+    setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mi_row, mi_col,
+                     NULL, x->e_mbd.plane[i].subsampling_x,
+                     x->e_mbd.plane[i].subsampling_y);
+}
+
+static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
+                                   RD_COST *rd_cost, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  INTERP_FILTER filter_ref;
+  // Take interp_filter from the above MI, else the left MI, else EIGHTTAP.
+  if (xd->up_available)
+    filter_ref = xd->mi[-xd->mi_stride]->interp_filter;
+  else if (xd->left_available)
+    filter_ref = xd->mi[-1]->interp_filter;
+  else
+    filter_ref = EIGHTTAP;
+  // Fill in a skipped ZEROMV block predicted from LAST_FRAME with a zero MV.
+  mi->sb_type = bsize;
+  mi->mode = ZEROMV;
+  mi->tx_size =
+      VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[tx_mode]);
+  mi->skip = 1;
+  mi->uv_mode = DC_PRED;
+  mi->ref_frame[0] = LAST_FRAME;
+  mi->ref_frame[1] = NONE;
+  mi->mv[0].as_int = 0;
+  mi->interp_filter = filter_ref;
+
+  xd->mi[0]->bmi[0].as_mv[0].as_int = 0;  // xd->mi[0] is mi
+  x->skip = 1;
+
+  vp9_rd_cost_init(rd_cost);  // rd_cost is (re)initialized, not computed
+}
+
+static int set_segment_rdmult(VP9_COMP *const cpi,
+                               MACROBLOCK *const x,
+                               int8_t segment_id) {  // Returns an rdmult.
+  int segment_qindex;  // qindex from vp9_get_qindex() for segment_id
+  VP9_COMMON *const cm = &cpi->common;
+  vp9_init_plane_quantizers(cpi, x);  // side effect on x, besides the return
+  vpx_clear_system_state();
+  segment_qindex = vp9_get_qindex(&cm->seg, segment_id,
+                                  cm->base_qindex);
+  return vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
+}
+
+static void rd_pick_sb_modes(VP9_COMP *cpi,
+                             TileDataEnc *tile_data,
+                             MACROBLOCK *const x,
+                             int mi_row, int mi_col, RD_COST *rd_cost,
+                             BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                             int64_t best_rd) {  // RD mode search, one block.
+  VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
+  int i, orig_rdmult;
+
+  vpx_clear_system_state();
+
+  // Use the lower precision, but faster, 32x32 fdct for mode selection.
+  x->use_lp32x32fdct = 1;
+
+  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+  mi = xd->mi[0];
+  mi->sb_type = bsize;
+  // Point the plane coefficient buffers at set 0 of ctx.
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][0];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+    p[i].eobs = ctx->eobs_pbuf[i][0];
+  }
+  ctx->is_coded = 0;
+  ctx->skippable = 0;
+  ctx->pred_pixel_ready = 0;
+  x->skip_recode = 0;
+
+  // Set to zero to make sure we do not use the previous encoded frame stats
+  mi->skip = 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    x->source_variance =
+        vp9_high_get_sby_perpixel_variance(cpi, &x->plane[0].src,
+                                           bsize, xd->bd);
+  } else {
+    x->source_variance =
+      vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  }
+#else
+  x->source_variance =
+    vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  orig_rdmult = x->rdmult;
+  // Choose the segment id and/or rdmult according to the AQ mode.
+  if (aq_mode == VARIANCE_AQ) {
+    const int energy = bsize <= BLOCK_16X16 ? x->mb_energy
+                                            : vp9_block_energy(cpi, x, bsize);
+    if (cm->frame_type == KEY_FRAME ||
+        cpi->refresh_alt_ref_frame ||
+        (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+      mi->segment_id = vp9_vaq_segment_id(energy);
+    } else {
+      const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
+                                                    : cm->last_frame_seg_map;
+      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+    }
+    x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id);
+  } else if (aq_mode == EQUATOR360_AQ) {
+    if (cm->frame_type == KEY_FRAME) {
+      mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows);
+    } else {
+      const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
+                                                    : cm->last_frame_seg_map;
+      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+    }
+    x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id);
+  } else if (aq_mode == COMPLEXITY_AQ) {
+    x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id);
+  } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+    const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
+                                                  : cm->last_frame_seg_map;
+    // If segment is boosted, use rdmult for that segment.
+    if (cyclic_refresh_segment_id_boosted(
+            get_segment_id(cm, map, bsize, mi_row, mi_col)))
+      x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+  }
+
+  // Find best coding mode & reconstruct the MB so it is available
+  // as a predictor for MBs that follow in the SB
+  if (frame_is_intra_only(cm)) {
+    vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
+  } else {
+    if (bsize >= BLOCK_8X8) {
+      if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
+        vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
+                                           ctx, best_rd);
+      else
+        vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col,
+                                  rd_cost, bsize, ctx, best_rd);
+    } else {
+      vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col,
+                                    rd_cost, bsize, ctx, best_rd);
+    }
+  }
+
+
+  // Examine the resulting rate and, for COMPLEXITY_AQ, make a segment choice.
+  if ((rd_cost->rate != INT_MAX) &&
+      (aq_mode == COMPLEXITY_AQ) && (bsize >= BLOCK_16X16) &&
+      (cm->frame_type == KEY_FRAME ||
+       cpi->refresh_alt_ref_frame ||
+       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
+    vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+  }
+  // Restore the rdmult saved before the AQ adjustments above.
+  x->rdmult = orig_rdmult;
+
+  // TODO(jingning) The rate-distortion optimization flow needs to be
+  // refactored to provide proper exit/return handle.
+  if (rd_cost->rate == INT_MAX)
+    rd_cost->rdcost = INT64_MAX;
+  // Record the resulting rate and distortion in the context.
+  ctx->rate = rd_cost->rate;
+  ctx->dist = rd_cost->dist;
+}
+
+static void update_stats(VP9_COMMON *cm, ThreadData *td) {
+  const MACROBLOCK *x = &td->mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MODE_INFO *const mi = xd->mi[0];
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const BLOCK_SIZE bsize = mi->sb_type;
+  // Update td->counts with this block's ref-frame and inter-mode choices.
+  if (!frame_is_intra_only(cm)) {  // nothing is counted on intra-only frames
+    FRAME_COUNTS *const counts = td->counts;
+    const int inter_block = is_inter_block(mi);
+    const int seg_ref_active = segfeature_active(&cm->seg, mi->segment_id,
+                                                 SEG_LVL_REF_FRAME);
+    if (!seg_ref_active) {
+      counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++;
+      // If the segment reference feature is enabled we have only a single
+      // reference frame allowed for the segment so exclude it from
+      // the reference frame counts used to work out probabilities.
+      if (inter_block) {
+        const MV_REFERENCE_FRAME ref0 = mi->ref_frame[0];
+        if (cm->reference_mode == REFERENCE_MODE_SELECT)
+          counts->comp_inter[vp9_get_reference_mode_context(cm, xd)]
+                            [has_second_ref(mi)]++;
+
+        if (has_second_ref(mi)) {
+          counts->comp_ref[vp9_get_pred_context_comp_ref_p(cm, xd)]
+                          [ref0 == GOLDEN_FRAME]++;
+        } else {
+          counts->single_ref[vp9_get_pred_context_single_ref_p1(xd)][0]
+                            [ref0 != LAST_FRAME]++;
+          if (ref0 != LAST_FRAME)
+            counts->single_ref[vp9_get_pred_context_single_ref_p2(xd)][1]
+                              [ref0 != GOLDEN_FRAME]++;
+        }
+      }
+    }
+    if (inter_block &&
+        !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) {
+      const int mode_ctx = mbmi_ext->mode_context[mi->ref_frame[0]];
+      if (bsize >= BLOCK_8X8) {
+        const PREDICTION_MODE mode = mi->mode;
+        ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+      } else {  // sub-8x8: count the mode of each sub-block in mi->bmi[]
+        const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+        const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+        int idx, idy;
+        for (idy = 0; idy < 2; idy += num_4x4_h) {
+          for (idx = 0; idx < 2; idx += num_4x4_w) {
+            const int j = idy * 2 + idx;  // index into the 2x2 bmi[] grid
+            const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+            ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+          }
+        }
+      }
+    }
+  }
+}
+
+static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col,
+                            ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                            ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                            PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+                            BLOCK_SIZE bsize) {  // Inverse of save_context().
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int p;
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  int mi_height = num_8x8_blocks_high_lookup[bsize];
+  for (p = 0; p < MAX_MB_PLANE; p++) {  // copy a/l back into xd, per plane
+    memcpy(
+        xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
+        a + num_4x4_blocks_wide * p,
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+        xd->plane[p].subsampling_x);
+    memcpy(
+        xd->left_context[p]
+            + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+        l + num_4x4_blocks_high * p,
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+        xd->plane[p].subsampling_y);
+  }
+  memcpy(xd->above_seg_context + mi_col, sa,  // sa/sl: partition contexts
+         sizeof(*xd->above_seg_context) * mi_width);
+  memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
+         sizeof(xd->left_seg_context[0]) * mi_height);
+}
+
+static void save_context(MACROBLOCK *const x, int mi_row, int mi_col,
+                         ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                         ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                         PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+                         BLOCK_SIZE bsize) {  // Fills a, l, sa and sl from xd.
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  int p;
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  int mi_height = num_8x8_blocks_high_lookup[bsize];
+
+  // Buffer the above/left entropy contexts of the block being searched.
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    memcpy(
+        a + num_4x4_blocks_wide * p,
+        xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+        xd->plane[p].subsampling_x);
+    memcpy(
+        l + num_4x4_blocks_high * p,
+        xd->left_context[p]
+            + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+        (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+        xd->plane[p].subsampling_y);
+  }
+  memcpy(sa, xd->above_seg_context + mi_col,  // sa/sl: partition contexts
+         sizeof(*xd->above_seg_context) * mi_width);
+  memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
+         sizeof(xd->left_seg_context[0]) * mi_height);
+}
+
+static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
+                     ThreadData *td,
+                     TOKENEXTRA **tp, int mi_row, int mi_col,
+                     int output_enabled, BLOCK_SIZE bsize,
+                     PICK_MODE_CONTEXT *ctx) {  // Encodes one block from ctx.
+  MACROBLOCK *const x = &td->mb;
+  set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+  update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled);
+  encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+
+  if (output_enabled) {
+    update_stats(&cpi->common, td);
+    // Append an EOSB_TOKEN to the token stream and advance the pointer.
+    (*tp)->token = EOSB_TOKEN;
+    (*tp)++;
+  }
+}
+
+static void encode_sb(VP9_COMP *cpi, ThreadData *td,
+                      const TileInfo *const tile,
+                      TOKENEXTRA **tp, int mi_row, int mi_col,
+                      int output_enabled, BLOCK_SIZE bsize,
+                      PC_TREE *pc_tree) {  // Encodes pc_tree recursively.
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  int ctx;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize = bsize;
+  // Nothing to do when the block origin lies past cm->mi_rows / cm->mi_cols.
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  if (bsize >= BLOCK_8X8) {
+    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+    subsize = get_subsize(bsize, pc_tree->partitioning);
+  } else {
+    ctx = 0;
+    subsize = BLOCK_4X4;
+  }
+
+  partition = partition_lookup[bsl][subsize];
+  if (output_enabled && bsize != BLOCK_4X4)
+    td->counts->partition[ctx][partition]++;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+               &pc_tree->none);
+      break;
+    case PARTITION_VERT:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+               &pc_tree->vertical[0]);
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+        encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled,
+                 subsize, &pc_tree->vertical[1]);
+      }
+      break;
+    case PARTITION_HORZ:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+               &pc_tree->horizontal[0]);
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+        encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled,
+                 subsize, &pc_tree->horizontal[1]);
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {
+        encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+                 pc_tree->leaf_split[0]);
+      } else {
+        encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
+                  pc_tree->split[0]);
+        encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
+                  subsize, pc_tree->split[1]);
+        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
+                  subsize, pc_tree->split[2]);
+        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+                  subsize, pc_tree->split[3]);
+      }
+      break;
+    default:
+      assert(0 && "Invalid partition type.");
+      break;
+  }
+  // For a split above 8x8 the recursive calls handle the context update.
+  if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+}
+
+// Returns a block size that fits into the rows_left x cols_left area that
+// remains in the image (both counted in 8x8 blocks).
+// If nothing remains in either direction the result is bsize capped at
+// BLOCK_8X8 and *bh / *bw are left untouched. Otherwise bsize is stepped
+// down three table entries at a time until its height and width, which are
+// written to *bh and *bw on every step, both fit.
+static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
+                                      int rows_left, int cols_left,
+                                      int *bh, int *bw) {
+  if (rows_left <= 0 || cols_left <= 0)
+    return VPXMIN(bsize, BLOCK_8X8);
+
+  while (bsize > 0) {
+    *bh = num_8x8_blocks_high_lookup[bsize];
+    *bw = num_8x8_blocks_wide_lookup[bsize];
+    if (*bh <= rows_left && *bw <= cols_left)
+      return bsize;
+    bsize -= 3;
+  }
+  return bsize;
+}
+
+// Fills in the mode info pointers and block sizes of an SB64 that is only
+// partly inside the image. Each visited position gets the size returned by
+// find_partition_size(); that call may also shrink the row / column step
+// used to reach the next position. The column step restarts from bw_in on
+// every row, the row step carries over from the last call in the row.
+static void set_partial_b64x64_partition(MODE_INFO *mi, int mis,
+    int bh_in, int bw_in, int row8x8_remaining, int col8x8_remaining,
+    BLOCK_SIZE bsize, MODE_INFO **mi_8x8) {
+  int row_step = bh_in;
+  int row = 0;
+
+  while (row < MI_BLOCK_SIZE) {
+    int col_step = bw_in;
+    int col = 0;
+
+    while (col < MI_BLOCK_SIZE) {
+      const int offset = row * mis + col;
+      MODE_INFO *const entry = mi + offset;
+
+      mi_8x8[offset] = entry;
+      entry->sb_type = find_partition_size(bsize,
+                                           row8x8_remaining - row,
+                                           col8x8_remaining - col,
+                                           &row_step, &col_step);
+      col += col_step;
+    }
+    row += row_step;
+  }
+}
+
+// Tries to give every mode info entry of the SB64 at (mi_row, mi_col) the
+// same block partition size, bsize.
+// An SB64 that extends past the bottom or right edge of the tile cannot
+// always use the requested size; that case is delegated to
+// set_partial_b64x64_partition(), which picks the largest size that fits.
+static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
+                                   MODE_INFO **mi_8x8, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int stride = cm->mi_stride;
+  const int row8x8_remaining = tile->mi_row_end - mi_row;
+  const int col8x8_remaining = tile->mi_col_end - mi_col;
+  MODE_INFO *const origin = cm->mi + mi_row * stride + mi_col;
+  const int step_rows = num_8x8_blocks_high_lookup[bsize];
+  const int step_cols = num_8x8_blocks_wide_lookup[bsize];
+  int r, c;
+
+  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+  // Partial SB64: let the helper choose sizes position by position.
+  if (col8x8_remaining < MI_BLOCK_SIZE || row8x8_remaining < MI_BLOCK_SIZE) {
+    set_partial_b64x64_partition(origin, stride, step_rows, step_cols,
+                                 row8x8_remaining, col8x8_remaining, bsize,
+                                 mi_8x8);
+    return;
+  }
+
+  // The SB64 is all "in image": apply the requested size on a regular grid.
+  for (r = 0; r < MI_BLOCK_SIZE; r += step_rows) {
+    for (c = 0; c < MI_BLOCK_SIZE; c += step_cols) {
+      const int offset = r * stride + c;
+      MODE_INFO *const entry = origin + offset;
+
+      mi_8x8[offset] = entry;
+      entry->sb_type = bsize;
+    }
+  }
+}
+
+// (row, col) offsets, in mode info units, of the sixteen 16x16 blocks of an
+// SB64. Entries are grouped by 32x32 quadrant: entries 4 * i .. 4 * i + 3
+// belong to 32x32 index i.
+static const struct {
+  int row;
+  int col;
+} coord_lookup[16] = {
+  { 0, 0 }, { 0, 2 }, { 2, 0 }, { 2, 2 },  // 32x32 index = 0
+  { 0, 4 }, { 0, 6 }, { 2, 4 }, { 2, 6 },  // 32x32 index = 1
+  { 4, 0 }, { 4, 2 }, { 6, 0 }, { 6, 2 },  // 32x32 index = 2
+  { 4, 4 }, { 4, 6 }, { 6, 4 }, { 6, 6 },  // 32x32 index = 3
+};
+
+// Chooses partition sizes for the SB64 at (mi_row, mi_col) from the entries
+// of cpi->source_diff_var.
+// For an SB64 that lies fully inside the tile:
+//   - every 16x16 block is first marked BLOCK_16X16;
+//   - a 32x32 quadrant is marked BLOCK_32X32 when the var of all four of its
+//     16x16 entries is below cpi->source_var_thresh;
+//   - if all four quadrants were marked BLOCK_32X32 and each combined var is
+//     below twice the threshold, the top-left entry is marked BLOCK_64X64.
+// A partial SB64 is handed to set_partial_b64x64_partition() with
+// BLOCK_16X16 as the requested size.
+static void set_source_var_based_partition(VP9_COMP *cpi,
+                                           const TileInfo *const tile,
+                                           MACROBLOCK *const x,
+                                           MODE_INFO **mi_8x8,
+                                           int mi_row, int mi_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int mis = cm->mi_stride;
+  const int row8x8_remaining = tile->mi_row_end - mi_row;
+  const int col8x8_remaining = tile->mi_col_end - mi_col;
+  MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col;
+
+  vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+
+  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+  // In-image SB64
+  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
+      (row8x8_remaining >= MI_BLOCK_SIZE)) {
+    int i, j;
+    int index;
+    diff d32[4];
+    // Position of this SB64's first entry in source_diff_var; the table is
+    // indexed with halved mi coordinates and a row pitch of cm->mb_cols.
+    const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1);
+    int is_larger_better = 0;
+    int use32x32 = 0;
+    unsigned int thr = cpi->source_var_thresh;
+
+    memset(d32, 0, 4 * sizeof(diff));
+
+    // i walks the four 32x32 quadrants, j the four 16x16 blocks inside each.
+    for (i = 0; i < 4; i++) {
+      diff *d16[4];
+
+      for (j = 0; j < 4; j++) {
+        int b_mi_row = coord_lookup[i * 4 + j].row;
+        int b_mi_col = coord_lookup[i * 4 + j].col;
+        int boffset = b_mi_row / 2 * cm->mb_cols +
+                      b_mi_col / 2;
+
+        d16[j] = cpi->source_diff_var + offset + boffset;
+
+        index = b_mi_row * mis + b_mi_col;
+        mi_8x8[index] = mi_upper_left + index;
+        mi_8x8[index]->sb_type = BLOCK_16X16;
+
+        // TODO(yunqingwang): If d16[j].var is very large, use 8x8 partition
+        // size to further improve quality.
+      }
+
+      is_larger_better = (d16[0]->var < thr) && (d16[1]->var < thr) &&
+          (d16[2]->var < thr) && (d16[3]->var < thr);
+
+      // Use 32x32 partition
+      if (is_larger_better) {
+        use32x32 += 1;
+
+        for (j = 0; j < 4; j++) {
+          d32[i].sse += d16[j]->sse;
+          d32[i].sum += d16[j]->sum;
+        }
+
+        // NOTE(review): the >> 10 presumably divides by the 1024 pixels of a
+        // 32x32 block - confirm against how source_diff_var is filled.
+        d32[i].var = d32[i].sse - (((int64_t)d32[i].sum * d32[i].sum) >> 10);
+
+        // Only the quadrant's first (top-left) entry is promoted.
+        index = coord_lookup[i*4].row * mis + coord_lookup[i*4].col;
+        mi_8x8[index] = mi_upper_left + index;
+        mi_8x8[index]->sb_type = BLOCK_32X32;
+      }
+    }
+
+    // 64x64 is only considered when every quadrant was promoted to 32x32;
+    // the threshold is doubled for this comparison.
+    if (use32x32 == 4) {
+      thr <<= 1;
+      is_larger_better = (d32[0].var < thr) && (d32[1].var < thr) &&
+          (d32[2].var < thr) && (d32[3].var < thr);
+
+      // Use 64x64 partition
+      if (is_larger_better) {
+        mi_8x8[0] = mi_upper_left;
+        mi_8x8[0]->sb_type = BLOCK_64X64;
+      }
+    }
+  } else {   // partial in-image SB64
+    int bh = num_8x8_blocks_high_lookup[BLOCK_16X16];
+    int bw = num_8x8_blocks_wide_lookup[BLOCK_16X16];
+    set_partial_b64x64_partition(mi_upper_left, mis, bh, bw,
+        row8x8_remaining, col8x8_remaining, BLOCK_16X16, mi_8x8);
+  }
+}
+
+// Commits the mode decision stored in ctx for the block at (mi_row, mi_col):
+// copies ctx->mic and ctx->mbmi_ext into the current block, refreshes the
+// segment id when segmentation and an AQ mode are enabled, updates the
+// inter-block counters, optionally stores the block's reference frames and
+// motion vectors into cm->cur_frame->mvs, and copies the skip flags from ctx
+// into x.
+static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
+                            PICK_MODE_CONTEXT *ctx,
+                            int mi_row, int mi_col, int bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  struct macroblock_plane *const p = x->plane;
+  const struct segmentation *const seg = &cm->seg;
+  // NOTE(review): bw / bh are taken from the sb_type that mi holds before
+  // ctx->mic is copied over it below - confirm this ordering is intended.
+  const int bw = num_8x8_blocks_wide_lookup[mi->sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi->sb_type];
+  // Block extent clipped to the frame, in mode info units.
+  const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
+
+  // mi points at the same storage as xd->mi[0], so this also updates *mi.
+  *(xd->mi[0]) = ctx->mic;
+  *(x->mbmi_ext) = ctx->mbmi_ext;
+
+  if (seg->enabled && cpi->oxcf.aq_mode) {
+    // For in frame complexity AQ or variance AQ, copy segment_id from
+    // segmentation_map.
+    if (cpi->oxcf.aq_mode == COMPLEXITY_AQ ||
+        cpi->oxcf.aq_mode == VARIANCE_AQ ||
+        cpi->oxcf.aq_mode == EQUATOR360_AQ) {
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+    } else {
+    // Setting segmentation map for cyclic_refresh.
+      vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize,
+                                        ctx->rate, ctx->dist, x->skip, p);
+    }
+    vp9_init_plane_quantizers(cpi, x);
+  }
+
+  if (is_inter_block(mi)) {
+    vp9_update_mv_count(td);
+    if (cm->interp_filter == SWITCHABLE) {
+      const int pred_ctx = vp9_get_pred_context_switchable_interp(xd);
+      ++td->counts->switchable_interp[pred_ctx][mi->interp_filter];
+    }
+
+    // Below 8x8 the block-level MVs are taken from sub-block index 3.
+    if (mi->sb_type < BLOCK_8X8) {
+      mi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+      mi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+    }
+  }
+
+  // Store this block's reference frames and MVs for every mode info position
+  // it covers, either when previous-frame MVs are in use or for SVC spatial
+  // layers below the top one when use_base_mv is set.
+  if (cm->use_prev_frame_mvs ||
+      (cpi->svc.use_base_mv && cpi->svc.number_spatial_layers > 1
+        && cpi->svc.spatial_layer_id != cpi->svc.number_spatial_layers - 1)) {
+    MV_REF *const frame_mvs =
+        cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+    int w, h;
+
+    for (h = 0; h < y_mis; ++h) {
+      MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+      for (w = 0; w < x_mis; ++w) {
+        MV_REF *const mv = frame_mv + w;
+        mv->ref_frame[0] = mi->ref_frame[0];
+        mv->ref_frame[1] = mi->ref_frame[1];
+        mv->mv[0].as_int = mi->mv[0].as_int;
+        mv->mv[1].as_int = mi->mv[1].as_int;
+      }
+    }
+  }
+
+  x->skip = ctx->skip;
+  // skip_txfm[0] is forced to 0 whenever the block has a non-zero segment id.
+  x->skip_txfm[0] = mi->segment_id ? 0 : ctx->skip_txfm[0];
+}
+
+// Encodes one block at (mi_row, mi_col): sets up the offsets, commits the
+// mode decision held in ctx, encodes the block, updates the statistics and
+// finally appends an EOSB_TOKEN entry to the token stream *tp.
+static void encode_b_rt(VP9_COMP *cpi, ThreadData *td,
+                        const TileInfo *const tile,
+                        TOKENEXTRA **tp, int mi_row, int mi_col,
+                        int output_enabled, BLOCK_SIZE bsize,
+                        PICK_MODE_CONTEXT *ctx) {
+  TOKENEXTRA *end_marker;
+
+  set_offsets(cpi, tile, &td->mb, mi_row, mi_col, bsize);
+  update_state_rt(cpi, td, ctx, mi_row, mi_col, bsize);
+
+  encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+  update_stats(&cpi->common, td);
+
+  // Read *tp only now: it is passed by address to encode_superblock() above.
+  end_marker = *tp;
+  end_marker->token = EOSB_TOKEN;
+  *tp = end_marker + 1;
+}
+
+// Recursively encodes the block of size bsize at (mi_row, mi_col) using the
+// partitioning already recorded in cm->mi_grid_visible. The partition type
+// is derived from the sb_type stored at this position, counted in
+// td->counts when output_enabled is set, and the resulting sub-blocks are
+// encoded through encode_b_rt() or, for PARTITION_SPLIT, through recursive
+// calls on the four quadrants. Positions outside the frame are skipped.
+static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td,
+                         const TileInfo *const tile,
+                         TOKENEXTRA **tp, int mi_row, int mi_col,
+                         int output_enabled, BLOCK_SIZE bsize,
+                         PC_TREE *pc_tree) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  // hbs is the offset, in mode info units, to the second half of the block.
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  int ctx;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  if (bsize >= BLOCK_8X8) {
+    const int idx_str = xd->mi_stride * mi_row + mi_col;
+    MODE_INFO ** mi_8x8 = cm->mi_grid_visible + idx_str;
+    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+    subsize = mi_8x8[0]->sb_type;
+  } else {
+    ctx = 0;
+    subsize = BLOCK_4X4;
+  }
+
+  partition = partition_lookup[bsl][subsize];
+  if (output_enabled && bsize != BLOCK_4X4)
+    td->counts->partition[ctx][partition]++;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
+                  &pc_tree->none);
+      break;
+    case PARTITION_VERT:
+      encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
+                  &pc_tree->vertical[0]);
+      // The right half is only encoded when it starts inside the frame and
+      // the parent block is larger than 8x8.
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+        encode_b_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
+                    subsize, &pc_tree->vertical[1]);
+      }
+      break;
+    case PARTITION_HORZ:
+      encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
+                  &pc_tree->horizontal[0]);
+      // Same rule as above for the bottom half.
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+        encode_b_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
+                    subsize, &pc_tree->horizontal[1]);
+      }
+      break;
+    case PARTITION_SPLIT:
+      // subsize is replaced by the split size of bsize before recursing; the
+      // recursive calls bounds-check their own position.
+      subsize = get_subsize(bsize, PARTITION_SPLIT);
+      encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
+                   pc_tree->split[0]);
+      encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
+                   subsize, pc_tree->split[1]);
+      encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
+                   subsize, pc_tree->split[2]);
+      encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs,
+                   output_enabled, subsize, pc_tree->split[3]);
+      break;
+    default:
+      assert(0 && "Invalid partition type.");
+      break;
+  }
+
+  // For a split of anything other than 8x8 the partition context is not
+  // updated at this level.
+  if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+}
+
+// Evaluates the partitioning already stored in mi_8x8 for the block of size
+// bsize at (mi_row, mi_col) and, when the speed features allow it, compares
+// that choice with PARTITION_NONE and with a one-level split into four
+// PARTITION_NONE sub-blocks. The partitioning kept in pc_tree->partitioning
+// is updated according to the rdcost comparisons at the end, and the rate
+// and distortion of the chosen option are returned through *rate and *dist.
+// When do_recon is non-zero the block is also encoded with encode_sb();
+// output is enabled only for BLOCK_64X64.
+static void rd_use_partition(VP9_COMP *cpi,
+                             ThreadData *td,
+                             TileDataEnc *tile_data,
+                             MODE_INFO **mi_8x8, TOKENEXTRA **tp,
+                             int mi_row, int mi_col,
+                             BLOCK_SIZE bsize,
+                             int *rate, int64_t *dist,
+                             int do_recon, PC_TREE *pc_tree) {
+  VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mis = cm->mi_stride;
+  const int bsl = b_width_log2_lookup[bsize];
+  const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2;
+  const int bss = (1 << bsl) / 4;
+  int i, pl;
+  PARTITION_TYPE partition = PARTITION_NONE;
+  BLOCK_SIZE subsize;
+  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  // last_part_rdc: cost of the stored partitioning; none_rdc: cost of
+  // PARTITION_NONE; chosen_rdc: cost of the split trial, later the winner.
+  RD_COST last_part_rdc, none_rdc, chosen_rdc;
+  BLOCK_SIZE sub_subsize = BLOCK_4X4;
+  int splits_below = 0;
+  BLOCK_SIZE bs_type = mi_8x8[0]->sb_type;
+  // Always 1 in this function; it never disables the searches below.
+  int do_partition_search = 1;
+  PICK_MODE_CONTEXT *ctx = &pc_tree->none;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  assert(num_4x4_blocks_wide_lookup[bsize] ==
+         num_4x4_blocks_high_lookup[bsize]);
+
+  vp9_rd_cost_reset(&last_part_rdc);
+  vp9_rd_cost_reset(&none_rdc);
+  vp9_rd_cost_reset(&chosen_rdc);
+
+  partition = partition_lookup[bsl][bs_type];
+  subsize = get_subsize(bsize, partition);
+
+  pc_tree->partitioning = partition;
+  // Saved so the trials below can be undone with restore_context().
+  save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+  if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+    x->mb_energy = vp9_block_energy(cpi, x, bsize);
+  }
+
+  if (do_partition_search &&
+      cpi->sf.partition_search_type == SEARCH_PARTITION &&
+      cpi->sf.adjust_partitioning_from_last_frame) {
+    // Check if any of the sub blocks are further split.
+    if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
+      sub_subsize = get_subsize(subsize, PARTITION_SPLIT);
+      splits_below = 1;
+      for (i = 0; i < 4; i++) {
+        int jj = i >> 1, ii = i & 0x01;
+        MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss];
+        if (this_mi && this_mi->sb_type >= sub_subsize) {
+          splits_below = 0;
+        }
+      }
+    }
+
+    // If partition is not none try none unless each of the 4 splits are split
+    // even further..
+    if (partition != PARTITION_NONE && !splits_below &&
+        mi_row + (mi_step >> 1) < cm->mi_rows &&
+        mi_col + (mi_step >> 1) < cm->mi_cols) {
+      pc_tree->partitioning = PARTITION_NONE;
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+                       ctx, INT64_MAX);
+
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+
+      if (none_rdc.rate < INT_MAX) {
+        none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, none_rdc.rate,
+                                 none_rdc.dist);
+      }
+
+      // Undo the trial: restore the contexts, the stored block size and the
+      // partitioning before evaluating the stored partition.
+      restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+      mi_8x8[0]->sb_type = bs_type;
+      pc_tree->partitioning = partition;
+    }
+  }
+
+  // Cost of the partitioning that was stored in mi_8x8 on entry.
+  switch (partition) {
+    case PARTITION_NONE:
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                       bsize, ctx, INT64_MAX);
+      break;
+    case PARTITION_HORZ:
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                       subsize, &pc_tree->horizontal[0],
+                       INT64_MAX);
+      if (last_part_rdc.rate != INT_MAX &&
+          bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) {
+        RD_COST tmp_rdc;
+        // Shadows the function-level ctx for the first half only.
+        PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
+        vp9_rd_cost_init(&tmp_rdc);
+        // The first half is committed and encoded (output disabled) before
+        // modes are picked for the second half.
+        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+        rd_pick_sb_modes(cpi, tile_data, x,
+                         mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
+                         subsize, &pc_tree->horizontal[1], INT64_MAX);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          vp9_rd_cost_reset(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
+      }
+      break;
+    case PARTITION_VERT:
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                       subsize, &pc_tree->vertical[0], INT64_MAX);
+      if (last_part_rdc.rate != INT_MAX &&
+          bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
+        RD_COST tmp_rdc;
+        // Shadows the function-level ctx for the first half only.
+        PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
+        vp9_rd_cost_init(&tmp_rdc);
+        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+        // NOTE(review): the second half uses vertical[bsize > BLOCK_8X8],
+        // whereas the HORZ case above always uses horizontal[1] - confirm
+        // the difference is intended.
+        rd_pick_sb_modes(cpi, tile_data, x,
+                         mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
+                         subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
+                         INT64_MAX);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          vp9_rd_cost_reset(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+        last_part_rdc.rdcost += tmp_rdc.rdcost;
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {
+        rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+                         subsize, pc_tree->leaf_split[0], INT64_MAX);
+        break;
+      }
+      last_part_rdc.rate = 0;
+      last_part_rdc.dist = 0;
+      last_part_rdc.rdcost = 0;
+      // Recurse into the four quadrants; only rate and dist are accumulated
+      // here, rdcost is recomputed after the switch.
+      for (i = 0; i < 4; i++) {
+        int x_idx = (i & 1) * (mi_step >> 1);
+        int y_idx = (i >> 1) * (mi_step >> 1);
+        int jj = i >> 1, ii = i & 0x01;
+        RD_COST tmp_rdc;
+        if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+          continue;
+
+        vp9_rd_cost_init(&tmp_rdc);
+        // do_recon is passed as (i != 3): the last quadrant is not
+        // reconstructed by the recursive call.
+        rd_use_partition(cpi, td, tile_data,
+                         mi_8x8 + jj * bss * mis + ii * bss, tp,
+                         mi_row + y_idx, mi_col + x_idx, subsize,
+                         &tmp_rdc.rate, &tmp_rdc.dist,
+                         i != 3, pc_tree->split[i]);
+        if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+          vp9_rd_cost_reset(&last_part_rdc);
+          break;
+        }
+        last_part_rdc.rate += tmp_rdc.rate;
+        last_part_rdc.dist += tmp_rdc.dist;
+      }
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  // Add the cost of signalling the stored partition type.
+  pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+  if (last_part_rdc.rate < INT_MAX) {
+    last_part_rdc.rate += cpi->partition_cost[pl][partition];
+    last_part_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                  last_part_rdc.rate, last_part_rdc.dist);
+  }
+
+  // When the stored partition is not a split, also try splitting once into
+  // four PARTITION_NONE sub-blocks and accumulate the result in chosen_rdc.
+  if (do_partition_search
+      && cpi->sf.adjust_partitioning_from_last_frame
+      && cpi->sf.partition_search_type == SEARCH_PARTITION
+      && partition != PARTITION_SPLIT && bsize > BLOCK_8X8
+      && (mi_row + mi_step < cm->mi_rows ||
+          mi_row + (mi_step >> 1) == cm->mi_rows)
+      && (mi_col + mi_step < cm->mi_cols ||
+          mi_col + (mi_step >> 1) == cm->mi_cols)) {
+    BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
+    chosen_rdc.rate = 0;
+    chosen_rdc.dist = 0;
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+    pc_tree->partitioning = PARTITION_SPLIT;
+
+    // Split partition.
+    for (i = 0; i < 4; i++) {
+      int x_idx = (i & 1) * (mi_step >> 1);
+      int y_idx = (i >> 1) * (mi_step >> 1);
+      RD_COST tmp_rdc;
+      // Loop-local buffers that shadow the function-level ones, so the
+      // context saved on entry is not overwritten here.
+      ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+      PARTITION_CONTEXT sl[8], sa[8];
+
+      if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+        continue;
+
+      save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+      pc_tree->split[i]->partitioning = PARTITION_NONE;
+      rd_pick_sb_modes(cpi, tile_data, x,
+                       mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+                       split_subsize, &pc_tree->split[i]->none, INT64_MAX);
+
+      restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+      if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+        vp9_rd_cost_reset(&chosen_rdc);
+        break;
+      }
+
+      chosen_rdc.rate += tmp_rdc.rate;
+      chosen_rdc.dist += tmp_rdc.dist;
+
+      // All but the last sub-block are encoded with output disabled.
+      if (i != 3)
+        encode_sb(cpi, td, tile_info, tp,  mi_row + y_idx, mi_col + x_idx, 0,
+                  split_subsize, pc_tree->split[i]);
+
+      pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
+                                   split_subsize);
+      chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+    }
+    pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+    if (chosen_rdc.rate < INT_MAX) {
+      chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+      chosen_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                 chosen_rdc.rate, chosen_rdc.dist);
+    }
+  }
+
+  // If last_part is better set the partitioning to that.
+  if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
+    mi_8x8[0]->sb_type = bsize;
+    if (bsize >= BLOCK_8X8)
+      pc_tree->partitioning = partition;
+    chosen_rdc = last_part_rdc;
+  }
+  // If none was better set the partitioning to that.
+  if (none_rdc.rdcost < chosen_rdc.rdcost) {
+    if (bsize >= BLOCK_8X8)
+      pc_tree->partitioning = PARTITION_NONE;
+    chosen_rdc = none_rdc;
+  }
+
+  restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+  // We must have chosen a partitioning and encoding or we'll fail later on.
+  // No other opportunities for success.
+  if (bsize == BLOCK_64X64)
+    assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+  if (do_recon) {
+    int output_enabled = (bsize == BLOCK_64X64);
+    encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
+              pc_tree);
+  }
+
+  *rate = chosen_rdc.rate;
+  *dist = chosen_rdc.dist;
+}
+
+// Relaxed lower bound for the partition search, indexed by an observed
+// block size. Entries are grouped by the value they map to.
+static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
+  BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
+  BLOCK_8X8, BLOCK_8X8, BLOCK_8X8,
+  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16
+};
+
+// Relaxed upper bound for the partition search, indexed by an observed
+// block size. Entries are grouped by the value they map to.
+static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
+  BLOCK_8X8,
+  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
+  BLOCK_32X32, BLOCK_32X32, BLOCK_32X32,
+  BLOCK_64X64, BLOCK_64X64, BLOCK_64X64,
+  BLOCK_64X64, BLOCK_64X64, BLOCK_64X64
+};
+
+
+// Scans every mode info entry of one 64x64 superblock, starting at mi_8x8,
+// and folds the sb_type values found into *min_block_size, *max_block_size
+// and the bs_hist histogram. A NULL entry is counted as block size 0.
+// The region size is fixed at MI_BLOCK_SIZE x MI_BLOCK_SIZE for now but
+// could be turned into a parameter.
+//
+// The caller must initialize the min and max before the first call; they are
+// only ever tightened here, so repeated calls accumulate a range over more
+// than one sb64.
+static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8,
+                                        BLOCK_SIZE *min_block_size,
+                                        BLOCK_SIZE *max_block_size,
+                                        int bs_hist[BLOCK_SIZES]) {
+  const int rows = MI_BLOCK_SIZE;
+  const int cols = MI_BLOCK_SIZE;
+  int r, c;
+
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+      const MODE_INFO *const entry = mi_8x8[r * xd->mi_stride + c];
+      BLOCK_SIZE sb_type = 0;
+
+      if (entry)
+        sb_type = entry->sb_type;
+
+      bs_hist[sb_type]++;
+      *min_block_size = VPXMIN(*min_block_size, sb_type);
+      *max_block_size = VPXMAX(*max_block_size, sb_type);
+    }
+  }
+}
+
+// For each block size, the square block size that is less than or equal
+// to it.
+static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
+  BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,
+  BLOCK_8X8,   BLOCK_8X8,   BLOCK_16X16, BLOCK_16X16,
+  BLOCK_16X16, BLOCK_32X32, BLOCK_32X32, BLOCK_32X32,
+  BLOCK_64X64
+};
+
+// Look at neighboring blocks and set a min and max partition size based on
+// what they chose.
+// The range is gathered from the co-located SB64 of the previous frame (on
+// non-key frames) and from the left and above SB64s when they are available,
+// then adjusted for the image border, the active edge and the
+// use_square_partition_only speed feature. Results are returned through
+// *min_block_size and *max_block_size.
+static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
+                                    MACROBLOCKD *const xd,
+                                    int mi_row, int mi_col,
+                                    BLOCK_SIZE *min_block_size,
+                                    BLOCK_SIZE *max_block_size) {
+  VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO **mi = xd->mi;
+  const int left_in_image = xd->left_available && mi[-1];
+  const int above_in_image = xd->up_available && mi[-xd->mi_stride];
+  const int row8x8_remaining = tile->mi_row_end - mi_row;
+  const int col8x8_remaining = tile->mi_col_end - mi_col;
+  // Only written by find_partition_size() below; not read afterwards.
+  int bh, bw;
+  // Full range, used as is when no prediction source is available.
+  BLOCK_SIZE min_size = BLOCK_4X4;
+  BLOCK_SIZE max_size = BLOCK_64X64;
+  // Filled by get_sb_partition_size_range() but not otherwise used here.
+  int bs_hist[BLOCK_SIZES] = {0};
+
+  // Trap case where we do not have a prediction.
+  if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) {
+    // Default "min to max" and "max to min"
+    min_size = BLOCK_64X64;
+    max_size = BLOCK_4X4;
+
+    // NOTE: each call to get_sb_partition_size_range() uses the previous
+    // passed in values for min and max as a starting point.
+    // Find the min and max partition used in previous frame at this location
+    if (cm->frame_type != KEY_FRAME) {
+      MODE_INFO **prev_mi =
+          &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
+      get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size, bs_hist);
+    }
+    // Find the min and max partition sizes used in the left SB64
+    if (left_in_image) {
+      MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE];
+      get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size,
+                                  bs_hist);
+    }
+    // Find the min and max partition sizes used in the above SB64.
+    if (above_in_image) {
+      MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE];
+      get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size,
+                                  bs_hist);
+    }
+
+    // Adjust observed min and max for "relaxed" auto partition case.
+    if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
+      min_size = min_partition_size[min_size];
+      max_size = max_partition_size[max_size];
+    }
+  }
+
+  // Check border cases where max and min from neighbors may not be legal.
+  max_size = find_partition_size(max_size,
+                                 row8x8_remaining, col8x8_remaining,
+                                 &bh, &bw);
+  // Test for blocks at the edge of the active image.
+  // This may be the actual edge of the image or where there are formatting
+  // bars.
+  if (vp9_active_edge_sb(cpi, mi_row, mi_col)) {
+    min_size = BLOCK_4X4;
+  } else {
+    // The minimum never exceeds the maximum or the configured limit.
+    min_size =
+        VPXMIN(cpi->sf.rd_auto_partition_min_limit, VPXMIN(min_size, max_size));
+  }
+
+  // When use_square_partition_only is true, make sure at least one square
+  // partition is allowed by selecting the next smaller square size as
+  // *min_block_size.
+  if (cpi->sf.use_square_partition_only &&
+      next_square_size[max_size] < min_size) {
+     min_size = next_square_size[max_size];
+  }
+
+  *min_block_size = min_size;
+  *max_block_size = max_size;
+}
+
+// TODO(jingning) refactor functions setting partition search range
+// Derives the partition size range [*min_bs, *max_bs] for the block of size
+// bsize at (mi_row, mi_col) from three sources: the co-located entries of
+// the previous frame's mode info grid, the column of entries to the left and
+// the row of entries above (when available). A NULL entry counts as bsize.
+// If the observed minimum and maximum coincide, the range is widened through
+// min_partition_size[] / max_partition_size[].
+static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                BLOCK_SIZE *min_bs, BLOCK_SIZE *max_bs) {
+  const int stride = cm->mi_stride;
+  const int cols = num_8x8_blocks_wide_lookup[bsize];
+  const int rows = num_8x8_blocks_high_lookup[bsize];
+  MODE_INFO **const prev_mi =
+      &cm->prev_mi_grid_visible[stride * mi_row + mi_col];
+  BLOCK_SIZE smallest = BLOCK_64X64;
+  BLOCK_SIZE largest = BLOCK_4X4;
+  int r, c;
+
+  // Co-located block in the previous frame.
+  if (prev_mi) {
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; ++c) {
+        const MODE_INFO *const entry = prev_mi[r * stride + c];
+        const BLOCK_SIZE bs = entry ? entry->sb_type : bsize;
+        smallest = VPXMIN(smallest, bs);
+        largest = VPXMAX(largest, bs);
+      }
+    }
+  }
+
+  // Column immediately to the left of the block.
+  if (xd->left_available) {
+    for (r = 0; r < rows; ++r) {
+      const MODE_INFO *const entry = xd->mi[r * stride - 1];
+      const BLOCK_SIZE bs = entry ? entry->sb_type : bsize;
+      smallest = VPXMIN(smallest, bs);
+      largest = VPXMAX(largest, bs);
+    }
+  }
+
+  // Row immediately above the block.
+  if (xd->up_available) {
+    for (c = 0; c < cols; ++c) {
+      const MODE_INFO *const entry = xd->mi[c - stride];
+      const BLOCK_SIZE bs = entry ? entry->sb_type : bsize;
+      smallest = VPXMIN(smallest, bs);
+      largest = VPXMAX(largest, bs);
+    }
+  }
+
+  // A single observed size gives no range, so relax it via the tables.
+  if (smallest == largest) {
+    smallest = min_partition_size[smallest];
+    largest = max_partition_size[largest];
+  }
+
+  *min_bs = smallest;
+  *max_bs = largest;
+}
+
+// Copies x->pred_mv into ctx->pred_mv.
+static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+  const size_t bytes = sizeof(x->pred_mv);
+
+  memcpy(ctx->pred_mv, x->pred_mv, bytes);
+}
+
+// Copies ctx->pred_mv back into x->pred_mv.
+static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+  const size_t bytes = sizeof(x->pred_mv);
+
+  memcpy(x->pred_mv, ctx->pred_mv, bytes);
+}
+
+#if CONFIG_FP_MB_STATS
+// Per-block-size tables used when CONFIG_FP_MB_STATS is enabled; each has
+// one entry per BLOCK_SIZE value.
+const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = {
+  1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4
+};
+const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = {
+  1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4
+};
+const int qindex_skip_threshold_lookup[BLOCK_SIZES] = {
+  0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120
+};
+const int qindex_split_threshold_lookup[BLOCK_SIZES] = {
+  0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120
+};
+const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = {
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6
+};
+
+// Motion direction codes. The numeric values matter:
+// get_motion_inconsistency() compares the absolute difference of two codes.
+typedef enum {
+  MV_ZERO    = 0,
+  MV_LEFT    = 1,
+  MV_UP      = 2,
+  MV_RIGHT   = 3,
+  MV_DOWN    = 4,
+  MV_INVALID = 5
+} MOTION_DIRECTION;
+
+// Maps a first-pass stats byte to a motion direction. The masks are tested
+// in the order zero, left, right, up; the first one set decides the result,
+// and MV_DOWN is returned when none of them is set.
+static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) {
+  if (fp_byte & FPMB_MOTION_ZERO_MASK)
+    return MV_ZERO;
+  if (fp_byte & FPMB_MOTION_LEFT_MASK)
+    return MV_LEFT;
+  if (fp_byte & FPMB_MOTION_RIGHT_MASK)
+    return MV_RIGHT;
+  if (fp_byte & FPMB_MOTION_UP_MASK)
+    return MV_UP;
+  return MV_DOWN;
+}
+
+// Scores how much two motion directions disagree: 0 when they are equal,
+// 2 when their codes differ by exactly 2, and 1 otherwise.
+static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
+                                           MOTION_DIRECTION that_mv) {
+  int delta;
+
+  if (this_mv == that_mv)
+    return 0;
+
+  delta = abs(this_mv - that_mv);
+  if (delta == 2)
+    return 2;
+  return 1;
+}
+#endif
+
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// unlikely to be selected depending on previous rate-distortion optimization
+// results, for encoding speed-up.
+// Recursive RD partition search at (mi_row, mi_col): tries NONE, SPLIT, HORZ
+// and VERT as allowed; best choice goes to pc_tree, its cost to *rd_cost.
+static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
+                              TileDataEnc *tile_data,
+                              TOKENEXTRA **tp, int mi_row, int mi_col,
+                              BLOCK_SIZE bsize, RD_COST *rd_cost,
+                              int64_t best_rd, PC_TREE *pc_tree) {
+  VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
+  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  TOKENEXTRA *tp_orig = *tp;
+  PICK_MODE_CONTEXT *ctx = &pc_tree->none;
+  int i, pl;
+  BLOCK_SIZE subsize;
+  RD_COST this_rdc, sum_rdc, best_rdc;
+  int do_split = bsize >= BLOCK_8X8;
+  int do_rect = 1;
+
+  // Override skipping rectangular partition operations for edge blocks
+  const int force_horz_split = (mi_row + mi_step >= cm->mi_rows);
+  const int force_vert_split = (mi_col + mi_step >= cm->mi_cols);
+  const int xss = x->e_mbd.plane[1].subsampling_x;
+  const int yss = x->e_mbd.plane[1].subsampling_y;
+
+  BLOCK_SIZE min_size = x->min_partition_size;
+  BLOCK_SIZE max_size = x->max_partition_size;
+
+#if CONFIG_FP_MB_STATS
+  unsigned int src_diff_var = UINT_MAX;
+  int none_complexity = 0;
+#endif
+
+  int partition_none_allowed = !force_horz_split && !force_vert_split;
+  int partition_horz_allowed = !force_vert_split && yss <= xss &&
+                               bsize >= BLOCK_8X8;
+  int partition_vert_allowed = !force_horz_split && xss <= yss &&
+                               bsize >= BLOCK_8X8;
+
+  int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr;
+  int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr;
+
+  (void)*tp_orig;
+
+  assert(num_8x8_blocks_wide_lookup[bsize] ==
+             num_8x8_blocks_high_lookup[bsize]);
+
+  // Adjust dist breakout threshold according to the partition size.
+  dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] +
+      b_height_log2_lookup[bsize]);
+  rate_breakout_thr *= num_pels_log2_lookup[bsize];
+
+  vp9_rd_cost_init(&this_rdc);
+  vp9_rd_cost_init(&sum_rdc);
+  vp9_rd_cost_reset(&best_rdc);
+  best_rdc.rdcost = best_rd;
+
+  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+  if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode)
+    x->mb_energy = vp9_block_energy(cpi, x, bsize);
+
+  if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
+    int cb_partition_search_ctrl = ((pc_tree->index == 0 || pc_tree->index == 3)
+        + get_chessboard_index(cm->current_video_frame)) & 0x1;
+
+    if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size)
+      set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
+  }
+
+  // Determine partition types in search according to the speed features.
+  // The threshold set here has to be of square block size.
+  if (cpi->sf.auto_min_max_partition_size) {
+    partition_none_allowed &= (bsize <= max_size && bsize >= min_size);
+    partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) ||
+                                force_horz_split);
+    partition_vert_allowed &= ((bsize <= max_size && bsize > min_size) ||
+                                force_vert_split);
+    do_split &= bsize > min_size;
+  }
+
+  if (cpi->sf.use_square_partition_only &&
+      bsize > cpi->sf.use_square_only_threshold) {
+    if (cpi->use_svc) {
+      if (!vp9_active_h_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless)
+        partition_horz_allowed &= force_horz_split;
+      if (!vp9_active_v_edge(cpi, mi_col, mi_step) || x->e_mbd.lossless)
+        partition_vert_allowed &= force_vert_split;
+    } else {
+      partition_horz_allowed &= force_horz_split;
+      partition_vert_allowed &= force_vert_split;
+    }
+  }
+
+  save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+#if CONFIG_FP_MB_STATS
+  if (cpi->use_fp_mb_stats) {
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+    src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src,
+                                                  mi_row, mi_col, bsize);
+  }
+
+  // Decide whether we shall split directly and skip searching NONE by using
+  // the first pass block statistics
+  if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_split &&
+      partition_none_allowed && src_diff_var > 4 &&
+      cm->base_qindex < qindex_split_threshold_lookup[bsize]) {
+    int mb_row = mi_row >> 1;
+    int mb_col = mi_col >> 1;
+    int mb_row_end =
+        VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+    int mb_col_end =
+        VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+    int r, c;
+
+    // compute a complexity measure, basically measure inconsistency of motion
+    // vectors obtained from the first pass in the current block
+    for (r = mb_row; r < mb_row_end ; r++) {
+      for (c = mb_col; c < mb_col_end; c++) {
+        const int mb_index = r * cm->mb_cols + c;
+
+        MOTION_DIRECTION this_mv;
+        MOTION_DIRECTION right_mv;
+        MOTION_DIRECTION bottom_mv;
+
+        this_mv =
+            get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]);
+
+        // to its right
+        if (c != mb_col_end - 1) {
+          right_mv = get_motion_direction_fp(
+              cpi->twopass.this_frame_mb_stats[mb_index + 1]);
+          none_complexity += get_motion_inconsistency(this_mv, right_mv);
+        }
+
+        // to its bottom
+        if (r != mb_row_end - 1) {
+          bottom_mv = get_motion_direction_fp(
+              cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]);
+          none_complexity += get_motion_inconsistency(this_mv, bottom_mv);
+        }
+
+        // do not count its left and top neighbors to avoid double counting
+      }
+    }
+
+    if (none_complexity > complexity_16x16_blocks_threshold[bsize]) {
+      partition_none_allowed = 0;
+    }
+  }
+#endif
+
+  // PARTITION_NONE
+  if (partition_none_allowed) {
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col,
+                     &this_rdc, bsize, ctx, best_rdc.rdcost);
+    if (this_rdc.rate != INT_MAX) {
+      if (bsize >= BLOCK_8X8) {
+        pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+        this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+        this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                 this_rdc.rate, this_rdc.dist);
+      }
+
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = this_rdc;
+        if (bsize >= BLOCK_8X8)
+          pc_tree->partitioning = PARTITION_NONE;
+
+        // If all y, u, v transform blocks in this partition are skippable, and
+        // the dist & rate are within the thresholds, the partition search is
+        // terminated for current branch of the partition search tree.
+        if (!x->e_mbd.lossless && ctx->skippable &&
+            ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
+             (best_rdc.dist < dist_breakout_thr &&
+              best_rdc.rate < rate_breakout_thr))) {
+          do_split = 0;
+          do_rect = 0;
+        }
+
+#if CONFIG_FP_MB_STATS
+        // Check if every 16x16 first pass block statistics has zero
+        // motion and the corresponding first pass residue is small enough.
+        // If that is the case, check the difference variance between the
+        // current frame and the last frame. If the variance is small enough,
+        // stop further splitting in RD optimization
+        if (cpi->use_fp_mb_stats && do_split != 0 &&
+            cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
+          int mb_row = mi_row >> 1;
+          int mb_col = mi_col >> 1;
+          int mb_row_end =
+              VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+          int mb_col_end =
+              VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+          int r, c;
+
+          int skip = 1;
+          for (r = mb_row; r < mb_row_end; r++) {
+            for (c = mb_col; c < mb_col_end; c++) {
+              const int mb_index = r * cm->mb_cols + c;
+              if (!(cpi->twopass.this_frame_mb_stats[mb_index] &
+                    FPMB_MOTION_ZERO_MASK) ||
+                  !(cpi->twopass.this_frame_mb_stats[mb_index] &
+                    FPMB_ERROR_SMALL_MASK)) {
+                skip = 0;
+                break;
+              }
+            }
+            if (skip == 0) {
+              break;
+            }
+          }
+          if (skip) {
+            if (src_diff_var == UINT_MAX) {
+              set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+              src_diff_var = get_sby_perpixel_diff_variance(
+                  cpi, &x->plane[0].src, mi_row, mi_col, bsize);
+            }
+            if (src_diff_var < 8) {
+              do_split = 0;
+              do_rect = 0;
+            }
+          }
+        }
+#endif
+      }
+    }
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+
+  // store estimated motion vector
+  if (cpi->sf.adaptive_motion_search)
+    store_pred_mv(x, ctx);
+
+  // PARTITION_SPLIT
+  // TODO(jingning): use the motion vectors given by the above search as
+  // the starting point of motion search in the following partition type check.
+  if (do_split) {
+    subsize = get_subsize(bsize, PARTITION_SPLIT);
+    if (bsize == BLOCK_8X8) {
+      i = 4;
+      if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
+        pc_tree->leaf_split[0]->pred_interp_filter =
+            ctx->mic.interp_filter;
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+                       pc_tree->leaf_split[0], best_rdc.rdcost);
+      if (sum_rdc.rate == INT_MAX)
+        sum_rdc.rdcost = INT64_MAX;
+    } else {
+      for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
+        const int x_idx = (i & 1) * mi_step;
+        const int y_idx = (i >> 1) * mi_step;
+
+        if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+          continue;
+
+        if (cpi->sf.adaptive_motion_search)
+          load_pred_mv(x, ctx);
+
+        pc_tree->split[i]->index = i;
+        rd_pick_partition(cpi, td, tile_data, tp,
+                          mi_row + y_idx, mi_col + x_idx,
+                          subsize, &this_rdc,
+                          best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
+
+        if (this_rdc.rate == INT_MAX) {
+          sum_rdc.rdcost = INT64_MAX;
+          break;
+        } else {
+          sum_rdc.rate += this_rdc.rate;
+          sum_rdc.dist += this_rdc.dist;
+          sum_rdc.rdcost += this_rdc.rdcost;
+        }
+      }
+    }
+
+    if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) {
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                              sum_rdc.rate, sum_rdc.dist);
+
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
+        pc_tree->partitioning = PARTITION_SPLIT;
+
+        // Rate and distortion based partition search termination clause.
+        if (!x->e_mbd.lossless &&
+            ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
+             (best_rdc.dist < dist_breakout_thr &&
+              best_rdc.rate < rate_breakout_thr))) {
+          do_rect = 0;
+        }
+      }
+    } else {
+      // skip rectangular partition test when larger block size
+      // gives better rd cost
+      if ((cpi->sf.less_rectangular_check) &&
+          ((bsize > cpi->sf.use_square_only_threshold) ||
+           (best_rdc.dist < dist_breakout_thr)))
+        do_rect &= !partition_none_allowed;
+    }
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+
+  // PARTITION_HORZ
+  if (partition_horz_allowed &&
+      (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) {
+    subsize = get_subsize(bsize, PARTITION_HORZ);
+    if (cpi->sf.adaptive_motion_search)
+      load_pred_mv(x, ctx);
+    if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+        partition_none_allowed)
+      pc_tree->horizontal[0].pred_interp_filter =
+          ctx->mic.interp_filter;
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+                     &pc_tree->horizontal[0], best_rdc.rdcost);
+
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
+        bsize > BLOCK_8X8) {
+      PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
+      update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+
+      if (cpi->sf.adaptive_motion_search)
+        load_pred_mv(x, ctx);
+      if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+          partition_none_allowed)
+        pc_tree->horizontal[1].pred_interp_filter =
+            ctx->mic.interp_filter;
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col,
+                       &this_rdc, subsize, &pc_tree->horizontal[1],
+                       best_rdc.rdcost - sum_rdc.rdcost);
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
+      }
+    }
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
+        pc_tree->partitioning = PARTITION_HORZ;
+
+        if ((cpi->sf.less_rectangular_check) &&
+            (bsize > cpi->sf.use_square_only_threshold))
+          do_rect = 0;
+      }
+    }
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+  // PARTITION_VERT
+  if (partition_vert_allowed &&
+      (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) {
+    subsize = get_subsize(bsize, PARTITION_VERT);
+
+    if (cpi->sf.adaptive_motion_search)
+      load_pred_mv(x, ctx);
+    if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+        partition_none_allowed)
+      pc_tree->vertical[0].pred_interp_filter =
+          ctx->mic.interp_filter;
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+                     &pc_tree->vertical[0], best_rdc.rdcost);
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols &&
+        bsize > BLOCK_8X8) {
+      update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
+      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
+                        &pc_tree->vertical[0]);
+
+      if (cpi->sf.adaptive_motion_search)
+        load_pred_mv(x, ctx);
+      if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+          partition_none_allowed)
+        pc_tree->vertical[1].pred_interp_filter =
+            ctx->mic.interp_filter;
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step,
+                       &this_rdc, subsize,
+                       &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost);
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
+      }
+    }
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                              sum_rdc.rate, sum_rdc.dist);
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
+        pc_tree->partitioning = PARTITION_VERT;
+      }
+    }
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+
+  // TODO(jbb): This code added so that we avoid static analysis
+  // warning related to the fact that best_rd isn't used after this
+  // point.  This code should be refactored so that the duplicate
+  // checks occur in some sub function and thus are used...
+  (void) best_rd;
+  *rd_cost = best_rdc;
+
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
+      pc_tree->index != 3) {
+    int output_enabled = (bsize == BLOCK_64X64);
+    encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+              bsize, pc_tree);
+  }
+
+  if (bsize == BLOCK_64X64) {
+    assert(tp_orig < *tp);
+    assert(best_rdc.rate < INT_MAX);
+    assert(best_rdc.dist < INT64_MAX);
+  } else {
+    assert(tp_orig == *tp);
+  }
+}
+
+// Encodes one row of 64x64 superblocks of a tile on the RD path. For each
+// superblock the speed features select a fixed, variance based or fully
+// searched partitioning.
+static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td,
+                             TileDataEnc *tile_data, int mi_row,
+                             TOKENEXTRA **tp) {
+  VP9_COMMON *const cm = &cpi->common;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int mi_col;
+
+  // Every SB row starts with cleared left contexts.
+  memset(&xd->left_context, 0, sizeof(xd->left_context));
+  memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+
+  // Visit the superblocks of this row from left to right.
+  for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
+       mi_col += MI_BLOCK_SIZE) {
+    const struct segmentation *const seg = &cm->seg;
+    MODE_INFO **mi = cm->mi_grid_visible + (cm->mi_stride * mi_row + mi_col);
+    RD_COST dummy_rdc;
+    int64_t dummy_dist;
+    int dummy_rate;
+    int seg_skip = 0;
+    int n;
+
+    // Set pred_interp_filter of the leaf and rectangular contexts to SWITCHABLE.
+    if (sf->adaptive_pred_interp_filter) {
+      for (n = 0; n < 64; ++n) td->leaf_tree[n].pred_interp_filter = SWITCHABLE;
+      for (n = 0; n < 64; ++n) {
+        td->pc_tree[n].horizontal[0].pred_interp_filter = SWITCHABLE;
+        td->pc_tree[n].horizontal[1].pred_interp_filter = SWITCHABLE;
+        td->pc_tree[n].vertical[0].pred_interp_filter = SWITCHABLE;
+        td->pc_tree[n].vertical[1].pred_interp_filter = SWITCHABLE;
+      }
+    }
+
+    vp9_zero(x->pred_mv);
+    td->pc_root->index = 0;
+
+    // An active SEG_LVL_SKIP on this superblock forces fixed 64x64 blocks.
+    if (seg->enabled) {
+      const uint8_t *seg_map = cm->last_frame_seg_map;
+      if (seg->update_map) seg_map = cpi->segmentation_map;
+      seg_skip = segfeature_active(
+          seg, get_segment_id(cm, seg_map, BLOCK_64X64, mi_row, mi_col),
+          SEG_LVL_SKIP);
+    }
+
+    x->source_variance = UINT_MAX;
+    if (seg_skip || sf->partition_search_type == FIXED_PARTITION) {
+      BLOCK_SIZE fixed_size = sf->always_this_block_size;
+      if (seg_skip) fixed_size = BLOCK_64X64;
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+      set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, fixed_size);
+      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
+    } else if (cpi->partition_search_skippable_frame) {
+      BLOCK_SIZE var_size;
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+      var_size = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+      set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, var_size);
+      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
+    } else if (cm->frame_type != KEY_FRAME &&
+               sf->partition_search_type == VAR_BASED_PARTITION) {
+      choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
+    } else {
+      // If required set upper and lower partition size limits
+      if (sf->auto_min_max_partition_size) {
+        set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+        rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
+                                &x->min_partition_size,
+                                &x->max_partition_size);
+      }
+      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
+                        &dummy_rdc, INT64_MAX, td->pc_root);
+    }
+  }
+}
+
+// Sets up source and block planes of cpi->td.mb and zeroes the above contexts.
+static void init_encode_frame_mb_context(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int sb_aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  // Note: this size assumes above_context[0], [1] and [2] are allocated as
+  // part of the same buffer, so a single memset clears all of them.
+  const size_t entropy_ctx_bytes =
+      sizeof(*xd->above_context[0]) * 2 * sb_aligned_cols * MAX_MB_PLANE;
+  const size_t seg_ctx_bytes = sizeof(*xd->above_seg_context) * sb_aligned_cols;
+
+  // Copy data over into macro block data structures.
+  vp9_setup_src_planes(x, cpi->Source, 0, 0);
+  vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
+
+  memset(xd->above_context[0], 0, entropy_ctx_bytes);
+  memset(xd->above_seg_context, 0, seg_ctx_bytes);
+}
+
+// 1 if two or more of the LAST/GOLDEN/ALTREF flags are set; 0 otherwise, or
+// whenever SEG_LVL_REF_FRAME is active for segment 1.
+static int check_dual_ref_flags(VP9_COMP *cpi) {
+  const int flags = cpi->ref_frame_flags;
+  int enabled_refs = (flags & VP9_GOLD_FLAG) != 0;
+  if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) return 0;
+  enabled_refs += (flags & VP9_LAST_FLAG) != 0;
+  enabled_refs += (flags & VP9_ALT_FLAG) != 0;
+  return enabled_refs >= 2;
+}
+
+// Clamps tx_size of every entry in the visible mode info grid to max_tx_size.
+static void reset_skip_tx_size(VP9_COMMON *cm, TX_SIZE max_tx_size) {
+  const int stride = cm->mi_stride;
+  MODE_INFO **row_base = cm->mi_grid_visible;
+  int r, c;
+  for (r = 0; r < cm->mi_rows; ++r, row_base += stride) {
+    for (c = 0; c < cm->mi_cols; ++c) {
+      MODE_INFO *const entry = row_base[c];
+      if (entry->tx_size > max_tx_size) entry->tx_size = max_tx_size;
+    }
+  }
+}
+
+// Classifies the current frame from its intra-only and refresh flags.
+static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) {
+  if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME;
+  if (cpi->refresh_golden_frame) {
+    // A golden refresh whose source is the alt-ref frame counts as ALTREF.
+    if (cpi->rc.is_src_frame_alt_ref) return ALTREF_FRAME;
+    return GOLDEN_FRAME;
+  }
+  return cpi->refresh_alt_ref_frame ? GOLDEN_FRAME : LAST_FRAME;
+}
+
+// Chooses the TX_MODE from the lossless flag and the speed features.
+static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  if (xd->lossless) return ONLY_4X4;
+  if (cpi->common.frame_type == KEY_FRAME && sf->use_nonrd_pick_mode)
+    return ALLOW_16X16;
+
+  switch (sf->tx_size_search_method) {
+    case USE_LARGESTALL: return ALLOW_32X32;
+    case USE_FULL_RD:
+    case USE_TX_8X8: return TX_MODE_SELECT;
+    default: return cpi->common.tx_mode;
+  }
+}
+
+// Intra mode search: blocks of 16x16 and larger use vp9_pick_intra_mode(),
+// smaller ones use vp9_rd_pick_intra_mode_sb() with no RD limit (INT64_MAX).
+static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x,
+                                     RD_COST *rd_cost, BLOCK_SIZE bsize,
+                                     PICK_MODE_CONTEXT *ctx) {
+  if (bsize >= BLOCK_16X16) vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
+  else vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+}
+
+// Non-RD mode decision for one block: runs the picker for the frame type and
+// block size, restores the entropy contexts, and stores rate/dist in ctx.
+static void nonrd_pick_sb_modes(VP9_COMP *cpi,
+                                TileDataEnc *tile_data, MACROBLOCK *const x,
+                                int mi_row, int mi_col, RD_COST *rd_cost,
+                                BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  const BLOCK_SIZE unit = VPXMAX(bsize, BLOCK_8X8);  // processing unit size
+  const int cols_4x4 = num_4x4_blocks_wide_lookup[unit];
+  const int rows_4x4 = num_4x4_blocks_high_lookup[unit];
+  ENTROPY_CONTEXT saved_above[16 * MAX_MB_PLANE];
+  ENTROPY_CONTEXT saved_left[16 * MAX_MB_PLANE];
+  MODE_INFO *mi;
+  int p;
+
+  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+  mi = xd->mi[0];
+  mi->sb_type = bsize;
+
+  // Back up the above/left entropy contexts of every plane.
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    struct macroblockd_plane *const pd = &xd->plane[p];
+    memcpy(saved_above + cols_4x4 * p, pd->above_context,
+           (sizeof(saved_above[0]) * cols_4x4) >> pd->subsampling_x);
+    memcpy(saved_left + rows_4x4 * p, pd->left_context,
+           (sizeof(saved_left[0]) * rows_4x4) >> pd->subsampling_y);
+  }
+
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+      cyclic_refresh_segment_id_boosted(mi->segment_id))
+    x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+
+  if (cm->frame_type == KEY_FRAME) {
+    hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+  } else if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) {
+    set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
+  } else if (bsize < BLOCK_8X8) {
+    vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx);
+  } else {
+    vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx);
+  }
+  duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+
+  // Put the entropy contexts back as they were before the mode search.
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    struct macroblockd_plane *const pd = &xd->plane[p];
+    memcpy(pd->above_context, saved_above + cols_4x4 * p,
+           (sizeof(saved_above[0]) * cols_4x4) >> pd->subsampling_x);
+    memcpy(pd->left_context, saved_left + rows_4x4 * p,
+           (sizeof(saved_left[0]) * rows_4x4) >> pd->subsampling_y);
+  }
+  if (rd_cost->rate == INT_MAX) vp9_rd_cost_reset(rd_cost);
+  ctx->rate = rd_cost->rate;
+  ctx->dist = rd_cost->dist;
+}
+
+// Writes the modes recorded in pc_tree into xd->mi[0] and x->mbmi_ext for the
+// block at (mi_row, mi_col), recursing into the children of a split node.
+static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
+                              int mi_row, int mi_col,
+                              BLOCK_SIZE bsize,
+                              PC_TREE *pc_tree) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int half = (1 << b_width_log2_lookup[bsize]) / 4;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+  BLOCK_SIZE first_size = subsize;
+  PICK_MODE_CONTEXT *first = NULL, *second = NULL;
+  int second_row = mi_row, second_col = mi_col;
+
+  assert(bsize >= BLOCK_8X8);
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+  switch (partition) {
+    case PARTITION_SPLIT:
+      fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]);
+      fill_mode_info_sb(cm, x, mi_row, mi_col + half, subsize,
+                        pc_tree->split[1]);
+      fill_mode_info_sb(cm, x, mi_row + half, mi_col, subsize,
+                        pc_tree->split[2]);
+      fill_mode_info_sb(cm, x, mi_row + half, mi_col + half, subsize,
+                        pc_tree->split[3]);
+      return;
+    case PARTITION_NONE:
+      first = &pc_tree->none;
+      first_size = bsize;
+      break;
+    case PARTITION_VERT:
+      first = &pc_tree->vertical[0];
+      second = &pc_tree->vertical[1];
+      second_col += half;
+      break;
+    case PARTITION_HORZ:
+      first = &pc_tree->horizontal[0];
+      second = &pc_tree->horizontal[1];
+      second_row += half;
+      break;
+    default:
+      return;
+  }
+
+  // The first (or only) context always starts at the block origin.
+  set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
+  *(xd->mi[0]) = first->mic;
+  *(x->mbmi_ext) = first->mbmi_ext;
+  duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, first_size);
+
+  // The second half of a HORZ/VERT pair is copied only if inside the frame.
+  if (second != NULL && second_row < cm->mi_rows && second_col < cm->mi_cols) {
+    set_mode_info_offsets(cm, x, xd, second_row, second_col);
+    *(xd->mi[0]) = second->mic;
+    *(x->mbmi_ext) = second->mbmi_ext;
+    duplicate_mode_info_in_sb(cm, xd, second_row, second_col, subsize);
+  }
+}
+
+// Clears pred_pixel_ready on every context of this node, then recurses into
+// the four split children while the block is larger than 8x8.
+static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
+  int k;
+  pc_tree->none.pred_pixel_ready = 0;
+  for (k = 0; k < 2; ++k) {
+    pc_tree->horizontal[k].pred_pixel_ready = 0;
+    pc_tree->vertical[k].pred_pixel_ready = 0;
+  }
+  if (bsize > BLOCK_8X8) {
+    const BLOCK_SIZE child_size = get_subsize(bsize, PARTITION_SPLIT);
+    for (k = 0; k < 4; ++k)
+      pred_pixel_ready_reset(pc_tree->split[k], child_size);
+  }
+}
+
+static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
+                                 TileDataEnc *tile_data,
+                                 TOKENEXTRA **tp, int mi_row,
+                                 int mi_col, BLOCK_SIZE bsize, RD_COST *rd_cost,
+                                 int do_recon, int64_t best_rd,
+                                 PC_TREE *pc_tree) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
+  TOKENEXTRA *tp_orig = *tp;
+  PICK_MODE_CONTEXT *ctx = &pc_tree->none;
+  int i;
+  BLOCK_SIZE subsize = bsize;
+  RD_COST this_rdc, sum_rdc, best_rdc;
+  int do_split = bsize >= BLOCK_8X8;
+  int do_rect = 1;
+  // Override skipping rectangular partition operations for edge blocks
+  const int force_horz_split = (mi_row + ms >= cm->mi_rows);
+  const int force_vert_split = (mi_col + ms >= cm->mi_cols);
+  const int xss = x->e_mbd.plane[1].subsampling_x;
+  const int yss = x->e_mbd.plane[1].subsampling_y;
+
+  int partition_none_allowed = !force_horz_split && !force_vert_split;
+  int partition_horz_allowed = !force_vert_split && yss <= xss &&
+                               bsize >= BLOCK_8X8;
+  int partition_vert_allowed = !force_horz_split && xss <= yss &&
+                               bsize >= BLOCK_8X8;
+  (void) *tp_orig;
+
+  assert(num_8x8_blocks_wide_lookup[bsize] ==
+             num_8x8_blocks_high_lookup[bsize]);
+
+  vp9_rd_cost_init(&sum_rdc);
+  vp9_rd_cost_reset(&best_rdc);
+  best_rdc.rdcost = best_rd;
+
+  // Determine partition types in search according to the speed features.
+  // The threshold set here has to be of square block size.
+  if (sf->auto_min_max_partition_size) {
+    partition_none_allowed &= (bsize <= x->max_partition_size &&
+                               bsize >= x->min_partition_size);
+    partition_horz_allowed &= ((bsize <= x->max_partition_size &&
+                                bsize > x->min_partition_size) ||
+                                force_horz_split);
+    partition_vert_allowed &= ((bsize <= x->max_partition_size &&
+                                bsize > x->min_partition_size) ||
+                                force_vert_split);
+    do_split &= bsize > x->min_partition_size;
+  }
+  if (sf->use_square_partition_only) {
+    partition_horz_allowed &= force_horz_split;
+    partition_vert_allowed &= force_vert_split;
+  }
+
+  ctx->pred_pixel_ready = !(partition_vert_allowed ||
+                            partition_horz_allowed ||
+                            do_split);
+
+  // PARTITION_NONE
+  if (partition_none_allowed) {
+    nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col,
+                        &this_rdc, bsize, ctx);
+    ctx->mic = *xd->mi[0];
+    ctx->mbmi_ext = *x->mbmi_ext;
+    ctx->skip_txfm[0] = x->skip_txfm[0];
+    ctx->skip = x->skip;
+
+    if (this_rdc.rate != INT_MAX) {
+      int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+      this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                              this_rdc.rate, this_rdc.dist);
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        int64_t dist_breakout_thr = sf->partition_search_breakout_dist_thr;
+        int64_t rate_breakout_thr = sf->partition_search_breakout_rate_thr;
+
+        dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] +
+            b_height_log2_lookup[bsize]);
+
+        rate_breakout_thr *= num_pels_log2_lookup[bsize];
+
+        best_rdc = this_rdc;
+        if (bsize >= BLOCK_8X8)
+          pc_tree->partitioning = PARTITION_NONE;
+
+        if (!x->e_mbd.lossless &&
+            this_rdc.rate < rate_breakout_thr &&
+            this_rdc.dist < dist_breakout_thr) {
+          do_split = 0;
+          do_rect = 0;
+        }
+      }
+    }
+  }
+
+  // store estimated motion vector
+  store_pred_mv(x, ctx);
+
+  // PARTITION_SPLIT
+  if (do_split) {
+    int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+    sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+    sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+    subsize = get_subsize(bsize, PARTITION_SPLIT);
+    for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
+      const int x_idx = (i & 1) * ms;
+      const int y_idx = (i >> 1) * ms;
+
+      if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+        continue;
+      load_pred_mv(x, ctx);
+      nonrd_pick_partition(cpi, td, tile_data, tp,
+                           mi_row + y_idx, mi_col + x_idx,
+                           subsize, &this_rdc, 0,
+                           best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
+
+      if (this_rdc.rate == INT_MAX) {
+        vp9_rd_cost_reset(&sum_rdc);
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
+      }
+    }
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
+      pc_tree->partitioning = PARTITION_SPLIT;
+    } else {
+      // skip rectangular partition test when larger block size
+      // gives better rd cost
+      if (sf->less_rectangular_check)
+        do_rect &= !partition_none_allowed;
+    }
+  }
+
+  // PARTITION_HORZ
+  if (partition_horz_allowed && do_rect) {
+    subsize = get_subsize(bsize, PARTITION_HORZ);
+    if (sf->adaptive_motion_search)
+      load_pred_mv(x, ctx);
+    pc_tree->horizontal[0].pred_pixel_ready = 1;
+    nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+                        &pc_tree->horizontal[0]);
+
+    pc_tree->horizontal[0].mic = *xd->mi[0];
+    pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext;
+    pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
+    pc_tree->horizontal[0].skip = x->skip;
+
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + ms < cm->mi_rows) {
+      load_pred_mv(x, ctx);
+      pc_tree->horizontal[1].pred_pixel_ready = 1;
+      nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + ms, mi_col,
+                          &this_rdc, subsize,
+                          &pc_tree->horizontal[1]);
+
+      pc_tree->horizontal[1].mic = *xd->mi[0];
+      pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext;
+      pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
+      pc_tree->horizontal[1].skip = x->skip;
+
+      if (this_rdc.rate == INT_MAX) {
+        vp9_rd_cost_reset(&sum_rdc);
+      } else {
+        int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+        this_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                sum_rdc.rate, sum_rdc.dist);
+      }
+    }
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
+      pc_tree->partitioning = PARTITION_HORZ;
+    } else {
+      pred_pixel_ready_reset(pc_tree, bsize);
+    }
+  }
+
+  // PARTITION_VERT
+  if (partition_vert_allowed && do_rect) {
+    subsize = get_subsize(bsize, PARTITION_VERT);
+    if (sf->adaptive_motion_search)
+      load_pred_mv(x, ctx);
+    pc_tree->vertical[0].pred_pixel_ready = 1;
+    nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+                        &pc_tree->vertical[0]);
+    pc_tree->vertical[0].mic = *xd->mi[0];
+    pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext;
+    pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
+    pc_tree->vertical[0].skip = x->skip;
+
+    if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + ms < cm->mi_cols) {
+      load_pred_mv(x, ctx);
+      pc_tree->vertical[1].pred_pixel_ready = 1;
+      nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + ms,
+                          &this_rdc, subsize,
+                          &pc_tree->vertical[1]);
+      pc_tree->vertical[1].mic = *xd->mi[0];
+      pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext;
+      pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
+      pc_tree->vertical[1].skip = x->skip;
+
+      if (this_rdc.rate == INT_MAX) {
+        vp9_rd_cost_reset(&sum_rdc);
+      } else {
+        int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+        sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                sum_rdc.rate, sum_rdc.dist);
+      }
+    }
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
+      pc_tree->partitioning = PARTITION_VERT;
+    } else {
+      pred_pixel_ready_reset(pc_tree, bsize);
+    }
+  }
+
+  *rd_cost = best_rdc;
+
+  if (best_rdc.rate == INT_MAX) {
+    vp9_rd_cost_reset(rd_cost);
+    return;
+  }
+
+  // update mode info array
+  fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, pc_tree);
+
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && do_recon) {
+    int output_enabled = (bsize == BLOCK_64X64);
+    encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+                 bsize, pc_tree);
+  }
+
+  if (bsize == BLOCK_64X64 && do_recon) {
+    assert(tp_orig < *tp);
+    assert(best_rdc.rate < INT_MAX);
+    assert(best_rdc.dist < INT64_MAX);
+  } else {
+    assert(tp_orig == *tp);
+  }
+}
+
+// Picks modes for one block using the partition already recorded in
+// mi[0]->sb_type, optionally refining it with a restricted partition search.
+//
+// For three cases (32x32 recorded as 32x32, 32x32 recorded as a non-NONE
+// partition with sub-blocks of at least 16x16, and 16x16 recorded as a
+// non-NONE partition) the function sets x->min/max_partition_size and hands
+// the whole block to nonrd_pick_partition(). Otherwise it follows the recorded
+// partition: it calls nonrd_pick_sb_modes() for each sub-block (or recurses
+// for PARTITION_SPLIT), snapshots the resulting mode state into the matching
+// PICK_MODE_CONTEXT of pc_tree, and adds the sub-block rate/dist to *rd_cost.
+//
+// Only rd_cost->rate and rd_cost->dist are accumulated here; rd_cost->rdcost
+// is not recomputed by this function. If the block starts outside the frame
+// the function returns without writing *rd_cost.
+//
+// When bsize is BLOCK_64X64 and output_enabled is non-zero, the block is
+// finally encoded through encode_sb_rt().
+static void nonrd_select_partition(VP9_COMP *cpi,
+                                   ThreadData *td,
+                                   TileDataEnc *tile_data,
+                                   MODE_INFO **mi,
+                                   TOKENEXTRA **tp,
+                                   int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize, int output_enabled,
+                                   RD_COST *rd_cost, PC_TREE *pc_tree) {
+  VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  // hbs is the offset, in the same units as mi_row/mi_col, from the block
+  // origin to its second half (it is added to mi_row/mi_col below).
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  const int mis = cm->mi_stride;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+  RD_COST this_rdc;
+
+  vp9_rd_cost_reset(&this_rdc);
+  // Nothing to do for blocks that start outside the frame.
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  // The recorded sub-block size determines the partition type of this block.
+  subsize = (bsize >= BLOCK_8X8) ? mi[0]->sb_type : BLOCK_4X4;
+  partition = partition_lookup[bsl][subsize];
+
+  if (bsize == BLOCK_32X32 && subsize == BLOCK_32X32) {
+    // 32x32 kept whole: search between 32x32 and 16x16.
+    x->max_partition_size = BLOCK_32X32;
+    x->min_partition_size = BLOCK_16X16;
+    nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+                         rd_cost, 0, INT64_MAX, pc_tree);
+  } else if (bsize == BLOCK_32X32 && partition != PARTITION_NONE &&
+             subsize >= BLOCK_16X16) {
+    // 32x32 already partitioned: search between 32x32 and 8x8.
+    x->max_partition_size = BLOCK_32X32;
+    x->min_partition_size = BLOCK_8X8;
+    nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+                         rd_cost, 0, INT64_MAX, pc_tree);
+  } else if (bsize == BLOCK_16X16 && partition != PARTITION_NONE) {
+    // 16x16 already partitioned: search between 16x16 and 8x8.
+    x->max_partition_size = BLOCK_16X16;
+    x->min_partition_size = BLOCK_8X8;
+    nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+                         rd_cost, 0, INT64_MAX, pc_tree);
+  } else {
+    // Follow the recorded partition without searching alternatives.
+    switch (partition) {
+      case PARTITION_NONE:
+        pc_tree->none.pred_pixel_ready = 1;
+        nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+                            subsize, &pc_tree->none);
+        // Snapshot the mode state left behind by nonrd_pick_sb_modes().
+        pc_tree->none.mic = *xd->mi[0];
+        pc_tree->none.mbmi_ext = *x->mbmi_ext;
+        pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->none.skip = x->skip;
+        break;
+      case PARTITION_VERT:
+        pc_tree->vertical[0].pred_pixel_ready = 1;
+        nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+                            subsize, &pc_tree->vertical[0]);
+        pc_tree->vertical[0].mic = *xd->mi[0];
+        pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext;
+        pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->vertical[0].skip = x->skip;
+        // The second half is processed only if it starts inside the frame.
+        if (mi_col + hbs < cm->mi_cols) {
+          pc_tree->vertical[1].pred_pixel_ready = 1;
+          nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
+                              &this_rdc, subsize, &pc_tree->vertical[1]);
+          pc_tree->vertical[1].mic = *xd->mi[0];
+          pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext;
+          pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
+          pc_tree->vertical[1].skip = x->skip;
+          // Accumulate only when neither side holds an INT_MAX/INT64_MAX
+          // sentinel, so the sums cannot overflow from an invalid cost.
+          if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+              rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+            rd_cost->rate += this_rdc.rate;
+            rd_cost->dist += this_rdc.dist;
+          }
+        }
+        break;
+      case PARTITION_HORZ:
+        pc_tree->horizontal[0].pred_pixel_ready = 1;
+        nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+                            subsize, &pc_tree->horizontal[0]);
+        pc_tree->horizontal[0].mic = *xd->mi[0];
+        pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext;
+        pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->horizontal[0].skip = x->skip;
+        // The second half is processed only if it starts inside the frame.
+        if (mi_row + hbs < cm->mi_rows) {
+          pc_tree->horizontal[1].pred_pixel_ready = 1;
+          nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
+                              &this_rdc, subsize, &pc_tree->horizontal[1]);
+          pc_tree->horizontal[1].mic = *xd->mi[0];
+          pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext;
+          pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
+          pc_tree->horizontal[1].skip = x->skip;
+          if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+              rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+            rd_cost->rate += this_rdc.rate;
+            rd_cost->dist += this_rdc.dist;
+          }
+        }
+        break;
+      case PARTITION_SPLIT:
+        // Recurse into the four quadrants. The first writes straight into
+        // *rd_cost; the other three go through this_rdc and are added on.
+        subsize = get_subsize(bsize, PARTITION_SPLIT);
+        nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                               subsize, output_enabled, rd_cost,
+                               pc_tree->split[0]);
+        nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp,
+                               mi_row, mi_col + hbs, subsize, output_enabled,
+                               &this_rdc, pc_tree->split[1]);
+        if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+            rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+          rd_cost->rate += this_rdc.rate;
+          rd_cost->dist += this_rdc.dist;
+        }
+        nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp,
+                               mi_row + hbs, mi_col, subsize, output_enabled,
+                               &this_rdc, pc_tree->split[2]);
+        if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+            rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+          rd_cost->rate += this_rdc.rate;
+          rd_cost->dist += this_rdc.dist;
+        }
+        nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
+                               mi_row + hbs, mi_col + hbs, subsize,
+                               output_enabled, &this_rdc, pc_tree->split[3]);
+        if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+            rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+          rd_cost->rate += this_rdc.rate;
+          rd_cost->dist += this_rdc.dist;
+        }
+        break;
+      default:
+        assert(0 && "Invalid partition type.");
+        break;
+    }
+  }
+
+  // Only the outermost (64x64) invocation emits the encoded superblock.
+  if (bsize == BLOCK_64X64 && output_enabled)
+    encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, 1, bsize, pc_tree);
+}
+
+
+// Picks modes for, and immediately encodes, one block using the partition
+// already recorded in mi[0]->sb_type; no partition search is performed.
+//
+// For each sub-block of the recorded partition the function calls
+// nonrd_pick_sb_modes(), snapshots the resulting mode state into the matching
+// PICK_MODE_CONTEXT of pc_tree, and then calls encode_b_rt(). For
+// PARTITION_SPLIT above 8x8 it recurses into the four quadrants.
+//
+// Every nonrd_pick_sb_modes() call writes into the same *dummy_cost, so on
+// return it reflects only the last sub-block processed, not a total.
+// When output_enabled is non-zero the partition counter in td->counts is
+// incremented for every block larger than 4x4.
+static void nonrd_use_partition(VP9_COMP *cpi,
+                                ThreadData *td,
+                                TileDataEnc *tile_data,
+                                MODE_INFO **mi,
+                                TOKENEXTRA **tp,
+                                int mi_row, int mi_col,
+                                BLOCK_SIZE bsize, int output_enabled,
+                                RD_COST *dummy_cost, PC_TREE *pc_tree) {
+  VP9_COMMON *const cm = &cpi->common;
+  TileInfo *tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  // hbs is the offset, in the same units as mi_row/mi_col, from the block
+  // origin to its second half (it is added to mi_row/mi_col below).
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  const int mis = cm->mi_stride;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+
+  // Nothing to do for blocks that start outside the frame.
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  // The recorded sub-block size determines the partition type of this block.
+  subsize = (bsize >= BLOCK_8X8) ? mi[0]->sb_type : BLOCK_4X4;
+  partition = partition_lookup[bsl][subsize];
+
+  // Gather partition statistics only for output passes.
+  if (output_enabled && bsize != BLOCK_4X4) {
+    int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+    td->counts->partition[ctx][partition]++;
+  }
+
+  switch (partition) {
+    case PARTITION_NONE:
+      pc_tree->none.pred_pixel_ready = 1;
+      nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
+                          subsize, &pc_tree->none);
+      // Snapshot the mode state left behind by nonrd_pick_sb_modes() before
+      // encoding the block.
+      pc_tree->none.mic = *xd->mi[0];
+      pc_tree->none.mbmi_ext = *x->mbmi_ext;
+      pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
+      pc_tree->none.skip = x->skip;
+      encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+                  subsize, &pc_tree->none);
+      break;
+    case PARTITION_VERT:
+      pc_tree->vertical[0].pred_pixel_ready = 1;
+      nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
+                          subsize, &pc_tree->vertical[0]);
+      pc_tree->vertical[0].mic = *xd->mi[0];
+      pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext;
+      pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
+      pc_tree->vertical[0].skip = x->skip;
+      encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+                  subsize, &pc_tree->vertical[0]);
+      // The second half is handled only when it starts inside the frame and
+      // the parent block is larger than 8x8.
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+        pc_tree->vertical[1].pred_pixel_ready = 1;
+        nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
+                            dummy_cost, subsize, &pc_tree->vertical[1]);
+        pc_tree->vertical[1].mic = *xd->mi[0];
+        pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext;
+        pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->vertical[1].skip = x->skip;
+        encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col + hbs,
+                    output_enabled, subsize, &pc_tree->vertical[1]);
+      }
+      break;
+    case PARTITION_HORZ:
+      pc_tree->horizontal[0].pred_pixel_ready = 1;
+      nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
+                          subsize, &pc_tree->horizontal[0]);
+      pc_tree->horizontal[0].mic = *xd->mi[0];
+      pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext;
+      pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
+      pc_tree->horizontal[0].skip = x->skip;
+      encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+                  subsize, &pc_tree->horizontal[0]);
+
+      // The second half is handled only when it starts inside the frame and
+      // the parent block is larger than 8x8.
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+        pc_tree->horizontal[1].pred_pixel_ready = 1;
+        nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
+                            dummy_cost, subsize, &pc_tree->horizontal[1]);
+        pc_tree->horizontal[1].mic = *xd->mi[0];
+        pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext;
+        pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
+        pc_tree->horizontal[1].skip = x->skip;
+        encode_b_rt(cpi, td, tile_info, tp, mi_row + hbs, mi_col,
+                    output_enabled, subsize, &pc_tree->horizontal[1]);
+      }
+      break;
+    case PARTITION_SPLIT:
+      subsize = get_subsize(bsize, PARTITION_SPLIT);
+      if (bsize == BLOCK_8X8) {
+        // An 8x8 split is handled as a single leaf via leaf_split[0] rather
+        // than by recursing.
+        nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
+                            subsize, pc_tree->leaf_split[0]);
+        encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col,
+                    output_enabled, subsize, pc_tree->leaf_split[0]);
+      } else {
+        // Recurse into the four quadrants: top-left, top-right, bottom-left,
+        // bottom-right.
+        nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                            subsize, output_enabled, dummy_cost,
+                            pc_tree->split[0]);
+        nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp,
+                            mi_row, mi_col + hbs, subsize, output_enabled,
+                            dummy_cost, pc_tree->split[1]);
+        nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp,
+                            mi_row + hbs, mi_col, subsize, output_enabled,
+                            dummy_cost, pc_tree->split[2]);
+        nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
+                            mi_row + hbs, mi_col + hbs, subsize, output_enabled,
+                            dummy_cost, pc_tree->split[3]);
+      }
+      break;
+    default:
+      assert(0 && "Invalid partition type.");
+      break;
+  }
+
+  // Recursive splits update the partition context in their children; every
+  // other case (including the 8x8 leaf split) updates it here.
+  if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+}
+
+// Encodes one row of 64x64 superblocks of a tile using non-RD mode decision.
+//
+// The left-edge contexts are cleared once for the row. Then, for every
+// superblock column of the tile, per-superblock state in the macroblock is
+// reset and the partitioning is chosen according to
+// sf->partition_search_type (forced to FIXED_PARTITION when the superblock's
+// segment has the skip feature active) before the block is picked/encoded.
+static void encode_nonrd_sb_row(VP9_COMP *cpi,
+                                ThreadData *td,
+                                TileDataEnc *tile_data,
+                                int mi_row,
+                                TOKENEXTRA **tp) {
+  VP9_COMMON *const cm = &cpi->common;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  int mi_col = tile_info->mi_col_start;
+
+  // A new superblock row starts with empty left contexts.
+  memset(&xd->left_context, 0, sizeof(xd->left_context));
+  memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+
+  // Walk the superblocks of this row from left to right.
+  while (mi_col < tile_info->mi_col_end) {
+    const struct segmentation *const seg = &cm->seg;
+    MODE_INFO **mi =
+        cm->mi_grid_visible + (cm->mi_stride * mi_row + mi_col);
+    PARTITION_SEARCH_TYPE search_type = sf->partition_search_type;
+    BLOCK_SIZE fixed_bsize = BLOCK_64X64;
+    int skip_segment = 0;
+    RD_COST sb_cost;
+
+    // Reset per-superblock state.
+    x->source_variance = UINT_MAX;
+    vp9_zero(x->pred_mv);
+    vp9_rd_cost_init(&sb_cost);
+    x->color_sensitivity[0] = 0;
+    x->color_sensitivity[1] = 0;
+    x->sb_is_skin = 0;
+
+    // A segment with the skip feature active always uses a fixed partition.
+    if (seg->enabled) {
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+      skip_segment = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+      if (skip_segment)
+        search_type = FIXED_PARTITION;
+    }
+
+    // Choose the partitioning of the 64X64 block and encode it.
+    if (search_type == VAR_BASED_PARTITION) {
+      // TODO(jingning, marpan): The mode decision and encoding process
+      // support both intra and inter sub8x8 block coding for RTC mode.
+      // Tune the thresholds accordingly to use sub8x8 block coding for
+      // coding performance improvement.
+      choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+      nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                          BLOCK_64X64, 1, &sb_cost, td->pc_root);
+    } else if (search_type == SOURCE_VAR_BASED_PARTITION) {
+      set_source_var_based_partition(cpi, tile_info, x, mi, mi_row, mi_col);
+      nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                          BLOCK_64X64, 1, &sb_cost, td->pc_root);
+    } else if (search_type == FIXED_PARTITION) {
+      // Skipped segments keep the 64x64 default; everything else uses the
+      // configured fixed block size.
+      if (!skip_segment)
+        fixed_bsize = sf->always_this_block_size;
+      set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, fixed_bsize);
+      nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                          BLOCK_64X64, 1, &sb_cost, td->pc_root);
+    } else if (search_type == REFERENCE_PARTITION) {
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+      if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+          xd->mi[0]->segment_id) {
+        // Use lower max_partition_size for low resolutions.
+        x->max_partition_size =
+            (cm->width <= 352 && cm->height <= 288) ? BLOCK_32X32
+                                                    : BLOCK_64X64;
+        x->min_partition_size = BLOCK_8X8;
+        nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+                             BLOCK_64X64, &sb_cost, 1,
+                             INT64_MAX, td->pc_root);
+      } else {
+        choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+        // TODO(marpan): Seems like nonrd_select_partition does not support
+        // 4x4 partition. Since 4x4 is used on key frame, use this switch
+        // for now.
+        if (cm->frame_type != KEY_FRAME) {
+          nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                                 BLOCK_64X64, 1, &sb_cost, td->pc_root);
+        } else {
+          nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+                              BLOCK_64X64, 1, &sb_cost, td->pc_root);
+        }
+      }
+    } else {
+      // No other partition search type is supported on this path.
+      assert(0);
+    }
+
+    mi_col += MI_BLOCK_SIZE;
+  }
+}
+// end RTC play code
+
+// Builds a histogram of the per-16x16 variance of the difference between
+// cpi->Source and cpi->Last_Source (luma only) and derives
+// cpi->source_var_thresh from it.
+//
+// Each 16x16 result is stored in the matching entry of cpi->source_diff_var.
+// cpi->source_var_thresh is reset to 0 and set to a non-zero value only when
+// a histogram bin is found at which the cumulative count exceeds the cutoff.
+//
+// Returns 0 when a threshold was found, sf->search_type_check_frequency
+// otherwise, and -1 for an unexpected bit depth in high-bitdepth builds.
+static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+
+  const int src_stride = cpi->Source->y_stride;
+  const int last_stride = cpi->Last_Source->y_stride;
+  const uint8_t *src = cpi->Source->y_buffer;
+  const uint8_t *last_src = cpi->Last_Source->y_buffer;
+
+  // The cutoff is a percentage of the macroblock count; the percentage
+  // depends on whether the smaller frame dimension reaches 720.
+  const int cutoff = (VPXMIN(cm->width, cm->height) >= 720) ?
+      (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) :
+      (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100);
+  DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]);
+  diff *blk = cpi->source_diff_var;
+
+  int running_total = 0;
+  int mb_row, mb_col, bin;
+
+  memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0]));
+
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!cm->use_highbitdepth) {
+        vpx_get16x16var(src, src_stride, last_src, last_stride,
+                        &blk->sse, &blk->sum);
+      } else {
+        switch (cm->bit_depth) {
+          case VPX_BITS_8:
+            vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride,
+                                     &blk->sse, &blk->sum);
+            break;
+          case VPX_BITS_10:
+            vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
+                                      &blk->sse, &blk->sum);
+            break;
+          case VPX_BITS_12:
+            vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
+                                      &blk->sse, &blk->sum);
+            break;
+          default:
+            assert(0 && "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10"
+                   " or VPX_BITS_12");
+            return -1;
+        }
+      }
+#else
+      vpx_get16x16var(src, src_stride, last_src, last_stride,
+                      &blk->sse, &blk->sum);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      // variance = sse - sum^2 / 256 (256 samples per 16x16 block).
+      blk->var = blk->sse -
+          (((uint32_t)blk->sum * blk->sum) >> 8);
+
+      // Everything at or above the background limit lands in the last bin.
+      if (blk->var < VAR_HIST_MAX_BG_VAR)
+        hist[blk->var / VAR_HIST_FACTOR]++;
+      else
+        hist[VAR_HIST_BINS - 1]++;
+
+      // Step to the next 16x16 block in this row.
+      src += 16;
+      last_src += 16;
+      blk++;
+    }
+
+    // Rewind to the row start and move down 16 lines.
+    src += 16 * src_stride - cm->mb_cols * 16;
+    last_src += 16 * last_stride - cm->mb_cols * 16;
+  }
+
+  cpi->source_var_thresh = 0;
+
+  // Too many blocks in the top bin: no threshold is derived.
+  if (hist[VAR_HIST_BINS - 1] >= cutoff)
+    return sf->search_type_check_frequency;
+
+  // Find the first bin at which the cumulative count passes the cutoff.
+  for (bin = 0; bin < VAR_HIST_BINS - 1; bin++) {
+    running_total += hist[bin];
+
+    if (running_total > cutoff) {
+      cpi->source_var_thresh = (bin + 1) * VAR_HIST_FACTOR;
+      return 0;
+    }
+  }
+
+  return sf->search_type_check_frequency;
+}
+
+// Selects sf->partition_search_type for the current frame when the source
+// variance based partition method is in use.
+//
+// Key frames use SEARCH_PARTITION and intra-only frames use FIXED_PARTITION.
+// For all other frames the per-16x16 variance buffer is reallocated when the
+// frame size changed, the variance threshold is re-evaluated once the
+// countdown in cpi->frames_till_next_var_check has reached zero, and
+// FIXED_PARTITION is used while that countdown is still positive.
+static void source_var_based_partition_search_method(VP9_COMP *cpi) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  VP9_COMMON *const cm = &cpi->common;
+
+  if (cm->frame_type == KEY_FRAME) {
+    // For key frame, use SEARCH_PARTITION.
+    sf->partition_search_type = SEARCH_PARTITION;
+    return;
+  }
+
+  if (cm->intra_only) {
+    sf->partition_search_type = FIXED_PARTITION;
+    return;
+  }
+
+  // The variance buffer is sized by cm->MBs, so rebuild it on a size change.
+  if (!(cm->last_width == cm->width && cm->last_height == cm->height)) {
+    if (cpi->source_diff_var)
+      vpx_free(cpi->source_diff_var);
+
+    CHECK_MEM_ERROR(cm, cpi->source_diff_var,
+                    vpx_calloc(cm->MBs, sizeof(diff)));
+  }
+
+  // Countdown expired: recompute the threshold and restart the countdown.
+  if (cpi->frames_till_next_var_check == 0)
+    cpi->frames_till_next_var_check = set_var_thresh_from_histogram(cpi);
+
+  // While counting down, stay on the fixed partition.
+  if (cpi->frames_till_next_var_check > 0) {
+    sf->partition_search_type = FIXED_PARTITION;
+    cpi->frames_till_next_var_check--;
+  }
+}
+
+// Returns 1 when the frame is a shown, non-key frame whose inter count
+// (summed over all intra/inter contexts in td->counts) is more than four
+// times its intra count; returns 0 otherwise.
+static int get_skip_encode_frame(const VP9_COMMON *cm, ThreadData *const td) {
+  unsigned int num_intra = 0;
+  unsigned int num_inter = 0;
+  int ctx = 0;
+
+  // Total the intra ([0]) and inter ([1]) counters across all contexts.
+  while (ctx < INTRA_INTER_CONTEXTS) {
+    num_intra += td->counts->intra_inter[ctx][0];
+    num_inter += td->counts->intra_inter[ctx][1];
+    ++ctx;
+  }
+
+  if (cm->frame_type == KEY_FRAME)
+    return 0;
+  if (!cm->show_frame)
+    return 0;
+
+  return (num_intra << 2) < num_inter;
+}
+
+// Prepares cpi->tile_data and the per-tile token pointers for the current
+// tile layout.
+//
+// The tile_data array is (re)allocated when it is missing or holds fewer
+// entries than the current tile count; freshly allocated entries get their
+// thresh_freq_fact set to 32 and mode_map set to the identity mapping.
+// Afterwards every tile's TileInfo is initialised and cpi->tile_tok[][] is
+// laid out so that each tile's token pointer follows the previous tile's
+// pointer by that tile's allocated_tokens() count.
+void vp9_init_tile_data(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int num_tiles = tile_cols * tile_rows;
+  TOKENEXTRA *tok_cursor = cpi->tile_tok[0][0];
+  int prev_tile_tokens = 0;
+  int row, col;
+
+  if (cpi->tile_data == NULL || cpi->allocated_tiles < num_tiles) {
+    int t;
+
+    if (cpi->tile_data != NULL)
+      vpx_free(cpi->tile_data);
+    CHECK_MEM_ERROR(cm, cpi->tile_data,
+        vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+    cpi->allocated_tiles = num_tiles;
+
+    // Seed the adaptive tables of every newly allocated tile.
+    for (t = 0; t < num_tiles; ++t) {
+      TileDataEnc *const tile = &cpi->tile_data[t];
+      int bs, mode;
+
+      for (bs = 0; bs < BLOCK_SIZES; ++bs) {
+        for (mode = 0; mode < MAX_MODES; ++mode) {
+          tile->thresh_freq_fact[bs][mode] = 32;
+          tile->mode_map[bs][mode] = mode;
+        }
+      }
+    }
+  }
+
+  for (row = 0; row < tile_rows; ++row) {
+    for (col = 0; col < tile_cols; ++col) {
+      TileInfo *const info =
+          &cpi->tile_data[row * tile_cols + col].tile_info;
+
+      vp9_tile_init(info, cm, row, col);
+
+      // This tile's tokens start right after the previous tile's allocation.
+      tok_cursor += prev_tile_tokens;
+      cpi->tile_tok[row][col] = tok_cursor;
+      prev_tile_tokens = allocated_tokens(*info);
+    }
+  }
+}
+
+// Encodes every superblock row of the tile at (tile_row, tile_col) using the
+// thread data in td, then records how many tokens the tile produced in
+// cpi->tok_count[tile_row][tile_col].
+//
+// Each row goes through the non-RD or the RD row encoder depending on
+// cpi->sf.use_nonrd_pick_mode.
+void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td,
+                     int tile_row, int tile_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int num_tile_cols = 1 << cm->log2_tile_cols;
+  TileDataEnc *tile =
+      &cpi->tile_data[tile_row * num_tile_cols + tile_col];
+  const TileInfo *const bounds = &tile->tile_info;
+  TOKENEXTRA *tok_cursor = cpi->tile_tok[tile_row][tile_col];
+  int mi_row = bounds->mi_row_start;
+
+  // Set up pointers to per thread motion search counters.
+  td->mb.m_search_count_ptr = &td->rd_counts.m_search_count;
+  td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
+
+  while (mi_row < bounds->mi_row_end) {
+    if (cpi->sf.use_nonrd_pick_mode) {
+      encode_nonrd_sb_row(cpi, td, tile, mi_row, &tok_cursor);
+    } else {
+      encode_rd_sb_row(cpi, td, tile, mi_row, &tok_cursor);
+    }
+    mi_row += MI_BLOCK_SIZE;
+  }
+
+  // Tokens written = distance the cursor moved from the tile's start.
+  cpi->tok_count[tile_row][tile_col] =
+      (unsigned int)(tok_cursor - cpi->tile_tok[tile_row][tile_col]);
+  assert(tok_cursor - cpi->tile_tok[tile_row][tile_col] <=
+      allocated_tokens(*bounds));
+}
+
+// Single-threaded tile encoding: initialises the tile data, then encodes all
+// tiles in raster order (row by row, left to right) with cpi->td.
+static void encode_tiles(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int num_tiles = tile_cols * tile_rows;
+  int t;
+
+  vp9_init_tile_data(cpi);
+
+  // A flat raster index maps to (row, col) as (t / cols, t % cols).
+  for (t = 0; t < num_tiles; ++t) {
+    vp9_encode_tile(cpi, &cpi->td, t / tile_cols, t % tile_cols);
+  }
+}
+
+#if CONFIG_FP_MB_STATS
+// Locates the first-pass per-macroblock stats of the current frame.
+//
+// The stats are addressed as one byte per macroblock, cm->MBs bytes per
+// frame, starting at firstpass_mb_stats->mb_stats_start. On success
+// *this_frame_mb_stats is set to the first byte for frame
+// cm->current_video_frame and 1 is returned. If that position lies beyond
+// firstpass_mb_stats->mb_stats_end, EOF is returned and
+// *this_frame_mb_stats is left untouched.
+// NOTE(review): the bound check only rejects positions strictly past
+// mb_stats_end; whether mb_stats_end is inclusive should be confirmed
+// against the code that fills the buffer.
+static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
+                            VP9_COMMON *cm, uint8_t **this_frame_mb_stats) {
+  // Do the offset arithmetic in size_t so the frame-index * MBs product
+  // cannot wrap in a narrower integer type before it is applied to the
+  // pointer.
+  const size_t frame_offset =
+      (size_t)cm->current_video_frame * (size_t)cm->MBs * sizeof(uint8_t);
+  uint8_t *mb_stats_in = firstpass_mb_stats->mb_stats_start + frame_offset;
+
+  if (mb_stats_in > firstpass_mb_stats->mb_stats_end)
+    return EOF;
+
+  *this_frame_mb_stats = mb_stats_in;
+
+  return 1;
+}
+#endif
+
+// Runs one encoding pass over the current frame.
+//
+// The function resets the per-frame counters, derives the lossless flag from
+// the quantizer settings and selects the matching 4x4 transform functions,
+// picks the transform mode, initialises quantizer / RD / motion-estimation
+// constants, decides whether previous-frame motion vectors may be used, does
+// extra setup for the non-RD path, and finally encodes all tiles (multi- or
+// single-threaded) while accumulating the elapsed time in
+// cpi->time_encode_sb_row. sf->skip_encode_frame is updated at the end.
+static void encode_frame_internal(VP9_COMP *cpi) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  ThreadData *const td = &cpi->td;
+  MACROBLOCK *const x = &td->mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  RD_COUNTS *const rdc = &cpi->td.rd_counts;
+
+  // Point the macroblock descriptor at the start of the visible mode-info
+  // grid.
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
+
+  // Clear the statistics gathered while encoding this frame.
+  vp9_zero(*td->counts);
+  vp9_zero(rdc->coef_counts);
+  vp9_zero(rdc->comp_pred_diff);
+  vp9_zero(rdc->filter_diff);
+  rdc->m_search_count = 0;   // Count of motion search hits.
+  rdc->ex_search_count = 0;  // Exhaustive mesh search hits.
+
+
+  // Lossless requires a zero base qindex and zero delta-q values.
+  xd->lossless = cm->base_qindex == 0 &&
+                 cm->y_dc_delta_q == 0 &&
+                 cm->uv_dc_delta_q == 0 &&
+                 cm->uv_ac_delta_q == 0;
+
+  // Lossless uses the Walsh-Hadamard 4x4 transforms, otherwise the DCT ones.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth)
+    x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4;
+  else
+    x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+  x->highbd_itxm_add = xd->lossless ? vp9_highbd_iwht4x4_add :
+                                      vp9_highbd_idct4x4_add;
+#else
+  x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+
+  // x->optimize is forced off in lossless mode.
+  if (xd->lossless)
+    x->optimize = 0;
+
+  cm->tx_mode = select_tx_mode(cpi, xd);
+
+  vp9_frame_init_quantizer(cpi);
+
+  vp9_initialize_rd_consts(cpi);
+  vp9_initialize_me_consts(cpi, x, cm->base_qindex);
+  init_encode_frame_mb_context(cpi);
+  // Previous-frame MVs are usable only without error resilience, with an
+  // unchanged frame size, on a non-intra-only frame, and when the last frame
+  // was shown.
+  cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+                           cm->width == cm->last_width &&
+                           cm->height == cm->last_height &&
+                           !cm->intra_only &&
+                           cm->last_show_frame;
+  // Special case: set prev_mi to NULL when the previous mode info
+  // context cannot be used.
+  cm->prev_mi = cm->use_prev_frame_mvs ?
+                cm->prev_mip + cm->mi_stride + 1 : NULL;
+
+  x->quant_fp = cpi->sf.use_quant_fp;
+  vp9_zero(x->skip_txfm);
+  if (sf->use_nonrd_pick_mode) {
+    // Initialize internal buffer pointers for rtc coding, where non-RD
+    // mode decision is used and hence no buffer pointer swap needed.
+    int i;
+    struct macroblock_plane *const p = x->plane;
+    struct macroblockd_plane *const pd = xd->plane;
+    PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
+
+    for (i = 0; i < MAX_MB_PLANE; ++i) {
+      p[i].coeff = ctx->coeff_pbuf[i][0];
+      p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+      pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+      p[i].eobs = ctx->eobs_pbuf[i][0];
+    }
+    vp9_zero(x->zcoeff_blk);
+
+    // Clear the golden reference flag on non-key frames when
+    // frames_since_golden is 0 and SVC is not in use.
+    if (cm->frame_type != KEY_FRAME &&
+        cpi->rc.frames_since_golden == 0 &&
+        !cpi->use_svc)
+      cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
+
+    // This call may overwrite sf->partition_search_type for this frame.
+    if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION)
+      source_var_based_partition_search_method(cpi);
+  }
+
+  {
+    // Time the tile encoding and add it to cpi->time_encode_sb_row.
+    struct vpx_usec_timer emr_timer;
+    vpx_usec_timer_start(&emr_timer);
+
+#if CONFIG_FP_MB_STATS
+  // Fetch this frame's first-pass macroblock stats; the return value of
+  // input_fpmb_stats() is ignored here.
+  if (cpi->use_fp_mb_stats) {
+    input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm,
+                     &cpi->twopass.this_frame_mb_stats);
+  }
+#endif
+
+    // If allowed, encoding tiles in parallel with one thread handling one tile.
+    if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
+      vp9_encode_tiles_mt(cpi);
+    else
+      encode_tiles(cpi);
+
+    vpx_usec_timer_mark(&emr_timer);
+    cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
+  }
+
+  // The skip decision is evaluated only when sf->skip_encode_sb is set.
+  sf->skip_encode_frame = sf->skip_encode_sb ?
+      get_skip_encode_frame(cm, td) : 0;
+
+#if 0
+  // Keep record of the total distortion this time around for future use
+  cpi->last_frame_distortion = cpi->frame_distortion;
+#endif
+}
+
+// Frame-level filter choice; EIGHTTAP_SMOOTH is never picked for alt-ref.
+static INTERP_FILTER get_interp_filter(
+    const int64_t threshes[SWITCHABLE_FILTER_CONTEXTS], int is_alt_ref) {
+  const int64_t regular = threshes[EIGHTTAP];
+  const int64_t smooth = threshes[EIGHTTAP_SMOOTH];
+  const int64_t sharp = threshes[EIGHTTAP_SHARP];
+  const int64_t switchable = threshes[SWITCHABLE - 1];
+  if (!is_alt_ref && smooth > regular && smooth > sharp &&
+      smooth > switchable)
+    return EIGHTTAP_SMOOTH;
+  if (sharp > regular && sharp > switchable)
+    return EIGHTTAP_SHARP;
+  if (regular > switchable)
+    return EIGHTTAP;
+  return SWITCHABLE;
+}
+
+void vp9_encode_frame(VP9_COMP *cpi) {  // frame-level mode setup + encode
+  VP9_COMMON *const cm = &cpi->common;
+
+  // In the longer term the encoder should be generalized to match the
+  // decoder such that we allow compound where one of the 3 buffers has a
+  // different sign bias and that buffer is then the fixed ref. However, this
+  // requires further work in the rd loop. For now the only supported encoder
+  // side behavior is where the ALT ref buffer has opposite sign bias to
+  // the other two.
+  if (!frame_is_intra_only(cm)) {
+    if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==
+             cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
+        (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
+             cm->ref_frame_sign_bias[LAST_FRAME])) {
+      cpi->allow_comp_inter_inter = 0;
+    } else {
+      cpi->allow_comp_inter_inter = 1;
+      cm->comp_fixed_ref = ALTREF_FRAME;
+      cm->comp_var_ref[0] = LAST_FRAME;
+      cm->comp_var_ref[1] = GOLDEN_FRAME;
+    }
+  }
+
+  if (cpi->sf.frame_parameter_update) {
+    int i;
+    RD_OPT *const rd_opt = &cpi->rd;
+    FRAME_COUNTS *counts = cpi->td.counts;
+    RD_COUNTS *const rdc = &cpi->td.rd_counts;
+
+    // This code does a single RD pass over the whole frame assuming
+    // either compound, single or hybrid prediction as per whatever has
+    // worked best for that type of frame in the past.
+    // It also predicts whether another coding mode would have worked
+    // better than this coding mode. If that is the case, it remembers
+    // that for subsequent frames.
+    // It does the same analysis for transform size selection also.
+    const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
+    int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
+    int64_t *const filter_thrs = rd_opt->filter_threshes[frame_type];
+    const int is_alt_ref = frame_type == ALTREF_FRAME;
+
+    /* prediction (compound, single or hybrid) mode selection */
+    if (is_alt_ref || !cpi->allow_comp_inter_inter)
+      cm->reference_mode = SINGLE_REFERENCE;
+    else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
+             mode_thrs[COMPOUND_REFERENCE] >
+                 mode_thrs[REFERENCE_MODE_SELECT] &&
+             check_dual_ref_flags(cpi) &&
+             cpi->static_mb_pct == 100)
+      cm->reference_mode = COMPOUND_REFERENCE;
+    else if (mode_thrs[SINGLE_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT])
+      cm->reference_mode = SINGLE_REFERENCE;
+    else
+      cm->reference_mode = REFERENCE_MODE_SELECT;
+
+    if (cm->interp_filter == SWITCHABLE)
+      cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
+
+    encode_frame_internal(cpi);
+
+    for (i = 0; i < REFERENCE_MODES; ++i)  // blend in this frame's diff
+      mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
+
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)  // same for filters
+      filter_thrs[i] = (filter_thrs[i] + rdc->filter_diff[i] / cm->MBs) / 2;
+
+    if (cm->reference_mode == REFERENCE_MODE_SELECT) {  // drop unused mode
+      int single_count_zero = 0;
+      int comp_count_zero = 0;
+
+      for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
+        single_count_zero += counts->comp_inter[i][0];
+        comp_count_zero += counts->comp_inter[i][1];
+      }
+
+      if (comp_count_zero == 0) {
+        cm->reference_mode = SINGLE_REFERENCE;
+        vp9_zero(counts->comp_inter);
+      } else if (single_count_zero == 0) {
+        cm->reference_mode = COMPOUND_REFERENCE;
+        vp9_zero(counts->comp_inter);
+      }
+    }
+
+    if (cm->tx_mode == TX_MODE_SELECT) {  // narrow tx_mode if counts allow
+      int count4x4 = 0;
+      int count8x8_lp = 0, count8x8_8x8p = 0;
+      int count16x16_16x16p = 0, count16x16_lp = 0;
+      int count32x32 = 0;
+
+      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+        count4x4 += counts->tx.p32x32[i][TX_4X4];
+        count4x4 += counts->tx.p16x16[i][TX_4X4];
+        count4x4 += counts->tx.p8x8[i][TX_4X4];
+
+        count8x8_lp += counts->tx.p32x32[i][TX_8X8];
+        count8x8_lp += counts->tx.p16x16[i][TX_8X8];
+        count8x8_8x8p += counts->tx.p8x8[i][TX_8X8];
+
+        count16x16_16x16p += counts->tx.p16x16[i][TX_16X16];
+        count16x16_lp += counts->tx.p32x32[i][TX_16X16];
+        count32x32 += counts->tx.p32x32[i][TX_32X32];
+      }
+      if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
+          count32x32 == 0) {
+        cm->tx_mode = ALLOW_8X8;
+        reset_skip_tx_size(cm, TX_8X8);
+      } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
+                 count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
+        cm->tx_mode = ONLY_4X4;
+        reset_skip_tx_size(cm, TX_4X4);
+      } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
+        cm->tx_mode = ALLOW_32X32;
+      } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
+        cm->tx_mode = ALLOW_16X16;
+        reset_skip_tx_size(cm, TX_16X16);
+      }
+    }
+  } else {  // frame_parameter_update off: always single reference
+    cm->reference_mode = SINGLE_REFERENCE;
+    encode_frame_internal(cpi);
+  }
+}
+
+// Accumulates the intra luma/chroma mode choices of one block into counts.
+static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) {
+  const BLOCK_SIZE block_size = mi->sb_type;
+
+  if (block_size >= BLOCK_8X8) {
+    ++counts->y_mode[size_group_lookup[block_size]][mi->mode];
+  } else {
+    // Sub-8x8: count one luma mode per stepped position of the 2x2 grid.
+    const int step_x = num_4x4_blocks_wide_lookup[block_size];
+    const int step_y = num_4x4_blocks_high_lookup[block_size];
+    int row, col;
+    for (row = 0; row < 2; row += step_y)
+      for (col = 0; col < 2; col += step_x)
+        ++counts->y_mode[0][mi->bmi[2 * row + col].as_mode];
+  }
+
+  ++counts->uv_mode[mi->mode][mi->uv_mode];
+}
+
+static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
+                              TOKENEXTRA **t, int output_enabled,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              PICK_MODE_CONTEXT *ctx) {  // predict/encode/tokenize
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO **mi_8x8 = xd->mi;
+  MODE_INFO *mi = mi_8x8[0];
+  const int seg_skip = segfeature_active(&cm->seg, mi->segment_id,
+                                         SEG_LVL_SKIP);
+  const int mis = cm->mi_stride;
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+
+  x->skip_recode = !x->select_tx_size && mi->sb_type >= BLOCK_8X8 &&
+                   cpi->oxcf.aq_mode != COMPLEXITY_AQ &&
+                   cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ &&
+                   cpi->sf.allow_skip_recode;
+
+  if (!x->skip_recode && !cpi->sf.use_nonrd_pick_mode)
+    memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+
+  x->skip_optimize = ctx->is_coded;
+  ctx->is_coded = 1;
+  x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
+  x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
+                    x->q_index < QIDX_SKIP_THRESH);
+
+  if (x->skip_encode)
+    return;  // skip_encode set above: no prediction, coding or tokens
+
+  if (!is_inter_block(mi)) {
+    int plane;
+    mi->skip = 1;
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+      vp9_encode_intra_block_plane(x, VPXMAX(bsize, BLOCK_8X8), plane);
+    if (output_enabled)
+      sum_intra_stats(td->counts, mi);
+    vp9_tokenize_sb(cpi, td, t, !output_enabled, VPXMAX(bsize, BLOCK_8X8));
+  } else {
+    int ref;
+    const int is_compound = has_second_ref(mi);
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
+      YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
+                                                     mi->ref_frame[ref]);
+      assert(cfg != NULL);
+      vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+                           &xd->block_refs[ref]->sf);
+    }
+    if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip)
+      vp9_build_inter_predictors_sby(xd, mi_row, mi_col,
+                                     VPXMAX(bsize, BLOCK_8X8));
+
+    vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col,
+                                    VPXMAX(bsize, BLOCK_8X8));
+
+    vp9_encode_sb(x, VPXMAX(bsize, BLOCK_8X8));
+    vp9_tokenize_sb(cpi, td, t, !output_enabled, VPXMAX(bsize, BLOCK_8X8));
+  }
+
+  if (output_enabled) {  // bookkeeping: tx counts and stored tx_size
+    if (cm->tx_mode == TX_MODE_SELECT &&
+        mi->sb_type >= BLOCK_8X8  &&
+        !(is_inter_block(mi) && (mi->skip || seg_skip))) {
+      ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd),
+                      &td->counts->tx)[mi->tx_size];
+    } else {
+      int x, y;  // NOTE: shadows MACROBLOCK *x within this else-block
+      TX_SIZE tx_size;
+      // The new intra coding scheme requires no change of transform size
+      if (is_inter_block(mi)) {
+        tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+                         max_txsize_lookup[bsize]);
+      } else {
+        tx_size = (bsize >= BLOCK_8X8) ? mi->tx_size : TX_4X4;
+      }
+
+      for (y = 0; y < mi_height; y++)  // stamp tx_size on in-frame mi units
+        for (x = 0; x < mi_width; x++)
+          if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
+            mi_8x8[mis * y + x]->tx_size = tx_size;
+    }
+    ++td->counts->tx.tx_totals[mi->tx_size];
+    ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])];
+    if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize);
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_encodeframe.h b/libs/libvpx/vp9/encoder/vp9_encodeframe.h
new file mode 100644
index 0000000000..6aaa56463b
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_encodeframe.h
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_
+#define VP9_ENCODER_VP9_ENCODEFRAME_H_
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct macroblock;
+struct yv12_buffer_config;
+struct VP9_COMP;
+struct ThreadData;
+
+// Constants used in SOURCE_VAR_BASED_PARTITION.
+#define VAR_HIST_MAX_BG_VAR 1000
+#define VAR_HIST_FACTOR 10
+#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1)  // = 101
+#define VAR_HIST_LARGE_CUT_OFF 75
+#define VAR_HIST_SMALL_CUT_OFF 45
+
+void vp9_setup_src_planes(struct macroblock *x,
+                          const struct yv12_buffer_config *src,
+                          int mi_row, int mi_col);
+
+void vp9_encode_frame(struct VP9_COMP *cpi);
+
+void vp9_init_tile_data(struct VP9_COMP *cpi);
+void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td,
+                     int tile_row, int tile_col);
+
+void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_ENCODEFRAME_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_encodemb.c b/libs/libvpx/vp9/encoder/vp9_encodemb.c
new file mode 100644
index 0000000000..689e8c0d9c
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_encodemb.c
@@ -0,0 +1,977 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
+
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_tokenize.h"
+
+struct optimize_ctx {  // per-plane entropy contexts, 16 entries each
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];  // presumably "above" - TODO confirm
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];  // presumably "left" - TODO confirm
+};
+
+void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+  struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
+                              p->src.stride, pd->dst.buf, pd->dst.stride,
+                              x->e_mbd.bd);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+                     pd->dst.buf, pd->dst.stride);
+}
+
+#define RDTRUNC(RM, DM, R, D)                        \
+  (((1 << (VP9_PROB_COST_SHIFT - 1)) + (R) * (RM)) & \
+   ((1 << VP9_PROB_COST_SHIFT) - 1))  // low bits of R * RM + half; DM, D unused
+
+typedef struct vp9_token_state {  // one trellis node, see optimize_b()
+  int           rate;   // accumulated rate from this node onward
+  int           error;  // accumulated squared error from this node onward
+  int           next;   // scan index of the following trellis node
+  int16_t       token;  // token recorded for this node
+  int16_t       qc;     // quantized coefficient value for this node
+} vp9_token_state;
+
+// TODO(jimbankoski): experiment to find optimal RD numbers.
+static const int plane_rd_mult[PLANE_TYPES] = { 4, 2 };  // rdmult scale per type
+
+#define UPDATE_RD_COST()\
+{\
+  rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
+  rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
+  if (rd_cost0 == rd_cost1) {\
+    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
+    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
+  }\
+}  // uses the caller's locals; on an RDCOST tie, RDTRUNC values are compared
+
+// Context for idx + 1 assuming `token` at idx (cache is restored on return).
+// Placeholder: may ultimately need to scan previous tokens instead.
+static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb,
+                                     int idx, int token, uint8_t *token_cache) {
+  uint8_t *const slot = &token_cache[scan[idx]];
+  const int saved = *slot;
+  int context;
+  *slot = vp9_pt_energy_class[token];
+  context = get_coef_context(nb, token_cache, idx + 1);
+  *slot = saved;
+  return context;
+}
+
+static int optimize_b(MACROBLOCK *mb, int plane, int block,
+                      TX_SIZE tx_size, int ctx) {  // returns the updated eob
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  struct macroblock_plane *const p = &mb->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ref = is_inter_block(xd->mi[0]);
+  vp9_token_state tokens[1025][2];
+  unsigned best_index[1025][2];
+  uint8_t token_cache[1024];
+  const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const int eob = p->eobs[block];
+  const PLANE_TYPE type = get_plane_type(plane);
+  const int default_eob = 16 << (tx_size << 1);  // coefficient count for tx_size
+  const int mul = 1 + (tx_size == TX_32X32);  // 2 for TX_32X32, otherwise 1
+  const int16_t *dequant_ptr = pd->dequant;
+  const uint8_t *const band_translate = get_band_translate(tx_size);
+  const scan_order *const so = get_scan(xd, tx_size, type, block);
+  const int16_t *const scan = so->scan;
+  const int16_t *const nb = so->neighbors;
+  int next = eob, sz = 0;
+  int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
+  int64_t rd_cost0, rd_cost1;
+  int rate0, rate1, error0, error1;
+  int16_t t0, t1;
+  EXTRABIT e0;
+  int best, band, pt, i, final_eob;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+  const int *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
+
+  assert((!type && !plane) || (type && plane));
+  assert(eob <= default_eob);
+
+  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+  if (!ref)
+    rdmult = (rdmult * 9) >> 4;  // non-inter block: rdmult scaled by 9/16
+
+  /* Initialize the sentinel node of the trellis. */
+  tokens[eob][0].rate = 0;
+  tokens[eob][0].error = 0;
+  tokens[eob][0].next = default_eob;
+  tokens[eob][0].token = EOB_TOKEN;
+  tokens[eob][0].qc = 0;
+  tokens[eob][1] = tokens[eob][0];
+
+  for (i = 0; i < eob; i++)
+    token_cache[scan[i]] =
+        vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])];
+
+  for (i = eob; i-- > 0;) {  // visits i = eob - 1 down to 0
+    int base_bits, d2, dx;
+    const int rc = scan[i];
+    int x = qcoeff[rc];
+    /* Only add a trellis state for non-zero coefficients. */
+    if (x) {
+      int shortcut = 0;
+      error0 = tokens[next][0].error;
+      error1 = tokens[next][1].error;
+      /* Evaluate the first possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+      vp9_get_token_extra(x, &t0, &e0);
+      /* Consider both possible successor states. */
+      if (next < default_eob) {
+        band = band_translate[i + 1];
+        pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
+        rate0 += mb->token_costs[tx_size][type][ref][band][0][pt]
+                                [tokens[next][0].token];
+        rate1 += mb->token_costs[tx_size][type][ref][band][0][pt]
+                                [tokens[next][1].token];
+      }
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
+      dx = mul * (dqcoeff[rc] - coeff[rc]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        dx >>= xd->bd - 8;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      d2 = dx * dx;
+      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][0].error = d2 + (best ? error1 : error0);
+      tokens[i][0].next = next;
+      tokens[i][0].token = t0;
+      tokens[i][0].qc = x;
+      best_index[i][0] = best;
+
+      /* Evaluate the second possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+
+      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
+          (abs(x) * dequant_ptr[rc != 0] < abs(coeff[rc]) * mul +
+                                               dequant_ptr[rc != 0]))
+        shortcut = 1;
+      else
+        shortcut = 0;
+
+      if (shortcut) {
+        sz = -(x < 0);  // -1 when x is negative, otherwise 0
+        x -= 2 * sz + 1;  // moves x one step toward zero
+      }
+
+      /* Consider both possible successor states. */
+      if (!x) {
+        /* If we reduced this coefficient to zero, check to see if
+         *  we need to move the EOB back here.
+         */
+        t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+        t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+        e0 = 0;
+      } else {
+        vp9_get_token_extra(x, &t0, &e0);
+        t1 = t0;
+      }
+      if (next < default_eob) {
+        band = band_translate[i + 1];
+        if (t0 != EOB_TOKEN) {
+          pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
+          rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
+                                  [tokens[next][0].token];
+        }
+        if (t1 != EOB_TOKEN) {
+          pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
+          rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
+                                  [tokens[next][1].token];
+        }
+      }
+
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
+
+      if (shortcut) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
+        } else {
+          dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+        }
+#else
+        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        d2 = dx * dx;
+      }
+      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][1].error = d2 + (best ? error1 : error0);
+      tokens[i][1].next = next;
+      tokens[i][1].token = best ? t1 : t0;
+      tokens[i][1].qc = x;
+      best_index[i][1] = best;
+      /* Finally, make this the new head of the trellis. */
+      next = i;
+    } else {
+      /* There's no choice to make for a zero coefficient, so we don't
+       *  add a new trellis node, but we do need to update the costs.
+       */
+      band = band_translate[i + 1];
+      t0 = tokens[next][0].token;
+      t1 = tokens[next][1].token;
+      /* Update the cost of each path if we're past the EOB token. */
+      if (t0 != EOB_TOKEN) {
+        tokens[next][0].rate +=
+            mb->token_costs[tx_size][type][ref][band][1][0][t0];
+        tokens[next][0].token = ZERO_TOKEN;
+      }
+      if (t1 != EOB_TOKEN) {
+        tokens[next][1].rate +=
+            mb->token_costs[tx_size][type][ref][band][1][0][t1];
+        tokens[next][1].token = ZERO_TOKEN;
+      }
+      best_index[i][0] = best_index[i][1] = 0;
+      /* Don't update next, because we didn't add a new node. */
+    }
+  }
+
+  /* Now pick the best path through the whole trellis. */
+  band = band_translate[i + 1];  // i == -1 after the loop, so index 0
+  rate0 = tokens[next][0].rate;
+  rate1 = tokens[next][1].rate;
+  error0 = tokens[next][0].error;
+  error1 = tokens[next][1].error;
+  t0 = tokens[next][0].token;
+  t1 = tokens[next][1].token;
+  rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0];
+  rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1];
+  UPDATE_RD_COST();
+  best = rd_cost1 < rd_cost0;
+  final_eob = -1;
+  memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
+  memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
+  for (i = next; i < eob; i = next) {
+    const int x = tokens[i][best].qc;
+    const int rc = scan[i];
+    if (x) {
+      final_eob = i;
+    }
+
+    qcoeff[rc] = x;
+    dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
+
+    next = tokens[i][best].next;
+    best = best_index[i][best];
+  }
+  final_eob++;  // last non-zero scan index -> coefficient count
+
+  mb->plane[plane].eobs[block] = final_eob;
+  return final_eob;
+}
+
+// 32x32 forward DCT; a non-zero rd_transform selects the "_rd" variant.
+static INLINE void fdct32x32(int rd_transform, const int16_t *src,
+                             tran_low_t *dst, int src_stride) {
+  if (rd_transform == 0)
+    vpx_fdct32x32(src, dst, src_stride);
+  else
+    vpx_fdct32x32_rd(src, dst, src_stride);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
+                                    tran_low_t *dst, int src_stride) {
+  if (rd_transform == 0)  // zero selects the plain high-bitdepth transform
+    vpx_highbd_fdct32x32(src, dst, src_stride);
+  else  // any other value selects the "_rd" variant
+    vpx_highbd_fdct32x32_rd(src, dst, src_stride);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
+                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  int i, j;  // block position filled in by txfrm_block_to_raster_xy()
+  const int16_t *src_diff;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                     p->round_fp, p->quant_fp, p->quant_shift,
+                                     qcoeff, dqcoeff, pd->dequant,
+                                     eob, scan_order->scan,
+                                     scan_order->iscan);
+        break;
+      case TX_16X16:
+        vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:
+        vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                               pd->dequant, eob,
+                               scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);
+    }
+    return;  // high-bitdepth buffer fully handled above
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  switch (tx_size) {
+    case TX_32X32:
+      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+      vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, eob, scan_order->scan,
+                            scan_order->iscan);
+      break;
+    case TX_16X16:
+      vpx_fdct16x16(src_diff, coeff, diff_stride);
+      vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    case TX_8X8:  // 8x8 uses one fused transform+quantize call
+      vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
+                        x->skip_block, p->zbin, p->round_fp,
+                        p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                        pd->dequant, eob,
+                        scan_order->scan, scan_order->iscan);
+      break;
+    case TX_4X4:
+      x->fwd_txm4x4(src_diff, coeff, diff_stride);
+      vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                      pd->dequant, eob,
+                      scan_order->scan, scan_order->iscan);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  int i, j;  // block position filled in by txfrm_block_to_raster_xy()
+  const int16_t *src_diff;
+
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
+                                     p->quant_fp[0], qcoeff, dqcoeff,
+                                     pd->dequant[0], eob);
+        break;
+      case TX_16X16:
+        vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      case TX_8X8:
+        vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
+                               p->quant_fp[0], qcoeff, dqcoeff,
+                               pd->dequant[0], eob);
+        break;
+      default:
+        assert(0);
+    }
+    return;  // high-bitdepth buffer fully handled above
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  switch (tx_size) {  // every case ends with a vpx_quantize_dc* call
+    case TX_32X32:
+      vpx_fdct32x32_1(src_diff, coeff, diff_stride);
+      vpx_quantize_dc_32x32(coeff, x->skip_block, p->round,
+                            p->quant_fp[0], qcoeff, dqcoeff,
+                            pd->dequant[0], eob);
+      break;
+    case TX_16X16:
+      vpx_fdct16x16_1(src_diff, coeff, diff_stride);
+      vpx_quantize_dc(coeff, 256, x->skip_block, p->round,
+                     p->quant_fp[0], qcoeff, dqcoeff,
+                     pd->dequant[0], eob);
+      break;
+    case TX_8X8:
+      vpx_fdct8x8_1(src_diff, coeff, diff_stride);
+      vpx_quantize_dc(coeff, 64, x->skip_block, p->round,
+                      p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
+      break;
+    case TX_4X4:
+      x->fwd_txm4x4(src_diff, coeff, diff_stride);
+      vpx_quantize_dc(coeff, 16, x->skip_block, p->round,
+                      p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+// Forward-transforms one src_diff block and quantizes it (vpx_quantize_b*).
+void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
+                     BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  int i, j;  // filled in below; scaled by 4 to index src_diff
+  const int16_t *src_diff;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {  // highbd kernels
+     switch (tx_size) {
+      case TX_32X32:
+        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                    p->round, p->quant, p->quant_shift, qcoeff,
+                                    dqcoeff, pd->dequant, eob,
+                                    scan_order->scan, scan_order->iscan);
+        break;
+      case TX_16X16:
+        vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:
+        vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // Reached when the high-bitdepth branch is compiled out or not taken.
+  switch (tx_size) {
+    case TX_32X32:
+      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+      vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+                           p->quant, p->quant_shift, qcoeff, dqcoeff,
+                           pd->dequant, eob, scan_order->scan,
+                           scan_order->iscan);
+      break;
+    case TX_16X16:
+      vpx_fdct16x16(src_diff, coeff, diff_stride);
+      vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, eob,
+                     scan_order->scan, scan_order->iscan);
+      break;
+    case TX_8X8:
+      vpx_fdct8x8(src_diff, coeff, diff_stride);
+      vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, eob,
+                     scan_order->scan, scan_order->iscan);
+      break;
+    case TX_4X4:
+      x->fwd_txm4x4(src_diff, coeff, diff_stride);
+      vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, eob,
+                     scan_order->scan, scan_order->iscan);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+// Per-block callback: transform + quantize, then inverse-transform into dst.
+static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
+                         TX_SIZE tx_size, void *arg) {
+  struct encode_b_args *const args = arg;  // arg is a struct encode_b_args *
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx *const ctx = args->ctx;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  int i, j;
+  uint8_t *dst;
+  ENTROPY_CONTEXT *a, *l;  // entries of ctx->ta / ctx->tl for this block
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
+  a = &ctx->ta[plane][i];
+  l = &ctx->tl[plane][j];
+
+  // TODO(jingning): per transformed block zero forcing only enabled for
+  // luma component. will integrate chroma components as well.
+  if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+    p->eobs[block] = 0;
+    *a = *l = 0;
+    return;
+  }
+  // With skip_recode set, no transform runs and the stored eob is used below.
+  if (!x->skip_recode) {
+    if (x->quant_fp) {
+      // Encoding process for rtc mode
+      if (x->skip_txfm[0] == SKIP_TXFM_AC_DC && plane == 0) {
+        // skip forward transform
+        p->eobs[block] = 0;
+        *a = *l = 0;
+        return;
+      } else {
+        vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+      }
+    } else {
+      if (max_txsize_lookup[plane_bsize] == tx_size) {
+        int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
+        if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) {
+          // full forward transform and quantization
+          vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+        } else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) {
+          // fast path forward transform and quantization
+          vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+        } else {
+          // skip forward transform
+          p->eobs[block] = 0;
+          *a = *l = 0;
+          return;
+        }
+      } else {
+        vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+      }
+    }
+  }
+  // Set both entropy contexts to 0/1 from optimize_b()'s result or the eob.
+  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
+    const int ctx = combine_entropy_contexts(*a, *l);  // shadows outer ctx
+    *a = *l = optimize_b(x, plane, block, tx_size, ctx) > 0;
+  } else {
+    *a = *l = p->eobs[block] > 0;
+  }
+  // Any nonzero eob clears the caller's skip flag.
+  if (p->eobs[block])
+    *(args->skip) = 0;
+  // Nothing to add back when skip_encode is set or the block has eob == 0.
+  if (x->skip_encode || p->eobs[block] == 0)
+    return;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride,
+                                 p->eobs[block], xd->bd);
+        break;
+      case TX_16X16:
+        vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride,
+                                 p->eobs[block], xd->bd);
+        break;
+      case TX_8X8:
+        vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride,
+                               p->eobs[block], xd->bd);
+        break;
+      case TX_4X4:
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride,
+                           p->eobs[block], xd->bd);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // Non-highbd path: the *_add inverse transforms below operate on dst.
+  switch (tx_size) {
+    case TX_32X32:
+      vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+      break;
+    case TX_16X16:
+      vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+      break;
+    case TX_8X8:
+      vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+      break;
+    case TX_4X4:
+      // this is like vp9_short_idct4x4 but has a special case around eob<=1
+      // which is significant (not just an optimization) for the lossless
+      // case.
+      x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+      break;
+    default:
+      assert(0 && "Invalid transform size");
+      break;
+  }
+}
+// Pass-1 callback: always runs vp9_xform_quant, then x->itxm_add (or highbd).
+static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
+                               TX_SIZE tx_size, void *arg) {
+  MACROBLOCK *const x = (MACROBLOCK *)arg;  // arg is the MACROBLOCK itself
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  int i, j;
+  uint8_t *dst;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
+
+  vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+  // Only blocks with a nonzero eob are inverse-transformed into dst.
+  if (p->eobs[block] > 0) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+       x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd);
+       return;
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+  }
+}
+// Pass-1 encode of plane 0: subtract, then encode_block_pass1 on each block.
+void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const mbd = &x->e_mbd;
+  vp9_subtract_plane(x, bsize, 0);
+  vp9_foreach_transformed_block_in_plane(mbd, bsize, 0, encode_block_pass1, x);
+}
+// Encodes all planes of a block; mi->skip stays 1 unless some eob is nonzero.
+void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  MODE_INFO *mi = xd->mi[0];
+  struct encode_b_args arg = {x, &ctx, &mi->skip};
+  int plane;
+  // Assume skip; encode_block() clears it through arg.skip on a nonzero eob.
+  mi->skip = 1;
+
+  if (x->skip)
+    return;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    if (!x->skip_recode)
+      vp9_subtract_plane(x, bsize, plane);
+    // Same condition encode_block() uses before calling optimize_b().
+    if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
+      const struct macroblockd_plane* const pd = &xd->plane[plane];
+      const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size;
+      vp9_get_entropy_contexts(bsize, tx_size, pd,
+                               ctx.ta[plane], ctx.tl[plane]);
+    }
+
+    vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
+                                           &arg);
+  }
+}
+// Intra callback: predict, subtract, transform, quantize, inverse-transform.
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+                            TX_SIZE tx_size, void *arg) {
+  struct encode_b_args* const args = arg;  // only x and skip are read here
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mi[0];
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const scan_order *scan_order;
+  TX_TYPE tx_type = DCT_DCT;  // stays DCT_DCT for TX_32X32
+  PREDICTION_MODE mode;
+  const int bwl = b_width_log2_lookup[plane_bsize];
+  const int diff_stride = 4 * (1 << bwl);
+  uint8_t *src, *dst;
+  int16_t *src_diff;
+  uint16_t *eob = &p->eobs[block];
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  int i, j;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  dst = &pd->dst.buf[4 * (j * dst_stride + i)];
+  src = &p->src.buf[4 * (j * src_stride + i)];
+  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+  // Pick the prediction mode, transform type and scan order for this size.
+  if (tx_size == TX_4X4) {
+    tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block);
+    scan_order = &vp9_scan_orders[TX_4X4][tx_type];
+    mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mi->uv_mode;
+  } else {
+    mode = plane == 0 ? mi->mode : mi->uv_mode;
+    if (tx_size == TX_32X32) {
+      scan_order = &vp9_default_scan_orders[TX_32X32];
+    } else {
+      tx_type = get_tx_type(get_plane_type(plane), xd);
+      scan_order = &vp9_scan_orders[tx_size][tx_type];
+    }
+  }
+  // With skip_encode set, src (not dst) is passed as the first buffer/stride.
+  vp9_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst,
+                          x->skip_encode ? src_stride : dst_stride,
+                          dst, dst_stride, i, j, plane);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    switch (tx_size) {
+      case TX_32X32:
+        if (!x->skip_recode) {
+          vpx_highbd_subtract_block(32, 32, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+          vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                      p->round, p->quant, p->quant_shift,
+                                      qcoeff, dqcoeff, pd->dequant, eob,
+                                      scan_order->scan, scan_order->iscan);
+        }
+        if (!x->skip_encode && *eob) {
+          vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+        }
+        break;
+      case TX_16X16:
+        if (!x->skip_recode) {
+          vpx_highbd_subtract_block(16, 16, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          if (tx_type == DCT_DCT)
+            vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+          else
+            vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
+          vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                                p->quant, p->quant_shift, qcoeff, dqcoeff,
+                                pd->dequant, eob,
+                                scan_order->scan, scan_order->iscan);
+        }
+        if (!x->skip_encode && *eob) {
+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride,
+                                  *eob, xd->bd);
+        }
+        break;
+      case TX_8X8:
+        if (!x->skip_recode) {
+          vpx_highbd_subtract_block(8, 8, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          if (tx_type == DCT_DCT)
+            vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+          else
+            vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
+          vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                                p->quant, p->quant_shift, qcoeff, dqcoeff,
+                                pd->dequant, eob,
+                                scan_order->scan, scan_order->iscan);
+        }
+        if (!x->skip_encode && *eob) {
+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob,
+                                xd->bd);
+        }
+        break;
+      case TX_4X4:
+        if (!x->skip_recode) {
+          vpx_highbd_subtract_block(4, 4, src_diff, diff_stride,
+                                    src, src_stride, dst, dst_stride, xd->bd);
+          if (tx_type != DCT_DCT)
+            vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
+          else
+            x->fwd_txm4x4(src_diff, coeff, diff_stride);
+          vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                                p->quant, p->quant_shift, qcoeff, dqcoeff,
+                                pd->dequant, eob,
+                                scan_order->scan, scan_order->iscan);
+        }
+
+        if (!x->skip_encode && *eob) {
+          if (tx_type == DCT_DCT) {
+            // this is like vp9_short_idct4x4 but has a special case around
+            // eob<=1 which is significant (not just an optimization) for the
+            // lossless case.
+            x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+          } else {
+            vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd);
+          }
+        }
+        break;
+      default:
+        assert(0);
+        return;
+    }
+    if (*eob)
+      *(args->skip) = 0;
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // Non-highbd path: same per-size sequence using the non-highbd kernels.
+  switch (tx_size) {
+    case TX_32X32:
+      if (!x->skip_recode) {
+        vpx_subtract_block(32, 32, src_diff, diff_stride,
+                           src, src_stride, dst, dst_stride);
+        fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+                             p->quant, p->quant_shift, qcoeff, dqcoeff,
+                             pd->dequant, eob, scan_order->scan,
+                             scan_order->iscan);
+      }
+      if (!x->skip_encode && *eob)
+        vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob);
+      break;
+    case TX_16X16:
+      if (!x->skip_recode) {
+        vpx_subtract_block(16, 16, src_diff, diff_stride,
+                           src, src_stride, dst, dst_stride);
+        vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
+        vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                       p->quant, p->quant_shift, qcoeff, dqcoeff,
+                       pd->dequant, eob, scan_order->scan,
+                       scan_order->iscan);
+      }
+      if (!x->skip_encode && *eob)
+        vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob);
+      break;
+    case TX_8X8:
+      if (!x->skip_recode) {
+        vpx_subtract_block(8, 8, src_diff, diff_stride,
+                           src, src_stride, dst, dst_stride);
+        vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
+        vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
+                       p->quant_shift, qcoeff, dqcoeff,
+                       pd->dequant, eob, scan_order->scan,
+                       scan_order->iscan);
+      }
+      if (!x->skip_encode && *eob)
+        vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob);
+      break;
+    case TX_4X4:
+      if (!x->skip_recode) {
+        vpx_subtract_block(4, 4, src_diff, diff_stride,
+                           src, src_stride, dst, dst_stride);
+        if (tx_type != DCT_DCT)
+          vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
+        else
+          x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
+                       p->quant_shift, qcoeff, dqcoeff,
+                       pd->dequant, eob, scan_order->scan,
+                       scan_order->iscan);
+      }
+
+      if (!x->skip_encode && *eob) {
+        if (tx_type == DCT_DCT)
+          // this is like vp9_short_idct4x4 but has a special case around eob<=1
+          // which is significant (not just an optimization) for the lossless
+          // case.
+          x->itxm_add(dqcoeff, dst, dst_stride, *eob);
+        else
+          vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type);
+      }
+      break;
+    default:
+      assert(0);
+      break;
+  }
+  if (*eob)  // a nonzero eob clears the caller's skip flag
+    *(args->skip) = 0;
+}
+// Runs vp9_encode_block_intra over every transform block of one plane.
+void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+  const MACROBLOCKD *const mbd = &x->e_mbd;
+  // No optimize_ctx is supplied; skip points at the current mode info's flag.
+  struct encode_b_args intra_args = {x, NULL, &mbd->mi[0]->skip};
+  vp9_foreach_transformed_block_in_plane(
+      mbd, bsize, plane, vp9_encode_block_intra, &intra_args);
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_encodemb.h b/libs/libvpx/vp9/encoder/vp9_encodemb.h
new file mode 100644
index 0000000000..97df8a66be
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_encodemb.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_ENCODEMB_H_
+#define VP9_ENCODER_VP9_ENCODEMB_H_
+
+#include "./vpx_config.h"
+#include "vp9/encoder/vp9_block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Argument bundle handed to the per-transform-block encode callbacks.
+struct encode_b_args {
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;  // NULL when set up for the intra callback
+  int8_t *skip;  // the callbacks store 0 here when a block has nonzero eob
+};
+void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);  // all planes
+void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);  // plane 0 only
+void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
+                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
+                     BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+// NOTE(review): presumably fills the plane's src_diff; confirm at definition.
+void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+// Per-transform-block callback; arg must point to a struct encode_b_args.
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+                            TX_SIZE tx_size, void *arg);
+// Runs vp9_encode_block_intra over the transform blocks of one plane.
+void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_ENCODEMB_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_encodemv.c b/libs/libvpx/vp9/encoder/vp9_encodemv.c
new file mode 100644
index 0000000000..8f4d80cbbd
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_encodemv.c
@@ -0,0 +1,269 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropymode.h"
+
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_encodemv.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+
+static struct vp9_token mv_joint_encodings[MV_JOINTS];
+static struct vp9_token mv_class_encodings[MV_CLASSES];
+static struct vp9_token mv_fp_encodings[MV_FP_SIZE];
+static struct vp9_token mv_class0_encodings[CLASS0_SIZE];
+// Fills the four static MV token tables above from their coding trees.
+void vp9_entropy_mv_init(void) {
+  vp9_tokens_from_tree(mv_joint_encodings, vp9_mv_joint_tree);
+  vp9_tokens_from_tree(mv_class_encodings, vp9_mv_class_tree);
+  vp9_tokens_from_tree(mv_class0_encodings, vp9_mv_class0_tree);
+  vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree);
+}
+// Writes one nonzero MV difference: sign, class, integer bits, fraction, hp.
+static void encode_mv_component(vpx_writer* w, int comp,
+                                const nmv_component* mvcomp, int usehp) {
+  int offset;
+  const int sign = comp < 0;
+  const int mag = sign ? -comp : comp;
+  const int mv_class = vp9_get_mv_class(mag - 1, &offset);  // sets offset
+  const int d = offset >> 3;                // int mv data
+  const int fr = (offset >> 1) & 3;         // fractional mv data
+  const int hp = offset & 1;                // high precision mv data
+
+  assert(comp != 0);  // callers must not pass a zero component
+
+  // Sign
+  vpx_write(w, sign, mvcomp->sign);
+
+  // Class
+  vp9_write_token(w, vp9_mv_class_tree, mvcomp->classes,
+                  &mv_class_encodings[mv_class]);
+
+  // Integer bits
+  if (mv_class == MV_CLASS_0) {
+    vp9_write_token(w, vp9_mv_class0_tree, mvcomp->class0,
+                    &mv_class0_encodings[d]);
+  } else {
+    int i;
+    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
+    for (i = 0; i < n; ++i)  // least significant bit of d first
+      vpx_write(w, (d >> i) & 1, mvcomp->bits[i]);
+  }
+
+  // Fractional bits
+  vp9_write_token(w, vp9_mv_fp_tree,
+                  mv_class == MV_CLASS_0 ?  mvcomp->class0_fp[d] : mvcomp->fp,
+                  &mv_fp_encodings[fr]);
+
+  // High precision bit
+  if (usehp)
+    vpx_write(w, hp,
+              mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp);
+}
+
+// Fills mvcost[-MV_MAX..MV_MAX] with the cost of each component value.
+static void build_nmv_component_cost_table(int *mvcost,
+                                           const nmv_component* const mvcomp,
+                                           int usehp) {
+  int i, v;
+  int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+  int bits_cost[MV_OFFSET_BITS][2];
+  int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
+  int class0_hp_cost[2], hp_cost[2];  // only initialized when usehp is set
+  // Precompute per-symbol costs from the component's probabilities.
+  sign_cost[0] = vp9_cost_zero(mvcomp->sign);
+  sign_cost[1] = vp9_cost_one(mvcomp->sign);
+  vp9_cost_tokens(class_cost, mvcomp->classes, vp9_mv_class_tree);
+  vp9_cost_tokens(class0_cost, mvcomp->class0, vp9_mv_class0_tree);
+  for (i = 0; i < MV_OFFSET_BITS; ++i) {
+    bits_cost[i][0] = vp9_cost_zero(mvcomp->bits[i]);
+    bits_cost[i][1] = vp9_cost_one(mvcomp->bits[i]);
+  }
+
+  for (i = 0; i < CLASS0_SIZE; ++i)
+    vp9_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], vp9_mv_fp_tree);
+  vp9_cost_tokens(fp_cost, mvcomp->fp, vp9_mv_fp_tree);
+
+  if (usehp) {
+    class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
+    class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
+    hp_cost[0] = vp9_cost_zero(mvcomp->hp);
+    hp_cost[1] = vp9_cost_one(mvcomp->hp);
+  }
+  mvcost[0] = 0;
+  for (v = 1; v <= MV_MAX; ++v) {  // same field split as encode_mv_component
+    int z, c, o, d, e, f, cost = 0;
+    z = v - 1;
+    c = vp9_get_mv_class(z, &o);
+    cost += class_cost[c];
+    d = (o >> 3);               /* int mv data */
+    f = (o >> 1) & 3;           /* fractional pel mv data */
+    e = (o & 1);                /* high precision mv data */
+    if (c == MV_CLASS_0) {
+      cost += class0_cost[d];
+    } else {
+      int i, b;
+      b = c + CLASS0_BITS - 1;  /* number of bits */
+      for (i = 0; i < b; ++i)
+        cost += bits_cost[i][((d >> i) & 1)];
+    }
+    if (c == MV_CLASS_0) {
+      cost += class0_fp_cost[d][f];
+    } else {
+      cost += fp_cost[f];
+    }
+    if (usehp) {
+      if (c == MV_CLASS_0) {
+        cost += class0_hp_cost[e];
+      } else {
+        cost += hp_cost[e];
+      }
+    }
+    mvcost[v] = cost + sign_cost[0];
+    mvcost[-v] = cost + sign_cost[1];  // negative index: mid-table pointer
+  }
+}
+// Writes an update flag for *cur_p and, when set, a 7-bit replacement value.
+static int update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
+                     vpx_prob upd_p) {
+  const vpx_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;  // low bit set
+  const int update = cost_branch256(ct, *cur_p) + vp9_cost_zero(upd_p) >
+                     cost_branch256(ct, new_p) + vp9_cost_one(upd_p) +
+                         (7 << VP9_PROB_COST_SHIFT);
+  vpx_write(w, update, upd_p);
+  if (update) {
+    *cur_p = new_p;
+    vpx_write_literal(w, new_p >> 1, 7);  // only the upper 7 bits are written
+  }
+  return update;
+}
+// Calls update_mv() on each of the n - 1 probabilities of a tree.
+static void write_mv_update(const vpx_tree_index *tree,
+                            vpx_prob probs[/*n - 1*/],
+                            const unsigned int counts[/*n - 1*/],
+                            int n, vpx_writer *w) {
+  int i;
+  unsigned int branch_ct[32][2];
+
+  // Assuming max number of probabilities <= 32
+  assert(n <= 32);
+  // Turn the counts into per-branch count pairs in branch_ct.
+  vp9_tree_probs_from_distribution(tree, branch_ct, counts);
+  for (i = 0; i < n - 1; ++i)
+    update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
+}
+// Writes MV probability updates to w; accepted ones update cm->fc->nmvc.
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w,
+                         nmv_context_counts *const counts) {
+  int i, j;
+  nmv_context *const mvc = &cm->fc->nmvc;
+
+  write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
+  // First pass over both components: sign, classes, class0 and integer bits.
+  for (i = 0; i < 2; ++i) {
+    nmv_component *comp = &mvc->comps[i];
+    nmv_component_counts *comp_counts = &counts->comps[i];
+
+    update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB);
+    write_mv_update(vp9_mv_class_tree, comp->classes, comp_counts->classes,
+                    MV_CLASSES, w);
+    write_mv_update(vp9_mv_class0_tree, comp->class0, comp_counts->class0,
+                    CLASS0_SIZE, w);
+    for (j = 0; j < MV_OFFSET_BITS; ++j)
+      update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB);
+  }
+  // Second pass over both components: fractional probabilities.
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j)
+      write_mv_update(vp9_mv_fp_tree, mvc->comps[i].class0_fp[j],
+                      counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
+
+    write_mv_update(vp9_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
+                    MV_FP_SIZE, w);
+  }
+  // High-precision probabilities are only written when usehp is nonzero.
+  if (usehp) {
+    for (i = 0; i < 2; ++i) {
+      update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp,
+                MV_UPDATE_PROB);
+      update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB);
+    }
+  }
+}
+// Codes mv - ref: the joint token, then the row/col components it selects.
+void vp9_encode_mv(VP9_COMP* cpi, vpx_writer* w,
+                   const MV* mv, const MV* ref,
+                   const nmv_context* mvctx, int usehp) {
+  const MV diff = {mv->row - ref->row,
+                   mv->col - ref->col};
+  const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff);
+  usehp = usehp && use_mv_hp(ref);  // hp also depends on the reference mv
+
+  vp9_write_token(w, vp9_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
+  if (mv_joint_vertical(j))
+    encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
+
+  if (mv_joint_horizontal(j))
+    encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
+
+  // If auto_mv_step_size is enabled then keep track of the largest
+  // motion vector component used.
+  if (cpi->sf.mv.auto_mv_step_size) {
+    unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3;
+    cpi->max_mv_magnitude = VPXMAX(maxv, cpi->max_mv_magnitude);
+  }
+}
+// Builds the joint cost table and both component cost tables from ctx.
+void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+                              const nmv_context* ctx, int usehp) {
+  vp9_cost_tokens(mvjoint, ctx->joints, vp9_mv_joint_tree);
+  build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp);
+  build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp);
+}
+// Adds the (mv - reference mv) difference of each used reference to counts.
+static void inc_mvs(const MODE_INFO *mi, const MB_MODE_INFO_EXT *mbmi_ext,
+                    const int_mv mvs[2],
+                    nmv_context_counts *counts) {
+  int ref_idx;
+  for (ref_idx = 0; ref_idx < 1 + has_second_ref(mi); ++ref_idx) {
+    const MV *const cur = &mvs[ref_idx].as_mv;
+    const MV *const pred =
+        &mbmi_ext->ref_mvs[mi->ref_frame[ref_idx]][0].as_mv;
+    const MV delta = {cur->row - pred->row, cur->col - pred->col};
+    vp9_inc_mv(&delta, counts);
+  }
+}
+// Adds the current block's NEWMV differences to td->counts->mv.
+void vp9_update_mv_count(ThreadData *td) {
+  const MACROBLOCKD *xd = &td->mb.e_mbd;
+  const MODE_INFO *mi = xd->mi[0];
+  const MB_MODE_INFO_EXT *mbmi_ext = td->mb.mbmi_ext;
+  // Below BLOCK_8X8 the mode and mvs are read per entry of mi->bmi[].
+  if (mi->sb_type < BLOCK_8X8) {
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[mi->sb_type];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[mi->sb_type];
+    int idx, idy;
+    // Step by the sub-block size across the 2x2 grid of bmi entries.
+    for (idy = 0; idy < 2; idy += num_4x4_h) {
+      for (idx = 0; idx < 2; idx += num_4x4_w) {
+        const int i = idy * 2 + idx;
+        if (mi->bmi[i].as_mode == NEWMV)
+          inc_mvs(mi, mbmi_ext, mi->bmi[i].as_mv, &td->counts->mv);
+      }
+    }
+  } else {
+    if (mi->mode == NEWMV)
+      inc_mvs(mi, mbmi_ext, mi->mv, &td->counts->mv);
+  }
+}
+
diff --git a/libs/libvpx/vp9/encoder/vp9_encodemv.h b/libs/libvpx/vp9/encoder/vp9_encodemv.h
new file mode 100644
index 0000000000..5fb114cc1e
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_encodemv.h
@@ -0,0 +1,38 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_ENCODEMV_H_
+#define VP9_ENCODER_VP9_ENCODEMV_H_
+
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Builds the static MV token tables that vp9_encode_mv() reads.
+void vp9_entropy_mv_init(void);
+// Writes MV probability updates to w; accepted ones update cm->fc->nmvc.
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w,
+                         nmv_context_counts *const counts);
+// Codes mv - ref; may raise cpi->max_mv_magnitude (auto_mv_step_size).
+void vp9_encode_mv(VP9_COMP *cpi, vpx_writer* w, const MV* mv, const MV* ref,
+                   const nmv_context* mvctx, int usehp);
+// Each mvcost[k] is indexed from -MV_MAX to MV_MAX: pass mid-table pointers.
+void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+                              const nmv_context* mvctx, int usehp);
+// Adds the current block's NEWMV differences to td->counts->mv.
+void vp9_update_mv_count(ThreadData *td);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_ENCODEMV_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_encoder.c b/libs/libvpx/vp9/encoder/vp9_encoder.c
new file mode 100644
index 0000000000..0f4f93d9c6
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_encoder.c
@@ -0,0 +1,4933 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_INTERNAL_STATS
+#include "vpx_dsp/ssim.h"
+#endif
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_ports/vpx_timer.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_idct.h"
+#if CONFIG_VP9_POSTPROC
+#include "vp9/common/vp9_postproc.h"
+#endif
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/encoder/vp9_aq_360.h"
+#include "vp9/encoder/vp9_aq_complexity.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_aq_variance.h"
+#include "vp9/encoder/vp9_bitstream.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mbgraph.h"
+#include "vp9/encoder/vp9_noise_estimate.h"
+#include "vp9/encoder/vp9_picklpf.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_resize.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+#include "vp9/encoder/vp9_speed_features.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+
+#define AM_SEGMENT_ID_INACTIVE 7  // Segment id marking inactive active-map blocks.
+#define AM_SEGMENT_ID_ACTIVE 0    // Segment id marking active active-map blocks.
+
+#define ALTREF_HIGH_PRECISION_MV 1      // Whether to use high precision mv
+                                         //  for altref computation.
+#define HIGH_PRECISION_MV_QTHRESH 200   // Q threshold for high precision
+                                         // mv. Choose a very high value for
+                                         // now so that HIGH_PRECISION is always
+                                         // chosen.
+// #define OUTPUT_YUV_REC  (uncommenting this declares yuv_rec_file below)
+
+#ifdef OUTPUT_YUV_DENOISED
+FILE *yuv_denoised_file = NULL;
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+FILE *yuv_skinmap_file = NULL;
+#endif
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#endif
+
+#if 0  // Compiled out: these file handles are never declared.
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+// Maps a VPX_SCALING mode to its (*hr, *hs) pair, e.g. FOURFIVE -> (4, 5).
+static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
+  int ratio_num = 1;
+  int ratio_den = 1;
+  switch (mode) {
+    case NORMAL:
+      break;
+    case FOURFIVE:
+      ratio_num = 4;
+      ratio_den = 5;
+      break;
+    case THREEFIVE:
+      ratio_num = 3;
+      ratio_den = 5;
+      break;
+    case ONETWO:
+      ratio_den = 2;
+      break;
+    default:
+      assert(0);  // Unknown mode; (1, 1) is written when asserts are off.
+      break;
+  }
+  *hr = ratio_num;
+  *hs = ratio_den;
+}
+
+// Resets every inactive block in the segmentation map to the active segment.
+// Only inactive ids are rewritten, so other segment ids survive (no memset).
+static void suppress_active_map(VP9_COMP *cpi) {
+  unsigned char *const map = cpi->segmentation_map;
+  const int count = cpi->common.mi_rows * cpi->common.mi_cols;
+  int k;
+  if (!cpi->active_map.enabled && !cpi->active_map.update) return;
+  for (k = 0; k < count; ++k)
+    if (map[k] == AM_SEGMENT_ID_INACTIVE) map[k] = AM_SEGMENT_ID_ACTIVE;
+}
+
+// Applies a pending active-map update to the segmentation state of cpi.
+static void apply_active_map(VP9_COMP *cpi) {
+  struct segmentation *const seg = &cpi->common.seg;
+  unsigned char *const dst = cpi->segmentation_map;
+  const unsigned char *const src = cpi->active_map.map;
+  int k;
+  assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+  // Intra-only frames switch the map off and force the update path below.
+  if (frame_is_intra_only(&cpi->common)) {
+    cpi->active_map.enabled = 0;
+    cpi->active_map.update = 1;
+  }
+  if (!cpi->active_map.update) return;
+  if (!cpi->active_map.enabled) {
+    vp9_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+    vp9_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+    if (seg->enabled) {
+      seg->update_data = 1;
+      seg->update_map = 1;
+    }
+  } else {
+    const int count = cpi->common.mi_rows * cpi->common.mi_cols;
+    for (k = 0; k < count; ++k)
+      if (dst[k] == AM_SEGMENT_ID_ACTIVE) dst[k] = src[k];
+    vp9_enable_segmentation(seg);
+    vp9_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+    vp9_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+    // A value of -MAX_LOOP_FILTER makes the computed loop filter level zero
+    // regardless of the value of seg->abs_delta.
+    vp9_set_segdata(seg, AM_SEGMENT_ID_INACTIVE,
+                    SEG_LVL_ALT_LF, -MAX_LOOP_FILTER);
+  }
+  cpi->active_map.update = 0;
+}
+
+// Installs a caller-supplied active map given at 16x16 granularity, or turns
+// the active map off when new_map_16x16 is NULL.
+// Returns 0 on success and -1 if rows/cols differ from the frame's mb size.
+int vp9_set_active_map(VP9_COMP* cpi,
+                       unsigned char* new_map_16x16,
+                       int rows,
+                       int cols) {
+  unsigned char *dst = cpi->active_map.map;
+  const int mi_rows = cpi->common.mi_rows;
+  const int mi_cols = cpi->common.mi_cols;
+  int r, c;
+
+  if (rows != cpi->common.mb_rows || cols != cpi->common.mb_cols) return -1;
+
+  cpi->active_map.update = 1;
+  cpi->active_map.enabled = (new_map_16x16 != NULL);
+  if (new_map_16x16 == NULL) return 0;
+
+  // Each 16x16 entry is replicated onto the 8x8 units it covers.
+  for (r = 0; r < mi_rows; ++r) {
+    const unsigned char *const src_row = new_map_16x16 + (r >> 1) * cols;
+    for (c = 0; c < mi_cols; ++c, ++dst) {
+      *dst = src_row[c >> 1] ? AM_SEGMENT_ID_ACTIVE : AM_SEGMENT_ID_INACTIVE;
+    }
+  }
+
+  return 0;
+}
+
+// Exports the active map at 16x16 granularity into new_map_16x16.
+// Returns 0 on success, -1 on a NULL buffer or a rows/cols mismatch.
+int vp9_get_active_map(VP9_COMP* cpi,
+                       unsigned char* new_map_16x16,
+                       int rows,
+                       int cols) {
+  const unsigned char *seg = cpi->segmentation_map;
+  const int mi_rows = cpi->common.mi_rows;
+  const int mi_cols = cpi->common.mi_cols;
+  const int enabled = cpi->active_map.enabled;
+  int r, c;
+
+  if (rows != cpi->common.mb_rows || cols != cpi->common.mb_cols) return -1;
+  if (new_map_16x16 == NULL) return -1;
+  // With the map disabled every entry reads 1; otherwise start from 0 and
+  // OR in each 8x8 unit that is not inactive (cyclic refresh segments
+  // therefore count as active even though they are not AM_SEGMENT_ID_ACTIVE).
+  memset(new_map_16x16, !enabled, rows * cols);
+  if (!enabled) return 0;
+  for (r = 0; r < mi_rows; ++r) {
+    unsigned char *const out_row = new_map_16x16 + (r >> 1) * cols;
+    for (c = 0; c < mi_cols; ++c, ++seg)
+      out_row[c >> 1] |= (*seg != AM_SEGMENT_ID_INACTIVE);
+  }
+  return 0;
+}
+
+// Stores the flag and points the MV cost tables at the hp or regular set.
+void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  int use_hp;
+
+  cpi->common.allow_high_precision_mv = allow_high_precision_mv;
+  // Test the stored field, exactly as the assignment left it.
+  use_hp = cpi->common.allow_high_precision_mv != 0;
+  x->mvcost = use_hp ? x->nmvcost_hp : x->nmvcost;
+  x->mvsadcost = use_hp ? x->nmvsadcost_hp : x->nmvsadcost;
+}
+
+static void setup_frame(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  // Entropy context selection depends on the frame type. The decoder
+  // requires the default context (index 0) for keyframes and for inter
+  // frames with error_resilient_mode or intra_only set. Other inter frames
+  // use one of two contexts here: 1 for ALTREF frames, 0 for the rest.
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode)
+    vp9_setup_past_independence(cm);
+  else if (!cpi->use_svc)
+    cm->frame_context_idx = cpi->refresh_alt_ref_frame;
+
+  if (cm->frame_type != KEY_FRAME) {
+    // Non-key frames load the saved context, clear interp_filter_selected[0].
+    *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+    vp9_zero(cpi->interp_filter_selected[0]);
+    return;
+  }
+
+  // Key frames set the golden (unless two-pass SVC) and alt-ref refresh flags.
+  if (!is_two_pass_svc(cpi)) cpi->refresh_golden_frame = 1;
+  cpi->refresh_alt_ref_frame = 1;
+  vp9_zero(cpi->interp_filter_selected);
+}
+
+// Points the visible mode-info pointers past the border and clears buffers.
+static void vp9_enc_setup_mi(VP9_COMMON *cm) {
+  const int stride = cm->mi_stride;
+  const int rows = cm->mi_rows + 1;
+  int row;
+  cm->mi = cm->mip + stride + 1;
+  cm->prev_mi = cm->prev_mip + stride + 1;
+  cm->mi_grid_visible = cm->mi_grid_base + stride + 1;
+  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + stride + 1;
+
+  memset(cm->mip, 0, stride * rows * sizeof(*cm->mip));
+  memset(cm->mi_grid_base, 0, stride * rows * sizeof(*cm->mi_grid_base));
+  // prev_mip: only the top border row and the left border column are cleared.
+  memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * stride);
+  for (row = 1; row < rows; ++row)
+    memset(&cm->prev_mip[row * stride], 0, sizeof(*cm->prev_mip));
+}
+
+// Allocates the zero-filled mode-info arrays (mip, prev_mip) and pointer
+// grids (mi_grid_base, prev_mi_grid_base), each with mi_size entries.
+// Returns 0 on success, or 1 at the first failed allocation; buffers that
+// were already allocated are not released here.
+static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) {
+  const size_t grid_entry = sizeof(MODE_INFO*);
+
+  if ((cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip))) == NULL)
+    return 1;
+  if ((cm->prev_mip = vpx_calloc(mi_size, sizeof(*cm->prev_mip))) == NULL)
+    return 1;
+  cm->mi_alloc_size = mi_size;
+
+  cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, grid_entry);
+  if (cm->mi_grid_base == NULL) return 1;
+  cm->prev_mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, grid_entry);
+  return cm->prev_mi_grid_base == NULL;
+}
+
+// Frees both mode-info arrays and both grids, leaving the pointers NULL.
+static void vp9_enc_free_mi(VP9_COMMON *cm) {
+  vpx_free(cm->mip);
+  vpx_free(cm->prev_mip);
+  vpx_free(cm->mi_grid_base);
+  vpx_free(cm->prev_mi_grid_base);
+
+  cm->mip = cm->prev_mip = NULL;
+  cm->mi_grid_base = cm->prev_mi_grid_base = NULL;
+}
+
+// Swaps current and previous mode-info storage; current becomes previous.
+static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
+  MODE_INFO *const old_prev_mip = cm->prev_mip;
+  MODE_INFO **const old_prev_grid = cm->prev_mi_grid_base;
+  const int visible_offset = cm->mi_stride + 1;
+
+  cm->prev_mip = cm->mip;
+  cm->mip = old_prev_mip;
+  cm->prev_mi_grid_base = cm->mi_grid_base;
+  cm->mi_grid_base = old_prev_grid;
+  // Re-derive the pointers to the upper-left visible entries.
+  cm->mi = cm->mip + visible_offset;
+  cm->prev_mi = cm->prev_mip + visible_offset;
+  cm->mi_grid_visible = cm->mi_grid_base + visible_offset;
+  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + visible_offset;
+}
+
+// Runs the encoder's one-time global initialisation; later calls do nothing.
+void vp9_initialize_enc(void) {
+  static volatile int init_done = 0;
+  if (init_done) return;
+
+  vp9_rtcd();
+  vpx_dsp_rtcd();
+  vpx_scale_rtcd();
+  vp9_init_intra_predictors();
+  vp9_init_me_luts();
+  vp9_rc_init_minq_luts();
+  vp9_entropy_mv_init();
+  vp9_temporal_filter_init();
+  init_done = 1;
+}
+
+static void dealloc_compressor_data(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int i;
+  // Frees the compressor's buffers; most freed pointers are reset to NULL.
+  vpx_free(cpi->mbmi_ext_base);
+  cpi->mbmi_ext_base = NULL;
+
+  vpx_free(cpi->tile_data);
+  cpi->tile_data = NULL;
+
+  // Delete the segmentation map and its saved copy
+  vpx_free(cpi->segmentation_map);
+  cpi->segmentation_map = NULL;
+  vpx_free(cpi->coding_context.last_frame_seg_map_copy);
+  cpi->coding_context.last_frame_seg_map_copy = NULL;
+
+  vpx_free(cpi->nmvcosts[0]);
+  vpx_free(cpi->nmvcosts[1]);
+  cpi->nmvcosts[0] = NULL;
+  cpi->nmvcosts[1] = NULL;
+
+  vpx_free(cpi->nmvcosts_hp[0]);
+  vpx_free(cpi->nmvcosts_hp[1]);
+  cpi->nmvcosts_hp[0] = NULL;
+  cpi->nmvcosts_hp[1] = NULL;
+
+  vpx_free(cpi->nmvsadcosts[0]);
+  vpx_free(cpi->nmvsadcosts[1]);
+  cpi->nmvsadcosts[0] = NULL;
+  cpi->nmvsadcosts[1] = NULL;
+
+  vpx_free(cpi->nmvsadcosts_hp[0]);
+  vpx_free(cpi->nmvsadcosts_hp[1]);
+  cpi->nmvsadcosts_hp[0] = NULL;
+  cpi->nmvsadcosts_hp[1] = NULL;
+
+  vp9_cyclic_refresh_free(cpi->cyclic_refresh);
+  cpi->cyclic_refresh = NULL;
+
+  vpx_free(cpi->active_map.map);
+  cpi->active_map.map = NULL;
+
+  vp9_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_VP9_POSTPROC
+  vp9_free_postproc_buffers(cm);
+#endif
+  vp9_free_context_buffers(cm);
+
+  vpx_free_frame_buffer(&cpi->last_frame_uf);
+  vpx_free_frame_buffer(&cpi->scaled_source);
+  vpx_free_frame_buffer(&cpi->scaled_last_source);
+  vpx_free_frame_buffer(&cpi->alt_ref_buffer);
+  vp9_lookahead_destroy(cpi->lookahead);  // NOTE(review): pointer not NULLed.
+
+  vpx_free(cpi->tile_tok[0][0]);
+  cpi->tile_tok[0][0] = 0;
+
+  vp9_free_pc_tree(&cpi->td);
+
+  // Per spatial layer: drop the two-pass stats buffer and zero its size.
+  for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
+    LAYER_CONTEXT *const lc = &cpi->svc.layer_context[i];
+    vpx_free(lc->rc_twopass_stats_in.buf);
+    lc->rc_twopass_stats_in.buf = NULL;
+    lc->rc_twopass_stats_in.sz = 0;
+  }
+
+  if (cpi->source_diff_var != NULL) {
+    vpx_free(cpi->source_diff_var);
+    cpi->source_diff_var = NULL;
+  }
+
+  // Free each SVC scaled frame, then zero the whole array of descriptors.
+  for (i = 0; i < MAX_LAG_BUFFERS; ++i) {
+    vpx_free_frame_buffer(&cpi->svc.scaled_frames[i]);
+  }
+  memset(&cpi->svc.scaled_frames[0], 0,
+         MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
+
+  vpx_free_frame_buffer(&cpi->svc.empty_frame.img);
+  memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
+
+  vp9_free_svc_cyclic_refresh(cpi);
+}
+
+// Snapshots key encoder state into cpi->coding_context so that
+// restore_coding_context() can put it back later. The pair is intended for
+// a re-code loop in which the quantizer value changes between iterations.
+static void save_coding_context(VP9_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  VP9_COMMON *cm = &cpi->common;
+  int i;
+
+  // nmvjointcost plus the regular and hp nmvcosts tables (both components).
+  vp9_copy(cc->nmvjointcost,  cpi->td.mb.nmvjointcost);
+  for (i = 0; i < 2; ++i)
+    memcpy(cc->nmvcosts[i], cpi->nmvcosts[i],
+           MV_VALS * sizeof(*cpi->nmvcosts[i]));
+  for (i = 0; i < 2; ++i)
+    memcpy(cc->nmvcosts_hp[i], cpi->nmvcosts_hp[i],
+           MV_VALS * sizeof(*cpi->nmvcosts_hp[i]));
+
+  // Segment prediction probabilities and the last frame's segment map.
+  vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
+  memcpy(cc->last_frame_seg_map_copy, cm->last_frame_seg_map,
+         (cm->mi_rows * cm->mi_cols));
+
+  // Loop filter reference and mode deltas.
+  vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
+  vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
+
+  // Frame context (cm->fc).
+  cc->fc = *cm->fc;
+}
+
+// Puts back the state captured by the most recent save_coding_context():
+// MV cost tables, segmentation data, loop filter deltas and frame context.
+static void restore_coding_context(VP9_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  VP9_COMMON *cm = &cpi->common;
+  int i;
+
+  vp9_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost);
+  for (i = 0; i < 2; ++i)
+    memcpy(cpi->nmvcosts[i], cc->nmvcosts[i],
+           MV_VALS * sizeof(*cc->nmvcosts[i]));
+  for (i = 0; i < 2; ++i)
+    memcpy(cpi->nmvcosts_hp[i], cc->nmvcosts_hp[i],
+           MV_VALS * sizeof(*cc->nmvcosts_hp[i]));
+
+  // Segment prediction probabilities and the last frame's segment map.
+  vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
+  memcpy(cm->last_frame_seg_map, cc->last_frame_seg_map_copy,
+         (cm->mi_rows * cm->mi_cols));
+
+  vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
+  vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
+
+  // Frame context (cm->fc).
+  *cm->fc = cc->fc;
+}
+
+static void configure_static_seg_features(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  struct segmentation *const seg = &cm->seg;
+  // Handles three cases: key frame, alt-ref refresh, other segmented frames.
+  int high_q = (int)(rc->avg_q > 48.0);  // 1 when rc->avg_q is above 48.0
+  int qi_delta;
+
+  // Disable and clear down for KF
+  if (cm->frame_type == KEY_FRAME) {
+    // Clear down the global segmentation map
+    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    seg->update_map = 0;
+    seg->update_data = 0;
+    cpi->static_mb_pct = 0;
+
+    // Disable segmentation
+    vp9_disable_segmentation(seg);
+
+    // Clear down the segment features.
+    vp9_clearall_segfeatures(seg);
+  } else if (cpi->refresh_alt_ref_frame) {
+    // If this is an alt ref frame
+    // Clear down the global segmentation map
+    memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    seg->update_map = 0;
+    seg->update_data = 0;
+    cpi->static_mb_pct = 0;
+
+    // Disable segmentation and individual segment features by default
+    vp9_disable_segmentation(seg);
+    vp9_clearall_segfeatures(seg);
+
+    // Scan frames from current to arf frame.
+    // This function re-enables segmentation if appropriate.
+    vp9_update_mbgraph_stats(cpi);
+
+    // If segmentation was enabled set those features needed for the
+    // arf itself.
+    if (seg->enabled) {
+      seg->update_map = 1;
+      seg->update_data = 1;
+
+      qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875,
+                                    cm->bit_depth);
+      vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+      vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
+
+      vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+      vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+
+      // Where relevant assume segment data is delta data
+      seg->abs_delta = SEGMENT_DELTADATA;
+    }
+  } else if (seg->enabled) {
+    // All other frames if segmentation has been enabled
+
+    // First normal frame in a valid gf or alt ref group
+    if (rc->frames_since_golden == 0) {
+      // Set up segment features for normal frames in an arf group
+      if (rc->source_alt_ref_active) {
+        seg->update_map = 0;
+        seg->update_data = 1;
+        seg->abs_delta = SEGMENT_DELTADATA;
+
+        qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125,
+                                      cm->bit_depth);
+        vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
+        vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+
+        vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
+        vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+
+        // Segment coding disabled for compred testing
+        if (high_q || (cpi->static_mb_pct == 100)) {
+          vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+          vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+          vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+        }
+      } else {
+        // Disable segmentation and clear down features if alt ref
+        // is not active for this group
+
+        vp9_disable_segmentation(seg);
+
+        memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+
+        seg->update_map = 0;
+        seg->update_data = 0;
+
+        vp9_clearall_segfeatures(seg);
+      }
+    } else if (rc->is_src_frame_alt_ref) {
+      // Special case where we are coding over the top of a previous
+      // alt ref frame.
+      // Segment coding disabled for compred testing
+
+      // Enable ref frame features for segment 0 as well
+      vp9_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
+      vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+
+      // All mbs should use ALTREF_FRAME
+      vp9_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
+      vp9_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+      vp9_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
+      vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+
+      // Skip all MBs if high Q (0,0 mv and skip coeffs)
+      if (high_q) {
+        vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+        vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+      }
+      // Enable data update
+      seg->update_data = 1;
+    } else {
+      // All other frames.
+
+      // No updates.. leave things as they are.
+      seg->update_map = 0;
+      seg->update_data = 0;
+    }
+  }
+}
+
+// Copies each visible block's segment_id into cm->last_frame_seg_map.
+static void update_reference_segmentation_map(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO **grid_row = cm->mi_grid_visible;
+  uint8_t *map_row = cm->last_frame_seg_map;
+  int r, c;
+
+  // The grid advances by mi_stride per row; the map is packed at mi_cols.
+  for (r = 0; r < cm->mi_rows; ++r) {
+    for (c = 0; c < cm->mi_cols; ++c)
+      map_row[c] = grid_row[c]->segment_id;
+    grid_row += cm->mi_stride;
+    map_row += cm->mi_cols;
+  }
+}
+
+// Creates cpi->lookahead if it is still NULL and (re)allocates alt_ref_buffer.
+static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+  if (cpi->lookahead == NULL) {
+    cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
+                                        cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                        cm->use_highbitdepth,
+#endif
+                                        oxcf->lag_in_frames);
+    if (cpi->lookahead == NULL)
+      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                         "Failed to allocate lag buffers");
+  }
+  // TODO(agrange) Check if ARF is enabled and skip allocation if not.
+  if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer,
+                               oxcf->width, oxcf->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate altref buffer");
+}
+
+static void alloc_util_frame_buffers(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;  // All three use cm->width x cm->height.
+  if (vpx_realloc_frame_buffer(&cpi->last_frame_uf,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate last frame buffer");
+
+  if (vpx_realloc_frame_buffer(&cpi->scaled_source,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate scaled source buffer");
+
+  if (vpx_realloc_frame_buffer(&cpi->scaled_last_source,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate scaled last source buffer");
+}
+
+
+// Allocates a zeroed mbmi_ext_base array with mi_cols * mi_rows entries.
+// Returns 0 on success and 1 if the allocation fails.
+static int alloc_context_buffers_ext(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const int count = cm->mi_cols * cm->mi_rows;
+
+  cpi->mbmi_ext_base = vpx_calloc(count, sizeof(*cpi->mbmi_ext_base));
+
+  return cpi->mbmi_ext_base == NULL;
+}
+
+// Allocates context, mbmi_ext, token and pc-tree storage; failures are reported.
+static void alloc_compressor_data(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  if (vp9_alloc_context_buffers(cm, cm->width, cm->height))  // nonzero = failed
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate context buffers");
+  if (alloc_context_buffers_ext(cpi))  // returns 1 when vpx_calloc fails
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate extended context buffers");
+  vpx_free(cpi->tile_tok[0][0]);  // drop any earlier token buffer first
+  {
+    unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
+    CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+        vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
+  }
+  vp9_setup_pc_tree(&cpi->common, &cpi->td);
+}
+
+void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
+  cpi->framerate = !(framerate < 0.1) ? framerate : 30;  // below 0.1 -> 30
+  vp9_rc_update_framerate(cpi);
+}
+
+// Picks cm->log2_tile_cols / cm->log2_tile_rows from the encoder config.
+static void set_tile_limits(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int min_log2, max_log2;
+  int single_tile;
+
+  vp9_get_tile_n_bits(cm->mi_cols, &min_log2, &max_log2);
+
+  // Two-pass SVC that is encoding an empty frame, or that has more than one
+  // spatial layer, uses log2 values of 0 for both columns and rows.
+  single_tile = is_two_pass_svc(cpi) &&
+                (cpi->svc.encode_empty_frame_state == ENCODING ||
+                 cpi->svc.number_spatial_layers > 1);
+  cm->log2_tile_cols =
+      single_tile ? 0 : clamp(cpi->oxcf.tile_columns, min_log2, max_log2);
+  cm->log2_tile_rows = single_tile ? 0 : cpi->oxcf.tile_rows;
+}
+
+// Recomputes size-dependent state from cm->width and cm->height.
+static void update_frame_size(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+  vp9_set_mb_mi(cm, cm->width, cm->height);
+  vp9_init_context_buffers(cm);
+  vp9_init_macroblockd(cm, xd, NULL);
+  memset(cpi->mbmi_ext_base, 0,
+         cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
+  cpi->td.mb.mbmi_ext_base = cpi->mbmi_ext_base;
+
+  set_tile_limits(cpi);
+  // Only two-pass SVC reallocates the alt-ref buffer at the new size here.
+  if (!is_two_pass_svc(cpi)) return;
+  if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to reallocate alt_ref_buffer");
+}
+
+static void init_buffer_indices(VP9_COMP *cpi) {  // lst/gld/alt -> 0/1/2
+  cpi->alt_fb_idx = 2;
+  cpi->gld_fb_idx = 1;
+  cpi->lst_fb_idx = 0;
+}
+
+static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
+  VP9_COMMON *const cm = &cpi->common;
+  // Copies *oxcf into cpi, allocates compressor data, then applies the config.
+  cpi->oxcf = *oxcf;
+  cpi->framerate = oxcf->init_framerate;
+
+  cm->profile = oxcf->profile;
+  cm->bit_depth = oxcf->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+  cm->use_highbitdepth = oxcf->use_highbitdepth;
+#endif
+  cm->color_space = oxcf->color_space;
+  cm->color_range = oxcf->color_range;
+
+  cm->width = oxcf->width;
+  cm->height = oxcf->height;
+  alloc_compressor_data(cpi);  // Reads the cm->width / cm->height set above.
+
+  cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode;
+
+  // Single thread case: use counts in common.
+  cpi->td.counts = &cm->counts;
+
+  // Spatial scalability.
+  cpi->svc.number_spatial_layers = oxcf->ss_number_layers;
+  // Temporal scalability.
+  cpi->svc.number_temporal_layers = oxcf->ts_number_layers;
+
+  // Layer contexts are set up for multi-temporal-layer CBR, or for any
+  // multi-layer (temporal or spatial) configuration when pass is not 1.
+  if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
+      ((cpi->svc.number_temporal_layers > 1 ||
+        cpi->svc.number_spatial_layers > 1) &&
+       cpi->oxcf.pass != 1)) {
+    vp9_init_layer_context(cpi);
+  }
+
+  // change includes all joint functionality
+  vp9_change_config(cpi, oxcf);
+
+  cpi->static_mb_pct = 0;
+  cpi->ref_frame_flags = 0;
+
+  init_buffer_indices(cpi);
+
+  vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+}
+
+// Scales the *_ms buffer levels by target_bandwidth / 1000 (0 => bandwidth/8).
+static void set_rc_buffer_sizes(RATE_CONTROL *rc,
+                                const VP9EncoderConfig *oxcf) {
+  const int64_t bw = oxcf->target_bandwidth;
+  const int64_t start_ms = oxcf->starting_buffer_level_ms;
+  const int64_t opt_ms = oxcf->optimal_buffer_level_ms;
+  const int64_t max_ms = oxcf->maximum_buffer_size_ms;
+  const int64_t fallback = bw / 8;
+
+  rc->starting_buffer_level = start_ms * bw / 1000;
+  rc->optimal_buffer_level = opt_ms ? opt_ms * bw / 1000 : fallback;
+  rc->maximum_buffer_size = max_ms ? max_ms * bw / 1000 : fallback;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
+    cpi->fn_ptr[BT].sdf = SDF; /* fills cpi->fn_ptr[BT]; needs cpi in scope */ \
+    cpi->fn_ptr[BT].sdaf = SDAF; \
+    cpi->fn_ptr[BT].vf = VF; \
+    cpi->fn_ptr[BT].svf = SVF; \
+    cpi->fn_ptr[BT].svaf = SVAF; \
+    cpi->fn_ptr[BT].sdx3f = SDX3F; \
+    cpi->fn_ptr[BT].sdx8f = SDX8F; \
+    cpi->fn_ptr[BT].sdx4df = SDX4DF; /* bare statements: brace if/else bodies */
+
+// Defines fnname##_bits8, fnname##_bits10 and fnname##_bits12. Each forwards
+// its arguments to fnname; the result is returned unshifted, >> 2 and >> 4.
+#define MAKE_BFP_SAD_WRAPPER(fnname) \
+/* 8-bit variant: result passed through. */ \
+static unsigned int fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+                                   const uint8_t *ref_ptr, int ref_stride) { \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+} \
+/* 10-bit variant: result shifted right by 2. */ \
+static unsigned int fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+                                    const uint8_t *ref_ptr, int ref_stride) { \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+} \
+/* 12-bit variant: result shifted right by 4. */ \
+static unsigned int fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+                                    const uint8_t *ref_ptr, int ref_stride) { \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+}
+
+
+// Defines fnname##_bits8, fnname##_bits10 and fnname##_bits12 for functions
+// that also take a second_pred pointer. Each forwards its arguments to
+// fnname; the result is returned unshifted, >> 2 and >> 4 respectively.
+#define MAKE_BFP_SADAVG_WRAPPER(fnname) \
+/* 8-bit variant: result passed through. */ \
+static unsigned int fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+                                   const uint8_t *ref_ptr, int ref_stride, \
+                                   const uint8_t *second_pred) { \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+} \
+/* 10-bit variant: result shifted right by 2. */ \
+static unsigned int fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+                                    const uint8_t *ref_ptr, int ref_stride, \
+                                    const uint8_t *second_pred) { \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+                second_pred) >> 2; \
+} \
+/* 12-bit variant: result shifted right by 4. */ \
+static unsigned int fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+                                    const uint8_t *ref_ptr, int ref_stride, \
+                                    const uint8_t *second_pred) { \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+                second_pred) >> 4; \
+}
+
+// Defines fnname##_bits8, fnname##_bits10 and fnname##_bits12. Each calls
+// fnname to fill sad_array; the 10- and 12-bit variants then shift the
+// first 3 entries of sad_array right by 2 and 4 bits respectively.
+#define MAKE_BFP_SAD3_WRAPPER(fnname) \
+/* 8-bit variant: sad_array left as fnname wrote it. */ \
+static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+                           const uint8_t *ref_ptr, int  ref_stride, \
+                           unsigned int *sad_array) { \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+/* 10-bit variant. */ \
+static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+                            const uint8_t *ref_ptr, int  ref_stride, \
+                            unsigned int *sad_array) { \
+  int k = 3; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  while (k-- > 0) \
+    sad_array[k] >>= 2; \
+} \
+/* 12-bit variant. */ \
+static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+                            const uint8_t *ref_ptr, int  ref_stride, \
+                            unsigned int *sad_array) { \
+  int k = 3; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  while (k-- > 0) \
+    sad_array[k] >>= 4; \
+}
+
+// Defines fnname##_bits8, fnname##_bits10 and fnname##_bits12. Each calls
+// fnname to fill sad_array; the 10- and 12-bit variants then shift the
+// first 8 entries of sad_array right by 2 and 4 bits respectively.
+#define MAKE_BFP_SAD8_WRAPPER(fnname) \
+/* 8-bit variant: sad_array left as fnname wrote it. */ \
+static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+                           const uint8_t *ref_ptr, int  ref_stride, \
+                           unsigned int *sad_array) { \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+/* 10-bit variant. */ \
+static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+                            const uint8_t *ref_ptr, int  ref_stride, \
+                            unsigned int *sad_array) { \
+  int k = 8; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  while (k-- > 0) \
+    sad_array[k] >>= 2; \
+} \
+/* 12-bit variant. */ \
+static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+                            const uint8_t *ref_ptr, int  ref_stride, \
+                            unsigned int *sad_array) { \
+  int k = 8; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  while (k-- > 0) \
+    sad_array[k] >>= 4; \
+}
+// Emits fnname##_bits8, fnname##_bits10 and fnname##_bits12 around fnname,
+// which takes an array of reference pointers instead of a single one.
+// The 8-bit wrapper only forwards; the 10- and 12-bit wrappers then shift
+// the first 4 entries of sad_array right by 2 and 4 bits respectively.
+#define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+                           const uint8_t *const ref_ptr[], int ref_stride, \
+                           unsigned int *sad_array) { \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+} \
+static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+                            const uint8_t *const ref_ptr[], int ref_stride, \
+                            unsigned int *sad_array) { \
+  int k; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (k = 0; k < 4; ++k) { \
+    sad_array[k] >>= 2; \
+  } \
+} \
+static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+                            const uint8_t *const ref_ptr[], int ref_stride, \
+                            unsigned int *sad_array) { \
+  int k; \
+  fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+  for (k = 0; k < 4; ++k) { \
+    sad_array[k] >>= 4; \
+  } \
+}
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16)  // 32x16: SAD, avg, x4d
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg)  // each line defines the _bits8, _bits10 and _bits12 wrappers
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32)  // 16x32: SAD, avg, x4d
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32)  // 64x32: SAD, avg, x4d
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64)  // 32x64: SAD, avg, x4d
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32)  // 32x32: SAD, avg, x3, x8, x4d
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64)  // 64x64: SAD, avg, x3, x8, x4d
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16)  // 16x16: SAD, avg, x3, x8, x4d
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8)  // 16x8: SAD, avg, x3, x8, x4d
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16)  // 8x16: SAD, avg, x3, x8, x4d
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8)  // 8x8: SAD, avg, x3, x8, x4d
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4)  // 8x4: SAD, avg, x8, x4d (no x3)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8)  // 4x8: SAD, avg, x8, x4d (no x3)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4)  // 4x4: SAD, avg, x3, x8, x4d
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)
+
+static void  highbd_set_var_fns(VP9_COMP *const cpi) {  // pick SAD/variance fns by bit depth
+  VP9_COMMON *const cm = &cpi->common;
+  if (cm->use_highbitdepth) {  // no-op when use_highbitdepth is 0
+    switch (cm->bit_depth) {  // each case hands HIGHBD_BFP one entry per block size
+      case VPX_BITS_8:  // _bits8 SAD wrappers, vpx_highbd_8_ variance functions
+        HIGHBD_BFP(BLOCK_32X16,
+                   vpx_highbd_sad32x16_bits8,
+                   vpx_highbd_sad32x16_avg_bits8,
+                   vpx_highbd_8_variance32x16,
+                   vpx_highbd_8_sub_pixel_variance32x16,
+                   vpx_highbd_8_sub_pixel_avg_variance32x16,
+                   NULL,  // no x3 wrapper is generated for 32x16
+                   NULL,  // no x8 wrapper is generated for 32x16
+                   vpx_highbd_sad32x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vpx_highbd_sad16x32_bits8,
+                   vpx_highbd_sad16x32_avg_bits8,
+                   vpx_highbd_8_variance16x32,
+                   vpx_highbd_8_sub_pixel_variance16x32,
+                   vpx_highbd_8_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad16x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vpx_highbd_sad64x32_bits8,
+                   vpx_highbd_sad64x32_avg_bits8,
+                   vpx_highbd_8_variance64x32,
+                   vpx_highbd_8_sub_pixel_variance64x32,
+                   vpx_highbd_8_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad64x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vpx_highbd_sad32x64_bits8,
+                   vpx_highbd_sad32x64_avg_bits8,
+                   vpx_highbd_8_variance32x64,
+                   vpx_highbd_8_sub_pixel_variance32x64,
+                   vpx_highbd_8_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x64x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vpx_highbd_sad32x32_bits8,
+                   vpx_highbd_sad32x32_avg_bits8,
+                   vpx_highbd_8_variance32x32,
+                   vpx_highbd_8_sub_pixel_variance32x32,
+                   vpx_highbd_8_sub_pixel_avg_variance32x32,
+                   vpx_highbd_sad32x32x3_bits8,
+                   vpx_highbd_sad32x32x8_bits8,
+                   vpx_highbd_sad32x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vpx_highbd_sad64x64_bits8,
+                   vpx_highbd_sad64x64_avg_bits8,
+                   vpx_highbd_8_variance64x64,
+                   vpx_highbd_8_sub_pixel_variance64x64,
+                   vpx_highbd_8_sub_pixel_avg_variance64x64,
+                   vpx_highbd_sad64x64x3_bits8,
+                   vpx_highbd_sad64x64x8_bits8,
+                   vpx_highbd_sad64x64x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vpx_highbd_sad16x16_bits8,
+                   vpx_highbd_sad16x16_avg_bits8,
+                   vpx_highbd_8_variance16x16,
+                   vpx_highbd_8_sub_pixel_variance16x16,
+                   vpx_highbd_8_sub_pixel_avg_variance16x16,
+                   vpx_highbd_sad16x16x3_bits8,
+                   vpx_highbd_sad16x16x8_bits8,
+                   vpx_highbd_sad16x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vpx_highbd_sad16x8_bits8,
+                   vpx_highbd_sad16x8_avg_bits8,
+                   vpx_highbd_8_variance16x8,
+                   vpx_highbd_8_sub_pixel_variance16x8,
+                   vpx_highbd_8_sub_pixel_avg_variance16x8,
+                   vpx_highbd_sad16x8x3_bits8,
+                   vpx_highbd_sad16x8x8_bits8,
+                   vpx_highbd_sad16x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vpx_highbd_sad8x16_bits8,
+                   vpx_highbd_sad8x16_avg_bits8,
+                   vpx_highbd_8_variance8x16,
+                   vpx_highbd_8_sub_pixel_variance8x16,
+                   vpx_highbd_8_sub_pixel_avg_variance8x16,
+                   vpx_highbd_sad8x16x3_bits8,
+                   vpx_highbd_sad8x16x8_bits8,
+                   vpx_highbd_sad8x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vpx_highbd_sad8x8_bits8,
+                   vpx_highbd_sad8x8_avg_bits8,
+                   vpx_highbd_8_variance8x8,
+                   vpx_highbd_8_sub_pixel_variance8x8,
+                   vpx_highbd_8_sub_pixel_avg_variance8x8,
+                   vpx_highbd_sad8x8x3_bits8,
+                   vpx_highbd_sad8x8x8_bits8,
+                   vpx_highbd_sad8x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vpx_highbd_sad8x4_bits8,
+                   vpx_highbd_sad8x4_avg_bits8,
+                   vpx_highbd_8_variance8x4,
+                   vpx_highbd_8_sub_pixel_variance8x4,
+                   vpx_highbd_8_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vpx_highbd_sad8x4x8_bits8,
+                   vpx_highbd_sad8x4x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vpx_highbd_sad4x8_bits8,
+                   vpx_highbd_sad4x8_avg_bits8,
+                   vpx_highbd_8_variance4x8,
+                   vpx_highbd_8_sub_pixel_variance4x8,
+                   vpx_highbd_8_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vpx_highbd_sad4x8x8_bits8,
+                   vpx_highbd_sad4x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vpx_highbd_sad4x4_bits8,
+                   vpx_highbd_sad4x4_avg_bits8,
+                   vpx_highbd_8_variance4x4,
+                   vpx_highbd_8_sub_pixel_variance4x4,
+                   vpx_highbd_8_sub_pixel_avg_variance4x4,
+                   vpx_highbd_sad4x4x3_bits8,
+                   vpx_highbd_sad4x4x8_bits8,
+                   vpx_highbd_sad4x4x4d_bits8)
+        break;
+
+      case VPX_BITS_10:  // _bits10 SAD wrappers, vpx_highbd_10_ variance functions
+        HIGHBD_BFP(BLOCK_32X16,
+                   vpx_highbd_sad32x16_bits10,
+                   vpx_highbd_sad32x16_avg_bits10,
+                   vpx_highbd_10_variance32x16,
+                   vpx_highbd_10_sub_pixel_variance32x16,
+                   vpx_highbd_10_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vpx_highbd_sad16x32_bits10,
+                   vpx_highbd_sad16x32_avg_bits10,
+                   vpx_highbd_10_variance16x32,
+                   vpx_highbd_10_sub_pixel_variance16x32,
+                   vpx_highbd_10_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad16x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vpx_highbd_sad64x32_bits10,
+                   vpx_highbd_sad64x32_avg_bits10,
+                   vpx_highbd_10_variance64x32,
+                   vpx_highbd_10_sub_pixel_variance64x32,
+                   vpx_highbd_10_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad64x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vpx_highbd_sad32x64_bits10,
+                   vpx_highbd_sad32x64_avg_bits10,
+                   vpx_highbd_10_variance32x64,
+                   vpx_highbd_10_sub_pixel_variance32x64,
+                   vpx_highbd_10_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x64x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vpx_highbd_sad32x32_bits10,
+                   vpx_highbd_sad32x32_avg_bits10,
+                   vpx_highbd_10_variance32x32,
+                   vpx_highbd_10_sub_pixel_variance32x32,
+                   vpx_highbd_10_sub_pixel_avg_variance32x32,
+                   vpx_highbd_sad32x32x3_bits10,
+                   vpx_highbd_sad32x32x8_bits10,
+                   vpx_highbd_sad32x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vpx_highbd_sad64x64_bits10,
+                   vpx_highbd_sad64x64_avg_bits10,
+                   vpx_highbd_10_variance64x64,
+                   vpx_highbd_10_sub_pixel_variance64x64,
+                   vpx_highbd_10_sub_pixel_avg_variance64x64,
+                   vpx_highbd_sad64x64x3_bits10,
+                   vpx_highbd_sad64x64x8_bits10,
+                   vpx_highbd_sad64x64x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vpx_highbd_sad16x16_bits10,
+                   vpx_highbd_sad16x16_avg_bits10,
+                   vpx_highbd_10_variance16x16,
+                   vpx_highbd_10_sub_pixel_variance16x16,
+                   vpx_highbd_10_sub_pixel_avg_variance16x16,
+                   vpx_highbd_sad16x16x3_bits10,
+                   vpx_highbd_sad16x16x8_bits10,
+                   vpx_highbd_sad16x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vpx_highbd_sad16x8_bits10,
+                   vpx_highbd_sad16x8_avg_bits10,
+                   vpx_highbd_10_variance16x8,
+                   vpx_highbd_10_sub_pixel_variance16x8,
+                   vpx_highbd_10_sub_pixel_avg_variance16x8,
+                   vpx_highbd_sad16x8x3_bits10,
+                   vpx_highbd_sad16x8x8_bits10,
+                   vpx_highbd_sad16x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vpx_highbd_sad8x16_bits10,
+                   vpx_highbd_sad8x16_avg_bits10,
+                   vpx_highbd_10_variance8x16,
+                   vpx_highbd_10_sub_pixel_variance8x16,
+                   vpx_highbd_10_sub_pixel_avg_variance8x16,
+                   vpx_highbd_sad8x16x3_bits10,
+                   vpx_highbd_sad8x16x8_bits10,
+                   vpx_highbd_sad8x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vpx_highbd_sad8x8_bits10,
+                   vpx_highbd_sad8x8_avg_bits10,
+                   vpx_highbd_10_variance8x8,
+                   vpx_highbd_10_sub_pixel_variance8x8,
+                   vpx_highbd_10_sub_pixel_avg_variance8x8,
+                   vpx_highbd_sad8x8x3_bits10,
+                   vpx_highbd_sad8x8x8_bits10,
+                   vpx_highbd_sad8x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vpx_highbd_sad8x4_bits10,
+                   vpx_highbd_sad8x4_avg_bits10,
+                   vpx_highbd_10_variance8x4,
+                   vpx_highbd_10_sub_pixel_variance8x4,
+                   vpx_highbd_10_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vpx_highbd_sad8x4x8_bits10,
+                   vpx_highbd_sad8x4x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vpx_highbd_sad4x8_bits10,
+                   vpx_highbd_sad4x8_avg_bits10,
+                   vpx_highbd_10_variance4x8,
+                   vpx_highbd_10_sub_pixel_variance4x8,
+                   vpx_highbd_10_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vpx_highbd_sad4x8x8_bits10,
+                   vpx_highbd_sad4x8x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vpx_highbd_sad4x4_bits10,
+                   vpx_highbd_sad4x4_avg_bits10,
+                   vpx_highbd_10_variance4x4,
+                   vpx_highbd_10_sub_pixel_variance4x4,
+                   vpx_highbd_10_sub_pixel_avg_variance4x4,
+                   vpx_highbd_sad4x4x3_bits10,
+                   vpx_highbd_sad4x4x8_bits10,
+                   vpx_highbd_sad4x4x4d_bits10)
+        break;
+
+      case VPX_BITS_12:  // _bits12 SAD wrappers, vpx_highbd_12_ variance functions
+        HIGHBD_BFP(BLOCK_32X16,
+                   vpx_highbd_sad32x16_bits12,
+                   vpx_highbd_sad32x16_avg_bits12,
+                   vpx_highbd_12_variance32x16,
+                   vpx_highbd_12_sub_pixel_variance32x16,
+                   vpx_highbd_12_sub_pixel_avg_variance32x16,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X32,
+                   vpx_highbd_sad16x32_bits12,
+                   vpx_highbd_sad16x32_avg_bits12,
+                   vpx_highbd_12_variance16x32,
+                   vpx_highbd_12_sub_pixel_variance16x32,
+                   vpx_highbd_12_sub_pixel_avg_variance16x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad16x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_64X32,
+                   vpx_highbd_sad64x32_bits12,
+                   vpx_highbd_sad64x32_avg_bits12,
+                   vpx_highbd_12_variance64x32,
+                   vpx_highbd_12_sub_pixel_variance64x32,
+                   vpx_highbd_12_sub_pixel_avg_variance64x32,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad64x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_32X64,
+                   vpx_highbd_sad32x64_bits12,
+                   vpx_highbd_sad32x64_avg_bits12,
+                   vpx_highbd_12_variance32x64,
+                   vpx_highbd_12_sub_pixel_variance32x64,
+                   vpx_highbd_12_sub_pixel_avg_variance32x64,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad32x64x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_32X32,
+                   vpx_highbd_sad32x32_bits12,
+                   vpx_highbd_sad32x32_avg_bits12,
+                   vpx_highbd_12_variance32x32,
+                   vpx_highbd_12_sub_pixel_variance32x32,
+                   vpx_highbd_12_sub_pixel_avg_variance32x32,
+                   vpx_highbd_sad32x32x3_bits12,
+                   vpx_highbd_sad32x32x8_bits12,
+                   vpx_highbd_sad32x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_64X64,
+                   vpx_highbd_sad64x64_bits12,
+                   vpx_highbd_sad64x64_avg_bits12,
+                   vpx_highbd_12_variance64x64,
+                   vpx_highbd_12_sub_pixel_variance64x64,
+                   vpx_highbd_12_sub_pixel_avg_variance64x64,
+                   vpx_highbd_sad64x64x3_bits12,
+                   vpx_highbd_sad64x64x8_bits12,
+                   vpx_highbd_sad64x64x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X16,
+                   vpx_highbd_sad16x16_bits12,
+                   vpx_highbd_sad16x16_avg_bits12,
+                   vpx_highbd_12_variance16x16,
+                   vpx_highbd_12_sub_pixel_variance16x16,
+                   vpx_highbd_12_sub_pixel_avg_variance16x16,
+                   vpx_highbd_sad16x16x3_bits12,
+                   vpx_highbd_sad16x16x8_bits12,
+                   vpx_highbd_sad16x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X8,
+                   vpx_highbd_sad16x8_bits12,
+                   vpx_highbd_sad16x8_avg_bits12,
+                   vpx_highbd_12_variance16x8,
+                   vpx_highbd_12_sub_pixel_variance16x8,
+                   vpx_highbd_12_sub_pixel_avg_variance16x8,
+                   vpx_highbd_sad16x8x3_bits12,
+                   vpx_highbd_sad16x8x8_bits12,
+                   vpx_highbd_sad16x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X16,
+                   vpx_highbd_sad8x16_bits12,
+                   vpx_highbd_sad8x16_avg_bits12,
+                   vpx_highbd_12_variance8x16,
+                   vpx_highbd_12_sub_pixel_variance8x16,
+                   vpx_highbd_12_sub_pixel_avg_variance8x16,
+                   vpx_highbd_sad8x16x3_bits12,
+                   vpx_highbd_sad8x16x8_bits12,
+                   vpx_highbd_sad8x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X8,
+                   vpx_highbd_sad8x8_bits12,
+                   vpx_highbd_sad8x8_avg_bits12,
+                   vpx_highbd_12_variance8x8,
+                   vpx_highbd_12_sub_pixel_variance8x8,
+                   vpx_highbd_12_sub_pixel_avg_variance8x8,
+                   vpx_highbd_sad8x8x3_bits12,
+                   vpx_highbd_sad8x8x8_bits12,
+                   vpx_highbd_sad8x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_8X4,
+                   vpx_highbd_sad8x4_bits12,
+                   vpx_highbd_sad8x4_avg_bits12,
+                   vpx_highbd_12_variance8x4,
+                   vpx_highbd_12_sub_pixel_variance8x4,
+                   vpx_highbd_12_sub_pixel_avg_variance8x4,
+                   NULL,
+                   vpx_highbd_sad8x4x8_bits12,
+                   vpx_highbd_sad8x4x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_4X8,
+                   vpx_highbd_sad4x8_bits12,
+                   vpx_highbd_sad4x8_avg_bits12,
+                   vpx_highbd_12_variance4x8,
+                   vpx_highbd_12_sub_pixel_variance4x8,
+                   vpx_highbd_12_sub_pixel_avg_variance4x8,
+                   NULL,
+                   vpx_highbd_sad4x8x8_bits12,
+                   vpx_highbd_sad4x8x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_4X4,
+                   vpx_highbd_sad4x4_bits12,
+                   vpx_highbd_sad4x4_avg_bits12,
+                   vpx_highbd_12_variance4x4,
+                   vpx_highbd_12_sub_pixel_variance4x4,
+                   vpx_highbd_12_sub_pixel_avg_variance4x4,
+                   vpx_highbd_sad4x4x3_bits12,
+                   vpx_highbd_sad4x4x8_bits12,
+                   vpx_highbd_sad4x4x4d_bits12)
+        break;
+
+      default:  // any other bit depth is a programming error (assert-only check)
+        assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+                    "VPX_BITS_10 or VPX_BITS_12");
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static void realloc_segmentation_maps(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int mi_count = cm->mi_rows * cm->mi_cols;  // maps use 1 byte per entry
+
+  // Encoder segmentation map: drop the old one, allocate a zeroed one.
+  vpx_free(cpi->segmentation_map);
+  CHECK_MEM_ERROR(cm, cpi->segmentation_map, vpx_calloc(mi_count, 1));
+
+  // Cyclic background refresh state; release a previous instance first.
+  if (cpi->cyclic_refresh != NULL) {
+    vp9_cyclic_refresh_free(cpi->cyclic_refresh);
+  }
+  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+                  vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
+
+  // Map used to mark inactive areas.
+  vpx_free(cpi->active_map.map);
+  CHECK_MEM_ERROR(cm, cpi->active_map.map, vpx_calloc(mi_count, 1));
+
+  // Placeholder in the coding context for a copy of the last frame's
+  // segment map, used if the context is saved and restored.
+  vpx_free(cpi->coding_context.last_frame_seg_map_copy);
+  CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
+                  vpx_calloc(mi_count, 1));
+}
+
+void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int last_w = cpi->oxcf.width;
+  int last_h = cpi->oxcf.height;
+  // Applies *oxcf to the encoder; last_w/last_h above keep the old dimensions.
+  if (cm->profile != oxcf->profile)
+    cm->profile = oxcf->profile;
+  cm->bit_depth = oxcf->bit_depth;
+  cm->color_space = oxcf->color_space;
+  cm->color_range = oxcf->color_range;
+  // Assert-only check: profiles <= PROFILE_1 are 8-bit, the rest deeper.
+  if (cm->profile <= PROFILE_1)
+    assert(cm->bit_depth == VPX_BITS_8);
+  else
+    assert(cm->bit_depth > VPX_BITS_8);
+  // Struct copy: from here on cpi->oxcf holds the new settings.
+  cpi->oxcf = *oxcf;
+#if CONFIG_VP9_HIGHBITDEPTH
+  cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // pass 0 with rc_mode VPX_Q: fixed GF interval; otherwise the min/max midpoint.
+  if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) {
+    rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+  } else {
+    rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+  }
+
+  cpi->refresh_golden_frame = 0;
+  cpi->refresh_last_frame = 1;
+  cm->refresh_frame_context = 1;
+  cm->reset_frame_context = 0;
+
+  vp9_reset_segment_features(&cm->seg);
+  vp9_set_high_precision_mv(cpi, 0);
+  // Every segment starts from the configured encode_breakout value.
+  {
+    int i;
+
+    for (i = 0; i < MAX_SEGMENTS; i++)
+      cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
+  }
+  cpi->encode_breakout = cpi->oxcf.encode_breakout;
+
+  set_rc_buffer_sizes(rc, &cpi->oxcf);
+
+  // Under a configuration change, where maximum_buffer_size may change,
+  // keep buffer level clipped to the maximum allowed buffer size.
+  rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
+  rc->buffer_level = VPXMIN(rc->buffer_level, rc->maximum_buffer_size);
+
+  // Set up frame rate and related parameters rate control values.
+  vp9_new_framerate(cpi, cpi->framerate);
+
+  // Set absolute upper and lower quality limits
+  rc->worst_quality = cpi->oxcf.worst_allowed_q;
+  rc->best_quality = cpi->oxcf.best_allowed_q;
+
+  cm->interp_filter = cpi->sf.default_interp_filter;
+  // Render size: explicit values when both are positive, else the coded size.
+  if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
+    cm->render_width = cpi->oxcf.render_width;
+    cm->render_height = cpi->oxcf.render_height;
+  } else {
+    cm->render_width = cpi->oxcf.width;
+    cm->render_height = cpi->oxcf.height;
+  }
+  if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) {
+    cm->width = cpi->oxcf.width;
+    cm->height = cpi->oxcf.height;
+    cpi->external_resize = 1;  // the size change came from the new config
+  }
+  // Once initial_width is set, make sure the mi buffers fit the new frame size.
+  if (cpi->initial_width) {
+    int new_mi_size = 0;
+    vp9_set_mb_mi(cm, cm->width, cm->height);
+    new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+    if (cm->mi_alloc_size < new_mi_size) {  // too small: free and reallocate
+      vp9_free_context_buffers(cm);
+      alloc_compressor_data(cpi);
+      realloc_segmentation_maps(cpi);
+      cpi->initial_width = cpi->initial_height = 0;
+      cpi->external_resize = 0;
+    } else if (cm->mi_alloc_size == new_mi_size &&
+             (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
+        vp9_alloc_loop_filter(cm);
+    }
+  }
+
+  update_frame_size(cpi);
+
+  if ((last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) &&
+      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    vp9_cyclic_refresh_reset_resize(cpi);
+  // With multiple layers (see condition), pass target_bandwidth to the layer code.
+  if ((cpi->svc.number_temporal_layers > 1 &&
+      cpi->oxcf.rc_mode == VPX_CBR) ||
+      ((cpi->svc.number_temporal_layers > 1 ||
+        cpi->svc.number_spatial_layers > 1) &&
+       cpi->oxcf.pass != 1)) {
+    vp9_update_layer_context_change_config(cpi,
+                                           (int)cpi->oxcf.target_bandwidth);
+  }
+
+  cpi->alt_ref_source = NULL;
+  rc->is_src_frame_alt_ref = 0;
+
+#if 0
+  // Experimental RD Code
+  cpi->frame_distortion = 0;
+  cpi->last_frame_distortion = 0;
+#endif
+
+  set_tile_limits(cpi);
+
+  cpi->ext_refresh_frame_flags_pending = 0;
+  cpi->ext_refresh_frame_context_pending = 0;
+  // Re-run the bit-depth-dependent function selection for the new bit depth.
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_set_var_fns(cpi);
+#endif
+}
+
+#ifndef M_LOG2_E  /* fallback definition, used only if nothing defined it earlier */
+#define M_LOG2_E 0.693147180559945309417  /* numerically ln(2), despite the name */
+#endif
+#define log2f(x) (log (x) / (float) M_LOG2_E)  /* log(x) / ln(2): base-2 log via log() */
+
+/***********************************************************************
+ * Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts'    *
+ ***********************************************************************
+ * The following 2 functions ('cal_nmvjointsadcost' and                *
+ * 'cal_nmvsadcosts') are used to calculate cost lookup tables         *
+ * used by 'vp9_diamond_search_sad'. The C implementation of the       *
+ * function is generic, but the AVX intrinsics optimised version       *
+ * relies on the following properties of the computed tables:          *
+ * For cal_nmvjointsadcost:                                            *
+ *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]     *
+ * For cal_nmvsadcosts:                                                *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                   *
+ *         (Equal costs for both components)                           *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                  *
+ *         (Cost function is even)                                     *
+ * If these do not hold, then the AVX optimised version of the         *
+ * 'vp9_diamond_search_sad' function cannot be used as it is, in which *
+ * case you can revert to using the C function instead.                *
+ ***********************************************************************/
+
+static void cal_nmvjointsadcost(int *mvjointsadcost) {
+  /*********************************************************************
+   * Warning: Read the comments above before modifying this function   *
+   *********************************************************************/
+  int joint;
+  mvjointsadcost[0] = 600;  // entry 0 is the only one with its own cost
+  for (joint = 1; joint <= 3; ++joint)
+    mvjointsadcost[joint] = 300;  // entries 1..3 must stay equal (see above)
+}
+
+static void cal_nmvsadcosts(int *mvsadcost[2]) {
+  /*********************************************************************
+   * Warning: Read the comments above before modifying this function   *
+   *********************************************************************/
+  int *const cost0 = mvsadcost[0];
+  int *const cost1 = mvsadcost[1];
+  int mag;
+
+  cost0[0] = 0;  // index 0 costs nothing in either table
+  cost1[0] = 0;
+  for (mag = 1; mag <= MV_MAX; ++mag) {
+    const double z = 256 * (2 * (log2f(8 * mag) + .6));
+    const int cost = (int)z;  // truncate once, then mirror onto +mag and -mag
+    cost0[mag] = cost1[mag] = cost;
+    cost0[-mag] = cost1[-mag] = cost;
+  }
+}
+
+static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
+  int *const cost0 = mvsadcost[0];
+  int *const cost1 = mvsadcost[1];
+  int mag;
+  // Fills both tables symmetrically for indices -MV_MAX..MV_MAX; index 0 is 0.
+  cost0[0] = 0;
+  cost1[0] = 0;
+  for (mag = 1; mag <= MV_MAX; ++mag) {
+    const double z = 256 * (2 * (log2f(8 * mag) + .6));
+    const int cost = (int)z;  // truncate once, then mirror onto +mag and -mag
+    cost0[mag] = cost1[mag] = cost;
+    cost0[-mag] = cost1[-mag] = cost;
+  }
+}
+
+
+VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
+                                BufferPool *const pool) {
+  unsigned int i;
+  VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP));
+  VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
+
+  if (!cm)
+    return NULL;
+
+  vp9_zero(*cpi);
+
+  if (setjmp(cm->error.jmp)) {
+    cm->error.setjmp = 0;
+    vp9_remove_compressor(cpi);
+    return 0;
+  }
+
+  cm->error.setjmp = 1;
+  cm->alloc_mi = vp9_enc_alloc_mi;
+  cm->free_mi = vp9_enc_free_mi;
+  cm->setup_mi = vp9_enc_setup_mi;
+
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(cm, cm->frame_contexts,
+                  (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
+                  sizeof(*cm->frame_contexts)));
+
+  cpi->use_svc = 0;
+  cpi->resize_state = 0;
+  cpi->external_resize = 0;
+  cpi->resize_avg_qp = 0;
+  cpi->resize_buffer_underflow = 0;
+  cpi->use_skin_detection = 0;
+  cpi->common.buffer_pool = pool;
+
+  cpi->rc.high_source_sad = 0;
+
+  init_config(cpi, oxcf);
+  vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+
+  cm->current_video_frame = 0;
+  cpi->partition_search_skippable_frame = 0;
+  cpi->tile_data = NULL;
+
+  realloc_segmentation_maps(cpi);
+
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
+
+  for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
+                   sizeof(cpi->mbgraph_stats[0])); i++) {
+    CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats,
+                    vpx_calloc(cm->MBs *
+                               sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
+  }
+
+#if CONFIG_FP_MB_STATS
+  cpi->use_fp_mb_stats = 0;
+  if (cpi->use_fp_mb_stats) {
+    // a place holder used to store the first pass mb stats in the first pass
+    CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf,
+                    vpx_calloc(cm->MBs * sizeof(uint8_t), 1));
+  } else {
+    cpi->twopass.frame_mb_stats_buf = NULL;
+  }
+#endif
+
+  cpi->refresh_alt_ref_frame = 0;
+  cpi->multi_arf_last_grp_enabled = 0;
+
+  cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+  cpi->b_calculate_ssimg = 0;
+  cpi->b_calculate_blockiness = 1;
+  cpi->b_calculate_consistency = 1;
+  cpi->total_inconsistency = 0;
+  cpi->psnr.worst = 100.0;
+  cpi->worst_ssim = 100.0;
+
+  cpi->count = 0;
+  cpi->bytes = 0;
+
+  if (cpi->b_calculate_psnr) {
+    cpi->total_sq_error = 0;
+    cpi->total_samples = 0;
+
+    cpi->totalp_sq_error = 0;
+    cpi->totalp_samples = 0;
+
+    cpi->tot_recode_hits = 0;
+    cpi->summed_quality = 0;
+    cpi->summed_weights = 0;
+    cpi->summedp_quality = 0;
+    cpi->summedp_weights = 0;
+  }
+
+  if (cpi->b_calculate_ssimg) {
+    cpi->ssimg.worst= 100.0;
+  }
+  cpi->fastssim.worst = 100.0;
+
+  cpi->psnrhvs.worst = 100.0;
+
+  if (cpi->b_calculate_blockiness) {
+    cpi->total_blockiness = 0;
+    cpi->worst_blockiness = 0.0;
+  }
+
+  if (cpi->b_calculate_consistency) {
+    cpi->ssim_vars = vpx_malloc(sizeof(*cpi->ssim_vars) *
+                                4 * cpi->common.mi_rows * cpi->common.mi_cols);
+    cpi->worst_consistency = 100.0;
+  }
+
+#endif
+
+  cpi->first_time_stamp_ever = INT64_MAX;
+
+  /*********************************************************************
+   * Warning: Read the comments around 'cal_nmvjointsadcost' and       *
+   * 'cal_nmvsadcosts' before modifying how these tables are computed. *
+   *********************************************************************/
+  cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost);
+  cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
+  cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+  cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
+  cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
+  cal_nmvsadcosts(cpi->td.mb.nmvsadcost);
+
+  cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+  cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+  cpi->td.mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
+  cpi->td.mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
+  cal_nmvsadcosts_hp(cpi->td.mb.nmvsadcost_hp);
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#ifdef OUTPUT_YUV_DENOISED
+  yuv_denoised_file = fopen("denoised.yuv", "ab");
+#endif
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+  yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
+#ifdef OUTPUT_YUV_REC
+  yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+
+#if 0
+  framepsnr = fopen("framepsnr.stt", "a");
+  kf_list = fopen("kf_list.stt", "w");
+#endif
+
+  cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
+
+  if (oxcf->pass == 1) {
+    vp9_init_first_pass(cpi);
+  } else if (oxcf->pass == 2) {
+    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+
+    if (cpi->svc.number_spatial_layers > 1
+        || cpi->svc.number_temporal_layers > 1) {
+      FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf;
+      FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = {0};
+      int i;
+
+      for (i = 0; i < oxcf->ss_number_layers; ++i) {
+        FIRSTPASS_STATS *const last_packet_for_layer =
+            &stats[packets - oxcf->ss_number_layers + i];
+        const int layer_id = (int)last_packet_for_layer->spatial_layer_id;
+        const int packets_in_layer = (int)last_packet_for_layer->count + 1;
+        if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) {
+          LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer_id];
+
+          vpx_free(lc->rc_twopass_stats_in.buf);
+
+          lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz;
+          CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf,
+                          vpx_malloc(lc->rc_twopass_stats_in.sz));
+          lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf;
+          lc->twopass.stats_in = lc->twopass.stats_in_start;
+          lc->twopass.stats_in_end = lc->twopass.stats_in_start
+                                     + packets_in_layer - 1;
+          stats_copy[layer_id] = lc->rc_twopass_stats_in.buf;
+        }
+      }
+
+      for (i = 0; i < packets; ++i) {
+        const int layer_id = (int)stats[i].spatial_layer_id;
+        if (layer_id >= 0 && layer_id < oxcf->ss_number_layers
+            && stats_copy[layer_id] != NULL) {
+          *stats_copy[layer_id] = stats[i];
+          ++stats_copy[layer_id];
+        }
+      }
+
+      vp9_init_second_pass_spatial_svc(cpi);
+    } else {
+#if CONFIG_FP_MB_STATS
+      if (cpi->use_fp_mb_stats) {
+        const size_t psz = cpi->common.MBs * sizeof(uint8_t);
+        const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz);
+
+        cpi->twopass.firstpass_mb_stats.mb_stats_start =
+            oxcf->firstpass_mb_stats_in.buf;
+        cpi->twopass.firstpass_mb_stats.mb_stats_end =
+            cpi->twopass.firstpass_mb_stats.mb_stats_start +
+            (ps - 1) * cpi->common.MBs * sizeof(uint8_t);
+      }
+#endif
+
+      cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+      cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+      cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
+
+      vp9_init_second_pass(cpi);
+    }
+  }
+
+  vp9_set_speed_features_framesize_independent(cpi);
+  vp9_set_speed_features_framesize_dependent(cpi);
+
+  // Allocate memory to store variances for a frame.
+  CHECK_MEM_ERROR(cm, cpi->source_diff_var,
+                  vpx_calloc(cm->MBs, sizeof(diff)));
+  cpi->source_var_thresh = 0;
+  cpi->frames_till_next_var_check = 0;
+
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF)\
+    cpi->fn_ptr[BT].sdf            = SDF; \
+    cpi->fn_ptr[BT].sdaf           = SDAF; \
+    cpi->fn_ptr[BT].vf             = VF; \
+    cpi->fn_ptr[BT].svf            = SVF; \
+    cpi->fn_ptr[BT].svaf           = SVAF; \
+    cpi->fn_ptr[BT].sdx3f          = SDX3F; \
+    cpi->fn_ptr[BT].sdx8f          = SDX8F; \
+    cpi->fn_ptr[BT].sdx4df         = SDX4DF;
+
+  BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
+      vpx_variance32x16, vpx_sub_pixel_variance32x16,
+      vpx_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
+
+  BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
+      vpx_variance16x32, vpx_sub_pixel_variance16x32,
+      vpx_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
+
+  BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
+      vpx_variance64x32, vpx_sub_pixel_variance64x32,
+      vpx_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
+
+  BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
+      vpx_variance32x64, vpx_sub_pixel_variance32x64,
+      vpx_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
+
+  BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
+      vpx_variance32x32, vpx_sub_pixel_variance32x32,
+      vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
+      vpx_sad32x32x4d)
+
+  BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
+      vpx_variance64x64, vpx_sub_pixel_variance64x64,
+      vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
+      vpx_sad64x64x4d)
+
+  BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
+      vpx_variance16x16, vpx_sub_pixel_variance16x16,
+      vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
+      vpx_sad16x16x4d)
+
+  BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
+      vpx_variance16x8, vpx_sub_pixel_variance16x8,
+      vpx_sub_pixel_avg_variance16x8,
+      vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
+
+  BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
+      vpx_variance8x16, vpx_sub_pixel_variance8x16,
+      vpx_sub_pixel_avg_variance8x16,
+      vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
+
+  BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
+      vpx_variance8x8, vpx_sub_pixel_variance8x8,
+      vpx_sub_pixel_avg_variance8x8,
+      vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
+
+  BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
+      vpx_variance8x4, vpx_sub_pixel_variance8x4,
+      vpx_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
+
+  BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
+      vpx_variance4x8, vpx_sub_pixel_variance4x8,
+      vpx_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
+
+  BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
+      vpx_variance4x4, vpx_sub_pixel_variance4x4,
+      vpx_sub_pixel_avg_variance4x4,
+      vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_set_var_fns(cpi);
+#endif
+
+  /* vp9_init_quantizer() is first called here. Add check in
+   * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
+   * called later when needed. This will avoid unnecessary calls of
+   * vp9_init_quantizer() for every frame.
+   */
+  vp9_init_quantizer(cpi);
+
+  vp9_loop_filter_init(cm);
+
+  cm->error.setjmp = 0;
+
+  return cpi;
+}
+
+#if CONFIG_INTERNAL_STATS
+// Append the string T to the NUL-terminated buffer H, writing at most the
+// space that remains after the current contents. H must be a real char array
+// rather than a pointer, because the bound is computed with sizeof(H).
+#define SNPRINT(H, T) \
+  snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+// Same as SNPRINT, but T is a format string taking the single argument V.
+#define SNPRINT2(H, T, V) \
+  snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
+#endif  // CONFIG_INTERNAL_STATS
+
+// Destroys the encoder instance |cpi| and releases everything it owns.
+//
+// When at least one frame was processed and CONFIG_INTERNAL_STATS is set, a
+// summary line of the accumulated quality statistics is appended to
+// "opsnr.stt" (skipped for first-pass encodes, and skipped silently when the
+// file cannot be opened). Afterwards the worker threads are ended, per-thread
+// data, compressor buffers, mbgraph statistics, common state and reference
+// frame buffers are freed, and finally |cpi| itself is freed.
+//
+// cpi: encoder instance to destroy; NULL is accepted and ignored.
+void vp9_remove_compressor(VP9_COMP *cpi) {
+  VP9_COMMON *cm;
+  unsigned int i;
+  int t;
+
+  if (!cpi)
+    return;
+
+  cm = &cpi->common;
+  if (cm->current_video_frame > 0) {
+#if CONFIG_INTERNAL_STATS
+    vpx_clear_system_state();
+
+    if (cpi->oxcf.pass != 1) {
+      char headings[512] = {0};
+      char results[512] = {0};
+      // fopen() returns NULL on failure (for example an unwritable working
+      // directory); every use of |f| below is guarded accordingly.
+      FILE *f = fopen("opsnr.stt", "a");
+      double time_encoded = (cpi->last_end_time_stamp_seen
+                             - cpi->first_time_stamp_ever) / 10000000.000;
+      double total_encode_time = (cpi->time_receive_data +
+                                  cpi->time_compress_data)   / 1000.000;
+      const double dr =
+          (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
+      const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+
+      if (f != NULL && cpi->b_calculate_psnr) {
+        const double total_psnr =
+            vpx_sse_to_psnr((double)cpi->total_samples, peak,
+                            (double)cpi->total_sq_error);
+        const double totalp_psnr =
+            vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
+                            (double)cpi->totalp_sq_error);
+        const double total_ssim = 100 * pow(cpi->summed_quality /
+                                            cpi->summed_weights, 8.0);
+        const double totalp_ssim = 100 * pow(cpi->summedp_quality /
+                                             cpi->summedp_weights, 8.0);
+
+        // Column headings and the matching result row are built in parallel;
+        // optional metrics append one heading pair and two values each.
+        snprintf(headings, sizeof(headings),
+                 "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+                 "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+                 "WstPsnr\tWstSsim\tWstFast\tWstHVS");
+        snprintf(results, sizeof(results),
+                 "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                 "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                 "%7.3f\t%7.3f\t%7.3f\t%7.3f",
+                 dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr,
+                 cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr,
+                 total_ssim, totalp_ssim,
+                 cpi->fastssim.stat[ALL] / cpi->count,
+                 cpi->psnrhvs.stat[ALL] / cpi->count,
+                 cpi->psnr.worst, cpi->worst_ssim, cpi->fastssim.worst,
+                 cpi->psnrhvs.worst);
+
+        if (cpi->b_calculate_blockiness) {
+          SNPRINT(headings, "\t  Block\tWstBlck");
+          SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
+          SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
+        }
+
+        if (cpi->b_calculate_consistency) {
+          double consistency =
+              vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
+                              (double)cpi->total_inconsistency);
+
+          SNPRINT(headings, "\tConsist\tWstCons");
+          SNPRINT2(results, "\t%7.3f", consistency);
+          SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
+        }
+
+        if (cpi->b_calculate_ssimg) {
+          SNPRINT(headings, "\t  SSIMG\tWtSSIMG");
+          SNPRINT2(results, "\t%7.3f", cpi->ssimg.stat[ALL] / cpi->count);
+          SNPRINT2(results, "\t%7.3f", cpi->ssimg.worst);
+        }
+
+        fprintf(f, "%s\t    Time\n", headings);
+        fprintf(f, "%s\t%8.0f\n", results, total_encode_time);
+      }
+
+      if (f != NULL)
+        fclose(f);
+    }
+
+#endif
+
+#if 0
+    {
+      printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+      printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
+      printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame,
+             cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000,
+             cpi->time_compress_data / 1000,
+             (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+    }
+#endif
+  }
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  vp9_denoiser_free(&(cpi->denoiser));
+#endif
+
+  for (t = 0; t < cpi->num_workers; ++t) {
+    VPxWorker *const worker = &cpi->workers[t];
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+    // Deallocate allocated threads.
+    vpx_get_worker_interface()->end(worker);
+
+    // Deallocate allocated thread data. The last slot is deliberately left
+    // alone here.
+    // NOTE(review): presumably its |td| aliases the encoder's own thread
+    // data, which is released elsewhere -- confirm against the worker setup.
+    if (t < cpi->num_workers - 1) {
+      vpx_free(thread_data->td->counts);
+      vp9_free_pc_tree(thread_data->td);
+      vpx_free(thread_data->td);
+    }
+  }
+  vpx_free(cpi->tile_thr_data);
+  vpx_free(cpi->workers);
+
+  if (cpi->num_workers > 1)
+    vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+
+  dealloc_compressor_data(cpi);
+
+  for (i = 0; i < sizeof(cpi->mbgraph_stats) /
+                  sizeof(cpi->mbgraph_stats[0]); ++i) {
+    vpx_free(cpi->mbgraph_stats[i].mb_stats);
+  }
+
+#if CONFIG_FP_MB_STATS
+  if (cpi->use_fp_mb_stats) {
+    vpx_free(cpi->twopass.frame_mb_stats_buf);
+    cpi->twopass.frame_mb_stats_buf = NULL;
+  }
+#endif
+
+  // |cm| lives inside |cpi|, so everything reached through it has to be
+  // released before |cpi| is freed below.
+  vp9_remove_common(cm);
+  vp9_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_VP9_POSTPROC
+  vp9_free_postproc_buffers(cm);
+#endif
+  vpx_free(cpi);
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#ifdef OUTPUT_YUV_DENOISED
+  fclose(yuv_denoised_file);
+#endif
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+  fclose(yuv_skinmap_file);
+#endif
+#ifdef OUTPUT_YUV_REC
+  fclose(yuv_rec_file);
+#endif
+
+#if 0
+
+  if (keyfile)
+    fclose(keyfile);
+
+  if (framepsnr)
+    fclose(framepsnr);
+
+  if (kf_list)
+    fclose(kf_list);
+
+#endif
+}
+
+/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
+ * and highbd_8_variance(). It should not.
+ */
+/* Plain C reference: over a w x h window of 8-bit samples, stores the sum of
+ * the differences (a - b) in *sum and the sum of their squares in *sse.
+ * Both outputs are reset to zero before accumulation starts.
+ */
+static void encoder_variance(const uint8_t *a, int  a_stride,
+                             const uint8_t *b, int  b_stride,
+                             int  w, int  h, unsigned int *sse, int *sum) {
+  const uint8_t *row_a = a;
+  const uint8_t *row_b = b;
+  int row;
+
+  *sum = 0;
+  *sse = 0;
+
+  /* Walk both images one row at a time, stepping by each image's stride. */
+  for (row = 0; row < h; ++row, row_a += a_stride, row_b += b_stride) {
+    int col;
+
+    for (col = 0; col < w; ++col) {
+      const int delta = row_a[col] - row_b[col];
+      *sum += delta;
+      *sse += delta * delta;
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// High-bitdepth counterpart of encoder_variance(): |a8| and |b8| are turned
+// into 16-bit sample pointers with CONVERT_TO_SHORTPTR, and the difference sum
+// and squared-difference sum over the w x h window are accumulated into the
+// 64-bit outputs *sum and *sse (both cleared first).
+static void encoder_highbd_variance64(const uint8_t *a8, int  a_stride,
+                                      const uint8_t *b8, int  b_stride,
+                                      int w, int h, uint64_t *sse,
+                                      uint64_t *sum) {
+  uint16_t *row_a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *row_b = CONVERT_TO_SHORTPTR(b8);
+  int row;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (row = 0; row < h; ++row, row_a += a_stride, row_b += b_stride) {
+    int col;
+
+    for (col = 0; col < w; ++col) {
+      const int delta = row_a[col] - row_b[col];
+      *sum += delta;
+      *sse += delta * delta;
+    }
+  }
+}
+
+// Wrapper around encoder_highbd_variance64() that narrows its 64-bit results
+// to the unsigned int / int outputs used by the 8-bit style interface.
+static void encoder_highbd_8_variance(const uint8_t *a8, int  a_stride,
+                                      const uint8_t *b8, int  b_stride,
+                                      int w, int h,
+                                      unsigned int *sse, int *sum) {
+  uint64_t wide_sse = 0;
+  uint64_t wide_sum = 0;
+
+  encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
+                            &wide_sse, &wide_sum);
+
+  *sum = (int)wide_sum;
+  *sse = (unsigned int)wide_sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Returns the sum of squared differences between two 8-bit planes of
+// width x height samples. Whole 16x16 tiles go through vpx_mse16x16(); the
+// right-hand strip (width % 16 columns, full height) and the bottom strip
+// (height % 16 rows, excluding the right-hand strip) go through
+// encoder_variance().
+static int64_t get_sse(const uint8_t *a, int a_stride,
+                       const uint8_t *b, int b_stride,
+                       int width, int height) {
+  const int rem_cols = width % 16;
+  const int rem_rows = height % 16;
+  const int full_cols = width - rem_cols;
+  const int full_rows = height - rem_rows;
+  const int tiles_wide = width / 16;
+  const int tiles_high = height / 16;
+  int64_t accum = 0;
+  unsigned int part = 0;
+  int ignored_sum = 0;  // encoder_variance() requires it; value is unused.
+  int tx, ty;
+
+  // Right-hand strip that does not fill a 16-wide tile.
+  if (rem_cols > 0) {
+    encoder_variance(&a[full_cols], a_stride, &b[full_cols], b_stride,
+                     rem_cols, height, &part, &ignored_sum);
+    accum += part;
+  }
+
+  // Bottom strip that does not fill a 16-high tile.
+  if (rem_rows > 0) {
+    encoder_variance(&a[full_rows * a_stride], a_stride,
+                     &b[full_rows * b_stride], b_stride,
+                     full_cols, rem_rows, &part, &ignored_sum);
+    accum += part;
+  }
+
+  // All complete 16x16 tiles, one tile row at a time.
+  for (ty = 0; ty < tiles_high;
+       ++ty, a += 16 * a_stride, b += 16 * b_stride) {
+    for (tx = 0; tx < tiles_wide; ++tx) {
+      vpx_mse16x16(a + 16 * tx, a_stride, b + 16 * tx, b_stride, &part);
+      accum += part;
+    }
+  }
+
+  return accum;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Returns the sum of squared differences between two high-bitdepth planes
+// after shifting every sample right by |input_shift| bits. |a8| and |b8| are
+// converted to 16-bit sample pointers with CONVERT_TO_SHORTPTR.
+static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+                                    const uint8_t *b8, int b_stride,
+                                    int width, int height,
+                                    unsigned int input_shift) {
+  const uint16_t *row_a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *row_b = CONVERT_TO_SHORTPTR(b8);
+  int64_t accum = 0;
+  int row;
+
+  for (row = 0; row < height; ++row, row_a += a_stride, row_b += b_stride) {
+    int col;
+
+    for (col = 0; col < width; ++col) {
+      const int64_t delta =
+          (row_a[col] >> input_shift) - (row_b[col] >> input_shift);
+      accum += delta * delta;
+    }
+  }
+
+  return accum;
+}
+
+// High-bitdepth version of get_sse(): whole 16x16 tiles are measured with
+// vpx_highbd_8_mse16x16(), while the right-hand strip (width % 16 columns)
+// and the bottom strip (height % 16 rows) use encoder_highbd_8_variance().
+// |a| and |b| are forwarded to those helpers unchanged.
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
+                              const uint8_t *b, int b_stride,
+                              int width, int height) {
+  const int rem_cols = width % 16;
+  const int rem_rows = height % 16;
+  const int full_cols = width - rem_cols;
+  const int full_rows = height - rem_rows;
+  const int tiles_wide = width / 16;
+  const int tiles_high = height / 16;
+  int64_t accum = 0;
+  unsigned int part = 0;
+  int ignored_sum = 0;  // Required by the helper; value is unused.
+  int tx, ty;
+
+  // Right-hand strip that does not fill a 16-wide tile.
+  if (rem_cols > 0) {
+    encoder_highbd_8_variance(&a[full_cols], a_stride,
+                              &b[full_cols], b_stride,
+                              rem_cols, height, &part, &ignored_sum);
+    accum += part;
+  }
+
+  // Bottom strip that does not fill a 16-high tile.
+  if (rem_rows > 0) {
+    encoder_highbd_8_variance(&a[full_rows * a_stride], a_stride,
+                              &b[full_rows * b_stride], b_stride,
+                              full_cols, rem_rows, &part, &ignored_sum);
+    accum += part;
+  }
+
+  // All complete 16x16 tiles, one tile row at a time.
+  for (ty = 0; ty < tiles_high;
+       ++ty, a += 16 * a_stride, b += 16 * b_stride) {
+    for (tx = 0; tx < tiles_wide; ++tx) {
+      vpx_highbd_8_mse16x16(a + 16 * tx, a_stride, b + 16 * tx, b_stride,
+                            &part);
+      accum += part;
+    }
+  }
+
+  return accum;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Per-frame PSNR results. In every array, index 0 holds the combined value
+// and indices 1..3 hold the Y, U and V planes respectively.
+typedef struct {
+  double psnr[4];       // total/y/u/v
+  uint64_t sse[4];      // total/y/u/v
+  uint32_t samples[4];  // total/y/u/v
+} PSNR_STATS;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Fills |psnr| with per-plane (slots 1..3) and combined (slot 0) SSE, sample
+// count and PSNR for frames |a| and |b|, using the crop dimensions of |a|.
+//
+// The peak value is (1 << in_bit_depth) - 1. When |a| is flagged as
+// high-bitdepth, samples are compared after a right shift of
+// bit_depth - in_bit_depth bits if that shift is non-zero, and directly
+// otherwise; frames without the flag use the 8-bit get_sse().
+static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b,
+                             PSNR_STATS *psnr,
+                             unsigned int bit_depth,
+                             unsigned int in_bit_depth) {
+  const uint8_t *buf_a[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
+  const uint8_t *buf_b[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
+  const int stride_a[3] = {a->y_stride, a->uv_stride, a->uv_stride};
+  const int stride_b[3] = {b->y_stride, b->uv_stride, b->uv_stride};
+  const int plane_w[3] =
+      {a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
+  const int plane_h[3] =
+      {a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
+  const double peak = (double)((1 << in_bit_depth) - 1);
+  const unsigned int input_shift = bit_depth - in_bit_depth;
+  uint64_t sse_sum = 0;
+  uint32_t sample_sum = 0;
+  int plane;
+
+  for (plane = 0; plane < 3; ++plane) {
+    const int slot = plane + 1;  // Slot 0 is reserved for the totals.
+    const int w = plane_w[plane];
+    const int h = plane_h[plane];
+    const uint32_t samples = w * h;
+    uint64_t sse;
+
+    if (!(a->flags & YV12_FLAG_HIGHBITDEPTH)) {
+      sse = get_sse(buf_a[plane], stride_a[plane],
+                    buf_b[plane], stride_b[plane], w, h);
+    } else if (input_shift) {
+      sse = highbd_get_sse_shift(buf_a[plane], stride_a[plane],
+                                 buf_b[plane], stride_b[plane], w, h,
+                                 input_shift);
+    } else {
+      sse = highbd_get_sse(buf_a[plane], stride_a[plane],
+                           buf_b[plane], stride_b[plane], w, h);
+    }
+
+    sse_sum += sse;
+    sample_sum += samples;
+
+    psnr->samples[slot] = samples;
+    psnr->sse[slot] = sse;
+    psnr->psnr[slot] = vpx_sse_to_psnr(samples, peak, (double)sse);
+  }
+
+  psnr->samples[0] = sample_sum;
+  psnr->sse[0] = sse_sum;
+  psnr->psnr[0] = vpx_sse_to_psnr((double)sample_sum, peak,
+                                  (double)sse_sum);
+}
+
+#else  // !CONFIG_VP9_HIGHBITDEPTH
+
+// Fills |psnr| with per-plane (slots 1..3) and combined (slot 0) SSE, sample
+// count and PSNR for the 8-bit frames |a| and |b|. Plane sizes are the crop
+// dimensions of |a|; the peak sample value is fixed at 255.
+static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+                      PSNR_STATS *psnr) {
+  static const double peak = 255.0;
+  const uint8_t *buf_a[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
+  const uint8_t *buf_b[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
+  const int stride_a[3] = {a->y_stride, a->uv_stride, a->uv_stride};
+  const int stride_b[3] = {b->y_stride, b->uv_stride, b->uv_stride};
+  const int plane_w[3] =
+      {a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
+  const int plane_h[3] =
+      {a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
+  uint64_t sse_sum = 0;
+  uint32_t sample_sum = 0;
+  int plane;
+
+  for (plane = 0; plane < 3; ++plane) {
+    const int slot = plane + 1;  // Slot 0 is reserved for the totals.
+    const int w = plane_w[plane];
+    const int h = plane_h[plane];
+    const uint32_t samples = w * h;
+    const uint64_t sse = get_sse(buf_a[plane], stride_a[plane],
+                                 buf_b[plane], stride_b[plane],
+                                 w, h);
+
+    sse_sum += sse;
+    sample_sum += samples;
+
+    psnr->samples[slot] = samples;
+    psnr->sse[slot] = sse;
+    psnr->psnr[slot] = vpx_sse_to_psnr(samples, peak, (double)sse);
+  }
+
+  psnr->samples[0] = sample_sum;
+  psnr->sse[0] = sse_sum;
+  psnr->psnr[0] = vpx_sse_to_psnr((double)sample_sum, peak,
+                                  (double)sse_sum);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Computes PSNR between cpi->Source and the frame about to be shown and
+// publishes it as a VPX_CODEC_PSNR_PKT. With SVC enabled the PSNR payload is
+// stored in the layer context indexed by
+// spatial_layer_id * number_temporal_layers; otherwise the packet is added
+// to cpi->output_pkt_list.
+static void generate_psnr_packet(VP9_COMP *cpi) {
+  PSNR_STATS stats;
+  struct vpx_codec_cx_pkt packet;
+  int k;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  calc_highbd_psnr(cpi->Source, cpi->common.frame_to_show, &stats,
+                   cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+#else
+  calc_psnr(cpi->Source, cpi->common.frame_to_show, &stats);
+#endif
+
+  packet.kind = VPX_CODEC_PSNR_PKT;
+  for (k = 0; k < 4; ++k) {
+    packet.data.psnr.psnr[k] = stats.psnr[k];
+    packet.data.psnr.sse[k] = stats.sse[k];
+    packet.data.psnr.samples[k] = stats.samples[k];
+  }
+
+  if (!cpi->use_svc) {
+    vpx_codec_pkt_list_add(cpi->output_pkt_list, &packet);
+    return;
+  }
+
+  cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+      cpi->svc.number_temporal_layers].psnr_pkt = packet.data.psnr;
+}
+
+// Stores |ref_frame_flags| in cpi->ref_frame_flags.
+//
+// Returns 0 on success, or -1 (leaving |cpi| untouched) when the value lies
+// outside 0..7.
+// NOTE(review): the value is presumably a bitwise OR of the VP9_*_FLAG
+// reference flags -- confirm against the callers.
+int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) {
+  // Negative values used to pass the upper-bound-only check and were stored
+  // as-is; reject them together with anything above 7.
+  if (ref_frame_flags < 0 || ref_frame_flags > 7)
+    return -1;
+
+  cpi->ref_frame_flags = ref_frame_flags;
+  return 0;
+}
+
+// Records which reference buffers (last / golden / alt-ref) were requested
+// through |ref_frame_flags| in the ext_refresh_* fields and marks the request
+// as pending.
+void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) {
+  const int want_last = (ref_frame_flags & VP9_LAST_FLAG) ? 1 : 0;
+  const int want_golden = (ref_frame_flags & VP9_GOLD_FLAG) ? 1 : 0;
+  const int want_alt_ref = (ref_frame_flags & VP9_ALT_FLAG) ? 1 : 0;
+
+  cpi->ext_refresh_last_frame = want_last;
+  cpi->ext_refresh_golden_frame = want_golden;
+  cpi->ext_refresh_alt_ref_frame = want_alt_ref;
+  cpi->ext_refresh_frame_flags_pending = 1;
+}
+
+// Maps exactly one of VP9_LAST_FLAG / VP9_GOLD_FLAG / VP9_ALT_FLAG to the
+// matching reference frame buffer. Any other value yields NULL.
+static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer(VP9_COMP *cpi,
+                                VP9_REFFRAME ref_frame_flag) {
+  if (ref_frame_flag == VP9_LAST_FLAG)
+    return get_ref_frame_buffer(cpi, LAST_FRAME);
+
+  if (ref_frame_flag == VP9_GOLD_FLAG)
+    return get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+
+  if (ref_frame_flag == VP9_ALT_FLAG)
+    return get_ref_frame_buffer(cpi, ALTREF_FRAME);
+
+  return NULL;
+}
+
+// Copies the reference frame selected by |ref_frame_flag| into |sd|.
+// Returns 0 on success, -1 when the flag does not select a reference buffer.
+int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                           YV12_BUFFER_CONFIG *sd) {
+  YV12_BUFFER_CONFIG *const ref_buf =
+      get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
+
+  if (!ref_buf)
+    return -1;
+
+  vp8_yv12_copy_frame(ref_buf, sd);
+  return 0;
+}
+
+// Overwrites the reference frame selected by |ref_frame_flag| with |sd|.
+// Returns 0 on success, -1 when the flag does not select a reference buffer.
+int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                          YV12_BUFFER_CONFIG *sd) {
+  YV12_BUFFER_CONFIG *const ref_buf =
+      get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
+
+  if (!ref_buf)
+    return -1;
+
+  vp8_yv12_copy_frame(sd, ref_buf);
+  return 0;
+}
+
+// Stores the externally requested frame-context refresh setting |update| and
+// flags it as pending. Always returns 0.
+int vp9_update_entropy(VP9_COMP *cpi, int update) {
+  cpi->ext_refresh_frame_context_pending = 1;
+  cpi->ext_refresh_frame_context = update;
+
+  return 0;
+}
+
+#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
+// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
+// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
+// not denoise the UV channels at this time. If ever we implement UV channel
+// denoising we will have to modify this.
+//
+// Writes the Y, U and V planes of |s| to |f| in that order, one row at a time:
+// y_width bytes for each of y_height luma rows, then uv_width bytes for each
+// of uv_height rows of U and of V. A plane whose height is zero or negative
+// contributes no output.
+void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
+  uint8_t *const planes[3] = {s->y_buffer, s->u_buffer, s->v_buffer};
+  const int strides[3] = {s->y_stride, s->uv_stride, s->uv_stride};
+  const int widths[3] = {s->y_width, s->uv_width, s->uv_width};
+  const int heights[3] = {s->y_height, s->uv_height, s->uv_height};
+  int plane;
+
+  for (plane = 0; plane < 3; ++plane) {
+    const uint8_t *row = planes[plane];
+    int rows_left;
+
+    // A counted loop rather than do/while(--h): the latter always ran once
+    // and kept running through negative counts when the height was 0.
+    for (rows_left = heights[plane]; rows_left > 0; --rows_left) {
+      fwrite(row, widths[plane], 1, f);
+      row += strides[plane];
+    }
+  }
+}
+#endif
+
+#ifdef OUTPUT_YUV_REC
+// Writes the frame in cm->frame_to_show to yuv_rec_file as Y, U, V planes,
+// row by row, and flushes the file afterwards.
+//
+// The luma plane is written as cm->height rows of y_width samples; each
+// chroma plane as uv_height rows of uv_width samples. Frames flagged
+// YV12_FLAG_HIGHBITDEPTH are written as 2-byte samples, all others as single
+// bytes. A plane whose row count is zero or negative contributes no output.
+void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
+  YV12_BUFFER_CONFIG *s = cm->frame_to_show;
+  uint8_t *const planes[3] = {s->y_buffer, s->u_buffer, s->v_buffer};
+  const int strides[3] = {s->y_stride, s->uv_stride, s->uv_stride};
+  const int widths[3] = {s->y_width, s->uv_width, s->uv_width};
+  // Luma rows follow the frame height held in |cm|; chroma rows come from |s|.
+  const int heights[3] = {cm->height, s->uv_height, s->uv_height};
+  int plane;
+  int rows_left;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+    for (plane = 0; plane < 3; ++plane) {
+      uint16_t *row16 = CONVERT_TO_SHORTPTR(planes[plane]);
+
+      // Counted loop: do/while(--h) always ran once and kept running through
+      // negative counts when the height was 0.
+      for (rows_left = heights[plane]; rows_left > 0; --rows_left) {
+        fwrite(row16, widths[plane], 2, yuv_rec_file);
+        row16 += strides[plane];
+      }
+    }
+
+    fflush(yuv_rec_file);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  for (plane = 0; plane < 3; ++plane) {
+    const uint8_t *row = planes[plane];
+
+    // Counted loop for the same zero-height reason as above.
+    for (rows_left = heights[plane]; rows_left > 0; --rows_left) {
+      fwrite(row, widths[plane], 1, yuv_rec_file);
+      row += strides[plane];
+    }
+  }
+
+  fflush(yuv_rec_file);
+}
+#endif
+
+// Resizes each plane of |src| into the matching plane of |dst| (crop
+// dimensions on both sides) with vp9_resize_plane(), or with
+// vp9_highbd_resize_plane() when |src| is flagged high-bitdepth, and then
+// extends the borders of |dst|.
+#if CONFIG_VP9_HIGHBITDEPTH
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                                YV12_BUFFER_CONFIG *dst,
+                                                int bd) {
+#else
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                                YV12_BUFFER_CONFIG *dst) {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t
+  const uint8_t *const in_buf[3] = {src->y_buffer, src->u_buffer,
+                                    src->v_buffer};
+  const int in_stride[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+  const int in_w[3] = {src->y_crop_width, src->uv_crop_width,
+                       src->uv_crop_width};
+  const int in_h[3] = {src->y_crop_height, src->uv_crop_height,
+                       src->uv_crop_height};
+  uint8_t *const out_buf[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+  const int out_stride[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
+  const int out_w[3] = {dst->y_crop_width, dst->uv_crop_width,
+                        dst->uv_crop_width};
+  const int out_h[3] = {dst->y_crop_height, dst->uv_crop_height,
+                        dst->uv_crop_height};
+  int plane;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_highbd_resize_plane(in_buf[plane], in_h[plane], in_w[plane],
+                              in_stride[plane], out_buf[plane], out_h[plane],
+                              out_w[plane], out_stride[plane], bd);
+      continue;
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    vp9_resize_plane(in_buf[plane], in_h[plane], in_w[plane],
+                     in_stride[plane], out_buf[plane], out_h[plane],
+                     out_w[plane], out_stride[plane]);
+  }
+
+  vpx_extend_frame_borders(dst);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Rescales |src| into |dst| plane by plane with the EIGHTTAP kernel set and
+// then extends the borders of |dst|. The destination is visited on a 16x16
+// grid of luma positions; sources flagged high-bitdepth are filtered with
+// vpx_highbd_convolve8() at bit depth |bd|, all others with vpx_scaled_2d().
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst, int bd) {
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+  const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+  uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+  const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+  int x, y, i;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    // Plane 0 is handled at full resolution, the others at half resolution
+    // in both directions.
+    // NOTE(review): this assumes 2x chroma subsampling, and the i == 3 case
+    // looks unreachable since the plane arrays hold three entries -- confirm.
+    const int factor = (i == 0 || i == 3 ? 1 : 2);
+    const int src_stride = src_strides[i];
+    const int dst_stride = dst_strides[i];
+    // x and y step over luma coordinates; dividing by |factor| converts them
+    // to coordinates in the current plane.
+    for (y = 0; y < dst_h; y += 16) {
+      // Scaled source position of this tile; only its low four bits are
+      // used, to select the vertical filter from |kernel|.
+      const int y_q4 = y * (16 / factor) * src_h / dst_h;
+      for (x = 0; x < dst_w; x += 16) {
+        // Horizontal counterpart of y_q4.
+        const int x_q4 = x * (16 / factor) * src_w / dst_w;
+        const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h *
+                                   src_stride + (x / factor) * src_w / dst_w;
+        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
+        // Each call produces a (16 / factor) x (16 / factor) output tile with
+        // step sizes 16 * src_w / dst_w and 16 * src_h / dst_h.
+        if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+          vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+                               kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+                               kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+                               16 / factor, 16 / factor, bd);
+        } else {
+          vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
+                        kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+                        kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+                        16 / factor, 16 / factor);
+        }
+      }
+    }
+  }
+
+  vpx_extend_frame_borders(dst);
+}
+#else
+// Rescales |src| into |dst| plane by plane with the EIGHTTAP kernel set via
+// vpx_scaled_2d() and then extends the borders of |dst|. The destination is
+// visited on a 16x16 grid of luma positions.
+void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
+                                  YV12_BUFFER_CONFIG *dst) {
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+  const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+  uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+  const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+  int x, y, i;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    // Plane 0 is handled at full resolution, the others at half resolution
+    // in both directions.
+    // NOTE(review): this assumes 2x chroma subsampling, and the i == 3 case
+    // looks unreachable since the plane arrays hold three entries -- confirm.
+    const int factor = (i == 0 || i == 3 ? 1 : 2);
+    const int src_stride = src_strides[i];
+    const int dst_stride = dst_strides[i];
+    // x and y step over luma coordinates; dividing by |factor| converts them
+    // to coordinates in the current plane.
+    for (y = 0; y < dst_h; y += 16) {
+      // Scaled source position of this tile; only its low four bits are
+      // used, to select the vertical filter from |kernel|.
+      const int y_q4 = y * (16 / factor) * src_h / dst_h;
+      for (x = 0; x < dst_w; x += 16) {
+        // Horizontal counterpart of y_q4.
+        const int x_q4 = x * (16 / factor) * src_w / dst_w;
+        const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h *
+                                   src_stride + (x / factor) * src_w / dst_w;
+        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
+        // Each call produces a (16 / factor) x (16 / factor) output tile with
+        // step sizes 16 * src_w / dst_w and 16 * src_h / dst_h.
+        vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
+                      kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+                      kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+                      16 / factor, 16 / factor);
+      }
+    }
+  }
+
+  vpx_extend_frame_borders(dst);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static int scale_down(VP9_COMP *cpi, int q) {
+  // Returns 1 when an UNSCALED frame whose q has reached its rf-level maximum
+  // is still projected above the SCALE_STEP1 size threshold; 0 otherwise.
+  const RATE_CONTROL *const rate = &cpi->rc;
+  const GF_GROUP *const group = &cpi->twopass.gf_group;
+  int size_limit;
+  assert(frame_is_kf_gf_arf(cpi));
+
+  if (rate->frame_size_selector != UNSCALED) return 0;
+  if (q < rate->rf_level_maxq[group->rf_level[group->index]]) return 0;
+  size_limit = (int)(rate_thresh_mult[SCALE_STEP1] *
+      VPXMAX(rate->this_frame_target, rate->avg_frame_bandwidth));
+  return rate->projected_frame_size > size_limit;
+}
+
+// Decides whether the frame that was just coded has to be encoded again.
+// Returns 1 to recode (also raising cpi->resize_pending when the recode is
+// to happen at a lower resolution) and 0 to accept the current result.
+static int recode_loop_test(VP9_COMP *cpi,
+                            int high_limit, int low_limit,
+                            int q, int maxq, int minq) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const int is_kf_gf_arf = frame_is_kf_gf_arf(cpi);
+  const int frame_bits = rc->projected_frame_size;
+
+  // A recode is only on the table when the frame reached the bandwidth
+  // ceiling or the recode_loop speed feature allows it for this frame.
+  const int may_recode =
+      frame_bits >= rc->max_frame_bandwidth ||
+      cpi->sf.recode_loop == ALLOW_RECODE ||
+      (is_kf_gf_arf && cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF);
+
+  if (!may_recode) return 0;
+
+  if (is_kf_gf_arf && oxcf->resize_mode == RESIZE_DYNAMIC &&
+      scale_down(cpi, q)) {
+    // Code this group at a lower resolution.
+    cpi->resize_pending = 1;
+    return 1;
+  }
+
+  // TODO(agrange) high_limit could be greater than the scale-down threshold.
+  if (frame_bits > high_limit && q < maxq) return 1;  // overshoot
+  if (frame_bits < low_limit && q > minq) return 1;   // undershoot
+
+  // Deal with frame undershoot and whether or not we are below the
+  // automatically set cq level (CQ mode only).
+  if (cpi->oxcf.rc_mode != VPX_CQ) return 0;
+  return q > oxcf->cq_level &&
+         frame_bits < ((rc->this_frame_target * 7) >> 3);
+}
+
+void vp9_update_reference_frames(VP9_COMP *cpi) {
+  VP9_COMMON * const cm = &cpi->common;
+  BufferPool *const pool = cm->buffer_pool;
+
+  // The new frame has been encoded: every reference slot signaled for refresh
+  // is handed to ref_cnt_fb() here together with cm->new_fb_idx.
+  if (cm->frame_type == KEY_FRAME) {  // golden and alt-ref slots both updated
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+  } else if (vp9_preserve_existing_gf(cpi)) {
+    // We have decided to preserve the previously existing golden frame as our
+    // new ARF frame. However, in the short term in function
+    // vp9_bitstream.c::get_refresh_mask() we left it in the GF slot and, if
+    // we're updating the GF with the current decoded frame, we save it to the
+    // ARF slot instead.
+    // We now have to update the ARF with the current frame and swap gld_fb_idx
+    // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF
+    // slot and, if we're updating the GF, the current frame becomes the new GF.
+    int tmp;
+
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+
+    tmp = cpi->alt_fb_idx;
+    cpi->alt_fb_idx = cpi->gld_fb_idx;
+    cpi->gld_fb_idx = tmp;
+
+    if (is_two_pass_svc(cpi)) {  // mirror the swapped indices in layer 0
+      cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx;
+      cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx;
+    }
+  } else { /* For non key/golden frames */
+    if (cpi->refresh_alt_ref_frame) {
+      int arf_idx = cpi->alt_fb_idx;
+      if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
+        const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+        arf_idx = gf_group->arf_update_idx[gf_group->index];
+      }
+      // Update the chosen ARF slot; row 0 of the filter stats becomes ALTREF's.
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+      memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
+             cpi->interp_filter_selected[0],
+             sizeof(cpi->interp_filter_selected[0]));
+    }
+
+    if (cpi->refresh_golden_frame) {
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+      if (!cpi->rc.is_src_frame_alt_ref)
+        memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+               cpi->interp_filter_selected[0],
+               sizeof(cpi->interp_filter_selected[0]));
+      else  // source is the alt-ref: GOLDEN takes over ALTREF's row instead
+        memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+               cpi->interp_filter_selected[ALTREF_FRAME],
+               sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
+    }
+  }
+
+  if (cpi->refresh_last_frame) {  // applies to every frame type
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
+    if (!cpi->rc.is_src_frame_alt_ref)
+      memcpy(cpi->interp_filter_selected[LAST_FRAME],
+             cpi->interp_filter_selected[0],
+             sizeof(cpi->interp_filter_selected[0]));
+  }
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0) {  // denoiser gets the same flags
+    vp9_denoiser_update_frame_info(&cpi->denoiser,
+                                   *cpi->Source,
+                                   cpi->common.frame_type,
+                                   cpi->refresh_alt_ref_frame,
+                                   cpi->refresh_golden_frame,
+                                   cpi->refresh_last_frame,
+                                   cpi->resize_pending);
+  }
+#endif
+  if (is_one_pass_cbr_svc(cpi)) {
+    // Keep track of frame index for each reference frame.
+    SVC *const svc = &cpi->svc;
+    if (cm->frame_type == KEY_FRAME) {  // all three slots get this superframe
+      svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe;
+      svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe;
+      svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe;
+    } else {  // only the slots flagged for refresh
+      if (cpi->refresh_last_frame)
+        svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe;
+      if (cpi->refresh_golden_frame)
+        svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe;
+      if (cpi->refresh_alt_ref_frame)
+        svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe;
+    }
+  }
+}
+
+static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
+  // Picks the loop filter level (timed into cpi->time_pick_lpf), filters
+  // cm->frame_to_show if the level is non-zero, then extends inner borders.
+  MACROBLOCKD *const mbd = &cpi->td.mb.e_mbd;
+  struct loopfilter *const filter = &cm->lf;
+
+  if (!mbd->lossless) {
+    struct vpx_usec_timer pick_timer;
+    vpx_clear_system_state();
+    vpx_usec_timer_start(&pick_timer);
+
+    if (cpi->rc.is_src_frame_alt_ref) {
+      filter->filter_level = 0;
+    } else {
+      // A key frame that was not forced clears last_filt_level beforehand.
+      if (cpi->common.frame_type == KEY_FRAME &&
+          !cpi->rc.this_key_frame_forced)
+        filter->last_filt_level = 0;
+      vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.lpf_pick);
+      filter->last_filt_level = filter->filter_level;
+    }
+    vpx_usec_timer_mark(&pick_timer);
+    cpi->time_pick_lpf += vpx_usec_timer_elapsed(&pick_timer);
+  } else {
+    // Lossless: both the current and the remembered level are zeroed.
+    filter->filter_level = 0;
+    filter->last_filt_level = 0;
+  }
+
+  if (filter->filter_level > 0) {
+    vp9_build_mask_frame(cm, filter->filter_level, 0);
+    if (cpi->num_workers <= 1)
+      vp9_loop_filter_frame(cm->frame_to_show, cm, mbd,
+                            filter->filter_level, 0, 0);
+    else
+      vp9_loop_filter_frame_mt(cm->frame_to_show, cm, mbd->plane,
+                               filter->filter_level, 0, 0,
+                               cpi->workers, cpi->num_workers,
+                               &cpi->lf_row_sync);
+  }
+
+  vpx_extend_frame_inner_borders(cm->frame_to_show);
+}
+
+static INLINE void alloc_frame_mvs(const VP9_COMMON *cm,
+                                   int buffer_idx) {
+  // Gives frame buffer buffer_idx a fresh MV_REF array of mi_rows * mi_cols
+  // entries unless it already owns one recorded as at least that large.
+  RefCntBuffer *const fb = &cm->buffer_pool->frame_bufs[buffer_idx];
+  const int fits = fb->mi_rows >= cm->mi_rows && fb->mi_cols >= cm->mi_cols;
+
+  if (fb->mvs != NULL && fits) return;
+  // NOTE(review): the vpx_calloc() result is stored without a NULL check.
+  vpx_free(fb->mvs);
+  fb->mvs = (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*fb->mvs));
+  fb->mi_rows = cm->mi_rows;
+  fb->mi_cols = cm->mi_cols;
+}
+
+void vp9_scale_references(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  MV_REFERENCE_FRAME ref_frame;
+  const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
+  // Fills cpi->scaled_ref_idx[] with a frame-sized buffer per enabled ref.
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    // Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1).
+    if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
+      BufferPool *const pool = cm->buffer_pool;
+      const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi,
+                                                                 ref_frame);
+
+      if (ref == NULL) {  // nothing to scale for this reference
+        cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+        continue;
+      }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+        RefCntBuffer *new_fb_ptr = NULL;
+        int force_scaling = 0;
+        int new_fb = cpi->scaled_ref_idx[ref_frame - 1];  // reuse if held
+        if (new_fb == INVALID_IDX) {
+          new_fb = get_free_fb(cm);
+          force_scaling = 1;  // a freshly obtained buffer is always filled
+        }
+        if (new_fb == INVALID_IDX)
+          return;  // get_free_fb() failed: later refs are not processed
+        new_fb_ptr = &pool->frame_bufs[new_fb];
+        if (force_scaling ||
+            new_fb_ptr->buf.y_crop_width != cm->width ||
+            new_fb_ptr->buf.y_crop_height != cm->height) {
+          vpx_realloc_frame_buffer(&new_fb_ptr->buf,
+                                   cm->width, cm->height,
+                                   cm->subsampling_x, cm->subsampling_y,
+                                   cm->use_highbitdepth,
+                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                   NULL, NULL, NULL);
+          scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth);
+          cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+          alloc_frame_mvs(cm, new_fb);
+        }
+#else
+      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+        RefCntBuffer *new_fb_ptr = NULL;
+        int force_scaling = 0;
+        int new_fb = cpi->scaled_ref_idx[ref_frame - 1];  // reuse if held
+        if (new_fb == INVALID_IDX) {
+          new_fb = get_free_fb(cm);
+          force_scaling = 1;  // a freshly obtained buffer is always filled
+        }
+        if (new_fb == INVALID_IDX)
+          return;  // get_free_fb() failed: later refs are not processed
+        new_fb_ptr = &pool->frame_bufs[new_fb];
+        if (force_scaling ||
+            new_fb_ptr->buf.y_crop_width != cm->width ||
+            new_fb_ptr->buf.y_crop_height != cm->height) {
+          vpx_realloc_frame_buffer(&new_fb_ptr->buf,
+                                   cm->width, cm->height,
+                                   cm->subsampling_x, cm->subsampling_y,
+                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                   NULL, NULL, NULL);
+          vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf);
+          cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+          alloc_frame_mvs(cm, new_fb);
+        }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      } else {  // closes the size-mismatch `if` opened in either branch above
+        int buf_idx;
+        RefCntBuffer *buf = NULL;
+        if (cpi->oxcf.pass == 0 && !cpi->use_svc) {
+          // Check for release of scaled reference.
+          buf_idx = cpi->scaled_ref_idx[ref_frame - 1];
+          buf = (buf_idx != INVALID_IDX) ? &pool->frame_bufs[buf_idx] : NULL;
+          if (buf != NULL) {
+            --buf->ref_count;
+            cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+          }
+        }
+        buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+        buf = &pool->frame_bufs[buf_idx];
+        buf->buf.y_crop_width = ref->y_crop_width;
+        buf->buf.y_crop_height = ref->y_crop_height;
+        cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;  // use the ref unscaled
+        ++buf->ref_count;
+      }
+    } else {  // reference flag not set
+      if (cpi->oxcf.pass != 0 || cpi->use_svc)
+        cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+    }
+  }
+}
+
+static void release_scaled_references(VP9_COMP *cpi) {
+  BufferPool *const pool = cpi->common.buffer_pool;
+  const int refresh[3] = {cpi->refresh_last_frame != 0,
+                          cpi->refresh_golden_frame != 0,
+                          cpi->refresh_alt_ref_frame != 0};
+  int i;
+
+  if (cpi->oxcf.pass != 0 || cpi->use_svc) {
+    // Outside one-pass non-SVC coding every held scaled reference is dropped.
+    for (i = 0; i < MAX_REF_FRAMES; ++i) {
+      const int fb = cpi->scaled_ref_idx[i];
+      if (fb == INVALID_IDX) continue;
+      --pool->frame_bufs[fb].ref_count;
+      cpi->scaled_ref_idx[i] = INVALID_IDX;
+    }
+    return;
+  }
+
+  // Only release scaled references under certain conditions:
+  // if reference will be updated, or if scaled reference has same resolution.
+  for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+    const int fb = cpi->scaled_ref_idx[i - 1];
+    const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i);
+    RefCntBuffer *held;
+    if (fb == INVALID_IDX) continue;
+
+    held = &pool->frame_bufs[fb];
+    if (refresh[i - 1] ||
+        (held->buf.y_crop_width == ref->y_crop_width &&
+         held->buf.y_crop_height == ref->y_crop_height)) {
+      --held->ref_count;
+      cpi->scaled_ref_idx[i - 1] = INVALID_IDX;
+    }
+  }
+}
+
+static void full_to_model_count(unsigned int *model_count,
+                                unsigned int *full_count) {
+  unsigned int sum = full_count[TWO_TOKEN];  // TWO plus [THREE, EOB) merged
+  int t;
+  for (t = THREE_TOKEN; t < EOB_TOKEN; ++t) sum += full_count[t];
+  model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
+  model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
+  model_count[TWO_TOKEN] = sum;
+  model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN];
+}
+
+static void full_to_model_counts(vp9_coeff_count_model *model_count,
+                                 vp9_coeff_count *full_count) {
+  int plane, ref, band, ctx;  // walk every [plane][ref][band][context] cell
+  for (plane = 0; plane < PLANE_TYPES; ++plane)
+    for (ref = 0; ref < REF_TYPES; ++ref)
+      for (band = 0; band < COEF_BANDS; ++band)
+        for (ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx)
+          full_to_model_count(model_count[plane][ref][band][ctx],
+                              full_count[plane][ref][band][ctx]);
+}
+
+#if 0 && CONFIG_INTERNAL_STATS  // compiled out by the leading 0
+static void output_frame_level_debug_stats(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
+  int64_t recon_err;
+  // Debug aid: one stats line per frame to tmp.stt (rewritten at frame 0).
+  vpx_clear_system_state();
+  // NOTE(review): f is used below without a NULL check.
+  recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+
+  if (cpi->twopass.total_left_stats.coded_error != 0.0)
+    fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
+       "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
+       "%10"PRId64" %10"PRId64" %10d "
+       "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
+        "%6d %6d %5d %5d %5d "
+        "%10"PRId64" %10.3lf"
+        "%10lf %8u %10"PRId64" %10d %10d %10d %10d\n",
+        cpi->common.current_video_frame,
+        cm->width, cm->height,
+        cpi->td.rd_counts.m_search_count,
+        cpi->td.rd_counts.ex_search_count,
+        cpi->rc.source_alt_ref_pending,
+        cpi->rc.source_alt_ref_active,
+        cpi->rc.this_frame_target,
+        cpi->rc.projected_frame_size,
+        cpi->rc.projected_frame_size / cpi->common.MBs,
+        (cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
+        cpi->rc.vbr_bits_off_target,
+        cpi->rc.vbr_bits_off_target_fast,
+        cpi->twopass.extend_minq,
+        cpi->twopass.extend_minq_fast,
+        cpi->rc.total_target_vs_actual,
+        (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
+        cpi->rc.total_actual_bits, cm->base_qindex,
+        vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
+        (double)vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
+        vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality,
+                                cm->bit_depth),
+        cpi->rc.avg_q,
+        vp9_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth),
+        cpi->refresh_last_frame, cpi->refresh_golden_frame,
+        cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
+        cpi->twopass.bits_left,
+        cpi->twopass.total_left_stats.coded_error,
+        cpi->twopass.bits_left /
+            (1 + cpi->twopass.total_left_stats.coded_error),
+        cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
+        cpi->twopass.kf_zeromotion_pct,
+        cpi->twopass.fr_content_type,
+        cm->lf.filter_level);
+
+  fclose(f);
+
+  if (0) {  // per-mode counts to Modes.stt; disabled
+    FILE *const fmodes = fopen("Modes.stt", "a");
+    int i;
+
+    fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
+            cm->frame_type, cpi->refresh_golden_frame,
+            cpi->refresh_alt_ref_frame);
+
+    for (i = 0; i < MAX_MODES; ++i)
+      fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+    fprintf(fmodes, "\n");
+
+    fclose(fmodes);
+  }
+}
+#endif
+
+static void set_mv_search_params(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  // The smaller frame dimension is the default bound for the search range.
+  const unsigned int short_side = VPXMIN(cm->width, cm->height);
+
+  // Default based on max resolution.
+  cpi->mv_step_param = vp9_init_search_range(short_side);
+
+  if (!cpi->sf.mv.auto_mv_step_size) return;
+
+  if (frame_is_intra_only(cm)) {
+    // Initialize max_mv_magnitude for use in the first INTER frame
+    // after a key/intra-only frame.
+    cpi->max_mv_magnitude = short_side;
+    return;
+  }
+
+  // Shown frames may step up to twice the max mv magnitude found in the
+  // previous frame, still capped by the resolution-based default.
+  if (cm->show_frame)
+    cpi->mv_step_param = vp9_init_search_range(
+        VPXMIN(short_side, 2 * cpi->max_mv_magnitude));
+  cpi->max_mv_magnitude = 0;  // cleared once it has been consumed
+}
+
+static void set_size_independent_vars(VP9_COMP *cpi) {  // size-agnostic setup
+  vp9_set_speed_features_framesize_independent(cpi);
+  vp9_set_rd_speed_thresholds(cpi);
+  vp9_set_rd_speed_thresholds_sub8x8(cpi);
+  cpi->common.interp_filter = cpi->sf.default_interp_filter;  // sf default
+}
+
+static void set_size_dependent_vars(VP9_COMP *cpi, int *q,
+                                    int *bottom_index, int *top_index) {
+  // Per-frame setup that depends on the frame dimensions. On return *q holds
+  // the quantizer picked by rate control and *bottom_index / *top_index the
+  // bounds it reported.
+  VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+
+  // Setup variables that depend on the dimensions of the frame.
+  vp9_set_speed_features_framesize_dependent(cpi);
+
+  // Decide q and q bounds.
+  *q = vp9_rc_pick_q_and_bounds(cpi, bottom_index, top_index);
+
+  // Frames with inter coding use high-precision mvs only below the q
+  // threshold.
+  if (!frame_is_intra_only(cm))
+    vp9_set_high_precision_mv(cpi, *q < HIGH_PRECISION_MV_QTHRESH);
+
+  // Configure experimental use of segmentation for enhanced coding of
+  // static regions if indicated.
+  // Only allowed in the second pass of a two pass encode, as it requires
+  // lagged coding, and if the relevant speed feature flag is set.
+  if (oxcf->pass == 2 && cpi->sf.static_segmentation)
+    configure_static_seg_features(cpi);
+
+#if CONFIG_VP9_POSTPROC && !(CONFIG_VP9_TEMPORAL_DENOISING)
+  if (oxcf->noise_sensitivity > 0) {
+    // Denoise strength per noise_sensitivity value; settings above 6 get 0.
+    static const int kStrength[] = {
+      0,    // 0 (never looked up: guarded by the test above)
+      20,   // 1
+      40,   // 2
+      60,   // 3
+      100,  // 4
+      100,  // 5
+      150   // 6
+    };
+
+    const int level = oxcf->noise_sensitivity;
+    const int strength = level <= 6 ? kStrength[level] : 0;
+
+    // Source and destination are the same buffer.
+    vp9_denoise(cpi->Source, cpi->Source, strength);
+  }
+#endif  // CONFIG_VP9_POSTPROC && !(CONFIG_VP9_TEMPORAL_DENOISING)
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void setup_denoiser_buffer(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  // Allocate only when denoising is enabled and no buffers exist yet.
+  if (cpi->oxcf.noise_sensitivity <= 0) return;
+  if (cpi->denoiser.frame_buffer_initialized) return;
+  vp9_denoiser_alloc(&cpi->denoiser, cm->width, cm->height,
+                     cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                     cm->use_highbitdepth,
+#endif
+                     VP9_ENC_BORDER_IN_PIXELS);
+}
+#endif
+
+static void init_motion_estimation(VP9_COMP *cpi) {
+  // Runs the cpi->ss_cfg initialiser for NSTEP or DIAMOND; others do nothing.
+  const int stride = cpi->scaled_source.y_stride;
+  switch (cpi->sf.mv.search_method) {
+    case NSTEP: vp9_init3smotion_compensation(&cpi->ss_cfg, stride); break;
+    case DIAMOND: vp9_init_dsmotion_compensation(&cpi->ss_cfg, stride); break;
+    default: break;
+  }
+}
+
+static void set_frame_size(VP9_COMP *cpi) {
+  int ref_frame;
+  VP9_COMMON *const cm = &cpi->common;
+  VP9EncoderConfig *const oxcf = &cpi->oxcf;  // scaled_frame_* written below
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  // Applies a pending resize, reallocates buffers and rebuilds cm->frame_refs.
+  if (oxcf->pass == 2 &&  // two-pass VBR: fixed resize at frame 0, or pending
+      oxcf->rc_mode == VPX_VBR &&
+      ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) ||
+        (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) {
+    calculate_coded_size(
+        cpi, &oxcf->scaled_frame_width, &oxcf->scaled_frame_height);
+
+    // There has been a change in frame size.
+    vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
+                         oxcf->scaled_frame_height);
+  }
+
+  if (oxcf->pass == 0 &&  // one-pass CBR, non-SVC, pending dynamic resize
+      oxcf->rc_mode == VPX_CBR &&
+      !cpi->use_svc &&
+      oxcf->resize_mode == RESIZE_DYNAMIC &&
+      cpi->resize_pending != 0) {
+    oxcf->scaled_frame_width =
+        (oxcf->width * cpi->resize_scale_num) / cpi->resize_scale_den;
+    oxcf->scaled_frame_height =
+        (oxcf->height * cpi->resize_scale_num) /cpi->resize_scale_den;
+    // There has been a change in frame size.
+    vp9_set_size_literal(cpi,
+                         oxcf->scaled_frame_width,
+                         oxcf->scaled_frame_height);
+
+    // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+    set_mv_search_params(cpi);
+
+    vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+#if CONFIG_VP9_TEMPORAL_DENOISING
+    // Reset the denoiser on the resized frame.
+    if (cpi->oxcf.noise_sensitivity > 0) {
+      vp9_denoiser_free(&(cpi->denoiser));
+      setup_denoiser_buffer(cpi);
+      // Dynamic resize is only triggered for non-SVC, so we can force
+      // golden frame update here as temporary fix to denoiser.
+      cpi->refresh_golden_frame = 1;
+    }
+#endif
+  }
+
+  if ((oxcf->pass == 2) &&
+      (!cpi->use_svc ||
+          (is_two_pass_svc(cpi) &&
+              cpi->svc.encode_empty_frame_state != ENCODING))) {
+    vp9_set_target_rate(cpi);
+  }
+
+  alloc_frame_mvs(cm, cm->new_fb_idx);
+
+  // Reset the frame pointers to the current frame size.
+  vpx_realloc_frame_buffer(get_frame_new_buffer(cm),
+                           cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                           cm->use_highbitdepth,
+#endif
+                           VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                           NULL, NULL, NULL);
+
+  alloc_util_frame_buffers(cpi);
+  init_motion_estimation(cpi);
+  // Rebind each reference and set up its scale factors against cm's size.
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
+    const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+
+    ref_buf->idx = buf_idx;
+
+    if (buf_idx != INVALID_IDX) {
+      YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+      ref_buf->buf = buf;
+#if CONFIG_VP9_HIGHBITDEPTH
+      vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+                                        buf->y_crop_width, buf->y_crop_height,
+                                        cm->width, cm->height,
+                                        (buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
+                                            1 : 0);
+#else
+      vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+                                        buf->y_crop_width, buf->y_crop_height,
+                                        cm->width, cm->height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      if (vp9_is_scaled(&ref_buf->sf))
+        vpx_extend_frame_borders(buf);
+    } else {  // no buffer mapped for this reference
+      ref_buf->buf = NULL;
+    }
+  }
+
+  set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
+static void encode_without_recode_loop(VP9_COMP *cpi,
+                                       size_t *size,
+                                       uint8_t *dest) {
+  VP9_COMMON *const cm = &cpi->common;
+  int q = 0, bottom_index = 0, top_index = 0;  // Dummy variables.
+  // Encodes the frame once; 1-pass CBR screen content may re-encode once.
+  vpx_clear_system_state();
+
+  set_frame_size(cpi);
+  cpi->Source = vp9_scale_if_required(cm,
+                                      cpi->un_scaled_source,
+                                      &cpi->scaled_source,
+                                      (cpi->oxcf.pass == 0));
+
+  // Avoid scaling last_source unless it's needed.
+  // Last source is currently only used for screen-content mode,
+  // if partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise
+  // estimation is enabled.
+  if (cpi->unscaled_last_source != NULL &&
+      (cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
+      cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION ||
+      cpi->noise_estimate.enabled))
+    cpi->Last_Source = vp9_scale_if_required(cm,
+                                             cpi->unscaled_last_source,
+                                             &cpi->scaled_last_source,
+                                             (cpi->oxcf.pass == 0));
+  vp9_update_noise_estimate(cpi);
+
+  if (cpi->oxcf.pass == 0 &&
+      cpi->oxcf.rc_mode == VPX_CBR &&
+      cpi->resize_state == 0 &&
+      cm->frame_type != KEY_FRAME &&
+      cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+    vp9_avg_source_sad(cpi);
+
+  // TODO(wonkap/marpan): For 1 pass SVC, since only ZEROMV is allowed for
+  // upsampled reference frame (i.e., svc->force_zero_mode_spatial_ref = 0),
+  // we should be able to avoid this frame-level upsampling.
+  // Keeping it for now as there is an asan error in the multi-threaded SVC
+  // rate control test if this upsampling is removed.
+  if (frame_is_intra_only(cm) == 0) {
+    vp9_scale_references(cpi);
+  }
+
+  set_size_independent_vars(cpi);
+  set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+  if (cpi->oxcf.speed >= 5 &&
+      cpi->oxcf.pass == 0 &&
+      cpi->oxcf.rc_mode == VPX_CBR &&
+      cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    cpi->use_skin_detection = 1;
+  }
+
+  vp9_set_quantizer(cm, q);
+  vp9_set_variance_partition_thresholds(cpi, q);
+
+  setup_frame(cpi);
+
+  suppress_active_map(cpi);
+  // Variance adaptive and in frame q adjustment experiments are mutually
+  // exclusive.
+  if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+    vp9_vaq_frame_setup(cpi);
+  } else if (cpi->oxcf.aq_mode == EQUATOR360_AQ) {
+    vp9_360aq_frame_setup(cpi);
+  } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+    vp9_setup_in_frame_q_adj(cpi);
+  } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    vp9_cyclic_refresh_setup(cpi);
+  }
+  apply_active_map(cpi);
+
+  // transform / motion compensation build reconstruction frame
+  vp9_encode_frame(cpi);
+
+  // Check if we should drop this frame because of high overshoot.
+  // Only for frames where high temporal-source sad is detected.
+  if (cpi->oxcf.pass == 0 &&
+      cpi->oxcf.rc_mode == VPX_CBR &&
+      cpi->resize_state == 0 &&
+      cm->frame_type != KEY_FRAME &&
+      cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+      cpi->rc.high_source_sad == 1) {
+    int frame_size = 0;
+    // Get an estimate of the encoded frame size.
+    save_coding_context(cpi);
+    vp9_pack_bitstream(cpi, dest, size);
+    restore_coding_context(cpi);
+    frame_size = (int)(*size) << 3;  // *size times 8 (presumably bytes->bits)
+    // Check if encoded frame will overshoot too much, and if so, set the q and
+    // adjust some rate control parameters, and return to re-encode the frame.
+    if (vp9_encodedframe_overshoot(cpi, frame_size, &q)) {
+      vpx_clear_system_state();
+      vp9_set_quantizer(cm, q);
+      vp9_set_variance_partition_thresholds(cpi, q);
+      suppress_active_map(cpi);
+      // Turn-off cyclic refresh for re-encoded frame.
+      if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+        unsigned char *const seg_map = cpi->segmentation_map;
+        memset(seg_map, 0, cm->mi_rows * cm->mi_cols);  // zero the whole map
+        vp9_disable_segmentation(&cm->seg);
+      }
+      apply_active_map(cpi);
+      vp9_encode_frame(cpi);
+    }
+  }
+
+  // Update some stats from cyclic refresh, and check if we should not update
+  // golden reference, for non-SVC 1 pass CBR.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+      cm->frame_type != KEY_FRAME &&
+      !cpi->use_svc &&
+      cpi->ext_refresh_frame_flags_pending == 0 &&
+      (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR))
+    vp9_cyclic_refresh_check_golden_update(cpi);
+
+  // Update the skip mb flag probabilities based on the distribution
+  // seen in the last encoder iteration.
+  // update_base_skip_probs(cpi);
+  vpx_clear_system_state();
+}
+
+// Encodes the current frame and, while the projected size is judged out of
+// range, re-encodes it with an adjusted quantizer index q.
+//
+// cpi:  encoder instance; rate-control state in cpi->rc is read and updated.
+// size: receives the byte count written by the dummy bitstream pack.
+// dest: output buffer handed to vp9_pack_bitstream() for the dummy pack.
+//
+// Each pass of the do/while loop sets the frame size, scales the source and
+// reference frames, sets q, encodes, optionally dummy-packs the bitstream to
+// obtain rc->projected_frame_size, and then decides whether to loop again.
+// q is kept within [q_low, q_high], which are narrowed as overshoot or
+// undershoot is observed.
+static void encode_with_recode_loop(VP9_COMP *cpi,
+                                    size_t *size,
+                                    uint8_t *dest) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int bottom_index, top_index;
+  int loop_count = 0;         // Total number of recode passes so far.
+  int loop_at_this_size = 0;  // Recode passes since the last size change.
+  int loop = 0;
+  int overshoot_seen = 0;
+  int undershoot_seen = 0;
+  int frame_over_shoot_limit;
+  int frame_under_shoot_limit;
+  int q = 0, q_low = 0, q_high = 0;
+
+  set_size_independent_vars(cpi);
+
+  do {
+    vpx_clear_system_state();
+
+    set_frame_size(cpi);
+
+    if (loop_count == 0 || cpi->resize_pending != 0) {
+      set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+      // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+      set_mv_search_params(cpi);
+
+      // Reset the loop state for new frame size.
+      overshoot_seen = 0;
+      undershoot_seen = 0;
+
+      // Reconfiguration for change in frame size has concluded.
+      cpi->resize_pending = 0;
+
+      q_low = bottom_index;
+      q_high = top_index;
+
+      loop_at_this_size = 0;
+    }
+
+    // Decide frame size bounds first time through.
+    if (loop_count == 0) {
+      vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+                                       &frame_under_shoot_limit,
+                                       &frame_over_shoot_limit);
+    }
+
+    // The last argument selects the normative scaler only when pass == 0.
+    cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+                                      &cpi->scaled_source,
+                                      (cpi->oxcf.pass == 0));
+
+    if (cpi->unscaled_last_source != NULL)
+      cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
+                                               &cpi->scaled_last_source,
+                                               (cpi->oxcf.pass == 0));
+
+    if (frame_is_intra_only(cm) == 0) {
+      // On a recode pass, drop the scaled references from the previous pass
+      // before scaling them again.
+      if (loop_count > 0) {
+        release_scaled_references(cpi);
+      }
+      vp9_scale_references(cpi);
+    }
+
+    vp9_set_quantizer(cm, q);
+
+    if (loop_count == 0)
+      setup_frame(cpi);
+
+    // Variance adaptive and in frame q adjustment experiments are mutually
+    // exclusive.
+    if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+      vp9_vaq_frame_setup(cpi);
+    } else if (cpi->oxcf.aq_mode == EQUATOR360_AQ) {
+      vp9_360aq_frame_setup(cpi);
+    } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+      vp9_setup_in_frame_q_adj(cpi);
+    }
+
+    // transform / motion compensation build reconstruction frame
+    vp9_encode_frame(cpi);
+
+    // Update the skip mb flag probabilities based on the distribution
+    // seen in the last encoder iteration.
+    // update_base_skip_probs(cpi);
+
+    vpx_clear_system_state();
+
+    // Dummy pack of the bitstream using up to date stats to get an
+    // accurate estimate of output frame size to determine if we need
+    // to recode.
+    if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
+      save_coding_context(cpi);
+      if (!cpi->sf.use_nonrd_pick_mode)
+        vp9_pack_bitstream(cpi, dest, size);
+
+      // *size is in bytes; the projected size is kept in bits.
+      rc->projected_frame_size = (int)(*size) << 3;
+      restore_coding_context(cpi);
+
+      if (frame_over_shoot_limit == 0)
+        frame_over_shoot_limit = 1;
+    }
+
+    if (cpi->oxcf.rc_mode == VPX_Q) {
+      // Never recode in VPX_Q mode.
+      loop = 0;
+    } else {
+      if ((cm->frame_type == KEY_FRAME) &&
+           rc->this_key_frame_forced &&
+           (rc->projected_frame_size < rc->max_frame_bandwidth)) {
+        // Forced key frame: steer q using the luma SSE of the reconstruction
+        // relative to cpi->ambient_err rather than frame size alone.
+        int last_q = q;
+        int64_t kf_err;
+
+        int64_t high_err_target = cpi->ambient_err;
+        int64_t low_err_target = cpi->ambient_err >> 1;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        } else {
+          kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        }
+#else
+        kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+        // Prevent possible divide by zero error below for perfect KF
+        kf_err += !kf_err;
+
+        // The key frame is not good enough or we can afford
+        // to make it better without undue risk of popping.
+        if ((kf_err > high_err_target &&
+             rc->projected_frame_size <= frame_over_shoot_limit) ||
+            (kf_err > low_err_target &&
+             rc->projected_frame_size <= frame_under_shoot_limit)) {
+          // Lower q_high
+          q_high = q > q_low ? q - 1 : q_low;
+
+          // Adjust Q
+          q = (int)((q * high_err_target) / kf_err);
+          q = VPXMIN(q, (q_high + q_low) >> 1);
+        } else if (kf_err < low_err_target &&
+                   rc->projected_frame_size >= frame_under_shoot_limit) {
+          // The key frame is much better than the previous frame
+          // Raise q_low
+          q_low = q < q_high ? q + 1 : q_high;
+
+          // Adjust Q
+          q = (int)((q * low_err_target) / kf_err);
+          q = VPXMIN(q, (q_high + q_low + 1) >> 1);
+        }
+
+        // Clamp Q to upper and lower limits:
+        q = clamp(q, q_low, q_high);
+
+        // Recode only if q actually changed.
+        loop = q != last_q;
+      } else if (recode_loop_test(
+          cpi, frame_over_shoot_limit, frame_under_shoot_limit,
+          q, VPXMAX(q_high, top_index), bottom_index)) {
+        // Is the projected frame size out of range and are we allowed
+        // to attempt to recode.
+        int last_q = q;
+        int retries = 0;
+
+        if (cpi->resize_pending == 1) {
+          // Change in frame size so go back around the recode loop.
+          cpi->rc.frame_size_selector =
+              SCALE_STEP1 - cpi->rc.frame_size_selector;
+          cpi->rc.next_frame_size_selector = cpi->rc.frame_size_selector;
+
+#if CONFIG_INTERNAL_STATS
+          ++cpi->tot_recode_hits;
+#endif
+          ++loop_count;
+          loop = 1;
+          // In a do/while, `continue` jumps to the `while (loop)` test, so
+          // setting loop = 1 above forces another pass.
+          continue;
+        }
+
+        // Frame size out of permitted range:
+        // Update correction factor & compute new Q to try...
+
+        // Frame is too large
+        if (rc->projected_frame_size > rc->this_frame_target) {
+          // Special case if the projected size is > the max allowed.
+          if (rc->projected_frame_size >= rc->max_frame_bandwidth)
+            q_high = rc->worst_quality;
+
+          // Raise Qlow as to at least the current value
+          q_low = q < q_high ? q + 1 : q_high;
+
+          if (undershoot_seen || loop_at_this_size > 1) {
+            // Update the rate correction factors, then bisect the
+            // [q_low, q_high] range (rounding up).
+            vp9_rc_update_rate_correction_factors(cpi);
+
+            q = (q_high + q_low + 1) / 2;
+          } else {
+            // Update the rate correction factors, then ask rate control
+            // for a new q.
+            vp9_rc_update_rate_correction_factors(cpi);
+
+            q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                                  bottom_index, VPXMAX(q_high, top_index));
+
+            // Retry (at most 10 times) while the regulated q is still
+            // below q_low.
+            while (q < q_low && retries < 10) {
+              vp9_rc_update_rate_correction_factors(cpi);
+              q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                                    bottom_index, VPXMAX(q_high, top_index));
+              retries++;
+            }
+          }
+
+          overshoot_seen = 1;
+        } else {
+          // Frame is too small
+          q_high = q > q_low ? q - 1 : q_low;
+
+          if (overshoot_seen || loop_at_this_size > 1) {
+            // Bisect the [q_low, q_high] range (rounding down).
+            vp9_rc_update_rate_correction_factors(cpi);
+            q = (q_high + q_low) / 2;
+          } else {
+            vp9_rc_update_rate_correction_factors(cpi);
+            q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                                   bottom_index, top_index);
+            // Special case reset for qlow for constrained quality.
+            // This should only trigger where there is very substantial
+            // undershoot on a frame and the auto cq level is above
+            // the user passed in value.
+            if (cpi->oxcf.rc_mode == VPX_CQ &&
+                q < q_low) {
+              q_low = q;
+            }
+
+            // Retry (at most 10 times) while the regulated q is still
+            // above q_high.
+            while (q > q_high && retries < 10) {
+              vp9_rc_update_rate_correction_factors(cpi);
+              q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                                     bottom_index, top_index);
+              retries++;
+            }
+          }
+
+          undershoot_seen = 1;
+        }
+
+        // Clamp Q to upper and lower limits:
+        q = clamp(q, q_low, q_high);
+
+        // Recode only if q actually changed.
+        loop = (q != last_q);
+      } else {
+        loop = 0;
+      }
+    }
+
+    // Special case for overlay frame.
+    if (rc->is_src_frame_alt_ref &&
+        rc->projected_frame_size < rc->max_frame_bandwidth)
+      loop = 0;
+
+    if (loop) {
+      ++loop_count;
+      ++loop_at_this_size;
+
+#if CONFIG_INTERNAL_STATS
+      ++cpi->tot_recode_hits;
+#endif
+    }
+  } while (loop);
+}
+
+// Returns the set of VP9_*_FLAG bits for the reference frames that are worth
+// searching. A reference is dropped when its slot maps to the same buffer as
+// another reference that is already enabled; golden is also dropped when no
+// golden update is ever due and there is a single spatial/temporal layer.
+static int get_ref_frame_flags(const VP9_COMP *cpi) {
+  const int *const ref_map = cpi->common.ref_frame_map;
+  const int last_buf = ref_map[cpi->lst_fb_idx];
+  const int golden_buf = ref_map[cpi->gld_fb_idx];
+  const int altref_buf = ref_map[cpi->alt_fb_idx];
+  const int golden_never_due =
+      cpi->rc.frames_till_gf_update_due == INT_MAX &&
+      (cpi->svc.number_temporal_layers == 1 &&
+       cpi->svc.number_spatial_layers == 1);
+  int dropped = 0;  // Flags to remove from the full set.
+
+  // Golden duplicates last, or will never be refreshed.
+  if (golden_buf == last_buf || golden_never_due)
+    dropped |= VP9_GOLD_FLAG;
+
+  // Alt-ref duplicates last or golden.
+  if (altref_buf == last_buf || golden_buf == altref_buf)
+    dropped |= VP9_ALT_FLAG;
+
+  return (VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG) & ~dropped;
+}
+
+// Applies externally supplied overrides (from vp9_update_reference() and
+// vp9_update_entropy()) on top of the per-frame defaults.
+// Note: the overrides are valid only for the next frame passed to
+// encode_frame_to_data_rate().
+static void set_ext_overrides(VP9_COMP *cpi) {
+  // Entropy-context refresh override is one-shot: consume it here.
+  if (cpi->ext_refresh_frame_context_pending) {
+    cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context;
+    cpi->ext_refresh_frame_context_pending = 0;
+  }
+
+  // Nothing more to do unless reference refresh flags were supplied.
+  if (!cpi->ext_refresh_frame_flags_pending)
+    return;
+
+  // The pending flag for the refresh flags is left set by this function.
+  cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
+  cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
+  cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
+}
+
+// Returns a frame buffer whose luma dimensions match the current coded size
+// (cm->mi_cols/mi_rows * MI_SIZE). If `unscaled` already matches it is
+// returned untouched; otherwise it is scaled into `scaled`, which is returned.
+// The normative scaler is used only when `use_normative_scaler` is set and the
+// source is at most twice the destination size in each dimension.
+YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
+                                          YV12_BUFFER_CONFIG *unscaled,
+                                          YV12_BUFFER_CONFIG *scaled,
+                                          int use_normative_scaler) {
+  // Fast path: the source already has the coded dimensions.
+  if (cm->mi_cols * MI_SIZE == unscaled->y_width &&
+      cm->mi_rows * MI_SIZE == unscaled->y_height) {
+    return unscaled;
+  }
+
+  if (use_normative_scaler &&
+      unscaled->y_width <= (scaled->y_width << 1) &&
+      unscaled->y_height <= (scaled->y_height << 1)) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth);
+#else
+    vp9_scale_and_extend_frame(unscaled, scaled);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+    scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
+#else
+    scale_and_extend_frame_nonnormative(unscaled, scaled);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+
+  return scaled;
+}
+
+// Sets cm->ref_frame_sign_bias[ALTREF_FRAME] for the current frame.
+// The bias is 1 only while a source alt-ref is active and either this frame
+// does not refresh the alt-ref buffer or (two-pass multi-arf only) the current
+// GF group entry has rf_level GF_ARF_LOW.
+static void set_arf_sign_bias(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int bias = 0;
+
+  if (cpi->rc.source_alt_ref_active) {
+    if (!cpi->refresh_alt_ref_frame) {
+      bias = 1;
+    } else if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
+      const GF_GROUP *const group = &cpi->twopass.gf_group;
+      bias = (group->rf_level[group->index] == GF_ARF_LOW);
+    }
+  }
+
+  cm->ref_frame_sign_bias[ALTREF_FRAME] = bias;
+}
+
+// Builds a bit mask (bit index = interpolation filter) from the filter
+// selection counts in cpi->interp_filter_selected. A filter's bit is set when
+// LAST_FRAME picked it zero times (out of a non-zero total) and GOLDEN/ALTREF
+// each either have no selections at all or picked it for fewer than 1/50 of
+// their selections. Returns 0 after a key frame or when refreshing the
+// alt-ref.
+static int setup_interp_filter_search_mask(VP9_COMP *cpi) {
+  int totals[MAX_REF_FRAMES] = {0};
+  INTERP_FILTER filt;
+  MV_REFERENCE_FRAME frame;
+  int skip_mask = 0;
+
+  if (cpi->common.last_frame_type == KEY_FRAME ||
+      cpi->refresh_alt_ref_frame) {
+    return skip_mask;
+  }
+
+  // Per-reference totals across all filters.
+  for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+    for (filt = EIGHTTAP; filt <= EIGHTTAP_SHARP; ++filt) {
+      totals[frame] += cpi->interp_filter_selected[frame][filt];
+    }
+  }
+
+  for (filt = EIGHTTAP; filt <= EIGHTTAP_SHARP; ++filt) {
+    // LAST must have selections, none of them for this filter.
+    if (!(totals[LAST_FRAME] &&
+          cpi->interp_filter_selected[LAST_FRAME][filt] == 0))
+      continue;
+
+    // GOLDEN used this filter for at least 1/50 of its selections.
+    if (totals[GOLDEN_FRAME] != 0 &&
+        cpi->interp_filter_selected[GOLDEN_FRAME][filt] * 50
+          >= totals[GOLDEN_FRAME])
+      continue;
+
+    // ALTREF used this filter for at least 1/50 of its selections.
+    if (totals[ALTREF_FRAME] != 0 &&
+        cpi->interp_filter_selected[ALTREF_FRAME][filt] * 50
+          >= totals[ALTREF_FRAME])
+      continue;
+
+    skip_mask |= 1 << filt;
+  }
+
+  return skip_mask;
+}
+
+// Encodes one frame into `dest`.
+//
+// cpi:         encoder instance.
+// size:        passed to the encode/pack routines; read afterwards as the
+//              frame size for vp9_rc_postencode_update().
+// dest:        output bitstream buffer.
+// frame_flags: on normal return, set to cpi->frame_flags with FRAMEFLAGS_KEY
+//              set for key frames and cleared otherwise. Not written on the
+//              frame-drop early return.
+//
+// Sequence: apply external overrides, set per-frame state, possibly drop the
+// frame (1-pass CBR), encode (with or without the recode loop), pick the loop
+// filter, pack the bitstream, update references, adapt probabilities, update
+// rate control, and finally advance frame counters.
+static void encode_frame_to_data_rate(VP9_COMP *cpi,
+                                      size_t *size,
+                                      uint8_t *dest,
+                                      unsigned int *frame_flags) {
+  VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  struct segmentation *const seg = &cm->seg;
+  TX_SIZE t;
+
+  set_ext_overrides(cpi);
+  vpx_clear_system_state();
+
+  // Set the arf sign bias for this frame.
+  set_arf_sign_bias(cpi);
+
+  // Set default state for segment based loop filter update flags.
+  cm->lf.mode_ref_delta_update = 0;
+
+  if (cpi->oxcf.pass == 2 &&
+      cpi->sf.adaptive_interp_filter_search)
+    cpi->sf.interp_filter_search_mask =
+        setup_interp_filter_search_mask(cpi);
+
+  // Set various flags etc to special state if it is a key frame.
+  if (frame_is_intra_only(cm)) {
+    // Reset the loop filter deltas and segmentation map.
+    vp9_reset_segment_features(&cm->seg);
+
+    // If segmentation is enabled force a map update for key frames.
+    if (seg->enabled) {
+      seg->update_map = 1;
+      seg->update_data = 1;
+    }
+
+    // The alternate reference frame cannot be active for a key frame.
+    cpi->rc.source_alt_ref_active = 0;
+
+    cm->error_resilient_mode = oxcf->error_resilient_mode;
+    cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
+
+    // By default, encoder assumes decoder can use prev_mi.
+    if (cm->error_resilient_mode) {
+      cm->frame_parallel_decoding_mode = 1;
+      cm->reset_frame_context = 0;
+      cm->refresh_frame_context = 0;
+    } else if (cm->intra_only) {
+      // Only reset the current context.
+      cm->reset_frame_context = 2;
+    }
+  }
+  if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) {
+    // Use context 0 for intra only empty frame, but the last frame context
+    // for other empty frames.
+    if (cpi->svc.encode_empty_frame_state == ENCODING) {
+      if (cpi->svc.encode_intra_empty_frame != 0)
+        cm->frame_context_idx = 0;
+      else
+        cm->frame_context_idx = FRAME_CONTEXTS - 1;
+    } else {
+    // Otherwise use one frame context per (spatial, temporal) layer pair.
+    cm->frame_context_idx =
+        cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
+        cpi->svc.temporal_layer_id;
+    }
+
+    cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
+
+    // The probs will be updated based on the frame type of its previous
+    // frame if frame_parallel_decoding_mode is 0. The type may vary for
+    // the frame after a key frame in base layer since we may drop enhancement
+    // layers. So set frame_parallel_decoding_mode to 1 in this case.
+    if (cm->frame_parallel_decoding_mode == 0) {
+      if (cpi->svc.number_temporal_layers == 1) {
+        if (cpi->svc.spatial_layer_id == 0 &&
+            cpi->svc.layer_context[0].last_frame_type == KEY_FRAME)
+          cm->frame_parallel_decoding_mode = 1;
+      } else if (cpi->svc.spatial_layer_id == 0) {
+        // Find the 2nd frame in temporal base layer and 1st frame in temporal
+        // enhancement layers from the key frame.
+        int i;
+        for (i = 0; i < cpi->svc.number_temporal_layers; ++i) {
+          if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) {
+            cm->frame_parallel_decoding_mode = 1;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // For 1 pass CBR, check if we are dropping this frame.
+  // For spatial layers, for now only check for frame-dropping on first spatial
+  // layer, and if decision is to drop, we drop whole super-frame.
+  if (oxcf->pass == 0 &&
+      oxcf->rc_mode == VPX_CBR &&
+      cm->frame_type != KEY_FRAME) {
+    if (vp9_rc_drop_frame(cpi) ||
+        (is_one_pass_cbr_svc(cpi) && cpi->svc.rc_drop_superframe == 1)) {
+      // Frame is dropped: update rate control and counters, then return
+      // without encoding. *frame_flags is left untouched on this path.
+      vp9_rc_postencode_update_drop_frame(cpi);
+      ++cm->current_video_frame;
+      cpi->ext_refresh_frame_flags_pending = 0;
+      cpi->svc.rc_drop_superframe = 1;
+      return;
+    }
+  }
+
+  vpx_clear_system_state();
+
+#if CONFIG_INTERNAL_STATS
+  memset(cpi->mode_chosen_counts, 0,
+         MAX_MODES * sizeof(*cpi->mode_chosen_counts));
+#endif
+
+  if (cpi->sf.recode_loop == DISALLOW_RECODE) {
+    encode_without_recode_loop(cpi, size, dest);
+  } else {
+    encode_with_recode_loop(cpi, size, dest);
+  }
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#ifdef OUTPUT_YUV_DENOISED
+  if (oxcf->noise_sensitivity > 0) {
+    vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
+                            yuv_denoised_file);
+  }
+#endif
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+  if (cpi->common.current_video_frame > 1) {
+    vp9_compute_skin_map(cpi, yuv_skinmap_file);
+  }
+#endif
+
+  // Special case code to reduce pulsing when key frames are forced at a
+  // fixed interval. Note the reconstruction error if it is the frame before
+  // the force key frame
+  if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      cpi->ambient_err = vp9_highbd_get_y_sse(cpi->Source,
+                                              get_frame_new_buffer(cm));
+    } else {
+      cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+    }
+#else
+    cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+
+  // If the encoder forced a KEY_FRAME decision
+  if (cm->frame_type == KEY_FRAME)
+    cpi->refresh_last_frame = 1;
+
+  // Propagate colour and render-size metadata to the frame being shown.
+  cm->frame_to_show = get_frame_new_buffer(cm);
+  cm->frame_to_show->color_space = cm->color_space;
+  cm->frame_to_show->color_range = cm->color_range;
+  cm->frame_to_show->render_width  = cm->render_width;
+  cm->frame_to_show->render_height = cm->render_height;
+
+  // Pick the loop filter level for the frame.
+  loopfilter_frame(cpi, cm);
+
+  // build the bitstream
+  vp9_pack_bitstream(cpi, dest, size);
+
+  if (cm->seg.update_map)
+    update_reference_segmentation_map(cpi);
+
+  if (frame_is_intra_only(cm) == 0) {
+    release_scaled_references(cpi);
+  }
+  vp9_update_reference_frames(cpi);
+
+  for (t = TX_4X4; t <= TX_32X32; t++)
+    full_to_model_counts(cpi->td.counts->coef[t],
+                         cpi->td.rd_counts.coef_counts[t]);
+
+  // Probability adaptation is skipped in error-resilient and
+  // frame-parallel decoding modes.
+  if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode)
+    vp9_adapt_coef_probs(cm);
+
+  if (!frame_is_intra_only(cm)) {
+    if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
+      vp9_adapt_mode_probs(cm);
+      vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv);
+    }
+  }
+
+  // The external refresh-flag override has now been consumed.
+  cpi->ext_refresh_frame_flags_pending = 0;
+
+  if (cpi->refresh_golden_frame == 1)
+    cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
+  else
+    cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+
+  if (cpi->refresh_alt_ref_frame == 1)
+    cpi->frame_flags |= FRAMEFLAGS_ALTREF;
+  else
+    cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+  cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+
+  cm->last_frame_type = cm->frame_type;
+
+  // Rate control is not updated for two-pass SVC empty frames.
+  if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING))
+    vp9_rc_postencode_update(cpi, *size);
+
+#if 0
+  output_frame_level_debug_stats(cpi);
+#endif
+
+  if (cm->frame_type == KEY_FRAME) {
+    // Tell the caller that the frame was coded as a key frame
+    *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY;
+  } else {
+    *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+  }
+
+  // Clear the one shot update flags for segmentation map and mode/ref loop
+  // filter deltas.
+  cm->seg.update_map = 0;
+  cm->seg.update_data = 0;
+  cm->lf.mode_ref_delta_update = 0;
+
+  // keep track of the last coded dimensions
+  cm->last_width = cm->width;
+  cm->last_height = cm->height;
+
+  // reset to normal state now that we are done.
+  if (!cm->show_existing_frame)
+    cm->last_show_frame = cm->show_frame;
+
+  if (cm->show_frame) {
+    vp9_swap_mi_and_prev_mi(cm);
+    // Don't increment frame counters if this was an altref buffer
+    // update not a real frame
+    ++cm->current_video_frame;
+    if (cpi->use_svc)
+      vp9_inc_frame_in_layer(cpi);
+  }
+  cm->prev_frame = cm->cur_frame;
+
+  // Record this frame's type in the context of its (spatial, temporal) layer.
+  if (cpi->use_svc)
+    cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+                           cpi->svc.number_temporal_layers +
+                           cpi->svc.temporal_layer_id].last_frame_type =
+                               cm->frame_type;
+}
+
+// SVC encode entry point: fetches the SVC rate-control parameters for this
+// frame and then encodes it.
+static void SvcEncode(VP9_COMP *cpi,
+                      size_t *size,
+                      uint8_t *dest,
+                      unsigned int *frame_flags) {
+  // Rate-control parameters must be in place before the frame is encoded.
+  vp9_rc_get_svc_params(cpi);
+
+  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+}
+
+// One-pass encode entry point: selects the rate-control parameters for the
+// configured mode (CBR, or VBR for every other mode) and encodes the frame.
+static void Pass0Encode(VP9_COMP *cpi,
+                        size_t *size,
+                        uint8_t *dest,
+                        unsigned int *frame_flags) {
+  if (cpi->oxcf.rc_mode != VPX_CBR)
+    vp9_rc_get_one_pass_vbr_params(cpi);
+  else
+    vp9_rc_get_one_pass_cbr_params(cpi);
+
+  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+}
+
+// Second-pass encode entry point: enables encode breakout, encodes the frame,
+// and then runs the two-pass post-encode update unless this was a two-pass
+// SVC empty frame.
+static void Pass2Encode(VP9_COMP *cpi,
+                        size_t *size,
+                        uint8_t *dest,
+                        unsigned int *frame_flags) {
+  cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
+
+  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+
+  // Checked after the encode, as the original did.
+  if (is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)
+    return;
+
+  vp9_twopass_postencode_update(cpi);
+}
+
+// Marks the new-frame index and every reference map slot as invalid, and
+// zeroes the reference count of the first REF_FRAMES pool buffers.
+static void init_ref_frame_bufs(VP9_COMMON *cm) {
+  BufferPool *const buf_pool = cm->buffer_pool;
+  int slot = 0;
+
+  cm->new_fb_idx = INVALID_IDX;
+
+  while (slot < REF_FRAMES) {
+    cm->ref_frame_map[slot] = INVALID_IDX;
+    buf_pool->frame_bufs[slot].ref_count = 0;
+    ++slot;
+  }
+}
+
+// (Re)allocates the encoder's frame buffers the first time a frame is seen
+// (cpi->initial_width == 0) or whenever the chroma subsampling (and, in
+// high-bitdepth builds, the bit-depth flag) differs from the current setup.
+// Also records the initial width, height and macroblock count.
+static void check_initial_width(VP9_COMP *cpi,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                int use_highbitdepth,
+#endif
+                                int subsampling_x, int subsampling_y) {
+  VP9_COMMON *const cm = &cpi->common;
+  int format_changed = cm->subsampling_x != subsampling_x ||
+                       cm->subsampling_y != subsampling_y;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  format_changed = format_changed || cm->use_highbitdepth != use_highbitdepth;
+#endif
+
+  // Already initialised with the same format: nothing to do.
+  if (cpi->initial_width && !format_changed)
+    return;
+
+  cm->subsampling_x = subsampling_x;
+  cm->subsampling_y = subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+  cm->use_highbitdepth = use_highbitdepth;
+#endif
+
+  alloc_raw_frame_buffers(cpi);
+  init_ref_frame_bufs(cm);
+  alloc_util_frame_buffers(cpi);
+
+  init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
+
+  cpi->initial_width = cm->width;
+  cpi->initial_height = cm->height;
+  cpi->initial_mbs = cm->MBs;
+}
+
+// Accepts one raw input frame and pushes it onto the encoder's lookahead
+// queue.
+//
+// cpi:         encoder instance.
+// frame_flags: per-frame flags forwarded to vp9_lookahead_push().
+// sd:          source frame; its subsampling (and high-bitdepth flag, when
+//              built with CONFIG_VP9_HIGHBITDEPTH) drive buffer setup.
+// time_stamp:  start timestamp forwarded to the lookahead.
+// end_time:    end timestamp forwarded to the lookahead.
+//
+// Returns 0 on success, or -1 if the lookahead push fails, the subsampling
+// does not match the configured profile, or control returns through the
+// setjmp error handler.
+int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time) {
+  // cm and res are volatile-qualified because they are used across setjmp.
+  VP9_COMMON *volatile const cm = &cpi->common;
+  struct vpx_usec_timer timer;
+  volatile int res = 0;
+  const int subsampling_x = sd->subsampling_x;
+  const int subsampling_y = sd->subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+#endif
+
+  // setjmp() returns non-zero only when control comes back through a longjmp
+  // on cm->error.jmp; treat that as failure.
+  if (setjmp(cm->error.jmp)) {
+    cm->error.setjmp = 0;
+    return -1;
+  }
+  cm->error.setjmp = 1;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+#else
+  check_initial_width(cpi, subsampling_x, subsampling_y);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
+  // Time only the lookahead push and add it to cpi->time_receive_data.
+  vpx_usec_timer_start(&timer);
+
+  if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+#if CONFIG_VP9_HIGHBITDEPTH
+                         use_highbitdepth,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                         frame_flags))
+    res = -1;
+  vpx_usec_timer_mark(&timer);
+  cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
+
+  // Profiles 0 and 2 accept only 4:2:0 input (subsampling 1,1).
+  // NOTE(review): vpx_internal_error() presumably longjmps to the handler
+  // above while error.setjmp is set, making the following `res = -1`
+  // a fallback -- confirm against its definition.
+  if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) &&
+      (subsampling_x != 1 || subsampling_y != 1)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+                       "Non-4:2:0 color format requires profile 1 or 3");
+    res = -1;
+  }
+  // Profiles 1 and 3 reject 4:2:0 input.
+  if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) &&
+      (subsampling_x == 1 && subsampling_y == 1)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+                       "4:2:0 color format requires profile 0 or 2");
+    res = -1;
+  }
+
+  cm->error.setjmp = 0;
+  return res;
+}
+
+
+// Returns 1 if the current frame updates any state that later frames depend
+// on (a reference buffer, the frame context, loop-filter deltas or
+// segmentation), 0 otherwise.
+static int frame_is_reference(const VP9_COMP *cpi) {
+  const VP9_COMMON *cm = &cpi->common;
+
+  // Key frames are always references.
+  if (cm->frame_type == KEY_FRAME)
+    return 1;
+
+  // Any reference buffer refresh.
+  if (cpi->refresh_last_frame ||
+      cpi->refresh_golden_frame ||
+      cpi->refresh_alt_ref_frame)
+    return 1;
+
+  // Persistent coding state updates.
+  return cm->refresh_frame_context ||
+         cm->lf.mode_ref_delta_update ||
+         cm->seg.update_map ||
+         cm->seg.update_data;
+}
+
+// Re-estimates the encoder frame rate from the timestamps of `source` and
+// records those timestamps as the most recently seen ones.
+// The rate is replaced outright ("step" update) on the first frame or when the
+// frame duration changed by 10% or more; otherwise the new duration is
+// averaged into the running estimate.
+static void adjust_frame_rate(VP9_COMP *cpi,
+                              const struct lookahead_entry *source) {
+  const int is_first_frame = source->ts_start == cpi->first_time_stamp_ever;
+  int64_t this_duration;
+  int step = is_first_frame ? 1 : 0;
+
+  if (is_first_frame) {
+    this_duration = source->ts_end - source->ts_start;
+  } else {
+    const int64_t last_duration = cpi->last_end_time_stamp_seen
+        - cpi->last_time_stamp_seen;
+
+    this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
+
+    // do a step update if the duration changes by 10%
+    if (last_duration)
+      step = (int)((this_duration - last_duration) * 10 / last_duration);
+  }
+
+  if (this_duration && step) {
+    vp9_new_framerate(cpi, 10000000.0 / this_duration);
+  } else if (this_duration) {
+    // Average this frame's rate into the last second's average
+    // frame rate. If we haven't seen 1 second yet, then average
+    // over the whole interval seen.
+    const double interval = VPXMIN(
+        (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
+    double avg_duration = 10000000.0 / cpi->framerate;
+    avg_duration *= (interval - avg_duration + this_duration);
+    avg_duration /= interval;
+
+    vp9_new_framerate(cpi, 10000000.0 / avg_duration);
+  }
+
+  cpi->last_time_stamp_seen = source->ts_start;
+  cpi->last_end_time_stamp_seen = source->ts_end;
+}
+
+// Returns the lookahead offset of the source frame to use as the alt-ref
+// midpoint, or 0 when the current frame is not an alt-ref.
+// In two-pass mode the offset comes from the GF group entry (ARF_UPDATE only);
+// otherwise it is the number of frames until the next golden update, provided
+// an alt-ref is pending.
+static int get_arf_src_index(VP9_COMP *cpi) {
+  if (!is_altref_enabled(cpi))
+    return 0;
+
+  if (cpi->oxcf.pass == 2) {
+    const GF_GROUP *const group = &cpi->twopass.gf_group;
+    return (group->update_type[group->index] == ARF_UPDATE)
+               ? group->arf_src_offset[group->index]
+               : 0;
+  }
+
+  return cpi->rc.source_alt_ref_pending ? cpi->rc.frames_till_gf_update_due
+                                        : 0;
+}
+
+// Determines whether `source` is the overlay for a previously coded alt-ref
+// frame and stores the answer in rc->is_src_frame_alt_ref. For an overlay the
+// stored alt-ref source is cleared and the LAST buffer refresh is disabled.
+static void check_src_altref(VP9_COMP *cpi,
+                             const struct lookahead_entry *source) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  if (cpi->oxcf.pass != 2) {
+    // Overlay if this is the very entry the alt-ref was built from.
+    rc->is_src_frame_alt_ref = cpi->alt_ref_source &&
+                               (source == cpi->alt_ref_source);
+  } else {
+    // Two-pass: the GF group structure says whether this is an overlay.
+    const GF_GROUP *const group = &cpi->twopass.gf_group;
+    rc->is_src_frame_alt_ref =
+      (group->update_type[group->index] == OVERLAY_UPDATE);
+  }
+
+  if (!rc->is_src_frame_alt_ref)
+    return;
+
+  // Current frame is an ARF overlay frame.
+  cpi->alt_ref_source = NULL;
+
+  // Don't refresh the last buffer for an ARF overlay frame. It will
+  // become the GF so preserve last as an alternative prediction option.
+  cpi->refresh_last_frame = 0;
+}
+
+#if CONFIG_INTERNAL_STATS
+extern double vp9_get_blockiness(const uint8_t *img1, int img1_pitch,
+                                 const uint8_t *img2, int img2_pitch,
+                                 int width, int height);
+
+// Accumulates one frame's per-plane and combined quality values into `s`
+// and lowers s->worst to `all` when `all` is smaller.
+static void adjust_image_stat(double y, double u, double v, double all,
+                              ImageStat *s) {
+  // Track the minimum combined value seen so far.
+  s->worst = VPXMIN(s->worst, all);
+
+  // Running sums, one slot per plane plus the combined figure.
+  s->stat[ALL] += all;
+  s->stat[Y] += y;
+  s->stat[U] += u;
+  s->stat[V] += v;
+}
+#endif  // CONFIG_INTERNAL_STATS
+
+int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
+                            size_t *size, uint8_t *dest,
+                            int64_t *time_stamp, int64_t *time_end, int flush) {
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  VP9_COMMON *const cm = &cpi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  RATE_CONTROL *const rc = &cpi->rc;
+  struct vpx_usec_timer  cmptimer;
+  YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+  struct lookahead_entry *last_source = NULL;
+  struct lookahead_entry *source = NULL;
+  int arf_src_index;
+  int i;
+
+  if (is_two_pass_svc(cpi)) {
+#if CONFIG_SPATIAL_SVC
+    vp9_svc_start_frame(cpi);
+    // Use a small empty frame instead of a real frame
+    if (cpi->svc.encode_empty_frame_state == ENCODING)
+      source = &cpi->svc.empty_frame;
+#endif
+    if (oxcf->pass == 2)
+      vp9_restore_layer_context(cpi);
+  } else if (is_one_pass_cbr_svc(cpi)) {
+    vp9_one_pass_cbr_svc_start_layer(cpi);
+  }
+
+  vpx_usec_timer_start(&cmptimer);
+
+  vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
+
+  // Is multi-arf enabled.
+  // Note that at the moment multi_arf is only configured for 2 pass VBR and
+  // will not work properly with svc.
+  if ((oxcf->pass == 2) && !cpi->use_svc &&
+      (cpi->oxcf.enable_auto_arf > 1))
+    cpi->multi_arf_allowed = 1;
+  else
+    cpi->multi_arf_allowed = 0;
+
+  // Normal defaults
+  cm->reset_frame_context = 0;
+  cm->refresh_frame_context = 1;
+  if (!is_one_pass_cbr_svc(cpi)) {
+    cpi->refresh_last_frame = 1;
+    cpi->refresh_golden_frame = 0;
+    cpi->refresh_alt_ref_frame = 0;
+  }
+
+  // Should we encode an arf frame.
+  arf_src_index = get_arf_src_index(cpi);
+
+  // Skip alt frame if we encode the empty frame
+  if (is_two_pass_svc(cpi) && source != NULL)
+    arf_src_index = 0;
+
+  if (arf_src_index) {
+    assert(arf_src_index <= rc->frames_to_key);
+
+    if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+      cpi->alt_ref_source = source;
+
+#if CONFIG_SPATIAL_SVC
+      if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) {
+        int i;
+        // Reference a hidden frame from a lower layer
+        for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) {
+          if (oxcf->ss_enable_auto_arf[i]) {
+            cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx;
+            break;
+          }
+        }
+      }
+      cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1;
+#endif
+
+      if ((oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0)) {
+        // Produce the filtered ARF frame.
+        vp9_temporal_filter(cpi, arf_src_index);
+        vpx_extend_frame_borders(&cpi->alt_ref_buffer);
+        force_src_buffer = &cpi->alt_ref_buffer;
+      }
+
+      cm->show_frame = 0;
+      cm->intra_only = 0;
+      cpi->refresh_alt_ref_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_last_frame = 0;
+      rc->is_src_frame_alt_ref = 0;
+      rc->source_alt_ref_pending = 0;
+    } else {
+      rc->source_alt_ref_pending = 0;
+    }
+  }
+
+  if (!source) {
+    // Get last frame source.
+    if (cm->current_video_frame > 0) {
+      if ((last_source = vp9_lookahead_peek(cpi->lookahead, -1)) == NULL)
+        return -1;
+    }
+
+    // Read in the source frame.
+    if (cpi->use_svc)
+      source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
+    else
+      source = vp9_lookahead_pop(cpi->lookahead, flush);
+
+    if (source != NULL) {
+      cm->show_frame = 1;
+      cm->intra_only = 0;
+      // if the flags indicate intra frame, but if the current picture is for
+      // non-zero spatial layer, it should not be an intra picture.
+      // TODO(Won Kap): this needs to change if per-layer intra frame is
+      // allowed.
+      if ((source->flags & VPX_EFLAG_FORCE_KF) &&
+          cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) {
+        source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
+      }
+
+      // Check to see if the frame should be encoded as an arf overlay.
+      check_src_altref(cpi, source);
+    }
+  }
+
+  if (source) {
+    cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer
+                                                           : &source->img;
+
+    cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
+
+    *time_stamp = source->ts_start;
+    *time_end = source->ts_end;
+    *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+  } else {
+    *size = 0;
+    if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
+      vp9_end_first_pass(cpi);    /* get last stats packet */
+      cpi->twopass.first_pass_done = 1;
+    }
+    return -1;
+  }
+
+  if (source->ts_start < cpi->first_time_stamp_ever) {
+    cpi->first_time_stamp_ever = source->ts_start;
+    cpi->last_end_time_stamp_seen = source->ts_start;
+  }
+
+  // Clear down mmx registers
+  vpx_clear_system_state();
+
+  // adjust frame rates based on timestamps given
+  if (cm->show_frame) {
+    adjust_frame_rate(cpi, source);
+  }
+
+  if (is_one_pass_cbr_svc(cpi)) {
+    vp9_update_temporal_layer_framerate(cpi);
+    vp9_restore_layer_context(cpi);
+  }
+
+  // Find a free buffer for the new frame, releasing the reference previously
+  // held.
+  if (cm->new_fb_idx != INVALID_IDX) {
+    --pool->frame_bufs[cm->new_fb_idx].ref_count;
+  }
+  cm->new_fb_idx = get_free_fb(cm);
+
+  if (cm->new_fb_idx == INVALID_IDX)
+    return -1;
+
+  cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+
+  if (!cpi->use_svc && cpi->multi_arf_allowed) {
+    if (cm->frame_type == KEY_FRAME) {
+      init_buffer_indices(cpi);
+    } else if (oxcf->pass == 2) {
+      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+      cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index];
+    }
+  }
+
+  // Start with a 0 size frame.
+  *size = 0;
+
+  cpi->frame_flags = *frame_flags;
+
+  if ((oxcf->pass == 2) &&
+      (!cpi->use_svc ||
+          (is_two_pass_svc(cpi) &&
+              cpi->svc.encode_empty_frame_state != ENCODING))) {
+    vp9_rc_get_second_pass_params(cpi);
+  } else if (oxcf->pass == 1) {
+    set_frame_size(cpi);
+  }
+
+  if (cpi->oxcf.pass != 0 ||
+      cpi->use_svc ||
+      frame_is_intra_only(cm) == 1) {
+    for (i = 0; i < MAX_REF_FRAMES; ++i)
+      cpi->scaled_ref_idx[i] = INVALID_IDX;
+  }
+
+  if (oxcf->pass == 1 &&
+      (!cpi->use_svc || is_two_pass_svc(cpi))) {
+    const int lossless = is_lossless_requested(oxcf);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->oxcf.use_highbitdepth)
+      cpi->td.mb.fwd_txm4x4 = lossless ?
+          vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4;
+    else
+      cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+    cpi->td.mb.highbd_itxm_add = lossless ? vp9_highbd_iwht4x4_add :
+                                         vp9_highbd_idct4x4_add;
+#else
+    cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    cpi->td.mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+    vp9_first_pass(cpi, source);
+  } else if (oxcf->pass == 2 &&
+      (!cpi->use_svc || is_two_pass_svc(cpi))) {
+    Pass2Encode(cpi, size, dest, frame_flags);
+  } else if (cpi->use_svc) {
+    SvcEncode(cpi, size, dest, frame_flags);
+  } else {
+    // One pass encode
+    Pass0Encode(cpi, size, dest, frame_flags);
+  }
+
+  if (cm->refresh_frame_context)
+    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+
+  // No frame encoded, or frame was dropped, release scaled references.
+  if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
+    release_scaled_references(cpi);
+  }
+
+  if (*size > 0) {
+    cpi->droppable = !frame_is_reference(cpi);
+  }
+
+  // Save layer specific state.
+  if (is_one_pass_cbr_svc(cpi) ||
+        ((cpi->svc.number_temporal_layers > 1 ||
+          cpi->svc.number_spatial_layers > 1) &&
+         oxcf->pass == 2)) {
+    vp9_save_layer_context(cpi);
+  }
+
+  vpx_usec_timer_mark(&cmptimer);
+  cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+
+  if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
+    generate_psnr_packet(cpi);
+
+#if CONFIG_INTERNAL_STATS
+
+  if (oxcf->pass != 1) {
+    double samples = 0.0;
+    cpi->bytes += (int)(*size);
+
+    if (cm->show_frame) {
+      cpi->count++;
+
+      if (cpi->b_calculate_psnr) {
+        YV12_BUFFER_CONFIG *orig = cpi->Source;
+        YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+        YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
+        PSNR_STATS psnr;
+#if CONFIG_VP9_HIGHBITDEPTH
+        calc_highbd_psnr(orig, recon, &psnr, cpi->td.mb.e_mbd.bd,
+                         cpi->oxcf.input_bit_depth);
+#else
+        calc_psnr(orig, recon, &psnr);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+        adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3],
+                          psnr.psnr[0], &cpi->psnr);
+        cpi->total_sq_error += psnr.sse[0];
+        cpi->total_samples += psnr.samples[0];
+        samples = psnr.samples[0];
+
+        {
+          PSNR_STATS psnr2;
+          double frame_ssim2 = 0, weight = 0;
+#if CONFIG_VP9_POSTPROC
+          if (vpx_alloc_frame_buffer(&cm->post_proc_buffer,
+                                     recon->y_crop_width, recon->y_crop_height,
+                                     cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                     cm->use_highbitdepth,
+#endif
+                                     VP9_ENC_BORDER_IN_PIXELS,
+                                     cm->byte_alignment) < 0) {
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate post processing buffer");
+          }
+
+          vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
+                      cm->lf.filter_level * 10 / 6);
+#endif
+          vpx_clear_system_state();
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          calc_highbd_psnr(orig, pp, &psnr2, cpi->td.mb.e_mbd.bd,
+                           cpi->oxcf.input_bit_depth);
+#else
+          calc_psnr(orig, pp, &psnr2);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+          cpi->totalp_sq_error += psnr2.sse[0];
+          cpi->totalp_samples += psnr2.samples[0];
+          adjust_image_stat(psnr2.psnr[1], psnr2.psnr[2], psnr2.psnr[3],
+                            psnr2.psnr[0], &cpi->psnrp);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (cm->use_highbitdepth) {
+            frame_ssim2 = vpx_highbd_calc_ssim(orig, recon, &weight,
+                                               (int)cm->bit_depth);
+          } else {
+            frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
+          }
+#else
+          frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+          cpi->worst_ssim = VPXMIN(cpi->worst_ssim, frame_ssim2);
+          cpi->summed_quality += frame_ssim2 * weight;
+          cpi->summed_weights += weight;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (cm->use_highbitdepth) {
+            frame_ssim2 = vpx_highbd_calc_ssim(
+                orig, &cm->post_proc_buffer, &weight, (int)cm->bit_depth);
+          } else {
+            frame_ssim2 = vpx_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+          }
+#else
+          frame_ssim2 = vpx_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+          cpi->summedp_quality += frame_ssim2 * weight;
+          cpi->summedp_weights += weight;
+#if 0
+          {
+            FILE *f = fopen("q_used.stt", "a");
+            fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
+                    cpi->common.current_video_frame, y2, u2, v2,
+                    frame_psnr2, frame_ssim2);
+            fclose(f);
+          }
+#endif
+        }
+      }
+      if (cpi->b_calculate_blockiness) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!cm->use_highbitdepth)
+#endif
+        {
+          double frame_blockiness = vp9_get_blockiness(
+              cpi->Source->y_buffer, cpi->Source->y_stride,
+              cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+              cpi->Source->y_width, cpi->Source->y_height);
+          cpi->worst_blockiness =
+              VPXMAX(cpi->worst_blockiness, frame_blockiness);
+          cpi->total_blockiness += frame_blockiness;
+        }
+      }
+
+      if (cpi->b_calculate_consistency) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (!cm->use_highbitdepth)
+#endif
+        {
+          double this_inconsistency = vpx_get_ssim_metrics(
+              cpi->Source->y_buffer, cpi->Source->y_stride,
+              cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+              cpi->Source->y_width, cpi->Source->y_height, cpi->ssim_vars,
+              &cpi->metrics, 1);
+
+          const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+          double consistency = vpx_sse_to_psnr(samples, peak,
+                                             (double)cpi->total_inconsistency);
+          if (consistency > 0.0)
+            cpi->worst_consistency =
+                VPXMIN(cpi->worst_consistency, consistency);
+          cpi->total_inconsistency += this_inconsistency;
+        }
+      }
+
+      if (cpi->b_calculate_ssimg) {
+        double y, u, v, frame_all;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth) {
+          frame_all = vpx_highbd_calc_ssimg(cpi->Source, cm->frame_to_show, &y,
+                                            &u, &v, (int)cm->bit_depth);
+        } else {
+          frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u,
+                                     &v);
+        }
+#else
+        frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        adjust_image_stat(y, u, v, frame_all, &cpi->ssimg);
+      }
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!cm->use_highbitdepth)
+#endif
+      {
+        double y, u, v, frame_all;
+        frame_all = vpx_calc_fastssim(cpi->Source, cm->frame_to_show, &y, &u,
+                                      &v);
+        adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
+        /* TODO(JBB): add 10/12 bit support */
+      }
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!cm->use_highbitdepth)
+#endif
+      {
+        double y, u, v, frame_all;
+        frame_all = vpx_psnrhvs(cpi->Source, cm->frame_to_show, &y, &u, &v);
+        adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
+      }
+    }
+  }
+
+#endif
+
+  if (is_two_pass_svc(cpi)) {
+    if (cpi->svc.encode_empty_frame_state == ENCODING) {
+      cpi->svc.encode_empty_frame_state = ENCODED;
+      cpi->svc.encode_intra_empty_frame = 0;
+    }
+
+    if (cm->show_frame) {
+      ++cpi->svc.spatial_layer_to_encode;
+      if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
+        cpi->svc.spatial_layer_to_encode = 0;
+
+      // May need the empty frame after a visible frame.
+      cpi->svc.encode_empty_frame_state = NEED_TO_ENCODE;
+    }
+  } else if (is_one_pass_cbr_svc(cpi)) {
+    if (cm->show_frame) {
+      ++cpi->svc.spatial_layer_to_encode;
+      if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
+        cpi->svc.spatial_layer_to_encode = 0;
+    }
+  }
+  vpx_clear_system_state();
+  return 0;
+}
+
+int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
+                              vp9_ppflags_t *flags) {
+  VP9_COMMON *common = &cpi->common;
+  int status = -1;  // stays -1 when there is no frame_to_show to copy
+#if !CONFIG_VP9_POSTPROC
+  (void)flags;  // only the post-processing build consumes flags
+#endif
+
+  // A frame that is not shown has no preview: fail before touching dest.
+  if (!common->show_frame)
+    return -1;
+
+#if CONFIG_VP9_POSTPROC
+  // Let the post-processor fill dest and report its own status.
+  status = vp9_post_proc_frame(common, dest, flags);
+#else
+  // Copy the shown frame's descriptor, then report the common frame size.
+  if (common->frame_to_show) {
+    *dest = *common->frame_to_show;
+    dest->y_width = common->width;
+    dest->y_height = common->height;
+    dest->uv_width = common->width >> common->subsampling_x;
+    dest->uv_height = common->height >> common->subsampling_y;
+    status = 0;
+  }
+#endif  // CONFIG_VP9_POSTPROC
+  vpx_clear_system_state();
+  return status;
+}
+
+int vp9_set_internal_size(VP9_COMP *cpi,
+                          VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
+  VP9_COMMON *cm = &cpi->common;
+  int h_num = 0, h_den = 0, v_num = 0, v_den = 0;
+
+  // ONETWO is the highest scaling mode accepted; anything above fails.
+  if (!(horiz_mode <= ONETWO && vert_mode <= ONETWO))
+    return -1;
+
+  Scale2Ratio(horiz_mode, &h_num, &h_den);
+  Scale2Ratio(vert_mode, &v_num, &v_den);
+
+  // Scaled size is ceil(size * num / den): round up to a whole number.
+  cm->width = (h_den - 1 + cpi->oxcf.width * h_num) / h_den;
+  cm->height = (v_den - 1 + cpi->oxcf.height * v_num) / v_den;
+  if (cm->current_video_frame) {
+    assert(cm->width <= cpi->initial_width);
+    assert(cm->height <= cpi->initial_height);
+  }
+
+  update_frame_size(cpi);
+  return 0;
+}
+
+int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
+                         unsigned int height) {
+  VP9_COMMON *cm = &cpi->common;
+#if CONFIG_VP9_HIGHBITDEPTH
+  check_initial_width(cpi, cm->use_highbitdepth, 1, 1);
+#else
+  check_initial_width(cpi, 1, 1);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
+
+  // A zero dimension leaves the current value untouched; a non-zero one is
+  // stored and then clamped to the initial dimension, with a warning.
+  if (width != 0) {
+    cm->width = width;
+    if (cpi->initial_width < cm->width) {
+      cm->width = cpi->initial_width;
+      printf("Warning: Desired width too large, changed to %d\n", cm->width);
+    }
+  }
+  if (height != 0) {
+    cm->height = height;
+    if (cpi->initial_height < cm->height) {
+      cm->height = cpi->initial_height;
+      printf("Warning: Desired height too large, changed to %d\n", cm->height);
+    }
+  }
+  assert(cm->width <= cpi->initial_width);
+  assert(cm->height <= cpi->initial_height);
+
+  update_frame_size(cpi);
+  return 0;
+}
+
+void vp9_set_svc(VP9_COMP *cpi, int use_svc) {
+  // Store the caller's use_svc setting on the encoder instance.
+  cpi->use_svc = use_svc;
+}
+
+int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                      const YV12_BUFFER_CONFIG *b) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  // get_sse() is run over the two Y planes using a's cropped width/height.
+  return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                 a->y_crop_width, a->y_crop_height);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+  // Same as vp9_get_y_sse() but via highbd_get_sse(); flags asserted above.
+  return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                        a->y_crop_width, a->y_crop_height);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+int vp9_get_quantizer(VP9_COMP *cpi) {
+  return cpi->common.base_qindex;  // plain accessor; cpi is not modified
+}
+
+void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) {
+  // Flag groups that make each of the two optional calls below necessary.
+  const vpx_enc_frame_flags_t no_ref_mask =
+      VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+  const vpx_enc_frame_flags_t upd_mask =
+      VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+      VP8_EFLAG_FORCE_GF | VP8_EFLAG_FORCE_ARF;
+
+  // References: start from mask 7 and XOR in the VP9_*_FLAG bit of each
+  // reference the caller excluded. Skipped when no NO_REF flag is present.
+  if (flags & no_ref_mask) {
+    int ref_mask = 7;
+
+    ref_mask ^= (flags & VP8_EFLAG_NO_REF_LAST) ? VP9_LAST_FLAG : 0;
+    ref_mask ^= (flags & VP8_EFLAG_NO_REF_GF) ? VP9_GOLD_FLAG : 0;
+    ref_mask ^= (flags & VP8_EFLAG_NO_REF_ARF) ? VP9_ALT_FLAG : 0;
+
+    vp9_use_as_reference(cpi, ref_mask);
+  }
+
+  // Refresh: same scheme with the NO_UPD flags. FORCE_GF / FORCE_ARF only
+  // trigger the call; they do not alter the mask themselves.
+  if (flags & upd_mask) {
+    int refresh_mask = 7;
+
+    refresh_mask ^= (flags & VP8_EFLAG_NO_UPD_LAST) ? VP9_LAST_FLAG : 0;
+    refresh_mask ^= (flags & VP8_EFLAG_NO_UPD_GF) ? VP9_GOLD_FLAG : 0;
+    refresh_mask ^= (flags & VP8_EFLAG_NO_UPD_ARF) ? VP9_ALT_FLAG : 0;
+
+    vp9_update_reference(cpi, refresh_mask);
+  }
+
+  // Entropy: NO_UPD_ENTROPY calls vp9_update_entropy() with update == 0;
+  // without the flag nothing is called here.
+  if (flags & VP8_EFLAG_NO_UPD_ENTROPY) {
+    vp9_update_entropy(cpi, 0);
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_encoder.h b/libs/libvpx/vp9/encoder/vp9_encoder.h
new file mode 100644
index 0000000000..c486ac2589
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_encoder.h
@@ -0,0 +1,673 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_ENCODER_H_
+#define VP9_ENCODER_VP9_ENCODER_H_
+
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8cx.h"
+#if CONFIG_INTERNAL_STATS
+#include "vpx_dsp/ssim.h"
+#endif
+#include "vpx_dsp/variance.h"
+#include "vpx_util/vpx_thread.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_ppflags.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_thread_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_lookahead.h"
+#include "vp9/encoder/vp9_mbgraph.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_noise_estimate.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_speed_features.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vp9/encoder/vp9_tokenize.h"
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#include "vp9/encoder/vp9_denoiser.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  int nmvjointcost[MV_JOINTS];
+  int nmvcosts[2][MV_VALS];
+  int nmvcosts_hp[2][MV_VALS];
+
+  vpx_prob segment_pred_probs[PREDICTION_PROBS];
+
+  unsigned char *last_frame_seg_map_copy;
+
+  // 0 = Intra, Last, GF, ARF
+  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
+  // 0 = ZERO_MV, MV
+  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+
+  FRAME_CONTEXT fc;
+} CODING_CONTEXT;  // VP9_COMP embeds one instance as coding_context
+
+
+typedef enum {
+  // encode_breakout is disabled.
+  ENCODE_BREAKOUT_DISABLED = 0,
+  // encode_breakout is enabled.
+  ENCODE_BREAKOUT_ENABLED = 1,
+  // encode_breakout is enabled with small max_thresh limit.
+  ENCODE_BREAKOUT_LIMITED = 2
+} ENCODE_BREAKOUT_TYPE;  // type of VP9_COMP::allow_encode_breakout
+
+typedef enum {
+  NORMAL      = 0,  // presumably no scaling -- confirm in Scale2Ratio()
+  FOURFIVE    = 1,  // presumably 4/5 -- confirm in Scale2Ratio()
+  THREEFIVE   = 2,  // presumably 3/5 -- confirm in Scale2Ratio()
+  ONETWO      = 3   // presumably 1/2 -- confirm; highest accepted mode
+} VPX_SCALING;
+
+typedef enum {
+  // Good Quality Fast Encoding. The encoder balances quality with the amount of
+  // time it takes to encode the output. Speed setting controls how fast.
+  GOOD,
+
+  // The encoder places priority on the quality of the output over encoding
+  // speed. The output is compressed at the highest possible quality. This
+  // option takes the longest amount of time to encode. Speed setting ignored.
+  BEST,
+
+  // Realtime/Live Encoding. This mode is optimized for realtime encoding (for
+  // example, capturing a television signal or feed from a live camera). Speed
+  // setting controls how fast.
+  REALTIME
+} MODE;  // type of VP9EncoderConfig::mode
+
+typedef enum {
+  FRAMEFLAGS_KEY    = 1 << 0,  // reported when the source forces a key frame
+  FRAMEFLAGS_GOLDEN = 1 << 1,
+  FRAMEFLAGS_ALTREF = 1 << 2   // no trailing comma: keeps C89/C++98 happy
+} FRAMETYPE_FLAGS;
+
+typedef enum {
+  NO_AQ = 0,
+  VARIANCE_AQ = 1,
+  COMPLEXITY_AQ = 2,
+  CYCLIC_REFRESH_AQ = 3,
+  EQUATOR360_AQ = 4,
+  AQ_MODE_COUNT  // This should always be the last member of the enum
+} AQ_MODE;  // type of VP9EncoderConfig::aq_mode
+
+typedef enum {
+  RESIZE_NONE = 0,    // No frame resizing allowed (except for SVC).
+  RESIZE_FIXED = 1,   // All frames are coded at the specified dimension.
+  RESIZE_DYNAMIC = 2  // Coded size of each frame is determined by the codec.
+} RESIZE_TYPE;  // type of VP9EncoderConfig::resize_mode
+
+typedef struct VP9EncoderConfig {
+  BITSTREAM_PROFILE profile;
+  vpx_bit_depth_t bit_depth;     // Codec bit-depth.
+  int width;  // width of data passed to the compressor
+  int height;  // height of data passed to the compressor
+  unsigned int input_bit_depth;  // Input bit depth.
+  double init_framerate;  // set to passed in framerate
+  int64_t target_bandwidth;  // bandwidth to be used in kilobits per second
+
+  int noise_sensitivity;  // pre processing blur: recommendation 0
+  int sharpness;  // sharpening output: recommendation 0
+  int speed;
+  // maximum allowed bitrate for any intra frame in % of bitrate target.
+  unsigned int rc_max_intra_bitrate_pct;
+  // maximum allowed bitrate for any inter frame in % of bitrate target.
+  unsigned int rc_max_inter_bitrate_pct;
+  // percent of rate boost for golden frame in CBR mode.
+  unsigned int gf_cbr_boost_pct;
+
+  MODE mode;
+  int pass;  // compared against 0, 1 and 2 in vp9_get_compressed_data()
+
+  // Key Framing Operations
+  int auto_key;  // autodetect cut scenes and set the keyframes
+  int key_freq;  // maximum distance to key frame.
+
+  int lag_in_frames;  // how many frames lag before we start encoding
+
+  // ----------------------------------------------------------------
+  // DATARATE CONTROL OPTIONS
+
+  // vbr, cbr, constrained quality or constant quality
+  enum vpx_rc_mode rc_mode;
+
+  // buffer targeting aggressiveness
+  int under_shoot_pct;
+  int over_shoot_pct;
+
+  // buffering parameters
+  int64_t starting_buffer_level_ms;
+  int64_t optimal_buffer_level_ms;
+  int64_t maximum_buffer_size_ms;
+
+  // Frame drop threshold.
+  int drop_frames_water_mark;
+
+  // controlling quality
+  int fixed_q;
+  int worst_allowed_q;
+  int best_allowed_q;  // both q bounds == 0 requests lossless coding
+  int cq_level;
+  AQ_MODE aq_mode;  // Adaptive Quantization mode
+
+  // Internal frame size scaling.
+  RESIZE_TYPE resize_mode;
+  int scaled_frame_width;
+  int scaled_frame_height;
+
+  // Enable feature to reduce the frame quantization every x frames.
+  int frame_periodic_boost;
+
+  // two pass datarate control
+  int two_pass_vbrbias;        // two pass datarate control tweaks
+  int two_pass_vbrmin_section;
+  int two_pass_vbrmax_section;
+  // END DATARATE CONTROL OPTIONS
+  // ----------------------------------------------------------------
+
+  // Spatial and temporal scalability.
+  int ss_number_layers;  // Number of spatial layers.
+  int ts_number_layers;  // Number of temporal layers.
+  // Bitrate allocation for spatial layers.
+  int layer_target_bitrate[VPX_MAX_LAYERS];
+  int ss_target_bitrate[VPX_SS_MAX_LAYERS];
+  int ss_enable_auto_arf[VPX_SS_MAX_LAYERS];
+  // Bitrate allocation (CBR mode) and framerate factor, for temporal layers.
+  int ts_rate_decimator[VPX_TS_MAX_LAYERS];
+
+  int enable_auto_arf;
+
+  int encode_breakout;  // early breakout : for video conf recommend 800
+
+  /* Bitfield defining the error resiliency features to enable.
+   * Can provide decodable frames after losses in previous
+   * frames and decodable partitions after losses in the same frame.
+   */
+  unsigned int error_resilient_mode;
+
+  /* Bitfield defining the parallel decoding mode where the
+   * decoding in successive frames may be conducted in parallel
+   * just by decoding the frame headers.
+   */
+  unsigned int frame_parallel_decoding_mode;
+
+  int arnr_max_frames;  // ARF temporal filtering needs both arnr_* > 0
+  int arnr_strength;
+
+  int min_gf_interval;
+  int max_gf_interval;
+
+  int tile_columns;
+  int tile_rows;
+
+  int max_threads;
+
+  vpx_fixed_buf_t two_pass_stats_in;
+  struct vpx_codec_pkt_list *output_pkt_list;
+
+#if CONFIG_FP_MB_STATS
+  vpx_fixed_buf_t firstpass_mb_stats_in;
+#endif
+
+  vp8e_tuning tuning;
+  vp9e_tune_content content;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int use_highbitdepth;
+#endif
+  vpx_color_space_t color_space;
+  vpx_color_range_t color_range;
+  int render_width;
+  int render_height;
+  VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
+} VP9EncoderConfig;
+
+static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
+  return !(cfg->best_allowed_q || cfg->worst_allowed_q);  // both bounds 0
+}
+
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {
+  TileInfo tile_info;
+  int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+  int mode_map[BLOCK_SIZES][MAX_MODES];
+} TileDataEnc;  // VP9_COMP::tile_data points at these
+
+typedef struct RD_COUNTS {
+  vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+  int64_t comp_pred_diff[REFERENCE_MODES];
+  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  int m_search_count;
+  int ex_search_count;
+} RD_COUNTS;  // each ThreadData holds one as rd_counts
+
+typedef struct ThreadData {
+  MACROBLOCK mb;
+  RD_COUNTS rd_counts;
+  FRAME_COUNTS *counts;
+
+  PICK_MODE_CONTEXT *leaf_tree;
+  PC_TREE *pc_tree;
+  PC_TREE *pc_root;
+} ThreadData;  // VP9_COMP embeds one instance as td
+
+struct EncWorkerData;
+
+typedef struct ActiveMap {
+  int enabled;
+  int update;
+  unsigned char *map;
+} ActiveMap;  // VP9_COMP embeds one instance as active_map
+
+typedef enum {
+  Y,
+  U,
+  V,
+  ALL  // last enumerator; ImageStat::stat is sized ALL + 1
+} STAT_TYPE;
+
+typedef struct IMAGE_STAT {
+  double stat[ALL+1];  // one slot per STAT_TYPE value: Y, U, V, ALL
+  double worst;
+} ImageStat;
+
+typedef struct VP9_COMP {
+  QUANTS quants;
+  ThreadData td;
+  MB_MODE_INFO_EXT *mbmi_ext_base;
+  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
+  VP9_COMMON common;
+  VP9EncoderConfig oxcf;
+  struct lookahead_ctx    *lookahead;
+  struct lookahead_entry  *alt_ref_source;
+
+  YV12_BUFFER_CONFIG *Source;
+  YV12_BUFFER_CONFIG *Last_Source;  // NULL for first frame and alt_ref frames
+  YV12_BUFFER_CONFIG *un_scaled_source;
+  YV12_BUFFER_CONFIG scaled_source;
+  YV12_BUFFER_CONFIG *unscaled_last_source;
+  YV12_BUFFER_CONFIG scaled_last_source;
+
+  TileDataEnc *tile_data;
+  int allocated_tiles;  // Keep track of memory allocated for tiles.
+
+  // For a still frame, this flag is set to 1 to skip partition search.
+  int partition_search_skippable_frame;
+
+  int scaled_ref_idx[MAX_REF_FRAMES];
+  int lst_fb_idx;
+  int gld_fb_idx;
+  int alt_fb_idx;
+
+  int refresh_last_frame;
+  int refresh_golden_frame;
+  int refresh_alt_ref_frame;
+
+  int ext_refresh_frame_flags_pending;
+  int ext_refresh_last_frame;
+  int ext_refresh_golden_frame;
+  int ext_refresh_alt_ref_frame;
+
+  int ext_refresh_frame_context_pending;
+  int ext_refresh_frame_context;
+
+  YV12_BUFFER_CONFIG last_frame_uf;
+
+  TOKENEXTRA *tile_tok[4][1 << 6];
+  unsigned int tok_count[4][1 << 6];
+
+  // Ambient reconstruction err target for force key frames
+  int64_t ambient_err;
+
+  RD_OPT rd;
+
+  CODING_CONTEXT coding_context;
+
+  int *nmvcosts[2];
+  int *nmvcosts_hp[2];
+  int *nmvsadcosts[2];
+  int *nmvsadcosts_hp[2];
+
+  int64_t last_time_stamp_seen;
+  int64_t last_end_time_stamp_seen;
+  int64_t first_time_stamp_ever;
+
+  RATE_CONTROL rc;
+  double framerate;
+
+  int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE];
+
+  struct vpx_codec_pkt_list  *output_pkt_list;
+
+  MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
+  int mbgraph_n_frames;             // number of frames filled in the above
+  int static_mb_pct;                // % forced skip mbs by segmentation
+  int ref_frame_flags;
+
+  SPEED_FEATURES sf;
+
+  unsigned int max_mv_magnitude;
+  int mv_step_param;
+
+  int allow_comp_inter_inter;
+
+  // Default value is 1. From first pass stats, encode_breakout may be disabled.
+  ENCODE_BREAKOUT_TYPE allow_encode_breakout;
+
+  // Get threshold from external input. A suggested threshold is 800 for HD
+  // clips, and 300 for < HD clips.
+  int encode_breakout;
+
+  unsigned char *segmentation_map;
+
+  // Per-segment threshold for encode breakout (one entry per segment).
+  int  segment_encode_breakout[MAX_SEGMENTS];
+
+  CYCLIC_REFRESH *cyclic_refresh;
+  ActiveMap active_map;
+
+  fractional_mv_step_fp *find_fractional_mv_step;
+  vp9_full_search_fn_t full_search_sad;
+  vp9_diamond_search_fn_t diamond_search_sad;
+  vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
+  uint64_t time_receive_data;
+  uint64_t time_compress_data;  // sum of vpx_usec_timer_elapsed() per call
+  uint64_t time_pick_lpf;
+  uint64_t time_encode_sb_row;
+
+#if CONFIG_FP_MB_STATS
+  int use_fp_mb_stats;
+#endif
+
+  TWO_PASS twopass;
+
+  YV12_BUFFER_CONFIG alt_ref_buffer;
+
+
+#if CONFIG_INTERNAL_STATS
+  unsigned int mode_chosen_counts[MAX_MODES];
+
+  int    count;  // shown frames counted while oxcf.pass != 1
+  uint64_t total_sq_error;
+  uint64_t total_samples;
+  ImageStat psnr;
+
+  uint64_t totalp_sq_error;
+  uint64_t totalp_samples;
+  ImageStat psnrp;
+
+  double total_blockiness;
+  double worst_blockiness;
+
+  int    bytes;  // running sum of encoded frame sizes (oxcf.pass != 1)
+  double summed_quality;
+  double summed_weights;
+  double summedp_quality;
+  double summedp_weights;
+  unsigned int tot_recode_hits;
+  double worst_ssim;
+
+  ImageStat ssimg;
+  ImageStat fastssim;
+  ImageStat psnrhvs;
+
+  int b_calculate_ssimg;
+  int b_calculate_blockiness;
+
+  int b_calculate_consistency;
+
+  double total_inconsistency;
+  double worst_consistency;
+  Ssimv *ssim_vars;
+  Metrics metrics;
+#endif
+  int b_calculate_psnr;
+
+  int droppable;  // set to !frame_is_reference() after a non-empty encode
+
+  int initial_width;
+  int initial_height;
+  int initial_mbs;  // Number of MBs in the full-size frame; to be used to
+                    // normalize the firstpass stats. This will differ from the
+                    // number of MBs in the current frame when the frame is
+                    // scaled.
+
+  int use_svc;
+
+  SVC svc;
+
+  // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type.
+  diff *source_diff_var;
+  // The threshold used in SOURCE_VAR_BASED_PARTITION search type.
+  unsigned int source_var_thresh;
+  int frames_till_next_var_check;
+
+  int frame_flags;
+
+  search_site_config ss_cfg;
+
+  int mbmode_cost[INTRA_MODES];
+  unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
+  int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES][INTRA_MODES];
+  int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+  int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+  int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
+
+  int multi_arf_allowed;
+  int multi_arf_enabled;
+  int multi_arf_last_grp_enabled;
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  VP9_DENOISER denoiser;
+#endif
+
+  int resize_pending;
+  int resize_state;
+  int external_resize;
+  int resize_scale_num;
+  int resize_scale_den;
+  int resize_avg_qp;
+  int resize_buffer_underflow;
+  int resize_count;
+
+  int use_skin_detection;
+
+  NOISE_ESTIMATE noise_estimate;
+
+  // VAR_BASED_PARTITION thresholds
+  // 0 - threshold_64x64; 1 - threshold_32x32;
+  // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+  int64_t vbp_thresholds[4];
+  int64_t vbp_threshold_minmax;
+  int64_t vbp_threshold_sad;
+  BLOCK_SIZE vbp_bsize_min;
+
+  // Multi-threading
+  int num_workers;
+  VPxWorker *workers;
+  struct EncWorkerData *tile_thr_data;
+  VP9LfSync lf_row_sync;
+} VP9_COMP;
+
+void vp9_initialize_enc(void);
+
+struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
+                                       BufferPool *const pool);
+void vp9_remove_compressor(VP9_COMP *cpi);
+
+void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf);
+
+// Receive a frame's worth of data. The caller can assume that a copy of this
+// frame is made, not just a copy of the pointer.
+int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time_stamp);
+
+int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
+                            size_t *size, uint8_t *dest,
+                            int64_t *time_stamp, int64_t *time_end, int flush);
+
+int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
+                              vp9_ppflags_t *flags);
+
+int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags);
+
+void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags);
+
+int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                           YV12_BUFFER_CONFIG *sd);
+
+int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                          YV12_BUFFER_CONFIG *sd);
+
+int vp9_update_entropy(VP9_COMP *cpi, int update);
+
+int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int vp9_get_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int vp9_set_internal_size(VP9_COMP *cpi,
+                          VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
+
+int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
+                         unsigned int height);
+
+void vp9_set_svc(VP9_COMP *cpi, int use_svc);
+
+int vp9_get_quantizer(struct VP9_COMP *cpi);
+
+static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) {
+  // Intra-only, ARF refresh, or golden refresh of a non-alt-ref source.
+  if (frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame) return 1;
+  return cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref;
+}
+
+static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi,
+                                        MV_REFERENCE_FRAME ref_frame) {
+  // Translate a reference-frame label into the buffer-map slot recorded in
+  // cpi. LAST_FRAME and GOLDEN_FRAME have dedicated slots; every other label
+  // resolves to the alt-ref slot.
+  const int slot_for_non_last =
+      (ref_frame == GOLDEN_FRAME) ? cpi->gld_fb_idx : cpi->alt_fb_idx;
+
+  return (ref_frame == LAST_FRAME) ? cpi->lst_fb_idx : slot_for_non_last;
+}
+
+static INLINE int get_ref_frame_buf_idx(const VP9_COMP *const cpi,
+                                        int ref_frame) {
+  // An unmapped slot is reported as INVALID_IDX rather than dereferenced.
+  const int slot = get_ref_frame_map_idx(cpi, ref_frame);
+  return (slot == INVALID_IDX) ? INVALID_IDX : cpi->common.ref_frame_map[slot];
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
+    VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+  // NULL signals that ref_frame has no valid frame-buffer index.
+  const int idx = get_ref_frame_buf_idx(cpi, ref_frame);
+  if (idx == INVALID_IDX) return NULL;
+  return &cpi->common.buffer_pool->frame_bufs[idx].buf;
+}
+
+static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
+  // TODO(JBB): double check we can't exceed this token count if we have a
+  // 32x32 transform crossing a boundary at a multiple of 16.
+  // mb_rows/mb_cols count 16-pixel units. Per unit: 3 full-resolution planes
+  // at up to 1 token per pixel, plus a head room of 4.
+  const int tokens_per_mb = 16 * 16 * 3 + 4;
+  return mb_rows * mb_cols * tokens_per_mb;
+}
+
+// Token budget for a single tile, computed the same way as the frame-level
+// allocation but from the tile's own extent.
+static INLINE int allocated_tokens(TileInfo tile) {
+  // Halve each mi extent, rounding up, before handing it to get_token_alloc.
+  const int mb_rows_in_tile = (tile.mi_row_end - tile.mi_row_start + 1) >> 1;
+  const int mb_cols_in_tile = (tile.mi_col_end - tile.mi_col_start + 1) >> 1;
+  return get_token_alloc(mb_rows_in_tile, mb_cols_in_tile);
+}
+
+int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_scale_references(VP9_COMP *cpi);
+
+void vp9_update_reference_frames(VP9_COMP *cpi);
+
+void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
+
+YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
+                                          YV12_BUFFER_CONFIG *unscaled,
+                                          YV12_BUFFER_CONFIG *scaled,
+                                          int use_normative_scaler);
+
+void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
+
+static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) {
+  return (cpi->oxcf.pass != 0) && cpi->use_svc;  // SVC with a non-zero pass.
+}
+
+static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) {
+  return cpi->oxcf.pass == 0 ? (cpi->use_svc != 0) : 0;  // SVC with pass 0.
+}
+
+static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
+  if (cpi->oxcf.mode == REALTIME || cpi->oxcf.lag_in_frames <= 0) return 0;
+  if (!cpi->oxcf.enable_auto_arf) return 0;
+  if (!is_two_pass_svc(cpi)) return 1;  // Layer flag: two-pass SVC only.
+  return cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id] != 0;
+}
+
+static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                MV_REFERENCE_FRAME ref0,
+                                MV_REFERENCE_FRAME ref1) {
+  const int idx0 = (ref0 < LAST_FRAME) ? 0 : ref0 - LAST_FRAME;
+  const int idx1 = (ref1 < LAST_FRAME) ? 0 : ref1 - LAST_FRAME;
+  xd->block_refs[0] = &cm->frame_refs[idx0];  // Labels below LAST_FRAME
+  xd->block_refs[1] = &cm->frame_refs[idx1];  // share frame_refs[0].
+}
+
+static INLINE int get_chessboard_index(const int frame_index) {
+  return frame_index & 1;  // Keep only the lowest bit of the frame index.
+}
+
+static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) {
+  return cpi->sf.mv.subpel_search_method == SUBPEL_TREE ? NULL : cost_list;
+}
+
+void vp9_new_framerate(VP9_COMP *cpi, double framerate);
+
+#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_ENCODER_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_ethread.c b/libs/libvpx/vp9/encoder/vp9_ethread.c
new file mode 100644
index 0000000000..1d1926caeb
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_ethread.c
@@ -0,0 +1,203 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+  int mode, filt, tx, plane, ref, band, ctx, tok;
+
+  // Counts of all motion searches and exhaustive mesh searches.
+  td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count;
+  td->rd_counts.m_search_count += td_t->rd_counts.m_search_count;
+
+  for (filt = 0; filt < SWITCHABLE_FILTER_CONTEXTS; ++filt)
+    td->rd_counts.filter_diff[filt] += td_t->rd_counts.filter_diff[filt];
+
+  for (mode = 0; mode < REFERENCE_MODES; ++mode)
+    td->rd_counts.comp_pred_diff[mode] += td_t->rd_counts.comp_pred_diff[mode];
+
+  // Element-wise sum over the full six-dimensional coefficient count table.
+  for (tx = 0; tx < TX_SIZES; ++tx)
+    for (plane = 0; plane < PLANE_TYPES; ++plane)
+      for (ref = 0; ref < REF_TYPES; ++ref)
+        for (band = 0; band < COEF_BANDS; ++band)
+          for (ctx = 0; ctx < COEFF_CONTEXTS; ++ctx)
+            for (tok = 0; tok < ENTROPY_TOKENS; ++tok)
+              td->rd_counts.coef_counts[tx][plane][ref][band][ctx][tok] +=
+                  td_t->rd_counts.coef_counts[tx][plane][ref][band][ctx][tok];
+}
+
+static int enc_worker_hook(void *arg1, void *unused) {
+  // Worker hook: arg1 is this worker's EncWorkerData. NOTE(review): assumes
+  // VPxWorkerHook takes (void *, void *), so the cast calls a matching type.
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  VP9_COMP *const cpi = thread_data->cpi;
+  const VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  int t;
+  (void) unused;
+
+  // Tiles are dealt out round-robin: begin at this worker's start tile and
+  // step by cpi->num_workers. Always returns 0.
+  for (t = thread_data->start; t < tile_rows * tile_cols;
+      t += cpi->num_workers)
+    vp9_encode_tile(cpi, thread_data->td, t / tile_cols, t % tile_cols);
+
+  return 0;
+}
+
+static int get_max_tile_cols(VP9_COMP *cpi) {
+  // Derive the tile-column count from the configured (oxcf) width: the
+  // requested log2 column count is clamped to the [min, max] pair that
+  // vp9_get_tile_n_bits() fills in for that width expressed in mi units.
+  const int width_aligned = ALIGN_POWER_OF_TWO(cpi->oxcf.width, MI_SIZE_LOG2);
+  const int mi_cols = width_aligned >> MI_SIZE_LOG2;
+  int lo_log2, hi_log2;
+
+  vp9_get_tile_n_bits(mi_cols, &lo_log2, &hi_log2);
+  return 1 << clamp(cpi->oxcf.tile_columns, lo_log2, hi_log2);
+}
+
+void vp9_encode_tiles_mt(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
+  int i;
+  // Encode the frame's tiles using up to num_workers workers.
+  vp9_init_tile_data(cpi);
+
+  // Only run once to create threads and allocate thread data.
+  if (cpi->num_workers == 0) {
+    int allocated_workers = num_workers;
+
+    // While using SVC, we need to allocate threads according to the highest
+    // resolution.
+    if (cpi->use_svc) {
+      int max_tile_cols = get_max_tile_cols(cpi);
+      allocated_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols);
+    }
+
+    CHECK_MEM_ERROR(cm, cpi->workers,
+                    vpx_malloc(allocated_workers * sizeof(*cpi->workers)));
+
+    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+                    vpx_calloc(allocated_workers,
+                    sizeof(*cpi->tile_thr_data)));
+
+    for (i = 0; i < allocated_workers; i++) {
+      VPxWorker *const worker = &cpi->workers[i];
+      EncWorkerData *thread_data = &cpi->tile_thr_data[i];
+
+      ++cpi->num_workers;
+      winterface->init(worker);
+
+      if (i < allocated_workers - 1) {  // Every slot except the last one.
+        thread_data->cpi = cpi;
+
+        // Allocate thread data.
+        CHECK_MEM_ERROR(cm, thread_data->td,
+                        vpx_memalign(32, sizeof(*thread_data->td)));
+        vp9_zero(*thread_data->td);
+
+        // Set up pc_tree.
+        thread_data->td->leaf_tree = NULL;
+        thread_data->td->pc_tree = NULL;
+        vp9_setup_pc_tree(cm, thread_data->td);
+
+        // Allocate frame counters in thread data.
+        CHECK_MEM_ERROR(cm, thread_data->td->counts,
+                        vpx_calloc(1, sizeof(*thread_data->td->counts)));
+
+        // Create threads
+        if (!winterface->reset(worker))
+          vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                             "Tile encoder thread creation failed");
+      } else {
+        // Main thread acts as a worker and uses the thread data in cpi.
+        thread_data->cpi = cpi;
+        thread_data->td = &cpi->td;
+      }
+
+      winterface->sync(worker);
+    }
+  }
+  // Per-frame setup: point each worker at the hook and refresh its data.
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *thread_data;
+
+    worker->hook = (VPxWorkerHook)enc_worker_hook;
+    worker->data1 = &cpi->tile_thr_data[i];
+    worker->data2 = NULL;
+    thread_data = (EncWorkerData*)worker->data1;
+
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+      thread_data->td->rd_counts = cpi->td.rd_counts;
+    }
+    if (thread_data->td->counts != &cpi->common.counts) {
+      memcpy(thread_data->td->counts, &cpi->common.counts,
+             sizeof(cpi->common.counts));
+    }
+
+    // Handle use_nonrd_pick_mode case.
+    if (cpi->sf.use_nonrd_pick_mode) {
+      MACROBLOCK *const x = &thread_data->td->mb;
+      MACROBLOCKD *const xd = &x->e_mbd;
+      struct macroblock_plane *const p = x->plane;
+      struct macroblockd_plane *const pd = xd->plane;
+      PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none;
+      int j;
+
+      for (j = 0; j < MAX_MB_PLANE; ++j) {
+        p[j].coeff = ctx->coeff_pbuf[j][0];
+        p[j].qcoeff = ctx->qcoeff_pbuf[j][0];
+        pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0];
+        p[j].eobs = ctx->eobs_pbuf[j][0];
+      }
+    }
+  }
+
+  // Encode a frame
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+    // NOTE(review): cpi->num_workers is the allocated count, not num_workers.
+    if (i == cpi->num_workers - 1)
+      winterface->execute(worker);
+    else
+      winterface->launch(worker);
+  }
+
+  // Encoding ends.
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    winterface->sync(worker);
+  }
+
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+
+    // Accumulate counters.
+    if (i < cpi->num_workers - 1) {
+      vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0);
+      accumulate_rd_opt(&cpi->td, thread_data->td);
+    }
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_ethread.h b/libs/libvpx/vp9/encoder/vp9_ethread.h
new file mode 100644
index 0000000000..1efa4dcde2
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_ethread.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_ETHREAD_H_
+#define VP9_ENCODER_VP9_ETHREAD_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+struct ThreadData;
+
+typedef struct EncWorkerData {
+  struct VP9_COMP *cpi;  // Encoder instance this worker encodes for.
+  struct ThreadData *td;  // Thread data handed to vp9_encode_tile().
+  int start;  // Index of the first tile this worker encodes.
+} EncWorkerData;
+
+void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_ETHREAD_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_extend.c b/libs/libvpx/vp9/encoder/vp9_extend.c
new file mode 100644
index 0000000000..92585b82a4
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_extend.c
@@ -0,0 +1,201 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_extend.h"
+
+static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
+                                  uint8_t *dst, int dst_pitch,
+                                  int w, int h,
+                                  int extend_top, int extend_left,
+                                  int extend_bottom, int extend_right) {
+  // Copies the w x h plane at src into dst and pads dst on every side by
+  // replicating the nearest edge pixel.
+  //   src, src_pitch: top-left source pixel and source row stride.
+  //   dst, dst_pitch: top-left destination pixel and destination row stride;
+  //                   the padding is written outside the w x h area.
+  //   w, h:           size of the copied area in pixels.
+  //   extend_*:       padding, in pixels, on the named side.
+
+  const int row_bytes = extend_left + w + extend_right;
+  uint8_t *const first_row = dst - extend_left;
+  uint8_t *const last_row = first_row + dst_pitch * (h - 1);
+  int r;
+
+  // Pass 1: copy each image row and replicate its end pixels sideways.
+  for (r = 0; r < h; ++r) {
+    const uint8_t *const in = src + r * src_pitch;
+    uint8_t *const out = first_row + r * dst_pitch;
+
+    memset(out, in[0], extend_left);
+    memcpy(out + extend_left, in, w);
+    memset(out + extend_left + w, in[w - 1], extend_right);
+  }
+
+  // Pass 2: the first and last rows are now full width (padding included),
+  // so whole-row copies fill the top and bottom padding, corners included.
+
+  // Rows above the image, nearest first.
+  for (r = 1; r <= extend_top; ++r) {
+    memcpy(first_row - r * dst_pitch, first_row, row_bytes);
+  }
+
+  // Rows below the image, nearest first.
+  for (r = 1; r <= extend_bottom; ++r) {
+    memcpy(last_row + r * dst_pitch, last_row, row_bytes);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+                                         uint8_t *dst8, int dst_pitch,
+                                         int w, int h,
+                                         int extend_top, int extend_left,
+                                         int extend_bottom, int extend_right) {
+  // 16-bit variant of the plane copy: src8/dst8 are converted with
+  // CONVERT_TO_SHORTPTR and then handled as uint16_t samples, so w, the
+  // pitches and the horizontal extents all count samples rather than bytes.
+  // The destination is padded on every side by replicating the nearest edge
+  // sample.
+
+  uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+  const int row_samples = extend_left + w + extend_right;
+  uint16_t *const first_row = dst - extend_left;
+  uint16_t *const last_row = first_row + dst_pitch * (h - 1);
+  int r;
+
+  // Pass 1: copy each image row and replicate its end samples sideways.
+  for (r = 0; r < h; ++r) {
+    const uint16_t *const in = src + r * src_pitch;
+    uint16_t *const out = first_row + r * dst_pitch;
+
+    vpx_memset16(out, in[0], extend_left);
+    memcpy(out + extend_left, in, w * sizeof(in[0]));
+    vpx_memset16(out + extend_left + w, in[w - 1], extend_right);
+  }
+
+  // Pass 2: the first and last rows are now full width (padding included),
+  // so whole-row copies fill the top and bottom padding, corners included.
+
+  // Rows above the image, nearest first.
+  for (r = 1; r <= extend_top; ++r) {
+    memcpy(first_row - r * dst_pitch, first_row,
+           row_samples * sizeof(first_row[0]));
+  }
+
+  // Rows below the image, nearest first.
+  for (r = 1; r <= extend_bottom; ++r) {
+    memcpy(last_row + r * dst_pitch, last_row,
+           row_samples * sizeof(last_row[0]));
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst) {
+  // Extend src frame in buffer
+  // Altref filtering assumes 16 pixel extension
+  const int et_y = 16;
+  const int el_y = 16;
+  // Motion estimation may use src block variance with the block size up
+  // to 64x64, so the right and bottom need to be extended to 64 multiple
+  // or up to 16, whichever is greater.
+  const int er_y =
+      VPXMAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6)) -
+      src->y_crop_width;
+  const int eb_y =
+      VPXMAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6)) -
+      src->y_crop_height;
+  const int uv_width_subsampling = (src->uv_width != src->y_width);
+  const int uv_height_subsampling = (src->uv_height != src->y_height);
+  const int et_uv = et_y >> uv_height_subsampling;
+  const int el_uv = el_y >> uv_width_subsampling;
+  const int eb_uv = eb_y >> uv_height_subsampling;
+  const int er_uv = er_y >> uv_width_subsampling;
+  // Frames flagged YV12_FLAG_HIGHBITDEPTH use the highbd plane copy and return.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+    highbd_copy_and_extend_plane(src->y_buffer, src->y_stride,
+                                 dst->y_buffer, dst->y_stride,
+                                 src->y_crop_width, src->y_crop_height,
+                                 et_y, el_y, eb_y, er_y);
+
+    highbd_copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                                 dst->u_buffer, dst->uv_stride,
+                                 src->uv_crop_width, src->uv_crop_height,
+                                 et_uv, el_uv, eb_uv, er_uv);
+
+    highbd_copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                                 dst->v_buffer, dst->uv_stride,
+                                 src->uv_crop_width, src->uv_crop_height,
+                                 et_uv, el_uv, eb_uv, er_uv);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // Otherwise copy and extend the Y, U and V planes with the 8-bit routine.
+  copy_and_extend_plane(src->y_buffer, src->y_stride,
+                        dst->y_buffer, dst->y_stride,
+                        src->y_crop_width, src->y_crop_height,
+                        et_y, el_y, eb_y, er_y);
+
+  copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                        dst->u_buffer, dst->uv_stride,
+                        src->uv_crop_width, src->uv_crop_height,
+                        et_uv, el_uv, eb_uv, er_uv);
+
+  copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                        dst->v_buffer, dst->uv_stride,
+                        src->uv_crop_width, src->uv_crop_height,
+                        et_uv, el_uv, eb_uv, er_uv);
+}
+
+void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw) {
+  // Only extend on the sides where the rectangle touches the frame border.
+  const int et_y = srcy ? 0 : dst->border;
+  const int el_y = srcx ? 0 : dst->border;
+  const int eb_y = srcy + srch != src->y_height ? 0 :
+                      dst->border + dst->y_height - src->y_height;
+  const int er_y = srcx + srcw != src->y_width ? 0 :
+                      dst->border + dst->y_width - src->y_width;
+  const int src_y_offset = srcy * src->y_stride + srcx;
+  const int dst_y_offset = srcy * dst->y_stride + srcx;
+  // Chroma planes: luma extents, sizes and offsets halved in both directions.
+  const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
+  const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
+  const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
+  const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
+  const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+  const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+  const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
+  const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
+
+  copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
+                        dst->y_buffer + dst_y_offset, dst->y_stride,
+                        srcw, srch,
+                        et_y, el_y, eb_y, er_y);
+
+  copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
+                        dst->u_buffer + dst_uv_offset, dst->uv_stride,
+                        srcw_uv, srch_uv,
+                        et_uv, el_uv, eb_uv, er_uv);
+
+  copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
+                        dst->v_buffer + dst_uv_offset, dst->uv_stride,
+                        srcw_uv, srch_uv,
+                        et_uv, el_uv, eb_uv, er_uv);
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_extend.h b/libs/libvpx/vp9/encoder/vp9_extend.h
new file mode 100644
index 0000000000..058fe09cf9
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_extend.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_EXTEND_H_
+#define VP9_ENCODER_VP9_EXTEND_H_
+
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst);
+
+void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_EXTEND_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_firstpass.c b/libs/libvpx/vp9/encoder/vp9_firstpass.c
new file mode 100644
index 0000000000..9d3b15407b
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_firstpass.c
@@ -0,0 +1,2856 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_scale/yv12config.h"
+
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconinter.h"  // vp9_setup_dst_planes()
+#include "vp9/encoder/vp9_aq_variance.h"
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vpx_dsp/variance.h"
+
+#define OUTPUT_FPF          0
+#define ARF_STATS_OUTPUT    0
+
+#define GROUP_ADAPTIVE_MAXQ 1
+
+#define BOOST_BREAKOUT      12.5
+#define BOOST_FACTOR        12.5
+#define ERR_DIVISOR         128.0
+#define FACTOR_PT_LOW       0.70
+#define FACTOR_PT_HIGH      0.90
+#define FIRST_PASS_Q        10.0
+#define GF_MAX_BOOST        96.0
+#define INTRA_MODE_PENALTY  1024
+#define KF_MAX_BOOST        128.0
+#define MIN_ARF_GF_BOOST    240
+#define MIN_DECAY_FACTOR    0.01
+#define MIN_KF_BOOST        300
+#define NEW_MV_MODE_PENALTY 32
+#define SVC_FACTOR_PT_LOW   0.45
+#define DARK_THRESH         64
+#define DEFAULT_GRP_WEIGHT  1.0
+#define RC_FACTOR_MIN       0.75
+#define RC_FACTOR_MAX       1.75
+
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+#define NCOUNT_FRAME_II_THRESH 5.0
+// Moves x 1e-6 further from zero (zero itself becomes +1e-6).
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
+
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+
+// Resets the first pass stats read position: stats_in is simply set to the
+// given position pointer (no offset arithmetic is involved).
+static void reset_fpf_position(TWO_PASS *p,
+                               const FIRSTPASS_STATS *position) {
+  p->stats_in = position;
+}
+
+// Read frame stats at an offset from the current position. Returns NULL when
+// a non-negative offset lands at or past stats_in_end, or a negative offset
+// lands before stats_in_start.
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
+  const FIRSTPASS_STATS *const entry = p->stats_in + offset;
+
+  if (offset >= 0) return (entry >= p->stats_in_end) ? NULL : entry;
+  return (entry < p->stats_in_start) ? NULL : entry;
+}
+
+static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
+  // Hands out the next stats entry and advances the read position; returns
+  // EOF instead of 1 once the position is at or past stats_in_end.
+  if (p->stats_in >= p->stats_in_end) return EOF;
+
+  *fps = *p->stats_in++;
+  return 1;
+}
+
+static void output_stats(FIRSTPASS_STATS *stats,
+                         struct vpx_codec_pkt_list *pktlist) {
+  struct vpx_codec_cx_pkt pkt;
+  pkt.kind = VPX_CODEC_STATS_PKT;
+  pkt.data.twopass_stats.buf = stats;
+  pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+  vpx_codec_pkt_list_add(pktlist, &pkt);  // Queue the stats as a packet.
+
+// TEMP debug code: also append the stats as one text line to firstpass.stt.
+#if OUTPUT_FPF
+  {
+    FILE *const fpfile = fopen("firstpass.stt", "a");
+    if (fpfile != NULL) {  // Debug dump only; skip it if the open fails.
+      fprintf(fpfile,  // One conversion per argument, space separated.
+              "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf "
+              "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf "
+              "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf "
+              "%12.0lf %12.0lf %12.4lf %12.4lf\n",
+              stats->frame,
+              stats->weight,
+              stats->intra_error,
+              stats->coded_error,
+              stats->sr_coded_error,
+              stats->pcnt_inter,
+              stats->pcnt_motion,
+              stats->pcnt_second_ref,
+              stats->pcnt_neutral,
+              stats->intra_skip_pct,
+              stats->inactive_zone_rows, stats->inactive_zone_cols,
+              stats->MVr,
+              stats->mvr_abs,
+              stats->MVc,
+              stats->mvc_abs,
+              stats->MVrv,
+              stats->MVcv,
+              stats->mv_in_out_count,
+              stats->new_mv_count,
+              stats->count, stats->duration);
+      fclose(fpfile);
+    }
+  }
+#endif
+}
+
+#if CONFIG_FP_MB_STATS
+static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm,
+                         struct vpx_codec_pkt_list *pktlist) {
+  struct vpx_codec_cx_pkt mb_pkt;  // Packet wrapping the per-MB stats buffer.
+  mb_pkt.data.firstpass_mb_stats.sz = cm->initial_mbs * sizeof(uint8_t);
+  mb_pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
+  mb_pkt.kind = VPX_CODEC_FPMB_STATS_PKT;
+  vpx_codec_pkt_list_add(pktlist, &mb_pkt);
+}
+#endif
+
+static void zero_stats(FIRSTPASS_STATS *section) {  // Reset a stats record.
+  section->frame = 0.0;
+  section->weight = 0.0;
+  section->intra_error = 0.0;
+  section->coded_error = 0.0;
+  section->sr_coded_error = 0.0;
+  section->pcnt_inter  = 0.0;
+  section->pcnt_motion  = 0.0;
+  section->pcnt_second_ref = 0.0;
+  section->pcnt_neutral = 0.0;
+  section->intra_skip_pct = 0.0;
+  section->inactive_zone_rows = 0.0;
+  section->inactive_zone_cols = 0.0;
+  section->MVr = 0.0;
+  section->mvr_abs     = 0.0;
+  section->MVc        = 0.0;
+  section->mvc_abs     = 0.0;
+  section->MVrv       = 0.0;
+  section->MVcv       = 0.0;
+  section->mv_in_out_count  = 0.0;
+  section->new_mv_count = 0.0;
+  section->count      = 0.0;
+  section->duration   = 1.0;  // The one field that does not start at zero.
+  section->spatial_layer_id = 0;
+}
+
+static void accumulate_stats(FIRSTPASS_STATS *section,
+                             const FIRSTPASS_STATS *frame) {
+  section->frame += frame->frame;  // Add frame's values into section.
+  section->weight += frame->weight;
+  section->spatial_layer_id = frame->spatial_layer_id;  // Copied, not summed.
+  section->intra_error += frame->intra_error;
+  section->coded_error += frame->coded_error;
+  section->sr_coded_error += frame->sr_coded_error;
+  section->pcnt_inter  += frame->pcnt_inter;
+  section->pcnt_motion += frame->pcnt_motion;
+  section->pcnt_second_ref += frame->pcnt_second_ref;
+  section->pcnt_neutral += frame->pcnt_neutral;
+  section->intra_skip_pct += frame->intra_skip_pct;
+  section->inactive_zone_rows += frame->inactive_zone_rows;
+  section->inactive_zone_cols += frame->inactive_zone_cols;
+  section->MVr += frame->MVr;
+  section->mvr_abs     += frame->mvr_abs;
+  section->MVc        += frame->MVc;
+  section->mvc_abs     += frame->mvc_abs;
+  section->MVrv       += frame->MVrv;
+  section->MVcv       += frame->MVcv;
+  section->mv_in_out_count  += frame->mv_in_out_count;
+  section->new_mv_count += frame->new_mv_count;
+  section->count      += frame->count;
+  section->duration   += frame->duration;
+}
+
+static void subtract_stats(FIRSTPASS_STATS *section,
+                           const FIRSTPASS_STATS *frame) {
+  section->frame -= frame->frame;  // Remove frame's values from section.
+  section->weight -= frame->weight;
+  section->intra_error -= frame->intra_error;
+  section->coded_error -= frame->coded_error;
+  section->sr_coded_error -= frame->sr_coded_error;
+  section->pcnt_inter  -= frame->pcnt_inter;
+  section->pcnt_motion -= frame->pcnt_motion;
+  section->pcnt_second_ref -= frame->pcnt_second_ref;
+  section->pcnt_neutral -= frame->pcnt_neutral;
+  section->intra_skip_pct -= frame->intra_skip_pct;
+  section->inactive_zone_rows -= frame->inactive_zone_rows;
+  section->inactive_zone_cols -= frame->inactive_zone_cols;
+  section->MVr -= frame->MVr;
+  section->mvr_abs     -= frame->mvr_abs;
+  section->MVc        -= frame->MVc;
+  section->mvc_abs     -= frame->mvc_abs;
+  section->MVrv       -= frame->MVrv;
+  section->MVcv       -= frame->MVcv;
+  section->mv_in_out_count  -= frame->mv_in_out_count;
+  section->new_mv_count -= frame->new_mv_count;
+  section->count      -= frame->count;
+  section->duration   -= frame->duration;  // spatial_layer_id is not touched.
+}
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other 0 energy areas.
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+static double calculate_active_area(const VP9_COMP *cpi,
+                                    const FIRSTPASS_STATS *this_frame) {
+  // Start from the whole frame, then remove half of the intra-skip share
+  // and the inactive rows (counted twice) as a fraction of all MB rows.
+  const double active_fraction = 1.0 -
+    ((this_frame->intra_skip_pct / 2) +
+     ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows));
+  return fclamp(active_fraction, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
+
+// Calculate a modified Error used in distributing bits between easier and
+// harder frames.
+#define ACT_AREA_CORRECTION 0.5
+static double calculate_modified_err(const VP9_COMP *cpi,
+                                     const TWO_PASS *twopass,
+                                     const VP9EncoderConfig *oxcf,
+                                     const FIRSTPASS_STATS *this_frame) {
+  const FIRSTPASS_STATS *const total = &twopass->total_stats;
+  // NOTE(review): total->count is used as a divisor unchecked — confirm it
+  // is never zero when this runs.
+  const double av_weight = total->weight / total->count;
+  const double av_err = (total->coded_error * av_weight) / total->count;
+  const double vbr_bias = oxcf->two_pass_vbrbias / 100.0;
+  double err = av_err * pow(this_frame->coded_error * this_frame->weight /
+                            DOUBLE_DIVIDE_CHECK(av_err), vbr_bias);
+
+  // Correction for active area. Frames with a reduced active area (e.g. due
+  // to formatting bars) have a higher error per MB for the remaining active
+  // MBs. The correction assumes that coding 0.5N blocks of complexity 2X is
+  // a little easier than coding N blocks of complexity X. The result is
+  // finally clamped to the two-pass modified-error limits.
+  err *= pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+
+  return fclamp(err, twopass->modified_error_min, twopass->modified_error_max);
+}
+
+// Upper bound on the per-frame target rate: two_pass_vbrmax_section percent
+// of the average frame bandwidth, limited to [0, max_frame_bandwidth].
+static int frame_max_bits(const RATE_CONTROL *rc,
+                          const VP9EncoderConfig *oxcf) {
+  // Widen to 64 bits before multiplying, then divide.
+  const int64_t avg_bw = rc->avg_frame_bandwidth;
+  const int64_t scaled = (avg_bw * oxcf->two_pass_vbrmax_section) / 100;
+
+  if (scaled < 0) return 0;
+  if (scaled > rc->max_frame_bandwidth) return (int)rc->max_frame_bandwidth;
+  return (int)scaled;
+}
+
+void vp9_init_first_pass(VP9_COMP *cpi) {
+  zero_stats(&cpi->twopass.total_stats);  // Hand total_stats to zero_stats().
+}
+
+void vp9_end_first_pass(VP9_COMP *cpi) {
+  int layer;
+  if (!is_two_pass_svc(cpi)) {
+    output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list);
+    return;
+  }
+  // Two-pass SVC: output the accumulated totals of each spatial layer in turn.
+  for (layer = 0; layer < cpi->svc.number_spatial_layers; ++layer)
+    output_stats(&cpi->svc.layer_context[layer].twopass.total_stats,
+                 cpi->output_pkt_list);
+}
+
+static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+  // Map the block sizes handled explicitly to their MSE kernels. Any size
+  // not listed here is served by the 16x16 kernel.
+  if (bsize == BLOCK_8X8)
+    return vpx_mse8x8;
+  if (bsize == BLOCK_16X8)
+    return vpx_mse16x8;
+  if (bsize == BLOCK_8X16)
+    return vpx_mse8x16;
+
+  return vpx_mse16x16;
+}
+
+static unsigned int get_prediction_error(BLOCK_SIZE bsize,
+                                         const struct buf_2d *src,
+                                         const struct buf_2d *ref) {
+  unsigned int err;  // Filled in by the kernel; its return value is unused.
+  get_block_variance_fn(bsize)(src->buf, src->stride, ref->buf, ref->stride,
+                               &err);
+  return err;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+                                                      int bd) {
+  // Bit depths 10 and 12 have dedicated MSE kernels; every other value of
+  // bd is served by the vpx_highbd_8_* kernels. Within each depth, block
+  // sizes without an explicit case use the 16x16 kernel.
+  if (bd == 10) {
+    switch (bsize) {
+      case BLOCK_8X8:
+        return vpx_highbd_10_mse8x8;
+      case BLOCK_16X8:
+        return vpx_highbd_10_mse16x8;
+      case BLOCK_8X16:
+        return vpx_highbd_10_mse8x16;
+      default:
+        return vpx_highbd_10_mse16x16;
+    }
+  }
+  if (bd == 12) {
+    switch (bsize) {
+      case BLOCK_8X8:
+        return vpx_highbd_12_mse8x8;
+      case BLOCK_16X8:
+        return vpx_highbd_12_mse16x8;
+      case BLOCK_8X16:
+        return vpx_highbd_12_mse8x16;
+      default:
+        return vpx_highbd_12_mse16x16;
+    }
+  }
+  // All remaining bit depths.
+  switch (bsize) {
+    case BLOCK_8X8:
+      return vpx_highbd_8_mse8x8;
+    case BLOCK_16X8:
+      return vpx_highbd_8_mse16x8;
+    case BLOCK_8X16:
+      return vpx_highbd_8_mse8x16;
+    default:
+      return vpx_highbd_8_mse16x16;
+  }
+}
+
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+                                                const struct buf_2d *src,
+                                                const struct buf_2d *ref,
+                                                int bd) {
+  unsigned int err;  // Filled in by the kernel; its return value is unused.
+  highbd_get_block_variance_fn(bsize, bd)(src->buf, src->stride, ref->buf,
+                                          ref->stride, &err);
+  return err;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Derive the first pass search range adjustment from the smaller frame
+// dimension: the number of doublings it needs to reach MAX_FULL_PEL_VAL.
+static int get_search_range(const VP9_COMP *cpi) {
+  const int min_dim = VPXMIN(cpi->initial_width, cpi->initial_height);
+  int shift;
+
+  for (shift = 0; (min_dim << shift) < MAX_FULL_PEL_VAL; ++shift) {
+  }
+  return shift;
+}
+
+static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                     const MV *ref_mv, MV *best_mv,
+                                     int *best_motion_err) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const int mv_penalty = NEW_MV_MODE_PENALTY;
+  // Offset the starting step by the value get_search_range() returns and
+  // reduce the number of further steps by the same amount.
+  const int sr = get_search_range(cpi);
+  const int first_step = 3 + sr;
+  const int extra_steps = (MAX_MVSEARCH_STEPS - 1) - 3 - sr;
+
+  // Reference mv with both components shifted right by 3 bits.
+  MV full_pel_ref = {ref_mv->row >> 3, ref_mv->col >> 3};
+  MV cand_mv = {0, 0};
+  vp9_variance_fn_ptr_t fn_ptr = cpi->fn_ptr[bsize];
+  int skips, err, n;
+
+  // Override the default variance function to use MSE.
+  fn_ptr.vf = get_block_variance_fn(bsize);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Initial step/diamond search centred on the reference mv.
+  err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &full_pel_ref, &cand_mv,
+                                first_step, x->sadperbit16, &skips, &fn_ptr,
+                                ref_mv);
+  // A result below INT_MAX is re-scored with vp9_get_mvpred_var(); the new
+  // mv penalty is then added unless the sum would exceed INT_MAX.
+  if (err < INT_MAX)
+    err = vp9_get_mvpred_var(x, &cand_mv, ref_mv, &fn_ptr, 1);
+  if (err < INT_MAX - mv_penalty)
+    err += mv_penalty;
+
+  if (err < *best_motion_err) {
+    *best_motion_err = err;
+    *best_mv = cand_mv;
+  }
+
+  // Further step/diamond searches. The count returned by the first search
+  // is used as the starting step index; counts returned by later searches
+  // cause that many of the following steps to be skipped.
+  for (n = skips, skips = 0; n < extra_steps;) {
+    ++n;
+    if (skips != 0) {
+      --skips;
+      continue;
+    }
+    err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &full_pel_ref, &cand_mv,
+                                  first_step + n, x->sadperbit16, &skips,
+                                  &fn_ptr, ref_mv);
+    if (err < INT_MAX)
+      err = vp9_get_mvpred_var(x, &cand_mv, ref_mv, &fn_ptr, 1);
+    if (err < INT_MAX - mv_penalty)
+      err += mv_penalty;
+
+    if (err < *best_motion_err) {
+      *best_motion_err = err;
+      *best_mv = cand_mv;
+    }
+  }
+}
+
+static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) {
+  // When mi index 2 * mb + 1 falls outside the frame in a direction, the
+  // 8-pixel variant is chosen for that direction instead of the 16-pixel one.
+  const int full_width = (2 * mb_col + 1) < cm->mi_cols;
+  const int full_height = (2 * mb_row + 1) < cm->mi_rows;
+
+  if (full_width) return full_height ? BLOCK_16X16 : BLOCK_16X8;
+  return full_height ? BLOCK_8X16 : BLOCK_8X8;
+}
+
+static int find_fp_qindex(vpx_bit_depth_t bit_depth) {
+  // Return the lowest qindex whose converted q value reaches FIRST_PASS_Q,
+  // or the last valid qindex (QINDEX_RANGE - 1) if none does.
+  int qindex = 0;
+
+  while (qindex < QINDEX_RANGE &&
+         !(vp9_convert_qindex_to_q(qindex, bit_depth) >= FIRST_PASS_Q)) {
+    ++qindex;
+  }
+
+  return qindex == QINDEX_RANGE ? QINDEX_RANGE - 1 : qindex;
+}
+
+static void set_first_pass_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  // A key frame is used for frame 0 or when FRAMEFLAGS_KEY is set, but
+  // never while refresh_alt_ref_frame is set.
+  const int key_wanted =
+      cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY);
+  const int is_key = !cpi->refresh_alt_ref_frame && key_wanted;
+
+  cm->frame_type = is_key ? KEY_FRAME : INTER_FRAME;
+  // Do not use periodic key frames.
+  cpi->rc.frames_to_key = INT_MAX;
+}
+
+#define UL_INTRA_THRESH 50
+#define INVALID_ROW -1
+// Run the first pass over one source frame: score each 16x16 MB with intra
+// and inter prediction, then output and accumulate the frame's statistics.
+void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
+  int i, mb_row, mb_col;
+  MACROBLOCK *const x = &cpi->td.mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TileInfo tile;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
+
+  int recon_yoffset, recon_uvoffset;
+  int64_t intra_error = 0;
+  int64_t coded_error = 0;
+  int64_t sr_coded_error = 0;
+  int sum_mvr = 0, sum_mvc = 0;
+  int sum_mvr_abs = 0, sum_mvc_abs = 0;
+  int64_t sum_mvrs = 0, sum_mvcs = 0;
+  int mvcount = 0;
+  int intercount = 0;
+  int second_ref_count = 0;
+  const int intrapenalty = INTRA_MODE_PENALTY;
+  double neutral_count;
+  int intra_skip_count = 0;
+  int image_data_start_row = INVALID_ROW;
+  int new_mv_count = 0;
+  int sum_in_vectors = 0;
+  MV lastmv = {0, 0};
+  TWO_PASS *twopass = &cpi->twopass;
+  const MV zero_mv = {0, 0};
+  int recon_y_stride, recon_uv_stride, uv_mb_height;
+
+  YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+  YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+  YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
+  const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+
+  LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
+        &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL;
+  double intra_factor;
+  double brightness_factor;
+  BufferPool *const pool = cm->buffer_pool;
+
+  // First pass code requires valid last and new frame buffers.
+  assert(new_yv12 != NULL);
+  assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL));
+
+#if CONFIG_FP_MB_STATS
+  if (cpi->use_fp_mb_stats) {  // NOTE(review): stats below use cpi->initial_mbs.
+    vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
+  }
+#endif
+
+  vpx_clear_system_state();
+
+  intra_factor = 0.0;
+  brightness_factor = 0.0;
+  neutral_count = 0.0;
+
+  set_first_pass_params(cpi);
+  vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
+
+  if (lc != NULL) {
+    twopass = &lc->twopass;
+
+    cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
+    cpi->ref_frame_flags = VP9_LAST_FLAG;
+
+    if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
+        REF_FRAMES) {
+      cpi->gld_fb_idx =
+          cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
+      cpi->ref_frame_flags |= VP9_GOLD_FLAG;
+      cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
+    } else {
+      cpi->refresh_golden_frame = 0;
+    }
+
+    if (lc->current_video_frame_in_layer == 0)
+      cpi->ref_frame_flags = 0;
+
+    vp9_scale_references(cpi);
+
+    // Prefer the scaled last frame for motion search, else the unscaled one.
+    if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+      first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
+      if (first_ref_buf == NULL)
+        first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME);
+    }
+
+    if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+      gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+      if (gld_yv12 == NULL) {
+        gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+      }
+    } else {
+      gld_yv12 = NULL;
+    }
+
+    set_ref_ptrs(cm, xd,
+                 (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME: NONE,
+                 (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
+
+    cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+                                        &cpi->scaled_source, 0);
+  }
+
+  vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+
+  vp9_setup_src_planes(x, cpi->Source, 0, 0);
+  vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
+
+  if (!frame_is_intra_only(cm)) {
+    vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
+  }
+
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
+
+  vp9_frame_init_quantizer(cpi);
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][1];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+    p[i].eobs = ctx->eobs_pbuf[i][1];
+  }
+  x->skip_recode = 0;
+
+  vp9_init_mv_probs(cm);
+  vp9_initialize_rd_consts(cpi);
+
+  // Tiling is ignored in the first pass.
+  vp9_tile_init(&tile, cm, 0, 0);
+
+  recon_y_stride = new_yv12->y_stride;
+  recon_uv_stride = new_yv12->uv_stride;
+  uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
+
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+    MV best_ref_mv = {0, 0};
+
+    // Offsets of this MB row's first MB within the reconstruction buffers.
+    recon_yoffset = (mb_row * recon_y_stride * 16);
+    recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
+
+    // Set up limit values for motion vectors to prevent them extending
+    // outside the UMV borders.
+    x->mv_row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
+    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
+                    + BORDER_MV_PIXELS_B16;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+      int this_error;
+      const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+      const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+      double log_intra;
+      int level_sample;
+
+#if CONFIG_FP_MB_STATS
+      const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
+
+      vpx_clear_system_state();
+
+      xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
+      xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
+      xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
+      xd->mi[0]->sb_type = bsize;
+      xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+      set_mi_row_col(xd, &tile,
+                     mb_row << 1, num_8x8_blocks_high_lookup[bsize],
+                     mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
+                     cm->mi_rows, cm->mi_cols);
+
+      // Do intra 16x16 prediction.
+      x->skip_encode = 0;
+      xd->mi[0]->mode = DC_PRED;
+      xd->mi[0]->tx_size = use_dc_pred ?
+         (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
+      vp9_encode_intra_block_plane(x, bsize, 0);
+      this_error = vpx_get_mb_ss(x->plane[0].src_diff);
+
+      // Keep a record of blocks that have almost no intra error residual
+      // (i.e. are in effect completely flat and untextured in the intra
+      // domain). In natural videos this is uncommon, but it is much more
+      // common in animations, graphics and screen content, so may be used
+      // as a signal to detect these types of content.
+      if (this_error < UL_INTRA_THRESH) {
+        ++intra_skip_count;
+      } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
+        image_data_start_row = mb_row;
+      }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        switch (cm->bit_depth) {
+          case VPX_BITS_8:
+            break;
+          case VPX_BITS_10:
+            this_error >>= 4;
+            break;
+          case VPX_BITS_12:
+            this_error >>= 8;
+            break;
+          default:
+            assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+                        "VPX_BITS_10 or VPX_BITS_12");
+            return;
+        }
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      vpx_clear_system_state();
+      log_intra = log(this_error + 1.0);
+      if (log_intra < 10.0)
+        intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+      else
+        intra_factor += 1.0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth)
+        level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+      else
+        level_sample = x->plane[0].src.buf[0];
+#else
+      level_sample = x->plane[0].src.buf[0];
+#endif
+      if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
+        brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
+      else
+        brightness_factor += 1.0;
+
+      // Intrapenalty below deals with situations where the intra and inter
+      // error scores are very low (e.g. a plain black frame).
+      // We do not have special cases in first pass for 0,0 and nearest etc so
+      // all inter modes carry an overhead cost estimate for the mv.
+      // When the error score is very low this causes us to pick all or lots of
+      // INTRA modes and throw lots of key frames.
+      // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+      this_error += intrapenalty;
+
+      // Accumulate the intra error.
+      intra_error += (int64_t)this_error;
+
+#if CONFIG_FP_MB_STATS
+      if (cpi->use_fp_mb_stats) {
+        // initialization
+        cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+      }
+#endif
+
+      // Set up limit values for motion vectors to prevent them extending
+      // outside the UMV borders.
+      x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
+      x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
+
+      // Other than for the first frame do a motion search.
+      if ((lc == NULL && cm->current_video_frame > 0) ||
+          (lc != NULL && lc->current_video_frame_in_layer > 0)) {
+        int tmp_err, motion_error, raw_motion_error;
+        // Assume 0,0 motion with no mv overhead.
+        MV mv = {0, 0} , tmp_mv = {0, 0};
+        struct buf_2d unscaled_last_source_buf_2d;
+
+        xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          motion_error = highbd_get_prediction_error(
+              bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+        } else {
+          motion_error = get_prediction_error(
+              bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+        }
+#else
+        motion_error = get_prediction_error(
+            bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+        // Compute the motion error of the 0,0 motion using the last source
+        // frame as the reference. Skip the further motion search on
+        // reconstructed frame if this error is small.
+        unscaled_last_source_buf_2d.buf =
+            cpi->unscaled_last_source->y_buffer + recon_yoffset;
+        unscaled_last_source_buf_2d.stride =
+            cpi->unscaled_last_source->y_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          raw_motion_error = highbd_get_prediction_error(
+              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+        } else {
+          raw_motion_error = get_prediction_error(
+              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);
+        }
+#else
+        raw_motion_error = get_prediction_error(
+            bsize, &x->plane[0].src, &unscaled_last_source_buf_2d);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+        // TODO(pengchong): Replace the hard-coded threshold
+        if (raw_motion_error > 25 || lc != NULL) {
+          // Test last reference frame using the previous best mv as the
+          // starting point (best reference) for the search.
+          first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
+
+          // If the current best reference mv is not centered on 0,0 then do a
+          // 0,0 based search as well.
+          if (!is_zero_mv(&best_ref_mv)) {
+            tmp_err = INT_MAX;
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
+
+            if (tmp_err < motion_error) {
+              motion_error = tmp_err;
+              mv = tmp_mv;
+            }
+          }
+
+          // Search in an older reference frame.
+          if (((lc == NULL && cm->current_video_frame > 1) ||
+               (lc != NULL && lc->current_video_frame_in_layer > 1))
+              && gld_yv12 != NULL) {
+            // Assume 0,0 motion with no mv overhead.
+            int gf_motion_error;
+
+            xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+#if CONFIG_VP9_HIGHBITDEPTH
+            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+              gf_motion_error = highbd_get_prediction_error(
+                  bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+            } else {
+              gf_motion_error = get_prediction_error(
+                  bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+            }
+#else
+            gf_motion_error = get_prediction_error(
+                bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
+                                     &gf_motion_error);
+
+            if (gf_motion_error < motion_error && gf_motion_error < this_error)
+              ++second_ref_count;
+
+            // Reset to last frame as reference buffer.
+            xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+            xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+            xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+            // In accumulating a score for the older reference frame take the
+            // best of the motion predicted score and the intra coded error
+            // (just as will be done for the accumulation of "coded_error"
+            // for the last frame).
+            if (gf_motion_error < this_error)
+              sr_coded_error += gf_motion_error;
+            else
+              sr_coded_error += this_error;
+          } else {
+            sr_coded_error += motion_error;
+          }
+        } else {
+          sr_coded_error += motion_error;
+        }
+
+        // Start by assuming that intra mode is best.
+        best_ref_mv.row = 0;
+        best_ref_mv.col = 0;
+
+#if CONFIG_FP_MB_STATS
+        if (cpi->use_fp_mb_stats) {
+          // intra prediction statistics
+          cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+          if (this_error > FPMB_ERROR_LARGE_TH) {
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
+          } else if (this_error < FPMB_ERROR_SMALL_TH) {
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
+          }
+        }
+#endif
+
+        if (motion_error <= this_error) {
+          vpx_clear_system_state();
+
+          // Keep a count of cases where the inter and intra were very close
+          // and very low. This helps with scene cut detection for example in
+          // cropped clips with black bars at the sides or top and bottom.
+          if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+              (this_error < (2 * intrapenalty))) {
+            neutral_count += 1.0;
+          // Also track cases where the intra is not much worse than the inter
+          // and use this in limiting the GF/arf group length.
+          } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+                     (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+            neutral_count += (double)motion_error /
+                             DOUBLE_DIVIDE_CHECK((double)this_error);
+          }
+
+          mv.row *= 8;
+          mv.col *= 8;
+          this_error = motion_error;
+          xd->mi[0]->mode = NEWMV;
+          xd->mi[0]->mv[0].as_mv = mv;
+          xd->mi[0]->tx_size = TX_4X4;
+          xd->mi[0]->ref_frame[0] = LAST_FRAME;
+          xd->mi[0]->ref_frame[1] = NONE;
+          vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
+          vp9_encode_sby_pass1(x, bsize);
+          sum_mvr += mv.row;
+          sum_mvr_abs += abs(mv.row);
+          sum_mvc += mv.col;
+          sum_mvc_abs += abs(mv.col);
+          sum_mvrs += mv.row * mv.row;
+          sum_mvcs += mv.col * mv.col;
+          ++intercount;
+
+          best_ref_mv = mv;
+
+#if CONFIG_FP_MB_STATS
+          if (cpi->use_fp_mb_stats) {
+            // inter prediction statistics
+            cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+            cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+            if (this_error > FPMB_ERROR_LARGE_TH) {
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                  FPMB_ERROR_LARGE_MASK;
+            } else if (this_error < FPMB_ERROR_SMALL_TH) {
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                  FPMB_ERROR_SMALL_MASK;
+            }
+          }
+#endif
+
+          if (!is_zero_mv(&mv)) {
+            ++mvcount;
+
+#if CONFIG_FP_MB_STATS
+            if (cpi->use_fp_mb_stats) {
+              cpi->twopass.frame_mb_stats_buf[mb_index] &=
+                  ~FPMB_MOTION_ZERO_MASK;
+              // check estimated motion direction (mv is a plain MV here)
+              if (mv.col > 0 && mv.col >= abs(mv.row)) {
+                // right direction
+                cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                    FPMB_MOTION_RIGHT_MASK;
+              } else if (mv.row < 0 &&
+                         abs(mv.row) >= abs(mv.col)) {
+                // up direction
+                cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                    FPMB_MOTION_UP_MASK;
+              } else if (mv.col < 0 &&
+                         abs(mv.col) >= abs(mv.row)) {
+                // left direction
+                cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                    FPMB_MOTION_LEFT_MASK;
+              } else {
+                // down direction
+                cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                    FPMB_MOTION_DOWN_MASK;
+              }
+            }
+#endif
+
+            // Non-zero vector, was it different from the last non zero vector?
+            if (!is_equal_mv(&mv, &lastmv))
+              ++new_mv_count;
+            lastmv = mv;
+
+            // Does the row vector point inwards or outwards?
+            if (mb_row < cm->mb_rows / 2) {
+              if (mv.row > 0)
+                --sum_in_vectors;
+              else if (mv.row < 0)
+                ++sum_in_vectors;
+            } else if (mb_row > cm->mb_rows / 2) {
+              if (mv.row > 0)
+                ++sum_in_vectors;
+              else if (mv.row < 0)
+                --sum_in_vectors;
+            }
+
+            // Does the col vector point inwards or outwards?
+            if (mb_col < cm->mb_cols / 2) {
+              if (mv.col > 0)
+                --sum_in_vectors;
+              else if (mv.col < 0)
+                ++sum_in_vectors;
+            } else if (mb_col > cm->mb_cols / 2) {
+              if (mv.col > 0)
+                ++sum_in_vectors;
+              else if (mv.col < 0)
+                --sum_in_vectors;
+            }
+          }
+        }
+      } else {
+        sr_coded_error += (int64_t)this_error;
+      }
+      coded_error += (int64_t)this_error;
+
+      // Adjust to the next column of MBs.
+      x->plane[0].src.buf += 16;
+      x->plane[1].src.buf += uv_mb_height;
+      x->plane[2].src.buf += uv_mb_height;
+
+      recon_yoffset += 16;
+      recon_uvoffset += uv_mb_height;
+    }
+
+    // Adjust to the next row of MBs.
+    x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
+    x->plane[1].src.buf += uv_mb_height * x->plane[1].src.stride -
+                           uv_mb_height * cm->mb_cols;
+    x->plane[2].src.buf += uv_mb_height * x->plane[2].src.stride -
+                           uv_mb_height * cm->mb_cols;
+
+    vpx_clear_system_state();
+  }
+
+  // Clamp the image start to rows/2. This number of rows is discarded top
+  // and bottom as dead data so rows / 2 means the frame is blank.
+  if ((image_data_start_row > cm->mb_rows / 2) ||
+      (image_data_start_row == INVALID_ROW)) {
+    image_data_start_row = cm->mb_rows / 2;
+  }
+  // Exclude any image dead zone
+  if (image_data_start_row > 0) {
+    intra_skip_count =
+        VPXMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
+  }
+
+  {
+    FIRSTPASS_STATS fps;
+    // The minimum error here ensures some bit allocation to frames even
+    // in static regions. The allocation per MB declines for larger formats
+    // where the typical "real" energy per MB also falls.
+    // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+    // number of mbs is proportional to the image area.
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                        ? cpi->initial_mbs : cpi->common.MBs;
+    const double min_err = 200 * sqrt(num_mbs);
+
+    intra_factor = intra_factor / (double)num_mbs;
+    brightness_factor = brightness_factor / (double)num_mbs;
+    fps.weight = intra_factor * brightness_factor;
+
+    fps.frame = cm->current_video_frame;
+    fps.spatial_layer_id = cpi->svc.spatial_layer_id;
+    fps.coded_error = (double)(coded_error >> 8) + min_err;
+    fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
+    fps.intra_error = (double)(intra_error >> 8) + min_err;
+    fps.count = 1.0;
+    fps.pcnt_inter = (double)intercount / num_mbs;
+    fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
+    fps.pcnt_neutral = (double)neutral_count / num_mbs;
+    fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
+    fps.inactive_zone_rows = (double)image_data_start_row;
+    fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
+
+    if (mvcount > 0) {
+      fps.MVr = (double)sum_mvr / mvcount;
+      fps.mvr_abs = (double)sum_mvr_abs / mvcount;
+      fps.MVc = (double)sum_mvc / mvcount;
+      fps.mvc_abs = (double)sum_mvc_abs / mvcount;
+      fps.MVrv = ((double)sum_mvrs -
+                  ((double)sum_mvr * sum_mvr / mvcount)) / mvcount;
+      fps.MVcv = ((double)sum_mvcs -
+                  ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
+      fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
+      fps.new_mv_count = new_mv_count;
+      fps.pcnt_motion = (double)mvcount / num_mbs;
+    } else {
+      fps.MVr = 0.0;
+      fps.mvr_abs = 0.0;
+      fps.MVc = 0.0;
+      fps.mvc_abs = 0.0;
+      fps.MVrv = 0.0;
+      fps.MVcv = 0.0;
+      fps.mv_in_out_count = 0.0;
+      fps.new_mv_count = 0.0;
+      fps.pcnt_motion = 0.0;
+    }
+
+    // TODO(paulwilkins):  Handle the case when duration is set to 0, or
+    // something less than the full time between subsequent values of
+    // cpi->source_time_stamp.
+    fps.duration = (double)(source->ts_end - source->ts_start);
+
+    // Don't want to do output stats with a stack variable!
+    twopass->this_frame_stats = fps;
+    output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
+    accumulate_stats(&twopass->total_stats, &fps);
+
+#if CONFIG_FP_MB_STATS
+    if (cpi->use_fp_mb_stats) {
+      output_fpmb_stats(twopass->frame_mb_stats_buf, cm, cpi->output_pkt_list);
+    }
+#endif
+  }
+
+  // Copy the previous Last Frame back into the gf buffer if the
+  // prediction is good enough... but also don't allow it to lag too far.
+  if ((twopass->sr_update_lag > 3) ||
+      ((cm->current_video_frame > 0) &&
+       (twopass->this_frame_stats.pcnt_inter > 0.20) &&
+       ((twopass->this_frame_stats.intra_error /
+         DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
+    if (gld_yv12 != NULL) {
+      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+                 cm->ref_frame_map[cpi->lst_fb_idx]);
+    }
+    twopass->sr_update_lag = 1;
+  } else {
+    ++twopass->sr_update_lag;
+  }
+
+  vpx_extend_frame_borders(new_yv12);
+
+  if (lc != NULL) {
+    vp9_update_reference_frames(cpi);
+  } else {
+    // The frame we just compressed now becomes the last frame.
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+               cm->new_fb_idx);
+  }
+
+  // Special case for the first frame. Copy into the GF buffer as a second
+  // reference.
+  if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX &&
+      lc == NULL) {
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+               cm->ref_frame_map[cpi->lst_fb_idx]);
+  }
+
+  // Use this to see what the first pass reconstruction looks like.
+  if (0) {
+    char filename[512];
+    FILE *recon_file;
+    snprintf(filename, sizeof(filename), "enc%04d.yuv",
+             (int)cm->current_video_frame);
+
+    if (cm->current_video_frame == 0)
+      recon_file = fopen(filename, "wb");
+    else
+      recon_file = fopen(filename, "ab");
+
+    (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
+    fclose(recon_file);
+  }
+
+  ++cm->current_video_frame;
+  if (cpi->use_svc)
+    vp9_inc_frame_in_layer(cpi);
+}
+
+static double calc_correction_factor(double err_per_mb,
+                                     double err_divisor,
+                                     double pt_low,
+                                     double pt_high,
+                                     int q,
+                                     vpx_bit_depth_t bit_depth) {
+  // Base of the correction: the per-MB error scaled by err_divisor.
+  const double base = err_per_mb / err_divisor;
+  // Exponent: converted q value * 0.01 + pt_low, capped at pt_high.
+  const double exponent =
+      VPXMIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
+
+  // An exponent below 1.0 is only expected with a non-negative base.
+  assert(!(exponent < 1.0) || base >= 0.0);
+
+  // The resulting factor is limited to the range [0.05, 5.0].
+  return fclamp(pow(base, exponent), 0.05, 5.0);
+}
+
+// Larger image formats are expected to be a little harder to code relatively
+// given the same prediction error score, which at least in part relates to
+// the increased size and hence coding cost of motion vectors. The macroblock
+// count divided by this factor is subtracted from the error divisor below.
+#define EDIV_SIZE_FACTOR 800
+
+// Estimates the maximum q index needed to code a section at a target rate.
+//
+// cpi:                      encoder instance (rate control, config, sizes).
+// section_err:              error score for the section; divided by the
+//                           active MB count to give an error per MB.
+// inactive_zone:            fraction of the frame treated as inactive;
+//                           clamped to [0.0, 1.0] before use.
+// section_target_bandwidth: target rate, clamped through
+//                           vp9_rc_clamp_pframe_target_size().
+// group_weight_factor:      extra multiplier on the rate correction factor.
+//
+// Returns rc->worst_quality when the clamped target is not positive.
+// Otherwise returns the first q index, searching up from rc->best_quality,
+// whose estimated bits per MB fit the normalized per MB target (or
+// rc->worst_quality if none does), raised to at least oxcf->cq_level when the
+// rate control mode is VPX_CQ.
+static int get_twopass_worst_quality(const VP9_COMP *cpi,
+                                     const double section_err,
+                                     double inactive_zone,
+                                     int section_target_bandwidth,
+                                     double group_weight_factor) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  // Clamp the target rate to VBR min / max limits.
+  const int target_rate =
+      vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth);
+  int q;
+
+  inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
+
+  // Nothing to spend: fall back to the highest q value allowed.
+  if (target_rate <= 0)
+    return rc->worst_quality;
+
+  {
+    const int num_mbs = (oxcf->resize_mode != RESIZE_NONE)
+                        ? cpi->initial_mbs : cpi->common.MBs;
+    const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+    const double av_err_per_mb = section_err / active_mbs;
+    const double speed_term = 1.0 + 0.04 * oxcf->speed;
+    const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
+    const double err_divisor = ERR_DIVISOR - ediv_size_correction;
+    const int target_norm_bits_per_mb = ((uint64_t)target_rate <<
+                                         BPER_MB_NORMBITS) / active_mbs;
+    // Upper spatial layers of two pass svc use their own low power point.
+    const int is_svc_upper_layer =
+        is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0;
+    const double pt_low =
+        is_svc_upper_layer ? SVC_FACTOR_PT_LOW : FACTOR_PT_LOW;
+
+    // Try and pick a max Q that will be high enough to encode the
+    // content at the given rate.
+    q = rc->best_quality;
+    while (q < rc->worst_quality) {
+      const double factor =
+          calc_correction_factor(av_err_per_mb, err_divisor, pt_low,
+                                 FACTOR_PT_HIGH, q, cpi->common.bit_depth);
+      const int bits_per_mb =
+          vp9_rc_bits_per_mb(INTER_FRAME, q,
+                             factor * speed_term * group_weight_factor,
+                             cpi->common.bit_depth);
+      if (bits_per_mb <= target_norm_bits_per_mb)
+        break;
+      ++q;
+    }
+
+    // Restriction on active max q for constrained quality mode.
+    if (oxcf->rc_mode == VPX_CQ && q < oxcf->cq_level)
+      q = oxcf->cq_level;
+  }
+
+  return q;
+}
+
+// For every rate factor level from INTER_NORMAL upwards, stores the maximum
+// q for that level: rc->worst_quality adjusted by the level's q delta, but
+// never lower than rc->best_quality.
+static void setup_rf_level_maxq(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int level = INTER_NORMAL;
+
+  while (level < RATE_FACTOR_LEVELS) {
+    const int level_max_q =
+        rc->worst_quality + vp9_frame_type_qdelta(cpi, level,
+                                                  rc->worst_quality);
+
+    rc->rf_level_maxq[level] =
+        (level_max_q > rc->best_quality) ? level_max_q : rc->best_quality;
+    ++level;
+  }
+}
+
+// Precomputes the frame width and height for each of the FRAME_SCALE_STEPS
+// scaling steps from the current frame size and frame_scale_factor[], then
+// refreshes the per rate factor level maximum q values.
+static void init_subsampling(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int scaled_width = cpi->common.width * 16;
+  const int scaled_height = cpi->common.height * 16;
+  int step;
+
+  // Note: Frames with odd-sized dimensions may result from this scaling.
+  for (step = 0; step < FRAME_SCALE_STEPS; ++step) {
+    rc->frame_width[step] = scaled_width / frame_scale_factor[step];
+    rc->frame_height[step] = scaled_height / frame_scale_factor[step];
+  }
+
+  setup_rf_level_maxq(cpi);
+}
+
+// Writes the frame dimensions stored for the currently selected frame size
+// (rc->frame_size_selector) to *scaled_frame_width and *scaled_frame_height.
+void calculate_coded_size(VP9_COMP *cpi,
+                          int *scaled_frame_width,
+                          int *scaled_frame_height) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const int selector = rc->frame_size_selector;
+
+  *scaled_frame_width = rc->frame_width[selector];
+  *scaled_frame_height = rc->frame_height[selector];
+}
+
+// Prepares the two pass state for the second pass.
+//
+// The totals are zeroed first; if there are no first pass stats
+// (stats_in_end is NULL) nothing else is done. Otherwise the totals are
+// loaded from the entry at stats_in_end, the frame rate and bit budget are
+// derived from them, the modified error total is accumulated over all stats
+// entries and the VBR / static section monitors are reset. With more than
+// one spatial or temporal layer the two pass context of the current spatial
+// layer is used instead of cpi->twopass.
+void vp9_init_second_pass(VP9_COMP *cpi) {
+  SVC *const svc = &cpi->svc;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const int layered = (svc->number_spatial_layers > 1) ||
+                      (svc->number_temporal_layers > 1);
+  TWO_PASS *twopass = &cpi->twopass;
+  FIRSTPASS_STATS *total;
+  const FIRSTPASS_STATS *s;
+  double frame_rate;
+  double avg_error;
+  double modified_error_total = 0.0;
+
+  if (layered)
+    twopass = &svc->layer_context[svc->spatial_layer_id].twopass;
+
+  zero_stats(&twopass->total_stats);
+  zero_stats(&twopass->total_left_stats);
+
+  if (!twopass->stats_in_end)
+    return;
+
+  total = &twopass->total_stats;
+  *total = *twopass->stats_in_end;
+  twopass->total_left_stats = *total;
+
+  // Each frame can have a different duration, as the frame rate in the source
+  // isn't guaranteed to be constant. The frame rate prior to the first frame
+  // encoded in the second pass is a guess. However, the sum duration is not.
+  // It is calculated based on the actual durations of all frames from the
+  // first pass.
+  frame_rate = 10000000.0 * total->count / total->duration;
+
+  if (layered) {
+    vp9_update_spatial_layer_framerate(cpi, frame_rate);
+    twopass->bits_left = (int64_t)(total->duration *
+        svc->layer_context[svc->spatial_layer_id].target_bandwidth /
+        10000000.0);
+  } else {
+    vp9_new_framerate(cpi, frame_rate);
+    twopass->bits_left = (int64_t)(total->duration * oxcf->target_bandwidth /
+                                   10000000.0);
+  }
+
+  // This variable monitors how far behind the second ref update is lagging.
+  twopass->sr_update_lag = 1;
+
+  // Scan the first pass file and calculate a modified total error based upon
+  // the bias/power function used to allocate bits. The min / max limits are
+  // set before the scan starts.
+  avg_error = total->coded_error / DOUBLE_DIVIDE_CHECK(total->count);
+  twopass->modified_error_min =
+      (avg_error * oxcf->two_pass_vbrmin_section) / 100;
+  twopass->modified_error_max =
+      (avg_error * oxcf->two_pass_vbrmax_section) / 100;
+  for (s = twopass->stats_in; s < twopass->stats_in_end; ++s)
+    modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
+  twopass->modified_error_left = modified_error_total;
+
+  // Reset the vbr bits off target counters
+  rc->vbr_bits_off_target = 0;
+  rc->vbr_bits_off_target_fast = 0;
+
+  rc->rate_error_estimate = 0;
+
+  // Static sequence monitor variables.
+  twopass->kf_zeromotion_pct = 100;
+  twopass->last_kfgroup_zeromotion_pct = 100;
+
+  if (oxcf->resize_mode != RESIZE_NONE)
+    init_subsampling(cpi);
+}
+
+// Weights and limits used by get_sr_decay_rate():
+//   SR_DIFF_PART        weight of the per MB sr / coded error difference.
+//   MOTION_AMP_PART     weight of the motion amplitude factor.
+//   INTRA_PART          weight of the modified intra percentage.
+//   DEFAULT_DECAY_LIMIT upper limit on the inter based floor of the result.
+//   LOW_SR_DIFF_TRHESH  difference at or below which no decay is applied.
+//   SR_DIFF_MAX         cap on the difference used in the decay formula.
+#define SR_DIFF_PART 0.0015
+#define MOTION_AMP_PART 0.003
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_TRHESH 0.1
+#define SR_DIFF_MAX 128.0
+
+// Computes a decay rate for a frame from its first pass stats.
+//
+// The rate starts at 1.0. When the per MB difference between sr_coded_error
+// and coded_error exceeds LOW_SR_DIFF_TRHESH it is reduced by weighted terms
+// for that difference, the motion amplitude and the modified intra
+// percentage. The result is never lower than the smaller of
+// DEFAULT_DECAY_LIMIT and the modified inter fraction.
+static double get_sr_decay_rate(const VP9_COMP *cpi,
+                                const FIRSTPASS_STATS *frame) {
+  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                      ? cpi->initial_mbs : cpi->common.MBs;
+  const double motion_amplitude_factor =
+    frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
+  const double intra_to_coded =
+    frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error);
+  double sr_diff =
+      (frame->sr_coded_error - frame->coded_error) / num_mbs;
+  double modified_pct_inter = frame->pcnt_inter;
+  double sr_decay = 1.0;
+  double decay_floor;
+
+  // Discount the neutral blocks when the intra / coded error ratio is low.
+  if (intra_to_coded < (double)NCOUNT_FRAME_II_THRESH)
+    modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
+
+  if (sr_diff > LOW_SR_DIFF_TRHESH) {
+    const double modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+    if (sr_diff > SR_DIFF_MAX)
+      sr_diff = SR_DIFF_MAX;
+    sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
+               (MOTION_AMP_PART * motion_amplitude_factor) -
+               (INTRA_PART * modified_pcnt_intra);
+  }
+
+  decay_floor = VPXMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter);
+  return VPXMAX(sr_decay, decay_floor);
+}
+
+// Returns the smaller of the frame's decay rate from get_sr_decay_rate() and
+// its zero motion fraction (pcnt_inter - pcnt_motion).
+static double get_zero_motion_factor(const VP9_COMP *cpi,
+                                     const FIRSTPASS_STATS *frame) {
+  const double sr_decay = get_sr_decay_rate(cpi, frame);
+  const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
+
+  return (sr_decay < zero_motion_pct) ? sr_decay : zero_motion_pct;
+}
+
+// Exponent applied to the zero motion fraction in
+// get_prediction_decay_rate().
+#define ZM_POWER_FACTOR 0.75
+
+// Combines the decay rate from get_sr_decay_rate() with a zero motion factor
+// (0.95 * zero motion fraction ^ ZM_POWER_FACTOR) for the given frame stats.
+// The zero motion factor closes part of the gap between the sr decay rate
+// and 1.0, and the result is never lower than the zero motion factor itself.
+static double get_prediction_decay_rate(const VP9_COMP *cpi,
+                                        const FIRSTPASS_STATS *next_frame) {
+  const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
+  const double zero_motion_pct =
+    next_frame->pcnt_inter - next_frame->pcnt_motion;
+  const double zero_motion_factor =
+    0.95 * pow(zero_motion_pct, ZM_POWER_FACTOR);
+  const double blended_rate =
+    sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor);
+
+  return VPXMAX(zero_motion_factor, blended_rate);
+}
+
+// Tests for a complex transition that is followed by a static section, for
+// example a slide show with a fade between slides. This is to help with more
+// optimal kf and gf positioning.
+//
+// frame_interval:  frames since the start of the group; must exceed
+//                  rc->min_gf_interval for a transition to be reported.
+// still_interval:  number of upcoming frames that must all look static.
+// loop_decay_rate: decay rate of the current frame (must be >= 0.999).
+// last_decay_rate: decay rate of the previous frame (must be < 0.9).
+//
+// Returns 1 when the next still_interval stats entries all exist and each
+// has a zero motion fraction of at least 0.999, otherwise 0.
+static int detect_transition_to_still(VP9_COMP *cpi,
+                                      int frame_interval, int still_interval,
+                                      double loop_decay_rate,
+                                      double last_decay_rate) {
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int still_frames = 0;
+
+  // Only a very still frame right after motion can start a still section,
+  // e.g. a static image after a fade or other transition instead of a clean
+  // scene cut.
+  if (frame_interval <= rc->min_gf_interval ||
+      !(loop_decay_rate >= 0.999) ||
+      !(last_decay_rate < 0.9))
+    return 0;
+
+  // Look ahead a few frames to see if static condition persists...
+  while (still_frames < still_interval) {
+    const FIRSTPASS_STATS *const stats = &twopass->stats_in[still_frames];
+
+    if (stats >= twopass->stats_in_end)
+      break;
+    if (stats->pcnt_inter - stats->pcnt_motion < 0.999)
+      break;
+    ++still_frames;
+  }
+
+  // Only if it does do we signal a transition to still.
+  return still_frames == still_interval;
+}
+
+// Detects a flash through the high relative pcnt_second_ref score in the
+// frame following a flash frame. The offset passed in should reflect this.
+//
+// What we are looking for is a brief break in prediction (such as a flash)
+// where subsequent frames are reasonably well predicted by an earlier
+// (pre flash) frame. The recovery after a flash is indicated by a high
+// pcnt_second_ref compared to pcnt_inter.
+//
+// Returns 0 when there are no stats at the given offset.
+static int detect_flash(const TWO_PASS *twopass, int offset) {
+  const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
+
+  if (next_frame == NULL)
+    return 0;
+
+  if (!(next_frame->pcnt_second_ref > next_frame->pcnt_inter))
+    return 0;
+
+  return next_frame->pcnt_second_ref >= 0.5;
+}
+
+// Updates the motion related elements of the GF / arf boost calculation from
+// one frame's first pass stats.
+//
+// mv_in_out:                 set to mv_in_out_count scaled by pcnt_motion.
+// mv_in_out_accumulator:     incremented by *mv_in_out.
+// abs_mv_in_out_accumulator: incremented by the magnitude of *mv_in_out.
+// mv_ratio_accumulator:      incremented, for frames with more than 5%
+//                            motion, by the motion weighted row and column
+//                            abs(mv) / mv ratios, each limited to the
+//                            matching absolute value.
+static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
+                                          double *mv_in_out,
+                                          double *mv_in_out_accumulator,
+                                          double *abs_mv_in_out_accumulator,
+                                          double *mv_ratio_accumulator) {
+  const double pct = stats->pcnt_motion;
+
+  // Accumulate Motion In/Out of frame stats.
+  *mv_in_out = stats->mv_in_out_count * pct;
+  *mv_in_out_accumulator += *mv_in_out;
+  *abs_mv_in_out_accumulator += fabs(*mv_in_out);
+
+  // Accumulate a measure of how uniform (or conversely how random) the motion
+  // field is (a ratio of abs(mv) / mv).
+  if (pct > 0.05) {
+    const double mvr_ratio =
+        fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
+    const double mvc_ratio =
+        fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
+
+    *mv_ratio_accumulator += pct * VPXMIN(mvr_ratio, stats->mvr_abs);
+    *mv_ratio_accumulator += pct * VPXMIN(mvc_ratio, stats->mvc_abs);
+  }
+}
+
+// Per MB error level used as the numerator of the inter error ratio in
+// calc_frame_boost().
+#define BASELINE_ERR_PER_MB 1000.0
+
+// Calculates the boost contribution of one frame.
+//
+// this_frame:           first pass stats of the frame.
+// this_frame_mv_in_out: net motion in / out of the frame (-1.0 to +1.0).
+// max_boost:            boost limit before the q based correction.
+//
+// Returns the boost, limited to max_boost scaled by the q correction.
+static double calc_frame_boost(VP9_COMP *cpi,
+                               const FIRSTPASS_STATS *this_frame,
+                               double this_frame_mv_in_out,
+                               double max_boost) {
+  const double lq =
+    vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
+                            cpi->common.bit_depth);
+  const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5);
+  const int total_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                        ? cpi->initial_mbs : cpi->common.MBs;
+  // Correct for any inactive region in the image
+  const int num_mbs =
+    (int)VPXMAX(1, total_mbs * calculate_active_area(cpi, this_frame));
+  // Underlying boost factor is based on inter error ratio.
+  double frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
+                       DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+
+  frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
+
+  // Increase boost for frames where new data coming into frame (e.g. zoom out).
+  // Slightly reduce boost if there is a net balance of motion out of the frame
+  // (zoom in), in the extreme case halving it.
+  frame_boost += frame_boost * ((this_frame_mv_in_out > 0.0)
+                                ? (this_frame_mv_in_out * 2.0)
+                                : (this_frame_mv_in_out / 2.0));
+
+  return VPXMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+// Accumulates the decayed frame boost over up to num_frames first pass stats
+// entries. The first entry read is at stats offset first_offset and each
+// further one is step entries away (+1 searches forwards, -1 backwards).
+// Stops early when read_frame_stats() has no entry for an offset.
+static double accumulate_arf_boost_score(VP9_COMP *cpi, int first_offset,
+                                         int step, int num_frames) {
+  TWO_PASS *const twopass = &cpi->twopass;
+  double boost_score = 0.0;
+  double decay_accumulator = 1.0;
+  double this_frame_mv_in_out = 0.0;
+  // Outputs required by accumulate_frame_motion_stats() that are not
+  // consumed by the boost calculation.
+  double mv_ratio_accumulator = 0.0;
+  double mv_in_out_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
+  int n;
+
+  for (n = 0; n < num_frames; ++n) {
+    const int frame_offset = first_offset + n * step;
+    const FIRSTPASS_STATS *const this_frame =
+        read_frame_stats(twopass, frame_offset);
+    int flash_detected;
+
+    if (this_frame == NULL)
+      break;
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(this_frame,
+                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
+                                  &abs_mv_in_out_accumulator,
+                                  &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected = detect_flash(twopass, frame_offset) ||
+                     detect_flash(twopass, frame_offset + 1);
+
+    // Accumulate the effect of prediction quality decay.
+    if (!flash_detected) {
+      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+                          ? MIN_DECAY_FACTOR : decay_accumulator;
+    }
+
+    boost_score += decay_accumulator * calc_frame_boost(cpi, this_frame,
+                                                        this_frame_mv_in_out,
+                                                        GF_MAX_BOOST);
+  }
+
+  return boost_score;
+}
+
+// Calculates the boost for an arf / gf at the given stats offset.
+//
+// offset:   stats offset of the proposed arf / next gf position.
+// f_frames: maximum number of frames to search forwards from offset.
+// b_frames: maximum number of frames to search backwards from offset - 1.
+// f_boost:  receives the forward boost score (truncated to int).
+// b_boost:  receives the backward boost score (truncated to int).
+//
+// Returns the sum of both scores, raised to at least 20 per searched frame
+// and to at least MIN_ARF_GF_BOOST.
+static int calc_arf_boost(VP9_COMP *cpi, int offset,
+                          int f_frames, int b_frames,
+                          int *f_boost, int *b_boost) {
+  const int min_boost = (b_frames + f_frames) * 20;
+  int arf_boost;
+
+  // Search forward from the proposed arf/next gf position.
+  *f_boost = (int)accumulate_arf_boost_score(cpi, offset, 1, f_frames);
+
+  // Search backward towards last gf position.
+  *b_boost = (int)accumulate_arf_boost_score(cpi, offset - 1, -1, b_frames);
+
+  arf_boost = (*f_boost + *b_boost);
+  if (arf_boost < min_boost)
+    arf_boost = min_boost;
+  arf_boost = VPXMAX(arf_boost, MIN_ARF_GF_BOOST);
+
+  return arf_boost;
+}
+
+// Calculate a section intra ratio used in setting max loop filter: the
+// summed intra error over the summed coded error, truncated to an int, for
+// at most section_length stats entries in [begin, end).
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+                                         const FIRSTPASS_STATS *end,
+                                         int section_length) {
+  const FIRSTPASS_STATS *s;
+  double intra_error_sum = 0.0;
+  double coded_error_sum = 0.0;
+  int remaining = section_length;
+
+  for (s = begin; s < end && remaining > 0; ++s, --remaining) {
+    intra_error_sum += s->intra_error;
+    coded_error_sum += s->coded_error;
+  }
+
+  return (int)(intra_error_sum / DOUBLE_DIVIDE_CHECK(coded_error_sum));
+}
+
+// Calculate the total bits to allocate in this GF/ARF group.
+//
+// The group receives the share of the remaining key frame group bits given
+// by gf_group_err over the remaining key frame group error. The result is
+// kept within [0, kf_group_bits] and within the per frame maximum times the
+// baseline gf interval.
+static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi,
+                                             double gf_group_err) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+  // Limit from the user supplied data rate variability setting.
+  const int64_t max_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
+  int64_t total_group_bits = 0;
+
+  // Calculate the bits to be allocated to the group as a whole.
+  if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+    total_group_bits = (int64_t)(twopass->kf_group_bits *
+                                 (gf_group_err / twopass->kf_group_error_left));
+  }
+
+  // Clamp odd edge cases.
+  if (total_group_bits < 0)
+    total_group_bits = 0;
+  else if (total_group_bits > twopass->kf_group_bits)
+    total_group_bits = twopass->kf_group_bits;
+
+  // Clip based on user supplied data rate variability limit.
+  if (total_group_bits > max_group_bits)
+    total_group_bits = max_group_bits;
+
+  return total_group_bits;
+}
+
+// Calculate the number of extra bits to assign to boosted frames in a group.
+//
+// The group total is split into allocation chunks, 100 per frame plus boost
+// for the boosted frame(s); the boosted share is boost chunks. Returns 0 for
+// a zero boost, a non-positive bit total or a non-positive frame count, and
+// never returns a negative value.
+static int calculate_boost_bits(int frame_count,
+                                int boost, int64_t total_group_bits) {
+  int allocation_chunks;
+  int boost_bits;
+
+  // return 0 for invalid inputs (could arise e.g. through rounding errors)
+  if (boost == 0 || total_group_bits <= 0 || frame_count <= 0)
+    return 0;
+
+  allocation_chunks = (frame_count * 100) + boost;
+
+  // Prevent overflow: scale both terms down by the same divisor.
+  if (boost > 1023) {
+    const int divisor = boost >> 10;
+
+    boost /= divisor;
+    allocation_chunks /= divisor;
+  }
+
+  // Calculate the number of extra bits for use in the boosted frame or frames.
+  boost_bits = (int)(((int64_t)boost * total_group_bits) / allocation_chunks);
+
+  return (boost_bits > 0) ? boost_bits : 0;
+}
+
+// Current limit on maximum number of active arfs in a GF/ARF group.
+#define MAX_ACTIVE_ARFS 2
+// Buffer slots reported for the first and second arf.
+#define ARF_SLOT1 2
+#define ARF_SLOT2 3
+// Indirects the choice of buffers for arfs by writing the two arf buffer
+// slots to arf_buffer_indices[0] and arf_buffer_indices[1].
+// At the moment the values are fixed but this may change as part of
+// the integration process with other codec features that swap buffers around.
+static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) {
+  unsigned char *slot = arf_buffer_indices;
+
+  *slot++ = ARF_SLOT1;
+  *slot = ARF_SLOT2;
+}
+
+// Fills in the GF_GROUP entries (update type, rate factor level, bit
+// allocation, arf source offset and arf buffer indices) for the frames of
+// the current golden frame / arf group.
+//
+// cpi:           encoder instance; reads rc, oxcf and svc state and updates
+//                twopass->gf_group and multi_arf_last_grp_enabled.
+// gf_group_bits: total bits available to the group.
+// group_error:   total modified error of the group; each frame receives the
+//                share of the bits given by its modified error over this.
+// gf_arf_bits:   bits reserved for the boosted golden / arf frame; deducted
+//                from the group total before the per frame split.
+//
+// Frame stats are read through input_stats(); the function returns early if
+// that reports EOF while stepping over the first frame of the group.
+static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
+                                   double group_error, int gf_arf_bits) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  FIRSTPASS_STATS frame_stats;
+  int i;
+  // Next gf_group slot to fill; slot 0 is handled separately below.
+  int frame_index = 1;
+  int target_frame_size;
+  int key_frame;
+  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+  int64_t total_group_bits = gf_group_bits;
+  double modified_err = 0.0;
+  double err_fraction;
+  // Bits taken from the normal frames for the second (mid group) arf.
+  int mid_boost_bits = 0;
+  int mid_frame_idx;
+  unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
+  int alt_frame_index = frame_index;
+  int has_temporal_layers = is_two_pass_svc(cpi) &&
+                            cpi->svc.number_temporal_layers > 1;
+
+  // Only encode alt reference frame in temporal base layer.
+  if (has_temporal_layers)
+    alt_frame_index = cpi->svc.number_temporal_layers;
+
+  key_frame = cpi->common.frame_type == KEY_FRAME ||
+              vp9_is_upper_layer_key_frame(cpi);
+
+  get_arf_buffer_indices(arf_buffer_indices);
+
+  // For key frames the frame target rate is already set and it
+  // is also the golden frame.
+  if (!key_frame) {
+    if (rc->source_alt_ref_active) {
+      gf_group->update_type[0] = OVERLAY_UPDATE;
+      gf_group->rf_level[0] = INTER_NORMAL;
+      gf_group->bit_allocation[0] = 0;
+    } else {
+      gf_group->update_type[0] = GF_UPDATE;
+      gf_group->rf_level[0] = GF_ARF_STD;
+      gf_group->bit_allocation[0] = gf_arf_bits;
+    }
+    gf_group->arf_update_idx[0] = arf_buffer_indices[0];
+    gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
+
+    // Step over the golden frame / overlay frame
+    if (EOF == input_stats(twopass, &frame_stats))
+      return;
+  }
+
+  // Deduct the boost bits for arf (or gf if it is not a key frame)
+  // from the group total.
+  if (rc->source_alt_ref_pending || !key_frame)
+    total_group_bits -= gf_arf_bits;
+
+  // Store the bits to spend on the ARF if there is one.
+  if (rc->source_alt_ref_pending) {
+    gf_group->update_type[alt_frame_index] = ARF_UPDATE;
+    gf_group->rf_level[alt_frame_index] = GF_ARF_STD;
+    gf_group->bit_allocation[alt_frame_index] = gf_arf_bits;
+
+    if (has_temporal_layers)
+      gf_group->arf_src_offset[alt_frame_index] =
+          (unsigned char)(rc->baseline_gf_interval -
+                          cpi->svc.number_temporal_layers);
+    else
+      gf_group->arf_src_offset[alt_frame_index] =
+          (unsigned char)(rc->baseline_gf_interval - 1);
+
+    gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0];
+    // The second buffer index is referenced only when multi arf was enabled
+    // for the last group and an alt ref is currently active.
+    gf_group->arf_ref_idx[alt_frame_index] =
+      arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
+                         rc->source_alt_ref_active];
+    // Without temporal layers the arf occupies slot 1, so move past it.
+    if (!has_temporal_layers)
+      ++frame_index;
+
+    if (cpi->multi_arf_enabled) {
+      // Set aside a slot for a level 1 arf.
+      gf_group->update_type[frame_index] = ARF_UPDATE;
+      gf_group->rf_level[frame_index] = GF_ARF_LOW;
+      gf_group->arf_src_offset[frame_index] =
+        (unsigned char)((rc->baseline_gf_interval >> 1) - 1);
+      gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1];
+      gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
+      ++frame_index;
+    }
+  }
+
+  // Define middle frame
+  mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
+
+  // Allocate bits to the other frames in the group.
+  for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
+    int arf_idx = 0;
+    if (EOF == input_stats(twopass, &frame_stats))
+      break;
+
+    // With temporal layers the arf slot lies inside the run of normal
+    // frames, so skip over it.
+    if (has_temporal_layers && frame_index == alt_frame_index) {
+      ++frame_index;
+    }
+
+    modified_err = calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
+
+    if (group_error > 0)
+      err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error);
+    else
+      err_fraction = 0.0;
+
+    target_frame_size = (int)((double)total_group_bits * err_fraction);
+
+    if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
+      // Set aside 1/16 of each frame's target for the second arf.
+      mid_boost_bits += (target_frame_size >> 4);
+      target_frame_size -= (target_frame_size >> 4);
+
+      // Frames up to the middle frame use the second arf buffer index.
+      if (frame_index <= mid_frame_idx)
+        arf_idx = 1;
+    }
+    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
+    gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
+
+    target_frame_size = clamp(target_frame_size, 0,
+                              VPXMIN(max_bits, (int)total_group_bits));
+
+    gf_group->update_type[frame_index] = LF_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
+
+    gf_group->bit_allocation[frame_index] = target_frame_size;
+    ++frame_index;
+  }
+
+  // Note:
+  // We need to configure the frame at the end of the sequence + 1 that will be
+  // the start frame for the next group. Otherwise prior to the call to
+  // vp9_rc_get_second_pass_params() the data will be undefined.
+  gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+  gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
+
+  if (rc->source_alt_ref_pending) {
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
+
+    // Final setup for second arf and its overlay.
+    if (cpi->multi_arf_enabled) {
+      // NOTE(review): slot 2 is where the level 1 arf was set aside above
+      // when there are no temporal layers; with temporal layers frame_index
+      // is not advanced past the first arf, so that slot appears to be 1 -
+      // confirm the fixed index is intended for that case.
+      gf_group->bit_allocation[2] =
+          gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits;
+      gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE;
+      gf_group->bit_allocation[mid_frame_idx] = 0;
+    }
+  } else {
+    gf_group->update_type[frame_index] = GF_UPDATE;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
+  }
+
+  // Note whether multi-arf was enabled this group for next time.
+  cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled;
+}
+
+// Analyse and define a gf/arf group.
+static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  FIRSTPASS_STATS next_frame;
+  const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+  int i;
+
+  double boost_score = 0.0;
+  double old_boost_score = 0.0;
+  double gf_group_err = 0.0;
+#if GROUP_ADAPTIVE_MAXQ
+  double gf_group_raw_error = 0.0;
+#endif
+  double gf_group_skip_pct = 0.0;
+  double gf_group_inactive_zone_rows = 0.0;
+  double gf_first_frame_err = 0.0;
+  double mod_frame_err = 0.0;
+
+  double mv_ratio_accumulator = 0.0;
+  double decay_accumulator = 1.0;
+  double zero_motion_accumulator = 1.0;
+
+  double loop_decay_rate = 1.00;
+  double last_loop_decay_rate = 1.00;
+
+  double this_frame_mv_in_out = 0.0;
+  double mv_in_out_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
+  double mv_ratio_accumulator_thresh;
+  unsigned int allow_alt_ref = is_altref_enabled(cpi);
+
+  int f_boost = 0;
+  int b_boost = 0;
+  int flash_detected;
+  int active_max_gf_interval;
+  int active_min_gf_interval;
+  int64_t gf_group_bits;
+  double gf_group_error_left;
+  int gf_arf_bits;
+  const int is_key_frame = frame_is_intra_only(cm);
+  const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
+
+  // Reset the GF group data structures unless this is a key
+  // frame in which case it will already have been done.
+  if (is_key_frame == 0) {
+    vp9_zero(twopass->gf_group);
+  }
+
+  vpx_clear_system_state();
+  vp9_zero(next_frame);
+
+  // Load stats for the current frame.
+  mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+  // Note the error of the frame at the start of the group. This will be
+  // the GF frame error if we code a normal gf.
+  gf_first_frame_err = mod_frame_err;
+
+  // If this is a key frame or the overlay from a previous arf then
+  // the error score / cost of this frame has already been accounted for.
+  if (arf_active_or_kf) {
+    gf_group_err -= gf_first_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+    gf_group_raw_error -= this_frame->coded_error;
+#endif
+    gf_group_skip_pct -= this_frame->intra_skip_pct;
+    gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
+  }
+
+  // Motion breakout threshold for loop below depends on image size.
+  mv_ratio_accumulator_thresh =
+      (cpi->initial_height + cpi->initial_width) / 4.0;
+
+  // Set a maximum and minimum interval for the GF group.
+  // If the image appears almost completely static we can extend beyond this.
+  {
+    int int_max_q =
+      (int)(vp9_convert_qindex_to_q(twopass->active_worst_quality,
+                                   cpi->common.bit_depth));
+    int int_lbq =
+      (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex,
+                                   cpi->common.bit_depth));
+    active_min_gf_interval = rc->min_gf_interval + VPXMIN(2, int_max_q / 200);
+    if (active_min_gf_interval > rc->max_gf_interval)
+      active_min_gf_interval = rc->max_gf_interval;
+
+    if (cpi->multi_arf_allowed) {
+      active_max_gf_interval = rc->max_gf_interval;
+    } else {
+      // The value chosen depends on the active Q range. At low Q we have
+      // bits to spare and are better with a smaller interval and smaller boost.
+      // At high Q when there are few bits to spare we are better with a longer
+      // interval to spread the cost of the GF.
+      active_max_gf_interval = 12 + VPXMIN(4, (int_lbq / 6));
+
+      // We have: active_min_gf_interval <= rc->max_gf_interval
+      if (active_max_gf_interval < active_min_gf_interval)
+        active_max_gf_interval = active_min_gf_interval;
+      else if (active_max_gf_interval > rc->max_gf_interval)
+        active_max_gf_interval = rc->max_gf_interval;
+    }
+  }
+
+  i = 0;
+  while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
+    ++i;
+
+    // Accumulate error score of frames in this gf group.
+    mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+    gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+    gf_group_raw_error += this_frame->coded_error;
+#endif
+    gf_group_skip_pct += this_frame->intra_skip_pct;
+    gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
+
+    if (EOF == input_stats(twopass, &next_frame))
+      break;
+
+    // Test for the case where there is a brief flash but the prediction
+    // quality back to an earlier frame is then restored.
+    flash_detected = detect_flash(twopass, 0);
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(&next_frame,
+                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
+                                  &abs_mv_in_out_accumulator,
+                                  &mv_ratio_accumulator);
+
+    // Accumulate the effect of prediction quality decay.
+    if (!flash_detected) {
+      last_loop_decay_rate = loop_decay_rate;
+      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+      decay_accumulator = decay_accumulator * loop_decay_rate;
+
+      // Monitor for static sections.
+      zero_motion_accumulator = VPXMIN(
+          zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+
+      // Break clause to detect very still sections after motion. For example,
+      // a static image after a fade or other transition.
+      if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
+                                     last_loop_decay_rate)) {
+        allow_alt_ref = 0;
+        break;
+      }
+    }
+
+    // Calculate a boost number for this frame.
+    boost_score += decay_accumulator * calc_frame_boost(cpi, &next_frame,
+                                                        this_frame_mv_in_out,
+                                                        GF_MAX_BOOST);
+
+    // Break out conditions.
+    if (
+      // Break at active_max_gf_interval unless almost totally static.
+      (i >= (active_max_gf_interval + arf_active_or_kf) &&
+            zero_motion_accumulator < 0.995) ||
+      (
+        // Don't break out with a very short interval.
+        (i >= active_min_gf_interval + arf_active_or_kf) &&
+        (!flash_detected) &&
+        ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
+         (abs_mv_in_out_accumulator > 3.0) ||
+         (mv_in_out_accumulator < -2.0) ||
+         ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
+      boost_score = old_boost_score;
+      break;
+    }
+
+    *this_frame = next_frame;
+    old_boost_score = boost_score;
+  }
+
+  twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
+
+  // Was the group length constrained by the requirement for a new KF?
+  rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+
+  // Should we use the alternate reference frame.
+  if (allow_alt_ref &&
+    (i < cpi->oxcf.lag_in_frames) &&
+    (i >= rc->min_gf_interval)) {
+    // Calculate the boost for alt ref.
+    rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
+      &b_boost);
+    rc->source_alt_ref_pending = 1;
+
+    // Test to see if multi arf is appropriate.
+    cpi->multi_arf_enabled =
+      (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
+      (zero_motion_accumulator < 0.995)) ? 1 : 0;
+  } else {
+    rc->gfu_boost = VPXMAX((int)boost_score, MIN_ARF_GF_BOOST);
+    rc->source_alt_ref_pending = 0;
+  }
+
+  // Set the interval until the next gf.
+  rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+
+  // Only encode alt reference frame in temporal base layer. So
+  // baseline_gf_interval should be multiple of a temporal layer group
+  // (typically the frame distance between two base layer frames)
+  if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
+    int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
+    int new_gf_interval = (rc->baseline_gf_interval + count) & (~count);
+    int j;
+    for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) {
+      if (EOF == input_stats(twopass, this_frame))
+        break;
+      gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+#if GROUP_ADAPTIVE_MAXQ
+      gf_group_raw_error += this_frame->coded_error;
+#endif
+      gf_group_skip_pct += this_frame->intra_skip_pct;
+      gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
+    }
+    rc->baseline_gf_interval = new_gf_interval;
+  }
+
+  rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+  // Reset the file position.
+  reset_fpf_position(twopass, start_pos);
+
+  // Calculate the bits to be allocated to the gf/arf group as a whole
+  gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+
+#if GROUP_ADAPTIVE_MAXQ
+  // Calculate an estimate of the maxq needed for the group.
+  // We are more aggressive about correcting for sections
+  // where there could be significant overshoot than for easier
+  // sections where we do not wish to risk creating an overshoot
+  // of the allocated bit budget.
+  if ((cpi->oxcf.rc_mode != VPX_Q) && (rc->baseline_gf_interval > 1)) {
+    const int vbr_group_bits_per_frame =
+      (int)(gf_group_bits / rc->baseline_gf_interval);
+    const double group_av_err = gf_group_raw_error  / rc->baseline_gf_interval;
+    const double group_av_skip_pct =
+      gf_group_skip_pct / rc->baseline_gf_interval;
+    const double group_av_inactive_zone =
+      ((gf_group_inactive_zone_rows * 2) /
+       (rc->baseline_gf_interval * (double)cm->mb_rows));
+
+    int tmp_q;
+    // rc factor is a weight factor that corrects for local rate control drift.
+    double rc_factor = 1.0;
+    if (rc->rate_error_estimate > 0) {
+      rc_factor = VPXMAX(RC_FACTOR_MIN,
+                         (double)(100 - rc->rate_error_estimate) / 100.0);
+    } else {
+      rc_factor = VPXMIN(RC_FACTOR_MAX,
+                         (double)(100 - rc->rate_error_estimate) / 100.0);
+    }
+    tmp_q =
+      get_twopass_worst_quality(cpi, group_av_err,
+                                (group_av_skip_pct + group_av_inactive_zone),
+                                vbr_group_bits_per_frame,
+                                twopass->kfgroup_inter_fraction * rc_factor);
+    twopass->active_worst_quality =
+        VPXMAX(tmp_q, twopass->active_worst_quality >> 1);
+  }
+#endif
+
+  // Calculate the extra bits to be used for boosted frame(s)
+  gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
+                                     rc->gfu_boost, gf_group_bits);
+
+  // Adjust KF group bits and error remaining.
+  twopass->kf_group_error_left -= (int64_t)gf_group_err;
+
+  // If this is an arf update we want to remove the score for the overlay
+  // frame at the end which will usually be very cheap to code.
+  // The overlay frame has already, in effect, been coded so we want to spread
+  // the remaining bits among the other frames.
+  // For normal GFs remove the score for the GF itself unless this is
+  // also a key frame in which case it has already been accounted for.
+  if (rc->source_alt_ref_pending) {
+    gf_group_error_left = gf_group_err - mod_frame_err;
+  } else if (is_key_frame == 0) {
+    gf_group_error_left = gf_group_err - gf_first_frame_err;
+  } else {
+    gf_group_error_left = gf_group_err;
+  }
+
+  // Allocate bits to each of the frames in the GF group.
+  allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits);
+
+  // Reset the file position.
+  reset_fpf_position(twopass, start_pos);
+
+  // Calculate a section intra ratio used in setting max loop filter.
+  if (cpi->common.frame_type != KEY_FRAME) {
+    twopass->section_intra_rating =
+        calculate_section_intra_ratio(start_pos, twopass->stats_in_end,
+                                      rc->baseline_gf_interval);
+  }
+
+  if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+    // Default to starting GF groups at normal frame size.
+    cpi->rc.next_frame_size_selector = UNSCALED;
+  }
+}
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+#define SECOND_REF_USEAGE_THRESH 0.1
+// Minimum fraction of intra coding observed in first pass (1.0 = 100%).
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks. Discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letter box
+// format clips with image padding.
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case, even if the frame is not a scene cut, coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0  // Upper clamp applied to next_iiratio below.
+
+static int test_candidate_kf(TWO_PASS *twopass,
+                             const FIRSTPASS_STATS *last_frame,
+                             const FIRSTPASS_STATS *this_frame,
+                             const FIRSTPASS_STATS *next_frame) {
+  int is_viable_kf = 0;
+  double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+  double modified_pcnt_inter =
+    this_frame->pcnt_inter - this_frame->pcnt_neutral;
+
+  // Returns 1 if this_frame is a viable key frame, else 0. First apply the
+  // scene-cut tests whose thresholds are defined above; only if they pass,
+  // examine how well the candidate predicts subsequent frames.
+  if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+      (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+      ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+       ((pcnt_intra > MIN_INTRA_LEVEL) &&
+        (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
+        ((this_frame->intra_error /
+          DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+          KF_II_ERR_THRESHOLD) &&
+        ((fabs(last_frame->coded_error - this_frame->coded_error) /
+          DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+          ERR_CHANGE_THRESHOLD) ||
+         (fabs(last_frame->intra_error - this_frame->intra_error) /
+          DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+          ERR_CHANGE_THRESHOLD) ||
+         ((next_frame->intra_error /
+          DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+          II_IMPROVEMENT_THRESHOLD))))) {
+    int i;
+    const FIRSTPASS_STATS *start_pos = twopass->stats_in;
+    FIRSTPASS_STATS local_next_frame = *next_frame;
+    double boost_score = 0.0;
+    double old_boost_score = 0.0;
+    double decay_accumulator = 1.0;
+
+    // Scan up to 16 frames ahead, starting from a local copy of next_frame.
+    for (i = 0; i < 16; ++i) {
+      double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
+                             DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+
+      if (next_iiratio > KF_II_MAX)
+        next_iiratio = KF_II_MAX;
+
+      // Cumulative effect of decay in prediction quality.
+      if (local_next_frame.pcnt_inter > 0.85)
+        decay_accumulator *= local_next_frame.pcnt_inter;
+      else
+        decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
+
+      // Keep a running total of the decayed intra/inter ratio.
+      boost_score += (decay_accumulator * next_iiratio);
+
+      // Test various breakout clauses.
+      if ((local_next_frame.pcnt_inter < 0.05) ||
+          (next_iiratio < 1.5) ||
+          (((local_next_frame.pcnt_inter -
+             local_next_frame.pcnt_neutral) < 0.20) &&
+           (next_iiratio < 3.0)) ||
+          ((boost_score - old_boost_score) < 3.0) ||
+          (local_next_frame.intra_error < 200)) {
+        break;
+      }
+
+      old_boost_score = boost_score;
+
+      // Read the next frame's stats; stop scanning when they run out.
+      if (EOF == input_stats(twopass, &local_next_frame))
+        break;
+    }
+
+    // Viable only when the boost score exceeds 30 and the scan got past
+    // 3 frames (i > 3); otherwise discard this candidate and move on.
+    if (boost_score > 30.0 && (i > 3)) {
+      is_viable_kf = 1;  // Read position is left where the scan stopped.
+    } else {
+      // Rewind the stats read position to where the scan started.
+      reset_fpf_position(twopass, start_pos);
+
+      is_viable_kf = 0;
+    }
+  }
+
+  return is_viable_kf;
+}
+
+#define FRAMES_TO_CHECK_DECAY 8
+
+// Find the next key frame, size its kf group and allocate the key frame bits.
+static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  int i, j;
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const FIRSTPASS_STATS first_frame = *this_frame;
+  const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+  FIRSTPASS_STATS next_frame;
+  FIRSTPASS_STATS last_frame;
+  int kf_bits = 0;
+  int loop_decay_counter = 0;
+  double decay_accumulator = 1.0;
+  double av_decay_accumulator = 0.0;
+  double zero_motion_accumulator = 1.0;
+  double boost_score = 0.0;
+  double kf_mod_err = 0.0;
+  double kf_group_err = 0.0;
+  double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+
+  vp9_zero(next_frame);
+
+  cpi->common.frame_type = KEY_FRAME;
+
+  // Reset the GF group data structures.
+  vp9_zero(*gf_group);
+
+  // Is this a forced key frame by interval.
+  rc->this_key_frame_forced = rc->next_key_frame_forced;
+
+  // Clear the alt ref active flag and last group multi arf flags as they
+  // can never be set for a key frame.
+  rc->source_alt_ref_active = 0;
+  cpi->multi_arf_last_grp_enabled = 0;
+
+  // KF is always a GF so clear frames till next gf counter.
+  rc->frames_till_gf_update_due = 0;
+
+  rc->frames_to_key = 1;
+
+  twopass->kf_group_bits = 0;        // Total bits available to kf group
+  twopass->kf_group_error_left = 0;  // Group modified error score.
+
+  kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+  // Initialize the decay rates for the recent frames to check
+  for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+    recent_loop_decay[j] = 1.0;
+
+  // Find the next keyframe.
+  i = 0;
+  while (twopass->stats_in < twopass->stats_in_end &&
+         rc->frames_to_key < cpi->oxcf.key_freq) {
+    // Accumulate kf group error.
+    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+    // Load the next frame's stats.
+    last_frame = *this_frame;
+    input_stats(twopass, this_frame);
+
+    // Provided that we are not at the end of the file...
+    if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
+      double loop_decay_rate;
+
+      // Check for a scene cut.
+      if (test_candidate_kf(twopass, &last_frame, this_frame,
+                            twopass->stats_in))
+        break;
+
+      // How fast is the prediction quality decaying?
+      loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
+
+      // We want to know something about the recent past... rather than
+      // as used elsewhere where we are concerned with decay in prediction
+      // quality since the last GF or KF.
+      recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
+      decay_accumulator = 1.0;
+      for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+        decay_accumulator *= recent_loop_decay[j];
+
+      // Special check for transition or high motion followed by a
+      // static scene.
+      if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
+                                     loop_decay_rate, decay_accumulator))
+        break;
+
+      // Step on to the next frame.
+      ++rc->frames_to_key;
+
+      // If we don't have a real key frame within the next two
+      // key_freq intervals then break out of the loop.
+      if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq)
+        break;
+    } else {
+      ++rc->frames_to_key;
+    }
+    ++i;
+  }
+
+  // If there is a max kf interval set by the user we must obey it.
+  // We already breakout of the loop above at 2x max.
+  // This code centers the extra kf if the actual natural interval
+  // is between 1x and 2x.
+  if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
+    FIRSTPASS_STATS tmp_frame = first_frame;
+
+    rc->frames_to_key /= 2;
+
+    // Reset to the start of the group.
+    reset_fpf_position(twopass, start_position);
+
+    kf_group_err = 0.0;
+
+    // Rescan to get the correct error data for the forced kf group.
+    for (i = 0; i < rc->frames_to_key; ++i) {
+      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
+      input_stats(twopass, &tmp_frame);
+    }
+    rc->next_key_frame_forced = 1;
+  } else if (twopass->stats_in == twopass->stats_in_end ||
+             rc->frames_to_key >= cpi->oxcf.key_freq) {
+    rc->next_key_frame_forced = 1;
+  } else {
+    rc->next_key_frame_forced = 0;
+  }
+
+  if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
+    int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
+    int new_frame_to_key = (rc->frames_to_key + count) & (~count);
+    for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) {
+      if (EOF == input_stats(twopass, this_frame))
+        break;
+      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+    }
+    rc->frames_to_key = new_frame_to_key;
+  }
+
+  // Special case for the last key frame of the file.
+  if (twopass->stats_in >= twopass->stats_in_end) {
+    // Accumulate kf group error.
+    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+  }
+
+  // Calculate the number of bits that should be assigned to the kf group.
+  if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+    // Maximum number of bits for a single normal frame (not key frame).
+    const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+
+    // Maximum number of bits allocated to the key frame group.
+    int64_t max_grp_bits;
+
+    // Default allocation based on bits left and relative
+    // complexity of the section.
+    twopass->kf_group_bits = (int64_t)(twopass->bits_left *
+       (kf_group_err / twopass->modified_error_left));
+
+    // Clip based on maximum per frame rate defined by the user.
+    max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
+    if (twopass->kf_group_bits > max_grp_bits)
+      twopass->kf_group_bits = max_grp_bits;
+  } else {
+    twopass->kf_group_bits = 0;
+  }
+  twopass->kf_group_bits = VPXMAX(0, twopass->kf_group_bits);
+
+  // Reset the first pass file position.
+  reset_fpf_position(twopass, start_position);
+
+  // Scan through the kf group collating various stats used to determine
+  // how many bits to spend on it.
+  decay_accumulator = 1.0;
+  boost_score = 0.0;
+  for (i = 0; i < (rc->frames_to_key - 1); ++i) {
+    if (EOF == input_stats(twopass, &next_frame))
+      break;
+
+    // Monitor for static sections.
+    zero_motion_accumulator = VPXMIN(
+        zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+
+    // Not all frames in the group are necessarily used in calculating boost.
+    if ((i <= rc->max_gf_interval) ||
+        ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
+      const double frame_boost =
+        calc_frame_boost(cpi, &next_frame, 0, KF_MAX_BOOST);
+
+      // How fast is prediction quality decaying.
+      if (!detect_flash(twopass, 0)) {
+        const double loop_decay_rate =
+          get_prediction_decay_rate(cpi, &next_frame);
+        decay_accumulator *= loop_decay_rate;
+        decay_accumulator = VPXMAX(decay_accumulator, MIN_DECAY_FACTOR);
+        av_decay_accumulator += decay_accumulator;
+        ++loop_decay_counter;
+      }
+      boost_score += (decay_accumulator * frame_boost);
+    }
+  }
+  if (loop_decay_counter > 0)  // Avoid 0/0 (NaN) when no frame was averaged.
+    av_decay_accumulator /= (double)loop_decay_counter;
+
+  reset_fpf_position(twopass, start_position);
+
+  // Store the zero motion percentage
+  twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+  // Calculate a section intra ratio used in setting max loop filter.
+  twopass->section_intra_rating =
+      calculate_section_intra_ratio(start_position, twopass->stats_in_end,
+                                    rc->frames_to_key);
+
+  // Apply the min boost clamps (these also cover a zero average decay).
+  rc->kf_boost = (int)(av_decay_accumulator * boost_score);
+  rc->kf_boost = VPXMAX(rc->kf_boost, (rc->frames_to_key * 3));
+  rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_BOOST);
+
+  // Work out how many bits to allocate for the key frame itself.
+  kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
+                                  rc->kf_boost, twopass->kf_group_bits);
+
+  // Work out the fraction of the kf group bits reserved for the inter frames
+  // within the group after discounting the bits for the kf itself.
+  if (twopass->kf_group_bits) {
+    twopass->kfgroup_inter_fraction =
+      (double)(twopass->kf_group_bits - kf_bits) /
+      (double)twopass->kf_group_bits;
+  } else {
+    twopass->kfgroup_inter_fraction = 1.0;
+  }
+
+  twopass->kf_group_bits -= kf_bits;
+
+  // Save the bits to spend on the key frame.
+  gf_group->bit_allocation[0] = kf_bits;
+  gf_group->update_type[0] = KF_UPDATE;
+  gf_group->rf_level[0] = KF_STD;
+
+  // Total kf group error minus the key frame itself (64 bit, not int).
+  twopass->kf_group_error_left = (int64_t)(kf_group_err - kf_mod_err);
+
+  // Adjust the count of total modified error left.
+  // The count of bits left is adjusted elsewhere based on real coded frame
+  // sizes.
+  twopass->modified_error_left -= kf_group_err;
+
+  if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+    // Default to normal-sized frame on keyframes.
+    cpi->rc.next_frame_size_selector = UNSCALED;
+  }
+}
+
+// Choose which reference buffers are refreshed once this frame is encoded.
+static void configure_buffer_updates(VP9_COMP *cpi) {
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+  cpi->rc.is_src_frame_alt_ref = 0;
+  switch (gf_group->update_type[gf_group->index]) {
+    case ARF_UPDATE:
+      cpi->refresh_alt_ref_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_last_frame = 0;
+      break;
+    case OVERLAY_UPDATE:
+      cpi->rc.is_src_frame_alt_ref = 1;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_last_frame = 0;
+      break;
+    case GF_UPDATE:
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_last_frame = 1;
+      break;
+    case LF_UPDATE:
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_last_frame = 1;
+      break;
+    case KF_UPDATE:
+      cpi->refresh_alt_ref_frame = 1;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_last_frame = 1;
+      break;
+    default:
+      assert(0);
+      break;
+  }
+  // The remaining overrides only apply to two pass SVC encodes.
+  if (!is_two_pass_svc(cpi)) return;
+  if (cpi->svc.temporal_layer_id > 0) {
+    cpi->refresh_golden_frame = 0;
+    cpi->refresh_last_frame = 0;
+  }
+  if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0)
+    cpi->refresh_golden_frame = 0;
+  if (cpi->alt_ref_source == NULL)
+    cpi->refresh_alt_ref_frame = 0;
+}
+
+static int is_skippable_frame(const VP9_COMP *cpi) {
+  // Returns nonzero for a non intra-only frame when the stats entries at
+  // stats_in - 2, stats_in - 1 and stats_in all have pcnt_inter - pcnt_motion
+  // == 1, so the partition search can be skipped. The offset from
+  // stats_in_start is tested so no pointer is formed before the start.
+  const SVC *const svc = &cpi->svc;
+  const TWO_PASS *const twopass = is_two_pass_svc(cpi) ?
+      &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
+
+  return (!frame_is_intra_only(&cpi->common) &&
+    twopass->stats_in - twopass->stats_in_start > 2 &&
+    twopass->stats_in < twopass->stats_in_end &&
+    (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
+    == 1 &&
+    (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion
+    == 1 &&
+    twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+}
+
+void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  int frames_left;
+  FIRSTPASS_STATS this_frame;
+  // Second pass per-frame setup: picks the frame type and its bit target.
+  int target_rate;
+  LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
+        &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0;
+
+  if (lc != NULL) {
+    frames_left = (int)(twopass->total_stats.count -
+                  lc->current_video_frame_in_layer);
+  } else {
+    frames_left = (int)(twopass->total_stats.count -
+                  cm->current_video_frame);
+  }
+
+  if (!twopass->stats_in)
+    return;  // No first pass stats to read.
+
+  // If this is an arf frame then we don't want to read the stats file or
+  // advance the input pointer as we already have what we need.
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+    int target_rate;  // NOTE(review): shadows the outer target_rate.
+    configure_buffer_updates(cpi);
+    target_rate = gf_group->bit_allocation[gf_group->index];
+    target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
+    rc->base_frame_target = target_rate;
+
+    cm->frame_type = INTER_FRAME;
+
+    if (lc != NULL) {
+      if (cpi->svc.spatial_layer_id == 0) {
+        lc->is_key_frame = 0;
+      } else {
+        lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
+
+        if (lc->is_key_frame)
+          cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
+      }
+    }
+
+    // Do the firstpass stats indicate that this frame is skippable for the
+    // partition search?
+    if (cpi->sf.allow_partition_search_skip &&
+        cpi->oxcf.pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
+      cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+    }
+
+    return;
+  }
+
+  vpx_clear_system_state();
+
+  if (cpi->oxcf.rc_mode == VPX_Q) {
+    twopass->active_worst_quality = cpi->oxcf.cq_level;
+  } else if (cm->current_video_frame == 0 ||
+             (lc != NULL && lc->current_video_frame_in_layer == 0)) {
+    // First frame special case. NOTE(review): assumes frames_left != 0.
+    const int section_target_bandwidth = (int)(twopass->bits_left /
+                                               frames_left);
+    const double section_length = twopass->total_left_stats.count;
+    const double section_error =
+      twopass->total_left_stats.coded_error / section_length;
+    const double section_intra_skip =
+      twopass->total_left_stats.intra_skip_pct / section_length;
+    const double section_inactive_zone =
+      (twopass->total_left_stats.inactive_zone_rows * 2) /
+      ((double)cm->mb_rows * section_length);
+    const int tmp_q =
+      get_twopass_worst_quality(cpi, section_error,
+                                section_intra_skip + section_inactive_zone,
+                                section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
+    twopass->active_worst_quality = tmp_q;
+    twopass->baseline_active_worst_quality = tmp_q;
+    rc->ni_av_qi = tmp_q;
+    rc->last_q[INTER_FRAME] = tmp_q;
+    rc->avg_q = vp9_convert_qindex_to_q(tmp_q, cm->bit_depth);
+    rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+    rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
+  }
+  vp9_zero(this_frame);
+  if (EOF == input_stats(twopass, &this_frame))
+    return;
+
+  // Set the frame content type flag.
+  if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
+    twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
+  else
+    twopass->fr_content_type = FC_NORMAL;
+
+  // Keyframe and section processing.
+  if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+    FIRSTPASS_STATS this_frame_copy;
+    this_frame_copy = this_frame;
+    // Define next KF group and assign bits to it (this_frame is restored).
+    find_next_key_frame(cpi, &this_frame);
+    this_frame = this_frame_copy;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+
+  if (lc != NULL) {
+    if (cpi->svc.spatial_layer_id == 0) {
+      lc->is_key_frame = (cm->frame_type == KEY_FRAME);
+      if (lc->is_key_frame) {
+        cpi->ref_frame_flags &=
+            (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
+        lc->frames_from_key_frame = 0;
+        // Encode an intra only empty frame since we have a key frame.
+        cpi->svc.encode_intra_empty_frame = 1;
+      }
+    } else {
+      cm->frame_type = INTER_FRAME;
+      lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
+
+      if (lc->is_key_frame) {
+        cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
+        lc->frames_from_key_frame = 0;
+      }
+    }
+  }
+
+  // Define a new GF/ARF group. (Should always enter here for key frames).
+  if (rc->frames_till_gf_update_due == 0) {
+    define_gf_group(cpi, &this_frame);
+
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    if (lc != NULL)
+      cpi->refresh_golden_frame = 1;
+
+#if ARF_STATS_OUTPUT
+    {
+      FILE *fpfile;
+      fpfile = fopen("arf.stt", "a");  // NOTE(review): result is not checked.
+      ++arf_count;
+      fprintf(fpfile, "%10d %10ld %10d %10d %10ld\n",
+              cm->current_video_frame, rc->frames_till_gf_update_due,
+              rc->kf_boost, arf_count, rc->gfu_boost);
+
+      fclose(fpfile);
+    }
+#endif
+  }
+
+  configure_buffer_updates(cpi);
+
+  // Do the firstpass stats indicate that this frame is skippable for the
+  // partition search?
+  if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 &&
+      (!cpi->use_svc || is_two_pass_svc(cpi))) {
+    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+  }
+
+  target_rate = gf_group->bit_allocation[gf_group->index];
+  rc->base_frame_target = target_rate;
+
+  {
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                        ? cpi->initial_mbs : cpi->common.MBs;
+    // The multiplication by 256 reverses a scaling factor of (>> 8)
+    // applied when combining MB error values for the frame.
+    twopass->mb_av_energy =
+      log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
+  }
+
+  // Update the total stats remaining structure.
+  subtract_stats(&twopass->total_left_stats, &this_frame);
+}
+
+#define MINQ_ADJ_LIMIT 48  // Max extend_minq when rc_mode is not VPX_CQ.
+#define MINQ_ADJ_LIMIT_CQ 20  // Max extend_minq when rc_mode is VPX_CQ.
+#define HIGH_UNDERSHOOT_RATIO 2  // Divides base_frame_target (fast thresh).
+void vp9_twopass_postencode_update(VP9_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->twopass;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int bits_used = rc->base_frame_target;
+
+  // Post-encode bookkeeping for two pass rate control. VBR correction is
+  // done through rc->vbr_bits_off_target: based on its sign, a limited %
+  // adjustment is made to the target rate of subsequent frames, to try and
+  // push it back towards 0. This is designed to prevent extreme behaviour at
+  // the end of a clip or group of frames.
+  rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+  twopass->bits_left = VPXMAX(twopass->bits_left - bits_used, 0);
+
+  // Calculate the pct rc error.
+  if (rc->total_actual_bits) {
+    rc->rate_error_estimate =
+      (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
+    rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
+  } else {
+    rc->rate_error_estimate = 0;
+  }
+
+  if (cpi->common.frame_type != KEY_FRAME &&
+      !vp9_is_upper_layer_key_frame(cpi)) {
+    twopass->kf_group_bits -= bits_used;
+    twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+  }
+  twopass->kf_group_bits = VPXMAX(twopass->kf_group_bits, 0);
+
+  // Increment the gf group index ready for the next frame.
+  ++twopass->gf_group.index;
+
+  // If the rate control is drifting consider adjustment to min or maxq.
+  if ((cpi->oxcf.rc_mode != VPX_Q) &&
+      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
+      !cpi->rc.is_src_frame_alt_ref) {
+    const int maxq_adj_limit =
+      rc->worst_quality - twopass->active_worst_quality;
+    const int minq_adj_limit =
+        (cpi->oxcf.rc_mode == VPX_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+
+    // Undershoot.
+    if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+      --twopass->extend_maxq;
+      if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+        ++twopass->extend_minq;
+    // Overshoot.
+    } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+      --twopass->extend_minq;
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        ++twopass->extend_maxq;
+    } else {
+      // Adjustment for extreme local overshoot.
+      if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+          rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+        ++twopass->extend_maxq;
+
+      // Unwind undershoot or overshoot adjustment.
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        --twopass->extend_minq;
+      else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+        --twopass->extend_maxq;
+    }
+
+    twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
+    twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+    // If there is a big and unexpected undershoot then feed the extra
+    // bits back in quickly. One situation where this may happen is if a
+    // frame is unexpectedly almost perfectly predicted by the ARF or GF
+    // but not very well predicted by the previous frame.
+    if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+      int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+      if (rc->projected_frame_size < fast_extra_thresh) {
+        rc->vbr_bits_off_target_fast +=
+          fast_extra_thresh - rc->projected_frame_size;
+        rc->vbr_bits_off_target_fast =
+          VPXMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+
+        // Fast adaptation of minQ if necessary to use up the extra bits.
+        if (rc->avg_frame_bandwidth) {
+          twopass->extend_minq_fast =
+            (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
+        }
+        twopass->extend_minq_fast = VPXMIN(
+            twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+      } else if (rc->vbr_bits_off_target_fast) {
+        twopass->extend_minq_fast = VPXMIN(
+            twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+      } else {
+        twopass->extend_minq_fast = 0;
+      }
+    }
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_firstpass.h b/libs/libvpx/vp9/encoder/vp9_firstpass.h
new file mode 100644
index 0000000000..5875a7b9b5
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_firstpass.h
@@ -0,0 +1,164 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_FIRSTPASS_H_
+#define VP9_ENCODER_VP9_FIRSTPASS_H_
+
+#include "vp9/encoder/vp9_lookahead.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_FP_MB_STATS
+
+#define FPMB_DCINTRA_MASK 0x01
+
+#define FPMB_MOTION_ZERO_MASK 0x02
+#define FPMB_MOTION_LEFT_MASK 0x04
+#define FPMB_MOTION_RIGHT_MASK 0x08
+#define FPMB_MOTION_UP_MASK 0x10
+#define FPMB_MOTION_DOWN_MASK 0x20
+
+#define FPMB_ERROR_SMALL_MASK 0x40
+#define FPMB_ERROR_LARGE_MASK 0x80
+#define FPMB_ERROR_SMALL_TH 2000
+#define FPMB_ERROR_LARGE_TH 48000
+
+typedef struct {
+  uint8_t *mb_stats_start;
+  uint8_t *mb_stats_end;
+} FIRSTPASS_MB_STATS;
+#endif
+
+#define VLOW_MOTION_THRESHOLD 950
+
+typedef struct {
+  double frame;
+  double weight;
+  double intra_error;
+  double coded_error;
+  double sr_coded_error;
+  double pcnt_inter;
+  double pcnt_motion;
+  double pcnt_second_ref;
+  double pcnt_neutral;
+  double intra_skip_pct;
+  double inactive_zone_rows;  // Image mask rows top and bottom.
+  double inactive_zone_cols;  // Image mask columns at left and right edges.
+  double MVr;
+  double mvr_abs;
+  double MVc;
+  double mvc_abs;
+  double MVrv;
+  double MVcv;
+  double mv_in_out_count;
+  double new_mv_count;
+  double duration;
+  double count;
+  int64_t spatial_layer_id;
+} FIRSTPASS_STATS;
+
+typedef enum {
+  KF_UPDATE = 0,
+  LF_UPDATE = 1,
+  GF_UPDATE = 2,
+  ARF_UPDATE = 3,
+  OVERLAY_UPDATE = 4,
+  FRAME_UPDATE_TYPES = 5
+} FRAME_UPDATE_TYPE;
+
+#define FC_ANIMATION_THRESH 0.15
+typedef enum {
+  FC_NORMAL = 0,
+  FC_GRAPHICS_ANIMATION = 1,
+  FRAME_CONTENT_TYPES = 2
+} FRAME_CONTENT_TYPE;
+
+typedef struct {
+  unsigned char index;
+  RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
+  FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
+  int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
+} GF_GROUP;
+
+typedef struct {
+  unsigned int section_intra_rating;
+  FIRSTPASS_STATS total_stats;
+  FIRSTPASS_STATS this_frame_stats;
+  const FIRSTPASS_STATS *stats_in;
+  const FIRSTPASS_STATS *stats_in_start;
+  const FIRSTPASS_STATS *stats_in_end;
+  FIRSTPASS_STATS total_left_stats;
+  int first_pass_done;
+  int64_t bits_left;
+  double modified_error_min;
+  double modified_error_max;
+  double modified_error_left;
+  double mb_av_energy;
+
+#if CONFIG_FP_MB_STATS
+  uint8_t *frame_mb_stats_buf;
+  uint8_t *this_frame_mb_stats;
+  FIRSTPASS_MB_STATS firstpass_mb_stats;
+#endif
+  // An indication of the content type of the current frame
+  FRAME_CONTENT_TYPE fr_content_type;
+
+  // Projected total bits available for a key frame group of frames
+  int64_t kf_group_bits;
+
+  // Error score of frames still to be coded in kf group
+  int64_t kf_group_error_left;
+
+  // The fraction for a kf groups total bits allocated to the inter frames
+  double kfgroup_inter_fraction;
+
+  int sr_update_lag;
+
+  int kf_zeromotion_pct;
+  int last_kfgroup_zeromotion_pct;
+  int gf_zeromotion_pct;
+  int active_worst_quality;
+  int baseline_active_worst_quality;
+  int extend_minq;
+  int extend_maxq;
+  int extend_minq_fast;
+
+  GF_GROUP gf_group;
+} TWO_PASS;
+
+struct VP9_COMP;
+
+void vp9_init_first_pass(struct VP9_COMP *cpi);
+void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi);
+void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source);
+void vp9_end_first_pass(struct VP9_COMP *cpi);
+
+void vp9_init_second_pass(struct VP9_COMP *cpi);
+void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
+
+// Post encode update of the rate control parameters for 2-pass
+// (declared once; a repeated prototype trips -Wredundant-decls).
+void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
+
+void calculate_coded_size(struct VP9_COMP *cpi,
+                          int *scaled_frame_width,
+                          int *scaled_frame_height);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_FIRSTPASS_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_lookahead.c b/libs/libvpx/vp9/encoder/vp9_lookahead.c
new file mode 100644
index 0000000000..def9b8c1db
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_lookahead.c
@@ -0,0 +1,245 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+
+#include "vp9/common/vp9_common.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_lookahead.h"
+
+/* Return the entry at *idx, then advance *idx by one (wrapping at max_sz). */
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
+                                   unsigned int *idx) {
+  unsigned int index = *idx;
+  struct lookahead_entry *buf = ctx->buf + index;
+
+  assert(index < ctx->max_sz);
+  if (++index >= ctx->max_sz)  /* ring buffer: wrap back to slot 0 */
+    index -= ctx->max_sz;
+  *idx = index;  /* ctx->sz is not touched here; callers adjust it */
+  return buf;
+}
+
+/* Release every frame buffer, the entry array and ctx; NULL is a no-op. */
+void vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
+  if (ctx) {
+    if (ctx->buf) {  /* may be NULL when called from a failed init */
+      unsigned int i;
+
+      for (i = 0; i < ctx->max_sz; i++)
+        vpx_free_frame_buffer(&ctx->buf[i].img);
+      free(ctx->buf);
+    }
+    free(ctx);
+  }
+}
+
+/* Allocate the queue and all of its frame buffers; returns NULL on failure. */
+struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
+                                         unsigned int height,
+                                         unsigned int subsampling_x,
+                                         unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                         int use_highbitdepth,
+#endif
+                                         unsigned int depth) {
+  struct lookahead_ctx *ctx = NULL;
+
+  // Clamp the lookahead queue depth to [1, MAX_LAG_BUFFERS]
+  depth = clamp(depth, 1, MAX_LAG_BUFFERS);
+
+  // Allocate memory to keep previous source frames available.
+  depth += MAX_PRE_FRAMES;
+
+  // Allocate the lookahead structures (zero-initialised by calloc)
+  ctx = calloc(1, sizeof(*ctx));
+  if (ctx) {
+    const int legacy_byte_alignment = 0;
+    unsigned int i;
+    ctx->max_sz = depth;
+    ctx->buf = calloc(depth, sizeof(*ctx->buf));
+    if (!ctx->buf)
+      goto bail;
+    for (i = 0; i < depth; i++)  // a non-zero return is treated as failure
+      if (vpx_alloc_frame_buffer(&ctx->buf[i].img,
+                                 width, height, subsampling_x, subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 use_highbitdepth,
+#endif
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 legacy_byte_alignment))
+        goto bail;
+  }
+  return ctx;  // NULL here if the first calloc failed
+ bail:  // tear down the partially built context
+  vp9_lookahead_destroy(ctx);
+  return NULL;
+}
+
+#define USE_PARTIAL_COPY 0
+/* Copy src into the next free slot; returns 0 on success, 1 on failure. */
+int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,
+                       int64_t ts_start, int64_t ts_end,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       unsigned int flags) {
+  struct lookahead_entry *buf;
+#if USE_PARTIAL_COPY
+  int row, col, active_end;
+  int mb_rows = (src->y_height + 15) >> 4;
+  int mb_cols = (src->y_width + 15) >> 4;
+#endif
+  int width = src->y_crop_width;
+  int height = src->y_crop_height;
+  int uv_width = src->uv_crop_width;
+  int uv_height = src->uv_crop_height;
+  int subsampling_x = src->subsampling_x;
+  int subsampling_y = src->subsampling_y;
+  int larger_dimensions, new_dimensions;
+
+  if (ctx->sz + 1  + MAX_PRE_FRAMES > ctx->max_sz)
+    return 1;
+  buf = ctx->buf + ctx->write_idx;  // peek only; the slot is claimed below
+
+  new_dimensions = width != buf->img.y_crop_width ||
+                   height != buf->img.y_crop_height ||
+                   uv_width != buf->img.uv_crop_width ||
+                   uv_height != buf->img.uv_crop_height;
+  larger_dimensions = width > buf->img.y_width ||
+                      height > buf->img.y_height ||
+                      uv_width > buf->img.uv_width ||
+                      uv_height > buf->img.uv_height;
+  assert(!larger_dimensions || new_dimensions);
+#if USE_PARTIAL_COPY
+  // TODO(jkoleszar): This is disabled for now, as
+  // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
+
+  // Only do this partial copy if the following conditions are all met:
+  // 1. Lookahead queue has has size of 1.
+  // 2. Active map is provided.
+  // 3. This is not a key frame, golden nor altref frame.
+  if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
+    for (row = 0; row < mb_rows; ++row) {
+      col = 0;
+
+      while (1) {
+        // Find the first active macroblock in this row.
+        for (; col < mb_cols; ++col) {
+          if (active_map[col])
+            break;
+        }
+
+        // No more active macroblock in this row.
+        if (col == mb_cols)
+          break;
+
+        // Find the end of active region in this row.
+        active_end = col;
+
+        for (; active_end < mb_cols; ++active_end) {
+          if (!active_map[active_end])
+            break;
+        }
+
+        // Only copy this active region.
+        vp9_copy_and_extend_frame_with_rect(src, &buf->img,
+                                            row << 4,
+                                            col << 4, 16,
+                                            (active_end - col) << 4);
+
+        // Start again from the end of this active region.
+        col = active_end;
+      }
+
+      active_map += mb_cols;
+    }
+  } else {
+#endif
+    if (larger_dimensions) {
+      YV12_BUFFER_CONFIG new_img;
+      memset(&new_img, 0, sizeof(new_img));
+      if (vpx_alloc_frame_buffer(&new_img,
+                                 width, height, subsampling_x, subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 use_highbitdepth,
+#endif
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 0))
+          return 1;
+      vpx_free_frame_buffer(&buf->img);
+      buf->img = new_img;
+    } else if (new_dimensions) {
+      buf->img.y_crop_width = src->y_crop_width;
+      buf->img.y_crop_height = src->y_crop_height;
+      buf->img.uv_crop_width = src->uv_crop_width;
+      buf->img.uv_crop_height = src->uv_crop_height;
+      buf->img.subsampling_x = src->subsampling_x;
+      buf->img.subsampling_y = src->subsampling_y;
+    }
+    // Partial copy not implemented yet
+    vp9_copy_and_extend_frame(src, &buf->img);
+#if USE_PARTIAL_COPY
+  }
+#endif
+  buf->ts_start = ts_start;
+  buf->ts_end = ts_end;
+  buf->flags = flags;
+  // Claim the slot last so a failed allocation leaves the queue unchanged.
+  ctx->sz++;
+  (void)pop(ctx, &ctx->write_idx);
+  return 0;
+}
+
+/* Dequeue the entry at read_idx; unless draining, only once the queue is full. */
+struct lookahead_entry *vp9_lookahead_pop(struct lookahead_ctx *ctx,
+                                          int drain) {
+  struct lookahead_entry *buf = NULL;
+
+  if (ctx && ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
+    buf = pop(ctx, &ctx->read_idx);
+    ctx->sz--;
+  }
+  return buf;  // NULL when ctx is NULL, the queue is empty, or it is not full
+}
+
+/* Return the entry `index` slots away from read_idx without dequeuing it. */
+struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx,
+                                           int index) {
+  struct lookahead_entry *buf = NULL;  // stays NULL when index is out of range
+
+  if (index >= 0) {
+    // Forward peek: only entries currently queued (index < sz) are returned
+    if (index < (int)ctx->sz) {
+      index += ctx->read_idx;
+      if (index >= (int)ctx->max_sz)
+        index -= ctx->max_sz;
+      buf = ctx->buf + index;
+    }
+  } else if (index < 0) {
+    // Backward peek: at most MAX_PRE_FRAMES slots behind read_idx
+    if (-index <= MAX_PRE_FRAMES) {
+      index += ctx->read_idx;
+      if (index < 0)
+        index += ctx->max_sz;
+      buf = ctx->buf + index;
+    }
+  }
+
+  return buf;  // NOTE: unlike vp9_lookahead_pop, ctx is not NULL-checked here
+}
+/* Number of frames currently queued (ctx->sz); ctx must not be NULL. */
+unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx) {
+  return ctx->sz;
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_lookahead.h b/libs/libvpx/vp9/encoder/vp9_lookahead.h
new file mode 100644
index 0000000000..13820380ff
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_lookahead.h
@@ -0,0 +1,124 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_LOOKAHEAD_H_
+#define VP9_ENCODER_VP9_LOOKAHEAD_H_
+
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+#if CONFIG_SPATIAL_SVC
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LAG_BUFFERS 25
+
+struct lookahead_entry {
+  YV12_BUFFER_CONFIG  img;       /* Queue-owned copy of the pushed frame */
+  int64_t             ts_start;  /* ts_start value given to push */
+  int64_t             ts_end;    /* ts_end value given to push */
+  unsigned int        flags;     /* flags value given to push */
+};
+
+// The max of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
+
+struct lookahead_ctx {
+  unsigned int max_sz;         /* Absolute size of the queue */
+  unsigned int sz;             /* Number of buffers currently in the queue */
+  unsigned int read_idx;       /* Read index */
+  unsigned int write_idx;      /* Write index */
+  struct lookahead_entry *buf; /* Buffer list */
+};
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ */
+struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
+                                         unsigned int height,
+                                         unsigned int subsampling_x,
+                                         unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                         int use_highbitdepth,
+#endif
+                                         unsigned int depth);
+
+
+/**\brief Destroys the lookahead stage
+ */
+void vp9_lookahead_destroy(struct lookahead_ctx *ctx);
+
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * The whole frame is always copied: the active-map partial copy is compiled
+ * out (USE_PARTIAL_COPY is 0) and this function takes no active_map argument.
+ *
+ * \param[in] ctx         Pointer to the lookahead context
+ * \param[in] src         Pointer to the image to enqueue
+ * \param[in] ts_start    Timestamp for the start of this frame
+ * \param[in] ts_end      Timestamp for the end of this frame
+ * \param[in] flags       Flags set on this frame
+ * \return 0 on success, 1 if the queue is full or a buffer cannot be allocated
+ */
+int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+                       int64_t ts_start, int64_t ts_end,
+#if CONFIG_VP9_HIGHBITDEPTH
+                       int use_highbitdepth,
+#endif
+                       unsigned int flags);
+
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] drain     Flag indicating the buffer should be drained
+ *                      (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ */
+struct lookahead_entry *vp9_lookahead_pop(struct lookahead_ctx *ctx,
+                                          int drain);
+
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] index     Index of the frame to be returned, 0 == next frame
+ *                      (negative: up to MAX_PRE_FRAMES slots before the next)
+ * \retval NULL, if no buffer exists at the specified index
+ */
+struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx,
+                                           int index);
+
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ */
+unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_LOOKAHEAD_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_mbgraph.c b/libs/libvpx/vp9/encoder/vp9_mbgraph.c
new file mode 100644
index 0000000000..7ce86b45dc
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -0,0 +1,419 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/system_state.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+
+// HEX full-pel + sub-pel 16x16 search around ref_mv; returns prediction SAD.
+static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
+                                              const MV *ref_mv,
+                                              MV *dst_mv,
+                                              int mb_row,
+                                              int mb_col) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  const SEARCH_METHODS old_search_method = mv_sf->search_method;
+  const vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+
+  const int tmp_col_min = x->mv_col_min;  // MV limits saved; restored below
+  const int tmp_col_max = x->mv_col_max;
+  const int tmp_row_min = x->mv_row_min;
+  const int tmp_row_max = x->mv_row_max;
+  MV ref_full;
+  int cost_list[5];
+
+  // Further step/diamond searches as necessary
+  int step_param = mv_sf->reduce_first_step_size;
+  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+  vp9_set_mv_search_range(x, ref_mv);
+
+  ref_full.col = ref_mv->col >> 3;
+  ref_full.row = ref_mv->row >> 3;
+
+  mv_sf->search_method = HEX;  // forced for this search only
+  vp9_full_pixel_search(cpi, x, BLOCK_16X16, &ref_full, step_param,
+                        x->errorperbit, cond_cost_list(cpi, cost_list), ref_mv,
+                        dst_mv, 0, 0);
+  mv_sf->search_method = old_search_method;
+
+  // Try sub-pixel MC
+  // if (bestsme > error_thresh && bestsme < INT_MAX)
+  {
+    int distortion;  // filled by the call below but not read afterwards
+    unsigned int sse;  // likewise
+    cpi->find_fractional_mv_step(
+        x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
+        &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
+        cond_cost_list(cpi, cost_list),
+        NULL, NULL,
+        &distortion, &sse, NULL, 0, 0);
+  }
+
+  xd->mi[0]->mode = NEWMV;
+  xd->mi[0]->mv[0].as_mv = *dst_mv;
+
+  vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
+
+  /* restore UMV window */
+  x->mv_col_min = tmp_col_min;
+  x->mv_col_max = tmp_col_max;
+  x->mv_row_min = tmp_row_min;
+  x->mv_row_max = tmp_row_max;
+
+  return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                      xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+}
+// Best of: zero MV, a search seeded at ref_mv, and one at (0,0). Returns SAD.
+static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
+                                  int_mv *dst_mv, int mb_row, int mb_col) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  unsigned int err, tmp_err;
+  MV tmp_mv;
+
+  // Try zero MV first
+  // FIXME should really use something like near/nearest MV and/or MV prediction
+  err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+  dst_mv->as_int = 0;
+
+  // Test last reference frame using the previous best mv as the
+  // starting point (best reference) for the search
+  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col);
+  if (tmp_err < err) {
+    err = tmp_err;
+    dst_mv->as_mv = tmp_mv;
+  }
+
+  // If the current best reference mv is not centered on 0,0 then do a 0,0
+  // based search as well.
+  if (ref_mv->row != 0 || ref_mv->col != 0) {
+    unsigned int tmp_err;  // NOTE: shadows the outer tmp_err
+    MV zero_ref_mv = {0, 0}, tmp_mv;  // NOTE: tmp_mv shadows the outer one
+
+    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv,
+                                        mb_row, mb_col);
+    if (tmp_err < err) {
+      dst_mv->as_mv = tmp_mv;
+      err = tmp_err;
+    }
+  }
+
+  return err;  // lowest SAD among the candidates tried
+}
+// Score the zero MV only: SAD of source vs. pre[0]; *dst_mv is set to zero.
+static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  unsigned int err;
+
+  // Zero MV is the only candidate here; no motion search is performed
+  // FIXME should really use something like near/nearest MV and/or MV prediction
+  err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+
+  dst_mv->as_int = 0;
+
+  return err;
+}
+static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
+  MACROBLOCK   *const x  = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  PREDICTION_MODE best_mode = -1, mode;
+  unsigned int best_err = INT_MAX;
+
+  // calculate SAD for each 16x16 intra prediction mode (DC_PRED..TM_PRED);
+  // we're intentionally not doing 4x4, we just want a rough estimate
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    unsigned int err;
+
+    xd->mi[0]->mode = mode;
+    vp9_predict_intra_block(xd, 2, TX_16X16, mode,
+                            x->plane[0].src.buf, x->plane[0].src.stride,
+                            xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+                            0, 0, 0);
+    err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                       xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+
+    // keep the mode with the lowest SAD so far
+    if (err < best_err) {
+      best_err  = err;
+      best_mode = mode;
+    }
+  }
+
+  if (pbest_mode)  // the out-parameter is optional
+    *pbest_mode = best_mode;
+
+  return best_err;  // lowest SAD over the modes tried
+}
+// Fill one MB's stats: intra, golden (searched) and alt-ref (zero-MV) errors.
+static void update_mbgraph_mb_stats
+(
+  VP9_COMP *cpi,
+  MBGRAPH_MB_STATS *stats,
+  YV12_BUFFER_CONFIG *buf,
+  int mb_y_offset,  // luma offset applied to buf, golden_ref and alt_ref alike
+  YV12_BUFFER_CONFIG *golden_ref,  // may be NULL
+  const MV *prev_golden_ref_mv,  // starting point for the golden search
+  YV12_BUFFER_CONFIG *alt_ref,  // may be NULL
+  int mb_row,
+  int mb_col
+) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int intra_error;
+  VP9_COMMON *cm = &cpi->common;
+
+  // FIXME in practice we're completely ignoring chroma here
+  x->plane[0].src.buf = buf->y_buffer + mb_y_offset;
+  x->plane[0].src.stride = buf->y_stride;
+
+  xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset;
+  xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride;
+
+  // do intra 16x16 prediction
+  intra_error = find_best_16x16_intra(cpi,
+                                      &stats->ref[INTRA_FRAME].m.mode);
+  if (intra_error <= 0)  // keep the stored intra error strictly positive
+    intra_error = 1;
+  stats->ref[INTRA_FRAME].err = intra_error;
+
+  // Golden frame MV search, if it exists and is different than last frame
+  if (golden_ref) {
+    int g_motion_error;
+    xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset;
+    xd->plane[0].pre[0].stride = golden_ref->y_stride;
+    g_motion_error = do_16x16_motion_search(cpi,
+                                            prev_golden_ref_mv,
+                                            &stats->ref[GOLDEN_FRAME].m.mv,
+                                            mb_row, mb_col);
+    stats->ref[GOLDEN_FRAME].err = g_motion_error;
+  } else {
+    stats->ref[GOLDEN_FRAME].err = INT_MAX;
+    stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
+  }
+
+  // Do an Alt-ref frame MV search, if it exists and is different than
+  // last/golden frame.
+  if (alt_ref) {
+    int a_motion_error;
+    xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
+    xd->plane[0].pre[0].stride = alt_ref->y_stride;
+    a_motion_error = do_16x16_zerozero_search(cpi,
+                                              &stats->ref[ALTREF_FRAME].m.mv);
+
+    stats->ref[ALTREF_FRAME].err = a_motion_error;
+  } else {
+    stats->ref[ALTREF_FRAME].err = INT_MAX;
+    stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
+  }
+}
+// Compute MBGRAPH_MB_STATS for every macroblock of buf, in raster order.
+static void update_mbgraph_frame_stats(VP9_COMP *cpi,
+                                       MBGRAPH_FRAME_STATS *stats,
+                                       YV12_BUFFER_CONFIG *buf,
+                                       YV12_BUFFER_CONFIG *golden_ref,
+                                       YV12_BUFFER_CONFIG *alt_ref) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+
+  int mb_col, mb_row, offset = 0;
+  int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
+  MV gld_top_mv = {0, 0};
+  MODE_INFO mi_local;
+
+  vp9_zero(mi_local);
+  // Set up limit values for motion vectors to prevent them extending outside
+  // the UMV borders.
+  x->mv_row_min     = -BORDER_MV_PIXELS_B16;
+  x->mv_row_max     = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16;
+  xd->up_available  = 0;
+  xd->plane[0].dst.stride  = buf->y_stride;
+  xd->plane[0].pre[0].stride  = buf->y_stride;
+  xd->plane[1].dst.stride = buf->uv_stride;
+  xd->mi[0] = &mi_local;
+  mi_local.sb_type = BLOCK_16X16;
+  mi_local.ref_frame[0] = LAST_FRAME;
+  mi_local.ref_frame[1] = NONE;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    MV gld_left_mv = gld_top_mv;
+    int mb_y_in_offset  = mb_y_offset;
+    int arf_y_in_offset = arf_y_offset;
+    int gld_y_in_offset = gld_y_offset;
+
+    // Set up limit values for motion vectors to prevent them extending outside
+    // the UMV borders.
+    x->mv_col_min      = -BORDER_MV_PIXELS_B16;
+    x->mv_col_max      = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16;
+    xd->left_available = 0;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
+
+      update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
+                              golden_ref, &gld_left_mv, alt_ref,
+                              mb_row, mb_col);
+      gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv;
+      if (mb_col == 0) {
+        gld_top_mv = gld_left_mv;
+      }
+      xd->left_available = 1;
+      mb_y_in_offset    += 16;
+      gld_y_in_offset   += 16;
+      arf_y_in_offset   += 16;
+      x->mv_col_min     -= 16;
+      x->mv_col_max     -= 16;
+    }
+    xd->up_available = 1;
+    mb_y_offset     += buf->y_stride * 16;
+    // golden_ref and alt_ref are both optional; never dereference a NULL one.
+    if (golden_ref) gld_y_offset += golden_ref->y_stride * 16;
+    if (alt_ref) arf_y_offset += alt_ref->y_stride * 16;
+    x->mv_row_min   -= 16;
+    x->mv_row_max   -= 16;
+    offset          += cm->mb_cols;
+  }
+}
+
+// Build the segmentation map: segment 1 where the zero-MV ARF match held.
+static void separate_arf_mbs(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mb_col, mb_row, offset, i;
+  int mi_row, mi_col;
+  int ncnt[4] = { 0 };
+  int n_frames = cpi->mbgraph_n_frames;
+
+  int *arf_not_zz;
+
+  // vpx_calloc(count, size): pass the element count and size separately.
+  CHECK_MEM_ERROR(cm, arf_not_zz,
+                  vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*arf_not_zz)));
+
+  // We are not interested in results beyond the alt ref itself.
+  if (n_frames > cpi->rc.frames_till_gf_update_due)
+    n_frames = cpi->rc.frames_till_gf_update_due;
+
+  // defer cost to reference frames
+  for (i = n_frames - 1; i >= 0; i--) {
+    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+
+    for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
+         offset += cm->mb_cols, mb_row++) {
+      for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+        MBGRAPH_MB_STATS *mb_stats = &frame_stats->mb_stats[offset + mb_col];
+
+        int altref_err = mb_stats->ref[ALTREF_FRAME].err;
+        int intra_err  = mb_stats->ref[INTRA_FRAME ].err;
+        int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
+
+        // Test for altref vs intra and gf and that its mv was 0,0.
+        if (altref_err > 1000 ||
+            altref_err > intra_err ||
+            altref_err > golden_err) {
+          arf_not_zz[offset + mb_col]++;
+        }
+      }
+    }
+  }
+
+  // arf_not_zz is indexed by MB, but this loop is indexed by MI to avoid out
+  // of bound access in segmentation_map
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+      // If any of the blocks in the sequence failed then the MB
+      // goes in segment 0
+      if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) {
+        ncnt[0]++;
+        cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0;
+      } else {
+        cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 1;
+        ncnt[1]++;
+      }
+    }
+  }
+
+  // Only bother with segmentation if over 10% of the MBs in static segment
+  // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
+  if (1) {
+    // Note % of blocks that are marked as static
+    if (cm->MBs)
+      cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
+
+    // This error case should not be reachable as this function should
+    // never be called with the common data structure uninitialized.
+    else
+      cpi->static_mb_pct = 0;
+
+    vp9_enable_segmentation(&cm->seg);
+  } else {
+    cpi->static_mb_pct = 0;
+    vp9_disable_segmentation(&cm->seg);
+  }
+
+  // Free locally allocated storage
+  vpx_free(arf_not_zz);
+}
+// Gather per-MB stats over the lookahead, then build the ARF segmentation.
+void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
+  YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+
+  assert(golden_ref != NULL);  // only enforced when asserts are compiled in
+
+  // we need to look ahead beyond where the ARF transitions into
+  // being a GF - so exit if we don't look ahead beyond that
+  if (n_frames <= cpi->rc.frames_till_gf_update_due)
+    return;
+
+  if (n_frames > MAX_LAG_BUFFERS)
+    n_frames = MAX_LAG_BUFFERS;
+
+  cpi->mbgraph_n_frames = n_frames;
+  for (i = 0; i < n_frames; i++) {  // clear the stats of every frame in use
+    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+    memset(frame_stats->mb_stats, 0,
+           cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
+  }
+
+  // do motion search to find contribution of each reference to data
+  // later on in this GF group
+  // FIXME really, the GF/last MC search should be done forward, and
+  // the ARF MC search backwards, to get optimal results for MV caching
+  for (i = 0; i < n_frames; i++) {
+    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+    struct lookahead_entry *q_cur = vp9_lookahead_peek(cpi->lookahead, i);
+
+    assert(q_cur != NULL);
+
+    update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img,
+                               golden_ref, cpi->Source);  // alt_ref = Source
+  }
+
+  vpx_clear_system_state();
+
+  separate_arf_mbs(cpi);
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_mbgraph.h b/libs/libvpx/vp9/encoder/vp9_mbgraph.h
new file mode 100644
index 0000000000..c3af972bc0
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_mbgraph.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_MBGRAPH_H_
+#define VP9_ENCODER_VP9_MBGRAPH_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {  // statistics kept for a single macroblock
+  struct {
+    int err;  // NOTE(review): presumably the prediction error - confirm
+    union {   // mv and mode share storage; only one is valid per entry
+      int_mv mv;
+      PREDICTION_MODE mode;
+    } m;
+  } ref[MAX_REF_FRAMES];  // array sized by MAX_REF_FRAMES
+} MBGRAPH_MB_STATS;
+
+typedef struct {
+  MBGRAPH_MB_STATS *mb_stats;  // per-macroblock array for one frame
+} MBGRAPH_FRAME_STATS;
+
+struct VP9_COMP;
+
+void vp9_update_mbgraph_stats(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_MBGRAPH_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_mcomp.c b/libs/libvpx/vp9/encoder/vp9_mcomp.c
new file mode 100644
index 0000000000..8b7825e7b6
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_mcomp.c
@@ -0,0 +1,2499 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_reconinter.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_mcomp.h"
+
+// #define NEW_DIAMOND_SEARCH
+
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+                                             const MV *mv) {
+  return buf->buf + (mv->row * buf->stride + mv->col);  // row-major offset
+}
+
+void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv) {
+  // mv is in 1/8 pel: >> 3 gives the full-pel part, & 7 the fraction.
+  const int full_col = mv->col >> 3;
+  const int full_row = mv->row >> 3;
+  int col_min = full_col - MAX_FULL_PEL_VAL + ((mv->col & 7) ? 1 : 0);
+  int row_min = full_row - MAX_FULL_PEL_VAL + ((mv->row & 7) ? 1 : 0);
+  int col_max = full_col + MAX_FULL_PEL_VAL;
+  int row_max = full_row + MAX_FULL_PEL_VAL;
+
+  // Stay strictly inside the (MV_LOW, MV_UPP) range, in full-pel units.
+  col_min = VPXMAX(col_min, (MV_LOW >> 3) + 1);
+  row_min = VPXMAX(row_min, (MV_LOW >> 3) + 1);
+  col_max = VPXMIN(col_max, (MV_UPP >> 3) - 1);
+  row_max = VPXMIN(row_max, (MV_UPP >> 3) - 1);
+
+  // Get intersection of UMV window and valid MV window to reduce # of checks
+  // in diamond search.
+  x->mv_col_min = VPXMAX(x->mv_col_min, col_min);
+  x->mv_col_max = VPXMIN(x->mv_col_max, col_max);
+  x->mv_row_min = VPXMAX(x->mv_row_min, row_min);
+  x->mv_row_max = VPXMIN(x->mv_row_max, row_max);
+}
+
+int vp9_init_search_range(int size) {
+  // Whatever the caller asks for, never start below a size of 16.
+  const int start = VPXMAX(16, size);
+  int sr;
+
+  // Count the doublings of start needed to reach MAX_FULL_PEL_VAL.
+  for (sr = 0; (start << sr) < MAX_FULL_PEL_VAL; ++sr) {
+  }
+
+  return VPXMIN(sr, MAX_MVSEARCH_STEPS - 2);
+}
+
+static INLINE int mv_cost(const MV *mv,
+                          const int *joint_cost, int *const comp_cost[2]) {
+  const int joint = joint_cost[vp9_get_mv_joint(mv)];  // joint cost
+  return joint + comp_cost[0][mv->row] + comp_cost[1][mv->col];
+}
+
+int vp9_mv_bit_cost(const MV *mv, const MV *ref,
+                    const int *mvjcost, int *mvcost[2], int weight) {
+  const MV delta = { mv->row - ref->row, mv->col - ref->col };
+  const int weighted = mv_cost(&delta, mvjcost, mvcost) * weight;
+  return ROUND_POWER_OF_TWO(weighted, 7);  // rounding shift by 7 bits
+}
+
+#define PIXEL_TRANSFORM_ERROR_SCALE 4
+static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost,
+                       int *mvcost[2], int error_per_bit) {
+  MV diff;
+  if (!mvcost)  // no cost tables supplied: the vector costs nothing
+    return 0;
+  diff.row = mv->row - ref->row;
+  diff.col = mv->col - ref->col;
+  // At the 32-bit ceiling: any extra cost precision overflows this product.
+  return ROUND_POWER_OF_TWO(
+      (unsigned)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
+      RDDIV_BITS + VP9_PROB_COST_SHIFT - RD_EPB_SHIFT +
+          PIXEL_TRANSFORM_ERROR_SCALE);
+}
+
+static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
+                          int sad_per_bit) {
+  MV diff;  // mv expressed relative to ref
+  unsigned int bits;
+  diff.row = mv->row - ref->row;
+  diff.col = mv->col - ref->col;
+  bits = (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost);
+  return ROUND_POWER_OF_TWO(bits * sad_per_bit, VP9_PROB_COST_SHIFT);
+}
+
+void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
+  int dist;
+  int n_sites = 0;
+
+  for (dist = MAX_FIRST_STEP; dist > 0; dist /= 2) {
+    // Four sites per step: above, below, left and right at this distance.
+    const MV site[] = {{-dist, 0}, {dist, 0}, {0, -dist}, {0, dist}};
+    int k;
+    for (k = 0; k < 4; ++k, ++n_sites) {
+      cfg->ss_mv[n_sites] = site[k];
+      cfg->ss_os[n_sites] = site[k].row * stride + site[k].col;
+    }
+  }
+
+  cfg->searches_per_step = 4;
+  cfg->total_steps = n_sites / cfg->searches_per_step;
+}
+
+void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
+  int dist;
+  int n_sites = 0;
+
+  for (dist = MAX_FIRST_STEP; dist > 0; dist /= 2) {
+    // Eight sites per step: the four axis neighbours, then the diagonals.
+    const MV site[8] = {
+      {-dist,  0   }, {dist,   0   }, {0,    -dist}, {0,     dist},
+      {-dist, -dist}, {-dist,  dist}, {dist, -dist}, {dist,  dist}
+    };
+    int k;
+    for (k = 0; k < 8; ++k, ++n_sites) {
+      cfg->ss_mv[n_sites] = site[k];
+      cfg->ss_os[n_sites] = site[k].row * stride + site[k].col;
+    }
+  }
+
+  cfg->searches_per_step = 8;
+  cfg->total_steps = n_sites / cfg->searches_per_step;
+}
+
+/*
+ * To avoid the penalty for crossing cache-line read, preload the reference
+ * area in a small buffer, which is aligned to make sure there won't be crossing
+ * cache-line read while reading from this buffer. This reduced the cpu
+ * cycles spent on reading ref data in sub-pixel filter functions.
+ * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
+ * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
+ * could reduce the area.
+ */
+
+/* Estimated (square) error cost of a motion vector (r, c) relative to
+ * (rr, rc). The 14 scale comes from the same math as in mv_err_cost(). */
+#define MVC(r, c)                                              \
+    (mvcost ?                                                  \
+     ((unsigned)(mvjcost[((r) != rr) * 2 + ((c) != rc)] +      \
+       mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) *        \
+      error_per_bit + 8192) >> 14 : 0)
+
+
+// Sub-pixel part (low 3 bits) of an MV component; offset for sv[a]f calc.
+static INLINE int sp(int x) {
+  return x & ((1 << 3) - 1);
+}
+
+static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
+  return buf + ((r >> 3) * stride + (c >> 3));  // full-pel sample of (r, c)
+}
+
+/* Scores (r, c) into v and, if it beats besterr, records it as the best. */
+#define CHECK_BETTER(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    if (second_pred == NULL)                                           \
+      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+                             src_stride, &sse);                        \
+    else                                                               \
+      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+                              z, src_stride, &sse, second_pred);       \
+    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {  /* (r, c) is outside the allowed MV window */              \
+    v = INT_MAX;                                                       \
+  }
+
+#define FIRST_LEVEL_CHECKS  /* 4 axis points + diag */  \
+  {                                                     \
+    unsigned int left, right, up, down, diag;           \
+    CHECK_BETTER(left, tr, tc - hstep);                 \
+    CHECK_BETTER(right, tr, tc + hstep);                \
+    CHECK_BETTER(up, tr - hstep, tc);                   \
+    CHECK_BETTER(down, tr + hstep, tc);                 \
+    whichdir = (left < right ? 0 : 1) +                 \
+               (up < down ? 0 : 2);                     \
+    switch (whichdir) {  /* toward cheaper sides */     \
+      case 0:                                           \
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);     \
+        break;                                          \
+      case 1:                                           \
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);     \
+        break;                                          \
+      case 2:                                           \
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);     \
+        break;                                          \
+      case 3:                                           \
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);     \
+        break;                                          \
+    }                                                   \
+  }
+
+#define SECOND_LEVEL_CHECKS /* refine around best */    \
+  {                                                     \
+    int kr, kc;                                         \
+    unsigned int second;                                \
+    if (tr != br && tc != bc) { /* diagonal move */     \
+      kr = br - tr;                                     \
+      kc = bc - tc;                                     \
+      CHECK_BETTER(second, tr + kr, tc + 2 * kc);       \
+      CHECK_BETTER(second, tr + 2 * kr, tc + kc);       \
+    } else if (tr == br && tc != bc) { /* col only */   \
+      kc = bc - tc;                                     \
+      CHECK_BETTER(second, tr + hstep, tc + 2 * kc);    \
+      CHECK_BETTER(second, tr - hstep, tc + 2 * kc);    \
+      switch (whichdir) {                               \
+        case 0:                                         \
+        case 1:                                         \
+          CHECK_BETTER(second, tr + hstep, tc + kc);    \
+          break;                                        \
+        case 2:                                         \
+        case 3:                                         \
+          CHECK_BETTER(second, tr - hstep, tc + kc);    \
+          break;                                        \
+      }                                                 \
+    } else if (tr != br && tc == bc) { /* row only */   \
+      kr = br - tr;                                     \
+      CHECK_BETTER(second, tr + 2 * kr, tc + hstep);    \
+      CHECK_BETTER(second, tr + 2 * kr, tc - hstep);    \
+      switch (whichdir) {                               \
+        case 0:                                         \
+        case 2:                                         \
+          CHECK_BETTER(second, tr + kr, tc + hstep);    \
+          break;                                        \
+        case 1:                                         \
+        case 3:                                         \
+          CHECK_BETTER(second, tr + kr, tc - hstep);    \
+          break;                                        \
+      }                                                 \
+    }                                                   \
+  }
+
+// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST is a rewrite of
+// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
+// later in the same way.
+#define SECOND_LEVEL_CHECKS_BEST  /* uses kr, kc */     \
+  {                                                     \
+    unsigned int second;                                \
+    int br0 = br;  /* best on entry */                  \
+    int bc0 = bc;                                       \
+    assert(tr == br || tc == bc);                       \
+    if (tr == br && tc != bc) {                         \
+      kc = bc - tc;                                     \
+    } else if (tr != br && tc == bc) {                  \
+      kr = br - tr;                                     \
+    }                                                   \
+    CHECK_BETTER(second, br0 + kr, bc0);                \
+    CHECK_BETTER(second, br0, bc0 + kc);                \
+    if (br0 != br || bc0 != bc) {                       \
+      CHECK_BETTER(second, br0 + kr, bc0 + kc);         \
+    }                                                   \
+  }
+
+#define SETUP_SUBPEL_SEARCH  /* shared locals of the sub-pel search */     \
+  const uint8_t *const z = x->plane[0].src.buf;                            \
+  const int src_stride = x->plane[0].src.stride;                           \
+  const MACROBLOCKD *xd = &x->e_mbd;                                       \
+  unsigned int besterr = INT_MAX;                                          \
+  unsigned int sse;                                                        \
+  unsigned int whichdir;                                                   \
+  int thismse;                                                             \
+  const unsigned int halfiters = iters_per_step;                           \
+  const unsigned int quarteriters = iters_per_step;                        \
+  const unsigned int eighthiters = iters_per_step;                         \
+  const int y_stride = xd->plane[0].pre[0].stride;                         \
+  const int offset = bestmv->row * y_stride + bestmv->col;                 \
+  const uint8_t *const y = xd->plane[0].pre[0].buf;                        \
+                                                                           \
+  int rr = ref_mv->row;                                                    \
+  int rc = ref_mv->col;                                                    \
+  int br = bestmv->row * 8;  /* full pel -> 1/8 pel */                     \
+  int bc = bestmv->col * 8;                                                \
+  int hstep = 4;  /* 1/2 pel in 1/8-pel units */                           \
+  const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);        \
+  const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);        \
+  const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);        \
+  const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);        \
+  int tr = br;                                                             \
+  int tc = bc;                                                             \
+                                                                           \
+  bestmv->row *= 8;                                                        \
+  bestmv->col *= 8;
+
+static unsigned int setup_center_error(const MACROBLOCKD *xd,
+                                       const MV *bestmv,
+                                       const MV *ref_mv,
+                                       int error_per_bit,
+                                       const vp9_variance_fn_ptr_t *vfp,
+                                       const uint8_t *const src,
+                                       const int src_stride,
+                                       const uint8_t *const y,
+                                       int y_stride,
+                                       const uint8_t *second_pred,
+                                       int w, int h, int offset,
+                                       int *mvjcost, int *mvcost[2],
+                                       unsigned int *sse1,
+                                       int *distortion) {
+  unsigned int besterr;
+  // Score the full-pel centre. When second_pred is given, the reference is
+  // first combined with it through the comp_avg_pred helper.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (second_pred == NULL) {
+    besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
+  } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
+    vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+                             y_stride);
+    besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
+                      sse1);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+    vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+    besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+  }
+#else
+  (void) xd;  // only consulted for the high-bitdepth flag
+  if (second_pred == NULL) {
+    besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+    vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+    besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // *distortion gets the raw vf() result; the returned score adds the
+  // rate cost of bestmv relative to ref_mv.
+  *distortion = besterr;
+  return besterr + mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+}
+
+static INLINE int divide_and_round(const int n, const int d) {
+  return ((n < 0) != (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
+}
+
+static INLINE int is_cost_list_wellbehaved(int *cost_list) {
+  int k;  // true only when the centre cost is a strict minimum
+  for (k = 1; k <= 4; ++k)
+    if (cost_list[0] >= cost_list[k]) return 0;
+  return 1;
+}
+
+// Estimates the minimum of the cost surface in units of 1/2^bits.
+// Model: S = A(x - x0)^2 + B(y - y0)^2 + C, with costs S0..S4 sampled at
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively, so
+//   x0 = 1/2 (S1 - S3) / (S1 + S3 - 2*S0),
+//   y0 = 1/2 (S4 - S2) / (S4 + S2 - 2*S0).
+// Integer version of the above: *ic receives x0 and *ir receives y0.
+static void get_cost_surf_min(int *cost_list, int *ir, int *ic,
+                              int bits) {
+  const int scale = 1 << (bits - 1);  // the 1/2 factor at 2^bits scale
+  const int twice_s0 = 2 * cost_list[0];
+  *ic = divide_and_round((cost_list[1] - cost_list[3]) * scale,
+                         cost_list[1] - twice_s0 + cost_list[3]);
+  *ir = divide_and_round((cost_list[4] - cost_list[2]) * scale,
+                         cost_list[4] - twice_s0 + cost_list[2]);
+}
+
+int vp9_find_best_sub_pixel_tree_pruned_evenmore(
+    const MACROBLOCK *x,
+    MV *bestmv, const MV *ref_mv,
+    int allow_hp,
+    int error_per_bit,
+    const vp9_variance_fn_ptr_t *vfp,
+    int forced_stop,
+    int iters_per_step,
+    int *cost_list,
+    int *mvjcost, int *mvcost[2],
+    int *distortion,
+    unsigned int *sse1,
+    const uint8_t *second_pred,
+    int w, int h) {
+  SETUP_SUBPEL_SEARCH;  // declares the search locals; scales bestmv by 8
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+  (void) halfiters;
+  (void) quarteriters;
+  (void) eighthiters;
+  (void) whichdir;
+  (void) allow_hp;
+  (void) forced_stop;
+  (void) hstep;
+
+  if (cost_list &&
+      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX &&
+      is_cost_list_wellbehaved(cost_list)) {
+    int ir, ic;
+    unsigned int minpt;
+    get_cost_surf_min(cost_list, &ir, &ic, 2);  // ir/ic in 1/4-pel steps
+    if (ir != 0 || ic != 0) {
+      CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+
+    tr = br;
+    tc = bc;
+
+    // Each subsequent iteration checks at least one point in common with
+    // the last iteration (could be 2 if the diagonal was selected). 1/4 pel:
+    // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+    if (forced_stop != 2) {
+      hstep >>= 1;
+      FIRST_LEVEL_CHECKS;
+      if (quarteriters > 1) {
+        SECOND_LEVEL_CHECKS;
+      }
+    }
+  }
+
+  tr = br;
+  tc = bc;
+
+  if (allow_hp && use_mv_hp(ref_mv) && forced_stop == 0) {
+    hstep >>= 1;  // NOTE(review): 4 -> 2 when the cost_list path ran
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;  // result lies too far from ref_mv
+
+  return besterr;
+}
+
+int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
+                                             MV *bestmv, const MV *ref_mv,
+                                             int allow_hp,
+                                             int error_per_bit,
+                                             const vp9_variance_fn_ptr_t *vfp,
+                                             int forced_stop,
+                                             int iters_per_step,
+                                             int *cost_list,
+                                             int *mvjcost, int *mvcost[2],
+                                             int *distortion,
+                                             unsigned int *sse1,
+                                             const uint8_t *second_pred,
+                                             int w, int h) {
+  SETUP_SUBPEL_SEARCH;  // declares the search locals; scales bestmv by 8
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+  if (cost_list &&
+      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX &&
+      is_cost_list_wellbehaved(cost_list)) {
+    unsigned int minpt;
+    int ir, ic;
+    get_cost_surf_min(cost_list, &ir, &ic, 1);  // ir/ic in 1/2-pel steps
+    if (ir != 0 || ic != 0) {
+      CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep);
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration (could be 2 if the diagonal was selected). 1/4 pel:
+
+  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+  if (forced_stop != 2) {
+    tr = br;
+    tc = bc;
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (quarteriters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  if (allow_hp && use_mv_hp(ref_mv) && forced_stop == 0) {
+    tr = br;
+    tc = bc;
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;  // result lies too far from ref_mv
+
+  return besterr;
+}
+
+int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
+                                        MV *bestmv, const MV *ref_mv,
+                                        int allow_hp,
+                                        int error_per_bit,
+                                        const vp9_variance_fn_ptr_t *vfp,
+                                        int forced_stop,
+                                        int iters_per_step,
+                                        int *cost_list,
+                                        int *mvjcost, int *mvcost[2],
+                                        int *distortion,
+                                        unsigned int *sse1,
+                                        const uint8_t *second_pred,
+                                        int w, int h) {
+  SETUP_SUBPEL_SEARCH;  // declares the search locals; scales bestmv by 8
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+  if (cost_list &&
+      cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX) {
+    unsigned int left, right, up, down, diag;
+    whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+               (cost_list[2] < cost_list[4] ? 0 : 2);
+    switch (whichdir) {  // cost_list: 1 left, 2 down, 3 right, 4 up
+      case 0:
+        CHECK_BETTER(left, tr, tc - hstep);
+        CHECK_BETTER(down, tr + hstep, tc);
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(right, tr, tc + hstep);
+        CHECK_BETTER(down, tr + hstep, tc);
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(left, tr, tc - hstep);
+        CHECK_BETTER(up, tr - hstep, tc);
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(right, tr, tc + hstep);
+        CHECK_BETTER(up, tr - hstep, tc);
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  tr = br;
+  tc = bc;
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration (could be 2 if the diagonal was selected). 1/4 pel:
+
+  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+  if (forced_stop != 2) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (quarteriters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+
+  if (allow_hp && use_mv_hp(ref_mv) && forced_stop == 0) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;  // result lies too far from ref_mv
+
+  return besterr;
+}
+
+static const MV search_step_table[12] = {
+    // left, right, up, down
+    {0, -4}, {0, 4}, {-4, 0}, {4, 0},  // 1/2 pel (values in 1/8-pel units)
+    {0, -2}, {0, 2}, {-2, 0}, {2, 0},  // 1/4 pel
+    {0, -1}, {0, 1}, {-1, 0}, {1, 0}   // 1/8 pel
+};
+
+int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
+                                 MV *bestmv, const MV *ref_mv,
+                                 int allow_hp,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int forced_stop,
+                                 int iters_per_step,
+                                 int *cost_list,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion,
+                                 unsigned int *sse1,
+                                 const uint8_t *second_pred,
+                                 int w, int h) {
+  const uint8_t *const z = x->plane[0].src.buf;
+  const uint8_t *const src_address = z;
+  const int src_stride = x->plane[0].src.stride;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  int thismse;
+  const int y_stride = xd->plane[0].pre[0].stride;
+  const int offset = bestmv->row * y_stride + bestmv->col;
+  const uint8_t *const y = xd->plane[0].pre[0].buf;
+
+  int rr = ref_mv->row;
+  int rc = ref_mv->col;
+  int br = bestmv->row * 8;  // best position so far, in 1/8-pel units
+  int bc = bestmv->col * 8;
+  int hstep = 4;  // current step: 4 = 1/2 pel, halved after every round
+  int iter, round = 3 - forced_stop;  // number of refinement rounds
+  const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+  const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+  const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+  const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+  int tr = br;
+  int tc = bc;
+  const MV *search_step = search_step_table;
+  int idx, best_idx = -1;
+  unsigned int cost_array[5];  // [0..3] left/right/up/down, [4] diagonal
+  int kr, kc;
+
+  if (!(allow_hp && use_mv_hp(ref_mv)))
+    if (round == 3)
+      round = 2;  // skip the 1/8-pel round when high precision is off
+
+  bestmv->row *= 8;
+  bestmv->col *= 8;
+
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+
+  (void) cost_list;  // to silence compiler warning
+
+  for (iter = 0; iter < round; ++iter) {
+    // Check vertical and horizontal sub-pixel positions.
+    for (idx = 0; idx < 4; ++idx) {
+      tr = br + search_step[idx].row;
+      tc = bc + search_step[idx].col;
+      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+        MV this_mv;
+        this_mv.row = tr;
+        this_mv.col = tc;
+        if (second_pred == NULL)
+          thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+                             src_address, src_stride, &sse);
+        else
+          thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                              src_address, src_stride, &sse, second_pred);
+        cost_array[idx] = thismse +
+            mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+        if (cost_array[idx] < besterr) {
+          best_idx = idx;
+          besterr = cost_array[idx];
+          *distortion = thismse;
+          *sse1 = sse;
+        }
+      } else {
+        cost_array[idx] = INT_MAX;
+      }
+    }
+
+    // Check diagonal sub-pixel position
+    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+    tc = bc + kc;
+    tr = br + kr;
+    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+      const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+      MV this_mv = {tr, tc};
+      if (second_pred == NULL)
+        thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+                           src_address, src_stride, &sse);
+      else
+        thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                            src_address, src_stride, &sse, second_pred);
+      cost_array[4] = thismse +
+          mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+      if (cost_array[4] < besterr) {
+        best_idx = 4;
+        besterr = cost_array[4];
+        *distortion = thismse;
+        *sse1 = sse;
+      }
+    } else {
+      cost_array[4] = INT_MAX;  // explicit slot; no reliance on idx == 4
+    }
+
+    if (best_idx < 4 && best_idx >= 0) {
+      br += search_step[best_idx].row;
+      bc += search_step[best_idx].col;
+    } else if (best_idx == 4) {
+      br = tr;
+      bc = tc;
+    }
+
+    if (iters_per_step > 1 && best_idx != -1)
+      SECOND_LEVEL_CHECKS_BEST;
+
+    tr = br;
+    tc = bc;
+
+    search_step += 4;  // next row of search_step_table (finer step)
+    hstep >>= 1;
+    best_idx = -1;
+  }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration (could be 2 if the diagonal was selected).
+
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;  // result lies too far from ref_mv
+
+  return besterr;
+}
+
+#undef MVC
+#undef PRE
+#undef CHECK_BETTER
+
+static INLINE int check_bounds(const MACROBLOCK *x, int row, int col,
+                               int range) {
+  // 1 iff (row, col) +/- range is within x's MV limits, else 0.
+  int lo = (x->mv_row_min <= row - range) & (x->mv_col_min <= col - range);
+  int hi = (x->mv_row_max >= row + range) & (x->mv_col_max >= col + range);
+  return lo & hi;
+}
+
+static INLINE int is_mv_in(const MACROBLOCK *x, const MV *mv) {
+  return !(mv->col < x->mv_col_min || mv->col > x->mv_col_max ||
+           mv->row < x->mv_row_min || mv->row > x->mv_row_max);
+}  // is_mv_in: 1 iff *mv is within x's inclusive MV limits, else 0
+
+#define CHECK_BETTER  /* fold thissad (site i) into bestsad/best_site */ \
+  {\
+    if (thissad < bestsad) {  /* cheap reject on the raw SAD first */ \
+      if (use_mvcost) \
+        thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);\
+      if (thissad < bestsad) {  /* re-test once the MV cost is included */ \
+        bestsad = thissad;\
+        best_site = i;\
+      }\
+    }\
+  }
+
+#define MAX_PATTERN_SCALES         11  // rows in the per-scale pattern tables
+#define MAX_PATTERN_CANDIDATES      8  // max number of candidates per scale
+#define PATTERN_CANDIDATES_REF      3  // number of refinement candidates
+
+// Calculate and return a cost list around an integer best pel:
+//   cost_list[0]    = fn_ptr->vf() at best_mv + mvsad_err_cost(..., sadpb)
+//   cost_list[1..4] = fn_ptr->vf() + mv_err_cost() at best_mv + neighbors[i],
+//                     or INT_MAX if that neighbor is outside x's MV limits.
+// neighbors[] holds {row, col} deltas in left, bottom, right, top order.
+// cost_list must have room for 5 ints. ref_mv is shifted right by 3 here.
+// NOTE(review): vp9_pattern_search() passes an MV it has already shifted
+// right by 3 as ref_mv -- confirm the second shift is intended.
+static INLINE void calc_int_cost_list(const MACROBLOCK *x,
+                                      const MV *ref_mv,
+                                      int sadpb,
+                                      const vp9_variance_fn_ptr_t *fn_ptr,
+                                      const MV *best_mv,
+                                      int *cost_list) {
+  static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV fcenter_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+  const MV centre = {best_mv->row, best_mv->col};
+  unsigned int sse;  // written by vf(); the value is not used
+  int all_in_range;
+  int n;
+
+  // Centre position.
+  cost_list[0] = fn_ptr->vf(src->buf, src->stride,
+                            get_buf_from_mv(ref, &centre),
+                            ref->stride, &sse) +
+      mvsad_err_cost(x, &centre, &fcenter_mv, sadpb);
+
+  // If the whole +/-1 box is in range, the per-neighbor test is skipped.
+  all_in_range = check_bounds(x, centre.row, centre.col, 1);
+
+  // One-away neighbors.
+  for (n = 0; n < 4; n++) {
+    const MV nbr_mv = {centre.row + neighbors[n].row,
+                       centre.col + neighbors[n].col};
+
+    if (!all_in_range && !is_mv_in(x, &nbr_mv)) {
+      cost_list[n + 1] = INT_MAX;
+      continue;
+    }
+
+    cost_list[n + 1] = fn_ptr->vf(src->buf, src->stride,
+                                  get_buf_from_mv(ref, &nbr_mv),
+                                  ref->stride, &sse) +
+                       mv_err_cost(&nbr_mv, &fcenter_mv,
+                                   x->nmvjointcost, x->mvcost,
+                                   x->errorperbit);
+  }
+}
+
+// Generic pattern search function that searches over multiple scales.
+// Scale t tests the first num_candidates[t] offsets of candidates[t].
+// *ref_mv is clamped to x's MV limits and used as the start point; the best
+// integer-pel MV is written to *best_mv and its cost (bestsad) is returned.
+// If cost_list is non-NULL it is filled by calc_int_cost_list().
+static int vp9_pattern_search(const MACROBLOCK *x,
+                              MV *ref_mv,
+                              int search_param,
+                              int sad_per_bit,
+                              int do_init_search,
+                              int *cost_list,
+                              const vp9_variance_fn_ptr_t *vfp,
+                              int use_mvcost,
+                              const MV *center_mv,
+                              MV *best_mv,
+                              const int num_candidates[MAX_PATTERN_SCALES],
+                              const MV candidates[MAX_PATTERN_SCALES]
+                                                 [MAX_PATTERN_CANDIDATES]) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+    10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,  // i.e. 10 - search_param
+  };
+  int i, s, t;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  int br, bc;  // row / column of the best position found so far
+  int bestsad = INT_MAX;
+  int thissad;
+  int k = -1;  // candidate index of the most recent winning site
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  int best_init_s = search_param_to_steps[search_param];  // largest scale
+  // adjust ref_mv to make sure it is within MV range
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  br = ref_mv->row;
+  bc = ref_mv->col;
+
+  // Work out the start point for the search
+  bestsad = vfp->sdf(what->buf, what->stride,
+                     get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+
+  // Search all possible scales up to the search param around the center point
+  // pick the scale of the point that is best as the starting scale of
+  // further steps around it.
+  if (do_init_search) {
+    s = best_init_s;
+    best_init_s = -1;  // stays -1 if no candidate beats the start point
+    for (t = 0; t <= s; ++t) {
+      int best_site = -1;
+      if (check_bounds(x, br, bc, 1 << t)) {  // whole pattern is in range
+        for (i = 0; i < num_candidates[t]; i++) {
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride);
+          CHECK_BETTER
+        }
+      } else {  // near a limit: test each candidate individually
+        for (i = 0; i < num_candidates[t]; i++) {
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
+          if (!is_mv_in(x, &this_mv))
+            continue;
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride);
+          CHECK_BETTER
+        }
+      }
+      if (best_site == -1) {
+        continue;
+      } else {
+        best_init_s = t;  // latest scale that improved bestsad
+        k = best_site;
+      }
+    }
+    if (best_init_s != -1) {
+      br += candidates[best_init_s][k].row;
+      bc += candidates[best_init_s][k].col;
+    }
+  }
+
+  // If the center point is still the best, just skip this and move to
+  // the refinement step.
+  if (best_init_s != -1) {
+    int best_site = -1;
+    s = best_init_s;
+
+    do {
+      // No need to search all points the 1st time if initial search was used
+      if (!do_init_search || s != best_init_s) {
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site == -1) {
+          continue;  // jumps to while (s--): move on to the next smaller scale
+        } else {
+          br += candidates[s][best_site].row;
+          bc += candidates[s][best_site].col;
+          k = best_site;
+        }
+      }
+
+      do {  // keep stepping at this scale while a candidate improves bestsad
+        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+        best_site = -1;
+        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+        next_chkpts_indices[1] = k;
+        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site != -1) {
+          k = next_chkpts_indices[best_site];  // 0..2 back to candidate index
+          br += candidates[s][k].row;
+          bc += candidates[s][k].col;
+        }
+      } while (best_site != -1);
+    } while (s--);  // last pass runs with s == 0
+  }
+
+  // Returns the one-away integer pel sad values around the best as follows:
+  // cost_list[0]: cost at the best integer pel
+  // cost_list[1]: cost at delta {0, -1} (left)   from the best integer pel
+  // cost_list[2]: cost at delta { 1, 0} (bottom) from the best integer pel
+  // cost_list[3]: cost at delta { 0, 1} (right)  from the best integer pel
+  // cost_list[4]: cost at delta {-1, 0} (top)    from the best integer pel
+  if (cost_list) {
+    const MV best_mv = { br, bc };  // local; shadows the best_mv parameter
+    calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list);
+  }
+  best_mv->row = br;
+  best_mv->col = bc;
+  return bestsad;
+}
+
+// Variant of vp9_pattern_search(): when scale 0 has the 4 one-away neighbors
+// and cost_list is non-NULL, cost_list is filled from the scale-0 SADs.
+// TODO(debargha): Merge this function with the one above. Also remove
+// use_mvcost option since it is always 1, to save unnecessary branches.
+static int vp9_pattern_search_sad(const MACROBLOCK *x,
+                                  MV *ref_mv,
+                                  int search_param,
+                                  int sad_per_bit,
+                                  int do_init_search,
+                                  int *cost_list,
+                                  const vp9_variance_fn_ptr_t *vfp,
+                                  int use_mvcost,
+                                  const MV *center_mv,
+                                  MV *best_mv,
+                                  const int num_candidates[MAX_PATTERN_SCALES],
+                                  const MV candidates[MAX_PATTERN_SCALES]
+                                                     [MAX_PATTERN_CANDIDATES]) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+    10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,  // i.e. 10 - search_param
+  };
+  int i, s, t;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  int br, bc;  // row / column of the best position found so far
+  int bestsad = INT_MAX;
+  int thissad;
+  int k = -1;  // candidate index of the most recent winning site
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  int best_init_s = search_param_to_steps[search_param];  // largest scale
+  // adjust ref_mv to make sure it is within MV range
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  br = ref_mv->row;
+  bc = ref_mv->col;
+  if (cost_list != NULL) {  // INT_MAX in [0] later means "not filled yet"
+    cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
+        INT_MAX;
+  }
+
+  // Work out the start point for the search
+  bestsad = vfp->sdf(what->buf, what->stride,
+                     get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+
+  // Search all possible scales up to the search param around the center point
+  // pick the scale of the point that is best as the starting scale of
+  // further steps around it.
+  if (do_init_search) {
+    s = best_init_s;
+    best_init_s = -1;  // stays -1 if no candidate beats the start point
+    for (t = 0; t <= s; ++t) {
+      int best_site = -1;
+      if (check_bounds(x, br, bc, 1 << t)) {  // whole pattern is in range
+        for (i = 0; i < num_candidates[t]; i++) {
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride);
+          CHECK_BETTER
+        }
+      } else {  // near a limit: test each candidate individually
+        for (i = 0; i < num_candidates[t]; i++) {
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
+          if (!is_mv_in(x, &this_mv))
+            continue;
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride);
+          CHECK_BETTER
+        }
+      }
+      if (best_site == -1) {
+        continue;
+      } else {
+        best_init_s = t;  // latest scale that improved bestsad
+        k = best_site;
+      }
+    }
+    if (best_init_s != -1) {
+      br += candidates[best_init_s][k].row;
+      bc += candidates[best_init_s][k].col;
+    }
+  }
+
+  // If the center point is still the best, just skip this and move to
+  // the refinement step.
+  if (best_init_s != -1) {
+    int do_sad = (num_candidates[0] == 4 && cost_list != NULL);  // 0 or 1
+    int best_site = -1;
+    s = best_init_s;
+
+    for (; s >= do_sad; s--) {  // with do_sad, scale 0 is left for later
+      if (!do_init_search || s != best_init_s) {
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site == -1) {
+          continue;  // nothing better at this scale: go to the next smaller
+        } else {
+          br += candidates[s][best_site].row;
+          bc += candidates[s][best_site].col;
+          k = best_site;
+        }
+      }
+
+      do {  // keep stepping at this scale while a candidate improves bestsad
+        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+        best_site = -1;
+        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+        next_chkpts_indices[1] = k;
+        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site != -1) {
+          k = next_chkpts_indices[best_site];  // 0..2 back to candidate index
+          br += candidates[s][k].row;
+          bc += candidates[s][k].col;
+        }
+      } while (best_site != -1);
+    }
+
+    // Note: If we enter the if below, then cost_list must be non-NULL.
+    if (s == 0) {  // only reachable when do_sad == 1 stopped the loop above
+      cost_list[0] = bestsad;
+      if (!do_init_search || s != best_init_s) {
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            cost_list[i + 1] =  // raw SAD; CHECK_BETTER only alters thissad
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < num_candidates[s]; i++) {
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;  // cost_list[i + 1] keeps its INT_MAX initial value
+            cost_list[i + 1] =
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site != -1) {
+          br += candidates[s][best_site].row;
+          bc += candidates[s][best_site].col;
+          k = best_site;
+        }
+      }
+      while (best_site != -1) {
+        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+        best_site = -1;
+        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+        next_chkpts_indices[1] = k;
+        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+        cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+        cost_list[((k + 2) % 4) + 1] = cost_list[0];  // old centre's cost
+        cost_list[0] = bestsad;
+
+        if (check_bounds(x, br, bc, 1 << s)) {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            cost_list[next_chkpts_indices[i] + 1] =
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            if (!is_mv_in(x, &this_mv)) {
+              cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
+              continue;
+            }
+            cost_list[next_chkpts_indices[i] + 1] =
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site != -1) {
+          k = next_chkpts_indices[best_site];
+          br += candidates[s][k].row;
+          bc += candidates[s][k].col;
+        }
+      }
+    }
+  }
+
+  // Returns the one-away integer pel sad values around the best as follows:
+  // cost_list[0]: sad at the best integer pel
+  // cost_list[1]: sad at delta {0, -1} (left)   from the best integer pel
+  // cost_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel
+  // cost_list[3]: sad at delta { 0, 1} (right)  from the best integer pel
+  // cost_list[4]: sad at delta {-1, 0} (top)    from the best integer pel
+  if (cost_list) {
+    static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
+    if (cost_list[0] == INT_MAX) {  // not filled above: compute SADs now
+      cost_list[0] = bestsad;
+      if (check_bounds(x, br, bc, 1)) {
+        for (i = 0; i < 4; i++) {
+          const MV this_mv = { br + neighbors[i].row,
+                               bc + neighbors[i].col };
+          cost_list[i + 1] = vfp->sdf(what->buf, what->stride,
+                                     get_buf_from_mv(in_what, &this_mv),
+                                     in_what->stride);
+        }
+      } else {
+        for (i = 0; i < 4; i++) {
+          const MV this_mv = {br + neighbors[i].row,
+            bc + neighbors[i].col};
+          if (!is_mv_in(x, &this_mv))
+            cost_list[i + 1] = INT_MAX;
+          else
+            cost_list[i + 1] = vfp->sdf(what->buf, what->stride,
+                                       get_buf_from_mv(in_what, &this_mv),
+                                       in_what->stride);
+        }
+      }
+    } else {  // filled during the scale-0 pass: add the MV cost if requested
+      if (use_mvcost) {
+        for (i = 0; i < 4; i++) {
+          const MV this_mv = {br + neighbors[i].row,
+            bc + neighbors[i].col};
+          if (cost_list[i + 1] != INT_MAX) {
+            cost_list[i + 1] +=
+                mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+          }
+        }
+      }
+    }
+  }
+  best_mv->row = br;
+  best_mv->col = bc;
+  return bestsad;
+}
+
+// Returns vfp->vf() at integer-pel best_mv, plus MV cost if use_mvcost != 0.
+int vp9_get_mvpred_var(const MACROBLOCK *x,
+                       const MV *best_mv, const MV *center_mv,
+                       const vp9_variance_fn_ptr_t *vfp,
+                       int use_mvcost) {
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV scaled_mv = {best_mv->row * 8, best_mv->col * 8};
+  unsigned int sse;  // written by vf(); not used
+  const unsigned int var = vfp->vf(src->buf, src->stride,
+      get_buf_from_mv(ref, best_mv), ref->stride, &sse);
+  if (!use_mvcost) return var;
+  return var + mv_err_cost(&scaled_mv, center_mv, x->nmvjointcost,
+                           x->mvcost, x->errorperbit);
+}
+
+// vfp->svaf() with second_pred at integer-pel best_mv, plus optional MV cost.
+int vp9_get_mvpred_av_var(const MACROBLOCK *x,
+                          const MV *best_mv, const MV *center_mv,
+                          const uint8_t *second_pred,
+                          const vp9_variance_fn_ptr_t *vfp,
+                          int use_mvcost) {
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV scaled_mv = {best_mv->row * 8, best_mv->col * 8};
+  unsigned int sse;  // written by svaf(); not used
+  const unsigned int var = vfp->svaf(get_buf_from_mv(ref, best_mv),
+      ref->stride, 0, 0, src->buf, src->stride, &sse, second_pred);
+  if (!use_mvcost) return var;
+  return var + mv_err_cost(&scaled_mv, center_mv, x->nmvjointcost,
+                           x->mvcost, x->errorperbit);
+}
+
+static int hex_search(const MACROBLOCK *x,  // wraps vp9_pattern_search()
+                      MV *ref_mv,
+                      int search_param,
+                      int sad_per_bit,
+                      int do_init_search,
+                      int *cost_list,
+                      const vp9_variance_fn_ptr_t *vfp,
+                      int use_mvcost,
+                      const MV *center_mv, MV *best_mv) {
+  // Scale 0 uses the 8 closest points; scales 1..10 use 6 points in a hex
+  // shape. Only the first hex_num_candidates[s] entries of a row are read.
+  static const int hex_num_candidates[MAX_PATTERN_SCALES] = {
+    8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6
+  };
+  // Note that the largest candidate step at each scale is 2^scale
+  static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+    {{-1, -1}, {0, -1}, {1, -1}, {1, 0}, {1, 1}, { 0, 1}, { -1, 1}, {-1, 0}},
+    {{-1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0}},
+    {{-2, -4}, {2, -4}, {4, 0}, {2, 4}, { -2, 4}, { -4, 0}},
+    {{-4, -8}, {4, -8}, {8, 0}, {4, 8}, { -4, 8}, { -8, 0}},
+    {{-8, -16}, {8, -16}, {16, 0}, {8, 16}, { -8, 16}, { -16, 0}},
+    {{-16, -32}, {16, -32}, {32, 0}, {16, 32}, { -16, 32}, { -32, 0}},
+    {{-32, -64}, {32, -64}, {64, 0}, {32, 64}, { -32, 64}, { -64, 0}},
+    {{-64, -128}, {64, -128}, {128, 0}, {64, 128}, { -64, 128}, { -128, 0}},
+    {{-128, -256}, {128, -256}, {256, 0}, {128, 256}, { -128, 256}, { -256, 0}},
+    {{-256, -512}, {256, -512}, {512, 0}, {256, 512}, { -256, 512}, { -512, 0}},
+    {{-512, -1024}, {512, -1024}, {1024, 0}, {512, 1024}, { -512, 1024},
+      { -1024, 0}},
+  };
+  return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
+                            do_init_search, cost_list, vfp, use_mvcost,
+                            center_mv, best_mv,
+                            hex_num_candidates, hex_candidates);
+}
+
+static int bigdia_search(const MACROBLOCK *x,  // wraps vp9_pattern_search_sad()
+                         MV *ref_mv,
+                         int search_param,
+                         int sad_per_bit,
+                         int do_init_search,
+                         int *cost_list,
+                         const vp9_variance_fn_ptr_t *vfp,
+                         int use_mvcost,
+                         const MV *center_mv,
+                         MV *best_mv) {
+  // First scale has 4-closest points, the rest have 8 points in diamond
+  // shape at increasing scales
+  static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
+    4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,  // 4 at scale 0 enables cost_list reuse
+  };
+  // Note that the largest candidate step at each scale is 2^scale
+  static const MV bigdia_candidates[MAX_PATTERN_SCALES]
+                                   [MAX_PATTERN_CANDIDATES] = {
+    {{0, -1}, {1, 0}, { 0, 1}, {-1, 0}},  // same order as cost_list[1..4]
+    {{-1, -1}, {0, -2}, {1, -1}, {2, 0}, {1, 1}, {0, 2}, {-1, 1}, {-2, 0}},
+    {{-2, -2}, {0, -4}, {2, -2}, {4, 0}, {2, 2}, {0, 4}, {-2, 2}, {-4, 0}},
+    {{-4, -4}, {0, -8}, {4, -4}, {8, 0}, {4, 4}, {0, 8}, {-4, 4}, {-8, 0}},
+    {{-8, -8}, {0, -16}, {8, -8}, {16, 0}, {8, 8}, {0, 16}, {-8, 8}, {-16, 0}},
+    {{-16, -16}, {0, -32}, {16, -16}, {32, 0}, {16, 16}, {0, 32},
+      {-16, 16}, {-32, 0}},
+    {{-32, -32}, {0, -64}, {32, -32}, {64, 0}, {32, 32}, {0, 64},
+      {-32, 32}, {-64, 0}},
+    {{-64, -64}, {0, -128}, {64, -64}, {128, 0}, {64, 64}, {0, 128},
+      {-64, 64}, {-128, 0}},
+    {{-128, -128}, {0, -256}, {128, -128}, {256, 0}, {128, 128}, {0, 256},
+      {-128, 128}, {-256, 0}},
+    {{-256, -256}, {0, -512}, {256, -256}, {512, 0}, {256, 256}, {0, 512},
+      {-256, 256}, {-512, 0}},
+    {{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024},
+      {-512, 512}, {-1024, 0}},
+  };
+  return vp9_pattern_search_sad(x, ref_mv, search_param, sad_per_bit,
+                                do_init_search, cost_list, vfp, use_mvcost,
+                                center_mv, best_mv,
+                                bigdia_num_candidates, bigdia_candidates);
+}
+
+static int square_search(const MACROBLOCK *x,  // wraps vp9_pattern_search()
+                         MV *ref_mv,
+                         int search_param,
+                         int sad_per_bit,
+                         int do_init_search,
+                         int *cost_list,
+                         const vp9_variance_fn_ptr_t *vfp,
+                         int use_mvcost,
+                         const MV *center_mv,
+                         MV *best_mv) {
+  // All scales have 8 closest points in square shape
+  static const int square_num_candidates[MAX_PATTERN_SCALES] = {
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  };
+  // Note that the largest candidate step at each scale is 2^scale
+  static const MV square_candidates[MAX_PATTERN_SCALES]
+                                   [MAX_PATTERN_CANDIDATES] = {
+    {{-1, -1}, {0, -1}, {1, -1}, {1, 0}, {1, 1}, {0, 1}, {-1, 1}, {-1, 0}},
+    {{-2, -2}, {0, -2}, {2, -2}, {2, 0}, {2, 2}, {0, 2}, {-2, 2}, {-2, 0}},
+    {{-4, -4}, {0, -4}, {4, -4}, {4, 0}, {4, 4}, {0, 4}, {-4, 4}, {-4, 0}},
+    {{-8, -8}, {0, -8}, {8, -8}, {8, 0}, {8, 8}, {0, 8}, {-8, 8}, {-8, 0}},
+    {{-16, -16}, {0, -16}, {16, -16}, {16, 0}, {16, 16}, {0, 16},
+      {-16, 16}, {-16, 0}},
+    {{-32, -32}, {0, -32}, {32, -32}, {32, 0}, {32, 32}, {0, 32},
+      {-32, 32}, {-32, 0}},
+    {{-64, -64}, {0, -64}, {64, -64}, {64, 0}, {64, 64}, {0, 64},
+      {-64, 64}, {-64, 0}},
+    {{-128, -128}, {0, -128}, {128, -128}, {128, 0}, {128, 128}, {0, 128},
+      {-128, 128}, {-128, 0}},
+    {{-256, -256}, {0, -256}, {256, -256}, {256, 0}, {256, 256}, {0, 256},
+      {-256, 256}, {-256, 0}},
+    {{-512, -512}, {0, -512}, {512, -512}, {512, 0}, {512, 512}, {0, 512},
+      {-512, 512}, {-512, 0}},
+    {{-1024, -1024}, {0, -1024}, {1024, -1024}, {1024, 0}, {1024, 1024},
+      {0, 1024}, {-1024, 1024}, {-1024, 0}},
+  };
+  return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
+                            do_init_search, cost_list, vfp, use_mvcost,
+                            center_mv, best_mv,
+                            square_num_candidates, square_candidates);
+}
+
+// hex_search() with search_param raised to at least MAX_MVSEARCH_STEPS - 2;
+// every other argument is forwarded unchanged.
+// NOTE(review): via search_param_to_steps[] this presumably limits the
+// search to scales 0 and 1 -- confirm MAX_MVSEARCH_STEPS == 11.
+static int fast_hex_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
+                           int sad_per_bit,
+                           int do_init_search,  // must be zero for fast_hex
+                           int *cost_list, const vp9_variance_fn_ptr_t *vfp,
+                           int use_mvcost, const MV *center_mv, MV *best_mv) {
+  const int param = VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param);
+
+  return hex_search(x, ref_mv, param, sad_per_bit, do_init_search, cost_list,
+                    vfp, use_mvcost, center_mv, best_mv);
+}
+
+// bigdia_search() with search_param raised to at least
+// MAX_MVSEARCH_STEPS - 2; every other argument is forwarded unchanged.
+// NOTE(review): via search_param_to_steps[] in vp9_pattern_search_sad()
+// this presumably limits the search to scales 0 and 1 -- confirm
+// MAX_MVSEARCH_STEPS == 11.
+static int fast_dia_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
+                           int sad_per_bit, int do_init_search,
+                           int *cost_list, const vp9_variance_fn_ptr_t *vfp,
+                           int use_mvcost, const MV *center_mv, MV *best_mv) {
+  const int param = VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param);
+
+  return bigdia_search(x, ref_mv, param, sad_per_bit, do_init_search,
+                       cost_list, vfp, use_mvcost, center_mv, best_mv);
+}
+
+#undef CHECK_BETTER
+
+// Exhaustive motion search around a given centre position with a given
+// step size.
+static int exhuastive_mesh_search(const MACROBLOCK *x,
+                                  MV *ref_mv, MV *best_mv,
+                                  int range, int step, int sad_per_bit,
+                                  const vp9_variance_fn_ptr_t *fn_ptr,
+                                  const MV *center_mv) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  MV fcenter_mv = {center_mv->row, center_mv->col};
+  unsigned int best_sad = INT_MAX;
+  int r, c, i;
+  int start_col, end_col, start_row, end_row;
+  int col_step = (step > 1) ? step : 4;
+
+  assert(step >= 1);
+
+  clamp_mv(&fcenter_mv, x->mv_col_min, x->mv_col_max,
+           x->mv_row_min, x->mv_row_max);
+  *best_mv = fcenter_mv;
+  best_sad = fn_ptr->sdf(what->buf, what->stride,
+             get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
+             mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit);
+  start_row = VPXMAX(-range, x->mv_row_min - fcenter_mv.row);
+  start_col = VPXMAX(-range, x->mv_col_min - fcenter_mv.col);
+  end_row = VPXMIN(range, x->mv_row_max - fcenter_mv.row);
+  end_col = VPXMIN(range, x->mv_col_max - fcenter_mv.col);
+
+  for (r = start_row; r <= end_row; r += step) {
+    for (c = start_col; c <= end_col; c += col_step) {
+      // Step > 1 means we are not checking every location in this pass.
+      if (step > 1) {
+        const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c};
+        unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                           get_buf_from_mv(in_what, &mv), in_what->stride);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
+            *best_mv = mv;
+          }
+        }
+      } else {
+        // 4 sads in a single call if we are checking every location
+        if (c + 3 <= end_col) {
+          unsigned int sads[4];
+          const uint8_t *addrs[4];
+          for (i = 0; i < 4; ++i) {
+            const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+            addrs[i] = get_buf_from_mv(in_what, &mv);
+          }
+          fn_ptr->sdx4df(what->buf, what->stride, addrs,
+                         in_what->stride, sads);
+
+          for (i = 0; i < 4; ++i) {
+            if (sads[i] < best_sad) {
+              const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+              const unsigned int sad = sads[i] +
+                  mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
+            }
+          }
+        } else {
+          for (i = 0; i < end_col - c; ++i) {
+            const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+            unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                get_buf_from_mv(in_what, &mv), in_what->stride);
+            if (sad < best_sad) {
+              sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return best_sad;
+}
+
+int vp9_diamond_search_sad_c(const MACROBLOCK *x,
+                             const search_site_config *cfg,
+                             MV *ref_mv, MV *best_mv, int search_param,
+                             int sad_per_bit, int *num00,
+                             const vp9_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv) {
+  int i, j, step;
+  // Diamond search from ref_mv (clamped below); best position -> *best_mv.
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const uint8_t *in_what;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *best_address;
+
+  unsigned int bestsad = INT_MAX;
+  int best_site = -1;
+  int last_site = -1;
+
+  int ref_row;
+  int ref_col;
+
+  // search_param determines the length of the initial step and hence the number
+  // of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel...
+  // Site tables (MV deltas and buffer offsets) starting at step search_param.
+  const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step];
+  const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step];
+  const int tot_steps = cfg->total_steps - search_param;
+
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  ref_row = ref_mv->row;
+  ref_col = ref_mv->col;
+  *num00 = 0;
+  best_mv->row = ref_row;
+  best_mv->col = ref_col;
+
+  // Work out the start point for the search
+  in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+  best_address = in_what;
+
+  // Check the starting position
+  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+                + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+
+  i = 0;
+
+  for (step = 0; step < tot_steps; step++) {
+    int all_in = 1, t;
+
+    // All_in is true if every one of the points we are checking are within
+    // the bounds of the image.
+    all_in &= ((best_mv->row + ss_mv[i].row) > x->mv_row_min);
+    all_in &= ((best_mv->row + ss_mv[i + 1].row) < x->mv_row_max);
+    all_in &= ((best_mv->col + ss_mv[i + 2].col) > x->mv_col_min);
+    all_in &= ((best_mv->col + ss_mv[i + 3].col) < x->mv_col_max);
+
+    // If all the pixels are within the bounds we don't check whether the
+    // search point is valid in this loop,  otherwise we check each point
+    // for validity..
+    if (all_in) {
+      unsigned int sad_array[4];
+
+      for (j = 0; j < cfg->searches_per_step; j += 4) {
+        unsigned char const *block_offset[4];
+
+        for (t = 0; t < 4; t++)
+          block_offset[t] = ss_os[i + t] + best_address;
+
+        fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+                       sad_array);
+
+        for (t = 0; t < 4; t++, i++) {
+          if (sad_array[t] < bestsad) {
+            const MV this_mv = {best_mv->row + ss_mv[i].row,
+                                best_mv->col + ss_mv[i].col};
+            sad_array[t] += mvsad_err_cost(x, &this_mv, &fcenter_mv,
+                                           sad_per_bit);
+            if (sad_array[t] < bestsad) {
+              bestsad = sad_array[t];
+              best_site = i;
+            }
+          }
+        }
+      }
+    } else {
+      for (j = 0; j < cfg->searches_per_step; j++) {
+        // Trap illegal vectors
+        const MV this_mv = {best_mv->row + ss_mv[i].row,
+                            best_mv->col + ss_mv[i].col};
+
+        if (is_mv_in(x, &this_mv)) {
+          const uint8_t *const check_here = ss_os[i] + best_address;
+          unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
+                                             in_what_stride);
+
+          if (thissad < bestsad) {
+            thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_site = i;
+            }
+          }
+        }
+        i++;
+      }
+    }
+    if (best_site != last_site) {
+      best_mv->row += ss_mv[best_site].row;
+      best_mv->col += ss_mv[best_site].col;
+      best_address += ss_os[best_site];
+      last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+      while (1) {
+        const MV this_mv = {best_mv->row + ss_mv[best_site].row,
+                            best_mv->col + ss_mv[best_site].col};
+        if (is_mv_in(x, &this_mv)) {
+          const uint8_t *const check_here = ss_os[best_site] + best_address;
+          unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
+                                             in_what_stride);
+          if (thissad < bestsad) {
+            thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_mv->row += ss_mv[best_site].row;
+              best_mv->col += ss_mv[best_site].col;
+              best_address += ss_os[best_site];
+              continue;
+            }
+          }
+        }
+        break;
+      }
+#endif
+    } else if (best_address == in_what) {  // best is still the start point
+      (*num00)++;
+    }
+  }
+  return bestsad;
+}
+
+// Refinement step for vector_match().  Scores the two offsets that lie
+// `radius` away from `anchor` (the lower one first) with vpx_vector_var()
+// and returns the best offset seen, which remains `anchor` unless a
+// candidate scores strictly below *best_sad.  *best_sad tracks the lowest
+// score.  Offsets outside [0, limit] are not evaluated.
+static int refine_vector_match(int16_t *ref, int16_t *src, int bwl,
+                               int limit, int anchor, int radius,
+                               int *best_sad) {
+  int winner = anchor;
+  int side;
+
+  // side = -1 probes below the anchor, side = +1 above it.
+  for (side = -1; side <= 1; side += 2) {
+    const int cand = anchor + side * radius;
+
+    if (cand >= 0 && cand <= limit) {
+      const int cand_sad = vpx_vector_var(&ref[cand], src, bwl);
+
+      if (cand_sad < *best_sad) {
+        *best_sad = cand_sad;
+        winner = cand;
+      }
+    }
+  }
+
+  return winner;
+}
+
+// Coarse-to-fine 1-D match of src against ref.
+//
+// Candidate offsets into ref lie in [0, bw] with bw = 4 << bwl.  Each one is
+// scored with vpx_vector_var(&ref[offset], src, bwl) and the lowest score
+// wins; on a tie the earlier candidate is kept.  Every 16th offset is scored
+// first, then the best offset is refined by probing both neighbours at
+// distances 8, 4, 2 and 1 in turn, re-centring after each distance.
+//
+//   ref: reference vector; candidates are taken at ref[0] .. ref[bw].
+//   src: source vector, handed unchanged to vpx_vector_var().
+//   bwl: size code; determines bw and is forwarded to vpx_vector_var().
+//
+// Returns the winning offset relative to the midpoint of the range, i.e.
+// offset - bw / 2.
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+  const int bw = 4 << bwl;
+  int best_sad = INT_MAX;
+  // Stays 0 if no coarse candidate scores below INT_MAX.
+  int best_pos = 0;
+  int pos;
+  int radius;
+
+  // Coarse pass over every 16th offset.
+  for (pos = 0; pos <= bw; pos += 16) {
+    const int cand_sad = vpx_vector_var(&ref[pos], src, bwl);
+
+    if (cand_sad < best_sad) {
+      best_sad = cand_sad;
+      best_pos = pos;
+    }
+  }
+
+  // Fine passes: halve the probe distance after each refinement.
+  for (radius = 8; radius >= 1; radius >>= 1) {
+    best_pos = refine_vector_match(ref, src, bwl, bw, best_pos, radius,
+                                   &best_sad);
+  }
+
+  return best_pos - (bw >> 1);
+}
+
+static const MV search_pos[4] = {
+    {-1, 0}, {0, -1}, {0, 1}, {1, 0},  // (row, col): above, left, right, below
+};
+// Motion estimate from 1-D vpx_int_pro_row()/_col() buffers plus a SAD refine.
+unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
+                                           BLOCK_SIZE bsize,
+                                           int mi_row, int mi_col) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mi[0];
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
+  DECLARE_ALIGNED(16, int16_t, hbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, vbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
+  DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
+  int idx;
+  const int bw = 4 << b_width_log2_lookup[bsize];
+  const int bh = 4 << b_height_log2_lookup[bsize];
+  const int search_width = bw << 1;
+  const int search_height = bh << 1;
+  const int src_stride = x->plane[0].src.stride;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+  uint8_t const *ref_buf, *src_buf;
+  MV *tmp_mv = &xd->mi[0]->mv[0].as_mv;
+  unsigned int best_sad, tmp_sad, this_sad[4];
+  MV this_mv;
+  const int norm_factor = 3 + (bw >> 5);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
+
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  {  // high bit depth: only the zero-MV SAD is computed and returned
+    unsigned int this_sad;
+    tmp_mv->row = 0;
+    tmp_mv->col = 0;
+    this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+                                      xd->plane[0].pre[0].buf, ref_stride);
+
+    if (scaled_ref_frame) {
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        xd->plane[i].pre[0] = backup_yv12[i];
+    }
+    return this_sad;
+  }
+#endif
+
+  // Set up prediction 1-D reference set
+  ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
+  for (idx = 0; idx < search_width; idx += 16) {
+    vpx_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+    ref_buf += 16;
+  }
+
+  ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
+  for (idx = 0; idx < search_height; ++idx) {
+    vbuf[idx] = vpx_int_pro_col(ref_buf, bw) >> norm_factor;
+    ref_buf += ref_stride;
+  }
+
+  // Set up src 1-D reference set
+  for (idx = 0; idx < bw; idx += 16) {
+    src_buf = x->plane[0].src.buf + idx;
+    vpx_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+  }
+
+  src_buf = x->plane[0].src.buf;
+  for (idx = 0; idx < bh; ++idx) {
+    src_vbuf[idx] = vpx_int_pro_col(src_buf, bw) >> norm_factor;
+    src_buf += src_stride;
+  }
+
+  // Find the best match per 1-D search
+  tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
+  tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);
+
+  this_mv = *tmp_mv;
+  src_buf = x->plane[0].src.buf;
+  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+  best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+  {
+    const uint8_t * const pos[4] = {
+        ref_buf - ref_stride,
+        ref_buf - 1,
+        ref_buf + 1,
+        ref_buf + ref_stride,
+    };
+
+    cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
+  }
+
+  for (idx = 0; idx < 4; ++idx) {
+    if (this_sad[idx] < best_sad) {
+      best_sad = this_sad[idx];
+      tmp_mv->row = search_pos[idx].row + this_mv.row;
+      tmp_mv->col = search_pos[idx].col + this_mv.col;
+    }
+  }
+  // Then test the diagonal made of the better vertical and horizontal move.
+  if (this_sad[0] < this_sad[3])
+    this_mv.row -= 1;
+  else
+    this_mv.row += 1;
+
+  if (this_sad[1] < this_sad[2])
+    this_mv.col -= 1;
+  else
+    this_mv.col += 1;
+
+  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+
+  tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride,
+                                   ref_buf, ref_stride);
+  if (best_sad > tmp_sad) {
+    *tmp_mv = this_mv;
+    best_sad = tmp_sad;
+  }
+
+  tmp_mv->row *= 8;  // NOTE(review): presumably full-pel -> 1/8-pel units
+  tmp_mv->col *= 8;
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  return best_sad;
+}
+
+// Runs a sequence of diamond searches with decreasing first-step size for RD
+// and keeps the candidate with the lowest vp9_get_mvpred_var() score.
+// do_refine: if the last (1-away) step of the n-step search doesn't pick the
+// centre point as the best match, do a final 1-away diamond refining search.
+static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
+                              MV *mvp_full, int step_param,
+                              int sadpb, int further_steps, int do_refine,
+                              int *cost_list,
+                              const vp9_variance_fn_ptr_t *fn_ptr,
+                              const MV *ref_mv, MV *dst_mv) {
+  MV temp_mv;
+  int thissme, n, num00 = 0;
+  int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
+                                        step_param, sadpb, &n,  // n: num00 slot
+                                        fn_ptr, ref_mv);
+  if (bestsme < INT_MAX)
+    bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+  *dst_mv = temp_mv;
+
+  // If there won't be more n-step search, check to see if refining search is
+  // needed.
+  if (n > further_steps)
+    do_refine = 0;
+
+  while (n < further_steps) {
+    ++n;
+
+    if (num00) {
+      num00--;
+    } else {
+      thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
+                                        step_param + n, sadpb, &num00,
+                                        fn_ptr, ref_mv);
+      if (thissme < INT_MAX)
+        thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+
+      // check to see if refining search is needed.
+      if (num00 > further_steps - n)
+        do_refine = 0;
+
+      if (thissme < bestsme) {
+        bestsme = thissme;
+        *dst_mv = temp_mv;
+      }
+    }
+  }
+
+  // final 1-away diamond refining search
+  if (do_refine) {
+    const int search_range = 8;
+    MV best_mv = *dst_mv;
+    thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range,
+                                       fn_ptr, ref_mv);
+    if (thissme < INT_MAX)
+      thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
+    if (thissme < bestsme) {
+      bestsme = thissme;
+      *dst_mv = best_mv;
+    }
+  }
+
+  // Return cost list.
+  if (cost_list) {
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+  }
+  return bestsme;
+}
+
+#define MIN_RANGE 7
+#define MAX_RANGE 256
+#define MIN_INTERVAL 1
+// Runs a limited range exhaustive mesh search using a pattern set
+// according to the encode speed profile.
+static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
+                                 MV *centre_mv_full, int sadpb,  int *cost_list,
+                                 const vp9_variance_fn_ptr_t *fn_ptr,
+                                 const MV *ref_mv, MV *dst_mv) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MV temp_mv = {centre_mv_full->row, centre_mv_full->col};
+  MV f_ref_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+  int bestsme;
+  int i;
+  int interval = sf->mesh_patterns[0].interval;
+  int range = sf->mesh_patterns[0].range;
+  int baseline_interval_divisor;
+
+  // Keep track of number of exhaustive calls (this frame in this thread).
+  ++(*x->ex_search_count_ptr);
+
+  // Trap illegal values for interval and range for this function.
+  if ((range < MIN_RANGE) || (range > MAX_RANGE) ||
+      (interval < MIN_INTERVAL) || (interval > range))
+    return INT_MAX;
+
+  baseline_interval_divisor = range / interval;
+
+  // Check size of proposed first range against magnitude of the centre
+  // value used as a starting point.
+  range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
+  range = VPXMIN(range, MAX_RANGE);
+  interval = VPXMAX(interval, range / baseline_interval_divisor);
+
+  // initial search
+  bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range,
+                                  interval, sadpb, fn_ptr, &temp_mv);
+
+  if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+    // Progressive searches with range and step size decreasing each time
+    // till we reach a step size of 1. Then break out.
+    for (i = 1; i < MAX_MESH_STEP; ++i) {
+      // Re-centre on the best MV so far and search with mesh pattern i.
+      bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv,
+                                       sf->mesh_patterns[i].range,
+                                       sf->mesh_patterns[i].interval,
+                                       sadpb, fn_ptr, &temp_mv);
+
+      if (sf->mesh_patterns[i].interval == 1)
+        break;
+    }
+  }
+
+  if (bestsme < INT_MAX)
+    bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+  *dst_mv = temp_mv;
+
+  // Return cost list.
+  if (cost_list) {
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+  }
+  return bestsme;
+}
+
+// Plain C full search over the +/-distance window around ref_mv (clipped to
+// the mv limits); stores the cheapest position in *best_mv, returns its cost.
+int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
+                          int sad_per_bit, int distance,
+                          const vp9_variance_fn_ptr_t *fn_ptr,
+                          const MV *center_mv, MV *best_mv) {
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const int top = VPXMAX(ref_mv->row - distance, x->mv_row_min);
+  const int bottom = VPXMIN(ref_mv->row + distance, x->mv_row_max);
+  const int left = VPXMAX(ref_mv->col - distance, x->mv_col_min);
+  const int right = VPXMIN(ref_mv->col + distance, x->mv_col_max);
+  const MV cost_center = {center_mv->row >> 3, center_mv->col >> 3};
+  int lowest = fn_ptr->sdf(src->buf, src->stride,
+                           get_buf_from_mv(ref, ref_mv), ref->stride) +
+               mvsad_err_cost(x, ref_mv, &cost_center, sad_per_bit);
+  int row, col;
+
+  *best_mv = *ref_mv;
+  for (row = top; row < bottom; ++row) {
+    for (col = left; col < right; ++col) {
+      const MV cand = {row, col};
+      const int cost = fn_ptr->sdf(src->buf, src->stride,
+                                   get_buf_from_mv(ref, &cand), ref->stride) +
+                       mvsad_err_cost(x, &cand, &cost_center, sad_per_bit);
+      if (cost >= lowest) continue;
+      lowest = cost;
+      *best_mv = cand;
+    }
+  }
+  return lowest;
+}
+
+int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
+                          int sad_per_bit, int distance,
+                          const vp9_variance_fn_ptr_t *fn_ptr,
+                          const MV *center_mv, MV *best_mv) {
+  int r;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const int row_min = VPXMAX(ref_mv->row - distance, x->mv_row_min);
+  const int row_max = VPXMIN(ref_mv->row + distance, x->mv_row_max);
+  const int col_min = VPXMAX(ref_mv->col - distance, x->mv_col_min);
+  const int col_max = VPXMIN(ref_mv->col + distance, x->mv_col_max);
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
+      get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+  *best_mv = *ref_mv;
+  // Each row: triples of columns via sdx3f (if set), then leftovers via sdf.
+  for (r = row_min; r < row_max; ++r) {
+    int c = col_min;
+    const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
+
+    if (fn_ptr->sdx3f != NULL) {
+      while ((c + 2) < col_max) {
+        int i;
+        DECLARE_ALIGNED(16, uint32_t, sads[3]);
+
+        fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
+                      sads);
+
+        for (i = 0; i < 3; ++i) {
+          unsigned int sad = sads[i];
+          if (sad < best_sad) {
+            const MV mv = {r, c};
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
+            }
+          }
+          ++check_here;  // pointer and column index advance together
+          ++c;
+        }
+      }
+    }
+
+    while (c < col_max) {
+      unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                                     check_here, in_what->stride);
+      if (sad < best_sad) {
+        const MV mv = {r, c};
+        sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+        if (sad < best_sad) {
+          best_sad = sad;
+          *best_mv = mv;
+        }
+      }
+      ++check_here;
+      ++c;
+    }
+  }
+
+  return best_sad;
+}
+
+int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
+                          int sad_per_bit, int distance,
+                          const vp9_variance_fn_ptr_t *fn_ptr,
+                          const MV *center_mv, MV *best_mv) {
+  int r;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const int row_min = VPXMAX(ref_mv->row - distance, x->mv_row_min);
+  const int row_max = VPXMIN(ref_mv->row + distance, x->mv_row_max);
+  const int col_min = VPXMAX(ref_mv->col - distance, x->mv_col_min);
+  const int col_max = VPXMIN(ref_mv->col + distance, x->mv_col_max);
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
+      get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+  *best_mv = *ref_mv;
+  // Each row: sdx8f in runs of 8, then sdx3f in runs of 3, then sdf singly.
+  for (r = row_min; r < row_max; ++r) {
+    int c = col_min;
+    const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
+
+    if (fn_ptr->sdx8f != NULL) {
+      while ((c + 7) < col_max) {
+        int i;
+        DECLARE_ALIGNED(16, uint32_t, sads[8]);
+
+        fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
+                      sads);
+
+        for (i = 0; i < 8; ++i) {
+          unsigned int sad = sads[i];
+          if (sad < best_sad) {
+            const MV mv = {r, c};
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
+            }
+          }
+          ++check_here;  // pointer and column index advance together
+          ++c;
+        }
+      }
+    }
+
+    if (fn_ptr->sdx3f != NULL) {
+      while ((c + 2) < col_max) {
+        int i;
+        DECLARE_ALIGNED(16, uint32_t, sads[3]);
+
+        fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
+                      sads);
+
+        for (i = 0; i < 3; ++i) {
+          unsigned int sad = sads[i];
+          if (sad < best_sad) {
+            const MV mv = {r, c};
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
+            }
+          }
+          ++check_here;
+          ++c;
+        }
+      }
+    }
+
+    while (c < col_max) {
+      unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                                     check_here, in_what->stride);
+      if (sad < best_sad) {
+        const MV mv = {r, c};
+        sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+        if (sad < best_sad) {
+          best_sad = sad;
+          *best_mv = mv;
+        }
+      }
+      ++check_here;
+      ++c;
+    }
+  }
+
+  return best_sad;
+}
+
+int vp9_refining_search_sad(const MACROBLOCK *x,
+                            MV *ref_mv, int error_per_bit,
+                            int search_range,
+                            const vp9_variance_fn_ptr_t *fn_ptr,
+                            const MV *center_mv) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
+  unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, best_address,
+                                    in_what->stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+  int i, j;
+  // Up to search_range rounds of moving ref_mv to its cheapest 4-neighbour.
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+    const int all_in = ((ref_mv->row - 1) > x->mv_row_min) &
+                       ((ref_mv->row + 1) < x->mv_row_max) &
+                       ((ref_mv->col - 1) > x->mv_col_min) &
+                       ((ref_mv->col + 1) < x->mv_col_max);
+    // Fast path: all four neighbours are in range, so score them in one call.
+    if (all_in) {
+      unsigned int sads[4];
+      const uint8_t *const positions[4] = {
+        best_address - in_what->stride,
+        best_address - 1,
+        best_address + 1,
+        best_address + in_what->stride
+      };
+
+      fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
+
+      for (j = 0; j < 4; ++j) {
+        if (sads[j] < best_sad) {
+          const MV mv = {ref_mv->row + neighbors[j].row,
+                         ref_mv->col + neighbors[j].col};
+          sads[j] += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+          if (sads[j] < best_sad) {
+            best_sad = sads[j];
+            best_site = j;
+          }
+        }
+      }
+    } else {
+      for (j = 0; j < 4; ++j) {
+        const MV mv = {ref_mv->row + neighbors[j].row,
+                       ref_mv->col + neighbors[j].col};
+
+        if (is_mv_in(x, &mv)) {
+          unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                                         get_buf_from_mv(in_what, &mv),
+                                         in_what->stride);
+          if (sad < best_sad) {
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              best_site = j;
+            }
+          }
+        }
+      }
+    }
+
+    if (best_site == -1) {  // no neighbour was cheaper: stop
+      break;
+    } else {
+      ref_mv->row += neighbors[best_site].row;
+      ref_mv->col += neighbors[best_site].col;
+      best_address = get_buf_from_mv(in_what, ref_mv);
+    }
+  }
+
+  return best_sad;
+}
+
+// Eight-neighbour refining search scored with fn_ptr->sdaf against
+// second_pred.  Called when we do joint motion search in comp_inter_inter
+// mode.  ref_mv is moved in place to its cheapest neighbour for at most
+// search_range rounds; the cost of the final position is returned.
+int vp9_refining_search_8p_c(const MACROBLOCK *x,
+                             MV *ref_mv, int error_per_bit,
+                             int search_range,
+                             const vp9_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv,
+                             const uint8_t *second_pred) {
+  const MV steps[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
+                       {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV cost_center = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int lowest = fn_ptr->sdaf(src->buf, src->stride,
+                                     get_buf_from_mv(ref, ref_mv),
+                                     ref->stride, second_pred) +
+                        mvsad_err_cost(x, ref_mv, &cost_center, error_per_bit);
+  int round;
+
+  for (round = 0; round < search_range; ++round) {
+    int winner = -1;
+    int k;
+
+    for (k = 0; k < 8; ++k) {
+      const MV cand = {ref_mv->row + steps[k].row,
+                       ref_mv->col + steps[k].col};
+      unsigned int cost;
+
+      if (!is_mv_in(x, &cand)) continue;
+      cost = fn_ptr->sdaf(src->buf, src->stride, get_buf_from_mv(ref, &cand),
+                          ref->stride, second_pred);
+      // Charge the mv cost only when the raw SAD alone already wins.
+      if (cost >= lowest) continue;
+      cost += mvsad_err_cost(x, &cand, &cost_center, error_per_bit);
+      if (cost >= lowest) continue;
+      lowest = cost;
+      winner = k;
+    }
+
+    // No neighbour improved on the current position: converged.
+    if (winner < 0) break;
+    ref_mv->row += steps[winner].row;
+    ref_mv->col += steps[winner].col;
+  }
+  return lowest;
+}
+
+#define MIN_EX_SEARCH_LIMIT 128
+// Nonzero when a follow-on exhaustive search is permitted (see checks below).
+static int is_exhaustive_allowed(VP9_COMP *cpi, MACROBLOCK *x) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const int budget = VPXMAX(MIN_EX_SEARCH_LIMIT,
+      (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+  if (!sf->allow_exhaustive_searches) return 0;
+  if (sf->exhaustive_searches_thresh >= INT_MAX) return 0;
+  if (*x->ex_search_count_ptr > budget) return 0;  // budget already used up
+  return !cpi->rc.is_src_frame_alt_ref;
+}
+
+int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
+                          BLOCK_SIZE bsize, MV *mvp_full,
+                          int step_param, int error_per_bit,
+                          int *cost_list,
+                          const MV *ref_mv, MV *tmp_mv,
+                          int var_max, int rd) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const SEARCH_METHODS method = sf->mv.search_method;
+  vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+  int var = 0;
+  if (cost_list) {
+    cost_list[0] = INT_MAX;
+    cost_list[1] = INT_MAX;
+    cost_list[2] = INT_MAX;
+    cost_list[3] = INT_MAX;
+    cost_list[4] = INT_MAX;
+  }
+
+  // Keep track of number of searches (this frame in this thread).
+  ++(*x->m_search_count_ptr);
+  // Run the search selected by sf->mv.search_method; the MV goes to *tmp_mv.
+  switch (method) {
+    case FAST_DIAMOND:
+      var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+                            cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case FAST_HEX:
+      var = fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+                            cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case HEX:
+      var = hex_search(x, mvp_full, step_param, error_per_bit, 1,
+                       cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case SQUARE:
+      var = square_search(x, mvp_full, step_param, error_per_bit, 1,
+                          cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case BIGDIA:
+      var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
+                          cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      break;
+    case NSTEP:
+      var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+                               MAX_MVSEARCH_STEPS - 1 - step_param,
+                               1, cost_list, fn_ptr, ref_mv, tmp_mv);
+
+      // Should we allow a follow on exhaustive search?
+      if (is_exhaustive_allowed(cpi, x)) {
+        int64_t exhuastive_thr = sf->exhaustive_searches_thresh;
+        exhuastive_thr >>= 8 - (b_width_log2_lookup[bsize] +
+                                b_height_log2_lookup[bsize]);
+
+        // Threshold variance for an exhaustive full search.
+        if (var > exhuastive_thr) {
+            int var_ex;
+          MV tmp_mv_ex;
+          var_ex = full_pixel_exhaustive(cpi, x, tmp_mv,
+                                         error_per_bit, cost_list, fn_ptr,
+                                         ref_mv, &tmp_mv_ex);
+
+          if (var_ex < var) {  // keep the exhaustive result only if it wins
+            var = var_ex;
+            *tmp_mv = tmp_mv_ex;
+          }
+        }
+      }
+      break;
+    default:
+      assert(0 && "Invalid search method.");
+  }
+  // Non-NSTEP results are re-scored with vp9_get_mvpred_var() when rd is set.
+  if (method != NSTEP && rd && var < var_max)
+    var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1);
+
+  return var;
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_mcomp.h b/libs/libvpx/vp9/encoder/vp9_mcomp.h
new file mode 100644
index 0000000000..1c101f2e20
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_mcomp.h
@@ -0,0 +1,135 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_MCOMP_H_
+#define VP9_ENCODER_VP9_MCOMP_H_
+
+#include "vp9/encoder/vp9_block.h"
+#include "vpx_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The maximum number of steps in a step search given the largest
+// allowed initial step.
+#define MAX_MVSEARCH_STEPS 11
+// Largest full-pel motion vector component, in units of full pixels:
+// (1 << 10) - 1 = 1023, enabling motion vectors in range [-1023, 1023].
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
+// Maximum size of the first step in full-pel units (1 << 10 = 1024).
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
+// Allowed motion vector pixel distance outside the image border
+// for 16x16 blocks.
+#define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND)
+
+typedef struct search_site_config {
+  // Motion search sites: two arrays of 8 * MAX_MVSEARCH_STEPS entries each.
+  MV  ss_mv[8 * MAX_MVSEARCH_STEPS];        // Motion vector
+  intptr_t ss_os[8 * MAX_MVSEARCH_STEPS];   // Offset
+  int searches_per_step;  // presumably sites tried per step -- TODO confirm
+  int total_steps;        // presumably steps populated -- TODO confirm
+} search_site_config;
+
+void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride);
+void vp9_init3smotion_compensation(search_site_config *cfg,  int stride);
+
+void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv);
+int vp9_mv_bit_cost(const MV *mv, const MV *ref,
+                    const int *mvjcost, int *mvcost[2], int weight);
+
+// Utility to compute variance + MV rate cost for a given MV.
+int vp9_get_mvpred_var(const MACROBLOCK *x,
+                       const MV *best_mv, const MV *center_mv,
+                       const vp9_variance_fn_ptr_t *vfp,
+                       int use_mvcost);
+int vp9_get_mvpred_av_var(const MACROBLOCK *x,
+                          const MV *best_mv, const MV *center_mv,
+                          const uint8_t *second_pred,
+                          const vp9_variance_fn_ptr_t *vfp,
+                          int use_mvcost);
+
+struct VP9_COMP;  // Only used through pointers in this header.
+struct SPEED_FEATURES;  // NOTE(review): not referenced in this header.
+
+int vp9_init_search_range(int size);
+
+int vp9_refining_search_sad(const struct macroblock *x,
+                            struct mv *ref_mv,
+                            int sad_per_bit, int distance,
+                            const struct vp9_variance_vtable *fn_ptr,
+                            const struct mv *center_mv);
+
+// Perform integral projection based motion estimation.
+unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi,
+                                           MACROBLOCK *x,
+                                           BLOCK_SIZE bsize,
+                                           int mi_row, int mi_col);
+// Function type (not a pointer type) shared by the sub-pixel searches below.
+typedef int (fractional_mv_step_fp) (
+    const MACROBLOCK *x,
+    MV *bestmv, const MV *ref_mv,
+    int allow_hp,
+    int error_per_bit,
+    const vp9_variance_fn_ptr_t *vfp,
+    int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
+    int iters_per_step,
+    int *cost_list,
+    int *mvjcost, int *mvcost[2],
+    int *distortion, unsigned int *sse1,
+    const uint8_t *second_pred,
+    int w, int h);
+
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_evenmore;
+
+typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
+                                    const MV *ref_mv, int sad_per_bit,
+                                    int distance,
+                                    const vp9_variance_fn_ptr_t *fn_ptr,
+                                    const MV *center_mv, MV *best_mv);
+
+typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x,
+                                        MV *ref_mv, int sad_per_bit,
+                                        int distance,
+                                        const vp9_variance_fn_ptr_t *fn_ptr,
+                                        const MV *center_mv);
+
+typedef int (*vp9_diamond_search_fn_t)(const MACROBLOCK *x,
+                                       const search_site_config *cfg,
+                                       MV *ref_mv, MV *best_mv,
+                                       int search_param, int sad_per_bit,
+                                       int *num00,
+                                       const vp9_variance_fn_ptr_t *fn_ptr,
+                                       const MV *center_mv);
+
+int vp9_refining_search_8p_c(const MACROBLOCK *x,
+                             MV *ref_mv, int error_per_bit,
+                             int search_range,
+                             const vp9_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv, const uint8_t *second_pred);
+
+struct VP9_COMP;  // NOTE(review): redundant, already forward-declared above.
+
+int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x,
+                          BLOCK_SIZE bsize, MV *mvp_full,
+                          int step_param, int error_per_bit,
+                          int *cost_list,
+                          const MV *ref_mv, MV *tmp_mv,
+                          int var_max, int rd);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_MCOMP_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_noise_estimate.c b/libs/libvpx/vp9/encoder/vp9_noise_estimate.c
new file mode 100644
index 0000000000..e56cc9b017
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_noise_estimate.c
@@ -0,0 +1,249 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_noise_estimate.h"
+#include "vp9/encoder/vp9_encoder.h"
+
+void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne,
+                             int width,
+                             int height) {
+  // The frame area selects the threshold; all other state starts cleared.
+  const int num_pixels = width * height;
+  ne->enabled = 0;
+  ne->level = kLowLow;
+  ne->value = 0;
+  ne->count = 0;
+  ne->last_w = 0;
+  ne->last_h = 0;
+  ne->num_frames_estimate = 20;
+  // 200 at 1920x1080 and above, 130 at 1280x720 and above, 90 otherwise.
+  ne->thresh = (num_pixels >= 1920 * 1080) ? 200
+             : (num_pixels >= 1280 * 720) ? 130
+             : 90;
+}
+
+int enable_noise_estimation(VP9_COMP *const cpi) {
+  // Gate 1: encoder configuration. 1 pass CBR with cyclic refresh AQ at
+  // speed >= 5, excluding SVC mode and screen content.
+  const int config_ok = cpi->oxcf.pass == 0 &&
+                        cpi->oxcf.rc_mode == VPX_CBR &&
+                        cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+                        cpi->oxcf.speed >= 5 &&
+                        !cpi->use_svc &&
+                        cpi->oxcf.content != VP9E_CONTENT_SCREEN;
+  // Gate 2: frame geometry. No resize active or pending, and the frame is
+  // at least 640x480 (low resolutions are excluded).
+  const int frame_ok = cpi->resize_state == ORIG &&
+                       cpi->resize_pending == 0 &&
+                       cpi->common.width >= 640 &&
+                       cpi->common.height >= 480;
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  // Denoising together with cyclic refresh always enables the estimate and
+  // bypasses both gates (cyclic refresh is required because the noise
+  // estimate currently uses a struct defined in cyclic refresh).
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    return 1;
+#endif
+  return config_ok && frame_ok;
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void copy_frame(YV12_BUFFER_CONFIG * const dest,
+                       const YV12_BUFFER_CONFIG * const src) {
+  // Copies only the Y plane, row by row; each side advances by its own stride.
+  const uint8_t *src_row = src->y_buffer;
+  uint8_t *dest_row = dest->y_buffer;
+  int rows_left = dest->y_height;
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+
+  for (; rows_left > 0; --rows_left) {
+    memcpy(dest_row, src_row, dest->y_width);
+    dest_row += dest->y_stride;
+    src_row += src->y_stride;
+  }
+}
+#endif  // CONFIG_VP9_TEMPORAL_DENOISING
+
+NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
+  // Map the running estimate onto a level by comparing it against
+  // 2x, 1x and 0.5x of the threshold, highest band first.
+  const int estimate = ne->value;
+  const int double_thresh = ne->thresh << 1;
+  const int half_thresh = ne->thresh >> 1;
+  if (estimate > double_thresh)
+    return kHigh;
+  if (estimate > ne->thresh)
+    return kMedium;
+  if (estimate > half_thresh)
+    return kLow;
+  return kLowLow;
+}
+
+void vp9_update_noise_estimate(VP9_COMP *const cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  NOISE_ESTIMATE *const ne = &cpi->noise_estimate;
+  // Update the noise level estimate once every frame_period frames.
+  int frame_period = 10;
+  int thresh_consec_zeromv = 8;
+  unsigned int thresh_sum_diff = 100;
+  unsigned int thresh_sum_spatial = (200 * 200) << 8;
+  unsigned int thresh_spatial_var = (32 * 32) << 8;
+  int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7;
+  // Estimate is between current source and last source.
+  YV12_BUFFER_CONFIG *last_source = cpi->Last_Source;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0)
+    last_source = &cpi->denoiser.last_source;
+#endif
+  ne->enabled = enable_noise_estimation(cpi);
+  if (!ne->enabled ||
+      cm->current_video_frame % frame_period != 0 ||
+      last_source == NULL ||
+      ne->last_w != cm->width ||
+      ne->last_h != cm->height) {
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0)
+    copy_frame(&cpi->denoiser.last_source, cpi->Source);
+#endif
+    if (last_source != NULL) {
+      ne->last_w = cm->width;
+      ne->last_h = cm->height;
+    }
+    return;  // No estimate is computed for this frame.
+  } else {
+    int num_samples = 0;
+    uint64_t avg_est = 0;
+    int bsize = BLOCK_16X16;
+    static const unsigned char const_source[16] = {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have
+    // been encoded as zero/small mv at least x consecutive frames, compute
+    // the variance to update estimate of noise in the source.
+    const uint8_t *src_y = cpi->Source->y_buffer;
+    const int src_ystride = cpi->Source->y_stride;
+    const uint8_t *last_src_y = last_source->y_buffer;
+    const int last_src_ystride = last_source->y_stride;
+    const uint8_t *src_u = cpi->Source->u_buffer;
+    const uint8_t *src_v = cpi->Source->v_buffer;
+    const int src_uvstride = cpi->Source->uv_stride;
+    int mi_row, mi_col;
+    int num_low_motion = 0;
+    int frame_low_motion = 1;  // Cleared if < 3/8 of blocks are low motion.
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+        int bl_index = mi_row * cm->mi_cols + mi_col;
+        if (cr->consec_zero_mv[bl_index] > thresh_consec_zeromv)
+          num_low_motion++;
+      }
+    }
+    if (num_low_motion < ((3 * cm->mi_rows * cm->mi_cols) >> 3))
+      frame_low_motion = 0;
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+        // 16x16 blocks, 1/4 sample of frame.
+        if (mi_row % 4 == 0 && mi_col % 4 == 0 &&
+            mi_row < cm->mi_rows - 1 &&
+            mi_col < cm->mi_cols - 1) {
+          int bl_index = mi_row * cm->mi_cols + mi_col;
+          int bl_index1 = bl_index + 1;
+          int bl_index2 = bl_index + cm->mi_cols;
+          int bl_index3 = bl_index2 + 1;
+          // Only consider blocks that are likely steady background. i.e, have
+          // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
+          // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
+          // 4 sub-blocks for 16x16 block. Also, avoid skin blocks.
+          int is_skin = vp9_compute_skin_block(src_y,
+                                               src_u,
+                                               src_v,
+                                               src_ystride,
+                                               src_uvstride,
+                                               bsize);
+          if (frame_low_motion &&
+              cr->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
+              cr->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
+              cr->consec_zero_mv[bl_index2] > thresh_consec_zeromv &&
+              cr->consec_zero_mv[bl_index3] > thresh_consec_zeromv &&
+              !is_skin) {
+            // Compute variance.
+            unsigned int sse;
+            unsigned int variance = cpi->fn_ptr[bsize].vf(src_y,
+                                                          src_ystride,
+                                                          last_src_y,
+                                                          last_src_ystride,
+                                                          &sse);
+            // Only consider this block as valid for noise measurement if the
+            // average term (sse - variance = N * avg^{2}, N = 16X16) of the
+            // temporal residual is small (avoid effects from lighting change).
+            if ((sse - variance) < thresh_sum_diff) {
+              unsigned int sse2;
+              const unsigned int spatial_variance =
+                  cpi->fn_ptr[bsize].vf(src_y, src_ystride, const_source,
+                                        0, &sse2);
+              // Avoid blocks with high brightness and high spatial variance.
+              if ((sse2 - spatial_variance) < thresh_sum_spatial &&
+                  spatial_variance < thresh_spatial_var) {
+                avg_est += variance / ((spatial_variance >> 9) + 1);
+                num_samples++;
+              }
+            }
+          }
+        }
+        src_y += 8;  // One mi unit: 8 luma samples, 4 samples per chroma.
+        last_src_y += 8;
+        src_u += 4;
+        src_v += 4;
+      }
+      src_y += (src_ystride << 3) - (cm->mi_cols << 3);  // Next mi row.
+      last_src_y += (last_src_ystride << 3) - (cm->mi_cols << 3);
+      src_u += (src_uvstride << 2) - (cm->mi_cols << 2);
+      src_v += (src_uvstride << 2) - (cm->mi_cols << 2);
+    }
+    ne->last_w = cm->width;
+    ne->last_h = cm->height;
+    // Update noise estimate if we have more than a minimum number of block
+    // samples, and avg_est > 0 (avg_est == 0 can happen if the application
+    // inputs duplicate frames).
+    if (num_samples > min_blocks_estimate && avg_est > 0) {
+      // Normalize.
+      avg_est = avg_est / num_samples;
+      // Update noise estimate: weighted average, 15/16 old + 1/16 new.
+      ne->value = (int)((15 * ne->value + avg_est) >> 4);
+      ne->count++;
+      if (ne->count == ne->num_frames_estimate) {
+        // Reset counter, set the next window to 30 and re-extract the level.
+        ne->num_frames_estimate = 30;
+        ne->count = 0;
+        ne->level = vp9_noise_estimate_extract_level(ne);
+#if CONFIG_VP9_TEMPORAL_DENOISING
+        if (cpi->oxcf.noise_sensitivity > 0)
+          vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
+#endif
+      }
+    }
+  }
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0)
+    copy_frame(&cpi->denoiser.last_source, cpi->Source);
+#endif
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_noise_estimate.h b/libs/libvpx/vp9/encoder/vp9_noise_estimate.h
new file mode 100644
index 0000000000..826d125b5b
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_noise_estimate.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_NOISE_ESTIMATE_H_
+#define VP9_ENCODER_NOISE_ESTIMATE_H_
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+#include "vpx_scale/yv12config.h"
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#include "vp9/encoder/vp9_denoiser.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum noise_level {
+  kLowLow,  // value <= thresh / 2
+  kLow,     // value > thresh / 2
+  kMedium,  // value > thresh
+  kHigh     // value > 2 * thresh
+} NOISE_LEVEL;
+
+typedef struct noise_estimate {
+  int enabled;      // Refreshed on every vp9_update_noise_estimate() call.
+  NOISE_LEVEL level;  // Last extracted level; initialized to kLowLow.
+  int value;        // Running noise estimate, compared against thresh.
+  int thresh;       // Set at init from frame area: 90, 130 or 200.
+  int count;        // Estimate updates since the level was last extracted.
+  int last_w;       // Frame width recorded by the previous update call.
+  int last_h;       // Frame height recorded by the previous update call.
+  int num_frames_estimate;  // Updates per level extraction: 20, then 30.
+} NOISE_ESTIMATE;
+
+struct VP9_COMP;
+
+void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne,
+                             int width,
+                             int height);
+
+NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne);
+
+void vp9_update_noise_estimate(struct VP9_COMP *const cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_NOISE_ESTIMATE_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_picklpf.c b/libs/libvpx/vp9/encoder/vp9_picklpf.c
new file mode 100644
index 0000000000..f6b1dfcd58
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_picklpf.c
@@ -0,0 +1,195 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_quant_common.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_picklpf.h"
+#include "vp9/encoder/vp9_quantize.h"
+
+static int get_max_filter_level(const VP9_COMP *cpi) {
+  // Only pass 2 with a section intra rating above 8 gets the reduced cap
+  // (3/4 of MAX_LOOP_FILTER); every other case may use the full
+  // range up to MAX_LOOP_FILTER.
+  if (cpi->oxcf.pass == 2 && cpi->twopass.section_intra_rating > 8)
+    return MAX_LOOP_FILTER * 3 / 4;
+  return MAX_LOOP_FILTER;
+}
+
+
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+                                VP9_COMP *const cpi,
+                                int filt_level, int partial_frame) {
+  VP9_COMMON *const cm = &cpi->common;
+  int64_t filt_err;
+  // Trial-filters frame_to_show at filt_level and returns the error vs. sd.
+  vp9_build_mask_frame(cm, filt_level, partial_frame);
+  // Use the multi-threaded loop filter when more than one worker exists.
+  if (cpi->num_workers > 1)
+    vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
+                             filt_level, 1, partial_frame,
+                             cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
+  else
+    vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+                          1, partial_frame);
+  // Error metric: presumably luma SSE, going by the helper names.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show);
+  } else {
+    filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+  }
+#else
+  filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Re-instate the unfiltered frame (saved in last_frame_uf by the caller).
+  vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+
+  return filt_err;
+}
+
+static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
+                               int partial_frame) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const struct loopfilter *const lf = &cm->lf;
+  const int min_filter_level = 0;
+  const int max_filter_level = get_max_filter_level(cpi);
+  int filt_direction = 0;
+  int64_t best_err;
+  int filt_best;
+
+  // Start the search at the previous frame filter level unless it is now out of
+  // range.
+  int filt_mid =
+      clamp(lf->last_filt_level, min_filter_level, max_filter_level);
+  int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+  // Sum squared error at each filter level (cached; < 0 means not tried yet)
+  int64_t ss_err[MAX_LOOP_FILTER + 1];
+
+  // Set each entry to -1 (every byte 0xFF), i.e. not yet evaluated
+  memset(ss_err, 0xFF, sizeof(ss_err));
+
+  //  Make a copy of the unfiltered / processed recon buffer
+  vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+
+  best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame);
+  filt_best = filt_mid;
+  ss_err[filt_mid] = best_err;
+
+  while (filter_step > 0) {
+    const int filt_high = VPXMIN(filt_mid + filter_step, max_filter_level);
+    const int filt_low = VPXMAX(filt_mid - filter_step, min_filter_level);
+
+    // Bias against raising loop filter in favor of lowering it.
+    int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+
+    if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
+      bias = (bias * cpi->twopass.section_intra_rating) / 20;
+
+    // Halve the bias unless the transform mode is restricted to 4x4.
+    if (cm->tx_mode != ONLY_4X4)
+      bias >>= 1;
+
+    if (filt_direction <= 0 && filt_low != filt_mid) {
+      // Get Low filter error score
+      if (ss_err[filt_low] < 0) {
+        ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame);
+      }
+      // If value is close to the best so far then bias towards a lower loop
+      // filter value.
+      if ((ss_err[filt_low] - bias) < best_err) {
+        // Was it actually better than the previous best?
+        if (ss_err[filt_low] < best_err)
+          best_err = ss_err[filt_low];
+
+        filt_best = filt_low;
+      }
+    }
+
+    // Now look at filt_high
+    if (filt_direction >= 0 && filt_high != filt_mid) {
+      if (ss_err[filt_high] < 0) {
+        ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
+      }
+      // Was it better than the previous best?
+      if (ss_err[filt_high] < (best_err - bias)) {
+        best_err = ss_err[filt_high];
+        filt_best = filt_high;
+      }
+    }
+
+    // Halve the step distance if the best filter value was the same as last
+    // time; otherwise re-center on the new best and keep moving that way.
+    if (filt_best == filt_mid) {
+      filter_step /= 2;
+      filt_direction = 0;
+    } else {
+      filt_direction = (filt_best < filt_mid) ? -1 : 1;
+      filt_mid = filt_best;
+    }
+  }
+  return filt_best;
+}
+
+void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
+                           LPF_PICK_METHOD method) {
+  VP9_COMMON *const cm = &cpi->common;
+  struct loopfilter *const lf = &cm->lf;
+  // Sets lf->sharpness_level and lf->filter_level for the current frame.
+  lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
+                                                    : cpi->oxcf.sharpness;
+  // MINIMAL_LPF only zeroes a non-zero level; otherwise the chain continues.
+  if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) {
+      lf->filter_level = 0;
+  } else if (method >= LPF_PICK_FROM_Q) {
+    const int min_filter_level = 0;
+    const int max_filter_level = get_max_filter_level(cpi);
+    const int q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth);
+    // These values were determined by linear fitting the result of the
+    // searched level, filt_guess = q * 0.316206 + 3.87252
+#if CONFIG_VP9_HIGHBITDEPTH
+    int filt_guess;
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+        break;
+      case VPX_BITS_10:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+        break;
+      case VPX_BITS_12:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+        break;
+      default:
+        assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 "
+                    "or VPX_BITS_12");
+        return;
+    }
+#else
+    int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    if (cm->frame_type == KEY_FRAME)
+      filt_guess -= 4;  // Key frames start from a guess 4 levels lower.
+    lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
+  } else {  // Search-based pick; partial frame for LPF_PICK_FROM_SUBIMAGE.
+    lf->filter_level = search_filter_level(sd, cpi,
+                                           method == LPF_PICK_FROM_SUBIMAGE);
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_picklpf.h b/libs/libvpx/vp9/encoder/vp9_picklpf.h
new file mode 100644
index 0000000000..33c490f693
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_picklpf.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_PICKLPF_H_
+#define VP9_ENCODER_VP9_PICKLPF_H_
+
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct yv12_buffer_config;
+struct VP9_COMP;
+// Picks the loop filter level for cpi's current frame using `method`.
+void vp9_pick_filter_level(const struct yv12_buffer_config *sd,
+                           struct VP9_COMP *cpi, LPF_PICK_METHOD method);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_PICKLPF_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_pickmode.c b/libs/libvpx/vp9/encoder/vp9_pickmode.c
new file mode 100644
index 0000000000..d861f80967
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_pickmode.c
@@ -0,0 +1,2166 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
+
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_pickmode.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+
+typedef struct {
+  uint8_t *data;  // presumably prediction pixels -- TODO confirm at use sites
+  int stride;     // presumably the row pitch of data -- TODO confirm
+  int in_use;     // presumably non-zero while claimed -- TODO confirm
+} PRED_BUFFER;
+
+static int mv_refs_rt(VP9_COMP *cpi, const VP9_COMMON *cm,
+                      const MACROBLOCK *x,
+                      const MACROBLOCKD *xd,
+                      const TileInfo *const tile,
+                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list, int_mv *base_mv,
+                      int mi_row, int mi_col, int use_base_mv) {
+  const int *ref_sign_bias = cm->ref_frame_sign_bias;
+  int i, refmv_count = 0;
+  // Collects candidate mvs for ref_frame from neighbors into mv_ref_list.
+  const POSITION *const mv_ref_search = mv_ref_blocks[mi->sb_type];
+
+  int different_ref_found = 0;
+  int context_counter = 0;
+  int const_motion = 0;
+
+  // Blank the reference vector list
+  memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+  // The nearest 2 blocks are treated differently
+  // if the size < 8x8 we get the mv from the bmi substructure,
+  // and we also need to keep a mode count.
+  for (i = 0; i < 2; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
+                                                   xd->mi_stride];
+      // Keep counts for entropy encoding.
+      context_counter += mode_2_counter[candidate_mi->mode];
+      different_ref_found = 1;
+
+      if (candidate_mi->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1),
+                        refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  const_motion = 1;
+
+  // Check the rest of the neighbors in much the same way
+  // as before except we don't need to keep track of sub blocks or
+  // mode counts. The loop stops as soon as one mv has been found.
+  for (; i < MVREF_NEIGHBOURS && !refmv_count; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
+                                                   xd->mi_stride];
+      different_ref_found = 1;
+
+      if (candidate_mi->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(candidate_mi->mv[0], refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  // Since no mv from the same reference frame was found (refmv_count == 0),
+  // go back through the neighbors and find motion vectors from
+  // different reference frames.
+  if (different_ref_found && !refmv_count) {
+    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+      const POSITION *mv_ref = &mv_ref_search[i];
+      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+        const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row
+                                              * xd->mi_stride];
+
+        // If the candidate is INTRA we don't want to consider its mv.
+        IF_DIFF_REF_FRAME_ADD_MV(candidate_mi, ref_frame, ref_sign_bias,
+                                 refmv_count, mv_ref_list, Done);
+      }
+    }
+  }
+  if (use_base_mv &&
+      !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+      ref_frame == LAST_FRAME) {
+    // Get base layer mv (half-resolution lookup in prev_frame, scaled by 2).
+    MV_REF *candidate =
+        &cm->prev_frame->mvs[(mi_col>>1) + (mi_row>>1) * (cm->mi_cols>>1)];
+    if (candidate->mv[0].as_int != INVALID_MV) {
+        base_mv->as_mv.row = (candidate->mv[0].as_mv.row * 2);
+        base_mv->as_mv.col = (candidate->mv[0].as_mv.col * 2);
+      clamp_mv_ref(&base_mv->as_mv, xd);
+    } else {
+      base_mv->as_int = INVALID_MV;
+    }
+  }
+
+ Done:
+
+  x->mbmi_ext->mode_context[ref_frame] = counter_to_context[context_counter];
+
+  // Clamp vectors
+  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
+    clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
+
+  return const_motion;
+}
+/* Searches for a new motion vector for the block's first reference frame
+ * (mi->ref_frame[0]): a full-pixel search, followed by sub-pixel refinement
+ * when the candidate still looks competitive.
+ *
+ * tmp_mv:        out - receives the motion vector found.  When use_base_mv is
+ *                non-zero its incoming value is also read and handed to
+ *                vp9_full_pixel_search() as center_mv.
+ * rate_mv:       out - bit cost of the resulting vector relative to ref_mv.
+ * best_rd_sofar: RD cost to beat; compared against the rate-only RD cost of
+ *                the full-pixel result plus the NEWMV mode cost.
+ *
+ * Returns 1 if that rate-only cost did not exceed best_rd_sofar (sub-pixel
+ * refinement was then run), 0 otherwise.  The x->mv_col/row min/max limits
+ * and any prediction planes swapped for a scaled reference are restored
+ * before returning.
+ */
+static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                  int_mv *tmp_mv, int *rate_mv,
+                                  int64_t best_rd_sofar, int use_base_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mi[0];
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
+  const int step_param = cpi->sf.mv.fullpel_search_step_param;
+  const int sadpb = x->sadperbit16;
+  MV mvp_full;
+  const int ref = mi->ref_frame[0];
+  const MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+  MV center_mv;
+  int dis;
+  int rate_mode;
+  const int tmp_col_min = x->mv_col_min;  // saved; restored after the search
+  const int tmp_col_max = x->mv_col_max;
+  const int tmp_row_min = x->mv_row_min;
+  const int tmp_row_max = x->mv_row_max;
+  int rv = 0;
+  int cost_list[5];
+  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
+                                                                        ref);
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+  vp9_set_mv_search_range(x, &ref_mv);
+  // Start point: one of the first two reference MV candidates, or pred_mv.
+  assert(x->mv_best_ref_index[ref] <= 2);
+  if (x->mv_best_ref_index[ref] < 2)
+    mvp_full = x->mbmi_ext->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
+  else
+    mvp_full = x->pred_mv[ref];
+  // NOTE(review): >> 3 presumably converts 1/8-pel units to full pixels.
+  mvp_full.col >>= 3;
+  mvp_full.row >>= 3;
+
+  if (!use_base_mv)
+    center_mv = ref_mv;
+  else
+    center_mv = tmp_mv->as_mv;
+
+  vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+                        cond_cost_list(cpi, cost_list),
+                        &center_mv, &tmp_mv->as_mv, INT_MAX, 0);
+  // Put back the MV limits that were saved at function entry.
+  x->mv_col_min = tmp_col_min;
+  x->mv_col_max = tmp_col_max;
+  x->mv_row_min = tmp_row_min;
+  x->mv_row_max = tmp_row_max;
+
+  // calculate the bit cost on motion vector
+  mvp_full.row = tmp_mv->as_mv.row * 8;
+  mvp_full.col = tmp_mv->as_mv.col * 8;
+
+  *rate_mv = vp9_mv_bit_cost(&mvp_full, &ref_mv,
+                             x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+  // Rate-only RD cost (distortion 0): skip refinement if it already loses.
+  rate_mode = cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref]]
+                                  [INTER_OFFSET(NEWMV)];
+  rv = !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) >
+         best_rd_sofar);
+
+  if (rv) {
+    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
+                                 cpi->common.allow_high_precision_mv,
+                                 x->errorperbit,
+                                 &cpi->fn_ptr[bsize],
+                                 cpi->sf.mv.subpel_force_stop,
+                                 cpi->sf.mv.subpel_iters_per_step,
+                                 cond_cost_list(cpi, cost_list),
+                                 x->nmvjointcost, x->mvcost,
+                                 &dis, &x->pred_sse[ref], NULL, 0, 0);
+    *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
+                               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+  }
+  // Undo the scaled-reference swap done at the top of the function.
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+  return rv;
+}
+
+// Walks a w x h region in steps of block_size and, for every step, calls
+// vpx_get8x8var() on the co-located src/ref samples.  The per-block SSE, sum
+// and variance are stored consecutively (row by row) in sse8x8[], sum8x8[]
+// and var8x8[]; the totals over the whole region go to *sse and *sum.
+static void block_variance(const uint8_t *src, int src_stride,
+                           const uint8_t *ref, int ref_stride,
+                           int w, int h, unsigned int *sse, int *sum,
+                           int block_size, unsigned int *sse8x8,
+                           int *sum8x8, unsigned int *var8x8) {
+  int row, col;
+  int idx = 0;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (row = 0; row < h; row += block_size) {
+    const uint8_t *const src_row = src + src_stride * row;
+    const uint8_t *const ref_row = ref + ref_stride * row;
+
+    for (col = 0; col < w; col += block_size, ++idx) {
+      unsigned int *const blk_sse = &sse8x8[idx];
+      int *const blk_sum = &sum8x8[idx];
+
+      vpx_get8x8var(src_row + col, src_stride, ref_row + col, ref_stride,
+                    blk_sse, blk_sum);
+      *sse += *blk_sse;
+      *sum += *blk_sum;
+      // variance = sse - sum^2 / 64
+      var8x8[idx] = *blk_sse - (((unsigned int)*blk_sum * *blk_sum) >> 6);
+    }
+  }
+}
+
+// Merges per-block statistics 2x2 into the next larger block size.
+//
+// sse_i / sum_i hold one entry per unit block (the block size tx_size maps to
+// through txsize_to_bsize), stored row by row with nw entries per row.  Every
+// 2x2 group of neighbouring entries is summed into sse_o[k] / sum_o[k], and
+// the variance of the merged block is written to var_o[k].
+//
+// bw, bh: log2 width / height of the whole partition, in the units used by
+//         b_width_log2_lookup / b_height_log2_lookup.
+static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
+                               unsigned int *sse_i, int *sum_i,
+                               unsigned int *var_o, unsigned int *sse_o,
+                               int *sum_o) {
+  const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size];
+  const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
+  const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
+  // Normalisation shift applied to sum^2 for one merged block.
+  const int var_shift = b_width_log2_lookup[unit_size] +
+      b_height_log2_lookup[unit_size] + 6;
+  int i, j, k = 0;
+
+  for (i = 0; i < nh; i += 2) {
+    for (j = 0; j < nw; j += 2) {
+      sse_o[k] = sse_i[i * nw + j] + sse_i[i * nw + j + 1] +
+          sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
+      sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
+          sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
+      // Square the sum in 64 bits.  When 16x16 sums are merged into a 32x32
+      // block |sum| can reach 4 * 256 * 255, whose square does not fit in
+      // 32 bits; the former unsigned-int multiply wrapped and produced a
+      // wrong variance.  The shifted result is small enough to narrow again.
+      var_o[k] = sse_o[k] -
+          (unsigned int)(((int64_t)sum_o[k] * sum_o[k]) >> var_shift);
+      k++;
+    }
+  }
+}
+/* Model-based rate/distortion estimate for the luma plane that also runs a
+ * skip test on every transform-sized sub-block.
+ *
+ * Writes the whole-partition variance and SSE to *var_y / *sse_y, chooses and
+ * stores xd->mi[0]->tx_size, sets x->skip_txfm[0], and puts the modelled
+ * rate and distortion in *out_rate_sum / *out_dist_sum.
+ *
+ * When luma is fully skippable (SKIP_TXFM_AC_DC) the chroma predictors are
+ * built and tested as well; *early_term is set to 1 if both chroma planes
+ * pass.  *early_term is never cleared here.
+ */
+static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
+                                    MACROBLOCK *x, MACROBLOCKD *xd,
+                                    int *out_rate_sum, int64_t *out_dist_sum,
+                                    unsigned int *var_y, unsigned int *sse_y,
+                                    int mi_row, int mi_col, int *early_term) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse;
+  int rate;
+  int64_t dist;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const uint32_t dc_quant = pd->dequant[0];
+  const uint32_t ac_quant = pd->dequant[1];
+  const int64_t dc_thr = dc_quant * dc_quant >> 6;
+  const int64_t ac_thr = ac_quant * ac_quant >> 6;
+  unsigned int var;
+  int sum;
+  int skip_dc = 0;  // set when only the DC part passes the skip test
+
+  const int bw = b_width_log2_lookup[bsize];
+  const int bh = b_height_log2_lookup[bsize];
+  const int num8x8 = 1 << (bw + bh - 2);
+  unsigned int sse8x8[64] = {0};
+  int sum8x8[64] = {0};
+  unsigned int var8x8[64] = {0};
+  TX_SIZE tx_size;
+  int i, k;
+
+  // Calculate variance for whole partition, and also save 8x8 blocks' variance
+  // to be used in following transform skipping test.
+  block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
+  var = sse - (((int64_t)sum * sum) >> (bw + bh + 4));
+
+  *var_y = var;
+  *sse_y = sse;
+  // Choose the transform size used for the skip test and stored in mi.
+  if (cpi->common.tx_mode == TX_MODE_SELECT) {
+    if (sse > (var << 2))
+      tx_size = VPXMIN(max_txsize_lookup[bsize],
+                       tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    else
+      tx_size = TX_8X8;
+
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
+      tx_size = TX_8X8;
+    else if (tx_size > TX_16X16)
+      tx_size = TX_16X16;
+  } else {
+    tx_size = VPXMIN(max_txsize_lookup[bsize],
+                     tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+  }
+
+  assert(tx_size >= TX_8X8);
+  xd->mi[0]->tx_size = tx_size;
+
+  // Evaluate if the partition block is a skippable block in Y plane.
+  {
+    unsigned int sse16x16[16] = {0};
+    int sum16x16[16] = {0};
+    unsigned int var16x16[16] = {0};
+    const int num16x16 = num8x8 >> 2;
+
+    unsigned int sse32x32[4] = {0};
+    int sum32x32[4] = {0};
+    unsigned int var32x32[4] = {0};
+    const int num32x32 = num8x8 >> 4;
+
+    int ac_test = 1;
+    int dc_test = 1;
+    const int num = (tx_size == TX_8X8) ? num8x8 :
+        ((tx_size == TX_16X16) ? num16x16 : num32x32);
+    const unsigned int *sse_tx = (tx_size == TX_8X8) ? sse8x8 :
+        ((tx_size == TX_16X16) ? sse16x16 : sse32x32);
+    const unsigned int *var_tx = (tx_size == TX_8X8) ? var8x8 :
+        ((tx_size == TX_16X16) ? var16x16 : var32x32);
+
+    // Calculate variance if tx_size > TX_8X8
+    if (tx_size >= TX_16X16)
+      calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
+                         sum16x16);
+    if (tx_size == TX_32X32)
+      calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32,
+                         sse32x32, sum32x32);
+
+    // Skipping test
+    x->skip_txfm[0] = SKIP_TXFM_NONE;
+    for (k = 0; k < num; k++)
+      // Check if all ac coefficients can be quantized to zero.
+      if (!(var_tx[k] < ac_thr || var == 0)) {
+        ac_test = 0;
+        break;
+      }
+
+    for (k = 0; k < num; k++)
+      // Check if dc coefficient can be quantized to zero.
+      if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
+        dc_test = 0;
+        break;
+      }
+    // AC passed everywhere: skip AC, and DC too if the DC test also passed.
+    if (ac_test) {
+      x->skip_txfm[0] = SKIP_TXFM_AC_ONLY;
+
+      if (dc_test)
+        x->skip_txfm[0] = SKIP_TXFM_AC_DC;
+    } else if (dc_test) {
+      skip_dc = 1;
+    }
+  }
+
+  if (x->skip_txfm[0] == SKIP_TXFM_AC_DC) {
+    int skip_uv[2] = {0};
+    unsigned int var_uv[2];
+    unsigned int sse_uv[2];
+    // Nothing is coded for luma: zero rate, distortion is the SSE times 16.
+    *out_rate_sum = 0;
+    *out_dist_sum = sse << 4;
+
+    // Transform skipping test in UV planes.
+    for (i = 1; i <= 2; i++) {
+      struct macroblock_plane *const p = &x->plane[i];  // shadows the luma p
+      struct macroblockd_plane *const pd = &xd->plane[i];  // shadows luma pd
+      const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd);
+      const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
+      const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
+      const int uv_bw = b_width_log2_lookup[uv_bsize];
+      const int uv_bh = b_height_log2_lookup[uv_bsize];
+      const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
+          (uv_bh - b_height_log2_lookup[unit_size]);
+      const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf);
+      const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf);
+      int j = i - 1;
+
+      vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
+      var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p->src.buf, p->src.stride,
+          pd->dst.buf, pd->dst.stride, &sse_uv[j]);
+
+      if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+          (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+        skip_uv[j] = 1;
+      else
+        break;  // this plane failed; the remaining plane is not tested
+    }
+
+    // If the transform in YUV planes are skippable, the mode search checks
+    // fewer inter modes and doesn't check intra modes.
+    if (skip_uv[0] & skip_uv[1]) {
+      *early_term = 1;
+    }
+
+    return;
+  }
+  // DC part of the model, unless the DC test said it quantizes to zero.
+  if (!skip_dc) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                   dc_quant >> (xd->bd - 5), &rate, &dist);
+    } else {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                   dc_quant >> 3, &rate, &dist);
+    }
+#else
+    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                 dc_quant >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+
+  if (!skip_dc) {
+    *out_rate_sum = rate >> 1;
+    *out_dist_sum = dist << 3;
+  } else {
+    *out_rate_sum = 0;
+    *out_dist_sum = (sse - var) << 4;
+  }
+  // AC part of the model; added on top of the DC contribution above.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+                                 ac_quant >> (xd->bd - 5), &rate, &dist);
+  } else {
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+                                 ac_quant >> 3, &rate, &dist);
+  }
+#else
+  vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+                               ac_quant >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  *out_rate_sum += rate;
+  *out_dist_sum += dist << 4;
+}
+/* Model-based rate/distortion estimate for the luma plane.
+ *
+ * Computes the block variance and SSE with cpi->fn_ptr[bsize].vf (returned
+ * through *var_y / *sse_y), chooses and stores xd->mi[0]->tx_size, sets
+ * x->skip_txfm[0] from a skip test on the per-transform-block averages
+ * (sse and var shifted right by num_blk_log2), and writes the modelled
+ * rate and distortion to *out_rate_sum / *out_dist_sum.
+ */
+static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
+                              MACROBLOCK *x, MACROBLOCKD *xd,
+                              int *out_rate_sum, int64_t *out_dist_sum,
+                              unsigned int *var_y, unsigned int *sse_y) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse;
+  int rate;
+  int64_t dist;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int64_t dc_thr = p->quant_thred[0] >> 6;
+  const int64_t ac_thr = p->quant_thred[1] >> 6;
+  const uint32_t dc_quant = pd->dequant[0];
+  const uint32_t ac_quant = pd->dequant[1];
+  unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
+                                           pd->dst.buf, pd->dst.stride, &sse);
+  int skip_dc = 0;  // set when only the DC part passes the skip test
+
+  *var_y = var;
+  *sse_y = sse;
+  // Choose the transform size stored in mi and used by the skip test.
+  if (cpi->common.tx_mode == TX_MODE_SELECT) {
+    if (sse > (var << 2))
+      xd->mi[0]->tx_size =
+          VPXMIN(max_txsize_lookup[bsize],
+                 tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    else
+      xd->mi[0]->tx_size = TX_8X8;
+
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
+      xd->mi[0]->tx_size = TX_8X8;
+    else if (xd->mi[0]->tx_size > TX_16X16)
+      xd->mi[0]->tx_size = TX_16X16;
+  } else {
+    xd->mi[0]->tx_size =
+        VPXMIN(max_txsize_lookup[bsize],
+               tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+  }
+
+  // Evaluate if the partition block is a skippable block in Y plane.
+  {
+    const BLOCK_SIZE unit_size =
+        txsize_to_bsize[xd->mi[0]->tx_size];
+    const unsigned int num_blk_log2 =
+        (b_width_log2_lookup[bsize] - b_width_log2_lookup[unit_size]) +
+        (b_height_log2_lookup[bsize] - b_height_log2_lookup[unit_size]);
+    const unsigned int sse_tx = sse >> num_blk_log2;  // average per tx block
+    const unsigned int var_tx = var >> num_blk_log2;  // average per tx block
+
+    x->skip_txfm[0] = SKIP_TXFM_NONE;
+    // Check if all ac coefficients can be quantized to zero.
+    if (var_tx < ac_thr || var == 0) {
+      x->skip_txfm[0] = SKIP_TXFM_AC_ONLY;
+      // Check if dc coefficient can be quantized to zero.
+      if (sse_tx - var_tx < dc_thr || sse == var)
+        x->skip_txfm[0] = SKIP_TXFM_AC_DC;
+    } else {
+      if (sse_tx - var_tx < dc_thr || sse == var)
+        skip_dc = 1;
+    }
+  }
+  // Fully skippable: zero rate, distortion is the prediction SSE times 16.
+  if (x->skip_txfm[0] == SKIP_TXFM_AC_DC) {
+    *out_rate_sum = 0;
+    *out_dist_sum = sse << 4;
+    return;
+  }
+  // DC part of the model, unless the DC test said it quantizes to zero.
+  if (!skip_dc) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                   dc_quant >> (xd->bd - 5), &rate, &dist);
+    } else {
+      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                   dc_quant >> 3, &rate, &dist);
+    }
+#else
+    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+                                 dc_quant >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+
+  if (!skip_dc) {
+    *out_rate_sum = rate >> 1;
+    *out_dist_sum = dist << 3;
+  } else {
+    *out_rate_sum = 0;
+    *out_dist_sum = (sse - var) << 4;
+  }
+  // AC part of the model; added on top of the DC contribution above.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+                                 ac_quant >> (xd->bd - 5), &rate, &dist);
+  } else {
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+                                 ac_quant >> 3, &rate, &dist);
+  }
+#else
+  vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+                               ac_quant >> 3, &rate, &dist);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  *out_rate_sum += rate;
+  *out_dist_sum += dist << 4;
+}
+/* Rate/distortion estimate for one plane of a block; two build variants.
+ *
+ * CONFIG_VP9_HIGHBITDEPTH builds: defers to model_rd_for_sb_y() for *rate
+ * and *dist, sets *sse = INT_MAX and *skippable = 0, and ignores plane and
+ * tx_size.
+ *
+ * Other builds: subtracts the prediction, forward-transforms and quantizes
+ * each transform block, then derives *rate from the quantized coefficients
+ * and *dist from vp9_block_error_fp().  *skippable is 1 only if every
+ * visited block has eob == 0.  *sse is in/out: a value of INT64_MAX is left
+ * alone, any smaller value is rescaled in place and, when the block is
+ * skippable, also returned as *dist.
+ */
+#if CONFIG_VP9_HIGHBITDEPTH
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+                      int *skippable, int64_t *sse, int plane,
+                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int var_y, sse_y;
+  (void)plane;
+  (void)tx_size;
+  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+  *sse = INT_MAX;
+  *skippable = 0;
+  return;
+}
+#else
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+                      int *skippable, int64_t *sse, int plane,
+                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const struct macroblockd_plane *pd = &xd->plane[plane];
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+  const int step = 1 << (tx_size << 1);  // block index stride per tx block
+  const int block_step = (1 << tx_size);  // row/column stride per tx block
+  int block = 0, r, c;
+  int shift = tx_size == TX_32X32 ? 0 : 2;
+  const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+  const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+  int eob_cost = 0;  // counts the transform blocks visited in the first pass
+
+  (void)cpi;
+  vp9_subtract_plane(x, bsize, plane);
+  *skippable = 1;
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (r = 0; r < max_blocks_high; r += block_step) {
+    for (c = 0; c < num_4x4_w; c += block_step) {
+      if (c < max_blocks_wide) {
+        const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+        uint16_t *const eob = &p->eobs[block];
+        const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
+        const int16_t *src_diff;
+        src_diff = &p->src_diff[(r * diff_stride + c) << 2];
+
+        switch (tx_size) {
+          case TX_32X32:
+            vpx_fdct32x32_rd(src_diff, coeff, diff_stride);
+            vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                  p->round_fp, p->quant_fp, p->quant_shift,
+                                  qcoeff, dqcoeff, pd->dequant, eob,
+                                  scan_order->scan, scan_order->iscan);
+            break;
+          case TX_16X16:
+            vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
+            vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, eob,
+                            scan_order->scan, scan_order->iscan);
+            break;
+          case TX_8X8:
+            vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
+            vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, eob,
+                            scan_order->scan, scan_order->iscan);
+            break;
+          case TX_4X4:
+            x->fwd_txm4x4(src_diff, coeff, diff_stride);
+            vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, eob,
+                            scan_order->scan, scan_order->iscan);
+            break;
+          default:
+            assert(0);
+            break;
+        }
+        *skippable &= (*eob == 0);
+        eob_cost += 1;
+      }
+      block += step;  // advanced even for columns outside max_blocks_wide
+    }
+  }
+  // All blocks quantized to zero and an SSE was supplied: report it as dist.
+  if (*skippable && *sse < INT64_MAX) {
+    *rate = 0;
+    *dist = (*sse << 6) >> shift;
+    *sse = *dist;
+    return;
+  }
+  // Second pass: accumulate rate and distortion from the stored coefficients.
+  block = 0;
+  *rate = 0;
+  *dist = 0;
+  if (*sse < INT64_MAX)
+    *sse = (*sse << 6) >> shift;
+  for (r = 0; r < max_blocks_high; r += block_step) {
+    for (c = 0; c < num_4x4_w; c += block_step) {
+      if (c < max_blocks_wide) {
+        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+        uint16_t *const eob = &p->eobs[block];
+
+        if (*eob == 1)
+          *rate += (int)abs(qcoeff[0]);
+        else if (*eob > 1)
+          *rate += vpx_satd((const int16_t *)qcoeff, step << 4);
+
+        *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
+      }
+      block += step;
+    }
+  }
+  // Scale the coefficient-magnitude sum and add one cost unit per tx block.
+  if (*skippable == 0) {
+    *rate <<= (2 + VP9_PROB_COST_SHIFT);
+    *rate += (eob_cost << VP9_PROB_COST_SHIFT);
+  }
+}
+#endif
+
+// Accumulates the modelled rate and distortion of planes start_plane through
+// stop_plane (inclusive).  Planes whose x->color_sensitivity[plane - 1] flag
+// is zero are skipped.  *out_rate_sum and *out_dist_sum are reset first; the
+// variance and SSE of every processed plane are added to *var_y / *sse_y,
+// which the caller must have initialised.
+static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
+                               MACROBLOCK *x, MACROBLOCKD *xd,
+                               int *out_rate_sum, int64_t *out_dist_sum,
+                               unsigned int *var_y, unsigned int *sse_y,
+                               int start_plane, int stop_plane) {
+  int plane;
+
+  *out_rate_sum = 0;
+  *out_dist_sum = 0;
+
+  for (plane = start_plane; plane <= stop_plane; ++plane) {
+    struct macroblock_plane *const p = &x->plane[plane];
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const uint32_t dc_quant = pd->dequant[0];
+    const uint32_t ac_quant = pd->dequant[1];
+    const int log2_pels = num_pels_log2_lookup[plane_bsize];
+    // Our transform coeffs are 8 times an orthogonal transform, hence the
+    // quantizer step is also 8 times: divide by 8 (>> 3) to get the effective
+    // quantizer for the modeling function.
+    int quant_shift = 3;
+    unsigned int plane_sse;
+    unsigned int plane_var;
+    int rate;
+    int64_t dist;
+
+    if (!x->color_sensitivity[plane - 1])
+      continue;
+
+    plane_var = cpi->fn_ptr[plane_bsize].vf(p->src.buf, p->src.stride,
+                                            pd->dst.buf, pd->dst.stride,
+                                            &plane_sse);
+    *var_y += plane_var;
+    *sse_y += plane_sse;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    // High-bitdepth buffers use a shift that depends on the bit depth.
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      quant_shift = xd->bd - 5;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    // DC contribution.
+    vp9_model_rd_from_var_lapndz(plane_sse - plane_var, log2_pels,
+                                 dc_quant >> quant_shift, &rate, &dist);
+    *out_rate_sum += rate >> 1;
+    *out_dist_sum += dist << 3;
+
+    // AC contribution.
+    vp9_model_rd_from_var_lapndz(plane_var, log2_pels,
+                                 ac_quant >> quant_shift, &rate, &dist);
+    *out_rate_sum += rate;
+    *out_dist_sum += dist << 4;
+  }
+}
+
+// Claims the first entry of p[0..len-1] whose in_use flag is clear: marks it
+// in use and returns its index.  Returns -1 when every entry is taken (or
+// len is not positive).
+static int get_pred_buffer(PRED_BUFFER *p, int len) {
+  int slot = 0;
+
+  while (slot < len && p[slot].in_use)
+    ++slot;
+
+  if (slot >= len)
+    return -1;
+
+  p[slot].in_use = 1;
+  return slot;
+}
+
+// Releases a buffer obtained through get_pred_buffer() by clearing its
+// in_use flag.  A NULL pointer is accepted and ignored.
+static void free_pred_buffer(PRED_BUFFER *p) {
+  if (!p)
+    return;
+
+  p->in_use = 0;
+}
+/* Encode-breakout (early skip) test for an inter mode.
+ *
+ * Derives AC and DC thresholds from the luma dequant values and
+ * x->encode_breakout; both thresholds are 0 when encode_breakout is not
+ * positive or when any component of mi->mv[0] lies outside [-64, 64].
+ * If luma (var_y, sse_y) and then the U and V planes all fall within the
+ * thresholds, sets x->skip = 1, writes the mode cost to *rate and the luma
+ * distortion (sse_y << 4) to *dist.  Otherwise x->skip, *rate and *dist are
+ * left untouched.
+ */
+static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                 MV_REFERENCE_FRAME ref_frame,
+                                 PREDICTION_MODE this_mode,
+                                 unsigned int var_y, unsigned int sse_y,
+                                 struct buf_2d yv12_mb[][MAX_MB_PLANE],
+                                 int *rate, int64_t *dist) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
+  unsigned int var = var_y, sse = sse_y;
+  // Skipping threshold for ac.
+  unsigned int thresh_ac;
+  // Skipping threshold for dc.
+  unsigned int thresh_dc;
+  int motion_low = 1;  // cleared when any MV component magnitude exceeds 64
+  if (mi->mv[0].as_mv.row > 64 ||
+      mi->mv[0].as_mv.row < -64 ||
+      mi->mv[0].as_mv.col > 64 ||
+      mi->mv[0].as_mv.col < -64)
+    motion_low = 0;
+  if (x->encode_breakout > 0 && motion_low == 1) {
+    // Set a maximum for threshold to avoid big PSNR loss in low bit rate
+    // case. Use extreme low threshold for static frames to limit
+    // skipping.
+    const unsigned int max_thresh = 36000;
+    // The encode_breakout input
+    const unsigned int min_thresh =
+        VPXMIN(((unsigned int)x->encode_breakout << 4), max_thresh);
+#if CONFIG_VP9_HIGHBITDEPTH
+    const int shift = (xd->bd << 1) - 16;
+#endif
+
+    // Calculate threshold according to dequant value.
+    thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) >> 3;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
+      thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
+
+    // Adjust ac threshold according to partition size.
+    thresh_ac >>=
+        8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+
+    thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
+      thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift);
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  } else {
+    thresh_ac = 0;
+    thresh_dc = 0;
+  }
+
+  // Y skipping condition checking for ac and dc.
+  if (var <= thresh_ac && (sse - var) <= thresh_dc) {
+    unsigned int sse_u, sse_v;
+    unsigned int var_u, var_v;
+    unsigned int thresh_ac_uv = thresh_ac;
+    unsigned int thresh_dc_uv = thresh_dc;
+    if (x->sb_is_skin) {
+      thresh_ac_uv = 0;
+      thresh_dc_uv = 0;
+    }
+
+    // Skip UV prediction unless breakout is zero (lossless) to save
+    // computation with low impact on the result
+    if (x->encode_breakout == 0) {
+      xd->plane[1].pre[0] = yv12_mb[ref_frame][1];
+      xd->plane[2].pre[0] = yv12_mb[ref_frame][2];
+      vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
+    }
+    // NOTE(review): if encode_breakout != 0 the chroma dst buffers are used
+    // as found here; confirm what they contain at the call site.
+    var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
+                                    x->plane[1].src.stride,
+                                    xd->plane[1].dst.buf,
+                                    xd->plane[1].dst.stride, &sse_u);
+
+    // U skipping condition checking
+    if (((var_u << 2) <= thresh_ac_uv) && (sse_u - var_u <= thresh_dc_uv)) {
+      var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
+                                      x->plane[2].src.stride,
+                                      xd->plane[2].dst.buf,
+                                      xd->plane[2].dst.stride, &sse_v);
+
+      // V skipping condition checking
+      if (((var_v << 2) <= thresh_ac_uv) && (sse_v - var_v <= thresh_dc_uv)) {
+        x->skip = 1;
+
+        // The cost of skip bit needs to be added.
+        *rate = cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]]
+                                    [INTER_OFFSET(this_mode)];
+
+        // More on this part of rate
+        // rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+
+        // Scaling factor for SSE from spatial domain to frequency
+        // domain is 16. Adjust distortion accordingly.
+        // TODO(yunqingwang): In this function, only y-plane dist is
+        // calculated.
+        *dist = (sse << 4);  // + ((sse_u + sse_v) << 4);
+
+        // *disable_skip = 1;
+      }
+    }
+  }
+}
+/* Argument bundle handed to estimate_block_intra() through the void *arg of
+ * vp9_foreach_transformed_block_in_plane(). */
+struct estimate_block_intra_args {
+  VP9_COMP *cpi;
+  MACROBLOCK *x;
+  PREDICTION_MODE mode;  // intra mode being evaluated
+  int rate;              // running total, added to by estimate_block_intra()
+  int64_t dist;          // running total, added to by estimate_block_intra()
+};
+/* Per-transform-block callback: predicts one block with args->mode and adds
+ * its estimated rate and distortion to args->rate / args->dist.
+ *
+ * arg must point to a struct estimate_block_intra_args.  The plane's src and
+ * dst buffer pointers are moved to the block for the duration of the call
+ * and restored before returning.  For plane 0 the cost comes from
+ * block_yrd() plus the skip-flag cost, and x->skip_txfm[0] is updated;
+ * for other planes it comes from model_rd_for_sb_uv().
+ */
+static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+                                 TX_SIZE tx_size, void *arg) {
+  struct estimate_block_intra_args* const args = arg;
+  VP9_COMP *const cpi = args->cpi;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
+  uint8_t *const src_buf_base = p->src.buf;
+  uint8_t *const dst_buf_base = pd->dst.buf;
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  int i, j;
+  int rate;
+  int64_t dist;
+  // NOTE(review): p/pd above always refer to plane 0, whatever `plane` is.
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+
+  p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
+  pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
+  // Use source buffer as an approximation for the fully reconstructed buffer.
+  vp9_predict_intra_block(xd, b_width_log2_lookup[plane_bsize],
+                          tx_size, args->mode,
+                          x->skip_encode ? p->src.buf : pd->dst.buf,
+                          x->skip_encode ? src_stride : dst_stride,
+                          pd->dst.buf, dst_stride,
+                          i, j, plane);
+
+  if (plane == 0) {
+    int64_t this_sse = INT64_MAX;  // no SSE available for block_yrd()
+    int is_skippable;
+    // TODO(jingning): This needs further refactoring.
+    block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
+              bsize_tx, VPXMIN(tx_size, TX_16X16));
+    x->skip_txfm[0] = is_skippable;
+    // TODO(jingning): Skip is signalled per prediction block not per tx block.
+    rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), is_skippable);
+  } else {
+    unsigned int var = 0;
+    unsigned int sse = 0;
+    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &rate, &dist, &var, &sse,
+                       plane, plane);
+  }
+  // Point the buffers back at the start of the plane.
+  p->src.buf = src_buf_base;
+  pd->dst.buf = dst_buf_base;
+  args->rate += rate;
+  args->dist += dist;
+}
+/* THR_MODES entry for each (reference frame, mode) pair; it is read as
+ * mode_idx[ref_frame][mode_offset(mode)]. */
+static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
+  {THR_DC, THR_V_PRED, THR_H_PRED, THR_TM},  // intra modes
+  {THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV},
+  {THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG},
+};
+/* Intra prediction modes, listed in the same order as row 0 of mode_idx. */
+static const PREDICTION_MODE intra_mode_list[] = {
+  DC_PRED, V_PRED, H_PRED, TM_PRED
+};
+
+// Maps a prediction mode to its column in mode_idx: inter modes go through
+// INTER_OFFSET(); DC_PRED, V_PRED, H_PRED and TM_PRED map to 0..3.  Any other
+// intra mode yields -1.
+static int mode_offset(const PREDICTION_MODE mode) {
+  if (mode >= NEARESTMV)
+    return INTER_OFFSET(mode);
+
+  if (mode == DC_PRED)
+    return 0;
+  if (mode == V_PRED)
+    return 1;
+  if (mode == H_PRED)
+    return 2;
+  if (mode == TM_PRED)
+    return 3;
+
+  return -1;
+}
+
+// Adapts the threshold frequency factor of one (ref_frame, mode) entry for
+// this block size: the factor decays by 1/16 when the entry is the best mode,
+// otherwise it grows by RD_THRESH_INC up to
+// adaptive_rd_thresh * RD_THRESH_MAX_FACT.
+static INLINE void update_thresh_freq_fact(VP9_COMP *cpi,
+                                           TileDataEnc *tile_data,
+                                           BLOCK_SIZE bsize,
+                                           MV_REFERENCE_FRAME ref_frame,
+                                           THR_MODES best_mode_idx,
+                                           PREDICTION_MODE mode) {
+  const THR_MODES entry = mode_idx[ref_frame][mode_offset(mode)];
+  int *const fact = &tile_data->thresh_freq_fact[bsize][entry];
+
+  if (entry != best_mode_idx) {
+    *fact = VPXMIN(*fact + RD_THRESH_INC,
+                   cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+  } else {
+    *fact -= (*fact >> 4);
+  }
+}
+
+// Picks the luma intra mode for a block by estimating the RD cost of every
+// mode from DC_PRED through H_PRED and keeping the cheapest.  The winner is
+// stored in xd->mi[0]->mode and its cost in *rd_cost.  As side effects the
+// block is marked INTRA_FRAME with an invalid MV and DC_PRED chroma mode,
+// x->skip_txfm is cleared, and mi->tx_size is set to the largest transform
+// allowed for bsize under the frame's tx_mode.  ctx is unused.
+void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  const TX_SIZE intra_tx_size =
+      VPXMIN(max_txsize_lookup[bsize],
+             tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+  const MODE_INFO *const above_mi = xd->mi[-xd->mi_stride];
+  const MODE_INFO *const left_mi = xd->left_available ? xd->mi[-1] : NULL;
+  const PREDICTION_MODE above_mode = vp9_above_block_mode(mi, above_mi, 0);
+  const PREDICTION_MODE left_mode = vp9_left_block_mode(mi, left_mi, 0);
+  // Mode costs conditioned on the above and left neighbours' modes.
+  int *const bmode_costs = cpi->y_mode_costs[above_mode][left_mode];
+  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
+  RD_COST this_rdc, best_rdc;
+  PREDICTION_MODE this_mode;
+
+  (void) ctx;
+  vp9_rd_cost_reset(&best_rdc);
+  vp9_rd_cost_reset(&this_rdc);
+
+  mi->ref_frame[0] = INTRA_FRAME;
+  mi->mv[0].as_int = INVALID_MV;
+  mi->uv_mode = DC_PRED;
+  memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+
+  // Change the limit of this loop to add other intra prediction
+  // mode tests.
+  for (this_mode = DC_PRED; this_mode <= H_PRED; ++this_mode) {
+    mi->tx_size = intra_tx_size;
+
+    // Start each mode from a clean accumulator.
+    args.mode = this_mode;
+    args.rate = 0;
+    args.dist = 0;
+    vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
+                                           estimate_block_intra, &args);
+
+    this_rdc.rate = args.rate + bmode_costs[this_mode];
+    this_rdc.dist = args.dist;
+    this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                             this_rdc.rate, this_rdc.dist);
+
+    if (this_rdc.rdcost < best_rdc.rdcost) {
+      mi->mode = this_mode;
+      best_rdc = this_rdc;
+    }
+  }
+
+  *rd_cost = best_rdc;
+}
+
+// Fills |ref_frame_cost| with the bit cost of signalling each single
+// reference: the intra/inter bit, then up to two single-reference bits.
+static void init_ref_frame_cost(VP9_COMMON *const cm,
+                                MACROBLOCKD *const xd,
+                                int ref_frame_cost[MAX_REF_FRAMES]) {
+  const vpx_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+  const vpx_prob single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+  const vpx_prob single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+  const int inter_bits = vp9_cost_bit(intra_inter_p, 1);
+  // GOLDEN and ALTREF share the p1 == 1 branch and are split by p2.
+  const int non_last_bits = inter_bits + vp9_cost_bit(single_p1, 1);
+
+  ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
+  ref_frame_cost[LAST_FRAME] = inter_bits + vp9_cost_bit(single_p1, 0);
+  ref_frame_cost[GOLDEN_FRAME] = non_last_bits + vp9_cost_bit(single_p2, 0);
+  ref_frame_cost[ALTREF_FRAME] = non_last_bits + vp9_cost_bit(single_p2, 1);
+}
+
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame;
+  PREDICTION_MODE pred_mode;
+} REF_MODE;  // One (reference frame, inter mode) search candidate.
+
+#define RT_INTER_MODES 8  // Entry count of each candidate table below.
+static const REF_MODE ref_mode_set[RT_INTER_MODES] = {  // Non-SVC order.
+    {LAST_FRAME, ZEROMV},
+    {LAST_FRAME, NEARESTMV},
+    {GOLDEN_FRAME, ZEROMV},
+    {LAST_FRAME, NEARMV},
+    {LAST_FRAME, NEWMV},
+    {GOLDEN_FRAME, NEARESTMV},
+    {GOLDEN_FRAME, NEARMV},
+    {GOLDEN_FRAME, NEWMV}
+};
+static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = {  // SVC order.
+    {LAST_FRAME, ZEROMV},
+    {GOLDEN_FRAME, ZEROMV},
+    {LAST_FRAME, NEARESTMV},
+    {LAST_FRAME, NEARMV},
+    {GOLDEN_FRAME, NEARESTMV},
+    {GOLDEN_FRAME, NEARMV},
+    {LAST_FRAME, NEWMV},
+    {GOLDEN_FRAME, NEWMV}
+};
+
+// Intra cost penalty for |bsize|: base penalty right-shifted for small blocks.
+int set_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const int noisy =
+      cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh;
+  int shift = 0;  // Blocks above 16x16, or high noise: full penalty.
+  if (!noisy && bsize <= BLOCK_16X16)
+    shift = (bsize <= BLOCK_8X8) ? 4 : 2;
+  return vp9_get_intra_cost_penalty(cm->base_qindex, cm->y_dc_delta_q,
+                                    cm->bit_depth) >> shift;
+}
+
+// Prepares the motion vector predictors of |ref_frame| for the non-RD inter
+// search, or flags it in |*ref_frame_skip_mask| when it is disabled in
+// cpi->ref_frame_flags or has no buffer. TODO(jingning): optimize further.
+static INLINE void find_predictors(
+    VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+    int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask,
+    const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col,
+    struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_buffer(cpi, ref_frame);
+  // Defaults, written whether or not the reference is usable.
+  x->pred_mv_sad[ref_frame] = INT_MAX;
+  frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+  frame_mv[ZEROMV][ref_frame].as_int = 0;
+  if (!(cpi->ref_frame_flags & flag_list[ref_frame]) || ref_buf == NULL) {
+    *ref_frame_skip_mask |= (1 << ref_frame);
+  } else {
+    int_mv *const mv_cands = x->mbmi_ext->ref_mvs[ref_frame];
+    const struct scale_factors *const ref_sf =
+        &cm->frame_refs[ref_frame - 1].sf;
+    vp9_setup_pred_block(xd, yv12_mb[ref_frame], ref_buf, mi_row, mi_col,
+                         ref_sf, ref_sf);
+    // Candidate MVs: vp9_find_mv_refs() when cm->use_prev_frame_mvs is set,
+    // else mv_refs_rt(), whose return value lands in const_motion[ref_frame].
+    if (cm->use_prev_frame_mvs) {
+      vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, mv_cands, mi_row, mi_col,
+                       x->mbmi_ext->mode_context);
+    } else {
+      const_motion[ref_frame] = mv_refs_rt(
+          cpi, cm, x, xd, &tile_data->tile_info, xd->mi[0], ref_frame,
+          mv_cands, &frame_mv[NEWMV][ref_frame], mi_row, mi_col,
+          (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id));
+    }
+    vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, mv_cands,
+                          &frame_mv[NEARESTMV][ref_frame],
+                          &frame_mv[NEARMV][ref_frame]);
+    // vp9_mv_pred() is skipped for scaled references and below BLOCK_8X8.
+    if (!vp9_is_scaled(ref_sf) && bsize >= BLOCK_8X8)
+      vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, ref_buf->y_stride,
+                  ref_frame, bsize);
+  }
+}
+void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                         TileDataEnc *tile_data,
+                         int mi_row, int mi_col, RD_COST *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  VP9_COMMON *const cm = &cpi->common;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  const SVC *const svc = &cpi->svc;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  PREDICTION_MODE best_mode = ZEROMV;
+  MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
+  MV_REFERENCE_FRAME usable_ref_frame;
+  TX_SIZE best_tx_size = TX_SIZES;
+  INTERP_FILTER best_pred_filter = EIGHTTAP;
+  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+  RD_COST this_rdc, best_rdc;
+  uint8_t skip_txfm = SKIP_TXFM_NONE, best_mode_skip_txfm = SKIP_TXFM_NONE;
+  // var_y and sse_y are saved to be used in skipping checking
+  unsigned int var_y = UINT_MAX;
+  unsigned int sse_y = UINT_MAX;
+  const int intra_cost_penalty = set_intra_cost_penalty(cpi, bsize);
+  int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
+                                           intra_cost_penalty, 0);
+  const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize];
+  const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+  INTERP_FILTER filter_ref;
+  const int bsl = mi_width_log2_lookup[bsize];
+  const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
+      (((mi_row + mi_col) >> bsl) +
+       get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
+  int const_motion[MAX_REF_FRAMES] = { 0 };
+  const int bh = num_4x4_blocks_high_lookup[bsize] << 2;
+  const int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
+  // For speed 6, the result of interp filter is reused later in actual encoding
+  // process.
+  // tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
+  PRED_BUFFER tmp[4];
+  DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 64 * 64]);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64]);
+#endif
+  struct buf_2d orig_dst = pd->dst;
+  PRED_BUFFER *best_pred = NULL;
+  PRED_BUFFER *this_mode_pred = NULL;
+  const int pixels_in_block = bh * bw;
+  int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready;
+  int ref_frame_skip_mask = 0;
+  int idx;
+  int best_pred_sad = INT_MAX;
+  int best_early_term = 0;
+  int ref_frame_cost[MAX_REF_FRAMES];
+  int svc_force_zero_mode[3] = {0};
+  int perform_intra_pred = 1;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  int64_t zero_last_cost_orig = INT64_MAX;
+#endif
+
+  init_ref_frame_cost(cm, xd, ref_frame_cost);
+
+  if (reuse_inter_pred) {
+    int i;
+    for (i = 0; i < 3; i++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth)
+        tmp[i].data = CONVERT_TO_BYTEPTR(&pred_buf_16[pixels_in_block * i]);
+      else
+        tmp[i].data = &pred_buf[pixels_in_block * i];
+#else
+      tmp[i].data = &pred_buf[pixels_in_block * i];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      tmp[i].stride = bw;
+      tmp[i].in_use = 0;
+    }
+    tmp[3].data = pd->dst.buf;
+    tmp[3].stride = pd->dst.stride;
+    tmp[3].in_use = 0;
+  }
+
+  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+  x->skip = 0;
+
+  if (xd->up_available)
+    filter_ref = xd->mi[-xd->mi_stride]->interp_filter;
+  else if (xd->left_available)
+    filter_ref = xd->mi[-1]->interp_filter;
+  else
+    filter_ref = cm->interp_filter;
+
+  // initialize mode decisions
+  vp9_rd_cost_reset(&best_rdc);
+  vp9_rd_cost_reset(rd_cost);
+  mi->sb_type = bsize;
+  mi->ref_frame[0] = NONE;
+  mi->ref_frame[1] = NONE;
+  mi->tx_size = VPXMIN(max_txsize_lookup[bsize],
+                       tx_mode_to_biggest_tx_size[cm->tx_mode]);
+
+  if (sf->short_circuit_flat_blocks) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      x->source_variance = vp9_high_get_sby_perpixel_variance(
+          cpi, &x->plane[0].src, bsize, xd->bd);
+    else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      x->source_variance =
+          vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  }
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  vp9_denoiser_reset_frame_stats(ctx);
+#endif
+
+  if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc) {
+    usable_ref_frame = LAST_FRAME;
+  } else {
+    usable_ref_frame = GOLDEN_FRAME;
+  }
+
+  // If the reference is temporally aligned with current superframe
+  // (e.g., spatial reference within superframe), constrain the inter mode:
+  // for now only test zero motion.
+  if (cpi->use_svc && svc ->force_zero_mode_spatial_ref) {
+    if (svc->ref_frame_index[cpi->lst_fb_idx] == svc->current_superframe)
+      svc_force_zero_mode[LAST_FRAME - 1] = 1;
+    if (svc->ref_frame_index[cpi->gld_fb_idx] == svc->current_superframe)
+      svc_force_zero_mode[GOLDEN_FRAME - 1] = 1;
+  }
+
+  for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
+    find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
+                    &ref_frame_skip_mask, flag_list, tile_data, mi_row, mi_col,
+                    yv12_mb, bsize);
+  }
+
+  for (idx = 0; idx < RT_INTER_MODES; ++idx) {
+    int rate_mv = 0;
+    int mode_rd_thresh;
+    int mode_index;
+    int i;
+    int64_t this_sse;
+    int is_skippable;
+    int this_early_term = 0;
+    PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+    if (cpi->use_svc)
+      this_mode = ref_mode_set_svc[idx].pred_mode;
+
+    if (sf->short_circuit_flat_blocks && x->source_variance == 0 &&
+        this_mode != NEARESTMV) {
+      continue;
+    }
+
+    if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
+      continue;
+
+    ref_frame = ref_mode_set[idx].ref_frame;
+    if (cpi->use_svc) {
+      ref_frame = ref_mode_set_svc[idx].ref_frame;
+    }
+
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
+      continue;
+    if (const_motion[ref_frame] && this_mode == NEARMV)
+      continue;
+
+    if (cpi->use_svc) {
+      if (svc_force_zero_mode[ref_frame - 1] &&
+          frame_mv[this_mode][ref_frame].as_int != 0)
+        continue;
+    }
+
+    if (!(frame_mv[this_mode][ref_frame].as_int == 0 &&
+        ref_frame == LAST_FRAME)) {
+      i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
+      if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking)
+        if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
+          ref_frame_skip_mask |= (1 << ref_frame);
+    }
+    if (ref_frame_skip_mask & (1 << ref_frame))
+      continue;
+
+    // Select prediction reference frames.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+
+    mi->ref_frame[0] = ref_frame;
+    set_ref_ptrs(cm, xd, ref_frame, NONE);
+
+    mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
+    mode_rd_thresh = best_mode_skip_txfm ?
+            rd_threshes[mode_index] << 1 : rd_threshes[mode_index];
+    if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+                            rd_thresh_freq_fact[mode_index]))
+      continue;
+
+    if (this_mode == NEWMV) {
+      if (ref_frame > LAST_FRAME && !cpi->use_svc) {
+        int tmp_sad;
+        int dis, cost_list[5];
+
+        if (bsize < BLOCK_16X16)
+          continue;
+
+        tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+
+        if (tmp_sad > x->pred_mv_sad[LAST_FRAME])
+          continue;
+        if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad)
+          continue;
+
+        frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int;
+        rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
+          &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+          x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+        frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+        frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+        cpi->find_fractional_mv_step(x, &frame_mv[NEWMV][ref_frame].as_mv,
+          &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+          cpi->common.allow_high_precision_mv,
+          x->errorperbit,
+          &cpi->fn_ptr[bsize],
+          cpi->sf.mv.subpel_force_stop,
+          cpi->sf.mv.subpel_iters_per_step,
+          cond_cost_list(cpi, cost_list),
+          x->nmvjointcost, x->mvcost, &dis,
+          &x->pred_sse[ref_frame], NULL, 0, 0);
+      } else if (svc->use_base_mv && svc->spatial_layer_id) {
+        if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV &&
+            frame_mv[NEWMV][ref_frame].as_int != 0) {
+          const int pre_stride = xd->plane[0].pre[0].stride;
+          int base_mv_sad = INT_MAX;
+          const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
+              (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride +
+              (frame_mv[NEWMV][ref_frame].as_mv.col >> 3);
+          base_mv_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
+                                       x->plane[0].src.stride,
+                                       pre_buf, pre_stride);
+
+          // TODO(wonkap): make the decision to use base layer mv on RD;
+          // not just SAD.
+          if (base_mv_sad < x->pred_mv_sad[ref_frame]) {
+            // Base layer mv is good.
+            if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost, 1)) {
+                continue;
+            }
+          } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+            &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost, 0)) {
+            continue;
+          }
+        } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+          &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost, 0)) {
+          continue;
+        }
+      } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+        &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost, 0)) {
+        continue;
+      }
+    }
+
+    if (this_mode == NEWMV && ref_frame == LAST_FRAME &&
+        frame_mv[NEWMV][LAST_FRAME].as_int != INVALID_MV) {
+      const int pre_stride = xd->plane[0].pre[0].stride;
+      const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
+          (frame_mv[NEWMV][LAST_FRAME].as_mv.row >> 3) * pre_stride +
+          (frame_mv[NEWMV][LAST_FRAME].as_mv.col >> 3);
+      best_pred_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
+                                   x->plane[0].src.stride,
+                                   pre_buf, pre_stride);
+      x->pred_mv_sad[LAST_FRAME] = best_pred_sad;
+    }
+
+    if (cpi->use_svc) {
+      if (this_mode == NEWMV && ref_frame == GOLDEN_FRAME &&
+          frame_mv[NEWMV][GOLDEN_FRAME].as_int != INVALID_MV) {
+        const int pre_stride = xd->plane[0].pre[0].stride;
+        const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
+            (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.row >> 3) * pre_stride +
+            (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.col >> 3);
+        best_pred_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
+                                               x->plane[0].src.stride,
+                                               pre_buf, pre_stride);
+        x->pred_mv_sad[GOLDEN_FRAME] = best_pred_sad;
+      }
+    }
+
+
+    if (this_mode != NEARESTMV &&
+        frame_mv[this_mode][ref_frame].as_int ==
+            frame_mv[NEARESTMV][ref_frame].as_int)
+      continue;
+
+    mi->mode = this_mode;
+    mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+
+    // Search for the best prediction filter type, when the resulting
+    // motion vector is at sub-pixel accuracy level for luma component, i.e.,
+    // the last three bits are all zeros.
+    if (reuse_inter_pred) {
+      if (!this_mode_pred) {
+        this_mode_pred = &tmp[3];
+      } else {
+        this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+        pd->dst.buf = this_mode_pred->data;
+        pd->dst.stride = bw;
+      }
+    }
+
+    if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search
+        && (ref_frame == LAST_FRAME ||
+            (ref_frame == GOLDEN_FRAME && cpi->use_svc))
+        && (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
+      int pf_rate[3];
+      int64_t pf_dist[3];
+      unsigned int pf_var[3];
+      unsigned int pf_sse[3];
+      TX_SIZE pf_tx_size[3];
+      int64_t best_cost = INT64_MAX;
+      INTERP_FILTER best_filter = SWITCHABLE, filter;
+      PRED_BUFFER *current_pred = this_mode_pred;
+
+      for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) {
+        int64_t cost;
+        mi->interp_filter = filter;
+        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter],
+                          &pf_var[filter], &pf_sse[filter]);
+        pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
+        cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
+        pf_tx_size[filter] = mi->tx_size;
+        if (cost < best_cost) {
+          best_filter = filter;
+          best_cost = cost;
+          skip_txfm = x->skip_txfm[0];
+
+          if (reuse_inter_pred) {
+            if (this_mode_pred != current_pred) {
+              free_pred_buffer(this_mode_pred);
+              this_mode_pred = current_pred;
+            }
+
+            if (filter < EIGHTTAP_SHARP) {
+              current_pred = &tmp[get_pred_buffer(tmp, 3)];
+              pd->dst.buf = current_pred->data;
+              pd->dst.stride = bw;
+            }
+          }
+        }
+      }
+
+      if (reuse_inter_pred && this_mode_pred != current_pred)
+        free_pred_buffer(current_pred);
+
+      mi->interp_filter = best_filter;
+      mi->tx_size = pf_tx_size[best_filter];
+      this_rdc.rate = pf_rate[best_filter];
+      this_rdc.dist = pf_dist[best_filter];
+      var_y = pf_var[best_filter];
+      sse_y = pf_sse[best_filter];
+      x->skip_txfm[0] = skip_txfm;
+      if (reuse_inter_pred) {
+        pd->dst.buf = this_mode_pred->data;
+        pd->dst.stride = this_mode_pred->stride;
+      }
+    } else {
+      mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
+      vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+
+      // For large partition blocks, extra testing is done.
+      if (bsize > BLOCK_32X32 &&
+        !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
+        cm->base_qindex) {
+        model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate,
+                                &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col,
+                                &this_early_term);
+      } else {
+        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+                          &var_y, &sse_y);
+      }
+    }
+
+    if (!this_early_term) {
+      this_sse = (int64_t)sse_y;
+      block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
+                &this_sse, 0, bsize, VPXMIN(mi->tx_size, TX_16X16));
+      x->skip_txfm[0] = is_skippable;
+      if (is_skippable) {
+        this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+      } else {
+        if (RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist) <
+            RDCOST(x->rdmult, x->rddiv, 0, this_sse)) {
+          this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+        } else {
+          this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+          this_rdc.dist = this_sse;
+          x->skip_txfm[0] = SKIP_TXFM_AC_DC;
+        }
+      }
+
+      if (cm->interp_filter == SWITCHABLE) {
+        if ((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07)
+          this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
+      }
+    } else {
+      this_rdc.rate += cm->interp_filter == SWITCHABLE ?
+          vp9_get_switchable_rate(cpi, xd) : 0;
+      this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+    }
+
+    if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+      int uv_rate = 0;
+      int64_t uv_dist = 0;
+      const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]);
+      if (x->color_sensitivity[0])
+        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
+      if (x->color_sensitivity[1])
+        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
+      model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &uv_rate, &uv_dist,
+                         &var_y, &sse_y, 1, 2);
+      this_rdc.rate += uv_rate;
+      this_rdc.dist += uv_dist;
+    }
+
+    this_rdc.rate += rate_mv;
+    this_rdc.rate +=
+        cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]][INTER_OFFSET(
+            this_mode)];
+    this_rdc.rate += ref_frame_cost[ref_frame];
+    this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+
+    if (cpi->oxcf.speed >= 5 &&
+        cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+        !x->sb_is_skin) {
+      // Bias against non-zero (above some threshold) motion for large blocks.
+      // This is temporary fix to avoid selection of large mv for big blocks.
+      if (frame_mv[this_mode][ref_frame].as_mv.row > 64 ||
+          frame_mv[this_mode][ref_frame].as_mv.row < -64 ||
+          frame_mv[this_mode][ref_frame].as_mv.col > 64 ||
+          frame_mv[this_mode][ref_frame].as_mv.col < -64) {
+        if (bsize == BLOCK_64X64)
+          this_rdc.rdcost = this_rdc.rdcost << 1;
+        else if (bsize >= BLOCK_32X32)
+          this_rdc.rdcost = 3 * this_rdc.rdcost >> 1;
+      }
+      // If noise estimation is enabled, and estimated level is above threshold,
+      // add a bias to LAST reference with small motion, for large blocks.
+      if (cpi->noise_estimate.enabled &&
+          cpi->noise_estimate.level >= kMedium &&
+          bsize >= BLOCK_32X32 &&
+          ref_frame == LAST_FRAME &&
+          frame_mv[this_mode][ref_frame].as_mv.row < 8 &&
+          frame_mv[this_mode][ref_frame].as_mv.row > -8 &&
+          frame_mv[this_mode][ref_frame].as_mv.col < 8 &&
+          frame_mv[this_mode][ref_frame].as_mv.col > -8)
+        this_rdc.rdcost = 7 * this_rdc.rdcost >> 3;
+    }
+
+    // Skipping checking: test to see if this block can be reconstructed by
+    // prediction only.
+    if (cpi->allow_encode_breakout) {
+      encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode,
+                           var_y, sse_y, yv12_mb, &this_rdc.rate,
+                           &this_rdc.dist);
+      if (x->skip) {
+        this_rdc.rate += rate_mv;
+        this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate,
+                                 this_rdc.dist);
+      }
+    }
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0) {
+      vp9_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx);
+      // Keep track of zero_last cost.
+      if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0)
+        zero_last_cost_orig = this_rdc.rdcost;
+    }
+#else
+    (void)ctx;
+#endif
+
+    if (this_rdc.rdcost < best_rdc.rdcost || x->skip) {
+      best_rdc = this_rdc;
+      best_mode = this_mode;
+      best_pred_filter = mi->interp_filter;
+      best_tx_size = mi->tx_size;
+      best_ref_frame = ref_frame;
+      best_mode_skip_txfm = x->skip_txfm[0];
+      best_early_term = this_early_term;
+
+      if (reuse_inter_pred) {
+        free_pred_buffer(best_pred);
+        best_pred = this_mode_pred;
+      }
+    } else {
+      if (reuse_inter_pred)
+        free_pred_buffer(this_mode_pred);
+    }
+
+    if (x->skip)
+      break;
+
+    // If early termination flag is 1 and at least 2 modes are checked,
+    // the mode search is terminated.
+    if (best_early_term && idx > 0) {
+      x->skip = 1;
+      break;
+    }
+  }
+
+  mi->mode          = best_mode;
+  mi->interp_filter = best_pred_filter;
+  mi->tx_size       = best_tx_size;
+  mi->ref_frame[0]  = best_ref_frame;
+  mi->mv[0].as_int  = frame_mv[best_mode][best_ref_frame].as_int;
+  xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int;
+  x->skip_txfm[0] = best_mode_skip_txfm;
+
+  // Perform intra prediction only if base layer is chosen as the reference.
+  if (cpi->svc.spatial_layer_id) {
+    perform_intra_pred =
+        cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame ||
+        (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame
+            && svc_force_zero_mode[best_ref_frame]);
+    inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh;
+  }
+  // Perform intra prediction search, if the best SAD is above a certain
+  // threshold.
+  if (perform_intra_pred &&
+      ((best_rdc.rdcost == INT64_MAX ||
+      (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
+       bsize <= cpi->sf.max_intra_bsize)))) {
+    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
+    int i;
+    TX_SIZE best_intra_tx_size = TX_SIZES;
+    TX_SIZE intra_tx_size =
+        VPXMIN(max_txsize_lookup[bsize],
+               tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && intra_tx_size > TX_16X16)
+      intra_tx_size = TX_16X16;
+
+    if (reuse_inter_pred && best_pred != NULL) {
+      if (best_pred->data == orig_dst.buf) {
+        this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth)
+          vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
+                                   this_mode_pred->data, this_mode_pred->stride,
+                                   NULL, 0, NULL, 0, bw, bh, xd->bd);
+        else
+          vpx_convolve_copy(best_pred->data, best_pred->stride,
+                          this_mode_pred->data, this_mode_pred->stride,
+                          NULL, 0, NULL, 0, bw, bh);
+#else
+        vpx_convolve_copy(best_pred->data, best_pred->stride,
+                          this_mode_pred->data, this_mode_pred->stride,
+                          NULL, 0, NULL, 0, bw, bh);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        best_pred = this_mode_pred;
+      }
+    }
+    pd->dst = orig_dst;
+
+    for (i = 0; i < 4; ++i) {
+      const PREDICTION_MODE this_mode = intra_mode_list[i];
+      THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
+      int mode_rd_thresh = rd_threshes[mode_index];
+      if (sf->short_circuit_flat_blocks && x->source_variance == 0 &&
+          this_mode != DC_PRED) {
+        continue;
+      }
+
+      if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize]))
+        continue;
+
+      if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+                              rd_thresh_freq_fact[mode_index]))
+        continue;
+
+      mi->mode = this_mode;
+      mi->ref_frame[0] = INTRA_FRAME;
+      args.mode = this_mode;
+      args.rate = 0;
+      args.dist = 0;
+      mi->tx_size = intra_tx_size;
+      vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
+                                             estimate_block_intra, &args);
+      // Inter and intra RD will mismatch in scale for non-screen content.
+      if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) {
+        if (x->color_sensitivity[0])
+          vp9_foreach_transformed_block_in_plane(xd, bsize, 1,
+                                                 estimate_block_intra, &args);
+        if (x->color_sensitivity[1])
+          vp9_foreach_transformed_block_in_plane(xd, bsize, 2,
+                                                 estimate_block_intra, &args);
+      }
+      this_rdc.rate = args.rate;
+      this_rdc.dist = args.dist;
+      this_rdc.rate += cpi->mbmode_cost[this_mode];
+      this_rdc.rate += ref_frame_cost[INTRA_FRAME];
+      this_rdc.rate += intra_cost_penalty;
+      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                               this_rdc.rate, this_rdc.dist);
+
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = this_rdc;
+        best_mode = this_mode;
+        best_intra_tx_size = mi->tx_size;
+        best_ref_frame = INTRA_FRAME;
+        mi->uv_mode = this_mode;
+        mi->mv[0].as_int = INVALID_MV;
+        best_mode_skip_txfm = x->skip_txfm[0];
+      }
+    }
+
+    // Reset mb_mode_info to the best inter mode.
+    if (best_ref_frame != INTRA_FRAME) {
+      mi->tx_size = best_tx_size;
+    } else {
+      mi->tx_size = best_intra_tx_size;
+    }
+  }
+
+  pd->dst = orig_dst;
+  mi->mode = best_mode;
+  mi->ref_frame[0] = best_ref_frame;
+  x->skip_txfm[0] = best_mode_skip_txfm;
+
+  if (reuse_inter_pred && best_pred != NULL) {
+    if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth)
+        vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
+                                 pd->dst.buf, pd->dst.stride, NULL, 0,
+                                 NULL, 0, bw, bh, xd->bd);
+      else
+        vpx_convolve_copy(best_pred->data, best_pred->stride,
+                          pd->dst.buf, pd->dst.stride, NULL, 0,
+                          NULL, 0, bw, bh);
+#else
+      vpx_convolve_copy(best_pred->data, best_pred->stride,
+                        pd->dst.buf, pd->dst.stride, NULL, 0,
+                        NULL, 0, bw, bh);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+  }
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      cpi->resize_pending == 0) {
+    VP9_DENOISER_DECISION decision = COPY_BLOCK;
+    vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
+                         VPXMAX(BLOCK_8X8, bsize), ctx, &decision);
+    // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on denoised
+    // result. Only do this under noise conditions, and if rdcost of ZEROMV on
+    // original source is not significantly higher than rdcost of best mode.
+    if (((best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
+        (best_ref_frame == GOLDEN_FRAME && decision == FILTER_ZEROMV_BLOCK)) &&
+        cpi->noise_estimate.enabled &&
+        cpi->noise_estimate.level > kLow &&
+        zero_last_cost_orig < (best_rdc.rdcost << 3)) {
+      // Check if we should pick ZEROMV on denoised signal.
+      int rate = 0;
+      int64_t dist = 0;
+      mi->mode = ZEROMV;
+      mi->ref_frame[0] = LAST_FRAME;
+      mi->ref_frame[1] = NONE;
+      mi->mv[0].as_int = 0;
+      mi->interp_filter = EIGHTTAP;
+      xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
+      vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+      model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+      this_rdc.rate = rate + ref_frame_cost[LAST_FRAME] +
+          cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]]
+                              [INTER_OFFSET(ZEROMV)];
+      this_rdc.dist = dist;
+      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist);
+      // Switch to ZEROMV if the rdcost for ZEROMV on denoised source
+      // is lower than best_ref mode (on original source).
+      if (this_rdc.rdcost > best_rdc.rdcost) {
+        this_rdc = best_rdc;
+        mi->mode = best_mode;
+        mi->ref_frame[0] = best_ref_frame;
+        mi->interp_filter = best_pred_filter;
+        if (best_ref_frame == INTRA_FRAME)
+          mi->mv[0].as_int = INVALID_MV;
+        else if (best_ref_frame == GOLDEN_FRAME) {
+          mi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
+          if (reuse_inter_pred) {
+            xd->plane[0].pre[0] = yv12_mb[GOLDEN_FRAME][0];
+            vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+          }
+        }
+        mi->tx_size = best_tx_size;
+        x->skip_txfm[0] = best_mode_skip_txfm;
+      } else {
+        best_ref_frame = LAST_FRAME;
+        best_rdc = this_rdc;
+      }
+    }
+  }
+#endif
+
+  if (cpi->sf.adaptive_rd_thresh) {
+    THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mi->mode)];
+
+    if (best_ref_frame == INTRA_FRAME) {
+      // Only consider the modes that are included in the intra_mode_list.
+      int intra_modes = sizeof(intra_mode_list)/sizeof(PREDICTION_MODE);
+      int i;
+
+      // TODO(yunqingwang): Check intra mode mask and only update freq_fact
+      // for those valid modes.
+      for (i = 0; i < intra_modes; i++) {
+        update_thresh_freq_fact(cpi, tile_data, bsize, INTRA_FRAME,
+                                best_mode_idx, intra_mode_list[i]);
+      }
+    } else {
+      for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+        PREDICTION_MODE this_mode;
+        if (best_ref_frame != ref_frame) continue;
+        for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+          update_thresh_freq_fact(cpi, tile_data, bsize, ref_frame,
+                                  best_mode_idx, this_mode);
+        }
+      }
+    }
+  }
+
+  *rd_cost = best_rdc;
+}
+
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+                                int mi_row, int mi_col, RD_COST *rd_cost,
+                                BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  VP9_COMMON *const cm = &cpi->common;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const struct segmentation *const seg = &cm->seg;
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE;
+  MV_REFERENCE_FRAME best_ref_frame = NONE;
+  unsigned char segment_id = mi->segment_id;
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+  int64_t best_rd = INT64_MAX;
+  b_mode_info bsi[MAX_REF_FRAMES][4];
+  int ref_frame_skip_mask = 0;
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  int idx, idy;
+  // Picks the best inter mode per sub-block over the LAST/GOLDEN references.
+  x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+  ctx->pred_pixel_ready = 0;
+  // Prepare prediction buffers and MV candidates; mask unusable references.
+  for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+    int_mv dummy_mv[2];
+    x->pred_mv_sad[ref_frame] = INT_MAX;
+
+    if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+      int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame];
+      const struct scale_factors *const sf =
+                             &cm->frame_refs[ref_frame - 1].sf;
+      vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
+                           sf, sf);
+      vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame,
+                       candidates, mi_row, mi_col, mbmi_ext->mode_context);
+
+      vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+                            &dummy_mv[0], &dummy_mv[1]);
+    } else {
+      ref_frame_skip_mask |= (1 << ref_frame);
+    }
+  }
+
+  mi->sb_type = bsize;
+  mi->tx_size = TX_4X4;
+  mi->uv_mode = DC_PRED;
+  mi->ref_frame[0] = LAST_FRAME;
+  mi->ref_frame[1] = NONE;
+  mi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                      : cm->interp_filter;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+    int64_t this_rd = 0;
+    int plane;
+
+    if (ref_frame_skip_mask & (1 << ref_frame))
+      continue;
+
+#if CONFIG_BETTER_HW_COMPATIBILITY
+    if ((bsize == BLOCK_8X4 || bsize == BLOCK_4X8) &&
+        ref_frame > INTRA_FRAME &&
+        vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+      continue;
+#endif
+
+    // TODO(jingning, agrange): Scaling reference frame not supported for
+    // sub8x8 blocks. Is this supported now?
+    if (ref_frame > INTRA_FRAME &&
+        vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+      continue;
+
+    // If the segment reference frame feature is enabled....
+    // then do nothing if the current ref frame is not allowed..
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+      continue;
+
+    mi->ref_frame[0] = ref_frame;
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+    // Select prediction reference frames.
+    for (plane = 0; plane < MAX_MB_PLANE; plane++)
+      xd->plane[plane].pre[0] = yv12_mb[ref_frame][plane];
+
+    for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+      for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+        int_mv b_mv[MB_MODE_COUNT];
+        int64_t b_best_rd = INT64_MAX;
+        const int i = idy * 2 + idx;
+        PREDICTION_MODE this_mode;
+        RD_COST this_rdc;
+        unsigned int var_y, sse_y;
+
+        struct macroblock_plane *p = &x->plane[0];
+        struct macroblockd_plane *pd = &xd->plane[0];
+
+        const struct buf_2d orig_src = p->src;
+        const struct buf_2d orig_dst = pd->dst;
+        struct buf_2d orig_pre[2];
+        memcpy(orig_pre, xd->plane[0].pre, sizeof(orig_pre));
+
+        // set buffer pointers for sub8x8 motion search.
+        p->src.buf =
+            &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+        pd->dst.buf =
+            &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
+        pd->pre[0].buf =
+            &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8,
+                                                    i, pd->pre[0].stride)];
+
+        b_mv[ZEROMV].as_int = 0;
+        b_mv[NEWMV].as_int = INVALID_MV;
+        vp9_append_sub8x8_mvs_for_idx(cm, xd, i, 0, mi_row, mi_col,
+                                      &b_mv[NEARESTMV],
+                                      &b_mv[NEARMV],
+                                      mbmi_ext->mode_context);
+
+        for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+          int b_rate = 0;
+          xd->mi[0]->bmi[i].as_mv[0].as_int = b_mv[this_mode].as_int;
+
+          if (this_mode == NEWMV) {
+            const int step_param = cpi->sf.mv.fullpel_search_step_param;
+            MV mvp_full;
+            MV tmp_mv;
+            int cost_list[5];
+            const int tmp_col_min = x->mv_col_min;
+            const int tmp_col_max = x->mv_col_max;
+            const int tmp_row_min = x->mv_row_min;
+            const int tmp_row_max = x->mv_row_max;
+            int dummy_dist;
+
+            if (i == 0) {
+              mvp_full.row = b_mv[NEARESTMV].as_mv.row >> 3;
+              mvp_full.col = b_mv[NEARESTMV].as_mv.col >> 3;
+            } else {
+              mvp_full.row = xd->mi[0]->bmi[0].as_mv[0].as_mv.row >> 3;
+              mvp_full.col = xd->mi[0]->bmi[0].as_mv[0].as_mv.col >> 3;
+            }
+            // Center the range on the same ref MV the search and cost use.
+            vp9_set_mv_search_range(x, &mbmi_ext->ref_mvs[ref_frame][0].as_mv);
+
+            vp9_full_pixel_search(
+                cpi, x, bsize, &mvp_full, step_param, x->sadperbit4,
+                cond_cost_list(cpi, cost_list),
+                &mbmi_ext->ref_mvs[ref_frame][0].as_mv, &tmp_mv,
+                INT_MAX, 0);
+
+            x->mv_col_min = tmp_col_min;
+            x->mv_col_max = tmp_col_max;
+            x->mv_row_min = tmp_row_min;
+            x->mv_row_max = tmp_row_max;
+
+            // calculate the bit cost on motion vector
+            mvp_full.row = tmp_mv.row * 8;
+            mvp_full.col = tmp_mv.col * 8;
+
+            b_rate += vp9_mv_bit_cost(&mvp_full,
+                                      &mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+                                      x->nmvjointcost, x->mvcost,
+                                      MV_COST_WEIGHT);
+
+            b_rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]]
+                                          [INTER_OFFSET(NEWMV)];
+            if (RDCOST(x->rdmult, x->rddiv, b_rate, 0) > b_best_rd)
+              continue;
+
+            cpi->find_fractional_mv_step(x, &tmp_mv,
+                                         &mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+                                         cpi->common.allow_high_precision_mv,
+                                         x->errorperbit,
+                                         &cpi->fn_ptr[bsize],
+                                         cpi->sf.mv.subpel_force_stop,
+                                         cpi->sf.mv.subpel_iters_per_step,
+                                         cond_cost_list(cpi, cost_list),
+                                         x->nmvjointcost, x->mvcost,
+                                         &dummy_dist,
+                                         &x->pred_sse[ref_frame], NULL, 0, 0);
+
+            xd->mi[0]->bmi[i].as_mv[0].as_mv = tmp_mv;
+          } else {
+            b_rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]]
+                                          [INTER_OFFSET(this_mode)];
+          }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            vp9_highbd_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
+                                    pd->dst.buf, pd->dst.stride,
+                                    &xd->mi[0]->bmi[i].as_mv[0].as_mv,
+                                    &xd->block_refs[0]->sf,
+                                    4 * num_4x4_blocks_wide,
+                                    4 * num_4x4_blocks_high, 0,
+                                    vp9_filter_kernels[mi->interp_filter],
+                                    MV_PRECISION_Q3,
+                                    mi_col * MI_SIZE + 4 * (i & 0x01),
+                                    mi_row * MI_SIZE + 4 * (i >> 1), xd->bd);
+          } else {
+#endif
+            vp9_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
+                                     pd->dst.buf, pd->dst.stride,
+                                     &xd->mi[0]->bmi[i].as_mv[0].as_mv,
+                                     &xd->block_refs[0]->sf,
+                                     4 * num_4x4_blocks_wide,
+                                     4 * num_4x4_blocks_high, 0,
+                                     vp9_filter_kernels[mi->interp_filter],
+                                     MV_PRECISION_Q3,
+                                     mi_col * MI_SIZE + 4 * (i & 0x01),
+                                     mi_row * MI_SIZE + 4 * (i >> 1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          }
+#endif
+
+          model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+                            &var_y, &sse_y);
+
+          this_rdc.rate += b_rate;
+          this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+                                   this_rdc.rate, this_rdc.dist);
+          if (this_rdc.rdcost < b_best_rd) {
+            b_best_rd = this_rdc.rdcost;
+            bsi[ref_frame][i].as_mode = this_mode;
+            bsi[ref_frame][i].as_mv[0].as_mv = xd->mi[0]->bmi[i].as_mv[0].as_mv;
+          }
+        }  // mode search
+
+        // restore source and prediction buffer pointers.
+        p->src = orig_src;
+        pd->pre[0] = orig_pre[0];
+        pd->dst = orig_dst;
+        this_rd += b_best_rd;
+
+        xd->mi[0]->bmi[i] = bsi[ref_frame][i];
+        if (num_4x4_blocks_wide > 1)
+          xd->mi[0]->bmi[i + 1] = xd->mi[0]->bmi[i];
+        if (num_4x4_blocks_high > 1)
+          xd->mi[0]->bmi[i + 2] = xd->mi[0]->bmi[i];
+      }
+    }  // loop through sub8x8 blocks
+
+    if (this_rd < best_rd) {
+      best_rd = this_rd;
+      best_ref_frame = ref_frame;
+    }
+  }  // reference frames
+  // NOTE(review): best_ref_frame stays NONE if every reference was skipped.
+  mi->tx_size = TX_4X4;
+  mi->ref_frame[0] = best_ref_frame;
+  for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+    for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+      const int block = idy * 2 + idx;
+      xd->mi[0]->bmi[block] = bsi[best_ref_frame][block];
+      if (num_4x4_blocks_wide > 1)
+        xd->mi[0]->bmi[block + 1] = bsi[best_ref_frame][block];
+      if (num_4x4_blocks_high > 1)
+        xd->mi[0]->bmi[block + 2] = bsi[best_ref_frame][block];
+    }
+  }
+  mi->mode = xd->mi[0]->bmi[3].as_mode;
+  ctx->mic = *(xd->mi[0]);
+  ctx->mbmi_ext = *x->mbmi_ext;
+  ctx->skip_txfm[0] = SKIP_TXFM_NONE;
+  ctx->skip = 0;
+  // Dummy assignment for speed -5. No effect in speed -6.
+  rd_cost->rdcost = best_rd;
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_pickmode.h b/libs/libvpx/vp9/encoder/vp9_pickmode.h
new file mode 100644
index 0000000000..a43bb81260
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_pickmode.h
@@ -0,0 +1,38 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_PICKMODE_H_
+#define VP9_ENCODER_VP9_PICKMODE_H_
+
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+
+void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                         TileDataEnc *tile_data,
+                         int mi_row, int mi_col, RD_COST *rd_cost,
+                         BLOCK_SIZE bsize,
+                         PICK_MODE_CONTEXT *ctx);
+
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+                                int mi_row, int mi_col, RD_COST *rd_cost,
+                                BLOCK_SIZE bsize,
+                                PICK_MODE_CONTEXT *ctx);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_PICKMODE_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_quantize.c b/libs/libvpx/vp9/encoder/vp9_quantize.c
new file mode 100644
index 0000000000..91f877ed7e
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_quantize.c
@@ -0,0 +1,388 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rd.h"
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                       int skip_block,
+                       const int16_t *zbin_ptr, const int16_t *round_ptr,
+                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                       const int16_t *dequant_ptr,
+                       uint16_t *eob_ptr,
+                       const int16_t *scan, const int16_t *iscan) {
+  int i, eob = -1;
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+  // Outputs start zeroed; with skip_block set they stay zero and *eob_ptr = 0.
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Quantization pass over the coefficients in scan order. Table entry 0 is
+    // used for position rc == 0 and entry 1 for every other position.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      // Add the rounding term, clamp to the int16_t range, scale by quant >> 16.
+      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+      // Reapply the input sign, then reconstruct with the dequant table.
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+      // Remember the last scan index that produced a nonzero level.
+      if (tmp)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
+                              intptr_t count,
+                              int skip_block,
+                              const int16_t *zbin_ptr,
+                              const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr,
+                              tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr,
+                              uint16_t *eob_ptr,
+                              const int16_t *scan,
+                              const int16_t *iscan) {
+  int i;
+  int eob = -1;
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+  // Outputs start zeroed; with skip_block set they stay zero and *eob_ptr = 0.
+  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Quantization pass over the coefficients in scan order. Table entry 0 is
+    // used for position rc == 0 and entry 1 for every other position.
+    for (i = 0; i < count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp = abs_coeff + round_ptr[rc != 0];  // no clamp here
+      const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 16);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+      if (abs_qcoeff)
+        eob = i;  // last scan index with a nonzero level so far
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+// TODO(jingning) Refactor this file and combine functions with similar
+// operations.
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block,
+                             const int16_t *zbin_ptr, const int16_t *round_ptr,
+                             const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr,
+                             uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan) {
+  int i, eob = -1;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+  // Outputs start zeroed; with skip_block set they stay zero and *eob_ptr = 0.
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      int tmp = 0;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      // Magnitudes below dequant >> 2 are left at the zero written above.
+      if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+        abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+        tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;
+        qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+      }
+      // tmp is still 0 when the threshold test above failed.
+      if (tmp)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
+                                    intptr_t n_coeffs, int skip_block,
+                                    const int16_t *zbin_ptr,
+                                    const int16_t *round_ptr,
+                                    const int16_t *quant_ptr,
+                                    const int16_t *quant_shift_ptr,
+                                    tran_low_t *qcoeff_ptr,
+                                    tran_low_t *dqcoeff_ptr,
+                                    const int16_t *dequant_ptr,
+                                    uint16_t *eob_ptr,
+                                    const int16_t *scan, const int16_t *iscan) {
+  int i, eob = -1;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+  // Outputs start zeroed; with skip_block set they stay zero and *eob_ptr = 0.
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    for (i = 0; i < n_coeffs; i++) {
+      uint32_t abs_qcoeff = 0;
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      // Magnitudes below dequant >> 2 are left at the zero written above.
+      if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+        const int64_t tmp = abs_coeff
+                           + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        abs_qcoeff = (uint32_t) ((tmp * quant_ptr[rc != 0]) >> 15);
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+      }
+      // abs_qcoeff is still 0 when the threshold test above failed.
+      if (abs_qcoeff)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
+                                const int16_t *scan, const int16_t *iscan) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  // Quantizes 16 coefficients of `block`; the buffer flag picks the variant.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block),
+                          16, x->skip_block,
+                          p->zbin, p->round, p->quant, p->quant_shift,
+                          BLOCK_OFFSET(p->qcoeff, block),
+                          BLOCK_OFFSET(pd->dqcoeff, block),
+                          pd->dequant, &p->eobs[block],
+                          scan, iscan);
+    return;  // high-bitdepth path handled; skip the call below
+  }
+#endif
+  vpx_quantize_b(BLOCK_OFFSET(p->coeff, block),
+                 16, x->skip_block,
+                 p->zbin, p->round, p->quant, p->quant_shift,
+                 BLOCK_OFFSET(p->qcoeff, block),
+                 BLOCK_OFFSET(pd->dqcoeff, block),
+                 pd->dequant, &p->eobs[block], scan, iscan);
+}
+
+static void invert_quant(int16_t *quant, int16_t *shift, int d) {
+  // With l = floor(log2(d)): *quant = 1 + (1 << (16 + l)) / d - (1 << 16),
+  // truncated to int16_t, and *shift = 1 << (16 - l).
+  unsigned rem = d;
+  int l = 0;
+  while (rem > 1) { rem >>= 1; ++l; }
+  rem = 1 + (1 << (16 + l)) / d;
+  *quant = (int16_t)(rem - (1 << 16));
+  *shift = 1 << (16 - l);
+}
+
+static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {
+  const int quant = vp9_dc_quant(q, 0, bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      return q == 0 ? 64 : (quant < 148 ? 84 : 80);  // 64 only for q == 0
+    case VPX_BITS_10:
+      return q == 0 ? 64 : (quant < 592 ? 84 : 80);  // 592 == 4 * 148
+    case VPX_BITS_12:
+      return q == 0 ? 64 : (quant < 2368 ? 84 : 80);  // 2368 == 16 * 148
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;  // reached only if the assert is compiled out
+  }
+#else
+  (void) bit_depth;
+  return q == 0 ? 64 : (quant < 148 ? 84 : 80);  // same as the 8-bit case
+#endif
+}
+
+void vp9_init_quantizer(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  QUANTS *const quants = &cpi->quants;
+  int i, q, quant;
+  // Fill every y/uv table row, one row per qindex q in [0, QINDEX_RANGE).
+  for (q = 0; q < QINDEX_RANGE; q++) {
+    const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
+    const int qrounding_factor = q == 0 ? 64 : 48;
+
+    for (i = 0; i < 2; ++i) {
+      int qrounding_factor_fp = i == 0 ? 48 : 42;
+      if (q == 0)
+        qrounding_factor_fp = 64;
+
+      // y: entry 0 from vp9_dc_quant(), entry 1 from vp9_ac_quant().
+      quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth)
+                     : vp9_ac_quant(q, 0, cm->bit_depth);
+      invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
+      quants->y_quant_fp[q][i] = (1 << 16) / quant;
+      quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
+      quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+      quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
+      cpi->y_dequant[q][i] = quant;
+
+      // uv: same layout, using the uv_dc_delta_q / uv_ac_delta_q values.
+      quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth)
+                     : vp9_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth);
+      invert_quant(&quants->uv_quant[q][i],
+                   &quants->uv_quant_shift[q][i], quant);
+      quants->uv_quant_fp[q][i] = (1 << 16) / quant;
+      quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
+      quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+      quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
+      cpi->uv_dequant[q][i] = quant;
+    }
+    // Entries 2..7 of each row are copies of entry 1.
+    for (i = 2; i < 8; i++) {
+      quants->y_quant[q][i] = quants->y_quant[q][1];
+      quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+      quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
+      quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+      quants->y_zbin[q][i] = quants->y_zbin[q][1];
+      quants->y_round[q][i] = quants->y_round[q][1];
+      cpi->y_dequant[q][i] = cpi->y_dequant[q][1];
+
+      quants->uv_quant[q][i] = quants->uv_quant[q][1];
+      quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
+      quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1];
+      quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
+      quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
+      quants->uv_round[q][i] = quants->uv_round[q][1];
+      cpi->uv_dequant[q][i] = cpi->uv_dequant[q][1];
+    }
+  }
+}
+
+void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
+  const VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  QUANTS *const quants = &cpi->quants;
+  const int segment_id = xd->mi[0]->segment_id;
+  const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+  const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
+  int i;
+  // qindex is derived from the segment of xd->mi[0] and cm->base_qindex.
+  // Y: point plane 0 at the table rows for this qindex.
+  x->plane[0].quant = quants->y_quant[qindex];
+  x->plane[0].quant_fp = quants->y_quant_fp[qindex];
+  x->plane[0].round_fp = quants->y_round_fp[qindex];
+  x->plane[0].quant_shift = quants->y_quant_shift[qindex];
+  x->plane[0].zbin = quants->y_zbin[qindex];
+  x->plane[0].round = quants->y_round[qindex];
+  xd->plane[0].dequant = cpi->y_dequant[qindex];
+  // quant_thred[k] is the square of zbin[k] for k = 0 and 1.
+  x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
+  x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1];
+
+  // UV: planes 1 and 2 both use the uv table rows for this qindex.
+  for (i = 1; i < 3; i++) {
+    x->plane[i].quant = quants->uv_quant[qindex];
+    x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
+    x->plane[i].round_fp = quants->uv_round_fp[qindex];
+    x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
+    x->plane[i].zbin = quants->uv_zbin[qindex];
+    x->plane[i].round = quants->uv_round[qindex];
+    xd->plane[i].dequant = cpi->uv_dequant[qindex];
+
+    x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
+    x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
+  }
+
+  x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+  x->q_index = qindex;
+
+  set_error_per_bit(x, rdmult);
+
+  vp9_initialize_me_consts(cpi, x, x->q_index);
+}
+
+void vp9_frame_init_quantizer(VP9_COMP *cpi) {
+  vp9_init_plane_quantizers(cpi, &cpi->td.mb);  // uses cpi's own td.mb
+}
+
+void vp9_set_quantizer(VP9_COMMON *cm, int q) {
+  // Sets base_qindex to q and zeroes every delta_q. The quantizer has to be
+  // reinitialized with vp9_init_quantizer() if any delta_q changes.
+  cm->base_qindex = q;
+  cm->y_dc_delta_q = 0;
+  cm->uv_dc_delta_q = 0;
+  cm->uv_ac_delta_q = 0;
+}
+
+// Table that converts the 0-63 Q-range values passed in from outside to the
+// qindex range used internally (64 entries, running from 0 to 255).
+static const int quantizer_to_qindex[] = {
+  0,    4,   8,  12,  16,  20,  24,  28,
+  32,   36,  40,  44,  48,  52,  56,  60,
+  64,   68,  72,  76,  80,  84,  88,  92,
+  96,  100, 104, 108, 112, 116, 120, 124,
+  128, 132, 136, 140, 144, 148, 152, 156,
+  160, 164, 168, 172, 176, 180, 184, 188,
+  192, 196, 200, 204, 208, 212, 216, 220,
+  224, 228, 232, 236, 240, 244, 249, 255,
+};
+
+int vp9_quantizer_to_qindex(int quantizer) {
+  return quantizer_to_qindex[quantizer];  // no range check on quantizer
+}
+
+int vp9_qindex_to_quantizer(int qindex) {
+  int quantizer = 0;
+
+  // Linear scan: the first table entry that is >= qindex decides the result.
+  while (quantizer < 64 && quantizer_to_qindex[quantizer] < qindex)
+    ++quantizer;
+
+  return quantizer < 64 ? quantizer : 63;  // 63 when no entry is >= qindex
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_quantize.h b/libs/libvpx/vp9/encoder/vp9_quantize.h
new file mode 100644
index 0000000000..61320361b6
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_quantize.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_QUANTIZE_H_
+#define VP9_ENCODER_VP9_QUANTIZE_H_
+
+#include "./vpx_config.h"
+#include "vp9/encoder/vp9_block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
+
+  // TODO(jingning): In the process of re-working the quantization. Will decide
+  // if we want to deprecate the current use of y_quant.
+  DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]);
+  // uv counterparts of the y_quant/y_quant_shift/y_zbin/y_round tables.
+  DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
+} QUANTS;  // every table: QINDEX_RANGE rows of 8 int16_t entries
+
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
+                                const int16_t *scan, const int16_t *iscan);
+
+struct VP9_COMP;
+struct VP9Common;
+
+void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
+
+void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x);
+
+void vp9_init_quantizer(struct VP9_COMP *cpi);
+
+void vp9_set_quantizer(struct VP9Common *cm, int q);
+
+int vp9_quantizer_to_qindex(int quantizer);
+
+int vp9_qindex_to_quantizer(int qindex);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_QUANTIZE_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_ratectrl.c b/libs/libvpx/vp9/encoder/vp9_ratectrl.c
new file mode 100644
index 0000000000..5df2909cca
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -0,0 +1,2127 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+
+// Max rate target for 1080P and below encodes under normal circumstances
+// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
+
+#define DEFAULT_KF_BOOST 2000
+#define DEFAULT_GF_BOOST 2000
+
+#define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
+
+#define MIN_BPB_FACTOR 0.005
+#define MAX_BPB_FACTOR 50
+
+#define FRAME_OVERHEAD_BITS 200
+
+// ASSIGN_MINQ_TABLE(bit_depth, name) points the pointer variable `name` at
+// the file-scope table whose identifier is `name` with a bit-depth suffix
+// pasted on (name##_8, name##_10 or name##_12). The caller's local variable
+// must therefore have exactly the same base name as the table it selects.
+// In high-bit-depth builds an unrecognised bit_depth asserts and leaves
+// `name` set to NULL; in other builds bit_depth is ignored and the 8-bit
+// table is always chosen.
+#if CONFIG_VP9_HIGHBITDEPTH
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+  do { \
+    switch (bit_depth) { \
+      case VPX_BITS_8: \
+        name = name##_8; \
+        break; \
+      case VPX_BITS_10: \
+        name = name##_10; \
+        break; \
+      case VPX_BITS_12: \
+        name = name##_12; \
+        break; \
+      default: \
+        assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10" \
+                    " or VPX_BITS_12"); \
+        name = NULL; \
+    } \
+  } while (0)
+#else
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+  do { \
+    (void) bit_depth; \
+    name = name##_8; \
+  } while (0)
+#endif
+
+// Tables relating active max Q to active min Q
+// Each table is indexed by q index and holds the matching minimum q index.
+// They start zeroed and are filled in by vp9_rc_init_minq_luts(). The _8
+// tables always exist; the _10 and _12 variants are only built when
+// CONFIG_VP9_HIGHBITDEPTH is enabled. ASSIGN_MINQ_TABLE selects between them
+// by name suffix.
+static int kf_low_motion_minq_8[QINDEX_RANGE];
+static int kf_high_motion_minq_8[QINDEX_RANGE];
+static int arfgf_low_motion_minq_8[QINDEX_RANGE];
+static int arfgf_high_motion_minq_8[QINDEX_RANGE];
+static int inter_minq_8[QINDEX_RANGE];
+static int rtc_minq_8[QINDEX_RANGE];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static int kf_low_motion_minq_10[QINDEX_RANGE];
+static int kf_high_motion_minq_10[QINDEX_RANGE];
+static int arfgf_low_motion_minq_10[QINDEX_RANGE];
+static int arfgf_high_motion_minq_10[QINDEX_RANGE];
+static int inter_minq_10[QINDEX_RANGE];
+static int rtc_minq_10[QINDEX_RANGE];
+static int kf_low_motion_minq_12[QINDEX_RANGE];
+static int kf_high_motion_minq_12[QINDEX_RANGE];
+static int arfgf_low_motion_minq_12[QINDEX_RANGE];
+static int arfgf_high_motion_minq_12[QINDEX_RANGE];
+static int inter_minq_12[QINDEX_RANGE];
+static int rtc_minq_12[QINDEX_RANGE];
+#endif
+
+// Boost thresholds handed to get_active_quality(). The gf_* pair is compared
+// against rc->gfu_boost and the kf_* pair against rc->kf_boost. A boost above
+// the *_high value selects the low motion table, a boost below the *_low
+// value selects the high motion table, and anything in between interpolates.
+static int gf_high = 2000;
+static int gf_low = 400;
+static int kf_high = 5000;
+static int kf_low = 400;
+
+// Helpers that build the active minq lookup tables from a formula instead of
+// hand-maintained data, so that the Q tables are easy to retune. The formula
+// is a 3rd order polynomial in maxq (real Q values, not q indices) that was
+// fitted to the original table data.
+//
+// Returns the smallest q index whose real Q value is at least the target
+// minimum Q computed as min(x3 * maxq^3 + x2 * maxq^2 + x1 * maxq, maxq).
+// Falls back to the last index when no entry reaches the target.
+static int get_minq_index(double maxq, double x3, double x2, double x1,
+                          vpx_bit_depth_t bit_depth) {
+  int qindex = 0;
+  const double target = VPXMIN(((x3 * maxq + x2) * maxq + x1) * maxq,
+                               maxq);
+
+  // Targets at or below q 2.0 map straight to index 0: this covers the step
+  // from q 2.0 down to lossless mode, which is represented by q 1.0.
+  if (target <= 2.0)
+    return 0;
+
+  while (qindex < QINDEX_RANGE) {
+    if (target <= vp9_convert_qindex_to_q(qindex, bit_depth))
+      return qindex;
+    ++qindex;
+  }
+
+  return QINDEX_RANGE - 1;
+}
+
+// Fills the six caller-supplied minq tables for one bit depth. For every q
+// index the real Q value is computed and passed to get_minq_index() together
+// with the polynomial coefficients specific to each table. Each output array
+// must have room for QINDEX_RANGE entries.
+static void init_minq_luts(int *kf_low_m, int *kf_high_m,
+                           int *arfgf_low, int *arfgf_high,
+                           int *inter, int *rtc, vpx_bit_depth_t bit_depth) {
+  int i;
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    const double maxq = vp9_convert_qindex_to_q(i, bit_depth);
+    kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
+    // kf_high_m and arfgf_high are built from identical coefficients.
+    kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+    arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
+    arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+    // inter and rtc differ only in the linear coefficient (0.90 vs 0.70).
+    inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
+    rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
+  }
+}
+
+// Populates the file-scope minq lookup tables. The 8-bit tables are always
+// built; the 10-bit and 12-bit tables are built as well when
+// CONFIG_VP9_HIGHBITDEPTH is enabled.
+// NOTE(review): the tables are plain statics written without locking, so
+// this presumably has to run before any encoder uses them -- confirm at the
+// call site.
+void vp9_rc_init_minq_luts(void) {
+  init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
+                 arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
+                 inter_minq_8, rtc_minq_8, VPX_BITS_8);
+#if CONFIG_VP9_HIGHBITDEPTH
+  init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10,
+                 arfgf_low_motion_minq_10, arfgf_high_motion_minq_10,
+                 inter_minq_10, rtc_minq_10, VPX_BITS_10);
+  init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12,
+                 arfgf_low_motion_minq_12, arfgf_high_motion_minq_12,
+                 inter_minq_12, rtc_minq_12, VPX_BITS_12);
+#endif
+}
+
+// Converts a q index into a real Q value by dividing the AC quantizer for
+// that index by a bit-depth dependent scale (4 for 8-bit, 16 for 10-bit,
+// 64 for 12-bit), which keeps the result comparable to old Q values.
+// The conversion is kept formulaic to make experimenting with the quantizer
+// tables easy; it could be replaced by lookup tables once things settle down
+// in the experimental bitstream.
+double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  double scale;
+
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      scale = 4.0;
+      break;
+    case VPX_BITS_10:
+      scale = 16.0;
+      break;
+    case VPX_BITS_12:
+      scale = 64.0;
+      break;
+    default:
+      // Unknown bit depth: report it and hand back an invalid Q.
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1.0;
+  }
+  return vp9_ac_quant(qindex, 0, bit_depth) / scale;
+#else
+  // Without high bit depth support only the 8-bit scale applies.
+  return vp9_ac_quant(qindex, 0, bit_depth) / 4.0;
+#endif
+}
+
+// Estimates the number of bits spent per macroblock for a frame of the given
+// type coded at `qindex`, scaled by `correction_factor`. The correction
+// factor must lie in [MIN_BPB_FACTOR, MAX_BPB_FACTOR] (asserted).
+// NOTE(review): the result appears to be in units scaled up by
+// 1 << BPER_MB_NORMBITS, since vp9_estimate_bits_at_q() shifts it back down
+// by that amount -- confirm.
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor,
+                       vpx_bit_depth_t bit_depth) {
+  const double q = vp9_convert_qindex_to_q(qindex, bit_depth);
+  // Key frames start from a larger baseline than other frames.
+  int enumerator = frame_type == KEY_FRAME ? 2700000 : 1800000;
+
+  assert(correction_factor <= MAX_BPB_FACTOR &&
+         correction_factor >= MIN_BPB_FACTOR);
+
+  // q based adjustment to baseline enumerator
+  // Note: the (int) cast binds tighter than >>, so enumerator * q is
+  // converted to int first and the int is then shifted right by 12.
+  enumerator += (int)(enumerator * q) >> 12;
+  return (int)(enumerator * correction_factor / q);
+}
+
+// Estimates the size in bits of a frame made of `mbs` macroblocks when coded
+// at q index `q`. The per-macroblock estimate from vp9_rc_bits_per_mb() is
+// multiplied by the macroblock count and scaled down by BPER_MB_NORMBITS.
+// The result is never smaller than FRAME_OVERHEAD_BITS.
+int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
+                           double correction_factor,
+                           vpx_bit_depth_t bit_depth) {
+  const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor,
+                                           bit_depth));
+  // Multiply and shift in 64 bits and only then narrow to int. A cast binds
+  // tighter than >>, so casting the bare product would truncate it to int
+  // before the normalizing shift and corrupt large estimates.
+  return VPXMAX(FRAME_OVERHEAD_BITS,
+                (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS));
+}
+
+// Clamps a proposed inter frame target size (in bits) to the limits held in
+// the rate control state and encoder configuration:
+//  - at least max(min_frame_bandwidth, avg_frame_bandwidth / 32);
+//  - exactly that minimum for a golden frame refresh on an alt-ref source
+//    frame;
+//  - at most max_frame_bandwidth;
+//  - at most rc_max_inter_bitrate_pct percent of avg_frame_bandwidth, when
+//    that percentage is non-zero.
+// Returns the clamped target.
+int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+  const int min_frame_target = VPXMAX(rc->min_frame_bandwidth,
+                                      rc->avg_frame_bandwidth >> 5);
+  if (target < min_frame_target)
+    target = min_frame_target;
+  if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) {
+    // If there is an active ARF at this location use the minimum
+    // bits on this frame even if it is a constructed arf.
+    // The active maximum quantizer insures that an appropriate
+    // number of bits will be spent if needed for constructed ARFs.
+    target = min_frame_target;
+  }
+  // Clip the frame target to the maximum allowed value.
+  if (target > rc->max_frame_bandwidth)
+    target = rc->max_frame_bandwidth;
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    // Widen before multiplying: bandwidth * percentage can exceed the range
+    // of a 32-bit integer for high bitrates or large percentages.
+    const int64_t max_rate = (int64_t)rc->avg_frame_bandwidth *
+                             oxcf->rc_max_inter_bitrate_pct / 100;
+    // The minimum can never exceed target, so narrowing back to int is safe.
+    target = (int)VPXMIN(target, max_rate);
+  }
+  return target;
+}
+
+// Clamps a proposed intra frame target size (in bits): at most
+// rc_max_intra_bitrate_pct percent of avg_frame_bandwidth (when that
+// percentage is non-zero) and at most max_frame_bandwidth.
+// Returns the clamped target.
+int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+  if (oxcf->rc_max_intra_bitrate_pct) {
+    // Widen before multiplying: bandwidth * percentage can exceed the range
+    // of a 32-bit integer for high bitrates or large percentages.
+    const int64_t max_rate = (int64_t)rc->avg_frame_bandwidth *
+                             oxcf->rc_max_intra_bitrate_pct / 100;
+    // The minimum can never exceed target, so narrowing back to int is safe.
+    target = (int)VPXMIN(target, max_rate);
+  }
+  if (target > rc->max_frame_bandwidth)
+    target = rc->max_frame_bandwidth;
+  return target;
+}
+
+// After a frame of `encoded_frame_size` bits has been coded in the current
+// temporal layer, adjusts the buffer state of every higher temporal layer
+// that belongs to the same spatial layer.
+static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
+  int tl;
+
+  for (tl = svc->temporal_layer_id + 1; tl < svc->number_temporal_layers;
+       ++tl) {
+    const int layer_index = LAYER_IDS_TO_IDX(svc->spatial_layer_id, tl,
+                                             svc->number_temporal_layers);
+    LAYER_CONTEXT *const lc = &svc->layer_context[layer_index];
+    RATE_CONTROL *const lrc = &lc->rc;
+    // Bits the layer gains per frame at its own rate, less what was spent.
+    const int layer_delta =
+        (int)(lc->target_bandwidth / lc->framerate - encoded_frame_size);
+
+    lrc->bits_off_target += layer_delta;
+
+    // The layer buffer may not grow past its maximum size.
+    lrc->bits_off_target =
+        VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+    lrc->buffer_level = lrc->bits_off_target;
+  }
+}
+
+// Update the buffer level: leaky bucket model.
+// Shown frames add avg_frame_bandwidth and remove the encoded size; hidden
+// frames only remove the encoded size. The level is capped at
+// maximum_buffer_size and mirrored into rc->buffer_level. For one-pass CBR
+// SVC the higher temporal layers are updated as well.
+static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
+  const VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  // Non-viewable frames are a special case and are treated as pure overhead.
+  if (!cm->show_frame) {
+    rc->bits_off_target -= encoded_frame_size;
+  } else {
+    rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
+  }
+
+  // Clip the buffer level to the maximum specified buffer size.
+  rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
+
+  // For screen-content mode, and if frame-dropper is off, don't let buffer
+  // level go below threshold, given here as -rc->maximum_buffer_size.
+  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+      cpi->oxcf.drop_frames_water_mark == 0)
+    rc->bits_off_target = VPXMAX(rc->bits_off_target, -rc->maximum_buffer_size);
+
+  rc->buffer_level = rc->bits_off_target;
+
+  if (is_one_pass_cbr_svc(cpi)) {
+    update_layer_buffer_level(&cpi->svc, encoded_frame_size);
+  }
+}
+
+// Returns the default minimum golden frame interval for a stream of the
+// given frame size and frame rate. Up to a pixel rate of 4K at 20 fps the
+// result is framerate / 8 clamped to [MIN_GF_INTERVAL, MAX_GF_INTERVAL];
+// above that it grows in proportion to the pixel rate.
+int vp9_rc_get_default_min_gf_interval(
+    int width, int height, double framerate) {
+  // Assume we do not need any constraint lower than 4K 20 fps
+  static const double factor_safe = 3840 * 2160 * 20.0;
+  // Promote to double before multiplying so that width * height cannot
+  // overflow int for very large frame dimensions.
+  const double factor = (double)width * height * framerate;
+  const int default_interval =
+      clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
+
+  if (factor <= factor_safe)
+    return default_interval;
+  else
+    return VPXMAX(default_interval,
+                  (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
+  // Note this logic makes:
+  // 4K24: 5
+  // 4K30: 6
+  // 4K60: 12
+}
+
+// Returns the default maximum golden frame interval: three quarters of the
+// frame rate, capped at MAX_GF_INTERVAL, rounded up to an even value, and
+// never below `min_gf_interval`.
+int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
+  const int rate_based = (int)(framerate * 0.75);
+  int interval = (MAX_GF_INTERVAL < rate_based) ? MAX_GF_INTERVAL : rate_based;
+
+  // Bump odd values up to the next even one.
+  if (interval & 0x01)
+    ++interval;
+
+  return (interval > min_gf_interval) ? interval : min_gf_interval;
+}
+
+// Resets the rate control state in `rc` to its start-of-encode values using
+// the quantizer limits, bit depth, frame size, frame rate and golden frame
+// interval settings found in `oxcf`. `pass` only matters together with the
+// rate control mode: pass 0 in VPX_CBR mode seeds the average q indices at
+// the worst allowed q instead of the middle of the allowed range.
+// rc->starting_buffer_level and rc->avg_frame_bandwidth are read, so they
+// must already be set by the caller.
+void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
+  int level;
+
+  // Seed the running average q index for both frame types.
+  if (pass == 0 && oxcf->rc_mode == VPX_CBR)
+    rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
+  else
+    rc->avg_frame_qindex[KEY_FRAME] =
+        (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
+  rc->avg_frame_qindex[INTER_FRAME] = rc->avg_frame_qindex[KEY_FRAME];
+
+  rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
+  rc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
+
+  // The buffer starts out at its configured initial level.
+  rc->bits_off_target = rc->starting_buffer_level;
+  rc->buffer_level = rc->starting_buffer_level;
+
+  // All rolling bit counters start at the per-frame average.
+  rc->rolling_target_bits = rc->avg_frame_bandwidth;
+  rc->rolling_actual_bits = rc->avg_frame_bandwidth;
+  rc->long_rolling_target_bits = rc->avg_frame_bandwidth;
+  rc->long_rolling_actual_bits = rc->avg_frame_bandwidth;
+
+  rc->total_target_bits = 0;
+  rc->total_actual_bits = 0;
+  rc->total_target_vs_actual = 0;
+
+  // Key frame and alt-ref bookkeeping.
+  rc->frames_since_key = 8;  // Sensible default for first frame.
+  rc->this_key_frame_forced = 0;
+  rc->next_key_frame_forced = 0;
+  rc->source_alt_ref_pending = 0;
+  rc->source_alt_ref_active = 0;
+  rc->frames_till_gf_update_due = 0;
+
+  // Q statistics.
+  rc->ni_av_qi = oxcf->worst_allowed_q;
+  rc->ni_tot_qi = 0;
+  rc->ni_frames = 0;
+  rc->tot_q = 0.0;
+  rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth);
+
+  for (level = 0; level < RATE_FACTOR_LEVELS; ++level)
+    rc->rate_correction_factors[level] = 1.0;
+
+  // A configured interval of 0 asks for the computed default. The default
+  // maximum depends on the final minimum, so resolve the minimum first.
+  rc->min_gf_interval = oxcf->min_gf_interval;
+  if (rc->min_gf_interval == 0) {
+    rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
+        oxcf->width, oxcf->height, oxcf->init_framerate);
+  }
+  rc->max_gf_interval = oxcf->max_gf_interval;
+  if (rc->max_gf_interval == 0) {
+    rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
+        oxcf->init_framerate, rc->min_gf_interval);
+  }
+  rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+}
+
+// Decides whether the current frame should be dropped. Returns 1 to drop and
+// 0 to encode. Updates rc->decimation_factor and rc->decimation_count as a
+// side effect.
+int vp9_rc_drop_frame(VP9_COMP *cpi) {
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int drop_mark;
+
+  // Frame dropping is disabled when no water mark is configured.
+  if (!oxcf->drop_frames_water_mark)
+    return 0;
+
+  // In one-pass CBR SVC only the first encoded spatial layer may drop.
+  if (is_one_pass_cbr_svc(cpi) &&
+      cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode)
+    return 0;
+
+  // Always drop if buffer is below 0.
+  if (rc->buffer_level < 0)
+    return 1;
+
+  // If buffer is below drop_mark, for now just drop every other frame
+  // (starting with the next frame) until it increases back over drop_mark.
+  drop_mark = (int)(oxcf->drop_frames_water_mark *
+      rc->optimal_buffer_level / 100);
+  if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) {
+    --rc->decimation_factor;
+  } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) {
+    rc->decimation_factor = 1;
+  }
+
+  // No decimation in effect: reset the counter and keep the frame.
+  if (!(rc->decimation_factor > 0)) {
+    rc->decimation_count = 0;
+    return 0;
+  }
+
+  // Decimation in effect: drop while the counter runs down, then reload it
+  // and let one frame through.
+  if (rc->decimation_count > 0) {
+    --rc->decimation_count;
+    return 1;
+  }
+  rc->decimation_count = rc->decimation_factor;
+  return 0;
+}
+
+// Returns the rate correction factor that applies to the current frame,
+// scaled by the frame-size dependent multiplier and clamped to
+// [MIN_BPB_FACTOR, MAX_BPB_FACTOR]. The factor level is chosen as follows:
+// KF_STD for key frames; the gf group's rf_level in pass 2; otherwise
+// GF_ARF_STD for golden/alt-ref refreshes that are not an alt-ref source
+// frame, not SVC, and either non-CBR or with gf_cbr_boost_pct above 20;
+// INTER_NORMAL for everything else.
+static double get_rate_correction_factor(const VP9_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  RATE_FACTOR_LEVEL level;
+  double rcf;
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    level = KF_STD;
+  } else if (cpi->oxcf.pass == 2) {
+    level = cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+  } else if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+             !rc->is_src_frame_alt_ref && !cpi->use_svc &&
+             (cpi->oxcf.rc_mode != VPX_CBR ||
+              cpi->oxcf.gf_cbr_boost_pct > 20)) {
+    level = GF_ARF_STD;
+  } else {
+    level = INTER_NORMAL;
+  }
+
+  rcf = rc->rate_correction_factors[level];
+  rcf *= rcf_mult[rc->frame_size_selector];
+  return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+}
+
+// Stores `factor` as the rate correction factor for the current frame's
+// factor level. The level is chosen exactly as in
+// get_rate_correction_factor(). Before storing, the factor is divided by the
+// frame-size dependent multiplier and clamped to
+// [MIN_BPB_FACTOR, MAX_BPB_FACTOR].
+static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  double *slot;
+
+  // Normalize RCF to account for the size-dependent scaling factor.
+  factor /= rcf_mult[cpi->rc.frame_size_selector];
+  factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    slot = &rc->rate_correction_factors[KF_STD];
+  } else if (cpi->oxcf.pass == 2) {
+    const RATE_FACTOR_LEVEL rf_lvl =
+        cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+    slot = &rc->rate_correction_factors[rf_lvl];
+  } else if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+             !rc->is_src_frame_alt_ref && !cpi->use_svc &&
+             (cpi->oxcf.rc_mode != VPX_CBR ||
+              cpi->oxcf.gf_cbr_boost_pct > 20)) {
+    slot = &rc->rate_correction_factors[GF_ARF_STD];
+  } else {
+    slot = &rc->rate_correction_factors[INTER_NORMAL];
+  }
+
+  *slot = factor;
+}
+
+// Updates the rate correction factor for the current frame's factor level
+// from the ratio between the actual frame size (rc.projected_frame_size) and
+// the size predicted at the frame's base q index. Also records the last two
+// q indices (q_1_frame, q_2_frame) and over/undershoot flags (rc_1_frame,
+// rc_2_frame) that vp9_rc_regulate_q() reads. Does nothing for arf overlay
+// frames.
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  // Actual size as a percentage of the predicted size; 100 means on target.
+  int correction_factor = 100;
+  double rate_correction_factor = get_rate_correction_factor(cpi);
+  double adjustment_limit;
+
+  int projected_size_based_on_q = 0;
+
+  // Do not update the rate factors for arf overlay frames.
+  if (cpi->rc.is_src_frame_alt_ref)
+    return;
+
+  // Clear down mmx registers to allow floating point in what follows
+  vpx_clear_system_state();
+
+  // Work out how big we would have expected the frame to be at this Q given
+  // the current correction factor.
+  // Stay in double to avoid int overflow when values are large
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
+    projected_size_based_on_q =
+        vp9_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+  } else {
+    projected_size_based_on_q = vp9_estimate_bits_at_q(cpi->common.frame_type,
+                                                       cm->base_qindex,
+                                                       cm->MBs,
+                                                       rate_correction_factor,
+                                                       cm->bit_depth);
+  }
+  // Work out a size correction factor.
+  // Predictions at or below FRAME_OVERHEAD_BITS leave the factor at 100.
+  if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
+    correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) /
+                        projected_size_based_on_q);
+
+  // More heavily damped adjustment used if we have been oscillating either side
+  // of target.
+  // The limit ranges from 0.25 (on target) to 0.75 (off by 10x or more).
+  adjustment_limit = 0.25 +
+      0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor)));
+
+  // Shift the one-frame history into the two-frame slots and record the
+  // direction of this frame's miss: -1 for more than 10% overshoot, 1 for
+  // more than 10% undershoot, 0 otherwise.
+  cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+  cpi->rc.q_1_frame = cm->base_qindex;
+  cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+  if (correction_factor > 110)
+    cpi->rc.rc_1_frame = -1;
+  else if (correction_factor < 90)
+    cpi->rc.rc_1_frame = 1;
+  else
+    cpi->rc.rc_1_frame = 0;
+
+  // Turn off oscillation detection in the case of massive overshoot.
+  if (cpi->rc.rc_1_frame == -1 && cpi->rc.rc_2_frame == 1 &&
+      correction_factor > 1000) {
+    cpi->rc.rc_2_frame = 0;
+  }
+
+  // Factors between 99 and 102 inclusive are treated as on target and leave
+  // the rate correction factor unchanged.
+  if (correction_factor > 102) {
+    // We are not already at the worst allowable quality
+    correction_factor = (int)(100 + ((correction_factor - 100) *
+                                  adjustment_limit));
+    rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+    // Keep rate_correction_factor within limits
+    if (rate_correction_factor > MAX_BPB_FACTOR)
+      rate_correction_factor = MAX_BPB_FACTOR;
+  } else if (correction_factor < 99) {
+    // We are not already at the best allowable quality
+    correction_factor = (int)(100 - ((100 - correction_factor) *
+                                  adjustment_limit));
+    rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+
+    // Keep rate_correction_factor within limits
+    if (rate_correction_factor < MIN_BPB_FACTOR)
+      rate_correction_factor = MIN_BPB_FACTOR;
+  }
+
+  set_rate_correction_factor(cpi, rate_correction_factor);
+}
+
+
+// Picks the q index in [active_best_quality, active_worst_quality] whose
+// estimated bits per macroblock best matches `target_bits_per_frame`.
+// The search walks q upward from active_best_quality and stops at the first
+// q whose estimate is at or below the target; it then keeps that q or the
+// previous one, whichever has the smaller error. If no q in the range gets
+// down to the target, active_worst_quality is returned. In CBR mode the
+// result may additionally be clamped between the q indices of the last two
+// frames (see below).
+int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
+                      int active_best_quality, int active_worst_quality) {
+  const VP9_COMMON *const cm = &cpi->common;
+  int q = active_worst_quality;
+  // Overshoot of the previous (lower) q; INT_MAX until one has been tried.
+  int last_error = INT_MAX;
+  int i, target_bits_per_mb, bits_per_mb_at_this_q;
+  const double correction_factor = get_rate_correction_factor(cpi);
+
+  // Calculate required scaling factor based on target frame size and size of
+  // frame produced using previous Q.
+  target_bits_per_mb =
+      ((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs;
+
+  i = active_best_quality;
+
+  do {
+    // Cyclic refresh AQ has its own estimator, used only when segmentation
+    // is enabled and the frame is in temporal layer 0.
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cm->seg.enabled &&
+        cpi->svc.temporal_layer_id == 0) {
+      bits_per_mb_at_this_q =
+          (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
+    } else {
+      bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cm->frame_type, i,
+                                                      correction_factor,
+                                                      cm->bit_depth);
+    }
+
+    if (bits_per_mb_at_this_q <= target_bits_per_mb) {
+      // First q at or under the target: keep it unless the previous q's
+      // overshoot was strictly smaller than this q's undershoot.
+      if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+        q = i;
+      else
+        q = i - 1;
+
+      break;
+    } else {
+      last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+    }
+  } while (++i <= active_worst_quality);
+
+  // In CBR mode, this makes sure q is between oscillating Qs to prevent
+  // resonance.
+  // rc_1_frame * rc_2_frame == -1 means the last two frames missed the target
+  // in opposite directions.
+  if (cpi->oxcf.rc_mode == VPX_CBR &&
+      (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
+      cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
+    q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
+              VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
+  }
+  return q;
+}
+
+// Looks up the active minimum q for q index `q` from a pair of tables.
+// A boost above `high` uses the low motion table, a boost below `low` uses
+// the high motion table, and a boost in [low, high] interpolates linearly
+// (rounded to nearest) between the two table entries.
+static int get_active_quality(int q, int gfu_boost, int low, int high,
+                              int *low_motion_minq, int *high_motion_minq) {
+  int span, below_high, table_diff;
+
+  if (gfu_boost > high)
+    return low_motion_minq[q];
+  if (gfu_boost < low)
+    return high_motion_minq[q];
+
+  span = high - low;
+  below_high = high - gfu_boost;
+  table_diff = high_motion_minq[q] - low_motion_minq[q];
+
+  // Adding half the span before dividing rounds the result to nearest.
+  return low_motion_minq[q] +
+         ((below_high * table_diff) + (span >> 1)) / span;
+}
+
+// Returns the active minimum q for a key frame at q index `q`, selecting or
+// interpolating between the key frame low and high motion tables according
+// to rc->kf_boost and the kf_low / kf_high thresholds.
+static int get_kf_active_quality(const RATE_CONTROL *const rc, int q,
+                                 vpx_bit_depth_t bit_depth) {
+  // These locals must keep these exact names: ASSIGN_MINQ_TABLE pastes a
+  // bit-depth suffix onto the name to find the matching file-scope table.
+  int *kf_low_motion_minq;
+  int *kf_high_motion_minq;
+  ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
+  ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
+  return get_active_quality(q, rc->kf_boost, kf_low, kf_high,
+                            kf_low_motion_minq, kf_high_motion_minq);
+}
+
+// Returns the active minimum q for a golden / alt-ref frame at q index `q`,
+// selecting or interpolating between the arf/gf low and high motion tables
+// according to rc->gfu_boost and the gf_low / gf_high thresholds.
+static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
+                                 vpx_bit_depth_t bit_depth) {
+  // These locals must keep these exact names: ASSIGN_MINQ_TABLE pastes a
+  // bit-depth suffix onto the name to find the matching file-scope table.
+  int *arfgf_low_motion_minq;
+  int *arfgf_high_motion_minq;
+  ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
+  ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+  return get_active_quality(q, rc->gfu_boost, gf_low, gf_high,
+                            arfgf_low_motion_minq, arfgf_high_motion_minq);
+}
+
+// Computes the active worst quality for one-pass VBR from the q used on
+// earlier frames, never exceeding rc->worst_quality.
+//  - Key frame: worst_quality for frame 0, otherwise twice the last key
+//    frame q.
+//  - Golden / alt-ref refresh (not an alt-ref source frame): 5/4 of the last
+//    key frame q on frame 1, otherwise the last inter frame q.
+//  - Any other frame: twice the last key frame q on frame 1, otherwise twice
+//    the last inter frame q.
+static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const unsigned int curr_frame = cpi->common.current_video_frame;
+  int active_worst_quality;
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    if (curr_frame == 0)
+      active_worst_quality = rc->worst_quality;
+    else
+      active_worst_quality = rc->last_q[KEY_FRAME] * 2;
+  } else if (!rc->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    if (curr_frame == 1)
+      active_worst_quality = rc->last_q[KEY_FRAME] * 5 / 4;
+    else
+      active_worst_quality = rc->last_q[INTER_FRAME];
+  } else {
+    if (curr_frame == 1)
+      active_worst_quality = rc->last_q[KEY_FRAME] * 2;
+    else
+      active_worst_quality = rc->last_q[INTER_FRAME] * 2;
+  }
+
+  return VPXMIN(active_worst_quality, rc->worst_quality);
+}
+
+// Adjust active_worst_quality level based on buffer level.
+// Returns the active worst quality for the current frame in one-pass CBR
+// mode. Key frames always get rc->worst_quality.
+static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
+  // Adjust active_worst_quality: If buffer is above the optimal/target level,
+  // bring active_worst_quality down depending on fullness of buffer.
+  // If buffer is below the optimal level, let the active_worst_quality go from
+  // ambient Q (at buffer = optimal level) to worst_quality level
+  // (at buffer = critical level).
+  const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *rc = &cpi->rc;
+  // Buffer level below which we push active_worst to worst_quality.
+  // This is one eighth of the optimal buffer level.
+  int64_t critical_level = rc->optimal_buffer_level >> 3;
+  int64_t buff_lvl_step = 0;
+  int adjustment = 0;
+  int active_worst_quality;
+  int ambient_qp;
+  // Number of frames for which the key frame average q is still taken into
+  // account when picking ambient_qp.
+  unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
+  if (cm->frame_type == KEY_FRAME)
+    return rc->worst_quality;
+  // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME]
+  // for the first few frames following key frame. These are both initialized
+  // to worst_quality and updated with (3/4, 1/4) average in postencode_update.
+  // So for first few frames following key, the qp of that key frame is weighted
+  // into the active_worst_quality setting.
+  ambient_qp = (cm->current_video_frame < num_frames_weight_key) ?
+                   VPXMIN(rc->avg_frame_qindex[INTER_FRAME],
+                          rc->avg_frame_qindex[KEY_FRAME]) :
+                   rc->avg_frame_qindex[INTER_FRAME];
+  // Start from 5/4 of the ambient q, capped at worst_quality.
+  active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 / 4);
+  if (rc->buffer_level > rc->optimal_buffer_level) {
+    // Adjust down.
+    // Maximum limit for down adjustment, ~30%.
+    int max_adjustment_down = active_worst_quality / 3;
+    if (max_adjustment_down) {
+      // Buffer level increase that corresponds to one step of q reduction.
+      buff_lvl_step = ((rc->maximum_buffer_size -
+                        rc->optimal_buffer_level) / max_adjustment_down);
+      if (buff_lvl_step)
+        adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) /
+                            buff_lvl_step);
+      active_worst_quality -= adjustment;
+    }
+  } else if (rc->buffer_level > critical_level) {
+    // Adjust up from ambient Q.
+    // Interpolates linearly from ambient_qp at the optimal level towards
+    // worst_quality at the critical level.
+    if (critical_level) {
+      buff_lvl_step = (rc->optimal_buffer_level - critical_level);
+      if (buff_lvl_step) {
+        adjustment = (int)((rc->worst_quality - ambient_qp) *
+                           (rc->optimal_buffer_level - rc->buffer_level) /
+                           buff_lvl_step);
+      }
+      active_worst_quality = ambient_qp + adjustment;
+    }
+  } else {
+    // Set to worst_quality if buffer is below critical level.
+    active_worst_quality = rc->worst_quality;
+  }
+  return active_worst_quality;
+}
+
+// Chooses the q index for the current frame in one-pass CBR mode and reports
+// the allowed q range: *bottom_index receives the active best quality and
+// *top_index the active worst quality (tightened for non-forced key frames
+// after the first frame when LIMIT_QRANGE_FOR_ALTREF_AND_KEY is set).
+// Returns the chosen q, which lies within
+// [rc->best_quality, rc->worst_quality] (asserted).
+static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
+                                             int *bottom_index,
+                                             int *top_index) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int active_best_quality;
+  int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+  int q;
+  // Must be named rtc_minq: ASSIGN_MINQ_TABLE pastes a bit-depth suffix onto
+  // the name to find the matching file-scope table.
+  int *rtc_minq;
+  ASSIGN_MINQ_TABLE(cm->bit_depth, rtc_minq);
+
+  if (frame_is_intra_only(cm)) {
+    active_best_quality = rc->best_quality;
+    // Handle the special case for key frames forced when we have reached
+    // the maximum key frame interval. Here force the Q to a range
+    // based on the ambient Q to reduce the risk of popping.
+    if (rc->this_key_frame_forced) {
+      int qindex = rc->last_boosted_qindex;
+      double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                            (last_boosted_q * 0.75),
+                                            cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    } else if (cm->current_video_frame > 0) {
+      // not first frame of one pass and kf_boost is set
+      double q_adj_factor = 1.0;
+      double q_val;
+
+      active_best_quality =
+          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME],
+                                cm->bit_depth);
+
+      // Allow somewhat lower kf minq with small image formats.
+      if ((cm->width * cm->height) <= (352 * 288)) {
+        q_adj_factor -= 0.25;
+      }
+
+      // Convert the adjustment factor to a qindex delta
+      // on active_best_quality.
+      q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+      active_best_quality += vp9_compute_qdelta(rc, q_val,
+                                                q_val * q_adj_factor,
+                                                cm->bit_depth);
+    }
+  } else if (!rc->is_src_frame_alt_ref &&
+             !cpi->use_svc &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    // Use the lower of active_worst_quality and recent
+    // average Q as basis for GF/ARF best Q limit unless last frame was
+    // a key frame.
+    if (rc->frames_since_key > 1 &&
+        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = rc->avg_frame_qindex[INTER_FRAME];
+    } else {
+      q = active_worst_quality;
+    }
+    active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+  } else {
+    // Use the lower of active_worst_quality and recent/average Q.
+    // The first two frames use the key frame average instead of the inter
+    // frame average.
+    if (cm->current_video_frame > 1) {
+      if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+        active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]];
+      else
+        active_best_quality = rtc_minq[active_worst_quality];
+    } else {
+      if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality)
+        active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]];
+      else
+        active_best_quality = rtc_minq[active_worst_quality];
+    }
+  }
+
+  // Clip the active best and worst quality values to limits
+  active_best_quality = clamp(active_best_quality,
+                              rc->best_quality, rc->worst_quality);
+  active_worst_quality = clamp(active_worst_quality,
+                               active_best_quality, rc->worst_quality);
+
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+  // Limit Q range for the adaptive loop.
+  if (cm->frame_type == KEY_FRAME &&
+      !rc->this_key_frame_forced  &&
+      !(cm->current_video_frame == 0)) {
+    int qdelta = 0;
+    vpx_clear_system_state();
+    qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                        active_worst_quality, 2.0,
+                                        cm->bit_depth);
+    *top_index = active_worst_quality + qdelta;
+    // Never let the top of the range fall below the bottom.
+    *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
+  }
+#endif
+
+  // Special case code to try and match quality with forced key frames
+  if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) {
+    q = rc->last_boosted_qindex;
+  } else {
+    q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                          active_best_quality, active_worst_quality);
+    if (q > *top_index) {
+      // Special case when we are targeting the max allowed rate
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
+        *top_index = q;
+      else
+        q = *top_index;
+    }
+  }
+  assert(*top_index <= rc->worst_quality &&
+         *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+  return q;
+}
+
+// Returns the constrained quality level to use right now. This is normally
+// oxcf->cq_level. In VPX_CQ mode, once a target has been accumulated, the
+// level is scaled down in proportion when fewer than 10% of the target bits
+// have actually been spent.
+static int get_active_cq_level(const RATE_CONTROL *rc,
+                               const VP9EncoderConfig *const oxcf) {
+  static const double cq_adjust_threshold = 0.1;
+  const int configured_level = oxcf->cq_level;
+  double spent_ratio;
+
+  // Only CQ mode with a positive accumulated target adapts the level.
+  if (oxcf->rc_mode != VPX_CQ || !(rc->total_target_bits > 0))
+    return configured_level;
+
+  spent_ratio = (double)rc->total_actual_bits / rc->total_target_bits;
+  if (spent_ratio < cq_adjust_threshold)
+    return (int)(configured_level * spent_ratio / cq_adjust_threshold);
+
+  return configured_level;
+}
+
+// Picks q for a one pass frame in the non-CBR modes (VPX_Q and VPX_CQ are
+// handled here too); the permitted range goes to *bottom_index / *top_index.
+static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
+                                             int *bottom_index,
+                                             int *top_index) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const int cq_level = get_active_cq_level(rc, oxcf);
+  int active_best_quality;
+  int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
+  int q;
+  int *inter_minq;
+  ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+
+  if (frame_is_intra_only(cm)) {
+    if (oxcf->rc_mode == VPX_Q) {
+      int qindex = cq_level;
+      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex = vp9_compute_qdelta(rc, q, q * 0.25,
+                                            cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    } else if (rc->this_key_frame_forced) {
+      // Handle the special case for key frames forced when we have reached
+      // the maximum key frame interval. Here force the Q to a range
+      // based on the ambient Q to reduce the risk of popping.
+      int qindex = rc->last_boosted_qindex;
+      double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                            last_boosted_q * 0.75,
+                                            cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    } else {
+      // Not the first frame of one pass and kf_boost is set.
+      double q_adj_factor = 1.0;
+      double q_val;
+
+      active_best_quality =
+          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME],
+                                cm->bit_depth);
+
+      // Allow somewhat lower kf minq with small image formats.
+      if ((cm->width * cm->height) <= (352 * 288)) {
+        q_adj_factor -= 0.25;
+      }
+
+      // Convert the adjustment factor to a qindex delta
+      // on active_best_quality.
+      q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+      active_best_quality += vp9_compute_qdelta(rc, q_val,
+                                                q_val * q_adj_factor,
+                                                cm->bit_depth);
+    }
+  } else if (!rc->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    // Use the lower of active_worst_quality and recent
+    // average Q as basis for GF/ARF best Q limit unless last frame was
+    // a key frame.
+    if (rc->frames_since_key > 1 &&
+        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = rc->avg_frame_qindex[INTER_FRAME];
+    } else {
+      q = rc->avg_frame_qindex[KEY_FRAME];
+    }
+    // For constrained quality don't allow Q less than the cq level.
+    if (oxcf->rc_mode == VPX_CQ) {
+      if (q < cq_level)
+        q = cq_level;
+
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+      // Constrained quality use slightly lower active best.
+      active_best_quality = active_best_quality * 15 / 16;
+    } else if (oxcf->rc_mode == VPX_Q) {
+      int qindex = cq_level;
+      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex;
+      if (cpi->refresh_alt_ref_frame)
+        delta_qindex = vp9_compute_qdelta(rc, q, q * 0.40, cm->bit_depth);
+      else
+        delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    } else {
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+    }
+  } else {
+    if (oxcf->rc_mode == VPX_Q) {
+      int qindex = cq_level;
+      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      double delta_rate[FIXED_GF_INTERVAL] =
+          {0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0};
+      int delta_qindex =
+          vp9_compute_qdelta(rc, q,
+                             q * delta_rate[cm->current_video_frame %
+                             FIXED_GF_INTERVAL], cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    } else {
+      // Use the lower of active_worst_quality and recent/average Q.
+      if (cm->current_video_frame > 1)
+        active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]];
+      else
+        active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
+      // For the constrained quality mode we don't want
+      // q to fall below the cq level.
+      if ((oxcf->rc_mode == VPX_CQ) &&
+          (active_best_quality < cq_level)) {
+        active_best_quality = cq_level;
+      }
+    }
+  }
+
+  // Clip the active best and worst quality values to limits.
+  active_best_quality = clamp(active_best_quality,
+                              rc->best_quality, rc->worst_quality);
+  active_worst_quality = clamp(active_worst_quality,
+                               active_best_quality, rc->worst_quality);
+
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+  {
+    int qdelta = 0;
+    vpx_clear_system_state();
+
+    // Limit Q range for the adaptive loop.
+    if (cm->frame_type == KEY_FRAME &&
+        !rc->this_key_frame_forced &&
+        !(cm->current_video_frame == 0)) {
+      qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                          active_worst_quality, 2.0,
+                                          cm->bit_depth);
+    } else if (!rc->is_src_frame_alt_ref &&
+               (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+      qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                          active_worst_quality, 1.75,
+                                          cm->bit_depth);
+    }
+    *top_index = active_worst_quality + qdelta;
+    *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
+  }
+#endif  // LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+
+  if (oxcf->rc_mode == VPX_Q) {
+    q = active_best_quality;
+  // Special case code to try and match quality with forced key frames.
+  } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) {
+    q = rc->last_boosted_qindex;
+  } else {
+    q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                          active_best_quality, active_worst_quality);
+    if (q > *top_index) {
+      // Special case when we are targeting the max allowed rate.
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
+        *top_index = q;
+      else
+        q = *top_index;
+    }
+  }
+
+  assert(*top_index <= rc->worst_quality &&
+         *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+  return q;
+}
+
+// Returns the qindex delta from vp9_compute_qdelta_by_rate() for base q,
+// using the rate factor and frame type tabulated for rf_level.
+int vp9_frame_type_qdelta(const VP9_COMP *cpi, int rf_level, int q) {
+  static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+    1.00,  // INTER_NORMAL
+    1.00,  // INTER_HIGH
+    1.50,  // GF_ARF_LOW
+    1.75,  // GF_ARF_STD
+    2.00,  // KF_STD
+  };
+  static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] =
+      {INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME};
+  return vp9_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
+                                    rate_factor_deltas[rf_level],
+                                    cpi->common.bit_depth);
+}
+
+#define STATIC_MOTION_THRESH 95
+// Picks q for a two pass frame and returns the permitted range in
+// *bottom_index (best quality, lowest q) and *top_index (worst quality).
+// The returned q is always clamped into that range.
+static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
+                                         int *bottom_index,
+                                         int *top_index) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  const int cq_level = get_active_cq_level(rc, oxcf);
+  int active_best_quality;
+  int active_worst_quality = cpi->twopass.active_worst_quality;
+  int q;
+  int *inter_minq;
+  ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+
+  if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) {
+    // Handle the special case for key frames forced when we have reached
+    // the maximum key frame interval. Here force the Q to a range
+    // based on the ambient Q to reduce the risk of popping.
+    if (rc->this_key_frame_forced) {
+      double last_boosted_q;
+      int delta_qindex;
+      int qindex;
+
+      if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+        qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+        active_best_quality = qindex;
+        last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+        delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                          last_boosted_q * 1.25, cm->bit_depth);
+        active_worst_quality =
+            VPXMIN(qindex + delta_qindex, active_worst_quality);
+      } else {
+        qindex = rc->last_boosted_qindex;
+        last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+        delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                          last_boosted_q * 0.75, cm->bit_depth);
+        active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+      }
+    } else {
+      // Not forced keyframe.
+      double q_adj_factor = 1.0;
+      double q_val;
+      // Baseline value derived from cpi->active_worst_quality and kf boost.
+      active_best_quality = get_kf_active_quality(rc, active_worst_quality,
+                                                  cm->bit_depth);
+
+      // Allow somewhat lower kf minq with small image formats.
+      if ((cm->width * cm->height) <= (352 * 288)) {
+        q_adj_factor -= 0.25;
+      }
+
+      // Make a further adjustment based on the kf zero motion measure.
+      q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+      // Convert the adjustment factor to a qindex delta
+      // on active_best_quality.
+      q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+      active_best_quality += vp9_compute_qdelta(rc, q_val,
+                                                q_val * q_adj_factor,
+                                                cm->bit_depth);
+    }
+  } else if (!rc->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    // Use the lower of active_worst_quality and recent
+    // average Q as basis for GF/ARF best Q limit unless last frame was
+    // a key frame.
+    if (rc->frames_since_key > 1 &&
+        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = rc->avg_frame_qindex[INTER_FRAME];
+    } else {
+      q = active_worst_quality;
+    }
+    // For constrained quality don't allow Q less than the cq level.
+    if (oxcf->rc_mode == VPX_CQ) {
+      if (q < cq_level)
+        q = cq_level;
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+
+      // Constrained quality use slightly lower active best.
+      active_best_quality = active_best_quality * 15 / 16;
+    } else if (oxcf->rc_mode == VPX_Q) {
+      if (!cpi->refresh_alt_ref_frame) {
+        active_best_quality = cq_level;
+      } else {
+        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+
+        // Modify best quality for second level arfs. For mode VPX_Q this
+        // becomes the baseline frame q.
+        if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+          active_best_quality = (active_best_quality + cq_level + 1) / 2;
+      }
+    } else {
+      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+    }
+  } else {
+    if (oxcf->rc_mode == VPX_Q) {
+      active_best_quality = cq_level;
+    } else {
+      active_best_quality = inter_minq[active_worst_quality];
+
+      // For the constrained quality mode we don't want
+      // q to fall below the cq level.
+      if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) {
+        active_best_quality = cq_level;
+      }
+    }
+  }
+
+  // Extension to max or min Q if undershoot or overshoot is outside
+  // the permitted range.
+  if ((cpi->oxcf.rc_mode != VPX_Q) &&
+      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+    if (frame_is_intra_only(cm) ||
+        (!rc->is_src_frame_alt_ref &&
+         (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
+      active_best_quality -=
+        (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
+      active_worst_quality += (cpi->twopass.extend_maxq / 2);
+    } else {
+      active_best_quality -=
+        (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
+      active_worst_quality += cpi->twopass.extend_maxq;
+    }
+  }
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+  vpx_clear_system_state();
+  // Static forced key frames Q restrictions dealt with elsewhere.
+  if (!((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi))) ||
+      !rc->this_key_frame_forced ||
+      (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+    int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
+                                       active_worst_quality);
+    active_worst_quality = VPXMAX(active_worst_quality + qdelta,
+                                  active_best_quality);
+  }
+#endif
+
+  // Modify active_best_quality for downscaled normal frames.
+  if (rc->frame_size_selector != UNSCALED && !frame_is_kf_gf_arf(cpi)) {
+    int qdelta = vp9_compute_qdelta_by_rate(rc, cm->frame_type,
+                                            active_best_quality, 2.0,
+                                            cm->bit_depth);
+    active_best_quality =
+        VPXMAX(active_best_quality + qdelta, rc->best_quality);
+  }
+
+  active_best_quality = clamp(active_best_quality,
+                              rc->best_quality, rc->worst_quality);
+  active_worst_quality = clamp(active_worst_quality,
+                               active_best_quality, rc->worst_quality);
+
+  if (oxcf->rc_mode == VPX_Q) {
+    q = active_best_quality;
+  // Special case code to try and match quality with forced key frames.
+  } else if ((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) &&
+             rc->this_key_frame_forced) {
+    // If static since last kf use better of last boosted and last kf q.
+    if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+      q = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+    } else {
+      q = rc->last_boosted_qindex;
+    }
+  } else {
+    q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                          active_best_quality, active_worst_quality);
+    if (q > active_worst_quality) {
+      // Special case when we are targeting the max allowed rate.
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
+        active_worst_quality = q;
+      else
+        q = active_worst_quality;
+    }
+  }
+  // clamp() returns its result, so it must be assigned: the forced key frame
+  // paths above can otherwise leave q outside the range reported below.
+  q = clamp(q, active_best_quality, active_worst_quality);
+
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+
+  assert(*top_index <= rc->worst_quality &&
+         *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+  return q;
+}
+
+// Picks the frame q and its [*bottom_index, *top_index] range for the current
+// pass / rate control mode, widening the range in non-RD pick mode to hold q.
+int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi,
+                             int *bottom_index, int *top_index) {
+  int q;
+  if (cpi->oxcf.pass != 0)
+    q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index);
+  else if (cpi->oxcf.rc_mode == VPX_CBR)
+    q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index);
+  else
+    q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index);
+
+  if (!cpi->sf.use_nonrd_pick_mode)
+    return q;
+  if (cpi->sf.force_frame_boost == 1)
+    q -= cpi->sf.max_delta_qindex;
+  if (q < *bottom_index)
+    *bottom_index = q;
+  else if (q > *top_index)
+    *top_index = q;
+  return q;
+}
+
+// Sets the under/overshoot size limits around frame_target; VPX_Q mode is
+// left unbounded (0 .. INT_MAX).
+void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi,
+                                      int frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit) {
+  const int unbounded = (cpi->oxcf.rc_mode == VPX_Q);
+  // The fixed 200 keeps a minimum range when the percentage term is tiny.
+  const int tolerance =
+      unbounded ? 0 : (cpi->sf.recode_tolerance * frame_target) / 100;
+  *frame_under_shoot_limit =
+      unbounded ? 0 : VPXMAX(frame_target - tolerance - 200, 0);
+  *frame_over_shoot_limit =
+      unbounded ? INT_MAX : VPXMIN(frame_target + tolerance + 200,
+                                   cpi->rc.max_frame_bandwidth);
+}
+
+// Stores the frame's target size in rc->this_frame_target (rescaled when
+// dynamically down-scaling) and derives the matching per-SB64 target rate.
+void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int frame_area = cpi->common.width * cpi->common.height;
+  const int is_scaled = cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
+                        rc->frame_size_selector != UNSCALED;
+
+  // Modify frame size target when down-scaling.
+  if (is_scaled)
+    target = (int)(target * rate_thresh_mult[rc->frame_size_selector]);
+  rc->this_frame_target = target;
+
+  // Target rate per SB64 (including partial SB64s).
+  rc->sb64_target_rate = ((int64_t)target * 64 * 64) / frame_area;
+}
+
+// Rate control bookkeeping after a frame that refreshed the alt ref buffer.
+static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
+  RATE_CONTROL *const rate_ctrl = &cpi->rc;
+
+  // The alt ref is now active, and none is pending any more.
+  rate_ctrl->source_alt_ref_active = 1;
+  rate_ctrl->source_alt_ref_pending = 0;
+
+  // This frame refreshes, so restart the frames-since-golden count.
+  rate_ctrl->frames_since_golden = 0;
+}
+
+// Updates the golden frame counters and, on a golden refresh, the alt ref
+// active flag. Frames that refresh only the alt ref leave everything as is.
+static void update_golden_frame_stats(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  if (!cpi->refresh_golden_frame && cpi->refresh_alt_ref_frame)
+    return;
+
+  // Decrement count down till next gf (common to both remaining cases).
+  if (rc->frames_till_gf_update_due > 0)
+    --rc->frames_till_gf_update_due;
+
+  if (!cpi->refresh_golden_frame) {
+    ++rc->frames_since_golden;
+    return;
+  }
+
+  // This frame refreshes the golden frame, so restart the count.
+  rc->frames_since_golden = 0;
+
+  // If we are not using alt ref in the up and coming group clear the arf
+  // active flag. In the two pass multi arf case a non-zero group index means
+  // we are overlaying a mid group arf, so the flag must not be reset.
+  if (!rc->source_alt_ref_pending) {
+    const int mid_group_overlay =
+        cpi->oxcf.pass == 2 && cpi->twopass.gf_group.index != 0;
+    if (!mid_group_overlay)
+      rc->source_alt_ref_active = 0;
+  }
+}
+
+// Updates the rate control state after a frame has been encoded, given the
+// number of bytes the frame used.
+void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int qindex = cm->base_qindex;
+
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+    vp9_cyclic_refresh_postencode(cpi);
+  }
+
+  // Update rate control heuristics (bytes_used is converted to bits here).
+  rc->projected_frame_size = (int)(bytes_used << 3);
+  // Post encode loop adjustment of Q prediction.
+  vp9_rc_update_rate_correction_factors(cpi);
+
+  // Keep a record of last Q and ambient average Q.
+  if (cm->frame_type == KEY_FRAME) {
+    rc->last_q[KEY_FRAME] = qindex;
+    rc->avg_frame_qindex[KEY_FRAME] =
+        ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+    if (cpi->use_svc) {
+      // Copy the key frame Q record into every temporal layer's context
+      // for the current spatial layer.
+      int i = 0;
+      SVC *svc = &cpi->svc;
+      for (i = 0; i < svc->number_temporal_layers; ++i) {
+        const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
+                                           svc->number_temporal_layers);
+        LAYER_CONTEXT *lc = &svc->layer_context[layer];
+        RATE_CONTROL *lrc = &lc->rc;
+        lrc->last_q[KEY_FRAME] = rc->last_q[KEY_FRAME];
+        lrc->avg_frame_qindex[KEY_FRAME] = rc->avg_frame_qindex[KEY_FRAME];
+      }
+    }
+  } else {
+    if ((cpi->use_svc && oxcf->rc_mode == VPX_CBR) ||
+        (!rc->is_src_frame_alt_ref &&
+         !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
+      rc->last_q[INTER_FRAME] = qindex;
+      rc->avg_frame_qindex[INTER_FRAME] =
+        ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+      rc->ni_frames++;
+      rc->tot_q += vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      rc->avg_q = rc->tot_q / rc->ni_frames;
+      // Calculate the average Q for normal inter frames (not key or GFU
+      // frames).
+      rc->ni_tot_qi += qindex;
+      rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+    }
+  }
+
+  // Keep record of last boosted (KF/GF/ARF) Q value.
+  // If the current frame is coded at a lower Q then we also update it.
+  // If all mbs in this group are skipped only update if the Q value is
+  // better than that already stored.
+  // This is used to help set quality in forced key frames to reduce popping
+  if ((qindex < rc->last_boosted_qindex) ||
+      (cm->frame_type == KEY_FRAME) ||
+      (!rc->constrained_gf_group &&
+       (cpi->refresh_alt_ref_frame ||
+        (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+    rc->last_boosted_qindex = qindex;
+  }
+  if (cm->frame_type == KEY_FRAME)
+    rc->last_kf_qindex = qindex;
+
+  update_buffer_level(cpi, rc->projected_frame_size);
+
+  // Rolling monitors of whether we are over or underspending used to help
+  // regulate min and Max Q in two pass.
+  if (cm->frame_type != KEY_FRAME) {
+    rc->rolling_target_bits = ROUND_POWER_OF_TWO(
+        rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+    rc->rolling_actual_bits = ROUND_POWER_OF_TWO(
+        rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+    rc->long_rolling_target_bits = ROUND_POWER_OF_TWO(
+        rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
+    rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO(
+        rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
+  }
+
+  // Running totals of actual and target bits (target: shown frames only).
+  rc->total_actual_bits += rc->projected_frame_size;
+  rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
+  rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
+
+  if (!cpi->use_svc || is_two_pass_svc(cpi)) {
+    if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
+        (cm->frame_type != KEY_FRAME))
+      // Update the alternate reference frame stats as appropriate.
+      update_alt_ref_frame_stats(cpi);
+    else
+      // Update the Golden frame stats as appropriate.
+      update_golden_frame_stats(cpi);
+  }
+
+  // A key frame restarts the count; only shown frames advance the counters.
+  if (cm->frame_type == KEY_FRAME)
+    rc->frames_since_key = 0;
+  if (cm->show_frame) {
+    rc->frames_since_key++;
+    rc->frames_to_key--;
+  }
+
+  // Trigger the resizing of the next frame if it is scaled.
+  if (oxcf->pass != 0) {
+    cpi->resize_pending =
+        rc->next_frame_size_selector != rc->frame_size_selector;
+    rc->frame_size_selector = rc->next_frame_size_selector;
+  }
+}
+
+// Dropped frame: zero-size buffer update, then advance/clear frame counters.
+void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  update_buffer_level(cpi, 0);
+  rc->rc_1_frame = rc->rc_2_frame = 0;
+  rc->frames_to_key -= 1;
+  rc->frames_since_key += 1;
+}
+
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS   1
+
+// One pass VBR target for a non-key frame: GF/ARF refreshes get af_ratio
+// times the share of a normal frame. Computed in 64 bits and saturated, as
+// bandwidth * gf interval * af_ratio can exceed INT_MAX.
+static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
+  static const int af_ratio = 10;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int64_t target = rc->avg_frame_bandwidth;
+#if USE_ALTREF_FOR_ONE_PASS
+  target *= rc->baseline_gf_interval;
+  if (!rc->is_src_frame_alt_ref &&
+      (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))
+    target *= af_ratio;
+  target /= (int64_t)rc->baseline_gf_interval + af_ratio - 1;
+#endif
+  return vp9_rc_clamp_pframe_target_size(cpi, (int)VPXMIN(target, INT_MAX));
+}
+
+// Key frame target: kf_ratio times the average frame, saturated at INT_MAX.
+static int calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
+  static const int kf_ratio = 25;
+  const int64_t target = (int64_t)cpi->rc.avg_frame_bandwidth * kf_ratio;
+  return vp9_rc_clamp_iframe_target_size(cpi, (int)VPXMIN(target, INT_MAX));
+}
+
+// Per-frame setup for one pass VBR: decides the frame type, starts a new
+// golden frame group when one is due, and sets the frame's target size.
+void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+  const int want_key_frame = cm->current_video_frame == 0 ||
+                             (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+                             rc->frames_to_key == 0 ||
+                             (cpi->oxcf.auto_key && 0);
+  int target;
+
+  if (want_key_frame && !cpi->refresh_alt_ref_frame) {
+    // Forced means the key frame interval ran out on a non-first frame.
+    rc->this_key_frame_forced = cm->current_video_frame != 0 &&
+                                rc->frames_to_key == 0;
+    cm->frame_type = KEY_FRAME;
+    rc->frames_to_key = cpi->oxcf.key_freq;
+    rc->kf_boost = DEFAULT_KF_BOOST;
+    rc->source_alt_ref_active = 0;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+
+  if (rc->frames_till_gf_update_due == 0) {
+    const int gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+    // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+    rc->constrained_gf_group = gf_interval > rc->frames_to_key;
+    rc->frames_till_gf_update_due = VPXMIN(gf_interval, rc->frames_to_key);
+    rc->baseline_gf_interval = gf_interval;
+    rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
+    rc->gfu_boost = DEFAULT_GF_BOOST;
+    cpi->refresh_golden_frame = 1;
+  }
+  target = (cm->frame_type == KEY_FRAME)
+               ? calc_iframe_target_size_one_pass_vbr(cpi)
+               : calc_pframe_target_size_one_pass_vbr(cpi);
+  vp9_rc_set_frame_target(cpi, target);
+}
+
+// One pass CBR non-key frame target: the average (or boosted, or SVC layer)
+// frame size, steered by the buffer level offset and clamped to rate limits.
+static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+  const RATE_CONTROL *rc = &cpi->rc;
+  const SVC *const svc = &cpi->svc;
+  const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
+  const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
+  int min_frame_target =
+      VPXMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+  int target = rc->avg_frame_bandwidth;
+
+  if (oxcf->gf_cbr_boost_pct) {
+    // Golden frames get af_ratio_pct percent of a normal frame's share.
+    const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+    const int share_pct = cpi->refresh_golden_frame ? af_ratio_pct : 100;
+    target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * share_pct) /
+             (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+  }
+  if (is_one_pass_cbr_svc(cpi)) {
+    // Note that for layers, avg_frame_bandwidth is the cumulative
+    // per-frame-bandwidth. For the target size of this frame, use the
+    // layer average frame size (i.e., non-cumulative per-frame-bw).
+    const int layer_idx =
+        LAYER_IDS_TO_IDX(svc->spatial_layer_id,
+            svc->temporal_layer_id, svc->number_temporal_layers);
+    const LAYER_CONTEXT *const layer_ctx = &svc->layer_context[layer_idx];
+    target = layer_ctx->avg_frame_size;
+    min_frame_target =
+        VPXMAX(layer_ctx->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
+  }
+  if (diff > 0) {
+    // Buffer below its optimal level: lower the target for this frame.
+    const int pct_low = (int)VPXMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+    target -= (target * pct_low) / 200;
+  } else if (diff < 0) {
+    // Buffer above its optimal level: increase the target for this frame.
+    const int pct_high =
+        (int)VPXMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+    target += (target * pct_high) / 200;
+  }
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    const int max_rate = rc->avg_frame_bandwidth *
+                         oxcf->rc_max_inter_bitrate_pct / 100;
+    target = VPXMIN(target, max_rate);
+  }
+  return VPXMAX(min_frame_target, target);
+}
+
+// One pass CBR key frame target. The first frame gets half the starting
+// buffer level (capped at INT_MAX); later key frames get a boost over the
+// average frame bandwidth derived from the frame rate.
+static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const SVC *const svc = &cpi->svc;
+  double framerate = cpi->framerate;
+  int kf_boost;
+
+  if (cpi->common.current_video_frame == 0) {
+    const int half_buffer = (int)VPXMIN(rc->starting_buffer_level / 2, INT_MAX);
+    return vp9_rc_clamp_iframe_target_size(cpi, half_buffer);
+  }
+
+  // Use the layer framerate for temporal layers CBR mode.
+  if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
+    const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id,
+        svc->temporal_layer_id, svc->number_temporal_layers);
+    framerate = svc->layer_context[layer].framerate;
+  }
+  // The boost is at least 32 and otherwise grows with the frame rate ...
+  kf_boost = VPXMAX(32, (int)(2 * framerate - 16));
+  // ... but is scaled down within framerate / 2 frames of the last key frame.
+  if (rc->frames_since_key < framerate / 2)
+    kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
+  return vp9_rc_clamp_iframe_target_size(
+      cpi, ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4);
+}
+
+// Key frame reset for temporal layering: selects temporal layer 0 and zeroes
+// the counters of layer_context[sl * number_temporal_layers] for each sl.
+static void reset_temporal_layer_to_zero(VP9_COMP *cpi) {
+  SVC *const svc = &cpi->svc;
+  const int stride = svc->number_temporal_layers;
+  int sl;
+  svc->temporal_layer_id = 0;
+  for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    LAYER_CONTEXT *const base = &svc->layer_context[sl * stride];
+    base->current_video_frame_in_layer = 0;
+    base->frames_from_key_frame = 0;
+  }
+}
+
+// Per-frame rate control setup for SVC encoding: decides the frame type,
+// updates the layer key frame and reference flags, and sets the target size.
+void vp9_rc_get_svc_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target = rc->avg_frame_bandwidth;
+  int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
+      cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers);
+
+  if ((cm->current_video_frame == 0) ||
+      (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+      (cpi->oxcf.auto_key && (rc->frames_since_key %
+          cpi->oxcf.key_freq == 0))) {
+    cm->frame_type = KEY_FRAME;
+    rc->source_alt_ref_active = 0;
+    if (is_two_pass_svc(cpi)) {
+      cpi->svc.layer_context[layer].is_key_frame = 1;
+      cpi->ref_frame_flags &=
+          (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
+    } else if (is_one_pass_cbr_svc(cpi)) {
+      // The temporal layer id is reset first, so the layer index is recomputed.
+      reset_temporal_layer_to_zero(cpi);
+      layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
+           cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers);
+      cpi->svc.layer_context[layer].is_key_frame = 1;
+      cpi->ref_frame_flags &=
+                (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
+      // Assumption here is that LAST_FRAME is being updated for a keyframe.
+      // Thus no change in update flags.
+      target = calc_iframe_target_size_one_pass_cbr(cpi);
+    }
+  } else {
+    cm->frame_type = INTER_FRAME;
+    if (is_two_pass_svc(cpi)) {
+      LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+      if (cpi->svc.spatial_layer_id == 0) {
+        lc->is_key_frame = 0;
+      } else {
+        lc->is_key_frame =
+            cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame;
+        if (lc->is_key_frame)
+          cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
+      }
+      cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
+    } else if (is_one_pass_cbr_svc(cpi)) {
+      LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+      if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode) {
+        lc->is_key_frame = 0;
+      } else {
+        lc->is_key_frame =
+            cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame;
+      }
+      target = calc_pframe_target_size_one_pass_cbr(cpi);
+    }
+  }
+  // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+  // should be done here, before the frame qp is selected.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    vp9_cyclic_refresh_update_parameters(cpi);
+
+  vp9_rc_set_frame_target(cpi, target);
+  rc->frames_till_gf_update_due = INT_MAX;
+  rc->baseline_gf_interval = INT_MAX;
+}
+
+void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target;  // Frame size target; set once the frame type is known.
+  // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+  if ((cm->current_video_frame == 0 ||
+      (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+      rc->frames_to_key == 0 ||
+      (cpi->oxcf.auto_key && 0))) {  // Last term is always false; see TODO.
+    cm->frame_type = KEY_FRAME;
+    rc->this_key_frame_forced = cm->current_video_frame != 0 &&
+                                rc->frames_to_key == 0;
+    rc->frames_to_key = cpi->oxcf.key_freq;
+    rc->kf_boost = DEFAULT_KF_BOOST;
+    rc->source_alt_ref_active = 0;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+  if (rc->frames_till_gf_update_due == 0) {  // Schedule a golden refresh.
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_set_golden_update(cpi);  // Presumably sets interval.
+    else
+      rc->baseline_gf_interval =
+          (rc->min_gf_interval + rc->max_gf_interval) / 2;
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+    if (rc->frames_till_gf_update_due > rc->frames_to_key)
+      rc->frames_till_gf_update_due = rc->frames_to_key;
+    cpi->refresh_golden_frame = 1;
+    rc->gfu_boost = DEFAULT_GF_BOOST;
+  }
+
+  // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+  // should be done here, before the frame qp is selected.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    vp9_cyclic_refresh_update_parameters(cpi);
+
+  if (cm->frame_type == KEY_FRAME)
+    target = calc_iframe_target_size_one_pass_cbr(cpi);
+  else
+    target = calc_pframe_target_size_one_pass_cbr(cpi);
+
+  vp9_rc_set_frame_target(cpi, target);
+  if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC)
+    cpi->resize_pending = vp9_resize_one_pass_cbr(cpi);  // Non-zero on resize.
+  else
+    cpi->resize_pending = 0;
+}
+
+int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                       vpx_bit_depth_t bit_depth) {
+  const int q_lo = rc->best_quality;
+  const int q_hi = rc->worst_quality;
+  int from_idx = q_hi, to_idx = q_hi;
+  int qi;
+  // Walk up from best_quality to the first index whose q is >= qstart.
+  for (qi = q_lo; qi < q_hi; ++qi) {
+    from_idx = qi;
+    if (qstart <= vp9_convert_qindex_to_q(qi, bit_depth))
+      break;
+  }
+
+  // Repeat the same walk for qtarget.
+  for (qi = q_lo; qi < q_hi; ++qi) {
+    to_idx = qi;
+    if (qtarget <= vp9_convert_qindex_to_q(qi, bit_depth))
+      break;
+  }
+
+  return to_idx - from_idx;
+}
+
+int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+                               int qindex, double rate_target_ratio,
+                               vpx_bit_depth_t bit_depth) {
+  // Projected bits per MB at the starting index, with a unity correction.
+  const int bits_at_base = vp9_rc_bits_per_mb(frame_type, qindex, 1.0,
+                                              bit_depth);
+
+  // Scale by the requested ratio to get the per-MB budget we search for.
+  const int wanted_bits = (int)(rate_target_ratio * bits_at_base);
+
+  int q;
+
+  // Return at the first index in [best_quality, worst_quality) whose projected
+  // bits per MB fit within the budget.
+  for (q = rc->best_quality; q < rc->worst_quality; ++q) {
+    const int bits_at_q = vp9_rc_bits_per_mb(frame_type, q, 1.0, bit_depth);
+    if (bits_at_q <= wanted_bits)
+      return q - qindex;
+  }
+
+  // Nothing fit: fall back to the worst quality index.
+  return rc->worst_quality - qindex;
+}
+
+void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
+                                  RATE_CONTROL *const rc) {
+  const VP9EncoderConfig *const cfg = &cpi->oxcf;
+
+  // One-pass fixed-Q test mode pins all three intervals to the same constant.
+  if (cfg->pass == 0 && cfg->rc_mode == VPX_Q) {
+    rc->max_gf_interval = FIXED_GF_INTERVAL;
+    rc->min_gf_interval = FIXED_GF_INTERVAL;
+    rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+    return;
+  }
+
+  // Start from the configured limits; a zero means "derive a default".
+  rc->max_gf_interval = cfg->max_gf_interval;
+  rc->min_gf_interval = cfg->min_gf_interval;
+  if (!rc->min_gf_interval)
+    rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
+        cfg->width, cfg->height, cpi->framerate);
+  if (!rc->max_gf_interval)
+    rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
+        cpi->framerate, rc->min_gf_interval);
+
+  // Static scenes may stretch to twice the lag buffer count, but with alt-ref
+  // enabled never beyond lag_in_frames - 1.
+  rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
+  if (is_altref_enabled(cpi))
+    rc->static_scene_max_gf_interval = VPXMIN(
+        rc->static_scene_max_gf_interval, cfg->lag_in_frames - 1);
+
+  // Enforce min <= max <= static-scene max.
+  rc->max_gf_interval =
+      VPXMIN(rc->max_gf_interval, rc->static_scene_max_gf_interval);
+  if (rc->min_gf_interval > rc->max_gf_interval)
+    rc->min_gf_interval = rc->max_gf_interval;
+}
+
+void vp9_rc_update_framerate(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int vbr_max_bits;
+  // Re-derive the average/min/max per-frame bit budgets for cpi->framerate.
+  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+  rc->min_frame_bandwidth = (int)(((int64_t)rc->avg_frame_bandwidth *
+                                oxcf->two_pass_vbrmin_section) / 100);
+  // Product widened to 64 bits (as for vbr_max_bits) so it cannot overflow int.
+  rc->min_frame_bandwidth =
+      VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
+  // A maximum bitrate for a frame is defined.
+  // The baseline for this aligns with HW implementations that
+  // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+  // per 16x16 MB (averaged over a frame). However this limit is extended if
+  // a very high rate is given on the command line or the rate cannot
+  // be achieved because of a user specified max q (e.g. when the user
+  // specifies lossless encode).
+  vbr_max_bits = (int)(((int64_t)rc->avg_frame_bandwidth *
+                     oxcf->two_pass_vbrmax_section) / 100);
+  rc->max_frame_bandwidth =
+      VPXMAX(VPXMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
+  vp9_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50  // Max % of the target moved per frame.
+// VBR: nudge *this_frame_target by the bit error carried from earlier frames.
+static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
+  int max_delta;  // Largest correction allowed for this frame, in bits.
+  double position_factor = 1.0;
+
+  // How far through the clip are we.
+  // This number is used to damp the per frame rate correction.
+  // Range 0 - 1.0
+  if (cpi->twopass.total_stats.count) {
+    position_factor = sqrt((double)cpi->common.current_video_frame /
+                           cpi->twopass.total_stats.count);
+  }
+  max_delta = (int)(position_factor *
+                    ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+  // vbr_bits_off_target > 0 means we have extra bits to spend
+  if (vbr_bits_off_target > 0) {
+    *this_frame_target +=
+      (vbr_bits_off_target > max_delta) ? max_delta
+                                        : (int)vbr_bits_off_target;
+  } else {
+    *this_frame_target -=
+      (vbr_bits_off_target < -max_delta) ? max_delta
+                                         : (int)-vbr_bits_off_target;
+  }
+
+  // Fast redistribution of bits arising from massive local undershoot.
+  // Don't do it for kf, arf, gf or overlay frames.
+  if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
+      rc->vbr_bits_off_target_fast) {
+    int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target);
+    int fast_extra_bits;
+    fast_extra_bits = (int)VPXMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits = (int)VPXMIN(
+        fast_extra_bits,
+        VPXMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
+    *this_frame_target += (int)fast_extra_bits;
+    rc->vbr_bits_off_target_fast -= fast_extra_bits;  // Spent; drop from pool.
+  }
+}
+
+void vp9_set_target_rate(VP9_COMP *cpi) {
+  const int is_key = cpi->common.frame_type == KEY_FRAME;
+  const int uses_vbr_correction =
+      cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ;
+  // Clamp the baseline target with the limits for this frame type.
+  int frame_bits =
+      is_key ? vp9_rc_clamp_iframe_target_size(cpi, cpi->rc.base_frame_target)
+             : vp9_rc_clamp_pframe_target_size(cpi, cpi->rc.base_frame_target);
+
+  // VBR and CQ additionally compensate for earlier over/under shoot.
+  if (uses_vbr_correction)
+    vbr_rate_correction(cpi, &frame_bits);
+  vp9_rc_set_frame_target(cpi, frame_bits);
+}
+
+// Check if we should resize, based on average QP from past x frames.
+// Only allow for resize at most one scale down for now, scaling factor is 2.
+int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  RESIZE_ACTION resize_action = NO_RESIZE;
+  int avg_qp_thr1 = 70;  // Percent of worst_quality; gate for resizing up.
+  int avg_qp_thr2 = 50;  // Percent of worst_quality; gate for 1/2 -> orig.
+  int min_width = 180;
+  int min_height = 180;
+  int down_size_on = 1;  // Cleared when a 1/2 frame would be below minimum.
+  cpi->resize_scale_num = 1;
+  cpi->resize_scale_den = 1;
+  // Don't resize on key frame; reset the counters on key frame.
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->resize_avg_qp = 0;
+    cpi->resize_count = 0;
+    return 0;
+  }
+  // Check current frame resolution to avoid generating frames smaller than
+  // the minimum resolution.
+  if (ONEHALFONLY_RESIZE) {
+    if ((cm->width >> 1) < min_width || (cm->height >> 1) < min_height)
+      down_size_on = 0;
+  } else {
+    if (cpi->resize_state == ORIG &&
+        (cm->width * 3 / 4 < min_width ||
+         cm->height * 3 / 4 < min_height))
+      return 0;
+    else if (cpi->resize_state == THREE_QUARTER &&
+             ((cpi->oxcf.width >> 1) < min_width ||
+              (cpi->oxcf.height >> 1) < min_height))
+      down_size_on = 0;
+  }
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  // If denoiser is on, apply a smaller qp threshold.
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    avg_qp_thr1 = 60;
+    avg_qp_thr2 = 40;
+  }
+#endif
+
+  // Resize based on average buffer underflow and QP over some window.
+  // Ignore samples close to key frame, since QP is usually high after key.
+  if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
+    const int window = (int)(4 * cpi->framerate);
+    cpi->resize_avg_qp += cm->base_qindex;  // Running sum until averaged.
+    if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
+      ++cpi->resize_buffer_underflow;
+    ++cpi->resize_count;
+    // Check for resize action every "window" frames.
+    if (cpi->resize_count >= window) {
+      int avg_qp = cpi->resize_avg_qp / cpi->resize_count;
+      // Resize down if buffer level has underflowed sufficient amount in past
+      // window, and we are at original or 3/4 of original resolution.
+      // Resize back up if average QP is low, and we are currently in a resized
+      // down state, i.e. 1/2 or 3/4 of original resolution.
+      // Currently, use a flag to turn 3/4 resizing feature on/off.
+      if (cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) {
+        if (cpi->resize_state == THREE_QUARTER && down_size_on) {
+          resize_action = DOWN_ONEHALF;
+          cpi->resize_state = ONE_HALF;
+        } else if (cpi->resize_state == ORIG) {
+          resize_action = ONEHALFONLY_RESIZE ? DOWN_ONEHALF : DOWN_THREEFOUR;
+          cpi->resize_state = ONEHALFONLY_RESIZE ? ONE_HALF : THREE_QUARTER;
+        }
+      } else if (cpi->resize_state != ORIG &&
+                 avg_qp < avg_qp_thr1 * cpi->rc.worst_quality / 100) {
+        if (cpi->resize_state == THREE_QUARTER ||
+            avg_qp < avg_qp_thr2 * cpi->rc.worst_quality / 100 ||
+            ONEHALFONLY_RESIZE) {
+          resize_action = UP_ORIG;
+          cpi->resize_state = ORIG;
+        } else if (cpi->resize_state == ONE_HALF) {
+          resize_action = UP_THREEFOUR;
+          cpi->resize_state = THREE_QUARTER;
+        }
+      }
+      // Reset for next window measurement.
+      cpi->resize_avg_qp = 0;
+      cpi->resize_count = 0;
+      cpi->resize_buffer_underflow = 0;
+    }
+  }
+  // If decision is to resize, reset some quantities, and check if we should
+  // reduce rate correction factor.
+  if (resize_action != NO_RESIZE) {
+    int target_bits_per_frame;
+    int active_worst_quality;
+    int qindex;
+    int tot_scale_change;  // (den/num)^2; NOTE(review): 16/9 -> 1 if integer.
+    if (resize_action == DOWN_THREEFOUR || resize_action == UP_THREEFOUR) {
+      cpi->resize_scale_num = 3;
+      cpi->resize_scale_den = 4;
+    } else if (resize_action == DOWN_ONEHALF) {
+      cpi->resize_scale_num = 1;
+      cpi->resize_scale_den = 2;
+    } else {  // UP_ORIG or anything else
+      cpi->resize_scale_num = 1;
+      cpi->resize_scale_den = 1;
+    }
+    tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) /
+        (cpi->resize_scale_num * cpi->resize_scale_num);
+    // Reset buffer level to optimal, update target size.
+    rc->buffer_level = rc->optimal_buffer_level;
+    rc->bits_off_target = rc->optimal_buffer_level;
+    rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi);
+    // Get the projected qindex, based on the scaled target frame size (scaled
+    // so target_bits_per_mb in vp9_rc_regulate_q will be correct target).
+    target_bits_per_frame = (resize_action >= 0) ?  // Down-sizes are positive.
+        rc->this_frame_target * tot_scale_change :
+        rc->this_frame_target / tot_scale_change;
+    active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+    qindex = vp9_rc_regulate_q(cpi,
+                               target_bits_per_frame,
+                               rc->best_quality,
+                               active_worst_quality);
+    // If resize is down, check if projected q index is close to worst_quality,
+    // and if so, reduce the rate correction factor (since likely can afford
+    // lower q for resized frame).
+    if (resize_action > 0 &&
+        qindex > 90 * cpi->rc.worst_quality / 100) {
+      rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+    }
+    // If resize is back up, check if projected q index is too much above the
+    // current base_qindex, and if so, reduce the rate correction factor
+    // (since prefer to keep q for resized frame at least close to previous q).
+    if (resize_action < 0 &&
+       qindex > 130 * cm->base_qindex / 100) {
+      rc->rate_correction_factors[INTER_NORMAL] *= 0.9;
+    }
+  }
+  return resize_action;  // NO_RESIZE (0) when nothing changed.
+}
+
+// Compute average source sad (temporal sad: between current source and
+// previous source) over a subset of superblocks. Use this to detect big changes
+// in content and allow rate control to react.
+// TODO(marpan): Superblock sad is computed again in variance partition for
+// non-rd mode (but based on last reconstructed frame). Should try to reuse
+// these computations.
+void vp9_avg_source_sad(VP9_COMP *cpi) {
+  VP9_COMMON * const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  rc->high_source_sad = 0;  // Stays 0 when there is no previous source.
+  if (cpi->Last_Source != NULL) {
+    const uint8_t *src_y = cpi->Source->y_buffer;
+    const int src_ystride = cpi->Source->y_stride;
+    const uint8_t *last_src_y = cpi->Last_Source->y_buffer;
+    const int last_src_ystride = cpi->Last_Source->y_stride;
+    int sbi_row, sbi_col;
+    const BLOCK_SIZE bsize = BLOCK_64X64;
+    // Loop over sub-sample of frame, and compute average sad over 64x64 blocks.
+    uint64_t avg_sad = 0;  // Sum of block sads until divided below.
+    int num_samples = 0;
+    int sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+    int sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+    for (sbi_row = 0; sbi_row < sb_rows; sbi_row ++) {
+      for (sbi_col = 0; sbi_col < sb_cols; sbi_col ++) {
+        // Checker-board pattern, ignore boundary.
+        if ((sbi_row > 0 && sbi_col > 0) &&
+            (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) &&
+            ((sbi_row % 2 == 0 && sbi_col % 2 == 0) ||
+            (sbi_row % 2 != 0 && sbi_col % 2 != 0))) {
+          num_samples++;
+          avg_sad += cpi->fn_ptr[bsize].sdf(src_y,
+                                            src_ystride,
+                                            last_src_y,
+                                            last_src_ystride);
+        }
+        src_y += 64;  // Step to the next 64-wide block in this row.
+        last_src_y += 64;
+      }
+      src_y += (src_ystride << 6) - (sb_cols << 6);  // Down 64 rows, col 0.
+      last_src_y += (last_src_ystride << 6) - (sb_cols << 6);
+    }
+    if (num_samples > 0)
+      avg_sad = avg_sad / num_samples;
+    // Set high_source_sad flag if we detect very high increase in avg_sad
+    // between current and the previous frame value(s). Use a minimum threshold
+    // for cases where there is small change from content that is completely
+    // static.
+    if (avg_sad > VPXMAX(4000, (rc->avg_source_sad << 3)) &&
+        rc->frames_since_key > 1)
+      rc->high_source_sad = 1;
+    else
+      rc->high_source_sad = 0;
+    rc->avg_source_sad = (rc->avg_source_sad + avg_sad) >> 1;  // Running mean.
+  }
+}
+
+// Test if encoded frame will significantly overshoot the target bitrate, and
+// if so, set the QP, reset/adjust some rate control parameters, and return 1.
+int vp9_encodedframe_overshoot(VP9_COMP *cpi,
+                               int frame_size,
+                               int *q) {
+  VP9_COMMON * const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int thresh_qp = 3 * (rc->worst_quality >> 2);  // About 3/4 of worst_quality.
+  int thresh_rate = rc->avg_frame_bandwidth * 10;  // 10x the average frame.
+  if (cm->base_qindex < thresh_qp &&
+      frame_size > thresh_rate) {
+    double rate_correction_factor =
+        cpi->rc.rate_correction_factors[INTER_NORMAL];
+    const int target_size = cpi->rc.avg_frame_bandwidth;
+    double new_correction_factor;
+    int target_bits_per_mb;
+    double q2;
+    int enumerator;
+    // Force a re-encode, and for now use max-QP.
+    *q = cpi->rc.worst_quality;
+    // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as
+    // these parameters will affect QP selection for subsequent frames. If they
+    // have settled down to a very different (low QP) state, then not adjusting
+    // them may cause next frame to select low QP and overshoot again.
+    cpi->rc.avg_frame_qindex[INTER_FRAME] = *q;
+    rc->buffer_level = rc->optimal_buffer_level;
+    rc->bits_off_target = rc->optimal_buffer_level;
+    // Reset rate under/over-shoot flags.
+    cpi->rc.rc_1_frame = 0;
+    cpi->rc.rc_2_frame = 0;
+    // Adjust rate correction factor.
+    target_bits_per_mb = ((uint64_t)target_size << BPER_MB_NORMBITS) / cm->MBs;
+    // Rate correction factor based on target_bits_per_mb and qp (==max_QP).
+    // This comes from the inverse computation of vp9_rc_bits_per_mb().
+    q2 = vp9_convert_qindex_to_q(*q, cm->bit_depth);
+    enumerator = 1800000;  // Factor for inter frame.
+    enumerator += (int)(enumerator * q2) >> 12;
+    new_correction_factor = (double)target_bits_per_mb * q2 / enumerator;
+    if (new_correction_factor > rate_correction_factor) {
+      rate_correction_factor =
+          VPXMIN(2.0 * rate_correction_factor, new_correction_factor);
+      if (rate_correction_factor > MAX_BPB_FACTOR)
+        rate_correction_factor = MAX_BPB_FACTOR;
+      cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+    }
+    // For temporal layers, reset the rate control parameters across all
+    // temporal layers.
+    if (cpi->use_svc) {
+      int i = 0;
+      SVC *svc = &cpi->svc;
+      for (i = 0; i < svc->number_temporal_layers; ++i) {
+        const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
+                                           svc->number_temporal_layers);
+        LAYER_CONTEXT *lc = &svc->layer_context[layer];
+        RATE_CONTROL *lrc = &lc->rc;
+        lrc->avg_frame_qindex[INTER_FRAME] = *q;
+        lrc->buffer_level = lrc->optimal_buffer_level;  // Layer's own optimum.
+        lrc->bits_off_target = lrc->optimal_buffer_level;
+        lrc->rc_1_frame = 0;
+        lrc->rc_2_frame = 0;
+        lrc->rate_correction_factors[INTER_NORMAL] =
+            rate_correction_factor;
+      }
+    }
+    return 1;
+  } else {
+    return 0;
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_ratectrl.h b/libs/libvpx/vp9/encoder/vp9_ratectrl.h
new file mode 100644
index 0000000000..3df909cb18
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -0,0 +1,286 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_RATECTRL_H_
+#define VP9_ENCODER_VP9_RATECTRL_H_
+
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS    9
+
+#define MIN_GF_INTERVAL     4
+#define MAX_GF_INTERVAL     16
+#define FIXED_GF_INTERVAL   8    // Used in some testing modes only
+#define ONEHALFONLY_RESIZE  0
+
+typedef enum {
+  INTER_NORMAL = 0,
+  INTER_HIGH = 1,
+  GF_ARF_LOW = 2,
+  GF_ARF_STD = 3,
+  KF_STD = 4,
+  RATE_FACTOR_LEVELS = 5
+} RATE_FACTOR_LEVEL;
+
+// Internal frame scaling level.
+typedef enum {
+  UNSCALED = 0,     // Frame is unscaled.
+  SCALE_STEP1 = 1,  // First-level down-scaling.
+  FRAME_SCALE_STEPS
+} FRAME_SCALE_LEVEL;
+
+typedef enum {
+  NO_RESIZE = 0,
+  DOWN_THREEFOUR = 1,  // From orig to 3/4.
+  DOWN_ONEHALF = 2,    // From orig or 3/4 to 1/2.
+  UP_THREEFOUR = -1,   // From 1/2 to 3/4.
+  UP_ORIG = -2,        // From 1/2 or 3/4 to orig.
+} RESIZE_ACTION;
+
+typedef enum {
+  ORIG = 0,
+  THREE_QUARTER = 1,
+  ONE_HALF = 2
+} RESIZE_STATE;
+
+// Frame dimensions multiplier wrt the native frame size, in 1/16ths,
+// specified for the scale-up case.
+// e.g. 24 => 16/24 = 2/3 of native size. The restriction to 1/16th is
+// intended to match the capabilities of the normative scaling filters,
+// giving precedence to the up-scaling accuracy.
+static const int frame_scale_factor[FRAME_SCALE_STEPS] = {16, 24};
+
+// Multiplier of the target rate to be used as threshold for triggering scaling.
+static const double rate_thresh_mult[FRAME_SCALE_STEPS] = {1.0, 2.0};
+
+// Scale dependent Rate Correction Factor multipliers. Compensates for the
+// greater number of bits per pixel generated in down-scaled frames.
+static const double rcf_mult[FRAME_SCALE_STEPS] = {1.0, 2.0};
+
+typedef struct {
+  // Rate targeting variables
+  int base_frame_target;           // A baseline frame target before adjustment
+                                   // for previous under or over shoot.
+  int this_frame_target;           // Actual frame target after rc adjustment.
+  int projected_frame_size;
+  int sb64_target_rate;
+  int last_q[FRAME_TYPES];         // Separate values for Intra/Inter
+  int last_boosted_qindex;         // Last boosted GF/KF/ARF q
+  int last_kf_qindex;              // Q index of the last key frame coded.
+
+  int gfu_boost;
+  int last_boost;
+  int kf_boost;
+
+  double rate_correction_factors[RATE_FACTOR_LEVELS];
+
+  int frames_since_golden;
+  int frames_till_gf_update_due;
+  int min_gf_interval;
+  int max_gf_interval;
+  int static_scene_max_gf_interval;
+  int baseline_gf_interval;
+  int constrained_gf_group;
+  int frames_to_key;
+  int frames_since_key;
+  int this_key_frame_forced;
+  int next_key_frame_forced;
+  int source_alt_ref_pending;
+  int source_alt_ref_active;
+  int is_src_frame_alt_ref;
+
+  int avg_frame_bandwidth;  // Average frame size target for clip
+  int min_frame_bandwidth;  // Minimum allocation used for any frame
+  int max_frame_bandwidth;  // Maximum burst rate allowed for a frame.
+
+  int ni_av_qi;
+  int ni_tot_qi;
+  int ni_frames;
+  int avg_frame_qindex[FRAME_TYPES];
+  double tot_q;
+  double avg_q;
+
+  int64_t buffer_level;
+  int64_t bits_off_target;
+  int64_t vbr_bits_off_target;
+  int64_t vbr_bits_off_target_fast;
+
+  int decimation_factor;
+  int decimation_count;
+
+  int rolling_target_bits;
+  int rolling_actual_bits;
+
+  int long_rolling_target_bits;
+  int long_rolling_actual_bits;
+
+  int rate_error_estimate;
+
+  int64_t total_actual_bits;
+  int64_t total_target_bits;
+  int64_t total_target_vs_actual;
+
+  int worst_quality;
+  int best_quality;
+
+  int64_t starting_buffer_level;
+  int64_t optimal_buffer_level;
+  int64_t maximum_buffer_size;
+
+  // rate control history for last frame(1) and the frame before(2).
+  // -1: undershot
+  //  1: overshoot
+  //  0: not initialized.
+  int rc_1_frame;
+  int rc_2_frame;
+  int q_1_frame;
+  int q_2_frame;
+
+  // Auto frame-scaling variables.
+  FRAME_SCALE_LEVEL frame_size_selector;
+  FRAME_SCALE_LEVEL next_frame_size_selector;
+  int frame_width[FRAME_SCALE_STEPS];
+  int frame_height[FRAME_SCALE_STEPS];
+  int rf_level_maxq[RATE_FACTOR_LEVELS];
+
+  uint64_t avg_source_sad;
+  int high_source_sad;
+} RATE_CONTROL;
+
+struct VP9_COMP;
+struct VP9EncoderConfig;
+
+void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass,
+                 RATE_CONTROL *rc);
+
+int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+                           double correction_factor,
+                           vpx_bit_depth_t bit_depth);
+
+double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth);
+
+void vp9_rc_init_minq_luts(void);
+
+int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate);
+// Note vp9_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as big
+// as that.
+int vp9_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
+
+// Generally at the high level, the following flow is expected
+// to be enforced for rate control:
+// First call per frame, one of:
+//   vp9_rc_get_one_pass_vbr_params()
+//   vp9_rc_get_one_pass_cbr_params()
+//   vp9_rc_get_svc_params()
+//   vp9_rc_get_first_pass_params()
+//   vp9_rc_get_second_pass_params()
+// depending on the usage to set the rate control encode parameters desired.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by one of:
+//   vp9_rc_postencode_update()
+//   vp9_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the vp9_rc_get_..._params() functions and
+// updated during the vp9_rc_postencode_update...() functions.
+// The only exceptions are vp9_rc_drop_frame() and
+// vp9_rc_update_rate_correction_factors() functions.
+
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
+void vp9_rc_get_one_pass_vbr_params(struct VP9_COMP *cpi);
+void vp9_rc_get_one_pass_cbr_params(struct VP9_COMP *cpi);
+void vp9_rc_get_svc_params(struct VP9_COMP *cpi);
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void vp9_rc_postencode_update(struct VP9_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi);
+
+// Updates rate correction factors
+// Changes only the rate correction factors in the rate control structure.
+void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi);
+
+// Decide if we should drop this frame: For 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int vp9_rc_drop_frame(struct VP9_COMP *cpi);
+
+// Computes frame size bounds.
+void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi,
+                                      int this_frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit);
+
+// Picks q and q bounds given the target for bits
+int vp9_rc_pick_q_and_bounds(const struct VP9_COMP *cpi,
+                             int *bottom_index,
+                             int *top_index);
+
+// Estimates q to achieve a target bits per frame
+int vp9_rc_regulate_q(const struct VP9_COMP *cpi, int target_bits_per_frame,
+                      int active_best_quality, int active_worst_quality);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor, vpx_bit_depth_t bit_depth);
+
+// Clamping utilities for bitrate targets for iframes and pframes.
+int vp9_rc_clamp_iframe_target_size(const struct VP9_COMP *const cpi,
+                                    int target);
+int vp9_rc_clamp_pframe_target_size(const struct VP9_COMP *const cpi,
+                                    int target);
+// Utility to set frame_target into the RATE_CONTROL structure
+// This function is called only from the vp9_rc_get_..._params() functions.
+void vp9_rc_set_frame_target(struct VP9_COMP *cpi, int target);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                       vpx_bit_depth_t bit_depth);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+                               int qindex, double rate_target_ratio,
+                               vpx_bit_depth_t bit_depth);
+
+int vp9_frame_type_qdelta(const struct VP9_COMP *cpi, int rf_level, int q);
+
+void vp9_rc_update_framerate(struct VP9_COMP *cpi);
+
+void vp9_rc_set_gf_interval_range(const struct VP9_COMP *const cpi,
+                                  RATE_CONTROL *const rc);
+
+void vp9_set_target_rate(struct VP9_COMP *cpi);
+
+int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi);
+
+void vp9_avg_source_sad(struct VP9_COMP *cpi);
+
+int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_RATECTRL_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_rd.c b/libs/libvpx/vp9/encoder/vp9_rd.c
new file mode 100644
index 0000000000..fc32d19112
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_rd.c
@@ -0,0 +1,667 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vp9_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/bitops.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_tokenize.h"
+
+#define RD_THRESH_POW      1.25
+
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+void vp9_rd_cost_reset(RD_COST *rd_cost) {
+  rd_cost->rdcost = INT64_MAX;  // Every field takes its type's maximum.
+  rd_cost->dist = INT64_MAX;
+  rd_cost->rate = INT_MAX;
+}
+
+void vp9_rd_cost_init(RD_COST *rd_cost) {
+  rd_cost->rdcost = 0;  // All three fields start from zero.
+  rd_cost->dist = 0;
+  rd_cost->rate = 0;
+}
+
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table, indexed by block size, is used to correct for block size.
+// The factors here are << 2 (2 = x0.5, 4 = x1, 32 = x8 etc).
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
+  2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32
+};
+
+static void fill_mode_costs(VP9_COMP *cpi) {
+  const FRAME_CONTEXT *const ctx = cpi->common.fc;
+  int m, n;
+  // Luma intra mode costs taken from the vp9_kf_y_mode_prob table.
+  for (m = 0; m < INTRA_MODES; ++m)
+    for (n = 0; n < INTRA_MODES; ++n)
+      vp9_cost_tokens(cpi->y_mode_costs[m][n], vp9_kf_y_mode_prob[m][n],
+                      vp9_intra_mode_tree);
+  // Luma cost from the frame context, then both chroma cost tables.
+  vp9_cost_tokens(cpi->mbmode_cost, ctx->y_mode_prob[1], vp9_intra_mode_tree);
+  for (m = 0; m < INTRA_MODES; ++m) {
+    vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME][m],
+                    vp9_kf_uv_mode_prob[m], vp9_intra_mode_tree);
+    vp9_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME][m],
+                    ctx->uv_mode_prob[m], vp9_intra_mode_tree);
+  }
+  // One cost row per switchable interpolation filter context.
+  for (m = 0; m < SWITCHABLE_FILTER_CONTEXTS; ++m)
+    vp9_cost_tokens(cpi->switchable_interp_costs[m],
+                    ctx->switchable_interp_prob[m], vp9_switchable_interp_tree);
+}
+
+static void fill_token_costs(vp9_coeff_cost *c,
+                             vp9_coeff_probs_model (*p)[PLANE_TYPES]) {
+  // Expands each model probability set and fills both cost tables from it.
+  int plane, ref, band, ctx;
+  TX_SIZE tx;
+  for (tx = TX_4X4; tx <= TX_32X32; ++tx)
+    for (plane = 0; plane < PLANE_TYPES; ++plane)
+      for (ref = 0; ref < REF_TYPES; ++ref)
+        for (band = 0; band < COEF_BANDS; ++band)
+          for (ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx) {
+            int *const full = (int *)c[tx][plane][ref][band][0][ctx];
+            int *const skip = (int *)c[tx][plane][ref][band][1][ctx];
+            vpx_prob probs[ENTROPY_NODES];
+            vp9_model_to_full_probs(p[tx][plane][ref][band][ctx], probs);
+            vp9_cost_tokens(full, probs, vp9_coef_tree);
+            vp9_cost_tokens_skip(skip, probs, vp9_coef_tree);
+            assert(full[EOB_TOKEN] == skip[EOB_TOKEN]);
+          }
+}
+
+// SAD-per-bit tables indexed by qindex; values are correlated to quantizer.
+static int sad_per_bit16lut_8[QINDEX_RANGE];
+static int sad_per_bit4lut_8[QINDEX_RANGE];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static int sad_per_bit16lut_10[QINDEX_RANGE];
+static int sad_per_bit4lut_10[QINDEX_RANGE];
+static int sad_per_bit16lut_12[QINDEX_RANGE];
+static int sad_per_bit4lut_12[QINDEX_RANGE];
+#endif
+
+static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
+                            vpx_bit_depth_t bit_depth) {
+  // Both tables are a linear function of the real quantizer for now, which
+  // makes the impact of experimental quantizer table changes easier to see.
+  int qindex = 0;
+  while (qindex < range) {
+    const double real_q = vp9_convert_qindex_to_q(qindex, bit_depth);
+    bit16lut[qindex] = (int)(0.0418 * real_q + 2.4107);
+    bit4lut[qindex] = (int)(0.063 * real_q + 2.742);
+    ++qindex;
+  }
+}
+
+void vp9_init_me_luts(void) {
+  init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
+                  VPX_BITS_8);  // The 8-bit tables are always filled.
+#if CONFIG_VP9_HIGHBITDEPTH
+  init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
+                  VPX_BITS_10);  // 10/12-bit tables: high bit depth only.
+  init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
+                  VPX_BITS_12);
+#endif
+}
+
+// Second-pass rdmult scaling tables, both in 1/128 units (applied with >> 7).
+static const int rd_boost_factor[16] = {
+  64, 32, 32, 32, 24, 16, 12, 12, 8, 8, 4, 4, 2, 2, 1, 0
+};
+static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
+  128, 144, 128, 128, 144
+};
+
+int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
+  // Base multiplier: 88/24 times the squared value from vp9_dc_quant().
+  const int64_t q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth);
+  int64_t rdmult = 88 * q * q / 24;
+#if CONFIG_VP9_HIGHBITDEPTH
+  // 10- and 12-bit values are scaled down (rounded) by 4 and 8 bits.
+  switch (cpi->common.bit_depth) {
+    case VPX_BITS_8:
+      break;
+    case VPX_BITS_10:
+      rdmult = ROUND_POWER_OF_TWO(rdmult, 4);
+      break;
+    case VPX_BITS_12:
+      rdmult = ROUND_POWER_OF_TWO(rdmult, 8);
+      break;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  if (cpi->oxcf.pass == 2 && cpi->common.frame_type != KEY_FRAME) {
+    // Second pass only: weight by frame update type, then by gfu_boost.
+    const GF_GROUP *const group = &cpi->twopass.gf_group;
+    const FRAME_UPDATE_TYPE update = group->update_type[group->index];
+    const int boost_idx = VPXMIN(15, (cpi->rc.gfu_boost / 100));
+
+    rdmult = (rdmult * rd_frame_type_factor[update]) >> 7;
+    rdmult += ((rdmult * rd_boost_factor[boost_idx]) >> 7);
+  }
+  // The result is never allowed below 1.
+  return (int)(rdmult < 1 ? 1 : rdmult);
+}
+
+static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
+  double scale = 4.0;  // Divisor for q; grows 4x per 2 extra bits of depth.
+  double q;
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8:
+      break;
+    case VPX_BITS_10:
+      scale = 16.0;
+      break;
+    case VPX_BITS_12:
+      scale = 64.0;
+      break;
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  bit_depth = VPX_BITS_8;  // Here the 8-bit quantizer is always used.
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  q = vp9_dc_quant(qindex, 0, bit_depth) / scale;
+  // TODO(debargha): Adjust the function below.
+  return VPXMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
+}
+
+void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (cpi->common.bit_depth) {  // One table pair per supported depth.
+    case VPX_BITS_8:
+      x->sadperbit16 = sad_per_bit16lut_8[qindex];
+      x->sadperbit4 = sad_per_bit4lut_8[qindex];
+      break;
+    case VPX_BITS_10:
+      x->sadperbit16 = sad_per_bit16lut_10[qindex];
+      x->sadperbit4 = sad_per_bit4lut_10[qindex];
+      break;
+    case VPX_BITS_12:
+      x->sadperbit16 = sad_per_bit16lut_12[qindex];
+      x->sadperbit4 = sad_per_bit4lut_12[qindex];
+      break;
+    default:  // x is left untouched here when asserts are compiled out.
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+  }
+#else
+  (void)cpi;  // Only needed for the bit depth lookup above.
+  x->sadperbit16 = sad_per_bit16lut_8[qindex];
+  x->sadperbit4 = sad_per_bit4lut_8[qindex];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
+  int mode, bs, seg;
+
+  // Fills rd->threshes[][][] with
+  //   thresh_mult * q-factor * block-size factor / 4,
+  // where the qindex (and hence q) is looked up separately per segment.
+  for (seg = 0; seg < MAX_SEGMENTS; ++seg) {
+    const int qindex =
+        clamp(vp9_get_qindex(&cm->seg, seg, cm->base_qindex) +
+              cm->y_dc_delta_q, 0, MAXQ);
+    const int q = compute_rd_thresh_factor(qindex, cm->bit_depth);
+
+    for (bs = 0; bs < BLOCK_SIZES; ++bs) {
+      // Threshold here seems unnecessarily harsh but fine given actual
+      // range of values used for cpi->sf.thresh_mult[].
+      const int t = q * rd_thresh_block_size_factor[bs];
+      const int thresh_max = INT_MAX / t;
+      // Sizes below 8x8 use the shorter sub8x8 multiplier list.
+      const int sub8x8 = bs < BLOCK_8X8;
+      const int count = sub8x8 ? MAX_REFS : MAX_MODES;
+      const int *const mult =
+          sub8x8 ? rd->thresh_mult_sub8x8 : rd->thresh_mult;
+      int *const out = rd->threshes[seg][bs];
+
+      // Multipliers not below INT_MAX / t saturate to INT_MAX.
+      for (mode = 0; mode < count; ++mode)
+        out[mode] =
+            mult[mode] < thresh_max ? mult[mode] * t / 4 : INT_MAX;
+    }
+  }
+}
+
+void vp9_initialize_rd_consts(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  RD_OPT *const rd = &cpi->rd;
+  int i;
+
+  vpx_clear_system_state();
+
+  rd->RDDIV = RDDIV_BITS;  // In bits (to multiply D by 128).
+  rd->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
+
+  set_error_per_bit(x, rd->RDMULT);  // RDMULT >> RD_EPB_SHIFT, 0 becomes 1.
+
+  x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
+                       cm->frame_type != KEY_FRAME) ? 0 : 1;
+
+  set_block_thresholds(cm, rd);  // Fills rd->threshes for every segment.
+  set_partition_probs(cm, xd);
+
+  if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
+    fill_token_costs(x->token_costs, cm->fc->coef_probs);
+
+  if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
+      cm->frame_type == KEY_FRAME) {
+    for (i = 0; i < PARTITION_CONTEXTS; ++i)
+      vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i),
+                      vp9_partition_tree);
+  }
+
+  if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 ||
+      cm->frame_type == KEY_FRAME) {  // Non-RD: 1 frame in 8 and key frames.
+    fill_mode_costs(cpi);
+
+    if (!frame_is_intra_only(cm)) {  // MV and inter mode costs.
+      vp9_build_nmv_cost_table(x->nmvjointcost,
+                               cm->allow_high_precision_mv ? x->nmvcost_hp
+                                                           : x->nmvcost,
+                               &cm->fc->nmvc, cm->allow_high_precision_mv);
+
+      for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+        vp9_cost_tokens((int *)cpi->inter_mode_cost[i],
+                        cm->fc->inter_mode_probs[i], vp9_inter_mode_tree);
+    }
+  }
+}
+
+static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
+  // NOTE: The tables below must be of the same size.
+
+  // The functions described below are sampled at the four most significant
+  // bits of x^2 + 8 / 256.
+
+  // Normalized rate:
+  // This table models the rate for a Laplacian source with given variance
+  // when quantized with a uniform quantizer with given stepsize. The
+  // closed form expression is:
+  // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+  // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+  // and H(x) is the binary entropy function.
+  static const int rate_tab_q10[] = {
+    65536,  6086,  5574,  5275,  5063,  4899,  4764,  4651,
+     4553,  4389,  4255,  4142,  4044,  3958,  3881,  3811,
+     3748,  3635,  3538,  3453,  3376,  3307,  3244,  3186,
+     3133,  3037,  2952,  2877,  2809,  2747,  2690,  2638,
+     2589,  2501,  2423,  2353,  2290,  2232,  2179,  2130,
+     2084,  2001,  1928,  1862,  1802,  1748,  1698,  1651,
+     1608,  1530,  1460,  1398,  1342,  1290,  1243,  1199,
+     1159,  1086,  1021,   963,   911,   864,   821,   781,
+      745,   680,   623,   574,   530,   490,   455,   424,
+      395,   345,   304,   269,   239,   213,   190,   171,
+      154,   126,   104,    87,    73,    61,    52,    44,
+       38,    28,    21,    16,    12,    10,     8,     6,
+        5,     3,     2,     1,     1,     1,     0,     0,
+  };
+  // Normalized distortion:
+  // This table models the normalized distortion for a Laplacian source
+  // with given variance when quantized with a uniform quantizer
+  // with given stepsize. The closed form expression is:
+  // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+  // where x = qpstep / sqrt(variance).
+  // Note the actual distortion is Dn * variance.
+  static const int dist_tab_q10[] = {
+       0,     0,     1,     1,     1,     2,     2,     2,
+       3,     3,     4,     5,     5,     6,     7,     7,
+       8,     9,    11,    12,    13,    15,    16,    17,
+      18,    21,    24,    26,    29,    31,    34,    36,
+      39,    44,    49,    54,    59,    64,    69,    73,
+      78,    88,    97,   106,   115,   124,   133,   142,
+     151,   167,   184,   200,   215,   231,   245,   260,
+     274,   301,   327,   351,   375,   397,   418,   439,
+     458,   495,   528,   559,   587,   613,   637,   659,
+     680,   717,   749,   777,   801,   823,   842,   859,
+     874,   899,   919,   936,   949,   960,   969,   977,
+     983,   994,  1001,  1006,  1010,  1013,  1015,  1017,
+    1018,  1020,  1022,  1022,  1023,  1023,  1023,  1024,
+  };
+  static const int xsq_iq_q10[] = {  // x^2 (Q10) at each table entry.
+         0,      4,      8,     12,     16,     20,     24,     28,
+        32,     40,     48,     56,     64,     72,     80,     88,
+        96,    112,    128,    144,    160,    176,    192,    208,
+       224,    256,    288,    320,    352,    384,    416,    448,
+       480,    544,    608,    672,    736,    800,    864,    928,
+       992,   1120,   1248,   1376,   1504,   1632,   1760,   1888,
+      2016,   2272,   2528,   2784,   3040,   3296,   3552,   3808,
+      4064,   4576,   5088,   5600,   6112,   6624,   7136,   7648,
+      8160,   9184,  10208,  11232,  12256,  13280,  14304,  15328,
+     16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,
+     32736,  36832,  40928,  45024,  49120,  53216,  57312,  61408,
+     65504,  73696,  81888,  90080,  98272, 106464, 114656, 122848,
+    131040, 147424, 163808, 180192, 196576, 212960, 229344, 245728,
+  };
+  const int tmp = (xsq_q10 >> 2) + 8;
+  const int k = get_msb(tmp) - 3;
+  const int xq = (k << 3) + ((tmp >> k) & 0x7);  // 8 table entries per k.
+  const int one_q10 = 1 << 10;  // 1.0 in Q10.
+  const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
+  const int b_q10 = one_q10 - a_q10;  // Weights: a for xq + 1, b for xq.
+  *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
+  *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
+}
+
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
+                                  unsigned int qstep, int *rate,
+                                  int64_t *dist) {
+  // Models the rate and distortion of a Laplacian source with the given
+  // variance when it is quantized with a uniform quantizer of the given
+  // stepsize. The closed form expressions are in:
+  // Hang and Chen, "Source Model for transform video coder and its
+  // application - Part I: Fundamental Theory", IEEE Trans. Circ.
+  // Sys. for Video Tech., April 1997.
+  static const uint32_t MAX_XSQ_Q10 = 245727;
+  int d_q10, r_q10, xsq_q10;
+  uint64_t xsq_q10_64;
+  if (var == 0) {  // Zero variance: rate and distortion are both 0.
+    *rate = 0;
+    *dist = 0;
+    return;
+  }
+  xsq_q10_64 = (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
+  xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
+  model_rd_norm(xsq_q10, &r_q10, &d_q10);
+  *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - VP9_PROB_COST_SHIFT);
+  *dist = (var * (int64_t)d_q10 + 512) >> 10;
+}
+
+void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              const struct macroblockd_plane *pd,
+                              ENTROPY_CONTEXT t_above[16],
+                              ENTROPY_CONTEXT t_left[16]) {
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+  const ENTROPY_CONTEXT *const above = pd->above_context;
+  const ENTROPY_CONTEXT *const left = pd->left_context;
+
+  int i;
+  switch (tx_size) {  // NOTE(review): wide loads below presume alignment.
+    case TX_4X4:  // Contexts are copied through unchanged.
+      memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+      memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+      break;
+    case TX_8X8:  // 2 contexts per read; assumes 1-byte ENTROPY_CONTEXT.
+      for (i = 0; i < num_4x4_w; i += 2)
+        t_above[i] = !!*(const uint16_t *)&above[i];  // Only entry i is set.
+      for (i = 0; i < num_4x4_h; i += 2)
+        t_left[i] = !!*(const uint16_t *)&left[i];
+      break;
+    case TX_16X16:  // Same scheme, merging groups of 4 (uint32_t reads).
+      for (i = 0; i < num_4x4_w; i += 4)
+        t_above[i] = !!*(const uint32_t *)&above[i];
+      for (i = 0; i < num_4x4_h; i += 4)
+        t_left[i] = !!*(const uint32_t *)&left[i];
+      break;
+    case TX_32X32:  // Same scheme, merging groups of 8 (uint64_t reads).
+      for (i = 0; i < num_4x4_w; i += 8)
+        t_above[i] = !!*(const uint64_t *)&above[i];
+      for (i = 0; i < num_4x4_h; i += 8)
+        t_left[i] = !!*(const uint64_t *)&left[i];
+      break;
+    default:
+      assert(0 && "Invalid transform size.");
+      break;
+  }
+}
+
+void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
+                 uint8_t *ref_y_buffer, int ref_y_stride,
+                 int ref_frame, BLOCK_SIZE block_size) {
+  const int num_mv_refs = MAX_MV_REF_CANDIDATES +
+                          (cpi->sf.adaptive_motion_search &&
+                           block_size < x->max_partition_size);
+  // The second ref mv is skipped when it duplicates the first.
+  const int near_same_nearest =
+      x->mbmi_ext->ref_mvs[ref_frame][0].as_int ==
+      x->mbmi_ext->ref_mvs[ref_frame][1].as_int;
+  uint8_t *const src_y_ptr = x->plane[0].src.buf;
+  int best_index = 0;
+  int best_sad = INT_MAX;
+  int max_mv = 0;
+  int zero_seen = 0;
+  int idx;
+  MV pred_mv[3];
+
+  pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv;
+  pred_mv[1] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv;
+  pred_mv[2] = x->pred_mv[ref_frame];
+  assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
+
+  // Get the sad for each candidate reference mv.
+  for (idx = 0; idx < num_mv_refs; ++idx) {
+    const MV *const cand = &pred_mv[idx];
+    const int fp_row = (cand->row + 3 + (cand->row >= 0)) >> 3;
+    const int fp_col = (cand->col + 3 + (cand->col >= 0)) >> 3;
+    const int is_zero = fp_row == 0 && fp_col == 0;
+    int this_sad;
+
+    if (idx == 1 && near_same_nearest)
+      continue;
+    max_mv = VPXMAX(max_mv, VPXMAX(abs(cand->row), abs(cand->col)) >> 3);
+
+    // Only the first candidate that rounds to (0, 0) is evaluated.
+    if (is_zero && zero_seen)
+      continue;
+    zero_seen |= is_zero;
+
+    // Find sad for current vector.
+    this_sad = cpi->fn_ptr[block_size].sdf(
+        src_y_ptr, x->plane[0].src.stride,
+        &ref_y_buffer[ref_y_stride * fp_row + fp_col], ref_y_stride);
+    // Note if it is the best so far.
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      best_index = idx;
+    }
+  }
+
+  // Note the index of the mv that worked best in the reference list.
+  x->mv_best_ref_index[ref_frame] = best_index;
+  x->max_mv_context[ref_frame] = max_mv;
+  x->pred_mv_sad[ref_frame] = best_sad;
+}
+
+void vp9_setup_pred_block(const MACROBLOCKD *xd,
+                          struct buf_2d dst[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col,
+                          const struct scale_factors *scale,
+                          const struct scale_factors *scale_uv) {
+  int plane;
+  // Seed each plane from the frame buffer; both chroma planes share a stride.
+  dst[0].buf = src->y_buffer;
+  dst[1].buf = src->u_buffer;
+  dst[2].buf = src->v_buffer;
+  dst[0].stride = src->y_stride;
+  dst[1].stride = dst[2].stride = src->uv_stride;
+  // Hand each plane to setup_pred_plane() with its scale and subsampling.
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+    setup_pred_plane(&dst[plane], dst[plane].buf, dst[plane].stride, mi_row,
+                     mi_col, plane ? scale_uv : scale,
+                     xd->plane[plane].subsampling_x,
+                     xd->plane[plane].subsampling_y);
+}
+
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize,
+                            int raster_block, int stride) {
+  // Splits raster_block into a row and column, each scaled by 4.
+  const int width_log2 = b_width_log2_lookup[plane_bsize];
+  const int col4 = raster_block & ((1 << width_log2) - 1);
+  return 4 * (raster_block >> width_log2) * stride + 4 * col4;
+}
+
+int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+                                       int raster_block, int16_t *base) {
+  const int row_len = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  return &base[vp9_raster_block_offset(plane_bsize, raster_block, row_len)];
+}
+
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
+                                             int ref_frame) {
+  // NULL when the scaled index is invalid or equals the reference's own.
+  const int scaled_buf = cpi->scaled_ref_idx[ref_frame - 1];
+  const int ref_buf = get_ref_frame_buf_idx(cpi, ref_frame);
+  if (scaled_buf == ref_buf || scaled_buf == INVALID_IDX)
+    return NULL;
+  return &cpi->common.buffer_pool->frame_bufs[scaled_buf].buf;
+}
+
+int vp9_get_switchable_rate(const VP9_COMP *cpi, const MACROBLOCKD *const xd) {
+  // Cost row is picked by the predicted context, entry by the block's filter.
+  const int *const costs =
+      cpi->switchable_interp_costs[vp9_get_pred_context_switchable_interp(xd)];
+  return SWITCHABLE_INTERP_RATE_FACTOR * costs[xd->mi[0]->interp_filter];
+}
+
+void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
+  // Each mode's multiplier is a baseline (-500 in BEST mode, else 0) plus a
+  // fixed per-mode offset; the additions below are grouped by offset size.
+  RD_OPT *const rd = &cpi->rd;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const int baseline = cpi->oxcf.mode == BEST ? -500 : 0;
+  const int nearest = sf->adaptive_rd_thresh ? 300 : 0;
+  int mode;
+
+  // Set baseline threshold values.
+  for (mode = 0; mode < MAX_MODES; ++mode)
+    rd->thresh_mult[mode] = baseline;
+
+  // The single-reference NEAREST modes are set outright, not offset.
+  rd->thresh_mult[THR_NEARESTMV] = nearest;
+  rd->thresh_mult[THR_NEARESTG] = nearest;
+  rd->thresh_mult[THR_NEARESTA] = nearest;
+
+  // Offset 1000.
+  rd->thresh_mult[THR_DC] += 1000;
+  rd->thresh_mult[THR_TM] += 1000;
+  rd->thresh_mult[THR_NEWMV] += 1000;
+  rd->thresh_mult[THR_NEWA] += 1000;
+  rd->thresh_mult[THR_NEWG] += 1000;
+  rd->thresh_mult[THR_NEARMV] += 1000;
+  rd->thresh_mult[THR_NEARA] += 1000;
+  rd->thresh_mult[THR_NEARG] += 1000;
+  rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
+  rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
+
+  // Offset 1500.
+  rd->thresh_mult[THR_COMP_NEARLA] += 1500;
+  rd->thresh_mult[THR_COMP_NEARGA] += 1500;
+
+  // Offset 2000.
+  rd->thresh_mult[THR_COMP_NEWLA] += 2000;
+  rd->thresh_mult[THR_COMP_NEWGA] += 2000;
+  rd->thresh_mult[THR_ZEROMV] += 2000;
+  rd->thresh_mult[THR_ZEROG] += 2000;
+  rd->thresh_mult[THR_ZEROA] += 2000;
+  rd->thresh_mult[THR_H_PRED] += 2000;
+  rd->thresh_mult[THR_V_PRED] += 2000;
+
+  // Offset 2500.
+  rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
+  rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
+  rd->thresh_mult[THR_D45_PRED] += 2500;
+  rd->thresh_mult[THR_D135_PRED] += 2500;
+  rd->thresh_mult[THR_D117_PRED] += 2500;
+  rd->thresh_mult[THR_D153_PRED] += 2500;
+  rd->thresh_mult[THR_D207_PRED] += 2500;
+  rd->thresh_mult[THR_D63_PRED] += 2500;
+}
+
+void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
+  static const int thresh_mult[2][MAX_REFS] = {
+    { 2500, 2500, 2500, 4500, 4500, 2500 },  // Any mode other than BEST.
+    { 2000, 2000, 2000, 4000, 4000, 2000 }   // BEST mode.
+  };
+  const int *const row = thresh_mult[cpi->oxcf.mode == BEST];
+  memcpy(cpi->rd.thresh_mult_sub8x8, row, sizeof(thresh_mult[0]));
+}
+
+void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
+                               int bsize, int best_mode_index) {
+  const BLOCK_SIZE first_bs = VPXMAX(bsize - 1, BLOCK_4X4);
+  const BLOCK_SIZE last_bs = VPXMIN(bsize + 2, BLOCK_64X64);
+  const int num_modes = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
+  BLOCK_SIZE bs;
+  int mode;
+  if (rd_thresh <= 0)
+    return;
+  // Over bsize-1..bsize+2: decay the best mode's factor, grow the others.
+  for (mode = 0; mode < num_modes; ++mode) {
+    for (bs = first_bs; bs <= last_bs; ++bs) {
+      int *const fact = &factor_buf[bs][mode];
+      if (mode != best_mode_index)
+        *fact = VPXMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT);
+      else
+        *fact -= (*fact >> 4);
+    }
+  }
+}
+
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,
+                               vpx_bit_depth_t bit_depth) {
+  // The penalty is 20 times the vp9_dc_quant() value at 8 bits; with high
+  // bit depth enabled the factor drops by 4x for each 2 extra bits
+  // (20 -> 5 -> 5/4, the last one rounded).
+  const int dc_q = vp9_dc_quant(qindex, qdelta, bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (bit_depth) {
+    case VPX_BITS_8: return 20 * dc_q;
+    case VPX_BITS_10: return 5 * dc_q;
+    case VPX_BITS_12: return ROUND_POWER_OF_TWO(5 * dc_q, 2);
+    default:
+      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+      return -1;
+  }
+#else
+  return 20 * dc_q;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
diff --git a/libs/libvpx/vp9/encoder/vp9_rd.h b/libs/libvpx/vp9/encoder/vp9_rd.h
new file mode 100644
index 0000000000..9b8e2732c5
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_rd.h
@@ -0,0 +1,204 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_RD_H_
+#define VP9_ENCODER_VP9_RD_H_
+
+#include <limits.h>
+
+#include "vp9/common/vp9_blockd.h"
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_cost.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDDIV_BITS          7
+#define RD_EPB_SHIFT        6
+
+#define RDCOST(RM, DM, R, D) \
+  (ROUND_POWER_OF_TWO((int64_t)(R) * (RM), VP9_PROB_COST_SHIFT) + ((D) << (DM)))
+#define QIDX_SKIP_THRESH     115
+
+#define MV_COST_WEIGHT      108
+#define MV_COST_WEIGHT_SUB  120
+
+#define INVALID_MV 0x80008000
+
+#define MAX_MODES 30
+#define MAX_REFS  6
+
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC      1
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
+typedef enum {
+  THR_NEARESTMV,
+  THR_NEARESTA,
+  THR_NEARESTG,
+
+  THR_DC,
+
+  THR_NEWMV,
+  THR_NEWA,
+  THR_NEWG,
+
+  THR_NEARMV,
+  THR_NEARA,
+  THR_NEARG,
+
+  THR_ZEROMV,
+  THR_ZEROG,
+  THR_ZEROA,
+
+  THR_COMP_NEARESTLA,
+  THR_COMP_NEARESTGA,
+
+  THR_TM,
+
+  THR_COMP_NEARLA,
+  THR_COMP_NEWLA,
+  THR_COMP_NEARGA,
+  THR_COMP_NEWGA,
+
+  THR_COMP_ZEROLA,
+  THR_COMP_ZEROGA,
+
+  THR_H_PRED,
+  THR_V_PRED,
+  THR_D135_PRED,
+  THR_D207_PRED,
+  THR_D153_PRED,
+  THR_D63_PRED,
+  THR_D117_PRED,
+  THR_D45_PRED,  // 30 enumerators in total, matching MAX_MODES.
+} THR_MODES;
+
+typedef enum {
+  THR_LAST,
+  THR_GOLD,
+  THR_ALTR,
+  THR_COMP_LA,
+  THR_COMP_GA,
+  THR_INTRA,  // 6 enumerators in total, matching MAX_REFS.
+} THR_MODES_SUB8X8;
+
+typedef struct RD_OPT {
+  // Thresh_mult is used to set a threshold for the rd score. A higher value
+  // means that we will accept the best mode so far more often. This number
+  // is used in combination with the current block size, and thresh_freq_fact
+  // to pick a threshold.
+  int thresh_mult[MAX_MODES];
+  int thresh_mult_sub8x8[MAX_REFS];  // Used for block sizes below 8x8.
+
+  int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
+
+  int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
+
+  int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
+
+  int RDMULT;  // Set from vp9_compute_rd_mult().
+  int RDDIV;   // Set to RDDIV_BITS (a shift, in bits, applied to D).
+} RD_OPT;
+
+typedef struct RD_COST {
+  int rate;        // INT_MAX after vp9_rd_cost_reset(), 0 after init.
+  int64_t dist;    // INT64_MAX after vp9_rd_cost_reset(), 0 after init.
+  int64_t rdcost;  // INT64_MAX after vp9_rd_cost_reset(), 0 after init.
+} RD_COST;
+
+// Reset the rate distortion cost values to maximum (invalid) value.
+void vp9_rd_cost_reset(RD_COST *rd_cost);
+// Initialize the rate distortion cost values to zero.
+void vp9_rd_cost_init(RD_COST *rd_cost);
+
+struct TileInfo;
+struct TileDataEnc;
+struct VP9_COMP;
+struct macroblock;
+
+int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex);
+
+void vp9_initialize_rd_consts(struct VP9_COMP *cpi);
+
+void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex);
+
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
+                                  unsigned int qstep, int *rate,
+                                  int64_t *dist);
+
+int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
+                            const MACROBLOCKD *const xd);
+
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize,
+                            int raster_block, int stride);
+
+int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+                                       int raster_block, int16_t *base);
+
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
+                                             int ref_frame);
+
+void vp9_init_me_luts(void);
+
+void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              const struct macroblockd_plane *pd,
+                              ENTROPY_CONTEXT t_above[16],
+                              ENTROPY_CONTEXT t_left[16]);
+
+void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi);
+
+void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);
+
+void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
+                               int bsize, int best_mode_index);
+
+static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
+                                      int thresh_fact) {
+  return thresh == INT_MAX || best_rd < ((int64_t)thresh * thresh_fact >> 5);
+}
+
+static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
+  x->errorperbit = rdmult >> RD_EPB_SHIFT;
+  if (x->errorperbit == 0) x->errorperbit = 1;  // Zero is bumped up to one.
+}
+
+void vp9_mv_pred(struct VP9_COMP *cpi, MACROBLOCK *x,
+                 uint8_t *ref_y_buffer, int ref_y_stride,
+                 int ref_frame, BLOCK_SIZE block_size);
+
+void vp9_setup_pred_block(const MACROBLOCKD *xd,
+                          struct buf_2d dst[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col,
+                          const struct scale_factors *scale,
+                          const struct scale_factors *scale_uv);
+
+int vp9_get_intra_cost_penalty(int qindex, int qdelta,
+                               vpx_bit_depth_t bit_depth);
+
+unsigned int vp9_get_sby_perpixel_variance(struct VP9_COMP *cpi,
+                                           const struct buf_2d *ref,
+                                           BLOCK_SIZE bs);
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi,
+                                                const struct buf_2d *ref,
+                                                BLOCK_SIZE bs, int bd);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_RD_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_rdopt.c b/libs/libvpx/vp9/encoder/vp9_rdopt.c
new file mode 100644
index 0000000000..193c9d33cf
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_rdopt.c
@@ -0,0 +1,4369 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_aq_variance.h"
+
+#define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << INTRA_FRAME))
+#define GOLDEN_FRAME_MODE_MASK  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << INTRA_FRAME))
+#define ALT_REF_MODE_MASK       ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
+                                 (1 << INTRA_FRAME))
+
+#define SECOND_REF_FRAME_MASK   ((1 << ALTREF_FRAME) | 0x01)
+
+#define MIN_EARLY_TERM_INDEX    3
+#define NEW_MV_DISCOUNT_FACTOR  8
+
+typedef struct {
+  PREDICTION_MODE mode;
+  MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame[2];
+} REF_DEFINITION;
+
+struct rdcost_block_args {
+  MACROBLOCK *x;
+  ENTROPY_CONTEXT t_above[16];
+  ENTROPY_CONTEXT t_left[16];
+  int this_rate;
+  int64_t this_dist;
+  int64_t this_sse;
+  int64_t this_rd;
+  int64_t best_rd;
+  int exit_early;
+  int use_fast_coef_costing;
+  const scan_order *so;
+  uint8_t skippable;
+};
+
+#define LAST_NEW_MV_INDEX 6
+static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
+  {NEARESTMV, {LAST_FRAME,   NONE}},
+  {NEARESTMV, {ALTREF_FRAME, NONE}},
+  {NEARESTMV, {GOLDEN_FRAME, NONE}},
+
+  {DC_PRED,   {INTRA_FRAME,  NONE}},
+
+  {NEWMV,     {LAST_FRAME,   NONE}},
+  {NEWMV,     {ALTREF_FRAME, NONE}},
+  {NEWMV,     {GOLDEN_FRAME, NONE}},
+
+  {NEARMV,    {LAST_FRAME,   NONE}},
+  {NEARMV,    {ALTREF_FRAME, NONE}},
+  {NEARMV,    {GOLDEN_FRAME, NONE}},
+
+  {ZEROMV,    {LAST_FRAME,   NONE}},
+  {ZEROMV,    {GOLDEN_FRAME, NONE}},
+  {ZEROMV,    {ALTREF_FRAME, NONE}},
+
+  {NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
+  {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
+
+  {TM_PRED,   {INTRA_FRAME,  NONE}},
+
+  {NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
+  {NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
+  {NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
+  {NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},
+
+  {ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},
+  {ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
+
+  {H_PRED,    {INTRA_FRAME,  NONE}},
+  {V_PRED,    {INTRA_FRAME,  NONE}},
+  {D135_PRED, {INTRA_FRAME,  NONE}},
+  {D207_PRED, {INTRA_FRAME,  NONE}},
+  {D153_PRED, {INTRA_FRAME,  NONE}},
+  {D63_PRED,  {INTRA_FRAME,  NONE}},
+  {D117_PRED, {INTRA_FRAME,  NONE}},
+  {D45_PRED,  {INTRA_FRAME,  NONE}},
+};
+
+static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
+  {{LAST_FRAME,   NONE}},
+  {{GOLDEN_FRAME, NONE}},
+  {{ALTREF_FRAME, NONE}},
+  {{LAST_FRAME,   ALTREF_FRAME}},
+  {{GOLDEN_FRAME, ALTREF_FRAME}},
+  {{INTRA_FRAME,  NONE}},
+};
+
+static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+                           int m, int n, int min_plane, int max_plane) {
+  int plane;
+  // Exchange buffer slots m and n of ctx for planes [min_plane, max_plane).
+  for (plane = min_plane; plane < max_plane; ++plane) {
+    struct macroblock_plane *const mbp = &x->plane[plane];
+    struct macroblockd_plane *const mbdp = &x->e_mbd.plane[plane];
+    // The live plane pointers take slot m and double as the swap temporaries.
+    mbp->coeff    = ctx->coeff_pbuf[plane][m];
+    mbp->qcoeff   = ctx->qcoeff_pbuf[plane][m];
+    mbdp->dqcoeff = ctx->dqcoeff_pbuf[plane][m];
+    mbp->eobs     = ctx->eobs_pbuf[plane][m];
+    // Slot m receives what slot n held.
+    ctx->coeff_pbuf[plane][m]   = ctx->coeff_pbuf[plane][n];
+    ctx->qcoeff_pbuf[plane][m]  = ctx->qcoeff_pbuf[plane][n];
+    ctx->dqcoeff_pbuf[plane][m] = ctx->dqcoeff_pbuf[plane][n];
+    ctx->eobs_pbuf[plane][m]    = ctx->eobs_pbuf[plane][n];
+    // Slot n receives what slot m held before the exchange.
+    ctx->coeff_pbuf[plane][n]   = mbp->coeff;
+    ctx->qcoeff_pbuf[plane][n]  = mbp->qcoeff;
+    ctx->dqcoeff_pbuf[plane][n] = mbdp->dqcoeff;
+    ctx->eobs_pbuf[plane][n]    = mbp->eobs;
+  }
+}
+
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int *out_rate_sum, int64_t *out_dist_sum,
+                            int *skip_txfm_sb, int64_t *skip_sse_sb) {
+  // Model-based rate/distortion estimate summed over all planes of the block.
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  int i;
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
+  const int ref = xd->mi[0]->ref_frame[0];
+  unsigned int sse;
+  unsigned int var = 0;
+  unsigned int sum_sse = 0;
+  int64_t total_sse = 0;
+  int skip_flag = 1;
+  const int shift = 6;
+  int rate;
+  int64_t dist;
+  const int dequant_shift =
+#if CONFIG_VP9_HIGHBITDEPTH
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
+          xd->bd - 5 :
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+          3;
+
+  x->pred_sse[ref] = 0;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+    const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+    const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
+    const int64_t dc_thr = p->quant_thred[0] >> shift;
+    const int64_t ac_thr = p->quant_thred[1] >> shift;
+    // The low thresholds are used to measure if the prediction errors are
+    // low enough so that we can skip the mode search.
+    const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2);
+    const int64_t low_ac_thr = VPXMIN(80, ac_thr >> 2);
+    int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+    int bh = 1 << (b_height_log2_lookup[bs] - b_height_log2_lookup[unit_size]);
+    int idx, idy;
+    int lw = b_width_log2_lookup[unit_size] + 2;
+    int lh = b_height_log2_lookup[unit_size] + 2;
+
+    sum_sse = 0;
+
+    for (idy = 0; idy < bh; ++idy) {
+      for (idx = 0; idx < bw; ++idx) {
+        // Rows advance by the unit height (lh), columns by the unit width (lw).
+        uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
+        uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lw);
+        int block_idx = (idy << 1) + idx;
+        int low_err_skip = 0;
+
+        var = cpi->fn_ptr[unit_size].vf(src, p->src.stride,
+                                        dst, pd->dst.stride, &sse);
+        x->bsse[(i << 2) + block_idx] = sse;
+        sum_sse += sse;
+
+        x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_NONE;
+        if (!x->select_tx_size) {
+          // Check if all ac coefficients can be quantized to zero.
+          if (var < ac_thr || var == 0) {
+            x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_ONLY;
+
+            // Check if dc coefficient can be quantized to zero.
+            if (sse - var < dc_thr || sse == var) {
+              x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_DC;
+
+              if (!sse || (var < low_ac_thr && sse - var < low_dc_thr))
+                low_err_skip = 1;
+            }
+          }
+        }
+
+        // One unit without a low-error skip clears the block-wide skip flag.
+        if (skip_flag && !low_err_skip) skip_flag = 0;
+
+        if (i == 0) x->pred_sse[ref] += sse;
+      }
+    }
+
+    total_sse += sum_sse;
+
+    // Fast approximate the modelling function.
+    if (cpi->sf.simple_model_rd_from_var) {
+      int64_t simple_rate;
+      const int64_t square_error = sum_sse;
+      int quantizer = (pd->dequant[1] >> dequant_shift);
+
+      if (quantizer < 120)
+        simple_rate =
+            (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT);
+      else
+        simple_rate = 0;
+      dist = (square_error * quantizer) >> 8;
+      rate_sum += simple_rate;
+      dist_sum += dist;
+    } else {
+      vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
+                                   pd->dequant[1] >> dequant_shift,
+                                   &rate, &dist);
+      rate_sum += rate;
+      dist_sum += dist;
+    }
+  }
+  // The SSE and distortion outputs are scaled up by 16 (<< 4).
+  *skip_txfm_sb = skip_flag;
+  *skip_sse_sb = total_sse << 4;
+  *out_rate_sum = (int)rate_sum;
+  *out_dist_sum = dist_sum << 4;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
+                                 const tran_low_t *dqcoeff,
+                                 intptr_t block_size,
+                                 int64_t *ssz, int bd) {
+  // Both sums are scaled down by 2 * (bd - 8) bits with round-to-nearest.
+  const int shift = 2 * (bd - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+  int64_t err_energy = 0, src_energy = 0;
+  int k;
+  for (k = 0; k < block_size; k++) {
+    const int64_t c = coeff[k];
+    const int64_t delta = coeff[k] - dqcoeff[k];
+    err_energy += delta * delta;
+    src_energy += c * c;
+  }
+  assert(err_energy >= 0 && src_energy >= 0);
+
+  // *ssz receives the scaled sum of coeff^2; the scaled error is returned.
+  *ssz = (src_energy + rounding) >> shift;
+  return (err_energy + rounding) >> shift;
+}
+
+int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
+                                      const tran_low_t *dqcoeff,
+                                      intptr_t block_size,
+                                      int64_t *ssz) {
+  // Note that the C versions of these 2 functions (vp9_block_error and
+  // vp9_highbd_block_error_8bit) are the same, but the optimized assembly
+  // routines are not compatible in the non high bitdepth configuration, so
+  // they still cannot share the same name.
+  return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
+}
+
+static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
+                                               const tran_low_t *dqcoeff,
+                                               intptr_t block_size,
+                                               int64_t *ssz, int bd) {
+  // A bit depth of 8 goes to the 8-bit routine; every other depth goes to
+  // the generic high-bitdepth routine, which also takes bd.
+  if (bd != 8)
+    return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);
+  return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                          intptr_t block_size, int64_t *ssz) {
+  // Returns sum((coeff - dqcoeff)^2); *ssz receives sum(coeff^2).
+  int i;
+  int64_t error = 0, sqcoeff = 0;
+  for (i = 0; i < block_size; i++) {
+    // Square in 64-bit arithmetic: an int product can exceed INT_MAX.
+    const int64_t diff = coeff[i] - dqcoeff[i];
+    error += diff * diff;
+    sqcoeff += (int64_t)coeff[i] * coeff[i];
+  }
+  *ssz = sqcoeff;
+  return error;
+}
+
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+                             int block_size) {
+  // Returns sum((coeff - dqcoeff)^2) over the first block_size entries.
+  int i;
+  int64_t error = 0;
+  for (i = 0; i < block_size; i++) {
+    // The difference can reach 65535, so its square does not fit in an int.
+    const int64_t diff = coeff[i] - dqcoeff[i];
+    error += diff * diff;
+  }
+  return error;
+}
+
+/* The trailing '0' is a terminator which is used inside cost_coeffs() to
+ * decide whether to include cost of a trailing EOB node or not (i.e. we
+ * can skip this if the last coefficient in this transform block, e.g. the
+ * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
+ * were non-zero). */
+static const int16_t band_counts[TX_SIZES][8] = {
+  { 1, 2, 3, 4,  3,   16 - 13, 0 },
+  { 1, 2, 3, 4, 11,   64 - 21, 0 },
+  { 1, 2, 3, 4, 11,  256 - 21, 0 },
+  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
+};
+static int cost_coeffs(MACROBLOCK *x,
+                       int plane, int block,
+                       ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
+                       TX_SIZE tx_size,
+                       const int16_t *scan, const int16_t *nb,
+                       int use_fast_coef_costing) {  // returns the summed cost
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mi[0];
+  const struct macroblock_plane *p = &x->plane[plane];
+  const PLANE_TYPE type = get_plane_type(plane);
+  const int16_t *band_count = &band_counts[tx_size][1];  // entry 0 is skipped
+  const int eob = p->eobs[block];
+  const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+                   x->token_costs[tx_size][type][is_inter_block(mi)];
+  uint8_t token_cache[32 * 32];
+  int pt = combine_entropy_contexts(*A, *L);
+  int c, cost;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+  const int *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
+
+  // Check for consistency of tx_size with mode info
+  assert(type == PLANE_TYPE_Y ? mi->tx_size == tx_size :
+         get_uv_tx_size(mi, &xd->plane[plane]) == tx_size);
+
+  if (eob == 0) {
+    // single eob token
+    cost = token_costs[0][0][pt][EOB_TOKEN];
+    c = 0;
+  } else {
+    int band_left = *band_count++;
+
+    // dc token
+    int v = qcoeff[0];
+    int16_t prev_t;
+    EXTRABIT e;
+    vp9_get_token_extra(v, &prev_t, &e);
+    cost = (*token_costs)[0][pt][prev_t] +
+        vp9_get_cost(prev_t, e, cat6_high_cost);
+
+    token_cache[0] = vp9_pt_energy_class[prev_t];
+    ++token_costs;  // move on to the cost table of the next band
+
+    // ac tokens
+    for (c = 1; c < eob; c++) {
+      const int rc = scan[c];
+      int16_t t;
+
+      v = qcoeff[rc];
+      vp9_get_token_extra(v, &t, &e);
+      if (use_fast_coef_costing) {
+        // Fast path: the context is only whether the previous token was zero.
+        cost += (*token_costs)[!prev_t][!prev_t][t] +
+            vp9_get_cost(t, e, cat6_high_cost);
+      } else {
+        pt = get_coef_context(nb, token_cache, c);
+        cost += (*token_costs)[!prev_t][pt][t] +
+            vp9_get_cost(t, e, cat6_high_cost);
+        token_cache[rc] = vp9_pt_energy_class[t];
+      }
+      prev_t = t;
+      if (!--band_left) {
+        // Current band exhausted: load the next band's count and cost table.
+        band_left = *band_count++;
+        ++token_costs;
+      }
+    }
+
+    // eob token
+    if (band_left) {  // zero once the band table's 0 terminator was loaded
+      if (use_fast_coef_costing) {
+        cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
+      } else {
+        pt = get_coef_context(nb, token_cache, c);
+        cost += (*token_costs)[0][pt][EOB_TOKEN];
+      }
+    }
+  }
+
+  // Both contexts record whether any coefficient was coded (c > 0).
+  *A = *L = (c > 0);
+
+  return cost;
+}
+
+static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
+                       int64_t *out_dist, int64_t *out_sse) {
+  // Writes the block-error result and its second output, both >> shift.
+  const int ss_txfrm_size = tx_size << 1;
+  MACROBLOCKD* const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  int64_t this_sse;
+  int shift = tx_size == TX_32X32 ? 0 : 2;
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+  *out_dist = vp9_highbd_block_error_dispatch(coeff, dqcoeff,
+                                              16 << ss_txfrm_size,
+                                              &this_sse, bd) >> shift;
+#else
+  *out_dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                              &this_sse) >> shift;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  *out_sse = this_sse >> shift;
+  if (x->skip_encode && !is_inter_block(xd->mi[0])) {
+    // TODO(jingning): tune the model to better capture the distortion.
+    const int64_t q = pd->dequant[1];  // 64-bit so q * q * size cannot wrap
+    const int64_t penalty = (q * q * (1 << ss_txfrm_size)) >>
+#if CONFIG_VP9_HIGHBITDEPTH
+                                (shift + 2 + (bd - 8) * 2);
+#else
+                                (shift + 2);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    *out_dist += (penalty >> 4);
+    *out_sse  += penalty;
+  }
+}
+
+static int rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
+                      TX_SIZE tx_size, struct rdcost_block_args* args) {
+  // The block's column/row select its above/left entropy context entries.
+  int col, row;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &col, &row);
+  return cost_coeffs(args->x, plane, block,
+                     &args->t_above[col], &args->t_left[row], tx_size,
+                     args->so->scan, args->so->neighbors,
+                     args->use_fast_coef_costing);
+}
+
+static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
+                          TX_SIZE tx_size, void *arg) {
+  struct rdcost_block_args *args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  int64_t rd1, rd2, rd;
+  int rate;
+  int64_t dist;
+  int64_t sse;
+  // Adds this block's rate/distortion/SSE to args unless already abandoned.
+  if (args->exit_early)
+    return;
+
+  if (!is_inter_block(mi)) {
+    struct encode_b_args arg = {x, NULL, &mi->skip};  // shadows parameter arg
+    vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
+    dist_block(x, plane, block, tx_size, &dist, &sse);
+  } else if (max_txsize_lookup[plane_bsize] == tx_size) {
+    if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
+        SKIP_TXFM_NONE) {
+      // full forward transform and quantization
+      vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+      dist_block(x, plane, block, tx_size, &dist, &sse);
+    } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
+               SKIP_TXFM_AC_ONLY) {
+      // compute DC coefficient
+      tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
+      tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
+      vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+      sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
+      dist = sse;
+      if (x->plane[plane].eobs[block]) {
+        // Reduce the distortion by what the coded DC coefficient removed.
+        const int64_t orig_sse = (int64_t)coeff[0] * coeff[0];
+        const int64_t resd_sse = coeff[0] - dqcoeff[0];
+        int64_t dc_correct = orig_sse - resd_sse * resd_sse;
+#if CONFIG_VP9_HIGHBITDEPTH
+        dc_correct >>= ((xd->bd - 8) * 2);
+#endif
+        if (tx_size != TX_32X32)
+          dc_correct >>= 2;
+
+        dist = VPXMAX(0, sse - dc_correct);
+      }
+    } else {
+      // SKIP_TXFM_AC_DC
+      // skip forward transform
+      x->plane[plane].eobs[block] = 0;
+      sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
+      dist = sse;
+    }
+  } else {
+    // full forward transform and quantization
+    vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+    dist_block(x, plane, block, tx_size, &dist, &sse);
+  }
+  // Zero-rate cost first: if distortion alone already exceeds best_rd, stop.
+  rd = RDCOST(x->rdmult, x->rddiv, 0, dist);
+  if (args->this_rd + rd > args->best_rd) {
+    args->exit_early = 1;
+    return;
+  }
+
+  rate = rate_block(plane, block, plane_bsize, tx_size, args);
+  rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist);  // coefficients coded
+  rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);  // zero rate, sse as distortion
+
+  // TODO(jingning): temporarily enabled only for luma component
+  rd = VPXMIN(rd1, rd2);
+  if (plane == 0)
+    x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
+                                    (rd1 > rd2 && !xd->lossless);
+
+  args->this_rate += rate;
+  args->this_dist += dist;
+  args->this_sse += sse;
+  args->this_rd += rd;
+
+  if (args->this_rd > args->best_rd) {
+    args->exit_early = 1;
+    return;
+  }
+
+  args->skippable &= !x->plane[plane].eobs[block];
+}
+
+static void txfm_rd_in_plane(MACROBLOCK *x,
+                             int *rate, int64_t *distortion,
+                             int *skippable, int64_t *sse,
+                             int64_t ref_best_rd, int plane,
+                             BLOCK_SIZE bsize, TX_SIZE tx_size,
+                             int use_fast_coef_casting) {
+  // Runs block_rd_txfm over every transform block of one plane.
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct rdcost_block_args args;
+
+  vp9_zero(args);
+  args.x = x;
+  args.skippable = 1;
+  args.best_rd = ref_best_rd;
+  args.use_fast_coef_costing = use_fast_coef_casting;
+  if (plane == 0) xd->mi[0]->tx_size = tx_size;
+
+  vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
+  args.so = get_scan(xd, tx_size, get_plane_type(plane), 0);
+  vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
+                                         block_rd_txfm, &args);
+
+  // An abandoned search reports maximum rate/distortion/SSE, not skippable.
+  if (!args.exit_early) {
+    *rate       = args.this_rate;
+    *distortion = args.this_dist;
+    *sse        = args.this_sse;
+    *skippable  = args.skippable;
+  } else {
+    *rate       = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse        = INT64_MAX;
+    *skippable  = 0;
+  }
+}
+
+static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,
+                                   int *rate, int64_t *distortion,
+                                   int *skip, int64_t *sse,
+                                   int64_t ref_best_rd,
+                                   BLOCK_SIZE bs) {
+  // Take the largest transform that both the block size and the frame
+  // tx_mode permit, then evaluate plane 0 with it.
+  MODE_INFO *const mi = x->e_mbd.mi[0];
+  const TX_SIZE bsize_limit = max_txsize_lookup[bs];
+  const TX_SIZE mode_limit = tx_mode_to_biggest_tx_size[cpi->common.tx_mode];
+
+  mi->tx_size = VPXMIN(bsize_limit, mode_limit);
+
+  txfm_rd_in_plane(x, rate, distortion, skip, sse, ref_best_rd,
+                   /* plane */ 0, bs, mi->tx_size,
+                   cpi->sf.use_fast_coef_costing);
+}
+
+static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
+                                   int *rate,
+                                   int64_t *distortion,
+                                   int *skip,
+                                   int64_t *psse,
+                                   int64_t ref_best_rd,
+                                   BLOCK_SIZE bs) {
+  // Evaluates candidate transform sizes on plane 0 and keeps the cheapest.
+  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
+  int r[TX_SIZES][2], s[TX_SIZES];
+  int64_t d[TX_SIZES], sse[TX_SIZES];
+  int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
+                             {INT64_MAX, INT64_MAX},
+                             {INT64_MAX, INT64_MAX},
+                             {INT64_MAX, INT64_MAX}};
+  int n, m;
+  int s0, s1;
+  int64_t best_rd = INT64_MAX;
+  TX_SIZE best_tx = max_tx_size;
+  int start_tx, end_tx;
+  const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
+  assert(skip_prob > 0);
+  s0 = vp9_cost_bit(skip_prob, 0);
+  s1 = vp9_cost_bit(skip_prob, 1);
+
+  if (cm->tx_mode == TX_MODE_SELECT) {
+    start_tx = max_tx_size;
+    end_tx = 0;
+  } else {
+    TX_SIZE chosen_tx_size = VPXMIN(max_tx_size,
+                                    tx_mode_to_biggest_tx_size[cm->tx_mode]);
+    // Only this size is evaluated, so it must also be the fallback result;
+    // otherwise the never-written entries at max_tx_size could be read.
+    start_tx = end_tx = best_tx = chosen_tx_size;
+  }
+
+  for (n = start_tx; n >= end_tx; n--) {
+    int r_tx_size = 0;
+    for (m = 0; m <= n - (n == (int) max_tx_size); m++) {
+      if (m == n)
+        r_tx_size += vp9_cost_zero(tx_probs[m]);
+      else
+        r_tx_size += vp9_cost_one(tx_probs[m]);
+    }
+    txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n],
+                     &sse[n], ref_best_rd, 0, bs, n,
+                     cpi->sf.use_fast_coef_costing);
+    // r[n][0] is the rate without, r[n][1] with, the tx-size signalling cost.
+    r[n][1] = r[n][0];
+    if (r[n][0] < INT_MAX) {
+      r[n][1] += r_tx_size;
+    }
+    if (d[n] == INT64_MAX || r[n][0] == INT_MAX) {
+      rd[n][0] = rd[n][1] = INT64_MAX;
+    } else if (s[n]) {
+      if (is_inter_block(mi)) {
+        rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
+        r[n][1] -= r_tx_size;
+      } else {
+        rd[n][0] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
+        rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size, sse[n]);
+      }
+    } else {
+      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
+      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
+    }
+
+    if (is_inter_block(mi) && !xd->lossless && !s[n] && sse[n] != INT64_MAX) {
+      rd[n][0] = VPXMIN(rd[n][0], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
+      rd[n][1] = VPXMIN(rd[n][1], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
+    }
+
+    // Early termination in transform size search.
+    if (cpi->sf.tx_size_search_breakout &&
+        (rd[n][1] == INT64_MAX ||
+        (n < (int) max_tx_size && rd[n][1] > rd[n + 1][1]) ||
+        s[n] == 1))
+      break;
+
+    if (rd[n][1] < best_rd) {
+      best_tx = n;
+      best_rd = rd[n][1];
+    }
+  }
+  mi->tx_size = best_tx;
+
+  *distortion = d[mi->tx_size];
+  *rate       = r[mi->tx_size][cm->tx_mode == TX_MODE_SELECT];
+  *skip       = s[mi->tx_size];
+  *psse       = sse[mi->tx_size];
+}
+
+static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                            int64_t *distortion, int *skip,
+                            int64_t *psse, BLOCK_SIZE bs,
+                            int64_t ref_best_rd) {
+  // psse may be NULL; the SSE then lands in a local scratch variable.
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  int64_t scratch_sse;
+  int64_t *const sse_out = psse ? psse : &scratch_sse;
+  const int rd_search =
+      !(cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless);
+  assert(bs == xd->mi[0]->sb_type);
+  if (rd_search)
+    choose_tx_size_from_rd(cpi, x, rate, distortion, skip, sse_out,
+                           ref_best_rd, bs);
+  else
+    choose_largest_tx_size(cpi, x, rate, distortion, skip, sse_out,
+                           ref_best_rd, bs);
+}
+
+static int conditional_skipintra(PREDICTION_MODE mode,
+                                 PREDICTION_MODE best_intra_mode) {
+  // Returns 1 when `mode` is one of the four modes handled below and the
+  // best intra mode so far is neither of the two modes paired with it;
+  // returns 0 otherwise, including for every mode not listed.
+  const PREDICTION_MODE best = best_intra_mode;
+  // Each case names the two best-so-far modes that keep `mode` in the search.
+  switch (mode) {
+    case D117_PRED:
+      return best != V_PRED && best != D135_PRED;
+    case D63_PRED:
+      return best != V_PRED && best != D45_PRED;
+    case D207_PRED:
+      return best != H_PRED && best != D45_PRED;
+    case D153_PRED:
+      return best != H_PRED && best != D135_PRED;
+    default:
+      return 0;
+  }
+}
+
+static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x,
+                                     int row, int col,
+                                     PREDICTION_MODE *best_mode,
+                                     const int *bmode_costs,
+                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                                     int *bestrate, int *bestratey,
+                                     int64_t *bestdistortion,
+                                     BLOCK_SIZE bsize, int64_t rd_thresh) {
+  PREDICTION_MODE mode;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int64_t best_rd = rd_thresh;
+  struct macroblock_plane *p = &x->plane[0];
+  struct macroblockd_plane *pd = &xd->plane[0];
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4];
+  uint8_t *dst_init = &pd->dst.buf[row * 4 * src_stride + col * 4];
+  ENTROPY_CONTEXT ta[2], tempa[2];
+  ENTROPY_CONTEXT tl[2], templ[2];
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  int idx, idy;
+  uint8_t best_dst[8 * 8];
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint16_t best_dst16[8 * 8];
+#endif
+  memcpy(ta, a, num_4x4_blocks_wide * sizeof(a[0]));
+  memcpy(tl, l, num_4x4_blocks_high * sizeof(l[0]));
+
+  xd->mi[0]->tx_size = TX_4X4;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+      int64_t this_rd;
+      int ratey = 0;
+      int64_t distortion = 0;
+      int rate = bmode_costs[mode];
+
+      if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
+        continue;
+
+      // Only do the oblique modes if the best so far is
+      // one of the neighboring directional modes
+      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+        if (conditional_skipintra(mode, *best_mode))
+            continue;
+      }
+
+      memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0]));
+      memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0]));
+
+      for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+        for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
+          const int block = (row + idy) * 2 + (col + idx);
+          const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+          uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+          int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8,
+                                                                  block,
+                                                                  p->src_diff);
+          tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+          xd->mi[0]->bmi[block].as_mode = mode;
+          vp9_predict_intra_block(xd, 1, TX_4X4, mode,
+                                  x->skip_encode ? src : dst,
+                                  x->skip_encode ? src_stride : dst_stride,
+                                  dst, dst_stride,
+                                  col + idx, row + idy, 0);
+          vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
+                                    dst, dst_stride, xd->bd);
+          if (xd->lossless) {
+            const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+            vp9_highbd_fwht4x4(src_diff, coeff, 8);
+            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                                 so->scan, so->neighbors,
+                                 cpi->sf.use_fast_coef_costing);
+            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+              goto next_highbd;
+            vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
+                                   dst, dst_stride,
+                                   p->eobs[block], xd->bd);
+          } else {
+            int64_t unused;
+            const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
+            const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
+            if (tx_type == DCT_DCT)
+              vpx_highbd_fdct4x4(src_diff, coeff, 8);
+            else
+              vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
+            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                                 so->scan, so->neighbors,
+                                 cpi->sf.use_fast_coef_costing);
+            distortion += vp9_highbd_block_error_dispatch(
+                coeff, BLOCK_OFFSET(pd->dqcoeff, block),
+                16, &unused, xd->bd) >> 2;
+            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+              goto next_highbd;
+            vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
+                                  dst, dst_stride, p->eobs[block], xd->bd);
+          }
+        }
+      }
+
+      rate += ratey;
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+      if (this_rd < best_rd) {
+        *bestrate = rate;
+        *bestratey = ratey;
+        *bestdistortion = distortion;
+        best_rd = this_rd;
+        *best_mode = mode;
+        memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
+        memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
+        for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+          memcpy(best_dst16 + idy * 8,
+                 CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+                 num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+        }
+      }
+    next_highbd:
+      {}
+    }
+    if (best_rd >= rd_thresh || x->skip_encode)
+      return best_rd;
+
+    for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+      memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+             best_dst16 + idy * 8,
+             num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+    }
+
+    return best_rd;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+    int64_t this_rd;
+    int ratey = 0;
+    int64_t distortion = 0;
+    int rate = bmode_costs[mode];
+
+    if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
+      continue;
+
+    // Only do the oblique modes if the best so far is
+    // one of the neighboring directional modes
+    if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+      if (conditional_skipintra(mode, *best_mode))
+          continue;
+    }
+
+    memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0]));
+    memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0]));
+
+    for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+      for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
+        const int block = (row + idy) * 2 + (col + idx);
+        const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+        uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+        int16_t *const src_diff =
+            vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
+        tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+        xd->mi[0]->bmi[block].as_mode = mode;
+        vp9_predict_intra_block(xd, 1, TX_4X4, mode,
+                                x->skip_encode ? src : dst,
+                                x->skip_encode ? src_stride : dst_stride,
+                                dst, dst_stride, col + idx, row + idy, 0);
+        vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
+
+        if (xd->lossless) {
+          const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+          vp9_fwht4x4(src_diff, coeff, 8);
+          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                               so->scan, so->neighbors,
+                               cpi->sf.use_fast_coef_costing);
+          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+            goto next;
+          vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride,
+                          p->eobs[block]);
+        } else {
+          int64_t unused;
+          const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
+          const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
+          vp9_fht4x4(src_diff, coeff, 8, tx_type);
+          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                             so->scan, so->neighbors,
+                             cpi->sf.use_fast_coef_costing);
+#if CONFIG_VP9_HIGHBITDEPTH
+          distortion += vp9_highbd_block_error_8bit(
+              coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >> 2;
+#else
+          distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
+                                        16, &unused) >> 2;
+#endif
+          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+            goto next;
+          vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
+                         dst, dst_stride, p->eobs[block]);
+        }
+      }
+    }
+
+    rate += ratey;
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+    if (this_rd < best_rd) {
+      *bestrate = rate;
+      *bestratey = ratey;
+      *bestdistortion = distortion;
+      best_rd = this_rd;
+      *best_mode = mode;
+      memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
+      memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
+      for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
+        memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
+               num_4x4_blocks_wide * 4);
+    }
+  next:
+    {}
+  }
+
+  if (best_rd >= rd_thresh || x->skip_encode)
+    return best_rd;
+
+  for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
+    memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
+           num_4x4_blocks_wide * 4);
+
+  return best_rd;
+}
+
+// Picks an intra luma mode for every sub-block (4x4, 4x8 or 8x4) of an 8x8
+// block and accumulates the resulting rate and distortion.
+//
+// When the search completes, *rate, *rate_y and *distortion receive the
+// summed values, the chosen modes are stored in mic->bmi[] (and mic->mode is
+// set from bmi[3]), and the RD cost of the totals is returned. INT64_MAX is
+// returned, without writing the rate/distortion outputs, as soon as the
+// accumulated cost can no longer beat best_rd.
+static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
+                                            int *rate, int *rate_y,
+                                            int64_t *distortion,
+                                            int64_t best_rd) {
+  const MACROBLOCKD *const xd = &mb->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  const MODE_INFO *const above = xd->above_mi;
+  const MODE_INFO *const left = xd->left_mi;
+  const BLOCK_SIZE bsize = mic->sb_type;
+  const int step_w = num_4x4_blocks_wide_lookup[bsize];
+  const int step_h = num_4x4_blocks_high_lookup[bsize];
+  const int *mode_costs = cpi->mbmode_cost;
+  int64_t rd_sum = 0;
+  int64_t dist_sum = 0;
+  int rate_sum = 0;
+  int rate_y_sum = 0;
+  int blk_row, blk_col;
+
+  for (blk_row = 0; blk_row < 2; blk_row += step_h) {
+    for (blk_col = 0; blk_col < 2; blk_col += step_w) {
+      const int blk = blk_row * 2 + blk_col;
+      // Cost still available to this sub-block before best_rd is reached.
+      const int64_t rd_budget = best_rd - rd_sum;
+      PREDICTION_MODE picked = DC_PRED;
+      int blk_rate = INT_MAX, blk_rate_y = INT_MAX;
+      int64_t blk_dist = INT64_MAX;
+      int64_t blk_rd;
+      int n;
+
+      // Key frames cost the mode conditioned on the above/left block modes.
+      if (cpi->common.frame_type == KEY_FRAME) {
+        const PREDICTION_MODE A = vp9_above_block_mode(mic, above, blk);
+        const PREDICTION_MODE L = vp9_left_block_mode(mic, left, blk);
+
+        mode_costs = cpi->y_mode_costs[A][L];
+      }
+
+      blk_rd = rd_pick_intra4x4block(cpi, mb, blk_row, blk_col, &picked,
+                                     mode_costs,
+                                     xd->plane[0].above_context + blk_col,
+                                     xd->plane[0].left_context + blk_row,
+                                     &blk_rate, &blk_rate_y, &blk_dist, bsize,
+                                     rd_budget);
+
+      if (blk_rd >= rd_budget)
+        return INT64_MAX;
+
+      rd_sum += blk_rd;
+      rate_sum += blk_rate;
+      dist_sum += blk_dist;
+      rate_y_sum += blk_rate_y;
+
+      // Replicate the mode over every 4x4 unit the sub-block covers.
+      mic->bmi[blk].as_mode = picked;
+      for (n = 1; n < step_h; ++n)
+        mic->bmi[blk + n * 2].as_mode = picked;
+      for (n = 1; n < step_w; ++n)
+        mic->bmi[blk + n].as_mode = picked;
+
+      if (rd_sum >= best_rd)
+        return INT64_MAX;
+    }
+  }
+
+  *rate = rate_sum;
+  *rate_y = rate_y_sum;
+  *distortion = dist_sum;
+  mic->mode = mic->bmi[3].as_mode;
+
+  return RDCOST(mb->rdmult, mb->rddiv, rate_sum, dist_sum);
+}
+
+// Full-block luma intra mode search. This function is used only for
+// intra_only frames.
+//
+// Every mode from DC_PRED to TM_PRED is evaluated through super_block_yrd()
+// and the one with the lowest RD cost below best_rd is kept. *rate,
+// *rate_tokenonly, *distortion and *skippable are written only when some mode
+// beats best_rd; mic->mode and mic->tx_size are always set on exit (DC_PRED
+// and TX_4X4 if no mode won). Returns the lowest RD cost found, or the
+// incoming best_rd when nothing improved on it.
+static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                      int *rate, int *rate_tokenonly,
+                                      int64_t *distortion, int *skippable,
+                                      BLOCK_SIZE bsize,
+                                      int64_t best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  const PREDICTION_MODE A = vp9_above_block_mode(mic, xd->above_mi, 0);
+  const PREDICTION_MODE L = vp9_left_block_mode(mic, xd->left_mi, 0);
+  const int *const mode_costs = cpi->y_mode_costs[A][L];
+  PREDICTION_MODE winner = DC_PRED;
+  TX_SIZE winner_tx = TX_4X4;
+  PREDICTION_MODE mode;
+  int tok_rate, full_rate, skip;
+  int64_t dist, rd;
+
+  memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+
+  /* Y Search for intra prediction mode */
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    if (cpi->sf.use_nonrd_pick_mode) {
+      // These speed features are turned on in hybrid non-RD and RD mode
+      // for key frame coding in the context of real-time setting.
+      if (conditional_skipintra(mode, winner))
+        continue;
+      if (*skippable)
+        break;
+    }
+
+    mic->mode = mode;
+    super_block_yrd(cpi, x, &tok_rate, &dist, &skip, NULL, bsize, best_rd);
+
+    // INT_MAX in the rate marks a mode with no usable cost.
+    if (tok_rate == INT_MAX)
+      continue;
+
+    full_rate = tok_rate + mode_costs[mode];
+    rd = RDCOST(x->rdmult, x->rddiv, full_rate, dist);
+    if (rd >= best_rd)
+      continue;
+
+    // New best: remember the mode and publish its statistics.
+    winner = mode;
+    winner_tx = mic->tx_size;
+    best_rd = rd;
+    *rate = full_rate;
+    *rate_tokenonly = tok_rate;
+    *distortion = dist;
+    *skippable = skip;
+  }
+
+  mic->mode = winner;
+  mic->tx_size = winner_tx;
+
+  return best_rd;
+}
+
+// Accumulates rate, distortion, sse and the skip flag over the chroma planes
+// (plane indices 1 .. MAX_MB_PLANE - 1) using txfm_rd_in_plane() at the
+// chroma transform size derived from the current mode info.
+//
+// For inter blocks with a non-negative ref_best_rd, the chroma residual is
+// computed first via vp9_subtract_plane(). If ref_best_rd is negative, or any
+// plane reports a rate of INT_MAX, the outputs are reset to
+// INT_MAX / INT64_MAX / INT64_MAX / 0.
+//
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
+                            int *rate, int64_t *distortion, int *skippable,
+                            int64_t *sse, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  const TX_SIZE uv_tx_size = get_uv_tx_size(mi, &xd->plane[1]);
+  int plane;
+  int pnrate = 0, pnskip = 1;
+  int64_t pndist = 0, pnsse = 0;
+  int is_cost_valid = 1;
+
+  if (ref_best_rd < 0)
+    is_cost_valid = 0;
+
+  // The function-scope loop index is reused here; the original redeclared
+  // `plane` inside this block, shadowing the outer variable.
+  if (is_inter_block(mi) && is_cost_valid) {
+    for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+      vp9_subtract_plane(x, bsize, plane);
+  }
+
+  *rate = 0;
+  *distortion = 0;
+  *sse = 0;
+  *skippable = 1;
+
+  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+    txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
+                     ref_best_rd, plane, bsize, uv_tx_size,
+                     cpi->sf.use_fast_coef_costing);
+    // A rate of INT_MAX marks the plane's cost as unusable; stop summing.
+    if (pnrate == INT_MAX) {
+      is_cost_valid = 0;
+      break;
+    }
+    *rate += pnrate;
+    *distortion += pndist;
+    *sse += pnsse;
+    *skippable &= pnskip;
+  }
+
+  if (!is_cost_valid) {
+    // reset cost value
+    *rate = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse = INT64_MAX;
+    *skippable = 0;
+  }
+
+  return is_cost_valid;
+}
+
+// Chroma intra mode search. Each mode from DC_PRED to TM_PRED that is enabled
+// in cpi->sf.intra_uv_mode_mask[max_tx_size] is costed with
+// super_block_uvrd(); the mode with the lowest RD cost wins.
+//
+// *rate, *rate_tokenonly, *distortion and *skippable are written each time a
+// new best mode is found. xd->mi[0]->uv_mode is left at the winning mode
+// (DC_PRED if none was accepted). Returns the best RD cost, or INT64_MAX if
+// no mode produced a valid cost.
+static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                       PICK_MODE_CONTEXT *ctx,
+                                       int *rate, int *rate_tokenonly,
+                                       int64_t *distortion, int *skippable,
+                                       BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  PREDICTION_MODE winner = DC_PRED;
+  PREDICTION_MODE mode;
+  int64_t best_rd = INT64_MAX;
+  int64_t rd, dist, sse_unused;
+  int tok_rate, full_rate, skip;
+
+  memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+    const int enabled =
+        cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode);
+
+    if (!enabled)
+      continue;
+
+    xd->mi[0]->uv_mode = mode;
+
+    if (super_block_uvrd(cpi, x, &tok_rate, &dist, &skip, &sse_unused,
+                         bsize, best_rd)) {
+      full_rate = tok_rate +
+          cpi->intra_uv_mode_cost[cpi->common.frame_type]
+                                  [xd->mi[0]->mode][mode];
+      rd = RDCOST(x->rdmult, x->rddiv, full_rate, dist);
+
+      if (rd < best_rd) {
+        winner = mode;
+        best_rd = rd;
+        *rate = full_rate;
+        *rate_tokenonly = tok_rate;
+        *distortion = dist;
+        *skippable = skip;
+        if (!x->select_tx_size)
+          swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
+      }
+    }
+  }
+
+  xd->mi[0]->uv_mode = winner;
+  return best_rd;
+}
+
+// Costs the chroma planes with uv_mode forced to DC_PRED.
+//
+// *rate_tokenonly, *distortion and *skippable come from super_block_uvrd()
+// (called with an INT64_MAX reference cost); *rate adds the DC_PRED entry of
+// cpi->intra_uv_mode_cost for the current frame type and luma mode. Returns
+// the RD cost of *rate and *distortion.
+static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
+                              int *rate, int *rate_tokenonly,
+                              int64_t *distortion, int *skippable,
+                              BLOCK_SIZE bsize) {
+  int64_t sse_unused;
+  int dc_mode_cost;
+
+  x->e_mbd.mi[0]->uv_mode = DC_PRED;
+  memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+  super_block_uvrd(cpi, x, rate_tokenonly, distortion,
+                   skippable, &sse_unused, bsize, INT64_MAX);
+
+  dc_mode_cost = cpi->intra_uv_mode_cost[cpi->common.frame_type]
+                                        [x->e_mbd.mi[0]->mode][DC_PRED];
+  *rate = *rate_tokenonly + dc_mode_cost;
+
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+// Fills in the chroma intra statistics (*rate_uv, *rate_uv_tokenonly,
+// *dist_uv, *skip_uv) and reports the resulting chroma mode in *mode_uv.
+// Block sizes below BLOCK_8X8 are evaluated as BLOCK_8X8.
+static void choose_intra_uv_mode(VP9_COMP *cpi, MACROBLOCK *const x,
+                                 PICK_MODE_CONTEXT *ctx,
+                                 BLOCK_SIZE bsize, TX_SIZE max_tx_size,
+                                 int *rate_uv, int *rate_uv_tokenonly,
+                                 int64_t *dist_uv, int *skip_uv,
+                                 PREDICTION_MODE *mode_uv) {
+  const BLOCK_SIZE uv_bsize = bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize;
+
+  if (!cpi->sf.use_uv_intra_rd_estimate) {
+    // Do a proper rd search for each possible transform size that may
+    // be considered in the main rd loop.
+    rd_pick_intra_sbuv_mode(cpi, x, ctx,
+                            rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
+                            uv_bsize, max_tx_size);
+  } else {
+    // The speed flag is set: use an estimated rd for uv_intra based on
+    // DC_PRED.
+    rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
+                   skip_uv, uv_bsize);
+  }
+
+  *mode_uv = x->e_mbd.mi[0]->uv_mode;
+}
+
+// Looks up the cost of signalling inter mode `mode` under `mode_context` in
+// cpi->inter_mode_cost. `mode` must satisfy is_inter_mode() (asserted).
+static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
+                       int mode_context) {
+  const int *const context_costs = cpi->inter_mode_cost[mode_context];
+
+  assert(is_inter_mode(mode));
+  return context_costs[INTER_OFFSET(mode)];
+}
+
+// Chooses the motion vector(s) for sub-block i according to `mode`, stores
+// them and the mode in mi->bmi[], and returns the signalling cost.
+//
+//   NEWMV              this_mv comes from seg_mvs[]; the vector bit cost
+//                      against best_ref_mv is added for each reference.
+//   NEARMV / NEARESTMV this_mv comes from frame_mv[mode][].
+//   ZEROMV             this_mv is zeroed.
+//   anything else      this_mv is left as passed in.
+//
+// The bmi entry for i is then copied over every 4x4 unit covered by the
+// block size in mi->sb_type. The return value is cost_mv_ref() for the mode
+// plus any NEWMV vector cost.
+static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                                int i,
+                                PREDICTION_MODE mode, int_mv this_mv[2],
+                                int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+                                int_mv seg_mvs[MAX_REF_FRAMES],
+                                int_mv *best_ref_mv[2], const int *mvjcost,
+                                int *mvcost[2]) {
+  MODE_INFO *const mi = xd->mi[0];
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const int blocks_wide = num_4x4_blocks_wide_lookup[mi->sb_type];
+  const int blocks_high = num_4x4_blocks_high_lookup[mi->sb_type];
+  const int is_compound = has_second_ref(mi);
+  int mv_bits = 0;
+  int row, col;
+
+  if (mode == NEWMV) {
+    this_mv[0].as_int = seg_mvs[mi->ref_frame[0]].as_int;
+    mv_bits += vp9_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
+                               mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+    if (is_compound) {
+      this_mv[1].as_int = seg_mvs[mi->ref_frame[1]].as_int;
+      mv_bits += vp9_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
+                                 mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+    }
+  } else if (mode == NEARMV || mode == NEARESTMV) {
+    this_mv[0].as_int = frame_mv[mode][mi->ref_frame[0]].as_int;
+    if (is_compound)
+      this_mv[1].as_int = frame_mv[mode][mi->ref_frame[1]].as_int;
+  } else if (mode == ZEROMV) {
+    this_mv[0].as_int = 0;
+    if (is_compound)
+      this_mv[1].as_int = 0;
+  }
+
+  mi->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
+  if (is_compound)
+    mi->bmi[i].as_mv[1].as_int = this_mv[1].as_int;
+
+  mi->bmi[i].as_mode = mode;
+
+  // memmove() because the first copy has identical source and destination.
+  for (row = 0; row < blocks_high; ++row) {
+    for (col = 0; col < blocks_wide; ++col) {
+      memmove(&mi->bmi[i + row * 2 + col], &mi->bmi[i], sizeof(mi->bmi[i]));
+    }
+  }
+
+  return cost_mv_ref(cpi, mode, mbmi_ext->mode_context[mi->ref_frame[0]]) +
+            mv_bits;
+}
+/*
+ * Builds the luma inter prediction for the sub-8x8 segment that starts at
+ * 4x4 block index i, computes the residual, then runs the forward 4x4
+ * transform, quantizer and coefficient costing on each 4x4 block in it.
+ *
+ * Outputs, written only when the loop runs to completion:
+ *   *labelyrate  summed cost_coeffs() rate,
+ *   *distortion  summed block error, right-shifted by 2,
+ *   *sse         summed ssz from the block-error call, right-shifted by 2.
+ * ta / tl are handed to cost_coeffs() offset by (k & 1) / (k >> 1).
+ *
+ * Returns RDCOST() of *labelyrate and *distortion, or INT64_MAX as soon as
+ * min(rd1, rd2) for the blocks processed so far reaches best_yrd.
+ */
+static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
+                                       MACROBLOCK *x,
+                                       int64_t best_yrd,
+                                       int i,
+                                       int *labelyrate,
+                                       int64_t *distortion, int64_t *sse,
+                                       ENTROPY_CONTEXT *ta,
+                                       ENTROPY_CONTEXT *tl,
+                                       int mi_row, int mi_col) {
+  int k;
+  MACROBLOCKD *xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  struct macroblock_plane *const p = &x->plane[0];
+  MODE_INFO *const mi = xd->mi[0];
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->sb_type, pd);
+  const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+  int idx, idy;
+  // src / dst are offset by vp9_raster_block_offset() for block i in the 8x8.
+  const uint8_t *const src =
+      &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+  uint8_t *const dst = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i,
+                                                            pd->dst.stride)];
+  int64_t thisdistortion = 0, thissse = 0;
+  int thisrate = 0, ref;
+  const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+  const int is_compound = has_second_ref(mi);
+  const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
+  // One predictor call per reference in use (two when compound), into dst.
+  for (ref = 0; ref < 1 + is_compound; ++ref) {
+    const int bw = b_width_log2_lookup[BLOCK_8X8];
+    const int h = 4 * (i >> bw);  // row offset of block i, in pixels
+    const int w = 4 * (i & ((1 << bw) - 1));  // column offset, in pixels
+    const struct scale_factors *sf = &xd->block_refs[ref]->sf;
+    int y_stride = pd->pre[ref].stride;
+    uint8_t *pre = pd->pre[ref].buf + (h * pd->pre[ref].stride + w);
+    // For a scaled reference, address the reference frame's y_buffer instead.
+    if (vp9_is_scaled(sf)) {
+      const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+      const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+
+      y_stride = xd->block_refs[ref]->buf->y_stride;
+      pre = xd->block_refs[ref]->buf->y_buffer;
+      pre += scaled_buffer_offset(x_start + w, y_start + h,
+                                  y_stride, sf);
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_highbd_build_inter_predictor(pre, y_stride,
+                                     dst, pd->dst.stride,
+                                     &mi->bmi[i].as_mv[ref].as_mv,
+                                     &xd->block_refs[ref]->sf, width, height,
+                                     ref, kernel, MV_PRECISION_Q3,
+                                     mi_col * MI_SIZE + 4 * (i % 2),
+                                     mi_row * MI_SIZE + 4 * (i / 2), xd->bd);
+  } else {
+    vp9_build_inter_predictor(pre, y_stride,
+                              dst, pd->dst.stride,
+                              &mi->bmi[i].as_mv[ref].as_mv,
+                              &xd->block_refs[ref]->sf, width, height, ref,
+                              kernel, MV_PRECISION_Q3,
+                              mi_col * MI_SIZE + 4 * (i % 2),
+                              mi_row * MI_SIZE + 4 * (i / 2));
+  }
+#else
+    vp9_build_inter_predictor(pre, y_stride,
+                              dst, pd->dst.stride,
+                              &mi->bmi[i].as_mv[ref].as_mv,
+                              &xd->block_refs[ref]->sf, width, height, ref,
+                              kernel, MV_PRECISION_Q3,
+                              mi_col * MI_SIZE + 4 * (i % 2),
+                              mi_row * MI_SIZE + 4 * (i / 2));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+  // Residual of src against the prediction in dst, into p->src_diff (stride 8).
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vpx_highbd_subtract_block(
+        height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+        8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
+  } else {
+    vpx_subtract_block(
+        height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+        8, src, p->src.stride, dst, pd->dst.stride);
+  }
+#else
+  vpx_subtract_block(height, width,
+                     vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+                     8, src, p->src.stride, dst, pd->dst.stride);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // NOTE(review): k is advanced cumulatively below, so it only visits i, i+1 or i, i+2 when at most one of width/height is 8 - confirm callers.
+  k = i;
+  for (idy = 0; idy < height / 4; ++idy) {
+    for (idx = 0; idx < width / 4; ++idx) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+#endif
+      int64_t ssz, rd, rd1, rd2;
+      tran_low_t* coeff;
+
+      k += (idy * 2 + idx);
+      coeff = BLOCK_OFFSET(p->coeff, k);
+      x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
+                    coeff, 8);
+      vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
+#if CONFIG_VP9_HIGHBITDEPTH
+      thisdistortion += vp9_highbd_block_error_dispatch(
+          coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd);
+#else
+      thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
+                                        16, &ssz);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      thissse += ssz;
+      thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
+                              so->scan, so->neighbors,
+                              cpi->sf.use_fast_coef_costing);
+      rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);  // running rate + distortion
+      rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);  // zero rate, sse as distortion
+      rd = VPXMIN(rd1, rd2);
+      if (rd >= best_yrd)  // cannot beat the caller's best: abandon the segment
+        return INT64_MAX;
+    }
+  }
+  // Publish the totals; distortion and sse are reported right-shifted by 2.
+  *distortion = thisdistortion >> 2;
+  *labelyrate = thisrate;
+  *sse = thissse >> 2;
+
+  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
+}
+
+typedef struct {  // RD statistics record; element type of BEST_SEG_INFO::rdstat
+  int eobs;
+  int brate;
+  int byrate;
+  int64_t bdist;
+  int64_t bsse;
+  int64_t brdcost;
+  int_mv mvs[2];  // two entries; presumably one per reference frame - confirm
+  ENTROPY_CONTEXT ta[2];  // presumably saved above entropy contexts - confirm
+  ENTROPY_CONTEXT tl[2];  // presumably saved left entropy contexts - confirm
+} SEG_RDSTAT;
+
+typedef struct {  // state passed to rd_pick_best_sub8x8_mode() via bsi_buf
+  int_mv *ref_mv[2];
+  int_mv mvp;
+
+  int64_t segment_rd;
+  int r;
+  int64_t d;
+  int64_t sse;
+  int segment_yrate;
+  PREDICTION_MODE modes[4];  // four entries; presumably one per 4x4 block - confirm
+  SEG_RDSTAT rdstat[4][INTER_MODES];  // one record per [0..3][inter mode index]
+  int mvthresh;
+} BEST_SEG_INFO;
+
+// Returns nonzero when mv, with each component right-shifted by 3, lies
+// outside the [mv_row_min, mv_row_max] x [mv_col_min, mv_col_max] limits
+// stored in x; returns 0 when it is inside (bounds inclusive).
+static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
+  const int row = mv->row >> 3;
+  const int col = mv->col >> 3;
+
+  return row < x->mv_row_min || row > x->mv_row_max ||
+         col < x->mv_col_min || col > x->mv_col_max;
+}
+
+// Moves the luma source pointer and the first prediction pointer of x forward
+// by the vp9_raster_block_offset() of block i within BLOCK_8X8. The second
+// prediction pointer is moved as well when the block has a second reference.
+// The first prediction pointer is asserted to be 8-byte aligned beforehand.
+static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
+  MODE_INFO *const mi = x->e_mbd.mi[0];
+  struct buf_2d *const src = &x->plane[0].src;
+  struct buf_2d *const pre0 = &x->e_mbd.plane[0].pre[0];
+
+  src->buf += vp9_raster_block_offset(BLOCK_8X8, i, src->stride);
+
+  assert(((intptr_t)pre0->buf & 0x7) == 0);
+  pre0->buf += vp9_raster_block_offset(BLOCK_8X8, i, pre0->stride);
+
+  if (has_second_ref(mi)) {
+    struct buf_2d *const pre1 = &x->e_mbd.plane[0].pre[1];
+
+    pre1->buf += vp9_raster_block_offset(BLOCK_8X8, i, pre1->stride);
+  }
+}
+
+// Writes the saved luma source buffer and first prediction buffer back into
+// x; the second prediction buffer is written back only when the block has a
+// second reference.
+static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
+                                  struct buf_2d orig_pre[2]) {
+  MODE_INFO *mi = x->e_mbd.mi[0];
+  struct macroblockd_plane *const luma = &x->e_mbd.plane[0];
+
+  x->plane[0].src = orig_src;
+  luma->pre[0] = orig_pre[0];
+  if (has_second_ref(mi)) {
+    luma->pre[1] = orig_pre[1];
+  }
+}
+
+// Returns 1 when either component of mv has any of its four low bits set,
+// 0 otherwise.
+static INLINE int mv_has_subpel(const MV *mv) {
+  const int low_bits = (mv->row | mv->col) & 0x0F;
+
+  return low_bits != 0;
+}
+
+// Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way encode zero motion.
+// Returns 0 when this_mode yields zero motion for every reference in use but
+// another of the three modes signals the same motion at a preferable cost;
+// returns 1 otherwise (including for any mode that is not one of the three,
+// or whose vector is non-zero).
+// TODO(aconverse): Find out if this is still productive then clean up or remove
+static int check_best_zero_mv(
+    const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
+    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int this_mode,
+    const MV_REFERENCE_FRAME ref_frames[2]) {
+  const int single_ref = ref_frames[1] == NONE;
+  int ctx, near_cost, nearest_cost, zero_cost;
+
+  // Only the three candidate modes are ever rejected here.
+  if (this_mode != NEARMV && this_mode != NEARESTMV && this_mode != ZEROMV)
+    return 1;
+
+  // The mode must produce zero motion on every reference in use.
+  if (frame_mv[this_mode][ref_frames[0]].as_int != 0)
+    return 1;
+  if (!single_ref && frame_mv[this_mode][ref_frames[1]].as_int != 0)
+    return 1;
+
+  ctx = mode_context[ref_frames[0]];
+  near_cost = cost_mv_ref(cpi, NEARMV, ctx);
+  nearest_cost = cost_mv_ref(cpi, NEARESTMV, ctx);
+  zero_cost = cost_mv_ref(cpi, ZEROMV, ctx);
+
+  if (this_mode == NEARMV)
+    return near_cost <= zero_cost;
+  if (this_mode == NEARESTMV)
+    return nearest_cost <= zero_cost;
+
+  assert(this_mode == ZEROMV);
+
+  // ZEROMV loses when NEARESTMV or NEARMV is also all-zero and no dearer.
+  if (zero_cost >= nearest_cost &&
+      frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 &&
+      (single_ref || frame_mv[NEARESTMV][ref_frames[1]].as_int == 0))
+    return 0;
+  if (zero_cost >= near_cost &&
+      frame_mv[NEARMV][ref_frames[0]].as_int == 0 &&
+      (single_ref || frame_mv[NEARMV][ref_frames[1]].as_int == 0))
+    return 0;
+
+  return 1;
+}
+/*
+ * Iteratively refines the two motion vectors of a compound prediction.
+ *
+ * frame_mv[] is seeded from single_newmv[] for both references. Each of up
+ * to four iterations builds the predictor of one reference into second_pred
+ * and searches the other reference around its current vector (small-range
+ * full-pixel refinement, then the fractional step). The loop stops at the
+ * first iteration that fails to lower the error of the reference searched.
+ *
+ * On return frame_mv[refs[0]] and frame_mv[refs[1]] hold the resulting
+ * vectors and *rate_mv is the sum of vp9_mv_bit_cost() for both, measured
+ * against x->mbmi_ext->ref_mvs[...][0].
+ */
+static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                BLOCK_SIZE bsize,
+                                int_mv *frame_mv,
+                                int mi_row, int mi_col,
+                                int_mv single_newmv[MAX_REF_FRAMES],
+                                int *rate_mv) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+  const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+  MACROBLOCKD *xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mi[0];
+  const int refs[2] = {mi->ref_frame[0],
+                       mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1]};  // a negative second ref maps to index 0
+  int_mv ref_mv[2];
+  int ite, ref;
+  const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
+  struct scale_factors sf;
+
+  // Do joint motion search in compound mode to get more accurate mv.
+  struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+  int last_besterr[2] = {INT_MAX, INT_MAX};  // lowest error seen so far, per reference
+  const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+    vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]),
+    vp9_get_scaled_ref_frame(cpi, mi->ref_frame[1])
+  };
+
+  // Prediction buffer from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+  uint8_t *second_pred;
+#else
+  DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // Per reference: fetch its ref mv, swap in a scaled frame if one exists, seed frame_mv.
+  for (ref = 0; ref < 2; ++ref) {
+    ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
+
+    if (scaled_ref_frame[ref]) {
+      int i;
+      // Swap out the reference frame for a version that's been scaled to
+      // match the resolution of the current frame, allowing the existing
+      // motion search code to be used without additional modifications.
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        backup_yv12[ref][i] = xd->plane[i].pre[ref];
+      vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+                           NULL);
+    }
+
+    frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
+  }
+
+  // Since we have scaled the reference frames to match the size of the current
+  // frame we must use a unit scaling factor during mode selection.
+#if CONFIG_VP9_HIGHBITDEPTH
+  vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+                                    cm->width, cm->height,
+                                    cm->use_highbitdepth);
+#else
+  vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+                                    cm->width, cm->height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Allow joint search multiple times iteratively for each reference frame
+  // and break out of the search loop if it couldn't find a better mv.
+  for (ite = 0; ite < 4; ite++) {
+    struct buf_2d ref_yv12[2];
+    int bestsme = INT_MAX;
+    int sadpb = x->sadperbit16;
+    MV tmp_mv;
+    int search_range = 3;
+    // Saved MV limits; written back to x after the full-pixel search below.
+    int tmp_col_min = x->mv_col_min;
+    int tmp_col_max = x->mv_col_max;
+    int tmp_row_min = x->mv_row_min;
+    int tmp_row_max = x->mv_row_max;
+    int id = ite % 2;  // Even iterations search in the first reference frame,
+                       // odd iterations search in the second. The predictor
+                       // found for the 'other' reference frame is factored in.
+
+    // Initialized here because of compiler problem in Visual Studio.
+    ref_yv12[0] = xd->plane[0].pre[0];
+    ref_yv12[1] = xd->plane[0].pre[1];
+
+    // Get the prediction block from the 'other' reference frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
+      vp9_highbd_build_inter_predictor(ref_yv12[!id].buf,
+                                       ref_yv12[!id].stride,
+                                       second_pred, pw,
+                                       &frame_mv[refs[!id]].as_mv,
+                                       &sf, pw, ph, 0,
+                                       kernel, MV_PRECISION_Q3,
+                                       mi_col * MI_SIZE, mi_row * MI_SIZE,
+                                       xd->bd);
+    } else {
+      second_pred = (uint8_t *)second_pred_alloc_16;
+      vp9_build_inter_predictor(ref_yv12[!id].buf,
+                                ref_yv12[!id].stride,
+                                second_pred, pw,
+                                &frame_mv[refs[!id]].as_mv,
+                                &sf, pw, ph, 0,
+                                kernel, MV_PRECISION_Q3,
+                                mi_col * MI_SIZE, mi_row * MI_SIZE);
+    }
+#else
+    vp9_build_inter_predictor(ref_yv12[!id].buf,
+                              ref_yv12[!id].stride,
+                              second_pred, pw,
+                              &frame_mv[refs[!id]].as_mv,
+                              &sf, pw, ph, 0,
+                              kernel, MV_PRECISION_Q3,
+                              mi_col * MI_SIZE, mi_row * MI_SIZE);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    // Do compound motion search on the current reference frame.
+    if (id)
+      xd->plane[0].pre[0] = ref_yv12[id];  // presumably the search reads pre[0] - confirm
+    vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
+
+    // Use the mv result from the single mode as mv predictor.
+    tmp_mv = frame_mv[refs[id]].as_mv;
+
+    tmp_mv.col >>= 3;  // drop the low 3 bits ahead of the full-pixel search
+    tmp_mv.row >>= 3;
+
+    // Small-range full-pixel motion search.
+    bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+                                       search_range,
+                                       &cpi->fn_ptr[bsize],
+                                       &ref_mv[id].as_mv, second_pred);
+    if (bestsme < INT_MAX)
+      bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
+                                      second_pred, &cpi->fn_ptr[bsize], 1);
+
+    x->mv_col_min = tmp_col_min;
+    x->mv_col_max = tmp_col_max;
+    x->mv_row_min = tmp_row_min;
+    x->mv_row_max = tmp_row_max;
+    // Fractional refinement, skipped when the full-pixel stage left bestsme at INT_MAX.
+    if (bestsme < INT_MAX) {
+      int dis; /* TODO: use dis in distortion calculation later. */
+      unsigned int sse;
+      bestsme = cpi->find_fractional_mv_step(
+          x, &tmp_mv,
+          &ref_mv[id].as_mv,
+          cpi->common.allow_high_precision_mv,
+          x->errorperbit,
+          &cpi->fn_ptr[bsize],
+          0, cpi->sf.mv.subpel_iters_per_step,
+          NULL,
+          x->nmvjointcost, x->mvcost,
+          &dis, &sse, second_pred,
+          pw, ph);
+    }
+
+    // Restore the pointer to the first (possibly scaled) prediction buffer.
+    if (id)
+      xd->plane[0].pre[0] = ref_yv12[0];
+    // Keep the new vector only if it lowered this reference's best error; otherwise stop.
+    if (bestsme < last_besterr[id]) {
+      frame_mv[refs[id]].as_mv = tmp_mv;
+      last_besterr[id] = bestsme;
+    } else {
+      break;
+    }
+  }
+  // The mv rate is recomputed from zero using the final vectors.
+  *rate_mv = 0;
+
+  for (ref = 0; ref < 2; ++ref) {
+    if (scaled_ref_frame[ref]) {
+      // Restore the prediction frame pointers to their unscaled versions.
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        xd->plane[i].pre[ref] = backup_yv12[ref][i];
+    }
+
+    *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+                                &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
+                                x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+  }
+}
+
+static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                        int_mv *best_ref_mv,
+                                        int_mv *second_best_ref_mv,
+                                        int64_t best_rd, int *returntotrate,
+                                        int *returnyrate,
+                                        int64_t *returndistortion,
+                                        int *skippable, int64_t *psse,
+                                        int mvthresh,
+                                        int_mv seg_mvs[4][MAX_REF_FRAMES],
+                                        BEST_SEG_INFO *bsi_buf, int filter_idx,
+                                        int mi_row, int mi_col) {
+  int i;
+  BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mi[0];
+  int mode_idx;
+  int k, br = 0, idx, idy;
+  int64_t bd = 0, block_sse = 0;
+  PREDICTION_MODE this_mode;
+  VP9_COMMON *cm = &cpi->common;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int label_count = 4;
+  int64_t this_segment_rd = 0;
+  int label_mv_thresh;
+  int segmentyrate = 0;
+  const BLOCK_SIZE bsize = mi->sb_type;
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  ENTROPY_CONTEXT t_above[2], t_left[2];
+  int subpelmv = 1, have_ref = 0;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  const int has_second_rf = has_second_ref(mi);
+  const int inter_mode_mask = sf->inter_mode_mask[bsize];
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  // Picks the best inter mode/mv for each sub-8x8 label; INT64_MAX on failure.
+  vp9_zero(*bsi);
+
+  bsi->segment_rd = best_rd;
+  bsi->ref_mv[0] = best_ref_mv;
+  bsi->ref_mv[1] = second_best_ref_mv;
+  bsi->mvp.as_int = best_ref_mv->as_int;
+  bsi->mvthresh = mvthresh;
+
+  for (i = 0; i < 4; i++)
+    bsi->modes[i] = ZEROMV;
+
+  memcpy(t_above, pd->above_context, sizeof(t_above));
+  memcpy(t_left, pd->left_context, sizeof(t_left));
+
+  // Per-label threshold derived from the caller's mvthresh: once the best
+  // rd found for a label is below it, the NEWMV motion search is abandoned.
+  // The leading factor (currently 1) scales the threshold; a large value
+  // such as 64 would make new mv searches on segments very rare.
+  label_mv_thresh = 1 * bsi->mvthresh / label_count;
+
+  // Visit each label (sub-block) in raster order, stepping by block size.
+  for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+    for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+      // TODO(jingning,rbultje): rewrite the rate-distortion optimization
+      // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
+      int_mv mode_mv[MB_MODE_COUNT][2];
+      int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+      PREDICTION_MODE mode_selected = ZEROMV;
+      int64_t best_rd = INT64_MAX;  // per-label best; shadows the parameter
+      const int i = idy * 2 + idx;  // label index; shadows the outer i
+      int ref;
+
+      for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+        const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
+        frame_mv[ZEROMV][frame].as_int = 0;
+        vp9_append_sub8x8_mvs_for_idx(cm, xd, i, ref, mi_row, mi_col,
+                                      &frame_mv[NEARESTMV][frame],
+                                      &frame_mv[NEARMV][frame],
+                                      mbmi_ext->mode_context);
+      }
+
+      // search for the best motion vector on this segment
+      for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+        const struct buf_2d orig_src = x->plane[0].src;
+        struct buf_2d orig_pre[2];
+
+        mode_idx = INTER_OFFSET(this_mode);
+        bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
+        if (!(inter_mode_mask & (1 << this_mode)))
+          continue;
+
+        if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
+                                this_mode, mi->ref_frame))
+          continue;
+
+        memcpy(orig_pre, pd->pre, sizeof(orig_pre));
+        memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
+               sizeof(bsi->rdstat[i][mode_idx].ta));
+        memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
+               sizeof(bsi->rdstat[i][mode_idx].tl));
+
+        // motion search for newmv (single predictor case only)
+        if (!has_second_rf && this_mode == NEWMV &&
+            seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) {
+          MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
+          int step_param = 0;
+          int bestsme = INT_MAX;
+          int sadpb = x->sadperbit4;
+          MV mvp_full;
+          int max_mv;
+          int cost_list[5];
+
+          /* Is the best so far sufficiently good that we can't justify doing
+           * a new motion search. */
+          if (best_rd < label_mv_thresh)
+            break;
+
+          if (cpi->oxcf.mode != BEST) {
+            // use previous block's result as next block's MV predictor.
+            if (i > 0) {
+              bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
+              if (i == 2)
+                bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
+            }
+          }
+          if (i == 0)
+            max_mv = x->max_mv_context[mi->ref_frame[0]];
+          else
+            max_mv =
+                VPXMAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
+
+          if (sf->mv.auto_mv_step_size && cm->show_frame) {
+            // Take wtd average of the step_params based on the last frame's
+            // max mv magnitude and the best ref mvs of the current block for
+            // the given reference.
+            step_param = (vp9_init_search_range(max_mv) +
+                              cpi->mv_step_param) / 2;
+          } else {
+            step_param = cpi->mv_step_param;
+          }
+
+          mvp_full.row = bsi->mvp.as_mv.row >> 3;
+          mvp_full.col = bsi->mvp.as_mv.col >> 3;
+
+          if (sf->adaptive_motion_search) {
+            mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3;
+            mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3;
+            step_param = VPXMAX(step_param, 8);
+          }
+
+          // adjust src pointer for this block
+          mi_buf_shift(x, i);
+
+          vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
+
+          bestsme = vp9_full_pixel_search(
+              cpi, x, bsize, &mvp_full, step_param, sadpb,
+              sf->mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
+              &bsi->ref_mv[0]->as_mv, new_mv,
+              INT_MAX, 1);
+
+          if (bestsme < INT_MAX) {
+            int distortion;
+            cpi->find_fractional_mv_step(
+                x,
+                new_mv,
+                &bsi->ref_mv[0]->as_mv,
+                cm->allow_high_precision_mv,
+                x->errorperbit, &cpi->fn_ptr[bsize],
+                sf->mv.subpel_force_stop,
+                sf->mv.subpel_iters_per_step,
+                cond_cost_list(cpi, cost_list),
+                x->nmvjointcost, x->mvcost,
+                &distortion,
+                &x->pred_sse[mi->ref_frame[0]],
+                NULL, 0, 0);
+
+            // save motion search result for use in compound prediction
+            seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv;
+          }
+
+          if (sf->adaptive_motion_search)
+            x->pred_mv[mi->ref_frame[0]] = *new_mv;
+
+          // restore src pointers
+          mi_buf_restore(x, orig_src, orig_pre);
+        }
+
+        if (has_second_rf) {
+          if (seg_mvs[i][mi->ref_frame[1]].as_int == INVALID_MV ||
+              seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV)
+            continue;
+        }
+
+        if (has_second_rf && this_mode == NEWMV &&
+            mi->interp_filter == EIGHTTAP) {
+          // adjust src pointers
+          mi_buf_shift(x, i);
+          if (sf->comp_inter_joint_search_thresh <= bsize) {
+            int rate_mv;
+            joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
+                                mi_row, mi_col, seg_mvs[i],
+                                &rate_mv);
+            seg_mvs[i][mi->ref_frame[0]].as_int =
+                frame_mv[this_mode][mi->ref_frame[0]].as_int;
+            seg_mvs[i][mi->ref_frame[1]].as_int =
+                frame_mv[this_mode][mi->ref_frame[1]].as_int;
+          }
+          // restore src pointers
+          mi_buf_restore(x, orig_src, orig_pre);
+        }
+
+        bsi->rdstat[i][mode_idx].brate =
+            set_and_cost_bmi_mvs(cpi, x, xd, i, this_mode, mode_mv[this_mode],
+                                 frame_mv, seg_mvs[i], bsi->ref_mv,
+                                 x->nmvjointcost, x->mvcost);
+
+        for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+          bsi->rdstat[i][mode_idx].mvs[ref].as_int =
+              mode_mv[this_mode][ref].as_int;
+          if (num_4x4_blocks_wide > 1)
+            bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int =
+                mode_mv[this_mode][ref].as_int;
+          if (num_4x4_blocks_high > 1)
+            bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int =
+                mode_mv[this_mode][ref].as_int;
+        }
+
+        // Trap vectors that reach beyond the UMV borders
+        if (mv_check_bounds(x, &mode_mv[this_mode][0].as_mv) ||
+            (has_second_rf &&
+             mv_check_bounds(x, &mode_mv[this_mode][1].as_mv)))
+          continue;
+
+        if (filter_idx > 0) {
+          BEST_SEG_INFO *ref_bsi = bsi_buf;
+          subpelmv = 0;
+          have_ref = 1;
+
+          for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+            subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
+            have_ref &= mode_mv[this_mode][ref].as_int ==
+                ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
+          }
+
+          if (filter_idx > 1 && !subpelmv && !have_ref) {
+            ref_bsi = bsi_buf + 1;
+            have_ref = 1;
+            for (ref = 0; ref < 1 + has_second_rf; ++ref)
+              have_ref &= mode_mv[this_mode][ref].as_int ==
+                  ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
+          }
+
+          if (!subpelmv && have_ref &&
+              ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
+            memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
+                   sizeof(SEG_RDSTAT));
+            if (num_4x4_blocks_wide > 1)
+              bsi->rdstat[i + 1][mode_idx].eobs =
+                  ref_bsi->rdstat[i + 1][mode_idx].eobs;
+            if (num_4x4_blocks_high > 1)
+              bsi->rdstat[i + 2][mode_idx].eobs =
+                  ref_bsi->rdstat[i + 2][mode_idx].eobs;
+
+            if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
+              mode_selected = this_mode;
+              best_rd = bsi->rdstat[i][mode_idx].brdcost;
+            }
+            continue;
+          }
+        }
+
+        bsi->rdstat[i][mode_idx].brdcost =
+            encode_inter_mb_segment(cpi, x,
+                                    bsi->segment_rd - this_segment_rd, i,
+                                    &bsi->rdstat[i][mode_idx].byrate,
+                                    &bsi->rdstat[i][mode_idx].bdist,
+                                    &bsi->rdstat[i][mode_idx].bsse,
+                                    bsi->rdstat[i][mode_idx].ta,
+                                    bsi->rdstat[i][mode_idx].tl,
+                                    mi_row, mi_col);
+        if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
+          bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
+                                            bsi->rdstat[i][mode_idx].brate, 0);
+          bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
+          bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
+          if (num_4x4_blocks_wide > 1)
+            bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
+          if (num_4x4_blocks_high > 1)
+            bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
+        }
+
+        if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
+          mode_selected = this_mode;
+          best_rd = bsi->rdstat[i][mode_idx].brdcost;
+        }
+      } /*for each 4x4 mode*/
+
+      if (best_rd == INT64_MAX) {
+        int iy, midx;
+        for (iy = i + 1; iy < 4; ++iy)
+          for (midx = 0; midx < INTER_MODES; ++midx)
+            bsi->rdstat[iy][midx].brdcost = INT64_MAX;
+        bsi->segment_rd = INT64_MAX;
+        return INT64_MAX;
+      }
+
+      mode_idx = INTER_OFFSET(mode_selected);
+      memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
+      memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
+
+      set_and_cost_bmi_mvs(cpi, x, xd, i, mode_selected, mode_mv[mode_selected],
+                           frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
+                           x->mvcost);
+
+      br += bsi->rdstat[i][mode_idx].brate;
+      bd += bsi->rdstat[i][mode_idx].bdist;
+      block_sse += bsi->rdstat[i][mode_idx].bsse;
+      segmentyrate += bsi->rdstat[i][mode_idx].byrate;
+      this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
+
+      if (this_segment_rd > bsi->segment_rd) {
+        int iy, midx;
+        for (iy = i + 1; iy < 4; ++iy)
+          for (midx = 0; midx < INTER_MODES; ++midx)
+            bsi->rdstat[iy][midx].brdcost = INT64_MAX;
+        bsi->segment_rd = INT64_MAX;
+        return INT64_MAX;
+      }
+    }
+  } /* for each label */
+
+  bsi->r = br;
+  bsi->d = bd;
+  bsi->segment_yrate = segmentyrate;
+  bsi->segment_rd = this_segment_rd;
+  bsi->sse = block_sse;
+
+  // update the coding decisions
+  for (k = 0; k < 4; ++k)
+    bsi->modes[k] = mi->bmi[k].as_mode;
+
+  if (bsi->segment_rd > best_rd)
+    return INT64_MAX;
+  /* Commit the winning per-label mvs, eobs and modes to the block. */
+  for (i = 0; i < 4; i++) {
+    mode_idx = INTER_OFFSET(bsi->modes[i]);
+    mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
+    if (has_second_ref(mi))
+      mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
+    x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
+    mi->bmi[i].as_mode = bsi->modes[i];
+  }
+
+  /*
+   * Report the totals for the whole block back to the caller.
+   */
+  *returntotrate = bsi->r;
+  *returndistortion = bsi->d;
+  *returnyrate = bsi->segment_yrate;
+  *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0);
+  *psse = bsi->sse;
+  mi->mode = bsi->modes[3];
+
+  return bsi->segment_rd;
+}
+
+static void estimate_ref_frame_costs(const VP9_COMMON *cm,
+                                     const MACROBLOCKD *xd,
+                                     int segment_id,
+                                     unsigned int *ref_costs_single,
+                                     unsigned int *ref_costs_comp,
+                                     vpx_prob *comp_mode_p) {
+  // Fills the per-reference-frame signalling cost tables for single and
+  // compound prediction, and reports the compound-mode probability used.
+  const int select_mode = (cm->reference_mode == REFERENCE_MODE_SELECT);
+  vpx_prob intra_inter_p;
+  vpx_prob comp_inter_p = 128;
+  unsigned int inter_cost;
+
+  // With the segment reference feature active every cost is zero.
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
+    memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
+    *comp_mode_p = 128;
+    return;
+  }
+
+  intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+  if (select_mode)
+    comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
+  *comp_mode_p = comp_inter_p;
+
+  // INTRA_FRAME is charged bit 0 of the intra/inter probability; every
+  // inter reference cost below builds on the cost of bit 1.
+  ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
+  inter_cost = vp9_cost_bit(intra_inter_p, 1);
+
+  if (cm->reference_mode == COMPOUND_REFERENCE) {
+    ref_costs_single[LAST_FRAME]   = 512;
+    ref_costs_single[GOLDEN_FRAME] = 512;
+    ref_costs_single[ALTREF_FRAME] = 512;
+  } else {
+    const vpx_prob p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+    const vpx_prob p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+    const unsigned int single_cost =
+        select_mode ? inter_cost + vp9_cost_bit(comp_inter_p, 0) : inter_cost;
+    const unsigned int not_last_cost = single_cost + vp9_cost_bit(p1, 1);
+
+    ref_costs_single[LAST_FRAME]   = single_cost + vp9_cost_bit(p1, 0);
+    ref_costs_single[GOLDEN_FRAME] = not_last_cost + vp9_cost_bit(p2, 0);
+    ref_costs_single[ALTREF_FRAME] = not_last_cost + vp9_cost_bit(p2, 1);
+  }
+
+  // Compound references: only the LAST and GOLDEN entries are filled in.
+  if (cm->reference_mode == SINGLE_REFERENCE) {
+    ref_costs_comp[LAST_FRAME]   = 512;
+    ref_costs_comp[GOLDEN_FRAME] = 512;
+  } else {
+    const vpx_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
+    const unsigned int comp_cost =
+        select_mode ? inter_cost + vp9_cost_bit(comp_inter_p, 1) : inter_cost;
+
+    ref_costs_comp[LAST_FRAME]   = comp_cost + vp9_cost_bit(ref_comp_p, 0);
+    ref_costs_comp[GOLDEN_FRAME] = comp_cost + vp9_cost_bit(ref_comp_p, 1);
+  }
+}
+
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+                         int mode_index,
+                         int64_t comp_pred_diff[REFERENCE_MODES],
+                         int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
+                         int skippable) {
+  // Snapshot the chosen mode and its side information into ctx so that the
+  // same coding decision can be reinstated later.
+  const MODE_INFO *const best_mi = x->e_mbd.mi[0];
+
+  ctx->mic = *best_mi;
+  ctx->mbmi_ext = *x->mbmi_ext;
+  ctx->best_mode_index = mode_index;
+  ctx->skippable = skippable;
+  ctx->skip = x->skip;
+
+  ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
+  ctx->comp_pred_diff   = (int)comp_pred_diff[COMPOUND_REFERENCE];
+  ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
+  memcpy(ctx->best_filter_diff, best_filter_diff,
+         sizeof(best_filter_diff[0]) * SWITCHABLE_FILTER_CONTEXTS);
+}
+
+static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+                               MV_REFERENCE_FRAME ref_frame,
+                               BLOCK_SIZE block_size,
+                               int mi_row, int mi_col,
+                               int_mv frame_nearest_mv[MAX_REF_FRAMES],
+                               int_mv frame_near_mv[MAX_REF_FRAMES],
+                               struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
+  // Sets up prediction buffers and nearest/near mv candidates for ref_frame.
+  const VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const mbd = &x->e_mbd;
+  MB_MODE_INFO_EXT *const ext = x->mbmi_ext;
+  int_mv *const ref_mv_list = ext->ref_mvs[ref_frame];
+  const struct scale_factors *const scale = &cm->frame_refs[ref_frame - 1].sf;
+  const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_buffer(cpi, ref_frame);
+
+  assert(yv12 != NULL);
+
+  // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
+  // use the UV scaling factors.
+  vp9_setup_pred_block(mbd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
+                       scale, scale);
+
+  // Collect and order the candidate vectors found in neighbouring blocks.
+  vp9_find_mv_refs(cm, mbd, mbd->mi[0], ref_frame, ref_mv_list,
+                   mi_row, mi_col, ext->mode_context);
+
+  // Refinement step shared by encoder and decoder.
+  vp9_find_best_ref_mvs(mbd, cm->allow_high_precision_mv, ref_mv_list,
+                        frame_nearest_mv + ref_frame,
+                        frame_near_mv + ref_frame);
+
+  // Encoder-only refinement: fully test the top candidates to pick a centre
+  // point for later searches. Scaled references are not supported here.
+  if (block_size >= BLOCK_8X8 && !vp9_is_scaled(scale))
+    vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
+                ref_frame, block_size);
+}
+
+// Runs the full-pixel and then sub-pixel motion search against the block's
+// first reference frame, storing the vector in *tmp_mv and its bit cost in
+// *rate_mv. *tmp_mv is INVALID_MV if the pred_mv_sad early-out rejects it.
+static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                 int_mv *tmp_mv, int *rate_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const VP9_COMMON *cm = &cpi->common;
+  MODE_INFO *mi = xd->mi[0];
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
+  int bestsme = INT_MAX;
+  int step_param;
+  int sadpb = x->sadperbit16;
+  MV mvp_full;
+  int ref = mi->ref_frame[0];
+  MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+  int i;
+
+  int tmp_col_min = x->mv_col_min;
+  int tmp_col_max = x->mv_col_max;
+  int tmp_row_min = x->mv_row_min;
+  int tmp_row_max = x->mv_row_max;
+  int cost_list[5];
+
+  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
+                                                                        ref);
+
+  MV pred_mv[3];
+  pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+  pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
+  pred_mv[2] = x->pred_mv[ref];
+
+  if (scaled_ref_frame) {
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+
+    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+  // Work out the size of the first step in the mv step search.
+  // 0 here is maximum length first step. 1 is VPXMAX >> 1 etc.
+  if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+    // Take wtd average of the step_params based on the last frame's
+    // max mv magnitude and that based on the best ref mvs of the current
+    // block for the given reference.
+    step_param = (vp9_init_search_range(x->max_mv_context[ref]) +
+                    cpi->mv_step_param) / 2;
+  } else {
+    step_param = cpi->mv_step_param;
+  }
+
+  if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
+    int boffset =
+        2 * (b_width_log2_lookup[BLOCK_64X64] -
+             VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+    step_param = VPXMAX(step_param, boffset);
+  }
+
+  if (cpi->sf.adaptive_motion_search) {
+    int bwl = b_width_log2_lookup[bsize];
+    int bhl = b_height_log2_lookup[bsize];
+    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
+
+    if (tlevel < 5)
+      step_param += 2;
+
+    // prev_mv_sad is not setup for dynamically scaled frames.
+    if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+      int frame;
+      for (frame = LAST_FRAME; frame <= ALTREF_FRAME && cm->show_frame; ++frame) {
+        if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[frame]) {
+          x->pred_mv[ref].row = 0;
+          x->pred_mv[ref].col = 0;
+          tmp_mv->as_int = INVALID_MV;
+          if (scaled_ref_frame) {
+            for (i = 0; i < MAX_MB_PLANE; ++i)
+              xd->plane[i].pre[0] = backup_yv12[i];
+          }
+          return;
+        }
+      }
+    }
+  }
+
+  // Change the mv limits only once the early return above can no longer be
+  // taken; the saved limits are put back right after the full-pixel search.
+  vp9_set_mv_search_range(x, &ref_mv);
+
+  mvp_full = pred_mv[x->mv_best_ref_index[ref]];
+  mvp_full.col >>= 3;
+  mvp_full.row >>= 3;
+
+  bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+                                  cond_cost_list(cpi, cost_list),
+                                  &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
+
+  x->mv_col_min = tmp_col_min;
+  x->mv_col_max = tmp_col_max;
+  x->mv_row_min = tmp_row_min;
+  x->mv_row_max = tmp_row_max;
+
+  if (bestsme < INT_MAX) {
+    int dis;  /* TODO: use dis in distortion calculation later. */
+    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
+                                 cm->allow_high_precision_mv,
+                                 x->errorperbit,
+                                 &cpi->fn_ptr[bsize],
+                                 cpi->sf.mv.subpel_force_stop,
+                                 cpi->sf.mv.subpel_iters_per_step,
+                                 cond_cost_list(cpi, cost_list),
+                                 x->nmvjointcost, x->mvcost,
+                                 &dis, &x->pred_sse[ref], NULL, 0, 0);
+  }
+  *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
+                             x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+
+  if (cpi->sf.adaptive_motion_search)
+    x->pred_mv[ref] = tmp_mv->as_mv;
+
+  if (scaled_ref_frame) {
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+}
+
+
+// Points every plane's destination buffer and stride back at the saved values.
+static INLINE void restore_dst_buf(MACROBLOCKD *xd,
+                                   uint8_t *orig_dst[MAX_MB_PLANE],
+                                   int orig_dst_stride[MAX_MB_PLANE]) {
+  int plane = MAX_MB_PLANE;
+  while (plane-- > 0) {
+    xd->plane[plane].dst.stride = orig_dst_stride[plane];
+    xd->plane[plane].dst.buf = orig_dst[plane];
+  }
+}
+
+// Decides whether the apparent cost of a new motion vector should be
+// discounted. In a subtle motion field with low spatial complexity a single
+// block can rarely pay for a new vector even when it lowers distortion, yet
+// later blocks may reuse it through the nearest/near mv modes and so improve
+// visual quality. Returns 1 when rc.is_src_frame_alt_ref is clear, the mode
+// is NEWMV with a non-zero vector, and the NEARESTMV and NEARMV candidates are
+// each either zero or INVALID_MV; returns 0 otherwise.
+static int discount_newmv_test(const VP9_COMP *cpi,
+                               int this_mode,
+                               int_mv this_mv,
+                               int_mv (*mode_mv)[MAX_REF_FRAMES],
+                               int ref_frame) {
+  const int_mv *const nearest = &mode_mv[NEARESTMV][ref_frame];
+  const int_mv *const near_mv = &mode_mv[NEARMV][ref_frame];
+  if (cpi->rc.is_src_frame_alt_ref || this_mode != NEWMV || this_mv.as_int == 0)
+    return 0;
+  if (nearest->as_int != 0 && nearest->as_int != INVALID_MV)
+    return 0;
+  return near_mv->as_int == 0 || near_mv->as_int == INVALID_MV;
+}
+
+static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                 BLOCK_SIZE bsize,
+                                 int *rate2, int64_t *distortion,
+                                 int *skippable,
+                                 int *rate_y, int *rate_uv,
+                                 int *disable_skip,
+                                 int_mv (*mode_mv)[MAX_REF_FRAMES],
+                                 int mi_row, int mi_col,
+                                 int_mv single_newmv[MAX_REF_FRAMES],
+                                 INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
+                                 int (*single_skippable)[MAX_REF_FRAMES],
+                                 int64_t *psse,
+                                 const int64_t ref_best_rd,
+                                 int64_t *mask_filter,
+                                 int64_t filter_cache[]) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const int is_comp_pred = has_second_ref(mi);
+  const int this_mode = mi->mode;
+  int_mv *frame_mv = mode_mv[this_mode];
+  int i;
+  int refs[2] = { mi->ref_frame[0],
+    (mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1]) };
+  int_mv cur_mv[2];
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * 64 * 64]);
+  uint8_t *tmp_buf;
+#else
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  int pred_exists = 0;
+  int intpel_mv;
+  int64_t rd, tmp_rd, best_rd = INT64_MAX;
+  int best_needs_copy = 0;
+  uint8_t *orig_dst[MAX_MB_PLANE];
+  int orig_dst_stride[MAX_MB_PLANE];
+  int rs = 0;
+  INTERP_FILTER best_filter = SWITCHABLE;
+  uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0};
+  int64_t bsse[MAX_MB_PLANE << 2] = {0};
+
+  int bsl = mi_width_log2_lookup[bsize];
+  int pred_filter_search = cpi->sf.cb_pred_filter_search ?
+      (((mi_row + mi_col) >> bsl) +
+       get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
+
+  int skip_txfm_sb = 0;
+  int64_t skip_sse_sb = INT64_MAX;
+  int64_t distortion_y = 0, distortion_uv = 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
+  } else {
+    tmp_buf = (uint8_t *)tmp_buf16;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  if (pred_filter_search) {
+    INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
+    if (xd->up_available)
+      af = xd->mi[-xd->mi_stride]->interp_filter;
+    if (xd->left_available)
+      lf = xd->mi[-1]->interp_filter;
+
+    if ((this_mode != NEWMV) || (af == lf))
+      best_filter = af;
+  }
+
+  if (is_comp_pred) {
+    if (frame_mv[refs[0]].as_int == INVALID_MV ||
+        frame_mv[refs[1]].as_int == INVALID_MV)
+      return INT64_MAX;
+
+    if (cpi->sf.adaptive_mode_search) {
+      if (single_filter[this_mode][refs[0]] ==
+          single_filter[this_mode][refs[1]])
+        best_filter = single_filter[this_mode][refs[0]];
+    }
+  }
+
+  if (this_mode == NEWMV) {
+    int rate_mv;
+    if (is_comp_pred) {
+      // Initialize mv using single prediction mode result.
+      frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+      frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+      if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+        joint_motion_search(cpi, x, bsize, frame_mv,
+                            mi_row, mi_col, single_newmv, &rate_mv);
+      } else {
+        rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
+                                   &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+        rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
+                                   &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+      }
+      *rate2 += rate_mv;
+    } else {
+      int_mv tmp_mv;
+      single_motion_search(cpi, x, bsize, mi_row, mi_col,
+                           &tmp_mv, &rate_mv);
+      if (tmp_mv.as_int == INVALID_MV)
+        return INT64_MAX;
+
+      frame_mv[refs[0]].as_int =
+          xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+      single_newmv[refs[0]].as_int = tmp_mv.as_int;
+
+      // Estimate the rate implications of a new mv but discount this
+      // under certain circumstances where we want to help initiate a weak
+      // motion field, where the distortion gain for a single block may not
+      // be enough to overcome the cost of a new mv.
+      if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
+        *rate2 += VPXMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+      } else {
+        *rate2 += rate_mv;
+      }
+    }
+  }
+
+  for (i = 0; i < is_comp_pred + 1; ++i) {
+    cur_mv[i] = frame_mv[refs[i]];
+    // Clip "next_nearest" so that it does not extend too far out of image
+    if (this_mode != NEWMV)
+      clamp_mv2(&cur_mv[i].as_mv, xd);
+
+    if (mv_check_bounds(x, &cur_mv[i].as_mv))
+      return INT64_MAX;
+    mi->mv[i].as_int = cur_mv[i].as_int;
+  }
+
+  // do first prediction into the destination buffer. Do the next
+  // prediction into a temporary buffer. Then keep track of which one
+  // of these currently holds the best predictor, and use the other
+  // one for future predictions. In the end, copy from tmp_buf to
+  // dst if necessary.
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    orig_dst[i] = xd->plane[i].dst.buf;
+    orig_dst_stride[i] = xd->plane[i].dst.stride;
+  }
+
+  // We don't include the cost of the second reference here, because there
+  // are only two options: Last/ARF or Golden/ARF; The second one is always
+  // known, which is ARF.
+  //
+  // Under some circumstances we discount the cost of new mv mode to encourage
+  // initiation of a motion field.
+  if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]],
+                          mode_mv, refs[0])) {
+    *rate2 += VPXMIN(cost_mv_ref(cpi, this_mode,
+                                 mbmi_ext->mode_context[refs[0]]),
+                     cost_mv_ref(cpi, NEARESTMV,
+                                 mbmi_ext->mode_context[refs[0]]));
+  } else {
+    *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]);
+  }
+
+  if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
+      mi->mode != NEARESTMV)
+    return INT64_MAX;
+
+  pred_exists = 0;
+  // Are all MVs integer pel for Y and UV
+  intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv);
+  if (is_comp_pred)
+    intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv);
+
+  // Search for best switchable filter by checking the variance of
+  // pred error irrespective of whether the filter will be used
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    filter_cache[i] = INT64_MAX;
+
+  if (cm->interp_filter != BILINEAR) {
+    if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
+      best_filter = EIGHTTAP;
+    } else if (best_filter == SWITCHABLE) {
+      int newbest;
+      int tmp_rate_sum = 0;
+      int64_t tmp_dist_sum = 0;
+
+      for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+        int j;
+        int64_t rs_rd;
+        int tmp_skip_sb = 0;
+        int64_t tmp_skip_sse = INT64_MAX;
+
+        mi->interp_filter = i;
+        rs = vp9_get_switchable_rate(cpi, xd);
+        rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+
+        if (i > 0 && intpel_mv) {
+          rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
+          filter_cache[i] = rd;
+          filter_cache[SWITCHABLE_FILTERS] =
+              VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+          if (cm->interp_filter == SWITCHABLE)
+            rd += rs_rd;
+          *mask_filter = VPXMAX(*mask_filter, rd);
+        } else {
+          int rate_sum = 0;
+          int64_t dist_sum = 0;
+          if (i > 0 && cpi->sf.adaptive_interp_filter_search &&
+              (cpi->sf.interp_filter_search_mask & (1 << i))) {
+            rate_sum = INT_MAX;
+            dist_sum = INT64_MAX;
+            continue;
+          }
+
+          if ((cm->interp_filter == SWITCHABLE &&
+               (!i || best_needs_copy)) ||
+              (cm->interp_filter != SWITCHABLE &&
+               (cm->interp_filter == mi->interp_filter ||
+                (i == 0 && intpel_mv)))) {
+            restore_dst_buf(xd, orig_dst, orig_dst_stride);
+          } else {
+            for (j = 0; j < MAX_MB_PLANE; j++) {
+              xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
+              xd->plane[j].dst.stride = 64;
+            }
+          }
+          vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum,
+                          &tmp_skip_sb, &tmp_skip_sse);
+
+          rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
+          filter_cache[i] = rd;
+          filter_cache[SWITCHABLE_FILTERS] =
+              VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+          if (cm->interp_filter == SWITCHABLE)
+            rd += rs_rd;
+          *mask_filter = VPXMAX(*mask_filter, rd);
+
+          if (i == 0 && intpel_mv) {
+            tmp_rate_sum = rate_sum;
+            tmp_dist_sum = dist_sum;
+          }
+        }
+
+        if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+          if (rd / 2 > ref_best_rd) {
+            restore_dst_buf(xd, orig_dst, orig_dst_stride);
+            return INT64_MAX;
+          }
+        }
+        newbest = i == 0 || rd < best_rd;
+
+        if (newbest) {
+          best_rd = rd;
+          best_filter = mi->interp_filter;
+          if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
+            best_needs_copy = !best_needs_copy;
+        }
+
+        if ((cm->interp_filter == SWITCHABLE && newbest) ||
+            (cm->interp_filter != SWITCHABLE &&
+             cm->interp_filter == mi->interp_filter)) {
+          pred_exists = 1;
+          tmp_rd = best_rd;
+
+          skip_txfm_sb = tmp_skip_sb;
+          skip_sse_sb = tmp_skip_sse;
+          memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+          memcpy(bsse, x->bsse, sizeof(bsse));
+        }
+      }
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
+    }
+  }
+  // Set the appropriate filter
+  mi->interp_filter = cm->interp_filter != SWITCHABLE ?
+      cm->interp_filter : best_filter;
+  rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0;
+
+  if (pred_exists) {
+    if (best_needs_copy) {
+      // again temporarily set the buffers to local memory to prevent a memcpy
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
+        xd->plane[i].dst.stride = 64;
+      }
+    }
+    rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0);
+  } else {
+    int tmp_rate;
+    int64_t tmp_dist;
+    // Handles the special case when a filter that is not in the
+    // switchable list (ex. bilinear) is indicated at the frame level, or
+    // skip condition holds.
+    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
+                    &skip_txfm_sb, &skip_sse_sb);
+    rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+    memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+    memcpy(bsse, x->bsse, sizeof(bsse));
+  }
+
+  if (!is_comp_pred)
+    single_filter[this_mode][refs[0]] = mi->interp_filter;
+
+  if (cpi->sf.adaptive_mode_search)
+    if (is_comp_pred)
+      if (single_skippable[this_mode][refs[0]] &&
+          single_skippable[this_mode][refs[1]])
+        memset(skip_txfm, SKIP_TXFM_AC_DC, sizeof(skip_txfm));
+
+  if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+    // if current pred_error modeled rd is substantially more than the best
+    // so far, do not bother doing full rd
+    if (rd / 2 > ref_best_rd) {
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
+      return INT64_MAX;
+    }
+  }
+
+  if (cm->interp_filter == SWITCHABLE)
+    *rate2 += rs;
+
+  memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
+  memcpy(x->bsse, bsse, sizeof(bsse));
+
+  if (!skip_txfm_sb) {
+    int skippable_y, skippable_uv;
+    int64_t sseuv = INT64_MAX;
+    int64_t rdcosty = INT64_MAX;
+
+    // Y cost and distortion
+    vp9_subtract_plane(x, bsize, 0);
+    super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
+                    bsize, ref_best_rd);
+
+    if (*rate_y == INT_MAX) {
+      *rate2 = INT_MAX;
+      *distortion = INT64_MAX;
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
+      return INT64_MAX;
+    }
+
+    *rate2 += *rate_y;
+    *distortion += distortion_y;
+
+    rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
+    rdcosty = VPXMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
+
+    if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
+                          &sseuv, bsize, ref_best_rd - rdcosty)) {
+      *rate2 = INT_MAX;
+      *distortion = INT64_MAX;
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
+      return INT64_MAX;
+    }
+
+    *psse += sseuv;
+    *rate2 += *rate_uv;
+    *distortion += distortion_uv;
+    *skippable = skippable_y && skippable_uv;
+  } else {
+    x->skip = 1;
+    *disable_skip = 1;
+
+    // The cost of skip bit needs to be added.
+    *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+
+    *distortion = skip_sse_sb;
+  }
+
+  if (!is_comp_pred)
+    single_skippable[this_mode][refs[0]] = *skippable;
+
+  restore_dst_buf(xd, orig_dst, orig_dst_stride);
+  return 0;  // The rate-distortion cost will be re-calculated by caller.
+}
+
+// Picks the intra luma and chroma modes for a block and fills in rd_cost.
+// If the luma rd is not below best_rd, returns early with rate = INT_MAX.
+void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                               RD_COST *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = xd->plane;
+  int rate_y = 0, rate_y_tokenonly = 0;
+  int rate_uv = 0, rate_uv_tokenonly = 0;
+  int y_skip = 0, uv_skip = 0;
+  int64_t dist_y = 0, dist_uv = 0, luma_rd;
+  int both_skip, total_rate;
+  TX_SIZE max_uv_tx_size;
+  x->skip_encode = 0;
+  ctx->skip = 0;
+  xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+  xd->mi[0]->ref_frame[1] = NONE;
+
+  // Luma: blocks below 8x8 go through the sub-8x8 mode picker.
+  if (bsize < BLOCK_8X8) {
+    luma_rd = rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                                           &dist_y, best_rd);
+  } else {
+    luma_rd = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                                     &dist_y, &y_skip, bsize, best_rd);
+  }
+  if (luma_rd >= best_rd) {
+    rd_cost->rate = INT_MAX;
+    return;
+  }
+
+  // Chroma: the block size passed down is never smaller than BLOCK_8X8.
+  max_uv_tx_size = get_uv_tx_size_impl(
+      xd->mi[0]->tx_size, bsize, pd[1].subsampling_x, pd[1].subsampling_y);
+  rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly, &dist_uv,
+                          &uv_skip, VPXMAX(BLOCK_8X8, bsize), max_uv_tx_size);
+
+  // When y_skip and uv_skip are both set, drop the token-only rate parts.
+  both_skip = y_skip && uv_skip;
+  total_rate = rate_y + rate_uv;
+  if (both_skip)
+    total_rate = total_rate - rate_y_tokenonly - rate_uv_tokenonly;
+  rd_cost->rate =
+      total_rate + vp9_cost_bit(vp9_get_skip_prob(cm, xd), both_skip);
+  rd_cost->dist = dist_y + dist_uv;
+
+  ctx->mic = *xd->mi[0];
+  ctx->mbmi_ext = *x->mbmi_ext;
+  rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+}
+
+// Biases an rd value according to how the variance of the reconstruction
+// compares with the variance of the source. The caps below are percentages.
+#define LOW_VAR_THRESH 16
+#define VLOW_ADJ_MAX 25
+#define VHIGH_ADJ_MAX 8
+static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x,
+                                   BLOCK_SIZE bsize, int64_t *this_rd,
+                                   MV_REFERENCE_FRAME ref_frame,
+                                   unsigned int source_variance) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  unsigned int recon_variance;
+  unsigned int absvar_diff = 0;
+  int64_t var_error = 0;
+  int64_t adj_cap, var_factor;
+
+  // INT64_MAX is passed through unchanged.
+  if (*this_rd == INT64_MAX) return;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    recon_variance = vp9_high_get_sby_perpixel_variance(
+        cpi, &xd->plane[0].dst, bsize, xd->bd);
+  else
+    recon_variance =
+        vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#else
+  recon_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Both measures stay at 0 when the two variances are small, i.e. their
+  // sum does not exceed LOW_VAR_THRESH.
+  if ((source_variance + recon_variance) > LOW_VAR_THRESH) {
+    const int64_t src_var = source_variance;
+    const int64_t rec_var = recon_variance;
+
+    if (source_variance > recon_variance)
+      absvar_diff = source_variance - recon_variance;
+    else
+      absvar_diff = recon_variance - source_variance;
+    // 100 - 200ab / (a^2 + b^2): 0 for equal variances, at most 100.
+    var_error = 100 - (200 * src_var * rec_var) /
+                          (src_var * src_var + rec_var * rec_var);
+  }
+
+  if (source_variance > LOW_VAR_THRESH && ref_frame == INTRA_FRAME &&
+      source_variance > recon_variance) {
+    // Discourage intra modes whose result is flatter than the source.
+    adj_cap = VLOW_ADJ_MAX;
+  } else if (source_variance < (LOW_VAR_THRESH >> 1) &&
+             recon_variance > source_variance) {
+    // Very low source variance: discourage false texture or motion trails.
+    adj_cap = VHIGH_ADJ_MAX;
+  } else {
+    return;  // Neither case applies, so the rd value is left as it is.
+  }
+  var_factor = VPXMIN(absvar_diff, VPXMIN(adj_cap, var_error));
+  *this_rd += (*this_rd * var_factor) / 100;
+}
+
+
+// Pass-2 only: is there an internal image edge (e.g. formatting bars)?
+int vp9_internal_image_edge(VP9_COMP *cpi) {
+  if (cpi->oxcf.pass != 2) return 0;
+  return cpi->twopass.this_frame_stats.inactive_zone_rows > 0 ||
+         cpi->twopass.this_frame_stats.inactive_zone_cols > 0;
+}
+
+// Checks whether a block of mi rows touches a horizontal edge of the active
+// image.
+//   mi_row, mi_step: the rows tested are [mi_row, mi_row + mi_step).
+// Returns 1 if the top or bottom active edge falls in that range, else 0.
+// Outside pass 2 the edges are row 0 and cpi->common.mi_rows; in pass 2 they
+// are moved inwards by the inactive rows (e.g. formatting bars) in the stats.
+int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) {
+  const int row_end = mi_row + mi_step;
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_rows;
+
+  // For two pass, account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    // inactive_zone_rows counts MBs, not mi units, hence the factor of 2.
+    const int inset =
+        (int)(cpi->twopass.this_frame_stats.inactive_zone_rows * 2);
+    top_edge = inset;
+    // Never let the bottom edge end up above the top edge.
+    bottom_edge = VPXMAX(top_edge, bottom_edge - inset);
+  }
+
+  // An edge is active for this block if it lies inside the row range.
+  if (top_edge >= mi_row && top_edge < row_end) return 1;
+  if (bottom_edge >= mi_row && bottom_edge < row_end) return 1;
+  return 0;
+}
+
+// Checks whether a block of mi columns touches a vertical edge of the active
+// image.
+//   mi_col, mi_step: the columns tested are [mi_col, mi_col + mi_step).
+// Returns 1 if the left or right active edge falls in that range, else 0.
+// Outside pass 2 the edges are column 0 and cpi->common.mi_cols; in pass 2
+// they are moved inwards by the inactive columns (e.g. formatting bars).
+int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) {
+  const int col_end = mi_col + mi_step;
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_cols;
+
+  // For two pass, account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    // inactive_zone_cols counts MBs, not mi units, hence the factor of 2.
+    const int inset =
+        (int)(cpi->twopass.this_frame_stats.inactive_zone_cols * 2);
+    left_edge = inset;
+    // Never let the right edge end up to the left of the left edge.
+    right_edge = VPXMAX(left_edge, right_edge - inset);
+  }
+
+  // An edge is active for this block if it lies inside the column range.
+  if (left_edge >= mi_col && left_edge < col_end) return 1;
+  if (right_edge >= mi_col && right_edge < col_end) return 1;
+  return 0;
+}
+
+// Checks whether the superblock at (mi_row, mi_col) lies on an edge of the
+// active image. Both the horizontal and the vertical test cover
+// MI_BLOCK_SIZE mi units from the given position. Returns 1 if either test
+// reports an edge (the vertical one is skipped when the first hits), else 0.
+int vp9_active_edge_sb(VP9_COMP *cpi, int mi_row, int mi_col) {
+  if (vp9_active_h_edge(cpi, mi_row, MI_BLOCK_SIZE)) return 1;
+  return vp9_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE) != 0;
+}
+
+void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
+                               TileDataEnc *tile_data,
+                               MACROBLOCK *x,
+                               int mi_row, int mi_col,
+                               RD_COST *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx,
+                               int64_t best_rd_so_far) {
+  VP9_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  RD_OPT *const rd_opt = &cpi->rd;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const struct segmentation *const seg = &cm->seg;
+  PREDICTION_MODE this_mode;
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+  unsigned char segment_id = mi->segment_id;
+  int comp_pred, i, k;
+  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
+  INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
+  int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+  int64_t best_rd = best_rd_so_far;
+  int64_t best_pred_diff[REFERENCE_MODES];
+  int64_t best_pred_rd[REFERENCE_MODES];
+  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  MODE_INFO best_mbmode;
+  int best_mode_skippable = 0;
+  int midx, best_mode_index = -1;
+  unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+  vpx_prob comp_mode_p;
+  int64_t best_intra_rd = INT64_MAX;
+  unsigned int best_pred_sse = UINT_MAX;
+  PREDICTION_MODE best_intra_mode = DC_PRED;
+  int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
+  int64_t dist_uv[TX_SIZES];
+  int skip_uv[TX_SIZES];
+  PREDICTION_MODE mode_uv[TX_SIZES];
+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  int best_skip2 = 0;
+  uint8_t ref_frame_skip_mask[2] = { 0 };
+  uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
+  int mode_skip_start = sf->mode_skip_start + 1;
+  const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
+  const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+  int64_t mode_threshold[MAX_MODES];
+  int *mode_map = tile_data->mode_map[bsize];
+  const int mode_search_skip_flags = sf->mode_search_skip_flags;
+  int64_t mask_filter = 0;
+  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+
+  vp9_zero(best_mbmode);
+
+  x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    filter_cache[i] = INT64_MAX;
+
+  estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+                           &comp_mode_p);
+
+  for (i = 0; i < REFERENCE_MODES; ++i)
+    best_pred_rd[i] = INT64_MAX;
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+    best_filter_rd[i] = INT64_MAX;
+  for (i = 0; i < TX_SIZES; i++)
+    rate_uv_intra[i] = INT_MAX;
+  for (i = 0; i < MAX_REF_FRAMES; ++i)
+    x->pred_sse[i] = INT_MAX;
+  for (i = 0; i < MB_MODE_COUNT; ++i) {
+    for (k = 0; k < MAX_REF_FRAMES; ++k) {
+      single_inter_filter[i][k] = SWITCHABLE;
+      single_skippable[i][k] = 0;
+    }
+  }
+
+  rd_cost->rate = INT_MAX;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    x->pred_mv_sad[ref_frame] = INT_MAX;
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
+      setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+                         frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+    }
+    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+    frame_mv[ZEROMV][ref_frame].as_int = 0;
+  }
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
+      // Skip checking missing references in both single and compound reference
+      // modes. Note that a mode will be skipped if both reference frames
+      // are masked out.
+      ref_frame_skip_mask[0] |= (1 << ref_frame);
+      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+    } else if (sf->reference_masking) {
+      for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+        // Skip fixed mv modes for poor references
+        if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
+          mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+          break;
+        }
+      }
+    }
+    // If the segment reference frame feature is enabled....
+    // then do nothing if the current ref frame is not allowed..
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+      ref_frame_skip_mask[0] |= (1 << ref_frame);
+      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+    }
+  }
+
+  // Disable this drop out case if the ref frame
+  // segment level feature is enabled for this segment. This is to
+  // prevent the possibility that we end up unable to pick any mode.
+  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+    // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+    // unless ARNR filtering is enabled in which case we want
+    // an unfiltered alternative. We allow near/nearest as well
+    // because they may result in zero-zero MVs but be cheaper.
+    if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+      ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << GOLDEN_FRAME);
+      ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+      mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
+      if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
+      if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
+    }
+  }
+
+  if (cpi->rc.is_src_frame_alt_ref) {
+    if (sf->alt_ref_search_fp) {
+      mode_skip_mask[ALTREF_FRAME] = 0;
+      ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
+      ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+    }
+  }
+
+  if (sf->alt_ref_search_fp)
+    if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
+      if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
+        mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
+
+  if (sf->adaptive_mode_search) {
+    if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
+        cpi->rc.frames_since_golden >= 3)
+      if (x->pred_mv_sad[GOLDEN_FRAME] > (x->pred_mv_sad[LAST_FRAME] << 1))
+        mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
+  }
+
+  if (bsize > sf->max_intra_bsize) {
+    ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
+    ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
+  }
+
+  mode_skip_mask[INTRA_FRAME] |=
+      ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+
+  for (i = 0; i <= LAST_NEW_MV_INDEX; ++i)
+    mode_threshold[i] = 0;
+  for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
+    mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
+
+  midx =  sf->schedule_mode_search ? mode_skip_start : 0;
+  while (midx > 4) {
+    uint8_t end_pos = 0;
+    for (i = 5; i < midx; ++i) {
+      if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) {
+        uint8_t tmp = mode_map[i];
+        mode_map[i] = mode_map[i - 1];
+        mode_map[i - 1] = tmp;
+        end_pos = i;
+      }
+    }
+    midx = end_pos;
+  }
+
+  for (midx = 0; midx < MAX_MODES; ++midx) {
+    int mode_index = mode_map[midx];
+    int mode_excluded = 0;
+    int64_t this_rd = INT64_MAX;
+    int disable_skip = 0;
+    int compmode_cost = 0;
+    int rate2 = 0, rate_y = 0, rate_uv = 0;
+    int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+    int skippable = 0;
+    int this_skip2 = 0;
+    int64_t total_sse = INT64_MAX;
+    int early_term = 0;
+
+    this_mode = vp9_mode_order[mode_index].mode;
+    ref_frame = vp9_mode_order[mode_index].ref_frame[0];
+    second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
+
+    // Look at the reference frame of the best mode so far and set the
+    // skip mask to look at a subset of the remaining modes.
+    if (midx == mode_skip_start && best_mode_index >= 0) {
+      switch (best_mbmode.ref_frame[0]) {
+        case INTRA_FRAME:
+          break;
+        case LAST_FRAME:
+          ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+          break;
+        case GOLDEN_FRAME:
+          ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+          break;
+        case ALTREF_FRAME:
+          ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK;
+          break;
+        case NONE:
+        case MAX_REF_FRAMES:
+          assert(0 && "Invalid Reference frame");
+          break;
+      }
+    }
+
+    if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+        (ref_frame_skip_mask[1] & (1 << VPXMAX(0, second_ref_frame))))
+      continue;
+
+    if (mode_skip_mask[ref_frame] & (1 << this_mode))
+      continue;
+
+    // Test best rd so far against threshold for trying this mode.
+    if (best_mode_skippable && sf->schedule_mode_search)
+      mode_threshold[mode_index] <<= 1;
+
+    if (best_rd < mode_threshold[mode_index])
+      continue;
+
+    if (sf->motion_field_mode_search) {
+      const int mi_width  = VPXMIN(num_8x8_blocks_wide_lookup[bsize],
+                                   tile_info->mi_col_end - mi_col);
+      const int mi_height = VPXMIN(num_8x8_blocks_high_lookup[bsize],
+                                   tile_info->mi_row_end - mi_row);
+      const int bsl = mi_width_log2_lookup[bsize];
+      int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
+          + get_chessboard_index(cm->current_video_frame)) & 0x1;
+      MODE_INFO *ref_mi;
+      int const_motion = 1;
+      int skip_ref_frame = !cb_partition_search_ctrl;
+      MV_REFERENCE_FRAME rf = NONE;
+      int_mv ref_mv;
+      ref_mv.as_int = INVALID_MV;
+
+      if ((mi_row - 1) >= tile_info->mi_row_start) {
+        ref_mv = xd->mi[-xd->mi_stride]->mv[0];
+        rf = xd->mi[-xd->mi_stride]->ref_frame[0];
+        for (i = 0; i < mi_width; ++i) {
+          ref_mi = xd->mi[-xd->mi_stride + i];
+          const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) &&
+                          (ref_frame == ref_mi->ref_frame[0]);
+          skip_ref_frame &= (rf == ref_mi->ref_frame[0]);
+        }
+      }
+
+      if ((mi_col - 1) >= tile_info->mi_col_start) {
+        if (ref_mv.as_int == INVALID_MV)
+          ref_mv = xd->mi[-1]->mv[0];
+        if (rf == NONE)
+          rf = xd->mi[-1]->ref_frame[0];
+        for (i = 0; i < mi_height; ++i) {
+          ref_mi = xd->mi[i * xd->mi_stride - 1];
+          const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) &&
+                          (ref_frame == ref_mi->ref_frame[0]);
+          skip_ref_frame &= (rf == ref_mi->ref_frame[0]);
+        }
+      }
+
+      if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV)
+        if (rf > INTRA_FRAME)
+          if (ref_frame != rf)
+            continue;
+
+      if (const_motion)
+        if (this_mode == NEARMV || this_mode == ZEROMV)
+          continue;
+    }
+
+    comp_pred = second_ref_frame > INTRA_FRAME;
+    if (comp_pred) {
+      if (!cpi->allow_comp_inter_inter)
+        continue;
+
+      // Skip compound inter modes if ARF is not available.
+      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
+        continue;
+
+      // Do not allow compound prediction if the segment level reference frame
+      // feature is in use as in this case there can only be one reference.
+      if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+        continue;
+
+      if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+          best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME)
+        continue;
+
+      mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+    } else {
+      if (ref_frame != INTRA_FRAME)
+        mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
+    }
+
+    if (ref_frame == INTRA_FRAME) {
+      if (sf->adaptive_mode_search)
+        if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
+          continue;
+
+      if (this_mode != DC_PRED) {
+        // Disable intra modes other than DC_PRED for blocks with low variance
+        // Threshold for intra skipping based on source variance
+        // TODO(debargha): Specialize the threshold for super block sizes
+        const unsigned int skip_intra_var_thresh = 64;
+        if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+            x->source_variance < skip_intra_var_thresh)
+          continue;
+        // Only search the oblique modes if the best so far is
+        // one of the neighboring directional modes
+        if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+            (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
+          if (best_mode_index >= 0 &&
+              best_mbmode.ref_frame[0] > INTRA_FRAME)
+            continue;
+        }
+        if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+          if (conditional_skipintra(this_mode, best_intra_mode))
+              continue;
+        }
+      }
+    } else {
+      const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
+      if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
+                              this_mode, ref_frames))
+        continue;
+    }
+
+    mi->mode = this_mode;
+    mi->uv_mode = DC_PRED;
+    mi->ref_frame[0] = ref_frame;
+    mi->ref_frame[1] = second_ref_frame;
+    // Evaluate all sub-pel filters irrespective of whether we can use
+    // them for this frame.
+    mi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                          : cm->interp_filter;
+    mi->mv[0].as_int = mi->mv[1].as_int = 0;
+
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+    // Select prediction reference frames.
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred)
+        xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+    }
+
+    if (ref_frame == INTRA_FRAME) {
+      TX_SIZE uv_tx;
+      struct macroblockd_plane *const pd = &xd->plane[1];
+      memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                      NULL, bsize, best_rd);
+      if (rate_y == INT_MAX)
+        continue;
+
+      uv_tx = get_uv_tx_size_impl(mi->tx_size, bsize, pd->subsampling_x,
+                                  pd->subsampling_y);
+      if (rate_uv_intra[uv_tx] == INT_MAX) {
+        choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
+                             &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
+                             &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
+      }
+
+      rate_uv = rate_uv_tokenonly[uv_tx];
+      distortion_uv = dist_uv[uv_tx];
+      skippable = skippable && skip_uv[uv_tx];
+      mi->uv_mode = mode_uv[uv_tx];
+
+      rate2 = rate_y + cpi->mbmode_cost[mi->mode] + rate_uv_intra[uv_tx];
+      if (this_mode != DC_PRED && this_mode != TM_PRED)
+        rate2 += intra_cost_penalty;
+      distortion2 = distortion_y + distortion_uv;
+    } else {
+      this_rd = handle_inter_mode(cpi, x, bsize,
+                                  &rate2, &distortion2, &skippable,
+                                  &rate_y, &rate_uv,
+                                  &disable_skip, frame_mv,
+                                  mi_row, mi_col,
+                                  single_newmv, single_inter_filter,
+                                  single_skippable, &total_sse, best_rd,
+                                  &mask_filter, filter_cache);
+      if (this_rd == INT64_MAX)
+        continue;
+
+      compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
+
+      if (cm->reference_mode == REFERENCE_MODE_SELECT)
+        rate2 += compmode_cost;
+    }
+
+    // Estimate the reference frame signaling cost and add it
+    // to the rolling cost variable.
+    if (comp_pred) {
+      rate2 += ref_costs_comp[ref_frame];
+    } else {
+      rate2 += ref_costs_single[ref_frame];
+    }
+
+    if (!disable_skip) {
+      const vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
+      const int skip_cost0 = vp9_cost_bit(skip_prob, 0);
+      const int skip_cost1 = vp9_cost_bit(skip_prob, 1);
+
+      if (skippable) {
+        // Back out the coefficient coding costs
+        rate2 -= (rate_y + rate_uv);
+
+        // Cost the skip mb case
+        rate2 += skip_cost1;
+      } else if (ref_frame != INTRA_FRAME && !xd->lossless) {
+        if (RDCOST(x->rdmult, x->rddiv,
+                   rate_y + rate_uv + skip_cost0, distortion2) <
+            RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) {
+          // Add in the cost of the no skip flag.
+          rate2 += skip_cost0;
+        } else {
+          // FIXME(rbultje) make this work for splitmv also
+          rate2 += skip_cost1;
+          distortion2 = total_sse;
+          assert(total_sse >= 0);
+          rate2 -= (rate_y + rate_uv);
+          this_skip2 = 1;
+        }
+      } else {
+        // Add in the cost of the no skip flag.
+        rate2 += skip_cost0;
+      }
+
+      // Calculate the final RD estimate for this mode.
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+    }
+
+    // Apply an adjustment to the rd value based on the similarity of the
+    // source variance and reconstructed variance.
+    rd_variance_adjustment(cpi, x, bsize, &this_rd,
+                           ref_frame, x->source_variance);
+
+    if (ref_frame == INTRA_FRAME) {
+    // Keep record of best intra rd
+      if (this_rd < best_intra_rd) {
+        best_intra_rd = this_rd;
+        best_intra_mode = mi->mode;
+      }
+    }
+
+    if (!disable_skip && ref_frame == INTRA_FRAME) {
+      for (i = 0; i < REFERENCE_MODES; ++i)
+        best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+        best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
+    }
+
+    // Did this mode help.. i.e. is it the new best mode
+    if (this_rd < best_rd || x->skip) {
+      int max_plane = MAX_MB_PLANE;
+      if (!mode_excluded) {
+        // Note index of best mode so far
+        best_mode_index = mode_index;
+
+        if (ref_frame == INTRA_FRAME) {
+          /* required for left and above block mv */
+          mi->mv[0].as_int = 0;
+          max_plane = 1;
+        } else {
+          best_pred_sse = x->pred_sse[ref_frame];
+        }
+
+        rd_cost->rate = rate2;
+        rd_cost->dist = distortion2;
+        rd_cost->rdcost = this_rd;
+        best_rd = this_rd;
+        best_mbmode = *mi;
+        best_skip2 = this_skip2;
+        best_mode_skippable = skippable;
+
+        if (!x->select_tx_size)
+          swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
+        memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mi->tx_size],
+               sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+
+        // TODO(debargha): enhance this test with a better distortion prediction
+        // based on qp, activity mask and history
+        if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+            (mode_index > MIN_EARLY_TERM_INDEX)) {
+          int qstep = xd->plane[0].dequant[1];
+          // TODO(debargha): Enhance this by specializing for each mode_index
+          int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            qstep >>= (xd->bd - 8);
+          }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+          if (x->source_variance < UINT_MAX) {
+            const int var_adjust = (x->source_variance < 16);
+            scale -= var_adjust;
+          }
+          if (ref_frame > INTRA_FRAME &&
+              distortion2 * scale < qstep * qstep) {
+            early_term = 1;
+          }
+        }
+      }
+    }
+
+    /* keep record of best compound/single-only prediction */
+    if (!disable_skip && ref_frame != INTRA_FRAME) {
+      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+      if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+        single_rate = rate2 - compmode_cost;
+        hybrid_rate = rate2;
+      } else {
+        single_rate = rate2;
+        hybrid_rate = rate2 + compmode_cost;
+      }
+
+      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+      if (!comp_pred) {
+        if (single_rd < best_pred_rd[SINGLE_REFERENCE])
+          best_pred_rd[SINGLE_REFERENCE] = single_rd;
+      } else {
+        if (single_rd < best_pred_rd[COMPOUND_REFERENCE])
+          best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+      }
+      if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+        best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+
+      /* keep record of best filter type */
+      if (!mode_excluded && cm->interp_filter != BILINEAR) {
+        int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
+                              SWITCHABLE_FILTERS : cm->interp_filter];
+
+        for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+          int64_t adj_rd;
+          if (ref == INT64_MAX)
+            adj_rd = 0;
+          else if (filter_cache[i] == INT64_MAX)
+            // when early termination is triggered, the encoder does not have
+            // access to the rate-distortion cost. it only knows that the cost
+            // should be above the maximum valid value. hence it takes the known
+            // maximum plus an arbitrary constant as the rate-distortion cost.
+            adj_rd = mask_filter - ref + 10;
+          else
+            adj_rd = filter_cache[i] - ref;
+
+          adj_rd += this_rd;
+          best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
+        }
+      }
+    }
+
+    if (early_term)
+      break;
+
+    if (x->skip && !comp_pred)
+      break;
+  }
+
+  // The inter modes' rate costs are not calculated precisely in some cases.
+  // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
+  // ZEROMV. Here, checks are added for those cases, and the mode decisions
+  // are corrected.
+  if (best_mbmode.mode == NEWMV) {
+    const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0],
+        best_mbmode.ref_frame[1]};
+    int comp_pred_mode = refs[1] > INTRA_FRAME;
+
+    if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+        ((comp_pred_mode && frame_mv[NEARESTMV][refs[1]].as_int ==
+            best_mbmode.mv[1].as_int) || !comp_pred_mode))
+      best_mbmode.mode = NEARESTMV;
+    else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+        ((comp_pred_mode && frame_mv[NEARMV][refs[1]].as_int ==
+            best_mbmode.mv[1].as_int) || !comp_pred_mode))
+      best_mbmode.mode = NEARMV;
+    else if (best_mbmode.mv[0].as_int == 0 &&
+        ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) || !comp_pred_mode))
+      best_mbmode.mode = ZEROMV;
+  }
+
+  if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
+
+  // If we used an estimate for the uv intra rd in the loop above...
+  if (sf->use_uv_intra_rd_estimate) {
+    // Do Intra UV best rd mode selection if best mode choice above was intra.
+    if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
+      TX_SIZE uv_tx_size;
+      *mi = best_mbmode;
+      uv_tx_size = get_uv_tx_size(mi, &xd->plane[1]);
+      rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
+                              &rate_uv_tokenonly[uv_tx_size],
+                              &dist_uv[uv_tx_size],
+                              &skip_uv[uv_tx_size],
+                              bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize,
+                              uv_tx_size);
+    }
+  }
+
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter == best_mbmode.interp_filter) ||
+         !is_inter_block(&best_mbmode));
+
+  if (!cpi->rc.is_src_frame_alt_ref)
+    vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                              sf->adaptive_rd_thresh, bsize, best_mode_index);
+
+  // macroblock modes
+  *mi = best_mbmode;
+  x->skip |= best_skip2;
+
+  for (i = 0; i < REFERENCE_MODES; ++i) {
+    if (best_pred_rd[i] == INT64_MAX)
+      best_pred_diff[i] = INT_MIN;
+    else
+      best_pred_diff[i] = best_rd - best_pred_rd[i];
+  }
+
+  if (!x->skip) {
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+      if (best_filter_rd[i] == INT64_MAX)
+        best_filter_diff[i] = 0;
+      else
+        best_filter_diff[i] = best_rd - best_filter_rd[i];
+    }
+    if (cm->interp_filter == SWITCHABLE)
+      assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
+  } else {
+    vp9_zero(best_filter_diff);
+  }
+
+  // TODO(yunqingwang): Moving this line in front of the above best_filter_diff
+  // updating code causes PSNR loss. Need to figure out the conflict.
+  x->skip |= best_mode_skippable;
+
+  if (!x->skip && !x->select_tx_size) {
+    int has_high_freq_coeff = 0;
+    int plane;
+    int max_plane = is_inter_block(xd->mi[0])
+                        ? MAX_MB_PLANE : 1;
+    for (plane = 0; plane < max_plane; ++plane) {
+      x->plane[plane].eobs = ctx->eobs_pbuf[plane][1];
+      has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane);
+    }
+
+    for (plane = max_plane; plane < MAX_MB_PLANE; ++plane) {
+      x->plane[plane].eobs = ctx->eobs_pbuf[plane][2];
+      has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane);
+    }
+
+    best_mode_skippable |= !has_high_freq_coeff;
+  }
+
+  assert(best_mode_index >= 0);
+
+  store_coding_context(x, ctx, best_mode_index, best_pred_diff,
+                       best_filter_diff, best_mode_skippable);
+}
+
+// RD mode decision for a block whose segment has SEG_LVL_SKIP active (asserted
+// below). No mode search is run: the block is forced to ZEROMV on LAST_FRAME
+// with x->skip set, distortion is taken as zero, and only signaling rate is
+// costed. rd_cost receives that estimate unless it is not below
+// best_rd_so_far; then rd_cost->rate is INT_MAX, rd_cost->rdcost is INT64_MAX,
+// and the calls that update tile_data / ctx below are skipped.
+void vp9_rd_pick_inter_mode_sb_seg_skip(
+    VP9_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, RD_COST *rd_cost,
+    BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) {
+  VP9_COMMON *const cm = &cpi->common;
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  const unsigned char segment_id = mi->segment_id;
+  const int comp_pred = 0;  // Only a single reference is used here.
+  const int64_t distortion2 = 0;
+  unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+  int64_t best_pred_diff[REFERENCE_MODES];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  vpx_prob comp_mode_p;
+  INTERP_FILTER best_filter = EIGHTTAP;
+  int64_t this_rd;
+  int rate2 = 0;
+  int ref, filter;
+
+  x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+  estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+                           &comp_mode_p);
+
+  // Reset pred_sse for every reference and pred_mv_sad from LAST_FRAME up.
+  for (ref = 0; ref < MAX_REF_FRAMES; ++ref) {
+    x->pred_sse[ref] = INT_MAX;
+    if (ref >= LAST_FRAME)
+      x->pred_mv_sad[ref] = INT_MAX;
+  }
+
+  // Start from the INT_MAX rate that the rejection path below also reports.
+  rd_cost->rate = INT_MAX;
+
+  assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
+
+  // The only candidate: zero motion from LAST_FRAME, flagged as skipped.
+  x->skip = 1;
+  mi->mode = ZEROMV;
+  mi->uv_mode = DC_PRED;
+  mi->ref_frame[0] = LAST_FRAME;
+  mi->ref_frame[1] = NONE;
+  mi->mv[0].as_int = 0;
+
+  if (cm->interp_filter == SWITCHABLE) {
+    // At or above the variance threshold, keep the lowest-rate filter.
+    if (x->source_variance >= sf->disable_filter_search_var_thresh) {
+      int lowest_rate = INT_MAX;
+      for (filter = 0; filter < SWITCHABLE_FILTERS; ++filter) {
+        int filter_rate;
+        // Candidate goes into mi (reachable through xd) before the rate query.
+        mi->interp_filter = filter;
+        filter_rate = vp9_get_switchable_rate(cpi, xd);
+        if (filter_rate < lowest_rate) {
+          lowest_rate = filter_rate;
+          best_filter = mi->interp_filter;
+        }
+      }
+    }
+    mi->interp_filter = best_filter;
+    rate2 += vp9_get_switchable_rate(cpi, xd);
+  } else {
+    // No per-block filter rate is added for a fixed frame-level filter.
+    mi->interp_filter = cm->interp_filter;
+  }
+
+  if (cm->reference_mode == REFERENCE_MODE_SELECT)
+    rate2 += vp9_cost_bit(comp_mode_p, comp_pred);
+
+  // Add the reference frame signaling cost, then form the RD estimate.
+  rate2 += ref_costs_single[LAST_FRAME];
+  this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+  rd_cost->dist = distortion2;
+  if (this_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
+  rd_cost->rate = rate2;
+  rd_cost->rdcost = this_rd;
+
+  assert(cm->interp_filter == SWITCHABLE ||
+         cm->interp_filter == mi->interp_filter);
+
+  vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                            sf->adaptive_rd_thresh, bsize, THR_ZEROMV);
+  vp9_zero(best_pred_diff);
+  vp9_zero(best_filter_diff);
+
+  if (!x->select_tx_size)
+    swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
+  store_coding_context(x, ctx, THR_ZEROMV, best_pred_diff, best_filter_diff, 0);
+}
+
+void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
+                                   TileDataEnc *tile_data,
+                                   MACROBLOCK *x,
+                                   int mi_row, int mi_col,
+                                   RD_COST *rd_cost,
+                                   BLOCK_SIZE bsize,
+                                   PICK_MODE_CONTEXT *ctx,
+                                   int64_t best_rd_so_far) {
+  VP9_COMMON *const cm = &cpi->common;
+  RD_OPT *const rd_opt = &cpi->rd;
+  SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  const struct segmentation *const seg = &cm->seg;
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+  unsigned char segment_id = mi->segment_id;
+  int comp_pred, i;
+  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+  int64_t best_rd = best_rd_so_far;
+  int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
+  int64_t best_pred_diff[REFERENCE_MODES];
+  int64_t best_pred_rd[REFERENCE_MODES];
+  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  MODE_INFO best_mbmode;
+  int ref_index, best_ref_index = 0;
+  unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+  vpx_prob comp_mode_p;
+  INTERP_FILTER tmp_best_filter = SWITCHABLE;
+  int rate_uv_intra, rate_uv_tokenonly;
+  int64_t dist_uv;
+  int skip_uv;
+  PREDICTION_MODE mode_uv = DC_PRED;
+  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
+    cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  int_mv seg_mvs[4][MAX_REF_FRAMES];
+  b_mode_info best_bmodes[4];
+  int best_skip2 = 0;
+  int ref_frame_skip_mask[2] = { 0 };
+  int64_t mask_filter = 0;
+  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+  int internal_active_edge =
+    vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi);
+
+  x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+  memset(x->zcoeff_blk[TX_4X4], 0, 4);
+  vp9_zero(best_mbmode);
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    filter_cache[i] = INT64_MAX;
+
+  for (i = 0; i < 4; i++) {
+    int j;
+    for (j = 0; j < MAX_REF_FRAMES; j++)
+      seg_mvs[i][j].as_int = INVALID_MV;
+  }
+
+  estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+                           &comp_mode_p);
+
+  for (i = 0; i < REFERENCE_MODES; ++i)
+    best_pred_rd[i] = INT64_MAX;
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+    best_filter_rd[i] = INT64_MAX;
+  rate_uv_intra = INT_MAX;
+
+  rd_cost->rate = INT_MAX;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+                         frame_mv[NEARESTMV], frame_mv[NEARMV],
+                         yv12_mb);
+    } else {
+      ref_frame_skip_mask[0] |= (1 << ref_frame);
+      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+    }
+    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+    frame_mv[ZEROMV][ref_frame].as_int = 0;
+  }
+
+  for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
+    int mode_excluded = 0;
+    int64_t this_rd = INT64_MAX;
+    int disable_skip = 0;
+    int compmode_cost = 0;
+    int rate2 = 0, rate_y = 0, rate_uv = 0;
+    int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+    int skippable = 0;
+    int i;
+    int this_skip2 = 0;
+    int64_t total_sse = INT_MAX;
+    int early_term = 0;
+    struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+
+    ref_frame = vp9_ref_order[ref_index].ref_frame[0];
+    second_ref_frame = vp9_ref_order[ref_index].ref_frame[1];
+
+#if CONFIG_BETTER_HW_COMPATIBILITY
+    // forbid 8X4 and 4X8 partitions if any reference frame is scaled.
+    if (bsize == BLOCK_8X4 || bsize == BLOCK_4X8) {
+      int ref_scaled = vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf);
+      if (second_ref_frame > INTRA_FRAME)
+        ref_scaled += vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf);
+      if (ref_scaled)
+        continue;
+    }
+#endif
+    // Look at the reference frame of the best mode so far and set the
+    // skip mask to look at a subset of the remaining modes.
+    if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) {
+      if (ref_index == 3) {
+        switch (best_mbmode.ref_frame[0]) {
+          case INTRA_FRAME:
+            break;
+          case LAST_FRAME:
+            ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+            break;
+          case GOLDEN_FRAME:
+            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+            break;
+          case ALTREF_FRAME:
+            ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME);
+            break;
+          case NONE:
+          case MAX_REF_FRAMES:
+            assert(0 && "Invalid Reference frame");
+            break;
+        }
+      }
+    }
+
+    if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+        (ref_frame_skip_mask[1] & (1 << VPXMAX(0, second_ref_frame))))
+      continue;
+
+    // Test best rd so far against threshold for trying this mode.
+    if (!internal_active_edge &&
+        rd_less_than_thresh(best_rd,
+                            rd_opt->threshes[segment_id][bsize][ref_index],
+                            tile_data->thresh_freq_fact[bsize][ref_index]))
+      continue;
+
+    comp_pred = second_ref_frame > INTRA_FRAME;
+    if (comp_pred) {
+      if (!cpi->allow_comp_inter_inter)
+        continue;
+      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
+        continue;
+      // Do not allow compound prediction if the segment level reference frame
+      // feature is in use as in this case there can only be one reference.
+      if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+        continue;
+
+      if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+          best_mbmode.ref_frame[0] == INTRA_FRAME)
+        continue;
+    }
+
+    if (comp_pred)
+      mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+    else if (ref_frame != INTRA_FRAME)
+      mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
+
+    // If the segment reference frame feature is enabled....
+    // then do nothing if the current ref frame is not allowed..
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+      continue;
+    // Disable this drop out case if the ref frame
+    // segment level feature is enabled for this segment. This is to
+    // prevent the possibility that we end up unable to pick any mode.
+    } else if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+      // unless ARNR filtering is enabled in which case we want
+      // an unfiltered alternative. We allow near/nearest as well
+      // because they may result in zero-zero MVs but be cheaper.
+      if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+        continue;
+    }
+
+    mi->tx_size = TX_4X4;
+    mi->uv_mode = DC_PRED;
+    mi->ref_frame[0] = ref_frame;
+    mi->ref_frame[1] = second_ref_frame;
+    // Evaluate all sub-pel filters irrespective of whether we can use
+    // them for this frame.
+    mi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                        : cm->interp_filter;
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+    // Select prediction reference frames.
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred)
+        xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+    }
+
+    if (ref_frame == INTRA_FRAME) {
+      int rate;
+      if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
+                                       &distortion_y, best_rd) >= best_rd)
+        continue;
+      rate2 += rate;
+      rate2 += intra_cost_penalty;
+      distortion2 += distortion_y;
+
+      if (rate_uv_intra == INT_MAX) {
+        choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4,
+                             &rate_uv_intra,
+                             &rate_uv_tokenonly,
+                             &dist_uv, &skip_uv,
+                             &mode_uv);
+      }
+      rate2 += rate_uv_intra;
+      rate_uv = rate_uv_tokenonly;
+      distortion2 += dist_uv;
+      distortion_uv = dist_uv;
+      mi->uv_mode = mode_uv;
+    } else {
+      int rate;
+      int64_t distortion;
+      int64_t this_rd_thresh;
+      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
+      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
+      int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
+      int tmp_best_skippable = 0;
+      int switchable_filter_index;
+      int_mv *second_ref = comp_pred ?
+                             &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL;
+      b_mode_info tmp_best_bmodes[16];
+      MODE_INFO tmp_best_mbmode;
+      BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
+      int pred_exists = 0;
+      int uv_skippable;
+
+      YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
+      int ref;
+
+      for (ref = 0; ref < 2; ++ref) {
+        scaled_ref_frame[ref] = mi->ref_frame[ref] > INTRA_FRAME ?
+            vp9_get_scaled_ref_frame(cpi, mi->ref_frame[ref]) : NULL;
+
+        if (scaled_ref_frame[ref]) {
+          int i;
+          // Swap out the reference frame for a version that's been scaled to
+          // match the resolution of the current frame, allowing the existing
+          // motion search code to be used without additional modifications.
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            backup_yv12[ref][i] = xd->plane[i].pre[ref];
+          vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+                               NULL);
+        }
+      }
+
+      this_rd_thresh = (ref_frame == LAST_FRAME) ?
+          rd_opt->threshes[segment_id][bsize][THR_LAST] :
+          rd_opt->threshes[segment_id][bsize][THR_ALTR];
+      this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
+      rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+        filter_cache[i] = INT64_MAX;
+
+      if (cm->interp_filter != BILINEAR) {
+        tmp_best_filter = EIGHTTAP;
+        if (x->source_variance < sf->disable_filter_search_var_thresh) {
+          tmp_best_filter = EIGHTTAP;
+        } else if (sf->adaptive_pred_interp_filter == 1 &&
+                   ctx->pred_interp_filter < SWITCHABLE) {
+          tmp_best_filter = ctx->pred_interp_filter;
+        } else if (sf->adaptive_pred_interp_filter == 2) {
+          tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
+                              ctx->pred_interp_filter : 0;
+        } else {
+          for (switchable_filter_index = 0;
+               switchable_filter_index < SWITCHABLE_FILTERS;
+               ++switchable_filter_index) {
+            int newbest, rs;
+            int64_t rs_rd;
+            MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+            mi->interp_filter = switchable_filter_index;
+            tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
+                                              &mbmi_ext->ref_mvs[ref_frame][0],
+                                              second_ref, best_yrd, &rate,
+                                              &rate_y, &distortion,
+                                              &skippable, &total_sse,
+                                              (int) this_rd_thresh, seg_mvs,
+                                              bsi, switchable_filter_index,
+                                              mi_row, mi_col);
+
+            if (tmp_rd == INT64_MAX)
+              continue;
+            rs = vp9_get_switchable_rate(cpi, xd);
+            rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+            filter_cache[switchable_filter_index] = tmp_rd;
+            filter_cache[SWITCHABLE_FILTERS] =
+                VPXMIN(filter_cache[SWITCHABLE_FILTERS], tmp_rd + rs_rd);
+            if (cm->interp_filter == SWITCHABLE)
+              tmp_rd += rs_rd;
+
+            mask_filter = VPXMAX(mask_filter, tmp_rd);
+
+            newbest = (tmp_rd < tmp_best_rd);
+            if (newbest) {
+              tmp_best_filter = mi->interp_filter;
+              tmp_best_rd = tmp_rd;
+            }
+            if ((newbest && cm->interp_filter == SWITCHABLE) ||
+                (mi->interp_filter == cm->interp_filter &&
+                 cm->interp_filter != SWITCHABLE)) {
+              tmp_best_rdu = tmp_rd;
+              tmp_best_rate = rate;
+              tmp_best_ratey = rate_y;
+              tmp_best_distortion = distortion;
+              tmp_best_sse = total_sse;
+              tmp_best_skippable = skippable;
+              tmp_best_mbmode = *mi;
+              for (i = 0; i < 4; i++) {
+                tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
+                x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
+              }
+              pred_exists = 1;
+              if (switchable_filter_index == 0 &&
+                  sf->use_rd_breakout &&
+                  best_rd < INT64_MAX) {
+                if (tmp_best_rdu / 2 > best_rd) {
+                  // skip searching the other filters if the first is
+                  // already substantially larger than the best so far
+                  tmp_best_filter = mi->interp_filter;
+                  tmp_best_rdu = INT64_MAX;
+                  break;
+                }
+              }
+            }
+          }  // switchable_filter_index loop
+        }
+      }
+
+      if (tmp_best_rdu == INT64_MAX && pred_exists)
+        continue;
+
+      mi->interp_filter = (cm->interp_filter == SWITCHABLE ?
+                           tmp_best_filter : cm->interp_filter);
+      if (!pred_exists) {
+        // Handles the special case when a filter that is not in the
+        // switchable list (bilinear, 6-tap) is indicated at the frame level
+        tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
+                                          &x->mbmi_ext->ref_mvs[ref_frame][0],
+                                          second_ref, best_yrd, &rate, &rate_y,
+                                          &distortion, &skippable, &total_sse,
+                                          (int) this_rd_thresh, seg_mvs, bsi, 0,
+                                          mi_row, mi_col);
+        if (tmp_rd == INT64_MAX)
+          continue;
+      } else {
+        total_sse = tmp_best_sse;
+        rate = tmp_best_rate;
+        rate_y = tmp_best_ratey;
+        distortion = tmp_best_distortion;
+        skippable = tmp_best_skippable;
+        *mi = tmp_best_mbmode;
+        for (i = 0; i < 4; i++)
+          xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
+      }
+
+      rate2 += rate;
+      distortion2 += distortion;
+
+      if (cm->interp_filter == SWITCHABLE)
+        rate2 += vp9_get_switchable_rate(cpi, xd);
+
+      if (!mode_excluded)
+        mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
+                                  : cm->reference_mode == COMPOUND_REFERENCE;
+
+      compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
+
+      tmp_best_rdu =
+          best_rd - VPXMIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
+                           RDCOST(x->rdmult, x->rddiv, 0, total_sse));
+
+      if (tmp_best_rdu > 0) {
+        // If even the 'Y' rd value of split is higher than best so far
+        // then don't bother looking at UV
+        vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
+                                        BLOCK_8X8);
+        memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+        if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+                              &uv_sse, BLOCK_8X8, tmp_best_rdu)) {
+          for (ref = 0; ref < 2; ++ref) {
+            if (scaled_ref_frame[ref]) {
+              int i;
+              for (i = 0; i < MAX_MB_PLANE; ++i)
+                xd->plane[i].pre[ref] = backup_yv12[ref][i];
+            }
+          }
+          continue;
+        }
+
+        rate2 += rate_uv;
+        distortion2 += distortion_uv;
+        skippable = skippable && uv_skippable;
+        total_sse += uv_sse;
+      }
+
+      for (ref = 0; ref < 2; ++ref) {
+        if (scaled_ref_frame[ref]) {
+          // Restore the prediction frame pointers to their unscaled versions.
+          int i;
+          for (i = 0; i < MAX_MB_PLANE; ++i)
+            xd->plane[i].pre[ref] = backup_yv12[ref][i];
+        }
+      }
+    }
+
+    if (cm->reference_mode == REFERENCE_MODE_SELECT)
+      rate2 += compmode_cost;
+
+    // Estimate the reference frame signaling cost and add it
+    // to the rolling cost variable.
+    if (second_ref_frame > INTRA_FRAME) {
+      rate2 += ref_costs_comp[ref_frame];
+    } else {
+      rate2 += ref_costs_single[ref_frame];
+    }
+
+    if (!disable_skip) {
+      const vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
+      const int skip_cost0 = vp9_cost_bit(skip_prob, 0);
+      const int skip_cost1 = vp9_cost_bit(skip_prob, 1);
+
+      // Skip is never coded at the segment level for sub8x8 blocks and instead
+      // always coded in the bitstream at the mode info level.
+      if (ref_frame != INTRA_FRAME && !xd->lossless) {
+        if (RDCOST(x->rdmult, x->rddiv,
+                   rate_y + rate_uv + skip_cost0, distortion2) <
+            RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) {
+          // Add in the cost of the no skip flag.
+          rate2 += skip_cost0;
+        } else {
+          // FIXME(rbultje) make this work for splitmv also
+          rate2 += skip_cost1;
+          distortion2 = total_sse;
+          assert(total_sse >= 0);
+          rate2 -= (rate_y + rate_uv);
+          rate_y = 0;
+          rate_uv = 0;
+          this_skip2 = 1;
+        }
+      } else {
+        // Add in the cost of the no skip flag.
+        rate2 += skip_cost0;
+      }
+
+      // Calculate the final RD estimate for this mode.
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+    }
+
+    if (!disable_skip && ref_frame == INTRA_FRAME) {
+      for (i = 0; i < REFERENCE_MODES; ++i)
+        best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+        best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
+    }
+
+    // Did this mode help.. i.e. is it the new best mode
+    if (this_rd < best_rd || x->skip) {
+      if (!mode_excluded) {
+        int max_plane = MAX_MB_PLANE;
+        // Note index of best mode so far
+        best_ref_index = ref_index;
+
+        if (ref_frame == INTRA_FRAME) {
+          /* required for left and above block mv */
+          mi->mv[0].as_int = 0;
+          max_plane = 1;
+        }
+
+        rd_cost->rate = rate2;
+        rd_cost->dist = distortion2;
+        rd_cost->rdcost = this_rd;
+        best_rd = this_rd;
+        best_yrd = best_rd -
+                   RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
+        best_mbmode = *mi;
+        best_skip2 = this_skip2;
+        if (!x->select_tx_size)
+          swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
+        memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
+               sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+
+        for (i = 0; i < 4; i++)
+          best_bmodes[i] = xd->mi[0]->bmi[i];
+
+        // TODO(debargha): enhance this test with a better distortion prediction
+        // based on qp, activity mask and history
+        if ((sf->mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+            (ref_index > MIN_EARLY_TERM_INDEX)) {
+          int qstep = xd->plane[0].dequant[1];
+          // TODO(debargha): Enhance this by specializing for each mode_index
+          int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            qstep >>= (xd->bd - 8);
+          }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+          if (x->source_variance < UINT_MAX) {
+            const int var_adjust = (x->source_variance < 16);
+            scale -= var_adjust;
+          }
+          if (ref_frame > INTRA_FRAME &&
+              distortion2 * scale < qstep * qstep) {
+            early_term = 1;
+          }
+        }
+      }
+    }
+
+    /* keep record of best compound/single-only prediction */
+    if (!disable_skip && ref_frame != INTRA_FRAME) {
+      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+      if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+        single_rate = rate2 - compmode_cost;
+        hybrid_rate = rate2;
+      } else {
+        single_rate = rate2;
+        hybrid_rate = rate2 + compmode_cost;
+      }
+
+      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+      if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE])
+        best_pred_rd[SINGLE_REFERENCE] = single_rd;
+      else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE])
+        best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+
+      if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+        best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+    }
+
+    /* keep record of best filter type */
+    if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
+        cm->interp_filter != BILINEAR) {
+      int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
+                              SWITCHABLE_FILTERS : cm->interp_filter];
+      int64_t adj_rd;
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+        if (ref == INT64_MAX)
+          adj_rd = 0;
+        else if (filter_cache[i] == INT64_MAX)
+          // when early termination is triggered, the encoder does not have
+          // access to the rate-distortion cost. it only knows that the cost
+          // should be above the maximum valid value. hence it takes the known
+          // maximum plus an arbitrary constant as the rate-distortion cost.
+          adj_rd = mask_filter - ref + 10;
+        else
+          adj_rd = filter_cache[i] - ref;
+
+        adj_rd += this_rd;
+        best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
+      }
+    }
+
+    if (early_term)
+      break;
+
+    if (x->skip && !comp_pred)
+      break;
+  }
+
+  if (best_rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
+
+  // If we used an estimate for the uv intra rd in the loop above...
+  if (sf->use_uv_intra_rd_estimate) {
+    // Do Intra UV best rd mode selection if best mode choice above was intra.
+    if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
+      *mi = best_mbmode;
+      rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra,
+                              &rate_uv_tokenonly,
+                              &dist_uv,
+                              &skip_uv,
+                              BLOCK_8X8, TX_4X4);
+    }
+  }
+
+  if (best_rd == INT64_MAX) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->dist = INT64_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
+
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter == best_mbmode.interp_filter) ||
+         !is_inter_block(&best_mbmode));
+
+  vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+                            sf->adaptive_rd_thresh, bsize, best_ref_index);
+
+  // macroblock modes
+  *mi = best_mbmode;
+  x->skip |= best_skip2;
+  if (!is_inter_block(&best_mbmode)) {
+    for (i = 0; i < 4; i++)
+      xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
+  } else {
+    for (i = 0; i < 4; ++i)
+      memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
+
+    mi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
+    mi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
+  }
+
+  for (i = 0; i < REFERENCE_MODES; ++i) {
+    if (best_pred_rd[i] == INT64_MAX)
+      best_pred_diff[i] = INT_MIN;
+    else
+      best_pred_diff[i] = best_rd - best_pred_rd[i];
+  }
+
+  if (!x->skip) {
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+      if (best_filter_rd[i] == INT64_MAX)
+        best_filter_diff[i] = 0;
+      else
+        best_filter_diff[i] = best_rd - best_filter_rd[i];
+    }
+    if (cm->interp_filter == SWITCHABLE)
+      assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
+  } else {
+    vp9_zero(best_filter_diff);
+  }
+
+  store_coding_context(x, ctx, best_ref_index,
+                       best_pred_diff, best_filter_diff, 0);
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_rdopt.h b/libs/libvpx/vp9/encoder/vp9_rdopt.h
new file mode 100644
index 0000000000..253e4a02df
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_rdopt.h
@@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_RDOPT_H_
+#define VP9_ENCODER_VP9_RDOPT_H_
+
+#include "vp9/common/vp9_blockd.h"
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_context_tree.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward declarations for the types the prototypes below take by pointer.
+// TileDataEnc is declared here as well: a struct tag first named inside a
+// parameter list only has prototype scope, which ties users to include order.
+struct TileInfo;
+struct TileDataEnc;
+struct VP9_COMP;
+struct macroblock;
+struct RD_COST;
+
+void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
+                               struct RD_COST *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+
+void vp9_rd_pick_inter_mode_sb(
+    struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x,
+    int mi_row, int mi_col, struct RD_COST *rd_cost, BLOCK_SIZE bsize,
+    PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+void vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
+                                        struct TileDataEnc *tile_data,
+                                        struct macroblock *x,
+                                        struct RD_COST *rd_cost,
+                                        BLOCK_SIZE bsize,
+                                        PICK_MODE_CONTEXT *ctx,
+                                        int64_t best_rd_so_far);
+
+int vp9_internal_image_edge(struct VP9_COMP *cpi);
+int vp9_active_h_edge(struct VP9_COMP *cpi, int mi_row, int mi_step);
+int vp9_active_v_edge(struct VP9_COMP *cpi, int mi_col, int mi_step);
+int vp9_active_edge_sb(struct VP9_COMP *cpi, int mi_row, int mi_col);
+
+// Sub-8x8 inter mode search. Reports rd_cost->rate = INT_MAX and
+// rd_cost->rdcost = INT64_MAX when no mode beats best_rd_so_far.
+void vp9_rd_pick_inter_mode_sub8x8(
+    struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x,
+    int mi_row, int mi_col, struct RD_COST *rd_cost, BLOCK_SIZE bsize,
+    PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_RDOPT_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_resize.c b/libs/libvpx/vp9/encoder/vp9_resize.c
new file mode 100644
index 0000000000..f4d0db4d5a
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_resize.c
@@ -0,0 +1,929 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_resize.h"
+
+#define FILTER_BITS               7  // Right shift applied to filtered sums.
+
+#define INTERP_TAPS               8  // Coefficients per interp_kernel.
+#define SUBPEL_BITS               5  // Top fraction bits that pick a kernel.
+#define SUBPEL_MASK               ((1 << SUBPEL_BITS) - 1)
+#define INTERP_PRECISION_BITS     32  // Fraction bits of a sample position.
+
+typedef int16_t interp_kernel[INTERP_TAPS];
+
+// 0.5-band kernels, one row per 1/32-pel phase; integer pels filtered too.
+static const interp_kernel filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
+  {-3,  0, 35, 64, 35,  0, -3, 0},
+  {-3, -1, 34, 64, 36,  1, -3, 0},
+  {-3, -1, 32, 64, 38,  1, -3, 0},
+  {-2, -2, 31, 63, 39,  2, -3, 0},
+  {-2, -2, 29, 63, 41,  2, -3, 0},
+  {-2, -2, 28, 63, 42,  3, -4, 0},
+  {-2, -3, 27, 63, 43,  4, -4, 0},
+  {-2, -3, 25, 62, 45,  5, -4, 0},
+  {-2, -3, 24, 62, 46,  5, -4, 0},
+  {-2, -3, 23, 61, 47,  6, -4, 0},
+  {-2, -3, 21, 60, 49,  7, -4, 0},
+  {-1, -4, 20, 60, 50,  8, -4, -1},
+  {-1, -4, 19, 59, 51,  9, -4, -1},
+  {-1, -4, 17, 58, 52, 10, -4, 0},
+  {-1, -4, 16, 57, 53, 12, -4, -1},
+  {-1, -4, 15, 56, 54, 13, -4, -1},
+  {-1, -4, 14, 55, 55, 14, -4, -1},
+  {-1, -4, 13, 54, 56, 15, -4, -1},
+  {-1, -4, 12, 53, 57, 16, -4, -1},
+  {0, -4, 10, 52, 58, 17, -4, -1},
+  {-1, -4,  9, 51, 59, 19, -4, -1},
+  {-1, -4,  8, 50, 60, 20, -4, -1},
+  {0, -4,  7, 49, 60, 21, -3, -2},
+  {0, -4,  6, 47, 61, 23, -3, -2},
+  {0, -4,  5, 46, 62, 24, -3, -2},
+  {0, -4,  5, 45, 62, 25, -3, -2},
+  {0, -4,  4, 43, 63, 27, -3, -2},
+  {0, -4,  3, 42, 63, 28, -2, -2},
+  {0, -3,  2, 41, 63, 29, -2, -2},
+  {0, -3,  2, 39, 63, 31, -2, -2},
+  {0, -3,  1, 38, 64, 32, -1, -3},
+  {0, -3,  1, 36, 64, 34, -1, -3}
+};
+
+// 0.625-band kernels, one row per 1/32-pel phase; integer pels filtered too.
+static const interp_kernel filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
+  {-1, -8, 33, 80, 33, -8, -1, 0},
+  {-1, -8, 30, 80, 35, -8, -1, 1},
+  {-1, -8, 28, 80, 37, -7, -2, 1},
+  {0, -8, 26, 79, 39, -7, -2, 1},
+  {0, -8, 24, 79, 41, -7, -2, 1},
+  {0, -8, 22, 78, 43, -6, -2, 1},
+  {0, -8, 20, 78, 45, -5, -3, 1},
+  {0, -8, 18, 77, 48, -5, -3, 1},
+  {0, -8, 16, 76, 50, -4, -3, 1},
+  {0, -8, 15, 75, 52, -3, -4, 1},
+  {0, -7, 13, 74, 54, -3, -4, 1},
+  {0, -7, 11, 73, 56, -2, -4, 1},
+  {0, -7, 10, 71, 58, -1, -4, 1},
+  {1, -7,  8, 70, 60,  0, -5, 1},
+  {1, -6,  6, 68, 62,  1, -5, 1},
+  {1, -6,  5, 67, 63,  2, -5, 1},
+  {1, -6,  4, 65, 65,  4, -6, 1},
+  {1, -5,  2, 63, 67,  5, -6, 1},
+  {1, -5,  1, 62, 68,  6, -6, 1},
+  {1, -5,  0, 60, 70,  8, -7, 1},
+  {1, -4, -1, 58, 71, 10, -7, 0},
+  {1, -4, -2, 56, 73, 11, -7, 0},
+  {1, -4, -3, 54, 74, 13, -7, 0},
+  {1, -4, -3, 52, 75, 15, -8, 0},
+  {1, -3, -4, 50, 76, 16, -8, 0},
+  {1, -3, -5, 48, 77, 18, -8, 0},
+  {1, -3, -5, 45, 78, 20, -8, 0},
+  {1, -2, -6, 43, 78, 22, -8, 0},
+  {1, -2, -7, 41, 79, 24, -8, 0},
+  {1, -2, -7, 39, 79, 26, -8, 0},
+  {1, -2, -7, 37, 80, 28, -8, -1},
+  {1, -1, -8, 35, 80, 30, -8, -1},
+};
+
+// 0.75-band kernels, one row per 1/32-pel phase; integer pels filtered too.
+static const interp_kernel filteredinterp_filters750[(1 << SUBPEL_BITS)] = {
+  {2, -11,  25,  96,  25, -11,   2, 0},
+  {2, -11,  22,  96,  28, -11,   2, 0},
+  {2, -10,  19,  95,  31, -11,   2, 0},
+  {2, -10,  17,  95,  34, -12,   2, 0},
+  {2,  -9,  14,  94,  37, -12,   2, 0},
+  {2,  -8,  12,  93,  40, -12,   1, 0},
+  {2,  -8,   9,  92,  43, -12,   1, 1},
+  {2,  -7,   7,  91,  46, -12,   1, 0},
+  {2,  -7,   5,  90,  49, -12,   1, 0},
+  {2,  -6,   3,  88,  52, -12,   0, 1},
+  {2,  -5,   1,  86,  55, -12,   0, 1},
+  {2,  -5,  -1,  84,  58, -11,   0, 1},
+  {2,  -4,  -2,  82,  61, -11,  -1, 1},
+  {2,  -4,  -4,  80,  64, -10,  -1, 1},
+  {1, -3, -5, 77, 67, -9, -1, 1},
+  {1, -3, -6, 75, 70, -8, -2, 1},
+  {1, -2, -7, 72, 72, -7, -2, 1},
+  {1, -2, -8, 70, 75, -6, -3, 1},
+  {1, -1, -9, 67, 77, -5, -3, 1},
+  {1,  -1, -10,  64,  80,  -4,  -4, 2},
+  {1,  -1, -11,  61,  82,  -2,  -4, 2},
+  {1,   0, -11,  58,  84,  -1,  -5, 2},
+  {1,   0, -12,  55,  86,   1,  -5, 2},
+  {1,   0, -12,  52,  88,   3,  -6, 2},
+  {0,   1, -12,  49,  90,   5,  -7, 2},
+  {0,   1, -12,  46,  91,   7,  -7, 2},
+  {1,   1, -12,  43,  92,   9,  -8, 2},
+  {0,   1, -12,  40,  93,  12,  -8, 2},
+  {0,   2, -12,  37,  94,  14,  -9, 2},
+  {0,   2, -12,  34,  95,  17, -10, 2},
+  {0,   2, -11,  31,  95,  19, -10, 2},
+  {0,   2, -11,  28,  96,  22, -11, 2}
+};
+
+// 0.875-band kernels, one row per 1/32-pel phase; integer pels filtered too.
+static const interp_kernel filteredinterp_filters875[(1 << SUBPEL_BITS)] = {
+  {3,  -8,  13, 112,  13,  -8,   3, 0},
+  {3,  -7,  10, 112,  17,  -9,   3, -1},
+  {2,  -6,   7, 111,  21,  -9,   3, -1},
+  {2,  -5,   4, 111,  24, -10,   3, -1},
+  {2,  -4,   1, 110,  28, -11,   3, -1},
+  {1,  -3,  -1, 108,  32, -12,   4, -1},
+  {1,  -2,  -3, 106,  36, -13,   4, -1},
+  {1,  -1,  -6, 105,  40, -14,   4, -1},
+  {1,  -1,  -7, 102,  44, -14,   4, -1},
+  {1,   0,  -9, 100,  48, -15,   4, -1},
+  {1,   1, -11,  97,  53, -16,   4, -1},
+  {0,   1, -12,  95,  57, -16,   4, -1},
+  {0,   2, -13,  91,  61, -16,   4, -1},
+  {0,   2, -14,  88,  65, -16,   4, -1},
+  {0,   3, -15,  84,  69, -17,   4, 0},
+  {0,   3, -16,  81,  73, -16,   3, 0},
+  {0,   3, -16,  77,  77, -16,   3, 0},
+  {0,   3, -16,  73,  81, -16,   3, 0},
+  {0,   4, -17,  69,  84, -15,   3, 0},
+  {-1,   4, -16,  65,  88, -14,   2, 0},
+  {-1,   4, -16,  61,  91, -13,   2, 0},
+  {-1,   4, -16,  57,  95, -12,   1, 0},
+  {-1,   4, -16,  53,  97, -11,   1, 1},
+  {-1,   4, -15,  48, 100,  -9,   0, 1},
+  {-1,   4, -14,  44, 102,  -7,  -1, 1},
+  {-1,   4, -14,  40, 105,  -6,  -1, 1},
+  {-1,   4, -13,  36, 106,  -3,  -2, 1},
+  {-1,   4, -12,  32, 108,  -1,  -3, 1},
+  {-1,   3, -11,  28, 110,   1,  -4, 2},
+  {-1,   3, -10,  24, 111,   4,  -5, 2},
+  {-1,   3,  -9,  21, 111,   7,  -6, 2},
+  {-1,   3,  -9,  17, 112,  10,  -7, 3}
+};
+
+// Full-band kernels, one row per 1/32-pel phase; row 0 copies integer pels.
+static const interp_kernel filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {0,   1,  -3, 128,   3,  -1,   0, 0},
+  {-1,   2,  -6, 127,   7,  -2,   1, 0},
+  {-1,   3,  -9, 126,  12,  -4,   1, 0},
+  {-1,   4, -12, 125,  16,  -5,   1, 0},
+  {-1,   4, -14, 123,  20,  -6,   2, 0},
+  {-1,   5, -15, 120,  25,  -8,   2, 0},
+  {-1,   5, -17, 118,  30,  -9,   3, -1},
+  {-1,   6, -18, 114,  35, -10,   3, -1},
+  {-1,   6, -19, 111,  41, -12,   3, -1},
+  {-1,   6, -20, 107,  46, -13,   4, -1},
+  {-1,   6, -21, 103,  52, -14,   4, -1},
+  {-1,   6, -21,  99,  57, -16,   5, -1},
+  {-1,   6, -21,  94,  63, -17,   5, -1},
+  {-1,   6, -20,  89,  68, -18,   5, -1},
+  {-1,   6, -20,  84,  73, -19,   6, -1},
+  {-1,   6, -20,  79,  79, -20,   6, -1},
+  {-1,   6, -19,  73,  84, -20,   6, -1},
+  {-1,   5, -18,  68,  89, -20,   6, -1},
+  {-1,   5, -17,  63,  94, -21,   6, -1},
+  {-1,   5, -16,  57,  99, -21,   6, -1},
+  {-1,   4, -14,  52, 103, -21,   6, -1},
+  {-1,   4, -13,  46, 107, -20,   6, -1},
+  {-1,   3, -12,  41, 111, -19,   6, -1},
+  {-1,   3, -10,  35, 114, -18,   6, -1},
+  {-1,   3,  -9,  30, 118, -17,   5, -1},
+  {0,   2,  -8,  25, 120, -15,   5, -1},
+  {0,   2,  -6,  20, 123, -14,   4, -1},
+  {0,   1,  -5,  16, 125, -12,   4, -1},
+  {0,   1,  -4,  12, 126,  -9,   3, -1},
+  {0,   1,  -2,   7, 127,  -6,   2, -1},
+  {0,   0,  -1,   3, 128,  -3,   1, 0}
+};
+
+// Half-kernels (innermost tap first) of the symmetric 2:1 downsampling filters.
+static const int16_t vp9_down2_symeven_half_filter[] = {56, 12, -3, -1};
+static const int16_t vp9_down2_symodd_half_filter[] = {64, 35, 0, -3};
+
+// Selects the kernel table from the out/in length ratio, compared in 16ths.
+static const interp_kernel *choose_interp_filter(int inlength, int outlength) {
+  const int scaled_out = outlength * 16;
+  if (scaled_out >= inlength * 16)
+    return filteredinterp_filters1000;
+  if (scaled_out >= inlength * 13)
+    return filteredinterp_filters875;
+  if (scaled_out >= inlength * 11)
+    return filteredinterp_filters750;
+  if (scaled_out >= inlength * 9)
+    return filteredinterp_filters625;
+  return filteredinterp_filters500;
+}
+
+static void interpolate(const uint8_t *const input, int inlength,
+                        uint8_t *output, int outlength) {
+  const int64_t delta = (((uint64_t)inlength << 32) + outlength / 2) /
+      outlength;  // Input step per output sample, 32 fraction bits.
+  const int64_t offset = inlength > outlength ?
+      (((int64_t)(inlength - outlength) << 31) + outlength / 2) / outlength :
+      -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / outlength;
+  uint8_t *optr = output;
+  int x, x1, x2, sum, k, int_pel, sub_pel;
+  int64_t y;  // Input position of output x; starts at about (step - 1) / 2.
+  // 1-D resampler: each output is an INTERP_TAPS-tap filtered read around y.
+  const interp_kernel *interp_filters =
+      choose_interp_filter(inlength, outlength);
+
+  x = 0;
+  y = offset;
+  while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+    x++;
+    y += delta;
+  }
+  x1 = x;  // First output whose leftmost tap index is >= 0.
+  x = outlength - 1;
+  y = delta * x + offset;
+  while ((y >> INTERP_PRECISION_BITS) +
+         (int64_t)(INTERP_TAPS / 2) >= inlength) {
+    x--;
+    y -= delta;
+  }
+  x2 = x;  // Last output whose rightmost tap index is < inlength.
+  if (x1 > x2) {  // No clamp-free run: clamp both ends for every output.
+    for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k) {
+        const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
+        sum += filter[k] * input[(pk < 0 ? 0 :
+                                  (pk >= inlength ? inlength - 1 : pk))];
+      }
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+  } else {
+    // Initial part: taps may start before input[0], so clamp to index 0.
+    for (x = 0, y = offset; x < x1; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ?
+                                  0 :
+                                  int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+    // Middle part: every tap index is in range, so no clamping is needed.
+    for (; x <= x2; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+    // End part: taps may run past the input, so clamp to inlength - 1.
+    for (; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >=
+                                  inlength ?  inlength - 1 :
+                                  int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+  }
+}
+
+// Downsamples input by 2 with the symmetric filter whose half-kernel is
+// vp9_down2_symeven_half_filter. Output i/2 combines input[i - j] and
+// input[i + 1 + j] for each half-kernel tap j; indices that fall outside the
+// input are clamped to its first or last sample.
+static void down2_symeven(const uint8_t *const input, int length,
+                          uint8_t *output) {
+  const int16_t *const taps = vp9_down2_symeven_half_filter;
+  // Actual filter len = 2 * half_taps.
+  const int half_taps = sizeof(vp9_down2_symeven_half_filter) / 2;
+  const int rounding = 1 << (FILTER_BITS - 1);
+  // Positions in [left_end, right_start) need no clamping on either side.
+  const int left_end = half_taps + (half_taps & 1);
+  const int right_start = (length - half_taps) + ((length - half_taps) & 1);
+  uint8_t *dst = output;
+  int pos, t;
+  if (left_end > right_start) {
+    // Short input length: clamp both sides for every output.
+    for (pos = 0; pos < length; pos += 2) {
+      int acc = rounding;
+      for (t = 0; t < half_taps; ++t) {
+        const int lo = (pos - t < 0 ? 0 : pos - t);
+        const int hi = (pos + 1 + t >= length ? length - 1 : pos + 1 + t);
+        acc += (input[lo] + input[hi]) * taps[t];
+      }
+      *dst++ = clip_pixel(acc >> FILTER_BITS);
+    }
+  } else {
+    // Initial part: only the left-hand index can fall before input[0].
+    for (pos = 0; pos < left_end; pos += 2) {
+      int acc = rounding;
+      for (t = 0; t < half_taps; ++t) {
+        const int lo = (pos - t < 0 ? 0 : pos - t);
+        acc += (input[lo] + input[pos + 1 + t]) * taps[t];
+      }
+      *dst++ = clip_pixel(acc >> FILTER_BITS);
+    }
+    // Middle part: both indices are always in range.
+    for (; pos < right_start; pos += 2) {
+      int acc = rounding;
+      for (t = 0; t < half_taps; ++t) {
+        acc += (input[pos - t] + input[pos + 1 + t]) * taps[t];
+      }
+      *dst++ = clip_pixel(acc >> FILTER_BITS);
+    }
+    // End part: only the right-hand index can run past the last sample.
+    for (; pos < length; pos += 2) {
+      int acc = rounding;
+      for (t = 0; t < half_taps; ++t) {
+        const int hi = (pos + 1 + t >= length ? length - 1 : pos + 1 + t);
+        acc += (input[pos - t] + input[hi]) * taps[t];
+      }
+      *dst++ = clip_pixel(acc >> FILTER_BITS);
+    }
+  }
+}
+
+// Downsamples input by 2 with the symmetric filter whose half-kernel is
+// vp9_down2_symodd_half_filter: tap 0 weights input[i], tap j weights
+// input[i - j] + input[i + j]. Out-of-range indices are clamped to the ends.
+static void down2_symodd(const uint8_t *const input, int length,
+                         uint8_t *output) {
+  const int16_t *const taps = vp9_down2_symodd_half_filter;
+  // Actual filter len = 2 * half_taps - 1.
+  const int half_taps = sizeof(vp9_down2_symodd_half_filter) / 2;
+  const int rounding = 1 << (FILTER_BITS - 1);
+  // Positions in [left_end, right_start) need no clamping on either side.
+  const int left_end = (half_taps - 1) + ((half_taps - 1) & 1);
+  const int right_start = (length - half_taps + 1) + ((length - half_taps + 1) & 1);
+  uint8_t *dst = output;
+  int pos, t;
+  if (left_end > right_start) {
+    // Short input length: clamp both sides for every output.
+    for (pos = 0; pos < length; pos += 2) {
+      int acc = rounding + input[pos] * taps[0];
+      for (t = 1; t < half_taps; ++t) {
+        const int lo = (pos - t < 0 ? 0 : pos - t);
+        const int hi = (pos + t >= length ? length - 1 : pos + t);
+        acc += (input[lo] + input[hi]) * taps[t];
+      }
+      *dst++ = clip_pixel(acc >> FILTER_BITS);
+    }
+  } else {
+    // Initial part: only the left-hand index can fall before input[0].
+    for (pos = 0; pos < left_end; pos += 2) {
+      int acc = rounding + input[pos] * taps[0];
+      for (t = 1; t < half_taps; ++t) {
+        const int lo = (pos - t < 0 ? 0 : pos - t);
+        acc += (input[lo] + input[pos + t]) * taps[t];
+      }
+      *dst++ = clip_pixel(acc >> FILTER_BITS);
+    }
+    // Middle part: both indices are always in range.
+    for (; pos < right_start; pos += 2) {
+      int acc = rounding + input[pos] * taps[0];
+      for (t = 1; t < half_taps; ++t) {
+        acc += (input[pos - t] + input[pos + t]) * taps[t];
+      }
+      *dst++ = clip_pixel(acc >> FILTER_BITS);
+    }
+    // End part: only the right-hand index can run past the last sample.
+    for (; pos < length; pos += 2) {
+      int acc = rounding + input[pos] * taps[0];
+      for (t = 1; t < half_taps; ++t) {
+        const int hi = (pos + t >= length ? length - 1 : pos + t);
+        acc += (input[pos - t] + input[hi]) * taps[t];
+      }
+      *dst++ = clip_pixel(acc >> FILTER_BITS);
+    }
+  }
+}
+
+// Length after `steps` successive halvings, each rounding up.
+static int get_down2_length(int length, int steps) {
+  for (; steps > 0; --steps)
+    length = (length + 1) >> 1;
+  return length;
+}
+
+// Halvings possible while length stays >= out_length; length 1 ends the loop.
+static int get_down2_steps(int in_length, int out_length) {
+  int steps = 0;
+  while (in_length > 1 && get_down2_length(in_length, 1) >= out_length) {
+    in_length = get_down2_length(in_length, 1);
+    ++steps;
+  }
+  return steps;
+}
+
+// Resizes `length` samples of input to `olength` samples of output: repeated
+// 2:1 downsampling while the length stays >= olength, then one interpolation
+// pass if the length still differs. `buf` is scratch of at least `length`
+// bytes for the downsampling passes; if NULL, a temporary one is allocated.
+static void resize_multistep(const uint8_t *const input, int length,
+                             uint8_t *output, int olength, uint8_t *buf) {
+  int steps;
+  if (length == olength) {
+    memcpy(output, input, sizeof(output[0]) * length);
+    return;
+  }
+  steps = get_down2_steps(length, olength);
+
+  if (steps > 0) {
+    int s;
+    uint8_t *out = NULL;
+    uint8_t *tmpbuf = NULL;
+    uint8_t *otmp, *otmp2;
+    int filteredlength = length;
+    if (buf == NULL) {
+      tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * length);
+      if (tmpbuf == NULL) return;  // Out of memory: leave output untouched.
+      otmp = tmpbuf;
+    } else {
+      otmp = buf;
+    }
+    otmp2 = otmp + get_down2_length(length, 1);
+    for (s = 0; s < steps; ++s) {
+      const int proj_filteredlength = get_down2_length(filteredlength, 1);
+      const uint8_t *const in = (s == 0 ? input : out);
+      if (s == steps - 1 && proj_filteredlength == olength)
+        out = output;  // Last pass lands exactly on olength: write directly.
+      else
+        out = (s & 1 ? otmp2 : otmp);  // Alternate between scratch halves.
+      if (filteredlength & 1)
+        down2_symodd(in, filteredlength, out);
+      else
+        down2_symeven(in, filteredlength, out);
+      filteredlength = proj_filteredlength;
+    }
+    if (filteredlength != olength)
+      interpolate(out, filteredlength, output, olength);
+    free(tmpbuf);  // NULL (a no-op) when the caller's buf was used.
+  } else {
+    interpolate(input, length, output, olength);
+  }
+}
+
+// Copies `len` samples from img, stepping `stride` per sample, into
+// consecutive entries of arr.
+static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) {
+  const uint8_t *src = img;
+  int remaining;
+  for (remaining = len; remaining > 0; --remaining, src += stride)
+    *arr++ = *src;
+}
+
+// Writes `len` consecutive entries of arr into img, stepping `stride` per
+// sample.
+static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
+  uint8_t *dst = img;
+  int remaining;
+  for (remaining = len; remaining > 0; --remaining, dst += stride)
+    *dst = *arr++;
+}
+
+// Resizes a height x width plane to height2 x width2: rows first (into an
+// intermediate buffer), then columns. Returns silently if allocation fails.
+void vp9_resize_plane(const uint8_t *const input, int height, int width,
+                      int in_stride, uint8_t *output, int height2, int width2,
+                      int out_stride) {
+  int i;
+  const int longest = (width < height ? height : width);
+  uint8_t *intbuf, *tmpbuf, *arrbuf;
+  assert(width > 0);
+  assert(height > 0);
+  assert(width2 > 0);
+  assert(height2 > 0);
+  intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height);
+  tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * longest);
+  arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2));
+  if (intbuf != NULL && tmpbuf != NULL && arrbuf != NULL) {
+    for (i = 0; i < height; ++i)
+      resize_multistep(input + in_stride * i, width,
+                       intbuf + width2 * i, width2, tmpbuf);
+    for (i = 0; i < width2; ++i) {
+      fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+      resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf);
+      fill_arr_to_col(output + i, out_stride, height2, arrbuf + height);
+    }
+  }
+  free(intbuf);
+  free(tmpbuf);
+  free(arrbuf);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_interpolate(const uint16_t *const input, int inlength,
+                               uint16_t *output, int outlength, int bd) {
+  const int64_t delta =  // Input step per output sample, 32 fraction bits.
+      (((uint64_t)inlength << 32) + outlength / 2) / outlength;
+  const int64_t offset = inlength > outlength ?
+      (((int64_t)(inlength - outlength) << 31) + outlength / 2) / outlength :
+      -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / outlength;
+  uint16_t *optr = output;
+  int x, x1, x2, sum, k, int_pel, sub_pel;
+  int64_t y;  // Input position of output x; starts at about (step - 1) / 2.
+  // 1-D resampler for 16-bit samples; results are clipped to bit depth bd.
+  const interp_kernel *interp_filters =
+      choose_interp_filter(inlength, outlength);
+
+  x = 0;
+  y = offset;
+  while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+    x++;
+    y += delta;
+  }
+  x1 = x;  // First output whose leftmost tap index is >= 0.
+  x = outlength - 1;
+  y = delta * x + offset;
+  while ((y >> INTERP_PRECISION_BITS) +
+         (int64_t)(INTERP_TAPS / 2) >= inlength) {
+    x--;
+    y -= delta;
+  }
+  x2 = x;  // Last output whose rightmost tap index is < inlength.
+  if (x1 > x2) {  // No clamp-free run: clamp both ends for every output.
+    for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k) {
+        const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
+        sum += filter[k] *
+            input[(pk < 0 ? 0 : (pk >= inlength ? inlength - 1 : pk))];
+      }
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+  } else {
+    // Initial part: taps may start before input[0], so clamp to index 0.
+    for (x = 0, y = offset; x < x1; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] *
+            input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ?
+                   0 : int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+    // Middle part: every tap index is in range, so no clamping is needed.
+    for (; x <= x2; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+    // End part: taps may run past the input, so clamp to inlength - 1.
+    for (; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >=
+                                  inlength ?  inlength - 1 :
+                                  int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+  }
+}
+
+static void highbd_down2_symeven(const uint16_t *const input, int length,
+                                 uint16_t *output, int bd) {
+  // 2:1 downsample; symmetric filter, actual len = 2 * filter_len_half.
+  static const int16_t *filter = vp9_down2_symeven_half_filter;
+  const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2;
+  int i, j;  // i: even input position, j: tap index
+  uint16_t *optr = output;  // one sample is written per step of i
+  int l1 = filter_len_half;  // from l1 on, i - j cannot go below 0
+  int l2 = (length - filter_len_half);
+  l1 += (l1 & 1);  // round both bounds up to even: i advances in steps of 2
+  l2 += (l2 & 1);  // even i < l2 keeps i + 1 + j inside the input
+  if (l1 > l2) {
+    // Short input length: clamp the index at both ends for every tap.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));  // rounding term for the shift
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  } else {
+    // Initial part: only the left taps (i - j) can fall before the input.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // Middle part: every tap is in range, so no clamping is needed.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // End part: only the right taps (i + 1 + j) can run past the input.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  }
+}
+
+static void highbd_down2_symodd(const uint16_t *const input, int length,
+                              uint16_t *output, int bd) {
+  // 2:1 downsample; symmetric filter, actual len = 2 * filter_len_half - 1.
+  static const int16_t *filter = vp9_down2_symodd_half_filter;
+  const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2;
+  int i, j;  // i: even input position (filter centre), j: tap distance
+  uint16_t *optr = output;  // one sample is written per step of i
+  int l1 = filter_len_half - 1;  // from l1 on, i - j cannot go below 0
+  int l2 = (length - filter_len_half + 1);
+  l1 += (l1 & 1);  // round both bounds up to even: i advances in steps of 2
+  l2 += (l2 & 1);  // even i < l2 keeps i + j inside the input
+  if (l1 > l2) {
+    // Short input length: clamp the index at both ends for every tap.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  } else {
+    // Initial part: only the left taps (i - j) can fall before the input.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // Middle part: every tap is in range, so no clamping is needed.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+    // End part: only the right taps (i + j) can run past the input.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel_highbd(sum, bd);
+    }
+  }
+}
+
+// Resizes input[0..length) to output[0..olength): get_down2_steps() 2:1
+// passes first, then highbd_interpolate() for any remaining ratio. buf is
+// scratch space for at least length samples; NULL means "allocate one here".
+static void highbd_resize_multistep(const uint16_t *const input, int length,
+                                    uint16_t *output, int olength,
+                                    uint16_t *buf, int bd) {
+  int steps;
+  if (length == olength) {
+    memcpy(output, input, sizeof(output[0]) * length);
+    return;
+  }
+  steps = get_down2_steps(length, olength);
+  if (steps > 0) {
+    int s;
+    uint16_t *out = NULL;
+    uint16_t *tmpbuf = NULL;
+    uint16_t *otmp, *otmp2;
+    int filteredlength = length;
+    // This used to test the just-NULLed tmpbuf, so buf was never used and
+    // every call allocated; allocate only when the caller gave no scratch.
+    if (buf == NULL) {
+      tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) * length);
+      if (tmpbuf == NULL) return;  // Out of memory: output is left as is.
+      otmp = tmpbuf;
+    } else {
+      otmp = buf;
+    }
+    otmp2 = otmp + get_down2_length(length, 1);
+    for (s = 0; s < steps; ++s) {
+      const int proj_filteredlength = get_down2_length(filteredlength, 1);
+      const uint16_t *const in = (s == 0 ? input : out);
+      if (s == steps - 1 && proj_filteredlength == olength)
+        out = output;
+      else
+        out = (s & 1 ? otmp2 : otmp);
+      if (filteredlength & 1)
+        highbd_down2_symodd(in, filteredlength, out, bd);
+      else
+        highbd_down2_symeven(in, filteredlength, out, bd);
+      filteredlength = proj_filteredlength;
+    }
+    if (filteredlength != olength)
+      highbd_interpolate(out, filteredlength, output, olength, bd);
+    free(tmpbuf);  // No-op when the caller's buffer was used.
+  } else {
+    highbd_interpolate(input, length, output, olength, bd);
+  }
+}
+
+// Copies len samples, taken every stride elements from img, into arr.
+static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  const uint16_t *src = img;
+  uint16_t *dst = arr;
+  int remaining;
+  for (remaining = len; remaining > 0; --remaining, src += stride)
+    *dst++ = *src;
+}
+
+// Writes arr[0..len) into img, one sample every stride elements.
+static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  uint16_t *dst = img;
+  const uint16_t *src = arr;
+  int remaining;
+  for (remaining = len; remaining > 0; --remaining, dst += stride)
+    *dst = *src++;
+}
+
+// Resizes a width x height high-bitdepth plane to width2 x height2: rows are
+// resized into intbuf first, then each intbuf column is resized into output.
+void vp9_highbd_resize_plane(const uint8_t *const input, int height,
+                             int width, int in_stride, uint8_t *output,
+                             int height2, int width2, int out_stride, int bd) {
+  int i;
+  uint16_t *intbuf = (uint16_t *)malloc(sizeof(uint16_t) * width2 * height);
+  uint16_t *tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) *
+                                        (width < height ? height : width));
+  uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * (height + height2));
+  // A failed allocation used to be dereferenced; now the resize is skipped
+  // and output is left untouched. free(NULL) below is a no-op.
+  if (intbuf != NULL && tmpbuf != NULL && arrbuf != NULL) {
+    for (i = 0; i < height; ++i) {
+      highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i),
+                              width, intbuf + width2 * i, width2, tmpbuf, bd);
+    }
+    for (i = 0; i < width2; ++i) {
+      highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+      highbd_resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf,
+                              bd);
+      highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride,
+                             height2, arrbuf + height);
+    }
+  }
+  free(intbuf);
+  free(tmpbuf);
+  free(arrbuf);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Resizes a 4:2:0 frame: the y plane at full size, the u and v planes at
+// half the width and half the height (integer division) on both sides.
+void vp9_resize_frame420(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth) {
+  const int uv_h = height / 2, uv_w = width / 2;
+  const int ouv_h = oheight / 2, ouv_w = owidth / 2;
+  vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+                   oy_stride);
+  vp9_resize_plane(u, uv_h, uv_w, uv_stride, ou, ouv_h, ouv_w, ouv_stride);
+  vp9_resize_plane(v, uv_h, uv_w, uv_stride, ov, ouv_h, ouv_w, ouv_stride);
+}
+
+// Resizes a 4:2:2 frame: the u and v planes keep the full height and use
+// half the width (integer division) of the y plane on both sides.
+void vp9_resize_frame422(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth) {
+  const int uv_w = width / 2;
+  const int ouv_w = owidth / 2;
+  vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+  vp9_resize_plane(u, height, uv_w, uv_stride, ou, oheight, ouv_w, ouv_stride);
+  vp9_resize_plane(v, height, uv_w, uv_stride, ov, oheight, ouv_w, ouv_stride);
+}
+
+// Resizes a 4:4:4 frame: the u and v planes are resized with exactly the
+// same input and output dimensions as the y plane.
+void vp9_resize_frame444(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth) {
+  const int h = height, w = width;
+  const int oh = oheight, ow = owidth;
+  vp9_resize_plane(y, h, w, y_stride, oy, oh, ow, oy_stride);
+  vp9_resize_plane(u, h, w, uv_stride, ou, oh, ow, ouv_stride);
+  vp9_resize_plane(v, h, w, uv_stride, ov, oh, ow, ouv_stride);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// High-bitdepth 4:2:0 resize: the u and v planes use half the width and half
+// the height (integer division) of the y plane; bd goes to every plane.
+void vp9_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  const int ch = height / 2, cw = width / 2;
+  const int och = oheight / 2, ocw = owidth / 2;
+  vp9_highbd_resize_plane(y, height, width, y_stride,
+                          oy, oheight, owidth, oy_stride, bd);
+  vp9_highbd_resize_plane(u, ch, cw, uv_stride, ou, och, ocw, ouv_stride, bd);
+  vp9_highbd_resize_plane(v, ch, cw, uv_stride, ov, och, ocw, ouv_stride, bd);
+}
+
+// High-bitdepth 4:2:2 resize: u and v keep the height, use half the width.
+void vp9_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  const int h = height, oh = oheight;
+  const int cw = width / 2, ocw = owidth / 2;
+  vp9_highbd_resize_plane(y, h, width, y_stride,
+                          oy, oh, owidth, oy_stride, bd);
+  vp9_highbd_resize_plane(u, h, cw, uv_stride, ou, oh, ocw, ouv_stride, bd);
+  vp9_highbd_resize_plane(v, h, cw, uv_stride, ov, oh, ocw, ouv_stride, bd);
+}
+
+// High-bitdepth 4:4:4 resize: the u and v planes are resized with exactly
+// the same input and output dimensions as the y plane.
+void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride,
+                                uint8_t *ou, uint8_t *ov, int ouv_stride,
+                                int oheight, int owidth, int bd) {
+  const int h = height, w = width;
+  const int oh = oheight, ow = owidth;
+  vp9_highbd_resize_plane(y, h, w, y_stride, oy, oh, ow, oy_stride, bd);
+  vp9_highbd_resize_plane(u, h, w, uv_stride, ou, oh, ow, ouv_stride, bd);
+  vp9_highbd_resize_plane(v, h, w, uv_stride, ov, oh, ow, ouv_stride, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vp9/encoder/vp9_resize.h b/libs/libvpx/vp9/encoder/vp9_resize.h
new file mode 100644
index 0000000000..b5feb38606
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_resize.h
@@ -0,0 +1,133 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_RESIZE_H_
+#define VP9_ENCODER_VP9_RESIZE_H_
+
+#include <stdio.h>
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_resize_plane(const uint8_t *const input,  // plane to resize
+                      int height,  // input dimensions; NOTE(review): roles
+                      int width,   // are read off the frame4xx call sites
+                      int in_stride,
+                      uint8_t *output,  // receives the resized plane
+                      int height2,  // output dimensions
+                      int width2,
+                      int out_stride);
+void vp9_resize_frame420(const uint8_t *const y,  // source y plane
+                         int y_stride,
+                         const uint8_t *const u,  // source u and v planes,
+                         const uint8_t *const v,  // resized at half size
+                         int uv_stride,  // stride used for both u and v
+                         int height,  // source y dimensions; u/v use
+                         int width,   // height / 2 and width / 2
+                         uint8_t *oy,  // destination y plane
+                         int oy_stride,
+                         uint8_t *ou,  // destination u and v planes
+                         uint8_t *ov,
+                         int ouv_stride,  // stride used for both ou and ov
+                         int oheight,  // destination y dimensions; u/v use
+                         int owidth);  // oheight / 2 and owidth / 2
+void vp9_resize_frame422(const uint8_t *const y,  // source y plane
+                         int y_stride,
+                         const uint8_t *const u,  // source u and v planes,
+                         const uint8_t *const v,  // resized at half width
+                         int uv_stride,  // stride used for both u and v
+                         int height,  // source y dimensions; u/v use
+                         int width,   // height and width / 2
+                         uint8_t *oy,  // destination y plane
+                         int oy_stride,
+                         uint8_t *ou,  // destination u and v planes
+                         uint8_t *ov,
+                         int ouv_stride,  // stride used for both ou and ov
+                         int oheight,  // destination y dimensions; u/v use
+                         int owidth);  // oheight and owidth / 2
+void vp9_resize_frame444(const uint8_t *const y,  // source y plane
+                         int y_stride,
+                         const uint8_t *const u,  // source u and v planes,
+                         const uint8_t *const v,  // same size as y
+                         int uv_stride,  // stride used for both u and v
+                         int height,  // source dimensions of all planes
+                         int width,
+                         uint8_t *oy,  // destination y plane
+                         int oy_stride,
+                         uint8_t *ou,  // destination u and v planes
+                         uint8_t *ov,
+                         int ouv_stride,  // stride used for both ou and ov
+                         int oheight,  // destination dimensions of all planes
+                         int owidth);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_resize_plane(const uint8_t *const input,  // plane to resize
+                             int height,  // input rows
+                             int width,   // input samples per row
+                             int in_stride,  // multiplied by the row index
+                             uint8_t *output,  // receives the resized plane
+                             int height2,  // output rows
+                             int width2,   // output samples per row
+                             int out_stride,
+                             int bd);  // passed down to the pixel clipping
+void vp9_highbd_resize_frame420(const uint8_t *const y,  // source y plane
+                                int y_stride,
+                                const uint8_t *const u,  // source u/v planes,
+                                const uint8_t *const v,  // resized half size
+                                int uv_stride,  // used for both u and v
+                                int height,  // source y dimensions; u/v use
+                                int width,   // height / 2 and width / 2
+                                uint8_t *oy,  // destination y plane
+                                int oy_stride,
+                                uint8_t *ou,  // destination u and v planes
+                                uint8_t *ov,
+                                int ouv_stride,  // used for both ou and ov
+                                int oheight,  // destination y dimensions;
+                                int owidth,   // u/v use half of each
+                                int bd);  // forwarded to every plane resize
+void vp9_highbd_resize_frame422(const uint8_t *const y,  // source y plane
+                                int y_stride,
+                                const uint8_t *const u,  // source u/v planes,
+                                const uint8_t *const v,  // resized half width
+                                int uv_stride,  // used for both u and v
+                                int height,  // source y dimensions; u/v use
+                                int width,   // height and width / 2
+                                uint8_t *oy,  // destination y plane
+                                int oy_stride,
+                                uint8_t *ou,  // destination u and v planes
+                                uint8_t *ov,
+                                int ouv_stride,  // used for both ou and ov
+                                int oheight,  // destination y dimensions;
+                                int owidth,   // u/v use oheight, owidth / 2
+                                int bd);  // forwarded to every plane resize
+void vp9_highbd_resize_frame444(const uint8_t *const y,  // source y plane
+                                int y_stride,
+                                const uint8_t *const u,  // source u/v planes,
+                                const uint8_t *const v,  // same size as y
+                                int uv_stride,  // used for both u and v
+                                int height,  // source dimensions, all planes
+                                int width,
+                                uint8_t *oy,  // destination y plane
+                                int oy_stride,
+                                uint8_t *ou,  // destination u and v planes
+                                uint8_t *ov,
+                                int ouv_stride,  // used for both ou and ov
+                                int oheight,  // destination dimensions,
+                                int owidth,   // all planes
+                                int bd);  // forwarded to every plane resize
+#endif    // CONFIG_VP9_HIGHBITDEPTH
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif    // VP9_ENCODER_VP9_RESIZE_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_segmentation.c b/libs/libvpx/vp9/encoder/vp9_segmentation.c
new file mode 100644
index 0000000000..5a0a23d489
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_segmentation.c
@@ -0,0 +1,281 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_segmentation.h"
+
+// Turns segmentation on and sets both the update_map and the update_data
+// flags.
+void vp9_enable_segmentation(struct segmentation *seg) {
+  seg->enabled = seg->update_map = seg->update_data = 1;
+}
+
+// Turns segmentation off and clears both the update_map and the update_data
+// flags.
+void vp9_disable_segmentation(struct segmentation *seg) {
+  seg->enabled = seg->update_map = seg->update_data = 0;
+}
+
+void vp9_set_segment_data(struct segmentation *seg,
+                          signed char *feature_data,
+                          unsigned char abs_delta) {
+  seg->abs_delta = abs_delta;
+  // NOTE(review): copy size is sizeof the destination; confirm caller buffers.
+  memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
+}
+void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
+                            SEG_LVL_FEATURES feature_id) {
+  seg->feature_mask[segment_id] &= ~(1 << feature_id);  // clear that one bit
+}
+
+void vp9_clear_segdata(struct segmentation *seg, int segment_id,
+                       SEG_LVL_FEATURES feature_id) {
+  seg->feature_data[segment_id][feature_id] = 0;  // feature_mask is untouched
+}
+
+// Derives the seven segment-tree node probabilities from eight per-segment
+// counts: node 0 splits segments 0-3 from 4-7, nodes 1-2 split each half
+// into pairs, and nodes 3-6 split each pair.
+static void calc_segtree_probs(int *segcounts, vpx_prob *segment_tree_probs) {
+  int pair_sum[4];
+  int k;
+  for (k = 0; k < 4; ++k)
+    pair_sum[k] = segcounts[2 * k] + segcounts[2 * k + 1];
+  segment_tree_probs[0] =
+      get_binary_prob(pair_sum[0] + pair_sum[1], pair_sum[2] + pair_sum[3]);
+  segment_tree_probs[1] = get_binary_prob(pair_sum[0], pair_sum[1]);
+  segment_tree_probs[2] = get_binary_prob(pair_sum[2], pair_sum[3]);
+  for (k = 0; k < 4; ++k)
+    segment_tree_probs[3 + k] =
+        get_binary_prob(segcounts[2 * k], segcounts[2 * k + 1]);
+}
+
+// Estimates the cost of coding a segment map with per-segment frequencies
+// segcounts[0..7] under the tree probabilities probs[0..6]. Levels below
+// the root are only costed for subtrees that contain at least one count.
+static int cost_segmap(int *segcounts, vpx_prob *probs) {
+  const int n01 = segcounts[0] + segcounts[1];
+  const int n23 = segcounts[2] + segcounts[3];
+  const int n45 = segcounts[4] + segcounts[5];
+  const int n67 = segcounts[6] + segcounts[7];
+  const int n_low = n01 + n23;
+  const int n_high = n45 + n67;
+
+  // Root decision: segments 0-3 (zero branch) versus 4-7 (one branch).
+  int total =
+      n_low * vp9_cost_zero(probs[0]) + n_high * vp9_cost_one(probs[0]);
+  if (n_low > 0) {
+    // Second level, then the two leaf pairs of the low half.
+    total += n01 * vp9_cost_zero(probs[1]) + n23 * vp9_cost_one(probs[1]);
+    if (n01 > 0) {
+      total += segcounts[0] * vp9_cost_zero(probs[3]) +
+               segcounts[1] * vp9_cost_one(probs[3]);
+    }
+    if (n23 > 0) {
+      total += segcounts[2] * vp9_cost_zero(probs[4]) +
+               segcounts[3] * vp9_cost_one(probs[4]);
+    }
+  }
+  if (n_high > 0) {
+    // Same again for the high half.
+    total += n45 * vp9_cost_zero(probs[2]) + n67 * vp9_cost_one(probs[2]);
+    if (n45 > 0) {
+      total += segcounts[4] * vp9_cost_zero(probs[5]) +
+               segcounts[5] * vp9_cost_one(probs[5]);
+    }
+    if (n67 > 0) {
+      total += segcounts[6] * vp9_cost_zero(probs[6]) +
+               segcounts[7] * vp9_cost_one(probs[6]);
+    }
+  }
+  return total;
+}
+
+static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
+                       const TileInfo *tile, MODE_INFO **mi,
+                       int *no_pred_segcounts,
+                       int (*temporal_predictor_count)[2],
+                       int *t_unpred_seg_counts,
+                       int bw, int bh, int mi_row, int mi_col) {
+  int segment_id;
+  // Tallies the segment id of the single block at (mi_row, mi_col).
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+  // Point xd at this block before any helper that takes xd is called.
+  xd->mi = mi;
+  segment_id = xd->mi[0]->segment_id;
+
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+  // Count the number of hits on each segment with no prediction
+  no_pred_segcounts[segment_id]++;
+
+  // Temporal prediction not allowed on key frames
+  if (cm->frame_type != KEY_FRAME) {
+    const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+    // Test to see if the segment id matches the predicted value.
+    const int pred_segment_id = get_segment_id(cm, cm->last_frame_seg_map,
+                                               bsize, mi_row, mi_col);
+    const int pred_flag = pred_segment_id == segment_id;
+    const int pred_context = vp9_get_pred_context_seg_id(xd);
+
+    // Store the prediction status for this mb and update counts
+    // as appropriate
+    xd->mi[0]->seg_id_predicted = pred_flag;
+    temporal_predictor_count[pred_context][pred_flag]++;
+
+    // Update the "unpredicted" segment count
+    if (!pred_flag)
+      t_unpred_seg_counts[segment_id]++;
+  }
+}
+
+static void count_segs_sb(const VP9_COMMON *cm, MACROBLOCKD *xd,
+                          const TileInfo *tile, MODE_INFO **mi,
+                          int *no_pred_segcounts,
+                          int (*temporal_predictor_count)[2],
+                          int *t_unpred_seg_counts,
+                          int mi_row, int mi_col,
+                          BLOCK_SIZE bsize) {
+  const int mis = cm->mi_stride;
+  int bw, bh;
+  const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
+  // Nothing to count for a block that starts outside the frame.
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+  // Size of the block stored at this position, from the same lookup tables.
+  bw = num_8x8_blocks_wide_lookup[mi[0]->sb_type];
+  bh = num_8x8_blocks_high_lookup[mi[0]->sb_type];
+
+  if (bw == bs && bh == bs) {  // one block covers the whole bsize square
+    count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+  } else if (bw == bs && bh < bs) {  // two blocks: top half, bottom half
+    count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+    count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+               temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+               mi_row + hbs, mi_col);
+  } else if (bw < bs && bh == bs) {  // two blocks: left half, right half
+    count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+    count_segs(cm, xd, tile, mi + hbs,
+               no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts,
+               hbs, bs, mi_row, mi_col + hbs);
+  } else {  // smaller in both directions: recurse into the four quadrants
+    const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
+    int n;
+
+    assert(bw < bs && bh < bs);
+
+    for (n = 0; n < 4; n++) {
+      const int mi_dc = hbs * (n & 1);  // column offset: 0, hbs, 0, hbs
+      const int mi_dr = hbs * (n >> 1);  // row offset: 0, 0, hbs, hbs
+
+      count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc],
+                    no_pred_segcounts, temporal_predictor_count,
+                    t_unpred_seg_counts,
+                    mi_row + mi_dr, mi_col + mi_dc, subsize);
+    }
+  }
+}
+
+void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd) {
+  struct segmentation *seg = &cm->seg;
+  // Costs of coding the map without / with temporal prediction.
+  int no_pred_cost;
+  int t_pred_cost = INT_MAX;  // stays INT_MAX on intra-only frames
+
+  int i, tile_col, mi_row, mi_col;
+  // Statistics filled in by count_segs_sb() over the whole frame.
+  int temporal_predictor_count[PREDICTION_PROBS][2] = { { 0 } };
+  int no_pred_segcounts[MAX_SEGMENTS] = { 0 };
+  int t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
+
+  vpx_prob no_pred_tree[SEG_TREE_PROBS];
+  vpx_prob t_pred_tree[SEG_TREE_PROBS];
+  vpx_prob t_nopred_prob[PREDICTION_PROBS];
+
+  // Set default state for the segment tree probabilities and the
+  // temporal coding probabilities
+  memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
+  memset(seg->pred_probs, 255, sizeof(seg->pred_probs));
+
+  // First of all generate stats regarding how well the last segment map
+  // predicts this one
+  for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
+    TileInfo tile;
+    MODE_INFO **mi_ptr;
+    vp9_tile_init(&tile, cm, 0, tile_col);
+    // Walk the tile column in steps of 8 mode-info units, one 64x64 at a time.
+    mi_ptr = cm->mi_grid_visible + tile.mi_col_start;
+    for (mi_row = 0; mi_row < cm->mi_rows;
+         mi_row += 8, mi_ptr += 8 * cm->mi_stride) {
+      MODE_INFO **mi = mi_ptr;
+      for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
+           mi_col += 8, mi += 8)
+        count_segs_sb(cm, xd, &tile, mi, no_pred_segcounts,
+                      temporal_predictor_count, t_unpred_seg_counts,
+                      mi_row, mi_col, BLOCK_64X64);
+    }
+  }
+
+  // Work out probability tree for coding segments without prediction
+  // and the cost.
+  calc_segtree_probs(no_pred_segcounts, no_pred_tree);
+  no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree);
+
+  // Key frames cannot use temporal prediction
+  if (!frame_is_intra_only(cm)) {
+    // Work out probability tree for coding those segments not
+    // predicted using the temporal method and the cost.
+    calc_segtree_probs(t_unpred_seg_counts, t_pred_tree);
+    t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree);
+
+    // Add in the cost of the signaling for each prediction context.
+    for (i = 0; i < PREDICTION_PROBS; i++) {
+      const int count0 = temporal_predictor_count[i][0];  // mispredicted
+      const int count1 = temporal_predictor_count[i][1];  // predicted
+
+      t_nopred_prob[i] = get_binary_prob(count0, count1);
+
+      // Add in the predictor signaling cost
+      t_pred_cost += count0 * vp9_cost_zero(t_nopred_prob[i]) +
+                     count1 * vp9_cost_one(t_nopred_prob[i]);
+    }
+  }
+
+  // Now choose which coding method to use.
+  if (t_pred_cost < no_pred_cost) {
+    seg->temporal_update = 1;
+    memcpy(seg->tree_probs, t_pred_tree, sizeof(t_pred_tree));
+    memcpy(seg->pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
+  } else {  // pred_probs keep the 255 defaults set above
+    seg->temporal_update = 0;
+    memcpy(seg->tree_probs, no_pred_tree, sizeof(no_pred_tree));
+  }
+}
+
+// Puts seg back into its default state: segmentation disabled, no pending
+// map or data update, tree probabilities at 255, followed by a call to
+// vp9_clearall_segfeatures().
+void vp9_reset_segment_features(struct segmentation *seg) {
+  seg->enabled = seg->update_map = seg->update_data = 0;
+  memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
+  vp9_clearall_segfeatures(seg);
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_segmentation.h b/libs/libvpx/vp9/encoder/vp9_segmentation.h
new file mode 100644
index 0000000000..8c6944ad13
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_segmentation.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_SEGMENTATION_H_
+#define VP9_ENCODER_VP9_SEGMENTATION_H_
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_enable_segmentation(struct segmentation *seg);  // all flags to 1
+void vp9_disable_segmentation(struct segmentation *seg);  // all flags to 0
+
+void vp9_disable_segfeature(struct segmentation *seg,
+                            int segment_id,
+                            SEG_LVL_FEATURES feature_id);  // clears mask bit
+void vp9_clear_segdata(struct segmentation *seg,
+                       int segment_id,
+                       SEG_LVL_FEATURES feature_id);  // zeroes the data value
+
+// The values given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid ranges for abs values are (0-127 for MB_LVL_ALT_Q), (0-63 for
+// SEGMENT_ALT_LF)
+// Valid ranges for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for
+// SEGMENT_ALT_LF)
+//
+// abs_delta = SEGMENT_DELTADATA (deltas), abs_delta = SEGMENT_ABSDATA (use
+// the absolute values given).
+void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data,
+                          unsigned char abs_delta);
+// Sets cm->seg.temporal_update and the segment map coding probabilities.
+void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd);
+// Disables segmentation and restores the default tree probabilities.
+void vp9_reset_segment_features(struct segmentation *seg);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_SEGMENTATION_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_skin_detection.c b/libs/libvpx/vp9/encoder/vp9_skin_detection.c
new file mode 100644
index 0000000000..8e117eb084
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_skin_detection.c
@@ -0,0 +1,176 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+
+#define MODEL_MODE 0
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[5][2] = {
+    {7463, 9614}, {6400, 10240}, {7040, 10240}, {8320, 9280}, {6800, 9614}};
+static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157};  // q16
+static const int skin_threshold[6] = {1570636, 1400000, 800000, 800000, 800000,
+    800000};  // q18
+
+// Thresholds on luminance.
+static const int y_low = 40;
+static const int y_high = 220;
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values
+// against the mean of skin model |idx|, in fixed-point arithmetic.
+static int evaluate_skin_color_difference(int cb, int cr, int idx) {
+  // Offsets of the inputs (scaled up by 6 bits) from the model mean.
+  const int cb_off = (cb << 6) - skin_mean[idx][0];
+  const int cr_off = (cr << 6) - skin_mean[idx][1];
+
+  // Squared and cross terms, rounded and scaled down by 10 bits.
+  const int cb_sq = (cb_off * cb_off + (1 << 9)) >> 10;
+  const int cross = (cb_off * cr_off + (1 << 9)) >> 10;
+  const int cr_sq = (cr_off * cr_off + (1 << 9)) >> 10;
+
+  // Weight each term by the matching inverse-covariance entry; the cross
+  // term is counted once per off-diagonal entry.
+  return skin_inv_cov[0] * cb_sq +
+         skin_inv_cov[1] * cross +
+         skin_inv_cov[2] * cross +
+         skin_inv_cov[3] * cr_sq;
+}
+
+// Classifies one pixel as skin (1) or non-skin (0) from its luma |y| and
+// chroma |cb|, |cr|. Luma outside [y_low, y_high] is never skin.
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr) {
+  int i;
+  if (y < y_low || y > y_high)
+    return 0;
+  if (MODEL_MODE == 0) {
+    // Single-model mode: compare against model 0 and its own threshold.
+    return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
+  }
+  // Multi-model mode: cheap rejections first, then try each model in turn.
+  // Exit on grey.
+  if (cb == 128 && cr == 128)
+    return 0;
+  // Exit on very strong cb.
+  if (cb > 150 && cr < 110)
+    return 0;
+  // Exit on (another) low luminance threshold if either color is high.
+  if (y < 50 && (cb > 140 || cr > 140))
+    return 0;
+  for (i = 0; i < 5; i++) {
+    // Evaluate once per model; both tests below use the same distance.
+    const int diff = evaluate_skin_color_difference(cb, cr, i);
+    if (diff < skin_threshold[i + 1])
+      return 1;
+    // Exit if difference is much larger than the threshold.
+    if (diff > (skin_threshold[i + 1] << 3))
+      return 0;
+  }
+  return 0;
+}
+
+int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                           int stride, int strideuv, int bsize) {
+  // Classify the block by one sample per plane, taken at offset
+  // (4 << log2 lookup) / 2 in luma and half of that again in chroma.
+  const int luma_col = (4 << b_width_log2_lookup[bsize]) >> 1;
+  const int luma_row = (4 << b_height_log2_lookup[bsize]) >> 1;
+  const int chroma_col = luma_col >> 1;
+  const int chroma_row = luma_row >> 1;
+  const int luma_idx = luma_row * stride + luma_col;
+  const int chroma_idx = chroma_row * strideuv + chroma_col;
+  return vp9_skin_pixel(y[luma_idx], u[chroma_idx], v[chroma_idx]);
+}
+
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing the skin map of the input source; written to yuv_skinmap_file.
+void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
+  int i, j, mi_row, mi_col, num_bl;
+  VP9_COMMON *const cm = &cpi->common;
+  uint8_t *y;
+  const uint8_t *src_y = cpi->Source->y_buffer;
+  const uint8_t *src_u = cpi->Source->u_buffer;
+  const uint8_t *src_v = cpi->Source->v_buffer;
+  const int src_ystride = cpi->Source->y_stride;
+  const int src_uvstride = cpi->Source->uv_stride;
+  int y_bsize = 16;  // Use 8x8 or 16x16.
+  int uv_bsize = y_bsize >> 1;  // Chroma block size: half of luma.
+  int ypos = y_bsize >> 1;  // Offset of the block center in luma.
+  int uvpos = uv_bsize >> 1;  // Offset of the block center in chroma.
+  int shy = (y_bsize == 8) ? 3 : 4;  // log2(y_bsize).
+  int shuv = shy - 1;  // log2(uv_bsize).
+  int fac = y_bsize / 8;
+  // 1: average the center 2x2 pixels; otherwise use the center pixel only.
+  int mode_filter = 1;
+  YV12_BUFFER_CONFIG skinmap;
+  memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG));
+  if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment)) {
+      vpx_free_frame_buffer(&skinmap);
+      return;
+  }
+  memset(skinmap.buffer_alloc, 128, skinmap.frame_size);  // Fill with 128.
+  y = skinmap.y_buffer;  // NOTE(review): indexed with src_ystride below.
+  // Loop through blocks and set skin map based on center pixel of block.
+  // Set y to white for skin block, otherwise set to source with gray scale.
+  // Ignore rightmost/bottom boundary blocks.
+  for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) {
+    num_bl = 0;
+    for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) {
+      // Select pixel for each block for skin detection.
+      // Use center pixel, or 2x2 average at center.
+      uint8_t ysource = src_y[ypos * src_ystride + ypos];
+      uint8_t usource = src_u[uvpos * src_uvstride + uvpos];
+      uint8_t vsource = src_v[uvpos * src_uvstride + uvpos];
+      uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos];
+      uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos];
+      uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos];
+      uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)];
+      uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos  + 1)];
+      uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos +  1)];
+      uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
+      uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos  + 1)];
+      uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos +  1)];
+      int is_skin = 0;
+      if (mode_filter == 1) {
+        ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2;
+        usource = (usource + usource2 + usource3 + usource4) >> 2;
+        vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2;
+      }
+      is_skin = vp9_skin_pixel(ysource, usource, vsource);
+      for (i = 0; i < y_bsize; i++) {
+        for (j = 0; j < y_bsize; j++) {
+          if (is_skin)
+            y[i * src_ystride + j] = 255;
+          else
+            y[i * src_ystride + j] = src_y[i * src_ystride + j];
+        }
+      }
+      num_bl++;  // Blocks processed so far in this row.
+      y += y_bsize;
+      src_y += y_bsize;
+      src_u += uv_bsize;
+      src_v += uv_bsize;
+    }
+    y += (src_ystride << shy) - (num_bl << shy);  // Rewind to next block row.
+    src_y += (src_ystride << shy) - (num_bl << shy);
+    src_u += (src_uvstride << shuv) - (num_bl << shuv);
+    src_v += (src_uvstride << shuv) - (num_bl << shuv);
+  }
+  vp9_write_yuv_frame_420(&skinmap, yuv_skinmap_file);
+  vpx_free_frame_buffer(&skinmap);
+}
+#endif
diff --git a/libs/libvpx/vp9/encoder/vp9_skin_detection.h b/libs/libvpx/vp9/encoder/vp9_skin_detection.h
new file mode 100644
index 0000000000..73f7c39d9a
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_skin_detection.h
@@ -0,0 +1,39 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_SKIN_DETECTION_H_
+#define VP9_ENCODER_VP9_SKIN_DETECTION_H_
+
+#include "vp9/common/vp9_blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+// #define OUTPUT_YUV_SKINMAP  // Uncomment to build vp9_compute_skin_map().
+
+// Returns 1 if the (y, cb, cr) sample is classified as skin, 0 otherwise.
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr);
+// Classifies a block by applying vp9_skin_pixel() to its center samples.
+int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                           int stride, int strideuv, int bsize);
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+extern void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_SKIN_DETECTION_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_speed_features.c b/libs/libvpx/vp9/encoder/vp9_speed_features.c
new file mode 100644
index 0000000000..f6845078a6
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_speed_features.c
@@ -0,0 +1,628 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_speed_features.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Mesh search patterns for various speed settings
+static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] =
+    {{64, 4}, {28, 2}, {15, 1}, {7, 1}};
+
+#define MAX_MESH_SPEED 5  // Max speed setting for mesh motion method
+static MESH_PATTERN good_quality_mesh_patterns[MAX_MESH_SPEED + 1]
+                                              [MAX_MESH_STEP] =
+    {{{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+     {{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+     {{64, 8},  {14, 2}, {7, 1},  {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+    };
+static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] =
+    {50, 25, 15, 5, 1, 1};
+
+// Returns non-zero for frames that tend to be coded at a higher than ambient
+// quality: intra only, golden (except alt ref overlays) and alt ref frames.
+static int frame_is_boosted(const VP9_COMP *cpi) {
+  return frame_is_kf_gf_arf(cpi) || vp9_is_upper_layer_key_frame(cpi);
+}
+
+// Returns the partition size down to which the auto partition code will
+// always search (it can go lower), based on the image dimensions. The logic
+// is that how offensive ringing artefacts are depends partly on the screen
+// area over which they propagate. Propagation is limited by transform block
+// size, but the screen area taken up by a given block size will be larger
+// for a small image format stretched to full screen.
+static BLOCK_SIZE set_partition_min_limit(VP9_COMMON *const cm) {
+  const unsigned int area_720p = 1280 * 720;
+  const unsigned int area_1080p = 1920 * 1080;
+  const unsigned int frame_area = (cm->width * cm->height);
+
+  // Pick the limit from the frame area: the larger the format, the larger
+  // the returned block size.
+  if (frame_area < area_720p) {
+    return BLOCK_4X4;  // Formats smaller in area than 720P.
+  }
+  if (frame_area < area_1080p) {
+    return BLOCK_8X8;  // Formats >= 720P and < 1080P.
+  }
+  return BLOCK_16X16;  // Formats 1080P and up.
+}
+
+// Applies the GOOD-mode speed features that depend on the frame dimensions.
+// Tiers are cumulative: each "speed >= N" block overrides earlier settings.
+// "HD" below means the smaller frame dimension is at least 720 pixels.
+static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
+                                                       SPEED_FEATURES *sf,
+                                                       int speed) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int is_hd = VPXMIN(cm->width, cm->height) >= 720;
+  const int hd_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                           : DISABLE_ALL_INTER_SPLIT;
+
+  if (speed >= 1) {
+    if (is_hd) {
+      sf->disable_split_mask = hd_split_mask;
+      sf->partition_search_breakout_dist_thr = (1 << 23);
+    } else {
+      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+      sf->partition_search_breakout_dist_thr = (1 << 21);
+    }
+  }
+
+  if (speed >= 2) {
+    if (is_hd) {
+      sf->disable_split_mask = hd_split_mask;
+      sf->adaptive_pred_interp_filter = 0;
+      sf->partition_search_breakout_dist_thr = (1 << 24);
+      sf->partition_search_breakout_rate_thr = 120;
+    } else {
+      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+      sf->partition_search_breakout_dist_thr = (1 << 22);
+      sf->partition_search_breakout_rate_thr = 100;
+    }
+    sf->rd_auto_partition_min_limit = set_partition_min_limit(cm);
+  }
+
+  if (speed >= 3) {
+    if (is_hd) {
+      sf->disable_split_mask = DISABLE_ALL_SPLIT;
+      sf->schedule_mode_search = cm->base_qindex < 220;
+      sf->partition_search_breakout_dist_thr = (1 << 25);
+      sf->partition_search_breakout_rate_thr = 200;
+    } else {
+      sf->max_intra_bsize = BLOCK_32X32;
+      sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+      sf->schedule_mode_search = cm->base_qindex < 175;
+      sf->partition_search_breakout_dist_thr = (1 << 23);
+      sf->partition_search_breakout_rate_thr = 120;
+    }
+  }
+
+  // For a two pass clip that fits the criteria for animated or graphics
+  // content, or whose image edge is internal to the coded area, relax the
+  // split mask again (the speed >= 4 block below still overrides this).
+  if ((speed >= 1) && (cpi->oxcf.pass == 2) &&
+      ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+       (vp9_internal_image_edge(cpi)))) {
+    sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+  }
+
+  if (speed >= 4) {
+    sf->partition_search_breakout_dist_thr = is_hd ? (1 << 26) : (1 << 24);
+    sf->disable_split_mask = DISABLE_ALL_SPLIT;
+  }
+}
+
+// Applies the GOOD-mode speed features that do not depend on frame size.
+// Tiers are cumulative: each "speed >= N" block overrides earlier settings.
+static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
+                                   SPEED_FEATURES *sf, int speed) {
+  // Evaluated once and reused by every tier below.
+  const int boosted = frame_is_boosted(cpi);
+
+  sf->partition_search_breakout_dist_thr = (1 << 20);
+  sf->partition_search_breakout_rate_thr = 80;
+  sf->tx_size_search_breakout = 1;
+  sf->adaptive_rd_thresh = 1;
+  sf->allow_skip_recode = 1;
+  sf->less_rectangular_check = 1;
+  sf->use_square_partition_only = !boosted;
+  sf->use_square_only_threshold = BLOCK_16X16;
+
+  if (speed >= 1) {
+    if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+        vp9_internal_image_edge(cpi)) {
+      sf->use_square_partition_only = !boosted;
+    } else {
+      sf->use_square_partition_only = !frame_is_intra_only(cm);
+    }
+    sf->use_square_only_threshold = BLOCK_4X4;
+
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->mv.auto_mv_step_size = 1;
+    sf->adaptive_rd_thresh = 2;
+    sf->mv.subpel_iters_per_step = 1;
+    sf->mode_skip_start = 10;
+    sf->adaptive_pred_interp_filter = 1;
+
+    sf->recode_loop = ALLOW_RECODE_KFARFGF;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+  }
+
+  if (speed >= 2) {
+    sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
+
+    // Reference masking is not supported in dynamic scaling mode.
+    sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0;
+
+    sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
+                                 FLAG_SKIP_INTRA_DIRMISMATCH |
+                                 FLAG_SKIP_INTRA_BESTINTER |
+                                 FLAG_SKIP_COMP_BESTINTRA |
+                                 FLAG_SKIP_INTRA_LOWVAR;
+    sf->disable_filter_search_var_thresh = 100;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+    sf->allow_partition_search_skip = 1;
+  }
+
+  if (speed >= 3) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
+                                                        : USE_LARGESTALL;
+    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
+    sf->adaptive_pred_interp_filter = 0;
+    sf->adaptive_mode_search = 1;
+    sf->cb_partition_search = !boosted;
+    sf->cb_pred_filter_search = 1;
+    sf->alt_ref_search_fp = 1;
+    sf->recode_loop = ALLOW_RECODE_KFMAXBW;
+    sf->adaptive_rd_thresh = 3;
+    sf->mode_skip_start = 6;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
+    sf->adaptive_interp_filter_search = 1;
+  }
+
+  if (speed >= 4) {
+    sf->use_square_partition_only = 1;
+    sf->tx_size_search_method = USE_LARGESTALL;
+    sf->mv.search_method = BIGDIA;
+    sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+    sf->adaptive_rd_thresh = 4;
+    if (cm->frame_type != KEY_FRAME)
+      sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE;
+    sf->disable_filter_search_var_thresh = 200;
+    sf->use_lp32x32fdct = 1;
+    sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
+    sf->use_fast_coef_costing = 1;
+    sf->motion_field_mode_search = !boosted;
+    sf->partition_search_breakout_rate_thr = 300;
+  }
+
+  if (speed >= 5) {
+    int i;
+    sf->optimize_coefficients = 0;
+    sf->mv.search_method = HEX;
+    sf->disable_filter_search_var_thresh = 500;
+    for (i = 0; i < TX_SIZES; ++i) {
+      sf->intra_y_mode_mask[i] = INTRA_DC;
+      sf->intra_uv_mode_mask[i] = INTRA_DC;
+    }
+    sf->partition_search_breakout_rate_thr = 500;
+    sf->mv.reduce_first_step_size = 1;
+    sf->simple_model_rd_from_var = 1;
+  }
+}
+
+// Applies the REALTIME-mode speed features that depend on frame dimensions.
+// "HD" below means the smaller frame dimension is at least 720 pixels.
+static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi,
+    SPEED_FEATURES *sf, int speed) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int is_hd = VPXMIN(cm->width, cm->height) >= 720;
+  // Split mask shared by the HD case of speeds 1 and 2.
+  const int hd_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                           : DISABLE_ALL_INTER_SPLIT;
+
+  if (speed >= 1) {
+    if (is_hd) {
+      sf->disable_split_mask = hd_split_mask;
+    } else {
+      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+    }
+  }
+
+  if (speed >= 2) {
+    if (is_hd) {
+      sf->disable_split_mask = hd_split_mask;
+    } else {
+      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+    }
+  }
+
+  if (speed >= 5) {
+    sf->partition_search_breakout_dist_thr = is_hd ? (1 << 25) : (1 << 23);
+  }
+
+  if (speed >= 7) {
+    // HD frames get the larger encode breakout threshold.
+    sf->encode_breakout_thresh = is_hd ? 800 : 300;
+  }
+}
+
+static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
+                                 int speed, vp9e_tune_content content) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int is_keyframe = cm->frame_type == KEY_FRAME;
+  const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
+  sf->static_segmentation = 0;
+  sf->adaptive_rd_thresh = 1;
+  sf->use_fast_coef_costing = 1;
+  sf->allow_exhaustive_searches = 0;  // NOTE(review): caller sets this to 1.
+  sf->exhaustive_searches_thresh = INT_MAX;  // Also overwritten by the caller.
+  // REALTIME-mode speed features; each tier below overrides earlier ones.
+  if (speed >= 1) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->less_rectangular_check = 1;
+    sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
+                                                        : USE_LARGESTALL;
+
+    sf->use_rd_breakout = 1;
+
+    sf->adaptive_motion_search = 1;
+    sf->adaptive_pred_interp_filter = 1;
+    sf->mv.auto_mv_step_size = 1;
+    sf->adaptive_rd_thresh = 2;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+  }
+
+  if (speed >= 2) {
+    sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
+                                 FLAG_SKIP_INTRA_DIRMISMATCH |
+                                 FLAG_SKIP_INTRA_BESTINTER |
+                                 FLAG_SKIP_COMP_BESTINTRA |
+                                 FLAG_SKIP_INTRA_LOWVAR;
+    sf->adaptive_pred_interp_filter = 2;
+
+    // Reference masking only enabled for 1 spatial layer, and if none of the
+    // references have been scaled. The latter condition needs to be checked
+    // for external or internal dynamic resize.
+    sf->reference_masking = (cpi->svc.number_spatial_layers == 1);
+    if (sf->reference_masking == 1 &&
+        (cpi->external_resize == 1 ||
+         cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) {
+      MV_REFERENCE_FRAME ref_frame;
+      static const int flag_list[4] =
+          {0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
+      for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+        const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+        if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) {
+          const struct scale_factors *const scale_fac =
+              &cm->frame_refs[ref_frame - 1].sf;
+          if (vp9_is_scaled(scale_fac))
+            sf->reference_masking = 0;
+        }
+      }
+    }
+
+    sf->disable_filter_search_var_thresh = 50;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+    sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
+    sf->adjust_partitioning_from_last_frame = 1;
+    sf->last_partitioning_redo_frequency = 3;
+    sf->use_lp32x32fdct = 1;
+    sf->mode_skip_start = 11;
+    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+  }
+
+  if (speed >= 3) {
+    sf->use_square_partition_only = 1;
+    sf->disable_filter_search_var_thresh = 100;
+    sf->use_uv_intra_rd_estimate = 1;
+    sf->skip_encode_sb = 1;
+    sf->mv.subpel_iters_per_step = 1;
+    sf->adaptive_rd_thresh = 4;
+    sf->mode_skip_start = 6;
+    sf->allow_skip_recode = 0;
+    sf->optimize_coefficients = 0;
+    sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    sf->lpf_pick = LPF_PICK_FROM_Q;
+  }
+
+  if (speed >= 4) {
+    int i;
+    sf->last_partitioning_redo_frequency = 4;
+    sf->adaptive_rd_thresh = 5;
+    sf->use_fast_coef_costing = 0;
+    sf->auto_min_max_partition_size = STRICT_NEIGHBORING_MIN_MAX;
+    sf->adjust_partitioning_from_last_frame =
+        cm->last_frame_type != cm->frame_type || (0 ==
+        (frames_since_key + 1) % sf->last_partitioning_redo_frequency);
+    sf->mv.subpel_force_stop = 1;
+    for (i = 0; i < TX_SIZES; i++) {
+      sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
+      sf->intra_uv_mode_mask[i] = INTRA_DC;
+    }
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+    sf->frame_parameter_update = 0;
+    sf->mv.search_method = FAST_HEX;
+
+    sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
+    sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
+    sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
+    sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
+    sf->max_intra_bsize = BLOCK_32X32;
+    sf->allow_skip_recode = 1;
+  }
+
+  if (speed >= 5) {
+    sf->use_quant_fp = !is_keyframe;
+    sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX
+                                                  : STRICT_NEIGHBORING_MIN_MAX;
+    sf->default_max_partition_size = BLOCK_32X32;
+    sf->default_min_partition_size = BLOCK_8X8;
+    sf->force_frame_boost = is_keyframe ||
+        (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
+    sf->max_delta_qindex = is_keyframe ? 20 : 15;
+    sf->partition_search_type = REFERENCE_PARTITION;
+    sf->use_nonrd_pick_mode = 1;
+    sf->allow_skip_recode = 0;
+    sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO;
+    sf->adaptive_rd_thresh = 2;
+    // This feature is only enabled when partition search is disabled.
+    sf->reuse_inter_pred_sby = 1;
+    sf->partition_search_breakout_rate_thr = 200;
+    sf->coeff_prob_appx_step = 4;
+    sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED;
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH;
+    sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
+    sf->simple_model_rd_from_var = 1;
+
+    if (!is_keyframe) {
+      int i;
+      if (content == VP9E_CONTENT_SCREEN) {
+        for (i = 0; i < BLOCK_SIZES; ++i)
+          sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V;
+      } else {
+        for (i = 0; i < BLOCK_SIZES; ++i)
+          if (i > BLOCK_16X16)
+            sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
+          else
+            // Use H and V intra mode for block sizes <= 16X16.
+            sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V;
+      }
+    }
+    if (content == VP9E_CONTENT_SCREEN) {
+      sf->short_circuit_flat_blocks = 1;
+    }
+  }
+
+  if (speed >= 6) {
+    sf->partition_search_type = VAR_BASED_PARTITION;
+    // Turn on this to use non-RD key frame coding mode.
+    sf->use_nonrd_pick_mode = 1;
+    sf->mv.search_method = NSTEP;
+    sf->mv.reduce_first_step_size = 1;
+    sf->skip_encode_sb = 0;
+  }
+
+  if (speed >= 7) {
+    sf->adaptive_rd_thresh = 3;
+    sf->mv.search_method = FAST_DIAMOND;
+    sf->mv.fullpel_search_step_param = 10;
+    if (cpi->svc.number_temporal_layers > 2 &&
+        cpi->svc.temporal_layer_id == 0) {
+      sf->mv.search_method = NSTEP;
+      sf->mv.fullpel_search_step_param = 6;
+    }
+  }
+  if (speed >= 8) {
+    sf->adaptive_rd_thresh = 4;
+    sf->mv.subpel_force_stop = 2;
+    sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
+  }
+}
+
+void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  RD_OPT *const rd = &cpi->rd;
+  const int is_realtime = oxcf->mode == REALTIME;
+  int ref;
+
+  // Mode-specific settings; modes other than REALTIME and GOOD keep the
+  // values already present in |sf|.
+  if (is_realtime)
+    set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
+  else if (oxcf->mode == GOOD)
+    set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
+
+  if (sf->disable_split_mask == DISABLE_ALL_SPLIT)
+    sf->adaptive_pred_interp_filter = 0;
+
+  // In REALTIME mode, raise a non-zero encode breakout up to the threshold
+  // chosen by the speed features.
+  if (cpi->encode_breakout && is_realtime &&
+      sf->encode_breakout_thresh > cpi->encode_breakout)
+    cpi->encode_breakout = sf->encode_breakout_thresh;
+
+  // Check for masked out split cases.
+  for (ref = 0; ref < MAX_REFS; ++ref)
+    if (sf->disable_split_mask & (1 << ref))
+      rd->thresh_mult_sub8x8[ref] = INT_MAX;
+}
+
+void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->td.mb;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  int i;
+
+  // Start from best quality defaults; the mode setters below override them.
+  sf->frame_parameter_update = 1;
+  sf->mv.search_method = NSTEP;
+  sf->recode_loop = ALLOW_RECODE;
+  sf->mv.subpel_search_method = SUBPEL_TREE;
+  sf->mv.subpel_iters_per_step = 2;
+  sf->mv.subpel_force_stop = 0;
+  sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf);
+  sf->mv.reduce_first_step_size = 0;
+  sf->coeff_prob_appx_step = 1;
+  sf->mv.auto_mv_step_size = 0;
+  sf->mv.fullpel_search_step_param = 6;
+  sf->comp_inter_joint_search_thresh = BLOCK_4X4;
+  sf->tx_size_search_method = USE_FULL_RD;
+  sf->use_lp32x32fdct = 0;
+  sf->adaptive_motion_search = 0;
+  sf->adaptive_pred_interp_filter = 0;
+  sf->adaptive_mode_search = 0;
+  sf->cb_pred_filter_search = 0;
+  sf->cb_partition_search = 0;
+  sf->motion_field_mode_search = 0;
+  sf->alt_ref_search_fp = 0;
+  sf->use_quant_fp = 0;
+  sf->reference_masking = 0;
+  sf->partition_search_type = SEARCH_PARTITION;
+  sf->less_rectangular_check = 0;
+  sf->use_square_partition_only = 0;
+  sf->use_square_only_threshold = BLOCK_SIZES;
+  sf->auto_min_max_partition_size = NOT_IN_USE;
+  sf->rd_auto_partition_min_limit = BLOCK_4X4;
+  sf->default_max_partition_size = BLOCK_64X64;
+  sf->default_min_partition_size = BLOCK_4X4;
+  sf->adjust_partitioning_from_last_frame = 0;
+  sf->last_partitioning_redo_frequency = 4;
+  sf->disable_split_mask = 0;
+  sf->mode_search_skip_flags = 0;
+  sf->force_frame_boost = 0;
+  sf->max_delta_qindex = 0;
+  sf->disable_filter_search_var_thresh = 0;
+  sf->adaptive_interp_filter_search = 0;
+  sf->allow_partition_search_skip = 0;
+
+  for (i = 0; i < TX_SIZES; i++) {
+    sf->intra_y_mode_mask[i] = INTRA_ALL;
+    sf->intra_uv_mode_mask[i] = INTRA_ALL;
+  }
+  sf->use_rd_breakout = 0;
+  sf->skip_encode_sb = 0;
+  sf->use_uv_intra_rd_estimate = 0;
+  sf->allow_skip_recode = 0;
+  sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+  sf->use_fast_coef_updates = TWO_LOOP;
+  sf->use_fast_coef_costing = 0;
+  sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
+  sf->schedule_mode_search = 0;
+  sf->use_nonrd_pick_mode = 0;
+  for (i = 0; i < BLOCK_SIZES; ++i)
+    sf->inter_mode_mask[i] = INTER_ALL;
+  sf->max_intra_bsize = BLOCK_64X64;
+  sf->reuse_inter_pred_sby = 0;
+  // This setting only takes effect when partition_search_type is set
+  // to FIXED_PARTITION.
+  sf->always_this_block_size = BLOCK_16X16;
+  sf->search_type_check_frequency = 50;
+  sf->encode_breakout_thresh = 0;
+  // Recode loop tolerance %.
+  sf->recode_tolerance = 25;
+  sf->default_interp_filter = SWITCHABLE;
+  sf->simple_model_rd_from_var = 0;
+  sf->short_circuit_flat_blocks = 0;
+
+  // Some speed-up features even for best quality as minimal impact on quality.
+  sf->adaptive_rd_thresh = 1;
+  sf->tx_size_search_breakout = 1;
+  sf->partition_search_breakout_dist_thr = (1 << 19);
+  sf->partition_search_breakout_rate_thr = 80;
+
+  if (oxcf->mode == REALTIME)
+    set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
+  else if (oxcf->mode == GOOD)
+    set_good_speed_feature(cpi, cm, sf, oxcf->speed);
+
+  cpi->full_search_sad = vp9_full_search_sad;
+  cpi->diamond_search_sad = vp9_diamond_search_sad;
+
+  sf->allow_exhaustive_searches = 1;  // Overrides set_rt_speed_feature()'s 0.
+  if (oxcf->mode == BEST) {
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+      sf->exhaustive_searches_thresh = (1 << 20);
+    else
+      sf->exhaustive_searches_thresh = (1 << 21);
+    sf->max_exaustive_pct = 100;
+    for (i = 0; i < MAX_MESH_STEP; ++i) {
+      sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
+      sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
+    }
+  } else {  // Any mode other than BEST.
+    int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+      sf->exhaustive_searches_thresh = (1 << 22);
+    else
+      sf->exhaustive_searches_thresh = (1 << 23);
+    sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
+    if (speed > 0)
+      sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
+
+    for (i = 0; i < MAX_MESH_STEP; ++i) {
+      sf->mesh_patterns[i].range =
+          good_quality_mesh_patterns[speed][i].range;
+      sf->mesh_patterns[i].interval =
+          good_quality_mesh_patterns[speed][i].interval;
+    }
+  }
+
+  // Slow quant, dct and trellis not worthwhile for first pass
+  // so make sure they are always turned off.
+  if (oxcf->pass == 1)
+    sf->optimize_coefficients = 0;
+
+  // No recode for 1 pass.
+  if (oxcf->pass == 0) {
+    sf->recode_loop = DISALLOW_RECODE;
+    sf->optimize_coefficients = 0;
+  }
+
+  if (sf->mv.subpel_search_method == SUBPEL_TREE) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_more;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_evenmore;
+  }
+
+  x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
+
+  x->min_partition_size = sf->default_min_partition_size;
+  x->max_partition_size = sf->default_max_partition_size;
+
+  if (!cpi->oxcf.frame_periodic_boost) {
+    sf->max_delta_qindex = 0;
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_speed_features.h b/libs/libvpx/vp9/encoder/vp9_speed_features.h
new file mode 100644
index 0000000000..fa2f79d31e
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_speed_features.h
@@ -0,0 +1,456 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_SPEED_FEATURES_H_
+#define VP9_ENCODER_VP9_SPEED_FEATURES_H_
+
+#include "vp9/common/vp9_enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+  INTRA_ALL       = (1 << DC_PRED) |
+                    (1 << V_PRED) | (1 << H_PRED) |
+                    (1 << D45_PRED) | (1 << D135_PRED) |
+                    (1 << D117_PRED) | (1 << D153_PRED) |
+                    (1 << D207_PRED) | (1 << D63_PRED) |
+                    (1 << TM_PRED),
+  INTRA_DC        = (1 << DC_PRED),
+  INTRA_DC_TM     = (1 << DC_PRED) | (1 << TM_PRED),
+  INTRA_DC_H_V    = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+  INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) |
+                    (1 << H_PRED)
+};
+
+enum {
+  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
+  INTER_NEAREST = (1 << NEARESTMV),
+  INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
+  INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV),
+  INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV),
+  INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+  INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV),
+};
+
+enum {
+  DISABLE_ALL_INTER_SPLIT   = (1 << THR_COMP_GA) |
+                              (1 << THR_COMP_LA) |
+                              (1 << THR_ALTR) |
+                              (1 << THR_GOLD) |
+                              (1 << THR_LAST),
+
+  DISABLE_ALL_SPLIT         = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+  DISABLE_COMPOUND_SPLIT    = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+  LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) |
+                              (1 << THR_COMP_LA) |
+                              (1 << THR_ALTR) |
+                              (1 << THR_GOLD)
+};
+
+typedef enum {
+  DIAMOND = 0,
+  NSTEP = 1,
+  HEX = 2,
+  BIGDIA = 3,
+  SQUARE = 4,
+  FAST_HEX = 5,
+  FAST_DIAMOND = 6
+} SEARCH_METHODS;
+
+typedef enum {
+  // No recode.
+  DISALLOW_RECODE = 0,
+  // Allow recode for KF and exceeding maximum frame bandwidth.
+  ALLOW_RECODE_KFMAXBW = 1,
+  // Allow recode only for KF/ARF/GF frames.
+  ALLOW_RECODE_KFARFGF = 2,
+  // Allow recode for all frames based on bitrate constraints.
+  ALLOW_RECODE = 3,
+} RECODE_LOOP_TYPE;
+
+typedef enum {
+  SUBPEL_TREE = 0,
+  SUBPEL_TREE_PRUNED = 1,           // Prunes 1/2-pel searches
+  SUBPEL_TREE_PRUNED_MORE = 2,      // Prunes 1/2-pel searches more aggressively
+  SUBPEL_TREE_PRUNED_EVENMORE = 3,  // Prunes 1/2- and 1/4-pel searches
+  // Other methods to come
+} SUBPEL_SEARCH_METHODS;
+
+typedef enum {
+  NO_MOTION_THRESHOLD = 0,
+  LOW_MOTION_THRESHOLD = 7
+} MOTION_THRESHOLD;
+
+typedef enum {
+  USE_FULL_RD = 0,
+  USE_LARGESTALL,
+  USE_TX_8X8
+} TX_SIZE_SEARCH_METHOD;
+
+typedef enum {
+  NOT_IN_USE = 0,
+  RELAXED_NEIGHBORING_MIN_MAX = 1,
+  STRICT_NEIGHBORING_MIN_MAX = 2
+} AUTO_MIN_MAX_MODE;
+
+typedef enum {
+  // Try the full image with different values.
+  LPF_PICK_FROM_FULL_IMAGE,
+  // Try a small portion of the image with different values.
+  LPF_PICK_FROM_SUBIMAGE,
+  // Estimate the level based on quantizer and frame type
+  LPF_PICK_FROM_Q,
+  // Pick 0 to disable LPF if LPF was enabled last frame
+  LPF_PICK_MINIMAL_LPF
+} LPF_PICK_METHOD;
+
+typedef enum {
+  // Terminate search early based on distortion so far compared to
+  // qp step, distortion in the neighborhood of the frame, etc.
+  FLAG_EARLY_TERMINATE = 1 << 0,
+
+  // Skips comp inter modes if the best so far is an intra mode.
+  FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
+
+  // Skips oblique intra modes if the best so far is an inter mode.
+  FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
+
+  // Skips oblique intra modes  at angles 27, 63, 117, 153 if the best
+  // intra so far is not one of the neighboring directions.
+  FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
+
+  // Skips intra modes other than DC_PRED if the source variance is small
+  FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
+} MODE_SEARCH_SKIP_LOGIC;
+
+typedef enum {
+  FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP,
+  FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
+  FLAG_SKIP_EIGHTTAP_SHARP = 1 << EIGHTTAP_SHARP,
+} INTERP_FILTER_MASK;
+
+typedef enum {
+  // Search partitions using RD/NONRD criterion
+  SEARCH_PARTITION,
+
+  // Always use a fixed size partition
+  FIXED_PARTITION,
+
+  REFERENCE_PARTITION,
+
+  // Use an arbitrary partitioning scheme based on source variance within
+  // a 64X64 SB
+  VAR_BASED_PARTITION,
+
+  // Use non-fixed partitions based on source variance
+  SOURCE_VAR_BASED_PARTITION
+} PARTITION_SEARCH_TYPE;
+
+typedef enum {
+  // Does a dry run to see if any of the contexts need to be updated or not,
+  // before the final run.
+  TWO_LOOP = 0,
+
+  // No dry run, also only half the coef contexts and bands are updated.
+  // The rest are not updated at all.
+  ONE_LOOP_REDUCED = 1
+} FAST_COEFF_UPDATE;
+
+typedef struct MV_SPEED_FEATURES {
+  // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
+  SEARCH_METHODS search_method;
+
+  // This parameter controls which step in the n-step process we start at.
+  // It's changed adaptively based on circumstances.
+  int reduce_first_step_size;
+
+  // If this is set to 1, we limit the motion search range to 2 times the
+  // largest motion vector found in the last frame.
+  int auto_mv_step_size;
+
+  // Subpel search method; see SUBPEL_SEARCH_METHODS. The base subpel_tree
+  // method does a subpixel logarithmic search that keeps stepping at 1/2
+  // pixel units until you stop getting a gain, and then goes on to 1/4 and
+  // repeats the same process. Along the way it skips many diagonals.
+  SUBPEL_SEARCH_METHODS subpel_search_method;
+
+  // Maximum number of steps in logarithmic subpel search before giving up.
+  int subpel_iters_per_step;
+
+  // Control when to stop subpel search
+  int subpel_force_stop;
+
+  // This variable sets the step_param used in full pel motion search.
+  int fullpel_search_step_param;
+} MV_SPEED_FEATURES;
+
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN {
+  int range;
+  int interval;
+} MESH_PATTERN;
+
+typedef struct SPEED_FEATURES {
+  MV_SPEED_FEATURES mv;
+
+  // Frame level coding parameter update
+  int frame_parameter_update;
+
+  RECODE_LOOP_TYPE recode_loop;
+
+  // Trellis (dynamic programming) optimization of quantized values (+1, 0).
+  int optimize_coefficients;
+
+  // Always set to 0. If on, it enables 0 cost background transmission
+  // (except for the initial transmission of the segmentation). The feature is
+  // disabled because the addition of very large block sizes makes the
+  // backgrounds very cheap to encode, and the segmentation we have
+  // adds overhead.
+  int static_segmentation;
+
+  // If 1 we iterate finding a best reference for 2 ref frames together - via
+  // a log search that iterates 4 times (check around mv for last for best
+  // error of combined predictor then check around mv for alt). If 0 we
+  // just use the best motion vector found for each frame by itself.
+  BLOCK_SIZE comp_inter_joint_search_thresh;
+
+  // This variable is used to cap the maximum number of times we skip testing a
+  // mode to be evaluated. A high value means we will be faster.
+  int adaptive_rd_thresh;
+
+  // Enables skipping the reconstruction step (idct, recon) in the
+  // intermediate steps assuming the last frame didn't have too many intra
+  // blocks and the q is less than a threshold.
+  int skip_encode_sb;
+  int skip_encode_frame;
+  // Speed feature to allow or disallow skipping of recode at block
+  // level within a frame.
+  int allow_skip_recode;
+
+  // Coefficient probability model approximation step size
+  int coeff_prob_appx_step;
+
+  // The threshold determines how slow the motion is; it is used when
+  // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
+  MOTION_THRESHOLD lf_motion_threshold;
+
+  // Determine which method we use to determine transform size. We can choose
+  // between options like full rd, largest for prediction size, largest
+  // for intra and model coefs for the rest.
+  TX_SIZE_SEARCH_METHOD tx_size_search_method;
+
+  // Low precision 32x32 fdct keeps everything in 16 bits and thus is less
+  // precise but significantly faster than the non lp version.
+  int use_lp32x32fdct;
+
+  // After looking at the first set of modes (set by index here), skip
+  // checking modes for reference frames that don't match the reference frame
+  // of the best so far.
+  int mode_skip_start;
+
+  // TODO(JBB): Remove this.
+  int reference_masking;
+
+  PARTITION_SEARCH_TYPE partition_search_type;
+
+  // Used if partition_search_type = FIXED_PARTITION
+  BLOCK_SIZE always_this_block_size;
+
+  // Skip rectangular partition test when partition type none gives better
+  // rd than partition type split.
+  int less_rectangular_check;
+
+  // Disable testing non square partitions. (eg 16x32)
+  int use_square_partition_only;
+  BLOCK_SIZE use_square_only_threshold;
+
+  // Sets min and max partition sizes for this 64x64 region based on the
+  // same 64x64 in last encoded frame, and the left and above neighbor.
+  AUTO_MIN_MAX_MODE auto_min_max_partition_size;
+  // Ensures the rd based auto partition search will always
+  // go down at least to the specified level.
+  BLOCK_SIZE rd_auto_partition_min_limit;
+
+  // Min and max partition size we enable (block_size) as per auto
+  // min max, but also used by adjust partitioning, and pick_partitioning.
+  BLOCK_SIZE default_min_partition_size;
+  BLOCK_SIZE default_max_partition_size;
+
+  // Whether or not we allow partitions one smaller or one greater than the last
+  // frame's partitioning. Only used if use_lastframe_partitioning is set.
+  int adjust_partitioning_from_last_frame;
+
+  // How frequently we redo the partitioning from scratch. Only used if
+  // use_lastframe_partitioning is set.
+  int last_partitioning_redo_frequency;
+
+  // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
+  // it always, to allow it for only Last frame and Intra, disable it for all
+  // inter modes or to enable it always.
+  int disable_split_mask;
+
+  // TODO(jingning): combine the related motion search speed features
+  // This allows us to use motion search at other sizes as a starting
+  // point for this motion search and limits the search range around it.
+  int adaptive_motion_search;
+
+  // Flag for allowing some use of exhaustive searches;
+  int allow_exhaustive_searches;
+
+  // Threshold for allowing exhaustive motion search.
+  int exhaustive_searches_thresh;
+
+  // Maximum number of exhaustive searches for a frame.
+  int max_exaustive_pct;
+
+  // Pattern to be used for any exhaustive mesh searches.
+  MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
+  int schedule_mode_search;
+
+  // Allows sub 8x8 modes to use the prediction filter that was determined
+  // best for 8x8 mode. If set to 0 we always re check all the filters for
+  // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
+  // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
+  int adaptive_pred_interp_filter;
+
+  // Adaptive prediction mode search
+  int adaptive_mode_search;
+
+  // Chessboard pattern prediction filter type search
+  int cb_pred_filter_search;
+
+  int cb_partition_search;
+
+  int motion_field_mode_search;
+
+  int alt_ref_search_fp;
+
+  // Fast quantization process path
+  int use_quant_fp;
+
+  // Use finer quantizer in every other few frames that run variable block
+  // partition type search.
+  int force_frame_boost;
+
+  // Maximally allowed base quantization index fluctuation.
+  int max_delta_qindex;
+
+  // Implements various heuristics to skip searching modes.
+  // The heuristics selected are based on flags
+  // defined in the MODE_SEARCH_SKIP_LOGIC enum.
+  unsigned int mode_search_skip_flags;
+
+  // A source variance threshold below which filter search is disabled
+  // Choose a very large value (UINT_MAX) to use 8-tap always
+  unsigned int disable_filter_search_var_thresh;
+
+  // These bit masks allow you to enable or disable intra modes for each
+  // transform size separately.
+  int intra_y_mode_mask[TX_SIZES];
+  int intra_uv_mode_mask[TX_SIZES];
+
+  // These bit masks allow you to enable or disable intra modes for each
+  // prediction block size separately.
+  int intra_y_mode_bsize_mask[BLOCK_SIZES];
+
+  // This variable enables an early break out of mode testing if the model for
+  // rd built from the prediction signal indicates a value that's much
+  // higher than the best rd we've seen so far.
+  int use_rd_breakout;
+
+  // This enables us to use an estimate for intra rd based on dc mode rather
+  // than choosing an actual uv mode in the stage of encoding before the actual
+  // final encode.
+  int use_uv_intra_rd_estimate;
+
+  // This feature controls how the loop filter level is determined.
+  LPF_PICK_METHOD lpf_pick;
+
+  // This feature limits the number of coefficients updates we actually do
+  // by only looking at counts from 1/2 the bands.
+  FAST_COEFF_UPDATE use_fast_coef_updates;
+
+  // This flag controls the use of non-RD mode decision.
+  int use_nonrd_pick_mode;
+
+  // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
+  // modes are used in order from LSB to MSB for each BLOCK_SIZE.
+  int inter_mode_mask[BLOCK_SIZES];
+
+  // This feature controls whether we do the expensive context update and
+  // calculation in the rd coefficient costing loop.
+  int use_fast_coef_costing;
+
+  // This feature controls the tolerance vs target used in deciding whether to
+  // recode a frame. It has no meaning if recode is disabled.
+  int recode_tolerance;
+
+  // This variable controls the maximum block size where intra blocks can be
+  // used in inter frames.
+  // TODO(aconverse): Fold this into one of the other many mode skips
+  BLOCK_SIZE max_intra_bsize;
+
+  // The frequency that we check if SOURCE_VAR_BASED_PARTITION or
+  // FIXED_PARTITION search type should be used.
+  int search_type_check_frequency;
+
+  // When partition is pre-set, the inter prediction result from pick_inter_mode
+  // can be reused in final block encoding process. It is enabled only for real-
+  // time mode speed 6.
+  int reuse_inter_pred_sby;
+
+  // This variable sets the encode_breakout threshold. Currently, it is only
+  // enabled in real time mode.
+  int encode_breakout_thresh;
+
+  // default interp filter choice
+  INTERP_FILTER default_interp_filter;
+
+  // Early termination in transform size search, which only applies while
+  // tx_size_search_method is USE_FULL_RD.
+  int tx_size_search_breakout;
+
+  // adaptive interp_filter search to allow skip of certain filter types.
+  int adaptive_interp_filter_search;
+
+  // mask for skip evaluation of certain interp_filter type.
+  INTERP_FILTER_MASK interp_filter_search_mask;
+
+  // Partition search early breakout thresholds.
+  int64_t partition_search_breakout_dist_thr;
+  int partition_search_breakout_rate_thr;
+
+  // Allow skipping partition search for still image frame
+  int allow_partition_search_skip;
+
+  // Fast approximation of vp9_model_rd_from_var_lapndz
+  int simple_model_rd_from_var;
+
+  // Skip a number of expensive mode evaluations for blocks with zero source
+  // variance.
+  int short_circuit_flat_blocks;
+} SPEED_FEATURES;
+
+struct VP9_COMP;
+
+void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi);
+void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_SPEED_FEATURES_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_subexp.c b/libs/libvpx/vp9/encoder/vp9_subexp.c
new file mode 100644
index 0000000000..cbd3c49466
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_subexp.c
@@ -0,0 +1,197 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vpx_dsp/bitwriter.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_subexp.h"
+// Expands to an expression over a variable named `upd` in scope at the use.
+#define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
+
+static const uint8_t update_bits[255] = {
+   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  0,
+};
+
+// Folds v around m: beyond 2*m it is returned as-is, else codes interleave.
+static int recenter_nonneg(int v, int m) {
+  const int twice_m = m << 1;
+  if (v > twice_m) return v;
+  // At or above m maps to even results, below m to odd results.
+  return (v >= m) ? ((v - m) << 1)
+                  : (((m - v) << 1) - 1);
+}
+
+// Maps (v, m) = (new, old) probability to its delta-code index via map_table.
+static int remap_prob(int v, int m) {
+  int i;
+  static const uint8_t map_table[MAX_PROB - 1] = {
+    // generated by:
+    //   map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM);
+     20,  21,  22,  23,  24,  25,   0,  26,  27,  28,  29,  30,  31,  32,  33,
+     34,  35,  36,  37,   1,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+     48,  49,   2,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,
+      3,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,   4,  74,
+     75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,   5,  86,  87,  88,
+     89,  90,  91,  92,  93,  94,  95,  96,  97,   6,  98,  99, 100, 101, 102,
+    103, 104, 105, 106, 107, 108, 109,   7, 110, 111, 112, 113, 114, 115, 116,
+    117, 118, 119, 120, 121,   8, 122, 123, 124, 125, 126, 127, 128, 129, 130,
+    131, 132, 133,   9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
+    145,  10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,  11,
+    158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,  12, 170, 171,
+    172, 173, 174, 175, 176, 177, 178, 179, 180, 181,  13, 182, 183, 184, 185,
+    186, 187, 188, 189, 190, 191, 192, 193,  14, 194, 195, 196, 197, 198, 199,
+    200, 201, 202, 203, 204, 205,  15, 206, 207, 208, 209, 210, 211, 212, 213,
+    214, 215, 216, 217,  16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
+    228, 229,  17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
+     18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,  19,
+  };
+  v--;
+  m--;
+  i = ((m << 1) <= MAX_PROB)
+          ? recenter_nonneg(v, m) - 1
+          : recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1;
+  // v == m gives i == -1: "unchanged" has no table entry, so reject it here.
+  assert(i >= 0 && i < MAX_PROB - 1);
+  return map_table[i];
+}
+
+// Cost of coding newp as a delta from oldp, scaled by VP9_PROB_COST_SHIFT.
+static int prob_diff_update_cost(vpx_prob newp, vpx_prob oldp) {
+  return update_bits[remap_prob(newp, oldp)] << VP9_PROB_COST_SHIFT;
+}
+
+// Short codes (l-1 bits) for v < m; other values add one trailing low bit.
+static void encode_uniform(vpx_writer *w, int v) {
+  const int l = 8, m = (1 << l) - 191;
+  if (v < m) {
+    vpx_write_literal(w, v, l - 1);
+    return;
+  }
+  vpx_write_literal(w, m + ((v - m) >> 1), l - 1);
+  vpx_write_literal(w, (v - m) & 1, 1);
+}
+
+static INLINE int write_bit_gte(vpx_writer *w, int word, int test) {
+  vpx_write_literal(w, word >= test, 1);  // emit the comparison as one bit
+  return word >= test;                    // and return the same flag
+}
+
+static void encode_term_subexp(vpx_writer *w, int word) {
+  if (!write_bit_gte(w, word, 16)) {         // word < 16: 4-bit literal
+    vpx_write_literal(w, word, 4);
+  } else if (!write_bit_gte(w, word, 32)) {  // 16 <= word < 32: 4-bit offset
+    vpx_write_literal(w, word - 16, 4);
+  } else if (!write_bit_gte(w, word, 64)) {  // 32 <= word < 64: 5-bit offset
+    vpx_write_literal(w, word - 32, 5);
+  } else {                                   // word >= 64: uniform code
+    encode_uniform(w, word - 64);
+  }
+}
+
+// Remaps the (newp, oldp) pair to a delta index and writes its subexp code.
+void vp9_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) {
+  encode_term_subexp(w, remap_prob(newp, oldp));
+}
+
+// Scans candidates from *bestp toward oldp, stores the one with the largest
+// positive net saving in *bestp (oldp if none) and returns that saving.
+int vp9_prob_diff_update_savings_search(const unsigned int *ct,
+                                        vpx_prob oldp, vpx_prob *bestp,
+                                        vpx_prob upd) {
+  const int base_cost = cost_branch256(ct, oldp);
+  const int direction = (*bestp > oldp) ? -1 : 1;
+  vpx_prob cand, winner = oldp;
+  int max_gain = 0;
+  for (cand = *bestp; cand != oldp; cand += direction) {
+    const int gain = base_cost - cost_branch256(ct, cand) -
+                     (prob_diff_update_cost(cand, oldp) + vp9_cost_upd256);
+    if (gain > max_gain) {
+      max_gain = gain;
+      winner = cand;
+    }
+  }
+  *bestp = winner;
+  return max_gain;
+}
+
+int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+                                              const vpx_prob *oldp,
+                                              vpx_prob *bestp,
+                                              vpx_prob upd,
+                                              int stepsize) {
+  int i, old_b, new_b, update_b, savings, bestsavings;
+  int newp;  // int, not vpx_prob: range-checked against [1, 255] in the loop
+  const int step_sign = *bestp > oldp[PIVOT_NODE] ? -1 : 1;
+  const int step = stepsize * step_sign;
+  vpx_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
+  vp9_model_to_full_probs(oldp, oldplist);
+  memcpy(newplist, oldp, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
+  for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
+    old_b += cost_branch256(ct + 2 * i, oldplist[i]);
+  old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
+  // Default result: zero savings, pivot probability left at its old value.
+  bestsavings = 0;
+  bestnewp = oldp[PIVOT_NODE];
+
+  assert(stepsize > 0);
+  // Step from *bestp toward oldp[PIVOT_NODE]; stop on reaching or passing it.
+  for (newp = *bestp; (newp - oldp[PIVOT_NODE]) * step_sign < 0;
+      newp += step) {
+    if (newp < 1 || newp > 255)
+      continue;
+    newplist[PIVOT_NODE] = newp;
+    vp9_model_to_full_probs(newplist, newplist);  // source == destination
+    for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+      new_b += cost_branch256(ct + 2 * i, newplist[i]);
+    new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
+    update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
+        vp9_cost_upd256;
+    savings = old_b - new_b - update_b;
+    if (savings > bestsavings) {
+      bestsavings = savings;
+      bestnewp = newp;
+    }
+  }
+
+  *bestp = bestnewp;
+  return bestsavings;
+}
+
+// Derives a candidate from ct; if the search finds a positive saving, writes
+// flag 1 plus the delta and stores the chosen value in *oldp, else flag 0.
+void vp9_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp,
+                               const unsigned int ct[2]) {
+  const vpx_prob upd = DIFF_UPDATE_PROB;
+  vpx_prob candidate = get_binary_prob(ct[0], ct[1]);
+  const int gain =
+      vp9_prob_diff_update_savings_search(ct, *oldp, &candidate, upd);
+  assert(candidate >= 1);
+  vpx_write(w, gain > 0, upd);
+  if (gain > 0) {
+    vp9_write_prob_diff_update(w, candidate, *oldp);
+    *oldp = candidate;
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_subexp.h b/libs/libvpx/vp9/encoder/vp9_subexp.h
new file mode 100644
index 0000000000..b968232322
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_subexp.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_SUBEXP_H_
+#define VP9_ENCODER_VP9_SUBEXP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vpx_dsp/prob.h"
+
+struct vpx_writer;
+
+void vp9_write_prob_diff_update(struct vpx_writer *w,
+                                vpx_prob newp, vpx_prob oldp);
+
+void vp9_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp,
+                               const unsigned int ct[2]);
+
+int vp9_prob_diff_update_savings_search(const unsigned int *ct,
+                                        vpx_prob oldp, vpx_prob *bestp,
+                                        vpx_prob upd);
+
+
+int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+                                              const vpx_prob *oldp,
+                                              vpx_prob *bestp,
+                                              vpx_prob upd,
+                                              int stepsize);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_SUBEXP_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c
new file mode 100644
index 0000000000..9724df4cd7
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -0,0 +1,793 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#define SMALL_FRAME_WIDTH  32
+#define SMALL_FRAME_HEIGHT 16
+
+// Resets the SVC state in cpi->svc and initializes every (spatial, temporal)
+// layer context from cpi->oxcf.  Allocation failures are reported through
+// vpx_internal_error(); a buffer that failed to allocate is never written.
+void vp9_init_layer_context(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  // Size in bytes of each per-layer cyclic refresh map (one-byte entries).
+  const size_t map_size = (size_t)cpi->common.mi_rows * cpi->common.mi_cols;
+  int sl, tl, i;
+  int alt_ref_idx = svc->number_spatial_layers;
+
+  svc->spatial_layer_id = 0;
+  svc->temporal_layer_id = 0;
+  svc->first_spatial_layer_to_encode = 0;
+  svc->rc_drop_superframe = 0;
+  svc->force_zero_mode_spatial_ref = 0;
+  svc->use_base_mv = 0;
+  svc->current_superframe = 0;
+  for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1;
+  // Defaults for the application-supplied flags and buffer indices.
+  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+    svc->ext_frame_flags[sl] = 0;
+    svc->ext_lst_fb_idx[sl] = 0;
+    svc->ext_gld_fb_idx[sl] = 1;
+    svc->ext_alt_fb_idx[sl] = 2;
+  }
+
+  // Two-pass, non-error-resilient encoding uses a small frame filled with
+  // 0x80 as the "empty frame" (see vp9_svc_start_frame()).
+  if (oxcf->error_resilient_mode == 0 && oxcf->pass == 2) {
+    if (vpx_realloc_frame_buffer(
+            &svc->empty_frame.img, SMALL_FRAME_WIDTH, SMALL_FRAME_HEIGHT,
+            cpi->common.subsampling_x, cpi->common.subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+            cpi->common.use_highbitdepth,
+#endif
+            VP9_ENC_BORDER_IN_PIXELS, cpi->common.byte_alignment,
+            NULL, NULL, NULL)) {
+      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                         "Failed to allocate empty frame for multiple frame "
+                         "contexts");
+    } else {
+      memset(svc->empty_frame.img.buffer_alloc, 0x80,
+             svc->empty_frame.img.buffer_alloc_sz);
+    }
+  }
+
+  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+    for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+      int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      RATE_CONTROL *const lrc = &lc->rc;
+      lc->current_video_frame_in_layer = 0;
+      lc->layer_size = 0;
+      lc->frames_from_key_frame = 0;
+      lc->last_frame_type = FRAME_TYPES;
+      lrc->ni_av_qi = oxcf->worst_allowed_q;
+      lrc->total_actual_bits = 0;
+      lrc->total_target_vs_actual = 0;
+      lrc->ni_tot_qi = 0;
+      lrc->tot_q = 0.0;
+      lrc->avg_q = 0.0;
+      lrc->ni_frames = 0;
+      lrc->decimation_count = 0;
+      lrc->decimation_factor = 0;
+
+      for (i = 0; i < RATE_FACTOR_LEVELS; ++i)
+        lrc->rate_correction_factors[i] = 1.0;
+
+      lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
+      if (oxcf->rc_mode == VPX_CBR) {
+        lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
+        lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
+        lrc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
+      } else {
+        lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
+        lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
+        lrc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q +
+                                            oxcf->best_allowed_q) / 2;
+        lrc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q +
+                                              oxcf->best_allowed_q) / 2;
+        // Layers with auto alt-ref enabled each claim the next index.
+        lc->alt_ref_idx =
+            oxcf->ss_enable_auto_arf[sl] ? alt_ref_idx++ : INVALID_IDX;
+        lc->gold_ref_idx = INVALID_IDX;
+      }
+
+      lrc->buffer_level = oxcf->starting_buffer_level_ms *
+                              lc->target_bandwidth / 1000;
+      lrc->bits_off_target = lrc->buffer_level;
+
+      // Initialize the cyclic refresh parameters. If spatial layers are used
+      // (i.e., ss_number_layers > 1), these need to be updated per spatial
+      // layer. Cyclic refresh is only applied on base temporal layer.
+      if (oxcf->ss_number_layers > 1 && tl == 0) {
+        lc->sb_index = 0;
+        lc->map = vpx_malloc(map_size);
+        lc->last_coded_q_map = vpx_malloc(map_size);
+        lc->consec_zero_mv = vpx_malloc(map_size);
+        if (lc->map && lc->last_coded_q_map && lc->consec_zero_mv) {
+          assert(MAXQ <= 255);
+          memset(lc->map, 0, map_size);
+          memset(lc->last_coded_q_map, MAXQ, map_size);
+          memset(lc->consec_zero_mv, 0, map_size);
+        } else {
+          // vpx_malloc() failed: report it instead of writing through NULL.
+          vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                             "Failed to allocate SVC cyclic refresh maps");
+        }
+      }
+    }
+  }
+
+  // Still have extra buffer for base layer golden frame
+  if (!(svc->number_temporal_layers > 1 && oxcf->rc_mode == VPX_CBR)
+      && alt_ref_idx < REF_FRAMES)
+    svc->layer_context[0].gold_ref_idx = alt_ref_idx;
+}
+
+// Copies the stream-level rate control settings in rc into layer context lc:
+// buffer levels are scaled by bitrate_alloc, the existing buffer state is
+// clamped to the new maximum and the per-frame bandwidth follows framerate.
+static void update_layer_rate_control(LAYER_CONTEXT *const lc,
+                                      const RATE_CONTROL *const rc,
+                                      float bitrate_alloc, double framerate) {
+  RATE_CONTROL *const lrc = &lc->rc;
+
+  // Update buffer-related quantities.
+  lrc->starting_buffer_level =
+      (int64_t)(rc->starting_buffer_level * bitrate_alloc);
+  lrc->optimal_buffer_level =
+      (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
+  lrc->maximum_buffer_size =
+      (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
+  lrc->bits_off_target =
+      VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+  lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size);
+  // Update framerate-related quantities.
+  lc->framerate = framerate;
+  lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+  lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
+  // Update qp-related quantities.
+  lrc->worst_quality = rc->worst_quality;
+  lrc->best_quality = rc->best_quality;
+}
+
+// Update the layer context from a change_config() call.
+// target_bandwidth is only read in the no-temporal-layering mode, where it
+// is the denominator of each layer's share of the buffer levels.
+void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
+                                            const int target_bandwidth) {
+  SVC *const svc = &cpi->svc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int sl, tl, layer;
+
+  if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
+    // One context per layer: temporal layers for multi-layer CBR, spatial
+    // layers otherwise.
+    const int temporal_cbr =
+        svc->number_temporal_layers > 1 && oxcf->rc_mode == VPX_CBR;
+    const int layer_end = temporal_cbr ? svc->number_temporal_layers
+                                       : svc->number_spatial_layers;
+
+    for (layer = 0; layer < layer_end; ++layer) {
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+
+      lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
+      update_layer_rate_control(
+          lc, rc, (float)lc->target_bandwidth / target_bandwidth,
+          temporal_cbr ? cpi->framerate / oxcf->ts_rate_decimator[layer]
+                       : cpi->framerate);
+    }
+    return;
+  }
+
+  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+    // Highest temporal layer of this spatial layer (0 if there are none).
+    const int top_tl =
+        (oxcf->ts_number_layers - 1) < 0 ? 0 : (oxcf->ts_number_layers - 1);
+    int spatial_layer_target;
+
+    for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+      layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
+      svc->layer_context[layer].target_bandwidth =
+          oxcf->layer_target_bitrate[layer];
+    }
+
+    // The top temporal layer's bitrate is the target of the spatial layer.
+    layer = LAYER_IDS_TO_IDX(sl, top_tl, oxcf->ts_number_layers);
+    spatial_layer_target =
+        svc->layer_context[layer].target_bandwidth =
+            oxcf->layer_target_bitrate[layer];
+
+    for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+      LAYER_CONTEXT *const lc =
+          &svc->layer_context[sl * oxcf->ts_number_layers + tl];
+
+      lc->spatial_layer_target_bandwidth = spatial_layer_target;
+      update_layer_rate_control(
+          lc, rc, (float)lc->target_bandwidth / spatial_layer_target,
+          cpi->framerate / oxcf->ts_rate_decimator[tl]);
+    }
+  }
+}
+
+// Returns the layer context for the layer currently selected in cpi->svc.
+static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  int idx = svc->spatial_layer_id;  // default: one context per spatial layer
+  if (is_one_pass_cbr_svc(cpi))
+    idx = idx * svc->number_temporal_layers + svc->temporal_layer_id;
+  else if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR)
+    idx = svc->temporal_layer_id;
+  return &svc->layer_context[idx];
+}
+
+// Recomputes the framerate and per-frame bandwidth figures of the current
+// layer (as selected by get_layer_context()) from cpi->framerate.
+void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const SVC *const svc = &cpi->svc;
+  const int tl = svc->temporal_layer_id;
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
+  RATE_CONTROL *const lrc = &lc->rc;
+
+  lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
+  lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+  lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
+
+  // Update the average layer frame size (non-cumulative per-frame-bw).
+  if (tl != 0) {
+    // Index of the previous temporal layer in the spatial+temporal arrays.
+    const int prev_idx =
+        svc->spatial_layer_id * svc->number_temporal_layers + tl - 1;
+    const double fps_below = cpi->framerate / oxcf->ts_rate_decimator[tl - 1];
+    const int bitrate_below = oxcf->layer_target_bitrate[prev_idx];
+    lc->avg_frame_size = (int)((lc->target_bandwidth - bitrate_below) /
+                               (lc->framerate - fps_below));
+  } else {
+    lc->avg_frame_size = lrc->avg_frame_bandwidth;
+  }
+}
+
+// Sets the current layer's framerate and rederives its per-frame bandwidths.
+void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
+  RATE_CONTROL *const lrc = &lc->rc;
+  lc->framerate = framerate;
+  lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / framerate);
+  lrc->min_frame_bandwidth =
+      (int)(lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
+  lrc->max_frame_bandwidth = (int)(
+      ((int64_t)lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) / 100);
+  vp9_rc_set_gf_interval_range(cpi, lrc);
+}
+
+// Loads the saved state of the current layer context into cpi.  With cyclic
+// refresh AQ on spatial SVC, also exchanges the refresh maps with the layer.
+void vp9_restore_layer_context(VP9_COMP *const cpi) {
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
+  const SVC *const svc = &cpi->svc;
+  // Stream-level key frame counters, captured before cpi->rc is overwritten.
+  const int stream_frames_since_key = cpi->rc.frames_since_key;
+  const int stream_frames_to_key = cpi->rc.frames_to_key;
+  cpi->rc = lc->rc;
+  cpi->twopass = lc->twopass;
+  cpi->oxcf.target_bandwidth = lc->target_bandwidth;
+  cpi->alt_ref_source = lc->alt_ref_source;
+  // Keep these counters defined for the stream (not the layer).
+  if (svc->number_temporal_layers > 1 ||
+      (svc->number_spatial_layers > 1 && !is_two_pass_svc(cpi))) {
+    cpi->rc.frames_since_key = stream_frames_since_key;
+    cpi->rc.frames_to_key = stream_frames_to_key;
+  }
+  // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+  // for the base temporal layer: swap the map pointers between cr and lc.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+      svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
+    CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+    signed char *const cr_map = cr->map;
+    uint8_t *const cr_q_map = cr->last_coded_q_map;
+    uint8_t *const cr_zero_mv = cr->consec_zero_mv;
+    cr->map = lc->map;
+    cr->last_coded_q_map = lc->last_coded_q_map;
+    cr->consec_zero_mv = lc->consec_zero_mv;
+    lc->map = cr_map;
+    lc->last_coded_q_map = cr_q_map;
+    lc->consec_zero_mv = cr_zero_mv;
+    cr->sb_index = lc->sb_index;
+  }
+}
+
+// Stores cpi's current rate control / two-pass state into the current layer
+// context; the counterpart of vp9_restore_layer_context().
+void vp9_save_layer_context(VP9_COMP *const cpi) {
+  LAYER_CONTEXT *const lc = get_layer_context(cpi);
+  const SVC *const svc = &cpi->svc;
+  lc->rc = cpi->rc;
+  lc->twopass = cpi->twopass;
+  lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth;
+  lc->alt_ref_source = cpi->alt_ref_source;
+
+  // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+  // for the base temporal layer: swap the map pointers between lc and cr.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+      svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
+    CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+    signed char *const lc_map = lc->map;
+    uint8_t *const lc_q_map = lc->last_coded_q_map;
+    uint8_t *const lc_zero_mv = lc->consec_zero_mv;
+    lc->map = cr->map;
+    lc->last_coded_q_map = cr->last_coded_q_map;
+    lc->consec_zero_mv = cr->consec_zero_mv;
+    cr->map = lc_map;
+    cr->last_coded_q_map = lc_q_map;
+    cr->consec_zero_mv = lc_zero_mv;
+    lc->sb_index = cr->sb_index;
+  }
+}
+
+// Runs vp9_init_second_pass() once per spatial layer, with that layer selected
+// in cpi->svc, and tags each layer's two-pass totals with its layer id.
+void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) {
+  SVC *const svc = &cpi->svc;
+  int sl;
+
+  for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    LAYER_CONTEXT *const lc = &svc->layer_context[sl];
+    svc->spatial_layer_id = sl;
+    vp9_init_second_pass(cpi);
+    lc->twopass.total_stats.spatial_layer_id = sl;
+    lc->twopass.total_left_stats.spatial_layer_id = sl;
+  }
+  svc->spatial_layer_id = 0;  // leave the base layer selected
+}
+
+// Increments the per-spatial-layer frame counters and the superframe count.
+void vp9_inc_frame_in_layer(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  const int base = svc->spatial_layer_id * svc->number_temporal_layers;
+  svc->layer_context[base].current_video_frame_in_layer += 1;
+  svc->layer_context[base].frames_from_key_frame += 1;
+  if (svc->spatial_layer_id == svc->number_spatial_layers - 1)  // top layer
+    svc->current_superframe += 1;
+}
+
+// Returns 1 when, in two-pass SVC, a non-base spatial layer is a key frame.
+int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
+  const SVC *const svc = &cpi->svc;
+  const int sl = svc->spatial_layer_id, tl = svc->temporal_layer_id;
+  if (!is_two_pass_svc(cpi) || sl <= 0) return 0;
+  return svc->layer_context[sl * svc->number_temporal_layers + tl].is_key_frame != 0;
+}
+
+// Scales width_org x height_org by num / den using integer arithmetic and
+// moves an odd result one step away from zero so that both outputs are
+// even.
+// The outputs are left untouched when width_out or height_out is NULL or
+// den is 0.
+static void get_layer_resolution(const int width_org, const int height_org,
+                                 const int num, const int den,
+                                 int *width_out, int *height_out) {
+  if (width_out != NULL && height_out != NULL && den != 0) {
+    const int scaled_w = width_org * num / den;
+    const int scaled_h = height_org * num / den;
+
+    // make height and width even to make chrome player happy
+    // (x % 2 is -1, 0 or 1, so x + x % 2 is always even)
+    *width_out = scaled_w + scaled_w % 2;
+    *height_out = scaled_h + scaled_h % 2;
+  }
+}
+
+// The function sets proper ref_frame_flags, buffer indices, and buffer update
+// variables for temporal layering mode 3 - that does 0-2-1-2 temporal layering
+// scheme.
+static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) {
+  int frame_num_within_temporal_struct = 0;
+  int spatial_id, temporal_id;
+  spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
+  frame_num_within_temporal_struct =
+      cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+      cpi->svc.number_temporal_layers].current_video_frame_in_layer % 4;  // position in the 4-frame pattern
+  temporal_id = cpi->svc.temporal_layer_id =
+      (frame_num_within_temporal_struct & 1) ? 2 :
+      (frame_num_within_temporal_struct >> 1);  // positions 0,1,2,3 -> layers 0,2,1,2
+  cpi->ext_refresh_last_frame = cpi->ext_refresh_golden_frame =
+      cpi->ext_refresh_alt_ref_frame = 0;  // clear all refresh flags first
+  if (!temporal_id) {  // temporal layer 0: refreshes LAST unless overridden below
+    cpi->ext_refresh_frame_flags_pending = 1;
+    cpi->ext_refresh_last_frame = 1;
+    if (!spatial_id) {  // base spatial layer
+      cpi->ref_frame_flags = VP9_LAST_FLAG;
+    } else if (cpi->svc.layer_context[temporal_id].is_key_frame) {  // temporal_id is 0 here
+      // base layer is a key frame.
+      cpi->ref_frame_flags = VP9_LAST_FLAG;
+      cpi->ext_refresh_last_frame = 0;
+      cpi->ext_refresh_golden_frame = 1;
+    } else {
+      cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+    }
+  } else if (temporal_id == 1) {  // temporal layer 1: refreshes ALT_REF
+    cpi->ext_refresh_frame_flags_pending = 1;
+    cpi->ext_refresh_alt_ref_frame = 1;
+    if (!spatial_id) {
+      cpi->ref_frame_flags = VP9_LAST_FLAG;
+    } else {
+      cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+    }
+  } else {  // temporal layer 2
+    if (frame_num_within_temporal_struct == 1) {
+      // the first tl2 picture
+      if (spatial_id == cpi->svc.number_spatial_layers - 1) {  // top layer
+        cpi->ext_refresh_frame_flags_pending = 1;  // no refresh flag is set in this branch
+        if (!spatial_id)
+          cpi->ref_frame_flags = VP9_LAST_FLAG;
+        else
+          cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+      } else if (!spatial_id) {  // base layer (and not the top layer)
+        cpi->ext_refresh_frame_flags_pending = 1;
+        cpi->ext_refresh_alt_ref_frame = 1;
+        cpi->ref_frame_flags = VP9_LAST_FLAG;
+      } else if (spatial_id < cpi->svc.number_spatial_layers - 1) {  // intermediate layer
+        cpi->ext_refresh_frame_flags_pending = 1;
+        cpi->ext_refresh_alt_ref_frame = 1;
+        cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+      }
+    } else {
+      //  The second tl2 picture
+      if (spatial_id == cpi->svc.number_spatial_layers - 1) {  // top layer
+        cpi->ext_refresh_frame_flags_pending = 1;  // no refresh flag is set in this branch
+        if (!spatial_id)
+        cpi->ref_frame_flags = VP9_LAST_FLAG;
+        else
+          cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+      } else if (!spatial_id) {  // base layer (and not the top layer)
+        cpi->ext_refresh_frame_flags_pending = 1;
+        cpi->ref_frame_flags = VP9_LAST_FLAG;
+        cpi->ext_refresh_alt_ref_frame = 1;
+      } else {  // intermediate layer (neither base nor top)
+        cpi->ext_refresh_frame_flags_pending = 1;
+        cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+        cpi->ext_refresh_alt_ref_frame = 1;
+      }
+    }
+  }
+  if (temporal_id == 0) {  // buffer indices, chosen per temporal layer
+    cpi->lst_fb_idx = spatial_id;
+    if (spatial_id) {
+      if (cpi->svc.layer_context[temporal_id].is_key_frame) {  // base layer is a key frame
+        cpi->lst_fb_idx = spatial_id - 1;
+        cpi->gld_fb_idx = spatial_id;
+      } else {
+      cpi->gld_fb_idx = spatial_id - 1;
+      }
+    } else {
+      cpi->gld_fb_idx = 0;
+    }
+    cpi->alt_fb_idx = 0;
+  } else if (temporal_id == 1) {
+    cpi->lst_fb_idx = spatial_id;
+    cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
+    cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
+  } else if (frame_num_within_temporal_struct == 1) {  // first tl2 picture
+    cpi->lst_fb_idx = spatial_id;
+    cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
+    cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
+  } else {  // second tl2 picture: LAST also uses the number_spatial_layers + spatial_id slot
+    cpi->lst_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
+    cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
+    cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
+  }
+}
+
+// The function sets proper ref_frame_flags, buffer indices, and buffer update
+// variables for temporal layering mode 2 - that does 0-1-0-1 temporal layering
+// scheme.  It also makes spatial_layer_to_encode the current spatial layer
+// and derives the temporal layer id from that layer's frame count.
+static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  const int num_sl = svc->number_spatial_layers;
+  int spatial_id, temporal_id;
+
+  spatial_id = svc->spatial_layer_id = svc->spatial_layer_to_encode;
+  // Even frames of the layer are temporal layer 0, odd frames layer 1.
+  temporal_id = svc->temporal_layer_id =
+      svc->layer_context[spatial_id * svc->number_temporal_layers]
+          .current_video_frame_in_layer & 1;
+
+  // temporal_id is 0 or 1 here, so one of the two refresh flags is always set:
+  // temporal layer 0 refreshes LAST, temporal layer 1 refreshes ALT_REF.
+  cpi->ext_refresh_frame_flags_pending = 1;
+  cpi->ext_refresh_last_frame = (temporal_id == 0);
+  cpi->ext_refresh_golden_frame = 0;
+  cpi->ext_refresh_alt_ref_frame = (temporal_id == 1);
+
+  if (temporal_id == 0) {
+    cpi->alt_fb_idx = 0;
+    if (spatial_id == 0) {
+      // Base spatial layer: LAST only.
+      cpi->ref_frame_flags = VP9_LAST_FLAG;
+      cpi->lst_fb_idx = 0;
+      cpi->gld_fb_idx = 0;
+    } else if (svc->layer_context[0].is_key_frame) {
+      // base layer is a key frame: LAST is the buffer at spatial_id - 1 and
+      // the buffer at spatial_id is refreshed as GOLDEN instead of LAST.
+      cpi->ref_frame_flags = VP9_LAST_FLAG;
+      cpi->ext_refresh_last_frame = 0;
+      cpi->ext_refresh_golden_frame = 1;
+      cpi->lst_fb_idx = spatial_id - 1;
+      cpi->gld_fb_idx = spatial_id;
+    } else {
+      // Upper spatial layer: GOLDEN is the buffer at index spatial_id - 1.
+      cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+      cpi->lst_fb_idx = spatial_id;
+      cpi->gld_fb_idx = spatial_id - 1;
+    }
+  } else {
+    // temporal_id == 1: ALT_REF (refreshed) is the buffer at index
+    // number_spatial_layers + spatial_id, GOLDEN the one just below it.
+    cpi->ref_frame_flags =
+        spatial_id ? (VP9_LAST_FLAG | VP9_GOLD_FLAG) : VP9_LAST_FLAG;
+    cpi->lst_fb_idx = spatial_id;
+    cpi->gld_fb_idx = num_sl + spatial_id - 1;
+    cpi->alt_fb_idx = num_sl + spatial_id;
+  }
+}
+
+// The function sets proper ref_frame_flags, buffer indices, and buffer update
+// variables for temporal layering mode 0 - that has no temporal layering.
+// It also makes spatial_layer_to_encode the current spatial layer.
+static void set_flags_and_fb_idx_for_temporal_mode_noLayering(
+    VP9_COMP *const cpi) {
+  const int spatial_id = cpi->svc.spatial_layer_to_encode;
+  cpi->svc.spatial_layer_id = spatial_id;
+  // Defaults: refresh LAST only; the key-frame case below overrides this.
+  cpi->ext_refresh_frame_flags_pending = 1;
+  cpi->ext_refresh_last_frame = 1;
+  cpi->ext_refresh_golden_frame = 0;
+  cpi->ext_refresh_alt_ref_frame = 0;
+  if (spatial_id == 0) {
+    // Base spatial layer: predict from LAST only.
+    cpi->ref_frame_flags = VP9_LAST_FLAG;
+    cpi->lst_fb_idx = 0;
+    cpi->gld_fb_idx = 0;
+  } else if (cpi->svc.layer_context[0].is_key_frame) {
+    // Base layer is a key frame: LAST is the buffer at spatial_id - 1 and
+    // the buffer at spatial_id is refreshed as GOLDEN instead of LAST.
+    cpi->ref_frame_flags = VP9_LAST_FLAG;
+    cpi->ext_refresh_last_frame = 0;
+    cpi->ext_refresh_golden_frame = 1;
+    cpi->lst_fb_idx = spatial_id - 1;
+    cpi->gld_fb_idx = spatial_id;
+  } else {
+    cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+    cpi->lst_fb_idx = spatial_id;
+    cpi->gld_fb_idx = spatial_id - 1;
+  }
+}
+
+// Prepares cpi for the next spatial layer in one-pass CBR SVC: sets reference
+// flags and buffer indices for the temporal layering mode, the layer's
+// quantizer limits and its resolution.  Returns 0, or VPX_CODEC_INVALID_PARAM
+// if vp9_set_size_literal() rejects the layer size.
+int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  LAYER_CONTEXT *lc;
+  int width = 0, height = 0;
+
+  if (svc->number_spatial_layers > 1) svc->use_base_mv = 1;
+  svc->force_zero_mode_spatial_ref = 1;
+
+  switch (svc->temporal_layering_mode) {
+    case VP9E_TEMPORAL_LAYERING_MODE_0212:
+      set_flags_and_fb_idx_for_temporal_mode3(cpi);
+      break;
+    case VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING:
+      set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi);
+      break;
+    case VP9E_TEMPORAL_LAYERING_MODE_0101:
+      set_flags_and_fb_idx_for_temporal_mode2(cpi);
+      break;
+    case VP9E_TEMPORAL_LAYERING_MODE_BYPASS:
+      // In the BYPASS/flexible mode, the encoder is relying on the
+      // application to specify, for each spatial layer, the flags and buffer
+      // indices for the layering.  The pending check supports frame flags
+      // passed in via vpx_codec_encode() (temporal-only svc case).
+      // TODO(marpan): Consider adding an enc_config parameter to better
+      // handle this case.
+      if (cpi->ext_refresh_frame_flags_pending == 0) {
+        const int sl = svc->spatial_layer_id = svc->spatial_layer_to_encode;
+        vp9_apply_encoding_flags(cpi, svc->ext_frame_flags[sl]);
+        cpi->lst_fb_idx = svc->ext_lst_fb_idx[sl];
+        cpi->gld_fb_idx = svc->ext_gld_fb_idx[sl];
+        cpi->alt_fb_idx = svc->ext_alt_fb_idx[sl];
+      }
+      break;
+    default: break;
+  }
+
+  if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode)
+    svc->rc_drop_superframe = 0;
+
+  lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
+                           svc->temporal_layer_id];
+
+  // Setting the worst/best_quality via the encoder control: SET_SVC_PARAMETERS,
+  // only for non-BYPASS mode for now.
+  if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+    lc->rc.worst_quality = vp9_quantizer_to_qindex(lc->max_q);
+    lc->rc.best_quality = vp9_quantizer_to_qindex(lc->min_q);
+  }
+
+  get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
+                       lc->scaling_factor_num, lc->scaling_factor_den,
+                       &width, &height);
+  if (vp9_set_size_literal(cpi, width, height) != 0)
+    return VPX_CODEC_INVALID_PARAM;
+  return 0;
+}
+
+#if CONFIG_SPATIAL_SVC
+#define SMALL_FRAME_FB_IDX 7
+
+// Prepares cpi for encoding the next spatial layer of a spatial SVC frame:
+// derives the temporal layer id, picks reference buffers and flags,
+// optionally schedules the small "empty" frame, and applies the layer's
+// quantizer limits and resolution.
+// Returns 0 on success or VPX_CODEC_INVALID_PARAM if the size is rejected.
+int vp9_svc_start_frame(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  int width = 0, height = 0;
+  LAYER_CONTEXT *lc;
+  struct lookahead_entry *buf;
+  int count = 1 << (svc->number_temporal_layers - 1);
+
+  svc->spatial_layer_id = svc->spatial_layer_to_encode;
+  lc = &svc->layer_context[svc->spatial_layer_id];
+
+  // Halve count until it divides the layer's frame number; each halving moves
+  // the frame one temporal layer up.
+  svc->temporal_layer_id = 0;
+  while ((lc->current_video_frame_in_layer % count) != 0) {
+    ++svc->temporal_layer_id;
+    count >>= 1;
+  }
+
+  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
+  cpi->lst_fb_idx = svc->spatial_layer_id;
+
+  if (svc->spatial_layer_id == 0)
+    cpi->gld_fb_idx = (lc->gold_ref_idx >= 0) ?
+                      lc->gold_ref_idx : cpi->lst_fb_idx;
+  else
+    cpi->gld_fb_idx = svc->spatial_layer_id - 1;
+
+  if (lc->current_video_frame_in_layer == 0) {
+    if (svc->spatial_layer_id >= 2) {
+      cpi->alt_fb_idx = svc->spatial_layer_id - 2;
+    } else {
+      cpi->alt_fb_idx = cpi->lst_fb_idx;
+      cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG);
+    }
+  } else if (cpi->oxcf.ss_enable_auto_arf[svc->spatial_layer_id]) {
+    cpi->alt_fb_idx = lc->alt_ref_idx;
+    if (!lc->has_alt_frame)
+      cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
+  } else if (svc->spatial_layer_id == 0) {
+    // Find a proper alt_fb_idx for layers that don't have alt ref frame
+    cpi->alt_fb_idx = cpi->lst_fb_idx;
+  } else {
+    LAYER_CONTEXT *lc_lower = &svc->layer_context[svc->spatial_layer_id - 1];
+
+    if (cpi->oxcf.ss_enable_auto_arf[svc->spatial_layer_id - 1] &&
+        lc_lower->alt_ref_source != NULL)
+      cpi->alt_fb_idx = lc_lower->alt_ref_idx;
+    else if (svc->spatial_layer_id >= 2)
+      cpi->alt_fb_idx = svc->spatial_layer_id - 2;
+    else
+      cpi->alt_fb_idx = cpi->lst_fb_idx;
+  }
+
+  get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
+                       lc->scaling_factor_num, lc->scaling_factor_den,
+                       &width, &height);
+
+  // Workaround for multiple frame contexts. In some frames we can't use prev_mi
+  // since its previous frame could be changed during decoding time. The idea is
+  // we put an empty invisible frame in front of them, then we will not use
+  // prev_mi when encoding these frames.
+
+  buf = vp9_lookahead_peek(cpi->lookahead, 0);
+  // The empty frame takes its timestamps from the front lookahead entry, so
+  // that entry must exist and must not be a forced key frame.
+  if (buf != NULL &&
+      cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 &&
+      svc->encode_empty_frame_state == NEED_TO_ENCODE &&
+      lc->rc.frames_to_key != 0 &&
+      !(buf->flags & VPX_EFLAG_FORCE_KF) &&
+      ((svc->number_temporal_layers > 1 &&
+        svc->temporal_layer_id < svc->number_temporal_layers - 1) ||
+       (svc->number_spatial_layers > 1 && svc->spatial_layer_id == 0))) {
+    svc->empty_frame.ts_start = buf->ts_start;
+    svc->empty_frame.ts_end = buf->ts_end;
+    svc->encode_empty_frame_state = ENCODING;
+    cpi->common.show_frame = 0;
+    cpi->ref_frame_flags = 0;
+    cpi->common.frame_type = INTER_FRAME;
+    cpi->lst_fb_idx =
+        cpi->gld_fb_idx = cpi->alt_fb_idx = SMALL_FRAME_FB_IDX;
+
+    if (svc->encode_intra_empty_frame != 0)
+      cpi->common.intra_only = 1;
+
+    width = SMALL_FRAME_WIDTH;
+    height = SMALL_FRAME_HEIGHT;
+  }
+
+  cpi->oxcf.worst_allowed_q = vp9_quantizer_to_qindex(lc->max_q);
+  cpi->oxcf.best_allowed_q = vp9_quantizer_to_qindex(lc->min_q);
+
+  vp9_change_config(cpi, &cpi->oxcf);
+
+  if (vp9_set_size_literal(cpi, width, height) != 0)
+    return VPX_CODEC_INVALID_PARAM;
+
+  vp9_set_high_precision_mv(cpi, 1);
+
+  cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source;
+
+  return 0;
+}
+
+#undef SMALL_FRAME_FB_IDX
+#endif  // CONFIG_SPATIAL_SVC
+
+// Peeks the front lookahead entry when the queue is ready or draining (NULL
+// otherwise); the entry is only popped for the highest spatial layer.
+struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi,
+                                              struct lookahead_ctx *ctx,
+                                              int drain) {
+  struct lookahead_entry *front;
+  if (!ctx->sz || !(drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES))
+    return NULL;
+  front = vp9_lookahead_peek(ctx, 0);
+  // Only remove the buffer when pop the highest layer.
+  if (front != NULL &&
+      cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+    vp9_lookahead_pop(ctx, drain);
+  return front;
+}
+
+// Frees the per-layer cyclic refresh maps and clears the pointers so that a
+// repeated call (or a later NULL check) cannot touch freed memory.
+void vp9_free_svc_cyclic_refresh(VP9_COMP *const cpi) {
+  int sl, tl;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+    for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+      int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
+      LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer];
+      if (lc->map) vpx_free(lc->map);
+      if (lc->last_coded_q_map) vpx_free(lc->last_coded_q_map);
+      if (lc->consec_zero_mv) vpx_free(lc->consec_zero_mv);
+      lc->map = NULL;
+      lc->last_coded_q_map = lc->consec_zero_mv = NULL;
+    }
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h
new file mode 100644
index 0000000000..4e186401fe
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h
@@ -0,0 +1,141 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
+#define VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
+
+#include "vpx/vpx_encoder.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {  // Per-layer state; stored in SVC::layer_context[].
+  RATE_CONTROL rc;
+  int target_bandwidth;
+  int spatial_layer_target_bandwidth;  // Target for the spatial layer.
+  double framerate;
+  int avg_frame_size;
+  int max_q;  // Fed to vp9_quantizer_to_qindex() for oxcf.worst_allowed_q.
+  int min_q;  // Fed to vp9_quantizer_to_qindex() for oxcf.best_allowed_q.
+  int scaling_factor_num;
+  int scaling_factor_den;
+  TWO_PASS twopass;
+  vpx_fixed_buf_t rc_twopass_stats_in;
+  unsigned int current_video_frame_in_layer;
+  int is_key_frame;
+  int frames_from_key_frame;
+  FRAME_TYPE last_frame_type;
+  struct lookahead_entry  *alt_ref_source;  // Loaded into cpi->alt_ref_source.
+  int alt_ref_idx;
+  int gold_ref_idx;
+  int has_alt_frame;
+  size_t layer_size;
+  struct vpx_psnr_pkt psnr_pkt;
+  // Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame.
+  int sb_index;
+  signed char *map;           // Freed by vp9_free_svc_cyclic_refresh().
+  uint8_t *last_coded_q_map;  // Freed by vp9_free_svc_cyclic_refresh().
+  uint8_t *consec_zero_mv;    // Freed by vp9_free_svc_cyclic_refresh().
+} LAYER_CONTEXT;
+
+typedef struct {
+  int spatial_layer_id;  // Equals number_spatial_layers - 1 on the top layer.
+  int temporal_layer_id;
+  int number_spatial_layers;
+  int number_temporal_layers;
+
+  int spatial_layer_to_encode;
+  int first_spatial_layer_to_encode;
+  int rc_drop_superframe;
+
+  // Workaround for multiple frame contexts
+  enum {
+    ENCODED = 0,
+    ENCODING,
+    NEED_TO_ENCODE
+  }encode_empty_frame_state;
+  struct lookahead_entry empty_frame;
+  int encode_intra_empty_frame;
+
+  // Store scaled source frames to be used for temporal filter to generate
+  // an alt ref frame.
+  YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS];
+
+  // Layer context used for rate control in one pass temporal CBR mode or
+  // two pass spatial mode. Indexed with LAYER_IDS_TO_IDX(sl, tl, num_tl).
+  LAYER_CONTEXT layer_context[VPX_MAX_LAYERS];
+  // Indicates what sort of temporal layering is used.
+  // Currently, this only works for CBR mode.
+  VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
+  // Frame flags and buffer indexes for each spatial layer, set by the
+  // application (external settings).
+  int ext_frame_flags[VPX_MAX_LAYERS];
+  int ext_lst_fb_idx[VPX_MAX_LAYERS];
+  int ext_gld_fb_idx[VPX_MAX_LAYERS];
+  int ext_alt_fb_idx[VPX_MAX_LAYERS];
+  int ref_frame_index[REF_FRAMES];
+  int force_zero_mode_spatial_ref;
+  int current_superframe;
+  int use_base_mv;
+} SVC;
+
+struct VP9_COMP;
+
+// Initialize layer context data from init_config().
+void vp9_init_layer_context(struct VP9_COMP *const cpi);
+
+// Update the layer context from a change_config() call.
+void vp9_update_layer_context_change_config(struct VP9_COMP *const cpi,
+                                            const int target_bandwidth);
+
+// Prior to encoding the frame, update framerate-related quantities
+// for the current temporal layer.
+void vp9_update_temporal_layer_framerate(struct VP9_COMP *const cpi);
+
+// Update framerate-related quantities for the current spatial layer.
+void vp9_update_spatial_layer_framerate(struct VP9_COMP *const cpi,
+                                        double framerate);
+
+// Prior to encoding the frame, set the layer context, for the current layer
+// to be encoded, to the cpi struct.
+void vp9_restore_layer_context(struct VP9_COMP *const cpi);
+
+// Save the layer context after encoding the frame.
+void vp9_save_layer_context(struct VP9_COMP *const cpi);
+
+// Initialize second pass rc for spatial svc.
+void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi);
+
+// Increment number of video frames in layer
+void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi);
+
+// Check if current layer is key frame in spatial upper layer
+int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi);
+
+// Get the next source buffer to encode
+struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi,
+                                              struct lookahead_ctx *ctx,
+                                              int drain);
+
+// Start a frame and initialize svc parameters
+int vp9_svc_start_frame(struct VP9_COMP *const cpi);
+
+int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi);
+
+void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_temporal_filter.c b/libs/libvpx/vp9/encoder/vp9_temporal_filter.c
new file mode 100644
index 0000000000..82f566b132
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -0,0 +1,797 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <limits.h>
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/vpx_scale.h"
+
+static int fixed_divide[512];  // [i] = 0x80000 / i; see vp9_temporal_filter_init().
+
+static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
+                                            uint8_t *y_mb_ptr,
+                                            uint8_t *u_mb_ptr,
+                                            uint8_t *v_mb_ptr,
+                                            int stride,
+                                            int uv_block_width,
+                                            int uv_block_height,
+                                            int mv_row,
+                                            int mv_col,
+                                            uint8_t *pred,
+                                            struct scale_factors *scale,
+                                            int x, int y) {
+  const int which_mv = 0;
+  const MV mv = { mv_row, mv_col };
+  const InterpKernel *const kernel =
+    vp9_filter_kernels[xd->mi[0]->interp_filter];
+  // Builds the Y predictor at pred[0] and the U/V ones at pred[256]/pred[512].
+  enum mv_precision mv_precision_uv;
+  int uv_stride;
+  if (uv_block_width == 8) {  // Half-width chroma: halved stride, Q4 mv.
+    uv_stride = (stride + 1) >> 1;
+    mv_precision_uv = MV_PRECISION_Q4;
+  } else {  // Otherwise chroma uses the luma stride and Q3 mv precision.
+    uv_stride = stride;
+    mv_precision_uv = MV_PRECISION_Q3;
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_highbd_build_inter_predictor(y_mb_ptr, stride,
+                                     &pred[0], 16,
+                                     &mv,
+                                     scale,
+                                     16, 16,
+                                     which_mv,
+                                     kernel, MV_PRECISION_Q3, x, y, xd->bd);
+
+    vp9_highbd_build_inter_predictor(u_mb_ptr, uv_stride,
+                                     &pred[256], uv_block_width,
+                                     &mv,
+                                     scale,
+                                     uv_block_width, uv_block_height,
+                                     which_mv,
+                                     kernel, mv_precision_uv, x, y, xd->bd);
+
+    vp9_highbd_build_inter_predictor(v_mb_ptr, uv_stride,
+                                     &pred[512], uv_block_width,
+                                     &mv,
+                                     scale,
+                                     uv_block_width, uv_block_height,
+                                     which_mv,
+                                     kernel, mv_precision_uv, x, y, xd->bd);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  vp9_build_inter_predictor(y_mb_ptr, stride,
+                            &pred[0], 16,
+                            &mv,
+                            scale,
+                            16, 16,
+                            which_mv,
+                            kernel, MV_PRECISION_Q3, x, y);
+
+  vp9_build_inter_predictor(u_mb_ptr, uv_stride,
+                            &pred[256], uv_block_width,
+                            &mv,
+                            scale,
+                            uv_block_width, uv_block_height,
+                            which_mv,
+                            kernel, mv_precision_uv, x, y);
+
+  vp9_build_inter_predictor(v_mb_ptr, uv_stride,
+                            &pred[512], uv_block_width,
+                            &mv,
+                            scale,
+                            uv_block_width, uv_block_height,
+                            which_mv,
+                            kernel, mv_precision_uv, x, y);
+}
+
+void vp9_temporal_filter_init(void) {
+  // fixed_divide[n] = 0x80000 / n (i.e. (1 << 19) / n) for n in [1, 511];
+  // entry 0 is defined as 0 since the quotient does not exist.
+  int n;
+  fixed_divide[0] = 0;
+  for (n = 511; n >= 1; --n) fixed_divide[n] = 0x80000 / n;
+}
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1,  // Rows `stride` apart.
+                                 unsigned int stride,
+                                 uint8_t *frame2,  // Rows block_width apart.
+                                 unsigned int block_width,
+                                 unsigned int block_height,
+                                 int strength,
+                                 int filter_weight,
+                                 unsigned int *accumulator,  // += w * frame2.
+                                 uint16_t *count) {          // += w.
+  unsigned int i, j, k;
+  int modifier;
+  int byte = 0;  // Offset of the current pixel within frame1.
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+  for (i = 0, k = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++, k++) {
+      int pixel_value = *frame2;
+
+      // non-local mean approach
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
+      // Squared differences over the 3x3 neighbours that lie inside the block.
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+
+      assert(index > 0);
+      // modifier = 3 * (sum of squared differences) / (neighbours used).
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
+      modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
+
+      modifier  += rounding;
+      modifier >>= strength;
+      // Clamp to 16 and invert: a larger difference gives a smaller weight.
+      if (modifier > 16)
+        modifier = 16;
+
+      modifier = 16 - modifier;
+      modifier *= filter_weight;
+
+      count[k] += modifier;
+      accumulator[k] += modifier * pixel_value;
+
+      byte++;
+    }
+
+    byte += stride - block_width;  // Move to the start of the next frame1 row.
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1_8,
+                                        unsigned int stride,
+                                        uint8_t *frame2_8,
+                                        unsigned int block_width,
+                                        unsigned int block_height,
+                                        int strength,
+                                        int filter_weight,
+                                        unsigned int *accumulator,
+                                        uint16_t *count) {
+  uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);  // Rows `stride` apart.
+  uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);  // Rows block_width apart.
+  unsigned int i, j, k;
+  int modifier;
+  int byte = 0;  // Offset (in samples) of the current pixel within frame1.
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+  for (i = 0, k = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++, k++) {
+      int pixel_value = *frame2;
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
+      // Squared differences over the 3x3 neighbours that lie inside the block.
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+      assert(index > 0);
+      // modifier = 3 * (sum of squared differences) / (neighbours used).
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
+      modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
+      modifier += rounding;
+      modifier >>= strength;
+      // Clamp to 16 and invert: a larger difference gives a smaller weight.
+      if (modifier > 16)
+        modifier = 16;
+
+      modifier = 16 - modifier;
+      modifier *= filter_weight;
+
+      count[k] += modifier;             // Sum of weights for this pixel.
+      accumulator[k] += modifier * pixel_value;  // Weighted sum of frame2.
+
+      byte++;
+    }
+
+    byte += stride - block_width;  // Move to the start of the next frame1 row.
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
+                                              uint8_t *arf_frame_buf,  // src
+                                              uint8_t *frame_ptr_buf,  // pre
+                                              int stride) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  const SEARCH_METHODS old_search_method = mv_sf->search_method;
+  int step_param;
+  int sadpb = x->sadperbit16;
+  int bestsme = INT_MAX;
+  int distortion;
+  unsigned int sse;
+  int cost_list[5];
+
+  MV best_ref_mv1 = {0, 0};
+  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+  // NOTE(review): presumably receives the search result; the caller reads it.
+  MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv;
+
+  // Save input state
+  struct buf_2d src = x->plane[0].src;
+  struct buf_2d pre = xd->plane[0].pre[0];
+
+  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+  // Setup frame pointers
+  x->plane[0].src.buf = arf_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = frame_ptr_buf;
+  xd->plane[0].pre[0].stride = stride;
+
+  step_param = mv_sf->reduce_first_step_size;
+  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+  mv_sf->search_method = HEX;  // Forced for this search only; restored below.
+  vp9_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param,
+                        sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1,
+                        ref_mv, 0, 0);
+  mv_sf->search_method = old_search_method;
+
+  // Ignore mv costing by sending NULL pointer instead of cost array
+  bestsme = cpi->find_fractional_mv_step(x, ref_mv,
+                                         &best_ref_mv1,
+                                         cpi->common.allow_high_precision_mv,
+                                         x->errorperbit,
+                                         &cpi->fn_ptr[BLOCK_16X16],
+                                         0, mv_sf->subpel_iters_per_step,
+                                         cond_cost_list(cpi, cost_list),
+                                         NULL, NULL,
+                                         &distortion, &sse, NULL, 0, 0);
+
+  // Restore input state
+  x->plane[0].src = src;
+  xd->plane[0].pre[0] = pre;
+
+  return bestsme;  // Value returned by the fractional mv search.
+}
+
+static void temporal_filter_iterate_c(VP9_COMP *cpi,
+                                      YV12_BUFFER_CONFIG **frames,
+                                      int frame_count,
+                                      int alt_ref_index,
+                                      int strength,
+                                      struct scale_factors *scale) {
+  int byte;
+  int frame;
+  int mb_col, mb_row;
+  unsigned int filter_weight;
+  int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4;
+  int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
+  int mb_y_offset = 0;
+  int mb_uv_offset = 0;
+  DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]);
+  DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]);
+  MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
+  YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
+  uint8_t *dst1, *dst2;
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t,  predictor16[16 * 16 * 3]);
+  DECLARE_ALIGNED(16, uint8_t,  predictor8[16 * 16 * 3]);
+  uint8_t *predictor;
+#else
+  DECLARE_ALIGNED(16, uint8_t,  predictor[16 * 16 * 3]);
+#endif
+  const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
+  const int mb_uv_width  = 16 >> mbd->plane[1].subsampling_x;
+
+  // Save input state
+  uint8_t* input_buffer[MAX_MB_PLANE];
+  int i;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    predictor = CONVERT_TO_BYTEPTR(predictor16);
+  } else {
+    predictor = predictor8;
+  }
+#endif
+
+  for (i = 0; i < MAX_MB_PLANE; i++)
+    input_buffer[i] = mbd->plane[i].pre[0].buf;
+
+  for (mb_row = 0; mb_row < mb_rows; mb_row++) {
+    // Source frames are extended to 16 pixels. This is different than
+    //  L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS)
+    // A 6/8 tap filter is used for motion search.  This requires 2 pixels
+    //  before and 3 pixels after.  So the largest Y mv on a border would
+    //  then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the
+    //  Y and therefore only extended by 8.  The largest mv that a UV block
+    //  can support is 8 - VP9_INTERP_EXTEND.  A UV mv is half of a Y mv.
+    //  (16 - VP9_INTERP_EXTEND) >> 1 which is greater than
+    //  8 - VP9_INTERP_EXTEND.
+    // To keep the mv in play for both Y and UV planes the max that it
+    //  can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1).
+    cpi->td.mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND));
+    cpi->td.mb.mv_row_max = ((mb_rows - 1 - mb_row) * 16)
+                         + (17 - 2 * VP9_INTERP_EXTEND);
+
+    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
+      int i, j, k;
+      int stride;
+      // Reset the per-macroblock sums (Y at 0, U at 256, V at 512).
+      memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
+      memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
+
+      cpi->td.mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
+      cpi->td.mb.mv_col_max = ((mb_cols - 1 - mb_col) * 16)
+                           + (17 - 2 * VP9_INTERP_EXTEND);
+
+      for (frame = 0; frame < frame_count; frame++) {
+        const int thresh_low  = 10000;
+        const int thresh_high = 20000;
+
+        if (frames[frame] == NULL)
+          continue;
+        // Reset the stored motion vector for this macroblock.
+        mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
+        mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0;
+
+        if (frame == alt_ref_index) {
+          filter_weight = 2;
+        } else {
+          // Find best match in this frame by MC
+          int err = temporal_filter_find_matching_mb_c(cpi,
+              frames[alt_ref_index]->y_buffer + mb_y_offset,
+              frames[frame]->y_buffer + mb_y_offset,
+              frames[frame]->y_stride);
+
+          // Assign higher weight to matching MB if its error
+          // score is lower. If not applying MC default behavior
+          // is to weight all MBs equal.
+          filter_weight = err < thresh_low
+                          ? 2 : err < thresh_high ? 1 : 0;
+        }
+        // A weight of 0 leaves this frame's block out of the sums.
+        if (filter_weight != 0) {
+          // Construct the predictors
+          temporal_filter_predictors_mb_c(mbd,
+              frames[frame]->y_buffer + mb_y_offset,
+              frames[frame]->u_buffer + mb_uv_offset,
+              frames[frame]->v_buffer + mb_uv_offset,
+              frames[frame]->y_stride,
+              mb_uv_width, mb_uv_height,
+              mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
+              mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
+              predictor, scale,
+              mb_col * 16, mb_row * 16);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            int adj_strength = strength + 2 * (mbd->bd - 8);
+            // Apply the filter (YUV)
+            vp9_highbd_temporal_filter_apply_c(f->y_buffer + mb_y_offset,
+                                               f->y_stride,
+                                               predictor, 16, 16, adj_strength,
+                                               filter_weight,
+                                               accumulator, count);
+            vp9_highbd_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                               f->uv_stride, predictor + 256,
+                                               mb_uv_width, mb_uv_height,
+                                               adj_strength,
+                                               filter_weight, accumulator + 256,
+                                               count + 256);
+            vp9_highbd_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                               f->uv_stride, predictor + 512,
+                                               mb_uv_width, mb_uv_height,
+                                               adj_strength, filter_weight,
+                                               accumulator + 512, count + 512);
+          } else {
+            // Apply the filter (YUV)
+            vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+                                        predictor, 16, 16,
+                                        strength, filter_weight,
+                                        accumulator, count);
+            vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                        f->uv_stride,
+                                        predictor + 256,
+                                        mb_uv_width, mb_uv_height, strength,
+                                        filter_weight, accumulator + 256,
+                                        count + 256);
+            vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                        f->uv_stride,
+                                        predictor + 512,
+                                        mb_uv_width, mb_uv_height, strength,
+                                        filter_weight, accumulator + 512,
+                                        count + 512);
+          }
+#else
+          // Apply the filter (YUV)
+          // TODO(jingning): Need SIMD optimization for this.
+          vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+                                      predictor, 16, 16,
+                                      strength, filter_weight,
+                                      accumulator, count);
+          vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
+                                      predictor + 256,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 256,
+                                      count + 256);
+          vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
+                                      predictor + 512,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 512,
+                                      count + 512);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        uint16_t *dst1_16;
+        uint16_t *dst2_16;
+        // Normalize filter output to produce AltRef frame
+        dst1 = cpi->alt_ref_buffer.y_buffer;
+        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+        stride = cpi->alt_ref_buffer.y_stride;
+        byte = mb_y_offset;
+        for (i = 0, k = 0; i < 16; i++) {
+          for (j = 0; j < 16; j++, k++) {
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+
+            dst1_16[byte] = (uint16_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+
+          byte += stride - 16;
+        }
+
+        dst1 = cpi->alt_ref_buffer.u_buffer;
+        dst2 = cpi->alt_ref_buffer.v_buffer;
+        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+        dst2_16 = CONVERT_TO_SHORTPTR(dst2);
+        stride = cpi->alt_ref_buffer.uv_stride;
+        byte = mb_uv_offset;
+        for (i = 0, k = 256; i < mb_uv_height; i++) {
+          for (j = 0; j < mb_uv_width; j++, k++) {
+            int m = k + 256;
+
+            // U
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+            dst1_16[byte] = (uint16_t)pval;
+
+            // V
+            pval = accumulator[m] + (count[m] >> 1);
+            pval *= fixed_divide[count[m]];
+            pval >>= 19;
+            dst2_16[byte] = (uint16_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+
+          byte += stride - mb_uv_width;
+        }
+      } else {
+        // Normalize filter output to produce AltRef frame
+        dst1 = cpi->alt_ref_buffer.y_buffer;
+        stride = cpi->alt_ref_buffer.y_stride;
+        byte = mb_y_offset;
+        for (i = 0, k = 0; i < 16; i++) {
+          for (j = 0; j < 16; j++, k++) {
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+
+            dst1[byte] = (uint8_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+          byte += stride - 16;
+        }
+
+        dst1 = cpi->alt_ref_buffer.u_buffer;
+        dst2 = cpi->alt_ref_buffer.v_buffer;
+        stride = cpi->alt_ref_buffer.uv_stride;
+        byte = mb_uv_offset;
+        for (i = 0, k = 256; i < mb_uv_height; i++) {
+          for (j = 0; j < mb_uv_width; j++, k++) {
+            int m = k + 256;
+
+            // U
+            unsigned int pval = accumulator[k] + (count[k] >> 1);
+            pval *= fixed_divide[count[k]];
+            pval >>= 19;
+            dst1[byte] = (uint8_t)pval;
+
+            // V
+            pval = accumulator[m] + (count[m] >> 1);
+            pval *= fixed_divide[count[m]];
+            pval >>= 19;
+            dst2[byte] = (uint8_t)pval;
+
+            // move to next pixel
+            byte++;
+          }
+          byte += stride - mb_uv_width;
+        }
+      }
+#else
+      // Normalize filter output (rounded accumulator / count) into the AltRef
+      dst1 = cpi->alt_ref_buffer.y_buffer;
+      stride = cpi->alt_ref_buffer.y_stride;
+      byte = mb_y_offset;
+      for (i = 0, k = 0; i < 16; i++) {
+        for (j = 0; j < 16; j++, k++) {
+          unsigned int pval = accumulator[k] + (count[k] >> 1);
+          pval *= fixed_divide[count[k]];
+          pval >>= 19;
+
+          dst1[byte] = (uint8_t)pval;
+
+          // move to next pixel
+          byte++;
+        }
+        byte += stride - 16;
+      }
+
+      dst1 = cpi->alt_ref_buffer.u_buffer;
+      dst2 = cpi->alt_ref_buffer.v_buffer;
+      stride = cpi->alt_ref_buffer.uv_stride;
+      byte = mb_uv_offset;
+      for (i = 0, k = 256; i < mb_uv_height; i++) {
+        for (j = 0; j < mb_uv_width; j++, k++) {
+          int m = k + 256;
+
+          // U
+          unsigned int pval = accumulator[k] + (count[k] >> 1);
+          pval *= fixed_divide[count[k]];
+          pval >>= 19;
+          dst1[byte] = (uint8_t)pval;
+
+          // V
+          pval = accumulator[m] + (count[m] >> 1);
+          pval *= fixed_divide[count[m]];
+          pval >>= 19;
+          dst2[byte] = (uint8_t)pval;
+
+          // move to next pixel
+          byte++;
+        }
+        byte += stride - mb_uv_width;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      mb_y_offset += 16;  // Next macroblock in this row.
+      mb_uv_offset += mb_uv_width;
+    }
+    mb_y_offset += 16 * (f->y_stride - mb_cols);  // Start of next MB row.
+    mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
+  }
+
+  // Restore input state
+  for (i = 0; i < MAX_MB_PLANE; i++)
+    mbd->plane[i].pre[0].buf = input_buffer[i];
+}
+
+// Derives the ARNR filter length and strength for this ARF, limited by the
+// lookahead contents, the recent average q and the gf group boost.
+//   distance      - lookahead index of the frame being filtered.
+//   group_boost   - boost value; caps both outputs.
+//   arnr_frames   - out: number of frames to filter.
+//   arnr_strength - out: filter strength.
+static void adjust_arnr_filter(VP9_COMP *cpi,
+                               int distance, int group_boost,
+                               int *arnr_frames, int *arnr_strength) {
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const int max_frames = oxcf->arnr_max_frames;
+  const int boost_frames = group_boost / 150;
+  const int boost_strength = group_boost / 300;
+  const int frames_after_arf =
+      vp9_lookahead_depth(cpi->lookahead) - distance - 1;
+  // Use the inter frame average q when current_video_frame > 1,
+  // otherwise the key frame average.
+  const int q_idx =
+      cpi->common.current_video_frame > 1 ? INTER_FRAME : KEY_FRAME;
+  const int q = (int)vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[q_idx],
+                                             cpi->common.bit_depth);
+  int fwd, bwd;
+  int frames, strength;
+
+  // Forward reach: half the maximum length, bounded by the frames available
+  // after the ARF and by the distance to it.
+  fwd = VPXMIN((max_frames - 1) >> 1, frames_after_arf);
+  fwd = VPXMIN(fwd, distance);
+
+  // Backward reach matches the forward reach, plus one extra frame when the
+  // maximum length is even (e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff).
+  bwd = fwd;
+  if (bwd < distance)
+    bwd += !(max_frames & 1);
+
+  // Baseline filter size, then cap it by the boost; a capped size is bumped
+  // to the next odd value.
+  frames = bwd + 1 + fwd;
+  if (frames > boost_frames)
+    frames = boost_frames + !(boost_frames & 1);
+
+  // Strength is reduced for low q (q <= 16), never below zero there;
+  // it is then capped by the boost.
+  strength = oxcf->arnr_strength;
+  if (q <= 16) {
+    strength -= (16 - q) / 2;
+    if (strength < 0)
+      strength = 0;
+  }
+  if (strength > boost_strength) {
+    strength = boost_strength;
+  }
+
+  // Adjustments for second level arf in multi arf case.
+  if (oxcf->pass == 2 && cpi->multi_arf_allowed) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    if (gf_group->rf_level[gf_group->index] != GF_ARF_STD)
+      strength >>= 1;
+  }
+
+  *arnr_frames = frames;
+  *arnr_strength = strength;
+}
+
+// Builds the temporally filtered ARF into cpi->alt_ref_buffer from the
+// lookahead frames around index `distance`. Nothing is filtered when the
+// adjusted filter length is not positive.
+void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  int frame, start_frame, strength, frames_to_blur;
+  int frames_to_blur_backward, frames_to_blur_forward;
+  struct scale_factors sf;
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = {NULL};
+
+  // Apply context specific adjustments to the arnr filter parameters.
+  adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
+  frames_to_blur_backward = (frames_to_blur / 2);
+  frames_to_blur_forward = ((frames_to_blur - 1) / 2);
+  start_frame = distance + frames_to_blur_forward;
+
+  // Setup frame pointers, NULL indicates frame not included in filter.
+  for (frame = 0; frame < frames_to_blur; ++frame) {
+    const int which_buffer = start_frame - frame;
+    struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
+                                                     which_buffer);
+    frames[frames_to_blur - 1 - frame] = (buf != NULL) ? &buf->img : NULL;
+  }
+
+  if (frames_to_blur > 0) {
+    // Setup scaling factors. Scaling on each of the arnr frames is not
+    // supported.
+    if (cpi->use_svc) {
+      // In spatial svc the scaling factors might be less than 1/2.
+      // So we will use non-normative scaling.
+      int frame_used = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+      vp9_setup_scale_factors_for_frame(
+          &sf,
+          get_frame_new_buffer(cm)->y_crop_width,
+          get_frame_new_buffer(cm)->y_crop_height,
+          get_frame_new_buffer(cm)->y_crop_width,
+          get_frame_new_buffer(cm)->y_crop_height,
+          cm->use_highbitdepth);
+#else
+      vp9_setup_scale_factors_for_frame(
+          &sf,
+          get_frame_new_buffer(cm)->y_crop_width,
+          get_frame_new_buffer(cm)->y_crop_height,
+          get_frame_new_buffer(cm)->y_crop_width,
+          get_frame_new_buffer(cm)->y_crop_height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      for (frame = 0; frame < frames_to_blur; ++frame) {
+        if (frames[frame] != NULL &&  // Missing frames need no rescaling.
+            (cm->mi_cols * MI_SIZE != frames[frame]->y_width ||
+             cm->mi_rows * MI_SIZE != frames[frame]->y_height)) {
+          if (vpx_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used],
+                                       cm->width, cm->height,
+                                       cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                       cm->use_highbitdepth,
+#endif
+                                       VP9_ENC_BORDER_IN_PIXELS,
+                                       cm->byte_alignment,
+                                       NULL, NULL, NULL)) {
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to reallocate alt_ref_buffer");
+          }
+          frames[frame] = vp9_scale_if_required(
+              cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0);
+          ++frame_used;
+        }
+      }
+      cm->mi = cm->mip + cm->mi_stride + 1;
+      xd->mi = cm->mi_grid_visible;
+      xd->mi[0] = cm->mi;
+    } else {
+      // ARF is produced at the native frame size and resized when coded.
+#if CONFIG_VP9_HIGHBITDEPTH
+      vp9_setup_scale_factors_for_frame(&sf,
+                                        frames[0]->y_crop_width,
+                                        frames[0]->y_crop_height,
+                                        frames[0]->y_crop_width,
+                                        frames[0]->y_crop_height,
+                                        cm->use_highbitdepth);
+#else
+      vp9_setup_scale_factors_for_frame(&sf,
+                                        frames[0]->y_crop_width,
+                                        frames[0]->y_crop_height,
+                                        frames[0]->y_crop_width,
+                                        frames[0]->y_crop_height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    // sf and frames[] are only valid here, so filter inside the guard.
+    temporal_filter_iterate_c(cpi, frames, frames_to_blur,
+                              frames_to_blur_backward, strength, &sf);
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_temporal_filter.h b/libs/libvpx/vp9/encoder/vp9_temporal_filter.h
new file mode 100644
index 0000000000..f537b8870a
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_temporal_filter.h
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
+#define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct VP9_COMP;  // forward declaration: keeps this header self-contained
+void vp9_temporal_filter_init(void);
+void vp9_temporal_filter(struct VP9_COMP *cpi, int distance);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_tokenize.c b/libs/libvpx/vp9/encoder/vp9_tokenize.c
new file mode 100644
index 0000000000..93be6d7ae3
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_tokenize.c
@@ -0,0 +1,477 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_tokenize.h"
+
+static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
+  {9, 63}, {9, 61}, {9, 59}, {9, 57}, {9, 55}, {9, 53}, {9, 51}, {9, 49},
+  {9, 47}, {9, 45}, {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33},
+  {9, 31}, {9, 29}, {9, 27}, {9, 25}, {9, 23}, {9, 21}, {9, 19}, {9, 17},
+  {9, 15}, {9, 13}, {9, 11}, {9, 9}, {9, 7}, {9, 5}, {9, 3}, {9, 1},
+  {8, 31}, {8, 29}, {8, 27}, {8, 25}, {8, 23}, {8, 21},
+  {8, 19}, {8, 17}, {8, 15}, {8, 13}, {8, 11}, {8, 9},
+  {8, 7}, {8, 5}, {8, 3}, {8, 1},
+  {7, 15}, {7, 13}, {7, 11}, {7, 9}, {7, 7}, {7, 5}, {7, 3}, {7, 1},
+  {6, 7}, {6, 5}, {6, 3}, {6, 1}, {5, 3}, {5, 1},
+  {4, 1}, {3, 1}, {2, 1}, {1, 1}, {0, 0},
+  {1, 0},  {2, 0}, {3, 0}, {4, 0},
+  {5, 0}, {5, 2}, {6, 0}, {6, 2}, {6, 4}, {6, 6},
+  {7, 0}, {7, 2}, {7, 4}, {7, 6}, {7, 8}, {7, 10}, {7, 12}, {7, 14},
+  {8, 0}, {8, 2}, {8, 4}, {8, 6}, {8, 8}, {8, 10}, {8, 12},
+  {8, 14}, {8, 16}, {8, 18}, {8, 20}, {8, 22}, {8, 24},
+  {8, 26}, {8, 28}, {8, 30}, {9, 0}, {9, 2},
+  {9, 4}, {9, 6}, {9, 8}, {9, 10}, {9, 12}, {9, 14}, {9, 16},
+  {9, 18}, {9, 20}, {9, 22}, {9, 24}, {9, 26}, {9, 28},
+  {9, 30}, {9, 32}, {9, 34}, {9, 36}, {9, 38}, {9, 40},
+  {9, 42}, {9, 44}, {9, 46}, {9, 48}, {9, 50}, {9, 52},
+  {9, 54}, {9, 56}, {9, 58}, {9, 60}, {9, 62}
+};
+const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
+    (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
+    / 2;
+
+// Array indices are identical to previously-existing CONTEXT_NODE indices
+const vpx_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+  -EOB_TOKEN, 2,                       // 0  = EOB
+  -ZERO_TOKEN, 4,                      // 1  = ZERO
+  -ONE_TOKEN, 6,                       // 2  = ONE
+  8, 12,                               // 3  = LOW_VAL
+  -TWO_TOKEN, 10,                      // 4  = TWO
+  -THREE_TOKEN, -FOUR_TOKEN,           // 5  = THREE
+  14, 16,                              // 6  = HIGH_LOW
+  -CATEGORY1_TOKEN, -CATEGORY2_TOKEN,  // 7  = CAT_ONE
+  18, 20,                              // 8  = CAT_THREEFOUR
+  -CATEGORY3_TOKEN, -CATEGORY4_TOKEN,  // 9  = CAT_THREE
+  -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 10 = CAT_FIVE
+};
+
+static const int16_t zero_cost[] = {0};
+static const int16_t sign_cost[1] = {512};
+static const int16_t cat1_cost[1 << 1] = {864, 1229};
+static const int16_t cat2_cost[1 << 2] = {1256, 1453, 1696, 1893};
+static const int16_t cat3_cost[1 << 3] = {1652, 1791, 1884, 2023,
+                                          2195, 2334, 2427, 2566};
+static const int16_t cat4_cost[1 << 4] = {2079, 2160, 2218, 2299, 2395, 2476,
+                                          2534, 2615, 2661, 2742, 2800, 2881,
+                                          2977, 3058, 3116, 3197};
+static const int16_t cat5_cost[1 << 5] = {
+    2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963,
+    2986, 3044, 3067, 3113, 3136, 3190, 3213, 3259, 3282, 3340, 3363,
+    3409, 3432, 3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773};
+const int16_t vp9_cat6_low_cost[256] = {
+    3378, 3390, 3401, 3413, 3435, 3447, 3458, 3470, 3517, 3529, 3540, 3552,
+    3574, 3586, 3597, 3609, 3671, 3683, 3694, 3706, 3728, 3740, 3751, 3763,
+    3810, 3822, 3833, 3845, 3867, 3879, 3890, 3902, 3973, 3985, 3996, 4008,
+    4030, 4042, 4053, 4065, 4112, 4124, 4135, 4147, 4169, 4181, 4192, 4204,
+    4266, 4278, 4289, 4301, 4323, 4335, 4346, 4358, 4405, 4417, 4428, 4440,
+    4462, 4474, 4485, 4497, 4253, 4265, 4276, 4288, 4310, 4322, 4333, 4345,
+    4392, 4404, 4415, 4427, 4449, 4461, 4472, 4484, 4546, 4558, 4569, 4581,
+    4603, 4615, 4626, 4638, 4685, 4697, 4708, 4720, 4742, 4754, 4765, 4777,
+    4848, 4860, 4871, 4883, 4905, 4917, 4928, 4940, 4987, 4999, 5010, 5022,
+    5044, 5056, 5067, 5079, 5141, 5153, 5164, 5176, 5198, 5210, 5221, 5233,
+    5280, 5292, 5303, 5315, 5337, 5349, 5360, 5372, 4988, 5000, 5011, 5023,
+    5045, 5057, 5068, 5080, 5127, 5139, 5150, 5162, 5184, 5196, 5207, 5219,
+    5281, 5293, 5304, 5316, 5338, 5350, 5361, 5373, 5420, 5432, 5443, 5455,
+    5477, 5489, 5500, 5512, 5583, 5595, 5606, 5618, 5640, 5652, 5663, 5675,
+    5722, 5734, 5745, 5757, 5779, 5791, 5802, 5814, 5876, 5888, 5899, 5911,
+    5933, 5945, 5956, 5968, 6015, 6027, 6038, 6050, 6072, 6084, 6095, 6107,
+    5863, 5875, 5886, 5898, 5920, 5932, 5943, 5955, 6002, 6014, 6025, 6037,
+    6059, 6071, 6082, 6094, 6156, 6168, 6179, 6191, 6213, 6225, 6236, 6248,
+    6295, 6307, 6318, 6330, 6352, 6364, 6375, 6387, 6458, 6470, 6481, 6493,
+    6515, 6527, 6538, 6550, 6597, 6609, 6620, 6632, 6654, 6666, 6677, 6689,
+    6751, 6763, 6774, 6786, 6808, 6820, 6831, 6843, 6890, 6902, 6913, 6925,
+    6947, 6959, 6970, 6982};
+const int vp9_cat6_high_cost[64] = {
+    88,    2251,  2727,  4890,  3148,  5311,  5787,  7950,  3666,  5829,  6305,
+    8468,  6726,  8889,  9365,  11528, 3666,  5829,  6305,  8468,  6726,  8889,
+    9365,  11528, 7244,  9407,  9883,  12046, 10304, 12467, 12943, 15106, 3666,
+    5829,  6305,  8468,  6726,  8889,  9365,  11528, 7244,  9407,  9883,  12046,
+    10304, 12467, 12943, 15106, 7244,  9407,  9883,  12046, 10304, 12467, 12943,
+    15106, 10822, 12985, 13461, 15624, 13882, 16045, 16521, 18684};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+const int vp9_cat6_high10_high_cost[256] = {
+    94,    2257,  2733,  4896,  3154,  5317,  5793,  7956,  3672,  5835,  6311,
+    8474,  6732,  8895,  9371,  11534, 3672,  5835,  6311,  8474,  6732,  8895,
+    9371,  11534, 7250,  9413,  9889,  12052, 10310, 12473, 12949, 15112, 3672,
+    5835,  6311,  8474,  6732,  8895,  9371,  11534, 7250,  9413,  9889,  12052,
+    10310, 12473, 12949, 15112, 7250,  9413,  9889,  12052, 10310, 12473, 12949,
+    15112, 10828, 12991, 13467, 15630, 13888, 16051, 16527, 18690, 4187,  6350,
+    6826,  8989,  7247,  9410,  9886,  12049, 7765,  9928,  10404, 12567, 10825,
+    12988, 13464, 15627, 7765,  9928,  10404, 12567, 10825, 12988, 13464, 15627,
+    11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 7765,  9928,  10404,
+    12567, 10825, 12988, 13464, 15627, 11343, 13506, 13982, 16145, 14403, 16566,
+    17042, 19205, 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 14921,
+    17084, 17560, 19723, 17981, 20144, 20620, 22783, 4187,  6350,  6826,  8989,
+    7247,  9410,  9886,  12049, 7765,  9928,  10404, 12567, 10825, 12988, 13464,
+    15627, 7765,  9928,  10404, 12567, 10825, 12988, 13464, 15627, 11343, 13506,
+    13982, 16145, 14403, 16566, 17042, 19205, 7765,  9928,  10404, 12567, 10825,
+    12988, 13464, 15627, 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205,
+    11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 14921, 17084, 17560,
+    19723, 17981, 20144, 20620, 22783, 8280,  10443, 10919, 13082, 11340, 13503,
+    13979, 16142, 11858, 14021, 14497, 16660, 14918, 17081, 17557, 19720, 11858,
+    14021, 14497, 16660, 14918, 17081, 17557, 19720, 15436, 17599, 18075, 20238,
+    18496, 20659, 21135, 23298, 11858, 14021, 14497, 16660, 14918, 17081, 17557,
+    19720, 15436, 17599, 18075, 20238, 18496, 20659, 21135, 23298, 15436, 17599,
+    18075, 20238, 18496, 20659, 21135, 23298, 19014, 21177, 21653, 23816, 22074,
+    24237, 24713, 26876};
+const int vp9_cat6_high12_high_cost[1024] = {
+    100,   2263,  2739,  4902,  3160,  5323,  5799,  7962,  3678,  5841,  6317,
+    8480,  6738,  8901,  9377,  11540, 3678,  5841,  6317,  8480,  6738,  8901,
+    9377,  11540, 7256,  9419,  9895,  12058, 10316, 12479, 12955, 15118, 3678,
+    5841,  6317,  8480,  6738,  8901,  9377,  11540, 7256,  9419,  9895,  12058,
+    10316, 12479, 12955, 15118, 7256,  9419,  9895,  12058, 10316, 12479, 12955,
+    15118, 10834, 12997, 13473, 15636, 13894, 16057, 16533, 18696, 4193,  6356,
+    6832,  8995,  7253,  9416,  9892,  12055, 7771,  9934,  10410, 12573, 10831,
+    12994, 13470, 15633, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633,
+    11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771,  9934,  10410,
+    12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572,
+    17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927,
+    17090, 17566, 19729, 17987, 20150, 20626, 22789, 4193,  6356,  6832,  8995,
+    7253,  9416,  9892,  12055, 7771,  9934,  10410, 12573, 10831, 12994, 13470,
+    15633, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512,
+    13988, 16151, 14409, 16572, 17048, 19211, 7771,  9934,  10410, 12573, 10831,
+    12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211,
+    11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566,
+    19729, 17987, 20150, 20626, 22789, 8286,  10449, 10925, 13088, 11346, 13509,
+    13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864,
+    14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244,
+    18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563,
+    19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605,
+    18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080,
+    24243, 24719, 26882, 4193,  6356,  6832,  8995,  7253,  9416,  9892,  12055,
+    7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 7771,  9934,  10410,
+    12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572,
+    17048, 19211, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 11349,
+    13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349, 13512, 13988, 16151,
+    14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729, 17987, 20150, 20626,
+    22789, 8286,  10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027,
+    14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924,
+    17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
+    11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081,
+    20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665,
+    21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 8286,
+    10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666,
+    14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563,
+    19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027,
+    14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502,
+    20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
+    19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542, 15018,
+    17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180,
+    21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535,
+    21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759,
+    19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234,
+    27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276,
+    25752, 27915, 26173, 28336, 28812, 30975, 4193,  6356,  6832,  8995,  7253,
+    9416,  9892,  12055, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633,
+    7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988,
+    16151, 14409, 16572, 17048, 19211, 7771,  9934,  10410, 12573, 10831, 12994,
+    13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349,
+    13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729,
+    17987, 20150, 20626, 22789, 8286,  10449, 10925, 13088, 11346, 13509, 13985,
+    16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027,
+    14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502,
+    20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
+    15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081,
+    20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243,
+    24719, 26882, 8286,  10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864,
+    14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666,
+    14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141,
+    23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605,
+    18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502,
+    20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882,
+    12379, 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596,
+    20759, 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180,
+    21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957,
+    18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337,
+    22595, 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234,
+    27397, 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, 8286,  10449,
+    10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924,
+    17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
+    15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503,
+    16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665,
+    21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020,
+    21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542, 15018, 17181,
+    15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
+    23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698,
+    22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017,
+    21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397,
+    19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752,
+    27915, 26173, 28336, 28812, 30975, 12379, 14542, 15018, 17181, 15439, 17602,
+    18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 15957,
+    18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337,
+    22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
+    23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 19535, 21698,
+    22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752, 27915, 26173,
+    28336, 28812, 30975, 16472, 18635, 19111, 21274, 19532, 21695, 22171, 24334,
+    20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 20050, 22213, 22689,
+    24852, 23110, 25273, 25749, 27912, 23628, 25791, 26267, 28430, 26688, 28851,
+    29327, 31490, 20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 23628,
+    25791, 26267, 28430, 26688, 28851, 29327, 31490, 23628, 25791, 26267, 28430,
+    26688, 28851, 29327, 31490, 27206, 29369, 29845, 32008, 30266, 32429, 32905,
+    35068};
+#endif
+
+const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
+  {0, 0, 0, zero_cost},                          // ZERO_TOKEN
+  {0, 0, 1, sign_cost},                          // ONE_TOKEN
+  {0, 0, 2, sign_cost},                          // TWO_TOKEN
+  {0, 0, 3, sign_cost},                          // THREE_TOKEN
+  {0, 0, 4, sign_cost},                          // FOUR_TOKEN
+  {vp9_cat1_prob, 1,  CAT1_MIN_VAL, cat1_cost},  // CATEGORY1_TOKEN
+  {vp9_cat2_prob, 2,  CAT2_MIN_VAL, cat2_cost},  // CATEGORY2_TOKEN
+  {vp9_cat3_prob, 3,  CAT3_MIN_VAL, cat3_cost},  // CATEGORY3_TOKEN
+  {vp9_cat4_prob, 4,  CAT4_MIN_VAL, cat4_cost},  // CATEGORY4_TOKEN
+  {vp9_cat5_prob, 5,  CAT5_MIN_VAL, cat5_cost},  // CATEGORY5_TOKEN
+  {vp9_cat6_prob, 14, CAT6_MIN_VAL, 0},          // CATEGORY6_TOKEN
+  {0, 0, 0, zero_cost}                           // EOB_TOKEN
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS] = {
+  {0, 0, 0, zero_cost},                             // ZERO
+  {0, 0, 1, sign_cost},                             // ONE
+  {0, 0, 2, sign_cost},                             // TWO
+  {0, 0, 3, sign_cost},                             // THREE
+  {0, 0, 4, sign_cost},                             // FOUR
+  {vp9_cat1_prob, 1,  CAT1_MIN_VAL, cat1_cost},     // CAT1
+  {vp9_cat2_prob, 2,  CAT2_MIN_VAL, cat2_cost},     // CAT2
+  {vp9_cat3_prob, 3,  CAT3_MIN_VAL, cat3_cost},     // CAT3
+  {vp9_cat4_prob, 4,  CAT4_MIN_VAL, cat4_cost},     // CAT4
+  {vp9_cat5_prob, 5,  CAT5_MIN_VAL, cat5_cost},     // CAT5
+  {vp9_cat6_prob_high12 + 2, 16, CAT6_MIN_VAL, 0},  // CAT6
+  {0, 0, 0, zero_cost}                              // EOB
+};
+const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = {
+  {0, 0, 0, zero_cost},                          // ZERO
+  {0, 0, 1, sign_cost},                          // ONE
+  {0, 0, 2, sign_cost},                          // TWO
+  {0, 0, 3, sign_cost},                          // THREE
+  {0, 0, 4, sign_cost},                          // FOUR
+  {vp9_cat1_prob, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
+  {vp9_cat2_prob, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
+  {vp9_cat3_prob, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
+  {vp9_cat4_prob, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
+  {vp9_cat5_prob, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
+  {vp9_cat6_prob_high12, 18, CAT6_MIN_VAL, 0},   // CAT6
+  {0, 0, 0, zero_cost}                           // EOB
+};
+#endif
+
+const struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS] = {
+  {2, 2}, {6, 3}, {28, 5}, {58, 6}, {59, 6}, {60, 6}, {61, 6}, {124, 7},
+  {125, 7}, {126, 7}, {127, 7}, {0, 1}
+};
+
+
+struct tokenize_b_args {
+  VP9_COMP *cpi;  // encoder instance; supplies coef_probs and segmentation
+  ThreadData *td;  // per-thread state: macroblock and counters
+  TOKENEXTRA **tp;  // in/out cursor into the token output buffer
+};
+
+static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
+                                  TX_SIZE tx_size, void *arg) {
+  // Dry-run counterpart of tokenize_b(): only the entropy contexts change.
+  const struct tokenize_b_args *const args = arg;
+  MACROBLOCK *const mb = &args->td->mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  const int has_coeffs = mb->plane[plane].eobs[block] > 0;
+  int above_off, left_off;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &above_off,
+                           &left_off);
+  vp9_set_contexts(mbd, &mbd->plane[plane], plane_bsize, tx_size, has_coeffs,
+                   above_off, left_off);
+}
+
+static INLINE void add_token(TOKENEXTRA **t, const vpx_prob *context_tree,
+                             int16_t token, EXTRABIT extra,
+                             unsigned int *counts) {
+  TOKENEXTRA *const slot = (*t)++;  // take the next entry, advance the cursor
+  slot->context_tree = context_tree;
+  slot->token = token;
+  slot->extra = extra;
+  counts[token]++;  // tally the token in the caller's histogram
+}
+
+static INLINE void add_token_no_extra(TOKENEXTRA **t,
+                                      const vpx_prob *context_tree,
+                                      int16_t token,
+                                      unsigned int *counts) {
+  TOKENEXTRA *const slot = (*t)++;  // the entry's extra field is left unset
+  slot->context_tree = context_tree;
+  slot->token = token;
+  counts[token]++;  // tally the token in the caller's histogram
+}
+
+static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
+                             TX_SIZE tx_size) {
+  if (segfeature_active(seg, segment_id, SEG_LVL_SKIP)) return 0;
+  return 16 << (tx_size << 1);  // 16 coefficients, x4 per transform-size step
+}
+
+static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
+                       TX_SIZE tx_size, void *arg) {
+  struct tokenize_b_args* const args = arg;
+  VP9_COMP *cpi = args->cpi;
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TOKENEXTRA **tp = args->tp;
+  uint8_t token_cache[32 * 32];  // energy class of each position coded so far
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  MODE_INFO *mi = xd->mi[0];
+  int pt; /* near block/prev token context index */
+  int c;  // current position in scan order
+  TOKENEXTRA *t = *tp;        /* store tokens starting here */
+  int eob = p->eobs[block];
+  const PLANE_TYPE type = get_plane_type(plane);
+  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  const int segment_id = mi->segment_id;
+  const int16_t *scan, *nb;
+  const scan_order *so;
+  const int ref = is_inter_block(mi);
+  unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+      td->rd_counts.coef_counts[tx_size][type][ref];
+  vpx_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+      cpi->common.fc->coef_probs[tx_size][type][ref];
+  unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
+      td->counts->eob_branch[tx_size][type][ref];
+  const uint8_t *const band = get_band_translate(tx_size);
+  const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+  int16_t token;
+  EXTRABIT extra;
+  int aoff, loff;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
+
+  pt = get_entropy_context(tx_size, pd->above_context + aoff,
+                           pd->left_context + loff);
+  so = get_scan(xd, tx_size, type, block);
+  scan = so->scan;
+  nb = so->neighbors;
+  c = 0;
+
+  while (c < eob) {  // each pass emits a zero run plus one non-zero token
+    int v = 0;
+    v = qcoeff[scan[c]];
+    ++eob_branch[band[c]][pt];
+
+    while (!v) {  // NOTE(review): unbounded; assumes coeff at eob - 1 != 0
+      add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN,
+                         counts[band[c]][pt]);
+
+      token_cache[scan[c]] = 0;
+      ++c;
+      pt = get_coef_context(nb, token_cache, c);
+      v = qcoeff[scan[c]];
+    }
+
+    vp9_get_token_extra(v, &token, &extra);
+
+    add_token(&t, coef_probs[band[c]][pt], token, extra,
+              counts[band[c]][pt]);
+
+    token_cache[scan[c]] = vp9_pt_energy_class[token];
+    ++c;
+    pt = get_coef_context(nb, token_cache, c);
+  }
+  if (c < seg_eob) {  // stopped before the maximum: emit an explicit EOB
+    ++eob_branch[band[c]][pt];
+    add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN,
+                       counts[band[c]][pt]);
+  }
+
+  *tp = t;  // hand the advanced write position back to the caller
+
+  vp9_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, aoff, loff);
+}
+
+struct is_skippable_args {
+  uint16_t *eobs;  // end-of-block positions, indexed by block
+  int *skippable;  // result flag, updated in place by the visitor callbacks
+};
+static void is_skippable(int plane, int block,
+                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                         void *argv) {
+  const struct is_skippable_args *const state = argv;
+  (void)plane;
+  (void)plane_bsize;
+  (void)tx_size;
+  *state->skippable &= (state->eobs[block] == 0);  // any coeff clears it
+}
+
+// TODO(yaowu): rewrite and optimize this function to simplify is_skippable()
+//              and drop vp9_foreach_transformed_block_in_plane().
+int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+  int result = 1;  // stays 1 only if every visited block has eob == 0
+  struct is_skippable_args args = {x->plane[plane].eobs, &result};
+  vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable,
+                                         &args);
+  return result;
+}
+
+static void has_high_freq_coeff(int plane, int block,
+                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                void *argv) {
+  const struct is_skippable_args *const state = argv;
+  // An eob beyond this position counts as high-frequency content.
+  const int eob_limit = (tx_size != TX_4X4) ? 10 : 3;
+  (void)plane;
+  (void)plane_bsize;
+  state->skippable[0] |= (state->eobs[block] > eob_limit);
+}
+
+int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+  int found = 0;  // set by has_high_freq_coeff() once any block qualifies
+  struct is_skippable_args visit = {x->plane[plane].eobs, &found};
+  vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane,
+                                         has_high_freq_coeff, &visit);
+  return found;
+}
+
+void vp9_tokenize_sb(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+                     int dry_run, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
+  const MODE_INFO *const mi = xd->mi[0];
+  const int ctx = vp9_get_skip_context(xd);
+  // Contributes 0 to the skip counters when the segment forces skipping.
+  const int skip_inc =
+      !segfeature_active(&cpi->common.seg, mi->segment_id, SEG_LVL_SKIP);
+  struct tokenize_b_args arg = {cpi, td, t};
+  if (mi->skip) {
+    // Skipped block: count it (real runs only) and reset the skip context.
+    if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
+    reset_skip_context(xd, bsize);
+    return;
+  }
+
+  if (dry_run) {
+    // Dry run: update the entropy contexts without emitting any tokens.
+    vp9_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
+    return;
+  }
+  td->counts->skip[ctx][0] += skip_inc;
+  vp9_foreach_transformed_block(xd, bsize, tokenize_b, &arg);
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_tokenize.h b/libs/libvpx/vp9/encoder/vp9_tokenize.h
new file mode 100644
index 0000000000..df979b25dd
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_tokenize.h
@@ -0,0 +1,125 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_TOKENIZE_H_
+#define VP9_ENCODER_VP9_TOKENIZE_H_
+
+#include "vp9/common/vp9_entropy.h"
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_treewriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define EOSB_TOKEN 127     // Not signalled, encoder only
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  typedef int32_t EXTRABIT;
+#else
+  typedef int16_t EXTRABIT;
+#endif
+
+
+typedef struct {
+  int16_t token;  // token id; also the index into vp9_extra_bits
+  EXTRABIT extra;  // extra-bits payload; bit 0 carries the sign
+} TOKENVALUE;
+
+typedef struct {
+  const vpx_prob *context_tree;  // probabilities in effect for this token
+  int16_t token;  // token id; also the index into vp9_extra_bits
+  EXTRABIT extra;  // left unset for tokens stored without extra bits
+} TOKENEXTRA;
+
+extern const vpx_tree_index vp9_coef_tree[];
+extern const vpx_tree_index vp9_coef_con_tree[];
+extern const struct vp9_token vp9_coef_encodings[];
+
+int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+
+struct VP9_COMP;
+struct ThreadData;
+
+void vp9_tokenize_sb(struct VP9_COMP *cpi, struct ThreadData *td,
+                     TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+
+typedef struct {
+  const vpx_prob *prob;  // per-category table (vp9_catN_prob); 0 if none
+  int len;  // presumably the number of extra bits -- TODO confirm
+  int base_val;  // CATn_MIN_VAL for category tokens, else the token's value
+  const int16_t *cost;  // indexed by extra >> 1; 0 for CATEGORY6_TOKEN
+} vp9_extra_bit;
+
+// indexed by token value
+extern const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS];
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS];
+extern const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+extern const int16_t *vp9_dct_value_cost_ptr;
+/* TODO: The Token field should be broken out into a separate char array to
+ *  improve cache locality, since it's needed for costing when the rest of the
+ *  fields are not.
+ */
+extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
+extern const int16_t vp9_cat6_low_cost[256];
+extern const int vp9_cat6_high_cost[64];
+extern const int vp9_cat6_high10_high_cost[256];
+extern const int vp9_cat6_high12_high_cost[1024];
+static INLINE int vp9_get_cost(int16_t token, EXTRABIT extrabits,
+                               const int *cat6_high_table) {
+  const int mag = extrabits >> 1;  // bit 0 of the extra field is the sign
+  return token != CATEGORY6_TOKEN
+             ? vp9_extra_bits[token].cost[mag]
+             : vp9_cat6_low_cost[mag & 0xff] + cat6_high_table[extrabits >> 9];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE const int* vp9_get_high_cost_table(int bit_depth) {
+  if (bit_depth == 8) return vp9_cat6_high_cost;
+  if (bit_depth == 10) return vp9_cat6_high10_high_cost;
+  return vp9_cat6_high12_high_cost;  // any other depth uses the 12-bit table
+}
+#else
+static INLINE const int* vp9_get_high_cost_table(int bit_depth) {
+  (void) bit_depth;  // only the 8-bit table is used in this configuration
+  return vp9_cat6_high_cost;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void vp9_get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
+  if (v >= CAT6_MIN_VAL) {
+    *token = CATEGORY6_TOKEN;
+    *extra = 2 * v - 2 * CAT6_MIN_VAL;  // even: sign bit clear
+  } else if (v <= -CAT6_MIN_VAL) {
+    *token = CATEGORY6_TOKEN;
+    *extra = -2 * v - 2 * CAT6_MIN_VAL + 1;  // odd: sign bit set
+  } else {
+    *token = vp9_dct_cat_lt_10_value_tokens[v].token;  // signed table index
+    *extra = vp9_dct_cat_lt_10_value_tokens[v].extra;
+  }
+}
+static INLINE int16_t vp9_get_token(int v) {  // token id only, no extra bits
+  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL)
+    return CATEGORY6_TOKEN;  // magnitude lies outside the lookup table
+  return vp9_dct_cat_lt_10_value_tokens[v].token;  // table takes signed index
+}
+
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_TOKENIZE_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_treewriter.c b/libs/libvpx/vp9/encoder/vp9_treewriter.c
new file mode 100644
index 0000000000..0fc078e0a7
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_treewriter.c
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_treewriter.h"
+
+static void tree2tok(struct vp9_token *tokens, const vpx_tree_index *tree,
+                     int i, int v, int l) {
+  v += v;  // shift the code left to make room for this level's bit
+  ++l;  // one more bit of code length per tree level
+
+  do {
+    const vpx_tree_index j = tree[i++];
+    if (j <= 0) {  // leaf: -j is the token index
+      tokens[-j].value = v;
+      tokens[-j].len = l;
+    } else {  // interior node: j is the index of the child pair
+      tree2tok(tokens, tree, j, v, l);
+    }
+  } while (++v & 1);  // v turns odd for the second branch, even to stop
+}
+
+void vp9_tokens_from_tree(struct vp9_token *tokens,
+                          const vpx_tree_index *tree) {
+  tree2tok(tokens, tree, 0, 0, 0);  // start at the root with an empty code
+}
+
+static unsigned int convert_distribution(unsigned int i, vpx_tree tree,
+                                         unsigned int branch_ct[][2],
+                                         const unsigned int num_events[]) {
+  // Event total beneath each of the two branches of the node at tree[i].
+  unsigned int subtotal[2];
+  int side;
+
+  for (side = 0; side < 2; ++side) {
+    const vpx_tree_index child = tree[i + side];
+    // A non-positive entry is a leaf holding the negated event index.
+    subtotal[side] =
+        (child <= 0) ? num_events[-child]
+                     : convert_distribution(child, tree, branch_ct, num_events);
+  }
+
+  branch_ct[i >> 1][0] = subtotal[0];
+  branch_ct[i >> 1][1] = subtotal[1];
+  return subtotal[0] + subtotal[1];
+}
+
+void vp9_tree_probs_from_distribution(vpx_tree tree,
+                                      unsigned int branch_ct[/* n-1 */][2],
+                                      const unsigned int num_events[/* n */]) {
+  convert_distribution(0, tree, branch_ct, num_events);  // total is unused
+}
diff --git a/libs/libvpx/vp9/encoder/vp9_treewriter.h b/libs/libvpx/vp9/encoder/vp9_treewriter.h
new file mode 100644
index 0000000000..0f89350763
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_treewriter.h
@@ -0,0 +1,51 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_TREEWRITER_H_
+#define VP9_ENCODER_VP9_TREEWRITER_H_
+
+#include "vpx_dsp/bitwriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_tree_probs_from_distribution(vpx_tree tree,
+                                      unsigned int branch_ct[ /* n - 1 */ ][2],
+                                      const unsigned int num_events[ /* n */ ]);
+
+struct vp9_token {  // bit code for one tree leaf
+  int value;  // code bits; vp9_write_tree() emits them most-significant first
+  int len;  // number of bits in value
+};
+
+void vp9_tokens_from_tree(struct vp9_token*, const vpx_tree_index *);
+
+static INLINE void vp9_write_tree(vpx_writer *w, const vpx_tree_index *tree,
+                                  const vpx_prob *probs, int bits, int len,
+                                  vpx_tree_index i) {  // i: tree index to start from
+  do {
+    const int bit = (bits >> --len) & 1;  // consume bits from the top down
+    vpx_write(w, bit, probs[i >> 1]);  // one probability per tree pair
+    i = tree[i + bit];  // follow the branch selected by bit
+  } while (len);  // len must be >= 1 on entry: len == 0 would shift by -1
+}
+
+static INLINE void vp9_write_token(vpx_writer *w, const vpx_tree_index *tree,
+                                   const vpx_prob *probs,
+                                   const struct vp9_token *token) {
+  vp9_write_tree(w, tree, probs, token->value, token->len, 0);  // whole code, from index 0
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_TREEWRITER_H_
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm b/libs/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
new file mode 100644
index 0000000000..7a7a6b6555
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
@@ -0,0 +1,104 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0         ; in: m0=a1 m1=b1 m2=c1 m3=d1, clobbers m4/m5
+  paddw           m0,        m1 ; a1 += b1
+  movq            m4,        m0
+  psubw           m3,        m2 ; d1 -= c1
+  psubw           m4,        m3
+  psraw           m4,        1  ; e1 = (a1 - d1) >> 1
+  movq            m5,        m4
+  psubw           m5,        m1 ;b1 = e1 - b1
+  psubw           m4,        m2 ;c1 = e1 - c1
+  psubw           m0,        m4 ; a1 -= c1
+  paddw           m3,        m5 ; d1 += b1
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0          ; operates on m0-m3, clobbers m4/m5
+  movq            m4,        m0
+  movq            m5,        m2
+  punpcklwd       m4,        m1 ; interleave low words of m0 and m1
+  punpckhwd       m0,        m1 ; interleave high words of m0 and m1
+  punpcklwd       m5,        m3 ; interleave low words of m2 and m3
+  punpckhwd       m2,        m3 ; interleave high words of m2 and m3
+  movq            m1,        m4
+  movq            m3,        m0
+  punpckldq       m1,        m5 ; dword interleaves finish the transpose
+  punpckhdq       m4,        m5
+  punpckldq       m3,        m2
+  punpckhdq       m0,        m2
+  SWAP            2, 3, 0, 1, 4 ; rename registers so the results sit in m0-m3
+%endmacro
+
+INIT_MMX mmx
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+  lea             r3q,       [inputq + strideq*4] ; rows 2-3 base; assumes 16-bit samples - confirm
+  movq            m0,        [inputq] ;a1
+  movq            m1,        [inputq + strideq*2] ;b1
+  movq            m2,        [r3q] ;c1
+  movq            m3,        [r3q + strideq*2] ;d1
+
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+
+  psllw           m0,        2 ; scale every result by 4
+  psllw           m1,        2
+  psllw           m2,        2
+  psllw           m3,        2
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  pxor            m4,             m4
+  pxor            m5,             m5
+  pcmpgtw         m4,             m0 ; m4 = 0xFFFF in lanes where m0 < 0 (sign mask)
+  pcmpgtw         m5,             m1
+  movq            m6,             m0
+  movq            m7,             m1
+  punpcklwd       m0,             m4 ; sign-extend low two words to 32 bits
+  punpcklwd       m1,             m5
+  punpckhwd       m6,             m4 ; sign-extend high two words to 32 bits
+  punpckhwd       m7,             m5
+  movq            [outputq],      m0 ; 16 bytes of output per row
+  movq            [outputq + 8],  m6
+  movq            [outputq + 16], m1
+  movq            [outputq + 24], m7
+  pxor            m4,             m4 ; same widening for rows 2 and 3
+  pxor            m5,             m5
+  pcmpgtw         m4,             m2
+  pcmpgtw         m5,             m3
+  movq            m6,             m2
+  movq            m7,             m3
+  punpcklwd       m2,             m4
+  punpcklwd       m3,             m5
+  punpckhwd       m6,             m4
+  punpckhwd       m7,             m5
+  movq            [outputq + 32], m2
+  movq            [outputq + 40], m6
+  movq            [outputq + 48], m3
+  movq            [outputq + 56], m7
+%else
+  movq            [outputq],      m0 ; 16-bit output: 8 bytes per row
+  movq            [outputq + 8],  m1
+  movq            [outputq + 16], m2
+  movq            [outputq + 24], m3
+%endif
+
+  RET
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
new file mode 100644
index 0000000000..fa37b6fed1
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -0,0 +1,2058 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride) {  // stride is in int16_t elements
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  __m128i mask;
+
+  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));  // 4 samples per row
+  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+
+  in[0] = _mm_slli_epi16(in[0], 4);  // pre-scale every sample by 16
+  in[1] = _mm_slli_epi16(in[1], 4);
+  in[2] = _mm_slli_epi16(in[2], 4);
+  in[3] = _mm_slli_epi16(in[3], 4);
+
+  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);  // lane 0 is -1 iff sample 0 is zero
+  in[0] = _mm_add_epi16(in[0], mask);  // lanes 1-7 are multiples of 16, so never match 1
+  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);  // net: sample 0 gets +1 only if nonzero
+}
+
+static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
+  const __m128i kOne = _mm_set1_epi16(1);
+  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);  // low halves of rows 0 and 1
+  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);  // low halves of rows 2 and 3
+  __m128i out01 = _mm_add_epi16(in01, kOne);
+  __m128i out23 = _mm_add_epi16(in23, kOne);
+  out01 = _mm_srai_epi16(out01, 2);  // (x + 1) >> 2
+  out23 = _mm_srai_epi16(out23, 2);
+  store_output(&out01, (output + 0 * 8));  // coefficients 0-7
+  store_output(&out23, (output + 1 * 8));  // coefficients 8-15
+}
+
+static INLINE void transpose_4x4(__m128i *res) {  // reads res[0..1], writes res[0..3]
+  // Combine and transpose
+  // 00 01 02 03 20 21 22 23
+  // 10 11 12 13 30 31 32 33
+  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+  // 00 10 01 11 02 12 03 13
+  // 20 30 21 31 22 32 23 33
+  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+
+  // 00 10 20 30 01 11 21 31
+  // 02 12 22 32 03 13 23 33
+  // only use the first 4 16-bit integers
+  res[1] = _mm_unpackhi_epi64(res[0], res[0]);  // move the upper column into its own register
+  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
+}
+
+static void fdct4_sse2(__m128i *in) {  // in-place on in[0..3]; ends with transpose_4x4()
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u[4], v[4];
+  u[0]=_mm_unpacklo_epi16(in[0], in[1]);  // pairs (in0, in1)
+  u[1]=_mm_unpacklo_epi16(in[3], in[2]);  // pairs (in3, in2): reversed to line up the butterfly
+
+  v[0] = _mm_add_epi16(u[0], u[1]);  // (in0 + in3, in1 + in2)
+  v[1] = _mm_sub_epi16(u[0], u[1]);  // (in0 - in3, in1 - in2)
+
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
+  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
+  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);  // add rounding, then shift below
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[1]);  // outputs 0 and 2, saturated back to 16 bits
+  in[1] = _mm_packs_epi32(u[2], u[3]);  // outputs 1 and 3
+  transpose_4x4(in);
+}
+
+static void fadst4_sse2(__m128i *in) {  // in-place on in[0..3]; ends with transpose_4x4()
+  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
+  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
+  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
+  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
+  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8];
+  __m128i in7 = _mm_add_epi16(in[0], in[1]);  // in0 + in1
+
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
+  u[2] = _mm_unpacklo_epi16(in7, kZero);  // zero partner: madd becomes a single product
+  u[3] = _mm_unpacklo_epi16(in[2], kZero);
+  u[4] = _mm_unpacklo_epi16(in[3], kZero);
+
+  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
+  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
+  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
+  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
+  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
+  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
+  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);  // in3 * sinpi_3_9
+
+  u[0] = _mm_add_epi32(v[0], v[1]);
+  u[1] = _mm_sub_epi32(v[2], v[6]);
+  u[2] = _mm_add_epi32(v[3], v[4]);
+  u[3] = _mm_sub_epi32(u[2], u[0]);
+  u[4] = _mm_slli_epi32(v[5], 2);  // 4 * s4
+  u[5] = _mm_sub_epi32(u[4], v[5]);  // 3 * s4
+  u[6] = _mm_add_epi32(u[3], u[5]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);  // add rounding, then shift below
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[2]);  // saturate back to 16 bits
+  in[1] = _mm_packs_epi32(u[1], u[3]);
+  transpose_4x4(in);
+}
+
+void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output,
+                     int stride, int tx_type) {  // tx_type picks the DCT/ADST combination
+  __m128i in[4];
+
+  switch (tx_type) {
+    case DCT_DCT:  // delegated to the vpx_dsp kernel
+      vpx_fdct4x4_sse2(input, output, stride);
+      break;
+    case ADST_DCT:  // ADST pass first, then DCT pass
+      load_buffer_4x4(input, in, stride);
+      fadst4_sse2(in);
+      fdct4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case DCT_ADST:  // DCT pass first, then ADST pass
+      load_buffer_4x4(input, in, stride);
+      fdct4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case ADST_ADST:  // ADST in both passes
+      load_buffer_4x4(input, in, stride);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+   default:  // unknown tx_type: output is left unwritten when asserts are compiled out
+     assert(0);
+     break;
+  }
+}
+
+void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
+                            int16_t* coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t* zbin_ptr,
+                            const int16_t* round_ptr, const int16_t* quant_ptr,
+                            const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+                            int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                            uint16_t* eob_ptr,
+                            const int16_t* scan_ptr,
+                            const int16_t* iscan_ptr) {
+  __m128i zero;
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // Load input (aligned loads: every row must be 16-byte aligned)
+  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  __m128i *in[8];  // pointers to in0..in7 so the quantizer can index rows
+  int index = 0;
+
+  (void)scan_ptr;  // these four parameters are not used by this implementation
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)coeff_ptr;
+
+  // Pre-condition input (shift by two)
+  in0 = _mm_slli_epi16(in0, 2);
+  in1 = _mm_slli_epi16(in1, 2);
+  in2 = _mm_slli_epi16(in2, 2);
+  in3 = _mm_slli_epi16(in3, 2);
+  in4 = _mm_slli_epi16(in4, 2);
+  in5 = _mm_slli_epi16(in5, 2);
+  in6 = _mm_slli_epi16(in6, 2);
+  in7 = _mm_slli_epi16(in7, 2);
+
+  in[0] = &in0;
+  in[1] = &in1;
+  in[2] = &in2;
+  in[3] = &in3;
+  in[4] = &in4;
+  in[5] = &in5;
+  in[6] = &in6;
+  in[7] = &in7;
+
+  // We do two passes, first the columns, then the rows. The results of the
+  // first pass are transposed so that the same column code can be reused. The
+  // results of the second pass are also transposed so that the rows (processed
+  // as columns) are put back in row positions.
+  for (pass = 0; pass < 2; pass++) {
+    // To store results of each pass before the transpose.
+    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+    // Add/subtract
+    const __m128i q0 = _mm_add_epi16(in0, in7);
+    const __m128i q1 = _mm_add_epi16(in1, in6);
+    const __m128i q2 = _mm_add_epi16(in2, in5);
+    const __m128i q3 = _mm_add_epi16(in3, in4);
+    const __m128i q4 = _mm_sub_epi16(in3, in4);
+    const __m128i q5 = _mm_sub_epi16(in2, in5);
+    const __m128i q6 = _mm_sub_epi16(in1, in6);
+    const __m128i q7 = _mm_sub_epi16(in0, in7);
+    // Work on first four results
+    {
+      // Add/subtract
+      const __m128i r0 = _mm_add_epi16(q0, q3);
+      const __m128i r1 = _mm_add_epi16(q1, q2);
+      const __m128i r2 = _mm_sub_epi16(q1, q2);
+      const __m128i r3 = _mm_sub_epi16(q0, q3);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+      res0 = _mm_packs_epi32(w0, w1);
+      res4 = _mm_packs_epi32(w2, w3);
+      res2 = _mm_packs_epi32(w4, w5);
+      res6 = _mm_packs_epi32(w6, w7);
+    }
+    // Work on next four results
+    {
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+      // dct_const_round_shift
+      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+      // Combine
+      const __m128i r0 = _mm_packs_epi32(s0, s1);
+      const __m128i r1 = _mm_packs_epi32(s2, s3);
+      // Add/subtract
+      const __m128i x0 = _mm_add_epi16(q4, r0);
+      const __m128i x1 = _mm_sub_epi16(q4, r0);
+      const __m128i x2 = _mm_sub_epi16(q7, r1);
+      const __m128i x3 = _mm_add_epi16(q7, r1);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+      res1 = _mm_packs_epi32(w0, w1);
+      res7 = _mm_packs_epi32(w2, w3);
+      res5 = _mm_packs_epi32(w4, w5);
+      res3 = _mm_packs_epi32(w6, w7);
+    }
+    // Transpose the 8x8.
+    {
+      // 00 01 02 03 04 05 06 07
+      // 10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27
+      // 30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47
+      // 50 51 52 53 54 55 56 57
+      // 60 61 62 63 64 65 66 67
+      // 70 71 72 73 74 75 76 77
+      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+      // 00 10 01 11 02 12 03 13
+      // 20 30 21 31 22 32 23 33
+      // 04 14 05 15 06 16 07 17
+      // 24 34 25 35 26 36 27 37
+      // 40 50 41 51 42 52 43 53
+      // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+      // 64 74 65 75 66 76 67 77
+      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+      // 00 10 20 30 01 11 21 31
+      // 40 50 60 70 41 51 61 71
+      // 02 12 22 32 03 13 23 33
+      // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+      // 06 16 26 36 07 17 27 37
+      // 46 56 66 76 47 57 67 77
+      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }
+  // Post-condition output and store it
+  {
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+    in0 = _mm_sub_epi16(in0, sign_in0);
+    in1 = _mm_sub_epi16(in1, sign_in1);
+    in2 = _mm_sub_epi16(in2, sign_in2);
+    in3 = _mm_sub_epi16(in3, sign_in3);
+    in4 = _mm_sub_epi16(in4, sign_in4);
+    in5 = _mm_sub_epi16(in5, sign_in5);
+    in6 = _mm_sub_epi16(in6, sign_in6);
+    in7 = _mm_sub_epi16(in7, sign_in7);
+    in0 = _mm_srai_epi16(in0, 1);
+    in1 = _mm_srai_epi16(in1, 1);
+    in2 = _mm_srai_epi16(in2, 1);
+    in3 = _mm_srai_epi16(in3, 1);
+    in4 = _mm_srai_epi16(in4, 1);
+    in5 = _mm_srai_epi16(in5, 1);
+    in6 = _mm_srai_epi16(in6, 1);
+    in7 = _mm_srai_epi16(in7, 1);
+  }
+
+  iscan_ptr += n_coeffs;  // point past the end; indexing below uses a negative offset
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;  // counts up towards 0 in steps of 16
+  zero = _mm_setzero_si128();
+
+  if (!skip_block) {
+    __m128i eob;
+    __m128i round, quant, dequant;
+    {
+      __m128i coeff0, coeff1;
+
+      // Setup global values
+      {
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        // Do DC and first 15 AC
+        coeff0 = *in[0];
+        coeff1 = *in[1];
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);  // copy upper 4 lanes down (presumably AC - confirm)
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);  // same broadcast; value is kept for the AC loop
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);  // same broadcast as round and quant
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // AC only loop
+    index = 2;
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+
+        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);  // in[index + 1] must exist
+        coeff0 = *in[index];
+        coeff1 = *in[index + 1];
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+      index += 2;
+    }
+
+    // Accumulate EOB
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);  // fold upper 64 bits onto lower
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);  // fold words 2-3 onto words 0-1
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);  // swap words 0 and 1
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);  // words 0 and 1 both hold the overall max
+    }
+  } else {
+    do {  // skip_block: zero both outputs, 16 coefficients per iteration
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}
+
+// load 8x8 array and pre-scale every sample by 4 (<< 2)
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+                                   int stride) {  // stride is in int16_t elements
+  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));  // aligned load: 16-byte rows
+  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+  in[0] = _mm_slli_epi16(in[0], 2);
+  in[1] = _mm_slli_epi16(in[1], 2);
+  in[2] = _mm_slli_epi16(in[2], 2);
+  in[3] = _mm_slli_epi16(in[3], 2);
+  in[4] = _mm_slli_epi16(in[4], 2);
+  in[5] = _mm_slli_epi16(in[5], 2);
+  in[6] = _mm_slli_epi16(in[6], 2);
+  in[7] = _mm_slli_epi16(in[7], 2);
+}
+
+// Rounded arithmetic right shift of the eight rows res[0..7], in place.
+// bit == 1 shifts each 16-bit lane right by one; any other value shifts by
+// two, and bit == 2 additionally adds 1 beforehand. In every case lanes that
+// were negative on entry receive an extra +1 before the shift.
+static INLINE void right_shift_8x8(__m128i *res, const int bit) {
+  const __m128i one = _mm_set1_epi16(1);
+  int k;
+
+  for (k = 0; k < 8; ++k) {
+    // All-ones (-1) in lanes holding a negative value, zero elsewhere.
+    const __m128i neg_mask = _mm_srai_epi16(res[k], 15);
+    __m128i row = res[k];
+
+    if (bit == 2) row = _mm_add_epi16(row, one);
+
+    // Subtracting the -1 mask adds one to the negative lanes.
+    row = _mm_sub_epi16(row, neg_mask);
+
+    // Keep the shift count a literal in each arm.
+    if (bit == 1) {
+      res[k] = _mm_srai_epi16(row, 1);
+    } else {
+      res[k] = _mm_srai_epi16(row, 2);
+    }
+  }
+}
+
+// Store the eight rows res[0..7] through store_output(), placing consecutive
+// rows `stride` elements apart in output.
+static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
+                                    int stride) {
+  int row;
+
+  for (row = 0; row < 8; ++row) {
+    store_output(&res[row], (output + row * stride));
+  }
+}
+
+// Transpose an 8x8 matrix of 16-bit values held as eight row vectors.
+// Every input row is consumed by the first interleave step before any output
+// row is written, so `res` may be the same array as `in`.
+// In the layout comments "rc" means the element from row r, column c.
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+  // Step 1: interleave 16-bit lanes of neighbouring row pairs.
+  const __m128i lo01 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i hi01 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i lo23 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i hi23 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i lo45 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i hi45 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i lo67 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i hi67 = _mm_unpackhi_epi16(in[6], in[7]);
+  // lo01: 00 10 01 11 02 12 03 13    hi01: 04 14 05 15 06 16 07 17
+  // lo23: 20 30 21 31 22 32 23 33    hi23: 24 34 25 35 26 36 27 37
+  // lo45: 40 50 41 51 42 52 43 53    hi45: 44 54 45 55 46 56 47 57
+  // lo67: 60 70 61 71 62 72 63 73    hi67: 64 74 65 75 66 76 67 77
+
+  // Step 2: interleave 32-bit pairs; each vector now carries two columns
+  // for either the top four rows (_top) or the bottom four rows (_bot).
+  const __m128i c01_top = _mm_unpacklo_epi32(lo01, lo23);
+  const __m128i c01_bot = _mm_unpacklo_epi32(lo45, lo67);
+  const __m128i c23_top = _mm_unpackhi_epi32(lo01, lo23);
+  const __m128i c23_bot = _mm_unpackhi_epi32(lo45, lo67);
+  const __m128i c45_top = _mm_unpacklo_epi32(hi01, hi23);
+  const __m128i c45_bot = _mm_unpacklo_epi32(hi45, hi67);
+  const __m128i c67_top = _mm_unpackhi_epi32(hi01, hi23);
+  const __m128i c67_bot = _mm_unpackhi_epi32(hi45, hi67);
+  // c01_top: 00 10 20 30 01 11 21 31    c01_bot: 40 50 60 70 41 51 61 71
+  // c23_top: 02 12 22 32 03 13 23 33    c23_bot: 42 52 62 72 43 53 63 73
+  // c45_top: 04 14 24 34 05 15 25 35    c45_bot: 44 54 64 74 45 55 65 75
+  // c67_top: 06 16 26 36 07 17 27 37    c67_bot: 46 56 66 76 47 57 67 77
+
+  // Step 3: join the 64-bit halves so that res[c] holds source column c.
+  res[0] = _mm_unpacklo_epi64(c01_top, c01_bot);  // 00 10 20 30 40 50 60 70
+  res[1] = _mm_unpackhi_epi64(c01_top, c01_bot);  // 01 11 21 31 41 51 61 71
+  res[2] = _mm_unpacklo_epi64(c23_top, c23_bot);  // 02 12 22 32 42 52 62 72
+  res[3] = _mm_unpackhi_epi64(c23_top, c23_bot);  // 03 13 23 33 43 53 63 73
+  res[4] = _mm_unpacklo_epi64(c45_top, c45_bot);  // 04 14 24 34 44 54 64 74
+  res[5] = _mm_unpackhi_epi64(c45_top, c45_bot);  // 05 15 25 35 45 55 65 75
+  res[6] = _mm_unpacklo_epi64(c67_top, c67_bot);  // 06 16 26 36 46 56 66 76
+  res[7] = _mm_unpackhi_epi64(c67_top, c67_bot);  // 07 17 27 37 47 57 67 77
+}
+
+// One 1-D pass of an 8-point forward DCT (per the function name) over the
+// eight row vectors in[0..7]; each of the eight 16-bit lanes is processed
+// independently. The results overwrite in[] and are then transposed in place,
+// so a second call operates along the other dimension.
+//
+// Every multiply step follows the same pattern: interleave two 16-bit inputs,
+// _mm_madd_epi16 against a constant pair to get 32-bit sums, add
+// DCT_CONST_ROUNDING, arithmetic-shift right by DCT_CONST_BITS, and narrow
+// back to 16 bits with signed saturation via _mm_packs_epi32.
+//
+// NOTE(review): pair_set_epi16(a, b) presumably alternates a and b across the
+// 16-bit lanes so that madd on interleaved (x, y) yields a*x + b*y -- confirm
+// against its definition.
+static void fdct8_sse2(__m128i *in) {
+  // constants
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // stage 1
+  // s0..s3 are sums of mirrored rows, s4..s7 the matching differences.
+  s0 = _mm_add_epi16(in[0], in[7]);
+  s1 = _mm_add_epi16(in[1], in[6]);
+  s2 = _mm_add_epi16(in[2], in[5]);
+  s3 = _mm_add_epi16(in[3], in[4]);
+  s4 = _mm_sub_epi16(in[3], in[4]);
+  s5 = _mm_sub_epi16(in[2], in[5]);
+  s6 = _mm_sub_epi16(in[1], in[6]);
+  s7 = _mm_sub_epi16(in[0], in[7]);
+
+  // Second butterfly on the sums only; it feeds in[0], in[2], in[4], in[6].
+  u0 = _mm_add_epi16(s0, s3);
+  u1 = _mm_add_epi16(s1, s2);
+  u2 = _mm_sub_epi16(s1, s2);
+  u3 = _mm_sub_epi16(s0, s3);
+  // interleave and perform butterfly multiplication/addition
+  v0 = _mm_unpacklo_epi16(u0, u1);
+  v1 = _mm_unpackhi_epi16(u0, u1);
+  v2 = _mm_unpacklo_epi16(u2, u3);
+  v3 = _mm_unpackhi_epi16(u2, u3);
+
+  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
+  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
+  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
+  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
+  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
+  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
+  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
+  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
+
+  // shift and rounding
+  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // Even-indexed rows are final here. Writing into in[] is safe because the
+  // remaining stages read only s4..s7, captured during stage 1.
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[4] = _mm_packs_epi32(u2, u3);
+  in[6] = _mm_packs_epi32(u6, u7);
+
+  // stage 2
+  // interleave and perform butterfly multiplication/addition
+  u0 = _mm_unpacklo_epi16(s6, s5);
+  u1 = _mm_unpackhi_epi16(s6, s5);
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+
+  // shift and rounding
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+
+  u0 = _mm_packs_epi32(v0, v1);
+  u1 = _mm_packs_epi32(v2, v3);
+
+  // stage 3
+  // Combine the untouched differences s4/s7 with the rotated s5/s6 terms.
+  s0 = _mm_add_epi16(s4, u0);
+  s1 = _mm_sub_epi16(s4, u0);
+  s2 = _mm_sub_epi16(s7, u1);
+  s3 = _mm_add_epi16(s7, u1);
+
+  // stage 4
+  u0 = _mm_unpacklo_epi16(s0, s3);
+  u1 = _mm_unpackhi_epi16(s0, s3);
+  u2 = _mm_unpacklo_epi16(s1, s2);
+  u3 = _mm_unpackhi_epi16(s1, s2);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
+  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
+  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
+  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
+  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
+  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
+  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
+  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
+
+  // shift and rounding
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  // Odd-indexed rows; note the v-pairs are not stored in numeric order.
+  in[1] = _mm_packs_epi32(v0, v1);
+  in[3] = _mm_packs_epi32(v4, v5);
+  in[5] = _mm_packs_epi32(v2, v3);
+  in[7] = _mm_packs_epi32(v6, v7);
+
+  // transpose
+  array_transpose_8x8(in, in);
+}
+
+// One 1-D pass of an 8-point forward ADST (per the function name) over the
+// eight row vectors in[0..7]; each of the eight 16-bit lanes is processed
+// independently. The results overwrite in[] and are then transposed in place.
+//
+// Each stage interleaves 16-bit inputs, multiplies/accumulates them against
+// constant pairs with _mm_madd_epi16, combines the 32-bit products, adds
+// DCT_CONST_ROUNDING, arithmetic-shifts right by DCT_CONST_BITS, and narrows
+// back to 16 bits with signed saturation via _mm_packs_epi32.
+//
+// NOTE(review): pair_set_epi16(a, b) presumably alternates a and b across the
+// 16-bit lanes so that madd on interleaved (x, y) yields a*x + b*y -- confirm
+// against its definition.
+static void fadst8_sse2(__m128i *in) {
+  // Constants
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__const_0 = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // properly aligned for butterfly input
+  // (rows are permuted into local copies, so in[] can be overwritten below)
+  in0  = in[7];
+  in1  = in[0];
+  in2  = in[5];
+  in3  = in[2];
+  in4  = in[3];
+  in5  = in[4];
+  in6  = in[1];
+  in7  = in[6];
+
+  // column transformation
+  // stage 1
+  // interleave and multiply/add into 32-bit integer
+  s0 = _mm_unpacklo_epi16(in0, in1);
+  s1 = _mm_unpackhi_epi16(in0, in1);
+  s2 = _mm_unpacklo_epi16(in2, in3);
+  s3 = _mm_unpackhi_epi16(in2, in3);
+  s4 = _mm_unpacklo_epi16(in4, in5);
+  s5 = _mm_unpackhi_epi16(in4, in5);
+  s6 = _mm_unpacklo_epi16(in6, in7);
+  s7 = _mm_unpackhi_epi16(in6, in7);
+
+  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+  // addition
+  // (sums and differences are formed on the 32-bit products, before rounding)
+  w0 = _mm_add_epi32(u0, u8);
+  w1 = _mm_add_epi32(u1, u9);
+  w2 = _mm_add_epi32(u2, u10);
+  w3 = _mm_add_epi32(u3, u11);
+  w4 = _mm_add_epi32(u4, u12);
+  w5 = _mm_add_epi32(u5, u13);
+  w6 = _mm_add_epi32(u6, u14);
+  w7 = _mm_add_epi32(u7, u15);
+  w8 = _mm_sub_epi32(u0, u8);
+  w9 = _mm_sub_epi32(u1, u9);
+  w10 = _mm_sub_epi32(u2, u10);
+  w11 = _mm_sub_epi32(u3, u11);
+  w12 = _mm_sub_epi32(u4, u12);
+  w13 = _mm_sub_epi32(u5, u13);
+  w14 = _mm_sub_epi32(u6, u14);
+  w15 = _mm_sub_epi32(u7, u15);
+
+  // shift and rounding
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+  // back to 16-bit and pack 8 integers into __m128i
+  // (in[] is reused as scratch for the stage-1 results from here on)
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[1] = _mm_packs_epi32(u2, u3);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[3] = _mm_packs_epi32(u6, u7);
+  in[4] = _mm_packs_epi32(u8, u9);
+  in[5] = _mm_packs_epi32(u10, u11);
+  in[6] = _mm_packs_epi32(u12, u13);
+  in[7] = _mm_packs_epi32(u14, u15);
+
+  // stage 2
+  // in[0..3] need only 16-bit adds/subs; in[4..7] go through another rotation.
+  s0 = _mm_add_epi16(in[0], in[2]);
+  s1 = _mm_add_epi16(in[1], in[3]);
+  s2 = _mm_sub_epi16(in[0], in[2]);
+  s3 = _mm_sub_epi16(in[1], in[3]);
+  u0 = _mm_unpacklo_epi16(in[4], in[5]);
+  u1 = _mm_unpackhi_epi16(in[4], in[5]);
+  u2 = _mm_unpacklo_epi16(in[6], in[7]);
+  u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+  w0 = _mm_add_epi32(v0, v4);
+  w1 = _mm_add_epi32(v1, v5);
+  w2 = _mm_add_epi32(v2, v6);
+  w3 = _mm_add_epi32(v3, v7);
+  w4 = _mm_sub_epi32(v0, v4);
+  w5 = _mm_sub_epi32(v1, v5);
+  w6 = _mm_sub_epi32(v2, v6);
+  w7 = _mm_sub_epi32(v3, v7);
+
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+  s4 = _mm_packs_epi32(u0, u1);
+  s5 = _mm_packs_epi32(u2, u3);
+  s6 = _mm_packs_epi32(u4, u5);
+  s7 = _mm_packs_epi32(u6, u7);
+
+  // stage 3
+  // Only the pairs (s2, s3) and (s6, s7) are rotated; s0, s1, s4 and s5 pass
+  // through to the output unchanged.
+  u0 = _mm_unpacklo_epi16(s2, s3);
+  u1 = _mm_unpackhi_epi16(s2, s3);
+  u2 = _mm_unpacklo_epi16(s6, s7);
+  u3 = _mm_unpackhi_epi16(s6, s7);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  s2 = _mm_packs_epi32(v0, v1);
+  s3 = _mm_packs_epi32(v2, v3);
+  s6 = _mm_packs_epi32(v4, v5);
+  s7 = _mm_packs_epi32(v6, v7);
+
+  // Final output permutation; odd-indexed rows are negated (0 - x).
+  // FIXME(jingning): do subtract using bit inversion?
+  in[0] = s0;
+  in[1] = _mm_sub_epi16(k__const_0, s4);
+  in[2] = s6;
+  in[3] = _mm_sub_epi16(k__const_0, s2);
+  in[4] = s3;
+  in[5] = _mm_sub_epi16(k__const_0, s7);
+  in[6] = s5;
+  in[7] = _mm_sub_epi16(k__const_0, s1);
+
+  // transpose
+  array_transpose_8x8(in, in);
+}
+
+// Forward 8x8 hybrid transform dispatcher. DCT_DCT is handed straight to
+// vpx_fdct8x8_sse2. The three ADST combinations load the block, run two 1-D
+// passes (each pass ends with a transpose), apply right_shift_8x8(in, 1) and
+// store the result with an output stride of 8. Any other tx_type asserts and
+// touches neither input nor output.
+void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output,
+                     int stride, int tx_type) {
+  __m128i in[8];
+
+  if (tx_type == DCT_DCT) {
+    vpx_fdct8x8_sse2(input, output, stride);
+    return;
+  }
+
+  if (tx_type != ADST_DCT && tx_type != DCT_ADST && tx_type != ADST_ADST) {
+    assert(0);
+    return;
+  }
+
+  load_buffer_8x8(input, in, stride);
+
+  // First pass: only DCT_ADST starts with the DCT.
+  if (tx_type == DCT_ADST) {
+    fdct8_sse2(in);
+  } else {
+    fadst8_sse2(in);
+  }
+
+  // Second pass: only ADST_DCT finishes with the DCT.
+  if (tx_type == ADST_DCT) {
+    fdct8_sse2(in);
+  } else {
+    fadst8_sse2(in);
+  }
+
+  right_shift_8x8(in, 1);
+  write_buffer_8x8(output, in, 8);
+}
+
+// Load a 16x16 block of int16 samples as two 16-row halves using
+// load_buffer_8x8: in0[0..15] receives columns 0-7 and in1[0..15] receives
+// columns 8-15.
+static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
+                                     __m128i *in1, int stride) {
+  const int16_t *const left = input;
+  const int16_t *const right = input + 8;
+  const int lower = 8 * stride;  // element offset of row 8
+
+  // columns 0..7: rows 0..7, then rows 8..15
+  load_buffer_8x8(left, in0, stride);
+  load_buffer_8x8(left + lower, in0 + 8, stride);
+
+  // columns 8..15: rows 0..7, then rows 8..15
+  load_buffer_8x8(right, in1, stride);
+  load_buffer_8x8(right + lower, in1 + 8, stride);
+}
+
+// Store a 16x16 block held as two 16-row halves using write_buffer_8x8:
+// in0[0..15] goes to columns 0-7 of output and in1[0..15] to columns 8-15.
+static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
+                                      __m128i *in1, int stride) {
+  tran_low_t *const left = output;
+  tran_low_t *const right = output + 8;
+  const int lower = 8 * stride;  // element offset of row 8
+
+  // columns 0..7: rows 0..7, then rows 8..15
+  write_buffer_8x8(left, in0, stride);
+  write_buffer_8x8(left + lower, in0 + 8, stride);
+
+  // columns 8..15: rows 0..7, then rows 8..15
+  write_buffer_8x8(right, in1, stride);
+  write_buffer_8x8(right + lower, in1 + 8, stride);
+}
+
+// Transpose a 16x16 matrix stored as two 16-row halves (res0 and res1, eight
+// 16-bit columns each). Every 8x8 quadrant is transposed, and the two
+// off-diagonal quadrants (res1[0..7] and res0[8..15]) trade places.
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+  __m128i held[8];  // transposed res1[0..7], parked until its slot is free
+  int k;
+
+  array_transpose_8x8(res0, res0);
+  array_transpose_8x8(res1, held);
+  // res1[0..7] has been consumed, so it can take the transposed res0[8..15].
+  array_transpose_8x8(res0 + 8, res1);
+  array_transpose_8x8(res1 + 8, res1 + 8);
+
+  for (k = 0; k < 8; ++k) {
+    res0[8 + k] = held[k];
+  }
+}
+
+// Apply right_shift_8x8(..., 2) to all four 8-row groups of a 16x16 block
+// stored as two 16-row halves.
+static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
+  __m128i *half = res0;
+  int pass;
+
+  // pass 0 handles res0, pass 1 handles res1; rows 0-7 first, then rows 8-15
+  for (pass = 0; pass < 2; ++pass) {
+    right_shift_8x8(half, 2);
+    right_shift_8x8(half + 8, 2);
+    half = res1;
+  }
+}
+
+static void fdct16_8col(__m128i *in) {
+  // perform 16x16 1-D DCT for 8 columns
+  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  // stage 1
+  i[0] = _mm_add_epi16(in[0], in[15]);
+  i[1] = _mm_add_epi16(in[1], in[14]);
+  i[2] = _mm_add_epi16(in[2], in[13]);
+  i[3] = _mm_add_epi16(in[3], in[12]);
+  i[4] = _mm_add_epi16(in[4], in[11]);
+  i[5] = _mm_add_epi16(in[5], in[10]);
+  i[6] = _mm_add_epi16(in[6], in[9]);
+  i[7] = _mm_add_epi16(in[7], in[8]);
+
+  s[0] = _mm_sub_epi16(in[7], in[8]);
+  s[1] = _mm_sub_epi16(in[6], in[9]);
+  s[2] = _mm_sub_epi16(in[5], in[10]);
+  s[3] = _mm_sub_epi16(in[4], in[11]);
+  s[4] = _mm_sub_epi16(in[3], in[12]);
+  s[5] = _mm_sub_epi16(in[2], in[13]);
+  s[6] = _mm_sub_epi16(in[1], in[14]);
+  s[7] = _mm_sub_epi16(in[0], in[15]);
+
+  p[0] = _mm_add_epi16(i[0], i[7]);
+  p[1] = _mm_add_epi16(i[1], i[6]);
+  p[2] = _mm_add_epi16(i[2], i[5]);
+  p[3] = _mm_add_epi16(i[3], i[4]);
+  p[4] = _mm_sub_epi16(i[3], i[4]);
+  p[5] = _mm_sub_epi16(i[2], i[5]);
+  p[6] = _mm_sub_epi16(i[1], i[6]);
+  p[7] = _mm_sub_epi16(i[0], i[7]);
+
+  u[0] = _mm_add_epi16(p[0], p[3]);
+  u[1] = _mm_add_epi16(p[1], p[2]);
+  u[2] = _mm_sub_epi16(p[1], p[2]);
+  u[3] = _mm_sub_epi16(p[0], p[3]);
+
+  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
+  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
+  v[3] = _mm_unpackhi_epi16(u[2], u[3]);
+
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
+  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
+  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
+  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
+  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[4] = _mm_packs_epi32(u[4], u[5]);
+  in[8] = _mm_packs_epi32(u[2], u[3]);
+  in[12] = _mm_packs_epi32(u[6], u[7]);
+
+  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+  u[0] = _mm_packs_epi32(v[0], v[1]);
+  u[1] = _mm_packs_epi32(v[2], v[3]);
+
+  t[0] = _mm_add_epi16(p[4], u[0]);
+  t[1] = _mm_sub_epi16(p[4], u[0]);
+  t[2] = _mm_sub_epi16(p[7], u[1]);
+  t[3] = _mm_add_epi16(p[7], u[1]);
+
+  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
+  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
+  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
+  u[3] = _mm_unpackhi_epi16(t[1], t[2]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  in[2] = _mm_packs_epi32(v[0], v[1]);
+  in[6] = _mm_packs_epi32(v[4], v[5]);
+  in[10] = _mm_packs_epi32(v[2], v[3]);
+  in[14] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[3] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[2] = _mm_packs_epi32(v[0], v[1]);
+  t[3] = _mm_packs_epi32(v[2], v[3]);
+  t[4] = _mm_packs_epi32(v[4], v[5]);
+  t[5] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 3
+  p[0] = _mm_add_epi16(s[0], t[3]);
+  p[1] = _mm_add_epi16(s[1], t[2]);
+  p[2] = _mm_sub_epi16(s[1], t[2]);
+  p[3] = _mm_sub_epi16(s[0], t[3]);
+  p[4] = _mm_sub_epi16(s[7], t[4]);
+  p[5] = _mm_sub_epi16(s[6], t[5]);
+  p[6] = _mm_add_epi16(s[6], t[5]);
+  p[7] = _mm_add_epi16(s[7], t[4]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
+  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
+  u[3] = _mm_unpackhi_epi16(p[2], p[5]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[1] = _mm_packs_epi32(v[0], v[1]);
+  t[2] = _mm_packs_epi32(v[2], v[3]);
+  t[5] = _mm_packs_epi32(v[4], v[5]);
+  t[6] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 5
+  s[0] = _mm_add_epi16(p[0], t[1]);
+  s[1] = _mm_sub_epi16(p[0], t[1]);
+  s[2] = _mm_add_epi16(p[3], t[2]);
+  s[3] = _mm_sub_epi16(p[3], t[2]);
+  s[4] = _mm_sub_epi16(p[4], t[5]);
+  s[5] = _mm_add_epi16(p[4], t[5]);
+  s[6] = _mm_sub_epi16(p[7], t[6]);
+  s[7] = _mm_add_epi16(p[7], t[6]);
+
+  // stage 6
+  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
+  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
+  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
+  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
+  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[7] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
+  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
+  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
+  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
+  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
+  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
+  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
+  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
+  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
+  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
+  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  in[1]  = _mm_packs_epi32(v[0], v[1]);
+  in[9]  = _mm_packs_epi32(v[2], v[3]);
+  in[5]  = _mm_packs_epi32(v[4], v[5]);
+  in[13] = _mm_packs_epi32(v[6], v[7]);
+  in[3]  = _mm_packs_epi32(v[8], v[9]);
+  in[11] = _mm_packs_epi32(v[10], v[11]);
+  in[7]  = _mm_packs_epi32(v[12], v[13]);
+  in[15] = _mm_packs_epi32(v[14], v[15]);
+}
+
+static void fadst16_8col(__m128i *in) {
+  // 16-point 1-D ADST on 8 columns, in place: reads then overwrites in[0..15]
+  __m128i s[16], x[16], u[32], v[32];  // s, x: stage values; u, v: scratch
+  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kZero = _mm_set1_epi16(0);  // used to negate output vectors
+  // stage 1: pair in[15-2k] with in[2k]; each madd yields a 32-bit 2-term sum
+  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+  // butterfly in 32 bits: sums -> u[0..15], differences -> u[16..31]
+  u[0] = _mm_add_epi32(v[0], v[16]);
+  u[1] = _mm_add_epi32(v[1], v[17]);
+  u[2] = _mm_add_epi32(v[2], v[18]);
+  u[3] = _mm_add_epi32(v[3], v[19]);
+  u[4] = _mm_add_epi32(v[4], v[20]);
+  u[5] = _mm_add_epi32(v[5], v[21]);
+  u[6] = _mm_add_epi32(v[6], v[22]);
+  u[7] = _mm_add_epi32(v[7], v[23]);
+  u[8] = _mm_add_epi32(v[8], v[24]);
+  u[9] = _mm_add_epi32(v[9], v[25]);
+  u[10] = _mm_add_epi32(v[10], v[26]);
+  u[11] = _mm_add_epi32(v[11], v[27]);
+  u[12] = _mm_add_epi32(v[12], v[28]);
+  u[13] = _mm_add_epi32(v[13], v[29]);
+  u[14] = _mm_add_epi32(v[14], v[30]);
+  u[15] = _mm_add_epi32(v[15], v[31]);
+  u[16] = _mm_sub_epi32(v[0], v[16]);
+  u[17] = _mm_sub_epi32(v[1], v[17]);
+  u[18] = _mm_sub_epi32(v[2], v[18]);
+  u[19] = _mm_sub_epi32(v[3], v[19]);
+  u[20] = _mm_sub_epi32(v[4], v[20]);
+  u[21] = _mm_sub_epi32(v[5], v[21]);
+  u[22] = _mm_sub_epi32(v[6], v[22]);
+  u[23] = _mm_sub_epi32(v[7], v[23]);
+  u[24] = _mm_sub_epi32(v[8], v[24]);
+  u[25] = _mm_sub_epi32(v[9], v[25]);
+  u[26] = _mm_sub_epi32(v[10], v[26]);
+  u[27] = _mm_sub_epi32(v[11], v[27]);
+  u[28] = _mm_sub_epi32(v[12], v[28]);
+  u[29] = _mm_sub_epi32(v[13], v[29]);
+  u[30] = _mm_sub_epi32(v[14], v[30]);
+  u[31] = _mm_sub_epi32(v[15], v[31]);
+  // round: add DCT_CONST_ROUNDING, then arithmetic shift by DCT_CONST_BITS
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+  // narrow back to 16-bit lanes (packs saturates to the int16 range)
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_packs_epi32(u[8], u[9]);
+  s[5] = _mm_packs_epi32(u[10], u[11]);
+  s[6] = _mm_packs_epi32(u[12], u[13]);
+  s[7] = _mm_packs_epi32(u[14], u[15]);
+  s[8] = _mm_packs_epi32(u[16], u[17]);
+  s[9] = _mm_packs_epi32(u[18], u[19]);
+  s[10] = _mm_packs_epi32(u[20], u[21]);
+  s[11] = _mm_packs_epi32(u[22], u[23]);
+  s[12] = _mm_packs_epi32(u[24], u[25]);
+  s[13] = _mm_packs_epi32(u[26], u[27]);
+  s[14] = _mm_packs_epi32(u[28], u[29]);
+  s[15] = _mm_packs_epi32(u[30], u[31]);
+
+  // stage 2: s[8..15] take the cospi_4/28 and cospi_12/20 multiplies
+  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+  // 32-bit butterfly: sums -> u[0..7], differences -> u[8..15]
+  u[0] = _mm_add_epi32(v[0], v[8]);
+  u[1] = _mm_add_epi32(v[1], v[9]);
+  u[2] = _mm_add_epi32(v[2], v[10]);
+  u[3] = _mm_add_epi32(v[3], v[11]);
+  u[4] = _mm_add_epi32(v[4], v[12]);
+  u[5] = _mm_add_epi32(v[5], v[13]);
+  u[6] = _mm_add_epi32(v[6], v[14]);
+  u[7] = _mm_add_epi32(v[7], v[15]);
+  u[8] = _mm_sub_epi32(v[0], v[8]);
+  u[9] = _mm_sub_epi32(v[1], v[9]);
+  u[10] = _mm_sub_epi32(v[2], v[10]);
+  u[11] = _mm_sub_epi32(v[3], v[11]);
+  u[12] = _mm_sub_epi32(v[4], v[12]);
+  u[13] = _mm_sub_epi32(v[5], v[13]);
+  u[14] = _mm_sub_epi32(v[6], v[14]);
+  u[15] = _mm_sub_epi32(v[7], v[15]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  // x[0..7]: 16-bit add/sub of s[0..7]; x[8..15]: packed stage-2 products
+  x[0] = _mm_add_epi16(s[0], s[4]);
+  x[1] = _mm_add_epi16(s[1], s[5]);
+  x[2] = _mm_add_epi16(s[2], s[6]);
+  x[3] = _mm_add_epi16(s[3], s[7]);
+  x[4] = _mm_sub_epi16(s[0], s[4]);
+  x[5] = _mm_sub_epi16(s[1], s[5]);
+  x[6] = _mm_sub_epi16(s[2], s[6]);
+  x[7] = _mm_sub_epi16(s[3], s[7]);
+  x[8] = _mm_packs_epi32(u[0], u[1]);
+  x[9] = _mm_packs_epi32(u[2], u[3]);
+  x[10] = _mm_packs_epi32(u[4], u[5]);
+  x[11] = _mm_packs_epi32(u[6], u[7]);
+  x[12] = _mm_packs_epi32(u[8], u[9]);
+  x[13] = _mm_packs_epi32(u[10], u[11]);
+  x[14] = _mm_packs_epi32(u[12], u[13]);
+  x[15] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3: x[4..7] and x[12..15] take the cospi_8/cospi_24 multiplies
+  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+  // 32-bit add/sub within each group of eight products (v[0..7], v[8..15])
+  u[0] = _mm_add_epi32(v[0], v[4]);
+  u[1] = _mm_add_epi32(v[1], v[5]);
+  u[2] = _mm_add_epi32(v[2], v[6]);
+  u[3] = _mm_add_epi32(v[3], v[7]);
+  u[4] = _mm_sub_epi32(v[0], v[4]);
+  u[5] = _mm_sub_epi32(v[1], v[5]);
+  u[6] = _mm_sub_epi32(v[2], v[6]);
+  u[7] = _mm_sub_epi32(v[3], v[7]);
+  u[8] = _mm_add_epi32(v[8], v[12]);
+  u[9] = _mm_add_epi32(v[9], v[13]);
+  u[10] = _mm_add_epi32(v[10], v[14]);
+  u[11] = _mm_add_epi32(v[11], v[15]);
+  u[12] = _mm_sub_epi32(v[8], v[12]);
+  u[13] = _mm_sub_epi32(v[9], v[13]);
+  u[14] = _mm_sub_epi32(v[10], v[14]);
+  u[15] = _mm_sub_epi32(v[11], v[15]);
+
+  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+  // s[0..3], s[8..11]: 16-bit add/sub of x; s[4..7], s[12..15]: packed products
+  s[0] = _mm_add_epi16(x[0], x[2]);
+  s[1] = _mm_add_epi16(x[1], x[3]);
+  s[2] = _mm_sub_epi16(x[0], x[2]);
+  s[3] = _mm_sub_epi16(x[1], x[3]);
+  s[4] = _mm_packs_epi32(v[0], v[1]);
+  s[5] = _mm_packs_epi32(v[2], v[3]);
+  s[6] = _mm_packs_epi32(v[4], v[5]);
+  s[7] = _mm_packs_epi32(v[6], v[7]);
+  s[8] = _mm_add_epi16(x[8], x[10]);
+  s[9] = _mm_add_epi16(x[9], x[11]);
+  s[10] = _mm_sub_epi16(x[8], x[10]);
+  s[11] = _mm_sub_epi16(x[9], x[11]);
+  s[12] = _mm_packs_epi32(v[8], v[9]);
+  s[13] = _mm_packs_epi32(v[10], v[11]);
+  s[14] = _mm_packs_epi32(v[12], v[13]);
+  s[15] = _mm_packs_epi32(v[14], v[15]);
+
+  // stage 4: s[2],s[3], s[6],s[7], s[10],s[11], s[14],s[15] take +/-cospi_16
+  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+  // store outputs; kZero - s[i] negates in[1], in[3], in[13] and in[15]
+  in[0] = s[0];
+  in[1] = _mm_sub_epi16(kZero, s[8]);
+  in[2] = s[12];
+  in[3] = _mm_sub_epi16(kZero, s[4]);
+  in[4] = _mm_packs_epi32(v[4], v[5]);
+  in[5] = _mm_packs_epi32(v[12], v[13]);
+  in[6] = _mm_packs_epi32(v[8], v[9]);
+  in[7] = _mm_packs_epi32(v[0], v[1]);
+  in[8] = _mm_packs_epi32(v[2], v[3]);
+  in[9] = _mm_packs_epi32(v[10], v[11]);
+  in[10] = _mm_packs_epi32(v[14], v[15]);
+  in[11] = _mm_packs_epi32(v[6], v[7]);
+  in[12] = s[5];
+  in[13] = _mm_sub_epi16(kZero, s[13]);
+  in[14] = s[9];
+  in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+static void fdct16_sse2(__m128i *in0, __m128i *in1) {
+  fdct16_8col(in0);  // 8-column pass over in0; results land back in in0
+  fdct16_8col(in1);  // same pass over in1
+  array_transpose_16x16(in0, in1);  // transpose helper, defined elsewhere
+}
+
+static void fadst16_sse2(__m128i *in0, __m128i *in1) {
+  fadst16_8col(in0);  // 1-D ADST on the 8 columns held in in0, in place
+  fadst16_8col(in1);  // same pass over in1
+  array_transpose_16x16(in0, in1);  // transpose helper, defined elsewhere
+}
+
+void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output,
+                       int stride, int tx_type) {
+  // Forward 16x16 hybrid transform: tx_type picks DCT or ADST for each pass.
+  __m128i in0[16], in1[16];
+  int first_is_adst, second_is_adst;
+
+  // The pure-DCT case is handed to vpx_fdct16x16_sse2 untouched.
+  if (tx_type == DCT_DCT) {
+    vpx_fdct16x16_sse2(input, output, stride);
+    return;
+  }
+  // Any value other than the three hybrid types is unexpected.
+  if (tx_type != ADST_DCT && tx_type != DCT_ADST && tx_type != ADST_ADST) {
+    assert(0);
+    return;
+  }
+  first_is_adst = (tx_type != DCT_ADST);
+  second_is_adst = (tx_type != ADST_DCT);
+
+  load_buffer_16x16(input, in0, in1, stride);
+  if (first_is_adst) {
+    fadst16_sse2(in0, in1);
+  } else {
+    fdct16_sse2(in0, in1);
+  }
+  right_shift_16x16(in0, in1);
+  if (second_is_adst) {
+    fadst16_sse2(in0, in1);
+  } else {
+    fdct16_sse2(in0, in1);
+  }
+  // The trailing 16 is presumably the output stride; confirm in the helper.
+  write_buffer_16x16(output, in0, in1, 16);
+}
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
new file mode 100644
index 0000000000..b09eac0d1a
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -0,0 +1,472 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#if defined(_MSC_VER) && _MSC_VER <= 1500
+// Need to include math.h before calling tmmintrin.h/intrin.h
+// in certain versions of MSVS.
+#include <math.h>
+#endif
+#include <tmmintrin.h>  // SSSE3
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+// 8x8 forward DCT fused with quantization; writes qcoeff/dqcoeff and *eob_ptr.
+void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
+                             int16_t* coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t* zbin_ptr,
+                             const int16_t* round_ptr, const int16_t* quant_ptr,
+                             const int16_t* quant_shift_ptr,
+                             int16_t* qcoeff_ptr,
+                             int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                             uint16_t* eob_ptr,
+                             const int16_t* scan_ptr,
+                             const int16_t* iscan_ptr) {
+  __m128i zero;
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // Load input (aligned loads: each row must start on a 16-byte boundary)
+  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  __m128i *in[8];  // in[k] points at in<k> so rows can be indexed below.
+  int index = 0;
+
+  (void)scan_ptr;  // These four parameters are never read here.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)coeff_ptr;
+
+  // Pre-condition input (shift by two)
+  in0 = _mm_slli_epi16(in0, 2);
+  in1 = _mm_slli_epi16(in1, 2);
+  in2 = _mm_slli_epi16(in2, 2);
+  in3 = _mm_slli_epi16(in3, 2);
+  in4 = _mm_slli_epi16(in4, 2);
+  in5 = _mm_slli_epi16(in5, 2);
+  in6 = _mm_slli_epi16(in6, 2);
+  in7 = _mm_slli_epi16(in7, 2);
+
+  in[0] = &in0;
+  in[1] = &in1;
+  in[2] = &in2;
+  in[3] = &in3;
+  in[4] = &in4;
+  in[5] = &in5;
+  in[6] = &in6;
+  in[7] = &in7;
+
+  // We do two passes, first the columns, then the rows. The results of the
+  // first pass are transposed so that the same column code can be reused. The
+  // results of the second pass are also transposed so that the rows (processed
+  // as columns) are put back in row positions.
+  for (pass = 0; pass < 2; pass++) {
+    // To store results of each pass before the transpose.
+    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+    // Add/subtract
+    const __m128i q0 = _mm_add_epi16(in0, in7);
+    const __m128i q1 = _mm_add_epi16(in1, in6);
+    const __m128i q2 = _mm_add_epi16(in2, in5);
+    const __m128i q3 = _mm_add_epi16(in3, in4);
+    const __m128i q4 = _mm_sub_epi16(in3, in4);
+    const __m128i q5 = _mm_sub_epi16(in2, in5);
+    const __m128i q6 = _mm_sub_epi16(in1, in6);
+    const __m128i q7 = _mm_sub_epi16(in0, in7);
+    // Work on first four results
+    {
+      // Add/subtract
+      const __m128i r0 = _mm_add_epi16(q0, q3);
+      const __m128i r1 = _mm_add_epi16(q1, q2);
+      const __m128i r2 = _mm_sub_epi16(q1, q2);
+      const __m128i r3 = _mm_sub_epi16(q0, q3);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+      // dct_const_round_shift
+
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+
+      res0 = _mm_packs_epi32(w0, w1);
+      res4 = _mm_packs_epi32(w2, w3);
+      res2 = _mm_packs_epi32(w4, w5);
+      res6 = _mm_packs_epi32(w6, w7);
+    }
+    // Work on next four results
+    {
+      // Rounded 16-bit multiply (mulhrs) by k__dual_p16_p16 (23170 / 32768)
+      const __m128i d0 = _mm_sub_epi16(q6, q5);
+      const __m128i d1 = _mm_add_epi16(q6, q5);
+      const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
+      const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
+
+      // Add/subtract
+      const __m128i x0 = _mm_add_epi16(q4, r0);
+      const __m128i x1 = _mm_sub_epi16(q4, r0);
+      const __m128i x2 = _mm_sub_epi16(q7, r1);
+      const __m128i x3 = _mm_add_epi16(q7, r1);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+      res1 = _mm_packs_epi32(w0, w1);
+      res7 = _mm_packs_epi32(w2, w3);
+      res5 = _mm_packs_epi32(w4, w5);
+      res3 = _mm_packs_epi32(w6, w7);
+    }
+    // Transpose the 8x8.
+    {
+      // 00 01 02 03 04 05 06 07
+      // 10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27
+      // 30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47
+      // 50 51 52 53 54 55 56 57
+      // 60 61 62 63 64 65 66 67
+      // 70 71 72 73 74 75 76 77
+      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+      // 00 10 01 11 02 12 03 13
+      // 20 30 21 31 22 32 23 33
+      // 04 14 05 15 06 16 07 17
+      // 24 34 25 35 26 36 27 37
+      // 40 50 41 51 42 52 43 53
+      // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+      // 64 74 65 75 66 76 67 77
+      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+      // 00 10 20 30 01 11 21 31
+      // 40 50 60 70 41 51 61 71
+      // 02 12 22 32 03 13 23 33
+      // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+      // 06 16 26 36 07 17 27 37
+      // 46 56 66 76 47 57 67 77
+      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }
+  // Post-condition output and store it
+  {
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+    in0 = _mm_sub_epi16(in0, sign_in0);
+    in1 = _mm_sub_epi16(in1, sign_in1);
+    in2 = _mm_sub_epi16(in2, sign_in2);
+    in3 = _mm_sub_epi16(in3, sign_in3);
+    in4 = _mm_sub_epi16(in4, sign_in4);
+    in5 = _mm_sub_epi16(in5, sign_in5);
+    in6 = _mm_sub_epi16(in6, sign_in6);
+    in7 = _mm_sub_epi16(in7, sign_in7);
+    in0 = _mm_srai_epi16(in0, 1);
+    in1 = _mm_srai_epi16(in1, 1);
+    in2 = _mm_srai_epi16(in2, 1);
+    in3 = _mm_srai_epi16(in3, 1);
+    in4 = _mm_srai_epi16(in4, 1);
+    in5 = _mm_srai_epi16(in5, 1);
+    in6 = _mm_srai_epi16(in6, 1);
+    in7 = _mm_srai_epi16(in7, 1);
+  }
+
+  iscan_ptr += n_coeffs;  // Point each array at its end and ...
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;  // ... index it with n_coeffs running up to 0.
+  zero = _mm_setzero_si128();
+
+  if (!skip_block) {
+    __m128i eob;
+    __m128i round, quant, dequant, thr;
+    int16_t nzflag;
+    {
+      __m128i coeff0, coeff1;
+
+      // Setup global values
+      {
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        // Do DC and first 15 AC
+        coeff0 = *in[0];
+        coeff1 = *in[1];
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);  // hi half -> both halves
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts (nzero lanes are -1)
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // AC only loop
+    index = 2;
+    thr = _mm_srai_epi16(dequant, 1);  // threshold = dequant >> 1
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+
+        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
+        coeff0 = *in[index];
+        coeff1 = *in[index + 1];
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+            _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+        if (nzflag) {  // at least one |coeff| > thr
+          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+          // Reinsert signs
+          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        } else {  // nothing above thr: emit zeros
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+        }
+      }
+
+      if (nzflag) {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts (nzero lanes are -1)
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+      index += 2;
+    }
+
+    // Accumulate EOB: horizontal max over the eight 16-bit lanes
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);  // lanes 0 and 1 hold the max
+    }
+  } else {  // skip_block: zero both outputs
+    do {
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
new file mode 100644
index 0000000000..bf7c7af770
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -0,0 +1,375 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx_ports/emmintrin_compat.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+#include "vpx_mem/vpx_mem.h"
+
+// Horizontal sum of the sixteen signed 8-bit lanes of acc_diff.
+static INLINE int sum_diff_16x1(__m128i acc_diff) {
+  const __m128i ones = _mm_set1_epi16(1);
+  // Pairing every byte with itself and shifting right arithmetically by 8
+  // sign-extends it to 16 bits.
+  const __m128i lo16 = _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+  const __m128i hi16 = _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+  // Eight 16-bit sums; madd against 1 pairs them into four 32-bit sums.
+  const __m128i sum4 = _mm_madd_epi16(_mm_add_epi16(lo16, hi16), ones);
+  // Fold the upper 64 bits onto the lower, then the second 32-bit lane onto
+  // the first.
+  const __m128i sum2 = _mm_add_epi32(sum4, _mm_srli_si128(sum4, 8));
+  const __m128i sum1 = _mm_add_epi32(sum2, _mm_srli_si128(sum2, 4));
+  return _mm_cvtsi128_si32(sum1);
+}
+
+// Denoise a 16x1 vector.
+static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
+                                             const uint8_t *mc_running_avg_y,
+                                             uint8_t *running_avg_y,
+                                             const __m128i *k_0,
+                                             const __m128i *k_4,
+                                             const __m128i *k_8,
+                                             const __m128i *k_16,
+                                             const __m128i *l3,
+                                             const __m128i *l32,
+                                             const __m128i *l21,
+                                             __m128i acc_diff) {
+  // Reads 16 bytes each from sig and mc_running_avg_y (unaligned), writes the
+  // 16 filtered bytes to running_avg_y, and returns acc_diff with the signed
+  // per-lane adjustment added using saturating 8-bit arithmetic. The
+  // constants arrive by pointer.
+  const __m128i src = _mm_loadu_si128((const __m128i *)(&sig[0]));
+  const __m128i mc_avg =
+      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+
+  // Unsigned saturating subtraction gives the two one-sided differences; at
+  // most one of them is non-zero in any lane.
+  const __m128i above = _mm_subs_epu8(mc_avg, src);
+  const __m128i below = _mm_subs_epu8(src, mc_avg);
+  // 0xFF in lanes where mc_avg <= src, i.e. where the adjustment must not
+  // raise the pixel.
+  const __m128i down_mask = _mm_cmpeq_epi8(above, *k_0);
+  // |mc_avg - src| capped at *k_16. Keeping it small lets the signed byte
+  // compare _mm_cmpgt_epi8 be used on what is really unsigned data.
+  const __m128i mag = _mm_min_epu8(_mm_or_si128(above, below), *k_16);
+
+  // Tier masks: magnitude below *k_16, below *k_8, below *k_4.
+  const __m128i lt16 = _mm_cmpgt_epi8(*k_16, mag);
+  const __m128i lt8 = _mm_cmpgt_epi8(*k_8, mag);
+  const __m128i lt4 = _mm_cmpgt_epi8(*k_4, mag);
+  // Start from *l3 and step down by *l32 and *l21 for the tiers that apply.
+  const __m128i step =
+      _mm_add_epi8(_mm_and_si128(lt16, *l32), _mm_and_si128(lt8, *l21));
+  // Lanes in the smallest tier are cleared here ...
+  const __m128i tiered = _mm_andnot_si128(lt4, _mm_sub_epi8(*l3, step));
+  // ... and take the (capped) difference itself as their adjustment.
+  const __m128i amount = _mm_or_si128(tiered, _mm_and_si128(lt4, mag));
+
+  // Split the magnitude by direction: "up" where mc_avg is above src,
+  // "down" elsewhere.
+  const __m128i up = _mm_andnot_si128(down_mask, amount);
+  const __m128i down = _mm_and_si128(down_mask, amount);
+  // filtered = sig + up - down with unsigned saturation at each step.
+  const __m128i filtered = _mm_subs_epu8(_mm_adds_epu8(src, up), down);
+
+  _mm_storeu_si128((__m128i *)running_avg_y, filtered);
+
+  // Each adjustment is bounded by the caller's constants, so the running
+  // total is kept in signed bytes and accumulated with saturation.
+  return _mm_subs_epi8(_mm_adds_epi8(acc_diff, up), down);
+}
+
+// Denoise a 16x1 vector with a weaker filter.
+static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y,
+    uint8_t *running_avg_y, const __m128i k_0,
+    const __m128i k_delta, __m128i acc_diff) {
+  // Moves the 16 bytes at running_avg_y by at most k_delta per lane, against
+  // the sign of (mc_running_avg_y - sig), and returns acc_diff with that
+  // signed change folded in. Every load and the store are unaligned.
+  const __m128i src = _mm_loadu_si128((const __m128i *)(&sig[0]));
+  const __m128i mc_avg =
+      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+  const __m128i current = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
+  // One-sided differences via unsigned saturating subtraction.
+  const __m128i above = _mm_subs_epu8(mc_avg, src);
+  const __m128i below = _mm_subs_epu8(src, mc_avg);
+  // 0xFF in lanes where mc_avg <= src.
+  const __m128i down_mask = _mm_cmpeq_epi8(above, k_0);
+  // Correction magnitude: |mc_avg - src| capped at k_delta.
+  const __m128i amount = _mm_min_epu8(_mm_or_si128(above, below), k_delta);
+  // Split by direction.
+  const __m128i up = _mm_andnot_si128(down_mask, amount);
+  const __m128i down = _mm_and_si128(down_mask, amount);
+  // Pull the stored average back toward sig: subtract where mc_avg > src,
+  // add where it is not.
+  const __m128i adjusted = _mm_adds_epu8(_mm_subs_epu8(current, up), down);
+
+  _mm_storeu_si128((__m128i *)running_avg_y, adjusted);
+
+  // Mirror the same correction in the accumulator (saturating signed bytes).
+  return _mm_adds_epi8(_mm_subs_epi8(acc_diff, up), down);
+}
+
+// Denoiser for 4xM and 8xM blocks.
+//
+// Rows of these blocks are narrower than an SSE2 register, so several rows
+// (four when width is 4, two otherwise) are packed into one 16-byte vector,
+// filtered with vp9_denoiser_16x1_sse2() and scattered back to
+// running_avg_y. When the accumulated adjustment exceeds the block
+// threshold, a weaker correction is attempted before giving up.
+//
+//   sig, mc_running_avg_y  inputs; each filtered pixel is sig moved toward
+//                          mc_running_avg_y by a bounded amount.
+//   running_avg_y          output; receives the filtered pixels.
+//   *_stride               distance in bytes between rows of each buffer.
+//   increase_denoising,
+//   motion_magnitude       select slightly larger filter constants when
+//                          denoising is increased and motion is low.
+//   bs                     block size; gives the row count and thresholds.
+//   width                  block width in pixels (4 or 8 - TODO confirm that
+//                          callers never pass anything else).
+//
+// Returns FILTER_BLOCK when the filtered result is within the threshold and
+// COPY_BLOCK when it is not (i.e. no denoising should be applied).
+static int vp9_denoiser_NxM_sse2_small(
+    const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y,
+    int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride,
+    int increase_denoising, BLOCK_SIZE bs, int motion_magnitude, int width) {
+  const int low_motion = motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD;
+  const int shift_inc = (increase_denoising && low_motion) ? 1 : 0;
+  // log2 of the number of block rows packed into one vector.
+  const uint8_t shift = (width == 4) ? 2 : 1;
+  const int rows_per_vec = 1 << shift;
+  const int block_rows = 4 << b_height_log2_lookup[bs];
+  const int num_vecs = block_rows >> shift;
+  // Staging copies of the packed rows; reused by the weaker second pass.
+  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Modify each level's adjustment according to motion_magnitude.
+  const __m128i l3 = _mm_set1_epi8(low_motion ? 7 + shift_inc : 6);
+  // Difference between level 3 and level 2 is 2.
+  const __m128i l32 = _mm_set1_epi8(2);
+  // Difference between level 2 and level 1 is 1.
+  const __m128i l21 = _mm_set1_epi8(1);
+  __m128i acc_diff = _mm_setzero_si128();
+  __m128i k_delta;
+  int sum_diff, sum_diff_thresh, delta, v, k;
+
+  for (v = 0; v < num_vecs; ++v) {
+    // Gather rows_per_vec rows of every buffer into contiguous vectors.
+    for (k = 0; k < rows_per_vec; ++k) {
+      memcpy(sig_buffer[v] + width * k, sig + sig_stride * k, width);
+      memcpy(mc_running_buffer[v] + width * k,
+             mc_running_avg_y + mc_avg_y_stride * k, width);
+      memcpy(running_buffer[v] + width * k,
+             running_avg_y + avg_y_stride * k, width);
+    }
+    acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[v],
+                                      mc_running_buffer[v],
+                                      running_buffer[v],
+                                      &k_0, &k_4, &k_8, &k_16,
+                                      &l3, &l32, &l21, acc_diff);
+    // Scatter the filtered vector back to its rows.
+    for (k = 0; k < rows_per_vec; ++k) {
+      memcpy(running_avg_y + avg_y_stride * k,
+             running_buffer[v] + width * k, width);
+    }
+    // Update pointers for next iteration.
+    sig += (sig_stride << shift);
+    mc_running_avg_y += (mc_avg_y_stride << shift);
+    running_avg_y += (avg_y_stride << shift);
+  }
+
+  sum_diff = sum_diff_16x1(acc_diff);
+  sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+  if (abs(sum_diff) <= sum_diff_thresh) {
+    return FILTER_BLOCK;
+  }
+
+  // Before returning to copy the block (i.e., apply no denoising), check
+  // whether a weaker temporal filter can still be applied: an additional
+  // adjustment brings running_avg_y closer to sig. It is capped by delta,
+  // which is derived from the excess of the absolute pixel diff over the
+  // threshold so that the new sum_diff usually ends up in range.
+  delta = ((abs(sum_diff) - sum_diff_thresh) >>
+           num_pels_log2_lookup[bs]) + 1;
+  // Only apply the adjustment for max delta up to 3.
+  if (delta >= 4) {
+    return COPY_BLOCK;
+  }
+
+  k_delta = _mm_set1_epi8(delta);
+  // Rewind running_avg_y to the first row of the block.
+  running_avg_y -= avg_y_stride * block_rows;
+  for (v = 0; v < num_vecs; ++v) {
+    acc_diff = vp9_denoiser_adj_16x1_sse2(
+        sig_buffer[v], mc_running_buffer[v], running_buffer[v],
+        k_0, k_delta, acc_diff);
+    for (k = 0; k < rows_per_vec; ++k) {
+      memcpy(running_avg_y + avg_y_stride * k,
+             running_buffer[v] + width * k, width);
+    }
+    // Update pointers for next iteration.
+    running_avg_y += (avg_y_stride << shift);
+  }
+
+  // Keep the weaker result only if it brought sum_diff within the threshold.
+  sum_diff = sum_diff_16x1(acc_diff);
+  return (abs(sum_diff) > sum_diff_thresh) ? COPY_BLOCK : FILTER_BLOCK;
+}
+
+// Denoises a 16xM, 32xM or 64xM block in 16-pixel-wide strips; returns
+// FILTER_BLOCK if the filtered output stands, COPY_BLOCK if it is rejected.
+static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
+                                     const uint8_t *mc_running_avg_y,
+                                     int mc_avg_y_stride,
+                                     uint8_t *running_avg_y,
+                                     int avg_y_stride,
+                                     int increase_denoising, BLOCK_SIZE bs,
+                                     int motion_magnitude) {
+  const int width = 4 << b_width_log2_lookup[bs];
+  const int height = 4 << b_height_log2_lookup[bs];
+  // Row-start rewind distance: 16 bytes per whole 16-pixel strip.
+  const int row_span = 16 * (width >> 4);
+  const int low_motion = motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD;
+  const int shift_inc = (increase_denoising && low_motion) ? 1 : 0;
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Level 3 adjustment follows motion_magnitude; the gaps from level 3 to
+  // level 2 and from level 2 to level 1 are fixed at 2 and 1.
+  const __m128i l3 = _mm_set1_epi8(low_motion ? 7 + shift_inc : 6);
+  const __m128i l32 = _mm_set1_epi8(2);
+  const __m128i l21 = _mm_set1_epi8(1);
+  __m128i acc_diff[4][4];
+  int sum_diff_thresh, row, col, sum_diff = 0;
+
+  for (col = 0; col < 4; ++col) {
+    for (row = 0; row < 4; ++row) {
+      acc_diff[col][row] = _mm_setzero_si128();
+    }
+  }
+
+  // First pass: run every 16x1 strip through the denoiser, carrying acc_diff.
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; col += 16) {
+      __m128i *const acc = &acc_diff[col >> 4][row >> 4];
+      *acc = vp9_denoiser_16x1_sse2(sig, mc_running_avg_y, running_avg_y, &k_0,
+                                    &k_4, &k_8, &k_16, &l3, &l32, &l21, *acc);
+      sig += 16;
+      mc_running_avg_y += 16;
+      running_avg_y += 16;
+    }
+
+    // Fold the accumulators into sum_diff after each 16th row (and after
+    // row 7 of a 16x8 block).
+    if ((row + 1) % 16 == 0 || (bs == BLOCK_16X8 && row == 7)) {
+      for (col = 0; col < width; col += 16) {
+        sum_diff += sum_diff_16x1(acc_diff[col >> 4][row >> 4]);
+      }
+    }
+
+    // Back to the row start, then down one line.
+    sig += sig_stride - row_span;
+    mc_running_avg_y += mc_avg_y_stride - row_span;
+    running_avg_y += avg_y_stride - row_span;
+  }
+
+  sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+  if (abs(sum_diff) > sum_diff_thresh) {
+    // delta grows with the amount by which |sum_diff| overshoots the limit.
+    const int delta =
+        ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+    __m128i k_delta;
+
+    // Only apply the adjustment for max delta up to 3.
+    if (delta >= 4) {
+      return COPY_BLOCK;
+    }
+
+    // Second pass: rewind all three pointers to the top of the block and
+    // re-run each strip through the delta adjustment.
+    k_delta = _mm_set1_epi8(delta);
+    sig -= sig_stride * height;
+    mc_running_avg_y -= mc_avg_y_stride * height;
+    running_avg_y -= avg_y_stride * height;
+    sum_diff = 0;
+    for (row = 0; row < height; ++row) {
+      for (col = 0; col < width; col += 16) {
+        __m128i *const acc = &acc_diff[col >> 4][row >> 4];
+        *acc = vp9_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y,
+                                          k_0, k_delta, *acc);
+        sig += 16;
+        mc_running_avg_y += 16;
+        running_avg_y += 16;
+      }
+
+      if ((row + 1) % 16 == 0 || (bs == BLOCK_16X8 && row == 7)) {
+        for (col = 0; col < width; col += 16) {
+          sum_diff += sum_diff_16x1(acc_diff[col >> 4][row >> 4]);
+        }
+      }
+
+      // Back to the row start, then down one line.
+      sig += sig_stride - row_span;
+      mc_running_avg_y += mc_avg_y_stride - row_span;
+      running_avg_y += avg_y_stride - row_span;
+    }
+    // Still over the limit after the adjustment: give up on filtering.
+    if (abs(sum_diff) > sum_diff_thresh) {
+      return COPY_BLOCK;
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
+                             const uint8_t *mc_avg,
+                             int mc_avg_stride,
+                             uint8_t *avg, int avg_stride,
+                             int increase_denoising,
+                             BLOCK_SIZE bs,
+                             int motion_magnitude) {
+  // Route the block to the kernel that matches its width; sizes with no
+  // SSE2 kernel are reported as COPY_BLOCK.
+  switch (bs) {
+    // 4-wide blocks: small kernel, width 4.
+    case BLOCK_4X4: case BLOCK_4X8:
+      return vp9_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride,
+                                         avg, avg_stride, increase_denoising,
+                                         bs, motion_magnitude, 4);
+    // 8-wide blocks: small kernel, width 8.
+    case BLOCK_8X4: case BLOCK_8X8: case BLOCK_8X16:
+      return vp9_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride,
+                                         avg, avg_stride, increase_denoising,
+                                         bs, motion_magnitude, 8);
+    // 16-, 32- and 64-wide blocks: big kernel.
+    case BLOCK_16X8: case BLOCK_16X16: case BLOCK_16X32:
+    case BLOCK_32X16: case BLOCK_32X32: case BLOCK_32X64:
+    case BLOCK_64X32: case BLOCK_64X64:
+      return vp9_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride,
+                                       avg, avg_stride, increase_denoising,
+                                       bs, motion_magnitude);
+    default:
+      return COPY_BLOCK;
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/libs/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
new file mode 100644
index 0000000000..0bc417fc15
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
@@ -0,0 +1,323 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#if defined(_MSC_VER)
+# include <intrin.h>
+#endif
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __GNUC__
+# define LIKELY(v)    __builtin_expect(v, 1)
+# define UNLIKELY(v)  __builtin_expect(v, 0)
+#else
+# define LIKELY(v)    (v)
+# define UNLIKELY(v)  (v)
+#endif
+
+static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
+  int_mv packed;  // callers read the pair back in one go through .as_int
+  packed.as_mv.col = col;
+  packed.as_mv.row = row;
+  return packed;
+}
+
+static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
+  // Unlike the C implementation, only two classes are told apart (zero and
+  // non-zero vector), relying on
+  //  x->nmvjointsadcost[1] == x->nmvjointsadcost[2] == x->nmvjointsadcost[3]
+  return mv.as_int != 0;
+}
+
+static INLINE int mv_cost(const int_mv mv,
+                          const int *joint_cost, int *const comp_cost[2]) {
+  const int joint = joint_cost[get_mv_joint(mv)];  // zero vs. non-zero class
+  return joint + comp_cost[0][mv.as_mv.row] + comp_cost[1][mv.as_mv.col];
+}
+
+static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
+                          int sad_per_bit) {
+  // Price the offset of mv from ref, weight it by sad_per_bit, then round.
+  const int_mv delta = pack_int_mv(mv.as_mv.row - ref->row,
+                                   mv.as_mv.col - ref->col);
+  const unsigned bits = mv_cost(delta, x->nmvjointsadcost, x->nmvsadcost);
+  return ROUND_POWER_OF_TWO(bits * sad_per_bit, VP9_PROB_COST_SHIFT);
+}
+
+/*****************************************************************************
+ * This function utilises 3 properties of the cost function lookup tables,   *
+ * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
+ * vp9_encoder.c.                                                            *
+ * For the joint cost:                                                       *
+ *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
+ * For the component costs:                                                  *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
+ *         (Equal costs for both components)                                 *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
+ *         (Cost function is even)                                           *
+ * If these do not hold, then this function cannot be used without           *
+ * modification, in which case you can revert to using the C implementation, *
+ * which does not rely on these properties.                                  *
+ *****************************************************************************/
+// x holds the source/reference planes, MV limits and SAD cost tables.
+// ref_mv is the starting vector (clamped to the MV limits); the MV cost is
+// measured from center_mv >> 3.  Returns the lowest SAD + MV cost found;
+// *best_mv receives the winning vector and *num00 counts the steps that
+// ended with the starting block still the best one.
+int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
+                               const search_site_config *cfg,
+                               MV *ref_mv, MV *best_mv, int search_param,
+                               int sad_per_bit, int *num00,
+                               const vp9_variance_fn_ptr_t *fn_ptr,
+                               const MV *center_mv) {
+  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
+  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
+  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
+  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);
+
+  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);
+
+  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
+  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);
+
+  // search_param determines the length of the initial step and hence the number
+  // of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel...
+  const       MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
+  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
+  const int tot_steps = cfg->total_steps - search_param;
+
+  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
+                                        center_mv->col >> 3);
+  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);
+
+  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
+  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
+
+  int_mv bmv = pack_int_mv(ref_row, ref_col);
+  int_mv new_bmv = bmv;
+  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);
+
+  const int what_stride = x->plane[0].src.stride;
+  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
+  const uint8_t *const what = x->plane[0].src.buf;
+  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
+                                 ref_row * in_what_stride + ref_col;
+
+  // Work out the start point for the search
+  const uint8_t *best_address = in_what;
+  const uint8_t *new_best_address = best_address;
+#if ARCH_X86_64
+  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
+#else
+  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
+#endif
+
+  unsigned int best_sad;
+
+  int i, j, step;
+
+  // Check the prerequisite cost function properties that are easy to check
+  // in an assert. See the function-level documentation for details on all
+  // prerequisites.
+  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
+  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
+
+  // Check the starting position
+  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
+  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
+
+  *num00 = 0;
+
+  for (i = 0, step = 0; step < tot_steps; step++) {
+    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
+      __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w;
+#if ARCH_X86_64
+      __m128i v_blocka[2];
+#else
+      __m128i v_blocka[1];
+#endif
+
+      // Compute the candidate motion vectors
+      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
+      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
+      // Clamp them to the search bounds
+      __m128i v_these_mv_clamp_w = v_these_mv_w;
+      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
+      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
+      // The ones that did not change are inside the search area
+      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);
+
+      // If none of them are inside, then move on
+      if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
+        continue;
+      }
+
+      // The inverse mask indicates which of the MVs are outside
+      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
+      // Shift right to keep the sign bit clear, we will use this later
+      // to set the cost to the maximum value.
+      v_outside_d = _mm_srli_epi32(v_outside_d, 1);
+
+      // Compute the difference MV
+      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
+      // We utilise the fact that the cost function is even, and use the
+      // absolute difference. This allows us to use unsigned indexes later
+      // and reduces cache pressure somewhat as only a half of the table
+      // is ever referenced.
+      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);
+
+      // Compute the SIMD pointer offsets.
+      {
+#if ARCH_X86_64  //  sizeof(intptr_t) == 8
+        // Load the offsets
+        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
+        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
+        // Set the ones falling outside to zero
+        v_bo10_q = _mm_and_si128(v_bo10_q,
+                                 _mm_cvtepi32_epi64(v_inside_d));
+        v_bo32_q = _mm_and_si128(v_bo32_q,
+                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
+        // Compute the candidate addresses
+        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
+        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
+#else  // ARCH_X86 //  sizeof(intptr_t) == 4
+        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
+        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
+        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
+#endif
+      }
+
+      fn_ptr->sdx4df(what, what_stride,
+                     (const uint8_t **)&v_blocka[0], in_what_stride,
+                     (uint32_t*)&v_sad_d);
+
+      // Look up the component cost of the residual motion vector
+      {
+        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
+        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
+        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
+        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
+        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
+        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
+        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
+        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);
+
+        // Note: This is a use case for vpgather in AVX2
+        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
+        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
+        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
+        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];
+
+        __m128i v_cost_10_d, v_cost_32_d;
+
+        v_cost_10_d = _mm_cvtsi32_si128(cost0);
+        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);
+
+        v_cost_32_d = _mm_cvtsi32_si128(cost2);
+        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);
+
+        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
+      }
+
+      // Now add in the joint cost
+      {
+        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
+                                                _mm_setzero_si128());
+        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
+                                                       v_joint_cost_0_d,
+                                                       v_sel_d);
+        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
+      }
+
+      // Multiply by sad_per_bit, then round with the same shift that
+      // mvsad_err_cost() uses: ROUND_POWER_OF_TWO(.., VP9_PROB_COST_SHIFT)
+      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
+      v_cost_d = _mm_add_epi32(v_cost_d,
+                               _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1)));
+      v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT);
+      // Add the cost to the sad
+      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);
+
+      // Make the motion vectors outside the search area have max cost
+      // by or'ing in the comparison mask, this way the minimum search won't
+      // pick them.
+      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);
+
+      // Find the minimum value and index horizontally in v_sad_d
+      {
+        // Try speculatively on 16 bits, so we can use the minpos intrinsic
+        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
+        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);
+
+        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
+        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);
+
+        // If the local best value is not saturated, just use it, otherwise
+        // find the horizontal minimum again the hard way on 32 bits.
+        // This is executed rarely.
+        if (UNLIKELY(local_best_sad == 0xffff)) {
+          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;
+
+          v_loval_d = v_sad_d;
+          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
+          v_hival_d = _mm_srli_si128(v_loval_d, 8);
+          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);
+
+          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
+
+          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
+          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
+          v_hival_d = _mm_srli_si128(v_loval_d, 4);
+          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);
+
+          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
+
+          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
+          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
+
+          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
+          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
+        }
+
+        // Update the global minimum if the local minimum is smaller
+        if (LIKELY(local_best_sad < best_sad)) {
+          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
+          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
+
+          best_sad = local_best_sad;
+        }
+      }
+    }
+
+    bmv = new_bmv;
+    best_address = new_best_address;
+
+    v_bmv_w = _mm_set1_epi32(bmv.as_int);
+#if ARCH_X86_64
+    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
+#else
+    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
+#endif
+
+    if (UNLIKELY(best_address == in_what)) {
+      (*num00)++;
+    }
+  }
+
+  *best_mv = bmv.as_mv;
+  return best_sad;
+}
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c b/libs/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c
new file mode 100644
index 0000000000..dfebaab0ac
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+// Returns the sum of (dqcoeff[i] - coeff[i])^2 over the block and stores the
+// sum of coeff[i]^2 in *ssz.
+//   coeff, dqcoeff  16-bit coefficient arrays (presumably the original and
+//                   the dequantised values, going by the names)
+//   block_size      number of coefficients
+//   ssz             out: receives the sum of coeff[i]^2
+// The loop handles 16 coefficients per pass, so the arrays are read in whole
+// groups of 16 (block_size is effectively rounded up to a multiple of 16).
+// Loads are unaligned.  The 16-bit subtraction is not range-checked; inputs
+// are presumably small enough for it not to wrap -- TODO confirm with callers.
+int64_t vp9_block_error_avx2(const int16_t *coeff,
+                             const int16_t *dqcoeff,
+                             intptr_t block_size,
+                             int64_t *ssz) {
+  const __m256i zero = _mm256_setzero_si256();
+  // Four 64-bit partial sums each: squared error and squared coefficients.
+  __m256i err_acc = zero;
+  __m256i sqc_acc = zero;
+  __m128i err_128, sqc_128;
+  int64_t sse;
+  int i;
+
+  for (i = 0; i < block_size; i += 16) {
+    // Fetch 16 coefficients (32 bytes) from each array.
+    const __m256i c = _mm256_loadu_si256((const __m256i *)(coeff + i));
+    const __m256i dq = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
+    // Square and add adjacent pairs: eight 32-bit sums per register.
+    const __m256i diff = _mm256_sub_epi16(dq, c);
+    const __m256i err32 = _mm256_madd_epi16(diff, diff);
+    const __m256i sqc32 = _mm256_madd_epi16(c, c);
+    // Widen each 32-bit sum to 64 bits by interleaving it with zero.
+    const __m256i err_lo = _mm256_unpacklo_epi32(err32, zero);
+    const __m256i err_hi = _mm256_unpackhi_epi32(err32, zero);
+    const __m256i sqc_lo = _mm256_unpacklo_epi32(sqc32, zero);
+    const __m256i sqc_hi = _mm256_unpackhi_epi32(sqc32, zero);
+
+    // Accumulate in 64 bits.
+    err_acc = _mm256_add_epi64(err_acc, err_lo);
+    sqc_acc = _mm256_add_epi64(sqc_acc, sqc_lo);
+    err_acc = _mm256_add_epi64(err_acc, err_hi);
+    sqc_acc = _mm256_add_epi64(sqc_acc, sqc_hi);
+  }
+
+  // Fold the upper 64 bits of each 128-bit lane onto the lower 64 bits.
+  err_acc = _mm256_add_epi64(err_acc, _mm256_srli_si256(err_acc, 8));
+  sqc_acc = _mm256_add_epi64(sqc_acc, _mm256_srli_si256(sqc_acc, 8));
+
+  // Add the two 128-bit lanes; each total now sits in the low 64 bits.
+  err_128 = _mm_add_epi64(_mm256_castsi256_si128(err_acc),
+                          _mm256_extractf128_si256(err_acc, 1));
+  sqc_128 = _mm_add_epi64(_mm256_castsi256_si128(sqc_acc),
+                          _mm256_extractf128_si256(sqc_acc, 1));
+
+  // The error total is the return value; the coefficient total goes to *ssz.
+  _mm_storel_epi64((__m128i *)&sse, err_128);
+  _mm_storel_epi64((__m128i *)ssz, sqc_128);
+  return sse;
+}
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/libs/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
new file mode 100644
index 0000000000..5b0238272b
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
@@ -0,0 +1,122 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; Returns sum((dqcoeff[i] - coeff[i])^2); stores sum(coeff[i]^2) to *ssz.
+; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+;                         int64_t *ssz)
+INIT_XMM sse2
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m6, m6                 ; ssz accumulator
+  pxor      m5, m5                 ; dedicated zero register
+  lea     uqcq, [uqcq+sizeq*2]     ; advance both pointers to the array ends
+  lea     dqcq, [dqcq+sizeq*2]
+  neg    sizeq                     ; index runs from -size up to 0
+.loop:
+  mova      m2, [uqcq+sizeq*2]     ; m2, m3 = next 2*mmsize bytes of coeff
+  mova      m0, [dqcq+sizeq*2]     ; m0, m1 = same span of dqcoeff
+  mova      m3, [uqcq+sizeq*2+mmsize]
+  mova      m1, [dqcq+sizeq*2+mmsize]
+  psubw     m0, m2                 ; m0, m1 = dqcoeff - coeff
+  psubw     m1, m3
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+  pmaddwd   m0, m0                 ; pairwise sums of squared errors
+  pmaddwd   m1, m1
+  pmaddwd   m2, m2                 ; pairwise sums of squared coeffs
+  pmaddwd   m3, m3
+  ; accumulate in 64bit
+  punpckldq m7, m0, m5             ; interleave with zero: dword -> qword
+  punpckhdq m0, m5
+  paddq     m4, m7
+  punpckldq m7, m1, m5
+  paddq     m4, m0
+  punpckhdq m1, m5
+  paddq     m4, m7
+  punpckldq m7, m2, m5
+  paddq     m4, m1                 ; last of the four error terms into sse
+  punpckhdq m2, m5
+  paddq     m6, m7                 ; squared coeffs go to the ssz accumulator
+  punpckldq m7, m3, m5
+  paddq     m6, m2
+  punpckhdq m3, m5
+  paddq     m6, m7
+  paddq     m6, m3
+  add    sizeq, mmsize             ; 2*mmsize bytes = mmsize int16 elements
+  jl .loop                         ; repeat while the index is still negative
+
+  ; accumulate horizontally and store in return value
+  movhlps   m5, m4                 ; high qword -> low qword
+  movhlps   m7, m6
+  paddq     m4, m5
+  paddq     m6, m7
+%if ARCH_X86_64
+  movq    rax, m4
+  movq [sszq], m6
+%else
+  mov     eax, sszm                ; sszm: x86inc's memory form of the ssz arg
+  pshufd   m5, m4, 0x1             ; dword 1 of the sum -> low dword of m5
+  movq  [eax], m6
+  movd    eax, m4                  ; 64-bit result returned in edx:eax
+  movd    edx, m5
+%endif
+  RET
+
+; Compute the sum of squared difference between two int16_t vectors.
+; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
+;                            intptr_t block_size)
+
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m5, m5                 ; dedicated zero register
+  lea     uqcq, [uqcq+sizeq*2]     ; advance both pointers to the array ends
+  lea     dqcq, [dqcq+sizeq*2]
+  neg    sizeq                     ; index runs from -size up to 0
+.loop:
+  mova      m2, [uqcq+sizeq*2]     ; m2, m3 = next 2*mmsize bytes of coeff
+  mova      m0, [dqcq+sizeq*2]     ; m0, m1 = same span of dqcoeff
+  mova      m3, [uqcq+sizeq*2+mmsize]
+  mova      m1, [dqcq+sizeq*2+mmsize]
+  psubw     m0, m2                 ; m0, m1 = dqcoeff - coeff
+  psubw     m1, m3
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+  pmaddwd   m0, m0                 ; pairwise sums of squared errors
+  pmaddwd   m1, m1
+  ; accumulate in 64bit
+  punpckldq m3, m0, m5             ; interleave with zero: dword -> qword
+  punpckhdq m0, m5
+  paddq     m4, m3
+  punpckldq m3, m1, m5             ; m3 is free again once psubw has used it
+  paddq     m4, m0
+  punpckhdq m1, m5
+  paddq     m4, m3
+  paddq     m4, m1
+  add    sizeq, mmsize             ; 2*mmsize bytes = mmsize int16 elements
+  jl .loop                         ; repeat while the index is still negative
+
+  ; accumulate horizontally and store in return value
+  movhlps   m5, m4                 ; high qword -> low qword
+  paddq     m4, m5
+%if ARCH_X86_64
+  movq    rax, m4
+%else
+  pshufd   m5, m4, 0x1             ; dword 1 of the sum -> low dword of m5
+  movd    eax, m4                  ; 64-bit result returned in edx:eax
+  movd    edx, m5
+%endif
+  RET
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/libs/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
new file mode 100644
index 0000000000..de903fa332
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -0,0 +1,211 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#if defined(_MSC_VER) && _MSC_VER <= 1500
+// Need to include math.h before calling tmmintrin.h/intrin.h
+// in certain versions of MSVS.
+#include <math.h>
+#endif
+#include <tmmintrin.h>  // SSSE3
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx_scale/yv12config.h"
+
+extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst);
+
+void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             int w, int h) {
+  // Keeps every even-indexed pixel of every second source row (no filtering).
+  const __m128i low_bytes = _mm_set1_epi16(0x00FF);
+  const int simd_w = w & ~15;  // columns covered 16 outputs at a time
+  int row, col;
+  for (row = 0; row < h; ++row, src += src_stride * 2, dst += dst_stride) {
+    for (col = 0; col < simd_w; col += 16) {
+      const uint8_t *const in = src + col * 2;
+      // 32 input bytes -> clear the odd bytes -> pack the 16 even ones.
+      const __m128i even_lo =
+          _mm_and_si128(_mm_loadu_si128((const __m128i *)in), low_bytes);
+      const __m128i even_hi =
+          _mm_and_si128(_mm_loadu_si128((const __m128i *)(in + 16)), low_bytes);
+      _mm_storeu_si128((__m128i *)(dst + col),
+                       _mm_packus_epi16(even_lo, even_hi));
+    }
+    // Scalar tail for the last w % 16 columns.
+    for (; col < w; ++col) dst[col] = src[col * 2];
+  }
+}
+
+static INLINE __m128i filter(const __m128i *const a, const __m128i *const b,
+                             const __m128i *const c, const __m128i *const d,
+                             const __m128i *const e, const __m128i *const f,
+                             const __m128i *const g, const __m128i *const h) {
+  const __m128i coeffs_ab =  // 8 taps (a..h): -1 6 -19 78 78 -19 6 -1, sum 128
+      _mm_set_epi8(6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1);
+  const __m128i coeffs_cd =  // byte pairs (-19, 78): used for c,d and for f,e
+      _mm_set_epi8(78, -19, 78, -19, 78, -19, 78, -19, 78, -19, 78, -19,
+                   78, -19, 78, -19);
+  const __m128i const64_x16 = _mm_set1_epi16(64);  // rounding term for >> 7
+  const __m128i ab = _mm_unpacklo_epi8(*a, *b);  // interleave the low 8 bytes
+  const __m128i cd = _mm_unpacklo_epi8(*c, *d);
+  const __m128i fe = _mm_unpacklo_epi8(*f, *e);  // mirrored pair order so the
+  const __m128i hg = _mm_unpacklo_epi8(*h, *g);  // same coefficients apply
+  const __m128i ab_terms = _mm_maddubs_epi16(ab, coeffs_ab);  // -a + 6b
+  const __m128i cd_terms = _mm_maddubs_epi16(cd, coeffs_cd);  // -19c + 78d
+  const __m128i fe_terms = _mm_maddubs_epi16(fe, coeffs_cd);  // -19f + 78e
+  const __m128i hg_terms = _mm_maddubs_epi16(hg, coeffs_ab);  // -h + 6g
+  // can not overflow: at most 255 * (6 + 78) = 21420
+  const __m128i abcd_terms = _mm_add_epi16(ab_terms, cd_terms);
+  // can not overflow: same bound as above
+  const __m128i fehg_terms = _mm_add_epi16(fe_terms, hg_terms);
+  // can overflow, use saturating add
+  const __m128i terms = _mm_adds_epi16(abcd_terms, fehg_terms);
+  const __m128i round = _mm_adds_epi16(terms, const64_x16);
+  const __m128i shift = _mm_srai_epi16(round, 7);  // divide by the tap sum
+  return _mm_packus_epi16(shift, shift);  // 8 clamped bytes, in both halves
+}
+
+static void eight_tap_row_ssse3(const uint8_t *src, uint8_t *dst, int w) {
+  // Each output pixel is filter() applied to the eight source bytes that
+  // start at the same index; only whole groups of eight outputs are written.
+  const int simd_w = w & ~7;
+  int col, k;
+  for (col = 0; col < simd_w; col += 8) {
+    __m128i tap[8];
+    // tap[k] holds source bytes col + k .. col + k + 7.
+    for (k = 0; k < 8; ++k) {
+      tap[k] = _mm_loadl_epi64((const __m128i *)(src + col + k));
+    }
+    _mm_storel_epi64((__m128i *)(dst + col),
+                     filter(&tap[0], &tap[1], &tap[2], &tap[3],
+                            &tap[4], &tap[5], &tap[6], &tap[7]));
+  }
+}
+
+void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           int dst_w, int dst_h) {  // writes a 2x copy of src
+  dst_w /= 2;  // from here on: columns/rows counted in source pixels
+  dst_h /= 2;
+  {
+    DECLARE_ALIGNED(16, uint8_t, tmp[1920 * 8]);  // 8 rows; needs dst_w <= 1920
+    uint8_t *tmp0 = tmp + dst_w * 0;  // tmp0..tmp7: horizontally filtered
+    uint8_t *tmp1 = tmp + dst_w * 1;  // copies of source rows y-3 .. y+4
+    uint8_t *tmp2 = tmp + dst_w * 2;
+    uint8_t *tmp3 = tmp + dst_w * 3;  // tmp3 belongs to the current row y
+    uint8_t *tmp4 = tmp + dst_w * 4;
+    uint8_t *tmp5 = tmp + dst_w * 5;
+    uint8_t *tmp6 = tmp + dst_w * 6;
+    uint8_t *tmp7 = tmp + dst_w * 7;
+    uint8_t *tmp8 = NULL;  // scratch used while rotating the row pointers
+    const int max_width = dst_w & ~7;  // only whole groups of 8 columns
+    int y;
+    // Prime rows y-3 .. y+3; "- 3" makes the taps span columns x-3 .. x+4.
+    eight_tap_row_ssse3(src - src_stride * 3 - 3, tmp0, dst_w);
+    eight_tap_row_ssse3(src - src_stride * 2 - 3, tmp1, dst_w);
+    eight_tap_row_ssse3(src - src_stride * 1 - 3, tmp2, dst_w);
+    eight_tap_row_ssse3(src + src_stride * 0 - 3, tmp3, dst_w);
+    eight_tap_row_ssse3(src + src_stride * 1 - 3, tmp4, dst_w);
+    eight_tap_row_ssse3(src + src_stride * 2 - 3, tmp5, dst_w);
+    eight_tap_row_ssse3(src + src_stride * 3 - 3, tmp6, dst_w);
+    for (y = 0; y < dst_h; y++) {  // one source row -> two output rows
+      int x;
+      eight_tap_row_ssse3(src + src_stride * 4 - 3, tmp7, dst_w);  // row y+4
+      for (x = 0; x < max_width; x += 8) {
+        const __m128i A = _mm_loadl_epi64((const __m128i *)(src  + x));
+        const __m128i B = _mm_loadl_epi64((const __m128i *)(tmp3 + x));
+        const __m128i AB = _mm_unpacklo_epi8(A, B);  // source, h-filtered, ...
+        __m128i C, D, CD;
+        _mm_storeu_si128((__m128i *)(dst + x * 2), AB);  // first output row
+        {  // C: filter down the unfiltered source rows y-3 .. y+4
+          const __m128i a =
+              _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 3));
+          const __m128i b =
+              _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 2));
+          const __m128i c =
+              _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 1));
+          const __m128i d =
+              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 0));
+          const __m128i e =
+              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 1));
+          const __m128i f =
+              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 2));
+          const __m128i g =
+              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 3));
+          const __m128i h =
+              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 4));
+          C = filter(&a, &b, &c, &d, &e, &f, &g, &h);
+        }
+        {  // D: filter down the horizontally filtered rows instead
+          const __m128i a = _mm_loadl_epi64((const __m128i *)(tmp0 + x));
+          const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp1 + x));
+          const __m128i c = _mm_loadl_epi64((const __m128i *)(tmp2 + x));
+          const __m128i d = _mm_loadl_epi64((const __m128i *)(tmp3 + x));
+          const __m128i e = _mm_loadl_epi64((const __m128i *)(tmp4 + x));
+          const __m128i f = _mm_loadl_epi64((const __m128i *)(tmp5 + x));
+          const __m128i g = _mm_loadl_epi64((const __m128i *)(tmp6 + x));
+          const __m128i h = _mm_loadl_epi64((const __m128i *)(tmp7 + x));
+          D = filter(&a, &b, &c, &d, &e, &f, &g, &h);
+        }
+        CD = _mm_unpacklo_epi8(C, D);
+        _mm_storeu_si128((__m128i *)(dst + x * 2 + dst_stride), CD);  // second
+      }
+      src += src_stride;
+      dst += dst_stride * 2;
+      tmp8 = tmp0;  // rotate: the oldest row buffer becomes the next tmp7,
+      tmp0 = tmp1;  // so only one new row is filtered per iteration
+      tmp1 = tmp2;
+      tmp2 = tmp3;
+      tmp3 = tmp4;
+      tmp4 = tmp5;
+      tmp5 = tmp6;
+      tmp6 = tmp7;
+      tmp7 = tmp8;
+    }
+  }
+}
+
+// SSSE3 for exact 2:1 / 1:2 scaling; every other case uses the C version.
+void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
+                                      YV12_BUFFER_CONFIG *dst) {
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  const int dst_uv_w = dst_w / 2;
+  const int dst_uv_h = dst_h / 2;
+  const int halving = (dst_w * 2 == src_w) && (dst_h * 2 == src_h);
+  // The upsample() supports widths up to 1920 * 2.
+  const int doubling = !halving && (dst_w == src_w * 2) &&
+                       (dst_h == src_h * 2) && (dst_w / 2 <= 1920);
+
+  if (halving) {
+    downsample_2_to_1_ssse3(src->y_buffer, src->y_stride,
+                            dst->y_buffer, dst->y_stride, dst_w, dst_h);
+    downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride,
+                            dst->u_buffer, dst->uv_stride, dst_uv_w, dst_uv_h);
+    downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride,
+                            dst->v_buffer, dst->uv_stride, dst_uv_w, dst_uv_h);
+  } else if (doubling) {
+    upsample_1_to_2_ssse3(src->y_buffer, src->y_stride,
+                          dst->y_buffer, dst->y_stride, dst_w, dst_h);
+    upsample_1_to_2_ssse3(src->u_buffer, src->uv_stride,
+                          dst->u_buffer, dst->uv_stride, dst_uv_w, dst_uv_h);
+    upsample_1_to_2_ssse3(src->v_buffer, src->uv_stride,
+                          dst->v_buffer, dst->uv_stride, dst_uv_w, dst_uv_h);
+  } else {
+    // Any other ratio, or a frame too wide for upsample(): use the C version.
+    vp9_scale_and_extend_frame_c(src, dst);
+    return;
+  }
+  vpx_extend_frame_borders(dst);
+}
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
new file mode 100644
index 0000000000..c245ccafa8
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
@@ -0,0 +1,71 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "vp9/common/vp9_common.h"
+
+int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bps) {
+  // Returns sum((coeff[i] - dqcoeff[i])^2) and writes sum(coeff[i]^2) to *ssz,
+  // each rounded and shifted right by 2 * (bps - 8); 8 values per iteration.
+  int i, j, test;
+  uint32_t temp[4];
+  __m128i cmp0, cmp1, cmp2, cmp3;
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bps - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+  const __m128i max = _mm_set1_epi32(0x3fff);   // loop-invariant range limits
+  const __m128i min = _mm_set1_epi32(-0x4000);  // same bits as 0xffffc000
+
+  for (i = 0; i < block_size; i+=8) {
+    // Load 8 values from each input with aligned 128-bit loads
+    __m128i mm_coeff = _mm_load_si128((__m128i*) (coeff + i));
+    __m128i mm_coeff2 = _mm_load_si128((__m128i*) (coeff + i + 4));
+    __m128i mm_dqcoeff = _mm_load_si128((__m128i*) (dqcoeff + i));
+    __m128i mm_dqcoeff2 = _mm_load_si128((__m128i*) (dqcoeff + i + 4));
+    // Flag every lane outside [min, max], i.e. needing more than 15 bits
+    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+            _mm_cmplt_epi32(mm_coeff, min));
+    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+            _mm_cmplt_epi32(mm_coeff2, min));
+    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+            _mm_cmplt_epi32(mm_dqcoeff, min));
+    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+            _mm_cmplt_epi32(mm_dqcoeff2, min));
+    test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1),
+            _mm_or_si128(cmp2, cmp3)));
+    if (!test) {  // all 16 values in range: pack to 16 bits, square with madd
+      __m128i mm_diff, error_sse2, sqcoeff_sse2;
+      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
+      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
+      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+      _mm_storeu_si128((__m128i*)temp, error_sse2);
+      error = error + temp[0] + temp[1] + temp[2] + temp[3];
+      _mm_storeu_si128((__m128i*)temp, sqcoeff_sse2);
+      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+    } else {  // at least one large value: scalar 64-bit arithmetic
+      for (j = 0; j < 8; j++) {
+        const int64_t diff = (int64_t)coeff[i + j] - dqcoeff[i + j];  // widen 1st
+        error +=  diff * diff;
+        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+      }
+    }
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;  // round to nearest, then scale down
+  sqcoeff = (sqcoeff + rounding) >> shift;
+  *ssz = sqcoeff;
+  return error;
+}
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm b/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm
new file mode 100644
index 0000000000..e476323e14
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm
@@ -0,0 +1,261 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+ALIGN 16
+
+; Returns sum((dqcoeff[i] - coeff[i])^2); stores sum(coeff[i]^2) in *ssz.
+; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
+;                                     intptr_t block_size, int64_t *ssz)
+; Inputs are saturated to int16 by packssdw and consumed 16 at a time.
+
+INIT_XMM avx
+cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
+  vzeroupper
+
+  ; If only one iteration is required, then handle this as a special case.
+  ; It is the most frequent case, so we can have a significant gain here
+  ; by not setting up a loop and accumulators.
+  cmp    sizeq, 16
+  jne   .generic
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Common case of size == 16
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+  ; Load input vectors, packing pairs of 4x32-bit loads into 8x16-bit words
+  mova      xm0, [dqcq]
+  packssdw  xm0, [dqcq+16]
+  mova      xm2, [uqcq]
+  packssdw  xm2, [uqcq+16]
+
+  mova      xm1, [dqcq+32]
+  packssdw  xm1, [dqcq+48]
+  mova      xm3, [uqcq+32]
+  packssdw  xm3, [uqcq+48]
+
+  ; Compute the errors (dqcoeff - coeff) in xm0/xm1.
+  psubw     xm0, xm2
+  psubw     xm1, xm3
+
+  ; Individual errors are max 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
+  pmaddwd   xm2, xm2            ; pairwise sums of squared coeff (ssz terms)
+  pmaddwd   xm3, xm3
+
+  pmaddwd   xm0, xm0            ; pairwise sums of squared errors
+  pmaddwd   xm1, xm1
+
+  ; Squares are always positive, so we can use unsigned arithmetic after
+  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
+  ; fit in 32bits
+  paddd     xm2, xm3
+  paddd     xm0, xm1
+
+  ; Accumulate horizontally in 64 bits, there is no chance of overflow here
+  pxor      xm5, xm5
+
+  pblendw   xm3, xm5, xm2, 0x33 ; Zero extended  low of a pair of 32 bits
+  psrlq     xm2, 32             ; Zero extended high of a pair of 32 bits
+
+  pblendw   xm1, xm5, xm0, 0x33 ; Zero extended  low of a pair of 32 bits
+  psrlq     xm0, 32             ; Zero extended high of a pair of 32 bits
+
+  paddq     xm2, xm3
+  paddq     xm0, xm1
+
+  psrldq    xm3, xm2, 8         ; move the high qword down to the low qword
+  psrldq    xm1, xm0, 8
+
+  paddq     xm2, xm3            ; low qword of xm2 = ssz total
+  paddq     xm0, xm1            ; low qword of xm0 = error total
+
+  ; Return the error total (rax, or edx:eax on x86-32) and store ssz
+%if ARCH_X86_64
+  movq      rax, xm0
+  movq   [sszq], xm2
+%else
+  movd      eax, xm0
+  pextrd    edx, xm0, 1
+  movq   [sszd], xm2
+%endif
+  RET
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Generic case of size != 16, speculative low precision
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ALIGN 16
+.generic:
+  pxor      xm4, xm4                ; sse accumulator
+  pxor      xm5, xm5                ; overflow detection register for xm4
+  pxor      xm6, xm6                ; ssz accumulator
+  pxor      xm7, xm7                ; overflow detection register for xm6
+  lea      uqcq, [uqcq+sizeq*4]     ; point just past the end of coeff
+  lea      dqcq, [dqcq+sizeq*4]     ; point just past the end of dqcoeff
+  neg     sizeq                     ; negative index that counts up to zero
+
+  ; Push the negative size as the high precision code might need it
+  push    sizeq
+
+.loop:
+  ; Load input vectors
+  mova      xm0, [dqcq+sizeq*4]
+  packssdw  xm0, [dqcq+sizeq*4+16]
+  mova      xm2, [uqcq+sizeq*4]
+  packssdw  xm2, [uqcq+sizeq*4+16]
+
+  mova      xm1, [dqcq+sizeq*4+32]
+  packssdw  xm1, [dqcq+sizeq*4+48]
+  mova      xm3, [uqcq+sizeq*4+32]
+  packssdw  xm3, [uqcq+sizeq*4+48]
+
+  add     sizeq, 16                 ; sets ZF for the jnz that ends the loop
+
+  ; Compute the squared errors.
+  ; Individual errors are max 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
+  psubw     xm0, xm2
+  pmaddwd   xm2, xm2
+  pmaddwd   xm0, xm0
+
+  psubw     xm1, xm3
+  pmaddwd   xm3, xm3
+  pmaddwd   xm1, xm1
+
+  ; Squares are always positive, so we can use unsigned arithmetic after
+  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
+  ; fit in 32bits
+  paddd     xm2, xm3
+  paddd     xm0, xm1
+
+  ; We accumulate using 32 bit arithmetic, but detect potential overflow
+  ; by checking if the MSB of the accumulators have ever been a set bit.
+  ; If yes, we redo the whole compute at the end on higher precision, but
+  ; this happens extremely rarely, so we still achieve a net gain.
+  paddd     xm4, xm0
+  paddd     xm6, xm2
+  por       xm5, xm4  ; OR in the accumulator for overflow detection
+  por       xm7, xm6  ; OR in the accumulator for overflow detection
+
+  jnz .loop           ; ZF still comes from "add sizeq, 16" above
+
+  ; Add pairs horizontally (still only on 32 bits)
+  phaddd    xm4, xm4
+  por       xm5, xm4  ; OR in the accumulator for overflow detection
+  phaddd    xm6, xm6
+  por       xm7, xm6  ; OR in the accumulator for overflow detection
+
+  ; Check for possibility of overflow by testing if bit 31 (the MSB) of any
+  ; dword lane has ever been set. If not, then there was no overflow and the
+  ; final sum will fit in 32 bits. If overflow happened, then
+  ; we redo the whole computation on higher precision.
+  por       xm7, xm5
+  pmovmskb   r4, xm7
+  test       r4, 0x8888 ; mask bits 3/7/11/15 = top byte of each dword lane
+  jnz .highprec
+
+  phaddd    xm4, xm4    ; low dword = total over all four lanes
+  phaddd    xm6, xm6
+  pmovzxdq  xm4, xm4    ; zero-extend the 32-bit totals to 64 bits
+  pmovzxdq  xm6, xm6
+
+  ; Restore stack (drops the negative size pushed before the loop)
+  pop     sizeq
+
+  ; Store the return value
+%if ARCH_X86_64
+  movq      rax, xm4
+  movq   [sszq], xm6
+%else
+  movd      eax, xm4
+  pextrd    edx, xm4, 1
+  movq   [sszd], xm6
+%endif
+  RET
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Generic case of size != 16, high precision case
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+.highprec:
+  pxor      xm4, xm4                 ; sse accumulator
+  pxor      xm5, xm5                 ; dedicated zero register
+  pxor      xm6, xm6                 ; ssz accumulator
+  pop     sizeq                      ; reload the negative size saved earlier
+
+.loophp:
+  mova      xm0, [dqcq+sizeq*4]
+  packssdw  xm0, [dqcq+sizeq*4+16]
+  mova      xm2, [uqcq+sizeq*4]
+  packssdw  xm2, [uqcq+sizeq*4+16]
+
+  mova      xm1, [dqcq+sizeq*4+32]
+  packssdw  xm1, [dqcq+sizeq*4+48]
+  mova      xm3, [uqcq+sizeq*4+32]
+  packssdw  xm3, [uqcq+sizeq*4+48]
+
+  add     sizeq, 16                  ; sets ZF for the jnz that ends the loop
+
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+
+  psubw     xm0, xm2
+  pmaddwd   xm2, xm2
+  pmaddwd   xm0, xm0
+
+  psubw     xm1, xm3
+  pmaddwd   xm3, xm3
+  pmaddwd   xm1, xm1
+
+  ; accumulate in 64bit (interleaving with zero widens each dword to a qword)
+  punpckldq xm7, xm0, xm5
+  punpckhdq xm0, xm5
+  paddq     xm4, xm7
+
+  punpckldq xm7, xm2, xm5
+  punpckhdq xm2, xm5
+  paddq     xm6, xm7
+
+  punpckldq xm7, xm1, xm5
+  punpckhdq xm1, xm5
+  paddq     xm4, xm7
+
+  punpckldq xm7, xm3, xm5
+  punpckhdq xm3, xm5
+  paddq     xm6, xm7
+
+  paddq     xm4, xm0
+  paddq     xm4, xm1
+  paddq     xm6, xm2
+  paddq     xm6, xm3
+
+  jnz .loophp                        ; ZF still comes from "add sizeq, 16"
+
+  ; Accumulate horizontally (add the high qword onto the low qword)
+  movhlps   xm5, xm4
+  movhlps   xm7, xm6
+  paddq     xm4, xm5
+  paddq     xm6, xm7
+
+  ; Store the return value
+%if ARCH_X86_64
+  movq      rax, xm4
+  movq   [sszq], xm6
+%else
+  movd      eax, xm4
+  pextrd    edx, xm4, 1
+  movq   [sszd], xm6
+%endif
+  RET
+
+END
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm b/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm
new file mode 100644
index 0000000000..f3b8f01947
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm
@@ -0,0 +1,98 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+ALIGN 16
+
+; Returns sum((dqcoeff[i] - coeff[i])^2); stores sum(coeff[i]^2) in *ssz.
+; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
+;                                     intptr_t block_size, int64_t *ssz)
+; Inputs are saturated to int16 by packssdw; sums are kept in 64-bit lanes.
+
+INIT_XMM sse2
+cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m6, m6                 ; ssz accumulator
+  pxor      m5, m5                 ; dedicated zero register
+  lea     uqcq, [uqcq+sizeq*4]     ; point just past the end of coeff
+  lea     dqcq, [dqcq+sizeq*4]     ; point just past the end of dqcoeff
+  neg    sizeq                     ; negative index that counts up to zero
+
+  ALIGN 16
+
+.loop:
+  mova      m0, [dqcq+sizeq*4]
+  packssdw  m0, [dqcq+sizeq*4+mmsize]
+  mova      m2, [uqcq+sizeq*4]
+  packssdw  m2, [uqcq+sizeq*4+mmsize]
+
+  mova      m1, [dqcq+sizeq*4+mmsize*2]
+  packssdw  m1, [dqcq+sizeq*4+mmsize*3]
+  mova      m3, [uqcq+sizeq*4+mmsize*2]
+  packssdw  m3, [uqcq+sizeq*4+mmsize*3]
+
+  add    sizeq, mmsize             ; four vectors consumed; sets ZF for jnz
+
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+
+  psubw     m0, m2                 ; m0 = dqcoeff - coeff
+  pmaddwd   m2, m2                 ; pairwise sums of squared coeff
+  pmaddwd   m0, m0                 ; pairwise sums of squared errors
+
+  psubw     m1, m3
+  pmaddwd   m3, m3
+  pmaddwd   m1, m1
+
+  ; accumulate in 64bit (interleaving with zero widens each dword to a qword)
+  punpckldq m7, m0, m5
+  punpckhdq m0, m5
+  paddq     m4, m7
+
+  punpckldq m7, m2, m5
+  punpckhdq m2, m5
+  paddq     m6, m7
+
+  punpckldq m7, m1, m5
+  punpckhdq m1, m5
+  paddq     m4, m7
+
+  punpckldq m7, m3, m5
+  punpckhdq m3, m5
+  paddq     m6, m7
+
+  paddq     m4, m0
+  paddq     m4, m1
+  paddq     m6, m2
+  paddq     m6, m3
+
+  jnz .loop                        ; ZF still comes from "add sizeq, mmsize"
+
+  ; accumulate horizontally and store in return value
+  movhlps   m5, m4
+  movhlps   m7, m6
+  paddq     m4, m5
+  paddq     m6, m7
+
+%if ARCH_X86_64
+  movq    rax, m4
+  movq [sszq], m6
+%else
+  mov     eax, sszm                ; ssz was not loaded by cglobal; fetch it now
+  pshufd   m5, m4, 0x1             ; dword 1 of the total -> dword 0 of m5
+  movq  [eax], m6
+  movd    eax, m4                  ; low 32 bits of the return value
+  movd    edx, m5                  ; high 32 bits of the return value
+%endif
+  RET
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
new file mode 100644
index 0000000000..2071dfe3c9
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -0,0 +1,211 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
+                          int skip_block, const int16_t* zbin_ptr,
+                          const int16_t* round_ptr, const int16_t* quant_ptr,
+                          const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+                          int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                          uint16_t* eob_ptr,
+                          const int16_t* scan_ptr,
+                          const int16_t* iscan_ptr) {
+  // qcoeff = sign(c) * (((|c| + round) * quant) >> 16), dqcoeff = qcoeff *
+  // dequant (low 16 bits); *eob_ptr = max(iscan + 1) over nonzero dqcoeff.
+  __m128i zero, thr;
+  int nzflag;  // holds a 16-bit movemask; int avoids a narrowing conversion
+  (void)scan_ptr;  // scan_ptr, zbin_ptr and quant_shift_ptr are not used
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  coeff_ptr += n_coeffs;
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;  // pointers sit past the end; index counts up to 0
+  zero = _mm_setzero_si128();
+
+  if (!skip_block) {
+    __m128i eob;
+    __m128i round, quant, dequant;
+    {
+      __m128i coeff0, coeff1;
+
+      // Setup global values (aligned loads of 8 int16 each)
+      {
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        // Do DC and first 15 AC
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract: abs(x) = (x ^ sign) - sign
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        // After the first vector, the high halves of round/quant/dequant are
+        // broadcast and used for every remaining coefficient.
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob (coeff0/coeff1 now hold the dequantized values)
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts (mask lanes are -1)
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    thr = _mm_srai_epi16(dequant, 1);  // skip threshold: broadcast dequant / 2
+
+    // AC only loop: 16 coefficients per iteration
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+            _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+        if (nzflag) {  // at least one |coeff| exceeds thr: quantize the group
+          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+          // Reinsert signs
+          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        } else {  // whole group is at or below thr: write zeros directly
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+        }
+      }
+
+      if (nzflag) {
+        // Scan for eob (a zeroed group cannot raise it, so it is skipped)
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // Accumulate EOB: horizontal max over the 8 lanes, result lands in lane 1
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);
+    }
+  } else {  // skip_block: zero both outputs and report eob 0
+    do {
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/libs/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000000..ec61c0c3a7
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -0,0 +1,201 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro QUANTIZE_FP 2 ; args: name suffix (fp / fp_32x32), number of GPRs
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, \
+                                eob, scan, iscan
+  cmp                    dword skipm, 0
+  jne .blank                                      ; skip set: only zero outputs
+
+  ; actual quantize loop - setup pointers, rounders, etc.
+  movifnidn                   coeffq, coeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, dequantmp
+  movifnidn                    zbinq, zbinmp
+  movifnidn                   roundq, roundmp
+  movifnidn                   quantq, quantmp
+  mova                            m1, [roundq]             ; m1 = round
+  mova                            m2, [quantq]             ; m2 = quant
+%ifidn %1, fp_32x32
+  pcmpeqw                         m5, m5
+  psrlw                           m5, 15                   ; m5 = 1 in each word
+  paddw                           m1, m5
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
+  mova                            m3, [r2q]                ; m3 = dequant
+  mov                             r3, qcoeffmp
+  mov                             r4, dqcoeffmp
+  mov                             r5, iscanmp
+%ifidn %1, fp_32x32
+  psllw                           m2, 1                    ; m2 = quant * 2
+%endif
+  pxor                            m5, m5                   ; m5 = dedicated zero
+
+  ; point every array past its end; ncoeff becomes a negative running index
+  lea                         coeffq, [  coeffq+ncoeffq*2]
+  lea                            r5q, [  r5q+ncoeffq*2]
+  lea                            r3q, [ r3q+ncoeffq*2]
+  lea                            r4q, [r4q+ncoeffq*2]
+  neg                        ncoeffq
+
+  ; get DC and first 15 AC coeffs
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpeqw                         m7, m7                   ; m7 = -1 per word
+
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1                   ; keep high half only
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2                   ; keep high half only
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  mova            [r3q+ncoeffq*2+ 0], m8
+  mova            [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
+  pmullw                          m8, m3                   ; r4[i] = r3[i] * q
+  punpckhqdq                      m3, m3                   ; keep high half only
+  pmullw                         m13, m3                   ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+  psrlw                           m0, m3, 2                ; m0 = dequant / 4
+%else
+  psrlw                           m0, m3, 1                ; m0 = dequant / 2
+%endif
+  mova            [r4q+ncoeffq*2+ 0], m8
+  mova            [r4q+ncoeffq*2+16], m13
+  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = iscan[i]
+  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = iscan[i]
+  psubw                           m6, m7                   ; m6 = iscan[i] + 1
+  psubw                          m11, m7                   ; m11 = iscan[i] + 1
+  pandn                           m8, m6                   ; m8 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jz .accumulate_eob
+
+.ac_only_loop:
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+
+  pcmpgtw                         m7, m6,  m0              ; abs > threshold?
+  pcmpgtw                        m12, m11, m0
+  pmovmskb                       r6d, m7
+  pmovmskb                       r2d, m12
+
+  or                              r6, r2
+  jz .skip_iter                                   ; nothing above threshold
+
+  pcmpeqw                         m7, m7                   ; m7 = -1 per word
+
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  psignw                         m14, m9                   ; m14 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  mova            [r3q+ncoeffq*2+ 0], m14
+  mova            [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
+  pmullw                         m14, m3                   ; r4[i] = r3[i] * q
+  pmullw                         m13, m3                   ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
+  mova            [r4q+ncoeffq*2+ 0], m14
+  mova            [r4q+ncoeffq*2+16], m13
+  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = iscan[i]
+  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = iscan[i]
+  psubw                           m6, m7                   ; m6 = iscan[i] + 1
+  psubw                          m11, m7                   ; m11 = iscan[i] + 1
+  pandn                          m14, m6                   ; m14 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m14
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+
+  jmp .accumulate_eob
+.skip_iter:
+  mova            [r3q+ncoeffq*2+ 0], m5          ; zero 16 qcoeff
+  mova            [r3q+ncoeffq*2+16], m5
+  mova            [r4q+ncoeffq*2+ 0], m5          ; zero 16 dqcoeff
+  mova            [r4q+ncoeffq*2+16], m5
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+
+.accumulate_eob:
+  ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  pextrw                          r6, m8, 0
+  mov                           [r2], r6w         ; eob is 16 bits: store a word
+  RET
+
+  ; skip-block, i.e. just write all zeroes
+.blank:
+  mov                             r0, dqcoeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, qcoeffmp
+  mov                             r3, eobmp
+
+  lea                            r0q, [r0q+ncoeffq*2]
+  lea                            r2q, [r2q+ncoeffq*2]
+  neg                        ncoeffq
+  pxor                            m7, m7
+.blank_loop:
+  mova            [r0q+ncoeffq*2+ 0], m7
+  mova            [r0q+ncoeffq*2+16], m7
+  mova            [r2q+ncoeffq*2+ 0], m7
+  mova            [r2q+ncoeffq*2+16], m7
+  add                        ncoeffq, mmsize
+  jl .blank_loop
+  mov                     word [r3q], 0
+  RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FP fp, 7
+QUANTIZE_FP fp_32x32, 7
diff --git a/libs/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/libs/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
new file mode 100644
index 0000000000..21aaa93831
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
@@ -0,0 +1,212 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp9_temporal_filter_apply_sse2 | arg
+;  (unsigned char  *frame1,           |  0
+;   unsigned int    stride,           |  1
+;   unsigned char  *frame2,           |  2
+;   unsigned int    block_width,      |  3
+;   unsigned int    block_height,     |  4
+;   int             strength,         |  5
+;   int             filter_weight,    |  6
+;   unsigned int   *accumulator,      |  7
+;   unsigned short *count)            |  8
+global sym(vp9_temporal_filter_apply_sse2) PRIVATE
+sym(vp9_temporal_filter_apply_sse2):
+    ; Per 16 pixels (16-bit lanes): w = filter_weight * sat_sub(16, (3*(src-pred)^2 + rnd) >> strength); count += w; accumulator += w * pred
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ALIGN_STACK 16, rax
+    %define block_width    0
+    %define block_height  16
+    %define strength      32
+    %define filter_weight 48
+    %define rounding_bit  64
+    %define rbp_backup    80
+    %define stack_size    96
+    sub         rsp,           stack_size
+    mov         [rsp + rbp_backup], rbp ; rbp is reused for stride below and reloaded in the epilog
+    ; end prolog
+
+        mov         edx,            arg(3) ; block_width
+        mov         [rsp + block_width], rdx
+        mov         edx,            arg(4) ; block_height
+        mov         [rsp + block_height], rdx
+        movd        xmm6,           arg(5) ; strength
+        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+        ; calculate the rounding bit outside the loop
+        ; 0x8000 >> (16 - strength)
+        mov         rdx,            16
+        sub         rdx,            arg(5) ; 16 - strength
+        movq        xmm4,           rdx    ; can't use rdx w/ shift
+        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
+        psrlw       xmm5,           xmm4
+        movdqa      [rsp + rounding_bit], xmm5
+
+        mov         rsi,            arg(0) ; src/frame1
+        mov         rdx,            arg(2) ; predictor frame
+        mov         rdi,            arg(7) ; accumulator
+        mov         rax,            arg(8) ; count
+
+        ; dup the filter weight and store for later
+        movd        xmm0,           arg(6) ; filter_weight
+        pshuflw     xmm0,           xmm0, 0
+        punpcklwd   xmm0,           xmm0
+        movdqa      [rsp + filter_weight], xmm0
+
+        mov         rbp,            arg(1) ; stride
+        pxor        xmm7,           xmm7   ; zero for extraction
+
+        mov         rcx,            [rsp + block_width]
+        imul        rcx,            [rsp + block_height] ; rcx = block_width * block_height
+        add         rcx,            rdx    ; rcx = predictor end pointer, used as the loop exit test
+        cmp         dword ptr [rsp + block_width], 8
+        jne         .temporal_filter_apply_load_16 ; first iteration: any width != 8 takes the 16-wide load
+
+.temporal_filter_apply_load_8:
+        movq        xmm0,           [rsi]  ; first row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        movq        xmm1,           [rsi]  ; second row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
+        jmp         .temporal_filter_apply_load_finished
+
+.temporal_filter_apply_load_16:
+        movdqa      xmm0,           [rsi]  ; src (frame1)
+        lea         rsi,            [rsi + rbp] ; += stride
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
+
+.temporal_filter_apply_load_finished:
+        movdqa      xmm2,           [rdx]  ; predictor (frame2)
+        movdqa      xmm3,           xmm2
+        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
+
+        ; modifier = src_byte - pixel_value
+        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
+        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
+
+        ; modifier *= modifier
+        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
+        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2
+
+        ; modifier *= 3
+        pmullw      xmm0,           [GLOBAL(_const_3w)]
+        pmullw      xmm1,           [GLOBAL(_const_3w)]
+
+        ; modifier += 0x8000 >> (16 - strength)
+        paddw       xmm0,           [rsp + rounding_bit]
+        paddw       xmm1,           [rsp + rounding_bit]
+
+        ; modifier >>= strength
+        psrlw       xmm0,           [rsp + strength]
+        psrlw       xmm1,           [rsp + strength]
+
+        ; modifier = 16 - modifier
+        ; saturation takes care of modifier > 16
+        movdqa      xmm3,           [GLOBAL(_const_16w)]
+        movdqa      xmm2,           [GLOBAL(_const_16w)]
+        psubusw     xmm3,           xmm1
+        psubusw     xmm2,           xmm0
+
+        ; modifier *= filter_weight
+        pmullw      xmm2,           [rsp + filter_weight]
+        pmullw      xmm3,           [rsp + filter_weight]
+
+        ; count
+        movdqa      xmm4,           [rax]
+        movdqa      xmm5,           [rax+16]
+        ; += modifier
+        paddw       xmm4,           xmm2
+        paddw       xmm5,           xmm3
+        ; write back
+        movdqa      [rax],          xmm4
+        movdqa      [rax+16],       xmm5
+        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
+
+        ; load and extract the predictor up to shorts
+        pxor        xmm7,           xmm7   ; zero for extraction
+        movdqa      xmm0,           [rdx]
+        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
+
+        ; modifier *= pixel_value
+        pmullw      xmm0,           xmm2
+        pmullw      xmm1,           xmm3
+
+        ; expand to double words
+        movdqa      xmm2,           xmm0
+        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
+        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
+        movdqa      xmm3,           xmm1
+        punpcklwd   xmm1,           xmm7   ; [ 8-11]
+        punpckhwd   xmm3,           xmm7   ; [12-15]
+
+        ; accumulator
+        movdqa      xmm4,           [rdi]
+        movdqa      xmm5,           [rdi+16]
+        movdqa      xmm6,           [rdi+32]
+        movdqa      xmm7,           [rdi+48] ; xmm7 stops being zero here; it is cleared again before looping
+        ; += modifier
+        paddd       xmm4,           xmm0
+        paddd       xmm5,           xmm2
+        paddd       xmm6,           xmm1
+        paddd       xmm7,           xmm3
+        ; write back
+        movdqa      [rdi],          xmm4
+        movdqa      [rdi+16],       xmm5
+        movdqa      [rdi+32],       xmm6
+        movdqa      [rdi+48],       xmm7
+        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+        cmp         rdx,            rcx    ; predictor pointer reached the end computed above?
+        je          .temporal_filter_apply_epilog
+        pxor        xmm7,           xmm7   ; zero for extraction
+        cmp         dword ptr [rsp + block_width], 16
+        je          .temporal_filter_apply_load_16 ; later iterations: only width == 16 takes the 16-wide load
+        jmp         .temporal_filter_apply_load_8
+
+.temporal_filter_apply_epilog:
+    ; begin epilog
+    mov         rbp,            [rsp + rbp_backup]
+    add         rsp,            stack_size
+    pop         rsp ; NOTE(review): presumably restores the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+_const_3w:
+    times 8 dw 3       ; eight 16-bit lanes of 3, the "modifier *= 3" factor
+align 16
+_const_top_bit:
+    times 8 dw 1<<15   ; eight lanes of 0x8000, shifted right to form the rounding bit
+align 16
+_const_16w:
+    times 8 dw 16      ; eight lanes of 16, the minuend of "modifier = 16 - modifier"
diff --git a/libs/libvpx/vp9/exports_dec b/libs/libvpx/vp9/exports_dec
new file mode 100644
index 0000000000..0a61fde398
--- /dev/null
+++ b/libs/libvpx/vp9/exports_dec
@@ -0,0 +1,2 @@
+data vpx_codec_vp9_dx_algo
+text vpx_codec_vp9_dx
diff --git a/libs/libvpx/vp9/exports_enc b/libs/libvpx/vp9/exports_enc
new file mode 100644
index 0000000000..2a0fef3eaf
--- /dev/null
+++ b/libs/libvpx/vp9/exports_enc
@@ -0,0 +1,2 @@
+data vpx_codec_vp9_cx_algo
+text vpx_codec_vp9_cx
diff --git a/libs/libvpx/vp9/vp9_common.mk b/libs/libvpx/vp9/vp9_common.mk
new file mode 100644
index 0000000000..d0135c6f8d
--- /dev/null
+++ b/libs/libvpx/vp9/vp9_common.mk
@@ -0,0 +1,95 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+# Lines suffixed with -$(FLAG) reach VP9_COMMON_SRCS-yes only when FLAG expands to "yes".
+VP9_COMMON_SRCS-yes += vp9_common.mk
+VP9_COMMON_SRCS-yes += vp9_iface_common.h
+VP9_COMMON_SRCS-yes += common/vp9_ppflags.h
+VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c
+VP9_COMMON_SRCS-yes += common/vp9_blockd.c
+VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c
+VP9_COMMON_SRCS-yes += common/vp9_entropy.c
+VP9_COMMON_SRCS-yes += common/vp9_entropymode.c
+VP9_COMMON_SRCS-yes += common/vp9_entropymv.c
+VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.c
+VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.h
+VP9_COMMON_SRCS-yes += common/vp9_idct.c
+VP9_COMMON_SRCS-yes += common/vp9_alloccommon.h
+VP9_COMMON_SRCS-yes += common/vp9_blockd.h
+VP9_COMMON_SRCS-yes += common/vp9_common.h
+VP9_COMMON_SRCS-yes += common/vp9_entropy.h
+VP9_COMMON_SRCS-yes += common/vp9_entropymode.h
+VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
+VP9_COMMON_SRCS-yes += common/vp9_enums.h
+VP9_COMMON_SRCS-yes += common/vp9_filter.h
+VP9_COMMON_SRCS-yes += common/vp9_filter.c
+VP9_COMMON_SRCS-yes += common/vp9_idct.h
+VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
+VP9_COMMON_SRCS-yes += common/vp9_thread_common.h
+VP9_COMMON_SRCS-yes += common/vp9_mv.h
+VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h
+VP9_COMMON_SRCS-yes += common/vp9_pred_common.h
+VP9_COMMON_SRCS-yes += common/vp9_pred_common.c
+VP9_COMMON_SRCS-yes += common/vp9_quant_common.h
+VP9_COMMON_SRCS-yes += common/vp9_reconinter.h
+VP9_COMMON_SRCS-yes += common/vp9_reconintra.h
+VP9_COMMON_SRCS-yes += common/vp9_rtcd.c
+VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.pl
+VP9_COMMON_SRCS-yes += common/vp9_scale.h
+VP9_COMMON_SRCS-yes += common/vp9_scale.c
+VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
+VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
+VP9_COMMON_SRCS-yes += common/vp9_textblit.h
+VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
+VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
+VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
+VP9_COMMON_SRCS-yes += common/vp9_thread_common.c
+VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
+VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
+VP9_COMMON_SRCS-yes += common/vp9_quant_common.c
+VP9_COMMON_SRCS-yes += common/vp9_reconinter.c
+VP9_COMMON_SRCS-yes += common/vp9_reconintra.c
+VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
+VP9_COMMON_SRCS-yes += common/vp9_common_data.c
+VP9_COMMON_SRCS-yes += common/vp9_common_data.h
+VP9_COMMON_SRCS-yes += common/vp9_scan.c
+VP9_COMMON_SRCS-yes += common/vp9_scan.h
+# Post-processing sources, gated on CONFIG_VP9_POSTPROC (and HAVE_SSE2 for the .asm files).
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
+endif
+# The DSPR2 inverse-transform files are listed only when CONFIG_VP9_HIGHBITDEPTH is not "yes".
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans4_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans8_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans16_dspr2.c
+endif
+
+# common (msa): files under common/mips/msa, gated on HAVE_MSA
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
+
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
+endif
+
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+endif
+# Instantiates rtcd_h_template (defined outside this file) for vp9_rtcd from vp9_rtcd_defs.pl.
+$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
diff --git a/libs/libvpx/vp9/vp9_cx_iface.c b/libs/libvpx/vp9/vp9_cx_iface.c
new file mode 100644
index 0000000000..db7f537a69
--- /dev/null
+++ b/libs/libvpx/vp9/vp9_cx_iface.c
@@ -0,0 +1,1604 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_ports/vpx_once.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "./vpx_version.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx/vp8cx.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/vp9_iface_common.h"
+
+struct vp9_extracfg {
+  int cpu_used;  // available cpu percentage in 1/16; validated to -8..8
+  unsigned int enable_auto_alt_ref;  // validated to 0..2
+  unsigned int noise_sensitivity;  // validated to <= 6
+  unsigned int sharpness;  // validated to <= 7
+  unsigned int static_thresh;  // copied to oxcf->encode_breakout
+  unsigned int tile_columns;  // validated to 0..6
+  unsigned int tile_rows;  // validated to 0..2
+  unsigned int arnr_max_frames;  // validated to 0..15
+  unsigned int arnr_strength;  // validated to <= 6
+  unsigned int min_gf_interval;  // 0 -> default decision
+  unsigned int max_gf_interval;  // 0 -> default decision
+  vp8e_tuning tuning;  // VP8_TUNE_SSIM is rejected by validate_config
+  unsigned int cq_level;  // constrained quality level; validated to 0..63
+  unsigned int rc_max_intra_bitrate_pct;
+  unsigned int rc_max_inter_bitrate_pct;
+  unsigned int gf_cbr_boost_pct;
+  unsigned int lossless;  // validated as a boolean
+  unsigned int frame_parallel_decoding_mode;
+  AQ_MODE aq_mode;  // validated to 0..AQ_MODE_COUNT - 1
+  unsigned int frame_periodic_boost;  // validated to 0..1
+  vpx_bit_depth_t bit_depth;
+  vp9e_tune_content content;
+  vpx_color_space_t color_space;
+  vpx_color_range_t color_range;
+  int render_width;
+  int render_height;
+};
+
+static struct vp9_extracfg default_extra_cfg = {
+  0,                          // cpu_used
+  1,                          // enable_auto_alt_ref
+  0,                          // noise_sensitivity
+  0,                          // sharpness
+  0,                          // static_thresh
+  6,                          // tile_columns
+  0,                          // tile_rows
+  7,                          // arnr_max_frames
+  5,                          // arnr_strength
+  0,                          // min_gf_interval; 0 -> default decision
+  0,                          // max_gf_interval; 0 -> default decision
+  VP8_TUNE_PSNR,              // tuning
+  10,                         // cq_level
+  0,                          // rc_max_intra_bitrate_pct
+  0,                          // rc_max_inter_bitrate_pct
+  0,                          // gf_cbr_boost_pct
+  0,                          // lossless
+  1,                          // frame_parallel_decoding_mode
+  NO_AQ,                      // aq_mode
+  0,                          // frame_periodic_boost
+  VPX_BITS_8,                 // bit_depth
+  VP9E_CONTENT_DEFAULT,       // content
+  VPX_CS_UNKNOWN,             // color_space
+  0,                          // color_range; NOTE(review): presumably VPX_CR_STUDIO_RANGE - confirm
+  0,                          // render_width
+  0,                          // render_height
+};
+
+struct vpx_codec_alg_priv {
+  vpx_codec_priv_t base;  // first member; ERROR() reports through base.err_detail
+  vpx_codec_enc_cfg_t cfg;
+  struct vp9_extracfg extra_cfg;
+  VP9EncoderConfig oxcf;
+  VP9_COMP *cpi;
+  unsigned char *cx_data;
+  size_t cx_data_sz;
+  unsigned char *pending_cx_data;
+  size_t pending_cx_data_sz;
+  int pending_frame_count;
+  size_t pending_frame_sizes[8];
+  size_t pending_frame_magnitude;
+  vpx_image_t preview_img;
+  vpx_enc_frame_flags_t next_frame_flags;
+  vp8_postproc_cfg_t preview_ppcfg;
+  vpx_codec_pkt_list_decl(256) pkt_list;
+  unsigned int fixed_kf_cntr;
+  vpx_codec_priv_output_cx_pkt_cb_pair_t output_cx_pkt_cb;
+  // BufferPool that holds all reference frames.
+  BufferPool *buffer_pool;
+};
+
+static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
+  // Translate the VP8-named reference id used by the public API into the
+  // matching VP9 reference flag.
+  if (frame == VP8_LAST_FRAME) return VP9_LAST_FLAG;
+  if (frame == VP8_GOLD_FRAME) return VP9_GOLD_FLAG;
+  if (frame == VP8_ALTR_FRAME) return VP9_ALT_FLAG;
+
+  // Any other value trips the assertion; when assertions are compiled out
+  // the LAST flag is returned instead.
+  assert(0 && "Invalid Reference Frame");
+  return VP9_LAST_FLAG;
+}
+
+static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
+    const struct vpx_internal_error_info *error) {
+  // Success leaves ctx untouched.
+  if (error->error_code == VPX_CODEC_OK) return VPX_CODEC_OK;
+
+  // Failure: publish the detail text, or NULL when none was recorded.
+  ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+  return error->error_code;
+}
+
+// Validation helpers; each needs a local `ctx` and returns from the caller.
+#undef ERROR
+#define ERROR(str) do {\
+    ctx->base.err_detail = (str);\
+    return VPX_CODEC_INVALID_PARAM;\
+  } while (0)
+// Inclusive [lo, hi] check, spelled "== lo || > lo" rather than ">= lo".
+#define RANGE_CHECK(p, memb, lo, hi) do {\
+    if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \
+      ERROR(#memb " out of range ["#lo".."#hi"]");\
+  } while (0)
+// Upper bound only: fails when memb > hi.
+#define RANGE_CHECK_HI(p, memb, hi) do {\
+    if (!((p)->memb <= (hi))) \
+      ERROR(#memb " out of range [.."#hi"]");\
+  } while (0)
+// Lower bound only: fails when memb < lo.
+#define RANGE_CHECK_LO(p, memb, lo) do {\
+    if (!((p)->memb >= (lo))) \
+      ERROR(#memb " out of range ["#lo"..]");\
+  } while (0)
+// Fails unless memb is exactly 0 or 1.
+#define RANGE_CHECK_BOOL(p, memb) do {\
+    if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+  } while (0)
+
+static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
+                                       const vpx_codec_enc_cfg_t *cfg,
+                                       const struct vp9_extracfg *extra_cfg) {
+  RANGE_CHECK(cfg, g_w,                   1, 65535);  // 16 bits available
+  RANGE_CHECK(cfg, g_h,                   1, 65535);  // 16 bits available
+  RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
+  RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
+  RANGE_CHECK_HI(cfg, g_profile,          3);
+
+  RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
+  RANGE_CHECK_HI(cfg, rc_min_quantizer,   cfg->rc_max_quantizer);
+  RANGE_CHECK_BOOL(extra_cfg, lossless);
+  RANGE_CHECK(extra_cfg, aq_mode,           0, AQ_MODE_COUNT - 1);
+  RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1);
+  RANGE_CHECK_HI(cfg, g_threads,          64);
+  RANGE_CHECK_HI(cfg, g_lag_in_frames,    MAX_LAG_BUFFERS);
+  RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_Q);
+  RANGE_CHECK_HI(cfg, rc_undershoot_pct,  100);
+  RANGE_CHECK_HI(cfg, rc_overshoot_pct,   100);
+  RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+  RANGE_CHECK(cfg, kf_mode,               VPX_KF_DISABLED, VPX_KF_AUTO);
+  RANGE_CHECK_BOOL(cfg,                   rc_resize_allowed);
+  RANGE_CHECK_HI(cfg, rc_dropframe_thresh,   100);
+  RANGE_CHECK_HI(cfg, rc_resize_up_thresh,   100);
+  RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
+  RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
+  RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
+  RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
+  if (extra_cfg->max_gf_interval > 0) {
+    RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1));
+  }
+  if (extra_cfg->min_gf_interval > 0 && extra_cfg->max_gf_interval > 0) {
+    RANGE_CHECK(extra_cfg, max_gf_interval, extra_cfg->min_gf_interval,
+      (MAX_LAG_BUFFERS - 1));
+  }
+
+  if (cfg->rc_resize_allowed == 1) {
+    RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w);
+    RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h);
+  }
+
+  RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS);
+  RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS);
+
+  if (cfg->ss_number_layers * cfg->ts_number_layers > VPX_MAX_LAYERS)
+    ERROR("ss_number_layers * ts_number_layers is out of range");
+  if (cfg->ts_number_layers > 1) {
+    unsigned int sl, tl;
+    for (sl = 1; sl < cfg->ss_number_layers; ++sl) {
+      for (tl = 1; tl < cfg->ts_number_layers; ++tl) {
+        const int layer =
+            LAYER_IDS_TO_IDX(sl, tl, cfg->ts_number_layers);
+        if (cfg->layer_target_bitrate[layer] <
+            cfg->layer_target_bitrate[layer - 1])
+        ERROR("ts_target_bitrate entries are not increasing");
+      }
+    }
+
+    RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers - 1], 1, 1);
+    for (tl = cfg->ts_number_layers - 2; tl > 0; --tl)
+      if (cfg->ts_rate_decimator[tl - 1] != 2 * cfg->ts_rate_decimator[tl])
+        ERROR("ts_rate_decimator factors are not powers of 2");
+  }
+
+#if CONFIG_SPATIAL_SVC
+
+  if ((cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) &&
+      cfg->g_pass == VPX_RC_LAST_PASS) {
+    unsigned int i, alt_ref_sum = 0;
+    for (i = 0; i < cfg->ss_number_layers; ++i) {
+      if (cfg->ss_enable_auto_alt_ref[i])
+        ++alt_ref_sum;
+    }
+    if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers)
+      ERROR("Not enough ref buffers for svc alt ref frames");
+    if (cfg->ss_number_layers * cfg->ts_number_layers > 3 &&
+        cfg->g_error_resilient == 0)
+    ERROR("Multiple frame context are not supported for more than 3 layers");
+  }
+#endif
+
+  // VP9 does not support a lower bound on the keyframe interval in
+  // automatic keyframe placement mode.
+  if (cfg->kf_mode != VPX_KF_DISABLED &&
+      cfg->kf_min_dist != cfg->kf_max_dist &&
+      cfg->kf_min_dist > 0)
+    ERROR("kf_min_dist not supported in auto mode, use 0 "
+          "or kf_max_dist instead.");
+
+  RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
+  RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
+  RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
+  RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
+  RANGE_CHECK(extra_cfg, tile_rows, 0, 2);
+  RANGE_CHECK_HI(extra_cfg, sharpness, 7);
+  RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15);
+  RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
+  RANGE_CHECK(extra_cfg, cq_level, 0, 63);
+  RANGE_CHECK(cfg, g_bit_depth, VPX_BITS_8, VPX_BITS_12);
+  RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
+  RANGE_CHECK(extra_cfg, content,
+              VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1);
+
+  // TODO(yaowu): remove this when ssim tuning is implemented for vp9
+  if (extra_cfg->tuning == VP8_TUNE_SSIM)
+      ERROR("Option --tune=ssim is not currently supported in VP9.");
+
+  if (cfg->g_pass == VPX_RC_LAST_PASS) {
+    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
+    const FIRSTPASS_STATS *stats;
+
+    if (cfg->rc_twopass_stats_in.buf == NULL)
+      ERROR("rc_twopass_stats_in.buf not set.");
+
+    if (cfg->rc_twopass_stats_in.sz % packet_sz)
+      ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
+
+    if (cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) {
+      int i;
+      unsigned int n_packets_per_layer[VPX_SS_MAX_LAYERS] = {0};
+
+      stats = cfg->rc_twopass_stats_in.buf;
+      for (i = 0; i < n_packets; ++i) {
+        const int layer_id = (int)stats[i].spatial_layer_id;
+        if (layer_id >= 0 && layer_id < (int)cfg->ss_number_layers) {
+          ++n_packets_per_layer[layer_id];
+        }
+      }
+
+      for (i = 0; i < (int)cfg->ss_number_layers; ++i) {
+        unsigned int layer_id;
+        if (n_packets_per_layer[i] < 2) {
+          ERROR("rc_twopass_stats_in requires at least two packets for each "
+                "layer.");
+        }
+
+        stats = (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf +
+                n_packets - cfg->ss_number_layers + i;
+        layer_id = (int)stats->spatial_layer_id;
+
+        if (layer_id >= cfg->ss_number_layers
+            ||(unsigned int)(stats->count + 0.5) !=
+               n_packets_per_layer[layer_id] - 1)
+          ERROR("rc_twopass_stats_in missing EOS stats packet");
+      }
+    } else {
+      if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
+        ERROR("rc_twopass_stats_in requires at least two packets.");
+
+      stats =
+          (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + n_packets - 1;
+
+      if ((int)(stats->count + 0.5) != n_packets - 1)
+        ERROR("rc_twopass_stats_in missing EOS stats packet");
+    }
+  }
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+  if (cfg->g_profile > (unsigned int)PROFILE_1) {
+    ERROR("Profile > 1 not supported in this build configuration");
+  }
+#endif
+  if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+      cfg->g_bit_depth > VPX_BITS_8) {
+    ERROR("Codec high bit-depth not supported in profile < 2");
+  }
+  if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+      cfg->g_input_bit_depth > 8) {
+    ERROR("Source high bit-depth not supported in profile < 2");
+  }
+  if (cfg->g_profile > (unsigned int)PROFILE_1 &&
+      cfg->g_bit_depth == VPX_BITS_8) {
+    ERROR("Codec bit-depth 8 not supported in profile > 1");
+  }
+  RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB);
+  RANGE_CHECK(extra_cfg, color_range,
+              VPX_CR_STUDIO_RANGE, VPX_CR_FULL_RANGE);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
+                                    const vpx_image_t *img) {
+  // Format must be legal for the profile; size must match the init config.
+  const unsigned int profile = ctx->cfg.g_profile;
+  const int is_profile_1 = profile == (unsigned int)PROFILE_1;
+  const int is_profile_3 = profile == (unsigned int)PROFILE_3;
+  switch (img->fmt) {
+    case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I42016:
+      break;  // accepted without a profile test
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
+    case VPX_IMG_FMT_I440:
+      if (!is_profile_1)
+        ERROR("Invalid image format. I422, I444, I440 images are "
+              "not supported in profile.");
+      break;
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44416:
+    case VPX_IMG_FMT_I44016:
+      if (!(is_profile_1 || is_profile_3))
+        ERROR("Invalid image format. 16-bit I422, I444, I440 images are "
+              "not supported in profile.");
+      break;
+    default:
+      ERROR("Invalid image format. Only YV12, I420, I422, I444 images are "
+            "supported.");
+      break;
+  }
+
+  if (!(img->d_w == ctx->cfg.g_w && img->d_h == ctx->cfg.g_h))
+    ERROR("Image size must match encoder init configuration size");
+  return VPX_CODEC_OK;
+}
+
+static int get_image_bps(const vpx_image_t *img) {
+  // Returns the bps figure for img->fmt; formats sharing a value are paired.
+  if (img->fmt == VPX_IMG_FMT_YV12 || img->fmt == VPX_IMG_FMT_I420)
+    return 12;
+  if (img->fmt == VPX_IMG_FMT_I422 || img->fmt == VPX_IMG_FMT_I440)
+    return 16;
+  if (img->fmt == VPX_IMG_FMT_I444 || img->fmt == VPX_IMG_FMT_I42016)
+    return 24;
+  if (img->fmt == VPX_IMG_FMT_I42216 || img->fmt == VPX_IMG_FMT_I44016)
+    return 32;
+  if (img->fmt == VPX_IMG_FMT_I44416) return 48;
+  // Unlisted format: trips the assertion, or yields 0 when asserts are off.
+  assert(0 && "Invalid image format");
+  return 0;
+}
+
+static vpx_codec_err_t set_encoder_config(
+  VP9EncoderConfig *oxcf,
+  const vpx_codec_enc_cfg_t *cfg,
+  const struct vp9_extracfg *extra_cfg) {
+  const int is_vbr = cfg->rc_end_usage == VPX_VBR;
+  int sl, tl;
+  oxcf->profile = cfg->g_profile;
+  oxcf->max_threads = (int)cfg->g_threads;
+  oxcf->width   = cfg->g_w;
+  oxcf->height  = cfg->g_h;
+  oxcf->bit_depth = cfg->g_bit_depth;
+  oxcf->input_bit_depth = cfg->g_input_bit_depth;
+  // guess a frame rate if out of whack, use 30
+  oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+  if (oxcf->init_framerate > 180)
+    oxcf->init_framerate = 30;
+
+  oxcf->mode = GOOD;
+
+  switch (cfg->g_pass) {
+    case VPX_RC_ONE_PASS:
+      oxcf->pass = 0;
+      break;
+    case VPX_RC_FIRST_PASS:
+      oxcf->pass = 1;
+      break;
+    case VPX_RC_LAST_PASS:
+      oxcf->pass = 2;
+      break;
+  }
+
+  oxcf->lag_in_frames = cfg->g_pass == VPX_RC_FIRST_PASS ? 0
+                                                         : cfg->g_lag_in_frames;
+  oxcf->rc_mode = cfg->rc_end_usage;
+
+  // Convert target bandwidth from Kbit/s to Bit/s
+  oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
+  oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
+  oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
+  oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
+
+  oxcf->best_allowed_q =
+      extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_min_quantizer);
+  oxcf->worst_allowed_q =
+      extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_max_quantizer);
+  oxcf->cq_level        = vp9_quantizer_to_qindex(extra_cfg->cq_level);
+  oxcf->fixed_q = -1;
+
+  oxcf->under_shoot_pct         = cfg->rc_undershoot_pct;
+  oxcf->over_shoot_pct          = cfg->rc_overshoot_pct;
+
+  oxcf->scaled_frame_width  = cfg->rc_scaled_width;
+  oxcf->scaled_frame_height = cfg->rc_scaled_height;
+  if (cfg->rc_resize_allowed == 1) {
+    oxcf->resize_mode =
+        (oxcf->scaled_frame_width == 0 || oxcf->scaled_frame_height == 0) ?
+            RESIZE_DYNAMIC : RESIZE_FIXED;
+  } else {
+    oxcf->resize_mode = RESIZE_NONE;
+  }
+
+  oxcf->maximum_buffer_size_ms   = is_vbr ? 240000 : cfg->rc_buf_sz;
+  oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
+  oxcf->optimal_buffer_level_ms  = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
+
+  oxcf->drop_frames_water_mark   = cfg->rc_dropframe_thresh;
+
+  oxcf->two_pass_vbrbias         = cfg->rc_2pass_vbr_bias_pct;
+  oxcf->two_pass_vbrmin_section  = cfg->rc_2pass_vbr_minsection_pct;
+  oxcf->two_pass_vbrmax_section  = cfg->rc_2pass_vbr_maxsection_pct;
+
+  oxcf->auto_key               = cfg->kf_mode == VPX_KF_AUTO &&
+                                 cfg->kf_min_dist != cfg->kf_max_dist;
+
+  oxcf->key_freq               = cfg->kf_max_dist;
+
+  oxcf->speed                  =  abs(extra_cfg->cpu_used);
+  oxcf->encode_breakout        =  extra_cfg->static_thresh;
+  oxcf->enable_auto_arf        =  extra_cfg->enable_auto_alt_ref;
+  oxcf->noise_sensitivity      =  extra_cfg->noise_sensitivity;
+  oxcf->sharpness              =  extra_cfg->sharpness;
+
+  oxcf->two_pass_stats_in      =  cfg->rc_twopass_stats_in;
+
+#if CONFIG_FP_MB_STATS
+  oxcf->firstpass_mb_stats_in  = cfg->rc_firstpass_mb_stats_in;
+#endif
+
+  oxcf->color_space = extra_cfg->color_space;
+  oxcf->color_range = extra_cfg->color_range;
+  oxcf->render_width  = extra_cfg->render_width;
+  oxcf->render_height = extra_cfg->render_height;
+  oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
+  oxcf->arnr_strength   = extra_cfg->arnr_strength;
+  oxcf->min_gf_interval = extra_cfg->min_gf_interval;
+  oxcf->max_gf_interval = extra_cfg->max_gf_interval;
+
+  oxcf->tuning = extra_cfg->tuning;
+  oxcf->content = extra_cfg->content;
+
+  oxcf->tile_columns = extra_cfg->tile_columns;
+  oxcf->tile_rows    = extra_cfg->tile_rows;
+
+  oxcf->error_resilient_mode         = cfg->g_error_resilient;
+  oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
+
+  oxcf->aq_mode = extra_cfg->aq_mode;
+
+  oxcf->frame_periodic_boost =  extra_cfg->frame_periodic_boost;
+
+  oxcf->ss_number_layers = cfg->ss_number_layers;
+  oxcf->ts_number_layers = cfg->ts_number_layers;
+  oxcf->temporal_layering_mode = (enum vp9e_temporal_layering_mode)
+      cfg->temporal_layering_mode;
+
+  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+#if CONFIG_SPATIAL_SVC
+    oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl];
+#endif
+    for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+      oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] =
+          1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl];
+    }
+  }
+  if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) {
+    oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth;
+#if CONFIG_SPATIAL_SVC
+    oxcf->ss_enable_auto_arf[0] = extra_cfg->enable_auto_alt_ref;
+#endif
+  }
+  if (oxcf->ts_number_layers > 1) {
+    for (tl = 0; tl < VPX_TS_MAX_LAYERS; ++tl) {
+      oxcf->ts_rate_decimator[tl] = cfg->ts_rate_decimator[tl] ?
+          cfg->ts_rate_decimator[tl] : 1;
+    }
+  } else if (oxcf->ts_number_layers == 1) {
+    oxcf->ts_rate_decimator[0] = 1;
+  }
+  /*
+  printf("Current VP9 Settings: \n");
+  printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
+  printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
+  printf("sharpness: %d\n",    oxcf->sharpness);
+  printf("cpu_used: %d\n",  oxcf->cpu_used);
+  printf("Mode: %d\n",     oxcf->mode);
+  printf("auto_key: %d\n",  oxcf->auto_key);
+  printf("key_freq: %d\n", oxcf->key_freq);
+  printf("end_usage: %d\n", oxcf->end_usage);
+  printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
+  printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
+  printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
+  printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);
+  printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
+  printf("fixed_q: %d\n",  oxcf->fixed_q);
+  printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
+  printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
+  printf("allow_spatial_resampling: %d\n", oxcf->allow_spatial_resampling);
+  printf("scaled_frame_width: %d\n", oxcf->scaled_frame_width);
+  printf("scaled_frame_height: %d\n", oxcf->scaled_frame_height);
+  printf("two_pass_vbrbias: %d\n",  oxcf->two_pass_vbrbias);
+  printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
+  printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
+  printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
+  printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf);
+  printf("Version: %d\n", oxcf->Version);
+  printf("encode_breakout: %d\n", oxcf->encode_breakout);
+  printf("error resilient: %d\n", oxcf->error_resilient_mode);
+  printf("frame parallel detokenization: %d\n",
+         oxcf->frame_parallel_decoding_mode);
+  */
+  return VPX_CODEC_OK;
+}
+
+// Applies a new encoder configuration `cfg` to an already-initialized encoder.
+//
+// A frame-size change is refused (through ERROR()) when the new config uses
+// more than one lag frame or is not one-pass; otherwise it may schedule a
+// forced key frame. Raising g_lag_in_frames above the last accepted value is
+// refused as well. When validate_config() accepts `cfg`, it is copied into
+// ctx->cfg, translated by set_encoder_config() and handed to
+// vp9_change_config().
+//
+// Returns the result of validate_config().
+// NOTE(review): ERROR() is defined outside this chunk; it presumably records
+// the message and returns an error code from this function -- confirm.
+static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
+                                          const vpx_codec_enc_cfg_t  *cfg) {
+  vpx_codec_err_t res;
+  int force_key = 0;
+
+  if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
+    if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
+      ERROR("Cannot change width or height after initialization");
+    // Request a key frame when the new size is not usable with the current
+    // reference size, or when it exceeds the (non-zero) initial dimensions.
+    if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) ||
+        (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+        (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+      force_key = 1;
+  }
+
+  // Prevent increasing lag_in_frames. This check is stricter than it needs
+  // to be -- the limit is not increasing past the first lag_in_frames
+  // value, but we don't track the initial config, only the last successful
+  // config.
+  if (cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames)
+    ERROR("Cannot increase lag_in_frames");
+
+  res = validate_config(ctx, cfg, &ctx->extra_cfg);
+
+  if (res == VPX_CODEC_OK) {
+    ctx->cfg = *cfg;
+    set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+    // On profile change, request a key frame
+    force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
+    vp9_change_config(ctx->cpi, &ctx->oxcf);
+  }
+
+  // Note: the key-frame request is recorded even when validation failed and
+  // the new configuration was not applied.
+  if (force_key)
+    ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF;
+
+  return res;
+}
+
+// Control handler: stores the value of vp9_get_quantizer() for this encoder
+// in the caller-supplied int. A NULL output pointer yields
+// VPX_CODEC_INVALID_PARAM.
+static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  int *const out = va_arg(args, int *);
+
+  if (out != NULL) {
+    *out = vp9_get_quantizer(ctx->cpi);
+    return VPX_CODEC_OK;
+  }
+  return VPX_CODEC_INVALID_PARAM;
+}
+
+// Control handler: like ctrl_get_quantizer(), but the value is first passed
+// through vp9_qindex_to_quantizer() before being stored in the caller's int.
+// A NULL output pointer yields VPX_CODEC_INVALID_PARAM.
+static vpx_codec_err_t ctrl_get_quantizer64(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  int *const out = va_arg(args, int *);
+
+  if (out != NULL) {
+    const int qindex = vp9_get_quantizer(ctx->cpi);
+    *out = vp9_qindex_to_quantizer(qindex);
+    return VPX_CODEC_OK;
+  }
+  return VPX_CODEC_INVALID_PARAM;
+}
+
+// Validates `extra_cfg` against the current base config and, only when it is
+// accepted, stores it in the context, rebuilds the internal encoder config
+// and pushes it to the compressor. Returns the validation result.
+static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx,
+                                        const struct vp9_extracfg *extra_cfg) {
+  const vpx_codec_err_t status = validate_config(ctx, &ctx->cfg, extra_cfg);
+
+  if (status != VPX_CODEC_OK)
+    return status;
+
+  ctx->extra_cfg = *extra_cfg;
+  set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+  vp9_change_config(ctx->cpi, &ctx->oxcf);
+  return status;
+}
+
+// Control handler for VP8E_SET_CPUUSED: stages the requested cpu_used value
+// in a copy of the extra config and commits it via update_extra_cfg().
+static vpx_codec_err_t ctrl_set_cpuused(vpx_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.cpu_used = CAST(VP8E_SET_CPUUSED, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP8E_SET_ENABLEAUTOALTREF: stages the requested
+// enable_auto_alt_ref value in a copy of the extra config and commits it via
+// update_extra_cfg().
+static vpx_codec_err_t ctrl_set_enable_auto_alt_ref(vpx_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.enable_auto_alt_ref = CAST(VP8E_SET_ENABLEAUTOALTREF, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP9E_SET_NOISE_SENSITIVITY: stages the requested
+// noise_sensitivity value in a copy of the extra config and commits it via
+// update_extra_cfg().
+static vpx_codec_err_t ctrl_set_noise_sensitivity(vpx_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.noise_sensitivity = CAST(VP9E_SET_NOISE_SENSITIVITY, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP8E_SET_SHARPNESS: stages the requested sharpness
+// value in a copy of the extra config and commits it via update_extra_cfg().
+static vpx_codec_err_t ctrl_set_sharpness(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.sharpness = CAST(VP8E_SET_SHARPNESS, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP8E_SET_STATIC_THRESHOLD: stages the requested
+// static_thresh value in a copy of the extra config and commits it via
+// update_extra_cfg().
+static vpx_codec_err_t ctrl_set_static_thresh(vpx_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.static_thresh = CAST(VP8E_SET_STATIC_THRESHOLD, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP9E_SET_TILE_COLUMNS: stages the requested
+// tile_columns value in a copy of the extra config and commits it via
+// update_extra_cfg().
+static vpx_codec_err_t ctrl_set_tile_columns(vpx_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.tile_columns = CAST(VP9E_SET_TILE_COLUMNS, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP9E_SET_TILE_ROWS: stages the requested tile_rows
+// value in a copy of the extra config and commits it via update_extra_cfg().
+static vpx_codec_err_t ctrl_set_tile_rows(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.tile_rows = CAST(VP9E_SET_TILE_ROWS, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP8E_SET_ARNR_MAXFRAMES: stages the requested
+// arnr_max_frames value in a copy of the extra config and commits it via
+// update_extra_cfg().
+static vpx_codec_err_t ctrl_set_arnr_max_frames(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.arnr_max_frames = CAST(VP8E_SET_ARNR_MAXFRAMES, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP8E_SET_ARNR_STRENGTH: stages the requested
+// arnr_strength value in a copy of the extra config and commits it via
+// update_extra_cfg().
+static vpx_codec_err_t ctrl_set_arnr_strength(vpx_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.arnr_strength = CAST(VP8E_SET_ARNR_STRENGTH, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler that accepts the request and does nothing: both arguments
+// are ignored and VPX_CODEC_OK is always returned.
+static vpx_codec_err_t ctrl_set_arnr_type(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  // Silence unused-parameter warnings; no encoder state is touched.
+  (void)args;
+  (void)ctx;
+
+  return VPX_CODEC_OK;
+}
+
+// Control handler for VP8E_SET_TUNING: stages the requested tuning value in
+// a copy of the extra config and commits it via update_extra_cfg().
+static vpx_codec_err_t ctrl_set_tuning(vpx_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.tuning = CAST(VP8E_SET_TUNING, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP8E_SET_CQ_LEVEL: stages the requested cq_level value
+// in a copy of the extra config and commits it via update_extra_cfg().
+static vpx_codec_err_t ctrl_set_cq_level(vpx_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.cq_level = CAST(VP8E_SET_CQ_LEVEL, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP8E_SET_MAX_INTRA_BITRATE_PCT: stages the requested
+// rc_max_intra_bitrate_pct value in a copy of the extra config and commits it
+// via update_extra_cfg().
+static vpx_codec_err_t ctrl_set_rc_max_intra_bitrate_pct(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.rc_max_intra_bitrate_pct =
+      CAST(VP8E_SET_MAX_INTRA_BITRATE_PCT, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP8E_SET_MAX_INTER_BITRATE_PCT: stages the requested
+// rc_max_inter_bitrate_pct value in a copy of the extra config and commits it
+// via update_extra_cfg().
+static vpx_codec_err_t ctrl_set_rc_max_inter_bitrate_pct(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.rc_max_inter_bitrate_pct =
+      CAST(VP8E_SET_MAX_INTER_BITRATE_PCT, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP9E_SET_GF_CBR_BOOST_PCT: stages the requested
+// gf_cbr_boost_pct value in a copy of the extra config and commits it via
+// update_extra_cfg().
+static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.gf_cbr_boost_pct = CAST(VP9E_SET_GF_CBR_BOOST_PCT, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP9E_SET_LOSSLESS: stages the requested lossless value
+// in a copy of the extra config and commits it via update_extra_cfg().
+static vpx_codec_err_t ctrl_set_lossless(vpx_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.lossless = CAST(VP9E_SET_LOSSLESS, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP9E_SET_FRAME_PARALLEL_DECODING: stages the requested
+// frame_parallel_decoding_mode value in a copy of the extra config and
+// commits it via update_extra_cfg().
+static vpx_codec_err_t ctrl_set_frame_parallel_decoding_mode(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.frame_parallel_decoding_mode =
+      CAST(VP9E_SET_FRAME_PARALLEL_DECODING, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP9E_SET_AQ_MODE: stages the requested aq_mode value in
+// a copy of the extra config and commits it via update_extra_cfg().
+static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.aq_mode = CAST(VP9E_SET_AQ_MODE, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP9E_SET_MIN_GF_INTERVAL: stages the requested
+// min_gf_interval value in a copy of the extra config and commits it via
+// update_extra_cfg().
+static vpx_codec_err_t ctrl_set_min_gf_interval(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.min_gf_interval = CAST(VP9E_SET_MIN_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP9E_SET_MAX_GF_INTERVAL: stages the requested
+// max_gf_interval value in a copy of the extra config and commits it via
+// update_extra_cfg().
+static vpx_codec_err_t ctrl_set_max_gf_interval(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.max_gf_interval = CAST(VP9E_SET_MAX_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Control handler for VP9E_SET_FRAME_PERIODIC_BOOST: stages the requested
+// frame_periodic_boost value in a copy of the extra config and commits it via
+// update_extra_cfg().
+static vpx_codec_err_t ctrl_set_frame_periodic_boost(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct vp9_extracfg staged = ctx->extra_cfg;
+
+  staged.frame_periodic_boost = CAST(VP9E_SET_FRAME_PERIODIC_BOOST, args);
+  return update_extra_cfg(ctx, &staged);
+}
+
+// Creates the encoder's private state the first time it is called for `ctx`
+// (i.e. while ctx->priv is NULL); later calls do nothing and return
+// VPX_CODEC_OK.
+//
+// Steps: allocate the zeroed private struct and buffer pool, take an internal
+// copy of the caller's encoder config, install the default extra config,
+// validate, translate to the internal config and create the compressor.
+// `data` is unused.
+//
+// Returns VPX_CODEC_MEM_ERROR on allocation / mutex / compressor-creation
+// failure, otherwise the result of validate_config().
+// Note: on the early MEM_ERROR returns ctx->priv has already been assigned,
+// so whatever was allocated so far stays reachable through it and is not
+// released here.
+static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
+                                    vpx_codec_priv_enc_mr_cfg_t *data) {
+  vpx_codec_err_t res = VPX_CODEC_OK;
+  (void)data;
+
+  if (ctx->priv == NULL) {
+    vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv));
+    if (priv == NULL)
+      return VPX_CODEC_MEM_ERROR;
+
+    // Hand ownership of the private struct to the public context.
+    ctx->priv = (vpx_codec_priv_t *)priv;
+    ctx->priv->init_flags = ctx->init_flags;
+    ctx->priv->enc.total_encoders = 1;
+    priv->buffer_pool =
+        (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
+    if (priv->buffer_pool == NULL)
+      return VPX_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+    if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) {
+      return VPX_CODEC_MEM_ERROR;
+    }
+#endif
+
+    if (ctx->config.enc) {
+      // Update the reference to the config structure to an internal copy.
+      priv->cfg = *ctx->config.enc;
+      ctx->config.enc = &priv->cfg;
+    }
+
+    priv->extra_cfg = default_extra_cfg;
+    // NOTE(review): once() presumably runs vp9_initialize_enc a single time
+    // per process -- its definition is outside this chunk.
+    once(vp9_initialize_enc);
+
+    res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
+
+    if (res == VPX_CODEC_OK) {
+      set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+#if CONFIG_VP9_HIGHBITDEPTH
+      priv->oxcf.use_highbitdepth =
+          (ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+#endif
+      priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool);
+      if (priv->cpi == NULL)
+        res = VPX_CODEC_MEM_ERROR;
+      else
+        // Let the compressor reach this context's packet list.
+        priv->cpi->output_pkt_list = &priv->pkt_list.head;
+    }
+  }
+
+  return res;
+}
+
+// Releases everything owned by the encoder's private context: the compressed
+// data buffer, the compressor instance, the buffer pool (and its mutex when
+// CONFIG_MULTITHREAD is set) and finally the context itself.
+//
+// The context may be only partially initialized: encoder_init() can fail
+// after attaching the private struct but before the buffer pool or the
+// compressor exist, so both are checked for NULL before being torn down.
+// Always returns VPX_CODEC_OK.
+static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
+  free(ctx->cx_data);
+
+  if (ctx->cpi != NULL)
+    vp9_remove_compressor(ctx->cpi);
+
+  if (ctx->buffer_pool != NULL) {
+#if CONFIG_MULTITHREAD
+    pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+    vpx_free(ctx->buffer_pool);
+  }
+
+  vpx_free(ctx);
+  return VPX_CODEC_OK;
+}
+
+// Chooses the encoder mode (BEST, GOOD or REALTIME) for the next frame from
+// the configured pass and the caller's deadline, and pushes the change to the
+// compressor only when the mode actually differs from the current one.
+static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+                                    unsigned long duration,
+                                    unsigned long deadline) {
+  const vpx_codec_enc_cfg_t *const cfg = &ctx->cfg;
+  // BEST is the fallback: first pass, a zero deadline, or an unlisted pass.
+  MODE selected = BEST;
+
+  if (deadline > 0) {
+    if (cfg->g_pass == VPX_RC_ONE_PASS) {
+      // Frame duration converted from stream timebase units to microseconds.
+      const uint64_t duration_us = (uint64_t)duration * 1000000 *
+          (uint64_t)cfg->g_timebase.num / (uint64_t)cfg->g_timebase.den;
+
+      // A deadline longer than the frame's display time selects good quality;
+      // anything tighter selects realtime.
+      selected = (deadline > duration_us) ? GOOD : REALTIME;
+    } else if (cfg->g_pass == VPX_RC_LAST_PASS) {
+      selected = GOOD;
+    }
+  }
+
+  if (selected == ctx->oxcf.mode)
+    return;
+
+  ctx->oxcf.mode = selected;
+  vp9_change_config(ctx->cpi, &ctx->oxcf);
+}
+
+// Turn on to test if supplemental superframe data breaks decoding
+// #define TEST_SUPPLEMENTAL_SUPERFRAME_DATA
+
+// Appends a superframe index after the pending compressed data.
+//
+// Layout written: one marker byte, then for each pending frame its size in
+// (mag + 1) bytes, least-significant byte first, then the marker byte again.
+// The marker is 0xc0 | (mag << 3) | (frame_count - 1).
+//
+// The index is written only if it fits in the remaining compressed-data
+// buffer. Returns the index size in bytes; note that this value is returned
+// even when the index did not fit and nothing was written.
+static int write_superframe_index(vpx_codec_alg_priv_t *ctx) {
+  uint8_t marker = 0xc0;
+  unsigned int mask;
+  int mag, index_sz;
+
+  assert(ctx->pending_frame_count);
+  assert(ctx->pending_frame_count <= 8);
+
+  // Add the number of frames to the marker byte
+  marker |= ctx->pending_frame_count - 1;
+
+  // Choose the magnitude
+  // (smallest mag such that the OR of all pending sizes is below a mask of
+  // mag + 1 bytes of 0xff)
+  for (mag = 0, mask = 0xff; mag < 4; mag++) {
+    if (ctx->pending_frame_magnitude < mask)
+      break;
+    mask <<= 8;
+    mask |= 0xff;
+  }
+  marker |= mag << 3;
+
+  // Write the index
+  // Two marker bytes plus (mag + 1) size bytes per frame.
+  index_sz = 2 + (mag + 1) * ctx->pending_frame_count;
+  if (ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz) {
+    uint8_t *x = ctx->pending_cx_data + ctx->pending_cx_data_sz;
+    int i, j;
+#ifdef TEST_SUPPLEMENTAL_SUPERFRAME_DATA
+    // Test-only: emit an extra, zero-filled index-shaped chunk ahead of the
+    // real index.
+    uint8_t marker_test = 0xc0;
+    int mag_test = 2;     // 1 - 4
+    int frames_test = 4;  // 1 - 8
+    int index_sz_test = 2 + mag_test * frames_test;
+    marker_test |= frames_test - 1;
+    marker_test |= (mag_test - 1) << 3;
+    *x++ = marker_test;
+    for (i = 0; i < mag_test * frames_test; ++i)
+      *x++ = 0;  // fill up with arbitrary data
+    *x++ = marker_test;
+    ctx->pending_cx_data_sz += index_sz_test;
+    printf("Added supplemental superframe data\n");
+#endif
+
+    *x++ = marker;
+    for (i = 0; i < ctx->pending_frame_count; i++) {
+      unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i];
+
+      // Emit the low (mag + 1) bytes of the size, little-endian.
+      for (j = 0; j <= mag; j++) {
+        *x++ = this_sz & 0xff;
+        this_sz >>= 8;
+      }
+    }
+    *x++ = marker;
+    ctx->pending_cx_data_sz += index_sz;
+#ifdef TEST_SUPPLEMENTAL_SUPERFRAME_DATA
+    index_sz += index_sz_test;
+#endif
+  }
+  return index_sz;
+}
+
+// Internal VP9 timestamps count 10,000,000 ticks per second.
+#define TICKS_PER_SEC 10000000LL
+
+// Converts `n` stream-timebase units into internal ticks:
+// n * TICKS_PER_SEC * num / den, evaluated left to right in 64-bit integers
+// (the final division truncates).
+static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase,
+                                       int64_t n) {
+  const int64_t scaled = n * TICKS_PER_SEC * timebase->num;
+
+  return scaled / timebase->den;
+}
+
+// Converts `n` internal ticks back into stream-timebase units. A bias of
+// (TICKS_PER_SEC * num / 2 - 1) is added before the two integer divisions so
+// the result is rounded rather than simply truncated.
+static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase,
+                                       int64_t n) {
+  const int64_t bias = TICKS_PER_SEC * timebase->num / 2 - 1;
+  const int64_t biased = n * timebase->den + bias;
+
+  return biased / timebase->num / TICKS_PER_SEC;
+}
+
+// Builds the flags for an output frame packet. The library flags are carried
+// in the bits from 16 upward; VPX_FRAME_IS_KEY is added when the library
+// reports a key frame or, in SVC mode, when the current spatial/temporal
+// layer context is marked as a key frame; VPX_FRAME_IS_DROPPABLE is added
+// when the compressor marks the frame droppable.
+static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
+                                                   unsigned int lib_flags) {
+  vpx_codec_frame_flags_t pkt_flags = lib_flags << 16;
+  int is_key = (lib_flags & FRAMEFLAGS_KEY) != 0;
+
+  // The layer context is consulted only when the library flag did not already
+  // settle the question and SVC is in use.
+  if (!is_key && cpi->use_svc) {
+    const int layer =
+        cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
+        cpi->svc.temporal_layer_id;
+    is_key = cpi->svc.layer_context[layer].is_key_frame != 0;
+  }
+
+  if (is_key)
+    pkt_flags |= VPX_FRAME_IS_KEY;
+
+  if (cpi->droppable)
+    pkt_flags |= VPX_FRAME_IS_DROPPABLE;
+
+  return pkt_flags;
+}
+
+// Submits one raw frame (when `img` is non-NULL) to the compressor and then
+// drains whatever compressed data is available.
+//
+// ctx      - encoder private context.
+// img      - frame to encode, or NULL; !img is forwarded as the last argument
+//            of vp9_get_compressed_data() (presumably a flush request --
+//            confirm against that function).
+// pts      - presentation time stamp in stream timebase units.
+// duration - frame duration in stream timebase units.
+// flags    - per-frame encoding flags.
+// deadline - time budget used to pick the encoder mode.
+//
+// Each produced packet is either passed to the registered output_cx_pkt
+// callback or appended to ctx->pkt_list. Frames that are not shown (and, in
+// SVC mode, layers below the top spatial layer) are accumulated as "pending"
+// data and emitted together with the next visible frame.
+//
+// Returns VPX_CODEC_OK on success, VPX_CODEC_MEM_ERROR if the output buffer
+// cannot be allocated, VPX_CODEC_INVALID_PARAM for conflicting flags,
+// VPX_CODEC_ERROR if pending data leaves less than half the buffer free, or
+// the error reported by validate_img() / update_error_state().
+static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
+                                      const vpx_image_t *img,
+                                      vpx_codec_pts_t pts,
+                                      unsigned long duration,
+                                      vpx_enc_frame_flags_t flags,
+                                      unsigned long deadline) {
+  vpx_codec_err_t res = VPX_CODEC_OK;
+  VP9_COMP *const cpi = ctx->cpi;
+  const vpx_rational_t *const timebase = &ctx->cfg.g_timebase;
+  size_t data_sz;
+
+  if (img != NULL) {
+    res = validate_img(ctx, img);
+    // TODO(jzern) the checks related to cpi's validity should be treated as a
+    // failure condition, encoder setup is done fully in init() currently.
+    if (res == VPX_CODEC_OK && cpi != NULL) {
+      // There's no codec control for multiple alt-refs so check the encoder
+      // instance for its status to determine the compressed data size.
+      // Size estimate: raw frame bytes times 8 (multi-ARF) or 2, with a
+      // 4096-byte floor.
+      data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 *
+                (cpi->multi_arf_allowed ? 8 : 2);
+      if (data_sz < 4096)
+        data_sz = 4096;
+      // The buffer is only ever replaced by a larger one.
+      if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
+        ctx->cx_data_sz = data_sz;
+        free(ctx->cx_data);
+        ctx->cx_data = (unsigned char*)malloc(ctx->cx_data_sz);
+        if (ctx->cx_data == NULL) {
+          return VPX_CODEC_MEM_ERROR;
+        }
+      }
+    }
+  }
+
+  pick_quickcompress_mode(ctx, duration, deadline);
+  // Start this call with an empty output packet list.
+  vpx_codec_pkt_list_init(&ctx->pkt_list);
+
+  // Handle Flags
+  if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF)) ||
+       ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF))) {
+    ctx->base.err_detail = "Conflicting flags.";
+    return VPX_CODEC_INVALID_PARAM;
+  }
+
+  vp9_apply_encoding_flags(cpi, flags);
+
+  // Handle fixed keyframe intervals
+  // (kf_min_dist == kf_max_dist in auto mode: force a key frame every time
+  // the counter passes that distance). The updated flags reach the
+  // compressor through vp9_receive_raw_frame() below.
+  if (ctx->cfg.kf_mode == VPX_KF_AUTO &&
+      ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
+    if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
+      flags |= VPX_EFLAG_FORCE_KF;
+      ctx->fixed_kf_cntr = 1;
+    }
+  }
+
+  // Initialize the encoder instance on the first frame.
+  if (res == VPX_CODEC_OK && cpi != NULL) {
+    unsigned int lib_flags = 0;
+    YV12_BUFFER_CONFIG sd;
+    // Time stamps converted from stream timebase units to internal ticks.
+    int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
+    int64_t dst_end_time_stamp =
+        timebase_units_to_ticks(timebase, pts + duration);
+    size_t size, cx_data_sz;
+    unsigned char *cx_data;
+
+    // Set up internal flags
+    if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
+      cpi->b_calculate_psnr = 1;
+
+    if (img != NULL) {
+      res = image2yuvconfig(img, &sd);
+
+      // Store the original flags in to the frame buffer. Will extract the
+      // key frame flag when we actually encode this frame.
+      if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags,
+                                &sd, dst_time_stamp, dst_end_time_stamp)) {
+        res = update_error_state(ctx, &cpi->common.error);
+      }
+      // One-shot flags (e.g. a forced key frame queued by a config change)
+      // have now been consumed.
+      ctx->next_frame_flags = 0;
+    }
+
+    cx_data = ctx->cx_data;
+    cx_data_sz = ctx->cx_data_sz;
+
+    /* Any pending invisible frames? */
+    if (ctx->pending_cx_data) {
+      // Move the pending bytes to the start of the buffer and continue
+      // writing right after them.
+      memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz);
+      ctx->pending_cx_data = cx_data;
+      cx_data += ctx->pending_cx_data_sz;
+      cx_data_sz -= ctx->pending_cx_data_sz;
+
+      /* TODO: this is a minimal check, the underlying codec doesn't respect
+       * the buffer size anyway.
+       */
+      if (cx_data_sz < ctx->cx_data_sz / 2) {
+        ctx->base.err_detail = "Compressed data buffer too small";
+        return VPX_CODEC_ERROR;
+      }
+    }
+
+    // Keep pulling compressed data while at least half of the buffer is
+    // still free and the compressor does not return -1.
+    while (cx_data_sz >= ctx->cx_data_sz / 2 &&
+           -1 != vp9_get_compressed_data(cpi, &lib_flags, &size,
+                                         cx_data, &dst_time_stamp,
+                                         &dst_end_time_stamp, !img)) {
+      if (size) {
+        vpx_codec_cx_pkt_t pkt;
+
+#if CONFIG_SPATIAL_SVC
+        if (cpi->use_svc)
+          cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+              cpi->svc.number_temporal_layers].layer_size += size;
+#endif
+
+        // Pack invisible frames with the next visible frame
+        if (!cpi->common.show_frame ||
+            (cpi->use_svc &&
+             cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+            ) {
+          if (ctx->pending_cx_data == 0)
+            ctx->pending_cx_data = cx_data;
+          ctx->pending_cx_data_sz += size;
+          ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+          ctx->pending_frame_magnitude |= size;
+          cx_data += size;
+          cx_data_sz -= size;
+
+          // With an output callback registered, the frame is delivered
+          // immediately and the pending state is reset instead of being
+          // held for the next visible frame.
+          if (ctx->output_cx_pkt_cb.output_cx_pkt) {
+            pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+            pkt.data.frame.pts = ticks_to_timebase_units(timebase,
+                                                         dst_time_stamp);
+            pkt.data.frame.duration =
+               (unsigned long)ticks_to_timebase_units(timebase,
+                   dst_end_time_stamp - dst_time_stamp);
+            pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+            pkt.data.frame.buf = ctx->pending_cx_data;
+            pkt.data.frame.sz  = size;
+            ctx->pending_cx_data = NULL;
+            ctx->pending_cx_data_sz = 0;
+            ctx->pending_frame_count = 0;
+            ctx->pending_frame_magnitude = 0;
+            ctx->output_cx_pkt_cb.output_cx_pkt(
+                &pkt, ctx->output_cx_pkt_cb.user_priv);
+          }
+          continue;
+        }
+
+        // Add the frame packet to the list of returned packets.
+        pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+        pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp);
+        pkt.data.frame.duration =
+           (unsigned long)ticks_to_timebase_units(timebase,
+               dst_end_time_stamp - dst_time_stamp);
+        pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+
+        if (ctx->pending_cx_data) {
+          // This visible frame closes a group of pending frames: the packet
+          // covers all pending bytes plus this frame.
+          ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+          ctx->pending_frame_magnitude |= size;
+          ctx->pending_cx_data_sz += size;
+          // Write the superframe index only when no output packet callback
+          // is registered.
+          if (!ctx->output_cx_pkt_cb.output_cx_pkt)
+            size += write_superframe_index(ctx);
+          pkt.data.frame.buf = ctx->pending_cx_data;
+          pkt.data.frame.sz  = ctx->pending_cx_data_sz;
+          ctx->pending_cx_data = NULL;
+          ctx->pending_cx_data_sz = 0;
+          ctx->pending_frame_count = 0;
+          ctx->pending_frame_magnitude = 0;
+        } else {
+          pkt.data.frame.buf = cx_data;
+          pkt.data.frame.sz  = size;
+        }
+        pkt.data.frame.partition_id = -1;
+
+        if(ctx->output_cx_pkt_cb.output_cx_pkt)
+          ctx->output_cx_pkt_cb.output_cx_pkt(&pkt,
+                                              ctx->output_cx_pkt_cb.user_priv);
+        else
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
+        cx_data += size;
+        cx_data_sz -= size;
+#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
+#if CONFIG_SPATIAL_SVC
+        // In SVC mode without a callback, also report per-spatial-layer
+        // sizes and PSNR as separate packets, then reset the size counters.
+        if (cpi->use_svc && !ctx->output_cx_pkt_cb.output_cx_pkt) {
+          vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr;
+          int sl;
+          vp9_zero(pkt_sizes);
+          vp9_zero(pkt_psnr);
+          pkt_sizes.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
+          pkt_psnr.kind = VPX_CODEC_SPATIAL_SVC_LAYER_PSNR;
+          for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
+            LAYER_CONTEXT *lc =
+                &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers];
+            pkt_sizes.data.layer_sizes[sl] = lc->layer_size;
+            pkt_psnr.data.layer_psnr[sl] = lc->psnr_pkt;
+            lc->layer_size = 0;
+          }
+
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes);
+
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr);
+        }
+#endif
+#endif
+        if (is_one_pass_cbr_svc(cpi) &&
+            (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
+          // Encoded all spatial layers; exit loop.
+          break;
+        }
+      }
+    }
+  }
+
+  return res;
+}
+
+// Returns whatever vpx_codec_pkt_list_get() yields for this context's packet
+// list and the caller's iterator `iter`.
+static const vpx_codec_cx_pkt_t *encoder_get_cxdata(vpx_codec_alg_priv_t *ctx,
+                                                    vpx_codec_iter_t *iter) {
+  const vpx_codec_cx_pkt_t *const next =
+      vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);
+
+  return next;
+}
+
+// Control handler: wraps the caller's image as a YV12 buffer description and
+// passes it to vp9_set_reference_enc() for the reference selected by
+// frame_type. A NULL argument yields VPX_CODEC_INVALID_PARAM; otherwise
+// VPX_CODEC_OK is returned (helper results are not inspected).
+static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  vpx_ref_frame_t *const ref = va_arg(args, vpx_ref_frame_t *);
+  YV12_BUFFER_CONFIG yv12;
+
+  if (ref == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  image2yuvconfig(&ref->img, &yv12);
+  vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(ref->frame_type),
+                        &yv12);
+  return VPX_CODEC_OK;
+}
+
+// Control handler: wraps the caller's image as a YV12 buffer description and
+// passes it to vp9_copy_reference_enc() for the reference selected by
+// frame_type. A NULL argument yields VPX_CODEC_INVALID_PARAM; otherwise
+// VPX_CODEC_OK is returned (helper results are not inspected).
+static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  vpx_ref_frame_t *const ref = va_arg(args, vpx_ref_frame_t *);
+  YV12_BUFFER_CONFIG yv12;
+
+  if (ref == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  image2yuvconfig(&ref->img, &yv12);
+  vp9_copy_reference_enc(ctx->cpi,
+                         ref_frame_to_vp9_reframe(ref->frame_type), &yv12);
+  return VPX_CODEC_OK;
+}
+
+// Control handler: looks up the reference buffer with index frame->idx and
+// describes it in frame->img. Returns VPX_CODEC_INVALID_PARAM for a NULL
+// argument and VPX_CODEC_ERROR when no buffer is found for that index.
+static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  vp9_ref_frame_t *const ref = va_arg(args, vp9_ref_frame_t *);
+  YV12_BUFFER_CONFIG *buf;
+
+  if (ref == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  buf = get_ref_frame(&ctx->cpi->common, ref->idx);
+  if (buf == NULL)
+    return VPX_CODEC_ERROR;
+
+  yuvconfig2image(&ref->img, buf, NULL);
+  return VPX_CODEC_OK;
+}
+
+// Control handler: stores the caller's post-processing settings for preview
+// frames. Without CONFIG_VP9_POSTPROC the request is refused with
+// VPX_CODEC_INCAPABLE; with it, a NULL argument yields
+// VPX_CODEC_INVALID_PARAM.
+static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+#if CONFIG_VP9_POSTPROC
+  vp8_postproc_cfg_t *const pp_cfg = va_arg(args, vp8_postproc_cfg_t *);
+
+  if (pp_cfg == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  ctx->preview_ppcfg = *pp_cfg;
+  return VPX_CODEC_OK;
+#else
+  (void)args;
+  (void)ctx;
+  return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+
+// Fetches the encoder's preview frame and exposes it as ctx->preview_img.
+// The stored preview post-processing settings are forwarded only when a
+// post_proc_flag is set; otherwise zeroed flags are used. Returns NULL when
+// vp9_get_preview_raw_frame() reports a non-zero status.
+static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) {
+  YV12_BUFFER_CONFIG frame;
+  vp9_ppflags_t pp;
+  vp9_zero(pp);
+
+  if (ctx->preview_ppcfg.post_proc_flag) {
+    pp.post_proc_flag   = ctx->preview_ppcfg.post_proc_flag;
+    pp.deblocking_level = ctx->preview_ppcfg.deblocking_level;
+    pp.noise_level      = ctx->preview_ppcfg.noise_level;
+  }
+
+  if (vp9_get_preview_raw_frame(ctx->cpi, &frame, &pp) != 0)
+    return NULL;
+
+  yuvconfig2image(&ctx->preview_img, &frame, NULL);
+  return &ctx->preview_img;
+}
+
+// Control handler for region-of-interest maps. Not implemented: both
+// arguments are ignored and VPX_CODEC_INVALID_PARAM is always returned.
+static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  // Silence unused-parameter warnings.
+  (void)args;
+  (void)ctx;
+
+  // TODO(yaowu): Need to re-implement and test for VP9.
+  return VPX_CODEC_INVALID_PARAM;
+}
+
+
+// Control handler: forwards the caller's active map (with its row and column
+// counts) to vp9_set_active_map(). Returns VPX_CODEC_OK when that call
+// returns 0, and VPX_CODEC_INVALID_PARAM for a NULL argument or any other
+// return value.
+static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *);
+
+  if (map == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  if (vp9_set_active_map(ctx->cpi, map->active_map,
+                         (int)map->rows, (int)map->cols) != 0)
+    return VPX_CODEC_INVALID_PARAM;
+
+  return VPX_CODEC_OK;
+}
+
+// Control handler: asks vp9_get_active_map() to fill the caller's active map
+// of the given row and column counts. Returns VPX_CODEC_OK when that call
+// returns 0, and VPX_CODEC_INVALID_PARAM for a NULL argument or any other
+// return value.
+static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *);
+
+  if (map == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  if (vp9_get_active_map(ctx->cpi, map->active_map,
+                         (int)map->rows, (int)map->cols) != 0)
+    return VPX_CODEC_INVALID_PARAM;
+
+  return VPX_CODEC_OK;
+}
+
+// Control handler: applies the caller's horizontal and vertical scaling modes
+// through vp9_set_internal_size(). Returns VPX_CODEC_OK when that call
+// returns 0, and VPX_CODEC_INVALID_PARAM for a NULL argument or any other
+// return value.
+static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
+  int status;
+
+  if (mode == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  status = vp9_set_internal_size(ctx->cpi,
+                                 (VPX_SCALING)mode->h_scaling_mode,
+                                 (VPX_SCALING)mode->v_scaling_mode);
+  if (status != 0)
+    return VPX_CODEC_INVALID_PARAM;
+
+  return VPX_CODEC_OK;
+}
+
+// Control handler: passes the requested SVC on/off value to vp9_set_svc().
+//
+// Both one-pass and two-pass rate control are supported. The caller is
+// responsible for a consistent layer setup:
+//   two-pass: either ss_number_layers > 1 or ts_number_layers > 1, not both;
+//   one-pass: either or both may be greater than 1.
+// Enabling SVC (value 1) in a two-pass configuration that has both more than
+// one spatial and more than one temporal layer is reported as
+// VPX_CODEC_INVALID_PARAM; note that vp9_set_svc() has already been called by
+// then.
+static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) {
+  const int enable = va_arg(args, int);
+  const vpx_codec_enc_cfg_t *const cfg = &ctx->cfg;
+  const int two_pass = cfg->g_pass == VPX_RC_FIRST_PASS ||
+                       cfg->g_pass == VPX_RC_LAST_PASS;
+  const int both_layered = cfg->ss_number_layers > 1 &&
+                           cfg->ts_number_layers > 1;
+
+  vp9_set_svc(ctx->cpi, enable);
+
+  if (enable == 1 && two_pass && both_layered)
+    return VPX_CODEC_INVALID_PARAM;
+
+  return VPX_CODEC_OK;
+}
+
+// Control handler: selects the spatial and temporal layer to encode next.
+//
+// The argument is a vpx_svc_layer_id_t *. Returns VPX_CODEC_INVALID_PARAM
+// when the pointer is NULL, when either id is negative or not below the
+// configured number of layers, or when a non-zero first spatial layer is
+// requested for two-pass SVC; VPX_CODEC_OK otherwise.
+//
+// The range checks run before any encoder state is written, so an
+// out-of-range id is never stored in the SVC state, where it would later be
+// used to index the layer contexts.
+static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  vpx_svc_layer_id_t *const data = va_arg(args, vpx_svc_layer_id_t *);
+  VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
+  SVC *const svc = &cpi->svc;
+
+  if (data == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  // Checks on valid layer_id input.
+  if (data->temporal_layer_id < 0 ||
+      data->temporal_layer_id >= (int)ctx->cfg.ts_number_layers) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  if (data->spatial_layer_id < 0 ||
+      data->spatial_layer_id >= (int)ctx->cfg.ss_number_layers) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+
+  svc->first_spatial_layer_to_encode = data->spatial_layer_id;
+  svc->spatial_layer_to_encode = data->spatial_layer_id;
+  svc->temporal_layer_id = data->temporal_layer_id;
+
+  // First spatial layer to encode not implemented for two-pass.
+  if (is_two_pass_svc(cpi) && svc->first_spatial_layer_to_encode > 0)
+    return VPX_CODEC_INVALID_PARAM;
+  return VPX_CODEC_OK;
+}
+
+// Handler for VP9E_GET_SVC_LAYER_ID: copies the encoder's current
+// spatial_layer_id and temporal_layer_id into the caller's
+// vpx_svc_layer_id_t.
+//
+// Returns VPX_CODEC_INVALID_PARAM when the output pointer is NULL, otherwise
+// VPX_CODEC_OK.
+static vpx_codec_err_t ctrl_get_svc_layer_id(vpx_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  vpx_svc_layer_id_t *data = va_arg(args, vpx_svc_layer_id_t *);
+  VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
+  SVC *const svc = &cpi->svc;
+
+  // Nothing to write into: report the bad argument instead of crashing.
+  if (data == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  data->spatial_layer_id = svc->spatial_layer_id;
+  data->temporal_layer_id = svc->temporal_layer_id;
+
+  return VPX_CODEC_OK;
+}
+
+// Handler for VP9E_SET_SVC_PARAMETERS: copies per-layer quantizer limits and
+// per-spatial-layer scaling factors from a vpx_svc_extra_cfg_t into the
+// encoder's layer contexts.
+//
+// The number of temporal layers and the number of spatial layers have to be
+// set properly before calling this control function, because they bound the
+// loops below.
+//
+// Returns VPX_CODEC_INVALID_PARAM when the argument is NULL, otherwise
+// VPX_CODEC_OK.
+static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  vpx_svc_extra_cfg_t *const params = va_arg(args, vpx_svc_extra_cfg_t *);
+  int sl, tl;
+
+  if (params == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
+    for (tl = 0; tl < cpi->svc.number_temporal_layers; ++tl) {
+      // Quantizer limits are indexed per (spatial, temporal) layer ...
+      const int layer =
+          LAYER_IDS_TO_IDX(sl, tl, cpi->svc.number_temporal_layers);
+      LAYER_CONTEXT *lc =
+          &cpi->svc.layer_context[layer];
+      lc->max_q = params->max_quantizers[layer];
+      lc->min_q = params->min_quantizers[layer];
+      // ... while scaling factors are indexed by spatial layer only.
+      lc->scaling_factor_num = params->scaling_factor_num[sl];
+      lc->scaling_factor_den = params->scaling_factor_den[sl];
+    }
+  }
+
+  return VPX_CODEC_OK;
+}
+
+// Handler for VP9E_SET_SVC_REF_FRAME_CONFIG: copies, for every configured
+// spatial layer, the caller's frame flags and last/golden/alt-ref frame
+// buffer indices into the encoder's SVC state.
+//
+// Returns VPX_CODEC_INVALID_PARAM when the argument is NULL, otherwise
+// VPX_CODEC_OK.
+static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *);
+  int sl;
+
+  if (data == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
+    cpi->svc.ext_frame_flags[sl] = data->frame_flags[sl];
+    cpi->svc.ext_lst_fb_idx[sl] = data->lst_fb_idx[sl];
+    cpi->svc.ext_gld_fb_idx[sl] = data->gld_fb_idx[sl];
+    cpi->svc.ext_alt_fb_idx[sl] = data->alt_fb_idx[sl];
+  }
+  return VPX_CODEC_OK;
+}
+
+// Handler for VP9E_REGISTER_CX_CALLBACK: stores the caller's output-packet
+// callback and its user_priv pointer in the encoder context.
+//
+// The control argument is read as a void pointer and interpreted as a
+// vpx_codec_priv_output_cx_pkt_cb_pair_t. Returns VPX_CODEC_INVALID_PARAM
+// when that pointer is NULL, otherwise VPX_CODEC_OK.
+static vpx_codec_err_t ctrl_register_cx_callback(vpx_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  vpx_codec_priv_output_cx_pkt_cb_pair_t *cbp =
+      (vpx_codec_priv_output_cx_pkt_cb_pair_t *)va_arg(args, void *);
+
+  if (cbp == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  ctx->output_cx_pkt_cb.output_cx_pkt = cbp->output_cx_pkt;
+  ctx->output_cx_pkt_cb.user_priv = cbp->user_priv;
+
+  return VPX_CODEC_OK;
+}
+
+// Handler for VP9E_SET_TUNE_CONTENT: takes a copy of the current extra
+// configuration, replaces its content field with the control argument and
+// submits the copy through update_extra_cfg(), whose result is returned.
+static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct vp9_extracfg updated = ctx->extra_cfg;
+
+  updated.content = CAST(VP9E_SET_TUNE_CONTENT, args);
+
+  return update_extra_cfg(ctx, &updated);
+}
+
+// Handler for VP9E_SET_COLOR_SPACE: takes a copy of the current extra
+// configuration, replaces its color_space field with the control argument
+// and submits the copy through update_extra_cfg(), whose result is returned.
+static vpx_codec_err_t ctrl_set_color_space(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct vp9_extracfg updated = ctx->extra_cfg;
+
+  updated.color_space = CAST(VP9E_SET_COLOR_SPACE, args);
+
+  return update_extra_cfg(ctx, &updated);
+}
+
+// Handler for VP9E_SET_COLOR_RANGE: takes a copy of the current extra
+// configuration, replaces its color_range field with the control argument
+// and submits the copy through update_extra_cfg(), whose result is returned.
+static vpx_codec_err_t ctrl_set_color_range(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct vp9_extracfg updated = ctx->extra_cfg;
+
+  updated.color_range = CAST(VP9E_SET_COLOR_RANGE, args);
+
+  return update_extra_cfg(ctx, &updated);
+}
+
+// Handler for VP9E_SET_RENDER_SIZE: reads a two-element int array
+// ({width, height}) from the control argument, stores it in a copy of the
+// extra configuration and submits the copy through update_extra_cfg().
+//
+// Returns VPX_CODEC_INVALID_PARAM when the array pointer is NULL, otherwise
+// whatever update_extra_cfg() returns.
+static vpx_codec_err_t ctrl_set_render_size(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  const int *const render_size = va_arg(args, int *);
+
+  // Both elements are read below, so a NULL array must be rejected first.
+  if (render_size == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  extra_cfg.render_width  = render_size[0];
+  extra_cfg.render_height = render_size[1];
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+// Table pairing each control id handled by the VP9 encoder with the function
+// that services it. Entries are grouped into setters and getters for
+// readability; the list ends with the { -1, NULL } sentinel entry.
+static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
+  {VP8_COPY_REFERENCE,                ctrl_copy_reference},
+
+  // Setters
+  {VP8_SET_REFERENCE,                 ctrl_set_reference},
+  {VP8_SET_POSTPROC,                  ctrl_set_previewpp},
+  {VP8E_SET_ROI_MAP,                  ctrl_set_roi_map},
+  {VP8E_SET_ACTIVEMAP,                ctrl_set_active_map},
+  {VP8E_SET_SCALEMODE,                ctrl_set_scale_mode},
+  {VP8E_SET_CPUUSED,                  ctrl_set_cpuused},
+  {VP8E_SET_ENABLEAUTOALTREF,         ctrl_set_enable_auto_alt_ref},
+  {VP8E_SET_SHARPNESS,                ctrl_set_sharpness},
+  {VP8E_SET_STATIC_THRESHOLD,         ctrl_set_static_thresh},
+  {VP9E_SET_TILE_COLUMNS,             ctrl_set_tile_columns},
+  {VP9E_SET_TILE_ROWS,                ctrl_set_tile_rows},
+  {VP8E_SET_ARNR_MAXFRAMES,           ctrl_set_arnr_max_frames},
+  {VP8E_SET_ARNR_STRENGTH,            ctrl_set_arnr_strength},
+  {VP8E_SET_ARNR_TYPE,                ctrl_set_arnr_type},
+  {VP8E_SET_TUNING,                   ctrl_set_tuning},
+  {VP8E_SET_CQ_LEVEL,                 ctrl_set_cq_level},
+  {VP8E_SET_MAX_INTRA_BITRATE_PCT,    ctrl_set_rc_max_intra_bitrate_pct},
+  {VP9E_SET_MAX_INTER_BITRATE_PCT,    ctrl_set_rc_max_inter_bitrate_pct},
+  {VP9E_SET_GF_CBR_BOOST_PCT,         ctrl_set_rc_gf_cbr_boost_pct},
+  {VP9E_SET_LOSSLESS,                 ctrl_set_lossless},
+  {VP9E_SET_FRAME_PARALLEL_DECODING,  ctrl_set_frame_parallel_decoding_mode},
+  {VP9E_SET_AQ_MODE,                  ctrl_set_aq_mode},
+  {VP9E_SET_FRAME_PERIODIC_BOOST,     ctrl_set_frame_periodic_boost},
+  {VP9E_SET_SVC,                      ctrl_set_svc},
+  {VP9E_SET_SVC_PARAMETERS,           ctrl_set_svc_parameters},
+  {VP9E_REGISTER_CX_CALLBACK,         ctrl_register_cx_callback},
+  {VP9E_SET_SVC_LAYER_ID,             ctrl_set_svc_layer_id},
+  {VP9E_SET_TUNE_CONTENT,             ctrl_set_tune_content},
+  {VP9E_SET_COLOR_SPACE,              ctrl_set_color_space},
+  {VP9E_SET_COLOR_RANGE,              ctrl_set_color_range},
+  {VP9E_SET_NOISE_SENSITIVITY,        ctrl_set_noise_sensitivity},
+  {VP9E_SET_MIN_GF_INTERVAL,          ctrl_set_min_gf_interval},
+  {VP9E_SET_MAX_GF_INTERVAL,          ctrl_set_max_gf_interval},
+  {VP9E_SET_SVC_REF_FRAME_CONFIG,     ctrl_set_svc_ref_frame_config},
+  {VP9E_SET_RENDER_SIZE,              ctrl_set_render_size},
+
+  // Getters
+  {VP8E_GET_LAST_QUANTIZER,           ctrl_get_quantizer},
+  {VP8E_GET_LAST_QUANTIZER_64,        ctrl_get_quantizer64},
+  {VP9_GET_REFERENCE,                 ctrl_get_reference},
+  {VP9E_GET_SVC_LAYER_ID,             ctrl_get_svc_layer_id},
+  {VP9E_GET_ACTIVEMAP,                ctrl_get_active_map},
+
+  // End-of-table sentinel.
+  { -1, NULL},
+};
+
+// Default encoder configuration table. It holds a single entry, for usage 0;
+// each initializer below is labelled with the configuration field it is
+// meant to set (the initializers are positional, so order matters).
+static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
+  {
+    0,
+    {  // NOLINT
+      0,                  // g_usage
+      8,                  // g_threads
+      0,                  // g_profile
+
+      320,                // g_width
+      240,                // g_height
+      VPX_BITS_8,         // g_bit_depth
+      8,                  // g_input_bit_depth
+
+      {1, 30},            // g_timebase
+
+      0,                  // g_error_resilient
+
+      VPX_RC_ONE_PASS,    // g_pass
+
+      25,                 // g_lag_in_frames
+
+      0,                  // rc_dropframe_thresh
+      0,                  // rc_resize_allowed
+      0,                  // rc_scaled_width
+      0,                  // rc_scaled_height
+      60,                 // rc_resize_down_threshold
+      30,                 // rc_resize_up_threshold
+
+      VPX_VBR,            // rc_end_usage
+      {NULL, 0},          // rc_twopass_stats_in
+      {NULL, 0},          // rc_firstpass_mb_stats_in
+      256,                // rc_target_bandwidth
+      0,                  // rc_min_quantizer
+      63,                 // rc_max_quantizer
+      25,                 // rc_undershoot_pct
+      25,                 // rc_overshoot_pct
+
+      6000,               // rc_max_buffer_size
+      4000,               // rc_buffer_initial_size
+      5000,               // rc_buffer_optimal_size
+
+      50,                 // rc_two_pass_vbrbias
+      0,                  // rc_two_pass_vbrmin_section
+      2000,               // rc_two_pass_vbrmax_section
+
+      // keyframing settings (kf)
+      VPX_KF_AUTO,        // g_kfmode
+      0,                  // kf_min_dist
+      9999,               // kf_max_dist
+
+      VPX_SS_DEFAULT_LAYERS,  // ss_number_layers
+      {0},                    // ss_enable_auto_alt_ref? (TODO confirm name)
+      {0},                    // ss_target_bitrate
+      1,                      // ts_number_layers
+      {0},                    // ts_target_bitrate
+      {0},                    // ts_rate_decimator
+      0,                      // ts_periodicity
+      {0},                    // ts_layer_id
+      {0},                  // layer_target_bitrate
+      0                     // temporal_layering_mode
+    }
+  },
+};
+
+// Fall back to an empty VERSION_STRING so that the string-literal
+// concatenation in the interface name below still compiles when no version
+// string has been defined.
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+// Interface descriptor for the VP9 encoder: name, ABI version, capability
+// flags, init/destroy hooks, the control table, an all-NULL decoder section
+// and the encoder entry points defined in this file.
+CODEC_INTERFACE(vpx_codec_vp9_cx) = {
+  "WebM Project VP9 Encoder" VERSION_STRING,
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+#if CONFIG_VP9_HIGHBITDEPTH
+  VPX_CODEC_CAP_HIGHBITDEPTH |
+#endif
+  VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,  // vpx_codec_caps_t
+  encoder_init,       // vpx_codec_init_fn_t
+  encoder_destroy,    // vpx_codec_destroy_fn_t
+  encoder_ctrl_maps,  // vpx_codec_ctrl_fn_map_t
+  {  // NOLINT
+    // Decoder entry points: none are provided by the encoder interface.
+    NULL,  // vpx_codec_peek_si_fn_t
+    NULL,  // vpx_codec_get_si_fn_t
+    NULL,  // vpx_codec_decode_fn_t
+    NULL,  // vpx_codec_frame_get_fn_t
+    NULL   // vpx_codec_set_fb_fn_t
+  },
+  {  // NOLINT
+    1,                      // 1 cfg map
+    encoder_usage_cfg_map,  // vpx_codec_enc_cfg_map_t
+    encoder_encode,         // vpx_codec_encode_fn_t
+    encoder_get_cxdata,     // vpx_codec_get_cx_data_fn_t
+    encoder_set_config,     // vpx_codec_enc_config_set_fn_t
+    NULL,        // vpx_codec_get_global_headers_fn_t
+    encoder_get_preview,    // vpx_codec_get_preview_frame_fn_t
+    NULL         // vpx_codec_enc_mr_get_mem_loc_fn_t
+  }
+};
diff --git a/libs/libvpx/vp9/vp9_dx_iface.c b/libs/libvpx/vp9/vp9_dx_iface.c
new file mode 100644
index 0000000000..be5d1600a5
--- /dev/null
+++ b/libs/libvpx/vp9/vp9_dx_iface.c
@@ -0,0 +1,1084 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_version.h"
+
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_decoder.h"
+#include "vpx_dsp/bitreader_buffer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/vpx_thread.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_frame_buffers.h"
+
+#include "vp9/decoder/vp9_decodeframe.h"
+
+#include "vp9/vp9_dx_iface.h"
+#include "vp9/vp9_iface_common.h"
+
+// Capability bit for post-processing: VPX_CODEC_CAP_POSTPROC when the build
+// has CONFIG_VP9_POSTPROC enabled (non-zero), 0 otherwise.
+#define VP9_CAP_POSTPROC (CONFIG_VP9_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
+
+// Decoder init hook. On the first call it attaches a zero-initialised
+// vpx_codec_alg_priv_t to |ctx|; when ctx->priv is already set it does
+// nothing. |data| is unused.
+//
+// Returns VPX_CODEC_MEM_ERROR if the private structure cannot be allocated,
+// otherwise VPX_CODEC_OK.
+static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
+                                    vpx_codec_priv_enc_mr_cfg_t *data) {
+  vpx_codec_alg_priv_t *alg_priv;
+  (void)data;
+
+  // Already initialised: nothing left to do.
+  if (ctx->priv)
+    return VPX_CODEC_OK;
+
+  // Only the vpx_codec_alg_priv_t structure is allocated here. More memory
+  // may be required at the time the stream information becomes known.
+  alg_priv = (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*alg_priv));
+  if (alg_priv == NULL)
+    return VPX_CODEC_MEM_ERROR;
+
+  ctx->priv = (vpx_codec_priv_t *)alg_priv;
+  ctx->priv->init_flags = ctx->init_flags;
+  alg_priv->si.sz = sizeof(alg_priv->si);
+  alg_priv->flushed = 0;
+
+  // Frame parallel decode is only used when it was requested through the
+  // init flags and more than one thread is configured.
+  alg_priv->frame_parallel_decode = 0;
+  if (ctx->config.dec && (ctx->config.dec->threads > 1) &&
+      (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING)) {
+    alg_priv->frame_parallel_decode = 1;
+  }
+
+  // Keep a private copy of the caller's decoder configuration and make the
+  // context refer to that copy from now on.
+  if (ctx->config.dec) {
+    alg_priv->cfg = *ctx->config.dec;
+    ctx->config.dec = &alg_priv->cfg;
+  }
+
+  return VPX_CODEC_OK;
+}
+
+// Destroy hook: releases every frame worker, the shared buffer pool and
+// finally the private context itself. Always returns VPX_CODEC_OK.
+//
+// NOTE(review): the loop dereferences data1 and pbi for all
+// num_frame_workers entries, so it assumes init_decoder() finished building
+// every worker. init_decoder() has early-return paths part-way through its
+// loop -- confirm a context in that state cannot reach this function.
+static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
+  if (ctx->frame_workers != NULL) {
+    int i;
+    for (i = 0; i < ctx->num_frame_workers; ++i) {
+      VPxWorker *const worker = &ctx->frame_workers[i];
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      // end() the worker before freeing the decoder state it operates on.
+      vpx_get_worker_interface()->end(worker);
+      vp9_remove_common(&frame_worker_data->pbi->common);
+#if CONFIG_VP9_POSTPROC
+      vp9_free_postproc_buffers(&frame_worker_data->pbi->common);
+#endif
+      vp9_decoder_remove(frame_worker_data->pbi);
+      vpx_free(frame_worker_data->scratch_buffer);
+#if CONFIG_MULTITHREAD
+      pthread_mutex_destroy(&frame_worker_data->stats_mutex);
+      pthread_cond_destroy(&frame_worker_data->stats_cond);
+#endif
+      vpx_free(frame_worker_data);
+    }
+#if CONFIG_MULTITHREAD
+    // NOTE(review): the pool mutex is only destroyed when frame_workers is
+    // non-NULL, although init_decoder() creates it before allocating the
+    // workers.
+    pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+  }
+
+  // Reference and internal frame buffers belong to the pool; free them
+  // before the pool structure itself is released below.
+  if (ctx->buffer_pool) {
+    vp9_free_ref_frame_buffers(ctx->buffer_pool);
+    vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
+  }
+
+  vpx_free(ctx->frame_workers);
+  vpx_free(ctx->buffer_pool);
+  vpx_free(ctx);
+  return VPX_CODEC_OK;
+}
+
+// Advances |rb| past the bit-depth, colour-space and sampling fields of a
+// frame header. Only the 3-bit colour space is actually read; every other
+// field is skipped by bumping rb->bit_offset.
+//
+// Returns 1 on success, or 0 when the colour space is VPX_CS_SRGB but the
+// profile is neither PROFILE_1 nor PROFILE_3.
+static int parse_bitdepth_colorspace_sampling(
+    BITSTREAM_PROFILE profile, struct vpx_read_bit_buffer *rb) {
+  const int odd_profile = (profile == PROFILE_1 || profile == PROFILE_3);
+  vpx_color_space_t cs;
+
+  if (profile >= PROFILE_2)
+    ++rb->bit_offset;  // Bit-depth 10 or 12.
+
+  cs = (vpx_color_space_t)vpx_rb_read_literal(rb, 3);
+
+  if (cs == VPX_CS_SRGB) {
+    // RGB is only available in version 1.
+    if (!odd_profile)
+      return 0;
+    rb->bit_offset += 1;  // unused
+    return 1;
+  }
+
+  rb->bit_offset += 1;  // [16,235] (including xvycc) vs [0,255] range.
+  if (odd_profile)
+    rb->bit_offset += 2 + 1;  // subsampling x/y, then one unused bit.
+  return 1;
+}
+
+// Parses just enough of a VP9 frame header to fill in |si|.
+//
+//   data, data_sz   compressed frame bytes.
+//   si              receives is_kf and, for key frames and intra-only
+//                   frames, w and h; w and h are 0 otherwise.
+//   is_intra_only   optional (may be NULL); receives the intra-only flag.
+//   decrypt_cb      optional; when set, the leading bytes are decrypted into
+//                   a local buffer and the header is parsed from there.
+//   decrypt_state   opaque argument handed to decrypt_cb.
+//
+// Returns VPX_CODEC_INVALID_PARAM for an empty or wrapping buffer,
+// VPX_CODEC_UNSUP_BITSTREAM when the header is not an acceptable VP9 header
+// or the buffer is too short to hold one, and VPX_CODEC_OK otherwise.
+static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
+                                                unsigned int data_sz,
+                                                vpx_codec_stream_info_t *si,
+                                                int *is_intra_only,
+                                                vpx_decrypt_cb decrypt_cb,
+                                                void *decrypt_state) {
+  int intra_only_flag = 0;
+  // Worst-case header prefix read below, in bits (intra-only frame, profile
+  // 3): 2 marker + 3 profile + 1 show-existing + 3 frame flags + 1
+  // intra-only + 2 reset-context + 24 sync + 8 bit-depth/colour/sampling + 8
+  // refresh flags + 32 frame size = 84 bits, i.e. 11 bytes. (Assumes
+  // vp9_read_profile reads at most 3 bits, vp9_read_sync_code 24 and
+  // vp9_read_frame_size 32 -- TODO confirm against those helpers.)
+  uint8_t clear_buffer[11];
+
+  if (data + data_sz <= data)
+    return VPX_CODEC_INVALID_PARAM;
+
+  si->is_kf = 0;
+  si->w = si->h = 0;
+
+  if (decrypt_cb) {
+    // Only the header prefix is needed, so decrypt at most that many bytes.
+    data_sz = VPXMIN(sizeof(clear_buffer), data_sz);
+    decrypt_cb(decrypt_state, data, clear_buffer, data_sz);
+    data = clear_buffer;
+  }
+
+  {
+    int show_frame;
+    int error_resilient;
+    // The bit reader is created without an error handler, so this function
+    // itself must make sure it never reads past data + data_sz.
+    struct vpx_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+    const int frame_marker = vpx_rb_read_literal(&rb, 2);
+    const BITSTREAM_PROFILE profile = vp9_read_profile(&rb);
+
+    if (frame_marker != VP9_FRAME_MARKER)
+      return VPX_CODEC_UNSUP_BITSTREAM;
+
+    if (profile >= MAX_PROFILES)
+      return VPX_CODEC_UNSUP_BITSTREAM;
+
+    if ((profile >= 2 && data_sz <= 1) || data_sz < 1)
+      return VPX_CODEC_UNSUP_BITSTREAM;
+
+    if (vpx_rb_read_bit(&rb)) {  // show an existing frame
+      vpx_rb_read_literal(&rb, 3);  // Frame buffer to show.
+      return VPX_CODEC_OK;
+    }
+
+    // Everything parsed from here on fits in the worst-case prefix computed
+    // above; refuse anything shorter instead of running off the buffer.
+    if (data_sz < sizeof(clear_buffer))
+      return VPX_CODEC_UNSUP_BITSTREAM;
+
+    si->is_kf = !vpx_rb_read_bit(&rb);
+    show_frame = vpx_rb_read_bit(&rb);
+    error_resilient = vpx_rb_read_bit(&rb);
+
+    if (si->is_kf) {
+      if (!vp9_read_sync_code(&rb))
+        return VPX_CODEC_UNSUP_BITSTREAM;
+
+      if (!parse_bitdepth_colorspace_sampling(profile, &rb))
+        return VPX_CODEC_UNSUP_BITSTREAM;
+      vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
+    } else {
+      // The intra-only bit is only present when the frame is not shown.
+      intra_only_flag = show_frame ? 0 : vpx_rb_read_bit(&rb);
+
+      rb.bit_offset += error_resilient ? 0 : 2;  // reset_frame_context
+
+      if (intra_only_flag) {
+        if (!vp9_read_sync_code(&rb))
+          return VPX_CODEC_UNSUP_BITSTREAM;
+        if (profile > PROFILE_0) {
+          if (!parse_bitdepth_colorspace_sampling(profile, &rb))
+            return VPX_CODEC_UNSUP_BITSTREAM;
+        }
+        rb.bit_offset += REF_FRAMES;  // refresh_frame_flags
+        vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
+      }
+    }
+  }
+  if (is_intra_only != NULL)
+    *is_intra_only = intra_only_flag;
+  return VPX_CODEC_OK;
+}
+
+// Public peek hook: runs decoder_peek_si_internal() on the raw bytes with no
+// intra-only output and no decryption callback, and returns its result.
+static vpx_codec_err_t decoder_peek_si(const uint8_t *data,
+                                       unsigned int data_sz,
+                                       vpx_codec_stream_info_t *si) {
+  const vpx_codec_err_t status =
+      decoder_peek_si_internal(data, data_sz, si, NULL, NULL, NULL);
+
+  return status;
+}
+
+// Copies the decoder's cached stream info into |si|.
+//
+// The number of bytes copied is sizeof(vp9_stream_info_t) when the caller's
+// si->sz is at least that large, and sizeof(vpx_codec_stream_info_t)
+// otherwise; si->sz is then set to the number of bytes copied.
+// Always returns VPX_CODEC_OK.
+static vpx_codec_err_t decoder_get_si(vpx_codec_alg_priv_t *ctx,
+                                      vpx_codec_stream_info_t *si) {
+  size_t copy_size = sizeof(vpx_codec_stream_info_t);
+
+  if (si->sz >= sizeof(vp9_stream_info_t))
+    copy_size = sizeof(vp9_stream_info_t);
+
+  memcpy(si, &ctx->si, copy_size);
+  si->sz = (unsigned int)copy_size;
+
+  return VPX_CODEC_OK;
+}
+
+// Records |error| as the context's error detail string. The pointer is
+// stored as-is (no copy is made); passing NULL clears the detail.
+static void set_error_detail(vpx_codec_alg_priv_t *ctx,
+                             const char *const error) {
+  ctx->base.err_detail = error;
+}
+
+// Propagates an internal error record to the codec context.
+//
+// When |error| carries a non-zero error code, the context's error detail is
+// set to the record's detail text, or to NULL if the record has none. The
+// error code is returned in every case.
+static vpx_codec_err_t update_error_state(
+    vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) {
+  const vpx_codec_err_t code = error->error_code;
+
+  if (code) {
+    const char *detail = NULL;
+    if (error->has_detail)
+      detail = error->detail;
+    set_error_detail(ctx, detail);
+  }
+
+  return code;
+}
+
+// For every frame worker, resets a few per-decoder fields from |ctx| and
+// installs the frame-buffer callbacks on the worker's buffer pool.
+//
+// The application's callbacks are used only when both a get and a release
+// callback were supplied; otherwise the internal vp9_get_frame_buffer /
+// vp9_release_frame_buffer pair is installed and the internal frame buffers
+// are allocated (an allocation failure is reported via vpx_internal_error).
+static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
+  int worker_idx;
+
+  for (worker_idx = 0; worker_idx < ctx->num_frame_workers; ++worker_idx) {
+    FrameWorkerData *const fwd =
+        (FrameWorkerData *)ctx->frame_workers[worker_idx].data1;
+    VP9_COMMON *const cm = &fwd->pbi->common;
+    BufferPool *const pool = cm->buffer_pool;
+
+    cm->new_fb_idx = INVALID_IDX;
+    cm->byte_alignment = ctx->byte_alignment;
+    cm->skip_loop_filter = ctx->skip_loop_filter;
+
+    if (ctx->get_ext_fb_cb == NULL || ctx->release_ext_fb_cb == NULL) {
+      // No complete external pair: fall back to the internal buffers.
+      pool->get_fb_cb = vp9_get_frame_buffer;
+      pool->release_fb_cb = vp9_release_frame_buffer;
+
+      if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to initialize internal frame buffers");
+
+      pool->cb_priv = &pool->int_frame_buffers;
+      continue;
+    }
+
+    // Application-provided frame buffer callbacks.
+    pool->get_fb_cb = ctx->get_ext_fb_cb;
+    pool->release_fb_cb = ctx->release_ext_fb_cb;
+    pool->cb_priv = ctx->ext_priv;
+  }
+}
+
+// Fills |cfg| with the default post-processing settings: deblocking and
+// demacroblocking enabled, deblocking level 4, noise level 0.
+static void set_default_ppflags(vp8_postproc_cfg_t *cfg) {
+  cfg->noise_level = 0;
+  cfg->deblocking_level = 4;
+  cfg->post_proc_flag = VP8_DEBLOCK | VP8_DEMACROBLOCK;
+}
+
+// Copies the post-processing settings stored in the decoder context
+// (flag word, deblocking level, noise level) into |flags|.
+static void set_ppflags(const vpx_codec_alg_priv_t *ctx,
+                        vp9_ppflags_t *flags) {
+  const vp8_postproc_cfg_t *const src = &ctx->postproc_cfg;
+
+  flags->post_proc_flag = src->post_proc_flag;
+  flags->deblocking_level = src->deblocking_level;
+  flags->noise_level = src->noise_level;
+}
+
+// Worker hook run for each submitted frame. |arg1| is the worker's
+// FrameWorkerData; |arg2| is unused.
+//
+// Decodes the buffer described by ->data / ->data_size through
+// vp9_receive_compressed_data(), storing its return code in ->result and the
+// updated data pointer in ->data_end.
+//
+// Returns non-zero when ->result is 0. In frame-parallel mode a failed or
+// incomplete decode returns 0 after marking the current buffer corrupted,
+// requesting a resync and signalling the worker's stats condition.
+static int frame_worker_hook(void *arg1, void *arg2) {
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
+  const uint8_t *data = frame_worker_data->data;
+  (void)arg2;
+
+  // |data| is passed by address and is read back below as the end position.
+  frame_worker_data->result =
+      vp9_receive_compressed_data(frame_worker_data->pbi,
+                                  frame_worker_data->data_size,
+                                  &data);
+  frame_worker_data->data_end = data;
+
+  if (frame_worker_data->pbi->frame_parallel_decode) {
+    // In frame parallel decoding, a worker thread must successfully decode all
+    // the compressed data.
+    if (frame_worker_data->result != 0 ||
+        frame_worker_data->data + frame_worker_data->data_size - 1 > data) {
+      VPxWorker *const worker = frame_worker_data->pbi->frame_worker_owner;
+      BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool;
+      // Signal all the other threads that are waiting for this frame.
+      vp9_frameworker_lock_stats(worker);
+      frame_worker_data->frame_context_ready = 1;
+      // The corrupted flag is written while holding the buffer pool lock.
+      lock_buffer_pool(pool);
+      frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
+      unlock_buffer_pool(pool);
+      frame_worker_data->pbi->need_resync = 1;
+      vp9_frameworker_signal_stats(worker);
+      vp9_frameworker_unlock_stats(worker);
+      return 0;
+    }
+  } else if (frame_worker_data->result != 0) {
+    // Check decode result in serial decode.
+    frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
+    frame_worker_data->pbi->need_resync = 1;
+  }
+  return !frame_worker_data->result;
+}
+
+// Builds the decoder's runtime state: resets the bookkeeping fields in
+// |ctx|, allocates the shared buffer pool and the frame workers (one per
+// configured thread in frame-parallel mode, capped at MAX_DECODE_THREADS;
+// exactly one otherwise), creates a VP9 decoder instance per worker and
+// installs the frame-buffer callbacks.
+//
+// Returns VPX_CODEC_MEM_ERROR on any allocation or thread-primitive failure,
+// otherwise VPX_CODEC_OK.
+//
+// NOTE(review): the failure paths return immediately without releasing what
+// was already set up, and frame_workers comes from vpx_malloc (not zeroed),
+// so after a mid-loop failure the remaining entries are uninitialised.
+// Confirm how callers and decoder_destroy() cope with that state.
+static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
+  int i;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+
+  ctx->last_show_frame = -1;
+  ctx->next_submit_worker_id = 0;
+  ctx->last_submit_worker_id = 0;
+  ctx->next_output_worker_id = 0;
+  ctx->frame_cache_read = 0;
+  ctx->frame_cache_write = 0;
+  ctx->num_cache_frames = 0;
+  ctx->need_resync = 1;
+  // One worker per configured thread in frame-parallel mode, else a single
+  // worker; the count is clamped to MAX_DECODE_THREADS.
+  ctx->num_frame_workers =
+      (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads: 1;
+  if (ctx->num_frame_workers > MAX_DECODE_THREADS)
+    ctx->num_frame_workers = MAX_DECODE_THREADS;
+  ctx->available_threads = ctx->num_frame_workers;
+  ctx->flushed = 0;
+
+  ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
+  if (ctx->buffer_pool == NULL)
+    return VPX_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+    if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
+      set_error_detail(ctx, "Failed to allocate buffer pool mutex");
+      return VPX_CODEC_MEM_ERROR;
+    }
+#endif
+
+  ctx->frame_workers = (VPxWorker *)
+      vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
+  if (ctx->frame_workers == NULL) {
+    set_error_detail(ctx, "Failed to allocate frame_workers");
+    return VPX_CODEC_MEM_ERROR;
+  }
+
+  for (i = 0; i < ctx->num_frame_workers; ++i) {
+    VPxWorker *const worker = &ctx->frame_workers[i];
+    FrameWorkerData *frame_worker_data = NULL;
+    winterface->init(worker);
+    worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
+    if (worker->data1 == NULL) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data");
+      return VPX_CODEC_MEM_ERROR;
+    }
+    frame_worker_data = (FrameWorkerData *)worker->data1;
+    // Every worker's decoder instance is created on the same buffer pool.
+    frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
+    if (frame_worker_data->pbi == NULL) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data");
+      return VPX_CODEC_MEM_ERROR;
+    }
+    frame_worker_data->pbi->frame_worker_owner = worker;
+    frame_worker_data->worker_id = i;
+    frame_worker_data->scratch_buffer = NULL;
+    frame_worker_data->scratch_buffer_size = 0;
+    frame_worker_data->frame_context_ready = 0;
+    frame_worker_data->received_frame = 0;
+#if CONFIG_MULTITHREAD
+    if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
+      return VPX_CODEC_MEM_ERROR;
+    }
+
+    if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
+      set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
+      return VPX_CODEC_MEM_ERROR;
+    }
+#endif
+    // If decoding in serial mode, FrameWorker thread could create tile worker
+    // thread or loopfilter thread.
+    frame_worker_data->pbi->max_threads =
+        (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
+
+    frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+    frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+    frame_worker_data->pbi->common.frame_parallel_decode =
+        ctx->frame_parallel_decode;
+    worker->hook = (VPxWorkerHook)frame_worker_hook;
+    if (!winterface->reset(worker)) {
+      set_error_detail(ctx, "Frame Worker thread creation failed");
+      return VPX_CODEC_MEM_ERROR;
+    }
+  }
+
+  // If postprocessing was enabled by the application and a
+  // configuration has not been provided, default it.
+  if (!ctx->postproc_cfg_set &&
+      (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
+    set_default_ppflags(&ctx->postproc_cfg);
+
+  init_buffer_callbacks(ctx);
+
+  return VPX_CODEC_OK;
+}
+
+// Clears the context's resync request once a worker has produced a key
+// frame or an intra-only frame and is not itself asking for a resync.
+// Leaves ctx->need_resync untouched in every other case.
+static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
+                                const VP9Decoder *const pbi) {
+  const int got_intra_frame =
+      pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME;
+
+  // Nothing to clear, or the worker still needs a resync of its own.
+  if (ctx->need_resync != 1 || pbi->need_resync != 0)
+    return;
+
+  if (got_intra_frame)
+    ctx->need_resync = 0;
+}
+
+// Submits one compressed frame for decoding.
+//
+//   data       in/out pointer to the frame bytes. In serial mode it is
+//              advanced to where the decoder stopped; in frame-parallel mode
+//              it is left unchanged (the bytes are copied to the worker).
+//   data_sz    number of bytes in the frame.
+//   user_priv  opaque pointer stored with the frame on the worker.
+//   deadline   unused.
+//
+// Serial mode runs the single worker to completion and returns its error
+// state. Frame-parallel mode hands the frame to the next worker, launches it
+// and returns without waiting for the result.
+static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
+                                  const uint8_t **data, unsigned int data_sz,
+                                  void *user_priv, int64_t deadline) {
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  (void)deadline;
+
+  // Determine the stream parameters. Note that we rely on peek_si to
+  // validate that we have a buffer that does not wrap around the top
+  // of the heap.
+  if (!ctx->si.h) {
+    int is_intra_only = 0;
+    const vpx_codec_err_t res =
+        decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only,
+                                 ctx->decrypt_cb, ctx->decrypt_state);
+    if (res != VPX_CODEC_OK)
+      return res;
+
+    // Until a frame size is known, only key frames and intra-only frames
+    // are accepted.
+    if (!ctx->si.is_kf && !is_intra_only)
+      return VPX_CODEC_ERROR;
+  }
+
+  if (!ctx->frame_parallel_decode) {
+    // Serial mode always uses the first (only) worker.
+    VPxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    frame_worker_data->data = *data;
+    frame_worker_data->data_size = data_sz;
+    frame_worker_data->user_priv = user_priv;
+    frame_worker_data->received_frame = 1;
+
+    // Set these even if already initialized.  The caller may have changed the
+    // decrypt config between frames.
+    frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
+    frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
+
+    worker->had_error = 0;
+    winterface->execute(worker);
+
+    // Update data pointer after decode.
+    *data = frame_worker_data->data_end;
+
+    if (worker->had_error)
+      return update_error_state(ctx, &frame_worker_data->pbi->common.error);
+
+    check_resync(ctx, frame_worker_data->pbi);
+  } else {
+    VPxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    // Copy context from last worker thread to next worker thread.
+    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+      vp9_frameworker_copy_context(
+          &ctx->frame_workers[ctx->next_submit_worker_id],
+          &ctx->frame_workers[ctx->last_submit_worker_id]);
+
+    frame_worker_data->pbi->ready_for_new_data = 0;
+    // Copy the compressed data into worker's internal buffer.
+    // TODO(hkuang): Will all the workers allocate the same size
+    // as the size of the first intra frame be better? This will
+    // avoid too many deallocate and allocate.
+    if (frame_worker_data->scratch_buffer_size < data_sz) {
+      frame_worker_data->scratch_buffer =
+          (uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz);
+      if (frame_worker_data->scratch_buffer == NULL) {
+        set_error_detail(ctx, "Failed to reallocate scratch buffer");
+        return VPX_CODEC_MEM_ERROR;
+      }
+      frame_worker_data->scratch_buffer_size = data_sz;
+    }
+    frame_worker_data->data_size = data_sz;
+    memcpy(frame_worker_data->scratch_buffer, *data, data_sz);
+
+    frame_worker_data->frame_decoded = 0;
+    frame_worker_data->frame_context_ready = 0;
+    frame_worker_data->received_frame = 1;
+    // The worker decodes from its private copy, not from the caller's bytes.
+    frame_worker_data->data = frame_worker_data->scratch_buffer;
+    frame_worker_data->user_priv = user_priv;
+
+    // Advance both submit indices round-robin; last_submit only moves once
+    // it has fallen behind next_submit.
+    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+      ctx->last_submit_worker_id =
+          (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers;
+
+    ctx->next_submit_worker_id =
+        (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers;
+    --ctx->available_threads;
+    worker->had_error = 0;
+    winterface->launch(worker);
+  }
+
+  return VPX_CODEC_OK;
+}
+
+// Waits for the next output worker to finish, returns its slot to the pool
+// of available threads and, if the worker yields a raw frame, appends that
+// frame to the context's frame cache.
+//
+// The cache write index wraps at FRAME_CACHE_SIZE; this function does not
+// itself check that the cache has room.
+static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
+  YV12_BUFFER_CONFIG sd;
+  vp9_ppflags_t flags = {0, 0, 0};
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  // Workers are drained in round-robin order.
+  ctx->next_output_worker_id =
+      (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+  // TODO(hkuang): Add worker error handling here.
+  winterface->sync(worker);
+  frame_worker_data->received_frame = 0;
+  ++ctx->available_threads;
+
+  check_resync(ctx, frame_worker_data->pbi);
+
+  // A return value of 0 from vp9_get_raw_frame() means a frame is available.
+  if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
+    VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+    ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx;
+    yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd,
+                    frame_worker_data->user_priv);
+    ctx->frame_cache[ctx->frame_cache_write].img.fb_priv =
+        frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+    ctx->frame_cache_write =
+        (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;
+    ++ctx->num_cache_frames;
+  }
+}
+
+static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
+                                      const uint8_t *data, unsigned int data_sz,
+                                      void *user_priv, long deadline) {
+  const uint8_t *data_start = data;
+  const uint8_t * const data_end = data + data_sz;
+  vpx_codec_err_t res;
+  uint32_t frame_sizes[8];  // Filled by vp9_parse_superframe_index() below.
+  int frame_count;  // Likewise; > 0 selects the per-frame loops below.
+  // A NULL buffer with size 0 is a flush request: record it and return.
+  if (data == NULL && data_sz == 0) {
+    ctx->flushed = 1;
+    return VPX_CODEC_OK;
+  }
+
+  // Reset flushed when receiving a valid frame.
+  ctx->flushed = 0;
+
+  // Initialize the decoder workers on the first frame.
+  if (ctx->frame_workers == NULL) {
+    const vpx_codec_err_t res = init_decoder(ctx);  // Shadows the outer res.
+    if (res != VPX_CODEC_OK)
+      return res;
+  }
+
+  res = vp9_parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
+                                   ctx->decrypt_cb, ctx->decrypt_state);
+  if (res != VPX_CODEC_OK)
+    return res;
+
+  if (ctx->frame_parallel_decode) {
+    // Decode in frame parallel mode. When decoding in this mode, the frame
+    // passed to the decoder must be either a normal frame or a superframe with
+    // superframe index so the decoder could get each frame's start position
+    // in the superframe.
+    if (frame_count > 0) {
+      int i;
+
+      for (i = 0; i < frame_count; ++i) {
+        const uint8_t *data_start_copy = data_start;
+        const uint32_t frame_size = frame_sizes[i];
+        if (data_start < data
+            || frame_size > (uint32_t) (data_end - data_start)) {
+          set_error_detail(ctx, "Invalid frame size in index");
+          return VPX_CODEC_CORRUPT_FRAME;
+        }
+
+        if (ctx->available_threads == 0) {
+          // No more threads for decoding. Wait until the next output worker
+          // finishes decoding. Then copy the decoded frame into cache.
+          if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+            wait_worker_and_cache_frame(ctx);
+          } else {
+            // TODO(hkuang): Add unit test to test this path.
+            set_error_detail(ctx, "Frame output cache is full.");
+            return VPX_CODEC_ERROR;
+          }
+        }
+
+        res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
+                         deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+        data_start += frame_size;
+      }
+    } else {
+      if (ctx->available_threads == 0) {
+        // No more threads for decoding. Wait until the next output worker
+        // finishes decoding. Then copy the decoded frame into cache.
+        if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+          wait_worker_and_cache_frame(ctx);
+        } else {
+          // TODO(hkuang): Add unit test to test this path.
+          set_error_detail(ctx, "Frame output cache is full.");
+          return VPX_CODEC_ERROR;
+        }
+      }
+
+      res = decode_one(ctx, &data, data_sz, user_priv, deadline);
+      if (res != VPX_CODEC_OK)
+        return res;
+    }
+  } else {
+    // Decode in serial mode.
+    if (frame_count > 0) {
+      int i;
+
+      for (i = 0; i < frame_count; ++i) {
+        const uint8_t *data_start_copy = data_start;
+        const uint32_t frame_size = frame_sizes[i];
+        vpx_codec_err_t res;  // Shadows the outer res.
+        if (data_start < data
+            || frame_size > (uint32_t) (data_end - data_start)) {
+          set_error_detail(ctx, "Invalid frame size in index");
+          return VPX_CODEC_CORRUPT_FRAME;
+        }
+
+        res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
+                         deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+
+        data_start += frame_size;
+      }
+    } else {
+      while (data_start < data_end) {
+        const uint32_t frame_size = (uint32_t) (data_end - data_start);
+        const vpx_codec_err_t res = decode_one(ctx, &data_start, frame_size,
+                                               user_priv, deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+
+        // Account for suboptimal termination by the encoder.
+        while (data_start < data_end) {
+          const uint8_t marker = read_marker(ctx->decrypt_cb,
+                                             ctx->decrypt_state, data_start);
+          if (marker)
+            break;
+          ++data_start;
+        }
+      }
+    }
+  }
+  // Every error path returned early, so the outer res is VPX_CODEC_OK here.
+  return res;
+}
+
+static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) {
+  BufferPool *const shared_pool = ctx->buffer_pool;
+  RefCntBuffer *const bufs = shared_pool->frame_bufs;
+  // Outside frame parallel mode, or with no frame output yet: nothing to do.
+  if (!ctx->frame_parallel_decode || ctx->last_show_frame < 0) return;
+  // Decrease the reference count of the last output frame under the lock.
+  lock_buffer_pool(shared_pool);
+  decrease_ref_count(ctx->last_show_frame, bufs, shared_pool);
+  unlock_buffer_pool(shared_pool);
+}
+
+static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
+                                      vpx_codec_iter_t *iter) {
+  vpx_image_t *img = NULL;
+  // Returns the next image that can be output, or NULL when there is none.
+  // In frame parallel decode, a frame is only returned once every worker is
+  // busy or the application has flushed the decoder.
+  if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
+      !ctx->flushed) {
+    return NULL;
+  }
+
+  // Output the frames in the cache first.
+  if (ctx->num_cache_frames > 0) {
+    release_last_output_frame(ctx);
+    ctx->last_show_frame  = ctx->frame_cache[ctx->frame_cache_read].fb_idx;
+    if (ctx->need_resync)
+      return NULL;
+    img = &ctx->frame_cache[ctx->frame_cache_read].img;
+    ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE;
+    --ctx->num_cache_frames;
+    return img;
+  }
+
+  // iter acts as a flip flop, so an image is only returned on the first
+  // call to get_frame.
+  if (*iter == NULL && ctx->frame_workers != NULL) {
+    do {
+      YV12_BUFFER_CONFIG sd;
+      vp9_ppflags_t flags = {0, 0, 0};
+      const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+      VPxWorker *const worker =
+          &ctx->frame_workers[ctx->next_output_worker_id];
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      ctx->next_output_worker_id =
+          (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+      if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
+        set_ppflags(ctx, &flags);
+      // Wait for the frame from worker thread.
+      if (winterface->sync(worker)) {
+        // Check if worker has received any frames.
+        if (frame_worker_data->received_frame == 1) {
+          ++ctx->available_threads;
+          frame_worker_data->received_frame = 0;
+          check_resync(ctx, frame_worker_data->pbi);
+        }
+        if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
+          VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+          RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+          release_last_output_frame(ctx);
+          ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
+          if (ctx->need_resync)
+            return NULL;
+          yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
+          ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+          img = &ctx->img;
+          return img;
+        }
+      } else {
+        // Decoding failed. Release the worker thread.
+        frame_worker_data->received_frame = 0;
+        ++ctx->available_threads;
+        ctx->need_resync = 1;
+        if (ctx->flushed != 1)
+          return NULL;
+      }
+    } while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
+  }
+  return NULL;
+}
+
+static vpx_codec_err_t decoder_set_fb_fn(
+    vpx_codec_alg_priv_t *ctx,
+    vpx_get_frame_buffer_cb_fn_t cb_get,
+    vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  // Both callbacks are mandatory.
+  if (!cb_get || !cb_release)
+    return VPX_CODEC_INVALID_PARAM;
+  // Existing frame workers mean the decoder is already initialized; from
+  // then on changes to the frame buffer functions are refused.
+  if (ctx->frame_workers != NULL)
+    return VPX_CODEC_ERROR;
+
+  ctx->ext_priv = cb_priv;
+  ctx->release_ext_fb_cb = cb_release;
+  ctx->get_ext_fb_cb = cb_get;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  // VP8_SET_REFERENCE handler (serial decode only): args holds a
+  // vpx_ref_frame_t * whose image is passed to vp9_set_reference_dec().
+  vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *);
+  FrameWorkerData *frame_worker_data;
+  YV12_BUFFER_CONFIG sd;
+
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+  if (frame == NULL) return VPX_CODEC_INVALID_PARAM;
+  // frame_workers stays NULL until the first decode call creates the
+  // workers; fail cleanly instead of dereferencing it.
+  if (ctx->frame_workers == NULL) return VPX_CODEC_ERROR;
+
+  frame_worker_data = (FrameWorkerData *)ctx->frame_workers->data1;
+  image2yuvconfig(&frame->img, &sd);
+  return vp9_set_reference_dec(&frame_worker_data->pbi->common,
+                               (VP9_REFFRAME)frame->frame_type, &sd);
+}
+
+static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  // VP8_COPY_REFERENCE handler (serial decode only): args holds a
+  // vpx_ref_frame_t * whose image is passed to vp9_copy_reference_dec().
+  vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *);
+  FrameWorkerData *frame_worker_data;
+  YV12_BUFFER_CONFIG sd;
+
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+  if (frame == NULL) return VPX_CODEC_INVALID_PARAM;
+  // frame_workers stays NULL until the first decode call creates the
+  // workers; fail cleanly instead of dereferencing it.
+  if (ctx->frame_workers == NULL) return VPX_CODEC_ERROR;
+
+  frame_worker_data = (FrameWorkerData *)ctx->frame_workers->data1;
+  image2yuvconfig(&frame->img, &sd);
+  return vp9_copy_reference_dec(frame_worker_data->pbi,
+                                (VP9_REFFRAME)frame->frame_type, &sd);
+}
+
+static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  // VP9_GET_REFERENCE handler (serial decode only): args holds a
+  // vp9_ref_frame_t *; data->img is set up from the buffer for data->idx.
+  vp9_ref_frame_t *const data = va_arg(args, vp9_ref_frame_t *);
+  FrameWorkerData *frame_worker_data;
+  YV12_BUFFER_CONFIG *fb;
+
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+  if (data == NULL) return VPX_CODEC_INVALID_PARAM;
+  // frame_workers stays NULL until the first decode call creates the
+  // workers; fail cleanly instead of dereferencing it.
+  if (ctx->frame_workers == NULL) return VPX_CODEC_ERROR;
+  frame_worker_data = (FrameWorkerData *)ctx->frame_workers->data1;
+  fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx);
+  if (fb == NULL) return VPX_CODEC_ERROR;
+  yuvconfig2image(&data->img, fb, NULL);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx,
+                                         va_list args) {
+#if CONFIG_VP9_POSTPROC
+  // Store a copy of the caller's post-processing settings in the context.
+  const vp8_postproc_cfg_t *const new_cfg = va_arg(args, vp8_postproc_cfg_t *);
+
+  if (new_cfg == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+  ctx->postproc_cfg = *new_cfg;
+  ctx->postproc_cfg_set = 1;
+  return VPX_CODEC_OK;
+#else
+  // CONFIG_VP9_POSTPROC is off: the control is reported as unsupported.
+  (void)args;
+  (void)ctx;
+  return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t ctrl_set_dbg_options(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  // Shared stub for the VP8_SET_DBG_* controls: always reports INCAPABLE.
+  (void)args, (void)ctx;
+  return VPX_CODEC_INCAPABLE;
+}
+
+static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  // Reports the first worker's pbi->refresh_frame_flags through the int *
+  // carried in args. Only supported in serial decode.
+  int *const flags_out = va_arg(args, int *);
+  const FrameWorkerData *worker_data;
+
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
+  if (flags_out == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  // The workers are created by the first decode call; before that there is
+  // no decoder state to report.
+  if (ctx->frame_workers == NULL)
+    return VPX_CODEC_ERROR;
+
+  worker_data = (const FrameWorkerData *)ctx->frame_workers->data1;
+  *flags_out = worker_data->pbi->refresh_frame_flags;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  // Writes the corrupted flag of the last output frame buffer through the
+  // int * carried in args, using the first worker's decoder state.
+  int *const flag_out = va_arg(args, int *);
+  VP9_COMMON *cm;
+  RefCntBuffer *bufs;
+
+  if (flag_out == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+  if (ctx->frame_workers == NULL)
+    return VPX_CODEC_ERROR;
+
+  cm = &((FrameWorkerData *)ctx->frame_workers->data1)->pbi->common;
+  bufs = cm->buffer_pool->frame_bufs;
+  if (cm->frame_to_show == NULL)
+    return VPX_CODEC_ERROR;
+
+  // With no frame output so far (index < 0) *flag_out is left untouched.
+  if (ctx->last_show_frame >= 0)
+    *flag_out = bufs[ctx->last_show_frame].buf.corrupted;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  // Stores the first worker's cm->width and cm->height, in that order, in
+  // the int array carried in args. Only supported in serial decode.
+  int *const dims = va_arg(args, int *);
+  const FrameWorkerData *worker_data;
+  const VP9_COMMON *cm;
+
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
+  if (dims == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  // Without workers (no decode call yet) there is no VP9_COMMON to query.
+  if (ctx->frame_workers == NULL)
+    return VPX_CODEC_ERROR;
+
+  worker_data = (const FrameWorkerData *)ctx->frame_workers->data1;
+  cm = &worker_data->pbi->common;
+  dims[0] = cm->width;
+  dims[1] = cm->height;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  // Stores the first worker's cm->render_width and cm->render_height, in
+  // that order, in the int array carried in args. Serial decode only.
+  int *const dims = va_arg(args, int *);
+  const FrameWorkerData *worker_data;
+  const VP9_COMMON *cm;
+
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
+  if (dims == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+
+  // Without workers (no decode call yet) there is no VP9_COMMON to query.
+  if (ctx->frame_workers == NULL)
+    return VPX_CODEC_ERROR;
+
+  worker_data = (const FrameWorkerData *)ctx->frame_workers->data1;
+  cm = &worker_data->pbi->common;
+  dims[0] = cm->render_width;
+  dims[1] = cm->render_height;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  // VP9D_GET_BIT_DEPTH handler: writes cm->bit_depth of the next output
+  // worker through the unsigned int * carried in args.
+  unsigned int *const bit_depth = va_arg(args, unsigned int *);
+  const FrameWorkerData *frame_worker_data;
+
+  if (bit_depth == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+  // Test the array itself: the address of an element cannot tell whether
+  // the workers exist, and indexing a NULL frame_workers is undefined.
+  if (ctx->frame_workers == NULL)
+    return VPX_CODEC_ERROR;
+
+  frame_worker_data = (const FrameWorkerData *)
+      ctx->frame_workers[ctx->next_output_worker_id].data1;
+  *bit_depth = frame_worker_data->pbi->common.bit_depth;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  ctx->invert_tile_order = va_arg(args, int);  // Stored as-is; not validated.
+  return VPX_CODEC_OK;  // This control never fails.
+}
+
+static vpx_codec_err_t ctrl_set_decryptor(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  vpx_decrypt_init *const cfg = va_arg(args, vpx_decrypt_init *);
+  ctx->decrypt_cb = (cfg == NULL) ? NULL : cfg->decrypt_cb;  // NULL clears.
+  ctx->decrypt_state = (cfg == NULL) ? NULL : cfg->decrypt_state;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  // Accepts 0 (legacy alignment) or a power of two within [32, 1024].
+  enum { kLegacyAlignment = 0, kMinAlignment = 32, kMaxAlignment = 1024 };
+  const int requested = va_arg(args, int);
+
+  if (requested != kLegacyAlignment) {
+    if (requested < kMinAlignment || requested > kMaxAlignment)
+      return VPX_CODEC_INVALID_PARAM;
+    if ((requested & (requested - 1)) != 0)  // Not a power of two.
+      return VPX_CODEC_INVALID_PARAM;
+  }
+
+  ctx->byte_alignment = requested;
+  // NOTE(review): only the first worker is updated here - confirm intended.
+  if (ctx->frame_workers != NULL) {
+    FrameWorkerData *const first =
+        (FrameWorkerData *)ctx->frame_workers->data1;
+    first->pbi->common.byte_alignment = requested;
+  }
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  const int skip = va_arg(args, int);
+  VPxWorker *const first_worker = ctx->frame_workers;
+  // Remember the setting, and mirror it into the first worker if it exists.
+  ctx->skip_loop_filter = skip;
+  if (first_worker != NULL) {
+    FrameWorkerData *const fwd = (FrameWorkerData *)first_worker->data1;
+    fwd->pbi->common.skip_loop_filter = skip;
+  }
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
+  {VP8_COPY_REFERENCE,            ctrl_copy_reference},
+
+  // Setters (the four VP8_SET_DBG_* ids share one INCAPABLE stub).
+  {VP8_SET_REFERENCE,             ctrl_set_reference},
+  {VP8_SET_POSTPROC,              ctrl_set_postproc},
+  {VP8_SET_DBG_COLOR_REF_FRAME,   ctrl_set_dbg_options},
+  {VP8_SET_DBG_COLOR_MB_MODES,    ctrl_set_dbg_options},
+  {VP8_SET_DBG_COLOR_B_MODES,     ctrl_set_dbg_options},
+  {VP8_SET_DBG_DISPLAY_MV,        ctrl_set_dbg_options},
+  {VP9_INVERT_TILE_DECODE_ORDER,  ctrl_set_invert_tile_order},
+  {VPXD_SET_DECRYPTOR,            ctrl_set_decryptor},
+  {VP9_SET_BYTE_ALIGNMENT,        ctrl_set_byte_alignment},
+  {VP9_SET_SKIP_LOOP_FILTER,      ctrl_set_skip_loop_filter},
+
+  // Getters
+  {VP8D_GET_LAST_REF_UPDATES,     ctrl_get_last_ref_updates},
+  {VP8D_GET_FRAME_CORRUPTED,      ctrl_get_frame_corrupted},
+  {VP9_GET_REFERENCE,             ctrl_get_reference},
+  {VP9D_GET_DISPLAY_SIZE,         ctrl_get_render_size},
+  {VP9D_GET_BIT_DEPTH,            ctrl_get_bit_depth},
+  {VP9D_GET_FRAME_SIZE,           ctrl_get_frame_size},
+  // Last entry: id -1 with no handler (presumably the end-of-table marker).
+  { -1, NULL},
+};
+
+#ifndef VERSION_STRING  // Default to empty so the name literal below builds.
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp9_dx) = {
+  "WebM Project VP9 Decoder" VERSION_STRING,
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+  VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC |
+      VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER,  // vpx_codec_caps_t
+  decoder_init,       // vpx_codec_init_fn_t
+  decoder_destroy,    // vpx_codec_destroy_fn_t
+  decoder_ctrl_maps,  // vpx_codec_ctrl_fn_map_t
+  { // NOLINT
+    decoder_peek_si,    // vpx_codec_peek_si_fn_t
+    decoder_get_si,     // vpx_codec_get_si_fn_t
+    decoder_decode,     // vpx_codec_decode_fn_t
+    decoder_get_frame,  // vpx_codec_frame_get_fn_t
+    decoder_set_fb_fn,  // vpx_codec_set_fb_fn_t
+  },
+  { // NOLINT
+    0,     // Decoder-only interface: every encoder entry here is left empty.
+    NULL,  // vpx_codec_enc_cfg_map_t
+    NULL,  // vpx_codec_encode_fn_t
+    NULL,  // vpx_codec_get_cx_data_fn_t
+    NULL,  // vpx_codec_enc_config_set_fn_t
+    NULL,  // vpx_codec_get_global_headers_fn_t
+    NULL,  // vpx_codec_get_preview_frame_fn_t
+    NULL   // vpx_codec_enc_mr_get_mem_loc_fn_t
+  }
+};
diff --git a/libs/libvpx/vp9/vp9_dx_iface.h b/libs/libvpx/vp9/vp9_dx_iface.h
new file mode 100644
index 0000000000..e0e948e16c
--- /dev/null
+++ b/libs/libvpx/vp9/vp9_dx_iface.h
@@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_VP9_DX_IFACE_H_
+#define VP9_VP9_DX_IFACE_H_
+
+#include "vp9/decoder/vp9_decoder.h"
+
+typedef vpx_codec_stream_info_t vp9_stream_info_t;
+
+// This limit is due to framebuffer numbers.
+// TODO(hkuang): Remove this limit after implementing ondemand framebuffers.
+#define FRAME_CACHE_SIZE 6   // Cache maximum 6 decoded frames.
+
+typedef struct cache_frame {
+  int fb_idx;  // Frame buffer index; becomes last_show_frame on output.
+  vpx_image_t img;  // Image handed to the caller for this cached frame.
+} cache_frame;
+
+struct vpx_codec_alg_priv {
+  vpx_codec_priv_t        base;
+  vpx_codec_dec_cfg_t     cfg;
+  vp9_stream_info_t       si;
+  int                     postproc_cfg_set;  // 1 once VP8_SET_POSTPROC ran.
+  vp8_postproc_cfg_t      postproc_cfg;
+  vpx_decrypt_cb          decrypt_cb;  // Set through VPXD_SET_DECRYPTOR.
+  void                    *decrypt_state;
+  vpx_image_t             img;
+  int                     img_avail;
+  int                     flushed;  // 1 after a NULL, zero-size decode call.
+  int                     invert_tile_order;
+  int                     last_show_frame;  // Index of last output frame.
+  int                     byte_alignment;
+  int                     skip_loop_filter;
+
+  // Frame parallel related.
+  int                     frame_parallel_decode;  // frame-based threading.
+  VPxWorker               *frame_workers;  // NULL until the first decode.
+  int                     num_frame_workers;
+  int                     next_submit_worker_id;
+  int                     last_submit_worker_id;
+  int                     next_output_worker_id;
+  int                     available_threads;
+  cache_frame             frame_cache[FRAME_CACHE_SIZE];
+  int                     frame_cache_write;
+  int                     frame_cache_read;  // Next cache slot to output.
+  int                     num_cache_frames;  // Frames held in frame_cache.
+  int                     need_resync;      // wait for key/intra-only frame
+  // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
+  BufferPool              *buffer_pool;
+
+  // External frame buffer info to save for VP9 common.
+  void *ext_priv;  // Private data associated with the external frame buffers.
+  vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+};
+
+#endif  // VP9_VP9_DX_IFACE_H_
diff --git a/libs/libvpx/vp9/vp9_iface_common.h b/libs/libvpx/vp9/vp9_iface_common.h
new file mode 100644
index 0000000000..938d4224ba
--- /dev/null
+++ b/libs/libvpx/vp9/vp9_iface_common.h
@@ -0,0 +1,136 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP9_VP9_IFACE_COMMON_H_
+#define VP9_VP9_IFACE_COMMON_H_
+
+#include "vpx_ports/mem.h"
+
+static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG  *yv12,
+                            void *user_priv) {
+  // Every field of *img is filled in by hand: vpx_img_wrap() cannot express
+  // independent Y/U/V strides or the other alignment adjustments that a
+  // YV12_BUFFER_CONFIG may carry.
+  const int ss_x = yv12->subsampling_x;
+  const int ss_y = yv12->subsampling_y;
+
+  // Image format and bps follow from the chroma subsampling.
+  if (ss_x && ss_y) {
+    img->fmt = VPX_IMG_FMT_I420;
+    img->bps = 12;
+  } else if (ss_y) {
+    img->fmt = VPX_IMG_FMT_I440;
+    img->bps = 16;
+  } else if (ss_x) {
+    img->fmt = VPX_IMG_FMT_I422;
+    img->bps = 16;
+  } else {
+    img->fmt = VPX_IMG_FMT_I444;
+    img->bps = 24;
+  }
+
+  img->cs = yv12->color_space;
+  img->range = yv12->color_range;
+  img->bit_depth = 8;
+  // w/h come from the luma stride and padded height, d_* from the crop size.
+  img->w = yv12->y_stride;
+  img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
+  img->d_w = yv12->y_crop_width;
+  img->d_h = yv12->y_crop_height;
+  img->r_w = yv12->render_width;
+  img->r_h = yv12->render_height;
+  img->x_chroma_shift = ss_x;
+  img->y_chroma_shift = ss_y;
+  // The planes point at the YV12 buffers directly; there is no alpha plane.
+  img->planes[VPX_PLANE_Y] = yv12->y_buffer;
+  img->stride[VPX_PLANE_Y] = yv12->y_stride;
+  img->planes[VPX_PLANE_U] = yv12->u_buffer;
+  img->stride[VPX_PLANE_U] = yv12->uv_stride;
+  img->planes[VPX_PLANE_V] = yv12->v_buffer;
+  img->stride[VPX_PLANE_V] = yv12->uv_stride;
+  img->planes[VPX_PLANE_ALPHA] = NULL;
+  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // vpx_image_t uses byte strides and a pointer to the first byte of the
+    // image, so the pointers are converted and the strides doubled.
+    img->fmt = (vpx_img_fmt_t)(img->fmt | VPX_IMG_FMT_HIGHBITDEPTH);
+    img->bit_depth = yv12->bit_depth;
+    img->planes[VPX_PLANE_Y] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+    img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride;
+    img->planes[VPX_PLANE_U] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+    img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride;
+    img->planes[VPX_PLANE_V] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+    img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride;
+    img->planes[VPX_PLANE_ALPHA] = NULL;
+    img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  img->user_priv = user_priv;
+  img->img_data = yv12->buffer_alloc;
+  img->img_data_owner = 0;
+  img->self_allocd = 0;
+}
+
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+                                       YV12_BUFFER_CONFIG *yv12) {
+  // Describe the caller's image as a YV12_BUFFER_CONFIG. The plane pointers
+  // are taken over as they are; no pixel data is copied here. The border is
+  // derived from the luma stride and img->w. Always returns VPX_CODEC_OK.
+  const int halve_x = img->x_chroma_shift == 1;
+  const int halve_y = img->y_chroma_shift == 1;
+
+  yv12->y_buffer = img->planes[VPX_PLANE_Y];
+  yv12->u_buffer = img->planes[VPX_PLANE_U];
+  yv12->v_buffer = img->planes[VPX_PLANE_V];
+  yv12->y_stride = img->stride[VPX_PLANE_Y];
+  yv12->uv_stride = img->stride[VPX_PLANE_U];
+
+  // The displayed size is used for both the full and the crop luma extent.
+  yv12->y_width = yv12->y_crop_width = img->d_w;
+  yv12->y_height = yv12->y_crop_height = img->d_h;
+  yv12->render_width = img->r_w;
+  yv12->render_height = img->r_h;
+
+  // Chroma is halved, rounding up, only when the shift is exactly 1.
+  yv12->uv_width = halve_x ? (1 + yv12->y_width) / 2 : yv12->y_width;
+  yv12->uv_height = halve_y ? (1 + yv12->y_height) / 2 : yv12->y_height;
+  yv12->uv_crop_width = yv12->uv_width;
+  yv12->uv_crop_height = yv12->uv_height;
+
+  yv12->color_space = img->cs;
+  yv12->color_range = img->range;
+  yv12->subsampling_x = img->x_chroma_shift;
+  yv12->subsampling_y = img->y_chroma_shift;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    // vpx_image_t: planes are uint8 addresses, strides count uint8s.
+    // YV12_BUFFER_CONFIG: the buffers are uint16 addresses, and stride and
+    // border count uint16s. Converting here means the address calculations
+    // in the main body of the code should work correctly; before any pixel
+    // operation the address still has to be cast to a uint16 pointer and
+    // its value doubled.
+    yv12->flags = YV12_FLAG_HIGHBITDEPTH;
+    yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
+    yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
+    yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
+    yv12->y_stride >>= 1;
+    yv12->uv_stride >>= 1;
+  } else {
+    yv12->flags = 0;
+  }
+  yv12->border  = (yv12->y_stride - img->w) / 2;
+#else
+  yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  return VPX_CODEC_OK;
+}
+
+#endif  // VP9_VP9_IFACE_COMMON_H_
diff --git a/libs/libvpx/vp9/vp9cx.mk b/libs/libvpx/vp9/vp9cx.mk
new file mode 100644
index 0000000000..2930c23ddf
--- /dev/null
+++ b/libs/libvpx/vp9/vp9cx.mk
@@ -0,0 +1,145 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+VP9_CX_EXPORTS += exports_enc
+# Start from the VP9_COMMON_* lists, which the decoder makefile also pulls in.
+VP9_CX_SRCS-yes += $(VP9_COMMON_SRCS-yes)
+VP9_CX_SRCS-no  += $(VP9_COMMON_SRCS-no)
+VP9_CX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes)
+VP9_CX_SRCS_REMOVE-no  += $(VP9_COMMON_SRCS_REMOVE-no)
+
+VP9_CX_SRCS-yes += vp9_cx_iface.c
+
+VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
+VP9_CX_SRCS-yes += encoder/vp9_context_tree.c
+VP9_CX_SRCS-yes += encoder/vp9_context_tree.h
+VP9_CX_SRCS-yes += encoder/vp9_cost.h
+VP9_CX_SRCS-yes += encoder/vp9_cost.c
+VP9_CX_SRCS-yes += encoder/vp9_dct.c
+VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.c
+VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.h
+VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
+VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h
+VP9_CX_SRCS-yes += encoder/vp9_encodemb.c
+VP9_CX_SRCS-yes += encoder/vp9_encodemv.c
+VP9_CX_SRCS-yes += encoder/vp9_ethread.h
+VP9_CX_SRCS-yes += encoder/vp9_ethread.c
+VP9_CX_SRCS-yes += encoder/vp9_extend.c
+VP9_CX_SRCS-yes += encoder/vp9_firstpass.c
+VP9_CX_SRCS-yes += encoder/vp9_block.h
+VP9_CX_SRCS-yes += encoder/vp9_bitstream.h
+VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
+VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
+VP9_CX_SRCS-yes += encoder/vp9_extend.h
+VP9_CX_SRCS-yes += encoder/vp9_firstpass.h
+VP9_CX_SRCS-yes += encoder/vp9_lookahead.c
+VP9_CX_SRCS-yes += encoder/vp9_lookahead.h
+VP9_CX_SRCS-yes += encoder/vp9_mcomp.h
+VP9_CX_SRCS-yes += encoder/vp9_encoder.h
+VP9_CX_SRCS-yes += encoder/vp9_quantize.h
+VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h
+VP9_CX_SRCS-yes += encoder/vp9_rd.h
+VP9_CX_SRCS-yes += encoder/vp9_rdopt.h
+VP9_CX_SRCS-yes += encoder/vp9_pickmode.h
+VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.h
+VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
+VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
+VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
+VP9_CX_SRCS-yes += encoder/vp9_encoder.c
+VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
+VP9_CX_SRCS-yes += encoder/vp9_picklpf.h
+VP9_CX_SRCS-yes += encoder/vp9_quantize.c
+VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c
+VP9_CX_SRCS-yes += encoder/vp9_rd.c
+VP9_CX_SRCS-yes += encoder/vp9_rdopt.c
+VP9_CX_SRCS-yes += encoder/vp9_pickmode.c
+VP9_CX_SRCS-yes += encoder/vp9_segmentation.c
+VP9_CX_SRCS-yes += encoder/vp9_segmentation.h
+VP9_CX_SRCS-yes += encoder/vp9_speed_features.c
+VP9_CX_SRCS-yes += encoder/vp9_speed_features.h
+VP9_CX_SRCS-yes += encoder/vp9_subexp.c
+VP9_CX_SRCS-yes += encoder/vp9_subexp.h
+VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.c
+VP9_CX_SRCS-yes += encoder/vp9_resize.c
+VP9_CX_SRCS-yes += encoder/vp9_resize.h
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c
+# A $(CONFIG_*) suffix (assumed yes/no, as the ifeq tests suggest) picks the list.
+VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
+VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
+VP9_CX_SRCS-yes += encoder/vp9_aq_variance.c
+VP9_CX_SRCS-yes += encoder/vp9_aq_variance.h
+VP9_CX_SRCS-yes += encoder/vp9_aq_360.c
+VP9_CX_SRCS-yes += encoder/vp9_aq_360.h
+VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c
+VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.h
+VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.c
+VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.h
+VP9_CX_SRCS-yes += encoder/vp9_skin_detection.c
+VP9_CX_SRCS-yes += encoder/vp9_skin_detection.h
+VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.c
+VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.h
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
+endif
+VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c
+VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
+VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
+VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
+
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
+endif
+
+ifeq ($(CONFIG_USE_X86INC),yes)
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
+VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
+else
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
+endif
+endif
+# The SSSE3 quantizer assembly is listed only for x86-64 builds with x86inc.
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
+endif
+endif
+
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
+endif
+
+ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
+endif
+
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
+# The NEON DCT and error sources are left out of high bit depth builds.
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
+endif
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
+
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
+# Finally drop every entry named in VP9_CX_SRCS_REMOVE-yes from the list.
+VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/libs/libvpx/vp9/vp9dx.mk b/libs/libvpx/vp9/vp9dx.mk
new file mode 100644
index 0000000000..4c6fd00715
--- /dev/null
+++ b/libs/libvpx/vp9/vp9dx.mk
@@ -0,0 +1,34 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+VP9_DX_EXPORTS += exports_dec
+## Start from the common VP9 source lists and their removal lists.
+VP9_DX_SRCS-yes += $(VP9_COMMON_SRCS-yes)
+VP9_DX_SRCS-no  += $(VP9_COMMON_SRCS-no)
+VP9_DX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes)
+VP9_DX_SRCS_REMOVE-no  += $(VP9_COMMON_SRCS_REMOVE-no)
+## Top-level decoder interface files.
+VP9_DX_SRCS-yes += vp9_dx_iface.c
+VP9_DX_SRCS-yes += vp9_dx_iface.h
+## Sources and headers under decoder/.
+VP9_DX_SRCS-yes += decoder/vp9_decodemv.c
+VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c
+VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
+VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
+VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
+VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
+VP9_DX_SRCS-yes += decoder/vp9_dthread.c
+VP9_DX_SRCS-yes += decoder/vp9_dthread.h
+VP9_DX_SRCS-yes += decoder/vp9_decoder.c
+VP9_DX_SRCS-yes += decoder/vp9_decoder.h
+VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
+VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h
+## Drop every entry named in VP9_DX_SRCS_REMOVE-yes from the final list.
+VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
diff --git a/libs/libvpx/vpx/exports_com b/libs/libvpx/vpx/exports_com
new file mode 100644
index 0000000000..2ab05099f8
--- /dev/null
+++ b/libs/libvpx/vpx/exports_com
@@ -0,0 +1,16 @@
+text vpx_codec_build_config
+text vpx_codec_control_
+text vpx_codec_destroy
+text vpx_codec_err_to_string
+text vpx_codec_error
+text vpx_codec_error_detail
+text vpx_codec_get_caps
+text vpx_codec_iface_name
+text vpx_codec_version
+text vpx_codec_version_extra_str
+text vpx_codec_version_str
+text vpx_img_alloc
+text vpx_img_flip
+text vpx_img_free
+text vpx_img_set_rect
+text vpx_img_wrap
diff --git a/libs/libvpx/vpx/exports_dec b/libs/libvpx/vpx/exports_dec
new file mode 100644
index 0000000000..c694ebae12
--- /dev/null
+++ b/libs/libvpx/vpx/exports_dec
@@ -0,0 +1,8 @@
+text vpx_codec_dec_init_ver
+text vpx_codec_decode
+text vpx_codec_get_frame
+text vpx_codec_get_stream_info
+text vpx_codec_peek_stream_info
+text vpx_codec_register_put_frame_cb
+text vpx_codec_register_put_slice_cb
+text vpx_codec_set_frame_buffer_functions
diff --git a/libs/libvpx/vpx/exports_enc b/libs/libvpx/vpx/exports_enc
new file mode 100644
index 0000000000..e4707ba108
--- /dev/null
+++ b/libs/libvpx/vpx/exports_enc
@@ -0,0 +1,15 @@
+text vpx_codec_enc_config_default
+text vpx_codec_enc_config_set
+text vpx_codec_enc_init_multi_ver
+text vpx_codec_enc_init_ver
+text vpx_codec_encode
+text vpx_codec_get_cx_data
+text vpx_codec_get_global_headers
+text vpx_codec_get_preview_frame
+text vpx_codec_set_cx_data_buf
+text vpx_svc_dump_statistics
+text vpx_svc_encode
+text vpx_svc_get_message
+text vpx_svc_init
+text vpx_svc_release
+text vpx_svc_set_options
diff --git a/libs/libvpx/vpx/internal/vpx_codec_internal.h b/libs/libvpx/vpx/internal/vpx_codec_internal.h
new file mode 100644
index 0000000000..7380fcc7e2
--- /dev/null
+++ b/libs/libvpx/vpx/internal/vpx_codec_internal.h
@@ -0,0 +1,445 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*!\file
+ * \brief Describes the decoder algorithm interface for algorithm
+ *        implementations.
+ *
+ * This file defines the private structures and data types that are only
+ * relevant to implementing an algorithm, as opposed to using it.
+ *
+ * To create a decoder algorithm class, an interface structure is put
+ * into the global namespace:
+ *     <pre>
+ *     my_codec.c:
+ *       vpx_codec_iface_t my_codec = {
+ *           "My Codec v1.0",
+ *           VPX_CODEC_ALG_ABI_VERSION,
+ *           ...
+ *       };
+ *     </pre>
+ *
+ * An application instantiates a specific decoder instance by using
+ * vpx_codec_init() and a pointer to the algorithm's interface structure:
+ *     <pre>
+ *     my_app.c:
+ *       extern vpx_codec_iface_t my_codec;
+ *       {
+ *           vpx_codec_ctx_t algo;
+ *           res = vpx_codec_init(&algo, &my_codec);
+ *       }
+ *     </pre>
+ *
+ * Once initialized, the instance is managed using other functions from
+ * the vpx_codec_* family.
+ */
+#ifndef VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
+#define VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
+#include "../vpx_decoder.h"
+#include "../vpx_encoder.h"
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped.  Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define VPX_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/
+
+typedef struct vpx_codec_alg_priv  vpx_codec_alg_priv_t;
+typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t;
+
+/*!\brief init function pointer prototype
+ *
+ * Performs algorithm-specific initialization of the decoder context. This
+ * function is called by the generic vpx_codec_init() wrapper function, so
+ * plugins implementing this interface may trust the input parameters to be
+ * properly initialized.
+ *
+ * \param[in] ctx   Pointer to this instance's context
+ * \retval #VPX_CODEC_OK
+ *     The input stream was recognized and decoder initialized.
+ * \retval #VPX_CODEC_MEM_ERROR
+ *     Memory operation failed.
+ */
+typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx,
+                                               vpx_codec_priv_enc_mr_cfg_t *data);
+
+/*!\brief destroy function pointer prototype
+ *
+ * Performs algorithm-specific destruction of the decoder context. This
+ * function is called by the generic vpx_codec_destroy() wrapper function,
+ * so plugins implementing this interface may trust the input parameters
+ * to be properly initialized.
+ *
+ * \param[in] ctx   Pointer to this instance's context
+ * \retval #VPX_CODEC_OK
+ *     The input stream was recognized and decoder initialized.
+ * \retval #VPX_CODEC_MEM_ERROR
+ *     Memory operation failed.
+ */
+typedef vpx_codec_err_t (*vpx_codec_destroy_fn_t)(vpx_codec_alg_priv_t *ctx);
+
+/*!\brief parse stream info function pointer prototype
+ *
+ * Performs high level parsing of the bitstream. This function is called by the
+ * generic vpx_codec_peek_stream_info() wrapper function, so plugins
+ * implementing this interface may trust the input parameters to be properly
+ * initialized.
+ *
+ * \param[in]      data    Pointer to a block of data to parse
+ * \param[in]      data_sz Size of the data buffer
+ * \param[in,out]  si      Pointer to stream info to update. The size member
+ *                         \ref MUST be properly initialized, but \ref MAY be
+ *                         clobbered by the algorithm. This parameter \ref MAY
+ *                         be NULL.
+ *
+ * \retval #VPX_CODEC_OK
+ *     Bitstream is parsable and stream information updated
+ */
+typedef vpx_codec_err_t (*vpx_codec_peek_si_fn_t)(const uint8_t         *data,
+                                                  unsigned int           data_sz,
+                                                  vpx_codec_stream_info_t *si);
+
+/*!\brief Return information about the current stream.
+ *
+ * Returns information about the stream that has been parsed during decoding.
+ *
+ * \param[in]      ctx     Pointer to this instance's context
+ * \param[in,out]  si      Pointer to stream info to update. The size member
+ *                         \ref MUST be properly initialized, but \ref MAY be
+ *                         clobbered by the algorithm. This parameter \ref MAY
+ *                         be NULL.
+ *
+ * \retval #VPX_CODEC_OK
+ *     Bitstream is parsable and stream information updated
+ */
+typedef vpx_codec_err_t (*vpx_codec_get_si_fn_t)(vpx_codec_alg_priv_t    *ctx,
+                                                 vpx_codec_stream_info_t *si);
+
+/*!\brief control function pointer prototype
+ *
+ * This function is used to exchange algorithm specific data with the decoder
+ * instance. This can be used to implement features specific to a particular
+ * algorithm.
+ *
+ * This function is called by the generic vpx_codec_control() wrapper
+ * function, so plugins implementing this interface may trust the input
+ * parameters to be properly initialized. However,  this interface does not
+ * provide type safety for the exchanged data or assign meanings to the
+ * control codes. Those details should be specified in the algorithm's
+ * header file. In particular, the ctrl_id parameter is guaranteed to exist
+ * in the algorithm's control mapping table, and the data parameter may be NULL.
+ *
+ *
+ * \param[in]     ctx              Pointer to this instance's context
+ * \param[in]     ctrl_id          Algorithm specific control identifier
+ * \param[in,out] data             Data to exchange with algorithm instance.
+ *
+ * \retval #VPX_CODEC_OK
+ *     The internal state data was deserialized.
+ */
+typedef vpx_codec_err_t (*vpx_codec_control_fn_t)(vpx_codec_alg_priv_t *ctx,
+                                                  va_list ap);
+
+/*!\brief control function pointer mapping
+ *
+ * This structure stores the mapping between control identifiers and
+ * implementing functions. Each algorithm provides a list of these
+ * mappings. This list is searched by the vpx_codec_control() wrapper
+ * function to determine which function to invoke. The special
+ * value {0, NULL} is used to indicate end-of-list, and must be
+ * present. The special value {0, <non-null>} can be used as a catch-all
+ * mapping. This implies that ctrl_id values chosen by the algorithm
+ * \ref MUST be non-zero.
+ */
+typedef const struct vpx_codec_ctrl_fn_map {
+  int ctrl_id;
+  vpx_codec_control_fn_t fn;
+} vpx_codec_ctrl_fn_map_t;
+
+/*!\brief decode data function pointer prototype
+ *
+ * Processes a buffer of coded data. If the processing results in a new
+ * decoded frame becoming available, #VPX_CODEC_CB_PUT_SLICE and
+ * #VPX_CODEC_CB_PUT_FRAME events are generated as appropriate. This
+ * function is called by the generic vpx_codec_decode() wrapper function,
+ * so plugins implementing this interface may trust the input parameters
+ * to be properly initialized.
+ *
+ * \param[in] ctx          Pointer to this instance's context
+ * \param[in] data         Pointer to this block of new coded data. If
+ *                         NULL, a #VPX_CODEC_CB_PUT_FRAME event is posted
+ *                         for the previously decoded frame.
+ * \param[in] data_sz      Size of the coded data, in bytes.
+ *
+ * \return Returns #VPX_CODEC_OK if the coded data was processed completely
+ *         and future pictures can be decoded without error. Otherwise,
+ *         see the descriptions of the other error codes in ::vpx_codec_err_t
+ *         for recoverability capabilities.
+ */
+typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t  *ctx,
+                                                 const uint8_t         *data,
+                                                 unsigned int     data_sz,
+                                                 void        *user_priv,
+                                                 long         deadline);
+
+/*!\brief Decoded frames iterator
+ *
+ * Iterates over a list of the frames available for display. The iterator
+ * storage should be initialized to NULL to start the iteration. Iteration is
+ * complete when this function returns NULL.
+ *
+ * The list of available frames becomes valid upon completion of the
+ * vpx_codec_decode call, and remains valid until the next call to vpx_codec_decode.
+ *
+ * \param[in]     ctx      Pointer to this instance's context
+ * \param[in,out] iter     Iterator storage, initialized to NULL
+ *
+ * \return Returns a pointer to an image, if one is ready for display. Frames
+ *         produced will always be in PTS (presentation time stamp) order.
+ */
+typedef vpx_image_t *(*vpx_codec_get_frame_fn_t)(vpx_codec_alg_priv_t *ctx,
+                                                 vpx_codec_iter_t     *iter);
+
+/*!\brief Pass in external frame buffers for the decoder to use.
+ *
+ * Registers functions to be called when libvpx needs a frame buffer
+ * to decode the current frame and a function to be called when libvpx does
+ * not internally reference the frame buffer. This set function must
+ * be called before the first call to decode or libvpx will assume the
+ * default behavior of allocating frame buffers internally.
+ *
+ * \param[in] ctx          Pointer to this instance's context
+ * \param[in] cb_get       Pointer to the get callback function
+ * \param[in] cb_release   Pointer to the release callback function
+ * \param[in] cb_priv      Callback's private data
+ *
+ * \retval #VPX_CODEC_OK
+ *     External frame buffers will be used by libvpx.
+ * \retval #VPX_CODEC_INVALID_PARAM
+ *     One or more of the callbacks were NULL.
+ * \retval #VPX_CODEC_ERROR
+ *     Decoder context not initialized, or algorithm not capable of
+ *     using external frame buffers.
+ *
+ * \note
+ * When decoding VP9, the application may be required to pass in at least
+ * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame
+ * buffers.
+ */
+typedef vpx_codec_err_t (*vpx_codec_set_fb_fn_t)(
+    vpx_codec_alg_priv_t *ctx,
+    vpx_get_frame_buffer_cb_fn_t cb_get,
+    vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
+
+
+typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t  *ctx,
+                                                 const vpx_image_t     *img,
+                                                 vpx_codec_pts_t        pts,
+                                                 unsigned long          duration,
+                                                 vpx_enc_frame_flags_t  flags,
+                                                 unsigned long          deadline);
+typedef const vpx_codec_cx_pkt_t *(*vpx_codec_get_cx_data_fn_t)(vpx_codec_alg_priv_t *ctx,
+                                                                vpx_codec_iter_t     *iter);
+
+typedef vpx_codec_err_t
+(*vpx_codec_enc_config_set_fn_t)(vpx_codec_alg_priv_t       *ctx,
+                                 const vpx_codec_enc_cfg_t  *cfg);
+typedef vpx_fixed_buf_t *
+(*vpx_codec_get_global_headers_fn_t)(vpx_codec_alg_priv_t   *ctx);
+
+typedef vpx_image_t *
+(*vpx_codec_get_preview_frame_fn_t)(vpx_codec_alg_priv_t   *ctx);
+
+typedef vpx_codec_err_t
+(*vpx_codec_enc_mr_get_mem_loc_fn_t)(const vpx_codec_enc_cfg_t     *cfg,
+                                     void **mem_loc);
+
+/*!\brief usage configuration mapping
+ *
+ * This structure stores the mapping between usage identifiers and
+ * configuration structures. Each algorithm provides a list of these
+ * mappings. This list is searched by the vpx_codec_enc_config_default()
+ * wrapper function to determine which config to return. The special value
+ * {-1, {0}} is used to indicate end-of-list, and must be present. At least
+ * one mapping must be present, in addition to the end-of-list.
+ *
+ */
+typedef const struct vpx_codec_enc_cfg_map {
+  int                 usage;
+  vpx_codec_enc_cfg_t cfg;
+} vpx_codec_enc_cfg_map_t;
+
+/*!\brief Decoder algorithm interface
+ *
+ * All decoders \ref MUST expose a variable of this type.
+ */
+struct vpx_codec_iface {
+  const char               *name;        /**< Identification String  */
+  int                       abi_version; /**< Implemented ABI version */
+  vpx_codec_caps_t          caps;    /**< Decoder capabilities */
+  vpx_codec_init_fn_t       init;    /**< \copydoc ::vpx_codec_init_fn_t */
+  vpx_codec_destroy_fn_t    destroy;     /**< \copydoc ::vpx_codec_destroy_fn_t */
+  vpx_codec_ctrl_fn_map_t  *ctrl_maps;   /**< \copydoc ::vpx_codec_ctrl_fn_map_t */
+  struct vpx_codec_dec_iface {
+    vpx_codec_peek_si_fn_t    peek_si;     /**< \copydoc ::vpx_codec_peek_si_fn_t */
+    vpx_codec_get_si_fn_t     get_si;      /**< \copydoc ::vpx_codec_get_si_fn_t */
+    vpx_codec_decode_fn_t     decode;      /**< \copydoc ::vpx_codec_decode_fn_t */
+    vpx_codec_get_frame_fn_t  get_frame;   /**< \copydoc ::vpx_codec_get_frame_fn_t */
+    vpx_codec_set_fb_fn_t     set_fb_fn;   /**< \copydoc ::vpx_codec_set_fb_fn_t */
+  } dec;
+  struct vpx_codec_enc_iface {
+    int                                cfg_map_count;
+    vpx_codec_enc_cfg_map_t           *cfg_maps;      /**< \copydoc ::vpx_codec_enc_cfg_map_t */
+    vpx_codec_encode_fn_t              encode;        /**< \copydoc ::vpx_codec_encode_fn_t */
+    vpx_codec_get_cx_data_fn_t         get_cx_data;   /**< \copydoc ::vpx_codec_get_cx_data_fn_t */
+    vpx_codec_enc_config_set_fn_t      cfg_set;       /**< \copydoc ::vpx_codec_enc_config_set_fn_t */
+    vpx_codec_get_global_headers_fn_t  get_glob_hdrs; /**< \copydoc ::vpx_codec_get_global_headers_fn_t */
+    vpx_codec_get_preview_frame_fn_t   get_preview;   /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */
+    vpx_codec_enc_mr_get_mem_loc_fn_t  mr_get_mem_loc;   /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */
+  } enc;
+};
+
+/*!\brief Callback function pointer / user data pair storage */
+typedef struct vpx_codec_priv_cb_pair {
+  union {
+    vpx_codec_put_frame_cb_fn_t    put_frame;
+    vpx_codec_put_slice_cb_fn_t    put_slice;
+  } u;
+  void                            *user_priv;
+} vpx_codec_priv_cb_pair_t;
+
+
+/*!\brief Instance private storage
+ *
+ * This structure is allocated by the algorithm's init function. It can be
+ * extended in one of two ways. First, a second, algorithm specific structure
+ * can be allocated and the priv member pointed to it. Alternatively, this
+ * structure can be made the first member of the algorithm specific structure,
+ * and the pointer cast to the proper type.
+ */
+struct vpx_codec_priv {
+  const char                     *err_detail;
+  vpx_codec_flags_t               init_flags;
+  struct {
+    vpx_codec_priv_cb_pair_t    put_frame_cb;
+    vpx_codec_priv_cb_pair_t    put_slice_cb;
+  } dec;
+  struct {
+    vpx_fixed_buf_t             cx_data_dst_buf;
+    unsigned int                cx_data_pad_before;
+    unsigned int                cx_data_pad_after;
+    vpx_codec_cx_pkt_t          cx_data_pkt;
+    unsigned int                total_encoders;
+  } enc;
+};
+
+/*
+ * Multi-resolution encoding internal configuration
+ */
+struct vpx_codec_priv_enc_mr_cfg
+{
+    unsigned int           mr_total_resolutions;
+    unsigned int           mr_encoder_id;
+    struct vpx_rational    mr_down_sampling_factor;
+    void*                  mr_low_res_mode_info;
+};
+
+#undef VPX_CTRL_USE_TYPE
+#define VPX_CTRL_USE_TYPE(id, typ) \
+  static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);}
+
+#undef VPX_CTRL_USE_TYPE_DEPRECATED
+#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \
+  static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);}
+
+#define CAST(id, arg) id##__value(arg)
+
+/* CODEC_INTERFACE convenience macro
+ *
+ * By convention, each codec interface is a struct with extern linkage, where
+ * the symbol is suffixed with _algo. A getter function is also defined to
+ * return a pointer to the struct, since in some cases it's easier to work
+ * with text symbols than data symbols (see issue #169). This function has
+ * the same name as the struct, less the _algo suffix. The CODEC_INTERFACE
+ * macro is provided to define this getter function automatically.
+ */
+#define CODEC_INTERFACE(id)\
+  vpx_codec_iface_t* id(void) { return &id##_algo; }\
+  vpx_codec_iface_t  id##_algo
+
+
+/* Internal Utility Functions
+ *
+ * The following functions are intended to be used inside algorithms as
+ * utilities for manipulating vpx_codec_* data structures.
+ */
+struct vpx_codec_pkt_list {
+  unsigned int            cnt;
+  unsigned int            max;
+  struct vpx_codec_cx_pkt pkts[1];
+};
+
+#define vpx_codec_pkt_list_decl(n)\
+  union {struct vpx_codec_pkt_list head;\
+    struct {struct vpx_codec_pkt_list head;\
+      struct vpx_codec_cx_pkt    pkts[n];} alloc;}
+
+#define vpx_codec_pkt_list_init(m)\
+  ((m)->alloc.head.cnt = 0, /* parenthesized: expands to one expression */\
+   (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0]))
+
+int
+vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *,
+                       const struct vpx_codec_cx_pkt *);
+
+const vpx_codec_cx_pkt_t *
+vpx_codec_pkt_list_get(struct vpx_codec_pkt_list *list,
+                       vpx_codec_iter_t           *iter);
+
+
+#include <stdio.h>
+#include <setjmp.h>
+
+struct vpx_internal_error_info {
+  vpx_codec_err_t  error_code;
+  int              has_detail;
+  char             detail[80];
+  int              setjmp;
+  jmp_buf          jmp;
+};
+
+#define CLANG_ANALYZER_NORETURN
+#if defined(__has_feature)
+#if __has_feature(attribute_analyzer_noreturn)
+#undef CLANG_ANALYZER_NORETURN
+#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
+#endif
+#endif
+
+void vpx_internal_error(struct vpx_internal_error_info *info,
+                        vpx_codec_err_t                 error,
+                        const char                     *fmt,
+                        ...) CLANG_ANALYZER_NORETURN;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
diff --git a/libs/libvpx/vpx/internal/vpx_psnr.h b/libs/libvpx/vpx/internal/vpx_psnr.h
new file mode 100644
index 0000000000..07d81bb8d9
--- /dev/null
+++ b/libs/libvpx/vpx/internal/vpx_psnr.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_INTERNAL_VPX_PSNR_H_
+#define VPX_INTERNAL_VPX_PSNR_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t
+
+/*!\brief Converts SSE to PSNR
+ *
+ * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
+ *
+ * \param[in]    samples       Number of samples
+ * \param[in]    peak          Max sample value
+ * \param[in]    sse           Sum of squared errors
+ */
+double vpx_sse_to_psnr(double samples, double peak, double sse);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_INTERNAL_VPX_PSNR_H_
diff --git a/libs/libvpx/vpx/src/svc_encodeframe.c b/libs/libvpx/vpx/src/svc_encodeframe.c
new file mode 100644
index 0000000000..628afca31e
--- /dev/null
+++ b/libs/libvpx/vpx/src/svc_encodeframe.c
@@ -0,0 +1,693 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/**
+ * @file
+ * VP9 SVC encoding support via libvpx
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define VPX_DISABLE_CTRL_TYPECHECKS 1
+#include "./vpx_config.h"
+#include "vpx/svc_context.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#ifdef __MINGW32__
+#define strtok_r strtok_s
+#ifndef MINGW_HAS_SECURE_API
+// proto from /usr/x86_64-w64-mingw32/include/sec_api/string_s.h
+_CRTIMP char *__cdecl strtok_s(char *str, const char *delim, char **context);
+#endif  /* MINGW_HAS_SECURE_API */
+#endif  /* __MINGW32__ */
+
+#ifdef _MSC_VER
+#define strdup _strdup
+#define strtok_r strtok_s
+#endif
+
+#define SVC_REFERENCE_FRAMES 8
+#define SUPERFRAME_SLOTS (8)
+#define SUPERFRAME_BUFFER_SIZE (SUPERFRAME_SLOTS * sizeof(uint32_t) + 2)
+
+#define MAX_QUANTIZER 63
+
+static const int DEFAULT_SCALE_FACTORS_NUM[VPX_SS_MAX_LAYERS] = {
+  4, 5, 7, 11, 16
+};
+
+static const int DEFAULT_SCALE_FACTORS_DEN[VPX_SS_MAX_LAYERS] = {
+  16, 16, 16, 16, 16
+};
+
+typedef enum {
+  QUANTIZER = 0,
+  BITRATE,
+  SCALE_FACTOR,
+  AUTO_ALT_REF,
+  ALL_OPTION_TYPES
+} LAYER_OPTION_TYPE;
+
+static const int option_max_values[ALL_OPTION_TYPES] = {
+  63, INT_MAX, INT_MAX, 1
+};
+
+static const int option_min_values[ALL_OPTION_TYPES] = {
+  0, 0, 1, 0
+};
+
+// One encoded frame
+typedef struct FrameData {
+  void                     *buf;    // compressed data buffer
+  size_t                    size;  // length of compressed data
+  vpx_codec_frame_flags_t   flags;    /**< flags for this frame */
+  struct FrameData         *next;
+} FrameData;
+
+// Returns svc_ctx's internal state, creating it zero-filled on first use.
+static SvcInternal_t *get_svc_internal(SvcContext *svc_ctx) {
+  SvcInternal_t *state;
+  if (svc_ctx == NULL) return NULL;
+  if (svc_ctx->internal != NULL) return (SvcInternal_t *)svc_ctx->internal;
+
+  state = (SvcInternal_t *)malloc(sizeof(*state));
+  if (state != NULL) memset(state, 0, sizeof(*state));
+  svc_ctx->internal = state;  // stays NULL when malloc() failed
+  return state;
+}
+
+// Read-only accessor: never allocates, so the result may be NULL.
+static const SvcInternal_t *get_const_svc_internal(
+    const SvcContext *svc_ctx) {
+  return svc_ctx == NULL ? NULL : (const SvcInternal_t *)svc_ctx->internal;
+}
+
+// Empties the accumulated log text; a no-op if no internal state exists yet.
+static void svc_log_reset(SvcContext *svc_ctx) {
+  SvcInternal_t *const si = (SvcInternal_t *)svc_ctx->internal;
+  if (si != NULL) si->message_buffer[0] = '\0';
+}
+
+// Formats a message, then prints it or appends it to si->message_buffer.
+// Returns vsnprintf()'s result, or 0 when nothing was logged.
+static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level,
+                   const char *fmt, ...) {
+  char buf[512];
+  int retval = 0;
+  va_list ap;
+  SvcInternal_t *const si = get_svc_internal(svc_ctx);
+
+  // si is NULL if svc_ctx is NULL or allocating its state failed.
+  if (si == NULL || level > svc_ctx->log_level) return retval;
+
+  va_start(ap, fmt);
+  retval = vsnprintf(buf, sizeof(buf), fmt, ap);
+  va_end(ap);
+  if (svc_ctx->log_print) {
+    printf("%s", buf);
+  } else {
+    strncat(si->message_buffer, buf,
+            sizeof(si->message_buffer) - strlen(si->message_buffer) - 1);
+  }
+  // A freshly zeroed SvcInternal_t has no codec_ctx to report the error on.
+  if (level == SVC_LOG_ERROR && si->codec_ctx != NULL) {
+    si->codec_ctx->err_detail = si->message_buffer;
+  }
+  return retval;
+}
+
+// Parses one layer value from |input|: "<num>/<den>" for SCALE_FACTOR (stored
+// in *value0 / *value1), otherwise a single integer stored in *value0.
+static vpx_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input,
+                                      int *value0, int *value1) {
+  const int lo = option_min_values[type];
+  const int hi = option_max_values[type];
+  char *rest = input;
+
+  if (type != SCALE_FACTOR) {
+    *value0 = atoi(input);
+    if (*value0 < lo || *value0 > hi) return VPX_CODEC_INVALID_PARAM;
+    return VPX_CODEC_OK;
+  }
+
+  *value0 = strtol(rest, &rest, 10);
+  if (*rest++ != '/') return VPX_CODEC_INVALID_PARAM;
+  *value1 = strtol(rest, &rest, 10);
+
+  // Both parts must be in range, and num must not exceed den.
+  if (*value0 < lo || *value1 < lo || *value0 > hi || *value1 > hi ||
+      *value0 > *value1)
+    return VPX_CODEC_INVALID_PARAM;
+  return VPX_CODEC_OK;
+}
+
+// Parses the comma-separated |input| into option0[] (plus option1[] for
+// SCALE_FACTOR), one value per spatial layer. Fails with INVALID_PARAM on a
+// bad or missing value and with MEM_ERROR if |input| cannot be duplicated.
+static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx,
+                                                       LAYER_OPTION_TYPE type,
+                                                       const char *input,
+                                                       int *option0,
+                                                       int *option1) {
+  int i;
+  vpx_codec_err_t res = VPX_CODEC_OK;
+  char *input_string;
+  char *token, *save_ptr;
+  const char *delim = ",";
+
+  if (input == NULL || option0 == NULL ||
+      (option1 == NULL && type == SCALE_FACTOR))
+    return VPX_CODEC_INVALID_PARAM;
+
+  input_string = strdup(input);
+  if (input_string == NULL) return VPX_CODEC_MEM_ERROR;
+  token = strtok_r(input_string, delim, &save_ptr);
+  for (i = 0; token != NULL && i < svc_ctx->spatial_layers; ++i) {
+    // option1 is NULL for single-valued types; don't offset a NULL pointer.
+    res = extract_option(type, token, option0 + i,
+                         option1 != NULL ? option1 + i : NULL);
+    if (res != VPX_CODEC_OK) break;
+    token = strtok_r(NULL, delim, &save_ptr);
+  }
+  if (res == VPX_CODEC_OK && i != svc_ctx->spatial_layers) {
+    svc_log(svc_ctx, SVC_LOG_ERROR,
+            "svc: layer params type: %d    %d values required, "
+            "but only %d specified\n", type, svc_ctx->spatial_layers, i);
+    res = VPX_CODEC_INVALID_PARAM;
+  }
+  free(input_string);
+  return res;
+}
+
+/**
+ * Parse SVC encoding options
+ * Format: space-separated <name>=<value> pairs. Recognized names:
+ *         spatial-layers, temporal-layers, multi-frame-contexts (integers),
+ *         scale-factors=<n1>/<d1>,<n2>/<d2>,... and the per-layer lists
+ *         max-quantizers, min-quantizers, auto-alt-refs, bitrates=<v1>,<v2>,...
+ */
+static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
+  char *input_string;
+  char *option_name;
+  char *option_value;
+  char *input_ptr;
+  SvcInternal_t *const si = get_svc_internal(svc_ctx);
+  vpx_codec_err_t res = VPX_CODEC_OK;
+  int i, alt_ref_enabled = 0;
+
+  if (options == NULL) return VPX_CODEC_OK;
+  // si is NULL when svc_ctx is NULL or its internal state failed to allocate.
+  if (svc_ctx == NULL || si == NULL) return VPX_CODEC_INVALID_PARAM;
+  input_string = strdup(options);
+  if (input_string == NULL) return VPX_CODEC_MEM_ERROR;
+
+  // parse option name
+  option_name = strtok_r(input_string, "=", &input_ptr);
+  while (option_name != NULL) {
+    // parse option value
+    option_value = strtok_r(NULL, " ", &input_ptr);
+    if (option_value == NULL) {
+      svc_log(svc_ctx, SVC_LOG_ERROR, "option missing value: %s\n",
+              option_name);
+      res = VPX_CODEC_INVALID_PARAM;
+      break;
+    }
+    if (strcmp("spatial-layers", option_name) == 0) {
+      svc_ctx->spatial_layers = atoi(option_value);
+    } else if (strcmp("temporal-layers", option_name) == 0) {
+      svc_ctx->temporal_layers = atoi(option_value);
+    } else if (strcmp("scale-factors", option_name) == 0) {
+      res = parse_layer_options_from_string(svc_ctx, SCALE_FACTOR, option_value,
+                                            si->svc_params.scaling_factor_num,
+                                            si->svc_params.scaling_factor_den);
+    } else if (strcmp("max-quantizers", option_name) == 0) {
+      res = parse_layer_options_from_string(svc_ctx, QUANTIZER, option_value,
+                                            si->svc_params.max_quantizers,
+                                            NULL);
+    } else if (strcmp("min-quantizers", option_name) == 0) {
+      res = parse_layer_options_from_string(svc_ctx, QUANTIZER, option_value,
+                                            si->svc_params.min_quantizers,
+                                            NULL);
+    } else if (strcmp("auto-alt-refs", option_name) == 0) {
+      res = parse_layer_options_from_string(svc_ctx, AUTO_ALT_REF, option_value,
+                                            si->enable_auto_alt_ref, NULL);
+    } else if (strcmp("bitrates", option_name) == 0) {
+      res = parse_layer_options_from_string(svc_ctx, BITRATE, option_value,
+                                            si->bitrates, NULL);
+    } else if (strcmp("multi-frame-contexts", option_name) == 0) {
+      si->use_multiple_frame_contexts = atoi(option_value);
+    } else {
+      svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name);
+      res = VPX_CODEC_INVALID_PARAM;
+    }
+    // Stop at the first option that failed to parse or was not recognized.
+    if (res != VPX_CODEC_OK) break;
+    option_name = strtok_r(NULL, "=", &input_ptr);
+  }
+  free(input_string);
+
+  // Validate the per-layer quantizer ranges.
+  for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+    if (si->svc_params.max_quantizers[i] > MAX_QUANTIZER ||
+        si->svc_params.max_quantizers[i] < 0 ||
+        si->svc_params.min_quantizers[i] > si->svc_params.max_quantizers[i] ||
+        si->svc_params.min_quantizers[i] < 0)
+      res = VPX_CODEC_INVALID_PARAM;
+  }
+
+  if (si->use_multiple_frame_contexts &&
+      (svc_ctx->spatial_layers > 3 ||
+       svc_ctx->spatial_layers * svc_ctx->temporal_layers > 4))
+    res = VPX_CODEC_INVALID_PARAM;
+
+  for (i = 0; i < svc_ctx->spatial_layers; ++i)
+    alt_ref_enabled += si->enable_auto_alt_ref[i];
+  if (alt_ref_enabled > REF_FRAMES - svc_ctx->spatial_layers) {
+    svc_log(svc_ctx, SVC_LOG_ERROR,
+            "svc: auto alt ref: Maximum %d(REF_FRAMES - layers) layers could "
+            "enable auto alt reference frame, but %d layers are enabled\n",
+            REF_FRAMES - svc_ctx->spatial_layers, alt_ref_enabled);
+    res = VPX_CODEC_INVALID_PARAM;
+  }
+
+  return res;
+}
+
+// Saves a copy of |options| in the SVC internals (parsed later by
+// vpx_svc_init); the copy is truncated to fit and always NUL-terminated.
+vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
+  SvcInternal_t *const internal = get_svc_internal(svc_ctx);
+  const size_t capacity = sizeof(internal->options);
+  if (!svc_ctx || !options || !internal) return VPX_CODEC_INVALID_PARAM;
+  strncpy(internal->options, options, capacity);
+  internal->options[capacity - 1] = '\0';
+  return VPX_CODEC_OK;
+}
+
+// Fills enc_cfg's per-layer target bitrates, either from the explicit
+// "bitrates" option (si->bitrates) or by splitting enc_cfg->rc_target_bitrate.
+void assign_layer_bitrates(const SvcContext *svc_ctx,
+                           vpx_codec_enc_cfg_t *const enc_cfg) {
+  const SvcInternal_t *const si = get_const_svc_internal(svc_ctx);
+  int i, sl, tl, spatial_layer_target;
+
+  if (svc_ctx->temporal_layering_mode != 0) {
+    if (si->bitrates[0] != 0) {
+      enc_cfg->rc_target_bitrate = 0;
+      for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
+        enc_cfg->ss_target_bitrate[sl*svc_ctx->temporal_layers] = 0;
+        for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
+          enc_cfg->ss_target_bitrate[sl*svc_ctx->temporal_layers]
+              += (unsigned int)si->bitrates[sl * svc_ctx->temporal_layers + tl];
+          enc_cfg->layer_target_bitrate[sl*svc_ctx->temporal_layers + tl]
+              = si->bitrates[sl * svc_ctx->temporal_layers + tl];
+        }
+      }
+    } else {
+      float total = 0;
+      float alloc_ratio[VPX_MAX_LAYERS] = {0};
+      for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
+        if (si->svc_params.scaling_factor_den[sl] > 0) {
+          alloc_ratio[sl] = (float)( (sl+1) );
+          total += alloc_ratio[sl];
+        }
+      }
+      // A zero total (no positive scale-factor denominator) gives a 0 target.
+      for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
+        enc_cfg->ss_target_bitrate[sl] = spatial_layer_target = total <= 0 ? 0 :
+            (unsigned int)(enc_cfg->rc_target_bitrate *
+                alloc_ratio[sl] / total);
+        if (svc_ctx->temporal_layering_mode == 3) {
+          enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers] =
+              (spatial_layer_target*6)/10;  // 60%
+          enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 1] =
+              (spatial_layer_target*8)/10;  // 80%
+          enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 2] =
+              spatial_layer_target;
+        } else if (svc_ctx->temporal_layering_mode == 2 ||
+                   svc_ctx->temporal_layering_mode == 1) {
+          enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers] =
+              spatial_layer_target * 2 / 3;
+          enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 1] =
+              spatial_layer_target;
+        } else {
+          // User should explicitly assign bitrates in this case.
+          assert(0);
+        }
+      }
+    }
+  } else {
+    if (si->bitrates[0] != 0) {
+      enc_cfg->rc_target_bitrate = 0;
+      for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+        enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i];
+        enc_cfg->rc_target_bitrate += si->bitrates[i];
+      }
+    } else {
+      float total = 0;
+      float alloc_ratio[VPX_MAX_LAYERS] = {0};
+
+      for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+        if (si->svc_params.scaling_factor_den[i] > 0) {
+          alloc_ratio[i] = (float)(si->svc_params.scaling_factor_num[i] * 1.0 /
+                                   si->svc_params.scaling_factor_den[i]);
+
+          alloc_ratio[i] *= alloc_ratio[i];
+          total += alloc_ratio[i];
+        }
+      }
+      for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
+        if (total > 0) {
+          enc_cfg->layer_target_bitrate[i] = (unsigned int)
+              (enc_cfg->rc_target_bitrate * alloc_ratio[i] / total);
+        }
+      }
+    }
+  }
+}
+
+// Initializes SVC encoding: applies defaults plus the stored option string,
+// derives layer counts and bitrates, adjusts |enc_cfg| and finally initializes
+// |codec_ctx| with |iface|. Returns VPX_CODEC_OK or the first error hit.
+vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
+                             vpx_codec_iface_t *iface,
+                             vpx_codec_enc_cfg_t *enc_cfg) {
+  vpx_codec_err_t res;
+  int i, sl, tl;
+  SvcInternal_t *const si = get_svc_internal(svc_ctx);
+  if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL ||
+      enc_cfg == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  if (si == NULL) return VPX_CODEC_MEM_ERROR;
+
+  si->codec_ctx = codec_ctx;
+
+  si->width = enc_cfg->g_w;
+  si->height = enc_cfg->g_h;
+
+// wonkap: why is this necessary?
+  /*if (enc_cfg->kf_max_dist < 2) {
+    svc_log(svc_ctx, SVC_LOG_ERROR, "key frame distance too small: %d\n",
+            enc_cfg->kf_max_dist);
+    return VPX_CODEC_INVALID_PARAM;
+  }*/
+
+  si->kf_dist = enc_cfg->kf_max_dist;
+
+  if (svc_ctx->spatial_layers == 0)
+    svc_ctx->spatial_layers = VPX_SS_DEFAULT_LAYERS;
+  if (svc_ctx->spatial_layers < 1 ||
+      svc_ctx->spatial_layers > VPX_SS_MAX_LAYERS) {
+    svc_log(svc_ctx, SVC_LOG_ERROR, "spatial layers: invalid value: %d\n",
+            svc_ctx->spatial_layers);
+    return VPX_CODEC_INVALID_PARAM;
+  }
+
+  // Note: temporal_layering_mode only applies to one-pass CBR
+  // si->svc_params.temporal_layering_mode = svc_ctx->temporal_layering_mode;
+  if (svc_ctx->temporal_layering_mode == 3) {
+    svc_ctx->temporal_layers = 3;
+  } else if (svc_ctx->temporal_layering_mode == 2 ||
+             svc_ctx->temporal_layering_mode == 1) {
+    svc_ctx->temporal_layers = 2;
+  }
+
+  for (sl = 0; sl < VPX_SS_MAX_LAYERS; ++sl) {
+    si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM[sl];
+    si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl];
+  }
+  for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
+    for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
+      i = sl * svc_ctx->temporal_layers + tl;
+      si->svc_params.max_quantizers[i] = MAX_QUANTIZER;
+      si->svc_params.min_quantizers[i] = 0;
+    }
+  }
+
+  // Parse the option string stored by vpx_svc_set_options(); recognized
+  // options may replace the defaults assigned above.
+  res = parse_options(svc_ctx, si->options);
+  if (res != VPX_CODEC_OK) return res;
+
+  // Clamp the (possibly option-supplied) layer counts to supported ranges.
+  if (svc_ctx->spatial_layers < 1) svc_ctx->spatial_layers = 1;
+  if (svc_ctx->spatial_layers > VPX_SS_MAX_LAYERS)
+    svc_ctx->spatial_layers = VPX_SS_MAX_LAYERS;
+
+  if (svc_ctx->temporal_layers < 1) svc_ctx->temporal_layers = 1;
+  if (svc_ctx->temporal_layers > VPX_TS_MAX_LAYERS)
+    svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS;
+
+  if (svc_ctx->temporal_layers * svc_ctx->spatial_layers > VPX_MAX_LAYERS) {
+    svc_log(svc_ctx, SVC_LOG_ERROR,
+            "spatial layers * temporal layers (%d) exceeds the maximum number "
+            "of allowed layers of %d\n",
+            svc_ctx->spatial_layers * svc_ctx->temporal_layers,
+            (int) VPX_MAX_LAYERS);
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  assign_layer_bitrates(svc_ctx, enc_cfg);
+
+#if CONFIG_SPATIAL_SVC
+  for (i = 0; i < svc_ctx->spatial_layers; ++i)
+    enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i];
+#endif
+
+  if (svc_ctx->temporal_layers > 1) {
+    for (i = 0; i < svc_ctx->temporal_layers; ++i) {
+      enc_cfg->ts_target_bitrate[i] = enc_cfg->rc_target_bitrate /
+                                      svc_ctx->temporal_layers;
+      enc_cfg->ts_rate_decimator[i] = 1 << (svc_ctx->temporal_layers - 1 - i);
+    }
+  }
+
+  if (svc_ctx->threads) enc_cfg->g_threads = svc_ctx->threads;
+
+  // Modify encoder configuration
+  enc_cfg->ss_number_layers = svc_ctx->spatial_layers;
+  enc_cfg->ts_number_layers = svc_ctx->temporal_layers;
+
+  if (enc_cfg->rc_end_usage == VPX_CBR) {
+    enc_cfg->rc_resize_allowed = 0;
+    enc_cfg->rc_min_quantizer = 2;
+    enc_cfg->rc_max_quantizer = 56;
+    enc_cfg->rc_undershoot_pct = 50;
+    enc_cfg->rc_overshoot_pct = 50;
+    enc_cfg->rc_buf_initial_sz = 500;
+    enc_cfg->rc_buf_optimal_sz = 600;
+    enc_cfg->rc_buf_sz = 1000;
+    enc_cfg->rc_dropframe_thresh = 0;
+  }
+
+  if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0)
+    enc_cfg->g_error_resilient = 1;
+
+  // Initialize codec
+  res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, VPX_CODEC_USE_PSNR);
+  if (res != VPX_CODEC_OK) {
+    svc_log(svc_ctx, SVC_LOG_ERROR, "svc_enc_init error\n");
+    return res;
+  }
+  if (svc_ctx->spatial_layers > 1 || svc_ctx->temporal_layers > 1) {
+    vpx_codec_control(codec_ctx, VP9E_SET_SVC, 1);
+    vpx_codec_control(codec_ctx, VP9E_SET_SVC_PARAMETERS, &si->svc_params);
+  }
+  return VPX_CODEC_OK;
+}
+
+/**
+ * Encodes |rawimg| via vpx_codec_encode(), then drains the output packets to
+ * accumulate per-layer PSNR/SSE/size statistics in the SVC internals.
+ */
+vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx,
+                               vpx_codec_ctx_t *codec_ctx,
+                               struct vpx_image *rawimg,
+                               vpx_codec_pts_t pts,
+                               int64_t duration, int deadline) {
+  vpx_codec_err_t res;
+  vpx_codec_iter_t iter;
+  const vpx_codec_cx_pkt_t *cx_pkt;
+  SvcInternal_t *const si = get_svc_internal(svc_ctx);
+  if (svc_ctx == NULL || codec_ctx == NULL || si == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+
+  svc_log_reset(svc_ctx);
+
+  res = vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration, 0,
+                         deadline);
+  if (res != VPX_CODEC_OK) {
+    return res;
+  }
+  // Drain the encoder's output packets; only statistics packets are used here.
+  iter = NULL;
+  while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
+    switch (cx_pkt->kind) {
+#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
+#if CONFIG_SPATIAL_SVC
+      case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: {
+        int i;
+        for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+          int j;
+          svc_log(svc_ctx, SVC_LOG_DEBUG,
+                  "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
+                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                  si->psnr_pkt_received, i,
+                  cx_pkt->data.layer_psnr[i].psnr[0],
+                  cx_pkt->data.layer_psnr[i].psnr[1],
+                  cx_pkt->data.layer_psnr[i].psnr[2],
+                  cx_pkt->data.layer_psnr[i].psnr[3]);
+          svc_log(svc_ctx, SVC_LOG_DEBUG,
+                  "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
+                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                  si->psnr_pkt_received, i,
+                  cx_pkt->data.layer_psnr[i].sse[0],
+                  cx_pkt->data.layer_psnr[i].sse[1],
+                  cx_pkt->data.layer_psnr[i].sse[2],
+                  cx_pkt->data.layer_psnr[i].sse[3]);
+
+          for (j = 0; j < COMPONENTS; ++j) {
+            si->psnr_sum[i][j] +=
+                cx_pkt->data.layer_psnr[i].psnr[j];
+            si->sse_sum[i][j] += cx_pkt->data.layer_psnr[i].sse[j];
+          }
+        }
+        ++si->psnr_pkt_received;
+        break;
+      }
+      case VPX_CODEC_SPATIAL_SVC_LAYER_SIZES: {
+        int i;
+        for (i = 0; i < svc_ctx->spatial_layers; ++i)
+          si->bytes_sum[i] += cx_pkt->data.layer_sizes[i];
+        break;
+      }
+#endif
+#endif
+      case VPX_CODEC_PSNR_PKT:
+      {
+#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
+        int j;
+        svc_log(svc_ctx, SVC_LOG_DEBUG,
+                "frame: %d, layer: %d, PSNR(Total/Y/U/V): "
+                "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                si->psnr_pkt_received, 0,
+                cx_pkt->data.layer_psnr[0].psnr[0],
+                cx_pkt->data.layer_psnr[0].psnr[1],
+                cx_pkt->data.layer_psnr[0].psnr[2],
+                cx_pkt->data.layer_psnr[0].psnr[3]);
+        for (j = 0; j < COMPONENTS; ++j) {
+          si->psnr_sum[0][j] +=
+              cx_pkt->data.layer_psnr[0].psnr[j];
+          si->sse_sum[0][j] += cx_pkt->data.layer_psnr[0].sse[j];
+        }
+#endif
+      }
+      ++si->psnr_pkt_received;  // counted even if the guarded body is absent
+      break;
+      default: {
+        break;
+      }
+    }
+  }
+
+  return VPX_CODEC_OK;
+}
+
+// Returns the SVC internals' message buffer; NULL if none is available.
+const char *vpx_svc_get_message(const SvcContext *svc_ctx) {
+  const SvcInternal_t *const internal = get_const_svc_internal(svc_ctx);
+  return (svc_ctx && internal) ? internal->message_buffer : NULL;
+}
+
+// Converts an error ratio |d| to dB as -10*log10(d); returns 100 when d == 0.
+static double calc_psnr(double d) {
+  return (d == 0) ? 100 : -10.0 * log(d) / log(10.0);
+}
+
+// Logs the per-layer statistics accumulated by vpx_svc_encode(), clears the
+// accumulators, and returns the resulting message buffer.
+const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
+  int frame_count, layer, c;
+  uint32_t total_bytes = 0;
+  double norm[COMPONENTS], psnr_db[COMPONENTS], mse_val[COMPONENTS];
+  double luma_norm;
+  SvcInternal_t *const si = get_svc_internal(svc_ctx);
+
+  if (svc_ctx == NULL || si == NULL) return NULL;
+
+  svc_log_reset(svc_ctx);
+
+  frame_count = si->psnr_pkt_received;
+  if (frame_count <= 0) return vpx_svc_get_message(svc_ctx);
+
+  // SSE normalizers shared by every layer; the overall psnr calculation
+  // below is deduced from ffmpeg.c#print_report.
+  luma_norm = si->width * si->height * 255.0 * 255.0 * frame_count;
+  norm[0] = luma_norm * 1.5;          // total
+  norm[1] = luma_norm;                // Y
+  norm[2] = norm[3] = luma_norm / 4;  // U or V
+
+  svc_log(svc_ctx, SVC_LOG_INFO, "\n");
+  for (layer = 0; layer < svc_ctx->spatial_layers; ++layer) {
+    svc_log(svc_ctx, SVC_LOG_INFO,
+            "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n",
+            layer, (double)si->psnr_sum[layer][0] / frame_count,
+            (double)si->psnr_sum[layer][1] / frame_count,
+            (double)si->psnr_sum[layer][2] / frame_count,
+            (double)si->psnr_sum[layer][3] / frame_count,
+            si->bytes_sum[layer]);
+
+    for (c = 0; c < COMPONENTS; c++) {
+      psnr_db[c] = calc_psnr(si->sse_sum[layer][c] / norm[c]);
+      mse_val[c] = si->sse_sum[layer][c] * 255.0 * 255.0 / norm[c];
+    }
+    svc_log(svc_ctx, SVC_LOG_INFO,
+            "Layer %d Overall PSNR=[%2.3f, %2.3f, %2.3f, %2.3f]\n", layer,
+            psnr_db[0], psnr_db[1], psnr_db[2], psnr_db[3]);
+    svc_log(svc_ctx, SVC_LOG_INFO,
+            "Layer %d Overall MSE=[%2.3f, %2.3f, %2.3f, %2.3f]\n", layer,
+            mse_val[0], mse_val[1], mse_val[2], mse_val[3]);
+
+    total_bytes += si->bytes_sum[layer];
+    // Reset this layer's sums so the next dump starts from zero.
+    si->bytes_sum[layer] = 0;
+    for (c = 0; c < COMPONENTS; ++c) {
+      si->psnr_sum[layer][c] = 0;
+      si->sse_sum[layer][c] = 0;
+    }
+  }
+
+  // only display statistics once
+  si->psnr_pkt_received = 0;
+
+  svc_log(svc_ctx, SVC_LOG_INFO, "Total Bytes=[%u]\n", total_bytes);
+  return vpx_svc_get_message(svc_ctx);
+}
+
+// Frees |svc_ctx|'s SvcInternal_t, if any, and clears the pointer.
+void vpx_svc_release(SvcContext *svc_ctx) {
+  SvcInternal_t *internal;
+  if (svc_ctx == NULL) return;
+  // Read the pointer directly: get_svc_internal would needlessly allocate an
+  // SvcInternal_t when none exists yet.
+  internal = (SvcInternal_t *)svc_ctx->internal;
+  if (internal == NULL) return;
+  free(internal);
+  svc_ctx->internal = NULL;
+}
+
diff --git a/libs/libvpx/vpx/src/vpx_codec.c b/libs/libvpx/vpx/src/vpx_codec.c
new file mode 100644
index 0000000000..5a495ce814
--- /dev/null
+++ b/libs/libvpx/vpx/src/vpx_codec.c
@@ -0,0 +1,158 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*!\file
+ * \brief Provides the high level interface to wrap decoder algorithms.
+ *
+ */
+#include <stdarg.h>
+#include <stdlib.h>
+#include "vpx/vpx_integer.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+/* Stores |var| in ctx->err when ctx is non-NULL; yields the status either way. */
+#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var))
+
+int vpx_codec_version(void) {
+  return VERSION_PACKED;  /* the version as a single packed int constant */
+}
+
+
+const char *vpx_codec_version_str(void) {
+  return VERSION_STRING_NOSP;  /* version string constant */
+}
+
+
+const char *vpx_codec_version_extra_str(void) {
+  return VERSION_EXTRA;  /* extra version text constant */
+}
+
+/* Returns the interface's name, or a placeholder for a NULL interface. */
+const char *vpx_codec_iface_name(vpx_codec_iface_t *iface) {
+  return !iface ? "<invalid interface>" : iface->name;
+}
+
+/* Maps an error code to a static description string; values without a case
+ * below yield "Unrecognized error code". */
+const char *vpx_codec_err_to_string(vpx_codec_err_t  err) {
+  const char *text = "Unrecognized error code";
+
+  switch (err) {
+    case VPX_CODEC_OK: text = "Success"; break;
+    case VPX_CODEC_ERROR: text = "Unspecified internal error"; break;
+    case VPX_CODEC_MEM_ERROR: text = "Memory allocation error"; break;
+    case VPX_CODEC_ABI_MISMATCH: text = "ABI version mismatch"; break;
+    case VPX_CODEC_INCAPABLE:
+      text = "Codec does not implement requested capability";
+      break;
+    case VPX_CODEC_UNSUP_BITSTREAM:
+      text = "Bitstream not supported by this decoder";
+      break;
+    case VPX_CODEC_UNSUP_FEATURE:
+      text = "Bitstream required feature not supported by this decoder";
+      break;
+    case VPX_CODEC_CORRUPT_FRAME: text = "Corrupt frame detected"; break;
+    case VPX_CODEC_INVALID_PARAM: text = "Invalid parameter"; break;
+    case VPX_CODEC_LIST_END: text = "End of iterated list"; break;
+  }
+
+  return text;
+}
+
+/* Describes ctx->err as text; NULL ctx is treated as an invalid parameter. */
+const char *vpx_codec_error(vpx_codec_ctx_t  *ctx) {
+  return vpx_codec_err_to_string(ctx ? ctx->err : VPX_CODEC_INVALID_PARAM);
+}
+
+/* Detail string for ctx's current error; NULL without ctx or an error. */
+const char *vpx_codec_error_detail(vpx_codec_ctx_t  *ctx) {
+  if (!ctx || !ctx->err) return NULL;
+  if (ctx->priv) return ctx->priv->err_detail;
+  return ctx->err_detail;
+}
+
+
+/* Runs the interface's destroy hook and clears ctx's iface, name and priv
+ * pointers; the resulting status is also recorded in ctx->err. */
+vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) {
+  vpx_codec_err_t status = VPX_CODEC_OK;
+
+  if (!ctx) return VPX_CODEC_INVALID_PARAM;
+
+  if (ctx->iface && ctx->priv) {
+    ctx->iface->destroy((vpx_codec_alg_priv_t *)ctx->priv);
+    ctx->iface = NULL;
+    ctx->name = NULL;
+    ctx->priv = NULL;
+  } else {
+    status = VPX_CODEC_ERROR;
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+/* Returns the interface's capability bits; 0 for a NULL interface. */
+vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface) {
+  return !iface ? 0 : iface->caps;
+}
+
+
+/* Looks up |ctrl_id| in iface->ctrl_maps and calls the first entry that either
+ * matches or has ctrl_id 0, forwarding the variadic arguments. The status is
+ * also stored in ctx->err. */
+vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t  *ctx,
+                                   int               ctrl_id,
+                                   ...) {
+  vpx_codec_err_t status = VPX_CODEC_ERROR;
+
+  if (!ctx || !ctrl_id) {
+    status = VPX_CODEC_INVALID_PARAM;
+  } else if (ctx->iface && ctx->priv && ctx->iface->ctrl_maps) {
+    vpx_codec_ctrl_fn_map_t *map = ctx->iface->ctrl_maps;
+
+    /* Skip entries that neither match ctrl_id nor act as catch-all (id 0). */
+    while (map && map->fn && map->ctrl_id && map->ctrl_id != ctrl_id)
+      map++;
+
+    if (map && map->fn) {
+      va_list args;
+
+      va_start(args, ctrl_id);
+      status = map->fn((vpx_codec_alg_priv_t *)ctx->priv, args);
+      va_end(args);
+    }
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+/* Records |error| (and an optional printf-style detail message) in |info|; if
+ * info->setjmp is set, control is then transferred via longjmp(info->jmp). */
+void vpx_internal_error(struct vpx_internal_error_info *info,
+                        vpx_codec_err_t                 error,
+                        const char                     *fmt,
+                        ...) {
+  info->error_code = error;
+  info->has_detail = (fmt != NULL);
+
+  if (fmt != NULL) {
+    const size_t last = sizeof(info->detail) - 1;
+    va_list args;
+
+    va_start(args, fmt);
+    vsnprintf(info->detail, last, fmt, args);
+    va_end(args);
+    info->detail[last] = '\0';
+  }
+
+  if (info->setjmp)
+    longjmp(info->jmp, info->error_code);
+}
diff --git a/libs/libvpx/vpx/src/vpx_decoder.c b/libs/libvpx/vpx/src/vpx_decoder.c
new file mode 100644
index 0000000000..802d8edd8a
--- /dev/null
+++ b/libs/libvpx/vpx/src/vpx_decoder.c
@@ -0,0 +1,197 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*!\file
+ * \brief Provides the high level interface to wrap decoder algorithms.
+ *
+ */
+#include <string.h>
+#include "vpx/internal/vpx_codec_internal.h"
+/* Stores |var| in ctx->err when ctx is non-NULL; yields the status either way. */
+#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var))
+
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+  return (vpx_codec_alg_priv_t *)ctx->priv;  /* ctx must be non-NULL */
+}
+
+/* Checks the ABI versions and the requested flags against the interface's
+ * capabilities; on success zeroes *ctx, binds it and runs iface->init. */
+vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t      *ctx,
+                                       vpx_codec_iface_t    *iface,
+                                       const vpx_codec_dec_cfg_t *cfg,
+                                       vpx_codec_flags_t     flags,
+                                       int                   ver) {
+  vpx_codec_err_t status;
+
+  if (ver != VPX_DECODER_ABI_VERSION) {
+    status = VPX_CODEC_ABI_MISMATCH;
+  } else if (!ctx || !iface) {
+    status = VPX_CODEC_INVALID_PARAM;
+  } else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) {
+    status = VPX_CODEC_ABI_MISMATCH;
+  } else if (((flags & VPX_CODEC_USE_POSTPROC) &&
+              !(iface->caps & VPX_CODEC_CAP_POSTPROC)) ||
+             ((flags & VPX_CODEC_USE_ERROR_CONCEALMENT) &&
+              !(iface->caps & VPX_CODEC_CAP_ERROR_CONCEALMENT)) ||
+             ((flags & VPX_CODEC_USE_INPUT_FRAGMENTS) &&
+              !(iface->caps & VPX_CODEC_CAP_INPUT_FRAGMENTS)) ||
+             !(iface->caps & VPX_CODEC_CAP_DECODER)) {
+    status = VPX_CODEC_INCAPABLE;
+  } else {
+    memset(ctx, 0, sizeof(*ctx));
+    ctx->iface = iface;
+    ctx->name = iface->name;
+    ctx->priv = NULL;
+    ctx->init_flags = flags;
+    ctx->config.dec = cfg;
+
+    status = iface->init(ctx, NULL);
+    if (status) {
+      ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
+      vpx_codec_destroy(ctx);
+    }
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+
+/* Resets si->w/si->h to 0 ("unknown") and lets the interface's peek_si hook
+ * fill |si| from |data|; bad or undersized arguments are rejected up front. */
+vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t       *iface,
+                                           const uint8_t         *data,
+                                           unsigned int           data_sz,
+                                           vpx_codec_stream_info_t *si) {
+  vpx_codec_err_t status = VPX_CODEC_INVALID_PARAM;
+
+  if (iface && data && data_sz && si &&
+      si->sz >= sizeof(vpx_codec_stream_info_t)) {
+    /* Set default/unknown values */
+    si->w = 0;
+    si->h = 0;
+
+    status = iface->dec.peek_si(data, data_sz, si);
+  }
+
+  return status;
+}
+
+
+/* Resets si->w/si->h to 0, then asks the interface's get_si hook to fill |si|
+ * for a context that has iface and priv; the status is saved in ctx->err. */
+vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t         *ctx,
+                                          vpx_codec_stream_info_t *si) {
+  vpx_codec_err_t status = VPX_CODEC_ERROR;
+
+  if (!ctx || !si || si->sz < sizeof(vpx_codec_stream_info_t)) {
+    status = VPX_CODEC_INVALID_PARAM;
+  } else if (ctx->iface && ctx->priv) {
+    /* Set default/unknown values */
+    si->w = 0;
+    si->h = 0;
+
+    status = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+
+/* Forwards |data| to the interface's decode hook once the arguments pass the
+ * sanity checks; the status is also saved in ctx->err. */
+vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t    *ctx,
+                                 const uint8_t        *data,
+                                 unsigned int    data_sz,
+                                 void       *user_priv,
+                                 long        deadline) {
+  vpx_codec_err_t status = VPX_CODEC_ERROR;
+
+  /* data and data_sz must be both set or both empty: a NULL data pointer is
+   * only allowed together with a zero data_sz. */
+  if (!ctx || (data == NULL) != (data_sz == 0)) {
+    status = VPX_CODEC_INVALID_PARAM;
+  } else if (ctx->iface && ctx->priv) {
+    status = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz,
+                                    user_priv, deadline);
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+/* Returns what the interface's get_frame hook yields for |iter|, or NULL when
+ * ctx or iter is NULL or the context has no iface/priv. */
+vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t  *ctx,
+                                 vpx_codec_iter_t *iter) {
+  vpx_image_t *frame = NULL;
+
+  if (ctx && iter && ctx->iface && ctx->priv)
+    frame = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
+
+  return frame;
+}
+
+
+/* Stores |cb| and |user_priv| as the put-frame callback in ctx->priv; requires
+ * iface, priv and the VPX_CODEC_CAP_PUT_FRAME capability. */
+vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t             *ctx,
+                                                vpx_codec_put_frame_cb_fn_t  cb,
+                                                void                      *user_priv) {
+  vpx_codec_err_t status = VPX_CODEC_ERROR;
+
+  if (!ctx || !cb) {
+    status = VPX_CODEC_INVALID_PARAM;
+  } else if (ctx->iface && ctx->priv &&
+             (ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME)) {
+    ctx->priv->dec.put_frame_cb.u.put_frame = cb;
+    ctx->priv->dec.put_frame_cb.user_priv = user_priv;
+    status = VPX_CODEC_OK;
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+
+/* Stores |cb| and |user_priv| as the put-slice callback in ctx->priv; requires
+ * iface, priv and the VPX_CODEC_CAP_PUT_SLICE capability. */
+vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t             *ctx,
+                                                vpx_codec_put_slice_cb_fn_t  cb,
+                                                void                      *user_priv) {
+  vpx_codec_err_t status = VPX_CODEC_ERROR;
+
+  if (!ctx || !cb) {
+    status = VPX_CODEC_INVALID_PARAM;
+  } else if (ctx->iface && ctx->priv &&
+             (ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE)) {
+    ctx->priv->dec.put_slice_cb.u.put_slice = cb;
+    ctx->priv->dec.put_slice_cb.user_priv = user_priv;
+    status = VPX_CODEC_OK;
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+/* Hands the get/release frame-buffer callbacks to the interface's set_fb_fn
+ * hook; requires the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. */
+vpx_codec_err_t vpx_codec_set_frame_buffer_functions(
+    vpx_codec_ctx_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get,
+    vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  vpx_codec_err_t status = VPX_CODEC_ERROR;
+
+  if (!ctx || !cb_get || !cb_release) {
+    status = VPX_CODEC_INVALID_PARAM;
+  } else if (ctx->iface && ctx->priv &&
+             (ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+    status = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
+                                       cb_priv);
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
diff --git a/libs/libvpx/vpx/src/vpx_encoder.c b/libs/libvpx/vpx/src/vpx_encoder.c
new file mode 100644
index 0000000000..cd10c411ce
--- /dev/null
+++ b/libs/libvpx/vpx/src/vpx_encoder.c
@@ -0,0 +1,401 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*!\file
+ * \brief Provides the high level interface to wrap encoder algorithms.
+ *
+ */
+#include <limits.h>
+#include <string.h>
+#include "vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
+
+/* Stores var in ctx->err when ctx is non-NULL; evaluates to var either way. */
+#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var))
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+  return (vpx_codec_alg_priv_t *)ctx->priv;  /* priv cast to the alg type */
+}
+
+/* Validates the ABI version, arguments and interface capabilities, then
+ * runs the interface's init hook for a single encoder context. */
+vpx_codec_err_t vpx_codec_enc_init_ver(
+    vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface,
+    const vpx_codec_enc_cfg_t *cfg, vpx_codec_flags_t flags, int ver) {
+  vpx_codec_err_t res;
+
+  if (ver != VPX_ENCODER_ABI_VERSION) {
+    res = VPX_CODEC_ABI_MISMATCH;
+  } else if (!ctx || !iface || !cfg) {
+    res = VPX_CODEC_INVALID_PARAM;
+  } else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) {
+    res = VPX_CODEC_ABI_MISMATCH;
+  } else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) {
+    res = VPX_CODEC_INCAPABLE;
+  } else if ((flags & VPX_CODEC_USE_PSNR) &&
+             !(iface->caps & VPX_CODEC_CAP_PSNR)) {
+    res = VPX_CODEC_INCAPABLE;
+  } else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) &&
+             !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) {
+    res = VPX_CODEC_INCAPABLE;
+  } else {
+    ctx->iface = iface;
+    ctx->name = iface->name;
+    ctx->priv = NULL;
+    ctx->init_flags = flags;
+    ctx->config.enc = cfg;
+    res = ctx->iface->init(ctx, NULL);
+    /* On failure, copy out the error detail before tearing down. */
+    if (res) {
+      ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
+      vpx_codec_destroy(ctx);
+    }
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+/* Initializes num_enc encoder contexts for multi-resolution encoding; ctx,
+ * cfg and dsf are parallel arrays of num_enc entries. If a down-sampling
+ * factor is invalid or an init fails, all contexts set up so far are
+ * destroyed and the status is saved in ctx[0]. */
+vpx_codec_err_t vpx_codec_enc_init_multi_ver(
+    vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg,
+    int num_enc, vpx_codec_flags_t flags, vpx_rational_t *dsf, int ver) {
+  vpx_codec_err_t res = VPX_CODEC_OK;
+
+  if (ver != VPX_ENCODER_ABI_VERSION)
+    res = VPX_CODEC_ABI_MISMATCH;
+  else if (!ctx || !iface || !cfg || !dsf || num_enc > 16 || num_enc < 1)
+    res = VPX_CODEC_INVALID_PARAM;
+  else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION)
+    res = VPX_CODEC_ABI_MISMATCH;
+  else if (!(iface->caps & VPX_CODEC_CAP_ENCODER))
+    res = VPX_CODEC_INCAPABLE;
+  else if ((flags & VPX_CODEC_USE_PSNR)
+           && !(iface->caps & VPX_CODEC_CAP_PSNR))
+    res = VPX_CODEC_INCAPABLE;
+  else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION)
+           && !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION))
+    res = VPX_CODEC_INCAPABLE;
+  else if (!iface->enc.mr_get_mem_loc)
+    res = VPX_CODEC_INCAPABLE;  /* interface has no mr_get_mem_loc hook */
+  else {
+    int i;
+    void *mem_loc = NULL;
+
+    if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) {
+      for (i = 0; i < num_enc; i++) {
+        vpx_codec_priv_enc_mr_cfg_t mr_cfg;
+        const char *error_detail = NULL;
+
+        /* Validate down-sampling factor. */
+        if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 ||
+            dsf->den > dsf->num) {
+          res = VPX_CODEC_INVALID_PARAM;
+        } else {
+          mr_cfg.mr_low_res_mode_info = mem_loc;
+          mr_cfg.mr_total_resolutions = num_enc;
+          mr_cfg.mr_encoder_id = num_enc - 1 - i;
+          mr_cfg.mr_down_sampling_factor.num = dsf->num;
+          mr_cfg.mr_down_sampling_factor.den = dsf->den;
+
+          /* Force key-frame synchronization: encoders at higher
+           * resolution always use the frame_type chosen by the
+           * lowest-resolution encoder. */
+          if (mr_cfg.mr_encoder_id)
+            cfg->kf_mode = VPX_KF_DISABLED;
+
+          ctx->iface = iface;
+          ctx->name = iface->name;
+          ctx->priv = NULL;
+          ctx->init_flags = flags;
+          ctx->config.enc = cfg;
+          res = ctx->iface->init(ctx, &mr_cfg);
+          if (res) {
+            /* Destroy the context whose init just failed. */
+            error_detail = ctx->priv ? ctx->priv->err_detail : NULL;
+            ctx->err_detail = error_detail;
+            vpx_codec_destroy(ctx);
+          }
+        }
+
+        if (res) {
+          /* Unwind the contexts initialized so far; ends on ctx[0]. */
+          while (i) {
+            ctx--;
+            ctx->err_detail = error_detail;
+            vpx_codec_destroy(ctx);
+            i--;
+          }
+          break;
+        }
+
+        ctx++;
+        cfg++;
+        dsf++;
+      }
+      /* Success: step back onto the last context (failure left ctx[0]). */
+      if (!res) ctx--;
+    }
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+
+/* Copies the interface's default encoder configuration registered for
+ * `usage` into *cfg; fails when the interface has no such entry. */
+vpx_codec_err_t vpx_codec_enc_config_default(
+    vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg, unsigned int usage) {
+  int idx;
+
+  if (!iface || !cfg || usage > INT_MAX)
+    return VPX_CODEC_INVALID_PARAM;
+
+  if (!(iface->caps & VPX_CODEC_CAP_ENCODER))
+    return VPX_CODEC_INCAPABLE;
+
+  /* Scan the interface's usage -> default-config table for a match. */
+  for (idx = 0; idx < iface->enc.cfg_map_count; ++idx) {
+    vpx_codec_enc_cfg_map_t *const entry = iface->enc.cfg_maps + idx;
+
+    if (entry->usage != (int)usage)
+      continue;
+
+    *cfg = entry->cfg;
+    cfg->g_usage = usage;
+    return VPX_CODEC_OK;
+  }
+
+  /* No entry for the requested usage. */
+  return VPX_CODEC_INVALID_PARAM;
+}
+
+
+#if ARCH_X86 || ARCH_X86_64
+/* On X86, disable the x87 unit's internal 80 bit precision for better
+ * consistency with the SSE unit's 64 bit precision. INIT opens a block that
+ * RESTORE closes, so the two must be paired within one scope. */
+#include "vpx_ports/x86.h"
+#define FLOATING_POINT_INIT() do {\
+    unsigned short x87_orig_mode = x87_set_double_precision();
+#define FLOATING_POINT_RESTORE() \
+  x87_set_control_word(x87_orig_mode); }while(0)
+
+#else
+/* Elsewhere both are no-ops; (void) makes them proper prototypes. */
+static void FLOATING_POINT_INIT(void) {}
+static void FLOATING_POINT_RESTORE(void) {}
+#endif
+
+
+/* Encodes one frame. With more than one encoder, ctx (and img, when given)
+ * are arrays that are encoded last-to-first; status is saved in ctx[0]. */
+vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
+                                 vpx_codec_pts_t pts, unsigned long duration,
+                                 vpx_enc_frame_flags_t flags,
+                                 unsigned long deadline) {
+  vpx_codec_err_t res = VPX_CODEC_OK;
+
+  if (!ctx || (img && !duration))
+    res = VPX_CODEC_INVALID_PARAM;
+  else if (!ctx->iface || !ctx->priv)
+    res = VPX_CODEC_ERROR;
+  else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
+    res = VPX_CODEC_INCAPABLE;
+  else {
+    unsigned int num_enc = ctx->priv->enc.total_encoders;
+
+    /* Execute in a normalized floating point environment, if the platform
+     * requires it.
+     */
+    FLOATING_POINT_INIT();
+
+    if (num_enc == 1)
+      res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts,
+                                   duration, flags, deadline);
+    else {
+      /* Multi-resolution encoding:
+       * Encode multi-levels in reverse order. For example,
+       * if mr_total_resolutions = 3, first encode level 2,
+       * then encode level 1, and finally encode level 0.
+       */
+      int i;
+
+      /* Index from the caller's ctx/img instead of moving them, so that
+       * the status below is always stored in ctx[0], even when one of
+       * the encoders fails part-way through. */
+      for (i = num_enc - 1; i >= 0; i--) {
+        vpx_codec_ctx_t *const enc_ctx = ctx + i;
+        const vpx_image_t *const enc_img = img ? img + i : NULL;
+
+        res = enc_ctx->iface->enc.encode(get_alg_priv(enc_ctx), enc_img, pts,
+                                         duration, flags, deadline);
+        if (res)
+          break;
+      }
+    }
+
+    FLOATING_POINT_RESTORE();
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+/* Returns the encoder's next output packet via *iter; NULL if none/error. */
+const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx,
+                                                vpx_codec_iter_t *iter) {
+  const vpx_codec_cx_pkt_t *pkt = NULL;
+
+  if (ctx) {
+    if (!iter)
+      ctx->err = VPX_CODEC_INVALID_PARAM;
+    else if (!ctx->iface || !ctx->priv)
+      ctx->err = VPX_CODEC_ERROR;
+    else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
+      ctx->err = VPX_CODEC_INCAPABLE;
+    else
+      pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter);
+  }
+
+  if (pkt && pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+    // If the application has specified a destination area for the
+    // compressed data, and the codec has not placed the data there,
+    // and it fits (including both paddings), copy it.
+    vpx_codec_priv_t *const priv = ctx->priv;
+    char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf;
+
+    if (dst_buf &&
+        pkt->data.raw.buf != dst_buf &&
+        pkt->data.raw.sz + priv->enc.cx_data_pad_before +
+            priv->enc.cx_data_pad_after <= priv->enc.cx_data_dst_buf.sz) {
+      vpx_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt;
+
+      memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf,
+             pkt->data.raw.sz);
+      *modified_pkt = *pkt;
+      modified_pkt->data.raw.buf = dst_buf;
+      modified_pkt->data.raw.sz += priv->enc.cx_data_pad_before +
+                                       priv->enc.cx_data_pad_after;
+      pkt = modified_pkt;  // hand out the copy stored in priv instead
+    }
+
+    if (dst_buf == pkt->data.raw.buf) {  // packet now lives in dst_buf
+      priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz;
+      priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz;
+    }
+  }
+
+  return pkt;
+}
+
+
+/* Sets (or, with buf == NULL, clears) the output buffer and its padding. */
+vpx_codec_err_t vpx_codec_set_cx_data_buf(
+    vpx_codec_ctx_t *ctx, const vpx_fixed_buf_t *buf,
+    unsigned int pad_before, unsigned int pad_after) {
+  if (!ctx || !ctx->priv)
+    return VPX_CODEC_INVALID_PARAM;
+
+  if (!buf) {
+    ctx->priv->enc.cx_data_dst_buf.buf = NULL;
+    ctx->priv->enc.cx_data_dst_buf.sz = 0;
+    pad_before = 0;
+    pad_after = 0;
+  } else {
+    ctx->priv->enc.cx_data_dst_buf = *buf;
+  }
+  ctx->priv->enc.cx_data_pad_before = pad_before;
+  ctx->priv->enc.cx_data_pad_after = pad_after;
+
+  return VPX_CODEC_OK;
+}
+
+
+/* Preview image from the encoder; NULL if ctx is NULL or unusable. */
+const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx) {
+  if (!ctx)
+    return NULL;
+
+  if (!ctx->iface || !ctx->priv) {
+    ctx->err = VPX_CODEC_ERROR;
+    return NULL;
+  }
+  if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER) ||
+      !ctx->iface->enc.get_preview) {
+    ctx->err = VPX_CODEC_INCAPABLE;
+    return NULL;
+  }
+  return ctx->iface->enc.get_preview(get_alg_priv(ctx));
+}
+
+
+/* Global headers from the encoder; NULL if ctx is NULL or unusable. */
+vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx) {
+  if (!ctx)
+    return NULL;
+
+  if (!ctx->iface || !ctx->priv) {
+    ctx->err = VPX_CODEC_ERROR;
+    return NULL;
+  }
+  if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER) ||
+      !ctx->iface->enc.get_glob_hdrs) {
+    ctx->err = VPX_CODEC_INCAPABLE;
+    return NULL;
+  }
+  return ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx));
+}
+
+
+/* Forwards a new encoder configuration to the interface's cfg_set hook. */
+vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx,
+                                         const vpx_codec_enc_cfg_t *cfg) {
+  vpx_codec_err_t status = VPX_CODEC_INVALID_PARAM;
+
+  if (ctx && ctx->iface && ctx->priv && cfg) {
+    if (ctx->iface->caps & VPX_CODEC_CAP_ENCODER)
+      status = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg);
+    else
+      status = VPX_CODEC_INCAPABLE;
+  }
+  return SAVE_STATUS(ctx, status);
+}
+
+
+/* Appends *pkt to the list; returns 0 on success, 1 when the list is full. */
+int vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *list,
+                           const struct vpx_codec_cx_pkt *pkt) {
+  if (list->cnt >= list->max)
+    return 1;
+  list->pkts[list->cnt] = *pkt;
+  list->cnt++;
+  return 0;
+}
+
+
+/* Iterates over a packet list: returns the next packet and advances *iter,
+ * or returns NULL once every stored packet has been handed out. */
+const vpx_codec_cx_pkt_t *vpx_codec_pkt_list_get(
+    struct vpx_codec_pkt_list *list, vpx_codec_iter_t *iter) {
+  const vpx_codec_cx_pkt_t *cur;
+
+  /* A NULL iterator means "start from the first packet". */
+  if (*iter == NULL)
+    *iter = list->pkts;
+
+  cur = (const vpx_codec_cx_pkt_t *)*iter;
+  if ((size_t)(cur - list->pkts) >= list->cnt)
+    return NULL;
+
+  *iter = cur + 1;
+  return cur;
+}
diff --git a/libs/libvpx/vpx/src/vpx_image.c b/libs/libvpx/vpx/src/vpx_image.c
new file mode 100644
index 0000000000..9aae12c794
--- /dev/null
+++ b/libs/libvpx/vpx/src/vpx_image.c
@@ -0,0 +1,285 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vpx_image.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* Fills in (allocating if img is NULL) an image descriptor for fmt/d_w/d_h;
+ * allocates the pixel buffer too unless img_data is given. NULL on failure. */
+static vpx_image_t *img_alloc_helper(
+    vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, unsigned int d_h,
+    unsigned int buf_align, unsigned int stride_align,
+    unsigned char *img_data) {
+  unsigned int h, w, s, xcs, ycs, bps;
+  unsigned int stride_in_bytes;
+  int align;
+
+  /* Zero a caller-owned descriptor before any check can jump to fail. */
+  if (img) memset(img, 0, sizeof(vpx_image_t));
+
+  /* Treat align==0 like align==1 */
+  if (!buf_align)
+    buf_align = 1;
+
+  /* Validate alignment (must be power of 2) */
+  if (buf_align & (buf_align - 1))
+    goto fail;
+
+  /* Treat align==0 like align==1 */
+  if (!stride_align)
+    stride_align = 1;
+
+  /* Validate alignment (must be power of 2) */
+  if (stride_align & (stride_align - 1))
+    goto fail;
+
+  /* Get sample size for this format */
+  switch (fmt) {
+    case VPX_IMG_FMT_RGB32:
+    case VPX_IMG_FMT_RGB32_LE:
+    case VPX_IMG_FMT_ARGB:
+    case VPX_IMG_FMT_ARGB_LE:
+      bps = 32;
+      break;
+    case VPX_IMG_FMT_RGB24:
+    case VPX_IMG_FMT_BGR24:
+      bps = 24;
+      break;
+    case VPX_IMG_FMT_RGB565:
+    case VPX_IMG_FMT_RGB565_LE:
+    case VPX_IMG_FMT_RGB555:
+    case VPX_IMG_FMT_RGB555_LE:
+    case VPX_IMG_FMT_UYVY:
+    case VPX_IMG_FMT_YUY2:
+    case VPX_IMG_FMT_YVYU:
+      bps = 16;
+      break;
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_VPXI420:
+    case VPX_IMG_FMT_VPXYV12:
+      bps = 12;
+      break;
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I440:
+      bps = 16;
+      break;
+    case VPX_IMG_FMT_I444:
+      bps = 24;
+      break;
+    case VPX_IMG_FMT_I42016:
+      bps = 24;
+      break;
+    case VPX_IMG_FMT_I42216:
+    case VPX_IMG_FMT_I44016:
+      bps = 32;
+      break;
+    case VPX_IMG_FMT_I44416:
+      bps = 48;
+      break;
+    default:
+      bps = 16;
+      break;
+  }
+
+  /* Get chroma shift values for this format */
+  switch (fmt) {
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_VPXI420:
+    case VPX_IMG_FMT_VPXYV12:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I42216:
+      xcs = 1;
+      break;
+    default:
+      xcs = 0;
+      break;
+  }
+
+  switch (fmt) {
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_I440:
+    case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_VPXI420:
+    case VPX_IMG_FMT_VPXYV12:
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_I44016:
+      ycs = 1;
+      break;
+    default:
+      ycs = 0;
+      break;
+  }
+
+  /* Calculate storage sizes given the chroma subsampling */
+  align = (1 << xcs) - 1;
+  w = (d_w + align) & ~align;
+  align = (1 << ycs) - 1;
+  h = (d_h + align) & ~align;
+  s = (fmt & VPX_IMG_FMT_PLANAR) ? w : bps * w / 8;
+  s = (s + stride_align - 1) & ~(stride_align - 1);
+  stride_in_bytes = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;
+
+  /* Allocate the new image */
+  if (!img) {
+    img = (vpx_image_t *)calloc(1, sizeof(vpx_image_t));
+
+    if (!img)
+      goto fail;
+
+    img->self_allocd = 1;
+  }
+
+  img->img_data = img_data;
+
+  if (!img_data) {
+    const uint64_t alloc_size = (fmt & VPX_IMG_FMT_PLANAR) ?
+                                (uint64_t)h * s * bps / 8 : (uint64_t)h * s;
+
+    if (alloc_size != (size_t)alloc_size)
+      goto fail;
+
+    img->img_data = (uint8_t *)vpx_memalign(buf_align, (size_t)alloc_size);
+    img->img_data_owner = 1;
+  }
+
+  if (!img->img_data)
+    goto fail;
+
+  img->fmt = fmt;
+  img->bit_depth = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
+  img->w = w;
+  img->h = h;
+  img->x_chroma_shift = xcs;
+  img->y_chroma_shift = ycs;
+  img->bps = bps;
+
+  /* Calculate strides */
+  img->stride[VPX_PLANE_Y] = img->stride[VPX_PLANE_ALPHA] = stride_in_bytes;
+  img->stride[VPX_PLANE_U] = img->stride[VPX_PLANE_V] = stride_in_bytes >> xcs;
+
+  /* Default viewport to entire image */
+  if (!vpx_img_set_rect(img, 0, 0, d_w, d_h))
+    return img;
+
+fail:
+  vpx_img_free(img);
+  return NULL;
+}
+
+/* Allocates an image, using `align` for both buffer and stride alignment;
+ * passes no pixel data, so the helper allocates the pixel buffer itself. */
+vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt,
+                           unsigned int d_w, unsigned int d_h,
+                           unsigned int align) {
+  return img_alloc_helper(img, fmt, d_w, d_h, align, align, NULL);
+}
+
+/* Wraps caller-supplied pixel data (img_data) in an image descriptor; the
+ * helper only allocates pixels itself when img_data is NULL. */
+vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt,
+                          unsigned int d_w, unsigned int d_h,
+                          unsigned int stride_align,
+                          unsigned char *img_data) {
+  /* Passing buf_align = 1 means this function does not change the
+   * buffer alignment. */
+  return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, img_data);
+}
+
+/* Sets the viewport of img to (x, y, w, h) and recomputes plane pointers;
+ * returns 0, or -1 when the rectangle exceeds img->w x img->h. */
+int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
+                     unsigned int w, unsigned int h) {
+  unsigned char      *data;
+
+  /* Compare by subtraction so that x + w and y + h cannot wrap around. */
+  if (x <= img->w && w <= img->w - x && y <= img->h && h <= img->h - y) {
+    img->d_w = w;
+    img->d_h = h;
+
+    /* Calculate plane pointers */
+    if (!(img->fmt & VPX_IMG_FMT_PLANAR)) {
+      img->planes[VPX_PLANE_PACKED] =
+        img->img_data + x * img->bps / 8 + y * img->stride[VPX_PLANE_PACKED];
+    } else {
+      const int bytes_per_sample =
+          (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+      data = img->img_data;
+
+      if (img->fmt & VPX_IMG_FMT_HAS_ALPHA) {
+        img->planes[VPX_PLANE_ALPHA] =
+            data + x * bytes_per_sample + y * img->stride[VPX_PLANE_ALPHA];
+        data += img->h * img->stride[VPX_PLANE_ALPHA];
+      }
+
+      img->planes[VPX_PLANE_Y] = data + x * bytes_per_sample +
+          y * img->stride[VPX_PLANE_Y];
+      data += img->h * img->stride[VPX_PLANE_Y];
+
+      if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) {
+        img->planes[VPX_PLANE_U] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+        data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+        img->planes[VPX_PLANE_V] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
+      } else {
+        img->planes[VPX_PLANE_V] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
+        data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
+        img->planes[VPX_PLANE_U] =
+            data + (x >> img->x_chroma_shift) * bytes_per_sample +
+            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+      }
+    }
+    return 0;
+  }
+  return -1;
+}
+
+void vpx_img_flip(vpx_image_t *img) {
+  /* Note: In the pointer adjustment calculation below, we want the
+   * rhs to be promoted to a signed type. Section 6.3.1.8 of the ISO C99
+   * standard indicates that if the adjustment parameter is unsigned, the
+   * stride parameter will be promoted to unsigned, causing errors when
+   * the lhs is a larger type than the rhs.
+   */
+  img->planes[VPX_PLANE_Y] += (signed)(img->d_h - 1) * img->stride[VPX_PLANE_Y];
+  img->stride[VPX_PLANE_Y] = -img->stride[VPX_PLANE_Y];
+
+  img->planes[VPX_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1)
+                              * img->stride[VPX_PLANE_U];
+  img->stride[VPX_PLANE_U] = -img->stride[VPX_PLANE_U];
+
+  img->planes[VPX_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1)
+                              * img->stride[VPX_PLANE_V];
+  img->stride[VPX_PLANE_V] = -img->stride[VPX_PLANE_V];
+
+  img->planes[VPX_PLANE_ALPHA] += (signed)(img->d_h - 1) * img->stride[VPX_PLANE_ALPHA];
+  img->stride[VPX_PLANE_ALPHA] = -img->stride[VPX_PLANE_ALPHA];
+}
+
+/* Frees the pixel buffer when img_data_owner is set, then the descriptor
+ * itself when self_allocd is set. A NULL img is a no-op. */
+void vpx_img_free(vpx_image_t *img) {
+  if (!img) return;
+
+  if (img->img_data_owner && img->img_data) vpx_free(img->img_data);
+
+  if (img->self_allocd) free(img);
+}
diff --git a/libs/libvpx/vpx/src/vpx_psnr.c b/libs/libvpx/vpx/src/vpx_psnr.c
new file mode 100644
index 0000000000..05843acb61
--- /dev/null
+++ b/libs/libvpx/vpx/src/vpx_psnr.c
@@ -0,0 +1,24 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vpx/internal/vpx_psnr.h"
+
+#define MAX_PSNR 100.0
+
+/* Converts a sum of squared errors into a PSNR value capped at MAX_PSNR;
+ * a non-positive sse yields MAX_PSNR. */
+double vpx_sse_to_psnr(double samples, double peak, double sse) {
+  double psnr = MAX_PSNR;
+
+  if (sse > 0.0) psnr = 10.0 * log10(samples * peak * peak / sse);
+  return psnr > MAX_PSNR ? MAX_PSNR : psnr;
+}
diff --git a/libs/libvpx/vpx/svc_context.h b/libs/libvpx/vpx/svc_context.h
new file mode 100644
index 0000000000..5bc25189ba
--- /dev/null
+++ b/libs/libvpx/vpx/svc_context.h
@@ -0,0 +1,123 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/**
+ * SvcContext - input parameters and state to encode a multi-layered
+ * spatial SVC frame
+ */
+
+#ifndef VPX_SVC_CONTEXT_H_
+#define VPX_SVC_CONTEXT_H_
+
+#include "./vp8cx.h"
+#include "./vpx_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum SVC_LOG_LEVEL {  // type of SvcContext.log_level
+  SVC_LOG_ERROR,
+  SVC_LOG_INFO,
+  SVC_LOG_DEBUG
+} SVC_LOG_LEVEL;
+
+typedef struct {
+  // public interface to svc_command options
+  int spatial_layers;   // number of spatial layers
+  int temporal_layers;  // number of temporal layers
+  int temporal_layering_mode;  // NOTE(review): values not defined here
+  SVC_LOG_LEVEL log_level;  // amount of information to display
+  int log_print;  // when set, printf log messages instead of returning the
+                  // message with vpx_svc_get_message
+  int output_rc_stat;  // for outputting rc stats
+  int speed;  // speed setting for codec
+  int threads;  // presumably the encoder thread count -- TODO confirm
+  int aqmode;  // turns on aq-mode=3 (cyclic_refresh): 0=off, 1=on.
+  // private storage for vpx_svc_encode
+  void *internal;
+} SvcContext;
+
+#define OPTION_BUFFER_SIZE 1024
+#define COMPONENTS 4  // psnr & sse statistics maintained for total, y, u, v
+
+typedef struct SvcInternal {
+  char options[OPTION_BUFFER_SIZE];        // set by vpx_svc_set_options
+
+  // values extracted from option, quantizers
+  vpx_svc_extra_cfg_t svc_params;
+  int enable_auto_alt_ref[VPX_SS_MAX_LAYERS];  // one entry per spatial layer
+  int bitrates[VPX_SS_MAX_LAYERS];             // one entry per spatial layer
+
+  // accumulated statistics
+  double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS];   // total/Y/U/V
+  uint64_t sse_sum[VPX_SS_MAX_LAYERS][COMPONENTS];  // same layout as psnr_sum
+  uint32_t bytes_sum[VPX_SS_MAX_LAYERS];
+
+  // codec encoding values
+  int width;    // width of highest layer
+  int height;   // height of highest layer
+  int kf_dist;  // distance between keyframes
+
+  // state variables
+  int psnr_pkt_received;
+  int layer;
+  int use_multiple_frame_contexts;
+
+  char message_buffer[2048];  // presumably read by vpx_svc_get_message
+  vpx_codec_ctx_t *codec_ctx;
+} SvcInternal_t;
+
+/**
+ * Set SVC options
+ * options are supplied as a single string separated by spaces
+ * Format: encoding-mode=<i|ip|alt-ip|gf>
+ *         layers=<layer_count>
+ *         scaling-factors=<n1>/<d1>,<n2>/<d2>,...
+ *         quantizers=<q1>,<q2>,...
+ */
+vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options);
+
+/**
+ * initialize SVC encoding
+ */
+vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx,
+                             vpx_codec_ctx_t *codec_ctx,
+                             vpx_codec_iface_t *iface,
+                             vpx_codec_enc_cfg_t *cfg);
+/**
+ * encode a frame of video with multiple layers
+ */
+vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx,
+                               vpx_codec_ctx_t *codec_ctx,
+                               struct vpx_image *rawimg,
+                               vpx_codec_pts_t pts,
+                               int64_t duration, int deadline);
+
+/**
+ * finished with svc encoding, release allocated resources
+ */
+void vpx_svc_release(SvcContext *svc_ctx);
+
+/**
+ * dump accumulated statistics and reset accumulated values
+ */
+const char *vpx_svc_dump_statistics(SvcContext *svc_ctx);
+
+/**
+ *  get status message from previous encode
+ */
+const char *vpx_svc_get_message(const SvcContext *svc_ctx);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_SVC_CONTEXT_H_
diff --git a/libs/libvpx/vpx/vp8.h b/libs/libvpx/vpx/vp8.h
new file mode 100644
index 0000000000..8a035f9770
--- /dev/null
+++ b/libs/libvpx/vpx/vp8.h
@@ -0,0 +1,148 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*!\defgroup vp8 VP8
+ * \ingroup codecs
+ * VP8 is vpx's newest video compression algorithm that uses motion
+ * compensated prediction, Discrete Cosine Transform (DCT) coding of the
+ * prediction error signal and context dependent entropy coding techniques
+ * based on arithmetic principles. It features:
+ *  - YUV 4:2:0 image format
+ *  - Macro-block based coding (16x16 luma plus two 8x8 chroma)
+ *  - 1/4 (1/8) pixel accuracy motion compensated prediction
+ *  - 4x4 DCT transform
+ *  - 128 level linear quantizer
+ *  - In loop deblocking filter
+ *  - Context-based entropy coding
+ *
+ * @{
+ */
+/*!\file
+ * \brief Provides controls common to both the VP8 encoder and decoder.
+ */
+#ifndef VPX_VP8_H_
+#define VPX_VP8_H_
+
+#include "./vpx_codec.h"
+#include "./vpx_image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Control functions
+ *
+ * This set of enumerators defines the control functions of the VP8 interface
+ */
+enum vp8_com_control_id {
+  VP8_SET_REFERENCE           = 1,    /**< pass in an external frame into decoder to be used as reference frame */
+  VP8_COPY_REFERENCE          = 2,    /**< get a copy of reference frame from the decoder */
+  VP8_SET_POSTPROC            = 3,    /**< set the decoder's post processing settings  */
+  VP8_SET_DBG_COLOR_REF_FRAME = 4,    /**< set the reference frames to color for each macroblock */
+  VP8_SET_DBG_COLOR_MB_MODES  = 5,    /**< set which macro block modes to color */
+  VP8_SET_DBG_COLOR_B_MODES   = 6,    /**< set which blocks modes to color */
+  VP8_SET_DBG_DISPLAY_MV      = 7,    /**< set which motion vector modes to draw */
+
+  /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+)
+   * for its control ids. These should be migrated to something like the
+   * VP8_DECODER_CTRL_ID_START range next time we're ready to break the ABI.
+   */
+  VP9_GET_REFERENCE           = 128,  /**< get a pointer to a reference frame */
+  VP8_COMMON_CTRL_ID_MAX,
+  VP8_DECODER_CTRL_ID_START   = 256
+};
+
+/*!\brief post process flags
+ *
+ * This set of enumerators defines the VP8 decoder post processing flags
+ */
+enum vp8_postproc_level {
+  VP8_NOFILTERING             = 0,      /**< no post processing flag set */
+  VP8_DEBLOCK                 = 1 << 0, /**< deblock; see deblocking_level */
+  VP8_DEMACROBLOCK            = 1 << 1, /**< NOTE(review): not documented here */
+  VP8_ADDNOISE                = 1 << 2, /**< add noise; see noise_level */
+  VP8_DEBUG_TXT_FRAME_INFO    = 1 << 3, /**< print frame information */
+  VP8_DEBUG_TXT_MBLK_MODES    = 1 << 4, /**< print macro block modes over each macro block */
+  VP8_DEBUG_TXT_DC_DIFF       = 1 << 5, /**< print dc diff for each macro block */
+  VP8_DEBUG_TXT_RATE_INFO     = 1 << 6, /**< print video rate info (encoder only) */
+  VP8_MFQE                    = 1 << 10 /**< NOTE(review): not documented here */
+};
+
+/*!\brief post process settings
+ *
+ * This defines a structure that describes the post processing settings. For
+ * the best objective measure (using the PSNR metric) set post_proc_flag
+ * to VP8_DEBLOCK and deblocking_level to 1.
+ */
+
+typedef struct vp8_postproc_cfg {
+  int post_proc_flag;         /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */
+  int deblocking_level;       /**< the strength of deblocking, valid range [0, 16] */
+  int noise_level;            /**< the strength of additive noise, valid range [0, 16] */
+} vp8_postproc_cfg_t;
+
+/*!\brief reference frame type
+ *
+ * This set of enumerators defines the types of VP8 reference frames
+ */
+typedef enum vpx_ref_frame_type {
+  VP8_LAST_FRAME = 1,
+  VP8_GOLD_FRAME = 2,
+  VP8_ALTR_FRAME = 4
+} vpx_ref_frame_type_t;
+
+/*!\brief reference frame data struct
+ *
+ * Define the data struct to access vp8 reference frames.
+ */
+typedef struct vpx_ref_frame {
+  vpx_ref_frame_type_t  frame_type;   /**< which reference frame */
+  vpx_image_t           img;          /**< reference frame data in image format */
+} vpx_ref_frame_t;
+
+/*!\brief VP9 specific reference frame data struct
+ *
+ * Define the data struct to access vp9 reference frames.
+ */
+typedef struct vp9_ref_frame {
+  int idx; /**< frame index to get (input) */
+  vpx_image_t  img; /**< img structure to populate (output) */
+} vp9_ref_frame_t;
+
+/*!\cond */
+/*!\brief vp8 decoder control function parameter type
+ *
+ * Defines the data type that each VP8 decoder control function requires
+ */
+VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE,           vpx_ref_frame_t *)
+#define VPX_CTRL_VP8_SET_REFERENCE
+VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE,          vpx_ref_frame_t *)
+#define VPX_CTRL_VP8_COPY_REFERENCE
+VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC,            vp8_postproc_cfg_t *)
+#define VPX_CTRL_VP8_SET_POSTPROC
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int)
+#define VPX_CTRL_VP8_SET_DBG_COLOR_REF_FRAME
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES,  int)
+#define VPX_CTRL_VP8_SET_DBG_COLOR_MB_MODES
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES,   int)
+#define VPX_CTRL_VP8_SET_DBG_COLOR_B_MODES
+VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV,      int)
+#define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV
+VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE,           vp9_ref_frame_t *)
+#define VPX_CTRL_VP9_GET_REFERENCE
+
+/*!\endcond */
+/*! @} - end defgroup vp8 */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP8_H_
diff --git a/libs/libvpx/vpx/vp8cx.h b/libs/libvpx/vpx/vp8cx.h
new file mode 100644
index 0000000000..bd99c6dc13
--- /dev/null
+++ b/libs/libvpx/vpx/vp8cx.h
@@ -0,0 +1,818 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_VP8CX_H_
+#define VPX_VP8CX_H_
+
+/*!\defgroup vp8_encoder WebM VP8/VP9 Encoder
+ * \ingroup vp8
+ *
+ * @{
+ */
+#include "./vp8.h"
+#include "./vpx_encoder.h"
+
+/*!\file
+ * \brief Provides definitions for using VP8 or VP9 encoder algorithm within the
+ *        vpx Codec Interface.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\name Algorithm interface for VP8
+ *
+ * This interface provides the capability to encode raw VP8 streams.
+ * @{
+ */
+extern vpx_codec_iface_t  vpx_codec_vp8_cx_algo;  /**< VP8 encoder interface object */
+extern vpx_codec_iface_t *vpx_codec_vp8_cx(void); /**< accessor returning a VP8 encoder interface pointer */
+/*!@} - end algorithm interface member group*/
+
+/*!\name Algorithm interface for VP9
+ *
+ * This interface provides the capability to encode raw VP9 streams.
+ * @{
+ */
+extern vpx_codec_iface_t  vpx_codec_vp9_cx_algo;  /**< VP9 encoder interface object */
+extern vpx_codec_iface_t *vpx_codec_vp9_cx(void); /**< accessor returning a VP9 encoder interface pointer */
+/*!@} - end algorithm interface member group*/
+
+/*!\name Algorithm interface for VP10
+ *
+ * This interface provides the capability to encode raw VP10 streams.
+ * @{
+ */
+extern vpx_codec_iface_t  vpx_codec_vp10_cx_algo;  /**< VP10 encoder interface object */
+extern vpx_codec_iface_t *vpx_codec_vp10_cx(void); /**< accessor returning a VP10 encoder interface pointer */
+/*!@} - end algorithm interface member group*/
+
+/*
+ * Algorithm Flags
+ */
+
+/*!\brief Don't reference the last frame
+ *
+ * When this flag is set, the encoder will not use the last frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * last frame or not automatically.
+ */
+#define VP8_EFLAG_NO_REF_LAST      (1<<16)
+
+
+/*!\brief Don't reference the golden frame
+ *
+ * When this flag is set, the encoder will not use the golden frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * golden frame or not automatically.
+ */
+#define VP8_EFLAG_NO_REF_GF        (1<<17)
+
+
+/*!\brief Don't reference the alternate reference frame
+ *
+ * When this flag is set, the encoder will not use the alt ref frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * alt ref frame or not automatically.
+ */
+#define VP8_EFLAG_NO_REF_ARF       (1<<21)
+
+
+/*!\brief Don't update the last frame
+ *
+ * When this flag is set, the encoder will not update the last frame with
+ * the contents of the current frame.
+ */
+#define VP8_EFLAG_NO_UPD_LAST      (1<<18)
+
+
+/*!\brief Don't update the golden frame
+ *
+ * When this flag is set, the encoder will not update the golden frame with
+ * the contents of the current frame.
+ */
+#define VP8_EFLAG_NO_UPD_GF        (1<<22)
+
+
+/*!\brief Don't update the alternate reference frame
+ *
+ * When this flag is set, the encoder will not update the alt ref frame with
+ * the contents of the current frame.
+ */
+#define VP8_EFLAG_NO_UPD_ARF       (1<<23)
+
+
+/*!\brief Force golden frame update
+ *
+ * When this flag is set, the encoder copies the contents of the current frame
+ * to the golden frame buffer.
+ */
+#define VP8_EFLAG_FORCE_GF         (1<<19)
+
+
+/*!\brief Force alternate reference frame update
+ *
+ * When this flag is set, the encoder copies the contents of the current frame
+ * to the alternate reference frame buffer.
+ */
+#define VP8_EFLAG_FORCE_ARF        (1<<24)
+
+
+/*!\brief Disable entropy update
+ *
+ * When this flag is set, the encoder will not update its internal entropy
+ * model based on the entropy of this frame.
+ */
+#define VP8_EFLAG_NO_UPD_ENTROPY   (1<<20)
+
+
+/*!\brief VPx encoder control functions
+ *
+ * This set of enumerators defines the control functions available for the VPx
+ * encoder interface.
+ *
+ * \sa #vpx_codec_control
+ */
+enum vp8e_enc_control_id {
+  /*!\brief Codec control function to pass an ROI map to encoder.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_ROI_MAP           = 8,
+
+  /*!\brief Codec control function to pass an Active map to encoder.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_ACTIVEMAP,
+
+  /*!\brief Codec control function to set encoder scaling mode.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_SCALEMODE         = 11,
+
+  /*!\brief Codec control function to set encoder internal speed settings.
+   *
+   * Changes in this value influence, among others, the encoder's selection
+   * of motion estimation methods. Values greater than 0 will increase encoder
+   * speed at the expense of quality.
+   *
+   * \note Valid range for VP8: -16..16
+   * \note Valid range for VP9: -8..8
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_CPUUSED           = 13,
+
+  /*!\brief Codec control function to enable automatic setting and use of alt-ref frames.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_ENABLEAUTOALTREF,
+
+  /*!\brief control function to set noise sensitivity
+   *
+   * 0: off, 1: OnYOnly, 2: OnYUV,
+   * 3: OnYUVAggressive, 4: Adaptive
+   *
+   * Supported in codecs: VP8
+   */
+  VP8E_SET_NOISE_SENSITIVITY,
+
+  /*!\brief Codec control function to set sharpness.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_SHARPNESS,
+
+  /*!\brief Codec control function to set the threshold for MBs treated static.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_STATIC_THRESHOLD,
+
+  /*!\brief Codec control function to set the number of token partitions.
+   *
+   * Supported in codecs: VP8
+   */
+  VP8E_SET_TOKEN_PARTITIONS,
+
+  /*!\brief Codec control function to get last quantizer chosen by the encoder.
+   *
+   * Return value uses internal quantizer scale defined by the codec.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_GET_LAST_QUANTIZER,
+
+  /*!\brief Codec control function to get last quantizer chosen by the encoder.
+   *
+   * Return value uses the 0..63 scale as used by the rc_*_quantizer config
+   * parameters.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_GET_LAST_QUANTIZER_64,
+
+  /*!\brief Codec control function to set the max no of frames to create arf.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_ARNR_MAXFRAMES,
+
+  /*!\brief Codec control function to set the filter strength for the arf.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_ARNR_STRENGTH,
+
+  /*!\deprecated control function to set the filter type to use for the arf. */
+  VP8E_SET_ARNR_TYPE,
+
+  /*!\brief Codec control function to set visual tuning.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_TUNING,
+
+  /*!\brief Codec control function to set constrained quality level.
+   *
+   * \attention For this value to be used vpx_codec_enc_cfg_t::rc_end_usage
+   *            must be set to #VPX_CQ.
+   * \note Valid range: 0..63
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_CQ_LEVEL,
+
+  /*!\brief Codec control function to set Max data rate for Intra frames.
+   *
+   * This value controls additional clamping on the maximum size of a
+   * keyframe. It is expressed as a percentage of the average
+   * per-frame bitrate, with the special (and default) value 0 meaning
+   * unlimited, or no additional clamping beyond the codec's built-in
+   * algorithm.
+   *
+   * For example, to allocate no more than 4.5 frames worth of bitrate
+   * to a keyframe, set this to 450.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_SET_MAX_INTRA_BITRATE_PCT,
+
+  /*!\brief Codec control function to set reference and update frame flags.
+   *
+   *  Supported in codecs: VP8
+   */
+  VP8E_SET_FRAME_FLAGS,
+
+  /*!\brief Codec control function to set max data rate for Inter frames.
+   *
+   * This value controls additional clamping on the maximum size of an
+   * inter frame. It is expressed as a percentage of the average
+   * per-frame bitrate, with the special (and default) value 0 meaning
+   * unlimited, or no additional clamping beyond the codec's built-in
+   * algorithm.
+   *
+   * For example, to allow no more than 4.5 frames worth of bitrate
+   * to an inter frame, set this to 450.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_MAX_INTER_BITRATE_PCT,
+
+  /*!\brief Boost percentage for Golden Frame in CBR mode.
+   *
+   * This value controls the amount of boost given to Golden Frame in
+   * CBR mode. It is expressed as a percentage of the average
+   * per-frame bitrate, with the special (and default) value 0 meaning
+   * the feature is off, i.e., no golden frame boost in CBR mode and
+   * average bitrate target is used.
+   *
+   * For example, to allow 100% more bits, i.e, 2X, in a golden frame
+   * than average frame, set this to 100.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_GF_CBR_BOOST_PCT,
+
+  /*!\brief Codec control function to set the temporal layer id.
+   *
+   * For temporal scalability: this control allows the application to set the
+   * layer id for each frame to be encoded. Note that this control must be set
+   * for every frame prior to encoding. The usage of this control function
+   * supersedes the internal temporal pattern counter, which is now deprecated.
+   *
+   * Supported in codecs: VP8
+   */
+  VP8E_SET_TEMPORAL_LAYER_ID,
+
+  /*!\brief Codec control function to set encoder screen content mode.
+   *
+   * 0: off, 1: On, 2: On with more aggressive rate control.
+   *
+   * Supported in codecs: VP8
+   */
+  VP8E_SET_SCREEN_CONTENT_MODE,
+
+  /*!\brief Codec control function to set lossless encoding mode.
+   *
+   * VP9 can operate in lossless encoding mode, in which the bitstream
+   * produced will be able to decode and reconstruct a perfect copy of
+   * input source. This control function provides a means to switch encoder
+   * into lossless coding mode(1) or normal coding mode(0) that may be lossy.
+   *                          0 = lossy coding mode
+   *                          1 = lossless coding mode
+   *
+   *  By default, encoder operates in normal coding mode (may be lossy).
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_LOSSLESS,
+
+  /*!\brief Codec control function to set number of tile columns.
+   *
+   * In encoding and decoding, VP9 allows an input image frame to be partitioned
+   * into separated vertical tile columns, which can be encoded or decoded
+   * independently. This enables easy implementation of parallel encoding and
+   * decoding. This control requests the encoder to use column tiles in
+   * encoding an input frame, with number of tile columns (in Log2 unit) as
+   * the parameter:
+   *             0 = 1 tile column
+   *             1 = 2 tile columns
+   *             2 = 4 tile columns
+   *             .....
+   *             n = 2**n tile columns
+   * The requested tile columns will be capped by encoder based on image size
+   * limitation (The minimum width of a tile column is 256 pixel, the maximum
+   * is 4096).
+   *
+   * By default, the value is 0, i.e. one single column tile for entire image.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_TILE_COLUMNS,
+
+  /*!\brief Codec control function to set number of tile rows.
+   *
+   * In encoding and decoding, VP9 allows an input image frame to be partitioned
+   * into separated horizontal tile rows. Tile rows are encoded or decoded
+   * sequentially. Even though encoding/decoding of later tile rows depends on
+   * earlier ones, this allows the encoder to output data packets for tile rows
+   * prior to completely processing all tile rows in a frame, thereby reducing
+   * the latency in processing between input and output. The parameter
+   * for this control describes the number of tile rows, which has a valid
+   * range [0, 2]:
+   *            0 = 1 tile row
+   *            1 = 2 tile rows
+   *            2 = 4 tile rows
+   *
+   * By default, the value is 0, i.e. one single row tile for entire image.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_TILE_ROWS,
+
+  /*!\brief Codec control function to enable frame parallel decoding feature.
+   *
+   * VP9 has a bitstream feature to reduce decoding dependency between frames
+   * by turning off backward update of probability context used in encoding
+   * and decoding. This allows staged parallel processing of more than one
+   * video frame in the decoder. This control function provides a means to
+   * turn this feature on or off for bitstreams produced by encoder.
+   *
+   * By default, this feature is off.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_FRAME_PARALLEL_DECODING,
+
+  /*!\brief Codec control function to set adaptive quantization mode.
+   *
+   * VP9 has a segment based feature that allows encoder to adaptively change
+   * quantization parameter for each segment within a frame to improve the
+   * subjective quality. This control makes encoder operate in one of the
+   * several AQ_modes supported.
+   *
+   * By default, encoder operates with AQ_Mode 0(adaptive quantization off).
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_AQ_MODE,
+
+  /*!\brief Codec control function to enable/disable periodic Q boost.
+   *
+   * One VP9 encoder speed feature is to enable quality boost by lowering
+   * frame level Q periodically. This control function provides a means to
+   * turn on/off this feature.
+   *               0 = off
+   *               1 = on
+   *
+   * By default, the encoder is allowed to use this feature for appropriate
+   * encoding modes.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_FRAME_PERIODIC_BOOST,
+
+  /*!\brief Codec control function to set noise sensitivity.
+   *
+   *  0: off, 1: On(YOnly)
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_NOISE_SENSITIVITY,
+
+  /*!\brief Codec control function to turn on/off SVC in encoder.
+   * \note Return value is VPX_CODEC_INVALID_PARAM if the encoder does not
+   *       support SVC in its current encoding mode
+   *  0: off, 1: on
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_SVC,
+
+  /*!\brief Codec control function to set parameters for SVC.
+   * \note Parameters contain min_q, max_q, scaling factor for each of the
+   *       SVC layers.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_SVC_PARAMETERS,
+
+  /*!\brief Codec control function to set svc layer for spatial and temporal.
+   * \note Valid ranges: 0..#vpx_codec_enc_cfg::ss_number_layers for spatial
+   *                     layer and 0..#vpx_codec_enc_cfg::ts_number_layers for
+   *                     temporal layer.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_SVC_LAYER_ID,
+
+  /*!\brief Codec control function to set content type.
+   * \note Valid parameter range:
+   *              VP9E_CONTENT_DEFAULT = Regular video content (Default)
+   *              VP9E_CONTENT_SCREEN  = Screen capture content
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_TUNE_CONTENT,
+
+  /*!\brief Codec control function to get svc layer ID.
+   * \note The layer ID returned is for the data packet from the registered
+   *       callback function.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_SVC_LAYER_ID,
+
+  /*!\brief Codec control function to register callback to get per layer packet.
+   * \note Parameter for this control function is a structure with a callback
+   *       function and a pointer to private data used by the callback.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_REGISTER_CX_CALLBACK,
+
+  /*!\brief Codec control function to set color space info.
+   * \note Valid ranges: 0..7, default is "UNKNOWN".
+   *                     0 = UNKNOWN,
+   *                     1 = BT_601
+   *                     2 = BT_709
+   *                     3 = SMPTE_170
+   *                     4 = SMPTE_240
+   *                     5 = BT_2020
+   *                     6 = RESERVED
+   *                     7 = SRGB
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_COLOR_SPACE,
+
+  /*!\brief Codec control function to set temporal layering mode.
+   * \note Valid ranges: 0..3, default is "0" (VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING).
+   *                     0 = VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING
+   *                     1 = VP9E_TEMPORAL_LAYERING_MODE_BYPASS
+   *                     2 = VP9E_TEMPORAL_LAYERING_MODE_0101
+   *                     3 = VP9E_TEMPORAL_LAYERING_MODE_0212
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_TEMPORAL_LAYERING_MODE,
+
+  /*!\brief Codec control function to set minimum interval between GF/ARF frames
+   *
+   * By default the value is set as 4.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_MIN_GF_INTERVAL,
+
+  /*!\brief Codec control function to set maximum interval between GF/ARF frames
+   *
+   * By default the value is set as 16.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_MAX_GF_INTERVAL,
+
+  /*!\brief Codec control function to get an Active map back from the encoder.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_ACTIVEMAP,
+
+  /*!\brief Codec control function to set color range bit.
+   * \note Valid ranges: 0..1, default is 0
+   *                     0 = Limited range (16..235 or HBD equivalent)
+   *                     1 = Full range (0..255 or HBD equivalent)
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_COLOR_RANGE,
+
+  /*!\brief Codec control function to set the frame flags and buffer indices
+   * for spatial layers. The frame flags and buffer indices are set using the
+   * struct #vpx_svc_ref_frame_config defined below.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_SVC_REF_FRAME_CONFIG,
+
+  /*!\brief Codec control function to set intended rendering image size.
+   *
+   * By default, this is identical to the image size in pixels.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_RENDER_SIZE,
+};
+
+/*!\brief vpx 1-D scaling mode
+ *
+ * This set of constants defines the 1-D vpx scaling modes
+ */
+typedef enum vpx_scaling_mode_1d {
+  VP8E_NORMAL      = 0,
+  VP8E_FOURFIVE    = 1,
+  VP8E_THREEFIVE   = 2,
+  VP8E_ONETWO      = 3
+} VPX_SCALING_MODE;
+
+/*!\brief Temporal layering mode enum for VP9 SVC.
+ *
+ * These enumerators define the different temporal layering modes.
+ * Supported codecs: VP9 (in SVC mode)
+ *
+ */
+typedef enum vp9e_temporal_layering_mode {
+  /*!\brief No temporal layering.
+   * Used when only spatial layering is used.
+   */
+  VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING   = 0,
+
+  /*!\brief Bypass mode.
+   * Used when the application needs to control temporal layering.
+   * This will only work when the number of spatial layers equals 1.
+   */
+  VP9E_TEMPORAL_LAYERING_MODE_BYPASS       = 1,
+
+  /*!\brief 0-1-0-1... temporal layering scheme with two temporal layers.
+   */
+  VP9E_TEMPORAL_LAYERING_MODE_0101         = 2,
+
+  /*!\brief 0-2-1-2... temporal layering scheme with three temporal layers.
+   */
+  VP9E_TEMPORAL_LAYERING_MODE_0212         = 3
+} VP9E_TEMPORAL_LAYERING_MODE;
+
+/*!\brief  vpx region of interest map
+ *
+ * This defines the data structure for the region of interest map; it is the
+ * parameter type of the #VP8E_SET_ROI_MAP control.
+ */
+
+typedef struct vpx_roi_map {
+  /*! An id between 0 and 3 for each 16x16 region within a frame. */
+  unsigned char *roi_map;
+  unsigned int rows;       /**< Number of rows. */
+  unsigned int cols;       /**< Number of columns. */
+  // TODO(paulwilkins): broken for VP9 which has 8 segments
+  // q and loop filter deltas for each segment
+  // (see MAX_MB_SEGMENTS)
+  int delta_q[4];          /**< Quantizer deltas. */
+  int delta_lf[4];         /**< Loop filter deltas. */
+  /*! Static breakout threshold for each segment. */
+  unsigned int static_threshold[4];
+} vpx_roi_map_t;
+
+/*!\brief  vpx active region map
+ *
+ * This defines the data structure for the active region map; it is the
+ * parameter type of the #VP8E_SET_ACTIVEMAP and #VP9E_GET_ACTIVEMAP controls.
+ */
+
+
+typedef struct vpx_active_map {
+  unsigned char  *active_map; /**< specifies on (1) or off (0) for each 16x16 region within a frame */
+  unsigned int    rows;       /**< number of rows */
+  unsigned int    cols;       /**< number of cols */
+} vpx_active_map_t;
+
+/*!\brief  vpx image scaling mode
+ *
+ * This defines the data structure for image scaling mode; it is the
+ * parameter type of the #VP8E_SET_SCALEMODE control.
+ */
+typedef struct vpx_scaling_mode {
+  VPX_SCALING_MODE    h_scaling_mode;  /**< horizontal scaling mode */
+  VPX_SCALING_MODE    v_scaling_mode;  /**< vertical scaling mode   */
+} vpx_scaling_mode_t;
+
+/*!\brief VP8 token partition mode
+ *
+ * This defines VP8 partitioning mode for compressed data, i.e., the number of
+ * sub-streams in the bitstream. Used for parallelized decoding.
+ *
+ */
+
+typedef enum {
+  VP8_ONE_TOKENPARTITION   = 0,  /**< one partition */
+  VP8_TWO_TOKENPARTITION   = 1,  /**< two partitions */
+  VP8_FOUR_TOKENPARTITION  = 2,  /**< four partitions */
+  VP8_EIGHT_TOKENPARTITION = 3   /**< eight partitions */
+} vp8e_token_partitions;
+
+/*!\brief VP9 encoder content type */
+typedef enum {
+  VP9E_CONTENT_DEFAULT,  /**< regular video content (default) */
+  VP9E_CONTENT_SCREEN,   /**< screen capture content */
+  VP9E_CONTENT_INVALID
+} vp9e_tune_content;
+
+/*!\brief VP8 model tuning parameters
+ *
+ * Changes the encoder to tune for certain types of input material.
+ * Values are passed (as int) to the #VP8E_SET_TUNING control.
+ */
+typedef enum {
+  VP8_TUNE_PSNR,
+  VP8_TUNE_SSIM
+} vp8e_tuning;
+
+/*!\brief  vp9 svc layer parameters
+ *
+ * This defines the spatial and temporal layer id numbers for svc encoding.
+ * This is used with the #VP9E_SET_SVC_LAYER_ID control to set the spatial and
+ * temporal layer id for the current frame; the same type is the parameter of
+ * the #VP9E_GET_SVC_LAYER_ID control.
+ */
+typedef struct vpx_svc_layer_id {
+  int spatial_layer_id;       /**< Spatial layer id number. */
+  int temporal_layer_id;      /**< Temporal layer id number. */
+} vpx_svc_layer_id_t;
+
+/*!\brief  vp9 svc frame flag parameters.
+ *
+ * This defines the frame flags and buffer indices for each spatial layer for
+ * svc encoding.
+ * This is used with the #VP9E_SET_SVC_REF_FRAME_CONFIG control to set frame
+ * flags and buffer indices for each spatial layer for the current (super)frame.
+ * NOTE(review): the arrays are described as per spatial layer yet are sized
+ * with VPX_TS_MAX_LAYERS (by name a temporal-layer limit) - confirm intended. */
+typedef struct vpx_svc_ref_frame_config {
+  int frame_flags[VPX_TS_MAX_LAYERS];  /**< Frame flags. */
+  int lst_fb_idx[VPX_TS_MAX_LAYERS];  /**< Last buffer index. */
+  int gld_fb_idx[VPX_TS_MAX_LAYERS];  /**< Golden buffer index. */
+  int alt_fb_idx[VPX_TS_MAX_LAYERS];  /**< Altref buffer index. */
+} vpx_svc_ref_frame_config_t;
+
+/*!\cond */
+/*!\brief VP8/VP9 encoder control function parameter types
+ *
+ * Defines the data types that the VP8E/VP9E control functions take. The id
+ * given to VPX_CTRL_USE_TYPE must be the enumerator's exact name. Note that
+ * additional common controls are defined in vp8.h
+ */
+
+VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS,        int)
+#define VPX_CTRL_VP8E_SET_FRAME_FLAGS
+VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID,  int)
+#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID
+VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP,            vpx_roi_map_t *)
+#define VPX_CTRL_VP8E_SET_ROI_MAP
+VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP,          vpx_active_map_t *)
+#define VPX_CTRL_VP8E_SET_ACTIVEMAP
+VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE,          vpx_scaling_mode_t *)
+#define VPX_CTRL_VP8E_SET_SCALEMODE
+
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC,                int)
+#define VPX_CTRL_VP9E_SET_SVC
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS,     void *)
+#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS
+VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK,   void *)
+#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID,       vpx_svc_layer_id_t *)
+#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID
+
+VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED,            int)
+#define VPX_CTRL_VP8E_SET_CPUUSED
+VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF,   unsigned int)
+#define VPX_CTRL_VP8E_SET_ENABLEAUTOALTREF
+VPX_CTRL_USE_TYPE(VP8E_SET_NOISE_SENSITIVITY,  unsigned int)
+#define VPX_CTRL_VP8E_SET_NOISE_SENSITIVITY
+VPX_CTRL_USE_TYPE(VP8E_SET_SHARPNESS,          unsigned int)
+#define VPX_CTRL_VP8E_SET_SHARPNESS
+VPX_CTRL_USE_TYPE(VP8E_SET_STATIC_THRESHOLD,   unsigned int)
+#define VPX_CTRL_VP8E_SET_STATIC_THRESHOLD
+VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS,   int) /* vp8e_token_partitions */
+#define VPX_CTRL_VP8E_SET_TOKEN_PARTITIONS
+
+VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES,     unsigned int)
+#define VPX_CTRL_VP8E_SET_ARNR_MAXFRAMES
+VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH,     unsigned int)
+#define VPX_CTRL_VP8E_SET_ARNR_STRENGTH
+VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_SET_ARNR_TYPE,     unsigned int)
+#define VPX_CTRL_VP8E_SET_ARNR_TYPE
+VPX_CTRL_USE_TYPE(VP8E_SET_TUNING,             int) /* vp8e_tuning */
+#define VPX_CTRL_VP8E_SET_TUNING
+VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL,      unsigned int)
+#define VPX_CTRL_VP8E_SET_CQ_LEVEL
+
+VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS,  int)
+#define VPX_CTRL_VP9E_SET_TILE_COLUMNS
+VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS,  int)
+#define VPX_CTRL_VP9E_SET_TILE_ROWS
+
+VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER,     int *)
+#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER
+VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64,  int *)
+#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64
+VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID,  vpx_svc_layer_id_t *)
+#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID
+
+VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
+#define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT
+VPX_CTRL_USE_TYPE(VP9E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
+#define VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT
+VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int) /* legacy misnamed wrapper, kept for source compatibility */
+#define VPX_CTRL_VP8E_SET_MAX_INTER_BITRATE_PCT
+VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int)
+#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE
+VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int)
+#define VPX_CTRL_VP9E_SET_GF_CBR_BOOST_PCT
+
+VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
+#define VPX_CTRL_VP9E_SET_LOSSLESS
+
+VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int)
+#define VPX_CTRL_VP9E_SET_FRAME_PARALLEL_DECODING
+
+VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int)
+#define VPX_CTRL_VP9E_SET_AQ_MODE
+
+VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int)
+#define VPX_CTRL_VP9E_SET_FRAME_PERIODIC_BOOST
+
+VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY,  unsigned int)
+#define VPX_CTRL_VP9E_SET_NOISE_SENSITIVITY
+
+VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */
+#define VPX_CTRL_VP9E_SET_TUNE_CONTENT
+
+VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
+#define VPX_CTRL_VP9E_SET_COLOR_SPACE
+
+VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL,  unsigned int)
+#define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL
+
+VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL,  unsigned int)
+#define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL
+
+VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *)
+#define VPX_CTRL_VP9E_GET_ACTIVEMAP
+
+VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_RANGE, int)
+#define VPX_CTRL_VP9E_SET_COLOR_RANGE
+
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *)
+#define VPX_CTRL_VP9E_SET_SVC_REF_FRAME_CONFIG
+
+VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
+#define VPX_CTRL_VP9E_SET_RENDER_SIZE
+
+/*!\endcond */
+/*! @} - end defgroup vp8_encoder */
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP8CX_H_
diff --git a/libs/libvpx/vpx/vp8dx.h b/libs/libvpx/vpx/vp8dx.h
new file mode 100644
index 0000000000..1f02fd5958
--- /dev/null
+++ b/libs/libvpx/vpx/vp8dx.h
@@ -0,0 +1,185 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*!\defgroup vp8_decoder WebM VP8/VP9 Decoder
+ * \ingroup vp8
+ *
+ * @{
+ */
+/*!\file
+ * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder
+ *        interface.
+ */
+#ifndef VPX_VP8DX_H_
+#define VPX_VP8DX_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Include controls common to both the encoder and decoder */
+#include "./vp8.h"
+
+/*!\name Algorithm interface for VP8
+ *
+ * This interface provides the capability to decode VP8 streams.
+ * @{
+ */
+extern vpx_codec_iface_t  vpx_codec_vp8_dx_algo;  /**< VP8 decoder interface object */
+extern vpx_codec_iface_t *vpx_codec_vp8_dx(void); /**< accessor returning a VP8 decoder interface pointer */
+/*!@} - end algorithm interface member group*/
+
+/*!\name Algorithm interface for VP9
+ *
+ * This interface provides the capability to decode VP9 streams.
+ * @{
+ */
+extern vpx_codec_iface_t  vpx_codec_vp9_dx_algo;  /**< VP9 decoder interface object */
+extern vpx_codec_iface_t *vpx_codec_vp9_dx(void); /**< accessor returning a VP9 decoder interface pointer */
+/*!@} - end algorithm interface member group*/
+
+/*!\name Algorithm interface for VP10
+ *
+ * This interface provides the capability to decode VP10 streams.
+ * @{
+ */
+extern vpx_codec_iface_t  vpx_codec_vp10_dx_algo;  /**< VP10 decoder interface object */
+extern vpx_codec_iface_t *vpx_codec_vp10_dx(void); /**< accessor returning a VP10 decoder interface pointer */
+/*!@} - end algorithm interface member group*/
+
+/*!\enum vp8_dec_control_id
+ * \brief VP8 decoder control functions
+ *
+ * This set of enumerators defines the control functions available for the VP8
+ * decoder interface.
+ *
+ * \sa #vpx_codec_control
+ */
+enum vp8_dec_control_id {
+  /** control function to get info on which reference frames were updated
+   *  by the last decode
+   */
+  VP8D_GET_LAST_REF_UPDATES = VP8_DECODER_CTRL_ID_START,
+
+  /** check if the indicated frame is corrupted */
+  VP8D_GET_FRAME_CORRUPTED,
+
+  /** control function to get info on which reference frames were used
+   *  by the last decode
+   */
+  VP8D_GET_LAST_REF_USED,
+
+  /** decryption function to decrypt encoded buffer data immediately
+   * before decoding. Takes a vpx_decrypt_init, which contains
+   * a callback function and opaque context pointer.
+   */
+  VPXD_SET_DECRYPTOR,
+  VP8D_SET_DECRYPTOR = VPXD_SET_DECRYPTOR,  /**< same value as VPXD_SET_DECRYPTOR */
+
+  /** control function to get the dimensions that the current frame is decoded
+   * at. This may be different to the intended display size for the frame as
+   * specified in the wrapper or frame header (see VP9D_GET_DISPLAY_SIZE). */
+  VP9D_GET_FRAME_SIZE,
+
+  /** control function to get the current frame's intended display dimensions
+   * (as specified in the wrapper or frame header). This may be different to
+   * the decoded dimensions of this frame (see VP9D_GET_FRAME_SIZE). */
+  VP9D_GET_DISPLAY_SIZE,
+
+  /** control function to get the bit depth of the stream. */
+  VP9D_GET_BIT_DEPTH,
+
+  /** control function to set the byte alignment of the planes in the reference
+   * buffers. Valid values are power of 2, from 32 to 1024. A value of 0 sets
+   * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly
+   * follows Y plane, and V plane directly follows U plane. Default value is 0.
+   */
+  VP9_SET_BYTE_ALIGNMENT,
+
+  /** control function to invert the decoding order to right-to-left. The
+   * function is used in a test to confirm the decoding independence of tile
+   * columns. The function may be used in applications where this order
+   * of decoding is desired.
+   *
+   * TODO(yaowu): Rework the unit test that uses this control, and in a future
+   *              release, this test-only control shall be removed.
+   */
+  VP9_INVERT_TILE_DECODE_ORDER,
+
+  /** control function to set the skip loop filter flag. Valid values are
+   * integers. The decoder will skip the loop filter when its value is set to
+   * nonzero. If the loop filter is skipped the decoder may accumulate decode
+   * artifacts. The default value is 0.
+   */
+  VP9_SET_SKIP_LOOP_FILTER,
+
+  VP8_DECODER_CTRL_ID_MAX  /**< last enumerator; one past the highest control id above */
+};
+
+/** Decrypt count bytes of data from input -> output, using the decrypt_state
+ *  passed in VPXD_SET_DECRYPTOR.
+ */
+typedef void (*vpx_decrypt_cb)(void *decrypt_state, const unsigned char *input,
+                               unsigned char *output, int count);
+
+/*!\brief Structure to hold decryption state
+ *
+ * Holds the decryption callback and its state; passed to VPXD_SET_DECRYPTOR.
+ */
+typedef struct vpx_decrypt_init {
+    /*! Decrypt callback. */
+    vpx_decrypt_cb decrypt_cb;
+
+    /*! Decryption state (handed to decrypt_cb as its decrypt_state argument). */
+    void *decrypt_state;
+} vpx_decrypt_init;
+
+/*!\brief A deprecated alias for vpx_decrypt_init; prefer vpx_decrypt_init.
+ */
+typedef vpx_decrypt_init vp8_decrypt_init;
+
+
+/*!\cond */
+/*!\brief VP8/VP9 decoder control function parameter types
+ *
+ * Defines the data types that the decoder control functions take. Note that
+ * additional common controls are defined in vp8.h */
+VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES,    int *)
+#define VPX_CTRL_VP8D_GET_LAST_REF_UPDATES
+VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED,     int *)
+#define VPX_CTRL_VP8D_GET_FRAME_CORRUPTED
+VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED,       int *)
+#define VPX_CTRL_VP8D_GET_LAST_REF_USED
+VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR,           vpx_decrypt_init *)
+#define VPX_CTRL_VPXD_SET_DECRYPTOR
+VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR,           vpx_decrypt_init *)
+#define VPX_CTRL_VP8D_SET_DECRYPTOR
+VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE,        int *)
+#define VPX_CTRL_VP9D_GET_DISPLAY_SIZE
+VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH,           unsigned int *)
+#define VPX_CTRL_VP9D_GET_BIT_DEPTH
+VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE,          int *)
+#define VPX_CTRL_VP9D_GET_FRAME_SIZE
+VPX_CTRL_USE_TYPE(VP9_SET_BYTE_ALIGNMENT,       int) /* 0, or a power of 2 in 32..1024 */
+#define VPX_CTRL_VP9_SET_BYTE_ALIGNMENT
+VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
+#define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER
+VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER,     int) /* nonzero skips the loop filter */
+#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER
+
+/*!\endcond */
+/*! @} - end defgroup vp8_decoder */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP8DX_H_
diff --git a/libs/libvpx/vpx/vpx_codec.h b/libs/libvpx/vpx/vpx_codec.h
new file mode 100644
index 0000000000..b6037bb4d7
--- /dev/null
+++ b/libs/libvpx/vpx/vpx_codec.h
@@ -0,0 +1,479 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*!\defgroup codec Common Algorithm Interface
+ * This abstraction allows applications to easily support multiple video
+ * formats with minimal code duplication. This section describes the interface
+ * common to all codecs (both encoders and decoders).
+ * @{
+ */
+
+/*!\file
+ * \brief Describes the codec algorithm interface to applications.
+ *
+ * This file describes the interface between an application and a
+ * video codec algorithm.
+ *
+ * An application instantiates a specific codec instance by using
+ * vpx_codec_init() and a pointer to the algorithm's interface structure:
+ *     <pre>
+ *     my_app.c:
+ *       extern vpx_codec_iface_t my_codec;
+ *       {
+ *           vpx_codec_ctx_t algo;
+ *           res = vpx_codec_init(&algo, &my_codec);
+ *       }
+ *     </pre>
+ *
+ * Once initialized, the instance is managed using other functions from
+ * the vpx_codec_* family.
+ */
+#ifndef VPX_VPX_CODEC_H_
+#define VPX_VPX_CODEC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./vpx_integer.h"
+#include "./vpx_image.h"
+
+  /*!\brief Decorator indicating a function is deprecated */
+#ifndef DEPRECATED
+#if defined(__GNUC__) && __GNUC__
+#define DEPRECATED          __attribute__ ((deprecated))
+#elif defined(_MSC_VER)
+#define DEPRECATED
+#else
+#define DEPRECATED
+#endif
+#endif  /* DEPRECATED */
+
+#ifndef DECLSPEC_DEPRECATED
+#if defined(__GNUC__) && __GNUC__
+#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
+#elif defined(_MSC_VER)
+#define DECLSPEC_DEPRECATED __declspec(deprecated) /**< \copydoc #DEPRECATED */
+#else
+#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
+#endif
+#endif  /* DECLSPEC_DEPRECATED */
+
+  /*!\brief Decorator indicating a function is potentially unused */
+#ifdef UNUSED
+#elif defined(__GNUC__) || defined(__clang__)
+#define UNUSED __attribute__ ((unused))
+#else
+#define UNUSED
+#endif
+
+  /*!\brief Current ABI version number
+   *
+   * \internal
+   * If this file is altered in any way that changes the ABI, this value
+   * must be bumped.  Examples include, but are not limited to, changing
+   * types, removing or reassigning enums, adding/removing/rearranging
+   * fields to structures
+   */
+#define VPX_CODEC_ABI_VERSION (3 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/
+
+  /*!\brief Algorithm return codes */
+  typedef enum {
+    /*!\brief Operation completed without error */
+    VPX_CODEC_OK,
+
+    /*!\brief Unspecified error */
+    VPX_CODEC_ERROR,
+
+    /*!\brief Memory operation failed */
+    VPX_CODEC_MEM_ERROR,
+
+    /*!\brief ABI version mismatch */
+    VPX_CODEC_ABI_MISMATCH,
+
+    /*!\brief Algorithm does not have required capability */
+    VPX_CODEC_INCAPABLE,
+
+    /*!\brief The given bitstream is not supported.
+     *
+     * The bitstream was unable to be parsed at the highest level. The decoder
+     * is unable to proceed. This error \ref SHOULD be treated as fatal to the
+     * stream. */
+    VPX_CODEC_UNSUP_BITSTREAM,
+
+    /*!\brief Encoded bitstream uses an unsupported feature
+     *
+     * The decoder does not implement a feature required by the encoder. This
+     * return code should only be used for features that prevent future
+     * pictures from being properly decoded. This error \ref MAY be treated as
+     * fatal to the stream or \ref MAY be treated as fatal to the current GOP.
+     */
+    VPX_CODEC_UNSUP_FEATURE,
+
+    /*!\brief The coded data for this stream is corrupt or incomplete
+     *
+     * There was a problem decoding the current frame.  This return code
+     * should only be used for failures that prevent future pictures from
+     * being properly decoded. This error \ref MAY be treated as fatal to the
+     * stream or \ref MAY be treated as fatal to the current GOP. If decoding
+     * is continued for the current GOP, artifacts may be present.
+     */
+    VPX_CODEC_CORRUPT_FRAME,
+
+    /*!\brief An application-supplied parameter is not valid.
+     *
+     */
+    VPX_CODEC_INVALID_PARAM,
+
+    /*!\brief An iterator reached the end of list.
+     *
+     */
+    VPX_CODEC_LIST_END
+
+  }
+  vpx_codec_err_t;
+
+
+  /*! \brief Codec capabilities bitfield
+   *
+   *  Each codec advertises the capabilities it supports as part of its
+   *  ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces
+   *  or functionality, and are not required to be supported.
+   *
+   *  The available flags are specified by VPX_CODEC_CAP_* defines.
+   */
+  typedef long vpx_codec_caps_t;
+#define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */
+#define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */
+
+
+  /*! \brief Initialization-time Feature Enabling
+   *
+   *  Certain codec features must be known at initialization time, to allow for
+   *  proper memory allocation.
+   *
+   *  The available flags are specified by VPX_CODEC_USE_* defines.
+   */
+  typedef long vpx_codec_flags_t;
+
+
+  /*!\brief Codec interface structure.
+   *
+   * Contains function pointers and other data private to the codec
+   * implementation. This structure is opaque to the application.
+   */
+  typedef const struct vpx_codec_iface vpx_codec_iface_t;
+
+
+  /*!\brief Codec private data structure.
+   *
+   * Contains data private to the codec implementation. This structure is opaque
+   * to the application.
+   */
+  typedef       struct vpx_codec_priv  vpx_codec_priv_t;
+
+
+  /*!\brief Iterator
+   *
+   * Opaque storage used for iterating over lists.
+   */
+  typedef const void *vpx_codec_iter_t;
+
+
+  /*!\brief Codec context structure
+   *
+   * All codecs \ref MUST support this context structure fully. In general,
+   * this data should be considered private to the codec algorithm, and
+   * not be manipulated or examined by the calling application. Applications
+   * may reference the 'name' member to get a printable description of the
+   * algorithm.
+   */
+  typedef struct vpx_codec_ctx {
+    const char              *name;        /**< Printable interface name */
+    vpx_codec_iface_t       *iface;       /**< Interface pointers */
+    vpx_codec_err_t          err;         /**< Last returned error */
+    const char              *err_detail;  /**< Detailed info, if available */
+    vpx_codec_flags_t        init_flags;  /**< Flags passed at init time */
+    union {
+      /** Decoder Configuration Pointer */
+      const struct vpx_codec_dec_cfg *dec;
+      /** Encoder Configuration Pointer */
+      const struct vpx_codec_enc_cfg *enc;
+      const void                     *raw;  /**< Untyped view of the same pointer */
+    }                        config;      /**< Configuration pointer aliasing union */
+    vpx_codec_priv_t        *priv;        /**< Algorithm private storage */
+  } vpx_codec_ctx_t;
+
+  /*!\brief Bit depth for codec
+   *
+   * This enumeration determines the bit depth of the codec.
+   */
+  typedef enum vpx_bit_depth {
+    VPX_BITS_8  =  8,  /**<  8 bits */
+    VPX_BITS_10 = 10,  /**< 10 bits */
+    VPX_BITS_12 = 12,  /**< 12 bits */
+  } vpx_bit_depth_t;
+
+  /*
+   * Library Version Number Interface
+   *
+   * For example, see the following sample return values:
+   *     vpx_codec_version()           (1<<16 | 2<<8 | 3)
+   *     vpx_codec_version_str()       "v1.2.3-rc1-16-gec6a1ba"
+   *     vpx_codec_version_extra_str() "rc1-16-gec6a1ba"
+   */
+
+  /*!\brief Return the version information (as an integer)
+   *
+   * Returns a packed encoding of the library version number. This will only include
+   * the major.minor.patch component of the version number. Note that this encoded
+   * value should be accessed through the macros provided, as the encoding may change
+   * in the future.
+   *
+   */
+  int vpx_codec_version(void);
+#define VPX_VERSION_MAJOR(v) (((v)>>16)&0xff) /**< extract major from packed version */
+#define VPX_VERSION_MINOR(v) (((v)>>8)&0xff)  /**< extract minor from packed version */
+#define VPX_VERSION_PATCH(v) (((v)>>0)&0xff)  /**< extract patch from packed version */
+
+  /*!\brief Return the version major number */
+#define vpx_codec_version_major() ((vpx_codec_version()>>16)&0xff)
+
+  /*!\brief Return the version minor number */
+#define vpx_codec_version_minor() ((vpx_codec_version()>>8)&0xff)
+
+  /*!\brief Return the version patch number */
+#define vpx_codec_version_patch() ((vpx_codec_version()>>0)&0xff)
+
+
+  /*!\brief Return the version information (as a string)
+   *
+   * Returns a printable string containing the full library version number. This may
+   * contain additional text following the three digit version number, as to indicate
+   * release candidates, prerelease versions, etc.
+   *
+   */
+  const char *vpx_codec_version_str(void);
+
+
+  /*!\brief Return the version information (as a string)
+   *
+   * Returns a printable "extra string". This is the component of the string returned
+   * by vpx_codec_version_str() following the three digit version number.
+   *
+   */
+  const char *vpx_codec_version_extra_str(void);
+
+
+  /*!\brief Return the build configuration
+   *
+   * Returns a printable string containing an encoded version of the build
+   * configuration. This may be useful to vpx support.
+   *
+   */
+  const char *vpx_codec_build_config(void);
+
+
+  /*!\brief Return the name for a given interface
+   *
+   * Returns a human readable string for name of the given codec interface.
+   *
+   * \param[in]    iface     Interface pointer
+   *
+   */
+  const char *vpx_codec_iface_name(vpx_codec_iface_t *iface);
+
+
+  /*!\brief Convert error number to printable string
+   *
+   * Returns a human readable string for the last error returned by the
+   * algorithm. The returned error will be one line and will not contain
+   * any newline characters.
+   *
+   *
+   * \param[in]    err     Error number.
+   *
+   */
+  const char *vpx_codec_err_to_string(vpx_codec_err_t  err);
+
+
+  /*!\brief Retrieve error synopsis for codec context
+   *
+   * Returns a human readable string for the last error returned by the
+   * algorithm. The returned error will be one line and will not contain
+   * any newline characters.
+   *
+   *
+   * \param[in]    ctx     Pointer to this instance's context.
+   *
+   */
+  const char *vpx_codec_error(vpx_codec_ctx_t  *ctx);
+
+
+  /*!\brief Retrieve detailed error information for codec context
+   *
+   * Returns a human readable string providing detailed information about
+   * the last error.
+   *
+   * \param[in]    ctx     Pointer to this instance's context.
+   *
+   * \retval NULL
+   *     No detailed information is available.
+   */
+  const char *vpx_codec_error_detail(vpx_codec_ctx_t  *ctx);
+
+
+  /* REQUIRED FUNCTIONS
+   *
+   * The following functions are required to be implemented for all codecs.
+   * They represent the base case functionality expected of all codecs.
+   */
+
+  /*!\brief Destroy a codec instance
+   *
+   * Destroys a codec context, freeing any associated memory buffers.
+   *
+   * \param[in] ctx   Pointer to this instance's context
+   *
+   * \retval #VPX_CODEC_OK
+   *     The codec instance was destroyed.
+   * \retval #VPX_CODEC_MEM_ERROR
+   *     Memory allocation failed.
+   */
+  vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx);
+
+
+  /*!\brief Get the capabilities of an algorithm.
+   *
+   * Retrieves the capabilities bitfield from the algorithm's interface.
+   *
+   * \param[in] iface   Pointer to the algorithm interface
+   *
+   */
+  vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface);
+
+
+  /*!\brief Control algorithm
+   *
+   * This function is used to exchange algorithm specific data with the codec
+   * instance. This can be used to implement features specific to a particular
+   * algorithm.
+   *
+   * This wrapper function dispatches the request to the helper function
+   * associated with the given ctrl_id. It tries to call this function
+   * transparently, but will return #VPX_CODEC_ERROR if the request could not
+   * be dispatched.
+   *
+   * Note that this function should not be used directly. Call the
+   * #vpx_codec_control wrapper macro instead.
+   *
+   * \param[in]     ctx              Pointer to this instance's context
+   * \param[in]     ctrl_id          Algorithm specific control identifier
+   *
+   * \retval #VPX_CODEC_OK
+   *     The control request was processed.
+   * \retval #VPX_CODEC_ERROR
+   *     The control request was not processed.
+   * \retval #VPX_CODEC_INVALID_PARAM
+   *     The data was not valid.
+   */
+  vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t  *ctx,
+                                     int               ctrl_id,
+                                     ...);
+#if defined(VPX_DISABLE_CTRL_TYPECHECKS) && VPX_DISABLE_CTRL_TYPECHECKS
+#    define vpx_codec_control(ctx,id,data) vpx_codec_control_(ctx,id,data)
+#    define VPX_CTRL_USE_TYPE(id, typ)
+#    define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ)
+#    define VPX_CTRL_VOID(id) /* same arity as the type-checked variant */
+
+#else
+  /*!\brief vpx_codec_control wrapper macro
+   *
+   * This macro allows for type safe conversions across the variadic parameter
+   * to vpx_codec_control_().
+   *
+   * \internal
+   * It works by dispatching the call to the control function through a wrapper
+   * function named with the id parameter.
+   */
+#    define vpx_codec_control(ctx,id,data) vpx_codec_control_##id(ctx,id,data)\
+  /**<\hideinitializer*/
+
+
+  /*!\brief vpx_codec_control type definition macro
+   *
+   * This macro allows for type safe conversions across the variadic parameter
+   * to vpx_codec_control_(). It defines the type of the argument for a given
+   * control identifier.
+   *
+   * \internal
+   * It defines a static function with
+   * the correctly typed arguments as a wrapper to the type-unsafe internal
+   * function.
+   */
+#    define VPX_CTRL_USE_TYPE(id, typ) \
+  static vpx_codec_err_t \
+  vpx_codec_control_##id(vpx_codec_ctx_t*, int, typ) UNUSED;\
+  \
+  static vpx_codec_err_t \
+  vpx_codec_control_##id(vpx_codec_ctx_t  *ctx, int ctrl_id, typ data) {\
+    return vpx_codec_control_(ctx, ctrl_id, data);\
+  } /**<\hideinitializer*/
+
+
+  /*!\brief vpx_codec_control deprecated type definition macro
+   *
+   * Like #VPX_CTRL_USE_TYPE, but indicates that the specified control is
+   * deprecated and should not be used. Consult the documentation for your
+   * codec for more information.
+   *
+   * \internal
+   * It defines a static function with the correctly typed arguments as a
+   * wrapper to the type-unsafe internal function.
+   */
+#    define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \
+  DECLSPEC_DEPRECATED static vpx_codec_err_t \
+  vpx_codec_control_##id(vpx_codec_ctx_t*, int, typ) DEPRECATED UNUSED;\
+  \
+  DECLSPEC_DEPRECATED static vpx_codec_err_t \
+  vpx_codec_control_##id(vpx_codec_ctx_t  *ctx, int ctrl_id, typ data) {\
+    return vpx_codec_control_(ctx, ctrl_id, data);\
+  } /**<\hideinitializer*/
+
+
+  /*!\brief vpx_codec_control void type definition macro
+   *
+   * This macro allows for type safe conversions across the variadic parameter
+   * to vpx_codec_control_(). It indicates that a given control identifier takes
+   * no argument.
+   *
+   * \internal
+   * It defines a static function without a data argument as a wrapper to the
+   * type-unsafe internal function.
+   */
+#    define VPX_CTRL_VOID(id) \
+  static vpx_codec_err_t \
+  vpx_codec_control_##id(vpx_codec_ctx_t*, int) UNUSED;\
+  \
+  static vpx_codec_err_t \
+  vpx_codec_control_##id(vpx_codec_ctx_t  *ctx, int ctrl_id) {\
+    return vpx_codec_control_(ctx, ctrl_id);\
+  } /**<\hideinitializer*/
+
+
+#endif
+
+  /*!@} - end defgroup codec*/
+#ifdef __cplusplus
+}
+#endif
+#endif  // VPX_VPX_CODEC_H_
+
diff --git a/libs/libvpx/vpx/vpx_codec.mk b/libs/libvpx/vpx/vpx_codec.mk
new file mode 100644
index 0000000000..ccdef040c3
--- /dev/null
+++ b/libs/libvpx/vpx/vpx_codec.mk
@@ -0,0 +1,47 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+# vpx API layer: export list, API_SRCS-* and API_DOC_SRCS-* (presumably doc inputs).
+API_EXPORTS += exports
+# Encoder entries: each list name is suffixed with the value of the CONFIG_* switch.
+API_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h
+API_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h
+API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h
+API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+  API_SRCS-$(CONFIG_SPATIAL_SVC) += src/svc_encodeframe.c
+  API_SRCS-$(CONFIG_SPATIAL_SVC) += svc_context.h
+endif
+# Decoder entries, keyed on CONFIG_VP8_DECODER in the same way.
+API_SRCS-$(CONFIG_VP8_DECODER) += vp8.h
+API_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h
+API_DOC_SRCS-$(CONFIG_VP8_DECODER) += vp8.h
+API_DOC_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h
+# Headers appended to API_DOC_SRCS-yes unconditionally.
+API_DOC_SRCS-yes += vpx_codec.h
+API_DOC_SRCS-yes += vpx_decoder.h
+API_DOC_SRCS-yes += vpx_encoder.h
+API_DOC_SRCS-yes += vpx_frame_buffer.h
+API_DOC_SRCS-yes += vpx_image.h
+# Sources and headers appended to API_SRCS-yes unconditionally.
+API_SRCS-yes += src/vpx_decoder.c
+API_SRCS-yes += vpx_decoder.h
+API_SRCS-yes += src/vpx_encoder.c
+API_SRCS-yes += vpx_encoder.h
+API_SRCS-yes += internal/vpx_codec_internal.h
+API_SRCS-yes += internal/vpx_psnr.h
+API_SRCS-yes += src/vpx_codec.c
+API_SRCS-yes += src/vpx_image.c
+API_SRCS-yes += src/vpx_psnr.c
+API_SRCS-yes += vpx_codec.h
+API_SRCS-yes += vpx_codec.mk
+API_SRCS-yes += vpx_frame_buffer.h
+API_SRCS-yes += vpx_image.h
+API_SRCS-yes += vpx_integer.h
diff --git a/libs/libvpx/vpx/vpx_decoder.h b/libs/libvpx/vpx/vpx_decoder.h
new file mode 100644
index 0000000000..62fd919756
--- /dev/null
+++ b/libs/libvpx/vpx/vpx_decoder.h
@@ -0,0 +1,378 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_VPX_DECODER_H_
+#define VPX_VPX_DECODER_H_
+
+/*!\defgroup decoder Decoder Algorithm Interface
+ * \ingroup codec
+ * This abstraction allows applications using this decoder to easily support
+ * multiple video formats with minimal code duplication. This section describes
+ * the interface common to all decoders.
+ * @{
+ */
+
+/*!\file
+ * \brief Describes the decoder algorithm interface to applications.
+ *
+ * This file describes the interface between an application and a
+ * video decoder algorithm.
+ *
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./vpx_codec.h"
+#include "./vpx_frame_buffer.h"
+
+  /*!\brief Current ABI version number
+   *
+   * \internal
+   * If this file is altered in any way that changes the ABI, this value
+   * must be bumped.  Examples include, but are not limited to, changing
+   * types, removing or reassigning enums, adding/removing/rearranging
+   * fields to structures
+   */
+#define VPX_DECODER_ABI_VERSION (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+
+  /*! \brief Decoder capabilities bitfield
+   *
+   *  Each decoder advertises the capabilities it supports as part of its
+   *  ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces
+   *  or functionality, and are not required to be supported by a decoder.
+   *
+   *  The available flags are specified by VPX_CODEC_CAP_* defines.
+   */
+#define VPX_CODEC_CAP_PUT_SLICE  0x10000 /**< Will issue put_slice callbacks */
+#define VPX_CODEC_CAP_PUT_FRAME  0x20000 /**< Will issue put_frame callbacks */
+#define VPX_CODEC_CAP_POSTPROC   0x40000 /**< Can postprocess decoded frame */
+#define VPX_CODEC_CAP_ERROR_CONCEALMENT   0x80000 /**< Can conceal errors due to
+  packet loss */
+#define VPX_CODEC_CAP_INPUT_FRAGMENTS   0x100000 /**< Can receive encoded frames
+  one fragment at a time */
+
+#define VPX_CODEC_CAP_FRAME_THREADING   0x200000 /**< Can support frame-based
+                                                      multi-threading */
+#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000 /**< Can support external
+                                                          frame buffers */
+
+  /*! \brief Initialization-time Feature Enabling
+   *
+   *  Certain codec features must be known at initialization time, to allow for
+   *  proper memory allocation.
+   *
+   *  The available flags are specified by VPX_CODEC_USE_* defines.
+   */
+#define VPX_CODEC_USE_POSTPROC   0x10000 /**< Postprocess decoded frame */
+#define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000 /**< Conceal errors in decoded
+  frames */
+#define VPX_CODEC_USE_INPUT_FRAGMENTS   0x40000 /**< The input frame should be
+  passed to the decoder one
+  fragment at a time */
+#define VPX_CODEC_USE_FRAME_THREADING   0x80000 /**< Enable frame-based
+                                                     multi-threading */
+
+  /*!\brief Stream properties
+   *
+   * This structure is used to query or set properties of the decoded
+   * stream. Algorithms may extend this structure with data specific
+   * to their bitstream by setting the sz member appropriately.
+   */
+  typedef struct vpx_codec_stream_info {
+    unsigned int sz;     /**< Size of this structure */
+    unsigned int w;      /**< Width (or 0 for unknown/default) */
+    unsigned int h;      /**< Height (or 0 for unknown/default) */
+    unsigned int is_kf;  /**< Current frame is a keyframe */
+  } vpx_codec_stream_info_t;
+
+  /* REQUIRED FUNCTIONS
+   *
+   * The following functions are required to be implemented for all decoders.
+   * They represent the base case functionality expected of all decoders.
+   */
+
+
+  /*!\brief Initialization Configurations
+   *
+   * This structure is used to pass init time configuration options to the
+   * decoder.
+   */
+  typedef struct vpx_codec_dec_cfg {
+    unsigned int threads; /**< Maximum number of threads to use, default 1 */
+    unsigned int w;      /**< Width */
+    unsigned int h;      /**< Height */
+  } vpx_codec_dec_cfg_t; /**< alias for struct vpx_codec_dec_cfg */
+
+
+  /*!\brief Initialize a decoder instance
+   *
+   * Initializes a decoder context using the given interface. Applications
+   * should call the vpx_codec_dec_init convenience macro instead of this
+   * function directly, to ensure that the ABI version number parameter
+   * is properly initialized.
+   *
+   * If the library was configured with --disable-multithread, this call
+   * is not thread safe and should be guarded with a lock if being used
+   * in a multithreaded context.
+   *
+   * \param[in]    ctx     Pointer to this instance's context.
+   * \param[in]    iface   Pointer to the algorithm interface to use.
+   * \param[in]    cfg     Configuration to use, if known. May be NULL.
+   * \param[in]    flags   Bitfield of VPX_CODEC_USE_* flags
+   * \param[in]    ver     ABI version number. Must be set to
+   *                       VPX_DECODER_ABI_VERSION
+   * \retval #VPX_CODEC_OK
+   *     The decoder algorithm initialized.
+   * \retval #VPX_CODEC_MEM_ERROR
+   *     Memory allocation failed.
+   */
+  vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t      *ctx,
+                                         vpx_codec_iface_t    *iface,
+                                         const vpx_codec_dec_cfg_t *cfg,
+                                         vpx_codec_flags_t     flags,
+                                         int                   ver);
+
+  /*!\brief Convenience macro for vpx_codec_dec_init_ver()
+   *
+   * Ensures the ABI version parameter is properly set.
+   */
+#define vpx_codec_dec_init(ctx, iface, cfg, flags) \
+  vpx_codec_dec_init_ver(ctx, iface, cfg, flags, VPX_DECODER_ABI_VERSION)
+
+
+  /*!\brief Parse stream info from a buffer
+   *
+   * Performs high level parsing of the bitstream. Construction of a decoder
+   * context is not necessary. Can be used to determine if the bitstream is
+   * of the proper format, and to extract information from the stream.
+   *
+   * \param[in]      iface   Pointer to the algorithm interface
+   * \param[in]      data    Pointer to a block of data to parse
+   * \param[in]      data_sz Size of the data buffer
+   * \param[in,out]  si      Pointer to stream info to update. The size member
+   *                         \ref MUST be properly initialized, but \ref MAY be
+   *                         clobbered by the algorithm. This parameter \ref MAY
+   *                         be NULL.
+   *
+   * \retval #VPX_CODEC_OK
+   *     Bitstream is parsable and stream information updated
+   */
+  vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t       *iface,
+                                             const uint8_t           *data,
+                                             unsigned int             data_sz,
+                                             vpx_codec_stream_info_t *si);
+
+
+  /*!\brief Return information about the current stream.
+   *
+   * Returns information about the stream that has been parsed during decoding.
+   *
+   * \param[in]      ctx     Pointer to this instance's context
+   * \param[in,out]  si      Pointer to stream info to update. The size member
+   *                         \ref MUST be properly initialized, but \ref MAY be
+   *                         clobbered by the algorithm. This parameter \ref MAY
+   *                         be NULL.
+   *
+   * \retval #VPX_CODEC_OK
+   *     Bitstream is parsable and stream information updated
+   */
+  vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t         *ctx,
+                                            vpx_codec_stream_info_t *si);
+
+
+  /*!\brief Decode data
+   *
+   * Processes a buffer of coded data. If the processing results in a new
+   * decoded frame becoming available, PUT_SLICE and PUT_FRAME events may be
+   * generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode
+   * time stamp) order. Frames produced will always be in PTS (presentation
+   * time stamp) order.
+   * If the decoder is configured with VPX_CODEC_USE_INPUT_FRAGMENTS enabled,
+   * data and data_sz can contain a fragment of the encoded frame. Fragment
+   * \#n must contain at least partition \#n, but can also contain subsequent
+   * partitions (\#n+1 - \#n+i), and if so, fragments \#n+1, .., \#n+i must
+   * be empty. When no more data is available, this function should be called
+   * with NULL as data and 0 as data_sz. The memory passed to this function
+   * must be available until the frame has been decoded.
+   *
+   * \param[in] ctx          Pointer to this instance's context
+   * \param[in] data         Pointer to this block of new coded data. If
+   *                         NULL, a VPX_CODEC_CB_PUT_FRAME event is posted
+   *                         for the previously decoded frame.
+   * \param[in] data_sz      Size of the coded data, in bytes.
+   * \param[in] user_priv    Application specific data to associate with
+   *                         this frame.
+   * \param[in] deadline     Soft deadline the decoder should attempt to meet,
+   *                         in us. Set to zero for unlimited.
+   *
+   * \return Returns #VPX_CODEC_OK if the coded data was processed completely
+   *         and future pictures can be decoded without error. Otherwise,
+   *         see the descriptions of the other error codes in ::vpx_codec_err_t
+   *         for recoverability capabilities.
+   */
+  vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t    *ctx,
+                                   const uint8_t        *data,
+                                   unsigned int            data_sz,
+                                   void               *user_priv,
+                                   long                deadline);
+
+
+  /*!\brief Decoded frames iterator
+   *
+   * Iterates over a list of the frames available for display. The iterator
+   * storage should be initialized to NULL to start the iteration. Iteration is
+   * complete when this function returns NULL.
+   *
+   * The list of available frames becomes valid upon completion of the
+   * vpx_codec_decode call, and remains valid until the next call to vpx_codec_decode.
+   *
+   * \param[in]     ctx      Pointer to this instance's context
+   * \param[in,out] iter     Iterator storage, initialized to NULL
+   *
+   * \return Returns a pointer to an image, if one is ready for display. Frames
+   *         produced will always be in PTS (presentation time stamp) order.
+   */
+  vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t  *ctx,
+                                   vpx_codec_iter_t *iter);
+
+
+  /*!\defgroup cap_put_frame Frame-Based Decoding Functions
+   *
+   * The following functions are required to be implemented for all decoders
+   * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling these functions
+   * for codecs that don't advertise this capability will result in an error
+   * code being returned, usually VPX_CODEC_ERROR
+   * @{
+   */
+
+  /*!\brief put frame callback prototype
+   *
+   * This callback is invoked by the decoder to notify the application of
+   * the availability of decoded image data.
+   */
+  typedef void (*vpx_codec_put_frame_cb_fn_t)(void        *user_priv,
+                                              const vpx_image_t *img);
+
+
+  /*!\brief Register for notification of frame completion.
+   *
+   * Registers a given function to be called when a decoded frame is
+   * available.
+   *
+   * \param[in] ctx          Pointer to this instance's context
+   * \param[in] cb           Pointer to the callback function
+   * \param[in] user_priv    User's private data
+   *
+   * \retval #VPX_CODEC_OK
+   *     Callback successfully registered.
+   * \retval #VPX_CODEC_ERROR
+   *     Decoder context not initialized, or algorithm not capable of
+   *     posting frame completion.
+   */
+  vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t             *ctx,
+                                                  vpx_codec_put_frame_cb_fn_t  cb,
+                                                  void                        *user_priv);
+
+
+  /*!@} - end defgroup cap_put_frame */
+
+  /*!\defgroup cap_put_slice Slice-Based Decoding Functions
+   *
+   * The following functions are required to be implemented for all decoders
+   * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling these functions
+   * for codecs that don't advertise this capability will result in an error
+   * code being returned, usually VPX_CODEC_ERROR
+   * @{
+   */
+
+  /*!\brief put slice callback prototype
+   *
+   * This callback is invoked by the decoder to notify the application of
+   * the availability of partially decoded image data.
+   */
+  typedef void (*vpx_codec_put_slice_cb_fn_t)(void         *user_priv,
+                                              const vpx_image_t      *img,
+                                              const vpx_image_rect_t *valid,
+                                              const vpx_image_rect_t *update);
+
+
+  /*!\brief Register for notification of slice completion.
+   *
+   * Registers a given function to be called when a decoded slice is
+   * available.
+   *
+   * \param[in] ctx          Pointer to this instance's context
+   * \param[in] cb           Pointer to the callback function
+   * \param[in] user_priv    User's private data
+   *
+   * \retval #VPX_CODEC_OK
+   *     Callback successfully registered.
+   * \retval #VPX_CODEC_ERROR
+   *     Decoder context not initialized, or algorithm not capable of
+   *     posting slice completion.
+   */
+  vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t             *ctx,
+                                                  vpx_codec_put_slice_cb_fn_t  cb,
+                                                  void                        *user_priv);
+
+
+  /*!@} - end defgroup cap_put_slice*/
+
+  /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions
+   *
+   * The following section is required to be implemented for all decoders
+   * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability.
+   * Calling this function for codecs that don't advertise this capability
+   * will result in an error code being returned, usually VPX_CODEC_ERROR.
+   *
+   * \note
+   * Currently this only works with VP9.
+   * @{
+   */
+
+  /*!\brief Pass in external frame buffers for the decoder to use.
+   *
+   * Registers functions to be called when libvpx needs a frame buffer
+   * to decode the current frame and a function to be called when libvpx does
+   * not internally reference the frame buffer. This set function must
+   * be called before the first call to decode or libvpx will assume the
+   * default behavior of allocating frame buffers internally.
+   *
+   * \param[in] ctx          Pointer to this instance's context
+   * \param[in] cb_get       Pointer to the get callback function
+   * \param[in] cb_release   Pointer to the release callback function
+   * \param[in] cb_priv      Callback's private data
+   *
+   * \retval #VPX_CODEC_OK
+   *     External frame buffers will be used by libvpx.
+   * \retval #VPX_CODEC_INVALID_PARAM
+   *     One or more of the callbacks were NULL.
+   * \retval #VPX_CODEC_ERROR
+   *     Decoder context not initialized, or algorithm not capable of
+   *     using external frame buffers.
+   *
+   * \note
+   * When decoding VP9, the application may be required to pass in at least
+   * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame
+   * buffers.
+   */
+  vpx_codec_err_t vpx_codec_set_frame_buffer_functions(
+      vpx_codec_ctx_t *ctx,
+      vpx_get_frame_buffer_cb_fn_t cb_get,
+      vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
+
+  /*!@} - end defgroup cap_external_frame_buffer */
+
+  /*!@} - end defgroup decoder*/
+#ifdef __cplusplus
+}
+#endif
+#endif  // VPX_VPX_DECODER_H_
+
diff --git a/libs/libvpx/vpx/vpx_encoder.h b/libs/libvpx/vpx/vpx_encoder.h
new file mode 100644
index 0000000000..955e873519
--- /dev/null
+++ b/libs/libvpx/vpx/vpx_encoder.h
@@ -0,0 +1,1043 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_VPX_ENCODER_H_
+#define VPX_VPX_ENCODER_H_
+
+/*!\defgroup encoder Encoder Algorithm Interface
+ * \ingroup codec
+ * This abstraction allows applications using this encoder to easily support
+ * multiple video formats with minimal code duplication. This section describes
+ * the interface common to all encoders.
+ * @{
+ */
+
+/*!\file
+ * \brief Describes the encoder algorithm interface to applications.
+ *
+ * This file describes the interface between an application and a
+ * video encoder algorithm.
+ *
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./vpx_codec.h"
+
+  /*! Temporal Scalability: Maximum length of the sequence defining frame
+   * layer membership (array bound of vpx_codec_enc_cfg::ts_layer_id)
+   */
+#define VPX_TS_MAX_PERIODICITY 16
+
+  /*! Temporal Scalability: Maximum number of coding layers (array bound of vpx_codec_enc_cfg::ts_target_bitrate and ts_rate_decimator) */
+#define VPX_TS_MAX_LAYERS       5
+
+  /*!\deprecated Use #VPX_TS_MAX_PERIODICITY instead. */
+#define MAX_PERIODICITY VPX_TS_MAX_PERIODICITY
+
+/*! Temporal+Spatial Scalability: Maximum number of coding layers (array bound of vpx_codec_enc_cfg::layer_target_bitrate) */
+#define VPX_MAX_LAYERS  12  // 3 temporal + 4 spatial layers are allowed.
+
+/*!\deprecated Use #VPX_MAX_LAYERS instead. */
+#define MAX_LAYERS    VPX_MAX_LAYERS  // 3 temporal + 4 spatial layers allowed.
+
+/*! Spatial Scalability: Maximum number of coding layers (array bound of vpx_codec_enc_cfg::ss_target_bitrate and ss_enable_auto_alt_ref) */
+#define VPX_SS_MAX_LAYERS       5
+
+/*! Spatial Scalability: Default number of coding layers */
+#define VPX_SS_DEFAULT_LAYERS       1
+
+  /*!\brief Current ABI version number
+   *
+   * \internal
+   * If this file is altered in any way that changes the ABI, this value
+   * must be bumped.  Examples include, but are not limited to, changing
+   * types, removing or reassigning enums, and adding, removing or
+   * rearranging fields in structures.
+   */
+#define VPX_ENCODER_ABI_VERSION (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+
+
+  /*! \brief Encoder capabilities bitfield
+   *
+   *  Each encoder advertises the capabilities it supports as part of its
+   *  ::vpx_codec_iface_t interface structure. Capabilities are extra
+   *  interfaces or functionality, and are not required to be supported
+   *  by an encoder.
+   *
+   *  The available flags are specified by VPX_CODEC_CAP_* defines.
+   */
+#define VPX_CODEC_CAP_PSNR  0x10000 /**< Can issue PSNR packets */
+
+  /*! Can output one partition at a time. Each partition is returned in its
+   *  own VPX_CODEC_CX_FRAME_PKT, with the VPX_FRAME_IS_FRAGMENT flag set for
+   *  every partition but the last. In this mode all frames are always
+   *  returned partition by partition.
+   */
+#define VPX_CODEC_CAP_OUTPUT_PARTITION  0x20000
+
+/*! Can support input images with a bit depth greater than 8.
+ */
+#define VPX_CODEC_CAP_HIGHBITDEPTH  0x40000
+
+  /*! \brief Initialization-time Feature Enabling
+   *
+   *  Certain codec features must be known at initialization time, to allow
+   *  for proper memory allocation.
+   *
+   *  The available flags are specified by VPX_CODEC_USE_* defines.
+   */
+#define VPX_CODEC_USE_PSNR  0x10000 /**< Calculate PSNR on each frame */
+#define VPX_CODEC_USE_OUTPUT_PARTITION  0x20000 /**< Make the encoder output one
+  partition at a time. */
+#define VPX_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bit depth */
+
+
+  /*!\brief Generic fixed size buffer structure
+   *
+   * This structure holds a pointer/length pair referencing a fixed size buffer.
+   */
+  typedef struct vpx_fixed_buf {
+    void          *buf; /**< Pointer to the data */
+    size_t         sz;  /**< Length of the buffer, in chars */
+  } vpx_fixed_buf_t; /**< alias for struct vpx_fixed_buf */
+
+
+  /*!\brief Time Stamp Type
+   *
+   * An integer which, when multiplied by the stream's time base (see
+   * vpx_codec_enc_cfg::g_timebase), provides the absolute time of a sample.
+   */
+  typedef int64_t vpx_codec_pts_t;
+
+
+  /*!\brief Compressed Frame Flags
+   *
+   * This type represents a bitfield containing information about a compressed
+   * frame that may be useful to an application. The most significant 16 bits
+   * can be used by an algorithm to provide additional detail, for example to
+   * support frame types that are codec specific (such as MPEG-1 D-frames).
+   */
+  typedef uint32_t vpx_codec_frame_flags_t;
+#define VPX_FRAME_IS_KEY       0x1 /**< frame is the start of a GOP */
+#define VPX_FRAME_IS_DROPPABLE 0x2 /**< frame can be dropped without affecting
+  the stream (no future frame depends on
+  this one) */
+#define VPX_FRAME_IS_INVISIBLE 0x4 /**< frame should be decoded but will not
+  be shown */
+#define VPX_FRAME_IS_FRAGMENT  0x8 /**< this is a fragment of the encoded
+  frame */
+
+  /*!\brief Error Resilient flags
+   *
+   * These flags define which error resilient features to enable in the
+   * encoder. The flags are specified through the
+   * vpx_codec_enc_cfg::g_error_resilient field.
+   */
+  typedef uint32_t vpx_codec_er_flags_t;
+#define VPX_ERROR_RESILIENT_DEFAULT     0x1 /**< Improve resiliency against
+  losses of whole frames */
+#define VPX_ERROR_RESILIENT_PARTITIONS  0x2 /**< The frame partitions are
+  independently decodable by the
+  bool decoder, meaning that
+  partitions can be decoded even
+  though earlier partitions have
+  been lost. Note that intra
+  prediction is still done over
+  the partition boundary. */
+
+  /*!\brief Encoder output packet variants
+   *
+   * This enumeration lists the different kinds of data packets that can be
+   * returned by calls to vpx_codec_get_cx_data(). Algorithms \ref MAY
+   * extend this list to provide additional functionality.
+   */
+  enum vpx_codec_cx_pkt_kind {
+    VPX_CODEC_CX_FRAME_PKT,    /**< Compressed video frame */
+    VPX_CODEC_STATS_PKT,       /**< Two-pass statistics for this frame */
+    VPX_CODEC_FPMB_STATS_PKT,  /**< first pass mb statistics for this frame */
+    VPX_CODEC_PSNR_PKT,        /**< PSNR statistics for this frame */
+    // Spatial SVC is still experimental and may be removed before the next ABI
+    // bump; excluded unless VPX_ENCODER_ABI_VERSION exceeds the value below.
+#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
+    VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/
+    VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/
+#endif
+    VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions  */
+  };
+
+
+  /*!\brief Encoder output packet
+   *
+   * This structure contains the different kinds of output data the encoder
+   * may produce while compressing a frame.
+   */
+  typedef struct vpx_codec_cx_pkt {
+    enum vpx_codec_cx_pkt_kind  kind; /**< packet variant */
+    union {
+      struct {
+        void                    *buf;      /**< compressed data buffer */
+        size_t                   sz;       /**< length of compressed data */
+        vpx_codec_pts_t          pts;      /**< time stamp to show frame
+                                                    (in timebase units) */
+        unsigned long            duration; /**< duration to show frame
+                                                    (in timebase units) */
+        vpx_codec_frame_flags_t  flags;    /**< flags for this frame */
+        int                      partition_id; /**< the partition id
+                                              defines the decoding order
+                                              of the partitions. Only
+                                              applicable when "output partition"
+                                              mode is enabled. First partition
+                                              has id 0.*/
+
+      } frame;  /**< data for compressed frame packet */
+      vpx_fixed_buf_t twopass_stats;  /**< data for two-pass packet */
+      vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
+      struct vpx_psnr_pkt {
+        unsigned int samples[4];  /**< Number of samples, total/y/u/v */
+        uint64_t     sse[4];      /**< sum squared error, total/y/u/v */
+        double       psnr[4];     /**< PSNR, total/y/u/v */
+      } psnr;                       /**< data for PSNR packet */
+      vpx_fixed_buf_t raw;     /**< data for arbitrary packets */
+      // Spatial SVC is still experimental and may be removed before the next
+      // ABI bump; excluded unless VPX_ENCODER_ABI_VERSION exceeds the value below.
+#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
+      size_t layer_sizes[VPX_SS_MAX_LAYERS]; /**< sizes for each layer */
+      struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS]; /**< PSNR for each layer */
+#endif
+
+      /* This packet size is fixed to allow codecs to extend this
+       * interface without having to manage storage for raw packets,
+       * i.e., if it's smaller than 128 bytes, you can store in the
+       * packet list directly.
+       */
+      char pad[128 - sizeof(enum vpx_codec_cx_pkt_kind)]; /**< fixed sz */
+    } data; /**< packet data */
+  } vpx_codec_cx_pkt_t; /**< alias for struct vpx_codec_cx_pkt */
+
+
+  /*!\brief Encoder return output buffer callback
+   *
+   * This callback function, when registered, is invoked with an output packet
+   * (pkt) and a user data pointer (user_data) when each spatial layer is encoded.
+   */
+  // Putting the definitions here for now (agrange: find out whether there
+  // is a better place for this).
+  typedef void (* vpx_codec_enc_output_cx_pkt_cb_fn_t)(vpx_codec_cx_pkt_t *pkt,
+                                                       void *user_data);
+
+  /*!\brief Callback function pointer / user data pair storage */
+  typedef struct vpx_codec_enc_output_cx_cb_pair {
+    vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; /**< Callback function */
+    void                            *user_priv; /**< Pointer to private data */
+  } vpx_codec_priv_output_cx_pkt_cb_pair_t; /**< alias for struct vpx_codec_enc_output_cx_cb_pair */
+
+  /*!\brief Rational Number
+   *
+   * This structure holds a fractional value as a numerator/denominator pair.
+   */
+  typedef struct vpx_rational {
+    int num; /**< fraction numerator */
+    int den; /**< fraction denominator */
+  } vpx_rational_t; /**< alias for struct vpx_rational */
+
+
+  /*!\brief Multi-pass Encoding Pass (type of vpx_codec_enc_cfg::g_pass) */
+  enum vpx_enc_pass {
+    VPX_RC_ONE_PASS,   /**< Single pass mode */
+    VPX_RC_FIRST_PASS, /**< First pass of multi-pass mode */
+    VPX_RC_LAST_PASS   /**< Final pass of multi-pass mode */
+  };
+
+
+  /*!\brief Rate control mode (type of vpx_codec_enc_cfg::rc_end_usage) */
+  enum vpx_rc_mode {
+    VPX_VBR,  /**< Variable Bit Rate (VBR) mode */
+    VPX_CBR,  /**< Constant Bit Rate (CBR) mode */
+    VPX_CQ,   /**< Constrained Quality (CQ) mode */
+    VPX_Q,    /**< Constant Quality (Q) mode */
+  };
+
+
+  /*!\brief Keyframe placement mode.
+   *
+   * This enumeration determines whether keyframes are placed automatically by
+   * the encoder or whether this behavior is disabled. Older releases of this
+   * SDK were implemented such that VPX_KF_FIXED meant keyframes were disabled.
+   * This name is confusing for this behavior, so the new symbols to be used
+   * are VPX_KF_AUTO and VPX_KF_DISABLED.
+   */
+  enum vpx_kf_mode {
+    VPX_KF_FIXED, /**< deprecated, implies VPX_KF_DISABLED */
+    VPX_KF_AUTO,  /**< Encoder determines optimal placement automatically */
+    VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes (same value as VPX_KF_FIXED). */
+  };
+
+
+  /*!\brief Encoded Frame Flags
+   *
+   * This type represents a bitfield to be passed to vpx_codec_encode(), defining
+   * per-frame boolean values. By convention, bits common to all codecs will be
+   * named VPX_EFLAG_*, and bits specific to an algorithm will be named
+   * /algo/_eflag_*. The lower order 16 bits are reserved for common use.
+   */
+  typedef long vpx_enc_frame_flags_t;
+#define VPX_EFLAG_FORCE_KF (1<<0)  /**< Force this frame to be a keyframe */
+
+
+  /*!\brief Encoder configuration structure
+   *
+   * This structure contains the encoder settings that have common representations
+   * across all codecs. This doesn't imply that all codecs support all features,
+   * however.
+   */
+  typedef struct vpx_codec_enc_cfg {
+    /*
+     * generic settings (g)
+     */
+
+    /*!\brief Algorithm specific "usage" value
+     *
+     * Algorithms may define multiple values for usage, which may convey the
+     * intent of how the application intends to use the stream. If this value
+     * is non-zero, consult the documentation for the codec to determine its
+     * meaning.
+     */
+    unsigned int           g_usage;
+
+
+    /*!\brief Maximum number of threads to use
+     *
+     * For multi-threaded implementations, use no more than this number of
+     * threads. The codec may use fewer threads than allowed. The value
+     * 0 is equivalent to the value 1.
+     */
+    unsigned int           g_threads;
+
+
+    /*!\brief Bitstream profile to use
+     *
+     * Some codecs support a notion of multiple bitstream profiles. Typically
+     * this maps to a set of features that are turned on or off. Often the
+     * profile to use is determined by the features of the intended decoder.
+     * Consult the documentation for the codec to determine the valid values
+     * for this parameter, or set to zero for a sane default.
+     */
+    unsigned int           g_profile;  /**< profile of bitstream to use */
+
+
+
+    /*!\brief Width of the frame
+     *
+     * This value identifies the presentation resolution of the frame,
+     * in pixels. Note that the frames passed as input to the encoder must
+     * have this resolution. Frames will be presented by the decoder in this
+     * resolution, independent of any spatial resampling the encoder may do.
+     */
+    unsigned int           g_w;
+
+
+    /*!\brief Height of the frame
+     *
+     * This value identifies the presentation resolution of the frame,
+     * in pixels. Note that the frames passed as input to the encoder must
+     * have this resolution. Frames will be presented by the decoder in this
+     * resolution, independent of any spatial resampling the encoder may do.
+     */
+    unsigned int           g_h;
+
+    /*!\brief Bit-depth of the codec
+     *
+     * This value identifies the bit_depth of the codec.
+     * Only certain bit-depths are supported as identified in the
+     * vpx_bit_depth_t enum.
+     */
+    vpx_bit_depth_t        g_bit_depth;
+
+    /*!\brief Bit-depth of the input frames
+     *
+     * This value identifies the bit_depth of the input frames in bits.
+     * Note that the frames passed as input to the encoder must have
+     * this bit-depth.
+     */
+    unsigned int           g_input_bit_depth;
+
+    /*!\brief Stream timebase units
+     *
+     * Indicates the smallest interval of time, in seconds, used by the stream.
+     * For fixed frame rate material, or variable frame rate material where
+     * frames are timed at a multiple of a given clock (ex: video capture),
+     * the \ref RECOMMENDED method is to set the timebase to the reciprocal
+     * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). This allows the
+     * pts to correspond to the frame number, which can be handy. For
+     * re-encoding video from containers with absolute time timestamps, the
+     * \ref RECOMMENDED method is to set the timebase to that of the parent
+     * container or multimedia framework (ex: 1/1000 for ms, as in FLV).
+     */
+    struct vpx_rational    g_timebase;
+
+
+    /*!\brief Enable error resilient modes.
+     *
+     * The error resilient bitfield indicates to the encoder which features
+     * it should enable to take measures for streaming over lossy or noisy
+     * links.
+     */
+    vpx_codec_er_flags_t   g_error_resilient;
+
+
+    /*!\brief Multi-pass Encoding Mode
+     *
+     * This value should be set to the current phase for multi-pass encoding.
+     * For single pass, set to #VPX_RC_ONE_PASS.
+     */
+    enum vpx_enc_pass      g_pass;
+
+
+    /*!\brief Allow lagged encoding
+     *
+     * If set, this value allows the encoder to consume a number of input
+     * frames before producing output frames. This allows the encoder to
+     * base decisions for the current frame on future frames. This does
+     * increase the latency of the encoding pipeline, so it is not appropriate
+     * in all situations (ex: realtime encoding).
+     *
+     * Note that this is a maximum value -- the encoder may produce frames
+     * sooner than the given limit. Set this value to 0 to disable this
+     * feature.
+     */
+    unsigned int           g_lag_in_frames;
+
+
+    /*
+     * rate control settings (rc)
+     */
+
+    /*!\brief Temporal resampling configuration, if supported by the codec.
+     *
+     * Temporal resampling allows the codec to "drop" frames as a strategy to
+     * meet its target data rate. This can cause temporal discontinuities in
+     * the encoded video, which may appear as stuttering during playback. This
+     * trade-off is often acceptable, but for many applications is not. It can
+     * be disabled in these cases.
+     *
+     * Note that not all codecs support this feature. All vpx VPx codecs do.
+     * For other codecs, consult the documentation for that algorithm.
+     *
+     * This threshold is described as a percentage of the target data buffer.
+     * When the data buffer falls below this percentage of fullness, a
+     * dropped frame is indicated. Set the threshold to zero (0) to disable
+     * this feature.
+     */
+    unsigned int           rc_dropframe_thresh;
+
+
+    /*!\brief Enable/disable spatial resampling, if supported by the codec.
+     *
+     * Spatial resampling allows the codec to compress a lower resolution
+     * version of the frame, which is then upscaled by the encoder to the
+     * correct presentation resolution. This increases visual quality at
+     * low data rates, at the expense of CPU time on the encoder/decoder.
+     */
+    unsigned int           rc_resize_allowed;
+
+    /*!\brief Internal coded frame width.
+     *
+     * If spatial resampling is enabled this specifies the width of the
+     * encoded frame.
+     */
+    unsigned int           rc_scaled_width;
+
+    /*!\brief Internal coded frame height.
+     *
+     * If spatial resampling is enabled this specifies the height of the
+     * encoded frame.
+     */
+    unsigned int           rc_scaled_height;
+
+    /*!\brief Spatial resampling up watermark.
+     *
+     * This threshold is described as a percentage of the target data buffer.
+     * When the data buffer rises above this percentage of fullness, the
+     * encoder will step up to a higher resolution version of the frame.
+     */
+    unsigned int           rc_resize_up_thresh;
+
+
+    /*!\brief Spatial resampling down watermark.
+     *
+     * This threshold is described as a percentage of the target data buffer.
+     * When the data buffer falls below this percentage of fullness, the
+     * encoder will step down to a lower resolution version of the frame.
+     */
+    unsigned int           rc_resize_down_thresh;
+
+
+    /*!\brief Rate control algorithm to use.
+     *
+     * Indicates whether the end usage of this stream is to be streamed over
+     * a bandwidth constrained link, indicating that Constant Bit Rate (CBR)
+     * mode should be used, or whether it will be played back on a high
+     * bandwidth link, as from a local disk, where higher variations in
+     * bitrate are acceptable.
+     */
+    enum vpx_rc_mode       rc_end_usage;
+
+
+    /*!\brief Two-pass stats buffer.
+     *
+     * A buffer containing all of the stats packets produced in the first
+     * pass, concatenated.
+     */
+    vpx_fixed_buf_t   rc_twopass_stats_in;
+
+    /*!\brief first pass mb stats buffer.
+     *
+     * A buffer containing all of the first pass mb stats packets produced
+     * in the first pass, concatenated.
+     */
+    vpx_fixed_buf_t   rc_firstpass_mb_stats_in;
+
+    /*!\brief Target data rate
+     *
+     * Target bandwidth to use for this stream, in kilobits per second.
+     */
+    unsigned int           rc_target_bitrate;
+
+
+    /*
+     * quantizer settings
+     */
+
+
+    /*!\brief Minimum (Best Quality) Quantizer
+     *
+     * The quantizer is the most direct control over the quality of the
+     * encoded image. The range of valid values for the quantizer is codec
+     * specific. Consult the documentation for the codec to determine the
+     * values to use. To determine the range programmatically, call
+     * vpx_codec_enc_config_default() with a usage value of 0.
+     */
+    unsigned int           rc_min_quantizer;
+
+
+    /*!\brief Maximum (Worst Quality) Quantizer
+     *
+     * The quantizer is the most direct control over the quality of the
+     * encoded image. The range of valid values for the quantizer is codec
+     * specific. Consult the documentation for the codec to determine the
+     * values to use. To determine the range programmatically, call
+     * vpx_codec_enc_config_default() with a usage value of 0.
+     */
+    unsigned int           rc_max_quantizer;
+
+
+    /*
+     * bitrate tolerance
+     */
+
+
+    /*!\brief Rate control adaptation undershoot control
+     *
+     * This value, expressed as a percentage of the target bitrate,
+     * controls the maximum allowed adaptation speed of the codec.
+     * This factor controls the maximum amount of bits that can
+     * be subtracted from the target bitrate in order to compensate
+     * for prior overshoot.
+     *
+     * Valid values in the range 0-1000.
+     */
+    unsigned int           rc_undershoot_pct;
+
+
+    /*!\brief Rate control adaptation overshoot control
+     *
+     * This value, expressed as a percentage of the target bitrate,
+     * controls the maximum allowed adaptation speed of the codec.
+     * This factor controls the maximum amount of bits that can
+     * be added to the target bitrate in order to compensate for
+     * prior undershoot.
+     *
+     * Valid values in the range 0-1000.
+     */
+    unsigned int           rc_overshoot_pct;
+
+
+    /*
+     * decoder buffer model parameters
+     */
+
+
+    /*!\brief Decoder Buffer Size
+     *
+     * This value indicates the amount of data that may be buffered by the
+     * decoding application. Note that this value is expressed in units of
+     * time (milliseconds). For example, a value of 5000 indicates that the
+     * client will buffer (at least) 5000ms worth of encoded data. Use the
+     * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if
+     * necessary.
+     */
+    unsigned int           rc_buf_sz;
+
+
+    /*!\brief Decoder Buffer Initial Size
+     *
+     * This value indicates the amount of data that will be buffered by the
+     * decoding application prior to beginning playback. This value is
+     * expressed in units of time (milliseconds). Use the target bitrate
+     * (#rc_target_bitrate) to convert to bits/bytes, if necessary.
+     */
+    unsigned int           rc_buf_initial_sz;
+
+
+    /*!\brief Decoder Buffer Optimal Size
+     *
+     * This value indicates the amount of data that the encoder should try
+     * to maintain in the decoder's buffer. This value is expressed in units
+     * of time (milliseconds). Use the target bitrate (#rc_target_bitrate)
+     * to convert to bits/bytes, if necessary.
+     */
+    unsigned int           rc_buf_optimal_sz;
+
+
+    /*
+     * 2 pass rate control parameters
+     */
+
+
+    /*!\brief Two-pass mode CBR/VBR bias
+     *
+     * Bias, expressed on a scale of 0 to 100, for determining target size
+     * for the current frame. The value 0 indicates the optimal CBR mode
+     * value should be used. The value 100 indicates the optimal VBR mode
+     * value should be used. Values in between indicate which way the
+     * encoder should "lean."
+     */
+    unsigned int           rc_2pass_vbr_bias_pct;       /**< RC mode bias between CBR and VBR (0-100: 0->CBR, 100->VBR) */
+
+
+    /*!\brief Two-pass mode per-GOP minimum bitrate
+     *
+     * This value, expressed as a percentage of the target bitrate, indicates
+     * the minimum bitrate to be used for a single GOP (aka "section")
+     */
+    unsigned int           rc_2pass_vbr_minsection_pct;
+
+
+    /*!\brief Two-pass mode per-GOP maximum bitrate
+     *
+     * This value, expressed as a percentage of the target bitrate, indicates
+     * the maximum bitrate to be used for a single GOP (aka "section")
+     */
+    unsigned int           rc_2pass_vbr_maxsection_pct;
+
+
+    /*
+     * keyframing settings (kf)
+     */
+
+    /*!\brief Keyframe placement mode
+     *
+     * This value indicates whether the encoder should place keyframes at a
+     * fixed interval, or determine the optimal placement automatically
+     * (as governed by the #kf_min_dist and #kf_max_dist parameters)
+     */
+    enum vpx_kf_mode       kf_mode;
+
+
+    /*!\brief Keyframe minimum interval
+     *
+     * This value, expressed as a number of frames, prevents the encoder from
+     * placing a keyframe nearer than kf_min_dist to the previous keyframe. At
+     * least kf_min_dist non-keyframes will be coded before the next
+     * keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval.
+     */
+    unsigned int           kf_min_dist;
+
+
+    /*!\brief Keyframe maximum interval
+     *
+     * This value, expressed as a number of frames, forces the encoder to code
+     * a keyframe if one has not been coded in the last kf_max_dist frames.
+     * A value of 0 implies all frames will be keyframes. Set kf_min_dist
+     * equal to kf_max_dist for a fixed interval.
+     */
+    unsigned int           kf_max_dist;
+
+    /*
+     * Spatial scalability settings (ss)
+     */
+
+    /*!\brief Number of spatial coding layers.
+     *
+     * This value specifies the number of spatial coding layers to be used.
+     */
+    unsigned int           ss_number_layers;
+
+    /*!\brief Enable auto alt reference flags for each spatial layer.
+     *
+     * These values specify if auto alt reference frame is enabled for each
+     * spatial layer.
+     */
+    int                    ss_enable_auto_alt_ref[VPX_SS_MAX_LAYERS];
+
+    /*!\brief Target bitrate for each spatial layer.
+     *
+     * These values specify the target coding bitrate to be used for each
+     * spatial layer.
+     */
+    unsigned int           ss_target_bitrate[VPX_SS_MAX_LAYERS];
+
+    /*!\brief Number of temporal coding layers.
+     *
+     * This value specifies the number of temporal layers to be used.
+     */
+    unsigned int           ts_number_layers;
+
+    /*!\brief Target bitrate for each temporal layer.
+     *
+     * These values specify the target coding bitrate to be used for each
+     * temporal layer.
+     */
+    unsigned int           ts_target_bitrate[VPX_TS_MAX_LAYERS];
+
+    /*!\brief Frame rate decimation factor for each temporal layer.
+     *
+     * These values specify the frame rate decimation factors to apply
+     * to each temporal layer.
+     */
+    unsigned int           ts_rate_decimator[VPX_TS_MAX_LAYERS];
+
+    /*!\brief Length of the sequence defining frame temporal layer membership.
+     *
+     * This value specifies the length of the sequence that defines the
+     * membership of frames to temporal layers. For example, if the
+     * ts_periodicity = 8, then the frames are assigned to coding layers with a
+     * repeated sequence of length 8.
+     */
+    unsigned int           ts_periodicity;
+
+    /*!\brief Template defining the membership of frames to temporal layers.
+     *
+     * This array defines the membership of frames to temporal coding layers.
+     * For a 2-layer encoding that assigns even numbered frames to one temporal
+     * layer (0) and odd numbered frames to a second temporal layer (1) with
+     * ts_periodicity=8, then ts_layer_id = (0,1,0,1,0,1,0,1).
+     */
+    unsigned int           ts_layer_id[VPX_TS_MAX_PERIODICITY];
+
+    /*!\brief Target bitrate for each spatial/temporal layer.
+     *
+     * These values specify the target coding bitrate to be used for each
+     * spatial/temporal layer.
+     *
+     */
+    unsigned int           layer_target_bitrate[VPX_MAX_LAYERS];
+
+    /*!\brief Temporal layering mode indicating which temporal layering scheme to use.
+     *
+     * The value (refer to VP9E_TEMPORAL_LAYERING_MODE) specifies the
+     * temporal layering mode to use.
+     *
+     */
+    int                    temporal_layering_mode;
+  } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */
+
+  /*!\brief VP9 SVC extra configuration parameters
+   *
+   * This defines max/min quantizers and scaling factors for each layer, plus
+   * the temporal layering mode.
+   */
+  typedef struct vpx_svc_parameters {
+    int max_quantizers[VPX_MAX_LAYERS]; /**< Max Q for each layer */
+    int min_quantizers[VPX_MAX_LAYERS]; /**< Min Q for each layer */
+    int scaling_factor_num[VPX_MAX_LAYERS]; /**< Scaling factor-numerator */
+    int scaling_factor_den[VPX_MAX_LAYERS]; /**< Scaling factor-denominator */
+    int temporal_layering_mode; /**< Temporal layering mode */
+  } vpx_svc_extra_cfg_t; /**< alias for struct vpx_svc_parameters */
+
+
+  /*!\brief Initialize an encoder instance
+   *
+   * Initializes an encoder context using the given interface. Applications
+   * should call the vpx_codec_enc_init convenience macro instead of this
+   * function directly, to ensure that the ABI version number parameter
+   * is properly initialized.
+   *
+   * If the library was configured with --disable-multithread, this call
+   * is not thread safe and should be guarded with a lock if being used
+   * in a multithreaded context.
+   *
+   * \param[in]    ctx     Pointer to this instance's context.
+   * \param[in]    iface   Pointer to the algorithm interface to use.
+   * \param[in]    cfg     Configuration to use, if known. May be NULL.
+   * \param[in]    flags   Bitfield of VPX_CODEC_USE_* flags
+   * \param[in]    ver     ABI version number. Must be set to
+   *                       VPX_ENCODER_ABI_VERSION
+   * \retval #VPX_CODEC_OK
+   *     The encoder algorithm initialized.
+   * \retval #VPX_CODEC_MEM_ERROR
+   *     Memory allocation failed.
+   */
+  vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t      *ctx,
+                                         vpx_codec_iface_t    *iface,
+                                         const vpx_codec_enc_cfg_t *cfg,
+                                         vpx_codec_flags_t     flags,
+                                         int                   ver);
+
+
+  /*!\brief Convenience macro for vpx_codec_enc_init_ver()
+   *
+   * Passes VPX_ENCODER_ABI_VERSION as ver so the ABI version is properly set.
+   */
+#define vpx_codec_enc_init(ctx, iface, cfg, flags) \
+  vpx_codec_enc_init_ver(ctx, iface, cfg, flags, VPX_ENCODER_ABI_VERSION)
+
+
+  /*!\brief Initialize multi-encoder instance
+   *
+   * Initializes multi-encoder context using the given interface.
+   * Applications should call the vpx_codec_enc_init_multi convenience macro
+   * instead of this function directly, to ensure that the ABI version number
+   * parameter is properly initialized.
+   *
+   * \param[in]    ctx     Pointer to this instance's context.
+   * \param[in]    iface   Pointer to the algorithm interface to use.
+   * \param[in]    cfg     Configuration to use, if known. May be NULL.
+   * \param[in]    num_enc Total number of encoders.
+   * \param[in]    flags   Bitfield of VPX_CODEC_USE_* flags
+   * \param[in]    dsf     Pointer to down-sampling factors.
+   * \param[in]    ver     ABI version number. Must be set to
+   *                       VPX_ENCODER_ABI_VERSION
+   * \retval #VPX_CODEC_OK
+   *     The encoder algorithm initialized.
+   * \retval #VPX_CODEC_MEM_ERROR
+   *     Memory allocation failed.
+   */
+  vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t      *ctx,
+                                               vpx_codec_iface_t    *iface,
+                                               vpx_codec_enc_cfg_t  *cfg,
+                                               int                   num_enc,
+                                               vpx_codec_flags_t     flags,
+                                               vpx_rational_t       *dsf,
+                                               int                   ver);
+
+
+  /*!\brief Convenience macro for vpx_codec_enc_init_multi_ver()
+   *
+   * Ensures the ABI version parameter is properly set.
+   */
+#define vpx_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \
+  vpx_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \
+                               VPX_ENCODER_ABI_VERSION)
+
+
+  /*!\brief Get a default configuration
+   *
+   * Initializes an encoder configuration structure with default values. Supports
+   * the notion of "usages" so that an algorithm may offer different default
+   * settings depending on the user's intended goal. This function \ref SHOULD
+   * be called by all applications to initialize the configuration structure
+   * before specializing the configuration with application specific values.
+   *
+   * \param[in]    iface     Pointer to the algorithm interface to use.
+   * \param[out]   cfg       Configuration buffer to populate.
+   * \param[in]    reserved  Must be set to 0 for VP8 and VP9.
+   *
+   * \retval #VPX_CODEC_OK
+   *     The configuration was populated.
+   * \retval #VPX_CODEC_INCAPABLE
+   *     Interface is not an encoder interface.
+   * \retval #VPX_CODEC_INVALID_PARAM
+   *     A parameter was NULL, or the usage value was not recognized.
+   */
+  vpx_codec_err_t  vpx_codec_enc_config_default(vpx_codec_iface_t    *iface,
+                                                vpx_codec_enc_cfg_t  *cfg,
+                                                unsigned int          reserved);
+
+
+  /*!\brief Set or change configuration
+   *
+   * Reconfigures an encoder instance according to the given configuration.
+   *
+   * \param[in]    ctx     Pointer to this instance's context
+   * \param[in]    cfg     Configuration buffer to use
+   *
+   * \retval #VPX_CODEC_OK
+   *     The configuration was populated.
+   * \retval #VPX_CODEC_INCAPABLE
+   *     Interface is not an encoder interface.
+   * \retval #VPX_CODEC_INVALID_PARAM
+   *     A parameter was NULL, or the usage value was not recognized.
+   */
+  vpx_codec_err_t  vpx_codec_enc_config_set(vpx_codec_ctx_t            *ctx,
+                                            const vpx_codec_enc_cfg_t  *cfg);
+
+
+  /*!\brief Get global stream headers
+   *
+   * Retrieves a stream level global header packet, if supported by the codec.
+   *
+   * \param[in]    ctx     Pointer to this instance's context
+   *
+   * \retval NULL
+   *     Encoder does not support global header
+   * \retval Non-NULL
+   *     Pointer to buffer containing global header packet
+   */
+  vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t   *ctx);
+
+
+#define VPX_DL_REALTIME     (1)        /**< deadline parameter analogous to
+  *   VPx REALTIME mode. */
+#define VPX_DL_GOOD_QUALITY (1000000)  /**< deadline parameter analogous to
+  *   VPx GOOD QUALITY mode. */
+#define VPX_DL_BEST_QUALITY (0)        /**< deadline parameter analogous to
+  *   VPx BEST QUALITY mode. */
+  /*!\brief Encode a frame
+   *
+   * Encodes a video frame at the given "presentation time." The presentation
+   * time stamp (PTS) \ref MUST be strictly increasing.
+   *
+   * The encoder supports the notion of a soft real-time deadline. Given a
+   * non-zero value to the deadline parameter, the encoder will make a "best
+   * effort" guarantee to  return before the given time slice expires. It is
+   * implicit that limiting the available time to encode will degrade the
+   * output quality. The encoder can be given an unlimited time to produce the
+   * best possible frame by specifying a deadline of '0'. This deadline
+   * supersedes the VPx notion of "best quality, good quality, realtime".
+   * Applications that wish to map these former settings to the new deadline
+   * based system can use the symbols #VPX_DL_REALTIME, #VPX_DL_GOOD_QUALITY,
+   * and #VPX_DL_BEST_QUALITY.
+   *
+   * When the last frame has been passed to the encoder, this function should
+   * continue to be called, with the img parameter set to NULL. This will
+   * signal the end-of-stream condition to the encoder and allow it to encode
+   * any held buffers. Encoding is complete when vpx_codec_encode() is called
+   * and vpx_codec_get_cx_data() returns no data.
+   *
+   * \param[in]    ctx       Pointer to this instance's context
+   * \param[in]    img       Image data to encode, NULL to flush.
+   * \param[in]    pts       Presentation time stamp, in timebase units.
+   * \param[in]    duration  Duration to show frame, in timebase units.
+   * \param[in]    flags     Flags to use for encoding this frame.
+   * \param[in]    deadline  Time to spend encoding, in microseconds. (0=infinite)
+   *
+   * \retval #VPX_CODEC_OK
+   *     The encode call completed successfully.
+   * \retval #VPX_CODEC_INCAPABLE
+   *     Interface is not an encoder interface.
+   * \retval #VPX_CODEC_INVALID_PARAM
+   *     A parameter was NULL, the image format is unsupported, etc.
+   */
+  vpx_codec_err_t  vpx_codec_encode(vpx_codec_ctx_t            *ctx,
+                                    const vpx_image_t          *img,
+                                    vpx_codec_pts_t             pts,
+                                    unsigned long               duration,
+                                    vpx_enc_frame_flags_t       flags,
+                                    unsigned long               deadline);
+
+  /*!\brief Set compressed data output buffer
+   *
+   * Sets the buffer that the codec should output the compressed data
+   * into. This call effectively sets the buffer pointer returned in the
+   * next VPX_CODEC_CX_FRAME_PKT packet. Subsequent packets will be
+   * appended into this buffer. The buffer is preserved across frames,
+   * so applications must periodically call this function after flushing
+   * the accumulated compressed data to disk or to the network to reset
+   * the pointer to the buffer's head.
+   *
+   * `pad_before` bytes will be skipped before writing the compressed
+   * data, and `pad_after` bytes will be appended to the packet. The size
+   * of the packet will be the sum of the size of the actual compressed
+   * data, pad_before, and pad_after. The padding bytes will be preserved
+   * (not overwritten).
+   *
+   * Note that calling this function does not guarantee that the returned
+   * compressed data will be placed into the specified buffer. In the
+   * event that the encoded data will not fit into the buffer provided,
+   * the returned packet \ref MAY point to an internal buffer, as it would
+   * if this call were never used. In this event, the output packet will
+   * NOT have any padding, and the application must free space and copy it
+   * to the proper place. This is of particular note in configurations
+   * that may output multiple packets for a single encoded frame (e.g., lagged
+   * encoding) or if the application does not reset the buffer periodically.
+   *
+   * Applications may restore the default behavior of the codec providing
+   * the compressed data buffer by calling this function with a NULL
+   * buffer.
+   *
+   * Applications \ref MUSTNOT call this function during iteration of
+   * vpx_codec_get_cx_data().
+   *
+   * \param[in]    ctx         Pointer to this instance's context
+   * \param[in]    buf         Buffer to store compressed data into
+   * \param[in]    pad_before  Bytes to skip before writing compressed data
+   * \param[in]    pad_after   Bytes to skip after writing compressed data
+   *
+   * \retval #VPX_CODEC_OK
+   *     The buffer was set successfully.
+   * \retval #VPX_CODEC_INVALID_PARAM
+   *     A parameter was NULL, the image format is unsupported, etc.
+   */
+  vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t       *ctx,
+                                            const vpx_fixed_buf_t *buf,
+                                            unsigned int           pad_before,
+                                            unsigned int           pad_after);
+
+
+  /*!\brief Encoded data iterator
+   *
+   * Iterates over a list of data packets to be passed from the encoder to the
+   * application. The different kinds of packets available are enumerated in
+   * #vpx_codec_cx_pkt_kind.
+   *
+   * #VPX_CODEC_CX_FRAME_PKT packets should be passed to the application's
+   * muxer. Multiple compressed frames may be in the list.
+   * #VPX_CODEC_STATS_PKT packets should be appended to a global buffer.
+   *
+   * The application \ref MUST silently ignore any packet kinds that it does
+   * not recognize or support.
+   *
+   * The data buffers returned from this function are only guaranteed to be
+   * valid until the application makes another call to any vpx_codec_* function.
+   *
+   * \param[in]     ctx      Pointer to this instance's context
+   * \param[in,out] iter     Iterator storage, initialized to NULL
+   *
+   * \return Returns a pointer to an output data packet (compressed frame data,
+   *         two-pass statistics, etc.) or NULL to signal end-of-list.
+   *
+   */
+  const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t   *ctx,
+                                                  vpx_codec_iter_t  *iter);
+
+
+  /*!\brief Get Preview Frame
+   *
+   * Returns an image that can be used as a preview. Shows the image as it would
+   * exist at the decompressor. The application \ref MUSTNOT write into this
+   * image buffer.
+   *
+   * \param[in]     ctx      Pointer to this instance's context
+   *
+   * \return Returns a pointer to a preview image, or NULL if no image is
+   *         available.
+   *
+   */
+  const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t   *ctx);
+
+
+  /*!@} - end defgroup encoder*/
+#ifdef __cplusplus
+}
+#endif
+#endif  // VPX_VPX_ENCODER_H_
+
diff --git a/libs/libvpx/vpx/vpx_frame_buffer.h b/libs/libvpx/vpx/vpx_frame_buffer.h
new file mode 100644
index 0000000000..9036459af0
--- /dev/null
+++ b/libs/libvpx/vpx/vpx_frame_buffer.h
@@ -0,0 +1,83 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_FRAME_BUFFER_H_
+#define VPX_VPX_FRAME_BUFFER_H_
+
+/*!\file
+ * \brief Describes the decoder external frame buffer interface.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./vpx_integer.h"
+
+/*!\brief The maximum number of work buffers used by libvpx.
+ *  Support maximum 4 threads to decode video in parallel.
+ *  Each thread will use one work buffer.
+ * TODO(hkuang): Add support to set number of worker threads dynamically.
+ */
+#define VPX_MAXIMUM_WORK_BUFFERS 8
+
+/*!\brief The maximum number of reference buffers that a VP9 encoder may use.
+ */
+#define VP9_MAXIMUM_REF_BUFFERS 8
+
+/*!\brief External frame buffer
+ *
+ * This structure holds allocated frame buffers used by the decoder.
+ */
+typedef struct vpx_codec_frame_buffer {
+  uint8_t *data;  /**< Pointer to the data buffer */
+  size_t size;  /**< Size of data in bytes */
+  void *priv;  /**< Frame's private data */
+} vpx_codec_frame_buffer_t;
+
+/*!\brief get frame buffer callback prototype
+ *
+ * This callback is invoked by the decoder to retrieve data for the frame
+ * buffer in order for the decode call to complete. The callback must
+ * allocate at least min_size in bytes and assign it to fb->data. The callback
+ * must zero out all the data allocated. Then the callback must set fb->size
+ * to the allocated size. The application does not need to align the allocated
+ * data. The callback is triggered when the decoder needs a frame buffer to
+ * decode a compressed image into. This function may be called more than once
+ * for every call to vpx_codec_decode. The application may set fb->priv to
+ * some data which will be passed back in the ximage and the release function
+ * call. |fb| is guaranteed to not be NULL. On success the callback must
+ * return 0. Any failure the callback must return a value less than 0.
+ *
+ * \param[in] priv         Callback's private data
+ * \param[in] min_size     Size in bytes needed by the buffer
+ * \param[in,out] fb       Pointer to vpx_codec_frame_buffer_t
+ */
+typedef int (*vpx_get_frame_buffer_cb_fn_t)(
+    void *priv, size_t min_size, vpx_codec_frame_buffer_t *fb);
+
+/*!\brief release frame buffer callback prototype
+ *
+ * This callback is invoked by the decoder when the frame buffer is not
+ * referenced by any other buffers. |fb| is guaranteed to not be NULL. On
+ * success the callback must return 0. Any failure the callback must return
+ * a value less than 0.
+ *
+ * \param[in] priv         Callback's private data
+ * \param[in] fb           Pointer to vpx_codec_frame_buffer_t
+ */
+typedef int (*vpx_release_frame_buffer_cb_fn_t)(
+    void *priv, vpx_codec_frame_buffer_t *fb);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_FRAME_BUFFER_H_
diff --git a/libs/libvpx/vpx/vpx_image.h b/libs/libvpx/vpx/vpx_image.h
new file mode 100644
index 0000000000..7958c69806
--- /dev/null
+++ b/libs/libvpx/vpx/vpx_image.h
@@ -0,0 +1,235 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*!\file
+ * \brief Describes the vpx image descriptor and associated operations
+ *
+ */
+#ifndef VPX_VPX_IMAGE_H_
+#define VPX_VPX_IMAGE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /*!\brief Current ABI version number
+   *
+   * \internal
+   * If this file is altered in any way that changes the ABI, this value
+   * must be bumped.  Examples include, but are not limited to, changing
+   * types, removing or reassigning enums, adding/removing/rearranging
+   * fields to structures
+   */
+#define VPX_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/
+
+
+#define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format. */
+#define VPX_IMG_FMT_UV_FLIP    0x200  /**< V plane precedes U in memory. */
+#define VPX_IMG_FMT_HAS_ALPHA  0x400  /**< Image has an alpha channel. */
+#define VPX_IMG_FMT_HIGHBITDEPTH 0x800  /**< Image uses 16bit framebuffer. */
+
+  /*!\brief List of supported image formats */
+  typedef enum vpx_img_fmt {
+    VPX_IMG_FMT_NONE,
+    VPX_IMG_FMT_RGB24,   /**< 24 bit per pixel packed RGB */
+    VPX_IMG_FMT_RGB32,   /**< 32 bit per pixel packed 0RGB */
+    VPX_IMG_FMT_RGB565,  /**< 16 bit per pixel, 565 */
+    VPX_IMG_FMT_RGB555,  /**< 16 bit per pixel, 555 */
+    VPX_IMG_FMT_UYVY,    /**< UYVY packed YUV */
+    VPX_IMG_FMT_YUY2,    /**< YUYV packed YUV */
+    VPX_IMG_FMT_YVYU,    /**< YVYU packed YUV */
+    VPX_IMG_FMT_BGR24,   /**< 24 bit per pixel packed BGR */
+    VPX_IMG_FMT_RGB32_LE, /**< 32 bit packed BGR0 */
+    VPX_IMG_FMT_ARGB,     /**< 32 bit packed ARGB, alpha=255 */
+    VPX_IMG_FMT_ARGB_LE,  /**< 32 bit packed BGRA, alpha=255 */
+    VPX_IMG_FMT_RGB565_LE,  /**< 16 bit per pixel, gggbbbbb rrrrrggg */
+    VPX_IMG_FMT_RGB555_LE,  /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */
+    VPX_IMG_FMT_YV12    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
+    VPX_IMG_FMT_I420    = VPX_IMG_FMT_PLANAR | 2, /**< planar, UV_FLIP not set */
+    VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 3, /**< planar 4:2:0 format with vpx color space */
+    VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4, /**< planar, UV_FLIP not set */
+    VPX_IMG_FMT_I422    = VPX_IMG_FMT_PLANAR | 5, /**< planar */
+    VPX_IMG_FMT_I444    = VPX_IMG_FMT_PLANAR | 6, /**< planar */
+    VPX_IMG_FMT_I440    = VPX_IMG_FMT_PLANAR | 7, /**< planar */
+    VPX_IMG_FMT_444A    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 6, /**< same id as I444, plus alpha channel flag */
+    VPX_IMG_FMT_I42016    = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH, /**< I420 with HIGHBITDEPTH set */
+    VPX_IMG_FMT_I42216    = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH, /**< I422 with HIGHBITDEPTH set */
+    VPX_IMG_FMT_I44416    = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH, /**< I444 with HIGHBITDEPTH set */
+    VPX_IMG_FMT_I44016    = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH /**< I440 with HIGHBITDEPTH set */
+  } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
+
+  /*!\brief List of supported color spaces */
+  typedef enum vpx_color_space {
+    VPX_CS_UNKNOWN    = 0,  /**< Unknown */
+    VPX_CS_BT_601     = 1,  /**< BT.601 */
+    VPX_CS_BT_709     = 2,  /**< BT.709 */
+    VPX_CS_SMPTE_170  = 3,  /**< SMPTE.170 */
+    VPX_CS_SMPTE_240  = 4,  /**< SMPTE.240 */
+    VPX_CS_BT_2020    = 5,  /**< BT.2020 */
+    VPX_CS_RESERVED   = 6,  /**< Reserved */
+    VPX_CS_SRGB       = 7   /**< sRGB */
+  } vpx_color_space_t; /**< alias for enum vpx_color_space */
+
+  /*!\brief List of supported color range */
+  typedef enum vpx_color_range {
+    VPX_CR_STUDIO_RANGE = 0,    /**< Y [16..235], UV [16..240] */
+    VPX_CR_FULL_RANGE   = 1     /**< YUV/RGB [0..255] */
+  } vpx_color_range_t; /**< alias for enum vpx_color_range */
+
+  /**\brief Image Descriptor */
+  typedef struct vpx_image {
+    vpx_img_fmt_t fmt; /**< Image Format */
+    vpx_color_space_t cs; /**< Color Space */
+    vpx_color_range_t range; /**< Color Range */
+
+    /* Image storage dimensions */
+    unsigned int  w;           /**< Stored image width */
+    unsigned int  h;           /**< Stored image height */
+    unsigned int  bit_depth;   /**< Stored image bit-depth */
+
+    /* Image display dimensions */
+    unsigned int  d_w;   /**< Displayed image width */
+    unsigned int  d_h;   /**< Displayed image height */
+
+    /* Image intended rendering dimensions */
+    unsigned int  r_w;   /**< Intended rendering image width */
+    unsigned int  r_h;   /**< Intended rendering image height */
+
+    /* Chroma subsampling info */
+    unsigned int  x_chroma_shift;   /**< subsampling order, X */
+    unsigned int  y_chroma_shift;   /**< subsampling order, Y */
+
+    /* Image data pointers. */
+#define VPX_PLANE_PACKED 0   /**< To be used for all packed formats */
+#define VPX_PLANE_Y      0   /**< Y (Luminance) plane */
+#define VPX_PLANE_U      1   /**< U (Chroma) plane */
+#define VPX_PLANE_V      2   /**< V (Chroma) plane */
+#define VPX_PLANE_ALPHA  3   /**< A (Transparency) plane */
+    unsigned char *planes[4];  /**< pointer to the top left pixel for each plane */
+    int      stride[4];  /**< stride between rows for each plane */
+
+    int     bps; /**< bits per sample (for packed formats) */
+
+    /* The following member may be set by the application to associate data
+     * with this image.
+     */
+    void    *user_priv; /**< may be set by the application to associate data
+                         *   with this image. */
+
+    /* The following members should be treated as private. */
+    unsigned char *img_data;       /**< private */
+    int      img_data_owner; /**< private */
+    int      self_allocd;    /**< private */
+
+    void    *fb_priv; /**< Frame buffer data associated with the image. */
+  } vpx_image_t; /**< alias for struct vpx_image */
+
+  /**\brief Representation of a rectangle on a surface */
+  typedef struct vpx_image_rect {
+    unsigned int x; /**< leftmost column */
+    unsigned int y; /**< topmost row */
+    unsigned int w; /**< width */
+    unsigned int h; /**< height */
+  } vpx_image_rect_t; /**< alias for struct vpx_image_rect */
+
+  /*!\brief Open a descriptor, allocating storage for the underlying image
+   *
+   * Returns a descriptor for storing an image of the given format. The
+   * storage for the descriptor is allocated on the heap.
+   *
+   * \param[in]    img       Pointer to storage for descriptor. If this parameter
+   *                         is NULL, the storage for the descriptor will be
+   *                         allocated on the heap.
+   * \param[in]    fmt       Format for the image
+   * \param[in]    d_w       Width of the image
+   * \param[in]    d_h       Height of the image
+   * \param[in]    align     Alignment, in bytes, of the image buffer and
+   *                         each row in the image(stride).
+   *
+   * \return Returns a pointer to the initialized image descriptor. If the img
+   *         parameter is non-null, the value of the img parameter will be
+   *         returned.
+   */
+  vpx_image_t *vpx_img_alloc(vpx_image_t  *img,
+                             vpx_img_fmt_t fmt,
+                             unsigned int d_w,
+                             unsigned int d_h,
+                             unsigned int align);
+
+  /*!\brief Open a descriptor, using existing storage for the underlying image
+   *
+   * Returns a descriptor for storing an image of the given format. The
+   * storage for descriptor has been allocated elsewhere, and a descriptor is
+   * desired to "wrap" that storage.
+   *
+   * \param[in]    img       Pointer to storage for descriptor. If this parameter
+   *                         is NULL, the storage for the descriptor will be
+   *                         allocated on the heap.
+   * \param[in]    fmt       Format for the image
+   * \param[in]    d_w       Width of the image
+   * \param[in]    d_h       Height of the image
+   * \param[in]    align     Alignment, in bytes, of each row in the image.
+   * \param[in]    img_data  Storage to use for the image
+   *
+   * \return Returns a pointer to the initialized image descriptor. If the img
+   *         parameter is non-null, the value of the img parameter will be
+   *         returned.
+   */
+  vpx_image_t *vpx_img_wrap(vpx_image_t  *img,
+                            vpx_img_fmt_t fmt,
+                            unsigned int d_w,
+                            unsigned int d_h,
+                            unsigned int align,
+                            unsigned char      *img_data);
+
+
+  /*!\brief Set the rectangle identifying the displayed portion of the image
+   *
+   * Updates the displayed rectangle (aka viewport) on the image surface to
+   * match the specified coordinates and size.
+   *
+   * \param[in]    img       Image descriptor
+   * \param[in]    x         leftmost column
+   * \param[in]    y         topmost row
+   * \param[in]    w         width
+   * \param[in]    h         height
+   *
+   * \return 0 if the requested rectangle is valid, nonzero otherwise.
+   */
+  int vpx_img_set_rect(vpx_image_t  *img,
+                       unsigned int  x,
+                       unsigned int  y,
+                       unsigned int  w,
+                       unsigned int  h);
+
+
+  /*!\brief Flip the image vertically (top for bottom)
+   *
+   * Adjusts the image descriptor's pointers and strides to make the image
+   * be referenced upside-down.
+   *
+   * \param[in]    img       Image descriptor
+   */
+  void vpx_img_flip(vpx_image_t *img);
+
+  /*!\brief Close an image descriptor
+   *
+   * Frees all allocated storage associated with an image descriptor.
+   *
+   * \param[in]    img       Image descriptor
+   */
+  void vpx_img_free(vpx_image_t *img);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_IMAGE_H_
diff --git a/libs/libvpx/vpx/vpx_integer.h b/libs/libvpx/vpx/vpx_integer.h
new file mode 100644
index 0000000000..829c9d132c
--- /dev/null
+++ b/libs/libvpx/vpx/vpx_integer.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_VPX_INTEGER_H_
+#define VPX_VPX_INTEGER_H_
+
+/* get ptrdiff_t, size_t, wchar_t, NULL */
+#include <stddef.h>
+
+#if defined(_MSC_VER)  /* MSVC spellings of the inline keywords */
+#define VPX_FORCE_INLINE __forceinline
+#define VPX_INLINE __inline
+#else  /* GNU-style: attribute arguments require double parentheses */
+#define VPX_FORCE_INLINE __inline__ __attribute__((always_inline))
+// TODO(jbb): Allow a way to force inline off for older compilers.
+#define VPX_INLINE inline
+#endif
+
+#if (defined(_MSC_VER) && (_MSC_VER < 1600)) || defined(VPX_EMULATE_INTTYPES)
+typedef signed char  int8_t;
+typedef signed short int16_t;
+typedef signed int   int32_t;
+
+typedef unsigned char  uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int   uint32_t;
+
+#if (defined(_MSC_VER) && (_MSC_VER < 1600))
+typedef signed __int64   int64_t;
+typedef unsigned __int64 uint64_t;
+#define INT64_MAX _I64_MAX
+#define INT32_MAX _I32_MAX
+#define INT32_MIN _I32_MIN
+#define INT16_MAX _I16_MAX
+#define INT16_MIN _I16_MIN
+#endif
+
+#ifndef _UINTPTR_T_DEFINED
+typedef size_t uintptr_t;
+#endif
+
+#else
+
+/* Most platforms have the C99 standard integer types. */
+
+#if defined(__cplusplus)
+# if !defined(__STDC_FORMAT_MACROS)
+#  define __STDC_FORMAT_MACROS
+# endif
+# if !defined(__STDC_LIMIT_MACROS)
+#  define __STDC_LIMIT_MACROS
+# endif
+#endif  // __cplusplus
+
+#include <stdint.h>
+
+#endif
+
+/* VS2010 defines stdint.h, but not inttypes.h */
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#define PRId64 "I64d"
+#else
+#include <inttypes.h>
+#endif
+
+#endif  // VPX_VPX_INTEGER_H_
diff --git a/libs/libvpx/vpx_dsp/arm/avg_neon.c b/libs/libvpx/vpx_dsp/arm/avg_neon.c
new file mode 100644
index 0000000000..d054c4185f
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/avg_neon.c
@@ -0,0 +1,199 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
+  // Two widening pairwise adds reduce 8 u16 lanes to 2 u64 partial sums.
+  const uint64x2_t halves = vpaddlq_u32(vpaddlq_u16(v_16x8));
+  const uint32x2_t lo = vreinterpret_u32_u64(vget_low_u64(halves));
+  const uint32x2_t hi = vreinterpret_u32_u64(vget_high_u64(halves));
+  return vget_lane_u32(vadd_u32(lo, hi), 0);  // only lane 0 is returned
+}
+
+unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) {
+  uint32x2_t rows01 = vdup_n_u32(0);  // rows 0 and 1, one 32-bit lane each
+  uint32x2_t rows23 = vdup_n_u32(0);  // rows 2 and 3, one 32-bit lane each
+  uint16x8_t widened;
+  rows01 = vld1_lane_u32((const uint32_t *)s, rows01, 0);
+  rows01 = vld1_lane_u32((const uint32_t *)(s + p), rows01, 1);
+  rows23 = vld1_lane_u32((const uint32_t *)(s + 2 * p), rows23, 0);
+  rows23 = vld1_lane_u32((const uint32_t *)(s + 3 * p), rows23, 1);
+  widened = vaddl_u8(vreinterpret_u8_u32(rows01), vreinterpret_u8_u32(rows23));
+  return (horizontal_add_u16x8(widened) + 8) >> 4;  // rounded mean of 16 px
+}
+
+unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) {
+  uint8x8_t row = vld1_u8(s);
+  const uint8x8_t row1 = vld1_u8(s + p);
+  uint16x8_t acc = vaddl_u8(row, row1);  // rows 0 + 1, widened to u16
+  // Rows 2..7 are folded into the u16 accumulator one at a time.
+  row = vld1_u8(s + 2 * p);
+  acc = vaddw_u8(acc, row);
+
+  row = vld1_u8(s + 3 * p);
+  acc = vaddw_u8(acc, row);
+
+  row = vld1_u8(s + 4 * p);
+  acc = vaddw_u8(acc, row);
+
+  row = vld1_u8(s + 5 * p);
+  acc = vaddw_u8(acc, row);
+
+  row = vld1_u8(s + 6 * p);
+  acc = vaddw_u8(acc, row);
+
+  row = vld1_u8(s + 7 * p);
+  acc = vaddw_u8(acc, row);
+
+  return (horizontal_add_u16x8(acc) + 32) >> 6;  // rounded mean of 64 px
+}
+
+// Sum of the absolute values of 'length' int16 coefficients.
+// coeff: 16 bits, dynamic range [-32640, 32640]; length: {16, 64, 256, 1024}.
+int vpx_satd_neon(const int16_t *coeff, int length) {
+  const int16x4_t zero = vdup_n_s16(0);
+  int32x4_t abs_sum = vdupq_n_s32(0);
+
+  do {  // 16 coefficients per iteration
+    const int16x8_t c0 = vld1q_s16(coeff);
+    const int16x8_t c8 = vld1q_s16(coeff + 8);
+    // |x - 0| of each 4-lane half is accumulated into the 32-bit lanes.
+    abs_sum = vabal_s16(abs_sum, vget_low_s16(c0), zero);
+    abs_sum = vabal_s16(abs_sum, vget_high_s16(c0), zero);
+    abs_sum = vabal_s16(abs_sum, vget_low_s16(c8), zero);
+    abs_sum = vabal_s16(abs_sum, vget_high_s16(c8), zero);
+    coeff += 16;
+    length -= 16;
+  } while (length != 0);
+
+  {
+    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+    const int64x2_t wide = vpaddlq_s32(abs_sum);  // cascading summation
+    const int32x2_t sum = vadd_s32(vreinterpret_s32_s64(vget_low_s64(wide)),
+                                   vreinterpret_s32_s64(vget_high_s64(wide)));
+    return vget_lane_s32(sum, 0);
+  }
+}
+
+void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+                          const int ref_stride, const int height) {
+  int i;
+  uint16x8_t sum_lo = vdupq_n_u16(0);  // per-column sums, columns 0..7
+  uint16x8_t sum_hi = vdupq_n_u16(0);  // per-column sums, columns 8..15
+  const int shift_factor = -((height >> 5) + 3);
+  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
+
+  for (i = 0; i < height; i += 8) {
+    uint8x16_t row;  // one 16-byte row; eight rows are consumed per pass
+    row = vld1q_u8(ref);
+    sum_lo = vaddw_u8(sum_lo, vget_low_u8(row));
+    sum_hi = vaddw_u8(sum_hi, vget_high_u8(row));
+
+    row = vld1q_u8(ref + ref_stride);
+    sum_lo = vaddw_u8(sum_lo, vget_low_u8(row));
+    sum_hi = vaddw_u8(sum_hi, vget_high_u8(row));
+
+    row = vld1q_u8(ref + ref_stride * 2);
+    sum_lo = vaddw_u8(sum_lo, vget_low_u8(row));
+    sum_hi = vaddw_u8(sum_hi, vget_high_u8(row));
+
+    row = vld1q_u8(ref + ref_stride * 3);
+    sum_lo = vaddw_u8(sum_lo, vget_low_u8(row));
+    sum_hi = vaddw_u8(sum_hi, vget_high_u8(row));
+
+    row = vld1q_u8(ref + ref_stride * 4);
+    sum_lo = vaddw_u8(sum_lo, vget_low_u8(row));
+    sum_hi = vaddw_u8(sum_hi, vget_high_u8(row));
+
+    row = vld1q_u8(ref + ref_stride * 5);
+    sum_lo = vaddw_u8(sum_lo, vget_low_u8(row));
+    sum_hi = vaddw_u8(sum_hi, vget_high_u8(row));
+
+    row = vld1q_u8(ref + ref_stride * 6);
+    sum_lo = vaddw_u8(sum_lo, vget_low_u8(row));
+    sum_hi = vaddw_u8(sum_hi, vget_high_u8(row));
+
+    row = vld1q_u8(ref + ref_stride * 7);
+    sum_lo = vaddw_u8(sum_lo, vget_low_u8(row));
+    sum_hi = vaddw_u8(sum_hi, vget_high_u8(row));
+
+    ref += ref_stride * 8;
+  }
+
+  // vec_shift holds a negative count, so vshlq_u16 shifts the sums right.
+  sum_lo = vshlq_u16(sum_lo, vec_shift);
+  sum_hi = vshlq_u16(sum_hi, vec_shift);
+
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(sum_lo));
+  vst1q_s16(hbuf + 8, vreinterpretq_s16_u16(sum_hi));
+}
+
+int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+  uint16x8_t acc = vdupq_n_u16(0);
+  int x;
+  // Accumulate 16 bytes per step, widened to u16 lanes, while x < width.
+  for (x = 0; x < width; x += 16) {
+    const uint8x16_t px = vld1q_u8(ref + x);
+    acc = vaddw_u8(acc, vget_low_u8(px));
+    acc = vaddw_u8(acc, vget_high_u8(px));
+  }
+
+  // The unsigned lane total is converted to the int16_t return type.
+  return horizontal_add_u16x8(acc);
+}
+
+// ref, src = [0, 510] - max diff = 16-bits
+// bwl = {2, 3, 4}, width = {16, 32, 64}
+int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
+  int width = 4 << bwl;
+  int32x4_t sq_acc = vdupq_n_s32(0);   // sum of squared differences
+  int16x8_t sum_acc = vdupq_n_s16(0);  // sum of differences
+
+  assert(width >= 8);
+  assert((width % 8) == 0);
+
+  do {
+    // Eight elements of ref and src are consumed per iteration.
+    const int16x8_t a = vld1q_s16(ref);
+    const int16x8_t b = vld1q_s16(src);
+    const int16x8_t d = vsubq_s16(a, b);  // [-510, 510], 10 bits.
+    // Widening multiply-accumulate per half; dynamic range 26 bits.
+    sq_acc = vmlal_s16(sq_acc, vget_low_s16(d), vget_low_s16(d));
+    sq_acc = vmlal_s16(sq_acc, vget_high_s16(d), vget_high_s16(d));
+    sum_acc = vaddq_s16(sum_acc, d);  // dynamic range 16 bits.
+
+    ref += 8;
+    src += 8;
+    width -= 8;
+  } while (width != 0);
+
+  {
+    // Note: 'sum_acc''s pairwise addition could be implemented similarly to
+    // horizontal_add_u16x8(), but one less vpaddl with 'sum_acc' when paired
+    // with the summation of 'sq_acc' performed better on a Cortex-A15.
+    const int32x4_t t0 = vpaddlq_s16(sum_acc);  // cascading summation
+    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
+    const int32x2_t t2 = vpadd_s32(t1, t1);
+    const int sum = vget_lane_s32(t2, 0);
+    const int64x2_t s0 = vpaddlq_s32(sq_acc);  // cascading summation
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int sse = vget_lane_s32(s1, 0);
+    // Initial width is 4 << bwl == 1 << (bwl + 2): the shift divides by it.
+    return sse - ((sum * sum) >> (bwl + 2));
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/arm/bilinear_filter_media.asm b/libs/libvpx/vpx_dsp/arm/bilinear_filter_media.asm
new file mode 100644
index 0000000000..f3f9754c11
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/bilinear_filter_media.asm
@@ -0,0 +1,237 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_filter_block2d_bil_first_pass_media|
+    EXPORT  |vpx_filter_block2d_bil_second_pass_media|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;-------------------------------------
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vpx_filter
+;-------------------------------------
+; Horizontal 2-tap filter pass; the output is stored transposed to make second pass filtering easy.
+|vpx_filter_block2d_bil_first_pass_media| PROC
+    stmdb   sp!, {r4 - r11, lr}             ; push 9 words; the stack args now start at sp + 36
+
+    ldr     r11, [sp, #40]                  ; vpx_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    mov     r12, r3                         ; outer-loop counter
+
+    add     r7, r2, r4                      ; preload next row
+    pld     [r0, r7]
+
+    sub     r2, r2, r4                      ; src increment for height loop
+
+    ldr     r5, [r11]                       ; load up filter coefficients (two 16-bit taps in one word)
+
+    mov     r3, r3, lsl #1                  ; height*2
+    add     r3, r3, #2                      ; plus 2 to make output buffer 4-byte aligned since height is actually (height+1)
+
+    mov     r11, r1                         ; save dst_ptr for each row
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+    ldrb    r6, [r0]                        ; load source data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+    ldrb    r9, [r0, #3]
+    ldrb    r10, [r0, #4]
+
+    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
+    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
+
+    smuad   r6, r6, r5                      ; apply the filter
+    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
+    smuad   r7, r7, r5
+    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
+
+    smuad   r8, r8, r5
+    smuad   r9, r9, r5
+
+    add     r0, r0, #4                      ; four source pixels consumed
+    subs    lr, lr, #1                      ; sets the flags tested by ldrneb/bne below
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #16, r6, asr #7
+    usat    r7, #16, r7, asr #7
+
+    strh    r6, [r1], r3                    ; result is transposed and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strh    r7, [r1], r3
+    add     r9, r9, #0x40
+    usat    r8, #16, r8, asr #7
+    usat    r9, #16, r9, asr #7
+
+    strh    r8, [r1], r3                    ; result is transposed and stored
+
+    ldrneb  r6, [r0]                        ; load source data
+    strh    r9, [r1], r3
+
+    ldrneb  r7, [r0, #1]
+    ldrneb  r8, [r0, #2]
+
+    bne     bil_width_loop_1st_v6
+
+    add     r0, r0, r2                      ; move to next input row
+    subs    r12, r12, #1                    ; sets the flags tested by the bne that closes the height loop
+
+    add     r9, r2, r4, lsl #1              ; adding back block width
+    pld     [r0, r9]                        ; preload next row
+
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_1st_v6
+
+    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+    mov     lr, r4, lsr #2                  ; loop counter
+
+|bil_width_loop_null_1st|
+    ldrb    r6, [r0]                        ; load data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    ldrb    r9, [r0, #3]
+
+    strh    r6, [r1], r3                    ; store it to intermediate buffer
+    add     r0, r0, #4
+    strh    r7, [r1], r3
+    subs    lr, lr, #1
+    strh    r8, [r1], r3
+    strh    r9, [r1], r3
+
+    bne     bil_width_loop_null_1st
+
+    subs    r12, r12, #1
+    add     r0, r0, r2                      ; move to next input line
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_null_1st
+
+    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
+
+    ENDP  ; |vpx_filter_block2d_bil_first_pass_media|
+
+
+; Second pass: filters adjacent 16-bit samples of the transposed buffer and stores bytes transposed back.
+; r0    unsigned short *src_ptr,
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vpx_filter
+;---------------------------------
+|vpx_filter_block2d_bil_second_pass_media| PROC
+    stmdb   sp!, {r4 - r11, lr}             ; push 9 words; the stack args now start at sp + 36
+
+    ldr     r11, [sp, #40]                  ; vpx_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
+    mov     r11, r1                         ; r11 = start of the current output column
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+    ldr     r6, [r0]                        ; load the data
+    ldr     r8, [r0, #4]
+    ldrh    r10, [r0, #8]
+    mov     lr, r3, lsr #2                  ; loop counter
+
+|bil_width_loop_2nd|
+    pkhtb   r7, r6, r8                      ; src[1] | src[2]
+    pkhtb   r9, r8, r10                     ; src[3] | src[4]
+
+    smuad   r6, r6, r5                      ; apply filter
+    smuad   r8, r8, r5                      ; apply filter
+
+    subs    lr, lr, #1                      ; sets the flags tested by ldrne/ldrneh/bne below
+
+    smuadx  r7, r7, r5                      ; apply filter
+    smuadx  r9, r9, r5                      ; apply filter
+
+    add     r0, r0, #8                      ; four 16-bit samples consumed
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #8, r6, asr #7
+    usat    r7, #8, r7, asr #7
+    strb    r6, [r1], r2                    ; the result is transposed back and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strb    r7, [r1], r2
+    add     r9, r9, #0x40
+    usat    r8, #8, r8, asr #7
+    usat    r9, #8, r9, asr #7
+    strb    r8, [r1], r2                    ; the result is transposed back and stored
+
+    ldrne   r6, [r0]                        ; load data
+    strb    r9, [r1], r2
+    ldrne   r8, [r0, #4]
+    ldrneh  r10, [r0, #8]
+
+    bne     bil_width_loop_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4                      ; update src for next row
+    add     r11, r11, #1                    ; next output column
+    mov     r1, r11
+
+    bne     bil_height_loop_2nd
+    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+    mov     lr, r3, lsr #2                  ; loop counter: four samples per iteration
+
+|bil_width_loop_null_2nd|
+    ldr     r6, [r0], #4                    ; load data
+    subs    lr, lr, #1
+    ldr     r8, [r0], #4
+
+    strb    r6, [r1], r2                    ; store data
+    mov     r7, r6, lsr #16                 ; upper halfword of the first word
+    strb    r7, [r1], r2
+    mov     r9, r8, lsr #16                 ; upper halfword of the second word
+    strb    r8, [r1], r2
+    strb    r9, [r1], r2
+
+    bne     bil_width_loop_null_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4                      ; update src for next row
+    add     r11, r11, #1                    ; next output column
+    mov     r1, r11
+
+    bne     bil_height_loop_null_2nd
+
+    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
+    ENDP  ; |vpx_filter_block2d_bil_second_pass_media|
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
new file mode 100644
index 0000000000..9f9de98d90
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
@@ -0,0 +1,220 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
+void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+  int i;  // pass counter
+  // 8x8 forward transform, two 1-D passes. Stage 1: load rows scaled by 4.
+  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+  for (i = 0; i < 2; ++i) {  // each pass ends with an 8x8 transpose
+    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
+    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
+    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+    // fdct4(step, step): even part, butterflies on the sums s0..s3.
+    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+    // fdct4(step, step): widen to 32 bits and apply the cosine constants.
+    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+    {  // rounding shift by DCT_CONST_BITS, narrowing back to 16 bits
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
+      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
+      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
+      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
+    }
+    // Stage 2: odd part, scale (s6 - s5) and (s6 + s5) by cospi_16_64.
+    v_x0 = vsubq_s16(v_s6, v_s5);
+    v_x1 = vaddq_s16(v_s6, v_s5);
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x8_t ab = vcombine_s16(a, b);
+      const int16x8_t cd = vcombine_s16(c, d);
+      // Stage 3: combine the scaled terms with s4 and s7.
+      v_x0 = vaddq_s16(v_s4, ab);
+      v_x1 = vsubq_s16(v_s4, ab);
+      v_x2 = vsubq_s16(v_s7, cd);
+      v_x3 = vaddq_s16(v_s7, cd);
+    }
+    // Stage 4: rotations that produce output rows 1, 5, 3 and 7.
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
+      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
+      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
+      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
+    }
+    // transpose 8x8; the result feeds the next pass (or the final scaling)
+    {
+      // 00 01 02 03 40 41 42 43
+      // 10 11 12 13 50 51 52 53
+      // 20 21 22 23 60 61 62 63
+      // 30 31 32 33 70 71 72 73
+      // 04 05 06 07 44 45 46 47
+      // 14 15 16 17 54 55 56 57
+      // 24 25 26 27 64 65 66 67
+      // 34 35 36 37 74 75 76 77
+      const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
+                                            vreinterpretq_s32_s16(out_2));
+      const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
+                                            vreinterpretq_s32_s16(out_3));
+      const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
+                                            vreinterpretq_s32_s16(out_6));
+      const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
+                                            vreinterpretq_s32_s16(out_7));
+      const int16x8x2_t r01_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+                    vreinterpretq_s16_s32(r13_s32.val[0]));
+      const int16x8x2_t r23_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+                    vreinterpretq_s16_s32(r13_s32.val[1]));
+      const int16x8x2_t r45_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+                    vreinterpretq_s16_s32(r57_s32.val[0]));
+      const int16x8x2_t r67_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+                    vreinterpretq_s16_s32(r57_s32.val[1]));
+      input_0 = r01_s16.val[0];
+      input_1 = r01_s16.val[1];
+      input_2 = r23_s16.val[0];
+      input_3 = r23_s16.val[1];
+      input_4 = r45_s16.val[0];
+      input_5 = r45_s16.val[1];
+      input_6 = r67_s16.val[0];
+      input_7 = r67_s16.val[1];
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }  // for
+  {
+    // from vpx_dct_sse2.c
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
+    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+    input_0 = vhsubq_s16(input_0, sign_in0);  // halving subtract
+    input_1 = vhsubq_s16(input_1, sign_in1);
+    input_2 = vhsubq_s16(input_2, sign_in2);
+    input_3 = vhsubq_s16(input_3, sign_in3);
+    input_4 = vhsubq_s16(input_4, sign_in4);
+    input_5 = vhsubq_s16(input_5, sign_in5);
+    input_6 = vhsubq_s16(input_6, sign_in6);
+    input_7 = vhsubq_s16(input_7, sign_in7);
+    // store results contiguously: output row pitch is 8, not 'stride'
+    vst1q_s16(&final_output[0 * 8], input_0);
+    vst1q_s16(&final_output[1 * 8], input_1);
+    vst1q_s16(&final_output[2 * 8], input_2);
+    vst1q_s16(&final_output[3 * 8], input_3);
+    vst1q_s16(&final_output[4 * 8], input_4);
+    vst1q_s16(&final_output[5 * 8], input_5);
+    vst1q_s16(&final_output[6 * 8], input_6);
+    vst1q_s16(&final_output[7 * 8], input_7);
+  }
+}
+
+void vpx_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
+  // DC-only 8x8 path: output[0] = sum of the 64 inputs (16-bit), output[1] = 0.
+  const int16_t *row = input;
+  int16x8_t col_sums = vld1q_s16(row);  // row 0 seeds the eight column sums
+  int rows_left;
+  for (rows_left = 7; rows_left != 0; --rows_left) {  // rows 1..7
+    col_sums = vaddq_s16(col_sums, vld1q_s16(row += stride));
+  }
+  {
+    const int64x2_t pairs = vpaddlq_s32(vpaddlq_s16(col_sums));
+    const int32x2_t dc = vadd_s32(vreinterpret_s32_s64(vget_low_s64(pairs)),
+                                  vreinterpret_s32_s64(vget_high_s64(pairs)));
+    output[0] = vget_lane_s16(vreinterpret_s16_s32(dc), 0);
+    output[1] = 0;
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
new file mode 100644
index 0000000000..dc459e20d9
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
@@ -0,0 +1,198 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_idct16x16_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;                                    int dest_stride)
+; Adds one value derived from input[0] to all 16x16 pixels at dest, clipped.
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|vpx_idct16x16_1_add_neon| PROC
+    ldrsh            r0, [r0]                  ; r0 = input[0], sign-extended
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest (r12 is the store pointer)
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 6)
+    add              r0, r0, #32               ; + (1 <<((6) - 1))
+    asr              r0, r0, #6                ; >> 6
+
+    vdup.s16         q0, r0                    ; duplicate a1
+    mov              r0, #8                    ; step from left to right 8 pixels
+    sub              r2, #8                    ; dest_stride - 8: on to the next row
+
+    ; load destination data row0 - row3
+    vld1.64          {d2}, [r1], r0            ; left 8 pixels of a row
+    vld1.64          {d3}, [r1], r2            ; right 8 pixels of that row
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0           ; store through the saved dest pointer
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row4 - row7
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row8 - row11
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row12 - row15
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |vpx_idct16x16_1_add_neon|
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
new file mode 100644
index 0000000000..f734e48027
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
@@ -0,0 +1,61 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+void vpx_idct16x16_1_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    // DC-only path: one offset a1 is derived from input[0] and added, with
+    // saturation to [0, 255], to every pixel of the 16x16 block at dest.
+    uint8x8_t d2u8, d3u8, d30u8, d31u8;
+    uint8x8_t d4u8, d5u8, d6u8, d7u8;
+    uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+    uint8_t *d1, *d2;  // d1 is the read cursor, d2 the write cursor
+    int16_t i, j, a1, cospi_16_64 = 11585;
+    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+    out = dct_const_round_shift(out * cospi_16_64);
+    a1 = ROUND_POWER_OF_TWO(out, 6);
+
+    q0u16 = vreinterpretq_u16_s16(vdupq_n_s16(a1));  // a1 in all 8 lanes
+
+    for (d1 = d2 = dest, i = 0; i < 4; i++) {
+        for (j = 0; j < 2; j++) {
+            // uint8_t loads/stores: dest has no 8-byte alignment guarantee.
+            d4u8 = vld1_u8(d1);
+            d5u8 = vld1_u8(d1 + 8);
+            d1 += dest_stride;
+            d6u8 = vld1_u8(d1);
+            d7u8 = vld1_u8(d1 + 8);
+            d1 += dest_stride;
+
+            q9u16 = vaddw_u8(q0u16, d4u8);  // dest[x] + a1, modulo 2^16
+            q10u16 = vaddw_u8(q0u16, d5u8);
+            q11u16 = vaddw_u8(q0u16, d6u8);
+            q12u16 = vaddw_u8(q0u16, d7u8);
+
+            d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));  // clip_pixel
+            d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+            d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+            d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+            vst1_u8(d2, d2u8);
+            vst1_u8(d2 + 8, d3u8);
+            d2 += dest_stride;
+            vst1_u8(d2, d30u8);
+            vst1_u8(d2 + 8, d31u8);
+            d2 += dest_stride;
+        }
+    }
+}
diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
new file mode 100644
index 0000000000..22a0c95941
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
@@ -0,0 +1,1179 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_idct16x16_256_add_neon_pass1|
+    EXPORT  |vpx_idct16x16_256_add_neon_pass2|
+    EXPORT  |vpx_idct16x16_10_add_neon_pass1|
+    EXPORT  |vpx_idct16x16_10_add_neon_pass2|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Transpose an 8x8 matrix of 16-bit elements in place; rows are in q8-q15.
+    MACRO
+    TRANSPOSE8X8
+    vswp            d17, d24                  ; swap high half of q8 with low half of q12
+    vswp            d23, d30                  ; swap high half of q11 with low half of q15
+    vswp            d21, d28                  ; swap high half of q10 with low half of q14
+    vswp            d19, d26                  ; swap high half of q9 with low half of q13
+    vtrn.32         q8, q10                   ; transpose 32-bit pairs, rows two apart
+    vtrn.32         q9, q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.16         q8, q9                    ; transpose 16-bit pairs, adjacent rows
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    MEND
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void |vpx_idct16x16_256_add_neon_pass1|(int16_t *input,
+;                                          int16_t *output, int output_stride)
+;
+; r0  int16_t *input       (read with post-increment, 8 loads of 32 bytes)
+; r1  int16_t *output      (16 stores of 8 bytes, post-incremented by r2)
+; r2  int  output_stride   (byte step added to r1 after each store)
+;
+; Runs the idct16 stages labelled 3 - 6 below on the even-indexed elements
+; loaded in q8-q15 and stores the resulting q8-q15 to output. r3, r12 and
+; q0-q7 are used as scratch and are not saved by this function.
+|vpx_idct16x16_256_add_neon_pass1| PROC
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!            ; even lanes -> q8, odd lanes -> q9
+    vld2.s16        {q9,q10}, [r0]!           ; q9 is overwritten with the next evens
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q1,q2}, [r0]!            ; last row is loaded via q1 ...
+    vmov.s16        q15, q1                   ; ... and then moved into q15
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0xc00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r12, #0x3e00
+    add             r12, #0xc5
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3
+    vdup.16         d0, r3                    ; duplicate cospi_28_64
+    vdup.16         d1, r12                   ; duplicate cospi_4_64
+
+    ; preloading to avoid stall
+    ; generate cospi_12_64 = 13623
+    mov             r3, #0x3500
+    add             r3, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r12, #0x2300
+    add             r12, #0x8e
+
+    ; step2[4] * cospi_28_64
+    vmull.s16       q2, d18, d0
+    vmull.s16       q3, d19, d0
+
+    ; step2[4] * cospi_4_64
+    vmull.s16       q5, d18, d1
+    vmull.s16       q6, d19, d1
+
+    ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64
+    vmlsl.s16       q2, d30, d1
+    vmlsl.s16       q3, d31, d1
+
+    ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64
+    vmlal.s16       q5, d30, d0
+    vmlal.s16       q6, d31, d0
+
+    vdup.16         d2, r3                    ; duplicate cospi_12_64
+    vdup.16         d3, r12                   ; duplicate cospi_20_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d8, q2, #14               ; >> 14
+    vqrshrn.s32     d9, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d14, q5, #14              ; >> 14
+    vqrshrn.s32     d15, q6, #14              ; >> 14
+
+    ; preloading to avoid stall
+    ; generate cospi_16_64 = 11585
+    mov             r3, #0x2d00
+    add             r3, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r12, #0x1800
+    add             r12, #0x7e
+
+    ; step2[5] * cospi_12_64
+    vmull.s16       q2, d26, d2
+    vmull.s16       q3, d27, d2
+
+    ; step2[5] * cospi_20_64
+    vmull.s16       q9, d26, d3
+    vmull.s16       q15, d27, d3
+
+    ; temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64
+    vmlsl.s16       q2, d22, d3
+    vmlsl.s16       q3, d23, d3
+
+    ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64
+    vmlal.s16       q9, d22, d2
+    vmlal.s16       q15, d23, d2
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d10, q2, #14              ; >> 14
+    vqrshrn.s32     d11, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q15, #14             ; >> 14
+
+    ; stage 4
+    vdup.16         d30, r3                   ; cospi_16_64
+
+    ; step1[0] * cospi_16_64
+    vmull.s16       q2, d16, d30
+    vmull.s16       q11, d17, d30
+
+    ; step1[1] * cospi_16_64
+    vmull.s16       q0, d24, d30
+    vmull.s16       q1, d25, d30
+
+    ; generate cospi_8_64 = 15137
+    mov             r3, #0x3b00
+    add             r3, #0x21
+
+    vdup.16         d30, r12                  ; duplicate cospi_24_64
+    vdup.16         d31, r3                   ; duplicate cospi_8_64
+
+    ; temp1 = (step1[0] + step1[1]) * cospi_16_64
+    vadd.s32        q3, q2, q0
+    vadd.s32        q12, q11, q1
+
+    ; temp2 = (step1[0] - step1[1]) * cospi_16_64
+    vsub.s32        q13, q2, q0
+    vsub.s32        q1, q11, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d16, q3, #14              ; >> 14
+    vqrshrn.s32     d17, q12, #14             ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d18, q13, #14             ; >> 14
+    vqrshrn.s32     d19, q1, #14              ; >> 14
+
+    ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+    ; step1[2] * cospi_8_64
+    vmull.s16       q0, d20, d31
+    vmull.s16       q1, d21, d31
+
+    ; step1[2] * cospi_24_64
+    vmull.s16       q12, d20, d30
+    vmull.s16       q13, d21, d30
+
+    ; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64
+    vmlal.s16       q0, d28, d30
+    vmlal.s16       q1, d29, d30
+
+    ; temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64
+    vmlsl.s16       q12, d28, d31
+    vmlsl.s16       q13, d29, d31
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d22, q0, #14              ; >> 14
+    vqrshrn.s32     d23, q1, #14              ; >> 14
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d20, q12, #14             ; >> 14
+    vqrshrn.s32     d21, q13, #14             ; >> 14
+
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5];
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5];
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7];
+    vadd.s16        q15, q6, q7               ; step2[7] = step1[6] + step1[7];
+
+    ; generate cospi_16_64 = 11585
+    mov             r3, #0x2d00
+    add             r3, #0x41
+
+    ; stage 5
+    vadd.s16        q0, q8, q11               ; step1[0] = step2[0] + step2[3];
+    vadd.s16        q1, q9, q10               ; step1[1] = step2[1] + step2[2];
+    vsub.s16        q2, q9, q10               ; step1[2] = step2[1] - step2[2];
+    vsub.s16        q3, q8, q11               ; step1[3] = step2[0] - step2[3];
+
+    vdup.16         d16, r3;                  ; duplicate cospi_16_64
+
+    ; step2[5] * cospi_16_64
+    vmull.s16       q11, d26, d16
+    vmull.s16       q12, d27, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
+    vsub.s32        q6, q9, q11
+    vsub.s32        q13, q10, q12
+
+    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
+    vadd.s32        q9, q9, q11
+    vadd.s32        q10, q10, q12
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d10, q6, #14              ; >> 14
+    vqrshrn.s32     d11, q13, #14             ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q10, #14             ; >> 14
+
+    ; stage 6
+    vadd.s16        q8, q0, q15                ; step2[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; step2[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; step2[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; step2[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; step2[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; step2[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; step2[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q15              ; step2[7] = step1[0] - step1[7];
+
+    ; store q8-q15 one d register (8 bytes) at a time, r1 advancing by r2
+    vst1.64         {d16}, [r1], r2
+    vst1.64         {d17}, [r1], r2
+    vst1.64         {d18}, [r1], r2
+    vst1.64         {d19}, [r1], r2
+    vst1.64         {d20}, [r1], r2
+    vst1.64         {d21}, [r1], r2
+    vst1.64         {d22}, [r1], r2
+    vst1.64         {d23}, [r1], r2
+    vst1.64         {d24}, [r1], r2
+    vst1.64         {d25}, [r1], r2
+    vst1.64         {d26}, [r1], r2
+    vst1.64         {d27}, [r1], r2
+    vst1.64         {d28}, [r1], r2
+    vst1.64         {d29}, [r1], r2
+    vst1.64         {d30}, [r1], r2
+    vst1.64         {d31}, [r1], r2
+
+    bx              lr
+    ENDP  ; |vpx_idct16x16_256_add_neon_pass1|
+
+;void vpx_idct16x16_256_add_neon_pass2(int16_t *src,
+;                                        int16_t *output,
+;                                        int16_t *pass1Output,
+;                                        int16_t skip_adding,
+;                                        uint8_t *dest,
+;                                        int dest_stride)
+;
+; r0        int16_t *src          (read with post-increment, 8 x 32 bytes)
+; r1        int16_t *output       (written only when skip_adding == 0)
+; r2        int16_t *pass1Output  (8 rows of 8 int16_t, read 16 bytes apart)
+; r3        int16_t skip_adding   (0: store to output, else: add into dest)
+; [sp]      uint8_t *dest         (stack argument, read when skip_adding != 0)
+; [sp, #4]  int dest_stride       (stack argument, read when skip_adding != 0)
+;
+; Runs the second half of idct16 (up to stage 7) on the elements loaded in
+; q8-q15 and combines it with the pass1 results. q0-q15 and r12 are used as
+; scratch; r3-r9 are pushed on entry and popped on exit.
+|vpx_idct16x16_256_add_neon_pass2| PROC
+    push            {r3-r9}                   ; 28 bytes; saved r3 is at [sp]
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
+    ; NOTE(review): same load pattern as pass1; presumably src is offset by one
+    vld2.s16        {q8,q9}, [r0]!
+    vld2.s16        {q9,q10}, [r0]!
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q0,q1}, [r0]!
+    vmov.s16        q15, q0;
+
+    ; generate  cospi_30_64 = 1606
+    mov             r3, #0x0600
+    add             r3, #0x46
+
+    ; generate cospi_2_64  = 16305
+    mov             r12, #0x3f00
+    add             r12, #0xb1
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3
+    vdup.16         d12, r3                   ; duplicate cospi_30_64
+    vdup.16         d13, r12                  ; duplicate cospi_2_64
+
+    ; preloading to avoid stall
+    ; generate cospi_14_64 = 12665
+    mov             r3, #0x3100
+    add             r3, #0x79
+
+    ; generate cospi_18_64 = 10394
+    mov             r12, #0x2800
+    add             r12, #0x9a
+
+    ; step1[8] * cospi_30_64
+    vmull.s16       q2, d16, d12
+    vmull.s16       q3, d17, d12
+
+    ; step1[8] * cospi_2_64
+    vmull.s16       q1, d16, d13
+    vmull.s16       q4, d17, d13
+
+    ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64
+    vmlsl.s16       q2, d30, d13
+    vmlsl.s16       q3, d31, d13
+
+    ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64
+    vmlal.s16       q1, d30, d12
+    vmlal.s16       q4, d31, d12
+
+    vdup.16         d30, r3                   ; duplicate cospi_14_64
+    vdup.16         d31, r12                  ; duplicate cospi_18_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d0, q2, #14               ; >> 14
+    vqrshrn.s32     d1, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d14, q1, #14              ; >> 14
+    vqrshrn.s32     d15, q4, #14              ; >> 14
+
+    ; preloading to avoid stall
+    ; generate cospi_22_64 = 7723
+    mov             r3, #0x1e00
+    add             r3, #0x2b
+
+    ; generate cospi_10_64 = 14449
+    mov             r12, #0x3800
+    add             r12, #0x71
+
+    ; step1[9] * cospi_14_64
+    vmull.s16       q2, d24, d30
+    vmull.s16       q3, d25, d30
+
+    ; step1[9] * cospi_18_64
+    vmull.s16       q4, d24, d31
+    vmull.s16       q5, d25, d31
+
+    ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64
+    vmlsl.s16       q2, d22, d31
+    vmlsl.s16       q3, d23, d31
+
+    ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64
+    vmlal.s16       q4, d22, d30
+    vmlal.s16       q5, d23, d30
+
+    vdup.16         d30, r3                   ; duplicate cospi_22_64
+    vdup.16         d31, r12                  ; duplicate cospi_10_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d2, q2, #14               ; >> 14
+    vqrshrn.s32     d3, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q4, #14              ; >> 14
+    vqrshrn.s32     d13, q5, #14              ; >> 14
+
+    ; step1[10] * cospi_22_64
+    vmull.s16       q11, d20, d30
+    vmull.s16       q12, d21, d30
+
+    ; step1[10] * cospi_10_64
+    vmull.s16       q4, d20, d31
+    vmull.s16       q5, d21, d31
+
+    ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64
+    vmlsl.s16       q11, d26, d31
+    vmlsl.s16       q12, d27, d31
+
+    ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64
+    vmlal.s16       q4, d26, d30
+    vmlal.s16       q5, d27, d30
+
+    ; preloading to avoid stall
+    ; generate cospi_6_64 = 15679
+    mov             r3, #0x3d00
+    add             r3, #0x3f
+
+    ; generate cospi_26_64 = 4756
+    mov             r12, #0x1200
+    add             r12, #0x94
+
+    vdup.16         d30, r3                   ; duplicate cospi_6_64
+    vdup.16         d31, r12                  ; duplicate cospi_26_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q11, #14              ; >> 14
+    vqrshrn.s32     d5, q12, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d11, q5, #14              ; >> 14
+    vqrshrn.s32     d10, q4, #14              ; >> 14
+
+    ; step1[11] * cospi_6_64
+    vmull.s16       q10, d28, d30
+    vmull.s16       q11, d29, d30
+
+    ; step1[11] * cospi_26_64
+    vmull.s16       q12, d28, d31
+    vmull.s16       q13, d29, d31
+
+    ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64
+    vmlsl.s16       q10, d18, d31
+    vmlsl.s16       q11, d19, d31
+
+    ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64
+    vmlal.s16       q12, d18, d30
+    vmlal.s16       q13, d19, d30
+
+    vsub.s16        q9, q0, q1                ; step1[9]=step2[8]-step2[9]
+    vadd.s16        q0, q0, q1                ; step1[8]=step2[8]+step2[9]
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d6, q10, #14              ; >> 14
+    vqrshrn.s32     d7, q11, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d8, q12, #14              ; >> 14
+    vqrshrn.s32     d9, q13, #14              ; >> 14
+
+    ; stage 3
+    vsub.s16        q10, q3, q2               ; step1[10]=-step2[10]+step2[11]
+    vadd.s16        q11, q2, q3               ; step1[11]=step2[10]+step2[11]
+    vadd.s16        q12, q4, q5               ; step1[12]=step2[12]+step2[13]
+    vsub.s16        q13, q4, q5               ; step1[13]=step2[12]-step2[13]
+    vsub.s16        q14, q7, q6               ; step1[14]=-step2[14]+step2[15]
+    vadd.s16        q7, q6, q7                ; step1[15]=step2[14]+step2[15]
+
+    ; stage 4
+    ; generate cospi_24_64 = 6270
+    mov             r3, #0x1800
+    add             r3, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r12, #0x3b00
+    add             r12, #0x21
+
+    ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+    vdup.16         d30, r12                  ; duplicate cospi_8_64
+    vdup.16         d31, r3                   ; duplicate cospi_24_64
+
+    ; step1[9] * cospi_24_64
+    vmull.s16       q2, d18, d31
+    vmull.s16       q3, d19, d31
+
+    ; step1[14] * cospi_24_64
+    vmull.s16       q4, d28, d31
+    vmull.s16       q5, d29, d31
+
+    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
+    vmlal.s16       q2, d28, d30
+    vmlal.s16       q3, d29, d30
+
+    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+    vmlsl.s16       q4, d18, d30
+    vmlsl.s16       q5, d19, d30
+
+    rsb             r12, #0                   ; r12 = -cospi_8_64
+    vdup.16         d30, r12                  ; duplicate -cospi_8_64
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q2, #14              ; >> 14
+    vqrshrn.s32     d13, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d2, q4, #14               ; >> 14
+    vqrshrn.s32     d3, q5, #14               ; >> 14
+
+    vmov.s16        q3, q11                   ; keep step1[11]; q11 is reused below
+    vmov.s16        q4, q12                   ; keep step1[12]; q12 is reused below
+
+    ; - step1[13] * cospi_8_64
+    vmull.s16       q11, d26, d30
+    vmull.s16       q12, d27, d30
+
+    ; -step1[10] * cospi_8_64
+    vmull.s16       q8, d20, d30
+    vmull.s16       q9, d21, d30
+
+    ; temp2 = -step1[13] * cospi_8_64 - step1[10] * cospi_24_64
+    vmlsl.s16       q11, d20, d31
+    vmlsl.s16       q12, d21, d31
+
+    ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
+    vmlal.s16       q8, d26, d31
+    vmlal.s16       q9, d27, d31
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d4, q11, #14              ; >> 14
+    vqrshrn.s32     d5, q12, #14              ; >> 14
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d10, q8, #14              ; >> 14
+    vqrshrn.s32     d11, q9, #14              ; >> 14
+
+    ; stage 5
+    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
+    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
+    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
+    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
+    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
+    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
+    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
+    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
+
+    ; stage 6.
+    ; generate cospi_16_64 = 11585
+    mov             r12, #0x2d00
+    add             r12, #0x41
+
+    vdup.16         d14, r12                  ; duplicate cospi_16_64
+
+    ; step1[13] * cospi_16_64
+    vmull.s16       q3, d26, d14
+    vmull.s16       q4, d27, d14
+
+    ; step1[10] * cospi_16_64
+    vmull.s16       q0, d20, d14
+    vmull.s16       q1, d21, d14
+
+    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
+    vsub.s32        q5, q3, q0
+    vsub.s32        q6, q4, q1
+
+    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
+    vadd.s32        q10, q3, q0
+    vadd.s32        q4, q4, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q5, #14               ; >> 14
+    vqrshrn.s32     d5, q6, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d10, q10, #14             ; >> 14
+    vqrshrn.s32     d11, q4, #14              ; >> 14
+
+    ; step1[11] * cospi_16_64
+    vmull.s16       q0, d22, d14
+    vmull.s16       q1, d23, d14
+
+    ; step1[12] * cospi_16_64
+    vmull.s16       q13, d24, d14
+    vmull.s16       q6, d25, d14
+
+    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
+    vsub.s32        q10, q13, q0
+    vsub.s32        q4, q6, q1
+
+    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
+    vadd.s32        q13, q13, q0
+    vadd.s32        q6, q6, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d6, q10, #14              ; >> 14
+    vqrshrn.s32     d7, q4, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d8, q13, #14              ; >> 14
+    vqrshrn.s32     d9, q6, #14               ; >> 14
+
+    mov              r4, #16                  ; pass1Output stride
+    ldr              r3, [sp]                 ; load skip_adding
+    cmp              r3, #0                   ; check if need adding dest data
+    beq              skip_adding_dest         ; skip_adding == 0: write to output
+
+    ldr              r7, [sp, #28]            ; dest used to save element 0-7
+    mov              r9, r7                   ; save dest pointer for later use
+    ldr              r8, [sp, #32]            ; load dest_stride
+
+    ; stage 7
+    ; load the data in pass1
+    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
+    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q12                  ; clip pixel
+    vqmovun.s16     d13, q13                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
+    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
+    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q12                  ; clip pixel
+    vqmovun.s16     d13, q13                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
+    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
+    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
+    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q12                  ; clip pixel
+    vqmovun.s16     d13, q13                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
+    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
+    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
+    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
+    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
+    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
+    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q12                  ; clip pixel
+    vqmovun.s16     d13, q13                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
+    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
+
+    ; store the data  output 8,9,10,11,12,13,14,15
+    vrshr.s16       q8, q8, #6                ; ROUND_POWER_OF_TWO
+    vaddw.u8        q8, q8, d12               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q8                   ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vrshr.s16       q9, q9, #6
+    vaddw.u8        q9, q9, d13               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d13, q9                   ; clip pixel
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vrshr.s16       q2, q2, #6
+    vaddw.u8        q2, q2, d12               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q2                   ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vrshr.s16       q3, q3, #6
+    vaddw.u8        q3, q3, d13               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d13, q3                   ; clip pixel
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vld1.64         {d13}, [r7], r8           ; load destination data
+    vrshr.s16       q4, q4, #6
+    vaddw.u8        q4, q4, d12               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q4                   ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    vld1.64         {d12}, [r7], r8           ; load destination data
+    vrshr.s16       q5, q5, #6
+    vaddw.u8        q5, q5, d13               ; + dest[j * dest_stride + i]
+    vqmovun.s16     d13, q5                   ; clip pixel
+    vst1.64         {d13}, [r9], r8           ; store the data
+    vld1.64         {d13}, [r7], r8           ; load destination data (row 15, last)
+    vrshr.s16       q14, q14, #6
+    vaddw.u8        q14, q14, d12             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d12, q14                  ; clip pixel
+    vst1.64         {d12}, [r9], r8           ; store the data
+    ; no load here: all 16 dest rows are already read, a 17th would be past the block
+    vrshr.s16       q15, q15, #6
+    vaddw.u8        q15, q15, d13             ; + dest[j * dest_stride + i]
+    vqmovun.s16     d13, q15                  ; clip pixel
+    vst1.64         {d13}, [r9], r8           ; store the data
+    b               end_idct16x16_pass2
+
+skip_adding_dest
+    ; stage 7
+    ; load the data in pass1
+    mov              r5, #24                  ; 8 + 24 = 32 bytes per output row
+    mov              r3, #8                   ; step between the two row halves
+
+    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
+    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
+    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
+    vst1.64         {d24}, [r1], r3           ; store output[0]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[1]
+    vst1.64         {d27}, [r1], r5
+    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
+    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
+    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
+    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
+    vst1.64         {d24}, [r1], r3           ; store output[2]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[3]
+    vst1.64         {d27}, [r1], r5
+    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
+    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
+    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
+    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
+    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
+    vst1.64         {d24}, [r1], r3           ; store output[4]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[5]
+    vst1.64         {d27}, [r1], r5
+    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
+    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
+    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
+    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
+    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
+    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
+    vst1.64         {d24}, [r1], r3           ; store output[6]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[7]
+    vst1.64         {d27}, [r1], r5
+
+    ; store the data  output 8,9,10,11,12,13,14,15
+    vst1.64         {d16}, [r1], r3
+    vst1.64         {d17}, [r1], r5
+    vst1.64         {d18}, [r1], r3
+    vst1.64         {d19}, [r1], r5
+    vst1.64         {d4}, [r1], r3
+    vst1.64         {d5}, [r1], r5
+    vst1.64         {d6}, [r1], r3
+    vst1.64         {d7}, [r1], r5
+    vst1.64         {d8}, [r1], r3
+    vst1.64         {d9}, [r1], r5
+    vst1.64         {d10}, [r1], r3
+    vst1.64         {d11}, [r1], r5
+    vst1.64         {d28}, [r1], r3
+    vst1.64         {d29}, [r1], r5
+    vst1.64         {d30}, [r1], r3
+    vst1.64         {d31}, [r1], r5
+end_idct16x16_pass2
+    pop             {r3-r9}
+    bx              lr
+    ENDP  ; |vpx_idct16x16_256_add_neon_pass2|
+
+;void |vpx_idct16x16_10_add_neon_pass1|(int16_t *input,
+;                                             int16_t *output, int output_stride)
+;
+; r0  int16_t *input
+; r1  int16_t *output
+; r2  int  output_stride (added to r1 after each 8-byte store)
+
+; idct16 stage1 - stage6 on the elements loaded in q8-q15. After the transpose
+; only q8 and q9 are read; the eight results end up in q2 and q9-q15 and are
+; stored to |output|. q0-q7 are used as scratch during the calculation.
+|vpx_idct16x16_10_add_neon_pass1| PROC
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!            ; q8 <- even-indexed halfwords
+    vld2.s16        {q9,q10}, [r0]!           ; odd half in q9 is overwritten
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q1,q2}, [r0]!            ; last pair lands in scratch
+    vmov.s16        q15, q1                   ; keep only the even half
+
+    ; generate  cospi_28_64*2 = 6392
+    mov             r3, #0x1800
+    add             r3, #0xf8
+
+    ; generate cospi_4_64*2  = 32138
+    mov             r12, #0x7d00
+    add             r12, #0x8a
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3 (from here on only q8 and q9 of the transposed data are read)
+    vdup.16         q0, r3                    ; duplicate cospi_28_64*2
+    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
+
+    ; The following instructions use vqrdmulh to do the
+    ; dct_const_round_shift(step2[4] * cospi_28_64). vqrdmulh will multiply,
+    ; double, and return the high 16 bits, effectively giving >> 15. Doubling
+    ; the constant will change this to >> 14.
+    ; dct_const_round_shift(step2[4] * cospi_28_64);
+    vqrdmulh.s16    q4, q9, q0
+
+    ; preloading to avoid stall
+    ; generate cospi_16_64*2 = 23170
+    mov             r3, #0x5a00
+    add             r3, #0x82
+
+    ; dct_const_round_shift(step2[4] * cospi_4_64);
+    vqrdmulh.s16    q7, q9, q1
+
+    ; stage 4
+    vdup.16         q1, r3                    ; cospi_16_64*2
+
+    ; generate cospi_16_64 = 11585
+    mov             r3, #0x2d00
+    add             r3, #0x41
+
+    vdup.16         d4, r3;                   ; duplicate cospi_16_64
+
+    ; dct_const_round_shift(step1[0] * cospi_16_64)
+    vqrdmulh.s16    q8, q8, q1
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d14, d4               ; low half of q7, 32-bit product
+    vmull.s16       q10, d15, d4              ; high half of q7
+
+    ; step2[5] * cospi_16_64
+    vmull.s16       q12, d9, d4               ; high half of q4
+    vmull.s16       q11, d8, d4               ; low half of q4
+
+    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
+    vsub.s32        q15, q10, q12             ; high half
+    vsub.s32        q6, q9, q11               ; low half
+
+    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
+    vadd.s32        q9, q9, q11               ; low half
+    vadd.s32        q10, q10, q12             ; high half
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d11, q15, #14             ; >> 14
+    vqrshrn.s32     d10, q6, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q10, #14             ; >> 14
+
+    ; stage 6 (q8 supplies step1[0], step1[1], step1[2] and step1[3] alike)
+    vadd.s16        q2, q8, q7                ; step2[0] = step1[0] + step1[7];
+    vadd.s16        q10, q8, q5               ; step2[2] = step1[2] + step1[5];
+    vadd.s16        q11, q8, q4               ; step2[3] = step1[3] + step1[4];
+    vadd.s16        q9, q8, q6                ; step2[1] = step1[1] + step1[6];
+    vsub.s16        q12, q8, q4               ; step2[4] = step1[3] - step1[4];
+    vsub.s16        q13, q8, q5               ; step2[5] = step1[2] - step1[5];
+    vsub.s16        q14, q8, q6               ; step2[6] = step1[1] - step1[6];
+    vsub.s16        q15, q8, q7               ; step2[7] = step1[0] - step1[7];
+
+    ; store the data: q2, q9-q15 as 16 d-register halves, r1 += r2 after each
+    vst1.64         {d4}, [r1], r2
+    vst1.64         {d5}, [r1], r2
+    vst1.64         {d18}, [r1], r2
+    vst1.64         {d19}, [r1], r2
+    vst1.64         {d20}, [r1], r2
+    vst1.64         {d21}, [r1], r2
+    vst1.64         {d22}, [r1], r2
+    vst1.64         {d23}, [r1], r2
+    vst1.64         {d24}, [r1], r2
+    vst1.64         {d25}, [r1], r2
+    vst1.64         {d26}, [r1], r2
+    vst1.64         {d27}, [r1], r2
+    vst1.64         {d28}, [r1], r2
+    vst1.64         {d29}, [r1], r2
+    vst1.64         {d30}, [r1], r2
+    vst1.64         {d31}, [r1], r2
+
+    bx              lr
+    ENDP  ; |vpx_idct16x16_10_add_neon_pass1|
+
+;void vpx_idct16x16_10_add_neon_pass2(int16_t *src,
+;                                           int16_t *output,
+;                                           int16_t *pass1Output,
+;                                           int16_t skip_adding,
+;                                           uint8_t *dest,
+;                                           int dest_stride)
+;
+; r0  int16_t *src
+; r1  int16_t *output,
+; r2  int16_t *pass1Output,
+; r3  int16_t skip_adding (never consulted: r3 is reused as scratch),
+; r4  uint8_t *dest       (not read here: r4 is overwritten with a constant),
+; r5  int dest_stride     (not read here: r5 is overwritten with a constant)
+
+; idct16 stage1 - stage7 on the elements loaded in q8-q15. After the transpose
+; only q8 and q9 are read. Stage 7 combines the results with |pass1Output| and
+; stores all 16 outputs to |output|. q0-q7 are used as scratch.
+|vpx_idct16x16_10_add_neon_pass2| PROC
+    push            {r3-r9}
+
+    ; TODO(hkuang): Find a better way to load the elements.
+    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
+    vld2.s16        {q8,q9}, [r0]!            ; q8 <- even-indexed halfwords
+    vld2.s16        {q9,q10}, [r0]!           ; odd half in q9 is overwritten
+    vld2.s16        {q10,q11}, [r0]!
+    vld2.s16        {q11,q12}, [r0]!
+    vld2.s16        {q12,q13}, [r0]!
+    vld2.s16        {q13,q14}, [r0]!
+    vld2.s16        {q14,q15}, [r0]!
+    vld2.s16        {q0,q1}, [r0]!            ; last pair lands in scratch
+    vmov.s16        q15, q0;                  ; keep only the even half
+
+    ; generate 2*cospi_30_64 = 3212
+    mov             r3, #0xc00
+    add             r3, #0x8c
+
+    ; generate 2*cospi_2_64  = 32610
+    mov             r12, #0x7f00
+    add             r12, #0x62
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; stage 3 (from here on only q8 and q9 of the transposed data are read)
+    vdup.16         q6, r3                    ; duplicate 2*cospi_30_64
+
+    ; dct_const_round_shift(step1[8] * cospi_30_64)
+    vqrdmulh.s16    q0, q8, q6
+
+    vdup.16         q6, r12                   ; duplicate 2*cospi_2_64
+
+    ; dct_const_round_shift(step1[8] * cospi_2_64)
+    vqrdmulh.s16    q7, q8, q6
+
+    ; preloading to avoid stall
+    ; generate 2*cospi_26_64 = 9512
+    mov             r12, #0x2500
+    add             r12, #0x28
+    rsb             r12, #0                   ; r12 = -r12
+    vdup.16         q15, r12                  ; duplicate -2*cospi_26_64
+
+    ; generate 2*cospi_6_64 = 31358
+    mov             r3, #0x7a00
+    add             r3, #0x7e
+    vdup.16         q14, r3                   ; duplicate 2*cospi_6_64
+
+    ; dct_const_round_shift(- step1[12] * cospi_26_64)
+    vqrdmulh.s16    q3, q9, q15
+
+    ; dct_const_round_shift(step1[12] * cospi_6_64)
+    vqrdmulh.s16    q4, q9, q14
+
+    ; stage 4
+    ; generate cospi_24_64 = 6270
+    mov             r3, #0x1800
+    add             r3, #0x7e
+    vdup.16         d31, r3                   ; duplicate cospi_24_64
+
+    ; generate cospi_8_64 = 15137
+    mov             r12, #0x3b00
+    add             r12, #0x21
+    vdup.16         d30, r12                  ; duplicate cospi_8_64
+
+    ; step1[14] * cospi_24_64
+    vmull.s16       q12, d14, d31
+    vmull.s16       q5, d15, d31
+
+    ; step1[9] * cospi_24_64
+    vmull.s16       q2, d0, d31
+    vmull.s16       q11, d1, d31
+
+    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
+    vmlsl.s16       q12, d0, d30
+    vmlsl.s16       q5, d1, d30
+
+    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
+    vmlal.s16       q2, d14, d30
+    vmlal.s16       q11, d15, d30
+
+    rsb              r12, #0                  ; r12 = -cospi_8_64
+    vdup.16          d30, r12                 ; duplicate -cospi_8_64
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d2, q12, #14              ; >> 14
+    vqrshrn.s32     d3, q5, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d12, q2, #14              ; >> 14
+    vqrshrn.s32     d13, q11, #14             ; >> 14
+
+    ; - step1[13] * cospi_8_64
+    vmull.s16       q10, d8, d30
+    vmull.s16       q13, d9, d30
+
+    ; -step1[10] * cospi_8_64
+    vmull.s16       q8, d6, d30
+    vmull.s16       q9, d7, d30
+
+    ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
+    vmlsl.s16       q10, d6, d31
+    vmlsl.s16       q13, d7, d31
+
+    ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
+    vmlal.s16       q8, d8, d31
+    vmlal.s16       q9, d9, d31
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q10, #14              ; >> 14
+    vqrshrn.s32     d5, q13, #14              ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d10, q8, #14              ; >> 14
+    vqrshrn.s32     d11, q9, #14              ; >> 14
+
+    ; stage 5
+    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
+    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
+    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
+    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
+    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
+    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
+    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
+    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
+
+    ; stage 6.
+    ; generate cospi_16_64 = 11585
+    mov             r12, #0x2d00
+    add             r12, #0x41
+
+    vdup.16         d14, r12                  ; duplicate cospi_16_64
+
+    ; step1[13] * cospi_16_64
+    vmull.s16       q3, d26, d14
+    vmull.s16       q4, d27, d14
+
+    ; step1[10] * cospi_16_64
+    vmull.s16       q0, d20, d14
+    vmull.s16       q1, d21, d14
+
+    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
+    vsub.s32        q5, q3, q0
+    vsub.s32        q6, q4, q1
+
+    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
+    vadd.s32        q0, q3, q0
+    vadd.s32        q1, q4, q1
+
+    ; dct_const_round_shift(temp1)
+    vqrshrn.s32     d4, q5, #14               ; >> 14
+    vqrshrn.s32     d5, q6, #14               ; >> 14
+
+    ; dct_const_round_shift(temp2)
+    vqrshrn.s32     d10, q0, #14              ; >> 14
+    vqrshrn.s32     d11, q1, #14              ; >> 14
+
+    ; step1[11] * cospi_16_64
+    vmull.s16       q0, d22, d14
+    vmull.s16       q1, d23, d14
+
+    ; step1[12] * cospi_16_64
+    vmull.s16       q13, d24, d14
+    vmull.s16       q6, d25, d14
+
+    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
+    vsub.s32        q10, q13, q0
+    vsub.s32        q4, q6, q1
+
+    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
+    vadd.s32        q13, q13, q0
+    vadd.s32        q6, q6, q1
+
+    ; dct_const_round_shift((-step1[11] + step1[12]) * cospi_16_64)
+    vqrshrn.s32     d6, q10, #14              ; >> 14
+    vqrshrn.s32     d7, q4, #14               ; >> 14
+
+    ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64);
+    vqrshrn.s32     d8, q13, #14              ; >> 14
+    vqrshrn.s32     d9, q6, #14               ; >> 14
+
+    mov              r4, #16                  ; pass1Output stride
+    ldr              r3, [sp]                 ; load skip_adding (dead: r3 is overwritten below)
+
+    ; stage 7
+    ; load the data in pass1
+    mov              r5, #24                  ; r1 step after the 2nd half of a result
+    mov              r3, #8                   ; r1 step after the 1st half of a result
+
+    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
+    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
+    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
+    vst1.64         {d24}, [r1], r3           ; store output[0]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[1]
+    vst1.64         {d27}, [r1], r5
+    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
+    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
+    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
+    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
+    vst1.64         {d24}, [r1], r3           ; store output[2]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[3]
+    vst1.64         {d27}, [r1], r5
+    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
+    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
+    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
+    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
+    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
+    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
+    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
+    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
+    vst1.64         {d24}, [r1], r3           ; store output[4]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[5]
+    vst1.64         {d27}, [r1], r5
+    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
+    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
+    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
+    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
+    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
+    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
+    vst1.64         {d24}, [r1], r3           ; store output[6]
+    vst1.64         {d25}, [r1], r5
+    vst1.64         {d26}, [r1], r3           ; store output[7]
+    vst1.64         {d27}, [r1], r5
+
+    ; store the data  output 8,9,10,11,12,13,14,15 (q8, q9, q2, q3, q4, q5, q14, q15)
+    vst1.64         {d16}, [r1], r3
+    vst1.64         {d17}, [r1], r5
+    vst1.64         {d18}, [r1], r3
+    vst1.64         {d19}, [r1], r5
+    vst1.64         {d4}, [r1], r3
+    vst1.64         {d5}, [r1], r5
+    vst1.64         {d6}, [r1], r3
+    vst1.64         {d7}, [r1], r5
+    vst1.64         {d8}, [r1], r3
+    vst1.64         {d9}, [r1], r5
+    vst1.64         {d10}, [r1], r3
+    vst1.64         {d11}, [r1], r5
+    vst1.64         {d28}, [r1], r3
+    vst1.64         {d29}, [r1], r5
+    vst1.64         {d30}, [r1], r3
+    vst1.64         {d31}, [r1], r5
+end_idct10_16x16_pass2
+    pop             {r3-r9}
+    bx              lr
+    ENDP  ; |vpx_idct16x16_10_add_neon_pass2|
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
new file mode 100644
index 0000000000..651ebb21f9
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
@@ -0,0 +1,1317 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void TRANSPOSE8X8(  // in-place transpose of an 8x8 int16 matrix
+        int16x8_t *q8s16,   // in: row 0, out: column 0
+        int16x8_t *q9s16,   // in: row 1, out: column 1
+        int16x8_t *q10s16,  // in: row 2, out: column 2
+        int16x8_t *q11s16,  // in: row 3, out: column 3
+        int16x8_t *q12s16,  // in: row 4, out: column 4
+        int16x8_t *q13s16,  // in: row 5, out: column 5
+        int16x8_t *q14s16,  // in: row 6, out: column 6
+        int16x8_t *q15s16) {  // in: row 7, out: column 7
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+    d16s16 = vget_low_s16(*q8s16);  // split every row into 4-lane halves
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
+    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
+    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+    *q12s16 = vcombine_s16(d17s16, d25s16);  // high halves of rows 0 and 4
+    *q13s16 = vcombine_s16(d19s16, d27s16);  // high halves of rows 1 and 5
+    *q14s16 = vcombine_s16(d21s16, d29s16);  // high halves of rows 2 and 6
+    *q15s16 = vcombine_s16(d23s16, d31s16);  // high halves of rows 3 and 7
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),  // swap 32-bit pairs
+                        vreinterpretq_s32_s16(*q10s16));
+    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+                        vreinterpretq_s32_s16(*q11s16));
+    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+                        vreinterpretq_s32_s16(*q14s16));
+    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+                        vreinterpretq_s32_s16(*q15s16));
+
+    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+    *q8s16  = q0x2s16.val[0];  // write the transposed rows back to the caller
+    *q9s16  = q0x2s16.val[1];
+    *q10s16 = q1x2s16.val[0];
+    *q11s16 = q1x2s16.val[1];
+    *q12s16 = q2x2s16.val[0];
+    *q13s16 = q2x2s16.val[1];
+    *q14s16 = q3x2s16.val[0];
+    *q15s16 = q3x2s16.val[1];
+    return;
+}
+
+void vpx_idct16x16_256_add_neon_pass1(  // stages 3-6 on even-indexed inputs
+        int16_t *in,           // read as 8 groups of 16 int16_t values
+        int16_t *out,          // receives 16 stores of 4 int16_t each
+        int output_stride) {   // distance between stores, halved below
+    int16x4_t d0s16, d1s16, d2s16, d3s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+    int16x8x2_t q0x2s16;
+
+    q0x2s16 = vld2q_s16(in);  // de-interleaving load of 16 values
+    q8s16  = q0x2s16.val[0];  // keep the even-indexed ones, drop val[1]
+    in += 16;                 // step to the next group of 16 values
+    q0x2s16 = vld2q_s16(in);
+    q9s16  = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q10s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q11s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q12s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q13s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q14s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q15s16 = q0x2s16.val[0];
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    d16s16 = vget_low_s16(q8s16);  // 4-lane halves feed the widening multiplies
+    d17s16 = vget_high_s16(q8s16);
+    d18s16 = vget_low_s16(q9s16);
+    d19s16 = vget_high_s16(q9s16);
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d22s16 = vget_low_s16(q11s16);
+    d23s16 = vget_high_s16(q11s16);
+    d24s16 = vget_low_s16(q12s16);
+    d25s16 = vget_high_s16(q12s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+    d30s16 = vget_low_s16(q15s16);
+    d31s16 = vget_high_s16(q15s16);
+
+    // stage 3
+    d0s16 = vdup_n_s16(cospi_28_64);
+    d1s16 = vdup_n_s16(cospi_4_64);
+
+    q2s32 = vmull_s16(d18s16, d0s16);  // q9 * cospi_28_64 (low half)
+    q3s32 = vmull_s16(d19s16, d0s16);  // q9 * cospi_28_64 (high half)
+    q5s32 = vmull_s16(d18s16, d1s16);  // q9 * cospi_4_64 (low half)
+    q6s32 = vmull_s16(d19s16, d1s16);  // q9 * cospi_4_64 (high half)
+
+    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);  // -= q15 * cospi_4_64
+    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+    q5s32 = vmlal_s16(q5s32, d30s16, d0s16);  // += q15 * cospi_28_64
+    q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
+
+    d2s16 = vdup_n_s16(cospi_12_64);
+    d3s16 = vdup_n_s16(cospi_20_64);
+
+    d8s16 = vqrshrn_n_s32(q2s32, 14);  // round, >> 14, saturate to int16
+    d9s16 = vqrshrn_n_s32(q3s32, 14);
+    d14s16 = vqrshrn_n_s32(q5s32, 14);
+    d15s16 = vqrshrn_n_s32(q6s32, 14);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+
+    q2s32 = vmull_s16(d26s16, d2s16);  // q13 * cospi_12_64
+    q3s32 = vmull_s16(d27s16, d2s16);
+    q9s32 = vmull_s16(d26s16, d3s16);  // q13 * cospi_20_64
+    q15s32 = vmull_s16(d27s16, d3s16);
+
+    q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);  // -= q11 * cospi_20_64
+    q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
+    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);  // += q11 * cospi_12_64
+    q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
+
+    d10s16 = vqrshrn_n_s32(q2s32, 14);
+    d11s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q15s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    // stage 4
+    d30s16 = vdup_n_s16(cospi_16_64);
+
+    q2s32 = vmull_s16(d16s16, d30s16);  // q8 * cospi_16_64
+    q11s32 = vmull_s16(d17s16, d30s16);
+    q0s32 = vmull_s16(d24s16, d30s16);  // q12 * cospi_16_64
+    q1s32 = vmull_s16(d25s16, d30s16);
+
+    d30s16 = vdup_n_s16(cospi_24_64);  // d30 is reused for the next rotation
+    d31s16 = vdup_n_s16(cospi_8_64);
+
+    q3s32 = vaddq_s32(q2s32, q0s32);  // (q8 + q12) * cospi_16_64
+    q12s32 = vaddq_s32(q11s32, q1s32);
+    q13s32 = vsubq_s32(q2s32, q0s32);  // (q8 - q12) * cospi_16_64
+    q1s32 = vsubq_s32(q11s32, q1s32);
+
+    d16s16 = vqrshrn_n_s32(q3s32, 14);
+    d17s16 = vqrshrn_n_s32(q12s32, 14);
+    d18s16 = vqrshrn_n_s32(q13s32, 14);
+    d19s16 = vqrshrn_n_s32(q1s32, 14);
+    q8s16 = vcombine_s16(d16s16, d17s16);
+    q9s16 = vcombine_s16(d18s16, d19s16);
+
+    q0s32 = vmull_s16(d20s16, d31s16);  // q10 * cospi_8_64
+    q1s32 = vmull_s16(d21s16, d31s16);
+    q12s32 = vmull_s16(d20s16, d30s16);  // q10 * cospi_24_64
+    q13s32 = vmull_s16(d21s16, d30s16);
+
+    q0s32 = vmlal_s16(q0s32, d28s16, d30s16);  // += q14 * cospi_24_64
+    q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
+    q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);  // -= q14 * cospi_8_64
+    q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
+
+    d22s16 = vqrshrn_n_s32(q0s32, 14);
+    d23s16 = vqrshrn_n_s32(q1s32, 14);
+    d20s16 = vqrshrn_n_s32(q12s32, 14);
+    d21s16 = vqrshrn_n_s32(q13s32, 14);
+    q10s16 = vcombine_s16(d20s16, d21s16);
+    q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q13s16 = vsubq_s16(q4s16, q5s16);  // butterflies on the stage 3 results
+    q4s16 = vaddq_s16(q4s16, q5s16);
+    q14s16 = vsubq_s16(q7s16, q6s16);
+    q15s16 = vaddq_s16(q6s16, q7s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    // stage 5
+    q0s16 = vaddq_s16(q8s16, q11s16);
+    q1s16 = vaddq_s16(q9s16, q10s16);
+    q2s16 = vsubq_s16(q9s16, q10s16);
+    q3s16 = vsubq_s16(q8s16, q11s16);
+
+    d16s16 = vdup_n_s16(cospi_16_64);
+
+    q11s32 = vmull_s16(d26s16, d16s16);  // q13 * cospi_16_64
+    q12s32 = vmull_s16(d27s16, d16s16);
+    q9s32 = vmull_s16(d28s16, d16s16);  // q14 * cospi_16_64
+    q10s32 = vmull_s16(d29s16, d16s16);
+
+    q6s32 = vsubq_s32(q9s32, q11s32);  // (q14 - q13) * cospi_16_64
+    q13s32 = vsubq_s32(q10s32, q12s32);
+    q9s32 = vaddq_s32(q9s32, q11s32);  // (q14 + q13) * cospi_16_64
+    q10s32 = vaddq_s32(q10s32, q12s32);
+
+    d10s16 = vqrshrn_n_s32(q6s32, 14);
+    d11s16 = vqrshrn_n_s32(q13s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q10s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    // stage 6
+    q8s16 = vaddq_s16(q0s16, q15s16);
+    q9s16 = vaddq_s16(q1s16, q6s16);
+    q10s16 = vaddq_s16(q2s16, q5s16);
+    q11s16 = vaddq_s16(q3s16, q4s16);
+    q12s16 = vsubq_s16(q3s16, q4s16);
+    q13s16 = vsubq_s16(q2s16, q5s16);
+    q14s16 = vsubq_s16(q1s16, q6s16);
+    q15s16 = vsubq_s16(q0s16, q15s16);
+
+    d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));  // 8-byte store units
+    d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+    d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+    d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+    d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+    d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+    // store the data
+    output_stride >>= 1;  // output_stride / 2, out is int16_t
+    vst1_u64((uint64_t *)out, d16u64);  // q8 low half, then step one stride
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d17u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d18u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d19u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d20u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d21u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d22u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d23u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d24u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d28u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d29u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d30u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d31u64);  // last store: pointer is not advanced
+    return;
+}
+
+void vpx_idct16x16_256_add_neon_pass2(
+        int16_t *src,
+        int16_t *out,
+        int16_t *pass1Output,
+        int16_t skip_adding,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8_t *d;
+    uint8x8_t d12u8, d13u8;
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    uint64x1_t d24u64, d25u64, d26u64, d27u64;
+    int64x1_t d12s64, d13s64;
+    uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
+    uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32;
+    int16x8x2_t q0x2s16;
+
+    q0x2s16 = vld2q_s16(src);
+    q8s16  = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q9s16  = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q10s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q11s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q12s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q13s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q14s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q15s16 = q0x2s16.val[0];
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    d16s16 = vget_low_s16(q8s16);
+    d17s16 = vget_high_s16(q8s16);
+    d18s16 = vget_low_s16(q9s16);
+    d19s16 = vget_high_s16(q9s16);
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d22s16 = vget_low_s16(q11s16);
+    d23s16 = vget_high_s16(q11s16);
+    d24s16 = vget_low_s16(q12s16);
+    d25s16 = vget_high_s16(q12s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+    d30s16 = vget_low_s16(q15s16);
+    d31s16 = vget_high_s16(q15s16);
+
+    // stage 3
+    d12s16 = vdup_n_s16(cospi_30_64);
+    d13s16 = vdup_n_s16(cospi_2_64);
+
+    q2s32 = vmull_s16(d16s16, d12s16);
+    q3s32 = vmull_s16(d17s16, d12s16);
+    q1s32 = vmull_s16(d16s16, d13s16);
+    q4s32 = vmull_s16(d17s16, d13s16);
+
+    q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
+    q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
+    q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
+    q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
+
+    d0s16 = vqrshrn_n_s32(q2s32, 14);
+    d1s16 = vqrshrn_n_s32(q3s32, 14);
+    d14s16 = vqrshrn_n_s32(q1s32, 14);
+    d15s16 = vqrshrn_n_s32(q4s32, 14);
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+
+    d30s16 = vdup_n_s16(cospi_14_64);
+    d31s16 = vdup_n_s16(cospi_18_64);
+
+    q2s32 = vmull_s16(d24s16, d30s16);
+    q3s32 = vmull_s16(d25s16, d30s16);
+    q4s32 = vmull_s16(d24s16, d31s16);
+    q5s32 = vmull_s16(d25s16, d31s16);
+
+    q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
+    q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
+    q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
+    q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
+
+    d2s16 = vqrshrn_n_s32(q2s32, 14);
+    d3s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q4s32, 14);
+    d13s16 = vqrshrn_n_s32(q5s32, 14);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    d30s16 = vdup_n_s16(cospi_22_64);
+    d31s16 = vdup_n_s16(cospi_10_64);
+
+    q11s32 = vmull_s16(d20s16, d30s16);
+    q12s32 = vmull_s16(d21s16, d30s16);
+    q4s32 = vmull_s16(d20s16, d31s16);
+    q5s32 = vmull_s16(d21s16, d31s16);
+
+    q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
+    q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
+    q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
+    q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
+
+    d4s16 = vqrshrn_n_s32(q11s32, 14);
+    d5s16 = vqrshrn_n_s32(q12s32, 14);
+    d11s16 = vqrshrn_n_s32(q5s32, 14);
+    d10s16 = vqrshrn_n_s32(q4s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    d30s16 = vdup_n_s16(cospi_6_64);
+    d31s16 = vdup_n_s16(cospi_26_64);
+
+    q10s32 = vmull_s16(d28s16, d30s16);
+    q11s32 = vmull_s16(d29s16, d30s16);
+    q12s32 = vmull_s16(d28s16, d31s16);
+    q13s32 = vmull_s16(d29s16, d31s16);
+
+    q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
+    q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
+    q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
+    q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
+
+    d6s16 = vqrshrn_n_s32(q10s32, 14);
+    d7s16 = vqrshrn_n_s32(q11s32, 14);
+    d8s16 = vqrshrn_n_s32(q12s32, 14);
+    d9s16 = vqrshrn_n_s32(q13s32, 14);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+
+    // stage 3
+    q9s16  = vsubq_s16(q0s16, q1s16);
+    q0s16  = vaddq_s16(q0s16, q1s16);
+    q10s16 = vsubq_s16(q3s16, q2s16);
+    q11s16 = vaddq_s16(q2s16, q3s16);
+    q12s16 = vaddq_s16(q4s16, q5s16);
+    q13s16 = vsubq_s16(q4s16, q5s16);
+    q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16  = vaddq_s16(q6s16, q7s16);
+
+    // stage 4
+    d18s16 = vget_low_s16(q9s16);
+    d19s16 = vget_high_s16(q9s16);
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    d30s16 = vdup_n_s16(cospi_8_64);
+    d31s16 = vdup_n_s16(cospi_24_64);
+
+    q2s32 = vmull_s16(d18s16, d31s16);
+    q3s32 = vmull_s16(d19s16, d31s16);
+    q4s32 = vmull_s16(d28s16, d31s16);
+    q5s32 = vmull_s16(d29s16, d31s16);
+
+    q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
+    q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
+    q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
+    q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
+
+    d12s16 = vqrshrn_n_s32(q2s32, 14);
+    d13s16 = vqrshrn_n_s32(q3s32, 14);
+    d2s16 = vqrshrn_n_s32(q4s32, 14);
+    d3s16 = vqrshrn_n_s32(q5s32, 14);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    q3s16 = q11s16;
+    q4s16 = q12s16;
+
+    d30s16 = vdup_n_s16(-cospi_8_64);
+    q11s32 = vmull_s16(d26s16, d30s16);
+    q12s32 = vmull_s16(d27s16, d30s16);
+    q8s32 = vmull_s16(d20s16, d30s16);
+    q9s32 = vmull_s16(d21s16, d30s16);
+
+    q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
+    q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
+    q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
+    q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
+
+    d4s16 = vqrshrn_n_s32(q11s32, 14);
+    d5s16 = vqrshrn_n_s32(q12s32, 14);
+    d10s16 = vqrshrn_n_s32(q8s32, 14);
+    d11s16 = vqrshrn_n_s32(q9s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    // stage 5
+    q8s16  = vaddq_s16(q0s16, q3s16);
+    q9s16  = vaddq_s16(q1s16, q2s16);
+    q10s16 = vsubq_s16(q1s16, q2s16);
+    q11s16 = vsubq_s16(q0s16, q3s16);
+    q12s16 = vsubq_s16(q7s16, q4s16);
+    q13s16 = vsubq_s16(q6s16, q5s16);
+    q14s16 = vaddq_s16(q6s16, q5s16);
+    q15s16 = vaddq_s16(q7s16, q4s16);
+
+    // stage 6
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d22s16 = vget_low_s16(q11s16);
+    d23s16 = vget_high_s16(q11s16);
+    d24s16 = vget_low_s16(q12s16);
+    d25s16 = vget_high_s16(q12s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+
+    d14s16 = vdup_n_s16(cospi_16_64);
+
+    q3s32 = vmull_s16(d26s16, d14s16);
+    q4s32 = vmull_s16(d27s16, d14s16);
+    q0s32 = vmull_s16(d20s16, d14s16);
+    q1s32 = vmull_s16(d21s16, d14s16);
+
+    q5s32 = vsubq_s32(q3s32, q0s32);
+    q6s32 = vsubq_s32(q4s32, q1s32);
+    q10s32 = vaddq_s32(q3s32, q0s32);
+    q4s32 = vaddq_s32(q4s32, q1s32);
+
+    d4s16 = vqrshrn_n_s32(q5s32, 14);
+    d5s16 = vqrshrn_n_s32(q6s32, 14);
+    d10s16 = vqrshrn_n_s32(q10s32, 14);
+    d11s16 = vqrshrn_n_s32(q4s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    q0s32 = vmull_s16(d22s16, d14s16);
+    q1s32 = vmull_s16(d23s16, d14s16);
+    q13s32 = vmull_s16(d24s16, d14s16);
+    q6s32 = vmull_s16(d25s16, d14s16);
+
+    q10s32 = vsubq_s32(q13s32, q0s32);
+    q4s32 = vsubq_s32(q6s32, q1s32);
+    q13s32 = vaddq_s32(q13s32, q0s32);
+    q6s32 = vaddq_s32(q6s32, q1s32);
+
+    d6s16 = vqrshrn_n_s32(q10s32, 14);
+    d7s16 = vqrshrn_n_s32(q4s32, 14);
+    d8s16 = vqrshrn_n_s32(q13s32, 14);
+    d9s16 = vqrshrn_n_s32(q6s32, 14);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+
+    // stage 7
+    if (skip_adding != 0) {
+        d = dest;
+        // load the data in pass1
+        q0s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q1s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        d13s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+
+        q12s16 = vaddq_s16(q0s16, q15s16);
+        q13s16 = vaddq_s16(q1s16, q14s16);
+        q12s16 = vrshrq_n_s16(q12s16, 6);
+        q13s16 = vrshrq_n_s16(q13s16, 6);
+        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+                          vreinterpret_u8_s64(d12s64));
+        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+                          vreinterpret_u8_s64(d13s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+        d += dest_stride;
+        q14s16 = vsubq_s16(q1s16, q14s16);
+        q15s16 = vsubq_s16(q0s16, q15s16);
+
+        q10s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q11s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        d13s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q12s16 = vaddq_s16(q10s16, q5s16);
+        q13s16 = vaddq_s16(q11s16, q4s16);
+        q12s16 = vrshrq_n_s16(q12s16, 6);
+        q13s16 = vrshrq_n_s16(q13s16, 6);
+        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+                          vreinterpret_u8_s64(d12s64));
+        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+                          vreinterpret_u8_s64(d13s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+        d += dest_stride;
+        q4s16 = vsubq_s16(q11s16, q4s16);
+        q5s16 = vsubq_s16(q10s16, q5s16);
+
+        q0s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q1s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        d13s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q12s16 = vaddq_s16(q0s16, q3s16);
+        q13s16 = vaddq_s16(q1s16, q2s16);
+        q12s16 = vrshrq_n_s16(q12s16, 6);
+        q13s16 = vrshrq_n_s16(q13s16, 6);
+        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+                          vreinterpret_u8_s64(d12s64));
+        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+                          vreinterpret_u8_s64(d13s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+        d += dest_stride;
+        q2s16 = vsubq_s16(q1s16, q2s16);
+        q3s16 = vsubq_s16(q0s16, q3s16);
+
+        q10s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q11s16 = vld1q_s16(pass1Output);
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        d13s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q12s16 = vaddq_s16(q10s16, q9s16);
+        q13s16 = vaddq_s16(q11s16, q8s16);
+        q12s16 = vrshrq_n_s16(q12s16, 6);
+        q13s16 = vrshrq_n_s16(q13s16, 6);
+        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+                          vreinterpret_u8_s64(d12s64));
+        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+                          vreinterpret_u8_s64(d13s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+        d += dest_stride;
+        q8s16 = vsubq_s16(q11s16, q8s16);
+        q9s16 = vsubq_s16(q10s16, q9s16);
+
+        // store the data  out 8,9,10,11,12,13,14,15
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q8s16 = vrshrq_n_s16(q8s16, 6);
+        q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                         vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q9s16 = vrshrq_n_s16(q9s16, 6);
+        q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                          vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q2s16 = vrshrq_n_s16(q2s16, 6);
+        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
+                          vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q3s16 = vrshrq_n_s16(q3s16, 6);
+        q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
+                         vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q4s16 = vrshrq_n_s16(q4s16, 6);
+        q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
+                         vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q5s16 = vrshrq_n_s16(q5s16, 6);
+        q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
+                         vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        dest += dest_stride;
+        q14s16 = vrshrq_n_s16(q14s16, 6);
+        q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
+                          vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+        d += dest_stride;
+
+        d12s64 = vld1_s64((int64_t *)dest);
+        q15s16 = vrshrq_n_s16(q15s16, 6);
+        q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
+                          vreinterpret_u8_s64(d12s64));
+        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
+        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+    } else {  // skip_adding_dest
+        q0s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q1s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q12s16 = vaddq_s16(q0s16, q15s16);
+        q13s16 = vaddq_s16(q1s16, q14s16);
+        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+        vst1_u64((uint64_t *)out, d24u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d25u64);
+        out += 12;
+        vst1_u64((uint64_t *)out, d26u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d27u64);
+        out += 12;
+        q14s16 = vsubq_s16(q1s16, q14s16);
+        q15s16 = vsubq_s16(q0s16, q15s16);
+
+        q10s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q11s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q12s16 = vaddq_s16(q10s16, q5s16);
+        q13s16 = vaddq_s16(q11s16, q4s16);
+        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+        vst1_u64((uint64_t *)out, d24u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d25u64);
+        out += 12;
+        vst1_u64((uint64_t *)out, d26u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d27u64);
+        out += 12;
+        q4s16 = vsubq_s16(q11s16, q4s16);
+        q5s16 = vsubq_s16(q10s16, q5s16);
+
+        q0s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q1s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q12s16 = vaddq_s16(q0s16, q3s16);
+        q13s16 = vaddq_s16(q1s16, q2s16);
+        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+        vst1_u64((uint64_t *)out, d24u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d25u64);
+        out += 12;
+        vst1_u64((uint64_t *)out, d26u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d27u64);
+        out += 12;
+        q2s16 = vsubq_s16(q1s16, q2s16);
+        q3s16 = vsubq_s16(q0s16, q3s16);
+
+        q10s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q11s16 = vld1q_s16(pass1Output);
+        pass1Output += 8;
+        q12s16 = vaddq_s16(q10s16, q9s16);
+        q13s16 = vaddq_s16(q11s16, q8s16);
+        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+        vst1_u64((uint64_t *)out, d24u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d25u64);
+        out += 12;
+        vst1_u64((uint64_t *)out, d26u64);
+        out += 4;
+        vst1_u64((uint64_t *)out, d27u64);
+        out += 12;
+        q8s16 = vsubq_s16(q11s16, q8s16);
+        q9s16 = vsubq_s16(q10s16, q9s16);
+
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
+        out += 12;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
+        out += 4;
+        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
+    }
+    return;
+}
+
+void vpx_idct16x16_10_add_neon_pass1(  // runs stages 3, 4 and 6 below
+        const int16_t *in,  // 128 values read as 8 loads of 16; never written
+        int16_t *out,  // receives 16 stores of 4 int16_t each
+        int output_stride) {  // distance between stores, in bytes
+    int16x4_t d4s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+    int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4_t q6s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q15s32;
+    int16x8x2_t q0x2s16;
+
+    q0x2s16 = vld2q_s16(in);  // de-interleaving load of 16 int16_t
+    q8s16 = q0x2s16.val[0];  // keep the even-indexed elements only
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q9s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q10s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q11s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q12s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q13s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q14s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q15s16 = q0x2s16.val[0];
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);  // only q8/q9 read after
+
+    // stage 3
+    q0s16 = vdupq_n_s16(cospi_28_64 * 2);  // pre-doubled for vqrdmulh below
+    q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+    q4s16 = vqrdmulhq_s16(q9s16, q0s16);  // (q9 * cospi_28_64 + 2^13) >> 14
+    q7s16 = vqrdmulhq_s16(q9s16, q1s16);  // (q9 * cospi_4_64 + 2^13) >> 14
+
+    // stage 4
+    q1s16 = vdupq_n_s16(cospi_16_64 * 2);
+    d4s16 = vdup_n_s16(cospi_16_64);
+
+    q8s16 = vqrdmulhq_s16(q8s16, q1s16);  // (q8 * cospi_16_64 + 2^13) >> 14
+
+    d8s16 = vget_low_s16(q4s16);
+    d9s16 = vget_high_s16(q4s16);
+    d14s16 = vget_low_s16(q7s16);
+    d15s16 = vget_high_s16(q7s16);
+    q9s32  = vmull_s16(d14s16, d4s16);  // 32-bit products q7 * cospi_16_64
+    q10s32 = vmull_s16(d15s16, d4s16);
+    q12s32 = vmull_s16(d9s16, d4s16);  // 32-bit products q4 * cospi_16_64
+    q11s32 = vmull_s16(d8s16, d4s16);
+
+    q15s32 = vsubq_s32(q10s32, q12s32);  // (q7 - q4) * cospi_16_64, high half
+    q6s32 = vsubq_s32(q9s32, q11s32);  // same, low half
+    q9s32 = vaddq_s32(q9s32, q11s32);  // (q7 + q4) * cospi_16_64, low half
+    q10s32 = vaddq_s32(q10s32, q12s32);  // same, high half
+
+    d11s16 = vqrshrn_n_s32(q15s32, 14);  // rounding >> 14, saturate to 16 bits
+    d10s16 = vqrshrn_n_s32(q6s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q10s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);  // scaled difference
+    q6s16 = vcombine_s16(d12s16, d13s16);  // scaled sum
+
+    // stage 6
+    q2s16 = vaddq_s16(q8s16, q7s16);  // q8 plus q7, q6, q5, q4 ...
+    q9s16 = vaddq_s16(q8s16, q6s16);
+    q10s16 = vaddq_s16(q8s16, q5s16);
+    q11s16 = vaddq_s16(q8s16, q4s16);
+    q12s16 = vsubq_s16(q8s16, q4s16);  // ... then q8 minus q4, q5, q6, q7
+    q13s16 = vsubq_s16(q8s16, q5s16);
+    q14s16 = vsubq_s16(q8s16, q6s16);
+    q15s16 = vsubq_s16(q8s16, q7s16);
+
+    d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));  // split for 64-bit stores
+    d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+    d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+    d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+    d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+    d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+    // store the data
+    output_stride >>= 1;  // output_stride / 2, out is int16_t
+    vst1_u64((uint64_t *)out, d4u64);  // 16 stores: q2, q9..q15, low then high
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d5u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d18u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d19u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d20u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d21u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d22u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d23u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d24u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d28u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d29u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d30u64);
+    out += output_stride;
+    vst1_u64((uint64_t *)out, d31u64);  // last store; pointer not advanced
+    return;
+}
+
+void vpx_idct16x16_10_add_neon_pass2(  // runs stages 3 to 7 below
+        const int16_t *src,  // 128 values read as 8 loads of 16; never written
+        int16_t *out,  // 8 int16_t written at the start of 16 rows of 16
+        int16_t *pass1Output,  // 64 int16_t read in order (8 loads of 8)
+        int16_t skip_adding,  // unused in this function
+        uint8_t *dest,  // unused in this function
+        int dest_stride) {  // unused in this function
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
+    uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
+    uint64x1_t d16u64, d17u64, d18u64, d19u64;
+    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32;
+    int16x8x2_t q0x2s16;
+    (void)skip_adding;  // silence unused-parameter warnings
+    (void)dest;
+    (void)dest_stride;
+
+    q0x2s16 = vld2q_s16(src);  // de-interleaving load of 16 int16_t
+    q8s16 = q0x2s16.val[0];  // keep the even-indexed elements only
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q9s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q10s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q11s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q12s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q13s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q14s16 = q0x2s16.val[0];
+    src += 16;
+    q0x2s16 = vld2q_s16(src);
+    q15s16 = q0x2s16.val[0];
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);  // only q8/q9 read after
+
+    // stage 3
+    q6s16 = vdupq_n_s16(cospi_30_64 * 2);  // pre-doubled for vqrdmulh below
+    q0s16 = vqrdmulhq_s16(q8s16, q6s16);  // (q8 * cospi_30_64 + 2^13) >> 14
+    q6s16 = vdupq_n_s16(cospi_2_64 * 2);
+    q7s16 = vqrdmulhq_s16(q8s16, q6s16);  // (q8 * cospi_2_64 + 2^13) >> 14
+
+    q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
+    q14s16 = vdupq_n_s16(cospi_6_64 * 2);
+    q3s16 = vqrdmulhq_s16(q9s16, q15s16);  // (q9 * -cospi_26_64 + 2^13) >> 14
+    q4s16 = vqrdmulhq_s16(q9s16, q14s16);  // (q9 * cospi_6_64 + 2^13) >> 14
+
+    // stage 4
+    d0s16 = vget_low_s16(q0s16);
+    d1s16 = vget_high_s16(q0s16);
+    d6s16 = vget_low_s16(q3s16);
+    d7s16 = vget_high_s16(q3s16);
+    d8s16 = vget_low_s16(q4s16);
+    d9s16 = vget_high_s16(q4s16);
+    d14s16 = vget_low_s16(q7s16);
+    d15s16 = vget_high_s16(q7s16);
+
+    d30s16 = vdup_n_s16(cospi_8_64);
+    d31s16 = vdup_n_s16(cospi_24_64);
+
+    q12s32 = vmull_s16(d14s16, d31s16);  // q7 * cospi_24_64 ...
+    q5s32 = vmull_s16(d15s16, d31s16);
+    q2s32 = vmull_s16(d0s16, d31s16);  // q0 * cospi_24_64 ...
+    q11s32 = vmull_s16(d1s16, d31s16);
+
+    q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);  // ... minus q0 * cospi_8_64
+    q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
+    q2s32 = vmlal_s16(q2s32, d14s16, d30s16);  // ... plus q7 * cospi_8_64
+    q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
+
+    d2s16 = vqrshrn_n_s32(q12s32, 14);  // rounding >> 14, saturate to 16 bits
+    d3s16 = vqrshrn_n_s32(q5s32, 14);
+    d12s16 = vqrshrn_n_s32(q2s32, 14);
+    d13s16 = vqrshrn_n_s32(q11s32, 14);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    d30s16 = vdup_n_s16(-cospi_8_64);  // note the sign flip for this rotation
+    q10s32 = vmull_s16(d8s16, d30s16);  // -q4 * cospi_8_64 ...
+    q13s32 = vmull_s16(d9s16, d30s16);
+    q8s32 = vmull_s16(d6s16, d30s16);  // -q3 * cospi_8_64 ...
+    q9s32 = vmull_s16(d7s16, d30s16);
+
+    q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);  // ... minus q3 * cospi_24_64
+    q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
+    q8s32 = vmlal_s16(q8s32, d8s16, d31s16);  // ... plus q4 * cospi_24_64
+    q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
+
+    d4s16 = vqrshrn_n_s32(q10s32, 14);
+    d5s16 = vqrshrn_n_s32(q13s32, 14);
+    d10s16 = vqrshrn_n_s32(q8s32, 14);
+    d11s16 = vqrshrn_n_s32(q9s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    // stage 5
+    q8s16  = vaddq_s16(q0s16, q3s16);  // butterflies on the q0..q7 values
+    q9s16  = vaddq_s16(q1s16, q2s16);
+    q10s16 = vsubq_s16(q1s16, q2s16);
+    q11s16 = vsubq_s16(q0s16, q3s16);
+    q12s16 = vsubq_s16(q7s16, q4s16);
+    q13s16 = vsubq_s16(q6s16, q5s16);
+    q14s16 = vaddq_s16(q6s16, q5s16);
+    q15s16 = vaddq_s16(q7s16, q4s16);
+
+    // stage 6
+    d20s16 = vget_low_s16(q10s16);
+    d21s16 = vget_high_s16(q10s16);
+    d22s16 = vget_low_s16(q11s16);
+    d23s16 = vget_high_s16(q11s16);
+    d24s16 = vget_low_s16(q12s16);
+    d25s16 = vget_high_s16(q12s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+
+    d14s16 = vdup_n_s16(cospi_16_64);
+    q3s32 = vmull_s16(d26s16, d14s16);  // q13 * cospi_16_64
+    q4s32 = vmull_s16(d27s16, d14s16);
+    q0s32 = vmull_s16(d20s16, d14s16);  // q10 * cospi_16_64
+    q1s32 = vmull_s16(d21s16, d14s16);
+
+    q5s32 = vsubq_s32(q3s32, q0s32);  // (q13 - q10) * cospi_16_64
+    q6s32 = vsubq_s32(q4s32, q1s32);
+    q0s32 = vaddq_s32(q3s32, q0s32);  // (q13 + q10) * cospi_16_64
+    q4s32 = vaddq_s32(q4s32, q1s32);
+
+    d4s16 = vqrshrn_n_s32(q5s32, 14);
+    d5s16 = vqrshrn_n_s32(q6s32, 14);
+    d10s16 = vqrshrn_n_s32(q0s32, 14);
+    d11s16 = vqrshrn_n_s32(q4s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    q0s32 = vmull_s16(d22s16, d14s16);  // q11 * cospi_16_64
+    q1s32 = vmull_s16(d23s16, d14s16);
+    q13s32 = vmull_s16(d24s16, d14s16);  // q12 * cospi_16_64
+    q6s32 = vmull_s16(d25s16, d14s16);
+
+    q10s32 = vsubq_s32(q13s32, q0s32);  // (q12 - q11) * cospi_16_64
+    q4s32 = vsubq_s32(q6s32, q1s32);
+    q13s32 = vaddq_s32(q13s32, q0s32);  // (q12 + q11) * cospi_16_64
+    q6s32 = vaddq_s32(q6s32, q1s32);
+
+    d6s16 = vqrshrn_n_s32(q10s32, 14);
+    d7s16 = vqrshrn_n_s32(q4s32, 14);
+    d8s16 = vqrshrn_n_s32(q13s32, 14);
+    d9s16 = vqrshrn_n_s32(q6s32, 14);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+
+    // stage 7
+    q0s16 = vld1q_s16(pass1Output);  // sums with pass1Output are stored first
+    pass1Output += 8;
+    q1s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q12s16 = vaddq_s16(q0s16, q15s16);
+    q13s16 = vaddq_s16(q1s16, q14s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;  // second half of the same 8-value group
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;  // 4 + 12 = 16 int16_t: start of the next row
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q14s16 = vsubq_s16(q1s16, q14s16);  // differences are kept for later
+    q15s16 = vsubq_s16(q0s16, q15s16);
+
+    q10s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q11s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q12s16 = vaddq_s16(q10s16, q5s16);
+    q13s16 = vaddq_s16(q11s16, q4s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q4s16 = vsubq_s16(q11s16, q4s16);
+    q5s16 = vsubq_s16(q10s16, q5s16);
+
+    q0s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q1s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q12s16 = vaddq_s16(q0s16, q3s16);
+    q13s16 = vaddq_s16(q1s16, q2s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q2s16 = vsubq_s16(q1s16, q2s16);
+    q3s16 = vsubq_s16(q0s16, q3s16);
+
+    q10s16 = vld1q_s16(pass1Output);
+    pass1Output += 8;
+    q11s16 = vld1q_s16(pass1Output);  // eighth and last load from pass1Output
+    q12s16 = vaddq_s16(q10s16, q9s16);
+    q13s16 = vaddq_s16(q11s16, q8s16);
+    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+    vst1_u64((uint64_t *)out, d24u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d25u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d26u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d27u64);
+    out += 12;
+    q8s16 = vsubq_s16(q11s16, q8s16);
+    q9s16 = vsubq_s16(q10s16, q9s16);
+
+    d4u64  = vreinterpret_u64_s16(vget_low_s16(q2s16));  // split the differences
+    d5u64  = vreinterpret_u64_s16(vget_high_s16(q2s16));
+    d6u64  = vreinterpret_u64_s16(vget_low_s16(q3s16));
+    d7u64  = vreinterpret_u64_s16(vget_high_s16(q3s16));
+    d8u64  = vreinterpret_u64_s16(vget_low_s16(q4s16));
+    d9u64  = vreinterpret_u64_s16(vget_high_s16(q4s16));
+    d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
+    d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
+    d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+    d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+    vst1_u64((uint64_t *)out, d16u64);  // order: q8, q9, q2, q3, q4, q5, q14, q15
+    out += 4;
+    vst1_u64((uint64_t *)out, d17u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d18u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d19u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d4u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d5u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d6u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d7u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d8u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d9u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d10u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d11u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d28u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d29u64);
+    out += 12;
+    vst1_u64((uint64_t *)out, d30u64);
+    out += 4;
+    vst1_u64((uint64_t *)out, d31u64);  // last store; pointer not advanced
+    return;
+}
diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_neon.c b/libs/libvpx/vpx_dsp/arm/idct16x16_neon.c
new file mode 100644
index 0000000000..352979aa16
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct16x16_neon.c
@@ -0,0 +1,185 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/vpx_dsp_common.h"
+
+void vpx_idct16x16_256_add_neon_pass1(const int16_t *input,
+                                      int16_t *output,
+                                      int output_stride);
+void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,
+                                      int16_t *output,
+                                      int16_t *pass1Output,
+                                      int16_t skip_adding,
+                                      uint8_t *dest,
+                                      int dest_stride);
+void vpx_idct16x16_10_add_neon_pass1(const int16_t *input,
+                                     int16_t *output,
+                                     int output_stride);
+void vpx_idct16x16_10_add_neon_pass2(const int16_t *src,
+                                     int16_t *output,
+                                     int16_t *pass1Output,
+                                     int16_t skip_adding,
+                                     uint8_t *dest,
+                                     int dest_stride);
+
+#if HAVE_NEON_ASM
+/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
+extern void vpx_push_neon(int64_t *store);
+extern void vpx_pop_neon(int64_t *store);
+#endif  // HAVE_NEON_ASM
+
+void vpx_idct16x16_256_add_neon(const int16_t *input,
+                                uint8_t *dest, int dest_stride) {  // 16x16 inverse DCT: two row passes, then two column passes that add into dest
+#if HAVE_NEON_ASM
+  int64_t store_reg[8];  // room for the eight 64-bit d8-d15 values
+#endif
+  int16_t pass1_output[16*16] = {0};  // scratch handed from each pass1 call to the matching pass2 call
+  int16_t row_idct_output[16*16] = {0};  // row-pass result; source for the column passes
+
+#if HAVE_NEON_ASM
+  // save d8-d15 register values.
+  vpx_push_neon(store_reg);
+#endif
+
+  /* Parallel idct on the upper 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7
+  // which will be saved into row_idct_output.
+  vpx_idct16x16_256_add_neon_pass2(input+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     0,  // skip_adding argument: 0 on both row passes
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the lower 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7
+  // which will be saved into row_idct_output.
+  vpx_idct16x16_256_add_neon_pass2(input+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     0,  // skip_adding argument: 0 on both row passes
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the left 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     1,  // skip_adding argument: 1 where the result is added to dest (NOTE(review): name reads inverted; confirm in pass2)
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the right 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     1,  // skip_adding argument: 1, result is added to dest
+                                     dest+8,  // right half starts 8 pixels into each dest row
+                                     dest_stride);
+
+#if HAVE_NEON_ASM
+  // restore d8-d15 register values.
+  vpx_pop_neon(store_reg);
+#endif
+
+  return;
+}
+
+void vpx_idct16x16_10_add_neon(const int16_t *input,
+                               uint8_t *dest, int dest_stride) {  // 16x16 inverse DCT that runs only the upper-rows row pass before the column passes
+#if HAVE_NEON_ASM
+  int64_t store_reg[8];  // room for the eight 64-bit d8-d15 values
+#endif
+  int16_t pass1_output[16*16] = {0};  // scratch handed from each pass1 call to the matching pass2 call
+  int16_t row_idct_output[16*16] = {0};  // zero-initialised; only the upper-rows pass writes it before the column passes read it
+
+#if HAVE_NEON_ASM
+  // save d8-d15 register values.
+  vpx_push_neon(store_reg);
+#endif
+
+  /* Parallel idct on the upper 8 rows */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7
+  // which will be saved into row_idct_output.
+  vpx_idct16x16_10_add_neon_pass2(input+1,
+                                        row_idct_output,
+                                        pass1_output,
+                                        0,  // skip_adding argument: 0 on the row pass
+                                        dest,
+                                        dest_stride);
+
+  /* Skip Parallel idct on the lower 8 rows as they are all 0s */
+
+  /* Parallel idct on the left 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);  // column passes reuse the _256_ routines
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,
+                                     row_idct_output,
+                                     pass1_output,
+                                     1,  // skip_adding argument: 1 where the result is added to dest (NOTE(review): name reads inverted; confirm in pass2)
+                                     dest,
+                                     dest_stride);
+
+  /* Parallel idct on the right 8 columns */
+  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
+  // stage 6 result in pass1_output.
+  vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+
+  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
+  // with result in pass1(pass1_output) to calculate final result in stage 7.
+  // Then add the result to the destination data.
+  vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
+                                     row_idct_output+8,
+                                     pass1_output,
+                                     1,  // skip_adding argument: 1, result is added to dest
+                                     dest+8,  // right half starts 8 pixels into each dest row
+                                     dest_stride);
+
+#if HAVE_NEON_ASM
+  // restore d8-d15 register values.
+  vpx_pop_neon(store_reg);
+#endif
+
+  return;
+}
diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm
new file mode 100644
index 0000000000..96d276b4d1
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm
@@ -0,0 +1,144 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+    EXPORT  |vpx_idct32x32_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ;TODO(hkuang): put the following macros in a separate
+    ;file so other idct functions could also use them.
+    MACRO
+    LD_16x8          $src, $stride
+    vld1.8           {q8}, [$src], $stride     ; q8 = 16 bytes at src, then src += stride
+    vld1.8           {q9}, [$src], $stride     ; same pattern for the next seven rows (q9-q15)
+    vld1.8           {q10}, [$src], $stride
+    vld1.8           {q11}, [$src], $stride
+    vld1.8           {q12}, [$src], $stride
+    vld1.8           {q13}, [$src], $stride
+    vld1.8           {q14}, [$src], $stride
+    vld1.8           {q15}, [$src], $stride    ; src register is left 8 rows past where it started
+    MEND
+
+    MACRO
+    ADD_DIFF_16x8    $diff
+    vqadd.u8         q8, q8, $diff             ; per-byte unsigned saturating add of diff
+    vqadd.u8         q9, q9, $diff             ; same for each of the eight rows q8-q15
+    vqadd.u8         q10, q10, $diff
+    vqadd.u8         q11, q11, $diff
+    vqadd.u8         q12, q12, $diff
+    vqadd.u8         q13, q13, $diff
+    vqadd.u8         q14, q14, $diff
+    vqadd.u8         q15, q15, $diff
+    MEND
+
+    MACRO
+    SUB_DIFF_16x8    $diff
+    vqsub.u8         q8, q8, $diff             ; per-byte unsigned saturating subtract of diff
+    vqsub.u8         q9, q9, $diff             ; same for each of the eight rows q8-q15
+    vqsub.u8         q10, q10, $diff
+    vqsub.u8         q11, q11, $diff
+    vqsub.u8         q12, q12, $diff
+    vqsub.u8         q13, q13, $diff
+    vqsub.u8         q14, q14, $diff
+    vqsub.u8         q15, q15, $diff
+    MEND
+
+    MACRO
+    ST_16x8          $dst, $stride
+    vst1.8           {q8}, [$dst], $stride     ; store 16 bytes of q8 at dst, then dst += stride
+    vst1.8           {q9}, [$dst], $stride     ; same pattern for the next seven rows (q9-q15)
+    vst1.8           {q10},[$dst], $stride
+    vst1.8           {q11},[$dst], $stride
+    vst1.8           {q12},[$dst], $stride
+    vst1.8           {q13},[$dst], $stride
+    vst1.8           {q14},[$dst], $stride
+    vst1.8           {q15},[$dst], $stride     ; dst register is left 8 rows past where it started
+    MEND
+
+;void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
+;                              int dest_stride)
+; Computes one value a1 from input[0] and adds it (saturating) to a 32x32 dest block.
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|vpx_idct32x32_1_add_neon| PROC
+    push             {lr}
+    pld              [r1]                      ; preload the first dest row
+    add              r3, r1, #16               ; r3 dest + 16 for second loop
+    ldrsh            r0, [r0]                  ; r0 = input[0], sign-extended
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 6)
+    add              r0, r0, #32               ; + (1 <<((6) - 1))
+    asrs             r0, r0, #6                ; >> 6
+    bpl              diff_positive_32_32       ; a1 >= 0; tests N only, since asrs leaves V as the caller set it
+
+diff_negative_32_32
+    neg              r0, r0                    ; r0 = -a1, the amount to subtract
+    usat             r0, #8, r0                ; clamp to 0..255
+    vdup.u8          q0, r0                    ; broadcast to all 16 byte lanes
+    mov              r0, #4                    ; 4 iterations of 16 rows each
+
+diff_negative_32_32_loop
+    sub              r0, #1
+    LD_16x8          r1, r2
+    SUB_DIFF_16x8    q0
+    ST_16x8          r12, r2
+
+    LD_16x8          r1, r2
+    SUB_DIFF_16x8    q0
+    ST_16x8          r12, r2
+    cmp              r0, #2                    ; two iterations (32 rows) done?
+    moveq            r1, r3                    ; then restart loads at dest + 16
+    moveq            r12, r3                   ; and stores at dest + 16
+    cmp              r0, #0
+    bne              diff_negative_32_32_loop
+    pop              {pc}
+
+diff_positive_32_32
+    usat             r0, #8, r0                ; clamp to 0..255
+    vdup.u8          q0, r0                    ; broadcast to all 16 byte lanes
+    mov              r0, #4                    ; 4 iterations of 16 rows each
+
+diff_positive_32_32_loop
+    sub              r0, #1
+    LD_16x8          r1, r2
+    ADD_DIFF_16x8    q0
+    ST_16x8          r12, r2
+
+    LD_16x8          r1, r2
+    ADD_DIFF_16x8    q0
+    ST_16x8          r12, r2
+    cmp              r0, #2                    ; two iterations (32 rows) done?
+    moveq            r1, r3                    ; then restart loads at dest + 16
+    moveq            r12, r3                   ; and stores at dest + 16
+    cmp              r0, #0
+    bne              diff_positive_32_32_loop
+    pop              {pc}
+
+    ENDP             ; |vpx_idct32x32_1_add_neon|
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
new file mode 100644
index 0000000000..c25c0c4a5c
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
@@ -0,0 +1,165 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void LD_16x8(  // load 8 consecutive rows of 16 bytes each
+        uint8_t *d,  // address of the first row
+        int d_stride,  // distance in bytes from one row to the next
+        uint8x16_t *q8u8,  // out: row 0
+        uint8x16_t *q9u8,  // out: rows 1-7 follow in q9u8..q15u8
+        uint8x16_t *q10u8,
+        uint8x16_t *q11u8,
+        uint8x16_t *q12u8,
+        uint8x16_t *q13u8,
+        uint8x16_t *q14u8,
+        uint8x16_t *q15u8) {
+    *q8u8 = vld1q_u8(d);
+    d += d_stride;  // d is a by-value copy, so the caller's pointer is unchanged
+    *q9u8 = vld1q_u8(d);
+    d += d_stride;
+    *q10u8 = vld1q_u8(d);
+    d += d_stride;
+    *q11u8 = vld1q_u8(d);
+    d += d_stride;
+    *q12u8 = vld1q_u8(d);
+    d += d_stride;
+    *q13u8 = vld1q_u8(d);
+    d += d_stride;
+    *q14u8 = vld1q_u8(d);
+    d += d_stride;
+    *q15u8 = vld1q_u8(d);
+    return;
+}
+
+static INLINE void ADD_DIFF_16x8(  // saturating-add qdiffu8 to each of 8 rows, in place
+        uint8x16_t qdiffu8,  // value added to every row
+        uint8x16_t *q8u8,  // in/out: the eight rows q8u8..q15u8
+        uint8x16_t *q9u8,
+        uint8x16_t *q10u8,
+        uint8x16_t *q11u8,
+        uint8x16_t *q12u8,
+        uint8x16_t *q13u8,
+        uint8x16_t *q14u8,
+        uint8x16_t *q15u8) {
+    *q8u8 = vqaddq_u8(*q8u8, qdiffu8);  // per-byte unsigned saturating add
+    *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
+    *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
+    *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
+    *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
+    *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
+    *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
+    *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
+    return;
+}
+
+static INLINE void SUB_DIFF_16x8(  // saturating-subtract qdiffu8 from each of 8 rows, in place
+        uint8x16_t qdiffu8,  // value subtracted from every row
+        uint8x16_t *q8u8,  // in/out: the eight rows q8u8..q15u8
+        uint8x16_t *q9u8,
+        uint8x16_t *q10u8,
+        uint8x16_t *q11u8,
+        uint8x16_t *q12u8,
+        uint8x16_t *q13u8,
+        uint8x16_t *q14u8,
+        uint8x16_t *q15u8) {
+    *q8u8 = vqsubq_u8(*q8u8, qdiffu8);  // per-byte unsigned saturating subtract
+    *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
+    *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
+    *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
+    *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
+    *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
+    *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
+    *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
+    return;
+}
+
+static INLINE void ST_16x8(  // store 8 consecutive rows of 16 bytes each
+        uint8_t *d,  // address of the first row
+        int d_stride,  // distance in bytes from one row to the next
+        uint8x16_t *q8u8,  // in: row 0
+        uint8x16_t *q9u8,  // in: rows 1-7 follow in q9u8..q15u8
+        uint8x16_t *q10u8,
+        uint8x16_t *q11u8,
+        uint8x16_t *q12u8,
+        uint8x16_t *q13u8,
+        uint8x16_t *q14u8,
+        uint8x16_t *q15u8) {
+    vst1q_u8(d, *q8u8);
+    d += d_stride;  // d is a by-value copy, so the caller's pointer is unchanged
+    vst1q_u8(d, *q9u8);
+    d += d_stride;
+    vst1q_u8(d, *q10u8);
+    d += d_stride;
+    vst1q_u8(d, *q11u8);
+    d += d_stride;
+    vst1q_u8(d, *q12u8);
+    d += d_stride;
+    vst1q_u8(d, *q13u8);
+    d += d_stride;
+    vst1q_u8(d, *q14u8);
+    d += d_stride;
+    vst1q_u8(d, *q15u8);
+    return;
+}
+
+void vpx_idct32x32_1_add_neon(  // adds one value derived from input[0] to a 32x32 dest block
+        int16_t *input,  // only input[0] is read
+        uint8_t *dest,  // top-left byte of the block, updated in place
+        int dest_stride) {  // distance in bytes between dest rows
+    uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int i, j, dest_stride8;
+    uint8_t *d;
+    int16_t a1, cospi_16_64 = 11585;
+    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+
+    out = dct_const_round_shift(out * cospi_16_64);
+    a1 = ROUND_POWER_OF_TWO(out, 6);
+
+    dest_stride8 = dest_stride * 8;  // pointer step for one group of 8 rows
+    if (a1 >= 0) {  // diff_positive_32_32
+        a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;  // clamp to 0..255 (the < 0 arm cannot trigger in this branch)
+        q0u8 = vdupq_n_u8(a1);  // broadcast to all 16 byte lanes
+        for (i = 0; i < 2; i++, dest += 16) {  // diff_positive_32_32_loop
+            d = dest;  // each 16-byte-wide half restarts at the top row
+            for (j = 0; j < 4; j++) {  // 4 groups of 8 rows = 32 rows
+                LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+                                        &q12u8, &q13u8, &q14u8, &q15u8);
+                ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+                                    &q12u8, &q13u8, &q14u8, &q15u8);
+                ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+                                        &q12u8, &q13u8, &q14u8, &q15u8);
+                d += dest_stride8;
+            }
+        }
+    } else {  // diff_negative_32_32
+        a1 = -a1;  // magnitude to subtract
+        a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;  // clamp to 0..255
+        q0u8 = vdupq_n_u8(a1);  // broadcast to all 16 byte lanes
+        for (i = 0; i < 2; i++, dest += 16) {  // diff_negative_32_32_loop
+            d = dest;  // each 16-byte-wide half restarts at the top row
+            for (j = 0; j < 4; j++) {  // 4 groups of 8 rows = 32 rows
+                LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+                                        &q12u8, &q13u8, &q14u8, &q15u8);
+                SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+                                    &q12u8, &q13u8, &q14u8, &q15u8);
+                ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+                                        &q12u8, &q13u8, &q14u8, &q15u8);
+                d += dest_stride8;
+            }
+        }
+    }
+    return;
+}
diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm
new file mode 100644
index 0000000000..7483ee77e1
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm
@@ -0,0 +1,1299 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+;TODO(cd): adjust these constants to be able to use vqdmulh for faster
+;          dct_const_round_shift(a * b) within butterfly calculations.
+cospi_1_64  EQU 16364
+cospi_2_64  EQU 16305
+cospi_3_64  EQU 16207
+cospi_4_64  EQU 16069
+cospi_5_64  EQU 15893
+cospi_6_64  EQU 15679
+cospi_7_64  EQU 15426
+cospi_8_64  EQU 15137
+cospi_9_64  EQU 14811
+cospi_10_64 EQU 14449
+cospi_11_64 EQU 14053
+cospi_12_64 EQU 13623
+cospi_13_64 EQU 13160
+cospi_14_64 EQU 12665
+cospi_15_64 EQU 12140
+cospi_16_64 EQU 11585
+cospi_17_64 EQU 11003
+cospi_18_64 EQU 10394
+cospi_19_64 EQU  9760
+cospi_20_64 EQU  9102
+cospi_21_64 EQU  8423
+cospi_22_64 EQU  7723
+cospi_23_64 EQU  7005
+cospi_24_64 EQU  6270
+cospi_25_64 EQU  5520
+cospi_26_64 EQU  4756
+cospi_27_64 EQU  3981
+cospi_28_64 EQU  3196
+cospi_29_64 EQU  2404
+cospi_30_64 EQU  1606
+cospi_31_64 EQU   804
+
+
+    EXPORT  |vpx_idct32x32_1024_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    AREA     Block, CODE, READONLY
+
+    ; --------------------------------------------------------------------------
+    ; Load from transposed_buffer (addressed through r0, which is updated)
+    ;   q14 = transposed_buffer[first_offset]
+    ;   q13 = transposed_buffer[second_offset]
+    ;   for proper address calculation, the last offset used when manipulating
+    ;   transposed_buffer must be passed in. use 0 for first use.
+    MACRO
+    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
+    ; address calculation (one offset unit = 8 int16 = 16 bytes) and loading
+    add r0, #($first_offset  - $prev_offset )*8*2
+    vld1.s16        {q14}, [r0]
+    add r0, #($second_offset - $first_offset)*8*2
+    vld1.s16        {q13}, [r0]
+    ; (used) two registers (q14, q13); r0 is left at second_offset
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Load from output (used as temporary storage; addressed through r1)
+    ;   reg1 = output[first_offset]
+    ;   reg2 = output[second_offset]
+    ;   for proper address calculation, the last offset used when manipulating
+    ;   output (whether reading or storing) must be passed in. use 0 for first
+    ;   use.
+    MACRO
+    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+    ; address calculation (one offset unit = 32 int16 = 64 bytes) and loading
+    add r1, #($first_offset  - $prev_offset )*32*2
+    vld1.s16        {$reg1}, [r1]
+    add r1, #($second_offset - $first_offset)*32*2
+    vld1.s16        {$reg2}, [r1]
+    ; (used) two registers (reg1, reg2); r1 is left at second_offset
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Store into output (sometimes as temporary storage; addressed through r1)
+    ;   output[first_offset] = reg1
+    ;   output[second_offset] = reg2
+    ;   for proper address calculation, the last offset used when manipulating
+    ;   output (whether reading or storing) must be passed in. use 0 for first
+    ;   use.
+    MACRO
+    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+    ; address calculation (one offset unit = 32 int16 = 64 bytes) and storing
+    add r1, #($first_offset  - $prev_offset )*32*2
+    vst1.16 {$reg1}, [r1]
+    add r1, #($second_offset - $first_offset)*32*2
+    vst1.16 {$reg2}, [r1]
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Combine-add results with current destination content (uses r9, r10, r2, r11)
+    ;   q6-q9 contain the results (out[j * 32 + 0-31])
+    MACRO
+    STORE_COMBINE_CENTER_RESULTS
+    ; load 8 dest bytes from each of four rows: r10, r10 + r2, r9, r9 + r11
+    vld1.s16        {d8}, [r10], r2
+    vld1.s16        {d11}, [r9], r11
+    vld1.s16        {d9}, [r10]
+    vld1.s16        {d10}, [r9]
+    ; ROUND_POWER_OF_TWO (rounding shift right by 6)
+    vrshr.s16       q7, q7, #6
+    vrshr.s16       q8, q8, #6
+    vrshr.s16       q9, q9, #6
+    vrshr.s16       q6, q6, #6
+    ; add to dest[j * dest_stride + 0-31] (dest bytes widened to 16 bits)
+    vaddw.u8        q7, q7, d9
+    vaddw.u8        q8, q8, d10
+    vaddw.u8        q9, q9, d11
+    vaddw.u8        q6, q6, d8
+    ; clip pixel (saturating narrow from signed 16 bit to unsigned 8 bit)
+    vqmovun.s16     d9,  q7
+    vqmovun.s16     d10, q8
+    vqmovun.s16     d11, q9
+    vqmovun.s16     d8,  q6
+    ; store back to the same four rows; r10 and r9 return to their entry values
+    vst1.16         {d9}, [r10], r11
+    vst1.16         {d10}, [r9], r2
+    vst1.16         {d8}, [r10]
+    vst1.16         {d11}, [r9]
+    ; update pointers (by dest_stride * 2): r9 moves back, r10 moves forward
+    sub r9,  r9,  r2, lsl #1
+    add r10, r10, r2, lsl #1
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Combine-add results with current destination content (uses r9, r10, r2, r11)
+    ;   q6-q9 contain the results (out[j * 32 + 0-31])
+    MACRO
+    STORE_COMBINE_CENTER_RESULTS_LAST
+    ; load 8 dest bytes from each of four rows: r10, r10 + r2, r9, r9 + r11
+    vld1.s16        {d8}, [r10], r2
+    vld1.s16        {d11}, [r9], r11
+    vld1.s16        {d9}, [r10]
+    vld1.s16        {d10}, [r9]
+    ; ROUND_POWER_OF_TWO (rounding shift right by 6)
+    vrshr.s16       q7, q7, #6
+    vrshr.s16       q8, q8, #6
+    vrshr.s16       q9, q9, #6
+    vrshr.s16       q6, q6, #6
+    ; add to dest[j * dest_stride + 0-31] (dest bytes widened to 16 bits)
+    vaddw.u8        q7, q7, d9
+    vaddw.u8        q8, q8, d10
+    vaddw.u8        q9, q9, d11
+    vaddw.u8        q6, q6, d8
+    ; clip pixel (saturating narrow from signed 16 bit to unsigned 8 bit)
+    vqmovun.s16     d9,  q7
+    vqmovun.s16     d10, q8
+    vqmovun.s16     d11, q9
+    vqmovun.s16     d8,  q6
+    ; store back; unlike the non-LAST variant the final two stores write back,
+    vst1.16         {d9}, [r10], r11
+    vst1.16         {d10}, [r9], r2
+    vst1.16         {d8}, [r10]!
+    vst1.16         {d11}, [r9]!
+    ; so r9/r10 also advance by the 8 bytes stored, then move by dest_stride * 2
+    sub r9,  r9,  r2, lsl #1
+    add r10, r10, r2, lsl #1
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Combine-add results with current destination content (uses r6, r7, r2, r11)
+    ;   q4-q7 contain the results (out[j * 32 + 0-31])
+    MACRO
+    STORE_COMBINE_EXTREME_RESULTS
+    ; load 8 dest bytes from each of four rows: r7, r7 + r2, r6, r6 + r11
+    vld1.s16        {d4}, [r7], r2
+    vld1.s16        {d7}, [r6], r11
+    vld1.s16        {d5}, [r7]
+    vld1.s16        {d6}, [r6]
+    ; ROUND_POWER_OF_TWO (rounding shift right by 6)
+    vrshr.s16       q5, q5, #6
+    vrshr.s16       q6, q6, #6
+    vrshr.s16       q7, q7, #6
+    vrshr.s16       q4, q4, #6
+    ; add to dest[j * dest_stride + 0-31] (dest bytes widened to 16 bits)
+    vaddw.u8        q5, q5, d5
+    vaddw.u8        q6, q6, d6
+    vaddw.u8        q7, q7, d7
+    vaddw.u8        q4, q4, d4
+    ; clip pixel (saturating narrow from signed 16 bit to unsigned 8 bit)
+    vqmovun.s16     d5, q5
+    vqmovun.s16     d6, q6
+    vqmovun.s16     d7, q7
+    vqmovun.s16     d4, q4
+    ; store back to the same four rows; r7 and r6 return to their entry values
+    vst1.16         {d5}, [r7], r11
+    vst1.16         {d6}, [r6], r2
+    vst1.16         {d7}, [r6]
+    vst1.16         {d4}, [r7]
+    ; update pointers (by dest_stride * 2): r6 moves back, r7 moves forward
+    sub r6, r6, r2, lsl #1
+    add r7, r7, r2, lsl #1
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Combine-add results with current destination content (uses r6, r7, r2, r11)
+    ;   q4-q7 contain the results (out[j * 32 + 0-31])
+    MACRO
+    STORE_COMBINE_EXTREME_RESULTS_LAST
+    ; load 8 dest bytes from each of four rows: r7, r7 + r2, r6, r6 + r11
+    vld1.s16        {d4}, [r7], r2
+    vld1.s16        {d7}, [r6], r11
+    vld1.s16        {d5}, [r7]
+    vld1.s16        {d6}, [r6]
+    ; ROUND_POWER_OF_TWO (rounding shift right by 6)
+    vrshr.s16       q5, q5, #6
+    vrshr.s16       q6, q6, #6
+    vrshr.s16       q7, q7, #6
+    vrshr.s16       q4, q4, #6
+    ; add to dest[j * dest_stride + 0-31] (dest bytes widened to 16 bits)
+    vaddw.u8        q5, q5, d5
+    vaddw.u8        q6, q6, d6
+    vaddw.u8        q7, q7, d7
+    vaddw.u8        q4, q4, d4
+    ; clip pixel (saturating narrow from signed 16 bit to unsigned 8 bit)
+    vqmovun.s16     d5, q5
+    vqmovun.s16     d6, q6
+    vqmovun.s16     d7, q7
+    vqmovun.s16     d4, q4
+    ; store back; unlike the non-LAST variant the final two stores write back,
+    vst1.16         {d5}, [r7], r11
+    vst1.16         {d6}, [r6], r2
+    vst1.16         {d7}, [r6]!
+    vst1.16         {d4}, [r7]!
+    ; so r6/r7 also advance by the 8 bytes stored, then move by dest_stride * 2
+    sub r6, r6, r2, lsl #1
+    add r7, r7, r2, lsl #1
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Touches q8-q12, q15 and scalar registers r8, r12 (q13-q14 are preserved)
+    ; valid output registers are anything but q8-q11
+    MACRO
+    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+    ; TODO(cd): have special case to re-use constants when they are similar for
+    ;           consecutive butterflies
+    ; TODO(cd): have special case when both constants are the same, do the
+    ;           additions/subtractions before the multiplies.
+    ; generate the constants (c1 = first_constant, c2 = second_constant)
+    ;   generate scalar constants in two steps: high byte, then low byte
+    mov             r8,  #$first_constant  & 0xFF00
+    mov             r12, #$second_constant & 0xFF00
+    add             r8,  #$first_constant  & 0x00FF
+    add             r12, #$second_constant & 0x00FF
+    ;   generate vector constants (d30 = c1 in every lane, d31 = c2 in every lane)
+    vdup.16         d30, r8
+    vdup.16         d31, r12
+    ; (used) two for inputs (regA-regD), one for constants (q15)
+    ; do some multiplications (ordered for maximum latency hiding)
+    vmull.s16 q8,  $regC, d30
+    vmull.s16 q10, $regA, d31
+    vmull.s16 q9,  $regD, d30
+    vmull.s16 q11, $regB, d31
+    vmull.s16 q12, $regC, d31
+    ; (used) five for intermediate (q8-q12), one for constants (q15)
+    ; q8 = regC*c1 - regA*c2, q9 = regD*c1 - regB*c2 (frees q10 and q11)
+    vsub.s32  q8, q8, q10
+    vsub.s32  q9, q9, q11
+    ; do more multiplications (ordered for maximum latency hiding)
+    vmull.s16 q10, $regD, d31
+    vmull.s16 q11, $regA, d30
+    vmull.s16 q15, $regB, d30
+    ; (used) six for intermediate (q8-q12, q15); the constants in q15 are gone
+    ; q11 = regC*c2 + regA*c1, q10 = regD*c2 + regB*c1
+    vadd.s32  q11, q12, q11
+    vadd.s32  q10, q10, q15
+    ; (used) four for intermediate (q8-q11)
+    ; dct_const_round_shift: rounding, saturating narrow by 14 bits
+    vqrshrn.s32 $reg1, q8,  #14
+    vqrshrn.s32 $reg2, q9,  #14
+    vqrshrn.s32 $reg3, q11, #14
+    vqrshrn.s32 $reg4, q10, #14
+    ; results: reg1, reg2 hold the difference terms, reg3, reg4 the sum terms
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Butterfly on q14 (d28, d29 as regC, regD) and q13 (d26, d27 as regA, regB)
+    ; Touches q8-q12, q15; valid output registers are anything but q8-q11
+    MACRO
+    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+    MEND
+    ; --------------------------------------------------------------------------
+
+;void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
+;
+;   r0  int16_t *input,
+;   r1  uint8_t *dest,
+;   r2  int dest_stride)
+; loop counters
+;   r4  bands loop counter
+;   r5  pass loop counter
+;   r8  transpose loop counter
+; combine-add pointers
+;   r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
+;   r7  dest +  0 * dest_stride, ascending  (1, 2, 3, ...)
+;   r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
+;   r10 dest + 16 * dest_stride, ascending  (17, 18, 19, ...)
+
+|vpx_idct32x32_1024_add_neon| PROC
+    ; This function does one pass of idct32x32 transform.
+    ;
+    ; This is done by transposing the input and then doing a 1d transform on
+    ; columns. In the first pass, the transposed columns are the original
+    ; rows. In the second pass, after the transposition, the columns are the
+    ; original columns.
+    ; The 1d transform is done by looping over bands of eight columns (the
+    ; idct32_bands loop). For each band, the transform input transposition
+    ; is done on demand, one band of four 8x8 matrices at a time. The four
+    ; matrices are transposed by pairs (the idct32_transpose_pair loop).
+    push  {r4-r11}
+    vpush {d8-d15}
+    ; stack operation
+    ; internal buffer used to transpose 8 lines into before transforming them
+    ;   int16_t transpose_buffer[32 * 8];
+    ;   at sp + [4096, 4607]
+    ; results of the first pass (transpose and transform rows)
+    ;   int16_t pass1[32 * 32];
+    ;   at sp + [0, 2047]
+    ; results of the second pass (transpose and transform columns)
+    ;   int16_t pass2[32 * 32];
+    ;   at sp + [2048, 4095]
+    sub sp, sp, #512+2048+2048
+
+    ; r6  = dest + 31 * dest_stride
+    ; r7  = dest +  0 * dest_stride
+    ; r9  = dest + 15 * dest_stride
+    ; r10 = dest + 16 * dest_stride
+    rsb r6,  r2, r2, lsl #5
+    rsb r9,  r2, r2, lsl #4
+    add r10, r1, r2, lsl #4
+    mov r7, r1
+    add r6, r6, r1
+    add r9, r9, r1
+    ; r11 = -dest_stride
+    neg r11, r2
+    ; r3 = input
+    mov r3, r0
+    ; parameters for first pass
+      ; r0 = transpose_buffer[32 * 8]
+    add r0, sp, #4096
+      ; r1 = pass1[32 * 32]
+    mov r1, sp
+
+    mov r5, #0          ; initialize pass loop counter
+idct32_pass_loop
+    mov r4, #4          ; initialize bands loop counter
+idct32_bands_loop
+    mov r8, #2          ; initialize transpose loop counter
+idct32_transpose_pair_loop
+    ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
+    ; into q0-q7 and the second one into q8-q15. There is a stride of 64,
+    ; adjusted to 32 because of the two post-increments.
+    vld1.s16        {q8},  [r3]!
+    vld1.s16        {q0},  [r3]!
+    add r3, #32
+    vld1.s16        {q9},  [r3]!
+    vld1.s16        {q1},  [r3]!
+    add r3, #32
+    vld1.s16        {q10}, [r3]!
+    vld1.s16        {q2},  [r3]!
+    add r3, #32
+    vld1.s16        {q11}, [r3]!
+    vld1.s16        {q3},  [r3]!
+    add r3, #32
+    vld1.s16        {q12}, [r3]!
+    vld1.s16        {q4},  [r3]!
+    add r3, #32
+    vld1.s16        {q13}, [r3]!
+    vld1.s16        {q5},  [r3]!
+    add r3, #32
+    vld1.s16        {q14}, [r3]!
+    vld1.s16        {q6},  [r3]!
+    add r3, #32
+    vld1.s16        {q15}, [r3]!
+    vld1.s16        {q7},  [r3]!
+
+    ; Transpose the two 8x8 16bit data matrices.
+    vswp            d17, d24
+    vswp            d23, d30
+    vswp            d21, d28
+    vswp            d19, d26
+    vswp            d1,  d8
+    vswp            d7,  d14
+    vswp            d5,  d12
+    vswp            d3,  d10
+    vtrn.32         q8,  q10
+    vtrn.32         q9,  q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.32         q0,  q2
+    vtrn.32         q1,  q3
+    vtrn.32         q4,  q6
+    vtrn.32         q5,  q7
+    vtrn.16         q8,  q9
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    vtrn.16         q0,  q1
+    vtrn.16         q2,  q3
+    vtrn.16         q4,  q5
+    vtrn.16         q6,  q7
+
+    ; Store both matrices after each other. There is a stride of 32, which
+    ; adjusts to nothing because of the post-increments.
+    vst1.16        {q8},  [r0]!
+    vst1.16        {q9},  [r0]!
+    vst1.16        {q10}, [r0]!
+    vst1.16        {q11}, [r0]!
+    vst1.16        {q12}, [r0]!
+    vst1.16        {q13}, [r0]!
+    vst1.16        {q14}, [r0]!
+    vst1.16        {q15}, [r0]!
+    vst1.16        {q0},  [r0]!
+    vst1.16        {q1},  [r0]!
+    vst1.16        {q2},  [r0]!
+    vst1.16        {q3},  [r0]!
+    vst1.16        {q4},  [r0]!
+    vst1.16        {q5},  [r0]!
+    vst1.16        {q6},  [r0]!
+    vst1.16        {q7},  [r0]!
+
+    ; increment pointers by adjusted stride (not necessary for r0/out)
+    ;   go back by 7*32 for the seven lines moved fully by read and add
+    ;   go back by 32 for the eighth line only read
+    ;   advance by 16*2 to go the next pair
+    sub r3,  r3,  #7*32*2 + 32 - 16*2
+    ; transpose pair loop processing
+    subs r8, r8, #1
+    bne idct32_transpose_pair_loop
+
+    ; restore r0/input to its original value
+    sub r0, r0, #32*8*2
+
+    ; Instead of doing the transforms stage by stage, it is done by loading
+    ; some input values and doing as many stages as possible to minimize the
+    ; storing/loading of intermediate results. To fit within registers, the
+    ; final coefficients are cut into four blocks:
+    ; BLOCK A: 16-19,28-31
+    ; BLOCK B: 20-23,24-27
+    ; BLOCK C: 8-10,11-15
+    ; BLOCK D: 0-3,4-7
+    ; Blocks A and C are straight calculation through the various stages. In
+    ; block B, further calculations are performed using the results from
+    ; block A. In block D, further calculations are performed using the results
+    ; from block C and then the final calculations are done using results from
+    ; block A and B which have been combined at the end of block B.
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK A: 16-19,28-31
+    ; --------------------------------------------------------------------------
+    ; generate 16,17,30,31
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] *  cospi_1_64;
+    ;temp2 = input[1 * 32] *  cospi_1_64 + input[31 * 32] * cospi_31_64;
+    ;step1b[16][i] = dct_const_round_shift(temp1);
+    ;step1b[31][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 0, 1, 31
+    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
+    ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
+    ;step1b[17][i] = dct_const_round_shift(temp1);
+    ;step1b[30][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 31, 17, 15
+    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[16] =  step1b[16][i] + step1b[17][i];
+    ;step2[17] =  step1b[16][i] - step1b[17][i];
+    ;step2[30] = -step1b[30][i] + step1b[31][i];
+    ;step2[31] =  step1b[30][i] + step1b[31][i];
+    vadd.s16  q4, q0, q1
+    vsub.s16  q13, q0, q1
+    vadd.s16  q6, q2, q3
+    vsub.s16  q14, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
+    ;temp2 = step1b[30][i] * cospi_4_64  + step1b[17][i] * cospi_28_64;
+    ;step3[17] = dct_const_round_shift(temp1);
+    ;step3[30] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; generate 18,19,28,29
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
+    ;temp2 = input[9 * 32] *  cospi_9_64 + input[23 * 32] * cospi_23_64;
+    ;step1b[18][i] = dct_const_round_shift(temp1);
+    ;step1b[29][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 15, 9, 23
+    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[25 * 32] *  cospi_7_64 - input[7 * 32] * cospi_25_64;
+    ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
+    ;step1b[19][i] = dct_const_round_shift(temp1);
+    ;step1b[28][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 23, 25, 7
+    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[18] = -step1b[18][i] + step1b[19][i];
+    ;step2[19] =  step1b[18][i] + step1b[19][i];
+    ;step2[28] =  step1b[28][i] + step1b[29][i];
+    ;step2[29] =  step1b[28][i] - step1b[29][i];
+    vsub.s16  q13, q3, q2
+    vadd.s16  q3,  q3, q2
+    vsub.s16  q14, q1, q0
+    vadd.s16  q2,  q1, q0
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[18][i] * (-cospi_4_64)  - step1b[29][i] * (-cospi_28_64);
+    ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
+    ;step3[29] = dct_const_round_shift(temp1);
+    ;step3[18] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
+    ; --------------------------------------------------------------------------
+    ; combine 16-19,28-31
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[16] = step1b[16][i] + step1b[19][i];
+    ;step1[17] = step1b[17][i] + step1b[18][i];
+    ;step1[18] = step1b[17][i] - step1b[18][i];
+    ;step1[29] = step1b[30][i] - step1b[29][i];
+    ;step1[30] = step1b[30][i] + step1b[29][i];
+    ;step1[31] = step1b[31][i] + step1b[28][i];
+    vadd.s16  q8,  q4, q2
+    vadd.s16  q9,  q5, q0
+    vadd.s16  q10, q7, q1
+    vadd.s16  q15, q6, q3
+    vsub.s16  q13, q5, q0
+    vsub.s16  q14, q7, q1
+    STORE_IN_OUTPUT 0,  16, 31, q8,  q15
+    STORE_IN_OUTPUT 31, 17, 30, q9,  q10
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
+    ;temp2 = step1b[29][i] * cospi_8_64  + step1b[18][i] * cospi_24_64;
+    ;step2[18] = dct_const_round_shift(temp1);
+    ;step2[29] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
+    STORE_IN_OUTPUT 30, 29, 18, q1, q0
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[19] = step1b[16][i] - step1b[19][i];
+    ;step1[28] = step1b[31][i] - step1b[28][i];
+    vsub.s16  q13, q4, q2
+    vsub.s16  q14, q6, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
+    ;temp2 = step1b[28][i] * cospi_8_64  + step1b[19][i] * cospi_24_64;
+    ;step2[19] = dct_const_round_shift(temp1);
+    ;step2[28] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
+    STORE_IN_OUTPUT 18, 19, 28, q4, q6
+    ; --------------------------------------------------------------------------
+
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK B: 20-23,24-27
+    ; --------------------------------------------------------------------------
+    ; generate 20,21,26,27
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
+    ;temp2 = input[5 * 32] *  cospi_5_64 + input[27 * 32] * cospi_27_64;
+    ;step1b[20][i] = dct_const_round_shift(temp1);
+    ;step1b[27][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 7, 5, 27
+    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
+    ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
+    ;step1b[21][i] = dct_const_round_shift(temp1);
+    ;step1b[26][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 27, 21, 11
+    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[20] =  step1b[20][i] + step1b[21][i];
+    ;step2[21] =  step1b[20][i] - step1b[21][i];
+    ;step2[26] = -step1b[26][i] + step1b[27][i];
+    ;step2[27] =  step1b[26][i] + step1b[27][i];
+    vsub.s16  q13, q0, q1
+    vadd.s16  q0, q0, q1
+    vsub.s16  q14, q2, q3
+    vadd.s16  q2, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
+    ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
+    ;step3[21] = dct_const_round_shift(temp1);
+    ;step3[26] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; generate 22,23,24,25
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
+    ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
+    ;step1b[22][i] = dct_const_round_shift(temp1);
+    ;step1b[25][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 11, 13, 19
+    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[29 * 32] *  cospi_3_64 - input[3 * 32] * cospi_29_64;
+    ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
+    ;step1b[23][i] = dct_const_round_shift(temp1);
+    ;step1b[24][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 19, 29, 3
+    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[22] = -step1b[22][i] + step1b[23][i];
+    ;step2[23] =  step1b[22][i] + step1b[23][i];
+    ;step2[24] =  step1b[24][i] + step1b[25][i];
+    ;step2[25] =  step1b[24][i] - step1b[25][i];
+    vsub.s16  q14, q4, q5
+    vadd.s16  q5, q4, q5
+    vsub.s16  q13, q6, q7
+    vadd.s16  q6, q6, q7
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
+    ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
+    ;step3[25] = dct_const_round_shift(temp1);
+    ;step3[22] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
+    ; --------------------------------------------------------------------------
+    ; combine 20-23,24-27
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[22] = step1b[22][i] + step1b[21][i];
+    ;step1[23] = step1b[23][i] + step1b[20][i];
+    vadd.s16  q10, q7, q1
+    vadd.s16  q11, q5, q0
+    ;step1[24] = step1b[24][i] + step1b[27][i];
+    ;step1[25] = step1b[25][i] + step1b[26][i];
+    vadd.s16  q12, q6, q2
+    vadd.s16  q15, q4, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[16] = step1b[16][i] + step1b[23][i];
+    ;step3[17] = step1b[17][i] + step1b[22][i];
+    ;step3[22] = step1b[17][i] - step1b[22][i];
+    ;step3[23] = step1b[16][i] - step1b[23][i];
+    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
+    vadd.s16  q8,  q14, q11
+    vadd.s16  q9,  q13, q10
+    vsub.s16  q13, q13, q10
+    vsub.s16  q11, q14, q11
+    STORE_IN_OUTPUT 17, 17, 16, q9, q8
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[24] = step1b[31][i] - step1b[24][i];
+    ;step3[25] = step1b[30][i] - step1b[25][i];
+    ;step3[30] = step1b[30][i] + step1b[25][i];
+    ;step3[31] = step1b[31][i] + step1b[24][i];
+    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
+    vsub.s16  q8,  q9,  q12
+    vadd.s16  q10, q14, q15
+    vsub.s16  q14, q14, q15
+    vadd.s16  q12, q9,  q12
+    STORE_IN_OUTPUT 31, 30, 31, q10, q12
+    ; --------------------------------------------------------------------------
+    ; TODO(cd) do some register allocation change to remove these push/pop
+    vpush {q8}  ; [24]
+    vpush {q11} ; [23]
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
+    ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
+    ;step1[22] = dct_const_round_shift(temp1);
+    ;step1[25] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+    STORE_IN_OUTPUT 31, 25, 22, q14, q13
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
+    ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
+    ;step1[23] = dct_const_round_shift(temp1);
+    ;step1[24] = dct_const_round_shift(temp2);
+    ; TODO(cd) do some register allocation change to remove these push/pop
+    vpop  {q13} ; [23]
+    vpop  {q14} ; [24]
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+    STORE_IN_OUTPUT 22, 24, 23, q14, q13
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[20] = step1b[23][i] - step1b[20][i];
+    ;step1[27] = step1b[24][i] - step1b[27][i];
+    vsub.s16  q14, q5, q0
+    vsub.s16  q13, q6, q2
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[20][i] * (-cospi_8_64)  - step1b[27][i] * (-cospi_24_64);
+    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
+    ;step2[27] = dct_const_round_shift(temp1);
+    ;step2[20] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[21] = step1b[22][i] - step1b[21][i];
+    ;step1[26] = step1b[25][i] - step1b[26][i];
+    vsub.s16  q14,  q7, q1
+    vsub.s16  q13,  q4, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[21][i] * (-cospi_8_64)  - step1b[26][i] * (-cospi_24_64);
+    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
+    ;step2[26] = dct_const_round_shift(temp1);
+    ;step2[21] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[18] = step1b[18][i] + step1b[21][i];
+    ;step3[19] = step1b[19][i] + step1b[20][i];
+    ;step3[20] = step1b[19][i] - step1b[20][i];
+    ;step3[21] = step1b[18][i] - step1b[21][i];
+    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
+    vadd.s16  q8,  q14, q1
+    vadd.s16  q9,  q13, q6
+    vsub.s16  q13, q13, q6
+    vsub.s16  q1,  q14, q1
+    STORE_IN_OUTPUT 19, 18, 19, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[27] = step1b[28][i] - step1b[27][i];
+    ;step3[28] = step1b[28][i] + step1b[27][i];
+    ;step3[29] = step1b[29][i] + step1b[26][i];
+    ;step3[26] = step1b[29][i] - step1b[26][i];
+    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
+    vsub.s16  q14, q8, q5
+    vadd.s16  q10, q8, q5
+    vadd.s16  q11, q9, q0
+    vsub.s16  q0, q9, q0
+    STORE_IN_OUTPUT 29, 28, 29, q10, q11
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
+    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
+    ;step1[20] = dct_const_round_shift(temp1);
+    ;step1[27] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+    STORE_IN_OUTPUT 29, 20, 27, q13, q14
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
+    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
+    ;step1[21] = dct_const_round_shift(temp1);
+    ;step1[26] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
+    STORE_IN_OUTPUT 27, 21, 26, q1, q0
+    ; --------------------------------------------------------------------------
+
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK C: 8-10,11-15
+    ; --------------------------------------------------------------------------
+    ; generate 8,9,14,15
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
+    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
+    ;step2[8] = dct_const_round_shift(temp1);
+    ;step2[15] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 3, 2, 30
+    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
+    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
+    ;step2[9] = dct_const_round_shift(temp1);
+    ;step2[14] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 30, 18, 14
+    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;step3[8] = step1b[8][i] + step1b[9][i];
+    ;step3[9] = step1b[8][i] - step1b[9][i];
+    ;step3[14] = step1b[15][i] - step1b[14][i];
+    ;step3[15] = step1b[15][i] + step1b[14][i];
+    vsub.s16  q13, q0, q1
+    vadd.s16  q0, q0, q1
+    vsub.s16  q14, q2, q3
+    vadd.s16  q2, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
+    ;temp2 = step1b[14][i] * cospi_8_64  + step1b[9][i] * cospi_24_64;
+    ;step1[9]  = dct_const_round_shift(temp1);
+    ;step1[14] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; generate 10,11,12,13
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
+    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
+    ;step2[10] = dct_const_round_shift(temp1);
+    ;step2[13] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 14, 10, 22
+    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
+    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
+    ;step2[11] = dct_const_round_shift(temp1);
+    ;step2[12] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 22, 26, 6
+    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;step3[10] = step1b[11][i] - step1b[10][i];
+    ;step3[11] = step1b[11][i] + step1b[10][i];
+    ;step3[12] = step1b[12][i] + step1b[13][i];
+    ;step3[13] = step1b[12][i] - step1b[13][i];
+    vsub.s16  q14, q4, q5
+    vadd.s16  q5, q4, q5
+    vsub.s16  q13, q6, q7
+    vadd.s16  q6, q6, q7
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = step1b[10][i] * (-cospi_8_64)  - step1b[13][i] * (-cospi_24_64);
+    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
+    ;step1[13] = dct_const_round_shift(temp1);
+    ;step1[10] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
+    ; --------------------------------------------------------------------------
+    ; combine 8-10,11-15
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;step2[8]  = step1b[8][i] + step1b[11][i];
+    ;step2[9]  = step1b[9][i] + step1b[10][i];
+    ;step2[10] = step1b[9][i] - step1b[10][i];
+    vadd.s16  q8,  q0, q5
+    vadd.s16  q9,  q1, q7
+    vsub.s16  q13, q1, q7
+    ;step2[13] = step1b[14][i] - step1b[13][i];
+    ;step2[14] = step1b[14][i] + step1b[13][i];
+    ;step2[15] = step1b[15][i] + step1b[12][i];
+    vsub.s16  q14, q3, q4
+    vadd.s16  q10, q3, q4
+    vadd.s16  q15, q2, q6
+    STORE_IN_OUTPUT 26, 8, 15, q8, q15
+    STORE_IN_OUTPUT 15, 9, 14, q9, q10
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
+    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
+    ;step3[10] = dct_const_round_shift(temp1);
+    ;step3[13] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+    STORE_IN_OUTPUT 14, 13, 10, q3, q1
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;step2[11] = step1b[8][i] - step1b[11][i];
+    ;step2[12] = step1b[15][i] - step1b[12][i];
+    vsub.s16  q13, q0, q5
+    vsub.s16  q14,  q2, q6
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
+    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
+    ;step3[11] = dct_const_round_shift(temp1);
+    ;step3[12] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+    STORE_IN_OUTPUT 10, 11, 12, q1, q3
+    ; --------------------------------------------------------------------------
+
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK D: 0-3,4-7
+    ; --------------------------------------------------------------------------
+    ; generate 4,5,6,7
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
+    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
+    ;step3[4] = dct_const_round_shift(temp1);
+    ;step3[7] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 6, 4, 28
+    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
+    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
+    ;step3[5] = dct_const_round_shift(temp1);
+    ;step3[6] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 28, 20, 12
+    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[4] = step1b[4][i] + step1b[5][i];
+    ;step1[5] = step1b[4][i] - step1b[5][i];
+    ;step1[6] = step1b[7][i] - step1b[6][i];
+    ;step1[7] = step1b[7][i] + step1b[6][i];
+    vsub.s16  q13, q0, q1
+    vadd.s16  q0, q0, q1
+    vsub.s16  q14, q2, q3
+    vadd.s16  q2, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
+    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
+    ;step2[5] = dct_const_round_shift(temp1);
+    ;step2[6] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; generate 0,1,2,3
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
+    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
+    ;step1[1] = dct_const_round_shift(temp1);
+    ;step1[0] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 12, 0, 16
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
+    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
+    ;step1[2] = dct_const_round_shift(temp1);
+    ;step1[3] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 16, 8, 24
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;step2[0] = step1b[0][i] + step1b[3][i];
+    ;step2[1] = step1b[1][i] + step1b[2][i];
+    ;step2[2] = step1b[1][i] - step1b[2][i];
+    ;step2[3] = step1b[0][i] - step1b[3][i];
+    vadd.s16  q4, q7, q6
+    vsub.s16  q7, q7, q6
+    vsub.s16  q6, q5, q14
+    vadd.s16  q5, q5, q14
+    ; --------------------------------------------------------------------------
+    ; combine 0-3,4-7
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[0] = step1b[0][i] + step1b[7][i];
+    ;step3[1] = step1b[1][i] + step1b[6][i];
+    ;step3[2] = step1b[2][i] + step1b[5][i];
+    ;step3[3] = step1b[3][i] + step1b[4][i];
+    vadd.s16  q8,  q4, q2
+    vadd.s16  q9,  q5, q3
+    vadd.s16  q10, q6, q1
+    vadd.s16  q11, q7, q0
+    ;step3[4] = step1b[3][i] - step1b[4][i];
+    ;step3[5] = step1b[2][i] - step1b[5][i];
+    ;step3[6] = step1b[1][i] - step1b[6][i];
+    ;step3[7] = step1b[0][i] - step1b[7][i];
+    vsub.s16  q12, q7, q0
+    vsub.s16  q13, q6, q1
+    vsub.s16  q14, q5, q3
+    vsub.s16  q15, q4, q2
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[0] = step1b[0][i] + step1b[15][i];
+    ;step1[1] = step1b[1][i] + step1b[14][i];
+    ;step1[14] = step1b[1][i] - step1b[14][i];
+    ;step1[15] = step1b[0][i] - step1b[15][i];
+    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
+    vadd.s16  q2, q8, q1
+    vadd.s16  q3, q9, q0
+    vsub.s16  q4, q9, q0
+    vsub.s16  q5, q8, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
+    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
+    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
+    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
+    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+
+    cmp r5, #0
+    bgt idct32_bands_end_2nd_pass
+
+idct32_bands_end_1st_pass
+    STORE_IN_OUTPUT 17, 16, 17, q6, q7
+    STORE_IN_OUTPUT 17, 14, 15, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+    LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 31, 30, 31, q6, q7
+    STORE_IN_OUTPUT 31,  0,  1, q4, q5
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[2] = step1b[2][i] + step1b[13][i];
+    ;step1[3] = step1b[3][i] + step1b[12][i];
+    ;step1[12] = step1b[3][i] - step1b[12][i];
+    ;step1[13] = step1b[2][i] - step1b[13][i];
+    LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
+    vadd.s16  q2, q10, q1
+    vadd.s16  q3, q11, q0
+    vsub.s16  q4, q11, q0
+    vsub.s16  q5, q10, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_IN_OUTPUT 19, 18, 19, q6, q7
+    STORE_IN_OUTPUT 19, 12, 13, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+    LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 29, 28, 29, q6, q7
+    STORE_IN_OUTPUT 29,  2,  3, q4, q5
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[4] = step1b[4][i] + step1b[11][i];
+    ;step1[5] = step1b[5][i] + step1b[10][i];
+    ;step1[10] = step1b[5][i] - step1b[10][i];
+    ;step1[11] = step1b[4][i] - step1b[11][i];
+    LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
+    vadd.s16  q2, q12, q1
+    vadd.s16  q3, q13, q0
+    vsub.s16  q4, q13, q0
+    vsub.s16  q5, q12, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_IN_OUTPUT 21, 20, 21, q6, q7
+    STORE_IN_OUTPUT 21, 10, 11, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+    LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 27, 26, 27, q6, q7
+    STORE_IN_OUTPUT 27,  4,  5, q4, q5
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[6] = step1b[6][i] + step1b[9][i];
+    ;step1[7] = step1b[7][i] + step1b[8][i];
+    ;step1[8] = step1b[7][i] - step1b[8][i];
+    ;step1[9] = step1b[6][i] - step1b[9][i];
+    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
+    vadd.s16  q2, q14, q1
+    vadd.s16  q3, q15, q0
+    vsub.s16  q4, q15, q0
+    vsub.s16  q5, q14, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_IN_OUTPUT 23, 22, 23, q6, q7
+    STORE_IN_OUTPUT 23, 8, 9, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+    LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 25, 24, 25, q6, q7
+    STORE_IN_OUTPUT 25,  6,  7, q4, q5
+
+    ; restore r0 by removing the last offset from the last
+    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+    sub r0, r0, #24*8*2
+    ; restore r1 by removing the last offset from the last
+    ;     operation (STORE_IN_OUTPUT 25,  6,  7) => 7*32*2
+    ; advance by 8 columns => 8*2
+    sub r1, r1, #7*32*2 - 8*2
+    ;   advance by 8 lines (8*32*2)
+    ;   go back by the two pairs from the loop (32*2)
+    add r3, r3, #8*32*2 - 32*2
+
+    ; bands loop processing
+    subs r4, r4, #1
+    bne idct32_bands_loop
+
+    ; parameters for second pass
+    ; the input of pass2 is the result of pass1. we have to remove the offset
+    ;   of 32 columns induced by the above idct32_bands_loop
+    sub r3, r1, #32*2
+      ; r1 = pass2[32 * 32]
+    add r1, sp, #2048
+
+    ; pass loop processing
+    add r5, r5, #1
+    b idct32_pass_loop
+
+idct32_bands_end_2nd_pass
+    STORE_COMBINE_CENTER_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_COMBINE_EXTREME_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[2] = step1b[2][i] + step1b[13][i];
+    ;step1[3] = step1b[3][i] + step1b[12][i];
+    ;step1[12] = step1b[3][i] - step1b[12][i];
+    ;step1[13] = step1b[2][i] - step1b[13][i];
+    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
+    vadd.s16  q2, q10, q1
+    vadd.s16  q3, q11, q0
+    vsub.s16  q4, q11, q0
+    vsub.s16  q5, q10, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_COMBINE_CENTER_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_COMBINE_EXTREME_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[4] = step1b[4][i] + step1b[11][i];
+    ;step1[5] = step1b[5][i] + step1b[10][i];
+    ;step1[10] = step1b[5][i] - step1b[10][i];
+    ;step1[11] = step1b[4][i] - step1b[11][i];
+    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
+    vadd.s16  q2, q12, q1
+    vadd.s16  q3, q13, q0
+    vsub.s16  q4, q13, q0
+    vsub.s16  q5, q12, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_COMBINE_CENTER_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_COMBINE_EXTREME_RESULTS
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[6] = step1b[6][i] + step1b[9][i];
+    ;step1[7] = step1b[7][i] + step1b[8][i];
+    ;step1[8] = step1b[7][i] - step1b[8][i];
+    ;step1[9] = step1b[6][i] - step1b[9][i];
+    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
+    vadd.s16  q2, q14, q1
+    vadd.s16  q3, q15, q0
+    vsub.s16  q4, q15, q0
+    vsub.s16  q5, q14, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_COMBINE_CENTER_RESULTS_LAST
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_COMBINE_EXTREME_RESULTS_LAST
+    ; --------------------------------------------------------------------------
+    ; restore pointers to their initial indices for next band pass by
+    ;     removing/adding dest_stride * 8. The actual increment by eight
+    ;     is taken care of within the _LAST macros.
+    add r6,  r6,  r2, lsl #3
+    add r9,  r9,  r2, lsl #3
+    sub r7,  r7,  r2, lsl #3
+    sub r10, r10, r2, lsl #3
+
+    ; restore r0 by removing the last offset from the last
+    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
+    sub r0, r0, #24*8*2
+    ; restore r1 by removing the last offset from the last
+    ;     operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
+    ; advance by 8 columns => 8*2
+    sub r1, r1, #25*32*2 - 8*2
+    ;   advance by 8 lines (8*32*2)
+    ;   go back by the two pairs from the loop (32*2)
+    add r3, r3, #8*32*2 - 32*2
+
+    ; bands loop processing
+    subs r4, r4, #1
+    bne idct32_bands_loop
+
+    ; stack operation
+    add sp, sp, #512+2048+2048
+    vpop {d8-d15}
+    pop  {r4-r11}
+    bx              lr
+    ENDP  ; |vpx_idct32x32_1024_add_neon|
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
new file mode 100644
index 0000000000..025437eb96
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
@@ -0,0 +1,719 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Row load/store helpers (trans_buf rows are 8 int16, `out` rows 32 int16).
+// `prev` is unused here; the trailing ';' is needed by ';'-less call sites.
+#define LOAD_FROM_TRANSPOSED(prev, first, second) \
+    q14s16 = vld1q_s16(trans_buf + (first) * 8); \
+    q13s16 = vld1q_s16(trans_buf + (second) * 8);
+#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
+    (qA) = vld1q_s16(out + (first) * 32); \
+    (qB) = vld1q_s16(out + (second) * 32);
+#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
+    vst1q_s16(out + (first) * 32, (qA)); \
+    vst1q_s16(out + (second) * 32, (qB));
+
+// Adds the rounded (>> 6) q6s16..q9s16 to four 8-pixel rows, saturating to
+// 8 bits. Pixels are accessed with u8 intrinsics (no int16_t * punning).
+#define  STORE_COMBINE_CENTER_RESULTS(r10, r9) \
+       __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
+                                      q6s16, q7s16, q8s16, q9s16);
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(
+        uint8_t *p1,      // q6s16 -> row at p1, q7s16 -> row at p1 + stride
+        uint8_t *p2,      // q9s16 -> row at p2, q8s16 -> row at p2 - stride
+        int stride,       // byte distance between two destination rows
+        int16x8_t q6s16,
+        int16x8_t q7s16,
+        int16x8_t q8s16,
+        int16x8_t q9s16) {
+    uint8x8_t d8u8, d9u8, d10u8, d11u8;
+
+    // Fetch the current pixels of the four rows (8 bytes each).
+    d8u8 = vld1_u8(p1);
+    p1 += stride;
+    d11u8 = vld1_u8(p2);
+    p2 -= stride;
+    d9u8 = vld1_u8(p1);
+    d10u8 = vld1_u8(p2);
+
+    // Rounding shift of the residual.
+    q7s16 = vrshrq_n_s16(q7s16, 6);
+    q8s16 = vrshrq_n_s16(q8s16, 6);
+    q9s16 = vrshrq_n_s16(q9s16, 6);
+    q6s16 = vrshrq_n_s16(q6s16, 6);
+
+    // Add the pixels, widened to 16 bits.
+    q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+                                           d9u8));
+    q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                                           d10u8));
+    q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                                           d11u8));
+    q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+                                           d8u8));
+
+    // Saturate to 8 bits and write the rows back (p1 is at its second row).
+    vst1_u8(p1, vqmovun_s16(q7s16));
+    p1 -= stride;
+    vst1_u8(p2, vqmovun_s16(q8s16));
+    p2 += stride;
+    vst1_u8(p1, vqmovun_s16(q6s16));
+    vst1_u8(p2, vqmovun_s16(q9s16));
+}
+
+// Adds the rounded (>> 6) q4s16..q7s16 to four 8-pixel rows, saturating to
+// 8 bits. Pixels are accessed with u8 intrinsics (no int16_t * punning).
+#define  STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
+       __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
+                                      q4s16, q5s16, q6s16, q7s16);
+static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
+        uint8_t *p1,      // q4s16 -> row at p1, q5s16 -> row at p1 + stride
+        uint8_t *p2,      // q7s16 -> row at p2, q6s16 -> row at p2 - stride
+        int stride,       // byte distance between two destination rows
+        int16x8_t q4s16,
+        int16x8_t q5s16,
+        int16x8_t q6s16,
+        int16x8_t q7s16) {
+    uint8x8_t d4u8, d5u8, d6u8, d7u8;
+
+    // Fetch the current pixels of the four rows (8 bytes each).
+    d4u8 = vld1_u8(p1);
+    p1 += stride;
+    d7u8 = vld1_u8(p2);
+    p2 -= stride;
+    d5u8 = vld1_u8(p1);
+    d6u8 = vld1_u8(p2);
+
+    // Rounding shift of the residual.
+    q5s16 = vrshrq_n_s16(q5s16, 6);
+    q6s16 = vrshrq_n_s16(q6s16, 6);
+    q7s16 = vrshrq_n_s16(q7s16, 6);
+    q4s16 = vrshrq_n_s16(q4s16, 6);
+
+    // Add the pixels, widened to 16 bits.
+    q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
+                                           d5u8));
+    q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+                                           d6u8));
+    q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+                                           d7u8));
+    q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
+                                           d4u8));
+
+    // Saturate to 8 bits and write the rows back (p1 is at its second row).
+    vst1_u8(p1, vqmovun_s16(q5s16));
+    p1 -= stride;
+    vst1_u8(p2, vqmovun_s16(q6s16));
+    p2 += stride;
+    vst1_u8(p2, vqmovun_s16(q7s16));
+    vst1_u8(p1, vqmovun_s16(q4s16));
+}
+
+// Applies DO_BUTTERFLY to the q14s16/q13s16 variables of the calling scope.
+#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
+        DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
+// Butterfly rotation of two vectors by a pair of constants. Per lane:
+//   *qAs16 = round((q14s16 * first_const  - q13s16 * second_const) >> 14)
+//   *qBs16 = round((q14s16 * second_const + q13s16 * first_const)  >> 14)
+// Products are combined in 32 bits; narrowing back to 16 bits saturates.
+// The inputs are passed by value, so qAs16/qBs16 may point at the very
+// variables that were passed as q14s16/q13s16.
+static INLINE void DO_BUTTERFLY(
+        int16x8_t q14s16,
+        int16x8_t q13s16,
+        int16_t first_const,
+        int16_t second_const,
+        int16x8_t *qAs16,
+        int16x8_t *qBs16) {
+    // Constants broadcast to every lane.
+    const int16x4_t c1 = vdup_n_s16(first_const);
+    const int16x4_t c2 = vdup_n_s16(second_const);
+    // Low and high halves of both inputs.
+    const int16x4_t x_lo = vget_low_s16(q14s16);
+    const int16x4_t x_hi = vget_high_s16(q14s16);
+    const int16x4_t y_lo = vget_low_s16(q13s16);
+    const int16x4_t y_hi = vget_high_s16(q13s16);
+
+    // q14s16 * first_const - q13s16 * second_const
+    const int32x4_t diff_lo = vsubq_s32(vmull_s16(x_lo, c1),
+                                        vmull_s16(y_lo, c2));
+    const int32x4_t diff_hi = vsubq_s32(vmull_s16(x_hi, c1),
+                                        vmull_s16(y_hi, c2));
+
+    // q14s16 * second_const + q13s16 * first_const
+    const int32x4_t sum_lo = vaddq_s32(vmull_s16(x_lo, c2),
+                                       vmull_s16(y_lo, c1));
+    const int32x4_t sum_hi = vaddq_s32(vmull_s16(x_hi, c2),
+                                       vmull_s16(y_hi, c1));
+
+    // Rounding shift by 14 with saturating narrow, halves recombined.
+    *qAs16 = vcombine_s16(vqrshrn_n_s32(diff_lo, 14),
+                          vqrshrn_n_s32(diff_hi, 14));
+    *qBs16 = vcombine_s16(vqrshrn_n_s32(sum_lo, 14),
+                          vqrshrn_n_s32(sum_hi, 14));
+}
+
+// Transposes an 8-row band of a 32-column int16 matrix into t_buf.
+//
+//   input  first element of the band; consecutive rows are 32 int16 apart.
+//   t_buf  receives 32 vectors of 8 int16 (256 values in total). Vector c
+//          is column c of the band: input[0 * 32 + c] .. input[7 * 32 + c].
+//
+// The band is handled as four 8x8 tiles, each transposed in registers;
+// tile t covers columns 8 * t .. 8 * t + 7 and fills the matching vectors
+// of t_buf.
+static INLINE void idct32_transpose_pair(
+        int16_t *input,
+        int16_t *t_buf) {
+    const int stride = 32;
+    int tile;
+
+    for (tile = 0; tile < 4; tile++, input += 8) {
+        // The eight rows of the current tile.
+        const int16x8_t r0 = vld1q_s16(input + 0 * stride);
+        const int16x8_t r1 = vld1q_s16(input + 1 * stride);
+        const int16x8_t r2 = vld1q_s16(input + 2 * stride);
+        const int16x8_t r3 = vld1q_s16(input + 3 * stride);
+        const int16x8_t r4 = vld1q_s16(input + 4 * stride);
+        const int16x8_t r5 = vld1q_s16(input + 5 * stride);
+        const int16x8_t r6 = vld1q_s16(input + 6 * stride);
+        const int16x8_t r7 = vld1q_s16(input + 7 * stride);
+
+        // Pair row k with row k + 4, low halves and high halves separately
+        // (the assembly version does this with vswp on the d registers):
+        //   lo04 = { r0[0..3], r4[0..3] }    hi04 = { r0[4..7], r4[4..7] }
+        // and likewise for rows 1/5, 2/6 and 3/7.
+        const int16x8_t lo04 = vcombine_s16(vget_low_s16(r0),
+                                            vget_low_s16(r4));
+        const int16x8_t lo15 = vcombine_s16(vget_low_s16(r1),
+                                            vget_low_s16(r5));
+        const int16x8_t lo26 = vcombine_s16(vget_low_s16(r2),
+                                            vget_low_s16(r6));
+        const int16x8_t lo37 = vcombine_s16(vget_low_s16(r3),
+                                            vget_low_s16(r7));
+
+        const int16x8_t hi04 = vcombine_s16(vget_high_s16(r0),
+                                            vget_high_s16(r4));
+        const int16x8_t hi15 = vcombine_s16(vget_high_s16(r1),
+                                            vget_high_s16(r5));
+        const int16x8_t hi26 = vcombine_s16(vget_high_s16(r2),
+                                            vget_high_s16(r6));
+        const int16x8_t hi37 = vcombine_s16(vget_high_s16(r3),
+                                            vget_high_s16(r7));
+
+        // Exchange 32-bit element pairs between rows two apart:
+        //   w0.val[0] = { r0[0,1], r2[0,1], r4[0,1], r6[0,1] }
+        //   w0.val[1] = { r0[2,3], r2[2,3], r4[2,3], r6[2,3] }
+        // w1 does the same for the odd rows; w2/w3 cover lanes 4..7.
+        const int32x4x2_t w0 = vtrnq_s32(vreinterpretq_s32_s16(lo04),
+                                         vreinterpretq_s32_s16(lo26));
+        const int32x4x2_t w1 = vtrnq_s32(vreinterpretq_s32_s16(lo15),
+                                         vreinterpretq_s32_s16(lo37));
+        const int32x4x2_t w2 = vtrnq_s32(vreinterpretq_s32_s16(hi04),
+                                         vreinterpretq_s32_s16(hi26));
+        const int32x4x2_t w3 = vtrnq_s32(vreinterpretq_s32_s16(hi15),
+                                         vreinterpretq_s32_s16(hi37));
+
+        // Exchange 16-bit lanes between even-row and odd-row vectors:
+        //   c01.val[0] = { r0[0], r1[0], ..., r7[0] }   (column 0)
+        //   c01.val[1] = { r0[1], r1[1], ..., r7[1] }   (column 1)
+        // and likewise c23, c45, c67 for columns 2..7.
+        const int16x8x2_t c01 = vtrnq_s16(vreinterpretq_s16_s32(w0.val[0]),
+                                          vreinterpretq_s16_s32(w1.val[0]));
+        const int16x8x2_t c23 = vtrnq_s16(vreinterpretq_s16_s32(w0.val[1]),
+                                          vreinterpretq_s16_s32(w1.val[1]));
+        const int16x8x2_t c45 = vtrnq_s16(vreinterpretq_s16_s32(w2.val[0]),
+                                          vreinterpretq_s16_s32(w3.val[0]));
+        const int16x8x2_t c67 = vtrnq_s16(vreinterpretq_s16_s32(w2.val[1]),
+                                          vreinterpretq_s16_s32(w3.val[1]));
+
+        // Emit the eight columns in order, 8 int16 each.
+        vst1q_s16(t_buf, c01.val[0]);
+        t_buf += 8;
+        vst1q_s16(t_buf, c01.val[1]);
+        t_buf += 8;
+        vst1q_s16(t_buf, c23.val[0]);
+        t_buf += 8;
+        vst1q_s16(t_buf, c23.val[1]);
+        t_buf += 8;
+        vst1q_s16(t_buf, c45.val[0]);
+        t_buf += 8;
+        vst1q_s16(t_buf, c45.val[1]);
+        t_buf += 8;
+        vst1q_s16(t_buf, c67.val[0]);
+        t_buf += 8;
+        vst1q_s16(t_buf, c67.val[1]);
+        t_buf += 8;
+    }
+}
+
+static INLINE void idct32_bands_end_1st_pass(  // last butterflies, pass 1
+        int16_t *out,      // rows are 32 int16 apart; 8 values per row used
+        int16x8_t q2s16,   // combined with row 31 -> rows 0 and 31
+        int16x8_t q3s16,   // combined with row 30 -> rows 1 and 30
+        int16x8_t q6s16,   // stored as row 16
+        int16x8_t q7s16,   // stored as row 17
+        int16x8_t q8s16,   // stored as row 14
+        int16x8_t q9s16,   // stored as row 15
+        int16x8_t q10s16,  // combined with row 13 (stage 7)
+        int16x8_t q11s16,  // combined with row 12 (stage 7)
+        int16x8_t q12s16,  // combined with row 11 (stage 7)
+        int16x8_t q13s16,  // combined with row 10 (stage 7)
+        int16x8_t q14s16,  // combined with row 9 (stage 7)
+        int16x8_t q15s16) {  // combined with row 8 (stage 7)
+    int16x8_t q0s16, q1s16, q4s16, q5s16;
+    // q6..q9 are stored unchanged as rows 16, 17, 14 and 15.
+    STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
+    STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
+    // final stage: rows 0/31 = q2 +/- row 31, rows 1/30 = q3 +/- row 30.
+    LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
+    STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
+    // stage 7: q2,q3 = q10,q11 + rows 13,12;  q4,q5 = q11,q10 - rows 12,13.
+    LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
+    q2s16 = vaddq_s16(q10s16, q1s16);
+    q3s16 = vaddq_s16(q11s16, q0s16);
+    q4s16 = vsubq_s16(q11s16, q0s16);
+    q5s16 = vsubq_s16(q10s16, q1s16);
+    // final stage: rows 12/19 = q4 +/- row 19, rows 13/18 = q5 +/- row 18.
+    LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
+    STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
+    // final stage: rows 2/29 = q2 +/- row 29, rows 3/28 = q3 +/- row 28.
+    LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
+    STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
+    // stage 7: q2,q3 = q12,q13 + rows 11,10;  q4,q5 = q13,q12 - rows 10,11.
+    LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
+    q2s16 = vaddq_s16(q12s16, q1s16);
+    q3s16 = vaddq_s16(q13s16, q0s16);
+    q4s16 = vsubq_s16(q13s16, q0s16);
+    q5s16 = vsubq_s16(q12s16, q1s16);
+    // final stage: rows 10/21 = q4 +/- row 21, rows 11/20 = q5 +/- row 20.
+    LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
+    STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
+    // final stage: rows 4/27 = q2 +/- row 27, rows 5/26 = q3 +/- row 26.
+    LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
+    STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
+    // stage 7: q2,q3 = q14,q15 + rows 9,8;  q4,q5 = q15,q14 - rows 8,9.
+    LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
+    q2s16 = vaddq_s16(q14s16, q1s16);
+    q3s16 = vaddq_s16(q15s16, q0s16);
+    q4s16 = vsubq_s16(q15s16, q0s16);
+    q5s16 = vsubq_s16(q14s16, q1s16);
+    // final stage: rows 8/23 = q4 +/- row 23, rows 9/22 = q5 +/- row 22.
+    LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
+    STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
+    // final stage: rows 6/25 = q2 +/- row 25, rows 7/24 = q3 +/- row 24.
+    LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
+    STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
+    return;
+}
+
+static INLINE void idct32_bands_end_2nd_pass(  // last butterflies, pass 2
+        int16_t *out,      // only read here; rows are 32 int16 apart
+        uint8_t *dest,     // pixels to add into: 8 per row, rows 0..31
+        int stride,        // byte distance between two dest rows
+        int16x8_t q2s16,   // combined with out row 31 -> dest rows 0 and 31
+        int16x8_t q3s16,   // combined with out row 30 -> dest rows 1 and 30
+        int16x8_t q6s16,   // added to dest row 16
+        int16x8_t q7s16,   // added to dest row 17
+        int16x8_t q8s16,   // added to dest row 14
+        int16x8_t q9s16,   // added to dest row 15
+        int16x8_t q10s16,  // combined with out row 13 (stage 7)
+        int16x8_t q11s16,  // combined with out row 12 (stage 7)
+        int16x8_t q12s16,  // combined with out row 11 (stage 7)
+        int16x8_t q13s16,  // combined with out row 10 (stage 7)
+        int16x8_t q14s16,  // combined with out row 9 (stage 7)
+        int16x8_t q15s16) {  // combined with out row 8 (stage 7)
+    uint8_t *r6  = dest + 31 * stride;  // last row, walks upwards
+    uint8_t *r7  = dest/* +  0 * stride*/;  // first row, walks downwards
+    uint8_t *r9  = dest + 15 * stride;  // row 15, walks upwards
+    uint8_t *r10 = dest + 16 * stride;  // row 16, walks downwards
+    int str2 = stride << 1;  // each store helper covers two rows per pointer
+    int16x8_t q0s16, q1s16, q4s16, q5s16;
+    // q6..q9 go to dest rows 16, 17, 14 and 15 as they are.
+    STORE_COMBINE_CENTER_RESULTS(r10, r9);
+    r10 += str2; r9 -= str2;
+    // dest rows 0/31 get q2 +/- out row 31, rows 1/30 get q3 +/- out row 30.
+    LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+    r7 += str2; r6 -= str2;
+    // stage 7: q2,q3 = q10,q11 + rows 13,12;  q4,q5 = q11,q10 - rows 12,13.
+    LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
+    q2s16 = vaddq_s16(q10s16, q1s16);
+    q3s16 = vaddq_s16(q11s16, q0s16);
+    q4s16 = vsubq_s16(q11s16, q0s16);
+    q5s16 = vsubq_s16(q10s16, q1s16);
+    // dest rows 12/19 get q4 +/- out row 19, rows 13/18 get q5 +/- row 18.
+    LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_COMBINE_CENTER_RESULTS(r10, r9);
+    r10 += str2; r9 -= str2;
+    // dest rows 2/29 get q2 +/- out row 29, rows 3/28 get q3 +/- row 28.
+    LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+    r7 += str2; r6 -= str2;
+    // stage 7: q2,q3 = q12,q13 + rows 11,10;  q4,q5 = q13,q12 - rows 10,11.
+    LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
+    q2s16 = vaddq_s16(q12s16, q1s16);
+    q3s16 = vaddq_s16(q13s16, q0s16);
+    q4s16 = vsubq_s16(q13s16, q0s16);
+    q5s16 = vsubq_s16(q12s16, q1s16);
+    // dest rows 10/21 get q4 +/- out row 21, rows 11/20 get q5 +/- row 20.
+    LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_COMBINE_CENTER_RESULTS(r10, r9);
+    r10 += str2; r9 -= str2;
+    // dest rows 4/27 get q2 +/- out row 27, rows 5/26 get q3 +/- row 26.
+    LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+    r7 += str2; r6 -= str2;
+    // stage 7: q2,q3 = q14,q15 + rows 9,8;  q4,q5 = q15,q14 - rows 8,9.
+    LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
+    q2s16 = vaddq_s16(q14s16, q1s16);
+    q3s16 = vaddq_s16(q15s16, q0s16);
+    q4s16 = vsubq_s16(q15s16, q0s16);
+    q5s16 = vsubq_s16(q14s16, q1s16);
+    // dest rows 8/23 get q4 +/- out row 23, rows 9/22 get q5 +/- row 22.
+    LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
+    q8s16 = vaddq_s16(q4s16, q1s16);
+    q9s16 = vaddq_s16(q5s16, q0s16);
+    q6s16 = vsubq_s16(q5s16, q0s16);
+    q7s16 = vsubq_s16(q4s16, q1s16);
+    STORE_COMBINE_CENTER_RESULTS(r10, r9);
+    // dest rows 6/25 get q2 +/- out row 25, rows 7/24 get q3 +/- row 24.
+    LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
+    q4s16 = vaddq_s16(q2s16, q1s16);
+    q5s16 = vaddq_s16(q3s16, q0s16);
+    q6s16 = vsubq_s16(q3s16, q0s16);
+    q7s16 = vsubq_s16(q2s16, q1s16);
+    STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+    return;
+}
+
+void vpx_idct32x32_1024_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int stride) {
+    int i, idct32_pass_loop;
+    int16_t trans_buf[32 * 8];
+    int16_t pass1[32 * 32];
+    int16_t pass2[32 * 32];
+    int16_t *out;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+
+    for (idct32_pass_loop = 0, out = pass1;
+         idct32_pass_loop < 2;
+         idct32_pass_loop++,
+         input = pass1,  // the input of pass2 is the result of pass1
+         out = pass2) {
+        for (i = 0;
+             i < 4; i++,
+             input += 32 * 8, out += 8) {  // idct32_bands_loop
+            idct32_transpose_pair(input, trans_buf);
+
+            // -----------------------------------------
+            // BLOCK A: 16-19,28-31
+            // -----------------------------------------
+            // generate 16,17,30,31
+            // part of stage 1
+            LOAD_FROM_TRANSPOSED(0, 1, 31)
+            DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(31, 17, 15)
+            DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
+            // part of stage 2
+            q4s16 = vaddq_s16(q0s16, q1s16);
+            q13s16 = vsubq_s16(q0s16, q1s16);
+            q6s16 = vaddq_s16(q2s16, q3s16);
+            q14s16 = vsubq_s16(q2s16, q3s16);
+            // part of stage 3
+            DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
+
+            // generate 18,19,28,29
+            // part of stage 1
+            LOAD_FROM_TRANSPOSED(15, 9, 23)
+            DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(23, 25, 7)
+            DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
+            // part of stage 2
+            q13s16 = vsubq_s16(q3s16, q2s16);
+            q3s16 = vaddq_s16(q3s16, q2s16);
+            q14s16 = vsubq_s16(q1s16, q0s16);
+            q2s16 = vaddq_s16(q1s16, q0s16);
+            // part of stage 3
+            DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
+            // part of stage 4
+            q8s16 = vaddq_s16(q4s16, q2s16);
+            q9s16 = vaddq_s16(q5s16, q0s16);
+            q10s16 = vaddq_s16(q7s16, q1s16);
+            q15s16 = vaddq_s16(q6s16, q3s16);
+            q13s16 = vsubq_s16(q5s16, q0s16);
+            q14s16 = vsubq_s16(q7s16, q1s16);
+            STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
+            STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
+            // part of stage 5
+            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
+            STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
+            // part of stage 4
+            q13s16 = vsubq_s16(q4s16, q2s16);
+            q14s16 = vsubq_s16(q6s16, q3s16);
+            // part of stage 5
+            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
+            STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
+
+            // -----------------------------------------
+            // BLOCK B: 20-23,24-27
+            // -----------------------------------------
+            // generate 20,21,26,27
+            // part of stage 1
+            LOAD_FROM_TRANSPOSED(7, 5, 27)
+            DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(27, 21, 11)
+            DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
+            // part of stage 2
+            q13s16 = vsubq_s16(q0s16, q1s16);
+            q0s16 = vaddq_s16(q0s16, q1s16);
+            q14s16 = vsubq_s16(q2s16, q3s16);
+            q2s16 = vaddq_s16(q2s16, q3s16);
+            // part of stage 3
+            DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+
+            // generate 22,23,24,25
+            // part of stage 1
+            LOAD_FROM_TRANSPOSED(11, 13, 19)
+            DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
+            LOAD_FROM_TRANSPOSED(19, 29, 3)
+            DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
+            // part of stage 2
+            q14s16 = vsubq_s16(q4s16, q5s16);
+            q5s16  = vaddq_s16(q4s16, q5s16);
+            q13s16 = vsubq_s16(q6s16, q7s16);
+            q6s16  = vaddq_s16(q6s16, q7s16);
+            // part of stage 3
+            DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
+            // part of stage 4
+            q10s16 = vaddq_s16(q7s16, q1s16);
+            q11s16 = vaddq_s16(q5s16, q0s16);
+            q12s16 = vaddq_s16(q6s16, q2s16);
+            q15s16 = vaddq_s16(q4s16, q3s16);
+            // part of stage 6
+            LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
+            q8s16 = vaddq_s16(q14s16, q11s16);
+            q9s16 = vaddq_s16(q13s16, q10s16);
+            q13s16 = vsubq_s16(q13s16, q10s16);
+            q11s16 = vsubq_s16(q14s16, q11s16);
+            STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
+            LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
+            q8s16  = vsubq_s16(q9s16, q12s16);
+            q10s16 = vaddq_s16(q14s16, q15s16);
+            q14s16 = vsubq_s16(q14s16, q15s16);
+            q12s16 = vaddq_s16(q9s16, q12s16);
+            STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
+            // part of stage 7
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+            STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
+            q13s16 = q11s16;
+            q14s16 = q8s16;
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+            STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
+            // part of stage 4
+            q14s16 = vsubq_s16(q5s16, q0s16);
+            q13s16 = vsubq_s16(q6s16, q2s16);
+            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
+            q14s16 = vsubq_s16(q7s16, q1s16);
+            q13s16 = vsubq_s16(q4s16, q3s16);
+            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
+            // part of stage 6
+            LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
+            q8s16 = vaddq_s16(q14s16, q1s16);
+            q9s16 = vaddq_s16(q13s16, q6s16);
+            q13s16 = vsubq_s16(q13s16, q6s16);
+            q1s16 = vsubq_s16(q14s16, q1s16);
+            STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
+            LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
+            q14s16 = vsubq_s16(q8s16, q5s16);
+            q10s16 = vaddq_s16(q8s16, q5s16);
+            q11s16 = vaddq_s16(q9s16, q0s16);
+            q0s16 = vsubq_s16(q9s16, q0s16);
+            STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
+            // part of stage 7
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+            STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
+            DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
+                                                         &q1s16, &q0s16);
+            STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
+
+            // -----------------------------------------
+            // BLOCK C: 8-10,11-15
+            // -----------------------------------------
+            // generate 8,9,14,15
+            // part of stage 2
+            LOAD_FROM_TRANSPOSED(3, 2, 30)
+            DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(30, 18, 14)
+            DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
+            // part of stage 3
+            q13s16 = vsubq_s16(q0s16, q1s16);
+            q0s16 = vaddq_s16(q0s16, q1s16);
+            q14s16 = vsubq_s16(q2s16, q3s16);
+            q2s16 = vaddq_s16(q2s16, q3s16);
+            // part of stage 4
+            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
+
+            // generate 10,11,12,13
+            // part of stage 2
+            LOAD_FROM_TRANSPOSED(14, 10, 22)
+            DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
+            LOAD_FROM_TRANSPOSED(22, 26, 6)
+            DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
+            // part of stage 3
+            q14s16 = vsubq_s16(q4s16, q5s16);
+            q5s16 = vaddq_s16(q4s16, q5s16);
+            q13s16 = vsubq_s16(q6s16, q7s16);
+            q6s16 = vaddq_s16(q6s16, q7s16);
+            // part of stage 4
+            DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
+            // part of stage 5
+            q8s16 = vaddq_s16(q0s16, q5s16);
+            q9s16 = vaddq_s16(q1s16, q7s16);
+            q13s16 = vsubq_s16(q1s16, q7s16);
+            q14s16 = vsubq_s16(q3s16, q4s16);
+            q10s16 = vaddq_s16(q3s16, q4s16);
+            q15s16 = vaddq_s16(q2s16, q6s16);
+            STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
+            STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
+            // part of stage 6
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+            STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
+            q13s16 = vsubq_s16(q0s16, q5s16);
+            q14s16 = vsubq_s16(q2s16, q6s16);
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+            STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
+
+            // -----------------------------------------
+            // BLOCK D: 0-3,4-7
+            // -----------------------------------------
+            // generate 4,5,6,7
+            // part of stage 3
+            LOAD_FROM_TRANSPOSED(6, 4, 28)
+            DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
+            LOAD_FROM_TRANSPOSED(28, 20, 12)
+            DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+            // part of stage 4
+            q13s16 = vsubq_s16(q0s16, q1s16);
+            q0s16 = vaddq_s16(q0s16, q1s16);
+            q14s16 = vsubq_s16(q2s16, q3s16);
+            q2s16 = vaddq_s16(q2s16, q3s16);
+            // part of stage 5
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+
+            // generate 0,1,2,3
+            // part of stage 4
+            LOAD_FROM_TRANSPOSED(12, 0, 16)
+            DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
+            LOAD_FROM_TRANSPOSED(16, 8, 24)
+            DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
+            // part of stage 5
+            q4s16 = vaddq_s16(q7s16, q6s16);
+            q7s16 = vsubq_s16(q7s16, q6s16);
+            q6s16 = vsubq_s16(q5s16, q14s16);
+            q5s16 = vaddq_s16(q5s16, q14s16);
+            // part of stage 6
+            q8s16 = vaddq_s16(q4s16, q2s16);
+            q9s16 = vaddq_s16(q5s16, q3s16);
+            q10s16 = vaddq_s16(q6s16, q1s16);
+            q11s16 = vaddq_s16(q7s16, q0s16);
+            q12s16 = vsubq_s16(q7s16, q0s16);
+            q13s16 = vsubq_s16(q6s16, q1s16);
+            q14s16 = vsubq_s16(q5s16, q3s16);
+            q15s16 = vsubq_s16(q4s16, q2s16);
+            // part of stage 7
+            LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
+            q2s16 = vaddq_s16(q8s16, q1s16);
+            q3s16 = vaddq_s16(q9s16, q0s16);
+            q4s16 = vsubq_s16(q9s16, q0s16);
+            q5s16 = vsubq_s16(q8s16, q1s16);
+            LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
+            q8s16 = vaddq_s16(q4s16, q1s16);
+            q9s16 = vaddq_s16(q5s16, q0s16);
+            q6s16 = vsubq_s16(q5s16, q0s16);
+            q7s16 = vsubq_s16(q4s16, q1s16);
+
+            if (idct32_pass_loop == 0) {
+                idct32_bands_end_1st_pass(out,
+                         q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+                         q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+            } else {
+                idct32_bands_end_2nd_pass(out, dest, stride,
+                         q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+                         q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+                dest += 8;
+            }
+        }
+    }
+    return;
+}
diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
new file mode 100644
index 0000000000..adab715dde
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
@@ -0,0 +1,68 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_idct4x4_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+; DC-only 4x4 inverse transform: adds one value derived from input[0] to dest.
+;
+; r0  int16_t *input     (only input[0] is read)
+; r1  uint8_t *dest      (four rows of four bytes, updated in place)
+; r2  int dest_stride    (byte step between dest rows)
+
+|vpx_idct4x4_1_add_neon| PROC
+    ldrsh            r0, [r0]                  ; r0 = input[0], sign-extended
+
+    ; generate cospi_16_64 = 11585 (0x2d41) in r12
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest, r12 is the store pointer
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 4)
+    add              r0, r0, #8                ; + (1 <<((4) - 1))
+    asr              r0, r0, #4                ; >> 4
+
+    vdup.s16         q0, r0                    ; duplicate a1 into every lane of q0
+
+    vld1.32          {d2[0]}, [r1], r2         ; d2 = dest rows 0 and 1
+    vld1.32          {d2[1]}, [r1], r2
+    vld1.32          {d4[0]}, [r1], r2         ; d4 = dest rows 2 and 3
+    vld1.32          {d4[1]}, [r1]             ; last row, no post-increment
+
+    vaddw.u8         q8, q0, d2                ; dest[x] + a1 (rows 0 and 1)
+    vaddw.u8         q9, q0, d4                ; dest[x] + a1 (rows 2 and 3)
+
+    vqmovun.s16      d6, q8                    ; clip_pixel (saturate to 8 bits)
+    vqmovun.s16      d7, q9
+
+    vst1.32          {d6[0]}, [r12], r2        ; write rows 0..3 back through r12
+    vst1.32          {d6[1]}, [r12], r2
+    vst1.32          {d7[0]}, [r12], r2
+    vst1.32          {d7[1]}, [r12]            ; last row, no post-increment
+
+    bx               lr
+    ENDP             ; |vpx_idct4x4_1_add_neon|
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
new file mode 100644
index 0000000000..ea618700c9
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -0,0 +1,50 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+// DC-only 4x4 inverse transform: only input[0] is read, and the single
+// rounded value derived from it is added, saturating to 8 bits, to each
+// pixel of the 4x4 block at dest (rows are dest_stride bytes apart).
+void vpx_idct4x4_1_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    const int16_t cospi_16_64 = 11585;
+    uint8_t *src = dest, *dst = dest;
+    uint32x2_t rows_u32 = vdup_n_u32(0);
+    uint16x8_t dc_u16, sum_u16;
+    uint8x8_t out_u8;
+    int16_t dc, rows_left;
+
+    // Two constant multiplies with rounding, then the final shift by 4.
+    dc = dct_const_round_shift(input[0] * cospi_16_64);
+    dc = dct_const_round_shift(dc * cospi_16_64);
+    dc = ROUND_POWER_OF_TWO(dc, 4);
+    dc_u16 = vreinterpretq_u16_s16(vdupq_n_s16(dc));
+
+    // Each pass handles two 4-byte rows, one per 32-bit lane.
+    for (rows_left = 4; rows_left > 0; rows_left -= 2) {
+        rows_u32 = vld1_lane_u32((const uint32_t *)src, rows_u32, 0);
+        src += dest_stride;
+        rows_u32 = vld1_lane_u32((const uint32_t *)src, rows_u32, 1);
+        src += dest_stride;
+        sum_u16 = vaddw_u8(dc_u16, vreinterpret_u8_u32(rows_u32));
+        out_u8 = vqmovun_s16(vreinterpretq_s16_u16(sum_u16));
+
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(out_u8), 0);
+        dst += dest_stride;
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(out_u8), 1);
+        dst += dest_stride;
+    }
+}
diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
new file mode 100644
index 0000000000..877fbd6343
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -0,0 +1,190 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_idct4x4_16_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    AREA     Block, CODE, READONLY ; name this block of code
+;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+; Full 4x4 inverse transform of the 16 coefficients at input, added into dest.
+; r0  int16_t *input     (reused below to hold cospi_8_64 after the load)
+; r1  uint8_t *dest      (four rows of four bytes, updated in place)
+; r2  int dest_stride    (byte step between rows; negated before the stores)
+
+|vpx_idct4x4_16_add_neon| PROC
+
+    ; The 2D transform is done with two passes which are actually pretty
+    ; similar. We first transform the rows. This is done by transposing
+    ; the inputs, doing an SIMD column transform (the columns are the
+    ; transposed rows) and then transpose the results (so that it goes back
+    ; in normal/row positions). Then, we transform the columns by doing
+    ; another SIMD column transform.
+    ; So, two passes of a transpose followed by a column transform.
+
+    ; load the inputs into q8-q9, d16-d19
+    vld1.s16        {q8,q9}, [r0]!
+
+    ; generate scalar constants
+    ; cospi_8_64 = 15137 = 0x3b21
+    mov             r0, #0x3b00
+    add             r0, #0x21
+    ; cospi_16_64 = 11585 = 0x2d41
+    mov             r3, #0x2d00
+    add             r3, #0x41
+    ; cospi_24_64 = 6270 = 0x187e
+    mov             r12, #0x1800
+    add             r12, #0x7e
+
+    ; transpose the input data
+    ; 00 01 02 03   d16
+    ; 10 11 12 13   d17
+    ; 20 21 22 23   d18
+    ; 30 31 32 33   d19
+    vtrn.16         d16, d17
+    vtrn.16         d18, d19
+
+    ; generate constant vectors
+    vdup.16         d20, r0         ; replicate cospi_8_64
+    vdup.16         d21, r3         ; replicate cospi_16_64
+
+    ; 00 10 02 12   d16
+    ; 01 11 03 13   d17
+    ; 20 30 22 32   d18
+    ; 21 31 23 33   d19
+    vtrn.32         q8, q9
+    ; 00 10 20 30   d16
+    ; 01 11 21 31   d17
+    ; 02 12 22 32   d18
+    ; 03 13 23 33   d19
+
+    vdup.16         d22, r12        ; replicate cospi_24_64
+
+    ; do the transform on transposed rows
+
+    ; stage 1
+    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
+    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
+
+    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
+    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
+
+    ; (input[0] + input[2]) * cospi_16_64;
+    ; (input[0] - input[2]) * cospi_16_64;
+    vmull.s16 q13, d23, d21
+    vmull.s16 q14, d24, d21
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
+    vmlsl.s16 q15, d19, d20
+    vmlal.s16 q1,  d19, d22
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14       ; step[0]
+    vqrshrn.s32 d27, q14, #14       ; step[1]
+    vqrshrn.s32 d29, q15, #14       ; step[2]
+    vqrshrn.s32 d28, q1,  #14       ; step[3]
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16 q8,  q13, q14          ; d16 = output[0], d17 = output[1]
+    vsub.s16 q9,  q13, q14          ; d18 = output[3], d19 = output[2]
+    vswp     d18, d19               ; d18 = output[2], d19 = output[3]
+
+    ; transpose the results
+    ; 00 01 02 03   d16
+    ; 10 11 12 13   d17
+    ; 20 21 22 23   d18
+    ; 30 31 32 33   d19
+    vtrn.16         d16, d17
+    vtrn.16         d18, d19
+    ; 00 10 02 12   d16
+    ; 01 11 03 13   d17
+    ; 20 30 22 32   d18
+    ; 21 31 23 33   d19
+    vtrn.32         q8, q9
+    ; 00 10 20 30   d16
+    ; 01 11 21 31   d17
+    ; 02 12 22 32   d18
+    ; 03 13 23 33   d19
+
+    ; do the transform on columns
+
+    ; stage 1
+    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
+    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
+
+    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
+    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
+
+    ; (input[0] + input[2]) * cospi_16_64;
+    ; (input[0] - input[2]) * cospi_16_64;
+    vmull.s16 q13, d23, d21
+    vmull.s16 q14, d24, d21
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
+    vmlsl.s16 q15, d19, d20
+    vmlal.s16 q1,  d19, d22
+
+    ; dct_const_round_shift
+    vqrshrn.s32 d26, q13, #14       ; step[0]
+    vqrshrn.s32 d27, q14, #14       ; step[1]
+    vqrshrn.s32 d29, q15, #14       ; step[2]
+    vqrshrn.s32 d28, q1,  #14       ; step[3]
+
+    ; stage 2
+    ; output[0] = step[0] + step[3];
+    ; output[1] = step[1] + step[2];
+    ; output[3] = step[0] - step[3];
+    ; output[2] = step[1] - step[2];
+    vadd.s16 q8,  q13, q14          ; d16 = output[0], d17 = output[1]
+    vsub.s16 q9,  q13, q14          ; d18 = output[3], d19 = output[2] (no vswp)
+
+    ; The results are in two registers, one of them being swapped. This will
+    ; be taken care of by loading the 'dest' value in a swapped fashion and
+    ; also storing them in the same swapped fashion.
+    ; temp_out[0, 1] = d16, d17 = q8
+    ; temp_out[2, 3] = d19, d18 = q9 swapped
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+    vrshr.s16 q8, q8, #4
+    vrshr.s16 q9, q9, #4
+
+    vld1.32 {d26[0]}, [r1], r2      ; dest row 0
+    vld1.32 {d26[1]}, [r1], r2      ; dest row 1
+    vld1.32 {d27[1]}, [r1], r2      ; dest row 2 (high lane, matches d19)
+    vld1.32 {d27[0]}, [r1]  ; no post-increment (dest row 3, r1 stays on it)
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
+    vaddw.u8 q8, q8, d26
+    vaddw.u8 q9, q9, d27
+
+    ; clip_pixel
+    vqmovun.s16 d26, q8
+    vqmovun.s16 d27, q9
+
+    ; do the stores in reverse order with negative post-increment, by changing
+    ; the sign of the stride
+    rsb r2, r2, #0
+    vst1.32 {d27[0]}, [r1], r2      ; dest row 3
+    vst1.32 {d27[1]}, [r1], r2      ; dest row 2
+    vst1.32 {d26[1]}, [r1], r2      ; dest row 1
+    vst1.32 {d26[0]}, [r1]  ; no post-increment (dest row 0)
+    bx              lr
+    ENDP  ; |vpx_idct4x4_16_add_neon|
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
new file mode 100644
index 0000000000..3c975c99b7
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vpx_idct4x4_16_add_neon(  // 4x4 inverse transform of input, added to dest
+        int16_t *input,        // 16 coefficients, loaded as two 8-lane vectors
+        uint8_t *dest,         // 4 rows of 4 pixels, updated in place
+        int dest_stride) {     // byte distance between consecutive dest rows
+    uint8x8_t d26u8, d27u8;
+    uint32x2_t d26u32, d27u32;
+    uint16x8_t q8u16, q9u16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
+    int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
+    int16x8_t q8s16, q9s16, q13s16, q14s16;
+    int32x4_t q1s32, q13s32, q14s32, q15s32;
+    int16x4x2_t d0x2s16, d1x2s16;
+    int32x4x2_t q0x2s32;
+    uint8_t *d;
+    int16_t cospi_8_64 = 15137;
+    int16_t cospi_16_64 = 11585;
+    int16_t cospi_24_64 = 6270;
+
+    d26u32 = d27u32 = vdup_n_u32(0);  // both lanes are filled by the dest loads
+
+    q8s16 = vld1q_s16(input);      // coefficients 0..7
+    q9s16 = vld1q_s16(input + 8);  // coefficients 8..15
+
+    d16s16 = vget_low_s16(q8s16);
+    d17s16 = vget_high_s16(q8s16);
+    d18s16 = vget_low_s16(q9s16);
+    d19s16 = vget_high_s16(q9s16);
+
+    // transpose the 4x4 input, step 1: 16-bit vtrn within each register pair
+    d0x2s16 = vtrn_s16(d16s16, d17s16);
+    d1x2s16 = vtrn_s16(d18s16, d19s16);
+    q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+    q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+    d20s16 = vdup_n_s16(cospi_8_64);
+    d21s16 = vdup_n_s16(cospi_16_64);
+
+    // transpose, step 2: 32-bit vtrn across the two combined registers
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+                        vreinterpretq_s32_s16(q9s16));
+    d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+    d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+    d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+    d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+    d22s16 = vdup_n_s16(cospi_24_64);
+
+    // first pass, stage 1
+    d23s16 = vadd_s16(d16s16, d18s16);
+    d24s16 = vsub_s16(d16s16, d18s16);
+
+    q15s32 = vmull_s16(d17s16, d22s16);  // d17 * cospi_24_64
+    q1s32  = vmull_s16(d17s16, d20s16);  // d17 * cospi_8_64
+    q13s32 = vmull_s16(d23s16, d21s16);  // (d16 + d18) * cospi_16_64
+    q14s32 = vmull_s16(d24s16, d21s16);  // (d16 - d18) * cospi_16_64
+
+    q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);  // ... - d19 * cospi_8_64
+    q1s32  = vmlal_s16(q1s32,  d19s16, d22s16);  // ... + d19 * cospi_24_64
+
+    // rounding right shift by 14 with saturating narrow to 16 bits
+    d26s16 = vqrshrn_n_s32(q13s32, 14);
+    d27s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d28s16 = vqrshrn_n_s32(q1s32,  14);
+    q13s16 = vcombine_s16(d26s16, d27s16);
+    q14s16 = vcombine_s16(d28s16, d29s16);
+
+    // first pass, stage 2
+    q8s16 = vaddq_s16(q13s16, q14s16);
+    q9s16 = vsubq_s16(q13s16, q14s16);
+
+    d16s16 = vget_low_s16(q8s16);
+    d17s16 = vget_high_s16(q8s16);
+    d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
+    d19s16 = vget_low_s16(q9s16);
+
+    // transpose the first-pass results, same two vtrn steps as for the input
+    d0x2s16 = vtrn_s16(d16s16, d17s16);
+    d1x2s16 = vtrn_s16(d18s16, d19s16);
+    q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+    q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+                        vreinterpretq_s32_s16(q9s16));
+    d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+    d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+    d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+    d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+    // do the transform on columns
+    // stage 1
+    d23s16 = vadd_s16(d16s16, d18s16);
+    d24s16 = vsub_s16(d16s16, d18s16);
+
+    q15s32 = vmull_s16(d17s16, d22s16);  // d17 * cospi_24_64
+    q1s32  = vmull_s16(d17s16, d20s16);  // d17 * cospi_8_64
+    q13s32 = vmull_s16(d23s16, d21s16);  // (d16 + d18) * cospi_16_64
+    q14s32 = vmull_s16(d24s16, d21s16);  // (d16 - d18) * cospi_16_64
+
+    q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);  // ... - d19 * cospi_8_64
+    q1s32  = vmlal_s16(q1s32,  d19s16, d22s16);  // ... + d19 * cospi_24_64
+
+    // rounding right shift by 14 with saturating narrow to 16 bits
+    d26s16 = vqrshrn_n_s32(q13s32, 14);
+    d27s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d28s16 = vqrshrn_n_s32(q1s32,  14);
+    q13s16 = vcombine_s16(d26s16, d27s16);
+    q14s16 = vcombine_s16(d28s16, d29s16);
+
+    // stage 2; q9s16 is not swapped this time, so the d27 lanes used for the
+    // dest loads and stores below are taken in the order 1, 0
+    q8s16 = vaddq_s16(q13s16, q14s16);
+    q9s16 = vsubq_s16(q13s16, q14s16);
+
+    q8s16 = vrshrq_n_s16(q8s16, 4);  // rounding right shift by 4
+    q9s16 = vrshrq_n_s16(q9s16, 4);
+
+    d = dest;
+    d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);  // dest row 0
+    d += dest_stride;
+    d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);  // dest row 1
+    d += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);  // dest row 2
+    d += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);  // dest row 3
+
+    // widening add of the dest pixels to the rounded transform output
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u32(d26u32));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u32(d27u32));
+
+    // saturating narrow back to unsigned 8-bit pixels
+    d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+    d = dest;
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);  // row 0
+    d += dest_stride;
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);  // row 1
+    d += dest_stride;
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);  // row 2
+    d += dest_stride;
+    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);  // row 3
+    return;
+}
diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
new file mode 100644
index 0000000000..dbbff364f3
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
@@ -0,0 +1,88 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_idct8x8_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+; DC-only 8x8 inverse transform: adds one value derived from input[0] to dest.
+;
+; r0  int16_t *input     (only input[0] is read)
+; r1  uint8_t *dest      (eight rows of eight bytes, updated in place)
+; r2  int dest_stride    (byte step between dest rows)
+
+|vpx_idct8x8_1_add_neon| PROC
+    ldrsh            r0, [r0]                  ; r0 = input[0], sign-extended
+
+    ; generate cospi_16_64 = 11585 (0x2d41) in r12
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest, r12 is the store pointer
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 5)
+    add              r0, r0, #16               ; + (1 <<((5) - 1))
+    asr              r0, r0, #5                ; >> 5
+
+    vdup.s16         q0, r0                    ; duplicate a1 into every lane of q0
+
+    ; load destination data (all eight rows, one d register per row)
+    vld1.64          {d2}, [r1], r2
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r2
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r2
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r2
+    vld1.64          {d17}, [r1]
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1 (row 0)
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1 (row 1)
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1 (row 2)
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1 (row 3)
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r2           ; write rows 0..3 through r12
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1 (row 4)
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1 (row 5)
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1 (row 6)
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1 (row 7)
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r2            ; write rows 4..7 through r12
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2           ; r12 is not used after this
+
+    bx               lr
+    ENDP             ; |vpx_idct8x8_1_add_neon|
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
new file mode 100644
index 0000000000..c1b801fad5
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+// DC-only 8x8 inverse transform.
+//
+// Only input[0] is read. It is scaled twice by cospi_16_64 through
+// dct_const_round_shift and then rounded with ROUND_POWER_OF_TWO(x, 5).
+// The result is added to each of the 64 pixels of dest, saturating to
+// the 8-bit range.
+//
+//   input        coefficient block; only element 0 is used
+//   dest         8x8 pixel block, updated in place
+//   dest_stride  distance in bytes between consecutive rows of dest
+void vpx_idct8x8_1_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    const int16_t cospi_16_64 = 11585;
+    uint8_t *src = dest, *dst = dest;
+    uint64x1_t row_u64[4];
+    uint16x8_t dc_u16, sum_u16[4];
+    uint8x8_t out_u8[4];
+    int16_t dc;
+    int rows_left, r;
+
+    // Two constant multiplies with rounding, then the final shift by 5.
+    dc = dct_const_round_shift(input[0] * cospi_16_64);
+    dc = dct_const_round_shift(dc * cospi_16_64);
+    dc = ROUND_POWER_OF_TWO(dc, 5);
+    dc_u16 = vreinterpretq_u16_s16(vdupq_n_s16(dc));
+
+    // The block is handled as two groups of four 8-byte rows.
+    for (rows_left = 8; rows_left > 0; rows_left -= 4) {
+        // Load the whole group first ...
+        for (r = 0; r < 4; r++) {
+            row_u64[r] = vld1_u64((const uint64_t *)src);
+            src += dest_stride;
+        }
+
+        // ... widen, add the DC value and saturate back to 8 bits ...
+        for (r = 0; r < 4; r++) {
+            sum_u16[r] = vaddw_u8(dc_u16, vreinterpret_u8_u64(row_u64[r]));
+            out_u8[r] = vqmovun_s16(vreinterpretq_s16_u16(sum_u16[r]));
+        }
+
+        // ... and only then write the group back.
+        for (r = 0; r < 4; r++) {
+            vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(out_u8[r]));
+            dst += dest_stride;
+        }
+    }
+}
diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
new file mode 100644
index 0000000000..6ab59b41b7
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
@@ -0,0 +1,519 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_idct8x8_64_add_neon|
+    EXPORT  |vpx_idct8x8_12_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix held in
+    ; q8-q15 (q8 = input[0] ... q15 = input[7]); the output is written back to
+    ; q8-q15. The cosine constants are read from r3-r9, which the caller must
+    ; have set up, and q0-q7 are clobbered as scratch during the calculation.
+    MACRO
+    IDCT8x8_1D
+    ; stage 1
+    vdup.16         d0, r3                    ; duplicate cospi_28_64
+    vdup.16         d1, r4                    ; duplicate cospi_4_64
+    vdup.16         d2, r5                    ; duplicate cospi_12_64
+    vdup.16         d3, r6                    ; duplicate cospi_20_64
+
+    ; input[1] * cospi_28_64
+    vmull.s16       q2, d18, d0
+    vmull.s16       q3, d19, d0
+
+    ; input[5] * cospi_12_64
+    vmull.s16       q5, d26, d2
+    vmull.s16       q6, d27, d2
+
+    ; input[1]*cospi_28_64-input[7]*cospi_4_64
+    vmlsl.s16       q2, d30, d1
+    vmlsl.s16       q3, d31, d1
+
+    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+    vmlsl.s16       q5, d22, d3
+    vmlsl.s16       q6, d23, d3
+
+    ; step1[4] (q4) = dct_const_round_shift(input[1]*cospi_28_64-input[7]*cospi_4_64)
+    vqrshrn.s32     d8, q2, #14               ; >> 14
+    vqrshrn.s32     d9, q3, #14               ; >> 14
+
+    ; step1[5] (q5) = dct_const_round_shift(input[5]*cospi_12_64-input[3]*cospi_20_64)
+    vqrshrn.s32     d10, q5, #14              ; >> 14
+    vqrshrn.s32     d11, q6, #14              ; >> 14
+
+    ; input[1] * cospi_4_64
+    vmull.s16       q2, d18, d1
+    vmull.s16       q3, d19, d1
+
+    ; input[5] * cospi_20_64
+    vmull.s16       q9, d26, d3
+    vmull.s16       q13, d27, d3
+
+    ; input[1]*cospi_4_64+input[7]*cospi_28_64
+    vmlal.s16       q2, d30, d0
+    vmlal.s16       q3, d31, d0
+
+    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+    vmlal.s16       q9, d22, d2
+    vmlal.s16       q13, d23, d2
+
+    ; step1[7] (q7) = dct_const_round_shift(input[1]*cospi_4_64+input[7]*cospi_28_64)
+    vqrshrn.s32     d14, q2, #14              ; >> 14
+    vqrshrn.s32     d15, q3, #14              ; >> 14
+
+    ; stage 2 & stage 3 - even half
+    vdup.16         d0, r7                    ; duplicate cospi_16_64
+
+    ; step1[6] (q6) = dct_const_round_shift(input[5]*cospi_20_64+input[3]*cospi_12_64)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q13, #14              ; >> 14
+
+    ; input[0] * cospi_16_64 (accumulator for the sum)
+    vmull.s16       q2, d16, d0
+    vmull.s16       q3, d17, d0
+
+    ; input[0] * cospi_16_64 (accumulator for the difference)
+    vmull.s16       q13, d16, d0
+    vmull.s16       q15, d17, d0
+
+    ; (input[0] + input[4]) * cospi_16_64    (input[4] is held in q12)
+    vmlal.s16       q2,  d24, d0
+    vmlal.s16       q3, d25, d0
+
+    ; (input[0] - input[4]) * cospi_16_64
+    vmlsl.s16       q13, d24, d0
+    vmlsl.s16       q15, d25, d0
+
+    vdup.16         d0, r8                    ; duplicate cospi_24_64
+    vdup.16         d1, r9                    ; duplicate cospi_8_64
+
+    ; step[0] (q9) = dct_const_round_shift((input[0] + input[4]) * cospi_16_64)
+    vqrshrn.s32     d18, q2, #14              ; >> 14
+    vqrshrn.s32     d19, q3, #14              ; >> 14
+
+    ; step[1] (q11) = dct_const_round_shift((input[0] - input[4]) * cospi_16_64)
+    vqrshrn.s32     d22, q13, #14              ; >> 14
+    vqrshrn.s32     d23, q15, #14              ; >> 14
+
+    ; even-half rotation: q10 holds input[2] and q14 holds input[6]
+    ; input[2] * cospi_24_64
+    vmull.s16       q2, d20, d0
+    vmull.s16       q3, d21, d0
+
+    ; input[2] * cospi_8_64
+    vmull.s16       q8, d20, d1
+    vmull.s16       q12, d21, d1
+
+    ; input[2] * cospi_24_64 - input[6] * cospi_8_64
+    vmlsl.s16       q2, d28, d1
+    vmlsl.s16       q3, d29, d1
+
+    ; input[2] * cospi_8_64 + input[6] * cospi_24_64
+    vmlal.s16       q8, d28, d0
+    vmlal.s16       q12, d29, d0
+
+    ; step[2] (q13) = dct_const_round_shift(input[2]*cospi_24_64-input[6]*cospi_8_64)
+    vqrshrn.s32     d26, q2, #14              ; >> 14
+    vqrshrn.s32     d27, q3, #14              ; >> 14
+
+    ; step[3] (q15) = dct_const_round_shift(input[2]*cospi_8_64+input[6]*cospi_24_64)
+    vqrshrn.s32     d30, q8, #14              ; >> 14
+    vqrshrn.s32     d31, q12, #14              ; >> 14
+
+    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
+    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+    ; stage 3 -odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64 (accumulator for the difference)
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64 (accumulator for the sum)
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; step1[5] (q5) = dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; step1[6] (q6) = dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+    MEND
+
+    ; Transpose an 8x8 16-bit data matrix in place. The rows are held in q8-q15.
+    MACRO
+    TRANSPOSE8X8
+    vswp            d17, d24                  ; swap the upper-right 4x4 quadrant
+    vswp            d23, d30                  ; (high halves of q8-q11) with the
+    vswp            d21, d28                  ; lower-left 4x4 quadrant
+    vswp            d19, d26                  ; (low halves of q12-q15)
+    vtrn.32         q8, q10                   ; transpose 2x2 blocks of 32-bit
+    vtrn.32         q9, q11                   ; element pairs between rows that
+    vtrn.32         q12, q14                  ; are two apart
+    vtrn.32         q13, q15
+    vtrn.16         q8, q9                    ; transpose 2x2 blocks of 16-bit
+    vtrn.16         q10, q11                  ; elements between adjacent rows
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    MEND
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+; 2-D 8x8 inverse DCT; the rounded result is added to dest with 8-bit saturation.
+; r0  int16_t *input     (64 coefficients, read as 8 rows of 8)
+; r1  uint8_t *dest      (8x8 block of pixels, updated in place)
+; r2  int dest_stride    (bytes between consecutive dest rows)
+
+|vpx_idct8x8_64_add_neon| PROC
+    push            {r4-r9}                   ; r4-r9 receive cosine constants below
+    vpush           {d8-d15}                  ; q4-q7 are scratch for the 1-D transform
+    vld1.s16        {q8,q9}, [r0]!            ; rows 0-1
+    vld1.s16        {q10,q11}, [r0]!          ; rows 2-3
+    vld1.s16        {q12,q13}, [r0]!          ; rows 4-5
+    vld1.s16        {q14,q15}, [r0]!          ; rows 6-7
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+
+    ; First transform rows
+    IDCT8x8_1D
+
+    ; Transpose the matrix
+    TRANSPOSE8X8
+
+    ; Then transform columns
+    IDCT8x8_1D
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer (r1 walks the rows for the loads, r0 for the stores)
+    mov             r0, r1
+
+    ; load destination data, one 8-pixel row per d register
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel: saturate the signed 16-bit sums to unsigned 8 bits
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2            ; final post-increment is unused
+
+    vpop            {d8-d15}
+    pop             {r4-r9}
+    bx              lr
+    ENDP  ; |vpx_idct8x8_64_add_neon|
+
+;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+; Reduced 8x8 inverse DCT + add: the first pass only reads input[0..3] per row.
+; r0  int16_t *input     (NOTE(review): presumably sparse, name suggests <= 12 coeffs)
+; r1  uint8_t *dest      (8x8 block of pixels, updated in place)
+; r2  int dest_stride    (bytes between consecutive dest rows)
+
+|vpx_idct8x8_12_add_neon| PROC
+    push            {r4-r9}                   ; r4-r9 receive cosine constants below
+    vpush           {d8-d15}                  ; q4-q7 are used as scratch below
+    vld1.s16        {q8,q9}, [r0]!            ; rows 0-1
+    vld1.s16        {q10,q11}, [r0]!          ; rows 2-3
+    vld1.s16        {q12,q13}, [r0]!          ; rows 4-5
+    vld1.s16        {q14,q15}, [r0]!          ; rows 6-7
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+
+    ; First transform rows. Only q8-q11 (input[0..3]) are read in this pass;
+    ; stage 1: q12-q15 (input[4..7]) are overwritten without being read.
+    ; The following instructions use vqrdmulh to do the
+    ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling
+    ; multiply and shift the result by 16 bits instead of 14 bits. So we need
+    ; to double the constants before multiplying to compensate this.
+    mov             r12, r3, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_28_64*2
+    mov             r12, r4, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
+
+    ; step1[4] (q4) = dct_const_round_shift(input[1] * cospi_28_64)
+    vqrdmulh.s16    q4, q9, q0
+
+    mov             r12, r6, lsl #1
+    rsb             r12, #0
+    vdup.16         q0, r12                   ; duplicate -cospi_20_64*2
+
+    ; step1[7] (q7) = dct_const_round_shift(input[1] * cospi_4_64)
+    vqrdmulh.s16    q7, q9, q1
+
+    mov             r12, r5, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_12_64*2
+
+    ; step1[5] (q5) = dct_const_round_shift(- input[3] * cospi_20_64)
+    vqrdmulh.s16    q5, q11, q0
+
+    mov             r12, r7, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_16_64*2
+
+    ; step1[6] (q6) = dct_const_round_shift(input[3] * cospi_12_64)
+    vqrdmulh.s16    q6, q11, q1
+
+    ; stage 2 & stage 3 - even half
+    mov             r12, r8, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_24_64*2
+
+    ; step[0] = step[1] (q9) = dct_const_round_shift(input[0] * cospi_16_64)
+    vqrdmulh.s16    q9, q8, q0
+
+    mov             r12, r9, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_8_64*2
+
+    ; step[2] (q13) = dct_const_round_shift(input[2] * cospi_24_64)
+    vqrdmulh.s16    q13, q10, q1
+
+    ; step[3] (q15) = dct_const_round_shift(input[2] * cospi_8_64)
+    vqrdmulh.s16    q15, q10, q0
+
+    ; stage 3 -odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+    vadd.s16        q1, q9, q13               ; output[1] = step[1] + step[2]
+    vsub.s16        q2, q9, q13               ; output[2] = step[1] - step[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64 (accumulator for the difference)
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64 (accumulator for the sum)
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; step1[5] (q5) = dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; step1[6] (q6) = dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+
+    ; Transpose the matrix
+    TRANSPOSE8X8
+
+    ; Then transform columns (full 1-D transform, all of q8-q15 are read)
+    IDCT8x8_1D
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer (r1 walks the rows for the loads, r0 for the stores)
+    mov             r0, r1
+
+    ; load destination data, one 8-pixel row per d register
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel: saturate the signed 16-bit sums to unsigned 8 bits
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2            ; final post-increment is unused
+
+    vpop            {d8-d15}
+    pop             {r4-r9}
+    bx              lr
+    ENDP  ; |vpx_idct8x8_12_add_neon|
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
new file mode 100644
index 0000000000..4b2c2a6f83
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
@@ -0,0 +1,540 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+// Transposes, in place, an 8x8 matrix of int16 passed as eight row vectors.
+static INLINE void TRANSPOSE8X8(
+        int16x8_t *q8s16,    // in/out: row 0
+        int16x8_t *q9s16,    // in/out: row 1
+        int16x8_t *q10s16,   // in/out: row 2
+        int16x8_t *q11s16,   // in/out: row 3
+        int16x8_t *q12s16,   // in/out: row 4
+        int16x8_t *q13s16,   // in/out: row 5
+        int16x8_t *q14s16,   // in/out: row 6
+        int16x8_t *q15s16) {  // in/out: row 7
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+    // Split every row into its low and high 4-lane halves.
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+    // Recombine so the upper-right and lower-left 4x4 quadrants trade places.
+    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
+    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
+    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+    *q12s16 = vcombine_s16(d17s16, d25s16);
+    *q13s16 = vcombine_s16(d19s16, d27s16);
+    *q14s16 = vcombine_s16(d21s16, d29s16);
+    *q15s16 = vcombine_s16(d23s16, d31s16);
+    // Transpose 2x2 blocks of 32-bit element pairs between rows two apart.
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+                        vreinterpretq_s32_s16(*q10s16));
+    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+                        vreinterpretq_s32_s16(*q11s16));
+    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+                        vreinterpretq_s32_s16(*q14s16));
+    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+                        vreinterpretq_s32_s16(*q15s16));
+    // Transpose 2x2 blocks of 16-bit elements; trailing notes name the row.
+    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+    // Write the transposed rows back through the caller's pointers.
+    *q8s16  = q0x2s16.val[0];
+    *q9s16  = q0x2s16.val[1];
+    *q10s16 = q1x2s16.val[0];
+    *q11s16 = q1x2s16.val[1];
+    *q12s16 = q2x2s16.val[0];
+    *q13s16 = q2x2s16.val[1];
+    *q14s16 = q3x2s16.val[0];
+    *q15s16 = q3x2s16.val[1];
+    return;
+}
+// 1-D 8-point inverse DCT, computed on all eight lanes of the vectors at once.
+static INLINE void IDCT8x8_1D(
+        int16x8_t *q8s16,    // in: input[0], out: output[0]
+        int16x8_t *q9s16,    // in: input[1], out: output[1]
+        int16x8_t *q10s16,   // in: input[2], out: output[2]
+        int16x8_t *q11s16,   // in: input[3], out: output[3]
+        int16x8_t *q12s16,   // in: input[4], out: output[4]
+        int16x8_t *q13s16,   // in: input[5], out: output[5]
+        int16x8_t *q14s16,   // in: input[6], out: output[6]
+        int16x8_t *q15s16) {  // in: input[7], out: output[7]
+    int16x4_t d0s16, d1s16, d2s16, d3s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+    // Stage 1 (odd half): broadcast the rotation constants.
+    d0s16 = vdup_n_s16(cospi_28_64);
+    d1s16 = vdup_n_s16(cospi_4_64);
+    d2s16 = vdup_n_s16(cospi_12_64);
+    d3s16 = vdup_n_s16(cospi_20_64);
+    // Split each input into 4-lane halves for the widening 16x16->32 multiplies.
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+    // q4s16: step1[4] = input[1] * cospi_28_64 - input[7] * cospi_4_64
+    q2s32 = vmull_s16(d18s16, d0s16);
+    q3s32 = vmull_s16(d19s16, d0s16);
+    q5s32 = vmull_s16(d26s16, d2s16);
+    q6s32 = vmull_s16(d27s16, d2s16);
+    // q5s16: step1[5] = input[5] * cospi_12_64 - input[3] * cospi_20_64
+    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+    q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+    q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+    // Rounding shift right by 14 with saturating narrow back to 16 bits.
+    d8s16 = vqrshrn_n_s32(q2s32, 14);
+    d9s16 = vqrshrn_n_s32(q3s32, 14);
+    d10s16 = vqrshrn_n_s32(q5s32, 14);
+    d11s16 = vqrshrn_n_s32(q6s32, 14);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    // q7s16: step1[7] = input[1] * cospi_4_64 + input[7] * cospi_28_64
+    q2s32 = vmull_s16(d18s16, d1s16);
+    q3s32 = vmull_s16(d19s16, d1s16);
+    q9s32 = vmull_s16(d26s16, d3s16);
+    q13s32 = vmull_s16(d27s16, d3s16);
+    // q6s16: step1[6] = input[5] * cospi_20_64 + input[3] * cospi_12_64
+    q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+    q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+    d14s16 = vqrshrn_n_s32(q2s32, 14);
+    d15s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q13s32, 14);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+    // Stage 2 & 3, even half.
+    d0s16 = vdup_n_s16(cospi_16_64);
+    // q2/q3: (input[0] + input[4]) * c16;  q13/q15: (input[0] - input[4]) * c16
+    q2s32 = vmull_s16(d16s16, d0s16);
+    q3s32 = vmull_s16(d17s16, d0s16);
+    q13s32 = vmull_s16(d16s16, d0s16);
+    q15s32 = vmull_s16(d17s16, d0s16);
+
+    q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+    q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+    q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+    d0s16 = vdup_n_s16(cospi_24_64);
+    d1s16 = vdup_n_s16(cospi_8_64);
+    // step[0] -> *q9s16 (sum), step[1] -> *q11s16 (difference)
+    d18s16 = vqrshrn_n_s32(q2s32, 14);
+    d19s16 = vqrshrn_n_s32(q3s32, 14);
+    d22s16 = vqrshrn_n_s32(q13s32, 14);
+    d23s16 = vqrshrn_n_s32(q15s32, 14);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+    *q11s16 = vcombine_s16(d22s16, d23s16);
+    // *q13s16: step[2] = input[2] * cospi_24_64 - input[6] * cospi_8_64
+    q2s32 = vmull_s16(d20s16, d0s16);
+    q3s32 = vmull_s16(d21s16, d0s16);
+    q8s32 = vmull_s16(d20s16, d1s16);
+    q12s32 = vmull_s16(d21s16, d1s16);
+    // *q15s16: step[3] = input[2] * cospi_8_64 + input[6] * cospi_24_64
+    q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+    q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+    d26s16 = vqrshrn_n_s32(q2s32, 14);
+    d27s16 = vqrshrn_n_s32(q3s32, 14);
+    d30s16 = vqrshrn_n_s32(q8s32, 14);
+    d31s16 = vqrshrn_n_s32(q12s32, 14);
+    *q13s16 = vcombine_s16(d26s16, d27s16);
+    *q15s16 = vcombine_s16(d30s16, d31s16);
+    // Even-half butterflies: q0..q3 = step1[0..3] for stage 4.
+    q0s16 = vaddq_s16(*q9s16, *q15s16);
+    q1s16 = vaddq_s16(*q11s16, *q13s16);
+    q2s16 = vsubq_s16(*q11s16, *q13s16);
+    q3s16 = vsubq_s16(*q9s16, *q15s16);
+    // Stage 2, odd half: butterflies on step1[4..7]; q13/q14 = step2[5]/step2[6].
+    *q13s16 = vsubq_s16(q4s16, q5s16);
+    q4s16 = vaddq_s16(q4s16, q5s16);
+    *q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16 = vaddq_s16(q7s16, q6s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+
+    d16s16 = vdup_n_s16(cospi_16_64);
+    // Stage 3, odd half: q5s16 = step1[5] = (step2[6] - step2[5]) * cospi_16_64
+    q9s32 = vmull_s16(d28s16, d16s16);
+    q10s32 = vmull_s16(d29s16, d16s16);
+    q11s32 = vmull_s16(d28s16, d16s16);
+    q12s32 = vmull_s16(d29s16, d16s16);
+    //                      q6s16 = step1[6] = (step2[5] + step2[6]) * cospi_16_64
+    q9s32 = vmlsl_s16(q9s32,  d26s16, d16s16);
+    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+    d10s16 = vqrshrn_n_s32(q9s32, 14);
+    d11s16 = vqrshrn_n_s32(q10s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q12s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+    // Stage 4: output[k] = step1[k] + step1[7-k], output[7-k] = step1[k] - step1[7-k].
+    *q8s16 = vaddq_s16(q0s16, q7s16);
+    *q9s16 = vaddq_s16(q1s16, q6s16);
+    *q10s16 = vaddq_s16(q2s16, q5s16);
+    *q11s16 = vaddq_s16(q3s16, q4s16);
+    *q12s16 = vsubq_s16(q3s16, q4s16);
+    *q13s16 = vsubq_s16(q2s16, q5s16);
+    *q14s16 = vsubq_s16(q1s16, q6s16);
+    *q15s16 = vsubq_s16(q0s16, q7s16);
+    return;
+}
+// 2-D 8x8 inverse DCT; the rounded result is added to dest with saturation.
+void vpx_idct8x8_64_add_neon(
+        int16_t *input,     // 64 coefficients, read as 8 rows of 8
+        uint8_t *dest,      // 8x8 block of pixels, updated in place
+        int dest_stride) {  // bytes between consecutive dest rows
+    uint8_t *d1, *d2;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8;
+    uint64x1_t d0u64, d1u64, d2u64, d3u64;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+    // Load the eight rows of coefficients.
+    q8s16 = vld1q_s16(input);
+    q9s16 = vld1q_s16(input + 8);
+    q10s16 = vld1q_s16(input + 16);
+    q11s16 = vld1q_s16(input + 24);
+    q12s16 = vld1q_s16(input + 32);
+    q13s16 = vld1q_s16(input + 40);
+    q14s16 = vld1q_s16(input + 48);
+    q15s16 = vld1q_s16(input + 56);
+    // First pass: transpose, then run the 1-D transform.
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+               &q12s16, &q13s16, &q14s16, &q15s16);
+    // Second pass: transpose again and transform the other dimension.
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+               &q12s16, &q13s16, &q14s16, &q15s16);
+    // Final scaling: rounding shift right by 5.
+    q8s16 = vrshrq_n_s16(q8s16, 5);
+    q9s16 = vrshrq_n_s16(q9s16, 5);
+    q10s16 = vrshrq_n_s16(q10s16, 5);
+    q11s16 = vrshrq_n_s16(q11s16, 5);
+    q12s16 = vrshrq_n_s16(q12s16, 5);
+    q13s16 = vrshrq_n_s16(q13s16, 5);
+    q14s16 = vrshrq_n_s16(q14s16, 5);
+    q15s16 = vrshrq_n_s16(q15s16, 5);
+
+    d1 = d2 = dest;  // d1: read cursor, d2: write cursor
+    // Rows 0-3: load four 8-pixel rows of dest.
+    d0u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d1u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d2u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    // Add the zero-extended pixels to the residual; reinterprets keep the bits.
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u64(d0u64));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u64(d1u64));
+    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                      vreinterpret_u8_u64(d2u64));
+    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                      vreinterpret_u8_u64(d3u64));
+    // Saturate the signed 16-bit sums to the unsigned 8-bit pixel range.
+    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+    // Store rows 0-3 back to dest.
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+    // Rows 4-7: move them into the variables used above and repeat the steps.
+    q8s16 = q12s16;
+    q9s16 = q13s16;
+    q10s16 = q14s16;
+    q11s16 = q15s16;
+
+    d0u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d1u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d2u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u64(d0u64));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u64(d1u64));
+    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                      vreinterpret_u8_u64(d2u64));
+    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                      vreinterpret_u8_u64(d3u64));
+
+    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+    return;
+}
+
+void vpx_idct8x8_12_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8_t *d1, *d2;  // d1: load cursor into dest; d2: store cursor into dest
+    uint8x8_t d0u8, d1u8, d2u8, d3u8;
+    int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
+    int16x4_t d26s16, d27s16, d28s16, d29s16;
+    uint64x1_t d0u64, d1u64, d2u64, d3u64;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    // Inverse 8x8 transform added onto dest. Load all 64 input coefficients.
+    q8s16 = vld1q_s16(input);
+    q9s16 = vld1q_s16(input + 8);
+    q10s16 = vld1q_s16(input + 16);
+    q11s16 = vld1q_s16(input + 24);
+    q12s16 = vld1q_s16(input + 32);
+    q13s16 = vld1q_s16(input + 40);
+    q14s16 = vld1q_s16(input + 48);
+    q15s16 = vld1q_s16(input + 56);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+    // Pass 1 reads only q8..q11; q12..q15 are overwritten before being read.
+    // First transform rows
+    // stage 1
+    q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+    q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+    // vqrdmulh by a doubled constant c*2 yields round(x * c / 2^14).
+    q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+
+    q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
+
+    q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+    q1s16 = vdupq_n_s16(cospi_12_64 * 2);
+
+    q5s16 = vqrdmulhq_s16(q11s16, q0s16);
+
+    q0s16 = vdupq_n_s16(cospi_16_64 * 2);
+
+    q6s16 = vqrdmulhq_s16(q11s16, q1s16);
+
+    // stage 2 & stage 3 - even half
+    q1s16 = vdupq_n_s16(cospi_24_64 * 2);
+
+    q9s16 = vqrdmulhq_s16(q8s16, q0s16);
+
+    q0s16 = vdupq_n_s16(cospi_8_64 * 2);
+
+    q13s16 = vqrdmulhq_s16(q10s16, q1s16);
+
+    q15s16 = vqrdmulhq_s16(q10s16, q0s16);
+
+    // stage 3 - butterflies on the even terms (label formerly said "odd half")
+    q0s16 = vaddq_s16(q9s16, q15s16);
+    q1s16 = vaddq_s16(q9s16, q13s16);
+    q2s16 = vsubq_s16(q9s16, q13s16);
+    q3s16 = vsubq_s16(q9s16, q15s16);
+
+    // stage 2 - odd half
+    q13s16 = vsubq_s16(q4s16, q5s16);
+    q4s16 = vaddq_s16(q4s16, q5s16);
+    q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16 = vaddq_s16(q7s16, q6s16);
+    d26s16 = vget_low_s16(q13s16);
+    d27s16 = vget_high_s16(q13s16);
+    d28s16 = vget_low_s16(q14s16);
+    d29s16 = vget_high_s16(q14s16);
+
+    d16s16 = vdup_n_s16(cospi_16_64);
+    q9s32 = vmull_s16(d28s16, d16s16);
+    q10s32 = vmull_s16(d29s16, d16s16);
+    q11s32 = vmull_s16(d28s16, d16s16);
+    q12s32 = vmull_s16(d29s16, d16s16);
+    // q9/q10 = (q14 - q13) * cospi_16_64; q11/q12 = (q14 + q13) * cospi_16_64.
+    q9s32 = vmlsl_s16(q9s32,  d26s16, d16s16);
+    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+    // Rounding right shift by 14 with saturating narrow back to 16 bits.
+    d10s16 = vqrshrn_n_s32(q9s32, 14);
+    d11s16 = vqrshrn_n_s32(q10s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q12s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    // stage 4
+    q8s16 = vaddq_s16(q0s16, q7s16);
+    q9s16 = vaddq_s16(q1s16, q6s16);
+    q10s16 = vaddq_s16(q2s16, q5s16);
+    q11s16 = vaddq_s16(q3s16, q4s16);
+    q12s16 = vsubq_s16(q3s16, q4s16);
+    q13s16 = vsubq_s16(q2s16, q5s16);
+    q14s16 = vsubq_s16(q1s16, q6s16);
+    q15s16 = vsubq_s16(q0s16, q7s16);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+    // Pass 2: IDCT8x8_1D is defined outside this chunk (presumably full 1-D).
+    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+               &q12s16, &q13s16, &q14s16, &q15s16);
+    // Final rounding right shift by 5 on all eight result vectors.
+    q8s16 = vrshrq_n_s16(q8s16, 5);
+    q9s16 = vrshrq_n_s16(q9s16, 5);
+    q10s16 = vrshrq_n_s16(q10s16, 5);
+    q11s16 = vrshrq_n_s16(q11s16, 5);
+    q12s16 = vrshrq_n_s16(q12s16, 5);
+    q13s16 = vrshrq_n_s16(q13s16, 5);
+    q14s16 = vrshrq_n_s16(q14s16, 5);
+    q15s16 = vrshrq_n_s16(q15s16, 5);
+
+    d1 = d2 = dest;
+    // Rows 0..3: load 8 dest pixels per row, add q8..q11, saturate, store back.
+    d0u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d1u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d2u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u64(d0u64));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u64(d1u64));
+    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                      vreinterpret_u8_u64(d2u64));
+    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                      vreinterpret_u8_u64(d3u64));
+
+    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));  // clamp to [0, 255]
+    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+    // Rows 4..7: move q12..q15 into q8..q11 and repeat the add/clamp/store.
+    q8s16 = q12s16;
+    q9s16 = q13s16;
+    q10s16 = q14s16;
+    q11s16 = q15s16;
+
+    d0u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d1u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d2u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((uint64_t *)d1);
+    d1 += dest_stride;
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                     vreinterpret_u8_u64(d0u64));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                     vreinterpret_u8_u64(d1u64));
+    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                      vreinterpret_u8_u64(d2u64));
+    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                      vreinterpret_u8_u64(d3u64));
+
+    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+    return;
+}
diff --git a/libs/libvpx/vpx_dsp/arm/intrapred_neon.c b/libs/libvpx/vpx_dsp/arm/intrapred_neon.c
new file mode 100644
index 0000000000..0a376104d2
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/intrapred_neon.c
@@ -0,0 +1,822 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *above, const uint8_t *left,
+                          int do_above, int do_left) {
+  // Fills a 4x4 block with one DC value: the rounded mean of whichever
+  // edges are enabled, or 0x80 when neither is. Only lane 0 of each sum is
+  // consumed, and a sum is read only when its flag is set.
+  uint16x8_t top_sum, left_sum;
+  uint8x8_t mean, fill;
+  int row;
+
+  if (do_above) {
+    // Pairwise-widen, then pairwise-add: lane 0 = above[0] + ... + above[3].
+    const uint16x4_t pairs = vpaddl_u8(vld1_u8(above));
+    const uint16x4_t quads = vpadd_u16(pairs, pairs);
+    top_sum = vcombine_u16(quads, quads);
+  }
+
+  if (do_left) {
+    // Same reduction on the left column: lane 0 = left[0] + ... + left[3].
+    const uint16x4_t pairs = vpaddl_u8(vld1_u8(left));
+    const uint16x4_t quads = vpadd_u16(pairs, pairs);
+    left_sum = vcombine_u16(quads, quads);
+  }
+
+  if (!do_above && !do_left) {
+    mean = vdup_n_u8(0x80);
+  } else if (!do_left) {
+    mean = vrshrn_n_u16(top_sum, 2);  // 4 samples
+  } else if (!do_above) {
+    mean = vrshrn_n_u16(left_sum, 2);  // 4 samples
+  } else {
+    mean = vrshrn_n_u16(vaddq_u16(left_sum, top_sum), 3);  // 8 samples
+  }
+
+  // Broadcast lane 0 and write four bytes per row.
+  fill = vdup_lane_u8(mean, 0);
+  for (row = 0; row < 4; ++row, dst += stride) {
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(fill), 0);
+  }
+}
+
+void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  dc_4x4(dst, stride, above, left, 1, 1);  // do_above = 1, do_left = 1
+}
+
+void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused; NULL is forwarded in its place
+  dc_4x4(dst, stride, NULL, left, 0, 1);  // do_above = 0, do_left = 1
+}
+
+void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused; NULL is forwarded in its place
+  dc_4x4(dst, stride, above, NULL, 1, 0);  // do_above = 1, do_left = 0
+}
+
+void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;  // neither edge is read
+  (void)left;
+  dc_4x4(dst, stride, NULL, NULL, 0, 0);  // both flags 0: fills with 0x80
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *above, const uint8_t *left,
+                          int do_above, int do_left) {
+  // Fills an 8x8 block with the rounded mean of the enabled edges (0x80 when
+  // neither is enabled). A sum is only read when its flag is set.
+  uint16x8_t sum_top, sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x8_t A = vld1_u8(above);  // top row
+    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    const uint16x4_t p2 = vpadd_u16(p1, p1);  // every lane = sum of all 8
+    sum_top = vcombine_u16(p2, p2);
+  }
+
+  if (do_left) {
+    const uint8x8_t L = vld1_u8(left);  // left border
+    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    const uint16x4_t p2 = vpadd_u16(p1, p1);  // every lane = sum of all 8
+    sum_left = vcombine_u16(p2, p2);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 4);  // 16 samples
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 3);  // 8 samples
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 3);  // 8 samples
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+    int i;
+    // Byte-typed store: no uint32_t* cast, so no alignment is implied for dst.
+    for (i = 0; i < 8; ++i) vst1_u8(dst + i * stride, dc);
+  }
+}
+
+void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  dc_8x8(dst, stride, above, left, 1, 1);  // do_above = 1, do_left = 1
+}
+
+void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused; NULL is forwarded in its place
+  dc_8x8(dst, stride, NULL, left, 0, 1);  // do_above = 0, do_left = 1
+}
+
+void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused; NULL is forwarded in its place
+  dc_8x8(dst, stride, above, NULL, 1, 0);  // do_above = 1, do_left = 0
+}
+
+void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;  // neither edge is read
+  (void)left;
+  dc_8x8(dst, stride, NULL, NULL, 0, 0);  // both flags 0: fills with 0x80
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left,
+                            int do_above, int do_left) {
+  // 16x16 DC fill: rounded mean of the enabled edges, 0x80 when none are.
+  // Each sum is read only if its flag is set; lane 0 carries the total.
+  uint16x8_t top_sum, left_sum;
+  uint8x8_t mean;
+  uint8x16_t fill;
+  int row;
+
+  if (do_above) {
+    const uint16x8_t pairs = vpaddlq_u8(vld1q_u8(above));  // 8 pair sums
+    const uint16x4_t quads = vadd_u16(vget_low_u16(pairs),
+                                      vget_high_u16(pairs));
+    const uint16x4_t halves = vpadd_u16(quads, quads);
+    const uint16x4_t total = vpadd_u16(halves, halves);  // sum of all 16
+    top_sum = vcombine_u16(total, total);
+  }
+
+  if (do_left) {
+    const uint16x8_t pairs = vpaddlq_u8(vld1q_u8(left));  // 8 pair sums
+    const uint16x4_t quads = vadd_u16(vget_low_u16(pairs),
+                                      vget_high_u16(pairs));
+    const uint16x4_t halves = vpadd_u16(quads, quads);
+    const uint16x4_t total = vpadd_u16(halves, halves);  // sum of all 16
+    left_sum = vcombine_u16(total, total);
+  }
+
+  if (!do_above && !do_left) {
+    mean = vdup_n_u8(0x80);
+  } else if (!do_left) {
+    mean = vrshrn_n_u16(top_sum, 4);  // 16 samples
+  } else if (!do_above) {
+    mean = vrshrn_n_u16(left_sum, 4);  // 16 samples
+  } else {
+    mean = vrshrn_n_u16(vaddq_u16(left_sum, top_sum), 5);  // 32 samples
+  }
+
+  // Broadcast lane 0 across a full 16-byte row and write 16 rows.
+  fill = vdupq_lane_u8(mean, 0);
+  for (row = 0; row < 16; ++row, dst += stride) {
+    vst1q_u8(dst, fill);
+  }
+}
+
+void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  dc_16x16(dst, stride, above, left, 1, 1);  // do_above = 1, do_left = 1
+}
+
+void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;  // unused; NULL is forwarded in its place
+  dc_16x16(dst, stride, NULL, left, 0, 1);  // do_above = 0, do_left = 1
+}
+
+void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;  // unused; NULL is forwarded in its place
+  dc_16x16(dst, stride, above, NULL, 1, 0);  // do_above = 1, do_left = 0
+}
+
+void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;  // neither edge is read
+  (void)left;
+  dc_16x16(dst, stride, NULL, NULL, 0, 0);  // both flags 0: fills with 0x80
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left,
+                            int do_above, int do_left) {
+  // 32x32 DC fill: rounded mean of the enabled edges, 0x80 when none are.
+  // Each sum is read only if its flag is set; lane 0 carries the total of
+  // the 32 edge pixels.
+  uint16x8_t top_sum, left_sum;
+  uint8x8_t mean;
+  uint8x16_t fill;
+  int row;
+
+  if (do_above) {
+    const uint16x8_t lo = vpaddlq_u8(vld1q_u8(above));       // above[0..15]
+    const uint16x8_t hi = vpaddlq_u8(vld1q_u8(above + 16));  // above[16..31]
+    const uint16x8_t both = vaddq_u16(lo, hi);
+    const uint16x4_t fold = vadd_u16(vget_low_u16(both),
+                                     vget_high_u16(both));
+    const uint16x4_t halves = vpadd_u16(fold, fold);
+    const uint16x4_t total = vpadd_u16(halves, halves);  // sum of all 32
+    top_sum = vcombine_u16(total, total);
+  }
+
+  if (do_left) {
+    const uint16x8_t lo = vpaddlq_u8(vld1q_u8(left));       // left[0..15]
+    const uint16x8_t hi = vpaddlq_u8(vld1q_u8(left + 16));  // left[16..31]
+    const uint16x8_t both = vaddq_u16(lo, hi);
+    const uint16x4_t fold = vadd_u16(vget_low_u16(both),
+                                     vget_high_u16(both));
+    const uint16x4_t halves = vpadd_u16(fold, fold);
+    const uint16x4_t total = vpadd_u16(halves, halves);  // sum of all 32
+    left_sum = vcombine_u16(total, total);
+  }
+
+  if (!do_above && !do_left) {
+    mean = vdup_n_u8(0x80);
+  } else if (!do_left) {
+    mean = vrshrn_n_u16(top_sum, 5);  // 32 samples
+  } else if (!do_above) {
+    mean = vrshrn_n_u16(left_sum, 5);  // 32 samples
+  } else {
+    mean = vrshrn_n_u16(vaddq_u16(left_sum, top_sum), 6);  // 64 samples
+  }
+
+  // Broadcast lane 0 to 16 bytes; each 32-byte row is written as two
+  // 16-byte halves.
+  fill = vdupq_lane_u8(mean, 0);
+  for (row = 0; row < 32; ++row, dst += stride) {
+    vst1q_u8(dst, fill);
+    vst1q_u8(dst + 16, fill);
+  }
+}
+
+void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  dc_32x32(dst, stride, above, left, 1, 1);  // do_above = 1, do_left = 1
+}
+
+void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;  // unused; NULL is forwarded in its place
+  dc_32x32(dst, stride, NULL, left, 0, 1);  // do_above = 0, do_left = 1
+}
+
+void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;  // unused; NULL is forwarded in its place
+  dc_32x32(dst, stride, above, NULL, 1, 0);  // do_above = 1, do_left = 0
+}
+
+void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;  // neither edge is read
+  (void)left;
+  dc_32x32(dst, stride, NULL, NULL, 0, 0);  // both flags 0: fills with 0x80
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  // Smooth the eight pixels above with vrhadd(vhadd(a[i], a[i+2]), a[i+1]);
+  // the byte-wise shifts feed zeros in past the eighth pixel.
+  const uint8x8_t a0 = vld1_u8(above);  // A B C D E F G H
+  const uint64x1_t a0_bits = vreinterpret_u64_u8(a0);
+  const uint8x8_t a1 = vreinterpret_u8_u64(vshr_n_u64(a0_bits, 8));   // B..H 0
+  const uint8x8_t a2 = vreinterpret_u8_u64(vshr_n_u64(a0_bits, 16));  // C..H 0 0
+  const uint8x8_t smooth = vrhadd_u8(vhadd_u8(a0, a2), a1);
+  const uint64x1_t smooth_bits = vreinterpret_u64_u8(smooth);
+  // Row k is the smoothed vector advanced by k bytes.
+  const uint32x2_t row0 = vreinterpret_u32_u8(smooth);
+  const uint32x2_t row1 = vreinterpret_u32_u64(vshr_n_u64(smooth_bits, 8));
+  const uint32x2_t row2 = vreinterpret_u32_u64(vshr_n_u64(smooth_bits, 16));
+  const uint32x2_t row3 = vreinterpret_u32_u64(vshr_n_u64(smooth_bits, 24));
+  (void)left;
+  vst1_lane_u32((uint32_t *)dst, row0, 0);
+  vst1_lane_u32((uint32_t *)(dst + stride), row1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), row2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), row3, 0);
+  dst[3 * stride + 3] = above[7];  // bottom-right pixel is a copy of above[7]
+}
+
+void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  static const uint8_t kShiftBy1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };  // repeat 7
+  static const uint8_t kShiftBy2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
+  const uint8x8_t idx1 = vld1_u8(kShiftBy1);
+  const uint8x8_t idx2 = vld1_u8(kShiftBy2);
+  const uint8x8_t top = vld1_u8(above);
+  const uint8x8_t top1 = vtbl1_u8(top, idx1);  // top advanced by one lane
+  const uint8x8_t top2 = vtbl1_u8(top, idx2);  // top advanced by two lanes
+  uint8x8_t cur = vrhadd_u8(vhadd_u8(top, top2), top1);
+  int remaining = 7;
+  (void)left;
+  vst1_u8(dst, cur);  // row 0
+  do {  // rows 1..7: each is the previous row advanced by one lane
+    dst += stride;
+    cur = vtbl1_u8(cur, idx1);
+    vst1_u8(dst, cur);
+  } while (--remaining);
+}
+
+void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t top = vld1q_u8(above);
+  const uint8x16_t last = vld1q_dup_u8(above + 15);  // above[15] in every lane
+  const uint8x16_t top1 = vextq_u8(top, last, 1);  // advance 1, pad with last
+  const uint8x16_t top2 = vextq_u8(top, last, 2);  // advance 2, pad with last
+  uint8x16_t cur = vrhaddq_u8(vhaddq_u8(top, top2), top1);
+  int remaining = 15;
+  (void)left;
+  vst1q_u8(dst, cur);  // row 0
+  do {  // rows 1..15: previous row advanced by one lane, padded with last
+    dst += stride;
+    cur = vextq_u8(cur, last, 1);
+    vst1q_u8(dst, cur);
+  } while (--remaining);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);  // X = above[-1], A.. = above
+  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);  // X A B C into bytes 4..7
+  const uint32x2_t zero = vdup_n_u32(0);
+  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
+  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);  // I..L = left[0..3]
+  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
+  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);  // edge, L first
+  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+  const uint8_t D = vget_lane_u8(XABCD_u8, 4);  // above[3]
+  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);  // outer two taps
+  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);  // blend in the middle tap
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);  // bottom row: bytes 0..3
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);  // top row: bytes 3..6
+  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+}
+
+#if !HAVE_NEON_ASM
+
+void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  // Vertical prediction: every row of the 4x4 block copies above[0..3].
+  const uint32x2_t top =
+      vld1_lane_u32((const uint32_t *)above, vdup_n_u32(0), 0);
+  int row;
+  (void)left;
+  for (row = 0; row < 4; ++row)
+    vst1_lane_u32((uint32_t *)(dst + row * stride), top, 0);
+}
+
+void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  // Vertical prediction: every row of the 8x8 block copies above[0..7].
+  const uint8x8_t top = vld1_u8(above);
+  int rows_left = 8;
+  (void)left;
+  for (; rows_left > 0; --rows_left, dst += stride) {
+    vst1_u8(dst, top);
+  }
+}
+
+void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  // Vertical prediction: every row of the 16x16 block copies above[0..15].
+  const uint8x16_t top = vld1q_u8(above);
+  int row;
+  (void)left;
+
+  for (row = 0; row != 16; ++row)
+    vst1q_u8(dst + row * stride, top);
+}
+
+void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  // Vertical prediction: every row of the 32x32 block copies above[0..31],
+  // moved as two 16-byte halves.
+  const uint8x16_t top_lo = vld1q_u8(above);
+  const uint8x16_t top_hi = vld1q_u8(above + 16);
+  int row;
+  (void)left;
+  for (row = 0; row < 32; ++row) {
+    uint8_t *const out = dst + row * stride;
+    vst1q_u8(out, top_lo);
+    vst1q_u8(out + 16, top_hi);
+  }
+}
+
+void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  // Horizontal prediction: row r of the 4x4 block is left[r] repeated.
+  // The four left pixels are loaded as one 32-bit lane; the other lane is
+  // zero and never used.
+  const uint8x8_t left_px = vreinterpret_u8_u32(
+      vld1_lane_u32((const uint32_t *)left, vdup_n_u32(0), 0));
+  // Broadcast each of the four loaded bytes across its own vector.
+  const uint32x2_t row0 = vreinterpret_u32_u8(vdup_lane_u8(left_px, 0));
+  const uint32x2_t row1 = vreinterpret_u32_u8(vdup_lane_u8(left_px, 1));
+  const uint32x2_t row2 = vreinterpret_u32_u8(vdup_lane_u8(left_px, 2));
+  const uint32x2_t row3 = vreinterpret_u32_u8(vdup_lane_u8(left_px, 3));
+  (void)above;
+
+  // Each store writes lane 0, i.e. four copies of one left pixel.
+  vst1_lane_u32((uint32_t *)dst, row0, 0);
+  vst1_lane_u32((uint32_t *)(dst + stride), row1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), row2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), row3, 0);
+}
+
+void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  // Horizontal prediction: row r of the 8x8 block is left[r] repeated.
+  const uint8x8_t left_px =
+      vreinterpret_u8_u64(vld1_u64((const uint64_t *)left));
+  uint8x8_t row;
+  (void)above;
+
+  row = vdup_lane_u8(left_px, 0);
+  vst1_u8(dst + 0 * stride, row);
+
+  row = vdup_lane_u8(left_px, 1);
+  vst1_u8(dst + 1 * stride, row);
+
+  row = vdup_lane_u8(left_px, 2);
+  vst1_u8(dst + 2 * stride, row);
+
+  row = vdup_lane_u8(left_px, 3);
+  vst1_u8(dst + 3 * stride, row);
+
+  row = vdup_lane_u8(left_px, 4);
+  vst1_u8(dst + 4 * stride, row);
+
+  row = vdup_lane_u8(left_px, 5);
+  vst1_u8(dst + 5 * stride, row);
+
+  row = vdup_lane_u8(left_px, 6);
+  vst1_u8(dst + 6 * stride, row);
+
+  row = vdup_lane_u8(left_px, 7);
+  vst1_u8(dst + 7 * stride, row);
+}
+
+void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  // Horizontal prediction: row r of the 16x16 block is left[r] repeated.
+  const uint8x16_t left_px = vld1q_u8(left);
+  uint8x8_t px;
+  uint8x16_t row;
+  int half;
+  (void)above;
+  // Lane indices must be constants, so the 8 rows per half are spelled out.
+  for (half = 0; half < 2; ++half) {
+    px = half ? vget_high_u8(left_px) : vget_low_u8(left_px);
+    row = vdupq_lane_u8(px, 0);
+    vst1q_u8(dst, row);
+    dst += stride;
+    row = vdupq_lane_u8(px, 1);
+    vst1q_u8(dst, row);
+    dst += stride;
+    row = vdupq_lane_u8(px, 2);
+    vst1q_u8(dst, row);
+    dst += stride;
+    row = vdupq_lane_u8(px, 3);
+    vst1q_u8(dst, row);
+    dst += stride;
+    row = vdupq_lane_u8(px, 4);
+    vst1q_u8(dst, row);
+    dst += stride;
+    row = vdupq_lane_u8(px, 5);
+    vst1q_u8(dst, row);
+    dst += stride;
+    row = vdupq_lane_u8(px, 6);
+    vst1q_u8(dst, row);
+    dst += stride;
+    row = vdupq_lane_u8(px, 7);
+    vst1q_u8(dst, row);
+    dst += stride;
+  }
+}
+
+void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  // Horizontal prediction: row r of the 32x32 block is left[r] repeated.
+  uint8x16_t left_px, row;
+  uint8x8_t px;
+  int group, half;
+  (void)above;
+
+  for (group = 0; group < 2; ++group, left += 16) {
+    left_px = vld1q_u8(left);  // 16 left pixels per group
+    for (half = 0; half < 2; ++half) {
+      px = half ? vget_high_u8(left_px) : vget_low_u8(left_px);
+      row = vdupq_lane_u8(px, 0);
+      vst1q_u8(dst, row);
+      vst1q_u8(dst + 16, row);
+      dst += stride;
+      row = vdupq_lane_u8(px, 1);
+      vst1q_u8(dst, row);
+      vst1q_u8(dst + 16, row);
+      dst += stride;
+      row = vdupq_lane_u8(px, 2);
+      vst1q_u8(dst, row);
+      vst1q_u8(dst + 16, row);
+      dst += stride;
+      row = vdupq_lane_u8(px, 3);
+      vst1q_u8(dst, row);
+      vst1q_u8(dst + 16, row);
+      dst += stride;
+      row = vdupq_lane_u8(px, 4);
+      vst1q_u8(dst, row);
+      vst1q_u8(dst + 16, row);
+      dst += stride;
+      row = vdupq_lane_u8(px, 5);
+      vst1q_u8(dst, row);
+      vst1q_u8(dst + 16, row);
+      dst += stride;
+      row = vdupq_lane_u8(px, 6);
+      vst1q_u8(dst, row);
+      vst1q_u8(dst + 16, row);
+      dst += stride;
+      row = vdupq_lane_u8(px, 7);
+      vst1q_u8(dst, row);
+      vst1q_u8(dst + 16, row);
+      dst += stride;
+    }
+  }
+}
+
+void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  // pixel(r, c) = left[r] + above[c] - above[-1], clamped to [0, 255] by the
+  // saturating narrow.
+  const uint8x8_t top_left = vld1_dup_u8(above - 1);
+  const uint8x8_t top = vreinterpret_u8_u32(
+      vld1_lane_u32((const uint32_t *)above, vdup_n_u32(0), 0));
+  // above[c] - above[-1] as 16-bit values, treated as signed from here on.
+  const int16x8_t delta = vreinterpretq_s16_u16(vsubl_u8(top, top_left));
+  int row;
+
+  for (row = 0; row < 4; ++row) {
+    const int16x8_t base =
+        vreinterpretq_s16_u16(vdupq_n_u16((uint16_t)left[row]));
+    const uint8x8_t px = vqmovun_s16(vaddq_s16(base, delta));
+    vst1_lane_u32((uint32_t *)(dst + row * stride),
+                  vreinterpret_u32_u8(px), 0);
+  }
+}
+
+void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int j;
+  uint16x8_t q0u16, q3u16, q10u16;
+  int16x8_t q0s16;
+  uint16x4_t d20u16;
+  uint8x8_t d0u8, d2u8, d30u8;
+  // pixel(r, c) = left[r] + above[c] - above[-1], saturated to 8 bits.
+  d0u8 = vld1_dup_u8(above - 1);  // above[-1] in every lane
+  d30u8 = vld1_u8(left);
+  d2u8 = vld1_u8(above);
+  q10u16 = vmovl_u8(d30u8);  // left[0..7] widened to 16 bits
+  q3u16 = vsubl_u8(d2u8, d0u8);  // above[c] - above[-1] (16-bit wraparound)
+  d20u16 = vget_low_u16(q10u16);  // left[0..3]; the loop step selects left[4..7]
+  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+    q0u16 = vdupq_lane_u16(d20u16, 0);  // lane index must be a constant
+    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                      vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);  // signed 16-bit -> unsigned 8-bit, saturating
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+    q0u16 = vdupq_lane_u16(d20u16, 1);
+    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                      vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+    q0u16 = vdupq_lane_u16(d20u16, 2);
+    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                      vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+    q0u16 = vdupq_lane_u16(d20u16, 3);
+    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                      vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+  }
+}
+
+void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
+  uint8x16_t q0u8, q1u8;
+  int16x8_t q0s16, q1s16, q8s16, q11s16;
+  uint16x4_t d20u16;
+  uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
+  // pixel(r, c) = left[r] + above[c] - above[-1], saturated to 8 bits.
+  q0u8 = vld1q_dup_u8(above - 1);  // above[-1] in every lane
+  q1u8 = vld1q_u8(above);
+  q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));  // columns 0..7
+  q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));  // columns 8..15
+  for (k = 0; k < 2; k++, left += 8) {  // 8 left pixels per outer pass
+    d18u8 = vld1_u8(left);
+    q10u16 = vmovl_u8(d18u8);
+    d20u16 = vget_low_u16(q10u16);
+    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+      q0u16 = vdupq_lane_u16(d20u16, 0);  // left pixel for the first row
+      q8u16 = vdupq_lane_u16(d20u16, 1);  // left pixel for the second row
+      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                        vreinterpretq_s16_u16(q2u16));
+      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                        vreinterpretq_s16_u16(q3u16));
+      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                         vreinterpretq_s16_u16(q2u16));
+      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                        vreinterpretq_s16_u16(q3u16));
+      d2u8 = vqmovun_s16(q1s16);  // saturate each sum to [0, 255]
+      d3u8 = vqmovun_s16(q0s16);
+      d22u8 = vqmovun_s16(q11s16);
+      d23u8 = vqmovun_s16(q8s16);
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+      dst += stride;
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+      dst += stride;
+      // Same two-row step for lanes 2 and 3.
+      q0u16 = vdupq_lane_u16(d20u16, 2);
+      q8u16 = vdupq_lane_u16(d20u16, 3);
+      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                        vreinterpretq_s16_u16(q2u16));
+      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                        vreinterpretq_s16_u16(q3u16));
+      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                         vreinterpretq_s16_u16(q2u16));
+      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                        vreinterpretq_s16_u16(q3u16));
+      d2u8 = vqmovun_s16(q1s16);
+      d3u8 = vqmovun_s16(q0s16);
+      d22u8 = vqmovun_s16(q11s16);
+      d23u8 = vqmovun_s16(q8s16);
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+      dst += stride;
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+      dst += stride;
+    }
+  }
+}
+
+void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
+  uint8x16_t q0u8, q1u8, q2u8;
+  int16x8_t q12s16, q13s16, q14s16, q15s16;
+  uint16x4_t d6u16;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
+  // pixel(r, c) = left[r] + above[c] - above[-1], saturated to 8 bits.
+  q0u8 = vld1q_dup_u8(above - 1);  // above[-1] in every lane
+  q1u8 = vld1q_u8(above);
+  q2u8 = vld1q_u8(above + 16);
+  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));  // columns 0..7
+  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));  // columns 8..15
+  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));  // columns 16..23
+  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));  // columns 24..31
+  for (k = 0; k < 4; k++, left += 8) {  // 8 left pixels per outer pass
+    d26u8 = vld1_u8(left);
+    q3u16 = vmovl_u8(d26u8);
+    d6u16 = vget_low_u16(q3u16);
+    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
+      q0u16 = vdupq_lane_u16(d6u16, 0);  // lane index must be a constant
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);  // saturate each sum to [0, 255]
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+      // Next row: lane 1.
+      q0u16 = vdupq_lane_u16(d6u16, 1);
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+      // Next row: lane 2.
+      q0u16 = vdupq_lane_u16(d6u16, 2);
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+      // Next row: lane 3.
+      q0u16 = vdupq_lane_u16(d6u16, 3);
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+    }
+  }
+}
+#endif  // !HAVE_NEON_ASM
diff --git a/libs/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm
new file mode 100644
index 0000000000..115790d480
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm
@@ -0,0 +1,630 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_v_predictor_4x4_neon|
+    EXPORT  |vpx_v_predictor_8x8_neon|
+    EXPORT  |vpx_v_predictor_16x16_neon|
+    EXPORT  |vpx_v_predictor_32x32_neon|
+    EXPORT  |vpx_h_predictor_4x4_neon|
+    EXPORT  |vpx_h_predictor_8x8_neon|
+    EXPORT  |vpx_h_predictor_16x16_neon|
+    EXPORT  |vpx_h_predictor_32x32_neon|
+    EXPORT  |vpx_tm_predictor_4x4_neon|
+    EXPORT  |vpx_tm_predictor_8x8_neon|
+    EXPORT  |vpx_tm_predictor_16x16_neon|
+    EXPORT  |vpx_tm_predictor_32x32_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                              const uint8_t *above,
+;                              const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left (not read by this predictor)
+; Vertical prediction: every one of the 4 rows of dst is a copy of above[0..3].
+|vpx_v_predictor_4x4_neon| PROC
+    vld1.32             {d0[0]}, [r2]       ; d0[0] = above[0..3]
+    vst1.32             {d0[0]}, [r0], r1   ; row 0, then dst += y_stride
+    vst1.32             {d0[0]}, [r0], r1   ; row 1
+    vst1.32             {d0[0]}, [r0], r1   ; row 2
+    vst1.32             {d0[0]}, [r0], r1   ; row 3
+    bx                  lr
+    ENDP                ; |vpx_v_predictor_4x4_neon|
+
+;void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                              const uint8_t *above,
+;                              const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left (not read by this predictor)
+; Vertical prediction: every one of the 8 rows of dst is a copy of above[0..7].
+|vpx_v_predictor_8x8_neon| PROC
+    vld1.8              {d0}, [r2]          ; d0 = above[0..7]
+    vst1.8              {d0}, [r0], r1      ; row 0, then dst += y_stride
+    vst1.8              {d0}, [r0], r1      ; row 1
+    vst1.8              {d0}, [r0], r1      ; row 2
+    vst1.8              {d0}, [r0], r1      ; row 3
+    vst1.8              {d0}, [r0], r1      ; row 4
+    vst1.8              {d0}, [r0], r1      ; row 5
+    vst1.8              {d0}, [r0], r1      ; row 6
+    vst1.8              {d0}, [r0], r1      ; row 7
+    bx                  lr
+    ENDP                ; |vpx_v_predictor_8x8_neon|
+
+;void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left (not read by this predictor)
+; Vertical prediction: every one of the 16 rows of dst copies above[0..15].
+|vpx_v_predictor_16x16_neon| PROC
+    vld1.8              {q0}, [r2]          ; q0 = above[0..15]
+    vst1.8              {q0}, [r0], r1      ; row 0, then dst += y_stride
+    vst1.8              {q0}, [r0], r1      ; row 1
+    vst1.8              {q0}, [r0], r1      ; row 2
+    vst1.8              {q0}, [r0], r1      ; row 3
+    vst1.8              {q0}, [r0], r1      ; row 4
+    vst1.8              {q0}, [r0], r1      ; row 5
+    vst1.8              {q0}, [r0], r1      ; row 6
+    vst1.8              {q0}, [r0], r1      ; row 7
+    vst1.8              {q0}, [r0], r1      ; row 8
+    vst1.8              {q0}, [r0], r1      ; row 9
+    vst1.8              {q0}, [r0], r1      ; row 10
+    vst1.8              {q0}, [r0], r1      ; row 11
+    vst1.8              {q0}, [r0], r1      ; row 12
+    vst1.8              {q0}, [r0], r1      ; row 13
+    vst1.8              {q0}, [r0], r1      ; row 14
+    vst1.8              {q0}, [r0], r1      ; row 15
+    bx                  lr
+    ENDP                ; |vpx_v_predictor_16x16_neon|
+
+;void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above (reused as the loop counter once loaded)
+; r3  const uint8_t *left (not read by this predictor)
+; Vertical prediction: every one of the 32 rows of dst copies above[0..31].
+|vpx_v_predictor_32x32_neon| PROC
+    vld1.8              {q0, q1}, [r2]      ; q0:q1 = above[0..31]
+    mov                 r2, #2              ; 2 passes of 16 rows each
+loop_v
+    vst1.8              {q0, q1}, [r0], r1  ; store a row, then dst += y_stride
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    vst1.8              {q0, q1}, [r0], r1
+    subs                r2, r2, #1          ; one pass done
+    bgt                 loop_v              ; repeat while passes remain
+    bx                  lr
+    ENDP                ; |vpx_v_predictor_32x32_neon|
+
+;void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                              const uint8_t *above,
+;                              const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above (not read by this predictor)
+; r3  const uint8_t *left
+; Horizontal prediction: row r of dst is filled with the single byte left[r].
+|vpx_h_predictor_4x4_neon| PROC
+    vld1.32             {d1[0]}, [r3]       ; d1 bytes 0..3 = left[0..3]
+    vdup.8              d0, d1[0]           ; broadcast left[0]
+    vst1.32             {d0[0]}, [r0], r1   ; store 4 bytes, dst += y_stride
+    vdup.8              d0, d1[1]           ; broadcast left[1]
+    vst1.32             {d0[0]}, [r0], r1
+    vdup.8              d0, d1[2]           ; broadcast left[2]
+    vst1.32             {d0[0]}, [r0], r1
+    vdup.8              d0, d1[3]           ; broadcast left[3]
+    vst1.32             {d0[0]}, [r0], r1
+    bx                  lr
+    ENDP                ; |vpx_h_predictor_4x4_neon|
+
+;void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                              const uint8_t *above,
+;                              const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above (not read by this predictor)
+; r3  const uint8_t *left
+; Horizontal prediction: row r of dst is filled with the single byte left[r].
+|vpx_h_predictor_8x8_neon| PROC
+    vld1.64             {d1}, [r3]          ; d1 = left[0..7]
+    vdup.8              d0, d1[0]           ; broadcast left[0]
+    vst1.64             {d0}, [r0], r1      ; store 8 bytes, dst += y_stride
+    vdup.8              d0, d1[1]           ; broadcast left[1]
+    vst1.64             {d0}, [r0], r1
+    vdup.8              d0, d1[2]           ; broadcast left[2]
+    vst1.64             {d0}, [r0], r1
+    vdup.8              d0, d1[3]           ; broadcast left[3]
+    vst1.64             {d0}, [r0], r1
+    vdup.8              d0, d1[4]           ; broadcast left[4]
+    vst1.64             {d0}, [r0], r1
+    vdup.8              d0, d1[5]           ; broadcast left[5]
+    vst1.64             {d0}, [r0], r1
+    vdup.8              d0, d1[6]           ; broadcast left[6]
+    vst1.64             {d0}, [r0], r1
+    vdup.8              d0, d1[7]           ; broadcast left[7]
+    vst1.64             {d0}, [r0], r1
+    bx                  lr
+    ENDP                ; |vpx_h_predictor_8x8_neon|
+
+;void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above (not read by this predictor)
+; r3  const uint8_t *left
+; Horizontal prediction: row r of dst is filled with the single byte left[r].
+|vpx_h_predictor_16x16_neon| PROC
+    vld1.8              {q1}, [r3]          ; d2 = left[0..7], d3 = left[8..15]
+    vdup.8              q0, d2[0]           ; broadcast left[0]
+    vst1.8              {q0}, [r0], r1      ; store 16 bytes, dst += y_stride
+    vdup.8              q0, d2[1]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[2]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[3]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[4]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[5]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[6]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[7]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[0]           ; broadcast left[8]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[1]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[2]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[3]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[4]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[5]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[6]
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[7]           ; broadcast left[15]
+    vst1.8              {q0}, [r0], r1
+    bx                  lr
+    ENDP                ; |vpx_h_predictor_16x16_neon|
+
+;void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above (not read; r2 is reused as the loop counter)
+; r3  const uint8_t *left
+; Horizontal prediction: row r of dst is filled with the single byte left[r].
+|vpx_h_predictor_32x32_neon| PROC
+    sub                 r1, r1, #16         ; 1st store of a row advances r0 by 16
+    mov                 r2, #2              ; 2 passes of 16 rows each
+loop_h
+    vld1.8              {q1}, [r3]!         ; next 16 left pixels, left += 16
+    vdup.8              q0, d2[0]           ; broadcast one left pixel
+    vst1.8              {q0}, [r0]!         ; columns 0..15, r0 += 16
+    vst1.8              {q0}, [r0], r1      ; columns 16..31, r0 -> next row
+    vdup.8              q0, d2[1]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[2]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[3]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[4]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[5]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[6]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d2[7]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[0]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[1]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[2]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[3]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[4]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[5]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[6]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    vdup.8              q0, d3[7]
+    vst1.8              {q0}, [r0]!
+    vst1.8              {q0}, [r0], r1
+    subs                r2, r2, #1          ; one pass done
+    bgt                 loop_h              ; repeat while passes remain
+    bx                  lr
+    ENDP                ; |vpx_h_predictor_32x32_neon|
+
+;void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+; TM prediction: dst[r][c] = left[r] + above[c] - above[-1], saturated to u8.
+|vpx_tm_predictor_4x4_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    vld1.u8             {d0[]}, [r12]       ; replicate above[-1] to all lanes
+
+    ; Load above 4 pixels
+    vld1.32             {d2[0]}, [r2]
+
+    ; Compute above - ytop_left
+    vsubl.u8            q3, d2, d0          ; widened to 16 bit per lane
+
+    ; Load left row by row and compute left + (above - ytop_left)
+    ; 1st row and 2nd row
+    vld1.u8             {d2[]}, [r3]!       ; replicate left[0], left++
+    vld1.u8             {d4[]}, [r3]!       ; replicate left[1], left++
+    vmovl.u8            q1, d2
+    vmovl.u8            q2, d4
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqmovun.s16         d0, q1              ; saturate s16 -> u8
+    vqmovun.s16         d1, q2
+    vst1.32             {d0[0]}, [r0], r1   ; only the low 4 bytes are stored
+    vst1.32             {d1[0]}, [r0], r1
+
+    ; 3rd row and 4th row
+    vld1.u8             {d2[]}, [r3]!       ; replicate left[2], left++
+    vld1.u8             {d4[]}, [r3]        ; replicate left[3]
+    vmovl.u8            q1, d2
+    vmovl.u8            q2, d4
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqmovun.s16         d0, q1
+    vqmovun.s16         d1, q2
+    vst1.32             {d0[0]}, [r0], r1
+    vst1.32             {d1[0]}, [r0], r1
+    bx                  lr
+    ENDP                ; |vpx_tm_predictor_4x4_neon|
+
+;void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+; TM prediction: dst[r][c] = left[r] + above[c] - above[-1], saturated to u8.
+|vpx_tm_predictor_8x8_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    vld1.8              {d0[]}, [r12]       ; replicate above[-1] to all lanes
+
+    ; preload 8 left
+    vld1.8              {d30}, [r3]
+
+    ; Load above 8 pixels
+    vld1.64             {d2}, [r2]
+
+    vmovl.u8            q10, d30            ; d20 = left[0..3], d21 = left[4..7]
+
+    ; Compute above - ytop_left
+    vsubl.u8            q3, d2, d0          ; widened to 16 bit per lane
+
+    ; Broadcast each preloaded left pixel and add (above - ytop_left)
+    ; 1st row and 2nd row
+    vdup.16             q0, d20[0]
+    vdup.16             q1, d20[1]
+    vadd.s16            q0, q3, q0
+    vadd.s16            q1, q3, q1
+
+    ; 3rd row and 4th row
+    vdup.16             q8, d20[2]
+    vdup.16             q9, d20[3]
+    vadd.s16            q8, q3, q8
+    vadd.s16            q9, q3, q9
+
+    vqmovun.s16         d0, q0              ; saturate s16 -> u8
+    vqmovun.s16         d1, q1
+    vqmovun.s16         d2, q8
+    vqmovun.s16         d3, q9
+
+    vst1.64             {d0}, [r0], r1      ; store a row, dst += y_stride
+    vst1.64             {d1}, [r0], r1
+    vst1.64             {d2}, [r0], r1
+    vst1.64             {d3}, [r0], r1
+
+    ; 5th row and 6th row
+    vdup.16             q0, d21[0]
+    vdup.16             q1, d21[1]
+    vadd.s16            q0, q3, q0
+    vadd.s16            q1, q3, q1
+
+    ; 7th row and 8th row
+    vdup.16             q8, d21[2]
+    vdup.16             q9, d21[3]
+    vadd.s16            q8, q3, q8
+    vadd.s16            q9, q3, q9
+
+    vqmovun.s16         d0, q0
+    vqmovun.s16         d1, q1
+    vqmovun.s16         d2, q8
+    vqmovun.s16         d3, q9
+
+    vst1.64             {d0}, [r0], r1
+    vst1.64             {d1}, [r0], r1
+    vst1.64             {d2}, [r0], r1
+    vst1.64             {d3}, [r0], r1
+
+    bx                  lr
+    ENDP                ; |vpx_tm_predictor_8x8_neon|
+
+;void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above (reused as the loop counter once loaded)
+; r3  const uint8_t *left
+; TM prediction: dst[r][c] = left[r] + above[c] - above[-1], saturated to u8.
+|vpx_tm_predictor_16x16_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    vld1.8              {d0[]}, [r12]       ; replicate above[-1] to all lanes
+
+    ; Load above 16 pixels
+    vld1.8              {q1}, [r2]
+
+    ; preload 8 left pixels into d18
+    vld1.8              {d18}, [r3]!
+
+    ; Compute above - ytop_left
+    vsubl.u8            q2, d2, d0          ; columns 0..7, widened to 16 bit
+    vsubl.u8            q3, d3, d0          ; columns 8..15, widened to 16 bit
+
+    vmovl.u8            q10, d18            ; d20/d21 = 8 left pixels as u16
+
+    ; Broadcast each left pixel and compute left + (above - ytop_left)
+    ; Process 8 rows in each single loop and loop 2 times to process 16 rows.
+    mov                 r2, #2
+
+loop_16x16_neon
+    ; Process two rows.
+    vdup.16             q0, d20[0]
+    vdup.16             q8, d20[1]
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqmovun.s16         d2, q1              ; saturate s16 -> u8
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
+    vdup.16             q0, d20[2]                  ; preload next 2 rows data
+    vdup.16             q8, d20[3]
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
+    vdup.16             q0, d21[0]                  ; preload next 2 rows data
+    vdup.16             q8, d21[1]
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
+    vdup.16             q0, d21[2]                  ; preload next 2 rows data
+    vdup.16             q8, d21[3]
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+
+
+    vadd.s16            q1, q0, q2
+    vadd.s16            q0, q0, q3
+    vadd.s16            q11, q8, q2
+    vadd.s16            q8, q8, q3
+    vqmovun.s16         d2, q1
+    vqmovun.s16         d3, q0
+    vqmovun.s16         d22, q11
+    vqmovun.s16         d23, q8
+    vld1.8              {d18}, [r3]!                  ; preload next 8 left into d18
+    vmovl.u8            q10, d18
+    vst1.64             {d2,d3}, [r0], r1
+    vst1.64             {d22,d23}, [r0], r1
+    ; NOTE(review): the final pass also loads left[16..23] (unused) - confirm OK.
+    subs                r2, r2, #1
+    bgt                 loop_16x16_neon
+
+    bx                  lr
+    ENDP                ; |vpx_tm_predictor_16x16_neon|
+
+;void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                  const uint8_t *above,
+;                                  const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above (reused as the loop counter once loaded)
+; r3  const uint8_t *left
+; TM prediction: dst[r][c] = left[r] + above[c] - above[-1], saturated to u8.
+|vpx_tm_predictor_32x32_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    vld1.8              {d0[]}, [r12]       ; replicate above[-1] to all lanes
+
+    ; Load above 32 pixels
+    vld1.8              {q1}, [r2]!
+    vld1.8              {q2}, [r2]
+
+    ; preload 8 left pixels
+    vld1.8              {d26}, [r3]!
+
+    ; Compute above - ytop_left
+    vsubl.u8            q8, d2, d0          ; columns 0..7, widened to 16 bit
+    vsubl.u8            q9, d3, d0          ; columns 8..15
+    vsubl.u8            q10, d4, d0         ; columns 16..23
+    vsubl.u8            q11, d5, d0         ; columns 24..31
+
+    vmovl.u8            q3, d26             ; d6/d7 = 8 left pixels as u16
+
+    ; Broadcast each left pixel and compute left + (above - ytop_left)
+    ; Process 8 rows in each single loop and loop 4 times to process 32 rows.
+    mov                 r2, #4
+
+loop_32x32_neon
+    ; Process two rows.
+    vdup.16             q0, d6[0]
+    vdup.16             q2, d6[1]
+    vadd.s16            q12, q0, q8
+    vadd.s16            q13, q0, q9
+    vadd.s16            q14, q0, q10
+    vadd.s16            q15, q0, q11
+    vqmovun.s16         d0, q12             ; saturate s16 -> u8
+    vqmovun.s16         d1, q13
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1   ; store 32 bytes, dst += y_stride
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
+    vdup.16             q1, d6[2]           ; preload next 2 rows data
+    vdup.16             q2, d6[3]
+    vst1.64             {d24-d27}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q12, q1, q8
+    vadd.s16            q13, q1, q9
+    vadd.s16            q14, q1, q10
+    vadd.s16            q15, q1, q11
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
+    vdup.16             q0, d7[0]           ; preload next 2 rows data
+    vdup.16             q2, d7[1]
+    vst1.64             {d24-d27}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q12, q0, q8
+    vadd.s16            q13, q0, q9
+    vadd.s16            q14, q0, q10
+    vadd.s16            q15, q0, q11
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
+    vdup.16             q0, d7[2]           ; preload next 2 rows data
+    vdup.16             q2, d7[3]
+    vst1.64             {d24-d27}, [r0], r1
+
+    ; Process two rows.
+    vadd.s16            q12, q0, q8
+    vadd.s16            q13, q0, q9
+    vadd.s16            q14, q0, q10
+    vadd.s16            q15, q0, q11
+    vqmovun.s16         d0, q12
+    vqmovun.s16         d1, q13
+    vadd.s16            q12, q2, q8
+    vadd.s16            q13, q2, q9
+    vqmovun.s16         d2, q14
+    vqmovun.s16         d3, q15
+    vadd.s16            q14, q2, q10
+    vadd.s16            q15, q2, q11
+    vst1.64             {d0-d3}, [r0], r1
+    vqmovun.s16         d24, q12
+    vqmovun.s16         d25, q13
+    vld1.8              {d0}, [r3]!                   ; preload 8 left pixels
+    vqmovun.s16         d26, q14
+    vqmovun.s16         d27, q15
+    vmovl.u8            q3, d0
+    vst1.64             {d24-d27}, [r0], r1
+    ; NOTE(review): the final pass also loads left[32..39] (unused) - confirm OK.
+    subs                r2, r2, #1
+    bgt                 loop_32x32_neon
+
+    bx                  lr
+    ENDP                ; |vpx_tm_predictor_32x32_neon|
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm b/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm
new file mode 100644
index 0000000000..5a8fdd6aff
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm
@@ -0,0 +1,199 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_lpf_horizontal_4_dual_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
+;                                    const uint8_t *blimit0,
+;                                    const uint8_t *limit0,
+;                                    const uint8_t *thresh0,
+;                                    const uint8_t *blimit1,
+;                                    const uint8_t *limit1,
+;                                    const uint8_t *thresh1)
+; r0    uint8_t *s,
+; r1    int p,
+; r2    const uint8_t *blimit0,
+; r3    const uint8_t *limit0,
+; sp    const uint8_t *thresh0,
+; sp+4  const uint8_t *blimit1,
+; sp+8  const uint8_t *limit1,
+; sp+12 const uint8_t *thresh1,
+; Filters 16 columns across the edge at s: set 0 -> lanes 0-7, set 1 -> 8-15.
+|vpx_lpf_horizontal_4_dual_neon| PROC
+    push        {lr}                       ; stack args now sit 4 bytes higher
+
+    ldr         r12, [sp, #4]              ; load thresh0
+    vld1.8      {d0}, [r2]                 ; load blimit0 to first half q
+    vld1.8      {d2}, [r3]                 ; load limit0 to first half q
+
+    add         r1, r1, r1                 ; double pitch
+    ldr         r2, [sp, #8]               ; load blimit1
+
+    vld1.8      {d4}, [r12]                ; load thresh0 to first half q
+
+    ldr         r3, [sp, #12]              ; load limit1
+    ldr         r12, [sp, #16]             ; load thresh1
+    vld1.8      {d1}, [r2]                 ; load blimit1 to 2nd half q
+
+    sub         r2, r0, r1, lsl #1         ; s[-4 * p]
+
+    vld1.8      {d3}, [r3]                 ; load limit1 to 2nd half q
+    vld1.8      {d5}, [r12]                ; load thresh1 to 2nd half q
+
+    vpush       {d8-d15}                   ; save neon registers
+
+    add         r3, r2, r1, lsr #1         ; s[-3 * p]
+
+    vld1.u8     {q3}, [r2@64], r1          ; p3
+    vld1.u8     {q4}, [r3@64], r1          ; p2
+    vld1.u8     {q5}, [r2@64], r1          ; p1
+    vld1.u8     {q6}, [r3@64], r1          ; p0
+    vld1.u8     {q7}, [r2@64], r1          ; q0
+    vld1.u8     {q8}, [r3@64], r1          ; q1
+    vld1.u8     {q9}, [r2@64]              ; q2
+    vld1.u8     {q10}, [r3@64]             ; q3
+
+    sub         r2, r2, r1, lsl #1         ; r2 = s[-2 * p] (p1 row)
+    sub         r3, r3, r1, lsl #1         ; r3 = s[-1 * p] (p0 row)
+
+    bl          vpx_loop_filter_neon_16    ; results returned in q5-q8
+
+    vst1.u8     {q5}, [r2@64], r1          ; store op1
+    vst1.u8     {q6}, [r3@64], r1          ; store op0
+    vst1.u8     {q7}, [r2@64], r1          ; store oq0
+    vst1.u8     {q8}, [r3@64], r1          ; store oq1
+
+    vpop        {d8-d15}                   ; restore neon registers
+
+    pop         {pc}
+    ENDP        ; |vpx_lpf_horizontal_4_dual_neon|
+
+; void vpx_loop_filter_neon_16();
+; This is a helper function for the loopfilters. The individual functions do
+; the necessary load, transpose (if necessary) and store. This function uses
+; registers d8-d15, so the calling function must save those registers.
+;
+; r0-r3, r12 PRESERVE
+; q0    blimit
+; q1    limit
+; q2    thresh
+; q3    p3
+; q4    p2
+; q5    p1
+; q6    p0
+; q7    q0
+; q8    q1
+; q9    q2
+; q10   q3
+;
+; Outputs:
+; q5    op1
+; q6    op0
+; q7    oq0
+; q8    oq1
+|vpx_loop_filter_neon_16| PROC
+
+    ; filter_mask
+    vabd.u8     q11, q3, q4                 ; m1 = abs(p3 - p2)
+    vabd.u8     q12, q4, q5                 ; m2 = abs(p2 - p1)
+    vabd.u8     q13, q5, q6                 ; m3 = abs(p1 - p0)
+    vabd.u8     q14, q8, q7                 ; m4 = abs(q1 - q0)
+    vabd.u8     q3, q9, q8                  ; m5 = abs(q2 - q1)
+    vabd.u8     q4, q10, q9                 ; m6 = abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     q11, q11, q12               ; m7 = max(m1, m2)
+    vmax.u8     q12, q13, q14               ; m8 = max(m3, m4)
+
+    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
+
+    vmax.u8     q3, q3, q4                  ; m9 = max(m5, m6)
+
+    vmov.u8     q10, #0x80                  ; bias used to flip u8 <-> s8
+
+    vmax.u8     q15, q11, q12               ; m10 = max(m7, m8)
+
+    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     q15, q15, q3                ; m11 = max(m10, m9)
+
+    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
+    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
+
+    veor        q7, q7, q10                 ; qs0
+
+    vcge.u8     q15, q1, q15                ; (limit >= m11)*-1
+
+    vshr.u8     q2, q2, #1                  ; a = a / 2
+    veor        q6, q6, q10                 ; ps0
+
+    veor        q5, q5, q10                 ; ps1
+    vqadd.u8    q9, q9, q2                  ; a = b + a
+
+    veor        q8, q8, q10                 ; qs1
+
+    vmov.u16    q4, #3
+
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q11, d15, d13
+
+    vcge.u8     q9, q0, q9                  ; (blimit >= a)*-1
+
+    vqsub.s8    q1, q5, q8                  ; filter = clamp(ps1-qs1)
+    vorr        q14, q13, q14               ; hev
+
+    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
+    vmul.i16    q11, q11, q4
+
+    vand        q1, q1, q14                 ; filter &= hev
+    vand        q15, q15, q9                ; mask
+
+    vmov.u8     q4, #3
+
+    vaddw.s8    q2, q2, d2                  ; filter + 3 * (qs0 - ps0)
+    vaddw.s8    q11, q11, d3
+
+    vmov.u8     q9, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d2, q2
+    vqmovn.s16  d3, q11
+    vand        q1, q1, q15                 ; filter &= mask
+
+    vqadd.s8    q2, q1, q4                  ; filter2 = clamp(filter+3)
+    vqadd.s8    q1, q1, q9                  ; filter1 = clamp(filter+4)
+    vshr.s8     q2, q2, #3                  ; filter2 >>= 3
+    vshr.s8     q1, q1, #3                  ; filter1 >>= 3
+
+
+    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + filter2)
+    vqsub.s8    q0, q7, q1                  ; u = clamp(qs0 - filter1)
+
+    ; outer tap adjustments
+    vrshr.s8    q1, q1, #1                  ; filter = ++filter1 >> 1
+
+    veor        q7, q0,  q10                ; *oq0 = u^0x80
+
+    vbic        q1, q1, q14                 ; filter &= ~hev
+
+    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + filter)
+    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - filter)
+
+    veor        q6, q11, q10                ; *op0 = u^0x80
+    veor        q5, q13, q10                ; *op1 = u^0x80
+    veor        q8, q12, q10                ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |vpx_loop_filter_neon_16|
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.c b/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.c
new file mode 100644
index 0000000000..d24e6adc8a
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.c
@@ -0,0 +1,179 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE void loop_filter_neon_16(
+        uint8x16_t qblimit,  // blimit
+        uint8x16_t qlimit,   // limit
+        uint8x16_t qthresh,  // thresh
+        uint8x16_t q3,       // p3
+        uint8x16_t q4,       // p2
+        uint8x16_t q5,       // p1
+        uint8x16_t q6,       // p0
+        uint8x16_t q7,       // q0
+        uint8x16_t q8,       // q1
+        uint8x16_t q9,       // q2
+        uint8x16_t q10,      // q3
+        uint8x16_t *q5r,     // out: filtered p1
+        uint8x16_t *q6r,     // out: filtered p0
+        uint8x16_t *q7r,     // out: filtered q0
+        uint8x16_t *q8r) {   // out: filtered q1
+    uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int16x8_t q2s16, q11s16;
+    uint16x8_t q4u16;
+    int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
+    int8x8_t d2s8, d3s8;
+    // 16-lane loop filter core. First: absolute differences of adjacent taps.
+    q11u8 = vabdq_u8(q3, q4);   // m1 = abs(p3 - p2)
+    q12u8 = vabdq_u8(q4, q5);   // m2 = abs(p2 - p1)
+    q13u8 = vabdq_u8(q5, q6);   // m3 = abs(p1 - p0)
+    q14u8 = vabdq_u8(q8, q7);   // m4 = abs(q1 - q0)
+    q3 = vabdq_u8(q9, q8);      // m5 = abs(q2 - q1)
+    q4 = vabdq_u8(q10, q9);     // m6 = abs(q3 - q2)
+
+    q11u8 = vmaxq_u8(q11u8, q12u8);  // max(m1, m2)
+    q12u8 = vmaxq_u8(q13u8, q14u8);  // max(m3, m4)
+    q3 = vmaxq_u8(q3, q4);           // max(m5, m6)
+    q15u8 = vmaxq_u8(q11u8, q12u8);  // max(m1..m4)
+
+    q9 = vabdq_u8(q6, q7);  // abs(p0 - q0)
+
+    // vp8_hevmask
+    q13u8 = vcgtq_u8(q13u8, qthresh);  // abs(p1 - p0) > thresh
+    q14u8 = vcgtq_u8(q14u8, qthresh);  // abs(q1 - q0) > thresh
+    q15u8 = vmaxq_u8(q15u8, q3);       // max(m1..m6)
+
+    q2u8 = vabdq_u8(q5, q8);  // a = abs(p1 - q1)
+    q9 = vqaddq_u8(q9, q9);   // b = abs(p0 - q0) * 2, saturating
+
+    q15u8 = vcgeq_u8(qlimit, q15u8);  // limit >= max(m1..m6)
+
+    // vp8_filter() function
+    // convert to signed
+    q10 = vdupq_n_u8(0x80);
+    q8 = veorq_u8(q8, q10);  // qs1
+    q7 = veorq_u8(q7, q10);  // qs0
+    q6 = veorq_u8(q6, q10);  // ps0
+    q5 = veorq_u8(q5, q10);  // ps1
+
+    q2u8 = vshrq_n_u8(q2u8, 1);  // a / 2
+    q9 = vqaddq_u8(q9, q2u8);    // b + a / 2, saturating
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6)));
+    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                      vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+    q9 = vcgeq_u8(qblimit, q9);  // blimit >= b + a / 2
+
+    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+                    vreinterpretq_s8_u8(q8));
+
+    q14u8 = vorrq_u8(q13u8, q14u8);  // hev
+
+    q4u16 = vdupq_n_u16(3);
+    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);  // filter &= hev
+    q15u8 = vandq_u8(q15u8, q9);                        // mask
+
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+    q4 = vdupq_n_u8(3);
+    q9 = vdupq_n_u8(4);
+    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    d2s8 = vqmovn_s16(q2s16);
+    d3s8 = vqmovn_s16(q11s16);
+    q1s8 = vcombine_s8(d2s8, d3s8);
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);  // filter &= mask
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+
+    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));  // filter2 = filter + 3
+    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));  // filter1 = filter + 4
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q1s8 = vshrq_n_s8(q1s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);  // ps0 + filter2
+    q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);   // qs0 - filter1
+
+    q1s8 = vrshrq_n_s8(q1s8, 1);  // rounding shift for the outer taps
+    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));  // filter &= ~hev
+
+    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);  // ps1 + filter
+    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);  // qs1 - filter
+
+    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);  // back to unsigned
+    *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8),  q10);
+    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
+    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
+    return;
+}
+
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
+                                    const uint8_t *blimit0,
+                                    const uint8_t *limit0,
+                                    const uint8_t *thresh0,
+                                    const uint8_t *blimit1,
+                                    const uint8_t *limit1,
+                                    const uint8_t *thresh1) {
+    uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
+    uint8x16_t qblimit, qlimit, qthresh;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+    // Filter 16 columns across the edge at s: set 0 -> lanes 0-7, set 1 -> 8-15.
+    dblimit0 = vld1_u8(blimit0);
+    dlimit0 = vld1_u8(limit0);
+    dthresh0 = vld1_u8(thresh0);
+    dblimit1 = vld1_u8(blimit1);
+    dlimit1 = vld1_u8(limit1);
+    dthresh1 = vld1_u8(thresh1);
+    qblimit = vcombine_u8(dblimit0, dblimit1);
+    qlimit = vcombine_u8(dlimit0, dlimit1);
+    qthresh = vcombine_u8(dthresh0, dthresh1);
+
+    s -= (p << 2);  // step back four rows to the p3 row
+
+    q3u8 = vld1q_u8(s);  // p3
+    s += p;
+    q4u8 = vld1q_u8(s);  // p2
+    s += p;
+    q5u8 = vld1q_u8(s);  // p1
+    s += p;
+    q6u8 = vld1q_u8(s);  // p0
+    s += p;
+    q7u8 = vld1q_u8(s);  // q0
+    s += p;
+    q8u8 = vld1q_u8(s);  // q1
+    s += p;
+    q9u8 = vld1q_u8(s);  // q2
+    s += p;
+    q10u8 = vld1q_u8(s);  // q3
+
+    loop_filter_neon_16(qblimit, qlimit, qthresh,
+                        q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
+                        &q5u8, &q6u8, &q7u8, &q8u8);
+
+    s -= (p * 5);  // from the q3 row back to the p1 row
+    vst1q_u8(s, q5u8);  // filtered p1
+    s += p;
+    vst1q_u8(s, q6u8);  // filtered p0
+    s += p;
+    vst1q_u8(s, q7u8);  // filtered q0
+    s += p;
+    vst1q_u8(s, q8u8);  // filtered q1
+    return;
+}
diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm b/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm
new file mode 100644
index 0000000000..e45e34cd4c
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm
@@ -0,0 +1,277 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_lpf_horizontal_4_neon|
+    EXPORT  |vpx_lpf_vertical_4_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vpx_lpf_horizontal_4_neon(uint8_t *s,
+;                                int p /* pitch */,
+;                                const uint8_t *blimit,
+;                                const uint8_t *limit,
+;                                const uint8_t *thresh,
+;                                int count)
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vpx_lpf_horizontal_4_neon| PROC
+    push        {lr}                       ; the bl below overwrites lr
+
+    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
+    ldr         r12, [sp, #8]              ; load count (entry sp+4, shifted by the push)
+    ldr         r2, [sp, #4]               ; load thresh (entry sp, shifted by the push)
+    add         r1, r1, r1                 ; double pitch: r2/r3 each step two rows
+
+    cmp         r12, #0
+    beq         end_vpx_lf_h_edge          ; count == 0: nothing to filter
+
+    vld1.8      {d1[]}, [r3]               ; duplicate *limit
+    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
+
+count_lf_h_loop
+    sub         r2, r0, r1, lsl #1         ; r2 = s - 4 * pitch (row p3)
+    add         r3, r2, r1, lsr #1         ; r3 = r2 + pitch = s - 3 * pitch (row p2)
+
+    vld1.u8     {d3}, [r2@64], r1          ; p3
+    vld1.u8     {d4}, [r3@64], r1          ; p2
+    vld1.u8     {d5}, [r2@64], r1          ; p1
+    vld1.u8     {d6}, [r3@64], r1          ; p0
+    vld1.u8     {d7}, [r2@64], r1          ; q0
+    vld1.u8     {d16}, [r3@64], r1         ; q1
+    vld1.u8     {d17}, [r2@64]             ; q2 (no post-increment)
+    vld1.u8     {d18}, [r3@64]             ; q3 (no post-increment)
+
+    sub         r2, r2, r1, lsl #1         ; r2 = s - 2 * pitch (op1 row)
+    sub         r3, r3, r1, lsl #1         ; r3 = s - pitch (op0 row)
+
+    bl          vpx_loop_filter_neon       ; in: d0-d7, d16-d18; out: d4-d7
+
+    vst1.u8     {d4}, [r2@64], r1          ; store op1
+    vst1.u8     {d5}, [r3@64], r1          ; store op0
+    vst1.u8     {d6}, [r2@64], r1          ; store oq0
+    vst1.u8     {d7}, [r3@64], r1          ; store oq1
+
+    add         r0, r0, #8                 ; advance s to the next 8 columns
+    subs        r12, r12, #1               ; one pass done
+    bne         count_lf_h_loop
+
+end_vpx_lf_h_edge
+    pop         {pc}                       ; return through the saved lr
+    ENDP        ; |vpx_lpf_horizontal_4_neon|
+
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vpx_lpf_vertical_4_neon(uint8_t *s,
+;                              int p /* pitch */,
+;                              const uint8_t *blimit,
+;                              const uint8_t *limit,
+;                              const uint8_t *thresh,
+;                              int count)
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vpx_lpf_vertical_4_neon| PROC
+    push        {lr}                      ; the bl below overwrites lr
+
+    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
+    ldr         r12, [sp, #8]             ; load count (entry sp+4, shifted by the push)
+    vld1.8      {d1[]}, [r3]              ; duplicate *limit
+
+    ldr         r3, [sp, #4]              ; load thresh (entry sp, shifted by the push)
+    sub         r2, r0, #4                ; r2 = s - 4: first of the 8 columns read
+    cmp         r12, #0
+    beq         end_vpx_lf_v_edge         ; count == 0: nothing to filter
+
+    vld1.8      {d2[]}, [r3]              ; duplicate *thresh; r3 is free from here on
+
+count_lf_v_loop
+    vld1.u8     {d3}, [r2], r1             ; load 8 rows of 8 pixels each
+    vld1.u8     {d4}, [r2], r1
+    vld1.u8     {d5}, [r2], r1
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d7}, [r2], r1
+    vld1.u8     {d16}, [r2], r1
+    vld1.u8     {d17}, [r2], r1
+    vld1.u8     {d18}, [r2]
+
+    ;transpose the 8x8 block: d3-d7, d16-d18 become columns p3..q3
+    vtrn.32     d3, d7
+    vtrn.32     d4, d16
+    vtrn.32     d5, d17
+    vtrn.32     d6, d18
+
+    vtrn.16     d3, d5
+    vtrn.16     d4, d6
+    vtrn.16     d7, d17
+    vtrn.16     d16, d18
+
+    vtrn.8      d3, d4
+    vtrn.8      d5, d6
+    vtrn.8      d7, d16
+    vtrn.8      d17, d18
+
+    bl          vpx_loop_filter_neon       ; in: d0-d7, d16-d18; out: d4-d7
+
+    sub         r3, r0, #2                 ; r3 = store pointer; r0 stays at s for the loop step
+
+    ;store op1, op0, oq0, oq1 to columns s - 2 .. s + 1, one row per vst4
+    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r3], r1
+    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r3], r1
+    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r3], r1
+    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r3], r1
+    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r3], r1
+    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r3], r1
+    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r3], r1
+    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r3]
+
+    add         r0, r0, r1, lsl #3         ; s += pitch * 8
+    subs        r12, r12, #1               ; one pass done
+    subne       r2, r0, #4                 ; r2 = s - 4 for the next 8 rows
+    bne         count_lf_v_loop
+
+end_vpx_lf_v_edge
+    pop         {pc}                       ; return through the saved lr
+    ENDP        ; |vpx_lpf_vertical_4_neon|
+
+; void vpx_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0    blimit
+; d1    limit
+; d2    thresh
+; d3    p3
+; d4    p2
+; d5    p1
+; d6    p0
+; d7    q0
+; d16   q1
+; d17   q2
+; d18   q3
+;
+; Outputs:
+; d4    op1
+; d5    op0
+; d6    oq0
+; d7    oq1
+|vpx_loop_filter_neon| PROC
+    ; filter_mask
+    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
+    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
+    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
+    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
+    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1)
+    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
+    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
+
+    vabd.u8     d17, d6, d7                 ; abs(p0 - q0)
+
+    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
+
+    vmov.u8     d18, #0x80                  ; sign-bit constant for the u8 <-> s8 flips
+
+    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
+
+    ; hevmask
+    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)
+
+    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
+    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2
+
+    veor        d7, d7, d18                 ; qs0
+
+    vcge.u8     d23, d1, d23                ; (limit >= m1)*-1
+
+    ; filter() function
+    ; convert to signed
+
+    vshr.u8     d28, d28, #1                ; a = a / 2
+    veor        d6, d6, d18                 ; ps0
+
+    veor        d5, d5, d18                 ; ps1
+    vqadd.u8    d17, d17, d28               ; a = b + a
+
+    veor        d16, d16, d18               ; qs1
+
+    vmov.u8     d19, #3
+
+    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)
+
+    vcge.u8     d17, d0, d17                ; (blimit >= a)*-1
+
+    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
+    vorr        d22, d21, d22               ; hevmask
+
+    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0)
+
+    vand        d27, d27, d22               ; filter &= hev
+    vand        d23, d23, d17               ; filter_mask
+
+    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
+
+    vmov.u8     d17, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d27, q12
+
+    vand        d27, d27, d23               ; filter &= mask
+
+    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
+    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
+    vshr.s8     d28, d28, #3                ; filter2 >>= 3
+    vshr.s8     d27, d27, #3                ; filter1 >>= 3
+
+    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
+    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
+
+    ; outer tap adjustments
+    vrshr.s8    d27, d27, #1                ; filter = (filter1 + 1) >> 1
+
+    veor        d6, d26, d18                ; *oq0 = u^0x80
+
+    vbic        d27, d27, d22               ; filter &= ~hev
+
+    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
+    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
+
+    veor        d5, d19, d18                ; *op0 = u^0x80
+    veor        d4, d21, d18                ; *op1 = u^0x80
+    veor        d7, d20, d18                ; *oq1 = u^0x80
+
+    bx          lr                          ; leaf routine: plain return
+    ENDP        ; |vpx_loop_filter_neon|
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.c b/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.c
new file mode 100644
index 0000000000..7ad411aea2
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.c
@@ -0,0 +1,274 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static INLINE void loop_filter_neon(
+        uint8x8_t dblimit,    // blimit
+        uint8x8_t dlimit,     // limit
+        uint8x8_t dthresh,    // thresh
+        uint8x8_t d3u8,       // p3
+        uint8x8_t d4u8,       // p2
+        uint8x8_t d5u8,       // p1
+        uint8x8_t d6u8,       // p0
+        uint8x8_t d7u8,       // q0
+        uint8x8_t d16u8,      // q1
+        uint8x8_t d17u8,      // q2
+        uint8x8_t d18u8,      // q3
+        uint8x8_t *d4ru8,     // out: filtered p1
+        uint8x8_t *d5ru8,     // out: filtered p0
+        uint8x8_t *d6ru8,     // out: filtered q0
+        uint8x8_t *d7ru8) {   // out: filtered q1
+    uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+    int16x8_t q12s16;
+    int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+    // Filters 8 lanes at once; by-value inputs are reused as scratch below.
+    d19u8 = vabd_u8(d3u8, d4u8);      // |p3 - p2|
+    d20u8 = vabd_u8(d4u8, d5u8);      // |p2 - p1|
+    d21u8 = vabd_u8(d5u8, d6u8);      // |p1 - p0|
+    d22u8 = vabd_u8(d16u8, d7u8);     // |q1 - q0|
+    d3u8  = vabd_u8(d17u8, d16u8);    // |q2 - q1|
+    d4u8  = vabd_u8(d18u8, d17u8);    // |q3 - q2|
+
+    d19u8 = vmax_u8(d19u8, d20u8);    // max(|p3 - p2|, |p2 - p1|)
+    d20u8 = vmax_u8(d21u8, d22u8);    // max(|p1 - p0|, |q1 - q0|)
+    d3u8  = vmax_u8(d3u8,  d4u8);     // max(|q2 - q1|, |q3 - q2|)
+    d23u8 = vmax_u8(d19u8, d20u8);
+
+    d17u8 = vabd_u8(d6u8, d7u8);      // |p0 - q0|
+
+    d21u8 = vcgt_u8(d21u8, dthresh);  // |p1 - p0| > thresh
+    d22u8 = vcgt_u8(d22u8, dthresh);  // |q1 - q0| > thresh
+    d23u8 = vmax_u8(d23u8, d3u8);     // largest of the six differences
+
+    d28u8 = vabd_u8(d5u8, d16u8);     // |p1 - q1|
+    d17u8 = vqadd_u8(d17u8, d17u8);   // |p0 - q0| * 2, saturating
+
+    d23u8 = vcge_u8(dlimit, d23u8);   // limit >= largest difference
+    // XOR with 0x80 flips the sign bit so the taps can be treated as s8.
+    d18u8 = vdup_n_u8(0x80);
+    d5u8  = veor_u8(d5u8,  d18u8);    // ps1
+    d6u8  = veor_u8(d6u8,  d18u8);    // ps0
+    d7u8  = veor_u8(d7u8,  d18u8);    // qs0
+    d16u8 = veor_u8(d16u8, d18u8);    // qs1
+
+    d28u8 = vshr_n_u8(d28u8, 1);      // |p1 - q1| / 2
+    d17u8 = vqadd_u8(d17u8, d28u8);   // |p0 - q0| * 2 + |p1 - q1| / 2
+
+    d19u8 = vdup_n_u8(3);
+
+    d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
+                    vreinterpret_s8_u8(d6u8));   // qs0 - ps0
+
+    d17u8 = vcge_u8(dblimit, d17u8);  // blimit >= the sum above
+
+    d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
+                     vreinterpret_s8_u8(d16u8));  // filter = clamp(ps1 - qs1)
+
+    d22u8 = vorr_u8(d21u8, d22u8);    // hev: either threshold test fired
+
+    q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));  // 3 * (qs0 - ps0)
+
+    d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);  // filter &= hev
+    d23u8 = vand_u8(d23u8, d17u8);    // mask: both limit tests passed
+
+    q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));  // + filter
+
+    d17u8 = vdup_n_u8(4);
+
+    d27s8 = vqmovn_s16(q12s16);       // saturate back to 8 bits
+    d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);  // filter &= mask
+    d27s8 = vreinterpret_s8_u8(d27u8);
+
+    d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));  // filter + 3
+    d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));  // filter + 4
+    d28s8 = vshr_n_s8(d28s8, 3);      // filter2 = (filter + 3) >> 3
+    d27s8 = vshr_n_s8(d27s8, 3);      // filter1 = (filter + 4) >> 3
+
+    d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);  // ps0 + filter2
+    d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);  // qs0 - filter1
+
+    d27s8 = vrshr_n_s8(d27s8, 1);     // (filter1 + 1) >> 1
+    d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));  // zero where hev
+
+    d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);   // ps1 + filter
+    d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);  // qs1 - filter
+    // Flip the sign bit back to return unsigned pixels.
+    *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
+    *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
+    *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
+    *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
+    return;
+}
+
+void vpx_lpf_horizontal_4_neon(
+        uint8_t *src,
+        int pitch,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
+        int count) {
+    // Runs count passes of 8 columns each, left to right. A pass reads
+    // rows src - 4 * pitch .. src + 3 * pitch and rewrites rows
+    // src - 2 * pitch .. src + pitch. 8 bytes are read from each of
+    // blimit, limit and thresh.
+
+    uint8x8_t blimit_d, limit_d, thresh_d;
+    uint8x8_t ln0, ln1, ln2, ln3, ln4, ln5, ln6, ln7;
+    uint8x8_t out0, out1, out2, out3;
+    uint8_t *top;  // first row read in the current pass
+    uint8_t *dst;  // first row written in the current pass
+    int blk;
+
+    if (count == 0) {
+        return;
+    }
+
+    // The thresholds do not change between passes.
+    blimit_d = vld1_u8(blimit);
+    limit_d = vld1_u8(limit);
+    thresh_d = vld1_u8(thresh);
+
+    for (blk = 0; blk < count; ++blk) {
+        top = src - (pitch * 4) + blk * 8;
+        dst = top + pitch * 2;
+
+        // Load the eight rows that straddle src, in memory order.
+        ln0 = vld1_u8(top);
+        ln1 = vld1_u8(top + pitch);
+        ln2 = vld1_u8(top + pitch * 2);
+        ln3 = vld1_u8(top + pitch * 3);
+        ln4 = vld1_u8(top + pitch * 4);
+        ln5 = vld1_u8(top + pitch * 5);
+        ln6 = vld1_u8(top + pitch * 6);
+        ln7 = vld1_u8(top + pitch * 7);
+
+        // The four results come back through the output pointers.
+        loop_filter_neon(blimit_d, limit_d, thresh_d,
+                         ln0, ln1, ln2, ln3, ln4, ln5, ln6, ln7,
+                         &out0, &out1, &out2, &out3);
+
+        // Store the four filtered rows.
+        vst1_u8(dst, out0);
+        vst1_u8(dst + pitch, out1);
+        vst1_u8(dst + pitch * 2, out2);
+        vst1_u8(dst + pitch * 3, out3);
+    }
+}
+
+void vpx_lpf_vertical_4_neon(
+        uint8_t *src,
+        int pitch,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
+        int count) {
+    int i, pitch8;
+    uint8_t *s;
+    uint8x8_t dblimit, dlimit, dthresh;
+    uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+    uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+    uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+    uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+    uint8x8x4_t d4Result;
+    // Per pass: read 8 rows of columns src - 4 .. src + 3, rewrite src - 2 .. src + 1.
+    if (count == 0)  // end_vpx_lf_v_edge
+        return;
+
+    dblimit = vld1_u8(blimit);
+    dlimit = vld1_u8(limit);
+    dthresh = vld1_u8(thresh);
+    // src only moves in the for-header, so every pass starts 8 rows below the last.
+    pitch8 = pitch * 8;
+    for (i = 0; i < count; i++, src += pitch8) {
+        s = src - 4;  // first of the 8 columns read on each row
+
+        d3u8 = vld1_u8(s);
+        s += pitch;
+        d4u8 = vld1_u8(s);
+        s += pitch;
+        d5u8 = vld1_u8(s);
+        s += pitch;
+        d6u8 = vld1_u8(s);
+        s += pitch;
+        d7u8 = vld1_u8(s);
+        s += pitch;
+        d16u8 = vld1_u8(s);
+        s += pitch;
+        d17u8 = vld1_u8(s);
+        s += pitch;
+        d18u8 = vld1_u8(s);
+        // Transpose the 8x8 block so that each vector holds one column.
+        d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+                      vreinterpret_u32_u8(d7u8));
+        d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+                      vreinterpret_u32_u8(d16u8));
+        d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+                      vreinterpret_u32_u8(d17u8));
+        d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+                      vreinterpret_u32_u8(d18u8));
+
+        d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+                          vreinterpret_u16_u32(d2tmp2.val[0]));
+        d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                          vreinterpret_u16_u32(d2tmp3.val[0]));
+        d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                          vreinterpret_u16_u32(d2tmp2.val[1]));
+        d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                          vreinterpret_u16_u32(d2tmp3.val[1]));
+
+        d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+                         vreinterpret_u8_u16(d2tmp5.val[0]));
+        d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                         vreinterpret_u8_u16(d2tmp5.val[1]));
+        d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                          vreinterpret_u8_u16(d2tmp7.val[0]));
+        d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                          vreinterpret_u8_u16(d2tmp7.val[1]));
+
+        d3u8 = d2tmp8.val[0];    // column src - 4 (p3)
+        d4u8 = d2tmp8.val[1];    // column src - 3 (p2)
+        d5u8 = d2tmp9.val[0];    // column src - 2 (p1)
+        d6u8 = d2tmp9.val[1];    // column src - 1 (p0)
+        d7u8 = d2tmp10.val[0];   // column src     (q0)
+        d16u8 = d2tmp10.val[1];  // column src + 1 (q1)
+        d17u8 = d2tmp11.val[0];  // column src + 2 (q2)
+        d18u8 = d2tmp11.val[1];  // column src + 3 (q3)
+
+        loop_filter_neon(dblimit, dlimit, dthresh,
+                         d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                         &d4u8, &d5u8, &d6u8, &d7u8);
+
+        d4Result.val[0] = d4u8;
+        d4Result.val[1] = d5u8;
+        d4Result.val[2] = d6u8;
+        d4Result.val[3] = d7u8;
+        // Lane k of the four outputs becomes 4 adjacent bytes on row k.
+        s = src - 2;
+        vst4_lane_u8(s, d4Result, 0);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 1);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 2);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 3);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 4);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 5);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 6);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 7);
+    }
+    return;
+}
diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm b/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm
new file mode 100644
index 0000000000..e81734c046
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm
@@ -0,0 +1,451 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_lpf_horizontal_8_neon|
+    EXPORT  |vpx_lpf_vertical_8_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,
+;                                const uint8_t *blimit,
+;                                const uint8_t *limit,
+;                                const uint8_t *thresh,
+;                                int count)
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vpx_lpf_horizontal_8_neon| PROC
+    push        {r4-r5, lr}                ; helper uses r4/r5; bl overwrites lr
+
+    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
+    ldr         r12, [sp, #16]             ; load count (entry sp+4, shifted by the push)
+    ldr         r2, [sp, #12]              ; load thresh (entry sp, shifted by the push)
+    add         r1, r1, r1                 ; double pitch: r2/r3 each step two rows
+
+    cmp         r12, #0
+    beq         end_vpx_mblf_h_edge        ; count == 0: nothing to filter
+
+    vld1.8      {d1[]}, [r3]               ; duplicate *limit
+    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
+
+count_mblf_h_loop
+    sub         r3, r0, r1, lsl #1         ; r3 = s - 4 * pitch (row p3)
+    add         r2, r3, r1, lsr #1         ; r2 = r3 + pitch = s - 3 * pitch (row p2)
+
+    vld1.u8     {d3}, [r3@64], r1          ; p3
+    vld1.u8     {d4}, [r2@64], r1          ; p2
+    vld1.u8     {d5}, [r3@64], r1          ; p1
+    vld1.u8     {d6}, [r2@64], r1          ; p0
+    vld1.u8     {d7}, [r3@64], r1          ; q0
+    vld1.u8     {d16}, [r2@64], r1         ; q1
+    vld1.u8     {d17}, [r3@64]             ; q2 (no post-increment)
+    vld1.u8     {d18}, [r2@64], r1         ; q3 (post-increments: r2 = s + 5 * pitch)
+
+    sub         r3, r3, r1, lsl #1         ; r3 = s - 2 * pitch (op1 row)
+    sub         r2, r2, r1, lsl #2         ; r2 = s - 3 * pitch (op2 row)
+
+    bl          vpx_mbloop_filter_neon     ; in: d0-d7, d16-d18; out: d0-d5
+
+    vst1.u8     {d0}, [r2@64], r1          ; store op2
+    vst1.u8     {d1}, [r3@64], r1          ; store op1
+    vst1.u8     {d2}, [r2@64], r1          ; store op0
+    vst1.u8     {d3}, [r3@64], r1          ; store oq0
+    vst1.u8     {d4}, [r2@64], r1          ; store oq1
+    vst1.u8     {d5}, [r3@64], r1          ; store oq2
+
+    add         r0, r0, #8                 ; advance s to the next 8 columns
+    subs        r12, r12, #1               ; one pass done
+    bne         count_mblf_h_loop          ; NOTE(review): d0-d2 now hold op2/op1/op0, not blimit/limit/thresh
+
+end_vpx_mblf_h_edge
+    pop         {r4-r5, pc}                ; restore r4/r5 and return
+
+    ENDP        ; |vpx_lpf_horizontal_8_neon|
+
+; void vpx_lpf_vertical_8_neon(uint8_t *s,
+;                              int pitch,
+;                              const uint8_t *blimit,
+;                              const uint8_t *limit,
+;                              const uint8_t *thresh,
+;                              int count)
+;
+; r0    uint8_t *s,
+; r1    int pitch,
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+; sp+4  int count
+|vpx_lpf_vertical_8_neon| PROC
+    push        {r4-r5, lr}               ; helper uses r4/r5; bl overwrites lr
+
+    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
+    ldr         r12, [sp, #16]            ; load count (entry sp+4, shifted by the push)
+    vld1.8      {d1[]}, [r3]              ; duplicate *limit
+
+    ldr         r3, [sp, #12]             ; load thresh (entry sp, shifted by the push)
+    sub         r2, r0, #4                ; r2 = s - 4: first of the 8 columns read
+    cmp         r12, #0
+    beq         end_vpx_mblf_v_edge       ; count == 0: nothing to filter
+
+    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
+
+count_mblf_v_loop
+    vld1.u8     {d3}, [r2], r1             ; load 8 rows of 8 pixels each
+    vld1.u8     {d4}, [r2], r1
+    vld1.u8     {d5}, [r2], r1
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d7}, [r2], r1
+    vld1.u8     {d16}, [r2], r1
+    vld1.u8     {d17}, [r2], r1
+    vld1.u8     {d18}, [r2]
+
+    ;transpose the 8x8 block: d3-d7, d16-d18 become columns p3..q3
+    vtrn.32     d3, d7
+    vtrn.32     d4, d16
+    vtrn.32     d5, d17
+    vtrn.32     d6, d18
+
+    vtrn.16     d3, d5
+    vtrn.16     d4, d6
+    vtrn.16     d7, d17
+    vtrn.16     d16, d18
+
+    vtrn.8      d3, d4
+    vtrn.8      d5, d6
+    vtrn.8      d7, d16
+    vtrn.8      d17, d18
+
+    sub         r2, r0, #3                 ; r2 = s - 3: destination of op2..oq0
+    add         r3, r0, #1                 ; r3 = s + 1: destination of oq1, oq2
+
+    bl          vpx_mbloop_filter_neon     ; in: d0-d7, d16-d18; out: d0-d5
+
+    ;store op2, op1, op0, oq0
+    vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
+    vst4.8      {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
+    vst4.8      {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
+    vst4.8      {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
+    vst4.8      {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
+    vst4.8      {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
+    vst4.8      {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
+    vst4.8      {d0[7], d1[7], d2[7], d3[7]}, [r2]
+
+    ;store oq1, oq2
+    vst2.8      {d4[0], d5[0]}, [r3], r1
+    vst2.8      {d4[1], d5[1]}, [r3], r1
+    vst2.8      {d4[2], d5[2]}, [r3], r1
+    vst2.8      {d4[3], d5[3]}, [r3], r1
+    vst2.8      {d4[4], d5[4]}, [r3], r1
+    vst2.8      {d4[5], d5[5]}, [r3], r1
+    vst2.8      {d4[6], d5[6]}, [r3], r1
+    vst2.8      {d4[7], d5[7]}, [r3]
+
+    add         r0, r0, r1, lsl #3         ; s += pitch * 8
+    subs        r12, r12, #1               ; one pass done
+    subne       r2, r0, #4                 ; r2 = s - 4 for the next 8 rows
+    bne         count_mblf_v_loop          ; NOTE(review): d0-d2 now hold op2/op1/op0, not blimit/limit/thresh
+
+end_vpx_mblf_v_edge
+    pop         {r4-r5, pc}                ; restore r4/r5 and return
+    ENDP        ; |vpx_lpf_vertical_8_neon|
+
+; void vpx_mbloop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0    blimit
+; d1    limit
+; d2    thresh
+; d3    p3
+; d4    p2
+; d5    p1
+; d6    p0
+; d7    q0
+; d16   q1
+; d17   q2
+; d18   q3
+;
+; Outputs:
+; d0    op2
+; d1    op1
+; d2    op0
+; d3    oq0
+; d4    oq1
+; d5    oq2
+|vpx_mbloop_filter_neon| PROC
+    ; filter_mask
+    vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
+    vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
+    vabd.u8     d21, d5, d6                ; m3 = abs(p1 - p0)
+    vabd.u8     d22, d16, d7               ; m4 = abs(q1 - q0)
+    vabd.u8     d23, d17, d16              ; m5 = abs(q2 - q1)
+    vabd.u8     d24, d18, d17              ; m6 = abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20              ; m1 = max(m1, m2)
+    vmax.u8     d20, d21, d22              ; m2 = max(m3, m4)
+
+    vabd.u8     d25, d6, d4                ; m7 = abs(p0 - p2)
+
+    vmax.u8     d23, d23, d24              ; m3 = max(m5, m6)
+
+    vabd.u8     d26, d7, d17               ; m8 = abs(q0 - q2)
+
+    vmax.u8     d19, d19, d20              ; m1 = max(m1, m2)
+
+    vabd.u8     d24, d6, d7                ; m9 = abs(p0 - q0)
+    vabd.u8     d27, d3, d6                ; m10 = abs(p3 - p0)
+    vabd.u8     d28, d18, d7               ; m11 = abs(q3 - q0)
+
+    vmax.u8     d19, d19, d23              ; m1 = max(m1, m3)
+
+    vabd.u8     d23, d5, d16               ; a = abs(p1 - q1)
+    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
+
+    ; (limit >= m1)*-1
+    vcge.u8     d19, d1, d19
+
+    ; only compare the largest value to 1 (flat test)
+    vmax.u8     d25, d25, d26              ; m4 = max(m7, m8)
+    vmax.u8     d26, d27, d28              ; m5 = max(m10, m11)
+
+    vshr.u8     d23, d23, #1               ; a = a / 2
+
+    vmax.u8     d25, d25, d26              ; m4 = max(m4, m5)
+
+    vqadd.u8    d24, d24, d23              ; a = b + a
+
+    vmax.u8     d20, d20, d25              ; m2 = max(m2, m4)
+
+    vmov.u8     d23, #1
+    vcge.u8     d24, d0, d24               ; (blimit >= a)*-1
+
+    vcgt.u8     d21, d21, d2               ; (abs(p1 - p0) > thresh)*-1
+
+    vcge.u8     d20, d23, d20              ; flat = (1 >= m2)*-1
+
+    vand        d19, d19, d24              ; mask
+
+    vcgt.u8     d23, d22, d2               ; (abs(q1 - q0) > thresh)*-1
+
+    vand        d20, d20, d19              ; flat & mask
+
+    vmov.u8     d22, #0x80
+
+    vorr        d23, d21, d23              ; hev
+
+    ; This instruction will truncate the "flat & mask" masks down to 4 bits
+    ; each to fit into one 32 bit arm register. The masks are read from
+    ; q10.64[0] (d20); the packed result is d30.32[0].
+    vshrn.u16   d30, q10, #4
+    vmov.u32    r4, d30[0]                 ; flat & mask 4bits
+
+    adds        r5, r4, #1                 ; Check for all 1's
+
+    ; If mask and flat are 1's for all vectors, then we only need to execute
+    ; the power branch for all vectors.
+    beq         power_branch_only
+
+    cmp         r4, #0                     ; Check for 0, set flag for the beq filter_branch_only below
+
+    ; mbfilter() function
+    ; filter() function
+    ; convert to signed
+    veor        d21, d7, d22               ; qs0
+    veor        d24, d6, d22               ; ps0
+    veor        d25, d5, d22               ; ps1
+    veor        d26, d16, d22              ; qs1
+
+    vmov.u8     d27, #3
+
+    vsub.s8     d28, d21, d24              ; ( qs0 - ps0)
+
+    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
+
+    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
+
+    vand        d29, d29, d23              ; filter &= hev
+
+    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
+
+    vmov.u8     d29, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d28, q15
+
+    vand        d28, d28, d19              ; filter &= mask
+
+    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
+    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
+    vshr.s8     d30, d30, #3               ; filter2 >>= 3
+    vshr.s8     d29, d29, #3               ; filter1 >>= 3
+
+    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
+    vqsub.s8    d21, d21, d29              ; oq0 = clamp(qs0 - filter1)
+
+    ; outer tap adjustments: (filter1 + 1) >> 1
+    vrshr.s8    d29, d29, #1
+    vbic        d29, d29, d23              ; filter &= ~hev
+
+    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
+    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
+
+    ; If mask and flat are 0's for all vectors, then we only need to execute
+    ; the filter branch for all vectors.
+    beq         filter_branch_only
+
+    ; If mask and flat are mixed then we must perform both branches and
+    ; combine the data.
+    veor        d24, d24, d22              ; *f_op0 = u^0x80
+    veor        d21, d21, d22              ; *f_oq0 = u^0x80
+    veor        d25, d25, d22              ; *f_op1 = u^0x80
+    veor        d26, d26, d22              ; *f_oq1 = u^0x80
+
+    ; At this point we have already executed the filter branch. The filter
+    ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
+    ; branch and combine the data.
+    vmov.u8     d23, #2
+    vaddl.u8    q14, d6, d7                ; r_op2 = p0 + q0
+    vmlal.u8    q14, d3, d27               ; r_op2 += p3 * 3
+    vmlal.u8    q14, d4, d23               ; r_op2 += p2 * 2
+
+    vbif        d0, d4, d20                ; op2 |= p2 & ~(flat & mask)
+
+    vaddw.u8    q14, d5                    ; r_op2 += p1
+
+    vbif        d1, d25, d20               ; op1 |= f_op1 & ~(flat & mask)
+
+    vqrshrn.u16 d30, q14, #3               ; r_op2
+
+    vsubw.u8    q14, d3                    ; r_op1 = r_op2 - p3
+    vsubw.u8    q14, d4                    ; r_op1 -= p2
+    vaddw.u8    q14, d5                    ; r_op1 += p1
+    vaddw.u8    q14, d16                   ; r_op1 += q1
+
+    vbif        d2, d24, d20               ; op0 |= f_op0 & ~(flat & mask)
+
+    vqrshrn.u16 d31, q14, #3               ; r_op1
+
+    vsubw.u8    q14, d3                    ; r_op0 = r_op1 - p3
+    vsubw.u8    q14, d5                    ; r_op0 -= p1
+    vaddw.u8    q14, d6                    ; r_op0 += p0
+    vaddw.u8    q14, d17                   ; r_op0 += q2
+
+    vbit        d0, d30, d20               ; op2 |= r_op2 & (flat & mask)
+
+    vqrshrn.u16 d23, q14, #3               ; r_op0
+
+    vsubw.u8    q14, d3                    ; r_oq0 = r_op0 - p3
+    vsubw.u8    q14, d6                    ; r_oq0 -= p0
+    vaddw.u8    q14, d7                    ; r_oq0 += q0
+
+    vbit        d1, d31, d20               ; op1 |= r_op1 & (flat & mask)
+
+    vaddw.u8    q14, d18                   ; r_oq0 += q3
+
+    vbit        d2, d23, d20               ; op0 |= r_op0 & (flat & mask)
+
+    vqrshrn.u16 d22, q14, #3               ; r_oq0
+
+    vsubw.u8    q14, d4                    ; r_oq1 = r_oq0 - p2
+    vsubw.u8    q14, d7                    ; r_oq1 -= q0
+    vaddw.u8    q14, d16                   ; r_oq1 += q1
+
+    vbif        d3, d21, d20               ; oq0 |= f_oq0 & ~(flat & mask)
+
+    vaddw.u8    q14, d18                   ; r_oq1 += q3
+
+    vbif        d4, d26, d20               ; oq1 |= f_oq1 & ~(flat & mask)
+
+    vqrshrn.u16 d6, q14, #3                ; r_oq1
+
+    vsubw.u8    q14, d5                    ; r_oq2 = r_oq1 - p1
+    vsubw.u8    q14, d16                   ; r_oq2 -= q1
+    vaddw.u8    q14, d17                   ; r_oq2 += q2
+    vaddw.u8    q14, d18                   ; r_oq2 += q3
+
+    vbif        d5, d17, d20               ; oq2 |= q2 & ~(flat & mask)
+
+    vqrshrn.u16 d7, q14, #3                ; r_oq2
+
+    vbit        d3, d22, d20               ; oq0 |= r_oq0 & (flat & mask)
+    vbit        d4, d6, d20                ; oq1 |= r_oq1 & (flat & mask)
+    vbit        d5, d7, d20                ; oq2 |= r_oq2 & (flat & mask)
+
+    bx          lr
+
+power_branch_only
+    vmov.u8     d27, #3
+    vmov.u8     d21, #2
+    vaddl.u8    q14, d6, d7                ; op2 = p0 + q0
+    vmlal.u8    q14, d3, d27               ; op2 += p3 * 3
+    vmlal.u8    q14, d4, d21               ; op2 += p2 * 2
+    vaddw.u8    q14, d5                    ; op2 += p1
+    vqrshrn.u16 d0, q14, #3                ; op2
+
+    vsubw.u8    q14, d3                    ; op1 = op2 - p3
+    vsubw.u8    q14, d4                    ; op1 -= p2
+    vaddw.u8    q14, d5                    ; op1 += p1
+    vaddw.u8    q14, d16                   ; op1 += q1
+    vqrshrn.u16 d1, q14, #3                ; op1
+
+    vsubw.u8    q14, d3                    ; op0 = op1 - p3
+    vsubw.u8    q14, d5                    ; op0 -= p1
+    vaddw.u8    q14, d6                    ; op0 += p0
+    vaddw.u8    q14, d17                   ; op0 += q2
+    vqrshrn.u16 d2, q14, #3                ; op0
+
+    vsubw.u8    q14, d3                    ; oq0 = op0 - p3
+    vsubw.u8    q14, d6                    ; oq0 -= p0
+    vaddw.u8    q14, d7                    ; oq0 += q0
+    vaddw.u8    q14, d18                   ; oq0 += q3
+    vqrshrn.u16 d3, q14, #3                ; oq0
+
+    vsubw.u8    q14, d4                    ; oq1 = oq0 - p2
+    vsubw.u8    q14, d7                    ; oq1 -= q0
+    vaddw.u8    q14, d16                   ; oq1 += q1
+    vaddw.u8    q14, d18                   ; oq1 += q3
+    vqrshrn.u16 d4, q14, #3                ; oq1
+
+    vsubw.u8    q14, d5                    ; oq2 = oq1 - p1
+    vsubw.u8    q14, d16                   ; oq2 -= q1
+    vaddw.u8    q14, d17                   ; oq2 += q2
+    vaddw.u8    q14, d18                   ; oq2 += q3
+    vqrshrn.u16 d5, q14, #3                ; oq2
+
+    bx          lr
+
+filter_branch_only
+    ; TODO(fgalligan): See if we can rearrange registers so we do not need to
+    ; do the 2 vswp.
+    vswp        d0, d4                      ; op2 = p2, unchanged by this branch
+    vswp        d5, d17                     ; oq2 = q2, unchanged by this branch
+    veor        d2, d24, d22                ; *op0 = u^0x80
+    veor        d3, d21, d22                ; *oq0 = u^0x80
+    veor        d1, d25, d22                ; *op1 = u^0x80
+    veor        d4, d26, d22                ; *oq1 = u^0x80
+
+    bx          lr
+
+    ENDP        ; |vpx_mbloop_filter_neon|
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.c b/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.c
new file mode 100644
index 0000000000..a887e2ee54
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.c
@@ -0,0 +1,453 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static INLINE void mbloop_filter_neon(
+        uint8x8_t dblimit,   // blimit: bound for the combined p0/q0, p1/q1 test
+        uint8x8_t dlimit,    // limit: bound for neighbouring-pixel differences
+        uint8x8_t dthresh,   // thresh: high-edge-variance (hev) threshold
+        uint8x8_t d3u8,      // p3
+        uint8x8_t d4u8,      // p2
+        uint8x8_t d5u8,      // p1
+        uint8x8_t d6u8,      // p0
+        uint8x8_t d7u8,      // q0
+        uint8x8_t d16u8,     // q1
+        uint8x8_t d17u8,     // q2
+        uint8x8_t d18u8,     // q3
+        uint8x8_t *d0ru8,    // out: op2
+        uint8x8_t *d1ru8,    // out: op1
+        uint8x8_t *d2ru8,    // out: op0
+        uint8x8_t *d3ru8,    // out: oq0
+        uint8x8_t *d4ru8,    // out: oq1
+        uint8x8_t *d5ru8) {  // out: oq2
+    uint32_t flat;  // (flat & mask) of all 8 lanes, packed to 4 bits per lane
+    uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+    uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+    int16x8_t q15s16;
+    uint16x8_t q10u16, q14u16;
+    int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+    // Eight lanes are filtered at once; lane k uses byte k of every input.
+    d19u8 = vabd_u8(d3u8, d4u8);    // abs(p3 - p2)
+    d20u8 = vabd_u8(d4u8, d5u8);    // abs(p2 - p1)
+    d21u8 = vabd_u8(d5u8, d6u8);    // abs(p1 - p0)
+    d22u8 = vabd_u8(d16u8, d7u8);   // abs(q1 - q0)
+    d23u8 = vabd_u8(d17u8, d16u8);  // abs(q2 - q1)
+    d24u8 = vabd_u8(d18u8, d17u8);  // abs(q3 - q2)
+
+    d19u8 = vmax_u8(d19u8, d20u8);  // max(abs(p3 - p2), abs(p2 - p1))
+    d20u8 = vmax_u8(d21u8, d22u8);  // max(abs(p1 - p0), abs(q1 - q0))
+
+    d25u8 = vabd_u8(d6u8, d4u8);    // abs(p0 - p2)
+
+    d23u8 = vmax_u8(d23u8, d24u8);  // max(abs(q2 - q1), abs(q3 - q2))
+
+    d26u8 = vabd_u8(d7u8, d17u8);   // abs(q0 - q2)
+
+    d19u8 = vmax_u8(d19u8, d20u8);
+
+    d24u8 = vabd_u8(d6u8, d7u8);    // abs(p0 - q0)
+    d27u8 = vabd_u8(d3u8, d6u8);    // abs(p3 - p0)
+    d28u8 = vabd_u8(d18u8, d7u8);   // abs(q3 - q0)
+
+    d19u8 = vmax_u8(d19u8, d23u8);  // largest of the six neighbour differences
+
+    d23u8 = vabd_u8(d5u8, d16u8);   // a = abs(p1 - q1)
+    d24u8 = vqadd_u8(d24u8, d24u8);  // b = abs(p0 - q0) * 2 (saturating)
+
+
+    d19u8 = vcge_u8(dlimit, d19u8);  // limit >= largest neighbour difference
+
+
+    d25u8 = vmax_u8(d25u8, d26u8);  // max(abs(p0 - p2), abs(q0 - q2))
+    d26u8 = vmax_u8(d27u8, d28u8);  // max(abs(p3 - p0), abs(q3 - q0))
+
+    d23u8 = vshr_n_u8(d23u8, 1);    // a = a / 2
+
+    d25u8 = vmax_u8(d25u8, d26u8);
+
+    d24u8 = vqadd_u8(d24u8, d23u8);  // b + a (saturating)
+
+    d20u8 = vmax_u8(d20u8, d25u8);  // largest difference used by the flat test
+
+    d23u8 = vdup_n_u8(1);
+    d24u8 = vcge_u8(dblimit, d24u8);  // blimit >= b + a
+
+    d21u8 = vcgt_u8(d21u8, dthresh);  // abs(p1 - p0) > thresh
+
+    d20u8 = vcge_u8(d23u8, d20u8);  // flat: every flat-test difference <= 1
+
+    d19u8 = vand_u8(d19u8, d24u8);  // mask
+
+    d23u8 = vcgt_u8(d22u8, dthresh);  // abs(q1 - q0) > thresh
+
+    d20u8 = vand_u8(d20u8, d19u8);  // flat & mask
+
+    d22u8 = vdup_n_u8(0x80);  // sign-bit toggle: unsigned <-> signed pixels
+
+    d23u8 = vorr_u8(d21u8, d23u8);  // hev
+    // Narrow (flat & mask) to 4 bits per lane; the d21u8 half is never read.
+    q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
+                          vreinterpret_u16_u8(d21u8));
+
+    d30u8 = vshrn_n_u16(q10u16, 4);
+    flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+
+    if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
+        d27u8 = vdup_n_u8(3);
+        d21u8 = vdup_n_u8(2);
+        q14u16 = vaddl_u8(d6u8, d7u8);            // op2 = p0 + q0
+        q14u16 = vmlal_u8(q14u16, d3u8, d27u8);   // op2 += p3 * 3
+        q14u16 = vmlal_u8(q14u16, d4u8, d21u8);   // op2 += p2 * 2
+        q14u16 = vaddw_u8(q14u16, d5u8);          // op2 += p1
+        *d0ru8 = vqrshrn_n_u16(q14u16, 3);        // op2, rounded >> 3
+
+        q14u16 = vsubw_u8(q14u16, d3u8);          // op1 = op2 - p3
+        q14u16 = vsubw_u8(q14u16, d4u8);          // op1 -= p2
+        q14u16 = vaddw_u8(q14u16, d5u8);          // op1 += p1
+        q14u16 = vaddw_u8(q14u16, d16u8);         // op1 += q1
+        *d1ru8 = vqrshrn_n_u16(q14u16, 3);        // op1
+
+        q14u16 = vsubw_u8(q14u16, d3u8);          // op0 = op1 - p3
+        q14u16 = vsubw_u8(q14u16, d5u8);          // op0 -= p1
+        q14u16 = vaddw_u8(q14u16, d6u8);          // op0 += p0
+        q14u16 = vaddw_u8(q14u16, d17u8);         // op0 += q2
+        *d2ru8 = vqrshrn_n_u16(q14u16, 3);        // op0
+
+        q14u16 = vsubw_u8(q14u16, d3u8);          // oq0 = op0 - p3
+        q14u16 = vsubw_u8(q14u16, d6u8);          // oq0 -= p0
+        q14u16 = vaddw_u8(q14u16, d7u8);          // oq0 += q0
+        q14u16 = vaddw_u8(q14u16, d18u8);         // oq0 += q3
+        *d3ru8 = vqrshrn_n_u16(q14u16, 3);        // oq0
+
+        q14u16 = vsubw_u8(q14u16, d4u8);          // oq1 = oq0 - p2
+        q14u16 = vsubw_u8(q14u16, d7u8);          // oq1 -= q0
+        q14u16 = vaddw_u8(q14u16, d16u8);         // oq1 += q1
+        q14u16 = vaddw_u8(q14u16, d18u8);         // oq1 += q3
+        *d4ru8 = vqrshrn_n_u16(q14u16, 3);        // oq1
+
+        q14u16 = vsubw_u8(q14u16, d5u8);          // oq2 = oq1 - p1
+        q14u16 = vsubw_u8(q14u16, d16u8);         // oq2 -= q1
+        q14u16 = vaddw_u8(q14u16, d17u8);         // oq2 += q2
+        q14u16 = vaddw_u8(q14u16, d18u8);         // oq2 += q3
+        *d5ru8 = vqrshrn_n_u16(q14u16, 3);        // oq2
+    } else {
+        d21u8 = veor_u8(d7u8,  d22u8);  // qs0
+        d24u8 = veor_u8(d6u8,  d22u8);  // ps0
+        d25u8 = veor_u8(d5u8,  d22u8);  // ps1
+        d26u8 = veor_u8(d16u8, d22u8);  // qs1
+
+        d27u8 = vdup_n_u8(3);
+
+        d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
+        d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
+        // d28s8 = qs0 - ps0, d29s8 = filter = clamp(ps1 - qs1)
+        q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));  // 3 * (qs0 - ps0)
+
+        d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));  // filter &= hev
+
+        q15s16 = vaddw_s8(q15s16, d29s8);  // filter + 3 * (qs0 - ps0)
+
+        d29u8 = vdup_n_u8(4);
+
+        d28s8 = vqmovn_s16(q15s16);  // clamp back to 8 bits
+
+        d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));  // filter &= mask
+
+        d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));  // clamp(filter + 3)
+        d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));  // clamp(filter + 4)
+        d30s8 = vshr_n_s8(d30s8, 3);  // filter2
+        d29s8 = vshr_n_s8(d29s8, 3);  // filter1
+
+        d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);  // ps0 + filter2
+        d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);  // qs0 - filter1
+
+        d29s8 = vrshr_n_s8(d29s8, 1);  // outer taps: rounded filter1 >> 1
+        d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));  // &= ~hev
+
+        d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);  // ps1 + filter
+        d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);  // qs1 - filter
+
+        if (flat == 0) {  // filter_branch_only
+            *d0ru8 = d4u8;  // p2 is passed through unchanged
+            *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+            *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+            *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+            *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+            *d5ru8 = d17u8;  // q2 is passed through unchanged
+            return;
+        }
+        // Mixed lanes: compute both filters and pick per lane with d20u8.
+        d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);  // filter oq0
+        d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);  // filter op0
+        d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);  // filter op1
+        d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);  // filter oq1
+
+        d23u8 = vdup_n_u8(2);
+        q14u16 = vaddl_u8(d6u8, d7u8);            // op2 = p0 + q0
+        q14u16 = vmlal_u8(q14u16, d3u8, d27u8);   // op2 += p3 * 3
+        q14u16 = vmlal_u8(q14u16, d4u8, d23u8);   // op2 += p2 * 2
+        // The flat lanes chosen below are placeholders, replaced at the stores.
+        d0u8 = vbsl_u8(d20u8, dblimit, d4u8);     // non-flat lanes: p2
+
+        q14u16 = vaddw_u8(q14u16, d5u8);          // op2 += p1
+
+        d1u8 = vbsl_u8(d20u8, dlimit, d25u8);     // non-flat lanes: filter op1
+
+        d30u8 = vqrshrn_n_u16(q14u16, 3);         // flat op2
+
+        q14u16 = vsubw_u8(q14u16, d3u8);          // op1 = op2 - p3
+        q14u16 = vsubw_u8(q14u16, d4u8);          // op1 -= p2
+        q14u16 = vaddw_u8(q14u16, d5u8);          // op1 += p1
+        q14u16 = vaddw_u8(q14u16, d16u8);         // op1 += q1
+
+        d2u8 = vbsl_u8(d20u8, dthresh, d24u8);    // non-flat lanes: filter op0
+
+        d31u8 = vqrshrn_n_u16(q14u16, 3);         // flat op1
+
+        q14u16 = vsubw_u8(q14u16, d3u8);          // op0 = op1 - p3
+        q14u16 = vsubw_u8(q14u16, d5u8);          // op0 -= p1
+        q14u16 = vaddw_u8(q14u16, d6u8);          // op0 += p0
+        q14u16 = vaddw_u8(q14u16, d17u8);         // op0 += q2
+
+        *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);     // op2
+
+        d23u8 = vqrshrn_n_u16(q14u16, 3);         // flat op0
+
+        q14u16 = vsubw_u8(q14u16, d3u8);          // oq0 = op0 - p3
+        q14u16 = vsubw_u8(q14u16, d6u8);          // oq0 -= p0
+        q14u16 = vaddw_u8(q14u16, d7u8);          // oq0 += q0
+
+        *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);     // op1
+
+        q14u16 = vaddw_u8(q14u16, d18u8);         // oq0 += q3
+
+        *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);     // op0
+
+        d22u8 = vqrshrn_n_u16(q14u16, 3);         // flat oq0 (0x80 no longer needed)
+
+        q14u16 = vsubw_u8(q14u16, d4u8);          // oq1 = oq0 - p2
+        q14u16 = vsubw_u8(q14u16, d7u8);          // oq1 -= q0
+        q14u16 = vaddw_u8(q14u16, d16u8);         // oq1 += q1
+
+        d3u8 = vbsl_u8(d20u8, d3u8, d21u8);       // non-flat lanes: filter oq0
+
+        q14u16 = vaddw_u8(q14u16, d18u8);         // oq1 += q3
+
+        d4u8 = vbsl_u8(d20u8, d4u8, d26u8);       // non-flat lanes: filter oq1
+
+        d6u8 = vqrshrn_n_u16(q14u16, 3);          // flat oq1 (p0 no longer needed)
+
+        q14u16 = vsubw_u8(q14u16, d5u8);          // oq2 = oq1 - p1
+        q14u16 = vsubw_u8(q14u16, d16u8);         // oq2 -= q1
+        q14u16 = vaddw_u8(q14u16, d17u8);         // oq2 += q2
+        q14u16 = vaddw_u8(q14u16, d18u8);         // oq2 += q3
+
+        d5u8 = vbsl_u8(d20u8, d5u8, d17u8);       // non-flat lanes: q2
+
+        d7u8 = vqrshrn_n_u16(q14u16, 3);          // flat oq2 (q0 no longer needed)
+
+        *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);     // oq0
+        *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);      // oq1
+        *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);      // oq2
+    }
+    return;
+}
+
+void vpx_lpf_horizontal_8_neon(
+        uint8_t *src,           // first q0 pixel; p3..p0 are the 4 rows above
+        int pitch,              // byte distance between consecutive rows
+        const uint8_t *blimit,  // 8 bytes are read through each of the
+        const uint8_t *limit,   // three threshold pointers
+        const uint8_t *thresh,
+        int count) {            // number of adjacent 8-pixel-wide groups
+    int i;
+    uint8_t *s, *psrc;
+    uint8x8_t dblimit, dlimit, dthresh;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    uint8x8_t d16u8, d17u8, d18u8;
+
+    if (count == 0)  // end_vpx_mblf_h_edge
+        return;
+
+    dblimit = vld1_u8(blimit);
+    dlimit = vld1_u8(limit);
+    dthresh = vld1_u8(thresh);
+
+    psrc = src - (pitch << 2);  // row of p3
+    for (i = 0; i < count; i++) {
+        s = psrc + i * 8;  // each pass handles the next 8 columns
+        // Load the 8 rows p3..q3, one 8-byte row per register.
+        d3u8  = vld1_u8(s);   // p3
+        s += pitch;
+        d4u8  = vld1_u8(s);   // p2
+        s += pitch;
+        d5u8  = vld1_u8(s);   // p1
+        s += pitch;
+        d6u8  = vld1_u8(s);   // p0
+        s += pitch;
+        d7u8  = vld1_u8(s);   // q0
+        s += pitch;
+        d16u8 = vld1_u8(s);   // q1
+        s += pitch;
+        d17u8 = vld1_u8(s);   // q2
+        s += pitch;
+        d18u8 = vld1_u8(s);   // q3
+
+        mbloop_filter_neon(dblimit, dlimit, dthresh,
+                           d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                           &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+        // s is on the q3 row; step back 6 rows to p2 and store op2..oq2.
+        s -= (pitch * 6);
+        vst1_u8(s, d0u8);     // op2
+        s += pitch;
+        vst1_u8(s, d1u8);     // op1
+        s += pitch;
+        vst1_u8(s, d2u8);     // op0
+        s += pitch;
+        vst1_u8(s, d3u8);     // oq0
+        s += pitch;
+        vst1_u8(s, d4u8);     // oq1
+        s += pitch;
+        vst1_u8(s, d5u8);     // oq2
+    }
+    return;
+}
+
+void vpx_lpf_vertical_8_neon(
+        uint8_t *src,           // first q0 pixel; p3..p0 are the 4 bytes before
+        int pitch,              // byte distance between consecutive rows
+        const uint8_t *blimit,  // 8 bytes are read through each of the
+        const uint8_t *limit,   // three threshold pointers
+        const uint8_t *thresh,
+        int count) {            // number of stacked 8-row groups
+    int i;
+    uint8_t *s;
+    uint8x8_t dblimit, dlimit, dthresh;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    uint8x8_t d16u8, d17u8, d18u8;
+    uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+    uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+    uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+    uint8x8x4_t d4Result;
+    uint8x8x2_t d2Result;
+
+    if (count == 0)
+        return;
+
+    dblimit = vld1_u8(blimit);
+    dlimit = vld1_u8(limit);
+    dthresh = vld1_u8(thresh);
+
+    for (i = 0; i < count; i++) {
+        s = src + (i * (pitch << 3)) - 4;  // row 0 of group i, column of p3
+        // Load 8 rows of 8 bytes (columns p3..q3), one row per register.
+        d3u8 = vld1_u8(s);
+        s += pitch;
+        d4u8 = vld1_u8(s);
+        s += pitch;
+        d5u8 = vld1_u8(s);
+        s += pitch;
+        d6u8 = vld1_u8(s);
+        s += pitch;
+        d7u8 = vld1_u8(s);
+        s += pitch;
+        d16u8 = vld1_u8(s);
+        s += pitch;
+        d17u8 = vld1_u8(s);
+        s += pitch;
+        d18u8 = vld1_u8(s);
+        // Transpose the 8x8 block in three stages: 32-, 16-, then 8-bit.
+        d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+                          vreinterpret_u32_u8(d7u8));
+        d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+                          vreinterpret_u32_u8(d16u8));
+        d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+                          vreinterpret_u32_u8(d17u8));
+        d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+                          vreinterpret_u32_u8(d18u8));
+
+        d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+                          vreinterpret_u16_u32(d2tmp2.val[0]));
+        d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                          vreinterpret_u16_u32(d2tmp3.val[0]));
+        d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                          vreinterpret_u16_u32(d2tmp2.val[1]));
+        d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                          vreinterpret_u16_u32(d2tmp3.val[1]));
+
+        d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+                         vreinterpret_u8_u16(d2tmp5.val[0]));
+        d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                         vreinterpret_u8_u16(d2tmp5.val[1]));
+        d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                          vreinterpret_u8_u16(d2tmp7.val[0]));
+        d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                          vreinterpret_u8_u16(d2tmp7.val[1]));
+        // Each register now holds one pixel column; lane k belongs to row k.
+        d3u8 = d2tmp8.val[0];     // p3
+        d4u8 = d2tmp8.val[1];     // p2
+        d5u8 = d2tmp9.val[0];     // p1
+        d6u8 = d2tmp9.val[1];     // p0
+        d7u8 = d2tmp10.val[0];    // q0
+        d16u8 = d2tmp10.val[1];   // q1
+        d17u8 = d2tmp11.val[0];   // q2
+        d18u8 = d2tmp11.val[1];   // q3
+
+        mbloop_filter_neon(dblimit, dlimit, dthresh,
+                           d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                           &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+        // The interleaving lane stores below undo the transpose.
+        d4Result.val[0] = d0u8;   // op2
+        d4Result.val[1] = d1u8;   // op1
+        d4Result.val[2] = d2u8;   // op0
+        d4Result.val[3] = d3u8;   // oq0
+
+        d2Result.val[0] = d4u8;   // oq1
+        d2Result.val[1] = d5u8;   // oq2
+        // Store into group i's rows (same row offset as the loads above).
+        s = src + (i * (pitch << 3)) - 3;  // op2, op1, op0, oq0 of each row
+        vst4_lane_u8(s, d4Result, 0);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 1);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 2);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 3);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 4);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 5);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 6);
+        s += pitch;
+        vst4_lane_u8(s, d4Result, 7);
+
+        s = src + (i * (pitch << 3)) + 1;  // oq1, oq2 of each row
+        vst2_lane_u8(s, d2Result, 0);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 1);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 2);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 3);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 4);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 5);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 6);
+        s += pitch;
+        vst2_lane_u8(s, d2Result, 7);
+    }
+    return;
+}
diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm b/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm
new file mode 100644
index 0000000000..20d9cfb113
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm
@@ -0,0 +1,606 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_lpf_horizontal_16_neon|
+    EXPORT  |vpx_lpf_vertical_16_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void vpx_lpf_horizontal_16_neon(uint8_t *s, int p,
+;                                 const uint8_t *blimit,
+;                                 const uint8_t *limit,
+;                                 const uint8_t *thresh,
+;                                 int count)
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh, followed by int count at sp + 4
+|vpx_lpf_horizontal_16_neon| PROC
+    push        {r4-r8, lr}                ; 6 core registers = 24 bytes
+    vpush       {d8-d15}                   ; 8 d registers = 64 bytes
+    ldr         r4, [sp, #88]              ; load thresh (88 = 24 + 64 saved bytes)
+    ldr         r12, [sp, #92]             ; load count
+    ; NOTE(review): count is only tested after a pass, so count == 0 is not handled.
+h_count
+    vld1.8      {d16[]}, [r2]              ; load *blimit
+    vld1.8      {d17[]}, [r3]              ; load *limit
+    vld1.8      {d18[]}, [r4]              ; load *thresh
+    ; d16-d18 are reloaded every pass because the helper overwrites them.
+    sub         r8, r0, r1, lsl #3         ; r8 = s - 8 * p (row of p7)
+
+    vld1.u8     {d0}, [r8@64], r1          ; p7
+    vld1.u8     {d1}, [r8@64], r1          ; p6
+    vld1.u8     {d2}, [r8@64], r1          ; p5
+    vld1.u8     {d3}, [r8@64], r1          ; p4
+    vld1.u8     {d4}, [r8@64], r1          ; p3
+    vld1.u8     {d5}, [r8@64], r1          ; p2
+    vld1.u8     {d6}, [r8@64], r1          ; p1
+    vld1.u8     {d7}, [r8@64], r1          ; p0
+    vld1.u8     {d8}, [r8@64], r1          ; q0
+    vld1.u8     {d9}, [r8@64], r1          ; q1
+    vld1.u8     {d10}, [r8@64], r1         ; q2
+    vld1.u8     {d11}, [r8@64], r1         ; q3
+    vld1.u8     {d12}, [r8@64], r1         ; q4
+    vld1.u8     {d13}, [r8@64], r1         ; q5
+    vld1.u8     {d14}, [r8@64], r1         ; q6
+    vld1.u8     {d15}, [r8@64], r1         ; q7
+
+    bl          vpx_wide_mbfilter_neon     ; returns result flags in r7
+
+    tst         r7, #1                     ; bit 0 set: only filter() ran
+    beq         h_mbfilter
+
+    ; flat && mask were not set for any of the channels. Just store the values
+    ; from filter.
+    sub         r8, r0, r1, lsl #1         ; r8 = s - 2 * p (row of p1)
+
+    vst1.u8     {d25}, [r8@64], r1         ; store op1
+    vst1.u8     {d24}, [r8@64], r1         ; store op0
+    vst1.u8     {d23}, [r8@64], r1         ; store oq0
+    vst1.u8     {d26}, [r8@64], r1         ; store oq1
+
+    b           h_next
+
+h_mbfilter
+    tst         r7, #2                     ; bit 1 set: stopped after mbfilter
+    beq         h_wide_mbfilter
+
+    ; flat2 was not set for any of the channels. Just store the values from
+    ; mbfilter.
+    sub         r8, r0, r1, lsl #1
+    sub         r8, r8, r1                 ; r8 = s - 3 * p (row of p2)
+
+    vst1.u8     {d18}, [r8@64], r1         ; store op2
+    vst1.u8     {d19}, [r8@64], r1         ; store op1
+    vst1.u8     {d20}, [r8@64], r1         ; store op0
+    vst1.u8     {d21}, [r8@64], r1         ; store oq0
+    vst1.u8     {d22}, [r8@64], r1         ; store oq1
+    vst1.u8     {d23}, [r8@64], r1         ; store oq2
+
+    b           h_next
+
+h_wide_mbfilter
+    sub         r8, r0, r1, lsl #3
+    add         r8, r8, r1                 ; r8 = s - 7 * p (row of p6)
+
+    vst1.u8     {d16}, [r8@64], r1         ; store op6
+    vst1.u8     {d24}, [r8@64], r1         ; store op5
+    vst1.u8     {d25}, [r8@64], r1         ; store op4
+    vst1.u8     {d26}, [r8@64], r1         ; store op3
+    vst1.u8     {d27}, [r8@64], r1         ; store op2
+    vst1.u8     {d18}, [r8@64], r1         ; store op1
+    vst1.u8     {d19}, [r8@64], r1         ; store op0
+    vst1.u8     {d20}, [r8@64], r1         ; store oq0
+    vst1.u8     {d21}, [r8@64], r1         ; store oq1
+    vst1.u8     {d22}, [r8@64], r1         ; store oq2
+    vst1.u8     {d23}, [r8@64], r1         ; store oq3
+    vst1.u8     {d1}, [r8@64], r1          ; store oq4
+    vst1.u8     {d2}, [r8@64], r1          ; store oq5
+    vst1.u8     {d3}, [r8@64], r1          ; store oq6
+
+h_next
+    add         r0, r0, #8                 ; advance to the next 8 columns
+    subs        r12, r12, #1               ; one group done
+    bne         h_count
+
+    vpop        {d8-d15}
+    pop         {r4-r8, pc}
+
+    ENDP        ; |vpx_lpf_horizontal_16_neon|
+
+; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
+;                               const uint8_t *blimit,
+;                               const uint8_t *limit,
+;                               const uint8_t *thresh)
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+|vpx_lpf_vertical_16_neon| PROC
+    push        {r4-r8, lr}                ; 6 core registers = 24 bytes
+    vpush       {d8-d15}                   ; 8 d registers = 64 bytes
+    ldr         r4, [sp, #88]              ; load thresh
+
+    vld1.8      {d16[]}, [r2]              ; load *blimit
+    vld1.8      {d17[]}, [r3]              ; load *limit
+    vld1.8      {d18[]}, [r4]              ; load *thresh
+
+    sub         r8, r0, #8                 ; r8 = s - 8 (8 columns left of s)
+    ; Load 8 rows: d0-d7 from the left 8 columns, d8-d15 from the right 8.
+    vld1.8      {d0}, [r8@64], r1
+    vld1.8      {d8}, [r0@64], r1
+    vld1.8      {d1}, [r8@64], r1
+    vld1.8      {d9}, [r0@64], r1
+    vld1.8      {d2}, [r8@64], r1
+    vld1.8      {d10}, [r0@64], r1
+    vld1.8      {d3}, [r8@64], r1
+    vld1.8      {d11}, [r0@64], r1
+    vld1.8      {d4}, [r8@64], r1
+    vld1.8      {d12}, [r0@64], r1
+    vld1.8      {d5}, [r8@64], r1
+    vld1.8      {d13}, [r0@64], r1
+    vld1.8      {d6}, [r8@64], r1
+    vld1.8      {d14}, [r0@64], r1
+    vld1.8      {d7}, [r8@64], r1
+    vld1.8      {d15}, [r0@64], r1
+
+    sub         r0, r0, r1, lsl #3         ; rewind r0 to s (8 rows were read)
+    ; Transpose both 8x8 halves in three stages: 32-, 16-, then 8-bit.
+    vtrn.32     q0, q2
+    vtrn.32     q1, q3
+    vtrn.32     q4, q6
+    vtrn.32     q5, q7
+
+    vtrn.16     q0, q1
+    vtrn.16     q2, q3
+    vtrn.16     q4, q5
+    vtrn.16     q6, q7
+
+    vtrn.8      d0, d1
+    vtrn.8      d2, d3
+    vtrn.8      d4, d5
+    vtrn.8      d6, d7
+
+    vtrn.8      d8, d9
+    vtrn.8      d10, d11
+    vtrn.8      d12, d13
+    vtrn.8      d14, d15
+
+    bl          vpx_wide_mbfilter_neon     ; returns result flags in r7
+
+    tst         r7, #1                     ; bit 0 set: only filter() ran
+    beq         v_mbfilter
+
+    ; flat && mask were not set for any of the channels. Just store the values
+    ; from filter.
+    sub         r8, r0, #2                 ; r8 = s - 2 (column of p1)
+
+    vswp        d23, d25                   ; d23..d26 = op1, op0, oq0, oq1
+
+    vst4.8      {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
+    vst4.8      {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
+    vst4.8      {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
+    vst4.8      {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
+    vst4.8      {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
+    vst4.8      {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
+    vst4.8      {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
+    vst4.8      {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
+
+    b           v_end
+
+v_mbfilter
+    tst         r7, #2                     ; bit 1 set: stopped after mbfilter
+    beq         v_wide_mbfilter
+
+    ; flat2 was not set for any of the channels. Just store the values from
+    ; mbfilter.
+    sub         r8, r0, #3                 ; r8 = s - 3 (column of p2)
+    ; Per row: op2, op1, op0 through r8, then oq0, oq1, oq2 through r0.
+    vst3.8      {d18[0], d19[0], d20[0]}, [r8], r1
+    vst3.8      {d21[0], d22[0], d23[0]}, [r0], r1
+    vst3.8      {d18[1], d19[1], d20[1]}, [r8], r1
+    vst3.8      {d21[1], d22[1], d23[1]}, [r0], r1
+    vst3.8      {d18[2], d19[2], d20[2]}, [r8], r1
+    vst3.8      {d21[2], d22[2], d23[2]}, [r0], r1
+    vst3.8      {d18[3], d19[3], d20[3]}, [r8], r1
+    vst3.8      {d21[3], d22[3], d23[3]}, [r0], r1
+    vst3.8      {d18[4], d19[4], d20[4]}, [r8], r1
+    vst3.8      {d21[4], d22[4], d23[4]}, [r0], r1
+    vst3.8      {d18[5], d19[5], d20[5]}, [r8], r1
+    vst3.8      {d21[5], d22[5], d23[5]}, [r0], r1
+    vst3.8      {d18[6], d19[6], d20[6]}, [r8], r1
+    vst3.8      {d21[6], d22[6], d23[6]}, [r0], r1
+    vst3.8      {d18[7], d19[7], d20[7]}, [r8], r1
+    vst3.8      {d21[7], d22[7], d23[7]}, [r0], r1
+
+    b           v_end
+
+v_wide_mbfilter
+    sub         r8, r0, #8                 ; r8 = s - 8 (left 8 columns)
+    ; Transpose the left half back: p7 (d0), op6 (d16), op5..op2, op1, op0.
+    vtrn.32     d0,  d26
+    vtrn.32     d16, d27
+    vtrn.32     d24, d18
+    vtrn.32     d25, d19
+
+    vtrn.16     d0,  d24
+    vtrn.16     d16, d25
+    vtrn.16     d26, d18
+    vtrn.16     d27, d19
+
+    vtrn.8      d0,  d16
+    vtrn.8      d24, d25
+    vtrn.8      d26, d27
+    vtrn.8      d18, d19
+    ; Transpose the right half back: oq0..oq3 (d20-d23), oq4..oq6, q7 (d15).
+    vtrn.32     d20, d1
+    vtrn.32     d21, d2
+    vtrn.32     d22, d3
+    vtrn.32     d23, d15
+
+    vtrn.16     d20, d22
+    vtrn.16     d21, d23
+    vtrn.16     d1,  d3
+    vtrn.16     d2,  d15
+
+    vtrn.8      d20, d21
+    vtrn.8      d22, d23
+    vtrn.8      d1,  d2
+    vtrn.8      d3,  d15
+    ; Store 8 rows: left 8 bytes through r8, right 8 bytes through r0.
+    vst1.8      {d0}, [r8@64], r1
+    vst1.8      {d20}, [r0@64], r1
+    vst1.8      {d16}, [r8@64], r1
+    vst1.8      {d21}, [r0@64], r1
+    vst1.8      {d24}, [r8@64], r1
+    vst1.8      {d22}, [r0@64], r1
+    vst1.8      {d25}, [r8@64], r1
+    vst1.8      {d23}, [r0@64], r1
+    vst1.8      {d26}, [r8@64], r1
+    vst1.8      {d1}, [r0@64], r1
+    vst1.8      {d27}, [r8@64], r1
+    vst1.8      {d2}, [r0@64], r1
+    vst1.8      {d18}, [r8@64], r1
+    vst1.8      {d3}, [r0@64], r1
+    vst1.8      {d19}, [r8@64], r1
+    vst1.8      {d15}, [r0@64], r1
+
+v_end
+    vpop        {d8-d15}
+    pop         {r4-r8, pc}
+
+    ENDP        ; |vpx_lpf_vertical_16_neon|
+
+; void vpx_wide_mbfilter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store.
+; Writes r5-r7. On return r7 bit 0 = only filter ran, bit 1 = stopped at mbfilter.
+; r0-r3 PRESERVE
+; d16    blimit
+; d17    limit
+; d18    thresh
+; d0    p7
+; d1    p6
+; d2    p5
+; d3    p4
+; d4    p3
+; d5    p2
+; d6    p1
+; d7    p0
+; d8    q0
+; d9    q1
+; d10   q2
+; d11   q3
+; d12   q4
+; d13   q5
+; d14   q6
+; d15   q7
+|vpx_wide_mbfilter_neon| PROC
+    mov         r7, #0                     ; clear the result flags
+
+    ; filter_mask
+    vabd.u8     d19, d4, d5                ; abs(p3 - p2)
+    vabd.u8     d20, d5, d6                ; abs(p2 - p1)
+    vabd.u8     d21, d6, d7                ; abs(p1 - p0)
+    vabd.u8     d22, d9, d8                ; abs(q1 - q0)
+    vabd.u8     d23, d10, d9               ; abs(q2 - q1)
+    vabd.u8     d24, d11, d10              ; abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20              ; max(abs(p3 - p2), abs(p2 - p1))
+    vmax.u8     d20, d21, d22              ; max(abs(p1 - p0), abs(q1 - q0))
+    vmax.u8     d23, d23, d24              ; max(abs(q2 - q1), abs(q3 - q2))
+    vmax.u8     d19, d19, d20
+
+    vabd.u8     d24, d7, d8                ; abs(p0 - q0)
+
+    vmax.u8     d19, d19, d23
+
+    vabd.u8     d23, d6, d9                ; a = abs(p1 - q1)
+    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
+
+    ; limit >= largest neighbouring difference
+    vcge.u8     d19, d17, d19
+
+    ; flatmask4
+    vabd.u8     d25, d7, d5                ; abs(p0 - p2)
+    vabd.u8     d26, d8, d10               ; abs(q0 - q2)
+    vabd.u8     d27, d4, d7                ; abs(p3 - p0)
+    vabd.u8     d28, d11, d8               ; abs(q3 - q0)
+
+    ; only compare the largest value to thresh
+    vmax.u8     d25, d25, d26              ; max(abs(p0 - p2), abs(q0 - q2))
+    vmax.u8     d26, d27, d28              ; max(abs(p3 - p0), abs(q3 - q0))
+    vmax.u8     d25, d25, d26
+    vmax.u8     d20, d20, d25
+
+    vshr.u8     d23, d23, #1               ; a = a / 2
+    vqadd.u8    d24, d24, d23              ; b + a
+
+    vmov.u8     d30, #1
+    vcge.u8     d24, d16, d24              ; blimit >= b + a
+
+    vcge.u8     d20, d30, d20              ; flat
+
+    vand        d19, d19, d24              ; mask
+
+    ; hevmask
+    vcgt.u8     d21, d21, d18              ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     d22, d22, d18              ; (abs(q1 - q0) > thresh)*-1
+    vorr        d21, d21, d22              ; hev
+
+    vand        d16, d20, d19              ; flat && mask
+    vmov        r5, r6, d16                ; copy to core registers for the zero test
+
+    ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
+    vabd.u8     d22, d3, d7                ; abs(p4 - p0)
+    vabd.u8     d23, d12, d8               ; abs(q4 - q0)
+    vabd.u8     d24, d7, d2                ; abs(p0 - p5)
+    vabd.u8     d25, d8, d13               ; abs(q0 - q5)
+    vabd.u8     d26, d1, d7                ; abs(p6 - p0)
+    vabd.u8     d27, d14, d8               ; abs(q6 - q0)
+    vabd.u8     d28, d0, d7                ; abs(p7 - p0)
+    vabd.u8     d29, d15, d8               ; abs(q7 - q0)
+
+    ; only compare the largest value to thresh
+    vmax.u8     d22, d22, d23              ; max(abs(p4 - p0), abs(q4 - q0))
+    vmax.u8     d23, d24, d25              ; max(abs(p0 - p5), abs(q0 - q5))
+    vmax.u8     d24, d26, d27              ; max(abs(p6 - p0), abs(q6 - q0))
+    vmax.u8     d25, d28, d29              ; max(abs(p7 - p0), abs(q7 - q0))
+
+    vmax.u8     d26, d22, d23
+    vmax.u8     d27, d24, d25
+    vmax.u8     d23, d26, d27
+
+    vcge.u8     d18, d30, d23              ; flat2
+
+    vmov.u8     d22, #0x80
+
+    orrs        r5, r5, r6                 ; Check for 0
+    orreq       r7, r7, #1                 ; Only do filter branch
+
+    vand        d17, d18, d16              ; flat2 && flat && mask
+    vmov        r5, r6, d17                ; copy to core registers for the zero test
+
+    ; mbfilter() function
+
+    ; filter() function
+    ; convert to signed
+    veor        d23, d8, d22               ; qs0
+    veor        d24, d7, d22               ; ps0
+    veor        d25, d6, d22               ; ps1
+    veor        d26, d9, d22               ; qs1
+
+    vmov.u8     d27, #3
+
+    vsub.s8     d28, d23, d24              ; ( qs0 - ps0)
+    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
+    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
+    vand        d29, d29, d21              ; filter &= hev
+    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
+    vmov.u8     d29, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d28, q15
+
+    vand        d28, d28, d19              ; filter &= mask
+
+    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
+    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
+    vshr.s8     d30, d30, #3               ; filter2 >>= 3
+    vshr.s8     d29, d29, #3               ; filter1 >>= 3
+
+
+    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
+    vqsub.s8    d23, d23, d29              ; oq0 = clamp(qs0 - filter1)
+
+    ; outer tap adjustments: ++filter1 >> 1
+    vrshr.s8    d29, d29, #1
+    vbic        d29, d29, d21              ; filter &= ~hev
+
+    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
+    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
+
+    veor        d24, d24, d22              ; *f_op0 = u^0x80
+    veor        d23, d23, d22              ; *f_oq0 = u^0x80
+    veor        d25, d25, d22              ; *f_op1 = u^0x80
+    veor        d26, d26, d22              ; *f_oq1 = u^0x80
+
+    tst         r7, #1
+    bxne        lr                         ; filter only: results in d25, d24, d23, d26
+
+    orrs        r5, r5, r6                 ; Check for 0
+    orreq       r7, r7, #2                 ; Only do mbfilter branch
+
+    ; mbfilter flat && mask branch
+    ; TODO(fgalligan): Can I decrease the cycles shifting to consecutive d's
+    ; and using vbit on the q's?
+    vmov.u8     d29, #2
+    vaddl.u8    q15, d7, d8                ; op2 = p0 + q0
+    vmlal.u8    q15, d4, d27               ; op2 = p0 + q0 + p3 * 3
+    vmlal.u8    q15, d5, d29               ; op2 = p0 + q0 + p3 * 3 + p2 * 2
+    vaddl.u8    q10, d4, d5
+    vaddw.u8    q15, d6                    ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
+    vaddl.u8    q14, d6, d9
+    vqrshrn.u16 d18, q15, #3               ; r_op2
+
+    vsub.i16    q15, q10
+    vaddl.u8    q10, d4, d6
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d7, d10
+    vqrshrn.u16 d19, q15, #3               ; r_op1
+
+    vsub.i16    q15, q10
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d8, d11
+    vqrshrn.u16 d20, q15, #3               ; r_op0
+
+    vsubw.u8    q15, d4                    ; oq0 = op0 - p3
+    vsubw.u8    q15, d7                    ; oq0 -= p0
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d9, d11
+    vqrshrn.u16 d21, q15, #3               ; r_oq0
+
+    vsubw.u8    q15, d5                    ; oq1 = oq0 - p2
+    vsubw.u8    q15, d8                    ; oq1 -= q0
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d10, d11
+    vqrshrn.u16 d22, q15, #3               ; r_oq1
+
+    vsubw.u8    q15, d6                    ; oq2 = oq1 - p1
+    vsubw.u8    q15, d9                    ; oq2 -= q1
+    vadd.i16    q15, q14
+    vqrshrn.u16 d27, q15, #3               ; r_oq2
+
+    ; Filter does not set op2 or oq2, so use p2 and q2.
+    vbif        d18, d5, d16               ; t_op2 |= p2 & ~(flat & mask)
+    vbif        d19, d25, d16              ; t_op1 |= f_op1 & ~(flat & mask)
+    vbif        d20, d24, d16              ; t_op0 |= f_op0 & ~(flat & mask)
+    vbif        d21, d23, d16              ; t_oq0 |= f_oq0 & ~(flat & mask)
+    vbif        d22, d26, d16              ; t_oq1 |= f_oq1 & ~(flat & mask)
+
+    vbit        d23, d27, d16              ; t_oq2 |= r_oq2 & (flat & mask)
+    vbif        d23, d10, d16              ; t_oq2 |= q2 & ~(flat & mask)
+
+    tst         r7, #2
+    bxne        lr                         ; mbfilter only: results in d18-d23
+
+    ; wide_mbfilter flat2 && flat && mask branch
+    vmov.u8     d16, #7
+    vaddl.u8    q15, d7, d8                ; op6 = p0 + q0
+    vaddl.u8    q12, d2, d3
+    vaddl.u8    q13, d4, d5
+    vaddl.u8    q14, d1, d6
+    vmlal.u8    q15, d0, d16               ; op6 += p7 * 7
+    vadd.i16    q12, q13
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d2, d9
+    vadd.i16    q15, q12
+    vaddl.u8    q12, d0, d1
+    vaddw.u8    q15, d1
+    vaddl.u8    q13, d0, d2
+    vadd.i16    q14, q15, q14
+    vqrshrn.u16 d16, q15, #4               ; w_op6
+
+    vsub.i16    q15, q14, q12
+    vaddl.u8    q14, d3, d10
+    vqrshrn.u16 d24, q15, #4               ; w_op5
+
+    vsub.i16    q15, q13
+    vaddl.u8    q13, d0, d3
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d4, d11
+    vqrshrn.u16 d25, q15, #4               ; w_op4
+
+    vadd.i16    q15, q14
+    vaddl.u8    q14, d0, d4
+    vsub.i16    q15, q13
+    vsub.i16    q14, q15, q14
+    vqrshrn.u16 d26, q15, #4               ; w_op3
+
+    vaddw.u8    q15, q14, d5               ; op2 += p2
+    vaddl.u8    q14, d0, d5
+    vaddw.u8    q15, d12                   ; op2 += q4
+    vbif        d26, d4, d17               ; op3 |= p3 & ~(f2 & f & m)
+    vqrshrn.u16 d27, q15, #4               ; w_op2
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d0, d6
+    vaddw.u8    q15, d6                    ; op1 += p1
+    vaddw.u8    q15, d13                   ; op1 += q5
+    vbif        d27, d18, d17              ; op2 |= t_op2 & ~(f2 & f & m)
+    vqrshrn.u16 d18, q15, #4               ; w_op1
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d0, d7
+    vaddw.u8    q15, d7                    ; op0 += p0
+    vaddw.u8    q15, d14                   ; op0 += q6
+    vbif        d18, d19, d17              ; op1 |= t_op1 & ~(f2 & f & m)
+    vqrshrn.u16 d19, q15, #4               ; w_op0
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d1, d8
+    vaddw.u8    q15, d8                    ; oq0 += q0
+    vaddw.u8    q15, d15                   ; oq0 += q7
+    vbif        d19, d20, d17              ; op0 |= t_op0 & ~(f2 & f & m)
+    vqrshrn.u16 d20, q15, #4               ; w_oq0
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d2, d9
+    vaddw.u8    q15, d9                    ; oq1 += q1
+    vaddl.u8    q4, d10, d15               ; q4 reuses d8/d9 (q0, q1)
+    vaddw.u8    q15, d15                   ; oq1 += q7
+    vbif        d20, d21, d17              ; oq0 |= t_oq0 & ~(f2 & f & m)
+    vqrshrn.u16 d21, q15, #4               ; w_oq1
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d3, d10
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d11, d15
+    vbif        d21, d22, d17              ; oq1 |= t_oq1 & ~(f2 & f & m)
+    vqrshrn.u16 d22, q15, #4               ; w_oq2
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d4, d11
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d12, d15
+    vbif        d22, d23, d17              ; oq2 |= t_oq2 & ~(f2 & f & m)
+    vqrshrn.u16 d23, q15, #4               ; w_oq3
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d5, d12
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d13, d15
+    vbif        d16, d1, d17               ; op6 |= p6 & ~(f2 & f & m)
+    vqrshrn.u16 d1, q15, #4                ; w_oq4
+
+    vsub.i16    q15, q14
+    vaddl.u8    q14, d6, d13
+    vadd.i16    q15, q4
+    vaddl.u8    q4, d14, d15
+    vbif        d24, d2, d17               ; op5 |= p5 & ~(f2 & f & m)
+    vqrshrn.u16 d2, q15, #4                ; w_oq5
+
+    vsub.i16    q15, q14
+    vbif        d25, d3, d17               ; op4 |= p4 & ~(f2 & f & m)
+    vadd.i16    q15, q4
+    vbif        d23, d11, d17              ; oq3 |= q3 & ~(f2 & f & m)
+    vqrshrn.u16 d3, q15, #4                ; w_oq6
+    vbif        d1, d12, d17               ; oq4 |= q4 & ~(f2 & f & m)
+    vbif        d2, d13, d17               ; oq5 |= q5 & ~(f2 & f & m)
+    vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
+
+    bx          lr                         ; wide: op6..op0 in d16, d24-d27, d18, d19
+    ENDP        ; |vpx_wide_mbfilter_neon|
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_neon.c b/libs/libvpx/vpx_dsp/arm/loopfilter_neon.c
new file mode 100644
index 0000000000..eff87d29bd
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/loopfilter_neon.c
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,  // "dual" = two back-to-back single-filter calls
+                                  const uint8_t *blimit0,  // blimit0/limit0/thresh0 go to the first call
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0,
+                                  const uint8_t *blimit1,  // blimit1/limit1/thresh1 go to the second call
+                                  const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);  // first block, starting at s
+  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);  // second block, 8 * p bytes past s
+}
+
+#if HAVE_NEON_ASM
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,  // two back-to-back single-filter calls
+                                    const uint8_t *blimit0,  // blimit0/limit0/thresh0 go to the first call
+                                    const uint8_t *limit0,
+                                    const uint8_t *thresh0,
+                                    const uint8_t *blimit1,  // blimit1/limit1/thresh1 go to the second call
+                                    const uint8_t *limit1,
+                                    const uint8_t *thresh1) {
+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);  // first block, starting at s
+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);  // second block, 8 bytes past s
+}
+
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,  // two back-to-back single-filter calls
+                                  const uint8_t *blimit0,  // blimit0/limit0/thresh0 go to the first call
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0,
+                                  const uint8_t *blimit1,  // blimit1/limit1/thresh1 go to the second call
+                                  const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);  // first block, starting at s
+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);  // second block, 8 * p bytes past s
+}
+
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p,  // two back-to-back single-filter calls
+                                   const uint8_t *blimit,  // unlike the other dual filters, one threshold set serves both calls
+                                   const uint8_t *limit,
+                                   const uint8_t *thresh) {
+  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);  // first block, starting at s
+  vpx_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);  // second block, 8 * p bytes past s
+}
+#endif  // HAVE_NEON_ASM
diff --git a/libs/libvpx/vpx_dsp/arm/sad4d_neon.c b/libs/libvpx/vpx_dsp/arm/sad4d_neon.c
new file mode 100644
index 0000000000..c7704dc1be
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/sad4d_neon.c
@@ -0,0 +1,226 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,  // returns the sum of all 16 uint16 lanes
+                                                    const uint16x8_t vec_hi) {
+  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),  // widen to 32 bits while adding the two halves
+                                        vget_high_u16(vec_lo));
+  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),  // same for the second vector
+                                        vget_high_u16(vec_hi));
+  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);  // four 32-bit partial sums
+  const uint64x2_t b = vpaddlq_u32(a);  // pairwise add: two 64-bit partial sums
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),  // add the two sums as 32-bit lanes
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);  // lane 0 is returned as the total
+}
+
+// Accumulate the absolute differences between the 64 source bytes held in
+// vec_src_00/16/32/48 and the 64 bytes loaded from ref. Low 8-byte halves add
+// into *vec_sum_ref_lo, high halves into *vec_sum_ref_hi (uint16 lanes).
+static void sad_neon_64(const uint8x16_t vec_src_00,
+                        const uint8x16_t vec_src_16,
+                        const uint8x16_t vec_src_32,
+                        const uint8x16_t vec_src_48,
+                        const uint8_t *ref,
+                        uint16x8_t *vec_sum_ref_lo,
+                        uint16x8_t *vec_sum_ref_hi) {
+  const uint8x16_t vec_ref_00 = vld1q_u8(ref);  // reference bytes 0..15
+  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);  // reference bytes 16..31
+  const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);  // reference bytes 32..47
+  const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);  // reference bytes 48..63
+
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),  // each vabal_u8 adds 8 widened |src - ref| values
+                             vget_low_u8(vec_ref_00));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+                             vget_high_u8(vec_ref_00));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+                             vget_low_u8(vec_ref_16));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+                             vget_high_u8(vec_ref_16));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
+                             vget_low_u8(vec_ref_32));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
+                             vget_high_u8(vec_ref_32));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
+                             vget_low_u8(vec_ref_48));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),  // per call each lane grows by at most 4 * 255
+                             vget_high_u8(vec_ref_48));
+}
+
+// Accumulate the absolute differences between the 32 source bytes held in
+// vec_src_00/16 and the 32 bytes loaded from ref (low halves -> lo, high -> hi).
+static void sad_neon_32(const uint8x16_t vec_src_00,
+                        const uint8x16_t vec_src_16,
+                        const uint8_t *ref,
+                        uint16x8_t *vec_sum_ref_lo,
+                        uint16x8_t *vec_sum_ref_hi) {
+  const uint8x16_t vec_ref_00 = vld1q_u8(ref);  // reference bytes 0..15
+  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);  // reference bytes 16..31
+
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),  // each vabal_u8 adds 8 widened |src - ref| values
+                             vget_low_u8(vec_ref_00));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+                             vget_high_u8(vec_ref_00));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+                             vget_low_u8(vec_ref_16));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),  // per call each lane grows by at most 2 * 255
+                             vget_high_u8(vec_ref_16));
+}
+
+void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,  // SAD of one 64x64 source block against four references
+                          const uint8_t* const ref[4], int ref_stride,  // all four references advance by ref_stride
+                          uint32_t *res) {  // res[k] receives the SAD against ref[k], k = 0..3
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);  // one lo/hi pair of zeroed uint16 accumulators per reference
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];  // local row cursors; the ref[] array itself is not modified
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 64; ++i) {  // one 64-byte row per iteration
+    const uint8x16_t vec_src_00 = vld1q_u8(src);  // the source row is loaded once and reused for all four references
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,  // lanes gain <= 4 * 255 per row: 64 rows stay below 65536
+                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
+                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
+                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
+                &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+    src += src_stride;  // step every cursor to the next row
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);  // widen to 32 bits before combining lo and hi
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,  // SAD of one 32x32 source block against four references
+                          const uint8_t* const ref[4], int ref_stride,  // all four references advance by ref_stride
+                          uint32_t *res) {  // res[k] receives the SAD against ref[k], k = 0..3
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);  // one lo/hi pair of zeroed uint16 accumulators per reference
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];  // local row cursors; the ref[] array itself is not modified
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 32; ++i) {  // one 32-byte row per iteration
+    const uint8x16_t vec_src_00 = vld1q_u8(src);  // the source row is loaded once and reused for all four references
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+
+    sad_neon_32(vec_src_00, vec_src_16, ref0,  // lanes gain <= 2 * 255 per row: 32 rows stay well below 65536
+                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref1,
+                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref2,
+                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref3,
+                &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+    src += src_stride;  // step every cursor to the next row
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);  // reduce each lo/hi pair to one scalar
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,  // SAD of one 16x16 source block against four references
+                          const uint8_t* const ref[4], int ref_stride,  // all four references advance by ref_stride
+                          uint32_t *res) {  // res[k] receives the SAD against ref[k], k = 0..3
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);  // one lo/hi pair of zeroed uint16 accumulators per reference
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];  // local row cursors; the ref[] array itself is not modified
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 16; ++i) {  // one 16-byte row per iteration
+    const uint8x16_t vec_src = vld1q_u8(src);  // the source row is loaded once and reused for all four references
+    const uint8x16_t vec_ref0 = vld1q_u8(ref0);
+    const uint8x16_t vec_ref1 = vld1q_u8(ref1);
+    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
+    const uint8x16_t vec_ref3 = vld1q_u8(ref3);
+
+    vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src),  // lo accumulates bytes 0..7 of the row
+                               vget_low_u8(vec_ref0));
+    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),  // hi accumulates bytes 8..15 of the row
+                               vget_high_u8(vec_ref0));
+    vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src),
+                               vget_low_u8(vec_ref1));
+    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref1));
+    vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src),
+                               vget_low_u8(vec_ref2));
+    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref2));
+    vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src),
+                               vget_low_u8(vec_ref3));
+    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref3));
+
+    src += src_stride;  // step every cursor to the next row
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);  // reduce each lo/hi pair to one scalar
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
diff --git a/libs/libvpx/vpx_dsp/arm/sad_media.asm b/libs/libvpx/vpx_dsp/arm/sad_media.asm
new file mode 100644
index 0000000000..aed1d3a22e
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/sad_media.asm
@@ -0,0 +1,95 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_sad16x16_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    const unsigned char *src_ptr   (on return: the accumulated sad)
+; r1    int  src_stride
+; r2    const unsigned char *ref_ptr
+; r3    int  ref_stride
+|vpx_sad16x16_media| PROC
+    stmfd   sp!, {r4-r12, lr}   ; save the registers used as scratch below
+
+    pld     [r0, r1, lsl #0]    ; prefetch src + src_stride
+    pld     [r2, r3, lsl #0]    ; prefetch ref + ref_stride
+    pld     [r0, r1, lsl #1]    ; prefetch src + 2 * src_stride
+    pld     [r2, r3, lsl #1]    ; prefetch ref + 2 * ref_stride
+
+    mov     r4, #0              ; sad = 0;
+    mov     r5, #8              ; loop count: 8 iterations x 2 rows = 16 rows
+
+loop
+    ; 1st row of this iteration
+    ldr     r6, [r0, #0x0]      ; load src pixels 0-3 (1A)
+    ldr     r8, [r2, #0x0]      ; load ref pixels 0-3 (1A)
+    ldr     r7, [r0, #0x4]      ; load src pixels 4-7 (1A)
+    ldr     r9, [r2, #0x4]      ; load ref pixels 4-7 (1A)
+    ldr     r10, [r0, #0x8]     ; load src pixels 8-11 (1B)
+    ldr     r11, [r0, #0xC]     ; load src pixels 12-15 (1B)
+
+    usada8  r4, r8, r6, r4      ; r4 += sad of pixels 0-3
+    usad8   r8, r7, r9          ; r8 = sad of pixels 4-7 (second partial sum)
+
+    ldr     r12, [r2, #0x8]     ; load ref pixels 8-11 (1B)
+    ldr     lr, [r2, #0xC]      ; load ref pixels 12-15 (1B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+    pld     [r0, r1, lsl #1]    ; prefetch two rows past the new src position
+    pld     [r2, r3, lsl #1]    ; prefetch two rows past the new ref position
+
+    usada8  r4, r10, r12, r4    ; r4 += sad of pixels 8-11
+    usada8  r8, r11, lr, r8     ; r8 += sad of pixels 12-15
+
+    ldr     r6, [r0, #0x0]      ; load src pixels 0-3 (2A)
+    ldr     r7, [r0, #0x4]      ; load src pixels 4-7 (2A)
+    add     r4, r4, r8          ; fold the second partial sum into sad
+
+    ; 2nd row of this iteration (its src loads were started above)
+    ldr     r8, [r2, #0x0]      ; load ref pixels 0-3 (2A)
+    ldr     r9, [r2, #0x4]      ; load ref pixels 4-7 (2A)
+    ldr     r10, [r0, #0x8]     ; load src pixels 8-11 (2B)
+    ldr     r11, [r0, #0xC]     ; load src pixels 12-15 (2B)
+
+    usada8  r4, r6, r8, r4      ; r4 += sad of pixels 0-3
+    usad8   r8, r7, r9          ; r8 = sad of pixels 4-7 (second partial sum)
+
+    ldr     r12, [r2, #0x8]     ; load ref pixels 8-11 (2B)
+    ldr     lr, [r2, #0xC]      ; load ref pixels 12-15 (2B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+    usada8  r4, r10, r12, r4    ; r4 += sad of pixels 8-11
+    usada8  r8, r11, lr, r8     ; r8 += sad of pixels 12-15
+
+    pld     [r0, r1, lsl #1]    ; prefetch two rows past the new src position
+    pld     [r2, r3, lsl #1]    ; prefetch two rows past the new ref position
+
+    subs    r5, r5, #1          ; decrement loop counter (sets flags for bne)
+    add     r4, r4, r8          ; fold the second partial sum into sad
+
+    bne     loop
+
+    mov     r0, r4              ; return sad
+    ldmfd   sp!, {r4-r12, pc}   ; restore saved registers and return
+
+    ENDP
+
+    END
+
diff --git a/libs/libvpx/vpx_dsp/arm/sad_neon.c b/libs/libvpx/vpx_dsp/arm/sad_neon.c
new file mode 100644
index 0000000000..173f08ac3c
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/sad_neon.c
@@ -0,0 +1,232 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+unsigned int vpx_sad8x16_neon(  // sum of absolute differences over 16 rows of 8 bytes
+        unsigned char *src_ptr,
+        int src_stride,
+        unsigned char *ref_ptr,
+        int ref_stride) {
+    uint8x8_t d0, d8;
+    uint16x8_t q12;
+    uint32x4_t q1;
+    uint64x2_t q3;
+    uint32x2_t d5;
+    int i;
+
+    d0 = vld1_u8(src_ptr);  // first row: 8 source bytes
+    src_ptr += src_stride;
+    d8 = vld1_u8(ref_ptr);  // first row: 8 reference bytes
+    ref_ptr += ref_stride;
+    q12 = vabdl_u8(d0, d8);  // initialise the uint16 accumulator with |src - ref|
+
+    for (i = 0; i < 15; i++) {  // remaining 15 rows
+        d0 = vld1_u8(src_ptr);
+        src_ptr += src_stride;
+        d8 = vld1_u8(ref_ptr);
+        ref_ptr += ref_stride;
+        q12 = vabal_u8(q12, d0, d8);  // q12 += |src - ref|; at most 16 * 255 per lane overall
+    }
+
+    q1 = vpaddlq_u16(q12);  // pairwise widening add: 8 x u16 -> 4 x u32
+    q3 = vpaddlq_u32(q1);  // 4 x u32 -> 2 x u64
+    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),  // add the two 64-bit sums as 32-bit lanes
+                  vreinterpret_u32_u64(vget_high_u64(q3)));
+
+    return vget_lane_u32(d5, 0);  // lane 0 is returned as the SAD
+}
+
+unsigned int vpx_sad4x4_neon(  // sum of absolute differences over 4 rows; only the first 4 bytes of each row are counted
+        unsigned char *src_ptr,
+        int src_stride,
+        unsigned char *ref_ptr,
+        int ref_stride) {
+    uint8x8_t d0, d8;
+    uint16x8_t q12;
+    uint32x2_t d1;
+    uint64x1_t d3;
+    int i;
+
+    d0 = vld1_u8(src_ptr);  // NOTE(review): loads 8 bytes per row, 4 more than are summed - confirm callers tolerate the over-read
+    src_ptr += src_stride;
+    d8 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabdl_u8(d0, d8);  // initialise the uint16 accumulator with |src - ref|
+
+    for (i = 0; i < 3; i++) {  // remaining 3 rows
+        d0 = vld1_u8(src_ptr);
+        src_ptr += src_stride;
+        d8 = vld1_u8(ref_ptr);
+        ref_ptr += ref_stride;
+        q12 = vabal_u8(q12, d0, d8);  // q12 += |src - ref|
+    }
+
+    d1 = vpaddl_u16(vget_low_u16(q12));  // keep lanes 0-3 only; lanes 4-7 (the extra bytes) are discarded
+    d3 = vpaddl_u32(d1);  // 2 x u32 -> 1 x u64
+
+    return vget_lane_u32(vreinterpret_u32_u64(d3), 0);  // lane 0 of the reinterpreted sum is returned as the SAD
+}
+
+unsigned int vpx_sad16x8_neon(  // sum of absolute differences over 8 rows of 16 bytes
+        unsigned char *src_ptr,
+        int src_stride,
+        unsigned char *ref_ptr,
+        int ref_stride) {
+    uint8x16_t q0, q4;
+    uint16x8_t q12, q13;
+    uint32x4_t q1;
+    uint64x2_t q3;
+    uint32x2_t d5;
+    int i;
+
+    q0 = vld1q_u8(src_ptr);  // first row: 16 source bytes
+    src_ptr += src_stride;
+    q4 = vld1q_u8(ref_ptr);  // first row: 16 reference bytes
+    ref_ptr += ref_stride;
+    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));  // accumulator for bytes 0..7
+    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));  // accumulator for bytes 8..15
+
+    for (i = 0; i < 7; i++) {  // remaining 7 rows
+        q0 = vld1q_u8(src_ptr);
+        src_ptr += src_stride;
+        q4 = vld1q_u8(ref_ptr);
+        ref_ptr += ref_stride;
+        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+    }
+
+    q12 = vaddq_u16(q12, q13);  // 16-bit add is safe: each lane is at most 2 * 8 * 255
+    q1 = vpaddlq_u16(q12);  // pairwise widening add: 8 x u16 -> 4 x u32
+    q3 = vpaddlq_u32(q1);  // 4 x u32 -> 2 x u64
+    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),  // add the two 64-bit sums as 32-bit lanes
+                  vreinterpret_u32_u64(vget_high_u64(q3)));
+
+    return vget_lane_u32(d5, 0);  // lane 0 is returned as the SAD
+}
+
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,  // returns the sum of all 16 uint16 lanes
+                                                    const uint16x8_t vec_hi) {
+  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),  // widen to 32 bits while adding the two halves
+                                        vget_high_u16(vec_lo));
+  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),  // same for the second vector
+                                        vget_high_u16(vec_hi));
+  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);  // four 32-bit partial sums
+  const uint64x2_t b = vpaddlq_u32(a);  // pairwise add: two 64-bit partial sums
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),  // add the two sums as 32-bit lanes
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);  // lane 0 is returned as the total
+}
+static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {  // returns the sum of the 8 uint16 lanes
+  const uint32x4_t a = vpaddlq_u16(vec_16x8);  // pairwise widening add: 8 x u16 -> 4 x u32
+  const uint64x2_t b = vpaddlq_u32(a);  // 4 x u32 -> 2 x u64
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),  // add the two 64-bit sums as 32-bit lanes
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);  // lane 0 is returned as the total
+}
+
+unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,  // sum of absolute differences over 64 rows of 64 bytes
+                               const uint8_t *ref, int ref_stride) {
+  int i;
+  uint16x8_t vec_accum_lo = vdupq_n_u16(0);  // accumulates the low 8 bytes of every 16-byte group
+  uint16x8_t vec_accum_hi = vdupq_n_u16(0);  // accumulates the high 8 bytes of every 16-byte group
+  for (i = 0; i < 64; ++i) {  // one 64-byte row per iteration
+    const uint8x16_t vec_src_00 = vld1q_u8(src);
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+    src += src_stride;  // pointers advance early; this row's data is already in registers
+    ref += ref_stride;
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),  // each lane gains <= 4 * 255 per row: 64 rows stay below 65536
+                            vget_low_u8(vec_ref_00));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
+                            vget_high_u8(vec_ref_00));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
+                            vget_low_u8(vec_ref_16));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
+                            vget_high_u8(vec_ref_16));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
+                            vget_low_u8(vec_ref_32));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
+                            vget_high_u8(vec_ref_32));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
+                            vget_low_u8(vec_ref_48));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
+                            vget_high_u8(vec_ref_48));
+  }
+  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);  // widening reduction: lo + hi could exceed 16 bits here
+}
+
+unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,  // sum of absolute differences over 32 rows of 32 bytes
+                               const uint8_t *ref, int ref_stride) {
+  int i;
+  uint16x8_t vec_accum_lo = vdupq_n_u16(0);  // accumulates the low 8 bytes of every 16-byte group
+  uint16x8_t vec_accum_hi = vdupq_n_u16(0);  // accumulates the high 8 bytes of every 16-byte group
+
+  for (i = 0; i < 32; ++i) {  // one 32-byte row per iteration
+    const uint8x16_t vec_src_00 = vld1q_u8(src);
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+    src += src_stride;  // pointers advance early; this row's data is already in registers
+    ref += ref_stride;
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),  // each lane gains <= 2 * 255 per row
+                            vget_low_u8(vec_ref_00));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
+                            vget_high_u8(vec_ref_00));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
+                            vget_low_u8(vec_ref_16));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
+                            vget_high_u8(vec_ref_16));
+  }
+  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));  // 16-bit add is safe: lo + hi <= 2 * 32 * 2 * 255
+}
+
+unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,  // sum of absolute differences over 16 rows of 16 bytes
+                               const uint8_t *ref, int ref_stride) {
+  int i;
+  uint16x8_t vec_accum_lo = vdupq_n_u16(0);  // accumulates bytes 0..7 of each row
+  uint16x8_t vec_accum_hi = vdupq_n_u16(0);  // accumulates bytes 8..15 of each row
+
+  for (i = 0; i < 16; ++i) {  // one 16-byte row per iteration
+    const uint8x16_t vec_src = vld1q_u8(src);
+    const uint8x16_t vec_ref = vld1q_u8(ref);
+    src += src_stride;  // pointers advance early; this row's data is already in registers
+    ref += ref_stride;
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src),  // each lane gains <= 255 per row
+                            vget_low_u8(vec_ref));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src),
+                            vget_high_u8(vec_ref));
+  }
+  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));  // 16-bit add is safe: lo + hi <= 2 * 16 * 255
+}
+
+unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,  // sum of absolute differences over 8 rows of 8 bytes
+                             const uint8_t *ref, int ref_stride) {
+  int i;
+  uint16x8_t vec_accum = vdupq_n_u16(0);  // one uint16 lane per column
+
+  for (i = 0; i < 8; ++i) {  // one 8-byte row per iteration
+    const uint8x8_t vec_src = vld1_u8(src);
+    const uint8x8_t vec_ref = vld1_u8(ref);
+    src += src_stride;  // pointers advance early; this row's data is already in registers
+    ref += ref_stride;
+    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);  // vec_accum += |src - ref|, widened to 16 bits
+  }
+  return horizontal_add_16x8(vec_accum);  // reduce the 8 lanes to one scalar
+}
diff --git a/libs/libvpx/vpx_dsp/arm/save_reg_neon.asm b/libs/libvpx/vpx_dsp/arm/save_reg_neon.asm
new file mode 100644
index 0000000000..c9ca10801d
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/save_reg_neon.asm
@@ -0,0 +1,36 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_push_neon|
+    EXPORT  |vpx_pop_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_push_neon| PROC
+    vst1.i64            {d8, d9, d10, d11}, [r0]!      ; store d8-d11 (32 bytes) at [r0], then advance r0
+    vst1.i64            {d12, d13, d14, d15}, [r0]!    ; store d12-d15 (32 bytes); the buffer at r0 must hold 64 bytes in total
+    bx              lr                                 ; return to caller
+
+    ENDP
+
+|vpx_pop_neon| PROC
+    vld1.i64            {d8, d9, d10, d11}, [r0]!      ; reload d8-d11 (32 bytes) from [r0], then advance r0
+    vld1.i64            {d12, d13, d14, d15}, [r0]!    ; reload d12-d15; same 64-byte layout that vpx_push_neon writes
+    bx              lr                                 ; return to caller
+
+    ENDP
+
+    END
+
diff --git a/libs/libvpx/vpx_dsp/arm/subpel_variance_media.c b/libs/libvpx/vpx_dsp/arm/subpel_variance_media.c
new file mode 100644
index 0000000000..e7d8c85fb5
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/subpel_variance_media.c
@@ -0,0 +1,105 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_MEDIA
+static const int16_t bilinear_filters_media[8][2] = {  // two-tap weights indexed by offset 0..7; each pair sums to 128
+  { 128,   0 },  // offset 0: first tap only
+  { 112,  16 },
+  {  96,  32 },
+  {  80,  48 },
+  {  64,  64 },  // offset 4: equal weights
+  {  48,  80 },
+  {  32,  96 },
+  {  16, 112 }
+};
+
+extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr,  // defined outside this file; 8-bit input
+                                                    uint16_t *dst_ptr,  // 16-bit intermediate output
+                                                    uint32_t src_pitch,
+                                                    uint32_t height,
+                                                    uint32_t width,
+                                                    const int16_t *filter);  // callers here pass one row of bilinear_filters_media
+
+extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr,  // defined outside this file; 16-bit input
+                                                     uint8_t *dst_ptr,  // 8-bit output
+                                                     int32_t src_pitch,
+                                                     uint32_t height,
+                                                     uint32_t width,
+                                                     const int16_t *filter);  // callers here pass one row of bilinear_filters_media
+
+
+unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr,  // filters an 8x8 source block in two passes, then takes its variance against dst
+                                             int src_pixels_per_line,
+                                             int xoffset, int yoffset,  // row indices into bilinear_filters_media
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse) {  // forwarded to vpx_variance8x8_media
+  uint16_t first_pass[10*8];  // the first pass below is asked for 9 rows of 8
+  uint8_t  second_pass[8*8];  // final filtered 8x8 block
+  const int16_t *HFilter, *VFilter;
+
+  HFilter = bilinear_filters_media[xoffset];  // NOTE(review): no range check; the table has 8 rows - confirm callers pass 0..7
+  VFilter = bilinear_filters_media[yoffset];
+
+  vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+                                          src_pixels_per_line,
+                                          9, 8, HFilter);  // height 9, width 8: one row more than the output height
+  vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
+                                           8, 8, 8, VFilter);  // src_pitch 8, height 8, width 8
+
+  return vpx_variance8x8_media(second_pass, 8, dst_ptr,  // second_pass is passed with a stride of 8
+                               dst_pixels_per_line, sse);
+}
+
+unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr,  // 16x16 sub-pixel variance of src against dst
+                                               int src_pixels_per_line,
+                                               int xoffset,  // row index into bilinear_filters_media
+                                               int yoffset,  // row index into bilinear_filters_media
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse) {  // forwarded to whichever variance routine is called
+  uint16_t first_pass[36*16];  // larger than the 17 rows of 16 requested from the first pass below
+  uint8_t  second_pass[20*16];  // larger than the 16 rows of 16 requested from the second pass below
+  const int16_t *HFilter, *VFilter;
+  unsigned int var;
+
+  if (xoffset == 4 && yoffset == 0) {  // offset 4 is the equal-weight row {64, 64}: horizontal-only case
+    var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line,
+                                               sse);
+  } else if (xoffset == 0 && yoffset == 4) {  // vertical-only equal-weight case
+    var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line,
+                                               sse);
+  } else if (xoffset == 4 && yoffset == 4) {  // equal weights in both directions
+    var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line,
+                                                dst_ptr, dst_pixels_per_line,
+                                                sse);
+  } else {  // general case: two-pass filter, then plain 16x16 variance
+    HFilter = bilinear_filters_media[xoffset];  // NOTE(review): no range check; the table has 8 rows - confirm callers pass 0..7
+    VFilter = bilinear_filters_media[yoffset];
+
+    vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+                                            src_pixels_per_line,
+                                            17, 16, HFilter);  // height 17, width 16: one row more than the output height
+    vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
+                                             16, 16, 16, VFilter);  // src_pitch 16, height 16, width 16
+
+    var = vpx_variance16x16_media(second_pass, 16, dst_ptr,  // second_pass is passed with a stride of 16
+                                  dst_pixels_per_line, sse);
+  }
+  return var;
+}
+#endif  // HAVE_MEDIA
diff --git a/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c
new file mode 100644
index 0000000000..40e2cc89b3
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c
@@ -0,0 +1,152 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters[8][2] = {  // {tap0, tap1}; row picked by xoffset/yoffset
+  { 128,   0, },  // offset 0: first tap only
+  { 112,  16, },
+  {  96,  32, },
+  {  80,  48, },
+  {  64,  64, },  // offset 4: equal weights
+  {  48,  80, },
+  {  32,  96, },
+  {  16, 112, },
+};  // every pair sums to 128
+
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,  // 2-tap filter, 8 pixels per row
+                                      uint8_t *output_ptr,
+                                      unsigned int src_pixels_per_line,  // source row stride
+                                      int pixel_step,  // offset from the first tap to the second
+                                      unsigned int output_height,  // number of rows produced
+                                      unsigned int output_width,  // output row stride; 8 bytes are stored per row regardless
+                                      const uint8_t *filter) {  // filter[0], filter[1]: tap weights
+  const uint8x8_t f0 = vmov_n_u8(filter[0]);  // broadcast first weight to all lanes
+  const uint8x8_t f1 = vmov_n_u8(filter[1]);  // broadcast second weight to all lanes
+  unsigned int i;
+  for (i = 0; i < output_height; ++i) {
+    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
+    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
+    const uint16x8_t a = vmull_u8(src_0, f0);  // src_0 * f0, widened to 16 bits
+    const uint16x8_t b = vmlal_u8(a, src_1, f1);  // a + src_1 * f1
+    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);  // rounding shift right, narrowed to 8 bits
+    vst1_u8(&output_ptr[0], out);
+    // Advance both pointers to the next row.
+    src_ptr += src_pixels_per_line;
+    output_ptr += output_width;
+  }
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,  // 2-tap filter, 16 pixels per step
+                                       uint8_t *output_ptr,
+                                       unsigned int src_pixels_per_line,  // source row stride
+                                       int pixel_step,  // offset from the first tap to the second
+                                       unsigned int output_height,  // number of rows produced
+                                       unsigned int output_width,  // pixels per row and output row stride
+                                       const uint8_t *filter) {  // filter[0], filter[1]: tap weights
+  const uint8x8_t f0 = vmov_n_u8(filter[0]);  // broadcast first weight to all lanes
+  const uint8x8_t f1 = vmov_n_u8(filter[1]);  // broadcast second weight to all lanes
+  unsigned int i, j;
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; j += 16) {  // each pass loads and stores a full 16 bytes
+      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
+      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
+      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);  // low 8 pixels: src_0 * f0
+      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);  // + src_1 * f1
+      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);  // rounding shift, narrow to 8 bits
+      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);  // high 8 pixels: src_0 * f0
+      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);  // + src_1 * f1
+      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);  // rounding shift, narrow to 8 bits
+      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
+    }
+    // Advance both pointers to the next row.
+    src_ptr += src_pixels_per_line;
+    output_ptr += output_width;
+  }
+}
+
+unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,  // 8x8: filter src, then take variance against dst
+                                            int src_stride,
+                                            int xoffset,  // row of bilinear_filters for the first pass
+                                            int yoffset,  // row of bilinear_filters for the second pass
+                                            const uint8_t *dst,
+                                            int dst_stride,
+                                            unsigned int *sse) {  // forwarded to vpx_variance8x8_neon
+  DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);  // second-pass output
+  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);  // first-pass output: 9 rows, the second pass reads row i and i + 1
+
+  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,  // first pass: taps 1 pixel apart
+                            9, 8,
+                            bilinear_filters[xoffset]);
+  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,  // second pass: taps one 8-byte row apart
+                            8, bilinear_filters[yoffset]);
+  return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,  // 16x16: filter src, then take variance against dst
+                                              int src_stride,
+                                              int xoffset,  // row of bilinear_filters for the first pass
+                                              int yoffset,  // row of bilinear_filters for the second pass
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {  // forwarded to vpx_variance16x16_neon
+  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);  // second-pass output
+  DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);  // first-pass output: 17 rows, the second pass reads row i and i + 1
+
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,  // first pass: taps 1 pixel apart
+                             17, 16,
+                             bilinear_filters[xoffset]);
+  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,  // second pass: taps one 16-byte row apart
+                             16, bilinear_filters[yoffset]);
+  return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,  // 32x32: filter src, then take variance against dst
+                                              int src_stride,
+                                              int xoffset,  // row of bilinear_filters for the first pass
+                                              int yoffset,  // row of bilinear_filters for the second pass
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {  // forwarded to vpx_variance32x32_neon
+  DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);  // second-pass output
+  DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);  // first-pass output: 33 rows, the second pass reads row i and i + 1
+
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,  // first pass: taps 1 pixel apart
+                             33, 32,
+                             bilinear_filters[xoffset]);
+  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,  // second pass: taps one 32-byte row apart
+                             32, bilinear_filters[yoffset]);
+  return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+}
+
+unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,  // 64x64: filter src, then take variance against dst
+                                              int src_stride,
+                                              int xoffset,  // row of bilinear_filters for the first pass
+                                              int yoffset,  // row of bilinear_filters for the second pass
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {  // forwarded to vpx_variance64x64_neon
+  DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);  // second-pass output
+  DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);  // first-pass output: 65 rows, the second pass reads row i and i + 1
+
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,  // first pass: taps 1 pixel apart
+                             65, 64,
+                             bilinear_filters[xoffset]);
+  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,  // second pass: taps one 64-byte row apart
+                             64, bilinear_filters[yoffset]);
+  return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+}
diff --git a/libs/libvpx/vpx_dsp/arm/subtract_neon.c b/libs/libvpx/vpx_dsp/arm/subtract_neon.c
new file mode 100644
index 0000000000..7b146095ea
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/subtract_neon.c
@@ -0,0 +1,80 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_subtract_block_neon(int rows, int cols,  // diff = src - pred over a rows x cols block
+                             int16_t *diff, ptrdiff_t diff_stride,
+                             const uint8_t *src, ptrdiff_t src_stride,
+                             const uint8_t *pred, ptrdiff_t pred_stride) {
+  int r, c;
+
+  if (cols > 16) {  // 32 columns per inner step; NOTE(review): assumes cols is a multiple of 32 - confirm callers
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; c += 32) {
+        const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
+        const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
+        const uint8x16_t v_pred_00 = vld1q_u8(&pred[c +  0]);
+        const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
+        const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00),  // widening subtract; wraps modulo 2^16
+                                                 vget_low_u8(v_pred_00));
+        const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00),
+                                                 vget_high_u8(v_pred_00));
+        const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16),
+                                                 vget_low_u8(v_pred_16));
+        const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16),
+                                                 vget_high_u8(v_pred_16));
+        vst1q_s16(&diff[c +  0], vreinterpretq_s16_u16(v_diff_lo_00));  // same bits viewed as signed = src - pred
+        vst1q_s16(&diff[c +  8], vreinterpretq_s16_u16(v_diff_hi_00));
+        vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
+        vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
+      }
+      diff += diff_stride;
+      pred += pred_stride;
+      src  += src_stride;
+    }
+  } else if (cols > 8) {  // always processes 16 columns per row; NOTE(review): presumably cols == 16 here - confirm
+    for (r = 0; r < rows; ++r) {
+      const uint8x16_t v_src = vld1q_u8(&src[0]);
+      const uint8x16_t v_pred = vld1q_u8(&pred[0]);
+      const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src),
+                                            vget_low_u8(v_pred));
+      const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src),
+                                            vget_high_u8(v_pred));
+      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
+      vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
+      diff += diff_stride;
+      pred += pred_stride;
+      src  += src_stride;
+    }
+  } else if (cols > 4) {  // always processes 8 columns per row; NOTE(review): presumably cols == 8 here - confirm
+    for (r = 0; r < rows; ++r) {
+      const uint8x8_t v_src = vld1_u8(&src[0]);
+      const uint8x8_t v_pred = vld1_u8(&pred[0]);
+      const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
+      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
+      diff += diff_stride;
+      pred += pred_stride;
+      src  += src_stride;
+    }
+  } else {  // cols <= 4: scalar loop over exactly cols columns
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; ++c)
+        diff[c] = src[c] - pred[c];
+
+      diff += diff_stride;
+      pred += pred_stride;
+      src  += src_stride;
+    }
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm b/libs/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm
new file mode 100644
index 0000000000..dab845a204
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm
@@ -0,0 +1,182 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_variance_halfpixvar16x16_h_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vpx_variance_halfpixvar16x16_h_media| PROC
+
+    stmfd   sp!, {r4-r12, lr}   ; save 10 registers (40 bytes of stack)
+
+    pld     [r0, r1, lsl #0]    ; preload src at src_ptr + stride
+    pld     [r2, r3, lsl #0]    ; preload ref at ref_ptr + stride
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080      ; r10 = 0x80808080 (literal after ENDP)
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation: per byte (a + b + 1) >> 1, b = right neighbour
+    mvn     r6, r6              ; r6 = ~b per byte (255 - b)
+    uhsub8  r4, r4, r6          ; per byte: (a - ~b) >> 1
+    eor     r4, r4, r10         ; flip bit 7 -> (a + b + 1) >> 1
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]    ; preload src two strides ahead
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]    ; preload ref two strides ahead
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation (same sequence as for the 1st 4 pixels)
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation (same sequence as for the 1st 4 pixels)
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation (same sequence as for the 1st 4 pixels)
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    subs    r12, r12, #1        ; decrement row counter
+
+    bne     loop                ; repeat until all 16 rows are done
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; sse pointer: stack arg above the 40 saved bytes
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}   ; restore registers and return
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
diff --git a/libs/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm b/libs/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm
new file mode 100644
index 0000000000..01953b7094
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm
@@ -0,0 +1,222 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_variance_halfpixvar16x16_hv_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vpx_variance_halfpixvar16x16_hv_media| PROC
+
+    stmfd   sp!, {r4-r12, lr}   ; save 10 registers (40 bytes of stack)
+
+    pld     [r0, r1, lsl #0]    ; preload src at src_ptr + stride
+    pld     [r2, r3, lsl #0]    ; preload ref at ref_ptr + stride
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080      ; r10 = 0x80808080 (literal after ENDP)
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; pointer to pixels on the next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load source pixels a, row N
+    ldr     r6, [r0, #1]        ; load source pixels b, row N
+    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
+    ldr     r7, [r9, #1]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6              ; r6 = ~b per byte (255 - b)
+    uhsub8  r4, r4, r6          ; per byte: (a - ~b) >> 1
+    eor     r4, r4, r10         ; flip bit 7 -> x
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7              ; r7 = ~d per byte
+    uhsub8  r5, r5, r7          ; per byte: (c - ~d) >> 1
+    eor     r5, r5, r10         ; flip bit 7 -> y
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5              ; r5 = ~y per byte
+    uhsub8  r4, r4, r5          ; per byte: (x - ~y) >> 1
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+    eor     r4, r4, r10         ; flip bit 7 -> z
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]    ; preload src two strides ahead
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]    ; preload ref two strides ahead
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load source pixels a, row N
+    ldr     r6, [r0, #5]        ; load source pixels b, row N
+    ldr     r5, [r9, #4]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #5]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load source pixels a, row N
+    ldr     r6, [r0, #9]        ; load source pixels b, row N
+    ldr     r5, [r9, #8]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #9]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load source pixels a, row N
+    ldr     r6, [r0, #13]       ; load source pixels b, row N
+    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+    ldr     r7, [r9, #13]       ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; decrement row counter; flags are tested by bne below
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop                ; repeat until all 16 rows are done
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; sse pointer: stack arg above the 40 saved bytes
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}   ; restore registers and return
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm b/libs/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm
new file mode 100644
index 0000000000..0d17acb38f
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm
@@ -0,0 +1,184 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_variance_halfpixvar16x16_v_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vpx_variance_halfpixvar16x16_v_media| PROC
+
+    stmfd   sp!, {r4-r12, lr}   ; save 10 registers (40 bytes of stack)
+
+    pld     [r0, r1, lsl #0]    ; preload src at src_ptr + stride
+    pld     [r2, r3, lsl #0]    ; preload ref at ref_ptr + stride
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080      ; r10 = 0x80808080 (literal after ENDP)
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; r9 = pointer to the next src row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r9, #0]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation: per byte (a + b + 1) >> 1, b = pixel one row below
+    mvn     r6, r6              ; r6 = ~b per byte (255 - b)
+    uhsub8  r4, r4, r6          ; per byte: (a - ~b) >> 1
+    eor     r4, r4, r10         ; flip bit 7 -> (a + b + 1) >> 1
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]    ; preload src two strides ahead
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]    ; preload ref two strides ahead
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation (same sequence as for the 1st 4 pixels)
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation (same sequence as for the 1st 4 pixels)
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation (same sequence as for the 1st 4 pixels)
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1        ; decrement row counter
+
+    bne     loop                ; repeat until all 16 rows are done
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; sse pointer: stack arg above the 40 saved bytes
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}   ; restore registers and return
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
diff --git a/libs/libvpx/vpx_dsp/arm/variance_media.asm b/libs/libvpx/vpx_dsp/arm/variance_media.asm
new file mode 100644
index 0000000000..f7f9e14b0a
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/variance_media.asm
@@ -0,0 +1,358 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_variance16x16_media|
+    EXPORT  |vpx_variance8x8_media|
+    EXPORT  |vpx_mse16x16_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr    (16x16 source block)
+; r1    int source_stride         (bytes from one source row to the next)
+; r2    unsigned char *ref_ptr    (16x16 reference block)
+; r3    int  recon_stride         (bytes from one reference row to the next)
+; stack unsigned int *sse         (out: sum of squared differences)
+|vpx_variance16x16_media| PROC
+    ; Stores sse to *sse and returns sse - ((sum * sum) >> 8) in r0.
+    stmfd   sp!, {r4-r12, lr}   ; 10 registers (40 bytes): sse is now at [sp, #40]
+
+    pld     [r0, r1, lsl #0]    ; prefetch src + source_stride
+    pld     [r2, r3, lsl #0]    ; prefetch ref + recon_stride
+
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+
+loop16x16
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]    ; prefetch src + 2 * source_stride
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]    ; prefetch ref + 2 * recon_stride
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1        ; one row done; sets the flags tested by bne
+
+    bne     loop16x16
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}   ; restore saved registers and return
+
+    ENDP
+
+; r0    unsigned char *src_ptr    (8x8 source block)
+; r1    int source_stride         (bytes from one source row to the next)
+; r2    unsigned char *ref_ptr    (8x8 reference block)
+; r3    int  recon_stride         (bytes from one reference row to the next)
+; stack unsigned int *sse         (out: sum of squared differences)
+|vpx_variance8x8_media| PROC
+    ; Stores sse to *sse and returns sse - ((sum * sum) >> 6) in r0.
+    push    {r4-r10, lr}        ; 8 registers (32 bytes): sse is now at [sp, #32]
+
+    pld     [r0, r1, lsl #0]    ; prefetch src + source_stride
+    pld     [r2, r3, lsl #0]    ; prefetch ref + recon_stride
+
+    mov     r12, #8             ; set loop counter to 8 (=block height)
+    mov     r4, #0              ; initialize sum = 0
+    mov     r5, #0              ; initialize sse = 0
+
+loop8x8
+    ; 1st 4 pixels
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels
+    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r6, r7          ; calculate difference
+    pld     [r0, r1, lsl #1]    ; prefetch src + 2 * source_stride
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]    ; prefetch ref + 2 * recon_stride
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+    ; calculate total sum
+    add    r4, r4, r6           ; add positive differences to sum
+    sub    r4, r4, r7           ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r0, #0x4]      ; load 4 src pixels
+    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r6, r7          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; next row (sets the flags tested by bne below)
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop8x8
+
+    ; return stuff
+    ldr     r8, [sp, #32]       ; get address of sse
+    mul     r1, r4, r4          ; sum * sum
+    str     r5, [r8]            ; store sse
+    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
+
+    pop     {r4-r10, pc}        ; restore saved registers and return
+
+    ENDP
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;
+; Computes the sum of squared differences between a 16x16 source block and
+; a 16x16 reference block. The result is stored to *sse and returned in r0.
+;
+; Each row is handled as four groups of 4 pixels: usub8 in both operand
+; orders plus sel (against zero in lr) gives the two one-sided differences,
+; orr merges them into one difference per byte, and uxtb16/smlad square and
+; accumulate the four byte values into r4.
+;
+;note: Based on vpx_variance16x16_media. In this function, sum is never used,
+;      so no usad8 partial sums are calculated.
+
+|vpx_mse16x16_media| PROC
+
+    push    {r4-r9, lr}         ; 7 registers (28 bytes): sse is now at [sp, #28]
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r4, #0              ; initialize sse = 0
+
+loopmse
+    ; 1st 4 pixels
+    ldr     r5, [r0, #0x0]      ; load 4 src pixels
+    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r5, r6          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; merge both one-sided differences (no partial sums: sum is unused)
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0x4]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; merge both one-sided differences (no partial sums: sum is unused)
+    orr     r8, r8, r7          ; differences of all 4 pixels
+    ldr     r5, [r0, #0x8]      ; load 4 src pixels
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; merge both one-sided differences (no partial sums: sum is unused)
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0xc]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; merge both one-sided differences (no partial sums: sum is unused)
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    subs    r12, r12, #1        ; next row
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    bne     loopmse
+
+    ; return stuff
+    ldr     r1, [sp, #28]       ; get address of sse
+    mov     r0, r4              ; return sse
+    str     r4, [r1]            ; store sse
+
+    pop     {r4-r9, pc}
+
+    ENDP
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/variance_neon.c b/libs/libvpx/vpx_dsp/arm/variance_neon.c
new file mode 100644
index 0000000000..ede6e7bbb0
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/variance_neon.c
@@ -0,0 +1,418 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+  // Widening pairwise adds: eight s16 lanes -> four s32 -> two s64.
+  const int64x2_t pairs = vpaddlq_s32(vpaddlq_s16(v_16x8));
+  const int32x2_t lo = vreinterpret_s32_s64(vget_low_s64(pairs));
+  const int32x2_t hi = vreinterpret_s32_s64(vget_high_s64(pairs));
+  return vget_lane_s32(vadd_s32(lo, hi), 0);  // lane 0: low 32 bits of total
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+  const int64x2_t pairs = vpaddlq_s32(v_32x4);  // four s32 lanes -> two s64
+  const int32x2_t lo = vreinterpret_s32_s64(vget_low_s64(pairs));
+  const int32x2_t hi = vreinterpret_s32_s64(vget_high_s64(pairs));
+  return vget_lane_s32(vadd_s32(lo, hi), 0);  // lane 0: low 32 bits of total
+}
+
+// Computes the sum of differences (*sum) and the sum of squared differences
+// (*sse) between two w x h blocks. Each row is walked in steps of 8 pixels,
+// loading 8 bytes from a and from b per step.
+// Each of the 8 int16 lanes of the running sum accumulates w * h / 8
+// differences in [-255, 255], so w * h must not exceed 1024 (128 * 255 =
+// 32640 still fits in int16) or the sum may overflow.
+static void variance_neon_w8(const uint8_t *a, int a_stride,
+                             const uint8_t *b, int b_stride,
+                             int w, int h, uint32_t *sse, int *sum) {
+  int row, col;
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int32x4_t sse_lo_s32 = vdupq_n_s32(0);
+  int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+
+  for (row = 0; row < h; ++row, a += a_stride, b += b_stride) {
+    for (col = 0; col < w; col += 8) {
+      // vsubl_u8 wraps modulo 2^16; viewed as s16 that is the signed a - b.
+      const int16x8_t diff =
+          vreinterpretq_s16_u16(vsubl_u8(vld1_u8(a + col), vld1_u8(b + col)));
+      const int16x4_t diff_lo = vget_low_s16(diff);
+      const int16x4_t diff_hi = vget_high_s16(diff);
+      sum_s16 = vaddq_s16(sum_s16, diff);
+      sse_lo_s32 = vmlal_s16(sse_lo_s32, diff_lo, diff_lo);
+      sse_hi_s32 = vmlal_s16(sse_hi_s32, diff_hi, diff_hi);
+    }
+  }
+
+  *sum = horizontal_add_s16x8(sum_s16);
+  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32));
+}
+
+void vpx_get8x8var_neon(const uint8_t *a, int a_stride,
+                        const uint8_t *b, int b_stride,
+                        unsigned int *sse, int *sum) {
+  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);  // w = h = 8
+}
+
+void vpx_get16x16var_neon(const uint8_t *a, int a_stride,
+                          const uint8_t *b, int b_stride,
+                          unsigned int *sse, int *sum) {
+  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);  // w = h = 16
+}
+
+unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
+                                  const uint8_t *b, int b_stride,
+                                  unsigned int *sse) {
+  int sum;  // sum of the 64 differences, filled in by variance_neon_w8()
+  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 6);  // >> 6 divides by 8 * 8 pixels
+}
+
+unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum;  // sum of the 256 differences, filled in by variance_neon_w8()
+  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 8);  // >> 8 divides by 16 * 16 pixels
+}
+
+unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum;  // sum of the 1024 differences, filled in by variance_neon_w8()
+  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 divides by 32 * 32
+}
+
+unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int top_sum, bottom_sum, total_sum;
+  uint32_t top_sse, bottom_sse;
+  // Two 32x32 halves: rows 0..31 (top) and rows 32..63 (bottom).
+  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &top_sse, &top_sum);
+  variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride),
+                   b_stride, 32, 32, &bottom_sse, &bottom_sum);
+  total_sum = top_sum + bottom_sum;
+  *sse = top_sse + bottom_sse;
+  return *sse - (((int64_t)total_sum * total_sum) >> 11);  // / (32 * 64)
+}
+
+unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int upper_sum, lower_sum, total_sum;
+  uint32_t upper_sse, lower_sse;
+  // Two 64x16 strips: rows 0..15 (upper) and rows 16..31 (lower).
+  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &upper_sse, &upper_sum);
+  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride),
+                   b_stride, 64, 16, &lower_sse, &lower_sum);
+  total_sum = upper_sum + lower_sum;
+  *sse = upper_sse + lower_sse;
+  return *sse - (((int64_t)total_sum * total_sum) >> 11);  // / (64 * 32)
+}
+
+unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  // The block is handled as four 64x16 strips, one variance_neon_w8() call
+  // of 1024 pixels each; the per-strip results are added up in 32-bit
+  // totals.
+  int strip;
+  int total_sum = 0;
+  uint32_t total_sse = 0;
+
+  for (strip = 0; strip < 4; ++strip) {
+    int strip_sum;
+    uint32_t strip_sse;
+
+    variance_neon_w8(a + (16 * strip * a_stride), a_stride,
+                     b + (16 * strip * b_stride), b_stride,
+                     64, 16, &strip_sse, &strip_sum);
+    total_sse += strip_sse;
+    total_sum += strip_sum;
+  }
+
+  *sse = total_sse;
+  // Variance is sse - sum^2 / (64 * 64); the division by 4096 is the
+  // right shift by 12.
+  return *sse - (((int64_t)total_sum * total_sum) >> 12);
+}
+
+unsigned int vpx_variance16x8_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+    // Variance of a 16-wide, 8-high block; the SSE is also stored to *sse.
+    q8s32 = vdupq_n_s32(0);   // running sum of differences
+    q9s32 = vdupq_n_s32(0);   // running sum of squares (low halves)
+    q10s32 = vdupq_n_s32(0);  // running sum of squares (high halves)
+
+    for (i = 0; i < 4; i++) {  // two 16-pixel rows per iteration
+        q0u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q1u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        q2u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q3u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        __builtin_prefetch(ref_ptr);
+        // src - ref, widened to 16 bits (reinterpreted as signed below)
+        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+        // per difference vector: add into the sum, square-accumulate into sse
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);  // merge the two sse accumulators
+    q0s64 = vpaddlq_s32(q8s32);         // sum: 4 x s32 -> 2 x s64
+    q1s64 = vpaddlq_s32(q10s32);        // sse: 4 x s32 -> 2 x s64
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));  // sum
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));  // sse
+    // lane 0 of q5s64 is sum * sum (low 32 bits of d0s64, squared)
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+    // return sse - ((sum * sum) >> 7), where 1 << 7 = 16 * 8 pixels
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vpx_variance8x16_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    uint8x8_t d0u8, d2u8, d4u8, d6u8;
+    int16x4_t d22s16, d23s16, d24s16, d25s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint16x8_t q11u16, q12u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+    // Variance of an 8-wide, 16-high block; the SSE is also stored to *sse.
+    q8s32 = vdupq_n_s32(0);   // running sum of differences
+    q9s32 = vdupq_n_s32(0);   // running sum of squares (low halves)
+    q10s32 = vdupq_n_s32(0);  // running sum of squares (high halves)
+
+    for (i = 0; i < 8; i++) {  // two 8-pixel rows per iteration
+        d0u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        d2u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        d4u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        d6u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        __builtin_prefetch(ref_ptr);
+        // src - ref, widened to 16 bits (reinterpreted as signed below)
+        q11u16 = vsubl_u8(d0u8, d4u8);
+        q12u16 = vsubl_u8(d2u8, d6u8);
+        // per difference vector: add into the sum, square-accumulate into sse
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);  // merge the two sse accumulators
+    q0s64 = vpaddlq_s32(q8s32);         // sum: 4 x s32 -> 2 x s64
+    q1s64 = vpaddlq_s32(q10s32);        // sse: 4 x s32 -> 2 x s64
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));  // sum
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));  // sse
+    // lane 0 of q5s64 is sum * sum (low 32 bits of d0s64, squared)
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+    // return sse - ((sum * sum) >> 7), where 1 << 7 = 8 * 16 pixels
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vpx_mse16x16_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    int64x1_t d0s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    int32x4_t q7s32, q8s32, q9s32, q10s32;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int64x2_t q1s64;
+    // Sum of squared differences of a 16x16 block; stored to *sse and returned.
+    q7s32 = vdupq_n_s32(0);   // four independent accumulators of squares,
+    q8s32 = vdupq_n_s32(0);   // merged after the loop
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 8; i++) {  // mse16x16_neon_loop: two rows per iteration
+        q0u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q1u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q2u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q3u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        // src - ref, widened to 16 bits (reinterpreted as signed below)
+        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+        // square each difference and accumulate as 32-bit values
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
+        q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
+        q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+    // merge the four accumulators into one vector
+    q7s32 = vaddq_s32(q7s32, q8s32);
+    q9s32 = vaddq_s32(q9s32, q10s32);
+    q10s32 = vaddq_s32(q7s32, q9s32);
+    // horizontal add: 4 x s32 -> 2 x s64 -> 1 x s64
+    q1s64 = vpaddlq_s32(q10s32);
+    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+    // the low 32 bits of the total are both stored and returned
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
+    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
+
+unsigned int vpx_get4x4sse_cs_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride) {
+    int16x4_t d22s16, d24s16, d26s16, d28s16;
+    int64x1_t d0s64;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    int32x4_t q7s32, q8s32, q9s32, q10s32;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int64x2_t q1s64;
+    // Returns the sum of squared differences over 4 rows x 4 pixels.
+    // NOTE(review): each vld1_u8 loads 8 bytes per row although only the low
+    // 4 lanes are used below; confirm callers' buffers tolerate the over-read.
+    d0u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d4u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d1u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d5u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d2u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d6u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d3u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;  // src_ptr is not read again after this
+    d7u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;   // ref_ptr is not read again after this
+    // src - ref per row, widened to 16 bits
+    q11u16 = vsubl_u8(d0u8, d4u8);
+    q12u16 = vsubl_u8(d1u8, d5u8);
+    q13u16 = vsubl_u8(d2u8, d6u8);
+    q14u16 = vsubl_u8(d3u8, d7u8);
+    // keep only the low 4 lanes (the 4 pixels of each row), as signed values
+    d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
+    d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
+    d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
+    d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
+    // square the differences of each row
+    q7s32 = vmull_s16(d22s16, d22s16);
+    q8s32 = vmull_s16(d24s16, d24s16);
+    q9s32 = vmull_s16(d26s16, d26s16);
+    q10s32 = vmull_s16(d28s16, d28s16);
+    // add the four rows together
+    q7s32 = vaddq_s32(q7s32, q8s32);
+    q9s32 = vaddq_s32(q9s32, q10s32);
+    q9s32 = vaddq_s32(q7s32, q9s32);
+    // horizontal add: 4 x s32 -> 2 x s64 -> 1 x s64
+    q1s64 = vpaddlq_s32(q9s32);
+    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c
new file mode 100644
index 0000000000..8632250138
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c
@@ -0,0 +1,373 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+// Per-lane 8-tap dot product: lane i of the result is the 32-bit sum of
+// dsrcN[i] * q0s16[N] for N = 0..7.
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+    int16x4_t dsrc0,
+    int16x4_t dsrc1,
+    int16x4_t dsrc2,
+    int16x4_t dsrc3,
+    int16x4_t dsrc4,
+    int16x4_t dsrc5,
+    int16x4_t dsrc6,
+    int16x4_t dsrc7,
+    int16x8_t q0s16) {
+  const int16x4_t taps_lo = vget_low_s16(q0s16);   // coefficients 0..3
+  const int16x4_t taps_hi = vget_high_s16(q0s16);  // coefficients 4..7
+  int32x4_t acc = vmull_lane_s16(dsrc0, taps_lo, 0);
+
+  // Widening multiply-accumulate for the remaining seven taps.
+  acc = vmlal_lane_s16(acc, dsrc1, taps_lo, 1);
+  acc = vmlal_lane_s16(acc, dsrc2, taps_lo, 2);
+  acc = vmlal_lane_s16(acc, dsrc3, taps_lo, 3);
+  acc = vmlal_lane_s16(acc, dsrc4, taps_hi, 0);
+  acc = vmlal_lane_s16(acc, dsrc5, taps_hi, 1);
+  acc = vmlal_lane_s16(acc, dsrc6, taps_hi, 2);
+  acc = vmlal_lane_s16(acc, dsrc7, taps_hi, 3);
+  return acc;
+}
+// 8-tap horizontal convolution of src, rounded and saturated to 8 bits, then
+// averaged with rounding (vrhaddq_u8) into dst.  Works on 4x4 pixel tiles.
+void vpx_convolve8_avg_horiz_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,  // 8 taps, loaded with a single vld1q_s16
+    int x_step_q4,            // must be 16 (asserted below)
+    const int16_t *filter_y,  // unused
+    int y_step_q4,            // unused
+    int w,                    // consumed 4 columns per inner iteration
+    int h) {                  // consumed 4 rows per outer iteration
+  int width;
+  const uint8_t *s;
+  uint8_t *d;
+  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+  uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
+  uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+  uint16x8x2_t q0x2u16;
+  uint8x8x2_t d0x2u8, d1x2u8;
+  uint32x2x2_t d0x2u32;
+  uint16x4x2_t d0x2u16, d1x2u16;
+  uint32x4x2_t q0x2u32;
+
+  assert(x_step_q4 == 16);
+
+  q0s16 = vld1q_s16(filter_x);
+
+  src -= 3;  // adjust for taps
+  for (; h > 0; h -= 4) {  // loop_horiz_v
+    s = src;
+    d24u8 = vld1_u8(s);
+    s += src_stride;
+    d25u8 = vld1_u8(s);
+    s += src_stride;
+    d26u8 = vld1_u8(s);
+    s += src_stride;
+    d27u8 = vld1_u8(s);
+    // Transpose the 4 rows x 8 bytes so each vector holds source columns.
+    q12u8 = vcombine_u8(d24u8, d25u8);
+    q13u8 = vcombine_u8(d26u8, d27u8);
+
+    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+                        vreinterpretq_u16_u8(q13u8));
+    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+    d0x2u8 = vtrn_u8(d24u8, d25u8);
+    d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+    __builtin_prefetch(src + src_stride * 4);
+    __builtin_prefetch(src + src_stride * 5);
+
+    q8u16 = vmovl_u8(d0x2u8.val[0]);
+    q9u16 = vmovl_u8(d0x2u8.val[1]);
+    q10u16 = vmovl_u8(d1x2u8.val[0]);
+    q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+    src += 7;  // the inner loop loads start 7 bytes into the row
+    d16u16 = vget_low_u16(q8u16);
+    d17u16 = vget_high_u16(q8u16);
+    d18u16 = vget_low_u16(q9u16);
+    d19u16 = vget_high_u16(q9u16);
+    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
+    q9u16 = vcombine_u16(d17u16, d19u16);
+
+    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
+    for (width = w;
+         width > 0;
+         width -= 4, src += 4, dst += 4) {  // loop_horiz
+      s = src;
+      d28u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d29u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d31u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+      __builtin_prefetch(src + 64);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+                         vreinterpret_u16_u32(d31u32));
+      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+                         vreinterpret_u16_u32(d30u32));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
+                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
+      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
+                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
+
+      __builtin_prefetch(src + 64 + src_stride);
+
+      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+                          vreinterpretq_u32_u8(q15u8));
+
+      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+      q12u16 = vmovl_u8(d28u8);
+      q13u16 = vmovl_u8(d29u8);
+
+      __builtin_prefetch(src + 64 + src_stride * 2);
+      // Load the 4x4 dst tile as d6 = rows 0,2 and d7 = rows 1,3 (store order).
+      d = dst;
+      d6u32 = vld1_dup_u32((const uint32_t *)d);  // defines both lanes
+      d += dst_stride;
+      d7u32 = vld1_dup_u32((const uint32_t *)d);  // defines both lanes
+      d += dst_stride;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+      // Four 8-tap sums, one per output column of the tile.
+      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+                              d18s16, d19s16, d23s16, d24s16, q0s16);
+      q2s32  = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+                              d19s16, d23s16, d24s16, d26s16, q0s16);
+      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+                              d23s16, d24s16, d26s16, d27s16, q0s16);
+      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+
+      __builtin_prefetch(src + 64 + src_stride * 3);
+      // Round, shift right by 7 and saturate to unsigned 16 bits.
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+      // Transpose back so each 32-bit lane holds 4 pixels of one dst row.
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+                         vreinterpret_u16_u8(d3u8));
+      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+                         vreinterpret_u32_u16(d0x2u16.val[1]));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+                       vreinterpret_u8_u32(d0x2u32.val[1]));
+
+      q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+      // Rounding average of the filtered pixels with the existing dst tile.
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+      d = dst;
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      // Slide the column window four pixels to the right.
+      q8u16 = q9u16;
+      d20s16 = d23s16;
+      q11u16 = q12u16;
+      q9u16 = q13u16;
+      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    }
+    src += src_stride * 4 - w - 7;
+    dst += dst_stride * 4 - w;
+  }
+}
+// 8-tap vertical convolution of src, rounded and saturated to 8 bits, then
+// averaged with rounding (vrhaddq_u8) into dst.  Works on 4x4 pixel tiles.
+void vpx_convolve8_avg_vert_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,  // unused
+    int x_step_q4,            // unused
+    const int16_t *filter_y,  // 8 taps, loaded with a single vld1q_s16
+    int y_step_q4,            // must be 16 (asserted below)
+    int w,                    // consumed 4 columns per outer iteration
+    int h) {                  // consumed 4 rows per inner iteration
+  int height;
+  const uint8_t *s;
+  uint8_t *d;
+  uint8x8_t d2u8, d3u8;
+  uint32x2_t d2u32, d3u32, d6u32, d7u32;
+  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+  uint8x16_t q1u8, q3u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+  assert(y_step_q4 == 16);
+
+  src -= src_stride * 3;
+  q0s16 = vld1q_s16(filter_y);
+  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
+    s = src;  // rows 0..6 of the tap window are preloaded below
+    d16u32 = vld1_dup_u32((const uint32_t *)s);  // dup defines both lanes
+    s += src_stride;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+    s += src_stride;
+    d18u32 = vld1_dup_u32((const uint32_t *)s);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+    s += src_stride;
+    d20u32 = vld1_dup_u32((const uint32_t *)s);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+    s += src_stride;
+    d22u32 = vld1_dup_u32((const uint32_t *)s);  // only the low half is used
+    s += src_stride;
+    // Widen to 16 bits; low/high halves of each q vector are consecutive rows.
+    q8u16  = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q9u16  = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d = dst;
+    for (height = h; height > 0; height -= 4) {  // loop_vert
+      d24u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d26u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+      s += src_stride;
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+      s += src_stride;
+      // d24 now holds window rows 7 and 10, d26 holds rows 8 and 9.
+      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+      // Load the 4x4 dst tile: d6 = rows 0,1 and d7 = rows 2,3.
+      d6u32 = vld1_dup_u32((const uint32_t *)d);
+      d += dst_stride;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+      d += dst_stride;
+      d7u32 = vld1_dup_u32((const uint32_t *)d);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+      d -= dst_stride * 3;
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      __builtin_prefetch(s);
+      __builtin_prefetch(s + src_stride);
+      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+                              d20s16, d21s16, d22s16, d24s16, q0s16);
+      __builtin_prefetch(s + src_stride * 2);
+      __builtin_prefetch(s + src_stride * 3);
+      q2s32  = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+                              d21s16, d22s16, d24s16, d26s16, q0s16);
+      __builtin_prefetch(d);
+      __builtin_prefetch(d + dst_stride);
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+                              d22s16, d24s16, d26s16, d27s16, q0s16);
+      __builtin_prefetch(d + dst_stride * 2);
+      __builtin_prefetch(d + dst_stride * 3);
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+      // Round, shift right by 7 and saturate to unsigned 16 bits.
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+      // Rounding average of the filtered pixels with the existing dst tile.
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      d += dst_stride;
+      // Slide the row window down by four rows.
+      q8u16 = q10u16;
+      d18s16 = d22s16;
+      d19s16 = d24s16;
+      q10u16 = q13u16;
+      d22s16 = d25s16;
+    }
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
new file mode 100644
index 0000000000..e279d570fc
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
@@ -0,0 +1,292 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    ; These functions are only valid when:
+    ; x_step_q4 == 16
+    ; w%4 == 0
+    ; h%4 == 0
+    ; taps == 8
+    ; VP9_FILTER_WEIGHT == 128
+    ; VP9_FILTER_SHIFT == 7
+
+    EXPORT  |vpx_convolve8_avg_horiz_neon|
+    EXPORT  |vpx_convolve8_avg_vert_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; 8-tap widening multiply-accumulate by the filter taps held in q0 (d0, d1)
+    MACRO
+    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
+    vmull.s16 $dst, $src0, d0[0]            ; dst  = src0 * tap 0 (32-bit lanes)
+    vmlal.s16 $dst, $src1, d0[1]            ; dst += src1 * tap 1
+    vmlal.s16 $dst, $src2, d0[2]            ; dst += src2 * tap 2
+    vmlal.s16 $dst, $src3, d0[3]            ; dst += src3 * tap 3
+    vmlal.s16 $dst, $src4, d1[0]            ; dst += src4 * tap 4
+    vmlal.s16 $dst, $src5, d1[1]            ; dst += src5 * tap 5
+    vmlal.s16 $dst, $src6, d1[2]            ; dst += src6 * tap 6
+    vmlal.s16 $dst, $src7, d1[3]            ; dst += src7 * tap 7
+    MEND
+
+; r0      const uint8_t *src
+; r1      int src_stride
+; r2      uint8_t *dst
+; r3      int dst_stride
+; sp[0]   const int16_t *filter_x
+; sp[4]   int x_step_q4
+; sp[8]   const int16_t *filter_y ; unused
+; sp[12]  int y_step_q4           ; unused
+; sp[16]  int w
+; sp[20]  int h
+; (sp offsets are in bytes at function entry, before registers are saved)
+|vpx_convolve8_avg_horiz_neon| PROC
+    push            {r4-r10, lr}            ; 8 registers: stack args now at sp + 32
+
+    sub             r0, r0, #3              ; adjust for taps
+
+    ldr             r5, [sp, #32]           ; filter_x
+    ldr             r6, [sp, #48]           ; w
+    ldr             r7, [sp, #52]           ; h
+
+    vld1.s16        {q0}, [r5]              ; filter_x
+
+    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
+    add             r8, r8, #4              ; -src_stride * 3 + 4
+
+    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
+    add             r4, r4, #4              ; -dst_stride * 3 + 4
+
+    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
+    sub             r9, r9, #7              ; r9 = src_stride * 4 - w - 7
+    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
+
+    mov             r10, r6                 ; saved copy of w (r6 counts down)
+
+vpx_convolve8_avg_loop_horiz_v
+    vld1.8          {d24}, [r0], r1
+    vld1.8          {d25}, [r0], r1
+    vld1.8          {d26}, [r0], r1
+    vld1.8          {d27}, [r0], r8         ; r8: back up 3 rows, advance 4 bytes
+    ; transpose the 4 rows of 8 bytes so each d register holds source columns
+    vtrn.16         q12, q13
+    vtrn.8          d24, d25
+    vtrn.8          d26, d27
+
+    pld             [r0, r1, lsl #2]
+
+    vmovl.u8        q8, d24
+    vmovl.u8        q9, d25
+    vmovl.u8        q10, d26
+    vmovl.u8        q11, d27
+
+    ; save a few instructions in the inner loop
+    vswp            d17, d18
+    vmov            d23, d21
+
+    add             r0, r0, #3              ; r0 is now 7 bytes past the row start
+
+vpx_convolve8_avg_loop_horiz
+    add             r5, r0, #64             ; r5 = base address for the prefetches
+
+    vld1.32         {d28[]}, [r0], r1
+    vld1.32         {d29[]}, [r0], r1
+    vld1.32         {d31[]}, [r0], r1
+    vld1.32         {d30[]}, [r0], r8       ; r8: back up 3 rows, advance 4 bytes
+
+    pld             [r5]
+
+    vtrn.16         d28, d31
+    vtrn.16         d29, d30
+    vtrn.8          d28, d29
+    vtrn.8          d31, d30
+
+    pld             [r5, r1]
+
+    ; extract to s16
+    vtrn.32         q14, q15
+    vmovl.u8        q12, d28
+    vmovl.u8        q13, d29
+
+    pld             [r5, r1, lsl #1]
+
+    ; slightly out of order load to match the existing data
+    vld1.u32        {d6[0]}, [r2], r3
+    vld1.u32        {d7[0]}, [r2], r3
+    vld1.u32        {d6[1]}, [r2], r3
+    vld1.u32        {d7[1]}, [r2], r3
+
+    sub             r2, r2, r3, lsl #2      ; reset for store
+
+    ; src[] * filter_x
+    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
+    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
+    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
+    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+    pld             [r5, -r8]               ; r5 + src_stride * 3 - 4
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    ; transpose
+    vtrn.16         d2, d3
+    vtrn.32         d2, d3
+    vtrn.8          d2, d3
+
+    ; average the new value and the dst value
+    vrhadd.u8       q1, q1, q3
+
+    vst1.u32        {d2[0]}, [r2@32], r3
+    vst1.u32        {d3[0]}, [r2@32], r3
+    vst1.u32        {d2[1]}, [r2@32], r3
+    vst1.u32        {d3[1]}, [r2@32], r4    ; r4: back up 3 rows, advance 4 bytes
+    ; slide the column window four pixels to the right
+    vmov            q8,  q9
+    vmov            d20, d23
+    vmov            q11, q12
+    vmov            q9,  q13
+
+    subs            r6, r6, #4              ; w -= 4
+    bgt             vpx_convolve8_avg_loop_horiz
+
+    ; outer loop
+    mov             r6, r10                 ; restore w counter
+    add             r0, r0, r9              ; src += src_stride * 4 - w - 7
+    add             r2, r2, r12             ; dst += dst_stride * 4 - w
+    subs            r7, r7, #4              ; h -= 4
+    bgt vpx_convolve8_avg_loop_horiz_v
+
+    pop             {r4-r10, pc}
+
+    ENDP
+
+|vpx_convolve8_avg_vert_neon| PROC
+    push            {r4-r8, lr}             ; 6 registers: stack args now at sp + 24
+
+    ; adjust for taps
+    sub             r0, r0, r1
+    sub             r0, r0, r1, lsl #1      ; r0 = src - src_stride * 3
+
+    ldr             r4, [sp, #32]           ; filter_y
+    ldr             r6, [sp, #40]           ; w
+    ldr             lr, [sp, #44]           ; h
+
+    vld1.s16        {q0}, [r4]              ; filter_y
+
+    lsl             r1, r1, #1              ; r1 = src_stride * 2
+    lsl             r3, r3, #1              ; r3 = dst_stride * 2
+
+vpx_convolve8_avg_loop_vert_h
+    mov             r4, r0                  ; r4 walks src rows 0, 2, 4, ...
+    add             r7, r0, r1, asr #1      ; r7 walks src rows 1, 3, 5, ...
+    mov             r5, r2                  ; r5 walks dst rows 0, 2, ...
+    add             r8, r2, r3, asr #1      ; r8 walks dst rows 1, 3, ...
+    mov             r12, lr                 ; h loop counter
+
+    vld1.u32        {d16[0]}, [r4], r1
+    vld1.u32        {d16[1]}, [r7], r1
+    vld1.u32        {d18[0]}, [r4], r1
+    vld1.u32        {d18[1]}, [r7], r1
+    vld1.u32        {d20[0]}, [r4], r1
+    vld1.u32        {d20[1]}, [r7], r1
+    vld1.u32        {d22[0]}, [r4], r1
+    ; widen the preloaded rows 0..6 of the tap window to 16 bits
+    vmovl.u8        q8, d16
+    vmovl.u8        q9, d18
+    vmovl.u8        q10, d20
+    vmovl.u8        q11, d22
+
+vpx_convolve8_avg_loop_vert
+    ; always process a 4x4 block at a time
+    vld1.u32        {d24[0]}, [r7], r1
+    vld1.u32        {d26[0]}, [r4], r1
+    vld1.u32        {d26[1]}, [r7], r1
+    vld1.u32        {d24[1]}, [r4], r1
+
+    ; extract to s16
+    vmovl.u8        q12, d24
+    vmovl.u8        q13, d26
+
+    vld1.u32        {d6[0]}, [r5@32], r3
+    vld1.u32        {d6[1]}, [r8@32], r3
+    vld1.u32        {d7[0]}, [r5@32], r3
+    vld1.u32        {d7[1]}, [r8@32], r3
+
+    pld             [r7]
+    pld             [r4]
+
+    ; src[] * filter_y
+    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
+
+    pld             [r7, r1]
+    pld             [r4, r1]
+
+    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
+
+    pld             [r5]
+    pld             [r8]
+
+    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+    pld             [r5, r3]
+    pld             [r8, r3]
+
+    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    ; average the new value and the dst value
+    vrhadd.u8       q1, q1, q3
+
+    sub             r5, r5, r3, lsl #1      ; reset for store
+    sub             r8, r8, r3, lsl #1
+
+    vst1.u32        {d2[0]}, [r5@32], r3
+    vst1.u32        {d2[1]}, [r8@32], r3
+    vst1.u32        {d3[0]}, [r5@32], r3
+    vst1.u32        {d3[1]}, [r8@32], r3
+    ; slide the row window down by four rows
+    vmov            q8, q10
+    vmov            d18, d22
+    vmov            d19, d24
+    vmov            q10, q13
+    vmov            d22, d25
+
+    subs            r12, r12, #4            ; h -= 4
+    bgt             vpx_convolve8_avg_loop_vert
+
+    ; outer loop
+    add             r0, r0, #4
+    add             r2, r2, #4
+    subs            r6, r6, #4              ; w -= 4
+    bgt             vpx_convolve8_avg_loop_vert_h
+
+    pop             {r4-r8, pc}
+
+    ENDP
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
new file mode 100644
index 0000000000..9bd715e2c6
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -0,0 +1,340 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+// Per-lane 8-tap dot product: lane i of the result is the 32-bit sum of
+// dsrcN[i] * q0s16[N] for N = 0..7.
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+    int16x4_t dsrc0,
+    int16x4_t dsrc1,
+    int16x4_t dsrc2,
+    int16x4_t dsrc3,
+    int16x4_t dsrc4,
+    int16x4_t dsrc5,
+    int16x4_t dsrc6,
+    int16x4_t dsrc7,
+    int16x8_t q0s16) {
+  const int16x4_t taps_lo = vget_low_s16(q0s16);   // coefficients 0..3
+  const int16x4_t taps_hi = vget_high_s16(q0s16);  // coefficients 4..7
+  int32x4_t acc = vmull_lane_s16(dsrc0, taps_lo, 0);
+
+  // Widening multiply-accumulate for the remaining seven taps.
+  acc = vmlal_lane_s16(acc, dsrc1, taps_lo, 1);
+  acc = vmlal_lane_s16(acc, dsrc2, taps_lo, 2);
+  acc = vmlal_lane_s16(acc, dsrc3, taps_lo, 3);
+  acc = vmlal_lane_s16(acc, dsrc4, taps_hi, 0);
+  acc = vmlal_lane_s16(acc, dsrc5, taps_hi, 1);
+  acc = vmlal_lane_s16(acc, dsrc6, taps_hi, 2);
+  acc = vmlal_lane_s16(acc, dsrc7, taps_hi, 3);
+  return acc;
+}
+// 8-tap horizontal convolution of src written to dst, in 4x4 pixel tiles.
+void vpx_convolve8_horiz_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,  // 8 taps, loaded with a single vld1q_s16
+    int x_step_q4,            // must be 16 (asserted below)
+    const int16_t *filter_y,  // unused
+    int y_step_q4,            // unused
+    int w,                    // consumed 4 columns per inner iteration
+    int h) {                  // consumed 4 rows per outer iteration
+  int width;
+  const uint8_t *s, *psrc;
+  uint8_t *d, *pdst;
+  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+  uint8x16_t q12u8, q13u8, q14u8, q15u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+  uint16x8x2_t q0x2u16;
+  uint8x8x2_t d0x2u8, d1x2u8;
+  uint32x2x2_t d0x2u32;
+  uint16x4x2_t d0x2u16, d1x2u16;
+  uint32x4x2_t q0x2u32;
+
+  assert(x_step_q4 == 16);
+
+  q0s16 = vld1q_s16(filter_x);
+
+  src -= 3;  // adjust for taps
+  for (; h > 0; h -= 4,
+    src += src_stride * 4,
+    dst += dst_stride * 4) {  // loop_horiz_v
+    s = src;
+    d24u8 = vld1_u8(s);
+    s += src_stride;
+    d25u8 = vld1_u8(s);
+    s += src_stride;
+    d26u8 = vld1_u8(s);
+    s += src_stride;
+    d27u8 = vld1_u8(s);
+    // Transpose the 4 rows x 8 bytes so each vector holds source columns.
+    q12u8 = vcombine_u8(d24u8, d25u8);
+    q13u8 = vcombine_u8(d26u8, d27u8);
+
+    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+                        vreinterpretq_u16_u8(q13u8));
+    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+    d0x2u8 = vtrn_u8(d24u8, d25u8);
+    d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+    __builtin_prefetch(src + src_stride * 4);
+    __builtin_prefetch(src + src_stride * 5);
+    __builtin_prefetch(src + src_stride * 6);
+
+    q8u16  = vmovl_u8(d0x2u8.val[0]);
+    q9u16  = vmovl_u8(d0x2u8.val[1]);
+    q10u16 = vmovl_u8(d1x2u8.val[0]);
+    q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+    d16u16 = vget_low_u16(q8u16);
+    d17u16 = vget_high_u16(q8u16);
+    d18u16 = vget_low_u16(q9u16);
+    d19u16 = vget_high_u16(q9u16);
+    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
+    q9u16 = vcombine_u16(d17u16, d19u16);
+
+    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
+    for (width = w, psrc = src + 7, pdst = dst;
+         width > 0;
+         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
+      s = psrc;
+      d28u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d29u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d31u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+      __builtin_prefetch(psrc + 64);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+                         vreinterpret_u16_u32(d31u32));
+      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+                         vreinterpret_u16_u32(d30u32));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
+                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
+      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
+                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
+
+      __builtin_prefetch(psrc + 64 + src_stride);
+
+      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+                          vreinterpretq_u32_u8(q15u8));
+
+      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+      q12u16 = vmovl_u8(d28u8);
+      q13u16 = vmovl_u8(d29u8);
+
+      __builtin_prefetch(psrc + 64 + src_stride * 2);
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+      // Four 8-tap sums, one per output column of the tile.
+      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+                              d18s16, d19s16, d23s16, d24s16, q0s16);
+      q2s32  = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+                              d19s16, d23s16, d24s16, d26s16, q0s16);
+      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+                              d23s16, d24s16, d26s16, d27s16, q0s16);
+      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+
+      __builtin_prefetch(psrc + 60 + src_stride * 3);
+      // Round, shift right by 7 and saturate to unsigned 16 bits.
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+      // Transpose back so each 32-bit lane holds 4 pixels of one dst row.
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+                         vreinterpret_u16_u8(d3u8));
+      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+                         vreinterpret_u32_u16(d0x2u16.val[1]));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+                       vreinterpret_u8_u32(d0x2u32.val[1]));
+
+      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
+      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
+
+      d = pdst;
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      // Slide the column window four pixels to the right.
+      q8u16 = q9u16;
+      d20s16 = d23s16;
+      q11u16 = q12u16;
+      q9u16 = q13u16;
+      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    }
+  }
+  return;
+}
+// 8-tap vertical convolution of src written to dst, in 4x4 pixel tiles.
+// The first load into each vector uses vld1_dup so no unset lane is read.
+void vpx_convolve8_vert_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,  // unused
+    int x_step_q4,            // unused
+    const int16_t *filter_y,  // 8 taps, loaded with a single vld1q_s16
+    int y_step_q4,            // must be 16 (asserted below)
+    int w,                    // consumed 4 columns per outer iteration
+    int h) {                  // consumed 4 rows per inner iteration
+  int height;
+  const uint8_t *s;
+  uint8_t *d;
+  uint32x2_t d2u32, d3u32;
+  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+  assert(y_step_q4 == 16);
+
+  src -= src_stride * 3;
+  q0s16 = vld1q_s16(filter_y);
+  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
+    s = src;  // rows 0..6 of the tap window are preloaded below
+    d16u32 = vld1_dup_u32((const uint32_t *)s);  // dup defines both lanes
+    s += src_stride;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+    s += src_stride;
+    d18u32 = vld1_dup_u32((const uint32_t *)s);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+    s += src_stride;
+    d20u32 = vld1_dup_u32((const uint32_t *)s);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+    s += src_stride;
+    d22u32 = vld1_dup_u32((const uint32_t *)s);  // only the low half is used
+    s += src_stride;
+    // Widen to 16 bits; low/high halves of each q vector are consecutive rows.
+    q8u16  = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q9u16  = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d = dst;
+    for (height = h; height > 0; height -= 4) {  // loop_vert
+      d24u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d26u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+      s += src_stride;
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+      s += src_stride;
+      // d24 now holds window rows 7 and 10, d26 holds rows 8 and 9.
+      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      __builtin_prefetch(d);
+      __builtin_prefetch(d + dst_stride);
+      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+                              d20s16, d21s16, d22s16, d24s16, q0s16);
+      __builtin_prefetch(d + dst_stride * 2);
+      __builtin_prefetch(d + dst_stride * 3);
+      q2s32  = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+                              d21s16, d22s16, d24s16, d26s16, q0s16);
+      __builtin_prefetch(s);
+      __builtin_prefetch(s + src_stride);
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+                              d22s16, d24s16, d26s16, d27s16, q0s16);
+      __builtin_prefetch(s + src_stride * 2);
+      __builtin_prefetch(s + src_stride * 3);
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+      // Round, shift right by 7 and saturate to unsigned 16 bits.
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
+      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
+
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      d += dst_stride;
+      // Slide the row window down by four rows.
+      q8u16 = q10u16;
+      d18s16 = d22s16;
+      d19s16 = d24s16;
+      q10u16 = q13u16;
+      d22s16 = d25s16;
+    }
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
new file mode 100644
index 0000000000..2d0f2ae065
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
@@ -0,0 +1,270 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    ; These functions are only valid when:
+    ; x_step_q4 == 16
+    ; w%4 == 0
+    ; h%4 == 0
+    ; taps == 8
+    ; VP9_FILTER_WEIGHT == 128
+    ; VP9_FILTER_SHIFT == 7
+
+    EXPORT  |vpx_convolve8_horiz_neon|
+    EXPORT  |vpx_convolve8_vert_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Multiply and accumulate by q0
+    MACRO
+    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
+    vmull.s16 $dst, $src0, d0[0]            ; $dst  = $src0 * q0 lane 0 (widening multiply)
+    vmlal.s16 $dst, $src1, d0[1]            ; $dst += $src1 * q0 lane 1
+    vmlal.s16 $dst, $src2, d0[2]            ; $dst += $src2 * q0 lane 2
+    vmlal.s16 $dst, $src3, d0[3]            ; $dst += $src3 * q0 lane 3
+    vmlal.s16 $dst, $src4, d1[0]            ; $dst += $src4 * q0 lane 4
+    vmlal.s16 $dst, $src5, d1[1]            ; $dst += $src5 * q0 lane 5
+    vmlal.s16 $dst, $src6, d1[2]            ; $dst += $src6 * q0 lane 6
+    vmlal.s16 $dst, $src7, d1[3]            ; $dst += $src7 * q0 lane 7
+    MEND
+
+; r0    const uint8_t *src
+; r1    int src_stride
+; r2    uint8_t *dst
+; r3    int dst_stride
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
+; sp[]int y_step_q4           ; unused
+; sp[]int w
+; sp[]int h
+
+|vpx_convolve8_horiz_neon| PROC
+    push            {r4-r10, lr}            ; 8 registers (32 bytes): stack args now start at [sp, #32]
+
+    sub             r0, r0, #3              ; adjust for taps
+
+    ldr             r5, [sp, #32]           ; filter_x
+    ldr             r6, [sp, #48]           ; w
+    ldr             r7, [sp, #52]           ; h
+
+    vld1.s16        {q0}, [r5]              ; filter_x
+
+    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
+    add             r8, r8, #4              ; -src_stride * 3 + 4
+
+    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
+    add             r4, r4, #4              ; -dst_stride * 3 + 4
+
+    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
+    sub             r9, r9, #7              ; less the 4 + 3 bytes r0 advances before the inner loop
+    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
+
+    mov             r10, r6                 ; w loop counter
+
+vpx_convolve8_loop_horiz_v
+    vld1.8          {d24}, [r0], r1         ; 8 bytes from each of 4 consecutive rows
+    vld1.8          {d25}, [r0], r1
+    vld1.8          {d26}, [r0], r1
+    vld1.8          {d27}, [r0], r8         ; r8 steps back 3 rows and forward 4 bytes
+
+    vtrn.16         q12, q13                ; transpose the loaded rows (with the two vtrn.8 below)
+    vtrn.8          d24, d25
+    vtrn.8          d26, d27
+
+    pld             [r0, r1, lsl #2]
+
+    vmovl.u8        q8, d24                 ; widen u8 -> 16 bit
+    vmovl.u8        q9, d25
+    vmovl.u8        q10, d26
+    vmovl.u8        q11, d27
+
+    ; save a few instructions in the inner loop
+    vswp            d17, d18
+    vmov            d23, d21
+
+    add             r0, r0, #3
+
+vpx_convolve8_loop_horiz
+    add             r5, r0, #64             ; r5 = prefetch address, 64 bytes ahead of r0
+
+    vld1.32         {d28[]}, [r0], r1       ; 4 bytes from each of 4 rows, replicated to both lanes
+    vld1.32         {d29[]}, [r0], r1
+    vld1.32         {d31[]}, [r0], r1
+    vld1.32         {d30[]}, [r0], r8       ; r8 steps back 3 rows and forward 4 bytes
+
+    pld             [r5]
+
+    vtrn.16         d28, d31
+    vtrn.16         d29, d30
+    vtrn.8          d28, d29
+    vtrn.8          d31, d30
+
+    pld             [r5, r1]
+
+    ; extract to s16
+    vtrn.32         q14, q15
+    vmovl.u8        q12, d28
+    vmovl.u8        q13, d29
+
+    pld             [r5, r1, lsl #1]
+
+    ; src[] * filter_x
+    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
+    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
+    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
+    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+    pld             [r5, -r8]
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    ; transpose
+    vtrn.16         d2, d3
+    vtrn.32         d2, d3
+    vtrn.8          d2, d3
+
+    vst1.u32        {d2[0]}, [r2@32], r3    ; store 4 bytes to each of 4 destination rows
+    vst1.u32        {d3[0]}, [r2@32], r3
+    vst1.u32        {d2[1]}, [r2@32], r3
+    vst1.u32        {d3[1]}, [r2@32], r4    ; r4 steps back 3 rows and forward 4 bytes
+
+    vmov            q8,  q9                 ; slide the sample window for the next 4 columns
+    vmov            d20, d23
+    vmov            q11, q12
+    vmov            q9,  q13
+
+    subs            r6, r6, #4              ; w -= 4
+    bgt             vpx_convolve8_loop_horiz
+
+    ; outer loop
+    mov             r6, r10                 ; restore w counter
+    add             r0, r0, r9              ; src += src_stride * 4 - w
+    add             r2, r2, r12             ; dst += dst_stride * 4 - w
+    subs            r7, r7, #4              ; h -= 4
+    bgt vpx_convolve8_loop_horiz_v
+
+    pop             {r4-r10, pc}
+
+    ENDP
+
+|vpx_convolve8_vert_neon| PROC
+    push            {r4-r8, lr}             ; 6 registers (24 bytes): stack args now start at [sp, #24]
+
+    ; adjust for taps
+    sub             r0, r0, r1              ; r0 -= src_stride
+    sub             r0, r0, r1, lsl #1      ; r0 -= src_stride * 2 (3 rows back in total)
+
+    ldr             r4, [sp, #32]           ; filter_y
+    ldr             r6, [sp, #40]           ; w
+    ldr             lr, [sp, #44]           ; h
+
+    vld1.s16        {q0}, [r4]              ; filter_y
+
+    lsl             r1, r1, #1              ; r1 = src_stride * 2
+    lsl             r3, r3, #1              ; r3 = dst_stride * 2
+
+vpx_convolve8_loop_vert_h
+    mov             r4, r0                  ; r4 walks every second source row, starting at r0
+    add             r7, r0, r1, asr #1      ; r7 = r0 + src_stride, walks the rows in between
+    mov             r5, r2                  ; r5 walks every second destination row
+    add             r8, r2, r3, asr #1      ; r8 = r2 + dst_stride, walks the rows in between
+    mov             r12, lr                 ; h loop counter
+
+    vld1.u32        {d16[0]}, [r4], r1      ; preload 7 rows of 4 bytes each
+    vld1.u32        {d16[1]}, [r7], r1
+    vld1.u32        {d18[0]}, [r4], r1
+    vld1.u32        {d18[1]}, [r7], r1
+    vld1.u32        {d20[0]}, [r4], r1
+    vld1.u32        {d20[1]}, [r7], r1
+    vld1.u32        {d22[0]}, [r4], r1
+
+    vmovl.u8        q8, d16                 ; widen u8 -> 16 bit
+    vmovl.u8        q9, d18
+    vmovl.u8        q10, d20
+    vmovl.u8        q11, d22
+
+vpx_convolve8_loop_vert
+    ; always process a 4x4 block at a time
+    vld1.u32        {d24[0]}, [r7], r1
+    vld1.u32        {d26[0]}, [r4], r1
+    vld1.u32        {d26[1]}, [r7], r1
+    vld1.u32        {d24[1]}, [r4], r1
+
+    ; extract to s16
+    vmovl.u8        q12, d24
+    vmovl.u8        q13, d26
+
+    pld             [r5]
+    pld             [r8]
+
+    ; src[] * filter_y
+    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
+
+    pld             [r5, r3]
+    pld             [r8, r3]
+
+    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
+
+    pld             [r7]
+    pld             [r4]
+
+    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+    pld             [r7, r1]
+    pld             [r4, r1]
+
+    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    vst1.u32        {d2[0]}, [r5@32], r3    ; store 4 output rows, alternating r5 / r8
+    vst1.u32        {d2[1]}, [r8@32], r3
+    vst1.u32        {d3[0]}, [r5@32], r3
+    vst1.u32        {d3[1]}, [r8@32], r3
+
+    vmov            q8, q10                 ; slide the row window down by 4 rows
+    vmov            d18, d22
+    vmov            d19, d24
+    vmov            q10, q13
+    vmov            d22, d25
+
+    subs            r12, r12, #4            ; h -= 4
+    bgt             vpx_convolve8_loop_vert
+
+    ; outer loop
+    add             r0, r0, #4              ; next 4-pixel column of the source
+    add             r2, r2, #4              ; next 4-pixel column of the destination
+    subs            r6, r6, #4              ; w -= 4
+    bgt             vpx_convolve8_loop_vert_h
+
+    pop             {r4-r8, pc}
+
+    ENDP
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
new file mode 100644
index 0000000000..dc58a332f8
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -0,0 +1,147 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_avg_neon(
+    const uint8_t *src,    // r0
+    ptrdiff_t src_stride,  // r1
+    uint8_t *dst,          // r2
+    ptrdiff_t dst_stride,  // r3
+    const int16_t *filter_x,
+    int filter_x_stride,
+    const int16_t *filter_y,
+    int filter_y_stride,
+    int w,
+    int h) {
+  uint8_t *d;  // read cursor over dst; dst itself is the write cursor
+  uint8x8_t d0u8, d1u8, d2u8, d3u8;
+  uint32x2_t d0u32 = vdup_n_u32(0), d2u32 = vdup_n_u32(0);  // lanes are loaded one at a time
+  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+  (void)filter_x;  (void)filter_x_stride;  // the filter arguments are unused
+  (void)filter_y;  (void)filter_y_stride;
+
+  d = dst;
+  if (w > 32) {  // avg64
+    for (; h > 0; h -= 1) {  // one 64-byte row per iteration
+      q0u8  = vld1q_u8(src);
+      q1u8  = vld1q_u8(src + 16);
+      q2u8  = vld1q_u8(src + 32);
+      q3u8  = vld1q_u8(src + 48);
+      src += src_stride;
+      q8u8  = vld1q_u8(d);
+      q9u8  = vld1q_u8(d + 16);
+      q10u8 = vld1q_u8(d + 32);
+      q11u8 = vld1q_u8(d + 48);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);  // rounding average of src and dst bytes
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      vst1q_u8(dst + 32, q2u8);
+      vst1q_u8(dst + 48, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w == 32) {  // avg32
+    for (; h > 0; h -= 2) {  // two 32-byte rows per iteration
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q2u8 = vld1q_u8(src);
+      q3u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q8u8 = vld1q_u8(d);
+      q9u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+      q10u8 = vld1q_u8(d);
+      q11u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q2u8);
+      vst1q_u8(dst + 16, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w > 8) {  // avg16
+    for (; h > 0; h -= 2) {  // two 16-byte rows per iteration
+      q0u8 = vld1q_u8(src);
+      src += src_stride;
+      q1u8 = vld1q_u8(src);
+      src += src_stride;
+      q2u8 = vld1q_u8(d);
+      d += dst_stride;
+      q3u8 = vld1q_u8(d);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q2u8);
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      vst1q_u8(dst, q0u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q1u8);
+      dst += dst_stride;
+    }
+  } else if (w == 8) {  // avg8
+    for (; h > 0; h -= 2) {  // two 8-byte rows per iteration
+      d0u8 = vld1_u8(src);
+      src += src_stride;
+      d1u8 = vld1_u8(src);
+      src += src_stride;
+      d2u8 = vld1_u8(d);
+      d += dst_stride;
+      d3u8 = vld1_u8(d);
+      d += dst_stride;
+
+      q0u8 = vcombine_u8(d0u8, d1u8);  // pack both rows so one vrhaddq covers them
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+      vst1_u8(dst, vget_low_u8(q0u8));
+      dst += dst_stride;
+      vst1_u8(dst, vget_high_u8(q0u8));
+      dst += dst_stride;
+    }
+  } else {  // avg4
+    for (; h > 0; h -= 2) {  // two 4-byte rows per iteration, one per 32-bit lane
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+      src += src_stride;
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+      src += src_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+
+      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
+                       vreinterpret_u8_u32(d2u32));
+
+      d0u32 = vreinterpret_u32_u8(d0u8);
+      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+      dst += dst_stride;
+    }
+  }
+  return;
+}
diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
new file mode 100644
index 0000000000..97e6189fda
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
@@ -0,0 +1,116 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_convolve_avg_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve_avg_neon| PROC
+    push                {r4-r6, lr}         ; 4 registers (16 bytes) saved
+    ldrd                r4, r5, [sp, #32]   ; r4 = w, r5 = h (5th and 6th stack arguments)
+    mov                 r6, r2              ; r6 reads dst, r2 stays the write pointer
+
+    cmp                 r4, #32             ; dispatch on width: >32, ==32, >8, ==8, else 4
+    bgt                 avg64
+    beq                 avg32
+    cmp                 r4, #8
+    bgt                 avg16
+    beq                 avg8
+    b                   avg4
+
+avg64
+    sub                 lr, r1, #32         ; src_stride - 32: the first load post-increments by 32
+    sub                 r4, r3, #32         ; dst_stride - 32, same reason
+avg64_h
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0]!
+    vld1.8              {q2-q3}, [r0], lr
+    pld                 [r2, r3]
+    vld1.8              {q8-q9},   [r6@128]!
+    vld1.8              {q10-q11}, [r6@128], r4
+    vrhadd.u8           q0, q0, q8          ; rounding average of src and dst bytes
+    vrhadd.u8           q1, q1, q9
+    vrhadd.u8           q2, q2, q10
+    vrhadd.u8           q3, q3, q11
+    vst1.8              {q0-q1}, [r2@128]!
+    vst1.8              {q2-q3}, [r2@128], r4
+    subs                r5, r5, #1          ; one row per iteration
+    bgt                 avg64_h
+    pop                 {r4-r6, pc}
+
+avg32
+    vld1.8              {q0-q1}, [r0], r1
+    vld1.8              {q2-q3}, [r0], r1
+    vld1.8              {q8-q9},   [r6@128], r3
+    vld1.8              {q10-q11}, [r6@128], r3
+    pld                 [r0]
+    vrhadd.u8           q0, q0, q8
+    pld                 [r0, r1]
+    vrhadd.u8           q1, q1, q9
+    pld                 [r6]
+    vrhadd.u8           q2, q2, q10
+    pld                 [r6, r3]
+    vrhadd.u8           q3, q3, q11
+    vst1.8              {q0-q1}, [r2@128], r3
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #2          ; two rows per iteration
+    bgt                 avg32
+    pop                 {r4-r6, pc}
+
+avg16
+    vld1.8              {q0}, [r0], r1
+    vld1.8              {q1}, [r0], r1
+    vld1.8              {q2}, [r6@128], r3
+    vld1.8              {q3}, [r6@128], r3
+    pld                 [r0]
+    pld                 [r0, r1]
+    vrhadd.u8           q0, q0, q2
+    pld                 [r6]
+    pld                 [r6, r3]
+    vrhadd.u8           q1, q1, q3
+    vst1.8              {q0}, [r2@128], r3
+    vst1.8              {q1}, [r2@128], r3
+    subs                r5, r5, #2          ; two rows per iteration
+    bgt                 avg16
+    pop                 {r4-r6, pc}
+
+avg8
+    vld1.8              {d0}, [r0], r1
+    vld1.8              {d1}, [r0], r1
+    vld1.8              {d2}, [r6@64], r3
+    vld1.8              {d3}, [r6@64], r3
+    pld                 [r0]
+    pld                 [r0, r1]
+    vrhadd.u8           q0, q0, q1          ; q0 = {d0, d1}, q1 = {d2, d3}: both rows at once
+    pld                 [r6]
+    pld                 [r6, r3]
+    vst1.8              {d0}, [r2@64], r3
+    vst1.8              {d1}, [r2@64], r3
+    subs                r5, r5, #2          ; two rows per iteration
+    bgt                 avg8
+    pop                 {r4-r6, pc}
+
+avg4
+    vld1.32             {d0[0]}, [r0], r1
+    vld1.32             {d0[1]}, [r0], r1
+    vld1.32             {d2[0]}, [r6@32], r3
+    vld1.32             {d2[1]}, [r6@32], r3
+    vrhadd.u8           d0, d0, d2
+    vst1.32             {d0[0]}, [r2@32], r3
+    vst1.32             {d0[1]}, [r2@32], r3
+    subs                r5, r5, #2          ; two rows per iteration
+    bgt                 avg4
+    pop                 {r4-r6, pc}
+    ENDP
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
new file mode 100644
index 0000000000..d8fb97a861
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -0,0 +1,94 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_copy_neon(
+    const uint8_t *src,    // r0
+    ptrdiff_t src_stride,  // r1
+    uint8_t *dst,          // r2
+    ptrdiff_t dst_stride,  // r3
+    const int16_t *filter_x,
+    int filter_x_stride,
+    const int16_t *filter_y,
+    int filter_y_stride,
+    int w,
+    int h) {
+  uint8x8_t d0u8, d2u8;
+  uint8x16_t q0u8, q1u8, q2u8, q3u8;
+  (void)filter_x;  (void)filter_x_stride;  // the filter arguments are unused
+  (void)filter_y;  (void)filter_y_stride;
+
+  if (w > 32) {  // copy64
+    for (; h > 0; h--) {  // one 64-byte row per iteration
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      q2u8 = vld1q_u8(src + 32);
+      q3u8 = vld1q_u8(src + 48);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      vst1q_u8(dst + 32, q2u8);
+      vst1q_u8(dst + 48, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w == 32) {  // copy32
+    for (; h > 0; h -= 2) {  // two 32-byte rows per iteration
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q2u8 = vld1q_u8(src);
+      q3u8 = vld1q_u8(src + 16);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q2u8);
+      vst1q_u8(dst + 16, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w > 8) {  // copy16
+    for (; h > 0; h -= 2) {  // two 16-byte rows per iteration
+      q0u8 = vld1q_u8(src);
+      src += src_stride;
+      q1u8 = vld1q_u8(src);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q1u8);
+      dst += dst_stride;
+    }
+  } else if (w == 8) {  // copy8
+    for (; h > 0; h -= 2) {  // two 8-byte rows per iteration
+      d0u8 = vld1_u8(src);
+      src += src_stride;
+      d2u8 = vld1_u8(src);
+      src += src_stride;
+
+      vst1_u8(dst, d0u8);
+      dst += dst_stride;
+      vst1_u8(dst, d2u8);
+      dst += dst_stride;
+    }
+  } else {  // copy4
+    for (; h > 0; h--) {  // one row per iteration
+      *(uint32_t *)dst = *(const uint32_t *)src;  // 4 bytes moved as one 32-bit word
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+  return;
+}
diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
new file mode 100644
index 0000000000..89164ad48b
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
@@ -0,0 +1,84 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vpx_convolve_copy_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve_copy_neon| PROC
+    push                {r4-r5, lr}         ; 3 registers (12 bytes) saved
+    ldrd                r4, r5, [sp, #28]   ; r4 = w, r5 = h (5th and 6th stack arguments)
+
+    cmp                 r4, #32             ; dispatch on width: >32, ==32, >8, ==8, else 4
+    bgt                 copy64
+    beq                 copy32
+    cmp                 r4, #8
+    bgt                 copy16
+    beq                 copy8
+    b                   copy4
+
+copy64
+    sub                 lr, r1, #32         ; src_stride - 32: the first load post-increments by 32
+    sub                 r3, r3, #32         ; dst_stride - 32, same reason
+copy64_h
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0]!
+    vld1.8              {q2-q3}, [r0], lr
+    vst1.8              {q0-q1}, [r2@128]!
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #1          ; one row per iteration
+    bgt                 copy64_h
+    pop                 {r4-r5, pc}
+
+copy32
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0-q1}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q2-q3}, [r0], r1
+    vst1.8              {q0-q1}, [r2@128], r3
+    vst1.8              {q2-q3}, [r2@128], r3
+    subs                r5, r5, #2          ; two rows per iteration
+    bgt                 copy32
+    pop                 {r4-r5, pc}
+
+copy16
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q0}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {q1}, [r0], r1
+    vst1.8              {q0}, [r2@128], r3
+    vst1.8              {q1}, [r2@128], r3
+    subs                r5, r5, #2          ; two rows per iteration
+    bgt                 copy16
+    pop                 {r4-r5, pc}
+
+copy8
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {d0}, [r0], r1
+    pld                 [r0, r1, lsl #1]
+    vld1.8              {d2}, [r0], r1
+    vst1.8              {d0}, [r2@64], r3
+    vst1.8              {d2}, [r2@64], r3
+    subs                r5, r5, #2          ; two rows per iteration
+    bgt                 copy8
+    pop                 {r4-r5, pc}
+
+copy4
+    ldr                 r12, [r0], r1       ; 4 bytes moved as one 32-bit word
+    str                 r12, [r2], r3
+    subs                r5, r5, #1          ; one row per iteration
+    bgt                 copy4
+    pop                 {r4-r5, pc}
+    ENDP
+
+    END
diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
new file mode 100644
index 0000000000..1506ce6203
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
@@ -0,0 +1,72 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int x_step_q4,
+                        const int16_t *filter_y, int y_step_q4,
+                        int w, int h) {
+  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+   * maximum buffer size to 64 * (64 + 7) (+ 1 row to make it divisible by 4).
+   */
+  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+
+  // Account for the vertical phase needing 3 lines prior and 4 lines post
+  int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);  // only the unscaled (step == 16) case is handled here
+  assert(x_step_q4 == 16);
+
+  /* Filter starting 3 lines back. The neon implementation will ignore the
+   * given height and filter a multiple of 4 lines. Since this goes in to
+   * the temp buffer which has lots of extra room and is subsequently discarded
+   * this is safe if somewhat less than ideal.
+   */
+  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+                           temp, 64,
+                           filter_x, x_step_q4, filter_y, y_step_q4,
+                           w, intermediate_height);
+
+  /* Step into the temp buffer 3 lines to get the actual frame data */
+  vpx_convolve8_vert_neon(temp + 64 * 3, 64,
+                          dst, dst_stride,
+                          filter_x, x_step_q4, filter_y, y_step_q4,
+                          w, h);
+}
+
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h) {
+  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);  // intermediate rows, 64 bytes apart
+  int intermediate_height = h + 7;  // h output rows plus 7 extra rows for the vertical pass
+
+  assert(y_step_q4 == 16);  // only the unscaled (step == 16) case is handled here
+  assert(x_step_q4 == 16);
+
+  /* This implementation has the same issues as above. In addition, we only want
+   * to average the values after both passes.
+   */
+  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+                           temp, 64,
+                           filter_x, x_step_q4, filter_y, y_step_q4,
+                           w, intermediate_height);
+  vpx_convolve8_avg_vert_neon(temp + 64 * 3,  // skip the 3 leading rows in temp
+                              64, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+}
diff --git a/libs/libvpx/vpx_dsp/avg.c b/libs/libvpx/vpx_dsp/avg.c
new file mode 100644
index 0000000000..26fe7859a5
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/avg.c
@@ -0,0 +1,231 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
+  // Rounded mean of an 8x8 pixel block; p is the distance between rows.
+  int row, col, total = 32;  // seeded with the rounding bias (64 / 2)
+  for (row = 0; row < 8; ++row, s += p) {
+    for (col = 0; col < 8; ++col) total += s[col];
+  }
+  return total >> 6;  // divide by the 64 pixels
+}
+
+unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
+  // Rounded mean of a 4x4 pixel block; p is the distance between rows.
+  int row, col, total = 8;  // seeded with the rounding bias (16 / 2)
+  for (row = 0; row < 4; ++row, s += p) {
+    for (col = 0; col < 4; ++col) total += s[col];
+  }
+  return total >> 4;  // divide by the 16 pixels
+}
+
+// src_diff: first pass, 9 bit, dynamic range [-255, 255]
+//           second pass, 12 bit, dynamic range [-2040, 2040]
+static void hadamard_col8(const int16_t *src_diff, int src_stride,
+                          int16_t *coeff) {
+  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];  // stage 1:
+  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];  // sum and
+  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];  // difference
+  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];  // of each
+  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];  // adjacent
+  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];  // pair of
+  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];  // strided
+  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];  // inputs
+
+  int16_t c0 = b0 + b2;  // stage 2: combine pairs within each half (b0-b3, b4-b7)
+  int16_t c1 = b1 + b3;
+  int16_t c2 = b0 - b2;
+  int16_t c3 = b1 - b3;
+  int16_t c4 = b4 + b6;
+  int16_t c5 = b5 + b7;
+  int16_t c6 = b4 - b6;
+  int16_t c7 = b5 - b7;
+
+  coeff[0] = c0 + c4;  // stage 3: combine the halves; the 8 results go to
+  coeff[7] = c1 + c5;  // consecutive coeff entries in the permuted order
+  coeff[3] = c2 + c6;  // spelled out by these indices
+  coeff[4] = c3 + c7;
+  coeff[2] = c0 - c4;
+  coeff[6] = c1 - c5;
+  coeff[1] = c2 - c6;
+  coeff[5] = c3 - c7;
+}
+
+void vpx_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
+                        int16_t *coeff) {
+  // 8x8 transform built from two passes of hadamard_col8.
+  int16_t scratch[64];
+  int col;
+
+  // First pass: each of the 8 input columns becomes 8 consecutive
+  // entries of scratch.
+  // src_diff: 9 bit, dynamic range [-255, 255]
+  for (col = 0; col < 8; ++col) {
+    hadamard_col8(src_diff + col, src_stride, scratch + 8 * col);
+  }
+
+  // Second pass: each scratch column (stride 8) becomes 8 consecutive
+  // entries of coeff.
+  // scratch: 12 bit, dynamic range [-2040, 2040]
+  // coeff: 15 bit, dynamic range [-16320, 16320]
+  for (col = 0; col < 8; ++col) {
+    hadamard_col8(scratch + col, 8, coeff + 8 * col);
+  }
+}
+
+// In place 16x16 2D Hadamard transform
+void vpx_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
+                          int16_t *coeff) {
+  int blk, k;
+
+  // Transform the four 8x8 quadrants into consecutive 64-entry groups.
+  // src_diff: 9 bit, dynamic range [-255, 255]
+  for (blk = 0; blk < 4; ++blk) {
+    const int row_off = (blk >> 1) * 8 * src_stride;
+    const int col_off = (blk & 0x01) * 8;
+    vpx_hadamard_8x8_c(src_diff + row_off + col_off, src_stride,
+                       coeff + blk * 64);
+  }
+
+  // Combine entry k of the four groups with a 4-point butterfly.
+  // coeff: 15 bit, dynamic range [-16320, 16320]
+  for (k = 0; k < 64; ++k) {
+    int16_t *const c = coeff + k;
+    const int16_t a0 = c[0], a1 = c[64], a2 = c[128], a3 = c[192];
+    // (a0 + a1): 16 bit, [-32640, 32640]; b0-b3: 15 bit, [-16320, 16320]
+    const int16_t b0 = (a0 + a1) >> 1;
+    const int16_t b1 = (a0 - a1) >> 1;
+    const int16_t b2 = (a2 + a3) >> 1;
+    const int16_t b3 = (a2 - a3) >> 1;
+
+    c[0]   = b0 + b2;  // 16 bit, [-32640, 32640]
+    c[64]  = b1 + b3;
+    c[128] = b0 - b2;
+    c[192] = b1 - b3;
+  }
+}
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int vpx_satd_c(const int16_t *coeff, int length) {
+  // Sum of the absolute values of the first `length` coefficients.
+  int total = 0;
+
+  for (; length > 0; --length, ++coeff) total += abs(*coeff);
+
+  // total: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+  return total;
+}
+
+// Integer projection onto row vectors.
+// height: value range {16, 32, 64}.
+void vpx_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
+                       const int ref_stride, const int height) {
+  // Each of the 16 columns is summed over the rows, then divided by height / 2.
+  const int norm_factor = height >> 1;
+  int col, row;
+  for (col = 0; col < 16; ++col) {
+    // Column sum over `height` rows.
+    // acc: 14 bit, dynamic range [0, 16320].
+    int16_t acc = 0;
+    for (row = 0; row < height; ++row)
+      acc += ref[col + row * ref_stride];
+    // hbuf[col]: 9 bit, dynamic range [0, 510].
+    hbuf[col] = acc / norm_factor;
+  }
+}
+
+// width: value range {16, 32, 64}.
+int16_t vpx_int_pro_col_c(uint8_t const *ref, const int width) {
+  // Sum of the first `width` samples of ref.
+  // acc: 14 bit, dynamic range [0, 16320]
+  int remaining;
+  int16_t acc = 0;
+  for (remaining = width; remaining > 0; --remaining) acc += *ref++;
+  return acc;
+}
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4}
+int vpx_vector_var_c(int16_t const *ref, int16_t const *src,
+                     const int bwl) {
+  // Variance-style measure of (ref - src) over 4 << bwl entries:
+  // sum of squared differences minus (sum of differences)^2 / count.
+  const int count = 4 << bwl;
+  int sum_sq = 0, sum = 0, k;
+
+  for (k = 0; k < count; ++k) {
+    const int delta = ref[k] - src[k];  // dynamic range [-510, 510], 10 bits.
+    sum += delta;                       // dynamic range 16 bits.
+    sum_sq += delta * delta;            // dynamic range 26 bits.
+  }
+
+  // (sum * sum): dynamic range 31 bits.
+  return sum_sq - ((sum * sum) >> (bwl + 2));
+}
+
+void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+                      int *min, int *max) {
+  // Smallest and largest absolute pixel difference over an 8x8 block.
+  int k;
+  *min = 255;
+  *max = 0;
+  for (k = 0; k < 64; ++k) {
+    const int row = k >> 3, col = k & 7;
+    const int delta = abs(s[row * p + col] - d[row * dp + col]);
+    if (delta < *min) *min = delta;
+    if (delta > *max) *max = delta;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+  // Rounded mean of an 8x8 block of uint16_t samples; p is in samples.
+  const uint16_t *row = CONVERT_TO_SHORTPTR(s8);
+  int r, c, total = 32;  // seeded with the rounding bias (64 / 2)
+  for (r = 0; r < 8; ++r, row += p) {
+    for (c = 0; c < 8; ++c) total += row[c];
+  }
+  return total >> 6;  // divide by the 64 samples
+}
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+  // Rounded mean of a 4x4 block of uint16_t samples; p is in samples.
+  const uint16_t *row = CONVERT_TO_SHORTPTR(s8);
+  int r, c, total = 8;  // seeded with the rounding bias (16 / 2)
+  for (r = 0; r < 4; ++r, row += p) {
+    for (c = 0; c < 4; ++c) total += row[c];
+  }
+  return total >> 4;  // divide by the 16 samples
+}
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+                             int dp, int *min, int *max) {
+  int i, j;  // row and column inside the 8x8 block
+  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);  // samples are read as uint16_t
+  const uint16_t* d = CONVERT_TO_SHORTPTR(d8);
+  *min = 65535;  // largest |s - d| two uint16_t samples can produce (was 255)
+  *max = 0;
+  for (i = 0; i < 8; ++i, s += p, d += dp) {  // p and dp are strides in samples
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(s[j]-d[j]);
+      *min = diff < *min ? diff : *min;  // running minimum absolute difference
+      *max = diff > *max ? diff : *max;  // running maximum absolute difference
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+
diff --git a/libs/libvpx/vpx_dsp/bitreader.c b/libs/libvpx/vpx_dsp/bitreader.c
new file mode 100644
index 0000000000..6ad806ac3f
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/bitreader.c
@@ -0,0 +1,103 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/prob.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/endian_inl.h"
+
+// Points |r| at |buffer| (|size| bytes), fills the bit window and consumes
+// the marker bit. Returns nonzero when a non-empty stream has no buffer or
+// when the marker bit reads as 1.
+int vpx_reader_init(vpx_reader *r,
+                    const uint8_t *buffer,
+                    size_t size,
+                    vpx_decrypt_cb decrypt_cb,
+                    void *decrypt_state) {
+  if (!buffer && size) return 1;
+  r->buffer = buffer;
+  r->buffer_end = buffer + size;
+  r->decrypt_cb = decrypt_cb;
+  r->decrypt_state = decrypt_state;
+  r->value = 0;
+  r->count = -8;
+  r->range = 255;
+  vpx_reader_fill(r);
+  return vpx_read_bit(r) != 0;  // marker bit
+}
+
+void vpx_reader_fill(vpx_reader *r) {  // Tops up r->value with input bytes.
+  const uint8_t *const buffer_end = r->buffer_end;
+  const uint8_t *buffer = r->buffer;
+  const uint8_t *buffer_start = buffer;  // Lets us measure bytes consumed.
+  BD_VALUE value = r->value;
+  int count = r->count;
+  const size_t bytes_left = buffer_end - buffer;
+  const size_t bits_left = bytes_left * CHAR_BIT;
+  int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);  // Next byte's slot.
+
+  if (r->decrypt_cb) {  // Decrypt into clear_buffer and read from that copy.
+    size_t n = VPXMIN(sizeof(r->clear_buffer), bytes_left);
+    r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n);
+    buffer = r->clear_buffer;
+    buffer_start = r->clear_buffer;
+  }
+  if (bits_left > BD_VALUE_SIZE) {  // Plenty of input: load one whole word.
+      const int bits = (shift & 0xfffffff8) + CHAR_BIT;  // Multiple of 8 to load.
+      BD_VALUE nv;
+      BD_VALUE big_endian_values;
+      memcpy(&big_endian_values, buffer, sizeof(BD_VALUE));
+#if SIZE_MAX == 0xffffffffffffffffULL
+        big_endian_values = HToBE64(big_endian_values);
+#else
+        big_endian_values = HToBE32(big_endian_values);
+#endif
+      nv = big_endian_values >> (BD_VALUE_SIZE - bits);  // Keep the top |bits|.
+      count += bits;
+      buffer += (bits >> 3);
+      value = r->value | (nv << (shift & 0x7));
+  } else {  // Near the end of input: append remaining bytes one at a time.
+    const int bits_over = (int)(shift + CHAR_BIT - bits_left);
+    int loop_end = 0;
+    if (bits_over >= 0) {  // Input cannot fill the window: flag end of data.
+      count += LOTS_OF_BITS;
+      loop_end = bits_over;
+    }
+
+    if (bits_over < 0 || bits_left) {
+      while (shift >= loop_end) {
+        count += CHAR_BIT;
+        value |= (BD_VALUE)*buffer++ << shift;
+        shift -= CHAR_BIT;
+      }
+    }
+  }
+
+  // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption,
+  // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than
+  // assign 'buffer' to 'r->buffer'.
+  r->buffer += buffer - buffer_start;
+  r->value = value;
+  r->count = count;
+}
+
+// Walks r->buffer back one byte per CHAR_BIT taken off r->count while
+// CHAR_BIT < r->count < BD_VALUE_SIZE, then returns r->buffer.
+const uint8_t *vpx_reader_find_end(vpx_reader *r) {
+  for (; r->count > CHAR_BIT && r->count < BD_VALUE_SIZE;
+       r->count -= CHAR_BIT)
+    --r->buffer;
+  return r->buffer;
+}
diff --git a/libs/libvpx/vpx_dsp/bitreader.h b/libs/libvpx/vpx_dsp/bitreader.h
new file mode 100644
index 0000000000..9a441b4107
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/bitreader.h
@@ -0,0 +1,140 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_BITREADER_H_
+#define VPX_DSP_BITREADER_H_
+
+#include <stddef.h>
+#include <limits.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef size_t BD_VALUE;
+
+#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
+
+// This is meant to be a large, positive constant that can still be efficiently
+// loaded as an immediate (on platforms like ARM, for example).
+// Even relatively modest values like 100 would work fine.
+#define LOTS_OF_BITS 0x40000000
+
+typedef struct {
+  // Be careful when reordering this struct, it may impact the cache negatively.
+  BD_VALUE value;  // Bit window being decoded.
+  unsigned int range;  // Current coder range.
+  int count;  // Bits held in |value| minus 8; see vpx_reader_has_error().
+  const uint8_t *buffer_end;  // End of the input (buffer + size at init).
+  const uint8_t *buffer;  // Next input byte to load.
+  vpx_decrypt_cb decrypt_cb;  // Optional; input is used as-is when NULL.
+  void *decrypt_state;  // Opaque first argument handed to decrypt_cb.
+  uint8_t clear_buffer[sizeof(BD_VALUE) + 1];  // Scratch for decrypted bytes.
+} vpx_reader;
+
+int vpx_reader_init(vpx_reader *r,
+                    const uint8_t *buffer,
+                    size_t size,
+                    vpx_decrypt_cb decrypt_cb,
+                    void *decrypt_state);
+
+void vpx_reader_fill(vpx_reader *r);
+
+const uint8_t *vpx_reader_find_end(vpx_reader *r);
+
+static INLINE int vpx_reader_has_error(vpx_reader *r) {
+  // Detects reads that ran past the end of the input.
+  //
+  // 'count' is the number of bits held in 'value' minus 8: the top byte of
+  // 'value' belongs to the decoding algorithm and the rest is buffered to
+  // be shifted into it. With count == 8 the top 16 bits of 'value' are in
+  // use, 8 for the algorithm and 8 buffered.
+  //
+  // Each byte taken from the user's buffer adds 8 to count. Once the data
+  // runs out, LOTS_OF_BITS is added on top, so count == LOTS_OF_BITS - 1
+  // means the user's data has been exhausted.
+  //
+  // Returns 1 if bits were decoded after the end of stream was reached,
+  // 0 otherwise.
+  const int buffered = r->count;
+  return BD_VALUE_SIZE < buffered && buffered < LOTS_OF_BITS;
+}
+
+static INLINE int vpx_read(vpx_reader *r, int prob) {  // Decodes one bit.
+  unsigned int bit = 0;
+  BD_VALUE value;
+  BD_VALUE bigsplit;
+  int count;
+  unsigned int range;
+  unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
+
+  if (r->count < 0)  // Bit window is running low; pull in more input.
+    vpx_reader_fill(r);
+
+  value = r->value;
+  count = r->count;
+
+  bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);  // Align to top byte.
+
+  range = split;  // Default outcome 0: keep the part of the range below split.
+
+  if (value >= bigsplit) {  // Outcome 1: keep the part at or above split.
+    range = r->range - split;
+    value = value - bigsplit;
+    bit = 1;
+  }
+
+  {
+    register int shift = vpx_norm[range];  // Shift looked up for the new range.
+    range <<= shift;
+    value <<= shift;
+    count -= shift;  // Shifted-out bits are no longer buffered.
+  }
+  r->value = value;
+  r->count = count;
+  r->range = range;
+
+  return bit;
+}
+
+static INLINE int vpx_read_bit(vpx_reader *r) {
+  return vpx_read(r, 128);  // vpx_prob_half: probability 128 out of 256
+}
+
+// Reads |bits| bits with vpx_read_bit(), most significant first.
+static INLINE int vpx_read_literal(vpx_reader *r, int bits) {
+  int result = 0;
+  int pos = bits;
+  while (--pos >= 0)
+    result |= vpx_read_bit(r) << pos;
+  return result;
+}
+
+// Descends |tree| bit by bit until an entry <= 0 is reached; returns -entry.
+static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree,
+                                const vpx_prob *probs) {
+  vpx_tree_index node = 0;
+  do {
+    node = tree[node + vpx_read(r, probs[node >> 1])];
+  } while (node > 0);
+  return -node;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_BITREADER_H_
diff --git a/libs/libvpx/vpx_dsp/bitreader_buffer.c b/libs/libvpx/vpx_dsp/bitreader_buffer.c
new file mode 100644
index 0000000000..d7b55cf9f4
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/bitreader_buffer.c
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "./bitreader_buffer.h"
+
+size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb) {
+  return (rb->bit_offset + 7) / 8;  // Bits consumed, rounded up to bytes.
+}
+
+// Reads the next bit, MSB first; past the end calls error_handler, returns 0.
+int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) {
+  const size_t off = rb->bit_offset;
+  const size_t p = off >> 3;
+  const int q = 7 - (int)(off & 0x7);
+  if (rb->bit_buffer + p < rb->bit_buffer_end) {
+    rb->bit_offset = off + 1;
+    return (rb->bit_buffer[p] >> q) & 1;
+  }
+  // The handler is optional: a reader set up without one must not crash.
+  if (rb->error_handler) rb->error_handler(rb->error_handler_data);
+  return 0;
+}
+
+// Assembles |bits| bits, most significant first, into one integer.
+int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) {
+  int result = 0, pos = bits;
+  while (--pos >= 0) result |= vpx_rb_read_bit(rb) << pos;
+  return result;
+}
+
+// Reads a |bits|-bit magnitude followed by one sign bit (1 means negative).
+int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits) {
+  const int magnitude = vpx_rb_read_literal(rb, bits);
+  const int negative = vpx_rb_read_bit(rb);
+  return negative ? -magnitude : magnitude;
+}
+
+int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb,
+                                   int bits) {
+#if CONFIG_MISC_FIXES
+  const int nbits = sizeof(unsigned) * 8 - bits - 1;  // Unused bits above field.
+  const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits;
+  return ((int) value) >> nbits;  // Relies on arithmetic >> to sign-extend.
+#else
+  return vpx_rb_read_signed_literal(rb, bits);  // Magnitude, then sign bit.
+#endif
+}
diff --git a/libs/libvpx/vpx_dsp/bitreader_buffer.h b/libs/libvpx/vpx_dsp/bitreader_buffer.h
new file mode 100644
index 0000000000..8a48a95ed1
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/bitreader_buffer.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_BITREADER_BUFFER_H_
+#define VPX_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*vpx_rb_error_handler)(void *data);
+
+struct vpx_read_bit_buffer {
+  const uint8_t *bit_buffer;  // Start of the data being read.
+  const uint8_t *bit_buffer_end;  // Reads at or beyond this address fail.
+  size_t bit_offset;  // Position of the next bit, counted from bit_buffer.
+
+  void *error_handler_data;  // Passed back to error_handler unchanged.
+  vpx_rb_error_handler error_handler;  // Invoked on a read past the end.
+};
+
+size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb);
+
+int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb);
+
+int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_BITREADER_BUFFER_H_
diff --git a/libs/libvpx/vpx_dsp/bitwriter.c b/libs/libvpx/vpx_dsp/bitwriter.c
new file mode 100644
index 0000000000..5b232e346e
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/bitwriter.c
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./bitwriter.h"
+
+// Resets |br| to write into |source|, then emits the leading zero bit.
+void vpx_start_encode(vpx_writer *br, uint8_t *source) {
+  br->buffer = source;
+  br->pos = br->lowvalue = 0;
+  br->range = 255;
+  br->count = -24;
+  vpx_write_bit(br, 0);
+}
+
+// Finishes the stream by writing 32 zero bits, then pads if needed.
+void vpx_stop_encode(vpx_writer *br) {
+  int remaining = 32;
+  while (remaining-- > 0) vpx_write_bit(br, 0);
+
+  // Ensure there's no ambiguous collision with any index marker bytes: a
+  // final byte of the form 110xxxxx gets a zero byte appended after it.
+  if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0)
+    br->buffer[br->pos++] = 0;
+}
+
diff --git a/libs/libvpx/vpx_dsp/bitwriter.h b/libs/libvpx/vpx_dsp/bitwriter.h
new file mode 100644
index 0000000000..d904997af3
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/bitwriter.h
@@ -0,0 +1,98 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_BITWRITER_H_
+#define VPX_DSP_BITWRITER_H_
+
+#include "vpx_ports/mem.h"
+
+#include "vpx_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vpx_writer {
+  unsigned int lowvalue;  // Low end of the coding interval, not yet flushed.
+  unsigned int range;  // Current interval width.
+  int count;  // Negative until a byte is due; vpx_write() emits at >= 0.
+  unsigned int pos;  // Index in |buffer| of the next byte to write.
+  uint8_t *buffer;  // Output bytes.
+} vpx_writer;
+
+void vpx_start_encode(vpx_writer *bc, uint8_t *buffer);
+void vpx_stop_encode(vpx_writer *bc);
+
+static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
+  unsigned int split;
+  int count = br->count;
+  unsigned int range = br->range;
+  unsigned int lowvalue = br->lowvalue;
+  register int shift;
+  // Encodes |bit|; |probability| (out of 256) sizes the share given to a 0.
+  split = 1 + (((range - 1) * probability) >> 8);
+
+  range = split;
+
+  if (bit) {  // A 1 takes the rest of the range, above |split|.
+    lowvalue += split;
+    range = br->range - split;
+  }
+
+  shift = vpx_norm[range];  // Shift looked up for the new range.
+
+  range <<= shift;
+  count += shift;
+
+  if (count >= 0) {  // Enough bits accumulated: emit one byte.
+    int offset = shift - count;
+
+    if ((lowvalue << (offset - 1)) & 0x80000000) {  // Carry out of lowvalue.
+      int x = br->pos - 1;
+
+      while (x >= 0 && br->buffer[x] == 0xff) {  // 0xff + 1 wraps; carry on.
+        br->buffer[x] = 0;
+        x--;
+      }
+
+      br->buffer[x] += 1;  // NOTE(review): x is -1 if the loop ran off the start.
+    }
+
+    br->buffer[br->pos++] = (lowvalue >> (24 - offset));
+    lowvalue <<= offset;
+    shift = count;
+    lowvalue &= 0xffffff;  // Drop the byte that was just written out.
+    count -= 8;
+  }
+
+  lowvalue <<= shift;
+  br->count = count;
+  br->lowvalue = lowvalue;
+  br->range = range;
+}
+
+static INLINE void vpx_write_bit(vpx_writer *w, int bit) {
+  vpx_write(w, bit, 128);  // vpx_prob_half: probability 128 out of 256
+}
+
+// Writes the low |bits| bits of |data|, most significant first.
+static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) {
+  int pos = bits;
+  while (--pos >= 0)
+    vpx_write_bit(w, (data >> pos) & 1);
+}
+
+#define vpx_write_prob(w, v) vpx_write_literal((w), (v), 8)
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_BITWRITER_H_
diff --git a/libs/libvpx/vpx_dsp/bitwriter_buffer.c b/libs/libvpx/vpx_dsp/bitwriter_buffer.c
new file mode 100644
index 0000000000..6182a72221
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/bitwriter_buffer.c
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./bitwriter_buffer.h"
+
+size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb) {
+  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT != 0);  // ceil
+}
+
+// Stores |bit| at the write offset, MSB first; a byte's first bit resets it.
+void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) {
+  const int offset = (int)wb->bit_offset;
+  const int shift = CHAR_BIT - 1 - offset % CHAR_BIT;
+  uint8_t *const byte = &wb->bit_buffer[offset / CHAR_BIT];
+  if (shift == CHAR_BIT - 1) {
+    *byte = bit << shift;
+  } else {
+    *byte = (*byte & ~(1 << shift)) | (bit << shift);
+  }
+  wb->bit_offset = offset + 1;
+}
+
+// Writes the low |bits| bits of |data|, most significant first.
+void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) {
+  int pos = bits;
+  while (--pos >= 0) vpx_wb_write_bit(wb, (data >> pos) & 1);
+}
+
+void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb,
+                                     int data, int bits) {
+#if CONFIG_MISC_FIXES
+  vpx_wb_write_literal(wb, data, bits + 1);  // Low bits + 1 bits of |data|.
+#else
+  vpx_wb_write_literal(wb, abs(data), bits);  // Magnitude first...
+  vpx_wb_write_bit(wb, data < 0);  // ...then the sign bit (1 = negative).
+#endif
+}
diff --git a/libs/libvpx/vpx_dsp/bitwriter_buffer.h b/libs/libvpx/vpx_dsp/bitwriter_buffer.h
new file mode 100644
index 0000000000..a123a2fe8c
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/bitwriter_buffer.h
@@ -0,0 +1,38 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_BITWRITER_BUFFER_H_
+#define VPX_DSP_BITWRITER_BUFFER_H_
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct vpx_write_bit_buffer {
+  uint8_t *bit_buffer;  // Destination bytes.
+  size_t bit_offset;  // Bits written so far; the next bit lands here.
+};
+
+size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb);
+
+void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit);
+
+void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits);
+
+void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data,
+                                     int bits);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_BITWRITER_BUFFER_H_
diff --git a/libs/libvpx/vpx_dsp/fastssim.c b/libs/libvpx/vpx_dsp/fastssim.c
new file mode 100644
index 0000000000..1405a30e00
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/fastssim.c
@@ -0,0 +1,468 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This code was originally written by: Nathan E. Egge, at the Daala
+ *  project.
+ */
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_ports/system_state.h"
+/* TODO(jbb): High bit depth version of this code needed */
+typedef struct fs_level fs_level;
+typedef struct fs_ctx fs_ctx;
+
+#define SSIM_C1 (255 * 255 * 0.01 * 0.01)  /* (0.01 * 255)^2 */
+#define SSIM_C2 (255 * 255 * 0.03 * 0.03)  /* (0.03 * 255)^2 */
+
+#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))  /* min; evaluates twice */
+#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))  /* max; evaluates twice */
+
+struct fs_level {
+  uint16_t *im1;  /* First image at this level, w * h samples. */
+  uint16_t *im2;  /* Second image; fs_ctx_init places it right after im1. */
+  double *ssim;  /* Per-sample score map, w * h entries. */
+  int w;
+  int h;
+};
+
+struct fs_ctx {
+  fs_level *level;  /* nlevels entries; also the base of the single malloc. */
+  int nlevels;
+  unsigned *col_buf;  /* Scratch used by the luminance and structure passes. */
+};
+
+static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
+  unsigned char *data;
+  size_t data_size;
+  int lw;
+  int lh;
+  int l;
+  lw = (_w + 1) >> 1;  /* Level 0 is half the input size, rounded up. */
+  lh = (_h + 1) >> 1;
+  data_size = _nlevels * sizeof(fs_level)
+      + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);  /* Level array + col_buf. */
+  for (l = 0; l < _nlevels; l++) {  /* Pass 1: add up the bytes per level. */
+    size_t im_size;
+    size_t level_size;
+    im_size = lw * (size_t) lh;
+    level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);  /* im1 + im2. */
+    level_size += sizeof(*_ctx->level[l].ssim) - 1;  /* Round up to whole... */
+    level_size /= sizeof(*_ctx->level[l].ssim);  /* ...doubles. */
+    level_size += im_size;  /* Plus one double per sample for ssim. */
+    level_size *= sizeof(*_ctx->level[l].ssim);
+    data_size += level_size;
+    lw = (lw + 1) >> 1;  /* Each level halves the previous one, rounding up. */
+    lh = (lh + 1) >> 1;
+  }
+  data = (unsigned char *) malloc(data_size);  /* NOTE(review): unchecked. */
+  _ctx->level = (fs_level *) data;
+  _ctx->nlevels = _nlevels;
+  data += _nlevels * sizeof(*_ctx->level);
+  lw = (_w + 1) >> 1;
+  lh = (_h + 1) >> 1;
+  for (l = 0; l < _nlevels; l++) {  /* Pass 2: carve the block into levels. */
+    size_t im_size;
+    size_t level_size;
+    _ctx->level[l].w = lw;
+    _ctx->level[l].h = lh;
+    im_size = lw * (size_t) lh;
+    level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+    level_size += sizeof(*_ctx->level[l].ssim) - 1;
+    level_size /= sizeof(*_ctx->level[l].ssim);
+    level_size *= sizeof(*_ctx->level[l].ssim);  /* Images, padded to doubles. */
+    _ctx->level[l].im1 = (uint16_t *) data;
+    _ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
+    data += level_size;
+    _ctx->level[l].ssim = (double *) data;
+    data += im_size * sizeof(*_ctx->level[l].ssim);
+    lw = (lw + 1) >> 1;
+    lh = (lh + 1) >> 1;
+  }
+  _ctx->col_buf = (unsigned *) data;  /* Whatever remains is the scratch area. */
+}
+
+static void fs_ctx_clear(fs_ctx *_ctx) {
+  free(_ctx->level);  /* level is the base of fs_ctx_init's single malloc. */
+}
+
+/*Fills level _l with the 2x2 box sums (not averages) of level _l - 1.
+  Odd source dimensions reuse the last row/column for the missing one.*/
+static void fs_downsample_level(fs_ctx *_ctx, int _l) {
+  const uint16_t *src1;
+  const uint16_t *src2;
+  uint16_t *dst1;
+  uint16_t *dst2;
+  int w2;
+  int h2;
+  int w;
+  int h;
+  int i;
+  int j;
+  w = _ctx->level[_l].w;
+  h = _ctx->level[_l].h;
+  dst1 = _ctx->level[_l].im1;
+  dst2 = _ctx->level[_l].im2;
+  w2 = _ctx->level[_l - 1].w;
+  h2 = _ctx->level[_l - 1].h;
+  src1 = _ctx->level[_l - 1].im1;
+  src2 = _ctx->level[_l - 1].im2;
+  for (j = 0; j < h; j++) {
+    const int j0offs = 2 * j * w2;
+    /*Clamp to the last valid row, h2 - 1; clamping to h2 read one row past
+      the end of the image whenever h2 was odd.*/
+    const int j1offs = FS_MINI(2 * j + 1, h2 - 1) * w2;
+    for (i = 0; i < w; i++) {
+      const int i0 = 2 * i;
+      const int i1 = FS_MINI(i0 + 1, w2 - 1); /*Same clamp for columns.*/
+      dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1]
+          + src1[j1offs + i0] + src1[j1offs + i1];
+      dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1]
+          + src2[j1offs + i0] + src2[j1offs + i1];
+    }
+  }
+}
+
+/*Fills level 0 with the 2x2 box sums of the two 8-bit source planes, each
+  _w x _h samples with its own row stride.
+  j1/i1 are clamped to the last valid row/column (_h - 1, _w - 1); clamping
+  to _h/_w sampled outside the _w x _h area for odd dimensions.*/
+static void fs_downsample_level0(fs_ctx *_ctx, const unsigned char *_src1,
+                                 int _s1ystride, const unsigned char *_src2,
+                                 int _s2ystride, int _w, int _h) {
+  uint16_t *dst1;
+  uint16_t *dst2;
+  int w;
+  int h;
+  int i;
+  int j;
+  w = _ctx->level[0].w;
+  h = _ctx->level[0].h;
+  dst1 = _ctx->level[0].im1;
+  dst2 = _ctx->level[0].im2;
+  for (j = 0; j < h; j++) {
+    const int j0 = 2 * j;
+    const int j1 = FS_MINI(j0 + 1, _h - 1);
+    for (i = 0; i < w; i++) {
+      const int i0 = 2 * i;
+      const int i1 = FS_MINI(i0 + 1, _w - 1);
+      dst1[j * w + i] = _src1[j0 * _s1ystride + i0]
+          + _src1[j0 * _s1ystride + i1] + _src1[j1 * _s1ystride + i0]
+          + _src1[j1 * _s1ystride + i1];
+      dst2[j * w + i] = _src2[j0 * _s2ystride + i0]
+          + _src2[j0 * _s2ystride + i1] + _src2[j1 * _s2ystride + i0]
+          + _src2[j1 * _s2ystride + i1];
+    }
+  }
+}
+
+/*Multiplies every ssim[] entry of level _l by a luminance term built from
+  windowed sums (mux, muy) of im1 and im2, kept up to date incrementally.*/
+static void fs_apply_luminance(fs_ctx *_ctx, int _l) {
+  unsigned *col_sums_x;
+  unsigned *col_sums_y;
+  uint16_t *im1;
+  uint16_t *im2;
+  double *ssim;
+  double c1;
+  int w;
+  int h;
+  int j0offs, j1offs;
+  int i, j;
+  w = _ctx->level[_l].w;
+  h = _ctx->level[_l].h;
+  col_sums_x = _ctx->col_buf;
+  col_sums_y = col_sums_x + w;
+  im1 = _ctx->level[_l].im1;
+  im2 = _ctx->level[_l].im2;
+  for (i = 0; i < w; i++)
+    col_sums_x[i] = 5 * im1[i];
+  for (i = 0; i < w; i++)
+    col_sums_y[i] = 5 * im2[i];
+  for (j = 1; j < 4; j++) {
+    j1offs = FS_MINI(j, h - 1) * w;
+    for (i = 0; i < w; i++)
+      col_sums_x[i] += im1[j1offs + i];
+    for (i = 0; i < w; i++)
+      col_sums_y[i] += im2[j1offs + i];
+  }
+  ssim = _ctx->level[_l].ssim;
+  c1 = (double) (SSIM_C1 * 4096 * (1 << 4 * _l));
+  for (j = 0; j < h; j++) {
+    unsigned mux;
+    unsigned muy;
+    int i0;
+    int i1;
+    mux = 5 * col_sums_x[0];
+    muy = 5 * col_sums_y[0];
+    for (i = 1; i < 4; i++) {
+      i1 = FS_MINI(i, w - 1);
+      mux += col_sums_x[i1];
+      muy += col_sums_y[i1];
+    }
+    for (i = 0; i < w; i++) {
+      ssim[j * w + i] *= (2 * mux * (double) muy + c1)
+          / (mux * (double) mux + muy * (double) muy + c1);
+      if (i + 1 < w) {
+        i0 = FS_MAXI(0, i - 4);
+        i1 = FS_MINI(i + 4, w - 1);
+        mux += col_sums_x[i1] - col_sums_x[i0];
+        muy += col_sums_y[i1] - col_sums_y[i0]; /*Slide muy over the y sums.*/
+      }
+    }
+    if (j + 1 < h) {
+      j0offs = FS_MAXI(0, j - 4) * w;
+      for (i = 0; i < w; i++)
+        col_sums_x[i] -= im1[j0offs + i];
+      for (i = 0; i < w; i++)
+        col_sums_y[i] -= im2[j0offs + i];
+      j1offs = FS_MINI(j + 4, h - 1) * w;
+      for (i = 0; i < w; i++)
+        col_sums_x[i] += im1[j1offs + i];
+      for (i = 0; i < w; i++)
+        col_sums_y[i] += im2[j1offs + i];
+    }
+  }
+}
+
+#define FS_COL_SET(_col, _joffs, _ioffs) /*sums[_col] = cell products*/ \
+  do { \
+    unsigned gx; \
+    unsigned gy; \
+    gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+    gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+    col_sums_gx2[(_col)] = gx * (double)gx; \
+    col_sums_gy2[(_col)] = gy * (double)gy; \
+    col_sums_gxgy[(_col)] = gx * (double)gy; \
+  } \
+  while (0)
+
+#define FS_COL_ADD(_col, _joffs, _ioffs) /*sums[_col] += cell products*/ \
+  do { \
+    unsigned gx; \
+    unsigned gy; \
+    gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+    gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+    col_sums_gx2[(_col)] += gx * (double)gx; \
+    col_sums_gy2[(_col)] += gy * (double)gy; \
+    col_sums_gxgy[(_col)] += gx * (double)gy; \
+  } \
+  while (0)
+
+#define FS_COL_SUB(_col, _joffs, _ioffs) /*sums[_col] -= cell products*/ \
+  do { \
+    unsigned gx; \
+    unsigned gy; \
+    gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+    gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+    col_sums_gx2[(_col)] -= gx * (double)gx; \
+    col_sums_gy2[(_col)] -= gy * (double)gy; \
+    col_sums_gxgy[(_col)] -= gx * (double)gy; \
+  } \
+  while (0)
+
+#define FS_COL_COPY(_col1, _col2) /*sums[_col1] = sums[_col2]*/ \
+  do { \
+    col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \
+    col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \
+    col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
+  } \
+  while (0)
+
+#define FS_COL_HALVE(_col1, _col2) /*sums[_col1] = sums[_col2] / 2*/ \
+  do { \
+    col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \
+    col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \
+    col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
+  } \
+  while (0)
+
+#define FS_COL_DOUBLE(_col1, _col2) /*sums[_col1] = sums[_col2] * 2*/ \
+  do { \
+    col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \
+    col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \
+    col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
+  } \
+  while (0)
+
+static void fs_calc_structure(fs_ctx *_ctx, int _l) { /*Fills ssim[] of _l.*/
+  uint16_t *im1;
+  uint16_t *im2;
+  unsigned *gx_buf;
+  unsigned *gy_buf;
+  double *ssim;
+  double col_sums_gx2[8];
+  double col_sums_gy2[8];
+  double col_sums_gxgy[8];
+  double c2;
+  int stride;
+  int w;
+  int h;
+  int i;
+  int j;
+  w = _ctx->level[_l].w;
+  h = _ctx->level[_l].h;
+  im1 = _ctx->level[_l].im1;
+  im2 = _ctx->level[_l].im2;
+  ssim = _ctx->level[_l].ssim;
+  gx_buf = _ctx->col_buf;
+  stride = w + 8;  /* Gradient rows are w wide plus 8 padding columns. */
+  gy_buf = gx_buf + 8 * stride;  /* Each buffer is a ring of 8 rows (j & 7). */
+  memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf));  /* Clears both rings. */
+  c2 = SSIM_C2 * (1 << 4 * _l) * 16 * 104;
+  for (j = 0; j < h + 4; j++) {  /* Gradients run 4 rows ahead of the output. */
+    if (j < h - 1) {  /* Gradient row j needs image rows j and j + 1. */
+      for (i = 0; i < w - 1; i++) {
+        unsigned g1;
+        unsigned g2;
+        unsigned gx;
+        unsigned gy;
+        g1 = abs(im1[(j + 1) * w + i + 1] - im1[j * w + i]);  /* Diagonal. */
+        g2 = abs(im1[(j + 1) * w + i] - im1[j * w + i + 1]);  /* Anti-diagonal. */
+        gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+        g1 = abs(im2[(j + 1) * w + i + 1] - im2[j * w + i]);
+        g2 = abs(im2[(j + 1) * w + i] - im2[j * w + i + 1]);
+        gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+        gx_buf[(j & 7) * stride + i + 4] = gx;  /* +4 skips the left padding. */
+        gy_buf[(j & 7) * stride + i + 4] = gy;
+      }
+    } else {  /* No gradient exists for this row: store zeros instead. */
+      memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf));
+      memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf));
+    }
+    if (j >= 4) {  /* Produce output row j - 4. */
+      int k;
+      col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0;
+      col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0;
+      col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] =
+          col_sums_gxgy[0] = 0;
+      for (i = 4; i < 8; i++) {  /* Seed columns 4..7 for the first pixel. */
+        FS_COL_SET(i, -1, 0);
+        FS_COL_ADD(i, 0, 0);
+        for (k = 1; k < 8 - i; k++) {
+          FS_COL_DOUBLE(i, i);
+          FS_COL_ADD(i, -k - 1, 0);
+          FS_COL_ADD(i, k, 0);
+        }
+      }
+      for (i = 0; i < w; i++) {
+        double mugx2;
+        double mugy2;
+        double mugxgy;
+        mugx2 = col_sums_gx2[0];
+        for (k = 1; k < 8; k++)
+          mugx2 += col_sums_gx2[k];
+        mugy2 = col_sums_gy2[0];
+        for (k = 1; k < 8; k++)
+          mugy2 += col_sums_gy2[k];
+        mugxgy = col_sums_gxgy[0];
+        for (k = 1; k < 8; k++)
+          mugxgy += col_sums_gxgy[k];
+        ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
+        if (i + 1 < w) {  /* Update the 8 column sums for the next pixel. */
+          FS_COL_SET(0, -1, 1);
+          FS_COL_ADD(0, 0, 1);
+          FS_COL_SUB(2, -3, 2);
+          FS_COL_SUB(2, 2, 2);
+          FS_COL_HALVE(1, 2);
+          FS_COL_SUB(3, -4, 3);
+          FS_COL_SUB(3, 3, 3);
+          FS_COL_HALVE(2, 3);
+          FS_COL_COPY(3, 4);
+          FS_COL_DOUBLE(4, 5);
+          FS_COL_ADD(4, -4, 5);
+          FS_COL_ADD(4, 3, 5);
+          FS_COL_DOUBLE(5, 6);
+          FS_COL_ADD(5, -3, 6);
+          FS_COL_ADD(5, 2, 6);
+          FS_COL_DOUBLE(6, 7);
+          FS_COL_ADD(6, -2, 7);
+          FS_COL_ADD(6, 1, 7);
+          FS_COL_SET(7, -1, 8);
+          FS_COL_ADD(7, 0, 8);
+        }
+      }
+    }
+  }
+}
+
+#define FS_NLEVELS (4)
+
+/*These weights were derived from the default weights found in Wang's original
+ Matlab implementation: {0.0448, 0.2856, 0.3001, 0.2363, 0.1333}.
+ We drop the finest scale and renormalize the rest to sum to 1.*/
+
+static const double FS_WEIGHTS[FS_NLEVELS] = {0.2989654541015625,
+    0.3141326904296875, 0.2473602294921875, 0.1395416259765625};
+
+/*Mean of level _l's ssim map, raised to that level's FS_WEIGHTS exponent.*/
+static double fs_average(fs_ctx *_ctx, int _l) {
+  const fs_level *level;
+  const double *row;
+  double total;
+  int x;
+  int y;
+  level = &_ctx->level[_l];
+  total = 0;
+  for (y = 0; y < level->h; y++) {
+    row = level->ssim + y * level->w;
+    for (x = 0; x < level->w; x++)
+      total += row[x];
+  }
+  return pow(total / (level->w * level->h), FS_WEIGHTS[_l]);
+}
+
+/*Multi-scale score of one plane; luminance is applied at the last level.*/
+static double calc_ssim(const unsigned char *_src, int _systride,
+                 const unsigned char *_dst, int _dystride, int _w, int _h) {
+  fs_ctx ctx;
+  double score = 1;
+  int level;
+  fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
+  fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h);
+  for (level = 0;; level++) {
+    fs_calc_structure(&ctx, level);
+    if (level == FS_NLEVELS - 1) break;
+    score *= fs_average(&ctx, level);
+    fs_downsample_level(&ctx, level + 1);
+  }
+  fs_apply_luminance(&ctx, level);
+  score *= fs_average(&ctx, level);
+  fs_ctx_clear(&ctx);
+  return score;
+}
+
+static double convert_ssim_db(double _ssim, double _weight) {
+  return 10 * (log10(_weight) - log10(_weight - _ssim));  /* Decibel scale. */
+}
+
+/*Scores each plane with calc_ssim(), stores the three results, and returns
+  their 0.8/0.1/0.1 weighted combination converted by convert_ssim_db().*/
+double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *dest,
+                         double *ssim_y, double *ssim_u, double *ssim_v) {
+  double combined;
+  vpx_clear_system_state();
+
+  *ssim_y = calc_ssim(source->y_buffer, source->y_stride,
+                      dest->y_buffer, dest->y_stride,
+                      source->y_crop_width, source->y_crop_height);
+  *ssim_u = calc_ssim(source->u_buffer, source->uv_stride,
+                      dest->u_buffer, dest->uv_stride,
+                      source->uv_crop_width, source->uv_crop_height);
+  *ssim_v = calc_ssim(source->v_buffer, source->uv_stride,
+                      dest->v_buffer, dest->uv_stride,
+                      source->uv_crop_width, source->uv_crop_height);
+
+  combined = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
+  return convert_ssim_db(combined, 1.0);
+}
diff --git a/libs/libvpx/vpx_dsp/fwd_txfm.c b/libs/libvpx/vpx_dsp/fwd_txfm.c
new file mode 100644
index 0000000000..7baaa8b0d0
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/fwd_txfm.c
@@ -0,0 +1,822 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/fwd_txfm.h"
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  // 4x4 forward transform: reads a 4x4 block from `input` (rows are
+  // `stride` elements apart) and writes 16 coefficients to `output`.
+  // It runs two similar passes. Pass 0 transforms each input column and
+  // stores the result as a row of `intermediate` (i.e. transposed); pass 1
+  // does the same from `intermediate` into `output`, which puts the data
+  // back in normal/row positions. A final loop applies (x + 1) >> 2.
+  int pass;
+  // We need an intermediate buffer between passes.
+  tran_low_t intermediate[4 * 4];
+  const int16_t *in_pass0 = input;
+  const tran_low_t *in = NULL;
+  tran_low_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    tran_high_t in_high[4];    // canbe16
+    tran_high_t step[4];       // canbe16
+    tran_high_t temp1, temp2;  // needs32
+    int i;
+    for (i = 0; i < 4; ++i) {
+      // Load inputs; advance only the active pointer (never the NULL `in`).
+      if (0 == pass) {
+        in_high[0] = in_pass0[0 * stride] * 16;
+        in_high[1] = in_pass0[1 * stride] * 16;
+        in_high[2] = in_pass0[2 * stride] * 16;
+        in_high[3] = in_pass0[3 * stride] * 16;
+        if (i == 0 && in_high[0]) {
+          in_high[0] += 1;
+        }
+        in_pass0++;
+      } else {
+        in_high[0] = in[0 * 4];
+        in_high[1] = in[1 * 4];
+        in_high[2] = in[2 * 4];
+        in_high[3] = in[3 * 4];
+        in++;
+      }
+      // Transform.
+      step[0] = in_high[0] + in_high[3];
+      step[1] = in_high[1] + in_high[2];
+      step[2] = in_high[1] - in_high[2];
+      step[3] = in_high[0] - in_high[3];
+      temp1 = (step[0] + step[1]) * cospi_16_64;
+      temp2 = (step[0] - step[1]) * cospi_16_64;
+      out[0] = (tran_low_t)fdct_round_shift(temp1);
+      out[2] = (tran_low_t)fdct_round_shift(temp2);
+      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+      out[1] = (tran_low_t)fdct_round_shift(temp1);
+      out[3] = (tran_low_t)fdct_round_shift(temp2);
+      // Next column (a transposed row in the second pass) fills the next row.
+      out += 4;
+    }
+    // Setup in/out for next pass.
+    in = intermediate;
+    out = output;
+  }
+
+  {
+    int i, j;
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+    }
+  }
+}
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  // Only output[0] (twice the sum of the 4x4 block) is computed; output[1]
+  // is zeroed. `* 2` replaces `<< 1`, which is undefined for negative sums.
+  int r, c;
+  tran_low_t sum = 0;
+  for (r = 0; r < 4; ++r)
+    for (c = 0; c < 4; ++c) sum += input[r * stride + c];
+  output[0] = sum * 2;
+  output[1] = 0;
+}
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
+  // 8x8 forward transform of the block at `input` (rows `stride` elements
+  // apart) into the 64 coefficients of `final_output`.
+  int i, j;
+  tran_low_t intermediate[64];
+  int pass;
+  tran_low_t *output = intermediate;
+  const tran_low_t *in = NULL;
+
+  // Pass 0: input columns -> rows of intermediate; pass 1: same into output.
+  for (pass = 0; pass < 2; ++pass) {
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
+    for (i = 0; i < 8; i++) {
+      // stage 1
+      if (pass == 0) {
+        s0 = (input[0 * stride] + input[7 * stride]) * 4;
+        s1 = (input[1 * stride] + input[6 * stride]) * 4;
+        s2 = (input[2 * stride] + input[5 * stride]) * 4;
+        s3 = (input[3 * stride] + input[4 * stride]) * 4;
+        s4 = (input[3 * stride] - input[4 * stride]) * 4;
+        s5 = (input[2 * stride] - input[5 * stride]) * 4;
+        s6 = (input[1 * stride] - input[6 * stride]) * 4;
+        s7 = (input[0 * stride] - input[7 * stride]) * 4;
+        ++input;
+      } else {
+        s0 = in[0 * 8] + in[7 * 8];
+        s1 = in[1 * 8] + in[6 * 8];
+        s2 = in[2 * 8] + in[5 * 8];
+        s3 = in[3 * 8] + in[4 * 8];
+        s4 = in[3 * 8] - in[4 * 8];
+        s5 = in[2 * 8] - in[5 * 8];
+        s6 = in[1 * 8] - in[6 * 8];
+        s7 = in[0 * 8] - in[7 * 8];
+        ++in;
+      }
+
+      // Even part: 4-point transform of s0..s3 -> outputs 0, 2, 4, 6.
+      x0 = s0 + s3;
+      x1 = s1 + s2;
+      x2 = s1 - s2;
+      x3 = s0 - s3;
+      t0 = (x0 + x1) * cospi_16_64;
+      t1 = (x0 - x1) * cospi_16_64;
+      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+      output[0] = (tran_low_t)fdct_round_shift(t0);
+      output[2] = (tran_low_t)fdct_round_shift(t2);
+      output[4] = (tran_low_t)fdct_round_shift(t1);
+      output[6] = (tran_low_t)fdct_round_shift(t3);
+
+      // Stage 2 (odd part, from s4..s7 -> outputs 1, 3, 5, 7)
+      t0 = (s6 - s5) * cospi_16_64;
+      t1 = (s6 + s5) * cospi_16_64;
+      t2 = fdct_round_shift(t0);
+      t3 = fdct_round_shift(t1);
+
+      // Stage 3
+      x0 = s4 + t2;
+      x1 = s4 - t2;
+      x2 = s7 - t3;
+      x3 = s7 + t3;
+
+      // Stage 4
+      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+      output[1] = (tran_low_t)fdct_round_shift(t0);
+      output[3] = (tran_low_t)fdct_round_shift(t2);
+      output[5] = (tran_low_t)fdct_round_shift(t1);
+      output[7] = (tran_low_t)fdct_round_shift(t3);
+      output += 8;
+    }
+    in  = intermediate;
+    output = final_output;
+  }
+
+  // Halve every coefficient (C integer division, truncating toward zero).
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      final_output[j + i * 8] /= 2;
+  }
+}
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  // Only output[0] (sum of the 8x8 block) is computed; output[1] is zeroed.
+  int row, col;
+  tran_low_t total = 0;
+  for (row = 0; row < 8; ++row) {
+    for (col = 0; col < 8; ++col) total += input[row * stride + col];
+  }
+  output[0] = total;
+  output[1] = 0;
+}
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
+  // 16x16 forward transform: reads the block at `input` (rows `stride`
+  // elements apart) and writes 256 coefficients to `output`. Two similar
+  // passes are used: pass 0 transforms each input column and stores it as
+  // a row of `intermediate`; pass 1 does the same from `intermediate` into
+  // `output`. Each source pointer is advanced only inside its own pass, so
+  // `in` is never offset while it is still NULL.
+  int pass;
+  // We need an intermediate buffer between passes.
+  tran_low_t intermediate[256];
+  const int16_t *in_pass0 = input;
+  const tran_low_t *in = NULL;
+  tran_low_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    tran_high_t step1[8];      // canbe16
+    tran_high_t step2[8];      // canbe16
+    tran_high_t step3[8];      // canbe16
+    tran_high_t in_high[8];    // canbe16
+    tran_high_t temp1, temp2;  // needs32
+    int i;
+    for (i = 0; i < 16; i++) {
+      if (0 == pass) {
+        // Calculate input for the first 8 results.
+        in_high[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
+        in_high[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
+        in_high[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
+        in_high[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
+        in_high[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
+        in_high[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
+        in_high[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
+        in_high[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
+        // Calculate input for the next 8 results.
+        step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
+        step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
+        step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
+        step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
+        step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
+        step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
+        step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
+        step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
+        in_pass0++;
+      } else {
+        // Calculate input for the first 8 results.
+        in_high[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
+        in_high[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
+        in_high[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
+        in_high[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
+        in_high[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
+        in_high[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
+        in_high[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
+        in_high[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
+        // Calculate input for the next 8 results.
+        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
+        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
+        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
+        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
+        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
+        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
+        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
+        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
+        in++;
+      }
+      // Work on the first eight values; fdct8(input, even_results);
+      {
+        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+        tran_high_t t0, t1, t2, t3;                  // needs32
+        tran_high_t x0, x1, x2, x3;                  // canbe16
+
+        // stage 1
+        s0 = in_high[0] + in_high[7];
+        s1 = in_high[1] + in_high[6];
+        s2 = in_high[2] + in_high[5];
+        s3 = in_high[3] + in_high[4];
+        s4 = in_high[3] - in_high[4];
+        s5 = in_high[2] - in_high[5];
+        s6 = in_high[1] - in_high[6];
+        s7 = in_high[0] - in_high[7];
+
+        // fdct4(step, step);
+        x0 = s0 + s3;
+        x1 = s1 + s2;
+        x2 = s1 - s2;
+        x3 = s0 - s3;
+        t0 = (x0 + x1) * cospi_16_64;
+        t1 = (x0 - x1) * cospi_16_64;
+        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
+        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+        out[0] = (tran_low_t)fdct_round_shift(t0);
+        out[4] = (tran_low_t)fdct_round_shift(t2);
+        out[8] = (tran_low_t)fdct_round_shift(t1);
+        out[12] = (tran_low_t)fdct_round_shift(t3);
+
+        // Stage 2
+        t0 = (s6 - s5) * cospi_16_64;
+        t1 = (s6 + s5) * cospi_16_64;
+        t2 = fdct_round_shift(t0);
+        t3 = fdct_round_shift(t1);
+
+        // Stage 3
+        x0 = s4 + t2;
+        x1 = s4 - t2;
+        x2 = s7 - t3;
+        x3 = s7 + t3;
+
+        // Stage 4
+        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+        out[2] = (tran_low_t)fdct_round_shift(t0);
+        out[6] = (tran_low_t)fdct_round_shift(t2);
+        out[10] = (tran_low_t)fdct_round_shift(t1);
+        out[14] = (tran_low_t)fdct_round_shift(t3);
+      }
+      // Work on the next eight values; step1 -> odd_results
+      {
+        // step 2
+        temp1 = (step1[5] - step1[2]) * cospi_16_64;
+        temp2 = (step1[4] - step1[3]) * cospi_16_64;
+        step2[2] = fdct_round_shift(temp1);
+        step2[3] = fdct_round_shift(temp2);
+        temp1 = (step1[4] + step1[3]) * cospi_16_64;
+        temp2 = (step1[5] + step1[2]) * cospi_16_64;
+        step2[4] = fdct_round_shift(temp1);
+        step2[5] = fdct_round_shift(temp2);
+        // step 3
+        step3[0] = step1[0] + step2[3];
+        step3[1] = step1[1] + step2[2];
+        step3[2] = step1[1] - step2[2];
+        step3[3] = step1[0] - step2[3];
+        step3[4] = step1[7] - step2[4];
+        step3[5] = step1[6] - step2[5];
+        step3[6] = step1[6] + step2[5];
+        step3[7] = step1[7] + step2[4];
+        // step 4
+        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
+        temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
+        step2[1] = fdct_round_shift(temp1);
+        step2[2] = fdct_round_shift(temp2);
+        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
+        step2[5] = fdct_round_shift(temp1);
+        step2[6] = fdct_round_shift(temp2);
+        // step 5
+        step1[0] = step3[0] + step2[1];
+        step1[1] = step3[0] - step2[1];
+        step1[2] = step3[3] + step2[2];
+        step1[3] = step3[3] - step2[2];
+        step1[4] = step3[4] - step2[5];
+        step1[5] = step3[4] + step2[5];
+        step1[6] = step3[7] - step2[6];
+        step1[7] = step3[7] + step2[6];
+        // step 6
+        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
+        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+        out[1] = (tran_low_t)fdct_round_shift(temp1);
+        out[9] = (tran_low_t)fdct_round_shift(temp2);
+        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
+        out[5] = (tran_low_t)fdct_round_shift(temp1);
+        out[13] = (tran_low_t)fdct_round_shift(temp2);
+        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
+        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+        out[3] = (tran_low_t)fdct_round_shift(temp1);
+        out[11] = (tran_low_t)fdct_round_shift(temp2);
+        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
+        out[7] = (tran_low_t)fdct_round_shift(temp1);
+        out[15] = (tran_low_t)fdct_round_shift(temp2);
+      }
+      // Next column (a transposed row in the second pass) fills the next row.
+      out += 16;
+    }
+    // Setup in/out for next pass.
+    in = intermediate;
+    out = output;
+  }
+}
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  // Writes (sum of the 16x16 block) >> 1 to output[0] and 0 to output[1].
+  int y, x;
+  tran_low_t acc = 0;
+  for (y = 0; y < 16; ++y)
+    for (x = 0; x < 16; ++x) acc += input[y * stride + x];
+
+  output[0] = acc >> 1;
+  output[1] = 0;
+}
+
+static INLINE tran_high_t dct_32_round(tran_high_t input) {
+  // Applies ROUND_POWER_OF_TWO with DCT_CONST_BITS to `input`.
+  // TODO(debargha, peter.derivaz): Find new bounds for this assert,
+  // and make the bounds consts: assert(-131072 <= rv && rv <= 131071);
+  const tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  return rv;
+}
+
+static INLINE tran_high_t half_round_shift(tran_high_t input) {
+  // Adds 1 (2 when input is negative), then shifts right by 2.
+  return (input + 1 + (input < 0)) >> 2;
+}
+
+void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
+  // 32-point forward transform butterfly: input[0..31] -> output[0..31].
+  tran_high_t step[32];
+  // Stage 1: sums and differences of mirrored input pairs (i, 31 - i).
+  step[0] = input[0] + input[(32 - 1)];
+  step[1] = input[1] + input[(32 - 2)];
+  step[2] = input[2] + input[(32 - 3)];
+  step[3] = input[3] + input[(32 - 4)];
+  step[4] = input[4] + input[(32 - 5)];
+  step[5] = input[5] + input[(32 - 6)];
+  step[6] = input[6] + input[(32 - 7)];
+  step[7] = input[7] + input[(32 - 8)];
+  step[8] = input[8] + input[(32 - 9)];
+  step[9] = input[9] + input[(32 - 10)];
+  step[10] = input[10] + input[(32 - 11)];
+  step[11] = input[11] + input[(32 - 12)];
+  step[12] = input[12] + input[(32 - 13)];
+  step[13] = input[13] + input[(32 - 14)];
+  step[14] = input[14] + input[(32 - 15)];
+  step[15] = input[15] + input[(32 - 16)];
+  step[16] = -input[16] + input[(32 - 17)];
+  step[17] = -input[17] + input[(32 - 18)];
+  step[18] = -input[18] + input[(32 - 19)];
+  step[19] = -input[19] + input[(32 - 20)];
+  step[20] = -input[20] + input[(32 - 21)];
+  step[21] = -input[21] + input[(32 - 22)];
+  step[22] = -input[22] + input[(32 - 23)];
+  step[23] = -input[23] + input[(32 - 24)];
+  step[24] = -input[24] + input[(32 - 25)];
+  step[25] = -input[25] + input[(32 - 26)];
+  step[26] = -input[26] + input[(32 - 27)];
+  step[27] = -input[27] + input[(32 - 28)];
+  step[28] = -input[28] + input[(32 - 29)];
+  step[29] = -input[29] + input[(32 - 30)];
+  step[30] = -input[30] + input[(32 - 31)];
+  step[31] = -input[31] + input[(32 - 32)];
+
+  // Stage 2
+  output[0] = step[0] + step[16 - 1];
+  output[1] = step[1] + step[16 - 2];
+  output[2] = step[2] + step[16 - 3];
+  output[3] = step[3] + step[16 - 4];
+  output[4] = step[4] + step[16 - 5];
+  output[5] = step[5] + step[16 - 6];
+  output[6] = step[6] + step[16 - 7];
+  output[7] = step[7] + step[16 - 8];
+  output[8] = -step[8] + step[16 - 9];
+  output[9] = -step[9] + step[16 - 10];
+  output[10] = -step[10] + step[16 - 11];
+  output[11] = -step[11] + step[16 - 12];
+  output[12] = -step[12] + step[16 - 13];
+  output[13] = -step[13] + step[16 - 14];
+  output[14] = -step[14] + step[16 - 15];
+  output[15] = -step[15] + step[16 - 16];
+
+  output[16] = step[16];
+  output[17] = step[17];
+  output[18] = step[18];
+  output[19] = step[19];
+
+  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
+  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
+  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
+  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
+
+  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
+  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
+  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
+  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
+
+  output[28] = step[28];
+  output[29] = step[29];
+  output[30] = step[30];
+  output[31] = step[31];
+
+  // With `round` set, damp the magnitude by 4 (half_round_shift) so that
+  // the intermediate values stay within the range of 16 bits.
+  if (round) {
+    output[0] = half_round_shift(output[0]);
+    output[1] = half_round_shift(output[1]);
+    output[2] = half_round_shift(output[2]);
+    output[3] = half_round_shift(output[3]);
+    output[4] = half_round_shift(output[4]);
+    output[5] = half_round_shift(output[5]);
+    output[6] = half_round_shift(output[6]);
+    output[7] = half_round_shift(output[7]);
+    output[8] = half_round_shift(output[8]);
+    output[9] = half_round_shift(output[9]);
+    output[10] = half_round_shift(output[10]);
+    output[11] = half_round_shift(output[11]);
+    output[12] = half_round_shift(output[12]);
+    output[13] = half_round_shift(output[13]);
+    output[14] = half_round_shift(output[14]);
+    output[15] = half_round_shift(output[15]);
+    output[16] = half_round_shift(output[16]);
+    output[17] = half_round_shift(output[17]);
+    output[18] = half_round_shift(output[18]);
+    output[19] = half_round_shift(output[19]);
+    output[20] = half_round_shift(output[20]);
+    output[21] = half_round_shift(output[21]);
+    output[22] = half_round_shift(output[22]);
+    output[23] = half_round_shift(output[23]);
+    output[24] = half_round_shift(output[24]);
+    output[25] = half_round_shift(output[25]);
+    output[26] = half_round_shift(output[26]);
+    output[27] = half_round_shift(output[27]);
+    output[28] = half_round_shift(output[28]);
+    output[29] = half_round_shift(output[29]);
+    output[30] = half_round_shift(output[30]);
+    output[31] = half_round_shift(output[31]);
+  }
+
+  // Stage 3
+  step[0] = output[0] + output[(8 - 1)];
+  step[1] = output[1] + output[(8 - 2)];
+  step[2] = output[2] + output[(8 - 3)];
+  step[3] = output[3] + output[(8 - 4)];
+  step[4] = -output[4] + output[(8 - 5)];
+  step[5] = -output[5] + output[(8 - 6)];
+  step[6] = -output[6] + output[(8 - 7)];
+  step[7] = -output[7] + output[(8 - 8)];
+  step[8] = output[8];
+  step[9] = output[9];
+  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
+  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
+  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
+  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
+  step[14] = output[14];
+  step[15] = output[15];
+
+  step[16] = output[16] + output[23];
+  step[17] = output[17] + output[22];
+  step[18] = output[18] + output[21];
+  step[19] = output[19] + output[20];
+  step[20] = -output[20] + output[19];
+  step[21] = -output[21] + output[18];
+  step[22] = -output[22] + output[17];
+  step[23] = -output[23] + output[16];
+  step[24] = -output[24] + output[31];
+  step[25] = -output[25] + output[30];
+  step[26] = -output[26] + output[29];
+  step[27] = -output[27] + output[28];
+  step[28] = output[28] + output[27];
+  step[29] = output[29] + output[26];
+  step[30] = output[30] + output[25];
+  step[31] = output[31] + output[24];
+
+  // Stage 4
+  output[0] = step[0] + step[3];
+  output[1] = step[1] + step[2];
+  output[2] = -step[2] + step[1];
+  output[3] = -step[3] + step[0];
+  output[4] = step[4];
+  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
+  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
+  output[7] = step[7];
+  output[8] = step[8] + step[11];
+  output[9] = step[9] + step[10];
+  output[10] = -step[10] + step[9];
+  output[11] = -step[11] + step[8];
+  output[12] = -step[12] + step[15];
+  output[13] = -step[13] + step[14];
+  output[14] = step[14] + step[13];
+  output[15] = step[15] + step[12];
+
+  output[16] = step[16];
+  output[17] = step[17];
+  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
+  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
+  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
+  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
+  output[22] = step[22];
+  output[23] = step[23];
+  output[24] = step[24];
+  output[25] = step[25];
+  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
+  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
+  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
+  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
+  output[30] = step[30];
+  output[31] = step[31];
+
+  // Stage 5
+  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
+  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
+  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
+  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
+  step[4] = output[4] + output[5];
+  step[5] = -output[5] + output[4];
+  step[6] = -output[6] + output[7];
+  step[7] = output[7] + output[6];
+  step[8] = output[8];
+  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
+  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
+  step[11] = output[11];
+  step[12] = output[12];
+  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
+  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
+  step[15] = output[15];
+
+  step[16] = output[16] + output[19];
+  step[17] = output[17] + output[18];
+  step[18] = -output[18] + output[17];
+  step[19] = -output[19] + output[16];
+  step[20] = -output[20] + output[23];
+  step[21] = -output[21] + output[22];
+  step[22] = output[22] + output[21];
+  step[23] = output[23] + output[20];
+  step[24] = output[24] + output[27];
+  step[25] = output[25] + output[26];
+  step[26] = -output[26] + output[25];
+  step[27] = -output[27] + output[24];
+  step[28] = -output[28] + output[31];
+  step[29] = -output[29] + output[30];
+  step[30] = output[30] + output[29];
+  step[31] = output[31] + output[28];
+
+  // Stage 6
+  output[0] = step[0];
+  output[1] = step[1];
+  output[2] = step[2];
+  output[3] = step[3];
+  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
+  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
+  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
+  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
+  output[8] = step[8] + step[9];
+  output[9] = -step[9] + step[8];
+  output[10] = -step[10] + step[11];
+  output[11] = step[11] + step[10];
+  output[12] = step[12] + step[13];
+  output[13] = -step[13] + step[12];
+  output[14] = -step[14] + step[15];
+  output[15] = step[15] + step[14];
+
+  output[16] = step[16];
+  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
+  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
+  output[19] = step[19];
+  output[20] = step[20];
+  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
+  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
+  output[23] = step[23];
+  output[24] = step[24];
+  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
+  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
+  output[27] = step[27];
+  output[28] = step[28];
+  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
+  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
+  output[31] = step[31];
+
+  // Stage 7
+  step[0] = output[0];
+  step[1] = output[1];
+  step[2] = output[2];
+  step[3] = output[3];
+  step[4] = output[4];
+  step[5] = output[5];
+  step[6] = output[6];
+  step[7] = output[7];
+  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
+  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
+  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
+  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
+  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
+  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
+  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
+  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
+
+  step[16] = output[16] + output[17];
+  step[17] = -output[17] + output[16];
+  step[18] = -output[18] + output[19];
+  step[19] = output[19] + output[18];
+  step[20] = output[20] + output[21];
+  step[21] = -output[21] + output[20];
+  step[22] = -output[22] + output[23];
+  step[23] = output[23] + output[22];
+  step[24] = output[24] + output[25];
+  step[25] = -output[25] + output[24];
+  step[26] = -output[26] + output[27];
+  step[27] = output[27] + output[26];
+  step[28] = output[28] + output[29];
+  step[29] = -output[29] + output[28];
+  step[30] = -output[30] + output[31];
+  step[31] = output[31] + output[30];
+
+  // Final stage --- outputs indices are bit-reversed.
+  output[0]  = step[0];
+  output[16] = step[1];
+  output[8]  = step[2];
+  output[24] = step[3];
+  output[4]  = step[4];
+  output[20] = step[5];
+  output[12] = step[6];
+  output[28] = step[7];
+  output[2]  = step[8];
+  output[18] = step[9];
+  output[10] = step[10];
+  output[26] = step[11];
+  output[6]  = step[12];
+  output[22] = step[13];
+  output[14] = step[14];
+  output[30] = step[15];
+  // Odd-indexed outputs are rotations of the pairs (step[16 + k], step[31 - k]).
+  output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
+  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
+  output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
+  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
+  output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
+  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
+  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
+  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
+  output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
+  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
+  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
+  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
+  output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
+  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
+  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
+  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
+}
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+  // 2-D 32x32 transform: vpx_fdct32 down each column, then along each row.
+  int line, k;
+  tran_high_t stage[32 * 32];
+  tran_high_t vec_in[32], vec_out[32];
+  // Columns: samples are scaled by 4 going in; results are shifted right
+  // by 2 after adding 2 for positive values and 1 otherwise.
+  for (line = 0; line < 32; ++line) {
+    for (k = 0; k < 32; ++k) vec_in[k] = input[k * stride + line] * 4;
+    vpx_fdct32(vec_in, vec_out, 0);
+    for (k = 0; k < 32; ++k) {
+      const tran_high_t v = vec_out[k];
+      stage[k * 32 + line] = (v + 1 + (v > 0)) >> 2;
+    }
+  }
+  // Rows: same shift, but the extra 1 is added for negative values.
+  for (line = 0; line < 32; ++line) {
+    for (k = 0; k < 32; ++k) vec_in[k] = stage[k + line * 32];
+    vpx_fdct32(vec_in, vec_out, 0);
+    for (k = 0; k < 32; ++k) {
+      const tran_high_t v = vec_out[k];
+      out[k + line * 32] = (tran_low_t)((v + 1 + (v < 0)) >> 2);
+    }
+  }
+}
+
+// Note that although dct_32_round is used in the dct32 computation flow,
+// this 2-D fdct32x32 for the rate-distortion optimization loop operates
+// within 16 bits of precision.
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
+  // Column pass as in the full transform; the row pass instead calls
+  // vpx_fdct32 with round = 1 and stores its results without a final shift.
+  int line, k;
+  tran_high_t stage[32 * 32];
+  tran_high_t vec_in[32], vec_out[32];
+
+  // Columns
+  for (line = 0; line < 32; ++line) {
+    for (k = 0; k < 32; ++k) vec_in[k] = input[k * stride + line] * 4;
+    vpx_fdct32(vec_in, vec_out, 0);
+    for (k = 0; k < 32; ++k) {
+      // TODO(cd): see quality impact of only doing
+      //           stage[k * 32 + line] = (vec_out[k] + 1) >> 2;
+      //           PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c
+      const tran_high_t v = vec_out[k];
+      stage[k * 32 + line] = (v + 1 + (v > 0)) >> 2;
+    }
+  }
+
+  // Rows
+  for (line = 0; line < 32; ++line) {
+    for (k = 0; k < 32; ++k) vec_in[k] = stage[k + line * 32];
+    vpx_fdct32(vec_in, vec_out, 1);
+    for (k = 0; k < 32; ++k) out[k + line * 32] = (tran_low_t)vec_out[k];
+  }
+}
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  // Writes (sum of the 32x32 block) >> 3 to output[0] and 0 to output[1].
+  int idx;
+  tran_low_t dc = 0;
+  for (idx = 0; idx < 32 * 32; ++idx)
+    dc += input[(idx >> 5) * stride + (idx & 31)];  // row-major walk
+
+  output[0] = dc >> 3;
+  output[1] = 0;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH  // highbd entry points reuse the C versions
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  vpx_fdct4x4_c(input, output, stride);  // plain forwarder
+}
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+                          int stride) {
+  vpx_fdct8x8_c(input, final_output, stride);  // plain forwarder
+}
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+                            int stride) {
+  vpx_fdct8x8_1_c(input, final_output, stride);  // plain forwarder
+}
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
+                            int stride) {
+  vpx_fdct16x16_c(input, output, stride);  // plain forwarder
+}
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
+                              int stride) {
+  vpx_fdct16x16_1_c(input, output, stride);  // plain forwarder
+}
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+  vpx_fdct32x32_c(input, out, stride);  // plain forwarder
+}
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+                               int stride) {
+  vpx_fdct32x32_rd_c(input, out, stride);  // plain forwarder
+}
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
+                              int stride) {
+  vpx_fdct32x32_1_c(input, out, stride);  // plain forwarder
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vpx_dsp/fwd_txfm.h b/libs/libvpx/vpx_dsp/fwd_txfm.h
new file mode 100644
index 0000000000..29e139c73b
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/fwd_txfm.h
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_FWD_TXFM_H_
+#define VPX_DSP_FWD_TXFM_H_
+
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
+  // Result of ROUND_POWER_OF_TWO(input, DCT_CONST_BITS), returned unchecked.
+  // TODO(debargha, peter.derivaz): Find new bounds for this assert
+  // and make the bounds consts.
+  // assert(INT16_MIN <= result && result <= INT16_MAX);
+  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+}
+
+void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round);
+#endif  // VPX_DSP_FWD_TXFM_H_
diff --git a/libs/libvpx/vpx_dsp/intrapred.c b/libs/libvpx/vpx_dsp/intrapred.c
new file mode 100644
index 0000000000..cc4a74bd26
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/intrapred.c
@@ -0,0 +1,870 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define DST(x, y) dst[(x) + (y) * stride]  // pixel at column x, row y
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)  // rounded 1-2-1 filter
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)  // rounded mean of two values
+
+static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  // Fills a bs x bs block using only the left column; `above` is unused.
+  const int last = bs - 1;
+  int row, col;
+  (void)above;
+  // Column 0: rounded 2-tap means of vertically adjacent left pixels.
+  for (row = 0; row < last; ++row)
+    dst[row * stride] = AVG2(left[row], left[row + 1]);
+  dst[last * stride] = left[last];
+  ++dst;
+
+  // Column 1: 3-tap filtered left pixels, repeating left[last] at the end.
+  for (row = 0; row < last - 1; ++row)
+    dst[row * stride] = AVG3(left[row], left[row + 1], left[row + 2]);
+  dst[(last - 1) * stride] = AVG3(left[last - 1], left[last], left[last]);
+  dst[last * stride] = left[last];
+  ++dst;
+
+  // Rest: last row repeats left[last]; rows above copy the row below, 2 left.
+  for (col = 0; col < last - 1; ++col) dst[last * stride + col] = left[last];
+  for (row = last - 1; row >= 0; --row)
+    for (col = 0; col < last - 1; ++col)
+      dst[row * stride + col] = dst[(row + 1) * stride + col - 2];
+}
+
+#if CONFIG_MISC_FIXES
+static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                   const uint8_t *above, const uint8_t *left) {
+  // Odd columns use the 3-tap filter, even columns the 2-tap mean, both
+  // taken from `left` starting at index (col / 2) + row. `above` is unused.
+  int row, col;
+  (void)above;
+  for (row = 0; row < bs; ++row, dst += stride) {
+    for (col = 0; col < bs; ++col) {
+      const uint8_t *const p = left + (col >> 1) + row;
+      if (col & 1) dst[col] = AVG3(p[0], p[1], p[2]);
+      else dst[col] = AVG2(p[0], p[1]);
+    }
+  }
+}
+#endif  // CONFIG_MISC_FIXES
+
+static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                 const uint8_t *above, const uint8_t *left) {
+  // Rows 0/1 filter `above`; later pairs reuse them shifted left and padded.
+  int row, col, keep;
+  (void)left;
+  for (col = 0; col < bs; ++col) {
+    dst[col] = AVG2(above[col], above[col + 1]);
+    dst[stride + col] = AVG3(above[col], above[col + 1], above[col + 2]);
+  }
+  for (row = 2, keep = bs - 2; row < bs; row += 2, --keep) {
+    memcpy(dst + row * stride, dst + (row >> 1), keep);
+    memset(dst + row * stride + keep, above[bs - 1], bs - keep);
+    memcpy(dst + (row + 1) * stride, dst + stride + (row >> 1), keep);
+    memset(dst + (row + 1) * stride + keep, above[bs - 1], bs - keep);
+  }
+}
+
+#if CONFIG_MISC_FIXES
+static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  // Odd rows use the 3-tap filter, even rows the 2-tap mean, both taken
+  // from `above` starting at index (row / 2) + col. `left` is unused.
+  int row, col;
+  (void)left;
+  for (row = 0; row < bs; ++row, dst += stride) {
+    const uint8_t *const src = above + (row >> 1);
+    for (col = 0; col < bs; ++col)
+      dst[col] = (row & 1) ? AVG3(src[col], src[col + 1], src[col + 2])
+                           : AVG2(src[col], src[col + 1]);
+  }
+}
+#endif  // CONFIG_MISC_FIXES
+
+static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                 const uint8_t *above, const uint8_t *left) {
+  // Row 0 is the 3-tap filtered `above` row ending in above[bs - 1]; row r
+  // is row 0 shifted left by r pixels and right-padded with above[bs - 1].
+  const uint8_t *const first_row = dst;
+  const uint8_t pad = above[bs - 1];
+  int col, row;
+  (void)left;
+  for (col = 0; col < bs - 1; ++col)
+    dst[col] = AVG3(above[col], above[col + 1], above[col + 2]);
+  dst[bs - 1] = pad;
+  for (row = 1; row < bs; ++row) {
+    const int keep = bs - 1 - row;  // pixels still taken from row 0
+    dst += stride;
+    memcpy(dst, first_row + row, keep);
+    memset(dst + keep, pad, row + 1);
+  }
+}
+
+#if CONFIG_MISC_FIXES
+static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  // Pixel (row, col) is the 3-tap filter of above[d], above[d + 1] and
+  // above[d + 2] with d = row + col; the last tap is clamped to 2 * bs - 1.
+  int row, col;
+  (void)left;
+  for (row = 0; row < bs; ++row, dst += stride)
+    for (col = 0; col < bs; ++col) {
+      const int d = row + col, far = d + 2 < bs * 2 ? d + 2 : d + 1;
+      dst[col] = AVG3(above[d], above[d + 1], above[far]);
+    }
+}
+#endif  // CONFIG_MISC_FIXES
+
+static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  // Uses both edges; above[-1] is read as the top-left pixel.
+  int row, col;
+
+  // Row 0: 2-tap means along `above`, starting from the top-left above[-1].
+  for (col = 0; col < bs; ++col)
+    dst[col] = AVG2(above[col - 1], above[col]);
+  dst += stride;
+
+  // Row 1: 3-tap filter around the corner, then along `above`.
+  dst[0] = AVG3(left[0], above[-1], above[0]);
+  for (col = 1; col < bs; ++col)
+    dst[col] = AVG3(above[col - 2], above[col - 1], above[col]);
+  dst += stride;
+
+  // Column 0 from row 2 down: 3-tap filter running down `left`.
+  dst[0] = AVG3(above[-1], left[0], left[1]);
+  for (row = 1; row < bs - 2; ++row)
+    dst[row * stride] = AVG3(left[row - 1], left[row], left[row + 1]);
+
+  // Rest of rows 2 and below: copy from two rows up, one column to the left.
+  for (row = 2; row < bs; ++row, dst += stride) {
+    for (col = 1; col < bs; ++col)
+      dst[col] = dst[-2 * stride + col - 1];
+  }
+}
+
+static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  // Builds one filtered border (left edge, corner, top edge) and copies a
+  // sliding bs-wide window of it into each row. Reads above[-1].
+  int k;
+#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
+  // silence a spurious -Warray-bounds warning, possibly related to:
+  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
+  uint8_t border[69];
+#else
+  uint8_t border[32 + 32 - 1];  // filtered edge, bottom-left to top-right
+#endif
+  uint8_t *const corner = border + bs - 1;  // sample for the block diagonal
+  // Left part of the border: 3-tap filtered `left`, walking upwards.
+  for (k = 0; k < bs - 2; ++k)
+    border[k] = AVG3(left[bs - 3 - k], left[bs - 2 - k], left[bs - 1 - k]);
+  // Three samples around the top-left pixel above[-1].
+  corner[-1] = AVG3(above[-1], left[0], left[1]);
+  corner[0] = AVG3(left[0], above[-1], above[0]);
+  corner[1] = AVG3(above[-1], above[0], above[1]);
+  // Top part of the border: 3-tap filtered `above`, walking rightwards.
+  for (k = 0; k < bs - 2; ++k)
+    corner[2 + k] = AVG3(above[k], above[k + 1], above[k + 2]);
+
+  // Row r is a bs-wide window of the border that starts r samples earlier.
+  for (k = 0; k < bs; ++k) memcpy(dst + k * stride, corner - k, bs);
+}
+
+static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  // Uses the left column, the top-left pixel above[-1] and the `above` row.
+  int row, col;
+  // Column 0: 2-tap means down the left edge, starting at above[-1].
+  dst[0] = AVG2(above[-1], left[0]);
+  for (row = 1; row < bs; ++row)
+    dst[row * stride] = AVG2(left[row - 1], left[row]);
+  ++dst;
+  // Column 1: 3-tap filter around the corner, then down the left edge.
+  dst[0] = AVG3(left[0], above[-1], above[0]);
+  dst[stride] = AVG3(above[-1], left[0], left[1]);
+  for (row = 2; row < bs; ++row)
+    dst[row * stride] = AVG3(left[row - 2], left[row - 1], left[row]);
+  ++dst;
+  // Row 0, columns 2 and up: 3-tap filter along `above`.
+  for (col = 0; col < bs - 2; ++col)
+    dst[col] = AVG3(above[col - 1], above[col], above[col + 1]);
+  dst += stride;
+  // Remaining pixels: copy from the row above, two columns to the left.
+  for (row = 1; row < bs; ++row, dst += stride)
+    for (col = 0; col < bs - 2; ++col)
+      dst[col] = dst[-stride + col - 2];
+}
+
+static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                               const uint8_t *above, const uint8_t *left) {
+  // Vertical prediction: every row is a copy of the first bs bytes of
+  // `above`. `left` is unused.
+  int rows_left = bs;
+  (void)left;
+  for (; rows_left > 0; --rows_left, dst += stride) {
+    memcpy(dst, above, bs);
+  }
+}
+
+static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                               const uint8_t *above, const uint8_t *left) {
+  // Horizontal prediction: each row is filled with its own left neighbour,
+  // i.e. row r becomes bs copies of left[r]. `above` is unused.
+  int row;
+  (void)above;
+
+  for (row = 0; row < bs; ++row)
+    memset(dst + row * stride, left[row], bs);
+}
+
+static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                const uint8_t *above, const uint8_t *left) {
+  // Each pixel is clip_pixel(left[row] + above[col] - above[-1]).
+  const int top_left = above[-1];
+  int row, col;
+  for (row = 0; row < bs; ++row, dst += stride) {
+    const int base = left[row] - top_left;
+    for (col = 0; col < bs; ++col)
+      dst[col] = clip_pixel(base + above[col]);
+  }
+}
+
+static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                    const uint8_t *above, const uint8_t *left) {
+  // Fills the bs x bs block with the constant 128; both edges are unused.
+  int row;
+  (void)above;
+  (void)left;
+
+  for (row = 0; row < bs; ++row) {
+    memset(dst + row * stride, 128, bs);
+  }
+}
+
+static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  // Fills the block with the rounded mean of left[0 .. bs-1]; `above` is
+  // unused.
+  int k, total = 0, dc;
+  (void)above;
+
+  for (k = 0; k < bs; ++k) total += left[k];
+  dc = (total + (bs >> 1)) / bs;
+
+  for (k = 0; k < bs; ++k, dst += stride) {
+    memset(dst, dc, bs);
+  }
+}
+
+static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                    const uint8_t *above, const uint8_t *left) {
+  // Fills the block with the rounded mean of above[0 .. bs-1]; `left` is
+  // unused.
+  int k, total = 0, dc;
+  (void)left;
+
+  for (k = 0; k < bs; ++k) total += above[k];
+  dc = (total + (bs >> 1)) / bs;
+
+  for (k = 0; k < bs; ++k, dst += stride) {
+    memset(dst, dc, bs);
+  }
+}
+
+static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                const uint8_t *above, const uint8_t *left) {
+  // Fills the block with the rounded mean of the bs pixels above and the
+  // bs pixels to the left.
+  const int samples = 2 * bs;
+  int k, total = 0, dc;
+
+  for (k = 0; k < bs; ++k) {
+    total += above[k] + left[k];
+  }
+
+  dc = (total + (samples >> 1)) / samples;
+
+  for (k = 0; k < bs; ++k, dst += stride) {
+    memset(dst, dc, bs);
+  }
+}
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left) {
+  // Each row is a flat fill with the 3-tap filtered left edge; the filter
+  // starts at the top-left pixel above[-1] and repeats left[3] at the bottom.
+  const int tl = above[-1];
+  const int l0 = left[0], l1 = left[1];
+  const int l2 = left[2], l3 = left[3];
+
+  memset(dst, AVG3(tl, l0, l1), 4);
+  memset(dst + stride, AVG3(l0, l1, l2), 4);
+  memset(dst + stride * 2, AVG3(l1, l2, l3), 4);
+  memset(dst + stride * 3, AVG3(l2, l3, l3), 4);
+}
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left) {
+  // Row 0 is the 3-tap filtered `above` row (taps above[-1] .. above[4]);
+  // rows 1-3 are copies of row 0. `left` is unused.
+  const int tl = above[-1];
+  const int a0 = above[0], a1 = above[1], a2 = above[2];
+  const int a3 = above[3], a4 = above[4];
+  int row;
+  (void)left;
+
+  dst[0] = AVG3(tl, a0, a1);
+  dst[1] = AVG3(a0, a1, a2);
+  dst[2] = AVG3(a1, a2, a3);
+  dst[3] = AVG3(a2, a3, a4);
+  for (row = 1; row < 4; ++row) {
+    memcpy(dst + stride * row, dst, 4);
+  }
+}
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  // Row r = seq[2r .. 2r+3]; seq alternates 2-/3-tap filters down `left`.
+  const int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
+  uint8_t seq[10];
+  int row;
+  (void)above;
+  seq[0] = AVG2(l0, l1);
+  seq[1] = AVG3(l0, l1, l2);
+  seq[2] = AVG2(l1, l2);
+  seq[3] = AVG3(l1, l2, l3);
+  seq[4] = AVG2(l2, l3);
+  seq[5] = AVG3(l2, l3, l3);
+  seq[6] = seq[7] = seq[8] = seq[9] = l3;
+  for (row = 0; row < 4; ++row) memcpy(dst + row * stride, seq + 2 * row, 4);
+}
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  // Even rows hold 2-tap means and odd rows 3-tap filters of `above`; rows
+  // 2 and 3 repeat rows 0 and 1 shifted left by one sample. `left` is unused.
+  const int a0 = above[0], a1 = above[1], a2 = above[2], a3 = above[3];
+  const int a4 = above[4], a5 = above[5], a6 = above[6];
+  uint8_t even[5], odd[5];
+  int row;
+  (void)left;
+  even[0] = AVG2(a0, a1);
+  even[1] = AVG2(a1, a2);
+  even[2] = AVG2(a2, a3);
+  even[3] = AVG2(a3, a4);
+  even[4] = AVG2(a4, a5);  // differs from vp8
+  odd[0] = AVG3(a0, a1, a2);
+  odd[1] = AVG3(a1, a2, a3);
+  odd[2] = AVG3(a2, a3, a4);
+  odd[3] = AVG3(a3, a4, a5);
+  odd[4] = AVG3(a4, a5, a6);  // differs from vp8
+  for (row = 0; row < 4; ++row)
+    memcpy(dst + row * stride, ((row & 1) ? odd : even) + (row >> 1), 4);
+}
+
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  // Even rows hold 2-tap means and odd rows 3-tap filters of `above`; rows
+  // 2 and 3 repeat rows 0 and 1 shifted left by one sample, and their last
+  // samples are AVG3 over above[4..6] and above[5..7]. `left` is unused.
+  const int a0 = above[0], a1 = above[1], a2 = above[2], a3 = above[3];
+  const int a4 = above[4], a5 = above[5], a6 = above[6], a7 = above[7];
+  uint8_t even[5], odd[5];
+  int row;
+  (void)left;
+  even[0] = AVG2(a0, a1);
+  even[1] = AVG2(a1, a2);
+  even[2] = AVG2(a2, a3);
+  even[3] = AVG2(a3, a4);
+  even[4] = AVG3(a4, a5, a6);
+  odd[0] = AVG3(a0, a1, a2);
+  odd[1] = AVG3(a1, a2, a3);
+  odd[2] = AVG3(a2, a3, a4);
+  odd[3] = AVG3(a3, a4, a5);
+  odd[4] = AVG3(a5, a6, a7);
+  for (row = 0; row < 4; ++row)
+    memcpy(dst + row * stride, ((row & 1) ? odd : even) + (row >> 1), 4);
+}
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  // Pixel (x, y) is diag[x + y]: the 3-tap filtered `above` row along each
+  // down-left diagonal, with above[7] itself in the bottom-right corner.
+  // `left` is unused.
+  const int a0 = above[0], a1 = above[1], a2 = above[2], a3 = above[3];
+  const int a4 = above[4], a5 = above[5], a6 = above[6], a7 = above[7];
+  uint8_t diag[7];
+  int row;
+  (void)left;
+
+  diag[0] = AVG3(a0, a1, a2);
+  diag[1] = AVG3(a1, a2, a3);
+  diag[2] = AVG3(a2, a3, a4);
+  diag[3] = AVG3(a3, a4, a5);
+  diag[4] = AVG3(a4, a5, a6);
+  diag[5] = AVG3(a5, a6, a7);
+  diag[6] = a7;  // differs from vp8
+  for (row = 0; row < 4; ++row) memcpy(dst + row * stride, diag + row, 4);
+}
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  // Pixel (x, y) is diag[x + y]: the 3-tap filtered `above` row along each
+  // down-left diagonal; the bottom-right corner repeats above[7] as its
+  // last tap. `left` is unused.
+  const int a0 = above[0], a1 = above[1], a2 = above[2], a3 = above[3];
+  const int a4 = above[4], a5 = above[5], a6 = above[6], a7 = above[7];
+  uint8_t diag[7];
+  int row;
+  (void)left;
+
+  diag[0] = AVG3(a0, a1, a2);
+  diag[1] = AVG3(a1, a2, a3);
+  diag[2] = AVG3(a2, a3, a4);
+  diag[3] = AVG3(a3, a4, a5);
+  diag[4] = AVG3(a4, a5, a6);
+  diag[5] = AVG3(a5, a6, a7);
+  diag[6] = AVG3(a6, a7, a7);
+  for (row = 0; row < 4; ++row) memcpy(dst + row * stride, diag + row, 4);
+}
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  // Rows 0 and 1 are 2-tap and 3-tap filters along `above` (starting at the
+  // top-left pixel above[-1]); rows 2 and 3 repeat them shifted right by
+  // one, with a 3-tap filtered `left` sample entering in column 0.
+  const int l0 = left[0], l1 = left[1], l2 = left[2], tl = above[-1];
+  const int a0 = above[0], a1 = above[1], a2 = above[2], a3 = above[3];
+  uint8_t even[5], odd[5];
+  int row;
+  even[0] = AVG3(l1, l0, tl);
+  even[1] = AVG2(tl, a0);
+  even[2] = AVG2(a0, a1);
+  even[3] = AVG2(a1, a2);
+  even[4] = AVG2(a2, a3);
+  odd[0] = AVG3(l2, l1, l0);
+  odd[1] = AVG3(l0, tl, a0);
+  odd[2] = AVG3(tl, a0, a1);
+  odd[3] = AVG3(a0, a1, a2);
+  odd[4] = AVG3(a1, a2, a3);
+  for (row = 0; row < 4; ++row)
+    memcpy(dst + row * stride, ((row & 1) ? odd : even) + 1 - (row >> 1), 4);
+}
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  // Pixel (x, y) is diag[3 + x - y]: one 3-tap filtered border running from
+  // the bottom of `left`, around the top-left pixel above[-1], and then
+  // along `above`.
+  const int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
+  const int a0 = above[0], a1 = above[1], a2 = above[2], a3 = above[3];
+  const int tl = above[-1];
+  uint8_t diag[7];
+  int row;
+
+  diag[0] = AVG3(l1, l2, l3);
+  diag[1] = AVG3(l0, l1, l2);
+  diag[2] = AVG3(tl, l0, l1);
+  diag[3] = AVG3(a0, tl, l0);
+  diag[4] = AVG3(a1, a0, tl);
+  diag[5] = AVG3(a2, a1, a0);
+  diag[6] = AVG3(a3, a2, a1);
+  for (row = 0; row < 4; ++row) memcpy(dst + row * stride, diag + 3 - row, 4);
+}
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  // Row y is the window seq[6 - 2y .. 9 - 2y] of the sequence below, which
+  // interleaves 2-tap and 3-tap filters up the left edge and finishes with
+  // 3-tap filters along `above`.
+  const int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
+  const int tl = above[-1], a0 = above[0], a1 = above[1], a2 = above[2];
+  uint8_t seq[10];
+  int row;
+
+  seq[0] = AVG2(l3, l2);
+  seq[1] = AVG3(l3, l2, l1);
+  seq[2] = AVG2(l2, l1);
+  seq[3] = AVG3(l2, l1, l0);
+  seq[4] = AVG2(l1, l0);
+  seq[5] = AVG3(l1, l0, tl);
+  seq[6] = AVG2(l0, tl);
+  seq[7] = AVG3(l0, tl, a0);
+  seq[8] = AVG3(tl, a0, a1);
+  seq[9] = AVG3(a0, a1, a2);
+  for (row = 0; row < 4; ++row)
+    memcpy(dst + row * stride, seq + 6 - 2 * row, 4);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  // 16-bit-sample variant: fills a bs x bs block using only the left column.
+  // `above` and `bd` are unused.
+  const int last = bs - 1;
+  int row, col;
+  (void)above;
+  (void)bd;
+
+  // Column 0: rounded 2-tap means of vertically adjacent left samples.
+  for (row = 0; row < last; ++row)
+    dst[row * stride] = AVG2(left[row], left[row + 1]);
+  dst[last * stride] = left[last];
+  ++dst;
+
+  // Column 1: 3-tap filtered left samples, repeating left[last] at the end.
+  for (row = 0; row < last - 1; ++row)
+    dst[row * stride] = AVG3(left[row], left[row + 1], left[row + 2]);
+  dst[(last - 1) * stride] = AVG3(left[last - 1], left[last], left[last]);
+  dst[last * stride] = left[last];
+  ++dst;
+
+  // Remaining columns: the bottom row repeats left[last] ...
+  for (col = 0; col < last - 1; ++col)
+    dst[last * stride + col] = left[last];
+  // ... and every row above copies the row below, two columns to the left.
+  for (row = last - 1; row >= 0; --row)
+    for (col = 0; col < last - 1; ++col)
+      dst[row * stride + col] = dst[(row + 1) * stride + col - 2];
+}
+
+#if CONFIG_MISC_FIXES
+static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
+                                          int bs, const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  // Odd columns use the 3-tap filter, even columns the 2-tap mean, both
+  // taken from `left` starting at index (col / 2) + row.
+  int row, col;
+  (void)above;
+  (void)bd;
+  for (row = 0; row < bs; ++row, dst += stride) {
+    for (col = 0; col < bs; ++col) {
+      const uint16_t *const p = left + (col >> 1) + row;
+      if (col & 1) dst[col] = AVG3(p[0], p[1], p[2]);
+      else dst[col] = AVG2(p[0], p[1]);
+    }
+  }
+}
+#endif  // CONFIG_MISC_FIXES
+
+static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride,
+                                        int bs, const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  // Odd rows use the 3-tap filter, even rows the 2-tap mean, both taken
+  // from `above` starting at index (row / 2) + col.
+  int row, col;
+  (void)left;
+  (void)bd;
+  for (row = 0; row < bs; ++row, dst += stride) {
+    const uint16_t *const src = above + (row >> 1);
+    for (col = 0; col < bs; ++col)
+      dst[col] = (row & 1) ? AVG3(src[col], src[col + 1], src[col + 2])
+                           : AVG2(src[col], src[col + 1]);
+  }
+}
+
+#define highbd_d63e_predictor highbd_d63_predictor  // "e" name reuses d63 code
+
+static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  // 3-tap filter of above[row + col ..]; above[2 * bs - 1] once past the end.
+  int row, col;
+  (void)left;
+  (void)bd;
+  for (row = 0; row < bs; ++row, dst += stride) {
+    for (col = 0; col < bs; ++col) {
+      const uint16_t *const p = above + row + col;
+      dst[col] = (row + col + 2 < bs * 2) ? AVG3(p[0], p[1], p[2])
+                                          : above[bs * 2 - 1];
+    }
+  }
+}
+
+#if CONFIG_MISC_FIXES
+static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  // Pixel (row, col) is the 3-tap filter of above[d], above[d + 1] and
+  // above[d + 2] with d = row + col; the last tap is clamped to 2 * bs - 1.
+  int row, col;
+  (void)left;
+  (void)bd;
+  for (row = 0; row < bs; ++row, dst += stride)
+    for (col = 0; col < bs; ++col) {
+      const int d = row + col, far = d + 2 < bs * 2 ? d + 2 : d + 1;
+      dst[col] = AVG3(above[d], above[d + 1], above[far]);
+    }
+}
+#endif  // CONFIG_MISC_FIXES
+
+static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  // 16-bit-sample variant; `bd` is unused. Reads above[-1] as the top-left.
+  int row, col;
+  (void)bd;
+
+  // Row 0: 2-tap means along `above`, starting from the top-left above[-1].
+  for (col = 0; col < bs; ++col)
+    dst[col] = AVG2(above[col - 1], above[col]);
+  dst += stride;
+
+  // Row 1: 3-tap filter around the corner, then along `above`.
+  dst[0] = AVG3(left[0], above[-1], above[0]);
+  for (col = 1; col < bs; ++col)
+    dst[col] = AVG3(above[col - 2], above[col - 1], above[col]);
+  dst += stride;
+
+  // Column 0 from row 2 down: 3-tap filter running down `left`.
+  dst[0] = AVG3(above[-1], left[0], left[1]);
+  for (row = 1; row < bs - 2; ++row)
+    dst[row * stride] = AVG3(left[row - 1], left[row], left[row + 1]);
+
+  // Rest of rows 2 and below: copy from two rows up, one column to the left.
+  for (row = 2; row < bs; ++row, dst += stride) {
+    for (col = 1; col < bs; ++col)
+      dst[col] = dst[-2 * stride + col - 1];
+  }
+}
+
+static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  int row, col;
+  (void)bd;
+  // Row 0: 3-tap filter around the top-left corner, then along `above`.
+  dst[0] = AVG3(left[0], above[-1], above[0]);
+  for (col = 1; col < bs; ++col)
+    dst[col] = AVG3(above[col - 2], above[col - 1], above[col]);
+  // Column 0 below row 0: 3-tap filter around the corner, then down `left`.
+  dst[stride] = AVG3(above[-1], left[0], left[1]);
+  for (row = 2; row < bs; ++row)
+    dst[row * stride] = AVG3(left[row - 2], left[row - 1], left[row]);
+  // Every other sample repeats its up-left neighbour.
+  for (row = 1; row < bs; ++row) {
+    dst += stride;
+    for (col = 1; col < bs; ++col)
+      dst[col] = dst[col - 1 - stride];
+  }
+}
+
+static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  int row, col;
+  (void)bd;
+  // Column 0: 2-tap means down the left edge, starting at above[-1].
+  dst[0] = AVG2(above[-1], left[0]);
+  for (row = 1; row < bs; ++row)
+    dst[row * stride] = AVG2(left[row - 1], left[row]);
+  ++dst;
+  // Column 1: 3-tap filter around the corner, then down the left edge.
+  dst[0] = AVG3(left[0], above[-1], above[0]);
+  dst[stride] = AVG3(above[-1], left[0], left[1]);
+  for (row = 2; row < bs; ++row)
+    dst[row * stride] = AVG3(left[row - 2], left[row - 1], left[row]);
+  ++dst;
+  // Row 0, columns 2 and up: 3-tap filter along `above`.
+  for (col = 0; col < bs - 2; ++col)
+    dst[col] = AVG3(above[col - 1], above[col], above[col + 1]);
+  dst += stride;
+  // Remaining samples: copy from the row above, two columns to the left.
+  for (row = 1; row < bs; ++row, dst += stride) {
+    for (col = 0; col < bs - 2; ++col)
+      dst[col] = dst[-stride + col - 2];
+  }
+}
+
+static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  // Every row is a copy of the first bs samples of `above`.
+  const size_t row_bytes = bs * sizeof(uint16_t);
+  int row;
+  (void)left;
+  (void)bd;
+  for (row = 0; row < bs; ++row)
+    memcpy(dst + row * stride, above, row_bytes);
+}
+
+static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  // Row r is filled with bs copies of left[r] via vpx_memset16().
+  // `above` and `bd` are unused.
+  int row;
+  (void)above;
+  (void)bd;
+  for (row = 0; row < bs; ++row)
+    vpx_memset16(dst + row * stride, left[row], bs);
+}
+
+static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride,
+                                       int bs, const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  // Each sample is clip_pixel_highbd(left[row] + above[col] - above[-1], bd).
+  const int top_left = above[-1];
+  int row, col;
+  for (row = 0; row < bs; ++row, dst += stride) {
+    const int base = left[row] - top_left;
+    for (col = 0; col < bs; ++col) {
+      dst[col] = clip_pixel_highbd(base + above[col], bd);
+    }
+  }
+}
+
+static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
+                                           int bs, const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  // Fills the block with the constant 128 << (bd - 8); both edges are
+  // unused.
+  int row;
+  (void)above;
+  (void)left;
+
+  for (row = 0; row < bs; ++row)
+    vpx_memset16(dst + row * stride, 128 << (bd - 8), bs);
+}
+
+static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
+                                            int bs, const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  // Fills the block with the rounded mean of left[0 .. bs-1]; `above` and
+  // `bd` are unused.
+  int k, total = 0, dc;
+  (void)above;
+  (void)bd;
+
+  for (k = 0; k < bs; ++k) total += left[k];
+  dc = (total + (bs >> 1)) / bs;
+
+  for (k = 0; k < bs; ++k, dst += stride) {
+    vpx_memset16(dst, dc, bs);
+  }
+}
+
+static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
+                                           int bs, const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  // Fills the block with the rounded mean of above[0 .. bs-1]; `left` and
+  // `bd` are unused.
+  int k, total = 0, dc;
+  (void)left;
+  (void)bd;
+
+  for (k = 0; k < bs; ++k) total += above[k];
+  dc = (total + (bs >> 1)) / bs;
+
+  for (k = 0; k < bs; ++k, dst += stride) {
+    vpx_memset16(dst, dc, bs);
+  }
+}
+
+static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride,
+                                       int bs, const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  // Fills the block with the rounded mean of the bs samples above and the
+  // bs samples to the left. `bd` is unused.
+  const int samples = 2 * bs;
+  int k, total = 0, dc;
+  (void)bd;
+
+  for (k = 0; k < bs; ++k) {
+    total += above[k] + left[k];
+  }
+
+  dc = (total + (samples >> 1)) / samples;
+
+  for (k = 0; k < bs; ++k, dst += stride) {
+    vpx_memset16(dst, dc, bs);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Generates vpx_<type>_predictor_<size>x<size>_c(), a fixed-size wrapper
+// around <type>_predictor(), so that all predictors share one signature and
+// can be accessed as a pointer array; above/left are not always both used.
+#define intra_pred_sized(type, size) \
+  void vpx_##type##_predictor_##size##x##size##_c(uint8_t *dst, \
+                                                  ptrdiff_t stride, \
+                                                  const uint8_t *above, \
+                                                  const uint8_t *left) { \
+    type##_predictor(dst, stride, size, above, left); \
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH  // also emit uint16_t wrappers that take `bd`
+#define intra_pred_highbd_sized(type, size) \
+  void vpx_highbd_##type##_predictor_##size##x##size##_c( \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    highbd_##type##_predictor(dst, stride, size, above, left, bd); \
+  }
+// Emits the 4/8/16/32 wrappers for `type`, both 8-bit and high-bitdepth.
+#define intra_pred_allsizes(type) \
+  intra_pred_sized(type, 4) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32) \
+  intra_pred_highbd_sized(type, 4) \
+  intra_pred_highbd_sized(type, 8) \
+  intra_pred_highbd_sized(type, 16) \
+  intra_pred_highbd_sized(type, 32)
+// Same, minus the 8-bit 4x4 wrapper (the highbd 4x4 one is still emitted).
+#define intra_pred_no_4x4(type) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32) \
+  intra_pred_highbd_sized(type, 4) \
+  intra_pred_highbd_sized(type, 8) \
+  intra_pred_highbd_sized(type, 16) \
+  intra_pred_highbd_sized(type, 32)
+
+#else  // !CONFIG_VP9_HIGHBITDEPTH: 8-bit wrappers only
+#define intra_pred_allsizes(type) \
+  intra_pred_sized(type, 4) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32)
+// Same, minus the 4x4 wrapper.
+#define intra_pred_no_4x4(type) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+intra_pred_no_4x4(d207)  // 8-bit 4x4: hand-written vpx_d207_predictor_4x4_c
+intra_pred_no_4x4(d63)   // 8-bit 4x4: hand-written vpx_d63_predictor_4x4_c
+intra_pred_no_4x4(d45)   // 8-bit 4x4: hand-written vpx_d45_predictor_4x4_c
+#if CONFIG_MISC_FIXES
+intra_pred_allsizes(d207e)
+intra_pred_allsizes(d63e)
+intra_pred_no_4x4(d45e)  // 8-bit 4x4: hand-written vpx_d45e_predictor_4x4_c
+#endif
+intra_pred_no_4x4(d117)  // 8-bit 4x4: hand-written vpx_d117_predictor_4x4_c
+intra_pred_no_4x4(d135)  // 8-bit 4x4: hand-written vpx_d135_predictor_4x4_c
+intra_pred_no_4x4(d153)  // 8-bit 4x4: hand-written vpx_d153_predictor_4x4_c
+intra_pred_allsizes(v)
+intra_pred_allsizes(h)
+intra_pred_allsizes(tm)
+intra_pred_allsizes(dc_128)
+intra_pred_allsizes(dc_left)
+intra_pred_allsizes(dc_top)
+intra_pred_allsizes(dc)
+#undef intra_pred_allsizes  // intra_pred_no_4x4 is left defined here
diff --git a/libs/libvpx/vpx_dsp/inv_txfm.c b/libs/libvpx/vpx_dsp/inv_txfm.c
new file mode 100644
index 0000000000..a0f59bf757
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/inv_txfm.c
@@ -0,0 +1,2513 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "vpx_dsp/inv_txfm.h"
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  // Reversible, orthonormal 4-point inverse Walsh-Hadamard (3.5 adds and 0.5
+  // shifts per pixel): a row pass, then a column pass added into dest.
+  tran_low_t rows[16];
+  tran_high_t a, b, c, d, mid;
+  int r, col;
+
+  for (r = 0; r < 4; ++r) {
+    const tran_low_t *const src = input + 4 * r;
+    tran_low_t *const row_out = rows + 4 * r;
+    // Note the a, c, d, b load order; each value is shifted down first.
+    a = src[0] >> UNIT_QUANT_SHIFT;
+    c = src[1] >> UNIT_QUANT_SHIFT;
+    d = src[2] >> UNIT_QUANT_SHIFT;
+    b = src[3] >> UNIT_QUANT_SHIFT;
+
+    a += c;
+    d -= b;
+    mid = (a - d) >> 1;
+    b = mid - b;
+    c = mid - c;
+    a -= b;
+    d += c;
+
+    row_out[0] = WRAPLOW(a, 8);
+    row_out[1] = WRAPLOW(b, 8);
+    row_out[2] = WRAPLOW(c, 8);
+    row_out[3] = WRAPLOW(d, 8);
+  }
+
+  for (col = 0; col < 4; ++col) {
+    uint8_t *const px = dest + col;
+    a = rows[col];
+    c = rows[4 + col];
+    d = rows[8 + col];
+    b = rows[12 + col];
+
+    a += c;
+    d -= b;
+    mid = (a - d) >> 1;
+    b = mid - b;
+    c = mid - c;
+    a -= b;
+    d += c;
+
+    px[0] = clip_pixel_add(px[0], a);
+    px[stride] = clip_pixel_add(px[stride], b);
+    px[stride * 2] = clip_pixel_add(px[stride * 2], c);
+    px[stride * 3] = clip_pixel_add(px[stride * 3], d);
+  }
+}
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
+  // 4x4 inverse WHT shortcut that reads only in[0] and adds into dest.
+  tran_low_t first_pass[4];
+  tran_high_t top, rest;
+  int col;
+  // First pass: split the shifted in[0] into a leading term and a repeated one.
+  top = in[0] >> UNIT_QUANT_SHIFT;
+  rest = top >> 1;
+  top -= rest;
+  first_pass[0] = WRAPLOW(top, 8);
+  first_pass[3] = WRAPLOW(rest, 8);
+  first_pass[2] = first_pass[3];
+  first_pass[1] = first_pass[2];
+
+  // Second pass: each column adds `top` to row 0 and `rest` to rows 1..3.
+  for (col = 0; col < 4; ++col) {
+    uint8_t *const px = dest + col;
+    rest = first_pass[col] >> 1;
+    top = first_pass[col] - rest;
+    px[0] = clip_pixel_add(px[0], top);
+    px[dest_stride] = clip_pixel_add(px[dest_stride], rest);
+    px[dest_stride * 2] = clip_pixel_add(px[dest_stride * 2], rest);
+    px[dest_stride * 3] = clip_pixel_add(px[dest_stride * 3], rest);
+  }
+}
+
+void idct4_c(const tran_low_t *input, tran_low_t *output) {
+  // 4-point 1-D inverse DCT; all inputs are read before any output is written.
+  tran_high_t prod_a, prod_b;
+  tran_low_t even_sum, even_diff, odd_lo, odd_hi;
+  // Stage 1: inputs 0/2 scaled by cospi_16_64, inputs 1/3 rotated.
+  prod_a = (input[0] + input[2]) * cospi_16_64;
+  prod_b = (input[0] - input[2]) * cospi_16_64;
+  even_sum = WRAPLOW(dct_const_round_shift(prod_a), 8);
+  even_diff = WRAPLOW(dct_const_round_shift(prod_b), 8);
+  prod_a = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  prod_b = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  odd_lo = WRAPLOW(dct_const_round_shift(prod_a), 8);
+  odd_hi = WRAPLOW(dct_const_round_shift(prod_b), 8);
+  // Stage 2: butterfly pairing outputs 0/3 and 1/2.
+  output[0] = WRAPLOW(even_sum + odd_hi, 8);
+  output[1] = WRAPLOW(even_diff + odd_lo, 8);
+  output[2] = WRAPLOW(even_diff - odd_lo, 8);
+  output[3] = WRAPLOW(even_sum - odd_hi, 8);
+}
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  // Full 4x4 inverse DCT: idct4_c over each row, then over each column; the
+  // column results are rounded by 4 bits and added into dest.
+  tran_low_t row_pass[4 * 4];
+  tran_low_t col_in[4], col_out[4];
+  int r, c;
+
+  // Row pass.
+  for (r = 0; r < 4; ++r)
+    idct4_c(input + 4 * r, row_pass + 4 * r);
+
+  for (c = 0; c < 4; ++c) {
+    // Gather column c so the 1-D transform sees contiguous data.
+    for (r = 0; r < 4; ++r)
+      col_in[r] = row_pass[r * 4 + c];
+    idct4_c(col_in, col_out);
+
+    // Add the rounded result down column c of dest.
+    for (r = 0; r < 4; ++r) {
+      uint8_t *const px = &dest[r * stride + c];
+      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 4));
+    }
+  }
+}
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
+                         int dest_stride) {
+  // Uses only input[0]: scale by cospi_16_64 twice, round by 4 bits, then add
+  // that single value to every pixel of the 4x4 block at dest.
+  tran_low_t dc;
+  tran_high_t delta;
+  int r, c;
+  dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64), 8);
+  delta = ROUND_POWER_OF_TWO(dc, 4);
+  for (r = 0; r < 4; ++r) {
+    uint8_t *const row = dest + r * dest_stride;
+    for (c = 0; c < 4; ++c)
+      row[c] = clip_pixel_add(row[c], delta);
+  }
+}
+
+void idct8_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[8], step2[8];  // stage results alternate between these
+  tran_high_t temp1, temp2;
+  // 8-point 1-D inverse DCT. Stage 1: permute even inputs, rotate odd pairs.
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 2: multiply out the even half (0..3); butterflies on 4..7
+  temp1 = (step1[0] + step1[2]) * cospi_16_64;
+  temp2 = (step1[0] - step1[2]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  // stage 3: butterflies on 0..3; (5,6) scaled by cospi_16_64; 4, 7 copied
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  // stage 4: output[k], output[7-k] = step1[k] +/- step1[7-k]
+  output[0] = WRAPLOW(step1[0] + step1[7], 8);
+  output[1] = WRAPLOW(step1[1] + step1[6], 8);
+  output[2] = WRAPLOW(step1[2] + step1[5], 8);
+  output[3] = WRAPLOW(step1[3] + step1[4], 8);
+  output[4] = WRAPLOW(step1[3] - step1[4], 8);
+  output[5] = WRAPLOW(step1[2] - step1[5], 8);
+  output[6] = WRAPLOW(step1[1] - step1[6], 8);
+  output[7] = WRAPLOW(step1[0] - step1[7], 8);
+}
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  // Full 8x8 inverse DCT: idct8_c over each row, then over each column; the
+  // column results are rounded by 5 bits and added into dest.
+  tran_low_t row_pass[8 * 8];
+  tran_low_t col_in[8], col_out[8];
+  int r, c;
+
+  // Row pass.
+  for (r = 0; r < 8; ++r)
+    idct8_c(input + 8 * r, row_pass + 8 * r);
+
+  for (c = 0; c < 8; ++c) {
+    // Gather column c so the 1-D transform sees contiguous data.
+    for (r = 0; r < 8; ++r)
+      col_in[r] = row_pass[r * 8 + c];
+    idct8_c(col_in, col_out);
+
+    // Add the rounded result down column c of dest.
+    for (r = 0; r < 8; ++r) {
+      uint8_t *const px = &dest[r * stride + c];
+      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 5));
+    }
+  }
+}
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  // Uses only input[0]: scale by cospi_16_64 twice, round by 5 bits, then add
+  // that single value to every pixel of the 8x8 block at dest.
+  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  tran_high_t delta;
+  int r, c;
+  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64), 8);
+  delta = ROUND_POWER_OF_TWO(dc, 5);
+  for (r = 0; r < 8; ++r)
+    for (c = 0; c < 8; ++c)
+      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c], delta);
+}
+
+void iadst4_c(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+  // 4-point 1-D inverse ADST built from the sinpi_k_9 constants.
+  tran_low_t x0 = input[0];
+  tran_low_t x1 = input[1];
+  tran_low_t x2 = input[2];
+  tran_low_t x3 = input[3];
+  // All-zero input short-circuits to an all-zero output.
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = x0 - x2 + x3;
+  // Combine products; s3 takes over s2's old value before s2 is reused.
+  s0 = s0 + s3 + s5;
+  s1 = s1 - s4 - s6;
+  s3 = s2;
+  s2 = sinpi_3_9 * s7;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
+  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
+  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
+  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
+  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
+}
+
+void iadst8_c(const tran_low_t *input, tran_low_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;  // NOTE(review): int here, tran_high_t in iadst4_c/iadst16_c
+  // 8-point 1-D inverse ADST; inputs are loaded in permuted order.
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
+  // All-zero input short-circuits to an all-zero output.
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = 0;
+    return;
+  }
+
+  // stage 1: rotate input pairs (products cast to int), then rounded sums/diffs
+  s0 = (int)(cospi_2_64  * x0 + cospi_30_64 * x1);
+  s1 = (int)(cospi_30_64 * x0 - cospi_2_64  * x1);
+  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
+  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
+  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
+  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
+  s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
+  s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);
+
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
+
+  // stage 2: x0..x3 are copied; x4..x7 are rotated by cospi_8_64/cospi_24_64
+  s0 = (int)x0;
+  s1 = (int)x1;
+  s2 = (int)x2;
+  s3 = (int)x3;
+  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
+  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
+  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
+  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
+
+  x0 = WRAPLOW(s0 + s2, 8);
+  x1 = WRAPLOW(s1 + s3, 8);
+  x2 = WRAPLOW(s0 - s2, 8);
+  x3 = WRAPLOW(s1 - s3, 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+
+  // stage 3: scale the sums/differences of (x2,x3) and (x6,x7) by cospi_16_64
+  s2 = (int)(cospi_16_64 * (x2 + x3));
+  s3 = (int)(cospi_16_64 * (x2 - x3));
+  s6 = (int)(cospi_16_64 * (x6 + x7));
+  s7 = (int)(cospi_16_64 * (x6 - x7));
+
+  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+  // Outputs are written in permuted order; odd-indexed outputs are negated.
+  output[0] = WRAPLOW(x0, 8);
+  output[1] = WRAPLOW(-x4, 8);
+  output[2] = WRAPLOW(x6, 8);
+  output[3] = WRAPLOW(-x2, 8);
+  output[4] = WRAPLOW(x3, 8);
+  output[5] = WRAPLOW(-x7, 8);
+  output[6] = WRAPLOW(x5, 8);
+  output[7] = WRAPLOW(-x1, 8);
+}
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  // 8x8 inverse DCT for input whose non-zero coefficients all lie in the first
+  // 4 rows: only those rows get a row pass, the other rows stay zero.
+  tran_low_t row_pass[8 * 8] = { 0 };
+  tran_low_t col_in[8], col_out[8];
+  int r, c;
+
+  // Row pass over rows 0..3 only.
+  for (r = 0; r < 4; ++r)
+    idct8_c(input + 8 * r, row_pass + 8 * r);
+
+  // Column pass over all 8 columns.
+  for (c = 0; c < 8; ++c) {
+    // Gather column c so the 1-D transform sees contiguous data.
+    for (r = 0; r < 8; ++r)
+      col_in[r] = row_pass[r * 8 + c];
+    idct8_c(col_in, col_out);
+
+    // Add the rounded result down column c of dest.
+    for (r = 0; r < 8; ++r) {
+      uint8_t *const px = &dest[r * stride + c];
+      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 5));
+    }
+  }
+}
+
+void idct16_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
+  // 16-point 1-D inverse DCT in seven stages, alternating step1/step2.
+  // stage 1: permuted load (n/2 = 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15)
+  step1[0] = input[0/2];
+  step1[1] = input[16/2];
+  step1[2] = input[8/2];
+  step1[3] = input[24/2];
+  step1[4] = input[4/2];
+  step1[5] = input[20/2];
+  step1[6] = input[12/2];
+  step1[7] = input[28/2];
+  step1[8] = input[2/2];
+  step1[9] = input[18/2];
+  step1[10] = input[10/2];
+  step1[11] = input[26/2];
+  step1[12] = input[6/2];
+  step1[13] = input[22/2];
+  step1[14] = input[14/2];
+  step1[15] = input[30/2];
+
+  // stage 2: 0..7 are copied; rotate pairs (8,15) (9,14) (10,13) (11,12)
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 3: 0..3 are copied; rotate pairs (4,7) (5,6); butterflies on 8..15
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+
+  // stage 4: multiply out 0..3; butterflies on 4..7; rotate (9,14) and (10,13)
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5: butterflies on 0..3 and 8..15; (5,6) scaled by cospi_16_64
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+
+  // stage 6: butterflies on 0..7; (10,13) and (11,12) scaled by cospi_16_64
+  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7: output[k], output[15-k] = step2[k] +/- step2[15-k]
+  output[0] = WRAPLOW(step2[0] + step2[15], 8);
+  output[1] = WRAPLOW(step2[1] + step2[14], 8);
+  output[2] = WRAPLOW(step2[2] + step2[13], 8);
+  output[3] = WRAPLOW(step2[3] + step2[12], 8);
+  output[4] = WRAPLOW(step2[4] + step2[11], 8);
+  output[5] = WRAPLOW(step2[5] + step2[10], 8);
+  output[6] = WRAPLOW(step2[6] + step2[9], 8);
+  output[7] = WRAPLOW(step2[7] + step2[8], 8);
+  output[8] = WRAPLOW(step2[7] - step2[8], 8);
+  output[9] = WRAPLOW(step2[6] - step2[9], 8);
+  output[10] = WRAPLOW(step2[5] - step2[10], 8);
+  output[11] = WRAPLOW(step2[4] - step2[11], 8);
+  output[12] = WRAPLOW(step2[3] - step2[12], 8);
+  output[13] = WRAPLOW(step2[2] - step2[13], 8);
+  output[14] = WRAPLOW(step2[1] - step2[14], 8);
+  output[15] = WRAPLOW(step2[0] - step2[15], 8);
+}
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  // Full 16x16 inverse DCT: idct16_c over each row, then over each column; the
+  // column results are rounded by 6 bits and added into dest.
+  tran_low_t row_pass[16 * 16];
+  tran_low_t col_in[16], col_out[16];
+  int r, c;
+
+  // Row pass.
+  for (r = 0; r < 16; ++r)
+    idct16_c(input + 16 * r, row_pass + 16 * r);
+
+  for (c = 0; c < 16; ++c) {
+    // Gather column c so the 1-D transform sees contiguous data.
+    for (r = 0; r < 16; ++r)
+      col_in[r] = row_pass[r * 16 + c];
+    idct16_c(col_in, col_out);
+
+    // Add the rounded result down column c of dest.
+    for (r = 0; r < 16; ++r) {
+      uint8_t *const px = &dest[r * stride + c];
+      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6));
+    }
+  }
+}
+
+void iadst16_c(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+  // 16-point 1-D inverse ADST; inputs are loaded in permuted order.
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
+  // All-zero input short-circuits to an all-zero output.
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = output[8]
+              = output[9] = output[10] = output[11] = output[12]
+              = output[13] = output[14] = output[15] = 0;
+    return;
+  }
+
+  // stage 1: rotate input pairs, then take rounded sums/differences
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
+  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
+  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
+
+  // stage 2: s0..s7 are copies of x0..x7; s8..s15 are rotations of x8..x15
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = WRAPLOW(s0 + s4, 8);
+  x1 = WRAPLOW(s1 + s5, 8);
+  x2 = WRAPLOW(s2 + s6, 8);
+  x3 = WRAPLOW(s3 + s7, 8);
+  x4 = WRAPLOW(s0 - s4, 8);
+  x5 = WRAPLOW(s1 - s5, 8);
+  x6 = WRAPLOW(s2 - s6, 8);
+  x7 = WRAPLOW(s3 - s7, 8);
+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
+
+  // stage 3: copied terms go through check_range(); rotated terms are rounded
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = WRAPLOW(check_range(s0 + s2), 8);
+  x1 = WRAPLOW(check_range(s1 + s3), 8);
+  x2 = WRAPLOW(check_range(s0 - s2), 8);
+  x3 = WRAPLOW(check_range(s1 - s3), 8);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+  x8 = WRAPLOW(check_range(s8 + s10), 8);
+  x9 = WRAPLOW(check_range(s9 + s11), 8);
+  x10 = WRAPLOW(check_range(s8 - s10), 8);
+  x11 = WRAPLOW(check_range(s9 - s11), 8);
+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
+
+  // stage 4: scale pairs by cospi_16_64 (s2 and s14 use the negated constant)
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
+  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
+  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
+  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
+  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
+  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
+  x15 = WRAPLOW(dct_const_round_shift(s15), 8);
+  // Permuted output order; outputs 1, 3, 13 and 15 are negated.
+  output[0] = WRAPLOW(x0, 8);
+  output[1] = WRAPLOW(-x8, 8);
+  output[2] = WRAPLOW(x12, 8);
+  output[3] = WRAPLOW(-x4, 8);
+  output[4] = WRAPLOW(x6, 8);
+  output[5] = WRAPLOW(x14, 8);
+  output[6] = WRAPLOW(x10, 8);
+  output[7] = WRAPLOW(x2, 8);
+  output[8] = WRAPLOW(x3, 8);
+  output[9] = WRAPLOW(x11, 8);
+  output[10] = WRAPLOW(x15, 8);
+  output[11] = WRAPLOW(x7, 8);
+  output[12] = WRAPLOW(x5, 8);
+  output[13] = WRAPLOW(-x13, 8);
+  output[14] = WRAPLOW(x9, 8);
+  output[15] = WRAPLOW(-x1, 8);
+}
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  // 16x16 inverse DCT for input whose non-zero coefficients all lie in the
+  // upper-left 4x4 area: only the first 4 rows get a row pass, the rest stay 0.
+  tran_low_t row_pass[16 * 16] = { 0 };
+  tran_low_t col_in[16], col_out[16];
+  int r, c;
+
+  // Row pass over rows 0..3 only.
+  for (r = 0; r < 4; ++r)
+    idct16_c(input + 16 * r, row_pass + 16 * r);
+
+  // Column pass over all 16 columns.
+  for (c = 0; c < 16; ++c) {
+    // Gather column c so the 1-D transform sees contiguous data.
+    for (r = 0; r < 16; ++r)
+      col_in[r] = row_pass[r * 16 + c];
+    idct16_c(col_in, col_out);
+
+    // Add the rounded result down column c of dest.
+    for (r = 0; r < 16; ++r) {
+      uint8_t *const px = &dest[r * stride + c];
+      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6));
+    }
+  }
+}
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  // Uses only input[0]: scale by cospi_16_64 twice, round by 6 bits, then add
+  // that single value to every pixel of the 16x16 block at dest.
+  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  tran_high_t delta;
+  int r, c;
+  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64), 8);
+  delta = ROUND_POWER_OF_TWO(dc, 6);
+  for (r = 0; r < 16; ++r)
+    for (c = 0; c < 16; ++c)
+      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c], delta);
+}
+
+void idct32_c(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[32], step2[32];
+  tran_high_t temp1, temp2;
+
+  // stage 1
+  step1[0] = input[0];
+  step1[1] = input[16];
+  step1[2] = input[8];
+  step1[3] = input[24];
+  step1[4] = input[4];
+  step1[5] = input[20];
+  step1[6] = input[12];
+  step1[7] = input[28];
+  step1[8] = input[2];
+  step1[9] = input[18];
+  step1[10] = input[10];
+  step1[11] = input[26];
+  step1[12] = input[6];
+  step1[13] = input[22];
+  step1[14] = input[14];
+  step1[15] = input[30];
+
+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
+  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
+  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
+  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
+  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
+  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
+  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
+  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
+  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
+  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
+  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
+  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
+  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
+  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
+  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
+  step2[31] = WRAPLOW(step1[30] + step1[31], 8);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
+  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
+  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
+  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
+  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
+  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
+  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
+  step2[23] = WRAPLOW(step1[20] + step1[23], 8);
+
+  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
+  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
+  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
+  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
+  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
+  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
+  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
+  step2[31] = WRAPLOW(step1[28] + step1[31], 8);
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
+  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
+  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
+  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
+  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
+  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
+  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
+  step2[23] = WRAPLOW(step1[16] - step1[23], 8);
+
+  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
+  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
+  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
+  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
+  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
+  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
+  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
+  step2[31] = WRAPLOW(step1[24] + step1[31], 8);
+
+  // stage 7
+  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
+  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
+  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
+  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
+  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
+  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
+  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
+  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
+  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
+  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
+  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
+  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
+  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
+  step1[15] = WRAPLOW(step2[0] - step2[15], 8);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  step1[18] = step2[18];
+  step1[19] = step2[19];
+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[28] = step2[28];
+  step1[29] = step2[29];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // final stage
+  output[0] = WRAPLOW(step1[0] + step1[31], 8);
+  output[1] = WRAPLOW(step1[1] + step1[30], 8);
+  output[2] = WRAPLOW(step1[2] + step1[29], 8);
+  output[3] = WRAPLOW(step1[3] + step1[28], 8);
+  output[4] = WRAPLOW(step1[4] + step1[27], 8);
+  output[5] = WRAPLOW(step1[5] + step1[26], 8);
+  output[6] = WRAPLOW(step1[6] + step1[25], 8);
+  output[7] = WRAPLOW(step1[7] + step1[24], 8);
+  output[8] = WRAPLOW(step1[8] + step1[23], 8);
+  output[9] = WRAPLOW(step1[9] + step1[22], 8);
+  output[10] = WRAPLOW(step1[10] + step1[21], 8);
+  output[11] = WRAPLOW(step1[11] + step1[20], 8);
+  output[12] = WRAPLOW(step1[12] + step1[19], 8);
+  output[13] = WRAPLOW(step1[13] + step1[18], 8);
+  output[14] = WRAPLOW(step1[14] + step1[17], 8);
+  output[15] = WRAPLOW(step1[15] + step1[16], 8);
+  output[16] = WRAPLOW(step1[15] - step1[16], 8);
+  output[17] = WRAPLOW(step1[14] - step1[17], 8);
+  output[18] = WRAPLOW(step1[13] - step1[18], 8);
+  output[19] = WRAPLOW(step1[12] - step1[19], 8);
+  output[20] = WRAPLOW(step1[11] - step1[20], 8);
+  output[21] = WRAPLOW(step1[10] - step1[21], 8);
+  output[22] = WRAPLOW(step1[9] - step1[22], 8);
+  output[23] = WRAPLOW(step1[8] - step1[23], 8);
+  output[24] = WRAPLOW(step1[7] - step1[24], 8);
+  output[25] = WRAPLOW(step1[6] - step1[25], 8);
+  output[26] = WRAPLOW(step1[5] - step1[26], 8);
+  output[27] = WRAPLOW(step1[4] - step1[27], 8);
+  output[28] = WRAPLOW(step1[3] - step1[28], 8);
+  output[29] = WRAPLOW(step1[2] - step1[29], 8);
+  output[30] = WRAPLOW(step1[1] - step1[30], 8);
+  output[31] = WRAPLOW(step1[0] - step1[31], 8);
+}
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+                              int stride) {
+  tran_low_t out[32 * 32];  // row-pass results, 32 values per row
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+
+  // Rows: transform each input row, or zero-fill out[] when the row is empty.
+  for (i = 0; i < 32; ++i) {
+    tran_low_t zero_coeff[16];  // coefficient type: no bits lost by narrowing
+    for (j = 0; j < 16; ++j)  // pairwise OR-reduce the row's 32 coefficients
+      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+    for (j = 0; j < 8; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 4; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 2; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+    if (zero_coeff[0] | zero_coeff[1])  // any bit set => row is not all zero
+      idct32_c(input, outptr);
+    else
+      memset(outptr, 0, sizeof(tran_low_t) * 32);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns: gather column i of out[], transform it, add the result to dest.
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_c(temp_in, temp_out);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  // Zero-initialised: the first pass leaves rows 16..31 untouched, and the
+  // column pass below reads them back as zeros.
+  tran_low_t rows[32 * 32] = {0};
+  tran_low_t col_in[32];
+  tran_low_t col_out[32];
+  int r, c;
+
+  // Pass 1 (rows): only the upper-left 16x16 has non-zero coefficients, so
+  // only the first 16 rows are run through the 1-D transform.
+  for (r = 0; r < 16; ++r)
+    idct32_c(input + 32 * r, rows + 32 * r);
+
+  // Pass 2 (columns): gather one column, transform it, then add
+  // ROUND_POWER_OF_TWO(., 6) of each result into dest via clip_pixel_add.
+  for (c = 0; c < 32; ++c) {
+    for (r = 0; r < 32; ++r)
+      col_in[r] = rows[32 * r + c];
+    idct32_c(col_in, col_out);
+    for (r = 0; r < 32; ++r) {
+      uint8_t *const px = &dest[r * stride + c];
+      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6));
+    }
+  }
+}
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  // Zero-initialised: the first pass leaves rows 8..31 untouched, and the
+  // column pass below reads them back as zeros.
+  tran_low_t rows[32 * 32] = {0};
+  tran_low_t col_in[32];
+  tran_low_t col_out[32];
+  int r, c;
+
+  // Pass 1 (rows): only the upper-left 8x8 has non-zero coefficients, so
+  // only the first 8 rows are run through the 1-D transform.
+  for (r = 0; r < 8; ++r)
+    idct32_c(input + 32 * r, rows + 32 * r);
+
+  // Pass 2 (columns): gather one column, transform it, then add
+  // ROUND_POWER_OF_TWO(., 6) of each result into dest via clip_pixel_add.
+  for (c = 0; c < 32; ++c) {
+    for (r = 0; r < 32; ++r)
+      col_in[r] = rows[32 * r + c];
+    idct32_c(col_in, col_out);
+    for (r = 0; r < 32; ++r) {
+      uint8_t *const px = &dest[r * stride + c];
+      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6));
+    }
+  }
+}
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  // DC-only path: only input[0] is read; it is scaled by cospi_16_64 twice.
+  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
+  tran_high_t delta;
+  int row, col;
+
+  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64), 8);
+  delta = ROUND_POWER_OF_TWO(dc, 6);
+
+  // The same value is added to every pixel of the 32x32 destination block.
+  for (row = 0; row < 32; ++row, dest += stride)
+    for (col = 0; col < 32; ++col)
+      dest[col] = clip_pixel_add(dest[col], delta);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+     0.5 shifts per pixel. */
+  int i;
+  tran_low_t output[16];  // row-pass results, 4 values per row
+  tran_high_t a1, b1, c1, d1, e1;
+  const tran_low_t *ip = input;
+  tran_low_t *op = output;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // pixels accessed as uint16_t
+
+  for (i = 0; i < 4; i++) {  // pass 1: one input row per iteration
+    a1 = ip[0] >> UNIT_QUANT_SHIFT;  // note the a, c, d, b load order
+    c1 = ip[1] >> UNIT_QUANT_SHIFT;
+    d1 = ip[2] >> UNIT_QUANT_SHIFT;
+    b1 = ip[3] >> UNIT_QUANT_SHIFT;
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;  // shared term used by both b1 and c1 below
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    op[0] = WRAPLOW(a1, bd);
+    op[1] = WRAPLOW(b1, bd);
+    op[2] = WRAPLOW(c1, bd);
+    op[3] = WRAPLOW(d1, bd);
+    ip += 4;
+    op += 4;
+  }
+
+  ip = output;  // pass 2 reads the row-pass results
+  for (i = 0; i < 4; i++) {  // pass 2: one column per iteration
+    a1 = ip[4 * 0];  // column elements are 4 apart in output[]
+    c1 = ip[4 * 1];
+    d1 = ip[4 * 2];
+    b1 = ip[4 * 3];
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
+    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
+    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
+    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
+
+    ip++;  // next column of output[]
+    dest++;  // next destination column
+  }
+}
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+                                int dest_stride, int bd) {
+  int i;
+  tran_high_t a1, e1;
+  tran_low_t tmp[4];  // row-pass result derived from in[0] alone
+  const tran_low_t *ip = in;
+  tran_low_t *op = tmp;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // pixels accessed as uint16_t
+  (void) bd;  // NOTE(review): bd is also passed to the calls below
+
+  a1 = ip[0] >> UNIT_QUANT_SHIFT;  // in[0] is the only coefficient read
+  e1 = a1 >> 1;
+  a1 -= e1;  // a1 and e1 now sum to the shifted input
+  op[0] = WRAPLOW(a1, bd);
+  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
+
+  ip = tmp;  // second pass reads the row-pass result
+  for (i = 0; i < 4; i++) {  // one destination column per iteration
+    e1 = ip[0] >> 1;
+    a1 = ip[0] - e1;
+    dest[dest_stride * 0] = highbd_clip_pixel_add(
+        dest[dest_stride * 0], a1, bd);
+    dest[dest_stride * 1] = highbd_clip_pixel_add(
+        dest[dest_stride * 1], e1, bd);
+    dest[dest_stride * 2] = highbd_clip_pixel_add(
+        dest[dest_stride * 2], e1, bd);
+    dest[dest_stride * 3] = highbd_clip_pixel_add(
+        dest[dest_stride * 3], e1, bd);
+    ip++;
+    dest++;
+  }
+}
+
+void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step[4];  // stage-1 results of this 4-point 1-D inverse DCT
+  tran_high_t temp1, temp2;
+  (void) bd;  // NOTE(review): bd is also passed to the calls below
+  // stage 1 (reads every input[] before output[] is written; aliasing is safe)
+  temp1 = (input[0] + input[2]) * cospi_16_64;
+  temp2 = (input[0] - input[2]) * cospi_16_64;
+  step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 2: combine step[0..1] (from inputs 0,2) with step[2..3] (from 1,3)
+  output[0] = WRAPLOW(step[0] + step[3], bd);
+  output[1] = WRAPLOW(step[1] + step[2], bd);
+  output[2] = WRAPLOW(step[1] - step[2], bd);
+  output[3] = WRAPLOW(step[0] - step[3], bd);
+}
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  // dest8 is accessed as uint16_t pixels through CONVERT_TO_SHORTPTR.
+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t rows[4 * 4];
+  tran_low_t col_in[4];
+  tran_low_t col_out[4];
+  int r, c;
+
+  // Pass 1 (rows): 1-D transform of each of the 4 input rows into rows[].
+  for (r = 0; r < 4; ++r)
+    vpx_highbd_idct4_c(input + 4 * r, rows + 4 * r, bd);
+
+  // Pass 2 (columns): gather one column of rows[], transform it, then add
+  // ROUND_POWER_OF_TWO(., 4) of each result into the destination via
+  // highbd_clip_pixel_add.
+  for (c = 0; c < 4; ++c) {
+    for (r = 0; r < 4; ++r)
+      col_in[r] = rows[4 * r + c];
+    vpx_highbd_idct4_c(col_in, col_out, bd);
+    for (r = 0; r < 4; ++r) {
+      uint16_t *const px = &pixels[r * stride + c];
+      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 4), bd);
+    }
+  }
+}
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int dest_stride, int bd) {
+  // DC-only path: only input[0] is read. It is scaled by cospi_16_64
+  // twice, passed through ROUND_POWER_OF_TWO(., 4), and the result is
+  // added to all 16 pixels via highbd_clip_pixel_add.
+  uint16_t *line = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t dc = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_high_t delta;
+  int row, col;
+
+  dc = WRAPLOW(highbd_dct_const_round_shift(dc * cospi_16_64, bd), bd);
+  delta = ROUND_POWER_OF_TWO(dc, 4);
+
+  for (row = 0; row < 4; ++row, line += dest_stride) {
+    for (col = 0; col < 4; ++col)
+      line[col] = highbd_clip_pixel_add(line[col], delta, bd);
+  }
+}
+
+void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[8], step2[8];  // only step2[4..7] is ever written or read
+  tran_high_t temp1, temp2;
+  // stage 1: step1[0..3] = input[0,2,4,6]; odd inputs produce step1[4..7]
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 2 & stage 3 - even half
+  vpx_highbd_idct4_c(step1, step1, bd);  // in place on step1[0..3]
+
+  // stage 2 - odd half
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  // stage 3 - odd half
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[7] = step2[7];
+
+  // stage 4: output[k] and output[7 - k] are step1[k] +/- step1[7 - k]
+  output[0] = WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = WRAPLOW(step1[1] + step1[6], bd);
+  output[2] = WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = WRAPLOW(step1[3] + step1[4], bd);
+  output[4] = WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = WRAPLOW(step1[2] - step1[5], bd);
+  output[6] = WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = WRAPLOW(step1[0] - step1[7], bd);
+}
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  // dest8 is accessed as uint16_t pixels through CONVERT_TO_SHORTPTR.
+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t rows[8 * 8];
+  tran_low_t col_in[8];
+  tran_low_t col_out[8];
+  int r, c;
+
+  // Pass 1 (rows): 1-D transform of each of the 8 input rows into rows[].
+  for (r = 0; r < 8; ++r)
+    vpx_highbd_idct8_c(input + 8 * r, rows + 8 * r, bd);
+
+  // Pass 2 (columns): gather one column of rows[], transform it, then add
+  // ROUND_POWER_OF_TWO(., 5) of each result into the destination via
+  // highbd_clip_pixel_add.
+  for (c = 0; c < 8; ++c) {
+    for (r = 0; r < 8; ++r)
+      col_in[r] = rows[8 * r + c];
+    vpx_highbd_idct8_c(col_in, col_out, bd);
+    for (r = 0; r < 8; ++r) {
+      uint16_t *const px = &pixels[r * stride + c];
+      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 5), bd);
+    }
+  }
+}
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int bd) {
+  // DC-only path: input[0] is scaled by cospi_16_64 twice, then rounded (5).
+  uint16_t *line = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t dc = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_high_t delta;
+  int row, col;
+
+  dc = WRAPLOW(highbd_dct_const_round_shift(dc * cospi_16_64, bd), bd);
+  delta = ROUND_POWER_OF_TWO(dc, 5);
+  for (row = 0; row < 8; ++row, line += stride)
+    for (col = 0; col < 8; ++col)
+      line[col] = highbd_clip_pixel_add(line[col], delta, bd);
+}
+
+void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // wide intermediates
+
+  tran_low_t x0 = input[0];
+  tran_low_t x1 = input[1];
+  tran_low_t x2 = input[2];
+  tran_low_t x3 = input[3];
+  (void) bd;  // NOTE(review): bd is also passed to the calls below
+
+  if (!(x0 | x1 | x2 | x3)) {  // all-zero input: emit all-zero output
+    memset(output, 0, 4 * sizeof(*output));
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = (tran_high_t)(x0 - x2 + x3);  // sum is formed before the widening cast
+
+  s0 = s0 + s3 + s5;
+  s1 = s1 - s4 - s6;
+  s3 = s2;  // keep the x1 term in s3 before s2 is overwritten
+  s2 = sinpi_3_9 * s7;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
+  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
+  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
+  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
+}
+
+void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // wide intermediates
+
+  tran_low_t x0 = input[7];  // inputs are loaded in a permuted order
+  tran_low_t x1 = input[0];
+  tran_low_t x2 = input[5];
+  tran_low_t x3 = input[2];
+  tran_low_t x4 = input[3];
+  tran_low_t x5 = input[4];
+  tran_low_t x6 = input[1];
+  tran_low_t x7 = input[6];
+  (void) bd;  // NOTE(review): bd is also passed to the calls below
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {  // all-zero input
+    memset(output, 0, 8 * sizeof(*output));
+    return;
+  }
+
+  // stage 1
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
+  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
+
+  // stage 2: x0..x3 pass through unscaled; x4..x7 are rotated
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = WRAPLOW(s0 + s2, bd);  // unscaled sums: no round shift applied
+  x1 = WRAPLOW(s1 + s3, bd);
+  x2 = WRAPLOW(s0 - s2, bd);
+  x3 = WRAPLOW(s1 - s3, bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+
+  // stage 3: only x2, x3, x6 and x7 are updated here
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+
+  output[0] = WRAPLOW(x0, bd);  // outputs are permuted; odd indices negated
+  output[1] = WRAPLOW(-x4, bd);
+  output[2] = WRAPLOW(x6, bd);
+  output[3] = WRAPLOW(-x2, bd);
+  output[4] = WRAPLOW(x3, bd);
+  output[5] = WRAPLOW(-x7, bd);
+  output[6] = WRAPLOW(x5, bd);
+  output[7] = WRAPLOW(-x1, bd);
+}
+
+void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  // rows[] starts zeroed: pass 1 fills rows 0..3 only, pass 2 reads all 8.
+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t rows[8 * 8] = { 0 };
+  tran_low_t col_in[8];
+  tran_low_t col_out[8];
+  int r, c;
+
+  // Pass 1 (rows): only the first 4 rows have non-zero coefficients, so
+  // only those are run through the 1-D transform.
+  for (r = 0; r < 4; ++r)
+    vpx_highbd_idct8_c(input + 8 * r, rows + 8 * r, bd);
+
+  // Pass 2 (columns): gather one column of rows[], transform it, then add
+  // ROUND_POWER_OF_TWO(., 5) of each result via highbd_clip_pixel_add.
+  for (c = 0; c < 8; ++c) {
+    for (r = 0; r < 8; ++r)
+      col_in[r] = rows[8 * r + c];
+    vpx_highbd_idct8_c(col_in, col_out, bd);
+    for (r = 0; r < 8; ++r) {
+      uint16_t *const px = &pixels[r * stride + c];
+      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 5), bd);
+    }
+  }
+}
+
+void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[16], step2[16];  // stages alternate between these buffers
+  tran_high_t temp1, temp2;
+  (void) bd;  // NOTE(review): bd is also passed to the calls below
+
+  // stage 1: load the 16 inputs in permuted order (indices written as 2k/2)
+  step1[0] = input[0/2];
+  step1[1] = input[16/2];
+  step1[2] = input[8/2];
+  step1[3] = input[24/2];
+  step1[4] = input[4/2];
+  step1[5] = input[20/2];
+  step1[6] = input[12/2];
+  step1[7] = input[28/2];
+  step1[8] = input[2/2];
+  step1[9] = input[18/2];
+  step1[10] = input[10/2];
+  step1[11] = input[26/2];
+  step1[12] = input[6/2];
+  step1[13] = input[22/2];
+  step1[14] = input[14/2];
+  step1[15] = input[30/2];
+
+  // stage 2: elements 0..7 are copied; pairs (8,15) (9,14) (10,13) (11,12) rotate
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 3: 0..3 copied; pairs (4,7) (5,6) rotate; 8..15 add/subtract
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+  // stage 4: 0..3 rotate, 4..7 add/subtract, pairs (9,14) (10,13) rotate
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5: 0..3 and 8..15 add/subtract; pair (5,6) is scaled by cospi_16_64
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+  // stage 6: 0..7 add/subtract; pairs (10,13) (11,12) scaled by cospi_16_64
+  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7: output[k] and output[15 - k] are step2[k] +/- step2[15 - k]
+  output[0] = WRAPLOW(step2[0] + step2[15], bd);
+  output[1] = WRAPLOW(step2[1] + step2[14], bd);
+  output[2] = WRAPLOW(step2[2] + step2[13], bd);
+  output[3] = WRAPLOW(step2[3] + step2[12], bd);
+  output[4] = WRAPLOW(step2[4] + step2[11], bd);
+  output[5] = WRAPLOW(step2[5] + step2[10], bd);
+  output[6] = WRAPLOW(step2[6] + step2[9], bd);
+  output[7] = WRAPLOW(step2[7] + step2[8], bd);
+  output[8] = WRAPLOW(step2[7] - step2[8], bd);
+  output[9] = WRAPLOW(step2[6] - step2[9], bd);
+  output[10] = WRAPLOW(step2[5] - step2[10], bd);
+  output[11] = WRAPLOW(step2[4] - step2[11], bd);
+  output[12] = WRAPLOW(step2[3] - step2[12], bd);
+  output[13] = WRAPLOW(step2[2] - step2[13], bd);
+  output[14] = WRAPLOW(step2[1] - step2[14], bd);
+  output[15] = WRAPLOW(step2[0] - step2[15], bd);
+}
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  // Two passes of the 1-D vpx_highbd_idct16_c(): rows into a scratch buffer,
+  // then columns, whose rounded results are added onto the destination.
+  tran_low_t row_pass[16 * 16];
+  tran_low_t col_in[16], col_out[16];
+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);
+  int row, col;
+
+  // Pass 1: transform each of the 16 coefficient rows.
+  for (row = 0; row < 16; ++row)
+    vpx_highbd_idct16_c(input + 16 * row, row_pass + 16 * row, bd);
+
+  // Pass 2: transform each column of the row-pass output.
+  for (col = 0; col < 16; ++col) {
+    for (row = 0; row < 16; ++row)
+      col_in[row] = row_pass[16 * row + col];
+    vpx_highbd_idct16_c(col_in, col_out, bd);
+    // Add ROUND_POWER_OF_TWO(..., 6) of each result to its destination pixel.
+    for (row = 0; row < 16; ++row) {
+      uint16_t *const px = &pixels[row * stride + col];
+      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[row], 6),
+                                  bd);
+    }
+  }
+}
+
+void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+  tran_low_t x0 = input[15];  // the 16 inputs are loaded in a permuted order
+  tran_low_t x1 = input[0];
+  tran_low_t x2 = input[13];
+  tran_low_t x3 = input[2];
+  tran_low_t x4 = input[11];
+  tran_low_t x5 = input[4];
+  tran_low_t x6 = input[9];
+  tran_low_t x7 = input[6];
+  tran_low_t x8 = input[7];
+  tran_low_t x9 = input[8];
+  tran_low_t x10 = input[5];
+  tran_low_t x11 = input[10];
+  tran_low_t x12 = input[3];
+  tran_low_t x13 = input[12];
+  tran_low_t x14 = input[1];
+  tran_low_t x15 = input[14];
+  (void) bd;
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    memset(output, 0, 16 * sizeof(*output));  // all-zero input: zero the output
+    return;
+  }
+
+  // stage 1: weight input pairs by cospi constants; round s[k] +/- s[k + 8].
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
+  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
+  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
+  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
+
+  // stage 2: x0..x7 are copied, x8..x15 weighted; combine s[k] with s[k + 4].
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = WRAPLOW(s0 + s4, bd);
+  x1 = WRAPLOW(s1 + s5, bd);
+  x2 = WRAPLOW(s2 + s6, bd);
+  x3 = WRAPLOW(s3 + s7, bd);
+  x4 = WRAPLOW(s0 - s4, bd);
+  x5 = WRAPLOW(s1 - s5, bd);
+  x6 = WRAPLOW(s2 - s6, bd);
+  x7 = WRAPLOW(s3 - s7, bd);
+  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
+  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
+
+  // stage 3: x4..x7 and x12..x15 are weighted; combine s[k] with s[k + 2].
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+  x0 = WRAPLOW(s0 + s2, bd);
+  x1 = WRAPLOW(s1 + s3, bd);
+  x2 = WRAPLOW(s0 - s2, bd);
+  x3 = WRAPLOW(s1 - s3, bd);
+  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+  x8 = WRAPLOW(s8 + s10, bd);
+  x9 = WRAPLOW(s9 + s11, bd);
+  x10 = WRAPLOW(s8 - s10, bd);
+  x11 = WRAPLOW(s9 - s11, bd);
+  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
+  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
+
+  // stage 4: scale pair sums/differences of x2/x3, x6/x7, x10/x11, x14/x15.
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (-x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (-x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
+  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
+  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
+  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
+
+  output[0] = WRAPLOW(x0, bd);  // results are stored permuted, some negated
+  output[1] = WRAPLOW(-x8, bd);
+  output[2] = WRAPLOW(x12, bd);
+  output[3] = WRAPLOW(-x4, bd);
+  output[4] = WRAPLOW(x6, bd);
+  output[5] = WRAPLOW(x14, bd);
+  output[6] = WRAPLOW(x10, bd);
+  output[7] = WRAPLOW(x2, bd);
+  output[8] = WRAPLOW(x3, bd);
+  output[9] = WRAPLOW(x11, bd);
+  output[10] = WRAPLOW(x15, bd);
+  output[11] = WRAPLOW(x7, bd);
+  output[12] = WRAPLOW(x5, bd);
+  output[13] = WRAPLOW(-x13, bd);
+  output[14] = WRAPLOW(x9, bd);
+  output[15] = WRAPLOW(-x1, bd);
+}
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int bd) {
+  // Scratch for the row pass; zero-filled because only four rows are written.
+  tran_low_t row_pass[16 * 16] = { 0 };
+  tran_low_t col_in[16], col_out[16];
+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);
+  int row, col;
+
+  // Row pass. All non-zero dct coefficients sit in the upper-left 4x4 area,
+  // so rows 4..15 are left as zeros instead of being transformed.
+  for (row = 0; row < 4; ++row)
+    vpx_highbd_idct16_c(input + 16 * row, row_pass + 16 * row, bd);
+
+  // Column pass over the full width.
+  for (col = 0; col < 16; ++col) {
+    for (row = 0; row < 16; ++row)
+      col_in[row] = row_pass[16 * row + col];
+    vpx_highbd_idct16_c(col_in, col_out, bd);
+
+    // Add ROUND_POWER_OF_TWO(..., 6) of each result to its destination pixel.
+    for (row = 0; row < 16; ++row) {
+      uint16_t *const px = &pixels[row * stride + col];
+      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[row], 6),
+                                  bd);
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bd) {
+  // DC-only path: scale input[0] by cospi_16_64 twice, then add the rounded
+  // value to every pixel of the 16x16 block.
+  uint16_t *line = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t dc = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_high_t delta;
+  int x, y;
+  dc = WRAPLOW(highbd_dct_const_round_shift(dc * cospi_16_64, bd), bd);
+  delta = ROUND_POWER_OF_TWO(dc, 6);
+  for (y = 0; y < 16; ++y, line += stride) {
+    for (x = 0; x < 16; ++x)
+      line[x] = highbd_clip_pixel_add(line[x], delta, bd);
+  }
+}
+
+static void highbd_idct32_c(const tran_low_t *input,
+                            tran_low_t *output, int bd) {
+  tran_low_t step1[32], step2[32];  // stages alternate between these buffers
+  tran_high_t temp1, temp2;  // cospi-weighted sums, before rounding
+  (void) bd;
+
+  // stage 1: copy even-indexed inputs (permuted); weight odd-indexed pairs.
+  step1[0] = input[0];
+  step1[1] = input[16];
+  step1[2] = input[8];
+  step1[3] = input[24];
+  step1[4] = input[4];
+  step1[5] = input[20];
+  step1[6] = input[12];
+  step1[7] = input[28];
+  step1[8] = input[2];
+  step1[9] = input[18];
+  step1[10] = input[10];
+  step1[11] = input[26];
+  step1[12] = input[6];
+  step1[13] = input[22];
+  step1[14] = input[14];
+  step1[15] = input[30];
+
+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  // stage 2: copy 0..7; weighted 8..15; add/sub 16..31.
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
+  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
+  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
+  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
+  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
+  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
+  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
+  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
+  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
+  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
+  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
+  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
+  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
+  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
+  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
+  step2[31] = WRAPLOW(step1[30] + step1[31], bd);
+
+  // stage 3: copy 0..3; weighted 4..7; add/sub 8..15; copy/weighted 16..31.
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  // stage 4: weighted 0..3; add/sub 4..7 and 16..31; copy/weighted 8..15.
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
+  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
+  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
+  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
+  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
+  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
+  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
+  step2[23] = WRAPLOW(step1[20] + step1[23], bd);
+
+  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
+  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
+  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
+  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
+  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
+  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
+  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
+  step2[31] = WRAPLOW(step1[28] + step1[31], bd);
+
+  // stage 5: add/sub 0..3 and 8..15; copy/weighted 4..7 and 16..31.
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6: add/sub 0..7 and 16..31; copy/weighted 8..15.
+  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
+  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
+  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
+  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
+  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
+  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
+  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
+  step2[23] = WRAPLOW(step1[16] - step1[23], bd);
+
+  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
+  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
+  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
+  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
+  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
+  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
+  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
+  step2[31] = WRAPLOW(step1[24] + step1[31], bd);
+
+  // stage 7: add/sub 0..15; copy/weighted 16..31.
+  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
+  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
+  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
+  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
+  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
+  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
+  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
+  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
+  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
+  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
+  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
+  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
+  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
+  step1[15] = WRAPLOW(step2[0] - step2[15], bd);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  step1[18] = step2[18];
+  step1[19] = step2[19];
+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[28] = step2[28];
+  step1[29] = step2[29];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // final stage: output[k], output[31 - k] = step1[k] +/- step1[31 - k].
+  output[0] = WRAPLOW(step1[0] + step1[31], bd);
+  output[1] = WRAPLOW(step1[1] + step1[30], bd);
+  output[2] = WRAPLOW(step1[2] + step1[29], bd);
+  output[3] = WRAPLOW(step1[3] + step1[28], bd);
+  output[4] = WRAPLOW(step1[4] + step1[27], bd);
+  output[5] = WRAPLOW(step1[5] + step1[26], bd);
+  output[6] = WRAPLOW(step1[6] + step1[25], bd);
+  output[7] = WRAPLOW(step1[7] + step1[24], bd);
+  output[8] = WRAPLOW(step1[8] + step1[23], bd);
+  output[9] = WRAPLOW(step1[9] + step1[22], bd);
+  output[10] = WRAPLOW(step1[10] + step1[21], bd);
+  output[11] = WRAPLOW(step1[11] + step1[20], bd);
+  output[12] = WRAPLOW(step1[12] + step1[19], bd);
+  output[13] = WRAPLOW(step1[13] + step1[18], bd);
+  output[14] = WRAPLOW(step1[14] + step1[17], bd);
+  output[15] = WRAPLOW(step1[15] + step1[16], bd);
+  output[16] = WRAPLOW(step1[15] - step1[16], bd);
+  output[17] = WRAPLOW(step1[14] - step1[17], bd);
+  output[18] = WRAPLOW(step1[13] - step1[18], bd);
+  output[19] = WRAPLOW(step1[12] - step1[19], bd);
+  output[20] = WRAPLOW(step1[11] - step1[20], bd);
+  output[21] = WRAPLOW(step1[10] - step1[21], bd);
+  output[22] = WRAPLOW(step1[9] - step1[22], bd);
+  output[23] = WRAPLOW(step1[8] - step1[23], bd);
+  output[24] = WRAPLOW(step1[7] - step1[24], bd);
+  output[25] = WRAPLOW(step1[6] - step1[25], bd);
+  output[26] = WRAPLOW(step1[5] - step1[26], bd);
+  output[27] = WRAPLOW(step1[4] - step1[27], bd);
+  output[28] = WRAPLOW(step1[3] - step1[28], bd);
+  output[29] = WRAPLOW(step1[2] - step1[29], bd);
+  output[30] = WRAPLOW(step1[1] - step1[30], bd);
+  output[31] = WRAPLOW(step1[0] - step1[31], bd);
+}
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+                                     int stride, int bd) {
+  tran_low_t row_pass[32 * 32];
+  tran_low_t col_in[32], col_out[32];
+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);
+  int row, col;
+
+  // Rows
+  for (row = 0; row < 32; ++row) {
+    const tran_low_t *const coeffs = input + 32 * row;
+    tran_low_t *const row_out = row_pass + 32 * row;
+    // OR all 32 coefficients together; the result is zero only when every
+    // coefficient in this row is zero.
+    tran_low_t any_set = 0;
+    int k;
+    for (k = 0; k < 32; ++k)
+      any_set |= coeffs[k];
+
+    // An all-zero row is written as zeros directly, skipping the 1-D
+    // transform call.
+    if (any_set) {
+      highbd_idct32_c(coeffs, row_out, bd);
+    } else {
+      memset(row_out, 0, sizeof(tran_low_t) * 32);
+    }
+  }
+
+  // Columns
+  for (col = 0; col < 32; ++col) {
+    for (row = 0; row < 32; ++row)
+      col_in[row] = row_pass[32 * row + col];
+    highbd_idct32_c(col_in, col_out, bd);
+    for (row = 0; row < 32; ++row) {
+      uint16_t *const px = &pixels[row * stride + col];
+      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[row], 6),
+                                  bd);
+    }
+  }
+}
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int bd) {
+  // Zero-filled so the 24 rows skipped below read back as zeros.
+  tran_low_t row_pass[32 * 32] = {0};
+  tran_low_t col_in[32], col_out[32];
+  uint16_t *const pixels = CONVERT_TO_SHORTPTR(dest8);
+  int row, col;
+
+  // Rows: only the upper-left 8x8 has non-zero coeff, so transform 8 rows.
+  for (row = 0; row < 8; ++row)
+    highbd_idct32_c(input + 32 * row, row_pass + 32 * row, bd);
+
+  // Columns: all 32 are transformed and accumulated into the destination.
+  for (col = 0; col < 32; ++col) {
+    for (row = 0; row < 32; ++row)
+      col_in[row] = row_pass[32 * row + col];
+    highbd_idct32_c(col_in, col_out, bd);
+
+    // Add ROUND_POWER_OF_TWO(..., 6) of each result to its destination pixel.
+    for (row = 0; row < 32; ++row) {
+      uint16_t *const px = &pixels[row * stride + col];
+      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[row], 6),
+                                  bd);
+    }
+  }
+}
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bd) {
+  // DC-only path: scale input[0] by cospi_16_64 twice, then add the rounded
+  // value to every pixel of the 32x32 block.
+  uint16_t *line = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t dc = WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  int delta;
+  int x, y;
+
+  dc = WRAPLOW(highbd_dct_const_round_shift(dc * cospi_16_64, bd), bd);
+  delta = ROUND_POWER_OF_TWO(dc, 6);
+  for (y = 0; y < 32; ++y, line += stride) {
+    for (x = 0; x < 32; ++x)
+      line[x] = highbd_clip_pixel_add(line[x], delta, bd);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vpx_dsp/inv_txfm.h b/libs/libvpx/vpx_dsp/inv_txfm.h
new file mode 100644
index 0000000000..23588139ed
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/inv_txfm.h
@@ -0,0 +1,123 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_INV_TXFM_H_
+#define VPX_DSP_INV_TXFM_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE tran_low_t check_range(tran_high_t input) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  // For valid VP9 input streams, intermediate stage coefficients should always
+  // stay within the range of a signed 16 bit integer. Coefficients can go out
+  // of this range for invalid/corrupt VP9 streams. However, strictly checking
+  // this range for every intermediate coefficient can be burdensome for a
+  // decoder, therefore the following assertion is only enabled when configured
+  // with --enable-coefficient-range-checking.
+  assert(INT16_MIN <= input);
+  assert(input <= INT16_MAX);
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+  return (tran_low_t)input;  // narrowing cast; no clamping is applied
+}
+
+static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+  // Apply ROUND_POWER_OF_TWO with DCT_CONST_BITS, then range-check the result.
+  return check_range(ROUND_POWER_OF_TWO(input, DCT_CONST_BITS));
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t highbd_check_range(tran_high_t input,
+                                            int bd) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+  // stay within the ranges:
+  // - 8 bit: signed 16 bit integer
+  // - 10 bit: signed 18 bit integer
+  // - 12 bit: signed 20 bit integer
+  const int32_t int_max = (1 << (7 + bd)) - 1;  // max signed (8 + bd)-bit value
+  const int32_t int_min = -int_max - 1;
+  assert(int_min <= input);
+  assert(input <= int_max);
+  (void) int_min;  // presumably silences a warning when assert() is compiled out
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+  (void) bd;  // bd is only read inside the range-checking block above
+  return (tran_low_t)input;  // narrowing cast; no clamping is applied
+}
+
+static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
+                                                      int bd) {
+  // Apply ROUND_POWER_OF_TWO with DCT_CONST_BITS, then the bd range check.
+  return highbd_check_range(ROUND_POWER_OF_TWO(input, DCT_CONST_BITS), bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_EMULATE_HARDWARE
+// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
+// non-normative method to handle overflows. A stream that causes
+// overflows in the inverse transform is considered invalid in VP9,
+// and a hardware implementer is free to choose any reasonable
+// method to handle overflows. However to aid in hardware
+// verification they can use a specific implementation of the
+// WRAPLOW() macro below that is identical to their intended
+// hardware implementation (and also use configure options to trigger
+// the C-implementation of the transform).
+//
+// The particular WRAPLOW implementation below performs strict
+// overflow wrapping to match common hardware implementations:
+// bd of x keeps 8+x bits of tran_low_t, i.e. removes 24-x bits
+// (bd 8: keep 16, remove 16; bd 10: keep 18, remove 14; bd 12: keep 20,
+// remove 12). The left shift is done on uint32_t so that discarding the
+// high bits is not signed overflow; both macro arguments are parenthesized.
+#define WRAPLOW(x, bd) ((int32_t)((uint32_t)(x) << (24 - (bd))) >> (24 - (bd)))
+#else
+#define WRAPLOW(x, bd) ((int32_t)(x))
+#endif  // CONFIG_EMULATE_HARDWARE
+
+void idct4_c(const tran_low_t *input, tran_low_t *output);
+void idct8_c(const tran_low_t *input, tran_low_t *output);
+void idct16_c(const tran_low_t *input, tran_low_t *output);
+void idct32_c(const tran_low_t *input, tran_low_t *output);
+void iadst4_c(const tran_low_t *input, tran_low_t *output);
+void iadst8_c(const tran_low_t *input, tran_low_t *output);
+void iadst16_c(const tran_low_t *input, tran_low_t *output);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
+
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+                                             int bd) {
+  const tran_high_t sum = dest + (tran_high_t)WRAPLOW(trans, bd);
+  return clip_pixel_highbd(WRAPLOW(sum, bd), bd);  // wrap again, then clip
+}
+#endif
+
+static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
+  const tran_high_t sum = dest + (tran_high_t)WRAPLOW(trans, 8);
+  return clip_pixel(WRAPLOW(sum, 8));  // wrap again, then clip_pixel()
+}
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_INV_TXFM_H_
diff --git a/libs/libvpx/vpx_dsp/loopfilter.c b/libs/libvpx/vpx_dsp/loopfilter.c
new file mode 100644
index 0000000000..66f4d9576c
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/loopfilter.c
@@ -0,0 +1,745 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+static INLINE int8_t signed_char_clamp(int t) {  // narrow t to int8_t
+  return (int8_t)clamp(t, -128, 127);  // bounds are the int8_t range
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int16_t signed_char_clamp_high(int t, int bd) {
+  // Clamp range by bit depth: 10 -> [-512, 511], 12 -> [-2048, 2047];
+  // 8 and every other value of bd -> [-128, 127].
+  int scale = 1;
+  if (bd == 10) {
+    scale = 4;
+  } else if (bd == 12) {
+    scale = 16;
+  }
+  return (int16_t)clamp(t, -128 * scale, 128 * scale - 1);
+}
+#endif
+
+// Should any filter be applied at all? Returns all ones (-1) for yes and 0
+// for no.
+static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
+                                 uint8_t p3, uint8_t p2,
+                                 uint8_t p1, uint8_t p0,
+                                 uint8_t q0, uint8_t q1,
+                                 uint8_t q2, uint8_t q3) {
+  // Every step between neighbouring taps must stay within limit, and the
+  // weighted step across the edge itself within blimit.
+  const int taps_ok =
+      abs(p3 - p2) <= limit && abs(p2 - p1) <= limit &&
+      abs(p1 - p0) <= limit && abs(q1 - q0) <= limit &&
+      abs(q2 - q1) <= limit && abs(q3 - q2) <= limit;
+  const int edge_ok = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
+  return (int8_t)((taps_ok && edge_ok) ? -1 : 0);
+}
+
+// Flatness test around the edge: all ones (-1) when every listed tap is
+// within thresh of the edge pixel on its own side, otherwise 0.
+static INLINE int8_t flat_mask4(uint8_t thresh,
+                                uint8_t p3, uint8_t p2,
+                                uint8_t p1, uint8_t p0,
+                                uint8_t q0, uint8_t q1,
+                                uint8_t q2, uint8_t q3) {
+  // The p taps are measured against p0 and the q taps against q0.
+  const int p_flat = abs(p1 - p0) <= thresh && abs(p2 - p0) <= thresh &&
+                     abs(p3 - p0) <= thresh;
+  const int q_flat = abs(q1 - q0) <= thresh && abs(q2 - q0) <= thresh &&
+                     abs(q3 - q0) <= thresh;
+  return (int8_t)((p_flat && q_flat) ? -1 : 0);
+}
+
+// flat_mask4 plus one further tap per side (p4, q4); same -1 / 0 result.
+static INLINE int8_t flat_mask5(uint8_t thresh,
+                                uint8_t p4, uint8_t p3,
+                                uint8_t p2, uint8_t p1,
+                                uint8_t p0, uint8_t q0,
+                                uint8_t q1, uint8_t q2,
+                                uint8_t q3, uint8_t q4) {
+  const int inner = flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) != 0;
+  const int outer = abs(p4 - p0) <= thresh && abs(q4 - q0) <= thresh;
+  return (int8_t)((inner && outer) ? -1 : 0);
+}
+
+// High edge variance test: all ones (-1) when the first step on either side
+// (p1 to p0, or q1 to q0) exceeds thresh, otherwise 0.
+static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
+                              uint8_t q0, uint8_t q1) {
+  const int p_jump = abs(p1 - p0) > thresh;
+  const int q_jump = abs(q1 - q0) > thresh;
+  return (int8_t)((p_jump || q_jump) ? -1 : 0);
+}
+
+static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
+                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
+  int8_t filter1, filter2;
+  // 4-tap filter on p1 p0 | q0 q1; mask and hev are all-ones or zero masks.
+  const int8_t ps1 = (int8_t) *op1 ^ 0x80;  // recentre 0..255 to -128..127
+  const int8_t ps0 = (int8_t) *op0 ^ 0x80;
+  const int8_t qs0 = (int8_t) *oq0 ^ 0x80;
+  const int8_t qs1 = (int8_t) *oq1 ^ 0x80;
+  const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);  // 0xff or 0
+
+  // add outer taps if we have high edge variance
+  int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
+
+  // inner taps; a zero mask forces the whole adjustment to zero
+  filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
+
+  // save bottom 3 bits so that we round one side +4 and the other +3
+  // if it equals 4 we'll set to adjust by -1 to account for the fact
+  // we'd round 3 the other way
+  filter1 = signed_char_clamp(filter + 4) >> 3;
+  filter2 = signed_char_clamp(filter + 3) >> 3;
+  // Subtract from q0 and add to p0, then undo the 0x80 offset.
+  *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
+  *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
+
+  // outer tap adjustments, applied only where hev is zero
+  filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+  *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
+  *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+}
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
+                            const uint8_t *blimit, const uint8_t *limit,
+                            const uint8_t *thresh, int count) {
+  // Visits 8 * count columns of a horizontal edge. s addresses q0 and rows
+  // are p bytes apart: p3..p0 are s[-4 * p]..s[-p], q1..q3 follow s.
+  int remaining = 8 * count;
+
+  for (; remaining > 0; --remaining, ++s) {
+    uint8_t *const op1 = s - 2 * p;
+    uint8_t *const op0 = s - p;
+    uint8_t *const oq1 = s + p;
+    const int8_t mask = filter_mask(*limit, *blimit, s[-4 * p], s[-3 * p],
+                                    *op1, *op0, *s, *oq1, s[2 * p], s[3 * p]);
+    filter4(mask, *thresh, op1, op0, s, oq1);
+  }
+}
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0, const uint8_t *thresh0,
+                                 const uint8_t *blimit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {  // two 8-px halves
+  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);  // columns 0..7
+  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);  // 8..15
+}
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh,
+                          int count) {
+  // Visits 8 * count rows of a vertical edge; s addresses q0 of the first
+  // row, rows are pitch bytes apart, and p3..p0 sit at s[-4]..s[-1].
+  int remaining = 8 * count;
+
+  while (remaining > 0) {
+    const int8_t mask = filter_mask(*limit, *blimit, s[-4], s[-3], s[-2],
+                                    s[-1], s[0], s[1], s[2], s[3]);
+    // filter4() may rewrite p1, p0, q0 and q1 in place.
+    filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
+    s += pitch;
+    --remaining;
+  }
+}
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1) {  // two 8-row halves
+  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);  // rows 0..7
+  vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,  // rows 8..15
+                                  thresh1, 1);
+}
+
+static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+                           uint8_t *op3, uint8_t *op2,
+                           uint8_t *op1, uint8_t *op0,
+                           uint8_t *oq0, uint8_t *oq1,
+                           uint8_t *oq2, uint8_t *oq3) {
+  if (flat && mask) {  // flat area on an edge that is being filtered
+    const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+    // Inputs are copied first so every output is built from unfiltered pixels.
+    // 7-tap filter [1, 1, 1, 2, 1, 1, 1]; p3 and q3 are read but never written
+    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+  } else {  // not flat, or masked off: defer to the 4-tap filter
+    filter4(mask, thresh, op1,  op0, oq0, oq1);
+  }
+}
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
+                            const uint8_t *limit, const uint8_t *thresh,
+                            int count) {
+  // Visits 8 * count columns of a horizontal edge. s addresses q0 and rows
+  // are p bytes apart, so p3..p0 are s[-4 * p]..s[-p] and q1..q3 follow s.
+  int remaining = 8 * count;
+
+  for (; remaining > 0; --remaining, ++s) {
+    uint8_t *const op3 = s - 4 * p, *const op2 = s - 3 * p;
+    uint8_t *const op1 = s - 2 * p, *const op0 = s - p;
+    uint8_t *const oq1 = s + p, *const oq2 = s + 2 * p;
+    uint8_t *const oq3 = s + 3 * p;
+    const int8_t mask = filter_mask(*limit, *blimit, *op3, *op2, *op1, *op0,
+                                    *s, *oq1, *oq2, *oq3);
+    const int8_t flat = flat_mask4(1, *op3, *op2, *op1, *op0,
+                                   *s, *oq1, *oq2, *oq3);
+    filter8(mask, *thresh, flat, op3, op2, op1, op0, s, oq1, oq2, oq3);
+  }
+}
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0, const uint8_t *thresh0,
+                                 const uint8_t *blimit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {  // two 8-px halves
+  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);  // columns 0..7
+  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);  // 8..15
+}
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh,
+                          int count) {
+  // Visits 8 * count rows of a vertical edge; s addresses q0 of the first
+  // row and p3..p0 / q1..q3 are its neighbours within the same row.
+  int remaining = 8 * count;
+
+  for (; remaining > 0; --remaining, s += pitch) {
+    const int8_t mask = filter_mask(*limit, *blimit, s[-4], s[-3], s[-2],
+                                    s[-1], s[0], s[1], s[2], s[3]);
+    const int8_t flat = flat_mask4(1, s[-4], s[-3], s[-2], s[-1],
+                                   s[0], s[1], s[2], s[3]);
+    filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1,
+            s, s + 1, s + 2, s + 3);
+  }
+}
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1) {  // two 8-row halves
+  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);  // rows 0..7
+  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,  // rows 8..15
+                                    thresh1, 1);
+}
+
+static INLINE void filter16(int8_t mask, uint8_t thresh,
+                            uint8_t flat, uint8_t flat2,
+                            uint8_t *op7, uint8_t *op6,
+                            uint8_t *op5, uint8_t *op4,
+                            uint8_t *op3, uint8_t *op2,
+                            uint8_t *op1, uint8_t *op0,
+                            uint8_t *oq0, uint8_t *oq1,
+                            uint8_t *oq2, uint8_t *oq3,
+                            uint8_t *oq4, uint8_t *oq5,
+                            uint8_t *oq6, uint8_t *oq7) {
+  if (flat2 && flat && mask) {  // wide flat area on a filtered edge
+    const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
+                  p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+    // Both sides are copied before anything is written back.
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
+                  q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
+    // Every weighted sum below totals 16; p7 and q7 are repeated as padding.
+    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0, 4);
+    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1, 4);
+    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2, 4);
+    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3, 4);
+    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4, 4);
+    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5, 4);
+    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
+    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
+    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
+    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
+    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+    *oq6 = ROUND_POWER_OF_TWO(p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
+  } else {  // otherwise defer to filter8, which only needs p3..q3
+    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
+  }
+}
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh,
+                             int count) {
+  // Visits 8 * count columns of a horizontal edge, using eight rows on each
+  // side of it: s addresses q0 and rows are p bytes apart.
+  int remaining = 8 * count;
+
+  for (; remaining > 0; --remaining, ++s) {
+    uint8_t *px[16];  // px[0..7] address p7..p0, px[8..15] address q0..q7
+    int k;
+
+    for (k = 0; k < 16; ++k) px[k] = s + (k - 8) * p;
+    const int8_t mask = filter_mask(*limit, *blimit, *px[4], *px[5], *px[6],
+                                    *px[7], *px[8], *px[9], *px[10], *px[11]);
+    const int8_t flat = flat_mask4(1, *px[4], *px[5], *px[6], *px[7],
+                                   *px[8], *px[9], *px[10], *px[11]);
+    const int8_t flat2 = flat_mask5(1, *px[0], *px[1], *px[2], *px[3],
+                                    *px[7], *px[8], *px[12], *px[13],
+                                    *px[14], *px[15]);
+
+    filter16(mask, *thresh, flat, flat2,
+             px[0], px[1], px[2], px[3], px[4], px[5], px[6], px[7],
+             px[8], px[9], px[10], px[11], px[12], px[13], px[14], px[15]);
+  }
+}
+
+static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
+                                   const uint8_t *blimit,
+                                   const uint8_t *limit,
+                                   const uint8_t *thresh,
+                                   int count) {
+  // Visits count rows (not 8 * count) of a vertical edge with eight pixels
+  // on each side; s addresses q0 and rows are p bytes apart.
+  int rows_left = count;
+
+  for (; rows_left > 0; --rows_left, s += p) {
+    const int8_t mask = filter_mask(*limit, *blimit, s[-4], s[-3], s[-2],
+                                    s[-1], s[0], s[1], s[2], s[3]);
+    const int8_t flat = flat_mask4(1, s[-4], s[-3], s[-2], s[-1],
+                                   s[0], s[1], s[2], s[3]);
+    const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], s[-1],
+                                    s[0], s[4], s[5], s[6], s[7]);
+
+    filter16(mask, *thresh, flat, flat2,
+             s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+             s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
+  }
+}
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh) {
+  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);  // filter 8 rows
+}
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh) {
+  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);  // filter 16 rows
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Should any filter be applied at all? Returns all ones (-1) for yes and 0
+// for no. limit and blimit are given at 8-bit scale and are shifted left by
+// (bd - 8) before being compared with the pixel differences.
+static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
+                                        uint16_t p3, uint16_t p2,
+                                        uint16_t p1, uint16_t p0,
+                                        uint16_t q0, uint16_t q1,
+                                        uint16_t q2, uint16_t q3, int bd) {
+  const int16_t limit16 = (uint16_t)limit << (bd - 8);
+  const int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+  // Neighbouring taps are checked against limit16, the edge against blimit16.
+  const int taps_ok =
+      abs(p3 - p2) <= limit16 && abs(p2 - p1) <= limit16 &&
+      abs(p1 - p0) <= limit16 && abs(q1 - q0) <= limit16 &&
+      abs(q2 - q1) <= limit16 && abs(q3 - q2) <= limit16;
+  const int edge_ok = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit16;
+  return (int8_t)((taps_ok && edge_ok) ? -1 : 0);
+}
+
+// Flatness test around the edge: all ones (-1) when every listed tap is
+// within the scaled threshold of the edge pixel on its own side, else 0.
+static INLINE int8_t highbd_flat_mask4(uint8_t thresh,
+                                       uint16_t p3, uint16_t p2,
+                                       uint16_t p1, uint16_t p0,
+                                       uint16_t q0, uint16_t q1,
+                                       uint16_t q2, uint16_t q3, int bd) {
+  const int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  // The p taps are measured against p0 and the q taps against q0.
+  const int p_flat = abs(p1 - p0) <= thresh16 && abs(p2 - p0) <= thresh16 &&
+                     abs(p3 - p0) <= thresh16;
+  const int q_flat = abs(q1 - q0) <= thresh16 && abs(q2 - q0) <= thresh16 &&
+                     abs(q3 - q0) <= thresh16;
+  return (int8_t)((p_flat && q_flat) ? -1 : 0);
+}
+
+// highbd_flat_mask4 plus one further tap per side (p4, q4); same -1 / 0 result.
+static INLINE int8_t highbd_flat_mask5(uint8_t thresh,
+                                       uint16_t p4, uint16_t p3,
+                                       uint16_t p2, uint16_t p1,
+                                       uint16_t p0, uint16_t q0,
+                                       uint16_t q1, uint16_t q2,
+                                       uint16_t q3, uint16_t q4, int bd) {
+  const int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  const int outer = abs(p4 - p0) <= thresh16 && abs(q4 - q0) <= thresh16;
+  if (!highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd)) return 0;
+  return (int8_t)(outer ? -1 : 0);
+}
+
+// High edge variance test: returns all ones (-1 as int16_t) when the first
+// step on either side (p1 to p0, or q1 to q0) exceeds the threshold scaled
+// by (bd - 8) bits, otherwise 0.
+static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
+                                      uint16_t q0, uint16_t q1, int bd) {
+  const int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  const int p_jump = abs(p1 - p0) > thresh16;
+  const int q_jump = abs(q1 - q0) > thresh16;
+  return (int16_t)((p_jump || q_jump) ? -1 : 0);
+}
+
+static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
+                                  uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+                                  int bd) {
+  int16_t filter1, filter2;
+  // The 8-bit filter XORs pixels with 0x80, i.e. subtracts 0x80 to re-centre
+  // 0..255 on zero; here the offset subtracted is 0x80 << shift instead.
+  int shift = bd - 8;  // extra bits beyond 8-bit pixels
+  const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
+  const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
+  const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
+  const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
+  const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
+
+  // Add outer taps if we have high edge variance (hev is 0xffff or 0).
+  int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
+
+  // Inner taps; a zero mask forces the whole adjustment to zero.
+  filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
+
+  // Save bottom 3 bits so that we round one side +4 and the other +3
+  // if it equals 4 we'll set to adjust by -1 to account for the fact
+  // we'd round 3 the other way.
+  filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
+  filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
+  // Subtract from q0 and add to p0, then restore the 0x80 << shift offset.
+  *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
+  *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
+
+  // Outer tap adjustments, applied only where hev is zero.
+  filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+  *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
+  *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
+}
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+                                   const uint8_t *blimit, const uint8_t *limit,
+                                   const uint8_t *thresh, int count, int bd) {
+  // Visits 8 * count columns of a horizontal edge in 16-bit pixel storage.
+  // s addresses q0 and rows are p elements apart; p3..p0 are at negative
+  // multiples of p and q1..q3 at positive ones.
+  int remaining = 8 * count;
+
+  for (; remaining > 0; --remaining, ++s) {
+    uint16_t *const op3 = s - 4 * p;
+    uint16_t *const op2 = s - 3 * p;
+    uint16_t *const op1 = s - 2 * p;
+    uint16_t *const op0 = s - p;
+    uint16_t *const oq1 = s + p;
+    uint16_t *const oq2 = s + 2 * p;
+    uint16_t *const oq3 = s + 3 * p;
+    const int8_t mask = highbd_filter_mask(*limit, *blimit, *op3, *op2, *op1,
+                                           *op0, *s, *oq1, *oq2, *oq3, bd);
+    highbd_filter4(mask, *thresh, op1, op0, s, oq1, bd);
+  }
+}
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
+                                        const uint8_t *blimit0,  // columns 0..7
+                                        const uint8_t *limit0,
+                                        const uint8_t *thresh0,
+                                        const uint8_t *blimit1,  // columns 8..15
+                                        const uint8_t *limit1,
+                                        const uint8_t *thresh1,
+                                        int bd) {  // bd is forwarded unchanged
+  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+}
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh,
+                                 int count, int bd) {
+  // Visits 8 * count rows of a vertical edge in 16-bit pixel storage. s
+  // addresses q0 of the first row, rows are pitch elements apart, and
+  // p3..p0 sit at s[-4]..s[-1].
+  int remaining = 8 * count;
+
+  for (; remaining > 0; --remaining) {
+    const int8_t mask = highbd_filter_mask(*limit, *blimit, s[-4], s[-3], s[-2],
+                                           s[-1], s[0], s[1], s[2], s[3], bd);
+    // highbd_filter4() may rewrite p1, p0, q0 and q1 in place.
+    highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
+    s += pitch;
+  }
+}
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
+                                      const uint8_t *blimit0,  // rows 0..7
+                                      const uint8_t *limit0,
+                                      const uint8_t *thresh0,
+                                      const uint8_t *blimit1,  // rows 8..15
+                                      const uint8_t *limit1,
+                                      const uint8_t *thresh1,
+                                      int bd) {  // bd is forwarded unchanged
+  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
+                              thresh1, 1, bd);
+}
+
+static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+                                  uint16_t *op3, uint16_t *op2,
+                                  uint16_t *op1, uint16_t *op0,
+                                  uint16_t *oq0, uint16_t *oq1,
+                                  uint16_t *oq2, uint16_t *oq3, int bd) {
+  if (flat && mask) {  // flat area on an edge that is being filtered
+    const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+    const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+    // Inputs are copied first so every output is built from unfiltered pixels.
+    // 7-tap filter [1, 1, 1, 2, 1, 1, 1]; p3 and q3 are read but never written
+    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+  } else {  // not flat, or masked off: defer to highbd_filter4 (which uses bd)
+    highbd_filter4(mask, thresh, op1,  op0, oq0, oq1, bd);
+  }
+}
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count, int bd) {
+  // Visits 8 * count columns of a horizontal edge in 16-bit pixel storage.
+  // s addresses q0 and rows are p elements apart, so p3..p0 are
+  // s[-4 * p]..s[-p] and q1..q3 follow s.
+  int remaining = 8 * count;
+
+  for (; remaining > 0; --remaining, ++s) {
+    uint16_t *const op3 = s - 4 * p, *const op2 = s - 3 * p;
+    uint16_t *const op1 = s - 2 * p, *const op0 = s - p;
+    uint16_t *const oq1 = s + p, *const oq2 = s + 2 * p;
+    uint16_t *const oq3 = s + 3 * p;
+    const int8_t mask = highbd_filter_mask(*limit, *blimit, *op3, *op2, *op1,
+                                           *op0, *s, *oq1, *oq2, *oq3, bd);
+    const int8_t flat = highbd_flat_mask4(1, *op3, *op2, *op1, *op0,
+                                          *s, *oq1, *oq2, *oq3, bd);
+    highbd_filter8(mask, *thresh, flat, op3, op2, op1, op0, s, oq1, oq2, oq3,
+                   bd);
+  }
+}
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
+                                        const uint8_t *blimit0,  // columns 0..7
+                                        const uint8_t *limit0,
+                                        const uint8_t *thresh0,
+                                        const uint8_t *blimit1,  // columns 8..15
+                                        const uint8_t *limit1,
+                                        const uint8_t *thresh1,
+                                        int bd) {  // bd is forwarded unchanged
+  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+}
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh,
+                                 int count, int bd) {
+  // Visits 8 * count rows of a vertical edge in 16-bit pixel storage. s
+  // addresses q0 of the first row; p3..p0 and q1..q3 are its neighbours
+  // within the same row, and rows are pitch elements apart.
+  int remaining = 8 * count;
+
+  while (remaining > 0) {
+    const int8_t mask = highbd_filter_mask(*limit, *blimit, s[-4], s[-3], s[-2],
+                                           s[-1], s[0], s[1], s[2], s[3], bd);
+    const int8_t flat = highbd_flat_mask4(1, s[-4], s[-3], s[-2], s[-1],
+                                          s[0], s[1], s[2], s[3], bd);
+    highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1,
+                   s, s + 1, s + 2, s + 3, bd);
+    s += pitch;
+    --remaining;
+  }
+}
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
+                                      const uint8_t *blimit0,  // rows 0..7
+                                      const uint8_t *limit0,
+                                      const uint8_t *thresh0,
+                                      const uint8_t *blimit1,  // rows 8..15
+                                      const uint8_t *limit1,
+                                      const uint8_t *thresh1,
+                                      int bd) {  // bd is forwarded unchanged
+  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
+                              thresh1, 1, bd);
+}
+
+static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
+                                   uint8_t flat, uint8_t flat2,
+                                   uint16_t *op7, uint16_t *op6,
+                                   uint16_t *op5, uint16_t *op4,
+                                   uint16_t *op3, uint16_t *op2,
+                                   uint16_t *op1, uint16_t *op0,
+                                   uint16_t *oq0, uint16_t *oq1,
+                                   uint16_t *oq2, uint16_t *oq3,
+                                   uint16_t *oq4, uint16_t *oq5,
+                                   uint16_t *oq6, uint16_t *oq7, int bd) {
+  if (flat2 && flat && mask) {  // wide flat area on a filtered edge
+    const uint16_t p7 = *op7;  // all sixteen inputs are copied before any write
+    const uint16_t p6 = *op6;
+    const uint16_t p5 = *op5;
+    const uint16_t p4 = *op4;
+    const uint16_t p3 = *op3;
+    const uint16_t p2 = *op2;
+    const uint16_t p1 = *op1;
+    const uint16_t p0 = *op0;
+    const uint16_t q0 = *oq0;
+    const uint16_t q1 = *oq1;
+    const uint16_t q2 = *oq2;
+    const uint16_t q3 = *oq3;
+    const uint16_t q4 = *oq4;
+    const uint16_t q5 = *oq5;
+    const uint16_t q6 = *oq6;
+    const uint16_t q7 = *oq7;
+    // Every weighted sum below totals 16; p7 and q7 are repeated as padding.
+    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0, 4);
+    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1, 4);
+    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2, 4);
+    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3, 4);
+    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4, 4);
+    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5, 4);
+    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
+    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
+    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
+    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
+    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+    *oq6 = ROUND_POWER_OF_TWO(p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
+  } else {  // otherwise defer to highbd_filter8, which only needs p3..q3
+    highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+                   bd);
+  }
+}
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
+                                    const uint8_t *limit, const uint8_t *thresh,
+                                    int count, int bd) {
+  // Visits 8 * count columns of a horizontal edge in 16-bit pixel storage,
+  // using eight rows on each side of it: s addresses q0 and rows are p
+  // elements apart.
+  int remaining = 8 * count;
+
+  for (; remaining > 0; --remaining, ++s) {
+    uint16_t *px[16];  // px[0..7] address p7..p0, px[8..15] address q0..q7
+    int k;
+
+    // Row k - 8 relative to s, for k = 0..15.
+    for (k = 0; k < 16; ++k) {
+      px[k] = s + (k - 8) * p;
+    }
+
+    const int8_t mask = highbd_filter_mask(*limit, *blimit, *px[4], *px[5],
+                                           *px[6], *px[7], *px[8], *px[9],
+                                           *px[10], *px[11], bd);
+    const int8_t flat = highbd_flat_mask4(1, *px[4], *px[5], *px[6], *px[7],
+                                          *px[8], *px[9], *px[10], *px[11], bd);
+    const int8_t flat2 = highbd_flat_mask5(
+        1, *px[0], *px[1], *px[2], *px[3], *px[7],
+        *px[8], *px[12], *px[13], *px[14], *px[15], bd);
+
+    highbd_filter16(mask, *thresh, flat, flat2,
+                    px[0], px[1], px[2], px[3],
+                    px[4], px[5], px[6], px[7],
+                    px[8], px[9], px[10], px[11],
+                    px[12], px[13], px[14], px[15], bd);
+  }
+}
+
+static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
+                                          const uint8_t *blimit,
+                                          const uint8_t *limit,
+                                          const uint8_t *thresh,
+                                          int count, int bd) {
+  // Visits count rows (not 8 * count) of a vertical edge in 16-bit pixel
+  // storage, with eight pixels on each side. s addresses q0 of the first
+  // row, rows are p elements apart, and p7..p0 sit at s[-8]..s[-1] with
+  // q1..q7 at s[1]..s[7].
+  int rows_left = count;
+
+  while (rows_left > 0) {
+    // Edge on/off decision from the inner eight pixels.
+    const int8_t mask = highbd_filter_mask(*limit, *blimit, s[-4], s[-3], s[-2],
+                                           s[-1], s[0], s[1], s[2], s[3], bd);
+    // Flatness of the inner eight, then of the outer four on each side.
+    const int8_t flat = highbd_flat_mask4(1, s[-4], s[-3], s[-2], s[-1],
+                                          s[0], s[1], s[2], s[3], bd);
+    const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], s[-1],
+                                           s[0], s[4], s[5], s[6], s[7], bd);
+
+    highbd_filter16(mask, *thresh, flat, flat2,
+                    s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+                    s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7,
+                    bd);
+
+    s += p;
+    --rows_left;
+  }
+}
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+                                  const uint8_t *limit, const uint8_t *thresh,
+                                  int bd) {  // bd is forwarded unchanged
+  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);  // 8 rows
+}
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh,
+                                       int bd) {  // filters 16 rows in one pass
+  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vpx_dsp/mips/avg_msa.c b/libs/libvpx/vpx_dsp/mips/avg_msa.c
new file mode 100644
index 0000000000..52a24ed379
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/avg_msa.c
@@ -0,0 +1,56 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+uint32_t vpx_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+  v8u16 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+  v4u32 total = { 0 };
+
+  /* Eight vector loads stepping by src_stride, reduced pairwise to acc0. */
+  LD_UB8(src, src_stride, row0, row1, row2, row3, row4, row5, row6, row7);
+  HADD_UB4_UH(row0, row1, row2, row3, acc0, acc1, acc2, acc3);
+  HADD_UB4_UH(row4, row5, row6, row7, acc4, acc5, acc6, acc7);
+  ADD4(acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc0, acc2, acc4, acc6);
+  ADD2(acc0, acc2, acc4, acc6, acc0, acc4);
+  acc0 += acc4;
+
+  /* Horizontal adds across lanes, then a rounding right shift by 6. */
+  total = __msa_hadd_u_w(acc0, acc0);
+  acc0 = (v8u16)__msa_pckev_h((v8i16)total, (v8i16)total);
+  total = __msa_hadd_u_w(acc0, acc0);
+  total = (v4u32)__msa_srari_w((v4i32)total, 6);
+
+  return __msa_copy_u_w((v4i32)total, 0);  /* word lane 0 is the result */
+}
+
+uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
+  uint32_t row0, row1, row2, row3;
+  v16u8 pixels = { 0 };
+  v8u16 halves;
+  v4u32 words;
+  v2u64 dwords;
+
+  /* Four 32-bit loads stepping by src_stride, gathered into one vector. */
+  LW4(src, src_stride, row0, row1, row2, row3);
+  INSERT_W4_UB(row0, row1, row2, row3, pixels);
+
+  /* Widening horizontal adds, then a rounding right shift by 4. */
+  halves = __msa_hadd_u_h(pixels, pixels);
+  words = __msa_hadd_u_w(halves, halves);
+  halves = (v8u16)__msa_pckev_h((v8i16)words, (v8i16)words);
+  words = __msa_hadd_u_w(halves, halves);
+  dwords = __msa_hadd_u_d(words, words);
+  words = (v4u32)__msa_srari_w((v4i32)dwords, 4);
+
+  return __msa_copy_u_w((v4i32)words, 0);  /* word lane 0 is the result */
+}
diff --git a/libs/libvpx/vpx_dsp/mips/common_dspr2.c b/libs/libvpx/vpx_dsp/mips/common_dspr2.c
new file mode 100644
index 0000000000..b22f084a02
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/common_dspr2.c
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+uint8_t vpx_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
+uint8_t *vpx_ff_cropTbl;
+
+void vpx_dsputil_static_init(void) {
+  int idx;
+
+  /* Guard bands: the first CROP_WIDTH entries are 0, the last are 255. */
+  for (idx = 0; idx < CROP_WIDTH; ++idx) {
+    vpx_ff_cropTbl_a[idx] = 0;
+    vpx_ff_cropTbl_a[CROP_WIDTH + 256 + idx] = 255;
+  }
+  /* Identity band in between; the public pointer addresses its entry 0. */
+  for (idx = 0; idx < 256; ++idx) vpx_ff_cropTbl_a[CROP_WIDTH + idx] = idx;
+  vpx_ff_cropTbl = vpx_ff_cropTbl_a + CROP_WIDTH;
+}
+
+#endif
diff --git a/libs/libvpx/vpx_dsp/mips/common_dspr2.h b/libs/libvpx/vpx_dsp/mips/common_dspr2.h
new file mode 100644
index 0000000000..7a10bf1c40
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/common_dspr2.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_COMMON_MIPS_DSPR2_H_
+#define VPX_COMMON_MIPS_DSPR2_H_
+
+#include <assert.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if HAVE_DSPR2
+#define CROP_WIDTH 512
+
+extern uint8_t *vpx_ff_cropTbl;  // From "vpx_dsp/mips/common_dspr2.c"
+
+static INLINE void prefetch_load(const unsigned char *src) {
+  __asm__ __volatile__ (
+      "pref   0,  0(%[src])   \n\t"  /* PREF with hint 0 at address src */
+      :  /* no output operands */
+      : [src] "r" (src)
+  );
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store(unsigned char *dst) {
+  __asm__ __volatile__ (
+      "pref   1,  0(%[dst])   \n\t"  /* PREF with hint 1 at address dst */
+      :  /* no output operands */
+      : [dst] "r" (dst)
+  );
+}
+
+static INLINE void prefetch_load_streamed(const unsigned char *src) {
+  __asm__ __volatile__ (
+      "pref   4,  0(%[src])   \n\t"  /* PREF with hint 4 at address src */
+      :  /* no output operands */
+      : [src] "r" (src)
+  );
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store_streamed(unsigned char *dst) {
+  __asm__ __volatile__ (
+      "pref   5,  0(%[dst])   \n\t"  /* PREF with hint 5 at address dst */
+      :  /* no output operands */
+      : [dst] "r" (dst)
+  );
+}
+#endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_COMMON_MIPS_DSPR2_H_
diff --git a/libs/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c
new file mode 100644
index 0000000000..3c767672fb
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c
@@ -0,0 +1,273 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+/* Vertical 2-tap filter (taps filter_y[3] and filter_y[4]) whose result is
+ * averaged into dst. Each inner iteration handles 4 pixels of one row, with
+ * x stepping by 4 while x < w; the outer loop runs h times. */
+static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+                                         uint8_t *dst, int32_t dst_stride,
+                                         const int16_t *filter_y,
+                                         int32_t w, int32_t h) {
+  int32_t       x, y;
+  const uint8_t *src_ptr;
+  uint8_t       *dst_ptr;
+  uint8_t       *cm = vpx_ff_cropTbl;
+  uint32_t      vector4a = 64;
+  uint32_t      load1, load2;
+  uint32_t      p1, p2;
+  uint32_t      scratch1, scratch2;
+  uint32_t      store1, store2;
+  int32_t       Temp1, Temp2;
+  const int16_t *filter = &filter_y[3];
+  uint32_t      filter45;
+  /* Taps filter_y[3..4] as one word. NOTE(review): verify alignment. */
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_store(dst + dst_stride);
+
+    for (x = 0; x < w; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          /* seed every accumulator with vector4a in lo and zero in hi */
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+          /* widen bytes to halfwords and pair the two rows' samples */
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          /* accumulate the 2-tap dot products for pixels 1 and 2 */
+          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
+          /* same pairing for the other two bytes of each loaded word */
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          /* accumulate the 2-tap dot products for pixels 3 and 4 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
+          /* extract the filtered values for pixels 1 and 2 */
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+          /* current dst bytes that the results are averaged with */
+          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
+          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
+          /* look up cm[Temp], then rounded-average it with the dst byte */
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [p1] "=&r" (p1), [p2] "=&r" (p2),
+            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+            [src_stride] "r" (src_stride), [cm] "r" (cm),
+            [dst_ptr] "r" (dst_ptr)
+          : "memory"  /* the asm loads from src/cm/dst and stores to dst */
+      );
+    }
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+/* Vertical 2-tap filter (taps filter_y[3] and filter_y[4]) averaged into dst
+ * for rows of exactly 64 pixels, 4 pixels per inner iteration, h rows. */
+static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
+                                          int32_t src_stride, uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int16_t *filter_y, int32_t h) {
+  int32_t       x, y;
+  const uint8_t *src_ptr;
+  uint8_t       *dst_ptr;
+  uint8_t       *cm = vpx_ff_cropTbl;
+  uint32_t      vector4a = 64;
+  uint32_t      load1, load2;
+  uint32_t      p1, p2;
+  uint32_t      scratch1, scratch2;
+  uint32_t      store1, store2;
+  int32_t       Temp1, Temp2;
+  const int16_t *filter = &filter_y[3];
+  uint32_t      filter45;
+  /* Taps filter_y[3..4] as one word. NOTE(review): verify alignment. */
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
+
+    for (x = 0; x < 64; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          /* seed every accumulator with vector4a in lo and zero in hi */
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+          /* widen bytes to halfwords and pair the two rows' samples */
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          /* accumulate the 2-tap dot products for pixels 1 and 2 */
+          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
+          /* same pairing for the other two bytes of each loaded word */
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          /* accumulate the 2-tap dot products for pixels 3 and 4 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
+          /* extract the filtered values for pixels 1 and 2 */
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+          /* current dst bytes that the results are averaged with */
+          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
+          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
+          /* look up cm[Temp], then rounded-average it with the dst byte */
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [p1] "=&r" (p1), [p2] "=&r" (p2),
+            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+            [src_stride] "r" (src_stride), [cm] "r" (cm),
+            [dst_ptr] "r" (dst_ptr)
+          : "memory"  /* the asm loads from src/cm/dst and stores to dst */
+      );
+    }
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+/* Vertical 2-tap averaging convolve.
+ *
+ * Writes 38 to the DSP control "pos" field (the bit position used when
+ * extracting from the accumulators), prefetches dst, and dispatches on the
+ * block width w: 64 uses the fixed-width kernel, 4/8/16/32 share the
+ * 4-pixel-step kernel, and anything else goes to vpx_convolve8_avg_vert_c().
+ * y_step_q4 must be 16 (asserted); filter_x/x_step_q4 are only forwarded. */
+void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h) {
+  uint32_t extract_pos = 38;
+
+  assert(y_step_q4 == 16);
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (extract_pos)
+  );
+
+  prefetch_store(dst);
+
+  if (w == 64) {
+    /* 64-wide rows: prefetch the second half of the row as well. */
+    prefetch_store(dst + 32);
+    convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride,
+                                  filter_y, h);
+  } else if (w == 4 || w == 8 || w == 16 || w == 32) {
+    /* These widths share the kernel that walks the row 4 pixels at a time. */
+    convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride,
+                                 filter_y, w, h);
+  } else {
+    /* Every other width is handled by the C implementation. */
+    vpx_convolve8_avg_vert_c(src, src_stride,
+                             dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4,
+                             w, h);
+  }
+}
+#endif
diff --git a/libs/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
new file mode 100644
index 0000000000..932a73d39b
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
@@ -0,0 +1,825 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+/* Horizontal 2-tap filter (taps filter_x0[3] and filter_x0[4]) averaged into
+ * dst; each of the h rows produces the 4 output bytes dst[0..3]. */
+static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
+                                          int32_t src_stride, uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int16_t *filter_x0, int32_t h) {
+  int32_t y;
+  uint8_t *cm = vpx_ff_cropTbl;
+  int32_t  Temp1, Temp2, Temp3, Temp4;
+  uint32_t vector4a = 64;
+  uint32_t tp1, tp2;
+  uint32_t p1, p2, p3;
+  uint32_t tn1, tn2;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t      filter45;
+  /* Taps filter_x0[3..4] as one word. NOTE(review): verify alignment. */
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],         0(%[src])                      \n\t"
+        "ulw              %[tp2],         4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp1],       $ac3,           31             \n\t"
+
+        /* even 2. pixel */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "balign           %[tp2],         %[tp1],         3              \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        "lbu              %[p2],          3(%[dst])                      \n\t"  /* load odd 2 */
+
+        /* odd 1. pixel */
+        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"  /* even 1 */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "lbu              %[Temp1],       1(%[dst])                      \n\t"  /* load odd 1 */
+        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p3],          %[tp2]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        "lbu              %[tn2],         0(%[dst])                      \n\t"  /* load even 1 */
+
+        /* odd 2. pixel */
+        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"  /* even 2 */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"  /* odd 1 */
+        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t"  /* average even 1 */
+        "dpa.w.ph         $ac2,           %[p3],          %[filter45]    \n\t"
+        "extp             %[Temp4],       $ac2,           31             \n\t"
+
+        "lbu              %[tp1],         2(%[dst])                      \n\t"  /* load even 2 */
+        "sb               %[tn2],         0(%[dst])                      \n\t"  /* store even 1 */
+
+        /* clamp */
+        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t"  /* average odd 1 */
+        "lbux             %[p3],          %[Temp4](%[cm])                \n\t"  /* odd 2 */
+        "sb               %[Temp1],       1(%[dst])                      \n\t"  /* store odd 1 */
+
+        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t"  /* average even 2 */
+        "sb               %[tp1],         2(%[dst])                      \n\t"  /* store even 2 */
+
+        "addqh_r.w        %[p2],          %[p2],          %[p3]          \n\t"  /* average odd 2 */
+        "sb               %[p2],          3(%[dst])                      \n\t"  /* store odd 2 */
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+        : "memory"  /* the asm loads from src/cm/dst and stores to dst */
+    );
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+/* Horizontal 2-tap filter (taps filter_x0[3] and filter_x0[4]) averaged into
+ * dst; each of the h rows produces the 8 output bytes dst[0..7]. */
+static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
+                                          int32_t src_stride, uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int16_t *filter_x0, int32_t h) {
+  int32_t y;
+  uint8_t *cm = vpx_ff_cropTbl;
+  uint32_t vector4a = 64;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t tp1, tp2, tp3, tp4;
+  uint32_t p1, p2, p3, p4;
+  uint32_t st0, st1;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+  /* Taps filter_x0[3..4] as one word. NOTE(review): verify alignment. */
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],         0(%[src])                      \n\t"
+        "ulw              %[tp2],         4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
+        "ulw              %[tp3],         8(%[src])                      \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp1],       $ac3,           31             \n\t"
+        "lbu              %[Temp2],       0(%[dst])                      \n\t"
+        "lbu              %[tp4],         2(%[dst])                      \n\t"
+
+        /* even 2. pixel */
+        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        /* even 3. pixel */
+        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a],    $ac1                           \n\t"
+        "mthi             $zero,          $ac1                           \n\t"
+        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
+        "dpa.w.ph         $ac1,           %[p3],          %[filter45]    \n\t"
+        "extp             %[Temp1],       $ac1,           31             \n\t"
+
+        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
+        "addqh_r.w        %[tp4],         %[tp4],         %[st1]         \n\t"
+        "sb               %[Temp2],       0(%[dst])                      \n\t"
+        "sb               %[tp4],         2(%[dst])                      \n\t"
+
+        /* even 4. pixel */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+
+        "balign           %[tp3],         %[tp2],         3              \n\t"
+        "balign           %[tp2],         %[tp1],         3              \n\t"
+
+        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
+        "lbu              %[Temp2],       4(%[dst])                      \n\t"
+        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
+
+        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        /* odd 1. pixel */
+        "mtlo             %[vector4a],    $ac1                           \n\t"
+        "mthi             $zero,          $ac1                           \n\t"
+        "sb               %[Temp2],       4(%[dst])                      \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp3]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp3]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        "lbu              %[tp1],         6(%[dst])                      \n\t"
+
+        /* odd 2. pixel */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
+        "dpa.w.ph         $ac1,           %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp3],       $ac1,           31             \n\t"
+
+        "lbu              %[tp2],         1(%[dst])                      \n\t"
+        "lbu              %[tp3],         3(%[dst])                      \n\t"
+        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
+
+        /* odd 3. pixel */
+        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[filter45]    \n\t"
+        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        "lbu              %[tp4],         5(%[dst])                      \n\t"
+
+        /* odd 4. pixel */
+        "sb               %[tp2],         1(%[dst])                      \n\t"
+        "sb               %[tp1],         6(%[dst])                      \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
+        "extp             %[Temp1],       $ac2,           31             \n\t"
+
+        "lbu              %[tp1],         7(%[dst])                      \n\t"
+
+        /* clamp */
+        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
+        "addqh_r.w        %[tp3],         %[tp3],         %[p4]          \n\t"
+
+        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
+        "addqh_r.w        %[tp4],         %[tp4],         %[p2]          \n\t"
+
+        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
+        "addqh_r.w        %[tp1],         %[tp1],         %[p1]          \n\t"
+
+        /* store bytes */
+        "sb               %[tp3],         3(%[dst])                      \n\t"
+        "sb               %[tp4],         5(%[dst])                      \n\t"
+        "sb               %[tp1],         7(%[dst])                      \n\t"
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+          [st0] "=&r" (st0), [st1] "=&r" (st1),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+        : "memory"  /* the asm loads from src/cm/dst and stores to dst */
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+/* 2-tap horizontal filter averaged into dst: `count` 16-pixel asm passes per
+ * row, `h` rows, using only taps filter_x0[3] and filter_x0[4]. Each pixel is
+ * 64 + the 2-tap dot product, extracted by extp at the DSPControl pos set by
+ * the caller, mapped through cm[], then rounded-averaged with the dst byte. */
+static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+                                          int32_t src_stride, uint8_t *dst_ptr,
+                                          int32_t dst_stride,
+                                          const int16_t *filter_x0, int32_t h,
+                                          int32_t count) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vpx_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+  uint32_t filter45;
+
+  /* Taps 3 and 4 as one word; memcpy avoids a misaligned, type-punned load. */
+  __builtin_memcpy(&filter45, &filter_x0[3], sizeof(filter45));
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+    /* prefetch data to cache memory */
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
+
+    for (c = 0; c < count; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 2 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 2 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
+          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
+          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
+          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
+          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
+          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
+          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
+          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
+          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                   \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
+          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
+          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
+          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
+          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
+          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
+          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
+          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
+          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
+
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
+
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
+
+          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
+          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
+          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+          : "memory" /* asm loads and stores through src, dst and cm */
+      );
+      src += 16;
+      dst += 16;
+    }
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+/* 2-tap horizontal filter averaged into dst for 64-wide rows: four 16-pixel
+ * asm passes per row, `h` rows, using only taps filter_x0[3] and filter_x0[4].
+ * Each pixel is 64 + the 2-tap dot product, extracted by extp at the caller-set
+ * DSPControl pos, mapped through cm[], then rounded-averaged with dst. */
+static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+                                          int32_t src_stride, uint8_t *dst_ptr,
+                                          int32_t dst_stride,
+                                          const int16_t *filter_x0, int32_t h) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vpx_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+  uint32_t filter45;
+
+  /* Taps 3 and 4 as one word; memcpy avoids a misaligned, type-punned load. */
+  __builtin_memcpy(&filter45, &filter_x0[3], sizeof(filter45));
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+    /* prefetch data to cache memory */
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
+
+    for (c = 0; c < 4; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 2 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 2 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
+          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
+          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
+          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
+          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
+          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
+          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
+          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
+          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                   \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
+          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
+          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
+          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
+          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
+          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
+          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
+          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
+          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
+
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
+
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
+
+          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
+          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
+          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+          : "memory" /* asm loads and stores through src, dst and cm */
+      );
+      src += 16;
+      dst += 16;
+    }
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+/*
+ * DSPr2 entry point for the horizontal 2-tap convolve that averages its result
+ * into dst. Writes the DSPControl extract position used by the kernels' extp
+ * instructions, issues cache prefetches, then dispatches on block width:
+ * 4, 8, 16, 32 and 64 go to the dspr2 kernels (32 runs the 16-wide kernel
+ * with two passes per row); every other width is forwarded unchanged to
+ * vpx_convolve8_avg_horiz_c. filter_y and y_step_q4 are only used by that
+ * fallback. x_step_q4 is asserted to be 16.
+ */
+void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h) {
+  uint32_t extract_pos = 38;
+
+  assert(x_step_q4 == 16);
+
+  /* bit position for extp to extract from the accumulator */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (extract_pos)
+  );
+
+  /* prefetch data to cache memory */
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
+
+  if (w == 4) {
+    convolve_bi_avg_horiz_4_dspr2(src, src_stride,
+                                  dst, dst_stride,
+                                  filter_x, h);
+  } else if (w == 8) {
+    convolve_bi_avg_horiz_8_dspr2(src, src_stride,
+                                  dst, dst_stride,
+                                  filter_x, h);
+  } else if (w == 16 || w == 32) {
+    /* one 16-pixel pass for every 16 columns */
+    convolve_bi_avg_horiz_16_dspr2(src, src_stride,
+                                   dst, dst_stride,
+                                   filter_x, h, w / 16);
+  } else if (w == 64) {
+    /* wide rows: prefetch further ahead before running the kernel */
+    prefetch_load(src + 64);
+    prefetch_store(dst + 32);
+
+    convolve_bi_avg_horiz_64_dspr2(src, src_stride,
+                                   dst, dst_stride,
+                                   filter_x, h);
+  } else {
+    /* no DSPr2 kernel for this width */
+    vpx_convolve8_avg_horiz_c(src, src_stride,
+                              dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+  }
+}
+#endif
diff --git a/libs/libvpx/vpx_dsp/mips/convolve2_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve2_dspr2.c
new file mode 100644
index 0000000000..d111029d42
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/convolve2_dspr2.c
@@ -0,0 +1,782 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+/* 2-tap horizontal filter for a block 4 pixels wide, stored transposed: the
+ * four results of each source row are written dst_stride bytes apart and dst
+ * advances one byte per row.  Reads only filter_x0[3] and filter_x0[4]; h is
+ * the number of source rows.  extp relies on the DSPControl pos field, which
+ * is presumably programmed by the caller (wrdsp) - TODO confirm. */
+static void convolve_bi_horiz_4_transposed_dspr2(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int16_t *filter_x0, int32_t h) {
+  int32_t       y;
+  uint8_t       *cm = vpx_ff_cropTbl;  /* table indexed by lbux to clamp */
+  uint8_t       *dst_ptr;
+  int32_t       Temp1, Temp2;
+  uint32_t      vector4a = 64;  /* accumulator start value; presumably rounding */
+  uint32_t      tp1, tp2, p1, p2;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t      filter45;  /* filter_x0[3] and filter_x0[4] as one word */
+
+  /* NOTE(review): 32-bit load through an int16_t pointer - confirm alignment. */
+  filter45 = ((const int32_t *)filter)[0];
+  for (y = h; y--;) {
+    dst_ptr = dst;
+    /* prefetch data to cache memory */
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],         0(%[src])                      \n\t"
+        "ulw              %[tp2],         4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp1],       $ac3,           31             \n\t"
+
+        /* even 2. pixel */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "balign           %[tp2],         %[tp1],         3              \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp2],       $ac2,           31             \n\t"
+
+        /* odd 1. pixel (tp2 now holds the balign-ed combination of tp2/tp1) */
+        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp1],       $ac3,           31             \n\t"
+
+        /* odd 2. pixel */
+        "lbux             %[tp2],         %[Temp2](%[cm])                \n\t"
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp2],       $ac2,           31             \n\t"
+
+        /* clamp */
+        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
+        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
+
+        /* store bytes in output order: even 1, odd 1, even 2, odd 2 */
+        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
+
+        "sb               %[p1],          0(%[dst_ptr])                  \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
+
+        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
+
+        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
+        /* outputs, inputs, clobbers (asm stores to dst and uses $ac2/$ac3) */
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [p1] "=&r" (p1), [p2] "=&r" (p2),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [dst_ptr] "+r" (dst_ptr)
+        : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
+          [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
+        : "memory", "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += 1;
+  }
+}
+
+/* 2-tap horizontal filter for a block 8 pixels wide, stored transposed: even
+ * results go through dst_ptr and odd results through odd_dst (one dst_stride
+ * lower), both stepping two rows at a time; dst advances one byte per source
+ * row.  Reads only filter_x0[3] and filter_x0[4]; h is the source row count. */
+static void convolve_bi_horiz_8_transposed_dspr2(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int16_t *filter_x0, int32_t h) {
+  int32_t y;
+  uint8_t *cm = vpx_ff_cropTbl;  /* table indexed by lbux to clamp */
+  uint8_t *dst_ptr, *odd_dst;    /* store cursors for even / odd results */
+  uint32_t vector4a = 64;  /* accumulator start value; presumably rounding */
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t tp1, tp2, tp3, p1, p2, p3, p4;
+  uint32_t dst_pitch_2 = (dst_stride << 1);  /* step of two output rows */
+  const int16_t *filter = &filter_x0[3];
+  uint32_t      filter45;  /* filter_x0[3] and filter_x0[4] as one word */
+
+  /* NOTE(review): 32-bit load through an int16_t pointer - confirm alignment. */
+  filter45 = ((const int32_t *)filter)[0];
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+
+    dst_ptr = dst;
+    odd_dst = (dst_ptr + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],         0(%[src])                       \n\t"
+        "ulw              %[tp2],         4(%[src])                       \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a],    $ac3                            \n\t"
+        "mthi             $zero,          $ac3                            \n\t"
+        "mtlo             %[vector4a],    $ac2                            \n\t"
+        "mthi             $zero,          $ac2                            \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp2]                          \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp2]                          \n\t"
+        "ulw              %[tp3],         8(%[src])                       \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
+        "extp             %[Temp1],       $ac3,           31              \n\t"
+
+        /* even 2. pixel */
+        "dpa.w.ph         $ac2,           %[p2],          %[filter45]     \n\t"
+        "extp             %[Temp3],       $ac2,           31              \n\t"
+
+        /* even 3. pixel (tp2/tp3 are re-aligned here for the odd pixels) */
+        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
+        "mtlo             %[vector4a],    $ac1                            \n\t"
+        "mthi             $zero,          $ac1                            \n\t"
+        "balign           %[tp3],         %[tp2],         3              \n\t"
+        "balign           %[tp2],         %[tp1],         3              \n\t"
+        "dpa.w.ph         $ac1,           %[p3],          %[filter45]     \n\t"
+        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
+        "extp             %[p3],          $ac1,           31              \n\t"
+
+        /* even 4. pixel (even 1 and even 2 are stored meanwhile) */
+        "mtlo             %[vector4a],    $ac2                            \n\t"
+        "mthi             $zero,          $ac2                            \n\t"
+        "mtlo             %[vector4a],    $ac3                            \n\t"
+        "mthi             $zero,          $ac3                            \n\t"
+        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
+        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
+
+        "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
+        "extp             %[Temp3],       $ac2,           31              \n\t"
+
+        "lbux             %[Temp1],         %[p3](%[cm])                    \n\t"
+
+        /* odd 1. pixel (even 3 is stored meanwhile) */
+        "mtlo             %[vector4a],    $ac1                            \n\t"
+        "mthi             $zero,          $ac1                            \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
+        "sb               %[Temp1],       0(%[dst_ptr])                   \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
+
+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
+        "extp             %[Temp2],       $ac3,           31              \n\t"
+
+        /* odd 2. pixel (even 4 is clamped and stored meanwhile) */
+        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
+        "mtlo             %[vector4a],    $ac3                            \n\t"
+        "mthi             $zero,          $ac3                            \n\t"
+        "mtlo             %[vector4a],    $ac2                            \n\t"
+        "mthi             $zero,          $ac2                            \n\t"
+        "dpa.w.ph         $ac1,           %[p2],          %[filter45]     \n\t"
+        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
+        "extp             %[Temp3],       $ac1,           31              \n\t"
+
+        /* odd 3. pixel */
+        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[filter45]     \n\t"
+        "extp             %[Temp2],       $ac3,           31              \n\t"
+
+        /* odd 4. pixel (odd 1 is stored meanwhile) */
+        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
+        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
+        "extp             %[Temp1],       $ac2,           31              \n\t"
+
+        /* clamp */
+        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
+        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
+        "lbux             %[p1],          %[Temp1](%[cm])                 \n\t"
+
+        /* store the remaining odd results: odd 2, odd 3, odd 4 */
+        "sb               %[p4],          0(%[odd_dst])                   \n\t"
+        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+        "sb               %[p2],          0(%[odd_dst])                   \n\t"
+        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+        "sb               %[p1],          0(%[odd_dst])                   \n\t"
+        /* outputs, inputs, clobbers (asm stores to dst and uses $ac1-$ac3) */
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+          [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
+        : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm),
+          [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+        : "memory", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += 1;
+  }
+}
+
+/* 2-tap horizontal filter, stored transposed, for rows made of `count` groups
+ * of 16 pixels.  Per group the 8 even results are stored through dst and the
+ * 8 odd results through odd_dst, both stepping two output rows at a time;
+ * dst_ptr advances one byte per source row.  Uses filter_x0[3] and [4] only. */
+static void convolve_bi_horiz_16_transposed_dspr2(
+    const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+    int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
+  int32_t       c, y;
+  const uint8_t *src;
+  uint8_t       *dst, *odd_dst;  /* store cursors for even / odd results */
+  uint8_t       *cm = vpx_ff_cropTbl;  /* table indexed by lbux to clamp */
+  uint32_t      vector_64 = 64;  /* accumulator start value; presumably rounding */
+  int32_t       Temp1, Temp2, Temp3;
+  uint32_t      qload1, qload2;
+  uint32_t      p1, p2, p3, p4, p5;
+  uint32_t      st1, st2, st3;
+  uint32_t      dst_pitch_2 = (dst_stride << 1);  /* step of two output rows */
+  const int16_t *filter = &filter_x0[3];
+  uint32_t      filter45;  /* filter_x0[3] and filter_x0[4] as one word */
+
+  /* NOTE(review): 32-bit load through an int16_t pointer - confirm alignment. */
+  filter45 = ((const int32_t *)filter)[0];
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+
+    src = src_ptr;
+    dst = dst_ptr;
+
+    odd_dst = (dst + dst_stride);
+
+    for (c = 0; c < count; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],        0(%[src])                       \n\t"
+          "ulw              %[qload2],        4(%[src])                       \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "ulw              %[qload1],        8(%[src])                       \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
+          "ulw              %[qload2],        12(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 2 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 2 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
+          "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 2 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        20(%[src])                      \n\t" /* NOTE(review): only feeds the p5 below, which is never consumed */
+          "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
+
+          /* ODD pixels: same scheme on the source shifted by one byte */
+          "ulw              %[qload1],        1(%[src])                       \n\t"
+          "ulw              %[qload2],        5(%[src])                       \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        9(%[src])                       \n\t"
+          "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
+          "ulw              %[qload1],        13(%[src])                      \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        21(%[src])                      \n\t" /* NOTE(review): only feeds the p5 below, which is never consumed */
+          "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
+
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
+
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
+          /* outputs, inputs, clobbers (asm stores to dst and uses $ac1-$ac3) */
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+            [cm] "r" (cm),
+            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+          : "memory", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+      );
+
+      src += 16;
+      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+      odd_dst = (dst + dst_stride);
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += 1;
+  }
+}
+
+static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst_ptr,
+                                                  int32_t dst_stride,
+                                                  const int16_t *filter_x0,
+                                                  int32_t h) {
+  int32_t       c, y;
+  const uint8_t *src;
+  uint8_t       *dst;
+  uint8_t       *cm = vpx_ff_cropTbl;
+  uint32_t      vector_64 = 64;
+  int32_t       Temp1, Temp2, Temp3;
+  uint32_t      qload1, qload2;
+  uint32_t      p1, p2, p3, p4, p5;
+  uint32_t      st1, st2, st3;
+  uint32_t      dst_pitch_2 = (dst_stride << 1);
+  uint8_t       *odd_dst;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t      filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+
+    src = src_ptr;
+    dst = dst_ptr;
+
+    odd_dst = (dst + dst_stride);
+
+    for (c = 0; c < 4; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],        0(%[src])                       \n\t"
+          "ulw              %[qload2],        4(%[src])                       \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "ulw              %[qload1],        8(%[src])                       \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
+          "ulw              %[qload2],        12(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 1 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
+          "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        20(%[src])                      \n\t"
+          "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],        1(%[src])                       \n\t"
+          "ulw              %[qload2],        5(%[src])                       \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        9(%[src])                       \n\t"
+          "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
+          "ulw              %[qload1],        13(%[src])                      \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        21(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
+
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
+
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+            [cm] "r" (cm),
+            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+      );
+
+      src += 16;
+      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+      odd_dst = (dst + dst_stride);
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += 1;
+  }
+}
+
+void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter, int w, int h) {
+  /*
+   * Plain-C 2-tap horizontal filter using taps filter[3] and filter[4].
+   * Output is transposed: source row r, column c lands at
+   * dst[c * dst_stride + r].
+   */
+  int row, col;
+
+  for (row = 0; row < h; ++row, src += src_stride, ++dst) {
+    for (col = 0; col < w; ++col) {
+      /* Reads src[col] and src[col + 1], so w + 1 source bytes per row. */
+      const int acc = src[col] * filter[3] + src[col + 1] * filter[4];
+
+      dst[col * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
+    }
+  }
+}
+
+void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter,
+                         int w, int h) {
+  /*
+   * Entry point for the 2-tap horizontal convolution with transposed
+   * output.  Dispatches on the block width w to a width-specific DSPr2
+   * kernel and falls back to the plain-C routine for any other width.
+   * All pointer, stride, filter and height arguments are forwarded to the
+   * selected kernel unchanged.
+   */
+  uint32_t extract_pos = 38;
+
+  /*
+   * Write 38 to DSPControl through wrdsp with mask 1 (the pos field), which
+   * the extp instructions in the kernels use as their extract position.
+   */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (extract_pos)
+  );
+
+  /* Request cache prefetches for src and src + 32. */
+  prefetch_load(src);
+  prefetch_load(src + 32);
+
+  if (w == 4) {
+    convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
+                                         filter, h);
+  } else if (w == 8) {
+    convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
+                                         filter, h);
+  } else if (w == 16 || w == 32) {
+    /* The last argument is w / 16, i.e. 1 for w == 16 and 2 for w == 32. */
+    convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
+                                          filter, h, w / 16);
+  } else if (w == 64) {
+    /* src + 32 is prefetched a second time ahead of the 64-wide kernel. */
+    prefetch_load(src + 32);
+    convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
+                                          filter, h);
+  } else {
+    /* Width without a dedicated kernel: generic C implementation. */
+    convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride,
+                                 filter, w, h);
+  }
+}
+#endif
diff --git a/libs/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c
new file mode 100644
index 0000000000..9fe1a3454b
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c
@@ -0,0 +1,705 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int16_t *filter_x0,
+                                      int32_t h) {  /* 2-tap filter, 4 px/row */
+  int32_t y;
+  uint8_t *cm = vpx_ff_cropTbl;  /* table the raw sums are looked up in */
+  int32_t Temp1, Temp2, Temp3, Temp4;
+  uint32_t vector4a = 64;  /* value each accumulator is preset to */
+  uint32_t tp1, tp2;
+  uint32_t p1, p2;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+  /* Pack taps [3],[4] bytewise: &filter_x0[3] need not be word-aligned. */
+  __builtin_memcpy(&filter45, filter, sizeof(filter45));
+  /* One iteration per row: two words loaded from src, 4 bytes stored to dst. */
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],      0(%[src])                      \n\t"
+        "ulw              %[tp2],      4(%[src])                      \n\t"
+
+        /* even 1. pixel -> dst[0] */
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp1],    $ac3,           31             \n\t"
+
+        /* even 2. pixel -> dst[2] */
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "balign           %[tp2],      %[tp1],         3              \n\t" /* odd-pixel source */
+        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp3],    $ac2,           31             \n\t"
+
+        /* odd 1. pixel -> dst[1] */
+        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp2],    $ac3,           31             \n\t"
+
+        /* odd 2. pixel -> dst[3] */
+        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp4],    $ac2,           31             \n\t"
+
+        /* clamp the two odd results through the cm table */
+        "lbux             %[p1],       %[Temp2](%[cm])                \n\t"
+        "lbux             %[p2],       %[Temp4](%[cm])                \n\t"
+
+        /* store bytes */
+        "sb               %[tp1],      0(%[dst])                      \n\t" /* even 1 */
+        "sb               %[p1],       1(%[dst])                      \n\t" /* odd 1 */
+        "sb               %[tp2],      2(%[dst])                      \n\t" /* even 2 */
+        "sb               %[p2],       3(%[dst])                      \n\t" /* odd 2 */
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [p1] "=&r" (p1), [p2] "=&r" (p2),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int16_t *filter_x0,
+                                      int32_t h) {  /* 2-tap filter, 8 px/row */
+  int32_t y;
+  uint8_t *cm = vpx_ff_cropTbl;  /* table the raw sums are looked up in */
+  uint32_t vector4a = 64;  /* value each accumulator is preset to */
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t tp1, tp2, tp3;
+  uint32_t p1, p2, p3, p4;
+  uint32_t st0, st1;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+  /* Pack taps [3],[4] bytewise: &filter_x0[3] need not be word-aligned. */
+  __builtin_memcpy(&filter45, filter, sizeof(filter45));
+  /* One iteration per row: three words loaded from src, 8 bytes stored to dst. */
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],      0(%[src])                      \n\t"
+        "ulw              %[tp2],      4(%[src])                      \n\t"
+
+        /* even 1. pixel -> dst[0] */
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
+        "ulw              %[tp3],      8(%[src])                      \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp1],    $ac3,           31             \n\t"
+
+        /* even 2. pixel -> dst[2] */
+        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp3],    $ac2,           31             \n\t"
+
+        /* even 3. pixel -> dst[4] */
+        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a], $ac1                           \n\t"
+        "mthi             $zero,       $ac1                           \n\t"
+        "dpa.w.ph         $ac1,        %[p3],          %[filter45]    \n\t"
+        "extp             %[Temp1],    $ac1,           31             \n\t"
+
+        /* even 4. pixel -> dst[6] */
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "sb               %[st0],      0(%[dst])                      \n\t" /* even 1 */
+        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
+
+        "balign           %[tp3],      %[tp2],         3              \n\t" /* odd-pixel source */
+        "balign           %[tp2],      %[tp1],         3              \n\t" /* odd-pixel source */
+
+        "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
+        "extp             %[Temp3],    $ac2,           31             \n\t"
+
+        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
+
+        /* odd 1. pixel -> dst[1] */
+        "mtlo             %[vector4a], $ac1                           \n\t"
+        "mthi             $zero,       $ac1                           \n\t"
+        "sb               %[st1],      2(%[dst])                      \n\t" /* even 2 */
+        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[p3],       %[tp3]                         \n\t"
+        "preceu.ph.qbl    %[p4],       %[tp3]                         \n\t"
+        "sb               %[st0],      4(%[dst])                      \n\t" /* even 3 */
+        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp2],    $ac3,           31             \n\t"
+
+        /* odd 2. pixel -> dst[3] */
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
+        "dpa.w.ph         $ac1,        %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp3],    $ac1,           31             \n\t"
+
+        /* odd 3. pixel -> dst[5] */
+        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
+        "dpa.w.ph         $ac3,        %[p3],          %[filter45]    \n\t"
+        "extp             %[Temp2],    $ac3,           31             \n\t"
+
+        /* odd 4. pixel -> dst[7] */
+        "sb               %[st1],      1(%[dst])                      \n\t" /* odd 1 */
+        "sb               %[st0],      6(%[dst])                      \n\t" /* even 4 */
+        "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
+        "extp             %[Temp1],    $ac2,           31             \n\t"
+
+        /* clamp the last three odd results through the cm table */
+        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
+        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
+        "lbux             %[p1],       %[Temp1](%[cm])                \n\t"
+
+        /* store bytes */
+        "sb               %[p4],       3(%[dst])                      \n\t" /* odd 2 */
+        "sb               %[p2],       5(%[dst])                      \n\t" /* odd 3 */
+        "sb               %[p1],       7(%[dst])                      \n\t" /* odd 4 */
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+          [st0] "=&r" (st0), [st1] "=&r" (st1),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
+                                       int32_t src_stride,
+                                       uint8_t *dst_ptr,
+                                       int32_t dst_stride,
+                                       const int16_t *filter_x0,
+                                       int32_t h,
+                                       int32_t count) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vpx_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch data to cache memory */
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
+
+    for (c = 0; c < count; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
+          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                    \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+
+          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
+          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
+          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+/* 2-tap horizontal filter for 64-pixel-wide rows (MIPS DSPr2): each output
+ * byte is built from two adjacent source bytes and the taps filter_x0[3] and
+ * filter_x0[4], on top of an accumulator seeded with 64. The extp bit
+ * position (DSPControl pos) must already have been set by the caller. */
+static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
+                                       int32_t src_stride, uint8_t *dst_ptr,
+                                       int32_t dst_stride,
+                                       const int16_t *filter_x0, int32_t h) {
+  int32_t y, c;  /* y: rows left to filter; c: 16-byte group within the row */
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vpx_ff_cropTbl;  /* byte table indexed by each extracted sum */
+  uint32_t vector_64 = 64;  /* value every accumulator is reset to */
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t filter45 = ((const int32_t *)filter)[0];  /* both taps, one word */
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch the next source and destination rows into the cache */
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
+
+    for (c = 0; c < 4; c++) {  /* 4 passes x 16 output bytes = one 64-wide row */
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 2 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 2 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st2],       2(%[dst])                    \n\t" /* even 2 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
+          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                    \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+
+          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
+          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
+          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+/* 2-tap horizontal convolution entry point for MIPS DSPr2.
+ * Programs the extp bit position, warms the cache, then picks a kernel by
+ * block width w. Widths other than 4, 8, 16, 32 and 64 are handed to
+ * vpx_convolve8_horiz_c() together with filter_y and y_step_q4.
+ * x_step_q4 must be 16 (asserted). */
+void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h) {
+  /* Written to DSPControl below; the kernels' extp instructions rely on it. */
+  uint32_t extract_pos = 38;
+
+  assert(x_step_q4 == 16);
+
+  /* start fetching the filter taps early */
+  prefetch_load((const uint8_t *)filter_x);
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (extract_pos)
+  );
+
+  /* prefetch the start of the source and destination into the cache */
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
+
+  /* width dispatch */
+  if (w == 4) {
+    convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
+                              dst, (int32_t)dst_stride,
+                              filter_x, (int32_t)h);
+  } else if (w == 8) {
+    convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
+                              dst, (int32_t)dst_stride,
+                              filter_x, (int32_t)h);
+  } else if (w == 16 || w == 32) {
+    /* last argument is 1 for w == 16 and 2 for w == 32 */
+    convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
+                               dst, (int32_t)dst_stride,
+                               filter_x, (int32_t)h, w / 16);
+  } else if (w == 64) {
+    /* the 64-wide kernel gets two extra prefetches first */
+    prefetch_load(src + 64);
+    prefetch_store(dst + 32);
+
+    convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
+                               dst, (int32_t)dst_stride,
+                               filter_x, (int32_t)h);
+  } else {
+    /* no DSPr2 kernel for this width */
+    vpx_convolve8_horiz_c(src, src_stride,
+                          dst, dst_stride,
+                          filter_x, x_step_q4,
+                          filter_y, y_step_q4,
+                          w, h);
+  }
+}
+#endif
diff --git a/libs/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c
new file mode 100644
index 0000000000..dde6ffd54f
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c
@@ -0,0 +1,258 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_vert_4_dspr2(const uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int16_t *filter_y,
+                                     int32_t w,
+                                     int32_t h) {  /* 2-tap vertical filter over w x h bytes, 4 columns per asm pass */
+  int32_t       x, y;
+  const uint8_t *src_ptr;
+  uint8_t       *dst_ptr;
+  uint8_t       *cm = vpx_ff_cropTbl;  /* byte table indexed by each extracted sum */
+  uint32_t      vector4a = 64;  /* value every accumulator is reset to */
+  uint32_t      load1, load2;  /* 4 source bytes from this row and from the next row */
+  uint32_t      p1, p2;
+  uint32_t      scratch1;
+  uint32_t      store1, store2;
+  int32_t       Temp1, Temp2;
+  const int16_t *filter = &filter_y[3];  /* only filter_y[3] and filter_y[4] are read */
+  uint32_t      filter45;
+
+  filter45 = ((const int32_t *)filter)[0];  /* both taps packed into one 32-bit word */
+
+  for (y = h; y--;) {
+    /* prefetch the next destination row into the cache */
+    prefetch_store(dst + dst_stride);
+
+    for (x = 0; x < w; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t" /* 4 bytes, current row */
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t" /* step one row down */
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t" /* 4 bytes, next row */
+
+          "mtlo             %[vector4a],  $ac0                            \n\t" /* lo halves = 64 */
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t" /* hi halves = 0 */
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t" /* right 2 bytes -> halfwords */
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t" /* pixel 1 sum */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t" /* pixel 2 sum */
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t" /* left 2 bytes -> halfwords */
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 4 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 3 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t" /* pixel 3 sum */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t" /* pixel 4 sum */
+
+          "extp             %[Temp1],     $ac0,           31              \n\t" /* uses the pos set by the caller's wrdsp */
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t" /* store1 = cm[Temp1] */
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t" /* store2 = cm[Temp2] */
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t" /* pixel 1 */
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t" /* pixel 2 */
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t" /* pixel 3 */
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t" /* pixel 4 */
+
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [p1] "=&r" (p1), [p2] "=&r" (p2),
+            [scratch1] "=&r" (scratch1),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
+            [src_stride] "r" (src_stride),
+            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_bi_vert_64_dspr2(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int16_t *filter_y,
+                                      int32_t h) {  /* 2-tap vertical filter over 64 x h bytes, 4 columns per asm pass */
+  int32_t       x, y;
+  const uint8_t *src_ptr;
+  uint8_t       *dst_ptr;
+  uint8_t       *cm = vpx_ff_cropTbl;  /* byte table indexed by each extracted sum */
+  uint32_t      vector4a = 64;  /* value every accumulator is reset to */
+  uint32_t      load1, load2;  /* 4 source bytes from this row and from the next row */
+  uint32_t      p1, p2;
+  uint32_t      scratch1;
+  uint32_t      store1, store2;
+  int32_t       Temp1, Temp2;
+  const int16_t *filter = &filter_y[3];  /* only filter_y[3] and filter_y[4] are read */
+  uint32_t      filter45;
+
+  filter45 = ((const int32_t *)filter)[0];  /* both taps packed into one 32-bit word */
+
+  for (y = h; y--;) {
+    /* prefetch the next destination row into the cache */
+    prefetch_store(dst + dst_stride);
+
+    for (x = 0; x < 64; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t" /* 4 bytes, current row */
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t" /* step one row down */
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t" /* 4 bytes, next row */
+
+          "mtlo             %[vector4a],  $ac0                            \n\t" /* lo halves = 64 */
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t" /* hi halves = 0 */
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t" /* right 2 bytes -> halfwords */
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t" /* pixel 1 sum */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t" /* pixel 2 sum */
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t" /* left 2 bytes -> halfwords */
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 4 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 3 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t" /* pixel 3 sum */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t" /* pixel 4 sum */
+
+          "extp             %[Temp1],     $ac0,           31              \n\t" /* uses the pos set by the caller's wrdsp */
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t" /* store1 = cm[Temp1] */
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t" /* store2 = cm[Temp2] */
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t" /* pixel 1 */
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t" /* pixel 2 */
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t" /* pixel 3 */
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t" /* pixel 4 */
+
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [p1] "=&r" (p1), [p2] "=&r" (p2),
+            [scratch1] "=&r" (scratch1),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
+            [src_stride] "r" (src_stride),
+            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+/* 2-tap vertical convolution entry point (DSPr2): programs the extp bit
+ * position, then dispatches on width w; widths other than 4, 8, 16, 32 and
+ * 64 fall back to vpx_convolve8_vert_c(). Requires y_step_q4 == 16. */
+void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+  uint32_t pos = 38;
+
+  assert(y_step_q4 == 16);
+
+  /* bit position for extract from acc (wrdsp mask 1 writes the pos field) */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  prefetch_store(dst);
+
+  switch (w) {
+    case 4 :
+    case 8 :
+    case 16 :
+    case 32 :
+      /* explicit stride narrowing, as in vpx_convolve2_horiz_dspr2() */
+      convolve_bi_vert_4_dspr2(src, (int32_t)src_stride, dst,
+                               (int32_t)dst_stride, filter_y, w, h);
+      break;
+    case 64 :
+      prefetch_store(dst + 32);
+      convolve_bi_vert_64_dspr2(src, (int32_t)src_stride, dst,
+                                (int32_t)dst_stride, filter_y, h);
+      break;
+    default:
+      vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                           filter_x, x_step_q4, filter_y, y_step_q4,
+                           w, h);
+      break;
+  }
+}
+#endif
diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
new file mode 100644
index 0000000000..43da9e54fb
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
@@ -0,0 +1,677 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_avg_vert_4_dspr2(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int16_t *filter_y,  /* 8 int16 entries are read */
+                                      int32_t w,  /* columns; x steps by 4 */
+                                      int32_t h) {  /* rows written to dst */
+  int32_t       x, y;
+  const uint8_t *src_ptr;
+  uint8_t       *dst_ptr;
+  uint8_t       *cm = vpx_ff_cropTbl;  /* table indexed via lbux */
+  uint32_t      vector4a = 64;  /* initial LO of each accumulator */
+  uint32_t      load1, load2, load3, load4;
+  uint32_t      p1, p2;
+  uint32_t      n1, n2;
+  uint32_t      scratch1, scratch2;
+  uint32_t      store1, store2;
+  int32_t       vector1b, vector2b, vector3b, vector4b;
+  int32_t       Temp1, Temp2;
+  /* Per 4 columns: accumulate 8 src rows with filter_y, average into dst. */
+  vector1b = ((const int32_t *)filter_y)[0];  /* two int16 entries per word */
+  vector2b = ((const int32_t *)filter_y)[1];
+  vector3b = ((const int32_t *)filter_y)[2];
+  vector4b = ((const int32_t *)filter_y)[3];
+  /* Start three rows above src; the asm walks eight rows downward. */
+  src -= 3 * src_stride;
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory (the next dst row) */
+    prefetch_store(dst + dst_stride);
+
+    for (x = 0; x < w; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+      /* asm: rows 0-3 of the window are loaded first (one ulw word each). */
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+          /* each accumulator starts as HI = 0, LO = vector4a */
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+          /* rows 0-3, qbr half of each word -> $ac0 / $ac1 */
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
+          /* rows 0-3, qbl half of each word -> $ac2 / $ac3 */
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
+          /* load rows 4-7 of the window into load1..load4 */
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+          /* rows 4-7, qbr half -> $ac0 / $ac1, then extp both sums */
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+          /* rows 4-7, qbl half; dst bytes 0 and 1 are read into the freed scratch regs */
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
+          /* cm[Temp] lookup, then addqh_r.w with the old dst byte */
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
+          /* write dst bytes 0 and 1; old dst bytes 2 and 3 are read for the next pair */
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+          /* NOTE(review): no clobbers listed though $ac0-$ac3 and dst bytes are written - confirm */
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [load3] "=&r" (load3), [load4] "=&r" (load4),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
+            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+            [vector4a] "r" (vector4a),
+            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_avg_vert_64_dspr2(const uint8_t *src,
+                                       int32_t src_stride,
+                                       uint8_t *dst,
+                                       int32_t dst_stride,
+                                       const int16_t *filter_y,  /* 8 int16 entries are read */
+                                       int32_t h) {  /* rows written to dst */
+  int32_t       x, y;
+  const uint8_t *src_ptr;
+  uint8_t       *dst_ptr;
+  uint8_t       *cm = vpx_ff_cropTbl;  /* table indexed via lbux */
+  uint32_t      vector4a = 64;  /* initial LO of each accumulator */
+  uint32_t      load1, load2, load3, load4;
+  uint32_t      p1, p2;
+  uint32_t      n1, n2;
+  uint32_t      scratch1, scratch2;
+  uint32_t      store1, store2;
+  int32_t       vector1b, vector2b, vector3b, vector4b;
+  int32_t       Temp1, Temp2;
+  /* Fixed 64 columns, 4 at a time: accumulate 8 src rows, average into dst. */
+  vector1b = ((const int32_t *)filter_y)[0];  /* two int16 entries per word */
+  vector2b = ((const int32_t *)filter_y)[1];
+  vector3b = ((const int32_t *)filter_y)[2];
+  vector4b = ((const int32_t *)filter_y)[3];
+  /* Start three rows above src; the asm walks eight rows downward. */
+  src -= 3 * src_stride;
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory (next dst row, at offsets 0 and 32) */
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
+
+    for (x = 0; x < 64; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+      /* asm: rows 0-3 of the window are loaded first (one ulw word each). */
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+          /* each accumulator starts as HI = 0, LO = vector4a */
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+          /* rows 0-3, qbr half of each word -> $ac0 / $ac1 */
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
+          /* rows 0-3, qbl half of each word -> $ac2 / $ac3 */
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
+          /* load rows 4-7 of the window into load1..load4 */
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+          /* rows 4-7, qbr half -> $ac0 / $ac1, then extp both sums */
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+          /* rows 4-7, qbl half; dst bytes 0 and 1 are read into the freed scratch regs */
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
+          /* cm[Temp] lookup, then addqh_r.w with the old dst byte */
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
+          /* write dst bytes 0 and 1; old dst bytes 2 and 3 are read for the next pair */
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+          /* NOTE(review): no clobbers listed though $ac0-$ac3 and dst bytes are written - confirm */
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [load3] "=&r" (load3), [load4] "=&r" (load4),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
+            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+            [vector4a] "r" (vector4a),
+            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+/*
+ * Vertical convolve8-with-averaging entry point for DSPr2.  filter_y is
+ * inspected as int32 words: if word 0 is zero the call is handed to
+ * vpx_convolve2_avg_vert_dspr2, otherwise a kernel is picked by width w.
+ */
+void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h) {
+  const int32_t *tap_words = (const int32_t *)filter_y;
+  uint32_t extract_pos = 38;
+
+  assert(y_step_q4 == 16);
+  assert(tap_words[1] != 0x800000);
+
+  if (tap_words[0] == 0) {
+    vpx_convolve2_avg_vert_dspr2(src, src_stride,
+                                 dst, dst_stride,
+                                 filter_x, x_step_q4,
+                                 filter_y, y_step_q4,
+                                 w, h);
+    return;
+  }
+
+  /* Bit position for extracts from the accumulator. */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (extract_pos)
+  );
+
+  prefetch_store(dst);
+
+  if (w == 64) {
+    /* w == 64: issue a second prefetch_store 32 bytes into dst. */
+    prefetch_store(dst + 32);
+    convolve_avg_vert_64_dspr2(src, src_stride,
+                               dst, dst_stride,
+                               filter_y, h);
+  } else if (w == 4 || w == 8 || w == 16 || w == 32) {
+    /* These widths share the kernel that steps across w in groups of 4. */
+    convolve_avg_vert_4_dspr2(src, src_stride,
+                              dst, dst_stride,
+                              filter_y, w, h);
+  } else {
+    /* No dedicated kernel for this width: defer to the _c routine. */
+    vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+  }
+}
+
+/*
+ * Calls vpx_convolve8_horiz into a 64-byte-stride scratch buffer, starting
+ * three rows above src, then vpx_convolve8_avg_vert from scratch row 3
+ * into dst.
+ */
+void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int x_step_q4,
+                             const int16_t *filter_y, int y_step_q4,
+                             int w, int h) {
+  /* The scratch buffer has a fixed size, hence the limits asserted below. */
+  DECLARE_ALIGNED(32, uint8_t, scratch[64 * 135]);
+  const int32_t scaled_rows = ((h * y_step_q4) >> 4) + 7;
+  /* Never run the first pass over fewer than h rows. */
+  const int32_t rows = (scaled_rows < h) ? h : scaled_rows;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+
+  /* Pass 1: fill `rows` scratch rows, beginning three source rows early. */
+  vpx_convolve8_horiz(src - (src_stride * 3), src_stride, scratch, 64,
+                      filter_x, x_step_q4, filter_y, y_step_q4, w, rows);
+
+  /* Pass 2: read from scratch row 3 and average into dst. */
+  vpx_convolve8_avg_vert(&scratch[64 * 3], 64, dst, dst_stride,
+                         filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int filter_x_stride,
+                            const int16_t *filter_y, int filter_y_stride,
+                            int w, int h) {
+  int x, y;
+  uint32_t tp1, tp2, tn1;
+  uint32_t tp3, tp4, tn2;
+
+  /* prefetch data to cache memory */
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
+
+  switch (w) {
+    case 4:
+      /* 1 word storage */
+      for (y = h; y--; ) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         0(%[dst])      \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
+
+            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
+              [tp2] "=&r" (tp2)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    case 8:
+      /* 2 word storage */
+      for (y = h; y--; ) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         0(%[dst])      \n\t"
+            "ulw              %[tp3],         4(%[src])      \n\t"
+            "ulw              %[tp4],         4(%[dst])      \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    case 16:
+      /* 4 word storage */
+      for (y = h; y--; ) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         0(%[dst])      \n\t"
+            "ulw              %[tp3],         4(%[src])      \n\t"
+            "ulw              %[tp4],         4(%[dst])      \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         8(%[src])      \n\t"
+            "ulw              %[tp2],         8(%[dst])      \n\t"
+            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
+            "ulw              %[tp3],         12(%[src])     \n\t"
+            "ulw              %[tp4],         12(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    case 32:
+      /* 8 word storage */
+      for (y = h; y--; ) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         0(%[dst])      \n\t"
+            "ulw              %[tp3],         4(%[src])      \n\t"
+            "ulw              %[tp4],         4(%[dst])      \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         8(%[src])      \n\t"
+            "ulw              %[tp2],         8(%[dst])      \n\t"
+            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
+            "ulw              %[tp3],         12(%[src])     \n\t"
+            "ulw              %[tp4],         12(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         16(%[src])     \n\t"
+            "ulw              %[tp2],         16(%[dst])     \n\t"
+            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         20(%[src])     \n\t"
+            "ulw              %[tp4],         20(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         24(%[src])     \n\t"
+            "ulw              %[tp2],         24(%[dst])     \n\t"
+            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         28(%[src])     \n\t"
+            "ulw              %[tp4],         28(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    case 64:
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
+
+      /* 16 word storage */
+      for (y = h; y--; ) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_load(src + src_stride + 64);
+        prefetch_store(dst + dst_stride);
+        prefetch_store(dst + dst_stride + 32);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         0(%[dst])      \n\t"
+            "ulw              %[tp3],         4(%[src])      \n\t"
+            "ulw              %[tp4],         4(%[dst])      \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         8(%[src])      \n\t"
+            "ulw              %[tp2],         8(%[dst])      \n\t"
+            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
+            "ulw              %[tp3],         12(%[src])     \n\t"
+            "ulw              %[tp4],         12(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         16(%[src])     \n\t"
+            "ulw              %[tp2],         16(%[dst])     \n\t"
+            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         20(%[src])     \n\t"
+            "ulw              %[tp4],         20(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         24(%[src])     \n\t"
+            "ulw              %[tp2],         24(%[dst])     \n\t"
+            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         28(%[src])     \n\t"
+            "ulw              %[tp4],         28(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         32(%[src])     \n\t"
+            "ulw              %[tp2],         32(%[dst])     \n\t"
+            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         36(%[src])     \n\t"
+            "ulw              %[tp4],         36(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         40(%[src])     \n\t"
+            "ulw              %[tp2],         40(%[dst])     \n\t"
+            "sw               %[tn1],         32(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         36(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         44(%[src])     \n\t"
+            "ulw              %[tp4],         44(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         48(%[src])     \n\t"
+            "ulw              %[tp2],         48(%[dst])     \n\t"
+            "sw               %[tn1],         40(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         44(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         52(%[src])     \n\t"
+            "ulw              %[tp4],         52(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "ulw              %[tp1],         56(%[src])     \n\t"
+            "ulw              %[tp2],         56(%[dst])     \n\t"
+            "sw               %[tn1],         48(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         52(%[dst])     \n\t"  /* store */
+            "ulw              %[tp3],         60(%[src])     \n\t"
+            "ulw              %[tp4],         60(%[dst])     \n\t"
+            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
+            "sw               %[tn1],         56(%[dst])     \n\t"  /* store */
+            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
+            "sw               %[tn2],         60(%[dst])     \n\t"  /* store */
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    default:
+      for (y = h; y > 0; --y) {
+        for (x = 0; x < w; ++x) {
+          dst[x] = (dst[x] + src[x] + 1) >> 1;
+        }
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+  }
+}
+#endif
diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
new file mode 100644
index 0000000000..db0c2a4da5
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
@@ -0,0 +1,1025 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_avg_horiz_4_dspr2(const uint8_t *src,  /* source row; bytes 0..11 are loaded */
+                                       int32_t src_stride,  /* added to src after each row */
+                                       uint8_t *dst,  /* 4 bytes per row are read, averaged, rewritten */
+                                       int32_t dst_stride,  /* added to dst after each row */
+                                       const int16_t *filter_x0,  /* 8 taps, fetched below as 4 words */
+                                       int32_t h) {  /* number of rows to process */
+  int32_t y;
+  uint8_t *cm = vpx_ff_cropTbl;  /* lookup table applied (lbux) to every extp result: the "clamp" step */
+  int32_t  vector1b, vector2b, vector3b, vector4b;  /* filter taps, two 16-bit taps per word */
+  int32_t  Temp1, Temp2, Temp3, Temp4;
+  uint32_t vector4a = 64;  /* accumulator start value (mtlo); presumably a rounding term - confirm */
+  uint32_t tp1, tp2;
+  uint32_t p1, p2, p3, p4;
+  uint32_t n1, n2, n3, n4;
+  uint32_t tn1, tn2;
+  /* 8-tap horizontal filter, 4 outputs per row, each averaged with the dst byte it replaces. */
+  vector1b = ((const int32_t *)filter_x0)[0];
+  vector2b = ((const int32_t *)filter_x0)[1];
+  vector3b = ((const int32_t *)filter_x0)[2];
+  vector4b = ((const int32_t *)filter_x0)[3];
+  /* The word reads above assume filter_x0 is suitably aligned - TODO confirm against callers. */
+  for (y = h; y--;) {
+    /* issue prefetches for the next src row (at +0 and +32 bytes) and the next dst row */
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
+    /* NOTE(review): the asm below has no clobber list, yet it stores to dst and overwrites $ac2/$ac3 - confirm. */
+    __asm__ __volatile__ (
+        "ulw              %[tp1],         0(%[src])                      \n\t"
+        "ulw              %[tp2],         4(%[src])                      \n\t"
+        /* NOTE(review): extp depends on the DSPControl pos field, which is not set here - presumably by the caller. */
+        /* even 1. pixel: output 0, summed in $ac3 from src bytes 0..7 */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
+        "ulw              %[tn2],         8(%[src])                      \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp1],       $ac3,           31             \n\t"
+
+        /* even 2. pixel: output 2, summed in $ac2; the balign results feed the odd pixels */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
+        "balign           %[tn1],         %[tn2],         3              \n\t"
+        "balign           %[tn2],         %[tp2],         3              \n\t"
+        "balign           %[tp2],         %[tp1],         3              \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        "lbu              %[p2],          3(%[dst])                      \n\t"  /* load odd 2 */
+
+        /* odd 1. pixel: output 1, summed in $ac3 (re-initialised here) */
+        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"  /* even 1 */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "lbu              %[Temp1],       1(%[dst])                      \n\t"  /* load odd 1 */
+        "preceu.ph.qbr    %[n1],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[n2],          %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[n3],          %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[n4],          %[tn2]                         \n\t"
+        "dpa.w.ph         $ac3,           %[n1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[n2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[n3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,           %[n4],          %[vector4b]    \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        "lbu              %[tn2],         0(%[dst])                      \n\t"  /* load even 1 */
+
+        /* odd 2. pixel: output 3, summed in $ac2 (re-initialised here) */
+        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"  /* even 2 */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[n1],          %[tn1]                         \n\t"
+        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"  /* odd 1 */
+        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t"  /* average even 1 */
+        "dpa.w.ph         $ac2,           %[n2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[n3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[n4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[n1],          %[vector4b]    \n\t"
+        "extp             %[Temp4],       $ac2,           31             \n\t"
+
+        "lbu              %[tp1],         2(%[dst])                      \n\t"  /* load even 2 */
+        "sb               %[tn2],         0(%[dst])                      \n\t"  /* store even 1 */
+
+        /* clamp the last result via cm, then average the remaining outputs with dst and store them */
+        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t"  /* average odd 1 */
+        "lbux             %[n2],          %[Temp4](%[cm])                \n\t"  /* odd 2 */
+        "sb               %[Temp1],       1(%[dst])                      \n\t"  /* store odd 1 */
+
+        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t"  /* average even 2 */
+        "sb               %[tp1],         2(%[dst])                      \n\t"  /* store even 2 */
+
+        "addqh_r.w        %[p2],          %[p2],          %[n2]          \n\t"  /* average odd 2 */
+        "sb               %[p2],          3(%[dst])                      \n\t"  /* store odd 2 */
+        /* every output below is an early-clobber ("=&r") scratch register; inputs are only read */
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+          [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+    );
+
+    /* advance both pointers to the next row */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_avg_horiz_8_dspr2(const uint8_t *src,  /* source row; bytes 0..15 are loaded */
+                                       int32_t src_stride,  /* added to src after each row */
+                                       uint8_t *dst,  /* 8 bytes per row are read, averaged, rewritten */
+                                       int32_t dst_stride,  /* added to dst after each row */
+                                       const int16_t *filter_x0,  /* 8 taps, fetched below as 4 words */
+                                       int32_t h) {  /* number of rows to process */
+  int32_t y;
+  uint8_t *cm = vpx_ff_cropTbl;  /* lookup table applied (lbux) to every extp result: the "clamp" step */
+  uint32_t vector4a = 64;  /* accumulator start value (mtlo); presumably a rounding term - confirm */
+  int32_t vector1b, vector2b, vector3b, vector4b;  /* filter taps, two 16-bit taps per word */
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t tp1, tp2;
+  uint32_t p1, p2, p3, p4, n1;
+  uint32_t tn1, tn2, tn3;
+  uint32_t st0, st1;  /* cm lookups waiting to be averaged with dst */
+  /* 8-tap horizontal filter, 8 outputs per row, each averaged with the dst byte it replaces. */
+  vector1b = ((const int32_t *)filter_x0)[0];
+  vector2b = ((const int32_t *)filter_x0)[1];
+  vector3b = ((const int32_t *)filter_x0)[2];
+  vector4b = ((const int32_t *)filter_x0)[3];
+  /* The word reads above assume filter_x0 is suitably aligned - TODO confirm against callers. */
+  for (y = h; y--;) {
+    /* issue prefetches for the next src row (at +0 and +32 bytes) and the next dst row */
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
+    /* NOTE(review): the asm below has no clobber list, yet it stores to dst and overwrites $ac1..$ac3 - confirm. */
+    __asm__ __volatile__ (
+        "ulw              %[tp1],         0(%[src])                      \n\t"
+        "ulw              %[tp2],         4(%[src])                      \n\t"
+        /* NOTE(review): extp depends on the DSPControl pos field, which is not set here - presumably by the caller. */
+        /* even 1. pixel: output 0, summed in $ac3; $ac2 is initialised here for even 2 */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
+        "ulw              %[tn2],         8(%[src])                      \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp1],       $ac3,           31             \n\t"
+        "lbu              %[Temp2],       0(%[dst])                      \n\t"
+        "lbu              %[tn3],         2(%[dst])                      \n\t"
+
+        /* even 2. pixel: output 2, summed in $ac2 */
+        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[n1],          %[tn2]                         \n\t"
+        "ulw              %[tn1],         12(%[src])                     \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        /* even 3. pixel: output 4, summed in $ac1; outputs 0 and 2 are averaged and stored here */
+        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a],    $ac1                           \n\t"
+        "mthi             $zero,          $ac1                           \n\t"
+        "preceu.ph.qbr    %[p2],          %[tn1]                         \n\t"
+        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
+        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]    \n\t"
+        "extp             %[Temp1],       $ac1,           31             \n\t"
+
+        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
+        "addqh_r.w        %[tn3],         %[tn3],         %[st1]         \n\t"
+        "sb               %[Temp2],       0(%[dst])                      \n\t"
+        "sb               %[tn3],         2(%[dst])                      \n\t"
+
+        /* even 4. pixel: output 6, summed in $ac2; $ac3 is initialised here for odd 1 */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        /* the balign results (tn3, tn1, tn2, tp2) feed the odd pixels further down */
+        "balign           %[tn3],         %[tn1],         3              \n\t"
+        "balign           %[tn1],         %[tn2],         3              \n\t"
+        "balign           %[tn2],         %[tp2],         3              \n\t"
+        "balign           %[tp2],         %[tp1],         3              \n\t"
+        /* even 3 result: cm lookup, then average with dst[4] (stored in the next section) */
+        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
+        "lbu              %[Temp2],       4(%[dst])                      \n\t"
+        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
+
+        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        /* odd 1. pixel: output 1, summed in $ac3; $ac1 is initialised here for odd 2 */
+        "mtlo             %[vector4a],    $ac1                           \n\t"
+        "mthi             $zero,          $ac1                           \n\t"
+        "sb               %[Temp2],       4(%[dst])                      \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        "lbu              %[tp1],         6(%[dst])                      \n\t"
+
+        /* odd 2. pixel: output 3, summed in $ac1; $ac3 and $ac2 are re-initialised for odd 3 and odd 4 */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
+        "preceu.ph.qbl    %[n1],          %[tn1]                         \n\t"
+        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
+        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],       $ac1,           31             \n\t"
+
+        "lbu              %[tp2],         1(%[dst])                      \n\t"
+        "lbu              %[tn2],         3(%[dst])                      \n\t"
+        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
+
+        /* odd 3. pixel: output 5, summed in $ac3; the odd 1 result is looked up and averaged here */
+        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
+        "preceu.ph.qbr    %[p2],          %[tn3]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]    \n\t"
+        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        "lbu              %[tn3],         5(%[dst])                      \n\t"
+
+        /* odd 4. pixel: output 7, summed in $ac2; outputs 1 and 6 are stored here */
+        "sb               %[tp2],         1(%[dst])                      \n\t"
+        "sb               %[tp1],         6(%[dst])                      \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
+        "extp             %[Temp1],       $ac2,           31             \n\t"
+
+        "lbu              %[tn1],         7(%[dst])                      \n\t"
+
+        /* clamp via cm and average with dst for outputs 3, 5 and 7 */
+        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
+        "addqh_r.w        %[tn2],         %[tn2],         %[p4]          \n\t"
+
+        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
+        "addqh_r.w        %[tn3],         %[tn3],         %[p2]          \n\t"
+
+        "lbux             %[n1],          %[Temp1](%[cm])                \n\t"
+        "addqh_r.w        %[tn1],         %[tn1],         %[n1]          \n\t"
+
+        /* store the remaining odd outputs: dst[3], dst[5], dst[7] */
+        "sb               %[tn2],         3(%[dst])                      \n\t"
+        "sb               %[tn3],         5(%[dst])                      \n\t"
+        "sb               %[tn1],         7(%[dst])                      \n\t"
+        /* every output below is an early-clobber ("=&r") scratch register; inputs are only read */
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
+          [st0] "=&r" (st0), [st1] "=&r" (st1),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [n1] "=&r" (n1),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+          [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+    );
+
+    /* advance both pointers to the next row */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+                                        int32_t src_stride,
+                                        uint8_t *dst_ptr,
+                                        int32_t dst_stride,
+                                        const int16_t *filter_x0,
+                                        int32_t h,
+                                        int32_t count) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vpx_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t filter12, filter34, filter56, filter78;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+
+  filter12 = ((const int32_t *)filter_x0)[0];
+  filter34 = ((const int32_t *)filter_x0)[1];
+  filter56 = ((const int32_t *)filter_x0)[2];
+  filter78 = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch data to cache memory */
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
+
+    for (c = 0; c < count; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
+          "ulw              %[qload2],    16(%[src])                   \n\t"
+          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
+          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
+          "ulw              %[qload3],    20(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
+          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
+          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
+          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                   \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
+          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
+          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
+          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
+          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
+          "ulw              %[qload2],    17(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
+          "ulw              %[qload3],    21(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
+          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
+
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
+
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
+
+          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
+          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
+          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter12] "r" (filter12), [filter34] "r" (filter34),
+            [filter56] "r" (filter56), [filter78] "r" (filter78),
+            [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+/* Horizontal 8-tap convolution with averaging, for rows 64 pixels wide.
+ *
+ * For each of the h rows, 64 output bytes are produced in four groups of 16.
+ * Within a group the outputs at even dst offsets (0, 2, ... 14) are computed
+ * first from words loaded at src + 0, 4, ... 20, then the outputs at odd dst
+ * offsets (1, 3, ... 15) from words loaded at src + 1, 5, ... 21.  Every
+ * output follows the same steps:
+ *   1. a DSP accumulator is preset to 64 (mtlo/mthi),
+ *   2. four dpa.w.ph instructions accumulate the eight tap products,
+ *   3. extp extracts the result, which is used as an index into cm
+ *      (vpx_ff_cropTbl; presumably a clamp-to-[0,255] table - confirm),
+ *   4. addqh_r.w averages (with rounding) that byte with the byte already
+ *      in dst, and sb stores the low byte back.
+ * The code is software pipelined: the section labelled "pixel N" also
+ * finishes pixel N-1 and presets the accumulator for pixel N+1, which is why
+ * the per-instruction comments name different pixels.
+ *
+ * src_ptr    first byte read for the first row; the last group reads up to
+ *            src_ptr[72] of every row (ulw at offset 21 of the fourth group).
+ * src_stride distance in bytes between consecutive source rows.
+ * dst_ptr    first output byte of the first row; read and rewritten in place.
+ * dst_stride distance in bytes between consecutive destination rows.
+ * filter_x0  eight int16_t taps, read here as four 32-bit words holding two
+ *            taps each, so the array must be 4-byte aligned.
+ * h          number of rows to process.
+ *
+ * NOTE(review): extp takes its bit position from the DSPControl register,
+ * which is not set in this function; presumably the caller programs it so
+ * that, together with the +64 preset, the result is a rounded >> 7 - confirm.
+ */
+static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+                                        int32_t src_stride,
+                                        uint8_t *dst_ptr,
+                                        int32_t dst_stride,
+                                        const int16_t *filter_x0,
+                                        int32_t h) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vpx_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t filter12, filter34, filter56, filter78;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+
+  /* Each 32-bit word packs two adjacent 16-bit taps for dpa.w.ph. */
+  filter12 = ((const int32_t *)filter_x0)[0];
+  filter34 = ((const int32_t *)filter_x0)[1];
+  filter56 = ((const int32_t *)filter_x0)[2];
+  filter78 = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch data to cache memory */
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
+
+    /* Four groups of 16 output pixels cover the 64-pixel row. */
+    for (c = 0; c < 4; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 2 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 2 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 2 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 2 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 2 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
+          "ulw              %[qload2],    16(%[src])                   \n\t"
+          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
+          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
+          "ulw              %[qload3],    20(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
+          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
+          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
+          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                   \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
+          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
+          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
+          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
+          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
+          "ulw              %[qload2],    17(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
+          "ulw              %[qload3],    21(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
+          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
+
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
+
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
+
+          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
+          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
+          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter12] "r" (filter12), [filter34] "r" (filter34),
+            [filter56] "r" (filter56), [filter78] "r" (filter78),
+            [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+          /* The asm reads src/dst/cm and writes dst through pointers that are
+           * passed only as register inputs, and it overwrites the DSP
+           * accumulators $ac1-$ac3; declare both so the compiler does not
+           * cache memory or accumulator contents across this statement. */
+          : "memory", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
+            "$ac3hi", "$ac3lo"
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+/* Averaging 8-tap horizontal convolution: DSPr2 dispatcher by block width. */
+void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h) {
+  /* The int16 taps viewed as 32-bit words, i.e. two coefficients per word. */
+  const int32_t *const tap_pairs = (const int32_t *)filter_x;
+  /* Operand for the wrdsp instruction issued below. */
+  const uint32_t pos = 38;
+
+  /* Only a horizontal step of exactly 16 is supported. */
+  assert(x_step_q4 == 16);
+  /* NOTE(review): word 1 == 0x800000 looks like a pass-through kernel. */
+  assert(tap_pairs[1] != 0x800000);
+
+  /* First tap pair all zero: let the convolve2 variant do the whole job. */
+  if (tap_pairs[0] == 0) {
+    vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride,
+                                  filter_x, x_step_q4, filter_y, y_step_q4,
+                                  w, h);
+    return;
+  }
+
+  /* The DSPr2 kernels below are handed a pointer 3 bytes before src. */
+  src -= 3;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  /* Start pulling the first source bytes and the first output row in. */
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
+
+  /* Choose the kernel by block width. */
+  switch (w) {
+    case 4:
+      convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride,
+                                 filter_x, h);
+      break;
+    case 8:
+      convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride,
+                                 filter_x, h);
+      break;
+    case 16:
+    case 32:
+      /* Same kernel for both; the last argument is 1 for 16 and 2 for 32. */
+      convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
+                                  filter_x, h, w >> 4);
+      break;
+    case 64:
+      /* Extra prefetches ahead of the 64-wide kernel. */
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
+      convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride,
+                                  filter_x, h);
+      break;
+    default:
+      /* Any other width goes to the C version, with src restored (+3). */
+      vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,
+                                filter_x, x_step_q4, filter_y, y_step_q4,
+                                w, h);
+      break;
+  }
+}
+#endif
diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c
new file mode 100644
index 0000000000..ddad186922
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c
@@ -0,0 +1,1257 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+/*
+ * 8-tap horizontal filter over a 4-pixel-wide block, stored transposed: the
+ * four results of each source row go down one dst column (dst_stride apart)
+ * and dst advances one byte per row.  filter_x0 holds 8 int16 taps; the extp
+ * bit position is not set here (presumably by the caller's wrdsp -- confirm).
+ */
+static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int16_t *filter_x0,
+                                              int32_t h) {
+  int32_t y;
+  uint8_t *cm = vpx_ff_cropTbl;  /* clamp table, indexed by the raw sums */
+  uint8_t *dst_ptr;
+  int32_t vector1b, vector2b, vector3b, vector4b;
+  int32_t Temp1, Temp2, Temp3, Temp4;
+  uint32_t vector4a = 64;  /* bias preloaded into each accumulator */
+  uint32_t tp1, tp2;
+  uint32_t p1, p2, p3, p4;
+  uint32_t tn1, tn2;
+
+  vector1b = ((const int32_t *)filter_x0)[0];  /* taps 0, 1 */
+  vector2b = ((const int32_t *)filter_x0)[1];  /* taps 2, 3 */
+  vector3b = ((const int32_t *)filter_x0)[2];  /* taps 4, 5 */
+  vector4b = ((const int32_t *)filter_x0)[3];  /* taps 6, 7 */
+
+  for (y = h; y--;) {
+    dst_ptr = dst;
+    /* prefetch the next source row */
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    __asm__ __volatile__ (  /* works in DSP accumulators $ac2 and $ac3 */
+        "ulw              %[tp1],         0(%[src])                      \n\t" /* src[0..3] */
+        "ulw              %[tp2],         4(%[src])                      \n\t" /* src[4..7] */
+        /* even 1. pixel -> Temp1 */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
+        "ulw              %[tn2],         8(%[src])                      \n\t" /* src[8..11] */
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp1],       $ac3,           31             \n\t"
+
+        /* even 2. pixel -> Temp3; the balign ops presumably build the words */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
+        "balign           %[tn1],         %[tn2],         3              \n\t" /* one byte on, */
+        "balign           %[tn2],         %[tp2],         3              \n\t" /* for the odd  */
+        "balign           %[tp2],         %[tp1],         3              \n\t" /* pixels (TODO confirm) */
+        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        /* odd 1. pixel -> Temp2 (even 1 is clamped into tp1 first) */
+        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        /* odd 2. pixel -> Temp4 (even 2 is clamped into tp2 first) */
+        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp4],       $ac2,           31             \n\t"
+
+        /* clamp the two odd results through the cm table */
+        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
+        "lbux             %[p2],          %[Temp4](%[cm])                \n\t"
+        /* store bytes: even 1, odd 1, even 2, odd 2 on consecutive dst rows */
+        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
+        "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
+        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
+        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+          [dst_ptr] "+r" (dst_ptr)
+        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+          [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
+        : "memory"  /* the asm loads via src and stores via dst_ptr */
+    );
+
+    /* Next row: one source row down, one destination column across */
+    src += src_stride;
+    dst += 1;
+  }
+}
+
+/*
+ * 8-tap horizontal filter over an 8-pixel-wide block, stored transposed:
+ * each source row yields 8 results down one dst column -- even ones via
+ * dst_ptr, odd ones via odd_dst, each stepping two dst rows per store.
+ * The extp bit position is not set here (presumably caller's wrdsp; confirm).
+ */
+static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int16_t *filter_x0,
+                                              int32_t h) {
+  int32_t y;
+  uint8_t *cm = vpx_ff_cropTbl;  /* clamp table, indexed by the raw sums */
+  uint8_t *dst_ptr;
+  uint32_t vector4a = 64;  /* bias preloaded into each accumulator */
+  int32_t vector1b, vector2b, vector3b, vector4b;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t tp1, tp2, tp3;
+  uint32_t p1, p2, p3, p4, n1;
+  uint8_t *odd_dst;
+  uint32_t dst_pitch_2 = (dst_stride << 1);  /* distance of two dst rows */
+
+  vector1b = ((const int32_t *)filter_x0)[0];  /* taps 0, 1 */
+  vector2b = ((const int32_t *)filter_x0)[1];  /* taps 2, 3 */
+  vector3b = ((const int32_t *)filter_x0)[2];  /* taps 4, 5 */
+  vector4b = ((const int32_t *)filter_x0)[3];  /* taps 6, 7 */
+
+  for (y = h; y--;) {
+    /* prefetch the next source row */
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+
+    dst_ptr = dst;                      /* receives dst rows 0, 2, 4, 6 */
+    odd_dst = (dst_ptr + dst_stride);   /* receives dst rows 1, 3, 5, 7 */
+    __asm__ __volatile__ (  /* works in DSP accumulators $ac1, $ac2, $ac3 */
+        "ulw              %[tp2],         0(%[src])                       \n\t" /* src[0..3] */
+        "ulw              %[tp1],         4(%[src])                       \n\t" /* src[4..7] */
+        /* even 1. pixel -> Temp1 ($ac2 is already seeded for even 2) */
+        "mtlo             %[vector4a],    $ac3                            \n\t"
+        "mthi             $zero,          $ac3                            \n\t"
+        "mtlo             %[vector4a],    $ac2                            \n\t"
+        "mthi             $zero,          $ac2                            \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp1]                          \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp1]                          \n\t"
+        "ulw              %[tp3],         8(%[src])                       \n\t" /* src[8..11] */
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
+        "extp             %[Temp1],       $ac3,           31              \n\t"
+
+        /* even 2. pixel -> Temp3 */
+        "preceu.ph.qbr    %[p1],          %[tp3]                          \n\t"
+        "preceu.ph.qbl    %[n1],          %[tp3]                          \n\t"
+        "ulw              %[tp2],         12(%[src])                      \n\t" /* src[12..15] */
+        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]     \n\t"
+        "extp             %[Temp3],       $ac2,           31              \n\t"
+
+        /* even 3. pixel -> p3 (even 1 clamped into Temp2, even 2 into tp3) */
+        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
+        "mtlo             %[vector4a],    $ac1                            \n\t"
+        "mthi             $zero,          $ac1                            \n\t"
+        "preceu.ph.qbr    %[p2],          %[tp2]                          \n\t"
+        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]     \n\t"
+        "lbux             %[tp3],         %[Temp3](%[cm])                 \n\t"
+        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]     \n\t"
+        "extp             %[p3],          $ac1,           31              \n\t"
+
+        /* even 4. pixel -> Temp3 (stores even 1 and 2; seeds $ac3 for odd 1) */
+        "mtlo             %[vector4a],    $ac2                            \n\t"
+        "mthi             $zero,          $ac2                            \n\t"
+        "mtlo             %[vector4a],    $ac3                            \n\t"
+        "mthi             $zero,          $ac3                            \n\t"
+        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
+        "sb               %[tp3],         0(%[dst_ptr])                   \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
+        "ulw              %[tp1],         1(%[src])                       \n\t" /* src[1..4], odd pass */
+        "ulw              %[tp3],         5(%[src])                       \n\t" /* src[5..8], odd pass */
+        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
+        "extp             %[Temp3],       $ac2,           31              \n\t"
+
+        "lbux             %[tp2],         %[p3](%[cm])                    \n\t" /* clamp even 3 */
+
+        /* odd 1. pixel -> Temp2 (stores even 3; seeds $ac1 for odd 2) */
+        "mtlo             %[vector4a],    $ac1                            \n\t"
+        "mthi             $zero,          $ac1                            \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
+        "sb               %[tp2],         0(%[dst_ptr])                   \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
+        "ulw              %[tp2],         9(%[src])                       \n\t" /* src[9..12] */
+
+        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
+        "extp             %[Temp2],       $ac3,           31              \n\t"
+
+        /* odd 2. pixel -> Temp3 (clamps and stores even 4; seeds $ac3, $ac2) */
+        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
+        "mtlo             %[vector4a],    $ac3                            \n\t"
+        "mthi             $zero,          $ac3                            \n\t"
+        "mtlo             %[vector4a],    $ac2                            \n\t"
+        "mthi             $zero,          $ac2                            \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
+        "preceu.ph.qbl    %[n1],          %[tp2]                          \n\t"
+        "ulw              %[Temp1],       13(%[src])                      \n\t" /* src[13..16] */
+        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]     \n\t"
+        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
+        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]     \n\t"
+        "extp             %[Temp3],       $ac1,           31              \n\t"
+
+        /* odd 3. pixel -> Temp2 (odd 1 is clamped into tp3 first) */
+        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
+        "preceu.ph.qbr    %[p2],          %[Temp1]                        \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]     \n\t"
+        "extp             %[Temp2],       $ac3,           31              \n\t"
+
+        /* odd 4. pixel -> Temp1 (stores odd 1) */
+        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
+        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
+        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
+        "extp             %[Temp1],       $ac2,           31              \n\t"
+
+        /* clamp odd 2, odd 3 and odd 4 through the cm table */
+        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
+        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
+        "lbux             %[n1],          %[Temp1](%[cm])                 \n\t"
+        /* store odd 2, 3, 4 (odd_dst is not advanced after the last one) */
+        "sb               %[p4],          0(%[odd_dst])                   \n\t"
+        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
+        "sb               %[p2],          0(%[odd_dst])                   \n\t"
+        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
+        "sb               %[n1],          0(%[odd_dst])                   \n\t"
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [n1] "=&r" (n1),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+          [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
+        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+          [vector4a] "r" (vector4a), [cm] "r" (cm),
+          [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+        : "memory"  /* the asm loads via src, stores via dst_ptr/odd_dst */
+    );
+
+    /* Next row: one source row down, one destination column across */
+    src += src_stride;
+    dst += 1;
+  }
+}
+
+static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
+                                               int32_t src_stride,
+                                               uint8_t *dst_ptr,
+                                               int32_t dst_stride,
+                                               const int16_t *filter_x0,
+                                               int32_t h,
+                                               int32_t count) {
+  int32_t c, y;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vpx_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t  filter12, filter34, filter56, filter78;
+  int32_t  Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+  uint32_t dst_pitch_2 = (dst_stride << 1);
+  uint8_t  *odd_dst;
+
+  filter12 = ((const int32_t *)filter_x0)[0];
+  filter34 = ((const int32_t *)filter_x0)[1];
+  filter56 = ((const int32_t *)filter_x0)[2];
+  filter78 = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+
+    src = src_ptr;
+    dst = dst_ptr;
+
+    odd_dst = (dst + dst_stride);
+
+    for (c = 0; c < count; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],        0(%[src])                       \n\t"
+          "ulw              %[qload2],        4(%[src])                       \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "ulw              %[qload2],        8(%[src])                       \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
+          "ulw              %[qload1],        12(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 1 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
+          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        16(%[src])                      \n\t"
+          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        20(%[src])                      \n\t"
+          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],        1(%[src])                       \n\t"
+          "ulw              %[qload2],        5(%[src])                       \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        9(%[src])                       \n\t"
+          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
+          "ulw              %[qload1],        13(%[src])                      \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        17(%[src])                      \n\t"
+          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        21(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
+
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
+
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+          : [filter12] "r" (filter12), [filter34] "r" (filter34),
+            [filter56] "r" (filter56), [filter78] "r" (filter78),
+            [vector_64] "r" (vector_64), [cm] "r" (cm),
+            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+      );
+
+      src += 16;
+      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+      odd_dst = (dst + dst_stride);
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+
+    dst_ptr += 1;
+  }
+}
+
+/* Horizontal 8-tap convolution of a 64-pixel-wide block, stored transposed.
+ *
+ * For each of the h source rows, 64 filtered pixels are produced (four
+ * passes of 16 pixels through the inline-asm kernel below).  Output pixel x
+ * of source row y is written to dst_ptr[x * dst_stride + y], i.e. rows of
+ * the source become columns of the destination.
+ *
+ * src_ptr    - first input byte of the first row; every 16-pixel pass loads
+ *              bytes at offsets 0..24 from its current position.
+ * src_stride - distance in bytes between consecutive source rows.
+ * dst_ptr    - start of the transposed output.
+ * dst_stride - distance in bytes between consecutive transposed output rows.
+ * filter_x0  - the 8 filter taps (int16_t), read here as four 32-bit words.
+ *              NOTE(review): the int32_t reads presumably require the taps to
+ *              be 4-byte aligned -- confirm against the filter tables.
+ * h          - number of source rows to process.
+ *
+ * Every accumulator is preloaded with 64 (vector_64) before the taps are
+ * accumulated and the result is fetched with "extp ..., 31", which depends
+ * on the DSPControl pos field.  vpx_convolve8_dspr2 sets pos to 38 with
+ * wrdsp before dispatching here.  NOTE(review): presumably this yields
+ * (sum + 64) >> 7 -- confirm against FILTER_BITS.  The extracted value is
+ * then used as an index into vpx_ff_cropTbl (cm) to obtain the stored byte;
+ * presumably that table clamps to the 8-bit pixel range -- verify.
+ *
+ * The trailing "even N" / "odd N" comments in the asm name the output pixel
+ * an instruction belongs to; the code is software pipelined, so a stage
+ * header ("even 3. pixel") also contains work for neighbouring pixels.
+ */
+static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
+                                               int32_t src_stride,
+                                               uint8_t *dst_ptr,
+                                               int32_t dst_stride,
+                                               const int16_t *filter_x0,
+                                               int32_t h) {
+  int32_t c, y;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vpx_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t  filter12, filter34, filter56, filter78;
+  int32_t  Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+  uint32_t dst_pitch_2 = (dst_stride << 1);
+  uint8_t  *odd_dst;
+
+  /* Each 32-bit word packs two adjacent 16-bit taps. */
+  filter12 = ((const int32_t *)filter_x0)[0];
+  filter34 = ((const int32_t *)filter_x0)[1];
+  filter56 = ((const int32_t *)filter_x0)[2];
+  filter78 = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* Odd-indexed output pixels land one transposed row below the even
+     * ones; both pointers then advance by two rows (dst_pitch_2). */
+    odd_dst = (dst + dst_stride);
+
+    /* Four passes of 16 output pixels cover the 64-pixel width. */
+    for (c = 0; c < 4; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],        0(%[src])                       \n\t"
+          "ulw              %[qload2],        4(%[src])                       \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "ulw              %[qload2],        8(%[src])                       \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
+          "ulw              %[qload1],        12(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 2 */
+          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 2 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 2 */
+          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 2 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 2 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
+          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 2 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        16(%[src])                      \n\t"
+          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        20(%[src])                      \n\t"
+          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
+
+          /* ODD pixels */
+          /* Same pipeline again, with every load shifted by one byte. */
+          "ulw              %[qload1],        1(%[src])                       \n\t"
+          "ulw              %[qload2],        5(%[src])                       \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        9(%[src])                       \n\t"
+          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
+          "ulw              %[qload1],        13(%[src])                      \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        17(%[src])                      \n\t"
+          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        21(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
+
+          /* Drain the pipeline: look up and store the last three odd pixels. */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
+
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+          : [filter12] "r" (filter12), [filter34] "r" (filter34),
+            [filter56] "r" (filter56), [filter78] "r" (filter78),
+            [vector_64] "r" (vector_64), [cm] "r" (cm),
+            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+      );
+
+      /* Move on to the next 16 input pixels; dst/odd_dst are recomputed
+       * from dst_ptr rather than reusing the values left by the asm. */
+      src += 16;
+      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+      odd_dst = (dst + dst_stride);
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+
+    /* Transposed output: the next source row fills the next output column. */
+    dst_ptr += 1;
+  }
+}
+
+/* Generic C fallback: 8-tap horizontal filter with transposed output.
+ *
+ * For every source row y (0 <= y < h) and column x (0 <= x < w) the eight
+ * bytes src[x .. x + 7] of that row are weighted by filter[0 .. 7], the sum
+ * is rounded with ROUND_POWER_OF_TWO(sum, FILTER_BITS), passed through
+ * clip_pixel() and stored at dst[x * dst_stride + y].
+ */
+void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter, int w, int h) {
+  int row;
+
+  /* One source row per iteration; each row becomes one output column. */
+  for (row = 0; row < h; ++row, src += src_stride, dst += 1) {
+    int col;
+
+    for (col = 0; col < w; ++col) {
+      int acc = 0;
+      int tap = 0;
+
+      /* Accumulate the 8 taps over the window starting at this column. */
+      while (tap < 8) {
+        acc += src[col + tap] * filter[tap];
+        ++tap;
+      }
+
+      dst[col * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
+    }
+  }
+}
+
+/* Copy a w x h block of bytes while transposing it.
+ *
+ * The byte at column x of source row y is written to
+ * dst[x * dst_stride + y]; no filtering is applied.
+ */
+void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           int w, int h) {
+  int row;
+
+  for (row = 0; row < h; ++row) {
+    /* Source row `row` feeds output column `row`. */
+    const uint8_t *in_row = src + row * src_stride;
+    uint8_t *out_col = dst + row;
+    int col = 0;
+
+    while (col < w) {
+      out_col[col * dst_stride] = in_row[col];
+      ++col;
+    }
+  }
+}
+
+void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);  /* pass-1 output, transposed */
+  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
+  uint32_t pos = 38;
+  /* 2-D convolve for the unscaled case only: both steps are asserted == 16. */
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+  /* Pass 1 filters src along x into temp; pass 2 filters temp into dst.  The
+   * plain C helpers above store output transposed: dst[x * dst_stride + y]. */
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  if (intermediate_height < h)
+    intermediate_height = h;
+
+  /* pass 1, src -> temp: filter_x[3] == 0x80 selects the copy helper */
+  if (filter_x[3] == 0x80) {
+    copy_horiz_transposed(src - src_stride * 3, src_stride,
+                          temp, intermediate_height,
+                          w, intermediate_height);
+  } else if (((const int32_t *)filter_x)[0] == 0) {  /* taps 0 and 1 are zero */
+    vpx_convolve2_dspr2(src - src_stride * 3, src_stride,
+                        temp, intermediate_height,
+                        filter_x,
+                        w, intermediate_height);
+  } else {
+    src -= (src_stride * 3 + 3);  /* back up 3 rows and 3 pixels */
+
+    /* prefetch data to cache memory */
+    prefetch_load(src);
+    prefetch_load(src + 32);
+
+    switch (w) {
+      case 4:
+        convolve_horiz_4_transposed_dspr2(src, src_stride,
+                                          temp, intermediate_height,
+                                          filter_x, intermediate_height);
+        break;
+      case 8:
+        convolve_horiz_8_transposed_dspr2(src, src_stride,
+                                          temp, intermediate_height,
+                                          filter_x, intermediate_height);
+        break;
+      case 16:
+      case 32:
+        convolve_horiz_16_transposed_dspr2(src, src_stride,
+                                           temp, intermediate_height,
+                                           filter_x, intermediate_height,
+                                           (w/16));
+        break;
+      case 64:
+        prefetch_load(src + 32);  /* same address as the prefetch above */
+        convolve_horiz_64_transposed_dspr2(src, src_stride,
+                                           temp, intermediate_height,
+                                           filter_x, intermediate_height);
+        break;
+      default:
+        convolve_horiz_transposed(src, src_stride,
+                                  temp, intermediate_height,
+                                  filter_x, w, intermediate_height);
+        break;
+    }
+  }
+
+  /* pass 2, temp -> dst: temp + 3 skips the 3 extra leading source rows */
+  if (filter_y[3] == 0x80) {
+    copy_horiz_transposed(temp + 3, intermediate_height,
+                          dst, dst_stride,
+                          h, w);  /* width/height swap: temp is transposed */
+  } else if (((const int32_t *)filter_y)[0] == 0) {
+    vpx_convolve2_dspr2(temp + 3, intermediate_height,
+                        dst, dst_stride,
+                        filter_y,
+                        h, w);
+  } else {
+    switch (h) {
+      case 4:
+        convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
+                                          dst, dst_stride,
+                                          filter_y, w);
+        break;
+      case 8:
+        convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
+                                          dst, dst_stride,
+                                          filter_y, w);
+        break;
+      case 16:
+      case 32:
+        convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
+                                           dst, dst_stride,
+                                           filter_y, w, (h/16));
+        break;
+      case 64:
+        convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
+                                           dst, dst_stride,
+                                           filter_y, w);
+        break;
+      default:
+        convolve_horiz_transposed(temp, intermediate_height,
+                                  dst, dst_stride,
+                                  filter_y, h, w);
+        break;
+    }
+  }
+}
+
+void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int filter_x_stride,
+                             const int16_t *filter_y, int filter_y_stride,
+                             int w, int h) {
+  int x, y;
+  /* Copies a w x h block from src to dst; the filter arguments are not read. */
+  /* prefetch data to cache memory */
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
+  /* NOTE(review): widths 4..64 store with sw, i.e. assume word-aligned dst. */
+  switch (w) {
+    case 4:
+      {
+      uint32_t tp1;
+
+      /* 1 word storage */
+      for (y = h; y--; ) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         (%[src])      \n\t"
+            "sw               %[tp1],         (%[dst])      \n\t"  /* store */
+            : [tp1] "=&r" (tp1)
+            : [src] "r" (src), [dst] "r" (dst)
+            : "memory"  /* the asm reads *src and writes *dst */
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      }
+      break;
+    case 8:
+      {
+      uint32_t tp1, tp2;
+
+      /* 2 word storage */
+      for (y = h; y--; ) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         4(%[src])      \n\t"
+            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
+            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
+            : [src] "r" (src), [dst] "r" (dst)
+            : "memory"
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      }
+      break;
+    case 16:
+      {
+      uint32_t tp1, tp2, tp3, tp4;
+
+      /* 4 word storage */
+      for (y = h; y--; ) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         4(%[src])      \n\t"
+            "ulw              %[tp3],         8(%[src])      \n\t"
+            "ulw              %[tp4],         12(%[src])     \n\t"
+
+            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
+            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
+            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
+            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
+            : [src] "r" (src), [dst] "r" (dst)
+            : "memory"
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      }
+      break;
+    case 32:
+      {
+      uint32_t tp1, tp2, tp3, tp4;
+      uint32_t tp5, tp6, tp7, tp8;
+
+      /* 8 word storage */
+      for (y = h; y--; ) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         4(%[src])      \n\t"
+            "ulw              %[tp3],         8(%[src])      \n\t"
+            "ulw              %[tp4],         12(%[src])     \n\t"
+            "ulw              %[tp5],         16(%[src])     \n\t"
+            "ulw              %[tp6],         20(%[src])     \n\t"
+            "ulw              %[tp7],         24(%[src])     \n\t"
+            "ulw              %[tp8],         28(%[src])     \n\t"
+
+            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
+            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
+            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
+            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
+            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
+            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
+            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
+            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
+              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
+            : [src] "r" (src), [dst] "r" (dst)
+            : "memory"
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      }
+      break;
+    case 64:
+      {
+      uint32_t tp1, tp2, tp3, tp4;
+      uint32_t tp5, tp6, tp7, tp8;
+
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
+
+      /* 16 word storage */
+      for (y = h; y--; ) {
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_load(src + src_stride + 64);
+        prefetch_store(dst + dst_stride);
+        prefetch_store(dst + dst_stride + 32);
+
+        __asm__ __volatile__ (
+            "ulw              %[tp1],         0(%[src])      \n\t"
+            "ulw              %[tp2],         4(%[src])      \n\t"
+            "ulw              %[tp3],         8(%[src])      \n\t"
+            "ulw              %[tp4],         12(%[src])     \n\t"
+            "ulw              %[tp5],         16(%[src])     \n\t"
+            "ulw              %[tp6],         20(%[src])     \n\t"
+            "ulw              %[tp7],         24(%[src])     \n\t"
+            "ulw              %[tp8],         28(%[src])     \n\t"
+
+            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
+            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
+            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
+            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
+            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
+            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
+            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
+            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */
+
+            "ulw              %[tp1],         32(%[src])     \n\t"
+            "ulw              %[tp2],         36(%[src])     \n\t"
+            "ulw              %[tp3],         40(%[src])     \n\t"
+            "ulw              %[tp4],         44(%[src])     \n\t"
+            "ulw              %[tp5],         48(%[src])     \n\t"
+            "ulw              %[tp6],         52(%[src])     \n\t"
+            "ulw              %[tp7],         56(%[src])     \n\t"
+            "ulw              %[tp8],         60(%[src])     \n\t"
+
+            "sw               %[tp1],         32(%[dst])     \n\t"  /* store */
+            "sw               %[tp2],         36(%[dst])     \n\t"  /* store */
+            "sw               %[tp3],         40(%[dst])     \n\t"  /* store */
+            "sw               %[tp4],         44(%[dst])     \n\t"  /* store */
+            "sw               %[tp5],         48(%[dst])     \n\t"  /* store */
+            "sw               %[tp6],         52(%[dst])     \n\t"  /* store */
+            "sw               %[tp7],         56(%[dst])     \n\t"  /* store */
+            "sw               %[tp8],         60(%[dst])     \n\t"  /* store */
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
+              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
+            : [src] "r" (src), [dst] "r" (dst)
+            : "memory"
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      }
+      break;
+    default:
+      for (y = h; y--; ) {
+        for (x = 0; x < w; ++x) {
+          dst[x] = src[x];
+        }
+
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+  }
+}
+#endif
diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c
new file mode 100644
index 0000000000..ae78bab892
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c
@@ -0,0 +1,910 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_horiz_4_dspr2(const uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int16_t *filter_x0,
+                                   int32_t h) {
+  int32_t y;
+  uint8_t *cm = vpx_ff_cropTbl;  /* table indexed by each result (lbux) */
+  int32_t vector1b, vector2b, vector3b, vector4b;
+  int32_t Temp1, Temp2, Temp3, Temp4;
+  uint32_t vector4a = 64;  /* value each accumulator is preset to (mtlo) */
+  uint32_t tp1, tp2;
+  uint32_t p1, p2, p3, p4;
+  uint32_t n1, n2, n3, n4;
+  uint32_t tn1, tn2;
+  /* Filters 4 pixels per row for h rows; taps are read as four int32 pairs. */
+  vector1b = ((const int32_t *)filter_x0)[0];
+  vector2b = ((const int32_t *)filter_x0)[1];
+  vector3b = ((const int32_t *)filter_x0)[2];
+  vector4b = ((const int32_t *)filter_x0)[3];
+  /* NOTE(review): asm writes $ac2/$ac3 but does not list them as clobbers. */
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],      0(%[src])                      \n\t"
+        "ulw              %[tp2],      4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
+        "ulw              %[tn2],      8(%[src])                      \n\t"
+        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp1],    $ac3,           31             \n\t"
+
+        /* even 2. pixel */
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
+        "balign           %[tn1],      %[tn2],         3              \n\t"
+        "balign           %[tn2],      %[tp2],         3              \n\t"
+        "balign           %[tp2],      %[tp1],         3              \n\t"
+        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],    $ac2,           31             \n\t"
+
+        /* odd 1. pixel */
+        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
+        "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,        %[n4],          %[vector4b]    \n\t"
+        "extp             %[Temp2],    $ac3,           31             \n\t"
+
+        /* odd 2. pixel */
+        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
+        "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,        %[n1],          %[vector4b]    \n\t"
+        "extp             %[Temp4],    $ac2,           31             \n\t"
+
+        /* clamp */
+        "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
+        "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
+
+        /* store bytes */
+        "sb               %[tp1],      0(%[dst])                      \n\t"
+        "sb               %[tn1],      1(%[dst])                      \n\t"
+        "sb               %[tp2],      2(%[dst])                      \n\t"
+        "sb               %[n2],       3(%[dst])                      \n\t"
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+          [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+        : "memory"  /* the asm reads via src/cm and writes via dst */
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_horiz_8_dspr2(const uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int16_t *filter_x0,
+                                   int32_t h) {
+  int32_t y;
+  uint8_t *cm = vpx_ff_cropTbl;  /* table indexed by each result (lbux) */
+  uint32_t vector4a = 64;  /* value each accumulator is preset to (mtlo) */
+  int32_t vector1b, vector2b, vector3b, vector4b;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t tp1, tp2;
+  uint32_t p1, p2, p3, p4, n1;
+  uint32_t tn1, tn2, tn3;
+  uint32_t st0, st1;
+  /* Filters 8 pixels per row for h rows; taps are read as four int32 pairs. */
+  vector1b = ((const int32_t *)filter_x0)[0];
+  vector2b = ((const int32_t *)filter_x0)[1];
+  vector3b = ((const int32_t *)filter_x0)[2];
+  vector4b = ((const int32_t *)filter_x0)[3];
+  /* NOTE(review): asm writes $ac1..$ac3 but does not list them as clobbers. */
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],      0(%[src])                      \n\t"
+        "ulw              %[tp2],      4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
+        "ulw              %[tn2],      8(%[src])                      \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp1],    $ac3,           31             \n\t"
+
+        /* even 2. pixel */
+        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
+        "ulw              %[tn1],      12(%[src])                     \n\t"
+        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],    $ac2,           31             \n\t"
+
+        /* even 3. pixel */
+        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a], $ac1                           \n\t"
+        "mthi             $zero,       $ac1                           \n\t"
+        "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
+        "dpa.w.ph         $ac1,        %[p3],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac1,        %[p4],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac1,        %[p1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac1,        %[n1],          %[vector4b]    \n\t"
+        "extp             %[Temp1],    $ac1,           31             \n\t"
+
+        /* even 4. pixel */
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "sb               %[st0],      0(%[dst])                      \n\t"
+        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
+
+        "balign           %[tn3],      %[tn1],         3              \n\t"
+        "balign           %[tn1],      %[tn2],         3              \n\t"
+        "balign           %[tn2],      %[tp2],         3              \n\t"
+        "balign           %[tp2],      %[tp1],         3              \n\t"
+
+        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
+        "extp             %[Temp3],    $ac2,           31             \n\t"
+
+        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
+
+        /* odd 1. pixel */
+        "mtlo             %[vector4a], $ac1                           \n\t"
+        "mthi             $zero,       $ac1                           \n\t"
+        "sb               %[st1],      2(%[dst])                      \n\t"
+        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
+        "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
+        "sb               %[st0],      4(%[dst])                      \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
+        "extp             %[Temp2],    $ac3,           31             \n\t"
+
+        /* odd 2. pixel */
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
+        "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
+        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
+        "dpa.w.ph         $ac1,        %[p2],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac1,        %[p3],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac1,        %[p4],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac1,        %[p1],          %[vector4b]    \n\t"
+        "extp             %[Temp3],    $ac1,           31             \n\t"
+
+        /* odd 3. pixel */
+        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
+        "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
+        "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac3,        %[n1],          %[vector4b]    \n\t"
+        "extp             %[Temp2],    $ac3,           31             \n\t"
+
+        /* odd 4. pixel */
+        "sb               %[st1],      1(%[dst])                      \n\t"
+        "sb               %[st0],      6(%[dst])                      \n\t"
+        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
+        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
+        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
+        "extp             %[Temp1],    $ac2,           31             \n\t"
+
+        /* clamp */
+        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
+        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
+        "lbux             %[n1],       %[Temp1](%[cm])                \n\t"
+
+        /* store bytes */
+        "sb               %[p4],       3(%[dst])                      \n\t"
+        "sb               %[p2],       5(%[dst])                      \n\t"
+        "sb               %[n1],       7(%[dst])                      \n\t"
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
+          [st0] "=&r" (st0), [st1] "=&r" (st1),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [n1] "=&r" (n1),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+          [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+        : "memory"  /* the asm reads via src/cm and writes via dst */
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
+                                    int32_t src_stride,
+                                    uint8_t *dst_ptr,
+                                    int32_t dst_stride,
+                                    const int16_t *filter_x0,
+                                    int32_t h,
+                                    int32_t count) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vpx_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t filter12, filter34, filter56, filter78;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+
+  filter12 = ((const int32_t *)filter_x0)[0];
+  filter34 = ((const int32_t *)filter_x0)[1];
+  filter56 = ((const int32_t *)filter_x0)[2];
+  filter78 = ((const int32_t *)filter_x0)[3];
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch data to cache memory */
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
+
+    for (c = 0; c < count; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
+          "ulw              %[qload2],    16(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
+          "ulw              %[qload3],    20(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
+          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                    \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
+          "ulw              %[qload2],    17(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
+          "ulw              %[qload3],    21(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+
+          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
+          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
+          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter12] "r" (filter12), [filter34] "r" (filter34),
+            [filter56] "r" (filter56), [filter78] "r" (filter78),
+            [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst),
+            [src] "r" (src)
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
+                                    int32_t src_stride,
+                                    uint8_t *dst_ptr,
+                                    int32_t dst_stride,
+                                    const int16_t *filter_x0,
+                                    int32_t h) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vpx_ff_cropTbl;  /* byte table indexed by each extracted sum */
+  uint32_t vector_64 = 64;  /* written to LO of each accumulator (HI = 0) */
+  int32_t filter12, filter34, filter56, filter78;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+  /* The eight int16 taps are read as four 32-bit words, two taps per word. */
+  filter12 = ((const int32_t *)filter_x0)[0];
+  filter34 = ((const int32_t *)filter_x0)[1];
+  filter56 = ((const int32_t *)filter_x0)[2];
+  filter78 = ((const int32_t *)filter_x0)[3];
+  /* Filter h rows; every row is written as 4 chunks of 16 output bytes. */
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch the next row's source and destination into cache */
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
+
+    for (c = 0; c < 4; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+          /* Trailing tags name the pixel each instruction works on. */
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 2 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 2 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 2 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 2 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 2 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st2],       2(%[dst])                    \n\t" /* even 2 */
+          "ulw              %[qload2],    16(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
+          "ulw              %[qload3],    20(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
+          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels: reload from src + 1, results go to odd dst offsets */
+          "ulw              %[qload1],    1(%[src])                    \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
+          "ulw              %[qload2],    17(%[src])                   \n\t"
+          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
+          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
+          "ulw              %[qload3],    21(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
+          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
+          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
+          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+
+          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
+          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
+          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter12] "r" (filter12), [filter34] "r" (filter34),
+            [filter56] "r" (filter56), [filter78] "r" (filter78),
+            [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst),
+            [src] "r" (src)
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h) {
+  /* The int16 taps viewed two at a time as packed 32-bit words. */
+  const int32_t *const taps = (const int32_t *)filter_x;
+  const int32_t stride_in = (int32_t)src_stride;
+  const int32_t stride_out = (int32_t)dst_stride;
+  const int32_t rows = (int32_t)h;
+  const uint8_t *src_start;
+  uint32_t pos = 38;
+
+  assert(x_step_q4 == 16);
+  assert(taps[1] != 0x800000);
+
+  /* Taps 0 and 1 both zero: defer to the convolve2 horizontal routine. */
+  if (taps[0] == 0) {
+    vpx_convolve2_horiz_dspr2(src, src_stride,
+                              dst, dst_stride,
+                              filter_x, x_step_q4,
+                              filter_y, y_step_q4,
+                              w, h);
+    return;
+  }
+
+  prefetch_load((const uint8_t *)filter_x);
+  /* The DSPr2 kernels are handed a pointer three bytes before src. */
+  src_start = src - 3;
+
+  /* Set the bit position used when extracting from the accumulators. */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  /* Prefetch the start of the source and destination into cache. */
+  prefetch_load(src_start);
+  prefetch_load(src_start + 32);
+  prefetch_store(dst);
+
+  /* Pick the kernel specialised for this block width. */
+  switch (w) {
+    case 4:
+      convolve_horiz_4_dspr2(src_start, stride_in, dst, stride_out,
+                             filter_x, rows);
+      break;
+    case 8:
+      convolve_horiz_8_dspr2(src_start, stride_in, dst, stride_out,
+                             filter_x, rows);
+      break;
+    case 16:
+    case 32:
+      /* Final argument is 1 for w == 16 and 2 for w == 32. */
+      convolve_horiz_16_dspr2(src_start, stride_in, dst, stride_out,
+                              filter_x, rows, w / 16);
+      break;
+    case 64:
+      prefetch_load(src_start + 64);
+      prefetch_store(dst + 32);
+      convolve_horiz_64_dspr2(src_start, stride_in, dst, stride_out,
+                              filter_x, rows);
+      break;
+    default:
+      /* Any other width: vpx_convolve8_horiz_c with the unshifted src. */
+      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                            filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+      break;
+  }
+}
+#endif
diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c
new file mode 100644
index 0000000000..d553828c59
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c
@@ -0,0 +1,383 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/convolve_common_dspr2.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_vert_4_dspr2(const uint8_t *src,
+                                  int32_t src_stride,
+                                  uint8_t *dst,
+                                  int32_t dst_stride,
+                                  const int16_t *filter_y,
+                                  int32_t w,
+                                  int32_t h) {
+  int32_t x, y;
+  const uint8_t *src_ptr;
+  uint8_t *dst_ptr;
+  uint8_t *cm = vpx_ff_cropTbl;
+  uint32_t vector4a = 64;  /* rounding term preloaded into each accumulator */
+  uint32_t load1, load2, load3, load4;
+  uint32_t p1, p2;
+  uint32_t n1, n2;
+  uint32_t scratch1, scratch2;
+  uint32_t store1, store2;
+  int32_t vector1b, vector2b, vector3b, vector4b;
+  int32_t Temp1, Temp2;
+  /* 8-tap vertical filter over w x h pixels; each asm pass writes 4 bytes. */
+  vector1b = ((const int32_t *)filter_y)[0];  /* taps 0 and 1, packed */
+  vector2b = ((const int32_t *)filter_y)[1];  /* taps 2 and 3, packed */
+  vector3b = ((const int32_t *)filter_y)[2];  /* taps 4 and 5, packed */
+  vector4b = ((const int32_t *)filter_y)[3];  /* taps 6 and 7, packed */
+  /* start three rows up; the asm reads eight consecutive rows from here */
+  src -= 3 * src_stride;
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_store(dst + dst_stride);
+
+    for (x = 0; x < w; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+          /* seed all four accumulators: lo = vector4a (64), hi = 0 */
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+          /* rows 0-3, right byte pair (qbr): feeds $ac0 and $ac1 */
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
+          /* rows 0-3, left byte pair (qbl): feeds $ac2 and $ac3 */
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
+          /* advance to rows 4-7 of the same four columns */
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+          /* rows 4-7, right byte pair (qbr) */
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+          /* extp uses the DSPControl position the caller set with wrdsp */
+          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+          /* rows 4-7, left byte pair (qbl) */
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+          /* look up each result byte in cm[] while $ac2 and $ac3 finish */
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+          /* the asm stores through dst_ptr and overwrites $ac0-$ac3 */
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [load3] "=&r" (load3), [load4] "=&r" (load4),
+            [p1] "=&r" (p1), [p2] "=&r" (p2),
+            [n1] "=&r" (n1), [n2] "=&r" (n2),
+            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+            [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
+            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+          : "memory", "hi", "lo",
+            "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_vert_64_dspr2(const uint8_t *src,
+                                   int32_t src_stride,
+                                   uint8_t *dst,
+                                   int32_t dst_stride,
+                                   const int16_t *filter_y,
+                                   int32_t h) {
+  int32_t x, y;
+  const uint8_t *src_ptr;
+  uint8_t *dst_ptr;
+  uint8_t *cm = vpx_ff_cropTbl;
+  uint32_t vector4a = 64;  /* rounding term preloaded into each accumulator */
+  uint32_t load1, load2, load3, load4;
+  uint32_t p1, p2;
+  uint32_t n1, n2;
+  uint32_t scratch1, scratch2;
+  uint32_t store1, store2;
+  int32_t vector1b, vector2b, vector3b, vector4b;
+  int32_t Temp1, Temp2;
+  /* 8-tap vertical filter over 64 x h pixels; each asm pass writes 4 bytes. */
+  vector1b = ((const int32_t *)filter_y)[0];  /* taps 0 and 1, packed */
+  vector2b = ((const int32_t *)filter_y)[1];  /* taps 2 and 3, packed */
+  vector3b = ((const int32_t *)filter_y)[2];  /* taps 4 and 5, packed */
+  vector4b = ((const int32_t *)filter_y)[3];  /* taps 6 and 7, packed */
+  /* start three rows up; the asm reads eight consecutive rows from here */
+  src -= 3 * src_stride;
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
+
+    for (x = 0; x < 64; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+          /* seed all four accumulators: lo = vector4a (64), hi = 0 */
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+          /* rows 0-3, right byte pair (qbr): feeds $ac0 and $ac1 */
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
+          /* rows 0-3, left byte pair (qbl): feeds $ac2 and $ac3 */
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
+          /* advance to rows 4-7 of the same four columns */
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
+          /* rows 4-7, right byte pair (qbr) */
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+          /* extp uses the DSPControl position the caller set with wrdsp */
+          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+          /* rows 4-7, left byte pair (qbl) */
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
+          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
+          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
+          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
+          /* look up each result byte in cm[] while $ac2 and $ac3 finish */
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
+          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+          /* the asm stores through dst_ptr and overwrites $ac0-$ac3 */
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [load3] "=&r" (load3), [load4] "=&r" (load4),
+            [p1] "=&r" (p1), [p2] "=&r" (p2),
+            [n1] "=&r" (n1), [n2] "=&r" (n2),
+            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
+            [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
+            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+          : "memory", "hi", "lo",
+            "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+  /* Entry point for the DSPr2 8-tap vertical convolution. Picks a kernel by
+   * block width w; a filter whose taps 0 and 1 are both zero is handed to
+   * vpx_convolve2_vert_dspr2, and any unlisted width to vpx_convolve8_vert_c.
+   * Callers must pass y_step_q4 == 16 (asserted). */
+  /* filter_y viewed as 32-bit words, two adjacent int16 taps per word */
+  const int32_t *const tap_pairs = (const int32_t *)filter_y;
+  /* operand of the wrdsp instruction issued before the 8-tap kernels */
+  const uint32_t extract_pos = 38;
+
+  /* preconditions carried over unchanged; compiled out under NDEBUG */
+  assert(y_step_q4 == 16);
+  assert(tap_pairs[1] != 0x800000);
+
+  if (tap_pairs[0] == 0) {
+    /* first packed word is zero, i.e. taps 0 and 1 are both zero */
+    vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride,
+                             filter_x, x_step_q4,
+                             filter_y, y_step_q4, w, h);
+    return;
+  }
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (extract_pos)
+  );
+
+  prefetch_store(dst);
+
+  if (w == 64) {
+    /* 64-wide rows: also prefetch the second 32 bytes of dst */
+    prefetch_store(dst + 32);
+    convolve_vert_64_dspr2(src, src_stride,
+                           dst, dst_stride, filter_y, h);
+  } else if (w == 4 || w == 8 || w == 16 || w == 32) {
+    /* these widths share the kernel that loops over w in steps of four */
+    convolve_vert_4_dspr2(src, src_stride,
+                          dst, dst_stride, filter_y, w, h);
+  } else {
+    /* any other width: fall back to vpx_convolve8_vert_c */
+    vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                         filter_x, x_step_q4,
+                         filter_y, y_step_q4, w, h);
+  }
+}
+
+#endif
diff --git a/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h b/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h
new file mode 100644
index 0000000000..66d77a2854
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_
+#define VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* vpx_convolve2_* DSPr2 routines: only declared here, defined elsewhere. */
+#if HAVE_DSPR2
+void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h);
+/* Same parameter list as vpx_convolve2_horiz_dspr2. */
+void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h);
+/* Same parameter list; NOTE(review): presumably the vertical avg form. */
+void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h);
+/* Unlike its siblings, takes one filter pointer and no step arguments. */
+void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter,
+                         int w, int h);
+/* vpx_convolve8_vert_dspr2 calls this when taps 0 and 1 of filter_y are 0. */
+void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h);
+
+#endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_
diff --git a/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c b/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c
new file mode 100644
index 0000000000..2115a348c2
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c
@@ -0,0 +1,955 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+
+static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
+                                              int32_t src_stride,
+                                              int16_t *temp_buff) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 step0, step1, step2, step3;
+  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+  v8i16 step0_1, step1_1, step2_1, step3_1;
+  /* Reads 32 rows of input (src_stride apart) and fills 32 rows of temp_buff
+   * (8 apart). 1st and 2nd set: rows 0-3 with 28-31, rows 4-7 with 24-27. */
+  LD_SH4(input, src_stride, in0, in1, in2, in3);
+  LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
+  LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
+  LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
+  SLLI_4V(in0, in1, in2, in3, 2);
+  SLLI_4V(in4, in5, in6, in7, 2);
+  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
+  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
+  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
+              step0, step1, step2, step3, in4, in5, in6, in7);
+  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+              step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+  ST_SH4(step0, step1, step2, step3, temp_buff, 8);
+  ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
+  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
+  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);
+  /* 3rd and 4th set: rows 8-11 with 20-23, rows 12-15 with 16-19. The last
+   * store offset, (15 * 8) + 8, equals 16 * 8, i.e. temp_buff row 16. */
+  LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
+  LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
+  LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
+  LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
+  SLLI_4V(in0, in1, in2, in3, 2);
+  SLLI_4V(in4, in5, in6, in7, 2);
+  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
+  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
+  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
+              step0, step1, step2, step3, in4, in5, in6, in7);
+  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+              step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+  ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
+  ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
+  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
+  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8);
+}
+
+static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 temp0, temp1;
+  /* fdct even: consumes 16 rows of input (8 int16 each, input[0..127]) and
+   * stores 16 result vectors into temp at multiples of 64 int16. */
+  LD_SH4(input, 8, in0, in1, in2, in3);
+  LD_SH4(input + 96, 8, in12, in13, in14, in15);
+  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15,
+              vec0, vec1, vec2, vec3, in12, in13, in14, in15);
+  LD_SH4(input + 32, 8, in4, in5, in6, in7);
+  LD_SH4(input + 64, 8, in8, in9, in10, in11);
+  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11,
+              vec4, vec5, vec6, vec7, in8, in9, in10, in11);
+
+  /* Stage 3 */
+  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
+  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp);
+  ST_SH(temp1, temp + 512);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 256);
+  ST_SH(temp1, temp + 768);
+
+  SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 128);
+  ST_SH(temp1, temp + 896);
+
+  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 640);
+  ST_SH(temp1, temp + 384);
+  /* in8..in15 are the last four outputs of the two BUTTERFLY_8 calls above */
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  ADD2(in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 64);
+  ST_SH(temp1, temp + 960);
+
+  SUB2(in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 576);
+  ST_SH(temp1, temp + 448);
+
+  SUB2(in9, vec2, in14, vec5, vec2, vec5);
+  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 320);
+  ST_SH(temp1, temp + 704);
+
+  ADD2(in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 192);
+  ST_SH(temp1, temp + 832);
+}
+
+static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
+  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+  /* Emits 16 vectors to temp_ptr; input[] is also reused as scratch space. */
+  in20 = LD_SH(input + 32);
+  in21 = LD_SH(input + 40);
+  in26 = LD_SH(input + 80);
+  in27 = LD_SH(input + 88);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  in18 = LD_SH(input + 16);
+  in19 = LD_SH(input + 24);
+  in28 = LD_SH(input + 96);
+  in29 = LD_SH(input + 104);
+  /* park four differences in input[] (offsets 32, 40, 80, 88) for later */
+  vec4 = in19 - in20;
+  ST_SH(vec4, input + 32);
+  vec4 = in18 - in21;
+  ST_SH(vec4, input + 40);
+  vec4 = in29 - in26;
+  ST_SH(vec4, input + 80);
+  vec4 = in28 - in27;
+  ST_SH(vec4, input + 88);
+
+  in21 = in18 + in21;
+  in20 = in19 + in20;
+  in27 = in28 + in27;
+  in26 = in29 + in26;
+
+  LD_SH4(input + 48, 8, in22, in23, in24, in25);
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+  in16 = LD_SH(input);
+  in17 = LD_SH(input + 8);
+  in30 = LD_SH(input + 112);
+  in31 = LD_SH(input + 120);
+  /* park four more differences in input[] (offsets 16, 24, 96, 104) */
+  vec4 = in17 - in22;
+  ST_SH(vec4, input + 16);
+  vec4 = in16 - in23;
+  ST_SH(vec4, input + 24);
+  vec4 = in31 - in24;
+  ST_SH(vec4, input + 96);
+  vec4 = in30 - in25;
+  ST_SH(vec4, input + 104);
+
+  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  ADD2(in27, in26, in25, in24, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr);
+  ST_SH(vec4, temp_ptr + 960);
+
+  SUB2(in27, in26, in25, in24, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 448);
+  ST_SH(vec4, temp_ptr + 512);
+
+  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+  SUB2(in26, in27, in24, in25, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec4, temp_ptr + 704);
+  ST_SH(vec5, temp_ptr + 256);
+
+  ADD2(in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec4, temp_ptr + 192);
+  ST_SH(vec5, temp_ptr + 768);
+  /* reload the differences parked in input[16..47] and input[80..111] */
+  LD_SH4(input + 16, 8, in22, in23, in20, in21);
+  LD_SH4(input + 80, 8, in26, in27, in24, in25);
+  in16 = in20;
+  in17 = in21;
+  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  ADD2(in28, in29, in31, in30, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 832);
+  ST_SH(vec4, temp_ptr + 128);
+
+  SUB2(in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 320);
+  ST_SH(vec4, temp_ptr + 640);
+  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+  SUB2(in29, in28, in30, in31, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 576);
+  ST_SH(vec4, temp_ptr + 384);
+
+  ADD2(in29, in28, in30, in31, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 64);
+  ST_SH(vec4, temp_ptr + 896);
+}
+
+static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
+                               int16_t *tmp_buf, int16_t *tmp_buf_big) {
+  fdct8x32_1d_column_load_butterfly(input, src_stride, &tmp_buf[0]);  /* step 1 */
+  fdct8x32_1d_column_even_store(&tmp_buf[0], &tmp_buf_big[0]);  /* even part */
+  fdct8x32_1d_column_odd_store(&tmp_buf[128], &tmp_buf_big[32]);  /* odd part */
+}
+
+static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
+                                           int16_t *output) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 step0, step1, step2, step3, step4, step5, step6, step7;
+  /* Transpose the 8x8 tiles of temp_buff (pitch 32) and butterfly them pairwise. */
+  LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);  /* tile at + 0 */
+  LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                     in8, in9, in10, in11, in12, in13, in14, in15);
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               step0, step1, step2, step3, step4, step5, step6, step7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);
+
+  /* 2nd set: tiles at + 8 and + 16; results go to output + 64 and output + 128 */
+  LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                     in8, in9, in10, in11, in12, in13, in14, in15);
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               step0, step1, step2, step3, step4, step5, step6, step7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
+         (output + 8 * 8), 8);
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
+}
+
+static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
+                                    int16_t *out) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
+  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
+  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;
+  /* Even half for fdct32x8_1d_row_4x; interm_ptr[0..127] is used as scratch. */
+  /* fdct32 even */
+  /* stage 2: all of input[0..127] is loaded here, before anything is stored */
+  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);
+
+  /* Stage 3: vec0..vec7 are unpacked into v4i32 _l/_r halves and combined there */
+  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
+  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
+  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
+  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
+  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
+  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
+  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
+  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
+  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r,
+       tmp0_w, tmp1_w, tmp2_w, tmp3_w);
+  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
+  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l,
+       vec0_r, vec1_r, vec2_r, vec3_r);
+  /* vec0_r..vec3_r are reused: from here on they hold the sums of the _l halves. */
+  tmp3_w = vec0_r + vec3_r;
+  vec0_r = vec0_r - vec3_r;
+  vec3_r = vec1_r + vec2_r;
+  vec1_r = vec1_r - vec2_r;
+
+  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64,
+                    cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r);
+  FDCT32_POSTPROC_NEG_W(vec4_r);
+  FDCT32_POSTPROC_NEG_W(tmp3_w);
+  FDCT32_POSTPROC_NEG_W(vec6_r);
+  FDCT32_POSTPROC_NEG_W(vec3_r);
+  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+  ST_SH2(vec5, vec4, out, 8);
+  /* Same four 32-bit temporaries, now for the cospi_24_64 / cospi_8_64 pair. */
+  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64,
+                    cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r);
+  FDCT32_POSTPROC_NEG_W(vec4_r);
+  FDCT32_POSTPROC_NEG_W(tmp3_w);
+  FDCT32_POSTPROC_NEG_W(vec6_r);
+  FDCT32_POSTPROC_NEG_W(vec3_r);
+  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+  ST_SH2(vec5, vec4, out + 16, 8);
+  /* vec4/vec5 were overwritten by the packs above: reload the stage-2 sums. */
+  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 32);
+  ST_SH(in5, out + 56);
+
+  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 40);
+  ST_SH(in5, out + 48);
+  /* Second half of the outputs: the stage-2 values saved at interm_ptr + 64. */
+  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  ADD2(in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 64);
+  ST_SH(in5, out + 120);
+
+  SUB2(in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 72);
+  ST_SH(in5, out + 112);
+
+  SUB2(in9, vec2, in14, vec5, vec2, vec5);
+  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 80);
+  ST_SH(in5, out + 104);
+
+  ADD2(in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 96);
+  ST_SH(in5, out + 88);
+}
+
+static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+  /* Even half of the 16-bit row pass: reads temp[0..127], writes out[0..127]. */
+  /* fdct32 even */
+  /* stage 2: every input is loaded here, so callers may pass out == temp */
+  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+
+  /* Stage 3: built from the vec0..vec7 outputs of the butterfly */
+  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
+  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out);
+  ST_SH(temp1, out + 8);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 16);
+  ST_SH(temp1, out + 24);
+  /* vec4..vec7 become the mirrored differences of vec0..vec7. */
+  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 32);
+  ST_SH(temp1, out + 56);
+
+  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 40);
+  ST_SH(temp1, out + 48);
+  /* Second half of the outputs: in8..in15 as left by BUTTERFLY_16. */
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  ADD2(in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 64);
+  ST_SH(temp1, out + 120);
+
+  SUB2(in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 72);
+  ST_SH(temp1, out + 112);
+
+  SUB2(in9, vec2, in14, vec5, vec2, vec5);
+  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 80);
+  ST_SH(temp1, out + 104);
+
+  ADD2(in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 96);
+  ST_SH(temp1, out + 88);
+}
+
+static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
+                                int16_t *out) {
+  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+  /* Odd half of the row pass: reads temp[0..127], writes out[0..127]. */
+  in20 = LD_SH(temp + 32);
+  in21 = LD_SH(temp + 40);
+  in26 = LD_SH(temp + 80);
+  in27 = LD_SH(temp + 88);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  in18 = LD_SH(temp + 16);
+  in19 = LD_SH(temp + 24);
+  in28 = LD_SH(temp + 96);
+  in29 = LD_SH(temp + 104);
+  /* Park four differences in interm_ptr (scratch); they are reloaded further down. */
+  vec4 = in19 - in20;
+  ST_SH(vec4, interm_ptr + 32);
+  vec4 = in18 - in21;
+  ST_SH(vec4, interm_ptr + 88);
+  vec4 = in28 - in27;
+  ST_SH(vec4, interm_ptr + 56);
+  vec4 = in29 - in26;
+  ST_SH(vec4, interm_ptr + 64);
+
+  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
+
+  in22 = LD_SH(temp + 48);
+  in23 = LD_SH(temp + 56);
+  in24 = LD_SH(temp + 64);
+  in25 = LD_SH(temp + 72);
+
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+  in16 = LD_SH(temp);
+  in17 = LD_SH(temp + 8);
+  in30 = LD_SH(temp + 112);
+  in31 = LD_SH(temp + 120);
+  /* Four more parked differences; interm_ptr[32..95] is overwritten in total. */
+  vec4 = in17 - in22;
+  ST_SH(vec4, interm_ptr + 40);
+  vec4 = in30 - in25;
+  ST_SH(vec4, interm_ptr + 48);
+  vec4 = in31 - in24;
+  ST_SH(vec4, interm_ptr + 72);
+  vec4 = in16 - in23;
+  ST_SH(vec4, interm_ptr + 80);
+
+  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+
+  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  ADD2(in27, in26, in25, in24, in23, in20);
+  /* All 16 temp vectors are loaded by now, so out may alias temp (callers do so). */
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out);
+  ST_SH(vec4, out + 120);
+
+  SUB2(in27, in26, in25, in24, in22, in21);
+
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out + 112);
+  ST_SH(vec4, out + 8);
+
+  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+  SUB2(in26, in27, in24, in25, in23, in20);
+
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec4, out + 16);
+  ST_SH(vec5, out + 104);
+
+  ADD2(in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec4, out + 24);
+  ST_SH(vec5, out + 96);
+  /* Second half: pick the parked differences back up from interm_ptr. */
+  in20 = LD_SH(interm_ptr + 32);
+  in21 = LD_SH(interm_ptr + 88);
+  in27 = LD_SH(interm_ptr + 56);
+  in26 = LD_SH(interm_ptr + 64);
+
+  in16 = in20;
+  in17 = in21;
+  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+  in22 = LD_SH(interm_ptr + 40);
+  in25 = LD_SH(interm_ptr + 48);
+  in24 = LD_SH(interm_ptr + 72);
+  in23 = LD_SH(interm_ptr + 80);
+
+  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  ADD2(in28, in29, in31, in30, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out + 32);
+  ST_SH(vec4, out + 88);
+
+  SUB2(in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out + 40);
+  ST_SH(vec4, out + 80);
+
+  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+  SUB2(in29, in28, in30, in31, in16, in19);
+
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out + 72);
+  ST_SH(vec4, out + 48);
+
+  ADD2(in29, in28, in30, in31, in17, in18);
+
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec4, out + 56);
+  ST_SH(vec5, out + 64);
+}
+
+static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+  /* Reads each of the 32 vectors in temp[0..255] once; writes 8 rows of pitch 32. */
+  /* 1st set: stored at output + 0 */
+  in0 = LD_SH(temp);
+  in4 = LD_SH(temp + 32);
+  in2 = LD_SH(temp + 64);
+  in6 = LD_SH(temp + 96);
+  in1 = LD_SH(temp + 128);
+  in7 = LD_SH(temp + 152);
+  in3 = LD_SH(temp + 192);
+  in5 = LD_SH(temp + 216);
+
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+
+  /* 2nd set: loaded into the _1 registers before the 1st set is stored */
+  in0_1 = LD_SH(temp + 16);
+  in1_1 = LD_SH(temp + 232);
+  in2_1 = LD_SH(temp + 80);
+  in3_1 = LD_SH(temp + 168);
+  in4_1 = LD_SH(temp + 48);
+  in5_1 = LD_SH(temp + 176);
+  in6_1 = LD_SH(temp + 112);
+  in7_1 = LD_SH(temp + 240);
+
+  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
+  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+  /* 3rd set: reuses in0..in7; stored at output + 16 */
+  in0 = LD_SH(temp + 8);
+  in1 = LD_SH(temp + 136);
+  in2 = LD_SH(temp + 72);
+  in3 = LD_SH(temp + 200);
+  in4 = LD_SH(temp + 40);
+  in5 = LD_SH(temp + 208);
+  in6 = LD_SH(temp + 104);
+  in7 = LD_SH(temp + 144);
+  /* The 2nd set is written out here, at output + 8. */
+  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+         output + 8, 32);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);
+
+  /* 4th set: reuses the _1 registers; stored at output + 24 */
+  in0_1 = LD_SH(temp + 24);
+  in1_1 = LD_SH(temp + 224);
+  in2_1 = LD_SH(temp + 88);
+  in3_1 = LD_SH(temp + 160);
+  in4_1 = LD_SH(temp + 56);
+  in5_1 = LD_SH(temp + 184);
+  in6_1 = LD_SH(temp + 120);
+  in7_1 = LD_SH(temp + 248);
+
+  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+         output + 24, 32);
+}
+
+static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf,
+                            int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(&temp[0], &temp_buf[0]);
+  fdct8x32_1d_row_even(&temp_buf[0], &temp_buf[0]);  /* in place */
+  fdct8x32_1d_row_odd(&temp_buf[128], &temp[0], &temp_buf[128]);  /* temp: scratch */
+  fdct8x32_1d_row_transpose_store(&temp_buf[0], &output[0]);
+}
+
+static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
+                               int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(&tmp_buf_big[0], &tmp_buf[0]);
+  fdct8x32_1d_row_even_4x(&tmp_buf[0], &tmp_buf_big[0], &tmp_buf[0]);  /* in place */
+  fdct8x32_1d_row_odd(&tmp_buf[128], &tmp_buf_big[0], &tmp_buf[128]);  /* in place */
+  fdct8x32_1d_row_transpose_store(&tmp_buf[0], &output[0]);
+}
+
+void vpx_fdct32x32_msa(const int16_t *input, int16_t *output,
+                       int32_t src_stride) {
+  int32_t strip;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+  /* 32x32 forward transform: a column pass into tmp_buf_big, then a row pass. */
+  /* Column pass: four strips, each 8 columns wide. */
+  for (strip = 0; strip < 4; ++strip) {
+    const int32_t col = strip * 8;
+    fdct8x32_1d_column(input + col, src_stride, tmp_buf, tmp_buf_big + col);
+  }
+
+  /* Row pass, first 256 elements: handled by the dedicated _4x routine. */
+  fdct32x8_1d_row_4x(&tmp_buf_big[0], tmp_buf, &output[0]);
+
+  /* Row pass, remaining three 256-element strips. */
+  for (strip = 1; strip < 4; ++strip) {
+    fdct32x8_1d_row(&tmp_buf_big[strip * 256], tmp_buf, &output[strip * 256]);
+  }
+}
+
+static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+  /* Even half, "_rd" variant: reads temp[0..127], writes out[0..127]. */
+  /* fdct32 even */
+  /* stage 2: every input is loaded here, so callers may pass out == temp */
+  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);  /* post-processing happens here only; */
+  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);  /* the stored outputs below get none  */
+  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
+  FDCT_POSTPROC_2V_NEG_H(in8, in9);
+  FDCT_POSTPROC_2V_NEG_H(in10, in11);
+  FDCT_POSTPROC_2V_NEG_H(in12, in13);
+  FDCT_POSTPROC_2V_NEG_H(in14, in15);
+
+  /* Stage 3 */
+  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+  /* 4-point butterfly written out by hand: sums in temp0/in3, differences in in0/in1 */
+  temp0 = in0 + in3;
+  in0 = in0 - in3;
+  in3 = in1 + in2;
+  in1 = in1 - in2;
+
+  DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
+  ST_SH(temp0, out);
+  ST_SH(temp1, out + 8);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  ST_SH(temp0, out + 16);
+  ST_SH(temp1, out + 24);
+  /* vec4..vec7 become the mirrored differences of vec0..vec7. */
+  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  ST_SH(temp0, out + 32);
+  ST_SH(temp1, out + 56);
+
+  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  ST_SH(temp0, out + 40);
+  ST_SH(temp1, out + 48);
+  /* Second half of the outputs: in8..in15 as post-processed above. */
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  ADD2(in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  ST_SH(temp0, out + 64);
+  ST_SH(temp1, out + 120);
+
+  SUB2(in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  ST_SH(temp0, out + 72);
+  ST_SH(temp1, out + 112);
+
+  SUB2(in9, vec2, in14, vec5, vec2, vec5);
+  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  ST_SH(temp0, out + 80);
+  ST_SH(temp1, out + 104);
+
+  ADD2(in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  ST_SH(temp0, out + 96);
+  ST_SH(temp1, out + 88);
+}
+
+static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
+                                   int16_t *out) {
+  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+  v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
+  v8i16 vec4, vec5;
+  /* Odd half, "_rd" variant: reads temp[0..127], writes out[0..127]. */
+  in20 = LD_SH(temp + 32);
+  in21 = LD_SH(temp + 40);
+  in26 = LD_SH(temp + 80);
+  in27 = LD_SH(temp + 88);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+  /* Post-processing is applied to the inputs here; the stored outputs get none. */
+  FDCT_POSTPROC_2V_NEG_H(in20, in21);
+  FDCT_POSTPROC_2V_NEG_H(in26, in27);
+
+  in18 = LD_SH(temp + 16);
+  in19 = LD_SH(temp + 24);
+  in28 = LD_SH(temp + 96);
+  in29 = LD_SH(temp + 104);
+
+  FDCT_POSTPROC_2V_NEG_H(in18, in19);
+  FDCT_POSTPROC_2V_NEG_H(in28, in29);
+  /* Park four differences in interm_ptr (scratch); they are reloaded further down. */
+  vec4 = in19 - in20;
+  ST_SH(vec4, interm_ptr + 32);
+  vec4 = in18 - in21;
+  ST_SH(vec4, interm_ptr + 88);
+  vec4 = in29 - in26;
+  ST_SH(vec4, interm_ptr + 64);
+  vec4 = in28 - in27;
+  ST_SH(vec4, interm_ptr + 56);
+
+  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
+
+  in22 = LD_SH(temp + 48);
+  in23 = LD_SH(temp + 56);
+  in24 = LD_SH(temp + 64);
+  in25 = LD_SH(temp + 72);
+
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+  FDCT_POSTPROC_2V_NEG_H(in22, in23);
+  FDCT_POSTPROC_2V_NEG_H(in24, in25);
+
+  in16 = LD_SH(temp);
+  in17 = LD_SH(temp + 8);
+  in30 = LD_SH(temp + 112);
+  in31 = LD_SH(temp + 120);
+
+  FDCT_POSTPROC_2V_NEG_H(in16, in17);
+  FDCT_POSTPROC_2V_NEG_H(in30, in31);
+  /* Four more parked differences; interm_ptr[32..95] is overwritten in total. */
+  vec4 = in17 - in22;
+  ST_SH(vec4, interm_ptr + 40);
+  vec4 = in30 - in25;
+  ST_SH(vec4, interm_ptr + 48);
+  vec4 = in31 - in24;
+  ST_SH(vec4, interm_ptr + 72);
+  vec4 = in16 - in23;
+  ST_SH(vec4, interm_ptr + 80);
+  /* All 16 temp vectors are loaded by now, so out may alias temp (the caller does so). */
+  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  ADD2(in27, in26, in25, in24, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  ST_SH(vec5, out);
+  ST_SH(vec4, out + 120);
+
+  SUB2(in27, in26, in25, in24, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  ST_SH(vec5, out + 112);
+  ST_SH(vec4, out + 8);
+
+  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+  SUB2(in26, in27, in24, in25, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  ST_SH(vec4, out + 16);
+  ST_SH(vec5, out + 104);
+
+  ADD2(in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  ST_SH(vec4, out + 24);
+  ST_SH(vec5, out + 96);
+  /* Second half: pick the parked differences back up from interm_ptr. */
+  in20 = LD_SH(interm_ptr + 32);
+  in21 = LD_SH(interm_ptr + 88);
+  in27 = LD_SH(interm_ptr + 56);
+  in26 = LD_SH(interm_ptr + 64);
+
+  in16 = in20;
+  in17 = in21;
+  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+  in22 = LD_SH(interm_ptr + 40);
+  in25 = LD_SH(interm_ptr + 48);
+  in24 = LD_SH(interm_ptr + 72);
+  in23 = LD_SH(interm_ptr + 80);
+
+  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  in16 = in28 + in29;
+  in19 = in31 + in30;
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  ST_SH(vec5, out + 32);
+  ST_SH(vec4, out + 88);
+
+  SUB2(in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  ST_SH(vec5, out + 40);
+  ST_SH(vec4, out + 80);
+
+  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+  SUB2(in29, in28, in30, in31, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  ST_SH(vec5, out + 72);
+  ST_SH(vec4, out + 48);
+
+  ADD2(in29, in28, in30, in31, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  ST_SH(vec4, out + 56);
+  ST_SH(vec5, out + 64);
+}
+
+static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
+                               int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(&tmp_buf_big[0], &tmp_buf[0]);
+  fdct8x32_1d_row_even_rd(&tmp_buf[0], &tmp_buf[0]);  /* in place */
+  fdct8x32_1d_row_odd_rd(&tmp_buf[128], &tmp_buf_big[0], &tmp_buf[128]);
+  fdct8x32_1d_row_transpose_store(&tmp_buf[0], &output[0]);
+}
+
+void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
+                          int32_t src_stride) {
+  int32_t strip;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+  /* "rd" 32x32 forward transform: column pass, then the _rd row pass. */
+  /* Columns: four 8-wide strips, results collected in tmp_buf_big. */
+  for (strip = 0; strip < 4; ++strip) {
+    const int32_t col = 8 * strip;
+    fdct8x32_1d_column(input + col, src_stride, tmp_buf, tmp_buf_big + col);
+  }
+
+  /* Rows: four strips of 8 * 32 = 256 elements each, written to out. */
+  for (strip = 0; strip < 4; ++strip) {
+    const int32_t ofs = 8 * strip * 32;
+    fdct32x8_1d_row_rd(tmp_buf_big + ofs, tmp_buf, out + ofs);
+  }
+}
+
+void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+  /* DC-only 32x32 forward transform: out[0] = (sum of the 32x32 block) >> 3,
+   * out[1] = 0; no other element of out is written. */
+  int32_t sum = 0;
+  int32_t row, col;
+
+  /* One LD_HADD per 8-column group of each 8-row band, in the same order as
+   * the unrolled original.  Bands advance by 8 * stride; the original used a
+   * fixed 32 * 8, which only matches LD_HADD's row pitch when stride == 32. */
+  for (row = 0; row < 32; row += 8) {
+    for (col = 0; col < 32; col += 8) {
+      sum += LD_HADD(input + row * stride + col, stride);
+    }
+  }
+
+  /* Sum in 32 bits: adding into int16_t out[0] truncated every partial sum.
+   * NOTE(review): assumes LD_HADD(p, stride) totals an 8x8 tile - confirm. */
+  out[0] = (int16_t)(sum >> 3);
+  out[1] = 0;
+}
diff --git a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.c b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.c
new file mode 100644
index 0000000000..f66dd5fce2
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.c
@@ -0,0 +1,247 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+                        int32_t src_stride) {
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
+  v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
+  v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,
+                 -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
+  v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64,
+                   cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 };
+  v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64,
+                   0, 0, 0, 0 };
+  /* Column pass for one 8-column strip: reads 16 rows, writes 16 rows. */
+  LD_SH16(input, src_stride,
+          in0, in1, in2, in3, in4, in5, in6, in7,
+          in8, in9, in10, in11, in12, in13, in14, in15);
+  SLLI_4V(in0, in1, in2, in3, 2);  /* presumably: scale the input by 4 */
+  SLLI_4V(in4, in5, in6, in7, 2);
+  SLLI_4V(in8, in9, in10, in11, 2);
+  SLLI_4V(in12, in13, in14, in15, 2);
+  ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
+  ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
+  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
+  SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
+  SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);
+  /* Even results went to every other 16-wide row; in8..in15 feed the odds. */
+  tmp_ptr += 16;  /* odd results start one 16-element row further on */
+
+  /* stp 1 */
+  ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
+  ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);
+
+  cnst4 = __msa_splati_h(coeff, 0);
+  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);
+
+  cnst5 = __msa_splati_h(coeff, 1);
+  cnst5 = __msa_ilvev_h(cnst5, cnst4);
+  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
+  stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
+  stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);
+
+  /* stp2 */
+  BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
+  BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
+  ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
+  ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
+  SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+  stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);
+
+  cnst0 = __msa_splati_h(coeff, 4);
+  cnst1 = __msa_ilvev_h(cnst1, cnst0);
+  stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);
+
+  BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
+  ILVRL_H2_SH(in15, in8, vec1, vec0);
+  SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr);  /* output row 1 (16-element rows) */
+
+  cnst0 = __msa_splati_h(coeff2, 0);
+  cnst0 = __msa_ilvev_h(cnst1, cnst0);
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 224);  /* output row 15 */
+
+  ILVRL_H2_SH(in14, in9, vec1, vec0);
+  SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
+  cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
+  ST_SH(in8, tmp_ptr + 128);  /* output row 9 */
+
+  cnst1 = __msa_splati_h(coeff2, 2);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 96);  /* output row 7 */
+
+  SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
+  cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
+
+  cnst1 = __msa_splati_h(coeff, 3);
+  cnst1 = __msa_ilvev_h(cnst0, cnst1);
+  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
+
+  /* stp4 */
+  ADD2(stp34, stp25, stp33, stp22, in13, in10);
+
+  ILVRL_H2_SH(in13, in10, vec1, vec0);
+  SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 64);  /* output row 5 */
+
+  cnst0 = __msa_splati_h(coeff2, 1);
+  cnst0 = __msa_ilvev_h(cnst1, cnst0);
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 160);  /* output row 11 */
+
+  SUB2(stp34, stp25, stp33, stp22, in12, in11);
+  ILVRL_H2_SH(in12, in11, vec1, vec0);
+  SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
+  cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
+  ST_SH(in8, tmp_ptr + 192);  /* output row 13 */
+
+  cnst1 = __msa_splati_h(coeff2, 3);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 32);  /* output row 3 */
+}
+
+void fdct16x8_1d_row(int16_t *input, int16_t *output) {
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  /* Row pass over 8 rows of 16 values; NOTE: input is reused as scratch. */
+  LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                     in8, in9, in10, in11, in12, in13, in14, in15);
+  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
+  ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
+  ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15);
+  SRA_4V(in0, in1, in2, in3, 2);  /* with the +1 above: (x + 1) >> 2 */
+  SRA_4V(in4, in5, in6, in7, 2);
+  SRA_4V(in8, in9, in10, in11, 2);
+  SRA_4V(in12, in13, in14, in15, 2);
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5,
+               tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); /* park */
+  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); /* back */
+  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3,
+                     tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3);
+  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
+  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7,
+                     tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7);
+  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
+}
+
+void vpx_fdct4x4_msa(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  v8i16 in0, in1, in2, in3;
+  /* 2-D 4x4 forward DCT: two VP9_FDCT4 passes with a transpose after each. */
+  LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+  /* fdct4 pre-process */
+  {
+    v8i16 vec, mask;
+    v16i8 zero = { 0 };
+    v16i8 one = __msa_ldi_b(1);
+    /* Net effect: after the shift, lane 0 of in0 gets +1 if it is non-zero. */
+    mask = (v8i16)__msa_sldi_b(zero, one, 15);  /* presumably 1 in lane 0 */
+    SLLI_4V(in0, in1, in2, in3, 4);  /* presumably: scale the input by 16 */
+    vec = __msa_ceqi_h(in0, 0);  /* all-ones where the lane equals zero */
+    vec = vec ^ 255;  /* bit 0 is now set only where the lane was non-zero */
+    vec = mask & vec;  /* keep that bit for the lanes selected by mask */
+    in0 += vec;
+  }
+
+  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  SRA_4V(in0, in1, in2, in3, 2);  /* with the +1 above: (x + 1) >> 2 */
+  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
+  ST_SH2(in0, in2, output, 8);  /* two 8-element stores, 8 elements apart */
+}
+
+void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  /* 2-D 8x8 forward DCT: two VP9_FDCT8 passes with a transpose after each. */
+  LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
+  SLLI_4V(in0, in1, in2, in3, 2);  /* presumably: scale the input by 4 */
+  SLLI_4V(in4, in5, in6, in7, 2);
+  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+            in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+            in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); /* final scaling */
+  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
+}
+
+void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+  out[0] = LD_HADD(input, stride);  /* LD_HADD total of the 8x8 block */
+  out[1] = 0;  /* only out[0] and out[1] are written */
+}
+
+void vpx_fdct16x16_msa(const int16_t *input, int16_t *output,
+                       int32_t src_stride) {
+  /* 16x16 intermediate: written by the column pass, then handed to the row
+   * pass (which also uses it as scratch). */
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
+
+  /* Column pass: the left and the right 8-column half of the input. */
+  fdct8x16_1d_column(input, tmp_buf, src_stride);
+  fdct8x16_1d_column(input + 8, tmp_buf + 8, src_stride);
+
+  /* Row pass: the top and the bottom 8-row half, i.e. 128 coefficients
+   * apart in both the intermediate and the output buffer. */
+  fdct16x8_1d_row(tmp_buf, output);
+  fdct16x8_1d_row(tmp_buf + 128, output + 128);
+}
+
+void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+  int32_t sum; /* 32-bit: a 256-sample total can exceed the int16_t range */
+  out[1] = 0;
+  sum = LD_HADD(input, stride);                   /* rows 0..7,  cols 0..7  */
+  sum += LD_HADD(input + 8, stride);              /* rows 0..7,  cols 8..15 */
+  sum += LD_HADD(input + 8 * stride, stride);     /* rows 8..15, cols 0..7  */
+  sum += LD_HADD(input + 8 * stride + 8, stride); /* rows 8..15, cols 8..15 */
+  out[0] = (int16_t)(sum >> 1); /* DC term: narrow only after the shift */
+}
diff --git a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h
new file mode 100644
index 0000000000..d1e160eed5
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h
@@ -0,0 +1,373 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_FWD_TXFM_MSA_H_
+#define VPX_DSP_MIPS_FWD_TXFM_MSA_H_
+
+#include "vpx_dsp/mips/txfm_macros_msa.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define LD_HADD(psrc, stride) ({                                      \
+  v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m;       \
+  v4i32 vec_w_m;                                                      \
+  /* Sum of the 8x8 int16 block at psrc (row pitch: stride). */       \
+  LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m);                 \
+  ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m);                     \
+  LD_SH4(((psrc) + 4 * (stride)), stride, in4_m, in5_m, in6_m, in7_m); \
+  ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m,        \
+       in4_m, in6_m, in0_m, in4_m);                                   \
+  in0_m += in4_m;                                                     \
+  /* Pairwise add into 32-bit lanes; HADD_SW_S32 yields the value. */ \
+  vec_w_m = __msa_hadd_s_w(in0_m, in0_m);                             \
+  HADD_SW_S32(vec_w_m);                                               \
+})
+
+#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) {     \
+  v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m;                         \
+  v8i16 vec0_m, vec1_m, vec2_m, vec3_m;                             \
+  v4i32 vec4_m, vec5_m, vec6_m, vec7_m;                             \
+  v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,          \
+                    cospi_24_64, -cospi_8_64, 0, 0, 0 };            \
+  /* 4-point forward DCT; outputs may alias the inputs. */          \
+  BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m);  \
+  ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m);       \
+  SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m);                    \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                        \
+  vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m);                         \
+                                                                    \
+  SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m);                    \
+  cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m);                        \
+  vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m);                         \
+                                                                    \
+  vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m);                         \
+  cnst2_m = __msa_splati_h(coeff_m, 2);                             \
+  cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m);                        \
+  vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m);                         \
+  /* Round by DCT_CONST_BITS; note the out0, out2, out1, out3 order. */ \
+  SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS);      \
+  PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m,       \
+              vec7_m, vec7_m, out0, out2, out1, out3);              \
+}
+
+#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) {        \
+  v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
+  /* In place; presumably in = (in + (in < 0)) >> 1 - TODO confirm. */   \
+  SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15);    \
+  SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15);    \
+  AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3,         \
+             in0, in1, in2, in3);                                        \
+  AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7,         \
+             in4, in5, in6, in7);                                        \
+}
+
+#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                  out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m;                    \
+  v8i16 s7_m, x0_m, x1_m, x2_m, x3_m;                                \
+  v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,           \
+                    cospi_24_64, cospi_4_64, cospi_28_64,            \
+                    cospi_12_64, cospi_20_64 };                      \
+  /* 8-point forward DCT; outputs may alias the inputs. */           \
+  /* FDCT stage1 */                                                  \
+  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,                \
+              s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);       \
+  BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);       \
+  ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);                    \
+  ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);                    \
+  SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m);                           \
+  x1_m = __msa_ilvev_h(x1_m, x0_m);                                  \
+  out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                    \
+                                                                     \
+  SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m);                           \
+  x2_m = -x2_m;                                                      \
+  x2_m = __msa_ilvev_h(x3_m, x2_m);                                  \
+  out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                    \
+                                                                     \
+  out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                    \
+  x2_m = __msa_splati_h(coeff_m, 2);                                 \
+  x2_m = __msa_ilvev_h(x2_m, x3_m);                                  \
+  out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                    \
+                                                                     \
+  /* stage2 */                                                       \
+  ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m);                               \
+                                                                     \
+  s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                    \
+  s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                    \
+                                                                     \
+  /* stage3 */                                                       \
+  BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);       \
+                                                                     \
+  /* stage4 */                                                       \
+  ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);                    \
+  ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);                    \
+                                                                     \
+  SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m);                           \
+  x1_m = __msa_ilvev_h(x0_m, x1_m);                                  \
+  out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m);                    \
+                                                                     \
+  SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m);                           \
+  x2_m = __msa_ilvev_h(x3_m, x2_m);                                  \
+  out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                    \
+                                                                     \
+  x1_m = __msa_splati_h(coeff_m, 5);                                 \
+  x0_m = -x0_m;                                                      \
+  x0_m = __msa_ilvev_h(x1_m, x0_m);                                  \
+  out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m);                    \
+                                                                     \
+  x2_m = __msa_splati_h(coeff_m, 6);                                 \
+  x3_m = -x3_m;                                                      \
+  x2_m = __msa_ilvev_h(x2_m, x3_m);                                  \
+  out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                    \
+}
+
+#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7,                \
+                      out0, out1, out2, out3, out4, out5, out6, out7) {      \
+  v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;                      \
+  v8i16 x0_m, x1_m, x2_m, x3_m;                                              \
+  v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,      \
+                    cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 };     \
+  /* Same arithmetic as VP9_FDCT8; outputs may alias the inputs. */          \
+  /* FDCT stage1 */                                                          \
+  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,                        \
+              s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);               \
+  BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);               \
+  ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);                            \
+  ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);                            \
+  SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m);                                   \
+  x1_m = __msa_ilvev_h(x1_m, x0_m);                                          \
+  out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                            \
+                                                                             \
+  SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m);                                   \
+  x2_m = -x2_m;                                                              \
+  x2_m = __msa_ilvev_h(x3_m, x2_m);                                          \
+  out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                            \
+                                                                             \
+  out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                            \
+  x2_m = __msa_splati_h(coeff_m, 2);                                         \
+  x2_m = __msa_ilvev_h(x2_m, x3_m);                                          \
+  out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                            \
+                                                                             \
+  /* stage2 */                                                               \
+  ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m);                                       \
+                                                                             \
+  s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                            \
+  s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                            \
+                                                                             \
+  /* stage3 */                                                               \
+  BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);               \
+                                                                             \
+  /* stage4 */                                                               \
+  ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);                            \
+  ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);                            \
+                                                                             \
+  SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m);                                   \
+  x1_m = __msa_ilvev_h(x0_m, x1_m);                                          \
+  out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m);                            \
+                                                                             \
+  SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m);                                   \
+  x2_m = __msa_ilvev_h(x3_m, x2_m);                                          \
+  out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                            \
+                                                                             \
+  x1_m = __msa_splati_h(coeff_m, 5);                                         \
+  x0_m = -x0_m;                                                              \
+  x0_m = __msa_ilvev_h(x1_m, x0_m);                                          \
+  out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m);                            \
+                                                                             \
+  x2_m = __msa_splati_h(coeff_m, 6);                                         \
+  x3_m = -x3_m;                                                              \
+  x2_m = __msa_ilvev_h(x2_m, x3_m);                                          \
+  out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                            \
+}
+
+#define FDCT8x16_ODD(input0, input1, input2, input3,               \
+                     input4, input5, input6, input7,               \
+                     out1, out3, out5, out7,                       \
+                     out9, out11, out13, out15) {                  \
+  v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m;      \
+  v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m;      \
+  v8i16 stp36_m, stp37_m, vec0_m, vec1_m;                          \
+  v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m;                    \
+  v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m;                        \
+  v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,         \
+                    cospi_24_64, -cospi_8_64, -cospi_24_64,        \
+                    cospi_12_64, cospi_20_64 };                    \
+  v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64,         \
+                     cospi_18_64, cospi_10_64, cospi_22_64,        \
+                     cospi_6_64, cospi_26_64 };                    \
+  v8i16 coeff2_m = { -cospi_2_64, -cospi_10_64, -cospi_18_64,      \
+                     -cospi_26_64, 0, 0, 0, 0 };                   \
+  /* Computes out1, out3, ..., out15 from input0 .. input7. */     \
+  /* stp 1 */                                                      \
+  ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m);      \
+  ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m);      \
+                                                                   \
+  cnst4_m = __msa_splati_h(coeff_m, 0);                            \
+  stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m);        \
+                                                                   \
+  cnst5_m = __msa_splati_h(coeff_m, 1);                            \
+  cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m);                       \
+  stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m);        \
+  stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m);        \
+  stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m);        \
+                                                                   \
+  /* stp2 */                                                       \
+  BUTTERFLY_4(input0, input1, stp22_m, stp23_m,                    \
+              stp30_m, stp31_m, stp32_m, stp33_m);                 \
+  BUTTERFLY_4(input7, input6, stp25_m, stp24_m,                    \
+              stp37_m, stp36_m, stp35_m, stp34_m);                 \
+                                                                   \
+  ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m);  \
+  ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m);  \
+                                                                   \
+  SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m);                   \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);        \
+                                                                   \
+  cnst0_m = __msa_splati_h(coeff_m, 4);                            \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+  stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);        \
+                                                                   \
+  SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m);                   \
+  cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m);        \
+                                                                   \
+  cnst0_m = __msa_splati_h(coeff_m, 3);                            \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+  stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m);        \
+                                                                   \
+  /* stp4 */                                                       \
+  BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m,                  \
+              vec6_m, vec2_m, vec4_m, vec5_m);                     \
+  BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m,                  \
+              stp21_m, stp23_m, stp24_m, stp31_m);                 \
+                                                                   \
+  ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m);                     \
+  SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m);                  \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+                                                                   \
+  out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);           \
+                                                                   \
+  cnst0_m = __msa_splati_h(coeff2_m, 0);                           \
+  cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+  out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);          \
+                                                                   \
+  ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m);                     \
+  SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                  \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+                                                                   \
+  out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);           \
+                                                                   \
+  cnst1_m = __msa_splati_h(coeff2_m, 2);                           \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);           \
+                                                                   \
+  ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m);                   \
+  SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m);                  \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);           \
+                                                                   \
+  cnst0_m = __msa_splati_h(coeff2_m, 1);                           \
+  cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+  out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);          \
+                                                                   \
+  ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m);                   \
+  SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m);                  \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+                                                                   \
+  out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);          \
+                                                                   \
+  cnst1_m = __msa_splati_h(coeff2_m, 3);                           \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);           \
+}
+
+#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) {      \
+  v8i16 tp0_m, tp1_m;                             \
+  v8i16 one_m = __msa_ldi_h(1);                   \
+  /* Per lane: v = (v + 1 + (v < 0)) >> 2 */      \
+  tp0_m = __msa_clti_s_h(vec0, 0);                \
+  tp1_m = __msa_clti_s_h(vec1, 0);                \
+  vec0 += 1;                                      \
+  vec1 += 1;                                      \
+  tp0_m = one_m & tp0_m;                          \
+  tp1_m = one_m & tp1_m;                          \
+  vec0 += tp0_m;                                  \
+  vec1 += tp1_m;                                  \
+  vec0 >>= 2;                                     \
+  vec1 >>= 2;                                     \
+}
+
+#define FDCT32_POSTPROC_NEG_W(vec) {      \
+  v4i32 temp_m;                           \
+  v4i32 one_m = __msa_ldi_w(1);           \
+  /* Per lane: v = (v + 1 + (v < 0)) >> 2 */ \
+  temp_m = __msa_clti_s_w(vec, 0);        \
+  vec += 1;                               \
+  temp_m = one_m & temp_m;                \
+  vec += temp_m;                          \
+  vec >>= 2;                              \
+}
+
+#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) {      \
+  v8i16 tp0_m, tp1_m;                               \
+  v8i16 one_m = __msa_ldi_h(1);                     \
+  /* Per lane: v = (v + 1 + (v > 0)) >> 2 */        \
+  tp0_m = __msa_clei_s_h(vec0, 0);                  \
+  tp1_m = __msa_clei_s_h(vec1, 0);                  \
+  tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255);   \
+  tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255);   \
+  vec0 += 1;                                        \
+  vec1 += 1;                                        \
+  tp0_m = one_m & tp0_m;                            \
+  tp1_m = one_m & tp1_m;                            \
+  vec0 += tp0_m;                                    \
+  vec1 += tp1_m;                                    \
+  vec0 >>= 2;                                       \
+  vec1 >>= 2;                                       \
+}
+
+#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right,      \
+                          reg1_right, const0, const1,            \
+                          out0, out1, out2, out3) {              \
+  v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;          \
+  v2i64 tp0_m, tp1_m, tp2_m, tp3_m;                              \
+  v4i32 k0_m = __msa_fill_w((int32_t) const0);                   \
+  /* Dot products of (reg0, reg1) pairs with (const0, const1). */ \
+  s0_m = __msa_fill_w((int32_t) const1);                         \
+  k0_m = __msa_ilvev_w(s0_m, k0_m);                              \
+                                                                 \
+  ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m);                \
+  ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m);                 \
+  ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m);              \
+  ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m);               \
+  /* 64-bit products, rounded by DCT_CONST_BITS, repacked. */    \
+  DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m);             \
+  DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m);             \
+  tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS);                  \
+  tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS);                  \
+  tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS);                  \
+  tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS);                  \
+  out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m);              \
+  out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m);              \
+  /* Second set: same constants with the un-negated pairing. */  \
+  DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m);             \
+  DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m);             \
+  tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS);                  \
+  tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS);                  \
+  tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS);                  \
+  tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS);                  \
+  out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m);              \
+  out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m);              \
+}
+
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+                        int32_t src_stride);  /* defined in fwd_txfm_msa.c */
+void fdct16x8_1d_row(int16_t *input, int16_t *output);  /* clobbers input */
+#endif  // VPX_DSP_MIPS_FWD_TXFM_MSA_H_
diff --git a/libs/libvpx/vpx_dsp/mips/idct16x16_msa.c b/libs/libvpx/vpx_dsp/mips/idct16x16_msa.c
new file mode 100644
index 0000000000..5faac715e8
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/idct16x16_msa.c
@@ -0,0 +1,487 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+/* 16-point row IDCT over one band of 8 rows; input/output row stride is 16. */
+void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
+  v8i16 loc0, loc1, loc2, loc3;
+  v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+  v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+  v8i16 tmp5, tmp6, tmp7;
+  /* load columns 0..7 of the 8 rows, then columns 8..15 */
+  LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+  input += 8;
+  LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+  /* transpose both 8x8 halves; the even-numbered regs are processed first */
+  TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
+                     reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+  TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15,
+                     reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+  DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+  DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+  BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+  DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+  DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+  DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+  BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+  SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4,
+       reg8);
+  ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6,
+       reg10);
+
+  /* stage 2: odd-numbered inputs reg1, reg3, ..., reg15 */
+  DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+  DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+  reg9 = reg1 - loc2;
+  reg1 = reg1 + loc2;
+  reg7 = reg15 - loc3;
+  reg15 = reg15 + loc3;
+
+  DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+  DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+  BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+  loc1 = reg15 + reg3;
+  reg3 = reg15 - reg3;
+  loc2 = reg2 + loc1;
+  reg15 = reg2 - loc1;
+
+  loc1 = reg1 + reg13;
+  reg13 = reg1 - reg13;
+  loc0 = reg0 + loc1;
+  loc1 = reg0 - loc1;
+  tmp6 = loc0;
+  tmp7 = loc1;
+  reg0 = loc2;
+
+  DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+  DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+
+  loc0 = reg9 + reg5;
+  reg5 = reg9 - reg5;
+  reg2 = reg6 + loc0;
+  reg1 = reg6 - loc0;
+
+  loc0 = reg7 + reg11;
+  reg11 = reg7 - reg11;
+  loc1 = reg4 + loc0;
+  loc2 = reg4 - loc0;
+  tmp5 = loc1;
+
+  DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+  BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+  reg10 = loc0;
+  reg11 = loc1;
+
+  DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+  BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+
+  reg13 = loc2;
+  /* tmp5..tmp7 kept results whose registers were reused in between */
+  /* Transpose and store the output */
+  reg12 = tmp5;
+  reg14 = tmp6;
+  reg3 = tmp7;
+
+  /* transpose block: stored as output columns 0..7 */
+  TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
+                     reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+  ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16);
+
+  /* transpose block: stored as output columns 8..15 */
+  TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
+                     reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+  ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16);
+}
+/* 16-point column IDCT on 8 columns; rounds by 6 bits and adds into dst. */
+void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                      int32_t dst_stride) {
+  v8i16 loc0, loc1, loc2, loc3;
+  v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+  v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+  v8i16 tmp5, tmp6, tmp7;
+
+  /* load up 8x8 */
+  LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+  input += 8 * 16;
+  /* load bottom 8x8 */
+  LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+  DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+  DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+  BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+  DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+  DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+  DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+  BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+
+  reg0 = reg2 - loc1;
+  reg2 = reg2 + loc1;
+  reg12 = reg14 - loc0;
+  reg14 = reg14 + loc0;
+  reg4 = reg6 - loc3;
+  reg6 = reg6 + loc3;
+  reg8 = reg10 - loc2;
+  reg10 = reg10 + loc2;
+
+  /* stage 2: odd-numbered inputs reg1, reg3, ..., reg15 */
+  DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+  DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+  reg9 = reg1 - loc2;
+  reg1 = reg1 + loc2;
+  reg7 = reg15 - loc3;
+  reg15 = reg15 + loc3;
+
+  DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+  DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+  BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
+
+  loc1 = reg15 + reg3;
+  reg3 = reg15 - reg3;
+  loc2 = reg2 + loc1;
+  reg15 = reg2 - loc1;
+
+  loc1 = reg1 + reg13;
+  reg13 = reg1 - reg13;
+  loc0 = reg0 + loc1;
+  loc1 = reg0 - loc1;
+  tmp6 = loc0;
+  tmp7 = loc1;
+  reg0 = loc2;
+
+  DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+  DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+
+  loc0 = reg9 + reg5;
+  reg5 = reg9 - reg5;
+  reg2 = reg6 + loc0;
+  reg1 = reg6 - loc0;
+
+  loc0 = reg7 + reg11;
+  reg11 = reg7 - reg11;
+  loc1 = reg4 + loc0;
+  loc2 = reg4 - loc0;
+  tmp5 = loc1;
+
+  DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+  BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
+
+  reg10 = loc0;
+  reg11 = loc1;
+
+  DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+  BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
+  reg13 = loc2;
+
+  /* Restore the parked results before rounding and storing (no transpose) */
+  reg12 = tmp5;
+  reg14 = tmp6;
+  reg3 = tmp7;
+  /* per group of 4 dst rows: round by 6 bits, add to dst and store */
+  SRARI_H4_SH(reg0, reg2, reg4, reg6, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6);
+  dst += (4 * dst_stride);
+  SRARI_H4_SH(reg8, reg10, reg12, reg14, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
+  dst += (4 * dst_stride);
+  SRARI_H4_SH(reg3, reg13, reg11, reg5, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
+  dst += (4 * dst_stride);
+  SRARI_H4_SH(reg7, reg9, reg1, reg15, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
+}
+
+/* Full 16x16 inverse DCT added to dst: a row pass into a scratch block
+ * followed by a column pass that reads it back. */
+void vpx_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst,
+                               int32_t dst_stride) {
+  DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
+  int32_t half;
+
+  /* rows: each call handles a band of 8 rows, i.e. 8 * 16 = 128 values */
+  for (half = 0; half < 2; ++half) {
+    const int32_t band = half * 128;
+    vpx_idct16_1d_rows_msa(input + band, out_arr + band);
+  }
+
+  /* columns: each call handles 8 columns and adds its result into dst */
+  for (half = 0; half < 2; ++half) {
+    const int32_t col = half * 8;
+    vpx_idct16_1d_columns_addblk_msa(out_arr + col, dst + col, dst_stride);
+  }
+}
+
+/* 16x16 IDCT + add for the short case where only 4 row-pass rows are used. */
+void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
+                              int32_t dst_stride) {
+  uint8_t i;
+  DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
+  int16_t *out = out_arr;
+
+  /* process 16 * 8 block: the row pass fills rows 0..7 of out_arr */
+  vpx_idct16_1d_rows_msa(input, out);
+
+  /* short case just considers top 4 rows as valid output; zero rows 4..15 */
+  out += 4 * 16;
+  for (i = 12; i--;) {
+    __asm__ __volatile__ (
+        "sw     $zero,   0(%[out])     \n\t"
+        "sw     $zero,   4(%[out])     \n\t"
+        "sw     $zero,   8(%[out])     \n\t"
+        "sw     $zero,  12(%[out])     \n\t"
+        "sw     $zero,  16(%[out])     \n\t"
+        "sw     $zero,  20(%[out])     \n\t"
+        "sw     $zero,  24(%[out])     \n\t"
+        "sw     $zero,  28(%[out])     \n\t"
+        :
+        : [out] "r" (out)
+        : "memory"  /* the stores write through out; tell the compiler */
+    );
+    out += 16;
+  }
+
+  out = out_arr;
+
+  /* transform columns */
+  for (i = 0; i < 2; ++i) {
+    /* process 8 * 16 block */
+    vpx_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+                                     dst_stride);
+  }
+}
+
+/* DC-only 16x16 inverse transform: adds one constant to every dst pixel. */
+void vpx_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst,
+                             int32_t dst_stride) {
+  int32_t row;
+  int16_t dc;
+  v8i16 dcv, lo0, lo1, lo2, lo3, hi0, hi1, hi2, hi3;
+  v16u8 src0, src1, src2, src3, pix0, pix1, pix2, pix3;
+  /* scale input[0] by cospi_16_64 twice, then apply the final 6-bit round */
+  dc = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+  dc = ROUND_POWER_OF_TWO((dc * cospi_16_64), DCT_CONST_BITS);
+  dc = ROUND_POWER_OF_TWO(dc, 6);
+  dcv = __msa_fill_h(dc);
+  /* 16 rows, four per iteration; loN / hiN are the two halves of row N */
+  for (row = 0; row < 16; row += 4) {
+    LD_UB4(dst, dst_stride, src0, src1, src2, src3);
+    UNPCK_UB_SH(src0, lo0, hi0);
+    UNPCK_UB_SH(src1, lo1, hi1);
+    UNPCK_UB_SH(src2, lo2, hi2);
+    UNPCK_UB_SH(src3, lo3, hi3);
+    ADD4(lo0, dcv, lo1, dcv, lo2, dcv, lo3, dcv, lo0, lo1, lo2, lo3);
+    ADD4(hi0, dcv, hi1, dcv, hi2, dcv, hi3, dcv, hi0, hi1, hi2, hi3);
+    CLIP_SH4_0_255(lo0, lo1, lo2, lo3);
+    CLIP_SH4_0_255(hi0, hi1, hi2, hi3);
+    PCKEV_B4_UB(hi0, lo0, hi1, lo1, hi2, lo2, hi3, lo3,
+                pix0, pix1, pix2, pix3);
+    ST_UB4(pix0, pix1, pix2, pix3, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+/* 16-point row inverse ADST over one band of 8 rows (row stride 16). */
+void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+  /* load input data: lN / l(N+8) = left / right 8 values of row N */
+  LD_SH16(input, 8,
+          l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     l0, l1, l2, l3, l4, l5, l6, l7);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     l8, l9, l10, l11, l12, l13, l14, l15);
+
+  /* ADST in horizontal */
+  VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7,
+                   l8, l9, l10, l11, l12, l13, l14, l15,
+                   r0, r1, r2, r3, r4, r5, r6, r7,
+                   r8, r9, r10, r11, r12, r13, r14, r15);
+  /* these four results are stored negated */
+  l1 = -r8;
+  l3 = -r4;
+  l13 = -r13;
+  l15 = -r1;
+  /* transpose back: first group -> output columns 0..7, second -> 8..15 */
+  TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2,
+                     l0, l1, l2, l3, l4, l5, l6, l7);
+  ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
+  TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15,
+                     l8, l9, l10, l11, l12, l13, l14, l15);
+  ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
+}
+/* 16-point column inverse ADST on 8 columns; rounds by 6 and adds to dst. */
+void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride) {
+  v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+  v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
+  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
+  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+  v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+  v16i8 zero = { 0 };
+  /* first half of the inputs: rows 0, 3, 4, 7, 8, 11, 12, 15 (stride 16) */
+  r0 = LD_SH(input + 0 * 16);
+  r3 = LD_SH(input + 3 * 16);
+  r4 = LD_SH(input + 4 * 16);
+  r7 = LD_SH(input + 7 * 16);
+  r8 = LD_SH(input + 8 * 16);
+  r11 = LD_SH(input + 11 * 16);
+  r12 = LD_SH(input + 12 * 16);
+  r15 = LD_SH(input + 15 * 16);
+
+  /* stage 1 */
+  k0 = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
+  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+  k0 = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
+  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+  BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
+  k0 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+  k2 = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
+  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+  /* remaining inputs: rows 1, 2, 5, 6, 9, 10, 13, 14 */
+  r1 = LD_SH(input + 1 * 16);
+  r2 = LD_SH(input + 2 * 16);
+  r5 = LD_SH(input + 5 * 16);
+  r6 = LD_SH(input + 6 * 16);
+  r9 = LD_SH(input + 9 * 16);
+  r10 = LD_SH(input + 10 * 16);
+  r13 = LD_SH(input + 13 * 16);
+  r14 = LD_SH(input + 14 * 16);
+
+  k0 = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
+  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
+  k0 = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
+  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
+  BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
+  BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
+  out1 = -out1;
+  SRARI_H2_SH(out0, out1, 6);
+  dst0 = LD_UB(dst + 0 * dst_stride);
+  dst1 = LD_UB(dst + 15 * dst_stride);
+  ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1);
+  ADD2(res0, out0, res1, out1, res0, res1);
+  CLIP_SH2_0_255(res0, res1);
+  PCKEV_B2_SH(res0, res0, res1, res1, res0, res1);
+  ST8x1_UB(res0, dst);
+  ST8x1_UB(res1, dst + 15 * dst_stride);
+  /* next pair: dst rows 1 and 14 (out8, out9) */
+  k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+  k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
+  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+  out8 = -out8;
+
+  SRARI_H2_SH(out8, out9, 6);
+  dst8 = LD_UB(dst + 1 * dst_stride);
+  dst9 = LD_UB(dst + 14 * dst_stride);
+  ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9);
+  ADD2(res8, out8, res9, out9, res8, res9);
+  CLIP_SH2_0_255(res8, res9);
+  PCKEV_B2_SH(res8, res8, res9, res9, res8, res9);
+  ST8x1_UB(res8, dst + dst_stride);
+  ST8x1_UB(res9, dst + 14 * dst_stride);
+  /* next pair: dst rows 3 and 12 (out4, out5) */
+  k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+  k2 = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
+  MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
+  out4 = -out4;
+  SRARI_H2_SH(out4, out5, 6);
+  dst4 = LD_UB(dst + 3 * dst_stride);
+  dst5 = LD_UB(dst + 12 * dst_stride);
+  ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5);
+  ADD2(res4, out4, res5, out5, res4, res5);
+  CLIP_SH2_0_255(res4, res5);
+  PCKEV_B2_SH(res4, res4, res5, res5, res4, res5);
+  ST8x1_UB(res4, dst + 3 * dst_stride);
+  ST8x1_UB(res5, dst + 12 * dst_stride);
+  /* next pair: dst rows 2 and 13 (out12, out13); k0..k2 are reused */
+  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+  out13 = -out13;
+  SRARI_H2_SH(out12, out13, 6);
+  dst12 = LD_UB(dst + 2 * dst_stride);
+  dst13 = LD_UB(dst + 13 * dst_stride);
+  ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13);
+  ADD2(res12, out12, res13, out13, res12, res13);
+  CLIP_SH2_0_255(res12, res13);
+  PCKEV_B2_SH(res12, res12, res13, res13, res12, res13);
+  ST8x1_UB(res12, dst + 2 * dst_stride);
+  ST8x1_UB(res13, dst + 13 * dst_stride);
+  /* next pair: dst rows 4 and 11 (out6, out7) */
+  k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+  k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+  MADD_SHORT(out6, out7, k0, k3, out6, out7);
+  SRARI_H2_SH(out6, out7, 6);
+  dst6 = LD_UB(dst + 4 * dst_stride);
+  dst7 = LD_UB(dst + 11 * dst_stride);
+  ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7);
+  ADD2(res6, out6, res7, out7, res6, res7);
+  CLIP_SH2_0_255(res6, res7);
+  PCKEV_B2_SH(res6, res6, res7, res7, res6, res7);
+  ST8x1_UB(res6, dst + 4 * dst_stride);
+  ST8x1_UB(res7, dst + 11 * dst_stride);
+  /* next pair: dst rows 6 and 9 (out10, out11); k0 and k3 are reused */
+  MADD_SHORT(out10, out11, k0, k3, out10, out11);
+  SRARI_H2_SH(out10, out11, 6);
+  dst10 = LD_UB(dst + 6 * dst_stride);
+  dst11 = LD_UB(dst + 9 * dst_stride);
+  ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11);
+  ADD2(res10, out10, res11, out11, res10, res11);
+  CLIP_SH2_0_255(res10, res11);
+  PCKEV_B2_SH(res10, res10, res11, res11, res10, res11);
+  ST8x1_UB(res10, dst + 6 * dst_stride);
+  ST8x1_UB(res11, dst + 9 * dst_stride);
+  /* next pair: dst rows 7 and 8 (out2, out3) */
+  k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+  MADD_SHORT(h10, h11, k1, k2, out2, out3);
+  SRARI_H2_SH(out2, out3, 6);
+  dst2 = LD_UB(dst + 7 * dst_stride);
+  dst3 = LD_UB(dst + 8 * dst_stride);
+  ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3);
+  ADD2(res2, out2, res3, out3, res2, res3);
+  CLIP_SH2_0_255(res2, res3);
+  PCKEV_B2_SH(res2, res2, res3, res3, res2, res3);
+  ST8x1_UB(res2, dst + 7 * dst_stride);
+  ST8x1_UB(res3, dst + 8 * dst_stride);
+  /* last pair: dst rows 5 and 10 (out14, out15); k1 and k2 are reused */
+  MADD_SHORT(out14, out15, k1, k2, out14, out15);
+  SRARI_H2_SH(out14, out15, 6);
+  dst14 = LD_UB(dst + 5 * dst_stride);
+  dst15 = LD_UB(dst + 10 * dst_stride);
+  ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15);
+  ADD2(res14, out14, res15, out15, res14, res15);
+  CLIP_SH2_0_255(res14, res15);
+  PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
+  ST8x1_UB(res14, dst + 5 * dst_stride);
+  ST8x1_UB(res15, dst + 10 * dst_stride);
+}
diff --git a/libs/libvpx/vpx_dsp/mips/idct32x32_msa.c b/libs/libvpx/vpx_dsp/mips/idct32x32_msa.c
new file mode 100644
index 0000000000..d5b3966e0e
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/idct32x32_msa.c
@@ -0,0 +1,739 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+/* Transposes an 8-row x 32-column band into tmp_buf as 32 vectors of 8. */
+static void idct32x8_row_transpose_store(const int16_t *input,
+                                         int16_t *tmp_buf) {
+  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+  /* 1st & 2nd 8x8: input columns 0..15 -> tmp_buf vectors 0..15 */
+  LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
+  LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
+  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                     m0, n0, m1, n1, m2, n2, m3, n3);
+  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                     m4, n4, m5, n5, m6, n6, m7, n7);
+  ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
+  ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
+  ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);
+
+  /* 3rd & 4th 8x8: input columns 16..31 -> tmp_buf vectors 16..31 */
+  LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
+  LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
+  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                     m0, n0, m1, n1, m2, n2, m3, n3);
+  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                     m4, n4, m5, n5, m6, n6, m7, n7);
+  ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
+  ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
+  ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
+  ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8);
+}
+/* Even half of the 32-point row IDCT: 16 result vectors go to tmp_eve_buf. */
+static void idct32x8_row_even_process_store(int16_t *tmp_buf,
+                                            int16_t *tmp_eve_buf) {
+  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+
+  /* Even stage 1: tmp_buf vectors 0, 4, 8, ..., 28 */
+  LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+  BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+  loc1 = vec3;
+  loc0 = vec1;
+
+  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+  BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+  BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+  BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+  /* Even stage 2: tmp_buf vectors 2, 6, 10, ..., 30 */
+  LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+  vec0 = reg0 + reg4;
+  reg0 = reg0 - reg4;
+  reg4 = reg6 + reg2;
+  reg6 = reg6 - reg2;
+  reg2 = reg1 + reg5;
+  reg1 = reg1 - reg5;
+  reg5 = reg7 + reg3;
+  reg7 = reg7 - reg3;
+  reg3 = vec0;
+
+  vec1 = reg2;
+  reg2 = reg3 + reg4;
+  reg3 = reg3 - reg4;
+  reg4 = reg5 - vec1;
+  reg5 = reg5 + vec1;
+
+  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+  DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+  vec0 = reg0 - reg6;
+  reg0 = reg0 + reg6;
+  vec1 = reg7 - reg1;
+  reg7 = reg7 + reg1;
+
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+  BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+  ST_SH(loc0, (tmp_eve_buf + 15 * 8));
+  ST_SH(loc1, (tmp_eve_buf));
+  ST_SH(loc2, (tmp_eve_buf + 14 * 8));
+  ST_SH(loc3, (tmp_eve_buf + 8));
+
+  BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+  ST_SH(loc0, (tmp_eve_buf + 13 * 8));
+  ST_SH(loc1, (tmp_eve_buf + 2 * 8));
+  ST_SH(loc2, (tmp_eve_buf + 12 * 8));
+  ST_SH(loc3, (tmp_eve_buf + 3 * 8));
+
+  /* Store 8 */
+  BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+  ST_SH(loc0, (tmp_eve_buf + 11 * 8));
+  ST_SH(loc1, (tmp_eve_buf + 4 * 8));
+  ST_SH(loc2, (tmp_eve_buf + 10 * 8));
+  ST_SH(loc3, (tmp_eve_buf + 5 * 8));
+
+  BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+  ST_SH(loc0, (tmp_eve_buf + 9 * 8));
+  ST_SH(loc1, (tmp_eve_buf + 6 * 8));
+  ST_SH(loc2, (tmp_eve_buf + 8 * 8));
+  ST_SH(loc3, (tmp_eve_buf + 7 * 8));
+}
+/* Odd half of the 32-point row IDCT: 16 result vectors go to tmp_odd_buf. */
+static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
+                                           int16_t *tmp_odd_buf) {
+  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+  /* Odd stage 1: tmp_buf vectors 1, 7, 9, 15, 17, 23, 25, 31 */
+  reg0 = LD_SH(tmp_buf + 8);
+  reg1 = LD_SH(tmp_buf + 7 * 8);
+  reg2 = LD_SH(tmp_buf + 9 * 8);
+  reg3 = LD_SH(tmp_buf + 15 * 8);
+  reg4 = LD_SH(tmp_buf + 17 * 8);
+  reg5 = LD_SH(tmp_buf + 23 * 8);
+  reg6 = LD_SH(tmp_buf + 25 * 8);
+  reg7 = LD_SH(tmp_buf + 31 * 8);
+
+  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+  vec0 = reg0 + reg3;
+  reg0 = reg0 - reg3;
+  reg3 = reg7 + reg4;
+  reg7 = reg7 - reg4;
+  reg4 = reg1 + reg2;
+  reg1 = reg1 - reg2;
+  reg2 = reg6 + reg5;
+  reg6 = reg6 - reg5;
+  reg5 = vec0;
+
+  /* 4 Stores */
+  ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
+
+  SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+  ST_SH2(vec0, vec1, (tmp_odd_buf), 8);
+
+  /* 4 Stores */
+  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+  BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
+
+  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+  ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
+
+  /* Odd stage 2: tmp_buf vectors 3, 5, 11, 13, 19, 21, 27, 29 */
+  /* 8 loads */
+  reg0 = LD_SH(tmp_buf + 3 * 8);
+  reg1 = LD_SH(tmp_buf + 5 * 8);
+  reg2 = LD_SH(tmp_buf + 11 * 8);
+  reg3 = LD_SH(tmp_buf + 13 * 8);
+  reg4 = LD_SH(tmp_buf + 19 * 8);
+  reg5 = LD_SH(tmp_buf + 21 * 8);
+  reg6 = LD_SH(tmp_buf + 27 * 8);
+  reg7 = LD_SH(tmp_buf + 29 * 8);
+
+  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+  /* 4 Stores */
+  SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
+       vec0, vec1, vec2, vec3);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+
+  BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);  /* slots 12 and 15 */
+
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
+
+  /* 4 Stores */
+  ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4,
+       vec1, vec2, vec0, vec3);
+  BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+  ST_SH(reg0, (tmp_odd_buf + 13 * 8));
+  ST_SH(reg1, (tmp_odd_buf + 14 * 8));
+
+  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+  ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
+
+  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+  /* (works in place on tmp_odd_buf: slots 0..3 with 8..11, 4..7 with 12..15) */
+  /* Load 8 & Store 8 */
+  LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
+  LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
+
+  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+       loc0, loc1, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
+
+  SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+  SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
+
+  /* Load 8 & Store 8 */
+  LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
+  LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
+
+  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+       loc0, loc1, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
+
+  SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+  SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
+}
+/* Final even/odd butterfly, then transpose and store as 8 rows of dst. */
+static void idct_butterfly_transpose_store(int16_t *tmp_buf,
+                                           int16_t *tmp_eve_buf,
+                                           int16_t *tmp_odd_buf,
+                                           int16_t *dst) {
+  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+  /* FINAL BUTTERFLY : Dependency on Even & Odd */
+  vec0 = LD_SH(tmp_odd_buf);
+  vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+  loc0 = LD_SH(tmp_eve_buf);
+  loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+
+  ST_SH((loc0 - vec3), (tmp_buf + 31 * 8));
+  ST_SH((loc1 - vec2), (tmp_buf + 23 * 8));
+  ST_SH((loc2 - vec1), (tmp_buf + 27 * 8));
+  ST_SH((loc3 - vec0), (tmp_buf + 19 * 8));
+
+  /* Load 8 & Store 8 */
+  vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+  vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+  loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+  loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+
+  ST_SH((loc0 - vec3), (tmp_buf + 29 * 8));
+  ST_SH((loc1 - vec2), (tmp_buf + 21 * 8));
+  ST_SH((loc2 - vec1), (tmp_buf + 25 * 8));
+  ST_SH((loc3 - vec0), (tmp_buf + 17 * 8));
+
+  /* Load 8 & Store 8 */
+  vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+  vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+  loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+  loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+
+  ST_SH((loc0 - vec3), (tmp_buf + 30 * 8));
+  ST_SH((loc1 - vec2), (tmp_buf + 22 * 8));
+  ST_SH((loc2 - vec1), (tmp_buf + 26 * 8));
+  ST_SH((loc3 - vec0), (tmp_buf + 18 * 8));
+
+  /* Load 8 & Store 8 */
+  vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+  vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+  loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+  loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+
+  ST_SH((loc0 - vec3), (tmp_buf + 28 * 8));
+  ST_SH((loc1 - vec2), (tmp_buf + 20 * 8));
+  ST_SH((loc2 - vec1), (tmp_buf + 24 * 8));
+  ST_SH((loc3 - vec0), (tmp_buf + 16 * 8));
+  /* sums m*, n* become dst columns 0..15; differences wait in tmp_buf */
+  /* Transpose : 16 vectors */
+  /* 1st & 2nd 8x8 */
+  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                     m0, n0, m1, n1, m2, n2, m3, n3);
+  ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
+  ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
+
+  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                     m4, n4, m5, n5, m6, n6, m7, n7);
+  ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
+  ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
+
+  /* 3rd & 4th 8x8: differences reloaded from tmp_buf vectors 16..31 */
+  LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
+  LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
+  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+                     m0, n0, m1, n1, m2, n2, m3, n3);
+  ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
+  ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
+
+  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+                     m4, n4, m5, n5, m6, n6, m7, n7);
+  ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
+  ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
+}
+
+/* 32-point row IDCT of an 8-row band: transpose, even, odd, final butterfly */
+static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
+  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+  idct32x8_row_transpose_store(input, tmp_buf);
+  idct32x8_row_even_process_store(tmp_buf, tmp_eve_buf);
+  idct32x8_row_odd_process_store(tmp_buf, tmp_odd_buf);
+  idct_butterfly_transpose_store(tmp_buf, tmp_eve_buf, tmp_odd_buf, output);
+}
+
+/* Even half of the column-direction 32-point inverse DCT for one strip of
+ * 8 columns.
+ *
+ * tmp_buf     - input coefficients.  Stage 1 loads 8 vectors starting at
+ *               tmp_buf with a pitch of (4 * 32) int16_t; stage 2 performs
+ *               the same load after tmp_buf has been advanced by (2 * 32).
+ * tmp_eve_buf - output scratch.  The eight ST_SH2 calls below target offsets
+ *               0, 2, 4, ..., 14 (times 8 int16_t) with a pitch of 8.
+ *
+ * NOTE(review): LD_SH8, ST_SH2, BUTTERFLY_4 and DOTP_CONST_PAIR are defined
+ * outside this block; the comments below assume the trailing arguments of
+ * BUTTERFLY_4 / DOTP_CONST_PAIR are outputs -- confirm against the macro
+ * definitions. */
+static void idct8x32_column_even_process_store(int16_t *tmp_buf,
+                                               int16_t *tmp_eve_buf) {
+  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+
+  /* Even stage 1 */
+  LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+  /* Move the base pointer by (2 * 32) so the stage 2 load below reads the
+   * vectors that sit halfway between the ones loaded above. */
+  tmp_buf += (2 * 32);
+
+  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+  BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+  /* Save vec3/vec1 in loc1/loc0: the vec registers are reused by the next
+   * BUTTERFLY_4. */
+  loc1 = vec3;
+  loc0 = vec1;
+
+  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+  BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+  /* stp0..stp7 hold the stage 1 results until stage 3 consumes them. */
+  BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+  BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+  /* Even stage 2 */
+  /* Load 8 */
+  LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+  /* In-place sum/difference network.  vec0 temporarily carries
+   * reg0 + reg4 because reg0 and reg4 are overwritten before that sum is
+   * needed again; it is moved into reg3 at the end. */
+  vec0 = reg0 + reg4;
+  reg0 = reg0 - reg4;
+  reg4 = reg6 + reg2;
+  reg6 = reg6 - reg2;
+  reg2 = reg1 + reg5;
+  reg1 = reg1 - reg5;
+  reg5 = reg7 + reg3;
+  reg7 = reg7 - reg3;
+  reg3 = vec0;
+
+  /* vec1 keeps the old reg2 because reg2 is overwritten on the next line. */
+  vec1 = reg2;
+  reg2 = reg3 + reg4;
+  reg3 = reg3 - reg4;
+  reg4 = reg5 - vec1;
+  reg5 = reg5 + vec1;
+
+  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+  DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+  vec0 = reg0 - reg6;
+  reg0 = reg0 + reg6;
+  vec1 = reg7 - reg1;
+  reg7 = reg7 + reg1;
+
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+  /* Store 8 */
+  /* Each BUTTERFLY_4 pairs two stage 1 values (stp*) with two stage 2
+   * values (reg*); the results are stored to two mirrored positions of
+   * tmp_eve_buf (k * 8 and (14 - k) * 8). */
+  BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+  ST_SH2(loc1, loc3, tmp_eve_buf, 8);
+  ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8);
+
+  BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+  ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8);
+  ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8);
+
+  /* Store 8 */
+  BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+  ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8);
+  ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8);
+
+  BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+  ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8);
+  ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8);
+}
+
+/* Odd half of the column-direction 32-point inverse DCT for one strip of
+ * 8 columns.
+ *
+ * tmp_buf     - input coefficients, read at multiples of 32 int16_t.
+ *               Stage 1 reads offsets 1, 7, 9, 15, 17, 23, 25, 31 (times 32);
+ *               stage 2 reads offsets 3, 5, 11, 13, 19, 21, 27, 29 (times 32).
+ * tmp_odd_buf - serves as intermediate scratch and as the final output:
+ *               stages 1 and 2 store partial results into it, stage 3 loads
+ *               them back, combines them and overwrites the same locations.
+ *
+ * NOTE(review): the load/store and arithmetic macros are defined outside
+ * this block; comments that name slot numbers assume ST_SH2(a, b, p, s)
+ * writes a to p and b to p + s -- confirm against the macro definition. */
+static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
+                                              int16_t *tmp_odd_buf) {
+  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+  /* Odd stage 1 */
+  reg0 = LD_SH(tmp_buf + 32);
+  reg1 = LD_SH(tmp_buf + 7 * 32);
+  reg2 = LD_SH(tmp_buf + 9 * 32);
+  reg3 = LD_SH(tmp_buf + 15 * 32);
+  reg4 = LD_SH(tmp_buf + 17 * 32);
+  reg5 = LD_SH(tmp_buf + 23 * 32);
+  reg6 = LD_SH(tmp_buf + 25 * 32);
+  reg7 = LD_SH(tmp_buf + 31 * 32);
+
+  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+  /* In-place sum/difference network; vec0 temporarily carries reg0 + reg3
+   * and is moved into reg5 once reg5's old value has been consumed. */
+  vec0 = reg0 + reg3;
+  reg0 = reg0 - reg3;
+  reg3 = reg7 + reg4;
+  reg7 = reg7 - reg4;
+  reg4 = reg1 + reg2;
+  reg1 = reg1 - reg2;
+  reg2 = reg6 + reg5;
+  reg6 = reg6 - reg5;
+  reg5 = vec0;
+
+  /* 4 Stores */
+  ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
+  SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+  ST_SH2(vec0, vec1, tmp_odd_buf, 8);
+
+  /* 4 Stores */
+  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+  BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
+  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+  ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
+
+  /* Odd stage 2 */
+  /* 8 loads */
+  reg0 = LD_SH(tmp_buf + 3 * 32);
+  reg1 = LD_SH(tmp_buf + 5 * 32);
+  reg2 = LD_SH(tmp_buf + 11 * 32);
+  reg3 = LD_SH(tmp_buf + 13 * 32);
+  reg4 = LD_SH(tmp_buf + 19 * 32);
+  reg5 = LD_SH(tmp_buf + 21 * 32);
+  reg6 = LD_SH(tmp_buf + 27 * 32);
+  reg7 = LD_SH(tmp_buf + 29 * 32);
+
+  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+  /* 4 Stores */
+  SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+  BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
+  /* Note the pitch of 3 * 8 rather than 8: this pair lands in slots 12 and
+   * 15, leaving slots 13 and 14 for the ST_SH2 at (13 * 8) further down. */
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+  ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
+
+  /* 4 Stores */
+  ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3);
+  BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+  ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
+  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+  ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
+
+  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+  /* Load 8 & Store 8 */
+  /* Slots 0..3 were written by stage 1 and slots 8..11 by stage 2; their
+   * sums go back to slots 0..3 and their rotated differences to 8..11. */
+  LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
+  LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
+
+  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
+
+  SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+  SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
+
+  /* Load 8 & Store 8 */
+  /* Same combination for slots 4..7 (stage 1) and 12..15 (stage 2).  The
+   * first load deliberately binds the registers in the order reg1, reg2,
+   * reg0, reg3, and the SUB2 pairings below differ from the block above. */
+  LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
+  LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
+
+  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
+
+  SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+  SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
+}
+
+/* Final butterfly of the column pass for one strip of 8 columns: combines
+ * the even and odd intermediate buffers and accumulates the result into the
+ * destination picture.
+ *
+ * tmp_eve_buf - even-half vectors, read at multiples of 8 int16_t.
+ * tmp_odd_buf - odd-half vectors, read at multiples of 8 int16_t.
+ * dst         - destination pixels; rows are addressed as
+ *               dst + k * dst_stride.
+ * dst_stride  - distance between destination rows.
+ *
+ * The work is done in four groups.  In each group the ADD4 results are
+ * handed to VP9_ADDBLK_ST8x4_UB with a start row in 0..3 and the SUB4
+ * results with a start row in 16..19, both with a row pitch of
+ * 4 * dst_stride.  Every result is passed through SRARI_H4_SH(..., 6)
+ * first.
+ *
+ * NOTE(review): SRARI_H4_SH is presumably a rounding arithmetic right shift
+ * and VP9_ADDBLK_ST8x4_UB presumably adds to and stores four 8-pixel rows;
+ * both are defined outside this block -- confirm. */
+static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+                                             int16_t *tmp_odd_buf,
+                                             uint8_t *dst,
+                                             int32_t dst_stride) {
+  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
+
+  /* FINAL BUTTERFLY : Dependency on Even & Odd */
+  /* Group 1: sums start at row 0, differences start at row 19. */
+  vec0 = LD_SH(tmp_odd_buf);
+  vec1 = LD_SH(tmp_odd_buf + 9 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 14 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 6 * 8);
+  loc0 = LD_SH(tmp_eve_buf);
+  loc1 = LD_SH(tmp_eve_buf + 8 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 4 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 12 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
+  SRARI_H4_SH(m0, m2, m4, m6, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);
+
+  /* The SUB4 outputs are bound in reverse (m6, m2, m4, m0) relative to the
+   * ADD4 above, while the store below uses the same m0, m2, m4, m6 order. */
+  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
+  SRARI_H4_SH(m0, m2, m4, m6, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride),
+                      m0, m2, m4, m6);
+
+  /* Load 8 & Store 8 */
+  /* Group 2: sums start at row 2, differences start at row 17. */
+  vec0 = LD_SH(tmp_odd_buf + 4 * 8);
+  vec1 = LD_SH(tmp_odd_buf + 13 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 10 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 3 * 8);
+  loc0 = LD_SH(tmp_eve_buf + 2 * 8);
+  loc1 = LD_SH(tmp_eve_buf + 10 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 6 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 14 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
+  SRARI_H4_SH(m1, m3, m5, m7, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride),
+                      m1, m3, m5, m7);
+
+  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
+  SRARI_H4_SH(m1, m3, m5, m7, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride),
+                      m1, m3, m5, m7);
+
+  /* Load 8 & Store 8 */
+  /* Group 3: sums start at row 1, differences start at row 18. */
+  vec0 = LD_SH(tmp_odd_buf + 2 * 8);
+  vec1 = LD_SH(tmp_odd_buf + 11 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 12 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 7 * 8);
+  loc0 = LD_SH(tmp_eve_buf + 1 * 8);
+  loc1 = LD_SH(tmp_eve_buf + 9 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 5 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 13 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
+  SRARI_H4_SH(n0, n2, n4, n6, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride),
+                      n0, n2, n4, n6);
+
+  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
+  SRARI_H4_SH(n0, n2, n4, n6, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride),
+                      n0, n2, n4, n6);
+
+  /* Load 8 & Store 8 */
+  /* Group 4: sums start at row 3, differences start at row 16. */
+  vec0 = LD_SH(tmp_odd_buf + 5 * 8);
+  vec1 = LD_SH(tmp_odd_buf + 15 * 8);
+  vec2 = LD_SH(tmp_odd_buf + 8 * 8);
+  vec3 = LD_SH(tmp_odd_buf + 1 * 8);
+  loc0 = LD_SH(tmp_eve_buf + 3 * 8);
+  loc1 = LD_SH(tmp_eve_buf + 11 * 8);
+  loc2 = LD_SH(tmp_eve_buf + 7 * 8);
+  loc3 = LD_SH(tmp_eve_buf + 15 * 8);
+
+  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
+  SRARI_H4_SH(n1, n3, n5, n7, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride),
+                      n1, n3, n5, n7);
+
+  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
+  SRARI_H4_SH(n1, n3, n5, n7, 6);
+  VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride),
+                      n1, n3, n5, n7);
+}
+
+/* Column pass for one strip of 8 columns.
+ *
+ * input      - row-pass output for this strip.
+ * dst        - destination pixels for this strip.
+ * dst_stride - distance between destination rows.
+ *
+ * Runs the even and odd stage helpers into private scratch buffers, then
+ * lets the butterfly helper combine them and add the result into dst. */
+static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                           int32_t dst_stride) {
+  DECLARE_ALIGNED(32, int16_t, odd_half[16 * 8]);
+  DECLARE_ALIGNED(32, int16_t, even_half[16 * 8]);
+  int16_t *const odd = odd_half;
+  int16_t *const even = even_half;
+
+  /* Both stages read the same input and fill their own scratch buffer. */
+  idct8x32_column_even_process_store(input, even);
+  idct8x32_column_odd_process_store(input, odd);
+
+  idct8x32_column_butterfly_addblk(even, odd, dst, dst_stride);
+}
+
+/* Full 32x32 inverse DCT followed by accumulation into dst.
+ *
+ * input      - 32x32 coefficient block (1024 int16_t).
+ * dst        - destination pixels.
+ * dst_stride - distance between destination rows.
+ *
+ * The row pass handles four strips of 8 * 32 coefficients, writing into a
+ * 32x32 intermediate array; the column pass then handles four strips of
+ * 8 columns, each of which adds its result directly into dst. */
+void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
+                                int32_t dst_stride) {
+  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+  int32_t strip;
+
+  /* Row pass: strip k starts 256 elements (8 rows of 32) after strip k-1. */
+  for (strip = 0; strip < 4; ++strip) {
+    const int32_t row_offset = strip * 256;
+
+    idct32x8_1d_rows_msa(input + row_offset, out_arr + row_offset);
+  }
+
+  /* Column pass: strip k starts 8 columns to the right of strip k-1. */
+  for (strip = 0; strip < 4; ++strip) {
+    const int32_t col_offset = strip * 8;
+
+    idct8x32_1d_columns_addblk_msa(out_arr + col_offset, dst + col_offset,
+                                   dst_stride);
+  }
+}
+
+/* 32x32 inverse DCT + add for blocks whose non-zero coefficients all lie in
+ * the upper-left 8x8 corner.
+ *
+ * input      - 32x32 coefficient block.
+ * dst        - destination pixels.
+ * dst_stride - distance between destination rows.
+ *
+ * The intermediate array is cleared first, then only the first 32x8 strip
+ * goes through the row pass; the column pass still covers all four strips
+ * of 8 columns. */
+void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
+                              int32_t dst_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+  int16_t *out_ptr = out_arr;
+
+  /* Clear out_arr: each iteration writes sixteen 32-bit zeros (64 bytes,
+   * i.e. 32 int16_t) and out_ptr then advances by 32 elements, so the 32
+   * iterations cover all 32 * 32 entries. */
+  for (i = 32; i--;) {
+    __asm__ __volatile__ (
+        "sw     $zero,      0(%[out_ptr])     \n\t"
+        "sw     $zero,      4(%[out_ptr])     \n\t"
+        "sw     $zero,      8(%[out_ptr])     \n\t"
+        "sw     $zero,     12(%[out_ptr])     \n\t"
+        "sw     $zero,     16(%[out_ptr])     \n\t"
+        "sw     $zero,     20(%[out_ptr])     \n\t"
+        "sw     $zero,     24(%[out_ptr])     \n\t"
+        "sw     $zero,     28(%[out_ptr])     \n\t"
+        "sw     $zero,     32(%[out_ptr])     \n\t"
+        "sw     $zero,     36(%[out_ptr])     \n\t"
+        "sw     $zero,     40(%[out_ptr])     \n\t"
+        "sw     $zero,     44(%[out_ptr])     \n\t"
+        "sw     $zero,     48(%[out_ptr])     \n\t"
+        "sw     $zero,     52(%[out_ptr])     \n\t"
+        "sw     $zero,     56(%[out_ptr])     \n\t"
+        "sw     $zero,     60(%[out_ptr])     \n\t"
+
+        :
+        : [out_ptr] "r" (out_ptr)
+        /* The stores go through a pointer held in an input register, so the
+         * compiler has to be told that memory is modified; without this
+         * clobber it may assume out_arr is untouched by the asm. */
+        : "memory"
+    );
+
+    out_ptr += 32;
+  }
+
+  out_ptr = out_arr;
+
+  /* rows: only upper-left 8x8 has non-zero coeff */
+  idct32x8_1d_rows_msa(input, out_ptr);
+
+  /* transform columns */
+  for (i = 0; i < 4; ++i) {
+    /* process 8 * 32 block */
+    idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+                                   dst_stride);
+  }
+}
+
+/* 32x32 inverse DCT + add for a block that only uses input[0].
+ *
+ * input      - coefficient block; only input[0] is read.
+ * dst        - destination pixels, 32 bytes are updated per row.
+ * dst_stride - distance between destination rows.
+ *
+ * One value is derived from input[0], replicated into a vector and added to
+ * 32 destination rows, two rows per loop iteration, with the sums passed
+ * through CLIP_SH4_0_255 before being packed back to bytes. */
+void vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
+                             int32_t dst_stride) {
+  int32_t row_pair;
+  int16_t dc;
+  v16u8 pix0, pix1, pix2, pix3, pk0, pk1, pk2, pk3;
+  v8i16 lo0, lo1, lo2, lo3, hi0, hi1, hi2, hi3, dc_vec;
+
+  /* Scale by cospi_16_64 twice, then apply the final 6-bit rounding; each
+   * step is narrowed back to int16_t. */
+  dc = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+  dc = ROUND_POWER_OF_TWO((dc * cospi_16_64), DCT_CONST_BITS);
+  dc = ROUND_POWER_OF_TWO(dc, 6);
+
+  dc_vec = __msa_fill_h(dc);
+
+  for (row_pair = 0; row_pair < 16; ++row_pair) {
+    /* Fetch the current row and the one below it. */
+    LD_UB2(dst, 16, pix0, pix1);
+    LD_UB2(dst + dst_stride, 16, pix2, pix3);
+
+    /* Widen the bytes to 16-bit lanes before adding. */
+    UNPCK_UB_SH(pix0, lo0, hi0);
+    UNPCK_UB_SH(pix1, lo1, hi1);
+    UNPCK_UB_SH(pix2, lo2, hi2);
+    UNPCK_UB_SH(pix3, lo3, hi3);
+
+    ADD4(lo0, dc_vec, lo1, dc_vec, lo2, dc_vec, lo3, dc_vec,
+         lo0, lo1, lo2, lo3);
+    ADD4(hi0, dc_vec, hi1, dc_vec, hi2, dc_vec, hi3, dc_vec,
+         hi0, hi1, hi2, hi3);
+    CLIP_SH4_0_255(lo0, lo1, lo2, lo3);
+    CLIP_SH4_0_255(hi0, hi1, hi2, hi3);
+
+    /* Narrow back to bytes and write both rows. */
+    PCKEV_B4_UB(hi0, lo0, hi1, lo1, hi2, lo2, hi3, lo3,
+                pk0, pk1, pk2, pk3);
+
+    ST_UB2(pk0, pk1, dst, 16);
+    dst += dst_stride;
+    ST_UB2(pk2, pk3, dst, 16);
+    dst += dst_stride;
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/mips/idct4x4_msa.c b/libs/libvpx/vpx_dsp/mips/idct4x4_msa.c
new file mode 100644
index 0000000000..f289d8edab
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/idct4x4_msa.c
@@ -0,0 +1,98 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+/* 4x4 inverse Walsh-Hadamard transform followed by accumulation into dst.
+ *
+ * input      - 4x4 coefficient block.
+ * dst        - destination pixels.
+ * dst_stride - distance between destination rows.
+ *
+ * The coefficients are widened to 32-bit lanes, shifted right by
+ * UNIT_QUANT_SHIFT, run through the same add/subtract/halve sequence twice
+ * with a transpose in between, narrowed back to 16 bits and handed to
+ * ADDBLK_ST4x4_UB.
+ *
+ * NOTE(review): the load, unpack, transpose and add-block macros are defined
+ * outside this block; their exact semantics should be confirmed there. */
+void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
+                            int32_t dst_stride) {
+  v8i16 in0, in1, in2, in3;
+  v4i32 in0_r, in1_r, in2_r, in3_r, in4_r;
+
+  /* load vector elements of 4x4 block */
+  /* Note the register order in0, in2, in3, in1: it is kept consistently
+   * through the transpose and the unpack calls below. */
+  LD4x4_SH(input, in0, in2, in3, in1);
+  TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+  UNPCK_R_SH_SW(in0, in0_r);
+  UNPCK_R_SH_SW(in2, in2_r);
+  UNPCK_R_SH_SW(in3, in3_r);
+  UNPCK_R_SH_SW(in1, in1_r);
+  SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT);
+
+  /* First pass.  in4_r is a temporary holding half of the difference; the
+   * statements update the registers in place, so their order matters. */
+  in0_r += in2_r;
+  in3_r -= in1_r;
+  in4_r = (in0_r - in3_r) >> 1;
+  in1_r = in4_r - in1_r;
+  in2_r = in4_r - in2_r;
+  in0_r -= in1_r;
+  in3_r += in2_r;
+
+  TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r);
+
+  /* Second pass: same sequence of operations, with the register roles
+   * rearranged (in1_r/in2_r/in3_r take different positions than above). */
+  in0_r += in1_r;
+  in2_r -= in3_r;
+  in4_r = (in0_r - in2_r) >> 1;
+  in3_r = in4_r - in3_r;
+  in1_r = in4_r - in1_r;
+  in0_r -= in3_r;
+  in2_r += in1_r;
+
+  /* Narrow to 16-bit lanes; the add-block call takes the rows in the order
+   * in0, in3, in1, in2. */
+  PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r,
+              in0, in1, in2, in3);
+  ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride);
+}
+
+/* 4x4 inverse Walsh-Hadamard transform + add for a block that only uses
+ * input[0].
+ *
+ * input      - coefficient block; only input[0] is read.
+ * dst        - destination pixels.
+ * dst_stride - distance between destination rows.
+ *
+ * The shifted coefficient is split into a half (used for lanes 1..3) and
+ * the remainder (lane 0).  The resulting vector is split the same way once
+ * more, lane-wise, to form the first row and the row shared by the other
+ * three positions of ADDBLK_ST4x4_UB. */
+void vpx_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst,
+                           int32_t dst_stride) {
+  const int16_t shifted = input[0] >> UNIT_QUANT_SHIFT;
+  const int16_t half = shifted >> 1;
+  const int16_t rest = shifted - half;
+  v8i16 first_row = { 0 };
+  v8i16 other_rows;
+
+  /* Lane 0 gets the remainder, lanes 1..3 get the half. */
+  first_row = __msa_insert_h(first_row, 0, rest);
+  first_row = __msa_insert_h(first_row, 1, half);
+  first_row = __msa_insert_h(first_row, 2, half);
+  first_row = __msa_insert_h(first_row, 3, half);
+
+  /* Repeat the half/remainder split on every lane. */
+  other_rows = first_row >> 1;
+  first_row = first_row - other_rows;
+
+  ADDBLK_ST4x4_UB(first_row, other_rows, other_rows, other_rows, dst,
+                  dst_stride);
+}
+
+/* Full 4x4 inverse DCT followed by accumulation into dst.
+ *
+ * input      - 4x4 coefficient block.
+ * dst        - destination pixels.
+ * dst_stride - distance between destination rows.
+ *
+ * Two identical passes (transpose, then 1-D transform) cover rows and
+ * columns; the result is rounded by 4 bits and added to dst. */
+void vpx_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst,
+                            int32_t dst_stride) {
+  v8i16 v0, v1, v2, v3;
+  int32_t pass;
+
+  LD4x4_SH(input, v0, v1, v2, v3);
+
+  /* pass 0 handles the rows, pass 1 the columns */
+  for (pass = 0; pass < 2; ++pass) {
+    TRANSPOSE4x4_SH_SH(v0, v1, v2, v3, v0, v1, v2, v3);
+    VP9_IDCT4x4(v0, v1, v2, v3, v0, v1, v2, v3);
+  }
+
+  /* round: add 2^3, then divide by 2^4 */
+  SRARI_H4_SH(v0, v1, v2, v3, 4);
+  ADDBLK_ST4x4_UB(v0, v1, v2, v3, dst, dst_stride);
+}
+
+/* 4x4 inverse DCT + add for a block that only uses input[0].
+ *
+ * input      - coefficient block; only input[0] is read.
+ * dst        - destination pixels.
+ * dst_stride - distance between destination rows.
+ *
+ * The coefficient is scaled by cospi_16_64 twice, rounded by 4 bits,
+ * replicated into a vector and used for all four rows of the add-block. */
+void vpx_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst,
+                           int32_t dst_stride) {
+  int16_t dc;
+  v8i16 dc_vec;
+
+  /* Each step narrows back to int16_t before the next one. */
+  dc = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+  dc = ROUND_POWER_OF_TWO((dc * cospi_16_64), DCT_CONST_BITS);
+  dc = ROUND_POWER_OF_TWO(dc, 4);
+
+  dc_vec = __msa_fill_h(dc);
+  ADDBLK_ST4x4_UB(dc_vec, dc_vec, dc_vec, dc_vec, dst, dst_stride);
+}
diff --git a/libs/libvpx/vpx_dsp/mips/idct8x8_msa.c b/libs/libvpx/vpx_dsp/mips/idct8x8_msa.c
new file mode 100644
index 0000000000..fd667e4566
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/idct8x8_msa.c
@@ -0,0 +1,116 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/inv_txfm_msa.h"
+
+/* Full 8x8 inverse DCT followed by accumulation into dst.
+ *
+ * input      - 8x8 coefficient block.
+ * dst        - destination pixels.
+ * dst_stride - distance between destination rows.
+ *
+ * Two identical passes (transpose, then 1-D transform) cover rows and
+ * columns; the result is rounded by 5 bits and added to dst in two groups
+ * of four rows. */
+void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst,
+                            int32_t dst_stride) {
+  v8i16 v0, v1, v2, v3, v4, v5, v6, v7;
+  int32_t pass;
+
+  LD_SH8(input, 8, v0, v1, v2, v3, v4, v5, v6, v7);
+
+  /* pass 0 handles the rows, pass 1 the columns */
+  for (pass = 0; pass < 2; ++pass) {
+    TRANSPOSE8x8_SH_SH(v0, v1, v2, v3, v4, v5, v6, v7,
+                       v0, v1, v2, v3, v4, v5, v6, v7);
+    VP9_IDCT8x8_1D(v0, v1, v2, v3, v4, v5, v6, v7,
+                   v0, v1, v2, v3, v4, v5, v6, v7);
+  }
+
+  /* round: add 2^4, then divide by 2^5 */
+  SRARI_H4_SH(v0, v1, v2, v3, 5);
+  SRARI_H4_SH(v4, v5, v6, v7, 5);
+
+  /* upper four rows, then lower four rows */
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, v0, v1, v2, v3);
+  dst += (4 * dst_stride);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, v4, v5, v6, v7);
+}
+
+/* Reduced 8x8 inverse DCT followed by accumulation into dst.
+ *
+ * input      - 8x8 coefficient block.
+ * dst        - destination pixels.
+ * dst_stride - distance between destination rows.
+ *
+ * All eight vectors are loaded, but only in0..in3 take part in the first
+ * 1-D pass (stages 1-4 below); the second pass uses the generic
+ * VP9_IDCT8x8_1D on all eight vectors.
+ *
+ * NOTE(review): presumably intended for blocks whose non-zero coefficients
+ * are confined to the top-left corner (the "12" variant) -- confirm against
+ * the caller.  The interleave, dot-product and pack macros are defined
+ * outside this block. */
+void vpx_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst,
+                            int32_t dst_stride) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3;
+  v4i32 tmp0, tmp1, tmp2, tmp3;
+  v8i16 zero = { 0 };
+
+  /* load vector elements of 8x8 block */
+  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  /* stage1 */
+  /* k0..k3 are constant pairs; the 32-bit dot products are rounded by
+   * DCT_CONST_BITS and packed back to 16 bits with zero as the other
+   * PCKEV operand. */
+  ILVL_H2_SH(in3, in0, in2, in1, s0, s1);
+  k0 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+  k2 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+  DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+  SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
+  PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+  PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+  BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5);
+
+  /* stage2 */
+  ILVR_H2_SH(in3, in1, in2, in0, s1, s0);
+  k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+  DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
+  SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
+  PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
+  PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
+  BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3);
+
+  /* stage3 */
+  s0 = __msa_ilvr_h(s6, s5);
+
+  /* Only k1 is rebuilt here; k0 still holds the (cospi_16_64, cospi_16_64)
+   * pair set up in stage2 and is reused as the second constant. */
+  k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+  DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1);
+  SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS);
+  PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
+
+  /* stage4 */
+  BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7,
+              in0, in1, in2, in3, in4, in5, in6, in7);
+  /* Second 1-D pass over the transposed intermediate result. */
+  TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,
+                 in0, in1, in2, in3, in4, in5, in6, in7);
+
+  /* final rounding (add 2^4, divide by 2^5) and shift */
+  SRARI_H4_SH(in0, in1, in2, in3, 5);
+  SRARI_H4_SH(in4, in5, in6, in7, 5);
+
+  /* add block and store 8x8 */
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
+  dst += (4 * dst_stride);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
+}
+
+/* 8x8 inverse DCT + add for a block that only uses input[0].
+ *
+ * input      - coefficient block; only input[0] is read.
+ * dst        - destination pixels.
+ * dst_stride - distance between destination rows.
+ *
+ * The coefficient is scaled by cospi_16_64 twice, rounded by 5 bits,
+ * replicated into a vector and added to dst in two groups of four rows. */
+void vpx_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst,
+                           int32_t dst_stride) {
+  int16_t dc;
+  int32_t rounded;
+  v8i16 dc_vec;
+
+  /* The two scaling steps narrow to int16_t; the final rounding result is
+   * kept in a 32-bit integer before being replicated. */
+  dc = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+  dc = ROUND_POWER_OF_TWO((dc * cospi_16_64), DCT_CONST_BITS);
+  rounded = ROUND_POWER_OF_TWO(dc, 5);
+  dc_vec = __msa_fill_h(rounded);
+
+  /* upper four rows, then lower four rows */
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, dc_vec, dc_vec, dc_vec, dc_vec);
+  dst += (4 * dst_stride);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, dc_vec, dc_vec, dc_vec, dc_vec);
+}
diff --git a/libs/libvpx/vpx_dsp/mips/intrapred16_dspr2.c b/libs/libvpx/vpx_dsp/mips/intrapred16_dspr2.c
new file mode 100644
index 0000000000..11444c718e
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/intrapred16_dspr2.c
@@ -0,0 +1,329 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+/* Horizontal intra predictor for a 16x16 block (MIPS DSPr2).
+ *
+ * dst    - destination block; each row is written with four 32-bit stores,
+ *          so every row start must be suitable for word stores.
+ * stride - distance in bytes between destination rows.
+ * above  - unused by this predictor.
+ * left   - 16 bytes, one per destination row.
+ *
+ * Row r of the block is filled with left[r]: each byte is loaded, replicated
+ * into all four byte lanes of a register with replv.qb, and stored four
+ * times to cover the 16 pixels of the row.
+ *
+ * The asm advances %[dst] between rows and writes memory, so dst is declared
+ * as an early-clobber read-write operand and "memory" is listed as
+ * clobbered; declaring dst as a plain input (as before) is not allowed for a
+ * register the asm modifies. */
+void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  int32_t  tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+  int32_t  tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+
+  __asm__ __volatile__ (
+      /* load the 16 left-column bytes */
+      "lb         %[tmp1],      (%[left])                    \n\t"
+      "lb         %[tmp2],      1(%[left])                   \n\t"
+      "lb         %[tmp3],      2(%[left])                   \n\t"
+      "lb         %[tmp4],      3(%[left])                   \n\t"
+      "lb         %[tmp5],      4(%[left])                   \n\t"
+      "lb         %[tmp6],      5(%[left])                   \n\t"
+      "lb         %[tmp7],      6(%[left])                   \n\t"
+      "lb         %[tmp8],      7(%[left])                   \n\t"
+      "lb         %[tmp9],      8(%[left])                   \n\t"
+      "lb         %[tmp10],     9(%[left])                   \n\t"
+      "lb         %[tmp11],     10(%[left])                  \n\t"
+      "lb         %[tmp12],     11(%[left])                  \n\t"
+      "lb         %[tmp13],     12(%[left])                  \n\t"
+      "lb         %[tmp14],     13(%[left])                  \n\t"
+      "lb         %[tmp15],     14(%[left])                  \n\t"
+      "lb         %[tmp16],     15(%[left])                  \n\t"
+
+      /* replicate each byte across the four byte lanes of its register */
+      "replv.qb   %[tmp1],      %[tmp1]                      \n\t"
+      "replv.qb   %[tmp2],      %[tmp2]                      \n\t"
+      "replv.qb   %[tmp3],      %[tmp3]                      \n\t"
+      "replv.qb   %[tmp4],      %[tmp4]                      \n\t"
+      "replv.qb   %[tmp5],      %[tmp5]                      \n\t"
+      "replv.qb   %[tmp6],      %[tmp6]                      \n\t"
+      "replv.qb   %[tmp7],      %[tmp7]                      \n\t"
+      "replv.qb   %[tmp8],      %[tmp8]                      \n\t"
+      "replv.qb   %[tmp9],      %[tmp9]                      \n\t"
+      "replv.qb   %[tmp10],     %[tmp10]                     \n\t"
+      "replv.qb   %[tmp11],     %[tmp11]                     \n\t"
+      "replv.qb   %[tmp12],     %[tmp12]                     \n\t"
+      "replv.qb   %[tmp13],     %[tmp13]                     \n\t"
+      "replv.qb   %[tmp14],     %[tmp14]                     \n\t"
+      "replv.qb   %[tmp15],     %[tmp15]                     \n\t"
+      "replv.qb   %[tmp16],     %[tmp16]                     \n\t"
+
+      /* write 16 rows: four word stores per row, then step to the next row */
+      "sw         %[tmp1],      (%[dst])                     \n\t"
+      "sw         %[tmp1],      4(%[dst])                    \n\t"
+      "sw         %[tmp1],      8(%[dst])                    \n\t"
+      "sw         %[tmp1],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp2],      (%[dst])                     \n\t"
+      "sw         %[tmp2],      4(%[dst])                    \n\t"
+      "sw         %[tmp2],      8(%[dst])                    \n\t"
+      "sw         %[tmp2],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp3],      (%[dst])                     \n\t"
+      "sw         %[tmp3],      4(%[dst])                    \n\t"
+      "sw         %[tmp3],      8(%[dst])                    \n\t"
+      "sw         %[tmp3],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp4],      (%[dst])                     \n\t"
+      "sw         %[tmp4],      4(%[dst])                    \n\t"
+      "sw         %[tmp4],      8(%[dst])                    \n\t"
+      "sw         %[tmp4],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp5],      (%[dst])                     \n\t"
+      "sw         %[tmp5],      4(%[dst])                    \n\t"
+      "sw         %[tmp5],      8(%[dst])                    \n\t"
+      "sw         %[tmp5],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp6],      (%[dst])                     \n\t"
+      "sw         %[tmp6],      4(%[dst])                    \n\t"
+      "sw         %[tmp6],      8(%[dst])                    \n\t"
+      "sw         %[tmp6],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp7],      (%[dst])                     \n\t"
+      "sw         %[tmp7],      4(%[dst])                    \n\t"
+      "sw         %[tmp7],      8(%[dst])                    \n\t"
+      "sw         %[tmp7],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp8],      (%[dst])                     \n\t"
+      "sw         %[tmp8],      4(%[dst])                    \n\t"
+      "sw         %[tmp8],      8(%[dst])                    \n\t"
+      "sw         %[tmp8],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp9],      (%[dst])                     \n\t"
+      "sw         %[tmp9],      4(%[dst])                    \n\t"
+      "sw         %[tmp9],      8(%[dst])                    \n\t"
+      "sw         %[tmp9],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp10],     (%[dst])                     \n\t"
+      "sw         %[tmp10],     4(%[dst])                    \n\t"
+      "sw         %[tmp10],     8(%[dst])                    \n\t"
+      "sw         %[tmp10],     12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp11],     (%[dst])                     \n\t"
+      "sw         %[tmp11],     4(%[dst])                    \n\t"
+      "sw         %[tmp11],     8(%[dst])                    \n\t"
+      "sw         %[tmp11],     12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp12],     (%[dst])                     \n\t"
+      "sw         %[tmp12],     4(%[dst])                    \n\t"
+      "sw         %[tmp12],     8(%[dst])                    \n\t"
+      "sw         %[tmp12],     12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp13],     (%[dst])                     \n\t"
+      "sw         %[tmp13],     4(%[dst])                    \n\t"
+      "sw         %[tmp13],     8(%[dst])                    \n\t"
+      "sw         %[tmp13],     12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp14],     (%[dst])                     \n\t"
+      "sw         %[tmp14],     4(%[dst])                    \n\t"
+      "sw         %[tmp14],     8(%[dst])                    \n\t"
+      "sw         %[tmp14],     12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp15],     (%[dst])                     \n\t"
+      "sw         %[tmp15],     4(%[dst])                    \n\t"
+      "sw         %[tmp15],     8(%[dst])                    \n\t"
+      "sw         %[tmp15],     12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp16],     (%[dst])                     \n\t"
+      "sw         %[tmp16],     4(%[dst])                    \n\t"
+      "sw         %[tmp16],     8(%[dst])                    \n\t"
+      "sw         %[tmp16],     12(%[dst])                   \n\t"
+
+      : [tmp1] "=&r" (tmp1),   [tmp2] "=&r" (tmp2),
+        [tmp3] "=&r" (tmp3),   [tmp4] "=&r" (tmp4),
+        [tmp5] "=&r" (tmp5),   [tmp7] "=&r" (tmp7),
+        [tmp6] "=&r" (tmp6),   [tmp8] "=&r" (tmp8),
+        [tmp9] "=&r" (tmp9),   [tmp10] "=&r" (tmp10),
+        [tmp11] "=&r" (tmp11), [tmp12] "=&r" (tmp12),
+        [tmp13] "=&r" (tmp13), [tmp14] "=&r" (tmp14),
+        [tmp15] "=&r" (tmp15), [tmp16] "=&r" (tmp16),
+        /* dst is advanced inside the asm while stride is still needed, so
+         * it must be read-write and must not share a register with any
+         * input. */
+        [dst] "+&r" (dst)
+      : [left] "r" (left), [stride] "r" (stride)
+      : "memory"
+  );
+}
+
+/* DC intra predictor for a 16x16 block (MIPS DSPr2 inline assembly).
+ *
+ * Sums the 16 bytes at above[0..15] and the 16 bytes at left[0..15], adds 16
+ * and shifts right by 5 (a rounded divide by 32), then writes the resulting
+ * byte to 16 columns of 16 consecutive rows starting at dst.
+ *
+ *   dst    - destination block; written with word stores (sw), so every row
+ *            start has to be word aligned
+ *   stride - byte distance between consecutive destination rows
+ *   above  - 16 pixels above the block; read with lw, so word aligned
+ *   left   - 16 pixels left of the block; read with lw, so word aligned
+ */
+void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  int32_t  expected_dc;
+  int32_t  average;
+  int32_t  tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+  int32_t  above2, left2;
+
+  __asm__ __volatile__ (
+      /* Bytes 0..7 of each edge. */
+      "lw              %[above1],           (%[above])                    \n\t"
+      "lw              %[above2],           4(%[above])                   \n\t"
+      "lw              %[left1],            (%[left])                     \n\t"
+      "lw              %[left2],            4(%[left])                    \n\t"
+
+      /* Zero-extend each word's left/right byte pair to two halfwords. */
+      "preceu.ph.qbl   %[above_l1],         %[above1]                     \n\t"
+      "preceu.ph.qbr   %[above_r1],         %[above1]                     \n\t"
+      "preceu.ph.qbl   %[left_l1],          %[left1]                      \n\t"
+      "preceu.ph.qbr   %[left_r1],          %[left1]                      \n\t"
+
+      /* average holds two independent halfword partial sums from here on. */
+      "addu.ph         %[average],          %[above_r1],     %[above_l1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[left_l1]   \n\t"
+      "addu.ph         %[average],          %[average],      %[left_r1]   \n\t"
+
+      "preceu.ph.qbl   %[above_l1],         %[above2]                     \n\t"
+      "preceu.ph.qbr   %[above_r1],         %[above2]                     \n\t"
+      "preceu.ph.qbl   %[left_l1],          %[left2]                      \n\t"
+      "preceu.ph.qbr   %[left_r1],          %[left2]                      \n\t"
+
+      "addu.ph         %[average],          %[average],      %[above_l1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[above_r1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[left_l1]   \n\t"
+      "addu.ph         %[average],          %[average],      %[left_r1]   \n\t"
+
+      /* Bytes 8..15 of each edge, accumulated the same way. */
+      "lw              %[above1],           8(%[above])                   \n\t"
+      "lw              %[above2],           12(%[above])                  \n\t"
+      "lw              %[left1],            8(%[left])                    \n\t"
+      "lw              %[left2],            12(%[left])                   \n\t"
+
+      "preceu.ph.qbl   %[above_l1],         %[above1]                     \n\t"
+      "preceu.ph.qbr   %[above_r1],         %[above1]                     \n\t"
+      "preceu.ph.qbl   %[left_l1],          %[left1]                      \n\t"
+      "preceu.ph.qbr   %[left_r1],          %[left1]                      \n\t"
+
+      "addu.ph         %[average],          %[average],      %[above_l1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[above_r1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[left_l1]   \n\t"
+      "addu.ph         %[average],          %[average],      %[left_r1]   \n\t"
+
+      "preceu.ph.qbl   %[above_l1],         %[above2]                     \n\t"
+      "preceu.ph.qbr   %[above_r1],         %[above2]                     \n\t"
+      "preceu.ph.qbl   %[left_l1],          %[left2]                      \n\t"
+      "preceu.ph.qbr   %[left_r1],          %[left2]                      \n\t"
+
+      "addu.ph         %[average],          %[average],      %[above_l1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[above_r1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[left_l1]   \n\t"
+      "addu.ph         %[average],          %[average],      %[left_r1]   \n\t"
+
+      /* Add the rounding term to the low halfword, fold the high halfword
+       * into it, divide by 32 and replicate the low byte of the result into
+       * all four bytes of expected_dc (replv.qb only reads bits 7..0, so the
+       * leftover high-halfword bits are ignored). */
+      "addiu           %[average],          %[average],      16           \n\t"
+      "srl             %[tmp],              %[average],      16           \n\t"
+      "addu.ph         %[average],          %[tmp],          %[average]   \n\t"
+      "srl             %[expected_dc],      %[average],      5            \n\t"
+      "replv.qb        %[expected_dc],      %[expected_dc]                \n\t"
+
+      /* Rows 0..15: four word stores per row, dst += stride between rows. */
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      /* dst is advanced by the asm, so it must be a read-write operand and
+       * not an input; "&" keeps it from sharing a register with any input. */
+      : [left1] "=&r" (left1), [above1] "=&r" (above1),
+        [left_l1] "=&r" (left_l1), [above_l1] "=&r" (above_l1),
+        [left_r1] "=&r" (left_r1), [above_r1] "=&r" (above_r1),
+        [above2] "=&r" (above2), [left2] "=&r" (left2),
+        [average] "=&r" (average), [tmp] "=&r" (tmp),
+        [expected_dc] "=&r" (expected_dc), [dst] "+&r" (dst)
+      : [above] "r" (above), [left] "r" (left), [stride] "r" (stride)
+      /* The asm loads from above/left and stores through dst. */
+      : "memory"
+  );
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vpx_dsp/mips/intrapred4_dspr2.c b/libs/libvpx/vpx_dsp/mips/intrapred4_dspr2.c
new file mode 100644
index 0000000000..03baf4c9cc
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/intrapred4_dspr2.c
@@ -0,0 +1,229 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+/* Horizontal intra predictor for a 4x4 block (MIPS DSPr2 inline assembly).
+ *
+ * Row r (r = 0..3) of the destination is filled with four copies of the
+ * byte left[r].
+ *
+ *   dst    - destination block; each row is written with one word store
+ *            (sw), so every row start has to be word aligned
+ *   stride - byte distance between consecutive destination rows
+ *   above  - unused by this predictor
+ *   left   - 4 pixels left of the block, read one byte at a time
+ */
+void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int32_t  tmp1, tmp2, tmp3, tmp4;
+  (void)above;
+
+  __asm__ __volatile__ (
+      /* lb sign-extends, but replv.qb only replicates bits 7..0, so the
+       * extension has no effect on the stored bytes. */
+      "lb         %[tmp1],      (%[left])                    \n\t"
+      "lb         %[tmp2],      1(%[left])                   \n\t"
+      "lb         %[tmp3],      2(%[left])                   \n\t"
+      "lb         %[tmp4],      3(%[left])                   \n\t"
+      "replv.qb   %[tmp1],      %[tmp1]                      \n\t"
+      "replv.qb   %[tmp2],      %[tmp2]                      \n\t"
+      "replv.qb   %[tmp3],      %[tmp3]                      \n\t"
+      "replv.qb   %[tmp4],      %[tmp4]                      \n\t"
+      /* One word store per row, dst += stride between rows. */
+      "sw         %[tmp1],      (%[dst])                     \n\t"
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp2],      (%[dst])                     \n\t"
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp3],      (%[dst])                     \n\t"
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp4],      (%[dst])                     \n\t"
+
+      /* dst is advanced by the asm, so it must be a read-write operand and
+       * not an input; "&" keeps it from sharing a register with any input. */
+      : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+        [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
+        [dst] "+&r" (dst)
+      : [left] "r" (left), [stride] "r" (stride)
+      /* The asm loads from left and stores through dst. */
+      : "memory"
+  );
+}
+
+/* DC intra predictor for a 4x4 block (MIPS DSPr2 inline assembly).
+ *
+ * Sums the 4 bytes at above[0..3] and the 4 bytes at left[0..3], adds 4 and
+ * shifts right by 3 (a rounded divide by 8), then writes the resulting byte
+ * to 4 columns of 4 consecutive rows starting at dst.
+ *
+ *   dst    - destination block; each row is written with one word store
+ *            (sw), so every row start has to be word aligned
+ *   stride - byte distance between consecutive destination rows
+ *   above  - 4 pixels above the block; read with lw, so word aligned
+ *   left   - 4 pixels left of the block; read with lw, so word aligned
+ */
+void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int32_t  expected_dc;
+  int32_t  average;
+  int32_t  tmp, above_c, above_l, above_r, left_c, left_r, left_l;
+
+  __asm__ __volatile__ (
+      "lw              %[above_c],         (%[above])                    \n\t"
+      "lw              %[left_c],          (%[left])                     \n\t"
+
+      /* Zero-extend each word's left/right byte pair to two halfwords. */
+      "preceu.ph.qbl   %[above_l],         %[above_c]                    \n\t"
+      "preceu.ph.qbr   %[above_r],         %[above_c]                    \n\t"
+      "preceu.ph.qbl   %[left_l],          %[left_c]                     \n\t"
+      "preceu.ph.qbr   %[left_r],          %[left_c]                     \n\t"
+
+      /* Two halfword partial sums, then: add the rounding term to the low
+       * halfword, fold the high halfword into it, divide by 8 and replicate
+       * the low byte (replv.qb only reads bits 7..0). */
+      "addu.ph         %[average],         %[above_r],       %[above_l]  \n\t"
+      "addu.ph         %[average],         %[average],       %[left_l]   \n\t"
+      "addu.ph         %[average],         %[average],       %[left_r]   \n\t"
+      "addiu           %[average],         %[average],       4           \n\t"
+      "srl             %[tmp],             %[average],       16          \n\t"
+      "addu.ph         %[average],         %[tmp],           %[average]  \n\t"
+      "srl             %[expected_dc],     %[average],       3           \n\t"
+      "replv.qb        %[expected_dc],     %[expected_dc]                \n\t"
+
+      /* One word store per row, dst += stride between rows. */
+      "sw              %[expected_dc],     (%[dst])                      \n\t"
+      "add             %[dst],              %[dst],          %[stride]   \n\t"
+      "sw              %[expected_dc],     (%[dst])                      \n\t"
+      "add             %[dst],              %[dst],          %[stride]   \n\t"
+      "sw              %[expected_dc],     (%[dst])                      \n\t"
+      "add             %[dst],              %[dst],          %[stride]   \n\t"
+      "sw              %[expected_dc],     (%[dst])                      \n\t"
+
+      /* dst is advanced by the asm, so it must be a read-write operand and
+       * not an input; "&" keeps it from sharing a register with any input. */
+      : [above_c] "=&r" (above_c), [above_l] "=&r" (above_l),
+        [above_r] "=&r" (above_r), [left_c] "=&r" (left_c),
+        [left_l] "=&r" (left_l), [left_r] "=&r" (left_r),
+        [average] "=&r" (average), [tmp] "=&r" (tmp),
+        [expected_dc] "=&r" (expected_dc), [dst] "+&r" (dst)
+      : [above] "r" (above), [left] "r" (left), [stride] "r" (stride)
+      /* The asm loads from above/left and stores through dst. */
+      : "memory"
+  );
+}
+
+/* TM intra predictor for a 4x4 block (MIPS DSPr2 inline assembly).
+ *
+ * For each of the 4 rows r, computes above[c] + left[r] - above[-1] for the
+ * four above pixels as signed halfwords, uses each result as an index into
+ * the table cm (vpx_ff_cropTbl) and stores the looked-up byte to the
+ * destination row.
+ * NOTE(review): cm is presumably a clamp-to-[0,255] table that accepts
+ * negative indices - confirm against the definition of vpx_ff_cropTbl.
+ *
+ *   dst    - destination block, written one byte at a time
+ *   stride - byte distance between consecutive destination rows
+ *   above  - 4 pixels above the block (read with ulw, so no alignment
+ *            requirement); above[-1] is read as the top-left pixel
+ *   left   - 4 pixels left of the block, read one byte at a time
+ */
+void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int32_t  abovel, abover;
+  int32_t  left0, left1, left2, left3;
+  int32_t  res0, res1;
+  int32_t  resl;
+  int32_t  resr;
+  int32_t  top_left;
+  uint8_t  *cm = vpx_ff_cropTbl;
+
+  __asm__ __volatile__ (
+      /* Load the edge pixels. */
+      "ulw             %[resl],       (%[above])                         \n\t"
+
+      "lbu             %[left0],       (%[left])                         \n\t"
+      "lbu             %[left1],       1(%[left])                        \n\t"
+      "lbu             %[left2],       2(%[left])                        \n\t"
+      "lbu             %[left3],       3(%[left])                        \n\t"
+
+      "lbu             %[top_left],    -1(%[above])                      \n\t"
+
+      /* Zero-extend the left/right byte pairs of the above word to two
+       * halfwords each. */
+      "preceu.ph.qbl   %[abovel],      %[resl]                           \n\t"
+      "preceu.ph.qbr   %[abover],      %[resl]                           \n\t"
+
+      /* Duplicate each scalar into both halfwords for the paired adds. */
+      "replv.ph        %[left0],       %[left0]                          \n\t"
+      "replv.ph        %[left1],       %[left1]                          \n\t"
+      "replv.ph        %[left2],       %[left2]                          \n\t"
+      "replv.ph        %[left3],       %[left3]                          \n\t"
+
+      "replv.ph        %[top_left],    %[top_left]                       \n\t"
+
+      /* Row 0 sums: resl/resr each hold two signed halfword results. */
+      "addu.ph         %[resl],        %[abovel],         %[left0]       \n\t"
+      "subu.ph         %[resl],        %[resl],           %[top_left]    \n\t"
+
+      "addu.ph         %[resr],        %[abover],         %[left0]       \n\t"
+      "subu.ph         %[resr],        %[resr],           %[top_left]    \n\t"
+
+      /* Row 0: sll/sra by 16 sign-extends the low halfword, sra by 16 the
+       * high halfword; each is then looked up through cm. Bytes 0 and 1
+       * come from resr, bytes 2 and 3 from resl. */
+      "sll             %[res0],        %[resr],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+
+      "sra             %[res1],        %[resr],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+      "sb              %[res0],        (%[dst])                          \n\t"
+
+      "sll             %[res0],        %[resl],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+      "sb              %[res1],        1(%[dst])                         \n\t"
+
+      "sra             %[res1],        %[resl],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+
+      /* Row 1 sums are formed before row 0's last two bytes are stored. */
+      "addu.ph         %[resl],        %[abovel],         %[left1]       \n\t"
+      "subu.ph         %[resl],        %[resl],           %[top_left]    \n\t"
+
+      "addu.ph         %[resr],        %[abover],         %[left1]       \n\t"
+      "subu.ph         %[resr],        %[resr],           %[top_left]    \n\t"
+
+      "sb              %[res0],        2(%[dst])                         \n\t"
+      "sb              %[res1],        3(%[dst])                         \n\t"
+
+      "add             %[dst],          %[dst],           %[stride]      \n\t"
+
+      /* Row 1 lookups and stores. */
+      "sll             %[res0],        %[resr],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+
+      "sra             %[res1],        %[resr],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+      "sb              %[res0],        (%[dst])                          \n\t"
+
+      "sll             %[res0],        %[resl],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+
+      "sb              %[res1],        1(%[dst])                         \n\t"
+      "sra             %[res1],        %[resl],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+
+      /* Row 2 sums. */
+      "addu.ph         %[resl],        %[abovel],         %[left2]       \n\t"
+      "subu.ph         %[resl],        %[resl],           %[top_left]    \n\t"
+
+      "addu.ph         %[resr],        %[abover],         %[left2]       \n\t"
+      "subu.ph         %[resr],        %[resr],           %[top_left]    \n\t"
+
+      "sb              %[res0],        2(%[dst])                         \n\t"
+      "sb              %[res1],        3(%[dst])                         \n\t"
+
+      "add             %[dst],          %[dst],           %[stride]      \n\t"
+
+      /* Row 2 lookups and stores. */
+      "sll             %[res0],        %[resr],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+
+      "sra             %[res1],        %[resr],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+      "sb              %[res0],        (%[dst])                          \n\t"
+
+      "sll             %[res0],        %[resl],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+
+      "sb              %[res1],        1(%[dst])                         \n\t"
+      "sra             %[res1],        %[resl],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+
+      /* Row 3 sums. */
+      "addu.ph         %[resl],        %[abovel],        %[left3]        \n\t"
+      "subu.ph         %[resl],        %[resl],          %[top_left]     \n\t"
+
+      "addu.ph         %[resr],        %[abover],        %[left3]        \n\t"
+      "subu.ph         %[resr],        %[resr],          %[top_left]     \n\t"
+
+      "sb              %[res0],        2(%[dst])                         \n\t"
+      "sb              %[res1],        3(%[dst])                         \n\t"
+
+      "add             %[dst],          %[dst],          %[stride]       \n\t"
+
+      /* Row 3 lookups and stores. */
+      "sll             %[res0],        %[resr],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+
+      "sra             %[res1],        %[resr],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+      "sb              %[res0],        (%[dst])                          \n\t"
+
+      "sll             %[res0],        %[resl],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+      "sb              %[res1],        1(%[dst])                         \n\t"
+
+      "sra             %[res1],        %[resl],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+
+      "sb              %[res0],        2(%[dst])                         \n\t"
+      "sb              %[res1],        3(%[dst])                         \n\t"
+
+      /* dst is advanced by the asm, so it must be a read-write operand and
+       * not an input; "&" keeps it from sharing a register with any input. */
+      : [abovel] "=&r" (abovel), [abover] "=&r" (abover),
+        [left0] "=&r" (left0), [left1] "=&r" (left1), [left2] "=&r" (left2),
+        [res0] "=&r" (res0), [res1] "=&r" (res1), [left3] "=&r" (left3),
+        [resl] "=&r" (resl), [resr] "=&r" (resr), [top_left] "=&r" (top_left),
+        [dst] "+&r" (dst)
+      : [above] "r" (above), [left] "r" (left),
+        [stride] "r" (stride), [cm] "r" (cm)
+      /* The asm loads from above/left/cm and stores through dst. */
+      : "memory"
+  );
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vpx_dsp/mips/intrapred8_dspr2.c b/libs/libvpx/vpx_dsp/mips/intrapred8_dspr2.c
new file mode 100644
index 0000000000..196ff5a062
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/intrapred8_dspr2.c
@@ -0,0 +1,607 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+// Horizontal 8x8 predictor: row r of dst is filled with 8 copies of left[r].
+void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int32_t  tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+
+  __asm__ __volatile__ (
+      "lb         %[tmp1],      (%[left])                   \n\t"
+      "lb         %[tmp2],      1(%[left])                  \n\t"
+      "lb         %[tmp3],      2(%[left])                  \n\t"
+      "lb         %[tmp4],      3(%[left])                  \n\t"
+      "lb         %[tmp5],      4(%[left])                  \n\t"
+      "lb         %[tmp6],      5(%[left])                  \n\t"
+      "lb         %[tmp7],      6(%[left])                  \n\t"
+      "lb         %[tmp8],      7(%[left])                  \n\t"
+      /* replv.qb copies the low byte of each register into all four bytes */
+      "replv.qb   %[tmp1],      %[tmp1]                     \n\t"
+      "replv.qb   %[tmp2],      %[tmp2]                     \n\t"
+      "replv.qb   %[tmp3],      %[tmp3]                     \n\t"
+      "replv.qb   %[tmp4],      %[tmp4]                     \n\t"
+      "replv.qb   %[tmp5],      %[tmp5]                     \n\t"
+      "replv.qb   %[tmp6],      %[tmp6]                     \n\t"
+      "replv.qb   %[tmp7],      %[tmp7]                     \n\t"
+      "replv.qb   %[tmp8],      %[tmp8]                     \n\t"
+      /* per row: two word stores (8 pixels), then dst advances by stride */
+      "sw         %[tmp1],      (%[dst])                    \n\t"
+      "sw         %[tmp1],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp2],      (%[dst])                    \n\t"
+      "sw         %[tmp2],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp3],      (%[dst])                    \n\t"
+      "sw         %[tmp3],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp4],      (%[dst])                    \n\t"
+      "sw         %[tmp4],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp5],      (%[dst])                    \n\t"
+      "sw         %[tmp5],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp6],      (%[dst])                    \n\t"
+      "sw         %[tmp6],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp7],      (%[dst])                    \n\t"
+      "sw         %[tmp7],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp8],      (%[dst])                    \n\t"
+      "sw         %[tmp8],      4(%[dst])                   \n\t"
+      : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+        [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
+        [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7),
+        [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8), [dst] "+&r" (dst)
+      : [left] "r" (left), [stride] "r" (stride)
+      : "memory"  // the asm rewrites dst (hence in/out) and stores through it
+  );
+}
+
+// DC 8x8 predictor: dst = (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4.
+void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int32_t  expected_dc;
+  int32_t  average;
+  int32_t  tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+  int32_t  above2, above_l2, above_r2, left2, left_r2, left_l2;
+
+  __asm__ __volatile__ (
+      "lw              %[above1],         (%[above])                      \n\t"
+      "lw              %[above2],         4(%[above])                     \n\t"
+      "lw              %[left1],          (%[left])                       \n\t"
+      "lw              %[left2],          4(%[left])                      \n\t"
+      /* widen the 16 edge bytes into 16-bit lanes, two lanes per register */
+      "preceu.ph.qbl   %[above_l1],       %[above1]                       \n\t"
+      "preceu.ph.qbr   %[above_r1],       %[above1]                       \n\t"
+      "preceu.ph.qbl   %[left_l1],        %[left1]                        \n\t"
+      "preceu.ph.qbr   %[left_r1],        %[left1]                        \n\t"
+
+      "preceu.ph.qbl   %[above_l2],       %[above2]                       \n\t"
+      "preceu.ph.qbr   %[above_r2],       %[above2]                       \n\t"
+      "preceu.ph.qbl   %[left_l2],        %[left2]                        \n\t"
+      "preceu.ph.qbr   %[left_r2],        %[left2]                        \n\t"
+      /* lane-wise accumulation: average ends up holding two partial sums */
+      "addu.ph         %[average],        %[above_r1],      %[above_l1]   \n\t"
+      "addu.ph         %[average],        %[average],       %[left_l1]    \n\t"
+      "addu.ph         %[average],        %[average],       %[left_r1]    \n\t"
+
+      "addu.ph         %[average],        %[average],       %[above_l2]   \n\t"
+      "addu.ph         %[average],        %[average],       %[above_r2]   \n\t"
+      "addu.ph         %[average],        %[average],       %[left_l2]    \n\t"
+      "addu.ph         %[average],        %[average],       %[left_r2]    \n\t"
+      /* rounding bias for the divide by 16; it lands in the low lane */
+      "addiu           %[average],        %[average],       8             \n\t"
+      /* fold the high lane into the low one, shift by 4, splat the low byte */
+      "srl             %[tmp],            %[average],       16            \n\t"
+      "addu.ph         %[average],        %[tmp],           %[average]    \n\t"
+      "srl             %[expected_dc],    %[average],       4             \n\t"
+      "replv.qb        %[expected_dc],    %[expected_dc]                  \n\t"
+      /* the same two words are stored on each of the 8 rows */
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+      : [above1] "=&r" (above1), [above_l1] "=&r" (above_l1),
+        [above_r1] "=&r" (above_r1), [left1] "=&r" (left1),
+        [left_l1] "=&r" (left_l1), [left_r1] "=&r" (left_r1),
+        [above2] "=&r" (above2), [above_l2] "=&r" (above_l2),
+        [above_r2] "=&r" (above_r2), [left2] "=&r" (left2),
+        [left_l2] "=&r" (left_l2), [left_r2] "=&r" (left_r2),
+        [average] "=&r" (average), [tmp] "=&r" (tmp),
+        [expected_dc] "=&r" (expected_dc), [dst] "+&r" (dst)
+      : [above] "r" (above), [left] "r" (left), [stride] "r" (stride)
+      : "memory"  // the asm rewrites dst (hence in/out) and stores through it
+  );
+}
+
+// TrueMotion 8x8 predictor: per pixel, left[r] + above[c] - above[-1], mapped
+// through the vpx_ff_cropTbl lookup (presumably a clamp to 0..255 - confirm).
+void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int32_t   abovel, abover;
+  int32_t   abovel_1, abover_1;
+  int32_t   left0;
+  int32_t   res0, res1, res2, res3;
+  int32_t   reshw;
+  int32_t   top_left;
+  uint8_t   *cm = vpx_ff_cropTbl;
+
+  __asm__ __volatile__ (
+      "ulw             %[reshw],       (%[above])                         \n\t"
+      "ulw             %[top_left],    4(%[above])                        \n\t"
+      "lbu             %[left0],       (%[left])                          \n\t"
+      /* widen above[0..7] into four registers of two 16-bit lanes each */
+      "preceu.ph.qbl   %[abovel],      %[reshw]                           \n\t"
+      "preceu.ph.qbr   %[abover],      %[reshw]                           \n\t"
+      "preceu.ph.qbl   %[abovel_1],    %[top_left]                        \n\t"
+      "preceu.ph.qbr   %[abover_1],    %[top_left]                        \n\t"
+      /* top_left is reused: from here on it holds above[-1] in both lanes */
+      "lbu             %[top_left],    -1(%[above])                       \n\t"
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "replv.ph        %[top_left],    %[top_left]                        \n\t"
+      /* row 0, columns 0..3: two lanes at a time, split, look up, store */
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+      /* row 0, columns 4..7 */
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+      /* left0 is no longer needed for this row: load left[] for the next */
+      "lbu             %[left0],       1(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+      /* row 1, columns 0..3 */
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+      /* row 1, columns 4..7 */
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbu             %[left0],       2(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+      /* row 2, columns 0..3 */
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+      /* row 2, columns 4..7 */
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbu             %[left0],       3(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+      /* row 3, columns 0..3 */
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+      /* row 3, columns 4..7 */
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbu             %[left0],       4(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+      /* row 4, columns 0..3 */
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+      /* row 4, columns 4..7 */
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbu             %[left0],       5(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+      /* row 5, columns 0..3 */
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+      /* row 5, columns 4..7 */
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbu             %[left0],       6(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+      /* row 6, columns 0..3 */
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+      /* row 6, columns 4..7 */
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbu             %[left0],       7(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+      /* row 7, columns 0..3 */
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+      /* row 7, columns 4..7: last row, so no further left[] load */
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+      : [abovel] "=&r" (abovel), [abover] "=&r" (abover),
+        [abovel_1] "=&r" (abovel_1), [abover_1] "=&r" (abover_1),
+        [left0] "=&r" (left0), [res2] "=&r" (res2), [res3] "=&r" (res3),
+        [res0] "=&r" (res0), [res1] "=&r" (res1),
+        [reshw] "=&r" (reshw), [top_left] "=&r" (top_left), [dst] "+&r" (dst)
+      : [above] "r" (above), [left] "r" (left),
+        [stride] "r" (stride), [cm] "r" (cm)
+      : "memory"  // the asm rewrites dst (hence in/out) and stores through it
+  );
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vpx_dsp/mips/intrapred_msa.c b/libs/libvpx/vpx_dsp/mips/intrapred_msa.c
new file mode 100644
index 0000000000..f6fbe40162
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/intrapred_msa.c
@@ -0,0 +1,737 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define IPRED_SUBS_UH2_UH(sub0, sub1, acc0, acc1) { /* accN -= subN */  \
+  acc0 = __msa_subs_u_h(acc0, sub0);                                    \
+  acc1 = __msa_subs_u_h(acc1, sub1);                                    \
+}
+
+static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride) {
+  // Vertical 4x4: fetch the 4 bytes at src as one word with LW, then give
+  // SW4 that same word for each of its four stores into dst.
+  uint32_t above_word = LW(src);
+
+  SW4(above_word, above_word, above_word, above_word, dst, dst_stride);
+}
+
+static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride) {
+  // Vertical 8x8: the 8 bytes at src are fetched once as two words and the
+  // same pair is written at dst and dst + 4 on each of 8 rows.
+  uint32_t word0 = LW(src);
+  uint32_t word1 = LW(src + 4);
+  uint32_t rows_left = 8;
+
+  while (rows_left--) {
+    SW(word0, dst);
+    SW(word1, (dst + 4));
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride) {
+  // Vertical 16x16: load one 16-byte vector from src and store it at the
+  // start of each of 16 rows of dst.
+  const v16u8 above_vec = LD_UB(src);
+  uint32_t rows_left = 16;
+
+  while (rows_left--) {
+    ST_UB(above_vec, dst);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride) {
+  // Vertical 32x32: the 32 bytes at src are held in two vectors and the pair
+  // is passed to ST_UB2 (16-byte step) on each of 32 rows of dst.
+  const v16u8 above_vec0 = LD_UB(src);
+  const v16u8 above_vec1 = LD_UB(src + 16);
+  uint32_t rows_left = 32;
+
+  while (rows_left--) {
+    ST_UB2(above_vec0, above_vec1, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  // Horizontal 4x4: word i is src[i] replicated into all 4 of its bytes.
+  // Unsigned multiplier: int * 0x01010101 would overflow for pixels >= 128.
+  uint32_t word0 = src[0] * 0x01010101u;
+  uint32_t word1 = src[1] * 0x01010101u;
+  uint32_t word2 = src[2] * 0x01010101u;
+  uint32_t word3 = src[3] * 0x01010101u;
+
+  SW4(word0, word1, word2, word3, dst, dst_stride);
+}
+
+static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  // Horizontal 8x8: doubleword i is src[i] replicated into all 8 of its
+  // bytes (multiply by 0x0101...01). SD4 is invoked twice, four doublewords
+  // per call, with dst advanced by four strides in between.
+  const uint64_t splat = 0x0101010101010101ull;
+  uint64_t d0 = src[0] * splat, d1 = src[1] * splat;
+  uint64_t d2 = src[2] * splat, d3 = src[3] * splat;
+  uint64_t d4 = src[4] * splat, d5 = src[5] * splat;
+  uint64_t d6 = src[6] * splat, d7 = src[7] * splat;
+
+  // Doublewords built from src[0..3].
+  SD4(d0, d1, d2, d3, dst, dst_stride);
+  dst += (4 * dst_stride);
+  // Doublewords built from src[4..7].
+  SD4(d4, d5, d6, d7, dst, dst_stride);
+}
+
+static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  // Horizontal 16x16: each byte of src is broadcast over a 16-byte vector by
+  // __msa_fill_b and that vector is handed to ST_UB4 as one row. A pass of
+  // the loop consumes four bytes of src and advances dst by four strides, so
+  // the four passes cover src[0..15].
+  uint32_t groups_left = 4;
+  v16u8 row0, row1, row2, row3;
+
+  while (groups_left--) {
+    // Broadcast the next four bytes of src.
+    row0 = (v16u8)__msa_fill_b(src[0]);
+    row1 = (v16u8)__msa_fill_b(src[1]);
+    row2 = (v16u8)__msa_fill_b(src[2]);
+    row3 = (v16u8)__msa_fill_b(src[3]);
+    src += 4;
+
+    // Store the four vectors and step past them.
+    ST_UB4(row0, row1, row2, row3, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  // Horizontal 32x32: each byte of src is broadcast over a 16-byte vector by
+  // __msa_fill_b; ST_UB2 is given that vector twice (16-byte step) per row.
+  // A pass of the loop consumes four bytes of src and advances dst by four
+  // strides, so the eight passes cover src[0..31].
+  uint32_t groups_left = 8;
+  v16u8 row0, row1, row2, row3;
+
+  while (groups_left--) {
+    // Broadcast the next four bytes of src.
+    row0 = (v16u8)__msa_fill_b(src[0]);
+    row1 = (v16u8)__msa_fill_b(src[1]);
+    row2 = (v16u8)__msa_fill_b(src[2]);
+    row3 = (v16u8)__msa_fill_b(src[3]);
+    src += 4;
+
+    // One ST_UB2 per row, stepping dst by one stride after each.
+    ST_UB2(row0, row0, dst, 16);
+    dst += dst_stride;
+    ST_UB2(row1, row1, dst, 16);
+    dst += dst_stride;
+    ST_UB2(row2, row2, dst, 16);
+    dst += dst_stride;
+    ST_UB2(row3, row3, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  // DC 4x4: the 4 bytes at src_top and 4 at src_left are summed by the
+  // widening hadd chain, round-shifted by 3, and byte 0 is splatted to dst.
+  uint32_t top_word = LW(src_top), left_word = LW(src_left), dc_word;
+  v16i8 pixels = { 0 }, dc_bytes;
+  v8u16 acc_h;
+  v4u32 acc_w;
+  v2u64 acc_d;
+
+  INSERT_W2_SB(top_word, left_word, pixels);
+  acc_h = __msa_hadd_u_h((v16u8)pixels, (v16u8)pixels);
+  acc_w = __msa_hadd_u_w(acc_h, acc_h);
+  acc_d = __msa_hadd_u_d(acc_w, acc_w);
+  acc_w = (v4u32)__msa_srari_w((v4i32)acc_d, 3);
+  dc_bytes = __msa_splati_b((v16i8)acc_w, 0);
+  dc_word = __msa_copy_u_w((v4i32)dc_bytes, 0);
+
+  SW4(dc_word, dc_word, dc_word, dc_word, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  // One-edge DC 4x4: the 4 bytes at src are summed by the widening hadd
+  // chain, round-shifted by 2, and byte 0 is splatted to dst.
+  uint32_t edge_word = LW(src), dc_word;
+  v16i8 pixels = { 0 }, dc_bytes;
+  v8u16 acc_h;
+  v4u32 acc_w;
+
+  pixels = (v16i8)__msa_insert_w((v4i32)pixels, 0, edge_word);
+  acc_h = __msa_hadd_u_h((v16u8)pixels, (v16u8)pixels);
+  acc_w = __msa_hadd_u_w(acc_h, acc_h);
+  acc_w = (v4u32)__msa_srari_w((v4i32)acc_w, 2);
+  dc_bytes = __msa_splati_b((v16i8)acc_w, 0);
+  dc_word = __msa_copy_u_w((v4i32)dc_bytes, 0);
+  SW4(dc_word, dc_word, dc_word, dc_word, dst, dst_stride);
+}
+
+static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
+  // Constant-DC 4x4: build a vector with __msa_ldi_b(128), take its word 0,
+  // and give SW4 that word for each of its four stores into dst.
+  const v16i8 fill_bytes = __msa_ldi_b(128);
+  uint32_t fill_word = __msa_copy_u_w((v4i32)fill_bytes, 0);
+
+  SW4(fill_word, fill_word, fill_word, fill_word, dst, dst_stride);
+}
+
+static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  // DC 8x8: 8 bytes of src_top and 8 of src_left are summed via the hadd
+  // chain (pckev_w folds the two 64-bit partials), then round-shifted by 4.
+  uint64_t top_dword = LD(src_top), left_dword = LD(src_left), dc_dword;
+  v16i8 dc_bytes;
+  v16u8 pixels = { 0 };
+  v8u16 acc_h;
+  v4u32 acc_w;
+  v2u64 acc_d;
+
+  INSERT_D2_UB(top_dword, left_dword, pixels);
+  acc_h = __msa_hadd_u_h(pixels, pixels);
+  acc_w = __msa_hadd_u_w(acc_h, acc_h);
+  acc_d = __msa_hadd_u_d(acc_w, acc_w);
+  acc_w = (v4u32)__msa_pckev_w((v4i32)acc_d, (v4i32)acc_d);
+  acc_d = __msa_hadd_u_d(acc_w, acc_w);
+  acc_w = (v4u32)__msa_srari_w((v4i32)acc_d, 4);
+  dc_bytes = __msa_splati_b((v16i8)acc_w, 0);
+  dc_dword = __msa_copy_u_d((v2i64)dc_bytes, 0);
+
+  SD4(dc_dword, dc_dword, dc_dword, dc_dword, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(dc_dword, dc_dword, dc_dword, dc_dword, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  // One-edge DC 8x8: the 8 bytes at src are summed by the widening hadd
+  // chain, round-shifted by 3, and byte 0 is splatted into 8 doublewords.
+  uint64_t edge_dword = LD(src), dc_dword;
+  v16i8 dc_bytes;
+  v16u8 pixels = { 0 };
+  v8u16 acc_h;
+  v4u32 acc_w;
+  v2u64 acc_d;
+
+  pixels = (v16u8)__msa_insert_d((v2i64)pixels, 0, edge_dword);
+  acc_h = __msa_hadd_u_h(pixels, pixels);
+  acc_w = __msa_hadd_u_w(acc_h, acc_h);
+  acc_d = __msa_hadd_u_d(acc_w, acc_w);
+  acc_w = (v4u32)__msa_srari_w((v4i32)acc_d, 3);
+  dc_bytes = __msa_splati_b((v16i8)acc_w, 0);
+  dc_dword = __msa_copy_u_d((v2i64)dc_bytes, 0);
+  SD4(dc_dword, dc_dword, dc_dword, dc_dword, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(dc_dword, dc_dword, dc_dword, dc_dword, dst, dst_stride);
+}
+
+static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
+  // Constant-DC 8x8: doubleword 0 of an __msa_ldi_b(128) vector is given to
+  // SD4 twice, four doublewords per call, dst moving 4 strides in between.
+  const v16i8 fill_bytes = __msa_ldi_b(128);
+  uint64_t fill64 = __msa_copy_u_d((v2i64)fill_bytes, 0);
+
+  SD4(fill64, fill64, fill64, fill64, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(fill64, fill64, fill64, fill64, dst, dst_stride);
+}
+
+static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  // DC 16x16: pairwise sums of the top and left vectors are added together,
+  // reduced to one total, round-shifted by 5 and splatted into 16 rows.
+  v16u8 above = LD_UB(src_top), beside = LD_UB(src_left), dc_row;
+  v8u16 acc_h, acc_top, acc_left;
+  v4u32 acc_w;
+  v2u64 acc_d;
+
+  HADD_UB2_UH(above, beside, acc_top, acc_left);
+  acc_h = acc_top + acc_left;
+  acc_w = __msa_hadd_u_w(acc_h, acc_h);
+  acc_d = __msa_hadd_u_d(acc_w, acc_w);
+  acc_w = (v4u32)__msa_pckev_w((v4i32)acc_d, (v4i32)acc_d);
+  acc_d = __msa_hadd_u_d(acc_w, acc_w);
+  acc_w = (v4u32)__msa_srari_w((v4i32)acc_d, 5);
+  dc_row = (v16u8)__msa_splati_b((v16i8)acc_w, 0);
+
+  ST_UB8(dc_row, dc_row, dc_row, dc_row, dc_row, dc_row, dc_row, dc_row,
+         dst, dst_stride); dst += (8 * dst_stride);
+  ST_UB8(dc_row, dc_row, dc_row, dc_row, dc_row, dc_row, dc_row, dc_row,
+         dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  // One-edge DC 16x16: the 16 bytes at src are summed by the hadd chain,
+  // round-shifted by 4, and the result byte is splatted into 16 rows.
+  v16u8 edge = LD_UB(src), fill;
+  v8u16 acc_h;
+  v4u32 acc_w;
+  v2u64 acc_d;
+
+  acc_h = __msa_hadd_u_h(edge, edge);
+  acc_w = __msa_hadd_u_w(acc_h, acc_h);
+  acc_d = __msa_hadd_u_d(acc_w, acc_w);
+  acc_w = (v4u32)__msa_pckev_w((v4i32)acc_d, (v4i32)acc_d);
+  acc_d = __msa_hadd_u_d(acc_w, acc_w);
+  acc_w = (v4u32)__msa_srari_w((v4i32)acc_d, 4);
+  fill = (v16u8)__msa_splati_b((v16i8)acc_w, 0);
+  ST_UB8(fill, fill, fill, fill, fill, fill, fill, fill, dst, dst_stride);
+  dst += (8 * dst_stride);
+  ST_UB8(fill, fill, fill, fill, fill, fill, fill, fill, dst, dst_stride);
+}
+
+static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
+  // Constant-DC 16x16: an __msa_ldi_b(128) vector, eight rows per ST_UB8.
+  const v16u8 fill = (v16u8)__msa_ldi_b(128);
+  ST_UB8(fill, fill, fill, fill, fill, fill, fill, fill, dst, dst_stride);
+  dst += (8 * dst_stride);
+  ST_UB8(fill, fill, fill, fill, fill, fill, fill, fill, dst, dst_stride);
+}
+
+static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  // DC 32x32: pairwise sums of the two top and two left vectors are added,
+  // reduced to one total, round-shifted by 6 and splatted. The resulting
+  // vector goes to ST_UB2 (16-byte step) once per row for 32 rows.
+  uint32_t rows_left;
+  v16u8 top0, top1, left0, left1, fill;
+  v8u16 acc_h, acc_top0, acc_top1, acc_left0, acc_left1;
+  v4u32 acc_w;
+  v2u64 acc_d;
+
+  LD_UB2(src_top, 16, top0, top1);
+  LD_UB2(src_left, 16, left0, left1);
+  HADD_UB2_UH(top0, top1, acc_top0, acc_top1);
+  HADD_UB2_UH(left0, left1, acc_left0, acc_left1);
+  acc_h = acc_top0 + acc_top1 + acc_left0 + acc_left1;
+  acc_w = __msa_hadd_u_w(acc_h, acc_h);
+  acc_d = __msa_hadd_u_d(acc_w, acc_w);
+  acc_w = (v4u32)__msa_pckev_w((v4i32)acc_d, (v4i32)acc_d);
+  acc_d = __msa_hadd_u_d(acc_w, acc_w);
+  acc_w = (v4u32)__msa_srari_w((v4i32)acc_d, 6);
+  fill = (v16u8)__msa_splati_b((v16i8)acc_w, 0);
+
+  for (rows_left = 32; rows_left--;) {
+    ST_UB2(fill, fill, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  // One-edge DC 32x32: the 32 bytes at src are summed by the hadd chain,
+  // round-shifted by 5, splatted, and stored via ST_UB2 on 32 rows.
+  uint32_t rows_left;
+  v16u8 edge0, edge1, fill;
+  v8u16 acc_h, acc0, acc1;
+  v4u32 acc_w;
+  v2u64 acc_d;
+
+  LD_UB2(src, 16, edge0, edge1);
+  HADD_UB2_UH(edge0, edge1, acc0, acc1);
+  acc_h = acc0 + acc1;
+  acc_w = __msa_hadd_u_w(acc_h, acc_h);
+  acc_d = __msa_hadd_u_d(acc_w, acc_w);
+  acc_w = (v4u32)__msa_pckev_w((v4i32)acc_d, (v4i32)acc_d);
+  acc_d = __msa_hadd_u_d(acc_w, acc_w);
+  acc_w = (v4u32)__msa_srari_w((v4i32)acc_d, 5);
+  fill = (v16u8)__msa_splati_b((v16i8)acc_w, 0);
+
+  for (rows_left = 32; rows_left--;) {
+    ST_UB2(fill, fill, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
+  // Constant-DC 32x32: the __msa_ldi_b(128) vector is given to ST_UB2 twice
+  // (16-byte step) on each of 32 rows.
+  const v16u8 fill = (v16u8)__msa_ldi_b(128);
+  uint32_t rows_left = 32;
+
+  while (rows_left--) {
+    ST_UB2(fill, fill, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  // TM 4x4: per lane, left[r] + top[c] - top_left. Byte interleave plus
+  // pairwise add forms left + top as u16, a saturating subtract removes
+  // top_left, then SAT_UH4_UH(..., 7) and a pack feed the 4x4 store.
+  uint32_t top_word = LW(src_top_ptr);
+  v16i8 top_vec = { 0 }, left0, left1, left2, left3, packed01, packed23;
+  v16u8 pair0, pair1, pair2, pair3;
+  v8u16 corner = (v8u16)__msa_fill_h(src_top_ptr[-1]);
+  v8u16 sum0, sum1, sum2, sum3;
+
+  top_vec = (v16i8)__msa_insert_w((v4i32)top_vec, 0, top_word);
+  left0 = __msa_fill_b(src_left[0]);
+  left1 = __msa_fill_b(src_left[1]);
+  left2 = __msa_fill_b(src_left[2]);
+  left3 = __msa_fill_b(src_left[3]);
+
+  ILVR_B4_UB(left0, top_vec, left1, top_vec, left2, top_vec, left3, top_vec,
+             pair0, pair1, pair2, pair3);
+  HADD_UB4_UH(pair0, pair1, pair2, pair3, sum0, sum1, sum2, sum3);
+  IPRED_SUBS_UH2_UH(corner, corner, sum0, sum1);
+  IPRED_SUBS_UH2_UH(corner, corner, sum2, sum3);
+  SAT_UH4_UH(sum0, sum1, sum2, sum3, 7);
+  PCKEV_B2_SB(sum1, sum0, sum3, sum2, packed01, packed23);
+  ST4x4_UB(packed01, packed23, 0, 2, 0, 2, dst, dst_stride);
+}
+
+static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  // TM 8x8: per lane, left[r] + top[c] - top_left (saturating subtract),
+  // then SAT_UH4_UH(..., 7) and pack; four rows per pass, two passes.
+  uint64_t top_dword = LD(src_top_ptr);
+  uint32_t groups_left = 2;
+  v16i8 top_vec = { 0 }, left0, left1, left2, left3, packed01, packed23;
+  v16u8 pair0, pair1, pair2, pair3;
+  v8u16 corner = (v8u16)__msa_fill_h(src_top_ptr[-1]);
+  v8u16 sum0, sum1, sum2, sum3;
+
+  top_vec = (v16i8)__msa_insert_d((v2i64)top_vec, 0, top_dword);
+
+  while (groups_left--) {
+    left0 = __msa_fill_b(src_left[0]);
+    left1 = __msa_fill_b(src_left[1]);
+    left2 = __msa_fill_b(src_left[2]);
+    left3 = __msa_fill_b(src_left[3]);
+    src_left += 4;
+
+    ILVR_B4_UB(left0, top_vec, left1, top_vec, left2, top_vec, left3, top_vec,
+               pair0, pair1, pair2, pair3);
+    HADD_UB4_UH(pair0, pair1, pair2, pair3, sum0, sum1, sum2, sum3);
+    IPRED_SUBS_UH2_UH(corner, corner, sum0, sum1);
+    IPRED_SUBS_UH2_UH(corner, corner, sum2, sum3);
+    SAT_UH4_UH(sum0, sum1, sum2, sum3, 7);
+    PCKEV_B2_SB(sum1, sum0, sum3, sum2, packed01, packed23);
+    ST8x4_UB(packed01, packed23, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  // TM 16x16: every output row is left[r] + top[c] - top_left per lane,
+  // formed by byte interleave + pairwise add, then a saturating subtract,
+  // SAT_UH2_UH(..., 7) and a pack/store. Four rows per loop pass.
+  const v16i8 top_vec = LD_SB(src_top_ptr);
+  const v8u16 corner = (v8u16)__msa_fill_h(src_top_ptr[-1]);
+  uint32_t groups_left = 4;
+  v16i8 left0, left1, left2, left3;
+  v8u16 part_r, part_l;
+
+  while (groups_left--) {
+    left0 = __msa_fill_b(src_left[0]);
+    left1 = __msa_fill_b(src_left[1]);
+    left2 = __msa_fill_b(src_left[2]);
+    left3 = __msa_fill_b(src_left[3]);
+    src_left += 4;
+    // Row 0 of this group.
+    ILVRL_B2_UH(left0, top_vec, part_r, part_l);
+    HADD_UB2_UH(part_r, part_l, part_r, part_l);
+    IPRED_SUBS_UH2_UH(corner, corner, part_r, part_l);
+    SAT_UH2_UH(part_r, part_l, 7);
+    PCKEV_ST_SB(part_r, part_l, dst);
+    dst += dst_stride;
+    // Row 1 of this group.
+    ILVRL_B2_UH(left1, top_vec, part_r, part_l);
+    HADD_UB2_UH(part_r, part_l, part_r, part_l);
+    IPRED_SUBS_UH2_UH(corner, corner, part_r, part_l);
+    SAT_UH2_UH(part_r, part_l, 7);
+    PCKEV_ST_SB(part_r, part_l, dst);
+    dst += dst_stride;
+    // Row 2 of this group.
+    ILVRL_B2_UH(left2, top_vec, part_r, part_l);
+    HADD_UB2_UH(part_r, part_l, part_r, part_l);
+    IPRED_SUBS_UH2_UH(corner, corner, part_r, part_l);
+    SAT_UH2_UH(part_r, part_l, 7);
+    PCKEV_ST_SB(part_r, part_l, dst);
+    dst += dst_stride;
+    // Row 3 of this group.
+    ILVRL_B2_UH(left3, top_vec, part_r, part_l);
+    HADD_UB2_UH(part_r, part_l, part_r, part_l);
+    IPRED_SUBS_UH2_UH(corner, corner, part_r, part_l);
+    SAT_UH2_UH(part_r, part_l, 7);
+    PCKEV_ST_SB(part_r, part_l, dst);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  // TM 32x32: per lane, left[r] + top[c] - top_left over two 16-byte top
+  // vectors; each row is packed and stored at dst and dst + 16.
+  const v8u16 corner = (v8u16)__msa_fill_h(src_top[-1]);
+  uint32_t groups_left = 8;
+  v16i8 top0, top1, left0, left1, left2, left3;
+  v8u16 r0, l0, r1, l1;
+
+  LD_SB2(src_top, 16, top0, top1);
+  while (groups_left--) {
+    left0 = __msa_fill_b(src_left[0]);
+    left1 = __msa_fill_b(src_left[1]);
+    left2 = __msa_fill_b(src_left[2]);
+    left3 = __msa_fill_b(src_left[3]);
+    src_left += 4;
+    // Row 0 of this group.
+    ILVR_B2_UH(left0, top0, left0, top1, r0, r1);
+    ILVL_B2_UH(left0, top0, left0, top1, l0, l1);
+    HADD_UB4_UH(r0, l0, r1, l1, r0, l0, r1, l1);
+    IPRED_SUBS_UH2_UH(corner, corner, r0, l0);
+    IPRED_SUBS_UH2_UH(corner, corner, r1, l1);
+    SAT_UH4_UH(r0, l0, r1, l1, 7);
+    PCKEV_ST_SB(r0, l0, dst);
+    PCKEV_ST_SB(r1, l1, dst + 16);
+    dst += dst_stride;
+    // Row 1 of this group.
+    ILVR_B2_UH(left1, top0, left1, top1, r0, r1);
+    ILVL_B2_UH(left1, top0, left1, top1, l0, l1);
+    HADD_UB4_UH(r0, l0, r1, l1, r0, l0, r1, l1);
+    IPRED_SUBS_UH2_UH(corner, corner, r0, l0);
+    IPRED_SUBS_UH2_UH(corner, corner, r1, l1);
+    SAT_UH4_UH(r0, l0, r1, l1, 7);
+    PCKEV_ST_SB(r0, l0, dst);
+    PCKEV_ST_SB(r1, l1, dst + 16);
+    dst += dst_stride;
+    // Row 2 of this group.
+    ILVR_B2_UH(left2, top0, left2, top1, r0, r1);
+    ILVL_B2_UH(left2, top0, left2, top1, l0, l1);
+    HADD_UB4_UH(r0, l0, r1, l1, r0, l0, r1, l1);
+    IPRED_SUBS_UH2_UH(corner, corner, r0, l0);
+    IPRED_SUBS_UH2_UH(corner, corner, r1, l1);
+    SAT_UH4_UH(r0, l0, r1, l1, 7);
+    PCKEV_ST_SB(r0, l0, dst);
+    PCKEV_ST_SB(r1, l1, dst + 16);
+    dst += dst_stride;
+    // Row 3 of this group.
+    ILVR_B2_UH(left3, top0, left3, top1, r0, r1);
+    ILVL_B2_UH(left3, top0, left3, top1, l0, l1);
+    HADD_UB4_UH(r0, l0, r1, l1, r0, l0, r1, l1);
+    IPRED_SUBS_UH2_UH(corner, corner, r0, l0);
+    IPRED_SUBS_UH2_UH(corner, corner, r1, l1);
+    SAT_UH4_UH(r0, l0, r1, l1, 7);
+    PCKEV_ST_SB(r0, l0, dst);
+    PCKEV_ST_SB(r1, l1, dst + 16);
+    dst += dst_stride;
+  }
+}
+
+void vpx_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused: only the row above is consulted
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_vert_4x4_msa(above, dst, (int32_t)y_stride);
+}
+
+void vpx_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused: only the row above is consulted
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_vert_8x8_msa(above, dst, (int32_t)y_stride);
+}
+
+void vpx_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused: only the row above is consulted
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_vert_16x16_msa(above, dst, (int32_t)y_stride);
+}
+
+void vpx_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused: only the row above is consulted
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_vert_32x32_msa(above, dst, (int32_t)y_stride);
+}
+
+void vpx_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: only the left column is consulted
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_horiz_4x4_msa(left, dst, (int32_t)y_stride);
+}
+
+void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: only the left column is consulted
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_horiz_8x8_msa(left, dst, (int32_t)y_stride);
+}
+
+void vpx_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: only the left column is consulted
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_horiz_16x16_msa(left, dst, (int32_t)y_stride);
+}
+
+void vpx_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: only the left column is consulted
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_horiz_32x32_msa(left, dst, (int32_t)y_stride);
+}
+
+void vpx_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_4x4_msa(above, left, dst, (int32_t)y_stride);  // delegate
+}
+
+void vpx_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_8x8_msa(above, left, dst, (int32_t)y_stride);  // delegate
+}
+
+void vpx_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_16x16_msa(above, left, dst, (int32_t)y_stride);  // delegate
+}
+
+void vpx_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_32x32_msa(above, left, dst, (int32_t)y_stride);  // delegate
+}
+
+void vpx_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused: the one-edge DC helper is fed the row above
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_dc_tl_4x4_msa(above, dst, (int32_t)y_stride);
+}
+
+void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused: the one-edge DC helper is fed the row above
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_dc_tl_8x8_msa(above, dst, (int32_t)y_stride);
+}
+
+void vpx_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused: the one-edge DC helper is fed the row above
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_dc_tl_16x16_msa(above, dst, (int32_t)y_stride);
+}
+
+void vpx_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused: the one-edge DC helper is fed the row above
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_dc_tl_32x32_msa(above, dst, (int32_t)y_stride);
+}
+
+void vpx_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: the one-edge DC helper is fed the left column
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_dc_tl_4x4_msa(left, dst, (int32_t)y_stride);
+}
+
+void vpx_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: the one-edge DC helper is fed the left column
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_dc_tl_8x8_msa(left, dst, (int32_t)y_stride);
+}
+
+void vpx_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;  // unused: the one-edge DC helper is fed the left column
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_dc_tl_16x16_msa(left, dst, (int32_t)y_stride);
+}
+
+void vpx_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;  // unused: the one-edge DC helper is fed the left column
+  // ptrdiff_t stride is narrowed to the helper's int32_t parameter.
+  intra_predict_dc_tl_32x32_msa(left, dst, (int32_t)y_stride);
+}
+
+void vpx_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  // Neither neighbour array is read; the helper writes a constant fill.
+  (void)left;
+  (void)above;
+  intra_predict_128dc_4x4_msa(dst, (int32_t)y_stride);
+}
+
+void vpx_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  // Neither neighbour array is read; the helper writes a constant fill.
+  (void)left;
+  (void)above;
+  intra_predict_128dc_8x8_msa(dst, (int32_t)y_stride);
+}
+
+void vpx_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  // Neither neighbour array is read; the helper writes a constant fill.
+  (void)left;
+  (void)above;
+  intra_predict_128dc_16x16_msa(dst, (int32_t)y_stride);
+}
+
+void vpx_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  // Neither neighbour array is read; the helper writes a constant fill.
+  (void)left;
+  (void)above;
+  intra_predict_128dc_32x32_msa(dst, (int32_t)y_stride);
+}
+
+void vpx_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_4x4_msa(above, left, dst, (int32_t)y_stride);  // delegate
+}
+
+void vpx_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_8x8_msa(above, left, dst, (int32_t)y_stride);  // delegate
+}
+
+void vpx_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_16x16_msa(above, left, dst, (int32_t)y_stride);  // delegate
+}
+
+void vpx_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_32x32_msa(above, left, dst, (int32_t)y_stride);  // delegate
+}
diff --git a/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h b/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h
new file mode 100644
index 0000000000..abd8509118
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
+#define VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                    ({   \
+  /* Evaluates to dct_const_round_shift(x * cospi_16_64) applied twice. */     \
+  int32_t tmp, out;                                                            \
+  int     dct_cost_rounding = DCT_CONST_ROUNDING;                              \
+  int     in = input;                                                          \
+  /* NOTE(review): $ac1/$ac2 are written; no asm clobbers are declared. */     \
+  __asm__ __volatile__ (                                                       \
+      /* out = dct_const_round_shift(input_dc * cospi_16_64); */               \
+      "mtlo     %[dct_cost_rounding],   $ac1                              \n\t"\
+      "mthi     $zero,                  $ac1                              \n\t"\
+      "madd     $ac1,                   %[in],            %[cospi_16_64]  \n\t"\
+      "extp     %[tmp],                 $ac1,             31              \n\t"\
+                                                                               \
+      /* out = dct_const_round_shift(out * cospi_16_64); */                    \
+      "mtlo     %[dct_cost_rounding],   $ac2                              \n\t"\
+      "mthi     $zero,                  $ac2                              \n\t"\
+      "madd     $ac2,                   %[tmp],           %[cospi_16_64]  \n\t"\
+      "extp     %[out],                 $ac2,             31              \n\t"\
+                                                                               \
+      : [tmp] "=&r" (tmp), [out] "=r" (out)                                    \
+      : [in] "r" (in),                                                         \
+        [dct_cost_rounding] "r" (dct_cost_rounding),                           \
+        [cospi_16_64] "r" (cospi_16_64)                                        \
+   );                                                                          \
+  out;                                                                    })
+
+void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                   int dest_stride);
+void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output);
+void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                     int dest_stride);
+void iadst4_dspr2(const int16_t *input, int16_t *output);  // no vpx_ prefix
+void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                 int dest_stride);  // no vpx_ prefix
+void iadst8_dspr2(const int16_t *input, int16_t *output);  // no vpx_ prefix
+void idct16_rows_dspr2(const int16_t *input, int16_t *output,
+                       uint32_t no_rows);  // no vpx_ prefix
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                               int dest_stride);  // no vpx_ prefix
+void iadst16_dspr2(const int16_t *input, int16_t *output);  // no vpx_ prefix
+
+#endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
diff --git a/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h b/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h
new file mode 100644
index 0000000000..1458561a61
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h
@@ -0,0 +1,410 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_INV_TXFM_MSA_H_
+#define VPX_DSP_MIPS_INV_TXFM_MSA_H_
+
+#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/mips/txfm_macros_msa.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7,               \
+                  out0, out1, out2, out3, out4, out5, out6, out7) {     \
+  v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                    \
+  v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                     \
+  v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64,  \
+    cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 };               \
+  v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64,              \
+    -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 };                    \
+  /* in0..in7 -> out0..out7. Constants: coeff0_m lanes 0,7 and 4,3. */  \
+  SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                       \
+  cnst2_m = -cnst0_m;                                                   \
+  ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
+  SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                       \
+  cnst4_m = -cnst2_m;                                                   \
+  ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
+  /* Pairs in0/in7, in4/in3; outputs presumably in7, in0, in4, in3. */  \
+  ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                \
+  ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,        \
+                        cnst1_m, cnst2_m, cnst3_m, in7, in0,            \
+                        in4, in3);                                      \
+  /* Same pattern with coeff0_m lanes 2,5 and 6,1. */                   \
+  SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                       \
+  cnst2_m = -cnst0_m;                                                   \
+  ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);    \
+  SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                       \
+  cnst4_m = -cnst2_m;                                                   \
+  ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);    \
+  /* Pairs in2/in5 and in6/in1. */                                      \
+  ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
+  ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
+                                                                        \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,        \
+                        cnst1_m, cnst2_m, cnst3_m, in5, in2,            \
+                        in6, in1);                                      \
+  BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                \
+  out7 = -s0_m;                                                         \
+  out0 = s1_m;                                                          \
+  /* out0, out7 are final here. Next: coeff1_m lanes 0,4,1,5. */        \
+  SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5,                                    \
+               cnst0_m, cnst1_m, cnst2_m, cnst3_m);                     \
+                                                                        \
+  ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);    \
+  cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
+  cnst1_m = cnst0_m;                                                    \
+                                                                        \
+  ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                \
+  ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                \
+  DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m,        \
+                        cnst2_m, cnst3_m, cnst1_m, out1, out6,          \
+                        s0_m, s1_m);                                    \
+  /* coeff1_m lanes 2,3 (cospi_16_64, -cospi_16_64) for out2..out5. */  \
+  SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                       \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                            \
+                                                                        \
+  ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                \
+  ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                              \
+  out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                \
+  out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                \
+  out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);                \
+  out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);                \
+  /* Sign fix-up: negate out1, out3, out5 (out7 was negated above). */  \
+  out1 = -out1;                                                         \
+  out3 = -out3;                                                         \
+  out5 = -out5;                                                         \
+}
+
+#define VP9_SET_COSPI_PAIR(c0_h, c1_h) ({  \
+  v8i16 out0_m, r0_m, r1_m;                \
+  /* Fill one vector per constant, then interleave even lanes. */ \
+  r0_m = __msa_fill_h(c0_h);               \
+  r1_m = __msa_fill_h(c1_h);               \
+  out0_m = __msa_ilvev_h(r1_m, r0_m);      \
+  /* Value of the statement expression: the v8i16 pair vector. */ \
+  out0_m;                                  \
+})
+
+#define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) {  \
+  uint8_t *dst_m = (uint8_t *) (dst);                               \
+  v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                             \
+  v16i8 tmp0_m, tmp1_m;                                             \
+  v16i8 zero_m = { 0 };                                             \
+  v8i16 res0_m, res1_m, res2_m, res3_m;                             \
+  /* By helper names: load 4 rows, add in0..in3, clip, store. */    \
+  LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m);        \
+  ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m,        \
+             zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m);       \
+  ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3,          \
+       res0_m, res1_m, res2_m, res3_m);                             \
+  CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m);                   \
+  PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m);      \
+  ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride);                      \
+}
+
+#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) {   \
+  v8i16 c0_m, c1_m, c2_m, c3_m;                                     \
+  v8i16 step0_m, step1_m;                                           \
+  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
+  /* Inputs in0/in2 with cospi_16_64 pairs -> tmp0_m, tmp1_m. */    \
+  c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);              \
+  c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);             \
+  step0_m = __msa_ilvr_h(in2, in0);                                 \
+  DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m);        \
+  /* Inputs in1/in3 with cospi_24/8 pairs -> tmp2_m, tmp3_m. */     \
+  c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);              \
+  c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);               \
+  step1_m = __msa_ilvr_h(in3, in1);                                 \
+  DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m);        \
+  SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);      \
+  /* Pack to halfwords, then butterfly into out0..out3. */          \
+  PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m);      \
+  SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8);                  \
+  BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m,                         \
+              (v8i16)tmp2_m, (v8i16)tmp3_m,                         \
+              out0, out1, out2, out3);                              \
+}
+
+#define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v8i16 res0_m, res1_m, c0_m, c1_m;                                 \
+  v8i16 k1_m, k2_m, k3_m, k4_m;                                     \
+  v8i16 zero_m = { 0 };                                             \
+  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                             \
+  v4i32 int0_m, int1_m, int2_m, int3_m;                             \
+  v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9,                 \
+    sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9,                  \
+    -sinpi_4_9 };                                                   \
+  /* int0_m: mask_m lanes 3,0,1,2 against in0/in2 and in1/in3. */   \
+  SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m);         \
+  ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m);                  \
+  ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
+  DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m);          \
+  int0_m = tmp2_m + tmp1_m;                                         \
+  /* int1_m: adds lanes 4 and 7 (-sinpi_1_9, -sinpi_4_9). */        \
+  SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m);                           \
+  ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m);                  \
+  DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
+  int1_m = tmp0_m + tmp1_m;                                         \
+  /* int2_m: lane 6 (-sinpi_3_9) combined with k2_m and zero_m. */  \
+  c0_m = __msa_splati_h(mask_m, 6);                                 \
+  ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m);                 \
+  ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                   \
+  DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);          \
+  int2_m = tmp0_m + tmp1_m;                                         \
+  /* int3_m: the earlier tmp2_m plus three further dot products. */ \
+  c0_m = __msa_splati_h(mask_m, 6);                                 \
+  c0_m = __msa_ilvev_h(c0_m, k1_m);                                 \
+                                                                    \
+  res0_m = __msa_ilvr_h((in1), (in3));                              \
+  tmp0_m = __msa_dotp_s_w(res0_m, c0_m);                            \
+  int3_m = tmp2_m + tmp0_m;                                         \
+                                                                    \
+  res0_m = __msa_ilvr_h((in2), (in3));                              \
+  c1_m = __msa_ilvev_h(k4_m, k3_m);                                 \
+                                                                    \
+  tmp2_m = __msa_dotp_s_w(res0_m, c1_m);                            \
+  res1_m = __msa_ilvr_h((in0), (in2));                              \
+  c1_m = __msa_ilvev_h(k1_m, zero_m);                               \
+                                                                    \
+  tmp3_m = __msa_dotp_s_w(res1_m, c1_m);                            \
+  int3_m += tmp2_m;                                                 \
+  int3_m += tmp3_m;                                                 \
+  /* SRARI by DCT_CONST_BITS, then pack into out0..out3. */         \
+  SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS);      \
+  PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1);          \
+  PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3);          \
+}
+
+#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({  \
+  v8i16 c0_m, c1_m;                                    \
+  /* Presumably splats lanes idx1_h/idx2_h, then interleaves. */ \
+  SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m);    \
+  c0_m = __msa_ilvev_h(c1_m, c0_m);                    \
+  /* Value of the statement expression. */             \
+  c0_m;                                                \
+})
+
+/* multiply and add macro: dot products, SRARI by DCT_CONST_BITS, pack */
+#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,        \
+                 out0, out1, out2, out3) {                              \
+  v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
+  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
+  /* inp0/inp1 with cst0,cst1 -> out0,out1; inp2/inp3 likewise. */      \
+  ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m);                        \
+  ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m);                        \
+  DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m,               \
+              cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+  SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);          \
+  PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1);              \
+  DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m,               \
+              cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+  SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);          \
+  PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3);              \
+}
+
+/* idct 8x8 macro: one 1-D pass over in0..in7 producing out0..out7 */
+#define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,               \
+                       out0, out1, out2, out3, out4, out5, out6, out7) {     \
+  v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m;              \
+  v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m;              \
+  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
+  v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64,        \
+    cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 };                  \
+  /* Odd inputs in1, in7, in3, in5 go through VP9_MADD first. */             \
+  k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5);                                   \
+  k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0);                                   \
+  k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3);                                   \
+  k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2);                                   \
+  VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5);  \
+  SUB2(in1, in3, in7, in5, res0_m, res1_m);                                  \
+  k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7);                                   \
+  k1_m = __msa_splati_h(mask_m, 4);                                          \
+  /* res0_m/res1_m feed tp5_m, tp6_m; in0, in4, in2, in6 follow. */          \
+  ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m);                               \
+  DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m,        \
+              tmp0_m, tmp1_m, tmp2_m, tmp3_m);                               \
+  SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);               \
+  tp4_m = in1 + in3;                                                         \
+  PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m);                 \
+  tp7_m = in7 + in5;                                                         \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                       \
+  k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                        \
+  VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m,                       \
+           in0, in4, in2, in6);                                              \
+  BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m);               \
+  BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m,        \
+              out0, out1, out2, out3, out4, out5, out6, out7);               \
+}
+
+#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                        out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m;                    \
+  v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m;                                \
+  v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1;          \
+  v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64,                  \
+    cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 };    \
+  v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64,                \
+    cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 };      \
+  v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64,                 \
+    -cospi_16_64, 0, 0, 0, 0 };                                            \
+  /* NOTE(review): in1..in5 and in7 appear to be reused as scratch. */     \
+  k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1);                                \
+  k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2);                                \
+  ILVRL_H2_SH(in1, in0, in_s1, in_s0);                                     \
+  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+              r0_m, r1_m, r2_m, r3_m);                                     \
+  k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7);                                \
+  k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1);                                \
+  ILVRL_H2_SH(in5, in4, in_s1, in_s0);                                     \
+  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+              r4_m, r5_m, r6_m, r7_m);                                     \
+  ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+       m0_m, m1_m, m2_m, m3_m);                                            \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m);                     \
+  SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+       m0_m, m1_m, m2_m, m3_m);                                            \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m);                         \
+  k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4);                                \
+  k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5);                                \
+  ILVRL_H2_SH(in3, in2, in_s1, in_s0);                                     \
+  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+              r0_m, r1_m, r2_m, r3_m);                                     \
+  k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3);                                \
+  k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4);                                \
+  ILVRL_H2_SH(in7, in6, in_s1, in_s0);                                     \
+  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+              r4_m, r5_m, r6_m, r7_m);                                     \
+  ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+       m0_m, m1_m, m2_m, m3_m);                                            \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m);                     \
+  SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m,                     \
+       m0_m, m1_m, m2_m, m3_m);                                            \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m);                         \
+  ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m);                                     \
+  BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3);        \
+  k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6);                                \
+  k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7);                                \
+  ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0);                                   \
+  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+              r0_m, r1_m, r2_m, r3_m);                                     \
+  k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1);                                \
+  DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
+              r4_m, r5_m, r6_m, r7_m);                                     \
+  ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
+       m0_m, m1_m, m2_m, m3_m);                                            \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6);                          \
+  SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m,                     \
+       m0_m, m1_m, m2_m, m3_m);                                            \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5);                           \
+  k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2);                                \
+  k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3);                                \
+  ILVRL_H2_SH(in4, in3, in_s1, in_s0);                                     \
+  DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m,          \
+              m0_m, m1_m, m2_m, m3_m);                                     \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4);                          \
+  ILVRL_H2_SW(in5, in2, m2_m, m3_m);                                       \
+  DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m,              \
+              m0_m, m1_m, m2_m, m3_m);                                     \
+  SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                     \
+  PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5);                          \
+  /* out1, out3, out5, out7 are the negated scratch values. */             \
+  out1 = -in1;                                                             \
+  out3 = -in3;                                                             \
+  out5 = -in5;                                                             \
+  out7 = -in7;                                                             \
+}
+
+#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8,        \
+                         r9, r10, r11, r12, r13, r14, r15,          \
+                         out0, out1, out2, out3, out4, out5,        \
+                         out6, out7, out8, out9, out10, out11,      \
+                         out12, out13, out14, out15) {              \
+  v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m;             \
+  v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m;       \
+  v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m;             \
+  v8i16 h8_m, h9_m, h10_m, h11_m;                                   \
+  v8i16 k0_m, k1_m, k2_m, k3_m;                                     \
+  /* MADD_BF and MADD_SHORT are defined outside this header. */     \
+  /* stage 1: r0..r15 with cospi pairs -> g0_m..g15_m */            \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);               \
+  k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);              \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);              \
+  k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);             \
+  MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m,                  \
+          g0_m, g1_m, g2_m, g3_m);                                  \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);               \
+  k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);              \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);              \
+  k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);             \
+  MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m,                 \
+          g4_m, g5_m, g6_m, g7_m);                                  \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);               \
+  k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);              \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);               \
+  k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);              \
+  MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m,                 \
+          g8_m, g9_m, g10_m, g11_m);                                \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);              \
+  k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);             \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);               \
+  k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);              \
+  MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m,                  \
+          g12_m, g13_m, g14_m, g15_m);                              \
+                                                                    \
+  /* stage 2: g*_m -> h0_m..h11_m and out8..out11 */                \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);               \
+  k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);              \
+  k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);              \
+  MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m,          \
+          h0_m, h1_m, h2_m, h3_m);                                  \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);              \
+  k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);             \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);             \
+  MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m,         \
+          h4_m, h5_m, h6_m, h7_m);                                  \
+  BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10);    \
+  BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m,    \
+              h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m);    \
+                                                                    \
+  /* stage 3: out0, out1, out4..out7, out12..out15 */               \
+  BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m);  \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);               \
+  k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);              \
+  k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);              \
+  MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m,           \
+          out4, out6, out5, out7);                                  \
+  MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m,           \
+          out12, out14, out13, out15);                              \
+                                                                    \
+  /* stage 4: cospi_16_64 pairs; out2, out3 and in-place updates */ \
+  k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);              \
+  k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);            \
+  k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);             \
+  k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);             \
+  MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3);                 \
+  MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7);                   \
+  MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11);               \
+  MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15);               \
+}
+/* MSA idct16/iadst16 1-D row and column routines; not defined in this header. */
+void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                      int32_t dst_stride);
+void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output);
+void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                       int32_t dst_stride);
+void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output);
+#endif  // VPX_DSP_MIPS_INV_TXFM_MSA_H_
diff --git a/libs/libvpx/vpx_dsp/mips/itrans16_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans16_dspr2.c
new file mode 100644
index 0000000000..6d41e6190b
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/itrans16_dspr2.c
@@ -0,0 +1,1227 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+/*
+ * Row pass of the 16-point inverse DCT (per the function name), written
+ * with MIPS DSPr2 inline assembly.
+ *
+ * input   - coefficients; each iteration reads 16 int16_t values at byte
+ *           offsets 0..30 and then advances `input` by 16 elements.
+ * output  - results; each iteration stores 16 halfwords at byte offsets
+ *           0, 32, ..., 480 (16 int16_t elements apart) and then advances
+ *           `output` by one element, so input row r lands in output
+ *           column r (the result is written transposed).
+ * no_rows - number of rows (loop iterations) to process.
+ *
+ * Every multiply-accumulate sequence starts from an accumulator preloaded
+ * with const_2_power_13 (8192) in LO and zero in HI, and is read back with
+ * `extp ..., 31`.
+ * NOTE(review): `extp` takes its bit position from the DSPControl `pos`
+ * field, which this function never writes; presumably the caller sets it
+ * (wrdsp) so that each extract acts as a rounded right shift -- confirm.
+ * NOTE(review): none of the asm statements declares a clobber list, even
+ * though they write $ac0-$ac3 and store through `output` -- confirm this
+ * is safe with the toolchains in use.
+ */
+void idct16_rows_dspr2(const int16_t *input, int16_t *output,
+                       uint32_t no_rows) {
+  int i;
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  int step1_10, step1_11, step1_12, step1_13;
+  int step2_0, step2_1, step2_2, step2_3;
+  int step2_8, step2_9, step2_10, step2_11;
+  int step2_12, step2_13, step2_14, step2_15;
+  int load1, load2, load3, load4, load5, load6, load7, load8;
+  int result1, result2, result3, result4;
+  const int const_2_power_13 = 8192;
+
+  /* The test uses the value of i before the decrement, so the body runs
+   * no_rows times. */
+  for (i = no_rows; i--; ) {
+    /* prefetch row */
+    prefetch_load((const uint8_t *)(input + 16));
+
+    /* Even part from in[0], in[8], in[4], in[12] (byte offsets 0, 16, 8, 24):
+     *   step2_0 = (in0 + in8) * cospi_16_64
+     *   step2_1 = (in0 - in8) * cospi_16_64
+     *   step2_2 = in4 * cospi_24_64 - in12 * cospi_8_64
+     *   step2_3 = in4 * cospi_8_64  + in12 * cospi_24_64
+     * (each accumulated on top of 8192 and extracted with extp), followed by
+     *   step1_0 = step2_0 + step2_3,  step1_1 = step2_1 + step2_2,
+     *   step1_2 = step2_1 - step2_2,  step1_3 = step2_0 - step2_3. */
+    __asm__ __volatile__ (
+        "lh       %[load1],              0(%[input])                    \n\t"
+        "lh       %[load2],             16(%[input])                    \n\t"
+        "lh       %[load3],              8(%[input])                    \n\t"
+        "lh       %[load4],             24(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "add      %[result1],           %[load1],       %[load2]        \n\t"
+        "sub      %[result2],           %[load1],       %[load2]        \n\t"
+        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
+        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
+        "extp     %[step2_0],           $ac1,           31              \n\t"
+        "extp     %[step2_1],           $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
+        "extp     %[step2_2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
+        "extp     %[step2_3],           $ac1,           31              \n\t"
+
+        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
+        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
+        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
+        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    /* Odd part, first half, from in[1], in[15], in[9], in[7] (byte offsets
+     * 2, 30, 18, 14):
+     *   result1 = in1 * cospi_30_64 - in15 * cospi_2_64
+     *   result2 = in9 * cospi_14_64 - in7  * cospi_18_64
+     *   result3 = in9 * cospi_18_64 + in7  * cospi_14_64
+     *   result4 = in1 * cospi_2_64  + in15 * cospi_30_64
+     * load5/load6 are then reused as temporaries:
+     *   load5 = result1 - result2,  load6 = result4 - result3
+     *   step2_9  = load6 * cospi_24_64 - load5 * cospi_8_64
+     *   step2_14 = load5 * cospi_24_64 + load6 * cospi_8_64
+     *   step2_8  = result1 + result2,  step2_15 = result4 + result3. */
+    __asm__ __volatile__ (
+        "lh       %[load5],             2(%[input])                     \n\t"
+        "lh       %[load6],             30(%[input])                    \n\t"
+        "lh       %[load7],             18(%[input])                    \n\t"
+        "lh       %[load8],             14(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
+        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
+        "extp     %[result1],           $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
+        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
+        "extp     %[result2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
+        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
+        "extp     %[result3],           $ac1,           31              \n\t"
+
+        "madd     $ac2,                 %[load5],       %[cospi_2_64]   \n\t"
+        "madd     $ac2,                 %[load6],       %[cospi_30_64]  \n\t"
+        "extp     %[result4],           $ac2,           31              \n\t"
+
+        "sub      %[load5],             %[result1],     %[result2]      \n\t"
+        "sub      %[load6],             %[result4],     %[result3]      \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
+        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_9],           $ac1,           31              \n\t"
+        "extp     %[step2_14],          $ac3,           31              \n\t"
+        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
+        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [load7] "=&r" (load7), [load8] "=&r" (load8),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    /* Odd part, second half, from in[5], in[11], in[13], in[3] (byte offsets
+     * 10, 22, 26, 6):
+     *   result1 = in5  * cospi_22_64 - in11 * cospi_10_64
+     *   result2 = in13 * cospi_6_64  - in3  * cospi_26_64
+     *   result3 = in5  * cospi_10_64 + in11 * cospi_22_64
+     *   result4 = in13 * cospi_26_64 + in3  * cospi_6_64
+     * load1/load2 are then reused as temporaries:
+     *   load1 = result2 - result1,  load2 = result4 - result3
+     *   step2_10 = -load1 * cospi_24_64 - load2 * cospi_8_64
+     *   step2_13 =  load2 * cospi_24_64 - load1 * cospi_8_64
+     *   step2_11 = result1 + result2,  step2_12 = result4 + result3. */
+    __asm__ __volatile__ (
+        "lh       %[load1],             10(%[input])                    \n\t"
+        "lh       %[load2],             22(%[input])                    \n\t"
+        "lh       %[load3],             26(%[input])                    \n\t"
+        "lh       %[load4],             6(%[input])                     \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
+        "extp     %[result1],           $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load3],       %[cospi_6_64]   \n\t"
+        "msub     $ac3,                 %[load4],       %[cospi_26_64]  \n\t"
+        "extp     %[result2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_10_64]  \n\t"
+        "madd     $ac1,                 %[load2],       %[cospi_22_64]  \n\t"
+        "extp     %[result3],           $ac1,           31              \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_26_64]  \n\t"
+        "madd     $ac2,                 %[load4],       %[cospi_6_64]   \n\t"
+        "extp     %[result4],           $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[result2],     %[result1]      \n\t"
+        "sub      %[load2],             %[result4],     %[result3]      \n\t"
+
+        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_10],          $ac1,           31              \n\t"
+        "extp     %[step2_13],          $ac3,           31              \n\t"
+        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
+        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    /* Remaining even-index inputs in[2], in[14], in[10], in[6] (byte offsets
+     * 4, 28, 20, 12):
+     *   result1 = in2  * cospi_28_64 - in14 * cospi_4_64
+     *   result2 = in10 * cospi_12_64 - in6  * cospi_20_64
+     *   result3 = in10 * cospi_20_64 + in6  * cospi_12_64
+     *   result4 = in2  * cospi_4_64  + in14 * cospi_28_64
+     *   step1_5 = (result4 - result3 - result1 + result2) * cospi_16_64
+     *   step1_6 = (result1 - result2 - result3 + result4) * cospi_16_64
+     *   step1_4 = result1 + result2,  step1_7 = result4 + result3. */
+    __asm__ __volatile__ (
+        "lh       %[load5],             4(%[input])                     \n\t"
+        "lh       %[load6],             28(%[input])                    \n\t"
+        "lh       %[load7],             20(%[input])                    \n\t"
+        "lh       %[load8],             12(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_28_64]  \n\t"
+        "msub     $ac1,                 %[load6],       %[cospi_4_64]   \n\t"
+        "extp     %[result1],           $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load7],       %[cospi_12_64]  \n\t"
+        "msub     $ac3,                 %[load8],       %[cospi_20_64]  \n\t"
+        "extp     %[result2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load7],       %[cospi_20_64]  \n\t"
+        "madd     $ac1,                 %[load8],       %[cospi_12_64]  \n\t"
+        "extp     %[result3],           $ac1,           31              \n\t"
+
+        "madd     $ac2,                 %[load5],       %[cospi_4_64]   \n\t"
+        "madd     $ac2,                 %[load6],       %[cospi_28_64]  \n\t"
+        "extp     %[result4],           $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load5],             %[result4],     %[result3]      \n\t"
+        "sub      %[load5],             %[load5],       %[result1]      \n\t"
+        "add      %[load5],             %[load5],       %[result2]      \n\t"
+
+        "sub      %[load6],             %[result1],     %[result2]      \n\t"
+        "sub      %[load6],             %[load6],       %[result3]      \n\t"
+        "add      %[load6],             %[load6],       %[result4]      \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_5],           $ac1,           31              \n\t"
+        "extp     %[step1_6],           $ac3,           31              \n\t"
+        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
+        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [load7] "=&r" (load7), [load8] "=&r" (load8),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    /* Register-only stage (no loads or stores), all four scaled by
+     * cospi_16_64 on top of the 8192 preload:
+     *   step1_10 = step2_14 - step2_13 - step2_9  + step2_10
+     *   step1_13 = step2_14 - step2_13 - step2_10 + step2_9
+     *   step1_11 = step2_15 - step2_12 - step2_8  + step2_11
+     *   step1_12 = step2_15 - step2_12 - step2_11 + step2_8 */
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
+
+        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
+
+        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
+
+        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
+
+        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
+
+        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
+
+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_10],          $ac0,           31              \n\t"
+        "extp     %[step1_13],          $ac1,           31              \n\t"
+        "extp     %[step1_11],          $ac2,           31              \n\t"
+        "extp     %[step1_12],          $ac3,           31              \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    /* Final butterflies, part 1: eight of the sixteen results, stored as
+     * halfwords at byte offsets 0, 32, 192, 224, 256, 288, 448 and 480
+     * (output elements 0, 16, 96, 112, 128, 144, 224, 240). load5/load6 are
+     * scratch registers here. */
+    __asm__ __volatile__ (
+        "add      %[load5],             %[step1_0],     %[step1_7]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_12]     \n\t"
+        "add      %[load5],             %[load5],       %[step2_15]     \n\t"
+        "add      %[load6],             %[step1_1],     %[step1_6]      \n\t"
+        "add      %[load6],             %[load6],       %[step2_13]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_14]     \n\t"
+        "sh       %[load5],             0(%[output])                    \n\t"
+        "sh       %[load6],             32(%[output])                   \n\t"
+        "sub      %[load5],             %[step1_1],     %[step1_6]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_9]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
+        "sub      %[load6],             %[step1_0],     %[step1_7]      \n\t"
+        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
+        "add      %[load6],             %[load6],       %[step2_11]     \n\t"
+        "sh       %[load5],             192(%[output])                  \n\t"
+        "sh       %[load6],             224(%[output])                  \n\t"
+        "sub      %[load5],             %[step1_0],     %[step1_7]      \n\t"
+        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
+        "sub      %[load5],             %[load5],       %[step2_11]     \n\t"
+        "sub      %[load6],             %[step1_1],     %[step1_6]      \n\t"
+        "sub      %[load6],             %[load6],       %[step2_9]      \n\t"
+        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
+        "sh       %[load5],             256(%[output])                  \n\t"
+        "sh       %[load6],             288(%[output])                  \n\t"
+        "add      %[load5],             %[step1_1],     %[step1_6]      \n\t"
+        "sub      %[load5],             %[load5],       %[step2_13]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_14]     \n\t"
+        "add      %[load6],             %[step1_0],     %[step1_7]      \n\t"
+        "sub      %[load6],             %[load6],       %[step2_12]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_15]     \n\t"
+        "sh       %[load5],             448(%[output])                  \n\t"
+        "sh       %[load6],             480(%[output])                  \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6)
+        : [output] "r" (output),
+          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+          [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
+          [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
+          [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
+          [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
+    );
+
+    /* Final butterflies, part 2: the other eight results, stored as
+     * halfwords at byte offsets 64, 96, 128, 160, 320, 352, 384 and 416
+     * (output elements 32, 48, 64, 80, 160, 176, 192, 208). */
+    __asm__ __volatile__ (
+        "add      %[load5],             %[step1_2],     %[step1_5]      \n\t"
+        "add      %[load5],             %[load5],       %[step1_13]     \n\t"
+        "add      %[load6],             %[step1_3],     %[step1_4]      \n\t"
+        "add      %[load6],             %[load6],       %[step1_12]     \n\t"
+        "sh       %[load5],             64(%[output])                   \n\t"
+        "sh       %[load6],             96(%[output])                   \n\t"
+        "sub      %[load5],             %[step1_3],     %[step1_4]      \n\t"
+        "add      %[load5],             %[load5],       %[step1_11]     \n\t"
+        "sub      %[load6],             %[step1_2],     %[step1_5]      \n\t"
+        "add      %[load6],             %[load6],       %[step1_10]     \n\t"
+        "sh       %[load5],             128(%[output])                  \n\t"
+        "sh       %[load6],             160(%[output])                  \n\t"
+        "sub      %[load5],             %[step1_2],     %[step1_5]      \n\t"
+        "sub      %[load5],             %[load5],       %[step1_10]     \n\t"
+        "sub      %[load6],             %[step1_3],     %[step1_4]      \n\t"
+        "sub      %[load6],             %[load6],       %[step1_11]     \n\t"
+        "sh       %[load5],             320(%[output])                  \n\t"
+        "sh       %[load6],             352(%[output])                  \n\t"
+        "add      %[load5],             %[step1_3],     %[step1_4]      \n\t"
+        "sub      %[load5],             %[load5],       %[step1_12]     \n\t"
+        "add      %[load6],             %[step1_2],     %[step1_5]      \n\t"
+        "sub      %[load6],             %[load6],       %[step1_13]     \n\t"
+        "sh       %[load5],             384(%[output])                  \n\t"
+        "sh       %[load6],             416(%[output])                  \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6)
+        : [output] "r" (output),
+          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
+    );
+
+    /* Next input row (16 coefficients on); next output column (one element
+     * on), which is what makes the stores above land transposed. */
+    input += 16;
+    output += 1;
+  }
+}
+
+void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                               int dest_stride) {
+  int i;
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  int step1_8, step1_9, step1_10, step1_11;
+  int step1_12, step1_13, step1_14, step1_15;
+  int step2_0, step2_1, step2_2, step2_3;
+  int step2_8, step2_9, step2_10, step2_11;
+  int step2_12, step2_13, step2_14, step2_15;
+  int load1, load2, load3, load4, load5, load6, load7, load8;
+  int result1, result2, result3, result4;
+  const int const_2_power_13 = 8192;
+  uint8_t *dest_pix;
+  uint8_t *cm = vpx_ff_cropTbl;
+
+  /* prefetch vpx_ff_cropTbl */
+  prefetch_load(vpx_ff_cropTbl);
+  prefetch_load(vpx_ff_cropTbl +  32);
+  prefetch_load(vpx_ff_cropTbl +  64);
+  prefetch_load(vpx_ff_cropTbl +  96);
+  prefetch_load(vpx_ff_cropTbl + 128);
+  prefetch_load(vpx_ff_cropTbl + 160);
+  prefetch_load(vpx_ff_cropTbl + 192);
+  prefetch_load(vpx_ff_cropTbl + 224);
+
+  for (i = 0; i < 16; ++i) {
+    dest_pix = (dest + i);
+    __asm__ __volatile__ (
+        "lh       %[load1],              0(%[input])                    \n\t"
+        "lh       %[load2],             16(%[input])                    \n\t"
+        "lh       %[load3],              8(%[input])                    \n\t"
+        "lh       %[load4],             24(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "add      %[result1],           %[load1],       %[load2]        \n\t"
+        "sub      %[result2],           %[load1],       %[load2]        \n\t"
+        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
+        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
+        "extp     %[step2_0],           $ac1,           31              \n\t"
+        "extp     %[step2_1],           $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
+        "extp     %[step2_2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
+        "extp     %[step2_3],           $ac1,           31              \n\t"
+
+        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
+        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
+        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
+        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+          [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load5],             2(%[input])                     \n\t"
+        "lh       %[load6],             30(%[input])                    \n\t"
+        "lh       %[load7],             18(%[input])                    \n\t"
+        "lh       %[load8],             14(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
+        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
+        "extp     %[result1],           $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
+        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
+        "extp     %[result2],           $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
+        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
+        "extp     %[result3],           $ac1,           31              \n\t"
+
+        "madd     $ac2,                 %[load5],        %[cospi_2_64]  \n\t"
+        "madd     $ac2,                 %[load6],        %[cospi_30_64] \n\t"
+        "extp     %[result4],           $ac2,            31             \n\t"
+
+        "sub      %[load5],             %[result1],     %[result2]      \n\t"
+        "sub      %[load6],             %[result4],     %[result3]      \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
+        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_9],           $ac1,           31              \n\t"
+        "extp     %[step2_14],          $ac3,           31              \n\t"
+        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
+        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [load7] "=&r" (load7), [load8] "=&r" (load8),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+          [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             10(%[input])                    \n\t"
+        "lh       %[load2],             22(%[input])                    \n\t"
+        "lh       %[load3],             26(%[input])                    \n\t"
+        "lh       %[load4],             6(%[input])                     \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],    %[cospi_22_64]     \n\t"
+        "msub     $ac1,                 %[load2],    %[cospi_10_64]     \n\t"
+        "extp     %[result1],           $ac1,        31                 \n\t"
+
+        "madd     $ac3,                 %[load3],    %[cospi_6_64]      \n\t"
+        "msub     $ac3,                 %[load4],    %[cospi_26_64]     \n\t"
+        "extp     %[result2],           $ac3,        31                 \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac1,                 %[load1],    %[cospi_10_64]     \n\t"
+        "madd     $ac1,                 %[load2],    %[cospi_22_64]     \n\t"
+        "extp     %[result3],           $ac1,        31                 \n\t"
+
+        "madd     $ac2,                 %[load3],    %[cospi_26_64]     \n\t"
+        "madd     $ac2,                 %[load4],    %[cospi_6_64]      \n\t"
+        "extp     %[result4],           $ac2,        31                 \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[result2],     %[result1]      \n\t"
+        "sub      %[load2],             %[result4],     %[result3]      \n\t"
+
+        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_10],          $ac1,           31              \n\t"
+        "extp     %[step2_13],          $ac3,           31              \n\t"
+        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
+        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load5],             4(%[input])                   \n\t"
+        "lh       %[load6],             28(%[input])                  \n\t"
+        "lh       %[load7],             20(%[input])                  \n\t"
+        "lh       %[load8],             12(%[input])                  \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
+        "mthi     $zero,                $ac1                          \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                          \n\t"
+        "mthi     $zero,                $ac3                          \n\t"
+
+        "madd     $ac1,                 %[load5],    %[cospi_28_64]   \n\t"
+        "msub     $ac1,                 %[load6],    %[cospi_4_64]    \n\t"
+        "extp     %[result1],           $ac1,        31               \n\t"
+
+        "madd     $ac3,                 %[load7],    %[cospi_12_64]   \n\t"
+        "msub     $ac3,                 %[load8],    %[cospi_20_64]   \n\t"
+        "extp     %[result2],           $ac3,        31               \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
+        "mthi     $zero,                $ac1                          \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                          \n\t"
+        "mthi     $zero,                $ac2                          \n\t"
+
+        "madd     $ac1,                 %[load7],    %[cospi_20_64]   \n\t"
+        "madd     $ac1,                 %[load8],    %[cospi_12_64]   \n\t"
+        "extp     %[result3],           $ac1,        31               \n\t"
+
+        "madd     $ac2,                 %[load5],    %[cospi_4_64]    \n\t"
+        "madd     $ac2,                 %[load6],    %[cospi_28_64]   \n\t"
+        "extp     %[result4],           $ac2,        31               \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load5],             %[result4],     %[result3]      \n\t"
+        "sub      %[load5],             %[load5],       %[result1]      \n\t"
+        "add      %[load5],             %[load5],       %[result2]      \n\t"
+
+        "sub      %[load6],             %[result1],     %[result2]      \n\t"
+        "sub      %[load6],             %[load6],       %[result3]      \n\t"
+        "add      %[load6],             %[load6],       %[result4]      \n\t"
+
+        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_5],           $ac1,           31              \n\t"
+        "extp     %[step1_6],           $ac3,           31              \n\t"
+
+        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
+        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [load7] "=&r" (load7), [load8] "=&r" (load8),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [result3] "=&r" (result3), [result4] "=&r" (result4),
+          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
+
+        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
+
+        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
+
+        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
+        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
+
+        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
+
+        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
+        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
+
+        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_10],          $ac0,           31              \n\t"
+        "extp     %[step1_13],          $ac1,           31              \n\t"
+        "extp     %[step1_11],          $ac2,           31              \n\t"
+        "extp     %[step1_12],          $ac3,           31              \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6),
+          [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+          [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+          [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+          [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    step1_8 = step2_8 + step2_11;
+    step1_9 = step2_9 + step2_10;
+    step1_14 = step2_13 + step2_14;
+    step1_15 = step2_12 + step2_15;
+
+    __asm__ __volatile__ (
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[step1_0],         %[step1_7]      \n\t"
+        "add      %[load5],         %[load5],           %[step1_15]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "add      %[load6],         %[step1_1],         %[step1_6]      \n\t"
+        "add      %[load6],         %[load6],           %[step1_14]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[step1_2],         %[step1_5]      \n\t"
+        "add      %[load5],         %[load5],           %[step1_13]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "add      %[load6],         %[step1_3],         %[step1_4]      \n\t"
+        "add      %[load6],         %[load6],           %[step1_12]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "sub      %[load5],         %[step1_3],         %[step1_4]      \n\t"
+        "add      %[load5],         %[load5],           %[step1_11]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "sub      %[load6],         %[step1_2],         %[step1_5]      \n\t"
+        "add      %[load6],         %[load6],           %[step1_10]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "sub      %[load5],         %[step1_1],         %[step1_6]      \n\t"
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[load5],           %[step1_9]      \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "sub      %[load6],         %[step1_0],         %[step1_7]      \n\t"
+        "add      %[load6],         %[load6],           %[step1_8]      \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "sub      %[load5],         %[step1_0],         %[step1_7]      \n\t"
+        "sub      %[load5],         %[load5],           %[step1_8]      \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "sub      %[load6],         %[step1_1],         %[step1_6]      \n\t"
+        "sub      %[load6],         %[load6],           %[step1_9]      \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "sub      %[load5],         %[step1_2],         %[step1_5]      \n\t"
+        "sub      %[load5],         %[load5],           %[step1_10]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "sub      %[load6],         %[step1_3],         %[step1_4]      \n\t"
+        "sub      %[load6],         %[load6],           %[step1_11]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[step1_3],         %[step1_4]      \n\t"
+        "sub      %[load5],         %[load5],           %[step1_12]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "add      %[load6],         %[step1_2],         %[step1_5]      \n\t"
+        "sub      %[load6],         %[load6],           %[step1_13]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
+        "add      %[load5],         %[step1_1],         %[step1_6]      \n\t"
+        "sub      %[load5],         %[load5],           %[step1_14]     \n\t"
+        "addi     %[load5],         %[load5],           32              \n\t"
+        "sra      %[load5],         %[load5],           6               \n\t"
+        "add      %[load7],         %[load7],           %[load5]        \n\t"
+        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
+        "add      %[load6],         %[step1_0],         %[step1_7]      \n\t"
+        "sub      %[load6],         %[load6],           %[step1_15]     \n\t"
+        "sb       %[load5],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
+        "addi     %[load6],         %[load6],           32              \n\t"
+        "sra      %[load6],         %[load6],           6               \n\t"
+        "add      %[load8],         %[load8],           %[load6]        \n\t"
+        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
+        "sb       %[load6],         0(%[dest_pix])                      \n\t"
+
+        : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),
+          [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)
+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+          [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
+          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
+          [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
+    );
+
+    input += 16;
+  }
+}
+
+void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+                                 int dest_stride) {
+  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);  /* row pass -> column pass */
+  uint32_t pos = 45;
+  /* 16x16 inverse transform + add to dest; the row pass covers all 16 rows. */
+  /* bit position for extract from acc (wrdsp, mask 1; presumably for extp) */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows
+  idct16_rows_dspr2(input, out, 16);
+
+  // Then transform columns and add to dest
+  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
+                                int dest_stride) {
+  DECLARE_ALIGNED(32, int16_t,  out[16 * 16]);
+  int16_t *outptr = out;
+  uint32_t i;
+  uint32_t pos = 45;
+  /* Sparse variant: row pass on 4 rows only, rest of out is zero-filled. */
+  /* bit position for extract from acc (wrdsp, mask 1; presumably for extp) */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows. Since all non-zero dct coefficients are in
+  // upper-left 4x4 area, we only need to calculate first 4 rows here.
+  idct16_rows_dspr2(input, outptr, 4);
+  // Zero int16 columns 4..15 of out, two columns (one 32-bit word) per pass.
+  outptr += 4;  // skip columns 0..3 (presumably filled by the row pass)
+  for (i = 0; i < 6; ++i) {
+    __asm__ __volatile__ (
+        "sw     $zero,    0(%[outptr])     \n\t"
+        "sw     $zero,   32(%[outptr])     \n\t"
+        "sw     $zero,   64(%[outptr])     \n\t"
+        "sw     $zero,   96(%[outptr])     \n\t"
+        "sw     $zero,  128(%[outptr])     \n\t"
+        "sw     $zero,  160(%[outptr])     \n\t"
+        "sw     $zero,  192(%[outptr])     \n\t"
+        "sw     $zero,  224(%[outptr])     \n\t"
+        "sw     $zero,  256(%[outptr])     \n\t"
+        "sw     $zero,  288(%[outptr])     \n\t"
+        "sw     $zero,  320(%[outptr])     \n\t"
+        "sw     $zero,  352(%[outptr])     \n\t"
+        "sw     $zero,  384(%[outptr])     \n\t"
+        "sw     $zero,  416(%[outptr])     \n\t"
+        "sw     $zero,  448(%[outptr])     \n\t"
+        "sw     $zero,  480(%[outptr])     \n\t"
+
+        : /* no outputs */
+        : [outptr] "r" (outptr)
+        : "memory"  /* the sw stores modify out[] behind the compiler */
+    );
+    outptr += 2;
+  }
+
+  // Then transform columns
+  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                               int dest_stride) {
+  uint32_t pos = 45;
+  int32_t out;
+  int32_t r;
+  int32_t a1, absa1;
+  int32_t vector_a1;
+  int32_t t1, t2, t3, t4;
+  int32_t vector_1, vector_2, vector_3, vector_4;
+  /* DC-only: one offset a1 from input[0] is applied to 16 rows of dest. */
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+  __asm__ __volatile__ (
+      "addi     %[out],     %[out],     32      \n\t"
+      "sra      %[a1],      %[out],     6       \n\t"
+
+      : [out] "+r" (out), [a1] "=r" (a1)
+      :
+  );
+  /* replv.qb replicates only the low byte, so |a1| must fit in 8 bits. */
+  a1 = a1 > 255 ? 255 : (a1 < -255 ? -255 : a1);  /* exact: ops saturate */
+  if (a1 < 0) {
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    __asm__ __volatile__ (
+        "abs        %[absa1],       %[a1]       \n\t"
+        "replv.qb   %[vector_a1],   %[absa1]    \n\t"
+        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 16; r--;) {
+      __asm__ __volatile__ (
+          "lw             %[t1],          0(%[dest])                      \n\t"
+          "lw             %[t2],          4(%[dest])                      \n\t"
+          "lw             %[t3],          8(%[dest])                      \n\t"
+          "lw             %[t4],          12(%[dest])                     \n\t"
+          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
+          "sw             %[vector_1],    0(%[dest])                      \n\t"
+          "sw             %[vector_2],    4(%[dest])                      \n\t"
+          "sw             %[vector_3],    8(%[dest])                      \n\t"
+          "sw             %[vector_4],    12(%[dest])                     \n\t"
+          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
+
+          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+          : "memory"  /* lw/sw above read and write the dest row */
+      );
+    }
+  } else {
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    __asm__ __volatile__ (
+        "replv.qb   %[vector_a1],   %[a1]   \n\t"
+        : [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 16; r--;) {
+      __asm__ __volatile__ (
+          "lw             %[t1],          0(%[dest])                      \n\t"
+          "lw             %[t2],          4(%[dest])                      \n\t"
+          "lw             %[t3],          8(%[dest])                      \n\t"
+          "lw             %[t4],          12(%[dest])                     \n\t"
+          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
+          "sw             %[vector_1],    0(%[dest])                      \n\t"
+          "sw             %[vector_2],    4(%[dest])                      \n\t"
+          "sw             %[vector_3],    8(%[dest])                      \n\t"
+          "sw             %[vector_4],    12(%[dest])                     \n\t"
+          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
+
+          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+          : "memory"  /* lw/sw above read and write the dest row */
+      );
+    }
+  }
+}
+
+void iadst16_dspr2(const int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+  /* 16-point inverse ADST (per the name) in plain C: 16 values in, 16 out. */
+  int x0 = input[15];  // inputs are loaded in a permuted order
+  int x1 = input[0];
+  int x2 = input[13];
+  int x3 = input[2];
+  int x4 = input[11];
+  int x5 = input[4];
+  int x6 = input[9];
+  int x7 = input[6];
+  int x8 = input[7];
+  int x9 = input[8];
+  int x10 = input[5];
+  int x11 = input[10];
+  int x12 = input[3];
+  int x13 = input[12];
+  int x14 = input[1];
+  int x15 = input[14];
+  // All-zero input: write 16 zeros and skip the arithmetic.
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = output[8]
+              = output[9] = output[10] = output[11] = output[12]
+              = output[13] = output[14] = output[15] = 0;
+    return;
+  }
+
+  // stage 1: cospi-weighted pairs, then rounded sums/differences 8 apart
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = dct_const_round_shift(s0 + s8);
+  x1 = dct_const_round_shift(s1 + s9);
+  x2 = dct_const_round_shift(s2 + s10);
+  x3 = dct_const_round_shift(s3 + s11);
+  x4 = dct_const_round_shift(s4 + s12);
+  x5 = dct_const_round_shift(s5 + s13);
+  x6 = dct_const_round_shift(s6 + s14);
+  x7 = dct_const_round_shift(s7 + s15);
+  x8  = dct_const_round_shift(s0 - s8);
+  x9  = dct_const_round_shift(s1 - s9);
+  x10 = dct_const_round_shift(s2 - s10);
+  x11 = dct_const_round_shift(s3 - s11);
+  x12 = dct_const_round_shift(s4 - s12);
+  x13 = dct_const_round_shift(s5 - s13);
+  x14 = dct_const_round_shift(s6 - s14);
+  x15 = dct_const_round_shift(s7 - s15);
+
+  // stage 2: x0..x7 pass through unscaled; x8..x15 are cospi-weighted
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = s0 + s4;  // unscaled terms: plain sums/differences, no rounding shift
+  x1 = s1 + s5;
+  x2 = s2 + s6;
+  x3 = s3 + s7;
+  x4 = s0 - s4;
+  x5 = s1 - s5;
+  x6 = s2 - s6;
+  x7 = s3 - s7;
+  x8 = dct_const_round_shift(s8 + s12);
+  x9 = dct_const_round_shift(s9 + s13);
+  x10 = dct_const_round_shift(s10 + s14);
+  x11 = dct_const_round_shift(s11 + s15);
+  x12 = dct_const_round_shift(s8 - s12);
+  x13 = dct_const_round_shift(s9 - s13);
+  x14 = dct_const_round_shift(s10 - s14);
+  x15 = dct_const_round_shift(s11 - s15);
+
+  // stage 3: x4..x7 and x12..x15 are cospi-weighted; the rest pass through
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = dct_const_round_shift(s4 + s6);
+  x5 = dct_const_round_shift(s5 + s7);
+  x6 = dct_const_round_shift(s4 - s6);
+  x7 = dct_const_round_shift(s5 - s7);
+  x8 = s8 + s10;
+  x9 = s9 + s11;
+  x10 = s8 - s10;
+  x11 = s9 - s11;
+  x12 = dct_const_round_shift(s12 + s14);
+  x13 = dct_const_round_shift(s13 + s15);
+  x14 = dct_const_round_shift(s12 - s14);
+  x15 = dct_const_round_shift(s13 - s15);
+
+  // stage 4: scale pair sums/differences by +/-cospi_16_64, then round-shift
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = dct_const_round_shift(s2);
+  x3 = dct_const_round_shift(s3);
+  x6 = dct_const_round_shift(s6);
+  x7 = dct_const_round_shift(s7);
+  x10 = dct_const_round_shift(s10);
+  x11 = dct_const_round_shift(s11);
+  x14 = dct_const_round_shift(s14);
+  x15 = dct_const_round_shift(s15);
+  // Store in permuted order; x8, x4, x13 and x1 are negated on the way out.
+  output[0] =  x0;
+  output[1] = -x8;
+  output[2] =  x12;
+  output[3] = -x4;
+  output[4] =  x6;
+  output[5] =  x14;
+  output[6] =  x10;
+  output[7] =  x2;
+  output[8] =  x3;
+  output[9] =  x11;
+  output[10] =  x15;
+  output[11] =  x7;
+  output[12] =  x5;
+  output[13] = -x13;
+  output[14] =  x9;
+  output[15] = -x1;
+}
+
+
+#endif  // HAVE_DSPR2
diff --git a/libs/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c
new file mode 100644
index 0000000000..553acb0f5b
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c
@@ -0,0 +1,1068 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                   int dest_stride) {
+  int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+  int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+  int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
+  int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;
+  int16_t step1_27, step1_28, step1_29, step1_30, step1_31;
+  int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+  int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+  int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+  int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+  int16_t step2_28, step2_29, step2_30, step2_31;
+  int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+  int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+  int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;
+  int16_t step3_28, step3_29, step3_30, step3_31;
+  int temp0, temp1, temp2, temp3;
+  int load1, load2, load3, load4;
+  int result1, result2;
+  int i, temp21;
+  uint8_t *dest_pix, *dest_pix1;
+  const int const_2_power_13 = 8192;
+  uint8_t *cm = vpx_ff_cropTbl;
+
+  /* prefetch vpx_ff_cropTbl */
+  prefetch_load(vpx_ff_cropTbl);
+  prefetch_load(vpx_ff_cropTbl +  32);
+  prefetch_load(vpx_ff_cropTbl +  64);
+  prefetch_load(vpx_ff_cropTbl +  96);
+  prefetch_load(vpx_ff_cropTbl + 128);
+  prefetch_load(vpx_ff_cropTbl + 160);
+  prefetch_load(vpx_ff_cropTbl + 192);
+  prefetch_load(vpx_ff_cropTbl + 224);
+
+  for (i = 0; i < 32; ++i) {
+    dest_pix = dest + i;
+    dest_pix1 = dest + i + 31 * dest_stride;
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             2(%[input])                     \n\t"
+        "lh       %[load2],             62(%[input])                    \n\t"
+        "lh       %[load3],             34(%[input])                    \n\t"
+        "lh       %[load4],             30(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+
+        "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
+        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
+        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
+
+        "extp     %[step1_17],          $ac1,           31              \n\t"
+        "extp     %[step1_30],          $ac3,           31              \n\t"
+        "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"
+        "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
+          [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
+          [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             18(%[input])                    \n\t"
+        "lh       %[load2],             46(%[input])                    \n\t"
+        "lh       %[load3],             50(%[input])                    \n\t"
+        "lh       %[load4],             14(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+
+        "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
+        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
+
+        "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
+        "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
+
+        "extp     %[step1_18],          $ac1,           31              \n\t"
+        "extp     %[step1_29],          $ac3,           31              \n\t"
+        "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"
+        "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
+          [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
+          [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             10(%[input])                    \n\t"
+        "lh       %[load2],             54(%[input])                    \n\t"
+        "lh       %[load3],             42(%[input])                    \n\t"
+        "lh       %[load4],             22(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+
+        "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
+        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
+
+        "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"
+        "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"
+        "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"
+
+        "extp     %[step1_21],          $ac1,           31              \n\t"
+        "extp     %[step1_26],          $ac3,           31              \n\t"
+        "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"
+        "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
+          [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
+          [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
+          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             26(%[input])                    \n\t"
+        "lh       %[load2],             38(%[input])                    \n\t"
+        "lh       %[load3],             58(%[input])                    \n\t"
+        "lh       %[load4],              6(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+        "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+        "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
+        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
+        "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"
+        "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"
+        "extp     %[step1_22],          $ac1,           31              \n\t"
+        "extp     %[step1_25],          $ac3,           31              \n\t"
+        "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"
+        "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
+          [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
+          [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
+          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],              4(%[input])                    \n\t"
+        "lh       %[load2],             60(%[input])                    \n\t"
+        "lh       %[load3],             36(%[input])                    \n\t"
+        "lh       %[load4],             28(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+        "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+        "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
+        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
+        "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"
+        "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"
+        "extp     %[step2_9],           $ac1,           31              \n\t"
+        "extp     %[step2_14],          $ac3,           31              \n\t"
+        "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"
+        "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
+          [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             20(%[input])                    \n\t"
+        "lh       %[load2],             44(%[input])                    \n\t"
+        "lh       %[load3],             52(%[input])                    \n\t"
+        "lh       %[load4],             12(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+        "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+        "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
+        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
+        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
+        "extp     %[step2_10],          $ac1,           31              \n\t"
+        "extp     %[step2_13],          $ac3,           31              \n\t"
+        "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"
+        "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+          [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+    );
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"
+        "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"
+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"
+        "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"
+        "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"
+        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"
+        "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"
+        "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+        "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"
+        "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"
+        "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"
+        "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"
+
+        "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"
+        "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"
+        "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"
+        "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"
+        "extp     %[step3_10],          $ac0,           31              \n\t"
+        "extp     %[step3_13],          $ac1,           31              \n\t"
+        "extp     %[step3_11],          $ac2,           31              \n\t"
+        "extp     %[step3_12],          $ac3,           31              \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
+          [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
+          [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
+          [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
+        : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8),
+          [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+          [step2_11] "r" (step2_11), [step2_12] "r" (step2_12),
+          [step2_13] "r" (step2_13), [step2_14] "r" (step2_14),
+          [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    step2_18 = step1_17 - step1_18;
+    step2_29 = step1_30 - step1_29;
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "msub     $ac0,                 %[step2_18],    %[cospi_8_64]   \n\t"
+        "madd     $ac0,                 %[step2_29],    %[cospi_24_64]  \n\t"
+        "extp     %[step3_18],          $ac0,           31              \n\t"
+
+        : [step3_18] "=r" (step3_18)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
+    step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    step2_19 = step1_16 - step1_19;
+    step2_28 = step1_31 - step1_28;
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "msub     $ac0,                 %[step2_19],    %[cospi_8_64]   \n\t"
+        "madd     $ac0,                 %[step2_28],    %[cospi_24_64]  \n\t"
+        "extp     %[step3_19],          $ac0,           31              \n\t"
+
+        : [step3_19] "=r" (step3_19)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
+    step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    step3_16 = step1_16 + step1_19;
+    step3_17 = step1_17 + step1_18;
+    step3_30 = step1_29 + step1_30;
+    step3_31 = step1_28 + step1_31;
+
+    step2_20 = step1_23 - step1_20;
+    step2_27 = step1_24 - step1_27;
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "msub     $ac0,                 %[step2_20],    %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[step2_27],    %[cospi_8_64]   \n\t"
+        "extp     %[step3_20],          $ac0,           31              \n\t"
+
+        : [step3_20] "=r" (step3_20)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
+    step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    step2_21 = step1_22 - step1_21;
+    step2_26 = step1_25 - step1_26;
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "msub     $ac1,                 %[step2_21],    %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[step2_26],    %[cospi_8_64]   \n\t"
+        "extp     %[step3_21],          $ac1,           31              \n\t"
+
+        : [step3_21] "=r" (step3_21)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
+    step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    step3_22 = step1_21 + step1_22;
+    step3_23 = step1_20 + step1_23;
+    step3_24 = step1_24 + step1_27;
+    step3_25 = step1_25 + step1_26;
+
+    step2_16 = step3_16 + step3_23;
+    step2_17 = step3_17 + step3_22;
+    step2_18 = step3_18 + step3_21;
+    step2_19 = step3_19 + step3_20;
+    step2_20 = step3_19 - step3_20;
+    step2_21 = step3_18 - step3_21;
+    step2_22 = step3_17 - step3_22;
+    step2_23 = step3_16 - step3_23;
+
+    step2_24 = step3_31 - step3_24;
+    step2_25 = step3_30 - step3_25;
+    step2_26 = step3_29 - step3_26;
+    step2_27 = step3_28 - step3_27;
+    step2_28 = step3_28 + step3_27;
+    step2_29 = step3_29 + step3_26;
+    step2_30 = step3_30 + step3_25;
+    step2_31 = step3_31 + step3_24;
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             0(%[input])                     \n\t"
+        "lh       %[load2],             32(%[input])                    \n\t"
+        "lh       %[load3],             16(%[input])                    \n\t"
+        "lh       %[load4],             48(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "add      %[result1],           %[load1],       %[load2]        \n\t"
+        "sub      %[result2],           %[load1],       %[load2]        \n\t"
+        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
+        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
+        "extp     %[temp2],             $ac3,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
+        "extp     %[temp3],             $ac1,           31              \n\t"
+        "add      %[step1_0],           %[temp0],       %[temp3]        \n\t"
+        "add      %[step1_1],           %[temp1],       %[temp2]        \n\t"
+        "sub      %[step1_2],           %[temp1],       %[temp2]        \n\t"
+        "sub      %[step1_3],           %[temp0],       %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             8(%[input])                     \n\t"
+        "lh       %[load2],             56(%[input])                    \n\t"
+        "lh       %[load3],             40(%[input])                    \n\t"
+        "lh       %[load4],             24(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+        "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
+        "sub      %[load1],             %[load1],       %[temp0]        \n\t"
+        "add      %[load1],             %[load1],       %[temp1]        \n\t"
+        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
+        "sub      %[load2],             %[load2],       %[temp2]        \n\t"
+        "add      %[load2],             %[load2],       %[temp3]        \n\t"
+        "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_5],           $ac1,           31              \n\t"
+        "extp     %[step1_6],           $ac3,           31              \n\t"
+        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"
+        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    step2_0 = step1_0 + step1_7;
+    step2_1 = step1_1 + step1_6;
+    step2_2 = step1_2 + step1_5;
+    step2_3 = step1_3 + step1_4;
+    step2_4 = step1_3 - step1_4;
+    step2_5 = step1_2 - step1_5;
+    step2_6 = step1_1 - step1_6;
+    step2_7 = step1_0 - step1_7;
+
+    // stage 7
+    step1_0 = step2_0 + step3_15;
+    step1_1 = step2_1 + step3_14;
+    step1_2 = step2_2 + step3_13;
+    step1_3 = step2_3 + step3_12;
+    step1_4 = step2_4 + step3_11;
+    step1_5 = step2_5 + step3_10;
+    step1_6 = step2_6 + step3_9;
+    step1_7 = step2_7 + step3_8;
+    step1_8 = step2_7 - step3_8;
+    step1_9 = step2_6 - step3_9;
+    step1_10 = step2_5 - step3_10;
+    step1_11 = step2_4 - step3_11;
+    step1_12 = step2_3 - step3_12;
+    step1_13 = step2_2 - step3_13;
+    step1_14 = step2_1 - step3_14;
+    step1_15 = step2_0 - step3_15;
+
+    __asm__ __volatile__ (
+        "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
+        "extp     %[step1_20],          $ac0,           31              \n\t"
+
+        : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
+        : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20),
+          [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    temp21 = (step2_20 + step2_27) * cospi_16_64;
+    step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    __asm__ __volatile__ (
+        "sub      %[temp0],             %[step2_26],    %[step2_21]     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
+        "extp     %[step1_21],          $ac0,           31              \n\t"
+
+        : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
+        : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26),
+          [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    temp21 = (step2_21 + step2_26) * cospi_16_64;
+    step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    __asm__ __volatile__ (
+        "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
+        "extp     %[step1_22],          $ac0,           31              \n\t"
+
+        : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
+        : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25),
+          [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    temp21 = (step2_22 + step2_25) * cospi_16_64;
+    step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    __asm__ __volatile__ (
+        "sub      %[temp0],             %[step2_24],    %[step2_23]     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
+        "extp     %[step1_23],          $ac0,           31              \n\t"
+
+        : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
+        : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24),
+          [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    temp21 = (step2_23 + step2_24) * cospi_16_64;
+    step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    __asm__ __volatile__ (
+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
+        "add      %[temp0],         %[step1_0],         %[step2_31]     \n\t"
+        "addi     %[temp0],         %[temp0],           32              \n\t"
+        "sra      %[temp0],         %[temp0],           6               \n\t"
+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "add      %[temp1],         %[step1_1],         %[step2_30]     \n\t"
+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
+        "addi     %[temp1],         %[temp1],           32              \n\t"
+        "sra      %[temp1],         %[temp1],           6               \n\t"
+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
+        "add      %[temp0],         %[step1_2],         %[step2_29]     \n\t"
+        "addi     %[temp0],         %[temp0],           32              \n\t"
+        "sra      %[temp0],         %[temp0],           6               \n\t"
+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "add      %[temp1],         %[step1_3],         %[step2_28]     \n\t"
+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
+        "addi     %[temp1],         %[temp1],           32              \n\t"
+        "sra      %[temp1],         %[temp1],           6               \n\t"
+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+          [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+          [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+          [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+          [step2_28] "r" (step2_28), [step2_29] "r" (step2_29),
+          [step2_30] "r" (step2_30), [step2_31] "r" (step2_31)
+    );
+
+    step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
+    step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
+    step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
+    step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
+
+    __asm__ __volatile__ (
+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+
+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+          [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+          [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+    );
+
+    __asm__ __volatile__ (
+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
+        "add      %[temp0],         %[step1_4],         %[step1_27]     \n\t"
+        "addi     %[temp0],         %[temp0],           32              \n\t"
+        "sra      %[temp0],         %[temp0],           6               \n\t"
+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "add      %[temp1],         %[step1_5],         %[step1_26]     \n\t"
+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
+        "addi     %[temp1],         %[temp1],           32              \n\t"
+        "sra      %[temp1],         %[temp1],           6               \n\t"
+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
+        "add      %[temp0],         %[step1_6],         %[step1_25]     \n\t"
+        "addi     %[temp0],         %[temp0],           32              \n\t"
+        "sra      %[temp0],         %[temp0],           6               \n\t"
+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "add      %[temp1],         %[step1_7],         %[step1_24]     \n\t"
+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
+        "addi     %[temp1],         %[temp1],           32              \n\t"
+        "sra      %[temp1],         %[temp1],           6               \n\t"
+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+          [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+          [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+          [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+          [step1_24] "r" (step1_24), [step1_25] "r" (step1_25),
+          [step1_26] "r" (step1_26), [step1_27] "r" (step1_27)
+    );
+
+    step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
+    step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
+    step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
+    step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
+
+    __asm__ __volatile__ (
+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+
+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+          [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+          [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+    );
+
+    __asm__ __volatile__ (
+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
+        "add      %[temp0],         %[step1_8],         %[step1_23]     \n\t"
+        "addi     %[temp0],         %[temp0],           32              \n\t"
+        "sra      %[temp0],         %[temp0],           6               \n\t"
+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "add      %[temp1],         %[step1_9],         %[step1_22]     \n\t"
+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
+        "addi     %[temp1],         %[temp1],           32              \n\t"
+        "sra      %[temp1],         %[temp1],           6               \n\t"
+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
+        "add      %[temp0],         %[step1_10],        %[step1_21]     \n\t"
+        "addi     %[temp0],         %[temp0],           32              \n\t"
+        "sra      %[temp0],         %[temp0],           6               \n\t"
+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "add      %[temp1],         %[step1_11],        %[step1_20]     \n\t"
+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
+        "addi     %[temp1],         %[temp1],           32              \n\t"
+        "sra      %[temp1],         %[temp1],           6               \n\t"
+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+          [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+          [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
+          [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+          [step1_20] "r" (step1_20), [step1_21] "r" (step1_21),
+          [step1_22] "r" (step1_22), [step1_23] "r" (step1_23)
+    );
+
+    step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
+    step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
+    step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
+    step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
+
+    __asm__ __volatile__ (
+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+
+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+          [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+          [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+    );
+
+    __asm__ __volatile__ (
+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
+        "add      %[temp0],         %[step1_12],        %[step2_19]     \n\t"
+        "addi     %[temp0],         %[temp0],           32              \n\t"
+        "sra      %[temp0],         %[temp0],           6               \n\t"
+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "add      %[temp1],         %[step1_13],        %[step2_18]     \n\t"
+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
+        "addi     %[temp1],         %[temp1],           32              \n\t"
+        "sra      %[temp1],         %[temp1],           6               \n\t"
+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+
+        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
+        "add      %[temp0],         %[step1_14],        %[step2_17]     \n\t"
+        "addi     %[temp0],         %[temp0],           32              \n\t"
+        "sra      %[temp0],         %[temp0],           6               \n\t"
+        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "add      %[temp1],         %[step1_15],        %[step2_16]     \n\t"
+        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
+        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
+        "addi     %[temp1],         %[temp1],           32              \n\t"
+        "sra      %[temp1],         %[temp1],           6               \n\t"
+        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+          [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+          [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
+          [step1_14] "r" (step1_14), [step1_15] "r" (step1_15),
+          [step2_16] "r" (step2_16), [step2_17] "r" (step2_17),
+          [step2_18] "r" (step2_18), [step2_19] "r" (step2_19)
+    );
+
+    step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
+    step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
+    step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
+    step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
+
+    __asm__ __volatile__ (
+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+
+        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
+        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
+        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
+        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
+        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
+        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
+        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
+        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+          [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+        : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+          [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+          [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+    );
+
+    input += 32;
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vpx_dsp/mips/itrans32_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans32_dspr2.c
new file mode 100644
index 0000000000..523da1df1b
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/itrans32_dspr2.c
@@ -0,0 +1,1073 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
+                              uint32_t no_rows) {
+  int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+  int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+  int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
+  int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
+  int16_t step1_28, step1_29, step1_30, step1_31;
+  int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+  int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+  int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+  int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+  int16_t step2_28, step2_29, step2_30, step2_31;
+  int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+  int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+  int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
+  int16_t step3_29, step3_30, step3_31;
+  int temp0, temp1, temp2, temp3;
+  int load1, load2, load3, load4;
+  int result1, result2;
+  int temp21;
+  int i;
+  const int const_2_power_13 = 8192;
+  const int32_t *input_int;
+
+  for (i = no_rows; i--; ) {
+    input_int = (const int32_t *)input;
+
+    if (!(input_int[0]  | input_int[1]  | input_int[2]  | input_int[3]  |
+          input_int[4]  | input_int[5]  | input_int[6]  | input_int[7]  |
+          input_int[8]  | input_int[9]  | input_int[10] | input_int[11] |
+          input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
+      input += 32;
+
+      __asm__ __volatile__ (
+          "sh     $zero,     0(%[output])     \n\t"
+          "sh     $zero,    64(%[output])     \n\t"
+          "sh     $zero,   128(%[output])     \n\t"
+          "sh     $zero,   192(%[output])     \n\t"
+          "sh     $zero,   256(%[output])     \n\t"
+          "sh     $zero,   320(%[output])     \n\t"
+          "sh     $zero,   384(%[output])     \n\t"
+          "sh     $zero,   448(%[output])     \n\t"
+          "sh     $zero,   512(%[output])     \n\t"
+          "sh     $zero,   576(%[output])     \n\t"
+          "sh     $zero,   640(%[output])     \n\t"
+          "sh     $zero,   704(%[output])     \n\t"
+          "sh     $zero,   768(%[output])     \n\t"
+          "sh     $zero,   832(%[output])     \n\t"
+          "sh     $zero,   896(%[output])     \n\t"
+          "sh     $zero,   960(%[output])     \n\t"
+          "sh     $zero,  1024(%[output])     \n\t"
+          "sh     $zero,  1088(%[output])     \n\t"
+          "sh     $zero,  1152(%[output])     \n\t"
+          "sh     $zero,  1216(%[output])     \n\t"
+          "sh     $zero,  1280(%[output])     \n\t"
+          "sh     $zero,  1344(%[output])     \n\t"
+          "sh     $zero,  1408(%[output])     \n\t"
+          "sh     $zero,  1472(%[output])     \n\t"
+          "sh     $zero,  1536(%[output])     \n\t"
+          "sh     $zero,  1600(%[output])     \n\t"
+          "sh     $zero,  1664(%[output])     \n\t"
+          "sh     $zero,  1728(%[output])     \n\t"
+          "sh     $zero,  1792(%[output])     \n\t"
+          "sh     $zero,  1856(%[output])     \n\t"
+          "sh     $zero,  1920(%[output])     \n\t"
+          "sh     $zero,  1984(%[output])     \n\t"
+
+          :
+          : [output] "r" (output)
+      );
+
+      output += 1;
+
+      continue;
+    }
+
+    /* prefetch row */
+    prefetch_load((const uint8_t *)(input + 32));
+    prefetch_load((const uint8_t *)(input + 48));
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             2(%[input])                     \n\t"
+        "lh       %[load2],             62(%[input])                    \n\t"
+        "lh       %[load3],             34(%[input])                    \n\t"
+        "lh       %[load4],             30(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+
+        "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
+        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
+        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
+
+        "extp     %[step1_17],          $ac1,           31              \n\t"
+        "extp     %[step1_30],          $ac3,           31              \n\t"
+        "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"
+        "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
+          [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
+          [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             18(%[input])                    \n\t"
+        "lh       %[load2],             46(%[input])                    \n\t"
+        "lh       %[load3],             50(%[input])                    \n\t"
+        "lh       %[load4],             14(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+
+        "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
+        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
+
+        "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
+        "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
+
+        "extp     %[step1_18],          $ac1,           31              \n\t"
+        "extp     %[step1_29],          $ac3,           31              \n\t"
+        "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"
+        "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
+          [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
+          [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             10(%[input])                    \n\t"
+        "lh       %[load2],             54(%[input])                    \n\t"
+        "lh       %[load3],             42(%[input])                    \n\t"
+        "lh       %[load4],             22(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+
+        "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
+        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
+
+        "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"
+        "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"
+        "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"
+
+        "extp     %[step1_21],          $ac1,           31              \n\t"
+        "extp     %[step1_26],          $ac3,           31              \n\t"
+        "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"
+        "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
+          [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
+          [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
+          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             26(%[input])                    \n\t"
+        "lh       %[load2],             38(%[input])                    \n\t"
+        "lh       %[load3],             58(%[input])                    \n\t"
+        "lh       %[load4],              6(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+
+        "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
+        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
+
+        "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"
+        "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"
+
+        "extp     %[step1_22],          $ac1,           31              \n\t"
+        "extp     %[step1_25],          $ac3,           31              \n\t"
+        "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"
+        "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
+          [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
+          [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
+          [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],              4(%[input])                    \n\t"
+        "lh       %[load2],             60(%[input])                    \n\t"
+        "lh       %[load3],             36(%[input])                    \n\t"
+        "lh       %[load4],             28(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+
+        "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
+        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
+
+        "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"
+        "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_9],           $ac1,           31              \n\t"
+        "extp     %[step2_14],          $ac3,           31              \n\t"
+        "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"
+        "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
+          [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+          [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             20(%[input])                    \n\t"
+        "lh       %[load2],             44(%[input])                    \n\t"
+        "lh       %[load3],             52(%[input])                    \n\t"
+        "lh       %[load4],             12(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+
+        "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
+        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
+
+        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
+
+        "extp     %[step2_10],          $ac1,           31              \n\t"
+        "extp     %[step2_13],          $ac3,           31              \n\t"
+        "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"
+        "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+          [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+          [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+          [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+    );
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"
+        "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"
+        "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"
+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"
+        "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"
+        "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"
+        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"
+        "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"
+        "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"
+        "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+        "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"
+        "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"
+        "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"
+        "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"
+
+        "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"
+        "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"
+        "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"
+        "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"
+
+        "extp     %[step3_10],          $ac0,           31              \n\t"
+        "extp     %[step3_13],          $ac1,           31              \n\t"
+        "extp     %[step3_11],          $ac2,           31              \n\t"
+        "extp     %[step3_12],          $ac3,           31              \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
+          [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
+          [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
+          [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
+          [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
+          [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
+          [step2_14] "r" (step2_14), [step2_15] "r" (step2_15),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    step2_18 = step1_17 - step1_18;
+    step2_29 = step1_30 - step1_29;
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "msub     $ac0,                 %[step2_18],    %[cospi_8_64]   \n\t"
+        "madd     $ac0,                 %[step2_29],    %[cospi_24_64]  \n\t"
+        "extp     %[step3_18],          $ac0,           31              \n\t"
+
+        : [step3_18] "=r" (step3_18)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
+    step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    step2_19 = step1_16 - step1_19;
+    step2_28 = step1_31 - step1_28;
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "msub     $ac0,                 %[step2_19],    %[cospi_8_64]   \n\t"
+        "madd     $ac0,                 %[step2_28],    %[cospi_24_64]  \n\t"
+        "extp     %[step3_19],          $ac0,           31              \n\t"
+
+        : [step3_19] "=r" (step3_19)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
+    step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    step3_16 = step1_16 + step1_19;
+    step3_17 = step1_17 + step1_18;
+    step3_30 = step1_29 + step1_30;
+    step3_31 = step1_28 + step1_31;
+
+    step2_20 = step1_23 - step1_20;
+    step2_27 = step1_24 - step1_27;
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "msub     $ac0,                 %[step2_20],    %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[step2_27],    %[cospi_8_64]   \n\t"
+        "extp     %[step3_20],          $ac0,           31              \n\t"
+
+        : [step3_20] "=r" (step3_20)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
+    step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    step2_21 = step1_22 - step1_21;
+    step2_26 = step1_25 - step1_26;
+
+    __asm__ __volatile__ (
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "msub     $ac1,                 %[step2_21],    %[cospi_24_64]  \n\t"
+        "msub     $ac1,                 %[step2_26],    %[cospi_8_64]   \n\t"
+        "extp     %[step3_21],          $ac1,           31              \n\t"
+
+        : [step3_21] "=r" (step3_21)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+    );
+
+    temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
+    step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    step3_22 = step1_21 + step1_22;
+    step3_23 = step1_20 + step1_23;
+    step3_24 = step1_24 + step1_27;
+    step3_25 = step1_25 + step1_26;
+
+    step2_16 = step3_16 + step3_23;
+    step2_17 = step3_17 + step3_22;
+    step2_18 = step3_18 + step3_21;
+    step2_19 = step3_19 + step3_20;
+    step2_20 = step3_19 - step3_20;
+    step2_21 = step3_18 - step3_21;
+    step2_22 = step3_17 - step3_22;
+    step2_23 = step3_16 - step3_23;
+
+    step2_24 = step3_31 - step3_24;
+    step2_25 = step3_30 - step3_25;
+    step2_26 = step3_29 - step3_26;
+    step2_27 = step3_28 - step3_27;
+    step2_28 = step3_28 + step3_27;
+    step2_29 = step3_29 + step3_26;
+    step2_30 = step3_30 + step3_25;
+    step2_31 = step3_31 + step3_24;
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             0(%[input])                     \n\t"
+        "lh       %[load2],             32(%[input])                    \n\t"
+        "lh       %[load3],             16(%[input])                    \n\t"
+        "lh       %[load4],             48(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+        "add      %[result1],           %[load1],       %[load2]        \n\t"
+        "sub      %[result2],           %[load1],       %[load2]        \n\t"
+        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
+        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
+        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
+        "extp     %[temp2],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
+        "extp     %[temp3],             $ac1,           31              \n\t"
+
+        "add      %[step1_0],          %[temp0],        %[temp3]        \n\t"
+        "add      %[step1_1],          %[temp1],        %[temp2]        \n\t"
+        "sub      %[step1_2],          %[temp1],        %[temp2]        \n\t"
+        "sub      %[step1_3],          %[temp0],        %[temp3]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [result1] "=&r" (result1), [result2] "=&r" (result2),
+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+          [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_16_64] "r" (cospi_16_64),
+          [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+
+    );
+
+    __asm__ __volatile__ (
+        "lh       %[load1],             8(%[input])                     \n\t"
+        "lh       %[load2],             56(%[input])                    \n\t"
+        "lh       %[load3],             40(%[input])                    \n\t"
+        "lh       %[load4],             24(%[input])                    \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
+        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
+        "extp     %[temp0],             $ac1,           31              \n\t"
+
+        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
+        "extp     %[temp3],             $ac3,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
+        "mthi     $zero,                $ac2                            \n\t"
+
+        "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"
+        "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"
+        "extp     %[temp1],             $ac2,           31              \n\t"
+
+        "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"
+        "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"
+        "extp     %[temp2],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
+        "mthi     $zero,                $ac3                            \n\t"
+
+        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
+        "sub      %[load1],             %[load1],       %[temp0]        \n\t"
+        "add      %[load1],             %[load1],       %[temp1]        \n\t"
+
+        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
+        "sub      %[load2],             %[load2],       %[temp2]        \n\t"
+        "add      %[load2],             %[load2],       %[temp3]        \n\t"
+
+        "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"
+        "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"
+
+        "extp     %[step1_5],           $ac1,           31              \n\t"
+        "extp     %[step1_6],           $ac3,           31              \n\t"
+        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"
+        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"
+
+        : [load1] "=&r" (load1), [load2] "=&r" (load2),
+          [load3] "=&r" (load3), [load4] "=&r" (load4),
+          [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+          [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+        : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    step2_0 = step1_0 + step1_7;
+    step2_1 = step1_1 + step1_6;
+    step2_2 = step1_2 + step1_5;
+    step2_3 = step1_3 + step1_4;
+    step2_4 = step1_3 - step1_4;
+    step2_5 = step1_2 - step1_5;
+    step2_6 = step1_1 - step1_6;
+    step2_7 = step1_0 - step1_7;
+
+    step1_0 = step2_0 + step3_15;
+    step1_1 = step2_1 + step3_14;
+    step1_2 = step2_2 + step3_13;
+    step1_3 = step2_3 + step3_12;
+    step1_4 = step2_4 + step3_11;
+    step1_5 = step2_5 + step3_10;
+    step1_6 = step2_6 + step3_9;
+    step1_7 = step2_7 + step3_8;
+    step1_8 = step2_7 - step3_8;
+    step1_9 = step2_6 - step3_9;
+    step1_10 = step2_5 - step3_10;
+    step1_11 = step2_4 - step3_11;
+    step1_12 = step2_3 - step3_12;
+    step1_13 = step2_2 - step3_13;
+    step1_14 = step2_1 - step3_14;
+    step1_15 = step2_0 - step3_15;
+
+    __asm__ __volatile__ (
+        "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
+        "extp     %[step1_20],          $ac0,           31              \n\t"
+
+        : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    temp21 = (step2_20 + step2_27) * cospi_16_64;
+    step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    __asm__ __volatile__ (
+        "sub      %[temp0],             %[step2_26],    %[step2_21]     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
+        "extp     %[step1_21],          $ac0,           31              \n\t"
+
+        : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_26] "r" (step2_26), [step2_21] "r" (step2_21),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    temp21 = (step2_21 + step2_26) * cospi_16_64;
+    step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    __asm__ __volatile__ (
+        "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
+        "extp     %[step1_22],          $ac0,           31              \n\t"
+
+        : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_25] "r" (step2_25), [step2_22] "r" (step2_22),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    temp21 = (step2_22 + step2_25) * cospi_16_64;
+    step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    __asm__ __volatile__ (
+        "sub      %[temp0],             %[step2_24],    %[step2_23]     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
+        "extp     %[step1_23],          $ac0,           31              \n\t"
+
+        : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [step2_24] "r" (step2_24), [step2_23] "r" (step2_23),
+          [cospi_16_64] "r" (cospi_16_64)
+    );
+
+    temp21 = (step2_23 + step2_24) * cospi_16_64;
+    step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+    // final stage
+    output[0 * 32] = step1_0 + step2_31;
+    output[1 * 32] = step1_1 + step2_30;
+    output[2 * 32] = step1_2 + step2_29;
+    output[3 * 32] = step1_3 + step2_28;
+    output[4 * 32] = step1_4 + step1_27;
+    output[5 * 32] = step1_5 + step1_26;
+    output[6 * 32] = step1_6 + step1_25;
+    output[7 * 32] = step1_7 + step1_24;
+    output[8 * 32] = step1_8 + step1_23;
+    output[9 * 32] = step1_9 + step1_22;
+    output[10 * 32] = step1_10 + step1_21;
+    output[11 * 32] = step1_11 + step1_20;
+    output[12 * 32] = step1_12 + step2_19;
+    output[13 * 32] = step1_13 + step2_18;
+    output[14 * 32] = step1_14 + step2_17;
+    output[15 * 32] = step1_15 + step2_16;
+    output[16 * 32] = step1_15 - step2_16;
+    output[17 * 32] = step1_14 - step2_17;
+    output[18 * 32] = step1_13 - step2_18;
+    output[19 * 32] = step1_12 - step2_19;
+    output[20 * 32] = step1_11 - step1_20;
+    output[21 * 32] = step1_10 - step1_21;
+    output[22 * 32] = step1_9 - step1_22;
+    output[23 * 32] = step1_8 - step1_23;
+    output[24 * 32] = step1_7 - step1_24;
+    output[25 * 32] = step1_6 - step1_25;
+    output[26 * 32] = step1_5 - step1_26;
+    output[27 * 32] = step1_4 - step1_27;
+    output[28 * 32] = step1_3 - step2_28;
+    output[29 * 32] = step1_2 - step2_29;
+    output[30 * 32] = step1_1 - step2_30;
+    output[31 * 32] = step1_0 - step2_31;
+
+    input += 32;
+    output += 1;
+  }
+}
+
+void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
+  /* Full 32x32 path: row pass (count 32), then the column/add pass. */
+  DECLARE_ALIGNED(32, int16_t,  row_out[32 * 32]);
+  const uint32_t extp_pos = 45;
+
+  /* set the bit position used when extracting from the accumulators */
+  __asm__ __volatile__ (
+    "wrdsp      %[extp_pos],    1       \n\t"
+    :
+    : [extp_pos] "r" (extp_pos)
+  );
+
+  /* Rows: results land in the aligned scratch block */
+  idct32_rows_dspr2(input, row_out, 32);
+
+  /* Columns: consumes the scratch block together with dest/dest_stride */
+  vpx_idct32_cols_add_blk_dspr2(row_out, dest, dest_stride);
+}
+
+void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
+                                int stride) {
+  /* Reduced variant: the row pass is invoked with a count of 8, not 32. */
+  DECLARE_ALIGNED(32, int16_t,  out[32 * 32]);
+  int16_t *outptr = out;
+  uint32_t i;
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // Rows
+  idct32_rows_dspr2(input, outptr, 8);
+
+  outptr += 8;  /* then zero out[8..31] (12 words = 48 bytes) */
+  __asm__ __volatile__ (
+      "sw     $zero,      0(%[outptr])     \n\t"
+      "sw     $zero,      4(%[outptr])     \n\t"
+      "sw     $zero,      8(%[outptr])     \n\t"
+      "sw     $zero,     12(%[outptr])     \n\t"
+      "sw     $zero,     16(%[outptr])     \n\t"
+      "sw     $zero,     20(%[outptr])     \n\t"
+      "sw     $zero,     24(%[outptr])     \n\t"
+      "sw     $zero,     28(%[outptr])     \n\t"
+      "sw     $zero,     32(%[outptr])     \n\t"
+      "sw     $zero,     36(%[outptr])     \n\t"
+      "sw     $zero,     40(%[outptr])     \n\t"
+      "sw     $zero,     44(%[outptr])     \n\t"
+      :
+      : [outptr] "r" (outptr)
+      : "memory"  /* the stores modify out[] behind the compiler's back */
+  );
+
+  for (i = 0; i < 31; ++i) {  /* same fill for the other 31 lines of 32 */
+    outptr += 32;
+    __asm__ __volatile__ (
+        "sw     $zero,      0(%[outptr])     \n\t"
+        "sw     $zero,      4(%[outptr])     \n\t"
+        "sw     $zero,      8(%[outptr])     \n\t"
+        "sw     $zero,     12(%[outptr])     \n\t"
+        "sw     $zero,     16(%[outptr])     \n\t"
+        "sw     $zero,     20(%[outptr])     \n\t"
+        "sw     $zero,     24(%[outptr])     \n\t"
+        "sw     $zero,     28(%[outptr])     \n\t"
+        "sw     $zero,     32(%[outptr])     \n\t"
+        "sw     $zero,     36(%[outptr])     \n\t"
+        "sw     $zero,     40(%[outptr])     \n\t"
+        "sw     $zero,     44(%[outptr])     \n\t"
+        :
+        : [outptr] "r" (outptr)
+        : "memory"  /* out[] is written by the sw instructions */
+    );
+  }
+
+  // Columns
+  vpx_idct32_cols_add_blk_dspr2(out, dest, stride);
+}
+
+void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                               int stride) {
+  /* Only input[0] is read; one derived value updates 32 lines of dest. */
+  int       r, out;
+  int32_t   a1, absa1;
+  int32_t   vector_a1;
+  int32_t   t1, t2, t3, t4;
+  int32_t   vector_1, vector_2, vector_3, vector_4;
+  uint32_t  pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+  /* a1 = (out + 32) >> 6 */
+  __asm__ __volatile__ (
+      "addi     %[out],    %[out],    32      \n\t"
+      "sra      %[a1],     %[out],    6       \n\t"
+      : [out] "+r" (out), [a1] "=r" (a1)
+      :
+  );
+
+  if (a1 < 0) {
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    /* NOTE(review): replv.qb uses only the low byte - confirm |a1| <= 255 */
+    __asm__ __volatile__ (
+        "abs        %[absa1],     %[a1]         \n\t"
+        "replv.qb   %[vector_a1], %[absa1]      \n\t"
+        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 32; r--;) {
+      __asm__ __volatile__ (
+          "lw             %[t1],          0(%[dest])                      \n\t"
+          "lw             %[t2],          4(%[dest])                      \n\t"
+          "lw             %[t3],          8(%[dest])                      \n\t"
+          "lw             %[t4],          12(%[dest])                     \n\t"
+          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
+          "sw             %[vector_1],    0(%[dest])                      \n\t"
+          "sw             %[vector_2],    4(%[dest])                      \n\t"
+          "sw             %[vector_3],    8(%[dest])                      \n\t"
+          "sw             %[vector_4],    12(%[dest])                     \n\t"
+          /* bytes 16..31 of the same line */
+          "lw             %[t1],          16(%[dest])                     \n\t"
+          "lw             %[t2],          20(%[dest])                     \n\t"
+          "lw             %[t3],          24(%[dest])                     \n\t"
+          "lw             %[t4],          28(%[dest])                     \n\t"
+          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
+          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
+          "sw             %[vector_1],    16(%[dest])                     \n\t"
+          "sw             %[vector_2],    20(%[dest])                     \n\t"
+          "sw             %[vector_3],    24(%[dest])                     \n\t"
+          "sw             %[vector_4],    28(%[dest])                     \n\t"
+
+          "add            %[dest],        %[dest],        %[stride]       \n\t"
+          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+            [dest] "+&r" (dest)
+          : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
+          : "memory"  /* lw/sw read and write dest[] directly */
+      );
+    }
+  } else {
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    __asm__ __volatile__ (
+        "replv.qb       %[vector_a1],   %[a1]     \n\t"
+
+        : [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 32; r--;) {
+      __asm__ __volatile__ (
+          "lw             %[t1],          0(%[dest])                      \n\t"
+          "lw             %[t2],          4(%[dest])                      \n\t"
+          "lw             %[t3],          8(%[dest])                      \n\t"
+          "lw             %[t4],          12(%[dest])                     \n\t"
+          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
+          "sw             %[vector_1],    0(%[dest])                      \n\t"
+          "sw             %[vector_2],    4(%[dest])                      \n\t"
+          "sw             %[vector_3],    8(%[dest])                      \n\t"
+          "sw             %[vector_4],    12(%[dest])                     \n\t"
+          /* bytes 16..31 of the same line */
+          "lw             %[t1],          16(%[dest])                     \n\t"
+          "lw             %[t2],          20(%[dest])                     \n\t"
+          "lw             %[t3],          24(%[dest])                     \n\t"
+          "lw             %[t4],          28(%[dest])                     \n\t"
+          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
+          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
+          "sw             %[vector_1],    16(%[dest])                     \n\t"
+          "sw             %[vector_2],    20(%[dest])                     \n\t"
+          "sw             %[vector_3],    24(%[dest])                     \n\t"
+          "sw             %[vector_4],    28(%[dest])                     \n\t"
+
+          "add            %[dest],        %[dest],        %[stride]       \n\t"
+          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+            [dest] "+&r" (dest)
+          : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
+          : "memory"  /* lw/sw read and write dest[] directly */
+      );
+    }
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans4_dspr2.c
new file mode 100644
index 0000000000..ecb8bd3de7
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/itrans4_dspr2.c
@@ -0,0 +1,359 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
+  int16_t   step_0, step_1, step_2, step_3;
+  int       Temp0, Temp1, Temp2, Temp3;
+  const int const_2_power_13 = 8192;
+  int       i;
+  /* One 4-point transform per iteration; results are stored transposed. */
+  for (i = 4; i--; ) {
+    __asm__ __volatile__ (
+        /*
+          temp_1 = (input[0] + input[2]) * cospi_16_64;
+          step_0 = dct_const_round_shift(temp_1);
+
+          temp_2 = (input[0] - input[2]) * cospi_16_64;
+          step_1 = dct_const_round_shift(temp_2);
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             4(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "extp     %[step_0],            $ac0,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "extp     %[step_1],            $ac1,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        /*
+          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+          step_2 = dct_const_round_shift(temp1);
+        */
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "extp     %[step_2],            $ac0,           31              \n\t"
+
+        /*
+          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+          step_3 = dct_const_round_shift(temp2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[step_3],            $ac1,           31              \n\t"
+
+        /*
+          output[0]  = step_0 + step_3;
+          output[4]  = step_1 + step_2;
+          output[8]  = step_1 - step_2;
+          output[12] = step_0 - step_3;
+        */
+        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
+        "sh       %[Temp0],             0(%[output])                    \n\t"
+
+        "add      %[Temp1],             %[step_1],      %[step_2]       \n\t"
+        "sh       %[Temp1],             8(%[output])                    \n\t"
+
+        "sub      %[Temp2],             %[step_1],      %[step_2]       \n\t"
+        "sh       %[Temp2],             16(%[output])                   \n\t"
+
+        "sub      %[Temp3],             %[step_0],      %[step_3]       \n\t"
+        "sh       %[Temp3],             24(%[output])                   \n\t"
+      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+        [output] "+r" (output)
+      : [const_2_power_13] "r" (const_2_power_13),
+        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+        [cospi_24_64] "r" (cospi_24_64),
+        [input] "r" (input)
+      : "memory"  /* lh/sh access input[] and output[] directly */
+    );
+    /* NOTE(review): $ac0/$ac1 are written above with no clobber entry. */
+    input += 4;
+    output += 1;
+  }
+}
+
+void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                     int dest_stride) {
+  int16_t   step_0, step_1, step_2, step_3;
+  int       Temp0, Temp1, Temp2, Temp3;
+  const int const_2_power_13 = 8192;
+  int       i;
+  uint8_t   *dest_pix;
+  uint8_t   *cm = vpx_ff_cropTbl;
+
+  /* prefetch vpx_ff_cropTbl */
+  prefetch_load(vpx_ff_cropTbl);
+  prefetch_load(vpx_ff_cropTbl +  32);
+  prefetch_load(vpx_ff_cropTbl +  64);
+  prefetch_load(vpx_ff_cropTbl +  96);
+  prefetch_load(vpx_ff_cropTbl + 128);
+  prefetch_load(vpx_ff_cropTbl + 160);
+  prefetch_load(vpx_ff_cropTbl + 192);
+  prefetch_load(vpx_ff_cropTbl + 224);
+
+  for (i = 0; i < 4; ++i) {
+      dest_pix = (dest + i);
+
+    __asm__ __volatile__ (
+        /*
+          temp_1 = (input[0] + input[2]) * cospi_16_64;
+          step_0 = dct_const_round_shift(temp_1);
+
+          temp_2 = (input[0] - input[2]) * cospi_16_64;
+          step_1 = dct_const_round_shift(temp_2);
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             4(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "extp     %[step_0],            $ac0,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "extp     %[step_1],            $ac1,           31              \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        /*
+          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+          step_2 = dct_const_round_shift(temp1);
+        */
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "extp     %[step_2],            $ac0,           31              \n\t"
+
+        /*
+          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+          step_3 = dct_const_round_shift(temp2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[step_3],            $ac1,           31              \n\t"
+
+        /*
+          output[0]  = step_0 + step_3;
+          output[4]  = step_1 + step_2;
+          output[8]  = step_1 - step_2;
+          output[12] = step_0 - step_3;
+        */
+        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
+        "addi     %[Temp0],             %[Temp0],       8               \n\t"
+        "sra      %[Temp0],             %[Temp0],       4               \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step_1],      %[step_2]       \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "addi     %[Temp0],             %[Temp0],       8               \n\t"
+        "sra      %[Temp0],             %[Temp0],       4               \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step_1],      %[step_2]       \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "addi     %[Temp0],             %[Temp0],       8               \n\t"
+        "sra      %[Temp0],             %[Temp0],       4               \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step_0],      %[step_3]       \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "addi     %[Temp0],             %[Temp0],       8               \n\t"
+        "sra      %[Temp0],             %[Temp0],       4               \n\t"
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+
+      : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+        [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+        [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+        [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+        [dest_pix] "+r" (dest_pix)
+      : [const_2_power_13] "r" (const_2_power_13),
+        [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+        [cospi_24_64] "r" (cospi_24_64),
+        [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+    );
+
+    input += 4;
+  }
+}
+
+void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride) {
+  /* 4x4 inverse DCT with reconstruction: a row pass into an aligned scratch
+   * block, then a column pass that adds the result to the pixels at dest. */
+  DECLARE_ALIGNED(32, int16_t, scratch[4 * 4]);
+  const uint32_t extract_pos = 45;
+
+  /* Set the accumulator extract bit position (DSPControl pos field). */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (extract_pos)
+  );
+
+  /* Pass 1: rows of input -> scratch. */
+  vpx_idct4_rows_dspr2(input, scratch);
+  /* Pass 2: columns of scratch, added into dest. */
+  vpx_idct4_columns_add_blk_dspr2(scratch, dest, dest_stride);
+}
+
+void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride) {
+  /* DC-only 4x4 inverse transform: the rounded DC term a1 is added, with
+   * unsigned saturation, to all 16 pixels of the 4x4 block at dest.  Each row
+   * is processed as one 32-bit word, so dest must be four byte aligned. */
+  int       a1, absa1, r;
+  int32_t   out;
+  int       t2, vector_a1, vector_a;
+  uint32_t  pos = 45;
+  int16_t   input_dc = input[0];
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
+  __asm__ __volatile__ (
+      "addi     %[out],     %[out],    8       \n\t"
+      "sra      %[a1],      %[out],    4       \n\t"
+      : [out] "+r" (out), [a1] "=r" (a1)
+      :
+  );
+
+  /* replv.qb copies only the low byte, so |a1| > 255 would wrap.  Clamping is
+   * exact: adding/subtracting 255 with saturation already pins every pixel. */
+  if (a1 > 255) a1 = 255;
+  if (a1 < -255) a1 = -255;
+
+  if (a1 < 0) {
+    /* negative DC: subtract |a1| from every pixel, saturating at 0 */
+    __asm__ __volatile__ (
+        "abs        %[absa1],     %[a1]         \n\t"
+        "replv.qb   %[vector_a1], %[absa1]      \n\t"
+        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+    for (r = 4; r--;) {
+      __asm__ __volatile__ (
+          "lw             %[t2],          0(%[dest])                      \n\t"
+          "subu_s.qb      %[vector_a],    %[t2],          %[vector_a1]    \n\t"
+          "sw             %[vector_a],    0(%[dest])                      \n\t"
+          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
+          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+          : "memory"  /* dest pixels are loaded and stored by the asm */
+      );
+    }
+  } else {
+    /* non-negative DC: add a1 to every pixel, saturating at 255 */
+    __asm__ __volatile__ (
+        "replv.qb       %[vector_a1],   %[a1]     \n\t"
+        : [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+    for (r = 4; r--;) {
+      __asm__ __volatile__ (
+          "lw           %[t2],          0(%[dest])                        \n\t"
+          "addu_s.qb    %[vector_a],    %[t2],            %[vector_a1]    \n\t"
+          "sw           %[vector_a],    0(%[dest])                        \n\t"
+          "add          %[dest],        %[dest],          %[dest_stride]  \n\t"
+          : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+          : "memory"  /* dest pixels are loaded and stored by the asm */
+      );
+    }
+  }
+}
+
+void iadst4_dspr2(const int16_t *input, int16_t *output) {
+  /* 4-point inverse ADST in plain C: reads input[0..3] and writes
+   * output[0..3]; every intermediate value is held in an int. */
+  const int in0 = input[0];
+  const int in1 = input[1];
+  const int in2 = input[2];
+  const int in3 = input[3];
+  int prod0, prod1, prod2, prod3, prod4, prod5, prod6;
+  int mix, acc0, acc1, acc2;
+
+  /* All-zero input: the output is all zero, so skip the arithmetic. */
+  if ((in0 | in1 | in2 | in3) == 0) {
+    output[0] = 0;
+    output[1] = 0;
+    output[2] = 0;
+    output[3] = 0;
+    return;
+  }
+
+  /* Products of the inputs with the sinpi_k_9 constants. */
+  prod0 = sinpi_1_9 * in0;
+  prod1 = sinpi_2_9 * in0;
+  prod2 = sinpi_3_9 * in1;
+  prod3 = sinpi_4_9 * in2;
+  prod4 = sinpi_1_9 * in2;
+  prod5 = sinpi_2_9 * in3;
+  prod6 = sinpi_4_9 * in3;
+  mix = in0 - in2 + in3;
+
+  /* Combine the products; prod2 is shared by outputs 0, 1 and 3. */
+  acc0 = prod0 + prod3 + prod5;
+  acc1 = prod1 - prod4 - prod6;
+  acc2 = sinpi_3_9 * mix;
+
+  /* The 1-D transform scales by sqrt(2).  Dynamic range before the shift is
+   * 14b (input) + 14b (multiplication scaling) + 1b (addition) = 29b, so the
+   * rounded outputs are 15b wide. */
+  output[0] = dct_const_round_shift(acc0 + prod2);
+  output[1] = dct_const_round_shift(acc1 + prod2);
+  output[2] = dct_const_round_shift(acc2);
+  output[3] = dct_const_round_shift(acc0 + acc1 - prod2);
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vpx_dsp/mips/itrans8_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans8_dspr2.c
new file mode 100644
index 0000000000..823e845d59
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/itrans8_dspr2.c
@@ -0,0 +1,668 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/inv_txfm_dspr2.h"
+#include "vpx_dsp/txfm_common.h"
+
+#if HAVE_DSPR2
+void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
+  /* Row pass of the 8x8 inverse DCT: transforms no_rows rows of 8 coefficients.
+   * Row r is stored transposed: its k-th result is written to output[8 * k + r].
+   * Uses extp, so the DSPControl pos field must already be set by the caller. */
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  const int const_2_power_13 = 8192;
+  int Temp0, Temp1, Temp2, Temp3, Temp4, i;
+
+  for (i = no_rows; i--; ) {
+    __asm__ __volatile__ (
+        /* Every accumulator is preloaded with 8192 (1 << 13) before its madd.
+          temp_1 = (input[0] + input[4]) * cospi_16_64;
+          step2_0 = dct_const_round_shift(temp_1);      -> Temp4
+          temp_2 = (input[0] - input[4]) * cospi_16_64;
+          step2_1 = dct_const_round_shift(temp_2);      -> Temp2
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             8(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        "extp     %[Temp4],             $ac0,           31              \n\t"
+
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "extp     %[Temp2],             $ac1,           31              \n\t"
+
+        /*
+          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+          step2_2 = dct_const_round_shift(temp_1);      -> Temp3
+        */
+        "lh       %[Temp0],             4(%[input])                     \n\t"
+        "lh       %[Temp1],             12(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "extp     %[Temp3],             $ac0,           31              \n\t"
+
+        /*
+          step1_1 = step2_1 + step2_2;
+          step1_2 = step2_1 - step2_2;
+        */
+        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
+        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
+
+        /*
+          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+          step2_3 = dct_const_round_shift(temp_2);      -> Temp1
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[Temp1],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+
+        /*
+          step1_0 = step2_0 + step2_3;
+          step1_3 = step2_0 - step2_3;
+        */
+        "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"
+        "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"
+
+        /*
+          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+          step1_4 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp1],             14(%[input])                    \n\t"
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
+        "extp     %[step1_4],           $ac0,           31              \n\t"
+
+        /*
+          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+          step1_7 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
+        "extp     %[step1_7],           $ac1,           31              \n\t"
+
+        /*
+          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+          step1_5 = dct_const_round_shift(temp_1);
+        */
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+
+        /*
+          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+          step1_6 = dct_const_round_shift(temp_2);
+        */
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+
+        /*
+          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+        */
+        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
+        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
+        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
+        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
+
+        /*
+          step1_4 = step1_4 + step1_5;
+          step1_7 = step1_6 + step1_7;
+        */
+        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
+        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
+
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+        /* store the 8 results of this row, 8 int16 (16 bytes) apart */
+        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
+        "sh       %[Temp0],             0(%[output])                    \n\t"
+        "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"
+        "sh       %[Temp1],             16(%[output])                   \n\t"
+        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
+        "sh       %[Temp0],             32(%[output])                   \n\t"
+        "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"
+        "sh       %[Temp1],             48(%[output])                   \n\t"
+
+        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
+        "sh       %[Temp0],             64(%[output])                   \n\t"
+        "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"
+        "sh       %[Temp1],             80(%[output])                   \n\t"
+        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
+        "sh       %[Temp0],             96(%[output])                   \n\t"
+        "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"
+        "sh       %[Temp1],             112(%[output])                  \n\t"
+        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+          [Temp4] "=&r" (Temp4)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_24_64] "r" (cospi_24_64),
+          [output] "r" (output), [input] "r" (input)
+        : "memory"  /* input[] is loaded and output[] stored by the asm */
+    );
+    input += 8;
+    output += 1;
+  }
+}
+
+void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+                                 int dest_stride) {
+  /* Column pass of the 8x8 inverse DCT plus reconstruction.  Iteration i reads
+   * input[8 * i .. 8 * i + 7] and updates the pixels dest[i + k * dest_stride],
+   * k = 0..7, as pixel = cm[pixel + ((result + 16) >> 5)] (cm = vpx_ff_cropTbl). */
+  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+  int Temp0, Temp1, Temp2, Temp3, i;
+  const int const_2_power_13 = 8192;
+  uint8_t *dest_pix;
+  uint8_t *cm = vpx_ff_cropTbl;
+
+  /* prefetch vpx_ff_cropTbl */
+  prefetch_load(vpx_ff_cropTbl);
+  prefetch_load(vpx_ff_cropTbl +  32);
+  prefetch_load(vpx_ff_cropTbl +  64);
+  prefetch_load(vpx_ff_cropTbl +  96);
+  prefetch_load(vpx_ff_cropTbl + 128);
+  prefetch_load(vpx_ff_cropTbl + 160);
+  prefetch_load(vpx_ff_cropTbl + 192);
+  prefetch_load(vpx_ff_cropTbl + 224);
+
+  for (i = 0; i < 8; ++i) {
+    dest_pix = (dest + i);
+
+    __asm__ __volatile__ (
+        /* Every accumulator is preloaded with 8192 (1 << 13) before its madd.
+          temp_1 = (input[0] + input[4]) * cospi_16_64;
+          step2_0 = dct_const_round_shift(temp_1);      -> step1_6 (temporary)
+          temp_2 = (input[0] - input[4]) * cospi_16_64;
+          step2_1 = dct_const_round_shift(temp_2);      -> Temp2
+        */
+        "lh       %[Temp0],             0(%[input])                     \n\t"
+        "lh       %[Temp1],             8(%[input])                     \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
+        "extp     %[step1_6],           $ac0,           31              \n\t"
+
+        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
+        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "extp     %[Temp2],             $ac1,           31              \n\t"
+
+        /*
+          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+          step2_2 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             4(%[input])                     \n\t"
+        "lh       %[Temp1],             12(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "extp     %[Temp3],             $ac0,           31              \n\t"
+
+        /*
+          step1_1 = step2_1 + step2_2;
+          step1_2 = step2_1 - step2_2;
+        */
+        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
+        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
+
+        /*
+          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+          step2_3 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
+        "extp     %[Temp1],             $ac1,           31              \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+
+        /*
+          step1_0 = step2_0 + step2_3;
+          step1_3 = step2_0 - step2_3;
+        */
+        "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"
+        "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"
+
+        /*
+          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+          step1_4 = dct_const_round_shift(temp_1);
+        */
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp1],             14(%[input])                    \n\t"
+        "lh       %[Temp0],             2(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
+        "extp     %[step1_4],           $ac0,           31              \n\t"
+
+        /*
+          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+          step1_7 = dct_const_round_shift(temp_2);
+        */
+        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
+        "extp     %[step1_7],           $ac1,           31              \n\t"
+
+        /*
+          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+          step1_5 = dct_const_round_shift(temp_1);
+        */
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+
+        /*
+          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+          step1_6 = dct_const_round_shift(temp_2);
+        */
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+        "lh       %[Temp0],             10(%[input])                    \n\t"
+        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
+        "lh       %[Temp1],             6(%[input])                     \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+
+        /*
+          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+        */
+        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
+        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
+        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
+        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
+        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
+
+        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
+        "mthi     $zero,                $ac0                            \n\t"
+        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
+        "mthi     $zero,                $ac1                            \n\t"
+
+        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
+        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
+
+        /*
+          step1_4 = step1_4 + step1_5;
+          step1_7 = step1_6 + step1_7;
+        */
+        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
+        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
+
+        "extp     %[step1_5],           $ac0,           31              \n\t"
+        "extp     %[step1_6],           $ac1,           31              \n\t"
+
+        /* add block: pixel = cm[pixel + ((sum + 16) >> 5)], one row per group */
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
+
+        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
+        "addi     %[Temp0],             %[Temp0],       16              \n\t"
+        "sra      %[Temp0],             %[Temp0],       5               \n\t"
+        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
+        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
+        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
+        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+          [dest_pix] "+r" (dest_pix)
+        : [const_2_power_13] "r" (const_2_power_13),
+          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+          [cospi_24_64] "r" (cospi_24_64),
+          [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+        : "memory"  /* input[] and cm[] are loaded, dest pixels rewritten */
+    );
+    input += 8;
+  }
+}
+
+void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride) {
+  /* 8x8 inverse DCT with reconstruction: all eight rows go through the row
+   * pass into an aligned scratch block, then the column pass adds to dest. */
+  DECLARE_ALIGNED(32, int16_t, scratch[8 * 8]);
+  const uint32_t extract_pos = 45;
+
+  /* Set the accumulator extract bit position (DSPControl pos field). */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (extract_pos)
+  );
+
+  /* Pass 1: all 8 rows of input -> scratch. */
+  idct8_rows_dspr2(input, scratch, 8);
+  /* Pass 2: columns of scratch, added into dest. */
+  idct8_columns_add_blk_dspr2(scratch, dest, dest_stride);
+}
+
+void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
+                              int dest_stride) {
+  /* 8x8 inverse DCT that runs the row pass on the first four input rows only
+   * and treats input rows 4..7 as zero, then adds the result to dest. */
+  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+  int16_t *outptr = out;
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp    %[pos],    1    \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // First transform rows; row r fills out[r], out[r + 8], ..., out[r + 56]
+  idct8_rows_dspr2(input, outptr, 4);
+  // Zero out[8 * k + 4 .. 8 * k + 7] for k = 0..7 (slots of skipped rows 4..7)
+  outptr += 4;
+  __asm__ __volatile__ (
+      "sw  $zero,   0(%[outptr])  \n\t"
+      "sw  $zero,   4(%[outptr])  \n\t"
+      "sw  $zero,  16(%[outptr])  \n\t"
+      "sw  $zero,  20(%[outptr])  \n\t"
+      "sw  $zero,  32(%[outptr])  \n\t"
+      "sw  $zero,  36(%[outptr])  \n\t"
+      "sw  $zero,  48(%[outptr])  \n\t"
+      "sw  $zero,  52(%[outptr])  \n\t"
+      "sw  $zero,  64(%[outptr])  \n\t"
+      "sw  $zero,  68(%[outptr])  \n\t"
+      "sw  $zero,  80(%[outptr])  \n\t"
+      "sw  $zero,  84(%[outptr])  \n\t"
+      "sw  $zero,  96(%[outptr])  \n\t"
+      "sw  $zero, 100(%[outptr])  \n\t"
+      "sw  $zero, 112(%[outptr])  \n\t"
+      "sw  $zero, 116(%[outptr])  \n\t"
+      :
+      : [outptr] "r" (outptr)
+      : "memory"  /* the stores write out[], which is not an asm operand */
+  );
+
+  // Then transform columns and add to dest
+  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+/* DC-only 8x8 IDCT + add: an offset derived from input[0] is added to dest. */
+void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
+                             int dest_stride) {
+  uint32_t pos = 45;
+  int32_t out;
+  int32_t r;
+  int32_t a1, absa1;
+  int32_t t1, t2, vector_a1, vector_1, vector_2;
+
+  /* wrdsp mask 1 sets DSPControl.pos: bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+
+    :
+    : [pos] "r" (pos)
+  );
+  /* a1 = (out + 16) >> 5 (arithmetic shift), computed in the asm below */
+  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+  __asm__ __volatile__ (
+      "addi     %[out],     %[out],     16      \n\t"
+      "sra      %[a1],      %[out],     5       \n\t"
+
+      : [out] "+r" (out), [a1] "=r" (a1)
+      :
+  );
+
+  if (a1 < 0) {
+    /* negative offset: saturating quad-byte subtract of |a1|, 8 bytes x 8 rows;
+     * input and output memory are four byte aligned */
+    __asm__ __volatile__ (
+        "abs        %[absa1],       %[a1]       \n\t"
+        "replv.qb   %[vector_a1],   %[absa1]    \n\t"
+
+        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+    /* NOTE(review): replv.qb copies only the low byte; |a1| > 255? - confirm */
+    for (r = 8; r--;) {
+      __asm__ __volatile__ (
+          "lw           %[t1],          0(%[dest])                      \n\t"
+          "lw           %[t2],          4(%[dest])                      \n\t"
+          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "sw           %[vector_1],    0(%[dest])                      \n\t"
+          "sw           %[vector_2],    4(%[dest])                      \n\t"
+          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
+
+          : [t1] "=&r" (t1), [t2] "=&r" (t2),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [dest] "+&r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  } else {
+    /* non-negative offset: saturating quad-byte add of a1, 8 bytes x 8 rows;
+     * input and output memory are four byte aligned */
+    __asm__ __volatile__ (
+        "replv.qb   %[vector_a1],   %[a1]   \n\t"
+
+        : [vector_a1] "=r" (vector_a1)
+        : [a1] "r" (a1)
+    );
+
+    for (r = 8; r--;) {
+      __asm__ __volatile__ (
+          "lw           %[t1],          0(%[dest])                      \n\t"
+          "lw           %[t2],          4(%[dest])                      \n\t"
+          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
+          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
+          "sw           %[vector_1],    0(%[dest])                      \n\t"
+          "sw           %[vector_2],    4(%[dest])                      \n\t"
+          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
+
+          : [t1] "=&r" (t1), [t2] "=&r" (t2),
+            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+            [dest] "+r" (dest)
+          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+      );
+    }
+  }
+}
+/* 8-point inverse ADST in plain C: input[0..7] -> output[0..7]. */
+void iadst8_dspr2(const int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+  int x0, x1, x2, x3, x4, x5, x6, x7;
+
+  x0 = input[7];
+  x1 = input[0];
+  x2 = input[5];
+  x3 = input[2];
+  x4 = input[3];
+  x5 = input[4];
+  x6 = input[1];
+  x7 = input[6];
+  /* all-zero input: output is all zero, skip the arithmetic */
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
+  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
+  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
+  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
+  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
+  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
+  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
+  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
+  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
+  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
+  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
+  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
+  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
+  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
+  /* permute into output order; odd-indexed outputs are negated */
+  output[0] =  x0;
+  output[1] = -x4;
+  output[2] =  x6;
+  output[3] = -x2;
+  output[4] =  x3;
+  output[5] = -x7;
+  output[6] =  x5;
+  output[7] = -x1;
+}
+#endif  // HAVE_DSPR2
diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_16_msa.c b/libs/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
new file mode 100644
index 0000000000..b7c9f7bd0e
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
@@ -0,0 +1,1480 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
+/* 16-byte-wide filter4/filter8 stage; returns 1 if flat is all zero, else 0. */
+int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
+                                 uint8_t *filter48,
+                                 const uint8_t *b_limit_ptr,
+                                 const uint8_t *limit_ptr,
+                                 const uint8_t *thresh_ptr) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  v16u8 zero = { 0 };
+
+  /* load 8 rows p3..q3, starting at src - 4 * pitch */
+  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+  /* flat all zero: only the filter4 result applies, store it and finish */
+  if (__msa_test_bz_v(flat)) {
+    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+
+    return 1;
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+               q2_r, q3_r);
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* select filter8 result where flat bits are set, else the first operand */
+    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+    /* stash p2..q2 (6 x 16 bytes) then flat at filter48 + 96; src untouched */
+    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+    filter48 += (4 * 16);
+    ST_UB2(q1_out, q2_out, filter48, 16);
+    filter48 += (2 * 16);
+    ST_UB(flat, filter48);
+
+    return 0;
+  }
+}
+/* Wide (flat2) stage: consumes filter48 from the t4/t8 stage and writes src. */
+void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
+  v16u8 flat, flat2, filter8;
+  v16i8 zero = { 0 };
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+  v8i16 l_out, r_out;
+  /* flat was stored at filter48 + 96 by vpx_hz_lpf_t4_and_t8_16w */
+  flat = LD_UB(filter48 + 96);
+
+  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+  /* flat2 all zero: copy the stashed p2..q2 from filter48 to src and finish */
+  if (__msa_test_bz_v(flat2)) {
+    LD_UB4(filter48, 16, p2, p1, p0, q0);
+    LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+    src -= 3 * pitch;
+    ST_UB4(p2, p1, p0, q0, src, pitch);
+    src += (4 * pitch);
+    ST_UB2(q1, q2, src, pitch);
+  } else {
+    src -= 7 * pitch;  /* first output row is p6 */
+
+    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+               zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+               p2_r_in, p1_r_in, p0_r_in);
+
+    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+    /* p6: tmp1 = 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0; rounded >>4 */
+    tmp0_r = p7_r_in << 3;
+    tmp0_r -= p7_r_in;
+    tmp0_r += p6_r_in;
+    tmp0_r += q0_r_in;
+    tmp1_r = p6_r_in + p5_r_in;
+    tmp1_r += p4_r_in;
+    tmp1_r += p3_r_in;
+    tmp1_r += p2_r_in;
+    tmp1_r += p1_r_in;
+    tmp1_r += p0_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+               p5_l_in, p4_l_in);
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+               p1_l_in, p0_l_in);
+    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
+
+    tmp0_l = p7_l_in << 3;
+    tmp0_l -= p7_l_in;
+    tmp0_l += p6_l_in;
+    tmp0_l += q0_l_in;
+    tmp1_l = p6_l_in + p5_l_in;
+    tmp1_l += p4_l_in;
+    tmp1_l += p3_l_in;
+    tmp1_l += p2_l_in;
+    tmp1_l += p1_l_in;
+    tmp1_l += p0_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
+    ST_UB(p6, src);
+    src += pitch;
+
+    /* p5: running sum tmp1 += p5 + q1 - p6 - p7 */
+    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+    tmp0_r = p5_r_in - p6_r_in;
+    tmp0_r += q1_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
+    tmp0_l = p5_l_in - p6_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+    ST_UB(p5, src);
+    src += pitch;
+
+    /* p4 */
+    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+    tmp0_r = p4_r_in - p5_r_in;
+    tmp0_r += q2_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);
+
+    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
+    tmp0_l = p4_l_in - p5_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+    ST_UB(p4, src);
+    src += pitch;
+
+    /* p3 */
+    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+    tmp0_r = p3_r_in - p4_r_in;
+    tmp0_r += q3_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
+    tmp0_l = p3_l_in - p4_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+    ST_UB(p3, src);
+    src += pitch;
+
+    /* p2 (p2..q2 fall back to the filter48 values where flat2 is clear) */
+    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+    filter8 = LD_UB(filter48);
+    tmp0_r = p2_r_in - p3_r_in;
+    tmp0_r += q4_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
+    tmp0_l = p2_l_in - p3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += pitch;
+
+    /* p1 */
+    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+    filter8 = LD_UB(filter48 + 16);
+    tmp0_r = p1_r_in - p2_r_in;
+    tmp0_r += q5_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
+    tmp0_l = p1_l_in - p2_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += pitch;
+
+    /* p0 */
+    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+    filter8 = LD_UB(filter48 + 32);
+    tmp0_r = p0_r_in - p1_r_in;
+    tmp0_r += q6_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
+    tmp0_l = p0_l_in - p1_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += pitch;
+
+    /* q0 */
+    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+    filter8 = LD_UB(filter48 + 48);
+    tmp0_r = q7_r_in - p0_r_in;
+    tmp0_r += q0_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
+    tmp0_l = q7_l_in - p0_l_in;
+    tmp0_l += q0_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += pitch;
+
+    /* q1 */
+    filter8 = LD_UB(filter48 + 64);
+    tmp0_r = q7_r_in - q0_r_in;
+    tmp0_r += q1_r_in;
+    tmp0_r -= p6_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    tmp0_l = q7_l_in - q0_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p6_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += pitch;
+
+    /* q2 */
+    filter8 = LD_UB(filter48 + 80);
+    tmp0_r = q7_r_in - q1_r_in;
+    tmp0_r += q2_r_in;
+    tmp0_r -= p5_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    tmp0_l = q7_l_in - q1_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p5_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += pitch;
+
+    /* q3 */
+    tmp0_r = q7_r_in - q2_r_in;
+    tmp0_r += q3_r_in;
+    tmp0_r -= p4_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    tmp0_l = q7_l_in - q2_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p4_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+    ST_UB(q3, src);
+    src += pitch;
+
+    /* q4 */
+    tmp0_r = q7_r_in - q3_r_in;
+    tmp0_r += q4_r_in;
+    tmp0_r -= p3_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    tmp0_l = q7_l_in - q3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p3_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+    ST_UB(q4, src);
+    src += pitch;
+
+    /* q5 */
+    tmp0_r = q7_r_in - q4_r_in;
+    tmp0_r += q5_r_in;
+    tmp0_r -= p2_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    tmp0_l = q7_l_in - q4_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p2_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+    ST_UB(q5, src);
+    src += pitch;
+
+    /* q6 */
+    tmp0_r = q7_r_in - q5_r_in;
+    tmp0_r += q6_r_in;
+    tmp0_r -= p1_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+    tmp0_l = q7_l_in - q5_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p1_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+    ST_UB(q6, src);
+  }
+}
+/* 16-wide horizontal-edge filter: t4/t8 stage, then t16 if that returned 0. */
+void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr,
+                                    int32_t count) {
+  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
+  uint8_t early_exit = 0;
+
+  (void)count;  /* count is unused in this variant */
+
+  early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
+                                        limit_ptr, thresh_ptr);
+  /* 0 means the first stage left its results in filter48 */
+  if (0 == early_exit) {
+    vpx_hz_lpf_t16_16w(src, pitch, filter48);
+  }
+}
+/* count == 1: 8-byte-wide filter done here; otherwise use the dual variant. */
+void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
+                               const uint8_t *b_limit_ptr,
+                               const uint8_t *limit_ptr,
+                               const uint8_t *thresh_ptr,
+                               int32_t count) {
+  if (1 == count) {
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    uint64_t dword0, dword1;
+    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    v16u8 p0_filter16, p1_filter16;
+    v8i16 p2_filter8, p1_filter8, p0_filter8;
+    v8i16 q0_filter8, q1_filter8, q2_filter8;
+    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
+    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+    v16i8 zero = { 0 };
+    v8u16 tmp0, tmp1, tmp2;
+
+    /* load 8 rows p3..q3, starting at src - 4 * pitch */
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+    limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+    /* keep only the low 8 bytes of flat (upper half zeroed) */
+    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+    if (__msa_test_bz_v(flat)) {
+      p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+      p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+      q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+      q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+      SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
+    } else {
+      /* convert 8 bit input data into 16 bit */
+      ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+                 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+                 q3_r);
+      VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
+                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+      /* convert 16 bit output data into 8 bit */
+      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
+                  zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
+                  q0_filter8);
+      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
+
+      /* where flat is set take the filter8 result, else the first operand */
+      p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
+      p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
+      p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
+      q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
+      q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
+      q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
+
+      /* load the outer rows p7..p4 and q4..q7 for the flat2 test */
+      LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
+      LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
+
+      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+      if (__msa_test_bz_v(flat2)) {
+        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
+        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
+
+        SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
+        SD(q1_d, src + pitch);
+        SD(q2_d, src + 2 * pitch);
+      } else {
+        /* LSB(right) 8 pixel operation */
+        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
+                   zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
+                   q7_r);
+
+        tmp0 = p7_r << 3;
+        tmp0 -= p7_r;
+        tmp0 += p6_r;
+        tmp0 += q0_r;
+
+        src -= 7 * pitch;
+
+        /* calculation of p6 and p5 (tmp1 is a running sum, updated per row) */
+        tmp1 = p6_r + p5_r + p4_r + p3_r;
+        tmp1 += (p2_r + p1_r + p0_r);
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp0 = p5_r - p6_r + q1_r - p7_r;
+        tmp1 += tmp0;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+        src += pitch;
+
+        /* calculation of p4 and p3 */
+        tmp0 = p4_r - p5_r + q2_r - p7_r;
+        tmp2 = p3_r - p4_r + q3_r - p7_r;
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp1 += tmp2;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+        src += pitch;
+
+        /* calculation of p2 and p1 */
+        tmp0 = p2_r - p3_r + q4_r - p7_r;
+        tmp2 = p1_r - p2_r + q5_r - p7_r;
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp1 += tmp2;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+        src += pitch;
+
+        /* calculation of p0 and q0 */
+        tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
+        tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp1 += tmp2;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+        src += pitch;
+
+        /* calculation of q1 and q2 */
+        tmp0 = q7_r - q0_r + q1_r - p6_r;
+        tmp2 = q7_r - q1_r + q2_r - p5_r;
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp1 += tmp2;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+        src += pitch;
+
+        /* calculation of q3 and q4 */
+        tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
+        tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp1 += tmp2;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+        src += pitch;
+
+        /* calculation of q5 and q6 */
+        tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
+        tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
+        tmp1 += tmp0;
+        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        tmp1 += tmp2;
+        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+                    p1_filter16);
+        p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
+        p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
+        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+        SD(dword0, src);
+        src += pitch;
+        SD(dword1, src);
+      }
+    }
+  } else {
+    vpx_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
+                                   thresh_ptr, count);
+  }
+}
+/* Loads 8 rows of 16 bytes, transposes, and stores 16 rows at out_pitch. */
+static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
+  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
+  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+  LD_UB8(input, in_pitch,
+         p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
+  /* 8x8 transpose -> p7..p0 (presumably of the low 8 bytes of each row) */
+  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
+                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
+  /* 8x8 transpose of the upper 8 bytes of each row -> q0..q7 */
+  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
+             tmp0, tmp1, tmp2, tmp3);
+  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
+  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
+  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
+  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
+  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
+
+  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+  output += (8 * out_pitch);
+  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+/* Loads 16 rows, transposes, and stores 8 rows of 16 bytes at out_pitch. */
+static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
+  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
+                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
+  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
+}
+/* Loads 16 rows of 16 bytes, transposes, and stores 16 rows at out_pitch. */
+static void transpose_16x16(uint8_t *input, int32_t in_pitch,
+                            uint8_t *output, int32_t out_pitch) {
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+  v4i32 tmp2, tmp3;
+
+  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+  input += (8 * in_pitch);
+  LD_UB8(input, in_pitch,
+         row8, row9, row10, row11, row12, row13, row14, row15);
+
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p7, p6, p5, p4, p3, p2, p1, p0);
+
+  /* transpose 16x8 matrix into 8x16 (upper halves of the rows -> q0..q7) */
+  /* total 8 intermediate register and 32 instructions */
+  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
+  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
+  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
+  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
+  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
+  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
+  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
+  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);
+
+  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
+  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
+  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);
+
+  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
+  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
+  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);
+
+  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
+  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+  q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+  tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
+  tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
+  q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+  q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
+  q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+  q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+  tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
+  tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
+  q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+  q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+  output += (8 * out_pitch);
+  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
+int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
+                                uint8_t *src_org, int32_t pitch_org,
+                                const uint8_t *b_limit_ptr,
+                                const uint8_t *limit_ptr,
+                                const uint8_t *thresh_ptr) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v16i8 zero = { 0 };
+  v8i16 vec0, vec1, vec2, vec3;
+  /* Filter4/filter8 stage on a 16-byte-stride buffer at src; returns 1 if output went to src_org, 0 if to filter48. */
+  /* load vector elements: 8 rows of 16 bytes, p3..q3, starting 4 rows before src */
+  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr); /* replicate each scalar parameter into every byte lane */
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  /* mask and hev (LPF_MASK_HEV is defined elsewhere; it appears to output hev, mask and flat) */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  /* flat4 (updates flat) */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 (results in p1_out, p0_out, q0_out, q1_out) */
+  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); /* keep the low 8 lanes of flat, zero the high 8 */
+
+  if (__msa_test_bz_v(flat)) { /* flat is all zero: only the filter4 result is needed */
+    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); /* byte-interleave p1/p0 and q0/q1 */
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3); /* halfword-interleave into 4-byte groups, one per row (inferred) */
+    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); /* written straight to src_org - 2 (presumably 4 bytes x 8 rows) */
+    return 1; /* early exit: picture memory already updated */
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+               q3_r); /* interleave with zero: widen the low 8 bytes of each row to 16 bits */
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+    /* convert 16 bit output data into 8 bit (pack the even bytes of each halfword) */
+    p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
+    p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
+    p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
+    q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
+    q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
+    q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);
+
+    /* store pixel values: take the filter8 result where flat is set, else the value from above */
+    p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat); /* p2 and q2 fall back to the unfiltered input */
+    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat); /* p1..q1 fall back to the filter4 output */
+    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
+    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
+    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
+    q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);
+
+    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); /* filter48 rows 0-3: p2, p1, p0, q0 */
+    filter48 += (4 * 16);
+    ST_UB2(q1_out, q2_out, filter48, 16); /* filter48 rows 4-5: q1, q2 */
+    filter48 += (2 * 16);
+    ST_UB(flat, filter48); /* filter48 row 6: the flat mask */
+
+    return 0; /* results are in filter48 only; src_org is untouched */
+  }
+}
+
+int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+                          uint8_t *filter48) {
+  v16i8 zero = { 0 };
+  v16u8 filter8, flat, flat2;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+  v8u16 tmp0_r, tmp1_r;
+  v8i16 r_out;
+  /* Wide-filter stage (low 8 lanes): returns 1 if filter48 data went to src_org, 0 if rows p6..q6 at src were rewritten. */
+  flat = LD_UB(filter48 + 6 * 16); /* row 6 of filter48 holds the flat mask */
+
+  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); /* 8 rows before src, 16-byte stride */
+  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); /* 8 rows from src onwards */
+
+  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); /* defined elsewhere; appears to output flat2 */
+
+  if (__msa_test_bz_v(flat2)) { /* flat2 is all zero: the wide filter is not applied */
+    v8i16 vec0, vec1, vec2, vec3, vec4;
+
+    LD_UB4(filter48, 16, p2, p1, p0, q0); /* reload the six rows stored in filter48 */
+    LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); /* byte-interleave p2/p1 and p0/q0 */
+    ILVRL_H2_SH(vec1, vec0, vec3, vec4); /* halfword-interleave into 4-byte groups (inferred) */
+    vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); /* q1/q2 byte pairs */
+
+    src_org -= 3; /* output begins 3 bytes before src_org on each row */
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); /* rows 0-3: first 4 bytes (presumably p2 p1 p0 q0) */
+    ST2x4_UB(vec2, 0, (src_org + 4), pitch); /* rows 0-3: next 2 bytes (presumably q1 q2) */
+    src_org += (4 * pitch);
+    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); /* rows 4-7 */
+    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+
+    return 1; /* early exit: src_org already updated */
+  } else {
+    src -= 7 * 16; /* move to the p6 row, the first row rewritten below */
+
+    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+               zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
+               p3_r_in, p2_r_in, p1_r_in, p0_r_in); /* interleave with zero: widen low 8 bytes to 16 bits */
+    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+    tmp0_r = p7_r_in << 3; /* p6 output: build the sum 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 */
+    tmp0_r -= p7_r_in;
+    tmp0_r += p6_r_in;
+    tmp0_r += q0_r_in;
+    tmp1_r = p6_r_in + p5_r_in;
+    tmp1_r += p4_r_in;
+    tmp1_r += p3_r_in;
+    tmp1_r += p2_r_in;
+    tmp1_r += p1_r_in;
+    tmp1_r += p0_r_in;
+    tmp1_r += tmp0_r; /* tmp1_r is the running sum updated for every later row */
+
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4); /* rounding shift right by 4 */
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); /* pack back to bytes */
+    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); /* use the filtered value only where flat2 is set */
+    ST8x1_UB(p6, src); /* presumably stores the low 8 bytes of the row */
+    src += 16;
+
+    /* p5: running sum += p5 + q1 - p6 - p7 */
+    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+    tmp0_r = p5_r_in - p6_r_in;
+    tmp0_r += q1_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+    ST8x1_UB(p5, src);
+    src += 16;
+
+    /* p4: running sum += p4 + q2 - p5 - p7 */
+    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+    tmp0_r = p4_r_in - p5_r_in;
+    tmp0_r += q2_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+    ST8x1_UB(p4, src);
+    src += 16;
+
+    /* p3: running sum += p3 + q3 - p4 - p7 */
+    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+    tmp0_r = p3_r_in - p4_r_in;
+    tmp0_r += q3_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+    ST8x1_UB(p3, src);
+    src += 16;
+
+    /* p2: running sum += p2 + q4 - p3 - p7 */
+    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+    filter8 = LD_UB(filter48); /* fallback where flat2 is clear: row 0 of filter48 */
+    tmp0_r = p2_r_in - p3_r_in;
+    tmp0_r += q4_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST8x1_UB(filter8, src);
+    src += 16;
+
+    /* p1: running sum += p1 + q5 - p2 - p7 */
+    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+    filter8 = LD_UB(filter48 + 16); /* fallback: row 1 of filter48 */
+    tmp0_r = p1_r_in - p2_r_in;
+    tmp0_r += q5_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST8x1_UB(filter8, src);
+    src += 16;
+
+    /* p0: running sum += p0 + q6 - p1 - p7 */
+    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+    filter8 = LD_UB(filter48 + 32); /* fallback: row 2 of filter48 */
+    tmp0_r = p0_r_in - p1_r_in;
+    tmp0_r += q6_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST8x1_UB(filter8, src);
+    src += 16;
+
+    /* q0: running sum += q7 + q0 - p0 - p7 */
+    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+    filter8 = LD_UB(filter48 + 48); /* fallback: row 3 of filter48 */
+    tmp0_r = q7_r_in - p0_r_in;
+    tmp0_r += q0_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST8x1_UB(filter8, src);
+    src += 16;
+
+    /* q1: running sum += q7 + q1 - q0 - p6 */
+    filter8 = LD_UB(filter48 + 64); /* fallback: row 4 of filter48 */
+    tmp0_r = q7_r_in - q0_r_in;
+    tmp0_r += q1_r_in;
+    tmp0_r -= p6_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST8x1_UB(filter8, src);
+    src += 16;
+
+    /* q2: running sum += q7 + q2 - q1 - p5 */
+    filter8 = LD_UB(filter48 + 80); /* fallback: row 5 of filter48 */
+    tmp0_r = q7_r_in - q1_r_in;
+    tmp0_r += q2_r_in;
+    tmp0_r -= p5_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST8x1_UB(filter8, src);
+    src += 16;
+
+    /* q3: running sum += q7 + q3 - q2 - p4 */
+    tmp0_r = q7_r_in - q2_r_in;
+    tmp0_r += q3_r_in;
+    tmp0_r -= p4_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+    ST8x1_UB(q3, src);
+    src += 16;
+
+    /* q4: running sum += q7 + q4 - q3 - p3 */
+    tmp0_r = q7_r_in - q3_r_in;
+    tmp0_r += q4_r_in;
+    tmp0_r -= p3_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+    ST8x1_UB(q4, src);
+    src += 16;
+
+    /* q5: running sum += q7 + q5 - q4 - p2 */
+    tmp0_r = q7_r_in - q4_r_in;
+    tmp0_r += q5_r_in;
+    tmp0_r -= p2_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+    ST8x1_UB(q5, src);
+    src += 16;
+
+    /* q6: running sum += q7 + q6 - q5 - p1 */
+    tmp0_r = q7_r_in - q5_r_in;
+    tmp0_r += q6_r_in;
+    tmp0_r -= p1_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+    ST8x1_UB(q6, src);
+
+    return 0; /* rows p6..q6 of the 16-byte-stride buffer were rewritten; src_org is untouched */
+  }
+}
+
+void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
+                             const uint8_t *b_limit_ptr,
+                             const uint8_t *limit_ptr,
+                             const uint8_t *thresh_ptr) {
+  uint8_t early_exit = 0;
+  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); /* 16 rows of pixels + 8 rows of scratch, 16 bytes each */
+  uint8_t *filter48 = &transposed_input[16 * 16]; /* scratch region shared by the two filter stages */
+  /* Vertical 16-wide loop filter: transpose around src, run both filter stages, transpose back if needed. */
+  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); /* helper defined earlier in the file; reads from 8 bytes left of src */
+
+  early_exit = vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
+                                       &filter48[0], src, pitch, b_limit_ptr,
+                                       limit_ptr, thresh_ptr); /* returns 1 when it already wrote its result to src */
+
+  if (0 == early_exit) {
+    early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
+                                   &filter48[0]); /* returns 1 when it already wrote its result to src */
+
+    if (0 == early_exit) { /* wide filter ran: results are in transposed_input, copy them back */
+      transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
+    }
+  }
+}
+
+int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
+                                 uint8_t *src_org, int32_t pitch,
+                                 const uint8_t *b_limit_ptr,
+                                 const uint8_t *limit_ptr,
+                                 const uint8_t *thresh_ptr) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  v16i8 zero = { 0 };
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+  /* Filter4/filter8 stage on all 16 lanes; returns 1 if output went to src_org, 0 if to filter48. */
+  /* load vector elements: 8 rows of 16 bytes, p3..q3, starting 4 rows before src */
+  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr); /* replicate each scalar parameter into every byte lane */
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  /* mask and hev (LPF_MASK_HEV is defined elsewhere; it appears to output hev, mask and flat) */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  /* flat4 (updates flat) */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 (results in p1_out, p0_out, q0_out, q1_out) */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  if (__msa_test_bz_v(flat)) { /* flat is all zero: only the filter4 result is needed */
+    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); /* right (low) halves: p1/p0 and q0/q1 byte pairs */
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3); /* 4-byte groups for the first 8 rows (inferred) */
+    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); /* left (high) halves */
+    ILVRL_H2_SH(vec1, vec0, vec4, vec5); /* 4-byte groups for the last 8 rows (inferred) */
+
+    src_org -= 2; /* output begins 2 bytes before src_org on each row */
+    ST4x8_UB(vec2, vec3, src_org, pitch); /* rows 0-7 */
+    src_org += 8 * pitch;
+    ST4x8_UB(vec4, vec5, src_org, pitch); /* rows 8-15 */
+
+    return 1; /* early exit: picture memory already updated */
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+               q3_r); /* widen the right (low) 8 bytes of each row to 16 bits */
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); /* widen the left (high) 8 bytes */
+    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+    /* convert 16 bit output data into 8 bit (left and right halves packed into the *_r variables) */
+    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* store pixel values: take the filter8 result where flat is set, else the value from above */
+    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); /* p2 and q2 fall back to the unfiltered input */
+    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); /* p1..q1 fall back to the filter4 output */
+    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); /* filter48 rows 0-3: p2, p1, p0, q0 */
+    filter48 += (4 * 16);
+    ST_UB2(q1_out, q2_out, filter48, 16); /* filter48 rows 4-5: q1, q2 */
+    filter48 += (2 * 16);
+    ST_UB(flat, filter48); /* filter48 row 6: the flat mask */
+
+    return 0; /* results are in filter48 only; src_org is untouched */
+  }
+}
+
+int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+                           uint8_t *filter48) {
+  v16u8 flat, flat2, filter8;
+  v16i8 zero = { 0 };
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+  v8i16 l_out, r_out;
+  /* Wide-filter stage (all 16 lanes): returns 1 if filter48 data went to src_org, 0 if rows p6..q6 at src were rewritten. */
+  flat = LD_UB(filter48 + 6 * 16); /* row 6 of filter48 holds the flat mask */
+
+  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); /* 8 rows before src, 16-byte stride */
+  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); /* 8 rows from src onwards */
+
+  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); /* defined elsewhere; appears to output flat2 */
+
+  if (__msa_test_bz_v(flat2)) { /* flat2 is all zero: the wide filter is not applied */
+    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    LD_UB4(filter48, 16, p2, p1, p0, q0); /* reload the six rows stored in filter48 */
+    LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); /* right (low) halves: p2/p1 and p0/q0 byte pairs */
+    ILVRL_H2_SH(vec1, vec0, vec3, vec4); /* 4-byte groups for rows 0-7 (inferred) */
+    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); /* left (high) halves */
+    ILVRL_H2_SH(vec1, vec0, vec6, vec7); /* 4-byte groups for rows 8-15 (inferred) */
+    ILVRL_B2_SH(q2, q1, vec2, vec5); /* q1/q2 byte pairs: vec2 for rows 0-7, vec5 for rows 8-15 */
+
+    src_org -= 3; /* output begins 3 bytes before src_org on each row */
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); /* rows 0-3: first 4 bytes (presumably p2 p1 p0 q0) */
+    ST2x4_UB(vec2, 0, (src_org + 4), pitch); /* rows 0-3: next 2 bytes (presumably q1 q2) */
+    src_org += (4 * pitch);
+    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); /* rows 4-7 */
+    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+    src_org += (4 * pitch);
+    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch); /* rows 8-11 */
+    ST2x4_UB(vec5, 0, (src_org + 4), pitch);
+    src_org += (4 * pitch);
+    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch); /* rows 12-15 */
+    ST2x4_UB(vec5, 4, (src_org + 4), pitch);
+
+    return 1; /* early exit: src_org already updated */
+  } else {
+    src -= 7 * 16; /* move to the p6 row, the first row rewritten below */
+
+    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
+               zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
+               p3_r_in, p2_r_in, p1_r_in, p0_r_in); /* widen the right (low) 8 bytes of each row to 16 bits */
+    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+    tmp0_r = p7_r_in << 3; /* p6 output: build the sum 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 */
+    tmp0_r -= p7_r_in;
+    tmp0_r += p6_r_in;
+    tmp0_r += q0_r_in;
+    tmp1_r = p6_r_in + p5_r_in;
+    tmp1_r += p4_r_in;
+    tmp1_r += p3_r_in;
+    tmp1_r += p2_r_in;
+    tmp1_r += p1_r_in;
+    tmp1_r += p0_r_in;
+    tmp1_r += tmp0_r; /* tmp1_r is the right-half running sum updated for every later row */
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4); /* rounding shift right by 4 */
+
+    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+               p5_l_in, p4_l_in); /* widen the left (high) 8 bytes of each row to 16 bits */
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+               p1_l_in, p0_l_in);
+    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
+
+    tmp0_l = p7_l_in << 3; /* same sum for the left half */
+    tmp0_l -= p7_l_in;
+    tmp0_l += p6_l_in;
+    tmp0_l += q0_l_in;
+    tmp1_l = p6_l_in + p5_l_in;
+    tmp1_l += p4_l_in;
+    tmp1_l += p3_l_in;
+    tmp1_l += p2_l_in;
+    tmp1_l += p1_l_in;
+    tmp1_l += p0_l_in;
+    tmp1_l += tmp0_l; /* tmp1_l is the left-half running sum */
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); /* pack both halves back into one 16-byte row */
+    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); /* use the filtered value only where flat2 is set */
+    ST_UB(p6, src);
+    src += 16;
+
+    /* p5: running sums += p5 + q1 - p6 - p7 */
+    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+    tmp0_r = p5_r_in - p6_r_in;
+    tmp0_r += q1_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
+    tmp0_l = p5_l_in - p6_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+    ST_UB(p5, src);
+    src += 16;
+
+    /* p4: running sums += p4 + q2 - p5 - p7 */
+    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+    tmp0_r = p4_r_in - p5_r_in;
+    tmp0_r += q2_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
+    tmp0_l = p4_l_in - p5_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+    ST_UB(p4, src);
+    src += 16;
+
+    /* p3: running sums += p3 + q3 - p4 - p7 */
+    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+    tmp0_r = p3_r_in - p4_r_in;
+    tmp0_r += q3_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
+    tmp0_l = p3_l_in - p4_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+    ST_UB(p3, src);
+    src += 16;
+
+    /* p2: running sums += p2 + q4 - p3 - p7 */
+    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+    filter8 = LD_UB(filter48); /* fallback where flat2 is clear: row 0 of filter48 */
+    tmp0_r = p2_r_in - p3_r_in;
+    tmp0_r += q4_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
+    tmp0_l = p2_l_in - p3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += 16;
+
+    /* p1: running sums += p1 + q5 - p2 - p7 */
+    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+    filter8 = LD_UB(filter48 + 16); /* fallback: row 1 of filter48 */
+    tmp0_r = p1_r_in - p2_r_in;
+    tmp0_r += q5_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
+    tmp0_l = p1_l_in - p2_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)(tmp1_l), 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += 16;
+
+    /* p0: running sums += p0 + q6 - p1 - p7 */
+    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+    filter8 = LD_UB(filter48 + 32); /* fallback: row 2 of filter48 */
+    tmp0_r = p0_r_in - p1_r_in;
+    tmp0_r += q6_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
+    tmp0_l = p0_l_in - p1_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += 16;
+
+    /* q0: running sums += q7 + q0 - p0 - p7 */
+    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+    filter8 = LD_UB(filter48 + 48); /* fallback: row 3 of filter48 */
+    tmp0_r = q7_r_in - p0_r_in;
+    tmp0_r += q0_r_in;
+    tmp0_r -= p7_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
+    tmp0_l = q7_l_in - p0_l_in;
+    tmp0_l += q0_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += 16;
+
+    /* q1: running sums += q7 + q1 - q0 - p6 */
+    filter8 = LD_UB(filter48 + 64); /* fallback: row 4 of filter48 */
+    tmp0_r = q7_r_in - q0_r_in;
+    tmp0_r += q1_r_in;
+    tmp0_r -= p6_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    tmp0_l = q7_l_in - q0_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p6_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += 16;
+
+    /* q2: running sums += q7 + q2 - q1 - p5 */
+    filter8 = LD_UB(filter48 + 80); /* fallback: row 5 of filter48 */
+    tmp0_r = q7_r_in - q1_r_in;
+    tmp0_r += q2_r_in;
+    tmp0_r -= p5_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    tmp0_l = q7_l_in - q1_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p5_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+    ST_UB(filter8, src);
+    src += 16;
+
+    /* q3: running sums += q7 + q3 - q2 - p4 */
+    tmp0_r = q7_r_in - q2_r_in;
+    tmp0_r += q3_r_in;
+    tmp0_r -= p4_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    tmp0_l = q7_l_in - q2_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p4_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+    ST_UB(q3, src);
+    src += 16;
+
+    /* q4: running sums += q7 + q4 - q3 - p3 */
+    tmp0_r = q7_r_in - q3_r_in;
+    tmp0_r += q4_r_in;
+    tmp0_r -= p3_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    tmp0_l = q7_l_in - q3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p3_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+    ST_UB(q4, src);
+    src += 16;
+
+    /* q5: running sums += q7 + q5 - q4 - p2 */
+    tmp0_r = q7_r_in - q4_r_in;
+    tmp0_r += q5_r_in;
+    tmp0_r -= p2_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    tmp0_l = q7_l_in - q4_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p2_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+    ST_UB(q5, src);
+    src += 16;
+
+    /* q6: running sums += q7 + q6 - q5 - p1 */
+    tmp0_r = q7_r_in - q5_r_in;
+    tmp0_r += q6_r_in;
+    tmp0_r -= p1_r_in;
+    tmp1_r += tmp0_r;
+    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+    tmp0_l = q7_l_in - q5_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p1_l_in;
+    tmp1_l += tmp0_l;
+    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+    ST_UB(q6, src);
+
+    return 0; /* rows p6..q6 of the 16-byte-stride buffer were rewritten; src_org is untouched */
+  }
+}
+
+void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
+                                  const uint8_t *b_limit_ptr,
+                                  const uint8_t *limit_ptr,
+                                  const uint8_t *thresh_ptr) {
+  uint8_t early_exit = 0;
+  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); /* 16 rows of pixels + 8 rows of scratch, 16 bytes each */
+  uint8_t *filter48 = &transposed_input[16 * 16]; /* scratch region shared by the two filter stages */
+  /* Vertical 16-wide loop filter over 16 rows: transpose, run both filter stages, transpose back if needed. */
+  transpose_16x16((src - 8), pitch, &transposed_input[0], 16); /* 16x16 block starting 8 bytes left of src */
+
+  early_exit = vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
+                                        &filter48[0], src, pitch, b_limit_ptr,
+                                        limit_ptr, thresh_ptr); /* returns 1 when it already wrote its result to src */
+
+  if (0 == early_exit) {
+    early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
+                                    &filter48[0]); /* returns 1 when it already wrote its result to src */
+
+    if (0 == early_exit) { /* wide filter ran: results are in transposed_input, copy them back */
+      transpose_16x16(transposed_input, 16, (src - 8), pitch);
+    }
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_4_msa.c b/libs/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
new file mode 100644
index 0000000000..daf5f38bf7
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
@@ -0,0 +1,152 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/loopfilter_msa.h"
+
+void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
+                              const uint8_t *b_limit_ptr,
+                              const uint8_t *limit_ptr,
+                              const uint8_t *thresh_ptr,
+                              int32_t count) {
+  uint64_t p1_d, p0_d, q0_d, q1_d;
+  v16u8 mask, hev, flat, thresh, b_limit, limit;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
+  /* Horizontal filter4: reads 8 rows around src, writes 8 bytes to each of the 4 rows from src - 2 * pitch. */
+  (void)count; /* count is accepted but not used */
+
+  /* load vector elements: 4 rows above src and 4 rows from src, pitch bytes apart */
+  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr); /* replicate each scalar parameter into every byte lane */
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat); /* defined elsewhere; appears to output hev, mask and flat */
+  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  p1_d = __msa_copy_u_d((v2i64)p1_out, 0); /* extract doubleword 0 (8 bytes) of each filtered row */
+  p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+  q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+  q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); /* presumably one 64-bit store per row, p1..q1 */
+}
+
+void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
+                                   const uint8_t *b_limit0_ptr,
+                                   const uint8_t *limit0_ptr,
+                                   const uint8_t *thresh0_ptr,
+                                   const uint8_t *b_limit1_ptr,
+                                   const uint8_t *limit1_ptr,
+                                   const uint8_t *thresh1_ptr) {
+  v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  /* Horizontal filter4 on 16 bytes: parameter set 0 drives the low 8 lanes, set 1 the high 8 lanes. */
+  /* load vector elements: 4 rows above src and 4 rows from src, pitch bytes apart */
+  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); /* low half = thresh0, high half = thresh1 */
+
+  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); /* low half = b_limit0, high half = b_limit1 */
+
+  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); /* low half = limit0, high half = limit1 */
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+               hev, mask, flat); /* defined elsewhere; appears to output hev, mask and flat */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); /* filtered rows overwrite p1, p0, q0, q1 */
+
+  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); /* store 4 full rows starting 2 rows above src */
+}
+
+void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
+                            const uint8_t *b_limit_ptr,
+                            const uint8_t *limit_ptr,
+                            const uint8_t *thresh_ptr,
+                            int32_t count) {
+  v16u8 mask, hev, flat, limit, thresh, b_limit;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v8i16 vec0, vec1, vec2, vec3;
+  /* Vertical filter4 over 8 rows: load rows at src - 4, transpose, filter, store 4 bytes per row at src - 2. */
+  (void)count; /* count is accepted but not used */
+
+  LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); /* 8 picture rows, starting 4 bytes left of src */
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr); /* replicate each scalar parameter into every byte lane */
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
+                     p3, p2, p1, p0, q0, q1, q2, q3); /* in-place transpose: the same variables are inputs and outputs */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat); /* defined elsewhere; appears to output hev, mask and flat */
+  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); /* filtered values overwrite p1, p0, q0, q1 */
+  ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); /* byte-interleave p1/p0 and q0/q1 */
+  ILVRL_H2_SH(vec1, vec0, vec2, vec3); /* halfword-interleave into 4-byte groups, one per row (inferred) */
+
+  src -= 2; /* output begins 2 bytes left of src on each row */
+  ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); /* rows 0-3 */
+  src += 4 * pitch;
+  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); /* rows 4-7 */
+}
+
+void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
+                                 const uint8_t *b_limit0_ptr,
+                                 const uint8_t *limit0_ptr,
+                                 const uint8_t *thresh0_ptr,
+                                 const uint8_t *b_limit1_ptr,
+                                 const uint8_t *limit1_ptr,
+                                 const uint8_t *thresh1_ptr) {
+  v16u8 mask, hev, flat;
+  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  /* Vertical filter4 over 16 rows: parameter set 0 drives the low 8 lanes, set 1 the high 8 lanes. */
+  LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); /* rows 0-7, from 4 bytes left of src */
+  LD_UB8(src - 4 + (8 * pitch), pitch,
+         row8, row9, row10, row11, row12, row13, row14, row15); /* rows 8-15 */
+
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p3, p2, p1, p0, q0, q1, q2, q3); /* 16 rows in, 8 vectors (p3..q3) out */
+
+  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); /* low half = thresh0, high half = thresh1 */
+
+  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); /* low half = b_limit0, high half = b_limit1 */
+
+  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); /* low half = limit0, high half = limit1 */
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+               hev, mask, flat); /* defined elsewhere; appears to output hev, mask and flat */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); /* filtered values overwrite p1, p0, q0, q1 */
+  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); /* right (low) halves: p1/p0 and q0/q1 byte pairs */
+  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); /* 4-byte groups for rows 0-7 (inferred) */
+  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); /* left (high) halves */
+  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); /* 4-byte groups for rows 8-15 (inferred) */
+
+  src -= 2; /* output begins 2 bytes left of src on each row */
+
+  ST4x8_UB(tmp2, tmp3, src, pitch); /* rows 0-7 */
+  src += (8 * pitch);
+  ST4x8_UB(tmp4, tmp5, src, pitch); /* rows 8-15 */
+}
diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_8_msa.c b/libs/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
new file mode 100644
index 0000000000..00b6db5509
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
@@ -0,0 +1,348 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/loopfilter_msa.h"
+/* MSA loop filter for the horizontal edge between rows src - pitch and src. */
+void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
+                              const uint8_t *b_limit_ptr,
+                              const uint8_t *limit_ptr,
+                              const uint8_t *thresh_ptr,
+                              int32_t count) {
+  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+  v16u8 mask, hev, flat, thresh, b_limit, limit;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+  v16i8 zero = { 0 };
+
+  (void)count;  /* count is not used by this implementation */
+
+  /* load rows p3..q3; p3 is at src - 4 * pitch, q0 at src */
+  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr);  /* replicated to all lanes */
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+  /* keep only the low 64 bits of flat; the high 64 bits become zero */
+  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+  if (__msa_test_bz_v(flat)) {  /* flat is all zero: filter4 output only */
+    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);  /* low 64 bits */
+    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+               q2_r, q3_r);
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
+                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
+                zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
+                q0_filter8);
+    PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
+
+    /* take filter8 bits where flat is set, else keep filter4/original */
+    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
+    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
+    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
+    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
+    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
+    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
+
+    p2_d = __msa_copy_u_d((v2i64)p2_out, 0);  /* low 64 bits */
+    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+    q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
+
+    src -= 3 * pitch;  /* row that p2 was loaded from */
+
+    SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
+    src += (4 * pitch);
+    SD(q1_d, src);
+    src += pitch;
+    SD(q2_d, src);
+  }
+}
+/* MSA horizontal-edge loop filter over a full 16-byte vector per row. */
+void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
+                                   const uint8_t *b_limit0,
+                                   const uint8_t *limit0,
+                                   const uint8_t *thresh0,
+                                   const uint8_t *b_limit1,
+                                   const uint8_t *limit1,
+                                   const uint8_t *thresh1) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  v16u8 zero = { 0 };
+
+  /* load rows p3..q3; p3 is at src - 4 * pitch, q0 at src */
+  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+  /* parameter vectors: low 64 bits from set 0, high 64 bits from set 1 */
+  thresh = (v16u8)__msa_fill_b(*thresh0);
+  tmp = (v16u8)__msa_fill_b(*thresh1);
+  thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);
+
+  b_limit = (v16u8)__msa_fill_b(*b_limit0);
+  tmp = (v16u8)__msa_fill_b(*b_limit1);
+  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);
+
+  limit = (v16u8)__msa_fill_b(*limit0);
+  tmp = (v16u8)__msa_fill_b(*limit1);
+  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  if (__msa_test_bz_v(flat)) {  /* flat is all zero: filter4 output only */
+    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
+               q2_r, q3_r);
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* take filter8 bits where flat is set, else keep filter4/original */
+    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    src -= 3 * pitch;  /* row that p2 was loaded from */
+
+    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+    src += (4 * pitch);
+    ST_UB2(q1_out, q2_out, src, pitch);
+    src += (2 * pitch);  /* NOTE(review): src is not read after this */
+  }
+}
+/* MSA loop filter for a vertical edge at column src; reads 8 rows. */
+void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
+                            const uint8_t *b_limit_ptr,
+                            const uint8_t *limit_ptr,
+                            const uint8_t *thresh_ptr,
+                            int32_t count) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p1_out, p0_out, q0_out, q1_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v16u8 zero = { 0 };
+  v8i16 vec0, vec1, vec2, vec3, vec4;
+
+  (void)count;  /* count is not used by this implementation */
+
+  /* load 8 rows, each starting 4 bytes to the left of src */
+  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
+                     p3, p2, p1, p0, q0, q1, q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr);  /* replicated to all lanes */
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+  /* keep only the low 64 bits of flat; the high 64 bits become zero */
+  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+  if (__msa_test_bz_v(flat)) {  /* flat is all zero: filter4 output only */
+    /* Store 4 pixels per row: p1, p0, q0, q1 */
+    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+    src -= 2;  /* first stored byte is 2 to the left of src */
+    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+    src += 4 * pitch;
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+               q3_r);
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
+                p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* take filter8 bits where flat is set, else keep filter4/original */
+    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    /* Store 6 pixels per row (p2..q2), written as 4 bytes + 2 bytes */
+    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+    vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
+
+    src -= 3;  /* first stored byte is 3 to the left of src */
+    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec4, 0, src + 4, pitch);
+    src += (4 * pitch);
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec4, 4, src + 4, pitch);
+  }
+}
+/* MSA loop filter for a vertical edge at column src; reads 16 rows. */
+void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
+                                 const uint8_t *b_limit0,
+                                 const uint8_t *limit0,
+                                 const uint8_t *thresh0,
+                                 const uint8_t *b_limit1,
+                                 const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  uint8_t *temp_src;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p1_out, p0_out, q0_out, q1_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  v16u8 zero = { 0 };
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+  temp_src = src - 4;  /* each row is read from 4 bytes left of src */
+  /* p0..p3 / q3..q0 hold raw rows 0-3 / 8-11 until the transpose below */
+  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
+  temp_src += (8 * pitch);
+  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
+
+  /* transpose 16x8 matrix into 8x16 */
+  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
+                      q3, q2, q1, q0, row12, row13, row14, row15,
+                      p3, p2, p1, p0, q0, q1, q2, q3);
+  /* parameter vectors: low 64 bits from set 0, high 64 bits from set 1 */
+  thresh = (v16u8)__msa_fill_b(*thresh0);
+  vec0 = (v8i16)__msa_fill_b(*thresh1);  /* vec0 is scratch here */
+  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);
+
+  b_limit = (v16u8)__msa_fill_b(*b_limit0);
+  vec0 = (v8i16)__msa_fill_b(*b_limit1);
+  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);
+
+  limit = (v16u8)__msa_fill_b(*limit0);
+  vec0 = (v8i16)__msa_fill_b(*limit1);
+  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  if (__msa_test_bz_v(flat)) {  /* flat is all zero: filter4 output only */
+    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+    src -= 2;  /* first stored byte is 2 to the left of src */
+    ST4x8_UB(vec2, vec3, src, pitch);
+    src += 8 * pitch;
+    ST4x8_UB(vec4, vec5, src, pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+               q3_r);
+    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+
+    /* filter8 */
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* take filter8 bits where flat is set, else keep filter4/original */
+    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+    ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+    src -= 3;  /* first stored byte is 3 to the left of src */
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec2, 0, src + 4, pitch);
+    src += (4 * pitch);
+    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec2, 4, src + 4, pitch);
+    src += (4 * pitch);
+    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec5, 0, src + 4, pitch);
+    src += (4 * pitch);
+    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec5, 4, src + 4, pitch);
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c b/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c
new file mode 100644
index 0000000000..99a96d89b9
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c
@@ -0,0 +1,361 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
+/* DSPr2 loop filter for the horizontal edge between rows s - pitch and s.
+ * Handles 8 pixels as two passes of 4 bytes each; count is unused.
+ * Asm blocks that access memory carry a "memory" clobber. */
+void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
+                                const uint8_t *blimit, const uint8_t *limit,
+                                const uint8_t *thresh, int count) {
+  uint8_t   i;
+  uint32_t  mask, hev;
+  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
+  uint8_t   *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+  uint32_t  thresh_vec, flimit_vec, limit_vec;
+  uint32_t  uflimit, ulimit, uthresh;
+
+  (void)count;  /* not used by this implementation */
+  uflimit = *blimit;
+  ulimit = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte */
+  __asm__ __volatile__ (
+      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
+      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
+      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"
+
+      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+        [limit_vec] "=r" (limit_vec)
+      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+  );
+
+  /* prefetch data for store */
+  prefetch_store(s);
+
+  /* loop filter designed to work using chars so that we can make maximum use
+     of 8 bit simd instructions. */
+  for (i = 0; i < 2; i++) {
+    sm1 = s - (pitch << 2);
+    s0 = sm1 + pitch;
+    s1 = s0 + pitch;
+    s2 = s - pitch;
+    s3 = s;
+    s4 = s + pitch;
+    s5 = s4 + pitch;
+    s6 = s5 + pitch;
+
+    __asm__ __volatile__ (
+        "lw     %[p1],  (%[s1])    \n\t"
+        "lw     %[p2],  (%[s2])    \n\t"
+        "lw     %[p3],  (%[s3])    \n\t"
+        "lw     %[p4],  (%[s4])    \n\t"
+        : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4)
+        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+        : "memory"  /* reads pixel memory that is not an operand */
+    );
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+       mask will be zero and filtering is not needed */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+      __asm__ __volatile__ (
+          "lw       %[pm1], (%[sm1])   \n\t"
+          "lw       %[p0],  (%[s0])    \n\t"
+          "lw       %[p5],  (%[s5])    \n\t"
+          "lw       %[p6],  (%[s6])    \n\t"
+          : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5),
+            [p6] "=&r" (p6)
+          : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
+          : "memory"  /* reads pixel memory that is not an operand */
+      );
+
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
+                            pm1, p0, p3, p4, p5, p6,
+                            thresh_vec, &hev, &mask);
+
+      /* if mask == 0, filtering is not needed */
+      if (mask) {
+        /* filtering */
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+        __asm__ __volatile__ (
+            "sw     %[p1],  (%[s1])    \n\t"
+            "sw     %[p2],  (%[s2])    \n\t"
+            "sw     %[p3],  (%[s3])    \n\t"
+            "sw     %[p4],  (%[s4])    \n\t"
+            :
+            : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4),
+              [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+            : "memory"  /* writes pixel memory that is not an operand */
+        );
+      }
+    }
+
+    s = s + 4;
+  }
+}
+
+/* DSPr2 loop filter for the vertical edge at column s; two passes of 4 rows.
+ * count is unused. Asm stores carry a "memory" clobber. */
+void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
+                              const uint8_t *blimit, const uint8_t *limit,
+                              const uint8_t *thresh, int count) {
+  uint8_t   i;
+  uint32_t  mask, hev;
+  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
+  uint8_t   *s1, *s2, *s3, *s4;
+  uint32_t  prim1, prim2, sec3, sec4, prim3, prim4;
+  uint32_t  thresh_vec, flimit_vec, limit_vec, uflimit, ulimit, uthresh;
+
+  (void)count;  /* not used by this implementation */
+  uflimit = *blimit;
+  ulimit = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte */
+  __asm__ __volatile__ (
+      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
+      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
+      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"
+
+      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+        [limit_vec] "=r" (limit_vec)
+      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+  );
+
+  /* prefetch data for store */
+  prefetch_store(s + pitch);
+
+  for (i = 0; i < 2; i++) {
+    s1 = s;
+    s2 = s + pitch;
+    s3 = s2 + pitch;
+    s4 = s3 + pitch;
+    s  = s4 + pitch;
+
+    /* load quad-byte vectors
+     * memory is 4 byte aligned
+     */
+    p2  = *((uint32_t *)(s1 - 4));
+    p6  = *((uint32_t *)(s1));
+    p1  = *((uint32_t *)(s2 - 4));
+    p5  = *((uint32_t *)(s2));
+    p0  = *((uint32_t *)(s3 - 4));
+    p4  = *((uint32_t *)(s3));
+    pm1 = *((uint32_t *)(s4 - 4));
+    p3  = *((uint32_t *)(s4));
+
+    /* transpose pm1, p0, p1, p2 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[pm1],     %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose p3, p4, p5, p6 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+        "append         %[p5],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
+                            p0, p3, p4, p5, p6, thresh_vec,
+                            &hev, &mask);
+
+      /* if mask == 0, filtering is not needed */
+      if (mask) {
+        /* filtering */
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+        /* unpack processed 4x4 neighborhood
+         * don't use transpose on output data
+         * because memory isn't aligned
+         */
+        __asm__ __volatile__ (
+            "sb     %[p4],   1(%[s4])    \n\t"
+            "sb     %[p3],   0(%[s4])    \n\t"
+            "sb     %[p2],  -1(%[s4])    \n\t"
+            "sb     %[p1],  -2(%[s4])    \n\t"
+            :
+            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
+              [s4] "r" (s4)
+            : "memory"  /* writes pixel memory that is not an operand */
+        );
+
+        __asm__ __volatile__ (
+            "srl    %[p4],  %[p4],  8     \n\t"
+            "srl    %[p3],  %[p3],  8     \n\t"
+            "srl    %[p2],  %[p2],  8     \n\t"
+            "srl    %[p1],  %[p1],  8     \n\t"
+
+            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+            :
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[p4],   1(%[s3])    \n\t"
+            "sb     %[p3],   0(%[s3])    \n\t"
+            "sb     %[p2],  -1(%[s3])    \n\t"
+            "sb     %[p1],  -2(%[s3])    \n\t"
+            :
+            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
+              [s3] "r" (s3)
+            : "memory"  /* p1 is only read by sb, so it is an input here */
+        );
+
+        __asm__ __volatile__ (
+            "srl    %[p4],  %[p4],  8     \n\t"
+            "srl    %[p3],  %[p3],  8     \n\t"
+            "srl    %[p2],  %[p2],  8     \n\t"
+            "srl    %[p1],  %[p1],  8     \n\t"
+
+            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+            :
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[p4],   1(%[s2])    \n\t"
+            "sb     %[p3],   0(%[s2])    \n\t"
+            "sb     %[p2],  -1(%[s2])    \n\t"
+            "sb     %[p1],  -2(%[s2])    \n\t"
+            :
+            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
+              [s2] "r" (s2)
+            : "memory"  /* writes pixel memory that is not an operand */
+        );
+
+        __asm__ __volatile__ (
+            "srl    %[p4],  %[p4],  8     \n\t"
+            "srl    %[p3],  %[p3],  8     \n\t"
+            "srl    %[p2],  %[p2],  8     \n\t"
+            "srl    %[p1],  %[p1],  8     \n\t"
+
+            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+            :
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[p4],   1(%[s1])    \n\t"
+            "sb     %[p3],   0(%[s1])    \n\t"
+            "sb     %[p2],  -1(%[s1])    \n\t"
+            "sb     %[p1],  -2(%[s1])    \n\t"
+            :
+            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
+              [s1] "r" (s1)
+            : "memory"  /* writes pixel memory that is not an operand */
+        );
+      }
+    }
+  }
+}
+
+/* Runs vpx_lpf_horizontal_4_dspr2 at s (set 0) and at s + 8 (set 1). */
+void vpx_lpf_horizontal_4_dual_dspr2(
+    uint8_t *s, int p /* pitch */,
+    const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
+    const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) {
+  uint8_t *const second_start = &s[8];  /* 8 bytes past s */
+
+  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_4_dspr2(second_start, p, blimit1, limit1, thresh1, 1);
+}
+
+/* Runs vpx_lpf_horizontal_8_dspr2 at s (set 0) and at s + 8 (set 1). */
+void vpx_lpf_horizontal_8_dual_dspr2(
+    uint8_t *s, int p /* pitch */,
+    const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
+    const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) {
+  uint8_t *const second_start = &s[8];  /* 8 bytes past s */
+
+  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_8_dspr2(second_start, p, blimit1, limit1, thresh1, 1);
+}
+
+/* Runs vpx_lpf_vertical_4_dspr2 at s (set 0) and at s + 8 * p (set 1). */
+void vpx_lpf_vertical_4_dual_dspr2(
+    uint8_t *s, int p,
+    const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
+    const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) {
+  uint8_t *const second_start = &s[8 * p];
+
+  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_4_dspr2(second_start, p, blimit1, limit1, thresh1, 1);
+}
+
+/* Runs vpx_lpf_vertical_8_dspr2 at s with parameter set 0, then again at
+ * s + 8 * p with parameter set 1. */
+void vpx_lpf_vertical_8_dual_dspr2(
+    uint8_t *s, int p,
+    const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
+    const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) {
+  uint8_t *const second_start = &s[8 * p];
+
+  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_8_dspr2(second_start, p, blimit1, limit1, thresh1, 1);
+}
+
+/* Runs vpx_lpf_vertical_16_dspr2 at s and again at s + 8 * p. */
+void vpx_lpf_vertical_16_dual_dspr2(
+    uint8_t *s, int p,
+    const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) {
+  vpx_lpf_vertical_16_dspr2(&s[0], p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dspr2(&s[8 * p], p, blimit, limit, thresh);
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h b/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h
new file mode 100644
index 0000000000..4a1506ba12
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h
@@ -0,0 +1,764 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
+#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+/* 4-pixel filter step; ps1/ps0/qs0/qs1 are quad-byte vectors, updated in place */
+static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
+                                uint32_t *ps1, uint32_t *ps0,
+                                uint32_t *qs0, uint32_t *qs1) {
+  int32_t   vpx_filter_l, vpx_filter_r;
+  int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+  int32_t   subr_r, subr_l;
+  uint32_t  t1, t2, HWM, t3;
+  uint32_t  hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+  int32_t   vps1, vps0, vqs0, vqs1;
+  int32_t   vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+  uint32_t  N128;
+  /* mask and hev are split like the pixels and ANDed with the filter value */
+  N128 = 0x80808080;  /* bit 7 of every byte */
+  t1  = 0x03000300;   /* 3 in the high byte of each halfword */
+  t2  = 0x04000400;   /* 4 in the high byte of each halfword */
+  t3  = 0x01000100;   /* 1 in the high byte of each halfword */
+  HWM = 0xFF00FF00;   /* mask: high byte of each halfword */
+  /* XOR with N128 flips bit 7 of each byte (flipped back before the store) */
+  vps0 = (*ps0) ^ N128;
+  vps1 = (*ps1) ^ N128;
+  vqs0 = (*qs0) ^ N128;
+  vqs1 = (*qs1) ^ N128;
+  /* NOTE(review): these are int32_t; the << 8 below can shift a negative value */
+  /* use halfword pairs instead of quad-bytes because of accuracy */
+  vps0_l = vps0 & HWM;  /* _l: bytes 3 and 1, already in the high bytes */
+  vps0_r = vps0 << 8;   /* _r: bytes 2 and 0, moved up into the high bytes */
+  vps0_r = vps0_r & HWM;
+
+  vps1_l = vps1 & HWM;
+  vps1_r = vps1 << 8;
+  vps1_r = vps1_r & HWM;
+
+  vqs0_l = vqs0 & HWM;
+  vqs0_r = vqs0 << 8;
+  vqs0_r = vqs0_r & HWM;
+
+  vqs1_l = vqs1 & HWM;
+  vqs1_r = vqs1 << 8;
+  vqs1_r = vqs1_r & HWM;
+
+  mask_l = mask & HWM;
+  mask_r = mask << 8;
+  mask_r = mask_r & HWM;
+
+  hev_l = hev & HWM;
+  hev_r = hev << 8;
+  hev_r = hev_r & HWM;
+  /* asm 1: build the masked filter value (vpx_filter_l/_r) and invhev_l/_r */
+  __asm__ __volatile__ (
+      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+
+      /* qs0 - ps0 */
+      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
+      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
+
+      /* vpx_filter &= hev; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"
+
+      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
+      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
+      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
+      /* (the two xor lines above compute invhev = hev ^ HWM for later use) */
+      /* vpx_filter &= mask; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"
+
+      : [vpx_filter_l] "=&r" (vpx_filter_l),
+        [vpx_filter_r] "=&r" (vpx_filter_r),
+        [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
+        [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
+      : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+        [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+        [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
+        [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
+        [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
+        [HWM] "r" (HWM)
+  );
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3 */
+  __asm__ __volatile__ (
+      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3;  (t2 == 4) */
+      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
+      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"
+
+      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3;  (t1 == 3) */
+      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
+      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
+      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
+      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
+
+      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
+      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
+      /* clear what shra shifted into the low byte of each Filter1 halfword */
+      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
+      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
+
+      /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
+      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
+
+      /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
+      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
+
+      : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+        [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
+        [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+        [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+      : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
+        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
+  );
+
+  __asm__ __volatile__ (
+      /* Filter1 = (Filter1 + 1) >> 1, done as a halving add with t3 */
+      "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
+      "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
+
+      /* Filter1 &= ~hev;  (invhev = hev ^ HWM from the first asm block) */
+      "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
+      "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
+
+      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
+      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
+      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
+
+      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
+      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
+      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
+
+      : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
+        [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+        [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+      : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+  );
+
+  /* Create quad-bytes from halfword pairs */
+  vqs0_l = vqs0_l & HWM;
+  vqs1_l = vqs1_l & HWM;
+  vps0_l = vps0_l & HWM;
+  vps1_l = vps1_l & HWM;
+  /* move the _r results from the high byte of each halfword to the low byte */
+  __asm__ __volatile__ (
+      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
+      "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
+      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
+      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"
+
+      : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
+        [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
+      :
+  );
+
+  vqs0 = vqs0_l | vqs0_r;
+  vqs1 = vqs1_l | vqs1_r;
+  vps0 = vps0_l | vps0_r;
+  vps1 = vps1_l | vps1_r;
+  /* flip bit 7 of each byte back and write the results through the pointers */
+  *ps0 = vps0 ^ N128;
+  *ps1 = vps1 ^ N128;
+  *qs0 = vqs0 ^ N128;
+  *qs1 = vqs1 ^ N128;
+}
+
+static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
+                                 uint32_t ps1, uint32_t ps0,
+                                 uint32_t qs0, uint32_t qs1,
+                                 uint32_t *p1_f0, uint32_t *p0_f0,
+                                 uint32_t *q0_f0, uint32_t *q1_f0) {
+  int32_t   vpx_filter_l, vpx_filter_r;
+  int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+  int32_t   subr_r, subr_l;
+  uint32_t  t1, t2, HWM, t3;
+  uint32_t  hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+  int32_t   vps1, vps0, vqs0, vqs1;
+  int32_t   vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+  uint32_t  N128;
+  /* Same steps as filter_dspr2; inputs by value, results go to the *_f0 outputs */
+  N128 = 0x80808080;  /* bit 7 of every byte */
+  t1  = 0x03000300;   /* 3 in the high byte of each halfword */
+  t2  = 0x04000400;   /* 4 in the high byte of each halfword */
+  t3  = 0x01000100;   /* 1 in the high byte of each halfword */
+  HWM = 0xFF00FF00;   /* mask: high byte of each halfword */
+  /* XOR with N128 flips bit 7 of each byte (flipped back before the store) */
+  vps0 = (ps0) ^ N128;
+  vps1 = (ps1) ^ N128;
+  vqs0 = (qs0) ^ N128;
+  vqs1 = (qs1) ^ N128;
+  /* NOTE(review): these are int32_t; the << 8 below can shift a negative value */
+  /* use halfword pairs instead of quad-bytes because of accuracy */
+  vps0_l = vps0 & HWM;  /* _l: bytes 3 and 1, already in the high bytes */
+  vps0_r = vps0 << 8;   /* _r: bytes 2 and 0, moved up into the high bytes */
+  vps0_r = vps0_r & HWM;
+
+  vps1_l = vps1 & HWM;
+  vps1_r = vps1 << 8;
+  vps1_r = vps1_r & HWM;
+
+  vqs0_l = vqs0 & HWM;
+  vqs0_r = vqs0 << 8;
+  vqs0_r = vqs0_r & HWM;
+
+  vqs1_l = vqs1 & HWM;
+  vqs1_r = vqs1 << 8;
+  vqs1_r = vqs1_r & HWM;
+
+  mask_l = mask & HWM;
+  mask_r = mask << 8;
+  mask_r = mask_r & HWM;
+
+  hev_l = hev & HWM;
+  hev_r = hev << 8;
+  hev_r = hev_r & HWM;
+  /* asm 1: build the masked filter value (vpx_filter_l/_r) and invhev_l/_r */
+  __asm__ __volatile__ (
+      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+
+      /* qs0 - ps0 */
+      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
+      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
+
+      /* vpx_filter &= hev; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"
+
+      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
+      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
+      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
+      /* (the two xor lines above compute invhev = hev ^ HWM for later use) */
+      /* vpx_filter &= mask; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"
+
+      : [vpx_filter_l] "=&r" (vpx_filter_l),
+        [vpx_filter_r] "=&r" (vpx_filter_r),
+        [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
+        [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
+      : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+        [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+        [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
+        [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
+        [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM)
+  );
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3 */
+  __asm__ __volatile__ (
+      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3;  (t2 == 4) */
+      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
+      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"
+
+      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3;  (t1 == 3) */
+      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
+      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
+      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
+      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
+
+      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
+      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
+      /* clear what shra shifted into the low byte of each Filter1 halfword */
+      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
+      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
+
+      /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
+      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
+
+      /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
+      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
+
+      : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+        [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
+        [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+        [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+      : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
+        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
+  );
+
+  __asm__ __volatile__ (
+      /* Filter1 = (Filter1 + 1) >> 1, done as a halving add with t3 */
+      "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
+      "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
+
+      /* Filter1 &= ~hev;  (invhev = hev ^ HWM from the first asm block) */
+      "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
+      "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
+
+      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
+      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
+      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
+
+      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
+      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
+      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
+
+      : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
+        [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+        [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+      : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+  );
+
+  /* Create quad-bytes from halfword pairs */
+  vqs0_l = vqs0_l & HWM;
+  vqs1_l = vqs1_l & HWM;
+  vps0_l = vps0_l & HWM;
+  vps1_l = vps1_l & HWM;
+  /* move the _r results from the high byte of each halfword to the low byte */
+  __asm__ __volatile__ (
+      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
+      "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
+      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
+      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"
+
+      : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
+        [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
+      :
+  );
+
+  vqs0 = vqs0_l | vqs0_r;
+  vqs1 = vqs1_l | vqs1_r;
+  vps0 = vps0_l | vps0_r;
+  vps1 = vps1_l | vps1_r;
+  /* flip bit 7 of each byte back and store to the separate output pointers */
+  *p0_f0 = vps0 ^ N128;
+  *p1_f0 = vps1 ^ N128;
+  *q0_f0 = vqs0 ^ N128;
+  *q1_f0 = vqs1 ^ N128;
+}
+
+static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
+                                  uint32_t *op1, uint32_t *op0,
+                                  uint32_t *oq0, uint32_t *oq1,
+                                  uint32_t *oq2, uint32_t *oq3) {
+  /* 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line; op2..oq2 updated in place */
+  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+  uint32_t       res_op2, res_op1, res_op0;
+  uint32_t       res_oq0, res_oq1, res_oq2;
+  uint32_t       tmp;            /* scratch: 2 * p3, later 2 * q3 */
+  uint32_t       add_p210_q012;  /* p2 + p1 + p0 + q0 + q1 + q2 + 4 */
+  uint32_t       u32Four = 0x00040004;  /* 4 per halfword: rounding for >> 3 */
+  /* NOTE(review): presumably two pixels per uint32_t, one per halfword - confirm */
+  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)  1 */
+  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)  2 */
+  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)  3 */
+  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)  4 */
+  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)  5 */
+  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)  6 */
+  /* Each sum is built as add_p210_q012 plus/minus the taps that differ. */
+  __asm__ __volatile__ (
+      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]            \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]            \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]            \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]            \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]            \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]       \n\t"
+
+      "shll.ph    %[tmp],            %[p3],             1                \n\t"
+      "addu.ph    %[res_op2],        %[tmp],            %[p3]            \n\t"
+      "addu.ph    %[res_op1],        %[p3],             %[p3]            \n\t"
+      "addu.ph    %[res_op2],        %[res_op2],        %[p2]            \n\t"
+      "addu.ph    %[res_op1],        %[res_op1],        %[p1]            \n\t"
+      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012] \n\t"
+      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012] \n\t"
+      "subu.ph    %[res_op2],        %[res_op2],        %[q1]            \n\t"
+      "subu.ph    %[res_op1],        %[res_op1],        %[q2]            \n\t"
+      "subu.ph    %[res_op2],        %[res_op2],        %[q2]            \n\t"
+      "shrl.ph    %[res_op1],        %[res_op1],        3                \n\t"
+      "shrl.ph    %[res_op2],        %[res_op2],        3                \n\t"
+      "addu.ph    %[res_op0],        %[p3],             %[p0]            \n\t"
+      "addu.ph    %[res_oq0],        %[q0],             %[q3]            \n\t"
+      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012] \n\t"
+      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012] \n\t"
+      "addu.ph    %[res_oq1],        %[q3],             %[q3]            \n\t"
+      "shll.ph    %[tmp],            %[q3],             1                \n\t"
+      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]            \n\t"
+      "addu.ph    %[res_oq2],        %[tmp],            %[q3]            \n\t"
+      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012] \n\t"
+      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012] \n\t"
+      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]            \n\t"
+      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]            \n\t"
+      "shrl.ph    %[res_oq1],        %[res_oq1],        3                \n\t"
+      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]            \n\t"
+      "shrl.ph    %[res_oq0],        %[res_oq0],        3                \n\t"
+      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]            \n\t"
+      "shrl.ph    %[res_op0],        %[res_op0],        3                \n\t"
+      "shrl.ph    %[res_oq2],        %[res_oq2],        3                \n\t"
+
+      : [add_p210_q012] "=&r" (add_p210_q012),
+        [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2),
+        [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0),
+        [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1),
+        [res_oq2] "=&r" (res_oq2)
+      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
+        [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
+        [u32Four] "r" (u32Four)
+  );
+  /* *op3 and *oq3 are only read; the six inner values are overwritten */
+  *op2 = res_op2;
+  *op1 = res_op1;
+  *op0 = res_op0;
+  *oq0 = res_oq0;
+  *oq1 = res_oq1;
+  *oq2 = res_oq2;
+}
+
+static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
+                                   uint32_t p1, uint32_t p0,
+                                   uint32_t q0, uint32_t q1,
+                                   uint32_t q2, uint32_t q3,
+                                   uint32_t *op2_f1,
+                                   uint32_t *op1_f1, uint32_t *op0_f1,
+                                   uint32_t *oq0_f1, uint32_t *oq1_f1,
+                                   uint32_t *oq2_f1) {
+  /* 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line; results go to *_f1 outputs */
+  uint32_t  res_op2, res_op1, res_op0;
+  uint32_t  res_oq0, res_oq1, res_oq2;
+  uint32_t  tmp;            /* scratch: 2 * p3, later 2 * q3 */
+  uint32_t  add_p210_q012;  /* p2 + p1 + p0 + q0 + q1 + q2 + 4 */
+  uint32_t  u32Four = 0x00040004;  /* 4 per halfword: rounding for >> 3 */
+  /* NOTE(review): presumably two pixels per uint32_t, one per halfword - confirm */
+  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)   1 */
+  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)   2 */
+  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)   3 */
+  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)   4 */
+  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)   5 */
+  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)   6 */
+  /* Each sum is built as add_p210_q012 plus/minus the taps that differ. */
+  __asm__ __volatile__ (
+      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]             \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]             \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]             \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]             \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]             \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]        \n\t"
+
+      "shll.ph    %[tmp],            %[p3],             1                 \n\t"
+      "addu.ph    %[res_op2],        %[tmp],            %[p3]             \n\t"
+      "addu.ph    %[res_op1],        %[p3],             %[p3]             \n\t"
+      "addu.ph    %[res_op2],        %[res_op2],        %[p2]             \n\t"
+      "addu.ph    %[res_op1],        %[res_op1],        %[p1]             \n\t"
+      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012]  \n\t"
+      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012]  \n\t"
+      "subu.ph    %[res_op2],        %[res_op2],        %[q1]             \n\t"
+      "subu.ph    %[res_op1],        %[res_op1],        %[q2]             \n\t"
+      "subu.ph    %[res_op2],        %[res_op2],        %[q2]             \n\t"
+      "shrl.ph    %[res_op1],        %[res_op1],        3                 \n\t"
+      "shrl.ph    %[res_op2],        %[res_op2],        3                 \n\t"
+      "addu.ph    %[res_op0],        %[p3],             %[p0]             \n\t"
+      "addu.ph    %[res_oq0],        %[q0],             %[q3]             \n\t"
+      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012]  \n\t"
+      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012]  \n\t"
+      "addu.ph    %[res_oq1],        %[q3],             %[q3]             \n\t"
+      "shll.ph    %[tmp],            %[q3],             1                 \n\t"
+      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]             \n\t"
+      "addu.ph    %[res_oq2],        %[tmp],            %[q3]             \n\t"
+      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012]  \n\t"
+      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012]  \n\t"
+      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]             \n\t"
+      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]             \n\t"
+      "shrl.ph    %[res_oq1],        %[res_oq1],        3                 \n\t"
+      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]             \n\t"
+      "shrl.ph    %[res_oq0],        %[res_oq0],        3                 \n\t"
+      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]             \n\t"
+      "shrl.ph    %[res_op0],        %[res_op0],        3                 \n\t"
+      "shrl.ph    %[res_oq2],        %[res_oq2],        3                 \n\t"
+
+      : [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp),
+        [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
+        [res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0),
+        [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2)
+      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
+        [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
+        [u32Four] "r" (u32Four)
+  );
+  /* store the six filtered values; p3 and q3 have no corresponding output */
+  *op2_f1 = res_op2;
+  *op1_f1 = res_op1;
+  *op0_f1 = res_op0;
+  *oq0_f1 = res_oq0;
+  *oq1_f1 = res_oq1;
+  *oq2_f1 = res_oq2;
+}
+
+static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
+                                       uint32_t *op5, uint32_t *op4,
+                                       uint32_t *op3, uint32_t *op2,
+                                       uint32_t *op1, uint32_t *op0,
+                                       uint32_t *oq0, uint32_t *oq1,
+                                       uint32_t *oq2, uint32_t *oq3,
+                                       uint32_t *oq4, uint32_t *oq5,
+                                       uint32_t *oq6, uint32_t *oq7) {
+  const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
+  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+  const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
+  uint32_t       res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
+  uint32_t       res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
+  uint32_t       tmp;
+  uint32_t       add_p6toq6;
+  uint32_t       u32Eight = 0x00080008;
+
+  __asm__ __volatile__ (
+      /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
+         which is used most of the time */
+      "addu.ph      %[add_p6toq6],     %[p6],              %[p5]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p4]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p3]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p2]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p1]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p0]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q0]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q1]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q2]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q3]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q4]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q5]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q6]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[u32Eight]   \n\t"
+
+      : [add_p6toq6] "=&r" (add_p6toq6)
+      : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
+        [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+        [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3),
+        [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
+        [u32Eight] "r" (u32Eight)
+  );
+
+  __asm__ __volatile__ (
+      /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
+                                   p3 + p2 + p1 + p0 + q0, 4) */
+      "shll.ph       %[tmp],            %[p7],            3               \n\t"
+      "subu.ph       %[res_op6],        %[tmp],           %[p7]           \n\t"
+      "addu.ph       %[res_op6],        %[res_op6],       %[p6]           \n\t"
+      "addu.ph       %[res_op6],        %[res_op6],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_op6],        %[res_op6],       %[q1]           \n\t"
+      "subu.ph       %[res_op6],        %[res_op6],       %[q2]           \n\t"
+      "subu.ph       %[res_op6],        %[res_op6],       %[q3]           \n\t"
+      "subu.ph       %[res_op6],        %[res_op6],       %[q4]           \n\t"
+      "subu.ph       %[res_op6],        %[res_op6],       %[q5]           \n\t"
+      "subu.ph       %[res_op6],        %[res_op6],       %[q6]           \n\t"
+      "shrl.ph       %[res_op6],        %[res_op6],       4               \n\t"
+
+      /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
+                                   p2 + p1 + p0 + q0 + q1, 4) */
+      "shll.ph       %[tmp],            %[p7],            2               \n\t"
+      "addu.ph       %[res_op5],        %[tmp],           %[p7]           \n\t"
+      "addu.ph       %[res_op5],        %[res_op5],       %[p7]           \n\t"
+      "addu.ph       %[res_op5],        %[res_op5],       %[p5]           \n\t"
+      "addu.ph       %[res_op5],        %[res_op5],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_op5],        %[res_op5],       %[q2]           \n\t"
+      "subu.ph       %[res_op5],        %[res_op5],       %[q3]           \n\t"
+      "subu.ph       %[res_op5],        %[res_op5],       %[q4]           \n\t"
+      "subu.ph       %[res_op5],        %[res_op5],       %[q5]           \n\t"
+      "subu.ph       %[res_op5],        %[res_op5],       %[q6]           \n\t"
+      "shrl.ph       %[res_op5],        %[res_op5],       4               \n\t"
+
+      /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
+                                   p1 + p0 + q0 + q1 + q2, 4) */
+      "shll.ph       %[tmp],            %[p7],            2               \n\t"
+      "addu.ph       %[res_op4],        %[tmp],           %[p7]           \n\t"
+      "addu.ph       %[res_op4],        %[res_op4],       %[p4]           \n\t"
+      "addu.ph       %[res_op4],        %[res_op4],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_op4],        %[res_op4],       %[q3]           \n\t"
+      "subu.ph       %[res_op4],        %[res_op4],       %[q4]           \n\t"
+      "subu.ph       %[res_op4],        %[res_op4],       %[q5]           \n\t"
+      "subu.ph       %[res_op4],        %[res_op4],       %[q6]           \n\t"
+      "shrl.ph       %[res_op4],        %[res_op4],       4               \n\t"
+
+      /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
+                                   p1 + p0 + q0 + q1 + q2 + q3, 4) */
+      "shll.ph       %[tmp],            %[p7],            2               \n\t"
+      "addu.ph       %[res_op3],        %[tmp],           %[p3]           \n\t"
+      "addu.ph       %[res_op3],        %[res_op3],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_op3],        %[res_op3],       %[q4]           \n\t"
+      "subu.ph       %[res_op3],        %[res_op3],       %[q5]           \n\t"
+      "subu.ph       %[res_op3],        %[res_op3],       %[q6]           \n\t"
+      "shrl.ph       %[res_op3],        %[res_op3],       4               \n\t"
+
+      /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
+                                   p0 + q0 + q1 + q2 + q3 + q4, 4) */
+      "shll.ph       %[tmp],            %[p7],            1               \n\t"
+      "addu.ph       %[res_op2],        %[tmp],           %[p7]           \n\t"
+      "addu.ph       %[res_op2],        %[res_op2],       %[p2]           \n\t"
+      "addu.ph       %[res_op2],        %[res_op2],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_op2],        %[res_op2],       %[q5]           \n\t"
+      "subu.ph       %[res_op2],        %[res_op2],       %[q6]           \n\t"
+      "shrl.ph       %[res_op2],        %[res_op2],       4               \n\t"
+
+      /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
+                                   p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
+      "shll.ph       %[tmp],            %[p7],            1               \n\t"
+      "addu.ph       %[res_op1],        %[tmp],           %[p1]           \n\t"
+      "addu.ph       %[res_op1],        %[res_op1],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_op1],        %[res_op1],       %[q6]           \n\t"
+      "shrl.ph       %[res_op1],        %[res_op1],       4               \n\t"
+
+      /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                                  q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
+      "addu.ph       %[res_op0],        %[p7],            %[p0]           \n\t"
+      "addu.ph       %[res_op0],        %[res_op0],       %[add_p6toq6]   \n\t"
+      "shrl.ph       %[res_op0],        %[res_op0],       4               \n\t"
+
+      : [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5),
+        [res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3),
+        [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
+        [res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp)
+      : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
+        [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+        [q2] "r" (q2), [q1] "r" (q1),
+        [q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
+        [add_p6toq6] "r" (add_p6toq6)
+  );
+
+  *op6 = res_op6;
+  *op5 = res_op5;
+  *op4 = res_op4;
+  *op3 = res_op3;
+  *op2 = res_op2;
+  *op1 = res_op1;
+  *op0 = res_op0;
+
+  __asm__ __volatile__ (
+      /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
+                                   q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
+      "addu.ph       %[res_oq0],        %[q7],            %[q0]           \n\t"
+      "addu.ph       %[res_oq0],        %[res_oq0],       %[add_p6toq6]   \n\t"
+      "shrl.ph       %[res_oq0],        %[res_oq0],       4               \n\t"
+
+      /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
+                                   q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
+      "shll.ph       %[tmp],            %[q7],            1               \n\t"
+      "addu.ph       %[res_oq1],        %[tmp],           %[q1]           \n\t"
+      "addu.ph       %[res_oq1],        %[res_oq1],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_oq1],        %[res_oq1],       %[p6]           \n\t"
+      "shrl.ph       %[res_oq1],        %[res_oq1],       4               \n\t"
+
+      /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
+                                   q3 + q4 + q5 + q6 + q7 * 3, 4) */
+      "shll.ph       %[tmp],            %[q7],            1               \n\t"
+      "addu.ph       %[res_oq2],        %[tmp],           %[q7]           \n\t"
+      "addu.ph       %[res_oq2],        %[res_oq2],       %[q2]           \n\t"
+      "addu.ph       %[res_oq2],        %[res_oq2],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_oq2],        %[res_oq2],       %[p5]           \n\t"
+      "subu.ph       %[res_oq2],        %[res_oq2],       %[p6]           \n\t"
+      "shrl.ph       %[res_oq2],        %[res_oq2],       4               \n\t"
+
+      /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
+                                   q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
+      "shll.ph       %[tmp],            %[q7],            2               \n\t"
+      "addu.ph       %[res_oq3],        %[tmp],           %[q3]           \n\t"
+      "addu.ph       %[res_oq3],        %[res_oq3],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_oq3],        %[res_oq3],       %[p4]           \n\t"
+      "subu.ph       %[res_oq3],        %[res_oq3],       %[p5]           \n\t"
+      "subu.ph       %[res_oq3],        %[res_oq3],       %[p6]           \n\t"
+      "shrl.ph       %[res_oq3],        %[res_oq3],       4               \n\t"
+
+      /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
+                                   q4 * 2 + q5 + q6 + q7 * 5, 4) */
+      "shll.ph       %[tmp],            %[q7],            2               \n\t"
+      "addu.ph       %[res_oq4],        %[tmp],           %[q7]           \n\t"
+      "addu.ph       %[res_oq4],        %[res_oq4],       %[q4]           \n\t"
+      "addu.ph       %[res_oq4],        %[res_oq4],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_oq4],        %[res_oq4],       %[p3]           \n\t"
+      "subu.ph       %[res_oq4],        %[res_oq4],       %[p4]           \n\t"
+      "subu.ph       %[res_oq4],        %[res_oq4],       %[p5]           \n\t"
+      "subu.ph       %[res_oq4],        %[res_oq4],       %[p6]           \n\t"
+      "shrl.ph       %[res_oq4],        %[res_oq4],       4               \n\t"
+
+      /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
+                                   q5 * 2 + q6 + q7 * 6, 4) */
+      "shll.ph       %[tmp],            %[q7],            2               \n\t"
+      "addu.ph       %[res_oq5],        %[tmp],           %[q7]           \n\t"
+      "addu.ph       %[res_oq5],        %[res_oq5],       %[q7]           \n\t"
+      "addu.ph       %[res_oq5],        %[res_oq5],       %[q5]           \n\t"
+      "addu.ph       %[res_oq5],        %[res_oq5],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_oq5],        %[res_oq5],       %[p2]           \n\t"
+      "subu.ph       %[res_oq5],        %[res_oq5],       %[p3]           \n\t"
+      "subu.ph       %[res_oq5],        %[res_oq5],       %[p4]           \n\t"
+      "subu.ph       %[res_oq5],        %[res_oq5],       %[p5]           \n\t"
+      "subu.ph       %[res_oq5],        %[res_oq5],       %[p6]           \n\t"
+      "shrl.ph       %[res_oq5],        %[res_oq5],       4               \n\t"
+
+      /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
+                                   q4 + q5 + q6 * 2 + q7 * 7, 4) */
+      "shll.ph       %[tmp],            %[q7],            3               \n\t"
+      "subu.ph       %[res_oq6],        %[tmp],           %[q7]           \n\t"
+      "addu.ph       %[res_oq6],        %[res_oq6],       %[q6]           \n\t"
+      "addu.ph       %[res_oq6],        %[res_oq6],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_oq6],        %[res_oq6],       %[p1]           \n\t"
+      "subu.ph       %[res_oq6],        %[res_oq6],       %[p2]           \n\t"
+      "subu.ph       %[res_oq6],        %[res_oq6],       %[p3]           \n\t"
+      "subu.ph       %[res_oq6],        %[res_oq6],       %[p4]           \n\t"
+      "subu.ph       %[res_oq6],        %[res_oq6],       %[p5]           \n\t"
+      "subu.ph       %[res_oq6],        %[res_oq6],       %[p6]           \n\t"
+      "shrl.ph       %[res_oq6],        %[res_oq6],       4               \n\t"
+
+      : [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5),
+        [res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3),
+        [res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1),
+        [res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp)
+      : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4),
+        [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
+        [p1] "r" (p1), [p2] "r" (p2),
+        [p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6),
+        [add_p6toq6] "r" (add_p6toq6)
+  );
+
+  *oq0 = res_oq0;
+  *oq1 = res_oq1;
+  *oq2 = res_oq2;
+  *oq3 = res_oq3;
+  *oq4 = res_oq4;
+  *oq5 = res_oq5;
+  *oq6 = res_oq6;
+}
+#endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h b/libs/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h
new file mode 100644
index 0000000000..994ff185a2
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h
@@ -0,0 +1,478 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
+#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+#define STORE_F0() {                                                    \
+    /* Store p1_f0, p0_f0, q0_f0, q1_f0 at byte offsets -2..1 of rows   \
+     * s4, s3, s2, s1: each sb writes the low byte, each srl by 8 then  \
+     * exposes the next byte; the *_f0 variables are modified. */       \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q1_f0],    1(%[s4])           \n\t"                   \
+        "sb     %[q0_f0],    0(%[s4])           \n\t"                   \
+        "sb     %[p0_f0],   -1(%[s4])           \n\t"                   \
+        "sb     %[p1_f0],   -2(%[s4])           \n\t"                   \
+        :                                                               \
+        : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0),                     \
+          [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0),                     \
+          [s4] "r" (s4)                                                 \
+        : "memory"  /* sb writes memory not listed as an operand */     \
+    );                                                                  \
+    /* row s3: bring bits 15..8 of each value into the low byte */      \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q1_f0],   %[q1_f0],   8       \n\t"                   \
+        "srl    %[q0_f0],   %[q0_f0],   8       \n\t"                   \
+        "srl    %[p0_f0],   %[p0_f0],   8       \n\t"                   \
+        "srl    %[p1_f0],   %[p1_f0],   8       \n\t"                   \
+        : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0),                   \
+          [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0)                    \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q1_f0],    1(%[s3])           \n\t"                   \
+        "sb     %[q0_f0],    0(%[s3])           \n\t"                   \
+        "sb     %[p0_f0],   -1(%[s3])           \n\t"                   \
+        "sb     %[p1_f0],   -2(%[s3])           \n\t"                   \
+        :                                                               \
+        : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0),                     \
+          [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), [s3] "r" (s3)       \
+        : "memory"                                                      \
+    );                                                                  \
+    /* row s2: bring the next byte of each value into the low byte */   \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q1_f0],   %[q1_f0],   8       \n\t"                   \
+        "srl    %[q0_f0],   %[q0_f0],   8       \n\t"                   \
+        "srl    %[p0_f0],   %[p0_f0],   8       \n\t"                   \
+        "srl    %[p1_f0],   %[p1_f0],   8       \n\t"                   \
+        : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0),                   \
+          [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0)                    \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q1_f0],    1(%[s2])           \n\t"                   \
+        "sb     %[q0_f0],    0(%[s2])           \n\t"                   \
+        "sb     %[p0_f0],   -1(%[s2])           \n\t"                   \
+        "sb     %[p1_f0],   -2(%[s2])           \n\t"                   \
+        :                                                               \
+        : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0),                     \
+          [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0),                     \
+          [s2] "r" (s2)                                                 \
+        : "memory"                                                      \
+    );                                                                  \
+    /* row s1: bring the last byte of each value into the low byte */   \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q1_f0],   %[q1_f0],   8       \n\t"                   \
+        "srl    %[q0_f0],   %[q0_f0],   8       \n\t"                   \
+        "srl    %[p0_f0],   %[p0_f0],   8       \n\t"                   \
+        "srl    %[p1_f0],   %[p1_f0],   8       \n\t"                   \
+        : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0),                   \
+          [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0)                    \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q1_f0],    1(%[s1])           \n\t"                   \
+        "sb     %[q0_f0],    0(%[s1])           \n\t"                   \
+        "sb     %[p0_f0],   -1(%[s1])           \n\t"                   \
+        "sb     %[p1_f0],   -2(%[s1])           \n\t"                   \
+        :                                                               \
+        : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0),                     \
+          [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0),                     \
+          [s1] "r" (s1)                                                 \
+        : "memory"                                                      \
+    );                                                                  \
+}
+
+#define STORE_F1() {                                                    \
+    /* Store the low byte of p2..q2 at offsets -3..2 of each row: *_r   \
+     * low/high halfwords -> rows s4/s3, *_l low/high -> rows s2/s1. */ \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q2_r],     2(%[s4])           \n\t"                   \
+        "sb     %[q1_r],     1(%[s4])           \n\t"                   \
+        "sb     %[q0_r],     0(%[s4])           \n\t"                   \
+        "sb     %[p0_r],    -1(%[s4])           \n\t"                   \
+        "sb     %[p1_r],    -2(%[s4])           \n\t"                   \
+        "sb     %[p2_r],    -3(%[s4])           \n\t"                   \
+        :                                                               \
+        : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r),      \
+          [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r),      \
+          [s4] "r" (s4)                                                 \
+        : "memory"  /* sb writes memory not listed as an operand */     \
+    );                                                                  \
+    /* row s3: move the high halfword of each *_r value down (in place) */ \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q2_r],    %[q2_r],    16      \n\t"                   \
+        "srl    %[q1_r],    %[q1_r],    16      \n\t"                   \
+        "srl    %[q0_r],    %[q0_r],    16      \n\t"                   \
+        "srl    %[p0_r],    %[p0_r],    16      \n\t"                   \
+        "srl    %[p1_r],    %[p1_r],    16      \n\t"                   \
+        "srl    %[p2_r],    %[p2_r],    16      \n\t"                   \
+        : [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), [q0_r] "+r" (q0_r),   \
+          [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r)    \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q2_r],     2(%[s3])           \n\t"                   \
+        "sb     %[q1_r],     1(%[s3])           \n\t"                   \
+        "sb     %[q0_r],     0(%[s3])           \n\t"                   \
+        "sb     %[p0_r],    -1(%[s3])           \n\t"                   \
+        "sb     %[p1_r],    -2(%[s3])           \n\t"                   \
+        "sb     %[p2_r],    -3(%[s3])           \n\t"                   \
+        :                                                               \
+        : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r),      \
+          [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r),      \
+          [s3] "r" (s3)                                                 \
+        : "memory"                                                      \
+    );                                                                  \
+    /* rows s2 and s1 repeat the same sequence with the *_l values */   \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q2_l],     2(%[s2])           \n\t"                   \
+        "sb     %[q1_l],     1(%[s2])           \n\t"                   \
+        "sb     %[q0_l],     0(%[s2])           \n\t"                   \
+        "sb     %[p0_l],    -1(%[s2])           \n\t"                   \
+        "sb     %[p1_l],    -2(%[s2])           \n\t"                   \
+        "sb     %[p2_l],    -3(%[s2])           \n\t"                   \
+        :                                                               \
+        : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l),      \
+          [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l),      \
+          [s2] "r" (s2)                                                 \
+        : "memory"                                                      \
+    );                                                                  \
+    /* row s1: move the high halfword of each *_l value down (in place) */ \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q2_l],    %[q2_l],    16      \n\t"                   \
+        "srl    %[q1_l],    %[q1_l],    16      \n\t"                   \
+        "srl    %[q0_l],    %[q0_l],    16      \n\t"                   \
+        "srl    %[p0_l],    %[p0_l],    16      \n\t"                   \
+        "srl    %[p1_l],    %[p1_l],    16      \n\t"                   \
+        "srl    %[p2_l],    %[p2_l],    16      \n\t"                   \
+        : [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), [q0_l] "+r" (q0_l),   \
+          [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l)    \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q2_l],     2(%[s1])           \n\t"                   \
+        "sb     %[q1_l],     1(%[s1])           \n\t"                   \
+        "sb     %[q0_l],     0(%[s1])           \n\t"                   \
+        "sb     %[p0_l],    -1(%[s1])           \n\t"                   \
+        "sb     %[p1_l],    -2(%[s1])           \n\t"                   \
+        "sb     %[p2_l],    -3(%[s1])           \n\t"                   \
+        :                                                               \
+        : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l),      \
+          [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l),      \
+          [s1] "r" (s1)                                                 \
+        : "memory"                                                      \
+    );                                                                  \
+}
+
+#define STORE_F2() {                                                    \
+    /* Store the low byte of p6..q6 at offsets -7..6 of each row: *_r   \
+     * low/high halfwords -> rows s4/s3, *_l low/high -> rows s2/s1. */ \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q6_r],     6(%[s4])           \n\t"                   \
+        "sb     %[q5_r],     5(%[s4])           \n\t"                   \
+        "sb     %[q4_r],     4(%[s4])           \n\t"                   \
+        "sb     %[q3_r],     3(%[s4])           \n\t"                   \
+        "sb     %[q2_r],     2(%[s4])           \n\t"                   \
+        "sb     %[q1_r],     1(%[s4])           \n\t"                   \
+        "sb     %[q0_r],     0(%[s4])           \n\t"                   \
+        "sb     %[p0_r],    -1(%[s4])           \n\t"                   \
+        "sb     %[p1_r],    -2(%[s4])           \n\t"                   \
+        "sb     %[p2_r],    -3(%[s4])           \n\t"                   \
+        "sb     %[p3_r],    -4(%[s4])           \n\t"                   \
+        "sb     %[p4_r],    -5(%[s4])           \n\t"                   \
+        "sb     %[p5_r],    -6(%[s4])           \n\t"                   \
+        "sb     %[p6_r],    -7(%[s4])           \n\t"                   \
+        :                                                               \
+        : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r),      \
+          [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r),      \
+          [q0_r] "r" (q0_r),                                            \
+          [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r),      \
+          [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r),      \
+          [p6_r] "r" (p6_r),                                            \
+          [s4] "r" (s4)                                                 \
+        : "memory"  /* sb writes memory not listed as an operand */     \
+    );                                                                  \
+    /* row s3: move the high halfword of each *_r value down (in place) */ \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q6_r],    %[q6_r],    16      \n\t"                   \
+        "srl    %[q5_r],    %[q5_r],    16      \n\t"                   \
+        "srl    %[q4_r],    %[q4_r],    16      \n\t"                   \
+        "srl    %[q3_r],    %[q3_r],    16      \n\t"                   \
+        "srl    %[q2_r],    %[q2_r],    16      \n\t"                   \
+        "srl    %[q1_r],    %[q1_r],    16      \n\t"                   \
+        "srl    %[q0_r],    %[q0_r],    16      \n\t"                   \
+        "srl    %[p0_r],    %[p0_r],    16      \n\t"                   \
+        "srl    %[p1_r],    %[p1_r],    16      \n\t"                   \
+        "srl    %[p2_r],    %[p2_r],    16      \n\t"                   \
+        "srl    %[p3_r],    %[p3_r],    16      \n\t"                   \
+        "srl    %[p4_r],    %[p4_r],    16      \n\t"                   \
+        "srl    %[p5_r],    %[p5_r],    16      \n\t"                   \
+        "srl    %[p6_r],    %[p6_r],    16      \n\t"                   \
+        : [q6_r] "+r" (q6_r), [q5_r] "+r" (q5_r), [q4_r] "+r" (q4_r),   \
+          [q3_r] "+r" (q3_r), [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r),   \
+          [q0_r] "+r" (q0_r),                                           \
+          [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r),   \
+          [p3_r] "+r" (p3_r), [p4_r] "+r" (p4_r), [p5_r] "+r" (p5_r),   \
+          [p6_r] "+r" (p6_r)                                            \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q6_r],     6(%[s3])           \n\t"                   \
+        "sb     %[q5_r],     5(%[s3])           \n\t"                   \
+        "sb     %[q4_r],     4(%[s3])           \n\t"                   \
+        "sb     %[q3_r],     3(%[s3])           \n\t"                   \
+        "sb     %[q2_r],     2(%[s3])           \n\t"                   \
+        "sb     %[q1_r],     1(%[s3])           \n\t"                   \
+        "sb     %[q0_r],     0(%[s3])           \n\t"                   \
+        "sb     %[p0_r],    -1(%[s3])           \n\t"                   \
+        "sb     %[p1_r],    -2(%[s3])           \n\t"                   \
+        "sb     %[p2_r],    -3(%[s3])           \n\t"                   \
+        "sb     %[p3_r],    -4(%[s3])           \n\t"                   \
+        "sb     %[p4_r],    -5(%[s3])           \n\t"                   \
+        "sb     %[p5_r],    -6(%[s3])           \n\t"                   \
+        "sb     %[p6_r],    -7(%[s3])           \n\t"                   \
+        :                                                               \
+        : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r),      \
+          [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r),      \
+          [q0_r] "r" (q0_r),                                            \
+          [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r),      \
+          [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r),      \
+          [p6_r] "r" (p6_r),                                            \
+          [s3] "r" (s3)                                                 \
+        : "memory"                                                      \
+    );                                                                  \
+    /* rows s2 and s1 repeat the same sequence with the *_l values */   \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q6_l],     6(%[s2])           \n\t"                   \
+        "sb     %[q5_l],     5(%[s2])           \n\t"                   \
+        "sb     %[q4_l],     4(%[s2])           \n\t"                   \
+        "sb     %[q3_l],     3(%[s2])           \n\t"                   \
+        "sb     %[q2_l],     2(%[s2])           \n\t"                   \
+        "sb     %[q1_l],     1(%[s2])           \n\t"                   \
+        "sb     %[q0_l],     0(%[s2])           \n\t"                   \
+        "sb     %[p0_l],    -1(%[s2])           \n\t"                   \
+        "sb     %[p1_l],    -2(%[s2])           \n\t"                   \
+        "sb     %[p2_l],    -3(%[s2])           \n\t"                   \
+        "sb     %[p3_l],    -4(%[s2])           \n\t"                   \
+        "sb     %[p4_l],    -5(%[s2])           \n\t"                   \
+        "sb     %[p5_l],    -6(%[s2])           \n\t"                   \
+        "sb     %[p6_l],    -7(%[s2])           \n\t"                   \
+        :                                                               \
+        : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l),      \
+          [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l),      \
+          [q0_l] "r" (q0_l),                                            \
+          [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l),      \
+          [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l),      \
+          [p6_l] "r" (p6_l),                                            \
+          [s2] "r" (s2)                                                 \
+        : "memory"                                                      \
+    );                                                                  \
+    /* row s1: move the high halfword of each *_l value down (in place) */ \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q6_l],    %[q6_l],    16     \n\t"                    \
+        "srl    %[q5_l],    %[q5_l],    16     \n\t"                    \
+        "srl    %[q4_l],    %[q4_l],    16     \n\t"                    \
+        "srl    %[q3_l],    %[q3_l],    16     \n\t"                    \
+        "srl    %[q2_l],    %[q2_l],    16     \n\t"                    \
+        "srl    %[q1_l],    %[q1_l],    16     \n\t"                    \
+        "srl    %[q0_l],    %[q0_l],    16     \n\t"                    \
+        "srl    %[p0_l],    %[p0_l],    16     \n\t"                    \
+        "srl    %[p1_l],    %[p1_l],    16     \n\t"                    \
+        "srl    %[p2_l],    %[p2_l],    16     \n\t"                    \
+        "srl    %[p3_l],    %[p3_l],    16     \n\t"                    \
+        "srl    %[p4_l],    %[p4_l],    16     \n\t"                    \
+        "srl    %[p5_l],    %[p5_l],    16     \n\t"                    \
+        "srl    %[p6_l],    %[p6_l],    16     \n\t"                    \
+        : [q6_l] "+r" (q6_l), [q5_l] "+r" (q5_l), [q4_l] "+r" (q4_l),   \
+          [q3_l] "+r" (q3_l), [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l),   \
+          [q0_l] "+r" (q0_l),                                           \
+          [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l),   \
+          [p3_l] "+r" (p3_l), [p4_l] "+r" (p4_l), [p5_l] "+r" (p5_l),   \
+          [p6_l] "+r" (p6_l)                                            \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q6_l],     6(%[s1])           \n\t"                   \
+        "sb     %[q5_l],     5(%[s1])           \n\t"                   \
+        "sb     %[q4_l],     4(%[s1])           \n\t"                   \
+        "sb     %[q3_l],     3(%[s1])           \n\t"                   \
+        "sb     %[q2_l],     2(%[s1])           \n\t"                   \
+        "sb     %[q1_l],     1(%[s1])           \n\t"                   \
+        "sb     %[q0_l],     0(%[s1])           \n\t"                   \
+        "sb     %[p0_l],    -1(%[s1])           \n\t"                   \
+        "sb     %[p1_l],    -2(%[s1])           \n\t"                   \
+        "sb     %[p2_l],    -3(%[s1])           \n\t"                   \
+        "sb     %[p3_l],    -4(%[s1])           \n\t"                   \
+        "sb     %[p4_l],    -5(%[s1])           \n\t"                   \
+        "sb     %[p5_l],    -6(%[s1])           \n\t"                   \
+        "sb     %[p6_l],    -7(%[s1])           \n\t"                   \
+        :                                                               \
+        : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l),      \
+          [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l),      \
+          [q0_l] "r" (q0_l),                                            \
+          [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l),      \
+          [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l),      \
+          [p6_l] "r" (p6_l),                                            \
+          [s1] "r" (s1)                                                 \
+        : "memory"                                                      \
+    );                                                                  \
+}
+
+#define PACK_LEFT_0TO3() {  /* widen the 2 left bytes of p3..q3 */      \
+    __asm__ __volatile__ (                                              \
+        "preceu.ph.qbl   %[p3_l],   %[p3]   \n\t"                       \
+        "preceu.ph.qbl   %[p2_l],   %[p2]   \n\t"                       \
+        "preceu.ph.qbl   %[p1_l],   %[p1]   \n\t"                       \
+        "preceu.ph.qbl   %[p0_l],   %[p0]   \n\t"                       \
+        "preceu.ph.qbl   %[q0_l],   %[q0]   \n\t"                       \
+        "preceu.ph.qbl   %[q1_l],   %[q1]   \n\t"                       \
+        "preceu.ph.qbl   %[q2_l],   %[q2]   \n\t"                       \
+        "preceu.ph.qbl   %[q3_l],   %[q3]   \n\t"                       \
+        /* each left byte is zero-extended into one halfword of *_l */  \
+        : [p3_l] "=&r" (p3_l), [p2_l] "=&r" (p2_l),                     \
+          [p1_l] "=&r" (p1_l), [p0_l] "=&r" (p0_l),                     \
+          [q0_l] "=&r" (q0_l), [q1_l] "=&r" (q1_l),                     \
+          [q2_l] "=&r" (q2_l), [q3_l] "=&r" (q3_l)                      \
+        : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),   \
+          [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3)    \
+    );                                                                  \
+}
+
+#define PACK_LEFT_4TO7() {  /* operates on p7..p4 and q4..q7 */         \
+    __asm__ __volatile__ (                                              \
+        "preceu.ph.qbl   %[p7_l],   %[p7]   \n\t"                       \
+        "preceu.ph.qbl   %[p6_l],   %[p6]   \n\t"                       \
+        "preceu.ph.qbl   %[p5_l],   %[p5]   \n\t"                       \
+        "preceu.ph.qbl   %[p4_l],   %[p4]   \n\t"                       \
+        "preceu.ph.qbl   %[q4_l],   %[q4]   \n\t"                       \
+        "preceu.ph.qbl   %[q5_l],   %[q5]   \n\t"                       \
+        "preceu.ph.qbl   %[q6_l],   %[q6]   \n\t"                       \
+        "preceu.ph.qbl   %[q7_l],   %[q7]   \n\t"                       \
+        /* x_l = two left (high) bytes of x, each widened to 16 bits */ \
+        : [p7_l] "=&r" (p7_l), [p6_l] "=&r" (p6_l),                     \
+          [p5_l] "=&r" (p5_l), [p4_l] "=&r" (p4_l),                     \
+          [q4_l] "=&r" (q4_l), [q5_l] "=&r" (q5_l),                     \
+          [q6_l] "=&r" (q6_l), [q7_l] "=&r" (q7_l)                      \
+        : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),   \
+          [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7)    \
+    );                                                                  \
+}
+
+#define PACK_RIGHT_0TO3() {  /* operates on p3..p0 and q0..q3 */        \
+    __asm__ __volatile__ (                                              \
+        "preceu.ph.qbr   %[p3_r],   %[p3]  \n\t"                        \
+        "preceu.ph.qbr   %[p2_r],   %[p2]   \n\t"                       \
+        "preceu.ph.qbr   %[p1_r],   %[p1]   \n\t"                       \
+        "preceu.ph.qbr   %[p0_r],   %[p0]   \n\t"                       \
+        "preceu.ph.qbr   %[q0_r],   %[q0]   \n\t"                       \
+        "preceu.ph.qbr   %[q1_r],   %[q1]   \n\t"                       \
+        "preceu.ph.qbr   %[q2_r],   %[q2]   \n\t"                       \
+        "preceu.ph.qbr   %[q3_r],   %[q3]   \n\t"                       \
+        /* x_r = two right (low) bytes of x, each widened to 16 bits */ \
+        : [p3_r] "=&r" (p3_r), [p2_r] "=&r" (p2_r),                     \
+          [p1_r] "=&r" (p1_r), [p0_r] "=&r" (p0_r),                     \
+          [q0_r] "=&r" (q0_r), [q1_r] "=&r" (q1_r),                     \
+          [q2_r] "=&r" (q2_r), [q3_r] "=&r" (q3_r)                      \
+        : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),   \
+          [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3)    \
+    );                                                                  \
+}
+
+#define PACK_RIGHT_4TO7() {  /* operates on p7..p4 and q4..q7 */        \
+    __asm__ __volatile__ (                                              \
+        "preceu.ph.qbr   %[p7_r],   %[p7]   \n\t"                       \
+        "preceu.ph.qbr   %[p6_r],   %[p6]   \n\t"                       \
+        "preceu.ph.qbr   %[p5_r],   %[p5]   \n\t"                       \
+        "preceu.ph.qbr   %[p4_r],   %[p4]   \n\t"                       \
+        "preceu.ph.qbr   %[q4_r],   %[q4]   \n\t"                       \
+        "preceu.ph.qbr   %[q5_r],   %[q5]   \n\t"                       \
+        "preceu.ph.qbr   %[q6_r],   %[q6]   \n\t"                       \
+        "preceu.ph.qbr   %[q7_r],   %[q7]   \n\t"                       \
+        /* x_r = two right (low) bytes of x, each widened to 16 bits */ \
+        : [p7_r] "=&r" (p7_r), [p6_r] "=&r" (p6_r),                     \
+          [p5_r] "=&r" (p5_r), [p4_r] "=&r" (p4_r),                     \
+          [q4_r] "=&r" (q4_r), [q5_r] "=&r" (q5_r),                     \
+          [q6_r] "=&r" (q6_r), [q7_r] "=&r" (q7_r)                      \
+        : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),   \
+          [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7)    \
+    );                                                                  \
+}
+
+#define COMBINE_LEFT_RIGHT_0TO2() {  /* operates on p2..p0, q0..q2 */   \
+    __asm__ __volatile__ (                                              \
+        "precr.qb.ph    %[p2],  %[p2_l],    %[p2_r]    \n\t"            \
+        "precr.qb.ph    %[p1],  %[p1_l],    %[p1_r]    \n\t"            \
+        "precr.qb.ph    %[p0],  %[p0_l],    %[p0_r]    \n\t"            \
+        "precr.qb.ph    %[q0],  %[q0_l],    %[q0_r]    \n\t"            \
+        "precr.qb.ph    %[q1],  %[q1_l],    %[q1_r]    \n\t"            \
+        "precr.qb.ph    %[q2],  %[q2_l],    %[q2_r]    \n\t"            \
+        /* x = {x_l, x_r} repacked, low byte of each 16-bit lane */     \
+        : [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),            \
+          [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2)             \
+        : [p2_l] "r" (p2_l), [p2_r] "r" (p2_r),                         \
+          [p1_l] "r" (p1_l), [p1_r] "r" (p1_r),                         \
+          [p0_l] "r" (p0_l), [p0_r] "r" (p0_r),                         \
+          [q0_l] "r" (q0_l), [q0_r] "r" (q0_r),                         \
+          [q1_l] "r" (q1_l), [q1_r] "r" (q1_r),                         \
+          [q2_l] "r" (q2_l), [q2_r] "r" (q2_r)                          \
+    );                                                                  \
+}
+
+#define COMBINE_LEFT_RIGHT_3TO6() {  /* operates on p6..p3, q3..q6 */   \
+    __asm__ __volatile__ (                                              \
+        "precr.qb.ph    %[p6],  %[p6_l],    %[p6_r]    \n\t"            \
+        "precr.qb.ph    %[p5],  %[p5_l],    %[p5_r]    \n\t"            \
+        "precr.qb.ph    %[p4],  %[p4_l],    %[p4_r]    \n\t"            \
+        "precr.qb.ph    %[p3],  %[p3_l],    %[p3_r]    \n\t"            \
+        "precr.qb.ph    %[q3],  %[q3_l],    %[q3_r]    \n\t"            \
+        "precr.qb.ph    %[q4],  %[q4_l],    %[q4_r]    \n\t"            \
+        "precr.qb.ph    %[q5],  %[q5_l],    %[q5_r]    \n\t"            \
+        "precr.qb.ph    %[q6],  %[q6_l],    %[q6_r]    \n\t"            \
+        /* x = {x_l, x_r} repacked, low byte of each 16-bit lane */     \
+        : [p6] "=&r" (p6),[p5] "=&r" (p5),                              \
+          [p4] "=&r" (p4),[p3] "=&r" (p3),                              \
+          [q3] "=&r" (q3),[q4] "=&r" (q4),                              \
+          [q5] "=&r" (q5),[q6] "=&r" (q6)                               \
+        : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l),                         \
+          [p4_l] "r" (p4_l), [p3_l] "r" (p3_l),                         \
+          [p6_r] "r" (p6_r), [p5_r] "r" (p5_r),                         \
+          [p4_r] "r" (p4_r), [p3_r] "r" (p3_r),                         \
+          [q3_l] "r" (q3_l), [q4_l] "r" (q4_l),                         \
+          [q5_l] "r" (q5_l), [q6_l] "r" (q6_l),                         \
+          [q3_r] "r" (q3_r), [q4_r] "r" (q4_r),                         \
+          [q5_r] "r" (q5_r), [q6_r] "r" (q6_r)                          \
+    );                                                                  \
+}
+
+#endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h b/libs/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h
new file mode 100644
index 0000000000..2c964afaa7
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h
@@ -0,0 +1,373 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
+#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+/* Processes 4 pixels at once, one per byte of each uint32_t argument.
+ * *hev: 0xFF in hev lanes. *mask: 0x00 in lanes exceeding a limit, else 0xFF */
+static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
+                                         uint32_t p1, uint32_t p0,
+                                         uint32_t p3, uint32_t p2,
+                                         uint32_t q0, uint32_t q1,
+                                         uint32_t q2, uint32_t q3,
+                                         uint32_t thresh, uint32_t *hev,
+                                         uint32_t *mask) {
+  uint32_t  c, r, r3, r_k;
+  uint32_t  s1, s2, s3;
+  uint32_t  ones = 0xFFFFFFFF;
+  uint32_t  hev1;
+  /* Stage 1: r collects "over limit" bits, r3 "over thresh" (hev) bits. */
+  __asm__ __volatile__ (
+      /* mask |= (abs(p3 - p2) > limit) */
+      "subu_s.qb      %[c],   %[p3],     %[p2]        \n\t"
+      "subu_s.qb      %[r_k], %[p2],     %[p3]        \n\t"
+      "or             %[r_k], %[r_k],    %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+      "or             %[r],   $0,        %[c]         \n\t"
+
+      /* mask |= (abs(p2 - p1) > limit) */
+      "subu_s.qb      %[c],   %[p2],     %[p1]        \n\t"
+      "subu_s.qb      %[r_k], %[p1],     %[p2]        \n\t"
+      "or             %[r_k], %[r_k],    %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+      "or             %[r],   %[r],      %[c]         \n\t"
+
+      /* mask |= (abs(p1 - p0) > limit)
+       * hev  |= (abs(p1 - p0) > thresh)
+       */
+      "subu_s.qb      %[c],   %[p1],     %[p0]        \n\t"
+      "subu_s.qb      %[r_k], %[p0],     %[p1]        \n\t"
+      "or             %[r_k], %[r_k],    %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
+      "or             %[r3],  $0,        %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+      "or             %[r],   %[r],      %[c]         \n\t"
+
+      /* mask |= (abs(q1 - q0) > limit)
+       * hev  |= (abs(q1 - q0) > thresh)
+       */
+      "subu_s.qb      %[c],   %[q1],     %[q0]        \n\t"
+      "subu_s.qb      %[r_k], %[q0],     %[q1]        \n\t"
+      "or             %[r_k], %[r_k],    %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
+      "or             %[r3],  %[r3],     %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+      "or             %[r],   %[r],      %[c]         \n\t"
+
+      /* mask |= (abs(q2 - q1) > limit) */
+      "subu_s.qb      %[c],   %[q2],     %[q1]        \n\t"
+      "subu_s.qb      %[r_k], %[q1],     %[q2]        \n\t"
+      "or             %[r_k], %[r_k],    %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+      "or             %[r],   %[r],      %[c]         \n\t"
+      "sll            %[r3],    %[r3],    24          \n\t"
+
+      /* mask |= (abs(q3 - q2) > limit) */
+      "subu_s.qb      %[c],   %[q3],     %[q2]        \n\t"
+      "subu_s.qb      %[r_k], %[q2],     %[q3]        \n\t"
+      "or             %[r_k], %[r_k],    %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+      "or             %[r],   %[r],      %[c]         \n\t"
+
+      : [c] "=&r" (c), [r_k] "=&r" (r_k),
+        [r] "=&r" (r), [r3] "=&r" (r3)
+      : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
+        [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
+        [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
+  );
+  /* Stage 2: add the flimit test, then expand the bits to byte masks. */
+  __asm__ __volatile__ (
+      /* abs(p0 - q0) */
+      "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
+      "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
+      "wrdsp          %[r3]                           \n\t"
+      "or             %[s1],  %[r_k],    %[c]         \n\t"
+
+      /* abs(p1 - q1) */
+      "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
+      "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
+      "pick.qb        %[hev1], %[ones],  $0           \n\t"
+      "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
+      "or             %[s2],   %[r_k],   %[c]         \n\t"
+
+      /* mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit) */
+      "shrl.qb        %[s2],   %[s2],     1           \n\t"
+      "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
+      "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
+      "or             %[r],    %[r],      %[c]        \n\t"
+      "sll            %[r],    %[r],      24          \n\t"
+      /* wrdsp loads the bits as pick.qb conditions; set bit => 0x00 lane */
+      "wrdsp          %[r]                            \n\t"
+      "pick.qb        %[s2],  $0,         %[ones]     \n\t"
+
+      : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
+        [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
+      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
+        [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
+  );
+
+  *hev = hev1;
+  *mask = s2;
+}
+
+static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,
+                                                   uint32_t flimit,
+                                                   uint32_t thresh,
+                                                   uint32_t p1, uint32_t p0,
+                                                   uint32_t p3, uint32_t p2,
+                                                   uint32_t q0, uint32_t q1,
+                                                   uint32_t q2, uint32_t q3,
+                                                   uint32_t *hev,
+                                                   uint32_t *mask,
+                                                   uint32_t *flat) {
+  uint32_t  c, r, r3, r_k, r_flat;
+  uint32_t  s1, s2, s3;
+  uint32_t  ones = 0xFFFFFFFF;
+  uint32_t  flat_thresh = 0x01010101;  /* 1 in every byte lane */
+  uint32_t  hev1;
+  uint32_t  flat1;
+  /* Byte-mask outputs: *hev 0xFF if set; *mask, *flat 0x00 if a test fails. */
+  __asm__ __volatile__ (
+      /* mask |= (abs(p3 - p2) > limit) */
+      "subu_s.qb      %[c],       %[p3],          %[p2]        \n\t"
+      "subu_s.qb      %[r_k],     %[p2],          %[p3]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
+      "or             %[r],       $0,             %[c]         \n\t"
+
+      /* mask |= (abs(p2 - p1) > limit) */
+      "subu_s.qb      %[c],       %[p2],          %[p1]        \n\t"
+      "subu_s.qb      %[r_k],     %[p1],          %[p2]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
+      "or             %[r],       %[r],           %[c]         \n\t"
+
+      /* mask |= (abs(p1 - p0) > limit)
+       * hev  |= (abs(p1 - p0) > thresh)
+       * flat |= (abs(p1 - p0) > flat_thresh)
+       */
+      "subu_s.qb      %[c],       %[p1],          %[p0]        \n\t"
+      "subu_s.qb      %[r_k],     %[p0],          %[p1]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[thresh],      %[r_k]       \n\t"
+      "or             %[r3],      $0,             %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
+      "or             %[r],       %[r],           %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  $0,             %[c]         \n\t"
+
+      /* mask |= (abs(q1 - q0) > limit)
+       * hev  |= (abs(q1 - q0) > thresh)
+       * flat |= (abs(q1 - q0) > flat_thresh)
+       */
+      "subu_s.qb      %[c],       %[q1],          %[q0]        \n\t"
+      "subu_s.qb      %[r_k],     %[q0],          %[q1]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[thresh],      %[r_k]       \n\t"
+      "or             %[r3],      %[r3],          %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
+      "or             %[r],       %[r],           %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(p0 - p2) > flat_thresh) */
+      "subu_s.qb      %[c],       %[p0],          %[p2]        \n\t"
+      "subu_s.qb      %[r_k],     %[p2],          %[p0]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(q0 - q2) > flat_thresh) */
+      "subu_s.qb      %[c],       %[q0],          %[q2]        \n\t"
+      "subu_s.qb      %[r_k],     %[q2],          %[q0]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(p3 - p0) > flat_thresh) */
+      "subu_s.qb      %[c],       %[p3],          %[p0]        \n\t"
+      "subu_s.qb      %[r_k],     %[p0],          %[p3]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(q3 - q0) > flat_thresh) */
+      "subu_s.qb      %[c],       %[q3],          %[q0]        \n\t"
+      "subu_s.qb      %[r_k],     %[q0],          %[q3]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+      "sll            %[r_flat],  %[r_flat],      24           \n\t"
+      /* look at stall here */
+      "wrdsp          %[r_flat]                                \n\t"
+      "pick.qb        %[flat1],   $0,             %[ones]      \n\t"
+
+      /* mask |= (abs(q2 - q1) > limit) */
+      "subu_s.qb      %[c],       %[q2],          %[q1]        \n\t"
+      "subu_s.qb      %[r_k],     %[q1],          %[q2]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
+      "or             %[r],       %[r],           %[c]         \n\t"
+      "sll            %[r3],      %[r3],          24           \n\t"
+
+      /* mask |= (abs(q3 - q2) > limit) */
+      "subu_s.qb      %[c],       %[q3],          %[q2]        \n\t"
+      "subu_s.qb      %[r_k],     %[q2],          %[q3]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
+      "or             %[r],       %[r],           %[c]         \n\t"
+
+      : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), [r3] "=&r" (r3),
+        [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1)
+      : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
+        [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
+        [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh),
+        [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
+  );
+  /* Stage 2: add the flimit test, then expand hev/mask bits to byte masks. */
+  __asm__ __volatile__ (
+      /* abs(p0 - q0) */
+      "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
+      "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
+      "wrdsp          %[r3]                           \n\t"
+      "or             %[s1],  %[r_k],    %[c]         \n\t"
+
+      /* abs(p1 - q1) */
+      "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
+      "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
+      "pick.qb        %[hev1], %[ones],  $0           \n\t"
+      "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
+      "or             %[s2],   %[r_k],   %[c]         \n\t"
+
+      /* mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit) */
+      "shrl.qb        %[s2],   %[s2],     1           \n\t"
+      "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
+      "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
+      "or             %[r],    %[r],      %[c]        \n\t"
+      "sll            %[r],    %[r],      24          \n\t"
+      /* wrdsp loads the bits as pick.qb conditions; set bit => 0x00 lane */
+      "wrdsp          %[r]                            \n\t"
+      "pick.qb        %[s2],   $0,        %[ones]     \n\t"
+
+      : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
+        [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
+      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
+        [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
+  );
+
+  *hev = hev1;
+  *mask = s2;
+  *flat = flat1;
+}
+
+static INLINE void flatmask5(uint32_t p4, uint32_t p3,
+                             uint32_t p2, uint32_t p1,
+                             uint32_t p0, uint32_t q0,
+                             uint32_t q1, uint32_t q2,
+                             uint32_t q3, uint32_t q4,
+                             uint32_t *flat2) {
+  uint32_t  c, r, r_k, r_flat;
+  uint32_t  ones = 0xFFFFFFFF;
+  uint32_t  flat_thresh = 0x01010101;  /* 1 in every byte lane */
+  uint32_t  flat1, flat3;
+  /* *flat2 lane = 0xFF iff p1..p4 are within 1 of p0 and q1..q4 of q0. */
+  __asm__ __volatile__ (
+      /* flat |= (abs(p4 - p0) > flat_thresh) */
+      "subu_s.qb      %[c],   %[p4],           %[p0]        \n\t"
+      "subu_s.qb      %[r_k], %[p0],           %[p4]        \n\t"
+      "or             %[r_k], %[r_k],          %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[flat_thresh],  %[r_k]       \n\t"
+      "or             %[r],   $0,              %[c]         \n\t"
+
+      /* flat |= (abs(q4 - q0) > flat_thresh) */
+      "subu_s.qb      %[c],     %[q4],           %[q0]     \n\t"
+      "subu_s.qb      %[r_k],   %[q0],           %[q4]     \n\t"
+      "or             %[r_k],   %[r_k],          %[c]      \n\t"
+      "cmpgu.lt.qb    %[c],     %[flat_thresh],  %[r_k]    \n\t"
+      "or             %[r],     %[r],            %[c]      \n\t"
+      "sll            %[r],     %[r],            24        \n\t"
+      "wrdsp          %[r]                                 \n\t"
+      "pick.qb        %[flat3], $0,           %[ones]      \n\t"
+
+      /* flat |= (abs(p1 - p0) > flat_thresh) */
+      "subu_s.qb      %[c],       %[p1],          %[p0]        \n\t"
+      "subu_s.qb      %[r_k],     %[p0],          %[p1]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  $0,             %[c]         \n\t"
+
+      /* flat |= (abs(q1 - q0) > flat_thresh) */
+      "subu_s.qb      %[c],      %[q1],           %[q0]        \n\t"
+      "subu_s.qb      %[r_k],    %[q0],           %[q1]        \n\t"
+      "or             %[r_k],    %[r_k],          %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],      %[flat_thresh],  %[r_k]       \n\t"
+      "or             %[r_flat], %[r_flat],       %[c]         \n\t"
+
+      /* flat |= (abs(p0 - p2) > flat_thresh) */
+      "subu_s.qb      %[c],       %[p0],          %[p2]        \n\t"
+      "subu_s.qb      %[r_k],     %[p2],          %[p0]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(q0 - q2) > flat_thresh) */
+      "subu_s.qb      %[c],       %[q0],          %[q2]        \n\t"
+      "subu_s.qb      %[r_k],     %[q2],          %[q0]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(p3 - p0) > flat_thresh) */
+      "subu_s.qb      %[c],       %[p3],          %[p0]        \n\t"
+      "subu_s.qb      %[r_k],     %[p0],          %[p3]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(q3 - q0) > flat_thresh) */
+      "subu_s.qb      %[c],       %[q3],          %[q0]        \n\t"
+      "subu_s.qb      %[r_k],     %[q0],          %[q3]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+      "sll            %[r_flat],  %[r_flat],      24           \n\t"
+      "wrdsp          %[r_flat]                                \n\t"
+      "pick.qb        %[flat1],   $0,             %[ones]      \n\t"
+      /* combine the p4/q4 mask (flat3) with the p1..p3/q1..q3 mask */
+      "and            %[flat1],  %[flat3],        %[flat1]     \n\t"
+
+      : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r),
+        [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3)
+      : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2),
+        [p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1),
+        [q2] "r" (q2), [q3] "r" (q3), [q4] "r" (q4),
+        [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
+  );
+
+  *flat2 = flat1;
+}
+#endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c b/libs/libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c
new file mode 100644
index 0000000000..4138f56978
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c
@@ -0,0 +1,651 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
+void vpx_lpf_horizontal_8_dspr2(unsigned char *s,  /* start of row q0 */
+                                int pitch,  /* byte step between rows */
+                                const uint8_t *blimit,  /* only [0] is read */
+                                const uint8_t *limit,   /* only [0] is read */
+                                const uint8_t *thresh,  /* only [0] is read */
+                                int count) {  /* count is never read */
+  uint32_t  mask;
+  uint32_t  hev, flat;
+  uint8_t   i;
+  uint8_t   *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
+  uint32_t  thresh_vec, flimit_vec, limit_vec;
+  uint32_t  uflimit, ulimit, uthresh;
+  uint32_t  p1_f0, p0_f0, q0_f0, q1_f0;
+  uint32_t  p3, p2, p1, p0, q0, q1, q2, q3;
+  uint32_t  p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+  uint32_t  p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+  /* Filters the edge between rows (s - pitch) and s: 2 passes x 4 columns. */
+  uflimit = *blimit;
+  ulimit  = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte: copy each scalar into all four byte lanes of a word */
+  __asm__ __volatile__ (
+      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
+      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
+      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"
+
+      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+        [limit_vec] "=r" (limit_vec)
+      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+  );
+
+  /* prefetch data for store */
+  prefetch_store(s);
+  /* each pass covers 4 adjacent columns; s advances by 4 at the bottom */
+  for (i = 0; i < 2; i++) {
+    sp3 = s - (pitch << 2);  /* four rows above s */
+    sp2 = sp3 + pitch;
+    sp1 = sp2 + pitch;
+    sp0 = sp1 + pitch;
+    sq0 = s;
+    sq1 = s + pitch;
+    sq2 = sq1 + pitch;
+    sq3 = sq2 + pitch;
+    /* read one 32-bit word (4 columns) from each of the 8 rows */
+    __asm__ __volatile__ (
+        "lw     %[p3],      (%[sp3])    \n\t"
+        "lw     %[p2],      (%[sp2])    \n\t"
+        "lw     %[p1],      (%[sp1])    \n\t"
+        "lw     %[p0],      (%[sp0])    \n\t"
+        "lw     %[q0],      (%[sq0])    \n\t"
+        "lw     %[q1],      (%[sq1])    \n\t"
+        "lw     %[q2],      (%[sq2])    \n\t"
+        "lw     %[q3],      (%[sq3])    \n\t"
+
+        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
+          [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0)
+        : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+          [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
+    );
+    /* helper (not defined in this file) fills in hev, mask and flat */
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
+    /* mask == 0 matches none of the branches below: nothing is written */
+    if ((flat == 0) && (mask != 0)) {
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      /* whole-word stores to the four rows p1, p0, q0, q1 */
+      __asm__ __volatile__ (
+          "sw       %[p1_f0],   (%[sp1])    \n\t"
+          "sw       %[p0_f0],   (%[sp0])    \n\t"
+          "sw       %[q0_f0],   (%[sq0])    \n\t"
+          "sw       %[q1_f0],   (%[sq1])    \n\t"
+
+          :
+          : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+            [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+            [sp1] "r" (sp1), [sp0] "r" (sp0),
+            [sq0] "r" (sq0), [sq1] "r" (sq1)
+      );
+    } else if ((mask & flat) == 0xFFFFFFFF) {
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
+
+      COMBINE_LEFT_RIGHT_0TO2()
+      /* whole-word stores to the six rows p2 .. q2 */
+      __asm__ __volatile__ (
+          "sw       %[p2],      (%[sp2])    \n\t"
+          "sw       %[p1],      (%[sp1])    \n\t"
+          "sw       %[p0],      (%[sp0])    \n\t"
+          "sw       %[q0],      (%[sq0])    \n\t"
+          "sw       %[q1],      (%[sq1])    \n\t"
+          "sw       %[q2],      (%[sq2])    \n\t"
+
+          :
+          : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+            [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
+            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+            [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+      );
+    } else if ((flat != 0) && (mask != 0)) {
+      /* mixed case: compute both filter results, then pick per byte lane */
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
+      /* lane 0 (bits 7:0): 6-row result if mask & flat, else 4-row if mask */
+      if (mask & flat & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb     %[p2_r],    (%[sp2])    \n\t"
+            "sb     %[p1_r],    (%[sp1])    \n\t"
+            "sb     %[p0_r],    (%[sp0])    \n\t"
+            "sb     %[q0_r],    (%[sq0])    \n\t"
+            "sb     %[q1_r],    (%[sq1])    \n\t"
+            "sb     %[q2_r],    (%[sq2])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  (%[sp1])    \n\t"
+            "sb         %[p0_f0],  (%[sp0])    \n\t"
+            "sb         %[q0_f0],  (%[sq0])    \n\t"
+            "sb         %[q1_f0],  (%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+      /* shift so the next sb (low byte) sees the following lane's value */
+      __asm__ __volatile__ (
+          "srl      %[p2_r],    %[p2_r],    16      \n\t"
+          "srl      %[p1_r],    %[p1_r],    16      \n\t"
+          "srl      %[p0_r],    %[p0_r],    16      \n\t"
+          "srl      %[q0_r],    %[q0_r],    16      \n\t"
+          "srl      %[q1_r],    %[q1_r],    16      \n\t"
+          "srl      %[q2_r],    %[q2_r],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+      /* lane 1 (mask bits 15:8), written at column offset +1 */
+      if (mask & flat & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p2_r],    +1(%[sp2])    \n\t"
+            "sb     %[p1_r],    +1(%[sp1])    \n\t"
+            "sb     %[p0_r],    +1(%[sp0])    \n\t"
+            "sb     %[q0_r],    +1(%[sq0])    \n\t"
+            "sb     %[q1_r],    +1(%[sq1])    \n\t"
+            "sb     %[q2_r],    +1(%[sq2])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   +1(%[sp1])    \n\t"
+            "sb     %[p0_f0],   +1(%[sp0])    \n\t"
+            "sb     %[q0_f0],   +1(%[sq0])    \n\t"
+            "sb     %[q1_f0],   +1(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+      /* only the *_f0 words shift here; p2..q2 are listed but not referenced */
+      __asm__ __volatile__ (
+          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
+
+          : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
+            [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+      /* lane 2 (mask bits 23:16), column offset +2, taken from the *_l words */
+      if (mask & flat & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb     %[p2_l],    +2(%[sp2])    \n\t"
+            "sb     %[p1_l],    +2(%[sp1])    \n\t"
+            "sb     %[p0_l],    +2(%[sp0])    \n\t"
+            "sb     %[q0_l],    +2(%[sq0])    \n\t"
+            "sb     %[q1_l],    +2(%[sq1])    \n\t"
+            "sb     %[q2_l],    +2(%[sq2])    \n\t"
+
+            :
+            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   +2(%[sp1])    \n\t"
+            "sb     %[p0_f0],   +2(%[sp0])    \n\t"
+            "sb     %[q0_f0],   +2(%[sq0])    \n\t"
+            "sb     %[q1_f0],   +2(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p2_l],    %[p2_l],    16      \n\t"
+          "srl      %[p1_l],    %[p1_l],    16      \n\t"
+          "srl      %[p0_l],    %[p0_l],    16      \n\t"
+          "srl      %[q0_l],    %[q0_l],    16      \n\t"
+          "srl      %[q1_l],    %[q1_l],    16      \n\t"
+          "srl      %[q2_l],    %[q2_l],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+      /* lane 3 (mask bits 31:24), written at column offset +3 */
+      if (mask & flat & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p2_l],    +3(%[sp2])    \n\t"
+            "sb     %[p1_l],    +3(%[sp1])    \n\t"
+            "sb     %[p0_l],    +3(%[sp0])    \n\t"
+            "sb     %[q0_l],    +3(%[sq0])    \n\t"
+            "sb     %[q1_l],    +3(%[sq1])    \n\t"
+            "sb     %[q2_l],    +3(%[sq2])    \n\t"
+
+            :
+            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   +3(%[sp1])    \n\t"
+            "sb     %[p0_f0],   +3(%[sp0])    \n\t"
+            "sb     %[q0_f0],   +3(%[sq0])    \n\t"
+            "sb     %[q1_f0],   +3(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+    }
+
+    s = s + 4;
+  }
+}
+
+void vpx_lpf_vertical_8_dspr2(unsigned char *s,  /* first row, column q0 */
+                              int pitch,  /* byte step between rows */
+                              const uint8_t *blimit,  /* only [0] is read */
+                              const uint8_t *limit,   /* only [0] is read */
+                              const uint8_t *thresh,  /* only [0] is read */
+                              int count) {  /* count is never read */
+  uint8_t   i;
+  uint32_t  mask, hev, flat;
+  uint8_t   *s1, *s2, *s3, *s4;
+  uint32_t  prim1, prim2, sec3, sec4, prim3, prim4;
+  uint32_t  thresh_vec, flimit_vec, limit_vec;
+  uint32_t  uflimit, ulimit, uthresh;
+  uint32_t  p3, p2, p1, p0, q3, q2, q1, q0;
+  uint32_t  p1_f0, p0_f0, q0_f0, q1_f0;
+  uint32_t  p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+  uint32_t  p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+  /* Filters the edge between columns (s - 1) and s: 2 passes x 4 rows. */
+  uflimit = *blimit;
+  ulimit  = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte: copy each scalar into all four byte lanes of a word */
+  __asm__ __volatile__ (
+      "replv.qb     %[thresh_vec],  %[uthresh]    \n\t"
+      "replv.qb     %[flimit_vec],  %[uflimit]    \n\t"
+      "replv.qb     %[limit_vec],   %[ulimit]     \n\t"
+
+      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+        [limit_vec] "=r" (limit_vec)
+      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+  );
+
+  prefetch_store(s + pitch);
+  /* each pass covers 4 consecutive rows s1..s4; s is advanced in the loop */
+  for (i = 0; i < 2; i++) {
+    s1 = s;
+    s2 = s + pitch;
+    s3 = s2 + pitch;
+    s4 = s3 + pitch;
+    s  = s4 + pitch;  /* first row of the next pass */
+    /* per row: the word at offset -4 (left of s) and the word at offset 0 */
+    __asm__ __volatile__ (
+        "lw     %[p0],  -4(%[s1])    \n\t"
+        "lw     %[p1],  -4(%[s2])    \n\t"
+        "lw     %[p2],  -4(%[s3])    \n\t"
+        "lw     %[p3],  -4(%[s4])    \n\t"
+        "lw     %[q3],    (%[s1])    \n\t"
+        "lw     %[q2],    (%[s2])    \n\t"
+        "lw     %[q1],    (%[s3])    \n\t"
+        "lw     %[q0],    (%[s4])    \n\t"
+
+        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
+          [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2), [q3] "=&r" (q3)
+        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+    );
+
+    /* transpose p3, p2, p1, p0
+       original (when loaded from memory)
+       register       -4    -3   -2     -1
+         p0         p0_0  p0_1  p0_2  p0_3
+         p1         p1_0  p1_1  p1_2  p1_3
+         p2         p2_0  p2_1  p2_2  p2_3
+         p3         p3_0  p3_1  p3_2  p3_3
+
+       after transpose
+       register
+         p0         p3_3  p2_3  p1_3  p0_3
+         p1         p3_2  p2_2  p1_2  p0_2
+         p2         p3_1  p2_1  p1_1  p0_1
+         p3         p3_0  p2_0  p1_0  p0_0
+    */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p0],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p0],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p2],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p2],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p0],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p2],      %[p3],      %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose q0, q1, q2, q3
+       original (when loaded from memory)
+       register       +0    +1    +2    +3
+         q3         q3_0  q3_1  q3_2  q3_3
+         q2         q2_0  q2_1  q2_2  q2_3
+         q1         q1_0  q1_1  q1_2  q1_3
+         q0         q0_0  q0_1  q0_2  q0_3
+
+       after transpose
+       register
+         q3         q0_3  q1_3  q2_3  q3_3
+         q2         q0_2  q1_2  q2_2  q3_2
+         q1         q0_1  q1_1  q2_1  q3_1
+         q0         q0_0  q1_0  q2_0  q3_0
+    */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[q3],      %[q2]       \n\t"
+        "precr.qb.ph    %[prim2],   %[q3],      %[q2]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[q1],      %[q0]       \n\t"
+        "precr.qb.ph    %[prim4],   %[q1],      %[q0]       \n\t"
+
+        "precrq.qb.ph   %[q2],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[q0],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[q3],      %[q2],      %[sec3]     \n\t"
+        "precrq.ph.w    %[q1],      %[q0],      %[sec4]     \n\t"
+        "append         %[q2],      %[sec3],    16          \n\t"
+        "append         %[q0],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+    /* helper (not defined in this file) fills in hev, mask and flat */
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
+    /* mask == 0 matches none of the branches below: nothing is written */
+    if ((flat == 0) && (mask != 0)) {
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      STORE_F0()  /* macro, not defined in this file */
+    } else if ((mask & flat) == 0xFFFFFFFF) {
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
+
+      STORE_F1()  /* macro, not defined in this file */
+    } else if ((flat != 0) && (mask != 0)) {
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      /* mixed case: both results are computed, then picked per byte lane */
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
+      /* lane 0 (bits 7:0) -> row s4: 6 bytes if mask & flat, else 4 if mask */
+      if (mask & flat & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p2_r],  -3(%[s4])    \n\t"
+            "sb         %[p1_r],  -2(%[s4])    \n\t"
+            "sb         %[p0_r],  -1(%[s4])    \n\t"
+            "sb         %[q0_r],    (%[s4])    \n\t"
+            "sb         %[q1_r],  +1(%[s4])    \n\t"
+            "sb         %[q2_r],  +2(%[s4])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [s4] "r" (s4)
+        );
+      } else if (mask & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  -2(%[s4])    \n\t"
+            "sb         %[p0_f0],  -1(%[s4])    \n\t"
+            "sb         %[q0_f0],    (%[s4])    \n\t"
+            "sb         %[q1_f0],  +1(%[s4])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s4] "r" (s4)
+        );
+      }
+      /* shift so the next sb (low byte) sees the following lane's value */
+      __asm__ __volatile__ (
+          "srl      %[p2_r],    %[p2_r],    16      \n\t"
+          "srl      %[p1_r],    %[p1_r],    16      \n\t"
+          "srl      %[p0_r],    %[p0_r],    16      \n\t"
+          "srl      %[q0_r],    %[q0_r],    16      \n\t"
+          "srl      %[q1_r],    %[q1_r],    16      \n\t"
+          "srl      %[q2_r],    %[q2_r],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+      /* lane 1 (mask bits 15:8) -> row s3 */
+      if (mask & flat & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p2_r],  -3(%[s3])    \n\t"
+            "sb         %[p1_r],  -2(%[s3])    \n\t"
+            "sb         %[p0_r],  -1(%[s3])    \n\t"
+            "sb         %[q0_r],    (%[s3])    \n\t"
+            "sb         %[q1_r],  +1(%[s3])    \n\t"
+            "sb         %[q2_r],  +2(%[s3])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [s3] "r" (s3)
+        );
+      } else if (mask & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  -2(%[s3])    \n\t"
+            "sb         %[p0_f0],  -1(%[s3])    \n\t"
+            "sb         %[q0_f0],    (%[s3])    \n\t"
+            "sb         %[q1_f0],  +1(%[s3])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s3] "r" (s3)
+        );
+      }
+      /* only the *_f0 words shift here; p2..q2 are listed but not referenced */
+      __asm__ __volatile__ (
+          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
+
+          : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
+            [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+      /* lane 2 (mask bits 23:16) -> row s2, taken from the *_l words */
+      if (mask & flat & 0x00FF0000) {
+        __asm__ __volatile__ (
+          "sb         %[p2_l],  -3(%[s2])    \n\t"
+          "sb         %[p1_l],  -2(%[s2])    \n\t"
+          "sb         %[p0_l],  -1(%[s2])    \n\t"
+          "sb         %[q0_l],    (%[s2])    \n\t"
+          "sb         %[q1_l],  +1(%[s2])    \n\t"
+          "sb         %[q2_l],  +2(%[s2])    \n\t"
+
+          :
+          : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+            [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+            [s2] "r" (s2)
+        );
+      } else if (mask & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  -2(%[s2])    \n\t"
+            "sb         %[p0_f0],  -1(%[s2])    \n\t"
+            "sb         %[q0_f0],    (%[s2])    \n\t"
+            "sb         %[q1_f0],  +1(%[s2])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s2] "r" (s2)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p2_l],    %[p2_l],    16      \n\t"
+          "srl      %[p1_l],    %[p1_l],    16      \n\t"
+          "srl      %[p0_l],    %[p0_l],    16      \n\t"
+          "srl      %[q0_l],    %[q0_l],    16      \n\t"
+          "srl      %[q1_l],    %[q1_l],    16      \n\t"
+          "srl      %[q2_l],    %[q2_l],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+      /* lane 3 (mask bits 31:24) -> row s1 */
+      if (mask & flat & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb         %[p2_l],  -3(%[s1])    \n\t"
+            "sb         %[p1_l],  -2(%[s1])    \n\t"
+            "sb         %[p0_l],  -1(%[s1])    \n\t"
+            "sb         %[q0_l],    (%[s1])    \n\t"
+            "sb         %[q1_l],  +1(%[s1])    \n\t"
+            "sb         %[q2_l],  +2(%[s1])    \n\t"
+
+            :
+            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [s1] "r" (s1)
+        );
+      } else if (mask & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  -2(%[s1])    \n\t"
+            "sb         %[p0_f0],  -1(%[s1])    \n\t"
+            "sb         %[q0_f0],    (%[s1])    \n\t"
+            "sb         %[q1_f0],  +1(%[s1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+              [q1_f0] "r" (q1_f0), [s1] "r" (s1)
+        );
+      }
+    }
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
new file mode 100644
index 0000000000..8a48650738
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
@@ -0,0 +1,794 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
+void vpx_lpf_horizontal_16_dspr2(unsigned char *s,  /* q0 row; p rows above */
+                                 int pitch,  /* byte step between rows */
+                                 const uint8_t *blimit,  /* only *blimit read */
+                                 const uint8_t *limit,   /* only *limit read */
+                                 const uint8_t *thresh,  /* only *thresh read */
+                                 int count) {  /* loop runs 2 * count times */
+  uint32_t  mask;  /* tested one byte lane per column further down */
+  uint32_t  hev, flat, flat2;  /* filled by the mask helpers called in the loop */
+  uint8_t   i;
+  uint8_t   *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;  /* rows above s */
+  uint8_t   *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;  /* s and below */
+  uint32_t  thresh_vec, flimit_vec, limit_vec;
+  uint32_t  uflimit, ulimit, uthresh;
+  uint32_t  p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  uint32_t  p1_f0, p0_f0, q0_f0, q1_f0;  /* filter1_dspr2 outputs */
+  uint32_t  p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+  uint32_t  q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+  uint32_t  p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+  uint32_t  q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+  uint32_t  p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+  uint32_t  q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+  /* Loop-filters the 16 rows around s (8 above, 8 from s down), 4 columns a pass */
+  uflimit = *blimit;
+  ulimit  = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte: copy each scalar's low byte into all 4 byte lanes */
+  __asm__ __volatile__ (
+      "replv.qb       %[thresh_vec],    %[uthresh]      \n\t"
+      "replv.qb       %[flimit_vec],    %[uflimit]      \n\t"
+      "replv.qb       %[limit_vec],     %[ulimit]       \n\t"
+
+      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+        [limit_vec] "=r" (limit_vec)
+      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+  );
+
+  /* prefetch data for store */
+  prefetch_store(s);
+
+  for (i = 0; i < (2 * count); i++) {  /* s advances 4 bytes per pass */
+    sp7 = s - (pitch << 3);  /* 8 rows above s */
+    sp6 = sp7 + pitch;
+    sp5 = sp6 + pitch;
+    sp4 = sp5 + pitch;
+    sp3 = sp4 + pitch;
+    sp2 = sp3 + pitch;
+    sp1 = sp2 + pitch;
+    sp0 = sp1 + pitch;
+    sq0 = s;
+    sq1 = s + pitch;
+    sq2 = sq1 + pitch;
+    sq3 = sq2 + pitch;
+    sq4 = sq3 + pitch;
+    sq5 = sq4 + pitch;
+    sq6 = sq5 + pitch;
+    sq7 = sq6 + pitch;
+    /* load one 32-bit word (4 columns) from each of the 8 rows above s */
+    __asm__ __volatile__ (
+        "lw     %[p7],      (%[sp7])            \n\t"
+        "lw     %[p6],      (%[sp6])            \n\t"
+        "lw     %[p5],      (%[sp5])            \n\t"
+        "lw     %[p4],      (%[sp4])            \n\t"
+        "lw     %[p3],      (%[sp3])            \n\t"
+        "lw     %[p2],      (%[sp2])            \n\t"
+        "lw     %[p1],      (%[sp1])            \n\t"
+        "lw     %[p0],      (%[sp0])            \n\t"
+
+        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
+          [p7] "=&r" (p7), [p6] "=&r" (p6), [p5] "=&r" (p5), [p4] "=&r" (p4)
+        : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+          [sp4] "r" (sp4), [sp5] "r" (sp5), [sp6] "r" (sp6), [sp7] "r" (sp7)
+    );
+    /* and one word from each of the 8 rows from s downward */
+    __asm__ __volatile__ (
+        "lw     %[q0],      (%[sq0])            \n\t"
+        "lw     %[q1],      (%[sq1])            \n\t"
+        "lw     %[q2],      (%[sq2])            \n\t"
+        "lw     %[q3],      (%[sq3])            \n\t"
+        "lw     %[q4],      (%[sq4])            \n\t"
+        "lw     %[q5],      (%[sq5])            \n\t"
+        "lw     %[q6],      (%[sq6])            \n\t"
+        "lw     %[q7],      (%[sq7])            \n\t"
+
+        : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0),
+          [q7] "=&r" (q7), [q6] "=&r" (q6), [q5] "=&r" (q5), [q4] "=&r" (q4)
+        : [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0),
+          [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7)
+    );
+    /* hev, mask, flat and flat2 are outputs of these helpers (defined elsewhere) */
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
+
+    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
+    /* f0: flat == 0 and mask != 0, whatever flat2 is -> filter1 result only */
+    if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+        ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      /* NOTE(review): this and later asm stores list no "memory" clobber; confirm */
+      __asm__ __volatile__ (
+          "sw       %[p1_f0],   (%[sp1])            \n\t"
+          "sw       %[p0_f0],   (%[sp0])            \n\t"
+          "sw       %[q0_f0],   (%[sq0])            \n\t"
+          "sw       %[q1_f0],   (%[sq1])            \n\t"
+
+          :
+          : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+            [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+            [sp1] "r" (sp1), [sp0] "r" (sp0),
+            [sq0] "r" (sq0), [sq1] "r" (sq1)
+      );
+    } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+               (mask == 0xFFFFFFFF)) {
+      /* f2: every lane of all three masks set -> wide filter on whole words */
+      PACK_LEFT_0TO3()
+      PACK_LEFT_4TO7()
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
+
+      PACK_RIGHT_0TO3()
+      PACK_RIGHT_4TO7()
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
+
+      COMBINE_LEFT_RIGHT_0TO2()
+      COMBINE_LEFT_RIGHT_3TO6()
+      /* write back rows p6..p0, then q6..q0, one word each (p7/q7 not stored) */
+      __asm__ __volatile__ (
+          "sw         %[p6], (%[sp6])    \n\t"
+          "sw         %[p5], (%[sp5])    \n\t"
+          "sw         %[p4], (%[sp4])    \n\t"
+          "sw         %[p3], (%[sp3])    \n\t"
+          "sw         %[p2], (%[sp2])    \n\t"
+          "sw         %[p1], (%[sp1])    \n\t"
+          "sw         %[p0], (%[sp0])    \n\t"
+
+          :
+          : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3),
+            [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+            [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp3] "r" (sp3),
+            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
+      );
+
+      __asm__ __volatile__ (
+          "sw         %[q6], (%[sq6])    \n\t"
+          "sw         %[q5], (%[sq5])    \n\t"
+          "sw         %[q4], (%[sq4])    \n\t"
+          "sw         %[q3], (%[sq3])    \n\t"
+          "sw         %[q2], (%[sq2])    \n\t"
+          "sw         %[q1], (%[sq1])    \n\t"
+          "sw         %[q0], (%[sq0])    \n\t"
+
+          :
+          : [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), [q3] "r" (q3),
+            [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
+            [sq6] "r" (sq6), [sq5] "r" (sq5), [sq4] "r" (sq4), [sq3] "r" (sq3),
+            [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
+      );
+    } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+      /* f1: flat2 == 0, flat and mask all ones -> mbfilter on whole words */
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
+
+      COMBINE_LEFT_RIGHT_0TO2()
+      /* write back rows p2..q2, one word each */
+      __asm__ __volatile__ (
+          "sw         %[p2], (%[sp2])    \n\t"
+          "sw         %[p1], (%[sp1])    \n\t"
+          "sw         %[p0], (%[sp0])    \n\t"
+          "sw         %[q0], (%[sq0])    \n\t"
+          "sw         %[q1], (%[sq1])    \n\t"
+          "sw         %[q2], (%[sq2])    \n\t"
+
+          :
+          : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+            [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
+            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+            [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+      );
+    } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+      /* f0+f1: lanes may differ, so pick a result and store byte by byte */
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      /* NOTE(review): stores below pair offset +k with mask byte k (little-endian?) */
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
+      /* column 0 = mask bits 7:0: mbfilter byte if flat too, else f0 byte */
+      if (mask & flat & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p2_r],  (%[sp2])    \n\t"
+            "sb         %[p1_r],  (%[sp1])    \n\t"
+            "sb         %[p0_r],  (%[sp0])    \n\t"
+            "sb         %[q0_r],  (%[sq0])    \n\t"
+            "sb         %[q1_r],  (%[sq1])    \n\t"
+            "sb         %[q2_r],  (%[sq2])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  (%[sp1])    \n\t"
+            "sb         %[p0_f0],  (%[sp0])    \n\t"
+            "sb         %[q0_f0],  (%[sq0])    \n\t"
+            "sb         %[q1_f0],  (%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+      /* sb writes the low byte, so shift the next column's value down */
+      __asm__ __volatile__ (
+          "srl      %[p2_r],    %[p2_r],    16      \n\t"
+          "srl      %[p1_r],    %[p1_r],    16      \n\t"
+          "srl      %[p0_r],    %[p0_r],    16      \n\t"
+          "srl      %[q0_r],    %[q0_r],    16      \n\t"
+          "srl      %[q1_r],    %[q1_r],    16      \n\t"
+          "srl      %[q2_r],    %[q2_r],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+      /* column 1 (offset +1), mask bits 15:8 */
+      if (mask & flat & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p2_r],  +1(%[sp2])    \n\t"
+            "sb         %[p1_r],  +1(%[sp1])    \n\t"
+            "sb         %[p0_r],  +1(%[sp0])    \n\t"
+            "sb         %[q0_r],  +1(%[sq0])    \n\t"
+            "sb         %[q1_r],  +1(%[sq1])    \n\t"
+            "sb         %[q2_r],  +1(%[sq2])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
+            "sb         %[p0_f0],  +1(%[sp0])    \n\t"
+            "sb         %[q0_f0],  +1(%[sq0])    \n\t"
+            "sb         %[q1_f0],  +1(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+      /* only f0 shifts here; the next column switches to the _l values */
+      __asm__ __volatile__ (
+          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
+
+          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+      /* column 2 (offset +2), mask bits 23:16 */
+      if (mask & flat & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb         %[p2_l],  +2(%[sp2])    \n\t"
+            "sb         %[p1_l],  +2(%[sp1])    \n\t"
+            "sb         %[p0_l],  +2(%[sp0])    \n\t"
+            "sb         %[q0_l],  +2(%[sq0])    \n\t"
+            "sb         %[q1_l],  +2(%[sq1])    \n\t"
+            "sb         %[q2_l],  +2(%[sq2])    \n\t"
+
+            :
+            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
+            "sb         %[p0_f0],  +2(%[sp0])    \n\t"
+            "sb         %[q0_f0],  +2(%[sq0])    \n\t"
+            "sb         %[q1_f0],  +2(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p2_l],    %[p2_l],    16      \n\t"
+          "srl      %[p1_l],    %[p1_l],    16      \n\t"
+          "srl      %[p0_l],    %[p0_l],    16      \n\t"
+          "srl      %[q0_l],    %[q0_l],    16      \n\t"
+          "srl      %[q1_l],    %[q1_l],    16      \n\t"
+          "srl      %[q2_l],    %[q2_l],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+      /* column 3 (offset +3), mask bits 31:24 */
+      if (mask & flat & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb         %[p2_l],  +3(%[sp2])    \n\t"
+            "sb         %[p1_l],  +3(%[sp1])    \n\t"
+            "sb         %[p0_l],  +3(%[sp0])    \n\t"
+            "sb         %[q0_l],  +3(%[sq0])    \n\t"
+            "sb         %[q1_l],  +3(%[sq1])    \n\t"
+            "sb         %[q2_l],  +3(%[sq2])    \n\t"
+
+            :
+            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  +3(%[sp1])    \n\t"
+            "sb         %[p0_f0],  +3(%[sp0])    \n\t"
+            "sb         %[q0_f0],  +3(%[sq0])    \n\t"
+            "sb         %[q1_f0],  +3(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+    } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+      /* f0 + f1 + f2: all three results computed, then chosen per byte lane */
+      /* f0  function */
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+      /* f1  function */
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+                      q0_l, q1_l, q2_l, q3_l,
+                      &p2_l_f1, &p1_l_f1, &p0_l_f1,
+                      &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+                      q0_r, q1_r, q2_r, q3_r,
+                      &p2_r_f1, &p1_r_f1, &p0_r_f1,
+                      &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+      /* f2  function */
+      PACK_LEFT_4TO7()
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
+
+      PACK_RIGHT_4TO7()
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
+      /* column 0: f2 bytes if the flat2 lane is also set, else f1, else f0 */
+      if (mask & flat & flat2 & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p6_r],  (%[sp6])    \n\t"
+            "sb         %[p5_r],  (%[sp5])    \n\t"
+            "sb         %[p4_r],  (%[sp4])    \n\t"
+            "sb         %[p3_r],  (%[sp3])    \n\t"
+            "sb         %[p2_r],  (%[sp2])    \n\t"
+            "sb         %[p1_r],  (%[sp1])    \n\t"
+            "sb         %[p0_r],  (%[sp0])    \n\t"
+
+            :
+            : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
+              [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+              [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4),
+              [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1),
+              [p0_r] "r" (p0_r), [sp0] "r" (sp0)
+        );
+
+        __asm__ __volatile__ (
+            "sb         %[q0_r],  (%[sq0])    \n\t"
+            "sb         %[q1_r],  (%[sq1])    \n\t"
+            "sb         %[q2_r],  (%[sq2])    \n\t"
+            "sb         %[q3_r],  (%[sq3])    \n\t"
+            "sb         %[q4_r],  (%[sq4])    \n\t"
+            "sb         %[q5_r],  (%[sq5])    \n\t"
+            "sb         %[q6_r],  (%[sq6])    \n\t"
+
+            :
+            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+              [q6_r] "r" (q6_r),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
+              [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
+              [sq6] "r" (sq6)
+        );
+      } else if (mask & flat & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p2_r_f1],  (%[sp2])    \n\t"
+            "sb         %[p1_r_f1],  (%[sp1])    \n\t"
+            "sb         %[p0_r_f1],  (%[sp0])    \n\t"
+            "sb         %[q0_r_f1],  (%[sq0])    \n\t"
+            "sb         %[q1_r_f1],  (%[sq1])    \n\t"
+            "sb         %[q2_r_f1],  (%[sq2])    \n\t"
+
+            :
+            : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+              [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  (%[sp1])    \n\t"
+            "sb         %[p0_f0],  (%[sp0])    \n\t"
+            "sb         %[q0_f0],  (%[sq0])    \n\t"
+            "sb         %[q1_f0],  (%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+              [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+      /* move the next column of every candidate result into the low byte */
+      __asm__ __volatile__ (
+          "srl        %[p6_r], %[p6_r], 16     \n\t"
+          "srl        %[p5_r], %[p5_r], 16     \n\t"
+          "srl        %[p4_r], %[p4_r], 16     \n\t"
+          "srl        %[p3_r], %[p3_r], 16     \n\t"
+          "srl        %[p2_r], %[p2_r], 16     \n\t"
+          "srl        %[p1_r], %[p1_r], 16     \n\t"
+          "srl        %[p0_r], %[p0_r], 16     \n\t"
+          "srl        %[q0_r], %[q0_r], 16     \n\t"
+          "srl        %[q1_r], %[q1_r], 16     \n\t"
+          "srl        %[q2_r], %[q2_r], 16     \n\t"
+          "srl        %[q3_r], %[q3_r], 16     \n\t"
+          "srl        %[q4_r], %[q4_r], 16     \n\t"
+          "srl        %[q5_r], %[q5_r], 16     \n\t"
+          "srl        %[q6_r], %[q6_r], 16     \n\t"
+
+          : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+            [q3_r] "+r" (q3_r), [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r),
+            [p6_r] "+r" (p6_r), [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r),
+            [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r),
+            [q6_r] "+r" (q6_r), [p0_r] "+r" (p0_r)
+          :
+      );
+
+      __asm__ __volatile__ (
+          "srl        %[p2_r_f1], %[p2_r_f1], 16     \n\t"
+          "srl        %[p1_r_f1], %[p1_r_f1], 16     \n\t"
+          "srl        %[p0_r_f1], %[p0_r_f1], 16     \n\t"
+          "srl        %[q0_r_f1], %[q0_r_f1], 16     \n\t"
+          "srl        %[q1_r_f1], %[q1_r_f1], 16     \n\t"
+          "srl        %[q2_r_f1], %[q2_r_f1], 16     \n\t"
+          "srl        %[p1_f0],   %[p1_f0],   8      \n\t"
+          "srl        %[p0_f0],   %[p0_f0],   8      \n\t"
+          "srl        %[q0_f0],   %[q0_f0],   8      \n\t"
+          "srl        %[q1_f0],   %[q1_f0],   8      \n\t"
+
+          : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1),
+            [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1),
+            [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+      /* column 1 (offset +1), mask bits 15:8 */
+      if (mask & flat & flat2 & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p6_r],  +1(%[sp6])    \n\t"
+            "sb         %[p5_r],  +1(%[sp5])    \n\t"
+            "sb         %[p4_r],  +1(%[sp4])    \n\t"
+            "sb         %[p3_r],  +1(%[sp3])    \n\t"
+            "sb         %[p2_r],  +1(%[sp2])    \n\t"
+            "sb         %[p1_r],  +1(%[sp1])    \n\t"
+            "sb         %[p0_r],  +1(%[sp0])    \n\t"
+
+            :
+            : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
+              [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+              [p0_r] "r" (p0_r), [sp6] "r" (sp6), [sp5] "r" (sp5),
+              [sp4] "r" (sp4), [sp3] "r" (sp3),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
+        );
+
+        __asm__ __volatile__ (
+            "sb         %[q0_r],  +1(%[sq0])    \n\t"
+            "sb         %[q1_r],  +1(%[sq1])    \n\t"
+            "sb         %[q2_r],  +1(%[sq2])    \n\t"
+            "sb         %[q3_r],  +1(%[sq3])    \n\t"
+            "sb         %[q4_r],  +1(%[sq4])    \n\t"
+            "sb         %[q5_r],  +1(%[sq5])    \n\t"
+            "sb         %[q6_r],  +1(%[sq6])    \n\t"
+
+            :
+            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+              [q6_r] "r" (q6_r), [sq0] "r" (sq0), [sq1] "r" (sq1),
+              [sq2] "r" (sq2), [sq3] "r" (sq3),
+              [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
+        );
+      } else if (mask & flat & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p2_r_f1],  +1(%[sp2])    \n\t"
+            "sb         %[p1_r_f1],  +1(%[sp1])    \n\t"
+            "sb         %[p0_r_f1],  +1(%[sp0])    \n\t"
+            "sb         %[q0_r_f1],  +1(%[sq0])    \n\t"
+            "sb         %[q1_r_f1],  +1(%[sq1])    \n\t"
+            "sb         %[q2_r_f1],  +1(%[sq2])    \n\t"
+
+            :
+            : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+              [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
+            "sb         %[p0_f0],  +1(%[sp0])    \n\t"
+            "sb         %[q0_f0],  +1(%[sq0])    \n\t"
+            "sb         %[q1_f0],  +1(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+              [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl        %[p1_f0], %[p1_f0], 8     \n\t"
+          "srl        %[p0_f0], %[p0_f0], 8     \n\t"
+          "srl        %[q0_f0], %[q0_f0], 8     \n\t"
+          "srl        %[q1_f0], %[q1_f0], 8     \n\t"
+
+          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+      /* column 2 (offset +2), mask bits 23:16; from here the _l values are used */
+      if (mask & flat & flat2 & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb         %[p6_l],  +2(%[sp6])    \n\t"
+            "sb         %[p5_l],  +2(%[sp5])    \n\t"
+            "sb         %[p4_l],  +2(%[sp4])    \n\t"
+            "sb         %[p3_l],  +2(%[sp3])    \n\t"
+            "sb         %[p2_l],  +2(%[sp2])    \n\t"
+            "sb         %[p1_l],  +2(%[sp1])    \n\t"
+            "sb         %[p0_l],  +2(%[sp0])    \n\t"
+
+            :
+            : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+              [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+              [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
+              [sp4] "r" (sp4), [sp3] "r" (sp3),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
+        );
+
+        __asm__ __volatile__ (
+            "sb         %[q0_l],  +2(%[sq0])    \n\t"
+            "sb         %[q1_l],  +2(%[sq1])    \n\t"
+            "sb         %[q2_l],  +2(%[sq2])    \n\t"
+            "sb         %[q3_l],  +2(%[sq3])    \n\t"
+            "sb         %[q4_l],  +2(%[sq4])    \n\t"
+            "sb         %[q5_l],  +2(%[sq5])    \n\t"
+            "sb         %[q6_l],  +2(%[sq6])    \n\t"
+
+            :
+            : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+              [q6_l] "r" (q6_l), [sq0] "r" (sq0), [sq1] "r" (sq1),
+              [sq2] "r" (sq2), [sq3] "r" (sq3),
+              [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
+        );
+      } else if (mask & flat & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb         %[p2_l_f1],  +2(%[sp2])    \n\t"
+            "sb         %[p1_l_f1],  +2(%[sp1])    \n\t"
+            "sb         %[p0_l_f1],  +2(%[sp0])    \n\t"
+            "sb         %[q0_l_f1],  +2(%[sq0])    \n\t"
+            "sb         %[q1_l_f1],  +2(%[sq1])    \n\t"
+            "sb         %[q2_l_f1],  +2(%[sq2])    \n\t"
+
+            :
+            : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+              [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
+            "sb         %[p0_f0],  +2(%[sp0])    \n\t"
+            "sb         %[q0_f0],  +2(%[sq0])    \n\t"
+            "sb         %[q1_f0],  +2(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+              [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p6_l],    %[p6_l],    16   \n\t"
+          "srl      %[p5_l],    %[p5_l],    16   \n\t"
+          "srl      %[p4_l],    %[p4_l],    16   \n\t"
+          "srl      %[p3_l],    %[p3_l],    16   \n\t"
+          "srl      %[p2_l],    %[p2_l],    16   \n\t"
+          "srl      %[p1_l],    %[p1_l],    16   \n\t"
+          "srl      %[p0_l],    %[p0_l],    16   \n\t"
+          "srl      %[q0_l],    %[q0_l],    16   \n\t"
+          "srl      %[q1_l],    %[q1_l],    16   \n\t"
+          "srl      %[q2_l],    %[q2_l],    16   \n\t"
+          "srl      %[q3_l],    %[q3_l],    16   \n\t"
+          "srl      %[q4_l],    %[q4_l],    16   \n\t"
+          "srl      %[q5_l],    %[q5_l],    16   \n\t"
+          "srl      %[q6_l],    %[q6_l],    16   \n\t"
+
+          : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+            [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l),
+            [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l),
+            [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l),
+            [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l)
+          :
+      );
+
+      __asm__ __volatile__ (
+          "srl      %[p2_l_f1],   %[p2_l_f1],   16   \n\t"
+          "srl      %[p1_l_f1],   %[p1_l_f1],   16   \n\t"
+          "srl      %[p0_l_f1],   %[p0_l_f1],   16   \n\t"
+          "srl      %[q0_l_f1],   %[q0_l_f1],   16   \n\t"
+          "srl      %[q1_l_f1],   %[q1_l_f1],   16   \n\t"
+          "srl      %[q2_l_f1],   %[q2_l_f1],   16   \n\t"
+          "srl      %[p1_f0],     %[p1_f0],     8    \n\t"
+          "srl      %[p0_f0],     %[p0_f0],     8    \n\t"
+          "srl      %[q0_f0],     %[q0_f0],     8    \n\t"
+          "srl      %[q1_f0],     %[q1_f0],     8    \n\t"
+
+          : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1),
+            [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1),
+            [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+      /* column 3 (offset +3), mask bits 31:24 */
+      if (mask & flat & flat2 & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p6_l],    +3(%[sp6])    \n\t"
+            "sb     %[p5_l],    +3(%[sp5])    \n\t"
+            "sb     %[p4_l],    +3(%[sp4])    \n\t"
+            "sb     %[p3_l],    +3(%[sp3])    \n\t"
+            "sb     %[p2_l],    +3(%[sp2])    \n\t"
+            "sb     %[p1_l],    +3(%[sp1])    \n\t"
+            "sb     %[p0_l],    +3(%[sp0])    \n\t"
+
+            :
+            : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+              [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+              [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
+              [sp4] "r" (sp4), [sp3] "r" (sp3), [sp2] "r" (sp2),
+              [sp1] "r" (sp1), [sp0] "r" (sp0)
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[q0_l],    +3(%[sq0])    \n\t"
+            "sb     %[q1_l],    +3(%[sq1])    \n\t"
+            "sb     %[q2_l],    +3(%[sq2])    \n\t"
+            "sb     %[q3_l],    +3(%[sq3])    \n\t"
+            "sb     %[q4_l],    +3(%[sq4])    \n\t"
+            "sb     %[q5_l],    +3(%[sq5])    \n\t"
+            "sb     %[q6_l],    +3(%[sq6])    \n\t"
+
+            :
+            : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l),
+              [q2_l] "r" (q2_l), [q3_l] "r" (q3_l),
+              [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
+              [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
+              [q6_l] "r" (q6_l), [sq6] "r" (sq6)
+        );
+      } else if (mask & flat & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p2_l_f1],     +3(%[sp2])    \n\t"
+            "sb     %[p1_l_f1],     +3(%[sp1])    \n\t"
+            "sb     %[p0_l_f1],     +3(%[sp0])    \n\t"
+            "sb     %[q0_l_f1],     +3(%[sq0])    \n\t"
+            "sb     %[q1_l_f1],     +3(%[sq1])    \n\t"
+            "sb     %[q2_l_f1],     +3(%[sq2])    \n\t"
+
+            :
+            : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+              [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   +3(%[sp1])    \n\t"
+            "sb     %[p0_f0],   +3(%[sp0])    \n\t"
+            "sb     %[q0_f0],   +3(%[sq0])    \n\t"
+            "sb     %[q1_f0],   +3(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+    }
+    /* step to the next group of 4 columns */
+    s = s + 4;
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c b/libs/libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
new file mode 100644
index 0000000000..e580f014e9
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
@@ -0,0 +1,839 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
+void vpx_lpf_vertical_16_dspr2(uint8_t *s,
+                               int pitch,
+                               const uint8_t *blimit,
+                               const uint8_t *limit,
+                               const uint8_t *thresh) {
+  uint8_t   i;
+  uint32_t  mask, hev, flat, flat2;
+  uint8_t   *s1, *s2, *s3, *s4;
+  uint32_t  prim1, prim2, sec3, sec4, prim3, prim4;
+  uint32_t  thresh_vec, flimit_vec, limit_vec;
+  uint32_t  uflimit, ulimit, uthresh;
+  uint32_t  p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  uint32_t  p1_f0, p0_f0, q0_f0, q1_f0;
+  uint32_t  p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+  uint32_t  q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+  uint32_t  p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+  uint32_t  q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+  uint32_t  p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+  uint32_t  q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+
+  uflimit = *blimit;
+  ulimit = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte */
+  __asm__ __volatile__ (
+      "replv.qb     %[thresh_vec],     %[uthresh]    \n\t"
+      "replv.qb     %[flimit_vec],     %[uflimit]    \n\t"
+      "replv.qb     %[limit_vec],      %[ulimit]     \n\t"
+
+      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+        [limit_vec] "=r" (limit_vec)
+      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+  );
+
+  prefetch_store(s + pitch);
+
+  for (i = 0; i < 2; i++) {
+    s1 = s;
+    s2 = s + pitch;
+    s3 = s2 + pitch;
+    s4 = s3 + pitch;
+    s  = s4 + pitch;
+
+    __asm__ __volatile__ (
+        "lw     %[p0],  -4(%[s1])    \n\t"
+        "lw     %[p1],  -4(%[s2])    \n\t"
+        "lw     %[p2],  -4(%[s3])    \n\t"
+        "lw     %[p3],  -4(%[s4])    \n\t"
+        "lw     %[p4],  -8(%[s1])    \n\t"
+        "lw     %[p5],  -8(%[s2])    \n\t"
+        "lw     %[p6],  -8(%[s3])    \n\t"
+        "lw     %[p7],  -8(%[s4])    \n\t"
+
+        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1),
+          [p0] "=&r" (p0), [p7] "=&r" (p7), [p6] "=&r" (p6),
+          [p5] "=&r" (p5), [p4] "=&r" (p4)
+        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+    );
+
+    __asm__ __volatile__ (
+        "lw     %[q3],  (%[s1])     \n\t"
+        "lw     %[q2],  (%[s2])     \n\t"
+        "lw     %[q1],  (%[s3])     \n\t"
+        "lw     %[q0],  (%[s4])     \n\t"
+        "lw     %[q7],  +4(%[s1])   \n\t"
+        "lw     %[q6],  +4(%[s2])   \n\t"
+        "lw     %[q5],  +4(%[s3])   \n\t"
+        "lw     %[q4],  +4(%[s4])   \n\t"
+
+        : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1),
+          [q0] "=&r" (q0), [q7] "=&r" (q7), [q6] "=&r" (q6),
+          [q5] "=&r" (q5), [q4] "=&r" (q4)
+        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+    );
+
+    /* transpose p3, p2, p1, p0
+       original (when loaded from memory)
+       register       -4    -3   -2     -1
+         p0         p0_0  p0_1  p0_2  p0_3
+         p1         p1_0  p1_1  p1_2  p1_3
+         p2         p2_0  p2_1  p2_2  p2_3
+         p3         p3_0  p3_1  p3_2  p3_3
+
+       after transpose
+       register
+         p0         p3_3  p2_3  p1_3  p0_3
+         p1         p3_2  p2_2  p1_2  p0_2
+         p2         p3_1  p2_1  p1_1  p0_1
+         p3         p3_0  p2_0  p1_0  p0_0
+    */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p0],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p0],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p2],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p2],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p0],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p2],      %[p3],      %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose q0, q1, q2, q3
+       original (when loaded from memory)
+       register       +1    +2    +3    +4
+         q3         q3_0  q3_1  q3_2  q3_3
+         q2         q2_0  q2_1  q2_2  q2_3
+         q1         q1_0  q1_1  q1_2  q1_3
+         q0         q0_0  q0_1  q0_2  q0_3
+
+       after transpose
+       register
+         q3         q0_3  q1_3  q2_3  q3_3
+         q2         q0_2  q1_2  q2_2  q3_2
+         q1         q0_1  q1_1  q2_1  q3_1
+         q0         q0_0  q1_0  q2_0  q3_0
+    */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[q3],      %[q2]       \n\t"
+        "precr.qb.ph    %[prim2],   %[q3],      %[q2]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[q1],      %[q0]       \n\t"
+        "precr.qb.ph    %[prim4],   %[q1],      %[q0]       \n\t"
+
+        "precrq.qb.ph   %[q2],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[q0],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[q3],      %[q2],      %[sec3]     \n\t"
+        "precrq.ph.w    %[q1],      %[q0],      %[sec4]     \n\t"
+        "append         %[q2],      %[sec3],    16          \n\t"
+        "append         %[q0],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose p7, p6, p5, p4
+       original (when loaded from memory)
+       register      -8    -7   -6     -5
+         p4         p4_0  p4_1  p4_2  p4_3
+         p5         p5_0  p5_1  p5_2  p5_3
+         p6         p6_0  p6_1  p6_2  p6_3
+         p7         p7_0  p7_1  p7_2  p7_3
+
+       after transpose
+       register
+         p4         p7_3  p6_3  p5_3  p4_3
+         p5         p7_2  p6_2  p5_2  p4_2
+         p6         p7_1  p6_1  p5_1  p4_1
+         p7         p7_0  p6_0  p5_0  p4_0
+    */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p4],      %[p5]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p4],      %[p5]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p6],      %[p7]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p6],      %[p7]       \n\t"
+
+        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p7],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p4],      %[p5],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p6],      %[p7],      %[sec4]     \n\t"
+        "append         %[p5],      %[sec3],    16          \n\t"
+        "append         %[p7],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p4] "+r" (p4), [p5] "+r" (p5), [p6] "+r" (p6), [p7] "+r" (p7),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose q4, q5, q6, q7
+       original (when loaded from memory)
+       register      +5    +6    +7    +8
+         q7         q7_0  q7_1  q7_2  q7_3
+         q6         q6_0  q6_1  q6_2  q6_3
+         q5         q5_0  q5_1  q5_2  q5_3
+         q4         q4_0  q4_1  q4_2  q4_3
+
+       after transpose
+       register
+         q7         q4_3  q5_3  q6_3  q7_3
+         q6         q4_2  q5_2  q6_2  q7_2
+         q5         q4_1  q5_1  q6_1  q7_1
+         q4         q4_0  q5_0  q6_0  q7_0
+    */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[q7],      %[q6]       \n\t"
+        "precr.qb.ph    %[prim2],   %[q7],      %[q6]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[q5],      %[q4]       \n\t"
+        "precr.qb.ph    %[prim4],   %[q5],      %[q4]       \n\t"
+
+        "precrq.qb.ph   %[q6],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[q4],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[q7],      %[q6],      %[sec3]     \n\t"
+        "precrq.ph.w    %[q5],      %[q4],      %[sec4]     \n\t"
+        "append         %[q6],      %[sec3],    16          \n\t"
+        "append         %[q4],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [q7] "+r" (q7), [q6] "+r" (q6), [q5] "+r" (q5), [q4] "+r" (q4),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
+
+    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
+    /* f0 */
+    if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+        ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      STORE_F0()
+    } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+               (mask == 0xFFFFFFFF)) {
+      /* f2 */
+      PACK_LEFT_0TO3()
+      PACK_LEFT_4TO7()
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
+
+      PACK_RIGHT_0TO3()
+      PACK_RIGHT_4TO7()
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
+
+      STORE_F2()
+    } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+      /* f1 */
+      PACK_LEFT_0TO3()
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
+
+      PACK_RIGHT_0TO3()
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
+
+      STORE_F1()
+    } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+      /* f0 + f1 */
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
+
+      if (mask & flat & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb     %[p2_r],    -3(%[s4])    \n\t"
+            "sb     %[p1_r],    -2(%[s4])    \n\t"
+            "sb     %[p0_r],    -1(%[s4])    \n\t"
+            "sb     %[q0_r],      (%[s4])    \n\t"
+            "sb     %[q1_r],    +1(%[s4])    \n\t"
+            "sb     %[q2_r],    +2(%[s4])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [s4] "r" (s4)
+        );
+      } else if (mask & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  -2(%[s4])    \n\t"
+            "sb         %[p0_f0],  -1(%[s4])    \n\t"
+            "sb         %[q0_f0],    (%[s4])    \n\t"
+            "sb         %[q1_f0],  +1(%[s4])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s4] "r" (s4)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p2_r],    %[p2_r],    16      \n\t"
+          "srl      %[p1_r],    %[p1_r],    16      \n\t"
+          "srl      %[p0_r],    %[p0_r],    16      \n\t"
+          "srl      %[q0_r],    %[q0_r],    16      \n\t"
+          "srl      %[q1_r],    %[q1_r],    16      \n\t"
+          "srl      %[q2_r],    %[q2_r],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p2_r],    -3(%[s3])    \n\t"
+            "sb     %[p1_r],    -2(%[s3])    \n\t"
+            "sb     %[p0_r],    -1(%[s3])    \n\t"
+            "sb     %[q0_r],      (%[s3])    \n\t"
+            "sb     %[q1_r],    +1(%[s3])    \n\t"
+            "sb     %[q2_r],    +2(%[s3])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [s3] "r" (s3)
+        );
+      } else if (mask & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s3])    \n\t"
+            "sb     %[p0_f0],   -1(%[s3])    \n\t"
+            "sb     %[q0_f0],     (%[s3])    \n\t"
+            "sb     %[q1_f0],   +1(%[s3])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s3] "r" (s3)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
+
+          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0x00FF0000) {
+        __asm__ __volatile__ (
+          "sb       %[p2_l],    -3(%[s2])    \n\t"
+          "sb       %[p1_l],    -2(%[s2])    \n\t"
+          "sb       %[p0_l],    -1(%[s2])    \n\t"
+          "sb       %[q0_l],      (%[s2])    \n\t"
+          "sb       %[q1_l],    +1(%[s2])    \n\t"
+          "sb       %[q2_l],    +2(%[s2])    \n\t"
+
+          :
+          : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+            [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+            [s2] "r" (s2)
+        );
+      } else if (mask & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s2])    \n\t"
+            "sb     %[p0_f0],   -1(%[s2])    \n\t"
+            "sb     %[q0_f0],     (%[s2])    \n\t"
+            "sb     %[q1_f0],   +1(%[s2])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s2] "r" (s2)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p2_l],    %[p2_l],    16      \n\t"
+          "srl      %[p1_l],    %[p1_l],    16      \n\t"
+          "srl      %[p0_l],    %[p0_l],    16      \n\t"
+          "srl      %[q0_l],    %[q0_l],    16      \n\t"
+          "srl      %[q1_l],    %[q1_l],    16      \n\t"
+          "srl      %[q2_l],    %[q2_l],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p2_l],    -3(%[s1])    \n\t"
+            "sb     %[p1_l],    -2(%[s1])    \n\t"
+            "sb     %[p0_l],    -1(%[s1])    \n\t"
+            "sb     %[q0_l],      (%[s1])    \n\t"
+            "sb     %[q1_l],    +1(%[s1])    \n\t"
+            "sb     %[q2_l],    +2(%[s1])    \n\t"
+
+            :
+            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [s1] "r" (s1)
+        );
+      } else if (mask & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s1])    \n\t"
+            "sb     %[p0_f0],   -1(%[s1])    \n\t"
+            "sb     %[q0_f0],     (%[s1])    \n\t"
+            "sb     %[q1_f0],   +1(%[s1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s1] "r" (s1)
+        );
+      }
+    } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+      /* f0+f1+f2 */
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+      PACK_LEFT_0TO3()
+      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+                      q0_l, q1_l, q2_l, q3_l,
+                      &p2_l_f1, &p1_l_f1, &p0_l_f1,
+                      &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+      PACK_RIGHT_0TO3()
+      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+                      q0_r, q1_r, q2_r, q3_r,
+                      &p2_r_f1, &p1_r_f1, &p0_r_f1,
+                      &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+      PACK_LEFT_4TO7()
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
+
+      PACK_RIGHT_4TO7()
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
+
+      if (mask & flat & flat2 & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb     %[p6_r],    -7(%[s4])    \n\t"
+            "sb     %[p5_r],    -6(%[s4])    \n\t"
+            "sb     %[p4_r],    -5(%[s4])    \n\t"
+            "sb     %[p3_r],    -4(%[s4])    \n\t"
+            "sb     %[p2_r],    -3(%[s4])    \n\t"
+            "sb     %[p1_r],    -2(%[s4])    \n\t"
+            "sb     %[p0_r],    -1(%[s4])    \n\t"
+
+            :
+            : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r),
+              [p4_r] "r" (p4_r), [p3_r] "r" (p3_r),
+              [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+              [p0_r] "r" (p0_r), [s4] "r" (s4)
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[q0_r],      (%[s4])    \n\t"
+            "sb     %[q1_r],    +1(%[s4])    \n\t"
+            "sb     %[q2_r],    +2(%[s4])    \n\t"
+            "sb     %[q3_r],    +3(%[s4])    \n\t"
+            "sb     %[q4_r],    +4(%[s4])    \n\t"
+            "sb     %[q5_r],    +5(%[s4])    \n\t"
+            "sb     %[q6_r],    +6(%[s4])    \n\t"
+
+            :
+            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r),
+              [q2_r] "r" (q2_r), [q3_r] "r" (q3_r),
+              [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+              [q6_r] "r" (q6_r), [s4] "r" (s4)
+        );
+      } else if (mask & flat & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb     %[p2_r_f1],     -3(%[s4])    \n\t"
+            "sb     %[p1_r_f1],     -2(%[s4])    \n\t"
+            "sb     %[p0_r_f1],     -1(%[s4])    \n\t"
+            "sb     %[q0_r_f1],       (%[s4])    \n\t"
+            "sb     %[q1_r_f1],     +1(%[s4])    \n\t"
+            "sb     %[q2_r_f1],     +2(%[s4])    \n\t"
+
+            :
+            : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+              [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+              [s4] "r" (s4)
+        );
+      } else if (mask & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s4])    \n\t"
+            "sb     %[p0_f0],   -1(%[s4])    \n\t"
+            "sb     %[q0_f0],     (%[s4])    \n\t"
+            "sb     %[q1_f0],   +1(%[s4])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s4] "r" (s4)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p6_r],        %[p6_r],        16     \n\t"
+          "srl      %[p5_r],        %[p5_r],        16     \n\t"
+          "srl      %[p4_r],        %[p4_r],        16     \n\t"
+          "srl      %[p3_r],        %[p3_r],        16     \n\t"
+          "srl      %[p2_r],        %[p2_r],        16     \n\t"
+          "srl      %[p1_r],        %[p1_r],        16     \n\t"
+          "srl      %[p0_r],        %[p0_r],        16     \n\t"
+          "srl      %[q0_r],        %[q0_r],        16     \n\t"
+          "srl      %[q1_r],        %[q1_r],        16     \n\t"
+          "srl      %[q2_r],        %[q2_r],        16     \n\t"
+          "srl      %[q3_r],        %[q3_r],        16     \n\t"
+          "srl      %[q4_r],        %[q4_r],        16     \n\t"
+          "srl      %[q5_r],        %[q5_r],        16     \n\t"
+          "srl      %[q6_r],        %[q6_r],        16     \n\t"
+
+          : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r),
+            [q2_r] "+r" (q2_r), [q3_r] "+r" (q3_r),
+            [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r),
+            [q6_r] "+r" (q6_r), [p6_r] "+r" (p6_r),
+            [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r),
+            [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r),
+            [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r)
+          :
+      );
+
+      __asm__ __volatile__ (
+          "srl      %[p2_r_f1],     %[p2_r_f1],     16      \n\t"
+          "srl      %[p1_r_f1],     %[p1_r_f1],     16      \n\t"
+          "srl      %[p0_r_f1],     %[p0_r_f1],     16      \n\t"
+          "srl      %[q0_r_f1],     %[q0_r_f1],     16      \n\t"
+          "srl      %[q1_r_f1],     %[q1_r_f1],     16      \n\t"
+          "srl      %[q2_r_f1],     %[q2_r_f1],     16      \n\t"
+          "srl      %[p1_f0],       %[p1_f0],       8       \n\t"
+          "srl      %[p0_f0],       %[p0_f0],       8       \n\t"
+          "srl      %[q0_f0],       %[q0_f0],       8       \n\t"
+          "srl      %[q1_f0],       %[q1_f0],       8       \n\t"
+
+          : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1),
+            [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1),
+            [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & flat2 & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p6_r],    -7(%[s3])    \n\t"
+            "sb     %[p5_r],    -6(%[s3])    \n\t"
+            "sb     %[p4_r],    -5(%[s3])    \n\t"
+            "sb     %[p3_r],    -4(%[s3])    \n\t"
+            "sb     %[p2_r],    -3(%[s3])    \n\t"
+            "sb     %[p1_r],    -2(%[s3])    \n\t"
+            "sb     %[p0_r],    -1(%[s3])    \n\t"
+
+            :
+            : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
+              [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+              [p0_r] "r" (p0_r), [s3] "r" (s3)
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[q0_r],      (%[s3])    \n\t"
+            "sb     %[q1_r],    +1(%[s3])    \n\t"
+            "sb     %[q2_r],    +2(%[s3])    \n\t"
+            "sb     %[q3_r],    +3(%[s3])    \n\t"
+            "sb     %[q4_r],    +4(%[s3])    \n\t"
+            "sb     %[q5_r],    +5(%[s3])    \n\t"
+            "sb     %[q6_r],    +6(%[s3])    \n\t"
+
+            :
+            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r),
+              [q2_r] "r" (q2_r), [q3_r] "r" (q3_r),
+              [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+              [q6_r] "r" (q6_r), [s3] "r" (s3)
+        );
+      } else if (mask & flat & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p2_r_f1],     -3(%[s3])    \n\t"
+            "sb     %[p1_r_f1],     -2(%[s3])    \n\t"
+            "sb     %[p0_r_f1],     -1(%[s3])    \n\t"
+            "sb     %[q0_r_f1],       (%[s3])    \n\t"
+            "sb     %[q1_r_f1],     +1(%[s3])    \n\t"
+            "sb     %[q2_r_f1],     +2(%[s3])    \n\t"
+
+            :
+            : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+              [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+              [s3] "r" (s3)
+        );
+      } else if (mask & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s3])    \n\t"
+            "sb     %[p0_f0],   -1(%[s3])    \n\t"
+            "sb     %[q0_f0],     (%[s3])    \n\t"
+            "sb     %[q1_f0],   +1(%[s3])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s3] "r" (s3)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
+
+          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & flat2 & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb     %[p6_l],    -7(%[s2])    \n\t"
+            "sb     %[p5_l],    -6(%[s2])    \n\t"
+            "sb     %[p4_l],    -5(%[s2])    \n\t"
+            "sb     %[p3_l],    -4(%[s2])    \n\t"
+            "sb     %[p2_l],    -3(%[s2])    \n\t"
+            "sb     %[p1_l],    -2(%[s2])    \n\t"
+            "sb     %[p0_l],    -1(%[s2])    \n\t"
+
+            :
+            : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+              [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+              [p0_l] "r" (p0_l), [s2] "r" (s2)
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[q0_l],      (%[s2])    \n\t"
+            "sb     %[q1_l],    +1(%[s2])    \n\t"
+            "sb     %[q2_l],    +2(%[s2])    \n\t"
+            "sb     %[q3_l],    +3(%[s2])    \n\t"
+            "sb     %[q4_l],    +4(%[s2])    \n\t"
+            "sb     %[q5_l],    +5(%[s2])    \n\t"
+            "sb     %[q6_l],    +6(%[s2])    \n\t"
+
+            :
+            : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+              [q6_l] "r" (q6_l), [s2] "r" (s2)
+        );
+      } else if (mask & flat & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb     %[p2_l_f1],     -3(%[s2])    \n\t"
+            "sb     %[p1_l_f1],     -2(%[s2])    \n\t"
+            "sb     %[p0_l_f1],     -1(%[s2])    \n\t"
+            "sb     %[q0_l_f1],       (%[s2])    \n\t"
+            "sb     %[q1_l_f1],     +1(%[s2])    \n\t"
+            "sb     %[q2_l_f1],     +2(%[s2])    \n\t"
+
+            :
+            : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+              [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+              [s2] "r" (s2)
+        );
+      } else if (mask & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s2])    \n\t"
+            "sb     %[p0_f0],   -1(%[s2])    \n\t"
+            "sb     %[q0_f0],     (%[s2])    \n\t"
+            "sb     %[q1_f0],   +1(%[s2])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s2] "r" (s2)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p6_l],        %[p6_l],        16     \n\t"
+          "srl      %[p5_l],        %[p5_l],        16     \n\t"
+          "srl      %[p4_l],        %[p4_l],        16     \n\t"
+          "srl      %[p3_l],        %[p3_l],        16     \n\t"
+          "srl      %[p2_l],        %[p2_l],        16     \n\t"
+          "srl      %[p1_l],        %[p1_l],        16     \n\t"
+          "srl      %[p0_l],        %[p0_l],        16     \n\t"
+          "srl      %[q0_l],        %[q0_l],        16     \n\t"
+          "srl      %[q1_l],        %[q1_l],        16     \n\t"
+          "srl      %[q2_l],        %[q2_l],        16     \n\t"
+          "srl      %[q3_l],        %[q3_l],        16     \n\t"
+          "srl      %[q4_l],        %[q4_l],        16     \n\t"
+          "srl      %[q5_l],        %[q5_l],        16     \n\t"
+          "srl      %[q6_l],        %[q6_l],        16     \n\t"
+
+          : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+            [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l),
+            [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l),
+            [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l),
+            [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l)
+          :
+      );
+
+      __asm__ __volatile__ (
+          "srl      %[p2_l_f1],     %[p2_l_f1],     16      \n\t"
+          "srl      %[p1_l_f1],     %[p1_l_f1],     16      \n\t"
+          "srl      %[p0_l_f1],     %[p0_l_f1],     16      \n\t"
+          "srl      %[q0_l_f1],     %[q0_l_f1],     16      \n\t"
+          "srl      %[q1_l_f1],     %[q1_l_f1],     16      \n\t"
+          "srl      %[q2_l_f1],     %[q2_l_f1],     16      \n\t"
+          "srl      %[p1_f0],       %[p1_f0],       8       \n\t"
+          "srl      %[p0_f0],       %[p0_f0],       8       \n\t"
+          "srl      %[q0_f0],       %[q0_f0],       8       \n\t"
+          "srl      %[q1_f0],       %[q1_f0],       8       \n\t"
+
+          : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1),
+            [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1),
+            [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & flat2 & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p6_l],    -7(%[s1])    \n\t"
+            "sb     %[p5_l],    -6(%[s1])    \n\t"
+            "sb     %[p4_l],    -5(%[s1])    \n\t"
+            "sb     %[p3_l],    -4(%[s1])    \n\t"
+            "sb     %[p2_l],    -3(%[s1])    \n\t"
+            "sb     %[p1_l],    -2(%[s1])    \n\t"
+            "sb     %[p0_l],    -1(%[s1])    \n\t"
+
+            :
+            : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+              [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+              [p0_l] "r" (p0_l),
+              [s1] "r" (s1)
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[q0_l],     (%[s1])    \n\t"
+            "sb     %[q1_l],    1(%[s1])    \n\t"
+            "sb     %[q2_l],    2(%[s1])    \n\t"
+            "sb     %[q3_l],    3(%[s1])    \n\t"
+            "sb     %[q4_l],    4(%[s1])    \n\t"
+            "sb     %[q5_l],    5(%[s1])    \n\t"
+            "sb     %[q6_l],    6(%[s1])    \n\t"
+
+            :
+            : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+              [q6_l] "r" (q6_l),
+              [s1] "r" (s1)
+        );
+      } else if (mask & flat & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p2_l_f1],     -3(%[s1])    \n\t"
+            "sb     %[p1_l_f1],     -2(%[s1])    \n\t"
+            "sb     %[p0_l_f1],     -1(%[s1])    \n\t"
+            "sb     %[q0_l_f1],       (%[s1])    \n\t"
+            "sb     %[q1_l_f1],     +1(%[s1])    \n\t"
+            "sb     %[q2_l_f1],     +2(%[s1])    \n\t"
+
+            :
+            : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+              [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+              [s1] "r" (s1)
+        );
+      } else if (mask & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s1])    \n\t"
+            "sb     %[p0_f0],   -1(%[s1])    \n\t"
+            "sb     %[q0_f0],     (%[s1])    \n\t"
+            "sb     %[q1_f0],   +1(%[s1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s1] "r" (s1)
+        );
+      }
+    }
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h b/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h
new file mode 100644
index 0000000000..62b170610b
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h
@@ -0,0 +1,246 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_LOOPFILTER_MSA_H_
+#define VPX_DSP_LOOPFILTER_MSA_H_
+
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
+                           p1_out, p0_out, q0_out, q1_out) {             \
+  v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                    \
+  v16i8 filt, filt1, filt2, cnst4b, cnst3b;                              \
+  v8i16 q0_sub_p0_r, filt_r, cnst3h;                                     \
+  /* filter p1,p0,q0,q1 (right 8 lanes only); bias bytes by 0x80 */      \
+  p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                               \
+  p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                               \
+  q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                               \
+  q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                               \
+  /* filt = saturating (p1 - q1), kept only where hev_in is set */       \
+  filt = __msa_subs_s_b(p1_m, q1_m);                                     \
+  filt = filt & (v16i8)hev_in;                                           \
+  q0_sub_p0 = q0_m - p0_m;                                               \
+  filt_sign = __msa_clti_s_b(filt, 0);                                   \
+  /* right half in 16-bit lanes: filt_r = filt + dotp(q0 - p0, 3) */     \
+  cnst3h = __msa_ldi_h(3);                                               \
+  q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0);               \
+  q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h);       \
+  filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                         \
+  filt_r += q0_sub_p0_r;                                                 \
+  filt_r = __msa_sat_s_h(filt_r, 7);                                     \
+                                                                         \
+  /* narrow back to bytes; only the right half was computed */           \
+  filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r);                    \
+  /* mask the filter, then filt1 = (filt + 4) >> 3 (saturating add) */   \
+  filt = filt & (v16i8)mask_in;                                          \
+  cnst4b = __msa_ldi_b(4);                                               \
+  filt1 = __msa_adds_s_b(filt, cnst4b);                                  \
+  filt1 >>= 3;                                                           \
+  /* filt2 = (filt + 3) >> 3 (saturating add) */                         \
+  cnst3b = __msa_ldi_b(3);                                               \
+  filt2 = __msa_adds_s_b(filt, cnst3b);                                  \
+  filt2 >>= 3;                                                           \
+  /* q0 -= filt1, p0 += filt2 (saturating), then undo the 0x80 bias */   \
+  q0_m = __msa_subs_s_b(q0_m, filt1);                                    \
+  q0_out = __msa_xori_b((v16u8)q0_m, 0x80);                              \
+  p0_m = __msa_adds_s_b(p0_m, filt2);                                    \
+  p0_out = __msa_xori_b((v16u8)p0_m, 0x80);                              \
+  /* NOTE: hev_in is inverted in place (caller's variable changes) */    \
+  filt = __msa_srari_b(filt1, 1);                                        \
+  hev_in = __msa_xori_b((v16u8)hev_in, 0xff);                            \
+  filt = filt & (v16i8)hev_in;                                           \
+  /* q1 -= filt, p1 += filt (saturating), then undo the 0x80 bias */     \
+  q1_m = __msa_subs_s_b(q1_m, filt);                                     \
+  q1_out = __msa_xori_b((v16u8)q1_m, 0x80);                              \
+  p1_m = __msa_adds_s_b(p1_m, filt);                                     \
+  p1_out = __msa_xori_b((v16u8)p1_m, 0x80);                              \
+}
+
+#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
+                           p1_out, p0_out, q0_out, q1_out) {             \
+  v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                    \
+  v16i8 filt, filt1, filt2, cnst4b, cnst3b;                              \
+  v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;                \
+  /* filter p1,p0,q0,q1 across all 16 lanes; bias bytes by 0x80 */       \
+  p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                               \
+  p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                               \
+  q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                               \
+  q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                               \
+  /* filt = saturating (p1 - q1), kept only where hev_in is set */       \
+  filt = __msa_subs_s_b(p1_m, q1_m);                                     \
+                                                                         \
+  filt = filt & (v16i8)hev_in;                                           \
+                                                                         \
+  q0_sub_p0 = q0_m - p0_m;                                               \
+  filt_sign = __msa_clti_s_b(filt, 0);                                   \
+  /* right half in 16-bit lanes: filt_r = filt + dotp(q0 - p0, 3) */     \
+  cnst3h = __msa_ldi_h(3);                                               \
+  q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0);               \
+  q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h);       \
+  filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                         \
+  filt_r += q0_sub_p0_r;                                                 \
+  filt_r = __msa_sat_s_h(filt_r, 7);                                     \
+  /* left half, same computation */                                      \
+  q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0);               \
+  q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h);       \
+  filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                         \
+  filt_l += q0_sub_p0_l;                                                 \
+  filt_l = __msa_sat_s_h(filt_l, 7);                                     \
+  /* pack both halves back to bytes, then apply mask_in */               \
+  filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);                    \
+  filt = filt & (v16i8)mask_in;                                          \
+  /* filt1 = (filt + 4) >> 3 (saturating add) */                         \
+  cnst4b = __msa_ldi_b(4);                                               \
+  filt1 = __msa_adds_s_b(filt, cnst4b);                                  \
+  filt1 >>= 3;                                                           \
+  /* filt2 = (filt + 3) >> 3 (saturating add) */                         \
+  cnst3b = __msa_ldi_b(3);                                               \
+  filt2 = __msa_adds_s_b(filt, cnst3b);                                  \
+  filt2 >>= 3;                                                           \
+  /* q0 -= filt1, p0 += filt2 (saturating), then undo the 0x80 bias */   \
+  q0_m = __msa_subs_s_b(q0_m, filt1);                                    \
+  q0_out = __msa_xori_b((v16u8)q0_m, 0x80);                              \
+  p0_m = __msa_adds_s_b(p0_m, filt2);                                    \
+  p0_out = __msa_xori_b((v16u8)p0_m, 0x80);                              \
+  /* NOTE: hev_in is inverted in place (caller's variable changes) */    \
+  filt = __msa_srari_b(filt1, 1);                                        \
+  hev_in = __msa_xori_b((v16u8)hev_in, 0xff);                            \
+  filt = filt & (v16i8)hev_in;                                           \
+  /* q1 -= filt, p1 += filt (saturating), then undo the 0x80 bias */     \
+  q1_m = __msa_subs_s_b(q1_m, filt);                                     \
+  q1_out = __msa_xori_b((v16u8)q1_m, 0x80);                              \
+  p1_m = __msa_adds_s_b(p1_m, filt);                                     \
+  p1_out = __msa_xori_b((v16u8)p1_m, 0x80);                              \
+}
+
+#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) {  \
+  v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;         \
+  v16u8 zero_in = { 0 };                                                 \
+  /* NOTE: reads `mask` from the caller's scope (not a parameter) */     \
+  tmp = __msa_ori_b(zero_in, 1);                                         \
+  p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                            \
+  q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                            \
+  p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                            \
+  q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                            \
+  /* flat_out is in/out: its incoming value joins the max below */       \
+  p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);                 \
+  flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                       \
+  p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);                 \
+  flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                       \
+  /* lanes whose max difference is <= 1 pass, gated by `mask` */         \
+  flat_out = (tmp < (v16u8)flat_out);                                    \
+  flat_out = __msa_xori_b(flat_out, 0xff);                               \
+  flat_out = flat_out & (mask);                                          \
+}
+
+#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
+                  q5_in, q6_in, q7_in, flat_in, flat2_out) {        \
+  v16u8 tmp, zero_in = { 0 };                                       \
+  v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;         \
+  v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;         \
+  /* abs differences of p4..p7 against p0 and q4..q7 against q0 */  \
+  tmp = __msa_ori_b(zero_in, 1);                                    \
+  p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                       \
+  q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                       \
+  p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                       \
+  q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                       \
+  p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                       \
+  q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                       \
+  p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                       \
+  q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                       \
+  /* fold all eight differences into one per-lane maximum */        \
+  p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);            \
+  flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);              \
+  flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);                \
+  p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);            \
+  flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);                \
+  p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);            \
+  flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);                \
+  /* lanes with max difference <= 1 pass, gated by flat_in */       \
+  flat2_out = (tmp < (v16u8)flat2_out);                             \
+  flat2_out = __msa_xori_b(flat2_out, 0xff);                        \
+  flat2_out = flat2_out & flat_in;                                  \
+}
+
+#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                  \
+                    q0_in, q1_in, q2_in, q3_in,                  \
+                    p2_filt8_out, p1_filt8_out, p0_filt8_out,    \
+                    q0_filt8_out, q1_filt8_out, q2_filt8_out) {  \
+  v8u16 tmp0, tmp1, tmp2;                                        \
+  /* each output = srari(weighted 8-sample sum, 3) */            \
+  tmp2 = p2_in + p1_in + p0_in;                                  \
+  tmp0 = p3_in << 1;                                             \
+  /* p2: 3*p3 + 2*p2 + p1 + p0 + q0 */                           \
+  tmp0 = tmp0 + tmp2 + q0_in;                                    \
+  tmp1 = tmp0 + p3_in + p2_in;                                   \
+  p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
+  /* p1: 2*p3 + p2 + 2*p1 + p0 + q0 + q1 */                      \
+  tmp1 = tmp0 + p1_in + q1_in;                                   \
+  p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
+  /* p0: p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 */                   \
+  tmp1 = q2_in + q1_in + q0_in;                                  \
+  tmp2 = tmp2 + tmp1;                                            \
+  tmp0 = tmp2 + (p0_in);                                         \
+  tmp0 = tmp0 + (p3_in);                                         \
+  p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3);           \
+  /* q2: p0 + q0 + q1 + 2*q2 + 3*q3 */                           \
+  tmp0 = q2_in + q3_in;                                          \
+  tmp0 = p0_in + tmp1 + tmp0;                                    \
+  tmp1 = q3_in + q3_in;                                          \
+  tmp1 = tmp1 + tmp0;                                            \
+  q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
+  /* q0: p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 */                   \
+  tmp0 = tmp2 + q3_in;                                           \
+  tmp1 = tmp0 + q0_in;                                           \
+  q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
+  /* q1: p1 + p0 + q0 + 2*q1 + q2 + 2*q3 */                      \
+  tmp1 = tmp0 - p2_in;                                           \
+  tmp0 = q1_in + q3_in;                                          \
+  tmp1 = tmp0 + tmp1;                                            \
+  q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3);           \
+}
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                 \
+                     q0_in, q1_in, q2_in, q3_in,                 \
+                     limit_in, b_limit_in, thresh_in,            \
+                     hev_out, mask_out, flat_out) {              \
+  v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
+  v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
+  /* outputs: hev_out, mask_out, and flat_out (a partial max) */ \
+  /* absolute subtraction of pixel values */                     \
+  p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
+  p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
+  p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
+  q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
+  q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
+  q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
+  p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
+  p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
+                                                                 \
+  /* calculation of hev */                                       \
+  flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
+  hev_out = thresh_in < (v16u8)flat_out;                         \
+                                                                 \
+  /* calculation of mask */                                      \
+  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
+  p1_asub_q1_m >>= 1;                                            \
+  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
+  /* lanes over b_limit become 0xff and dominate the max */      \
+  mask_out = b_limit_in < p0_asub_q0_m;                          \
+  mask_out = __msa_max_u_b(flat_out, mask_out);                  \
+  p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
+  mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
+  q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
+  mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
+  /* mask_out set where the folded max is <= limit_in */         \
+  mask_out = limit_in < (v16u8)mask_out;                         \
+  mask_out = __msa_xori_b(mask_out, 0xff);                       \
+}
+#endif  /* VPX_DSP_LOOPFILTER_MSA_H_ */
diff --git a/libs/libvpx/vpx_dsp/mips/macros_msa.h b/libs/libvpx/vpx_dsp/mips/macros_msa.h
new file mode 100644
index 0000000000..91e3615cf8
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/macros_msa.h
@@ -0,0 +1,1932 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_MACROS_MSA_H_
+#define VPX_DSP_MIPS_MACROS_MSA_H_
+
+#include <msa.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+/* Whole-vector load/store: access *psrc / *pdst as a single RTYPE value.
+   Expansions are fully parenthesized so they compose in any expression. */
+#define LD_B(RTYPE, psrc) (*((const RTYPE *)(psrc)))
+#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
+#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
+
+#define LD_H(RTYPE, psrc) (*((const RTYPE *)(psrc)))
+#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
+#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
+
+#define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc)))
+#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
+#define ST_H(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
+#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
+#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
+#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+
+#if (__mips_isa_rev >= 6)
+#define LH(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint16_t val_m;                                   \
+  /* callers must not name their argument psrc_m */ \
+  __asm__ __volatile__ (                            \
+      "lh  %[val_m],  %[psrc_m]  \n\t"              \
+  /* plain lh here; the pre-R6 branch uses ulh */   \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+  /* value of the statement expression */           \
+  val_m;                                            \
+})
+
+#define LW(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint32_t val_m;                                   \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "lw  %[val_m],  %[psrc_m]  \n\t"              \
+  /* plain lw here; the pre-R6 branch uses ulw */   \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+
+#if (__mips == 64)
+#define LD(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint64_t val_m = 0;                               \
+  /* callers must not name their argument psrc_m */ \
+  __asm__ __volatile__ (                            \
+      "ld  %[val_m],  %[psrc_m]  \n\t"              \
+  /* one ld; this variant needs __mips == 64 */     \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc) ({                                        \
+  const uint8_t *psrc_m1 = (const uint8_t *)(psrc);        \
+  uint32_t val0_m, val1_m;                                 \
+  uint64_t val_m = 0;                                      \
+  /* psrc_m1, not psrc_m: LW() declares its own psrc_m */  \
+  val0_m = LW(psrc_m1);                                    \
+  val1_m = LW(psrc_m1 + 4);                                \
+  /* word at +4 becomes the upper 32 bits */               \
+  val_m = (uint64_t)(val1_m);                              \
+  val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+  val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                           \
+  val_m;                                                   \
+})
+#endif  // (__mips == 64)
+
+#define SH(val, pdst) {                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);  \
+  const uint16_t val_m = (val);         \
+  /* don't pass pdst_m/val_m as args */ \
+  __asm__ __volatile__ (                \
+      "sh  %[val_m],  %[pdst_m]  \n\t"  \
+                                        \
+      : [pdst_m] "=m" (*pdst_m)         \
+      : [val_m] "r" (val_m)             \
+  );                                    \
+}
+
+#define SW(val, pdst) {                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);  \
+  const uint32_t val_m = (val);         \
+  /* val is converted to uint32_t */    \
+  __asm__ __volatile__ (                \
+      "sw  %[val_m],  %[pdst_m]  \n\t"  \
+                                        \
+      : [pdst_m] "=m" (*pdst_m)         \
+      : [val_m] "r" (val_m)             \
+  );                                    \
+}
+
+#define SD(val, pdst) {                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);  \
+  const uint64_t val_m = (val);         \
+  /* unlike LD, not split by __mips */  \
+  __asm__ __volatile__ (                \
+      "sd  %[val_m],  %[pdst_m]  \n\t"  \
+                                        \
+      : [pdst_m] "=m" (*pdst_m)         \
+      : [val_m] "r" (val_m)             \
+  );                                    \
+}
+#else  // !(__mips_isa_rev >= 6)
+#define LH(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint16_t val_m;                                   \
+  /* callers must not name their argument psrc_m */ \
+  __asm__ __volatile__ (                            \
+      "ulh  %[val_m],  %[psrc_m]  \n\t"             \
+  /* ulh here; the R6 branch uses plain lh */       \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+  /* value of the statement expression */           \
+  val_m;                                            \
+})
+
+#define LW(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint32_t val_m;                                   \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "ulw  %[val_m],  %[psrc_m]  \n\t"             \
+  /* ulw here; the R6 branch uses plain lw */       \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+
+#if (__mips == 64)
+#define LD(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint64_t val_m = 0;                               \
+  /* callers must not name their argument psrc_m */ \
+  __asm__ __volatile__ (                            \
+      "uld  %[val_m],  %[psrc_m]  \n\t"             \
+  /* one uld; this variant needs __mips == 64 */    \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc) ({                                        \
+  const uint8_t *psrc_m1 = (const uint8_t *)(psrc);        \
+  uint32_t val0_m, val1_m;                                 \
+  uint64_t val_m = 0;                                      \
+  /* psrc_m1 avoids clashing with LW()'s own psrc_m */     \
+  val0_m = LW(psrc_m1);                                    \
+  val1_m = LW(psrc_m1 + 4);                                \
+  /* word at +4 becomes the upper 32 bits */               \
+  val_m = (uint64_t)(val1_m);                              \
+  val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+  val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                           \
+  val_m;                                                   \
+})
+#endif  // (__mips == 64)
+
+#define SH(val, pdst) {                  \
+  uint8_t *pdst_m = (uint8_t *)(pdst);   \
+  const uint16_t val_m = (val);          \
+  /* don't pass pdst_m/val_m as args */  \
+  __asm__ __volatile__ (                 \
+      "ush  %[val_m],  %[pdst_m]  \n\t"  \
+                                         \
+      : [pdst_m] "=m" (*pdst_m)          \
+      : [val_m] "r" (val_m)              \
+  );                                     \
+}
+
+#define SW(val, pdst) {                  \
+  uint8_t *pdst_m = (uint8_t *)(pdst);   \
+  const uint32_t val_m = (val);          \
+  /* val is converted to uint32_t */     \
+  __asm__ __volatile__ (                 \
+      "usw  %[val_m],  %[pdst_m]  \n\t"  \
+                                         \
+      : [pdst_m] "=m" (*pdst_m)          \
+      : [val_m] "r" (val_m)              \
+  );                                     \
+}
+
+#define SD(val, pdst) {                                     \
+  uint8_t *pdst_m1 = (uint8_t *)(pdst);                     \
+  uint32_t val0_m, val1_m;                                  \
+  /* val is expanded twice; keep it side-effect free */     \
+  val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);          \
+  val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF);  \
+  /* low word at pdst, high word at pdst + 4 */             \
+  SW(val0_m, pdst_m1);                                      \
+  SW(val1_m, pdst_m1 + 4);                                  \
+}
+#endif  // (__mips_isa_rev >= 6)
+
+/* Description : Load 4 words with stride
+   Arguments   : Inputs  - psrc, stride (any expression; parenthesized here)
+                 Outputs - out0, out1, out2, out3
+   Details     : Load word in 'out0' from (psrc)
+                 Load word in 'out1' from (psrc + stride)
+                 Load word in 'out2' from (psrc + 2 * stride)
+                 Load word in 'out3' from (psrc + 3 * stride)
+*/
+#define LW4(psrc, stride, out0, out1, out2, out3) {  \
+  out0 = LW((psrc));                                 \
+  out1 = LW((psrc) + (stride));                      \
+  out2 = LW((psrc) + 2 * (stride));                  \
+  out3 = LW((psrc) + 3 * (stride));                  \
+}
+
+/* Description : Load double words with stride
+   Arguments   : Inputs  - psrc, stride (any expression; parenthesized here)
+                 Outputs - out0, out1 (LD4 also fills out2, out3)
+   Details     : Load double word in 'out0' from (psrc)
+                 Load double word in 'out1' from (psrc + stride)
+*/
+#define LD2(psrc, stride, out0, out1) {  \
+  out0 = LD((psrc));                     \
+  out1 = LD((psrc) + (stride));          \
+}
+#define LD4(psrc, stride, out0, out1, out2, out3) {  \
+  LD2((psrc), stride, out0, out1);                   \
+  LD2((psrc) + 2 * (stride), stride, out2, out3);    \
+}
+
+/* Description : Store 4 words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride (any expression)
+   Details     : Store word from 'in0' to (pdst)
+                 Store word from 'in1' to (pdst + stride)
+                 Store word from 'in2' to (pdst + 2 * stride)
+                 Store word from 'in3' to (pdst + 3 * stride)
+*/
+#define SW4(in0, in1, in2, in3, pdst, stride) {  \
+  SW(in0, (pdst));                               \
+  SW(in1, (pdst) + (stride));                    \
+  SW(in2, (pdst) + 2 * (stride));                \
+  SW(in3, (pdst) + 3 * (stride));                \
+}
+
+/* Description : Store 4 double words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride (any expression)
+   Details     : Store double word from 'in0' to (pdst)
+                 Store double word from 'in1' to (pdst + stride)
+                 Store double word from 'in2' to (pdst + 2 * stride)
+                 Store double word from 'in3' to (pdst + 3 * stride)
+*/
+#define SD4(in0, in1, in2, in3, pdst, stride) {  \
+  SD(in0, (pdst));                               \
+  SD(in1, (pdst) + (stride));                    \
+  SD(in2, (pdst) + 2 * (stride));                \
+  SD(in3, (pdst) + 3 * (stride));                \
+}
+
+/* Description : Load vectors with 16 byte elements with stride
+   Arguments   : Inputs  - psrc, stride (any expression; parenthesized here)
+                 Outputs - out0, out1 (LD_B3..LD_B8 fill further outputs)
+                 Return Type - as per RTYPE
+   Details     : Load 16 byte elements in 'out0' from (psrc)
+                 Load 16 byte elements in 'out1' from (psrc + stride)
+*/
+#define LD_B2(RTYPE, psrc, stride, out0, out1) {  \
+  out0 = LD_B(RTYPE, (psrc));                     \
+  out1 = LD_B(RTYPE, (psrc) + (stride));          \
+}
+#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
+
+#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) {  \
+  LD_B2(RTYPE, (psrc), stride, out0, out1);             \
+  out2 = LD_B(RTYPE, (psrc) + 2 * (stride));            \
+}
+#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) {  \
+  LD_B2(RTYPE, (psrc), stride, out0, out1);                   \
+  LD_B2(RTYPE, (psrc) + 2 * (stride), stride, out2, out3);    \
+}
+#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
+
+#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) {  \
+  LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);             \
+  out4 = LD_B(RTYPE, (psrc) + 4 * (stride));                        \
+}
+#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
+#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
+
+#define LD_B7(RTYPE, psrc, stride,                             \
+              out0, out1, out2, out3, out4, out5, out6) {      \
+  LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
+  LD_B2(RTYPE, (psrc) + 5 * (stride), stride, out5, out6);     \
+}
+#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
+
+#define LD_B8(RTYPE, psrc, stride,                                      \
+              out0, out1, out2, out3, out4, out5, out6, out7) {         \
+  LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);                 \
+  LD_B4(RTYPE, (psrc) + 4 * (stride), stride, out4, out5, out6, out7);  \
+}
+#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
+
+/* Description : Load vectors with 8 halfword elements with stride
+   Arguments   : Inputs  - psrc, stride (any expression; parenthesized here)
+                 Outputs - out0, out1 (LD_H4..LD_H16 fill further outputs)
+   Details     : Load 8 halfword elements in 'out0' from (psrc)
+                 Load 8 halfword elements in 'out1' from (psrc + stride)
+*/
+#define LD_H2(RTYPE, psrc, stride, out0, out1) {  \
+  out0 = LD_H(RTYPE, (psrc));                     \
+  out1 = LD_H(RTYPE, (psrc) + (stride));          \
+}
+#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
+
+#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) {  \
+  LD_H2(RTYPE, (psrc), stride, out0, out1);                   \
+  LD_H2(RTYPE, (psrc) + 2 * (stride), stride, out2, out3);    \
+}
+#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
+
+#define LD_H8(RTYPE, psrc, stride,                                      \
+              out0, out1, out2, out3, out4, out5, out6, out7) {         \
+  LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);                 \
+  LD_H4(RTYPE, (psrc) + 4 * (stride), stride, out4, out5, out6, out7);  \
+}
+#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
+
+#define LD_H16(RTYPE, psrc, stride,                                     \
+               out0, out1, out2, out3, out4, out5, out6, out7,          \
+               out8, out9, out10, out11, out12, out13, out14, out15) {  \
+  LD_H8(RTYPE, (psrc), stride,                                          \
+        out0, out1, out2, out3, out4, out5, out6, out7);                \
+  LD_H8(RTYPE, (psrc) + 8 * (stride), stride,                           \
+        out8, out9, out10, out11, out12, out13, out14, out15);          \
+}
+#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
+
+/* Description : Load 4x4 block of signed halfword elements from 1D source
+                 data into 4 vectors (Each vector with 4 signed halfwords)
+   Arguments   : Input   - psrc (any pointer expression; parenthesized here)
+                 Outputs - out0, out2 (loaded); out1, out3 (via __msa_ilvl_d)
+*/
+#define LD4x4_SH(psrc, out0, out1, out2, out3) {         \
+  out0 = LD_SH(psrc);                                    \
+  out2 = LD_SH((psrc) + 8);                              \
+  out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);  \
+  out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2);  \
+}
+
+/* Description : Load 2 vectors of signed word elements with stride
+   Arguments   : Inputs  - psrc, stride (any expressions; parenthesized here)
+                 Outputs - out0 from (psrc), out1 from (psrc + stride)
+                 Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1) {  \
+  out0 = LD_SW((psrc));                     \
+  out1 = LD_SW((psrc) + (stride));          \
+}
+
+/* Description : Store vectors of 16 byte elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride (ST_B4/ST_B8: 4/8 vectors)
+   Details     : Store 16 byte elements from 'in0' to (pdst)
+                 Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride) {  \
+  ST_B(RTYPE, in0, (pdst));                     \
+  ST_B(RTYPE, in1, (pdst) + (stride));          \
+}
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) {    \
+  ST_B2(RTYPE, in0, in1, (pdst), (stride));                 \
+  ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride));  \
+}
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+
+#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,          \
+              pdst, stride) {                                         \
+  ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                 \
+  ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));  \
+}
+#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride (ST_H4/ST_H8: 4/8 vectors)
+   Details     : Store 8 halfword elements from 'in0' to (pdst)
+                 Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) {  \
+  ST_H(RTYPE, in0, (pdst));                     \
+  ST_H(RTYPE, in1, (pdst) + (stride));          \
+}
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) {    \
+  ST_H2(RTYPE, in0, in1, (pdst), (stride));                 \
+  ST_H2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride));  \
+}
+#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
+
+#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) {  \
+  ST_H4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                         \
+  ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));          \
+}
+#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
+
+/* Description : Store vectors of word elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride (pdst/stride parenthesized)
+   Details     : Store 4 word elements from 'in0' to (pdst)
+                 Store 4 word elements from 'in1' to (pdst + stride)
+*/
+#define ST_SW2(in0, in1, pdst, stride) {  \
+  ST_SW(in0, (pdst));                     \
+  ST_SW(in1, (pdst) + (stride));          \
+}
+
+/* Description : Store 2x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in, stidx, pdst, stride (stride counted in bytes)
+   Details     : Index 'stidx' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst)
+                 Index 'stidx+1' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + stride)
+                 Index 'stidx+2' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 2 * stride)
+                 Index 'stidx+3' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 3 * stride)
+*/
+#define ST2x4_UB(in, stidx, pdst, stride) {           \
+  uint16_t out0_m, out1_m, out2_m, out3_m;            \
+  uint8_t *pblk_2x4_m = (uint8_t *)(pdst);            \
+                                                      \
+  out0_m = __msa_copy_u_h((v8i16)(in), (stidx));      \
+  out1_m = __msa_copy_u_h((v8i16)(in), (stidx) + 1);  \
+  out2_m = __msa_copy_u_h((v8i16)(in), (stidx) + 2);  \
+  out3_m = __msa_copy_u_h((v8i16)(in), (stidx) + 3);  \
+                                                      \
+  SH(out0_m, pblk_2x4_m);                             \
+  SH(out1_m, pblk_2x4_m + (stride));                  \
+  SH(out2_m, pblk_2x4_m + 2 * (stride));              \
+  SH(out3_m, pblk_2x4_m + 3 * (stride));              \
+}
+
+/* Description : Store 4x2 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst, stride (stride counted in bytes)
+   Details     : Index 0 word element from 'in' vector is copied to the GP
+                 register and stored to (pdst)
+                 Index 1 word element from 'in' vector is copied to the GP
+                 register and stored to (pdst + stride)
+*/
+#define ST4x2_UB(in, pdst, stride) {        \
+  uint32_t out0_m, out1_m;                  \
+  uint8_t *pblk_4x2_m = (uint8_t *)(pdst);  \
+                                            \
+  out0_m = __msa_copy_u_w((v4i32)(in), 0);  \
+  out1_m = __msa_copy_u_w((v4i32)(in), 1);  \
+                                            \
+  SW(out0_m, pblk_4x2_m);                   \
+  SW(out1_m, pblk_4x2_m + (stride));        \
+}
+
+/* Description : Store 4x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
+   Details     : 'idx0' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst)
+                 'idx1' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst + stride)
+                 'idx2' word element from input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 'idx3' word element from input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) {  \
+  uint32_t out0_m, out1_m, out2_m, out3_m;                          \
+  uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                          \
+                                                                    \
+  out0_m = __msa_copy_u_w((v4i32)(in0), idx0);                      \
+  out1_m = __msa_copy_u_w((v4i32)(in0), idx1);                      \
+  out2_m = __msa_copy_u_w((v4i32)(in1), idx2);                      \
+  out3_m = __msa_copy_u_w((v4i32)(in1), idx3);                      \
+                                                                    \
+  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride));        \
+}
+#define ST4x8_UB(in0, in1, pdst, stride) {  /* words 0-3 of in0, then in1 */  \
+  uint8_t *pblk_4x8_m = (uint8_t *)(pdst);                                    \
+                                                                              \
+  ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8_m, (stride));                       \
+  ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8_m + 4 * (stride), (stride));        \
+}
+
+/* Description : Store 8x1 byte block to destination memory from input vector
+   Arguments   : Inputs - in (any vector expression; parenthesized), pdst
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
+*/
+#define ST8x1_UB(in, pdst) {                \
+  uint64_t out0_m;                          \
+                                            \
+  out0_m = __msa_copy_u_d((v2i64)(in), 0);  \
+  SD(out0_m, pdst);                         \
+}
+
+/* Description : Store 8x2 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst, stride (stride counted in bytes)
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride) {        \
+  uint64_t out0_m, out1_m;                  \
+  uint8_t *pblk_8x2_m = (uint8_t *)(pdst);  \
+                                            \
+  out0_m = __msa_copy_u_d((v2i64)(in), 0);  \
+  out1_m = __msa_copy_u_d((v2i64)(in), 1);  \
+                                            \
+  SD(out0_m, pblk_8x2_m);                   \
+  SD(out1_m, pblk_8x2_m + (stride));        \
+}
+
+/* Description : Store 8x4 byte block to destination memory from input
+                 vectors
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Index 0 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst + stride)
+                 Index 0 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 Index 1 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST8x4_UB(in0, in1, pdst, stride) {                    \
+  uint64_t out0_m, out1_m, out2_m, out3_m;                    \
+  uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                    \
+                                                              \
+  out0_m = __msa_copy_u_d((v2i64)(in0), 0);                   \
+  out1_m = __msa_copy_u_d((v2i64)(in0), 1);                   \
+  out2_m = __msa_copy_u_d((v2i64)(in1), 0);                   \
+  out3_m = __msa_copy_u_d((v2i64)(in1), 1);                   \
+                                                              \
+  SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride));  \
+}
+
+/* Description : average with rounding (in0 + in1 + 1) / 2.
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0 (from in0, in1), out1 (from in2, in3)
+                 Return Type - as per RTYPE
+   Details     : Each unsigned byte element from 'in0' vector is added with
+                 each unsigned byte element from 'in1' vector. Then the average
+                 with rounding is calculated and written to 'out0'
+*/
+#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) {    \
+  out0 = (RTYPE)__msa_aver_u_b((v16u8)(in0), (v16u8)(in1));  \
+  out1 = (RTYPE)__msa_aver_u_b((v16u8)(in2), (v16u8)(in3));  \
+}
+#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
+
+#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) {                       \
+  AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
+
+/* Description : Immediate number of elements to slide with zero
+   Arguments   : Inputs  - in0, in1, slide_val
+                 Outputs - out0 (from in0), out1 (from in1)
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
+                 value specified in the 'slide_val'
+*/
+#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) {            \
+  v16i8 zero_m = { 0 };                                                \
+  out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in0), slide_val);  \
+  out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in1), slide_val);  \
+}
+#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
+
+#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
+                  out0, out1, out2, out3, slide_val) {  \
+  SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);    \
+  SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);    \
+}
+#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
+
+/* Description : Immediate number of elements to slide
+   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
+                 Outputs - out0 (in0_0, in1_0), out1 (in0_1, in1_1)
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
+                 value specified in the 'slide_val'
+*/
+#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) {  \
+  out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), slide_val);     \
+  out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), slide_val);     \
+}
+#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
+#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
+
+#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,          \
+                out0, out1, out2, slide_val) {                            \
+  SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);      \
+  out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), slide_val);  \
+}
+#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
+#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
+
+/* Description : Shuffle byte vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1 (out1 uses 'in2', 'in3' and 'mask1')
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) {     \
+  out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0));  \
+  out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2));  \
+}
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+
+#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,     \
+                out0, out1, out2, out3) {                        \
+  VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
+  VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
+}
+#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
+#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0 (mult0, cnst0), out1 (mult1, cnst1)
+                 Return Type - as per RTYPE
+   Details     : Unsigned byte elements from 'mult0' are multiplied with
+                 unsigned byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. unsigned halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
+  out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0));    \
+  out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1));    \
+}
+#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
+
+#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,         \
+                 cnst0, cnst1, cnst2, cnst3,                \
+                 out0, out1, out2, out3) {                  \
+  DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
+  DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
+}
+#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0 (mult0, cnst0), out1 (mult1, cnst1)
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
+  out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0));    \
+  out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1));    \
+}
+#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
+
+#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
+                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
+  DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
+  DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
+}
+#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of halfword vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0 (mult0, cnst0), out1 (mult1, cnst1)
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed word.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
+  out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0));    \
+  out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1));    \
+}
+#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
+
+#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,         \
+                 cnst0, cnst1, cnst2, cnst3,                \
+                 out0, out1, out2, out3) {                  \
+  DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
+  DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
+}
+#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product of word vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0 (mult0, cnst0), out1 (mult1, cnst1)
+                 Return Type - as per RTYPE
+   Details     : Signed word elements from 'mult0' are multiplied with
+                 signed word elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed double word.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
+  out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0));    \
+  out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1));    \
+}
+#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
+
+/* Description : Dot product & addition of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1 (read as accumulators, then overwritten)
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {             \
+  out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)(mult0), (v16i8)(cnst0));  \
+  out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)(mult1), (v16i8)(cnst1));  \
+}
+#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
+
+#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
+                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
+  DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
+  DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
+}
+#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product & addition of halfword vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1 (read as accumulators, then overwritten)
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed word.
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {             \
+  out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)(mult0), (v8i16)(cnst0));  \
+  out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)(mult1), (v8i16)(cnst1));  \
+}
+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
+
+/* Description : Dot product & addition of double word vector elements
+   Arguments   : Inputs  - mult0, mult1
+                 Outputs - out0, out1 (read as accumulators, then overwritten)
+                 Return Type - as per RTYPE
+   Details     : Each signed word element from 'mult0' is multiplied with itself
+                 producing an intermediate result twice the size of input
+                 i.e. signed double word
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) {                           \
+  out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)(mult0), (v4i32)(mult0));  \
+  out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)(mult1), (v4i32)(mult1));  \
+}
+#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
+
+/* Description : Minimum values between unsigned elements of
+                 either vector are copied to the output vector
+   Arguments   : Inputs  - in0, in1, min_vec (min_vec is passed uncast)
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Minimum of unsigned halfword element values from 'in0' and
+                 'min_vec' are written to output vector 'in0'
+*/
+#define MIN_UH2(RTYPE, in0, in1, min_vec) {           \
+  in0 = (RTYPE)__msa_min_u_h((v8u16)(in0), min_vec);  \
+  in1 = (RTYPE)__msa_min_u_h((v8u16)(in1), min_vec);  \
+}
+#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
+
+#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) {  \
+  MIN_UH2(RTYPE, in0, in1, min_vec);                   \
+  MIN_UH2(RTYPE, in2, in3, min_vec);                   \
+}
+#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Clips all signed halfword elements of input vector
+                 between 0 & 255
+   Arguments   : Input  - in (CLIP_SH2/CLIP_SH4 variants clip in place)
+                 Output - out_m (value of the GNU statement expression)
+                 Return Type - signed halfword
+*/
+#define CLIP_SH_0_255(in) ({                          \
+  v8i16 max_m = __msa_ldi_h(255);                     \
+  v8i16 out_m;                                        \
+                                                      \
+  out_m = __msa_maxi_s_h((v8i16)(in), 0);             \
+  out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m);  \
+  out_m;                                              \
+})
+#define CLIP_SH2_0_255(in0, in1) {  \
+  in0 = CLIP_SH_0_255(in0);         \
+  in1 = CLIP_SH_0_255(in1);         \
+}
+#define CLIP_SH4_0_255(in0, in1, in2, in3) {  \
+  CLIP_SH2_0_255(in0, in1);                   \
+  CLIP_SH2_0_255(in2, in3);                   \
+}
+
+/* Description : Horizontal addition of 4 signed word elements of input vector
+   Arguments   : Input  - in       (signed word vector; any expression)
+                 Output - sum_m    (i32 sum)
+                 Return Type - signed word (GP)
+   Details     : 4 signed word elements of 'in' vector are added together and
+                 the resulting integer sum is returned
+*/
+#define HADD_SW_S32(in) ({                            \
+  v2i64 res0_m, res1_m;                               \
+  int32_t sum_m;                                      \
+                                                      \
+  res0_m = __msa_hadd_s_d((v4i32)(in), (v4i32)(in));  \
+  res1_m = __msa_splati_d(res0_m, 1);                 \
+  res0_m = res0_m + res1_m;                           \
+  sum_m = __msa_copy_s_w((v4i32)res0_m, 0);           \
+  sum_m;                                              \
+})
+
+/* Description : Horizontal addition of 8 unsigned halfword elements
+   Arguments   : Inputs  - in       (unsigned halfword vector; any expression)
+                 Outputs - sum_m    (u32 sum)
+                 Return Type - unsigned word
+   Details     : 8 unsigned halfword elements of input vector are added
+                 together and the resulting integer sum is returned
+*/
+#define HADD_UH_U32(in) ({                            \
+  v4u32 res_m;                                        \
+  v2u64 res0_m, res1_m;                               \
+  uint32_t sum_m;                                     \
+                                                      \
+  res_m = __msa_hadd_u_w((v8u16)(in), (v8u16)(in));   \
+  res0_m = __msa_hadd_u_d(res_m, res_m);              \
+  res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);   \
+  res0_m = res0_m + res1_m;                           \
+  sum_m = __msa_copy_u_w((v4i32)res0_m, 0);           \
+  sum_m;                                              \
+})
+
+/* Description : Horizontal addition of unsigned byte vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0 (from in0), out1 (from in1)
+                 Return Type - as per RTYPE
+   Details     : Each unsigned odd byte element from 'in0' is added to
+                 even unsigned byte element from 'in0' (pairwise) and the
+                 halfword result is written to 'out0'
+*/
+#define HADD_UB2(RTYPE, in0, in1, out0, out1) {              \
+  out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0));  \
+  out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1));  \
+}
+#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
+
+#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  HADD_UB2(RTYPE, in0, in1, out0, out1);                               \
+  HADD_UB2(RTYPE, in2, in3, out2, out3);                               \
+}
+#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
+
+/* Description : Horizontal subtraction of unsigned byte vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0 (from in0), out1 (from in1)
+                 Return Type - as per RTYPE
+   Details     : Each unsigned odd byte element from 'in0' is subtracted from
+                 even unsigned byte element from 'in0' (pairwise) and the
+                 halfword result is written to 'out0'
+*/
+#define HSUB_UB2(RTYPE, in0, in1, out0, out1) {              \
+  out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0));  \
+  out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1));  \
+}
+#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
+
+/* Description : SAD (Sum of Absolute Difference)
+   Arguments   : Inputs  - in0, in1, ref0, ref1
+                 Outputs - sad_m                 (halfword vector)
+                 Return Type - unsigned halfword
+   Details     : Absolute differences of 'in0' vs 'ref0' and 'in1' vs 'ref1'
+                 are kept in 'diff0_m'/'diff1_m'. Then even-odd pairs of each
+                 are added together and accumulated into 8 halfword results.
+*/
+#define SAD_UB2_UH(in0, in1, ref0, ref1) ({                 \
+  v16u8 diff0_m, diff1_m;                                   \
+  v8u16 sad_m = { 0 };                                      \
+                                                            \
+  diff0_m = __msa_asub_u_b((v16u8)(in0), (v16u8)(ref0));    \
+  diff1_m = __msa_asub_u_b((v16u8)(in1), (v16u8)(ref1));    \
+                                                            \
+  sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m);  \
+  sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m);  \
+                                                            \
+  sad_m;                                                    \
+})
+
+/* Description : Horizontal subtraction of signed halfword vector elements
+   Arguments   : Inputs  - in0, in1 (cast to v8i16 despite the 'UH' name)
+                 Outputs - out0 (from in0), out1 (from in1)
+                 Return Type - as per RTYPE
+   Details     : Each signed odd halfword element from 'in0' is subtracted from
+                 even signed halfword element from 'in0' (pairwise) and the
+                 word result is written to 'out0'
+*/
+#define HSUB_UH2(RTYPE, in0, in1, out0, out1) {              \
+  out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0));  \
+  out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1));  \
+}
+#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
+
+/* Description : Set elements of a vector to GPR values
+   Arguments   : Inputs - in0, in1 (plus in2, in3 for INSERT_W4)
+                 Output - out (read, updated element by element, written back)
+                 Return Type - as per RTYPE
+   Details     : Element k of vector 'out' is set to the value of 'in<k>'
+*/
+#define INSERT_W2(RTYPE, in0, in1, out) {             \
+  out = (RTYPE)__msa_insert_w((v4i32)(out), 0, in0);  \
+  out = (RTYPE)__msa_insert_w((v4i32)(out), 1, in1);  \
+}
+#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
+
+#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) {   \
+  out = (RTYPE)__msa_insert_w((v4i32)(out), 0, in0);  \
+  out = (RTYPE)__msa_insert_w((v4i32)(out), 1, in1);  \
+  out = (RTYPE)__msa_insert_w((v4i32)(out), 2, in2);  \
+  out = (RTYPE)__msa_insert_w((v4i32)(out), 3, in3);  \
+}
+#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
+#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
+
+#define INSERT_D2(RTYPE, in0, in1, out) {             \
+  out = (RTYPE)__msa_insert_d((v2i64)(out), 0, in0);  \
+  out = (RTYPE)__msa_insert_d((v2i64)(out), 1, in1);  \
+}
+#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0 (from in0, in1), out1 (from in2, in3)
+                 Return Type - as per RTYPE
+   Details     : Even byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0)); \
+  out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2)); \
+}
+#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
+#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0 (from in0, in1), out1 (from in2, in3)
+                 Return Type - as per RTYPE
+   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0)); \
+  out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2)); \
+}
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
+#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0 (from in0, in1), out1 (from in2, in3)
+                 Return Type - as per RTYPE
+   Details     : Even word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0)); \
+  out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2)); \
+}
+#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even double word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'; 'in2' and 'in3' likewise to 'out1'
+*/
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
+  out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0));  \
+  out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2));  \
+}
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'; 'in2' and 'in3' likewise to 'out1'.
+*/
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
+  out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));  \
+  out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3));  \
+}
+#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
+#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
+#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+
+#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
+
+/* Description : Interleave left half of halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of halfword elements of 'in0' and 'in1' are
+                 interleaved into 'out0'; 'in2' and 'in3' likewise into 'out1'.
+*/
+#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
+  out0 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1));  \
+  out1 = (RTYPE)__msa_ilvl_h((v8i16)(in2), (v8i16)(in3));  \
+}
+#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave left half of word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'; 'in2' and 'in3' likewise to 'out1'.
+*/
+#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
+  out0 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1));  \
+  out1 = (RTYPE)__msa_ilvl_w((v4i32)(in2), (v4i32)(in3));  \
+}
+#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
+#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'; 'in2' and 'in3' likewise to 'out1'.
+*/
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
+  out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));  \
+  out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3));  \
+}
+#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
+#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
+#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
+#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+
+#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
+#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
+#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
+#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
+
+#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
+                in8, in9, in10, in11, in12, in13, in14, in15,      \
+                out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,           \
+          out0, out1, out2, out3);                                 \
+  ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,     \
+          out4, out5, out6, out7);                                 \
+}
+#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
+
+/* Description : Interleave right half of halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of halfword elements of 'in0' and 'in1' are
+                 interleaved into 'out0'; 'in2' and 'in3' likewise into 'out1'.
+*/
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
+  out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1));  \
+  out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3));  \
+}
+#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+
+#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
+
+#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
+  out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1));  \
+  out1 = (RTYPE)__msa_ilvr_w((v4i32)(in2), (v4i32)(in3));  \
+}
+#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
+#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
+
+#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave right half of double word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of double word elements of 'in0' and 'in1' are
+                 interleaved into 'out0'; 'in2' and 'in3' likewise into 'out1'.
+*/
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
+  out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));  \
+  out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));  \
+}
+#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
+#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
+#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
+
+#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) {  \
+  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                         \
+  out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));                 \
+}
+#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
+
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of elements from 'in0' and 'in1' are interleaved
+                 and written to 'out0'; the left half is written to 'out1'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1) {            \
+  out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));  \
+  out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));  \
+}
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1) {            \
+  out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1));  \
+  out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1));  \
+}
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1) {            \
+  out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1));  \
+  out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1));  \
+}
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+                 unsigned value of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs  - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each unsigned halfword element of 'in0' and 'in1' is saturated
+                 to the value generated with (sat_val + 1) bit range.
+                 The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val) {           \
+  in0 = (RTYPE)__msa_sat_u_h((v8u16)(in0), sat_val);  \
+  in1 = (RTYPE)__msa_sat_u_h((v8u16)(in1), sat_val);  \
+}
+#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
+
+#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) {  \
+  SAT_UH2(RTYPE, in0, in1, sat_val);                   \
+  SAT_UH2(RTYPE, in2, in3, sat_val);                   \
+}
+#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the
+                 signed range of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs  - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each signed halfword element of 'in0' and 'in1' is saturated
+                 to the signed range representable in (sat_val + 1) bits
+                 The results are written in place
+*/
+#define SAT_SH2(RTYPE, in0, in1, sat_val) {           \
+  in0 = (RTYPE)__msa_sat_s_h((v8i16)(in0), sat_val);  \
+  in1 = (RTYPE)__msa_sat_s_h((v8i16)(in1), sat_val);  \
+}
+#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
+
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) {  \
+  SAT_SH2(RTYPE, in0, in1, sat_val);                   \
+  SAT_SH2(RTYPE, in2, in3, sat_val);                   \
+}
+#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Indexed halfword element values are replicated to all
+                 elements in output vector
+   Arguments   : Inputs  - in, idx0, idx1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : 'idx0' element value from 'in' vector is replicated to all
+                  elements in 'out0' vector; 'idx1' likewise fills 'out1'
+                  Valid index range for halfword operation is 0-7
+*/
+#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) {  \
+  out0 = (RTYPE)__msa_splati_h((v8i16)(in), idx0);      \
+  out1 = (RTYPE)__msa_splati_h((v8i16)(in), idx1);      \
+}
+#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
+
+#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
+                  out0, out1, out2, out3) {           \
+  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);       \
+  SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);       \
+}
+#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
+#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even byte elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even byte elements of 'in0' are copied to the left half of
+                 'out0' & even byte elements of 'in1' are copied to the right
+                 half of 'out0'. 'in2' and 'in3' are packed into 'out1'.
+*/
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
+  out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1));  \
+  out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3));  \
+}
+#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
+#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
+
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) {                       \
+  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even halfword elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even halfword elements of 'in0' are copied to the left half of
+                 'out0' & even halfword elements of 'in1' are copied to the
+                 right half of 'out0'. 'in2' and 'in3' are packed into 'out1'.
+*/
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
+  out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1));  \
+  out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3));  \
+}
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
+
+#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) {                       \
+  PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even double word elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even double word elements of 'in0' are copied to the left half
+                 of 'out0' & even double word elements of 'in1' are copied to
+                 the right half of 'out0'. 'in2'/'in3' are packed into 'out1'.
+*/
+#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
+  out0 = (RTYPE)__msa_pckev_d((v2i64)(in0), (v2i64)(in1));  \
+  out1 = (RTYPE)__msa_pckev_d((v2i64)(in2), (v2i64)(in3));  \
+}
+#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
+#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
+
+#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) {                       \
+  PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
+
+/* Description : Each byte element is logically xor'ed with immediate 128
+   Arguments   : Inputs  - in0, in1
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each unsigned byte element from input vectors 'in0' and 'in1'
+                 is logically xor'ed with 128 and the result is stored in-place.
+*/
+#define XORI_B2_128(RTYPE, in0, in1) {           \
+  in0 = (RTYPE)__msa_xori_b((v16u8)(in0), 128);  \
+  in1 = (RTYPE)__msa_xori_b((v16u8)(in1), 128);  \
+}
+#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
+#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
+
+#define XORI_B3_128(RTYPE, in0, in1, in2) {      \
+  XORI_B2_128(RTYPE, in0, in1);                  \
+  in2 = (RTYPE)__msa_xori_b((v16u8)(in2), 128);  \
+}
+#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
+
+#define XORI_B4_128(RTYPE, in0, in1, in2, in3) {  \
+  XORI_B2_128(RTYPE, in0, in1);                   \
+  XORI_B2_128(RTYPE, in2, in3);                   \
+}
+#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
+#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
+
+#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) {  \
+  XORI_B4_128(RTYPE, in0, in1, in2, in3);                        \
+  XORI_B3_128(RTYPE, in4, in5, in6);                             \
+}
+#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
+
+/* Description : Average of signed halfword elements -> (a + b) / 2
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3
+                 Return Type - as per RTYPE
+   Details     : Each signed halfword element from 'in0' is added to each
+                 signed halfword element of 'in1' with full precision resulting
+                 in one extra bit in the result. The result is then divided by
+                 2 and written to 'out0'; later pairs go to 'out1'..'out3'
+*/
+#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  out0 = (RTYPE)__msa_ave_s_h((v8i16)(in0), (v8i16)(in1));      \
+  out1 = (RTYPE)__msa_ave_s_h((v8i16)(in2), (v8i16)(in3));      \
+  out2 = (RTYPE)__msa_ave_s_h((v8i16)(in4), (v8i16)(in5));      \
+  out3 = (RTYPE)__msa_ave_s_h((v8i16)(in6), (v8i16)(in7));      \
+}
+#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Addition of signed halfword elements and signed saturation
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'in0' are added to signed
+                 halfword elements of 'in1'. The result is then signed saturated
+                 between halfword data type range and written to 'out0'
+*/
+#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) {    \
+  out0 = (RTYPE)__msa_adds_s_h((v8i16)(in0), (v8i16)(in1));  \
+  out1 = (RTYPE)__msa_adds_s_h((v8i16)(in2), (v8i16)(in3));  \
+}
+#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
+
+#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) {                       \
+  ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Shift left all elements of vector (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in place operation
+                 Return Type - as per input vector RTYPE
+   Details     : Each element of vectors 'in0'..'in3' is left shifted by
+                 'shift' and the result is written in-place.
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift) {  \
+  in0 = (in0) << (shift);                     \
+  in1 = (in1) << (shift);                     \
+  in2 = (in2) << (shift);                     \
+  in3 = (in3) << (shift);                     \
+}
+
+/* Description : Arithmetic shift right all elements of vector
+                 (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in place operation
+                 Return Type - as per input vector RTYPE
+   Details     : Each element of vectors 'in0'..'in3' is right shifted by
+                 'shift' and written in-place. 'shift' is a GP variable.
+*/
+#define SRA_4V(in0, in1, in2, in3, shift) {  \
+  in0 = (in0) >> (shift);                    \
+  in1 = (in1) >> (shift);                    \
+  in2 = (in2) >> (shift);                    \
+  in3 = (in3) >> (shift);                    \
+}
+
+/* Description : Shift right arithmetic rounded words
+   Arguments   : Inputs  - in0, in1, shift
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is shifted right arithmetically by
+                 the number of bits in the corresponding element in the vector
+                 'shift'. The last discarded bit is added to shifted value for
+                 rounding and the result is written in-place.
+                 'shift' is a vector.
+*/
+#define SRAR_W2(RTYPE, in0, in1, shift) {                   \
+  in0 = (RTYPE)__msa_srar_w((v4i32)(in0), (v4i32)(shift));  \
+  in1 = (RTYPE)__msa_srar_w((v4i32)(in1), (v4i32)(shift));  \
+}
+
+#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) {  \
+  SRAR_W2(RTYPE, in0, in1, shift);                   \
+  SRAR_W2(RTYPE, in2, in3, shift);                   \
+}
+#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
+
+/* Description : Shift right arithmetic rounded (immediate)
+   Arguments   : Inputs  - in0, in1, shift
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is shifted right arithmetically by
+                 the value in 'shift'. The last discarded bit is added to the
+                 shifted value for rounding and the result is written in-place.
+                 'shift' is an immediate value.
+*/
+#define SRARI_H2(RTYPE, in0, in1, shift) {          \
+  in0 = (RTYPE)__msa_srari_h((v8i16)(in0), shift);  \
+  in1 = (RTYPE)__msa_srari_h((v8i16)(in1), shift);  \
+}
+#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
+#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
+
+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) {  \
+  SRARI_H2(RTYPE, in0, in1, shift);                   \
+  SRARI_H2(RTYPE, in2, in3, shift);                   \
+}
+#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
+#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
+
+#define SRARI_W2(RTYPE, in0, in1, shift) {          \
+  in0 = (RTYPE)__msa_srari_w((v4i32)(in0), shift);  \
+  in1 = (RTYPE)__msa_srari_w((v4i32)(in1), shift);  \
+}
+#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
+
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) {  \
+  SRARI_W2(RTYPE, in0, in1, shift);                   \
+  SRARI_W2(RTYPE, in2, in3, shift);                   \
+}
+#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
+
+/* Description : Logical shift right all elements of vector (immediate)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - out0, out1, out2, out3
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is right shifted by 'shift' and
+                 the result is written to 'out0'. 'shift' is an immediate value.
+*/
+#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) {  \
+  out0 = (RTYPE)__msa_srli_h((v8i16)(in0), shift);                           \
+  out1 = (RTYPE)__msa_srli_h((v8i16)(in1), shift);                           \
+  out2 = (RTYPE)__msa_srli_h((v8i16)(in2), shift);                           \
+  out3 = (RTYPE)__msa_srli_h((v8i16)(in3), shift);                           \
+}
+#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Multiplication of pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element from 'in0' is multiplied with elements from 'in1'
+                 and the result is written to 'out0'; 'in2' * 'in3' to 'out1'
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1) {  \
+  out0 = (in0) * (in1);                         \
+  out1 = (in2) * (in3);                         \
+}
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3) {                \
+  MUL2(in0, in1, in2, in3, out0, out1);               \
+  MUL2(in4, in5, in6, in7, out2, out3);               \
+}
+
+/* Description : Addition of 2 pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element in 'in0' is added to 'in1' and result is written
+                 to 'out0'; 'in2' + 'in3' is written to 'out1'.
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1) {  \
+  out0 = (in0) + (in1);                         \
+  out1 = (in2) + (in3);                         \
+}
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3) {                \
+  ADD2(in0, in1, in2, in3, out0, out1);               \
+  ADD2(in4, in5, in6, in7, out2, out3);               \
+}
+
+/* Description : Subtraction of 2 pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element in 'in1' is subtracted from 'in0' and result is
+                 written to 'out0'; 'in2' - 'in3' is written to 'out1'.
+*/
+#define SUB2(in0, in1, in2, in3, out0, out1) {  \
+  out0 = (in0) - (in1);                         \
+  out1 = (in2) - (in3);                         \
+}
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3) {                \
+  out0 = (in0) - (in1);                               \
+  out1 = (in2) - (in3);                               \
+  out2 = (in4) - (in5);                               \
+  out3 = (in6) - (in7);                               \
+}
+
+/* Description : Sign extend halfword elements from right half of the vector
+   Arguments   : Input  - in    (halfword vector)
+                 Output - out   (sign extended word vector)
+                 Return Type - signed word
+   Details     : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved with same vector 'in' to generate
+                 4 word elements keeping sign intact
+*/
+#define UNPCK_R_SH_SW(in, out) {                   \
+  v8i16 sign_m;                                    \
+                                                   \
+  sign_m = __msa_clti_s_h((v8i16)(in), 0);         \
+  out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in));  \
+}
+
+/* Description : Zero extend unsigned byte elements to halfword elements
+   Arguments   : Input   - in          (unsigned byte vector)
+                 Outputs - out0, out1  (zero extended halfword vectors)
+                 Return Type - signed halfword
+   Details     : Zero extended right half of vector is returned in 'out0'
+                 Zero extended left half of vector is returned in 'out1'
+*/
+#define UNPCK_UB_SH(in, out0, out1) {   \
+  v16i8 zero_m = { 0 };                 \
+                                        \
+  ILVRL_B2_SH(zero_m, in, out0, out1);  \
+}
+
+/* Description : Sign extend halfword elements from input vector and return
+                 the result in pair of vectors
+   Arguments   : Input   - in            (halfword vector)
+                 Outputs - out0, out1   (sign extended word vectors)
+                 Return Type - signed word
+   Details     : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved right with same vector 'in' to
+                 generate 4 signed word elements in 'out0'
+                 Then interleaved left with same vector 'in' to
+                 generate 4 signed word elements in 'out1'
+*/
+#define UNPCK_SH_SW(in, out0, out1) {      \
+  v8i16 tmp_m;                             \
+                                           \
+  tmp_m = __msa_clti_s_h((v8i16)(in), 0);  \
+  ILVRL_H2_SW(tmp_m, in, out0, out1);      \
+}
+
+/* Description : Butterfly of 4 input vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+   Details     : out0/out1 hold the pair sums, out2/out3 the pair differences
+*/
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  out0 = (in0) + (in3);                                            \
+  out1 = (in1) + (in2);                                            \
+                                                                   \
+  out2 = (in1) - (in2);                                            \
+  out3 = (in0) - (in3);                                            \
+}
+
+/* Description : Butterfly of 8 input vectors
+   Arguments   : Inputs  - in0 ...  in7
+                 Outputs - out0 .. out7
+   Details     : out0..out3 hold the pair sums, out4..out7 the differences
+*/
+#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                    out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  out0 = (in0) + (in7);                                                \
+  out1 = (in1) + (in6);                                                \
+  out2 = (in2) + (in5);                                                \
+  out3 = (in3) + (in4);                                                \
+                                                                       \
+  out4 = (in3) - (in4);                                                \
+  out5 = (in2) - (in5);                                                \
+  out6 = (in1) - (in6);                                                \
+  out7 = (in0) - (in7);                                                \
+}
+
+/* Description : Butterfly of 16 input vectors
+   Arguments   : Inputs  - in0 ...  in15
+                 Outputs - out0 .. out15
+   Details     : out0..out7 hold the pair sums, out8..out15 the differences
+*/
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                  \
+                     in8, in9,  in10, in11, in12, in13, in14, in15,           \
+                     out0, out1, out2, out3, out4, out5, out6, out7,          \
+                     out8, out9, out10, out11, out12, out13, out14, out15) {  \
+  out0 = (in0) + (in15);                                                      \
+  out1 = (in1) + (in14);                                                      \
+  out2 = (in2) + (in13);                                                      \
+  out3 = (in3) + (in12);                                                      \
+  out4 = (in4) + (in11);                                                      \
+  out5 = (in5) + (in10);                                                      \
+  out6 = (in6) + (in9);                                                       \
+  out7 = (in7) + (in8);                                                       \
+                                                                              \
+  out8 = (in7) - (in8);                                                       \
+  out9 = (in6) - (in9);                                                       \
+  out10 = (in5) - (in10);                                                     \
+  out11 = (in4) - (in11);                                                     \
+  out12 = (in3) - (in12);                                                     \
+  out13 = (in2) - (in13);                                                     \
+  out14 = (in1) - (in14);                                                     \
+  out15 = (in0) - (in15);                                                     \
+}
+
+/* Description : Transpose input 8x8 byte block (right halves of in0..in7)
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
+                        out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
+  v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
+                                                                           \
+  ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                       \
+             tmp0_m, tmp1_m, tmp2_m, tmp3_m);                              \
+  ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                             \
+  ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                             \
+  ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                             \
+  ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                             \
+  SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                             \
+  SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                             \
+}
+#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
+                           in8, in9, in10, in11, in12, in13, in14, in15
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - unsigned byte
+   Details     : out0..out7 are used as scratch before their final values */
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                            in8, in9, in10, in11, in12, in13, in14, in15,      \
+                            out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                        \
+  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                        \
+                                                                               \
+  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                                 \
+  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                               \
+  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                               \
+  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                               \
+                                                                               \
+  tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                     \
+  tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                     \
+  tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                     \
+  tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                     \
+  out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                       \
+  tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                     \
+  out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                       \
+  tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                     \
+                                                                               \
+  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                     \
+  out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+  out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+                                                                               \
+  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);                 \
+  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                     \
+  out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+  out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+                                                                               \
+  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);                 \
+  out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+  out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+                                                                               \
+  /* Odd halfwords of the odd-byte interleaves (tmp4_m..tmp7_m) form the */    \
+  /* word pairs interleaved into out3 and out7.                          */    \
+  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);                 \
+  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);                 \
+  out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+  out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+}
+
+/* Description : Transpose 4x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+   Details     : out1 and out3 are derived from out0/out2 via __msa_ilvl_d */
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v8i16 s0_m, s1_m;                                                       \
+                                                                          \
+  ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                             \
+  ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                    \
+  out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                   \
+  out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);                   \
+}
+
+/* Description : Transpose 4x8 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - signed halfword
+   Details     : out4, out5, out6 and out7 are set to all-zero vectors */
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                           out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                       \
+  v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                       \
+  v8i16 zero_m = { 0 };                                                       \
+                                                                              \
+  ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                          \
+             tmp0_n, tmp1_n, tmp2_n, tmp3_n);                                 \
+  ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                                \
+  ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                                \
+                                                                              \
+  out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                   \
+  out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                   \
+  out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                   \
+  out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);                   \
+                                                                              \
+  out4 = zero_m;                                                              \
+  out5 = zero_m;                                                              \
+  out6 = zero_m;                                                              \
+  out7 = zero_m;                                                              \
+}
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+                                                                          \
+  ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                         \
+  ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                         \
+  ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);                 \
+  ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);                 \
+}
+
+/* Description : Transpose 8x8 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - as per RTYPE
+   Details     : even outputs come from PCKEV_D4, odd from __msa_pckod_d */
+#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
+                       out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v8i16 s0_m, s1_m;                                                       \
+  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                   \
+  /* s0_m/s1_m are reused as scratch for each of the four stages below */ \
+  ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
+  ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                                \
+  ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
+  ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                                \
+  ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
+  ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                                \
+  ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
+  ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                                \
+  PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,         \
+           tmp3_m, tmp7_m, out0, out2, out4, out6);                       \
+  out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);              \
+  out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);              \
+  out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);              \
+  out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);              \
+}
+#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
+
+/* Description : Transpose 4x4 block with word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed word
+   Details     : ILVRL_W2_SW word interleave, then doubleword ilvr_d/ilvl_d */
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v4i32 s0_m, s1_m, s2_m, s3_m;                                           \
+                                                                          \
+  ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                      \
+  ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                      \
+                                                                          \
+  out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);                   \
+  out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);                   \
+  out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);                   \
+  out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);                   \
+}
+
+/* Description : Add block 4x4
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Least significant 4 bytes from each input vector are added to
+                 the destination bytes, clipped between 0-255 and stored.
+*/
+#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) {     \
+  uint32_t src0_m, src1_m, src2_m, src3_m;                      \
+  v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
+  v16i8 dst0_m = { 0 };                                         \
+  v16i8 dst1_m = { 0 };                                         \
+  v16i8 zero_m = { 0 };                                         \
+  /* Widen dst bytes to 16 bits, add inputs, clip, repack. */   \
+  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);               \
+  LW4(pdst, stride,  src0_m, src1_m, src2_m, src3_m);           \
+  INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
+  INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
+  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
+  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
+  CLIP_SH2_0_255(res0_m, res1_m);                               \
+  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
+  ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);           \
+}
+
+/* Description : Pack even elements of input vectors & xor with 128
+   Arguments   : Inputs - in0, in1
+                 Output - out_m
+                 Return Type - unsigned byte
+   Details     : Signed byte even elements from 'in0' and 'in1' are packed
+                 together in one vector and the resulting vector is xor'ed with
+                 128 to shift the range from signed to unsigned byte
+*/
+#define PCKEV_XORI128_UB(in0, in1) ({                        \
+  v16u8 out_m;                                               \
+  /* Arguments are parenthesized before being cast. */       \
+  out_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in0));  \
+  out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);            \
+  out_m;                                                     \
+})
+
+/* Description : Converts inputs to unsigned bytes, interleave, average & store
+                 as 8x4 unsigned byte block
+   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+                          pdst, stride
+   Details     : pdst is evaluated once (copied to pdst_m) before the store */
+#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                      \
+                                dst0, dst1, dst2, dst3, pdst, stride) {  \
+  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
+  uint8_t *pdst_m = (uint8_t *)(pdst);                                   \
+  /* in0..in3 -> packed bytes; dst0..dst3 -> row pairs; then average */  \
+  tmp0_m = PCKEV_XORI128_UB(in0, in1);                                   \
+  tmp1_m = PCKEV_XORI128_UB(in2, in3);                                   \
+  ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                    \
+  AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);           \
+  ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                              \
+}
+
+/* Description : Pack even byte elements and store byte vector in destination
+                 memory
+   Arguments   : Inputs - in0, in1, pdst
+*/
+#define PCKEV_ST_SB(in0, in1, pdst) {                 \
+  v16i8 tmp_m;                                        \
+  /* Arguments are parenthesized before the cast. */  \
+  tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in0));  \
+  ST_SB(tmp_m, (pdst));                               \
+}
+
+/* Description : Horizontal 2 tap filter kernel code
+   Arguments   : Inputs - in0, in1, mask, coeff, shift
+                 Return Type - unsigned halfword (v8u16) */
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({          \
+  v16i8 tmp0_m;                                                      \
+  v8u16 tmp1_m;                                                      \
+  /* Shuffle by mask, dot product with coeff, rounding shift. */     \
+  tmp0_m = __msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0));  \
+  tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)(coeff));            \
+  tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);               \
+                                                                     \
+  tmp1_m;                                                            \
+})
+#endif  /* VPX_DSP_MIPS_MACROS_MSA_H_ */
diff --git a/libs/libvpx/vpx_dsp/mips/sad_msa.c b/libs/libvpx/vpx_dsp/mips/sad_msa.c
new file mode 100644
index 0000000000..3bdec28e6e
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/sad_msa.c
@@ -0,0 +1,1525 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) {        \
+  out = (RTYPE)__msa_insve_w((v4i32)(out), 0, (v4i32)(in0));  \
+  out = (RTYPE)__msa_insve_w((v4i32)(out), 1, (v4i32)(in1));  \
+  out = (RTYPE)__msa_insve_w((v4i32)(out), 2, (v4i32)(in2));  \
+  out = (RTYPE)__msa_insve_w((v4i32)(out), 3, (v4i32)(in3));  \
+}  /* word lanes 0..3 of 'out' are filled from in0..in3 via __msa_insve_w */
+#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
+
+static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height) {
+  /* SAD of a 4-byte-wide block; every pass consumes four rows of each. */
+  int32_t quads = height >> 2;
+  uint32_t s0, s1, s2, s3, r0, r1, r2, r3;
+  v16u8 src_vec = { 0 }, ref_vec = { 0 };
+  v16u8 abs_diff;
+  v8u16 acc = { 0 };
+
+  while (quads--) {
+    LW4(src_ptr, src_stride, s0, s1, s2, s3);
+    src_ptr += src_stride * 4;
+    LW4(ref_ptr, ref_stride, r0, r1, r2, r3);
+    ref_ptr += ref_stride * 4;
+
+    /* Gather the four 32-bit row words into one vector per image. */
+    INSERT_W4_UB(s0, s1, s2, s3, src_vec);
+    INSERT_W4_UB(r0, r1, r2, r3, ref_vec);
+    abs_diff = __msa_asub_u_b(src_vec, ref_vec);
+    acc += __msa_hadd_u_h(abs_diff, abs_diff);
+  }
+
+  return HADD_UH_U32(acc);
+}
+
+static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height) {
+  /* SAD of an 8-byte-wide block, accumulated four rows per pass. */
+  int32_t quads = height >> 2;
+  v16u8 s0, s1, s2, s3, r0, r1, r2, r3;
+  v8u16 acc = { 0 };
+
+  while (quads--) {
+    LD_UB4(src, src_stride, s0, s1, s2, s3);
+    src += src_stride * 4;
+    LD_UB4(ref, ref_stride, r0, r1, r2, r3);
+    ref += ref_stride * 4;
+    /* PCKEV_D4_UB packs row pairs so two vectors carry all four rows. */
+    PCKEV_D4_UB(s1, s0, s3, s2, r1, r0, r3, r2, s0, s1, r0, r1);
+    acc += SAD_UB2_UH(s0, s1, r0, r1);
+  }
+
+  return HADD_UH_U32(acc);
+}
+
+static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t quads = height >> 2;
+  v16u8 s0, s1, r0, r1;
+  v8u16 acc = { 0 };
+  /* SAD of a 16-byte-wide block: four rows per pass, taken as two pairs. */
+  while (quads--) {
+    LD_UB2(src, src_stride, s0, s1);
+    src += src_stride * 2;
+    LD_UB2(ref, ref_stride, r0, r1);
+    ref += ref_stride * 2;
+    acc += SAD_UB2_UH(s0, s1, r0, r1);
+    /* Second pair of rows of this pass. */
+    LD_UB2(src, src_stride, s0, s1);
+    src += src_stride * 2;
+    LD_UB2(ref, ref_stride, r0, r1);
+    ref += ref_stride * 2;
+    acc += SAD_UB2_UH(s0, s1, r0, r1);
+  }
+
+  return HADD_UH_U32(acc);
+}
+
+static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t quads = height >> 2;
+  v16u8 s_lo, s_hi, r_lo, r_hi;
+  v8u16 acc = { 0 };
+  /* Each 32-byte row is fetched as two 16-byte halves; 4 rows per pass. */
+  while (quads--) {
+    LD_UB2(src, 16, s_lo, s_hi);
+    src += src_stride;
+    LD_UB2(ref, 16, r_lo, r_hi);
+    ref += ref_stride;
+    acc += SAD_UB2_UH(s_lo, s_hi, r_lo, r_hi);
+    /* row 2 of 4 */
+    LD_UB2(src, 16, s_lo, s_hi);
+    src += src_stride;
+    LD_UB2(ref, 16, r_lo, r_hi);
+    ref += ref_stride;
+    acc += SAD_UB2_UH(s_lo, s_hi, r_lo, r_hi);
+    /* row 3 of 4 */
+    LD_UB2(src, 16, s_lo, s_hi);
+    src += src_stride;
+    LD_UB2(ref, 16, r_lo, r_hi);
+    ref += ref_stride;
+    acc += SAD_UB2_UH(s_lo, s_hi, r_lo, r_hi);
+    /* row 4 of 4 */
+    LD_UB2(src, 16, s_lo, s_hi);
+    src += src_stride;
+    LD_UB2(ref, 16, r_lo, r_hi);
+    ref += ref_stride;
+    acc += SAD_UB2_UH(s_lo, s_hi, r_lo, r_hi);
+  }
+
+  return HADD_UH_U32(acc);
+}
+
+static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t pairs = height >> 1;
+  uint32_t total;
+  v16u8 s0, s1, s2, s3;
+  v16u8 r0, r1, r2, r3;
+  v8u16 acc_lo = { 0 };
+  v8u16 acc_hi = { 0 };
+  /* Two rows per pass; each row half has its own 16-bit accumulator. */
+  while (pairs--) {
+    LD_UB4(src, 16, s0, s1, s2, s3);
+    src += src_stride;
+    LD_UB4(ref, 16, r0, r1, r2, r3);
+    ref += ref_stride;
+    acc_lo += SAD_UB2_UH(s0, s1, r0, r1);
+    acc_hi += SAD_UB2_UH(s2, s3, r2, r3);
+    /* Second row of this pass. */
+    LD_UB4(src, 16, s0, s1, s2, s3);
+    src += src_stride;
+    LD_UB4(ref, 16, r0, r1, r2, r3);
+    ref += ref_stride;
+    acc_lo += SAD_UB2_UH(s0, s1, r0, r1);
+    acc_hi += SAD_UB2_UH(s2, s3, r2, r3);
+  }
+
+  total = HADD_UH_U32(acc_lo);
+  total += HADD_UH_U32(acc_hi);
+
+  return total;
+}
+
+static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
+                              const uint8_t *ref_ptr, int32_t ref_stride,
+                              int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v16u8 ref0, ref1, ref2, ref3, diff;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  /* Three SADs per call: ref as loaded, then after one and two SLDI steps. */
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    /* Candidate 0: one word from each full ref row is packed into 'ref'. */
+    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad0 += __msa_hadd_u_h(diff, diff);
+    /* Candidate 1: advance every ref row via SLDI_B2_UB(.., 1), in place. */
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+    /* Candidate 2: one further SLDI_B2_UB(.., 1) step on the same rows. */
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+}
+
+static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *ref, int32_t ref_stride,
+                              int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  /* Three SADs per call; ref rows go through SLDI_B2_UB(.., 1) in between. */
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
+    ref += (4 * ref_stride);
+    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
+                src0, src1, ref0, ref1);
+    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    /* Candidate 1: SLDI_B2_UB(.., 1) on the raw ref rows, then repack. */
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    /* Candidate 2: one more SLDI_B2_UB(.., 1) step, then repack. */
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+}
+
+static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src, ref, ref0, ref1, diff;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  /* Two rows per pass; each row yields SADs for three ref alignments. */
+  for (ht_cnt = (height >> 1); ht_cnt--;) {
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    /* Alignment 0: ref0 as loaded. */
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+    /* Alignment 1: __msa_sldi_b over the ref1:ref0 pair, immediate 1. */
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+    /* Alignment 2: same slide with immediate 2. */
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+    /* Second row of this pass, same three alignments. */
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+}
+
+static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  /* Two rows per pass; three ref vectors are loaded per row (LD_UB3). */
+  for (ht_cnt = height >> 1; ht_cnt--;) {
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
+    ref += ref_stride;
+    /* Alignment 0: first two ref vectors as loaded. */
+    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+    /* Alignment 1: SLDI_B2_UB(.., 1) across neighbouring ref vectors. */
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    /* Alignment 2: SLDI_B2_UB(.., 2). */
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    /* Second row of this pass, same three alignments. */
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
+    ref += ref_stride;
+
+    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+}
+
+static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
+  v8u16 sad0_0 = { 0 };
+  v8u16 sad0_1 = { 0 };
+  v8u16 sad1_0 = { 0 };
+  v8u16 sad1_1 = { 0 };
+  v8u16 sad2_0 = { 0 };
+  v8u16 sad2_1 = { 0 };
+  v4u32 sad;
+  /* One row per pass (ht_cnt starts at height, not a fraction of it). */
+  for (ht_cnt = height; ht_cnt--;) {
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
+    ref0_4 = LD_UB(ref + 64);  /* vector starting at byte 64 of the ref row */
+    ref += ref_stride;
+    /* Alignment 0: ref vectors as loaded. */
+    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
+    /* Alignment 1: SLDI_B2_UB(.., 1); ref0_4 feeds the last vector. */
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
+    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+    /* Alignment 2: SLDI_B2_UB(.., 2). */
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
+    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+  }
+  /* Widen each 16-bit accumulator pair with __msa_hadd_u_w before reducing. */
+  sad = __msa_hadd_u_w(sad0_0, sad0_0);
+  sad += __msa_hadd_u_w(sad0_1, sad0_1);
+  sad_array[0] = HADD_SW_S32((v4i32)sad);
+
+  sad = __msa_hadd_u_w(sad1_0, sad1_0);
+  sad += __msa_hadd_u_w(sad1_1, sad1_1);
+  sad_array[1] = HADD_SW_S32((v4i32)sad);
+
+  sad = __msa_hadd_u_w(sad2_0, sad2_0);
+  sad += __msa_hadd_u_w(sad2_1, sad2_1);
+  sad_array[2] = HADD_SW_S32((v4i32)sad);
+}
+
+static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                              const uint8_t *ref_ptr, int32_t ref_stride,
+                              int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3, diff;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+  v8u16 sad4 = { 0 };
+  v8u16 sad5 = { 0 };
+  v8u16 sad6 = { 0 };
+  v8u16 sad7 = { 0 };
+  /* Eight SADs per call, one SLDI_B2_UB(.., 1) step on ref between each. */
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    src_ptr += (4 * src_stride);
+    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+    /* Candidate 0: ref rows as loaded. */
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad0 += __msa_hadd_u_h(diff, diff);
+    /* Candidate 1. */
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+    /* Candidate 2. */
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+    /* Candidate 3. */
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad3 += __msa_hadd_u_h(diff, diff);
+    /* Candidate 4. */
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad4 += __msa_hadd_u_h(diff, diff);
+    /* Candidate 5. */
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad5 += __msa_hadd_u_h(diff, diff);
+    /* Candidate 6. */
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad6 += __msa_hadd_u_h(diff, diff);
+    /* Candidate 7. */
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad7 += __msa_hadd_u_h(diff, diff);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+  sad_array[4] = HADD_UH_U32(sad4);
+  sad_array[5] = HADD_UH_U32(sad5);
+  sad_array[6] = HADD_UH_U32(sad6);
+  sad_array[7] = HADD_UH_U32(sad7);
+}
+
+static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *ref, int32_t ref_stride,
+                              int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+  v8u16 sad4 = { 0 };
+  v8u16 sad5 = { 0 };
+  v8u16 sad6 = { 0 };
+  v8u16 sad7 = { 0 };
+  /* Eight SADs per call, one SLDI_B2_UB(.., 1) step on ref between each. */
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
+    ref += (4 * ref_stride);
+    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
+                src0, src1, ref0, ref1);
+    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    /* Candidate 1: step the raw ref rows, then repack into ref0/ref1. */
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    /* Candidate 2. */
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    /* Candidate 3. */
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    /* Candidate 4. */
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    /* Candidate 5. */
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    /* Candidate 6. */
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    /* Candidate 7. */
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+  sad_array[4] = HADD_UH_U32(sad4);
+  sad_array[5] = HADD_UH_U32(sad5);
+  sad_array[6] = HADD_UH_U32(sad6);
+  sad_array[7] = HADD_UH_U32(sad7);
+}
+
+/* Computes eight sums of absolute differences (SADs) for a 16-wide block of
+ * uint8_t samples and stores them in sad_array[0] .. sad_array[7].
+ *
+ * src_ptr / src_stride:  source block and its row pitch in bytes.
+ * ref_ptr / ref_stride:  reference block and its row pitch in bytes.
+ * height:                number of rows. The loop runs height >> 1 times and
+ *                        its body handles two rows (the same per-row sequence
+ *                        written out twice), so an odd trailing row is not
+ *                        processed.
+ * sad_array:             output; eight entries are written.
+ *
+ * For each row, entry 0 compares the source vector with ref0 directly and
+ * entries 1..7 compare it with __msa_sldi_b(ref1, ref0, K) for K = 1..7.
+ * NOTE(review): that intrinsic presumably yields the 16-byte window starting
+ * K bytes into the ref0/ref1 pair, which makes entry K the SAD at a
+ * horizontal offset of K -- confirm against the MSA documentation.
+ */
+static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src, ref0, ref1, ref;
+  v16u8 diff;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+  v8u16 sad4 = { 0 };
+  v8u16 sad5 = { 0 };
+  v8u16 sad6 = { 0 };
+  v8u16 sad7 = { 0 };
+
+  for (ht_cnt = (height >> 1); ht_cnt--;) {
+    /* First row of the pair. Two reference vectors are fetched per row
+     * (LD_UB2 with a step of 16) so the shifted windows below have bytes to
+     * draw from beyond the first vector. */
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+
+    /* Per-byte absolute difference, then __msa_hadd_u_h(diff, diff) folds
+     * the bytes into the v8u16 accumulator (presumably adjacent byte pairs
+     * summed into 16-bit lanes). */
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
+    diff = __msa_asub_u_b(src, ref);
+    sad3 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
+    diff = __msa_asub_u_b(src, ref);
+    sad4 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
+    diff = __msa_asub_u_b(src, ref);
+    sad5 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
+    diff = __msa_asub_u_b(src, ref);
+    sad6 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
+    diff = __msa_asub_u_b(src, ref);
+    sad7 += __msa_hadd_u_h(diff, diff);
+
+    /* Second row of the pair: identical sequence on the next row. */
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
+    diff = __msa_asub_u_b(src, ref);
+    sad3 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
+    diff = __msa_asub_u_b(src, ref);
+    sad4 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
+    diff = __msa_asub_u_b(src, ref);
+    sad5 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
+    diff = __msa_asub_u_b(src, ref);
+    sad6 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
+    diff = __msa_asub_u_b(src, ref);
+    sad7 += __msa_hadd_u_h(diff, diff);
+  }
+
+  /* Collapse each vector accumulator to the scalar stored for its entry
+   * (presumably a horizontal sum -- see HADD_UH_U32). */
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+  sad_array[4] = HADD_UH_U32(sad4);
+  sad_array[5] = HADD_UH_U32(sad5);
+  sad_array[6] = HADD_UH_U32(sad6);
+  sad_array[7] = HADD_UH_U32(sad7);
+}
+
+/* Computes eight sums of absolute differences (SADs) for a 32-wide block of
+ * uint8_t samples and stores them in sad_array[0] .. sad_array[7].
+ *
+ * src / src_stride:  source block and its row pitch in bytes.
+ * ref / ref_stride:  reference block and its row pitch in bytes.
+ * height:            number of rows; one row is processed per iteration.
+ * sad_array:         output; eight entries are written.
+ *
+ * Entry 0 compares the two source vectors with ref0_0/ref0_1 as loaded;
+ * entries 1..7 compare them with ref0/ref1 produced by SLDI_B2_UB using shift
+ * arguments 1..7.
+ * NOTE(review): LD_UB2, LD_UB3, SLDI_B2_UB, SAD_UB2_UH and HADD_UH_U32 are
+ * defined outside this chunk; entry K is presumably the SAD at a horizontal
+ * reference offset of K bytes -- confirm against the macro definitions.
+ */
+static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1;
+  v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+  v8u16 sad4 = { 0 };
+  v8u16 sad5 = { 0 };
+  v8u16 sad6 = { 0 };
+  v8u16 sad7 = { 0 };
+
+  for (ht_cnt = height; ht_cnt--;) {
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+    /* A third reference vector is fetched (step 16) so the shifted windows
+     * below have bytes available past the second vector. */
+    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
+    ref += ref_stride;
+
+    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+
+    /* Each step derives ref0/ref1 afresh from the untouched ref0_* loads, so
+     * the shift arguments 1..7 are absolute rather than cumulative. */
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
+    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
+    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
+    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
+    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
+    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  /* Collapse each vector accumulator to the scalar stored for its entry
+   * (presumably a horizontal sum -- see HADD_UH_U32). */
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+  sad_array[4] = HADD_UH_U32(sad4);
+  sad_array[5] = HADD_UH_U32(sad5);
+  sad_array[6] = HADD_UH_U32(sad6);
+  sad_array[7] = HADD_UH_U32(sad7);
+}
+
+/* Computes eight sums of absolute differences (SADs) for a 64-wide block of
+ * uint8_t samples and stores them in sad_array[0] .. sad_array[7].
+ *
+ * src / src_stride:  source block and its row pitch in bytes.
+ * ref / ref_stride:  reference block and its row pitch in bytes.
+ * height:            number of rows; one row is processed per iteration of
+ *                    each pass.
+ * sad_array:         output; eight entries are written.
+ *
+ * The work is done in two passes over the same rows: the first pass fills
+ * entries 0..3 (reference as loaded, then shift arguments 1..3), the second
+ * re-reads the block through src_dup/ref_dup and fills entries 4..7 (shift
+ * arguments 4..7). Both passes iterate over `height` rows so that all eight
+ * entries cover the same block; the second pass must not assume a fixed
+ * row count, or a caller passing any other height would read rows outside
+ * the block and get entries 4..7 that are inconsistent with entries 0..3.
+ *
+ * NOTE(review): LD_UB4, LD_UB5, SLDI_B2_UB, SAD_UB2_UH and HADD_SW_S32 are
+ * defined outside this chunk; entry K is presumably the SAD at a horizontal
+ * reference offset of K bytes -- confirm against the macro definitions.
+ */
+static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  const uint8_t *src_dup, *ref_dup;
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
+  v16u8 ref0, ref1, ref2, ref3;
+  /* Each entry uses two 16-bit accumulators: *_0 for the first two source
+   * vectors of a row and *_1 for the last two. */
+  v8u16 sad0_0 = { 0 };
+  v8u16 sad0_1 = { 0 };
+  v8u16 sad1_0 = { 0 };
+  v8u16 sad1_1 = { 0 };
+  v8u16 sad2_0 = { 0 };
+  v8u16 sad2_1 = { 0 };
+  v8u16 sad3_0 = { 0 };
+  v8u16 sad3_1 = { 0 };
+  v4u32 sad;
+
+  /* Remember the block origin for the second pass. */
+  src_dup = src;
+  ref_dup = ref;
+
+  /* Pass 1: entries 0..3. */
+  for (ht_cnt = height; ht_cnt--;) {
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    /* A fifth reference vector is fetched (step 16) so the shifted windows
+     * have bytes available past the fourth vector. */
+    LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
+    ref += ref_stride;
+
+    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
+    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
+    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
+    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+  }
+
+  /* Widen both halves to 32-bit lanes with __msa_hadd_u_w before combining
+   * them, then reduce to the scalar stored for the entry. */
+  sad = __msa_hadd_u_w(sad0_0, sad0_0);
+  sad += __msa_hadd_u_w(sad0_1, sad0_1);
+  sad_array[0] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad1_0, sad1_0);
+  sad += __msa_hadd_u_w(sad1_1, sad1_1);
+  sad_array[1] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad2_0, sad2_0);
+  sad += __msa_hadd_u_w(sad2_1, sad2_1);
+  sad_array[2] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad3_0, sad3_0);
+  sad += __msa_hadd_u_w(sad3_1, sad3_1);
+  sad_array[3] = HADD_SW_S32(sad);
+
+  /* Reuse the same accumulators for entries 4..7. */
+  sad0_0 = (v8u16)__msa_ldi_h(0);
+  sad0_1 = (v8u16)__msa_ldi_h(0);
+  sad1_0 = (v8u16)__msa_ldi_h(0);
+  sad1_1 = (v8u16)__msa_ldi_h(0);
+  sad2_0 = (v8u16)__msa_ldi_h(0);
+  sad2_1 = (v8u16)__msa_ldi_h(0);
+  sad3_0 = (v8u16)__msa_ldi_h(0);
+  sad3_1 = (v8u16)__msa_ldi_h(0);
+
+  /* Pass 2: entries 4..7, over the same `height` rows as pass 1. */
+  for (ht_cnt = height; ht_cnt--;) {
+    LD_UB4(src_dup, 16, src0, src1, src2, src3);
+    src_dup += src_stride;
+    LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
+    ref_dup += ref_stride;
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
+    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
+    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
+    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
+    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+  }
+
+  sad = __msa_hadd_u_w(sad0_0, sad0_0);
+  sad += __msa_hadd_u_w(sad0_1, sad0_1);
+  sad_array[4] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad1_0, sad1_0);
+  sad += __msa_hadd_u_w(sad1_1, sad1_1);
+  sad_array[5] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad2_0, sad2_0);
+  sad += __msa_hadd_u_w(sad2_1, sad2_1);
+  sad_array[6] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad3_0, sad3_0);
+  sad += __msa_hadd_u_w(sad3_1, sad3_1);
+  sad_array[7] = HADD_SW_S32(sad);
+}
+
+/* Computes the sum of absolute differences (SAD) between one 4-wide source
+ * block of uint8_t samples and four separate reference blocks, storing one
+ * result per reference in sad_array[0] .. sad_array[3].
+ *
+ * src_ptr / src_stride:  source block and its row pitch in bytes.
+ * aref_ptr:              the four reference block pointers; entries 0..3 are
+ *                        read.
+ * ref_stride:            row pitch shared by all four references.
+ * height:                number of rows. Rows are consumed four at a time
+ *                        (height >> 2 iterations), so remainder rows of a
+ *                        height that is not a multiple of 4 are not
+ *                        processed.
+ * sad_array:             output; four entries are written.
+ *
+ * NOTE(review): LW4, INSERT_W4_UB and HADD_UH_U32 are defined outside this
+ * chunk; LW4 presumably loads one 32-bit word from each of four rows and
+ * INSERT_W4_UB gathers the four words into a single vector -- TODO confirm.
+ */
+static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t * const aref_ptr[],
+                               int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v16u8 diff;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    /* The source rows are gathered once and compared with each of the four
+     * references in turn; ref0..ref3 and `ref` are scratch reused for every
+     * reference. */
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    src_ptr += (4 * src_stride);
+
+    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    ref0_ptr += (4 * ref_stride);
+
+    diff = __msa_asub_u_b(src, ref);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    ref1_ptr += (4 * ref_stride);
+
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    ref2_ptr += (4 * ref_stride);
+
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+
+    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    ref3_ptr += (4 * ref_stride);
+
+    diff = __msa_asub_u_b(src, ref);
+    sad3 += __msa_hadd_u_h(diff, diff);
+  }
+
+  /* Collapse each vector accumulator to the scalar stored for its reference
+   * (presumably a horizontal sum -- see HADD_UH_U32). */
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+}
+
+/* Computes the sum of absolute differences (SAD) between one 8-wide source
+ * block of uint8_t samples and four separate reference blocks, storing one
+ * result per reference in sad_array[0] .. sad_array[3].
+ *
+ * src_ptr / src_stride:  source block and its row pitch in bytes.
+ * aref_ptr:              the four reference block pointers; entries 0..3 are
+ *                        read.
+ * ref_stride:            row pitch shared by all four references.
+ * height:                number of rows. Rows are consumed four at a time
+ *                        (height >> 2 iterations), so remainder rows of a
+ *                        height that is not a multiple of 4 are not
+ *                        processed.
+ * sad_array:             output; four entries are written.
+ *
+ * NOTE(review): LD_UB4, PCKEV_D2_UB, SAD_UB2_UH and HADD_UH_U32 are defined
+ * outside this chunk; PCKEV_D2_UB presumably packs the low 8 bytes of two
+ * rows into one vector -- TODO confirm.
+ */
+static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t * const aref_ptr[],
+                               int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    /* Load four rows of the source and of every reference up front:
+     * ref0..3 come from reference 0, ref4..7 from reference 1, ref8..11 from
+     * reference 2 and ref12..15 from reference 3. */
+    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref0_ptr += (4 * ref_stride);
+    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
+    ref1_ptr += (4 * ref_stride);
+    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
+    ref2_ptr += (4 * ref_stride);
+    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
+    ref3_ptr += (4 * ref_stride);
+
+    /* The packed source (src0/src1) is reused for all four references;
+     * ref0/ref1 are overwritten with each reference's packed rows in turn,
+     * which is safe because reference 0's rows are consumed first. */
+    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
+    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  /* Collapse each vector accumulator to the scalar stored for its reference
+   * (presumably a horizontal sum -- see HADD_UH_U32). */
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+}
+
+/* Computes the sum of absolute differences (SAD) between one 16-wide source
+ * block of uint8_t samples and four separate reference blocks, storing one
+ * result per reference in sad_array[0] .. sad_array[3].
+ *
+ * src_ptr / src_stride:  source block and its row pitch in bytes.
+ * aref_ptr:              the four reference block pointers; entries 0..3 are
+ *                        read.
+ * ref_stride:            row pitch shared by all four references.
+ * height:                number of rows. The loop runs height >> 1 times and
+ *                        its body handles two rows (the same per-row
+ *                        sequence written out twice), so an odd trailing row
+ *                        is not processed.
+ * sad_array:             output; four entries are written.
+ *
+ * NOTE(review): LD_UB and HADD_UH_U32 are defined outside this chunk; LD_UB
+ * presumably loads one 16-byte vector -- TODO confirm.
+ */
+static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t * const aref_ptr[],
+                                int32_t ref_stride,
+                                int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  v16u8 src, ref0, ref1, ref2, ref3, diff;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (ht_cnt = (height >> 1); ht_cnt--;) {
+    /* First row of the pair: one source vector and one vector from each of
+     * the four references. */
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref0 = LD_UB(ref0_ptr);
+    ref0_ptr += ref_stride;
+    ref1 = LD_UB(ref1_ptr);
+    ref1_ptr += ref_stride;
+    ref2 = LD_UB(ref2_ptr);
+    ref2_ptr += ref_stride;
+    ref3 = LD_UB(ref3_ptr);
+    ref3_ptr += ref_stride;
+
+    /* Per-byte absolute difference against each reference, folded into that
+     * reference's v8u16 accumulator by __msa_hadd_u_h(diff, diff). */
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+    diff = __msa_asub_u_b(src, ref1);
+    sad1 += __msa_hadd_u_h(diff, diff);
+    diff = __msa_asub_u_b(src, ref2);
+    sad2 += __msa_hadd_u_h(diff, diff);
+    diff = __msa_asub_u_b(src, ref3);
+    sad3 += __msa_hadd_u_h(diff, diff);
+
+    /* Second row of the pair: identical sequence on the next row. */
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref0 = LD_UB(ref0_ptr);
+    ref0_ptr += ref_stride;
+    ref1 = LD_UB(ref1_ptr);
+    ref1_ptr += ref_stride;
+    ref2 = LD_UB(ref2_ptr);
+    ref2_ptr += ref_stride;
+    ref3 = LD_UB(ref3_ptr);
+    ref3_ptr += ref_stride;
+
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+    diff = __msa_asub_u_b(src, ref1);
+    sad1 += __msa_hadd_u_h(diff, diff);
+    diff = __msa_asub_u_b(src, ref2);
+    sad2 += __msa_hadd_u_h(diff, diff);
+    diff = __msa_asub_u_b(src, ref3);
+    sad3 += __msa_hadd_u_h(diff, diff);
+  }
+
+  /* Collapse each vector accumulator to the scalar stored for its reference
+   * (presumably a horizontal sum -- see HADD_UH_U32). */
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+}
+
+/* Computes the sum of absolute differences (SAD) between one 32-wide source
+ * block of uint8_t samples and four separate reference blocks, storing one
+ * result per reference in sad_array[0] .. sad_array[3].
+ *
+ * src / src_stride:  source block and its row pitch in bytes.
+ * aref_ptr:          the four reference block pointers; entries 0..3 are
+ *                    read.
+ * ref_stride:        row pitch shared by all four references.
+ * height:            number of rows; one row is processed per iteration.
+ * sad_array:         output; four entries are written.
+ *
+ * NOTE(review): LD_UB2, SAD_UB2_UH and HADD_UH_U32 are defined outside this
+ * chunk; LD_UB2(p, 16, a, b) presumably loads the two consecutive 16-byte
+ * vectors of a 32-byte row -- TODO confirm.
+ */
+static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
+                                const uint8_t * const aref_ptr[],
+                                int32_t ref_stride,
+                                int32_t height, uint32_t *sad_array) {
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (ht_cnt = height; ht_cnt--;) {
+    /* The source row is loaded once; ref0/ref1 are scratch that is reloaded
+     * from each of the four references in turn. */
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+
+    LD_UB2(ref0_ptr, 16, ref0, ref1);
+    ref0_ptr += ref_stride;
+    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    LD_UB2(ref1_ptr, 16, ref0, ref1);
+    ref1_ptr += ref_stride;
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    LD_UB2(ref2_ptr, 16, ref0, ref1);
+    ref2_ptr += ref_stride;
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    LD_UB2(ref3_ptr, 16, ref0, ref1);
+    ref3_ptr += ref_stride;
+    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  /* Collapse each vector accumulator to the scalar stored for its reference
+   * (presumably a horizontal sum -- see HADD_UH_U32). */
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+}
+
+/* Computes the sum of absolute differences (SAD) between one 64-wide source
+ * block of uint8_t samples and four separate reference blocks, storing one
+ * result per reference in sad_array[0] .. sad_array[3].
+ *
+ * src / src_stride:  source block and its row pitch in bytes.
+ * aref_ptr:          the four reference block pointers; entries 0..3 are
+ *                    read.
+ * ref_stride:        row pitch shared by all four references.
+ * height:            number of rows; one row is processed per iteration.
+ * sad_array:         output; four entries are written.
+ *
+ * NOTE(review): LD_UB4, SAD_UB2_UH and HADD_UH_U32 are defined outside this
+ * chunk; LD_UB4(p, 16, ...) presumably loads the four consecutive 16-byte
+ * vectors of a 64-byte row -- TODO confirm.
+ */
+static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
+                                const uint8_t * const aref_ptr[],
+                                int32_t ref_stride,
+                                int32_t height, uint32_t *sad_array) {
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  /* Each reference uses two accumulators: *_0 for the first two vectors of
+   * a row and *_1 for the last two. */
+  v8u16 sad0_0 = { 0 };
+  v8u16 sad0_1 = { 0 };
+  v8u16 sad1_0 = { 0 };
+  v8u16 sad1_1 = { 0 };
+  v8u16 sad2_0 = { 0 };
+  v8u16 sad2_1 = { 0 };
+  v8u16 sad3_0 = { 0 };
+  v8u16 sad3_1 = { 0 };
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (ht_cnt = height; ht_cnt--;) {
+    /* The source row is loaded once; ref0..ref3 are scratch that is reloaded
+     * from each of the four references in turn. */
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+
+    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
+    ref0_ptr += ref_stride;
+    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
+    ref1_ptr += ref_stride;
+    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
+    ref2_ptr += ref_stride;
+    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
+    ref3_ptr += ref_stride;
+    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+  }
+
+  /* Each half is reduced to a scalar separately and the two scalars are
+   * added in the uint32_t output slot. */
+  sad_array[0] = HADD_UH_U32(sad0_0);
+  sad_array[0] += HADD_UH_U32(sad0_1);
+  sad_array[1] = HADD_UH_U32(sad1_0);
+  sad_array[1] += HADD_UH_U32(sad1_1);
+  sad_array[2] = HADD_UH_U32(sad2_0);
+  sad_array[2] += HADD_UH_U32(sad2_1);
+  sad_array[3] = HADD_UH_U32(sad3_0);
+  sad_array[3] += HADD_UH_U32(sad3_1);
+}
+
+/* Returns the sum of absolute differences between a 4-wide source block and
+ * the byte-wise average (__msa_aver_u_b) of a reference block and a second
+ * predictor.
+ *
+ * src_ptr / src_stride:  source block and its row pitch in bytes.
+ * ref_ptr / ref_stride:  reference block and its row pitch in bytes.
+ * height:                number of rows. Rows are consumed four at a time
+ *                        (height >> 2 iterations), so remainder rows of a
+ *                        height that is not a multiple of 4 are not
+ *                        processed.
+ * sec_pred:              second predictor, read as one vector and advanced
+ *                        by 16 bytes per four-row group, i.e. it is consumed
+ *                        contiguously with no row pitch.
+ *
+ * NOTE(review): LW4, INSERT_W4_UB, LD_UB and HADD_UH_U32 are defined outside
+ * this chunk; LW4/INSERT_W4_UB presumably gather one 32-bit word from each of
+ * four rows into a single vector -- TODO confirm.
+ */
+static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                  const uint8_t *ref_ptr, int32_t ref_stride,
+                                  int32_t height, const uint8_t *sec_pred) {
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v16u8 diff, pred, comp;
+  v8u16 sad = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+    /* comp is the averaged predictor the source is compared against. */
+    comp = __msa_aver_u_b(pred, ref);
+    diff = __msa_asub_u_b(src, comp);
+    sad += __msa_hadd_u_h(diff, diff);
+  }
+
+  return HADD_UH_U32(sad);
+}
+
+/* Returns the sum of absolute differences between an 8-wide source block and
+ * the average of a reference block and a second predictor.
+ *
+ * src / src_stride:  source block and its row pitch in bytes.
+ * ref / ref_stride:  reference block and its row pitch in bytes.
+ * height:            number of rows. Rows are consumed four at a time
+ *                    (height >> 2 iterations), so remainder rows of a height
+ *                    that is not a multiple of 4 are not processed.
+ * sec_pred:          second predictor, read as two vectors and advanced by
+ *                    32 bytes per four-row group, i.e. it is consumed
+ *                    contiguously with no row pitch.
+ *
+ * NOTE(review): LD_UB4, LD_UB2, PCKEV_D4_UB, AVER_UB2_UB, SAD_UB2_UH and
+ * HADD_UH_U32 are defined outside this chunk; PCKEV_D4_UB presumably packs
+ * the low 8 bytes of two rows into one vector -- TODO confirm.
+ */
+static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
+                                  const uint8_t *ref, int32_t ref_stride,
+                                  int32_t height, const uint8_t *sec_pred) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  v16u8 diff0, diff1, pred0, pred1;
+  v8u16 sad = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+    ref += (4 * ref_stride);
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+                src0, src1, ref0, ref1);
+    /* Despite their names, diff0/diff1 receive the averaged predictor from
+     * AVER_UB2_UB; the difference itself is taken inside SAD_UB2_UH. */
+    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
+    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
+  }
+
+  return HADD_UH_U32(sad);
+}
+
+/* Returns the sum of absolute differences between a 16-wide source block and
+ * the average of a reference block and a second predictor.
+ *
+ * src / src_stride:  source block and its row pitch in bytes.
+ * ref / ref_stride:  reference block and its row pitch in bytes.
+ * height:            number of rows. The loop runs height >> 3 times and its
+ *                    body handles eight rows as two groups of four, so
+ *                    remainder rows of a height that is not a multiple of 8
+ *                    are not processed.
+ * sec_pred:          second predictor, read with a step of 16 bytes per row
+ *                    and advanced by 4 * 16 bytes per four-row group, i.e. it
+ *                    is consumed contiguously with no row pitch.
+ *
+ * NOTE(review): LD_UB4, AVER_UB2_UB, SAD_UB2_UH and HADD_UH_U32 are defined
+ * outside this chunk -- confirm their exact semantics there.
+ */
+static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
+  v8u16 sad = { 0 };
+
+  for (ht_cnt = (height >> 3); ht_cnt--;) {
+    /* First group of four rows. comp0/comp1 hold the averaged predictor for
+     * two rows at a time and are reused for rows 2..3. */
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+    ref += (4 * ref_stride);
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += (4 * 16);
+    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+
+    /* Second group of four rows: identical sequence. */
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+    ref += (4 * ref_stride);
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += (4 * 16);
+    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+  }
+
+  return HADD_UH_U32(sad);
+}
+
+/* Returns the sum of absolute differences between a 32-wide source block and
+ * the average of a reference block and a second predictor.
+ *
+ * src / src_stride:  source block and its row pitch in bytes.
+ * ref / ref_stride:  reference block and its row pitch in bytes.
+ * height:            number of rows. Rows are consumed four at a time
+ *                    (height >> 2 iterations), so remainder rows of a height
+ *                    that is not a multiple of 4 are not processed.
+ * sec_pred:          second predictor, read with a fixed 32-byte row step
+ *                    and advanced by 4 * 32 bytes per four-row group, i.e. it
+ *                    is consumed contiguously with no separate pitch.
+ *
+ * NOTE(review): LD_UB4, AVER_UB2_UB, SAD_UB2_UH and HADD_UH_U32 are defined
+ * outside this chunk -- confirm their exact semantics there.
+ */
+static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  v16u8 comp0, comp1;
+  v8u16 sad = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    /* Each row is 32 bytes, loaded as two column-wise LD_UB4 calls: the
+     * even-numbered registers take the first 16 bytes of rows 0..3 and the
+     * odd-numbered ones the second 16 bytes (base + 16). The same layout is
+     * used for src, ref and sec_pred, so register pair (2r, 2r+1) is row r. */
+    LD_UB4(src, src_stride, src0, src2, src4, src6);
+    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
+    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
+    ref += (4 * ref_stride);
+
+    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
+    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
+    sec_pred += (4 * 32);
+
+    /* comp0/comp1 hold the averaged predictor for one row at a time. */
+    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
+    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
+    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
+    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
+  }
+
+  return HADD_UH_U32(sad);
+}
+
+/* Returns the sum of absolute differences between a 64-wide source block and
+ * the average of a reference block and a second predictor.
+ *
+ * src / src_stride:  source block and its row pitch in bytes.
+ * ref / ref_stride:  reference block and its row pitch in bytes.
+ * height:            number of rows. The loop runs height >> 2 times and its
+ *                    body handles four rows (the same per-row sequence
+ *                    written out four times), so remainder rows of a height
+ *                    that is not a multiple of 4 are not processed.
+ * sec_pred:          second predictor, advanced by 64 bytes per row, i.e. it
+ *                    is consumed contiguously with no separate pitch.
+ *
+ * NOTE(review): LD_UB4, AVER_UB4_UB, SAD_UB2_UH and HADD_SW_S32 are defined
+ * outside this chunk -- confirm their exact semantics there.
+ */
+static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 comp0, comp1, comp2, comp3;
+  v16u8 pred0, pred1, pred2, pred3;
+  /* sad0 accumulates the first two vectors of each row, sad1 the last two. */
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v4u32 sad;
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    /* Row 0 of the group: average predictor and reference into comp0..3,
+     * then accumulate the SAD of the source against that average. */
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+    ref += ref_stride;
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
+                comp0, comp1, comp2, comp3);
+    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+    /* Rows 1..3 of the group: identical sequence. */
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+    ref += ref_stride;
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
+                comp0, comp1, comp2, comp3);
+    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+    ref += ref_stride;
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
+                comp0, comp1, comp2, comp3);
+    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+    ref += ref_stride;
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3,
+                comp0, comp1, comp2, comp3);
+    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+  }
+
+  /* Widen both halves to 32-bit lanes with __msa_hadd_u_w before combining
+   * them, then reduce to the returned scalar. */
+  sad = __msa_hadd_u_w(sad0, sad0);
+  sad += __msa_hadd_u_w(sad1, sad1);
+
+  return HADD_SW_S32(sad);
+}
+
+/* Generates the non-static function vpx_sad4x<height>_msa(), which hands
+ * src/ref and their strides to sad_4width_msa() together with the macro's
+ * height argument and returns that helper's result. */
+#define VPX_SAD_4xHEIGHT_MSA(height) \
+  uint32_t vpx_sad4x##height##_msa(const uint8_t *src, \
+                                   int32_t src_stride, \
+                                   const uint8_t *ref, \
+                                   int32_t ref_stride) { \
+    const uint32_t sad_total = \
+        sad_4width_msa(src, src_stride, ref, ref_stride, height); \
+    return sad_total; \
+  }
+
+/* Generates the non-static function vpx_sad8x<height>_msa(), which hands
+ * src/ref and their strides to sad_8width_msa() together with the macro's
+ * height argument and returns that helper's result. */
+#define VPX_SAD_8xHEIGHT_MSA(height) \
+  uint32_t vpx_sad8x##height##_msa(const uint8_t *src, \
+                                   int32_t src_stride, \
+                                   const uint8_t *ref, \
+                                   int32_t ref_stride) { \
+    const uint32_t sad_total = \
+        sad_8width_msa(src, src_stride, ref, ref_stride, height); \
+    return sad_total; \
+  }
+
+/* Generates the non-static function vpx_sad16x<height>_msa(), which hands
+ * src/ref and their strides to sad_16width_msa() together with the macro's
+ * height argument and returns that helper's result. */
+#define VPX_SAD_16xHEIGHT_MSA(height) \
+  uint32_t vpx_sad16x##height##_msa(const uint8_t *src, \
+                                    int32_t src_stride, \
+                                    const uint8_t *ref, \
+                                    int32_t ref_stride) { \
+    const uint32_t sad_total = \
+        sad_16width_msa(src, src_stride, ref, ref_stride, height); \
+    return sad_total; \
+  }
+
+/* Generates the non-static function vpx_sad32x<height>_msa(), which hands
+ * src/ref and their strides to sad_32width_msa() together with the macro's
+ * height argument and returns that helper's result. */
+#define VPX_SAD_32xHEIGHT_MSA(height) \
+  uint32_t vpx_sad32x##height##_msa(const uint8_t *src, \
+                                    int32_t src_stride, \
+                                    const uint8_t *ref, \
+                                    int32_t ref_stride) { \
+    const uint32_t sad_total = \
+        sad_32width_msa(src, src_stride, ref, ref_stride, height); \
+    return sad_total; \
+  }
+
+/* Generates the non-static function vpx_sad64x<height>_msa(), which hands
+ * src/ref and their strides to sad_64width_msa() together with the macro's
+ * height argument and returns that helper's result. */
+#define VPX_SAD_64xHEIGHT_MSA(height) \
+  uint32_t vpx_sad64x##height##_msa(const uint8_t *src, \
+                                    int32_t src_stride, \
+                                    const uint8_t *ref, \
+                                    int32_t ref_stride) { \
+    const uint32_t sad_total = \
+        sad_64width_msa(src, src_stride, ref, ref_stride, height); \
+    return sad_total; \
+  }
+
+/* Generates the non-static function vpx_sad4x<height>x3_msa(), which forwards
+ * its arguments to sad_4width_x3_msa() with the macro's height argument
+ * inserted before the sads output pointer. */
+#define VPX_SAD_4xHEIGHTx3_MSA(height) \
+  void vpx_sad4x##height##x3_msa(const uint8_t *src, \
+                                 int32_t src_stride, \
+                                 const uint8_t *ref, \
+                                 int32_t ref_stride, \
+                                 uint32_t *sads) { \
+    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+  }
+
+/* Generates the non-static function vpx_sad8x<height>x3_msa(), which forwards
+ * its arguments to sad_8width_x3_msa() with the macro's height argument
+ * inserted before the sads output pointer. */
+#define VPX_SAD_8xHEIGHTx3_MSA(height) \
+  void vpx_sad8x##height##x3_msa(const uint8_t *src, \
+                                 int32_t src_stride, \
+                                 const uint8_t *ref, \
+                                 int32_t ref_stride, \
+                                 uint32_t *sads) { \
+    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+  }
+
+/* Generates the non-static function vpx_sad16x<height>x3_msa(), which
+ * forwards its arguments to sad_16width_x3_msa() with the macro's height
+ * argument inserted before the sads output pointer. */
+#define VPX_SAD_16xHEIGHTx3_MSA(height) \
+  void vpx_sad16x##height##x3_msa(const uint8_t *src, \
+                                  int32_t src_stride, \
+                                  const uint8_t *ref, \
+                                  int32_t ref_stride, \
+                                  uint32_t *sads) { \
+    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+  }
+
+/* Generates the non-static function vpx_sad32x<height>x3_msa(), which
+ * forwards its arguments to sad_32width_x3_msa() with the macro's height
+ * argument inserted before the sads output pointer. */
+#define VPX_SAD_32xHEIGHTx3_MSA(height) \
+  void vpx_sad32x##height##x3_msa(const uint8_t *src, \
+                                  int32_t src_stride, \
+                                  const uint8_t *ref, \
+                                  int32_t ref_stride, \
+                                  uint32_t *sads) { \
+    sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+  }
+
+/* Generates the non-static function vpx_sad64x<height>x3_msa(), which
+ * forwards its arguments to sad_64width_x3_msa() with the macro's height
+ * argument inserted before the sads output pointer. */
+#define VPX_SAD_64xHEIGHTx3_MSA(height) \
+  void vpx_sad64x##height##x3_msa(const uint8_t *src, \
+                                  int32_t src_stride, \
+                                  const uint8_t *ref, \
+                                  int32_t ref_stride, \
+                                  uint32_t *sads) { \
+    sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
+  }
+
+/* Generates the non-static function vpx_sad4x<height>x8_msa(), which forwards
+ * its arguments to sad_4width_x8_msa() with the macro's height argument
+ * inserted before the sads output pointer. */
+#define VPX_SAD_4xHEIGHTx8_MSA(height) \
+  void vpx_sad4x##height##x8_msa(const uint8_t *src, \
+                                 int32_t src_stride, \
+                                 const uint8_t *ref, \
+                                 int32_t ref_stride, \
+                                 uint32_t *sads) { \
+    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+  }
+
+/* Generates the non-static function vpx_sad8x<height>x8_msa(), which forwards
+ * its arguments to sad_8width_x8_msa() with the macro's height argument
+ * inserted before the sads output pointer; that helper writes eight
+ * entries through sads. */
+#define VPX_SAD_8xHEIGHTx8_MSA(height) \
+  void vpx_sad8x##height##x8_msa(const uint8_t *src, \
+                                 int32_t src_stride, \
+                                 const uint8_t *ref, \
+                                 int32_t ref_stride, \
+                                 uint32_t *sads) { \
+    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+  }
+
+/* Generates the non-static function vpx_sad16x<height>x8_msa(), which
+ * forwards its arguments to sad_16width_x8_msa() with the macro's height
+ * argument inserted before the sads output pointer; that helper writes
+ * eight entries through sads. */
+#define VPX_SAD_16xHEIGHTx8_MSA(height) \
+  void vpx_sad16x##height##x8_msa(const uint8_t *src, \
+                                  int32_t src_stride, \
+                                  const uint8_t *ref, \
+                                  int32_t ref_stride, \
+                                  uint32_t *sads) { \
+    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+  }
+
+/* Generates the non-static function vpx_sad32x<height>x8_msa(), which
+ * forwards its arguments to sad_32width_x8_msa() with the macro's height
+ * argument inserted before the sads output pointer; that helper writes
+ * eight entries through sads. */
+#define VPX_SAD_32xHEIGHTx8_MSA(height) \
+  void vpx_sad32x##height##x8_msa(const uint8_t *src, \
+                                  int32_t src_stride, \
+                                  const uint8_t *ref, \
+                                  int32_t ref_stride, \
+                                  uint32_t *sads) { \
+    sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+  }
+
+/* Generates the non-static function vpx_sad64x<height>x8_msa(), which
+ * forwards its arguments to sad_64width_x8_msa() with the macro's height
+ * argument inserted before the sads output pointer; that helper writes
+ * eight entries through sads. */
+#define VPX_SAD_64xHEIGHTx8_MSA(height) \
+  void vpx_sad64x##height##x8_msa(const uint8_t *src, \
+                                  int32_t src_stride, \
+                                  const uint8_t *ref, \
+                                  int32_t ref_stride, \
+                                  uint32_t *sads) { \
+    sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
+  }
+
+/* Generates the non-static function vpx_sad4x<height>x4d_msa(), which
+ * forwards the source block, the refs pointer array and the shared
+ * reference stride to sad_4width_x4d_msa() with the macro's height argument
+ * inserted before the sads output pointer; that helper writes four entries
+ * through sads. */
+#define VPX_SAD_4xHEIGHTx4D_MSA(height) \
+  void vpx_sad4x##height##x4d_msa(const uint8_t *src, \
+                                  int32_t src_stride, \
+                                  const uint8_t *const refs[], \
+                                  int32_t ref_stride, \
+                                  uint32_t *sads) { \
+    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+  }
+
+/* Generates the non-static function vpx_sad8x<height>x4d_msa(), which
+ * forwards the source block, the refs pointer array and the shared
+ * reference stride to sad_8width_x4d_msa() with the macro's height argument
+ * inserted before the sads output pointer; that helper writes four entries
+ * through sads. */
+#define VPX_SAD_8xHEIGHTx4D_MSA(height) \
+  void vpx_sad8x##height##x4d_msa(const uint8_t *src, \
+                                  int32_t src_stride, \
+                                  const uint8_t *const refs[], \
+                                  int32_t ref_stride, \
+                                  uint32_t *sads) { \
+    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+  }
+
+/* Defines vpx_sad16x<rows>x4d_msa() as a call to sad_16width_x4d_msa(). */
+#define VPX_SAD_16xHEIGHTx4D_MSA(rows)                                  \
+void vpx_sad16x##rows##x4d_msa(const uint8_t *src, int32_t src_stride,  \
+    const uint8_t *const refs[], int32_t ref_stride, uint32_t *sads) {  \
+  sad_16width_x4d_msa(src, src_stride, refs, ref_stride, rows, sads);   \
+}
+
+/* Defines vpx_sad32x<rows>x4d_msa() as a call to sad_32width_x4d_msa(). */
+#define VPX_SAD_32xHEIGHTx4D_MSA(rows)                                  \
+void vpx_sad32x##rows##x4d_msa(const uint8_t *src, int32_t src_stride,  \
+    const uint8_t *const refs[], int32_t ref_stride, uint32_t *sads) {  \
+  sad_32width_x4d_msa(src, src_stride, refs, ref_stride, rows, sads);   \
+}
+
+/* Defines vpx_sad64x<rows>x4d_msa() as a call to sad_64width_x4d_msa(). */
+#define VPX_SAD_64xHEIGHTx4D_MSA(rows)                                  \
+void vpx_sad64x##rows##x4d_msa(const uint8_t *src, int32_t src_stride,  \
+    const uint8_t *const refs[], int32_t ref_stride, uint32_t *sads) {  \
+  sad_64width_x4d_msa(src, src_stride, refs, ref_stride, rows, sads);   \
+}
+
+/* Defines vpx_sad4x<rows>_avg_msa() as a call to avgsad_4width_msa(). */
+#define VPX_AVGSAD_4xHEIGHT_MSA(rows)                                        \
+uint32_t vpx_sad4x##rows##_avg_msa(const uint8_t *src, int32_t src_stride,   \
+    const uint8_t *ref, int32_t ref_stride, const uint8_t *second_pred) {    \
+  return avgsad_4width_msa(src, src_stride, ref, ref_stride,                 \
+                           rows, second_pred);                               \
+}
+
+/* Defines vpx_sad8x<rows>_avg_msa() as a call to avgsad_8width_msa(). */
+#define VPX_AVGSAD_8xHEIGHT_MSA(rows)                                        \
+uint32_t vpx_sad8x##rows##_avg_msa(const uint8_t *src, int32_t src_stride,   \
+    const uint8_t *ref, int32_t ref_stride, const uint8_t *second_pred) {    \
+  return avgsad_8width_msa(src, src_stride, ref, ref_stride,                 \
+                           rows, second_pred);                               \
+}
+
+/* Defines vpx_sad16x<rows>_avg_msa() as a call to avgsad_16width_msa(). */
+#define VPX_AVGSAD_16xHEIGHT_MSA(rows)                                       \
+uint32_t vpx_sad16x##rows##_avg_msa(const uint8_t *src, int32_t src_stride,  \
+    const uint8_t *ref, int32_t ref_stride, const uint8_t *second_pred) {    \
+  return avgsad_16width_msa(src, src_stride, ref, ref_stride,                \
+                            rows, second_pred);                              \
+}
+
+/* Defines vpx_sad32x<rows>_avg_msa() as a call to avgsad_32width_msa(). */
+#define VPX_AVGSAD_32xHEIGHT_MSA(rows)                                       \
+uint32_t vpx_sad32x##rows##_avg_msa(const uint8_t *src, int32_t src_stride,  \
+    const uint8_t *ref, int32_t ref_stride, const uint8_t *second_pred) {    \
+  return avgsad_32width_msa(src, src_stride, ref, ref_stride,                \
+                            rows, second_pred);                              \
+}
+
+/* Defines vpx_sad64x<rows>_avg_msa() as a call to avgsad_64width_msa(). */
+#define VPX_AVGSAD_64xHEIGHT_MSA(rows)                                       \
+uint32_t vpx_sad64x##rows##_avg_msa(const uint8_t *src, int32_t src_stride,  \
+    const uint8_t *ref, int32_t ref_stride, const uint8_t *second_pred) {    \
+  return avgsad_64width_msa(src, src_stride, ref, ref_stride,                \
+                            rows, second_pred);                              \
+}
+
+// 64x64 blocks: expands the plain, x3, x8, x4D and AVGSAD wrapper macros
+VPX_SAD_64xHEIGHT_MSA(64);
+VPX_SAD_64xHEIGHTx3_MSA(64);
+VPX_SAD_64xHEIGHTx8_MSA(64);
+VPX_SAD_64xHEIGHTx4D_MSA(64);
+VPX_AVGSAD_64xHEIGHT_MSA(64);
+
+// 64x32 blocks: 64-wide macro family with a height argument of 32
+VPX_SAD_64xHEIGHT_MSA(32);
+VPX_SAD_64xHEIGHTx3_MSA(32);
+VPX_SAD_64xHEIGHTx8_MSA(32);
+VPX_SAD_64xHEIGHTx4D_MSA(32);
+VPX_AVGSAD_64xHEIGHT_MSA(32);
+
+// 32x64 blocks: 32-wide macro family with a height argument of 64
+VPX_SAD_32xHEIGHT_MSA(64);
+VPX_SAD_32xHEIGHTx3_MSA(64);
+VPX_SAD_32xHEIGHTx8_MSA(64);
+VPX_SAD_32xHEIGHTx4D_MSA(64);
+VPX_AVGSAD_32xHEIGHT_MSA(64);
+
+// 32x32 blocks: 32-wide macro family with a height argument of 32
+VPX_SAD_32xHEIGHT_MSA(32);
+VPX_SAD_32xHEIGHTx3_MSA(32);
+VPX_SAD_32xHEIGHTx8_MSA(32);
+VPX_SAD_32xHEIGHTx4D_MSA(32);
+VPX_AVGSAD_32xHEIGHT_MSA(32);
+
+// 32x16 blocks: 32-wide macro family with a height argument of 16
+VPX_SAD_32xHEIGHT_MSA(16);
+VPX_SAD_32xHEIGHTx3_MSA(16);
+VPX_SAD_32xHEIGHTx8_MSA(16);
+VPX_SAD_32xHEIGHTx4D_MSA(16);
+VPX_AVGSAD_32xHEIGHT_MSA(16);
+
+// 16x32 blocks: 16-wide macro family with a height argument of 32
+VPX_SAD_16xHEIGHT_MSA(32);
+VPX_SAD_16xHEIGHTx3_MSA(32);
+VPX_SAD_16xHEIGHTx8_MSA(32);
+VPX_SAD_16xHEIGHTx4D_MSA(32);
+VPX_AVGSAD_16xHEIGHT_MSA(32);
+
+// 16x16 blocks: 16-wide macro family with a height argument of 16
+VPX_SAD_16xHEIGHT_MSA(16);
+VPX_SAD_16xHEIGHTx3_MSA(16);
+VPX_SAD_16xHEIGHTx8_MSA(16);
+VPX_SAD_16xHEIGHTx4D_MSA(16);
+VPX_AVGSAD_16xHEIGHT_MSA(16);
+
+// 16x8 blocks: 16-wide macro family with a height argument of 8
+VPX_SAD_16xHEIGHT_MSA(8);
+VPX_SAD_16xHEIGHTx3_MSA(8);
+VPX_SAD_16xHEIGHTx8_MSA(8);
+VPX_SAD_16xHEIGHTx4D_MSA(8);
+VPX_AVGSAD_16xHEIGHT_MSA(8);
+
+// 8x16 blocks: 8-wide macro family with a height argument of 16
+VPX_SAD_8xHEIGHT_MSA(16);
+VPX_SAD_8xHEIGHTx3_MSA(16);
+VPX_SAD_8xHEIGHTx8_MSA(16);
+VPX_SAD_8xHEIGHTx4D_MSA(16);
+VPX_AVGSAD_8xHEIGHT_MSA(16);
+
+// 8x8 blocks: 8-wide macro family with a height argument of 8
+VPX_SAD_8xHEIGHT_MSA(8);
+VPX_SAD_8xHEIGHTx3_MSA(8);
+VPX_SAD_8xHEIGHTx8_MSA(8);
+VPX_SAD_8xHEIGHTx4D_MSA(8);
+VPX_AVGSAD_8xHEIGHT_MSA(8);
+
+// 8x4 blocks: 8-wide macro family with a height argument of 4
+VPX_SAD_8xHEIGHT_MSA(4);
+VPX_SAD_8xHEIGHTx3_MSA(4);
+VPX_SAD_8xHEIGHTx8_MSA(4);
+VPX_SAD_8xHEIGHTx4D_MSA(4);
+VPX_AVGSAD_8xHEIGHT_MSA(4);
+
+// 4x8 blocks: 4-wide macro family with a height argument of 8
+VPX_SAD_4xHEIGHT_MSA(8);
+VPX_SAD_4xHEIGHTx3_MSA(8);
+VPX_SAD_4xHEIGHTx8_MSA(8);
+VPX_SAD_4xHEIGHTx4D_MSA(8);
+VPX_AVGSAD_4xHEIGHT_MSA(8);
+
+// 4x4 blocks: 4-wide macro family with a height argument of 4
+VPX_SAD_4xHEIGHT_MSA(4);
+VPX_SAD_4xHEIGHTx3_MSA(4);
+VPX_SAD_4xHEIGHTx8_MSA(4);
+VPX_SAD_4xHEIGHTx4D_MSA(4);
+VPX_AVGSAD_4xHEIGHT_MSA(4);
diff --git a/libs/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c b/libs/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c
new file mode 100644
index 0000000000..a592a2d078
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c
@@ -0,0 +1,1952 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/variance.h"
+
+/* Two-tap bilinear filter table: row k holds { 128 - 16 * k, 16 * k }, so
+ * the two taps of every row sum to 128.
+ * NOTE(review): presumably one row is picked per sub-pixel offset and handed
+ * to the kernels in this file as `filter` -- the lookup is not in view. */
+static const uint8_t bilinear_filters_msa[8][2] = {
+  { 128, 0 }, { 112, 16 },
+  { 96, 32 }, { 80, 48 },
+  { 64, 64 }, { 48, 80 },
+  { 32, 96 }, { 16, 112 }
+};
+
+/* Adds the squared src/ref differences into `var` and the plain 16-bit
+ * differences into `sub` (per the HSUB/DPADD helper names -- not in view). */
+#define CALC_MSE_AVG_B(src, ref, var, sub) {                       \
+  v16u8 pair_r_m, pair_l_m;                                        \
+  v8i16 diff_r_m, diff_l_m;                                        \
+  ILVRL_B2_UB(src, ref, pair_r_m, pair_l_m);                       \
+  HSUB_UB2_SH(pair_r_m, pair_l_m, diff_r_m, diff_l_m);             \
+  DPADD_SH2_SW(diff_r_m, diff_l_m, diff_r_m, diff_l_m, var, var);  \
+  sub += diff_r_m + diff_l_m;                                      \
+}
+
+#define VARIANCE_WxH(sse, diff, shift) /* sse - ((diff * diff) >> shift) */ \
+  ((sse) - (((uint32_t)(diff) * (diff)) >> (shift)))  /* uint32_t square */
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) /* as VARIANCE_WxH, 64-bit */ \
+  ((sse) - (((int64_t)(diff) * (diff)) >> (shift)))  /* int64_t square */
+
+/* SSE and difference sum between avg(src, sec_pred) and ref for blocks 4
+ * pixels wide.  Each pass takes 16 bytes of sec_pred and, through LW4, four
+ * 32-bit words of src and of ref (stepping 4 * stride), packed into one
+ * vector apiece.  Returns the HADD_SW_S32 reduction of the SSE lanes and
+ * stores that of the difference lanes in *diff; runs height >> 2 passes. */
+static uint32_t avg_sse_diff_4width_msa(
+    const uint8_t *src_ptr, int32_t src_stride,
+    const uint8_t *ref_ptr, int32_t ref_stride,
+    const uint8_t *sec_pred, int32_t height, int32_t *diff) {
+  int32_t rows4 = height >> 2;
+  uint32_t s0, s1, s2, s3;
+  uint32_t r0, r1, r2, r3;
+  v16u8 pred_vec, src_vec = { 0 }, ref_vec = { 0 };
+  v8i16 sum_acc = { 0 };
+  v4i32 sum_w, sse_acc = { 0 };
+
+  while (rows4--) {
+    pred_vec = LD_UB(sec_pred);
+    sec_pred += 16;
+    LW4(src_ptr, src_stride, s0, s1, s2, s3);
+    src_ptr += (4 * src_stride);
+    LW4(ref_ptr, ref_stride, r0, r1, r2, r3);
+    ref_ptr += (4 * ref_stride);
+
+    INSERT_W4_UB(s0, s1, s2, s3, src_vec);
+    INSERT_W4_UB(r0, r1, r2, r3, ref_vec);
+
+    src_vec = __msa_aver_u_b(src_vec, pred_vec);
+    CALC_MSE_AVG_B(src_vec, ref_vec, sse_acc, sum_acc);
+  }
+
+  sum_w = __msa_hadd_s_w(sum_acc, sum_acc);
+  *diff = HADD_SW_S32(sum_w);
+  return HADD_SW_S32(sse_acc);
+}
+
+/* SSE and difference sum between avg(src, sec_pred) and ref for blocks 8
+ * pixels wide.  Each pass loads 4 rows of src and ref plus 32 bytes of
+ * sec_pred; PCKEV_D4_UB reduces the 4 row vectors of each image to 2
+ * (presumably two 8-byte rows per vector).  Returns the reduced SSE lanes;
+ * *diff receives the reduced difference lanes.  Runs height >> 2 passes. */
+static uint32_t avg_sse_diff_8width_msa(
+    const uint8_t *src_ptr, int32_t src_stride,
+    const uint8_t *ref_ptr, int32_t ref_stride,
+    const uint8_t *sec_pred, int32_t height, int32_t *diff) {
+  int32_t rows4 = height >> 2;
+  v16u8 s0, s1, s2, s3, r0, r1, r2, r3;
+  v16u8 p0, p1;
+  v8i16 sum_acc = { 0 };
+  v4i32 sum_w, sse_acc = { 0 };
+
+  while (rows4--) {
+    LD_UB2(sec_pred, 16, p0, p1);
+    sec_pred += 32;
+    LD_UB4(src_ptr, src_stride, s0, s1, s2, s3);
+    src_ptr += (4 * src_stride);
+    LD_UB4(ref_ptr, ref_stride, r0, r1, r2, r3);
+    ref_ptr += (4 * ref_stride);
+
+    /* outputs alias inputs: s0/s1 and r0/r1 are overwritten in place */
+    PCKEV_D4_UB(s1, s0, s3, s2, r1, r0, r3, r2, s0, s1, r0, r1);
+    AVER_UB2_UB(s0, p0, s1, p1, s0, s1);
+    CALC_MSE_AVG_B(s0, r0, sse_acc, sum_acc);
+    CALC_MSE_AVG_B(s1, r1, sse_acc, sum_acc);
+  }
+
+  sum_w = __msa_hadd_s_w(sum_acc, sum_acc);
+  *diff = HADD_SW_S32(sum_w);
+  return HADD_SW_S32(sse_acc);
+}
+
+/* SSE and difference sum between avg(src, sec_pred) and ref for blocks 16
+ * pixels wide (one vector per row).  The body is unrolled over 4 rows, so
+ * height >> 2 passes are made.  Returns the SSE; *diff gets the sum. */
+static uint32_t avg_sse_diff_16width_msa(
+    const uint8_t *src_ptr, int32_t src_stride,
+    const uint8_t *ref_ptr, int32_t ref_stride,
+    const uint8_t *sec_pred, int32_t height, int32_t *diff) {
+  int32_t rows4 = height >> 2;
+  v16u8 src_row, ref_row, pred_row;
+  v8i16 sum_acc = { 0 };
+  v4i32 sum_w, sse_acc = { 0 };
+
+  while (rows4--) {
+    /* row 1 of 4 */
+    pred_row = LD_UB(sec_pred);
+    sec_pred += 16;
+    src_row = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref_row = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    src_row = __msa_aver_u_b(src_row, pred_row);
+    CALC_MSE_AVG_B(src_row, ref_row, sse_acc, sum_acc);
+    /* row 2 of 4 */
+    pred_row = LD_UB(sec_pred);
+    sec_pred += 16;
+    src_row = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref_row = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    src_row = __msa_aver_u_b(src_row, pred_row);
+    CALC_MSE_AVG_B(src_row, ref_row, sse_acc, sum_acc);
+    /* row 3 of 4 */
+    pred_row = LD_UB(sec_pred);
+    sec_pred += 16;
+    src_row = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref_row = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    src_row = __msa_aver_u_b(src_row, pred_row);
+    CALC_MSE_AVG_B(src_row, ref_row, sse_acc, sum_acc);
+    /* row 4 of 4 */
+    pred_row = LD_UB(sec_pred);
+    sec_pred += 16;
+    src_row = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref_row = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    src_row = __msa_aver_u_b(src_row, pred_row);
+    CALC_MSE_AVG_B(src_row, ref_row, sse_acc, sum_acc);
+  }
+
+  sum_w = __msa_hadd_s_w(sum_acc, sum_acc);
+  *diff = HADD_SW_S32(sum_w);
+  return HADD_SW_S32(sse_acc);
+}
+
+/* SSE and difference sum between avg(src, sec_pred) and ref for blocks 32
+ * pixels wide (two vectors per row, 32 bytes of sec_pred per row).  Unrolled
+ * over 4 rows; height >> 2 passes.  Returns the SSE; *diff gets the sum. */
+static uint32_t avg_sse_diff_32width_msa(
+    const uint8_t *src_ptr, int32_t src_stride,
+    const uint8_t *ref_ptr, int32_t ref_stride,
+    const uint8_t *sec_pred, int32_t height, int32_t *diff) {
+  int32_t rows4 = height >> 2;
+  v16u8 s0, s1, r0, r1, p0, p1;
+  v8i16 sum_acc = { 0 };
+  v4i32 sum_w, sse_acc = { 0 };
+
+  while (rows4--) {
+    /* row 1 of 4 */
+    LD_UB2(sec_pred, 16, p0, p1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, s0, s1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, r0, r1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(s0, p0, s1, p1, s0, s1);
+    CALC_MSE_AVG_B(s0, r0, sse_acc, sum_acc);
+    CALC_MSE_AVG_B(s1, r1, sse_acc, sum_acc);
+    /* row 2 of 4 */
+    LD_UB2(sec_pred, 16, p0, p1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, s0, s1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, r0, r1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(s0, p0, s1, p1, s0, s1);
+    CALC_MSE_AVG_B(s0, r0, sse_acc, sum_acc);
+    CALC_MSE_AVG_B(s1, r1, sse_acc, sum_acc);
+    /* row 3 of 4 */
+    LD_UB2(sec_pred, 16, p0, p1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, s0, s1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, r0, r1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(s0, p0, s1, p1, s0, s1);
+    CALC_MSE_AVG_B(s0, r0, sse_acc, sum_acc);
+    CALC_MSE_AVG_B(s1, r1, sse_acc, sum_acc);
+    /* row 4 of 4 */
+    LD_UB2(sec_pred, 16, p0, p1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, s0, s1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, r0, r1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(s0, p0, s1, p1, s0, s1);
+    CALC_MSE_AVG_B(s0, r0, sse_acc, sum_acc);
+    CALC_MSE_AVG_B(s1, r1, sse_acc, sum_acc);
+  }
+
+  sum_w = __msa_hadd_s_w(sum_acc, sum_acc);
+  *diff = HADD_SW_S32(sum_w);
+  return HADD_SW_S32(sse_acc);
+}
+
+/* avg_sse_diff for a fixed 32x64 block: 16 passes of 4 unrolled rows.  The
+ * left and right 16-column halves use separate 16-bit difference
+ * accumulators (presumably to avoid lane overflow), summed at the end. */
+static uint32_t avg_sse_diff_32x64_msa(
+    const uint8_t *src_ptr, int32_t src_stride,
+    const uint8_t *ref_ptr, int32_t ref_stride,
+    const uint8_t *sec_pred, int32_t *diff) {
+  int32_t pass = 16;
+  v16u8 s0, s1, r0, r1, p0, p1;
+  v8i16 sum_acc0 = { 0 }, sum_acc1 = { 0 };
+  v4i32 sum_w, sse_acc = { 0 };
+
+  while (pass--) {
+    /* row 1 of 4 */
+    LD_UB2(sec_pred, 16, p0, p1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, s0, s1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, r0, r1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(s0, p0, s1, p1, s0, s1);
+    CALC_MSE_AVG_B(s0, r0, sse_acc, sum_acc0);
+    CALC_MSE_AVG_B(s1, r1, sse_acc, sum_acc1);
+    /* row 2 of 4 */
+    LD_UB2(sec_pred, 16, p0, p1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, s0, s1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, r0, r1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(s0, p0, s1, p1, s0, s1);
+    CALC_MSE_AVG_B(s0, r0, sse_acc, sum_acc0);
+    CALC_MSE_AVG_B(s1, r1, sse_acc, sum_acc1);
+    /* row 3 of 4 */
+    LD_UB2(sec_pred, 16, p0, p1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, s0, s1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, r0, r1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(s0, p0, s1, p1, s0, s1);
+    CALC_MSE_AVG_B(s0, r0, sse_acc, sum_acc0);
+    CALC_MSE_AVG_B(s1, r1, sse_acc, sum_acc1);
+    /* row 4 of 4 */
+    LD_UB2(sec_pred, 16, p0, p1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, s0, s1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, r0, r1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(s0, p0, s1, p1, s0, s1);
+    CALC_MSE_AVG_B(s0, r0, sse_acc, sum_acc0);
+    CALC_MSE_AVG_B(s1, r1, sse_acc, sum_acc1);
+  }
+
+  sum_w = __msa_hadd_s_w(sum_acc0, sum_acc0);
+  sum_w += __msa_hadd_s_w(sum_acc1, sum_acc1);
+  *diff = HADD_SW_S32(sum_w);
+  return HADD_SW_S32(sse_acc);
+}
+
+/* avg_sse_diff for a fixed 64x32 block: 16 passes of 2 unrolled rows, each
+ * row being four 16-byte vectors of src, ref and sec_pred.  Vectors 0 and 2
+ * accumulate differences into sum_acc0, vectors 1 and 3 into sum_acc1
+ * (presumably to keep the 16-bit lanes from overflowing).  Returns the
+ * reduced SSE lanes; *diff receives the reduced difference lanes. */
+static uint32_t avg_sse_diff_64x32_msa(
+    const uint8_t *src_ptr, int32_t src_stride,
+    const uint8_t *ref_ptr, int32_t ref_stride,
+    const uint8_t *sec_pred, int32_t *diff) {
+  int32_t pass = 16;
+  v16u8 s0, s1, s2, s3;
+  v16u8 r0, r1, r2, r3;
+  v16u8 p0, p1, p2, p3;
+  v8i16 sum_acc0 = { 0 }, sum_acc1 = { 0 };
+  v4i32 sum_w, sse_acc = { 0 };
+
+  while (pass--) {
+    /* row 1 of 2 */
+    LD_UB4(sec_pred, 16, p0, p1, p2, p3);
+    sec_pred += 64;
+    LD_UB4(src_ptr, 16, s0, s1, s2, s3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, r0, r1, r2, r3);
+    ref_ptr += ref_stride;
+    AVER_UB4_UB(s0, p0, s1, p1, s2, p2, s3, p3, s0, s1, s2, s3);
+    CALC_MSE_AVG_B(s0, r0, sse_acc, sum_acc0);
+    CALC_MSE_AVG_B(s2, r2, sse_acc, sum_acc0);
+    CALC_MSE_AVG_B(s1, r1, sse_acc, sum_acc1);
+    CALC_MSE_AVG_B(s3, r3, sse_acc, sum_acc1);
+    /* row 2 of 2 */
+    LD_UB4(sec_pred, 16, p0, p1, p2, p3);
+    sec_pred += 64;
+    LD_UB4(src_ptr, 16, s0, s1, s2, s3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, r0, r1, r2, r3);
+    ref_ptr += ref_stride;
+    AVER_UB4_UB(s0, p0, s1, p1, s2, p2, s3, p3, s0, s1, s2, s3);
+    CALC_MSE_AVG_B(s0, r0, sse_acc, sum_acc0);
+    CALC_MSE_AVG_B(s2, r2, sse_acc, sum_acc0);
+    CALC_MSE_AVG_B(s1, r1, sse_acc, sum_acc1);
+    CALC_MSE_AVG_B(s3, r3, sse_acc, sum_acc1);
+  }
+
+  sum_w = __msa_hadd_s_w(sum_acc0, sum_acc0);
+  sum_w += __msa_hadd_s_w(sum_acc1, sum_acc1);
+  *diff = HADD_SW_S32(sum_w);
+
+  return HADD_SW_S32(sse_acc);
+}
+
+/* avg_sse_diff for a fixed 64x64 block: 32 passes of 2 unrolled rows, each
+ * row being four 16-byte vectors of src, ref and sec_pred.  Every 16-column
+ * strip has its own 16-bit difference accumulator (presumably so the lanes
+ * cannot overflow over 64 rows); the four are widened and summed at the
+ * end.  Returns the reduced SSE lanes; *diff gets the difference sum. */
+static uint32_t avg_sse_diff_64x64_msa(
+    const uint8_t *src_ptr, int32_t src_stride,
+    const uint8_t *ref_ptr, int32_t ref_stride,
+    const uint8_t *sec_pred, int32_t *diff) {
+  int32_t pass = 32;
+  v16u8 s0, s1, s2, s3;
+  v16u8 r0, r1, r2, r3;
+  v16u8 p0, p1, p2, p3;
+  v8i16 sum_acc0 = { 0 }, sum_acc1 = { 0 };
+  v8i16 sum_acc2 = { 0 }, sum_acc3 = { 0 };
+  v4i32 sum_w, sse_acc = { 0 };
+
+  while (pass--) {
+    /* row 1 of 2 */
+    LD_UB4(sec_pred, 16, p0, p1, p2, p3);
+    sec_pred += 64;
+    LD_UB4(src_ptr, 16, s0, s1, s2, s3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, r0, r1, r2, r3);
+    ref_ptr += ref_stride;
+    AVER_UB4_UB(s0, p0, s1, p1, s2, p2, s3, p3, s0, s1, s2, s3);
+    CALC_MSE_AVG_B(s0, r0, sse_acc, sum_acc0);
+    CALC_MSE_AVG_B(s1, r1, sse_acc, sum_acc1);
+    CALC_MSE_AVG_B(s2, r2, sse_acc, sum_acc2);
+    CALC_MSE_AVG_B(s3, r3, sse_acc, sum_acc3);
+    /* row 2 of 2 */
+    LD_UB4(sec_pred, 16, p0, p1, p2, p3);
+    sec_pred += 64;
+    LD_UB4(src_ptr, 16, s0, s1, s2, s3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, r0, r1, r2, r3);
+    ref_ptr += ref_stride;
+    AVER_UB4_UB(s0, p0, s1, p1, s2, p2, s3, p3, s0, s1, s2, s3);
+    CALC_MSE_AVG_B(s0, r0, sse_acc, sum_acc0);
+    CALC_MSE_AVG_B(s1, r1, sse_acc, sum_acc1);
+    CALC_MSE_AVG_B(s2, r2, sse_acc, sum_acc2);
+    CALC_MSE_AVG_B(s3, r3, sse_acc, sum_acc3);
+  }
+
+  sum_w = __msa_hadd_s_w(sum_acc0, sum_acc0);
+  sum_w += __msa_hadd_s_w(sum_acc1, sum_acc1);
+  sum_w += __msa_hadd_s_w(sum_acc2, sum_acc2);
+  sum_w += __msa_hadd_s_w(sum_acc3, sum_acc3);
+  *diff = HADD_SW_S32(sum_w);
+
+  return HADD_SW_S32(sse_acc);
+}
+
+/* Horizontal two-tap filter of a 4-pixel-wide block followed by
+ * SSE/difference accumulation against dst.  The shuffle mask pairs each
+ * byte with its right-hand neighbour, so (presumably) one dot product with
+ * the tap pair read via LH from `filter` yields a filtered pixel, which is
+ * then rounded by FILTER_BITS.  Four rows per pass, height >> 2 passes.
+ * Returns the reduced SSE lanes; *diff gets the reduced difference lanes. */
+static uint32_t sub_pixel_sse_diff_4width_h_msa(
+    const uint8_t *src, int32_t src_stride,
+    const uint8_t *dst, int32_t dst_stride,
+    const uint8_t *filter, int32_t height, int32_t *diff) {
+  int16_t tap_pair;
+  uint32_t rows4 = (height >> 2);
+  uint32_t d0, d1, d2, d3;
+  v16u8 taps, dst_vec = { 0 };
+  v16i8 s0, s1, s2, s3;
+  v16i8 pair_mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 f0, f1, f2, f3;
+  v8i16 sum_acc = { 0 };
+  v4i32 sum_w, sse_acc = { 0 };
+
+  tap_pair = LH(filter);
+  taps = (v16u8)__msa_fill_h(tap_pair);
+
+  while (rows4--) {
+    LD_SB4(src, src_stride, s0, s1, s2, s3);
+    src += (4 * src_stride);
+    LW4(dst, dst_stride, d0, d1, d2, d3);
+    dst += (4 * dst_stride);
+    INSERT_W4_UB(d0, d1, d2, d3, dst_vec);
+    VSHF_B2_UH(s0, s0, s1, s1, pair_mask, pair_mask, f0, f1);
+    VSHF_B2_UH(s2, s2, s3, s3, pair_mask, pair_mask, f2, f3);
+    DOTP_UB4_UH(f0, f1, f2, f3, taps, taps, taps, taps, f0, f1, f2, f3);
+    SRARI_H4_UH(f0, f1, f2, f3, FILTER_BITS);
+    PCKEV_B4_SB(f0, f0, f1, f1, f2, f2, f3, f3, s0, s1, s2, s3);
+    ILVEV_W2_SB(s0, s1, s2, s3, s0, s2);
+    s0 = (v16i8)__msa_ilvev_d((v2i64)s2, (v2i64)s0);
+    CALC_MSE_AVG_B(s0, dst_vec, sse_acc, sum_acc);
+  }
+
+  sum_w = __msa_hadd_s_w(sum_acc, sum_acc);
+  *diff = HADD_SW_S32(sum_w);
+  return HADD_SW_S32(sse_acc);
+}
+
+/* Horizontal two-tap filter of an 8-pixel-wide block followed by
+ * SSE/difference accumulation against dst.  Each pass filters 4 src rows
+ * (neighbour-pairing shuffle, dot product with the tap pair from `filter`,
+ * rounding by FILTER_BITS) and compares them two rows at a time.  Returns
+ * the reduced SSE lanes; *diff receives the reduced difference lanes. */
+static uint32_t sub_pixel_sse_diff_8width_h_msa(
+    const uint8_t *src, int32_t src_stride,
+    const uint8_t *dst, int32_t dst_stride,
+    const uint8_t *filter, int32_t height, int32_t *diff) {
+  int16_t tap_pair;
+  uint32_t rows4 = (height >> 2);
+  v16u8 taps, packed, d0, d1, d2, d3;
+  v16i8 s0, s1, s2, s3;
+  v16i8 pair_mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 f0, f1, f2, f3;
+  v8i16 sum_acc = { 0 };
+  v4i32 sum_w, sse_acc = { 0 };
+
+  tap_pair = LH(filter);
+  taps = (v16u8)__msa_fill_h(tap_pair);
+
+  while (rows4--) {
+    LD_SB4(src, src_stride, s0, s1, s2, s3);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, d0, d1, d2, d3);
+    dst += (4 * dst_stride);
+
+    /* d0/d1 are reused as outputs (presumably two 8-byte dst rows each) */
+    PCKEV_D2_UB(d1, d0, d3, d2, d0, d1);
+    VSHF_B2_UH(s0, s0, s1, s1, pair_mask, pair_mask, f0, f1);
+    VSHF_B2_UH(s2, s2, s3, s3, pair_mask, pair_mask, f2, f3);
+    DOTP_UB4_UH(f0, f1, f2, f3, taps, taps, taps, taps, f0, f1, f2, f3);
+    SRARI_H4_UH(f0, f1, f2, f3, FILTER_BITS);
+    PCKEV_B4_SB(f0, f0, f1, f1, f2, f2, f3, f3, s0, s1, s2, s3);
+    packed = (v16u8)__msa_ilvev_d((v2i64)s1, (v2i64)s0);
+    CALC_MSE_AVG_B(packed, d0, sse_acc, sum_acc);
+    packed = (v16u8)__msa_ilvev_d((v2i64)s3, (v2i64)s2);
+    CALC_MSE_AVG_B(packed, d1, sse_acc, sum_acc);
+  }
+
+  sum_w = __msa_hadd_s_w(sum_acc, sum_acc);
+  *diff = HADD_SW_S32(sum_w);
+  return HADD_SW_S32(sse_acc);
+}
+
+/* Horizontal two-tap filter of a 16-pixel-wide block, then SSE/difference
+ * accumulation of the filtered rows against dst.  Four rows per pass; each
+ * row is loaded as two overlapping vectors (src and src + 8) because the
+ * shuffle mask only reaches byte 8 of a vector.  Returns the reduced SSE
+ * lanes and stores the reduced difference lanes in *diff. */
+static uint32_t sub_pixel_sse_diff_16width_h_msa(
+    const uint8_t *src, int32_t src_stride,
+    const uint8_t *dst, int32_t dst_stride,
+    const uint8_t *filter, int32_t height, int32_t *diff) {
+  int16_t tap_pair;
+  uint32_t rows4 = (height >> 2);
+  v16i8 s0, s1, s2, s3, s4, s5, s6, s7;
+  v16i8 pair_mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v16u8 d0, d1, d2, d3, taps;
+  v8u16 v0, v1, v2, v3, v4, v5, v6, v7;
+  v8u16 f0, f1, f2, f3, f4, f5, f6, f7;
+  v8i16 sum_acc = { 0 };
+  v4i32 sum_w, sse_acc = { 0 };
+
+  tap_pair = LH(filter);
+  taps = (v16u8)__msa_fill_h(tap_pair);
+
+  while (rows4--) {
+    /* s0/s2/s4/s6 start at src, s1/s3/s5/s7 at src + 8: one pair per row */
+    LD_SB4(src, src_stride, s0, s2, s4, s6);
+    LD_SB4(src + 8, src_stride, s1, s3, s5, s7);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, d0, d1, d2, d3);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(s0, s0, s1, s1, pair_mask, pair_mask, v0, v1);
+    VSHF_B2_UH(s2, s2, s3, s3, pair_mask, pair_mask, v2, v3);
+    VSHF_B2_UH(s4, s4, s5, s5, pair_mask, pair_mask, v4, v5);
+    VSHF_B2_UH(s6, s6, s7, s7, pair_mask, pair_mask, v6, v7);
+    DOTP_UB4_UH(v0, v1, v2, v3, taps, taps, taps, taps, f0, f1, f2, f3);
+    DOTP_UB4_UH(v4, v5, v6, v7, taps, taps, taps, taps, f4, f5, f6, f7);
+    SRARI_H4_UH(f0, f1, f2, f3, FILTER_BITS);
+    SRARI_H4_UH(f4, f5, f6, f7, FILTER_BITS);
+    /* s0..s3 are reused for the packed results, one per dst row vector */
+    PCKEV_B4_SB(f1, f0, f3, f2, f5, f4, f7, f6, s0, s1, s2, s3);
+    CALC_MSE_AVG_B(s0, d0, sse_acc, sum_acc);
+    CALC_MSE_AVG_B(s1, d1, sse_acc, sum_acc);
+    CALC_MSE_AVG_B(s2, d2, sse_acc, sum_acc);
+    CALC_MSE_AVG_B(s3, d3, sse_acc, sum_acc);
+  }
+
+  sum_w = __msa_hadd_s_w(sum_acc, sum_acc);
+  *diff = HADD_SW_S32(sum_w);
+  return HADD_SW_S32(sse_acc);
+}
+
+/* 32-pixel-wide horizontal sub-pixel SSE: runs the 16-wide kernel on the
+ * left and right 16-column halves and adds up their results.  Returns the
+ * summed SSE; *diff receives the sum of the two halves' differences. */
+static uint32_t sub_pixel_sse_diff_32width_h_msa(
+    const uint8_t *src, int32_t src_stride,
+    const uint8_t *dst, int32_t dst_stride,
+    const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t total_sse = 0;
+  int32_t half_diff[2];
+  int32_t col;
+
+  for (col = 0; col < 32; col += 16) {
+    total_sse += sub_pixel_sse_diff_16width_h_msa(src + col, src_stride,
+                                                  dst + col, dst_stride,
+                                                  filter, height,
+                                                  &half_diff[col >> 4]);
+  }
+
+  *diff = half_diff[0] + half_diff[1];
+  return total_sse;
+}
+
+/* 64-pixel-wide horizontal sub-pixel SSE: runs the 16-wide kernel on each
+ * of the four 16-column strips and adds up their results.  Returns the
+ * summed SSE; *diff receives the sum of the four strips' differences. */
+static uint32_t sub_pixel_sse_diff_64width_h_msa(
+    const uint8_t *src, int32_t src_stride,
+    const uint8_t *dst, int32_t dst_stride,
+    const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t total_sse = 0;
+  int32_t strip_diff[4];
+  int32_t col;
+
+  for (col = 0; col < 64; col += 16) {
+    total_sse += sub_pixel_sse_diff_16width_h_msa(src + col, src_stride,
+                                                  dst + col, dst_stride,
+                                                  filter, height,
+                                                  &strip_diff[col >> 4]);
+  }
+
+  *diff = strip_diff[0] + strip_diff[1] + strip_diff[2] + strip_diff[3];
+  return total_sse;
+}
+
+/* Vertical two-tap filter of a 4-pixel-wide block, then SSE/difference
+ * accumulation against dst.  One row is loaded up front; each pass loads 4
+ * more, interleaves vertically adjacent rows so a dot product with the tap
+ * pair (read via LH from `filter`) combines them, and keeps the last row
+ * for the next pass.  Returns the reduced SSE lanes; *diff gets the sum. */
+static uint32_t sub_pixel_sse_diff_4width_v_msa(
+    const uint8_t *src, int32_t src_stride,
+    const uint8_t *dst, int32_t dst_stride,
+    const uint8_t *filter, int32_t height, int32_t *diff) {
+  int16_t tap_pair;
+  uint32_t rows4 = (height >> 2);
+  uint32_t d0, d1, d2, d3;
+  v16u8 row0, row1, row2, row3, row4, filtered;
+  v16u8 pair10, pair21, pair32, pair43;
+  v16u8 pairs_lo, pairs_hi, taps, dst_vec = { 0 };
+  v8u16 f0, f1;
+  v8i16 sum_acc = { 0 };
+  v4i32 sum_w, sse_acc = { 0 };
+
+  tap_pair = LH(filter);
+  taps = (v16u8)__msa_fill_h(tap_pair);
+
+  row0 = LD_UB(src);
+  src += src_stride;
+
+  while (rows4--) {
+    LD_UB4(src, src_stride, row1, row2, row3, row4);
+    src += (4 * src_stride);
+    LW4(dst, dst_stride, d0, d1, d2, d3);
+    dst += (4 * dst_stride);
+
+    INSERT_W4_UB(d0, d1, d2, d3, dst_vec);
+    ILVR_B4_UB(row1, row0, row2, row1, row3, row2, row4, row3,
+               pair10, pair21, pair32, pair43);
+    ILVR_D2_UB(pair21, pair10, pair43, pair32, pairs_lo, pairs_hi);
+    DOTP_UB2_UH(pairs_lo, pairs_hi, taps, taps, f0, f1);
+    SRARI_H2_UH(f0, f1, FILTER_BITS);
+    filtered = (v16u8)__msa_pckev_b((v16i8)f1, (v16i8)f0);
+    CALC_MSE_AVG_B(filtered, dst_vec, sse_acc, sum_acc);
+    /* the last loaded row is the first input row of the next pass */
+    row0 = row4;
+  }
+
+  sum_w = __msa_hadd_s_w(sum_acc, sum_acc);
+  *diff = HADD_SW_S32(sum_w);
+  return HADD_SW_S32(sse_acc);
+}
+
+/* Vertical two-tap filter of an 8-pixel-wide block, then SSE/difference
+ * accumulation against dst.  One row is loaded up front; each pass loads 4
+ * more src rows and 4 dst rows, interleaves vertically adjacent src rows,
+ * applies the tap pair from `filter` and rounds by FILTER_BITS.  Returns
+ * the reduced SSE lanes; *diff receives the reduced difference lanes. */
+static uint32_t sub_pixel_sse_diff_8width_v_msa(
+    const uint8_t *src, int32_t src_stride,
+    const uint8_t *dst, int32_t dst_stride,
+    const uint8_t *filter, int32_t height, int32_t *diff) {
+  int16_t tap_pair;
+  uint32_t rows4 = (height >> 2);
+  v16u8 row0, row1, row2, row3, row4, taps;
+  v16u8 d0, d1, d2, d3;
+  v8u16 il0, il1, il2, il3;
+  v8u16 f0, f1, f2, f3;
+  v8i16 sum_acc = { 0 };
+  v4i32 sum_w, sse_acc = { 0 };
+
+  tap_pair = LH(filter);
+  taps = (v16u8)__msa_fill_h(tap_pair);
+
+  row0 = LD_UB(src);
+  src += src_stride;
+
+  while (rows4--) {
+    LD_UB4(src, src_stride, row1, row2, row3, row4);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, d0, d1, d2, d3);
+    dst += (4 * dst_stride);
+
+    PCKEV_D2_UB(d1, d0, d3, d2, d0, d1);
+    ILVR_B4_UH(row1, row0, row2, row1, row3, row2, row4, row3,
+               il0, il1, il2, il3);
+    DOTP_UB4_UH(il0, il1, il2, il3, taps, taps, taps, taps, f0, f1, f2, f3);
+    SRARI_H4_UH(f0, f1, f2, f3, FILTER_BITS);
+    PCKEV_B2_UB(f1, f0, f3, f2, row0, row1);
+    CALC_MSE_AVG_B(row0, d0, sse_acc, sum_acc);
+    CALC_MSE_AVG_B(row1, d1, sse_acc, sum_acc);
+    /* row0/row1 held packed results; reseed row0 with the last src row */
+    row0 = row4;
+  }
+
+  sum_w = __msa_hadd_s_w(sum_acc, sum_acc);
+  *diff = HADD_SW_S32(sum_w);
+  return HADD_SW_S32(sse_acc);
+}
+
+/* Vertical two-tap filter of a 16-pixel-wide block, then SSE/difference
+ * accumulation against dst.  One row is loaded up front; each pass loads 4
+ * more and produces 4 output rows, each from the right/left interleave of
+ * two vertically adjacent rows, the tap pair from `filter` and FILTER_BITS
+ * rounding.  Returns the reduced SSE lanes; *diff gets the difference sum. */
+static uint32_t sub_pixel_sse_diff_16width_v_msa(
+    const uint8_t *src, int32_t src_stride,
+    const uint8_t *dst, int32_t dst_stride,
+    const uint8_t *filter, int32_t height, int32_t *diff) {
+  int16_t tap_pair;
+  uint32_t rows4 = (height >> 2);
+  v16u8 d0, d1, d2, d3;
+  v16u8 row0, row1, row2, row3, row4;
+  v16u8 o0, o1, o2, o3, taps;
+  v16u8 il0, il1, il2, il3, il4, il5, il6, il7;
+  v8u16 f0, f1, f2, f3;
+  v8i16 sum_acc = { 0 };
+  v4i32 sum_w, sse_acc = { 0 };
+
+  tap_pair = LH(filter);
+  taps = (v16u8)__msa_fill_h(tap_pair);
+
+  row0 = LD_UB(src);
+  src += src_stride;
+
+  while (rows4--) {
+    LD_UB4(src, src_stride, row1, row2, row3, row4);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, d0, d1, d2, d3);
+    dst += (4 * dst_stride);
+    /* output row 0 from rows 0/1; il2/il3 (rows 1/2) are used further down */
+    ILVR_B2_UB(row1, row0, row2, row1, il0, il2);
+    ILVL_B2_UB(row1, row0, row2, row1, il1, il3);
+    DOTP_UB2_UH(il0, il1, taps, taps, f0, f1);
+    SRARI_H2_UH(f0, f1, FILTER_BITS);
+    o0 = (v16u8)__msa_pckev_b((v16i8)f1, (v16i8)f0);
+    /* output row 1 from il2/il3; rows 2/3 and 3/4 are interleaved first */
+    ILVR_B2_UB(row3, row2, row4, row3, il4, il6);
+    ILVL_B2_UB(row3, row2, row4, row3, il5, il7);
+    DOTP_UB2_UH(il2, il3, taps, taps, f2, f3);
+    SRARI_H2_UH(f2, f3, FILTER_BITS);
+    o1 = (v16u8)__msa_pckev_b((v16i8)f3, (v16i8)f2);
+    /* output rows 2 and 3 */
+    DOTP_UB2_UH(il4, il5, taps, taps, f0, f1);
+    SRARI_H2_UH(f0, f1, FILTER_BITS);
+    o2 = (v16u8)__msa_pckev_b((v16i8)f1, (v16i8)f0);
+    DOTP_UB2_UH(il6, il7, taps, taps, f2, f3);
+    SRARI_H2_UH(f2, f3, FILTER_BITS);
+    o3 = (v16u8)__msa_pckev_b((v16i8)f3, (v16i8)f2);
+
+    row0 = row4;
+
+    CALC_MSE_AVG_B(o0, d0, sse_acc, sum_acc);
+    CALC_MSE_AVG_B(o1, d1, sse_acc, sum_acc);
+    CALC_MSE_AVG_B(o2, d2, sse_acc, sum_acc);
+    CALC_MSE_AVG_B(o3, d3, sse_acc, sum_acc);
+  }
+
+  sum_w = __msa_hadd_s_w(sum_acc, sum_acc);
+  *diff = HADD_SW_S32(sum_w);
+  return HADD_SW_S32(sse_acc);
+}
+
+/* 32-pixel-wide variant: runs sub_pixel_sse_diff_16width_v_msa() on the left
+ * and right 16-pixel columns, stores the sum of the two diff outputs in
+ * *diff and returns the sum of the two return values. */
+static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  uint32_t col, total_sse = 0;
+  int32_t col_diff, total_diff = 0;
+  for (col = 0; col < 2; col++, src += 16, dst += 16) {
+    total_sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst,
+                                                  dst_stride, filter, height,
+                                                  &col_diff);
+    total_diff += col_diff;
+  }
+  *diff = total_diff;
+  return total_sse;
+}
+
+/* 64-pixel-wide variant: runs sub_pixel_sse_diff_16width_v_msa() on four
+ * adjacent 16-pixel columns, stores the sum of the four diff outputs in
+ * *diff and returns the sum of the four return values. */
+static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  uint32_t col, total_sse = 0;
+  int32_t col_diff, total_diff = 0;
+  for (col = 0; col < 4; col++, src += 16, dst += 16) {
+    total_sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst,
+                                                  dst_stride, filter, height,
+                                                  &col_diff);
+    total_diff += col_diff;
+  }
+  *diff = total_diff;
+  return total_sse;
+}
+
+static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter_horiz,
+                                                 const uint8_t *filter_vert,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 out, ref = { 0 };
+  v16u8 filt_vt, filt_hz, vec0, vec1;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
+  v8u16 tmp0, tmp1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 4-wide block: horizontal filter, then vertical filter, scored vs dst. */
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+  /* First source row; it is the top row of the first pass. */
+  src0 = LD_UB(src);
+  src += src_stride;
+  /* 4 rows per pass; mask indices >= 16 presumably pick the 2nd shuffle arg. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);  /* one 32-bit word per row */
+    dst += (4 * dst_stride);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    CALC_MSE_AVG_B(out, ref, var, avg);
+    src0 = src4;  /* last loaded row is the top row of the next pass */
+  }
+  /* Reduce: avg lanes -> *diff, var lanes -> return value (HADD_SW_S32). */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter_horiz,
+                                                 const uint8_t *filter_vert,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 out0, out1;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 hz_out0, hz_out1;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16u8 filt_vt, filt_hz, vec0;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 8-wide block: horizontal filter, then vertical filter, scored vs dst. */
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+  /* Horizontally filter the first row up front; hz_out0 seeds the loop. */
+  src0 = LD_UB(src);
+  src += src_stride;
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  /* 4 rows per pass; hz_out0 and hz_out1 alternate as upper and lower row. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    /* Pack dst rows pairwise: ref0 <- rows 0,1 and ref1 <- rows 2,3. */
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+  }
+  /* Reduce: avg lanes -> *diff, var lanes -> return value (HADD_SW_S32). */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  const uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const uint8_t *filter_horiz,
+                                                  const uint8_t *filter_vert,
+                                                  int32_t height,
+                                                  int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
+  v8u16 tmp0, tmp1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 16-wide column: horizontal filter, then vertical filter, scored vs dst. */
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+  /* Each source row is read as two vectors starting 8 bytes apart. */
+  LD_UB2(src, 8, src0, src1);
+  src += src_stride;
+  /* Horizontally filter the first row; hz_out0/hz_out2 seed the loop. */
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+  /* 4 rows per pass; height >> 2 passes, any remainder is skipped. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src0, src2, src4, src6);
+    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    /* Row 0: filter horizontally, then vertically with the previous row. */
+    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    /* Row 1: same steps, with the roles of the hz_out pairs swapped. */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    /* Row 2. */
+    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    /* Row 3; hz_out0/hz_out2 are left holding it for the next pass. */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    /* src0..src3 now hold the four filtered rows; fold them vs dst rows. */
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+    CALC_MSE_AVG_B(src2, ref2, var, avg);
+    CALC_MSE_AVG_B(src3, ref3, var, avg);
+  }
+  /* Reduce: avg lanes -> *diff, var lanes -> return value (HADD_SW_S32). */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+/* 32-pixel-wide variant: runs sub_pixel_sse_diff_16width_hv_msa() on the
+ * left and right 16-pixel columns, stores the sum of the two diff outputs
+ * in *diff and returns the sum of the two return values. */
+static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  const uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const uint8_t *filter_horiz,
+                                                  const uint8_t *filter_vert,
+                                                  int32_t height,
+                                                  int32_t *diff) {
+  uint32_t col, total_sse = 0;
+  int32_t col_diff, total_diff = 0;
+  for (col = 0; col < 2; col++, src += 16, dst += 16) {
+    total_sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst,
+                                                   dst_stride, filter_horiz,
+                                                   filter_vert, height,
+                                                   &col_diff);
+    total_diff += col_diff;
+  }
+  *diff = total_diff;
+  return total_sse;
+}
+
+/* 64-pixel-wide variant: runs sub_pixel_sse_diff_16width_hv_msa() on four
+ * adjacent 16-pixel columns, stores the sum of the four diff outputs in
+ * *diff and returns the sum of the four return values. */
+static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  const uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const uint8_t *filter_horiz,
+                                                  const uint8_t *filter_vert,
+                                                  int32_t height,
+                                                  int32_t *diff) {
+  uint32_t col, total_sse = 0;
+  int32_t col_diff, total_diff = 0;
+  for (col = 0; col < 4; col++, src += 16, dst += 16) {
+    total_sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst,
+                                                   dst_stride, filter_horiz,
+                                                   filter_vert, height,
+                                                   &col_diff);
+    total_diff += col_diff;
+  }
+  *diff = total_diff;
+  return total_sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    const uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    const uint8_t *sec_pred,
+                                                    const uint8_t *filter,
+                                                    int32_t height,
+                                                    int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 out, pred, filt0, ref = { 0 };
+  v16i8 src0, src1, src2, src3;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 vec0, vec1, vec2, vec3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 4-wide horizontal filter, averaged with sec_pred, then scored vs dst. */
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+  /* 4 rows per pass; each pass consumes one 16-byte vector of sec_pred. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    /* Gather the dst words, filter the src rows, pack 4 results into out. */
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+                src0, src1, src2, src3);
+    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
+    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
+    out = __msa_aver_u_b(out, pred);  /* blend with the second predictor */
+    CALC_MSE_AVG_B(out, ref, var, avg);
+  }
+  /* Reduce: avg lanes -> *diff, var lanes -> return value (HADD_SW_S32). */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    const uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    const uint8_t *sec_pred,
+                                                    const uint8_t *filter,
+                                                    int32_t height,
+                                                    int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 out, pred, filt0;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16i8 src0, src1, src2, src3;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 vec0, vec1, vec2, vec3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 8-wide horizontal filter, averaged with sec_pred, then scored vs dst. */
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+  /* 4 rows per pass; each pass consumes 32 bytes of sec_pred. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    /* Pack dst rows pairwise, then horizontally filter the 4 source rows. */
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+                src0, src1, src2, src3);
+    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
+    /* Rows 0-1, then rows 2-3: blend with 16 bytes of sec_pred and score. */
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    out = __msa_aver_u_b(out, pred);
+    CALC_MSE_AVG_B(out, ref0, var, avg);
+    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    out = __msa_aver_u_b(out, pred);
+    CALC_MSE_AVG_B(out, ref1, var, avg);
+  }
+  /* Reduce: avg lanes -> *diff, var lanes -> return value (HADD_SW_S32). */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             const uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const uint8_t *sec_pred,
+                                             const uint8_t *filter,
+                                             int32_t height,
+                                             int32_t *diff,
+                                             int32_t width) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v16u8 dst0, dst1, dst2, dst3;
+  v16u8 tmp0, tmp1, tmp2, tmp3;
+  v16u8 pred0, pred1, pred2, pred3, filt0;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 16-wide column: horizontal filter, blend with sec_pred, score vs dst. */
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+  /* 4 rows per pass; sec_pred rows are `width` bytes apart. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    dst += (4 * dst_stride);
+    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+    sec_pred += (4 * width);
+    /* Shuffle neighbouring bytes together, filter, round, repack, blend. */
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                out0, out1, out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                out4, out5, out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6,
+                tmp0, tmp1, tmp2, tmp3);
+    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3,
+                tmp0, tmp1, tmp2, tmp3);
+    /* Fold each blended row vs its dst row into var/avg (CALC_MSE_AVG_B). */
+    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
+    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
+    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
+    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
+  }
+  /* Reduce: avg lanes -> *diff, var lanes -> return value (HADD_SW_S32). */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+/* 16-wide entry point: sec_pred rows are 16 bytes apart for this width. */
+static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred, const uint8_t *filter,
+  int32_t height, int32_t *diff) {
+  const int32_t sec_pred_stride = 16;
+  return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
+                                      sec_pred, filter, height, diff,
+                                      sec_pred_stride);
+}
+
+/* 32-pixel-wide variant: runs subpel_avg_ssediff_16w_h_msa() on the left and
+ * right 16-pixel columns, passing 32 as the sec_pred row pitch. The sum of
+ * the two diff outputs is stored in *diff and the sum of the two return
+ * values is returned. */
+static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  uint32_t col, total_sse = 0;
+  int32_t col_diff, total_diff = 0;
+
+  for (col = 0; col < 2; col++, src += 16, dst += 16, sec_pred += 16) {
+    total_sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst,
+                                              dst_stride, sec_pred, filter,
+                                              height, &col_diff, 32);
+    total_diff += col_diff;
+  }
+  *diff = total_diff;
+  return total_sse;
+}
+
+/* 64-pixel-wide variant: runs subpel_avg_ssediff_16w_h_msa() on four
+ * adjacent 16-pixel columns, passing 64 as the sec_pred row pitch. The sum
+ * of the four diff outputs is stored in *diff and the sum of the four
+ * return values is returned. */
+static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  uint32_t col, total_sse = 0;
+  int32_t col_diff, total_diff = 0;
+
+  for (col = 0; col < 4; col++, src += 16, dst += 16, sec_pred += 16) {
+    total_sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst,
+                                              dst_stride, sec_pred, filter,
+                                              height, &col_diff, 64);
+    total_diff += col_diff;
+  }
+  *diff = total_diff;
+  return total_sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    const uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    const uint8_t *sec_pred,
+                                                    const uint8_t *filter,
+                                                    int32_t height,
+                                                    int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 src10_r, src32_r, src21_r, src43_r;
+  v16u8 out, pred, ref = { 0 };
+  v16u8 src2110, src4332, filt0;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  v8u16 tmp0, tmp1;
+  /* 4-wide vertical filter, averaged with sec_pred, then scored vs dst. */
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+  /* First source row; each output row combines a row with the one below. */
+  src0 = LD_UB(src);
+  src += src_stride;
+  /* 4 rows per pass; each pass consumes one 16-byte vector of sec_pred. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    /* Gather dst words; interleave adjacent rows and merge pairs to filter. */
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    /* Repack to bytes, blend with the second predictor, then score. */
+    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    out = __msa_aver_u_b(out, pred);
+    CALC_MSE_AVG_B(out, ref, var, avg);
+    src0 = src4;
+  }
+  /* Reduce: avg lanes -> *diff, var lanes -> return value (HADD_SW_S32). */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    const uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    const uint8_t *sec_pred,
+                                                    const uint8_t *filter,
+                                                    int32_t height,
+                                                    int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, filt0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 8-wide vertical filter, averaged with sec_pred, then scored vs dst. */
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+  /* First source row; each output row combines a row with the one below. */
+  src0 = LD_UB(src);
+  src += src_stride;
+  /* 4 rows per pass; each pass consumes 32 bytes of sec_pred. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
+               vec0, vec1, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+    /* src0 was reused as scratch above; reload it with the last row. */
+    src0 = src4;
+  }
+  /* Reduce: avg lanes -> *diff, var lanes -> return value (HADD_SW_S32). */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             const uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const uint8_t *sec_pred,
+                                             const uint8_t *filter,
+                                             int32_t height,
+                                             int32_t *diff,
+                                             int32_t width) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, pred2, pred3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 out0, out1, out2, out3, filt0;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 16-wide column: vertical filter, blend with sec_pred, score vs dst. */
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+  /* First source row; each output row combines a row with the one below. */
+  src0 = LD_UB(src);
+  src += src_stride;
+  /* 4 rows per pass; sec_pred rows are `width` bytes apart. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+    sec_pred += (4 * width);
+    /* Interleave row pairs (0,1) and (1,2); filter and repack pair (0,1). */
+    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    /* Interleave row pairs (2,3) and (3,4); filter the pending pair (1,2). */
+    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+    /* Filter and repack pair (2,3). */
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    /* Filter and repack pair (3,4). */
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+    /* Carry the last row forward and fetch the four dst rows. */
+    src0 = src4;
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    /* Blend every filtered row with its sec_pred row. */
+    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
+                out0, out1, out2, out3);
+    /* Fold each blended row vs its dst row into var/avg (CALC_MSE_AVG_B). */
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+  /* Reduce: avg lanes -> *diff, var lanes -> return value (HADD_SW_S32). */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+/* 16-wide entry point: sec_pred rows are 16 bytes apart for this width. */
+static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred, const uint8_t *filter,
+  int32_t height, int32_t *diff) {
+  const int32_t sec_pred_stride = 16;
+  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
+                                      sec_pred, filter, height, diff,
+                                      sec_pred_stride);
+}
+
+/* 32-pixel-wide variant: runs subpel_avg_ssediff_16w_v_msa() on the left and
+ * right 16-pixel columns, passing 32 as the sec_pred row pitch. The sum of
+ * the two diff outputs is stored in *diff and the sum of the two return
+ * values is returned. */
+static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  uint32_t col, total_sse = 0;
+  int32_t col_diff, total_diff = 0;
+
+  for (col = 0; col < 2; col++, src += 16, dst += 16, sec_pred += 16) {
+    total_sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst,
+                                              dst_stride, sec_pred, filter,
+                                              height, &col_diff, 32);
+    total_diff += col_diff;
+  }
+  *diff = total_diff;
+  return total_sse;
+}
+
+/* 64-pixel-wide variant: runs subpel_avg_ssediff_16w_v_msa() on four
+ * adjacent 16-pixel columns, passing 64 as the sec_pred row pitch. The sum
+ * of the four diff outputs is stored in *diff and the sum of the four
+ * return values is returned. */
+static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  uint32_t col, total_sse = 0;
+  int32_t col_diff, total_diff = 0;
+
+  for (col = 0; col < 4; col++, src += 16, dst += 16, sec_pred += 16) {
+    total_sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst,
+                                              dst_stride, sec_pred, filter,
+                                              height, &col_diff, 64);
+    total_diff += col_diff;
+  }
+  *diff = total_diff;
+  return total_sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v16u8 out, pred, ref = { 0 };
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 4-wide block: horizontal + vertical filter, sec_pred blend, score. */
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+  /* First source row; it is the top row of the first pass. */
+  src0 = LD_UB(src);
+  src += src_stride;
+  /* 4 rows per pass; each pass consumes one 16-byte vector of sec_pred. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);  /* one 32-bit word per row */
+    dst += (4 * dst_stride);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    out = __msa_aver_u_b(out, pred);  /* blend with the second predictor */
+    CALC_MSE_AVG_B(out, ref, var, avg);
+    src0 = src4;  /* last loaded row is the top row of the next pass */
+  }
+  /* Reduce: avg lanes -> *diff, var lanes -> return value (HADD_SW_S32). */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 pred0, pred1, out0, out1;
+  v16u8 filt_hz, filt_vt, vec0;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 8-wide, horizontal+vertical case: broadcast each filter's int16 value (read via LH) to all halfword lanes. */
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+  /* Horizontally filter the first row up front; hz_out0/hz_out1 then alternate as previous/current row. */
+  src0 = LD_UB(src);
+  src += src_stride;
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  /* height >> 2 iterations; each advances src and dst by 4 rows and sec_pred by 32 bytes. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    /* NOTE(review): PCKEV_D2_UB presumably packs the low 8 bytes of two dst rows into one vector. */
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);  /* left in hz_out0 as the previous row for the next iteration */
+
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
+    /* NOTE(review): CALC_MSE_AVG_B presumably adds squared error to var and signed error to avg. */
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+  }
+  /* Reduce the accumulators: avg goes to *diff, var is the return value. */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              const uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const uint8_t *sec_pred,
+                                              const uint8_t *filter_horiz,
+                                              const uint8_t *filter_vert,
+                                              int32_t height,
+                                              int32_t *diff,
+                                              int32_t width) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, pred2, pred3;
+  v16u8 out0, out1, out2, out3;
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* Horizontal+vertical averaged case for one 16-wide column; `width` is used only as the row stride of sec_pred. */
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+  /* Each row is read as two vectors 8 bytes apart (src and src + 8) and filtered separately. */
+  LD_UB2(src, 8, src0, src1);
+  src += src_stride;
+  /* First row is filtered before the loop; hz_out0/hz_out2 hold the previous row's two halves. */
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+  /* height >> 2 iterations; each advances src, dst and sec_pred by 4 rows. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src0, src2, src4, src6);
+    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+    sec_pred += (4 * width);
+    /* Output row 0: previous row (hz_out0/hz_out2) combined with the new row (hz_out1/hz_out3) through filt_vt. */
+    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    /* Output row 1: the register pairs swap roles, so no copy of the previous row is needed. */
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    /* NOTE(review): AVER_UB4_UB presumably averages each filtered row with its sec_pred row. */
+    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
+                out0, out1, out2, out3);
+    /* NOTE(review): CALC_MSE_AVG_B presumably adds squared error to var and signed error to avg. */
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+  /* Reduce the accumulators: avg goes to *diff, var is the return value. */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {  /* 16-wide case: a single call to the 16-wide column kernel */
+  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+                                       sec_pred, filter_horiz, filter_vert,
+                                       height, diff, 16);  /* 16 = row stride of sec_pred */
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[2];
+  /* 32-wide case: run the 16-wide column kernel twice, stepping all three pointers 16 bytes to the right. */
+  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+                                         sec_pred, filter_horiz, filter_vert,
+                                         height, &diff0[loop_cnt], 32);  /* 32 = row stride of sec_pred */
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+  /* The per-column difference sums are added to form the block total. */
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+  /* 64-wide case: run the 16-wide column kernel four times, stepping all three pointers 16 bytes to the right. */
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+                                         sec_pred, filter_horiz, filter_vert,
+                                         height, &diff0[loop_cnt], 64);  /* 64 = row stride of sec_pred */
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+  /* The per-column difference sums are added to form the block total. */
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4)  /* per-size wrappers; last argument is log2(width * height) */
+#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5)  /* no trailing ';': every use site terminates the statement itself */
+#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5)
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
+#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7)
+#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7)
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
+/* Blocks of 512 pixels and more go through VARIANCE_LARGE_WxH (defined outside this section). */
+#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9)
+#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9)
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
+#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11)
+#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11)
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
+
+#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                         \
+uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src,     \
+                                                 int32_t src_stride,     \
+                                                 int32_t xoffset,        \
+                                                 int32_t yoffset,        \
+                                                 const uint8_t *ref,     \
+                                                 int32_t ref_stride,     \
+                                                 uint32_t *sse) {        \
+  int32_t diff;                                                          \
+  uint32_t var;                                                          \
+  const uint8_t *h_filter = bilinear_filters_msa[xoffset];               \
+  const uint8_t *v_filter = bilinear_filters_msa[yoffset];               \
+  /* Dispatch on which offsets are non-zero: hv, v, h or full-pel. */    \
+  if (yoffset) {                                                         \
+    if (xoffset) {                                                       \
+      *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride,      \
+                                                   ref, ref_stride,      \
+                                                   h_filter, v_filter,   \
+                                                   ht, &diff);           \
+    } else {                                                             \
+      *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride,       \
+                                                  ref, ref_stride,       \
+                                                  v_filter, ht, &diff);  \
+    }                                                                    \
+    /* Combine *sse and diff with the size-specific VARIANCE_ macro. */  \
+    var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                          \
+  } else {                                                               \
+    if (xoffset) {                                                       \
+      *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride,       \
+                                                  ref, ref_stride,       \
+                                                  h_filter, ht, &diff);  \
+                                                                         \
+      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                        \
+    } else {                                                             \
+      var = vpx_variance##wd##x##ht##_msa(src, src_stride,               \
+                                          ref, ref_stride, sse);         \
+    }                                                                    \
+  }                                                                      \
+                                                                         \
+  return var;                                                            \
+}
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);  /* each line defines one vpx_sub_pixel_variance<wd>x<ht>_msa function */
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
+
+#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
+uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa(                         \
+  const uint8_t *src_ptr, int32_t src_stride,                                 \
+  int32_t xoffset, int32_t yoffset,                                           \
+  const uint8_t *ref_ptr, int32_t ref_stride,                                 \
+  uint32_t *sse, const uint8_t *sec_pred) {                                   \
+  int32_t diff;                                                               \
+  const uint8_t *h_filter = bilinear_filters_msa[xoffset];                    \
+  const uint8_t *v_filter = bilinear_filters_msa[yoffset];                    \
+  /* Dispatch on which offsets are non-zero: hv, v, h or full-pel. */         \
+  if (yoffset) {                                                              \
+    if (xoffset) {                                                            \
+      *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(src_ptr, src_stride,   \
+                                                       ref_ptr, ref_stride,   \
+                                                       sec_pred, h_filter,    \
+                                                       v_filter, ht, &diff);  \
+    } else {                                                                  \
+      *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(src_ptr, src_stride,    \
+                                                      ref_ptr, ref_stride,    \
+                                                      sec_pred, v_filter,     \
+                                                      ht, &diff);             \
+    }                                                                         \
+  } else {                                                                    \
+    if (xoffset) {                                                            \
+      *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(src_ptr, src_stride,    \
+                                                      ref_ptr, ref_stride,    \
+                                                      sec_pred, h_filter,     \
+                                                      ht, &diff);             \
+    } else {                                                                  \
+      *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride,                \
+                                          ref_ptr, ref_stride,                \
+                                          sec_pred, ht, &diff);               \
+    }                                                                         \
+  }                                                                           \
+  /* Every branch filled *sse and diff; combine them per block size. */       \
+  return VARIANCE_##wd##Wx##ht##H(*sse, diff);                                \
+}
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);  /* each line defines one vpx_sub_pixel_avg_variance<wd>x<ht>_msa function */
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
+
+uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
+                                             int32_t src_stride,
+                                             int32_t xoffset,
+                                             int32_t yoffset,
+                                             const uint8_t *ref_ptr,
+                                             int32_t ref_stride,
+                                             uint32_t *sse,
+                                             const uint8_t *sec_pred) {
+  int32_t diff;
+  const uint8_t *h_filter = bilinear_filters_msa[xoffset];
+  const uint8_t *v_filter = bilinear_filters_msa[yoffset];
+  /* 32x64 is written out by hand: its full-pel helper has a fixed-size name and takes no height argument. */
+  if (yoffset) {
+    if (xoffset) {
+      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(src_ptr, src_stride,
+                                                   ref_ptr, ref_stride,
+                                                   sec_pred, h_filter,
+                                                   v_filter, 64, &diff);
+    } else {
+      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride,
+                                                  ref_ptr, ref_stride,
+                                                  sec_pred, v_filter,
+                                                  64, &diff);
+    }
+  } else {
+    if (xoffset) {
+      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride,
+                                                  ref_ptr, ref_stride,
+                                                  sec_pred, h_filter,
+                                                  64, &diff);
+    } else {
+      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
+                                    sec_pred, &diff);  /* both offsets zero: no filtering, size-specific helper */
+    }
+  }
+  /* Every branch filled *sse and diff; combine them for the 32x64 block. */
+  return VARIANCE_32Wx64H(*sse, diff);
+}
+
+#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                          \
+uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(const uint8_t *src_ptr,     \
+                                                 int32_t src_stride,         \
+                                                 int32_t xoffset,            \
+                                                 int32_t yoffset,            \
+                                                 const uint8_t *ref_ptr,     \
+                                                 int32_t ref_stride,         \
+                                                 uint32_t *sse,              \
+                                                 const uint8_t *sec_pred) {  \
+  int32_t diff;                                                              \
+  const uint8_t *h_filter = bilinear_filters_msa[xoffset];                   \
+  const uint8_t *v_filter = bilinear_filters_msa[yoffset];                   \
+  /* Dispatch on which offsets are non-zero: hv, v, h or full-pel. */        \
+  if (yoffset) {                                                             \
+    if (xoffset) {                                                           \
+      *sse = sub_pixel_avg_sse_diff_64width_hv_msa(src_ptr, src_stride,      \
+                                                   ref_ptr, ref_stride,      \
+                                                   sec_pred, h_filter,       \
+                                                   v_filter, ht, &diff);     \
+    } else {                                                                 \
+      *sse = sub_pixel_avg_sse_diff_64width_v_msa(src_ptr, src_stride,       \
+                                                  ref_ptr, ref_stride,       \
+                                                  sec_pred, v_filter,        \
+                                                  ht, &diff);                \
+    }                                                                        \
+  } else {                                                                   \
+    if (xoffset) {                                                           \
+      *sse = sub_pixel_avg_sse_diff_64width_h_msa(src_ptr, src_stride,       \
+                                                  ref_ptr, ref_stride,       \
+                                                  sec_pred, h_filter,        \
+                                                  ht, &diff);                \
+    } else {                                                                 \
+      *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride,                 \
+                                        ref_ptr, ref_stride,                 \
+                                        sec_pred, &diff);                    \
+    }                                                                        \
+  }                                                                          \
+  /* Every branch filled *sse and diff; combine them per block size. */      \
+  return VARIANCE_64Wx##ht##H(*sse, diff);                                   \
+}
+
+VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);  /* defines vpx_sub_pixel_avg_variance64x32_msa */
+VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);  /* defines vpx_sub_pixel_avg_variance64x64_msa */
diff --git a/libs/libvpx/vpx_dsp/mips/subtract_msa.c b/libs/libvpx/vpx_dsp/mips/subtract_msa.c
new file mode 100644
index 0000000000..9ac43c5cd5
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/subtract_msa.c
@@ -0,0 +1,264 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  uint32_t src0, src1, src2, src3;
+  uint32_t pred0, pred1, pred2, pred3;
+  v16i8 src = { 0 };
+  v16i8 pred = { 0 };
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+  /* 4x4 residual: gather four 32-bit rows of src and of pred into one vector each. NOTE(review): result presumably src - pred. */
+  LW4(src_ptr, src_stride, src0, src1, src2, src3);
+  LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
+  INSERT_W4_SB(src0, src1, src2, src3, src);
+  INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
+  ILVRL_B2_UB(src, pred, src_l0, src_l1);
+  HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+  ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));  /* NOTE(review): presumably a byte stride, hence 2 * diff_stride for int16_t rows */
+}
+
+static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  uint32_t loop_cnt;
+  uint64_t src0, src1, pred0, pred1;
+  v16i8 src = { 0 };
+  v16i8 pred = { 0 };
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+  /* 8x8 residual: 4 iterations of 2 rows; each row is held in a uint64_t. NOTE(review): result presumably src - pred. */
+  for (loop_cnt = 4; loop_cnt--;) {
+    LD2(src_ptr, src_stride, src0, src1);
+    src_ptr += (2 * src_stride);
+    LD2(pred_ptr, pred_stride, pred0, pred1);
+    pred_ptr += (2 * pred_stride);
+    /* Put both rows in one vector so a single interleave/subtract pair covers them. */
+    INSERT_D2_SB(src0, src1, src);
+    INSERT_D2_SB(pred0, pred1, pred);
+    ILVRL_B2_UB(src, pred, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff_ptr, diff_stride);
+    diff_ptr += (2 * diff_stride);
+  }
+}
+
+static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  int8_t count;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+  /* 16x16 residual: 2 iterations of 8 rows, unrolled. NOTE(review): result presumably src - pred. */
+  for (count = 2; count--;) {
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+
+    LD_SB8(pred, pred_stride,
+           pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7);
+    pred += (8 * pred_stride);
+    /* Per row: interleave src/pred bytes, subtract, then store two v8i16 vectors (second at offset 8). */
+    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+  }
+}
+
+static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+  /* 32x32 residual: 8 iterations of 4 rows; each row is two vectors at byte offsets 0 and 16. NOTE(review): result presumably src - pred. */
+  for (loop_cnt = 8; loop_cnt--;) {
+    LD_SB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_SB2(src, 16, src2, src3);
+    src += src_stride;
+    LD_SB2(src, 16, src4, src5);
+    src += src_stride;
+    LD_SB2(src, 16, src6, src7);
+    src += src_stride;
+
+    LD_SB2(pred, 16, pred0, pred1);
+    pred += pred_stride;
+    LD_SB2(pred, 16, pred2, pred3);
+    pred += pred_stride;
+    LD_SB2(pred, 16, pred4, pred5);
+    pred += pred_stride;
+    LD_SB2(pred, 16, pred6, pred7);
+    pred += pred_stride;
+    /* Per row: left half is stored at diff, right half at diff + 16. */
+    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    diff += diff_stride;
+  }
+}
+
+static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+  /* 64x64 residual: 32 iterations of 2 rows; each row is four vectors 16 bytes apart. NOTE(review): result presumably src - pred. */
+  for (loop_cnt = 32; loop_cnt--;) {
+    LD_SB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_SB4(src, 16, src4, src5, src6, src7);
+    src += src_stride;
+
+    LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
+    pred += pred_stride;
+    LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
+    pred += pred_stride;
+    /* Per row: the four quarters are stored at diff, diff + 16, diff + 32 and diff + 48. */
+    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 32, 8);
+    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 48, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 32, 8);
+    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 48, 8);
+    diff += diff_stride;
+  }
+}
+
+void vpx_subtract_block_msa(int32_t rows, int32_t cols,
+                            int16_t *diff_ptr, ptrdiff_t diff_stride,
+                            const uint8_t *src_ptr, ptrdiff_t src_stride,
+                            const uint8_t *pred_ptr, ptrdiff_t pred_stride) {  /* the ptrdiff_t strides are passed to int32_t parameters of the sub_blk_* helpers */
+  if (rows == cols) {  /* MSA kernels are selected only for square 4/8/16/32/64 blocks */
+    switch (rows) {
+      case 4:
+        sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                        diff_ptr, diff_stride);
+        break;
+      case 8:
+        sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                        diff_ptr, diff_stride);
+        break;
+      case 16:
+        sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                          diff_ptr, diff_stride);
+        break;
+      case 32:
+        sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                          diff_ptr, diff_stride);
+        break;
+      case 64:
+        sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                          diff_ptr, diff_stride);
+        break;
+      default:  /* any other square size: C implementation */
+        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+                             src_stride, pred_ptr, pred_stride);
+        break;
+    }
+  } else {  /* non-square blocks: C implementation */
+    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+                         pred_ptr, pred_stride);
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h b/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h
new file mode 100644
index 0000000000..68c63d56f6
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h
@@ -0,0 +1,93 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
+#define VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
+
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) {      \
+  v8i16 k0_m = __msa_fill_h(cnst0);                                  \
+  v4i32 s0_m, s1_m, s2_m, s3_m;                                      \
+  /* Build k0_m = cnst0/cnst1 in alternating halfwords. */           \
+  s0_m = (v4i32)__msa_fill_h(cnst1);                                 \
+  k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m);                           \
+  /* Interleave (-reg1, reg0) for out0, (reg0, reg1) for out1. */    \
+  ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m);                            \
+  ILVRL_H2_SW(reg0, reg1, s3_m, s2_m);                               \
+  DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m);                   \
+  SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS);                           \
+  out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m);                    \
+  /* Repeat the dot product, shift and pack for the second pair. */  \
+  DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m);                   \
+  SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS);                           \
+  out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m);                    \
+}
+
+#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7,      \
+                              dst0, dst1, dst2, dst3) {                    \
+  v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m;                                 \
+  v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m;                                 \
+  /* Dot products of in0-3 with in4-7, then butterfly, shift and pack. */  \
+  DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5,                      \
+              tp0_m, tp2_m, tp3_m, tp4_m);                                 \
+  DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7,                      \
+              tp5_m, tp6_m, tp7_m, tp8_m);                                 \
+  BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m);     \
+  BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m);     \
+  SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS);                 \
+  SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS);                 \
+  PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m,      \
+              dst0, dst1, dst2, dst3);                                     \
+}
+
+#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({       \
+  v8i16 dst_m;                                        \
+  v4i32 tp0_m, tp1_m;                                 \
+  /* Dot in0/in1 with in2, shift, pack to 16 bit. */  \
+  DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m);      \
+  SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS);          \
+  dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m);  \
+  /* Statement expression: yields dst_m. */           \
+  dst_m;                                              \
+})
+
+#define MADD_SHORT(m0, m1, c0, c1, res0, res1) {                    \
+  v4i32 madd0_m, madd1_m, madd2_m, madd3_m;                         \
+  v8i16 madd_s0_m, madd_s1_m;                                       \
+  /* Dot (m1, m0) pairs with c0 -> res0 and with c1 -> res1. */     \
+  ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m);                        \
+  DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m,           \
+              c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m);  \
+  SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS);  \
+  PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1);      \
+}
+
+#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,         \
+                out0, out1, out2, out3) {                               \
+  v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
+  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m;                     \
+  /* out0/out1 use cst0 and cst2; out2/out3 use cst1 and cst3. */       \
+  ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m);                        \
+  ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m);                        \
+  DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
+              cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+  BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
+              m4_m, m5_m, tmp3_m, tmp2_m);                              \
+  SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS);              \
+  PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1);                  \
+  DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
+              cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+  BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
+              m4_m, m5_m, tmp3_m, tmp2_m);                              \
+  SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS);              \
+  PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3);                  \
+}
+#endif  // VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
diff --git a/libs/libvpx/vpx_dsp/mips/variance_msa.c b/libs/libvpx/vpx_dsp/mips/variance_msa.c
new file mode 100644
index 0000000000..33e175560f
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/variance_msa.c
@@ -0,0 +1,633 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+#define CALC_MSE_B(src, ref, var) {                                \
+  v16u8 src_l0_m, src_l1_m;                                        \
+  v8i16 res_l0_m, res_l1_m;                                        \
+  /* Accumulate squared src/ref byte differences into var. */      \
+  ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
+  HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
+  DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
+}
+
+#define CALC_MSE_AVG_B(src, ref, var, sub) {                       \
+  v16u8 src_l0_m, src_l1_m;                                        \
+  v8i16 res_l0_m, res_l1_m;                                        \
+  /* As CALC_MSE_B, and also adds the differences to sub. */       \
+  ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
+  HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
+  DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
+  /* sub is a v8i16 in callers: 16-bit per-lane running sums. */   \
+  sub += res_l0_m + res_l1_m;                                      \
+}
+
+#define VARIANCE_WxH(sse, diff, shift) \
+  ((sse) - (((uint32_t)(diff) * (diff)) >> (shift)))
+/* Both yield sse - ((diff * diff) >> shift); LARGE uses a 64-bit product. */
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+  ((sse) - (((int64_t)(diff) * (diff)) >> (shift)))
+
+static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                    const uint8_t *ref_ptr, int32_t ref_stride,
+                                    int32_t height, int32_t *diff) {
+  uint32_t src0, src1, src2, src3;
+  uint32_t ref0, ref1, ref2, ref3;
+  int32_t ht_cnt;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 4-wide block: returns the SSE and stores the difference sum in *diff. */
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+    /* Gather four 4-byte rows into a single vector per operand. */
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    CALC_MSE_AVG_B(src, ref, var, avg);
+  }
+  /* Collapse the 16-bit difference lanes into one 32-bit total. */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                    const uint8_t *ref_ptr, int32_t ref_stride,
+                                    int32_t height, int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 8-wide block: returns the SSE and stores the difference sum in *diff. */
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+    /* Pair up rows so each vector carries two 8-byte rows (PCKEV_D). */
+    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+                src0, src1, ref0, ref1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+  /* Collapse the 16-bit difference lanes into one 32-bit total. */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                     const uint8_t *ref_ptr, int32_t ref_stride,
+                                     int32_t height, int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src, ref;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 16-wide block: returns the SSE and stores the difference sum in *diff. */
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+  }
+  /* Collapse the 16-bit difference lanes into one 32-bit total. */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                     const uint8_t *ref_ptr, int32_t ref_stride,
+                                     int32_t height, int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 32-wide block: returns the SSE and stores the difference sum in *diff. */
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+  /* Collapse the 16-bit difference lanes into one 32-bit total. */
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                   const uint8_t *ref_ptr, int32_t ref_stride,
+                                   int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1;
+  v8i16 avg0 = { 0 };
+  v8i16 avg1 = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 32x64 block: 16 passes of four rows; avg0/avg1 track the two halves. */
+  for (ht_cnt = 16; ht_cnt--;) {
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+  }
+  /* Widen both 16-bit accumulators to 32 bits before combining them. */
+  vec = __msa_hadd_s_w(avg0, avg0);
+  vec += __msa_hadd_s_w(avg1, avg1);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                   const uint8_t *ref_ptr, int32_t ref_stride,
+                                   int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8i16 avg0 = { 0 };
+  v8i16 avg1 = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 64x32 block: 16 passes of two rows; four 16-byte loads per row. */
+  for (ht_cnt = 16; ht_cnt--;) {
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src2, ref2, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src3, ref3, var, avg1);
+
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src2, ref2, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src3, ref3, var, avg1);
+  }
+  /* Widen both 16-bit accumulators to 32 bits before combining them. */
+  vec = __msa_hadd_s_w(avg0, avg0);
+  vec += __msa_hadd_s_w(avg1, avg1);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                   const uint8_t *ref_ptr, int32_t ref_stride,
+                                   int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8i16 avg0 = { 0 };
+  v8i16 avg1 = { 0 };
+  v8i16 avg2 = { 0 };
+  v8i16 avg3 = { 0 };
+  v4i32 vec, var = { 0 };
+  /* 64x64 block: 32 passes of two rows; one accumulator per 16 columns. */
+  for (ht_cnt = 32; ht_cnt--;) {
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+  }
+  /* Widen all four 16-bit accumulators to 32 bits before combining them. */
+  vec = __msa_hadd_s_w(avg0, avg0);
+  vec += __msa_hadd_s_w(avg1, avg1);
+  vec += __msa_hadd_s_w(avg2, avg2);
+  vec += __msa_hadd_s_w(avg3, avg3);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t get_mb_ss_msa(const int16_t *src) {
+  uint32_t sum, cnt;
+  v8i16 src0, src1, src2, src3;
+  v4i32 src0_l, src1_l, src2_l, src3_l;
+  v4i32 src0_r, src1_r, src2_r, src3_r;
+  v2i64 sq_src_l = { 0 };
+  v2i64 sq_src_r = { 0 };
+  /* Sum of squares of 256 int16 values: 8 passes of 4 vectors of 8. */
+  for (cnt = 8; cnt--;) {
+    LD_SH4(src, 8, src0, src1, src2, src3);
+    src += 4 * 8;
+    /* Widen to 32 bits so the squares accumulate in 64-bit lanes. */
+    UNPCK_SH_SW(src0, src0_l, src0_r);
+    UNPCK_SH_SW(src1, src1_l, src1_r);
+    UNPCK_SH_SW(src2, src2_l, src2_r);
+    UNPCK_SH_SW(src3, src3_l, src3_r);
+
+    DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
+    DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
+    DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
+    DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
+  }
+  /* Fold the two 64-bit lanes of each accumulator together. */
+  sq_src_l += __msa_splati_d(sq_src_l, 1);
+  sq_src_r += __msa_splati_d(sq_src_r, 1);
+  /* The 64-bit totals are truncated into the 32-bit result. */
+  sum = __msa_copy_s_d(sq_src_l, 0);
+  sum += __msa_copy_s_d(sq_src_r, 0);
+
+  return sum;
+}
+
+static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height) {
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v4i32 var = { 0 };
+  /* 4-wide block: returns the sum of squared src/ref differences. */
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+    /* Gather four 4-byte rows into a single vector per operand. */
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    CALC_MSE_B(src, ref, var);
+  }
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v4i32 var = { 0 };
+  /* 8-wide block: returns the sum of squared src/ref differences. */
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+    /* Pair up rows so each vector carries two 8-byte rows (PCKEV_D). */
+    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+                src0, src1, ref0, ref1);
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src1, ref1, var);
+  }
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t *ref_ptr, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt;
+  v16u8 src, ref;
+  v4i32 var = { 0 };
+  /* 16-wide block: four rows per pass, one 16-byte load per row. */
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+  }
+  /* Horizontal add of the four 32-bit lanes gives the block SSE. */
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t *ref_ptr, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1;
+  v4i32 var = { 0 };
+  /* 32-wide block: four rows per pass, two 16-byte loads per row. */
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src1, ref1, var);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src1, ref1, var);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src1, ref1, var);
+
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src1, ref1, var);
+  }
+  /* Horizontal add of the four 32-bit lanes gives the block SSE. */
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t *ref_ptr, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v4i32 var = { 0 };
+  /* 64-wide block: two rows per pass, four 16-byte loads per row. */
+  for (ht_cnt = height >> 1; ht_cnt--;) {
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src2, ref2, var);
+    CALC_MSE_B(src1, ref1, var);
+    CALC_MSE_B(src3, ref3, var);
+
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src0, ref0, var);
+    CALC_MSE_B(src2, ref2, var);
+    CALC_MSE_B(src1, ref1, var);
+    CALC_MSE_B(src3, ref3, var);
+  }
+  /* Horizontal add of the four 32-bit lanes gives the block SSE. */
+  return HADD_SW_S32(var);
+}
+
+uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
+                              const uint8_t *ref_ptr, int32_t ref_stride) {
+  uint32_t err = 0;
+  uint32_t src0, src1, src2, src3;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16i8 src = { 0 };
+  v16i8 ref = { 0 };
+  v16u8 src_vec0, src_vec1;
+  v8i16 diff0, diff1;
+  v4i32 err0 = { 0 };
+  v4i32 err1 = { 0 };
+  /* 4x4 SSE: all sixteen pixels of each operand fit in one vector. */
+  LW4(src_ptr, src_stride, src0, src1, src2, src3);
+  LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+  INSERT_W4_SB(src0, src1, src2, src3, src);
+  INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
+  ILVRL_B2_UB(src, ref, src_vec0, src_vec1);
+  HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1);
+  DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1);
+  err = HADD_SW_S32(err0);
+  err += HADD_SW_S32(err1);
+
+  return err;
+}
+
+#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4)
+#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5)
+#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5)
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
+#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7)
+#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7)
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
+/* Blocks of 512+ pixels need the 64-bit form; callers supply the ';'. */
+#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9)
+#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9)
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
+#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11)
+#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11)
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
+
+#define VPX_VARIANCE_WDXHT_MSA(wd, ht)                               \
+uint32_t vpx_variance##wd##x##ht##_msa(const uint8_t *src,           \
+                                       int32_t src_stride,           \
+                                       const uint8_t *ref,           \
+                                       int32_t ref_stride,           \
+                                       uint32_t *sse) {              \
+  int32_t diff;                                                      \
+  /* Helper returns SSE; diff gets the summed difference. */         \
+  *sse = sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride,  \
+                                  ht, &diff);                        \
+  /* variance = sse - ((diff * diff) >> log2(wd * ht)) */            \
+  return VARIANCE_##wd##Wx##ht##H(*sse, diff);                       \
+}
+
+VPX_VARIANCE_WDXHT_MSA(4, 4)  /* expands to a whole function: no ';' */
+VPX_VARIANCE_WDXHT_MSA(4, 8)
+
+VPX_VARIANCE_WDXHT_MSA(8, 4)
+VPX_VARIANCE_WDXHT_MSA(8, 8)
+VPX_VARIANCE_WDXHT_MSA(8, 16)
+
+VPX_VARIANCE_WDXHT_MSA(16, 8)
+VPX_VARIANCE_WDXHT_MSA(16, 16)
+VPX_VARIANCE_WDXHT_MSA(16, 32)
+
+VPX_VARIANCE_WDXHT_MSA(32, 16)
+VPX_VARIANCE_WDXHT_MSA(32, 32)
+
+uint32_t vpx_variance32x64_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               uint32_t *sse) {
+  int32_t sum;     /* signed difference total, filled in by the helper */
+  uint32_t total;  /* sum of squared differences over the 32x64 block */
+  total = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &sum);
+  *sse = total;    /* the raw SSE is reported alongside the variance */
+  return VARIANCE_32Wx64H(total, sum);
+}
+
+uint32_t vpx_variance64x32_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               uint32_t *sse) {
+  int32_t sum;     /* signed difference total, filled in by the helper */
+  uint32_t total;  /* sum of squared differences over the 64x32 block */
+  total = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &sum);
+  *sse = total;    /* the raw SSE is reported alongside the variance */
+  return VARIANCE_64Wx32H(total, sum);
+}
+
+uint32_t vpx_variance64x64_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               uint32_t *sse) {
+  int32_t sum;     /* signed difference total, filled in by the helper */
+  uint32_t total;  /* sum of squared differences over the 64x64 block */
+  total = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &sum);
+  *sse = total;    /* the raw SSE is reported alongside the variance */
+  return VARIANCE_64Wx64H(total, sum);
+}
+
+uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride,
+                        const uint8_t *ref, int32_t ref_stride,
+                        uint32_t *sse) {
+  const uint32_t total = sse_8width_msa(src, src_stride, ref, ref_stride, 8);
+  *sse = total;  /* the 8x8 sum of squared errors is also the return value */
+  return total;
+}
+
+uint32_t vpx_mse8x16_msa(const uint8_t *src, int32_t src_stride,
+                         const uint8_t *ref, int32_t ref_stride,
+                         uint32_t *sse) {
+  const uint32_t total = sse_8width_msa(src, src_stride, ref, ref_stride, 16);
+  *sse = total;  /* the 8x16 sum of squared errors is also the return value */
+  return total;
+}
+
+uint32_t vpx_mse16x8_msa(const uint8_t *src, int32_t src_stride,
+                         const uint8_t *ref, int32_t ref_stride,
+                         uint32_t *sse) {
+  const uint32_t total = sse_16width_msa(src, src_stride, ref, ref_stride, 8);
+  *sse = total;  /* the 16x8 sum of squared errors is also the return value */
+  return total;
+}
+
+uint32_t vpx_mse16x16_msa(const uint8_t *src, int32_t src_stride,
+                          const uint8_t *ref, int32_t ref_stride,
+                          uint32_t *sse) {
+  const uint32_t total = sse_16width_msa(src, src_stride, ref, ref_stride, 16);
+  *sse = total;  /* the 16x16 sum of squared errors is also the return value */
+  return total;
+}
+
+void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride,
+                       const uint8_t *ref, int32_t ref_stride,
+                       uint32_t *sse, int32_t *sum) {  /* out: SSE, diff sum */
+  *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
+}
+
+void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride,
+                         const uint8_t *ref, int32_t ref_stride,
+                         uint32_t *sse, int32_t *sum) {  /* out: SSE, diff sum */
+  *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
+}
+
+uint32_t vpx_get_mb_ss_msa(const int16_t *src) {
+  return get_mb_ss_msa(src);  /* exported wrapper around the static kernel */
+}
diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
new file mode 100644
index 0000000000..f6244d834b
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
@@ -0,0 +1,743 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 dst0, dst1, dst2, dst3, res2, res3;
+  v16u8 mask0, mask1, mask2, mask3;
+  v8i16 filt, res0, res1;
+  /* 4x4 block: 8-tap horizontal filter whose result is averaged into dst. */
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= 3;
+  /* src now starts 3 bytes to the left of the position the caller passed. */
+  /* rearranging filter: halfword lanes 0..3 (2 int8 taps each) -> filt0..3 */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* mask1..3: mask0 with every element increased by 2, 4 and 6. */
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* Filter 4 rows, shift by FILTER_BITS (SRARI), average with dst, store. */
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, res0, res1);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  SRARI_H2_SH(res0, res1, FILTER_BITS);
+  SAT_SH2_SH(res0, res1, 7);
+  PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
+  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+  XORI_B2_128_UB(res2, res3);
+  AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
+  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v8i16 filt, vec0, vec1, vec2, vec3;
+  /* 4x8 block: 8-tap horizontal filter whose result is averaged into dst. */
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= 3;
+  /* src now starts 3 bytes to the left of the position the caller passed. */
+  /* rearranging filter: halfword lanes 0..3 (2 int8 taps each) -> filt0..3 */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* mask1..3: mask0 with every element increased by 2, 4 and 6. */
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* Two filter passes of 4 source rows each; all 8 dst rows are loaded once. */
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  src += (4 * src_stride);
+  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, vec0, vec1);
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, vec2, vec3);
+  SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
+  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
+              res3);
+  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
+  XORI_B2_128_UB(res0, res2);
+  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
+             dst6);
+  ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
+  AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
+  ST4x8_UB(res0, res2, dst, dst_stride);
+}
+
+/* 4-wide 8-tap horizontal filter + average: only heights 4 and 8 are handled. */
+static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (4 == height) {
+    common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    return;
+  }
+  if (8 != height) return; /* any other height: dst is left untouched */
+  common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+}
+
+static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int8_t *filter,
+                                             int32_t height) {
+  int32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
+  v8i16 filt, out0, out1, out2, out3;
+  /* 8-wide block: 8-tap horizontal filter, result averaged into dst. */
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+  /* src now starts 3 bytes to the left of the position the caller passed. */
+  /* rearranging filter: halfword lanes 0..3 (2 int8 taps each) -> filt0..3 */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* mask1..3: mask0 with every element increased by 2, 4 and 6. */
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* height >> 2 iterations; each one filters and stores 4 rows. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+                            dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  int32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
+  v8i16 filt, out0, out1, out2, out3;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+  /* 16-wide block: 8-tap horizontal filter, result averaged into dst. */
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+  /* src now starts 3 bytes to the left of the position the caller passed. */
+  /* rearranging filter: halfword lanes 0..3 (2 int8 taps each) -> filt0..3 */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* mask1..3: mask0 with every element increased by 2, 4 and 6. */
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* height >> 1 iterations; each one handles 2 rows (src and dst advance 2). */
+  for (loop_cnt = height >> 1; loop_cnt--;) {
+    LD_SB2(src, src_stride, src0, src2);
+    LD_SB2(src + 8, src_stride, src1, src3);
+    src += (2 * src_stride);
+    /* src0/src2 were loaded at offset 0, src1/src3 at offset 8 of the rows. */
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
+    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
+    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+               vec14);
+    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+               vec15);
+    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+                vec9, vec10, vec11);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
+                 vec2, vec3);
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+                 vec9, vec10, vec11);
+    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+                out2, out3);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
+    dst += dst_stride;
+    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+  v8i16 filt, out0, out1, out2, out3;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+  /* 32-wide block: 8-tap horizontal filter, result averaged into dst. */
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+  /* src now starts 3 bytes to the left of the position the caller passed. */
+  /* rearranging filter: halfword lanes 0..3 (2 int8 taps each) -> filt0..3 */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* mask1..3: mask0 with every element increased by 2, 4 and 6. */
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* One row per iteration: vector loads at byte offsets 0, 16 and 24. */
+  for (loop_cnt = height; loop_cnt--;) {
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    src += src_stride;
+    /* src1 is built from src0/src2 by sldi; presumably row bytes 8..23. */
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
+    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
+    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+               vec14);
+    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+               vec15);
+    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+                vec9, vec10, vec11);
+    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
+                 vec2, vec3);
+    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+                 vec9, vec10, vec11);
+    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+                out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    LD_UB2(dst, 16, dst1, dst2);
+    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
+    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt, cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
+  v8i16 filt, out0, out1, out2, out3;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+  /* 64-wide block: 8-tap horizontal filter, result averaged into dst. */
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+  /* src now starts 3 bytes to the left of the position the caller passed. */
+  /* rearranging filter: halfword lanes 0..3 (2 int8 taps each) -> filt0..3 */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* mask1..3: mask0 with every element increased by 2, 4 and 6. */
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* One row per outer iteration; cnt = 0, 1 selects the 32-byte half. */
+  for (loop_cnt = height; loop_cnt--;) {
+    for (cnt = 0; cnt < 2; ++cnt) {
+      src0 = LD_SB(&src[cnt << 5]);
+      src2 = LD_SB(&src[16 + (cnt << 5)]);
+      src3 = LD_SB(&src[24 + (cnt << 5)]);
+      src1 = __msa_sldi_b(src2, src0, 8);
+      /* Same load pattern as the 32-wide kernel, shifted by cnt << 5 bytes. */
+      XORI_B4_128_SB(src0, src1, src2, src3);
+      VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
+                 vec12);
+      VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
+                 vec13);
+      VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
+                 vec14);
+      VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
+                 vec15);
+      DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
+                  vec1, vec2, vec3);
+      DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
+                  vec9, vec10, vec11);
+      DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
+                   vec1, vec2, vec3);
+      DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
+                   vec9, vec10, vec11);
+      ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
+                  out2, out3);
+      SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+      SAT_SH4_SH(out0, out1, out2, out3, 7);
+      LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
+      PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
+      PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
+    }
+    /* Both halves done: step to the next source and destination row. */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16i8 src0, src1, src2, src3, mask;
+  v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
+  v8u16 vec2, vec3, filt;
+  /* 4x4 block: 2-tap horizontal filter whose result is averaged into dst. */
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+  /* Unlike the 8-tap kernels, src is not moved back before loading. */
+  /* rearranging filter: halfword lane 0 (filter[0], filter[1]) is splatted */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  /* Filter 4 rows, shift by FILTER_BITS (SRARI), average with dst, store. */
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
+  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v8u16 vec4, vec5, vec6, vec7, filt;
+  /* 4x8 block: 2-tap horizontal filter whose result is averaged into dst. */
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+  /* Unlike the 8-tap kernels, src is not moved back before loading. */
+  /* rearranging filter: halfword lane 0 (filter[0], filter[1]) is splatted */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  /* All 8 source and dst rows are loaded at once; stored as two 4x4 halves. */
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+              vec6, vec7);
+  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+              res3);
+  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
+             dst6);
+  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
+              res3);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+  dst += (4 * dst_stride);
+  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+/* 4-wide 2-tap horizontal filter + average: only heights 4 and 8 are handled. */
+static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (4 == height) {
+    common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+    return;
+  }
+  if (8 != height) return; /* any other height: dst is left untouched */
+  common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+}
+
+static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16i8 src0, src1, src2, src3, mask;
+  v16u8 filt0, dst0, dst1, dst2, dst3;
+  v8u16 vec0, vec1, vec2, vec3, filt;
+  /* 8x4 block: 2-tap horizontal filter whose result is averaged into dst. */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+  /* Unlike the 8-tap kernels, src is not moved back before loading. */
+  /* rearranging filter: halfword lane 0 (filter[0], filter[1]) is splatted */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  /* Filter 4 rows, shift by FILTER_BITS (SRARI), average with dst, store. */
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                     dst, dst_stride);
+}
+
+static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int8_t *filter,
+                                                  int32_t height) {
+  v16i8 src0, src1, src2, src3, mask;
+  v16u8 filt0, dst0, dst1, dst2, dst3;
+  v8u16 vec0, vec1, vec2, vec3, filt;
+  /* 8-wide, 2-tap: 8 rows are always done; 8 more only when height == 16. */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+  /* Unlike the 8-tap kernels, src is not moved back before loading. */
+  /* rearranging filter: halfword lane 0 (filter[0], filter[1]) is splatted */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  /* Rows 0..3; the source rows 4..7 are loaded before rows 0..3 are stored. */
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  src += (4 * src_stride);
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  src += (4 * src_stride);
+  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                     dst, dst_stride);
+  dst += (4 * dst_stride);
+  /* Rows 4..7, using the source rows loaded above. */
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                     dst, dst_stride);
+  dst += (4 * dst_stride);
+  /* Any height other than 16 stops here, after exactly 8 rows. */
+  if (16 == height) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    /* Rows 8..11; source rows 12..15 are loaded before the store below. */
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                       dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Rows 12..15. */
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
+                       dst, dst_stride);
+  }
+}
+
+/* 8-wide 2-tap horizontal filter + average: height 4 has its own kernel,
+ * every other height is handed to the 8x8mult kernel together with height. */
+static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (4 != height) {
+    common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                          filter, height);
+    return;
+  }
+  common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
+}
+
+static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, dst0, dst1, dst2, dst3;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+  /* 16-wide block: 2-tap horizontal filter, result averaged into dst. */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+  /* Unlike the 8-tap kernels, src is not moved back before loading. */
+  /* rearranging filter: halfword lane 0 (filter[0], filter[1]) is splatted */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  /* The first group of 4 rows is handled here, ahead of the loop. */
+  LD_SB4(src, src_stride, src0, src2, src4, src6);
+  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+  src += (4 * src_stride);
+  /* Even-numbered vectors come from offset 0, odd-numbered from offset 8. */
+  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+              res2, res3);
+  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+              res6, res7);
+  SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+  SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+  dst += dst_stride;
+  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+  dst += dst_stride;
+  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+  dst += dst_stride;
+  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+  dst += dst_stride;
+  /* Remaining (height >> 2) - 1 groups; height < 4 would wrap loop_cnt. */
+  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    /* Same sequence as the first group above. */
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+                res2, res3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+                res6, res7);
+    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+    dst += dst_stride;
+    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, dst0, dst1, dst2, dst3;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
+  /* 32-wide block: 2-tap horizontal filter, result averaged into dst. */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+  /* Unlike the 8-tap kernels, src is not moved back before loading. */
+  /* rearranging filter: halfword lane 0 (filter[0], filter[1]) is splatted */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  /* height >> 1 iterations; each one loads and stores two full rows. */
+  for (loop_cnt = (height >> 1); loop_cnt--;) {
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    src += src_stride;
+    src4 = LD_SB(src);
+    src6 = LD_SB(src + 16);
+    src7 = LD_SB(src + 24);
+    src5 = __msa_sldi_b(src6, src4, 8);
+    src += src_stride;
+    /* src0..3 hold the first row of the pair, src4..7 the second row. */
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
+                res2, res3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
+                res6, res7);
+    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
+    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
+    LD_UB2(dst, 16, dst0, dst1);
+    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
+    PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
+    dst += dst_stride;
+    LD_UB2(dst, 16, dst2, dst3);
+    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
+    PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, dst0, dst1, dst2, dst3;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+  /* 64-wide block: 2-tap horizontal filter, result averaged into dst. */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+  /* Unlike the 8-tap kernels, src is not moved back before loading. */
+  /* rearranging filter: halfword lane 0 (filter[0], filter[1]) is splatted */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  /* One row per iteration: loads 16 bytes apart plus one at offset 56. */
+  for (loop_cnt = height; loop_cnt--;) {
+    LD_SB4(src, 16, src0, src2, src4, src6);
+    src7 = LD_SB(src + 56);
+    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+    src += src_stride;
+    /* src1/src3/src5 were derived by SLDI from neighbouring loaded vectors. */
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+                out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+                out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
+    PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
+    PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
+    PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
+    PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x, int x_step_q4,
+                                 const int16_t *filter_y, int y_step_q4,
+                                 int w, int h) {
+  /* Taps are narrowed to int8.  The helpers fetch them with whole-vector
+   * loads and the 2-tap path starts at &filt_hor[3], so the buffer is sized
+   * 3 + 16 and zeroed to keep every such load inside the array. */
+  int8_t cnt, filt_hor[3 + 16] = { 0 };
+
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  for (cnt = 0; cnt < 8; ++cnt) filt_hor[cnt] = filter_x[cnt];
+  /* Taps 0 and 1 both zero (one 32-bit compare): use the 2-tap kernels. */
+  if (((const int32_t *)filter_x)[0] == 0) {
+    switch (w) {
+      case 4:
+        common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         &filt_hor[3], h);
+        break;
+      case 8:
+        common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         &filt_hor[3], h);
+        break;
+      case 16:
+        common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          &filt_hor[3], h);
+        break;
+      case 32:
+        common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          &filt_hor[3], h);
+        break;
+      case 64:
+        common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          &filt_hor[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                                  filter_x, x_step_q4, filter_y, y_step_q4,
+                                  w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         filt_hor, h);
+        break;
+      case 8:
+        common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         filt_hor, h);
+        break;
+      case 16:
+        common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          filt_hor, h);
+        break;
+      case 32:
+        common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          filt_hor, h);
+        break;
+      case 64:
+        common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          filt_hor, h);
+        break;
+      default:
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                                  filter_x, x_step_q4, filter_y, y_step_q4,
+                                  w, h);
+        break;
+    }
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c
new file mode 100644
index 0000000000..2abde6de83
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c
@@ -0,0 +1,661 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+/* 4-wide block: 8-tap H pass, then 8-tap V pass, then average with dst. */
+static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int8_t *filter_horiz,
+                                                  int8_t *filter_vert,
+                                                  int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
+  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
+  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  /* The 4-wide kernels in this file use the mask at table offset 16. */
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= (3 + 3 * src_stride);  /* step back 3 columns and 3 rows */
+
+  /* Rearrange filter: 16-bit lanes 0..3 of filt each hold two int8 taps. */
+  filt = LD_SH(filter_horiz);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* First 7 rows: 3 above the block through 3 below its first row. */
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  /* Horizontal pass over the 7 rows; each call is given two source rows. */
+  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+
+  filt = LD_SH(filter_vert);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+  /* height >> 2 passes, each consuming 4 new source rows -> 4 dst rows. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);  /* existing dst rows */
+    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
+    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+
+    SRARI_H2_SH(res0, res1, FILTER_BITS);
+    SAT_SH2_SH(res0, res1, 7);
+    PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
+    XORI_B2_128_UB(tmp0, tmp1);
+    AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
+    ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Keep the newest results as history for the next pass. */
+    hz_out5 = hz_out9;
+    vec0 = vec2;
+    vec1 = vec3;
+    vec2 = vec4;
+  }
+}
+/* 8-wide block: 8-tap H pass, then 8-tap V pass, then average with dst. */
+static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int8_t *filter_horiz,
+                                                  int8_t *filter_vert,
+                                                  int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
+  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+  /* This kernel uses the mask at table offset 0 (4-wide ones use 16). */
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= (3 + 3 * src_stride);  /* step back 3 columns and 3 rows */
+
+  /* Rearrange filter: 16-bit lanes 0..3 of filt each hold two int8 taps. */
+  filt = LD_SH(filter_horiz);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* First 7 rows: 3 above the block through 3 below its first row. */
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  /* Horizontal pass: one row per call (the same row is passed twice). */
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+
+  filt = LD_SH(filter_vert);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  /* Row pairs: out0..2 <- (0,1),(2,3),(4,5); out4..6 <- (1,2),(3,4),(5,6). */
+  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+  /* height >> 2 passes, each consuming 4 new source rows -> 4 dst rows. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);  /* existing dst rows */
+
+    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
+    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
+                            dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Keep the newest row pairs as history for the next pass. */
+    hz_out6 = hz_out10;
+    out0 = out2;
+    out1 = out3;
+    out2 = out8;
+    out4 = out6;
+    out5 = out7;
+    out6 = out9;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert,
+                                                   int32_t height) {
+  /* 16 columns = two 8-column strips, each done by the 8-wide kernel. */
+  int32_t col;
+  for (col = 0; col < 16; col += 8) {
+    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
+                                          dst_stride, filter_horiz,
+                                          filter_vert, height);
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert,
+                                                   int32_t height) {
+  /* 32 columns = four 8-column strips, each done by the 8-wide kernel. */
+  int32_t col;
+  for (col = 0; col < 32; col += 8) {
+    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
+                                          dst_stride, filter_horiz,
+                                          filter_vert, height);
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert,
+                                                   int32_t height) {
+  /* 64 columns = eight 8-column strips, each done by the 8-wide kernel. */
+  int32_t col;
+  for (col = 0; col < 64; col += 8) {
+    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
+                                          dst_stride, filter_horiz,
+                                          filter_vert, height);
+  }
+}
+/* 4x4 block: 2-tap H pass, 2-tap V pass, then average with dst. */
+static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, mask;
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v16u8 dst0, dst1, dst2, dst3, res0, res1;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
+  /* The 4-wide kernels in this file use the mask at table offset 16. */
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* Rearrange filter: only 16-bit lane 0 (two int8 taps) of each is used. */
+  filt = LD_UH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  filt = LD_UH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  /* 5 source rows feed the 4 output rows. */
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+  /* Horizontal pass; each call is given two source rows. */
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);  /* existing dst rows */
+  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+/* 4x8 block: 2-tap H pass, 2-tap V pass, then average with dst. */
+static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+  /* The 4-wide kernels in this file use the mask at table offset 16. */
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* Rearrange filter: only 16-bit lane 0 (two int8 taps) of each is used. */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+  /* 9 source rows feed the 8 output rows: 8 loaded here, the 9th below. */
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  src += (8 * src_stride);
+  src8 = LD_SB(src);
+  /* Horizontal pass; each call is given two source rows. */
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
+  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
+  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+             hz_out3, hz_out5, 8);
+  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+
+  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
+             dst4, dst6);
+  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+              tmp0, tmp1, tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
+              res2, res3);
+  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
+              res2, res3);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+  dst += (4 * dst_stride);  /* second group of 4 rows */
+  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int8_t *filter_horiz,
+                                                  int8_t *filter_vert,
+                                                  int32_t height) {
+  if (height == 8) {  /* 4x8 block */
+    common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  } else if (height == 4) {  /* 4x4 block */
+    common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  }  /* any other height: no kernel is called, dst is left untouched */
+}
+/* 8x4 block: 2-tap H pass, 2-tap V pass, then average with dst. */
+static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, mask;
+  v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+  /* This kernel uses the mask at table offset 0 (4-wide ones use 16). */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* Rearrange filter: only 16-bit lane 0 (two int8 taps) of each is used. */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+  /* 5 source rows feed the 4 output rows. */
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+  src += (5 * src_stride);  /* src is not read again below */
+  /* hz_out0/hz_out1 alternate as previous and current filtered row. */
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+                     dst, dst_stride);
+}
+/* 8-wide 2-tap H+V kernel with dst averaging; 4 rows per loop pass. */
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                       int32_t src_stride,
+                                                       uint8_t *dst,
+                                                       int32_t dst_stride,
+                                                       int8_t *filter_horiz,
+                                                       int8_t *filter_vert,
+                                                       int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, mask;
+  v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
+  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+  /* This kernel uses the mask at table offset 0 (4-wide ones use 16). */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* Rearrange filter: only 16-bit lane 0 (two int8 taps) of each is used. */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+  /* Filter the first row up front; it pairs with row 1 inside the loop. */
+  src0 = LD_SB(src);
+  src += src_stride;
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  /* height >> 2 passes; hz_out0/hz_out1 alternate as previous/current. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);  /* existing dst rows */
+    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+                       dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int8_t *filter_horiz,
+                                                  int8_t *filter_vert,
+                                                  int32_t height) {
+  if (height != 4) {  /* the loop kernel takes every height except 4 */
+    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                               filter_horiz, filter_vert,
+                                               height);
+    return;
+  }
+  common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
+                                         filter_horiz, filter_vert);
+}
+/* 16-wide 2-tap H+V kernel with dst averaging; 4 rows per loop pass. */
+static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert,
+                                                   int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+  v8i16 filt;
+  /* This kernel uses the mask at table offset 0 (4-wide ones use 16). */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* Rearrange filter: only 16-bit lane 0 (two int8 taps) of each is used. */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+  /* First row: one vector at src and one at src + 8. */
+  LD_SB2(src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+  /* hz_out0/1 track the vectors at src, hz_out2/3 those at src + 8. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);  /* existing dst rows */
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+    dst += dst_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
+    dst += dst_stride;
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
+    dst += dst_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert,
+                                                   int32_t height) {
+  /* 32 columns = two 16-column strips, each done by the 16-wide kernel. */
+  int32_t col;
+  for (col = 0; col < 32; col += 16) {
+    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
+                                           dst_stride, filter_horiz,
+                                           filter_vert, height);
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter_horiz,
+                                                   int8_t *filter_vert,
+                                                   int32_t height) {
+  /* 64 columns = four 16-column strips, each done by the 16-wide kernel. */
+  int32_t col;
+  for (col = 0; col < 64; col += 16) {
+    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
+                                           dst_stride, filter_horiz,
+                                           filter_vert, height);
+  }
+}
+
+/*
+ * MSA entry point for the combined horizontal + vertical convolve whose
+ * kernels (per their names) average the filtered result into dst.
+ *
+ * Only unscaled filtering is supported: x_step_q4 and y_step_q4 must both
+ * be 16 (asserted).  A filter whose first two taps read as a zero 32-bit
+ * word is treated as 2-tap and its kernels are handed the taps from index
+ * 3 on; otherwise the 8-tap kernels get all eight taps.
+ *
+ * Widths other than 4/8/16/32/64, and any mix of one 2-tap with one 8-tap
+ * direction, are delegated to vpx_convolve8_avg_c().
+ */
+void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4,
+                           int w, int h) {
+  /* NOTE(review): the kernels fetch taps with LD_SH/LD_UH into 128-bit
+   * vectors; if that is a 16-byte load it overruns these arrays - confirm. */
+  int8_t taps_x[8], taps_y[8];
+  const int32_t s_stride = (int32_t)src_stride;
+  const int32_t d_stride = (int32_t)dst_stride;
+  /* Taps 0 and 1 read as one 32-bit word; zero selects the 2-tap kernels. */
+  const int x_is_2tap = (((const int32_t *)filter_x)[0] == 0);
+  const int y_is_2tap = (((const int32_t *)filter_y)[0] == 0);
+  int i;
+
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (i = 0; i < 8; ++i) {
+    taps_x[i] = filter_x[i];
+    taps_y[i] = filter_y[i];
+  }
+
+  if (x_is_2tap != y_is_2tap) {
+    /* One 2-tap and one 8-tap direction: no MSA kernel, use the C code. */
+    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                        filter_x, x_step_q4, filter_y, y_step_q4,
+                        w, h);
+    return;
+  }
+
+  if (x_is_2tap) {
+    /* Both directions 2-tap: pass the taps starting at index 3. */
+    if (w == 4) {
+      common_hv_2ht_2vt_and_aver_dst_4w_msa(src, s_stride, dst, d_stride,
+                                            &taps_x[3], &taps_y[3], h);
+    } else if (w == 8) {
+      common_hv_2ht_2vt_and_aver_dst_8w_msa(src, s_stride, dst, d_stride,
+                                            &taps_x[3], &taps_y[3], h);
+    } else if (w == 16) {
+      common_hv_2ht_2vt_and_aver_dst_16w_msa(src, s_stride, dst, d_stride,
+                                             &taps_x[3], &taps_y[3], h);
+    } else if (w == 32) {
+      common_hv_2ht_2vt_and_aver_dst_32w_msa(src, s_stride, dst, d_stride,
+                                             &taps_x[3], &taps_y[3], h);
+    } else if (w == 64) {
+      common_hv_2ht_2vt_and_aver_dst_64w_msa(src, s_stride, dst, d_stride,
+                                             &taps_x[3], &taps_y[3], h);
+    } else {
+      vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                          filter_x, x_step_q4, filter_y, y_step_q4,
+                          w, h);
+    }
+  } else {
+    /* Both directions 8-tap: pass the complete tap arrays. */
+    if (w == 4) {
+      common_hv_8ht_8vt_and_aver_dst_4w_msa(src, s_stride, dst, d_stride,
+                                            taps_x, taps_y, h);
+    } else if (w == 8) {
+      common_hv_8ht_8vt_and_aver_dst_8w_msa(src, s_stride, dst, d_stride,
+                                            taps_x, taps_y, h);
+    } else if (w == 16) {
+      common_hv_8ht_8vt_and_aver_dst_16w_msa(src, s_stride, dst, d_stride,
+                                             taps_x, taps_y, h);
+    } else if (w == 32) {
+      common_hv_8ht_8vt_and_aver_dst_32w_msa(src, s_stride, dst, d_stride,
+                                             taps_x, taps_y, h);
+    } else if (w == 64) {
+      common_hv_8ht_8vt_and_aver_dst_64w_msa(src, s_stride, dst, d_stride,
+                                             taps_x, taps_y, h);
+    } else {
+      vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                          filter_x, x_step_q4, filter_y, y_step_q4,
+                          w, h);
+    }
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
new file mode 100644
index 0000000000..0164e41aa1
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
@@ -0,0 +1,718 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+/* 8-tap vertical filter of a 4-pixel-wide block, averaged into dst (MSA). */
+static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int8_t *filter,
+                                             int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16u8 dst0, dst1, dst2, dst3, out;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+  v16i8 src10998, filt0, filt1, filt2, filt3;
+  v8i16 filt, out10, out32;
+
+  src -= (3 * src_stride);  /* the 8-row window starts 3 rows above src */
+  /* Load the taps; halfwords 0..3 of the vector go to filt0..filt3. */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* Prime the window with the first 7 source rows. */
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  /* srcBA_r interleaves rows A and B; srcDCBA joins two such row pairs. */
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+             src4332, src6554);
+  XORI_B3_128_SB(src2110, src4332, src6554);
+  /* Each pass loads 4 new rows and writes 4 rows; (height >> 2) passes. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    src += (4 * src_stride);
+    /* Fetch the 4 dst rows to average with, then extend the interleaves. */
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+    XORI_B2_128_SB(src8776, src10998);
+    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                filt1, filt2, filt3);
+    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                filt1, filt2, filt3);
+    SRARI_H2_SH(out10, out32, FILTER_BITS);
+    SAT_SH2_SH(out10, out32, 7);
+    out = PCKEV_XORI128_UB(out10, out32);
+    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+    /* Gather the dst rows into dst0, then average with the filtered rows. */
+    dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
+    out = __msa_aver_u_b(out, dst0);
+
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Slide the window: the newest rows become the head of the next pass. */
+    src2110 = src6554;
+    src4332 = src8776;
+    src6554 = src10998;
+    src6 = src10;
+  }
+}
+/* 8-tap vertical filter of an 8-pixel-wide block, averaged into dst (MSA). */
+static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int8_t *filter,
+                                             int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+  v8i16 filt, out0, out1, out2, out3;
+
+  src -= (3 * src_stride);  /* the 8-row window starts 3 rows above src */
+  /* Load the taps; halfwords 0..3 of the vector go to filt0..filt3. */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* Prime the window with the first 7 source rows. */
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  /* srcBA_r interleaves rows A and B (right-half bytes, via ILVR_B). */
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  /* Each pass loads 4 new rows and writes 4 rows; (height >> 2) passes. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    src += (4 * src_stride);
+    /* Fetch the 4 dst rows to average with, then extend the interleaves. */
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                               filt1, filt2, filt3);
+    out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                               filt1, filt2, filt3);
+    out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                               filt1, filt2, filt3);
+    out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                               filt1, filt2, filt3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
+                            dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Slide the window: the newest rows become the head of the next pass. */
+    src10_r = src54_r;
+    src32_r = src76_r;
+    src54_r = src98_r;
+    src21_r = src65_r;
+    src43_r = src87_r;
+    src65_r = src109_r;
+    src6 = src10;
+  }
+}
+/* 8-tap vertical filter + dst average over (width >> 4) 16-column strips. */
+static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
+                                                   int32_t src_stride,
+                                                   uint8_t *dst,
+                                                   int32_t dst_stride,
+                                                   int8_t *filter,
+                                                   int32_t height,
+                                                   int32_t width) {
+  const uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  uint32_t loop_cnt, cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+  v16i8 filt0, filt1, filt2, filt3;
+  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
+
+  src -= (3 * src_stride);  /* the 8-row window starts 3 rows above src */
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* Outer loop: one pass per 16-column strip, each walked top to bottom. */
+  for (cnt = (width >> 4); cnt--;) {
+    src_tmp = src;
+    dst_tmp = dst;
+
+    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src_tmp += (7 * src_stride);
+    /* _r / _l: row-pair interleaves built with ILVR_B / ILVL_B respectively. */
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+               src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+               src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+    /* Inner loop: 4 new rows in, 4 rows out; (height >> 2) passes. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+      LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+      src_tmp += (4 * src_stride);
+
+      LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
+      XORI_B4_128_SB(src7, src8, src9, src10);
+      ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                 src87_r, src98_r, src109_r);
+      ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                 src87_l, src98_l, src109_l);
+      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                   filt1, filt2, filt3);
+      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                   filt1, filt2, filt3);
+      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                   filt1, filt2, filt3);
+      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                   filt1, filt2, filt3);
+      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                   filt1, filt2, filt3);
+      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                   filt1, filt2, filt3);
+      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                   filt1, filt2, filt3);
+      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                   filt1, filt2, filt3);
+      SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+      SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+      SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+      SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+      PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                  out3_r, tmp0, tmp1, tmp2, tmp3);
+      XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+      AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1,
+                  dst2, dst3);
+      ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
+      dst_tmp += (4 * dst_stride);
+      /* Slide the window: the newest rows become the head of the next pass. */
+      src10_r = src54_r;
+      src32_r = src76_r;
+      src54_r = src98_r;
+      src21_r = src65_r;
+      src43_r = src87_r;
+      src65_r = src109_r;
+      src10_l = src54_l;
+      src32_l = src76_l;
+      src54_l = src98_l;
+      src21_l = src65_l;
+      src43_l = src87_l;
+      src65_l = src109_l;
+      src6 = src10;
+    }
+
+    src += 16;  /* advance to the next 16-column strip */
+    dst += 16;
+  }
+}
+
+/* 8-tap vertical filter + dst average for a 16-pixel-wide block. */
+static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  const int32_t width = 16; /* a single 16-column strip */
+  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                         filter, height, width);
+}
+
+/* 8-tap vertical filter + dst average for a 32-pixel-wide block. */
+static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  const int32_t width = 32; /* two 16-column strips */
+  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                         filter, height, width);
+}
+
+/* 8-tap vertical filter + dst average for a 64-pixel-wide block. */
+static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  const int32_t width = 64; /* four 16-column strips */
+  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
+                                         filter, height, width);
+}
+/* 2-tap vertical filter of one 4x4 block, averaged into dst (MSA). */
+static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16i8 src0, src1, src2, src3, src4;
+  v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
+  v16i8 src10_r, src32_r, src21_r, src43_r;
+  v8i16 filt;
+  v8u16 tmp0, tmp1;
+
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);  /* first halfword = the tap pair */
+  /* 5 source rows are read: each output row combines two adjacent rows. */
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  src += (4 * src_stride);
+
+  src4 = LD_SB(src);
+  src += src_stride;  /* src is not read again after this point */
+  /* Pack the four 4-byte dst rows into dst0 for the final average. */
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+  dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0);
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+
+  out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+  out = __msa_aver_u_b(out, dst0);
+
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+/* 2-tap vertical filter of one 4x8 block, averaged into dst (MSA). */
+static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+  v16u8 src2110, src4332, src6554, src8776, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);  /* first halfword = the tap pair */
+  /* 9 source rows are read: each output row combines two adjacent rows. */
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  src += (8 * src_stride);
+  src8 = LD_SB(src);
+  /* Pack the eight 4-byte dst rows into dst0 (rows 0-3) and dst1 (rows 4-7). */
+  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
+             dst2, dst3);
+  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+             src76_r, src87_r);
+  ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+             src87_r, src76_r, src2110, src4332, src6554, src8776);
+  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+              tmp0, tmp1, tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);  /* reuses src regs */
+  AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
+  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+  dst += (4 * dst_stride);
+  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
+}
+
+/* 2-tap vertical filter + dst average, 4 wide; only heights 4 and 8. */
+static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int8_t *filter, int32_t height) {
+  /* Any other height falls through without touching dst. */
+  if (8 == height) {
+    common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
+  } else if (4 == height) {
+    common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
+  }
+}
+/* 2-tap vertical filter of one 8x4 block, averaged into dst (MSA). */
+static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* Load the taps and splat the first halfword (the tap pair) into filt0. */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+  /* 5 source rows are read: each output row combines two adjacent rows. */
+  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+              tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
+                     dst, dst_stride);
+}
+/* 2-tap vertical filter + dst average, 8 wide, in (height >> 3) 8-row steps. */
+static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  int8_t *filter,
+                                                  int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* Load the taps and splat the first halfword (the tap pair) into filt0. */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  src0 = LD_UB(src);  /* row above the first pair; carried across passes */
+  src += src_stride;
+  /* Each pass loads 8 new rows and writes 8 rows as two 8x4 halves. */
+  for (loop_cnt = (height >> 3); loop_cnt--;) {
+    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+    src += (8 * src_stride);
+    LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
+
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
+               vec2, vec3);
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
+               vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4,
+                       dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8,
+                       dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    src0 = src8;  /* last row loaded is the first row of the next pass */
+  }
+}
+
+/* 2-tap vertical filter + dst average, 8 wide: selects a kernel by height. */
+static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride,
+                                             int8_t *filter, int32_t height) {
+  /* Height 4 has its own kernel; every other height uses the 8-row loop. */
+  if (4 != height) {
+    common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                          filter, height);
+    return;
+  }
+  common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
+}
+/* 2-tap vertical filter of a 16-pixel-wide block, averaged into dst (MSA). */
+static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+  /* Load the taps and splat the first halfword (the tap pair) into filt0. */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  src0 = LD_UB(src);  /* row above the first pair; carried across passes */
+  src += src_stride;
+  /* Each pass loads 4 new rows and writes 4 rows; (height >> 2) passes. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);  /* output row 0 (src0, src1) */
+    dst += dst_stride;
+
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);  /* output row 1 (src1, src2) */
+    dst += dst_stride;
+
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);  /* output row 2 (src2, src3) */
+    dst += dst_stride;
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);  /* output row 3 (src3, src4) */
+    dst += dst_stride;
+
+    src0 = src4;  /* last row loaded is the first row of the next pass */
+  }
+}
+/* 2-tap vertical filter of a 32-pixel-wide block, averaged into dst (MSA). */
+static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
+
+  /* Load the taps and splat the first halfword (the tap pair) into filt0. */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  LD_UB2(src, 16, src0, src5);  /* first row: columns 0-15 and 16-31 */
+  src += src_stride;
+  /* Each pass handles 4 rows: columns 0-15 first, then columns 16-31. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+    LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+    LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
+    src += (4 * src_stride);
+
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
+    /* Same four rows again for columns 16-31 (src5..src9, dst4..dst7). */
+    ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+    ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
+
+    ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+    ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
+    dst += (4 * dst_stride);
+    /* The last rows loaded become the first rows of the next pass. */
+    src0 = src4;
+    src5 = src9;
+  }
+}
+/* 2-tap vertical filter of a 64-pixel-wide block, averaged into dst (MSA). */
+static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter,
+                                              int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5;
+  v16u8 src6, src7, src8, src9, src10, src11, filt0;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8u16 filt;
+
+  /* Load the taps and splat the first halfword (the tap pair) into filt0. */
+  filt = LD_UH(filter);
+  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  LD_UB4(src, 16, src0, src3, src6, src9);  /* first row, four 16-byte parts */
+  src += src_stride;
+  /* Each pass handles 2 rows across four 16-column strips; (height >> 1). */
+  for (loop_cnt = (height >> 1); loop_cnt--;) {
+    LD_UB2(src, src_stride, src1, src2);
+    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB2(src + 16, src_stride, src4, src5);
+    LD_UB2(dst + 16, dst_stride, dst2, dst3);
+    LD_UB2(src + 32, src_stride, src7, src8);
+    LD_UB2(dst + 32, dst_stride, dst4, dst5);
+    LD_UB2(src + 48, src_stride, src10, src11);
+    LD_UB2(dst + 48, dst_stride, dst6, dst7);
+    src += (2 * src_stride);
+    /* Columns 0-15. */
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
+    /* Columns 16-31. */
+    ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+    ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
+    /* Columns 32-47. */
+    ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+    ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
+    /* Columns 48-63. */
+    ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+    ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
+    dst += (2 * dst_stride);
+    /* The last row of each strip becomes the first row of the next pass. */
+    src0 = src2;
+    src3 = src5;
+    src6 = src8;
+    src9 = src11;
+  }
+}
+/* MSA vertical convolve, averaged into dst. Requires y_step_q4 == 16. */
+void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const int16_t *filter_x, int x_step_q4,
+                                const int16_t *filter_y, int y_step_q4,
+                                int w, int h) {
+  /* Only filt_ver[0..7] hold taps. The zeroed tail keeps the kernels' 16-byte
+   * vector load (from filt_ver or &filt_ver[3]) inside the array. */
+  int8_t cnt, filt_ver[32] = { 0 };
+
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_ver[cnt] = filter_y[cnt];  /* narrows the int16 taps to int8 */
+  }
+
+  if (((const int32_t *)filter_y)[0] == 0) {  /* taps 0 and 1 are both zero */
+    switch (w) {  /* 2-tap kernels: taps are read at filt_ver[3] and [4] */
+      case 4:
+        common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         &filt_ver[3], h);
+        break;
+      case 8:
+        common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         &filt_ver[3], h);
+        break;
+      case 16:
+        common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          &filt_ver[3], h);
+        break;
+      case 32:
+        common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          &filt_ver[3], h);
+        break;
+      case 64:
+        common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          &filt_ver[3], h);
+        break;
+      default:  /* widths without an MSA kernel use the C implementation */
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                                 filter_x, x_step_q4, filter_y, y_step_q4,
+                                 w, h);
+        break;
+    }
+  } else {  /* otherwise: 8-tap kernels using filt_ver[0..7] */
+    switch (w) {
+      case 4:
+        common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         filt_ver, h);
+        break;
+      case 8:
+        common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
+                                         dst, (int32_t)dst_stride,
+                                         filt_ver, h);
+        break;
+      case 16:
+        common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          filt_ver, h);
+        break;
+      case 32:
+        common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          filt_ver, h);
+        break;
+      case 64:
+        common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
+                                          dst, (int32_t)dst_stride,
+                                          filt_ver, h);
+        break;
+      default:  /* widths without an MSA kernel use the C implementation */
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                                 filter_x, x_step_q4, filter_y, y_step_q4,
+                                 w, h);
+        break;
+    }
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
new file mode 100644
index 0000000000..dbd120b0d5
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
@@ -0,0 +1,703 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v8i16 filt, out0, out1;
+  /* 8-tap horizontal pass, one 4x4 block; reads start 3 bytes left of src. */
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= 3;
+
+  /* Load the taps and splat halfword lanes 0..3 of them into filt0..filt3. */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* mask1..mask3 are mask0 with 2, 4 and 6 added to every lane. */
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* Rows are XORed with 128 after load and again at pack (presumably a bias). */
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out0, out1);
+  SRARI_H2_SH(out0, out1, FILTER_BITS);
+  SAT_SH2_SH(out0, out1, 7);
+  out = PCKEV_XORI128_UB(out0, out1);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src0, src1, src2, src3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v8i16 filt, out0, out1, out2, out3;
+  /* 8-tap horizontal pass, one 4x8 block; reads start 3 bytes left of src. */
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= 3;
+
+  /* Load the taps and splat halfword lanes 0..3 of them into filt0..filt3. */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* mask1..mask3 are mask0 with 2, 4 and 6 added to every lane. */
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* Two 4-row groups are filtered first, then rounded and stored together. */
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  src += (4 * src_stride);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out0, out1);
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out2, out3);
+  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+  SAT_SH4_SH(out0, out1, out2, out3, 7);
+  out = PCKEV_XORI128_UB(out0, out1);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+  dst += (4 * dst_stride);
+  out = PCKEV_XORI128_UB(out2, out3);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  /* 4-wide 8-tap dispatch: only heights 4 and 8 have kernels, others no-op. */
+  if (height == 4)
+    common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+  else if (height == 8)
+    common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+}
+
+static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+  v8i16 filt, out0, out1, out2, out3;
+  /* 8-tap horizontal pass, one 8x4 block; reads start 3 bytes left of src. */
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* Load the taps and splat halfword lanes 0..3 of them into filt0..filt3. */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* mask1..mask3 are mask0 with 2, 4 and 6 added to every lane. */
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* One vector per row in, one 16-bit result vector per row out (out0..3). */
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out0, out1, out2,
+                             out3);
+  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+  SAT_SH4_SH(out0, out1, out2, out3, 7);
+  tmp0 = PCKEV_XORI128_UB(out0, out1);
+  tmp1 = PCKEV_XORI128_UB(out2, out3);
+  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+}
+
+static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+  v8i16 filt, out0, out1, out2, out3;
+  /* 8-tap horizontal pass, 8 wide; reads start 3 bytes left of src. */
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* Load the taps and splat halfword lanes 0..3 of them into filt0..filt3. */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* mask1..mask3 are mask0 with 2, 4 and 6 added to every lane. */
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* height >> 2 iterations of 4 rows each; a remainder of height % 4 is skipped. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (4 * src_stride);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    tmp0 = PCKEV_XORI128_UB(out0, out1);
+    tmp1 = PCKEV_XORI128_UB(out2, out3);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  /* 8-wide 8-tap dispatch: 4-row blocks get their own kernel, the rest loop. */
+  if (height != 4)
+    common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+  else
+    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+}
+
+static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v8i16 filt, out0, out1, out2, out3;
+  /* 8-tap horizontal pass, 16 wide; reads start 3 bytes left of src. */
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* Load the taps and splat halfword lanes 0..3 of them into filt0..filt3. */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* mask1..mask3 are mask0 with 2, 4 and 6 added to every lane. */
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* Two rows per iteration; each row is loaded at src and again at src + 8. */
+  for (loop_cnt = (height >> 1); loop_cnt--;) {
+    LD_SB2(src, src_stride, src0, src2);
+    LD_SB2(src + 8, src_stride, src1, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (2 * src_stride);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst);
+    dst += dst_stride;
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v8i16 filt, out0, out1, out2, out3;
+  /* 8-tap horizontal pass, 32 wide; reads start 3 bytes left of src. */
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* Load the taps and splat halfword lanes 0..3 of them into filt0..filt3. */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* mask1..mask3 are mask0 with 2, 4 and 6 added to every lane. */
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* Two rows per iteration; src1 is derived from src2/src0 with sldi_b by 8. */
+  for (loop_cnt = (height >> 1); loop_cnt--;) {
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    src += src_stride;
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    /* The second row is loaded before the first row's results are stored. */
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    src += src_stride;
+
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst + 16);
+    dst += dst_stride;
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst + 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  int32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v8i16 filt, out0, out1, out2, out3;
+  /* 8-tap horizontal pass, 64 wide; reads start 3 bytes left of src. */
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* Load the taps and splat halfword lanes 0..3 of them into filt0..filt3. */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* mask1..mask3 are mask0 with 2, 4 and 6 added to every lane. */
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* NOTE(review): loop_cnt is int32_t here, uint32_t in the sibling kernels. */
+  for (loop_cnt = height; loop_cnt--;) {
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    /* First half of the row: results go to dst and dst + 16. */
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst + 16);
+    /* Second half of the row: source offsets 32, 48 and 56. */
+    src0 = LD_SB(src + 32);
+    src2 = LD_SB(src + 48);
+    src3 = LD_SB(src + 56);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    src += src_stride;
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst + 32);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst + 48);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 src0, src1, src2, src3, mask;
+  v16u8 filt0, vec0, vec1, res0, res1;
+  v8u16 vec2, vec3, filt;
+  /* 2-tap horizontal pass over one 4x4 block; src is used as passed in. */
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* Load the taps and splat halfword lane 0 (two int8 taps) across filt0. */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+  /* Shuffle row pairs, dot with the taps, round by FILTER_BITS, pack, store. */
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
+  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16u8 vec0, vec1, vec2, vec3, filt0;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16i8 res0, res1, res2, res3;
+  v8u16 vec4, vec5, vec6, vec7, filt;
+  /* 2-tap horizontal pass over one 4x8 block; src is used as passed in. */
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* Load the taps and splat halfword lane 0 (two int8 taps) across filt0. */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+  /* All 8 rows are loaded at once, shuffled in pairs, then stored as 2x 4x4. */
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+              vec6, vec7);
+  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
+              res2, res3);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+  dst += (4 * dst_stride);
+  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  /* 4-wide 2-tap dispatch: only heights 4 and 8 have kernels, others no-op. */
+  if (height == 4)
+    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+  else if (height == 8)
+    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+}
+
+static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16u8 filt0;
+  v16i8 src0, src1, src2, src3, mask;
+  v8u16 vec0, vec1, vec2, vec3, filt;
+  /* 2-tap horizontal pass over one 8x4 block; src is used as passed in. */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* Load the taps and splat halfword lane 0 (two int8 taps) across filt0. */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+  /* vec0..3 are filtered in place; src0/src1 are reused for the packed bytes. */
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+  ST8x4_UB(src0, src1, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  v16u8 filt0;
+  v16i8 src0, src1, src2, src3, mask, out0, out1;
+  v8u16 vec0, vec1, vec2, vec3, filt;
+  /* 2-tap horizontal pass, 8 wide: always 8 rows, 8 more when height == 16. */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* Load the taps and splat halfword lane 0 (two int8 taps) across filt0. */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+  /* Rows 0..3. */
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  src += (4 * src_stride);
+
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  /* Rows 4..7 are loaded before rows 0..3 are packed and stored. */
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  src += (4 * src_stride);
+
+  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
+  dst += (4 * dst_stride);
+
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
+  dst += (4 * dst_stride);
+  /* Rows 8..15 are produced only for height == 16; other heights stop here. */
+  if (16 == height) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    /* dst is not advanced here; the last store below adds 4 rows itself. */
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+  }
+}
+
+static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  /* 8-wide 2-tap dispatch: 4-row blocks get their own kernel, others share. */
+  if (height != 4)
+    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+  else
+    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+}
+
+static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+  /* 2-tap horizontal pass, 16 wide, 4 rows per step; the first step is peeled. */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+  /* One 4-row step runs before the loop, so the loop gets one step fewer. */
+  loop_cnt = (height >> 2) - 1;
+  /* NOTE(review): loop_cnt is unsigned, so height < 4 would wrap - confirm. */
+  /* Load the taps and splat halfword lane 0 (two int8 taps) across filt0. */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+  /* Each row is loaded twice: at src and at src + 8. */
+  LD_SB4(src, src_stride, src0, src2, src4, src6);
+  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+  src += (4 * src_stride);
+
+  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+              out2, out3);
+  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+              out6, out7);
+  SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+  SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+  PCKEV_ST_SB(out0, out1, dst);
+  dst += dst_stride;
+  PCKEV_ST_SB(out2, out3, dst);
+  dst += dst_stride;
+  PCKEV_ST_SB(out4, out5, dst);
+  dst += dst_stride;
+  PCKEV_ST_SB(out6, out7, dst);
+  dst += dst_stride;
+  /* Remaining 4-row steps repeat the same sequence. */
+  for (; loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+                out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+                out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    PCKEV_ST_SB(out0, out1, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out2, out3, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out4, out5, dst);
+    dst += dst_stride;
+    PCKEV_ST_SB(out6, out7, dst);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+  /* 2-tap horizontal pass, 32 wide; src is used as passed in. */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* Load the taps and splat halfword lane 0 (two int8 taps) across filt0. */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+  /* Two rows per iteration; src1/src5 are derived with sldi_b by 8. */
+  for (loop_cnt = height >> 1; loop_cnt--;) {
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    src += src_stride;
+    src4 = LD_SB(src);
+    src6 = LD_SB(src + 16);
+    src7 = LD_SB(src + 24);
+    src5 = __msa_sldi_b(src6, src4, 8);
+    src += src_stride;
+    /* vec0..3 / out0..3 belong to the first row, vec4..7 / out4..7 to the second. */
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+                out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+                out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    PCKEV_ST_SB(out0, out1, dst);
+    PCKEV_ST_SB(out2, out3, dst + 16);
+    dst += dst_stride;
+    PCKEV_ST_SB(out4, out5, dst);
+    PCKEV_ST_SB(out6, out7, dst + 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+  /* 2-tap horizontal pass, 64 wide; src is used as passed in. */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* Load the taps and splat halfword lane 0 (two int8 taps) across filt0. */
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
+  /* One row per iteration: loads at 0/16/32/48/56, src1/3/5 via SLDI_B3_SB. */
+  for (loop_cnt = height; loop_cnt--;) {
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src4 = LD_SB(src + 32);
+    src6 = LD_SB(src + 48);
+    src7 = LD_SB(src + 56);
+    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+    src += src_stride;
+    /* Four stores per row, at dst + 0, 16, 32 and 48. */
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+                out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+                out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    PCKEV_ST_SB(out0, out1, dst);
+    PCKEV_ST_SB(out2, out3, dst + 16);
+    PCKEV_ST_SB(out4, out5, dst + 32);
+    PCKEV_ST_SB(out6, out7, dst + 48);
+    dst += dst_stride;
+  }
+}
+
+/*
+ * Horizontal 8-tap convolution entry point (MIPS MSA build).
+ *
+ * Only unscaled filtering is accepted (x_step_q4 == 16 is asserted).  The
+ * eight 16-bit taps of filter_x are narrowed to int8_t.  When taps 0 and 1
+ * are both zero the 2-tap kernels run, starting at tap 3; otherwise the
+ * 8-tap kernels run on all eight.  Widths other than 4, 8, 16, 32 and 64 go
+ * to vpx_convolve8_horiz_c().  filter_y and y_step_q4 are only forwarded to
+ * that fallback.
+ */
+void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int x_step_q4,
+                             const int16_t *filter_y, int y_step_q4,
+                             int w, int h) {
+  int8_t tap, taps[8];
+  /* Second assert: the 32-bit word holding taps 2/3 must not be 0x800000. */
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  /* NOTE(review): kernels vector-load from taps[]; confirm 8 bytes suffice. */
+  for (tap = 0; tap != 8; tap++) {
+    taps[tap] = filter_x[tap];
+  }
+
+  if (((const int32_t *)filter_x)[0] != 0) {
+    switch (w) {
+      case 4:
+        common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            taps, h);
+        break;
+      case 8:
+        common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            taps, h);
+        break;
+      case 16:
+        common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             taps, h);
+        break;
+      case 32:
+        common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             taps, h);
+        break;
+      case 64:
+        common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             taps, h);
+        break;
+      default:
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+        break;
+    }
+  } else {  /* taps 0 and 1 are both zero: 2-tap kernels, from tap 3 */
+    switch (w) {
+      case 4:
+        common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            taps + 3, h);
+        break;
+      case 8:
+        common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            taps + 3, h);
+        break;
+      case 16:
+        common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             taps + 3, h);
+        break;
+      case 32:
+        common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             taps + 3, h);
+        break;
+      case 64:
+        common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             taps + 3, h);
+        break;
+      default:
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+        break;
+    }
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c
new file mode 100644
index 0000000000..7546f13150
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c
@@ -0,0 +1,635 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+
+const uint8_t mc_filt_mask_arr[16 * 3] = {  /* three 16-byte index rows */
+  /* 8 width cases: byte pairs (i, i + 1) for i = 0..7, all indices < 16 */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases: pairs 0..4, then 16..20 (row used by the 4w kernels) */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases: same layout shifted by 8 (pairs 8..12, then 24..28) */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
+  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  /* 8-tap horizontal + 8-tap vertical pass for 4-wide blocks, 4 rows/loop. */
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= (3 + 3 * src_stride);  /* step back 3 columns and 3 rows */
+
+  /* rearranging filter: halfwords 0..3 of the taps -> filt_hz0..filt_hz3 */
+  filt = LD_SH(filter_horiz);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* Prime the filter with the 7 rows src0..src6 before the main loop. */
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  /* Each HORIZ_8TAP_FILT call below is handed two source rows at a time. */
+  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
+  /* Vertical taps: halfwords 0..3 of filter_vert -> filt_vt0..filt_vt3. */
+  filt = LD_SH(filter_vert);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+
+  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+  out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
+    out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+    tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
+    SAT_SH2_SH(tmp0, tmp1, 7);
+    out = PCKEV_XORI128_UB(tmp0, tmp1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Slide the window: keep the newest results for the next 4 rows. */
+    hz_out5 = hz_out9;
+    out0 = out2;
+    out1 = out3;
+    out2 = out4;
+  }
+}
+
+static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
+  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+  /* 8-tap horizontal + 8-tap vertical pass for 8-wide blocks, 4 rows/loop. */
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= (3 + 3 * src_stride);  /* step back 3 columns and 3 rows */
+
+  /* rearranging filter: halfwords 0..3 of the taps -> filt_hz0..filt_hz3 */
+  filt = LD_SH(filter_horiz);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+  /* Prime the filter with the 7 rows src0..src6, one row per hz_out. */
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+
+  filt = LD_SH(filter_vert);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  /* Row pairs (assuming ILVEV_B2 pairs its inputs in argument order): */
+  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
+  /* out0..2 <- rows (0,1),(2,3),(4,5); out4..6 <- rows (1,2),(3,4),(5,6). */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    XORI_B4_128_SB(src7, src8, src9, src10);
+
+    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+
+    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
+    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+    ST8x4_UB(vec0, vec1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Slide both pair chains down 4 rows; hz_out6 is the newest row. */
+    hz_out6 = hz_out10;
+    out0 = out2;
+    out1 = out3;
+    out2 = out8;
+    out4 = out6;
+    out5 = out7;
+    out6 = out9;
+  }
+}
+
+static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  /* A 16-wide block is filtered as two side-by-side 8-pixel columns, each
+   * handled by the 8-wide kernel at a column offset of 0 and 8. */
+  int32_t col_offset;
+  for (col_offset = 0; col_offset < 16; col_offset += 8) {
+    common_hv_8ht_8vt_8w_msa(src + col_offset, src_stride, dst + col_offset,
+                             dst_stride, filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  /* A 32-wide block is filtered as four side-by-side 8-pixel columns, each
+   * handled by the 8-wide kernel at column offsets 0, 8, 16 and 24. */
+  int32_t col_offset;
+  for (col_offset = 0; col_offset < 32; col_offset += 8) {
+    common_hv_8ht_8vt_8w_msa(src + col_offset, src_stride, dst + col_offset,
+                             dst_stride, filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  /* A 64-wide block is filtered as eight side-by-side 8-pixel columns, each
+   * handled by the 8-wide kernel at column offsets 0, 8, ..., 56. */
+  int32_t col_offset;
+  for (col_offset = 0; col_offset < 64; col_offset += 8) {
+    common_hv_8ht_8vt_8w_msa(src + col_offset, src_stride, dst + col_offset,
+                             dst_stride, filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, mask;
+  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
+  /* 2-tap horizontal + 2-tap vertical pass producing one 4x4 block. */
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* rearranging filter: halfword 0 at each tap pointer is broadcast */
+  filt = LD_UH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  filt = LD_UH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  /* Five source rows (src0..src4) feed the four output rows. */
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+
+  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+  v16i8 res0, res1, res2, res3;
+  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
+  /* 2-tap horizontal + 2-tap vertical pass producing one 4x8 block. */
+  mask = LD_SB(&mc_filt_mask_arr[16]);
+
+  /* rearranging filter: halfword 0 at each tap pointer is broadcast */
+  filt = LD_UH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+  filt = LD_UH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  /* Nine source rows feed eight output rows: src0..src7, then src8. */
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  src += (8 * src_stride);
+  src8 = LD_SB(src);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
+  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
+  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+             hz_out3, hz_out5, 8);
+  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
+
+  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+              vec4, vec5, vec6, vec7);
+  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
+              res2, res3);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+  dst += (4 * dst_stride);  /* second store covers the lower four rows */
+  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  /* Heights other than 4 and 8 have no kernel here: nothing is written. */
+  switch (height) {
+    case 4: common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
+                                      filter_horiz, filter_vert); break;
+    case 8: common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
+                                      filter_horiz, filter_vert); break;
+  }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+  /* 2-tap horizontal + 2-tap vertical pass producing one 8x4 block. */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter: halfword 0 at each tap pointer is broadcast */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+  /* Five source rows (src0..src4) feed the four output rows. */
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+  /* hz_out0 and hz_out1 take turns holding the previous and current row. */
+  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
+
+  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
+
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          int8_t *filter_horiz,
+                                          int8_t *filter_vert,
+                                          int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+  v16u8 filt_hz, filt_vt, vec0;
+  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+  v8i16 filt;
+  /* 2-tap horiz + 2-tap vert pass for 8-wide blocks, 8 rows per loop. */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter: halfword 0 at each tap pointer is broadcast */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+  /* Row 0 is filtered ahead of the loop; its result seeds hz_out0. */
+  src0 = LD_SB(src);
+  src += src_stride;
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  /* height >> 3 passes: rows beyond a multiple of 8 are not produced. */
+  for (loop_cnt = (height >> 3); loop_cnt--;) {
+    LD_SB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    LD_SB4(src, src_stride, src1, src2, src3, src4);  /* next four rows */
+    src += (4 * src_stride);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp4 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
+    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Second half of the pass: same steps on the rows loaded above. */
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp5 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp6 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp7 = __msa_dotp_u_h(vec0, filt_vt);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp8 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
+    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  if (height != 4) {  /* only height 4 has a dedicated 8x4 kernel */
+    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert, height);
+    return;
+  }
+  common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                            filter_vert);
+}
+
+static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+  v8i16 filt;
+  /* 2-tap horiz + 2-tap vert pass for 16-wide blocks, 4 rows per loop. */
+  mask = LD_SB(&mc_filt_mask_arr[0]);
+
+  /* rearranging filter: halfword 0 at each tap pointer is broadcast */
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
+
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
+  /* Row 0: two vectors, presumably at byte offsets 0 and 8 (LD_SB2 step). */
+  LD_SB2(src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+  /* hz_out0/2 and hz_out1/3 alternate as previous-row / current-row. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+    PCKEV_ST_SB(tmp1, tmp2, dst);
+    dst += dst_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+    PCKEV_ST_SB(tmp1, tmp2, dst);
+    dst += dst_stride;
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+    PCKEV_ST_SB(tmp1, tmp2, dst);
+    dst += dst_stride;
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+    PCKEV_ST_SB(tmp1, tmp2, dst);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  /* A 32-wide block is filtered as two side-by-side 16-pixel columns, each
+   * handled by the 16-wide kernel at a column offset of 0 and 16. */
+  int32_t col_offset;
+  for (col_offset = 0; col_offset < 32; col_offset += 16) {
+    common_hv_2ht_2vt_16w_msa(src + col_offset, src_stride, dst + col_offset,
+                              dst_stride, filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  /* A 64-wide block is filtered as four side-by-side 16-pixel columns, each
+   * handled by the 16-wide kernel at column offsets 0, 16, 32 and 48. */
+  int32_t col_offset;
+  for (col_offset = 0; col_offset < 64; col_offset += 16) {
+    common_hv_2ht_2vt_16w_msa(src + col_offset, src_stride, dst + col_offset,
+                              dst_stride, filter_horiz, filter_vert, height);
+  }
+}
+
+void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
+                       uint8_t *dst, ptrdiff_t dst_stride,
+                       const int16_t *filter_x, int32_t x_step_q4,
+                       const int16_t *filter_y, int32_t y_step_q4,
+                       int32_t w, int32_t h) {
+  /* The 16-bit taps are inspected two at a time through 32-bit words:
+   * word 0 covers taps 0-1, word 1 covers taps 2-3. */
+  const int32_t *const x_words = (const int32_t *)filter_x;
+  const int32_t *const y_words = (const int32_t *)filter_y;
+  const int32_t s_stride = (int32_t)src_stride;
+  const int32_t d_stride = (int32_t)dst_stride;
+  int8_t tap, filt_hor[8], filt_ver[8];
+  int32_t x_2tap, y_2tap;
+
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  assert(x_words[1] != 0x800000);
+  assert(y_words[1] != 0x800000);
+
+  /* Narrow all eight taps of each kernel to int8_t for the MSA helpers.
+   * NOTE(review): the helpers read these via LD_SH/LD_UH; if those are
+   * full-vector loads they read past the 8-byte arrays -- confirm. */
+  for (tap = 0; tap < 8; ++tap) {
+    filt_hor[tap] = filter_x[tap];
+    filt_ver[tap] = filter_y[tap];
+  }
+
+  /* A zero word 0 (taps 0-1 both zero) selects the 2-tap kernels. */
+  x_2tap = (x_words[0] == 0);
+  y_2tap = (y_words[0] == 0);
+
+  if (x_2tap && y_2tap) {
+    /* The 2-tap kernels receive pointers starting at tap index 3. */
+    switch (w) {
+      case 4:
+        common_hv_2ht_2vt_4w_msa(src, s_stride, dst, d_stride,
+                                 &filt_hor[3], &filt_ver[3], h);
+        return;
+      case 8:
+        common_hv_2ht_2vt_8w_msa(src, s_stride, dst, d_stride,
+                                 &filt_hor[3], &filt_ver[3], h);
+        return;
+      case 16:
+        common_hv_2ht_2vt_16w_msa(src, s_stride, dst, d_stride,
+                                  &filt_hor[3], &filt_ver[3], h);
+        return;
+      case 32:
+        common_hv_2ht_2vt_32w_msa(src, s_stride, dst, d_stride,
+                                  &filt_hor[3], &filt_ver[3], h);
+        return;
+      case 64:
+        common_hv_2ht_2vt_64w_msa(src, s_stride, dst, d_stride,
+                                  &filt_hor[3], &filt_ver[3], h);
+        return;
+      default:
+        break;
+    }
+  } else if (!x_2tap && !y_2tap) {
+    /* Both directions use the full 8-tap kernels. */
+    switch (w) {
+      case 4:
+        common_hv_8ht_8vt_4w_msa(src, s_stride, dst, d_stride,
+                                 filt_hor, filt_ver, h);
+        return;
+      case 8:
+        common_hv_8ht_8vt_8w_msa(src, s_stride, dst, d_stride,
+                                 filt_hor, filt_ver, h);
+        return;
+      case 16:
+        common_hv_8ht_8vt_16w_msa(src, s_stride, dst, d_stride,
+                                  filt_hor, filt_ver, h);
+        return;
+      case 32:
+        common_hv_8ht_8vt_32w_msa(src, s_stride, dst, d_stride,
+                                  filt_hor, filt_ver, h);
+        return;
+      case 64:
+        common_hv_8ht_8vt_64w_msa(src, s_stride, dst, d_stride,
+                                  filt_hor, filt_ver, h);
+        return;
+      default:
+        break;
+    }
+  }
+
+  /* No MSA kernel applies: either the width is not 4/8/16/32/64, or only
+   * one of the two directions is 2-tap.  Use the C implementation. */
+  vpx_convolve8_c(src, src_stride, dst, dst_stride,
+                  filter_x, x_step_q4, filter_y, y_step_q4,
+                  w, h);
+}
diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c
new file mode 100644
index 0000000000..527d457199
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c
@@ -0,0 +1,710 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
+/* 8-tap vertical filter for a 4-pixel-wide block, 4 output rows per pass. */
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+  v16i8 src10998, filt0, filt1, filt2, filt3;
+  v16u8 out;
+  v8i16 filt, out10, out32;
+  /* Start 3 rows above the block; the 7 rows loaded below prime the window. */
+  src -= (3 * src_stride);
+  /* Taps are int8; presumably each filtN holds one splatted pair of taps. */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+             src4332, src6554);
+  XORI_B3_128_SB(src2110, src4332, src6554);
+  /* height >> 2 passes; a height % 4 remainder would be left unwritten. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+    XORI_B2_128_SB(src8776, src10998);
+    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                filt1, filt2, filt3);
+    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                filt1, filt2, filt3);
+    SRARI_H2_SH(out10, out32, FILTER_BITS);
+    SAT_SH2_SH(out10, out32, 7);
+    out = PCKEV_XORI128_UB(out10, out32);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Reuse this pass's newest interleaved rows as the head of the next. */
+    src2110 = src6554;
+    src4332 = src8776;
+    src6554 = src10998;
+    src6 = src10;
+  }
+}
+/* 8-tap vertical filter for an 8-pixel-wide block, 4 output rows per pass. */
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+  v16u8 tmp0, tmp1;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+  /* Start 3 rows above the block; the 7 rows loaded below prime the window. */
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* XORI_*_128 presumably flips bit 7 so unsigned pixels suit signed math. */
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  /* height >> 2 passes; a height % 4 remainder would be left unwritten. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+    tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Reuse this pass's newest interleaved rows as the head of the next. */
+    src10_r = src54_r;
+    src32_r = src76_r;
+    src54_r = src98_r;
+    src21_r = src65_r;
+    src43_r = src87_r;
+    src65_r = src109_r;
+    src6 = src10;
+  }
+}
+/* 8-tap vertical filter for a 16-pixel-wide block, 4 output rows per pass. */
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+  v16u8 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+  /* Start 3 rows above the block; the 7 rows loaded below prime the window. */
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* _r / _l vectors: presumably the right / left 8-byte halves of each row. */
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+             src54_l, src21_l);
+  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+  /* height >> 2 passes; a height % 4 remainder would be left unwritten. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+               src87_l, src98_l, src109_l);
+    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+    out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                 filt1, filt2, filt3);
+    out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                 filt1, filt2, filt3);
+    out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                 filt1, filt2, filt3);
+    out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                 filt1, filt2, filt3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+    SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+    PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
+                tmp0, tmp1, tmp2, tmp3);
+    XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Reuse this pass's newest interleaved rows as the head of the next. */
+    src10_r = src54_r;
+    src32_r = src76_r;
+    src54_r = src98_r;
+    src21_r = src65_r;
+    src43_r = src87_r;
+    src65_r = src109_r;
+    src10_l = src54_l;
+    src32_l = src76_l;
+    src54_l = src98_l;
+    src21_l = src65_l;
+    src43_l = src87_l;
+    src65_l = src109_l;
+    src6 = src10;
+  }
+}
+/* 8-tap vertical filter for widths that are multiples of 16 (strip-wise). */
+static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter, int32_t height,
+                                      int32_t width) {
+  const uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  uint32_t loop_cnt, cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+  v16u8 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+  /* Start 3 rows above the block; each strip preloads 7 rows from here. */
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  /* One outer pass per 16-column strip; width % 16 columns are not covered. */
+  for (cnt = (width >> 4); cnt--;) {
+    src_tmp = src;
+    dst_tmp = dst;
+
+    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src_tmp += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+               src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+               src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+    /* height >> 2 passes down the strip, 4 rows of src/dst per pass. */
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+      LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+      XORI_B4_128_SB(src7, src8, src9, src10);
+      src_tmp += (4 * src_stride);
+      ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                 src87_r, src98_r, src109_r);
+      ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                 src87_l, src98_l, src109_l);
+      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                   filt1, filt2, filt3);
+      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                   filt1, filt2, filt3);
+      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                   filt1, filt2, filt3);
+      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                   filt1, filt2, filt3);
+      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                   filt1, filt2, filt3);
+      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                   filt1, filt2, filt3);
+      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                   filt1, filt2, filt3);
+      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                   filt1, filt2, filt3);
+      SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+      SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+      SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+      SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+      PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                  out3_r, tmp0, tmp1, tmp2, tmp3);
+      XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+      ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
+      dst_tmp += (4 * dst_stride);
+      /* Reuse this pass's newest interleaved rows as the head of the next. */
+      src10_r = src54_r;
+      src32_r = src76_r;
+      src54_r = src98_r;
+      src21_r = src65_r;
+      src43_r = src87_r;
+      src65_r = src109_r;
+      src10_l = src54_l;
+      src32_l = src76_l;
+      src54_l = src98_l;
+      src21_l = src65_l;
+      src43_l = src87_l;
+      src65_l = src109_l;
+      src6 = src10;
+    }
+    /* Move both base pointers right by one 16-column strip. */
+    src += 16;
+    dst += 16;
+  }
+}
+/* 8-tap vertical filter, width 32: forwards to the 16-column strip kernel. */
+static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                            32);
+}
+/* 8-tap vertical filter, width 64: forwards to the 16-column strip kernel. */
+static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                            64);
+}
+/* 2-tap vertical filter for a single 4x4 block. */
+static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 src0, src1, src2, src3, src4;
+  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+  v16u8 filt0;
+  v8i16 filt;
+  v8u16 tmp0, tmp1;
+  /* Only halfword 0 of the loaded filter (its first two int8 taps) is used. */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+  /* No rewind here; LD_SB5 presumably fetches the 5 rows needed for 4 outputs. */
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+  src += (5 * src_stride);  /* src is not read again after this */
+
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+  src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+/* 2-tap vertical filter for a single 4x8 block (two 4x4 stores). */
+static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16u8 filt0;
+  v8i16 filt;
+  /* Only halfword 0 of the loaded filter (its first two int8 taps) is used. */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+  /* src0..src7 here plus src8 below: presumably 9 rows for 8 output rows. */
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  src += (8 * src_stride);
+
+  src8 = LD_SB(src);
+  src += src_stride;  /* src is not read again after this */
+
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+             src76_r, src87_r);
+  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+             src87_r, src76_r, src2110, src4332, src6554, src8776);
+  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+              tmp0, tmp1, tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+/* 2-tap vertical filter, width 4: dispatches on height (4 or 8 only). */
+static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (4 == height) {
+    common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else if (8 == height) {
+    common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+  }  /* any other height falls through without writing dst */
+}
+/* 2-tap vertical filter for a single 8x4 block. */
+static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+  v16i8 out0, out1;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+  /* 5 rows in; each output presumably comes from one adjacent row pair. */
+  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+              tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
+}
+/* 2-tap vertical filter, width 8, for heights handled 8 rows at a time. */
+static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v16i8 out0, out1;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+  /* height >> 3 passes of 8 rows; a height % 8 remainder is not written. */
+  for (loop_cnt = (height >> 3); loop_cnt--;) {
+    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+    src += (8 * src_stride);
+
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
+               vec2, vec3);
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
+               vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* Second half of the pass, built from vec4..vec7. */
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
+    dst += (4 * dst_stride);
+    /* The last row loaded becomes the first row of the next pass. */
+    src0 = src8;
+  }
+}
+/* 2-tap vertical filter, width 8: height 4 has its own kernel. */
+static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (4 == height) {
+    common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else {  /* every other height goes to the 8-rows-per-pass kernel */
+    common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+/* 2-tap vertical filter for a 16-pixel-wide block, 4 output rows per pass. */
+static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+  /* height >> 2 passes; each stores 4 rows, advancing dst one row at a time. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+    dst += dst_stride;
+
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst);
+    dst += dst_stride;
+
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+    dst += dst_stride;
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst);
+    dst += dst_stride;
+    /* The last row loaded becomes the first row of the next pass. */
+    src0 = src4;
+  }
+}
+/* 2-tap vertical filter for a 32-pixel-wide block, 4 output rows per pass. */
+static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+  /* src0 / src5 hold the first row of the left / right 16-column half. */
+  src0 = LD_UB(src);
+  src5 = LD_UB(src + 16);
+  src += src_stride;
+  /* height >> 2 passes; a height % 4 remainder would be left unwritten. */
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+    LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+    src += (4 * src_stride);
+
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
+    /* Right half: the same four rows, stored at dst + 16. */
+    ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+    ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 16);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
+
+    ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+    ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
+    dst += (4 * dst_stride);
+    /* Last row of each half becomes the first row of the next pass. */
+    src0 = src4;
+    src5 = src9;
+  }
+}
+/* 2-tap vertical filter for a 64-pixel-wide block, 2 output rows per pass. */
+static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8i16 filt;
+
+  /* rearranging filter_y */
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
+  /* Stride 16: src0/3/6/9 are presumably the four 16-byte chunks of row 0. */
+  LD_UB4(src, 16, src0, src3, src6, src9);
+  src += src_stride;
+  /* height >> 1 passes; each handles 2 rows as four 16-column strips. */
+  for (loop_cnt = (height >> 1); loop_cnt--;) {
+    LD_UB2(src, src_stride, src1, src2);
+    LD_UB2(src + 16, src_stride, src4, src5);
+    LD_UB2(src + 32, src_stride, src7, src8);
+    LD_UB2(src + 48, src_stride, src10, src11);
+    src += (2 * src_stride);
+
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+    ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+    ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    PCKEV_ST_SB(tmp4, tmp5, dst + 16);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
+
+    ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+    ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 32);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
+
+    ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+    ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    PCKEV_ST_SB(tmp4, tmp5, dst + 48);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
+    dst += (2 * dst_stride);
+    /* Last row of each strip becomes the first row of the next pass. */
+    src0 = src2;
+    src3 = src5;
+    src6 = src8;
+    src9 = src11;
+  }
+}
+/* Vertical convolve entry point: picks an MSA kernel by filter kind and w. */
+void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h) {
+  int8_t cnt, filt_ver[8]; /* NOTE(review): LD_SH may read past 8 bytes */
+  /* Needs y_step_q4 == 16; 2nd assert presumably excludes a copy filter. */
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+  /* Narrow the eight int16 taps to int8 for the MSA kernels. */
+  for (cnt = 8; cnt--;) {
+    filt_ver[cnt] = filter_y[cnt];
+  }
+  /* Taps 0 and 1 both zero: use the 2-tap kernels, starting at tap 3. */
+  if (((const int32_t *)filter_y)[0] == 0) {
+    switch (w) {
+      case 4:
+        common_vt_2t_4w_msa(src, (int32_t)src_stride,
+                            dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 8:
+        common_vt_2t_8w_msa(src, (int32_t)src_stride,
+                            dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 16:
+        common_vt_2t_16w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      case 32:
+        common_vt_2t_32w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      case 64:
+        common_vt_2t_64w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      default: /* any other width: plain C implementation */
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4,
+                             w, h);
+        break;
+    }
+  } else { /* otherwise the full 8-tap kernels get all of filt_ver */
+    switch (w) {
+      case 4:
+        common_vt_8t_4w_msa(src, (int32_t)src_stride,
+                            dst, (int32_t)dst_stride,
+                            filt_ver, h);
+        break;
+      case 8:
+        common_vt_8t_8w_msa(src, (int32_t)src_stride,
+                            dst, (int32_t)dst_stride,
+                            filt_ver, h);
+        break;
+      case 16:
+        common_vt_8t_16w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      case 32:
+        common_vt_8t_32w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      case 64:
+        common_vt_8t_64w_msa(src, (int32_t)src_stride,
+                             dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      default: /* any other width: plain C implementation */
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4,
+                             w, h);
+        break;
+    }
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c
new file mode 100644
index 0000000000..4c3d978031
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c
@@ -0,0 +1,232 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/mips/macros_msa.h"
+
+/* Averages a 4-pixel-wide block of src into dst in place (AVER_UB*_UB).
+ * Rows go four at a time when height % 4 == 0, otherwise two at a time
+ * when height is even; for an odd height nothing is written. */
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t groups;
+  uint32_t w0, w1, w2, w3;
+  v16u8 s0, s1, s2, s3, d0, d1, d2, d3;
+
+  if (0 == (height % 4)) {
+    groups = height / 4;
+    while (groups--) {
+      LD_UB4(src, src_stride, s0, s1, s2, s3);
+      LD_UB4(dst, dst_stride, d0, d1, d2, d3);
+      src += (4 * src_stride);
+
+      AVER_UB4_UB(s0, d0, s1, d1, s2, d2, s3, d3, d0, d1, d2, d3);
+      /* Only word element 0 of each averaged vector is written back. */
+      w0 = __msa_copy_u_w((v4i32)d0, 0);
+      w1 = __msa_copy_u_w((v4i32)d1, 0);
+      w2 = __msa_copy_u_w((v4i32)d2, 0);
+      w3 = __msa_copy_u_w((v4i32)d3, 0);
+      SW4(w0, w1, w2, w3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == (height % 2)) {
+    groups = height / 2;
+    while (groups--) {
+      LD_UB2(src, src_stride, s0, s1);
+      LD_UB2(dst, dst_stride, d0, d1);
+      src += (2 * src_stride);
+
+      AVER_UB2_UB(s0, d0, s1, d1, d0, d1);
+      w0 = __msa_copy_u_w((v4i32)d0, 0);
+      w1 = __msa_copy_u_w((v4i32)d1, 0);
+      SW(w0, dst);
+      dst += dst_stride;
+      SW(w1, dst);
+      dst += dst_stride;
+    }
+  }
+}
+
+/* Averages an 8-pixel-wide block of src into dst, four rows per pass
+ * (height / 4 passes; any remaining rows are not processed). */
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+                           uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t passes = height / 4;
+  uint64_t q0, q1, q2, q3;
+  v16u8 s0, s1, s2, s3, d0, d1, d2, d3;
+
+  while (passes--) {
+    LD_UB4(src, src_stride, s0, s1, s2, s3);
+    LD_UB4(dst, dst_stride, d0, d1, d2, d3);
+    src += (4 * src_stride);
+    AVER_UB4_UB(s0, d0, s1, d1, s2, d2, s3, d3, d0, d1, d2, d3);
+
+    /* Only doubleword element 0 of each averaged vector is written back. */
+    q0 = __msa_copy_u_d((v2i64)d0, 0);
+    q1 = __msa_copy_u_d((v2i64)d1, 0);
+    q2 = __msa_copy_u_d((v2i64)d2, 0);
+    q3 = __msa_copy_u_d((v2i64)d3, 0);
+    SD4(q0, q1, q2, q3, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+/* Averages a 16-pixel-wide block of src into dst, eight rows per pass
+ * (height / 8 passes; any remaining rows are not processed). */
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t passes = height / 8;
+  v16u8 s0, s1, s2, s3, s4, s5, s6, s7;
+  v16u8 d0, d1, d2, d3, d4, d5, d6, d7;
+
+  while (passes--) {
+    LD_UB8(src, src_stride, s0, s1, s2, s3, s4, s5, s6, s7);
+    LD_UB8(dst, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+    src += (8 * src_stride);
+
+    AVER_UB4_UB(s0, d0, s1, d1, s2, d2, s3, d3, d0, d1, d2, d3);
+    AVER_UB4_UB(s4, d4, s5, d5, s6, d6, s7, d7, d4, d5, d6, d7);
+    ST_UB8(d0, d1, d2, d3, d4, d5, d6, d7, dst, dst_stride);
+    dst += (8 * dst_stride);
+  }
+}
+
+/* Averages a 32-pixel-wide block of src into dst, eight rows per pass.  All
+ * loads for a pass are issued before any of its stores. */
+static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t passes = height / 8;
+  uint8_t *dst_rd = dst;  /* read cursor; dst itself is the write cursor */
+  v16u8 s0, s1, s2, s3, s4, s5, s6, s7;
+  v16u8 s8, s9, s10, s11, s12, s13, s14, s15;
+  v16u8 d0, d1, d2, d3, d4, d5, d6, d7;
+  v16u8 d8, d9, d10, d11, d12, d13, d14, d15;
+
+  while (passes--) {
+    /* Rows 0-3: even-numbered vectors take bytes 0-15, odd ones 16-31. */
+    LD_UB4(src, src_stride, s0, s2, s4, s6);
+    LD_UB4(src + 16, src_stride, s1, s3, s5, s7);
+    src += (4 * src_stride);
+    LD_UB4(dst_rd, dst_stride, d0, d2, d4, d6);
+    LD_UB4(dst_rd + 16, dst_stride, d1, d3, d5, d7);
+    dst_rd += (4 * dst_stride);
+    /* Rows 4-7, same layout. */
+    LD_UB4(src, src_stride, s8, s10, s12, s14);
+    LD_UB4(src + 16, src_stride, s9, s11, s13, s15);
+    src += (4 * src_stride);
+    LD_UB4(dst_rd, dst_stride, d8, d10, d12, d14);
+    LD_UB4(dst_rd + 16, dst_stride, d9, d11, d13, d15);
+    dst_rd += (4 * dst_stride);
+
+    AVER_UB4_UB(s0, d0, s1, d1, s2, d2, s3, d3, d0, d1, d2, d3);
+    AVER_UB4_UB(s4, d4, s5, d5, s6, d6, s7, d7, d4, d5, d6, d7);
+    AVER_UB4_UB(s8, d8, s9, d9, s10, d10, s11, d11, d8, d9, d10, d11);
+    AVER_UB4_UB(s12, d12, s13, d13, s14, d14, s15, d15, d12, d13, d14, d15);
+
+    ST_UB4(d0, d2, d4, d6, dst, dst_stride);
+    ST_UB4(d1, d3, d5, d7, dst + 16, dst_stride);
+    dst += (4 * dst_stride);
+    ST_UB4(d8, d10, d12, d14, dst, dst_stride);
+    ST_UB4(d9, d11, d13, d15, dst + 16, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+/* Averages a 64-pixel-wide block of src into dst, four rows per pass.  All
+ * loads for a pass are issued before any of its stores. */
+static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t passes = height / 4;
+  uint8_t *dst_rd = dst;  /* read cursor; dst itself is the write cursor */
+  v16u8 s0, s1, s2, s3, s4, s5, s6, s7;
+  v16u8 s8, s9, s10, s11, s12, s13, s14, s15;
+  v16u8 d0, d1, d2, d3, d4, d5, d6, d7;
+  v16u8 d8, d9, d10, d11, d12, d13, d14, d15;
+
+  while (passes--) {
+    /* A stride of 16 walks the four consecutive vectors of one row. */
+    LD_UB4(src, 16, s0, s1, s2, s3);
+    src += src_stride;
+    LD_UB4(src, 16, s4, s5, s6, s7);
+    src += src_stride;
+    LD_UB4(src, 16, s8, s9, s10, s11);
+    src += src_stride;
+    LD_UB4(src, 16, s12, s13, s14, s15);
+    src += src_stride;
+
+    LD_UB4(dst_rd, 16, d0, d1, d2, d3);
+    dst_rd += dst_stride;
+    LD_UB4(dst_rd, 16, d4, d5, d6, d7);
+    dst_rd += dst_stride;
+    LD_UB4(dst_rd, 16, d8, d9, d10, d11);
+    dst_rd += dst_stride;
+    LD_UB4(dst_rd, 16, d12, d13, d14, d15);
+    dst_rd += dst_stride;
+
+    AVER_UB4_UB(s0, d0, s1, d1, s2, d2, s3, d3, d0, d1, d2, d3);
+    AVER_UB4_UB(s4, d4, s5, d5, s6, d6, s7, d7, d4, d5, d6, d7);
+    AVER_UB4_UB(s8, d8, s9, d9, s10, d10, s11, d11, d8, d9, d10, d11);
+    AVER_UB4_UB(s12, d12, s13, d13, s14, d14, s15, d15, d12, d13, d14, d15);
+
+    /* Store the four averaged rows through the write cursor. */
+    ST_UB4(d0, d1, d2, d3, dst, 16);
+    dst += dst_stride;
+    ST_UB4(d4, d5, d6, d7, dst, 16);
+    dst += dst_stride;
+    ST_UB4(d8, d9, d10, d11, dst, 16);
+    dst += dst_stride;
+    ST_UB4(d12, d13, d14, d15, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+/* Averages the w x h block at src into dst.  Widths 4, 8, 16, 32 and 64 are
+ * dispatched to the matching avg_width*_msa() routine; any other width uses
+ * a scalar loop computing (dst + src + 1) >> 1 per pixel.  The filter
+ * arguments are ignored. */
+void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const int16_t *filter_x, int32_t filter_x_stride,
+                          const int16_t *filter_y, int32_t filter_y_stride,
+                          int32_t w, int32_t h) {
+  int32_t row, col;
+
+  (void)filter_x;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_y_stride;
+
+  switch (w) {
+    case 4:
+      avg_width4_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    case 8:
+      avg_width8_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    case 16:
+      avg_width16_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    case 32:
+      avg_width32_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    case 64:
+      avg_width64_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    default:
+      /* No MSA routine for this width. */
+      for (row = h; row--;) {
+        for (col = 0; col < w; ++col) {
+          dst[col] = (((dst[col] + src[col]) + 1) >> 1);
+        }
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c
new file mode 100644
index 0000000000..ba4012281e
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c
@@ -0,0 +1,247 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include "vpx_dsp/mips/macros_msa.h"
+
+/* Copies an 8-pixel-wide block using one 64-bit store per row.  Rows are
+ * batched by the first of 12, 8, 4 or 2 (tested in that order) that divides
+ * height; for an odd height nothing is copied. */
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t n;
+  uint64_t q0, q1, q2, q3, q4, q5, q6, q7;
+  v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
+
+  if (0 == height % 12) {
+    for (n = (height / 12); n--;) {
+      /* First eight rows of the batch. */
+      LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+      src += (8 * src_stride);
+      q0 = __msa_copy_u_d((v2i64)r0, 0);
+      q1 = __msa_copy_u_d((v2i64)r1, 0);
+      q2 = __msa_copy_u_d((v2i64)r2, 0);
+      q3 = __msa_copy_u_d((v2i64)r3, 0);
+      q4 = __msa_copy_u_d((v2i64)r4, 0);
+      q5 = __msa_copy_u_d((v2i64)r5, 0);
+      q6 = __msa_copy_u_d((v2i64)r6, 0);
+      q7 = __msa_copy_u_d((v2i64)r7, 0);
+
+      SD4(q0, q1, q2, q3, dst, dst_stride);
+      dst += (4 * dst_stride);
+      SD4(q4, q5, q6, q7, dst, dst_stride);
+      dst += (4 * dst_stride);
+      /* Remaining four rows. */
+      LD_UB4(src, src_stride, r0, r1, r2, r3);
+      src += (4 * src_stride);
+      q0 = __msa_copy_u_d((v2i64)r0, 0);
+      q1 = __msa_copy_u_d((v2i64)r1, 0);
+      q2 = __msa_copy_u_d((v2i64)r2, 0);
+      q3 = __msa_copy_u_d((v2i64)r3, 0);
+      SD4(q0, q1, q2, q3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 8) {
+    for (n = (height / 8); n--;) {
+      LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+      src += (8 * src_stride);
+
+      q0 = __msa_copy_u_d((v2i64)r0, 0);
+      q1 = __msa_copy_u_d((v2i64)r1, 0);
+      q2 = __msa_copy_u_d((v2i64)r2, 0);
+      q3 = __msa_copy_u_d((v2i64)r3, 0);
+      q4 = __msa_copy_u_d((v2i64)r4, 0);
+      q5 = __msa_copy_u_d((v2i64)r5, 0);
+      q6 = __msa_copy_u_d((v2i64)r6, 0);
+      q7 = __msa_copy_u_d((v2i64)r7, 0);
+
+      SD4(q0, q1, q2, q3, dst, dst_stride);
+      dst += (4 * dst_stride);
+      SD4(q4, q5, q6, q7, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 4) {
+    for (n = (height / 4); n--;) {
+      LD_UB4(src, src_stride, r0, r1, r2, r3);
+      src += (4 * src_stride);
+      q0 = __msa_copy_u_d((v2i64)r0, 0);
+      q1 = __msa_copy_u_d((v2i64)r1, 0);
+      q2 = __msa_copy_u_d((v2i64)r2, 0);
+      q3 = __msa_copy_u_d((v2i64)r3, 0);
+      SD4(q0, q1, q2, q3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 2) {
+    for (n = (height / 2); n--;) {
+      LD_UB2(src, src_stride, r0, r1);
+      src += (2 * src_stride);
+      q0 = __msa_copy_u_d((v2i64)r0, 0);
+      q1 = __msa_copy_u_d((v2i64)r1, 0);
+      SD(q0, dst);
+      dst += dst_stride;
+      SD(q1, dst);
+      dst += dst_stride;
+    }
+  }
+}
+
+/* Copies (width >> 4) column strips of 16 pixels each, every strip in
+ * (height >> 3) passes of eight rows; leftover columns/rows are skipped. */
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width) {
+  int32_t strips, passes;
+  const uint8_t *src_col;
+  uint8_t *dst_col;
+  v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
+
+  for (strips = (width >> 4); strips--;) {
+    src_col = src;
+    dst_col = dst;
+
+    for (passes = (height >> 3); passes--;) {
+      LD_UB8(src_col, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+      ST_UB8(r0, r1, r2, r3, r4, r5, r6, r7, dst_col, dst_stride);
+      src_col += (8 * src_stride);
+      dst_col += (8 * dst_stride);
+    }
+
+    /* Step right to the next 16-pixel strip. */
+    src += 16;
+    dst += 16;
+  }
+}
+
+/* Copies a 16-pixel-wide block: 8 + 4 row batches if 12 divides height, else
+ * copy_16multx8mult_msa() if 8 does, else 4-row batches if 4 does. */
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t n;
+  v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
+
+  if (0 == height % 12) {
+    for (n = (height / 12); n--;) {
+      LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+      src += (8 * src_stride);
+      ST_UB8(r0, r1, r2, r3, r4, r5, r6, r7, dst, dst_stride);
+      dst += (8 * dst_stride);
+      LD_UB4(src, src_stride, r0, r1, r2, r3);
+      src += (4 * src_stride);
+      ST_UB4(r0, r1, r2, r3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 8) {
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+  } else if (0 == height % 4) {
+    for (n = (height / 4); n--;) {
+      LD_UB4(src, src_stride, r0, r1, r2, r3);
+      src += (4 * src_stride);
+      ST_UB4(r0, r1, r2, r3, dst, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  }
+}
+
+/* Copies a 32-pixel-wide block as two 16-byte halves per row: 4-row groups if
+ * 12 divides height, else copy_16multx8mult_msa() if 8 does, else 4-row groups. */
+static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t n;
+  v16u8 lo0, lo1, lo2, lo3, hi0, hi1, hi2, hi3;
+
+  if (0 == height % 12) {
+    for (n = (height / 12); n--;) {
+      LD_UB4(src, src_stride, lo0, lo1, lo2, lo3);
+      LD_UB4(src + 16, src_stride, hi0, hi1, hi2, hi3);
+      src += (4 * src_stride);
+      ST_UB4(lo0, lo1, lo2, lo3, dst, dst_stride);
+      ST_UB4(hi0, hi1, hi2, hi3, dst + 16, dst_stride);
+      dst += (4 * dst_stride);
+      LD_UB4(src, src_stride, lo0, lo1, lo2, lo3);
+      LD_UB4(src + 16, src_stride, hi0, hi1, hi2, hi3);
+      src += (4 * src_stride);
+      ST_UB4(lo0, lo1, lo2, lo3, dst, dst_stride);
+      ST_UB4(hi0, hi1, hi2, hi3, dst + 16, dst_stride);
+      dst += (4 * dst_stride);
+      LD_UB4(src, src_stride, lo0, lo1, lo2, lo3);
+      LD_UB4(src + 16, src_stride, hi0, hi1, hi2, hi3);
+      src += (4 * src_stride);
+      ST_UB4(lo0, lo1, lo2, lo3, dst, dst_stride);
+      ST_UB4(hi0, hi1, hi2, hi3, dst + 16, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  } else if (0 == height % 8) {
+    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+  } else if (0 == height % 4) {
+    for (n = (height / 4); n--;) {
+      LD_UB4(src, src_stride, lo0, lo1, lo2, lo3);
+      LD_UB4(src + 16, src_stride, hi0, hi1, hi2, hi3);
+      src += (4 * src_stride);
+      ST_UB4(lo0, lo1, lo2, lo3, dst, dst_stride);
+      ST_UB4(hi0, hi1, hi2, hi3, dst + 16, dst_stride);
+      dst += (4 * dst_stride);
+    }
+  }
+}
+
+static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}  /* width 64: four 16-pixel strips inside copy_16multx8mult_msa() */
+
+/* Copies the w x h block at src to dst.  Width 4 uses one 32-bit load/store
+ * per row, widths 8, 16, 32 and 64 go to the matching copy_width*_msa()
+ * routine, and any other width is copied with memcpy() row by row.  The
+ * filter arguments are ignored. */
+void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int32_t filter_x_stride,
+                           const int16_t *filter_y, int32_t filter_y_stride,
+                           int32_t w, int32_t h) {
+  uint32_t rows;
+  uint32_t word;
+
+  (void)filter_x;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_y_stride;
+
+  switch (w) {
+    case 4:
+      /* One 32-bit word per row. */
+      for (rows = h; rows--;) {
+        word = LW(src);
+        SW(word, dst);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    case 8:
+      copy_width8_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    case 16:
+      copy_width16_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    case 32:
+      copy_width32_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    case 64:
+      copy_width64_msa(src, src_stride, dst, dst_stride, h);
+      break;
+    default:
+      /* No dedicated routine for this width: copy w bytes per row. */
+      for (rows = h; rows--;) {
+        memcpy(dst, src, w);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
new file mode 100644
index 0000000000..e0013983ae
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
@@ -0,0 +1,119 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
+#define VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
+
+#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/vpx_filter.h"
+
+extern const uint8_t mc_filt_mask_arr[16 * 3];
+
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,         \
+                            filt0, filt1, filt2, filt3) ({  \
+  v8i16 tmp0, tmp1;                                         \
+  /* Two dotp/dpadd pairs joined by a saturating add. */    \
+  tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);         \
+  tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1);  \
+  tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2);         \
+  tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3);  \
+  tmp0 = __msa_adds_s_h(tmp0, tmp1);                        \
+                                                            \
+  tmp0;                                                     \
+})
+
+#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,        \
+                        filt_h0, filt_h1, filt_h2, filt_h3) ({         \
+  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
+  v8i16 hz_out_m;                                                      \
+  /* Shuffle, 8-tap filter, round by FILTER_BITS, saturate. */         \
+  VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                   \
+             vec0_m, vec1_m, vec2_m, vec3_m);                          \
+  hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,       \
+                                 filt_h0, filt_h1, filt_h2, filt_h3);  \
+                                                                       \
+  hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS);                     \
+  hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
+                                                                       \
+  hz_out_m;                                                            \
+})
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,               \
+                                   mask0, mask1, mask2, mask3,           \
+                                   filt0, filt1, filt2, filt3,           \
+                                   out0, out1) {                         \
+  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
+  v8i16 res0_m, res1_m, res2_m, res3_m;                                  \
+  /* dotp/dpadd per mask+filter pair; halves merged by ADDS_SH2_SH. */   \
+  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
+  DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);             \
+  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
+  DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);            \
+  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
+  DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);             \
+  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);      \
+  DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);            \
+  ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);               \
+}
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
+                                   mask0, mask1, mask2, mask3,              \
+                                   filt0, filt1, filt2, filt3,              \
+                                   out0, out1, out2, out3) {                \
+  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+  v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;     \
+  /* Unlike the 4-wide variant, each srcN is shuffled with itself. */       \
+  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+              res0_m, res1_m, res2_m, res3_m);                              \
+  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \
+  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \
+  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,   \
+              res4_m, res5_m, res6_m, res7_m);                              \
+  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \
+  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \
+  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \
+               res0_m, res1_m, res2_m, res3_m);                             \
+  VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \
+  VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \
+  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \
+               res4_m, res5_m, res6_m, res7_m);                             \
+  ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,       \
+              res7_m, out0, out1, out2, out3);                              \
+}
+
+#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) {  \
+  v16u8 tmp_m;                                          \
+  /* Packed result averaged with dst, then stored. */   \
+  tmp_m = PCKEV_XORI128_UB(in1, in0);                   \
+  tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst);            \
+  ST_UB(tmp_m, (pdst));                                 \
+}
+
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) {           \
+  v16u8 tmp_m;                                           \
+  /* Pack even bytes, average with dst, store. */        \
+  tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1);  \
+  tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst);             \
+  ST_UB(tmp_m, (pdst));                                  \
+}
+
+#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
+                           pdst, stride) {                              \
+  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);                                  \
+  /* Pack in1..in4, average with packed dst0..dst3, ST8x4_UB store. */  \
+  PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                      \
+  PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                  \
+  AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);          \
+  ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                             \
+}
+#endif  /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */
diff --git a/libs/libvpx/vpx_dsp/prob.c b/libs/libvpx/vpx_dsp/prob.c
new file mode 100644
index 0000000000..639d24dd2f
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/prob.c
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./prob.h"
+
+const uint8_t vpx_norm[256] = {  /* [i] = 7 - floor(log2(i)), [0] = 0 */
+  0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Merges probabilities below node i; returns the subtree's total count. */
+static unsigned int tree_merge_probs_impl(unsigned int i,
+                                          const vpx_tree_index *tree,
+                                          const vpx_prob *pre_probs,
+                                          const unsigned int *counts,
+                                          vpx_prob *probs) {
+  unsigned int ct[2];
+  int side;  /* 0 = left branch (tree[i]), 1 = right branch (tree[i + 1]) */
+  for (side = 0; side < 2; ++side) {
+    const int child = tree[i + side];
+    ct[side] = (child <= 0)
+        ? counts[-child]
+        : tree_merge_probs_impl(child, tree, pre_probs, counts, probs);
+  }
+  probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
+  return ct[0] + ct[1];
+}
+
+void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs,
+                          const unsigned int *counts, vpx_prob *probs) {
+  tree_merge_probs_impl(0, tree, pre_probs, counts, probs);  /* root = 0 */
+}
diff --git a/libs/libvpx/vpx_dsp/prob.h b/libs/libvpx/vpx_dsp/prob.h
new file mode 100644
index 0000000000..c3cb103ffb
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/prob.h
@@ -0,0 +1,103 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_PROB_H_
+#define VPX_DSP_PROB_H_
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_common.h"
+
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint8_t vpx_prob;
+
+#define MAX_PROB 255
+
+#define vpx_prob_half ((vpx_prob) 128)
+
+typedef int8_t vpx_tree_index;
+
+#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
+
+#define vpx_complement(x) (255 - (x))  /* (x) guards compound arguments */
+
+#define MODE_MV_COUNT_SAT 20
+
+/* We build coding trees compactly in arrays.
+   Each node of the tree is a pair of vpx_tree_indices.
+   Array index often references a corresponding probability table.
+   Index <= 0 means done encoding/decoding and value = -Index,
+   Index > 0 means need another bit, specification at index.
+   Nonnegative indices are always even;  processing begins at node 0. */
+
+typedef const vpx_tree_index vpx_tree[];
+
+static INLINE vpx_prob clip_prob(int p) {
+  return (p < 1) ? 1 : (p > 255) ? 255 : p;  /* clamp p to [1, 255] */
+}
+
+static INLINE vpx_prob get_prob(int num, int den) {
+  return den ? clip_prob(((int64_t)num * 256 + (den >> 1)) / den) : 128u;
+}  /* 256 * num / den rounded, clamped by clip_prob(); 128 when den == 0 */
+
+static INLINE vpx_prob get_binary_prob(int n0, int n1) {
+  return get_prob(n0, n0 + n1);  /* n0's share of n0 + n1, via get_prob() */
+}
+
+/* Blends prob1 and prob2 (both assumed in [1, 255]) with weight factor/256. */
+static INLINE vpx_prob weighted_prob(int prob1, int prob2, int factor) {
+  return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
+}
+
+/* Moves pre_prob toward the ct[0]:ct[1] estimate; more counts, more weight. */
+static INLINE vpx_prob merge_probs(vpx_prob pre_prob, const unsigned int ct[2],
+                                   unsigned int count_sat,
+                                   unsigned int max_update_factor) {
+  const vpx_prob observed = get_binary_prob(ct[0], ct[1]);
+  const unsigned int capped = VPXMIN(ct[0] + ct[1], count_sat);
+  const unsigned int weight = max_update_factor * capped / count_sat;
+  return weighted_prob(pre_prob, observed, weight);
+}
+
+// [n] = floor(MODE_MV_MAX_UPDATE_FACTOR (128) * n / MODE_MV_COUNT_SAT)
+static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
+  0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64,
+  70, 76, 83, 89, 96, 102, 108, 115, 121, 128
+};
+
+/* merge_probs() variant using the count_to_update_factor[] table for the
+ * weight; returns pre_prob unchanged when both counts are zero. */
+static INLINE vpx_prob mode_mv_merge_probs(vpx_prob pre_prob,
+                                           const unsigned int ct[2]) {
+  const unsigned int total = ct[0] + ct[1];
+  unsigned int weight;
+  vpx_prob observed;
+  if (total == 0) return pre_prob;
+
+  weight = count_to_update_factor[VPXMIN(total, MODE_MV_COUNT_SAT)];
+  observed = clip_prob(((int64_t)(ct[0]) * 256 + (total >> 1)) / total);
+  return weighted_prob(pre_prob, observed, weight);
+}
+
+void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs,
+                          const unsigned int *counts, vpx_prob *probs);
+
+
+DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_PROB_H_
diff --git a/libs/libvpx/vpx_dsp/psnrhvs.c b/libs/libvpx/vpx_dsp/psnrhvs.c
new file mode 100644
index 0000000000..3001705791
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/psnrhvs.c
@@ -0,0 +1,227 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This code was originally written by: Gregory Maxwell, at the Daala
+ *  project.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_ports/system_state.h"
+
+#if !defined(M_PI)
+# define M_PI (3.141592653589793238462643)
+#endif
+#include <string.h>
+
+static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
+                           int xstride) {
+  (void) xstride;  /* unused: only ystride is forwarded below */
+  vpx_fdct8x8(x, y, ystride);  /* input block x, coefficients written to y */
+}
+
+/* Normalized inverse quantization matrix for 8x8 DCT at the point of
+ * transparency. This is not the JPEG-based matrix from the paper;
+ * this one gives a slightly higher MOS agreement. */
+static const float csf_y[8][8] = {
+    {1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334,
+     0.678296995242, 0.466224900598, 0.3265091542},
+    {2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963,
+     0.868920337363, 0.61280991668, 0.436405793551},
+    {2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257,
+     0.670882927016, 0.501731932449, 0.372504254596},
+    {1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575,
+     0.605636379554, 0.48309405692, 0.380429446972, 0.295774038565},
+    {1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554,
+     0.448996256676, 0.352889268808, 0.283006984131, 0.226951348204},
+    {0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692,
+     0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321},
+    {0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972,
+     0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001},
+    {0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565,
+     0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276}};
+static const float csf_cb420[8][8] = {
+    {1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
+     0.898018824055, 0.74725392039, 0.615105596242},
+    {2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
+     1.17428548929, 0.996404342439, 0.830890433625},
+    {1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362,
+     0.960060382087, 0.849823426169, 0.731221236837},
+    {1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629,
+     0.801821139099, 0.751437590932, 0.685398513368, 0.608694761374},
+    {1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099,
+     0.676555426187, 0.605503172737, 0.55002013668, 0.495804539034},
+    {0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932,
+     0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965},
+    {0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368,
+     0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733},
+    {0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374,
+     0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237}};
+static const float csf_cr420[8][8] = {
+    {2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
+     0.867069376285, 0.721500455585, 0.593906509971},
+    {2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
+     1.13381474809, 0.962064122248, 0.802254508198},
+    {1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848,
+     0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706},
+    {1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195,
+     0.725539939514, 0.661776842059, 0.587716619023},
+    {1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195,
+     0.653238524286, 0.584635025748, 0.531064164893, 0.478717061273},
+    {0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514,
+     0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543},
+    {0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059,
+     0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063},
+    {0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023,
+     0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658}};
+
+static double convert_score_db(double _score, double _weight) {
+  return 10 * (log10(255 * 255) - log10(_weight * _score));  /* dB vs 255^2 */
+}
+
+/* Mean of squared, CSF-weighted and contrast-masked 8x8 DCT differences
+ * between planes _src and _dst; returns 0 when no 8x8 block fits. */
+static double calc_psnrhvs(const unsigned char *_src, int _systride,
+                           const unsigned char *_dst, int _dystride,
+                           double _par, int _w, int _h, int _step,
+                           const float _csf[8][8]) {
+  float ret;
+  int16_t dct_s[8 * 8], dct_d[8 * 8];
+  tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8];
+  float mask[8][8];
+  int pixels;  /* number of coefficients accumulated into ret */
+  int x, y;
+  (void) _par;  /* accepted for the caller's signature but not used */
+  ret = pixels = 0;
+  /*In the PSNR-HVS-M paper[1] the authors describe the construction of
+   their masking table as "we have used the quantization table for the
+   color component Y of JPEG [6] that has been also obtained on the
+   basis of CSF. Note that the values in quantization table JPEG have
+   been normalized and then squared." Their CSF matrix (from PSNR-HVS)
+   was also constructed from the JPEG matrices. I can not find any obvious
+   scheme of normalizing to produce their table, but if I multiply their
+   CSF by 0.38857 and square the result I get their masking table.
+   I have no idea where this constant comes from, but deviating from it
+   too greatly hurts MOS agreement.
+
+   [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli,
+   Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking
+   of DCT basis functions", CD-ROM Proceedings of the Third
+   International Workshop on Video Processing and Quality Metrics for Consumer
+   Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/
+  for (x = 0; x < 8; x++)  /* masking table = (0.38857... * csf)^2 */
+    for (y = 0; y < 8; y++)
+      mask[x][y] = (_csf[x][y] * 0.3885746225901003)
+          * (_csf[x][y] * 0.3885746225901003);
+  for (y = 0; y < _h - 7; y += _step) {  /* top-left corners of 8x8 blocks */
+    for (x = 0; x < _w - 7; x += _step) {
+      int i, j;
+      float s_means[4];  /* per-4x4-quadrant statistics */
+      float d_means[4];
+      float s_vars[4];
+      float d_vars[4];
+      float s_gmean = 0;  /* whole-block statistics */
+      float d_gmean = 0;
+      float s_gvar = 0;
+      float d_gvar = 0;
+      float s_mask = 0;
+      float d_mask = 0;
+      for (i = 0; i < 4; i++)
+        s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0;
+      for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+          int sub = ((i & 12) >> 2) + ((j & 12) >> 1);  /* quadrant 0..3 */
+          dct_s[i * 8 + j] = _src[(y + i) * _systride + (j + x)];
+          dct_d[i * 8 + j] = _dst[(y + i) * _dystride + (j + x)];
+          s_gmean += dct_s[i * 8 + j];
+          d_gmean += dct_d[i * 8 + j];
+          s_means[sub] += dct_s[i * 8 + j];
+          d_means[sub] += dct_d[i * 8 + j];
+        }
+      }
+      s_gmean /= 64.f;  /* 64 samples per block */
+      d_gmean /= 64.f;
+      for (i = 0; i < 4; i++)
+        s_means[i] /= 16.f;  /* 16 samples per quadrant */
+      for (i = 0; i < 4; i++)
+        d_means[i] /= 16.f;
+      for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+          int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
+          s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean);
+          d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean);
+          s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub])
+              * (dct_s[i * 8 + j] - s_means[sub]);
+          d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub])
+              * (dct_d[i * 8 + j] - d_means[sub]);
+        }
+      }
+      s_gvar *= 1 / 63.f * 64;  /* i.e. scaled by 64/63 */
+      d_gvar *= 1 / 63.f * 64;
+      for (i = 0; i < 4; i++)
+        s_vars[i] *= 1 / 15.f * 16;  /* i.e. scaled by 16/15 */
+      for (i = 0; i < 4; i++)
+        d_vars[i] *= 1 / 15.f * 16;
+      if (s_gvar > 0)  /* quadrant-to-block variance ratio */
+        s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
+      if (d_gvar > 0)
+        d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar;
+      od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+      od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+      for (i = 0; i < 8; i++)
+        for (j = (i == 0); j < 8; j++)  /* skips the DC term at [0][0] */
+          s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
+      for (i = 0; i < 8; i++)
+        for (j = (i == 0); j < 8; j++)
+          d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j];
+      s_mask = sqrt(s_mask * s_gvar) / 32.f;
+      d_mask = sqrt(d_mask * d_gvar) / 32.f;
+      if (d_mask > s_mask)  /* keep the larger of the two masking levels */
+        s_mask = d_mask;
+      for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+          float err;
+          err = fabs((float)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]));
+          if (i != 0 || j != 0)  /* AC only: subtract the masking threshold */
+            err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j];
+          ret += (err * _csf[i][j]) * (err * _csf[i][j]);
+          pixels++;
+        }
+      }
+    }
+  }
+  if (pixels > 0) ret /= pixels;  /* no block fit: stay 0 rather than 0/0 */
+  return ret;
+}
+double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *dest, double *y_psnrhvs,
+                   double *u_psnrhvs, double *v_psnrhvs) {
+  double psnrhvs;
+  const double par = 1.0;  /* forwarded to calc_psnrhvs(), which ignores it */
+  const int step = 7;  /* 8x8 blocks start every 7 samples */
+  vpx_clear_system_state();
+  *y_psnrhvs = calc_psnrhvs(source->y_buffer, source->y_stride, dest->y_buffer,
+                            dest->y_stride, par, source->y_crop_width,
+                            source->y_crop_height, step, csf_y);
+  /* Both chroma planes use the shared uv stride and uv crop size. */
+  *u_psnrhvs = calc_psnrhvs(source->u_buffer, source->uv_stride, dest->u_buffer,
+                            dest->uv_stride, par, source->uv_crop_width,
+                            source->uv_crop_height, step, csf_cb420);
+
+  *v_psnrhvs = calc_psnrhvs(source->v_buffer, source->uv_stride, dest->v_buffer,
+                            dest->uv_stride, par, source->uv_crop_width,
+                            source->uv_crop_height, step, csf_cr420);
+  psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
+  /* Weighted sum above: 0.8 luma, 0.1 per chroma plane; returned in dB. */
+  return convert_score_db(psnrhvs, 1.0);
+}
diff --git a/libs/libvpx/vpx_dsp/quantize.c b/libs/libvpx/vpx_dsp/quantize.c
new file mode 100644
index 0000000000..e4e741a908
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/quantize.c
@@ -0,0 +1,337 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vpx_quantize_dc(const tran_low_t *coeff_ptr,
+                     int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  // Only the coefficient at index 0 is quantized; the outputs are zeroed first.
+  const int dc = coeff_ptr[0];
+  const int sign = dc >> 31;  // sign mask, assuming 32-bit int
+  const int magnitude = (dc ^ sign) - sign;
+  int eob = 0;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int rounded = clamp(magnitude + round_ptr[0], INT16_MIN, INT16_MAX);
+    const int level = (rounded * quant) >> 16;
+    qcoeff_ptr[0] = (level ^ sign) - sign;  // re-apply the sign
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
+    if (level != 0)
+      eob = 1;
+  }
+  *eob_ptr = eob;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  int eob = -1;  // stays -1 unless the quantized value is non-zero
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];  // only index 0 is quantized
+    const int coeff_sign = (coeff >> 31);  // sign mask, assuming 32-bit int
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + round_ptr[0];  // 64-bit, not clamped
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
+    if (abs_qcoeff)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 1 when the quantized value is non-zero, else 0
+}
+#endif
+
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  const int n_coeffs = 1024;  // 32 * 32 output entries are cleared below
+  const int rc = 0;  // only index 0 is quantized
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);  // sign mask, assuming 32-bit int
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int tmp, eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+                INT16_MIN, INT16_MAX);
+    tmp = (tmp * quant) >> 15;  // >> 15 here; vpx_quantize_dc uses >> 16
+    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
+    if (tmp)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 1 when the quantized value is non-zero, else 0
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                                  int skip_block,
+                                  const int16_t *round_ptr,
+                                  const int16_t quant,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr) {
+  const int n_coeffs = 1024;  // 32 * 32 output entries are cleared below
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];  // only index 0 is quantized
+    const int coeff_sign = (coeff >> 31);  // sign mask, assuming 32-bit int
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
+    if (abs_qcoeff)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 1 when the quantized value is non-zero, else 0
+}
+#endif
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block,
+                      const int16_t *zbin_ptr, const int16_t *round_ptr,
+                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                      const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr,
+                      const int16_t *scan, const int16_t *iscan) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};  // [0]: rc == 0, [1]: rest
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+  (void)iscan;  // unused here
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: trim trailing coefficients strictly inside the zero bin.
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: All coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask, assuming 32-bit int
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= zbins[rc != 0]) {
+        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
+        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+        if (tmp)
+          eob = i;  // last scan position with a non-zero level so far
+      }
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last non-zero scan position, or 0
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *zbin_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr,
+                             uint16_t *eob_ptr, const int16_t *scan,
+                             const int16_t *iscan) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};  // [0]: rc == 0, [1]: rest
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+  (void)iscan;  // unused here
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: trim trailing coefficients strictly inside the zero bin.
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: All coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask, assuming 32-bit int
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= zbins[rc != 0]) {
+        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];  // not clamped
+        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+        if (abs_qcoeff)
+          eob = i;  // last scan position with a non-zero level so far
+      }
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last non-zero scan position, or 0
+}
+#endif
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block,
+                            const int16_t *zbin_ptr, const int16_t *round_ptr,
+                            const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr,
+                            uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+
+  int idx = 0;  // number of entries stored in idx_arr
+  int idx_arr[1024];  // kept scan positions; assumes n_coeffs <= 1024
+  int i, eob = -1;
+  (void)iscan;  // unused here
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask, assuming 32-bit int
+      int tmp;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
+               quant_shift_ptr[rc != 0]) >> 15;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+
+      if (tmp)
+        eob = idx_arr[i];  // scan position, not the index into idx_arr
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last non-zero scan position, or 0
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, int skip_block,
+                                   const int16_t *zbin_ptr,
+                                   const int16_t *round_ptr,
+                                   const int16_t *quant_ptr,
+                                   const int16_t *quant_shift_ptr,
+                                   tran_low_t *qcoeff_ptr,
+                                   tran_low_t *dqcoeff_ptr,
+                                   const int16_t *dequant_ptr,
+                                   uint16_t *eob_ptr,
+                                   const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+
+  int idx = 0;  // number of entries stored in idx_arr
+  int idx_arr[1024];  // kept scan positions; assumes n_coeffs <= 1024
+  int i, eob = -1;
+  (void)iscan;  // unused here
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask, assuming 32-bit int
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 = abs_coeff
+                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+      if (abs_qcoeff)
+        eob = idx_arr[i];  // scan position, not the index into idx_arr
+    }
+  }
+  *eob_ptr = eob + 1;  // one past the last non-zero scan position, or 0
+}
+#endif
diff --git a/libs/libvpx/vpx_dsp/quantize.h b/libs/libvpx/vpx_dsp/quantize.h
new file mode 100644
index 0000000000..89ec597924
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/quantize.h
@@ -0,0 +1,51 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_QUANTIZE_H_
+#define VPX_DSP_QUANTIZE_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_quantize_dc(const tran_low_t *coeff_ptr,
+                     int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant_ptr,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                                  int skip_block,
+                                  const int16_t *round_ptr,
+                                  const int16_t quant_ptr,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_QUANTIZE_H_
diff --git a/libs/libvpx/vpx_dsp/sad.c b/libs/libvpx/vpx_dsp/sad.c
new file mode 100644
index 0000000000..c0c3ff9964
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/sad.c
@@ -0,0 +1,318 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+/* Returns the sum of absolute differences over a width x height block. */
+static INLINE unsigned int sad(const uint8_t *a, int a_stride,
+                               const uint8_t *b, int b_stride,
+                               int width, int height) {
+  unsigned int total = 0;
+  int row;
+
+  for (row = 0; row < height; ++row, a += a_stride, b += b_stride) {
+    int col;
+    /* uint8_t operands promote to int, so abs() sees a signed difference. */
+    for (col = 0; col < width; ++col) {
+      total += abs(a[col] - b[col]);
+    }
+  }
+  return total;
+}
+
+// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up.
+/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
+ * The function averages every corresponding element of the buffers and stores
+ * the value in a third buffer, comp_pred.
+ * pred and comp_pred are assumed to have stride = width
+ * In the usage below comp_pred is a local array.
+ */
+static INLINE void avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride) {
+  int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      const int tmp = pred[j] + ref[j];  // summed as int before halving
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);  // presumably (tmp + 1) >> 1
+    }
+    comp_pred += width;  // packed rows: stride == width
+    pred += width;
+    ref += ref_stride;  // only ref carries its own stride
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+                                   int width, int height, const uint8_t *ref8,
+                                   int ref_stride) {
+  int i, j;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);  // 16-bit view of pred8
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);  // 16-bit view of ref8
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      const int tmp = pred[j] + ref[j];  // summed as int before halving
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);  // presumably (tmp + 1) >> 1
+    }
+    comp_pred += width;  // packed rows: stride == width
+    pred += width;
+    ref += ref_stride;  // only ref carries its own stride
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#define sadMxN(m, n) \
+unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+                                  const uint8_t *ref, int ref_stride) { \
+  return sad(src, src_stride, ref, ref_stride, m, n); \
+} \
+unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+                                      const uint8_t *ref, int ref_stride, \
+                                      const uint8_t *second_pred) { \
+  uint8_t comp_pred[m * n]; \
+  avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  return sad(src, src_stride, comp_pred, m, m, n); \
+}
+
+// depending on call sites, pass **ref_array to avoid & in subsequent call and
+// de-dup with 4D below.
+#define sadMxNxK(m, n, k) \
+void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+                                const uint8_t *ref_array, int ref_stride, \
+                                uint32_t *sad_array) { \
+  int i; \
+  for (i = 0; i < k; ++i) \
+    sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
+}
+
+// This appears to be equivalent to the above when k == 4 and refs is const
+#define sadMxNx4D(m, n) \
+void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+                             const uint8_t *const ref_array[], int ref_stride, \
+                             uint32_t *sad_array) { \
+  int i; \
+  for (i = 0; i < 4; ++i) \
+    sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
+}
+
+// 64x64
+sadMxN(64, 64)
+sadMxNxK(64, 64, 3)
+sadMxNxK(64, 64, 8)
+sadMxNx4D(64, 64)
+
+// 64x32
+sadMxN(64, 32)
+sadMxNx4D(64, 32)
+
+// 32x64
+sadMxN(32, 64)
+sadMxNx4D(32, 64)
+
+// 32x32
+sadMxN(32, 32)
+sadMxNxK(32, 32, 3)
+sadMxNxK(32, 32, 8)
+sadMxNx4D(32, 32)
+
+// 32x16
+sadMxN(32, 16)
+sadMxNx4D(32, 16)
+
+// 16x32
+sadMxN(16, 32)
+sadMxNx4D(16, 32)
+
+// 16x16
+sadMxN(16, 16)
+sadMxNxK(16, 16, 3)
+sadMxNxK(16, 16, 8)
+sadMxNx4D(16, 16)
+
+// 16x8
+sadMxN(16, 8)
+sadMxNxK(16, 8, 3)
+sadMxNxK(16, 8, 8)
+sadMxNx4D(16, 8)
+
+// 8x16
+sadMxN(8, 16)
+sadMxNxK(8, 16, 3)
+sadMxNxK(8, 16, 8)
+sadMxNx4D(8, 16)
+
+// 8x8
+sadMxN(8, 8)
+sadMxNxK(8, 8, 3)
+sadMxNxK(8, 8, 8)
+sadMxNx4D(8, 8)
+
+// 8x4
+sadMxN(8, 4)
+sadMxNxK(8, 4, 8)
+sadMxNx4D(8, 4)
+
+// 4x8
+sadMxN(4, 8)
+sadMxNxK(4, 8, 8)
+sadMxNx4D(4, 8)
+
+// 4x4
+sadMxN(4, 4)
+sadMxNxK(4, 4, 3)
+sadMxNxK(4, 4, 8)
+sadMxNx4D(4, 4)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
+                                      const uint8_t *b8, int b_stride,
+                                      int width, int height) {
+  int y, x;
+  unsigned int sad = 0;  // running sum of absolute differences
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);  // 16-bit view of a8
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);  // 16-bit view of b8
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += abs(a[x] - b[x]);  // uint16_t promotes to int before subtracting
+
+    a += a_stride;  // strides are applied to the uint16_t pointers
+    b += b_stride;
+  }
+  return sad;
+}
+
+static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
+                                       const uint16_t *b, int b_stride,
+                                       int width, int height) {
+  int y, x;
+  unsigned int sad = 0;  // running sum of absolute differences
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);  // only a8 needs converting
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += abs(a[x] - b[x]);  // uint16_t promotes to int before subtracting
+
+    a += a_stride;  // strides are applied to the uint16_t pointers
+    b += b_stride;
+  }
+  return sad;
+}
+
+#define highbd_sadMxN(m, n) \
+unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+                                         const uint8_t *ref, int ref_stride) { \
+  return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
+} \
+unsigned int vpx_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \
+                                             int src_stride, \
+                                             const uint8_t *ref, \
+                                             int ref_stride, \
+                                             const uint8_t *second_pred) { \
+  uint16_t comp_pred[m * n]; \
+  highbd_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
+}
+
+#define highbd_sadMxNxK(m, n, k) \
+void vpx_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref_array, int ref_stride, \
+                                       uint32_t *sad_array) { \
+  int i; \
+  for (i = 0; i < k; ++i) { \
+    sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, &ref_array[i], \
+                                               ref_stride); \
+  } \
+}
+
+#define highbd_sadMxNx4D(m, n) \
+void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+                                    const uint8_t *const ref_array[], \
+                                    int ref_stride, uint32_t *sad_array) { \
+  int i; \
+  for (i = 0; i < 4; ++i) { \
+    sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, ref_array[i], \
+                                               ref_stride); \
+  } \
+}
+
+// 64x64
+highbd_sadMxN(64, 64)
+highbd_sadMxNxK(64, 64, 3)
+highbd_sadMxNxK(64, 64, 8)
+highbd_sadMxNx4D(64, 64)
+
+// 64x32
+highbd_sadMxN(64, 32)
+highbd_sadMxNx4D(64, 32)
+
+// 32x64
+highbd_sadMxN(32, 64)
+highbd_sadMxNx4D(32, 64)
+
+// 32x32
+highbd_sadMxN(32, 32)
+highbd_sadMxNxK(32, 32, 3)
+highbd_sadMxNxK(32, 32, 8)
+highbd_sadMxNx4D(32, 32)
+
+// 32x16
+highbd_sadMxN(32, 16)
+highbd_sadMxNx4D(32, 16)
+
+// 16x32
+highbd_sadMxN(16, 32)
+highbd_sadMxNx4D(16, 32)
+
+// 16x16
+highbd_sadMxN(16, 16)
+highbd_sadMxNxK(16, 16, 3)
+highbd_sadMxNxK(16, 16, 8)
+highbd_sadMxNx4D(16, 16)
+
+// 16x8
+highbd_sadMxN(16, 8)
+highbd_sadMxNxK(16, 8, 3)
+highbd_sadMxNxK(16, 8, 8)
+highbd_sadMxNx4D(16, 8)
+
+// 8x16
+highbd_sadMxN(8, 16)
+highbd_sadMxNxK(8, 16, 3)
+highbd_sadMxNxK(8, 16, 8)
+highbd_sadMxNx4D(8, 16)
+
+// 8x8
+highbd_sadMxN(8, 8)
+highbd_sadMxNxK(8, 8, 3)
+highbd_sadMxNxK(8, 8, 8)
+highbd_sadMxNx4D(8, 8)
+
+// 8x4
+highbd_sadMxN(8, 4)
+highbd_sadMxNxK(8, 4, 8)
+highbd_sadMxNx4D(8, 4)
+
+// 4x8
+highbd_sadMxN(4, 8)
+highbd_sadMxNxK(4, 8, 8)
+highbd_sadMxNx4D(4, 8)
+
+// 4x4
+highbd_sadMxN(4, 4)
+highbd_sadMxNxK(4, 4, 3)
+highbd_sadMxNxK(4, 4, 8)
+highbd_sadMxNx4D(4, 4)
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vpx_dsp/ssim.c b/libs/libvpx/vpx_dsp/ssim.c
new file mode 100644
index 0000000000..cfe5bb331c
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/ssim.c
@@ -0,0 +1,505 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r,
+                            int rp, uint32_t *sum_s, uint32_t *sum_r,
+                            uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                            uint32_t *sum_sxr) {
+  int i, j;  // i: row, j: column of the 16x16 window; sp/rp step one row
+  for (i = 0; i < 16; i++, s += sp, r += rp) {
+    for (j = 0; j < 16; j++) {
+      *sum_s += s[j];            // sum of s pixels
+      *sum_r += r[j];            // sum of r pixels
+      *sum_sq_s += s[j] * s[j];  // sum of squared s pixels
+      *sum_sq_r += r[j] * r[j];  // sum of squared r pixels
+      *sum_sxr += s[j] * r[j];   // sum of s * r cross products
+    }
+  }  // the five outputs are only added to (+=), never reset here
+}
+void vpx_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
+                          uint32_t *sum_s, uint32_t *sum_r,
+                          uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                          uint32_t *sum_sxr) {
+  int i, j;  // i: row, j: column of the 8x8 window; sp/rp step one row
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      *sum_s += s[j];            // sum of s pixels
+      *sum_r += r[j];            // sum of r pixels
+      *sum_sq_s += s[j] * s[j];  // sum of squared s pixels
+      *sum_sq_r += r[j] * r[j];  // sum of squared r pixels
+      *sum_sxr += s[j] * r[j];   // sum of s * r cross products
+    }
+  }  // the five outputs are only added to (+=), never reset here
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp,
+                                 const uint16_t *r, int rp,
+                                 uint32_t *sum_s, uint32_t *sum_r,
+                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                                 uint32_t *sum_sxr) {
+  int i, j;  // i: row, j: column of the 8x8 window; sp/rp step one row
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      *sum_s += s[j];            // sum of s samples
+      *sum_r += r[j];            // sum of r samples
+      *sum_sq_s += s[j] * s[j];  // uint16_t operands multiply as int;
+      *sum_sq_r += r[j] * r[j];  // NOTE(review): assumes samples well below
+      *sum_sxr += s[j] * r[j];   // 16 bits so products/sums fit - confirm
+    }
+  }  // the five outputs are only added to (+=), never reset here
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static const int64_t cc1 =  26634;  // 64^2 * (.01 * 255)^2
+static const int64_t cc2 = 239708;  // 64^2 * (.03 * 255)^2
+
+// SSIM of one window from its raw sums (count = pixels in the window).
+static double similarity(uint32_t sum_s, uint32_t sum_r,
+                         uint32_t sum_sq_s, uint32_t sum_sq_r,
+                         uint32_t sum_sxr, int count) {
+  // Widen to 64 bits up front: the products of two 32-bit sums below would
+  // otherwise be evaluated in uint32_t and wrap for large windows.
+  const int64_t s = sum_s, r = sum_r, n = count;
+  // scale the constants by number of pixels
+  const int64_t c1 = (cc1 * n * n) >> 12;
+  const int64_t c2 = (cc2 * n * n) >> 12;
+  int64_t ssim_n, ssim_d;
+
+  ssim_n = (2 * s * r + c1) * (2 * n * sum_sxr - 2 * s * r + c2);
+
+  ssim_d = (s * s + r * r + c1) *
+           (n * sum_sq_s - s * s + n * sum_sq_r - r * r + c2);
+
+  return ssim_n * 1.0 / ssim_d;
+}
+
+static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
+  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  vpx_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                     &sum_sxr);  // gather the sums for this 8x8 window
+  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+}  // 64 is passed as similarity()'s pixel count (8 * 8)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
+                              int rp, unsigned int bd) {
+  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  const int oshift = bd - 8;  // number of bits by which bd exceeds 8
+  vpx_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                            &sum_sxr);
+  return similarity(sum_s >> oshift,           // first-order sums: one shift
+                    sum_r >> oshift,
+                    sum_sq_s >> (2 * oshift),  // second-order sums: two shifts
+                    sum_sq_r >> (2 * oshift),
+                    sum_sxr >> (2 * oshift),
+                    64);                       // pixel count of the 8x8 window
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// We are using a 8x8 moving window with starting location of each 8x8 window
+// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
+// block boundaries to penalize blocking artifacts.
+static double vpx_ssim2(const uint8_t *img1, const uint8_t *img2,
+                        int stride_img1, int stride_img2, int width,
+                        int height) {
+  int i, j;
+  int samples = 0;        // number of 8x8 windows evaluated
+  double ssim_total = 0;  // running sum, then mean, of the window scores
+
+  // sample points start at every 4x4 location whose 8x8 window fits
+  for (i = 0; i <= height - 8;
+       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+    for (j = 0; j <= width - 8; j += 4) {
+      double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
+      ssim_total += v;
+      samples++;
+    }
+  }
+  ssim_total /= samples;  // NOTE(review): 0/0 when width or height < 8
+  return ssim_total;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static double vpx_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+                               int stride_img1, int stride_img2, int width,
+                               int height, unsigned int bd) {
+  int i, j;
+  int samples = 0;        // number of 8x8 windows evaluated
+  double ssim_total = 0;  // running sum, then mean, of the window scores
+
+  // sample points start at every 4x4 location whose 8x8 window fits
+  for (i = 0; i <= height - 8;
+       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+    for (j = 0; j <= width - 8; j += 4) {
+      double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+                                 CONVERT_TO_SHORTPTR(img2 + j), stride_img2,
+                                 bd);
+      ssim_total += v;
+      samples++;
+    }
+  }
+  ssim_total /= samples;  // NOTE(review): 0/0 when width or height < 8
+  return ssim_total;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                     const YV12_BUFFER_CONFIG *dest,
+                     double *weight) {
+  double a, b, c;  // scores of the y, u and v planes respectively
+  double ssimv;
+
+  a = vpx_ssim2(source->y_buffer, dest->y_buffer,
+                source->y_stride, dest->y_stride,
+                source->y_crop_width, source->y_crop_height);
+
+  b = vpx_ssim2(source->u_buffer, dest->u_buffer,
+                source->uv_stride, dest->uv_stride,
+                source->uv_crop_width, source->uv_crop_height);
+
+  c = vpx_ssim2(source->v_buffer, dest->v_buffer,
+                source->uv_stride, dest->uv_stride,
+                source->uv_crop_width, source->uv_crop_height);
+
+  ssimv = a * .8 + .1 * (b + c);  // weights: 0.8 for y, 0.1 each for u and v
+
+  *weight = 1;  // always written as 1
+
+  return ssimv;
+}
+
+double vpx_calc_ssimg(const YV12_BUFFER_CONFIG *source,
+                      const YV12_BUFFER_CONFIG *dest,
+                      double *ssim_y, double *ssim_u, double *ssim_v) {
+  double ssim_all = 0;
+  double a, b, c;  // scores of the y, u and v planes respectively
+
+  a = vpx_ssim2(source->y_buffer, dest->y_buffer,
+                source->y_stride, dest->y_stride,
+                source->y_crop_width, source->y_crop_height);
+
+  b = vpx_ssim2(source->u_buffer, dest->u_buffer,
+                source->uv_stride, dest->uv_stride,
+                source->uv_crop_width, source->uv_crop_height);
+
+  c = vpx_ssim2(source->v_buffer, dest->v_buffer,
+                source->uv_stride, dest->uv_stride,
+                source->uv_crop_width, source->uv_crop_height);
+  *ssim_y = a;  // per-plane scores are also handed back to the caller
+  *ssim_u = b;
+  *ssim_v = c;
+  ssim_all = (a * 4 + b + c) / 6;  // y counts four times as much as u or v
+
+  return ssim_all;
+}
+
+// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
+//
+// Re working out the math ->
+//
+// ssim(x,y) =  (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) /
+//   ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2))
+//
+// mean(x) = sum(x) / n
+//
+// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n)
+//
+// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
+//
+// ssim(x,y) =
+//   (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
+//   (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
+//    ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
+//     (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))
+//
+// factoring out n*n
+//
+// ssim(x,y) =
+//   (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
+//   (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
+//    (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
+//
+// Replace c1 with n*n * c1 for the final step that leads to this code:
+// The final step scales by 12 bits so we don't lose precision in the constants.
+
+static double ssimv_similarity(const Ssimv *sv, int64_t n) {
+  // Scale the constants by number of pixels.
+  const int64_t c1 = (cc1 * n * n) >> 12;
+  const int64_t c2 = (cc2 * n * n) >> 12;
+
+  const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
+      (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
+
+  // The sums are unsigned: the leading 2.0 makes the numerator double, but
+  // 2 * sum_s * sum_r is still a 32-bit unsigned product before conversion.
+  const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2)
+      / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + n * sv->sum_sq_r
+         - sv->sum_r * sv->sum_r + c2);
+
+  return l * v;  // product of the c1 (luminance) term and the c2 term
+}
+
+// The first term of the ssim metric is a luminance factor.
+//
+// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1)
+//
+// This luminance factor is super sensitive to the dark side of luminance
+// values and completely insensitive on the white side.  Check out 2 sets:
+// for (1,3) the term gives 2*1*3/(1+9) = .60, while for (250,252) it gives
+// 2*250*252/(250^2+252^2) => .99997
+//
+// As a result, this tweaked version of the calculation takes the
+// luminance as a percentage off from the peak possible.
+//
+// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count
+//
+static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
+  // Scale the constants by number of pixels.
+  const int64_t c1 = (cc1 * n * n) >> 12;
+  const int64_t c2 = (cc2 * n * n) >> 12;
+
+  const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
+  const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
+
+  // The sums are unsigned: the leading 2.0 makes the numerator double, but
+  // 2 * sum_s * sum_r is still a 32-bit unsigned product before conversion.
+  const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2)
+      / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+         n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+  return l * v;  // tweaked luminance term times the c2 term
+}
+static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
+                        int img2_pitch, Ssimv *sv) {
+  vpx_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch,
+                     &sv->sum_s, &sv->sum_r, &sv->sum_sq_s, &sv->sum_sq_r,
+                     &sv->sum_sxr);  // passes sv's five sum fields as outputs
+}
+
+double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch,
+                            uint8_t *img2, int img2_pitch,
+                            int width, int height,
+                            Ssimv *sv2, Metrics *m,
+                            int do_inconsistency) {
+  double dssim_total = 0;
+  double ssim_total = 0;
+  double ssim2_total = 0;
+  double inconsistency_total = 0;
+  int i, j;
+  int c = 0;  // index of the current sample point within sv2
+  double norm;
+  double old_ssim_total = 0;
+  vpx_clear_system_state();
+  // We can sample points as frequently as we like; start with 1 per 4x4.
+  for (i = 0; i < height; i += 4,
+       img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+    for (j = 0; j < width; j += 4, ++c) {
+      Ssimv sv = {0};
+      double ssim;
+      double ssim2;
+      double dssim;
+      uint32_t var_new;
+      uint32_t var_old;
+      uint32_t mean_new;
+      uint32_t mean_old;
+      double ssim_new;
+      double ssim_old;
+
+      // Not sure there's a great way to handle the edge pixels
+      // in ssim when using a window. Seems biased against edge pixels
+      // however you handle this. This uses only samples that are
+      // fully in the frame.
+      if (j + 8 <= width && i + 8 <= height) {
+        ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
+      }
+
+      ssim = ssimv_similarity(&sv, 64);
+      ssim2 = ssimv_similarity2(&sv, 64);
+
+      sv.ssim = ssim2;
+
+      // dssim is calculated to use as an actual error metric and
+      // is scaled up to the same range as sum square error.
+      // Since we are subsampling every 16th point maybe this should be
+      // *16 ?
+      dssim = 255 * 255 * (1 - ssim2) / 2;
+
+      // Here I introduce a new error metric: consistency-weighted
+      // SSIM-inconsistency.  This metric isolates frames where the
+      // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
+      // sharper or blurrier than the others. Higher values indicate a
+      // temporally inconsistent SSIM. There are two ideas at work:
+      //
+      // 1) 'SSIM-inconsistency': the total inconsistency value
+      // reflects how much SSIM values are changing between this
+      // source / reference frame pair and the previous pair.
+      //
+      // 2) 'consistency-weighted': weights de-emphasize areas in the
+      // frame where the scene content has changed. Changes in scene
+      // content are detected via changes in local variance and local
+      // mean.
+      //
+      // Thus the overall measure reflects how inconsistent the SSIM
+      // values are, over consistent regions of the frame.
+      //
+      // The metric has three terms:
+      //
+      // term 1 -> uses change in scene Variance to weight error score
+      //  2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
+      //  larger changes from one frame to the next mean we care
+      //  less about consistency.
+      //
+      // term 2 -> uses change in local scene luminance to weight error
+      //  2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
+      //  larger changes from one frame to the next mean we care
+      //  less about consistency.
+      //
+      // term3 -> measures inconsistency in ssim scores between frames
+      //   1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+ssim(Fi-1)^2) ).
+      //
+      // This term compares the ssim score for the same location in 2
+      // subsequent frames.
+      var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
+      var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
+      mean_new = sv.sum_s;
+      mean_old = sv2[c].sum_s;
+      ssim_new = sv.ssim;
+      ssim_old = sv2[c].ssim;
+
+      if (do_inconsistency) {
+        // We do the metric once for every 4x4 block in the image. Since
+        // we are scaling the error to SSE for use in a psnr calculation
+        // 1.0 = 4x4x255x255 the worst error we can possibly have.
+        static const double kScaling = 4. * 4 * 255 * 255;
+
+        // The constants have to be non 0 to avoid potential divide by 0
+        // issues; other than that they affect kind of a weighting between
+        // the terms.  No testing of what the right terms should be has been
+        // done.
+        static const double c1 = 1, c2 = 1, c3 = 1;
+
+        // This measures how much consistent variance is in two consecutive
+        // source frames. 1.0 means they have exactly the same variance.
+        const double variance_term = (2.0 * var_old * var_new + c1) /
+            (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
+
+        // This measures how consistent the local means are between two
+        // consecutive frames. 1.0 means they have exactly the same mean.
+        const double mean_term = (2.0 * mean_old * mean_new + c2) /
+            (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
+
+        // This measures how consistent the ssims of two
+        // consecutive frames are. 1.0 means they are exactly the same.
+        double ssim_term = pow((2.0 * ssim_old * ssim_new + c3) /
+                               (ssim_old * ssim_old + ssim_new * ssim_new + c3),
+                               5);
+
+        double this_inconsistency;
+
+        // Floating point math sometimes makes this > 1 by a tiny bit.
+        // We want the metric to scale between 0 and 1.0 so we can convert
+        // it to an snr scaled value.
+        if (ssim_term > 1)
+          ssim_term = 1;
+
+        // This converts the consistency metric to an inconsistency metric
+        // (so we can scale it like psnr to something like sum square error).
+        // The reason for the variance and mean terms is the assumption that
+        // if there are big changes in the source we should penalize
+        // inconsistency in ssim scores a bit less as it will be less visible
+        // to the user.
+        this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
+
+        this_inconsistency *= kScaling;
+        inconsistency_total += this_inconsistency;
+      }
+      sv2[c] = sv;  // stored so the next call sees this frame as the old one
+      ssim_total += ssim;
+      ssim2_total += ssim2;
+      dssim_total += dssim;
+
+      old_ssim_total += ssim_old;
+    }
+    old_ssim_total += 0;  // no-op
+  }
+
+  norm = 1. / (width / 4) / (height / 4);  // width / 4, height / 4: int math
+  ssim_total *= norm;
+  ssim2_total *= norm;
+  m->ssim2 = ssim2_total;
+  m->ssim = ssim_total;
+  if (old_ssim_total == 0)  // every previous ssim in sv2 summed to zero
+    inconsistency_total = 0;
+
+  m->ssimc = inconsistency_total;
+
+  m->dssim = dssim_total;  // raw sum: unlike ssim/ssim2, not scaled by norm
+  return inconsistency_total;
+}
+
+
+#if CONFIG_VP9_HIGHBITDEPTH
+double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                            const YV12_BUFFER_CONFIG *dest,
+                            double *weight, unsigned int bd) {
+  double a, b, c;
+  double ssimv;
+
+  a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer,
+                       source->y_stride, dest->y_stride,
+                       source->y_crop_width, source->y_crop_height, bd);
+
+  b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height, bd);
+
+  c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height, bd);
+
+  ssimv = a * .8 + .1 * (b + c);
+
+  *weight = 1;
+
+  return ssimv;
+}
+
+double vpx_highbd_calc_ssimg(const YV12_BUFFER_CONFIG *source,
+                             const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+                             double *ssim_u, double *ssim_v, unsigned int bd) {
+  double ssim_all = 0;
+  double a, b, c;
+
+  a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer,
+                       source->y_stride, dest->y_stride,
+                       source->y_crop_width, source->y_crop_height, bd);
+
+  b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height, bd);
+
+  c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer,
+                       source->uv_stride, dest->uv_stride,
+                       source->uv_crop_width, source->uv_crop_height, bd);
+  *ssim_y = a;
+  *ssim_u = b;
+  *ssim_v = c;
+  ssim_all = (a * 4 + b + c) / 6;
+
+  return ssim_all;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vpx_dsp/ssim.h b/libs/libvpx/vpx_dsp/ssim.h
new file mode 100644
index 0000000000..132f7f9e19
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/ssim.h
@@ -0,0 +1,101 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_SSIM_H_
+#define VPX_DSP_SSIM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./vpx_config.h"
+#include "vpx_scale/yv12config.h"
+
+// metrics used for calculating ssim, ssim2, dssim, and ssimc
+typedef struct {
+  // sum of source samples (over 8x8 region)
+  uint32_t sum_s;
+
+  // sum of reference samples (over 8x8 region)
+  uint32_t sum_r;
+
+  // sum of squared source samples (over 8x8 region)
+  uint32_t sum_sq_s;
+
+  // sum of squared reference samples (over 8x8 region)
+  uint32_t sum_sq_r;
+
+  // sum of source times reference (over 8x8 region)
+  uint32_t sum_sxr;
+
+  // calculated ssim score between source and reference
+  double ssim;
+} Ssimv;
+
+// metrics collected on a frame basis
+typedef struct {
+  // ssim consistency error metric (see code for explanation)
+  double ssimc;
+
+  // standard ssim
+  double ssim;
+
+  // revised ssim (see code for explanation)
+  double ssim2;
+
+  // ssim restated as an error metric like sse
+  double dssim;
+
+  // dssim converted to decibels
+  double dssimd;
+
+  // ssimc converted to decibels
+  double ssimcd;
+} Metrics;
+
+double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+                      int img2_pitch, int width, int height, Ssimv *sv2,
+                      Metrics *m, int do_inconsistency);
+
+double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                     const YV12_BUFFER_CONFIG *dest,
+                     double *weight);
+
+double vpx_calc_ssimg(const YV12_BUFFER_CONFIG *source,
+                      const YV12_BUFFER_CONFIG *dest,
+                      double *ssim_y, double *ssim_u, double *ssim_v);
+
+double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *dest,
+                         double *ssim_y, double *ssim_u, double *ssim_v);
+
+double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *dest,
+                   double *ssim_y, double *ssim_u, double *ssim_v);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                            const YV12_BUFFER_CONFIG *dest,
+                            double *weight,
+                            unsigned int bd);
+
+double vpx_highbd_calc_ssimg(const YV12_BUFFER_CONFIG *source,
+                             const YV12_BUFFER_CONFIG *dest,
+                             double *ssim_y,
+                             double *ssim_u,
+                             double *ssim_v,
+                             unsigned int bd);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_SSIM_H_
diff --git a/libs/libvpx/vpx_dsp/subtract.c b/libs/libvpx/vpx_dsp/subtract.c
new file mode 100644
index 0000000000..556e0134f3
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/subtract.c
@@ -0,0 +1,56 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_subtract_block_c(int rows, int cols,
+                          int16_t *diff, ptrdiff_t diff_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          const uint8_t *pred, ptrdiff_t pred_stride) {
+  int r, c;  // row and column within the rows x cols block
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++)
+      diff[c] = src[c] - pred[c];  // signed per-pixel residual
+
+    diff += diff_stride;  // advance all three buffers to their next row
+    pred += pred_stride;
+    src  += src_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_c(int rows, int cols,
+                                 int16_t *diff, ptrdiff_t diff_stride,
+                                 const uint8_t *src8, ptrdiff_t src_stride,
+                                 const uint8_t *pred8, ptrdiff_t pred_stride,
+                                 int bd) {
+  int r, c;  // row and column within the rows x cols block
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);    // samples are read as
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);  // uint16_t from here on
+  (void) bd;  // bd is not used by this implementation
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++) {
+      diff[c] = src[c] - pred[c];  // per-sample residual, stored as int16_t
+    }
+
+    diff += diff_stride;  // strides are applied in elements, not bytes
+    pred += pred_stride;
+    src  += src_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vpx_dsp/txfm_common.h b/libs/libvpx/vpx_dsp/txfm_common.h
new file mode 100644
index 0000000000..442e6a57b5
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/txfm_common.h
@@ -0,0 +1,66 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_TXFM_COMMON_H_
+#define VPX_DSP_TXFM_COMMON_H_
+
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Constants and Macros used by all idct/dct functions
+#define DCT_CONST_BITS 14
+#define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))
+
+#define UNIT_QUANT_SHIFT 2
+#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
+
+// Constants:
+//  for (int i = 1; i< 32; ++i)
+//    printf("static const int cospi_%d_64 = %.0f;\n", i,
+//           round(16384 * cos(i*M_PI/64)));
+// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
+static const tran_high_t cospi_1_64  = 16364;
+static const tran_high_t cospi_2_64  = 16305;
+static const tran_high_t cospi_3_64  = 16207;
+static const tran_high_t cospi_4_64  = 16069;
+static const tran_high_t cospi_5_64  = 15893;
+static const tran_high_t cospi_6_64  = 15679;
+static const tran_high_t cospi_7_64  = 15426;
+static const tran_high_t cospi_8_64  = 15137;
+static const tran_high_t cospi_9_64  = 14811;
+static const tran_high_t cospi_10_64 = 14449;
+static const tran_high_t cospi_11_64 = 14053;
+static const tran_high_t cospi_12_64 = 13623;
+static const tran_high_t cospi_13_64 = 13160;
+static const tran_high_t cospi_14_64 = 12665;
+static const tran_high_t cospi_15_64 = 12140;
+static const tran_high_t cospi_16_64 = 11585;
+static const tran_high_t cospi_17_64 = 11003;
+static const tran_high_t cospi_18_64 = 10394;
+static const tran_high_t cospi_19_64 = 9760;
+static const tran_high_t cospi_20_64 = 9102;
+static const tran_high_t cospi_21_64 = 8423;
+static const tran_high_t cospi_22_64 = 7723;
+static const tran_high_t cospi_23_64 = 7005;
+static const tran_high_t cospi_24_64 = 6270;
+static const tran_high_t cospi_25_64 = 5520;
+static const tran_high_t cospi_26_64 = 4756;
+static const tran_high_t cospi_27_64 = 3981;
+static const tran_high_t cospi_28_64 = 3196;
+static const tran_high_t cospi_29_64 = 2404;
+static const tran_high_t cospi_30_64 = 1606;
+static const tran_high_t cospi_31_64 = 804;
+
+//  16384 * sqrt(2) * sin(kPi/9) * 2 / 3
+static const tran_high_t sinpi_1_9 = 5283;
+static const tran_high_t sinpi_2_9 = 9929;
+static const tran_high_t sinpi_3_9 = 13377;
+static const tran_high_t sinpi_4_9 = 15212;
+
+#endif  // VPX_DSP_TXFM_COMMON_H_
diff --git a/libs/libvpx/vpx_dsp/variance.c b/libs/libvpx/vpx_dsp/variance.c
new file mode 100644
index 0000000000..e8bddb0a0e
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/variance.c
@@ -0,0 +1,621 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vpx_dsp/variance.h"
+/*
+ * Two-tap bilinear filter kernels, selected by the xoffset / yoffset arguments
+ * of the sub-pixel variance functions in this file. Each row is a
+ * { tap0, tap1 } pair; tap1 grows by 16 per row and every row sums to 128
+ * (FILTER_WEIGHT in vpx_dsp/variance.h).
+ */
+static const uint8_t bilinear_filters[8][2] = {
+  { 128,   0  },
+  { 112,  16  },
+  {  96,  32  },
+  {  80,  48  },
+  {  64,  64  },
+  {  48,  80  },
+  {  32,  96  },
+  {  16, 112  },
+};
+
+/* Returns the sum of squared differences between two 4x4 blocks of 8-bit
+ * samples. a / b point at the first sample of each block; a_stride / b_stride
+ * are the element distances between the starts of consecutive rows. */
+uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride,
+                            const uint8_t *b, int b_stride) {
+  int sse = 0;
+  int row;
+
+  for (row = 0; row < 4; ++row, a += a_stride, b += b_stride) {
+    int col = 0;
+    while (col < 4) {
+      const int delta = a[col] - b[col];
+      sse += delta * delta;
+      ++col;
+    }
+  }
+
+  return sse;
+}
+
+/* Returns the sum of the squares of the 256 int16_t values starting at a. */
+uint32_t vpx_get_mb_ss_c(const int16_t *a) {
+  const int16_t *const end = a + 256;
+  unsigned int total = 0;
+
+  while (a != end) {
+    const int v = *a++;
+    total += v * v;
+  }
+
+  return total;
+}
+
+/* 16x16 sub-pixel variance with a fixed horizontal offset of 4 (the middle
+ * entry of the 8-entry bilinear table) and no vertical offset. */
+uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
+                                          const uint8_t *b, int b_stride,
+                                          uint32_t *sse) {
+  const int xoffset = 4;
+  const int yoffset = 0;
+
+  return vpx_sub_pixel_variance16x16_c(a, a_stride, xoffset, yoffset,
+                                       b, b_stride, sse);
+}
+
+
+/* 16x16 sub-pixel variance with no horizontal offset and a fixed vertical
+ * offset of 4 (the middle entry of the 8-entry bilinear table). */
+uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
+                                          const uint8_t *b, int b_stride,
+                                          uint32_t *sse) {
+  const int xoffset = 0;
+  const int yoffset = 4;
+
+  return vpx_sub_pixel_variance16x16_c(a, a_stride, xoffset, yoffset,
+                                       b, b_stride, sse);
+}
+
+/* 16x16 sub-pixel variance with both the horizontal and the vertical offset
+ * fixed at 4 (the middle entry of the 8-entry bilinear table). */
+uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
+                                           const uint8_t *b, int b_stride,
+                                           uint32_t *sse) {
+  const int xoffset = 4;
+  const int yoffset = 4;
+
+  return vpx_sub_pixel_variance16x16_c(a, a_stride, xoffset, yoffset,
+                                       b, b_stride, sse);
+}
+
+/* Accumulates, over a w x h block, the sum of (a - b) into *sum and the sum
+ * of (a - b)^2 into *sse. a_stride / b_stride are the element distances
+ * between the starts of consecutive rows of a and b. */
+static void variance(const uint8_t *a, int a_stride,
+                     const uint8_t *b, int b_stride,
+                     int w, int h, uint32_t *sse, int *sum) {
+  uint32_t sq_total = 0;
+  int diff_total = 0;
+  int row, col;
+
+  for (row = 0; row < h; ++row, a += a_stride, b += b_stride) {
+    for (col = 0; col < w; ++col) {
+      const int delta = a[col] - b[col];
+      diff_total += delta;
+      sq_total += delta * delta;
+    }
+  }
+
+  *sum = diff_total;
+  *sse = sq_total;
+}
+
+// First pass of the 2-D separable bilinear filter: applies a 1-D 2-tap filter
+// to the 8-bit source block, in either the horizontal or the vertical
+// direction, and stores one uint16_t result per output sample for the second
+// pass to consume.
+//
+// The two taps in `filter` should sum to FILTER_WEIGHT. pixel_step is the
+// offset from one filter input to the next: 1 to filter horizontally, the
+// stride to filter vertically. src_pixels_per_line is the source stride;
+// output rows are written output_width elements apart.
+static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
+                                              unsigned int src_pixels_per_line,
+                                              int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const uint8_t *filter) {
+  unsigned int row, col;
+
+  for (row = 0; row < output_height; ++row) {
+    const uint8_t *in = a;
+
+    for (col = 0; col < output_width; ++col, ++in) {
+      const int tap0 = (int)in[0] * filter[0];
+      const int tap1 = (int)in[pixel_step] * filter[1];
+      b[col] = ROUND_POWER_OF_TWO(tap0 + tap1, FILTER_BITS);
+    }
+
+    // `in` now sits output_width past the row start; step on to the next row.
+    a = in + (src_pixels_per_line - output_width);
+    b += output_width;
+  }
+}
+
+// Second pass of the 2-D separable bilinear filter: applies a 1-D 2-tap filter
+// to the uint16_t intermediate block written by
+// var_filter_block2d_bil_first_pass and stores 8-bit output.
+//
+// The two taps in `filter` should sum to FILTER_WEIGHT. pixel_step is the
+// offset from one filter input to the next: 1 to filter horizontally, the
+// stride to filter vertically. src_pixels_per_line is the stride of the
+// intermediate block; output rows are written output_width elements apart.
+static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
+                                               unsigned int src_pixels_per_line,
+                                               unsigned int pixel_step,
+                                               unsigned int output_height,
+                                               unsigned int output_width,
+                                               const uint8_t *filter) {
+  unsigned int row, col;
+
+  for (row = 0; row < output_height; ++row) {
+    const uint16_t *in = a;
+
+    for (col = 0; col < output_width; ++col, ++in) {
+      const int tap0 = (int)in[0] * filter[0];
+      const int tap1 = (int)in[pixel_step] * filter[1];
+      b[col] = ROUND_POWER_OF_TWO(tap0 + tap1, FILTER_BITS);
+    }
+
+    // `in` now sits output_width past the row start; step on to the next row.
+    a = in + (src_pixels_per_line - output_width);
+    b += output_width;
+  }
+}
+
+/* VAR(W, H) defines vpx_variance<W>x<H>_c(): it stores the sum of squared
+ * differences of the WxH block in *sse and returns
+ * *sse - sum^2 / (W * H), with the product formed in 64 bits. */
+#define VAR(W, H) \
+uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                   const uint8_t *b, int b_stride, \
+                                   uint32_t *sse) { \
+  int diff_sum; \
+  int64_t mean_term; \
+  variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum); \
+  mean_term = ((int64_t)diff_sum * diff_sum) / (W * H); \
+  return *sse - mean_term; \
+}
+
+/* SUBPIX_VAR(W, H) defines vpx_sub_pixel_variance<W>x<H>_c(): block `a` is
+ * filtered horizontally with bilinear_filters[xoffset], then vertically with
+ * bilinear_filters[yoffset], and the variance of the filtered block against
+ * `b` is returned. */
+#define SUBPIX_VAR(W, H) \
+uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                             int xoffset, int yoffset, \
+                                             const uint8_t *b, int b_stride, \
+                                             uint32_t *sse) { \
+  /* H + 1 rows: the vertical pass also reads the row below each output. */ \
+  uint16_t horiz_pass[(H + 1) * W]; \
+  uint8_t filtered[H * W]; \
+  const uint8_t *const x_taps = bilinear_filters[xoffset]; \
+  const uint8_t *const y_taps = bilinear_filters[yoffset]; \
+\
+  var_filter_block2d_bil_first_pass(a, horiz_pass, a_stride, 1, H + 1, W, \
+                                    x_taps); \
+  var_filter_block2d_bil_second_pass(horiz_pass, filtered, W, W, H, W, \
+                                     y_taps); \
+\
+  return vpx_variance##W##x##H##_c(filtered, W, b, b_stride, sse); \
+}
+
+/* SUBPIX_AVG_VAR(W, H) defines vpx_sub_pixel_avg_variance<W>x<H>_c(): block
+ * `a` is bilinearly filtered as in SUBPIX_VAR, the result is combined with
+ * second_pred through vpx_comp_avg_pred(), and the variance of that combined
+ * block against `b` is returned. */
+#define SUBPIX_AVG_VAR(W, H) \
+uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \
+                                                 int a_stride, \
+                                                 int xoffset, int yoffset, \
+                                                 const uint8_t *b, \
+                                                 int b_stride, \
+                                                 uint32_t *sse, \
+                                                 const uint8_t *second_pred) { \
+  /* H + 1 rows: the vertical pass also reads the row below each output. */ \
+  uint16_t horiz_pass[(H + 1) * W]; \
+  uint8_t filtered[H * W]; \
+  DECLARE_ALIGNED(16, uint8_t, averaged[H * W]); \
+  const uint8_t *const x_taps = bilinear_filters[xoffset]; \
+  const uint8_t *const y_taps = bilinear_filters[yoffset]; \
+\
+  var_filter_block2d_bil_first_pass(a, horiz_pass, a_stride, 1, H + 1, W, \
+                                    x_taps); \
+  var_filter_block2d_bil_second_pass(horiz_pass, filtered, W, W, H, W, \
+                                     y_taps); \
+\
+  vpx_comp_avg_pred(averaged, second_pred, W, H, filtered, W); \
+\
+  return vpx_variance##W##x##H##_c(averaged, W, b, b_stride, sse); \
+}
+
+/* Identical to the variance call except it takes an additional parameter, sum,
+ * and returns that value using pass-by-reference instead of returning
+ * sse - sum^2 / w*h
+ *
+ * GET_VAR(W, H) expands to void vpx_get<W>x<H>var_c(), a direct call to
+ * variance() that writes both *sse and *sum and has no return value.
+ */
+#define GET_VAR(W, H) \
+void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
+                             const uint8_t *b, int b_stride, \
+                             uint32_t *sse, int *sum) { \
+  variance(a, a_stride, b, b_stride, W, H, sse, sum); \
+}
+
+/* Identical to the variance call except it does not calculate the
+ * sse - sum^2 / w*h and returns sse in addition to modifying the passed in
+ * variable.
+ *
+ * MSE(W, H) expands to vpx_mse<W>x<H>_c(). The sum produced by variance() is
+ * stored in a local and not used afterwards.
+ */
+#define MSE(W, H) \
+uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
+                              const uint8_t *b, int b_stride, \
+                              uint32_t *sse) { \
+  int sum; \
+  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse; \
+}
+
+/* All three forms of the variance are available in the same sizes:
+ * VARIANCES(W, H) expands to the plain (VAR), sub-pixel (SUBPIX_VAR) and
+ * sub-pixel averaging (SUBPIX_AVG_VAR) functions for one WxH block size.
+ * GET_VAR and MSE are instantiated below for a smaller set of sizes. */
+#define VARIANCES(W, H) \
+    VAR(W, H) \
+    SUBPIX_VAR(W, H) \
+    SUBPIX_AVG_VAR(W, H)
+
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+
+GET_VAR(16, 16)
+GET_VAR(8, 8)
+
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)
+
+/* Writes the rounded average of pred and ref, sample by sample, into
+ * comp_pred. comp_pred and pred are both advanced by `width` per row; ref is
+ * advanced by ref_stride per row. */
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+                         int width, int height,
+                         const uint8_t *ref, int ref_stride) {
+  int row;
+
+  for (row = 0; row < height; ++row) {
+    int col = 0;
+    while (col < width) {
+      const int pair_sum = pred[col] + ref[col];
+      comp_pred[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
+      ++col;
+    }
+    ref += ref_stride;
+    pred += width;
+    comp_pred += width;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+/* High-bit-depth counterpart of variance(): a8 / b8 are converted to
+ * uint16_t sample pointers with CONVERT_TO_SHORTPTR, and the sum of
+ * differences and the sum of squared differences over the w x h block are
+ * accumulated in 64 bits.
+ * Note: the sum accumulator is unsigned, so a negative total is held modulo
+ * 2^64; the callers in this file narrow it back to int. */
+static void highbd_variance64(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride,
+                              int w, int h, uint64_t *sse, uint64_t *sum) {
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint64_t sq_total = 0;
+  uint64_t diff_total = 0;
+  int row, col;
+
+  for (row = 0; row < h; ++row, a += a_stride, b += b_stride) {
+    for (col = 0; col < w; ++col) {
+      const int delta = a[col] - b[col];
+      diff_total += delta;
+      sq_total += delta * delta;
+    }
+  }
+
+  *sum = diff_total;
+  *sse = sq_total;
+}
+
+/* 8-bit wrapper around highbd_variance64(): narrows the 64-bit totals to
+ * uint32_t (*sse) and int (*sum) without any scaling. */
+static void highbd_8_variance(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride,
+                              int w, int h, uint32_t *sse, int *sum) {
+  uint64_t wide_sse = 0;
+  uint64_t wide_sum = 0;
+
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &wide_sse, &wide_sum);
+  *sum = (int)wide_sum;
+  *sse = (uint32_t)wide_sse;
+}
+
+static void highbd_10_variance(const uint8_t *a8, int  a_stride,
+                               const uint8_t *b8, int  b_stride,
+                               int w, int h, uint32_t *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+}
+
+static void highbd_12_variance(const uint8_t *a8, int  a_stride,
+                               const uint8_t *b8, int  b_stride,
+                               int w, int h, uint32_t *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+}
+
+/* HIGHBD_VAR(W, H) defines vpx_highbd_{8,10,12}_variance<W>x<H>_c(). Each
+ * stores the (bit-depth scaled) sum of squared differences in *sse and
+ * returns *sse - sum^2 / (W * H).
+ *
+ * For 10- and 12-bit input, highbd_10_variance() / highbd_12_variance() round
+ * *sse and sum independently, so sum^2 / (W * H) can end up slightly larger
+ * than *sse. The difference is therefore formed in 64 bits and clamped at
+ * zero; otherwise the negative value would wrap to a huge uint32_t. The 8-bit
+ * variant applies no rounding and is left as a plain subtraction. */
+#define HIGHBD_VAR(W, H) \
+uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
+                                            int a_stride, \
+                                            const uint8_t *b, \
+                                            int b_stride, \
+                                            uint32_t *sse) { \
+  int sum; \
+  highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
+                                             int a_stride, \
+                                             const uint8_t *b, \
+                                             int b_stride, \
+                                             uint32_t *sse) { \
+  int sum; \
+  int64_t var; \
+  highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+} \
+\
+uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
+                                             int a_stride, \
+                                             const uint8_t *b, \
+                                             int b_stride, \
+                                             uint32_t *sse) { \
+  int sum; \
+  int64_t var; \
+  highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+}
+/*
+ * HIGHBD_GET_VAR(S) defines vpx_highbd_{8,10,12}_get<S>x<S>var_c() for a
+ * square SxS block. Each is a direct call to the matching
+ * highbd_{8,10,12}_variance() helper, which writes *sse and *sum; nothing is
+ * returned.
+ */
+#define HIGHBD_GET_VAR(S) \
+void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                      const uint8_t *ref, int ref_stride, \
+                                      uint32_t *sse, int *sum) { \
+  highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       uint32_t *sse, int *sum) { \
+  highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       uint32_t *sse, int *sum) { \
+  highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+}
+/*
+ * HIGHBD_MSE(W, H) defines vpx_highbd_{8,10,12}_mse<W>x<H>_c(). Each calls
+ * the matching highbd_{8,10,12}_variance() helper and returns *sse; the sum
+ * is stored in a local and not used afterwards.
+ */
+#define HIGHBD_MSE(W, H) \
+uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
+                                       int src_stride, \
+                                       const uint8_t *ref, \
+                                       int ref_stride, \
+                                       uint32_t *sse) { \
+  int sum; \
+  highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+} \
+\
+uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
+                                        int src_stride, \
+                                        const uint8_t *ref, \
+                                        int ref_stride, \
+                                        uint32_t *sse) { \
+  int sum; \
+  highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+} \
+\
+uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
+                                        int src_stride, \
+                                        const uint8_t *ref, \
+                                        int ref_stride, \
+                                        uint32_t *sse) { \
+  int sum; \
+  highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+}
+
+/* High-bit-depth first pass of the 2-D separable bilinear filter. src_ptr8 is
+ * converted to a uint16_t sample pointer with CONVERT_TO_SHORTPTR; each output
+ * sample is the rounded 2-tap combination of the input at the current
+ * position and the input pixel_step elements further on. Output rows are
+ * written output_width elements apart. */
+static void highbd_var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr8,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const uint8_t *filter) {
+  const uint16_t *row_start = CONVERT_TO_SHORTPTR(src_ptr8);
+  unsigned int row, col;
+
+  for (row = 0; row < output_height; ++row) {
+    const uint16_t *in = row_start;
+
+    for (col = 0; col < output_width; ++col, ++in) {
+      const int tap0 = (int)in[0] * filter[0];
+      const int tap1 = (int)in[pixel_step] * filter[1];
+      output_ptr[col] = ROUND_POWER_OF_TWO(tap0 + tap1, FILTER_BITS);
+    }
+
+    /* `in` is output_width past the row start; move on to the next row. */
+    row_start = in + (src_pixels_per_line - output_width);
+    output_ptr += output_width;
+  }
+}
+
+/* High-bit-depth second pass of the 2-D separable bilinear filter. Reads the
+ * uint16_t intermediate block and writes uint16_t output; each output sample
+ * is the rounded 2-tap combination of the input at the current position and
+ * the input pixel_step elements further on. Output rows are written
+ * output_width elements apart. */
+static void highbd_var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const uint8_t *filter) {
+  unsigned int row, col;
+
+  for (row = 0; row < output_height; ++row) {
+    const uint16_t *in = src_ptr;
+
+    for (col = 0; col < output_width; ++col, ++in) {
+      const int tap0 = (int)in[0] * filter[0];
+      const int tap1 = (int)in[pixel_step] * filter[1];
+      output_ptr[col] = ROUND_POWER_OF_TWO(tap0 + tap1, FILTER_BITS);
+    }
+
+    /* `in` is output_width past the row start; move on to the next row. */
+    src_ptr = in + (src_pixels_per_line - output_width);
+    output_ptr += output_width;
+  }
+}
+
+/* HIGHBD_SUBPIX_VAR(W, H) defines
+ * vpx_highbd_{8,10,12}_sub_pixel_variance<W>x<H>_c(). Each filters `src`
+ * horizontally with bilinear_filters[xoffset], then vertically with
+ * bilinear_filters[yoffset], and returns the matching high-bit-depth variance
+ * of the filtered block against `dst`. */
+#define HIGHBD_SUBPIX_VAR(W, H) \
+uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int src_stride, \
+  int xoffset, int yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse) { \
+  /* H + 1 rows: the vertical pass also reads the row below each output. */ \
+  uint16_t horiz_pass[(H + 1) * W]; \
+  uint16_t filtered[H * W]; \
+  const uint8_t *const x_taps = bilinear_filters[xoffset]; \
+  const uint8_t *const y_taps = bilinear_filters[yoffset]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, horiz_pass, src_stride, 1, \
+                                           H + 1, W, x_taps); \
+  highbd_var_filter_block2d_bil_second_pass(horiz_pass, filtered, W, W, H, W, \
+                                            y_taps); \
+\
+  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(filtered), W, \
+                                            dst, dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int src_stride, \
+  int xoffset, int yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse) { \
+  uint16_t horiz_pass[(H + 1) * W]; \
+  uint16_t filtered[H * W]; \
+  const uint8_t *const x_taps = bilinear_filters[xoffset]; \
+  const uint8_t *const y_taps = bilinear_filters[yoffset]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, horiz_pass, src_stride, 1, \
+                                           H + 1, W, x_taps); \
+  highbd_var_filter_block2d_bil_second_pass(horiz_pass, filtered, W, W, H, W, \
+                                            y_taps); \
+\
+  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(filtered), W, \
+                                             dst, dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int src_stride, \
+  int xoffset, int yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse) { \
+  uint16_t horiz_pass[(H + 1) * W]; \
+  uint16_t filtered[H * W]; \
+  const uint8_t *const x_taps = bilinear_filters[xoffset]; \
+  const uint8_t *const y_taps = bilinear_filters[yoffset]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, horiz_pass, src_stride, 1, \
+                                           H + 1, W, x_taps); \
+  highbd_var_filter_block2d_bil_second_pass(horiz_pass, filtered, W, W, H, W, \
+                                            y_taps); \
+\
+  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(filtered), W, \
+                                             dst, dst_stride, sse); \
+}
+
+/* HIGHBD_SUBPIX_AVG_VAR(W, H) defines
+ * vpx_highbd_{8,10,12}_sub_pixel_avg_variance<W>x<H>_c(). Each filters `src`
+ * as in HIGHBD_SUBPIX_VAR, averages the filtered block with second_pred via
+ * vpx_highbd_comp_avg_pred(), and returns the matching high-bit-depth
+ * variance of the averaged block against `dst`. */
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int src_stride, \
+  int xoffset, int yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse, \
+  const uint8_t *second_pred) { \
+  /* H + 1 rows: the vertical pass also reads the row below each output. */ \
+  uint16_t horiz_pass[(H + 1) * W]; \
+  uint16_t filtered[H * W]; \
+  DECLARE_ALIGNED(16, uint16_t, averaged[H * W]); \
+  const uint8_t *const x_taps = bilinear_filters[xoffset]; \
+  const uint8_t *const y_taps = bilinear_filters[yoffset]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, horiz_pass, src_stride, 1, \
+                                           H + 1, W, x_taps); \
+  highbd_var_filter_block2d_bil_second_pass(horiz_pass, filtered, W, W, H, W, \
+                                            y_taps); \
+\
+  vpx_highbd_comp_avg_pred(averaged, second_pred, W, H, \
+                           CONVERT_TO_BYTEPTR(filtered), W); \
+\
+  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(averaged), W, \
+                                            dst, dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int src_stride, \
+  int xoffset, int yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t horiz_pass[(H + 1) * W]; \
+  uint16_t filtered[H * W]; \
+  DECLARE_ALIGNED(16, uint16_t, averaged[H * W]); \
+  const uint8_t *const x_taps = bilinear_filters[xoffset]; \
+  const uint8_t *const y_taps = bilinear_filters[yoffset]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, horiz_pass, src_stride, 1, \
+                                           H + 1, W, x_taps); \
+  highbd_var_filter_block2d_bil_second_pass(horiz_pass, filtered, W, W, H, W, \
+                                            y_taps); \
+\
+  vpx_highbd_comp_avg_pred(averaged, second_pred, W, H, \
+                           CONVERT_TO_BYTEPTR(filtered), W); \
+\
+  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(averaged), W, \
+                                             dst, dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int src_stride, \
+  int xoffset, int yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t horiz_pass[(H + 1) * W]; \
+  uint16_t filtered[H * W]; \
+  DECLARE_ALIGNED(16, uint16_t, averaged[H * W]); \
+  const uint8_t *const x_taps = bilinear_filters[xoffset]; \
+  const uint8_t *const y_taps = bilinear_filters[yoffset]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, horiz_pass, src_stride, 1, \
+                                           H + 1, W, x_taps); \
+  highbd_var_filter_block2d_bil_second_pass(horiz_pass, filtered, W, W, H, W, \
+                                            y_taps); \
+\
+  vpx_highbd_comp_avg_pred(averaged, second_pred, W, H, \
+                           CONVERT_TO_BYTEPTR(filtered), W); \
+\
+  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(averaged), W, \
+                                             dst, dst_stride, sse); \
+}
+
+/* All three forms of the variance are available in the same sizes:
+ * HIGHBD_VARIANCES(W, H) expands to the plain (HIGHBD_VAR), sub-pixel
+ * (HIGHBD_SUBPIX_VAR) and sub-pixel averaging (HIGHBD_SUBPIX_AVG_VAR)
+ * functions for one WxH block size, each in 8-, 10- and 12-bit variants.
+ * HIGHBD_GET_VAR and HIGHBD_MSE are instantiated below for fewer sizes. */
+#define HIGHBD_VARIANCES(W, H) \
+    HIGHBD_VAR(W, H) \
+    HIGHBD_SUBPIX_VAR(W, H) \
+    HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
+
+/* High-bit-depth version of vpx_comp_avg_pred_c(): pred8 and ref8 are
+ * converted to uint16_t sample pointers with CONVERT_TO_SHORTPTR and their
+ * rounded per-sample average is written to comp_pred. comp_pred and pred
+ * advance by `width` per row; ref advances by ref_stride per row. */
+void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+                              int width, int height, const uint8_t *ref8,
+                              int ref_stride) {
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  int row;
+
+  for (row = 0; row < height; ++row) {
+    int col = 0;
+    while (col < width) {
+      const int pair_sum = pred[col] + ref[col];
+      comp_pred[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
+      ++col;
+    }
+    ref += ref_stride;
+    pred += width;
+    comp_pred += width;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vpx_dsp/variance.h b/libs/libvpx/vpx_dsp/variance.h
new file mode 100644
index 0000000000..cd0fd98785
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/variance.h
@@ -0,0 +1,94 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_VARIANCE_H_
+#define VPX_DSP_VARIANCE_H_
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Fixed-point scale of the filter taps: FILTER_WEIGHT == 1 << FILTER_BITS. */
+#define FILTER_BITS 7
+#define FILTER_WEIGHT 128
+/* Two strided 8-bit blocks in, unsigned int out (a SAD, going by the name). */
+typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
+                                    const uint8_t *b_ptr, int b_stride);
+/* As vpx_sad_fn_t, plus a second_pred block that has no stride parameter. */
+typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
+                                        const uint8_t *b_ptr, int b_stride,
+                                        const uint8_t *second_pred);
+/* Strided read-only block a, strided writable block b, and a count n. */
+typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride,
+                                  uint8_t *b, int b_stride, int n);
+/* Returns nothing; results are written through sad_array. */
+typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride,
+                                   const uint8_t *b, int b_stride,
+                                   unsigned int *sad_array);
+/* As vpx_sad_multi_fn_t, but b is an array of pointers sharing one b_stride. */
+typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
+                                     const uint8_t *const b_array[],
+                                     int b_stride,
+                                     unsigned int *sad_array);
+/* Two strided blocks in; returns unsigned int and also writes through sse. */
+typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride,
+                                          const uint8_t *b, int b_stride,
+                                          unsigned int *sse);
+/* vpx_variance_fn_t plus xoffset / yoffset arguments. */
+typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
+                                                int xoffset, int yoffset,
+                                                const uint8_t *b, int b_stride,
+                                                unsigned int *sse);
+/* vpx_subpixvariance_fn_t plus a second_pred block. */
+typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr,
+                                                   int a_stride,
+                                                   int xoffset, int yoffset,
+                                                   const uint8_t *b_ptr,
+                                                   int b_stride,
+                                                   unsigned int *sse,
+                                                   const uint8_t *second_pred);
+#if CONFIG_VP8
+typedef struct variance_vtable {  /* function-pointer table, VP8 builds only */
+  vpx_sad_fn_t            sdf;
+  vpx_variance_fn_t       vf;
+  vpx_subpixvariance_fn_t svf;
+  vpx_variance_fn_t       svf_halfpix_h;   /* the half-pixel entries use the */
+  vpx_variance_fn_t       svf_halfpix_v;   /* plain variance signature: no   */
+  vpx_variance_fn_t       svf_halfpix_hv;  /* xoffset / yoffset arguments    */
+  vpx_sad_multi_fn_t      sdx3f;
+  vpx_sad_multi_fn_t      sdx8f;
+  vpx_sad_multi_d_fn_t    sdx4df;
+#if ARCH_X86 || ARCH_X86_64
+  vp8_copy32xn_fn_t       copymem;  /* member exists only on x86 / x86-64 */
+#endif
+} vp8_variance_fn_ptr_t;
+#endif  // CONFIG_VP8
+/*
+ * Function-pointer table compiled when CONFIG_VP9 or CONFIG_VP10 is set.
+ * Compared with vp8_variance_fn_ptr_t it adds the second_pred-taking entries
+ * (sdaf, svaf) and has no half-pixel or copymem members.
+ */
+#if CONFIG_VP9 || CONFIG_VP10
+typedef struct vp9_variance_vtable {
+  vpx_sad_fn_t               sdf;
+  vpx_sad_avg_fn_t           sdaf;
+  vpx_variance_fn_t          vf;
+  vpx_subpixvariance_fn_t    svf;
+  vpx_subp_avg_variance_fn_t svaf;
+  vpx_sad_multi_fn_t         sdx3f;
+  vpx_sad_multi_fn_t         sdx8f;
+  vpx_sad_multi_d_fn_t       sdx4df;
+} vp9_variance_fn_ptr_t;
+#endif  // CONFIG_VP9 || CONFIG_VP10
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_VARIANCE_H_
diff --git a/libs/libvpx/vpx_dsp/vpx_convolve.c b/libs/libvpx/vpx_dsp/vpx_convolve.c
new file mode 100644
index 0000000000..2d1c927cbe
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/vpx_convolve.c
@@ -0,0 +1,612 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+// Horizontal pass: filters each output pixel from SUBPEL_TAPS source pixels.
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *x_filters,
+                           int x0_q4, int x_step_q4, int w, int h) {
+  int row, col, tap;
+  // The filter window starts SUBPEL_TAPS / 2 - 1 pixels before src.
+  const uint8_t *in = src - (SUBPEL_TAPS / 2 - 1);
+  for (row = 0; row < h; ++row, in += src_stride, dst += dst_stride) {
+    // pos_q4 walks along the row; its low bits select the kernel to apply.
+    int pos_q4 = x0_q4;
+    for (col = 0; col < w; ++col, pos_q4 += x_step_q4) {
+      const uint8_t *const window = in + (pos_q4 >> SUBPEL_BITS);
+      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap)
+        acc += window[tap] * kernel[tap];
+      dst[col] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
+    }
+  }
+}
+
+// Horizontal pass whose result is averaged into the pixels already in dst.
+static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel *x_filters,
+                               int x0_q4, int x_step_q4, int w, int h) {
+  int row, col, tap;
+  // The filter window starts SUBPEL_TAPS / 2 - 1 pixels before src.
+  const uint8_t *in = src - (SUBPEL_TAPS / 2 - 1);
+  for (row = 0; row < h; ++row, in += src_stride, dst += dst_stride) {
+    int pos_q4 = x0_q4;  // low bits of pos_q4 select the kernel
+    for (col = 0; col < w; ++col, pos_q4 += x_step_q4) {
+      const uint8_t *const window = in + (pos_q4 >> SUBPEL_BITS);
+      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap)
+        acc += window[tap] * kernel[tap];
+      // Combine the old dst value with the clipped filter output.
+      dst[col] = ROUND_POWER_OF_TWO(
+          dst[col] + clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS)), 1);
+    }
+  }
+}
+
+// Vertical pass: walks the block one column at a time, top to bottom, and
+// filters each output pixel from SUBPEL_TAPS vertically adjacent pixels.
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *y_filters,
+                          int y0_q4, int y_step_q4, int w, int h) {
+  int col, row, tap;
+  // The filter window starts SUBPEL_TAPS / 2 - 1 rows above src.
+  const uint8_t *in = src - src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (col = 0; col < w; ++col, ++in, ++dst) {
+    int pos_q4 = y0_q4;  // low bits of pos_q4 select the kernel
+    for (row = 0; row < h; ++row, pos_q4 += y_step_q4) {
+      const uint8_t *const window = in + (pos_q4 >> SUBPEL_BITS) * src_stride;
+      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap)
+        acc += window[tap * src_stride] * kernel[tap];
+      dst[row * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
+    }
+  }
+}
+
+// Vertical pass whose result is averaged into the pixels already in dst.
+static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *y_filters,
+                              int y0_q4, int y_step_q4, int w, int h) {
+  int col, row, tap;
+  // The filter window starts SUBPEL_TAPS / 2 - 1 rows above src.
+  const uint8_t *in = src - src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (col = 0; col < w; ++col, ++in, ++dst) {
+    int pos_q4 = y0_q4;  // low bits of pos_q4 select the kernel
+    for (row = 0; row < h; ++row, pos_q4 += y_step_q4) {
+      const uint8_t *const window = in + (pos_q4 >> SUBPEL_BITS) * src_stride;
+      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
+      uint8_t *const out = &dst[row * dst_stride];
+      int acc = 0;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap)
+        acc += window[tap * src_stride] * kernel[tap];
+      // Combine the old dst value with the clipped filter output.
+      *out = ROUND_POWER_OF_TWO(
+          *out + clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS)), 1);
+    }
+  }
+}
+
+static void convolve(const uint8_t *src, ptrdiff_t src_stride,
+                     uint8_t *dst, ptrdiff_t dst_stride,
+                     const InterpKernel *const x_filters,
+                     int x0_q4, int x_step_q4,
+                     const InterpKernel *const y_filters,
+                     int y0_q4, int y_step_q4,
+                     int w, int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --ceil(((64 - 1) * 32 + 15) / 16) + 8 = 135 (floor via >> gives 134).
+  uint8_t temp[135 * 64];  // 135 rows with a fixed row stride of 64
+  int intermediate_height =
+          (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);  // temp's row stride
+  assert(h <= 64);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+                 x_filters, x0_q4, x_step_q4, w, intermediate_height);
+  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+                y_filters, y0_q4, y_step_q4, w, h);  // vert rewinds to row 0
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+  // NOTE: Clears the low 8 address bits, so the table must be 256-byte aligned.
+  // TODO(agrange) Modify to make independent of table alignment.
+  const intptr_t table_addr = (intptr_t)filter & ~(intptr_t)0xFF;
+  return (const InterpKernel *)table_addr;
+}
+
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+  return (int)((const InterpKernel *)(intptr_t)f - base);  // in kernel units
+}
+
+// Horizontal-only filter; filter_y and y_step_q4 are accepted but unused.
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4,
+                           int w, int h) {
+  // Split filter_x into its kernel table and the index of the first kernel.
+  const InterpKernel *const kernels = get_filter_base(filter_x);
+  const int start_q4 = get_filter_offset(filter_x, kernels);
+  (void)filter_y;  (void)y_step_q4;
+
+  convolve_horiz(src, src_stride, dst, dst_stride,
+                 kernels, start_q4, x_step_q4, w, h);
+}
+
+// Horizontal-only filter averaged into dst; filter_y/y_step_q4 are unused.
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h) {
+  // Split filter_x into its kernel table and the index of the first kernel.
+  const InterpKernel *const kernels = get_filter_base(filter_x);
+  const int start_q4 = get_filter_offset(filter_x, kernels);
+  (void)filter_y;  (void)y_step_q4;
+
+  convolve_avg_horiz(src, src_stride, dst, dst_stride,
+                     kernels, start_q4, x_step_q4, w, h);
+}
+
+// Vertical-only filter; filter_x and x_step_q4 are accepted but unused.
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const int16_t *filter_x, int x_step_q4,
+                          const int16_t *filter_y, int y_step_q4,
+                          int w, int h) {
+  // Split filter_y into its kernel table and the index of the first kernel.
+  const InterpKernel *const kernels = get_filter_base(filter_y);
+  const int start_q4 = get_filter_offset(filter_y, kernels);
+  (void)filter_x;  (void)x_step_q4;
+
+  convolve_vert(src, src_stride, dst, dst_stride,
+                kernels, start_q4, y_step_q4, w, h);
+}
+
+// Vertical-only filter averaged into dst; filter_x/x_step_q4 are unused.
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+  // Split filter_y into its kernel table and the index of the first kernel.
+  const InterpKernel *const kernels = get_filter_base(filter_y);
+  const int start_q4 = get_filter_offset(filter_y, kernels);
+  (void)filter_x;  (void)x_step_q4;
+
+  convolve_avg_vert(src, src_stride, dst, dst_stride,
+                    kernels, start_q4, y_step_q4, w, h);
+}
+
+// 2-D filter entry point: resolves both kernel tables, then calls convolve().
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+                     uint8_t *dst, ptrdiff_t dst_stride,
+                     const int16_t *filter_x, int x_step_q4,
+                     const int16_t *filter_y, int y_step_q4,
+                     int w, int h) {
+  // For each axis: the kernel table and the index of the first kernel in it.
+  const InterpKernel *const kernels_x = get_filter_base(filter_x);
+  const InterpKernel *const kernels_y = get_filter_base(filter_y);
+  const int start_x_q4 = get_filter_offset(filter_x, kernels_x);
+  const int start_y_q4 = get_filter_offset(filter_y, kernels_y);
+
+  convolve(src, src_stride, dst, dst_stride, kernels_x, start_x_q4, x_step_q4,
+           kernels_y, start_y_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  // Filter into a 64x64 scratch block, then average that block into dst.
+  // The fixed scratch size is why w and h must not exceed 64.
+  DECLARE_ALIGNED(16, uint8_t, filtered[64 * 64]);
+  assert(w <= 64);
+  assert(h <= 64);
+  vpx_convolve8_c(src, src_stride, filtered, 64, filter_x, x_step_q4,
+                  filter_y, y_step_q4, w, h);
+  vpx_convolve_avg_c(filtered, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+}
+
+// Copies a w x h block row by row; all four filter arguments are ignored.
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int filter_x_stride,
+                         const int16_t *filter_y, int filter_y_stride,
+                         int w, int h) {
+  int row;
+  (void)filter_x;
+  (void)filter_x_stride;
+  (void)filter_y;
+  (void)filter_y_stride;
+
+  // One memcpy per row, since src and dst each advance by their own stride.
+  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride)
+    memcpy(dst, src, w);
+}
+
+// Averages src into dst pixel by pixel; all filter arguments are ignored.
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int filter_x_stride,
+                        const int16_t *filter_y, int filter_y_stride,
+                        int w, int h) {
+  int row, col;
+  (void)filter_x;
+  (void)filter_x_stride;
+  (void)filter_y;
+  (void)filter_y_stride;
+  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
+    for (col = 0; col < w; ++col) {
+      const int pair_sum = dst[col] + src[col];
+      dst[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
+    }
+  }
+}
+
+// Forwards every argument unchanged to vpx_convolve8_horiz_c().
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int x_step_q4,
+                        const int16_t *filter_y, int y_step_q4, int w, int h) {
+  vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+// Forwards every argument unchanged to vpx_convolve8_vert_c().
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                       uint8_t *dst, ptrdiff_t dst_stride,
+                       const int16_t *filter_x, int x_step_q4,
+                       const int16_t *filter_y, int y_step_q4, int w, int h) {
+  vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                       filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+// Forwards every argument unchanged to vpx_convolve8_c().
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride,
+                     uint8_t *dst, ptrdiff_t dst_stride,
+                     const int16_t *filter_x, int x_step_q4,
+                     const int16_t *filter_y, int y_step_q4, int w, int h) {
+  vpx_convolve8_c(src, src_stride, dst, dst_stride,
+                  filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h) {  // forwards all arguments as-is
+  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                            filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4,
+                           int w, int h) {  // forwards all arguments as-is
+  vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                           filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+// Forwards every argument unchanged to vpx_convolve8_avg_c().
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4, int w, int h) {
+  vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                      filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// High-bitdepth horizontal pass: works on uint16_t samples and clips each
+// filtered value with clip_pixel_highbd() using bd.
+static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+                                  uint8_t *dst8, ptrdiff_t dst_stride,
+                                  const InterpKernel *x_filters,
+                                  int x0_q4, int x_step_q4,
+                                  int w, int h, int bd) {
+  int row, col, tap;
+  // CONVERT_TO_SHORTPTR yields the uint16_t view of the uint8_t-typed pointers.
+  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  in -= SUBPEL_TAPS / 2 - 1;  // window starts this many samples before src
+  for (row = 0; row < h; ++row, in += src_stride, out += dst_stride) {
+    int pos_q4 = x0_q4;  // low bits of pos_q4 select the kernel
+    for (col = 0; col < w; ++col, pos_q4 += x_step_q4) {
+      const uint16_t *const window = in + (pos_q4 >> SUBPEL_BITS);
+      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap)
+        acc += window[tap] * kernel[tap];
+      out[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
+    }
+  }
+}
+
+// High-bitdepth horizontal pass whose clipped result is averaged into the
+// uint16_t samples already present in the destination.
+static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+                                      uint8_t *dst8, ptrdiff_t dst_stride,
+                                      const InterpKernel *x_filters,
+                                      int x0_q4, int x_step_q4,
+                                      int w, int h, int bd) {
+  int row, col, tap;
+  // CONVERT_TO_SHORTPTR yields the uint16_t view of the uint8_t-typed pointers.
+  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  in -= SUBPEL_TAPS / 2 - 1;  // window starts this many samples before src
+  for (row = 0; row < h; ++row, in += src_stride, out += dst_stride) {
+    int pos_q4 = x0_q4;  // low bits of pos_q4 select the kernel
+    for (col = 0; col < w; ++col, pos_q4 += x_step_q4) {
+      const uint16_t *const window = in + (pos_q4 >> SUBPEL_BITS);
+      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap)
+        acc += window[tap] * kernel[tap];
+      out[col] = ROUND_POWER_OF_TWO(out[col] +
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd), 1);
+    }
+  }
+}
+
+// High-bitdepth vertical pass: walks the block column by column over
+// uint16_t samples and clips each result with clip_pixel_highbd() using bd.
+static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
+                                 uint8_t *dst8, ptrdiff_t dst_stride,
+                                 const InterpKernel *y_filters,
+                                 int y0_q4, int y_step_q4, int w, int h,
+                                 int bd) {
+  int col, row, tap;
+  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  in -= src_stride * (SUBPEL_TAPS / 2 - 1);  // window starts above src
+  for (col = 0; col < w; ++col, ++in, ++out) {
+    int pos_q4 = y0_q4;  // low bits of pos_q4 select the kernel
+    for (row = 0; row < h; ++row, pos_q4 += y_step_q4) {
+      const uint16_t *const window = in + (pos_q4 >> SUBPEL_BITS) * src_stride;
+      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap)
+        acc += window[tap * src_stride] * kernel[tap];
+      out[row * dst_stride] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
+    }
+  }
+}
+
+// High-bitdepth vertical pass averaged into the existing destination samples.
+static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
+                                     uint8_t *dst8, ptrdiff_t dst_stride,
+                                     const InterpKernel *y_filters,
+                                     int y0_q4, int y_step_q4, int w, int h,
+                                     int bd) {
+  int col, row, tap;
+  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  in -= src_stride * (SUBPEL_TAPS / 2 - 1);  // window starts above src
+  for (col = 0; col < w; ++col, ++in, ++out) {
+    int pos_q4 = y0_q4;  // low bits of pos_q4 select the kernel
+    for (row = 0; row < h; ++row, pos_q4 += y_step_q4) {
+      const uint16_t *const window = in + (pos_q4 >> SUBPEL_BITS) * src_stride;
+      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
+      uint16_t *const px = &out[row * dst_stride];
+      int acc = 0;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap)
+        acc += window[tap * src_stride] * kernel[tap];
+      *px = ROUND_POWER_OF_TWO(
+          *px + clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd), 1);
+    }
+  }
+}
+
+static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *const x_filters,
+                            int x0_q4, int x_step_q4,
+                            const InterpKernel *const y_filters,
+                            int y0_q4, int y_step_q4,
+                            int w, int h, int bd) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --ceil(((64 - 1) * 32 + 15) / 16) + 8 = 135 (floor via >> gives 134).
+  uint16_t temp[64 * 135];  // 135 rows with a fixed row stride of 64
+  int intermediate_height =
+          (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);  // temp's row stride
+  assert(h <= 64);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                        src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+                        x_filters, x0_q4, x_step_q4, w,
+                        intermediate_height, bd);
+  highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
+                       64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
+                       w, h, bd);  // vert rewinds by the same row count
+}
+
+
+// High-bitdepth horizontal-only filter; filter_y and y_step_q4 are unused.
+void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h, int bd) {
+  const InterpKernel *const kernels = get_filter_base(filter_x);
+  const int start_q4 = get_filter_offset(filter_x, kernels);
+  (void)filter_y;  (void)y_step_q4;
+
+  highbd_convolve_horiz(src, src_stride, dst, dst_stride,
+                        kernels, start_q4, x_step_q4, w, h, bd);
+}
+
+// High-bitdepth horizontal-only averaging filter; filter_y args are unused.
+void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h, int bd) {
+  const InterpKernel *const kernels = get_filter_base(filter_x);
+  const int start_q4 = get_filter_offset(filter_x, kernels);
+  (void)filter_y;  (void)y_step_q4;
+
+  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride,
+                            kernels, start_q4, x_step_q4, w, h, bd);
+}
+
+// High-bitdepth vertical-only filter; filter_x and x_step_q4 are unused.
+void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x, int x_step_q4,
+                                 const int16_t *filter_y, int y_step_q4,
+                                 int w, int h, int bd) {
+  const InterpKernel *const kernels = get_filter_base(filter_y);
+  const int start_q4 = get_filter_offset(filter_y, kernels);
+  (void)filter_x;  (void)x_step_q4;
+
+  highbd_convolve_vert(src, src_stride, dst, dst_stride,
+                       kernels, start_q4, y_step_q4, w, h, bd);
+}
+
+// High-bitdepth vertical-only averaging filter; filter_x args are unused.
+void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h, int bd) {
+  const InterpKernel *const kernels = get_filter_base(filter_y);
+  const int start_q4 = get_filter_offset(filter_y, kernels);
+  (void)filter_x;  (void)x_step_q4;
+
+  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride,
+                           kernels, start_q4, y_step_q4, w, h, bd);
+}
+
+// High-bitdepth 2-D entry point: resolves both tables, calls highbd_convolve().
+void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h, int bd) {
+  const InterpKernel *const kernels_x = get_filter_base(filter_x);
+  const InterpKernel *const kernels_y = get_filter_base(filter_y);
+  const int start_x_q4 = get_filter_offset(filter_x, kernels_x);
+  const int start_y_q4 = get_filter_offset(filter_y, kernels_y);
+
+  highbd_convolve(src, src_stride, dst, dst_stride,
+                  kernels_x, start_x_q4, x_step_q4,
+                  kernels_y, start_y_q4, y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const int16_t *filter_x, int x_step_q4,
+                                const int16_t *filter_y, int y_step_q4,
+                                int w, int h, int bd) {
+  // Filter into a 64x64 uint16_t scratch block, then average it into dst.
+  // The fixed scratch size is why w and h must not exceed 64.
+  DECLARE_ALIGNED(16, uint16_t, filtered[64 * 64]);
+  assert(w <= 64);
+  assert(h <= 64);
+  vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(filtered), 64,
+                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
+  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(filtered), 64, dst, dst_stride,
+                            NULL, 0, NULL, 0, w, h, bd);
+}
+
+// Copies a w x h block of uint16_t samples; filter arguments and bd are unused.
+void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
+                                uint8_t *dst8, ptrdiff_t dst_stride,
+                                const int16_t *filter_x, int filter_x_stride,
+                                const int16_t *filter_y, int filter_y_stride,
+                                int w, int h, int bd) {
+  int row;
+  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  const size_t row_bytes = w * sizeof(uint16_t);
+  (void)filter_x;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_y_stride;
+  (void)bd;
+
+  // One memcpy per row, since in and out each advance by their own stride.
+  for (row = 0; row < h; ++row, in += src_stride, out += dst_stride)
+    memcpy(out, in, row_bytes);
+}
+
+// Averages uint16_t source samples into dst; filter arguments and bd unused.
+void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
+                               uint8_t *dst8, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int filter_x_stride,
+                               const int16_t *filter_y, int filter_y_stride,
+                               int w, int h, int bd) {
+  int row, col;
+  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  (void)filter_x;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_y_stride;
+  (void)bd;
+
+  for (row = 0; row < h; ++row, in += src_stride, out += dst_stride) {
+    for (col = 0; col < w; ++col) {
+      const int pair_sum = out[col] + in[col];
+      out[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
+    }
+  }
+}
+#endif
diff --git a/libs/libvpx/vpx_dsp/vpx_convolve.h b/libs/libvpx/vpx_dsp/vpx_convolve.h
new file mode 100644
index 0000000000..9ed3f1750f
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/vpx_convolve.h
@@ -0,0 +1,38 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_DSP_VPX_CONVOLVE_H_
+#define VPX_DSP_VPX_CONVOLVE_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Function-pointer type with the parameter list of the 8-bit convolve routines.
+typedef void (*convolve_fn_t)(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Same parameter list as convolve_fn_t plus one trailing int argument, bd.
+typedef void (*highbd_convolve_fn_t)(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h, int bd);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_VPX_CONVOLVE_H_
diff --git a/libs/libvpx/vpx_dsp/vpx_dsp.mk b/libs/libvpx/vpx_dsp/vpx_dsp.mk
new file mode 100644
index 0000000000..e394688c71
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/vpx_dsp.mk
@@ -0,0 +1,351 @@
+##
+## Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+DSP_SRCS-yes += vpx_dsp.mk
+DSP_SRCS-yes += vpx_dsp_common.h
+
+DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
+
+# prob.[ch] is always built; the bit writer and bit reader are gated below
+DSP_SRCS-yes += prob.h
+DSP_SRCS-yes += prob.c
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes += bitwriter.h
+DSP_SRCS-yes += bitwriter.c
+DSP_SRCS-yes += bitwriter_buffer.c
+DSP_SRCS-yes += bitwriter_buffer.h
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c
+endif  # CONFIG_ENCODERS
+
+ifeq ($(CONFIG_DECODERS),yes)
+DSP_SRCS-yes += bitreader.h
+DSP_SRCS-yes += bitreader.c
+DSP_SRCS-yes += bitreader_buffer.c
+DSP_SRCS-yes += bitreader_buffer.h
+endif  # CONFIG_DECODERS
+
+# intra predictions: intrapred.c always; x86inc asm, NEON, MSA, DSPR2 per flag
+DSP_SRCS-yes += intrapred.c
+
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
+# x86/vpx_subpixel_8t_ssse3.asm is listed once, with the interpolation filters.
+endif  # CONFIG_USE_X86INC
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE)  += x86/highbd_intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
+endif  # CONFIG_USE_X86INC
+endif  # CONFIG_VP9_HIGHBITDEPTH
+
+DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
+DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
+DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred4_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred16_dspr2.c
+
+DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.c
+
+# interpolation filters
+DSP_SRCS-yes += vpx_convolve.c
+DSP_SRCS-yes += vpx_convolve.h
+DSP_SRCS-yes += vpx_filter.h
+# The two ARCH_* values are concatenated: the suffix is "yes" only if one is yes and the other is empty.
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm
+DSP_SRCS-$(HAVE_AVX2)  += x86/vpx_subpixel_8t_intrin_avx2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_high_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_high_subpixel_bilinear_sse2.asm
+endif  # CONFIG_VP9_HIGHBITDEPTH
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_convolve_copy_sse2.asm
+endif  # CONFIG_USE_X86INC
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve_neon.c
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c
+DSP_SRCS-yes += arm/vpx_convolve8_avg_neon.c
+DSP_SRCS-yes += arm/vpx_convolve8_neon.c
+DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c
+DSP_SRCS-yes += arm/vpx_convolve_neon.c
+endif  # HAVE_NEON
+endif  # HAVE_NEON_ASM
+
+# common (msa)
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h
+
+# common (dspr2)
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve_common_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_avg_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_avg_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_vert_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_avg_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_avg_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_vert_dspr2.c
+
+# loop filters: loopfilter.c is always built; the rest is selected per target flag
+DSP_SRCS-yes += loopfilter.c
+# NOTE(review): loopfilter_sse2.c is keyed on the ARCH_* flags, not HAVE_SSE2 -- confirm intended.
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
+DSP_SRCS-$(HAVE_AVX2)                += x86/loopfilter_avx2.c
+DSP_SRCS-$(HAVE_MMX)                 += x86/loopfilter_mmx.asm
+
+DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes  += arm/loopfilter_mb_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_16_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_8_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_4_neon$(ASM)
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes   += arm/loopfilter_16_neon.c
+DSP_SRCS-yes   += arm/loopfilter_8_neon.c
+DSP_SRCS-yes   += arm/loopfilter_4_neon.c
+endif  # HAVE_NEON
+endif  # HAVE_NEON_ASM
+
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_msa.h
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_16_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_8_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_4_msa.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_filters_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_filters_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_macros_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_masks_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_vert_dspr2.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
+endif  # CONFIG_VP9_HIGHBITDEPTH
+
+DSP_SRCS-yes            += txfm_common.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/txfm_common_sse2.h
+DSP_SRCS-$(HAVE_MSA)    += mips/txfm_macros_msa.h
+# forward transform (only when the VP9 or VP10 encoder is enabled)
+ifneq ($(filter yes,$(CONFIG_VP9_ENCODER) $(CONFIG_VP10_ENCODER)),)
+DSP_SRCS-yes            += fwd_txfm.c
+DSP_SRCS-yes            += fwd_txfm.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_impl_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32x32_impl_sse2.h
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
+endif  # CONFIG_USE_X86INC
+endif  # ARCH_X86_64
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
+DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
+endif  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+
+# inverse transform (only when CONFIG_VP9 or CONFIG_VP10 is yes)
+ifneq ($(filter yes,$(CONFIG_VP9) $(CONFIG_VP10)),)
+DSP_SRCS-yes            += inv_txfm.h
+DSP_SRCS-yes            += inv_txfm.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.c
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/inv_wht_sse2.asm
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3_x86_64.asm
+endif  # ARCH_X86_64
+endif  # CONFIG_USE_X86INC
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes  += arm/save_reg_neon$(ASM)
+DSP_SRCS-yes  += arm/idct4x4_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct4x4_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct8x8_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct8x8_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct16x16_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct16x16_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct32x32_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct32x32_add_neon$(ASM)
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes  += arm/idct4x4_1_add_neon.c
+DSP_SRCS-yes  += arm/idct4x4_add_neon.c
+DSP_SRCS-yes  += arm/idct8x8_1_add_neon.c
+DSP_SRCS-yes  += arm/idct8x8_add_neon.c
+DSP_SRCS-yes  += arm/idct16x16_1_add_neon.c
+DSP_SRCS-yes  += arm/idct16x16_add_neon.c
+DSP_SRCS-yes  += arm/idct32x32_1_add_neon.c
+DSP_SRCS-yes  += arm/idct32x32_add_neon.c
+endif  # HAVE_NEON
+endif  # HAVE_NEON_ASM
+DSP_SRCS-$(HAVE_NEON)  += arm/idct16x16_neon.c
+
+DSP_SRCS-$(HAVE_MSA)   += mips/inv_txfm_msa.h
+DSP_SRCS-$(HAVE_MSA)   += mips/idct4x4_msa.c
+DSP_SRCS-$(HAVE_MSA)   += mips/idct8x8_msa.c
+DSP_SRCS-$(HAVE_MSA)   += mips/idct16x16_msa.c
+DSP_SRCS-$(HAVE_MSA)   += mips/idct32x32_msa.c
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
+endif  # !CONFIG_VP9_HIGHBITDEPTH
+endif  # CONFIG_VP9 || CONFIG_VP10
+
+# quantization (only when the VP9 or VP10 encoder is enabled)
+ifneq ($(filter yes, $(CONFIG_VP9_ENCODER) $(CONFIG_VP10_ENCODER)),)
+DSP_SRCS-yes            += quantize.c
+DSP_SRCS-yes            += quantize.h
+
+DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
+endif  # CONFIG_VP9_HIGHBITDEPTH
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3_x86_64.asm
+DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx_x86_64.asm
+endif  # CONFIG_USE_X86INC
+endif  # ARCH_X86_64
+
+# avg (still inside the encoder conditional opened above)
+DSP_SRCS-yes           += avg.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/avg_intrin_sse2.c
+DSP_SRCS-$(HAVE_NEON)  += arm/avg_neon.c
+DSP_SRCS-$(HAVE_MSA)   += mips/avg_msa.c
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
+endif  # CONFIG_USE_X86INC
+endif  # ARCH_X86_64
+
+endif  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+# sad and subtract (only when CONFIG_ENCODERS is yes)
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes            += sad.c
+DSP_SRCS-yes            += subtract.c
+
+DSP_SRCS-$(HAVE_MEDIA)  += arm/sad_media$(ASM)
+DSP_SRCS-$(HAVE_NEON)   += arm/sad4d_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/sad_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/subtract_neon.c
+
+DSP_SRCS-$(HAVE_MSA)    += mips/sad_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/subtract_msa.c
+
+DSP_SRCS-$(HAVE_MMX)    += x86/sad_mmx.asm
+DSP_SRCS-$(HAVE_SSE3)   += x86/sad_sse3.asm
+DSP_SRCS-$(HAVE_SSSE3)  += x86/sad_ssse3.asm
+DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
+DSP_SRCS-$(HAVE_AVX2)   += x86/sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/sad_avx2.c
+
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE)    += x86/sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE)    += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/subtract_sse2.asm
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+endif  # CONFIG_VP9_HIGHBITDEPTH
+endif  # CONFIG_USE_X86INC
+
+endif  # CONFIG_ENCODERS
+# variance (when any of CONFIG_ENCODERS, CONFIG_POSTPROC, CONFIG_VP9_POSTPROC is yes)
+ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
+DSP_SRCS-yes            += variance.c
+DSP_SRCS-yes            += variance.h
+
+DSP_SRCS-$(HAVE_MEDIA)  += arm/bilinear_filter_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/subpel_variance_media.c
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_h_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_hv_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_v_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_media$(ASM)
+DSP_SRCS-$(HAVE_NEON)   += arm/subpel_variance_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c
+
+DSP_SRCS-$(HAVE_MSA)    += mips/variance_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/sub_pixel_variance_msa.c
+
+DSP_SRCS-$(HAVE_MMX)    += x86/variance_mmx.c
+DSP_SRCS-$(HAVE_MMX)    += x86/variance_impl_mmx.asm
+DSP_SRCS-$(HAVE_SSE)    += x86/variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
+DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/variance_impl_avx2.c
+
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/ssim_opt_x86_64.asm
+endif  # ARCH_X86_64
+
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE)    += x86/subpel_variance_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/subpel_variance_sse2.asm  # Contains SSE2 and SSSE3
+endif  # CONFIG_USE_X86INC
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
+endif  # CONFIG_USE_X86INC
+endif  # CONFIG_VP9_HIGHBITDEPTH
+endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
+# Whatever was collected in DSP_SRCS_REMOVE-yes is appended to DSP_SRCS-no.
+DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
+
+DSP_SRCS-yes += vpx_dsp_rtcd.c
+DSP_SRCS-yes += vpx_dsp_rtcd_defs.pl
+# NOTE(review): rtcd_h_template is defined outside this file; presumably it generates vpx_dsp_rtcd.h -- confirm.
+$(eval $(call rtcd_h_template,vpx_dsp_rtcd,vpx_dsp/vpx_dsp_rtcd_defs.pl))
diff --git a/libs/libvpx/vpx_dsp/vpx_dsp_common.h b/libs/libvpx/vpx_dsp/vpx_dsp_common.h
new file mode 100644
index 0000000000..a9e180e793
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/vpx_dsp_common.h
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_COMMON_H_
+#define VPX_DSP_COMMON_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Minimum / maximum of two values. These are macros: an argument may be
+// evaluated twice, so do not pass expressions with side effects.
+#define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
+#define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Saturates val to [0, 255] and returns it as a uint8_t.
+static INLINE uint8_t clip_pixel(int val) {
+  return (uint8_t)((val > 255) ? 255 : (val < 0) ? 0 : val);
+}
+
+// Restricts value to [low, high]. The lower bound is tested first, so low is
+// returned whenever value < low, even if the caller passes low > high.
+static INLINE int clamp(int value, int low, int high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+// Same as clamp(), for doubles.
+static INLINE double fclamp(double value, double low, double high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Saturates val according to bd: [0, 1023] when bd is 10, [0, 4095] when bd
+// is 12, and [0, 255] when bd is 8 or any other value.
+static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
+  switch (bd) {
+    case 8:
+    default:
+      return (uint16_t)clamp(val, 0, 255);
+    case 10:
+      return (uint16_t)clamp(val, 0, 1023);
+    case 12:
+      return (uint16_t)clamp(val, 0, 4095);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_COMMON_H_
diff --git a/libs/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/libs/libvpx/vpx_dsp/vpx_dsp_rtcd.c
new file mode 100644
index 0000000000..5fe27b614b
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/vpx_dsp_rtcd.c
@@ -0,0 +1,21 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+// Runs setup_rtcd_internal through once().
+// NOTE(review): setup_rtcd_internal is presumably emitted by vpx_dsp_rtcd.h
+// when RTCD_C is defined, and once() presumably runs it a single time --
+// confirm in the generated header and vpx_ports/vpx_once.h.
+void vpx_dsp_rtcd(void) {
+  once(setup_rtcd_internal);
+}
diff --git a/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
new file mode 100644
index 0000000000..73726d217c
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -0,0 +1,1905 @@
+sub vpx_dsp_forward_decls() {  # prints the text between here and EOF verbatim
+print <<EOF
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+EOF
+}
+forward_decls qw/vpx_dsp_forward_decls/;  # passes the sub's name to forward_decls (defined outside this file)
+
+# x86inc.asm has specific constraints, so code that depends on it gets its own
+# variables (easy to disable). All are set to '' first to avoid else branches.
+$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =
+  $avx2_x86inc = '';
+$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =
+  $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';
+if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
+  $mmx_x86inc = 'mmx';
+  $sse_x86inc = 'sse';
+  $sse2_x86inc = 'sse2';
+  $ssse3_x86inc = 'ssse3';
+  $avx_x86inc = 'avx';
+  $avx2_x86inc = 'avx2';
+  if ($opts{arch} eq "x86_64") {
+    $mmx_x86_64_x86inc = 'mmx';
+    $sse_x86_64_x86inc = 'sse';
+    $sse2_x86_64_x86inc = 'sse2';
+    $ssse3_x86_64_x86inc = 'ssse3';
+    $avx_x86_64_x86inc = 'avx';
+    $avx2_x86_64_x86inc = 'avx2';
+  }
+}
+
+# Optimizations that depend on multiple features: 'avx2' only if HAVE_AVX2 and HAVE_SSSE3 are both "yes", else ''.
+$avx2_ssse3 = '';
+if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
+  $avx2_ssse3 = 'avx2';
+}
+
+# Functions that are 64-bit only: these are '' unless $opts{arch} is "x86_64".
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+  $mmx_x86_64 = 'mmx';
+  $sse2_x86_64 = 'sse2';
+  $ssse3_x86_64 = 'ssse3';
+  $avx_x86_64 = 'avx';
+  $avx2_x86_64 = 'avx2';
+}
+
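+# Intra prediction.
+# A "$..._x86inc" argument to specialize is '' unless CONFIG_USE_X86INC is
+# "yes" (see the variables assigned near the top of this file).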
+
+add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_4x4/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d207e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207e_predictor_4x4/;
+
+add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45_predictor_4x4 neon/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45e_predictor_4x4/;
+
+add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_4x4/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63e_predictor_4x4/;
+
+add_proto qw/void vpx_d63f_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63f_predictor_4x4/;
+
+add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_h_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_he_predictor_4x4/;
+
+add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_4x4/;
+
+add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_4x4 neon/;
+
+add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_4x4/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_4x4 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_ve_predictor_4x4/;
+
+add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_4x4 msa neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_4x4 msa neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_4x4 msa neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_8x8/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d207e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207e_predictor_8x8/;
+
+add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45_predictor_8x8 neon/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d45e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45e_predictor_8x8/;
+
+add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_8x8/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d63e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63e_predictor_8x8/;
+
+add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_h_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_8x8/;
+
+add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_8x8/;
+
+add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_8x8/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_8x8 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_8x8 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_8x8 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_8x8 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_16x16/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d207e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207e_predictor_16x16/;
+
+add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45_predictor_16x16 neon/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d45e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45e_predictor_16x16/;
+
+add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_16x16/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d63e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63e_predictor_16x16/;
+
+add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_h_predictor_16x16 neon dspr2 msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_16x16/;
+
+add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_16x16/;
+
+add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_16x16/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_16x16 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_tm_predictor_16x16 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_16x16 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_16x16 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_16x16 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_32x32/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d207e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207e_predictor_32x32/;
+
+add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45_predictor_32x32/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d45e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45e_predictor_32x32/;
+
+add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_32x32/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_d63e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63e_predictor_32x32/;
+
+add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_h_predictor_32x32 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_32x32/;
+
+add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d135_predictor_32x32/;
+
+add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_32x32/, "$ssse3_x86inc";
+
+add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_32x32 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_predictor_32x32 msa neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_32x32 msa neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_32x32 msa neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc";
+
+# High bitdepth intra predictors: uint16_t pixel buffers plus a trailing "int bd" argument; declared only when CONFIG_VP9_HIGHBITDEPTH is "yes".
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {  # only the v, tm and dc predictors pass "$sse2_x86inc"; every other entry names no variant
+  add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207_predictor_4x4/;
+
+  add_proto qw/void vpx_highbd_d207e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207e_predictor_4x4/;
+
+  add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_4x4/;
+
+  add_proto qw/void vpx_highbd_d45e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45e_predictor_4x4/;
+
+  add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63_predictor_4x4/;
+
+  add_proto qw/void vpx_highbd_d63e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63e_predictor_4x4/;
+
+  add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_4x4/;
+
+  add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d117_predictor_4x4/;
+
+  add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_4x4/;
+
+  add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d153_predictor_4x4/;
+
+  add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_v_predictor_4x4/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_4x4/;
+
+  add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_4x4/;
+
+  add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_4x4/;
+
+  add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207_predictor_8x8/;
+
+  add_proto qw/void vpx_highbd_d207e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207e_predictor_8x8/;
+
+  add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_8x8/;
+
+  add_proto qw/void vpx_highbd_d45e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45e_predictor_8x8/;
+
+  add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63_predictor_8x8/;
+
+  add_proto qw/void vpx_highbd_d63e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63e_predictor_8x8/;
+
+  add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_8x8/;
+
+  add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d117_predictor_8x8/;
+
+  add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_8x8/;
+
+  add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d153_predictor_8x8/;
+
+  add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_v_predictor_8x8/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_tm_predictor_8x8/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_predictor_8x8/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_8x8/;
+
+  add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_8x8/;
+
+  add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_8x8/;
+
+  add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207_predictor_16x16/;
+
+  add_proto qw/void vpx_highbd_d207e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207e_predictor_16x16/;
+
+  add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_16x16/;
+
+  add_proto qw/void vpx_highbd_d45e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45e_predictor_16x16/;
+
+  add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63_predictor_16x16/;
+
+  add_proto qw/void vpx_highbd_d63e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63e_predictor_16x16/;
+
+  add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_16x16/;
+
+  add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d117_predictor_16x16/;
+
+  add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_16x16/;
+
+  add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d153_predictor_16x16/;
+
+  add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_v_predictor_16x16/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_predictor_16x16/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_16x16/;
+
+  add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_16x16/;
+
+  add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_16x16/;
+
+  add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_d207e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207e_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_d45e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45e_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_d63e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63e_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d117_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d153_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_v_predictor_32x32/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_32x32/;
+
+  add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_32x32/;
+}  # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Sub Pixel Filters: copy, avg, convolve8* and scaled* prototypes on uint8_t src/dst; every entry shares one argument list.
+#
+add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa/;
+
+add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
+
+add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
+
+add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_2d ssse3/;
+
+add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_horiz/;
+
+add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_vert/;
+
+add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_avg_2d/;
+
+add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_avg_horiz/;
+
+add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_avg_vert/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #
+  # Sub Pixel Filters (high bitdepth): same argument list as the vpx_convolve* entries plus a trailing "int bps"; src/dst are still declared as uint8_t *.
+  #
+  add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve_copy/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve_avg/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64";
+
+  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve8_horiz/, "$sse2_x86_64";
+
+  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve8_vert/, "$sse2_x86_64";
+
+  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve8_avg/, "$sse2_x86_64";
+
+  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
+
+  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  specialize qw/vpx_highbd_convolve8_avg_vert/, "$sse2_x86_64";
+}  # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Loopfilter: each entry that lists neon_asm is followed by a $<name>_neon_asm assignment naming the plain <name>_neon symbol.
+#
+add_proto qw/void vpx_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_16 sse2 neon_asm dspr2 msa/;
+$vpx_lpf_vertical_16_neon_asm=vpx_lpf_vertical_16_neon;  # NOTE(review): presumably tells the rtcd generator which symbol backs the neon_asm variant; confirm in the generator
+
+add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
+$vpx_lpf_vertical_16_dual_neon_asm=vpx_lpf_vertical_16_dual_neon;
+
+add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
+
+add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
+$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
+
+add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_vertical_4 mmx neon dspr2 msa/;
+
+add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
+
+add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/;
+$vpx_lpf_horizontal_16_neon_asm=vpx_lpf_horizontal_16_neon;
+
+add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;
+
+add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
+$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
+
+add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_4 mmx neon dspr2 msa/;
+
+add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {  # high bitdepth loop filters: uint16_t *s plus a trailing "int bd"; every entry lists sse2 only
+  add_proto qw/void vpx_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_16 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_16_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_8 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_8_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_4 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_4_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_16 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_8 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_4 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2/;
+}  # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Encoder functions.
+#
+
+#
+# Forward transform: declared only when CONFIG_VP9_ENCODER or CONFIG_VP10_ENCODER is "yes". Both branches declare the same nine vpx_fdct* prototypes.
+#
+if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {  # high bitdepth: vpx_fdct* list sse2 only, and the vpx_highbd_fdct* entries are added
+  add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct4x4 sse2/;
+
+  add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct4x4_1 sse2/;
+
+  add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct8x8 sse2/;
+
+  add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct8x8_1 sse2/;
+
+  add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct16x16 sse2/;
+
+  add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct16x16_1 sse2/;
+
+  add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct32x32 sse2/;
+
+  add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct32x32_rd sse2/;
+
+  add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct32x32_1 sse2/;
+
+  add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct4x4 sse2/;
+
+  add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct8x8 sse2/;
+
+  add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct8x8_1/;
+
+  add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct16x16 sse2/;
+
+  add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct16x16_1/;
+
+  add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct32x32 sse2/;
+
+  add_proto qw/void vpx_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct32x32_rd sse2/;
+
+  add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct32x32_1/;
+} else {  # CONFIG_VP9_HIGHBITDEPTH not "yes": further variants (msa, neon, avx2, $ssse3_x86_64_x86inc) are listed alongside sse2
+  add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct4x4 sse2 msa/;
+
+  add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct4x4_1 sse2/;
+
+  add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct8x8_1 sse2 neon msa/;
+
+  add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct16x16 sse2 msa/;
+
+  add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct16x16_1 sse2 msa/;
+
+  add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct32x32 sse2 avx2 msa/;
+
+  add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct32x32_rd sse2 avx2 msa/;
+
+  add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_fdct32x32_1 sse2 msa/;
+}  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+
+#
+# Inverse transform
+if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes")) {
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  # Note: as optimized versions of these functions are added, we need to add a check to ensure
+  # that, when CONFIG_EMULATE_HARDWARE is on, they default to the C versions only.
+  add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_iwht4x4_1_add/;
+
+  add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vpx_iwht4x4_16_add/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_idct4x4_1_add/;
+
+  add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_idct8x8_1_add/;
+
+  add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_idct16x16_1_add/;
+
+  add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_idct32x32_1024_add/;
+
+  add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_idct32x32_34_add/;
+
+  add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_idct32x32_1_add/;
+
+  add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_iwht4x4_1_add/;
+
+  add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+  specialize qw/vpx_highbd_iwht4x4_16_add/;
+
+  # Force C versions when CONFIG_EMULATE_HARDWARE is enabled ("yes")
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_16_add/;
+
+    add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_1_add/;
+
+    add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_64_add/;
+
+    add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_12_add/;
+
+    add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_1_add/;
+
+    add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_256_add/;
+
+    add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_10_add/;
+
+    add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_1_add/;
+
+    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1024_add/;
+
+    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_135_add/;
+
+    add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_34_add/;
+
+    add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1_add/;
+
+    add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct4x4_16_add/;
+
+    add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct8x8_64_add/;
+
+    add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct8x8_10_add/;
+
+    add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct16x16_256_add/;
+
+    add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct16x16_10_add/;
+  } else {
+    add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_16_add sse2/;
+
+    add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_1_add sse2/;
+
+    add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_1_add sse2/;
+
+    add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_256_add sse2/;
+
+    add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_10_add sse2/;
+
+    add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_1_add sse2/;
+
+    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64_x86inc";
+    # Need to add 135 eob idct32x32 implementations.
+    $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
+
+    add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1_add sse2/;
+
+    add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct4x4_16_add sse2/;
+
+    add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct8x8_64_add sse2/;
+
+    add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct8x8_10_add sse2/;
+
+    add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct16x16_256_add sse2/;
+
+    add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct16x16_10_add sse2/;
+  }  # CONFIG_EMULATE_HARDWARE
+} else {
+  # Force C versions if CONFIG_EMULATE_HARDWARE is "yes": no optimized targets are listed in this branch.
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+    add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_1_add/;
+
+    add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_16_add/;
+
+    add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_1_add/;
+
+    add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_64_add/;
+
+    add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_12_add/;
+
+    add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_1_add/;
+
+    add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_256_add/;
+
+    add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_10_add/;
+
+    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1024_add/;
+
+    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_135_add/;
+
+    add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_34_add/;
+
+    add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1_add/;
+
+    add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_iwht4x4_1_add/;
+
+    add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_iwht4x4_16_add/;
+  } else {  # CONFIG_EMULATE_HARDWARE is not "yes": same prototypes, with per-target specializations listed
+    add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_1_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct4x4_16_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_1_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_1_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_256_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+    # Need to add 135 eob idct32x32 implementations; until then each 135 variable below is set to the matching 1024 function name.
+    $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
+    $vpx_idct32x32_135_add_neon=vpx_idct32x32_1024_add_neon;
+    $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
+    $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
+
+    add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64_x86inc";
+    # Need to add 34 eob idct32x32 neon implementation; until then the neon_asm variable is set to the 1024 neon function name.
+    $vpx_idct32x32_34_add_neon_asm=vpx_idct32x32_1024_add_neon;
+
+    add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/;
+
+    add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_iwht4x4_1_add msa/;
+
+    add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_iwht4x4_16_add msa/, "$sse2_x86inc";
+  }  # CONFIG_EMULATE_HARDWARE
+}  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_VP9 || CONFIG_VP10
+
+#
+# Quantization
+# Declared only when the VP9 or VP10 encoder is configured; the highbd variants also require CONFIG_VP9_HIGHBITDEPTH.
+if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
+  add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
+
+  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";  # NOTE(review): no sse2 entry here, unlike vpx_quantize_b - confirm intentional
+
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_highbd_quantize_b sse2/;
+
+    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
+  }  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+
+if (vpx_config("CONFIG_ENCODERS") eq "yes") {
+#
+# Block subtraction
+# diff_ptr is the only non-const pointer in the prototype, so presumably the output buffer - TODO confirm.
+add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
+
+#
+# Single block SAD
+# One prototype per block size (64x64 down to 4x4); each returns unsigned int and takes a src and a ref pointer/stride pair.
+add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad64x64 avx2 neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad64x32 avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x64 avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x32 avx2 neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x16 avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x32 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x16 mmx media neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x8 mmx neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x16 mmx neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x8 mmx neon msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad4x8 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad4x4 mmx neon msa/, "$sse2_x86inc";
+
+#
+# Avg
+# Declares vpx_avg_*, vpx_minmax_8x8, vpx_hadamard_*, vpx_satd, vpx_int_pro_row/col and vpx_vector_var, only when the VP9 or VP10 encoder is configured.
+if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
+  add_proto qw/unsigned int vpx_avg_8x8/, "const uint8_t *, int p";
+  specialize qw/vpx_avg_8x8 sse2 neon msa/;
+
+  add_proto qw/unsigned int vpx_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/vpx_avg_4x4 sse2 neon msa/;
+
+  add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  specialize qw/vpx_minmax_8x8 sse2/;
+
+  add_proto qw/void vpx_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+  specialize qw/vpx_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vpx_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+  specialize qw/vpx_hadamard_16x16 sse2/;
+
+  add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
+  specialize qw/vpx_satd sse2 neon/;
+
+  add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
+  specialize qw/vpx_int_pro_row sse2 neon/;
+
+  add_proto qw/int16_t vpx_int_pro_col/, "uint8_t const *ref, const int width";
+  specialize qw/vpx_int_pro_col sse2 neon/;
+
+  add_proto qw/int vpx_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
+  specialize qw/vpx_vector_var neon sse2/;
+}  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+
+add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";  # the *_avg SAD prototypes take an extra second_pred argument
+specialize qw/vpx_sad64x64_avg avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad64x32_avg avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x64_avg avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x32_avg avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x16_avg avx2 msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x32_avg msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x16_avg msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x8_avg msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x16_avg msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x8_avg msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x4_avg msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad4x8_avg msa/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad4x4_avg msa/, "$sse2_x86inc";
+
+#
+# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
+# sad_array is the only non-const pointer in these prototypes, so presumably it receives the N results - TODO confirm.
+# Blocks of 3
+add_proto qw/void vpx_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x64x3 msa/;
+
+add_proto qw/void vpx_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x32x3 msa/;
+
+add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x16x3 sse3 ssse3 msa/;
+
+add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x8x3 sse3 ssse3 msa/;
+
+add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x16x3 sse3 msa/;
+
+add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x8x3 sse3 msa/;
+
+add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x4x3 sse3 msa/;
+
+# Blocks of 8
+add_proto qw/void vpx_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x64x8 msa/;
+
+add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x32x8 msa/;
+
+add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x16x8 sse4_1 msa/;
+
+add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x8x8 sse4_1 msa/;
+
+add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x16x8 sse4_1 msa/;
+
+add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x8x8 sse4_1 msa/;
+
+add_proto qw/void vpx_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x4x8 msa/;
+
+add_proto qw/void vpx_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x8x8 msa/;
+
+add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x4x8 sse4_1 msa/;
+
+#
+# Multi-block SAD, comparing a reference to N independent blocks
+# In these prototypes ref_ptr is an array of pointers (const uint8_t * const ref_ptr[]) rather than a single pointer.
+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x64x4d avx2 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x32x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x64x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x32x4d avx2 neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x16x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x32x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x16x4d neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x8x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x16x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x8x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x4x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x8x4d msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x4x4d msa/, "$sse2_x86inc";
+
+#
+# Structured Similarity (SSIM)
+# Declared only when CONFIG_INTERNAL_STATS is "yes"; the only specialization passed is the $sse2_x86_64 variable.
+if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+    add_proto qw/void vpx_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+    specialize qw/vpx_ssim_parms_8x8/, "$sse2_x86_64";
+
+    add_proto qw/void vpx_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+    specialize qw/vpx_ssim_parms_16x16/, "$sse2_x86_64";
+}
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #
+  # Block subtraction
+  #
+  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/vpx_highbd_subtract_block/;
+
+  #
+  # Single block SAD
+  #
+  add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad64x64/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad64x32/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad32x64/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad32x32/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad32x16/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad16x32/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad16x16/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad16x8/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad8x16/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad8x8/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad8x4/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad4x8/;
+
+  add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad4x4/;
+
+  #
+  # Avg
+  #
+  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
+  specialize qw/vpx_highbd_avg_8x8/;
+  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/vpx_highbd_avg_4x4/;
+  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  specialize qw/vpx_highbd_minmax_8x8/;
+
+  add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad64x64_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad64x32_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad32x64_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad32x32_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad32x16_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad16x32_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad16x16_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad16x8_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad8x16_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad8x8_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad8x4_avg/, "$sse2_x86inc";
+
+  add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad4x8_avg/;
+
+  add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad4x4_avg/;
+
+  #
+  # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
+  #
+  # Blocks of 3
+  add_proto qw/void vpx_highbd_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad64x64x3/;
+
+  add_proto qw/void vpx_highbd_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad32x32x3/;
+
+  add_proto qw/void vpx_highbd_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x16x3/;
+
+  add_proto qw/void vpx_highbd_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x8x3/;
+
+  add_proto qw/void vpx_highbd_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x16x3/;
+
+  add_proto qw/void vpx_highbd_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x8x3/;
+
+  add_proto qw/void vpx_highbd_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad4x4x3/;
+
+  # Blocks of 8 (the "x8" prototypes); each specialize below names no ISA extension
+  add_proto qw/void vpx_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad64x64x8/;
+
+  add_proto qw/void vpx_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad32x32x8/;
+
+  add_proto qw/void vpx_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x16x8/;
+
+  add_proto qw/void vpx_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x8x8/;
+
+  add_proto qw/void vpx_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x16x8/;
+
+  add_proto qw/void vpx_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x8x8/;
+
+  add_proto qw/void vpx_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x4x8/;
+
+  add_proto qw/void vpx_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad4x8x8/;
+
+  add_proto qw/void vpx_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad4x4x8/;
+
+  # Multi-block SAD ("x4d"): ref_ptr is an array of reference pointers
+  # (const uint8_t* const ref_ptr[]) compared against a single src_ptr block.
+  # Every specialize in this group lists only $sse2_x86inc.
+  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad64x64x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad64x32x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad32x64x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad32x32x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad32x16x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x32x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x16x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad16x8x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x16x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x8x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad8x4x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad4x8x4d/, "$sse2_x86inc";
+
+  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad4x4x4d/, "$sse2_x86inc";
+
+  # Structural Similarity (SSIM)
+  # The high-bitdepth 8x8 SSIM parameter prototype is registered only when
+  # CONFIG_INTERNAL_STATS is "yes"; its specialize names no ISA extension.
+  if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+    add_proto qw/void vpx_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+    specialize qw/vpx_highbd_ssim_parms_8x8/;
+  }
+}  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_ENCODERS
+
+if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+
+# Variance
+# One prototype per block size from 64x64 down to 4x4; each returns
+# unsigned int and takes an "unsigned int *sse" argument.
+add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance64x32 sse2 avx2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x64 sse2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x16 sse2 avx2 msa/;
+
+add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x32 sse2 msa/;
+
+add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;
+
+add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x8 mmx sse2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x16 mmx sse2 neon msa/;
+
+add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x8 mmx sse2 media neon msa/;
+
+add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x4 sse2 msa/;
+
+add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance4x8 sse2 msa/;
+
+add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance4x4 mmx sse2 msa/;
+
+# Specialty Variance
+# get*var, mse*, get_mb_ss, get4x4sse_cs and comp_avg_pred prototypes.
+# vpx_comp_avg_pred (last in the group) is declared without a specialize call.
+add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
+
+add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_get8x8var mmx sse2 neon msa/;
+
+add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon msa/;
+
+add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_mse16x8 sse2 msa/;
+
+add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_mse8x16 sse2 msa/;
+
+add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_mse8x8 sse2 msa/;
+
+add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
+  specialize qw/vpx_get_mb_ss mmx sse2 msa/;
+
+add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
+  specialize qw/vpx_get4x4sse_cs neon msa/;
+
+add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+
+# Subpixel Variance
+# Each prototype takes xoffset and yoffset arguments ahead of ref_ptr. The
+# 4x8 and 4x4 sizes list $sse_x86inc where the larger sizes list $sse2_x86inc.
+add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
+# Subpixel average variance: the sub_pixel_variance argument list plus a trailing "const uint8_t *second_pred".
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
+
+# Specialty Subpixel
+# The vpx_variance_halfpixvar16x16_{h,v,hv} prototypes: they take no
+# xoffset/yoffset arguments, and all three list mmx, sse2 and media.
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
+  specialize qw/vpx_variance_halfpixvar16x16_h mmx sse2 media/;
+
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
+  specialize qw/vpx_variance_halfpixvar16x16_v mmx sse2 media/;
+
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
+  specialize qw/vpx_variance_halfpixvar16x16_hv mmx sse2 media/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance64x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance64x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance8x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance8x8 sse2/;
+  # The 12-bit 8x4, 4x8 and 4x4 sizes below are declared without a specialize call.
+  add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  # vpx_highbd_10_variance*: sizes 64x64 through 8x8 list sse2 only.
+  add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance64x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance64x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance8x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance8x8 sse2/;
+  # The 10-bit 8x4, 4x8 and 4x4 sizes below are declared without a specialize call.
+  add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  # vpx_highbd_8_variance*: sizes 64x64 through 8x8 list sse2 only.
+  add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance64x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance64x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance8x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance8x8 sse2/;
+  # The 8-bit 8x4, 4x8 and 4x4 sizes below are declared without a specialize call.
+  add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  # High-bitdepth get16x16var/get8x8var for the 8, 10 and 12 prefixes; none has a specialize call.
+  add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+  add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+  add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  # High-bitdepth MSE: only the 16x16 and 8x8 sizes have a specialize call (sse2); 16x8 and 8x16 have none.
+  add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse8x8 sse2/;
+  # Declared without a specialize call; comp_pred is uint16_t * while pred8 and ref8 are uint8_t *.
+  add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+
+  # Subpixel Variance (high bitdepth)
+  # In the 12-bit group that follows, every size with a specialize call lists
+  # only $sse2_x86inc; the 4x8 and 4x4 sizes are declared without one.
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  # vpx_highbd_10_sub_pixel_variance*: sizes 64x64 through 8x4 list only $sse2_x86inc; 4x8 and 4x4 have no specialize call.
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+}  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
+
+1;
diff --git a/libs/libvpx/vpx_dsp/vpx_filter.h b/libs/libvpx/vpx_dsp/vpx_filter.h
new file mode 100644
index 0000000000..2617febf3b
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/vpx_filter.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_VPX_FILTER_H_
+#define VPX_DSP_VPX_FILTER_H_
+
+#include "vpx/vpx_integer.h"
+
+// Filter constants and the kernel array type.
+#ifdef __cplusplus
+extern "C" {
+#endif
+// NOTE(review): FILTER_BITS is presumably the tap precision in bits; confirm.
+#define FILTER_BITS 7
+// SUBPEL_MASK keeps the low SUBPEL_BITS bits (0xF); SUBPEL_SHIFTS is 16.
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
+#define SUBPEL_TAPS 8
+// One kernel is an array of SUBPEL_TAPS (8) int16_t values.
+typedef int16_t InterpKernel[SUBPEL_TAPS];
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_VPX_FILTER_H_
diff --git a/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
new file mode 100644
index 0000000000..f9af6cf974
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
@@ -0,0 +1,423 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+// Writes the min and max |s - d| over an 8x8 block of bytes to *min and *max.
+void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+                         int *min, int *max) {
+  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+  u0  = _mm_setzero_si128();
+  // Row 0: widen 8 bytes of s and d to 16 bits, then take |s - d| per lane.
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff0 = _mm_max_epi16(diff, negdiff);
+  // Row 1 (s + p, d + dp): also seeds the running per-lane max and min.
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+  // Row 2: from here on each row is folded into the running max/min.
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 3
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 4
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 5
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 6
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 7
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Reduce the eight lanes to lane 0 (8 -> 4 -> 2 -> 1), first for the max...
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+  *max = _mm_extract_epi16(maxabsdiff, 0);
+  // ...then the same reduction for the min.
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+  *min = _mm_extract_epi16(minabsdiff, 0);
+}
+
+unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
+  // Rounded average of the 8x8 block of bytes at s; p is the row pitch.
+  const __m128i zero = _mm_setzero_si128();
+  __m128i col_sums, widened;
+  unsigned int total;
+  int row;
+
+  // Row 0, widened to eight 16-bit lanes, seeds the per-column sums.
+  col_sums = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)s), zero);
+
+  // Rows 1..7 are widened the same way and accumulated with saturation.
+  for (row = 1; row < 8; ++row) {
+    s += p;
+    widened = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)s), zero);
+    col_sums = _mm_adds_epu16(col_sums, widened);
+  }
+
+  // Fold 8 -> 4 -> 2 -> 1 lanes so that lane 0 holds the block total.
+  col_sums = _mm_adds_epu16(col_sums, _mm_srli_si128(col_sums, 8));
+  col_sums = _mm_adds_epu16(col_sums, _mm_srli_epi64(col_sums, 32));
+  col_sums = _mm_adds_epu16(col_sums, _mm_srli_epi64(col_sums, 16));
+  total = _mm_extract_epi16(col_sums, 0);
+
+  // Divide by the 64 pixels, rounding to nearest.
+  return (total + 32) >> 6;
+}
+
+unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
+  // Rounded average of the 4x4 block of bytes at s; p is the row pitch.
+  const __m128i zero = _mm_setzero_si128();
+  __m128i col_sums, widened;
+  int row;
+  // Eight bytes are loaded per row, but only lanes 0..3 reach the result.
+  col_sums = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)s), zero);
+  for (row = 1; row < 4; ++row) {
+    s += p;
+    widened = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)s), zero);
+    col_sums = _mm_adds_epu16(col_sums, widened);
+  }
+  // Fold lanes 0..3 into lane 0, then divide by 16 with rounding.
+  col_sums = _mm_adds_epu16(col_sums, _mm_srli_si128(col_sums, 4));
+  col_sums = _mm_adds_epu16(col_sums, _mm_srli_epi64(col_sums, 16));
+  return ((unsigned int)_mm_extract_epi16(col_sums, 0) + 8) >> 4;
+}
+// 8-point Hadamard butterflies across in[0..7]; iter == 0 also transposes.
+static void hadamard_col8_sse2(__m128i *in, int iter) {
+  __m128i a0 = in[0];
+  __m128i a1 = in[1];
+  __m128i a2 = in[2];
+  __m128i a3 = in[3];
+  __m128i a4 = in[4];
+  __m128i a5 = in[5];
+  __m128i a6 = in[6];
+  __m128i a7 = in[7];
+  // Stage 1: sum and difference of each adjacent pair.
+  __m128i b0 = _mm_add_epi16(a0, a1);
+  __m128i b1 = _mm_sub_epi16(a0, a1);
+  __m128i b2 = _mm_add_epi16(a2, a3);
+  __m128i b3 = _mm_sub_epi16(a2, a3);
+  __m128i b4 = _mm_add_epi16(a4, a5);
+  __m128i b5 = _mm_sub_epi16(a4, a5);
+  __m128i b6 = _mm_add_epi16(a6, a7);
+  __m128i b7 = _mm_sub_epi16(a6, a7);
+  // Stage 2: combine the pair results two apart.
+  a0 = _mm_add_epi16(b0, b2);
+  a1 = _mm_add_epi16(b1, b3);
+  a2 = _mm_sub_epi16(b0, b2);
+  a3 = _mm_sub_epi16(b1, b3);
+  a4 = _mm_add_epi16(b4, b6);
+  a5 = _mm_add_epi16(b5, b7);
+  a6 = _mm_sub_epi16(b4, b6);
+  a7 = _mm_sub_epi16(b5, b7);
+  // Stage 3 is the same in both branches; iter == 0 then transposes.
+  if (iter == 0) {
+    b0 = _mm_add_epi16(a0, a4);
+    b7 = _mm_add_epi16(a1, a5);
+    b3 = _mm_add_epi16(a2, a6);
+    b4 = _mm_add_epi16(a3, a7);
+    b2 = _mm_sub_epi16(a0, a4);
+    b6 = _mm_sub_epi16(a1, a5);
+    b1 = _mm_sub_epi16(a2, a6);
+    b5 = _mm_sub_epi16(a3, a7);
+    // Transpose, step 1: interleave 16-bit lanes.
+    a0 = _mm_unpacklo_epi16(b0, b1);
+    a1 = _mm_unpacklo_epi16(b2, b3);
+    a2 = _mm_unpackhi_epi16(b0, b1);
+    a3 = _mm_unpackhi_epi16(b2, b3);
+    a4 = _mm_unpacklo_epi16(b4, b5);
+    a5 = _mm_unpacklo_epi16(b6, b7);
+    a6 = _mm_unpackhi_epi16(b4, b5);
+    a7 = _mm_unpackhi_epi16(b6, b7);
+    // Transpose, step 2: interleave 32-bit pairs.
+    b0 = _mm_unpacklo_epi32(a0, a1);
+    b1 = _mm_unpacklo_epi32(a4, a5);
+    b2 = _mm_unpackhi_epi32(a0, a1);
+    b3 = _mm_unpackhi_epi32(a4, a5);
+    b4 = _mm_unpacklo_epi32(a2, a3);
+    b5 = _mm_unpacklo_epi32(a6, a7);
+    b6 = _mm_unpackhi_epi32(a2, a3);
+    b7 = _mm_unpackhi_epi32(a6, a7);
+    // Transpose, step 3: interleave 64-bit halves and write back.
+    in[0] = _mm_unpacklo_epi64(b0, b1);
+    in[1] = _mm_unpackhi_epi64(b0, b1);
+    in[2] = _mm_unpacklo_epi64(b2, b3);
+    in[3] = _mm_unpackhi_epi64(b2, b3);
+    in[4] = _mm_unpacklo_epi64(b4, b5);
+    in[5] = _mm_unpackhi_epi64(b4, b5);
+    in[6] = _mm_unpacklo_epi64(b6, b7);
+    in[7] = _mm_unpackhi_epi64(b6, b7);
+  } else {
+    in[0] = _mm_add_epi16(a0, a4);
+    in[7] = _mm_add_epi16(a1, a5);
+    in[3] = _mm_add_epi16(a2, a6);
+    in[4] = _mm_add_epi16(a3, a7);
+    in[2] = _mm_sub_epi16(a0, a4);
+    in[6] = _mm_sub_epi16(a1, a5);
+    in[1] = _mm_sub_epi16(a2, a6);
+    in[5] = _mm_sub_epi16(a3, a7);
+  }
+}
+
+void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
+                           int16_t *coeff) {
+  // Computes the 8x8 Hadamard transform of the block at src_diff and writes
+  // the 64 results to coeff as eight consecutive groups of eight values.
+  //
+  //   src_diff    first of eight rows of eight int16_t values
+  //   src_stride  distance between consecutive rows, in int16_t elements
+  //   coeff       output buffer for 64 int16_t values
+  //
+  // Only the aligned load/store intrinsics are used, so every source row
+  // and coeff itself must sit on a 16-byte boundary.
+
+  __m128i rows[8];
+  int r;
+
+  // Gather the eight input rows.
+  for (r = 0; r < 8; ++r) {
+    const int16_t *row_ptr = src_diff + r * src_stride;
+    rows[r] = _mm_load_si128((const __m128i *)row_ptr);
+  }
+
+  // Pass 0 ends with a transpose; pass 1 does not.
+  hadamard_col8_sse2(rows, 0);
+  hadamard_col8_sse2(rows, 1);
+
+  // Emit the result, eight coefficients per store.
+  for (r = 0; r < 8; ++r) {
+    int16_t *out_ptr = coeff + 8 * r;
+    _mm_store_si128((__m128i *)out_ptr, rows[r]);
+  }
+}
+
+void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
+                             int16_t *coeff) {
+  // 16x16 Hadamard: transform the four 8x8 quadrants independently, then
+  // merge them with one more butterfly stage. coeff receives 256 values and
+  // is only touched through aligned loads/stores.
+  int16_t *const q0 = coeff;        // output of the top-left quadrant
+  int16_t *const q1 = coeff + 64;   // top-right
+  int16_t *const q2 = coeff + 128;  // bottom-left
+  int16_t *const q3 = coeff + 192;  // bottom-right
+  int qy, qx, off;
+
+  // Quadrants are visited in raster order: (0,0), (0,1), (1,0), (1,1).
+  for (qy = 0; qy < 2; ++qy) {
+    for (qx = 0; qx < 2; ++qx) {
+      const int16_t *quad_src = src_diff + qy * 8 * src_stride + qx * 8;
+      vpx_hadamard_8x8_sse2(quad_src, src_stride, coeff + (2 * qy + qx) * 64);
+    }
+  }
+
+  // Walk the four quadrant outputs in lockstep, eight values at a time.
+  for (off = 0; off < 64; off += 8) {
+    const __m128i in0 = _mm_load_si128((const __m128i *)(q0 + off));
+    const __m128i in1 = _mm_load_si128((const __m128i *)(q1 + off));
+    const __m128i in2 = _mm_load_si128((const __m128i *)(q2 + off));
+    const __m128i in3 = _mm_load_si128((const __m128i *)(q3 + off));
+
+    // First butterfly, halved by an arithmetic shift before the second.
+    const __m128i sum01 = _mm_srai_epi16(_mm_add_epi16(in0, in1), 1);
+    const __m128i dif01 = _mm_srai_epi16(_mm_sub_epi16(in0, in1), 1);
+    const __m128i sum23 = _mm_srai_epi16(_mm_add_epi16(in2, in3), 1);
+    const __m128i dif23 = _mm_srai_epi16(_mm_sub_epi16(in2, in3), 1);
+
+    _mm_store_si128((__m128i *)(q0 + off), _mm_add_epi16(sum01, sum23));
+    _mm_store_si128((__m128i *)(q1 + off), _mm_add_epi16(dif01, dif23));
+    _mm_store_si128((__m128i *)(q2 + off), _mm_sub_epi16(sum01, sum23));
+    _mm_store_si128((__m128i *)(q3 + off), _mm_sub_epi16(dif01, dif23));
+  }
+}
+
+int vpx_satd_sse2(const int16_t *coeff, int length) {
+  // Sum of the absolute values of `length` int16_t entries starting at coeff.
+  // Entries are consumed eight at a time through aligned loads, so coeff
+  // must be 16-byte aligned.
+  const __m128i zero = _mm_setzero_si128();
+  __m128i total = _mm_setzero_si128();  // four 32-bit partial sums
+  int pos;
+
+  for (pos = 0; pos < length; pos += 8) {
+    const __m128i vals = _mm_load_si128((const __m128i *)(coeff + pos));
+    // max(x, -x) yields the magnitude of each 16-bit lane.
+    const __m128i mag = _mm_max_epi16(vals, _mm_sub_epi16(zero, vals));
+    // Zero-extend the magnitudes to 32 bits before accumulating.
+    const __m128i mag_lo = _mm_unpacklo_epi16(mag, zero);
+    const __m128i mag_hi = _mm_unpackhi_epi16(mag, zero);
+    total = _mm_add_epi32(total, _mm_add_epi32(mag_lo, mag_hi));
+  }
+
+  // Horizontal add: fold the upper 64 bits onto the lower, then the
+  // remaining two 32-bit lanes onto lane 0.
+  total = _mm_add_epi32(total, _mm_srli_si128(total, 8));
+  total = _mm_add_epi32(total, _mm_srli_epi64(total, 32));
+
+  return _mm_cvtsi128_si32(total);
+}
+// Column sums of `height` rows of 16 bytes at ref, scaled down, into hbuf[16].
+void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
+                          const int ref_stride, const int height) {
+  int idx;
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);  // row 0, columns 0..7
+  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);  // row 0, columns 8..15
+  __m128i t0, t1;
+  int height_1 = height - 1;
+  ref += ref_stride;
+  // Rows 1..height-2, two per iteration; exact only when height is even.
+  for (idx = 1; idx < height_1; idx += 2) {
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+  }
+  // Final row.
+  src_line = _mm_loadu_si128((const __m128i *)ref);
+  t0 = _mm_unpacklo_epi8(src_line, zero);
+  t1 = _mm_unpackhi_epi8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, t0);
+  s1 = _mm_adds_epu16(s1, t1);
+  // Scale: >> 5 for height 64, >> 4 for height 32, >> 3 otherwise.
+  if (height == 64) {
+    s0 = _mm_srai_epi16(s0, 5);
+    s1 = _mm_srai_epi16(s1, 5);
+  } else if (height == 32) {
+    s0 = _mm_srai_epi16(s0, 4);
+    s1 = _mm_srai_epi16(s1, 4);
+  } else {
+    s0 = _mm_srai_epi16(s0, 3);
+    s1 = _mm_srai_epi16(s1, 3);
+  }
+  // Columns 0..7 go to hbuf[0..7], columns 8..15 to hbuf[8..15].
+  _mm_storeu_si128((__m128i *)hbuf, s0);
+  hbuf += 8;
+  _mm_storeu_si128((__m128i *)hbuf, s1);
+}
+// Sums bytes at ref in 16-byte groups covering `width` bytes (at least 16).
+int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);  // any alignment
+  __m128i s0 = _mm_sad_epu8(src_line, zero);  // SAD vs. zero == sum of bytes
+  __m128i s1;
+  int i;
+  // Unaligned loads throughout: nothing guarantees ref is 16-byte aligned.
+  for (i = 16; i < width; i += 16) {
+    ref += 16;
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    s1 = _mm_sad_epu8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, s1);
+  }
+  // Combine the partial sums of the low and high 64-bit halves.
+  s1 = _mm_srli_si128(s0, 8);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  return _mm_extract_epi16(s0, 0);
+}
+// Returns sum(diff^2) - (sum(diff)^2 >> (bwl + 2)) for diff = ref - src.
+int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src,
+                        const int bwl) {
+  int idx;
+  int width = 4 << bwl;  // element count; bwl + 2 == log2(width)
+  int16_t mean;
+  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);  // unaligned load
+  __m128i v1 = _mm_load_si128((const __m128i *)src);  // needs 16-byte alignment
+  __m128i diff = _mm_subs_epi16(v0, v1);
+  __m128i sum = diff;
+  __m128i sse = _mm_madd_epi16(diff, diff);
+  // The first eight elements are always processed, even when width < 8.
+  ref += 8;
+  src += 8;
+
+  for (idx = 8; idx < width; idx += 8) {
+    v0 = _mm_loadu_si128((const __m128i *)ref);
+    v1 = _mm_load_si128((const __m128i *)src);
+    diff = _mm_subs_epi16(v0, v1);
+
+    sum = _mm_add_epi16(sum, diff);
+    v0  = _mm_madd_epi16(diff, diff);
+    sse = _mm_add_epi32(sse, v0);
+
+    ref += 8;
+    src += 8;
+  }
+  // Horizontal sum of the eight 16-bit lanes of sum (wraps at 16 bits).
+  v0  = _mm_srli_si128(sum, 8);
+  sum = _mm_add_epi16(sum, v0);
+  v0  = _mm_srli_epi64(sum, 32);
+  sum = _mm_add_epi16(sum, v0);
+  v0  = _mm_srli_epi32(sum, 16);
+  sum = _mm_add_epi16(sum, v0);
+  // Horizontal sum of the four 32-bit lanes of sse.
+  v1  = _mm_srli_si128(sse, 8);
+  sse = _mm_add_epi32(sse, v1);
+  v1  = _mm_srli_epi64(sse, 32);
+  sse = _mm_add_epi32(sse, v1);
+
+  mean = _mm_extract_epi16(sum, 0);
+
+  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
+}
diff --git a/libs/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm b/libs/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
new file mode 100644
index 0000000000..26412e8e43
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -0,0 +1,121 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vpx
+
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides an SSSE3 version of the Hadamard transform. Parts of
+; the macro definitions were originally derived from the ffmpeg project.
+; The current version applies to x86 64-bit only.
+
+SECTION .text
+
+%if ARCH_X86_64
+; matrix transpose helper: INTERLEAVE_2X suffix, a, b, tmp (a, b, tmp are register numbers)
+%macro INTERLEAVE_2X 4
+  punpckh%1          m%4, m%2, m%3   ; tmp = high-half interleave of a and b
+  punpckl%1          m%2, m%3        ; a   = low-half interleave of a and b
+  SWAP               %3,  %4         ; b now names the high result; tmp names the old b
+%endmacro
+; TRANSPOSE8X8 r0, ..., r7, tmp: 8x8 transpose of 16-bit elements, one row per register
+%macro TRANSPOSE8X8 9
+  INTERLEAVE_2X  wd, %1, %2, %9
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+; second step: 32-bit interleave of the results above
+  INTERLEAVE_2X  dq, %1, %3, %9
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+; third step: 64-bit interleave of the results above
+  INTERLEAVE_2X  qdq, %1, %5, %9
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+; register renames only; presumably restores row order (not re-derived here, TODO confirm)
+  SWAP  %2, %5
+  SWAP  %4, %7
+%endmacro
+; HMD8_1D: three sum/difference stages over m0-m7 (16-bit lanes), m8/m9 scratch; stage 1 pairs (0,1) (2,3) (4,5) (6,7)
+%macro HMD8_1D 0
+  psubw              m8, m0, m1
+  psubw              m9, m2, m3
+  paddw              m0, m1
+  paddw              m2, m3
+  SWAP               1, 8
+  SWAP               3, 9
+  psubw              m8, m4, m5
+  psubw              m9, m6, m7
+  paddw              m4, m5
+  paddw              m6, m7
+  SWAP               5, 8
+  SWAP               7, 9
+; stage 2: sums stay in the lower-numbered register, differences go to the higher; pairs (0,2) (1,3) (4,6) (5,7)
+  psubw              m8, m0, m2
+  psubw              m9, m1, m3
+  paddw              m0, m2
+  paddw              m1, m3
+  SWAP               2, 8
+  SWAP               3, 9
+  psubw              m8, m4, m6
+  psubw              m9, m5, m7
+  paddw              m4, m6
+  paddw              m5, m7
+  SWAP               6, 8
+  SWAP               7, 9
+; stage 3: same pattern on pairs (0,4) (1,5) (2,6) (3,7)
+  psubw              m8, m0, m4
+  psubw              m9, m1, m5
+  paddw              m0, m4
+  paddw              m1, m5
+  SWAP               4, 8
+  SWAP               5, 9
+  psubw              m8, m2, m6
+  psubw              m9, m3, m7
+  paddw              m2, m6
+  paddw              m3, m7
+  SWAP               6, 8
+  SWAP               7, 9
+%endmacro
+; hadamard_8x8(input, stride, output): HMD8_1D, transpose, HMD8_1D over eight rows of eight 16-bit values
+INIT_XMM ssse3
+cglobal hadamard_8x8, 3, 5, 10, input, stride, output
+  lea                r3, [2 * strideq]   ; NOTE(review): reads all 64 bits of stride; confirm the caller passes a pointer-sized value
+  lea                r4, [4 * strideq]   ; r4 = 2 * r3
+; rows are read r3 (2 * stride) bytes apart with aligned 16-byte loads; inputq advances by r4 (two rows) at a time
+  mova               m0, [inputq]
+  mova               m1, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m2, [inputq]
+  mova               m3, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m4, [inputq]
+  mova               m5, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m6, [inputq]
+  mova               m7, [inputq + r3]
+; m9 is handed to the transpose as its scratch register
+  HMD8_1D
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+  HMD8_1D
+; results are stored back to back (8 x 16 bytes) with aligned stores
+  mova              [outputq +   0], m0
+  mova              [outputq +  16], m1
+  mova              [outputq +  32], m2
+  mova              [outputq +  48], m3
+  mova              [outputq +  64], m4
+  mova              [outputq +  80], m5
+  mova              [outputq +  96], m6
+  mova              [outputq + 112], m7
+
+  RET
+%endif
diff --git a/libs/libvpx/vpx_dsp/x86/convolve.h b/libs/libvpx/vpx_dsp/x86/convolve.h
new file mode 100644
index 0000000000..b6fbfcf928
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/convolve.h
@@ -0,0 +1,290 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_DSP_X86_CONVOLVE_H_
+#define VPX_DSP_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+/* 1-D block filter signature; presumably the type of the vpx_filter_block1d* kernels called below (argument lists match) - confirm at their declarations. */
+typedef void filter8_1dfunction (
+  const uint8_t *src_ptr,
+  ptrdiff_t src_pitch,
+  uint8_t *output_ptr,
+  ptrdiff_t out_pitch,
+  uint32_t output_height,
+  const int16_t *filter
+);
+/* FUN_CONV_1D defines vpx_convolve8_<name>_<opt>(): filters the w x h block in strips of 16, 8, then 4 columns; the last w % 4 columns are never written. */
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+  void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                                    uint8_t *dst, ptrdiff_t dst_stride, \
+                                    const int16_t *filter_x, int x_step_q4, \
+                                    const int16_t *filter_y, int y_step_q4, \
+                                    int w, int h) { /* step_q4, filter and src_start are pasted in as expressions, expected to use these parameter names */ \
+  assert(filter[3] != 128); /* preconditions are only checked when assert() is enabled */ \
+  assert(step_q4 == 16); \
+  if (filter[0] || filter[1] || filter[2]) { /* any nonzero leading tap: 8-tap kernels, reading from src_start */ \
+    while (w >= 16) { \
+      vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \
+                                               src_stride, \
+                                               dst, \
+                                               dst_stride, \
+                                               h, \
+                                               filter); \
+      src += 16; \
+      dst += 16; \
+      w -= 16; \
+    } \
+    while (w >= 8) { \
+      vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \
+                                              src_stride, \
+                                              dst, \
+                                              dst_stride, \
+                                              h, \
+                                              filter); \
+      src += 8; \
+      dst += 8; \
+      w -= 8; \
+    } \
+    while (w >= 4) { \
+      vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \
+                                              src_stride, \
+                                              dst, \
+                                              dst_stride, \
+                                              h, \
+                                              filter); \
+      src += 4; \
+      dst += 4; \
+      w -= 4; \
+    } \
+  } else { /* leading taps all zero: 2-tap kernels, reading from src */ \
+    while (w >= 16) { \
+      vpx_filter_block1d16_##dir##2_##avg##opt(src, \
+                                               src_stride, \
+                                               dst, \
+                                               dst_stride, \
+                                               h, \
+                                               filter); \
+      src += 16; \
+      dst += 16; \
+      w -= 16; \
+    } \
+    while (w >= 8) { \
+      vpx_filter_block1d8_##dir##2_##avg##opt(src, \
+                                              src_stride, \
+                                              dst, \
+                                              dst_stride, \
+                                              h, \
+                                              filter); \
+      src += 8; \
+      dst += 8; \
+      w -= 8; \
+    } \
+    while (w >= 4) { \
+      vpx_filter_block1d4_##dir##2_##avg##opt(src, \
+                                              src_stride, \
+                                              dst, \
+                                              dst_stride, \
+                                              h, \
+                                              filter); \
+      src += 4; \
+      dst += 4; \
+      w -= 4; \
+    } \
+  } \
+}
+/* FUN_CONV_2D defines vpx_convolve8_<avg><opt>(): a horizontal pass into a scratch buffer of pitch 64, then a vertical pass from it into dst. */
+#define FUN_CONV_2D(avg, opt) \
+void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                              uint8_t *dst, ptrdiff_t dst_stride, \
+                              const int16_t *filter_x, int x_step_q4, \
+                              const int16_t *filter_y, int y_step_q4, \
+                              int w, int h) { \
+  assert(filter_x[3] != 128); \
+  assert(filter_y[3] != 128); \
+  assert(w <= 64); /* the scratch rows below hold 64 columns */ \
+  assert(h <= 64); /* the scratch buffers below hold at most 71 (or 65) rows */ \
+  assert(x_step_q4 == 16); \
+  assert(y_step_q4 == 16); \
+  if (filter_x[0] || filter_x[1] || filter_x[2]|| \
+      filter_y[0] || filter_y[1] || filter_y[2]) { \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); /* h + 7 <= 71 rows */ \
+    vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, /* starts 3 rows above src */ \
+                              filter_x, x_step_q4, filter_y, y_step_q4, \
+                              w, h + 7); \
+    vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, /* starts at scratch row 3 */ \
+                                    filter_x, x_step_q4, filter_y, \
+                                    y_step_q4, w, h); \
+  } else { \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); /* h + 1 <= 65 rows */ \
+    vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+                              filter_x, x_step_q4, filter_y, y_step_q4, \
+                              w, h + 1); \
+    vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+                                    filter_x, x_step_q4, filter_y, \
+                                    y_step_q4, w, h); \
+  } \
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+/* 16-bit-sample counterpart of filter8_1dfunction with a trailing bd argument (presumably bit depth - confirm). */
+typedef void highbd_filter8_1dfunction (
+  const uint16_t *src_ptr,
+  const ptrdiff_t src_pitch,
+  uint16_t *output_ptr,
+  ptrdiff_t out_pitch,
+  unsigned int output_height,
+  const int16_t *filter,
+  int bd
+);
+/* HIGH_FUN_CONV_1D defines vpx_highbd_convolve8_<name>_<opt>(): SIMD strips of 16, 8, then 4 columns; the C version handles whatever columns remain. */
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+  void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
+                                           ptrdiff_t src_stride, \
+                                           uint8_t *dst8, \
+                                           ptrdiff_t dst_stride, \
+                                           const int16_t *filter_x, \
+                                           int x_step_q4, \
+                                           const int16_t *filter_y, \
+                                           int y_step_q4, \
+                                           int w, int h, int bd) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); /* function scope: the C fallback resumes from these */ \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  if (step_q4 == 16 && filter[3] != 128) { /* SIMD path only for unit step and filter[3] != 128 */ \
+    if (filter[0] || filter[1] || filter[2]) { /* any nonzero leading tap: 8-tap kernels */ \
+      while (w >= 16) { \
+        vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
+                                                        src_stride, \
+                                                        dst, \
+                                                        dst_stride, \
+                                                        h, \
+                                                        filter, \
+                                                        bd); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } else { /* leading taps all zero: 2-tap kernels */ \
+      while (w >= 16) { \
+        vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
+                                                        src_stride, \
+                                                        dst, \
+                                                        dst_stride, \
+                                                        h, \
+                                                        filter, \
+                                                        bd); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } \
+  } \
+  if (w) { /* whole block (SIMD path skipped) or leftover columns: continue at the advanced src/dst */ \
+    vpx_highbd_convolve8_##name##_c(CONVERT_TO_BYTEPTR(src), src_stride, \
+                                    CONVERT_TO_BYTEPTR(dst), dst_stride, filter_x, \
+                                    x_step_q4, filter_y, y_step_q4, w, h, bd); \
+  } \
+}
+/* HIGH_FUN_CONV_2D defines vpx_highbd_convolve8_<avg><opt>(): two 1-D passes through a 16-bit scratch buffer when both steps are 16, otherwise the C version. */
+#define HIGH_FUN_CONV_2D(avg, opt) \
+void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                                     uint8_t *dst, ptrdiff_t dst_stride, \
+                                     const int16_t *filter_x, int x_step_q4, \
+                                     const int16_t *filter_y, int y_step_q4, \
+                                     int w, int h, int bd) { \
+  assert(w <= 64); \
+  assert(h <= 64); \
+  if (x_step_q4 == 16 && y_step_q4 == 16) { \
+    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); /* h + 7 <= 71 rows of pitch 64 */ \
+      vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                       filter_x, x_step_q4, \
+                                       filter_y, y_step_q4, \
+                                       w, h + 7, bd); \
+      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, /* 192 = 3 * 64; presumably skips the 3 lead-in rows - confirm pointer units */ \
+                                             64, dst, dst_stride, \
+                                             filter_x, x_step_q4, \
+                                             filter_y, y_step_q4, \
+                                             w, h, bd); \
+    } else { \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); /* h + 1 <= 65 rows of pitch 64 */ \
+      vpx_highbd_convolve8_horiz_##opt(src, src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                       filter_x, x_step_q4, \
+                                       filter_y, y_step_q4, \
+                                       w, h + 1, bd); \
+      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                             dst, dst_stride, \
+                                             filter_x, x_step_q4, \
+                                             filter_y, y_step_q4, \
+                                             w, h, bd); \
+    } \
+  } else { /* a step other than 16: hand the whole block to the C version */ \
+    vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+                                  filter_x, x_step_q4, filter_y, y_step_q4, w, \
+                                  h, bd); \
+  } \
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // VPX_DSP_X86_CONVOLVE_H_
diff --git a/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
new file mode 100644
index 0000000000..4df39dff86
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -0,0 +1,2711 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+
+#include "vpx_dsp/txfm_common.h"
+// pair256_set_epi16(a, b): every 32-bit lane holds a in its low 16 bits and b in its high 16 bits.
+#define pair256_set_epi16(a, b) \
+  _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+// pair256_set_epi32(a, b): every 64-bit lane holds a in its low 32 bits and b in its high 32 bits.
+#define pair256_set_epi32(a, b) \
+  _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), \
+                   (int)(b), (int)(a), (int)(b), (int)(a))
+
+#if FDCT32x32_HIGH_PRECISION
+// Per 64-bit lane: lo32(a) * lo32(b) + hi32(a) * hi32(b), where both
+// products are the unsigned 32x32->64 results of _mm256_mul_epu32.
+static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
+  const __m256i lo_prod = _mm256_mul_epu32(a, b);
+  const __m256i hi_prod =
+      _mm256_mul_epu32(_mm256_srli_epi64(a, 32), _mm256_srli_epi64(b, 32));
+  return _mm256_add_epi64(lo_prod, hi_prod);
+}
+// Keeps the low 32 bits of each 64-bit element: each 128-bit lane becomes a0, a1, b0, b1.
+static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
+  return _mm256_unpacklo_epi64(
+      _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)),
+      _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)));
+}
+#endif
+
+void FDCT32x32_2D_AVX2(const int16_t *input,
+                  int16_t *output_org, int stride) {
+  // Calculate pre-multiplied strides
+  const int str1 = stride;
+  const int str2 = 2 * stride;
+  const int str3 = 2 * stride + str1;
+  // We need an intermediate buffer between passes.
+  DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+  const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64);
+  const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64,   cospi_24_64);
+  const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64,  cospi_8_64);
+  const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64,  cospi_20_64);
+  const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64,  cospi_12_64);
+  const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64,   cospi_28_64);
+  const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64,  cospi_4_64);
+  const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+  const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64,  cospi_2_64);
+  const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64,  cospi_18_64);
+  const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64,  cospi_10_64);
+  const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64,   cospi_26_64);
+  const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64,  cospi_6_64);
+  const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64,  cospi_22_64);
+  const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64,  cospi_14_64);
+  const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64,   cospi_30_64);
+  const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64,  cospi_1_64);
+  const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64,  cospi_17_64);
+  const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64,  cospi_9_64);
+  const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64,   cospi_25_64);
+  const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64,  cospi_7_64);
+  const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64,   cospi_23_64);
+  const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64,  cospi_15_64);
+  const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64,   cospi_31_64);
+  const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64,  cospi_5_64);
+  const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64,  cospi_21_64);
+  const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64,  cospi_13_64);
+  const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64,   cospi_29_64);
+  const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64,  cospi_3_64);
+  const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64,  cospi_19_64);
+  const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64,  cospi_11_64);
+  const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64,   cospi_27_64);
+  const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  const __m256i kZero = _mm256_set1_epi16(0);
+  const __m256i kOne  = _mm256_set1_epi16(1);
+  // Do the two transform/transpose passes
+  int pass;
+  for (pass = 0; pass < 2; ++pass) {
+    // We process sixteen columns (transposed rows in second pass) at a time.
+    int column_start;
+    for (column_start = 0; column_start < 32; column_start += 16) {
+      __m256i step1[32];
+      __m256i step2[32];
+      __m256i step3[32];
+      __m256i out[32];
+      // Stage 1
+      // Note: even though all the loads below are aligned, using the aligned
+      //       intrinsic make the code slightly slower.
+      if (0 == pass) {
+        const int16_t *in  = &input[column_start];
+        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
+        // Note: the next four blocks could be in a loop. That would help the
+        //       instruction cache but is actually slower.
+        {
+          const int16_t *ina =  in +  0 * str1;
+          const int16_t *inb =  in + 31 * str1;
+          __m256i *step1a = &step1[ 0];
+          __m256i *step1b = &step1[31];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in +  4 * str1;
+          const int16_t *inb =  in + 27 * str1;
+          __m256i *step1a = &step1[ 4];
+          __m256i *step1b = &step1[27];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in +  8 * str1;
+          const int16_t *inb =  in + 23 * str1;
+          __m256i *step1a = &step1[ 8];
+          __m256i *step1b = &step1[23];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in + 12 * str1;
+          const int16_t *inb =  in + 19 * str1;
+          __m256i *step1a = &step1[12];
+          __m256i *step1b = &step1[19];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+      } else {
+        int16_t *in = &intermediate[column_start];
+        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
+        // Note: using the same approach as above to have common offset is
+        //       counter-productive as all offsets can be calculated at compile
+        //       time.
+        // Note: the next four blocks could be in a loop. That would help the
+        //       instruction cache but is actually slower.
+        {
+          __m256i in00  = _mm256_loadu_si256((const __m256i *)(in +  0 * 32));
+          __m256i in01  = _mm256_loadu_si256((const __m256i *)(in +  1 * 32));
+          __m256i in02  = _mm256_loadu_si256((const __m256i *)(in +  2 * 32));
+          __m256i in03  = _mm256_loadu_si256((const __m256i *)(in +  3 * 32));
+          __m256i in28  = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
+          __m256i in29  = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
+          __m256i in30  = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
+          __m256i in31  = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
+          step1[ 0] = _mm256_add_epi16(in00, in31);
+          step1[ 1] = _mm256_add_epi16(in01, in30);
+          step1[ 2] = _mm256_add_epi16(in02, in29);
+          step1[ 3] = _mm256_add_epi16(in03, in28);
+          step1[28] = _mm256_sub_epi16(in03, in28);
+          step1[29] = _mm256_sub_epi16(in02, in29);
+          step1[30] = _mm256_sub_epi16(in01, in30);
+          step1[31] = _mm256_sub_epi16(in00, in31);
+        }
+        {
+          __m256i in04  = _mm256_loadu_si256((const __m256i *)(in +  4 * 32));
+          __m256i in05  = _mm256_loadu_si256((const __m256i *)(in +  5 * 32));
+          __m256i in06  = _mm256_loadu_si256((const __m256i *)(in +  6 * 32));
+          __m256i in07  = _mm256_loadu_si256((const __m256i *)(in +  7 * 32));
+          __m256i in24  = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
+          __m256i in25  = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
+          __m256i in26  = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
+          __m256i in27  = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
+          step1[ 4] = _mm256_add_epi16(in04, in27);
+          step1[ 5] = _mm256_add_epi16(in05, in26);
+          step1[ 6] = _mm256_add_epi16(in06, in25);
+          step1[ 7] = _mm256_add_epi16(in07, in24);
+          step1[24] = _mm256_sub_epi16(in07, in24);
+          step1[25] = _mm256_sub_epi16(in06, in25);
+          step1[26] = _mm256_sub_epi16(in05, in26);
+          step1[27] = _mm256_sub_epi16(in04, in27);
+        }
+        {
+          __m256i in08  = _mm256_loadu_si256((const __m256i *)(in +  8 * 32));
+          __m256i in09  = _mm256_loadu_si256((const __m256i *)(in +  9 * 32));
+          __m256i in10  = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
+          __m256i in11  = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
+          __m256i in20  = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
+          __m256i in21  = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
+          __m256i in22  = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
+          __m256i in23  = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
+          step1[ 8] = _mm256_add_epi16(in08, in23);
+          step1[ 9] = _mm256_add_epi16(in09, in22);
+          step1[10] = _mm256_add_epi16(in10, in21);
+          step1[11] = _mm256_add_epi16(in11, in20);
+          step1[20] = _mm256_sub_epi16(in11, in20);
+          step1[21] = _mm256_sub_epi16(in10, in21);
+          step1[22] = _mm256_sub_epi16(in09, in22);
+          step1[23] = _mm256_sub_epi16(in08, in23);
+        }
+        {
+          __m256i in12  = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
+          __m256i in13  = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
+          __m256i in14  = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
+          __m256i in15  = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
+          __m256i in16  = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
+          __m256i in17  = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
+          __m256i in18  = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
+          __m256i in19  = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
+          step1[12] = _mm256_add_epi16(in12, in19);
+          step1[13] = _mm256_add_epi16(in13, in18);
+          step1[14] = _mm256_add_epi16(in14, in17);
+          step1[15] = _mm256_add_epi16(in15, in16);
+          step1[16] = _mm256_sub_epi16(in15, in16);
+          step1[17] = _mm256_sub_epi16(in14, in17);
+          step1[18] = _mm256_sub_epi16(in13, in18);
+          step1[19] = _mm256_sub_epi16(in12, in19);
+        }
+      }
+      // Stage 2
+      {
+        step2[ 0] = _mm256_add_epi16(step1[0], step1[15]);
+        step2[ 1] = _mm256_add_epi16(step1[1], step1[14]);
+        step2[ 2] = _mm256_add_epi16(step1[2], step1[13]);
+        step2[ 3] = _mm256_add_epi16(step1[3], step1[12]);
+        step2[ 4] = _mm256_add_epi16(step1[4], step1[11]);
+        step2[ 5] = _mm256_add_epi16(step1[5], step1[10]);
+        step2[ 6] = _mm256_add_epi16(step1[6], step1[ 9]);
+        step2[ 7] = _mm256_add_epi16(step1[7], step1[ 8]);
+        step2[ 8] = _mm256_sub_epi16(step1[7], step1[ 8]);
+        step2[ 9] = _mm256_sub_epi16(step1[6], step1[ 9]);
+        step2[10] = _mm256_sub_epi16(step1[5], step1[10]);
+        step2[11] = _mm256_sub_epi16(step1[4], step1[11]);
+        step2[12] = _mm256_sub_epi16(step1[3], step1[12]);
+        step2[13] = _mm256_sub_epi16(step1[2], step1[13]);
+        step2[14] = _mm256_sub_epi16(step1[1], step1[14]);
+        step2[15] = _mm256_sub_epi16(step1[0], step1[15]);
+      }
+      {
+        const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]);
+        const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]);
+        const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]);
+        const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]);
+        const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]);
+        const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]);
+        const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]);
+        const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]);
+        const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16);
+        const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16);
+        const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16);
+        const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16);
+        const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16);
+        const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16);
+        const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16);
+        const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16);
+        const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16);
+        const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16);
+        const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16);
+        const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16);
+        const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16);
+        const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16);
+        const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16);
+        const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m256i s2_20_4 = _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_20_5 = _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_21_4 = _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_21_5 = _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_22_4 = _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_22_5 = _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_23_4 = _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_23_5 = _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_24_4 = _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_24_5 = _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_25_4 = _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_25_5 = _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_26_4 = _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_26_5 = _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_27_4 = _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_27_5 = _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS);
+        const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS);
+        const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS);
+        const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS);
+        const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS);
+        const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS);
+        const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS);
+        const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS);
+        const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS);
+        const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS);
+        const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS);
+        const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS);
+        const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS);
+        const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS);
+        const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS);
+        const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS);
+        // Combine
+        step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7);
+        step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7);
+        step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7);
+        step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7);
+        step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7);
+        step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7);
+        step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7);
+        step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7);
+      }
+
+#if !FDCT32x32_HIGH_PRECISION
+      // Scale the magnitude down by 4 (right shift by 2 below), so that the
+      // intermediate values stay within the range of 16 bits.
+      if (1 == pass) {
+        __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero,step2[ 0]);
+        __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero,step2[ 1]);
+        __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero,step2[ 2]);
+        __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero,step2[ 3]);
+        __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero,step2[ 4]);
+        __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero,step2[ 5]);
+        __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero,step2[ 6]);
+        __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero,step2[ 7]);
+        __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero,step2[ 8]);
+        __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero,step2[ 9]);
+        __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero,step2[10]);
+        __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero,step2[11]);
+        __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero,step2[12]);
+        __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero,step2[13]);
+        __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero,step2[14]);
+        __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero,step2[15]);
+        __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero,step1[16]);
+        __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero,step1[17]);
+        __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero,step1[18]);
+        __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero,step1[19]);
+        __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero,step2[20]);
+        __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero,step2[21]);
+        __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero,step2[22]);
+        __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero,step2[23]);
+        __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero,step2[24]);
+        __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero,step2[25]);
+        __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero,step2[26]);
+        __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero,step2[27]);
+        __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero,step1[28]);
+        __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero,step1[29]);
+        __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero,step1[30]);
+        __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero,step1[31]);
+
+        step2[ 0] = _mm256_sub_epi16(step2[ 0], s3_00_0);
+        step2[ 1] = _mm256_sub_epi16(step2[ 1], s3_01_0);
+        step2[ 2] = _mm256_sub_epi16(step2[ 2], s3_02_0);
+        step2[ 3] = _mm256_sub_epi16(step2[ 3], s3_03_0);
+        step2[ 4] = _mm256_sub_epi16(step2[ 4], s3_04_0);
+        step2[ 5] = _mm256_sub_epi16(step2[ 5], s3_05_0);
+        step2[ 6] = _mm256_sub_epi16(step2[ 6], s3_06_0);
+        step2[ 7] = _mm256_sub_epi16(step2[ 7], s3_07_0);
+        step2[ 8] = _mm256_sub_epi16(step2[ 8], s2_08_0);
+        step2[ 9] = _mm256_sub_epi16(step2[ 9], s2_09_0);
+        step2[10] = _mm256_sub_epi16(step2[10], s3_10_0);
+        step2[11] = _mm256_sub_epi16(step2[11], s3_11_0);
+        step2[12] = _mm256_sub_epi16(step2[12], s3_12_0);
+        step2[13] = _mm256_sub_epi16(step2[13], s3_13_0);
+        step2[14] = _mm256_sub_epi16(step2[14], s2_14_0);
+        step2[15] = _mm256_sub_epi16(step2[15], s2_15_0);
+        step1[16] = _mm256_sub_epi16(step1[16], s3_16_0);
+        step1[17] = _mm256_sub_epi16(step1[17], s3_17_0);
+        step1[18] = _mm256_sub_epi16(step1[18], s3_18_0);
+        step1[19] = _mm256_sub_epi16(step1[19], s3_19_0);
+        step2[20] = _mm256_sub_epi16(step2[20], s3_20_0);
+        step2[21] = _mm256_sub_epi16(step2[21], s3_21_0);
+        step2[22] = _mm256_sub_epi16(step2[22], s3_22_0);
+        step2[23] = _mm256_sub_epi16(step2[23], s3_23_0);
+        step2[24] = _mm256_sub_epi16(step2[24], s3_24_0);
+        step2[25] = _mm256_sub_epi16(step2[25], s3_25_0);
+        step2[26] = _mm256_sub_epi16(step2[26], s3_26_0);
+        step2[27] = _mm256_sub_epi16(step2[27], s3_27_0);
+        step1[28] = _mm256_sub_epi16(step1[28], s3_28_0);
+        step1[29] = _mm256_sub_epi16(step1[29], s3_29_0);
+        step1[30] = _mm256_sub_epi16(step1[30], s3_30_0);
+        step1[31] = _mm256_sub_epi16(step1[31], s3_31_0);
+
+        step2[ 0] = _mm256_add_epi16(step2[ 0], kOne);
+        step2[ 1] = _mm256_add_epi16(step2[ 1], kOne);
+        step2[ 2] = _mm256_add_epi16(step2[ 2], kOne);
+        step2[ 3] = _mm256_add_epi16(step2[ 3], kOne);
+        step2[ 4] = _mm256_add_epi16(step2[ 4], kOne);
+        step2[ 5] = _mm256_add_epi16(step2[ 5], kOne);
+        step2[ 6] = _mm256_add_epi16(step2[ 6], kOne);
+        step2[ 7] = _mm256_add_epi16(step2[ 7], kOne);
+        step2[ 8] = _mm256_add_epi16(step2[ 8], kOne);
+        step2[ 9] = _mm256_add_epi16(step2[ 9], kOne);
+        step2[10] = _mm256_add_epi16(step2[10], kOne);
+        step2[11] = _mm256_add_epi16(step2[11], kOne);
+        step2[12] = _mm256_add_epi16(step2[12], kOne);
+        step2[13] = _mm256_add_epi16(step2[13], kOne);
+        step2[14] = _mm256_add_epi16(step2[14], kOne);
+        step2[15] = _mm256_add_epi16(step2[15], kOne);
+        step1[16] = _mm256_add_epi16(step1[16], kOne);
+        step1[17] = _mm256_add_epi16(step1[17], kOne);
+        step1[18] = _mm256_add_epi16(step1[18], kOne);
+        step1[19] = _mm256_add_epi16(step1[19], kOne);
+        step2[20] = _mm256_add_epi16(step2[20], kOne);
+        step2[21] = _mm256_add_epi16(step2[21], kOne);
+        step2[22] = _mm256_add_epi16(step2[22], kOne);
+        step2[23] = _mm256_add_epi16(step2[23], kOne);
+        step2[24] = _mm256_add_epi16(step2[24], kOne);
+        step2[25] = _mm256_add_epi16(step2[25], kOne);
+        step2[26] = _mm256_add_epi16(step2[26], kOne);
+        step2[27] = _mm256_add_epi16(step2[27], kOne);
+        step1[28] = _mm256_add_epi16(step1[28], kOne);
+        step1[29] = _mm256_add_epi16(step1[29], kOne);
+        step1[30] = _mm256_add_epi16(step1[30], kOne);
+        step1[31] = _mm256_add_epi16(step1[31], kOne);
+
+        step2[ 0] = _mm256_srai_epi16(step2[ 0], 2);
+        step2[ 1] = _mm256_srai_epi16(step2[ 1], 2);
+        step2[ 2] = _mm256_srai_epi16(step2[ 2], 2);
+        step2[ 3] = _mm256_srai_epi16(step2[ 3], 2);
+        step2[ 4] = _mm256_srai_epi16(step2[ 4], 2);
+        step2[ 5] = _mm256_srai_epi16(step2[ 5], 2);
+        step2[ 6] = _mm256_srai_epi16(step2[ 6], 2);
+        step2[ 7] = _mm256_srai_epi16(step2[ 7], 2);
+        step2[ 8] = _mm256_srai_epi16(step2[ 8], 2);
+        step2[ 9] = _mm256_srai_epi16(step2[ 9], 2);
+        step2[10] = _mm256_srai_epi16(step2[10], 2);
+        step2[11] = _mm256_srai_epi16(step2[11], 2);
+        step2[12] = _mm256_srai_epi16(step2[12], 2);
+        step2[13] = _mm256_srai_epi16(step2[13], 2);
+        step2[14] = _mm256_srai_epi16(step2[14], 2);
+        step2[15] = _mm256_srai_epi16(step2[15], 2);
+        step1[16] = _mm256_srai_epi16(step1[16], 2);
+        step1[17] = _mm256_srai_epi16(step1[17], 2);
+        step1[18] = _mm256_srai_epi16(step1[18], 2);
+        step1[19] = _mm256_srai_epi16(step1[19], 2);
+        step2[20] = _mm256_srai_epi16(step2[20], 2);
+        step2[21] = _mm256_srai_epi16(step2[21], 2);
+        step2[22] = _mm256_srai_epi16(step2[22], 2);
+        step2[23] = _mm256_srai_epi16(step2[23], 2);
+        step2[24] = _mm256_srai_epi16(step2[24], 2);
+        step2[25] = _mm256_srai_epi16(step2[25], 2);
+        step2[26] = _mm256_srai_epi16(step2[26], 2);
+        step2[27] = _mm256_srai_epi16(step2[27], 2);
+        step1[28] = _mm256_srai_epi16(step1[28], 2);
+        step1[29] = _mm256_srai_epi16(step1[29], 2);
+        step1[30] = _mm256_srai_epi16(step1[30], 2);
+        step1[31] = _mm256_srai_epi16(step1[31], 2);
+      }
+#endif
+
+#if FDCT32x32_HIGH_PRECISION
+      if (pass == 0) {
+#endif
+      // Stage 3
+      {
+        step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
+        step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
+        step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
+        step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
+        step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
+        step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
+        step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
+        step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
+      }
+      {
+        const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+        const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+        const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+        const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+        const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+        const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+        const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+        const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+        const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+        const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+        const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+        const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+        const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+        const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+        const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+        const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+        const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+        const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+        const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+        // Combine
+        step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
+        step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
+        step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
+        step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
+      }
+      {
+        step3[16] = _mm256_add_epi16(step2[23], step1[16]);
+        step3[17] = _mm256_add_epi16(step2[22], step1[17]);
+        step3[18] = _mm256_add_epi16(step2[21], step1[18]);
+        step3[19] = _mm256_add_epi16(step2[20], step1[19]);
+        step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
+        step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
+        step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
+        step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
+        step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
+        step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
+        step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
+        step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
+        step3[28] = _mm256_add_epi16(step2[27], step1[28]);
+        step3[29] = _mm256_add_epi16(step2[26], step1[29]);
+        step3[30] = _mm256_add_epi16(step2[25], step1[30]);
+        step3[31] = _mm256_add_epi16(step2[24], step1[31]);
+      }
+
+      // Stage 4
+      {
+        step1[ 0] = _mm256_add_epi16(step3[ 3], step3[ 0]);
+        step1[ 1] = _mm256_add_epi16(step3[ 2], step3[ 1]);
+        step1[ 2] = _mm256_sub_epi16(step3[ 1], step3[ 2]);
+        step1[ 3] = _mm256_sub_epi16(step3[ 0], step3[ 3]);
+        step1[ 8] = _mm256_add_epi16(step3[11], step2[ 8]);
+        step1[ 9] = _mm256_add_epi16(step3[10], step2[ 9]);
+        step1[10] = _mm256_sub_epi16(step2[ 9], step3[10]);
+        step1[11] = _mm256_sub_epi16(step2[ 8], step3[11]);
+        step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
+        step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
+        step1[14] = _mm256_add_epi16(step3[13], step2[14]);
+        step1[15] = _mm256_add_epi16(step3[12], step2[15]);
+      }
+      {
+        const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
+        const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
+        const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
+        const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
+        const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
+        const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m256i s1_05_4 = _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_05_5 = _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_06_4 = _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_06_5 = _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
+        const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
+        const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
+        const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
+        // Combine
+        step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
+        step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
+      }
+      {
+        const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
+        const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
+        const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
+        const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
+        const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
+        const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
+        const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
+        const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
+        const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
+        const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
+        const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
+        const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
+        const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
+        const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
+        const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
+        const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
+        const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
+        const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
+        const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
+        const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
+        const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
+        const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
+        const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
+        const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
+        // dct_const_round_shift
+        const __m256i s1_18_4 = _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_18_5 = _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_19_4 = _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_19_5 = _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_20_4 = _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_20_5 = _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_21_4 = _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_21_5 = _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_26_4 = _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_26_5 = _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_27_4 = _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_27_5 = _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_28_4 = _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_28_5 = _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_29_4 = _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_29_5 = _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
+        const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
+        const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
+        const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
+        const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
+        const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
+        const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
+        const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
+        const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
+        const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
+        const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
+        const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
+        const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
+        const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
+        const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
+        const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
+        // Combine
+        step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
+        step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
+        step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
+        step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
+        step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
+        step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
+        step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
+        step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
+      }
+      // Stage 5
+      {
+        step2[4] = _mm256_add_epi16(step1[5], step3[4]);
+        step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
+        step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
+        step2[7] = _mm256_add_epi16(step1[6], step3[7]);
+      }
+      {
+        const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
+        const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
+        const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
+        const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
+        const __m256i out_00_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
+        const __m256i out_00_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
+        const __m256i out_16_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
+        const __m256i out_16_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
+        const __m256i out_08_2 = _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
+        const __m256i out_08_3 = _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
+        const __m256i out_24_2 = _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
+        const __m256i out_24_3 = _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
+        // dct_const_round_shift
+        const __m256i out_00_4 = _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_00_5 = _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_16_4 = _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_16_5 = _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_08_4 = _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_08_5 = _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_24_4 = _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_24_5 = _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
+        const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
+        const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
+        const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
+        const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
+        const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
+        const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
+        const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
+        // Combine
+        out[ 0] = _mm256_packs_epi32(out_00_6, out_00_7);
+        out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
+        out[ 8] = _mm256_packs_epi32(out_08_6, out_08_7);
+        out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
+      }
+      {
+        const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[ 9], step1[14]);
+        const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[ 9], step1[14]);
+        const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
+        const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
+        const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
+        const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
+        const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
+        const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
+        const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
+        const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
+        const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
+        const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
+        // dct_const_round_shift
+        const __m256i s2_09_4 = _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_09_5 = _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_10_4 = _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_10_5 = _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_13_4 = _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_13_5 = _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_14_4 = _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_14_5 = _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
+        const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
+        const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
+        const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
+        const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
+        const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
+        const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
+        const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
+        // Combine
+        step2[ 9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
+        step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
+        step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
+        step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
+      }
+      {
+        step2[16] = _mm256_add_epi16(step1[19], step3[16]);
+        step2[17] = _mm256_add_epi16(step1[18], step3[17]);
+        step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
+        step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
+        step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
+        step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
+        step2[22] = _mm256_add_epi16(step1[21], step3[22]);
+        step2[23] = _mm256_add_epi16(step1[20], step3[23]);
+        step2[24] = _mm256_add_epi16(step1[27], step3[24]);
+        step2[25] = _mm256_add_epi16(step1[26], step3[25]);
+        step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
+        step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
+        step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
+        step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
+        step2[30] = _mm256_add_epi16(step1[29], step3[30]);
+        step2[31] = _mm256_add_epi16(step1[28], step3[31]);
+      }
+      // Stage 6
+      {
+        const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+        const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+        const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+        const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+        const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+        const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+        const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+        const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+        const __m256i out_04_2 = _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
+        const __m256i out_04_3 = _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
+        const __m256i out_20_2 = _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
+        const __m256i out_20_3 = _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
+        const __m256i out_12_2 = _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
+        const __m256i out_12_3 = _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
+        const __m256i out_28_2 = _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
+        const __m256i out_28_3 = _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
+        // dct_const_round_shift
+        const __m256i out_04_4 = _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_04_5 = _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_20_4 = _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_20_5 = _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_12_4 = _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_12_5 = _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_28_4 = _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_28_5 = _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
+        const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
+        const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
+        const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
+        const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
+        const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
+        const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
+        const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
+        // Combine
+        out[ 4] = _mm256_packs_epi32(out_04_6, out_04_7);
+        out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
+        out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
+        out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
+      }
+      {
+        step3[ 8] = _mm256_add_epi16(step2[ 9], step1[ 8]);
+        step3[ 9] = _mm256_sub_epi16(step1[ 8], step2[ 9]);
+        step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
+        step3[11] = _mm256_add_epi16(step2[10], step1[11]);
+        step3[12] = _mm256_add_epi16(step2[13], step1[12]);
+        step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
+        step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
+        step3[15] = _mm256_add_epi16(step2[14], step1[15]);
+      }
+      {
+        const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
+        const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
+        const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
+        const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
+        const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
+        const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
+        const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
+        const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
+        const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
+        const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
+        const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
+        const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
+        const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
+        const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
+        const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
+        const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
+        const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
+        const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
+        const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
+        const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
+        const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
+        const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
+        const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
+        const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
+        // dct_const_round_shift
+        const __m256i s3_17_4 = _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_17_5 = _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_18_4 = _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_18_5 = _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_21_4 = _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_21_5 = _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_22_4 = _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_22_5 = _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
+        const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
+        const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
+        const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
+        const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
+        const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
+        const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
+        const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
+        const __m256i s3_25_4 = _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_25_5 = _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_26_4 = _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_26_5 = _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_29_4 = _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_29_5 = _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_30_4 = _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_30_5 = _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
+        const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
+        const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
+        const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
+        const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
+        const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
+        const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
+        const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
+        // Combine
+        step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
+        step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
+        step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
+        step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
+        // Combine
+        step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
+        step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
+        step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
+        step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
+      }
+      // Stage 7
+      {
+        const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[ 8], step3[15]);
+        const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[ 8], step3[15]);
+        const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[ 9], step3[14]);
+        const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[ 9], step3[14]);
+        const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
+        const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
+        const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
+        const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
+        const __m256i out_02_2 = _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
+        const __m256i out_02_3 = _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
+        const __m256i out_18_2 = _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
+        const __m256i out_18_3 = _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
+        const __m256i out_10_2 = _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
+        const __m256i out_10_3 = _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
+        const __m256i out_26_2 = _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
+        const __m256i out_26_3 = _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
+        const __m256i out_06_2 = _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
+        const __m256i out_06_3 = _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
+        const __m256i out_22_2 = _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
+        const __m256i out_22_3 = _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
+        const __m256i out_14_2 = _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
+        const __m256i out_14_3 = _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
+        const __m256i out_30_2 = _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
+        const __m256i out_30_3 = _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
+        // dct_const_round_shift
+        const __m256i out_02_4 = _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_02_5 = _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_18_4 = _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_18_5 = _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_10_4 = _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_10_5 = _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_26_4 = _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_26_5 = _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_06_4 = _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_06_5 = _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_22_4 = _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_22_5 = _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_14_4 = _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_14_5 = _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_30_4 = _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_30_5 = _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
+        const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
+        const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
+        const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
+        const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
+        const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
+        const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
+        const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
+        const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
+        const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
+        const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
+        const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
+        const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
+        const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
+        const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
+        const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
+        // Combine
+        out[ 2] = _mm256_packs_epi32(out_02_6, out_02_7);
+        out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
+        out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
+        out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
+        out[ 6] = _mm256_packs_epi32(out_06_6, out_06_7);
+        out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
+        out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
+        out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
+      }
+      {
+        step1[16] = _mm256_add_epi16(step3[17], step2[16]);
+        step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
+        step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
+        step1[19] = _mm256_add_epi16(step3[18], step2[19]);
+        step1[20] = _mm256_add_epi16(step3[21], step2[20]);
+        step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
+        step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
+        step1[23] = _mm256_add_epi16(step3[22], step2[23]);
+        step1[24] = _mm256_add_epi16(step3[25], step2[24]);
+        step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
+        step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
+        step1[27] = _mm256_add_epi16(step3[26], step2[27]);
+        step1[28] = _mm256_add_epi16(step3[29], step2[28]);
+        step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
+        step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
+        step1[31] = _mm256_add_epi16(step3[30], step2[31]);
+      }
+      // Final stage --- output indices are bit-reversed.
+      {
+        const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
+        const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
+        const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
+        const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
+        const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
+        const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
+        const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
+        const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
+        const __m256i out_01_2 = _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
+        const __m256i out_01_3 = _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
+        const __m256i out_17_2 = _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
+        const __m256i out_17_3 = _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
+        const __m256i out_09_2 = _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
+        const __m256i out_09_3 = _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
+        const __m256i out_25_2 = _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
+        const __m256i out_25_3 = _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
+        const __m256i out_07_2 = _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
+        const __m256i out_07_3 = _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
+        const __m256i out_23_2 = _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
+        const __m256i out_23_3 = _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
+        const __m256i out_15_2 = _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
+        const __m256i out_15_3 = _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
+        const __m256i out_31_2 = _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
+        const __m256i out_31_3 = _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
+        // dct_const_round_shift
+        const __m256i out_01_4 = _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_01_5 = _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_17_4 = _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_17_5 = _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_09_4 = _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_09_5 = _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_25_4 = _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_25_5 = _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_07_4 = _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_07_5 = _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_23_4 = _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_23_5 = _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_15_4 = _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_15_5 = _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_31_4 = _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_31_5 = _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
+        const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
+        const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
+        const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
+        const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
+        const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
+        const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
+        const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
+        const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
+        const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
+        const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
+        const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
+        const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
+        const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
+        const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
+        const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
+        // Combine
+        out[ 1] = _mm256_packs_epi32(out_01_6, out_01_7);
+        out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
+        out[ 9] = _mm256_packs_epi32(out_09_6, out_09_7);
+        out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
+        out[ 7] = _mm256_packs_epi32(out_07_6, out_07_7);
+        out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
+        out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
+        out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
+      }
+      {
+        const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
+        const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
+        const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
+        const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
+        const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
+        const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
+        const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
+        const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
+        const __m256i out_05_2 = _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
+        const __m256i out_05_3 = _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
+        const __m256i out_21_2 = _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
+        const __m256i out_21_3 = _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
+        const __m256i out_13_2 = _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
+        const __m256i out_13_3 = _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
+        const __m256i out_29_2 = _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
+        const __m256i out_29_3 = _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
+        const __m256i out_03_2 = _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
+        const __m256i out_03_3 = _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
+        const __m256i out_19_2 = _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
+        const __m256i out_19_3 = _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
+        const __m256i out_11_2 = _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
+        const __m256i out_11_3 = _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
+        const __m256i out_27_2 = _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
+        const __m256i out_27_3 = _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
+        // dct_const_round_shift
+        const __m256i out_05_4 = _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_05_5 = _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_21_4 = _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_21_5 = _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_13_4 = _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_13_5 = _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_29_4 = _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_29_5 = _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_03_4 = _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_03_5 = _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_19_4 = _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_19_5 = _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_11_4 = _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_11_5 = _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_27_4 = _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_27_5 = _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
+        const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
+        const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
+        const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
+        const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
+        const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
+        const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
+        const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
+        const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
+        const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
+        const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
+        const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
+        const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
+        const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
+        const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
+        const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
+        // Combine
+        out[ 5] = _mm256_packs_epi32(out_05_6, out_05_7);
+        out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
+        out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
+        out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
+        out[ 3] = _mm256_packs_epi32(out_03_6, out_03_7);
+        out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
+        out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
+        out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
+      }
+#if FDCT32x32_HIGH_PRECISION
+      } else {
+        __m256i lstep1[64], lstep2[64], lstep3[64];
+        __m256i u[32], v[32], sign[16];
+        const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+        // start using 32-bit operations
+        // stage 3
+        {
+          // expanding to 32-bit length prior to addition operations
+          lstep2[ 0] = _mm256_unpacklo_epi16(step2[ 0], kZero);
+          lstep2[ 1] = _mm256_unpackhi_epi16(step2[ 0], kZero);
+          lstep2[ 2] = _mm256_unpacklo_epi16(step2[ 1], kZero);
+          lstep2[ 3] = _mm256_unpackhi_epi16(step2[ 1], kZero);
+          lstep2[ 4] = _mm256_unpacklo_epi16(step2[ 2], kZero);
+          lstep2[ 5] = _mm256_unpackhi_epi16(step2[ 2], kZero);
+          lstep2[ 6] = _mm256_unpacklo_epi16(step2[ 3], kZero);
+          lstep2[ 7] = _mm256_unpackhi_epi16(step2[ 3], kZero);
+          lstep2[ 8] = _mm256_unpacklo_epi16(step2[ 4], kZero);
+          lstep2[ 9] = _mm256_unpackhi_epi16(step2[ 4], kZero);
+          lstep2[10] = _mm256_unpacklo_epi16(step2[ 5], kZero);
+          lstep2[11] = _mm256_unpackhi_epi16(step2[ 5], kZero);
+          lstep2[12] = _mm256_unpacklo_epi16(step2[ 6], kZero);
+          lstep2[13] = _mm256_unpackhi_epi16(step2[ 6], kZero);
+          lstep2[14] = _mm256_unpacklo_epi16(step2[ 7], kZero);
+          lstep2[15] = _mm256_unpackhi_epi16(step2[ 7], kZero);
+          lstep2[ 0] = _mm256_madd_epi16(lstep2[ 0], kOne);
+          lstep2[ 1] = _mm256_madd_epi16(lstep2[ 1], kOne);
+          lstep2[ 2] = _mm256_madd_epi16(lstep2[ 2], kOne);
+          lstep2[ 3] = _mm256_madd_epi16(lstep2[ 3], kOne);
+          lstep2[ 4] = _mm256_madd_epi16(lstep2[ 4], kOne);
+          lstep2[ 5] = _mm256_madd_epi16(lstep2[ 5], kOne);
+          lstep2[ 6] = _mm256_madd_epi16(lstep2[ 6], kOne);
+          lstep2[ 7] = _mm256_madd_epi16(lstep2[ 7], kOne);
+          lstep2[ 8] = _mm256_madd_epi16(lstep2[ 8], kOne);
+          lstep2[ 9] = _mm256_madd_epi16(lstep2[ 9], kOne);
+          lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne);
+          lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne);
+          lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne);
+          lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne);
+          lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne);
+          lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne);
+
+          lstep3[ 0] = _mm256_add_epi32(lstep2[14], lstep2[ 0]);
+          lstep3[ 1] = _mm256_add_epi32(lstep2[15], lstep2[ 1]);
+          lstep3[ 2] = _mm256_add_epi32(lstep2[12], lstep2[ 2]);
+          lstep3[ 3] = _mm256_add_epi32(lstep2[13], lstep2[ 3]);
+          lstep3[ 4] = _mm256_add_epi32(lstep2[10], lstep2[ 4]);
+          lstep3[ 5] = _mm256_add_epi32(lstep2[11], lstep2[ 5]);
+          lstep3[ 6] = _mm256_add_epi32(lstep2[ 8], lstep2[ 6]);
+          lstep3[ 7] = _mm256_add_epi32(lstep2[ 9], lstep2[ 7]);
+          lstep3[ 8] = _mm256_sub_epi32(lstep2[ 6], lstep2[ 8]);
+          lstep3[ 9] = _mm256_sub_epi32(lstep2[ 7], lstep2[ 9]);
+          lstep3[10] = _mm256_sub_epi32(lstep2[ 4], lstep2[10]);
+          lstep3[11] = _mm256_sub_epi32(lstep2[ 5], lstep2[11]);
+          lstep3[12] = _mm256_sub_epi32(lstep2[ 2], lstep2[12]);
+          lstep3[13] = _mm256_sub_epi32(lstep2[ 3], lstep2[13]);
+          lstep3[14] = _mm256_sub_epi32(lstep2[ 0], lstep2[14]);
+          lstep3[15] = _mm256_sub_epi32(lstep2[ 1], lstep2[15]);
+        }
+        {
+          const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+          const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+          const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+          const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+          const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+          const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+          const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+          const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+          const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+          const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+          const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+          const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+          // dct_const_round_shift
+          const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+          const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+          const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+          const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+          lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+          lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+          lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+          lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+          lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+          lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+          lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+          lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+        }
+        {
+          lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero);
+          lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero);
+          lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero);
+          lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero);
+          lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero);
+          lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero);
+          lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero);
+          lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero);
+          lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero);
+          lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero);
+          lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero);
+          lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero);
+          lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero);
+          lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero);
+          lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero);
+          lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero);
+          lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne);
+          lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne);
+          lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne);
+          lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne);
+          lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne);
+          lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne);
+          lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne);
+          lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne);
+          lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne);
+          lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne);
+          lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne);
+          lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne);
+          lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne);
+          lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne);
+          lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne);
+          lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne);
+
+          lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero);
+          lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero);
+          lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero);
+          lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero);
+          lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero);
+          lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero);
+          lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero);
+          lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero);
+          lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero);
+          lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero);
+          lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero);
+          lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero);
+          lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero);
+          lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero);
+          lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero);
+          lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero);
+          lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne);
+          lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne);
+          lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne);
+          lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne);
+          lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne);
+          lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne);
+          lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne);
+          lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne);
+          lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne);
+          lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne);
+          lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne);
+          lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne);
+          lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne);
+          lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne);
+          lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne);
+          lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne);
+
+          lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]);
+          lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]);
+
+          lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]);
+          lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]);
+          lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]);
+          lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]);
+          lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]);
+          lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]);
+          lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]);
+          lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]);
+          lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]);
+          lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]);
+          lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]);
+          lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]);
+          lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]);
+          lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]);
+          lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]);
+          lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]);
+          lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]);
+          lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]);
+          lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]);
+          lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]);
+          lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]);
+          lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]);
+          lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]);
+          lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]);
+          lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]);
+          lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]);
+          lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]);
+          lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]);
+          lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]);
+          lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]);
+        }
+
+        // stage 4
+        {
+          // expanding to 32-bit length prior to addition operations
+          lstep2[16] = _mm256_unpacklo_epi16(step2[ 8], kZero);
+          lstep2[17] = _mm256_unpackhi_epi16(step2[ 8], kZero);
+          lstep2[18] = _mm256_unpacklo_epi16(step2[ 9], kZero);
+          lstep2[19] = _mm256_unpackhi_epi16(step2[ 9], kZero);
+          lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero);
+          lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero);
+          lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero);
+          lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero);
+          lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne);
+          lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne);
+          lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne);
+          lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne);
+          lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne);
+          lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne);
+          lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne);
+          lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne);
+
+          lstep1[ 0] = _mm256_add_epi32(lstep3[ 6], lstep3[ 0]);
+          lstep1[ 1] = _mm256_add_epi32(lstep3[ 7], lstep3[ 1]);
+          lstep1[ 2] = _mm256_add_epi32(lstep3[ 4], lstep3[ 2]);
+          lstep1[ 3] = _mm256_add_epi32(lstep3[ 5], lstep3[ 3]);
+          lstep1[ 4] = _mm256_sub_epi32(lstep3[ 2], lstep3[ 4]);
+          lstep1[ 5] = _mm256_sub_epi32(lstep3[ 3], lstep3[ 5]);
+          lstep1[ 6] = _mm256_sub_epi32(lstep3[ 0], lstep3[ 6]);
+          lstep1[ 7] = _mm256_sub_epi32(lstep3[ 1], lstep3[ 7]);
+          lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]);
+          lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]);
+          lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]);
+          lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]);
+          lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]);
+          lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]);
+          lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]);
+          lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]);
+          lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]);
+          lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]);
+          lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]);
+          lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]);
+          lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]);
+          lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]);
+          lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]);
+          lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]);
+        }
+        {
+        // to be continued...
+        //
+        const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
+        const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
+
+        u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
+        u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
+        u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
+        u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+        // TODO(jingning): manually inline k_madd_epi32_avx2 to further hide
+        // instruction latency.
+        v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+        v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+        v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+        v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+        v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+        v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+        v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+        v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+
+        u[0] = k_packs_epi64_avx2(v[0], v[1]);
+        u[1] = k_packs_epi64_avx2(v[2], v[3]);
+        u[2] = k_packs_epi64_avx2(v[4], v[5]);
+        u[3] = k_packs_epi64_avx2(v[6], v[7]);
+
+        v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+        v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+        v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+        v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+        lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+        lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+        lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+        lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+        }
+        {
+          const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+          const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+          const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
+          u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]);
+          u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]);
+          u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]);
+          u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]);
+          u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]);
+          u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m08_p24);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m08_p24);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m08_p24);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m08_p24);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m08_p24);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m08_p24);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m08_p24);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m08_p24);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m24_m08);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m24_m08);
+          v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08);
+          v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08);
+          v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08);
+          v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08);
+          v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08);
+          v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m08_p24);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m08_p24);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_p24_p08);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_p24_p08);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_p24_p08);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_p24_p08);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_p24_p08);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_p24_p08);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_p24_p08);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_p24_p08);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          lstep1[36] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          lstep1[37] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          lstep1[38] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          lstep1[39] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          lstep1[40] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          lstep1[41] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          lstep1[42] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          lstep1[43] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          lstep1[52] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          lstep1[53] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+        }
+        // stage 5
+        {
+          lstep2[ 8] = _mm256_add_epi32(lstep1[10], lstep3[ 8]);
+          lstep2[ 9] = _mm256_add_epi32(lstep1[11], lstep3[ 9]);
+          lstep2[10] = _mm256_sub_epi32(lstep3[ 8], lstep1[10]);
+          lstep2[11] = _mm256_sub_epi32(lstep3[ 9], lstep1[11]);
+          lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]);
+          lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]);
+          lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]);
+          lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]);
+        }
+        {
+          const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
+          const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
+          const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+          const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+
+          u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]);
+          u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]);
+          u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]);
+          u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]);
+          u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]);
+          u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]);
+          u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]);
+          u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+          // TODO(jingning): manually inline k_madd_epi32_avx2 to further hide
+          // instruction latency.
+          v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+          v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+          v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+          v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+          v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+          v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+          v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+          v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+          v[ 8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+          v[ 9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+          v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08);
+          v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08);
+          v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+          v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+          v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+          v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+
+          u[0] = k_packs_epi64_avx2(v[0], v[1]);
+          u[1] = k_packs_epi64_avx2(v[2], v[3]);
+          u[2] = k_packs_epi64_avx2(v[4], v[5]);
+          u[3] = k_packs_epi64_avx2(v[6], v[7]);
+          u[4] = k_packs_epi64_avx2(v[8], v[9]);
+          u[5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
+          sign[0] = _mm256_cmpgt_epi32(kZero,u[0]);
+          sign[1] = _mm256_cmpgt_epi32(kZero,u[1]);
+          sign[2] = _mm256_cmpgt_epi32(kZero,u[2]);
+          sign[3] = _mm256_cmpgt_epi32(kZero,u[3]);
+          sign[4] = _mm256_cmpgt_epi32(kZero,u[4]);
+          sign[5] = _mm256_cmpgt_epi32(kZero,u[5]);
+          sign[6] = _mm256_cmpgt_epi32(kZero,u[6]);
+          sign[7] = _mm256_cmpgt_epi32(kZero,u[7]);
+
+          u[0] = _mm256_sub_epi32(u[0], sign[0]);
+          u[1] = _mm256_sub_epi32(u[1], sign[1]);
+          u[2] = _mm256_sub_epi32(u[2], sign[2]);
+          u[3] = _mm256_sub_epi32(u[3], sign[3]);
+          u[4] = _mm256_sub_epi32(u[4], sign[4]);
+          u[5] = _mm256_sub_epi32(u[5], sign[5]);
+          u[6] = _mm256_sub_epi32(u[6], sign[6]);
+          u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+          u[0] = _mm256_add_epi32(u[0], K32One);
+          u[1] = _mm256_add_epi32(u[1], K32One);
+          u[2] = _mm256_add_epi32(u[2], K32One);
+          u[3] = _mm256_add_epi32(u[3], K32One);
+          u[4] = _mm256_add_epi32(u[4], K32One);
+          u[5] = _mm256_add_epi32(u[5], K32One);
+          u[6] = _mm256_add_epi32(u[6], K32One);
+          u[7] = _mm256_add_epi32(u[7], K32One);
+
+          u[0] = _mm256_srai_epi32(u[0], 2);
+          u[1] = _mm256_srai_epi32(u[1], 2);
+          u[2] = _mm256_srai_epi32(u[2], 2);
+          u[3] = _mm256_srai_epi32(u[3], 2);
+          u[4] = _mm256_srai_epi32(u[4], 2);
+          u[5] = _mm256_srai_epi32(u[5], 2);
+          u[6] = _mm256_srai_epi32(u[6], 2);
+          u[7] = _mm256_srai_epi32(u[7], 2);
+
+          // Combine
+          out[ 0] = _mm256_packs_epi32(u[0], u[1]);
+          out[16] = _mm256_packs_epi32(u[2], u[3]);
+          out[ 8] = _mm256_packs_epi32(u[4], u[5]);
+          out[24] = _mm256_packs_epi32(u[6], u[7]);
+        }
+        {
+          const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+          const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+          const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+          u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]);
+          u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]);
+          u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]);
+          u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]);
+          u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]);
+          u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]);
+          u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]);
+          u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+          v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
+          v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
+          v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
+          v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
+          v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08);
+          v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08);
+          v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08);
+          v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08);
+          v[ 8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+          v[ 9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+          v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+          v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+          v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08);
+          v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08);
+          v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08);
+          v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08);
+
+          u[0] = k_packs_epi64_avx2(v[0], v[1]);
+          u[1] = k_packs_epi64_avx2(v[2], v[3]);
+          u[2] = k_packs_epi64_avx2(v[4], v[5]);
+          u[3] = k_packs_epi64_avx2(v[6], v[7]);
+          u[4] = k_packs_epi64_avx2(v[8], v[9]);
+          u[5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+          u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS);
+          lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS);
+          lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS);
+          lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS);
+          lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS);
+          lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS);
+          lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS);
+          lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS);
+        }
+        {
+          lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]);
+          lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]);
+          lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]);
+          lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]);
+          lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]);
+          lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]);
+          lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]);
+          lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]);
+          lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]);
+          lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]);
+          lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]);
+          lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]);
+          lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]);
+          lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]);
+          lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]);
+          lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]);
+          lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]);
+          lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]);
+          lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]);
+          lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]);
+          lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]);
+          lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]);
+          lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]);
+          lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]);
+          lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]);
+          lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]);
+          lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]);
+          lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]);
+          lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]);
+          lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]);
+          lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]);
+          lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]);
+        }
+        // stage 6
+        {
+          const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
+          const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
+          const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
+          const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
+
+          u[0] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+          u[1] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+          u[2] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+          u[3] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+          u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+          u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+          u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+          u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+          u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+          u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+          u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+          u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+          u[12] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+          u[13] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+          u[14] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+          u[15] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+
+          v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04);
+          v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04);
+          v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04);
+          v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04);
+          v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20);
+          v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20);
+          v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20);
+          v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+          v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28);
+          v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28);
+          v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28);
+          v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28);
+
+          u[0] = k_packs_epi64_avx2(v[0], v[1]);
+          u[1] = k_packs_epi64_avx2(v[2], v[3]);
+          u[2] = k_packs_epi64_avx2(v[4], v[5]);
+          u[3] = k_packs_epi64_avx2(v[6], v[7]);
+          u[4] = k_packs_epi64_avx2(v[8], v[9]);
+          u[5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
+          sign[0] = _mm256_cmpgt_epi32(kZero,u[0]);
+          sign[1] = _mm256_cmpgt_epi32(kZero,u[1]);
+          sign[2] = _mm256_cmpgt_epi32(kZero,u[2]);
+          sign[3] = _mm256_cmpgt_epi32(kZero,u[3]);
+          sign[4] = _mm256_cmpgt_epi32(kZero,u[4]);
+          sign[5] = _mm256_cmpgt_epi32(kZero,u[5]);
+          sign[6] = _mm256_cmpgt_epi32(kZero,u[6]);
+          sign[7] = _mm256_cmpgt_epi32(kZero,u[7]);
+
+          u[0] = _mm256_sub_epi32(u[0], sign[0]);
+          u[1] = _mm256_sub_epi32(u[1], sign[1]);
+          u[2] = _mm256_sub_epi32(u[2], sign[2]);
+          u[3] = _mm256_sub_epi32(u[3], sign[3]);
+          u[4] = _mm256_sub_epi32(u[4], sign[4]);
+          u[5] = _mm256_sub_epi32(u[5], sign[5]);
+          u[6] = _mm256_sub_epi32(u[6], sign[6]);
+          u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+          u[0] = _mm256_add_epi32(u[0], K32One);
+          u[1] = _mm256_add_epi32(u[1], K32One);
+          u[2] = _mm256_add_epi32(u[2], K32One);
+          u[3] = _mm256_add_epi32(u[3], K32One);
+          u[4] = _mm256_add_epi32(u[4], K32One);
+          u[5] = _mm256_add_epi32(u[5], K32One);
+          u[6] = _mm256_add_epi32(u[6], K32One);
+          u[7] = _mm256_add_epi32(u[7], K32One);
+
+          u[0] = _mm256_srai_epi32(u[0], 2);
+          u[1] = _mm256_srai_epi32(u[1], 2);
+          u[2] = _mm256_srai_epi32(u[2], 2);
+          u[3] = _mm256_srai_epi32(u[3], 2);
+          u[4] = _mm256_srai_epi32(u[4], 2);
+          u[5] = _mm256_srai_epi32(u[5], 2);
+          u[6] = _mm256_srai_epi32(u[6], 2);
+          u[7] = _mm256_srai_epi32(u[7], 2);
+
+          out[ 4] = _mm256_packs_epi32(u[0], u[1]);
+          out[20] = _mm256_packs_epi32(u[2], u[3]);
+          out[12] = _mm256_packs_epi32(u[4], u[5]);
+          out[28] = _mm256_packs_epi32(u[6], u[7]);
+        }
+        {
+          lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]);
+          lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]);
+          lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]);
+          lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]);
+          lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]);
+          lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]);
+          lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]);
+          lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]);
+          lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]);
+          lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]);
+          lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]);
+          lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]);
+          lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]);
+          lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]);
+          lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]);
+          lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]);
+        }
+        {
+          const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
+          const __m256i k32_m28_m04 = pair256_set_epi32(-cospi_28_64, -cospi_4_64);
+          const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
+          const __m256i k32_m12_m20 = pair256_set_epi32(-cospi_12_64,
+                                                     -cospi_20_64);
+          const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
+          const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
+          u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]);
+          u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]);
+          u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]);
+          u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]);
+          u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]);
+          u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m04_p28);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m04_p28);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m04_p28);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m04_p28);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m28_m04);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m28_m04);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m28_m04);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m28_m04);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+          v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20);
+          v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20);
+          v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20);
+          v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_p12_p20);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_p12_p20);
+          v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20);
+          v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m04_p28);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m04_p28);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m04_p28);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m04_p28);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_p28_p04);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_p28_p04);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_p28_p04);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_p28_p04);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          lstep3[34] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          lstep3[35] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          lstep3[36] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          lstep3[37] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          lstep3[42] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          lstep3[43] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          lstep3[44] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          lstep3[45] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          lstep3[50] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          lstep3[51] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+        }
+        // stage 7
+        {
+          const __m256i k32_p30_p02 = pair256_set_epi32(cospi_30_64, cospi_2_64);
+          const __m256i k32_p14_p18 = pair256_set_epi32(cospi_14_64, cospi_18_64);
+          const __m256i k32_p22_p10 = pair256_set_epi32(cospi_22_64, cospi_10_64);
+          const __m256i k32_p06_p26 = pair256_set_epi32(cospi_6_64,  cospi_26_64);
+          const __m256i k32_m26_p06 = pair256_set_epi32(-cospi_26_64, cospi_6_64);
+          const __m256i k32_m10_p22 = pair256_set_epi32(-cospi_10_64, cospi_22_64);
+          const __m256i k32_m18_p14 = pair256_set_epi32(-cospi_18_64, cospi_14_64);
+          const __m256i k32_m02_p30 = pair256_set_epi32(-cospi_2_64, cospi_30_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
+          u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]);
+          u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]);
+          u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]);
+          u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]);
+          u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]);
+          u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p30_p02);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p30_p02);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p30_p02);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p30_p02);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p14_p18);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p14_p18);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p14_p18);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p14_p18);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p22_p10);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p22_p10);
+          v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10);
+          v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10);
+          v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26);
+          v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26);
+          v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26);
+          v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m10_p22);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m10_p22);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m18_p14);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m18_p14);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m18_p14);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m18_p14);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_m02_p30);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_m02_p30);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_m02_p30);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_m02_p30);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+          v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+          v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+          v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+          v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+          v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+          v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+          v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+          v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+          v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+          v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+          v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+          v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+          v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+          v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+          v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+          u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm256_sub_epi32(u[10], v[10]);
+          u[11] = _mm256_sub_epi32(u[11], v[11]);
+          u[12] = _mm256_sub_epi32(u[12], v[12]);
+          u[13] = _mm256_sub_epi32(u[13], v[13]);
+          u[14] = _mm256_sub_epi32(u[14], v[14]);
+          u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], K32One);
+          v[ 1] = _mm256_add_epi32(u[ 1], K32One);
+          v[ 2] = _mm256_add_epi32(u[ 2], K32One);
+          v[ 3] = _mm256_add_epi32(u[ 3], K32One);
+          v[ 4] = _mm256_add_epi32(u[ 4], K32One);
+          v[ 5] = _mm256_add_epi32(u[ 5], K32One);
+          v[ 6] = _mm256_add_epi32(u[ 6], K32One);
+          v[ 7] = _mm256_add_epi32(u[ 7], K32One);
+          v[ 8] = _mm256_add_epi32(u[ 8], K32One);
+          v[ 9] = _mm256_add_epi32(u[ 9], K32One);
+          v[10] = _mm256_add_epi32(u[10], K32One);
+          v[11] = _mm256_add_epi32(u[11], K32One);
+          v[12] = _mm256_add_epi32(u[12], K32One);
+          v[13] = _mm256_add_epi32(u[13], K32One);
+          v[14] = _mm256_add_epi32(u[14], K32One);
+          v[15] = _mm256_add_epi32(u[15], K32One);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], 2);
+          u[ 1] = _mm256_srai_epi32(v[ 1], 2);
+          u[ 2] = _mm256_srai_epi32(v[ 2], 2);
+          u[ 3] = _mm256_srai_epi32(v[ 3], 2);
+          u[ 4] = _mm256_srai_epi32(v[ 4], 2);
+          u[ 5] = _mm256_srai_epi32(v[ 5], 2);
+          u[ 6] = _mm256_srai_epi32(v[ 6], 2);
+          u[ 7] = _mm256_srai_epi32(v[ 7], 2);
+          u[ 8] = _mm256_srai_epi32(v[ 8], 2);
+          u[ 9] = _mm256_srai_epi32(v[ 9], 2);
+          u[10] = _mm256_srai_epi32(v[10], 2);
+          u[11] = _mm256_srai_epi32(v[11], 2);
+          u[12] = _mm256_srai_epi32(v[12], 2);
+          u[13] = _mm256_srai_epi32(v[13], 2);
+          u[14] = _mm256_srai_epi32(v[14], 2);
+          u[15] = _mm256_srai_epi32(v[15], 2);
+
+          out[ 2] = _mm256_packs_epi32(u[0], u[1]);
+          out[18] = _mm256_packs_epi32(u[2], u[3]);
+          out[10] = _mm256_packs_epi32(u[4], u[5]);
+          out[26] = _mm256_packs_epi32(u[6], u[7]);
+          out[ 6] = _mm256_packs_epi32(u[8], u[9]);
+          out[22] = _mm256_packs_epi32(u[10], u[11]);
+          out[14] = _mm256_packs_epi32(u[12], u[13]);
+          out[30] = _mm256_packs_epi32(u[14], u[15]);
+        }
+        {
+          lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]);
+          lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]);
+          lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]);
+          lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]);
+          lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]);
+          lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]);
+          lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]);
+          lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]);
+          lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]);
+          lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]);
+          lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]);
+          lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]);
+          lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]);
+          lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]);
+          lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]);
+          lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]);
+          lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]);
+          lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]);
+          lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]);
+          lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]);
+          lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]);
+          lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]);
+          lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]);
+          lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]);
+          lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]);
+          lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]);
+          lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]);
+          lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]);
+          lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]);
+          lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]);
+          lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]);
+          lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]);
+        }
+        // stage 8
+        {
+          const __m256i k32_p31_p01 = pair256_set_epi32(cospi_31_64, cospi_1_64);
+          const __m256i k32_p15_p17 = pair256_set_epi32(cospi_15_64, cospi_17_64);
+          const __m256i k32_p23_p09 = pair256_set_epi32(cospi_23_64, cospi_9_64);
+          const __m256i k32_p07_p25 = pair256_set_epi32(cospi_7_64, cospi_25_64);
+          const __m256i k32_m25_p07 = pair256_set_epi32(-cospi_25_64, cospi_7_64);
+          const __m256i k32_m09_p23 = pair256_set_epi32(-cospi_9_64, cospi_23_64);
+          const __m256i k32_m17_p15 = pair256_set_epi32(-cospi_17_64, cospi_15_64);
+          const __m256i k32_m01_p31 = pair256_set_epi32(-cospi_1_64, cospi_31_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
+          u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]);
+          u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]);
+          u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]);
+          u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]);
+          u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]);
+          u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p31_p01);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p31_p01);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p31_p01);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p31_p01);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p15_p17);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p15_p17);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p15_p17);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p15_p17);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p23_p09);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p23_p09);
+          v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09);
+          v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09);
+          v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25);
+          v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25);
+          v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25);
+          v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m09_p23);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m09_p23);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m17_p15);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m17_p15);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m17_p15);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m17_p15);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_m01_p31);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_m01_p31);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_m01_p31);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_m01_p31);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+          v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+          v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+          v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+          v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+          v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+          v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+          v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+          v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+          v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+          v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+          v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+          v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+          v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+          v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+          v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+          u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm256_sub_epi32(u[10], v[10]);
+          u[11] = _mm256_sub_epi32(u[11], v[11]);
+          u[12] = _mm256_sub_epi32(u[12], v[12]);
+          u[13] = _mm256_sub_epi32(u[13], v[13]);
+          u[14] = _mm256_sub_epi32(u[14], v[14]);
+          u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], K32One);
+          v[1] = _mm256_add_epi32(u[1], K32One);
+          v[2] = _mm256_add_epi32(u[2], K32One);
+          v[3] = _mm256_add_epi32(u[3], K32One);
+          v[4] = _mm256_add_epi32(u[4], K32One);
+          v[5] = _mm256_add_epi32(u[5], K32One);
+          v[6] = _mm256_add_epi32(u[6], K32One);
+          v[7] = _mm256_add_epi32(u[7], K32One);
+          v[8] = _mm256_add_epi32(u[8], K32One);
+          v[9] = _mm256_add_epi32(u[9], K32One);
+          v[10] = _mm256_add_epi32(u[10], K32One);
+          v[11] = _mm256_add_epi32(u[11], K32One);
+          v[12] = _mm256_add_epi32(u[12], K32One);
+          v[13] = _mm256_add_epi32(u[13], K32One);
+          v[14] = _mm256_add_epi32(u[14], K32One);
+          v[15] = _mm256_add_epi32(u[15], K32One);
+
+          u[0] = _mm256_srai_epi32(v[0], 2);
+          u[1] = _mm256_srai_epi32(v[1], 2);
+          u[2] = _mm256_srai_epi32(v[2], 2);
+          u[3] = _mm256_srai_epi32(v[3], 2);
+          u[4] = _mm256_srai_epi32(v[4], 2);
+          u[5] = _mm256_srai_epi32(v[5], 2);
+          u[6] = _mm256_srai_epi32(v[6], 2);
+          u[7] = _mm256_srai_epi32(v[7], 2);
+          u[8] = _mm256_srai_epi32(v[8], 2);
+          u[9] = _mm256_srai_epi32(v[9], 2);
+          u[10] = _mm256_srai_epi32(v[10], 2);
+          u[11] = _mm256_srai_epi32(v[11], 2);
+          u[12] = _mm256_srai_epi32(v[12], 2);
+          u[13] = _mm256_srai_epi32(v[13], 2);
+          u[14] = _mm256_srai_epi32(v[14], 2);
+          u[15] = _mm256_srai_epi32(v[15], 2);
+
+          out[ 1] = _mm256_packs_epi32(u[0], u[1]);
+          out[17] = _mm256_packs_epi32(u[2], u[3]);
+          out[ 9] = _mm256_packs_epi32(u[4], u[5]);
+          out[25] = _mm256_packs_epi32(u[6], u[7]);
+          out[ 7] = _mm256_packs_epi32(u[8], u[9]);
+          out[23] = _mm256_packs_epi32(u[10], u[11]);
+          out[15] = _mm256_packs_epi32(u[12], u[13]);
+          out[31] = _mm256_packs_epi32(u[14], u[15]);
+        }
+        {
+          const __m256i k32_p27_p05 = pair256_set_epi32(cospi_27_64, cospi_5_64);
+          const __m256i k32_p11_p21 = pair256_set_epi32(cospi_11_64, cospi_21_64);
+          const __m256i k32_p19_p13 = pair256_set_epi32(cospi_19_64, cospi_13_64);
+          const __m256i k32_p03_p29 = pair256_set_epi32(cospi_3_64, cospi_29_64);
+          const __m256i k32_m29_p03 = pair256_set_epi32(-cospi_29_64, cospi_3_64);
+          const __m256i k32_m13_p19 = pair256_set_epi32(-cospi_13_64, cospi_19_64);
+          const __m256i k32_m21_p11 = pair256_set_epi32(-cospi_21_64, cospi_11_64);
+          const __m256i k32_m05_p27 = pair256_set_epi32(-cospi_5_64, cospi_27_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
+          u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]);
+          u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]);
+          u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]);
+          u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]);
+          u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]);
+          u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p27_p05);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p27_p05);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p27_p05);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p27_p05);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p11_p21);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p11_p21);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p11_p21);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p11_p21);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p19_p13);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p19_p13);
+          v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13);
+          v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13);
+          v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29);
+          v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29);
+          v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29);
+          v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m13_p19);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m13_p19);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m21_p11);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m21_p11);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m21_p11);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m21_p11);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_m05_p27);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_m05_p27);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_m05_p27);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_m05_p27);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+          v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+          v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+          v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+          v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+          v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+          v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+          v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+          v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+          v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+          v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+          v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+          v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+          v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+          v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+          v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+          u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm256_sub_epi32(u[10], v[10]);
+          u[11] = _mm256_sub_epi32(u[11], v[11]);
+          u[12] = _mm256_sub_epi32(u[12], v[12]);
+          u[13] = _mm256_sub_epi32(u[13], v[13]);
+          u[14] = _mm256_sub_epi32(u[14], v[14]);
+          u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], K32One);
+          v[1] = _mm256_add_epi32(u[1], K32One);
+          v[2] = _mm256_add_epi32(u[2], K32One);
+          v[3] = _mm256_add_epi32(u[3], K32One);
+          v[4] = _mm256_add_epi32(u[4], K32One);
+          v[5] = _mm256_add_epi32(u[5], K32One);
+          v[6] = _mm256_add_epi32(u[6], K32One);
+          v[7] = _mm256_add_epi32(u[7], K32One);
+          v[8] = _mm256_add_epi32(u[8], K32One);
+          v[9] = _mm256_add_epi32(u[9], K32One);
+          v[10] = _mm256_add_epi32(u[10], K32One);
+          v[11] = _mm256_add_epi32(u[11], K32One);
+          v[12] = _mm256_add_epi32(u[12], K32One);
+          v[13] = _mm256_add_epi32(u[13], K32One);
+          v[14] = _mm256_add_epi32(u[14], K32One);
+          v[15] = _mm256_add_epi32(u[15], K32One);
+
+          u[0] = _mm256_srai_epi32(v[0], 2);
+          u[1] = _mm256_srai_epi32(v[1], 2);
+          u[2] = _mm256_srai_epi32(v[2], 2);
+          u[3] = _mm256_srai_epi32(v[3], 2);
+          u[4] = _mm256_srai_epi32(v[4], 2);
+          u[5] = _mm256_srai_epi32(v[5], 2);
+          u[6] = _mm256_srai_epi32(v[6], 2);
+          u[7] = _mm256_srai_epi32(v[7], 2);
+          u[8] = _mm256_srai_epi32(v[8], 2);
+          u[9] = _mm256_srai_epi32(v[9], 2);
+          u[10] = _mm256_srai_epi32(v[10], 2);
+          u[11] = _mm256_srai_epi32(v[11], 2);
+          u[12] = _mm256_srai_epi32(v[12], 2);
+          u[13] = _mm256_srai_epi32(v[13], 2);
+          u[14] = _mm256_srai_epi32(v[14], 2);
+          u[15] = _mm256_srai_epi32(v[15], 2);
+
+          out[ 5] = _mm256_packs_epi32(u[0], u[1]);
+          out[21] = _mm256_packs_epi32(u[2], u[3]);
+          out[13] = _mm256_packs_epi32(u[4], u[5]);
+          out[29] = _mm256_packs_epi32(u[6], u[7]);
+          out[ 3] = _mm256_packs_epi32(u[8], u[9]);
+          out[19] = _mm256_packs_epi32(u[10], u[11]);
+          out[11] = _mm256_packs_epi32(u[12], u[13]);
+          out[27] = _mm256_packs_epi32(u[14], u[15]);
+        }
+      }
+#endif
+      // Transpose the results, do it as four 8x8 transposes.
+      {
+        int transpose_block;
+        int16_t *output_currStep,*output_nextStep;
+        if (0 == pass){
+                 output_currStep = &intermediate[column_start * 32];
+                 output_nextStep = &intermediate[(column_start + 8) * 32];
+        } else{
+                 output_currStep = &output_org[column_start * 32];
+                 output_nextStep = &output_org[(column_start + 8) * 32];
+        }
+        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+          __m256i *this_out = &out[8 * transpose_block];
+          // 00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15
+          // 20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
+          // 40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55
+          // 60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75
+          // 80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
+          // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
+          // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
+          // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
+          const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]);
+          const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]);
+          const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]);
+          const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]);
+          const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]);
+          const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]);
+          const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]);
+          const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]);
+          // 00  20  01  21  02  22  03  23  08  28  09  29  10  30  11  31
+          // 40  60  41  61  42  62  43  63  48  68  49  69  50  70  51  71
+          // 04  24  05  25  06  26  07  27  12  32  13  33  14  34  15  35
+          // 44  64  45  65  46  66  47  67  52  72  53  73  54  74  55  75
+          // 80  100 81  101 82  102 83  103 88  108 89  109 90  110 91  111
+          // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151
+          // 84  104 85  105 86  106 87  107 92  112 93  113 94  114 95  115
+          // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155
+
+          const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
+          const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
+          const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
+          const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
+          const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
+          const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
+          const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
+          const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
+          // 00 20  40  60  01 21  41  61  08 28  48  68  09 29  49  69
+          // 04 24  44  64  05 25  45  65  12 32  52  72  13 33  53  73
+          // 02 22  42  62  03 23  43  63  10 30  50  70  11 31  51  71
+          // 06 26  46  66  07 27  47  67  14 34  54  74  15 35  55  75
+          // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149
+          // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153
+          // 82 102 122 142 83 103 123 143 90 110 130 150 91 111 131 151
+          // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155
+          __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+          __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+          __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+          __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+          __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+          __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+          __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+          __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+          // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148
+          // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149
+          // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150
+          // 03 23 43 63 83 103 123 143 11 31 51 71 91 111 131 151
+          // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152
+          // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153
+          // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154
+          // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155
+          if (0 == pass) {
+            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+            // TODO(cd): see quality impact of only doing
+            //           output[j] = (output[j] + 1) >> 2;
+            //           which would remove the code between here ...
+            __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero);
+            __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero);
+            __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero);
+            __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero);
+            __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero);
+            __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero);
+            __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero);
+            __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero);
+            tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0);
+            tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0);
+            tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0);
+            tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0);
+            tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0);
+            tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0);
+            tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0);
+            tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0);
+            //           ... and here.
+            //           PS: also change code in vp9/encoder/vp9_dct.c
+            tr2_0 = _mm256_add_epi16(tr2_0, kOne);
+            tr2_1 = _mm256_add_epi16(tr2_1, kOne);
+            tr2_2 = _mm256_add_epi16(tr2_2, kOne);
+            tr2_3 = _mm256_add_epi16(tr2_3, kOne);
+            tr2_4 = _mm256_add_epi16(tr2_4, kOne);
+            tr2_5 = _mm256_add_epi16(tr2_5, kOne);
+            tr2_6 = _mm256_add_epi16(tr2_6, kOne);
+            tr2_7 = _mm256_add_epi16(tr2_7, kOne);
+            tr2_0 = _mm256_srai_epi16(tr2_0, 2);
+            tr2_1 = _mm256_srai_epi16(tr2_1, 2);
+            tr2_2 = _mm256_srai_epi16(tr2_2, 2);
+            tr2_3 = _mm256_srai_epi16(tr2_3, 2);
+            tr2_4 = _mm256_srai_epi16(tr2_4, 2);
+            tr2_5 = _mm256_srai_epi16(tr2_5, 2);
+            tr2_6 = _mm256_srai_epi16(tr2_6, 2);
+            tr2_7 = _mm256_srai_epi16(tr2_7, 2);
+          }
+          // Note: even though all these stores are aligned, using the aligned
+          //       intrinsic makes the code slightly slower.
+          _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), _mm256_castsi256_si128(tr2_0));
+          _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), _mm256_castsi256_si128(tr2_1));
+          _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), _mm256_castsi256_si128(tr2_2));
+          _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), _mm256_castsi256_si128(tr2_3));
+          _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), _mm256_castsi256_si128(tr2_4));
+          _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), _mm256_castsi256_si128(tr2_5));
+          _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), _mm256_castsi256_si128(tr2_6));
+          _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), _mm256_castsi256_si128(tr2_7));
+
+          _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), _mm256_extractf128_si256(tr2_0,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), _mm256_extractf128_si256(tr2_1,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), _mm256_extractf128_si256(tr2_2,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), _mm256_extractf128_si256(tr2_3,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), _mm256_extractf128_si256(tr2_4,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), _mm256_extractf128_si256(tr2_5,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1));
+          // Process next 8x8
+          output_currStep += 8;
+          output_nextStep += 8;
+        }
+      }
+    }
+  }
+}  // NOLINT
diff --git a/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
new file mode 100644
index 0000000000..b85ae103fa
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
@@ -0,0 +1,3153 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "vpx_dsp/fwd_txfm.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+// TODO(jingning) The high bit-depth version needs re-work for performance.
+// The current SSE2 implementation also causes cross reference to the static
+// functions in the C implementation file.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+#if FDCT32x32_HIGH_PRECISION
+void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
+    // Scalar row pass: column `row` of intermediate becomes row `row` of out.
+    int row, k;
+    for (row = 0; row != 32; row++) {
+      tran_high_t src[32], dst[32];
+      for (k = 0; k != 32; k++) src[k] = intermediate[k * 32 + row];
+      vpx_fdct32(src, dst, 0);
+      // Each coefficient x is stored as (x + 1 + (x < 0)) >> 2.
+      for (k = 0; k != 32; k++)
+        out[row * 32 + k] = (tran_low_t)((dst[k] + 1 + (dst[k] < 0)) >> 2);
+    }
+}
+  #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c
+  #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c
+#else
+void vpx_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
+    // Scalar row pass: column `row` of intermediate becomes row `row` of out.
+    int row, k;
+    for (row = 0; row != 32; row++) {
+      tran_high_t src[32], dst[32];
+      for (k = 0; k != 32; k++) src[k] = intermediate[k * 32 + row];
+      vpx_fdct32(src, dst, 1);
+      // Unlike the high-precision variant, results are stored unshifted.
+      for (k = 0; k != 32; k++) out[row * 32 + k] = (tran_low_t)dst[k];
+    }
+}
+  #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c
+  #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c
+#endif  // FDCT32x32_HIGH_PRECISION
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif  // DCT_HIGH_BIT_DEPTH
+
+
+void FDCT32x32_2D(const int16_t *input,
+                  tran_low_t *output_org, int stride) {
+  // Calculate pre-multiplied strides
+  const int str1 = stride;
+  const int str2 = 2 * stride;
+  const int str3 = 2 * stride + str1;
+  // We need an intermediate buffer between passes.
+  DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64,   cospi_24_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64,  cospi_8_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64,  cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64,  cospi_12_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64,   cospi_28_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64,  cospi_4_64);
+  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64,  cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64,  cospi_18_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64,  cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64,   cospi_26_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64,  cospi_6_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64,  cospi_22_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64,  cospi_14_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64,   cospi_30_64);
+  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64,  cospi_1_64);
+  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64,  cospi_17_64);
+  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64,  cospi_9_64);
+  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64,   cospi_25_64);
+  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64,  cospi_7_64);
+  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64,   cospi_23_64);
+  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64,  cospi_15_64);
+  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64,   cospi_31_64);
+  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64,  cospi_5_64);
+  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64,  cospi_21_64);
+  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64,  cospi_13_64);
+  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64,   cospi_29_64);
+  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64,  cospi_3_64);
+  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64,  cospi_19_64);
+  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64,  cospi_11_64);
+  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64,   cospi_27_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i kOne  = _mm_set1_epi16(1);
+  // Do the two transform/transpose passes
+  int pass;
+#if DCT_HIGH_BIT_DEPTH
+  int overflow;
+#endif
+  for (pass = 0; pass < 2; ++pass) {
+    // We process eight columns (transposed rows in second pass) at a time.
+    int column_start;
+    for (column_start = 0; column_start < 32; column_start += 8) {
+      __m128i step1[32];
+      __m128i step2[32];
+      __m128i step3[32];
+      __m128i out[32];
+      // Stage 1
+      // Note: even though all the loads below are aligned, using the aligned
+      //       intrinsic makes the code slightly slower.
+      if (0 == pass) {
+        const int16_t *in  = &input[column_start];
+        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
+        // Note: the next four blocks could be in a loop. That would help the
+        //       instruction cache but is actually slower.
+        {
+          const int16_t *ina =  in +  0 * str1;
+          const int16_t *inb =  in + 31 * str1;
+          __m128i *step1a = &step1[ 0];
+          __m128i *step1b = &step1[31];
+          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
+          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
+          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
+          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
+          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
+          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
+          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
+          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
+          step1a[ 0] = _mm_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm_add_epi16(ina3, inb3);
+          step1b[-3] = _mm_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in +  4 * str1;
+          const int16_t *inb =  in + 27 * str1;
+          __m128i *step1a = &step1[ 4];
+          __m128i *step1b = &step1[27];
+          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
+          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
+          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
+          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
+          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
+          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
+          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
+          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
+          step1a[ 0] = _mm_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm_add_epi16(ina3, inb3);
+          step1b[-3] = _mm_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in +  8 * str1;
+          const int16_t *inb =  in + 23 * str1;
+          __m128i *step1a = &step1[ 8];
+          __m128i *step1b = &step1[23];
+          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
+          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
+          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
+          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
+          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
+          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
+          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
+          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
+          step1a[ 0] = _mm_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm_add_epi16(ina3, inb3);
+          step1b[-3] = _mm_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in + 12 * str1;
+          const int16_t *inb =  in + 19 * str1;
+          __m128i *step1a = &step1[12];
+          __m128i *step1b = &step1[19];
+          const __m128i ina0  = _mm_loadu_si128((const __m128i *)(ina));
+          const __m128i ina1  = _mm_loadu_si128((const __m128i *)(ina + str1));
+          const __m128i ina2  = _mm_loadu_si128((const __m128i *)(ina + str2));
+          const __m128i ina3  = _mm_loadu_si128((const __m128i *)(ina + str3));
+          const __m128i inb3  = _mm_loadu_si128((const __m128i *)(inb - str3));
+          const __m128i inb2  = _mm_loadu_si128((const __m128i *)(inb - str2));
+          const __m128i inb1  = _mm_loadu_si128((const __m128i *)(inb - str1));
+          const __m128i inb0  = _mm_loadu_si128((const __m128i *)(inb));
+          step1a[ 0] = _mm_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm_add_epi16(ina3, inb3);
+          step1b[-3] = _mm_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
+        }
+      } else {
+        int16_t *in = &intermediate[column_start];
+        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
+        // Note: using the same approach as above to have common offset is
+        //       counter-productive as all offsets can be calculated at compile
+        //       time.
+        // Note: the next four blocks could be in a loop. That would help the
+        //       instruction cache but is actually slower.
+        {
+          __m128i in00  = _mm_loadu_si128((const __m128i *)(in +  0 * 32));
+          __m128i in01  = _mm_loadu_si128((const __m128i *)(in +  1 * 32));
+          __m128i in02  = _mm_loadu_si128((const __m128i *)(in +  2 * 32));
+          __m128i in03  = _mm_loadu_si128((const __m128i *)(in +  3 * 32));
+          __m128i in28  = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
+          __m128i in29  = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
+          __m128i in30  = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
+          __m128i in31  = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
+          step1[0] = ADD_EPI16(in00, in31);
+          step1[1] = ADD_EPI16(in01, in30);
+          step1[2] = ADD_EPI16(in02, in29);
+          step1[3] = ADD_EPI16(in03, in28);
+          step1[28] = SUB_EPI16(in03, in28);
+          step1[29] = SUB_EPI16(in02, in29);
+          step1[30] = SUB_EPI16(in01, in30);
+          step1[31] = SUB_EPI16(in00, in31);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
+                                             &step1[3], &step1[28], &step1[29],
+                                             &step1[30], &step1[31]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          __m128i in04  = _mm_loadu_si128((const __m128i *)(in +  4 * 32));
+          __m128i in05  = _mm_loadu_si128((const __m128i *)(in +  5 * 32));
+          __m128i in06  = _mm_loadu_si128((const __m128i *)(in +  6 * 32));
+          __m128i in07  = _mm_loadu_si128((const __m128i *)(in +  7 * 32));
+          __m128i in24  = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
+          __m128i in25  = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
+          __m128i in26  = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
+          __m128i in27  = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
+          step1[4] = ADD_EPI16(in04, in27);
+          step1[5] = ADD_EPI16(in05, in26);
+          step1[6] = ADD_EPI16(in06, in25);
+          step1[7] = ADD_EPI16(in07, in24);
+          step1[24] = SUB_EPI16(in07, in24);
+          step1[25] = SUB_EPI16(in06, in25);
+          step1[26] = SUB_EPI16(in05, in26);
+          step1[27] = SUB_EPI16(in04, in27);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
+                                             &step1[7], &step1[24], &step1[25],
+                                             &step1[26], &step1[27]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          __m128i in08  = _mm_loadu_si128((const __m128i *)(in +  8 * 32));
+          __m128i in09  = _mm_loadu_si128((const __m128i *)(in +  9 * 32));
+          __m128i in10  = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
+          __m128i in11  = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
+          __m128i in20  = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
+          __m128i in21  = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
+          __m128i in22  = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
+          __m128i in23  = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
+          step1[8] = ADD_EPI16(in08, in23);
+          step1[9] = ADD_EPI16(in09, in22);
+          step1[10] = ADD_EPI16(in10, in21);
+          step1[11] = ADD_EPI16(in11, in20);
+          step1[20] = SUB_EPI16(in11, in20);
+          step1[21] = SUB_EPI16(in10, in21);
+          step1[22] = SUB_EPI16(in09, in22);
+          step1[23] = SUB_EPI16(in08, in23);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
+                                             &step1[11], &step1[20], &step1[21],
+                                             &step1[22], &step1[23]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          __m128i in12  = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
+          __m128i in13  = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
+          __m128i in14  = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
+          __m128i in15  = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
+          __m128i in16  = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
+          __m128i in17  = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
+          __m128i in18  = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
+          __m128i in19  = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
+          step1[12] = ADD_EPI16(in12, in19);
+          step1[13] = ADD_EPI16(in13, in18);
+          step1[14] = ADD_EPI16(in14, in17);
+          step1[15] = ADD_EPI16(in15, in16);
+          step1[16] = SUB_EPI16(in15, in16);
+          step1[17] = SUB_EPI16(in14, in17);
+          step1[18] = SUB_EPI16(in13, in18);
+          step1[19] = SUB_EPI16(in12, in19);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
+                                             &step1[15], &step1[16], &step1[17],
+                                             &step1[18], &step1[19]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+      }
+      // Stage 2
+      {
+        step2[0] = ADD_EPI16(step1[0], step1[15]);
+        step2[1] = ADD_EPI16(step1[1], step1[14]);
+        step2[2] = ADD_EPI16(step1[2], step1[13]);
+        step2[3] = ADD_EPI16(step1[3], step1[12]);
+        step2[4] = ADD_EPI16(step1[4], step1[11]);
+        step2[5] = ADD_EPI16(step1[5], step1[10]);
+        step2[6] = ADD_EPI16(step1[6], step1[ 9]);
+        step2[7] = ADD_EPI16(step1[7], step1[ 8]);
+        step2[8] = SUB_EPI16(step1[7], step1[ 8]);
+        step2[9] = SUB_EPI16(step1[6], step1[ 9]);
+        step2[10] = SUB_EPI16(step1[5], step1[10]);
+        step2[11] = SUB_EPI16(step1[4], step1[11]);
+        step2[12] = SUB_EPI16(step1[3], step1[12]);
+        step2[13] = SUB_EPI16(step1[2], step1[13]);
+        step2[14] = SUB_EPI16(step1[1], step1[14]);
+        step2[15] = SUB_EPI16(step1[0], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step2[0], &step2[1], &step2[2], &step2[3],
+            &step2[4], &step2[5], &step2[6], &step2[7],
+            &step2[8], &step2[9], &step2[10], &step2[11],
+            &step2[12], &step2[13], &step2[14], &step2[15]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
+        const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
+        const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
+        const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
+        const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
+        const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
+        const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
+        const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
+        const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
+        const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
+        const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
+        const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
+        const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
+        const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
+        const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
+        const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
+        const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
+        const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
+        const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
+        const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
+        const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
+        const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
+        const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
+        const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
+        const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
+        const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
+        const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
+        const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
+        const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
+        const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
+        const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
+        const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
+        const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
+        const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
+        const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
+        const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
+        const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
+        const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
+        const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
+        // Combine
+        step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
+        step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
+        step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
+        step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
+        step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
+        step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
+        step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
+        step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
+                                           &step2[23], &step2[24], &step2[25],
+                                           &step2[26], &step2[27]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+
+#if !FDCT32x32_HIGH_PRECISION
+      // Damp the magnitude by a factor of 4 (rounded shift right by 2) so that
+      // the intermediate values stay within the range of 16 bits.
+      if (1 == pass) {
+        __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero);
+        __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero);
+        __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero);
+        __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero);
+        __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero);
+        __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero);
+        __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero);
+        __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero);
+        __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
+        __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
+        __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
+        __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
+        __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
+        __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
+        __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
+        __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
+        __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
+        __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
+        __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
+        __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
+        __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
+        __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
+        __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
+        __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
+        __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
+        __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
+        __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
+        __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
+        __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
+        __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
+        __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
+        __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
+
+        step2[0] = SUB_EPI16(step2[ 0], s3_00_0);
+        step2[1] = SUB_EPI16(step2[ 1], s3_01_0);
+        step2[2] = SUB_EPI16(step2[ 2], s3_02_0);
+        step2[3] = SUB_EPI16(step2[ 3], s3_03_0);
+        step2[4] = SUB_EPI16(step2[ 4], s3_04_0);
+        step2[5] = SUB_EPI16(step2[ 5], s3_05_0);
+        step2[6] = SUB_EPI16(step2[ 6], s3_06_0);
+        step2[7] = SUB_EPI16(step2[ 7], s3_07_0);
+        step2[8] = SUB_EPI16(step2[ 8], s2_08_0);
+        step2[9] = SUB_EPI16(step2[ 9], s2_09_0);
+        step2[10] = SUB_EPI16(step2[10], s3_10_0);
+        step2[11] = SUB_EPI16(step2[11], s3_11_0);
+        step2[12] = SUB_EPI16(step2[12], s3_12_0);
+        step2[13] = SUB_EPI16(step2[13], s3_13_0);
+        step2[14] = SUB_EPI16(step2[14], s2_14_0);
+        step2[15] = SUB_EPI16(step2[15], s2_15_0);
+        step1[16] = SUB_EPI16(step1[16], s3_16_0);
+        step1[17] = SUB_EPI16(step1[17], s3_17_0);
+        step1[18] = SUB_EPI16(step1[18], s3_18_0);
+        step1[19] = SUB_EPI16(step1[19], s3_19_0);
+        step2[20] = SUB_EPI16(step2[20], s3_20_0);
+        step2[21] = SUB_EPI16(step2[21], s3_21_0);
+        step2[22] = SUB_EPI16(step2[22], s3_22_0);
+        step2[23] = SUB_EPI16(step2[23], s3_23_0);
+        step2[24] = SUB_EPI16(step2[24], s3_24_0);
+        step2[25] = SUB_EPI16(step2[25], s3_25_0);
+        step2[26] = SUB_EPI16(step2[26], s3_26_0);
+        step2[27] = SUB_EPI16(step2[27], s3_27_0);
+        step1[28] = SUB_EPI16(step1[28], s3_28_0);
+        step1[29] = SUB_EPI16(step1[29], s3_29_0);
+        step1[30] = SUB_EPI16(step1[30], s3_30_0);
+        step1[31] = SUB_EPI16(step1[31], s3_31_0);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x32(
+            &step2[0], &step2[1], &step2[2], &step2[3],
+            &step2[4], &step2[5], &step2[6], &step2[7],
+            &step2[8], &step2[9], &step2[10], &step2[11],
+            &step2[12], &step2[13], &step2[14], &step2[15],
+            &step1[16], &step1[17], &step1[18], &step1[19],
+            &step2[20], &step2[21], &step2[22], &step2[23],
+            &step2[24], &step2[25], &step2[26], &step2[27],
+            &step1[28], &step1[29], &step1[30], &step1[31]);
+        if (overflow) {
+          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+        step2[0] = _mm_add_epi16(step2[ 0], kOne);
+        step2[1] = _mm_add_epi16(step2[ 1], kOne);
+        step2[2] = _mm_add_epi16(step2[ 2], kOne);
+        step2[3] = _mm_add_epi16(step2[ 3], kOne);
+        step2[4] = _mm_add_epi16(step2[ 4], kOne);
+        step2[5] = _mm_add_epi16(step2[ 5], kOne);
+        step2[6] = _mm_add_epi16(step2[ 6], kOne);
+        step2[7] = _mm_add_epi16(step2[ 7], kOne);
+        step2[8] = _mm_add_epi16(step2[ 8], kOne);
+        step2[9] = _mm_add_epi16(step2[ 9], kOne);
+        step2[10] = _mm_add_epi16(step2[10], kOne);
+        step2[11] = _mm_add_epi16(step2[11], kOne);
+        step2[12] = _mm_add_epi16(step2[12], kOne);
+        step2[13] = _mm_add_epi16(step2[13], kOne);
+        step2[14] = _mm_add_epi16(step2[14], kOne);
+        step2[15] = _mm_add_epi16(step2[15], kOne);
+        step1[16] = _mm_add_epi16(step1[16], kOne);
+        step1[17] = _mm_add_epi16(step1[17], kOne);
+        step1[18] = _mm_add_epi16(step1[18], kOne);
+        step1[19] = _mm_add_epi16(step1[19], kOne);
+        step2[20] = _mm_add_epi16(step2[20], kOne);
+        step2[21] = _mm_add_epi16(step2[21], kOne);
+        step2[22] = _mm_add_epi16(step2[22], kOne);
+        step2[23] = _mm_add_epi16(step2[23], kOne);
+        step2[24] = _mm_add_epi16(step2[24], kOne);
+        step2[25] = _mm_add_epi16(step2[25], kOne);
+        step2[26] = _mm_add_epi16(step2[26], kOne);
+        step2[27] = _mm_add_epi16(step2[27], kOne);
+        step1[28] = _mm_add_epi16(step1[28], kOne);
+        step1[29] = _mm_add_epi16(step1[29], kOne);
+        step1[30] = _mm_add_epi16(step1[30], kOne);
+        step1[31] = _mm_add_epi16(step1[31], kOne);
+
+        step2[0] = _mm_srai_epi16(step2[ 0], 2);
+        step2[1] = _mm_srai_epi16(step2[ 1], 2);
+        step2[2] = _mm_srai_epi16(step2[ 2], 2);
+        step2[3] = _mm_srai_epi16(step2[ 3], 2);
+        step2[4] = _mm_srai_epi16(step2[ 4], 2);
+        step2[5] = _mm_srai_epi16(step2[ 5], 2);
+        step2[6] = _mm_srai_epi16(step2[ 6], 2);
+        step2[7] = _mm_srai_epi16(step2[ 7], 2);
+        step2[8] = _mm_srai_epi16(step2[ 8], 2);
+        step2[9] = _mm_srai_epi16(step2[ 9], 2);
+        step2[10] = _mm_srai_epi16(step2[10], 2);
+        step2[11] = _mm_srai_epi16(step2[11], 2);
+        step2[12] = _mm_srai_epi16(step2[12], 2);
+        step2[13] = _mm_srai_epi16(step2[13], 2);
+        step2[14] = _mm_srai_epi16(step2[14], 2);
+        step2[15] = _mm_srai_epi16(step2[15], 2);
+        step1[16] = _mm_srai_epi16(step1[16], 2);
+        step1[17] = _mm_srai_epi16(step1[17], 2);
+        step1[18] = _mm_srai_epi16(step1[18], 2);
+        step1[19] = _mm_srai_epi16(step1[19], 2);
+        step2[20] = _mm_srai_epi16(step2[20], 2);
+        step2[21] = _mm_srai_epi16(step2[21], 2);
+        step2[22] = _mm_srai_epi16(step2[22], 2);
+        step2[23] = _mm_srai_epi16(step2[23], 2);
+        step2[24] = _mm_srai_epi16(step2[24], 2);
+        step2[25] = _mm_srai_epi16(step2[25], 2);
+        step2[26] = _mm_srai_epi16(step2[26], 2);
+        step2[27] = _mm_srai_epi16(step2[27], 2);
+        step1[28] = _mm_srai_epi16(step1[28], 2);
+        step1[29] = _mm_srai_epi16(step1[29], 2);
+        step1[30] = _mm_srai_epi16(step1[30], 2);
+        step1[31] = _mm_srai_epi16(step1[31], 2);
+      }
+#endif  // !FDCT32x32_HIGH_PRECISION
+
+#if FDCT32x32_HIGH_PRECISION
+      if (pass == 0) {
+#endif
+      // Stage 3
+      {
+        step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
+        step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
+        step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
+        step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
+        step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
+        step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
+        step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
+        step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
+                                           &step3[3], &step3[4], &step3[5],
+                                           &step3[6], &step3[7]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+        const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+        const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+        const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+        const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+        const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+        const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+        const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+        const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+        const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+        const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+        const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+        const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+        const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+        const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+        const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+        const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+        const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+        const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+        // Combine
+        step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
+        step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
+        step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
+        step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&step3[10], &step3[11],
+                                           &step3[12], &step3[13]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        step3[16] = ADD_EPI16(step2[23], step1[16]);
+        step3[17] = ADD_EPI16(step2[22], step1[17]);
+        step3[18] = ADD_EPI16(step2[21], step1[18]);
+        step3[19] = ADD_EPI16(step2[20], step1[19]);
+        step3[20] = SUB_EPI16(step1[19], step2[20]);
+        step3[21] = SUB_EPI16(step1[18], step2[21]);
+        step3[22] = SUB_EPI16(step1[17], step2[22]);
+        step3[23] = SUB_EPI16(step1[16], step2[23]);
+        step3[24] = SUB_EPI16(step1[31], step2[24]);
+        step3[25] = SUB_EPI16(step1[30], step2[25]);
+        step3[26] = SUB_EPI16(step1[29], step2[26]);
+        step3[27] = SUB_EPI16(step1[28], step2[27]);
+        step3[28] = ADD_EPI16(step2[27], step1[28]);
+        step3[29] = ADD_EPI16(step2[26], step1[29]);
+        step3[30] = ADD_EPI16(step2[25], step1[30]);
+        step3[31] = ADD_EPI16(step2[24], step1[31]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step3[16], &step3[17], &step3[18], &step3[19],
+            &step3[20], &step3[21], &step3[22], &step3[23],
+            &step3[24], &step3[25], &step3[26], &step3[27],
+            &step3[28], &step3[29], &step3[30], &step3[31]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+
+      // Stage 4
+      {
+        step1[0] = ADD_EPI16(step3[ 3], step3[ 0]);
+        step1[1] = ADD_EPI16(step3[ 2], step3[ 1]);
+        step1[2] = SUB_EPI16(step3[ 1], step3[ 2]);
+        step1[3] = SUB_EPI16(step3[ 0], step3[ 3]);
+        step1[8] = ADD_EPI16(step3[11], step2[ 8]);
+        step1[9] = ADD_EPI16(step3[10], step2[ 9]);
+        step1[10] = SUB_EPI16(step2[ 9], step3[10]);
+        step1[11] = SUB_EPI16(step2[ 8], step3[11]);
+        step1[12] = SUB_EPI16(step2[15], step3[12]);
+        step1[13] = SUB_EPI16(step2[14], step3[13]);
+        step1[14] = ADD_EPI16(step3[13], step2[14]);
+        step1[15] = ADD_EPI16(step3[12], step2[15]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step1[0], &step1[1], &step1[2], &step1[3],
+            &step1[4], &step1[5], &step1[6], &step1[7],
+            &step1[8], &step1[9], &step1[10], &step1[11],
+            &step1[12], &step1[13], &step1[14], &step1[15]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
+        const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
+        const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
+        const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
+        const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
+        const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
+        const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
+        const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
+        const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+        // Combine
+        step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
+        step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
+        const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
+        const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
+        const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
+        const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
+        const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
+        const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
+        const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
+        const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
+        const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
+        const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
+        const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
+        const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
+        const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
+        const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
+        const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
+        const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
+        const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
+        const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
+        const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
+        const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
+        const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
+        const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
+        const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
+        // dct_const_round_shift
+        const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+        const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+        const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
+        const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
+        const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
+        const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
+        const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
+        const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
+        const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
+        const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
+        const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
+        const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
+        const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
+        const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
+        const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
+        const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
+        const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
+        const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+        // Combine
+        step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
+        step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
+        step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
+        step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
+        step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
+        step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
+        step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
+        step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
+                                           &step1[21], &step1[26], &step1[27],
+                                           &step1[28], &step1[29]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Stage 5: sum/difference pairs of step1[5..6] against step3[4] and step3[7], written to step2[4..7].
+      {  // NOTE(review): ADD_EPI16/SUB_EPI16 are defined outside this view; presumably 16-bit lane add/subtract -- confirm.
+        step2[4] = ADD_EPI16(step1[5], step3[4]);
+        step2[5] = SUB_EPI16(step3[4], step1[5]);
+        step2[6] = SUB_EPI16(step3[7], step1[6]);
+        step2[7] = ADD_EPI16(step1[6], step3[7]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&step2[4], &step2[5],
+                                           &step2[6], &step2[7]);  // helper defined elsewhere; presumably non-zero when a 16-bit lane overflowed
+        if (overflow) {  // hand this pass to HIGH_FDCT32x32_2D_C / HIGH_FDCT32x32_2D_ROWS_C and return early
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {  // Produce out[0], out[16], out[8], out[24] from the pairs (step1[0], step1[1]) and (step1[2], step1[3]).
+        const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);  // interleave the low four 16-bit lanes of the pair
+        const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);  // interleave the high four 16-bit lanes of the pair
+        const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
+        const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
+        const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);  // madd: multiply 16-bit lanes, add adjacent products into 32-bit lanes
+        const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);  // NOTE(review): k__cospi_* are defined outside this view; presumably packed cosine constant pairs
+        const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
+        const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
+        const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
+        const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
+        const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
+        const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
+        // dct_const_round_shift: add k__DCT_CONST_ROUNDING, then arithmetic shift right by DCT_CONST_BITS (32-bit lanes).
+        const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
+        const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
+        const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
+        const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
+        const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
+        const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
+        const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
+        const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
+        // Combine: pack the low-half and high-half 32-bit results into eight 16-bit lanes (signed saturation).
+        out[ 0] = _mm_packs_epi32(out_00_6, out_00_7);
+        out[16] = _mm_packs_epi32(out_16_6, out_16_7);
+        out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
+        out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&out[0], &out[16],
+                                           &out[8], &out[24]);  // helper defined elsewhere; presumably non-zero when a 16-bit lane overflowed
+        if (overflow) {  // hand this pass to HIGH_FDCT32x32_2D_C / HIGH_FDCT32x32_2D_ROWS_C and return early
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {  // Produce step2[9], step2[10], step2[13], step2[14] from the pairs (step1[9], step1[14]) and (step1[10], step1[13]).
+        const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);  // interleave the low four 16-bit lanes of the pair
+        const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]);  // interleave the high four 16-bit lanes of the pair
+        const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
+        const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
+        const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);  // madd: multiply 16-bit lanes, add adjacent products into 32-bit lanes
+        const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);  // NOTE(review): k__cospi_* are defined outside this view; presumably packed cosine constant pairs
+        const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
+        const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
+        const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);  // step2[13] reuses the (10, 13) interleave
+        const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
+        const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);  // step2[14] reuses the (9, 14) interleave
+        const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
+        // dct_const_round_shift: add k__DCT_CONST_ROUNDING, then arithmetic shift right by DCT_CONST_BITS (32-bit lanes).
+        const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+        const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+        const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
+        const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
+        const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
+        const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
+        const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
+        const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
+        const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
+        const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
+        // Combine: pack the low-half and high-half 32-bit results into eight 16-bit lanes (signed saturation).
+        step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7);
+        step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
+        step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
+        step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&step2[9], &step2[10],
+                                           &step2[13], &step2[14]);  // helper defined elsewhere; presumably non-zero when a 16-bit lane overflowed
+        if (overflow) {  // hand this pass to HIGH_FDCT32x32_2D_C / HIGH_FDCT32x32_2D_ROWS_C and return early
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {  // Fill step2[16..31] with sum/difference pairs of step1[18..21], step1[26..29] against step3[16,17,22..25,30,31].
+        step2[16] = ADD_EPI16(step1[19], step3[16]);  // NOTE(review): ADD_EPI16/SUB_EPI16 are defined outside this view; presumably 16-bit lane add/subtract
+        step2[17] = ADD_EPI16(step1[18], step3[17]);
+        step2[18] = SUB_EPI16(step3[17], step1[18]);
+        step2[19] = SUB_EPI16(step3[16], step1[19]);
+        step2[20] = SUB_EPI16(step3[23], step1[20]);
+        step2[21] = SUB_EPI16(step3[22], step1[21]);
+        step2[22] = ADD_EPI16(step1[21], step3[22]);
+        step2[23] = ADD_EPI16(step1[20], step3[23]);
+        step2[24] = ADD_EPI16(step1[27], step3[24]);
+        step2[25] = ADD_EPI16(step1[26], step3[25]);
+        step2[26] = SUB_EPI16(step3[25], step1[26]);
+        step2[27] = SUB_EPI16(step3[24], step1[27]);
+        step2[28] = SUB_EPI16(step3[31], step1[28]);
+        step2[29] = SUB_EPI16(step3[30], step1[29]);
+        step2[30] = ADD_EPI16(step1[29], step3[30]);
+        step2[31] = ADD_EPI16(step1[28], step3[31]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step2[16], &step2[17], &step2[18], &step2[19],
+            &step2[20], &step2[21], &step2[22], &step2[23],
+            &step2[24], &step2[25], &step2[26], &step2[27],
+            &step2[28], &step2[29], &step2[30], &step2[31]);  // helper defined elsewhere; presumably non-zero when a 16-bit lane overflowed
+        if (overflow) {  // hand this pass to HIGH_FDCT32x32_2D_C / HIGH_FDCT32x32_2D_ROWS_C and return early
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Stage 6: produce out[4], out[20], out[12], out[28] from the pairs (step2[4], step2[7]) and (step2[5], step2[6]).
+      {
+        const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);  // interleave the low four 16-bit lanes of the pair
+        const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);  // interleave the high four 16-bit lanes of the pair
+        const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+        const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+        const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);  // same operands as out_20_0
+        const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);  // same operands as out_20_1
+        const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);  // same operands as out_04_0
+        const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);  // same operands as out_04_1
+        const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);  // madd: multiply 16-bit lanes, add adjacent products into 32-bit lanes
+        const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);  // NOTE(review): k__cospi_* are defined outside this view; presumably packed cosine constant pairs
+        const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
+        const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
+        const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
+        const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
+        const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
+        const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
+        // dct_const_round_shift: add k__DCT_CONST_ROUNDING, then arithmetic shift right by DCT_CONST_BITS (32-bit lanes).
+        const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
+        const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
+        const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
+        const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
+        const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
+        const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
+        const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
+        const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
+        // Combine: pack the low-half and high-half 32-bit results into eight 16-bit lanes (signed saturation).
+        out[4] = _mm_packs_epi32(out_04_6, out_04_7);
+        out[20] = _mm_packs_epi32(out_20_6, out_20_7);
+        out[12] = _mm_packs_epi32(out_12_6, out_12_7);
+        out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&out[4], &out[20],
+                                           &out[12], &out[28]);  // helper defined elsewhere; presumably non-zero when a 16-bit lane overflowed
+        if (overflow) {  // hand this pass to HIGH_FDCT32x32_2D_C / HIGH_FDCT32x32_2D_ROWS_C and return early
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {  // Fill step3[8..15] with sum/difference pairs of step2[9,10,13,14] against step1[8,11,12,15].
+        step3[8] = ADD_EPI16(step2[ 9], step1[ 8]);  // NOTE(review): ADD_EPI16/SUB_EPI16 are defined outside this view; presumably 16-bit lane add/subtract
+        step3[9] = SUB_EPI16(step1[ 8], step2[ 9]);
+        step3[10] = SUB_EPI16(step1[11], step2[10]);
+        step3[11] = ADD_EPI16(step2[10], step1[11]);
+        step3[12] = ADD_EPI16(step2[13], step1[12]);
+        step3[13] = SUB_EPI16(step1[12], step2[13]);
+        step3[14] = SUB_EPI16(step1[15], step2[14]);
+        step3[15] = ADD_EPI16(step2[14], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
+                                           &step3[11], &step3[12], &step3[13],
+                                           &step3[14], &step3[15]);  // helper defined elsewhere; presumably non-zero when a 16-bit lane overflowed
+        if (overflow) {  // hand this pass to HIGH_FDCT32x32_2D_C / HIGH_FDCT32x32_2D_ROWS_C and return early
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {  // Produce step3[17,18,21,22,25,26,29,30] from the step2 pairs (17,30), (18,29), (21,26), (22,25).
+        const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);  // interleave the low four 16-bit lanes of the pair
+        const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);  // interleave the high four 16-bit lanes of the pair
+        const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
+        const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
+        const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
+        const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
+        const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
+        const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
+        const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);  // madd: multiply 16-bit lanes, add adjacent products into 32-bit lanes
+        const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);  // NOTE(review): k__cospi_* are defined outside this view; presumably packed cosine constant pairs
+        const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
+        const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
+        const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
+        const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
+        const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
+        const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
+        const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);  // step3[25] reuses the (22, 25) interleave
+        const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
+        const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);  // step3[26] reuses the (21, 26) interleave
+        const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
+        const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);  // step3[29] reuses the (18, 29) interleave
+        const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
+        const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);  // step3[30] reuses the (17, 30) interleave
+        const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
+        // dct_const_round_shift: add k__DCT_CONST_ROUNDING, then arithmetic shift right by DCT_CONST_BITS (32-bit lanes).
+        const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
+        const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
+        const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
+        const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
+        const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
+        const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
+        const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
+        const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
+        const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);  // same round-and-shift sequence for the 25/26/29/30 results
+        const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+        const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+        const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
+        const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
+        const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
+        const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
+        const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
+        const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
+        const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
+        const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
+        // Combine (step3[17], [18], [21], [22]): pack 32-bit halves into eight 16-bit lanes (signed saturation).
+        step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
+        step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
+        step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
+        step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
+        // Combine (step3[25], [26], [29], [30]): same packing for the remaining four results.
+        step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
+        step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
+        step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
+        step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
+                                           &step3[22], &step3[25], &step3[26],
+                                           &step3[29], &step3[30]);  // helper defined elsewhere; presumably non-zero when a 16-bit lane overflowed
+        if (overflow) {  // hand this pass to HIGH_FDCT32x32_2D_C / HIGH_FDCT32x32_2D_ROWS_C and return early
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Stage 7: produce out[2,18,10,26,6,22,14,30] from the step3 pairs (8,15), (9,14), (10,13), (11,12).
+      {
+        const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]);  // interleave the low four 16-bit lanes of the pair
+        const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]);  // interleave the high four 16-bit lanes of the pair
+        const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]);
+        const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]);
+        const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
+        const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
+        const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
+        const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
+        const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);  // madd: multiply 16-bit lanes, add adjacent products into 32-bit lanes
+        const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);  // NOTE(review): k__cospi_* are defined outside this view; presumably packed cosine constant pairs
+        const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
+        const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
+        const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
+        const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
+        const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
+        const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
+        const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);  // out[6] reuses the (11, 12) interleave
+        const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
+        const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);  // out[22] reuses the (10, 13) interleave
+        const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
+        const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);  // out[14] reuses the (9, 14) interleave
+        const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
+        const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);  // out[30] reuses the (8, 15) interleave
+        const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
+        // dct_const_round_shift: add k__DCT_CONST_ROUNDING, then arithmetic shift right by DCT_CONST_BITS (32-bit lanes).
+        const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
+        const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
+        const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
+        const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
+        const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
+        const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
+        const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
+        const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
+        const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
+        const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
+        const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
+        const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
+        const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
+        const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
+        const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
+        const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
+        // Combine: pack the low-half and high-half 32-bit results into eight 16-bit lanes (signed saturation).
+        out[ 2] = _mm_packs_epi32(out_02_6, out_02_7);
+        out[18] = _mm_packs_epi32(out_18_6, out_18_7);
+        out[10] = _mm_packs_epi32(out_10_6, out_10_7);
+        out[26] = _mm_packs_epi32(out_26_6, out_26_7);
+        out[ 6] = _mm_packs_epi32(out_06_6, out_06_7);
+        out[22] = _mm_packs_epi32(out_22_6, out_22_7);
+        out[14] = _mm_packs_epi32(out_14_6, out_14_7);
+        out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+                                           &out[26], &out[6], &out[22],
+                                           &out[14], &out[30]);  // helper defined elsewhere; presumably non-zero when a 16-bit lane overflowed
+        if (overflow) {  // hand this pass to HIGH_FDCT32x32_2D_C / HIGH_FDCT32x32_2D_ROWS_C and return early
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {  // Fill step1[16..31] with sum/difference pairs of step3[17,18,21,22,25,26,29,30] against step2[16,19,20,23,24,27,28,31].
+        step1[16] = ADD_EPI16(step3[17], step2[16]);  // NOTE(review): ADD_EPI16/SUB_EPI16 are defined outside this view; presumably 16-bit lane add/subtract
+        step1[17] = SUB_EPI16(step2[16], step3[17]);
+        step1[18] = SUB_EPI16(step2[19], step3[18]);
+        step1[19] = ADD_EPI16(step3[18], step2[19]);
+        step1[20] = ADD_EPI16(step3[21], step2[20]);
+        step1[21] = SUB_EPI16(step2[20], step3[21]);
+        step1[22] = SUB_EPI16(step2[23], step3[22]);
+        step1[23] = ADD_EPI16(step3[22], step2[23]);
+        step1[24] = ADD_EPI16(step3[25], step2[24]);
+        step1[25] = SUB_EPI16(step2[24], step3[25]);
+        step1[26] = SUB_EPI16(step2[27], step3[26]);
+        step1[27] = ADD_EPI16(step3[26], step2[27]);
+        step1[28] = ADD_EPI16(step3[29], step2[28]);
+        step1[29] = SUB_EPI16(step2[28], step3[29]);
+        step1[30] = SUB_EPI16(step2[31], step3[30]);
+        step1[31] = ADD_EPI16(step3[30], step2[31]);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x16(
+            &step1[16], &step1[17], &step1[18], &step1[19],
+            &step1[20], &step1[21], &step1[22], &step1[23],
+            &step1[24], &step1[25], &step1[26], &step1[27],
+            &step1[28], &step1[29], &step1[30], &step1[31]);  // helper defined elsewhere; presumably non-zero when a 16-bit lane overflowed
+        if (overflow) {  // hand this pass to HIGH_FDCT32x32_2D_C / HIGH_FDCT32x32_2D_ROWS_C and return early
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Final stage --- output indices are bit-reversed. This block produces out[1,17,9,25,7,23,15,31].
+      {  // Inputs are the step1 pairs (16,31), (17,30), (18,29), (19,28).
+        const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);  // interleave the low four 16-bit lanes of the pair
+        const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);  // interleave the high four 16-bit lanes of the pair
+        const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
+        const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
+        const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
+        const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
+        const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
+        const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
+        const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);  // madd: multiply 16-bit lanes, add adjacent products into 32-bit lanes
+        const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);  // NOTE(review): k__cospi_* are defined outside this view; presumably packed cosine constant pairs
+        const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
+        const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
+        const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
+        const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
+        const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
+        const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
+        const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);  // out[7] reuses the (19, 28) interleave
+        const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
+        const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);  // out[23] reuses the (18, 29) interleave
+        const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
+        const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);  // out[15] reuses the (17, 30) interleave
+        const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
+        const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);  // out[31] reuses the (16, 31) interleave
+        const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
+        // dct_const_round_shift: add k__DCT_CONST_ROUNDING, then arithmetic shift right by DCT_CONST_BITS (32-bit lanes).
+        const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
+        const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
+        const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
+        const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
+        const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
+        const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
+        const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
+        const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
+        const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
+        const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
+        const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
+        const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
+        const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
+        const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
+        const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
+        const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
+        // Combine: pack the low-half and high-half 32-bit results into eight 16-bit lanes (signed saturation).
+        out[ 1] = _mm_packs_epi32(out_01_6, out_01_7);
+        out[17] = _mm_packs_epi32(out_17_6, out_17_7);
+        out[ 9] = _mm_packs_epi32(out_09_6, out_09_7);
+        out[25] = _mm_packs_epi32(out_25_6, out_25_7);
+        out[ 7] = _mm_packs_epi32(out_07_6, out_07_7);
+        out[23] = _mm_packs_epi32(out_23_6, out_23_7);
+        out[15] = _mm_packs_epi32(out_15_6, out_15_7);
+        out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+                                           &out[25], &out[7], &out[23],
+                                           &out[15], &out[31]);  // helper defined elsewhere; presumably non-zero when a 16-bit lane overflowed
+        if (overflow) {  // hand this pass to HIGH_FDCT32x32_2D_C / HIGH_FDCT32x32_2D_ROWS_C and return early
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      {
+        const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
+        const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
+        const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
+        const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
+        const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
+        const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
+        const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
+        const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
+        const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
+        const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
+        const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
+        const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
+        const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
+        const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
+        const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
+        const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
+        const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
+        const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
+        const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
+        const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
+        const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
+        const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
+        const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
+        const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
+        // dct_const_round_shift
+        const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+        const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+        const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
+        const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
+        const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
+        const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
+        const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
+        const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
+        const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
+        const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
+        const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
+        const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
+        const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
+        const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
+        const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
+        const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
+        const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
+        const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
+        // Combine
+        out[ 5] = _mm_packs_epi32(out_05_6, out_05_7);
+        out[21] = _mm_packs_epi32(out_21_6, out_21_7);
+        out[13] = _mm_packs_epi32(out_13_6, out_13_7);
+        out[29] = _mm_packs_epi32(out_29_6, out_29_7);
+        out[ 3] = _mm_packs_epi32(out_03_6, out_03_7);
+        out[19] = _mm_packs_epi32(out_19_6, out_19_7);
+        out[11] = _mm_packs_epi32(out_11_6, out_11_7);
+        out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+                                           &out[29], &out[3], &out[19],
+                                           &out[11], &out[27]);
+        if (overflow) {
+          if (pass == 0)
+            HIGH_FDCT32x32_2D_C(input, output_org, stride);
+          else
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+#if FDCT32x32_HIGH_PRECISION
+      } else {
+        __m128i lstep1[64], lstep2[64], lstep3[64];
+        __m128i u[32], v[32], sign[16];
+        const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
+        // start using 32-bit operations
+        // stage 3
+        {
+          // expanding to 32-bit length prior to addition operations
+          lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero);
+          lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero);
+          lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero);
+          lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero);
+          lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero);
+          lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero);
+          lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero);
+          lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero);
+          lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero);
+          lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero);
+          lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero);
+          lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero);
+          lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero);
+          lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero);
+          lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero);
+          lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero);
+          lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne);
+          lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne);
+          lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne);
+          lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne);
+          lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne);
+          lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne);
+          lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne);
+          lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne);
+          lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne);
+          lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne);
+          lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
+          lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
+          lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
+          lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
+          lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
+          lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
+
+          lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
+          lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
+          lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]);
+          lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]);
+          lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]);
+          lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]);
+          lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]);
+          lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]);
+          lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]);
+          lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]);
+          lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]);
+          lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]);
+          lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]);
+          lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]);
+          lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]);
+          lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]);
+        }
+        {
+          const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+          const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+          const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+          const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+          const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+          const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+          const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+          const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+          const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+          const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+          const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+          const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+          // dct_const_round_shift
+          const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+          const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+          const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+          const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+          const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+          const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+          const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+          const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+          lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+          lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+          lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+          lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+          lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+          lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+          lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+          lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+        }
+        {
+          lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
+          lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
+          lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
+          lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
+          lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
+          lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
+          lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
+          lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
+          lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
+          lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
+          lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
+          lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
+          lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
+          lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
+          lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
+          lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
+          lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
+          lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
+          lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
+          lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
+          lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
+          lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
+          lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
+          lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
+          lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
+          lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
+          lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
+          lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
+          lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
+          lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
+          lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
+          lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
+
+          lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
+          lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
+          lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
+          lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
+          lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
+          lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
+          lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
+          lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
+          lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
+          lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
+          lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
+          lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
+          lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
+          lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
+          lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
+          lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
+          lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
+          lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
+          lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
+          lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
+          lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
+          lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
+          lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
+          lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
+          lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
+          lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
+          lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
+          lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
+          lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
+          lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
+          lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
+          lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
+
+          lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
+          lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
+
+          lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
+          lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
+          lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
+          lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
+          lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
+          lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
+          lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
+          lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
+          lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
+          lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
+          lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
+          lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
+          lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
+          lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
+          lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
+          lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
+          lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
+          lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
+          lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
+          lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
+          lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
+          lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
+          lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
+          lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
+          lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
+          lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
+          lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
+          lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
+          lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
+          lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
+        }
+
+        // stage 4
+        {
+          // expanding to 32-bit length prior to addition operations
+          lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero);
+          lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero);
+          lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero);
+          lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero);
+          lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
+          lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
+          lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
+          lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
+          lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
+          lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
+          lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
+          lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
+          lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
+          lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
+          lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
+          lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
+
+          lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]);
+          lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]);
+          lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]);
+          lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]);
+          lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]);
+          lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]);
+          lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]);
+          lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]);
+          lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
+          lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
+          lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
+          lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
+          lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
+          lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
+          lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
+          lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
+          lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
+          lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
+          lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
+          lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
+          lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
+          lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
+          lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
+          lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
+        }
+        {
+        // to be continued...
+        //
+        const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+        const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+
+        u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+        u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+        u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+        u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+        // TODO(jingning): manually inline k_madd_epi32_ to further hide
+        // instruction latency.
+        v[0] = k_madd_epi32(u[0], k32_p16_m16);
+        v[1] = k_madd_epi32(u[1], k32_p16_m16);
+        v[2] = k_madd_epi32(u[2], k32_p16_m16);
+        v[3] = k_madd_epi32(u[3], k32_p16_m16);
+        v[4] = k_madd_epi32(u[0], k32_p16_p16);
+        v[5] = k_madd_epi32(u[1], k32_p16_p16);
+        v[6] = k_madd_epi32(u[2], k32_p16_p16);
+        v[7] = k_madd_epi32(u[3], k32_p16_p16);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3],
+                                            &v[4], &v[5], &v[6], &v[7], &kZero);
+        if (overflow) {
+          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+        u[0] = k_packs_epi64(v[0], v[1]);
+        u[1] = k_packs_epi64(v[2], v[3]);
+        u[2] = k_packs_epi64(v[4], v[5]);
+        u[3] = k_packs_epi64(v[6], v[7]);
+
+        v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+        v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+        v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+        v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+        lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+        lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+        lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+        lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+        }
+        {
+          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+          u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
+          u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
+          u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
+          u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
+          u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
+          u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
+          u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
+          u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
+          u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
+          u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
+          u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
+          u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
+          u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
+          u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
+          u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
+          u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+          v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24);
+          v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24);
+          v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24);
+          v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24);
+          v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24);
+          v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24);
+          v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24);
+          v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24);
+          v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08);
+          v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08);
+          v[10] = k_madd_epi32(u[10], k32_m24_m08);
+          v[11] = k_madd_epi32(u[11], k32_m24_m08);
+          v[12] = k_madd_epi32(u[12], k32_m24_m08);
+          v[13] = k_madd_epi32(u[13], k32_m24_m08);
+          v[14] = k_madd_epi32(u[14], k32_m24_m08);
+          v[15] = k_madd_epi32(u[15], k32_m24_m08);
+          v[16] = k_madd_epi32(u[12], k32_m08_p24);
+          v[17] = k_madd_epi32(u[13], k32_m08_p24);
+          v[18] = k_madd_epi32(u[14], k32_m08_p24);
+          v[19] = k_madd_epi32(u[15], k32_m08_p24);
+          v[20] = k_madd_epi32(u[ 8], k32_m08_p24);
+          v[21] = k_madd_epi32(u[ 9], k32_m08_p24);
+          v[22] = k_madd_epi32(u[10], k32_m08_p24);
+          v[23] = k_madd_epi32(u[11], k32_m08_p24);
+          v[24] = k_madd_epi32(u[ 4], k32_p24_p08);
+          v[25] = k_madd_epi32(u[ 5], k32_p24_p08);
+          v[26] = k_madd_epi32(u[ 6], k32_p24_p08);
+          v[27] = k_madd_epi32(u[ 7], k32_p24_p08);
+          v[28] = k_madd_epi32(u[ 0], k32_p24_p08);
+          v[29] = k_madd_epi32(u[ 1], k32_p24_p08);
+          v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
+          v[31] = k_madd_epi32(u[ 3], k32_p24_p08);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64(v[10], v[11]);
+          u[ 6] = k_packs_epi64(v[12], v[13]);
+          u[ 7] = k_packs_epi64(v[14], v[15]);
+          u[ 8] = k_packs_epi64(v[16], v[17]);
+          u[ 9] = k_packs_epi64(v[18], v[19]);
+          u[10] = k_packs_epi64(v[20], v[21]);
+          u[11] = k_packs_epi64(v[22], v[23]);
+          u[12] = k_packs_epi64(v[24], v[25]);
+          u[13] = k_packs_epi64(v[26], v[27]);
+          u[14] = k_packs_epi64(v[28], v[29]);
+          u[15] = k_packs_epi64(v[30], v[31]);
+
+          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+          lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+          lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+          lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+          lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+          lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+          lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+          lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+          lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+          lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+          lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+          lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+          lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+          lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+          lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+          lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+        }
+        // stage 5
+        {
+          lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]);
+          lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]);
+          lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]);
+          lstep2[11] = _mm_sub_epi32(lstep3[ 9], lstep1[11]);
+          lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
+          lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
+          lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
+          lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
+        }
+        {
+          const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+          const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+
+          u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
+          u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
+          u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
+          u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
+          u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
+          u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
+          u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
+          u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+          // TODO(jingning): manually inline k_madd_epi32_ to further hide
+          // instruction latency.
+          v[ 0] = k_madd_epi32(u[0], k32_p16_p16);
+          v[ 1] = k_madd_epi32(u[1], k32_p16_p16);
+          v[ 2] = k_madd_epi32(u[2], k32_p16_p16);
+          v[ 3] = k_madd_epi32(u[3], k32_p16_p16);
+          v[ 4] = k_madd_epi32(u[0], k32_p16_m16);
+          v[ 5] = k_madd_epi32(u[1], k32_p16_m16);
+          v[ 6] = k_madd_epi32(u[2], k32_p16_m16);
+          v[ 7] = k_madd_epi32(u[3], k32_p16_m16);
+          v[ 8] = k_madd_epi32(u[4], k32_p24_p08);
+          v[ 9] = k_madd_epi32(u[5], k32_p24_p08);
+          v[10] = k_madd_epi32(u[6], k32_p24_p08);
+          v[11] = k_madd_epi32(u[7], k32_p24_p08);
+          v[12] = k_madd_epi32(u[4], k32_m08_p24);
+          v[13] = k_madd_epi32(u[5], k32_m08_p24);
+          v[14] = k_madd_epi32(u[6], k32_m08_p24);
+          v[15] = k_madd_epi32(u[7], k32_m08_p24);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_16(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[0] = k_packs_epi64(v[0], v[1]);
+          u[1] = k_packs_epi64(v[2], v[3]);
+          u[2] = k_packs_epi64(v[4], v[5]);
+          u[3] = k_packs_epi64(v[6], v[7]);
+          u[4] = k_packs_epi64(v[8], v[9]);
+          u[5] = k_packs_epi64(v[10], v[11]);
+          u[6] = k_packs_epi64(v[12], v[13]);
+          u[7] = k_packs_epi64(v[14], v[15]);
+
+          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+          sign[0] = _mm_cmplt_epi32(u[0], kZero);
+          sign[1] = _mm_cmplt_epi32(u[1], kZero);
+          sign[2] = _mm_cmplt_epi32(u[2], kZero);
+          sign[3] = _mm_cmplt_epi32(u[3], kZero);
+          sign[4] = _mm_cmplt_epi32(u[4], kZero);
+          sign[5] = _mm_cmplt_epi32(u[5], kZero);
+          sign[6] = _mm_cmplt_epi32(u[6], kZero);
+          sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+          u[0] = _mm_sub_epi32(u[0], sign[0]);
+          u[1] = _mm_sub_epi32(u[1], sign[1]);
+          u[2] = _mm_sub_epi32(u[2], sign[2]);
+          u[3] = _mm_sub_epi32(u[3], sign[3]);
+          u[4] = _mm_sub_epi32(u[4], sign[4]);
+          u[5] = _mm_sub_epi32(u[5], sign[5]);
+          u[6] = _mm_sub_epi32(u[6], sign[6]);
+          u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+          u[0] = _mm_add_epi32(u[0], K32One);
+          u[1] = _mm_add_epi32(u[1], K32One);
+          u[2] = _mm_add_epi32(u[2], K32One);
+          u[3] = _mm_add_epi32(u[3], K32One);
+          u[4] = _mm_add_epi32(u[4], K32One);
+          u[5] = _mm_add_epi32(u[5], K32One);
+          u[6] = _mm_add_epi32(u[6], K32One);
+          u[7] = _mm_add_epi32(u[7], K32One);
+
+          u[0] = _mm_srai_epi32(u[0], 2);
+          u[1] = _mm_srai_epi32(u[1], 2);
+          u[2] = _mm_srai_epi32(u[2], 2);
+          u[3] = _mm_srai_epi32(u[3], 2);
+          u[4] = _mm_srai_epi32(u[4], 2);
+          u[5] = _mm_srai_epi32(u[5], 2);
+          u[6] = _mm_srai_epi32(u[6], 2);
+          u[7] = _mm_srai_epi32(u[7], 2);
+
+          // Combine
+          out[ 0] = _mm_packs_epi32(u[0], u[1]);
+          out[16] = _mm_packs_epi32(u[2], u[3]);
+          out[ 8] = _mm_packs_epi32(u[4], u[5]);
+          out[24] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&out[0], &out[16],
+                                             &out[8], &out[24]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {  // lstep2[18-21,26-29]: cospi_8/cospi_24 mix of lstep1 pairs
+          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+          u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
+          u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
+          u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
+          u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
+          u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
+          u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
+          u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
+          u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+          v[0] = k_madd_epi32(u[0], k32_m08_p24);  // feeds lstep2[18,19]
+          v[1] = k_madd_epi32(u[1], k32_m08_p24);
+          v[2] = k_madd_epi32(u[2], k32_m08_p24);
+          v[3] = k_madd_epi32(u[3], k32_m08_p24);
+          v[4] = k_madd_epi32(u[4], k32_m24_m08);  // feeds lstep2[20,21]
+          v[5] = k_madd_epi32(u[5], k32_m24_m08);
+          v[6] = k_madd_epi32(u[6], k32_m24_m08);
+          v[7] = k_madd_epi32(u[7], k32_m24_m08);
+          v[ 8] = k_madd_epi32(u[4], k32_m08_p24);  // feeds lstep2[26,27]
+          v[ 9] = k_madd_epi32(u[5], k32_m08_p24);
+          v[10] = k_madd_epi32(u[6], k32_m08_p24);
+          v[11] = k_madd_epi32(u[7], k32_m08_p24);
+          v[12] = k_madd_epi32(u[0], k32_p24_p08);  // feeds lstep2[28,29]
+          v[13] = k_madd_epi32(u[1], k32_p24_p08);
+          v[14] = k_madd_epi32(u[2], k32_p24_p08);
+          v[15] = k_madd_epi32(u[3], k32_p24_p08);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_16(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[0] = k_packs_epi64(v[0], v[1]);  // lo-half, hi-half results
+          u[1] = k_packs_epi64(v[2], v[3]);
+          u[2] = k_packs_epi64(v[4], v[5]);
+          u[3] = k_packs_epi64(v[6], v[7]);
+          u[4] = k_packs_epi64(v[8], v[9]);
+          u[5] = k_packs_epi64(v[10], v[11]);
+          u[6] = k_packs_epi64(v[12], v[13]);
+          u[7] = k_packs_epi64(v[14], v[15]);
+
+          u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);  // pre-shift add
+          u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+          lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+          lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+          lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+          lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+          lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+          lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+          lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+        }
+        {  // lstep2[32-63]: sums/differences of lstep1 and lstep3 terms
+          lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
+          lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
+          lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
+          lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
+          lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
+          lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
+          lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
+          lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
+          lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
+          lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
+          lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
+          lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
+          lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
+          lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
+          lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
+          lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
+          lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
+          lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
+          lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
+          lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
+          lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
+          lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
+          lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
+          lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
+          lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
+          lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
+          lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
+          lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
+          lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
+          lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
+          lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
+          lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
+        }
+        // stage 6
+        {  // lstep2[8-15] -> out[4], out[20], out[12], out[28]
+          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+
+          u[0] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+          u[1] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+          u[2] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+          u[3] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+          u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
+          u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+          u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+          u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+          u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);  // same as u[4]
+          u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
+          u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
+          u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
+          u[12] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);  // same as u[0]
+          u[13] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+          u[14] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+          u[15] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+
+          v[0] = k_madd_epi32(u[0], k32_p28_p04);  // feeds out[4]
+          v[1] = k_madd_epi32(u[1], k32_p28_p04);
+          v[2] = k_madd_epi32(u[2], k32_p28_p04);
+          v[3] = k_madd_epi32(u[3], k32_p28_p04);
+          v[4] = k_madd_epi32(u[4], k32_p12_p20);  // feeds out[20]
+          v[5] = k_madd_epi32(u[5], k32_p12_p20);
+          v[6] = k_madd_epi32(u[6], k32_p12_p20);
+          v[7] = k_madd_epi32(u[7], k32_p12_p20);
+          v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);  // feeds out[12]
+          v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
+          v[10] = k_madd_epi32(u[10], k32_m20_p12);
+          v[11] = k_madd_epi32(u[11], k32_m20_p12);
+          v[12] = k_madd_epi32(u[12], k32_m04_p28);  // feeds out[28]
+          v[13] = k_madd_epi32(u[13], k32_m04_p28);
+          v[14] = k_madd_epi32(u[14], k32_m04_p28);
+          v[15] = k_madd_epi32(u[15], k32_m04_p28);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_16(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[0] = k_packs_epi64(v[0], v[1]);  // lo-half, hi-half results
+          u[1] = k_packs_epi64(v[2], v[3]);
+          u[2] = k_packs_epi64(v[4], v[5]);
+          u[3] = k_packs_epi64(v[6], v[7]);
+          u[4] = k_packs_epi64(v[8], v[9]);
+          u[5] = k_packs_epi64(v[10], v[11]);
+          u[6] = k_packs_epi64(v[12], v[13]);
+          u[7] = k_packs_epi64(v[14], v[15]);
+
+          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);  // pre-shift add
+          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+          sign[0] = _mm_cmplt_epi32(u[0], kZero);  // all-ones if u < kZero
+          sign[1] = _mm_cmplt_epi32(u[1], kZero);
+          sign[2] = _mm_cmplt_epi32(u[2], kZero);
+          sign[3] = _mm_cmplt_epi32(u[3], kZero);
+          sign[4] = _mm_cmplt_epi32(u[4], kZero);
+          sign[5] = _mm_cmplt_epi32(u[5], kZero);
+          sign[6] = _mm_cmplt_epi32(u[6], kZero);
+          sign[7] = _mm_cmplt_epi32(u[7], kZero);
+
+          u[0] = _mm_sub_epi32(u[0], sign[0]);  // +1 on masked lanes
+          u[1] = _mm_sub_epi32(u[1], sign[1]);
+          u[2] = _mm_sub_epi32(u[2], sign[2]);
+          u[3] = _mm_sub_epi32(u[3], sign[3]);
+          u[4] = _mm_sub_epi32(u[4], sign[4]);
+          u[5] = _mm_sub_epi32(u[5], sign[5]);
+          u[6] = _mm_sub_epi32(u[6], sign[6]);
+          u[7] = _mm_sub_epi32(u[7], sign[7]);
+
+          u[0] = _mm_add_epi32(u[0], K32One);  // added before >> 2
+          u[1] = _mm_add_epi32(u[1], K32One);
+          u[2] = _mm_add_epi32(u[2], K32One);
+          u[3] = _mm_add_epi32(u[3], K32One);
+          u[4] = _mm_add_epi32(u[4], K32One);
+          u[5] = _mm_add_epi32(u[5], K32One);
+          u[6] = _mm_add_epi32(u[6], K32One);
+          u[7] = _mm_add_epi32(u[7], K32One);
+
+          u[0] = _mm_srai_epi32(u[0], 2);
+          u[1] = _mm_srai_epi32(u[1], 2);
+          u[2] = _mm_srai_epi32(u[2], 2);
+          u[3] = _mm_srai_epi32(u[3], 2);
+          u[4] = _mm_srai_epi32(u[4], 2);
+          u[5] = _mm_srai_epi32(u[5], 2);
+          u[6] = _mm_srai_epi32(u[6], 2);
+          u[7] = _mm_srai_epi32(u[7], 2);
+
+          out[ 4] = _mm_packs_epi32(u[0], u[1]);  // saturate to int16
+          out[20] = _mm_packs_epi32(u[2], u[3]);
+          out[12] = _mm_packs_epi32(u[4], u[5]);
+          out[28] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&out[4], &out[20],
+                                             &out[12], &out[28]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {  // lstep3[16-31]: sums/differences of lstep1 and lstep2 terms
+          lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
+          lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
+          lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
+          lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
+          lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
+          lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
+          lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
+          lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
+          lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
+          lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
+          lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
+          lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
+          lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
+          lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
+          lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
+          lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
+        }
+        {  // lstep2 pairs -> lstep3[34-37,42-45,50-53,58-61]
+          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
+          const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
+          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
+          const __m128i k32_m12_m20 = pair_set_epi32(-cospi_12_64,
+                                                     -cospi_20_64);
+          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
+          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
+
+          u[ 0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
+          u[ 1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
+          u[ 2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
+          u[ 3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
+          u[ 4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
+          u[ 5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
+          u[ 6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
+          u[ 7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
+          u[ 8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
+          u[ 9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
+          u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
+          u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
+          u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
+          u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
+          u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
+          u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+          v[ 0] = k_madd_epi32(u[ 0], k32_m04_p28);  // feeds lstep3[34,35]
+          v[ 1] = k_madd_epi32(u[ 1], k32_m04_p28);
+          v[ 2] = k_madd_epi32(u[ 2], k32_m04_p28);
+          v[ 3] = k_madd_epi32(u[ 3], k32_m04_p28);
+          v[ 4] = k_madd_epi32(u[ 4], k32_m28_m04);  // feeds lstep3[36,37]
+          v[ 5] = k_madd_epi32(u[ 5], k32_m28_m04);
+          v[ 6] = k_madd_epi32(u[ 6], k32_m28_m04);
+          v[ 7] = k_madd_epi32(u[ 7], k32_m28_m04);
+          v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);  // feeds lstep3[42,43]
+          v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
+          v[10] = k_madd_epi32(u[10], k32_m20_p12);
+          v[11] = k_madd_epi32(u[11], k32_m20_p12);
+          v[12] = k_madd_epi32(u[12], k32_m12_m20);  // feeds lstep3[44,45]
+          v[13] = k_madd_epi32(u[13], k32_m12_m20);
+          v[14] = k_madd_epi32(u[14], k32_m12_m20);
+          v[15] = k_madd_epi32(u[15], k32_m12_m20);
+          v[16] = k_madd_epi32(u[12], k32_m20_p12);  // feeds lstep3[50,51]
+          v[17] = k_madd_epi32(u[13], k32_m20_p12);
+          v[18] = k_madd_epi32(u[14], k32_m20_p12);
+          v[19] = k_madd_epi32(u[15], k32_m20_p12);
+          v[20] = k_madd_epi32(u[ 8], k32_p12_p20);  // feeds lstep3[52,53]
+          v[21] = k_madd_epi32(u[ 9], k32_p12_p20);
+          v[22] = k_madd_epi32(u[10], k32_p12_p20);
+          v[23] = k_madd_epi32(u[11], k32_p12_p20);
+          v[24] = k_madd_epi32(u[ 4], k32_m04_p28);  // feeds lstep3[58,59]
+          v[25] = k_madd_epi32(u[ 5], k32_m04_p28);
+          v[26] = k_madd_epi32(u[ 6], k32_m04_p28);
+          v[27] = k_madd_epi32(u[ 7], k32_m04_p28);
+          v[28] = k_madd_epi32(u[ 0], k32_p28_p04);  // feeds lstep3[60,61]
+          v[29] = k_madd_epi32(u[ 1], k32_p28_p04);
+          v[30] = k_madd_epi32(u[ 2], k32_p28_p04);
+          v[31] = k_madd_epi32(u[ 3], k32_p28_p04);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);  // lo-half, hi-half results
+          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64(v[10], v[11]);
+          u[ 6] = k_packs_epi64(v[12], v[13]);
+          u[ 7] = k_packs_epi64(v[14], v[15]);
+          u[ 8] = k_packs_epi64(v[16], v[17]);
+          u[ 9] = k_packs_epi64(v[18], v[19]);
+          u[10] = k_packs_epi64(v[20], v[21]);
+          u[11] = k_packs_epi64(v[22], v[23]);
+          u[12] = k_packs_epi64(v[24], v[25]);
+          u[13] = k_packs_epi64(v[26], v[27]);
+          u[14] = k_packs_epi64(v[28], v[29]);
+          u[15] = k_packs_epi64(v[30], v[31]);
+
+          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);  // pre-shift add
+          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          lstep3[34] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+          lstep3[35] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+          lstep3[36] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+          lstep3[37] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+          lstep3[42] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+          lstep3[43] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+          lstep3[44] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+          lstep3[45] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+          lstep3[50] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+          lstep3[51] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+          lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+          lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+          lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+          lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+          lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+          lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+        }
+        // stage 7
+        {  // lstep3[16-31] -> out[2,18,10,26,6,22,14,30]
+          const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
+          const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
+          const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
+          const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64,  cospi_26_64);
+          const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
+          const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
+          const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
+          const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
+
+          u[ 0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
+          u[ 1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
+          u[ 2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
+          u[ 3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
+          u[ 4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
+          u[ 5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
+          u[ 6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
+          u[ 7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
+          u[ 8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
+          u[ 9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
+          u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
+          u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
+          u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
+          u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
+          u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
+          u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+          v[ 0] = k_madd_epi32(u[ 0], k32_p30_p02);  // feeds out[2]
+          v[ 1] = k_madd_epi32(u[ 1], k32_p30_p02);
+          v[ 2] = k_madd_epi32(u[ 2], k32_p30_p02);
+          v[ 3] = k_madd_epi32(u[ 3], k32_p30_p02);
+          v[ 4] = k_madd_epi32(u[ 4], k32_p14_p18);  // feeds out[18]
+          v[ 5] = k_madd_epi32(u[ 5], k32_p14_p18);
+          v[ 6] = k_madd_epi32(u[ 6], k32_p14_p18);
+          v[ 7] = k_madd_epi32(u[ 7], k32_p14_p18);
+          v[ 8] = k_madd_epi32(u[ 8], k32_p22_p10);  // feeds out[10]
+          v[ 9] = k_madd_epi32(u[ 9], k32_p22_p10);
+          v[10] = k_madd_epi32(u[10], k32_p22_p10);
+          v[11] = k_madd_epi32(u[11], k32_p22_p10);
+          v[12] = k_madd_epi32(u[12], k32_p06_p26);  // feeds out[26]
+          v[13] = k_madd_epi32(u[13], k32_p06_p26);
+          v[14] = k_madd_epi32(u[14], k32_p06_p26);
+          v[15] = k_madd_epi32(u[15], k32_p06_p26);
+          v[16] = k_madd_epi32(u[12], k32_m26_p06);  // feeds out[6]
+          v[17] = k_madd_epi32(u[13], k32_m26_p06);
+          v[18] = k_madd_epi32(u[14], k32_m26_p06);
+          v[19] = k_madd_epi32(u[15], k32_m26_p06);
+          v[20] = k_madd_epi32(u[ 8], k32_m10_p22);  // feeds out[22]
+          v[21] = k_madd_epi32(u[ 9], k32_m10_p22);
+          v[22] = k_madd_epi32(u[10], k32_m10_p22);
+          v[23] = k_madd_epi32(u[11], k32_m10_p22);
+          v[24] = k_madd_epi32(u[ 4], k32_m18_p14);  // feeds out[14]
+          v[25] = k_madd_epi32(u[ 5], k32_m18_p14);
+          v[26] = k_madd_epi32(u[ 6], k32_m18_p14);
+          v[27] = k_madd_epi32(u[ 7], k32_m18_p14);
+          v[28] = k_madd_epi32(u[ 0], k32_m02_p30);  // feeds out[30]
+          v[29] = k_madd_epi32(u[ 1], k32_m02_p30);
+          v[30] = k_madd_epi32(u[ 2], k32_m02_p30);
+          v[31] = k_madd_epi32(u[ 3], k32_m02_p30);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);  // lo-half, hi-half results
+          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64(v[10], v[11]);
+          u[ 6] = k_packs_epi64(v[12], v[13]);
+          u[ 7] = k_packs_epi64(v[14], v[15]);
+          u[ 8] = k_packs_epi64(v[16], v[17]);
+          u[ 9] = k_packs_epi64(v[18], v[19]);
+          u[10] = k_packs_epi64(v[20], v[21]);
+          u[11] = k_packs_epi64(v[22], v[23]);
+          u[12] = k_packs_epi64(v[24], v[25]);
+          u[13] = k_packs_epi64(v[26], v[27]);
+          u[14] = k_packs_epi64(v[28], v[29]);
+          u[15] = k_packs_epi64(v[30], v[31]);
+
+          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);  // pre-shift add
+          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);  // v reused as u < kZero mask
+          v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
+          v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
+          v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
+          v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
+          v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
+          v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
+          v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
+          v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
+          v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+          v[10] = _mm_cmplt_epi32(u[10], kZero);
+          v[11] = _mm_cmplt_epi32(u[11], kZero);
+          v[12] = _mm_cmplt_epi32(u[12], kZero);
+          v[13] = _mm_cmplt_epi32(u[13], kZero);
+          v[14] = _mm_cmplt_epi32(u[14], kZero);
+          v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+          u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);  // +1 on masked lanes
+          u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm_sub_epi32(u[10], v[10]);
+          u[11] = _mm_sub_epi32(u[11], v[11]);
+          u[12] = _mm_sub_epi32(u[12], v[12]);
+          u[13] = _mm_sub_epi32(u[13], v[13]);
+          u[14] = _mm_sub_epi32(u[14], v[14]);
+          u[15] = _mm_sub_epi32(u[15], v[15]);
+
+          v[ 0] = _mm_add_epi32(u[ 0], K32One);  // added before >> 2
+          v[ 1] = _mm_add_epi32(u[ 1], K32One);
+          v[ 2] = _mm_add_epi32(u[ 2], K32One);
+          v[ 3] = _mm_add_epi32(u[ 3], K32One);
+          v[ 4] = _mm_add_epi32(u[ 4], K32One);
+          v[ 5] = _mm_add_epi32(u[ 5], K32One);
+          v[ 6] = _mm_add_epi32(u[ 6], K32One);
+          v[ 7] = _mm_add_epi32(u[ 7], K32One);
+          v[ 8] = _mm_add_epi32(u[ 8], K32One);
+          v[ 9] = _mm_add_epi32(u[ 9], K32One);
+          v[10] = _mm_add_epi32(u[10], K32One);
+          v[11] = _mm_add_epi32(u[11], K32One);
+          v[12] = _mm_add_epi32(u[12], K32One);
+          v[13] = _mm_add_epi32(u[13], K32One);
+          v[14] = _mm_add_epi32(u[14], K32One);
+          v[15] = _mm_add_epi32(u[15], K32One);
+
+          u[ 0] = _mm_srai_epi32(v[ 0], 2);
+          u[ 1] = _mm_srai_epi32(v[ 1], 2);
+          u[ 2] = _mm_srai_epi32(v[ 2], 2);
+          u[ 3] = _mm_srai_epi32(v[ 3], 2);
+          u[ 4] = _mm_srai_epi32(v[ 4], 2);
+          u[ 5] = _mm_srai_epi32(v[ 5], 2);
+          u[ 6] = _mm_srai_epi32(v[ 6], 2);
+          u[ 7] = _mm_srai_epi32(v[ 7], 2);
+          u[ 8] = _mm_srai_epi32(v[ 8], 2);
+          u[ 9] = _mm_srai_epi32(v[ 9], 2);
+          u[10] = _mm_srai_epi32(v[10], 2);
+          u[11] = _mm_srai_epi32(v[11], 2);
+          u[12] = _mm_srai_epi32(v[12], 2);
+          u[13] = _mm_srai_epi32(v[13], 2);
+          u[14] = _mm_srai_epi32(v[14], 2);
+          u[15] = _mm_srai_epi32(v[15], 2);
+
+          out[ 2] = _mm_packs_epi32(u[0], u[1]);  // saturate to int16
+          out[18] = _mm_packs_epi32(u[2], u[3]);
+          out[10] = _mm_packs_epi32(u[4], u[5]);
+          out[26] = _mm_packs_epi32(u[6], u[7]);
+          out[ 6] = _mm_packs_epi32(u[8], u[9]);
+          out[22] = _mm_packs_epi32(u[10], u[11]);
+          out[14] = _mm_packs_epi32(u[12], u[13]);
+          out[30] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+                                             &out[26], &out[6], &out[22],
+                                             &out[14], &out[30]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {  // lstep1[32-63]: sums/differences of lstep2 and lstep3 terms
+          lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
+          lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
+          lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
+          lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
+          lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
+          lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
+          lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
+          lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
+          lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
+          lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
+          lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
+          lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
+          lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
+          lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
+          lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
+          lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
+          lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
+          lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
+          lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
+          lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
+          lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
+          lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
+          lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
+          lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
+          lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
+          lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
+          lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
+          lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
+          lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
+          lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
+          lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
+          lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
+        }
+        // stage 8
+        {
+          const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
+          const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
+          const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
+          const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
+          const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
+          const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
+          const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
+          const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
+
+          u[ 0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
+          u[ 1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
+          u[ 2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
+          u[ 3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
+          u[ 4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
+          u[ 5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
+          u[ 6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
+          u[ 7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
+          u[ 8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
+          u[ 9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
+          u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
+          u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
+          u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
+          u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
+          u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
+          u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+          v[ 0] = k_madd_epi32(u[ 0], k32_p31_p01);
+          v[ 1] = k_madd_epi32(u[ 1], k32_p31_p01);
+          v[ 2] = k_madd_epi32(u[ 2], k32_p31_p01);
+          v[ 3] = k_madd_epi32(u[ 3], k32_p31_p01);
+          v[ 4] = k_madd_epi32(u[ 4], k32_p15_p17);
+          v[ 5] = k_madd_epi32(u[ 5], k32_p15_p17);
+          v[ 6] = k_madd_epi32(u[ 6], k32_p15_p17);
+          v[ 7] = k_madd_epi32(u[ 7], k32_p15_p17);
+          v[ 8] = k_madd_epi32(u[ 8], k32_p23_p09);
+          v[ 9] = k_madd_epi32(u[ 9], k32_p23_p09);
+          v[10] = k_madd_epi32(u[10], k32_p23_p09);
+          v[11] = k_madd_epi32(u[11], k32_p23_p09);
+          v[12] = k_madd_epi32(u[12], k32_p07_p25);
+          v[13] = k_madd_epi32(u[13], k32_p07_p25);
+          v[14] = k_madd_epi32(u[14], k32_p07_p25);
+          v[15] = k_madd_epi32(u[15], k32_p07_p25);
+          v[16] = k_madd_epi32(u[12], k32_m25_p07);
+          v[17] = k_madd_epi32(u[13], k32_m25_p07);
+          v[18] = k_madd_epi32(u[14], k32_m25_p07);
+          v[19] = k_madd_epi32(u[15], k32_m25_p07);
+          v[20] = k_madd_epi32(u[ 8], k32_m09_p23);
+          v[21] = k_madd_epi32(u[ 9], k32_m09_p23);
+          v[22] = k_madd_epi32(u[10], k32_m09_p23);
+          v[23] = k_madd_epi32(u[11], k32_m09_p23);
+          v[24] = k_madd_epi32(u[ 4], k32_m17_p15);
+          v[25] = k_madd_epi32(u[ 5], k32_m17_p15);
+          v[26] = k_madd_epi32(u[ 6], k32_m17_p15);
+          v[27] = k_madd_epi32(u[ 7], k32_m17_p15);
+          v[28] = k_madd_epi32(u[ 0], k32_m01_p31);
+          v[29] = k_madd_epi32(u[ 1], k32_m01_p31);
+          v[30] = k_madd_epi32(u[ 2], k32_m01_p31);
+          v[31] = k_madd_epi32(u[ 3], k32_m01_p31);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64(v[10], v[11]);
+          u[ 6] = k_packs_epi64(v[12], v[13]);
+          u[ 7] = k_packs_epi64(v[14], v[15]);
+          u[ 8] = k_packs_epi64(v[16], v[17]);
+          u[ 9] = k_packs_epi64(v[18], v[19]);
+          u[10] = k_packs_epi64(v[20], v[21]);
+          u[11] = k_packs_epi64(v[22], v[23]);
+          u[12] = k_packs_epi64(v[24], v[25]);
+          u[13] = k_packs_epi64(v[26], v[27]);
+          u[14] = k_packs_epi64(v[28], v[29]);
+          u[15] = k_packs_epi64(v[30], v[31]);
+
+          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
+          v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
+          v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
+          v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
+          v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
+          v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
+          v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
+          v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
+          v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
+          v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+          v[10] = _mm_cmplt_epi32(u[10], kZero);
+          v[11] = _mm_cmplt_epi32(u[11], kZero);
+          v[12] = _mm_cmplt_epi32(u[12], kZero);
+          v[13] = _mm_cmplt_epi32(u[13], kZero);
+          v[14] = _mm_cmplt_epi32(u[14], kZero);
+          v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+          u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm_sub_epi32(u[10], v[10]);
+          u[11] = _mm_sub_epi32(u[11], v[11]);
+          u[12] = _mm_sub_epi32(u[12], v[12]);
+          u[13] = _mm_sub_epi32(u[13], v[13]);
+          u[14] = _mm_sub_epi32(u[14], v[14]);
+          u[15] = _mm_sub_epi32(u[15], v[15]);
+
+          v[0] = _mm_add_epi32(u[0], K32One);
+          v[1] = _mm_add_epi32(u[1], K32One);
+          v[2] = _mm_add_epi32(u[2], K32One);
+          v[3] = _mm_add_epi32(u[3], K32One);
+          v[4] = _mm_add_epi32(u[4], K32One);
+          v[5] = _mm_add_epi32(u[5], K32One);
+          v[6] = _mm_add_epi32(u[6], K32One);
+          v[7] = _mm_add_epi32(u[7], K32One);
+          v[8] = _mm_add_epi32(u[8], K32One);
+          v[9] = _mm_add_epi32(u[9], K32One);
+          v[10] = _mm_add_epi32(u[10], K32One);
+          v[11] = _mm_add_epi32(u[11], K32One);
+          v[12] = _mm_add_epi32(u[12], K32One);
+          v[13] = _mm_add_epi32(u[13], K32One);
+          v[14] = _mm_add_epi32(u[14], K32One);
+          v[15] = _mm_add_epi32(u[15], K32One);
+
+          u[0] = _mm_srai_epi32(v[0], 2);
+          u[1] = _mm_srai_epi32(v[1], 2);
+          u[2] = _mm_srai_epi32(v[2], 2);
+          u[3] = _mm_srai_epi32(v[3], 2);
+          u[4] = _mm_srai_epi32(v[4], 2);
+          u[5] = _mm_srai_epi32(v[5], 2);
+          u[6] = _mm_srai_epi32(v[6], 2);
+          u[7] = _mm_srai_epi32(v[7], 2);
+          u[8] = _mm_srai_epi32(v[8], 2);
+          u[9] = _mm_srai_epi32(v[9], 2);
+          u[10] = _mm_srai_epi32(v[10], 2);
+          u[11] = _mm_srai_epi32(v[11], 2);
+          u[12] = _mm_srai_epi32(v[12], 2);
+          u[13] = _mm_srai_epi32(v[13], 2);
+          u[14] = _mm_srai_epi32(v[14], 2);
+          u[15] = _mm_srai_epi32(v[15], 2);
+
+          out[ 1] = _mm_packs_epi32(u[0], u[1]);
+          out[17] = _mm_packs_epi32(u[2], u[3]);
+          out[ 9] = _mm_packs_epi32(u[4], u[5]);
+          out[25] = _mm_packs_epi32(u[6], u[7]);
+          out[ 7] = _mm_packs_epi32(u[8], u[9]);
+          out[23] = _mm_packs_epi32(u[10], u[11]);
+          out[15] = _mm_packs_epi32(u[12], u[13]);
+          out[31] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+                                             &out[25], &out[7], &out[23],
+                                             &out[15], &out[31]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
+          const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
+          const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
+          const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
+          const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
+          const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
+          const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
+          const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
+
+          u[ 0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
+          u[ 1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
+          u[ 2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
+          u[ 3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
+          u[ 4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
+          u[ 5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
+          u[ 6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
+          u[ 7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
+          u[ 8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
+          u[ 9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
+          u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
+          u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
+          u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
+          u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
+          u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
+          u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+          v[ 0] = k_madd_epi32(u[ 0], k32_p27_p05);
+          v[ 1] = k_madd_epi32(u[ 1], k32_p27_p05);
+          v[ 2] = k_madd_epi32(u[ 2], k32_p27_p05);
+          v[ 3] = k_madd_epi32(u[ 3], k32_p27_p05);
+          v[ 4] = k_madd_epi32(u[ 4], k32_p11_p21);
+          v[ 5] = k_madd_epi32(u[ 5], k32_p11_p21);
+          v[ 6] = k_madd_epi32(u[ 6], k32_p11_p21);
+          v[ 7] = k_madd_epi32(u[ 7], k32_p11_p21);
+          v[ 8] = k_madd_epi32(u[ 8], k32_p19_p13);
+          v[ 9] = k_madd_epi32(u[ 9], k32_p19_p13);
+          v[10] = k_madd_epi32(u[10], k32_p19_p13);
+          v[11] = k_madd_epi32(u[11], k32_p19_p13);
+          v[12] = k_madd_epi32(u[12], k32_p03_p29);
+          v[13] = k_madd_epi32(u[13], k32_p03_p29);
+          v[14] = k_madd_epi32(u[14], k32_p03_p29);
+          v[15] = k_madd_epi32(u[15], k32_p03_p29);
+          v[16] = k_madd_epi32(u[12], k32_m29_p03);
+          v[17] = k_madd_epi32(u[13], k32_m29_p03);
+          v[18] = k_madd_epi32(u[14], k32_m29_p03);
+          v[19] = k_madd_epi32(u[15], k32_m29_p03);
+          v[20] = k_madd_epi32(u[ 8], k32_m13_p19);
+          v[21] = k_madd_epi32(u[ 9], k32_m13_p19);
+          v[22] = k_madd_epi32(u[10], k32_m13_p19);
+          v[23] = k_madd_epi32(u[11], k32_m13_p19);
+          v[24] = k_madd_epi32(u[ 4], k32_m21_p11);
+          v[25] = k_madd_epi32(u[ 5], k32_m21_p11);
+          v[26] = k_madd_epi32(u[ 6], k32_m21_p11);
+          v[27] = k_madd_epi32(u[ 7], k32_m21_p11);
+          v[28] = k_madd_epi32(u[ 0], k32_m05_p27);
+          v[29] = k_madd_epi32(u[ 1], k32_m05_p27);
+          v[30] = k_madd_epi32(u[ 2], k32_m05_p27);
+          v[31] = k_madd_epi32(u[ 3], k32_m05_p27);
+
+#if DCT_HIGH_BIT_DEPTH
+          overflow = k_check_epi32_overflow_32(
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64(v[10], v[11]);
+          u[ 6] = k_packs_epi64(v[12], v[13]);
+          u[ 7] = k_packs_epi64(v[14], v[15]);
+          u[ 8] = k_packs_epi64(v[16], v[17]);
+          u[ 9] = k_packs_epi64(v[18], v[19]);
+          u[10] = k_packs_epi64(v[20], v[21]);
+          u[11] = k_packs_epi64(v[22], v[23]);
+          u[12] = k_packs_epi64(v[24], v[25]);
+          u[13] = k_packs_epi64(v[26], v[27]);
+          u[14] = k_packs_epi64(v[28], v[29]);
+          u[15] = k_packs_epi64(v[30], v[31]);
+
+          v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
+          v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
+          v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
+          v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
+          v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
+          v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
+          v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
+          v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
+          v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
+          v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
+          v[10] = _mm_cmplt_epi32(u[10], kZero);
+          v[11] = _mm_cmplt_epi32(u[11], kZero);
+          v[12] = _mm_cmplt_epi32(u[12], kZero);
+          v[13] = _mm_cmplt_epi32(u[13], kZero);
+          v[14] = _mm_cmplt_epi32(u[14], kZero);
+          v[15] = _mm_cmplt_epi32(u[15], kZero);
+
+          u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm_sub_epi32(u[10], v[10]);
+          u[11] = _mm_sub_epi32(u[11], v[11]);
+          u[12] = _mm_sub_epi32(u[12], v[12]);
+          u[13] = _mm_sub_epi32(u[13], v[13]);
+          u[14] = _mm_sub_epi32(u[14], v[14]);
+          u[15] = _mm_sub_epi32(u[15], v[15]);
+
+          v[0] = _mm_add_epi32(u[0], K32One);
+          v[1] = _mm_add_epi32(u[1], K32One);
+          v[2] = _mm_add_epi32(u[2], K32One);
+          v[3] = _mm_add_epi32(u[3], K32One);
+          v[4] = _mm_add_epi32(u[4], K32One);
+          v[5] = _mm_add_epi32(u[5], K32One);
+          v[6] = _mm_add_epi32(u[6], K32One);
+          v[7] = _mm_add_epi32(u[7], K32One);
+          v[8] = _mm_add_epi32(u[8], K32One);
+          v[9] = _mm_add_epi32(u[9], K32One);
+          v[10] = _mm_add_epi32(u[10], K32One);
+          v[11] = _mm_add_epi32(u[11], K32One);
+          v[12] = _mm_add_epi32(u[12], K32One);
+          v[13] = _mm_add_epi32(u[13], K32One);
+          v[14] = _mm_add_epi32(u[14], K32One);
+          v[15] = _mm_add_epi32(u[15], K32One);
+
+          u[0] = _mm_srai_epi32(v[0], 2);
+          u[1] = _mm_srai_epi32(v[1], 2);
+          u[2] = _mm_srai_epi32(v[2], 2);
+          u[3] = _mm_srai_epi32(v[3], 2);
+          u[4] = _mm_srai_epi32(v[4], 2);
+          u[5] = _mm_srai_epi32(v[5], 2);
+          u[6] = _mm_srai_epi32(v[6], 2);
+          u[7] = _mm_srai_epi32(v[7], 2);
+          u[8] = _mm_srai_epi32(v[8], 2);
+          u[9] = _mm_srai_epi32(v[9], 2);
+          u[10] = _mm_srai_epi32(v[10], 2);
+          u[11] = _mm_srai_epi32(v[11], 2);
+          u[12] = _mm_srai_epi32(v[12], 2);
+          u[13] = _mm_srai_epi32(v[13], 2);
+          u[14] = _mm_srai_epi32(v[14], 2);
+          u[15] = _mm_srai_epi32(v[15], 2);
+
+          out[ 5] = _mm_packs_epi32(u[0], u[1]);
+          out[21] = _mm_packs_epi32(u[2], u[3]);
+          out[13] = _mm_packs_epi32(u[4], u[5]);
+          out[29] = _mm_packs_epi32(u[6], u[7]);
+          out[ 3] = _mm_packs_epi32(u[8], u[9]);
+          out[19] = _mm_packs_epi32(u[10], u[11]);
+          out[11] = _mm_packs_epi32(u[12], u[13]);
+          out[27] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+                                             &out[29], &out[3], &out[19],
+                                             &out[11], &out[27]);
+          if (overflow) {
+            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+      }
+#endif  // FDCT32x32_HIGH_PRECISION
+      // Transpose the results, do it as four 8x8 transposes.
+      {
+        int transpose_block;
+        int16_t *output0 = &intermediate[column_start * 32];
+        tran_low_t *output1 = &output_org[column_start * 32];
+        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+          __m128i *this_out = &out[8 * transpose_block];
+          // 00 01 02 03 04 05 06 07
+          // 10 11 12 13 14 15 16 17
+          // 20 21 22 23 24 25 26 27
+          // 30 31 32 33 34 35 36 37
+          // 40 41 42 43 44 45 46 47
+          // 50 51 52 53 54 55 56 57
+          // 60 61 62 63 64 65 66 67
+          // 70 71 72 73 74 75 76 77
+          const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
+          const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
+          const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
+          const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
+          const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
+          const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
+          const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
+          const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
+          // 00 10 01 11 02 12 03 13
+          // 20 30 21 31 22 32 23 33
+          // 04 14 05 15 06 16 07 17
+          // 24 34 25 35 26 36 27 37
+          // 40 50 41 51 42 52 43 53
+          // 60 70 61 71 62 72 63 73
+          // 44 54 45 55 46 56 47 57
+          // 64 74 65 75 66 76 67 77
+          const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+          const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+          const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+          const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+          const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+          const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+          const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+          const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+          // 00 10 20 30 01 11 21 31
+          // 40 50 60 70 41 51 61 71
+          // 02 12 22 32 03 13 23 33
+          // 42 52 62 72 43 53 63 73
+          // 04 14 24 34 05 15 25 35
+          // 44 54 64 74 45 55 65 75
+          // 06 16 26 36 07 17 27 37
+          // 46 56 66 76 47 57 67 77
+          __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+          __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+          __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+          __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+          __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+          __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+          __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+          __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+          // 00 10 20 30 40 50 60 70
+          // 01 11 21 31 41 51 61 71
+          // 02 12 22 32 42 52 62 72
+          // 03 13 23 33 43 53 63 73
+          // 04 14 24 34 44 54 64 74
+          // 05 15 25 35 45 55 65 75
+          // 06 16 26 36 46 56 66 76
+          // 07 17 27 37 47 57 67 77
+          if (0 == pass) {
+            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+            // TODO(cd): see quality impact of only doing
+            //           output[j] = (output[j] + 1) >> 2;
+            //           which would remove the code between here ...
+            __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
+            __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
+            __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
+            __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
+            __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
+            __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
+            __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
+            __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
+            tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
+            tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
+            tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
+            tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
+            tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
+            tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
+            tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
+            tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
+            //           ... and here.
+            //           PS: also change code in vp9/encoder/vp9_dct.c
+            tr2_0 = _mm_add_epi16(tr2_0, kOne);
+            tr2_1 = _mm_add_epi16(tr2_1, kOne);
+            tr2_2 = _mm_add_epi16(tr2_2, kOne);
+            tr2_3 = _mm_add_epi16(tr2_3, kOne);
+            tr2_4 = _mm_add_epi16(tr2_4, kOne);
+            tr2_5 = _mm_add_epi16(tr2_5, kOne);
+            tr2_6 = _mm_add_epi16(tr2_6, kOne);
+            tr2_7 = _mm_add_epi16(tr2_7, kOne);
+            tr2_0 = _mm_srai_epi16(tr2_0, 2);
+            tr2_1 = _mm_srai_epi16(tr2_1, 2);
+            tr2_2 = _mm_srai_epi16(tr2_2, 2);
+            tr2_3 = _mm_srai_epi16(tr2_3, 2);
+            tr2_4 = _mm_srai_epi16(tr2_4, 2);
+            tr2_5 = _mm_srai_epi16(tr2_5, 2);
+            tr2_6 = _mm_srai_epi16(tr2_6, 2);
+            tr2_7 = _mm_srai_epi16(tr2_7, 2);
+          }
+          // Note: even though all these stores are aligned, using the aligned
+          //       intrinsic makes the code slightly slower.
+          if (pass == 0) {
+            _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
+            _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
+            _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
+            _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
+            _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
+            _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
+            _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
+            _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
+            // Process next 8x8
+            output0 += 8;
+          } else {
+            storeu_output(&tr2_0, (output1 + 0 * 32));
+            storeu_output(&tr2_1, (output1 + 1 * 32));
+            storeu_output(&tr2_2, (output1 + 2 * 32));
+            storeu_output(&tr2_3, (output1 + 3 * 32));
+            storeu_output(&tr2_4, (output1 + 4 * 32));
+            storeu_output(&tr2_5, (output1 + 5 * 32));
+            storeu_output(&tr2_6, (output1 + 6 * 32));
+            storeu_output(&tr2_7, (output1 + 7 * 32));
+            // Process next 8x8
+            output1 += 8;
+          }
+        }
+      }
+    }
+  }
+}  // NOLINT
+
+#undef ADD_EPI16  // helper macros are not left defined past this point
+#undef SUB_EPI16
+#undef HIGH_FDCT32x32_2D_C
+#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/libs/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c b/libs/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c
new file mode 100644
index 0000000000..6d9da6aa89
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c
@@ -0,0 +1,23 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+// The impl header is included twice, each time under a different macro setup.
+#define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2  // presumably the function name
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h"
+#undef  FDCT32x32_2D_AVX2  // cleared so the second pass can redefine it
+#undef  FDCT32x32_HIGH_PRECISION
+// Second pass: same header, new FDCT32x32_2D_AVX2 value, HIGH_PRECISION = 1.
+#define FDCT32x32_2D_AVX2 vpx_fdct32x32_avx2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT
+#undef  FDCT32x32_2D_AVX2
+#undef  FDCT32x32_HIGH_PRECISION
diff --git a/libs/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/libs/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
new file mode 100644
index 0000000000..69889e2e98
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
@@ -0,0 +1,1027 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
+
+// TODO(jingning) The high bit-depth functions need rework for performance.
+// After we properly fix the high bit-depth function implementations, this
+// file's dependency should be substantially simplified.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+// Above: saturating 16-bit add/sub. Below: plain (wrapping) add/sub.
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif
+
+void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
+  // This 2D transform implements 4 vertical 1D transforms followed
+  // by 4 horizontal 1D transforms.  The multiplies and adds are as given
+  // by Chen, Smith and Fralick ('77).  The commands for moving the data
+  // around have been minimized by hand.
+  // For the purposes of the comments, the 16 inputs are referred to as i0
+  // through iF (in raster order), intermediate variables are a0, b0, c0
+  // through f, and correspond to the in-place computations mapped to input
+  // locations.  The outputs, o0 through oF are labeled according to the
+  // output locations.
+
+  // Constants
+  // These are the coefficients used for the multiplies.
+  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+  // where cospi_N_64 = cos(N pi /64)
+  const __m128i k__cospi_A = octa_set_epi16(cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_B = octa_set_epi16(cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_C = octa_set_epi16(cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_D = octa_set_epi16(cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_E = octa_set_epi16(cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_F = octa_set_epi16(cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_G = octa_set_epi16(cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64,
+                                            -cospi_8_64, -cospi_24_64,
+                                            -cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_H = octa_set_epi16(cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            -cospi_24_64, cospi_8_64,
+                                            -cospi_24_64, cospi_8_64);
+
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // This second rounding constant saves doing some extra adds at the end
+  const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
+                                               +(DCT_CONST_ROUNDING << 1));
+  const int DCT_CONST_BITS2 =  DCT_CONST_BITS + 2;
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  __m128i in0, in1;
+#if DCT_HIGH_BIT_DEPTH
+  __m128i cmp0, cmp1;
+  int test, overflow;
+#endif
+
+  // Load inputs.
+  in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
+  in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
+  in1  = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+                                                 (input +  2 * stride)));
+  in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+                                                 (input +  3 * stride)));
+  // in0 = [i0 i1 i2 i3 iC iD iE iF]
+  // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+#if DCT_HIGH_BIT_DEPTH
+  // Check inputs small enough to use optimised code
+  cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
+                       _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
+  cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
+                       _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
+  test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
+  if (test) {
+    vpx_highbd_fdct4x4_c(input, output, stride);
+    return;
+  }
+#endif  // DCT_HIGH_BIT_DEPTH
+
+  // multiply by 16 to give some extra precision
+  in0 = _mm_slli_epi16(in0, 4);
+  in1 = _mm_slli_epi16(in1, 4);
+  // if (i == 0 && input[0]) input[0] += 1;
+  // add 1 to the upper left pixel if it is non-zero, which helps reduce
+  // the round-trip error
+  {
+    // The mask will only contain whether the first value is zero, all
+    // other comparisons will fail as something shifted by 4 (above << 4)
+    // can never be equal to one. To increment in the non-zero case, we
+    // add the mask and one for the first element:
+    //   - if zero, mask = -1, v = v - 1 + 1 = v
+    //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+    __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+    in0 = _mm_add_epi16(in0, mask);
+    in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+  }
+  // There are 4 total stages, alternating between an add/subtract stage
+  // followed by a multiply-and-add stage.
+  {
+    // Stage 1: Add/subtract
+
+    // in0 = [i0 i1 i2 i3 iC iD iE iF]
+    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
+    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
+    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+    // r1 = [iC i8 iD i9 iE iA iF iB]
+    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+    // r3 = [iC i8 iD i9 iF iB iE iA]
+
+    const __m128i t0 = _mm_add_epi16(r2, r3);
+    const __m128i t1 = _mm_sub_epi16(r2, r3);
+    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+    // t1 = [aC a8 aD a9 aF aB aE aA]
+
+    // Stage 2: multiply by constants (which gets us into 32 bits).
+    // The constants needed here are:
+    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+    // Then add and right-shift to get back to 16-bit range
+    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+    // w0 = [b0 b1 b7 b6]
+    // w1 = [b8 b9 bF bE]
+    // w2 = [b4 b5 b3 b2]
+    // w3 = [bC bD bB bA]
+    const __m128i x0 = _mm_packs_epi32(w0, w1);
+    const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+    overflow = check_epi16_overflow_x2(&x0, &x1);
+    if (overflow) {
+      vpx_highbd_fdct4x4_c(input, output, stride);
+      return;
+    }
+#endif  // DCT_HIGH_BIT_DEPTH
+    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+    // x1 = [b4 b5 b3 b2 bC bD bB bA]
+    in0 = _mm_shuffle_epi32(x0, 0xD8);
+    in1 = _mm_shuffle_epi32(x1, 0x8D);
+    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+    // in1 = [b3 b2 bB bA b4 b5 bC bD]
+  }
+  {
+    // vertical DCTs finished. Now we do the horizontal DCTs.
+    // Stage 3: Add/subtract
+
+    const __m128i t0 = ADD_EPI16(in0, in1);
+    const __m128i t1 = SUB_EPI16(in0, in1);
+    // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
+    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
+#if DCT_HIGH_BIT_DEPTH
+    overflow = check_epi16_overflow_x2(&t0, &t1);
+    if (overflow) {
+      vpx_highbd_fdct4x4_c(input, output, stride);
+      return;
+    }
+#endif  // DCT_HIGH_BIT_DEPTH
+
+    // Stage 4: multiply by constants (which gets us into 32 bits).
+    {
+      // The constants needed here are:
+      // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+      // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+      // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+      // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+      const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+      const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+      // Then add and right-shift to get back to 16-bit range
+      // but this combines the final right-shift as well to save operations
+      // This unusual rounding operation is to maintain bit-accurate
+      // compatibility with the c version of this function which has two
+      // rounding steps in a row.
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+      // w0 = [o0 o4 o8 oC]
+      // w1 = [o2 o6 oA oE]
+      // w2 = [o1 o5 o9 oD]
+      // w3 = [o3 o7 oB oF]
+      // remember the o's are numbered according to the correct output location
+      const __m128i x0 = _mm_packs_epi32(w0, w1);
+      const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+      overflow = check_epi16_overflow_x2(&x0, &x1);
+      if (overflow) {
+        vpx_highbd_fdct4x4_c(input, output, stride);
+        return;
+      }
+#endif  // DCT_HIGH_BIT_DEPTH
+      {
+        // x0 = [o0 o4 o8 oC o2 o6 oA oE]
+        // x1 = [o1 o5 o9 oD o3 o7 oB oF]
+        const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
+        const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
+        // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
+        // y1 = [o2 o3 o6 o7 oA oB oE oF]
+        in0 = _mm_unpacklo_epi32(y0, y1);
+        // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
+        in1 = _mm_unpackhi_epi32(y0, y1);
+        // in1 = [o8 o9 oA oB oC oD oE oF]
+      }
+    }
+  }
+  // Post-condition (v + 1) >> 2 is now incorporated into previous
+  // add and right-shift commands.  Only 2 store instructions needed
+  // because we are using the fact that 1/3 are stored just after 0/2.
+  storeu_output(&in0, output + 0 * 4);
+  storeu_output(&in1, output + 2 * 4);
+}
+
+
+void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+#if DCT_HIGH_BIT_DEPTH
+  int overflow;
+#endif
+  // Load input
+  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  // Pre-condition input (shift by two)
+  in0 = _mm_slli_epi16(in0, 2);
+  in1 = _mm_slli_epi16(in1, 2);
+  in2 = _mm_slli_epi16(in2, 2);
+  in3 = _mm_slli_epi16(in3, 2);
+  in4 = _mm_slli_epi16(in4, 2);
+  in5 = _mm_slli_epi16(in5, 2);
+  in6 = _mm_slli_epi16(in6, 2);
+  in7 = _mm_slli_epi16(in7, 2);
+
+  // We do two passes, first the columns, then the rows. The results of the
+  // first pass are transposed so that the same column code can be reused. The
+  // results of the second pass are also transposed so that the rows (processed
+  // as columns) are put back in row positions.
+  for (pass = 0; pass < 2; pass++) {
+    // To store results of each pass before the transpose.
+    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+    // Add/subtract
+    const __m128i q0 = ADD_EPI16(in0, in7);
+    const __m128i q1 = ADD_EPI16(in1, in6);
+    const __m128i q2 = ADD_EPI16(in2, in5);
+    const __m128i q3 = ADD_EPI16(in3, in4);
+    const __m128i q4 = SUB_EPI16(in3, in4);
+    const __m128i q5 = SUB_EPI16(in2, in5);
+    const __m128i q6 = SUB_EPI16(in1, in6);
+    const __m128i q7 = SUB_EPI16(in0, in7);
+#if DCT_HIGH_BIT_DEPTH
+    if (pass == 1) {
+      overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+                                         &q4, &q5, &q6, &q7);
+      if (overflow) {
+        vpx_highbd_fdct8x8_c(input, output, stride);
+        return;
+      }
+    }
+#endif  // DCT_HIGH_BIT_DEPTH
+    // Work on first four results
+    {
+      // Add/subtract
+      const __m128i r0 = ADD_EPI16(q0, q3);
+      const __m128i r1 = ADD_EPI16(q1, q2);
+      const __m128i r2 = SUB_EPI16(q1, q2);
+      const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+      overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+      if (overflow) {
+        vpx_highbd_fdct8x8_c(input, output, stride);
+        return;
+      }
+#endif  // DCT_HIGH_BIT_DEPTH
+      // Interleave to do the multiply by constants which gets us into 32bits
+      {
+        const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+        const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+        const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+        const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+        const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+        const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+        const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+        const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+        const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+        const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+        const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+        const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+        // dct_const_round_shift
+        const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+        const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+        const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+        const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+        const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+        const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+        const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+        const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+        const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+        const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+        const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+        const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+        const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+        const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+        const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+        const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+        // Combine
+        res0 = _mm_packs_epi32(w0, w1);
+        res4 = _mm_packs_epi32(w2, w3);
+        res2 = _mm_packs_epi32(w4, w5);
+        res6 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
+        if (overflow) {
+          vpx_highbd_fdct8x8_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+    }
+    // Work on next four results
+    {
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+      // dct_const_round_shift
+      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+      // Combine
+      const __m128i r0 = _mm_packs_epi32(s0, s1);
+      const __m128i r1 = _mm_packs_epi32(s2, s3);
+#if DCT_HIGH_BIT_DEPTH
+      overflow = check_epi16_overflow_x2(&r0, &r1);
+      if (overflow) {
+        vpx_highbd_fdct8x8_c(input, output, stride);
+        return;
+      }
+#endif  // DCT_HIGH_BIT_DEPTH
+      {
+        // Add/subtract
+        const __m128i x0 = ADD_EPI16(q4, r0);
+        const __m128i x1 = SUB_EPI16(q4, r0);
+        const __m128i x2 = SUB_EPI16(q7, r1);
+        const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+        if (overflow) {
+          vpx_highbd_fdct8x8_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+        // Interleave to do the multiply by constants which gets us into 32bits
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+          // Combine
+          res1 = _mm_packs_epi32(w0, w1);
+          res7 = _mm_packs_epi32(w2, w3);
+          res5 = _mm_packs_epi32(w4, w5);
+          res3 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
+          if (overflow) {
+            vpx_highbd_fdct8x8_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+      }
+    }
+    // Transpose the 8x8.
+    {
+      // 00 01 02 03 04 05 06 07
+      // 10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27
+      // 30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47
+      // 50 51 52 53 54 55 56 57
+      // 60 61 62 63 64 65 66 67
+      // 70 71 72 73 74 75 76 77
+      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+      // 00 10 01 11 02 12 03 13
+      // 20 30 21 31 22 32 23 33
+      // 04 14 05 15 06 16 07 17
+      // 24 34 25 35 26 36 27 37
+      // 40 50 41 51 42 52 43 53
+      // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+      // 64 74 65 75 66 76 67 77
+      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+      // 00 10 20 30 01 11 21 31
+      // 40 50 60 70 41 51 61 71
+      // 02 12 22 32 03 13 23 33
+      // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+      // 06 16 26 36 07 17 27 37
+      // 46 56 66 76 47 57 67 77
+      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }
+  // Post-condition output and store it
+  {
+    // Post-condition (division by two)
+    //    division by two of 16-bit signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+    in0 = _mm_sub_epi16(in0, sign_in0);
+    in1 = _mm_sub_epi16(in1, sign_in1);
+    in2 = _mm_sub_epi16(in2, sign_in2);
+    in3 = _mm_sub_epi16(in3, sign_in3);
+    in4 = _mm_sub_epi16(in4, sign_in4);
+    in5 = _mm_sub_epi16(in5, sign_in5);
+    in6 = _mm_sub_epi16(in6, sign_in6);
+    in7 = _mm_sub_epi16(in7, sign_in7);
+    in0 = _mm_srai_epi16(in0, 1);
+    in1 = _mm_srai_epi16(in1, 1);
+    in2 = _mm_srai_epi16(in2, 1);
+    in3 = _mm_srai_epi16(in3, 1);
+    in4 = _mm_srai_epi16(in4, 1);
+    in5 = _mm_srai_epi16(in5, 1);
+    in6 = _mm_srai_epi16(in6, 1);
+    in7 = _mm_srai_epi16(in7, 1);
+    // store results
+    store_output(&in0, (output + 0 * 8));
+    store_output(&in1, (output + 1 * 8));
+    store_output(&in2, (output + 2 * 8));
+    store_output(&in3, (output + 3 * 8));
+    store_output(&in4, (output + 4 * 8));
+    store_output(&in5, (output + 5 * 8));
+    store_output(&in6, (output + 6 * 8));
+    store_output(&in7, (output + 7 * 8));
+  }
+}
+
+void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  int pass;
+  // We need an intermediate buffer between passes.
+  DECLARE_ALIGNED(16, int16_t, intermediate[256]);
+  const int16_t *in = input;
+  int16_t *out0 = intermediate;
+  tran_low_t *out1 = output;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kOne = _mm_set1_epi16(1);
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    // We process eight columns (transposed rows in second pass) at a time.
+    int column_start;
+#if DCT_HIGH_BIT_DEPTH
+    int overflow;
+#endif
+    for (column_start = 0; column_start < 16; column_start += 8) {
+      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+      __m128i step1_0, step1_1, step1_2, step1_3;
+      __m128i step1_4, step1_5, step1_6, step1_7;
+      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+      __m128i step3_0, step3_1, step3_2, step3_3;
+      __m128i step3_4, step3_5, step3_6, step3_7;
+      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+      // Load and pre-condition input.
+      if (0 == pass) {
+        in00  = _mm_load_si128((const __m128i *)(in +  0 * stride));
+        in01  = _mm_load_si128((const __m128i *)(in +  1 * stride));
+        in02  = _mm_load_si128((const __m128i *)(in +  2 * stride));
+        in03  = _mm_load_si128((const __m128i *)(in +  3 * stride));
+        in04  = _mm_load_si128((const __m128i *)(in +  4 * stride));
+        in05  = _mm_load_si128((const __m128i *)(in +  5 * stride));
+        in06  = _mm_load_si128((const __m128i *)(in +  6 * stride));
+        in07  = _mm_load_si128((const __m128i *)(in +  7 * stride));
+        in08  = _mm_load_si128((const __m128i *)(in +  8 * stride));
+        in09  = _mm_load_si128((const __m128i *)(in +  9 * stride));
+        in10  = _mm_load_si128((const __m128i *)(in + 10 * stride));
+        in11  = _mm_load_si128((const __m128i *)(in + 11 * stride));
+        in12  = _mm_load_si128((const __m128i *)(in + 12 * stride));
+        in13  = _mm_load_si128((const __m128i *)(in + 13 * stride));
+        in14  = _mm_load_si128((const __m128i *)(in + 14 * stride));
+        in15  = _mm_load_si128((const __m128i *)(in + 15 * stride));
+        // x = x << 2
+        in00 = _mm_slli_epi16(in00, 2);
+        in01 = _mm_slli_epi16(in01, 2);
+        in02 = _mm_slli_epi16(in02, 2);
+        in03 = _mm_slli_epi16(in03, 2);
+        in04 = _mm_slli_epi16(in04, 2);
+        in05 = _mm_slli_epi16(in05, 2);
+        in06 = _mm_slli_epi16(in06, 2);
+        in07 = _mm_slli_epi16(in07, 2);
+        in08 = _mm_slli_epi16(in08, 2);
+        in09 = _mm_slli_epi16(in09, 2);
+        in10 = _mm_slli_epi16(in10, 2);
+        in11 = _mm_slli_epi16(in11, 2);
+        in12 = _mm_slli_epi16(in12, 2);
+        in13 = _mm_slli_epi16(in13, 2);
+        in14 = _mm_slli_epi16(in14, 2);
+        in15 = _mm_slli_epi16(in15, 2);
+      } else {
+        in00  = _mm_load_si128((const __m128i *)(in +  0 * 16));
+        in01  = _mm_load_si128((const __m128i *)(in +  1 * 16));
+        in02  = _mm_load_si128((const __m128i *)(in +  2 * 16));
+        in03  = _mm_load_si128((const __m128i *)(in +  3 * 16));
+        in04  = _mm_load_si128((const __m128i *)(in +  4 * 16));
+        in05  = _mm_load_si128((const __m128i *)(in +  5 * 16));
+        in06  = _mm_load_si128((const __m128i *)(in +  6 * 16));
+        in07  = _mm_load_si128((const __m128i *)(in +  7 * 16));
+        in08  = _mm_load_si128((const __m128i *)(in +  8 * 16));
+        in09  = _mm_load_si128((const __m128i *)(in +  9 * 16));
+        in10  = _mm_load_si128((const __m128i *)(in + 10 * 16));
+        in11  = _mm_load_si128((const __m128i *)(in + 11 * 16));
+        in12  = _mm_load_si128((const __m128i *)(in + 12 * 16));
+        in13  = _mm_load_si128((const __m128i *)(in + 13 * 16));
+        in14  = _mm_load_si128((const __m128i *)(in + 14 * 16));
+        in15  = _mm_load_si128((const __m128i *)(in + 15 * 16));
+        // x = (x + 1) >> 2
+        in00 = _mm_add_epi16(in00, kOne);
+        in01 = _mm_add_epi16(in01, kOne);
+        in02 = _mm_add_epi16(in02, kOne);
+        in03 = _mm_add_epi16(in03, kOne);
+        in04 = _mm_add_epi16(in04, kOne);
+        in05 = _mm_add_epi16(in05, kOne);
+        in06 = _mm_add_epi16(in06, kOne);
+        in07 = _mm_add_epi16(in07, kOne);
+        in08 = _mm_add_epi16(in08, kOne);
+        in09 = _mm_add_epi16(in09, kOne);
+        in10 = _mm_add_epi16(in10, kOne);
+        in11 = _mm_add_epi16(in11, kOne);
+        in12 = _mm_add_epi16(in12, kOne);
+        in13 = _mm_add_epi16(in13, kOne);
+        in14 = _mm_add_epi16(in14, kOne);
+        in15 = _mm_add_epi16(in15, kOne);
+        in00 = _mm_srai_epi16(in00, 2);
+        in01 = _mm_srai_epi16(in01, 2);
+        in02 = _mm_srai_epi16(in02, 2);
+        in03 = _mm_srai_epi16(in03, 2);
+        in04 = _mm_srai_epi16(in04, 2);
+        in05 = _mm_srai_epi16(in05, 2);
+        in06 = _mm_srai_epi16(in06, 2);
+        in07 = _mm_srai_epi16(in07, 2);
+        in08 = _mm_srai_epi16(in08, 2);
+        in09 = _mm_srai_epi16(in09, 2);
+        in10 = _mm_srai_epi16(in10, 2);
+        in11 = _mm_srai_epi16(in11, 2);
+        in12 = _mm_srai_epi16(in12, 2);
+        in13 = _mm_srai_epi16(in13, 2);
+        in14 = _mm_srai_epi16(in14, 2);
+        in15 = _mm_srai_epi16(in15, 2);
+      }
+      in += 8;
+      // Calculate input for the first 8 results.
+      {
+        input0 = ADD_EPI16(in00, in15);
+        input1 = ADD_EPI16(in01, in14);
+        input2 = ADD_EPI16(in02, in13);
+        input3 = ADD_EPI16(in03, in12);
+        input4 = ADD_EPI16(in04, in11);
+        input5 = ADD_EPI16(in05, in10);
+        input6 = ADD_EPI16(in06, in09);
+        input7 = ADD_EPI16(in07, in08);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
+                                           &input4, &input5, &input6, &input7);
+        if (overflow) {
+          vpx_highbd_fdct16x16_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Calculate input for the next 8 results.
+      {
+        step1_0 = SUB_EPI16(in07, in08);
+        step1_1 = SUB_EPI16(in06, in09);
+        step1_2 = SUB_EPI16(in05, in10);
+        step1_3 = SUB_EPI16(in04, in11);
+        step1_4 = SUB_EPI16(in03, in12);
+        step1_5 = SUB_EPI16(in02, in13);
+        step1_6 = SUB_EPI16(in01, in14);
+        step1_7 = SUB_EPI16(in00, in15);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+                                           &step1_2, &step1_3,
+                                           &step1_4, &step1_5,
+                                           &step1_6, &step1_7);
+        if (overflow) {
+          vpx_highbd_fdct16x16_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+      }
+      // Work on the first eight values; fdct8(input, even_results);
+      {
+        // Add/subtract
+        const __m128i q0 = ADD_EPI16(input0, input7);
+        const __m128i q1 = ADD_EPI16(input1, input6);
+        const __m128i q2 = ADD_EPI16(input2, input5);
+        const __m128i q3 = ADD_EPI16(input3, input4);
+        const __m128i q4 = SUB_EPI16(input3, input4);
+        const __m128i q5 = SUB_EPI16(input2, input5);
+        const __m128i q6 = SUB_EPI16(input1, input6);
+        const __m128i q7 = SUB_EPI16(input0, input7);
+#if DCT_HIGH_BIT_DEPTH
+        overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+                                           &q4, &q5, &q6, &q7);
+        if (overflow) {
+          vpx_highbd_fdct16x16_c(input, output, stride);
+          return;
+        }
+#endif  // DCT_HIGH_BIT_DEPTH
+        // Work on first four results
+        {
+          // Add/subtract
+          const __m128i r0 = ADD_EPI16(q0, q3);
+          const __m128i r1 = ADD_EPI16(q1, q2);
+          const __m128i r2 = SUB_EPI16(q1, q2);
+          const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          {
+            const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+            const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+            const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+            const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+            res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+            res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+            res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+            res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+            overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
+            if (overflow) {
+              vpx_highbd_fdct16x16_c(input, output, stride);
+              return;
+            }
+#endif  // DCT_HIGH_BIT_DEPTH
+          }
+        }
+        // Work on next four results
+        {
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+          const __m128i r0 = mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
+                                              &k__DCT_CONST_ROUNDING,
+                                              DCT_CONST_BITS);
+          const __m128i r1 = mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
+                                              &k__DCT_CONST_ROUNDING,
+                                              DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x2(&r0, &r1);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+          {
+            // Add/subtract
+            const __m128i x0 = ADD_EPI16(q4, r0);
+            const __m128i x1 = SUB_EPI16(q4, r0);
+            const __m128i x2 = SUB_EPI16(q7, r1);
+            const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+            overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+            if (overflow) {
+              vpx_highbd_fdct16x16_c(input, output, stride);
+              return;
+            }
+#endif  // DCT_HIGH_BIT_DEPTH
+            // Interleave to do the multiply by constants which gets us
+            // into 32 bits.
+            {
+              const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+              const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+              const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+              const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+              res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+              res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+              res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+              res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+              overflow = check_epi16_overflow_x4(&res02, &res14,
+                                                 &res10, &res06);
+              if (overflow) {
+                vpx_highbd_fdct16x16_c(input, output, stride);
+                return;
+              }
+#endif  // DCT_HIGH_BIT_DEPTH
+            }
+          }
+        }
+      }
+      // Work on the next eight values; step1 -> odd_results
+      {
+        // step 2
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+          step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5,
+                                             &step2_4);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        // step 3
+        {
+          step3_0 = ADD_EPI16(step1_0, step2_3);
+          step3_1 = ADD_EPI16(step1_1, step2_2);
+          step3_2 = SUB_EPI16(step1_1, step2_2);
+          step3_3 = SUB_EPI16(step1_0, step2_3);
+          step3_4 = SUB_EPI16(step1_7, step2_4);
+          step3_5 = SUB_EPI16(step1_6, step2_5);
+          step3_6 = ADD_EPI16(step1_6, step2_5);
+          step3_7 = ADD_EPI16(step1_7, step2_4);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step3_0, &step3_1,
+                                             &step3_2, &step3_3,
+                                             &step3_4, &step3_5,
+                                             &step3_6, &step3_7);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        // step 4
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+          step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6,
+                                             &step2_5);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        // step 5
+        {
+          step1_0 = ADD_EPI16(step3_0, step2_1);
+          step1_1 = SUB_EPI16(step3_0, step2_1);
+          step1_2 = ADD_EPI16(step3_3, step2_2);
+          step1_3 = SUB_EPI16(step3_3, step2_2);
+          step1_4 = SUB_EPI16(step3_4, step2_5);
+          step1_5 = ADD_EPI16(step3_4, step2_5);
+          step1_6 = SUB_EPI16(step3_7, step2_6);
+          step1_7 = ADD_EPI16(step3_7, step2_6);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+                                             &step1_2, &step1_3,
+                                             &step1_4, &step1_5,
+                                             &step1_6, &step1_7);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        // step 6
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+          res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+          res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+          overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
+          if (overflow) {
+            vpx_highbd_fdct16x16_c(input, output, stride);
+            return;
+          }
+#endif  // DCT_HIGH_BIT_DEPTH
+        }
+      }
+      // Transpose the results, do it as two 8x8 transposes.
+      transpose_and_output8x8(&res00, &res01, &res02, &res03,
+                              &res04, &res05, &res06, &res07,
+                              pass, out0, out1);
+      transpose_and_output8x8(&res08, &res09, &res10, &res11,
+                              &res12, &res13, &res14, &res15,
+                              pass, out0 + 8, out1 + 8);
+      if (pass == 0) {
+        out0 += 8*16;
+      } else {
+        out1 += 8*16;
+      }
+    }
+    // Setup in/out for next pass.
+    in = intermediate;
+  }
+}
+
+#undef ADD_EPI16  // Used by the transform code above; do not leak it further.
+#undef SUB_EPI16  // Likewise.
diff --git a/libs/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c b/libs/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c
new file mode 100644
index 0000000000..bca72e8749
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/fwd_txfm_sse2.c
@@ -0,0 +1,271 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+
+void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+  // Adds up all 16 samples of the 4x4 block at input (rows are stride
+  // elements apart), doubles the total and passes it to store_output().
+  const __m128i kZero = _mm_setzero_si128();
+  // A 64-bit load fetches one row of four 16-bit samples.
+  const __m128i row0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  const __m128i row1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+  const __m128i row2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+  const __m128i row3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+  // Pack two rows per register (0|3 and 1|2) and add them lane by lane.
+  const __m128i rows03 = _mm_unpacklo_epi64(row0, row3);
+  const __m128i rows12 = _mm_unpacklo_epi64(row1, row2);
+  const __m128i sum16 = _mm_add_epi16(rows03, rows12);
+  // Sign-extend the eight 16-bit sums to 32 bits: place each one in the upper
+  // half of a 32-bit lane, then shift right arithmetically by 16.
+  const __m128i lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(kZero, sum16), 16);
+  const __m128i hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(kZero, sum16), 16);
+  // Reduce horizontally so that 32-bit lane 0 ends up holding the full total.
+  const __m128i quad = _mm_add_epi32(lo32, hi32);
+  const __m128i spread_lo = _mm_unpacklo_epi32(quad, kZero);
+  const __m128i spread_hi = _mm_unpackhi_epi32(quad, kZero);
+  const __m128i pair = _mm_add_epi32(spread_lo, spread_hi);
+  const __m128i total = _mm_add_epi32(pair, _mm_srli_si128(pair, 8));
+  // Final scaling: shift every 32-bit lane left by one bit (total * 2).
+  __m128i scaled = _mm_slli_epi32(total, 1);
+
+  store_output(&scaled, output);
+}
+
+void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+  // Adds up all 64 samples of the 8x8 block at input (rows are stride
+  // elements apart) and passes the unscaled total to store_output().
+  const __m128i kZero = _mm_setzero_si128();
+
+  // One aligned 128-bit load per row of eight 16-bit samples.
+  const __m128i row0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  const __m128i row1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  const __m128i row2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  const __m128i row3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  const __m128i row4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  const __m128i row5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  const __m128i row6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  const __m128i row7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+  // Column-wise sums in 16-bit lanes; _mm_add_epi16 wraps, so the grouping of
+  // the additions does not change the lane values.
+  const __m128i rows01 = _mm_add_epi16(row0, row1);
+  const __m128i rows23 = _mm_add_epi16(row2, row3);
+  const __m128i rows45 = _mm_add_epi16(row4, row5);
+  const __m128i rows67 = _mm_add_epi16(row6, row7);
+  const __m128i top = _mm_add_epi16(rows01, rows23);
+  const __m128i bottom = _mm_add_epi16(rows45, rows67);
+  const __m128i sum16 = _mm_add_epi16(top, bottom);
+
+  // Sign-extend the eight 16-bit sums to 32 bits: place each one in the upper
+  // half of a 32-bit lane, then shift right arithmetically by 16.
+  const __m128i lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(kZero, sum16), 16);
+  const __m128i hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(kZero, sum16), 16);
+
+  // Reduce horizontally so that 32-bit lane 0 ends up holding the full total.
+  const __m128i quad = _mm_add_epi32(lo32, hi32);
+  const __m128i spread_lo = _mm_unpacklo_epi32(quad, kZero);
+  const __m128i spread_hi = _mm_unpackhi_epi32(quad, kZero);
+  const __m128i pair = _mm_add_epi32(spread_lo, spread_hi);
+  __m128i total = _mm_add_epi32(pair, _mm_srli_si128(pair, 8));
+
+  store_output(&total, output);
+}
+
+void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  // Adds up all 256 samples of the 16x16 block at input (rows are stride
+  // elements apart), arithmetically shifts the total right by one bit and
+  // passes the result to store_output().
+  const __m128i kZero = _mm_setzero_si128();
+  __m128i acc16 = kZero;
+  __m128i lo32, hi32, quad, spread_lo, spread_hi, pair, total;
+  int half;
+
+  // A row is 16 samples wide, i.e. two 128-bit registers: walk the block as a
+  // left and a right half of eight columns each.
+  for (half = 0; half < 2; ++half) {
+    const int16_t *const src = input + 8 * half;
+
+    const __m128i r00 = _mm_load_si128((const __m128i *)(src +  0 * stride));
+    const __m128i r01 = _mm_load_si128((const __m128i *)(src +  1 * stride));
+    const __m128i r02 = _mm_load_si128((const __m128i *)(src +  2 * stride));
+    const __m128i r03 = _mm_load_si128((const __m128i *)(src +  3 * stride));
+    const __m128i r04 = _mm_load_si128((const __m128i *)(src +  4 * stride));
+    const __m128i r05 = _mm_load_si128((const __m128i *)(src +  5 * stride));
+    const __m128i r06 = _mm_load_si128((const __m128i *)(src +  6 * stride));
+    const __m128i r07 = _mm_load_si128((const __m128i *)(src +  7 * stride));
+    const __m128i r08 = _mm_load_si128((const __m128i *)(src +  8 * stride));
+    const __m128i r09 = _mm_load_si128((const __m128i *)(src +  9 * stride));
+    const __m128i r10 = _mm_load_si128((const __m128i *)(src + 10 * stride));
+    const __m128i r11 = _mm_load_si128((const __m128i *)(src + 11 * stride));
+    const __m128i r12 = _mm_load_si128((const __m128i *)(src + 12 * stride));
+    const __m128i r13 = _mm_load_si128((const __m128i *)(src + 13 * stride));
+    const __m128i r14 = _mm_load_si128((const __m128i *)(src + 14 * stride));
+    const __m128i r15 = _mm_load_si128((const __m128i *)(src + 15 * stride));
+
+    // Column-wise sums in 16-bit lanes; _mm_add_epi16 wraps, so the grouping
+    // of the additions does not change the lane values.
+    const __m128i p0 = _mm_add_epi16(r00, r01);
+    const __m128i p1 = _mm_add_epi16(r02, r03);
+    const __m128i p2 = _mm_add_epi16(r04, r05);
+    const __m128i p3 = _mm_add_epi16(r06, r07);
+    const __m128i p4 = _mm_add_epi16(r08, r09);
+    const __m128i p5 = _mm_add_epi16(r10, r11);
+    const __m128i p6 = _mm_add_epi16(r12, r13);
+    const __m128i p7 = _mm_add_epi16(r14, r15);
+    const __m128i q0 = _mm_add_epi16(p0, p1);
+    const __m128i q1 = _mm_add_epi16(p2, p3);
+    const __m128i q2 = _mm_add_epi16(p4, p5);
+    const __m128i q3 = _mm_add_epi16(p6, p7);
+    const __m128i h0 = _mm_add_epi16(q0, q1);
+    const __m128i h1 = _mm_add_epi16(q2, q3);
+
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(h0, h1));
+  }
+
+  // Sign-extend the eight 16-bit sums to 32 bits: place each one in the upper
+  // half of a 32-bit lane, then shift right arithmetically by 16.
+  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(kZero, acc16), 16);
+  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(kZero, acc16), 16);
+
+  // Reduce horizontally so that 32-bit lane 0 ends up holding the full total.
+  quad = _mm_add_epi32(lo32, hi32);
+  spread_lo = _mm_unpacklo_epi32(quad, kZero);
+  spread_hi = _mm_unpackhi_epi32(quad, kZero);
+  pair = _mm_add_epi32(spread_lo, spread_hi);
+  total = _mm_add_epi32(pair, _mm_srli_si128(pair, 8));
+
+  // Final scaling: arithmetic shift right by one bit.
+  total = _mm_srai_epi32(total, 1);
+  store_output(&total, output);
+}
+
+void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  // Adds up all 1024 samples of the 32x32 block at input (rows are stride
+  // elements apart), arithmetically shifts the total right by three bits and
+  // passes the result to store_output().
+  const __m128i kZero = _mm_setzero_si128();
+  __m128i acc16 = kZero;
+  __m128i lo32, hi32, quad, spread_lo, spread_hi, pair, total;
+  int band;
+
+  // A row is 32 samples wide, i.e. four 128-bit registers; every iteration
+  // consumes a band of four consecutive rows.
+  for (band = 0; band < 8; ++band) {
+    const int16_t *const row0 = input + (4 * band + 0) * stride;
+    const int16_t *const row1 = input + (4 * band + 1) * stride;
+    const int16_t *const row2 = input + (4 * band + 2) * stride;
+    const int16_t *const row3 = input + (4 * band + 3) * stride;
+
+    const __m128i a0 = _mm_load_si128((const __m128i *)(row0 +  0));
+    const __m128i a1 = _mm_load_si128((const __m128i *)(row0 +  8));
+    const __m128i a2 = _mm_load_si128((const __m128i *)(row0 + 16));
+    const __m128i a3 = _mm_load_si128((const __m128i *)(row0 + 24));
+    const __m128i b0 = _mm_load_si128((const __m128i *)(row1 +  0));
+    const __m128i b1 = _mm_load_si128((const __m128i *)(row1 +  8));
+    const __m128i b2 = _mm_load_si128((const __m128i *)(row1 + 16));
+    const __m128i b3 = _mm_load_si128((const __m128i *)(row1 + 24));
+    const __m128i c0 = _mm_load_si128((const __m128i *)(row2 +  0));
+    const __m128i c1 = _mm_load_si128((const __m128i *)(row2 +  8));
+    const __m128i c2 = _mm_load_si128((const __m128i *)(row2 + 16));
+    const __m128i c3 = _mm_load_si128((const __m128i *)(row2 + 24));
+    const __m128i d0 = _mm_load_si128((const __m128i *)(row3 +  0));
+    const __m128i d1 = _mm_load_si128((const __m128i *)(row3 +  8));
+    const __m128i d2 = _mm_load_si128((const __m128i *)(row3 + 16));
+    const __m128i d3 = _mm_load_si128((const __m128i *)(row3 + 24));
+
+    // Lane-wise sums in 16-bit lanes; _mm_add_epi16 wraps, so the grouping
+    // of the additions does not change the lane values.
+    const __m128i p0 = _mm_add_epi16(a0, a1);
+    const __m128i p1 = _mm_add_epi16(a2, a3);
+    const __m128i p2 = _mm_add_epi16(b0, b1);
+    const __m128i p3 = _mm_add_epi16(b2, b3);
+    const __m128i p4 = _mm_add_epi16(c0, c1);
+    const __m128i p5 = _mm_add_epi16(c2, c3);
+    const __m128i p6 = _mm_add_epi16(d0, d1);
+    const __m128i p7 = _mm_add_epi16(d2, d3);
+    const __m128i q0 = _mm_add_epi16(p0, p1);
+    const __m128i q1 = _mm_add_epi16(p2, p3);
+    const __m128i q2 = _mm_add_epi16(p4, p5);
+    const __m128i q3 = _mm_add_epi16(p6, p7);
+    const __m128i h0 = _mm_add_epi16(q0, q1);
+    const __m128i h1 = _mm_add_epi16(q2, q3);
+
+    acc16 = _mm_add_epi16(acc16, _mm_add_epi16(h0, h1));
+  }
+
+  // Sign-extend the eight 16-bit sums to 32 bits: place each one in the upper
+  // half of a 32-bit lane, then shift right arithmetically by 16.
+  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(kZero, acc16), 16);
+  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(kZero, acc16), 16);
+
+  // Reduce horizontally so that 32-bit lane 0 ends up holding the full total.
+  quad = _mm_add_epi32(lo32, hi32);
+  spread_lo = _mm_unpacklo_epi32(quad, kZero);
+  spread_hi = _mm_unpackhi_epi32(quad, kZero);
+  pair = _mm_add_epi32(spread_lo, spread_hi);
+  total = _mm_add_epi32(pair, _mm_srli_si128(pair, 8));
+
+  // Final scaling: arithmetic shift right by three bits.
+  total = _mm_srai_epi32(total, 3);
+  store_output(&total, output);
+}
+
+#define DCT_HIGH_BIT_DEPTH 0  // Non-highbd settings; the highbd set is below.
+#define FDCT4x4_2D vpx_fdct4x4_sse2
+#define FDCT8x8_2D vpx_fdct8x8_sse2
+#define FDCT16x16_2D vpx_fdct16x16_sse2
+#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
+#undef  FDCT4x4_2D
+#undef  FDCT8x8_2D
+#undef  FDCT16x16_2D
+// NOTE(review): the impl headers presumably name their functions via FDCT*_2D.
+#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
+#undef  FDCT32x32_2D
+#undef  FDCT32x32_HIGH_PRECISION
+// Same 32x32 header again, now with FDCT32x32_HIGH_PRECISION set to 1.
+#define FDCT32x32_2D vpx_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
+#undef  FDCT32x32_2D
+#undef  FDCT32x32_HIGH_PRECISION
+#undef  DCT_HIGH_BIT_DEPTH
+// Repeat the three inclusions with DCT_HIGH_BIT_DEPTH 1 and vpx_highbd_ names.
+#if CONFIG_VP9_HIGHBITDEPTH
+#define DCT_HIGH_BIT_DEPTH 1
+#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
+#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
+#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
+#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
+#undef  FDCT4x4_2D
+#undef  FDCT8x8_2D
+#undef  FDCT16x16_2D
+// 32x32 header with FDCT32x32_HIGH_PRECISION 0 (the _rd name).
+#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef  FDCT32x32_2D
+#undef  FDCT32x32_HIGH_PRECISION
+// 32x32 header with FDCT32x32_HIGH_PRECISION 1.
+#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
+#undef  FDCT32x32_2D
+#undef  FDCT32x32_HIGH_PRECISION
+#undef  DCT_HIGH_BIT_DEPTH
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h b/libs/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h
new file mode 100644
index 0000000000..94d5befbfe
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h
@@ -0,0 +1,454 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_FWD_TXFM_SSE2_H_
+#define VPX_DSP_X86_FWD_TXFM_SSE2_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define pair_set_epi32(a, b) \
+  _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))  // dwords low->high: a, b, a, b
+
+static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
+  // Adds the two unsigned 32x32->64 products found in each 64-bit half.
+  const __m128i even_prod = _mm_mul_epu32(a, b);  // dwords 0 and 2
+  const __m128i a_odd = _mm_srli_epi64(a, 32);  // dwords 1 and 3 moved down
+  const __m128i b_odd = _mm_srli_epi64(b, 32);
+  const __m128i odd_prod = _mm_mul_epu32(a_odd, b_odd);
+  return _mm_add_epi64(even_prod, odd_prod);
+}
+
+static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
+  // Keeps the low dword of each 64-bit element: result dwords are a0, a2, b0, b2.
+  return _mm_unpacklo_epi64(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)),
+                            _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)));
+}
+
+static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
+                                          const __m128i *preg1) {
+  // Nonzero iff a 16-bit lane of *preg0 or *preg1 equals 0x7fff or 0x8000.
+  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+  const __m128i min_overflow = _mm_set1_epi16((short)0x8000);  // explicit narrowing
+  const __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+                                    _mm_cmpeq_epi16(*preg0, min_overflow));
+  const __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+                                    _mm_cmpeq_epi16(*preg1, min_overflow));
+  return _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
+}
+
+static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
+                                          const __m128i *preg1,
+                                          const __m128i *preg2,
+                                          const __m128i *preg3) {
+  const __m128i max_overflow = _mm_set1_epi16(0x7fff);  // largest 16-bit value
+  const __m128i min_overflow = _mm_set1_epi16((short)0x8000);  // explicit narrowing
+  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+                              _mm_cmpeq_epi16(*preg0, min_overflow));
+  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+                              _mm_cmpeq_epi16(*preg1, min_overflow));
+  __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
+                              _mm_cmpeq_epi16(*preg2, min_overflow));
+  __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
+                              _mm_cmpeq_epi16(*preg3, min_overflow));
+  cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
+  return _mm_movemask_epi8(cmp0);  // nonzero iff any lane equals either limit
+}
+
+static INLINE int check_epi16_overflow_x8(const __m128i *preg0,
+                                          const __m128i *preg1,
+                                          const __m128i *preg2,
+                                          const __m128i *preg3,
+                                          const __m128i *preg4,
+                                          const __m128i *preg5,
+                                          const __m128i *preg6,
+                                          const __m128i *preg7) {
+  // Both groups of four are always examined; the sum of their masks is returned.
+  const int first_half = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  const int second_half = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+  return first_half + second_half;
+}
+
+static INLINE int check_epi16_overflow_x12(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *preg8,
+                                           const __m128i *preg9,
+                                           const __m128i *preg10,
+                                           const __m128i *preg11) {
+  const int group_0_3 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  const int group_4_7 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+  // Registers 8..11 are examined only when registers 0..3 reported nothing.
+  const int first_or_last = group_0_3 ? group_0_3 :
+      check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+  return first_or_last + group_4_7;
+}
+
+static INLINE int check_epi16_overflow_x16(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *preg8,
+                                           const __m128i *preg9,
+                                           const __m128i *preg10,
+                                           const __m128i *preg11,
+                                           const __m128i *preg12,
+                                           const __m128i *preg13,
+                                           const __m128i *preg14,
+                                           const __m128i *preg15) {
+  int acc_a = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  int acc_b = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+  // Early exit: a hit in registers 0..3 skips registers 8..15 entirely.
+  if (acc_a) return acc_a + acc_b;
+  acc_a = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+  // Registers 12..15 are skipped once acc_b already holds a hit.
+  if (!acc_b)
+    acc_b = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+  return acc_a + acc_b;
+}
+
+static INLINE int check_epi16_overflow_x32(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *preg8,
+                                           const __m128i *preg9,
+                                           const __m128i *preg10,
+                                           const __m128i *preg11,
+                                           const __m128i *preg12,
+                                           const __m128i *preg13,
+                                           const __m128i *preg14,
+                                           const __m128i *preg15,
+                                           const __m128i *preg16,
+                                           const __m128i *preg17,
+                                           const __m128i *preg18,
+                                           const __m128i *preg19,
+                                           const __m128i *preg20,
+                                           const __m128i *preg21,
+                                           const __m128i *preg22,
+                                           const __m128i *preg23,
+                                           const __m128i *preg24,
+                                           const __m128i *preg25,
+                                           const __m128i *preg26,
+                                           const __m128i *preg27,
+                                           const __m128i *preg28,
+                                           const __m128i *preg29,
+                                           const __m128i *preg30,
+                                           const __m128i *preg31) {
+  int res0, res1;  // two accumulators, fed alternately by groups of four
+  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+  if (!res0) {  // each deeper level runs only while the tested accumulator is 0
+    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+    if (!res1) {
+      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+      if (!res0) {
+        res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
+        if (!res1) {
+          res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
+          if (!res0) {
+            res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
+            if (!res1)
+              res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
+          }
+        }
+      }
+    }
+  }
+  return res0 + res1;  // groups are skipped only after a hit, so 0 means none found
+}
+
+static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *zero) {
+  __m128i minus_one = _mm_set1_epi32(-1);
+  // Each input holds two 64-bit values; flag any whose bits 31..62 are mixed.
+  __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);  // bits 31..62 -> upper dword
+  __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
+  __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
+  __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
+  __m128i reg0_top_dwords = _mm_shuffle_epi32(
+      reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));  // upper dwords into dwords 0 and 1
+  __m128i reg1_top_dwords = _mm_shuffle_epi32(
+      reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+  __m128i reg2_top_dwords = _mm_shuffle_epi32(
+      reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+  __m128i reg3_top_dwords = _mm_shuffle_epi32(
+      reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+  __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
+  __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
+  __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);  // all 0 bits
+  __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
+  __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);  // all 1s
+  __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
+  int overflow_01 = _mm_movemask_epi8(
+      _mm_cmpeq_epi32(valid_positve_01, valid_negative_01));  // equal => both false
+  int overflow_23 = _mm_movemask_epi8(
+      _mm_cmpeq_epi32(valid_positve_23, valid_negative_23));
+  return (overflow_01 + overflow_23);  // nonzero iff some value failed both tests
+}
+
+static INLINE int k_check_epi32_overflow_8(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *zero) {
+  // The second group of four is examined only when the first group is clean;
+  // a nonzero result from the first group is returned unchanged.
+  const int first = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+  if (first) return first;
+  return k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+}
+
+static INLINE int k_check_epi32_overflow_16(const __m128i *preg0,
+                                            const __m128i *preg1,
+                                            const __m128i *preg2,
+                                            const __m128i *preg3,
+                                            const __m128i *preg4,
+                                            const __m128i *preg5,
+                                            const __m128i *preg6,
+                                            const __m128i *preg7,
+                                            const __m128i *preg8,
+                                            const __m128i *preg9,
+                                            const __m128i *preg10,
+                                            const __m128i *preg11,
+                                            const __m128i *preg12,
+                                            const __m128i *preg13,
+                                            const __m128i *preg14,
+                                            const __m128i *preg15,
+                                            const __m128i *zero) {
+  int overflow;
+  // Guard-clause form: every group is examined only when all earlier groups
+  // returned zero, and the first nonzero result is handed straight back.
+  overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+  if (overflow) return overflow;
+  overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+  if (overflow) return overflow;
+  overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11,
+                                      zero);
+  if (overflow) return overflow;
+  // Last group: its result is the answer whether or not it is zero.
+  return k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
+                                  zero);
+}
+
+static INLINE int k_check_epi32_overflow_32(const __m128i *preg0,
+                                            const __m128i *preg1,
+                                            const __m128i *preg2,
+                                            const __m128i *preg3,
+                                            const __m128i *preg4,
+                                            const __m128i *preg5,
+                                            const __m128i *preg6,
+                                            const __m128i *preg7,
+                                            const __m128i *preg8,
+                                            const __m128i *preg9,
+                                            const __m128i *preg10,
+                                            const __m128i *preg11,
+                                            const __m128i *preg12,
+                                            const __m128i *preg13,
+                                            const __m128i *preg14,
+                                            const __m128i *preg15,
+                                            const __m128i *preg16,
+                                            const __m128i *preg17,
+                                            const __m128i *preg18,
+                                            const __m128i *preg19,
+                                            const __m128i *preg20,
+                                            const __m128i *preg21,
+                                            const __m128i *preg22,
+                                            const __m128i *preg23,
+                                            const __m128i *preg24,
+                                            const __m128i *preg25,
+                                            const __m128i *preg26,
+                                            const __m128i *preg27,
+                                            const __m128i *preg28,
+                                            const __m128i *preg29,
+                                            const __m128i *preg30,
+                                            const __m128i *preg31,
+                                            const __m128i *zero) {
+  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+  if (!overflow) {  // each later group of four runs only if all earlier ones gave 0
+    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+    if (!overflow) {
+      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
+      if (!overflow) {
+        overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
+                                            zero);
+        if (!overflow) {
+          overflow = k_check_epi32_overflow_4(preg16, preg17, preg18, preg19,
+                                              zero);
+          if (!overflow) {
+            overflow = k_check_epi32_overflow_4(preg20, preg21,
+                                                preg22, preg23, zero);
+            if (!overflow) {
+              overflow = k_check_epi32_overflow_4(preg24, preg25,
+                                                  preg26, preg27, zero);
+              if (!overflow) {
+                overflow = k_check_epi32_overflow_4(preg28, preg29,
+                                                    preg30, preg31, zero);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return overflow;  // result of the first group that reported nonzero, else 0
+}
+
+static INLINE void store_output(const __m128i *poutput, tran_low_t* dst_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // Sign-extend the eight 16-bit lanes to 32 bits; two aligned 16-byte stores.
+  const __m128i neg_mask = _mm_cmplt_epi16(*poutput, _mm_setzero_si128());
+  const __m128i lo32 = _mm_unpacklo_epi16(*poutput, neg_mask);
+  const __m128i hi32 = _mm_unpackhi_epi16(*poutput, neg_mask);
+  _mm_store_si128((__m128i *)(dst_ptr), lo32);
+  _mm_store_si128((__m128i *)(dst_ptr + 4), hi32);
+#else
+  _mm_store_si128((__m128i *)(dst_ptr), *poutput);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t* dst_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // Sign-extend the eight 16-bit lanes to 32 bits; two unaligned 16-byte stores.
+  const __m128i neg_mask = _mm_cmplt_epi16(*poutput, _mm_setzero_si128());
+  const __m128i lo32 = _mm_unpacklo_epi16(*poutput, neg_mask);
+  const __m128i hi32 = _mm_unpackhi_epi16(*poutput, neg_mask);
+  _mm_storeu_si128((__m128i *)(dst_ptr), lo32);
+  _mm_storeu_si128((__m128i *)(dst_ptr + 4), hi32);
+#else
+  _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+
+static INLINE __m128i mult_round_shift(const __m128i *pin0,
+                                       const __m128i *pin1,
+                                       const __m128i *pmultiplier,
+                                       const __m128i *prounding,
+                                       const int shift) {
+  // Results from *pin0 fill the low half of the return value, *pin1 the high.
+  __m128i lo = _mm_madd_epi16(*pin0, *pmultiplier);
+  __m128i hi = _mm_madd_epi16(*pin1, *pmultiplier);
+  lo = _mm_srai_epi32(_mm_add_epi32(lo, *prounding), shift);
+  hi = _mm_srai_epi32(_mm_add_epi32(hi, *prounding), shift);
+  // Narrow the 32-bit results back to 16 bits with signed saturation.
+  return _mm_packs_epi32(lo, hi);
+}
+
+static INLINE void transpose_and_output8x8(
+    const __m128i *pin00, const __m128i *pin01,
+    const __m128i *pin02, const __m128i *pin03,
+    const __m128i *pin04, const __m128i *pin05,
+    const __m128i *pin06, const __m128i *pin07,
+    const int pass, int16_t* out0_ptr,
+    tran_low_t* out1_ptr) {
+  // 00 01 02 03 04 05 06 07  <- *pin00 (each entry: input row, then column)
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  // 40 41 42 43 44 45 46 47
+  // 50 51 52 53 54 55 56 57
+  // 60 61 62 63 64 65 66 67
+  // 70 71 72 73 74 75 76 77  <- *pin07
+  const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
+  // 00 10 01 11 02 12 03 13  (tr0_0)
+  // 20 30 21 31 22 32 23 33  (tr0_1)
+  // 04 14 05 15 06 16 07 17  (tr0_2)
+  // 24 34 25 35 26 36 27 37  (tr0_3)
+  // 40 50 41 51 42 52 43 53  (tr0_4)
+  // 60 70 61 71 62 72 63 73  (tr0_5)
+  // 44 54 45 55 46 56 47 57  (tr0_6)
+  // 64 74 65 75 66 76 67 77  (tr0_7)
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+  // 00 10 20 30 01 11 21 31  (tr1_0)
+  // 40 50 60 70 41 51 61 71  (tr1_4)
+  // 02 12 22 32 03 13 23 33  (tr1_2)
+  // 42 52 62 72 43 53 63 73  (tr1_6)
+  // 04 14 24 34 05 15 25 35  (tr1_1)
+  // 44 54 64 74 45 55 65 75  (tr1_5)
+  // 06 16 26 36 07 17 27 37  (tr1_3)
+  // 46 56 66 76 47 57 67 77  (tr1_7)
+  const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+  const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+  const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+  const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+  const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+  const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+  const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+  const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+  // 00 10 20 30 40 50 60 70  (tr2_0)
+  // 01 11 21 31 41 51 61 71
+  // 02 12 22 32 42 52 62 72
+  // 03 13 23 33 43 53 63 73
+  // 04 14 24 34 44 54 64 74
+  // 05 15 25 35 45 55 65 75
+  // 06 16 26 36 46 56 66 76
+  // 07 17 27 37 47 57 67 77  (tr2_7)
+  if (pass == 0) {  // pass 0: raw 16-bit rows to out0_ptr, 16 elements apart
+    _mm_storeu_si128((__m128i*)(out0_ptr + 0 * 16), tr2_0);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 1 * 16), tr2_1);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 2 * 16), tr2_2);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 3 * 16), tr2_3);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 4 * 16), tr2_4);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 5 * 16), tr2_5);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6);
+    _mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7);
+  } else {  // any other pass: rows go through storeu_output() to out1_ptr
+    storeu_output(&tr2_0, (out1_ptr + 0 * 16));
+    storeu_output(&tr2_1, (out1_ptr + 1 * 16));
+    storeu_output(&tr2_2, (out1_ptr + 2 * 16));
+    storeu_output(&tr2_3, (out1_ptr + 3 * 16));
+    storeu_output(&tr2_4, (out1_ptr + 4 * 16));
+    storeu_output(&tr2_5, (out1_ptr + 5 * 16));
+    storeu_output(&tr2_6, (out1_ptr + 6 * 16));
+    storeu_output(&tr2_7, (out1_ptr + 7 * 16));
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/libs/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/libs/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
new file mode 100644
index 0000000000..78a1dbb24f
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -0,0 +1,183 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides SSSE3 version of the forward transformation. Part
+; of the macro definitions are originally derived from the ffmpeg project.
+; The current version applies to x86 64-bit only.
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170    ; 8 words of 2 * 11585; loaded into m12 for pmulhrsw
+pd_8192:    times 4 dd 8192     ; 4 dwords of 2^13; loaded into m8 as the rounding term
+
+%macro TRANSFORM_COEFFS 2       ; a, b: emits 8-word tables of (a, b) and (b, -a) pairs
+pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
+pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
+%endmacro
+
+TRANSFORM_COEFFS 11585,  11585  ; pw_11585_11585, pw_11585_m11585
+TRANSFORM_COEFFS 15137,   6270  ; pw_15137_6270,  pw_6270_m15137
+TRANSFORM_COEFFS 16069,   3196  ; pw_16069_3196,  pw_3196_m16069
+TRANSFORM_COEFFS  9102,  13623  ; pw_9102_13623,  pw_13623_m9102
+
+SECTION .text
+
+%if ARCH_X86_64                 ; the rest of this file, up to the final endif, is 64-bit only
+%macro SUM_SUB 3                ; a, b, tmp: a becomes a + b, b becomes a - b (word lanes)
+  psubw  m%3, m%1, m%2          ; tmp = a - b
+  paddw  m%1, m%2               ; a = a + b
+  SWAP    %2, %3                ; exchange register names so b now refers to the difference
+%endmacro
+
+; butterfly operation: each dst = (pmaddwd(src, coefs) + round) >> 14, arithmetic shift
+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
+  pmaddwd            m%1, m%3, %5     ; dst1 = word pairs of src times coefs1, summed to dwords
+  pmaddwd            m%2, m%3, %6     ; dst2 = same with coefs2 (dst2 may be the src register)
+  paddd              m%1,  %4         ; add the rounding term
+  paddd              m%2,  %4
+  psrad              m%1,  14         ; scale back down by 2^14
+  psrad              m%2,  14
+%endmacro
+
+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
+  punpckhwd          m%6, m%2, m%1    ; tmp1 = high four words of dst2 and dst1, interleaved
+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]  ; tmp2, tmp1 = high-half results
+  punpcklwd          m%2, m%1         ; dst2 = low four words of dst2 and dst1, interleaved
+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]  ; dst1, dst2 = low-half results
+  packssdw           m%1, m%7         ; dst1 = its low-half and high-half dwords packed to words
+  packssdw           m%2, m%6         ; dst2 likewise, with signed saturation
+%endmacro
+
+; matrix transpose helper: interleave two registers at the given element size
+%macro INTERLEAVE_2X 4          ; size suffix (wd, dq, qdq), a, b, tmp
+  punpckh%1          m%4, m%2, m%3    ; tmp = high-half interleave of a and b
+  punpckl%1          m%2, m%3         ; a = low-half interleave of a and b
+  SWAP               %3,  %4          ; exchange names so b now refers to the high-half result
+%endmacro
+
+%macro TRANSPOSE8X8 9           ; eight row registers followed by one scratch register
+  INTERLEAVE_2X  wd, %1, %2, %9       ; step 1: interleave 16-bit words of adjacent rows
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+
+  INTERLEAVE_2X  dq, %1, %3, %9       ; step 2: interleave 32-bit dwords
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+
+  INTERLEAVE_2X  qdq, %1, %5, %9      ; step 3: interleave 64-bit qwords
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+
+  SWAP  %2, %5                        ; final register renames after the three steps
+  SWAP  %4, %7
+%endmacro
+
+; 1D forward 8x8 DCT transform on m0..m7; the argument is the pass number (0 or 1)
+%macro FDCT8_1D 1
+  SUM_SUB            0,  7,  9        ; stage 1: sums land in m0..m3, differences in m4..m7
+  SUM_SUB            1,  6,  9
+  SUM_SUB            2,  5,  9
+  SUM_SUB            3,  4,  9
+
+  SUM_SUB            0,  3,  9        ; stage 2 on the sums, plus the m6/m5 difference pair
+  SUM_SUB            1,  2,  9
+  SUM_SUB            6,  5,  9
+%if %1 == 0
+  SUM_SUB            0,  1,  9        ; pass 0 only: plain sum and difference of m0, m1
+%endif
+
+  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
+
+  pmulhrsw           m6, m12          ; rounded multiply by 23170 / 2^15
+  pmulhrsw           m5, m12
+%if %1 == 0
+  pmulhrsw           m0, m12          ; pass 0: m0, m1 get the same rounded scaling
+  pmulhrsw           m1, m12
+%else
+  BUTTERFLY_4X       1,  0,  11585, 11585,  m8,  9,  10
+  SWAP               0,  1            ; other pass: butterfly on m1, m0 and then swap names
+%endif
+
+  SUM_SUB            4,  5,  9
+  SUM_SUB            7,  6,  9
+  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10
+  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10
+  SWAP               1,  4            ; final register renames for m1/m4 and m3/m6
+  SWAP               3,  6
+%endmacro
+
+%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
+  psraw              m%3, m%1, 15     ; tmp = -1 in lanes where dst is negative, else 0
+  psraw              m%4, m%2, 15
+  psubw              m%1, m%3         ; adds 1 to the negative lanes only
+  psubw              m%2, m%4
+  psraw              m%1, 1           ; halve; with the step above this truncates toward zero
+  psraw              m%2, 1
+%endmacro
+
+INIT_XMM ssse3
+cglobal fdct8x8, 3, 5, 13, input, output, stride  ; 3 args, 5 GPRs, 13 XMM regs (x86inc cglobal)
+
+  mova               m8, [pd_8192]        ; rounding term passed to every BUTTERFLY_4X
+  mova              m12, [pw_11585x2]     ; multiplier used by pmulhrsw in FDCT8_1D
+  pxor              m11, m11              ; NOTE(review): m11 is not read again in this function
+
+  lea                r3, [2 * strideq]    ; r3 = 2 * stride, the byte step between loaded rows
+  lea                r4, [4 * strideq]    ; r4 = 4 * stride, i.e. two such steps
+  mova               m0, [inputq]         ; eight aligned 16-byte row loads into m0..m7
+  mova               m1, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m2, [inputq]
+  mova               m3, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m4, [inputq]
+  mova               m5, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m6, [inputq]
+  mova               m7, [inputq + r3]
+
+  ; left shift by 2 to increase forward transformation precision
+  psllw              m0, 2
+  psllw              m1, 2
+  psllw              m2, 2
+  psllw              m3, 2
+  psllw              m4, 2
+  psllw              m5, 2
+  psllw              m6, 2
+  psllw              m7, 2
+
+  ; column transform (pass 0), then transpose so the second pass works on the other axis
+  FDCT8_1D  0
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  FDCT8_1D  1
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  DIVIDE_ROUND_2X   0, 1, 9, 10           ; halve every coefficient, truncating toward zero
+  DIVIDE_ROUND_2X   2, 3, 9, 10
+  DIVIDE_ROUND_2X   4, 5, 9, 10
+  DIVIDE_ROUND_2X   6, 7, 9, 10
+
+  mova              [outputq +   0], m0   ; eight aligned 16-byte stores, 16 bytes apart
+  mova              [outputq +  16], m1
+  mova              [outputq +  32], m2
+  mova              [outputq +  48], m3
+  mova              [outputq +  64], m4
+  mova              [outputq +  80], m5
+  mova              [outputq +  96], m6
+  mova              [outputq + 112], m7
+
+  RET
+%endif
diff --git a/libs/libvpx/vpx_dsp/x86/halfpix_variance_impl_sse2.asm b/libs/libvpx/vpx_dsp/x86/halfpix_variance_impl_sse2.asm
new file mode 100644
index 0000000000..cc26bb6124
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/halfpix_variance_impl_sse2.asm
@@ -0,0 +1,346 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vpx_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
+;                                            int ref_stride,
+;                                            unsigned char *src,
+;                                            int src_stride,
+;                                            unsigned int height,
+;                                            int *sum,
+;                                            unsigned int *sumsquared)
+global sym(vpx_half_horiz_vert_variance16x_h_sse2) PRIVATE
+sym(vpx_half_horiz_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+    ; 16-wide prediction: pavgb of ref[x] and ref[x+1], then pavgb with the next row
+        pxor            xmm6,           xmm6                ;  xmm6 = word sums of (prediction - src)
+        pxor            xmm7,           xmm7                ;  xmm7 = dword sums of squared differences
+        mov             rsi,            arg(0) ;ref
+
+        mov             rdi,            arg(2) ;src
+        movsxd          rcx,            dword ptr arg(4) ;height
+        movsxd          rax,            dword ptr arg(1) ;ref_stride
+        movsxd          rdx,            dword ptr arg(3)    ;src_stride
+
+        pxor            xmm0,           xmm0                ;  xmm0 = 0, used to widen bytes to words
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): horizontal average of the first ref row
+
+        lea             rsi,            [rsi + rax]
+        ; loop body runs before the count is tested, so height must be at least 1
+vpx_half_horiz_vert_variance16x_h_1:
+        movdqu          xmm1,           XMMWORD PTR [rsi]     ;  next ref row, bytes 0..15
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;  same row, bytes 1..16
+        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2): horizontal average of that row
+
+        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the two horizontal averages
+
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = prediction bytes 0..7 as words
+        punpckhbw       xmm4,           xmm0                ;  xmm4 = prediction bytes 8..15 as words
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+
+        movq            xmm3,           QWORD PTR [rdi+8]   ;  xmm3 = d8..d15
+        punpcklbw       xmm3,           xmm0
+        psubw           xmm4,           xmm3
+        ; each 16-bit lane of xmm6 gains two differences in [-255,255] per row
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4
+        pmaddwd         xmm5,           xmm5                ;  xmm5 = pairwise sums of squared differences (dwords)
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;  one row done
+        jnz             vpx_half_horiz_vert_variance16x_h_1     ;  loop while rows remain
+        ; horizontal reduction of both accumulators
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+        ; interleave below zeros then psrad 16: sign-extends the 8 word sums to dwords
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+        ; spread the four dword sums of squares over two registers and add
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+        ; same for the four dword difference sums
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+        ; low dword of xmm6 = total squared error, low dword of xmm0 = total difference
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vpx_half_vert_variance16x_h_sse2(unsigned char *ref,
+;                                      int ref_stride,
+;                                      unsigned char *src,
+;                                      int src_stride,
+;                                      unsigned int height,
+;                                      int *sum,
+;                                      unsigned int *sumsquared)
+global sym(vpx_half_vert_variance16x_h_sse2) PRIVATE
+sym(vpx_half_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+    ; 16-wide prediction: pavgb of each ref row with the ref row below it
+        pxor            xmm6,           xmm6                ;  xmm6 = word sums of (prediction - src)
+        pxor            xmm7,           xmm7                ;  xmm7 = dword sums of squared differences
+        mov             rsi,            arg(0)              ;ref
+
+        mov             rdi,            arg(2)              ;src
+        movsxd          rcx,            dword ptr arg(4)    ;height
+        movsxd          rax,            dword ptr arg(1)    ;ref_stride
+        movsxd          rdx,            dword ptr arg(3)    ;src_stride
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]   ;  first ref row
+        lea             rsi,            [rsi + rax          ]
+        pxor            xmm0,           xmm0                ;  xmm0 = 0, used to widen bytes to words
+        ; loop body runs before the count is tested, so height must be at least 1
+vpx_half_vert_variance16x_h_1:
+        movdqu          xmm3,           XMMWORD PTR [rsi]   ;  next ref row
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3): vertical average of the two rows
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = prediction bytes 0..7 as words
+        punpckhbw       xmm4,           xmm0                ;  xmm4 = prediction bytes 8..15 as words
+
+        movq            xmm2,           QWORD PTR [rdi]     ;  src bytes 0..7
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm5,           xmm2
+        movq            xmm2,           QWORD PTR [rdi+8]   ;  src bytes 8..15
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm4,           xmm2
+        ; each 16-bit lane of xmm6 gains two differences in [-255,255] per row
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4
+        pmaddwd         xmm5,           xmm5                ;  xmm5 = pairwise sums of squared differences (dwords)
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm3                ;  keep this ref row as the upper row of the next iteration
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1
+        jnz             vpx_half_vert_variance16x_h_1
+        ; horizontal reduction of both accumulators
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+        ; interleave below zeros then psrad 16: sign-extends the 8 word sums to dwords
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+        ; spread the four dword sums of squares over two registers and add
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+        ; same for the four dword difference sums
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+        ; low dword of xmm6 = total squared error, low dword of xmm0 = total difference
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vpx_half_horiz_variance16x_h_sse2(unsigned char *ref,
+;                                       int ref_stride,
+;                                       unsigned char *src,
+;                                       int src_stride,
+;                                       unsigned int height,
+;                                       int *sum,
+;                                       unsigned int *sumsquared)
+global sym(vpx_half_horiz_variance16x_h_sse2) PRIVATE
+sym(vpx_half_horiz_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+    ; 16-wide prediction: pavgb of ref[x] and ref[x+1] within each row
+        pxor            xmm6,           xmm6                ;  xmm6 = word sums of (prediction - src)
+        pxor            xmm7,           xmm7                ;  xmm7 = dword sums of squared differences
+        mov             rsi,            arg(0) ;ref
+
+        mov             rdi,            arg(2) ;src
+        movsxd          rcx,            dword ptr arg(4) ;height
+        movsxd          rax,            dword ptr arg(1) ;ref_stride
+        movsxd          rdx,            dword ptr arg(3)    ;src_stride
+
+        pxor            xmm0,           xmm0                ;  xmm0 = 0, used to widen bytes to words
+        ; loop body runs before the count is tested, so height must be at least 1
+vpx_half_horiz_variance16x_h_1:
+        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
+        movdqa          xmm1,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+        punpckhbw       xmm1,           xmm0                ;  xmm1 = prediction bytes 8..15 as words
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        movq            xmm2,           QWORD PTR [rdi+8]   ;  xmm2 = d8..d15
+        punpcklbw       xmm2,           xmm0
+        ; each 16-bit lane of xmm6 gains two differences in [-255,255] per row
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        psubw           xmm1,           xmm2
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm1
+        pmaddwd         xmm5,           xmm5                ;  xmm5 = pairwise sums of squared differences (dwords)
+        pmaddwd         xmm1,           xmm1
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm1
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;  one row done
+        jnz             vpx_half_horiz_variance16x_h_1        ;  loop while rows remain
+        ; horizontal reduction of both accumulators
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+        ; interleave below zeros then psrad 16: sign-extends the 8 word sums to dwords
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+        ; spread the four dword sums of squares over two registers and add
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+        ; same for the four dword difference sums
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+        ; low dword of xmm6 = total squared error, low dword of xmm0 = total difference
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+;    short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64 };
+align 16
+xmm_bi_rd:                    ; NOTE(review): not referenced by any routine in this file
+    times 8 dw 64
+align 16
+vpx_bilinear_filters_sse2:    ; 8 rows of (8 x w, 8 x (128 - w)); also unreferenced in this file
+    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
+    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
+    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
+    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
+    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
+    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
+    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
diff --git a/libs/libvpx/vpx_dsp/x86/halfpix_variance_sse2.c b/libs/libvpx/vpx_dsp/x86/halfpix_variance_sse2.c
new file mode 100644
index 0000000000..5782155bf8
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/halfpix_variance_sse2.c
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+// SSE2 assembly kernels, defined in halfpix_variance_impl_sse2.asm.
+void vpx_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref,
+                                            int ref_stride,
+                                            const unsigned char *src,
+                                            int src_stride,
+                                            unsigned int height,
+                                            int *sum,
+                                            unsigned int *sumsquared);
+void vpx_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
+                                       const unsigned char *src, int src_stride,
+                                       unsigned int height, int *sum,
+                                       unsigned int *sumsquared);
+void vpx_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride,
+                                      const unsigned char *src, int src_stride,
+                                      unsigned int height, int *sum,
+                                      unsigned int *sumsquared);
+
+// Variance of a 16x16 block: |src| is horizontally half-pel averaged by the
+// assembly kernel and compared with |dst|.  *sse gets the raw squared error.
+uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src,
+                                             int src_stride,
+                                             const unsigned char *dst,
+                                             int dst_stride, uint32_t *sse) {
+  int diff_sum;              // signed sum of the 256 differences
+  unsigned int sq_diff_sum;  // sum of the 256 squared differences
+  vpx_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
+                                    &diff_sum, &sq_diff_sum);
+
+  *sse = sq_diff_sum;
+  return sq_diff_sum - (((uint32_t)diff_sum * diff_sum) >> 8);
+}
+
+// Variance of a 16x16 block: |src| is vertically half-pel averaged by the
+// assembly kernel and compared with |dst|.  *sse gets the raw squared error.
+uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
+                                             int src_stride,
+                                             const unsigned char *dst,
+                                             int dst_stride, uint32_t *sse) {
+  int diff_sum;              // signed sum of the 256 differences
+  unsigned int sq_diff_sum;  // sum of the 256 squared differences
+  vpx_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
+                                   &diff_sum, &sq_diff_sum);
+  *sse = sq_diff_sum;
+  return sq_diff_sum - (((uint32_t)diff_sum * diff_sum) >> 8);
+}
+
+
+// Variance of a 16x16 block: |src| is half-pel averaged both horizontally and
+// vertically by the assembly kernel and compared with |dst|.
+uint32_t vpx_variance_halfpixvar16x16_hv_sse2(const unsigned char *src,
+                                              int src_stride,
+                                              const unsigned char *dst,
+                                              int dst_stride, uint32_t *sse) {
+  int diff_sum;              // signed sum of the 256 differences
+  unsigned int sq_diff_sum;  // sum of the 256 squared differences
+  vpx_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16,
+                                         &diff_sum, &sq_diff_sum);
+
+  *sse = sq_diff_sum;  // raw squared error
+  return sq_diff_sum - (((uint32_t)diff_sum * diff_sum) >> 8);
+}
diff --git a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm
new file mode 100644
index 0000000000..c61b62104f
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -0,0 +1,453 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_4:  times 8 dw 4     ; words: rounding term of the 4x4 DC predictor
+pw_8:  times 8 dw 8     ; words: rounding term of the 8x8 DC predictor
+pw_16: times 4 dd 16    ; dwords despite the pw_ prefix (added with paddd, 16x16 DC)
+pw_32: times 4 dd 32    ; dwords despite the pw_ prefix (added with paddd, 32x32 DC)
+
+SECTION .text
+INIT_XMM sse2
+cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; DC = (4 above + 4 left 16-bit pixels + 4) >> 3, replicated over four rows
+  movq                  m0, [aboveq]
+  movq                  m2, [leftq]
+  paddw                 m0, m2         ; above[i] + left[i] in words 0-3
+  pshuflw               m1, m0, 0xe    ; words 2,3 moved to positions 0,1
+  paddw                 m0, m1
+  pshuflw               m1, m0, 0x1    ; word 1 moved to position 0
+  paddw                 m0, m1         ; word 0 = sum of all 8 pixels
+  paddw                 m0, [GLOBAL(pw_4)]
+  psraw                 m0, 3
+  pshuflw               m0, m0, 0x0    ; broadcast DC to words 0-3
+  movq    [dstq          ], m0         ; rows are strideq*2 bytes apart
+  movq    [dstq+strideq*2], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; DC = (8 above + 8 left 16-bit pixels + 8) >> 4, replicated over eight rows
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [leftq]
+  DEFINE_ARGS dst, stride, stride3, one ; above/left are loaded; reuse their registers
+  mov                 oned, 0x00010001
+  lea             stride3q, [strideq*3]
+  movd                  m3, oned
+  pshufd                m3, m3, 0x0    ; m3 = 1 in every word
+  paddw                 m0, m2         ; above[i] + left[i]
+  pmaddwd               m0, m3         ; 8 words -> 4 dword pair sums
+  packssdw              m0, m1         ; back to words (upper half zero)
+  pmaddwd               m0, m3         ; -> 2 dword sums
+  packssdw              m0, m1
+  pmaddwd               m0, m3         ; low dword = sum of all 16 pixels
+  paddw                 m0, [GLOBAL(pw_8)]
+  psrlw                 m0, 4
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0         ; DC in all 8 words
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+  lea                 dstq, [dstq+strideq*8]
+  mova   [dstq           ], m0
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; DC = (16 above + 16 left 16-bit pixels + 16) >> 5, replicated over 16 rows
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m3, [aboveq+16]
+  mova                  m2, [leftq]
+  mova                  m4, [leftq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4 ; above/left are loaded; reuse their registers
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4
+  paddw                 m0, m2
+  paddw                 m0, m3
+  paddw                 m0, m4         ; 8 word lanes, each the sum of 4 pixels
+  movhlps               m2, m0
+  paddw                 m0, m2         ; fold high half onto low: 4 word sums
+  punpcklwd             m0, m1         ; zero-extend them to dwords
+  movhlps               m2, m0
+  paddd                 m0, m2         ; 2 dword sums
+  punpckldq             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2         ; low dword = sum of all 32 pixels
+  paddd                 m0, [GLOBAL(pw_16)]
+  psrad                 m0, 5
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0         ; DC in all 8 words
+.loop:                                 ; 4 iterations, 4 rows of 32 bytes each
+  mova   [dstq              ], m0
+  mova   [dstq           +16], m0
+  mova   [dstq+strideq*2    ], m0
+  mova   [dstq+strideq*2 +16], m0
+  mova   [dstq+strideq*4    ], m0
+  mova   [dstq+strideq*4 +16], m0
+  mova   [dstq+stride3q*2   ], m0
+  mova   [dstq+stride3q*2+16], m0
+  lea                 dstq, [dstq+strideq*8]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; DC = (32 above + 32 left 16-bit pixels + 32) >> 6, replicated over 32 rows
+  mova                  m0, [aboveq]
+  mova                  m2, [aboveq+16]
+  mova                  m3, [aboveq+32]
+  mova                  m4, [aboveq+48]
+  paddw                 m0, m2
+  paddw                 m3, m4
+  mova                  m2, [leftq]
+  mova                  m4, [leftq+16]
+  mova                  m5, [leftq+32]
+  mova                  m6, [leftq+48]
+  paddw                 m2, m4
+  paddw                 m5, m6
+  paddw                 m0, m3         ; word sums over the 32 above pixels
+  paddw                 m2, m5         ; word sums over the 32 left pixels
+  pxor                  m1, m1
+  paddw                 m0, m2         ; 8 word lanes, each the sum of 8 pixels
+  DEFINE_ARGS dst, stride, stride3, lines4 ; above/left are loaded; reuse their registers
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8
+  movhlps               m2, m0
+  paddw                 m0, m2         ; fold high half onto low: 4 word sums
+  punpcklwd             m0, m1         ; zero-extend them to dwords
+  movhlps               m2, m0
+  paddd                 m0, m2         ; 2 dword sums
+  punpckldq             m0, m1
+  movhlps               m2, m0
+  paddd                 m0, m2         ; low dword = sum of all 64 pixels
+  paddd                 m0, [GLOBAL(pw_32)]
+  psrad                 m0, 6
+  pshuflw               m0, m0, 0x0
+  punpcklqdq            m0, m0         ; DC in all 8 words
+.loop:                                 ; 8 iterations, 4 rows of 64 bytes each
+  mova [dstq               ], m0
+  mova [dstq          +16  ], m0
+  mova [dstq          +32  ], m0
+  mova [dstq          +48  ], m0
+  mova [dstq+strideq*2     ], m0
+  mova [dstq+strideq*2+16  ], m0
+  mova [dstq+strideq*2+32  ], m0
+  mova [dstq+strideq*2+48  ], m0
+  mova [dstq+strideq*4     ], m0
+  mova [dstq+strideq*4+16  ], m0
+  mova [dstq+strideq*4+32  ], m0
+  mova [dstq+strideq*4+48  ], m0
+  mova [dstq+stride3q*2    ], m0
+  mova [dstq+stride3q*2 +16], m0
+  mova [dstq+stride3q*2 +32], m0
+  mova [dstq+stride3q*2 +48], m0
+  lea                 dstq, [dstq+strideq*8]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
+  movq                  m0, [aboveq]   ; the 4 above pixels (8 bytes)
+  movq    [dstq          ], m0         ; copy them into each of the 4 rows;
+  movq    [dstq+strideq*2], m0         ; rows are strideq*2 bytes apart
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq*2], m0
+  RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
+  mova                  m0, [aboveq]   ; the 8 above pixels (16 bytes)
+  DEFINE_ARGS dst, stride, stride3     ; above is loaded; reuse its register
+  lea             stride3q, [strideq*3]
+  mova   [dstq           ], m0         ; copy the above row into rows 0-3
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+  lea                 dstq, [dstq+strideq*8]
+  mova   [dstq           ], m0         ; rows 4-7
+  mova   [dstq+strideq*2 ], m0
+  mova   [dstq+strideq*4 ], m0
+  mova   [dstq+stride3q*2], m0
+  RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
+  mova                  m0, [aboveq]      ; above pixels 0-7
+  mova                  m1, [aboveq+16]   ; above pixels 8-15
+  DEFINE_ARGS dst, stride, stride3, nlines4 ; above is loaded; reuse its register
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 4
+.loop:                                    ; 4 iterations, 4 rows each
+  mova    [dstq              ], m0
+  mova    [dstq           +16], m1
+  mova    [dstq+strideq*2    ], m0
+  mova    [dstq+strideq*2 +16], m1
+  mova    [dstq+strideq*4    ], m0
+  mova    [dstq+strideq*4 +16], m1
+  mova    [dstq+stride3q*2   ], m0
+  mova    [dstq+stride3q*2+16], m1
+  lea                 dstq, [dstq+strideq*8]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
+  mova                  m0, [aboveq]      ; above pixels 0-7
+  mova                  m1, [aboveq+16]   ; above pixels 8-15
+  mova                  m2, [aboveq+32]   ; above pixels 16-23
+  mova                  m3, [aboveq+48]   ; above pixels 24-31
+  DEFINE_ARGS dst, stride, stride3, nlines4 ; above is loaded; reuse its register
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 8
+.loop:                                    ; 8 iterations, 4 rows each
+  mova [dstq               ], m0
+  mova [dstq            +16], m1
+  mova [dstq            +32], m2
+  mova [dstq            +48], m3
+  mova [dstq+strideq*2     ], m0
+  mova [dstq+strideq*2  +16], m1
+  mova [dstq+strideq*2  +32], m2
+  mova [dstq+strideq*2  +48], m3
+  mova [dstq+strideq*4     ], m0
+  mova [dstq+strideq*4  +16], m1
+  mova [dstq+strideq*4  +32], m2
+  mova [dstq+strideq*4  +48], m3
+  mova [dstq+stride3q*2    ], m0
+  mova [dstq+stride3q*2 +16], m1
+  mova [dstq+stride3q*2 +32], m2
+  mova [dstq+stride3q*2 +48], m3
+  lea                 dstq, [dstq+strideq*8]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
+  movd                  m1, [aboveq-2] ; word 0 = pixel just before above[0] (top-left)
+  movq                  m0, [aboveq]
+  pshuflw               m1, m1, 0x0
+  movlhps               m0, m0         ; t1 t2 t3 t4 t1 t2 t3 t4
+  movlhps               m1, m1         ; tl tl tl tl tl tl tl tl
+  ; Get the values to compute the maximum value at this bit depth
+  pcmpeqw               m3, m3         ; all ones
+  movd                  m4, bpsd       ; shift count = bit depth
+  psubw                 m0, m1         ; t1-tl t2-tl t3-tl t4-tl
+  psllw                 m3, m4         ; 0xffff << bps in every word
+  pcmpeqw               m2, m2
+  pxor                  m4, m4         ; min possible value
+  pxor                  m3, m2         ; max possible value, (1 << bps) - 1
+  mova                  m1, [leftq]    ; NOTE(review): 16-byte aligned load, only words 0-3 are used
+  pshuflw               m2, m1, 0x0
+  pshuflw               m5, m1, 0x55
+  movlhps               m2, m5         ; l1 l1 l1 l1 l2 l2 l2 l2
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m2, m3
+  pmaxsw                m2, m4
+  ;Store the values
+  movq    [dstq          ], m2         ; low half  -> row 0
+  movhpd  [dstq+strideq*2], m2         ; high half -> row 1
+  lea                 dstq, [dstq+strideq*4]
+  pshuflw               m2, m1, 0xaa
+  pshuflw               m5, m1, 0xff
+  movlhps               m2, m5         ; l3 l3 l3 l3 l4 l4 l4 l4
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m2, m3
+  pmaxsw                m2, m4
+  ;Store the values
+  movq    [dstq          ], m2         ; low half  -> row 2
+  movhpd  [dstq+strideq*2], m2         ; high half -> row 3
+  RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
+  movd                  m1, [aboveq-2] ; word 0 = pixel just before above[0] (top-left)
+  mova                  m0, [aboveq]   ; the 8 above pixels
+  pshuflw               m1, m1, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  mov                 oned, 1
+  pxor                  m3, m3
+  pxor                  m4, m4
+  pinsrw                m3, oned, 0    ; word 0 = 1
+  pinsrw                m4, bpsd, 0    ; shift count = bit depth
+  pshuflw               m3, m3, 0x0
+  DEFINE_ARGS dst, stride, line, left  ; above is loaded; its register becomes the row counter
+  punpcklqdq            m3, m3         ; 1 in every word
+  mov                lineq, -4         ; 4 iterations, 2 rows each
+  mova                  m2, m3
+  punpcklqdq            m1, m1         ; tl in every word
+  psllw                 m3, m4         ; 1 << bps
+  add                leftq, 16         ; point past the 8 left pixels; indexed with negative offsets
+  psubw                 m3, m2 ; max possible value
+  pxor                  m4, m4 ; min possible value
+  psubw                 m0, m1         ; above[i] - tl
+.loop:
+  movd                  m1, [leftq+lineq*4]     ; left pixel for the first row of the pair
+  movd                  m2, [leftq+lineq*4+2]   ; left pixel for the second row
+  pshuflw               m1, m1, 0x0
+  pshuflw               m2, m2, 0x0
+  punpcklqdq            m1, m1
+  punpcklqdq            m2, m2
+  paddw                 m1, m0
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m1, m3
+  pminsw                m2, m3
+  pmaxsw                m1, m4
+  pmaxsw                m2, m4
+  ;Store the values
+  mova      [dstq          ], m1
+  mova      [dstq+strideq*2], m2
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
+  movd                  m2, [aboveq-2] ; word 0 = pixel just before above[0] (top-left)
+  mova                  m0, [aboveq]   ; above pixels 0-7
+  mova                  m1, [aboveq+16] ; above pixels 8-15
+  pshuflw               m2, m2, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  pcmpeqw               m3, m3
+  movd                  m4, bpsd
+  punpcklqdq            m2, m2         ; tl in every word
+  psllw                 m3, m4         ; 0xffff << bps in every word
+  pcmpeqw               m5, m5
+  pxor                  m4, m4         ; min possible value
+  pxor                  m3, m5         ; max possible value
+  DEFINE_ARGS dst, stride, line, left  ; above is loaded; its register becomes the row counter
+  mov                lineq, -8         ; 8 iterations, 2 rows each
+  psubw                 m0, m2
+  psubw                 m1, m2
+.loop:
+  movd                  m7, [leftq]    ; two left pixels: this row pair
+  pshuflw               m5, m7, 0x0
+  pshuflw               m2, m7, 0x55
+  punpcklqdq            m5, m5         ; l1 l1 l1 l1 l1 l1 l1 l1
+  punpcklqdq            m2, m2         ; l2 l2 l2 l2 l2 l2 l2 l2
+  paddw                 m6, m5, m0     ; t1-tl+l1 to t8-tl+l1
+  paddw                 m5, m1         ; t9-tl+l1 to t16-tl+l1
+  pminsw                m6, m3
+  pminsw                m5, m3
+  pmaxsw                m6, m4         ; Clamp to the bit-depth
+  pmaxsw                m5, m4
+  mova   [dstq           ], m6
+  mova   [dstq        +16], m5
+  paddw                 m6, m2, m0     ; t1-tl+l2 to t8-tl+l2
+  paddw                 m2, m1         ; t9-tl+l2 to t16-tl+l2
+  pminsw                m6, m3
+  pminsw                m2, m3
+  pmaxsw                m6, m4
+  pmaxsw                m2, m4
+  mova   [dstq+strideq*2 ], m6
+  mova [dstq+strideq*2+16], m2
+  lea                 dstq, [dstq+strideq*4]
+  inc                lineq
+  lea                leftq, [leftq+4]  ; lea leaves the flags from inc intact for the jnz below
+
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
+  movd                  m0, [aboveq-2] ; word 0 = pixel just before above[0] (top-left)
+  mova                  m1, [aboveq]   ; above pixels 0-7
+  mova                  m2, [aboveq+16] ; above pixels 8-15
+  mova                  m3, [aboveq+32] ; above pixels 16-23
+  mova                  m4, [aboveq+48] ; above pixels 24-31
+  pshuflw               m0, m0, 0x0
+  ; Get the values to compute the maximum value at this bit depth
+  pcmpeqw               m5, m5
+  movd                  m6, bpsd
+  psllw                 m5, m6         ; 0xffff << bps in every word
+  pcmpeqw               m7, m7
+  pxor                  m6, m6         ; min possible value
+  pxor                  m5, m7         ; max possible value
+  punpcklqdq            m0, m0         ; tl in every word
+  DEFINE_ARGS dst, stride, line, left  ; above is loaded; its register becomes the row counter
+  mov                lineq, -16        ; 16 iterations, 2 rows each
+  psubw                 m1, m0         ; m1-m4 = above[i] - tl
+  psubw                 m2, m0
+  psubw                 m3, m0
+  psubw                 m4, m0
+.loop:                                 ; m0 is scratch from here on
+  movd                  m7, [leftq]
+  pshuflw               m7, m7, 0x0
+  punpcklqdq            m7, m7         ; l1 l1 l1 l1 l1 l1 l1 l1
+  paddw                 m0, m7, m1
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq           ], m0
+  paddw                 m0, m7, m2
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq        +16], m0
+  paddw                 m0, m7, m3
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq        +32], m0
+  paddw                 m0, m7, m4
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq        +48], m0
+  movd                  m7, [leftq+2]
+  pshuflw               m7, m7, 0x0
+  punpcklqdq            m7, m7         ; l2 l2 l2 l2 l2 l2 l2 l2
+  paddw                 m0, m7, m1
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2 ], m0
+  paddw                 m0, m7, m2
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2+16], m0
+  paddw                 m0, m7, m3
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2+32], m0
+  paddw                 m0, m7, m4
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2+48], m0
+  lea                 dstq, [dstq+strideq*4]
+  lea                leftq, [leftq+4]  ; advance by the two left pixels just used
+  inc                lineq
+  jnz .loop
+  REP_RET
diff --git a/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
new file mode 100644
index 0000000000..c4fd5e1a02
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -0,0 +1,1214 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+// Clamps every signed 16-bit lane of value to the range of a signed bd-bit
+// sample:
+//   [-(1 << (bd - 1)), (1 << (bd - 1)) - 1]
+// i.e. [-128, 127] for bd == 8, [-512, 511] for bd == 10 and [-2048, 2047]
+// otherwise (any bd other than 8 or 10 is handled as 12).
+static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
+  int half_range;  // 1 << (bd - 1)
+  __m128i lower, upper;
+  __m128i clamped;
+
+  switch (bd) {
+    case 8:
+      half_range = 0x80;
+      break;
+    case 10:
+      half_range = 0x200;
+      break;
+    default:  // bd == 12
+      half_range = 0x800;
+      break;
+  }
+
+  // Bounds, broadcast to all eight lanes.
+  lower = _mm_set1_epi16((short)-half_range);
+  upper = _mm_set1_epi16((short)(half_range - 1));
+
+  // Lanes below lower are raised to it first; lanes above upper are then
+  // lowered to it.  Lanes already inside the range pass through unchanged.
+  // (lower < upper, so the order of the two steps does not matter.)
+  clamped = _mm_max_epi16(value, lower);
+  clamped = _mm_min_epi16(clamped, upper);
+
+  return clamped;
+}
+
+// Filters 8 uint16_t columns across the edge between rows s - p and s.
+// TODO(debargha, peter): break this file's large functions into smaller ones.
+static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,  // q0 row
+                                                   int p,  // pitch, in pixels
+                                                   const uint8_t *_blimit,
+                                                   const uint8_t *_limit,
+                                                   const uint8_t *_thresh,
+                                                   int bd) {  // 8, 10, else 12
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i blimit, limit, thresh;
+  __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
+  __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
+  __m128i ps1, qs1, ps0, qs0;
+  __m128i abs_p0q0, abs_p1q1, ffff, work;
+  __m128i filt, work_a, filter1, filter2;
+  __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
+  __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
+  __m128i flat2_q0, flat2_p0;
+  __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
+  __m128i pixelFilter_p, pixelFilter_q;
+  __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+  __m128i sum_p7, sum_q7, sum_p3, sum_q3;
+  __m128i t4, t3, t80, t1;
+  __m128i eight, four;
+
+  if (bd == 8) {  // thresholds are 8-bit; scale them by 1 << (bd - 8)
+    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+  } else if (bd == 10) {
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+  } else {  // bd == 12
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+  }
+
+  q4 = _mm_load_si128((__m128i *)(s + 4 * p));  // aligned 16-byte loads
+  p4 = _mm_load_si128((__m128i *)(s - 5 * p));
+  q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+  p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+  q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+  p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+  q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+  p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+  q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+  p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+
+  //  highbd_filter_mask: absolute differences via saturating subtraction
+  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
+  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+
+  ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);  // all bits set
+
+  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
+
+  //  highbd_hev_mask (in C code this is actually called from highbd_filter4)
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);  // max > thresh
+
+  abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // abs(p1 - q1) / 2
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);  // sum > blimit
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));  // limit + 1 or 0
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0),
+                                    _mm_subs_epu16(p0, p1)),
+                       _mm_or_si128(_mm_subs_epu16(q1, q0),
+                                    _mm_subs_epu16(q0, q1)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);  // all 1s where no limit is exceeded
+
+  // lp filter
+  // highbd_filter4
+  t4 = _mm_set1_epi16(4);
+  t3 = _mm_set1_epi16(3);
+  if (bd == 8)  // t80 = 1 << (bd - 1), the signed-offset bias
+    t80 = _mm_set1_epi16(0x80);
+  else if (bd == 10)
+    t80 = _mm_set1_epi16(0x200);
+  else  // bd == 12
+    t80 = _mm_set1_epi16(0x800);
+
+  t1 = _mm_set1_epi16(0x1);
+
+  ps1 = _mm_subs_epi16(p1, t80);
+  qs1 = _mm_subs_epi16(q1, t80);
+  ps0 = _mm_subs_epi16(p0, t80);
+  qs0 = _mm_subs_epi16(q0, t80);
+
+  filt = _mm_and_si128(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+  filt = _mm_and_si128(filt, mask);
+  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+  // Filter1 >> 3 and Filter2 >> 3 (arithmetic shifts)
+  filter1 = _mm_srai_epi16(filter1, 0x3);
+  filter2 = _mm_srai_epi16(filter2, 0x3);
+
+  qs0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd),
+      t80);
+  ps0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd),
+      t80);
+  filt = _mm_adds_epi16(filter1, t1);
+  filt = _mm_srai_epi16(filt, 1);
+  filt = _mm_andnot_si128(hev, filt);
+  qs1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
+      t80);
+  ps1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
+      t80);
+
+  // end highbd_filter4
+  // loopfilter done
+
+  // highbd_flat_mask4
+  flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
+                                    _mm_subs_epu16(p0, p2)),
+                       _mm_or_si128(_mm_subs_epu16(p3, p0),
+                                    _mm_subs_epu16(p0, p3)));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0),
+                                    _mm_subs_epu16(q0, q2)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q0),
+                                    _mm_subs_epu16(q0, q3)));
+  flat = _mm_max_epi16(work, flat);
+  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  flat = _mm_max_epi16(work, flat);
+
+  if (bd == 8)  // flat threshold is 1 << (bd - 8)
+    flat = _mm_subs_epu16(flat, one);
+  else if (bd == 10)
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+  else  // bd == 12
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  // end flat_mask4
+
+  // flat & mask = flat && mask (as used in filter8)
+  // (because, in both vars, each 16-bit lane is either all 1s or all 0s)
+  flat = _mm_and_si128(flat, mask);
+
+  p5 = _mm_load_si128((__m128i *)(s - 6 * p));
+  q5 = _mm_load_si128((__m128i *)(s + 5 * p));
+  p6 = _mm_load_si128((__m128i *)(s - 7 * p));
+  q6 = _mm_load_si128((__m128i *)(s + 6 * p));
+  p7 = _mm_load_si128((__m128i *)(s - 8 * p));
+  q7 = _mm_load_si128((__m128i *)(s + 7 * p));
+
+  // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
+  // but referred to as p0-p4 & q0-q4 in fn)
+  flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0),
+                                     _mm_subs_epu16(p0, p4)),
+                        _mm_or_si128(_mm_subs_epu16(q4, q0),
+                                     _mm_subs_epu16(q0, q4)));
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0),
+                                    _mm_subs_epu16(p0, p5)),
+                       _mm_or_si128(_mm_subs_epu16(q5, q0),
+                                    _mm_subs_epu16(q0, q5)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0),
+                                    _mm_subs_epu16(p0, p6)),
+                       _mm_or_si128(_mm_subs_epu16(q6, q0),
+                                    _mm_subs_epu16(q0, q6)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0),
+                                    _mm_subs_epu16(p0, p7)),
+                       _mm_or_si128(_mm_subs_epu16(q7, q0),
+                                    _mm_subs_epu16(q0, q7)));
+  flat2 = _mm_max_epi16(work, flat2);
+
+  if (bd == 8)  // same 1 << (bd - 8) threshold as flat
+    flat2 = _mm_subs_epu16(flat2, one);
+  else if (bd == 10)
+    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
+  else  // bd == 12
+    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));
+
+  flat2 = _mm_cmpeq_epi16(flat2, zero);
+  flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  // end highbd_flat_mask5
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // flat and wide flat calculations
+  eight = _mm_set1_epi16(8);  // rounding term for the >> 4 sums
+  four = _mm_set1_epi16(4);  // rounding term for the >> 3 sums
+
+  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5),
+                                _mm_add_epi16(p4, p3));
+  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5),
+                                _mm_add_epi16(q4, q3));
+
+  pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
+  pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+  pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
+  pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+  pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
+                                                      pixelFilter_q));
+  pixetFilter_p2p1p0 =   _mm_add_epi16(four,
+                                       _mm_add_epi16(pixetFilter_p2p1p0,
+                                                     pixetFilter_q2q1q0));
+  flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(p7, p0)), 4);
+  flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(q7, q0)), 4);
+  flat_p0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(p3, p0)), 3);
+  flat_q0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(q3, q0)), 3);
+
+  sum_p7 = _mm_add_epi16(p7, p7);
+  sum_q7 = _mm_add_epi16(q7, q7);
+  sum_p3 = _mm_add_epi16(p3, p3);
+  sum_q3 = _mm_add_epi16(q3, q3);
+
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);  // running sum, q side
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);  // running sum, p side
+  flat2_p1 = _mm_srli_epi16(
+      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
+  flat2_q1 = _mm_srli_epi16(
+      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);
+
+  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
+  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
+  flat_p1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(sum_p3, p1)), 3);
+  flat_q1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                                         _mm_add_epi16(sum_q3, q1)), 3);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  sum_p3 = _mm_add_epi16(sum_p3, p3);
+  sum_q3 = _mm_add_epi16(sum_q3, q3);
+
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
+  flat2_p2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p2)), 4);
+  flat2_q2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q2)), 4);
+
+  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
+  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
+  flat_p2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                         _mm_add_epi16(sum_p3, p2)), 3);
+  flat_q2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                                         _mm_add_epi16(sum_q3, q2)), 3);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
+  flat2_p3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p3)), 4);
+  flat2_q3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q3)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
+  flat2_p4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p4)), 4);
+  flat2_q4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q4)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
+  flat2_p5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p5)), 4);
+  flat2_q5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q5)), 4);
+
+  sum_p7 = _mm_add_epi16(sum_p7, p7);
+  sum_q7 = _mm_add_epi16(sum_q7, q7);
+  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
+  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
+  flat2_p6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                          _mm_add_epi16(sum_p7, p6)), 4);
+  flat2_q6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                                          _mm_add_epi16(sum_q7, q6)), 4);
+
+  //  wide flat
+  //  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  //  highbd_filter8
+  p2 = _mm_andnot_si128(flat, p2);
+  //  p2 remains unchanged if !(flat && mask)
+  flat_p2 = _mm_and_si128(flat, flat_p2);
+  //  when (flat && mask)
+  p2 = _mm_or_si128(p2, flat_p2);  // full list of p2 values
+  q2 = _mm_andnot_si128(flat, q2);
+  flat_q2 = _mm_and_si128(flat, flat_q2);
+  q2 = _mm_or_si128(q2, flat_q2);  // full list of q2 values
+
+  ps1 = _mm_andnot_si128(flat, ps1);
+  //  p1 takes the value assigned to it in filter4 if !(flat && mask)
+  flat_p1 = _mm_and_si128(flat, flat_p1);
+  //  when (flat && mask)
+  p1 = _mm_or_si128(ps1, flat_p1);  // full list of p1 values
+  qs1 = _mm_andnot_si128(flat, qs1);
+  flat_q1 = _mm_and_si128(flat, flat_q1);
+  q1 = _mm_or_si128(qs1, flat_q1);  // full list of q1 values
+
+  ps0 = _mm_andnot_si128(flat, ps0);
+  //  p0 takes the value assigned to it in filter4 if !(flat && mask)
+  flat_p0 = _mm_and_si128(flat, flat_p0);
+  //  when (flat && mask)
+  p0 = _mm_or_si128(ps0, flat_p0);  // full list of p0 values
+  qs0 = _mm_andnot_si128(flat, qs0);
+  flat_q0 = _mm_and_si128(flat, flat_q0);
+  q0 = _mm_or_si128(qs0, flat_q0);  // full list of q0 values
+  // end highbd_filter8
+
+  // highbd_filter16 (p7 and q7 are read but never written back)
+  p6 = _mm_andnot_si128(flat2, p6);
+  //  p6 remains unchanged if !(flat2 && flat && mask)
+  flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+  //  get values for when (flat2 && flat && mask)
+  p6 = _mm_or_si128(p6, flat2_p6);  // full list of p6 values
+  q6 = _mm_andnot_si128(flat2, q6);
+  //  q6 remains unchanged if !(flat2 && flat && mask)
+  flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+  //  get values for when (flat2 && flat && mask)
+  q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
+  _mm_store_si128((__m128i *)(s - 7 * p), p6);
+  _mm_store_si128((__m128i *)(s + 6 * p), q6);
+
+  p5 = _mm_andnot_si128(flat2, p5);
+  //  p5 remains unchanged if !(flat2 && flat && mask)
+  flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+  //  get values for when (flat2 && flat && mask)
+  p5 = _mm_or_si128(p5, flat2_p5);
+  //  full list of p5 values
+  q5 = _mm_andnot_si128(flat2, q5);
+  //  q5 remains unchanged if !(flat2 && flat && mask)
+  flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+  //  get values for when (flat2 && flat && mask)
+  q5 = _mm_or_si128(q5, flat2_q5);
+  //  full list of q5 values
+  _mm_store_si128((__m128i *)(s - 6 * p), p5);
+  _mm_store_si128((__m128i *)(s + 5 * p), q5);
+
+  p4 = _mm_andnot_si128(flat2, p4);
+  //  p4 remains unchanged if !(flat2 && flat && mask)
+  flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+  //  get values for when (flat2 && flat && mask)
+  p4 = _mm_or_si128(p4, flat2_p4);  // full list of p4 values
+  q4 = _mm_andnot_si128(flat2, q4);
+  //  q4 remains unchanged if !(flat2 && flat && mask)
+  flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+  //  get values for when (flat2 && flat && mask)
+  q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
+  _mm_store_si128((__m128i *)(s - 5 * p), p4);
+  _mm_store_si128((__m128i *)(s + 4 * p), q4);
+
+  p3 = _mm_andnot_si128(flat2, p3);
+  //  p3 remains unchanged if !(flat2 && flat && mask)
+  flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+  //  get values for when (flat2 && flat && mask)
+  p3 = _mm_or_si128(p3, flat2_p3);  // full list of p3 values
+  q3 = _mm_andnot_si128(flat2, q3);
+  //  q3 remains unchanged if !(flat2 && flat && mask)
+  flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+  //  get values for when (flat2 && flat && mask)
+  q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
+  _mm_store_si128((__m128i *)(s - 4 * p), p3);
+  _mm_store_si128((__m128i *)(s + 3 * p), q3);
+
+  p2 = _mm_andnot_si128(flat2, p2);
+  //  p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+  //  get values for when (flat2 && flat && mask)
+  p2 = _mm_or_si128(p2, flat2_p2);
+  //  full list of p2 values
+  q2 = _mm_andnot_si128(flat2, q2);
+  //  q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+  //  get values for when (flat2 && flat && mask)
+  q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
+  _mm_store_si128((__m128i *)(s - 3 * p), p2);
+  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+
+  p1 = _mm_andnot_si128(flat2, p1);
+  //  p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+  //  get values for when (flat2 && flat && mask)
+  p1 = _mm_or_si128(p1, flat2_p1);  // full list of p1 values
+  q1 = _mm_andnot_si128(flat2, q1);
+  //  q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+  //  get values for when (flat2 && flat && mask)
+  q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
+  _mm_store_si128((__m128i *)(s - 2 * p), p1);
+  _mm_store_si128((__m128i *)(s + 1 * p), q1);
+
+  p0 = _mm_andnot_si128(flat2, p0);
+  //  p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+  //  get values for when (flat2 && flat && mask)
+  p0 = _mm_or_si128(p0, flat2_p0);  // full list of p0 values
+  q0 = _mm_andnot_si128(flat2, q0);
+  //  q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
+  flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+  //  get values for when (flat2 && flat && mask)
+  q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
+  _mm_store_si128((__m128i *)(s - 1 * p), p0);
+  _mm_store_si128((__m128i *)(s - 0 * p), q0);
+}
+
+// Filters 16 pixels of a horizontal edge as two 8-pixel halves, left first.
+static void highbd_mb_lpf_horizontal_edge_w_sse2_16(
+    uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit,
+    const uint8_t *_thresh, int bd) {
+  int half;
+  for (half = 0; half < 2; ++half) {
+    highbd_mb_lpf_horizontal_edge_w_sse2_8(s + 8 * half, p, _blimit, _limit,
+                                           _thresh, bd);
+  }
+}
+
+// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
+void vpx_highbd_lpf_horizontal_16_sse2(
+    uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit,
+    const uint8_t *_thresh, int count, int bd) {
+  // count == 1 filters a single 8-pixel group; any other count filters 16.
+  if (count != 1) {
+    highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd);
+    return;
+  }
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
+}
+
+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,  // s = q0 row
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh,
+                                      int count, int bd) {  // count is unused
+  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);  // scratch for filter8 outputs
+  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
+  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i blimit, limit, thresh;
+  __m128i mask, hev, flat;
+  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
+  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
+  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
+  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
+  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
+  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
+  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i ffff = _mm_cmpeq_epi16(one, one);
+  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+  const __m128i four = _mm_set1_epi16(4);
+  __m128i workp_a, workp_b, workp_shft;
+
+  const __m128i t4 = _mm_set1_epi16(4);
+  const __m128i t3 = _mm_set1_epi16(3);
+  __m128i t80;
+  const __m128i t1 = _mm_set1_epi16(0x1);
+  __m128i ps1, ps0, qs0, qs1;
+  __m128i filt;
+  __m128i work_a;
+  __m128i filter1, filter2;
+
+  (void)count;  // unused: exactly 8 pixels per row are filtered
+
+  if (bd == 8) {  // thresholds are 8-bit; scale them by 1 << (bd - 8)
+    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    t80 = _mm_set1_epi16(0x80);
+  } else if (bd == 10) {
+    blimit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    t80 = _mm_set1_epi16(0x200);
+  } else {  // bd == 12
+    blimit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+    limit = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+    thresh = _mm_slli_epi16(
+          _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    t80 = _mm_set1_epi16(0x800);
+  }
+
+  ps1 = _mm_subs_epi16(p1, t80);  // subtract the 1 << (bd - 1) bias
+  ps0 = _mm_subs_epi16(p0, t80);
+  qs0 = _mm_subs_epi16(q0, t80);
+  qs1 = _mm_subs_epi16(q1, t80);
+
+  // filter_mask and hev_mask
+  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
+                          _mm_subs_epu16(p0, p1));
+  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
+                          _mm_subs_epu16(q0, q1));
+
+  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
+                          _mm_subs_epu16(q0, p0));
+  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
+                          _mm_subs_epu16(q1, p1));
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);  // max > thresh
+
+  abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_max_epi16(abs_p1p0, mask);
+  // mask |= (abs(p1 - p0) > limit) * -1;
+  mask = _mm_max_epi16(abs_q1q0, mask);
+  // mask |= (abs(q1 - q0) > limit) * -1;
+
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);  // all 1s where no limit is exceeded
+
+  // flat_mask4
+  flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
+                                    _mm_subs_epu16(p0, p2)),
+                       _mm_or_si128(_mm_subs_epu16(q2, q0),
+                                    _mm_subs_epu16(q0, q2)));
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0),
+                                    _mm_subs_epu16(p0, p3)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q0),
+                                    _mm_subs_epu16(q0, q3)));
+  flat = _mm_max_epi16(work, flat);
+  flat = _mm_max_epi16(abs_p1p0, flat);
+  flat = _mm_max_epi16(abs_q1q0, flat);
+
+  if (bd == 8)  // flat threshold is 1 << (bd - 8)
+    flat = _mm_subs_epu16(flat, one);
+  else if (bd == 10)
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+  else  // bd == 12
+    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);  // flat & mask
+
+  // filter8 sums; |four| is the rounding part of ROUND_POWER_OF_TWO(x, 3)
+
+  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);  // fills entries 0-7
+
+  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
+
+  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
+
+  // lp filter
+  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+  filt = _mm_and_si128(filt, hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  // (vpx_filter + 3 * (qs0 - ps0)) & mask
+  filt = signed_char_clamp_bd_sse2(filt, bd);
+  filt = _mm_and_si128(filt, mask);
+
+  filter1 = _mm_adds_epi16(filt, t4);
+  filter2 = _mm_adds_epi16(filt, t3);
+
+  // Filter1 >> 3
+  filter1 = signed_char_clamp_bd_sse2(filter1, bd);
+  filter1 = _mm_srai_epi16(filter1, 3);
+
+  // Filter2 >> 3
+  filter2 = signed_char_clamp_bd_sse2(filter2, bd);
+  filter2 = _mm_srai_epi16(filter2, 3);
+
+  // filt >> 1
+  filt = _mm_adds_epi16(filter1, t1);
+  filt = _mm_srai_epi16(filt, 1);
+  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+  filt = _mm_andnot_si128(hev, filt);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  q0 = _mm_load_si128((__m128i *)flat_oq0);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q0 = _mm_and_si128(flat, q0);
+  q0 = _mm_or_si128(work_a, q0);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  q1 = _mm_load_si128((__m128i *)flat_oq1);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q1 = _mm_and_si128(flat, q1);
+  q1 = _mm_or_si128(work_a, q1);
+
+  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));  // unfiltered q2 row
+  q2 = _mm_load_si128((__m128i *)flat_oq2);
+  work_a = _mm_andnot_si128(flat, work_a);
+  q2 = _mm_and_si128(flat, q2);
+  q2 = _mm_or_si128(work_a, q2);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  p0 = _mm_load_si128((__m128i *)flat_op0);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p0 = _mm_and_si128(flat, p0);
+  p0 = _mm_or_si128(work_a, p0);
+
+  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
+  work_a = _mm_adds_epi16(work_a, t80);
+  p1 = _mm_load_si128((__m128i *)flat_op1);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p1 = _mm_and_si128(flat, p1);
+  p1 = _mm_or_si128(work_a, p1);
+
+  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));  // unfiltered p2 row
+  p2 = _mm_load_si128((__m128i *)flat_op2);
+  work_a = _mm_andnot_si128(flat, work_a);
+  p2 = _mm_and_si128(flat, p2);
+  p2 = _mm_or_si128(work_a, p2);
+
+  _mm_store_si128((__m128i *)(s - 3 * p), p2);  // p3 and q3 are never written
+  _mm_store_si128((__m128i *)(s - 2 * p), p1);
+  _mm_store_si128((__m128i *)(s - 1 * p), p0);
+  _mm_store_si128((__m128i *)(s + 0 * p), q0);
+  _mm_store_si128((__m128i *)(s + 1 * p), q1);
+  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+}
+
+void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
+                                           const uint8_t *_blimit0,  // *0 thresholds: span at s
+                                           const uint8_t *_limit0,
+                                           const uint8_t *_thresh0,
+                                           const uint8_t *_blimit1,  // *1 thresholds: span at s + 8
+                                           const uint8_t *_limit1,
+                                           const uint8_t *_thresh1,
+                                           int bd) {  // dual = two single-edge calls
+  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
+                                   1, bd);  // s + 8: the next 8 uint16_t samples
+}
+
+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,  // p: row pitch in uint16_t units
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh,
+                                      int count, int bd) {  // count is ignored; bd picks the scaling
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i blimit, limit, thresh;
+  __m128i mask, hev, flat;
+  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));  // p3..p0: the 4 rows before s
+  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));  // q0..q3: row s and the 3 after it
+  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),  // |a - b| via two
+                                        _mm_subs_epu16(p0, p1));  // saturating subtracts
+  const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
+                                        _mm_subs_epu16(q0, q1));
+  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);  // all bits set
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
+                                  _mm_subs_epu16(q0, p0));
+  __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
+                                  _mm_subs_epu16(q1, p1));
+  __m128i work;
+  const __m128i t4 = _mm_set1_epi16(4);
+  const __m128i t3 = _mm_set1_epi16(3);
+  __m128i t80;
+  __m128i tff80;
+  __m128i tffe0;
+  __m128i t1f;
+  // t1f (declared above): equivalent to shifting 0x1f left by bitdepth - 8
+  // and setting new bits to 1
+  const __m128i t1 = _mm_set1_epi16(0x1);
+  __m128i t7f;
+  // t7f (declared above): equivalent to shifting 0x7f left by bitdepth - 8
+  // and setting new bits to 1
+  __m128i ps1, ps0, qs0, qs1;
+  __m128i filt;
+  __m128i work_a;
+  __m128i filter1, filter2;
+
+  (void)count;  // count is not otherwise referenced in this function
+
+  if (bd == 8) {  // each branch widens the 8-bit thresholds and scales by (bd - 8) bits
+    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    t80 = _mm_set1_epi16(0x80);
+    tff80 = _mm_set1_epi16(0xff80);
+    tffe0 = _mm_set1_epi16(0xffe0);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
+  } else if (bd == 10) {
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+    limit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+    thresh = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
+    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
+    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
+  } else {  // bd == 12
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+    limit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+    thresh = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
+    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
+    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
+  }
+
+  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);  // samples minus the
+  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);  // t80 offset, so the
+  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);  // filter works on
+  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);  // signed values
+
+  // filter_mask and hev_mask
+  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);  // all-ones where flat > thresh
+
+  abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_max_epi16(flat, mask);
+  // mask |= (abs(p1 - p0) > limit) * -1;
+  // mask |= (abs(q1 - q0) > limit) * -1;
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
+                                    _mm_subs_epu16(p1, p2)),
+                       _mm_or_si128(_mm_subs_epu16(p3, p2),
+                                    _mm_subs_epu16(p2, p3)));
+  mask = _mm_max_epi16(work, mask);
+  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1),
+                                    _mm_subs_epu16(q1, q2)),
+                       _mm_or_si128(_mm_subs_epu16(q3, q2),
+                                    _mm_subs_epu16(q2, q3)));
+  mask = _mm_max_epi16(work, mask);
+  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_cmpeq_epi16(mask, zero);  // all-ones where the saturating difference is 0
+
+  // filter4
+  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
+  filt = _mm_and_si128(filt, hev);
+  work_a = _mm_subs_epi16(qs0, ps0);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = _mm_adds_epi16(filt, work_a);
+  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+
+  // (vpx_filter + 3 * (qs0 - ps0)) & mask
+  filt = _mm_and_si128(filt, mask);
+
+  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
+  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
+
+  // Filter1 >> 3 (logical shift, then the sign bits are patched back in)
+  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
+  filter1 = _mm_srli_epi16(filter1, 3);
+  work_a = _mm_and_si128(work_a, tffe0);  // sign bits for the values < 0
+  filter1 = _mm_and_si128(filter1, t1f);  // clamp the range
+  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits
+
+  // Filter2 >> 3 (same sequence as for Filter1)
+  work_a = _mm_cmpgt_epi16(zero, filter2);
+  filter2 = _mm_srli_epi16(filter2, 3);
+  work_a = _mm_and_si128(work_a, tffe0);
+  filter2 = _mm_and_si128(filter2, t1f);
+  filter2 = _mm_or_si128(filter2, work_a);
+
+  // filt >> 1, computed from (filter1 + 1) with the tff80 / t7f masks
+  filt = _mm_adds_epi16(filter1, t1);
+  work_a = _mm_cmpgt_epi16(zero, filt);
+  filt = _mm_srli_epi16(filt, 1);
+  work_a = _mm_and_si128(work_a, tff80);
+  filt = _mm_and_si128(filt, t7f);
+  filt = _mm_or_si128(filt, work_a);
+
+  filt = _mm_andnot_si128(hev, filt);  // zero filt in lanes where hev is set
+
+  q0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);  // add t80 back
+  q1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80);
+  p0 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
+  p1 = _mm_adds_epi16(
+      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80);
+
+  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);  // only rows p1, p0, q0, q1 are written
+  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
+
+void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
+                                           const uint8_t *_blimit0,  // *0 thresholds: span at s
+                                           const uint8_t *_limit0,
+                                           const uint8_t *_thresh0,
+                                           const uint8_t *_blimit1,  // *1 thresholds: span at s + 8
+                                           const uint8_t *_limit1,
+                                           const uint8_t *_thresh1,
+                                           int bd) {  // dual = two single-edge calls
+  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
+                                   bd);  // s + 8: the next 8 uint16_t samples
+}
+
+static INLINE void highbd_transpose(uint16_t *src[], int in_p,  // in_p: input row pitch
+                                    uint16_t *dst[], int out_p,  // out_p: output row pitch
+                                    int num_8x8_to_transpose) {  // block i: src[i] -> dst[i]
+  int idx8x8 = 0;
+  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
+  do {
+    uint16_t *in = src[idx8x8];
+    uint16_t *out = dst[idx8x8];
+
+    p0 = _mm_loadu_si128((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
+    p1 = _mm_loadu_si128((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
+    p2 = _mm_loadu_si128((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    p3 = _mm_loadu_si128((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
+    p4 = _mm_loadu_si128((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    p5 = _mm_loadu_si128((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
+    p6 = _mm_loadu_si128((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    p7 = _mm_loadu_si128((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
+    // 00 10 01 11 02 12 03 13
+    x0 = _mm_unpacklo_epi16(p0, p1);
+    // 20 30 21 31 22 32 23 33
+    x1 = _mm_unpacklo_epi16(p2, p3);
+    // 40 50 41 51 42 52 43 53
+    x2 = _mm_unpacklo_epi16(p4, p5);
+    // 60 70 61 71 62 72 63 73
+    x3 = _mm_unpacklo_epi16(p6, p7);
+    // 00 10 20 30 01 11 21 31
+    x4 = _mm_unpacklo_epi32(x0, x1);
+    // 40 50 60 70 41 51 61 71
+    x5 = _mm_unpacklo_epi32(x2, x3);
+    // 00 10 20 30 40 50 60 70
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 01 11 21 31 41 51 61 71
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 0*out_p), x6);
+    // (stored above) 00 10 20 30 40 50 60 70
+    _mm_storeu_si128((__m128i *)(out + 1*out_p), x7);
+    // (stored above) 01 11 21 31 41 51 61 71
+
+    // 02 12 22 32 03 13 23 33
+    x4 = _mm_unpackhi_epi32(x0, x1);
+    // 42 52 62 72 43 53 63 73
+    x5 = _mm_unpackhi_epi32(x2, x3);
+    // 02 12 22 32 42 52 62 72
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 2*out_p), x6);
+    // (stored above) 02 12 22 32 42 52 62 72
+    _mm_storeu_si128((__m128i *)(out + 3*out_p), x7);
+    // (stored above) 03 13 23 33 43 53 63 73
+
+    // 04 14 05 15 06 16 07 17
+    x0 = _mm_unpackhi_epi16(p0, p1);
+    // 24 34 25 35 26 36 27 37
+    x1 = _mm_unpackhi_epi16(p2, p3);
+    // 44 54 45 55 46 56 47 57
+    x2 = _mm_unpackhi_epi16(p4, p5);
+    // 64 74 65 75 66 76 67 77
+    x3 = _mm_unpackhi_epi16(p6, p7);
+    // 04 14 24 34 05 15 25 35
+    x4 = _mm_unpacklo_epi32(x0, x1);
+    // 44 54 64 74 45 55 65 75
+    x5 = _mm_unpacklo_epi32(x2, x3);
+    // 04 14 24 34 44 54 64 74
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 05 15 25 35 45 55 65 75
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 4*out_p), x6);
+    // (stored above) 04 14 24 34 44 54 64 74
+    _mm_storeu_si128((__m128i *)(out + 5*out_p), x7);
+    // (stored above) 05 15 25 35 45 55 65 75
+
+    // 06 16 26 36 07 17 27 37
+    x4 = _mm_unpackhi_epi32(x0, x1);
+    // 46 56 66 76 47 57 67 77
+    x5 = _mm_unpackhi_epi32(x2, x3);
+    // 06 16 26 36 46 56 66 76
+    x6 = _mm_unpacklo_epi64(x4, x5);
+    // 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi64(x4, x5);
+
+    _mm_storeu_si128((__m128i *)(out + 6*out_p), x6);
+    // (stored above) 06 16 26 36 46 56 66 76
+    _mm_storeu_si128((__m128i *)(out + 7*out_p), x7);
+    // (stored above) 07 17 27 37 47 57 67 77
+  } while (++idx8x8 < num_8x8_to_transpose);  // do-while: block 0 is always processed
+}
+
+static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1,  // two 8x8 inputs
+                                        int in_p, uint16_t *out, int out_p) {
+  uint16_t *src0[1];  // one-element arrays: highbd_transpose takes pointer arrays
+  uint16_t *src1[1];
+  uint16_t *dest0[1];
+  uint16_t *dest1[1];
+  src0[0] = in0;
+  src1[0] = in1;
+  dest0[0] = out;
+  dest1[0] = out + 8;  // in1's transpose lands 8 samples to the right of in0's
+  highbd_transpose(src0, in_p, dest0, out_p, 1);
+  highbd_transpose(src1, in_p, dest1, out_p, 1);
+}
+
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
+                                    const uint8_t *blimit,
+                                    const uint8_t *limit,
+                                    const uint8_t *thresh,
+                                    int count, int bd) {  // count is ignored
+  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);  // scratch for one transposed 8x8 block
+  uint16_t *src[1];
+  uint16_t *dst[1];
+  (void)count;
+
+  // Transpose 8x8: 8 rows, starting 4 samples before s
+  src[0] = s - 4;
+  dst[0] = t_dst;
+
+  highbd_transpose(src, p, dst, 8, 1);
+
+  // Loop filtering: run the horizontal filter on the transposed copy
+  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+                                   bd);  // t_dst + 4 * 8 is row 4 at pitch 8
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
+
+  // Transpose back into the caller's buffer
+  highbd_transpose(src, 8, dst, p, 1);
+}
+
+void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
+                                         const uint8_t *blimit0,
+                                         const uint8_t *limit0,
+                                         const uint8_t *thresh0,
+                                         const uint8_t *blimit1,
+                                         const uint8_t *limit1,
+                                         const uint8_t *thresh1,
+                                         int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);  // scratch: 8 rows at pitch 16
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  // Transpose 8x16: two 8x8 blocks, the second one 8 rows (p * 8) after the first
+  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering: dual horizontal filter on row 4 of the scratch buffer
+  vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                        thresh0, blimit1, limit1, thresh1, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;  // second 8 columns of the 16-wide scratch
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back: both 8x8 blocks in a single call
+  highbd_transpose(src, 16, dst, p, 2);
+}
+
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
+                                    const uint8_t *blimit,
+                                    const uint8_t *limit,
+                                    const uint8_t *thresh,
+                                    int count, int bd) {  // count is ignored
+  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);  // scratch for one transposed 8x8 block
+  uint16_t *src[1];
+  uint16_t *dst[1];
+  (void)count;
+
+  // Transpose 8x8: 8 rows, starting 4 samples before s
+  src[0] = s - 4;
+  dst[0] = t_dst;
+
+  highbd_transpose(src, p, dst, 8, 1);
+
+  // Loop filtering: run the horizontal filter on the transposed copy
+  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+                                   bd);  // t_dst + 4 * 8 is row 4 at pitch 8
+
+  src[0] = t_dst;
+  dst[0] = s - 4;
+
+  // Transpose back into the caller's buffer
+  highbd_transpose(src, 8, dst, p, 1);
+}
+
+void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
+                                         const uint8_t *blimit0,
+                                         const uint8_t *limit0,
+                                         const uint8_t *thresh0,
+                                         const uint8_t *blimit1,
+                                         const uint8_t *limit1,
+                                         const uint8_t *thresh1,
+                                         int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);  // scratch: 8 rows at pitch 16
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  // Transpose 8x16: two 8x8 blocks, the second one 8 rows (p * 8) after the first
+  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+  // Loop filtering: dual horizontal filter on row 4 of the scratch buffer
+  vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+                                        thresh0, blimit1, limit1, thresh1, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8;  // second 8 columns of the 16-wide scratch
+
+  dst[0] = s - 4;
+  dst[1] = s - 4 + p * 8;
+
+  // Transpose back: both 8x8 blocks in a single call
+  highbd_transpose(src, 16, dst, p, 2);
+}
+
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh,
+                                     int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);  // scratch: 16 rows at pitch 8
+  uint16_t *src[2];
+  uint16_t *dst[2];
+
+  src[0] = s - 8;  // block of 8 columns before s
+  src[1] = s;  // block of 8 columns starting at s
+  dst[0] = t_dst;
+  dst[1] = t_dst + 8 * 8;  // second block goes 8 scratch rows after the first
+
+  // Transpose 16x8
+  highbd_transpose(src, p, dst, 8, 2);
+
+  // Loop filtering: s argument is scratch row 8, where the second block begins
+  highbd_mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit,
+                                         thresh, bd);
+  src[0] = t_dst;
+  src[1] = t_dst + 8 * 8;
+  dst[0] = s - 8;
+  dst[1] = s;
+
+  // Transpose back into the caller's buffer
+  highbd_transpose(src, 8, dst, p, 2);
+}
+
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
+                                          int p,
+                                          const uint8_t *blimit,
+                                          const uint8_t *limit,
+                                          const uint8_t *thresh,
+                                          int bd) {
+  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);  // scratch: 16 rows at pitch 16
+
+  //  Transpose 16x16 as four 8x8 blocks (two highbd_transpose8x16 calls)
+  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);  // columns s-8 .. s-1
+  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);  // columns s .. s+7
+
+  //  Loop filtering: s argument is scratch row 8 (t_dst + 8 * 16)
+  highbd_mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
+                                          thresh, bd);
+
+  //  Transpose back: first the upper 8 picture rows, then the 8 rows after them
+  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
+  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+}
diff --git a/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
new file mode 100644
index 0000000000..fd46bef3d8
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -0,0 +1,179 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
+                                intptr_t count,  // number of coefficients
+                                int skip_block,  // nonzero: outputs are only zeroed
+                                const int16_t *zbin_ptr,  // the *_ptr tables are read at
+                                const int16_t *round_ptr,  // index 0 for coefficient 0
+                                const int16_t *quant_ptr,  // and index 1 for all others
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr,
+                                tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr,
+                                uint16_t *eob_ptr,
+                                const int16_t *scan,  // unused
+                                const int16_t *iscan) {
+  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;  // one "reg" = 4 coefficients
+  __m128i zbins[2];
+  __m128i nzbins[2];
+
+  zbins[0] = _mm_set_epi32((int)zbin_ptr[1],
+                           (int)zbin_ptr[1],
+                           (int)zbin_ptr[1],
+                           (int)zbin_ptr[0]);  // lane 0 = zbin_ptr[0], lanes 1-3 = zbin_ptr[1]
+  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+
+  nzbins[0] = _mm_setzero_si128();
+  nzbins[1] = _mm_setzero_si128();
+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);  // nzbins = -zbins
+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+  (void)scan;
+
+  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: walk backwards, dropping trailing groups of 4 inside (-zbin, zbin)
+    for (i = ((int)count / 4) - 1; i >= 0; i--) {
+      __m128i coeffs, cmp1, cmp2;
+      int test;
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);  // group 0 uses zbins[0]
+      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+      cmp1 = _mm_and_si128(cmp1, cmp2);
+      test = _mm_movemask_epi8(cmp1);
+      if (test == 0xffff)  // all 4 lanes strictly between -zbin and zbin
+        non_zero_regs--;
+      else
+        break;
+    }
+
+    // Quantization pass: only the first non_zero_regs groups are visited
+    for (i = 0; i < non_zero_regs; i++) {
+      __m128i coeffs, coeffs_sign, tmp1, tmp2;
+      int test;
+      int abs_coeff[4];
+      int coeff_sign[4];
+
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+      coeffs_sign = _mm_srai_epi32(coeffs, 31);  // 0 or -1 per lane
+      coeffs = _mm_sub_epi32(
+            _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);  // |coeff|
+      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+      tmp1 = _mm_or_si128(tmp1, tmp2);  // |coeff| >= zbin
+      test = _mm_movemask_epi8(tmp1);
+      _mm_storeu_si128((__m128i*)abs_coeff, coeffs);
+      _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);
+
+      for (j = 0; j < 4; j++) {
+        if (test & (1 << (4 * j))) {  // lane j owns movemask bits 4j..4j+3
+          int k = 4 * i + j;
+          const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0];  // shadows outer tmp1
+          const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1;
+          const uint32_t abs_qcoeff =
+              (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16);
+          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];  // reapply sign
+          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+          if (abs_qcoeff)
+            eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;  // track the largest iscan value
+        }
+      }
+    }
+  }
+  *eob_ptr = eob_i + 1;  // 0 when no nonzero quantized coefficient was produced
+}
+
+
+void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
+                                      intptr_t n_coeffs,
+                                      int skip_block,  // nonzero: outputs are only zeroed
+                                      const int16_t *zbin_ptr,  // the *_ptr tables are read at
+                                      const int16_t *round_ptr,  // index 0 for coefficient 0
+                                      const int16_t *quant_ptr,  // and index 1 for all others
+                                      const int16_t *quant_shift_ptr,
+                                      tran_low_t *qcoeff_ptr,
+                                      tran_low_t *dqcoeff_ptr,
+                                      const int16_t *dequant_ptr,
+                                      uint16_t *eob_ptr,
+                                      const int16_t *scan,  // unused
+                                      const int16_t *iscan) {
+  __m128i zbins[2];
+  __m128i nzbins[2];
+  int idx = 0;
+  int idx_arr[1024];  // NOTE(review): presumably n_coeffs <= 1024 here -- confirm
+  int i, eob = -1;
+  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
+  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
+  (void)scan;
+  zbins[0] = _mm_set_epi32(zbin1_tmp,
+                           zbin1_tmp,
+                           zbin1_tmp,
+                           zbin0_tmp);  // lane 0 = zbin0_tmp, lanes 1-3 = zbin1_tmp
+  zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+  nzbins[0] = _mm_setzero_si128();
+  nzbins[1] = _mm_setzero_si128();
+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);  // nzbins = -zbins
+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: queue each index whose coeff is not inside (-zbin, zbin)
+    for (i = 0; i < n_coeffs / 4; i++) {
+      __m128i coeffs, cmp1, cmp2;
+      int test;
+      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);  // group 0 uses zbins[0]
+      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+      cmp1 = _mm_and_si128(cmp1, cmp2);
+      test = _mm_movemask_epi8(cmp1);  // 4 mask bits per 32-bit lane
+      if (!(test & 0xf))  // lane 0 failed the range test
+        idx_arr[idx++] = i * 4;
+      if (!(test & 0xf0))  // lane 1
+        idx_arr[idx++] = i * 4 + 1;
+      if (!(test & 0xf00))  // lane 2
+        idx_arr[idx++] = i * 4 + 2;
+      if (!(test & 0xf000))  // lane 3
+        idx_arr[idx++] = i * 4 + 3;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = idx_arr[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask; relies on arithmetic shift
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 = abs_coeff
+                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+      qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;  // reapply sign
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+      if (abs_qcoeff)
+        eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;  // idx_arr[i] == rc
+    }
+  }
+  *eob_ptr = eob + 1;  // 0 when no nonzero quantized coefficient was produced
+}
+#endif
diff --git a/libs/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
new file mode 100644
index 0000000000..6c2a61e019
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
@@ -0,0 +1,287 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end (default 0)
+%macro HIGH_PROCESS_4x2x4 5-6 0
+  movh                  m0, [srcq +%2*2] ; low half of m0: src words at offset %2
+%if %1 == 1
+  movu                  m4, [ref1q+%3*2] ; first call: m4..m7 are written, not added to
+  movu                  m5, [ref2q+%3*2]
+  movu                  m6, [ref3q+%3*2]
+  movu                  m7, [ref4q+%3*2]
+  movhps                m0, [srcq +%4*2] ; high half of m0: src words at offset %4
+  movhps                m4, [ref1q+%5*2]
+  movhps                m5, [ref2q+%5*2]
+  movhps                m6, [ref3q+%5*2]
+  movhps                m7, [ref4q+%5*2]
+  mova                  m3, m0
+  mova                  m2, m0
+  psubusw               m3, m4
+  psubusw               m2, m5
+  psubusw               m4, m0
+  psubusw               m5, m0
+  por                   m4, m3 ; m4 = |src - ref1| (two saturating subtracts OR'd)
+  por                   m5, m2 ; m5 = |src - ref2|
+  pmaddwd               m4, m1 ; m1 = words of 1 (set in HIGH_SADNXN4D): pairwise dword sums
+  pmaddwd               m5, m1
+  mova                  m3, m0
+  mova                  m2, m0
+  psubusw               m3, m6
+  psubusw               m2, m7
+  psubusw               m6, m0
+  psubusw               m7, m0
+  por                   m6, m3 ; m6 = |src - ref3|
+  por                   m7, m2 ; m7 = |src - ref4|
+  pmaddwd               m6, m1
+  pmaddwd               m7, m1
+%else ; later calls: accumulate into m4..m7
+  movu                  m2, [ref1q+%3*2]
+  movhps                m0, [srcq +%4*2]
+  movhps                m2, [ref1q+%5*2]
+  mova                  m3, m0
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m4, m2 ; m4 += ref1 partial sums
+
+  movu                  m2, [ref2q+%3*2]
+  mova                  m3, m0
+  movhps                m2, [ref2q+%5*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m5, m2 ; m5 += ref2 partial sums
+
+  movu                  m2, [ref3q+%3*2]
+  mova                  m3, m0
+  movhps                m2, [ref3q+%5*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m6, m2 ; m6 += ref3 partial sums
+
+  movu                  m2, [ref4q+%3*2]
+  mova                  m3, m0
+  movhps                m2, [ref4q+%5*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m7, m2 ; m7 += ref4 partial sums
+%endif
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*4] ; presumably 2 rows x 2 bytes/sample -- confirm
+  lea                ref1q, [ref1q+ref_strideq*4]
+  lea                ref2q, [ref2q+ref_strideq*4]
+  lea                ref3q, [ref3q+ref_strideq*4]
+  lea                ref4q, [ref4q+ref_strideq*4]
+%endif
+%endmacro
+
+; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end (default 0)
+%macro HIGH_PROCESS_8x2x4 5-6 0
+  ; 1st 8 px
+  mova                  m0, [srcq +%2*2] ; aligned load of src; refs use movu
+%if %1 == 1
+  movu                  m4, [ref1q+%3*2] ; first call: m4..m7 are written, not added to
+  movu                  m5, [ref2q+%3*2]
+  movu                  m6, [ref3q+%3*2]
+  movu                  m7, [ref4q+%3*2]
+  mova                  m3, m0
+  mova                  m2, m0
+  psubusw               m3, m4
+  psubusw               m2, m5
+  psubusw               m4, m0
+  psubusw               m5, m0
+  por                   m4, m3 ; m4 = |src - ref1| (two saturating subtracts OR'd)
+  por                   m5, m2 ; m5 = |src - ref2|
+  pmaddwd               m4, m1 ; m1 = words of 1 (set in HIGH_SADNXN4D): pairwise dword sums
+  pmaddwd               m5, m1
+  mova                  m3, m0
+  mova                  m2, m0
+  psubusw               m3, m6
+  psubusw               m2, m7
+  psubusw               m6, m0
+  psubusw               m7, m0
+  por                   m6, m3 ; m6 = |src - ref3|
+  por                   m7, m2 ; m7 = |src - ref4|
+  pmaddwd               m6, m1
+  pmaddwd               m7, m1
+%else ; later calls: accumulate into m4..m7
+  mova                  m3, m0
+  movu                  m2, [ref1q+%3*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m4, m2
+  movu                  m2, [ref2q+%3*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m5, m2
+  movu                  m2, [ref3q+%3*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m6, m2
+  movu                  m2, [ref4q+%3*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m7, m2
+%endif
+
+  ; 2nd 8 px (always accumulates, regardless of %1)
+  mova                  m0, [srcq +(%4)*2]
+  mova                  m3, m0
+  movu                  m2, [ref1q+(%5)*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m4, m2
+  movu                  m2, [ref2q+(%5)*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m5, m2
+  movu                  m2, [ref3q+(%5)*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+  por                   m2, m3
+  mova                  m3, m0
+  pmaddwd               m2, m1
+  paddd                 m6, m2
+  movu                  m2, [ref4q+(%5)*2]
+  psubusw               m3, m2
+  psubusw               m2, m0
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*4] ; pointer bump sits before the last 3 ops;
+  lea                ref1q, [ref1q+ref_strideq*4] ; those ops use only m1, m2, m3 and m7,
+  lea                ref2q, [ref2q+ref_strideq*4] ; so moving the pointers here is safe
+  lea                ref3q, [ref3q+ref_strideq*4]
+  lea                ref4q, [ref4q+ref_strideq*4]
+%endif
+  por                   m2, m3
+  pmaddwd               m2, m1
+  paddd                 m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end (default 0)
+%macro HIGH_PROCESS_16x2x4 5-6 0
+  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8) ; first offsets: two 8-px halves, no advance
+  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6 ; second offsets; %6 applies here only
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end (default 0)
+%macro HIGH_PROCESS_32x2x4 5-6 0
+  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16) ; first offsets: two 16-px halves, no advance
+  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6 ; second offsets; %6 applies here only
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end (default 0)
+%macro HIGH_PROCESS_64x2x4 5-6 0
+  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32) ; first offsets: two 32-px halves, no advance
+  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6 ; second offsets; %6 applies here only
+%endmacro
+
+; void vpx_highbd_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
+;                         uint8_t *ref[4], int ref_stride,
+;                         uint32_t res[4]);
+; where NxN is any of the sizes instantiated after this macro (64x64 down to 4x4)
+%macro HIGH_SADNXN4D 2
+%if UNIX64
+cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif
+
+; set m1 (every 16-bit word = 1; used as the pmaddwd multiplier)
+  push                srcq ; srcq is borrowed as scratch and restored below
+  mov                 srcd, 0x00010001
+  movd                  m1, srcd
+  pshufd                m1, m1, 0x0 ; broadcast dword 0 to all four dwords
+  pop                 srcq
+
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+  mov                ref2q, [ref1q+gprsize*1] ; ref1q initially points at the ref[4] array
+  mov                ref3q, [ref1q+gprsize*2]
+  mov                ref4q, [ref1q+gprsize*3]
+  mov                ref1q, [ref1q+gprsize*0] ; loaded last: overwrites the array base
+
+; convert byte pointers to short pointers
+  shl                 srcq, 1
+  shl                ref2q, 1
+  shl                ref3q, 1
+  shl                ref4q, 1
+  shl                ref1q, 1
+
+  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 ; first row pair: sets m4..m7
+%rep (%2-4)/2
+  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 ; middle row pairs: accumulate
+%endrep
+  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 ; last row pair: no advance
+  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
+  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
+  movhlps               m0, m4 ; fold the upper two dwords of each accumulator
+  movhlps               m1, m5 ; onto its lower two (m1 is free to clobber now)
+  movhlps               m2, m6
+  movhlps               m3, m7
+  paddd                 m4, m0
+  paddd                 m5, m1
+  paddd                 m6, m2
+  paddd                 m7, m3
+  punpckldq             m4, m5 ; interleave ref1/ref2 partial sums
+  punpckldq             m6, m7 ; interleave ref3/ref4 partial sums
+  movhlps               m0, m4
+  movhlps               m1, m6
+  paddd                 m4, m0 ; low qword of m4 = {sad1, sad2}
+  paddd                 m6, m1 ; low qword of m6 = {sad3, sad4}
+  punpcklqdq            m4, m6 ; m4 = {sad1, sad2, sad3, sad4}
+  movifnidn             r4, r4mp ; res pointer (5th argument)
+  movu                [r4], m4 ; store the four 32-bit results
+  RET
+%endmacro
+
+
+INIT_XMM sse2
+HIGH_SADNXN4D 64, 64 ; arguments: %1 = width (selects HIGH_PROCESS_%1x2x4), %2 = rows
+HIGH_SADNXN4D 64, 32
+HIGH_SADNXN4D 32, 64
+HIGH_SADNXN4D 32, 32
+HIGH_SADNXN4D 32, 16
+HIGH_SADNXN4D 16, 32
+HIGH_SADNXN4D 16, 16
+HIGH_SADNXN4D 16,  8
+HIGH_SADNXN4D  8, 16
+HIGH_SADNXN4D  8,  8
+HIGH_SADNXN4D  8,  4
+HIGH_SADNXN4D  4,  8
+HIGH_SADNXN4D  4,  4
diff --git a/libs/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
new file mode 100644
index 0000000000..bc4b28db24
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
@@ -0,0 +1,363 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro HIGH_SAD_FN 4  ; shared prologue; args: width, height, GPR count (5 or 7), avg flag (0 = sad, 1 = sad_avg)
+%if %4 == 0  ; plain SAD: four arguments (src, src_stride, ref, ref_stride)
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg: fifth argument second_pred is added
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+                                    second_pred, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+                                              ref, ref_stride, \
+                                              second_pred, \
+                                              src_stride3, ref_stride3
+%if ARCH_X86_64  ; this variant names no n_rows register, so the row counter is aliased by hand
+%define n_rowsd r7d
+%else ; x86-32: counter lives in memory at r0m (x86inc's stack home of argument 0 -- TODO confirm it is safe to overwrite)
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
+  movsxdifnidn src_strideq, src_strided  ; x86inc helper: widen the int stride to a full-width register when they differ
+  movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+  lea         src_stride3q, [src_strideq*3]  ; 3*stride, used by the caller to address the fourth row of a group
+  lea         ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+  shl                 srcq, 1
+  shl                 refq, 1
+%if %4 == 1
+  shl         second_predq, 1
+%endif
+%endmacro
+
+; unsigned int vpx_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+%macro HIGH_SAD64XN 1-2 0  ; arg 1 = row count; optional arg 2 (default 0) = 1 builds the _avg variant using second_pred
+  HIGH_SAD_FN 64, %1, 5, %2
+  mov              n_rowsd, %1  ; one row is handled per loop iteration
+  pxor                  m0, m0  ; m0 = dword SAD accumulator
+  pxor                  m6, m6  ; m6 = zero, used to widen words to dwords
+
+.loop:
+  ; first half of each row
+  movu                  m1, [refq]  ; ref is loaded unaligned, 8 words per vector
+  movu                  m2, [refq+16]
+  movu                  m3, [refq+32]
+  movu                  m4, [refq+48]
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]  ; rounded average of ref and second_pred
+  pavgw                 m2, [second_predq+mmsize*1]
+  pavgw                 m3, [second_predq+mmsize*2]
+  pavgw                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]  ; second_pred is consumed contiguously, four vectors at a time
+%endif
+  mova                  m5, [srcq]  ; src is read with aligned loads
+  psubusw               m5, m1  ; saturating src - ref (0 where ref is larger)
+  psubusw               m1, [srcq]  ; saturating ref - src (0 where src is larger)
+  por                   m1, m5  ; one side is always 0, so OR yields the absolute difference
+  mova                  m5, [srcq+16]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+16]
+  por                   m2, m5
+  mova                  m5, [srcq+32]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+32]
+  por                   m3, m5
+  mova                  m5, [srcq+48]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+48]
+  por                   m4, m5
+  paddw                 m1, m2  ; add absolute differences as words
+  paddw                 m3, m4
+  movhlps               m2, m1  ; bring the upper four words down
+  movhlps               m4, m3
+  paddw                 m1, m2  ; low four words now hold the folded sums
+  paddw                 m3, m4
+  punpcklwd             m1, m6  ; zero-extend those four words to dwords
+  punpcklwd             m3, m6
+  paddd                 m0, m1  ; accumulate into the dword total
+  paddd                 m0, m3
+  ; second half of each row
+  movu                  m1, [refq+64]
+  movu                  m2, [refq+80]
+  movu                  m3, [refq+96]
+  movu                  m4, [refq+112]
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]
+  pavgw                 m2, [second_predq+mmsize*1]
+  pavgw                 m3, [second_predq+mmsize*2]
+  pavgw                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
+  mova                  m5, [srcq+64]
+  psubusw               m5, m1
+  psubusw               m1, [srcq+64]
+  por                   m1, m5
+  mova                  m5, [srcq+80]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+80]
+  por                   m2, m5
+  mova                  m5, [srcq+96]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+96]
+  por                   m3, m5
+  mova                  m5, [srcq+112]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+112]
+  por                   m4, m5
+  paddw                 m1, m2
+  paddw                 m3, m4
+  movhlps               m2, m1
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6
+  punpcklwd             m3, m6
+  lea                 refq, [refq+ref_strideq*2]  ; next row: stride*2 bytes because samples are 16-bit
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*2]
+  paddd                 m0, m3
+
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0  ; horizontal reduction: bring dwords 2-3 down
+  paddd                 m0, m1  ; dwords 0-1 hold the two remaining partial totals
+  punpckldq             m0, m6  ; spread them to dwords 0 and 2 with zeros between
+  movhlps               m1, m0  ; second partial total into dword 0 of m1
+  paddd                 m0, m1  ; dword 0 = complete SAD
+  movd                 eax, m0  ; return value
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+
+
+; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+%macro HIGH_SAD32XN 1-2 0  ; arg 1 = row count; optional arg 2 (default 0) = 1 builds the _avg variant using second_pred
+  HIGH_SAD_FN 32, %1, 5, %2
+  mov              n_rowsd, %1  ; one full 32-sample row (four vectors) per iteration
+  pxor                  m0, m0  ; m0 = dword SAD accumulator
+  pxor                  m6, m6  ; m6 = zero, used to widen words to dwords
+
+.loop:
+  movu                  m1, [refq]  ; unaligned loads of the ref row
+  movu                  m2, [refq+16]
+  movu                  m3, [refq+32]
+  movu                  m4, [refq+48]
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]  ; rounded average of ref and second_pred
+  pavgw                 m2, [second_predq+mmsize*1]
+  pavgw                 m3, [second_predq+mmsize*2]
+  pavgw                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]  ; advance past the four vectors just used
+%endif
+  mova                  m5, [srcq]  ; aligned load of src
+  psubusw               m5, m1  ; saturating src - ref
+  psubusw               m1, [srcq]  ; saturating ref - src
+  por                   m1, m5  ; combine: absolute difference per word
+  mova                  m5, [srcq+16]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+16]
+  por                   m2, m5
+  mova                  m5, [srcq+32]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+32]
+  por                   m3, m5
+  mova                  m5, [srcq+48]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+48]
+  por                   m4, m5
+  paddw                 m1, m2  ; word sums of the absolute differences
+  paddw                 m3, m4
+  movhlps               m2, m1  ; fold upper four words onto the lower four
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6  ; zero-extend to dwords
+  punpcklwd             m3, m6
+  lea                 refq, [refq+ref_strideq*2]  ; next row: stride*2 bytes because samples are 16-bit
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*2]
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0  ; horizontal reduction: bring dwords 2-3 down
+  paddd                 m0, m1
+  punpckldq             m0, m6  ; spread the two partial totals to dwords 0 and 2
+  movhlps               m1, m0
+  paddd                 m0, m1  ; dword 0 = complete SAD
+  movd                 eax, m0  ; return value
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+
+; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+%macro HIGH_SAD16XN 1-2 0  ; arg 1 = row count; optional arg 2 (default 0) = 1 builds the _avg variant using second_pred
+  HIGH_SAD_FN 16, %1, 5, %2
+  mov              n_rowsd, %1/2  ; two rows are handled per loop iteration
+  pxor                  m0, m0  ; m0 = dword SAD accumulator
+  pxor                  m6, m6  ; m6 = zero, used to widen words to dwords
+
+.loop:
+  movu                  m1, [refq]  ; row 0, samples 0-7
+  movu                  m2, [refq+16]  ; row 0, samples 8-15
+  movu                  m3, [refq+ref_strideq*2]  ; row 1, samples 0-7
+  movu                  m4, [refq+ref_strideq*2+16]  ; row 1, samples 8-15
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]  ; rounded average of ref and second_pred
+  pavgw                 m2, [second_predq+16]
+  pavgw                 m3, [second_predq+mmsize*2]
+  pavgw                 m4, [second_predq+mmsize*2+16]
+  lea         second_predq, [second_predq+mmsize*4]  ; second_pred is read as four consecutive vectors per iteration
+%endif
+  mova                  m5, [srcq]  ; aligned load of src
+  psubusw               m5, m1  ; saturating src - ref
+  psubusw               m1, [srcq]  ; saturating ref - src
+  por                   m1, m5  ; combine: absolute difference per word
+  mova                  m5, [srcq+16]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+16]
+  por                   m2, m5
+  mova                  m5, [srcq+src_strideq*2]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+src_strideq*2]
+  por                   m3, m5
+  mova                  m5, [srcq+src_strideq*2+16]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+src_strideq*2+16]
+  por                   m4, m5
+  paddw                 m1, m2  ; word sums of the absolute differences
+  paddw                 m3, m4
+  movhlps               m2, m1  ; fold upper four words onto the lower four
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6  ; zero-extend to dwords
+  punpcklwd             m3, m6
+  lea                 refq, [refq+ref_strideq*4]  ; advance two rows (2 rows x 2 bytes per sample)
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*4]
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0  ; horizontal reduction: bring dwords 2-3 down
+  paddd                 m0, m1
+  punpckldq             m0, m6  ; spread the two partial totals to dwords 0 and 2
+  movhlps               m1, m0
+  paddd                 m0, m1  ; dword 0 = complete SAD
+  movd                 eax, m0  ; return value
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
+
+
+; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
+;                                    uint8_t *ref, int ref_stride);
+%macro HIGH_SAD8XN 1-2 0  ; arg 1 = row count; optional arg 2 (default 0) = 1 builds the _avg variant using second_pred
+  HIGH_SAD_FN 8, %1, 7, %2
+  mov              n_rowsd, %1/4  ; four rows are handled per loop iteration
+  pxor                  m0, m0  ; m0 = dword SAD accumulator
+  pxor                  m6, m6  ; m6 = zero, used to widen words to dwords
+
+.loop:
+  movu                  m1, [refq]  ; row 0 (one vector = 8 samples = a full row)
+  movu                  m2, [refq+ref_strideq*2]  ; row 1
+  movu                  m3, [refq+ref_strideq*4]  ; row 2
+  movu                  m4, [refq+ref_stride3q*2]  ; row 3, via the precomputed 3*stride
+%if %2 == 1
+  pavgw                 m1, [second_predq+mmsize*0]  ; rounded average of ref and second_pred
+  pavgw                 m2, [second_predq+mmsize*1]
+  pavgw                 m3, [second_predq+mmsize*2]
+  pavgw                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]  ; second_pred is read as four consecutive vectors per iteration
+%endif
+  mova                  m5, [srcq]  ; aligned load of src
+  psubusw               m5, m1  ; saturating src - ref
+  psubusw               m1, [srcq]  ; saturating ref - src
+  por                   m1, m5  ; combine: absolute difference per word
+  mova                  m5, [srcq+src_strideq*2]
+  psubusw               m5, m2
+  psubusw               m2, [srcq+src_strideq*2]
+  por                   m2, m5
+  mova                  m5, [srcq+src_strideq*4]
+  psubusw               m5, m3
+  psubusw               m3, [srcq+src_strideq*4]
+  por                   m3, m5
+  mova                  m5, [srcq+src_stride3q*2]
+  psubusw               m5, m4
+  psubusw               m4, [srcq+src_stride3q*2]
+  por                   m4, m5
+  paddw                 m1, m2  ; word sums of the absolute differences
+  paddw                 m3, m4
+  movhlps               m2, m1  ; fold upper four words onto the lower four
+  movhlps               m4, m3
+  paddw                 m1, m2
+  paddw                 m3, m4
+  punpcklwd             m1, m6  ; zero-extend to dwords
+  punpcklwd             m3, m6
+  lea                 refq, [refq+ref_strideq*8]  ; advance four rows (4 rows x 2 bytes per sample)
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*8]
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0  ; horizontal reduction: bring dwords 2-3 down
+  paddd                 m0, m1
+  punpckldq             m0, m6  ; spread the two partial totals to dwords 0 and 2
+  movhlps               m1, m0
+  paddd                 m0, m1  ; dword 0 = complete SAD
+  movd                 eax, m0  ; return value
+  RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
diff --git a/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
new file mode 100644
index 0000000000..30ee81b688
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -0,0 +1,1037 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times  8 dw  8  ; eight words of 8: rounding term added before the >>4 of the bilinear filter
+bilin_filter_m_sse2: times  8 dw 16  ; entry 0: weights (16, 0); every entry is two 8-word vectors (32 bytes)
+                     times  8 dw  0
+                     times  8 dw 14  ; entry 1: (14, 2)
+                     times  8 dw  2
+                     times  8 dw 12  ; entry 2: (12, 4)
+                     times  8 dw  4
+                     times  8 dw 10  ; entry 3: (10, 6)
+                     times  8 dw  6
+                     times 16 dw  8  ; entry 4: (8, 8), both vectors emitted by this one line
+                     times  8 dw  6  ; entry 5: (6, 10)
+                     times  8 dw 10
+                     times  8 dw  4  ; entry 6: (4, 12)
+                     times  8 dw 12
+                     times  8 dw  2  ; entry 7: (2, 14)
+                     times  8 dw 14
+
+SECTION .text
+
+; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+;                               int x_offset, int y_offset,
+;                               const uint8_t *dst, ptrdiff_t dst_stride,
+;                               int height, unsigned int *sse);
+;
+; This function returns the SE and stores SSE in the given pointer.
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse -- adds (src-dst) to sum and (src-dst)^2 to sse; clobbers the first four operands
+  psubw                %3, %4       ; diff2 = src2 - dst2 (signed words)
+  psubw                %1, %2       ; diff1 = src1 - dst1 (signed words)
+  mova                 %4, %3       ; make copies to manipulate to calc sum
+  mova                 %2, %1       ; use originals for calc sse
+  pmaddwd              %3, %3       ; diff2 squared, adjacent pairs added into dwords
+  paddw                %4, %2       ; diff1 + diff2 per word
+  pmaddwd              %1, %1       ; diff1 squared, adjacent pairs added into dwords
+  movhlps              %2, %4       ; bring the upper four word sums down
+  paddd                %6, %3       ; sse += diff2 squares
+  paddw                %4, %2       ; low four words = folded sum of differences
+  pxor                 %2, %2
+  pcmpgtw              %2, %4       ; mask for 0 > %4 (sum)
+  punpcklwd            %4, %2       ; sign-extend word to dword
+  paddd                %6, %1       ; sse += diff1 squares
+  paddd                %5, %4       ; sum += sign-extended differences
+
+%endmacro
+
+%macro STORE_AND_RET 0  ; epilogue: reduce m7 (sse) and m6 (sum) to scalars, store sse through the pointer, return sum
+%if mmsize == 16
+  ; NOTE(review): SUM_SSE in this file already sign-extends the word sums and
+  ; accumulates dwords into m6 (sum) and m7 (sse), so no widening is needed
+  ; here; the code below only adds the four dwords of each register together
+  ; before outputting one dword apiece.
+  movhlps              m3, m7           ; bring dwords 2-3 down
+  movhlps              m4, m6
+  paddd                m7, m3           ; dwords 0-1 hold the remaining partial totals
+  paddd                m6, m4
+  pshufd               m3, m7, 0x1      ; copy dword 1 into dword 0 of the scratch register
+  pshufd               m4, m6, 0x1
+  paddd                m7, m3           ; dword 0 = total sse
+  paddd                m6, m4           ; dword 0 = total sum
+  mov                  r1, ssem         ; r1 = unsigned int *sse
+  movd               [r1], m7           ; store sse
+  movd                rax, m6           ; store sum as return value
+%endif
+  RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE  0  ; advance srcq by 2*src_stride bytes on either path
+%if ARCH_X86=1 && CONFIG_PIC=1  ; 32-bit PIC build: the stride is taken from its memory operand form
+  add                srcq, src_stridemp  ; first of two adds = stride*2 in total
+  add                srcq, src_stridemp
+%else
+  lea                srcq, [srcq + src_strideq*2]  ; same advance using the stride register
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+
+%ifdef PIC    ; 64bit PIC
+  %if %2 == 1 ; avg
+    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+                                      x_offset, y_offset, \
+                                      dst, dst_stride, \
+                                      sec, sec_stride, height, sse
+    %define sec_str sec_strideq
+  %else
+    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+                                  y_offset, dst, dst_stride, height, sse
+  %endif
+  %define block_height heightd
+  %define bilin_filter sseq
+%else
+  %if ARCH_X86=1 && CONFIG_PIC=1
+    %if %2 == 1 ; avg
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                  x_offset, y_offset, \
+                                  dst, dst_stride, \
+                                  sec, sec_stride, \
+                                  height, sse, g_bilin_filter, g_pw_8
+      %define block_height dword heightm
+      %define sec_str sec_stridemp
+
+      ; Store bilin_filter and pw_8 location in stack
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %else
+      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                x_offset, y_offset, dst, dst_stride, height, \
+                                sse, g_bilin_filter, g_pw_8
+      %define block_height heightd
+
+      ; Store bilin_filter and pw_8 location in stack
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %endif
+  %else
+    %if %2 == 1 ; avg
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+                                             x_offset, y_offset, \
+                                             dst, dst_stride, \
+                                             sec, sec_stride, \
+                                             height, sse
+      %if ARCH_X86_64
+      %define block_height heightd
+      %define sec_str sec_strideq
+      %else
+      %define block_height dword heightm
+      %define sec_str sec_stridemp
+      %endif
+    %else
+      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                              x_offset, y_offset, dst, dst_stride, height, sse
+      %define block_height heightd
+    %endif
+
+    %define bilin_filter bilin_filter_m
+  %endif
+%endif
+
+  ASSERT               %1 <= 16         ; m6 overflows if w > 16
+  pxor                 m6, m6           ; sum
+  pxor                 m7, m7           ; sse
+
+%if %1 < 16
+  sar                   block_height, 1
+%endif
+%if %2 == 1 ; avg
+  shl             sec_str, 1
+%endif
+
+  ; FIXME(rbultje) replace by jumptable?
+  test          x_offsetd, x_offsetd
+  jnz .x_nonzero
+  ; x_offset == 0
+  test          y_offsetd, y_offsetd
+  jnz .x_zero_y_nonzero
+
+  ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq + 16]
+  mova                 m1, [dstq]
+  mova                 m3, [dstq + 16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m2, [secq+16]
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq + src_strideq*2]
+  mova                 m1, [dstq]
+  mova                 m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m2, [secq]
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_zero_y_zero_loop
+  STORE_AND_RET
+
+.x_zero_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_zero_y_nonhalf
+
+  ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m4, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*2+16]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+16]
+  pavgw                m0, m4
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*4]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+dst_strideq*2]
+  pavgw                m0, m1
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m1, [secq]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_zero_y_half_loop
+  STORE_AND_RET
+
+.x_zero_y_nonhalf:
+  ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+y_offsetq]
+  mova                 m9, [bilin_filter+y_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq + 16]
+  movu                 m4, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*2+16]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+16]
+  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+  ; slightly faster because of pmullw latency. It would also cut our rodata
+  ; tables in half for this function, and save 1-2 registers on x86-64.
+  pmullw               m1, filter_y_a
+  pmullw               m5, filter_y_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_y_a
+  pmullw               m4, filter_y_b
+  paddw                m0, filter_rnd
+  paddw                m1, m5
+  paddw                m0, m4
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq*2]
+  movu                 m5, [srcq+src_strideq*4]
+  mova                 m4, m1
+  mova                 m2, [dstq]
+  mova                 m3, [dstq+dst_strideq*2]
+  pmullw               m1, filter_y_a
+  pmullw               m5, filter_y_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_y_a
+  pmullw               m4, filter_y_b
+  paddw                m0, filter_rnd
+  paddw                m1, m5
+  paddw                m0, m4
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m1, [secq]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonzero:
+  cmp           x_offsetd, 8
+  jne .x_nonhalf
+  ; x_offset == 0.5
+  test          y_offsetd, y_offsetd
+  jnz .x_half_y_nonzero
+
+  ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq + 16]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + 18]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq + 16]
+  pavgw                m0, m4
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq + src_strideq*2]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + src_strideq*2 + 2]
+  mova                 m2, [dstq]
+  mova                 m3, [dstq + dst_strideq*2]
+  pavgw                m0, m4
+  pavgw                m1, m5
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m1, [secq]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_half_y_zero_loop
+  STORE_AND_RET
+
+.x_half_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_half_y_nonhalf
+
+  ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+  pavgw                m1, m3
+.x_half_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq + 16]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + 18]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  pavgw                m0, m2
+  pavgw                m1, m3
+  mova                 m4, [dstq]
+  mova                 m5, [dstq + 16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+  mova                 m0, m2
+  mova                 m1, m3
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+.x_half_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq + src_strideq*2]
+  movu                 m4, [srcq + 2]
+  movu                 m5, [srcq + src_strideq*2 + 2]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  pavgw                m0, m2
+  pavgw                m2, m3
+  mova                 m4, [dstq]
+  mova                 m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m2, [secq]
+%endif
+  SUM_SSE              m0, m4, m2, m5, m6, m7
+  mova                 m0, m3
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_half_y_half_loop
+  STORE_AND_RET
+
+.x_half_y_nonhalf:
+  ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+y_offsetq]
+  mova                 m9, [bilin_filter+y_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else  ; x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+  pavgw                m1, m3
+.x_half_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+16]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+18]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m1, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m1, filter_rnd
+  paddw                m1, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  psrlw                m1, 4
+  paddw                m0, m2
+  mova                 m2, [dstq]
+  psrlw                m0, 4
+  mova                 m3, [dstq+16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+  mova                 m0, m4
+  mova                 m1, m5
+
+  lea                srcq, [srcq + src_strideq*2]
+  lea                dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  lea                srcq, [srcq + src_strideq*2]
+  pavgw                m0, m2
+.x_half_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+src_strideq*2]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+src_strideq*2+2]
+  pavgw                m2, m4
+  pavgw                m3, m5
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m4, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m4, filter_rnd
+  paddw                m4, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  psrlw                m4, 4
+  paddw                m0, m2
+  mova                 m2, [dstq]
+  psrlw                m0, 4
+  mova                 m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m4, [secq]
+%endif
+  SUM_SSE              m0, m2, m4, m3, m6, m7
+  mova                 m0, m5
+
+  lea                srcq, [srcq + src_strideq*4]
+  lea                dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf:
+  test          y_offsetd, y_offsetd
+  jnz .x_nonhalf_y_nonzero
+
+  ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+  mova                 m9, [bilin_filter+x_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+16]
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m1, m3
+  paddw                m0, m2
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq*2]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+src_strideq*2+2]
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+dst_strideq*2]
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m1, m3
+  paddw                m0, m2
+  psrlw                m1, 4
+  psrlw                m0, 4
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m1, [secq]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+
+  lea                srcq, [srcq+src_strideq*4]
+  lea                dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_nonhalf_y_nonhalf
+
+  ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+  mova                 m9, [bilin_filter+x_offsetq+16]
+  mova                m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+2]
+  movu                 m3, [srcq+18]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  paddw                m0, m2
+  paddw                m1, m3
+  psrlw                m0, 4
+  psrlw                m1, 4
+  lea                srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+16]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+18]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+16]
+  psrlw                m2, 4
+  psrlw                m3, 4
+  pavgw                m0, m2
+  pavgw                m1, m3
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m4, m1, m5, m6, m7
+  mova                 m0, m2
+  mova                 m1, m3
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m0, m2
+  psrlw                m0, 4
+  lea                srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+  movu                 m2, [srcq]
+  movu                 m3, [srcq+src_strideq*2]
+  movu                 m4, [srcq+2]
+  movu                 m5, [srcq+src_strideq*2+2]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  mova                 m4, [dstq]
+  mova                 m5, [dstq+dst_strideq*2]
+  psrlw                m2, 4
+  psrlw                m3, 4
+  pavgw                m0, m2
+  pavgw                m2, m3
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m2, [secq]
+%endif
+  SUM_SSE              m0, m4, m2, m5, m6, m7
+  mova                 m0, m3
+
+  lea                srcq, [srcq+src_strideq*4]
+  lea                dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; loading filter - this is same as in 8-bit depth
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+  mova                 m9, [bilin_filter+x_offsetq+16]
+  mova                m10, [bilin_filter+y_offsetq]
+  mova                m11, [bilin_filter+y_offsetq+16]
+  mova                m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else   ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case, there is NO unused register. Used src_stride register. Later,
+; src_stride has to be loaded from stack when it is needed.
+%define tempq src_strideq
+  mov tempq, g_bilin_filterm
+  add           x_offsetq, tempq
+  add           y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+  add           y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+; end of load filter
+
+  ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  movu                 m1, [srcq+16]
+  movu                 m3, [srcq+18]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  pmullw               m1, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m1, filter_rnd
+  paddw                m0, m2
+  paddw                m1, m3
+  psrlw                m0, 4
+  psrlw                m1, 4
+
+  INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m4, [srcq+2]
+  movu                 m3, [srcq+16]
+  movu                 m5, [srcq+18]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  psrlw                m2, 4
+  psrlw                m3, 4
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  pmullw               m1, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m0, m2
+  paddw                m1, filter_rnd
+  mova                 m2, [dstq]
+  paddw                m1, m3
+  psrlw                m0, 4
+  psrlw                m1, 4
+  mova                 m3, [dstq+16]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  pavgw                m1, [secq+16]
+%endif
+  SUM_SSE              m0, m2, m1, m3, m6, m7
+  mova                 m0, m4
+  mova                 m1, m5
+
+  INC_SRC_BY_SRC_STRIDE
+  lea                dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%else ; %1 < 16
+  movu                 m0, [srcq]
+  movu                 m2, [srcq+2]
+  pmullw               m0, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m0, m2
+  psrlw                m0, 4
+
+  INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+  movu                 m2, [srcq]
+  movu                 m4, [srcq+2]
+  INC_SRC_BY_SRC_STRIDE
+  movu                 m3, [srcq]
+  movu                 m5, [srcq+2]
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m3, filter_x_a
+  pmullw               m5, filter_x_b
+  paddw                m3, filter_rnd
+  paddw                m2, m4
+  paddw                m3, m5
+  psrlw                m2, 4
+  psrlw                m3, 4
+  mova                 m4, m2
+  mova                 m5, m3
+  pmullw               m0, filter_y_a
+  pmullw               m2, filter_y_b
+  paddw                m0, filter_rnd
+  pmullw               m4, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m0, m2
+  paddw                m4, filter_rnd
+  mova                 m2, [dstq]
+  paddw                m4, m3
+  psrlw                m0, 4
+  psrlw                m4, 4
+  mova                 m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+  pavgw                m0, [secq]
+  add                secq, sec_str
+  pavgw                m4, [secq]
+%endif
+  SUM_SSE              m0, m2, m4, m3, m6, m7
+  mova                 m0, m5
+
+  INC_SRC_BY_SRC_STRIDE
+  lea                dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+%endif
+  dec                   block_height
+  jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE  8
+SUBPEL_VARIANCE 16
+; The two expansions above pass only %1 (8 or 16); the two below also pass %2 == 1, which enables the "avg" paths.
+INIT_XMM sse2
+SUBPEL_VARIANCE  8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
new file mode 100644
index 0000000000..923418a992
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
@@ -0,0 +1,313 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; vpx_highbd_calc16x16var_sse2
+; Sums (src - ref) and (src - ref)^2 over 16 rows of 16 words each.
+;    src_ptr        arg(0)  source rows, processed as packed 16-bit words
+;    source_stride  arg(1)  source row pitch; doubled below to get bytes
+;    ref_ptr        arg(2)  reference rows, processed as packed 16-bit words
+;    recon_stride   arg(3)  reference row pitch; doubled below to get bytes
+;    SSE            arg(4)  out: dword total of the squared differences
+;    Sum            arg(5)  out: dword total of the (signed) differences
+; NOTE(review): no instruction below deliberately sets a return value in rax.
+global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
+sym(vpx_highbd_calc16x16var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        add         rax,            rax ; source stride in bytes
+        add         rdx,            rdx ; recon stride in bytes
+
+        ; Prefetch data: rows 0-3 of src, then rows 0-3 of ref, at byte offsets 0 and 16
+        prefetcht0      [rsi]
+        prefetcht0      [rsi+16]
+        prefetcht0      [rsi+rax]
+        prefetcht0      [rsi+rax+16]
+        lea             rbx,    [rsi+rax*2] ; rbx = src row 2
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rax]
+        prefetcht0      [rbx+rax+16]
+
+        prefetcht0      [rdi]
+        prefetcht0      [rdi+16]
+        prefetcht0      [rdi+rdx]
+        prefetcht0      [rdi+rdx+16]
+        lea             rbx,    [rdi+rdx*2] ; rbx = ref row 2
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rdx]
+        prefetcht0      [rbx+rdx+16]
+
+        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
+        mov         rcx,            16 ; rows left to process
+
+.var16loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi] ; src row 0, words 0-7
+        movdqu      xmm2,           XMMWORD PTR [rdi] ; ref row 0, words 0-7
+
+        lea             rbx,    [rsi+rax*2] ; prefetch the src row pair used by the next iteration
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rax]
+        prefetcht0      [rbx+rax+16]
+        lea             rbx,    [rdi+rdx*2] ; prefetch the ref row pair used by the next iteration
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+16]
+        prefetcht0      [rbx+rdx]
+        prefetcht0      [rbx+rdx+16]
+
+        pxor        xmm5,           xmm5 ; xmm5 = word-lane sums of this iteration's differences
+
+        psubw       xmm1,           xmm2 ; word differences src - ref
+        movdqu      xmm3,           XMMWORD PTR [rsi+16] ; src row 0, words 8-15
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1 ; square each difference, add adjacent pairs into dwords
+        movdqu      xmm2,           XMMWORD PTR [rdi+16] ; ref row 0, words 8-15
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2
+        movdqu      xmm1,           XMMWORD PTR [rsi+rax] ; src row 1, words 0-7
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx] ; ref row 1, words 0-7
+        paddd       xmm6,           xmm3
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax+16] ; src row 1, words 8-15
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx+16] ; ref row 1, words 8-15
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        paddd       xmm6,           xmm3
+
+        movdqa      xmm1,           xmm5
+        movdqa      xmm2,           xmm5
+        pcmpgtw     xmm1,           xmm0 ; lanes where sum > 0
+        pcmpeqw     xmm2,           xmm0 ; lanes where sum == 0
+        por         xmm1,           xmm2 ; lanes where sum >= 0
+        pcmpeqw     xmm1,           xmm0 ; invert: 0xFFFF in lanes where sum < 0
+        movdqa      xmm2,           xmm5
+        punpcklwd   xmm5,           xmm1 ; sign-extend low four word sums to dwords
+        punpckhwd   xmm2,           xmm1 ; sign-extend high four word sums to dwords
+        paddd       xmm7,           xmm5
+        paddd       xmm7,           xmm2
+
+        lea         rsi,            [rsi + 2*rax] ; advance src by two rows
+        lea         rdi,            [rdi + 2*rdx] ; advance ref by two rows
+        sub         rcx,            2 ; two rows consumed per iteration
+        jnz         .var16loop
+        ; Fold the four dword lanes of xmm6 (sse) and xmm7 (sum) into lane 0
+        movdqa      xmm4,           xmm6
+        punpckldq   xmm6,           xmm0 ; spread dwords 0,1 into lanes 0 and 2
+
+        punpckhdq   xmm4,           xmm0 ; spread dwords 2,3 into lanes 0 and 2
+        movdqa      xmm5,           xmm7
+
+        paddd       xmm6,           xmm4
+        punpckldq   xmm7,           xmm0
+
+        punpckhdq   xmm5,           xmm0
+        paddd       xmm7,           xmm5
+
+        movdqa      xmm4,           xmm6
+        movdqa      xmm5,           xmm7
+
+        psrldq      xmm4,           8 ; bring lane 2 down to lane 0
+        psrldq      xmm5,           8
+
+        paddd       xmm6,           xmm4 ; lane 0 = total of all four sse dwords
+        paddd       xmm7,           xmm5 ; lane 0 = total of all four sum dwords
+
+        mov         rdi,            arg(4)   ; [SSE]
+        mov         rax,            arg(5)   ; [Sum]
+
+        movd DWORD PTR [rdi],       xmm6
+        movd DWORD PTR [rax],       xmm7
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+; vpx_highbd_calc8x8var_sse2
+; Sums (src - ref) and (src - ref)^2 over 8 rows of 8 words each.
+;    src_ptr        arg(0)  source rows, processed as packed 16-bit words
+;    source_stride  arg(1)  source row pitch; doubled below to get bytes
+;    ref_ptr        arg(2)  reference rows, processed as packed 16-bit words
+;    recon_stride   arg(3)  reference row pitch; doubled below to get bytes
+;    SSE            arg(4)  out: dword total of the squared differences
+;    Sum            arg(5)  out: dword total of the (signed) differences
+; NOTE(review): no instruction below deliberately sets a return value in rax.
+global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
+sym(vpx_highbd_calc8x8var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        add         rax,            rax ; source stride in bytes
+        add         rdx,            rdx ; recon stride in bytes
+
+        ; Prefetch data: rows 0-3 of src, then rows 0-3 of ref
+        prefetcht0      [rsi]
+        prefetcht0      [rsi+rax]
+        lea             rbx,    [rsi+rax*2] ; rbx = src row 2
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+
+        prefetcht0      [rdi]
+        prefetcht0      [rdi+rdx]
+        lea             rbx,    [rdi+rdx*2] ; rbx = ref row 2
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+
+        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
+        mov         rcx,            8 ; rows left to process
+
+.var8loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi] ; src row 0 of this group
+        movdqu      xmm2,           XMMWORD PTR [rdi] ; ref row 0 of this group
+
+        lea             rbx,    [rsi+rax*4] ; prefetch src rows +4 and +5
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+        lea             rbx,    [rbx+rax*2] ; prefetch src rows +6 and +7
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+        lea             rbx,    [rdi+rdx*4] ; prefetch ref rows +4 and +5
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+        lea             rbx,    [rbx+rdx*2] ; prefetch ref rows +6 and +7
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+
+        pxor        xmm5,           xmm5 ; xmm5 = word-lane sums of this iteration's differences
+
+        psubw       xmm1,           xmm2 ; word differences src - ref
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax] ; src row 1
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1 ; square each difference, add adjacent pairs into dwords
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx] ; ref row 1
+        paddd       xmm6,           xmm1
+
+        lea         rsi,            [rsi + 2*rax] ; step src to row 2 of this group
+        lea         rdi,            [rdi + 2*rdx] ; step ref to row 2 of this group
+
+        psubw       xmm3,           xmm2
+        movdqu      xmm1,           XMMWORD PTR [rsi] ; src row 2
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        movdqu      xmm2,           XMMWORD PTR [rdi] ; ref row 2
+        paddd       xmm6,           xmm3
+
+        psubw       xmm1,           xmm2
+        movdqu      xmm3,           XMMWORD PTR [rsi+rax] ; src row 3
+        paddw       xmm5,           xmm1
+        pmaddwd     xmm1,           xmm1
+        movdqu      xmm2,           XMMWORD PTR [rdi+rdx] ; ref row 3
+        paddd       xmm6,           xmm1
+
+        psubw       xmm3,           xmm2
+        paddw       xmm5,           xmm3
+        pmaddwd     xmm3,           xmm3
+        paddd       xmm6,           xmm3
+
+        movdqa      xmm1,           xmm5
+        movdqa      xmm2,           xmm5
+        pcmpgtw     xmm1,           xmm0 ; lanes where sum > 0
+        pcmpeqw     xmm2,           xmm0 ; lanes where sum == 0
+        por         xmm1,           xmm2 ; lanes where sum >= 0
+        pcmpeqw     xmm1,           xmm0 ; invert: 0xFFFF in lanes where sum < 0
+        movdqa      xmm2,           xmm5
+        punpcklwd   xmm5,           xmm1 ; sign-extend low four word sums to dwords
+        punpckhwd   xmm2,           xmm1 ; sign-extend high four word sums to dwords
+        paddd       xmm7,           xmm5
+        paddd       xmm7,           xmm2
+
+        lea         rsi,            [rsi + 2*rax] ; step src to the next group of four rows
+        lea         rdi,            [rdi + 2*rdx] ; step ref to the next group of four rows
+        sub         rcx,            4 ; four rows consumed per iteration
+        jnz         .var8loop
+        ; Fold the four dword lanes of xmm6 (sse) and xmm7 (sum) into lane 0
+        movdqa      xmm4,           xmm6
+        punpckldq   xmm6,           xmm0 ; spread dwords 0,1 into lanes 0 and 2
+
+        punpckhdq   xmm4,           xmm0 ; spread dwords 2,3 into lanes 0 and 2
+        movdqa      xmm5,           xmm7
+
+        paddd       xmm6,           xmm4
+        punpckldq   xmm7,           xmm0
+
+        punpckhdq   xmm5,           xmm0
+        paddd       xmm7,           xmm5
+
+        movdqa      xmm4,           xmm6
+        movdqa      xmm5,           xmm7
+
+        psrldq      xmm4,           8 ; bring lane 2 down to lane 0
+        psrldq      xmm5,           8
+
+        paddd       xmm6,           xmm4 ; lane 0 = total of all four sse dwords
+        paddd       xmm7,           xmm5 ; lane 0 = total of all four sum dwords
+
+        mov         rdi,            arg(4)   ; [SSE]
+        mov         rax,            arg(5)   ; [Sum]
+
+        movd DWORD PTR [rdi],       xmm6
+        movd DWORD PTR [rax],       xmm7
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
new file mode 100644
index 0000000000..81ec5dbdb9
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
@@ -0,0 +1,593 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+
+#include "vpx_ports/mem.h"
+
+typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
+                                        const uint16_t *ref, int ref_stride,
+                                        uint32_t *sse, int *sum);
+// Kernels matching high_variance_fn_t; see highbd_variance_impl_sse2.asm.
+uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    uint32_t *sse, int *sum);
+
+uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+                                      const uint16_t *ref, int ref_stride,
+                                      uint32_t *sse, int *sum);
+
+// Tiles the w x h area with block_size-square var_fn calls and adds every
+// tile's results straight into *sse and *sum, which are zeroed first; no
+// rounding is applied here.
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+                                   const uint16_t *ref, int ref_stride,
+                                   int w, int h, uint32_t *sse, int *sum,
+                                   high_variance_fn_t var_fn, int block_size) {
+  unsigned int tile_sse;
+  int row, col, tile_sum;
+  *sse = 0;
+  *sum = 0;
+  for (row = 0; row < h; row += block_size) {
+    for (col = 0; col < w; col += block_size) {
+      var_fn(src + src_stride * row + col, src_stride,
+             ref + ref_stride * row + col, ref_stride, &tile_sse, &tile_sum);
+      *sse += tile_sse;
+      *sum += tile_sum;
+    }
+  }
+}
+
+// Adds up block_size-square var_fn results over the w x h area, then stores
+// the totals scaled down with rounding: sum by 2 bits and sse by 4 bits.
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  unsigned int tile_sse;
+  int row, col, tile_sum;
+  uint64_t sse_total = 0;
+  int32_t sum_total = 0;
+  for (row = 0; row < h; row += block_size) {
+    for (col = 0; col < w; col += block_size) {
+      var_fn(src + src_stride * row + col, src_stride,
+             ref + ref_stride * row + col, ref_stride, &tile_sse, &tile_sum);
+      sse_total += tile_sse;
+      sum_total += tile_sum;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_total, 2);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_total, 4);
+}
+
+// Adds up block_size-square var_fn results over the w x h area, then stores
+// the totals scaled down with rounding: sum by 4 bits and sse by 8 bits.
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  unsigned int tile_sse;
+  int row, col, tile_sum;
+  uint64_t sse_total = 0;
+  int32_t sum_total = 0;
+  for (row = 0; row < h; row += block_size) {
+    for (col = 0; col < w; col += block_size) {
+      var_fn(src + src_stride * row + col, src_stride,
+             ref + ref_stride * row + col, ref_stride, &tile_sse, &tile_sum);
+      sse_total += tile_sse;
+      sum_total += tile_sum;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_total, 4);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_total, 8);
+}
+
+
+// HIGH_GET_VAR(S) defines three SxS wrappers that pass the CONVERT_TO_SHORTPTR
+// forms of src8/ref8 to vpx_highbd_calc<S>x<S>var_sse2. The 10_ and 12_
+// wrappers then round *sum by 2 / 4 bits and *sse by 4 / 8 bits in place.
+#define HIGH_GET_VAR(S) \
+void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                       const uint8_t *ref8, int ref_stride, \
+                                       uint32_t *sse, int *sum) { \
+  vpx_highbd_calc##S##x##S##var_sse2(CONVERT_TO_SHORTPTR(src8), src_stride, \
+                                     CONVERT_TO_SHORTPTR(ref8), ref_stride, \
+                                     sse, sum); \
+} \
+\
+void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                          const uint8_t *ref8, int ref_stride, \
+                                          uint32_t *sse, int *sum) { \
+  vpx_highbd_calc##S##x##S##var_sse2(CONVERT_TO_SHORTPTR(src8), src_stride, \
+                                     CONVERT_TO_SHORTPTR(ref8), ref_stride, \
+                                     sse, sum); \
+  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+} \
+\
+void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                          const uint8_t *ref8, int ref_stride, \
+                                          uint32_t *sse, int *sum) { \
+  vpx_highbd_calc##S##x##S##var_sse2(CONVERT_TO_SHORTPTR(src8), src_stride, \
+                                     CONVERT_TO_SHORTPTR(ref8), ref_stride, \
+                                     sse, sum); \
+  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+}
+
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
+
+#undef HIGH_GET_VAR
+
+// VAR_FN(w, h, block_size, shift) defines the 8/10/12-bit w x h variance
+// functions: *sse - ((sum * sum) >> shift). The 10/12-bit helpers round sse
+// and sum separately, so that difference can be negative; it is clamped to 0.
+#define VAR_FN(w, h, block_size, shift) \
+uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  highbd_8_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride, \
+      CONVERT_TO_SHORTPTR(ref8), ref_stride, w, h, sse, &sum, \
+      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  int64_t var; \
+  highbd_10_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride, \
+      CONVERT_TO_SHORTPTR(ref8), ref_stride, w, h, sse, &sum, \
+      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+} \
+\
+uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  int64_t var; \
+  highbd_12_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride, \
+      CONVERT_TO_SHORTPTR(ref8), ref_stride, w, h, sse, &sum, \
+      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+}
+
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+#undef VAR_FN
+
+// Returns (and stores in *sse) the 16x16 SSE from highbd_8_variance_sse2.
+unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                        const uint8_t *ref8, int ref_stride,
+                                        unsigned int *sse) {
+  int unused_sum;
+  highbd_8_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride,
+                         CONVERT_TO_SHORTPTR(ref8), ref_stride, 16, 16, sse,
+                         &unused_sum, vpx_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+// Returns (and stores in *sse) the 16x16 SSE from highbd_10_variance_sse2.
+unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int unused_sum;
+  highbd_10_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride,
+                          CONVERT_TO_SHORTPTR(ref8), ref_stride, 16, 16, sse,
+                          &unused_sum, vpx_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+// Returns (and stores in *sse) the 16x16 SSE from highbd_12_variance_sse2.
+unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int unused_sum;
+  highbd_12_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride,
+                          CONVERT_TO_SHORTPTR(ref8), ref_stride, 16, 16, sse,
+                          &unused_sum, vpx_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                      const uint8_t *ref8, int ref_stride,
+                                      unsigned int *sse) {
+  // Only the SSE written by the helper is reported; the block sum is unused.
+  int unused_sum;
+  highbd_8_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride,
+                         CONVERT_TO_SHORTPTR(ref8), ref_stride, 8, 8, sse,
+                         &unused_sum, vpx_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  // Only the SSE written by the helper is reported; the block sum is unused.
+  int unused_sum;
+  highbd_10_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride,
+                          CONVERT_TO_SHORTPTR(ref8), ref_stride, 8, 8, sse,
+                          &unused_sum, vpx_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  // Only the SSE written by the helper is reported; the block sum is unused.
+  int unused_sum;
+  highbd_12_variance_sse2(CONVERT_TO_SHORTPTR(src8), src_stride,
+                          CONVERT_TO_SHORTPTR(ref8), ref_stride, 8, 8, sse,
+                          &unused_sum, vpx_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+#if CONFIG_USE_X86INC
+// Prototypes for the column kernels implemented in
+// highbd_subpel_variance_impl_sse2.asm. Each returns an int and reports an
+// SSE value through `sse`; the two trailing pointer parameters are unused
+// placeholders kept for PIC-enabled builds.
+#define DECL(w, opt) \
+  int vpx_highbd_sub_pixel_variance##w##xh_##opt( \
+      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+      const uint16_t *dst, ptrdiff_t dst_stride, int height, \
+      unsigned int *sse, void *unused0, void *unused)
+
+// DECL deliberately has no trailing ';' so that every use supplies exactly
+// one and no empty file-scope declarations are produced. opt2 is unused.
+#define DECLS(opt1, opt2) \
+  DECL(8, opt1); \
+  DECL(16, opt1)
+
+DECLS(sse2, sse);
+// TODO(johannkoenig): enable the ssse3 or delete
+// DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
+// Defines vpx_highbd_{8,10,12}_sub_pixel_variance<w>x<h>_<opt>. Each wrapper
+// calls the kernel vpx_highbd_sub_pixel_variance<wf>xh_<opt> once per column
+// (at sample offsets 0, then 16 if w > wf, then 32 and 48 if w > wf * 2),
+// adds up the returned sums (se) and SSE outputs, stores the SSE through
+// sse_ptr and returns sse - ((cast se * se) >> (wlog2 + hlog2)).
+//
+// The 10- and 12-bit variants first round se by 2 / 4 bits and sse by 4 / 8
+// bits. se and sse are rounded independently, so the subtraction can come
+// out negative; those variants clamp it to 0 instead of letting it wrap to
+// a huge uint32_t. The 12-bit variant also walks the block in strips of at
+// most 16 rows and accumulates the SSE in 64 bits before rounding.
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+      src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, \
+      NULL, NULL); \
+  if (w > wf) { \
+    unsigned int sse2; \
+    int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+        src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+        &sse2, NULL, NULL); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+          h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+          h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+  int64_t var; \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+      src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, \
+      NULL, NULL); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+        src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+        &sse2, NULL, NULL); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+          h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+          h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 2); \
+  sse = ROUND_POWER_OF_TWO(sse, 4); \
+  *sse_ptr = sse; \
+  /* Independent rounding of se and sse can make this negative: clamp. */ \
+  var = (int64_t)sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+  int start_row; \
+  int64_t var; \
+  uint32_t sse; \
+  int se = 0; \
+  uint64_t long_sse = 0; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  for (start_row = 0; start_row < h; start_row += 16) { \
+    uint32_t sse2; \
+    int height = h - start_row < 16 ? h - start_row : 16; \
+    int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+        src + (start_row * src_stride), src_stride, \
+        x_offset, y_offset, dst + (start_row * dst_stride), \
+        dst_stride, height, &sse2, NULL, NULL); \
+    se += se2; \
+    long_sse += sse2; \
+    if (w > wf) { \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 16 + (start_row * src_stride), src_stride, \
+          x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
+          dst_stride, height, &sse2, NULL, NULL); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 32 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
+            dst_stride, height, &sse2, NULL, NULL); \
+        se += se2; \
+        long_sse += sse2; \
+        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 48 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
+            dst_stride, height, &sse2, NULL, NULL); \
+        se += se2; \
+        long_sse += sse2; \
+      } \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 4); \
+  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+  *sse_ptr = sse; \
+  /* Independent rounding of se and sse can make this negative: clamp. */ \
+  var = (int64_t)sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+}
+
+// Instantiates FN(w, h, wf, log2(w), log2(h), opt, cast) per block size. No
+// ';' follows the uses: FN expands to function definitions. opt2 is unused.
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)) \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)) \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)) \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)) \
+FN(8, 4, 8, 3, 2, opt1, (int64_t))
+
+FNS(sse2, sse)
+#undef FNS
+#undef FN
+
+// Prototypes for the averaging column kernels, which additionally take the
+// second predictor `sec` and its stride. The two trailing pointer parameters
+// are unused placeholders kept for PIC-enabled builds.
+#define DECL(w, opt) \
+  int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \
+      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+      const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \
+      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+      void *unused)
+
+// DECL has no trailing ';': each use supplies exactly one, so the expansion
+// contains no empty file-scope declarations.
+#define DECLS(opt1) \
+  DECL(16, opt1); \
+  DECL(8, opt1)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+
+// Defines vpx_highbd_{8,10,12}_sub_pixel_avg_variance<w>x<h>_<opt> on top of
+// the kernels vpx_highbd_sub_pixel_avg_variance<wf>xh_<opt>, one call per
+// column, with the second predictor `sec` passed using stride w. The 10- and
+// 12-bit results are clamped at 0 because se and sse are rounded separately.
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+      src, src_stride, x_offset, y_offset, \
+      dst, dst_stride, sec, w, h, &sse, NULL, NULL); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+        src + 16, src_stride, x_offset, y_offset, \
+        dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+          src + 32, src_stride, x_offset, y_offset, \
+          dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+          src + 48, src_stride, x_offset, y_offset, \
+          dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  int64_t var; \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+      src, src_stride, x_offset, y_offset, \
+      dst, dst_stride, sec, w, h, &sse, NULL, NULL); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+        src + 16, src_stride, x_offset, y_offset, \
+        dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+          src + 32, src_stride, x_offset, y_offset, \
+          dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+          src + 48, src_stride, x_offset, y_offset, \
+          dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 2); \
+  sse = ROUND_POWER_OF_TWO(sse, 4); \
+  *sse_ptr = sse; \
+  /* Independent rounding of se and sse can make this negative: clamp. */ \
+  var = (int64_t)sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  int start_row; \
+  int64_t var; \
+  uint32_t sse; \
+  int se = 0; \
+  uint64_t long_sse = 0; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  for (start_row = 0; start_row < h; start_row += 16) { \
+    uint32_t sse2; \
+    int height = h - start_row < 16 ? h - start_row : 16; \
+    int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+        src + (start_row * src_stride), src_stride, x_offset, \
+        y_offset, dst + (start_row * dst_stride), dst_stride, \
+        sec + (start_row * w), w, height, &sse2, NULL, NULL); \
+    se += se2; \
+    long_sse += sse2; \
+    if (w > wf) { \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+          src + 16 + (start_row * src_stride), src_stride, \
+          x_offset, y_offset, \
+          dst + 16 + (start_row * dst_stride), dst_stride, \
+          sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+            src + 32 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, \
+            dst + 32 + (start_row * dst_stride), dst_stride, \
+            sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
+        se += se2; \
+        long_sse += sse2; \
+        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+            src + 48 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, \
+            dst + 48 + (start_row * dst_stride), dst_stride, \
+            sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
+        se += se2; \
+        long_sse += sse2; \
+      } \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 4); \
+  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+  *sse_ptr = sse; \
+  /* Independent rounding of se and sse can make this negative: clamp. */ \
+  var = (int64_t)sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
+}
+
+
+// Instantiates FN(w, h, wf, log2(w), log2(h), opt, cast); no ';' after uses.
+#define FNS(opt1) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)) \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)) \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)) /* was (4, 3); same shift total */ \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)) \
+FN(8, 4, 8, 3, 2, opt1, (int64_t))
+
+FNS(sse2)
+#undef FNS
+#undef FN
+#endif  // CONFIG_USE_X86INC
diff --git a/libs/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/libs/libvpx/vpx_dsp/x86/intrapred_sse2.asm
new file mode 100644
index 0000000000..c24d53686a
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/intrapred_sse2.asm
@@ -0,0 +1,750 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_4:  times 8 dw 4      ; rounding term before the >>3 in dc_predictor_4x4
+pw_8:  times 8 dw 8      ; rounding term before the >>4 in dc_predictor_8x8
+pw_16: times 8 dw 16     ; rounding term before the >>5 in dc_predictor_16x16
+pw_32: times 8 dw 32     ; rounding term before the >>6 in dc_predictor_32x32
+dc_128: times 16 db 128  ; 16 bytes of 128, stored as-is by dc_128_predictor_*
+pw2_4:  times 8 dw 2     ; rounding term before the >>2 in dc_{left,top}_predictor_4x4
+pw2_8:  times 8 dw 4     ; rounding term before the >>3 in dc_{left,top}_predictor_8x8
+pw2_16:  times 8 dw 8    ; rounding term before the >>4 in dc_{left,top}_predictor_16x16
+pw2_32:  times 8 dw 16   ; rounding term before the >>5 in dc_{left,top}_predictor_32x32
+
+SECTION .text
+
+INIT_XMM sse2
+cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill the 4x4 block at dst with (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3.
+  movd                  m2, [leftq]
+  movd                  m0, [aboveq]
+  pxor                  m1, m1
+  punpckldq             m0, m2              ; low 8 bytes = 4 above + 4 left
+  psadbw                m0, m1              ; SAD against zero = byte sum
+  paddw                 m0, [GLOBAL(pw_4)]  ; rounding term
+  psraw                 m0, 3
+  pshuflw               m0, m0, 0x0         ; replicate the result word
+  packuswb              m0, m0              ; words -> bytes
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  lea                 dstq, [dstq+strideq*2]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn          leftq, leftmp          ; cglobal loads only 2 args here
+  GET_GOT     goffsetq
+  ; Fill the 4x4 block at dst with (sum(left[0..3]) + 2) >> 2.
+  pxor                  m1, m1
+  movd                  m0, [leftq]
+  psadbw                m0, m1              ; SAD against zero = byte sum
+  paddw                 m0, [GLOBAL(pw2_4)] ; rounding term
+  psraw                 m0, 2
+  pshuflw               m0, m0, 0x0         ; replicate the result word
+  packuswb              m0, m0              ; words -> bytes
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  lea                 dstq, [dstq+strideq*2]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill the 4x4 block at dst with (sum(above[0..3]) + 2) >> 2.
+  pxor                  m1, m1
+  movd                  m0, [aboveq]
+  psadbw                m0, m1              ; SAD against zero = byte sum
+  paddw                 m0, [GLOBAL(pw2_4)] ; rounding term
+  psraw                 m0, 2
+  pshuflw               m0, m0, 0x0         ; replicate the result word
+  packuswb              m0, m0              ; words -> bytes
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  lea                 dstq, [dstq+strideq*2]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill the 8x8 block at dst with (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4.
+  pxor                  m1, m1
+  movq                  m0, [aboveq]
+  movq                  m2, [leftq]
+  DEFINE_ARGS dst, stride, stride3          ; above/left no longer needed
+  lea             stride3q, [strideq*3]
+  psadbw                m0, m1              ; byte sum of above
+  psadbw                m2, m1              ; byte sum of left
+  paddw                 m0, m2
+  paddw                 m0, [GLOBAL(pw_8)]  ; rounding term
+  psraw                 m0, 4
+  punpcklbw             m0, m0              ; duplicate result byte in word 0
+  pshuflw               m0, m0, 0x0         ; replicate it over the low 8 bytes
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill the 8x8 block at dst with (sum(above[0..7]) + 4) >> 3.
+  pxor                  m1, m1
+  movq                  m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3          ; above no longer needed
+  lea             stride3q, [strideq*3]
+  psadbw                m0, m1              ; SAD against zero = byte sum
+  paddw                 m0, [GLOBAL(pw2_8)] ; rounding term
+  psraw                 m0, 3
+  punpcklbw             m0, m0              ; duplicate result byte in word 0
+  pshuflw               m0, m0, 0x0         ; replicate it over the low 8 bytes
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn          leftq, leftmp          ; cglobal loads only 2 args here
+  GET_GOT     goffsetq
+  ; Fill the 8x8 block at dst with (sum(left[0..7]) + 4) >> 3.
+  pxor                  m1, m1
+  movq                  m0, [leftq]
+  DEFINE_ARGS dst, stride, stride3          ; left no longer needed
+  lea             stride3q, [strideq*3]
+  psadbw                m0, m1              ; SAD against zero = byte sum
+  paddw                 m0, [GLOBAL(pw2_8)] ; rounding term
+  psraw                 m0, 3
+  punpcklbw             m0, m0              ; duplicate result byte in word 0
+  pshuflw               m0, m0, 0x0         ; replicate it over the low 8 bytes
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill the 4x4 block at dst with the constant 128; above/left are not read.
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  movd     m0,        [GLOBAL(dc_128)]
+  movd    [dstq          ], m0
+  movd    [dstq+strideq  ], m0
+  movd    [dstq+strideq*2], m0
+  movd    [dstq+stride3q ], m0
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill the 8x8 block at dst with the constant 128; above/left are not read.
+  DEFINE_ARGS dst, stride, stride3
+  lea             stride3q, [strideq*3]
+  movq    m0,        [GLOBAL(dc_128)]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill 16x16 at dst with (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5.
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [leftq]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4               ; 4 iterations x 4 rows
+  psadbw                m0, m1              ; two partial sums, one per qword
+  psadbw                m2, m1
+  paddw                 m0, m2
+  movhlps               m2, m0              ; bring the high partial sum down
+  paddw                 m0, m2              ; total in the low word
+  paddw                 m0, [GLOBAL(pw_16)] ; rounding term
+  psraw                 m0, 5
+  pshuflw               m0, m0, 0x0         ; replicate over the low 4 words
+  punpcklqdq            m0, m0              ; ... and over all 8 words
+  packuswb              m0, m0              ; words -> 16 identical bytes
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+
+INIT_XMM sse2
+cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill the 16x16 block at dst with (sum(above[0..15]) + 8) >> 4.
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4               ; 4 iterations x 4 rows
+  psadbw                m0, m1              ; two partial sums, one per qword
+  movhlps               m2, m0              ; bring the high partial sum down
+  paddw                 m0, m2              ; total in the low word
+  paddw                 m0, [GLOBAL(pw2_16)] ; rounding term
+  psraw                 m0, 4
+  pshuflw               m0, m0, 0x0         ; replicate over the low 4 words
+  punpcklqdq            m0, m0              ; ... and over all 8 words
+  packuswb              m0, m0              ; words -> 16 identical bytes
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill the 16x16 block at dst with (sum(left[0..15]) + 8) >> 4.
+  pxor                  m1, m1
+  mova                  m0, [leftq]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4               ; 4 iterations x 4 rows
+  psadbw                m0, m1              ; two partial sums, one per qword
+  movhlps               m2, m0              ; bring the high partial sum down
+  paddw                 m0, m2              ; total in the low word
+  paddw                 m0, [GLOBAL(pw2_16)] ; rounding term
+  psraw                 m0, 4
+  pshuflw               m0, m0, 0x0         ; replicate over the low 4 words
+  punpcklqdq            m0, m0              ; ... and over all 8 words
+  packuswb              m0, m0              ; words -> 16 identical bytes
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill the 16x16 block at dst with the constant 128; above/left are not read.
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 4               ; 4 iterations x 4 rows
+  mova    m0,        [GLOBAL(dc_128)]
+.loop:
+  mova    [dstq          ], m0
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+  RESTORE_GOT
+  REP_RET                                   ; as in the other looped predictors
+
+
+INIT_XMM sse2
+cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill 32x32 at dst with (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6.
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [aboveq+16]
+  mova                  m3, [leftq]
+  mova                  m4, [leftq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8               ; 8 iterations x 4 rows
+  psadbw                m0, m1              ; two partial sums per register
+  psadbw                m2, m1
+  psadbw                m3, m1
+  psadbw                m4, m1
+  paddw                 m0, m2
+  paddw                 m0, m3
+  paddw                 m0, m4
+  movhlps               m2, m0              ; bring the high partial sum down
+  paddw                 m0, m2              ; total in the low word
+  paddw                 m0, [GLOBAL(pw_32)] ; rounding term
+  psraw                 m0, 6
+  pshuflw               m0, m0, 0x0         ; replicate over the low 4 words
+  punpcklqdq            m0, m0              ; ... and over all 8 words
+  packuswb              m0, m0              ; words -> 16 identical bytes
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m0
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m0
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m0
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill the 32x32 block at dst with (sum(above[0..31]) + 16) >> 5.
+  pxor                  m1, m1
+  mova                  m0, [aboveq]
+  mova                  m2, [aboveq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8               ; 8 iterations x 4 rows
+  psadbw                m0, m1              ; two partial sums per register
+  psadbw                m2, m1
+  paddw                 m0, m2
+  movhlps               m2, m0              ; bring the high partial sum down
+  paddw                 m0, m2              ; total in the low word
+  paddw                 m0, [GLOBAL(pw2_32)] ; rounding term
+  psraw                 m0, 5
+  pshuflw               m0, m0, 0x0         ; replicate over the low 4 words
+  punpcklqdq            m0, m0              ; ... and over all 8 words
+  packuswb              m0, m0              ; words -> 16 identical bytes
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m0
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m0
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m0
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill the 32x32 block at dst with (sum(left[0..31]) + 16) >> 5.
+  pxor                  m1, m1
+  mova                  m0, [leftq]
+  mova                  m2, [leftq+16]
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8               ; 8 iterations x 4 rows
+  psadbw                m0, m1              ; two partial sums per register
+  psadbw                m2, m1
+  paddw                 m0, m2
+  movhlps               m2, m0              ; bring the high partial sum down
+  paddw                 m0, m2              ; total in the low word
+  paddw                 m0, [GLOBAL(pw2_32)] ; rounding term
+  psraw                 m0, 5
+  pshuflw               m0, m0, 0x0         ; replicate over the low 4 words
+  punpcklqdq            m0, m0              ; ... and over all 8 words
+  packuswb              m0, m0              ; words -> 16 identical bytes
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m0
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m0
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m0
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  ; Fill the 32x32 block at dst with the constant 128; above/left are not read.
+  DEFINE_ARGS dst, stride, stride3, lines4
+  lea             stride3q, [strideq*3]
+  mov              lines4d, 8               ; 8 iterations x 4 rows
+  mova    m0,        [GLOBAL(dc_128)]
+.loop:
+  mova [dstq             ], m0
+  mova [dstq          +16], m0
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m0
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m0
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec              lines4d
+  jnz .loop
+  RESTORE_GOT
+  REP_RET                                   ; as in the other looped predictors
+
+INIT_XMM sse2
+cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
+  movd                  m0, [aboveq]        ; the 4 above bytes ...
+  movd      [dstq        ], m0              ; ... copied into each of 4 rows
+  movd      [dstq+strideq], m0
+  lea                 dstq, [dstq+strideq*2]
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m0
+  RET
+
+INIT_XMM sse2
+cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
+  movq                  m0, [aboveq]        ; the 8 above bytes
+  DEFINE_ARGS dst, stride, stride3          ; above's register becomes stride3
+  lea             stride3q, [strideq*3]
+  movq    [dstq          ], m0              ; copy them into each of 8 rows
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  movq    [dstq          ], m0
+  movq    [dstq+strideq  ], m0
+  movq    [dstq+strideq*2], m0
+  movq    [dstq+stride3q ], m0
+  RET
+
+INIT_XMM sse2
+cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
+  mova                  m0, [aboveq]        ; the 16 above bytes
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 4              ; 4 iterations x 4 rows
+.loop:
+  mova    [dstq          ], m0              ; copy them into every row
+  mova    [dstq+strideq  ], m0
+  mova    [dstq+strideq*2], m0
+  mova    [dstq+stride3q ], m0
+  lea                 dstq, [dstq+strideq*4]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
+  mova                  m0, [aboveq]        ; above bytes 0..15
+  mova                  m1, [aboveq+16]     ; above bytes 16..31
+  DEFINE_ARGS dst, stride, stride3, nlines4
+  lea             stride3q, [strideq*3]
+  mov              nlines4d, 8              ; 8 iterations x 4 rows
+.loop:
+  mova [dstq             ], m0              ; copy both halves into every row
+  mova [dstq          +16], m1
+  mova [dstq+strideq     ], m0
+  mova [dstq+strideq  +16], m1
+  mova [dstq+strideq*2   ], m0
+  mova [dstq+strideq*2+16], m1
+  mova [dstq+stride3q    ], m0
+  mova [dstq+stride3q +16], m1
+  lea                 dstq, [dstq+strideq*4]
+  dec             nlines4d
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left ; horizontal predictor: row r of the 4x4 block is filled with left[r]
+  movifnidn          leftq, leftmp ; bring the left pointer into leftq (x86inc helper; presumably a no-op when it is already in that register)
+  movd                  m0, [leftq] ; m0[31:0] = l1 l2 l3 l4 [byte]
+  punpcklbw             m0, m0 ; l1 l1 l2 l2 l3 l3 l4 l4
+  punpcklbw             m0, m0 ; dword0 = l1 x4, dword1 = l2 x4, dword2 = l3 x4, dword3 = l4 x4
+  pshufd                m1, m0, 0x1 ; low dword of m1 = l2 x4
+  movd      [dstq        ], m0 ; row 0 = l1 x4
+  movd      [dstq+strideq], m1 ; row 1 = l2 x4
+  pshufd                m2, m0, 0x2 ; low dword of m2 = l3 x4
+  lea                 dstq, [dstq+strideq*2] ; advance dst by two rows
+  pshufd                m3, m0, 0x3 ; low dword of m3 = l4 x4
+  movd      [dstq        ], m2 ; row 2 = l3 x4
+  movd      [dstq+strideq], m3 ; row 3 = l4 x4
+  RET
+
+INIT_XMM sse2
+cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left ; horizontal predictor: row r of the 8x8 block is filled with left[r]
+  movifnidn          leftq, leftmp ; bring the left pointer into leftq (x86inc helper)
+  mov                lineq, -2 ; counts up to zero: 2 iterations x 4 rows = 8 rows
+  DEFINE_ARGS  dst, stride, line, left, stride3
+  lea             stride3q, [strideq*3] ; stride3 = 3 * stride
+  movq                  m0, [leftq    ] ; m0[63:0] = l1 .. l8 [byte]
+  punpcklbw             m0, m0              ; l1 l1 l2 l2 ... l8 l8
+.loop:
+  pshuflw               m1, m0, 0x0         ; l1 l1 l1 l1 l1 l1 l1 l1
+  pshuflw               m2, m0, 0x55        ; l2 l2 l2 l2 l2 l2 l2 l2
+  movq      [dstq        ], m1 ; row 4k+0
+  movq      [dstq+strideq], m2 ; row 4k+1
+  pshuflw               m1, m0, 0xaa ; broadcast word 2 (third left value of this group) over the low 8 bytes
+  pshuflw               m2, m0, 0xff ; broadcast word 3 (fourth left value of this group) over the low 8 bytes
+  movq    [dstq+strideq*2], m1 ; row 4k+2
+  movq    [dstq+stride3q ], m2 ; row 4k+3
+  pshufd                m0, m0, 0xe         ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
+  inc                lineq ; sets ZF for the jnz below
+  lea                 dstq, [dstq+strideq*4] ; advance four rows; lea leaves the flags from inc untouched
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left ; horizontal predictor: row r of the 16x16 block is filled with left[r]
+  movifnidn          leftq, leftmp ; bring the left pointer into leftq (x86inc helper)
+  mov                lineq, -4 ; counts up to zero: 4 iterations x 4 rows = 16 rows
+  DEFINE_ARGS dst, stride, line, left, stride3
+  lea             stride3q, [strideq*3] ; stride3 = 3 * stride
+.loop:
+  movd                  m0, [leftq] ; next four left bytes
+  punpcklbw             m0, m0 ; each byte doubled
+  punpcklbw             m0, m0              ; l1 to l4 each repeated 4 times
+  pshufd            m1, m0, 0x0             ; l1 repeated 16 times
+  pshufd            m2, m0, 0x55            ; l2 repeated 16 times
+  mova    [dstq          ], m1 ; row 4k+0
+  mova    [dstq+strideq  ], m2 ; row 4k+1
+  pshufd            m1, m0, 0xaa ; l3 repeated 16 times
+  pshufd            m2, m0, 0xff ; l4 repeated 16 times
+  mova    [dstq+strideq*2], m1 ; row 4k+2
+  mova    [dstq+stride3q ], m2 ; row 4k+3
+  inc                lineq ; sets ZF for the jnz below
+  lea                leftq, [leftq+4       ] ; next four left bytes; lea does not modify flags
+  lea                 dstq, [dstq+strideq*4] ; advance dst by four rows
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left ; horizontal predictor: row r of the 32x32 block is filled with left[r]
+  movifnidn              leftq, leftmp ; bring the left pointer into leftq (x86inc helper)
+  mov                    lineq, -8 ; counts up to zero: 8 iterations x 4 rows = 32 rows
+  DEFINE_ARGS dst, stride, line, left, stride3
+  lea                 stride3q, [strideq*3] ; stride3 = 3 * stride
+.loop:
+  movd                      m0, [leftq] ; next four left bytes
+  punpcklbw                 m0, m0 ; each byte doubled
+  punpcklbw                 m0, m0              ; l1 to l4 each repeated 4 times
+  pshufd                m1, m0, 0x0             ; l1 repeated 16 times
+  pshufd                m2, m0, 0x55            ; l2 repeated 16 times
+  mova     [dstq             ], m1 ; row 4k+0, columns 0..15
+  mova     [dstq+16          ], m1 ; row 4k+0, columns 16..31
+  mova     [dstq+strideq     ], m2 ; row 4k+1
+  mova     [dstq+strideq+16  ], m2
+  pshufd                m1, m0, 0xaa ; l3 repeated 16 times
+  pshufd                m2, m0, 0xff ; l4 repeated 16 times
+  mova     [dstq+strideq*2   ], m1 ; row 4k+2
+  mova     [dstq+strideq*2+16], m1
+  mova     [dstq+stride3q    ], m2 ; row 4k+3
+  mova     [dstq+stride3q+16 ], m2
+  inc                    lineq ; sets ZF for the jnz below
+  lea                    leftq, [leftq+4       ] ; next four left bytes; lea does not modify flags
+  lea                     dstq, [dstq+strideq*4] ; advance dst by four rows
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left ; TM predictor: dst[r][c] = clip_u8(left[r] + above[c] - above[-1]) for a 4x4 block
+  pxor                  m1, m1 ; m1 = 0, used to zero-extend bytes to words
+  movq                  m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
+  punpcklbw             m0, m1 ; bytes -> words: tl t1 t2 t3 t4 x x x
+  pshuflw               m2, m0, 0x0   ; [63:0] tl tl tl tl [word]
+  psrldq                m0, 2 ; drop tl so that t1 sits in word 0
+  psubw                 m0, m2        ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
+  movd                  m2, [leftq] ; l1 l2 l3 l4 [byte]
+  punpcklbw             m2, m1 ; l1 l2 l3 l4 [word]
+  pshuflw               m4, m2, 0x0   ; [63:0] l1 l1 l1 l1 [word]
+  pshuflw               m3, m2, 0x55  ; [63:0] l2 l2 l2 l2 [word]
+  paddw                 m4, m0 ; row 0 as words: ti - tl + l1
+  paddw                 m3, m0 ; row 1 as words: ti - tl + l2
+  packuswb              m4, m4 ; saturate to unsigned bytes
+  packuswb              m3, m3
+  movd      [dstq        ], m4 ; row 0
+  movd      [dstq+strideq], m3 ; row 1
+  lea                 dstq, [dstq+strideq*2] ; advance dst by two rows
+  pshuflw               m4, m2, 0xaa ; [63:0] l3 l3 l3 l3 [word]
+  pshuflw               m3, m2, 0xff ; [63:0] l4 l4 l4 l4 [word]
+  paddw                 m4, m0 ; row 2 as words
+  paddw                 m3, m0 ; row 3 as words
+  packuswb              m4, m4 ; saturate to unsigned bytes
+  packuswb              m3, m3
+  movd      [dstq        ], m4 ; row 2
+  movd      [dstq+strideq], m3 ; row 3
+  RET
+
+INIT_XMM sse2
+cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left ; TM predictor: dst[r][c] = clip_u8(left[r] + above[c] - above[-1]) for an 8x8 block
+  pxor                  m1, m1 ; m1 = 0, used to zero-extend bytes to words
+  movd                  m2, [aboveq-1] ; byte 0 = tl (top-left)
+  movq                  m0, [aboveq] ; t1 .. t8 [byte]
+  punpcklbw             m2, m1 ; word 0 = tl
+  punpcklbw             m0, m1        ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
+  pshuflw               m2, m2, 0x0   ; [63:0] tl tl tl tl [word]
+  DEFINE_ARGS dst, stride, line, left ; above is not referenced past this point; its register becomes the line counter
+  mov                lineq, -4 ; counts up to zero: 4 iterations x 2 rows = 8 rows
+  punpcklqdq            m2, m2        ; tl tl tl tl tl tl tl tl [word]
+  psubw                 m0, m2        ; t1-tl t2-tl ... t8-tl [word]
+  movq                  m2, [leftq] ; l1 .. l8 [byte]
+  punpcklbw             m2, m1        ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
+.loop:
+  pshuflw               m4, m2, 0x0   ; [63:0] l1 l1 l1 l1 [word]
+  pshuflw               m3, m2, 0x55  ; [63:0] l2 l2 l2 l2 [word]
+  punpcklqdq            m4, m4        ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
+  punpcklqdq            m3, m3        ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
+  paddw                 m4, m0 ; first row of the pair as words
+  paddw                 m3, m0 ; second row of the pair as words
+  packuswb              m4, m3 ; saturate to bytes: low 8 = first row, high 8 = second row
+  movq      [dstq        ], m4 ; store first row
+  movhps    [dstq+strideq], m4 ; store second row
+  lea                 dstq, [dstq+strideq*2] ; advance dst by two rows
+  psrldq                m2, 4 ; discard the two left values just consumed
+  inc                lineq ; sets ZF for the jnz below
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left ; TM predictor: dst[r][c] = clip_u8(left[r] + above[c] - above[-1]) for a 16x16 block
+  pxor                  m1, m1 ; m1 = 0, used to zero-extend bytes to words
+  mova                  m2, [aboveq-16]; 16 bytes ending at above[-1], so byte 15 = tl
+  mova                  m0, [aboveq]   ; t1 t2 ... t16 [byte]
+  punpckhbw             m2, m1         ; [127:112] tl [word]
+  punpckhbw             m4, m0, m1 ; m4 = t9 .. t16 [word]
+  punpcklbw             m0, m1         ; m0:m4 t1 t2 ... t16 [word]
+  DEFINE_ARGS dst, stride, line, left, stride8 ; above is not referenced past this point
+  mov                lineq, -8 ; counts up to zero: 8 iterations, each writing row k and row k+8
+  pshufhw               m2, m2, 0xff ; broadcast tl over the four high words
+  mova                  m3, [leftq]    ; l1 l2 ... l16 [byte]
+  punpckhqdq            m2, m2         ; tl repeated 8 times [word]
+  psubw                 m0, m2
+  psubw                 m4, m2         ; m0:m4 t1-tl t2-tl ... t16-tl [word]
+  punpckhbw             m5, m3, m1 ; m5 = l9 .. l16 [word]
+  punpcklbw             m3, m1         ; m3:m5 l1 l2 ... l16 [word]
+  lea             stride8q, [strideq*8] ; byte offset from row k to row k+8
+.loop:
+  pshuflw               m6, m3, 0x0 ; word 0 of m3 (current left value of the upper half) in the low 4 words
+  pshuflw               m7, m5, 0x0 ; word 0 of m5 (current left value of the lower half) in the low 4 words
+  punpcklqdq            m6, m6         ; l1 repeated 8 times [word] (first iteration)
+  punpcklqdq            m7, m7         ; l9 repeated 8 times [word] (first iteration)
+  paddw                 m1, m6, m0 ; m1 is reused as a temporary; the zero is no longer needed
+  paddw                 m6, m4         ; m1:m6 ti-tl+l1 [i=1..16] [word]
+  psrldq                m5, 2 ; advance to the next left value of the lower half
+  packuswb              m1, m6 ; saturate to 16 unsigned bytes
+  mova     [dstq         ], m1 ; row k
+  paddw                 m1, m7, m0
+  paddw                 m7, m4         ; m1:m7 ti-tl+l9 [i=1..16] [word]
+  psrldq                m3, 2 ; advance to the next left value of the upper half
+  packuswb              m1, m7 ; saturate to 16 unsigned bytes
+  mova     [dstq+stride8q], m1 ; row k+8
+  inc                lineq ; sets ZF for the jnz below
+  lea                 dstq, [dstq+strideq] ; advance one row; lea does not modify flags
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left ; TM predictor: dst[r][c] = clip_u8(left[r] + above[c] - above[-1]) for a 32x32 block
+  pxor                  m1, m1 ; m1 = 0, used to zero-extend bytes to words
+  movd                  m2, [aboveq-1] ; byte 0 = tl (top-left)
+  mova                  m0, [aboveq] ; above bytes 0..15
+  mova                  m4, [aboveq+16] ; above bytes 16..31
+  punpcklbw             m2, m1 ; word 0 = tl
+  punpckhbw             m3, m0, m1 ; m3 = above 8..15 [word]
+  punpckhbw             m5, m4, m1 ; m5 = above 24..31 [word]
+  punpcklbw             m0, m1 ; m0 = above 0..7 [word]
+  punpcklbw             m4, m1 ; m4 = above 16..23 [word]
+  pshuflw               m2, m2, 0x0 ; tl in the low four words
+  DEFINE_ARGS dst, stride, line, left ; above is not referenced past this point; its register becomes the line counter
+  mov                lineq, -16 ; counts up to zero: 16 iterations x 2 rows = 32 rows
+  punpcklqdq            m2, m2 ; tl repeated 8 times [word]
+  add                leftq, 32 ; bias left so that [leftq+lineq*2] starts at left[0]
+  psubw                 m0, m2 ; m0, m3, m4, m5 = above[c] - tl for columns 0-7, 8-15, 16-23, 24-31
+  psubw                 m3, m2
+  psubw                 m4, m2
+  psubw                 m5, m2
+.loop:
+  movd                  m2, [leftq+lineq*2] ; 4 bytes read, only the first two are used; NOTE(review): the last iteration reads left[30..33] - confirm the buffer is padded
+  pxor                  m1, m1 ; re-zero: m1 is clobbered as a temporary below
+  punpcklbw             m2, m1 ; left bytes -> words
+  pshuflw               m7, m2, 0x55 ; second left value in the low four words
+  pshuflw               m2, m2, 0x0 ; first left value in the low four words
+  punpcklqdq            m2, m2 ; first left value x8 [word]
+  punpcklqdq            m7, m7 ; second left value x8 [word]
+  paddw                 m6, m2, m3 ; first row, columns 8-15
+  paddw                 m1, m2, m0 ; first row, columns 0-7
+  packuswb              m1, m6 ; saturate to 16 unsigned bytes
+  mova   [dstq           ], m1 ; first row, columns 0-15
+  paddw                 m6, m2, m5 ; first row, columns 24-31
+  paddw                 m1, m2, m4 ; first row, columns 16-23
+  packuswb              m1, m6
+  mova   [dstq+16        ], m1 ; first row, columns 16-31
+  paddw                 m6, m7, m3 ; second row, columns 8-15
+  paddw                 m1, m7, m0 ; second row, columns 0-7
+  packuswb              m1, m6
+  mova   [dstq+strideq   ], m1 ; second row, columns 0-15
+  paddw                 m6, m7, m5 ; second row, columns 24-31
+  paddw                 m1, m7, m4 ; second row, columns 16-23
+  packuswb              m1, m6
+  mova   [dstq+strideq+16], m1 ; second row, columns 16-31
+  lea                 dstq, [dstq+strideq*2] ; advance dst by two rows
+  inc                lineq ; sets ZF for the jnz below
+  jnz .loop
+  REP_RET
diff --git a/libs/libvpx/vpx_dsp/x86/intrapred_ssse3.asm b/libs/libvpx/vpx_dsp/x86/intrapred_ssse3.asm
new file mode 100644
index 0000000000..d061278c7d
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/intrapred_ssse3.asm
@@ -0,0 +1,962 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1 ; sixteen 0x01 bytes; pand mask for the (x xor z) & 1 rounding correction
+sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 ; pshufb control (source byte index per output byte); note output byte 6 selects index 7, not 6
+sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 ; low 8 bytes shifted down by one, byte 7 repeated
+sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 ; low 8 bytes shifted down by two, byte 7 repeated
+sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 ; keep bytes 0..7, fill bytes 8..15 with byte 7
+sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 ; shift down by one, fill with byte 7
+sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 ; shift down by two, fill with byte 7
+sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 ; 16 bytes shifted down by one, byte 15 repeated
+sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 ; 16 bytes shifted down by two, byte 15 repeated
+sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 ; reverse bytes 0..3, keep bytes 4..7
+sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 ; interleave bytes 8..11 with bytes 0..3, then bytes 4 and 5
+sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; reverse the low 8 bytes
+sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 ; bytes 6..0 followed by byte 8
+sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 ; bytes 5..0 followed by bytes 8 and 9
+sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 ; move the high 8 bytes into the low 8
+sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ; reverse all 16 bytes
+sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; not referenced in the visible part of this file; by its name presumably a pshufb control
+sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; not referenced in the visible part of this file; by its name presumably a pshufb control
+
+SECTION .text
+
+INIT_MMX ssse3
+cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset ; D45 predictor, 4x4: 3-tap filter the above row, then each row is the previous one shifted by one byte
+  GET_GOT     goffsetq ; macro defined outside this chunk; presumably sets up PIC addressing for GLOBAL()
+
+  movq                m0, [aboveq] ; 8 bytes read from above: a0 .. a7
+  pshufb              m2, m0, [GLOBAL(sh_b23456777)] ; z = a[i+2] (a7 repeated at the end)
+  pshufb              m1, m0, [GLOBAL(sh_b01234577)] ; x = a[i], except byte 6 which takes a7
+  pshufb              m0, [GLOBAL(sh_b12345677)] ; y = a[i+1] (a7 repeated at the end)
+  pavgb               m3, m2, m1 ; (x + z + 1) >> 1
+  pxor                m2, m1 ; x ^ z
+  pand                m2, [GLOBAL(pb_1)] ; (x ^ z) & 1
+  psubb               m3, m2 ; (x + z) >> 1 without the round-up
+  pavgb               m0, m3 ; (x + 2*y + z + 2) >> 2
+
+  ; store 4 lines
+  movd    [dstq        ], m0 ; row 0 = filtered bytes 0..3
+  psrlq               m0, 8 ; shift the filtered row down by one byte
+  movd    [dstq+strideq], m0 ; row 1 = filtered bytes 1..4
+  lea               dstq, [dstq+strideq*2] ; advance dst by two rows
+  psrlq               m0, 8
+  movd    [dstq        ], m0 ; row 2 = filtered bytes 2..5
+  psrlq               m0, 8
+  movd    [dstq+strideq], m0 ; row 3 = filtered bytes 3..6
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX ssse3
+cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset ; D45 predictor, 8x8: 3-tap filter the above row, then each row is the previous one shifted by one byte
+  GET_GOT     goffsetq ; macro defined outside this chunk; presumably sets up PIC addressing for GLOBAL()
+
+  movq                m0, [aboveq] ; x = a0 .. a7
+  mova                m1, [GLOBAL(sh_b12345677)] ; m1 = shift-down-by-one control (last byte repeated)
+  DEFINE_ARGS dst, stride, stride3 ; above is not referenced past this point
+  lea           stride3q, [strideq*3] ; stride3 = 3 * stride
+  pshufb              m2, m0, [GLOBAL(sh_b23456777)] ; z = a[i+2]
+  pavgb               m3, m2, m0 ; (x + z + 1) >> 1
+  pxor                m2, m0 ; x ^ z
+  pshufb              m0, m1 ; y = a[i+1]
+  pand                m2, [GLOBAL(pb_1)] ; (x ^ z) & 1
+  psubb               m3, m2 ; (x + z) >> 1 without the round-up
+  pavgb               m0, m3 ; (x + 2*y + z + 2) >> 2
+
+  ; store 4 lines
+  movq  [dstq          ], m0 ; row 0
+  pshufb              m0, m1 ; next row: shift down one byte, repeat the last byte
+  movq  [dstq+strideq  ], m0 ; row 1
+  pshufb              m0, m1
+  movq  [dstq+strideq*2], m0 ; row 2
+  pshufb              m0, m1
+  movq  [dstq+stride3q ], m0 ; row 3
+  pshufb              m0, m1 ; prepare row 4
+  lea               dstq, [dstq+strideq*4] ; advance dst by four rows
+
+  ; store next 4 lines
+  movq  [dstq          ], m0 ; row 4
+  pshufb              m0, m1
+  movq  [dstq+strideq  ], m0 ; row 5
+  pshufb              m0, m1
+  movq  [dstq+strideq*2], m0 ; row 6
+  pshufb              m0, m1
+  movq  [dstq+stride3q ], m0 ; row 7
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset ; D45 predictor, 16x16: 3-tap filter the above row, then each row is the previous one shifted by one byte
+  GET_GOT     goffsetq ; macro defined outside this chunk; presumably sets up PIC addressing for GLOBAL()
+
+  mova                   m0, [aboveq] ; x = a0 .. a15
+  DEFINE_ARGS dst, stride, stride3, dst8, line ; above is not referenced past this point
+  lea              stride3q, [strideq*3] ; stride3 = 3 * stride
+  lea                 dst8q, [dstq+strideq*8] ; dst8 = address of row 8
+  mova                   m1, [GLOBAL(sh_b123456789abcdeff)] ; m1 = shift-down-by-one control (last byte repeated)
+  pshufb                 m2, m0, [GLOBAL(sh_b23456789abcdefff)] ; z = a[i+2]
+  pavgb                  m3, m2, m0 ; (x + z + 1) >> 1
+  pxor                   m2, m0 ; x ^ z
+  pshufb                 m0, m1 ; y = a[i+1]
+  pand                   m2, [GLOBAL(pb_1)] ; (x ^ z) & 1
+  psubb                  m3, m2 ; (x + z) >> 1 without the round-up
+  pavgb                  m0, m3 ; (x + 2*y + z + 2) >> 2
+
+  ; rows 0-7 at full width plus the left 8 columns of rows 8-15, four rows per iteration
+  mov                 lined, 2 ; 2 iterations x 4 rows
+.loop:
+  mova   [dstq            ], m0 ; row r, all 16 columns
+  movhps [dst8q           ], m0 ; row r+8, columns 0..7 = bytes 8..15 of row r
+  pshufb                 m0, m1 ; next row: shift down one byte, repeat the last byte
+  mova   [dstq +strideq   ], m0
+  movhps [dst8q+strideq   ], m0
+  pshufb                 m0, m1
+  mova   [dstq +strideq*2 ], m0
+  movhps [dst8q+strideq*2 ], m0
+  pshufb                 m0, m1
+  mova   [dstq +stride3q  ], m0
+  movhps [dst8q+stride3q  ], m0
+  pshufb                 m0, m1
+  lea                  dstq, [dstq +strideq*4] ; advance both row pointers by four rows
+  lea                 dst8q, [dst8q+strideq*4]
+  dec                 lined ; ZF is set when the counter reaches zero
+  jnz .loop
+
+  ; bottom-right 8x8 block: after 8 shifts the high 8 bytes of m0 all hold the last filtered byte; dstq now points at row 8
+  movhps [dstq          +8], m0 ; row 8, columns 8..15
+  movhps [dstq+strideq  +8], m0 ; row 9
+  movhps [dstq+strideq*2+8], m0 ; row 10
+  movhps [dstq+stride3q +8], m0 ; row 11
+  lea                  dstq, [dstq+strideq*4] ; advance dst by four rows
+  movhps [dstq          +8], m0 ; row 12
+  movhps [dstq+strideq  +8], m0 ; row 13
+  movhps [dstq+strideq*2+8], m0 ; row 14
+  movhps [dstq+stride3q +8], m0 ; row 15
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset ; D45 predictor, 32x32: 3-tap filter the 32-byte above row, then each row is the previous one shifted by one byte
+  GET_GOT     goffsetq ; macro defined outside this chunk; presumably sets up PIC addressing for GLOBAL()
+
+  mova                   m0, [aboveq] ; low half: a0 .. a15
+  mova                   m4, [aboveq+16] ; high half: a16 .. a31
+  DEFINE_ARGS dst, stride, stride3, dst16, line ; above is not referenced past this point
+  lea              stride3q, [strideq*3] ; stride3 = 3 * stride
+  lea                dst16q, [dstq  +strideq*8] ; dst + 8 rows ...
+  lea                dst16q, [dst16q+strideq*8] ; ... + 8 more rows = address of row 16
+  mova                   m1, [GLOBAL(sh_b123456789abcdeff)] ; m1 = shift-down-by-one control (last byte repeated)
+  pshufb                 m2, m4, [GLOBAL(sh_b23456789abcdefff)] ; high half: z = a[i+2]
+  pavgb                  m3, m2, m4 ; high half: (x + z + 1) >> 1
+  pxor                   m2, m4 ; high half: x ^ z
+  palignr                m5, m4, m0, 1 ; low half: y = a1 .. a16
+  palignr                m6, m4, m0, 2 ; low half: z = a2 .. a17
+  pshufb                 m4, m1 ; high half: y = a[i+1]
+  pand                   m2, [GLOBAL(pb_1)] ; (x ^ z) & 1
+  psubb                  m3, m2 ; (x + z) >> 1 without the round-up
+  pavgb                  m4, m3 ; m4 = filtered high half
+  pavgb                  m3, m0, m6 ; low half: (x + z + 1) >> 1
+  pxor                   m0, m6 ; low half: x ^ z
+  pand                   m0, [GLOBAL(pb_1)]
+  psubb                  m3, m0
+  pavgb                  m5, m3 ; m5 = filtered low half
+
+  ; rows 0-15 at full width plus the left 16 columns of rows 16-31, four rows per iteration
+  mov                  lined, 4 ; 4 iterations x 4 rows
+.loop:
+  mova [dstq               ], m5 ; row r, columns 0..15
+  mova [dstq            +16], m4 ; row r, columns 16..31
+  mova [dst16q             ], m4 ; row r+16, columns 0..15 = high half of row r
+  palignr                 m3, m4, m5, 1 ; next row low half: shift the 32-byte row down by one byte
+  pshufb                  m4, m1 ; next row high half: shift down, repeat the last byte
+  mova [dstq  +strideq     ], m3
+  mova [dstq  +strideq  +16], m4
+  mova [dst16q+strideq     ], m4
+  palignr                 m5, m4, m3, 1 ; low half alternates between m5 and m3
+  pshufb                  m4, m1
+  mova [dstq  +strideq*2   ], m5
+  mova [dstq  +strideq*2+16], m4
+  mova [dst16q+strideq*2   ], m4
+  palignr                 m3, m4, m5, 1
+  pshufb                  m4, m1
+  mova [dstq  +stride3q    ], m3
+  mova [dstq  +stride3q +16], m4
+  mova [dst16q+stride3q    ], m4
+  palignr                 m5, m4, m3, 1
+  pshufb                  m4, m1
+  lea                  dstq, [dstq  +strideq*4] ; advance both row pointers by four rows
+  lea                dst16q, [dst16q+strideq*4]
+  dec                 lined ; ZF is set when the counter reaches zero
+  jnz .loop
+
+  ; right 16 columns of rows 16-31: after 16 shifts every byte of m4 holds the last filtered byte; dstq now points at row 16
+  mova [dstq            +16], m4 ; row 16
+  mova [dstq  +strideq  +16], m4
+  mova [dstq  +strideq*2+16], m4
+  mova [dstq  +stride3q +16], m4
+  lea                  dstq, [dstq  +strideq*4] ; advance dst by four rows
+  mova [dstq            +16], m4 ; row 20
+  mova [dstq  +strideq  +16], m4
+  mova [dstq  +strideq*2+16], m4
+  mova [dstq  +stride3q +16], m4
+  lea                  dstq, [dstq  +strideq*4] ; advance dst by four rows
+  mova [dstq            +16], m4 ; row 24
+  mova [dstq  +strideq  +16], m4
+  mova [dstq  +strideq*2+16], m4
+  mova [dstq  +stride3q +16], m4
+  lea                  dstq, [dstq  +strideq*4] ; advance dst by four rows
+  mova [dstq            +16], m4 ; row 28
+  mova [dstq  +strideq  +16], m4
+  mova [dstq  +strideq*2+16], m4
+  mova [dstq  +stride3q +16], m4
+
+  RESTORE_GOT
+  RET
+
+; ------------------------------------------
+; args: %1 = x, %2 = y, %3 = z (clobbered), %4 = result (output register)
+; Per-byte 3-tap filter; reads [GLOBAL(pb_1)].
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+  pavgb               %4, %1, %3 ; result = (x + z + 1) >> 1
+  pxor                %3, %1 ; z register now holds x ^ z
+  pand                %3, [GLOBAL(pb_1)] ; keep only bit 0 of each byte
+  psubb               %4, %3 ; undo the round-up: result = (x + z) >> 1
+  pavgb               %4, %2 ; result = (result + y + 1) >> 1
+%endmacro
+
+INIT_XMM ssse3
+cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset ; D63 predictor, 4x4: rows alternate 2-tap and 3-tap filtered above rows, shifted one byte every two rows
+  GET_GOT     goffsetq ; macro defined outside this chunk; presumably sets up PIC addressing for GLOBAL()
+
+  movq                m3, [aboveq] ; x = a0 .. a7
+  pshufb              m1, m3, [GLOBAL(sh_b23456777)] ; z = a[i+2]
+  pshufb              m2, m3, [GLOBAL(sh_b12345677)] ; y = a[i+1]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 ; m4 = (x + 2*y + z + 2) >> 2; m1 is clobbered
+  pavgb               m3, m2 ; m3 = (x + y + 1) >> 1
+
+  ; store 4 lines
+  movd    [dstq        ], m3 ; row 0 = 2-tap
+  movd    [dstq+strideq], m4 ; row 1 = 3-tap
+  lea               dstq, [dstq+strideq*2] ; advance dst by two rows
+  psrldq              m3, 1 ; shift both filtered rows down by one byte
+  psrldq              m4, 1
+  movd    [dstq        ], m3 ; row 2 = 2-tap, shifted
+  movd    [dstq+strideq], m4 ; row 3 = 3-tap, shifted
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset ; D63 predictor, 8x8: rows alternate 2-tap and 3-tap filtered above rows, shifted one byte every two rows
+  GET_GOT     goffsetq ; macro defined outside this chunk; presumably sets up PIC addressing for GLOBAL()
+
+  movq                m3, [aboveq] ; a0 .. a7
+  DEFINE_ARGS dst, stride, stride3 ; above is not referenced past this point
+  lea           stride3q, [strideq*3] ; stride3 = 3 * stride
+  pshufb              m1, m3, [GLOBAL(sh_b2345677777777777)] ; z = a[i+2], padded with a7
+  pshufb              m0, m3, [GLOBAL(sh_b0123456777777777)] ; x = a[i], padded with a7
+  pshufb              m2, m3, [GLOBAL(sh_b1234567777777777)] ; y = a[i+1], padded with a7
+  pshufb              m3, [GLOBAL(sh_b0123456777777777)] ; m3 = same padded x as m0
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4 ; m4 = (x + 2*y + z + 2) >> 2; m1 is clobbered
+  pavgb               m3, m2 ; m3 = (x + y + 1) >> 1
+
+  ; store 4 lines
+  movq    [dstq        ], m3 ; row 0 = 2-tap
+  movq    [dstq+strideq], m4 ; row 1 = 3-tap
+  psrldq              m3, 1 ; shift both filtered rows down by one byte
+  psrldq              m4, 1
+  movq  [dstq+strideq*2], m3 ; row 2
+  movq  [dstq+stride3q ], m4 ; row 3
+  lea               dstq, [dstq+strideq*4] ; advance dst by four rows
+  psrldq              m3, 1
+  psrldq              m4, 1
+
+  ; store 4 lines
+  movq    [dstq        ], m3 ; row 4
+  movq    [dstq+strideq], m4 ; row 5
+  psrldq              m3, 1
+  psrldq              m4, 1
+  movq  [dstq+strideq*2], m3 ; row 6
+  movq  [dstq+stride3q ], m4 ; row 7
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset ; D63 predictor, 16x16: rows alternate 2-tap and 3-tap filtered above rows, shifted one byte every two rows
+  GET_GOT     goffsetq ; macro defined outside this chunk; presumably sets up PIC addressing for GLOBAL()
+
+  mova                m0, [aboveq] ; x = a0 .. a15
+  DEFINE_ARGS dst, stride, stride3, line ; above is not referenced past this point
+  lea           stride3q, [strideq*3] ; stride3 = 3 * stride
+  mova                m1, [GLOBAL(sh_b123456789abcdeff)] ; m1 = shift-down-by-one control (last byte repeated)
+  pshufb              m2, m0, [GLOBAL(sh_b23456789abcdefff)] ; z = a[i+2]
+  pshufb              m3, m0, m1 ; y = a[i+1]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4 ; m4 = (x + 2*y + z + 2) >> 2; m2 is clobbered
+  pavgb               m0, m3 ; m0 = (x + y + 1) >> 1
+
+  mov              lined, 4 ; 4 iterations x 4 rows = 16 rows
+.loop:
+  mova  [dstq          ], m0 ; row 4k+0 = 2-tap
+  mova  [dstq+strideq  ], m4 ; row 4k+1 = 3-tap
+  pshufb              m0, m1 ; shift both filtered rows down one byte, repeating the last byte
+  pshufb              m4, m1
+  mova  [dstq+strideq*2], m0 ; row 4k+2
+  mova  [dstq+stride3q ], m4 ; row 4k+3
+  pshufb              m0, m1
+  pshufb              m4, m1
+  lea               dstq, [dstq+strideq*4] ; advance dst by four rows
+  dec              lined ; ZF is set when the counter reaches zero
+  jnz .loop
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM ssse3
+cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset ; D63 predictor, 32x32: rows alternate 2-tap and 3-tap filtered above rows, shifted one byte every two rows
+  GET_GOT     goffsetq ; macro defined outside this chunk; presumably sets up PIC addressing for GLOBAL()
+
+  mova                   m0, [aboveq] ; low half: a0 .. a15
+  mova                   m7, [aboveq+16] ; high half: a16 .. a31
+  DEFINE_ARGS dst, stride, stride3, line ; above is not referenced past this point
+  mova                   m1, [GLOBAL(sh_b123456789abcdeff)] ; m1 = shift-down-by-one control (last byte repeated)
+  lea              stride3q, [strideq*3] ; stride3 = 3 * stride
+  pshufb                 m2, m7, [GLOBAL(sh_b23456789abcdefff)] ; high half: z = a[i+2]
+  pshufb                 m3, m7, m1 ; high half: y = a[i+1]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4 ; m4 = 3-tap high half; m2 is clobbered
+  palignr                m6, m7, m0, 1 ; low half: y = a1 .. a16
+  palignr                m5, m7, m0, 2 ; low half: z = a2 .. a17
+  pavgb                  m7, m3 ; m7 = 2-tap high half
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2 ; m2 = 3-tap low half; m5 is clobbered
+  pavgb                  m0, m6 ; m0 = 2-tap low half
+
+  mov                 lined, 8 ; 8 iterations x 4 rows = 32 rows
+.loop:
+  mova  [dstq             ], m0 ; row 4k+0 = 2-tap, columns 0..15
+  mova  [dstq          +16], m7 ; row 4k+0, columns 16..31
+  mova  [dstq+strideq     ], m2 ; row 4k+1 = 3-tap, columns 0..15
+  mova  [dstq+strideq  +16], m4 ; row 4k+1, columns 16..31
+  palignr                m3, m7, m0, 1 ; 2-tap low half shifted down one byte (taken before m7 is shifted)
+  palignr                m5, m4, m2, 1 ; 3-tap low half shifted down one byte
+  pshufb                 m7, m1 ; shift the high halves, repeating the last byte
+  pshufb                 m4, m1
+
+  mova  [dstq+strideq*2   ], m3 ; row 4k+2
+  mova  [dstq+strideq*2+16], m7
+  mova  [dstq+stride3q    ], m5 ; row 4k+3
+  mova  [dstq+stride3q +16], m4
+  palignr                m0, m7, m3, 1 ; shift again, back into m0/m2 for the next iteration
+  palignr                m2, m4, m5, 1
+  pshufb                 m7, m1
+  pshufb                 m4, m1
+  lea                  dstq, [dstq+strideq*4] ; advance dst by four rows
+  dec                 lined ; ZF is set when the counter reaches zero
+  jnz .loop
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset ; D153 predictor, 4x4: built from 2-tap/3-tap averages of left, top-left and above; rows are stored bottom-up
+  GET_GOT     goffsetq ; macro defined outside this chunk; presumably sets up PIC addressing for GLOBAL()
+  movd                m0, [leftq]               ; l1, l2, l3, l4
+  movd                m1, [aboveq-1]            ; tl, t1, t2, t3
+  punpckldq           m0, m1                    ; l1, l2, l3, l4, tl, t1, t2, t3
+  pshufb              m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
+  psrldq              m1, m0, 1                 ; l3, l2, l1, tl, t1, t2, t3
+  psrldq              m2, m0, 2                 ; l2, l1, tl, t1, t2, t3
+  ; comments below are for a predictor like this
+  ; A1 B1 C1 D1
+  ; A2 B2 A1 B1
+  ; A3 B3 A2 B2
+  ; A4 B4 A3 B3
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; 3-tap avg B4 B3 B2 B1 C1 D1 (m2 is clobbered)
+  pavgb               m1, m0                    ; 2-tap avg A4 A3 A2 A1
+
+  punpcklqdq          m3, m1                    ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
+
+  DEFINE_ARGS dst, stride, stride3 ; above and left are not referenced past this point
+  lea           stride3q, [strideq*3] ; stride3 = 3 * stride
+  pshufb              m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
+  movd  [dstq+stride3q ], m3 ; row 3
+  psrldq              m3, 2                     ; A3 B3 A2 B2 A1 B1 C1 D1 ..
+  movd  [dstq+strideq*2], m3 ; row 2
+  psrldq              m3, 2                     ; A2 B2 A1 B1 C1 D1 ..
+  movd  [dstq+strideq  ], m3 ; row 1
+  psrldq              m3, 2                     ; A1 B1 C1 D1 ..
+  movd  [dstq          ], m3 ; row 0
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset ; D153 predictor, 8x8: built from 2-tap/3-tap averages of left, top-left and above
+  GET_GOT     goffsetq ; macro defined outside this chunk; presumably sets up PIC addressing for GLOBAL()
+  movq                m0, [leftq]                     ; [0- 7] l1-8 [byte]
+  movhps              m0, [aboveq-1]                  ; [8-15] tl, t1-7 [byte]
+  pshufb              m1, m0, [GLOBAL(sh_b76543210)]  ; l8-1 [byte]
+  pshufb              m2, m0, [GLOBAL(sh_b65432108)]  ; l7-1,tl [byte]
+  pshufb              m3, m0, [GLOBAL(sh_b54321089)]  ; l6-1,tl,t1 [byte]
+  pshufb              m0, [GLOBAL(sh_b89abcdef)]      ; tl,t1-7 [byte]
+  psrldq              m4, m0, 1                       ; t1-7 [byte]
+  psrldq              m5, m0, 2                       ; t2-7 [byte]
+  ; comments below are for a predictor like this
+  ; A1 B1 C1 D1 E1 F1 G1 H1
+  ; A2 B2 A1 B1 C1 D1 E1 F1
+  ; A3 B3 A2 B2 A1 B1 C1 D1
+  ; A4 B4 A3 B3 A2 B2 A1 B1
+  ; A5 B5 A4 B4 A3 B3 A2 B2
+  ; A6 B6 A5 B5 A4 B4 A3 B3
+  ; A7 B7 A6 B6 A5 B5 A4 B4
+  ; A8 B8 A7 B7 A6 B6 A5 B5
+  pavgb               m6, m1, m2                ; 2-tap avg A8-A1
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7  ; 3-tap avg C-H1 (m5 is clobbered)
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0  ; 3-tap avg B8-1 (m3 is clobbered)
+
+  punpcklbw           m6, m0                    ; A-B8, A-B7 ... A-B2, A-B1
+
+  DEFINE_ARGS dst, stride, stride3 ; above and left are not referenced past this point
+  lea           stride3q, [strideq*3] ; stride3 = 3 * stride
+
+  movhps [dstq+stride3q], m6                    ; A-B4, A-B3, A-B2, A-B1
+  palignr             m0, m7, m6, 10            ; A-B3, A-B2, A-B1, C-H1
+  movq  [dstq+strideq*2], m0 ; row 2
+  psrldq              m0, 2                     ; A-B2, A-B1, C-H1
+  movq  [dstq+strideq  ], m0 ; row 1
+  psrldq              m0, 2                     ; A-H1
+  movq  [dstq          ], m0 ; row 0
+  lea               dstq, [dstq+strideq*4] ; advance dst by four rows
+  movq  [dstq+stride3q ], m6                    ; A-B8, A-B7, A-B6, A-B5
+  psrldq              m6, 2                     ; A-B7, A-B6, A-B5, A-B4
+  movq  [dstq+strideq*2], m6 ; row 6
+  psrldq              m6, 2                     ; A-B6, A-B5, A-B4, A-B3
+  movq  [dstq+strideq  ], m6 ; row 5
+  psrldq              m6, 2                     ; A-B5, A-B4, A-B3, A-B2
+  movq  [dstq          ], m6 ; row 4
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset ; D153 predictor, 16x16: built from 2-tap/3-tap averages of left, top-left and above
+  GET_GOT     goffsetq ; macro defined outside this chunk; presumably sets up PIC addressing for GLOBAL()
+  mova                m0, [leftq] ; l1 .. l16 [byte]
+  movu                m7, [aboveq-1] ; tl, t1 .. t15 [byte]
+  ; comments below are for a predictor like this
+  ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
+  ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
+  ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
+  ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
+  ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
+  ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
+  ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
+  ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
+  ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
+  ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
+  ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
+  ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
+  ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
+  ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
+  ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
+  ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
+  pshufb              m6, m7, [GLOBAL(sh_bfedcba9876543210)] ; reversed above: t15 .. t1, tl (tl in byte 15)
+  palignr             m5, m0, m6, 15 ; tl, l1 .. l15
+  palignr             m3, m0, m6, 14 ; t1, tl, l1 .. l14
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B1-Bg (m3 is clobbered)
+  pshufb              m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; NOTE(review): m1 is overwritten by the macro output below before it is read, so this looks dead
+  pavgb               m5, m0                            ; A1 - Ag
+
+  punpcklbw           m0, m4, m5                        ; pairs for rows 1-8, byte order B1 A1 ... B8 A8 (reversed below)
+  punpckhbw           m4, m5                            ; pairs for rows 9-16, byte order B9 A9 ... Bg Ag (reversed below)
+
+  pshufb              m3, m7, [GLOBAL(sh_b123456789abcdeff)] ; t1 .. t15, t15
+  pshufb              m5, m7, [GLOBAL(sh_b23456789abcdefff)] ; t2 .. t15, t15, t15
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg C1-P1 (m5 is clobbered)
+
+  pshufb              m6, m0, [GLOBAL(sh_bfedcba9876543210)] ; A8 B8 A7 B7 ... A1 B1
+  DEFINE_ARGS dst, stride, stride3 ; above and left are not referenced past this point
+  lea           stride3q, [strideq*3] ; stride3 = 3 * stride
+  palignr             m2, m1, m6, 14 ; A1 B1 followed by C1 ..
+  mova  [dstq          ], m2 ; row 0
+  palignr             m2, m1, m6, 12 ; each row takes one more A/B pair from m6
+  mova  [dstq+strideq  ], m2 ; row 1
+  palignr             m2, m1, m6, 10
+  mova  [dstq+strideq*2], m2 ; row 2
+  palignr             m2, m1, m6, 8
+  mova  [dstq+stride3q ], m2 ; row 3
+  lea               dstq, [dstq+strideq*4] ; advance dst by four rows
+  palignr             m2, m1, m6, 6
+  mova  [dstq          ], m2 ; row 4
+  palignr             m2, m1, m6, 4
+  mova  [dstq+strideq  ], m2 ; row 5
+  palignr             m2, m1, m6, 2
+  mova  [dstq+strideq*2], m2 ; row 6
+  pshufb              m4, [GLOBAL(sh_bfedcba9876543210)] ; Ag Bg ... A9 B9
+  mova  [dstq+stride3q ], m6 ; row 7
+  lea               dstq, [dstq+strideq*4] ; advance dst by four rows
+
+  palignr             m2, m6, m4, 14 ; A9 B9 followed by A8 B8 ..
+  mova  [dstq          ], m2 ; row 8
+  palignr             m2, m6, m4, 12
+  mova  [dstq+strideq  ], m2 ; row 9
+  palignr             m2, m6, m4, 10
+  mova  [dstq+strideq*2], m2 ; row 10
+  palignr             m2, m6, m4, 8
+  mova  [dstq+stride3q ], m2 ; row 11
+  lea               dstq, [dstq+strideq*4] ; advance dst by four rows
+  palignr             m2, m6, m4, 6
+  mova  [dstq          ], m2 ; row 12
+  palignr             m2, m6, m4, 4
+  mova  [dstq+strideq  ], m2 ; row 13
+  palignr             m2, m6, m4, 2
+  mova  [dstq+strideq*2], m2 ; row 14
+  mova  [dstq+stride3q ], m4 ; row 15
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  mova                  m0, [leftq]
+  movu                  m7, [aboveq-1]
+  movu                  m1, [aboveq+15]
+
+  pshufb                m4, m1, [GLOBAL(sh_b123456789abcdeff)]
+  pshufb                m6, m1, [GLOBAL(sh_b23456789abcdefff)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2          ; 3-tap avg above [high]
+
+  palignr               m3, m1, m7, 1
+  palignr               m5, m1, m7, 2
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg above [low]
+
+  pshufb                m7, [GLOBAL(sh_bfedcba9876543210)]
+  palignr               m5, m0, m7, 15
+  palignr               m3, m0, m7, 14
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
+  pavgb                 m5, m0                            ; A1 - Ag
+  punpcklbw             m6, m4, m5                        ; A-B8 ... A-B1
+  punpckhbw             m4, m5                            ; A-B9 ... A-Bg
+  pshufb                m6, [GLOBAL(sh_bfedcba9876543210)]
+  pshufb                m4, [GLOBAL(sh_bfedcba9876543210)]
+
+  DEFINE_ARGS dst, stride, stride3, left, line
+  lea             stride3q, [strideq*3]
+
+  palignr               m5, m2, m1, 14
+  palignr               m7, m1, m6, 14
+  mova  [dstq            ], m7
+  mova  [dstq+16         ], m5
+  palignr               m5, m2, m1, 12
+  palignr               m7, m1, m6, 12
+  mova  [dstq+strideq    ], m7
+  mova  [dstq+strideq+16 ], m5
+  palignr                m5, m2, m1, 10
+  palignr                m7, m1, m6, 10
+  mova  [dstq+strideq*2   ], m7
+  mova  [dstq+strideq*2+16], m5
+  palignr                m5, m2, m1, 8
+  palignr                m7, m1, m6, 8
+  mova  [dstq+stride3q    ], m7
+  mova  [dstq+stride3q+16 ], m5
+  lea                  dstq, [dstq+strideq*4]
+  palignr                m5, m2, m1, 6
+  palignr                m7, m1, m6, 6
+  mova  [dstq             ], m7
+  mova  [dstq+16          ], m5
+  palignr                m5, m2, m1, 4
+  palignr                m7, m1, m6, 4
+  mova  [dstq+strideq     ], m7
+  mova  [dstq+strideq+16  ], m5
+  palignr                m5, m2, m1, 2
+  palignr                m7, m1, m6, 2
+  mova  [dstq+strideq*2   ], m7
+  mova  [dstq+strideq*2+16], m5
+  mova  [dstq+stride3q    ], m6
+  mova  [dstq+stride3q+16 ], m1
+  lea                  dstq, [dstq+strideq*4]
+
+  palignr                m5, m1, m6, 14
+  palignr                m3, m6, m4, 14
+  mova  [dstq             ], m3
+  mova  [dstq+16          ], m5
+  palignr                m5, m1, m6, 12
+  palignr                m3, m6, m4, 12
+  mova  [dstq+strideq     ], m3
+  mova  [dstq+strideq+16  ], m5
+  palignr                m5, m1, m6, 10
+  palignr                m3, m6, m4, 10
+  mova  [dstq+strideq*2   ], m3
+  mova  [dstq+strideq*2+16], m5
+  palignr                m5, m1, m6, 8
+  palignr                m3, m6, m4, 8
+  mova  [dstq+stride3q    ], m3
+  mova  [dstq+stride3q+16 ], m5
+  lea                  dstq, [dstq+strideq*4]
+  palignr                m5, m1, m6, 6
+  palignr                m3, m6, m4, 6
+  mova  [dstq             ], m3
+  mova  [dstq+16          ], m5
+  palignr                m5, m1, m6, 4
+  palignr                m3, m6, m4, 4
+  mova  [dstq+strideq     ], m3
+  mova  [dstq+strideq+16  ], m5
+  palignr                m5, m1, m6, 2
+  palignr                m3, m6, m4, 2
+  mova  [dstq+strideq*2   ], m3
+  mova  [dstq+strideq*2+16], m5
+  mova  [dstq+stride3q    ], m4
+  mova  [dstq+stride3q+16 ], m6
+  lea               dstq, [dstq+strideq*4]
+
+  mova                   m7, [leftq]
+  mova                   m3, [leftq+16]
+  palignr                m5, m3, m7, 15
+  palignr                m0, m3, m7, 14
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2          ; 3-tap avg Bh -
+  pavgb                  m5, m3                            ; Ah -
+  punpcklbw              m3, m2, m5                        ; A-B8 ... A-B1
+  punpckhbw              m2, m5                            ; A-B9 ... A-Bg
+  pshufb                 m3, [GLOBAL(sh_bfedcba9876543210)]
+  pshufb                 m2, [GLOBAL(sh_bfedcba9876543210)]
+
+  palignr                m7, m6, m4, 14
+  palignr                m0, m4, m3, 14
+  mova  [dstq             ], m0
+  mova  [dstq+16          ], m7
+  palignr                m7, m6, m4, 12
+  palignr                m0, m4, m3, 12
+  mova  [dstq+strideq     ], m0
+  mova  [dstq+strideq+16  ], m7
+  palignr                m7, m6, m4, 10
+  palignr                m0, m4, m3, 10
+  mova  [dstq+strideq*2   ], m0
+  mova  [dstq+strideq*2+16], m7
+  palignr                m7, m6, m4, 8
+  palignr                m0, m4, m3, 8
+  mova  [dstq+stride3q    ], m0
+  mova  [dstq+stride3q+16 ], m7
+  lea                  dstq, [dstq+strideq*4]
+  palignr                m7, m6, m4, 6
+  palignr                m0, m4, m3, 6
+  mova  [dstq             ], m0
+  mova  [dstq+16          ], m7
+  palignr                m7, m6, m4, 4
+  palignr                m0, m4, m3, 4
+  mova  [dstq+strideq     ], m0
+  mova  [dstq+strideq+16  ], m7
+  palignr                m7, m6, m4, 2
+  palignr                m0, m4, m3, 2
+  mova  [dstq+strideq*2   ], m0
+  mova  [dstq+strideq*2+16], m7
+  mova  [dstq+stride3q    ], m3
+  mova  [dstq+stride3q+16 ], m4
+  lea                  dstq, [dstq+strideq*4]
+
+  palignr                m7, m4, m3, 14
+  palignr                m0, m3, m2, 14
+  mova  [dstq             ], m0
+  mova  [dstq+16          ], m7
+  palignr                m7, m4, m3, 12
+  palignr                m0, m3, m2, 12
+  mova  [dstq+strideq     ], m0
+  mova  [dstq+strideq+16  ], m7
+  palignr                m7, m4, m3, 10
+  palignr                m0, m3, m2, 10
+  mova  [dstq+strideq*2   ], m0
+  mova  [dstq+strideq*2+16], m7
+  palignr                m7, m4, m3, 8
+  palignr                m0, m3, m2, 8
+  mova  [dstq+stride3q    ], m0
+  mova  [dstq+stride3q+16 ], m7
+  lea                  dstq, [dstq+strideq*4]
+  palignr                m7, m4, m3, 6
+  palignr                m0, m3, m2, 6
+  mova  [dstq             ], m0
+  mova  [dstq+16          ], m7
+  palignr                m7, m4, m3, 4
+  palignr                m0, m3, m2, 4
+  mova  [dstq+strideq     ], m0
+  mova  [dstq+strideq+16  ], m7
+  palignr                m7, m4, m3, 2
+  palignr                m0, m3, m2, 2
+  mova  [dstq+strideq*2   ], m0
+  mova  [dstq+strideq*2+16], m7
+  mova  [dstq+stride3q    ], m2
+  mova  [dstq+stride3q+16 ], m3
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX ssse3                           ; 4x4 D207 predictor: fills 4 rows of dst from the 4 bytes at left
+cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
+  GET_GOT     goffsetq
+  movd                m0, [leftq]                ; abcd [byte]
+  pshufb              m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
+  pshufb              m3, m0, [GLOBAL(sh_b2333)] ; cddd
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2   ; m2 = 3-tap values (a2bc, b2cd, c3d, d); macro defined outside this view
+  pavgb               m1, m0             ; ab, bc, cd, d [byte]
+
+  punpcklbw           m1, m2             ; ab, a2bc, bc, b2cd, cd, c3d, d, d
+  movd    [dstq        ], m1             ; row 0
+  psrlq               m1, 16             ; bc, b2cd, cd, c3d, d, d
+  movd    [dstq+strideq], m1             ; row 1
+  lea               dstq, [dstq+strideq*2]
+  psrlq               m1, 16             ; cd, c3d, d, d
+  movd    [dstq        ], m1             ; row 2
+  pshufw              m1, m1, q1111      ; d, d, d, d
+  movd    [dstq+strideq], m1             ; row 3
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3                               ; 8x8 D207 predictor: fills 8 rows of dst from the 8 bytes at left
+cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
+  GET_GOT     goffsetq
+  movq                m3, [leftq]            ; abcdefgh [byte]
+  lea           stride3q, [strideq*3]
+
+  pshufb              m1, m3, [GLOBAL(sh_b2345677777777777)] ; cdefghhh... (byte indices as spelled in the mask name)
+  pshufb              m0, m3, [GLOBAL(sh_b0123456777777777)] ; abcdefgh, then h repeated
+  pshufb              m2, m3, [GLOBAL(sh_b1234567777777777)] ; bcdefghh...
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3 ; m3 = 3-tap values (macro defined outside this view)
+  pavgb               m0, m2        ; 2-tap averages: ab, bc, ..., gh, h
+  punpcklbw           m0, m3        ; interleaved output
+
+  movq  [dstq          ], m0        ; row 0
+  psrldq              m0, 2         ; each row starts 2 bytes further into the interleaved stream
+  movq  [dstq+strideq  ], m0        ; row 1
+  psrldq              m0, 2
+  movq  [dstq+strideq*2], m0        ; row 2
+  psrldq              m0, 2
+  movq  [dstq+stride3q ], m0        ; row 3
+  lea               dstq, [dstq+strideq*4]
+  pshufhw             m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
+  psrldq              m0, 2
+  movq  [dstq          ], m0        ; row 4
+  psrldq              m0, 2
+  movq  [dstq+strideq  ], m0        ; row 5
+  psrldq              m0, 2
+  movq  [dstq+strideq*2], m0        ; row 6
+  psrldq              m0, 2
+  movq  [dstq+stride3q ], m0        ; row 7
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3                               ; 16x16 D207 predictor: fills 16 rows of dst from the 16 bytes at left
+cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
+  GET_GOT     goffsetq
+  lea           stride3q, [strideq*3]
+  mova                m0, [leftq]            ; abcdefghijklmnop [byte]
+  pshufb              m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
+  pshufb              m2, m0, [GLOBAL(sh_b23456789abcdefff)] ; cdefghijklmnoppp (byte indices as spelled in the mask name)
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; m3 = 3-tap values (macro defined outside this view)
+  pavgb               m1, m0                 ; ab, bc, cd .. no, op, pp [byte]
+
+  punpckhbw           m4, m1, m3    ; interleaved output, high 16 bytes
+  punpcklbw           m1, m3        ; interleaved output
+  mova  [dstq          ], m1        ; row 0
+  palignr             m3, m4, m1, 2 ; rows 1-7: 16-byte window over m4:m1, advanced 2 bytes per row
+  mova  [dstq+strideq  ], m3
+  palignr             m3, m4, m1, 4
+  mova  [dstq+strideq*2], m3
+  palignr             m3, m4, m1, 6
+  mova  [dstq+stride3q ], m3
+  lea               dstq, [dstq+strideq*4]
+  palignr             m3, m4, m1, 8
+  mova  [dstq          ], m3
+  palignr             m3, m4, m1, 10
+  mova  [dstq+strideq  ], m3
+  palignr             m3, m4, m1, 12
+  mova  [dstq+strideq*2], m3
+  palignr             m3, m4, m1, 14
+  mova  [dstq+stride3q ], m3
+  DEFINE_ARGS dst, stride, stride3, line ; fourth argument slot is renamed from left to line
+  mov              lined, 2              ; two loop passes of four rows each (rows 8-15)
+  mova                m0, [GLOBAL(sh_b23456789abcdefff)] ; shuffle mask applied to m4 between rows
+.loop:
+  lea               dstq, [dstq+strideq*4]
+  mova  [dstq          ], m4
+  pshufb              m4, m0
+  mova  [dstq+strideq  ], m4
+  pshufb              m4, m0
+  mova  [dstq+strideq*2], m4
+  pshufb              m4, m0
+  mova  [dstq+stride3q ], m4
+  pshufb              m4, m0
+  dec              lined
+  jnz .loop
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM ssse3                                 ; 32x32 D207 predictor: fills 32 rows of dst from the 32 bytes at left
+cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
+  GET_GOT     goffsetq
+  lea           stride3q, [strideq*3]
+  mova                m1, [leftq]              ;  0-15 [byte]
+  mova                m2, [leftq+16]           ; 16-31 [byte]
+  pshufb              m0, m2, [GLOBAL(sh_b23456789abcdefff)]
+  pshufb              m4, m2, [GLOBAL(sh_b123456789abcdeff)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3 ; m3 = 3-tap values for bytes 16-31 (macro defined outside this view)
+  palignr             m6, m2, m1, 1  ; left bytes 1-16
+  palignr             m5, m2, m1, 2  ; left bytes 2-17
+  pavgb               m2, m4         ; high 16px even lines
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0 ; m0 = 3-tap values for bytes 0-15
+  pavgb                   m1, m6         ; low 16px even lines
+
+  punpckhbw               m6, m1, m0               ; interleaved output 2
+  punpcklbw               m1, m0                   ; interleaved output 1
+
+  punpckhbw               m7, m2, m3               ; interleaved output 4
+  punpcklbw               m2, m3                   ; interleaved output 3
+
+  ; output 1st 8 lines (and half of 2nd 8 lines)
+  DEFINE_ARGS dst, stride, stride3, dst8 ; fourth argument slot is renamed from left to dst8
+  lea                  dst8q, [dstq+strideq*8] ; dst8 runs 8 rows ahead of dst
+  mova  [dstq              ], m1
+  mova  [dstq           +16], m6
+  mova  [dst8q             ], m6       ; the right half of row r is also the left half of row r+8
+  palignr             m0, m6, m1, 2
+  palignr             m4, m2, m6, 2
+  mova  [dstq +strideq     ], m0
+  mova  [dstq +strideq  +16], m4
+  mova  [dst8q+strideq     ], m4
+  palignr             m0, m6, m1, 4
+  palignr             m4, m2, m6, 4
+  mova  [dstq +strideq*2   ], m0
+  mova  [dstq +strideq*2+16], m4
+  mova  [dst8q+strideq*2   ], m4
+  palignr             m0, m6, m1, 6
+  palignr             m4, m2, m6, 6
+  mova  [dstq +stride3q    ], m0
+  mova  [dstq +stride3q +16], m4
+  mova  [dst8q+stride3q    ], m4
+  lea               dstq, [dstq +strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+  palignr             m0, m6, m1, 8
+  palignr             m4, m2, m6, 8
+  mova  [dstq              ], m0
+  mova  [dstq           +16], m4
+  mova  [dst8q             ], m4
+  palignr             m0, m6, m1, 10
+  palignr             m4, m2, m6, 10
+  mova  [dstq +strideq     ], m0
+  mova  [dstq +strideq  +16], m4
+  mova  [dst8q+strideq     ], m4
+  palignr             m0, m6, m1, 12
+  palignr             m4, m2, m6, 12
+  mova  [dstq +strideq*2   ], m0
+  mova  [dstq +strideq*2+16], m4
+  mova  [dst8q+strideq*2   ], m4
+  palignr             m0, m6, m1, 14
+  palignr             m4, m2, m6, 14
+  mova  [dstq +stride3q    ], m0
+  mova  [dstq +stride3q +16], m4
+  mova  [dst8q+stride3q    ], m4
+  lea               dstq, [dstq+strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+
+  ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
+  mova  [dstq           +16], m2
+  mova  [dst8q             ], m2
+  palignr             m4, m7, m2, 2
+  mova  [dstq +strideq  +16], m4
+  mova  [dst8q+strideq     ], m4
+  palignr             m4, m7, m2, 4
+  mova  [dstq +strideq*2+16], m4
+  mova  [dst8q+strideq*2   ], m4
+  palignr             m4, m7, m2, 6
+  mova  [dstq +stride3q +16], m4
+  mova  [dst8q+stride3q    ], m4
+  lea               dstq, [dstq+strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+  palignr             m4, m7, m2, 8
+  mova  [dstq           +16], m4
+  mova  [dst8q             ], m4
+  palignr             m4, m7, m2, 10
+  mova  [dstq +strideq  +16], m4
+  mova  [dst8q+strideq     ], m4
+  palignr             m4, m7, m2, 12
+  mova  [dstq +strideq*2+16], m4
+  mova  [dst8q+strideq*2   ], m4
+  palignr             m4, m7, m2, 14
+  mova  [dstq +stride3q +16], m4
+  mova  [dst8q+stride3q    ], m4
+  lea               dstq, [dstq+strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+
+  ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
+  mova                m0, [GLOBAL(sh_b23456789abcdefff)] ; shuffle mask applied to m7 between rows
+  mova  [dstq           +16], m7
+  mova  [dst8q             ], m7
+  pshufb              m7, m0
+  mova  [dstq +strideq  +16], m7
+  mova  [dst8q+strideq     ], m7
+  pshufb              m7, m0
+  mova  [dstq +strideq*2+16], m7
+  mova  [dst8q+strideq*2   ], m7
+  pshufb              m7, m0
+  mova  [dstq +stride3q +16], m7
+  mova  [dst8q+stride3q    ], m7
+  pshufb              m7, m0
+  lea               dstq, [dstq+strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+  mova  [dstq           +16], m7
+  mova  [dst8q             ], m7
+  pshufb              m7, m0
+  mova  [dstq +strideq  +16], m7
+  mova  [dst8q+strideq     ], m7
+  pshufb              m7, m0
+  mova  [dstq +strideq*2+16], m7
+  mova  [dst8q+strideq*2   ], m7
+  pshufb              m7, m0
+  mova  [dstq +stride3q +16], m7
+  mova  [dst8q+stride3q    ], m7
+  pshufb              m7, m0
+  lea               dstq, [dstq+strideq*4]
+
+  ; output last half of 4th 8 lines
+  mova  [dstq           +16], m7       ; m7 is no longer modified: the same value fills all 8 rows
+  mova  [dstq +strideq  +16], m7
+  mova  [dstq +strideq*2+16], m7
+  mova  [dstq +stride3q +16], m7
+  lea               dstq, [dstq+strideq*4]
+  mova  [dstq           +16], m7
+  mova  [dstq +strideq  +16], m7
+  mova  [dstq +strideq*2+16], m7
+  mova  [dstq +stride3q +16], m7
+
+  ; done!
+  RESTORE_GOT
+  RET
diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
new file mode 100644
index 0000000000..ae907fd0bd
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -0,0 +1,4060 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+#define RECON_AND_STORE4X4(dest, in_x) \
+{ /* Adds the low four 16-bit lanes of in_x to the 4 bytes at dest; needs `zero` in the caller's scope. */ \
+  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); /* load 4 pixels */ \
+  d0 = _mm_unpacklo_epi8(d0, zero); /* widen bytes to 16 bits */ \
+  d0 = _mm_add_epi16(in_x, d0); \
+  d0 = _mm_packus_epi16(d0, d0); /* back to bytes with unsigned saturation */ \
+  *(int *)(dest) = _mm_cvtsi128_si32(d0); \
+}
+
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
+                             int stride) {  // 2-D 4x4 inverse DCT, added into dest
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i eight = _mm_set1_epi16(8);  // rounding term for the final >> 4
+  const __m128i cst = _mm_setr_epi16(
+      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
+      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i input0, input1, input2, input3;
+
+  // Rows (load_input_data is defined outside this view; presumably 8
+  // coefficients per load, given the +8 offset - confirm)
+  input0 = load_input_data(input);
+  input2 = load_input_data(input + 8);
+
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_shufflelo_epi16(input0, 0xd8);  // 0xd8: word order 0, 2, 1, 3
+  input0 = _mm_shufflehi_epi16(input0, 0xd8);
+  input2 = _mm_shufflelo_epi16(input2, 0xd8);
+  input2 = _mm_shufflehi_epi16(input2, 0xd8);
+
+  input1 = _mm_unpackhi_epi32(input0, input0);  // duplicate each 32-bit pair
+  input0 = _mm_unpacklo_epi32(input0, input0);
+  input3 = _mm_unpackhi_epi32(input2, input2);
+  input2 = _mm_unpacklo_epi32(input2, input2);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, input1);  // back to 16 bits, signed saturation
+  input1 = _mm_packs_epi32(input2, input3);
+
+  // Transpose
+  input2 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpackhi_epi16(input0, input1);
+  input0 = _mm_unpacklo_epi32(input2, input3);
+  input1 = _mm_unpackhi_epi32(input2, input3);
+
+  // Switch column2, column 3, and then, we got:
+  // input2: column1, column 0;  input3: column2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);  // 0x4e swaps the 64-bit halves
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Columns
+  // Construct i3, i1, i3, i1, i2, i0, i2, i0
+  input0 = _mm_unpacklo_epi32(input2, input2);
+  input1 = _mm_unpackhi_epi32(input2, input2);
+  input2 = _mm_unpackhi_epi32(input3, input3);
+  input3 = _mm_unpacklo_epi32(input3, input3);
+
+  // Stage 1
+  input0 = _mm_madd_epi16(input0, cst);
+  input1 = _mm_madd_epi16(input1, cst);
+  input2 = _mm_madd_epi16(input2, cst);
+  input3 = _mm_madd_epi16(input3, cst);
+
+  input0 = _mm_add_epi32(input0, rounding);
+  input1 = _mm_add_epi32(input1, rounding);
+  input2 = _mm_add_epi32(input2, rounding);
+  input3 = _mm_add_epi32(input3, rounding);
+
+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+  // Stage 2
+  input0 = _mm_packs_epi32(input0, input2);
+  input1 = _mm_packs_epi32(input1, input3);
+
+  // Transpose
+  input2 = _mm_unpacklo_epi16(input0, input1);
+  input3 = _mm_unpackhi_epi16(input0, input1);
+  input0 = _mm_unpacklo_epi32(input2, input3);
+  input1 = _mm_unpackhi_epi32(input2, input3);
+
+  // Switch column2, column 3, and then, we got:
+  // input2: column1, column 0;  input3: column2, column 3.
+  input1 = _mm_shuffle_epi32(input1, 0x4e);
+  input2 = _mm_add_epi16(input0, input1);
+  input3 = _mm_sub_epi16(input0, input1);
+
+  // Final round and shift
+  input2 = _mm_add_epi16(input2, eight);
+  input3 = _mm_add_epi16(input3, eight);
+
+  input2 = _mm_srai_epi16(input2, 4);
+  input3 = _mm_srai_epi16(input3, 4);
+
+  // Reconstruction and Store
+  {
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));  // row 0
+    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));  // row 2
+    d0 = _mm_unpacklo_epi32(d0,
+                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+    d2 = _mm_unpacklo_epi32(
+        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
+    d0 = _mm_unpacklo_epi8(d0, zero);  // rows 0, 1 widened to 16 bits
+    d2 = _mm_unpacklo_epi8(d2, zero);  // rows 3, 2 widened to 16 bits
+    d0 = _mm_add_epi16(d0, input2);
+    d2 = _mm_add_epi16(d2, input3);
+    d0 = _mm_packus_epi16(d0, d2);  // bytes now hold rows 0, 1, 3, 2
+    // store row 0
+    *(int *)dest = _mm_cvtsi128_si32(d0);
+    // store row 1
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+    // store row 3 (d2 was packed as row 3, then row 2)
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+    // store row 2
+    d0 = _mm_srli_si128(d0, 4);
+    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+  }
+}
+
+// DC-only 4x4 inverse DCT: one residual, derived from input[0], is added to
+// every pixel of the 4x4 block at dest (rows are stride bytes apart).
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  const __m128i zero = _mm_setzero_si128();  // read by RECON_AND_STORE4X4
+  __m128i residual;
+  int dc, row;
+
+  dc = dct_const_round_shift(input[0] * cospi_16_64);
+  dc = dct_const_round_shift(dc * cospi_16_64);
+  dc = ROUND_POWER_OF_TWO(dc, 4);
+  residual = _mm_set1_epi16(dc);
+
+  for (row = 0; row < 4; ++row) {
+    RECON_AND_STORE4X4(dest + row * stride, residual);
+  }
+}
+
+// Transposes, in place, the 4x4 block of 16-bit values held in res[0..1].
+static INLINE void transpose_4x4(__m128i *res) {
+  const __m128i lo = _mm_unpacklo_epi16(res[0], res[1]);
+  const __m128i hi = _mm_unpackhi_epi16(res[0], res[1]);
+  res[0] = _mm_unpacklo_epi16(lo, hi);  // second interleave, low halves
+  res[1] = _mm_unpackhi_epi16(lo, hi);  // second interleave, high halves
+}
+
+// One 1-D pass of the 4-point inverse DCT on the 4x4 block in in[0..1]:
+// transpose, multiply-accumulate against the cosine pairs, then butterfly.
+void idct4_sse2(__m128i *in) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i c_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i c_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i c_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i c_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  __m128i lo, hi, prod[4], even, odd;
+  int i;
+
+  transpose_4x4(in);
+
+  // Stage 1: pair the inputs and multiply-accumulate into 32-bit lanes.
+  lo = _mm_unpacklo_epi16(in[0], in[1]);
+  hi = _mm_unpackhi_epi16(in[0], in[1]);
+  prod[0] = _mm_madd_epi16(lo, c_p16_p16);
+  prod[1] = _mm_madd_epi16(lo, c_p16_m16);
+  prod[2] = _mm_madd_epi16(hi, c_p24_m08);
+  prod[3] = _mm_madd_epi16(hi, c_p08_p24);
+
+  // Round, drop DCT_CONST_BITS fractional bits, and repack to 16 bits.
+  for (i = 0; i < 4; ++i) {
+    prod[i] = _mm_add_epi32(prod[i], rounding);
+    prod[i] = _mm_srai_epi32(prod[i], DCT_CONST_BITS);
+  }
+
+  even = _mm_packs_epi32(prod[0], prod[1]);
+  odd = _mm_packs_epi32(prod[3], prod[2]);
+
+  // Stage 2: butterfly; 0x4E swaps the 64-bit halves of the difference.
+  in[0] = _mm_add_epi16(even, odd);
+  in[1] = _mm_sub_epi16(even, odd);
+  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
+}
+
+// One 1-D pass of the 4-point inverse ADST on the 4x4 block in in[0..1].
+// The block is transposed first, the sinpi_*_9 products are combined in
+// 32 bits, rounded, shifted by DCT_CONST_BITS and packed back into in[0..1]
+// with signed saturation.
+void iadst4_sse2(__m128i *in) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i s_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
+  const __m128i s_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
+  const __m128i s_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
+  const __m128i s_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
+  const __m128i s_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+  __m128i lo, hi, mix, top, prod[6], out[4];
+  int i;
+
+  transpose_4x4(in);
+
+  // mix = (upper half of in[1]) + in[0] - in[1]; only its low half is used.
+  mix = _mm_sub_epi16(_mm_add_epi16(_mm_srli_si128(in[1], 8), in[0]), in[1]);
+  lo = _mm_unpacklo_epi16(in[0], in[1]);
+  hi = _mm_unpackhi_epi16(in[0], in[1]);
+  // Pair with zero so each madd below is a single 16x16 product.
+  mix = _mm_unpacklo_epi16(mix, zero);
+  top = _mm_unpackhi_epi16(in[0], zero);
+
+  prod[0] = _mm_madd_epi16(lo, s_p01_p04);   // s0 + s3
+  prod[1] = _mm_madd_epi16(hi, s_p03_p02);   // s2 + s5
+  prod[2] = _mm_madd_epi16(mix, s_p03_p03);  // x2
+  prod[3] = _mm_madd_epi16(lo, s_p02_m01);   // s1 - s4
+  prod[4] = _mm_madd_epi16(hi, s_p03_m04);   // s2 - s6
+  prod[5] = _mm_madd_epi16(top, s_p03_p03);  // s2
+
+  out[0] = _mm_add_epi32(prod[0], prod[1]);
+  out[1] = _mm_add_epi32(prod[3], prod[4]);
+  out[2] = prod[2];
+  // out[3] = out[0] + out[1] + prod[5] - (prod[5] << 2)
+  out[3] = _mm_add_epi32(_mm_add_epi32(out[0], out[1]), prod[5]);
+  out[3] = _mm_sub_epi32(out[3], _mm_slli_epi32(prod[5], 2));
+
+  // Add the rounding term, then drop the DCT_CONST_BITS fractional bits.
+  for (i = 0; i < 4; ++i) {
+    out[i] = _mm_add_epi32(out[i], rounding);
+    out[i] = _mm_srai_epi32(out[i], DCT_CONST_BITS);
+  }
+
+  in[0] = _mm_packs_epi32(out[0], out[1]);
+  in[1] = _mm_packs_epi32(out[2], out[3]);
+}
+
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
+                      out0, out1, out2, out3, out4, out5, out6, out7) \
+  { /* 8x8 transpose of 16-bit lanes; every input is read before any output is written. */ \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
+    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
+    /* Pass 2: interleave the 32-bit pairs. */          \
+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+    /* Pass 3: join 64-bit halves into the output rows. */  \
+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
+  }
+
+#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
+                         out0, out1, out2, out3) \
+  { /* Builds out0..out3 from tmp0..tmp3 with 16-, 32- and 64-bit unpacks. */ \
+    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
+    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
+    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
+    /* NOTE(review): mixed hi/lo and swapped operand order above presumably match the caller's data layout - confirm. */ \
+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+    /* Join 64-bit halves into the four outputs. */ \
+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+  }
+
+#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
+  { /* Transposes the low four 16-bit lanes of in0..in3 into out0 and out1. */ \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* lanes 0 and 1 of each input */ \
+    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* lanes 2 and 3 of each input */ \
+  }
+
+// Define Macro for multiplying elements by constants and adding them together.
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
+                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
+  { /* Uses tmp0..tmp7 and rounding from the caller's scope. */ \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      tmp4 = _mm_madd_epi16(lo_1, cst2); \
+      tmp5 = _mm_madd_epi16(hi_1, cst2); \
+      tmp6 = _mm_madd_epi16(lo_1, cst3); \
+      tmp7 = _mm_madd_epi16(hi_1, cst3); \
+      /* Add the rounding term to each 32-bit sum. */ \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      tmp4 = _mm_add_epi32(tmp4, rounding); \
+      tmp5 = _mm_add_epi32(tmp5, rounding); \
+      tmp6 = _mm_add_epi32(tmp6, rounding); \
+      tmp7 = _mm_add_epi32(tmp7, rounding); \
+      /* Arithmetic shift right by DCT_CONST_BITS. */ \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+      /* Pack lo/hi results back to 16 bits with signed saturation. */ \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+      res2 = _mm_packs_epi32(tmp4, tmp5); \
+      res3 = _mm_packs_epi32(tmp6, tmp7); \
+  }
+
+#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
+  { /* Two-result form: one lo/hi input pair, two constants; uses tmp0..tmp3 and rounding from the caller's scope. */ \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      /* Add the rounding term to each 32-bit sum. */ \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      /* Arithmetic shift right by DCT_CONST_BITS. */ \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      /* Pack lo/hi results back to 16 bits with signed saturation. */ \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+  }
+
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
+              out0, out1, out2, out3, out4, out5, out6, out7)  \
+  { /* 4-stage 1-D 8-point IDCT; uses stg1_*, stg2_*, stp1_*, stp2_*, tmp0..tmp7 and rounding from the caller's scope. */ \
+  /* Stage1: in1, in7, in3, in5 -> stp1_4..stp1_7 */      \
+  { \
+    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
+    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
+    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
+    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
+    \
+    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
+                          stg1_1, stg1_2, stg1_3, stp1_4,      \
+                          stp1_7, stp1_5, stp1_6)              \
+  } \
+    \
+  /* Stage2: in0, in4, in2, in6 -> stp2_0..stp2_3, plus butterflies on stp1_4..stp1_7 */ \
+  { \
+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
+    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
+    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
+    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
+    \
+    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
+                           stg2_1, stg2_2, stg2_3, stp2_0,     \
+                           stp2_1, stp2_2, stp2_3)             \
+    /* Saturating 16-bit adds and subtracts. */ \
+    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
+    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
+    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
+    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+  } \
+    \
+  /* Stage3: butterflies on stp2_0..stp2_3; stp2_5 and stp2_6 go through stg2_1 / stg2_0 */ \
+  { \
+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+    \
+    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
+    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
+    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
+    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+    \
+    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
+    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
+    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
+    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
+    \
+    tmp0 = _mm_add_epi32(tmp0, rounding); \
+    tmp1 = _mm_add_epi32(tmp1, rounding); \
+    tmp2 = _mm_add_epi32(tmp2, rounding); \
+    tmp3 = _mm_add_epi32(tmp3, rounding); \
+    \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+    \
+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  } \
+  \
+  /* Stage4: final butterflies; outputs are written only here, after all inputs were read */ \
+  out0 = _mm_adds_epi16(stp1_0, stp2_7); \
+  out1 = _mm_adds_epi16(stp1_1, stp1_6); \
+  out2 = _mm_adds_epi16(stp1_2, stp1_5); \
+  out3 = _mm_adds_epi16(stp1_3, stp2_4); \
+  out4 = _mm_subs_epi16(stp1_3, stp2_4); \
+  out5 = _mm_subs_epi16(stp1_2, stp1_5); \
+  out6 = _mm_subs_epi16(stp1_1, stp1_6); \
+  out7 = _mm_subs_epi16(stp1_0, stp2_7); \
+  }
+
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
+                             int stride) {  // 2-D 8x8 inverse DCT of all 64 coefficients
+  const __m128i zero = _mm_setzero_si128();  // presumably read by RECON_AND_STORE (defined outside this view)
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);  // read by IDCT8
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);  // rounding term for the final >> 5
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;  // IDCT8 scratch
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;  // IDCT8 scratch
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;  // IDCT8 scratch
+  int i;
+
+  // Load input data.
+  in0 = load_input_data(input);
+  in1 = load_input_data(input + 8 * 1);
+  in2 = load_input_data(input + 8 * 2);
+  in3 = load_input_data(input + 8 * 3);
+  in4 = load_input_data(input + 8 * 4);
+  in5 = load_input_data(input + 8 * 5);
+  in6 = load_input_data(input + 8 * 6);
+  in7 = load_input_data(input + 8 * 7);
+
+  // 2-D
+  for (i = 0; i < 2; i++) {  // two transpose + 1-D passes
+    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
+    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
+                  in0, in1, in2, in3, in4, in5, in6, in7);
+
+    // 4-stage 1D idct8x8
+    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+          in0, in1, in2, in3, in4, in5, in6, in7);
+  }
+
+  // Final rounding and shift
+  in0 = _mm_adds_epi16(in0, final_rounding);
+  in1 = _mm_adds_epi16(in1, final_rounding);
+  in2 = _mm_adds_epi16(in2, final_rounding);
+  in3 = _mm_adds_epi16(in3, final_rounding);
+  in4 = _mm_adds_epi16(in4, final_rounding);
+  in5 = _mm_adds_epi16(in5, final_rounding);
+  in6 = _mm_adds_epi16(in6, final_rounding);
+  in7 = _mm_adds_epi16(in7, final_rounding);
+
+  in0 = _mm_srai_epi16(in0, 5);
+  in1 = _mm_srai_epi16(in1, 5);
+  in2 = _mm_srai_epi16(in2, 5);
+  in3 = _mm_srai_epi16(in3, 5);
+  in4 = _mm_srai_epi16(in4, 5);
+  in5 = _mm_srai_epi16(in5, 5);
+  in6 = _mm_srai_epi16(in6, 5);
+  in7 = _mm_srai_epi16(in7, 5);
+
+  RECON_AND_STORE(dest + 0 * stride, in0);  // one call per row of dest
+  RECON_AND_STORE(dest + 1 * stride, in1);
+  RECON_AND_STORE(dest + 2 * stride, in2);
+  RECON_AND_STORE(dest + 3 * stride, in3);
+  RECON_AND_STORE(dest + 4 * stride, in4);
+  RECON_AND_STORE(dest + 5 * stride, in5);
+  RECON_AND_STORE(dest + 6 * stride, in6);
+  RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+// DC-only 8x8 path: only input[0] is read. The value derived from it is
+// broadcast and handed to RECON_AND_STORE for 8 rows starting at dest.
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  // Not named below; kept because RECON_AND_STORE presumably expects `zero`.
+  const __m128i zero = _mm_setzero_si128();
+  __m128i dc_value;
+  int dc, j;
+
+  // Scale by cospi_16_64 twice, then apply ROUND_POWER_OF_TWO with 5 bits.
+  dc = dct_const_round_shift(input[0] * cospi_16_64);
+  dc = dct_const_round_shift(dc * cospi_16_64);
+  dc = ROUND_POWER_OF_TWO(dc, 5);
+
+  dc_value = _mm_set1_epi16(dc);
+
+  // Rows are `stride` bytes apart; every row gets the same dc_value.
+  for (j = 0; j < 8; ++j) {
+    RECON_AND_STORE(dest + j * stride, dc_value);
+  }
+}
+
+void idct8_sse2(__m128i *in) {  // One in-place pass: 8x8 transpose, then IDCT8.
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  // rounding/stg*/stp*/tmp* are not named below; presumably the macros use them.
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  // 8x8 Transpose is copied from vpx_fdct8x8_sse2(); in[0..7] -> in0..in7
+  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
+                in0, in1, in2, in3, in4, in5, in6, in7);
+
+  // 4-stage 1D idct8x8; results are written back to in[0..7]
+  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
+        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
+}
+
+void iadst8_sse2(__m128i *in) {  // In-place 3-stage 8-point ADST over in[0..7].
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__const_0 = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // transpose in place (same array passed as both arguments)
+  array_transpose_8x8(in, in);
+
+  // reorder rows into butterfly pairs: (7,0) (5,2) (3,4) (1,6)
+  in0 = in[7];
+  in1 = in[0];
+  in2 = in[5];
+  in3 = in[2];
+  in4 = in[3];
+  in5 = in[4];
+  in6 = in[1];
+  in7 = in[6];
+
+  // column transformation
+  // stage 1
+  // interleave and multiply/add into 32-bit integer
+  s0 = _mm_unpacklo_epi16(in0, in1);
+  s1 = _mm_unpackhi_epi16(in0, in1);
+  s2 = _mm_unpacklo_epi16(in2, in3);
+  s3 = _mm_unpackhi_epi16(in2, in3);
+  s4 = _mm_unpacklo_epi16(in4, in5);
+  s5 = _mm_unpackhi_epi16(in4, in5);
+  s6 = _mm_unpacklo_epi16(in6, in7);
+  s7 = _mm_unpackhi_epi16(in6, in7);
+
+  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+  // butterfly on the 32-bit products: sums in w0..w7, differences in w8..w15
+  w0 = _mm_add_epi32(u0, u8);
+  w1 = _mm_add_epi32(u1, u9);
+  w2 = _mm_add_epi32(u2, u10);
+  w3 = _mm_add_epi32(u3, u11);
+  w4 = _mm_add_epi32(u4, u12);
+  w5 = _mm_add_epi32(u5, u13);
+  w6 = _mm_add_epi32(u6, u14);
+  w7 = _mm_add_epi32(u7, u15);
+  w8 = _mm_sub_epi32(u0, u8);
+  w9 = _mm_sub_epi32(u1, u9);
+  w10 = _mm_sub_epi32(u2, u10);
+  w11 = _mm_sub_epi32(u3, u11);
+  w12 = _mm_sub_epi32(u4, u12);
+  w13 = _mm_sub_epi32(u5, u13);
+  w14 = _mm_sub_epi32(u6, u14);
+  w15 = _mm_sub_epi32(u7, u15);
+
+  // rounding: add DCT_CONST_ROUNDING, then arithmetic shift by DCT_CONST_BITS
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+  // back to 16-bit and pack 8 integers into __m128i
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[1] = _mm_packs_epi32(u2, u3);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[3] = _mm_packs_epi32(u6, u7);
+  in[4] = _mm_packs_epi32(u8, u9);
+  in[5] = _mm_packs_epi32(u10, u11);
+  in[6] = _mm_packs_epi32(u12, u13);
+  in[7] = _mm_packs_epi32(u14, u15);
+
+  // stage 2: in[0..3] take 16-bit add/sub; in[4..7] go through madd + rounding
+  s0 = _mm_add_epi16(in[0], in[2]);
+  s1 = _mm_add_epi16(in[1], in[3]);
+  s2 = _mm_sub_epi16(in[0], in[2]);
+  s3 = _mm_sub_epi16(in[1], in[3]);
+  u0 = _mm_unpacklo_epi16(in[4], in[5]);
+  u1 = _mm_unpackhi_epi16(in[4], in[5]);
+  u2 = _mm_unpacklo_epi16(in[6], in[7]);
+  u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+  w0 = _mm_add_epi32(v0, v4);
+  w1 = _mm_add_epi32(v1, v5);
+  w2 = _mm_add_epi32(v2, v6);
+  w3 = _mm_add_epi32(v3, v7);
+  w4 = _mm_sub_epi32(v0, v4);
+  w5 = _mm_sub_epi32(v1, v5);
+  w6 = _mm_sub_epi32(v2, v6);
+  w7 = _mm_sub_epi32(v3, v7);
+
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+  s4 = _mm_packs_epi32(u0, u1);
+  s5 = _mm_packs_epi32(u2, u3);
+  s6 = _mm_packs_epi32(u4, u5);
+  s7 = _mm_packs_epi32(u6, u7);
+
+  // stage 3: (s2,s3) and (s6,s7) pairs via the cospi_16_64 constants
+  u0 = _mm_unpacklo_epi16(s2, s3);
+  u1 = _mm_unpackhi_epi16(s2, s3);
+  u2 = _mm_unpacklo_epi16(s6, s7);
+  u3 = _mm_unpackhi_epi16(s6, s7);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  s2 = _mm_packs_epi32(v0, v1);
+  s3 = _mm_packs_epi32(v2, v3);
+  s6 = _mm_packs_epi32(v4, v5);
+  s7 = _mm_packs_epi32(v6, v7);
+  // output permutation; odd-indexed outputs are negated (0 - x)
+  in[0] = s0;
+  in[1] = _mm_sub_epi16(k__const_0, s4);
+  in[2] = s6;
+  in[3] = _mm_sub_epi16(k__const_0, s2);
+  in[4] = s3;
+  in[5] = _mm_sub_epi16(k__const_0, s7);
+  in[6] = s5;
+  in[7] = _mm_sub_epi16(k__const_0, s1);
+}
+
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
+                             int stride) {  // Reads only input rows 0..3.
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  // NOTE(review): the macros used below presumably rely on these exact names.
+  // Rows. Load 4-row input data.
+  in0 = load_input_data(input);
+  in1 = load_input_data(input + 8 * 1);
+  in2 = load_input_data(input + 8 * 2);
+  in3 = load_input_data(input + 8 * 3);
+
+  // 8x4 Transpose; presumably the last two arguments (in0, in1) take the result
+  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
+  // Stage1: high halves of in0/in1 paired with zero -> stp1_4, stp1_5
+  {
+    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
+    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
+
+    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
+    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
+    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
+    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
+  }
+
+  // Stage2: low halves paired with zero -> stp2_0, stp2_2; then stp2_4..stp2_6
+  {
+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
+    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
+
+    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
+    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
+    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
+    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
+    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
+
+    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
+    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
+
+    stp2_4 = tmp0;
+    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
+    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
+  }
+
+  // Stage3: add/sub of stp2_0, stp2_2; (stp2_5, stp2_6) via stg3_0/stg2_0
+  {
+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
+
+    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
+    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
+
+    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
+    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
+
+    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
+    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
+  }
+
+  // Stage4: saturating add/sub producing tmp0..tmp3 for the second transpose
+  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
+  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
+  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
+  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
+
+  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
+  // Second pass: IDCT8 is given `zero` for its upper four inputs
+  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
+        in0, in1, in2, in3, in4, in5, in6, in7);
+  // Final rounding and shift: saturating add of 1 << 4, arithmetic shift by 5
+  in0 = _mm_adds_epi16(in0, final_rounding);
+  in1 = _mm_adds_epi16(in1, final_rounding);
+  in2 = _mm_adds_epi16(in2, final_rounding);
+  in3 = _mm_adds_epi16(in3, final_rounding);
+  in4 = _mm_adds_epi16(in4, final_rounding);
+  in5 = _mm_adds_epi16(in5, final_rounding);
+  in6 = _mm_adds_epi16(in6, final_rounding);
+  in7 = _mm_adds_epi16(in7, final_rounding);
+
+  in0 = _mm_srai_epi16(in0, 5);
+  in1 = _mm_srai_epi16(in1, 5);
+  in2 = _mm_srai_epi16(in2, 5);
+  in3 = _mm_srai_epi16(in3, 5);
+  in4 = _mm_srai_epi16(in4, 5);
+  in5 = _mm_srai_epi16(in5, 5);
+  in6 = _mm_srai_epi16(in6, 5);
+  in7 = _mm_srai_epi16(in7, 5);
+
+  RECON_AND_STORE(dest + 0 * stride, in0);
+  RECON_AND_STORE(dest + 1 * stride, in1);
+  RECON_AND_STORE(dest + 2 * stride, in2);
+  RECON_AND_STORE(dest + 3 * stride, in3);
+  RECON_AND_STORE(dest + 4 * stride, in4);
+  RECON_AND_STORE(dest + 5 * stride, in5);
+  RECON_AND_STORE(dest + 6 * stride, in6);
+  RECON_AND_STORE(dest + 7 * stride, in7);
+}
+
+#define IDCT16 /* Stages 2-6; uses the expanding scope's in[], stg*, stp*, tmp*, rounding. */ \
+  /* Stage2: odd-indexed inputs in[1,15,9,7,5,11,13,3] -> stp2_8..stp2_15 */ \
+  { \
+    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
+    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
+    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
+    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
+    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
+    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
+    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
+    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
+    \
+    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
+                           stg2_0, stg2_1, stg2_2, stg2_3, \
+                           stp2_8, stp2_15, stp2_9, stp2_14) \
+    \
+    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
+                           stg2_4, stg2_5, stg2_6, stg2_7, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
+  } \
+    \
+  /* Stage3: in[2,14,10,6] -> stp1_4..stp1_7; add/sub of stp2_8..stp2_15 */ \
+  { \
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
+    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
+    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
+    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
+    \
+    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
+                           stg3_0, stg3_1, stg3_2, stg3_3, \
+                           stp1_4, stp1_7, stp1_5, stp1_6) \
+    \
+    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
+    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
+    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+    \
+    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
+    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+  } \
+  \
+  /* Stage4: in[0,8,4,12] -> stp2_0..stp2_3; add/sub of stp1_4..stp1_7 */ \
+  { \
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
+    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
+    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
+    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
+    \
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+    \
+    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
+                           stg4_0, stg4_1, stg4_2, stg4_3, \
+                           stp2_0, stp2_1, stp2_2, stp2_3) \
+    \
+    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+    \
+    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+                           stg4_4, stg4_5, stg4_6, stg4_7, \
+                           stp2_9, stp2_14, stp2_10, stp2_13) \
+  } \
+    \
+  /* Stage5: (stp2_6, stp2_5) via stg4_1/stg4_0 -> stp1_5/stp1_6; add/sub 8..15 */ \
+  { \
+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+    \
+    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+    \
+    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+    \
+    tmp0 = _mm_add_epi32(tmp0, rounding); \
+    tmp1 = _mm_add_epi32(tmp1, rounding); \
+    tmp2 = _mm_add_epi32(tmp2, rounding); \
+    tmp3 = _mm_add_epi32(tmp3, rounding); \
+    \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+    \
+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+    \
+    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
+    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
+    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
+    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+    \
+    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
+    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
+    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+  } \
+    \
+  /* Stage6: add/sub into stp2_0..stp2_7; (10,13), (11,12) via stg6_0/stg4_0 */ \
+  { \
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+    \
+    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+    \
+    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                           stg6_0, stg4_0, stg6_0, stg4_0, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
+  }
+
+#define IDCT16_10 /* Stages 2-6 reading only in[0..3]; missing partners are `zero`. */ \
+    /* Stage2: in[1], in[3] -> stp1_8_0, stp1_15, stp1_11, stp1_12_0 */ \
+    { \
+      const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
+      const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
+      const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
+      const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
+      \
+      MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
+                             stg2_0, stg2_1, stg2_6, stg2_7, \
+                             stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
+    } \
+      \
+    /* Stage3: in[2] -> stp2_4, stp2_7; stage-2 results are copied, not combined */ \
+    { \
+      const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
+      const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
+      \
+      MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
+                               stg3_0, stg3_1,  \
+                               stp2_4, stp2_7) \
+      \
+      stp1_9  =  stp1_8_0; \
+      stp1_10 =  stp1_11;  \
+      \
+      stp1_13 = stp1_12_0; \
+      stp1_14 = stp1_15;   \
+    } \
+    \
+    /* Stage4: in[0] -> stp1_0, stp1_1; (9,14) and (10,13) pairs via stg4_4..7 */ \
+    { \
+      const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
+      const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
+      \
+      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+      \
+      MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
+                               stg4_0, stg4_1, \
+                               stp1_0, stp1_1) \
+      stp2_5 = stp2_4; \
+      stp2_6 = stp2_7; \
+      \
+      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+                             stg4_4, stg4_5, stg4_6, stg4_7, \
+                             stp2_9, stp2_14, stp2_10, stp2_13) \
+    } \
+      \
+    /* Stage5: (stp2_6, stp2_5) via stg4_1/stg4_0 -> stp1_5/stp1_6; add/sub 8..15 */ \
+    { \
+      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+      \
+      stp1_2 = stp1_1; \
+      stp1_3 = stp1_0; \
+      \
+      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      \
+      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+      \
+      stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
+      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
+      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
+      stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+      \
+      stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
+      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
+      stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+    } \
+      \
+    /* Stage6: add/sub into stp2_0..stp2_7; (10,13), (11,12) via stg6_0/stg4_0 */ \
+    { \
+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+      \
+      stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+      stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+      stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+      stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+      stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+      stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+      \
+      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                             stg6_0, stg4_0, stg6_0, stg4_0, \
+                             stp2_10, stp2_13, stp2_11, stp2_12) \
+    }
+
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+                                int stride) {  // Two IDCT16 passes, then store.
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  const __m128i zero = _mm_setzero_si128();
+  // The stg* constants below are referenced by name inside the IDCT16 macro.
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[16], l[16], r[16], *curr1;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_8_0, stp1_12_0;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+  // Pass 1: two iterations of 128 coefficients each; results go to l, then r.
+  curr1 = l;
+  for (i = 0; i < 2; i++) {
+    // 1-D idct
+
+    // Load input data: even-numbered loads fill in[0..7], odd ones in[8..15].
+    in[0] = load_input_data(input);
+    in[8] = load_input_data(input + 8 * 1);
+    in[1] = load_input_data(input + 8 * 2);
+    in[9] = load_input_data(input + 8 * 3);
+    in[2] = load_input_data(input + 8 * 4);
+    in[10] = load_input_data(input + 8 * 5);
+    in[3] = load_input_data(input + 8 * 6);
+    in[11] = load_input_data(input + 8 * 7);
+    in[4] = load_input_data(input + 8 * 8);
+    in[12] = load_input_data(input + 8 * 9);
+    in[5] = load_input_data(input + 8 * 10);
+    in[13] = load_input_data(input + 8 * 11);
+    in[6] = load_input_data(input + 8 * 12);
+    in[14] = load_input_data(input + 8 * 13);
+    in[7] = load_input_data(input + 8 * 14);
+    in[15] = load_input_data(input + 8 * 15);
+
+    array_transpose_8x8(in, in);
+    array_transpose_8x8(in + 8, in + 8);
+
+    IDCT16
+
+    // Stage7
+    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
+    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
+    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
+    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
+    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
+    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
+    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
+    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
+    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    curr1 = r;  // the second iteration writes its results to r
+    input += 128;  // skip the 128 coefficients just loaded
+  }
+  for (i = 0; i < 2; i++) {
+    int j;
+    // 1-D idct on the transposed i-th 8-vector slices of l and r
+    array_transpose_8x8(l + i * 8, in);
+    array_transpose_8x8(r + i * 8, in + 8);
+
+    IDCT16
+
+    // 2-D: same Stage7 combination as above, kept in in[] for the store below
+    in[0] = _mm_add_epi16(stp2_0, stp1_15);
+    in[1] = _mm_add_epi16(stp2_1, stp1_14);
+    in[2] = _mm_add_epi16(stp2_2, stp2_13);
+    in[3] = _mm_add_epi16(stp2_3, stp2_12);
+    in[4] = _mm_add_epi16(stp2_4, stp2_11);
+    in[5] = _mm_add_epi16(stp2_5, stp2_10);
+    in[6] = _mm_add_epi16(stp2_6, stp1_9);
+    in[7] = _mm_add_epi16(stp2_7, stp1_8);
+    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    for (j = 0; j < 16; ++j) {
+      // Final rounding and shift: saturating add of 1 << 5, then shift by 6
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;  // the next iteration stores 8 bytes further along each row
+  }
+}
+
+// DC-only 16x16 path.
+//
+// Only input[0] is read. It is scaled, rounded and broadcast into dc_value,
+// which is then handed to RECON_AND_STORE for 16 rows at dest and again for
+// 16 rows at dest + 8.
+//
+//   input:  coefficient buffer; only element 0 is used here.
+//   dest:   first byte of the destination block.
+//   stride: distance in bytes between consecutive destination rows.
+void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+                              int stride) {
+  // Not named below; kept because RECON_AND_STORE presumably expects `zero`.
+  const __m128i zero = _mm_setzero_si128();
+  __m128i dc_value;
+  int dc, half, j;
+
+  // Scale by cospi_16_64 twice, then apply ROUND_POWER_OF_TWO with 6 bits.
+  dc = dct_const_round_shift(input[0] * cospi_16_64);
+  dc = dct_const_round_shift(dc * cospi_16_64);
+  dc = ROUND_POWER_OF_TWO(dc, 6);
+
+  dc_value = _mm_set1_epi16(dc);
+
+  // Outer loop: two column halves; dest advances by 8 bytes between them.
+  for (half = 0; half < 2; ++half) {
+    // Inner loop: rows 0..15, each `stride` bytes below the previous one.
+    for (j = 0; j < 16; ++j) {
+      RECON_AND_STORE(dest + j * stride, dc_value);
+    }
+    dest += 8;
+  }
+}
+
+// In-place 1-D 16-point inverse ADST over the sixteen vectors in[0..15].
+// Every step uses 16-bit-lane intrinsics, so each __m128i carries eight
+// independent values (the "8 columns") that are transformed in parallel.
+// The work is split into four butterfly stages. Rotations are computed with
+// _mm_madd_epi16 (16-bit products summed pairwise into 32-bit lanes), then
+// DCT_CONST_ROUNDING is added, the result is shifted right arithmetically by
+// DCT_CONST_BITS, and _mm_packs_epi32 narrows it back to 16 bits with signed
+// saturation. Plain butterflies use _mm_add_epi16/_mm_sub_epi16, which do
+// not saturate.
+// NOTE(review): pair_set_epi16(a, b) is defined outside this block; it
+// presumably yields alternating (a, b) 16-bit lanes to match the interleaved
+// operands produced by the unpack calls -- confirm against its definition.
+static void iadst16_8col(__m128i *in) {
+  // perform 16x16 1-D ADST for 8 columns
+  // s/x: 16-bit stage values; u/v: scratch (interleaved 16-bit operands or
+  // 32-bit intermediates, depending on the step).
+  __m128i s[16], x[16], u[32], v[32];
+  // Stage 1 rotation constants (odd cospi indices).
+  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  // Stage 2 rotation constants.
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  // Stage 3 rotation constants.
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  // Stage 4 constants: all four sign combinations of cospi_16_64.
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  // Rounding term added to every 32-bit lane before the DCT_CONST_BITS shift.
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // Zero vector used to negate outputs (0 - value) at the end.
+  const __m128i kZero = _mm_set1_epi16(0);
+
+  // stage 1
+  // Interleave the input pairs (in[15], in[0]), (in[13], in[2]), ...,
+  // (in[1], in[14]) so that each 32-bit lane holds the two 16-bit operands of
+  // one rotation; lo/hi halves cover lanes 0-3 and 4-7 respectively.
+  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+  // Each interleaved pair is multiplied by two constant vectors, producing
+  // both outputs of its rotation as 32-bit values (four v[] entries per pair:
+  // lo/hi halves for each of the two constants).
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+  // Butterfly in 32-bit precision: u[k] = v[k] + v[k + 16] and
+  // u[k + 16] = v[k] - v[k + 16] for k = 0..15.
+  u[0] = _mm_add_epi32(v[0], v[16]);
+  u[1] = _mm_add_epi32(v[1], v[17]);
+  u[2] = _mm_add_epi32(v[2], v[18]);
+  u[3] = _mm_add_epi32(v[3], v[19]);
+  u[4] = _mm_add_epi32(v[4], v[20]);
+  u[5] = _mm_add_epi32(v[5], v[21]);
+  u[6] = _mm_add_epi32(v[6], v[22]);
+  u[7] = _mm_add_epi32(v[7], v[23]);
+  u[8] = _mm_add_epi32(v[8], v[24]);
+  u[9] = _mm_add_epi32(v[9], v[25]);
+  u[10] = _mm_add_epi32(v[10], v[26]);
+  u[11] = _mm_add_epi32(v[11], v[27]);
+  u[12] = _mm_add_epi32(v[12], v[28]);
+  u[13] = _mm_add_epi32(v[13], v[29]);
+  u[14] = _mm_add_epi32(v[14], v[30]);
+  u[15] = _mm_add_epi32(v[15], v[31]);
+  u[16] = _mm_sub_epi32(v[0], v[16]);
+  u[17] = _mm_sub_epi32(v[1], v[17]);
+  u[18] = _mm_sub_epi32(v[2], v[18]);
+  u[19] = _mm_sub_epi32(v[3], v[19]);
+  u[20] = _mm_sub_epi32(v[4], v[20]);
+  u[21] = _mm_sub_epi32(v[5], v[21]);
+  u[22] = _mm_sub_epi32(v[6], v[22]);
+  u[23] = _mm_sub_epi32(v[7], v[23]);
+  u[24] = _mm_sub_epi32(v[8], v[24]);
+  u[25] = _mm_sub_epi32(v[9], v[25]);
+  u[26] = _mm_sub_epi32(v[10], v[26]);
+  u[27] = _mm_sub_epi32(v[11], v[27]);
+  u[28] = _mm_sub_epi32(v[12], v[28]);
+  u[29] = _mm_sub_epi32(v[13], v[29]);
+  u[30] = _mm_sub_epi32(v[14], v[30]);
+  u[31] = _mm_sub_epi32(v[15], v[31]);
+
+  // Add the rounding term ahead of the shift.
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+  // Arithmetic right shift by DCT_CONST_BITS completes the round-shift.
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+  // Narrow back to 16 bits with signed saturation; s[k] joins the lo/hi
+  // halves u[2k] and u[2k + 1].
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_packs_epi32(u[8], u[9]);
+  s[5] = _mm_packs_epi32(u[10], u[11]);
+  s[6] = _mm_packs_epi32(u[12], u[13]);
+  s[7] = _mm_packs_epi32(u[14], u[15]);
+  s[8] = _mm_packs_epi32(u[16], u[17]);
+  s[9] = _mm_packs_epi32(u[18], u[19]);
+  s[10] = _mm_packs_epi32(u[20], u[21]);
+  s[11] = _mm_packs_epi32(u[22], u[23]);
+  s[12] = _mm_packs_epi32(u[24], u[25]);
+  s[13] = _mm_packs_epi32(u[26], u[27]);
+  s[14] = _mm_packs_epi32(u[28], u[29]);
+  s[15] = _mm_packs_epi32(u[30], u[31]);
+
+  // stage 2
+  // Only s[8..15] are rotated in this stage; s[0..7] go through the plain
+  // 16-bit add/sub that fills x[0..7] further down.
+  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+  // 32-bit butterfly: u[k] = v[k] + v[k + 8], u[k + 8] = v[k] - v[k + 8].
+  u[0] = _mm_add_epi32(v[0], v[8]);
+  u[1] = _mm_add_epi32(v[1], v[9]);
+  u[2] = _mm_add_epi32(v[2], v[10]);
+  u[3] = _mm_add_epi32(v[3], v[11]);
+  u[4] = _mm_add_epi32(v[4], v[12]);
+  u[5] = _mm_add_epi32(v[5], v[13]);
+  u[6] = _mm_add_epi32(v[6], v[14]);
+  u[7] = _mm_add_epi32(v[7], v[15]);
+  u[8] = _mm_sub_epi32(v[0], v[8]);
+  u[9] = _mm_sub_epi32(v[1], v[9]);
+  u[10] = _mm_sub_epi32(v[2], v[10]);
+  u[11] = _mm_sub_epi32(v[3], v[11]);
+  u[12] = _mm_sub_epi32(v[4], v[12]);
+  u[13] = _mm_sub_epi32(v[5], v[13]);
+  u[14] = _mm_sub_epi32(v[6], v[14]);
+  u[15] = _mm_sub_epi32(v[7], v[15]);
+
+  // Round ...
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  // ... and shift.
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+  // x[0..7]: 16-bit (non-saturating) butterfly of the stage-1 values
+  // s[0..7]. x[8..15]: the rotated values packed back to 16 bits.
+  x[0] = _mm_add_epi16(s[0], s[4]);
+  x[1] = _mm_add_epi16(s[1], s[5]);
+  x[2] = _mm_add_epi16(s[2], s[6]);
+  x[3] = _mm_add_epi16(s[3], s[7]);
+  x[4] = _mm_sub_epi16(s[0], s[4]);
+  x[5] = _mm_sub_epi16(s[1], s[5]);
+  x[6] = _mm_sub_epi16(s[2], s[6]);
+  x[7] = _mm_sub_epi16(s[3], s[7]);
+  x[8] = _mm_packs_epi32(u[0], u[1]);
+  x[9] = _mm_packs_epi32(u[2], u[3]);
+  x[10] = _mm_packs_epi32(u[4], u[5]);
+  x[11] = _mm_packs_epi32(u[6], u[7]);
+  x[12] = _mm_packs_epi32(u[8], u[9]);
+  x[13] = _mm_packs_epi32(u[10], u[11]);
+  x[14] = _mm_packs_epi32(u[12], u[13]);
+  x[15] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  // Rotate x[4..7] and x[12..15]; x[0..3] and x[8..11] take the plain 16-bit
+  // add/sub that fills s[0..3] and s[8..11] further down.
+  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+  // 32-bit butterflies within each group of eight: entries k and k + 4 are
+  // combined (sum into the lower four, difference into the upper four).
+  u[0] = _mm_add_epi32(v[0], v[4]);
+  u[1] = _mm_add_epi32(v[1], v[5]);
+  u[2] = _mm_add_epi32(v[2], v[6]);
+  u[3] = _mm_add_epi32(v[3], v[7]);
+  u[4] = _mm_sub_epi32(v[0], v[4]);
+  u[5] = _mm_sub_epi32(v[1], v[5]);
+  u[6] = _mm_sub_epi32(v[2], v[6]);
+  u[7] = _mm_sub_epi32(v[3], v[7]);
+  u[8] = _mm_add_epi32(v[8], v[12]);
+  u[9] = _mm_add_epi32(v[9], v[13]);
+  u[10] = _mm_add_epi32(v[10], v[14]);
+  u[11] = _mm_add_epi32(v[11], v[15]);
+  u[12] = _mm_sub_epi32(v[8], v[12]);
+  u[13] = _mm_sub_epi32(v[9], v[13]);
+  u[14] = _mm_sub_epi32(v[10], v[14]);
+  u[15] = _mm_sub_epi32(v[11], v[15]);
+
+  // Rounding is applied in place on u[] here; the shift then writes v[].
+  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  // s[0..3] and s[8..11]: 16-bit butterflies of the unrotated x[] values.
+  // s[4..7] and s[12..15]: the rotated values packed back to 16 bits.
+  s[0] = _mm_add_epi16(x[0], x[2]);
+  s[1] = _mm_add_epi16(x[1], x[3]);
+  s[2] = _mm_sub_epi16(x[0], x[2]);
+  s[3] = _mm_sub_epi16(x[1], x[3]);
+  s[4] = _mm_packs_epi32(v[0], v[1]);
+  s[5] = _mm_packs_epi32(v[2], v[3]);
+  s[6] = _mm_packs_epi32(v[4], v[5]);
+  s[7] = _mm_packs_epi32(v[6], v[7]);
+  s[8] = _mm_add_epi16(x[8], x[10]);
+  s[9] = _mm_add_epi16(x[9], x[11]);
+  s[10] = _mm_sub_epi16(x[8], x[10]);
+  s[11] = _mm_sub_epi16(x[9], x[11]);
+  s[12] = _mm_packs_epi32(v[8], v[9]);
+  s[13] = _mm_packs_epi32(v[10], v[11]);
+  s[14] = _mm_packs_epi32(v[12], v[13]);
+  s[15] = _mm_packs_epi32(v[14], v[15]);
+
+  // stage 4
+  // The pairs (s[2], s[3]), (s[6], s[7]), (s[10], s[11]) and (s[14], s[15])
+  // are combined with +/-cospi_16_64 weights; the other eight s[] values pass
+  // straight to the output (some negated).
+  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+  // No cross-term butterfly in this stage: round and shift directly.
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  // Write the results back over in[] in the transform's output order.
+  // in[1], in[3], in[13] and in[15] are negated by subtracting from zero;
+  // in[4..11] are the stage-4 products packed back to 16 bits.
+  in[0] = s[0];
+  in[1] = _mm_sub_epi16(kZero, s[8]);
+  in[2] = s[12];
+  in[3] = _mm_sub_epi16(kZero, s[4]);
+  in[4] = _mm_packs_epi32(v[4], v[5]);
+  in[5] = _mm_packs_epi32(v[12], v[13]);
+  in[6] = _mm_packs_epi32(v[8], v[9]);
+  in[7] = _mm_packs_epi32(v[0], v[1]);
+  in[8] = _mm_packs_epi32(v[2], v[3]);
+  in[9] = _mm_packs_epi32(v[10], v[11]);
+  in[10] = _mm_packs_epi32(v[14], v[15]);
+  in[11] = _mm_packs_epi32(v[6], v[7]);
+  in[12] = s[5];
+  in[13] = _mm_sub_epi16(kZero, s[13]);
+  in[14] = s[9];
+  in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+static void idct16_8col(__m128i *in) {
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i v[16], u[16], s[16], t[16];
+
+  // stage 1
+  s[0] = in[0];
+  s[1] = in[8];
+  s[2] = in[4];
+  s[3] = in[12];
+  s[4] = in[2];
+  s[5] = in[10];
+  s[6] = in[6];
+  s[7] = in[14];
+  s[8] = in[1];
+  s[9] = in[9];
+  s[10] = in[5];
+  s[11] = in[13];
+  s[12] = in[3];
+  s[13] = in[11];
+  s[14] = in[7];
+  s[15] = in[15];
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
+  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
+  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
+  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[8]  = _mm_packs_epi32(u[0], u[1]);
+  s[15] = _mm_packs_epi32(u[2], u[3]);
+  s[9]  = _mm_packs_epi32(u[4], u[5]);
+  s[14] = _mm_packs_epi32(u[6], u[7]);
+  s[10] = _mm_packs_epi32(u[8], u[9]);
+  s[13] = _mm_packs_epi32(u[10], u[11]);
+  s[11] = _mm_packs_epi32(u[12], u[13]);
+  s[12] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  t[0] = s[0];
+  t[1] = s[1];
+  t[2] = s[2];
+  t[3] = s[3];
+  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
+  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
+  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
+  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[4] = _mm_packs_epi32(u[0], u[1]);
+  t[7] = _mm_packs_epi32(u[2], u[3]);
+  t[5] = _mm_packs_epi32(u[4], u[5]);
+  t[6] = _mm_packs_epi32(u[6], u[7]);
+  t[8] = _mm_add_epi16(s[8], s[9]);
+  t[9] = _mm_sub_epi16(s[8], s[9]);
+  t[10] = _mm_sub_epi16(s[11], s[10]);
+  t[11] = _mm_add_epi16(s[10], s[11]);
+  t[12] = _mm_add_epi16(s[12], s[13]);
+  t[13] = _mm_sub_epi16(s[12], s[13]);
+  t[14] = _mm_sub_epi16(s[15], s[14]);
+  t[15] = _mm_add_epi16(s[14], s[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
+  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
+  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
+  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
+  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
+  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
+  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
+  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_add_epi16(t[4], t[5]);
+  s[5] = _mm_sub_epi16(t[4], t[5]);
+  s[6] = _mm_sub_epi16(t[7], t[6]);
+  s[7] = _mm_add_epi16(t[6], t[7]);
+  s[8] = t[8];
+  s[15] = t[15];
+  s[9]  = _mm_packs_epi32(u[8], u[9]);
+  s[14] = _mm_packs_epi32(u[10], u[11]);
+  s[10] = _mm_packs_epi32(u[12], u[13]);
+  s[13] = _mm_packs_epi32(u[14], u[15]);
+  s[11] = t[11];
+  s[12] = t[12];
+
+  // stage 5
+  t[0] = _mm_add_epi16(s[0], s[3]);
+  t[1] = _mm_add_epi16(s[1], s[2]);
+  t[2] = _mm_sub_epi16(s[1], s[2]);
+  t[3] = _mm_sub_epi16(s[0], s[3]);
+  t[4] = s[4];
+  t[7] = s[7];
+
+  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
+  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  t[5] = _mm_packs_epi32(u[0], u[1]);
+  t[6] = _mm_packs_epi32(u[2], u[3]);
+
+  t[8] = _mm_add_epi16(s[8], s[11]);
+  t[9] = _mm_add_epi16(s[9], s[10]);
+  t[10] = _mm_sub_epi16(s[9], s[10]);
+  t[11] = _mm_sub_epi16(s[8], s[11]);
+  t[12] = _mm_sub_epi16(s[15], s[12]);
+  t[13] = _mm_sub_epi16(s[14], s[13]);
+  t[14] = _mm_add_epi16(s[13], s[14]);
+  t[15] = _mm_add_epi16(s[12], s[15]);
+
+  // stage 6
+  s[0] = _mm_add_epi16(t[0], t[7]);
+  s[1] = _mm_add_epi16(t[1], t[6]);
+  s[2] = _mm_add_epi16(t[2], t[5]);
+  s[3] = _mm_add_epi16(t[3], t[4]);
+  s[4] = _mm_sub_epi16(t[3], t[4]);
+  s[5] = _mm_sub_epi16(t[2], t[5]);
+  s[6] = _mm_sub_epi16(t[1], t[6]);
+  s[7] = _mm_sub_epi16(t[0], t[7]);
+  s[8] = t[8];
+  s[9] = t[9];
+
+  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
+  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
+  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
+  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  s[10] = _mm_packs_epi32(u[0], u[1]);
+  s[13] = _mm_packs_epi32(u[2], u[3]);
+  s[11] = _mm_packs_epi32(u[4], u[5]);
+  s[12] = _mm_packs_epi32(u[6], u[7]);
+  s[14] = t[14];
+  s[15] = t[15];
+
+  // stage 7
+  in[0] = _mm_add_epi16(s[0], s[15]);
+  in[1] = _mm_add_epi16(s[1], s[14]);
+  in[2] = _mm_add_epi16(s[2], s[13]);
+  in[3] = _mm_add_epi16(s[3], s[12]);
+  in[4] = _mm_add_epi16(s[4], s[11]);
+  in[5] = _mm_add_epi16(s[5], s[10]);
+  in[6] = _mm_add_epi16(s[6], s[9]);
+  in[7] = _mm_add_epi16(s[7], s[8]);
+  in[8] = _mm_sub_epi16(s[7], s[8]);
+  in[9] = _mm_sub_epi16(s[6], s[9]);
+  in[10] = _mm_sub_epi16(s[5], s[10]);
+  in[11] = _mm_sub_epi16(s[4], s[11]);
+  in[12] = _mm_sub_epi16(s[3], s[12]);
+  in[13] = _mm_sub_epi16(s[2], s[13]);
+  in[14] = _mm_sub_epi16(s[1], s[14]);
+  in[15] = _mm_sub_epi16(s[0], s[15]);
+}
+
+void idct16_sse2(__m128i *in0, __m128i *in1) {  // transpose, then 1-D pass
+  array_transpose_16x16(in0, in1);  // helper not shown; presumably in place
+  idct16_8col(in0);  // the same column routine is run on each argument in turn
+  idct16_8col(in1);
+}
+
+void iadst16_sse2(__m128i *in0, __m128i *in1) {  // transpose, then 1-D pass
+  array_transpose_16x16(in0, in1);  // helper not shown; presumably in place
+  iadst16_8col(in0);  // the same column routine is run on each argument in turn
+  iadst16_8col(in1);
+}
+
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
+                               int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);  // added before >> 6
+  const __m128i zero = _mm_setzero_si128();
+  // Two 1-D passes: 4 input vectors are loaded, 2 x 16 rows go out via dest.
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  __m128i in[16], l[16];  // l[]: first-pass result, input to the second pass
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_8_0, stp1_12_0;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+  // First 1-D inverse DCT: only input offsets 0, 16, 32 and 48 are loaded.
+  // Load input data.
+  in[0] = load_input_data(input);
+  in[1] = load_input_data(input + 8 * 2);
+  in[2] = load_input_data(input + 8 * 4);
+  in[3] = load_input_data(input + 8 * 6);
+
+  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
+
+  // Stage2: each operand is interleaved with zero before the multiply-add.
+  {
+    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
+    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
+
+    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
+    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
+    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
+    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp5 = _mm_add_epi32(tmp5, rounding);
+    tmp7 = _mm_add_epi32(tmp7, rounding);
+
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+    stp2_8  = _mm_packs_epi32(tmp0, tmp2);
+    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
+  }
+
+  // Stage3: also splits the Stage2 results into their upper 64-bit halves.
+  {
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
+
+    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
+    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
+    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
+
+    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
+  }
+
+  // Stage4
+  {
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
+
+    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
+    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
+    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
+    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
+    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
+    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
+
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp5 = _mm_add_epi32(tmp5, rounding);
+    tmp7 = _mm_add_epi32(tmp7, rounding);
+
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
+    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
+    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
+    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
+
+    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
+  }
+
+  // Stage5 and Stage6
+  {
+    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
+    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
+    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
+    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
+    // Low 64-bit halves give stp1_8..11, high halves give stp1_12..15.
+    stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
+    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
+    stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
+    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
+
+    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
+    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
+    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
+    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
+  }
+
+  // Stage6
+  {
+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+
+    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
+    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
+    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
+    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
+    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
+    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
+
+    tmp1 = _mm_add_epi32(tmp1, rounding);
+    tmp3 = _mm_add_epi32(tmp3, rounding);
+    tmp0 = _mm_add_epi32(tmp0, rounding);
+    tmp2 = _mm_add_epi32(tmp2, rounding);
+    tmp4 = _mm_add_epi32(tmp4, rounding);
+    tmp6 = _mm_add_epi32(tmp6, rounding);
+
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
+
+    stp2_10 = _mm_packs_epi32(tmp0, zero);
+    stp2_13 = _mm_packs_epi32(tmp2, zero);
+    stp2_11 = _mm_packs_epi32(tmp4, zero);
+    stp2_12 = _mm_packs_epi32(tmp6, zero);
+
+    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
+    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
+    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
+    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
+
+    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
+    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
+    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
+    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
+    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
+    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
+    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
+    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
+  }
+
+  // Stage7. Left 8x16 only.
+  l[0] = _mm_add_epi16(stp2_0, stp1_15);
+  l[1] = _mm_add_epi16(stp2_1, stp1_14);
+  l[2] = _mm_add_epi16(stp2_2, stp2_13);
+  l[3] = _mm_add_epi16(stp2_3, stp2_12);
+  l[4] = _mm_add_epi16(stp2_4, stp2_11);
+  l[5] = _mm_add_epi16(stp2_5, stp2_10);
+  l[6] = _mm_add_epi16(stp2_6, stp1_9);
+  l[7] = _mm_add_epi16(stp2_7, stp1_8);
+  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+  // Second 1-D inverse transform, performed per 8x16 block
+  for (i = 0; i < 2; i++) {
+    int j;
+    array_transpose_4X8(l + 8 * i, in);
+    // NOTE(review): IDCT16_10 is defined elsewhere; presumably stages 2-6.
+    IDCT16_10
+
+    // Stage7
+    in[0] = _mm_add_epi16(stp2_0, stp1_15);
+    in[1] = _mm_add_epi16(stp2_1, stp1_14);
+    in[2] = _mm_add_epi16(stp2_2, stp2_13);
+    in[3] = _mm_add_epi16(stp2_3, stp2_12);
+    in[4] = _mm_add_epi16(stp2_4, stp2_11);
+    in[5] = _mm_add_epi16(stp2_5, stp2_10);
+    in[6] = _mm_add_epi16(stp2_6, stp1_9);
+    in[7] = _mm_add_epi16(stp2_7, stp1_8);
+    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+    for (j = 0; j < 16; ++j) {
+      // Final rounding and shift: saturating add of 32, then arithmetic >> 6
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+    // Step dest forward by 8 bytes for the next loop iteration.
+    dest += 8;
+  }
+}
+
+#define LOAD_DQCOEFF(reg, input) \
+  {  /* load reg from input, then advance the caller's input by 8 elements */ \
+    reg = load_input_data(input); \
+    input += 8; \
+  }
+
+#define IDCT32_34 /* stages 1-7 of the IDCT32 flow, reading only in[0..7] */ \
+/* Stage1: only in[1], in[3], in[5], in[7] are read, each paired with zero */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
+  \
+  const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
+  \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
+  \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
+                         stg1_1, stp1_16, stp1_31); \
+  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
+                         stg1_7, stp1_19, stp1_28); \
+  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
+                         stg1_9, stp1_20, stp1_27); \
+  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
+                         stg1_15, stp1_23, stp1_24); \
+} \
+\
+/* Stage2: in[2] and in[6] paired with zero; Stage1 outputs are copied over */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
+  \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
+                         stg2_1, stp2_8, stp2_15); \
+  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
+                         stg2_7, stp2_11, stp2_12); \
+  \
+  stp2_16 = stp1_16; \
+  stp2_19 = stp1_19; \
+  \
+  stp2_20 = stp1_20; \
+  stp2_23 = stp1_23; \
+  \
+  stp2_24 = stp1_24; \
+  stp2_27 = stp1_27; \
+  \
+  stp2_28 = stp1_28; \
+  stp2_31 = stp1_31; \
+} \
+\
+/* Stage3: in[4] paired with zero */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
+  \
+  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
+  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
+  /* both halves read stp1_24; Stage2 left stp2_24 holding the same value */ \
+  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
+                         stg3_1, stp1_4, stp1_7); \
+  \
+  stp1_8 = stp2_8; \
+  stp1_11 = stp2_11; \
+  stp1_12 = stp2_12; \
+  stp1_15 = stp2_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+                         stp1_18, stp1_29) \
+  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+                         stp1_22, stp1_25) \
+  \
+  stp1_16 = stp2_16; \
+  stp1_31 = stp2_31; \
+  stp1_19 = stp2_19; \
+  stp1_20 = stp2_20; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_27 = stp2_27; \
+  stp1_28 = stp2_28; \
+} \
+\
+/* Stage4: in[0] paired with zero */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
+  \
+  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
+  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
+                         stg4_1, stp2_0, stp2_1); \
+  /* plain copies: stp2_5 duplicates stp1_4 and stp2_6 duplicates stp1_7 */ \
+  stp2_4 = stp1_4; \
+  stp2_5 = stp1_4; \
+  stp2_6 = stp1_7; \
+  stp2_7 = stp1_7; \
+  \
+  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+                         stp2_10, stp2_13) \
+  \
+  stp2_8 = stp1_8; \
+  stp2_15 = stp1_15; \
+  stp2_11 = stp1_11; \
+  stp2_12 = stp1_12; \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5: uses tmp0..tmp3 and rounding declared at the expansion site */ \
+{ \
+  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  /* plain copies: stp1_2 duplicates stp2_1 and stp1_3 duplicates stp2_0 */ \
+  stp1_0 = stp2_0; \
+  stp1_1 = stp2_1; \
+  stp1_2 = stp2_1; \
+  stp1_3 = stp2_0; \
+  \
+  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+  \
+  tmp0 = _mm_add_epi32(tmp0, rounding); \
+  tmp1 = _mm_add_epi32(tmp1, rounding); \
+  tmp2 = _mm_add_epi32(tmp2, rounding); \
+  tmp3 = _mm_add_epi32(tmp3, rounding); \
+  \
+  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+  \
+  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  \
+  stp1_4 = stp2_4; \
+  stp1_7 = stp2_7; \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  \
+  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  \
+  stp1_22 = stp2_22; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_25 = stp2_25; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+  \
+  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+  \
+  stp2_8 = stp1_8; \
+  stp2_9 = stp1_9; \
+  stp2_14 = stp1_14; \
+  stp2_15 = stp1_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+                         stp2_13, stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+  \
+  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  stp1_18 = stp2_18; \
+  stp1_19 = stp2_19; \
+  \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+  \
+  stp1_28 = stp2_28; \
+  stp1_29 = stp2_29; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+}
+
+
+#define IDCT32 \
+/* Stage1 */ \
+{ \
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
+  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
+  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
+  \
+  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
+  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
+  const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
+  \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
+  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
+  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
+  \
+  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
+  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
+  \
+  MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
+                         stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
+                         stp1_17, stp1_30) \
+  MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
+                         stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
+                         stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
+                         stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+} \
+\
+/* Stage2 */ \
+{ \
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
+  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
+  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
+  \
+  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
+  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
+  \
+  MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
+                         stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
+                         stp2_14) \
+  MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
+                         stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
+                         stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
+  stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
+  stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
+  \
+  stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
+  stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
+  stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
+  stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
+  \
+  stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
+  stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
+  stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
+} \
+\
+/* Stage3 */ \
+{ \
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
+  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
+  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
+  \
+  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
+  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  \
+  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
+                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
+                         stp1_6) \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
+  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
+  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+  \
+  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+                         stp1_18, stp1_29) \
+  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+                         stp1_22, stp1_25) \
+  \
+  stp1_16 = stp2_16; \
+  stp1_31 = stp2_31; \
+  stp1_19 = stp2_19; \
+  stp1_20 = stp2_20; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_27 = stp2_27; \
+  stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
+  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
+  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
+  \
+  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  \
+  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
+                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
+                         stp2_2, stp2_3) \
+  \
+  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+  \
+  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+                         stp2_10, stp2_13) \
+  \
+  stp2_8 = stp1_8; \
+  stp2_15 = stp1_15; \
+  stp2_11 = stp1_11; \
+  stp2_12 = stp1_12; \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+  \
+  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+  \
+  tmp0 = _mm_add_epi32(tmp0, rounding); \
+  tmp1 = _mm_add_epi32(tmp1, rounding); \
+  tmp2 = _mm_add_epi32(tmp2, rounding); \
+  tmp3 = _mm_add_epi32(tmp3, rounding); \
+  \
+  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+  \
+  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  \
+  stp1_4 = stp2_4; \
+  stp1_7 = stp2_7; \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  \
+  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  \
+  stp1_22 = stp2_22; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_25 = stp2_25; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+  \
+  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+  \
+  stp2_8 = stp1_8; \
+  stp2_9 = stp1_9; \
+  stp2_14 = stp1_14; \
+  stp2_15 = stp1_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+                         stp2_13, stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+  \
+  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  stp1_18 = stp2_18; \
+  stp1_19 = stp2_19; \
+  \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+  \
+  stp1_28 = stp2_28; \
+  stp1_29 = stp2_29; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+}
+
+// 32x32 idct, two passes, for input where only the upper-left 8x8 has non-zero coeff
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
+                               int stride) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1<<5);  // 32, added before the final >> 6
+
+  // idct constants for each stage
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[32], col[32];  // col: first-pass results, consumed by the second pass
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+          stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;  // not referenced directly here; presumably scratch for IDCT32_34 - confirm
+  int i;
+
+  // Load input data. Only need to load the top left 8x8 block (row pitch is 32).
+  in[0] = load_input_data(input);
+  in[1] = load_input_data(input + 32);
+  in[2] = load_input_data(input + 64);
+  in[3] = load_input_data(input + 96);
+  in[4] = load_input_data(input + 128);
+  in[5] = load_input_data(input + 160);
+  in[6] = load_input_data(input + 192);
+  in[7] = load_input_data(input + 224);
+
+  for (i = 8; i < 32; ++i) {  // the remaining 24 vectors are all zero
+    in[i] = _mm_setzero_si128();
+  }
+
+  array_transpose_8x8(in, in);
+  // TODO(hkuang): The following transposes are unnecessary, but removing them
+  // leads to a performance drop on some devices.
+  array_transpose_8x8(in + 8, in + 8);
+  array_transpose_8x8(in + 16, in + 16);
+  array_transpose_8x8(in + 24, in + 24);
+
+  IDCT32_34  // macro defined outside this function; its results are read from stp1_0..stp1_31
+
+  // 1_D: Store 32 intermediate results: stp1_k + stp1_(31-k), then the differences.
+  col[0] = _mm_add_epi16(stp1_0, stp1_31);
+  col[1] = _mm_add_epi16(stp1_1, stp1_30);
+  col[2] = _mm_add_epi16(stp1_2, stp1_29);
+  col[3] = _mm_add_epi16(stp1_3, stp1_28);
+  col[4] = _mm_add_epi16(stp1_4, stp1_27);
+  col[5] = _mm_add_epi16(stp1_5, stp1_26);
+  col[6] = _mm_add_epi16(stp1_6, stp1_25);
+  col[7] = _mm_add_epi16(stp1_7, stp1_24);
+  col[8] = _mm_add_epi16(stp1_8, stp1_23);
+  col[9] = _mm_add_epi16(stp1_9, stp1_22);
+  col[10] = _mm_add_epi16(stp1_10, stp1_21);
+  col[11] = _mm_add_epi16(stp1_11, stp1_20);
+  col[12] = _mm_add_epi16(stp1_12, stp1_19);
+  col[13] = _mm_add_epi16(stp1_13, stp1_18);
+  col[14] = _mm_add_epi16(stp1_14, stp1_17);
+  col[15] = _mm_add_epi16(stp1_15, stp1_16);
+  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
+  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
+  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
+  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
+  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
+  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
+  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
+  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
+  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
+  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
+  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
+  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
+  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
+  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
+  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
+  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
+  for (i = 0; i < 4; i++) {  // second pass: dest advances by 8 after each iteration
+    int j;
+    const __m128i zero = _mm_setzero_si128();  // not referenced directly here; presumably used by RECON_AND_STORE - confirm
+    // Transpose the 8x8 block starting at col[i * 8] into in
+    array_transpose_8x8(col + i * 8, in);
+    IDCT32_34  // results are read from stp1_0..stp1_31 below
+
+    // 2_D: Calculate the results and store them to destination.
+    in[0] = _mm_add_epi16(stp1_0, stp1_31);
+    in[1] = _mm_add_epi16(stp1_1, stp1_30);
+    in[2] = _mm_add_epi16(stp1_2, stp1_29);
+    in[3] = _mm_add_epi16(stp1_3, stp1_28);
+    in[4] = _mm_add_epi16(stp1_4, stp1_27);
+    in[5] = _mm_add_epi16(stp1_5, stp1_26);
+    in[6] = _mm_add_epi16(stp1_6, stp1_25);
+    in[7] = _mm_add_epi16(stp1_7, stp1_24);
+    in[8] = _mm_add_epi16(stp1_8, stp1_23);
+    in[9] = _mm_add_epi16(stp1_9, stp1_22);
+    in[10] = _mm_add_epi16(stp1_10, stp1_21);
+    in[11] = _mm_add_epi16(stp1_11, stp1_20);
+    in[12] = _mm_add_epi16(stp1_12, stp1_19);
+    in[13] = _mm_add_epi16(stp1_13, stp1_18);
+    in[14] = _mm_add_epi16(stp1_14, stp1_17);
+    in[15] = _mm_add_epi16(stp1_15, stp1_16);
+    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+    for (j = 0; j < 32; ++j) {  // one output row per vector
+      // Final rounding and shift: saturating add of 32, then arithmetic >> 6
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;  // move on to the next 8 destination columns
+  }
+}
+
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
+                                 int stride) {  // two passes of 4 x IDCT32: first into col[], then out to dest
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);  // 32, added before the final >> 6
+  const __m128i zero = _mm_setzero_si128();
+
+  // idct constants for each stage
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i in[32], col[128], zero_idx[16];  // col: 4 first-pass iterations x 32 vectors each
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+          stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+          stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;  // not referenced directly here; presumably scratch for the IDCT32 macro - confirm
+  int i, j, i32;
+
+  for (i = 0; i < 4; i++) {
+    i32 = (i << 5);  // base index into col for this iteration (32 vectors each)
+    // First 1-D idct
+    // Load input data. NOTE(review): every load passes the same `input`; presumably LOAD_DQCOEFF advances it - confirm.
+    LOAD_DQCOEFF(in[0], input);
+    LOAD_DQCOEFF(in[8], input);
+    LOAD_DQCOEFF(in[16], input);
+    LOAD_DQCOEFF(in[24], input);
+    LOAD_DQCOEFF(in[1], input);
+    LOAD_DQCOEFF(in[9], input);
+    LOAD_DQCOEFF(in[17], input);
+    LOAD_DQCOEFF(in[25], input);
+    LOAD_DQCOEFF(in[2], input);
+    LOAD_DQCOEFF(in[10], input);
+    LOAD_DQCOEFF(in[18], input);
+    LOAD_DQCOEFF(in[26], input);
+    LOAD_DQCOEFF(in[3], input);
+    LOAD_DQCOEFF(in[11], input);
+    LOAD_DQCOEFF(in[19], input);
+    LOAD_DQCOEFF(in[27], input);
+
+    LOAD_DQCOEFF(in[4], input);
+    LOAD_DQCOEFF(in[12], input);
+    LOAD_DQCOEFF(in[20], input);
+    LOAD_DQCOEFF(in[28], input);
+    LOAD_DQCOEFF(in[5], input);
+    LOAD_DQCOEFF(in[13], input);
+    LOAD_DQCOEFF(in[21], input);
+    LOAD_DQCOEFF(in[29], input);
+    LOAD_DQCOEFF(in[6], input);
+    LOAD_DQCOEFF(in[14], input);
+    LOAD_DQCOEFF(in[22], input);
+    LOAD_DQCOEFF(in[30], input);
+    LOAD_DQCOEFF(in[7], input);
+    LOAD_DQCOEFF(in[15], input);
+    LOAD_DQCOEFF(in[23], input);
+    LOAD_DQCOEFF(in[31], input);
+
+    // checking if all entries are zero: OR-reduce in[0..31] pairwise down to zero_idx[14]
+    zero_idx[0] = _mm_or_si128(in[0], in[1]);
+    zero_idx[1] = _mm_or_si128(in[2], in[3]);
+    zero_idx[2] = _mm_or_si128(in[4], in[5]);
+    zero_idx[3] = _mm_or_si128(in[6], in[7]);
+    zero_idx[4] = _mm_or_si128(in[8], in[9]);
+    zero_idx[5] = _mm_or_si128(in[10], in[11]);
+    zero_idx[6] = _mm_or_si128(in[12], in[13]);
+    zero_idx[7] = _mm_or_si128(in[14], in[15]);
+    zero_idx[8] = _mm_or_si128(in[16], in[17]);
+    zero_idx[9] = _mm_or_si128(in[18], in[19]);
+    zero_idx[10] = _mm_or_si128(in[20], in[21]);
+    zero_idx[11] = _mm_or_si128(in[22], in[23]);
+    zero_idx[12] = _mm_or_si128(in[24], in[25]);
+    zero_idx[13] = _mm_or_si128(in[26], in[27]);
+    zero_idx[14] = _mm_or_si128(in[28], in[29]);
+    zero_idx[15] = _mm_or_si128(in[30], in[31]);
+
+    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+
+    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {  // all 16 byte-mask bits set: every lane of the OR is zero
+      col[i32 + 0] = _mm_setzero_si128();
+      col[i32 + 1] = _mm_setzero_si128();
+      col[i32 + 2] = _mm_setzero_si128();
+      col[i32 + 3] = _mm_setzero_si128();
+      col[i32 + 4] = _mm_setzero_si128();
+      col[i32 + 5] = _mm_setzero_si128();
+      col[i32 + 6] = _mm_setzero_si128();
+      col[i32 + 7] = _mm_setzero_si128();
+      col[i32 + 8] = _mm_setzero_si128();
+      col[i32 + 9] = _mm_setzero_si128();
+      col[i32 + 10] = _mm_setzero_si128();
+      col[i32 + 11] = _mm_setzero_si128();
+      col[i32 + 12] = _mm_setzero_si128();
+      col[i32 + 13] = _mm_setzero_si128();
+      col[i32 + 14] = _mm_setzero_si128();
+      col[i32 + 15] = _mm_setzero_si128();
+      col[i32 + 16] = _mm_setzero_si128();
+      col[i32 + 17] = _mm_setzero_si128();
+      col[i32 + 18] = _mm_setzero_si128();
+      col[i32 + 19] = _mm_setzero_si128();
+      col[i32 + 20] = _mm_setzero_si128();
+      col[i32 + 21] = _mm_setzero_si128();
+      col[i32 + 22] = _mm_setzero_si128();
+      col[i32 + 23] = _mm_setzero_si128();
+      col[i32 + 24] = _mm_setzero_si128();
+      col[i32 + 25] = _mm_setzero_si128();
+      col[i32 + 26] = _mm_setzero_si128();
+      col[i32 + 27] = _mm_setzero_si128();
+      col[i32 + 28] = _mm_setzero_si128();
+      col[i32 + 29] = _mm_setzero_si128();
+      col[i32 + 30] = _mm_setzero_si128();
+      col[i32 + 31] = _mm_setzero_si128();
+      continue;  // all-zero input for this iteration: skip the transform entirely
+    }
+
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(in, in);
+    array_transpose_8x8(in + 8, in + 8);
+    array_transpose_8x8(in + 16, in + 16);
+    array_transpose_8x8(in + 24, in + 24);
+
+    IDCT32  // macro defined outside this function; its results are read from stp1_0..stp1_31
+
+    // 1_D: Store 32 intermediate results for each 8x32 block.
+    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+  }
+  for (i = 0; i < 4; i++) {  // second pass: dest advances by 8 after each iteration
+    // Second 1-D idct
+    j = i << 3;  // offset of this iteration's 8 vectors within each 32-vector group of col
+
+    // Transpose 32x8 block to 8x32 block
+    array_transpose_8x8(col + j, in);
+    array_transpose_8x8(col + j + 32, in + 8);
+    array_transpose_8x8(col + j + 64, in + 16);
+    array_transpose_8x8(col + j + 96, in + 24);
+
+    IDCT32  // results are read from stp1_0..stp1_31 below
+
+    // 2_D: Calculate the results and store them to destination.
+    in[0] = _mm_add_epi16(stp1_0, stp1_31);
+    in[1] = _mm_add_epi16(stp1_1, stp1_30);
+    in[2] = _mm_add_epi16(stp1_2, stp1_29);
+    in[3] = _mm_add_epi16(stp1_3, stp1_28);
+    in[4] = _mm_add_epi16(stp1_4, stp1_27);
+    in[5] = _mm_add_epi16(stp1_5, stp1_26);
+    in[6] = _mm_add_epi16(stp1_6, stp1_25);
+    in[7] = _mm_add_epi16(stp1_7, stp1_24);
+    in[8] = _mm_add_epi16(stp1_8, stp1_23);
+    in[9] = _mm_add_epi16(stp1_9, stp1_22);
+    in[10] = _mm_add_epi16(stp1_10, stp1_21);
+    in[11] = _mm_add_epi16(stp1_11, stp1_20);
+    in[12] = _mm_add_epi16(stp1_12, stp1_19);
+    in[13] = _mm_add_epi16(stp1_13, stp1_18);
+    in[14] = _mm_add_epi16(stp1_14, stp1_17);
+    in[15] = _mm_add_epi16(stp1_15, stp1_16);
+    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+    for (j = 0; j < 32; ++j) {  // j is reused here as the output row index
+      // Final rounding and shift: saturating add of 32, then arithmetic >> 6
+      in[j] = _mm_adds_epi16(in[j], final_rounding);
+      in[j] = _mm_srai_epi16(in[j], 6);
+      RECON_AND_STORE(dest + j * stride, in[j]);
+    }
+
+    dest += 8;  // move on to the next 8 destination columns
+  }
+}
+
+void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+                              int stride) {  // DC-only variant: reads nothing but input[0]
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();  // not referenced directly here; presumably used by RECON_AND_STORE - confirm
+  int a, j;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);  // first of two cospi_16_64 scalings
+  a = dct_const_round_shift(a * cospi_16_64);  // second scaling
+  a = ROUND_POWER_OF_TWO(a, 6);  // presumably a rounded right shift by 6 - confirm against the macro
+
+  dc_value = _mm_set1_epi16(a);  // broadcast the single value to all eight 16-bit lanes
+
+  for (j = 0; j < 32; ++j) {  // 32 rows; four stores per row at byte offsets 0, 8, 16, 24
+    RECON_AND_STORE(dest +  0 + j * stride, dc_value);
+    RECON_AND_STORE(dest +  8 + j * stride, dc_value);
+    RECON_AND_STORE(dest + 16 + j * stride, dc_value);
+    RECON_AND_STORE(dest + 24 + j * stride, dc_value);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {  // clamp each signed 16-bit lane to [0, (1 << bd) - 1]
+  __m128i ubounded, retval;
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);  // (1 << bd) - 1 in every lane
+  ubounded = _mm_cmpgt_epi16(value, max);  // all-ones mask in lanes that exceed max
+  retval = _mm_andnot_si128(ubounded, value);  // keep lanes that do not exceed max, zero the others
+  ubounded = _mm_and_si128(ubounded, max);  // max in the lanes that exceeded it, zero elsewhere
+  retval = _mm_or_si128(retval, ubounded);  // merge: upper bound is now applied
+  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));  // lower bound: negative lanes become zero
+  return retval;
+}
+
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {  // 4x4 idct: SSE2 per pass while values stay within +/-12043, else the C routine
+  tran_low_t out[4 * 4];  // row-pass output, read by the un-optimised column pass
+  tran_low_t *outptr = out;
+  int i, j;
+  __m128i inptr[4];
+  __m128i sign_bits[2];
+  __m128i temp_mm, min_input, max_input;
+  int test;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  int optimised_cols = 0;  // set to 1 only when both passes can run in SSE2
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i eight = _mm_set1_epi16(8);  // rounding term for the final >> 4
+  const __m128i max = _mm_set1_epi16(12043);  // NOTE(review): presumably the largest magnitude idct4_sse2 can take without 16-bit overflow - confirm
+  const __m128i min = _mm_set1_epi16(-12043);
+  // Load input into __m128i (four unaligned loads, 4 coefficients apart)
+  inptr[0] = _mm_loadu_si128((const __m128i *)input);
+  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
+  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
+  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
+
+  // Pack to 16 bits (signed saturation), leaving all 16 values in inptr[0..1]
+  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
+  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
+
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp_mm = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp_mm);  // non-zero if any packed value lies outside [min, max]
+
+  if (!test) {
+    // Do the row transform
+    idct4_sse2(inptr);
+
+    // Check the min & max values
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp_mm = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp_mm);  // same range test, now on the row-pass output
+
+    if (test) {  // out of range for a second SSE2 pass: spill to out[] for the C column pass
+      transpose_4x4(inptr);
+      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);  // all-ones in negative lanes
+      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
+      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);  // interleaving with the sign mask widens 16 -> 32 bits
+      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
+      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
+      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
+      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
+      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
+      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
+      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform
+    for (i = 0; i < 4; ++i) {
+      vpx_highbd_idct4_c(input, outptr, bd);
+      input += 4;
+      outptr += 4;
+    }
+  }
+
+  if (optimised_cols) {
+    idct4_sse2(inptr);  // column pass on the still-packed 16-bit data
+
+    // Final round and shift: (x + 8) >> 4, arithmetic shift
+    inptr[0] = _mm_add_epi16(inptr[0], eight);
+    inptr[1] = _mm_add_epi16(inptr[1], eight);
+
+    inptr[0] = _mm_srai_epi16(inptr[0], 4);
+    inptr[1] = _mm_srai_epi16(inptr[1], 4);
+
+    // Reconstruction and Store: d0 holds dest rows 0 and 1, d2 holds rows 2 and 3
+    {
+      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+      d0 = _mm_unpacklo_epi64(
+          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
+      d2 = _mm_unpacklo_epi64(
+          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
+      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
+      // store input0 (low 64 bits of d0 -> row 0)
+      _mm_storel_epi64((__m128i *)dest, d0);
+      // store input1 (high 64 bits of d0, shifted down -> row 1)
+      d0 = _mm_srli_si128(d0, 8);
+      _mm_storel_epi64((__m128i *)(dest + stride), d0);
+      // store input2 (low 64 bits of d2 -> row 2)
+      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+      // store input3 (high 64 bits of d2, shifted down -> row 3)
+      d2 = _mm_srli_si128(d2, 8);
+      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+    }
+  } else {
+    // Run the un-optimised column transform
+    tran_low_t temp_in[4], temp_out[4];
+    // Columns: gather column i of out[], transform it, then add into dest with clipping
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = out[j * 4 + i];
+      vpx_highbd_idct4_c(temp_in, temp_out, bd);
+      for (j = 0; j < 4; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+      }
+    }
+  }
+}
+
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[8 * 8];  // row-pass result consumed by the C column pass
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[8];  // one packed 8-coefficient row per register
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i sixteen = _mm_set1_epi16(16);  // rounding term for the >> 5
+  const __m128i max = _mm_set1_epi16(6201);  // SSE2 passes need |x| <= 6201
+  const __m128i min = _mm_set1_epi16(-6201);
+  int optimised_cols = 0;  // 1 when the column pass may also run in SSE2
+  // Two-pass (rows, then columns) 8x8 inverse transform added into dest.
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 8; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);  // signed-saturating narrow
+  }
+
+  // Find the min & max for the row transform
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 8; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);  // non-zero if any lane is out of range
+
+  if (!test) {
+    // Do the row transform
+    idct8_sse2(inptr);
+
+    // Find the min & max for the column transform
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 8; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {  // row result out of range: spill to out[] for the C columns
+      array_transpose_8x8(inptr, inptr);  // NOTE(review): presumably restores row order after idct8_sse2 - confirm
+      for (i = 0; i < 8; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // all-ones if negative
+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);  // lanes 4-7 -> 32 bit
+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);  // lanes 0-3 -> 32 bit
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform (one 8-coefficient row per call)
+    for (i = 0; i < 8; ++i) {
+      vpx_highbd_idct8_c(input, outptr, bd);
+      input += 8;
+      outptr += 8;
+    }
+  }
+
+  if (optimised_cols) {
+    idct8_sse2(inptr);
+
+    // Final round & shift and Reconstruction and Store
+    {
+      __m128i d[8];
+      for (i = 0; i < 8; i++) {
+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        inptr[i] = _mm_srai_epi16(inptr[i], 5);  // (x + 16) >> 5
+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform on out[], one column at a time
+    tran_low_t temp_in[8], temp_out[8];
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j * 8 + i];  // gather column i
+      vpx_highbd_idct8_c(temp_in, temp_out, bd);
+      for (j = 0; j < 8; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      }
+    }
+  }
+}
+
+void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int bd) {
+  tran_low_t out[8 * 8] = { 0 };  // zeroed: the fallbacks fill rows 0-3 only
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[8];  // one packed 8-coefficient row per register
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i sixteen = _mm_set1_epi16(16);  // rounding term for the >> 5
+  const __m128i max = _mm_set1_epi16(6201);  // SSE2 passes need |x| <= 6201
+  const __m128i min = _mm_set1_epi16(-6201);
+  int optimised_cols = 0;  // 1 when the column pass may also run in SSE2
+  // 8x8 inverse transform + add for inputs whose rows 4-7 are assumed zero.
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 8; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);  // signed-saturating narrow
+  }
+
+  // Find the min & max for the row transform
+  // only the first 4 rows have non-zero coefs, so only inptr[0..3] is scanned
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 4; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);  // non-zero if any lane is out of range
+
+  if (!test) {
+    // Do the row transform
+    idct8_sse2(inptr);
+
+    // Find the min & max for the column transform
+    // N.B. Only first 4 cols contain non-zero coeffs (all 8 registers scanned)
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 8; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      // Use fact only first 4 rows contain non-zero coeffs
+      array_transpose_4X8(inptr, inptr);  // lanes 0-3 of 8 regs -> inptr[0..3]
+      for (i = 0; i < 4; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // all-ones if negative
+        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);  // lanes 4-7 -> 32 bit
+        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);  // lanes 0-3 -> 32 bit
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform (first 4 rows; the rest stay zero)
+    for (i = 0; i < 4; ++i) {
+      vpx_highbd_idct8_c(input, outptr, bd);
+      input += 8;
+      outptr += 8;
+    }
+  }
+
+  if (optimised_cols) {
+    idct8_sse2(inptr);
+
+    // Final round & shift and Reconstruction and Store
+    {
+      __m128i d[8];
+      for (i = 0; i < 8; i++) {
+        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        inptr[i] = _mm_srai_epi16(inptr[i], 5);  // (x + 16) >> 5
+        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform on out[], one column at a time
+    tran_low_t temp_in[8], temp_out[8];
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j * 8 + i];  // gather column i
+      vpx_highbd_idct8_c(temp_in, temp_out, bd);
+      for (j = 0; j < 8; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      }
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                       int stride, int bd) {
+  tran_low_t out[16 * 16];  // row-pass result consumed by the C column pass
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[32];  // [i] = row i cols 0-7, [i + 16] = row i cols 8-15
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_set1_epi16(32);  // rounding term for the >> 6
+  const __m128i max = _mm_set1_epi16(3155);  // SSE2 passes need |x| <= 3155
+  const __m128i min = _mm_set1_epi16(-3155);
+  int optimised_cols = 0;  // 1 when the column pass may also run in SSE2
+  // Two-pass (rows, then columns) 16x16 inverse transform added into dest.
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 16; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);  // signed-saturating narrow
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 32; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);  // non-zero if any lane is out of range
+
+  if (!test) {
+    // Do the row transform
+    idct16_sse2(inptr, inptr + 16);
+
+    // Find the min & max for the column transform
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 32; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {  // row result out of range: spill to out[] for the C columns
+      array_transpose_16x16(inptr, inptr + 16);  // NOTE(review): presumably restores row order after idct16_sse2 - confirm
+      for (i = 0; i < 16; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // all-ones if negative
+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);  // lanes 0-3 -> 32 bit
+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);  // lanes 4-7 -> 32 bit
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform (one 16-coefficient row per call)
+    for (i = 0; i < 16; ++i) {
+      vpx_highbd_idct16_c(input, outptr, bd);
+      input += 16;
+      outptr += 16;
+    }
+  }
+
+  if (optimised_cols) {
+    idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift and Reconstruction and Store
+    {
+      __m128i d[2];  // left and right 8-pixel halves of one destination row
+      for (i = 0; i < 16; i++) {
+        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
+        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);  // (x + 32) >> 6
+        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform on out[], one column at a time
+    tran_low_t temp_in[16], temp_out[16];
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j * 16 + i];  // gather column i
+      vpx_highbd_idct16_c(temp_in, temp_out, bd);
+      for (j = 0; j < 16; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      }
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+                                      int stride, int bd) {
+  tran_low_t out[16 * 16] = { 0 };  // zeroed: fallbacks fill rows 0-3 only
+  tran_low_t *outptr = out;
+  int i, j, test;
+  __m128i inptr[32];  // [i] = row i cols 0-7, [i + 16] = row i cols 8-15
+  __m128i min_input, max_input, temp1, temp2, sign_bits;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_set1_epi16(32);  // rounding term for the >> 6
+  const __m128i max = _mm_set1_epi16(3155);  // SSE2 passes need |x| <= 3155
+  const __m128i min = _mm_set1_epi16(-3155);
+  int optimised_cols = 0;  // 1 when the column pass may also run in SSE2
+  // 16x16 inverse transform + add for inputs confined to the top-left 4x4.
+  // Load input into __m128i & pack to 16 bits
+  for (i = 0; i < 16; i++) {
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+    inptr[i] = _mm_packs_epi32(temp1, temp2);  // signed-saturating narrow
+    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+  }
+
+  // Find the min & max for the row transform
+  // Since all non-zero dct coefficients are in upper-left 4x4 area,
+  // we only need to consider first 4 rows here.
+  max_input = _mm_max_epi16(inptr[0], inptr[1]);
+  min_input = _mm_min_epi16(inptr[0], inptr[1]);
+  for (i = 2; i < 4; i++) {
+    max_input = _mm_max_epi16(max_input, inptr[i]);
+    min_input = _mm_min_epi16(min_input, inptr[i]);
+  }
+  max_input = _mm_cmpgt_epi16(max_input, max);
+  min_input = _mm_cmplt_epi16(min_input, min);
+  temp1 = _mm_or_si128(max_input, min_input);
+  test = _mm_movemask_epi8(temp1);  // non-zero if any lane is out of range
+
+  if (!test) {
+    // Do the row transform (N.B. This transposes inptr)
+    idct16_sse2(inptr, inptr + 16);
+
+    // Find the min & max for the column transform
+    // N.B. Only first 4 cols contain non-zero coeffs (inptr[0..15] scanned)
+    max_input = _mm_max_epi16(inptr[0], inptr[1]);
+    min_input = _mm_min_epi16(inptr[0], inptr[1]);
+    for (i = 2; i < 16; i++) {
+      max_input = _mm_max_epi16(max_input, inptr[i]);
+      min_input = _mm_min_epi16(min_input, inptr[i]);
+    }
+    max_input = _mm_cmpgt_epi16(max_input, max);
+    min_input = _mm_cmplt_epi16(min_input, min);
+    temp1 = _mm_or_si128(max_input, min_input);
+    test = _mm_movemask_epi8(temp1);
+
+    if (test) {
+      // Use fact only first 4 rows contain non-zero coeffs
+      array_transpose_8x8(inptr, inptr);  // inptr[0..7] in place
+      array_transpose_8x8(inptr + 8, inptr + 16);  // inptr[8..15] -> [16..23]
+      for (i = 0; i < 4; i++) {
+        sign_bits = _mm_cmplt_epi16(inptr[i], zero);  // all-ones if negative
+        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);  // lanes 0-3 -> 32 bit
+        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);  // lanes 4-7 -> 32 bit
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+      }
+    } else {
+      // Set to use the optimised transform for the column
+      optimised_cols = 1;
+    }
+  } else {
+    // Run the un-optimised row transform (first 4 rows; the rest stay zero)
+    for (i = 0; i < 4; ++i) {
+      vpx_highbd_idct16_c(input, outptr, bd);
+      input += 16;
+      outptr += 16;
+    }
+  }
+
+  if (optimised_cols) {
+    idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift and Reconstruction and Store
+    {
+      __m128i d[2];  // left and right 8-pixel halves of one destination row
+      for (i = 0; i < 16; i++) {
+        inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
+        inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+        inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);  // (x + 32) >> 6
+        inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
+        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+        // Store
+        _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+        _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+      }
+    }
+  } else {
+    // Run the un-optimised column transform on out[], one column at a time
+    tran_low_t temp_in[16], temp_out[16];
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j * 16 + i];  // gather column i
+      vpx_highbd_idct16_c(temp_in, temp_out, bd);
+      for (j = 0; j < 16; ++j) {
+        dest[j * stride + i] = highbd_clip_pixel_add(
+            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      }
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
new file mode 100644
index 0000000000..bd520c18e5
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
@@ -0,0 +1,196 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_
+#define VPX_DSP_X86_INV_TXFM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+// Transpose an 8x8 block of 16-bit lanes: res[c] lane r = in[r] lane c.
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+  // Stage 2: interleave 32-bit pairs. All of in[] has been read by now.
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+  // Stage 3: interleave 64-bit halves; res is written only here, so res == in is fine.
+  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1)   \
+  { /* Transpose lanes 0-3 of in0..in3 into out0 (cols 0,1), out1 (cols 2,3) */ \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    /* Inputs are fully read above, so out0/out1 may alias in0/in1. */ \
+    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */  \
+    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */  \
+  }
+
+static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);  // lanes 0-3 only
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  // Transposes lanes 0-3 of in[0..7]: out[c] lane r = in[r] lane c, c = 0..3.
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  // Only out[0..3] are written; in[] was fully read above, so out == in works.
+  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+}
+
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+  __m128i tbuf[8];  // parks one transposed quadrant while its slot is busy
+  array_transpose_8x8(res0, res0);          // res0[0..7]: in place
+  array_transpose_8x8(res1, tbuf);          // res1[0..7] -> tbuf
+  array_transpose_8x8(res0 + 8, res1);      // res0[8..15] -> res1[0..7]
+  array_transpose_8x8(res1 + 8, res1 + 8);  // res1[8..15]: in place
+  // Complete the swap of the two off-diagonal quadrants: tbuf -> res0[8..15].
+  res0[8] = tbuf[0];
+  res0[9] = tbuf[1];
+  res0[10] = tbuf[2];
+  res0[11] = tbuf[3];
+  res0[12] = tbuf[4];
+  res0[13] = tbuf[5];
+  res0[14] = tbuf[6];
+  res0[15] = tbuf[7];
+}
+
+// Loads data[0..7] into one register so that the 8 bit optimisations can be
+// used when profile 0 is used with highbitdepth enabled.
+static INLINE __m128i load_input_data(const tran_low_t *data) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
+      data[6], data[7]);  // NOTE(review): presumably narrows each value to 16 bits - confirm
+#else
+  return _mm_load_si128((const __m128i *)data);  // aligned 128-bit load
+#endif
+}
+
+static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
+  in[0]  = load_input_data(input + 0 * 16);  // in[r] = load at input + r * 16
+  in[1]  = load_input_data(input + 1 * 16);
+  in[2]  = load_input_data(input + 2 * 16);
+  in[3]  = load_input_data(input + 3 * 16);
+  in[4]  = load_input_data(input + 4 * 16);
+  in[5]  = load_input_data(input + 5 * 16);
+  in[6]  = load_input_data(input + 6 * 16);
+  in[7]  = load_input_data(input + 7 * 16);
+
+  in[8]  = load_input_data(input + 8 * 16);
+  in[9]  = load_input_data(input + 9 * 16);
+  in[10]  = load_input_data(input + 10 * 16);
+  in[11]  = load_input_data(input + 11 * 16);
+  in[12]  = load_input_data(input + 12 * 16);
+  in[13]  = load_input_data(input + 13 * 16);
+  in[14]  = load_input_data(input + 14 * 16);
+  in[15]  = load_input_data(input + 15 * 16);  // 16 loads, filling in[0..15]
+}
+
+#define RECON_AND_STORE(dest, in_x) \
+  { /* dest[0..7] = sat_u8(dest[0..7] + in_x); uses `zero` from the caller */ \
+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); /* load 8 bytes */ \
+      d0 = _mm_unpacklo_epi8(d0, zero); /* interleave with zero: 8 -> 16 bit */ \
+      d0 = _mm_add_epi16(in_x, d0); \
+      d0 = _mm_packus_epi16(d0, d0); /* unsigned-saturate back to 8 bit */ \
+      _mm_storel_epi64((__m128i *)(dest), d0); \
+  }
+
+static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
+  const __m128i final_rounding = _mm_set1_epi16(1<<5);
+  const __m128i zero = _mm_setzero_si128();  // referenced by RECON_AND_STORE
+  // Final rounding and shift: in[i] = (in[i] + 32) >> 6, done in place
+  in[0] = _mm_adds_epi16(in[0], final_rounding);
+  in[1] = _mm_adds_epi16(in[1], final_rounding);
+  in[2] = _mm_adds_epi16(in[2], final_rounding);
+  in[3] = _mm_adds_epi16(in[3], final_rounding);
+  in[4] = _mm_adds_epi16(in[4], final_rounding);
+  in[5] = _mm_adds_epi16(in[5], final_rounding);
+  in[6] = _mm_adds_epi16(in[6], final_rounding);
+  in[7] = _mm_adds_epi16(in[7], final_rounding);
+  in[8] = _mm_adds_epi16(in[8], final_rounding);
+  in[9] = _mm_adds_epi16(in[9], final_rounding);
+  in[10] = _mm_adds_epi16(in[10], final_rounding);
+  in[11] = _mm_adds_epi16(in[11], final_rounding);
+  in[12] = _mm_adds_epi16(in[12], final_rounding);
+  in[13] = _mm_adds_epi16(in[13], final_rounding);
+  in[14] = _mm_adds_epi16(in[14], final_rounding);
+  in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+  in[0] = _mm_srai_epi16(in[0], 6);
+  in[1] = _mm_srai_epi16(in[1], 6);
+  in[2] = _mm_srai_epi16(in[2], 6);
+  in[3] = _mm_srai_epi16(in[3], 6);
+  in[4] = _mm_srai_epi16(in[4], 6);
+  in[5] = _mm_srai_epi16(in[5], 6);
+  in[6] = _mm_srai_epi16(in[6], 6);
+  in[7] = _mm_srai_epi16(in[7], 6);
+  in[8] = _mm_srai_epi16(in[8], 6);
+  in[9] = _mm_srai_epi16(in[9], 6);
+  in[10] = _mm_srai_epi16(in[10], 6);
+  in[11] = _mm_srai_epi16(in[11], 6);
+  in[12] = _mm_srai_epi16(in[12], 6);
+  in[13] = _mm_srai_epi16(in[13], 6);
+  in[14] = _mm_srai_epi16(in[14], 6);
+  in[15] = _mm_srai_epi16(in[15], 6);
+  // Add row i to the 8 pixels at dest + i * stride and store them back.
+  RECON_AND_STORE(dest +  0 * stride, in[0]);
+  RECON_AND_STORE(dest +  1 * stride, in[1]);
+  RECON_AND_STORE(dest +  2 * stride, in[2]);
+  RECON_AND_STORE(dest +  3 * stride, in[3]);
+  RECON_AND_STORE(dest +  4 * stride, in[4]);
+  RECON_AND_STORE(dest +  5 * stride, in[5]);
+  RECON_AND_STORE(dest +  6 * stride, in[6]);
+  RECON_AND_STORE(dest +  7 * stride, in[7]);
+  RECON_AND_STORE(dest +  8 * stride, in[8]);
+  RECON_AND_STORE(dest +  9 * stride, in[9]);
+  RECON_AND_STORE(dest + 10 * stride, in[10]);
+  RECON_AND_STORE(dest + 11 * stride, in[11]);
+  RECON_AND_STORE(dest + 12 * stride, in[12]);
+  RECON_AND_STORE(dest + 13 * stride, in[13]);
+  RECON_AND_STORE(dest + 14 * stride, in[14]);
+  RECON_AND_STORE(dest + 15 * stride, in[15]);
+}
+
+void idct4_sse2(__m128i *in);
+void idct8_sse2(__m128i *in);
+void idct16_sse2(__m128i *in0, __m128i *in1);
+void iadst4_sse2(__m128i *in);
+void iadst8_sse2(__m128i *in);
+void iadst16_sse2(__m128i *in0, __m128i *in1);
+
+#endif  // VPX_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
new file mode 100644
index 0000000000..20baf820f6
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -0,0 +1,1793 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides SSSE3 versions of the inverse transforms. Some of
+; the functions were originally derived from the ffmpeg project.
+; Note that the current version applies to x86 64-bit only.
+
+SECTION_RODATA
+; pw_<c>x2: eight words, each 2*c ("m" = negated, "_" = name padding only).
+pw_11585x2: times 8 dw 23170           ; 2 * 11585
+; NOTE(review): presumably doubled because pmulhrsw shifts by 15, not 14 - confirm
+pw_m2404x2:  times 8 dw  -2404*2
+pw_m4756x2:  times 8 dw  -4756*2
+pw_m5520x2:  times 8 dw  -5520*2
+pw_m8423x2:  times 8 dw  -8423*2
+pw_m9102x2:  times 8 dw  -9102*2
+pw_m10394x2: times 8 dw -10394*2
+pw_m11003x2: times 8 dw -11003*2
+
+pw_16364x2: times 8 dw 16364*2
+pw_16305x2: times 8 dw 16305*2
+pw_16207x2: times 8 dw 16207*2
+pw_16069x2: times 8 dw 16069*2
+pw_15893x2: times 8 dw 15893*2
+pw_15679x2: times 8 dw 15679*2
+pw_15426x2: times 8 dw 15426*2
+pw_15137x2: times 8 dw 15137*2
+pw_14811x2: times 8 dw 14811*2
+pw_14449x2: times 8 dw 14449*2
+pw_14053x2: times 8 dw 14053*2
+pw_13623x2: times 8 dw 13623*2
+pw_13160x2: times 8 dw 13160*2
+pw_12665x2: times 8 dw 12665*2
+pw_12140x2: times 8 dw 12140*2
+pw__9760x2: times 8 dw  9760*2
+pw__7723x2: times 8 dw  7723*2
+pw__7005x2: times 8 dw  7005*2
+pw__6270x2: times 8 dw  6270*2
+pw__3981x2: times 8 dw  3981*2
+pw__3196x2: times 8 dw  3196*2
+pw__1606x2: times 8 dw  1606*2
+pw___804x2: times 8 dw   804*2
+
+pd_8192:    times 4 dd 8192            ; 1 << 13: rounding added before psrad 14
+pw_32:      times 8 dw 32
+pw_16:      times 8 dw 16              ; rounding added before psraw 5
+
+%macro TRANSFORM_COEFFS 2 ; emits three word-pair tables for pmaddwd
+pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2 ; ( %1,  %2) x 4
+pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1 ; (-%2,  %1) x 4
+pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2 ; (-%1, -%2) x 4
+%endmacro
+
+TRANSFORM_COEFFS    6270, 15137
+TRANSFORM_COEFFS    3196, 16069
+TRANSFORM_COEFFS   13623,  9102
+
+; constants for 32x32_34
+TRANSFORM_COEFFS      804, 16364
+TRANSFORM_COEFFS    15426,  5520
+TRANSFORM_COEFFS     3981, 15893
+TRANSFORM_COEFFS    16207,  2404
+TRANSFORM_COEFFS     1606, 16305
+TRANSFORM_COEFFS    15679,  4756
+TRANSFORM_COEFFS    11585, 11585
+
+; constants for 32x32_1024
+TRANSFORM_COEFFS    12140, 11003
+TRANSFORM_COEFFS     7005, 14811
+TRANSFORM_COEFFS    14053,  8423
+TRANSFORM_COEFFS     9760, 13160
+TRANSFORM_COEFFS    12665, 10394
+TRANSFORM_COEFFS     7723, 14449
+
+%macro PAIR_PP_COEFFS 2 ; low four words = %1, high four words = %2
+dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
+%endmacro
+
+%macro PAIR_MP_COEFFS 2 ; low four words = -%1, high four words = %2
+dpw_m%1_%2:  dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2
+%endmacro
+
+%macro PAIR_MM_COEFFS 2 ; low four words = -%1, high four words = -%2
+dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
+%endmacro
+
+PAIR_PP_COEFFS     30274, 12540        ; 2*15137, 2*6270
+PAIR_PP_COEFFS      6392, 32138        ; 2*3196,  2*16069
+PAIR_MP_COEFFS     18204, 27246        ; -2*9102, 2*13623
+
+PAIR_PP_COEFFS     12540, 12540
+PAIR_PP_COEFFS     30274, 30274
+PAIR_PP_COEFFS      6392,  6392
+PAIR_PP_COEFFS     32138, 32138
+PAIR_MM_COEFFS     18204, 18204
+PAIR_PP_COEFFS     27246, 27246
+
+SECTION .text
+
+%if ARCH_X86_64
+%macro SUM_SUB 3 ; a, b, tmp: a = a + b, b = a - b (word lanes)
+  psubw  m%3, m%1, m%2                 ; tmp = a - b
+  paddw  m%1, m%2                      ; a = a + b
+  SWAP    %2, %3                       ; rename: %2 = difference, %3 = old b
+%endmacro
+
+; butterfly helper: dstN = (pmaddwd(src, coefsN) + round) >> 14, N = 1, 2
+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
+  pmaddwd            m%1, m%3, %5
+  pmaddwd            m%2, m%3, %6      ; dst2 may be src: it is read here last
+  paddd              m%1,  %4
+  paddd              m%2,  %4
+  psrad              m%1,  14
+  psrad              m%2,  14
+%endmacro
+
+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
+  punpckhwd          m%6, m%2, m%1     ; lanes 4-7 as (dst2, dst1) word pairs
+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_%3_%4]
+  punpcklwd          m%2, m%1          ; lanes 0-3, same pairing
+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_%3_%4]
+  packssdw           m%1, m%7          ; dst1 = (dst1*coef1 - dst2*coef2 + round) >> 14
+  packssdw           m%2, m%6          ; dst2 = (dst1*coef2 + dst2*coef1 + round) >> 14 (old dst1)
+%endmacro
+
+%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
+  punpckhwd          m%6, m%2, m%1     ; lanes 4-7 as (dst2, dst1) word pairs
+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_m%3_m%4]
+  punpcklwd          m%2, m%1          ; lanes 0-3, same pairing
+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_m%3_m%4]
+  packssdw           m%1, m%7          ; dst1 = (dst1*coef1 - dst2*coef2 + round) >> 14
+  packssdw           m%2, m%6          ; dst2 = (-dst1*coef2 - dst2*coef1 + round) >> 14 (old dst1)
+%endmacro
+
+; matrix transpose helper: a = low interleave of (a, b), b = high interleave
+%macro INTERLEAVE_2X 4 ; unpack suffix (wd/dq/qdq), a, b, tmp
+  punpckh%1          m%4, m%2, m%3     ; tmp = high halves interleaved
+  punpckl%1          m%2, m%3          ; a = low halves interleaved
+  SWAP               %3,  %4           ; rename: %3 = high result, %4 = old b
+%endmacro
+
+%macro TRANSPOSE8X8 9 ; eight row registers, then one scratch register
+  INTERLEAVE_2X  wd, %1, %2, %9
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+  ; second stage: interleave 32-bit groups
+  INTERLEAVE_2X  dq, %1, %3, %9
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+  ; third stage: interleave 64-bit halves
+  INTERLEAVE_2X  qdq, %1, %5, %9
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+  ; renames only: presumably put the rows back in %1..%8 order - confirm
+  SWAP  %2, %5
+  SWAP  %4, %7
+%endmacro
+
+%macro IDCT8_1D 0 ; works on m0-m7; reads m8 and m12; m9/m10 are scratch
+  SUM_SUB          0,    4,    9
+  BUTTERFLY_4X     2,    6,    6270, 15137,  m8,  9,  10 ; m8 = rounding term
+  pmulhrsw        m0,  m12             ; m12 = multiplier set by the caller
+  pmulhrsw        m4,  m12
+  BUTTERFLY_4X     1,    7,    3196, 16069,  m8,  9,  10
+  BUTTERFLY_4X     5,    3,   13623,  9102,  m8,  9,  10
+
+  SUM_SUB          1,    5,    9
+  SUM_SUB          7,    3,    9
+  SUM_SUB          0,    6,    9
+  SUM_SUB          4,    2,    9
+  SUM_SUB          3,    5,    9
+  pmulhrsw        m3,  m12
+  pmulhrsw        m5,  m12
+
+  SUM_SUB          0,    7,    9
+  SUM_SUB          4,    3,    9
+  SUM_SUB          2,    5,    9
+  SUM_SUB          6,    1,    9
+  ; renames only: presumably restore output order m0..m7 - confirm
+  SWAP             3,    6
+  SWAP             1,    4
+%endmacro
+
+; Rounds two rows, adds them to 8 pixels each at outputq / outputq + strideq
+%macro ADD_STORE_8P_2X 5;  src1, src2, tmp1, tmp2, zero
+  paddw           m%1, m11             ; m11 = rounding term set by the caller
+  paddw           m%2, m11
+  psraw           m%1, 5
+  psraw           m%2, 5
+  ; load 8 destination bytes per row and widen them to words
+  movh            m%3, [outputq]
+  movh            m%4, [outputq + strideq]
+  punpcklbw       m%3, m%5
+  punpcklbw       m%4, m%5
+  paddw           m%3, m%1
+  paddw           m%4, m%2
+  packuswb        m%3, m%5             ; unsigned-saturate back to bytes
+  packuswb        m%4, m%5
+  movh               [outputq], m%3
+  movh     [outputq + strideq], m%4
+%endmacro
+
+INIT_XMM ssse3
+; full inverse 8x8 2D-DCT transform, added to the 8-bit pixels at output
+cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
+  mova     m8, [pd_8192]               ; rounding term for BUTTERFLY_4X
+  mova    m11, [pw_16]                 ; rounding term for ADD_STORE_8P_2X
+  mova    m12, [pw_11585x2]            ; pmulhrsw multiplier for IDCT8_1D
+
+  lea      r3, [2 * strideq]           ; step for two output rows
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova     m0, [inputq +   0]          ; 32-byte rows: load the first four dwords,
+  packssdw m0, [inputq +  16]          ; then pack all eight to saturated words
+  mova     m1, [inputq +  32]
+  packssdw m1, [inputq +  48]
+  mova     m2, [inputq +  64]
+  packssdw m2, [inputq +  80]
+  mova     m3, [inputq +  96]
+  packssdw m3, [inputq + 112]
+  mova     m4, [inputq + 128]
+  packssdw m4, [inputq + 144]
+  mova     m5, [inputq + 160]
+  packssdw m5, [inputq + 176]
+  mova     m6, [inputq + 192]
+  packssdw m6, [inputq + 208]
+  mova     m7, [inputq + 224]
+  packssdw m7, [inputq + 240]
+%else
+  mova     m0, [inputq +   0]          ; 16-byte rows of eight words
+  mova     m1, [inputq +  16]
+  mova     m2, [inputq +  32]
+  mova     m3, [inputq +  48]
+  mova     m4, [inputq +  64]
+  mova     m5, [inputq +  80]
+  mova     m6, [inputq +  96]
+  mova     m7, [inputq + 112]
+%endif
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+  IDCT8_1D                             ; first 1-D pass
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+  IDCT8_1D                             ; second 1-D pass
+
+  pxor    m12, m12                     ; m12 becomes the zero register below
+  ADD_STORE_8P_2X  0, 1, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  2, 3, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  4, 5, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  6, 7, 9, 10, 12
+
+  RET
+
+; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero
+cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
+  mova       m8, [pd_8192]
+  mova      m11, [pw_16]
+  mova      m12, [pw_11585x2]
+
+  lea        r3, [2 * strideq]
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova       m0, [inputq +   0]
+  packssdw   m0, [inputq +  16]
+  mova       m1, [inputq +  32]
+  packssdw   m1, [inputq +  48]
+  mova       m2, [inputq +  64]
+  packssdw   m2, [inputq +  80]
+  mova       m3, [inputq +  96]
+  packssdw   m3, [inputq + 112]
+%else
+  mova       m0, [inputq +  0]
+  mova       m1, [inputq + 16]
+  mova       m2, [inputq + 32]
+  mova       m3, [inputq + 48]
+%endif
+
+  punpcklwd  m0, m1
+  punpcklwd  m2, m3
+  punpckhdq  m9, m0, m2
+  punpckldq  m0, m2
+  SWAP       2, 9
+
+  ; m0 -> [0], [0]
+  ; m1 -> [1], [1]
+  ; m2 -> [2], [2]
+  ; m3 -> [3], [3]
+  punpckhqdq m10, m0, m0
+  punpcklqdq m0,  m0
+  punpckhqdq m9,  m2, m2
+  punpcklqdq m2,  m2
+  SWAP       1, 10
+  SWAP       3,  9
+
+  pmulhrsw   m0, m12
+  pmulhrsw   m2, [dpw_30274_12540]
+  pmulhrsw   m1, [dpw_6392_32138]
+  pmulhrsw   m3, [dpw_m18204_27246]
+
+  SUM_SUB    0, 2, 9
+  SUM_SUB    1, 3, 9
+
+  punpcklqdq m9, m3, m3
+  punpckhqdq m5, m3, m9
+
+  SUM_SUB    3, 5, 9
+  punpckhqdq m5, m3
+  pmulhrsw   m5, m12
+
+  punpckhqdq m9, m1, m5
+  punpcklqdq m1, m5
+  SWAP       5, 9
+
+  SUM_SUB    0, 5, 9
+  SUM_SUB    2, 1, 9
+
+  punpckhqdq m3, m0, m0
+  punpckhqdq m4, m1, m1
+  punpckhqdq m6, m5, m5
+  punpckhqdq m7, m2, m2
+
+  punpcklwd  m0, m3
+  punpcklwd  m7, m2
+  punpcklwd  m1, m4
+  punpcklwd  m6, m5
+
+  punpckhdq  m4, m0, m7
+  punpckldq  m0, m7
+  punpckhdq  m10, m1, m6
+  punpckldq  m5, m1, m6
+
+  punpckhqdq m1, m0, m5
+  punpcklqdq m0, m5
+  punpckhqdq m3, m4, m10
+  punpcklqdq m2, m4, m10
+
+
+  pmulhrsw   m0, m12
+  pmulhrsw   m6, m2, [dpw_30274_30274]
+  pmulhrsw   m4, m2, [dpw_12540_12540]
+
+  pmulhrsw   m7, m1, [dpw_32138_32138]
+  pmulhrsw   m1, [dpw_6392_6392]
+  pmulhrsw   m5, m3, [dpw_m18204_m18204]
+  pmulhrsw   m3, [dpw_27246_27246]
+
+  mova       m2, m0
+  SUM_SUB    0, 6, 9
+  SUM_SUB    2, 4, 9
+  SUM_SUB    1, 5, 9
+  SUM_SUB    7, 3, 9
+
+  SUM_SUB    3, 5, 9
+  pmulhrsw   m3, m12
+  pmulhrsw   m5, m12
+
+  SUM_SUB    0, 7, 9
+  SUM_SUB    2, 3, 9
+  SUM_SUB    4, 5, 9
+  SUM_SUB    6, 1, 9
+
+  SWAP       3, 6
+  SWAP       1, 2
+  SWAP       2, 4
+
+
+  pxor    m12, m12
+  ADD_STORE_8P_2X  0, 1, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  2, 3, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  4, 5, 9, 10, 12
+  lea              outputq, [outputq + r3]
+  ADD_STORE_8P_2X  6, 7, 9, 10, 12
+
+  RET
+; idxN = byte offset of slot (N mod 8) in a group of eight 16-byte slots; users add a per-group base to it
+%define  idx0 16 * 0
+%define  idx1 16 * 1
+%define  idx2 16 * 2
+%define  idx3 16 * 3
+%define  idx4 16 * 4
+%define  idx5 16 * 5
+%define  idx6 16 * 6
+%define  idx7 16 * 7
+%define  idx8 16 * 0
+%define  idx9 16 * 1
+%define idx10 16 * 2
+%define idx11 16 * 3
+%define idx12 16 * 4
+%define idx13 16 * 5
+%define idx14 16 * 6
+%define idx15 16 * 7
+%define idx16 16 * 0
+%define idx17 16 * 1
+%define idx18 16 * 2
+%define idx19 16 * 3
+%define idx20 16 * 4
+%define idx21 16 * 5
+%define idx22 16 * 6
+%define idx23 16 * 7
+%define idx24 16 * 0
+%define idx25 16 * 1
+%define idx26 16 * 2
+%define idx27 16 * 3
+%define idx28 16 * 4
+%define idx29 16 * 5
+%define idx30 16 * 6
+%define idx31 16 * 7
+
+; FROM idct32x32_add_neon.asm
+;
+; Instead of doing the transforms stage by stage, it is done by loading
+; some input values and doing as many stages as possible to minimize the
+; storing/loading of intermediate results. To fit within registers, the
+; final coefficients are cut into four blocks:
+; BLOCK A: 16-19,28-31
+; BLOCK B: 20-23,24-27
+; BLOCK C: 8-11,12-15
+; BLOCK D: 0-3,4-7
+; Blocks A and C are straight calculation through the various stages. In
+; block B, further calculations are performed using the results from
+; block A. In block D, further calculations are performed using the results
+; from block C and then the final calculations are done using results from
+; block A and B which have been combined at the end of block B.
+;
+; IDCT32X32_34 %1, %2, %3, %4: byte offsets from stp of the output groups 0-7, 8-15, 16-23 and 24-31
+%macro IDCT32X32_34 4 ; in: m0-m7 = eight transposed rows, m8 = constant forwarded to BUTTERFLY_4X, r4 = spill area
+  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                m11, m1
+  pmulhrsw             m1, [pw___804x2] ; stp1_16
+  mova      [r4 +      0], m0 ; rows 0/2/4/6 are parked at r4 and re-read in blocks C and D via
+  pmulhrsw            m11, [pw_16364x2] ; stp2_31
+  mova      [r4 + 16 * 2], m2 ;   [rsp + transposed_in + ...], so r4 must equal rsp + transposed_in
+  mova                m12, m7
+  pmulhrsw             m7, [pw_15426x2] ; stp1_28
+  mova      [r4 + 16 * 4], m4
+  pmulhrsw            m12, [pw_m5520x2] ; stp2_19
+  mova      [r4 + 16 * 6], m6
+
+  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m2, m1   ; stp1_16
+  mova                 m0, m11  ; stp1_31
+  mova                 m4, m7   ; stp1_28
+  mova                m15, m12  ; stp1_19
+
+  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
+  BUTTERFLY_4Xmm        4,    15,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
+
+  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
+  SUM_SUB               0, 15, 9 ; stp2_17, stp2_18
+  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
+  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
+
+  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          4,    15,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
+  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
+
+  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m6, m5
+  pmulhrsw             m5, [pw__3981x2] ; stp1_20
+  mova [stp + %4 + idx28], m12 ; block A results 28-31 are spilled here to free registers for block B
+  mova [stp + %4 + idx29], m15
+  pmulhrsw             m6, [pw_15893x2] ; stp2_27
+  mova [stp + %4 + idx30], m2
+  mova                 m2, m3
+  pmulhrsw             m3, [pw_m2404x2] ; stp1_23
+  mova [stp + %4 + idx31], m11
+  pmulhrsw             m2, [pw_16207x2] ; stp2_24
+
+  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                m13, m5 ; stp1_20
+  mova                m14, m6 ; stp1_27
+  mova                m15, m3 ; stp1_23
+  mova                m11, m2 ; stp1_24
+
+  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
+  BUTTERFLY_4Xmm       11,    15,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
+
+  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
+  SUM_SUB              15, 14, 9 ; stp2_22, stp2_21
+  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
+  SUM_SUB              11, 13, 9 ; stp2_25, stp2_26
+
+  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
+  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
+
+  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1,  3, 9 ; stp2_16, stp2_23
+  SUM_SUB               0, 15, 9 ; stp2_17, stp2_22
+  SUM_SUB               4, 14, 9 ; stp2_18, stp2_21
+  SUM_SUB               7,  5, 9 ; stp2_19, stp2_20
+  mova [stp + %3 + idx16], m1
+  mova [stp + %3 + idx17], m0
+  mova [stp + %3 + idx18], m4
+  mova [stp + %3 + idx19], m7
+
+  mova                 m4, [stp + %4 + idx28] ; reload the block A values spilled in block B stage 1
+  mova                 m7, [stp + %4 + idx29]
+  mova                m10, [stp + %4 + idx30]
+  mova                m12, [stp + %4 + idx31]
+  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
+  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
+  SUM_SUB              10, 11, 9 ; stp2_30, stp2_25
+  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
+  mova [stp + %4 + idx28], m4
+  mova [stp + %4 + idx29], m7
+  mova [stp + %4 + idx30], m10
+  mova [stp + %4 + idx31], m12
+
+  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               6,  5, 9
+  pmulhrsw             m6, m10  ; stp1_27
+  pmulhrsw             m5, m10  ; stp1_20
+  SUM_SUB              13, 14,  9
+  pmulhrsw            m13, m10  ; stp1_26
+  pmulhrsw            m14, m10  ; stp1_21
+  SUM_SUB              11, 15,  9
+  pmulhrsw            m11, m10  ; stp1_25
+  pmulhrsw            m15, m10  ; stp1_22
+  SUM_SUB               2,  3,  9
+  pmulhrsw             m2, m10  ; stp1_24
+  pmulhrsw             m3, m10  ; stp1_23
+%else
+  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
+  SWAP 6, 5
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
+  SWAP 13, 14
+  BUTTERFLY_4X         11,    15,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
+  SWAP 11, 15
+  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
+  SWAP 2, 3
+%endif
+
+  mova [stp + %4 + idx24], m2
+  mova [stp + %4 + idx25], m11
+  mova [stp + %4 + idx26], m13
+  mova [stp + %4 + idx27], m6
+
+  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  2] ; rows 2 and 6, parked in block A stage 1
+  mova                 m6, [rsp + transposed_in + 16 *  6]
+
+  mova                 m1, m0
+  pmulhrsw             m0, [pw__1606x2] ; stp1_8
+  mova [stp + %3 + idx20], m5  ; block B results 20-23 are stored interleaved with block C's multiplies
+  mova [stp + %3 + idx21], m14
+  pmulhrsw             m1, [pw_16305x2] ; stp2_15
+  mova [stp + %3 + idx22], m15
+  mova                 m7, m6
+  pmulhrsw             m7, [pw_m4756x2] ; stp2_11
+  mova [stp + %3 + idx23], m3
+  pmulhrsw             m6, [pw_15679x2] ; stp1_12
+
+  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m3, m0 ; stp1_8
+  mova                 m2, m1 ; stp1_15
+
+  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
+  mova                 m4, m7 ; stp1_11
+  mova                 m5, m6 ; stp1_12
+  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
+
+  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
+  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
+  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
+  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
+
+  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               5,  4, 9
+  pmulhrsw             m5, m10  ; stp1_13
+  pmulhrsw             m4, m10  ; stp1_10
+  SUM_SUB               6,  7, 9
+  pmulhrsw             m6, m10  ; stp1_12
+  pmulhrsw             m7, m10  ; stp1_11
+%else
+  BUTTERFLY_4X          5,     4,  11585, 11585,  m8,  9,  10 ; stp1_10, stp1_13
+  SWAP 5, 4
+  BUTTERFLY_4X          6,     7,  11585, 11585,  m8,  9,  10 ; stp1_11, stp1_12
+  SWAP 6, 7
+%endif
+
+  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova [stp + %2 +  idx8], m0
+  mova [stp + %2 +  idx9], m2
+  mova [stp + %2 + idx10], m4
+  mova [stp + %2 + idx11], m7
+
+  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                m11, [rsp + transposed_in + 16 *  4] ; row 4, parked in block A stage 1
+  mova                m12, m11
+  pmulhrsw            m11, [pw__3196x2] ; stp1_4
+  pmulhrsw            m12, [pw_16069x2] ; stp1_7
+
+  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  0] ; row 0, parked in block A stage 1
+  mova                m10, [pw_11585x2]
+  pmulhrsw             m0, m10  ; stp1_1
+
+  mova                m14, m11 ; stp1_4
+  mova                m13, m12 ; stp1_7
+
+  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  SUM_SUB              13,   14,  9
+  pmulhrsw            m13, m10  ; stp1_6
+  pmulhrsw            m14, m10  ; stp1_5
+%else
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
+  SWAP 13, 14
+%endif
+  mova                 m7, m0 ; stp1_0 = stp1_1
+  mova                 m4, m0 ; stp1_1
+  mova                 m2, m7 ; stp1_0
+
+  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
+  SUM_SUB               7, 13, 9 ;  stp1_1, stp1_6
+  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
+  SUM_SUB               4, 11, 9 ;  stp1_3, stp1_4
+
+  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  1, 9 ;  stp1_0, stp1_15
+  SUM_SUB               7,  3, 9 ;  stp1_1, stp1_14
+  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
+  SUM_SUB               4,  6, 9 ;  stp1_3, stp1_12
+
+  ; 0-3, 28-31 final stage
+  mova                m15, [stp + %4 + idx30]
+  mova                m10, [stp + %4 + idx31]
+  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
+  SUM_SUB               7, 15, 9 ;  stp1_1, stp1_30
+  mova [stp + %1 +  idx0], m0
+  mova [stp + %1 +  idx1], m7
+  mova [stp + %4 + idx30], m15
+  mova [stp + %4 + idx31], m10
+  mova                 m7, [stp + %4 + idx28]
+  mova                 m0, [stp + %4 + idx29]
+  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
+  SUM_SUB               4,  7, 9 ;  stp1_3, stp1_28
+  mova [stp + %1 +  idx2], m2
+  mova [stp + %1 +  idx3], m4
+  mova [stp + %4 + idx28], m7
+  mova [stp + %4 + idx29], m0
+
+  ; 12-15, 16-19 final stage
+  mova                 m0, [stp + %3 + idx16]
+  mova                 m7, [stp + %3 + idx17]
+  mova                 m2, [stp + %3 + idx18]
+  mova                 m4, [stp + %3 + idx19]
+  SUM_SUB               1,  0, 9 ;  stp1_15, stp1_16
+  SUM_SUB               3,  7, 9 ;  stp1_14, stp1_17
+  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
+  SUM_SUB               6,  4, 9 ;  stp1_12, stp1_19
+  mova [stp + %2 + idx12], m6
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m3
+  mova [stp + %2 + idx15], m1
+  mova [stp + %3 + idx16], m0
+  mova [stp + %3 + idx17], m7
+  mova [stp + %3 + idx18], m2
+  mova [stp + %3 + idx19], m4
+
+  mova                 m4, [stp + %2 +  idx8] ; block C results 8-11, stored in block C stage 7
+  mova                 m5, [stp + %2 +  idx9]
+  mova                 m6, [stp + %2 + idx10]
+  mova                 m7, [stp + %2 + idx11]
+  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
+  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
+  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
+  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
+
+  ; 4-7, 24-27 final stage
+  mova                 m0, [stp + %4 + idx27]
+  mova                 m1, [stp + %4 + idx26]
+  mova                 m2, [stp + %4 + idx25]
+  mova                 m3, [stp + %4 + idx24]
+  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
+  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
+  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
+  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
+  mova [stp + %4 + idx27], m0
+  mova [stp + %4 + idx26], m1
+  mova [stp + %4 + idx25], m2
+  mova [stp + %4 + idx24], m3
+  mova [stp + %1 +  idx4], m11
+  mova [stp + %1 +  idx5], m14
+  mova [stp + %1 +  idx6], m13
+  mova [stp + %1 +  idx7], m12
+
+  ; 8-11, 20-23 final stage
+  mova                 m0, [stp + %3 + idx20]
+  mova                 m1, [stp + %3 + idx21]
+  mova                 m2, [stp + %3 + idx22]
+  mova                 m3, [stp + %3 + idx23]
+  SUM_SUB               7,  0, 9 ;  stp1_11, stp1_20
+  SUM_SUB               6,  1, 9 ;  stp1_10, stp1_21
+  SUM_SUB               5,  2, 9 ;   stp1_9, stp1_22
+  SUM_SUB               4,  3, 9 ;   stp1_8, stp1_23
+  mova [stp + %2 +  idx8], m4
+  mova [stp + %2 +  idx9], m5
+  mova [stp + %2 + idx10], m6
+  mova [stp + %2 + idx11], m7
+  mova [stp + %3 + idx20], m0
+  mova [stp + %3 + idx21], m1
+  mova [stp + %3 + idx22], m2
+  mova [stp + %3 + idx23], m3
+%endmacro
+; RECON_AND_STORE %1: %1 = byte offset from rsp of the coefficient buffer; adds (coef + [pw_32]) >> 6 into 32 rows of 32 output bytes
+%macro RECON_AND_STORE 1
+  mova            m11, [pw_32]               ; added before the >> 6 below (presumably 32 per word, i.e. rounding)
+  lea             stp, [rsp + %1]
+  mov              r6, 32                    ; row counter
+  pxor             m8, m8                    ; zero register for the byte->word unpacks; overwrites the previous m8
+%%recon_and_store:
+  mova             m0, [stp + 16 * 32 * 0]   ; one row = 8 words from each of four regions 16*32 bytes apart
+  mova             m1, [stp + 16 * 32 * 1]
+  mova             m2, [stp + 16 * 32 * 2]
+  mova             m3, [stp + 16 * 32 * 3]
+  add             stp, 16                    ; step to the next row inside every region
+
+  paddw            m0, m11
+  paddw            m1, m11
+  paddw            m2, m11
+  paddw            m3, m11
+  psraw            m0, 6
+  psraw            m1, 6
+  psraw            m2, 6
+  psraw            m3, 6
+  movh             m4, [outputq +  0]        ; 4 x 8 destination bytes of the current output row
+  movh             m5, [outputq +  8]
+  movh             m6, [outputq + 16]
+  movh             m7, [outputq + 24]
+  punpcklbw        m4, m8                    ; widen bytes to words by interleaving with zero
+  punpcklbw        m5, m8
+  punpcklbw        m6, m8
+  punpcklbw        m7, m8
+  paddw            m0, m4
+  paddw            m1, m5
+  paddw            m2, m6
+  paddw            m3, m7
+  packuswb         m0, m1                    ; back to bytes with unsigned saturation
+  packuswb         m2, m3
+  mova [outputq +  0], m0                    ; NOTE(review): mova is presumably the aligned move - confirm output rows are 16-byte aligned
+  mova [outputq + 16], m2
+  lea         outputq, [outputq + strideq]
+  dec              r6
+  jnz %%recon_and_store
+%endmacro
+; Scratch layout (byte offsets from rsp): five 16*32 regions; both passes start at 0, transposed input is the fifth; stp = r8
+%define i32x32_size     16*32*5
+%define pass_two_start  16*32*0
+%define transposed_in   16*32*4
+%define pass_one_start  16*32*0
+%define stp r8
+; idct32x32_34_add(input, output, stride): 32x32 inverse DCT added into output; only the top-left 8x8 of input is read
+INIT_XMM ssse3
+cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride
+  mova            m8, [pd_8192]              ; stays in m8 as the constant handed to every BUTTERFLY_4X
+  lea            stp, [rsp + pass_one_start]
+
+idct32x32_34:
+  mov             r3, inputq
+  lea             r4, [rsp + transposed_in]  ; IDCT32X32_34 parks rows here and reloads them via rsp + transposed_in
+
+idct32x32_34_transpose:
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova            m0, [r3 +       0]         ; first 8 coefficients of rows 0-7; each row is two 16-byte loads
+  packssdw        m0, [r3 +      16]         ;   narrowed by packssdw (signed saturation) to 8 words
+  mova            m1, [r3 + 32 *  4]
+  packssdw        m1, [r3 + 32 *  4 + 16]
+  mova            m2, [r3 + 32 *  8]
+  packssdw        m2, [r3 + 32 *  8 + 16]
+  mova            m3, [r3 + 32 * 12]
+  packssdw        m3, [r3 + 32 * 12 + 16]
+  mova            m4, [r3 + 32 * 16]
+  packssdw        m4, [r3 + 32 * 16 + 16]
+  mova            m5, [r3 + 32 * 20]
+  packssdw        m5, [r3 + 32 * 20 + 16]
+  mova            m6, [r3 + 32 * 24]
+  packssdw        m6, [r3 + 32 * 24 + 16]
+  mova            m7, [r3 + 32 * 28]
+  packssdw        m7, [r3 + 32 * 28 + 16]
+%else
+  mova            m0, [r3 +       0]         ; first 8 words of rows 0-7 (row pitch 16 * 4 = 64 bytes)
+  mova            m1, [r3 + 16 *  4]
+  mova            m2, [r3 + 16 *  8]
+  mova            m3, [r3 + 16 * 12]
+  mova            m4, [r3 + 16 * 16]
+  mova            m5, [r3 + 16 * 20]
+  mova            m6, [r3 + 16 * 24]
+  mova            m7, [r3 + 16 * 28]
+%endif
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  IDCT32X32_34  16*0, 16*32, 16*64, 16*96    ; pass one: outputs land in slots 0-7 of the four 16*32 regions
+  ; dead "lea stp, [stp + 16 * 8]" dropped here: stp is reloaded two lines below before any use
+  mov             r6, 4                      ; pass two runs once per 16*32 region
+  lea            stp, [rsp + pass_one_start]
+  lea             r9, [rsp + pass_one_start]
+
+idct32x32_34_2:
+  lea             r4, [rsp + transposed_in]
+  mov             r3, r9
+
+idct32x32_34_transpose_2:
+  mova            m0, [r3 +      0]          ; the eight pass-one rows of the current region
+  mova            m1, [r3 + 16 * 1]
+  mova            m2, [r3 + 16 * 2]
+  mova            m3, [r3 + 16 * 3]
+  mova            m4, [r3 + 16 * 4]
+  mova            m5, [r3 + 16 * 5]
+  mova            m6, [r3 + 16 * 6]
+  mova            m7, [r3 + 16 * 7]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  IDCT32X32_34  16*0, 16*8, 16*16, 16*24     ; pass two: 32 outputs written back over the same region
+
+  lea            stp, [stp + 16 * 32]        ; advance output and input cursors to the next region
+  add             r9, 16 * 32
+  dec             r6
+  jnz idct32x32_34_2
+
+  RECON_AND_STORE pass_two_start             ; round, add to the destination pixels and store
+
+  RET
+
+%macro IDCT32X32_135 4
+  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m1, [rsp + transposed_in + 16 *  1]
+  mova                m11, m1
+  pmulhrsw             m1, [pw___804x2] ; stp1_16
+  pmulhrsw            m11, [pw_16364x2] ; stp2_31
+
+  mova                 m7, [rsp + transposed_in + 16 *  7]
+  mova                m12, m7
+  pmulhrsw             m7, [pw_15426x2] ; stp1_28
+  pmulhrsw            m12, [pw_m5520x2] ; stp2_19
+
+  mova                 m3, [rsp + transposed_in + 16 *  9]
+  mova                 m4, m3
+  pmulhrsw             m3, [pw__7005x2] ; stp1_18
+  pmulhrsw             m4, [pw_14811x2] ; stp2_29
+
+  mova                 m0, [rsp + transposed_in + 16 * 15]
+  mova                 m2, m0
+  pmulhrsw             m0, [pw_12140x2]  ; stp1_30
+  pmulhrsw             m2, [pw_m11003x2] ; stp2_17
+
+  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1,  2, 9 ; stp2_16, stp2_17
+  SUM_SUB              12,  3, 9 ; stp2_19, stp2_18
+  SUM_SUB               7,  4, 9 ; stp2_28, stp2_29
+  SUM_SUB              11,  0, 9 ; stp2_31, stp2_30
+
+  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
+  BUTTERFLY_4Xmm        4,     3,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
+
+  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
+  SUM_SUB               0,  3, 9 ; stp2_17, stp2_18
+  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
+  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
+
+  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          4,     3,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
+  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
+
+  mova [stp + %3 + idx16], m1
+  mova [stp + %3 + idx17], m0
+  mova [stp + %3 + idx18], m4
+  mova [stp + %3 + idx19], m7
+  mova [stp + %4 + idx28], m12
+  mova [stp + %4 + idx29], m3
+  mova [stp + %4 + idx30], m2
+  mova [stp + %4 + idx31], m11
+
+  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m2, [rsp + transposed_in + 16 *  3]
+  mova                 m3, m2
+  pmulhrsw             m3, [pw_m2404x2] ; stp1_23
+  pmulhrsw             m2, [pw_16207x2] ; stp2_24
+
+  mova                 m5, [rsp + transposed_in + 16 *  5]
+  mova                 m6, m5
+  pmulhrsw             m5, [pw__3981x2] ; stp1_20
+  pmulhrsw             m6, [pw_15893x2] ; stp2_27
+
+  mova                m14, [rsp + transposed_in + 16 * 11]
+  mova                m13, m14
+  pmulhrsw            m13, [pw_m8423x2] ; stp1_21
+  pmulhrsw            m14, [pw_14053x2] ; stp2_26
+
+  mova                 m0, [rsp + transposed_in + 16 * 13]
+  mova                 m1, m0
+  pmulhrsw             m0, [pw__9760x2] ; stp1_22
+  pmulhrsw             m1, [pw_13160x2] ; stp2_25
+
+  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               5, 13, 9 ; stp2_20, stp2_21
+  SUM_SUB               3,  0, 9 ; stp2_23, stp2_22
+  SUM_SUB               2,  1, 9 ; stp2_24, stp2_25
+  SUM_SUB               6, 14, 9 ; stp2_27, stp2_26
+
+  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
+  BUTTERFLY_4Xmm        1,     0,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
+
+  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
+  SUM_SUB               0, 14, 9 ; stp2_22, stp2_21
+  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
+  SUM_SUB               1, 13, 9 ; stp2_25, stp2_26
+
+  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
+  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
+
+  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %3 + idx16]
+  mova                 m7, [stp + %3 + idx17]
+  mova                m11, [stp + %3 + idx18]
+  mova                m12, [stp + %3 + idx19]
+  SUM_SUB               4,  3, 9 ; stp2_16, stp2_23
+  SUM_SUB               7,  0, 9 ; stp2_17, stp2_22
+  SUM_SUB              11, 14, 9 ; stp2_18, stp2_21
+  SUM_SUB              12,  5, 9 ; stp2_19, stp2_20
+  mova [stp + %3 + idx16], m4
+  mova [stp + %3 + idx17], m7
+  mova [stp + %3 + idx18], m11
+  mova [stp + %3 + idx19], m12
+
+  mova                 m4, [stp + %4 + idx28]
+  mova                 m7, [stp + %4 + idx29]
+  mova                m11, [stp + %4 + idx30]
+  mova                m12, [stp + %4 + idx31]
+  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
+  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
+  SUM_SUB              11,  1, 9 ; stp2_30, stp2_25
+  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
+  mova [stp + %4 + idx28], m4
+  mova [stp + %4 + idx29], m7
+  mova [stp + %4 + idx30], m11
+  mova [stp + %4 + idx31], m12
+
+  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               6,  5,  9
+  pmulhrsw             m6, m10  ; stp1_27
+  pmulhrsw             m5, m10  ; stp1_20
+  SUM_SUB              13, 14,  9
+  pmulhrsw            m13, m10  ; stp1_26
+  pmulhrsw            m14, m10  ; stp1_21
+  SUM_SUB               1,  0,  9
+  pmulhrsw             m1, m10  ; stp1_25
+  pmulhrsw             m0, m10  ; stp1_22
+  SUM_SUB               2,  3,  9
+  pmulhrsw             m2, m10  ; stp1_25
+  pmulhrsw             m3, m10  ; stp1_22
+%else
+  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
+  SWAP  6, 5
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
+  SWAP 13, 14
+  BUTTERFLY_4X          1,     0,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
+  SWAP  1, 0
+  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
+  SWAP  2, 3
+%endif
+  mova [stp + %3 + idx20], m5
+  mova [stp + %3 + idx21], m14
+  mova [stp + %3 + idx22], m0
+  mova [stp + %3 + idx23], m3
+  mova [stp + %4 + idx24], m2
+  mova [stp + %4 + idx25], m1
+  mova [stp + %4 + idx26], m13
+  mova [stp + %4 + idx27], m6
+
+  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  2]
+  mova                 m1, m0
+  pmulhrsw             m0, [pw__1606x2] ; stp1_8
+  pmulhrsw             m1, [pw_16305x2] ; stp2_15
+
+  mova                 m6, [rsp + transposed_in + 16 *  6]
+  mova                 m7, m6
+  pmulhrsw             m7, [pw_m4756x2] ; stp2_11
+  pmulhrsw             m6, [pw_15679x2] ; stp1_12
+
+  mova                 m4, [rsp + transposed_in + 16 * 10]
+  mova                 m5, m4
+  pmulhrsw             m4, [pw__7723x2] ; stp1_10
+  pmulhrsw             m5, [pw_14449x2] ; stp2_13
+
+  mova                 m2, [rsp + transposed_in + 16 * 14]
+  mova                 m3, m2
+  pmulhrsw             m3, [pw_m10394x2] ; stp1_9
+  pmulhrsw             m2, [pw_12665x2] ; stp2_14
+
+  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  3, 9 ;  stp1_8, stp1_9
+  SUM_SUB               7,  4, 9 ; stp1_11, stp1_10
+  SUM_SUB               6,  5, 9 ; stp1_12, stp1_13
+  SUM_SUB               1,  2, 9 ; stp1_15, stp1_14
+
+  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
+  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
+
+  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
+  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
+  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
+  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
+
+  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               5,    4,  9
+  pmulhrsw             m5, m10  ; stp1_13
+  pmulhrsw             m4, m10  ; stp1_10
+  SUM_SUB               6,    7,  9
+  pmulhrsw             m6, m10  ; stp1_12
+  pmulhrsw             m7, m10  ; stp1_11
+%else
+  BUTTERFLY_4X       5,     4,  11585,  11585,  m8,  9,  10 ; stp1_10, stp1_13
+  SWAP  5, 4
+  BUTTERFLY_4X       6,     7,  11585,  11585,  m8,  9,  10 ; stp1_11, stp1_12
+  SWAP  6, 7
+%endif
+  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova [stp + %2 +  idx8], m0
+  mova [stp + %2 +  idx9], m2
+  mova [stp + %2 + idx10], m4
+  mova [stp + %2 + idx11], m7
+  mova [stp + %2 + idx12], m6
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m3
+  mova [stp + %2 + idx15], m1
+
+  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                m11, [rsp + transposed_in + 16 *  4]
+  mova                m12, m11
+  pmulhrsw            m11, [pw__3196x2] ; stp1_4
+  pmulhrsw            m12, [pw_16069x2] ; stp1_7
+
+  mova                m13, [rsp + transposed_in + 16 * 12]
+  mova                m14, m13
+  pmulhrsw            m13, [pw_13623x2] ; stp1_6
+  pmulhrsw            m14, [pw_m9102x2] ; stp1_5
+
+  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  0]
+  mova                 m2, [rsp + transposed_in + 16 *  8]
+  pmulhrsw             m0, [pw_11585x2]  ; stp1_1
+  mova                 m3, m2
+  pmulhrsw             m2, [pw__6270x2]  ; stp1_2
+  pmulhrsw             m3, [pw_15137x2]  ; stp1_3
+
+  SUM_SUB              11, 14, 9 ;  stp1_4, stp1_5
+  SUM_SUB              12, 13, 9 ;  stp1_7, stp1_6
+
+  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB              13,   14,  9
+  pmulhrsw            m13, m10  ; stp1_6
+  pmulhrsw            m14, m10  ; stp1_5
+%else
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
+  SWAP 13, 14
+%endif
+  mova                 m1, m0    ; stp1_0 = stp1_1
+  SUM_SUB               0,  3, 9 ;  stp1_0, stp1_3
+  SUM_SUB               1,  2, 9 ;  stp1_1, stp1_2
+
+  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
+  SUM_SUB               1, 13, 9 ;  stp1_1, stp1_6
+  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
+  SUM_SUB               3, 11, 9 ;  stp1_3, stp1_4
+
+  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %2 + idx12]
+  mova                 m5, [stp + %2 + idx13]
+  mova                 m6, [stp + %2 + idx14]
+  mova                 m7, [stp + %2 + idx15]
+  SUM_SUB               0,  7, 9 ;  stp1_0, stp1_15
+  SUM_SUB               1,  6, 9 ;  stp1_1, stp1_14
+  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
+  SUM_SUB               3,  4, 9 ;  stp1_3, stp1_12
+
+  ; 0-3, 28-31 final stage
+  mova                m10, [stp + %4 + idx31]
+  mova                m15, [stp + %4 + idx30]
+  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
+  SUM_SUB               1, 15, 9 ;  stp1_1, stp1_30
+  mova [stp + %1 +  idx0], m0
+  mova [stp + %1 +  idx1], m1
+  mova [stp + %4 + idx31], m10
+  mova [stp + %4 + idx30], m15
+  mova                 m0, [stp + %4 + idx29]
+  mova                 m1, [stp + %4 + idx28]
+  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
+  SUM_SUB               3,  1, 9 ;  stp1_3, stp1_28
+  mova [stp + %1 +  idx2], m2
+  mova [stp + %1 +  idx3], m3
+  mova [stp + %4 + idx29], m0
+  mova [stp + %4 + idx28], m1
+
+  ; 12-15, 16-19 final stage
+  mova                 m0, [stp + %3 + idx16]
+  mova                 m1, [stp + %3 + idx17]
+  mova                 m2, [stp + %3 + idx18]
+  mova                 m3, [stp + %3 + idx19]
+  SUM_SUB               7,  0, 9 ;  stp1_15, stp1_16
+  SUM_SUB               6,  1, 9 ;  stp1_14, stp1_17
+  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
+  SUM_SUB               4,  3, 9 ;  stp1_12, stp1_19
+  mova [stp + %2 + idx12], m4
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m6
+  mova [stp + %2 + idx15], m7
+  mova [stp + %3 + idx16], m0
+  mova [stp + %3 + idx17], m1
+  mova [stp + %3 + idx18], m2
+  mova [stp + %3 + idx19], m3
+
+  mova                 m4, [stp + %2 +  idx8]
+  mova                 m5, [stp + %2 +  idx9]
+  mova                 m6, [stp + %2 + idx10]
+  mova                 m7, [stp + %2 + idx11]
+  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
+  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
+  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
+  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
+
+  ; 4-7, 24-27 final stage
+  mova                 m3, [stp + %4 + idx24]
+  mova                 m2, [stp + %4 + idx25]
+  mova                 m1, [stp + %4 + idx26]
+  mova                 m0, [stp + %4 + idx27]
+  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
+  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
+  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
+  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
+  mova [stp + %4 + idx24], m3
+  mova [stp + %4 + idx25], m2
+  mova [stp + %4 + idx26], m1
+  mova [stp + %4 + idx27], m0
+  mova [stp + %1 +  idx4], m11
+  mova [stp + %1 +  idx5], m14
+  mova [stp + %1 +  idx6], m13
+  mova [stp + %1 +  idx7], m12
+
+  ; 8-11, 20-23 final stage
+  mova                 m0, [stp + %3 + idx20]
+  mova                 m1, [stp + %3 + idx21]
+  mova                 m2, [stp + %3 + idx22]
+  mova                 m3, [stp + %3 + idx23]
+  SUM_SUB               7,  0, 9 ;  stp1_11, stp_20
+  SUM_SUB               6,  1, 9 ;  stp1_10, stp_21
+  SUM_SUB               5,  2, 9 ;   stp1_9, stp_22
+  SUM_SUB               4,  3, 9 ;   stp1_8, stp_23
+  mova [stp + %2 +  idx8], m4
+  mova [stp + %2 +  idx9], m5
+  mova [stp + %2 + idx10], m6
+  mova [stp + %2 + idx11], m7
+  mova [stp + %3 + idx20], m0
+  mova [stp + %3 + idx21], m1
+  mova [stp + %3 + idx22], m2
+  mova [stp + %3 + idx23], m3
+%endmacro
+
+INIT_XMM ssse3
+cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
+  mova            m8, [pd_8192]                ; m8 stays live: IDCT32X32_135 hands it to BUTTERFLY_4X
+  mov             r6, 2                        ; first pass runs 2 outer iterations
+  lea            stp, [rsp + pass_one_start]   ; stp = base address the macro stores its results to
+  ; ---- first pass: source is inputq, results are written relative to stp ----
+idct32x32_135:
+  mov             r3, inputq                   ; r3 = read cursor for this iteration
+  lea             r4, [rsp + transposed_in]    ; r4 = write cursor in the transposed_in scratch area
+  mov             r7, 2                        ; two 8x8 transposes fill 16 rows (16 bytes each) of transposed_in
+
+idct32x32_135_transpose:
+%if CONFIG_VP9_HIGHBITDEPTH  ; two 16-byte loads per register, packed dword->word with signed saturation
+  mova            m0, [r3 +       0]
+  packssdw        m0, [r3 +      16]
+  mova            m1, [r3 + 32 *  4]
+  packssdw        m1, [r3 + 32 *  4 + 16]
+  mova            m2, [r3 + 32 *  8]
+  packssdw        m2, [r3 + 32 *  8 + 16]
+  mova            m3, [r3 + 32 * 12]
+  packssdw        m3, [r3 + 32 * 12 + 16]
+  mova            m4, [r3 + 32 * 16]
+  packssdw        m4, [r3 + 32 * 16 + 16]
+  mova            m5, [r3 + 32 * 20]
+  packssdw        m5, [r3 + 32 * 20 + 16]
+  mova            m6, [r3 + 32 * 24]
+  packssdw        m6, [r3 + 32 * 24 + 16]
+  mova            m7, [r3 + 32 * 28]
+  packssdw        m7, [r3 + 32 * 28 + 16]
+%else
+  mova            m0, [r3 +       0]           ; one 16-byte load per register, sources 16 * 4 bytes apart
+  mova            m1, [r3 + 16 *  4]
+  mova            m2, [r3 + 16 *  8]
+  mova            m3, [r3 + 16 * 12]
+  mova            m4, [r3 + 16 * 16]
+  mova            m5, [r3 + 16 * 20]
+  mova            m6, [r3 + 16 * 24]
+  mova            m7, [r3 + 16 * 28]
+%endif
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9      ; macro defined outside this view; last argument presumably a scratch register -- confirm
+
+  mova [r4 +      0], m0                       ; the 8 results go to 8 consecutive 16-byte rows
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  add             r3, 32                       ; skip the 32 source bytes consumed per row (before packing)
+%else
+  add             r3, 16                       ; skip the 16 source bytes consumed per row
+%endif
+  add             r4, 16 * 8                   ; next 8 rows of transposed_in
+  dec             r7
+  jne idct32x32_135_transpose
+
+  IDCT32X32_135 16*0, 16*32, 16*64, 16*96      ; offsets from stp for outputs idx0-7, idx8-15, idx16-23, idx24-31
+  lea            stp, [stp + 16 * 8]           ; next iteration stores 8 rows further on
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea         inputq, [inputq + 32 * 32]       ; advance the source by 32 * 32 bytes
+%else
+  lea         inputq, [inputq + 16 * 32]       ; advance the source by 16 * 32 bytes
+%endif
+  dec             r6
+  jnz idct32x32_135
+
+  mov             r6, 4                        ; second pass runs 4 outer iterations
+  lea            stp, [rsp + pass_one_start]   ; store base is reset to the start of the first-pass area
+  lea             r9, [rsp + pass_one_start]   ; r9 = second-pass read base (same starting address)
+
+idct32x32_135_2:
+  lea             r4, [rsp + transposed_in]
+  mov             r3, r9
+  mov             r7, 2                        ; again two 8x8 transposes per iteration
+
+idct32x32_135_transpose_2:
+  mova            m0, [r3 +      0]            ; second pass reads 8 consecutive 16-byte rows
+  mova            m1, [r3 + 16 * 1]
+  mova            m2, [r3 + 16 * 2]
+  mova            m3, [r3 + 16 * 3]
+  mova            m4, [r3 + 16 * 4]
+  mova            m5, [r3 + 16 * 5]
+  mova            m6, [r3 + 16 * 6]
+  mova            m7, [r3 + 16 * 7]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova [r4 +      0], m0
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+
+  add             r3, 16 * 8
+  add             r4, 16 * 8
+  dec             r7
+  jne idct32x32_135_transpose_2
+
+  IDCT32X32_135 16*0, 16*8, 16*16, 16*24       ; second pass uses an 8-row spacing between the four output groups
+
+  lea            stp, [stp + 16 * 32]          ; store base and read base both move on by 32 rows
+  add             r9, 16 * 32
+  dec             r6
+  jnz idct32x32_135_2
+
+  RECON_AND_STORE pass_two_start               ; macro defined outside this view; presumably adds the result to the output block -- confirm
+
+  RET
+
+%macro IDCT32X32_1024 4  ; %1-%4: byte offsets added to stp for outputs idx0-7, idx8-15, idx16-23, idx24-31
+  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m1, [rsp + transposed_in + 16 *  1]  ; block A consumes transposed_in rows 1, 31, 15, 17, 7, 25, 9, 23
+  mova                m11, [rsp + transposed_in + 16 * 31]
+  BUTTERFLY_4X          1,    11,    804, 16364,  m8,  9,  10 ; stp1_16, stp1_31
+
+  mova                 m0, [rsp + transposed_in + 16 * 15]
+  mova                 m2, [rsp + transposed_in + 16 * 17]
+  BUTTERFLY_4X          2,     0,  12140, 11003,  m8,  9,  10 ; stp1_17, stp1_30
+
+  mova                 m7, [rsp + transposed_in + 16 *  7]
+  mova                m12, [rsp + transposed_in + 16 * 25]
+  BUTTERFLY_4X         12,     7,  15426,  5520,  m8,  9,  10 ; stp1_19, stp1_28
+
+  mova                 m3, [rsp + transposed_in + 16 *  9]
+  mova                 m4, [rsp + transposed_in + 16 * 23]
+  BUTTERFLY_4X          3,     4,   7005, 14811,  m8,  9,  10 ; stp1_18, stp1_29
+
+  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1,  2, 9 ; stp2_16, stp2_17
+  SUM_SUB              12,  3, 9 ; stp2_19, stp2_18
+  SUM_SUB               7,  4, 9 ; stp2_28, stp2_29
+  SUM_SUB              11,  0, 9 ; stp2_31, stp2_30
+
+  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
+  BUTTERFLY_4Xmm        4,     3,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
+
+  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
+  SUM_SUB               0,  3, 9 ; stp2_17, stp2_18
+  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
+  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
+
+  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          4,     3,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
+  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
+
+  mova [stp + %3 + idx16], m1   ; park block A results in the output area; block B stage 6 reloads idx16-19 and idx28-31
+  mova [stp + %3 + idx17], m0
+  mova [stp + %3 + idx18], m4
+  mova [stp + %3 + idx19], m7
+  mova [stp + %4 + idx28], m12
+  mova [stp + %4 + idx29], m3
+  mova [stp + %4 + idx30], m2
+  mova [stp + %4 + idx31], m11
+
+  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m5, [rsp + transposed_in + 16 *  5]  ; block B consumes transposed_in rows 5, 27, 21, 11, 13, 19, 3, 29
+  mova                 m6, [rsp + transposed_in + 16 * 27]
+  BUTTERFLY_4X          5,     6,   3981, 15893,  m8,  9,  10 ; stp1_20, stp1_27
+
+  mova                m13, [rsp + transposed_in + 16 * 21]
+  mova                m14, [rsp + transposed_in + 16 * 11]
+  BUTTERFLY_4X         13,    14,  14053,  8423,  m8,  9,  10 ; stp1_21, stp1_26
+
+  mova                 m0, [rsp + transposed_in + 16 * 13]
+  mova                 m1, [rsp + transposed_in + 16 * 19]
+  BUTTERFLY_4X          0,     1,   9760, 13160,  m8,  9,  10 ; stp1_22, stp1_25
+
+  mova                 m2, [rsp + transposed_in + 16 *  3]
+  mova                 m3, [rsp + transposed_in + 16 * 29]
+  BUTTERFLY_4X          3,     2,  16207,  2404,  m8,  9,  10 ; stp1_23, stp1_24
+
+  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               5, 13, 9 ; stp2_20, stp2_21
+  SUM_SUB               3,  0, 9 ; stp2_23, stp2_22
+  SUM_SUB               2,  1, 9 ; stp2_24, stp2_25
+  SUM_SUB               6, 14, 9 ; stp2_27, stp2_26
+
+  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
+  BUTTERFLY_4Xmm        1,     0,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
+
+  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
+  SUM_SUB               0, 14, 9 ; stp2_22, stp2_21
+  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
+  SUM_SUB               1, 13, 9 ; stp2_25, stp2_26
+
+  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
+  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
+
+  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %3 + idx16]  ; reload the block A results parked earlier
+  mova                 m7, [stp + %3 + idx17]
+  mova                m11, [stp + %3 + idx18]
+  mova                m12, [stp + %3 + idx19]
+  SUM_SUB               4,  3, 9 ; stp2_16, stp2_23
+  SUM_SUB               7,  0, 9 ; stp2_17, stp2_22
+  SUM_SUB              11, 14, 9 ; stp2_18, stp2_21
+  SUM_SUB              12,  5, 9 ; stp2_19, stp2_20
+  mova [stp + %3 + idx16], m4
+  mova [stp + %3 + idx17], m7
+  mova [stp + %3 + idx18], m11
+  mova [stp + %3 + idx19], m12
+
+  mova                 m4, [stp + %4 + idx28]
+  mova                 m7, [stp + %4 + idx29]
+  mova                m11, [stp + %4 + idx30]
+  mova                m12, [stp + %4 + idx31]
+  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
+  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
+  SUM_SUB              11,  1, 9 ; stp2_30, stp2_25
+  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
+  mova [stp + %4 + idx28], m4
+  mova [stp + %4 + idx29], m7
+  mova [stp + %4 + idx30], m11
+  mova [stp + %4 + idx31], m12
+
+  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               6,  5,  9
+  pmulhrsw             m6, m10  ; stp1_27
+  pmulhrsw             m5, m10  ; stp1_20
+  SUM_SUB              13, 14,  9
+  pmulhrsw            m13, m10  ; stp1_26
+  pmulhrsw            m14, m10  ; stp1_21
+  SUM_SUB               1,  0,  9
+  pmulhrsw             m1, m10  ; stp1_25
+  pmulhrsw             m0, m10  ; stp1_22
+  SUM_SUB               2,  3,  9
+  pmulhrsw             m2, m10  ; stp1_24
+  pmulhrsw             m3, m10  ; stp1_23
+%else
+  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
+  SWAP  6, 5
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
+  SWAP 13, 14
+  BUTTERFLY_4X          1,     0,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
+  SWAP  1, 0
+  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
+  SWAP  2, 3
+%endif
+  mova [stp + %3 + idx20], m5
+  mova [stp + %3 + idx21], m14
+  mova [stp + %3 + idx22], m0
+  mova [stp + %3 + idx23], m3
+  mova [stp + %4 + idx24], m2
+  mova [stp + %4 + idx25], m1
+  mova [stp + %4 + idx26], m13
+  mova [stp + %4 + idx27], m6
+
+  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  2]  ; block C consumes transposed_in rows 2, 30, 14, 18, 10, 22, 6, 26
+  mova                 m1, [rsp + transposed_in + 16 * 30]
+  BUTTERFLY_4X          0,     1,   1606, 16305,  m8,  9,  10 ; stp1_8, stp1_15
+
+  mova                 m2, [rsp + transposed_in + 16 * 14]
+  mova                 m3, [rsp + transposed_in + 16 * 18]
+  BUTTERFLY_4X          3,     2,  12665, 10394,  m8,  9,  10 ; stp1_9, stp1_14
+
+  mova                 m4, [rsp + transposed_in + 16 * 10]
+  mova                 m5, [rsp + transposed_in + 16 * 22]
+  BUTTERFLY_4X          4,     5,   7723, 14449,  m8,  9,  10 ; stp1_10, stp1_13
+
+  mova                 m6, [rsp + transposed_in + 16 *  6]
+  mova                 m7, [rsp + transposed_in + 16 * 26]
+  BUTTERFLY_4X          7,     6,  15679,  4756,  m8,  9,  10 ; stp1_11, stp1_12
+
+  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  3, 9 ;  stp1_8, stp1_9
+  SUM_SUB               7,  4, 9 ; stp1_11, stp1_10
+  SUM_SUB               6,  5, 9 ; stp1_12, stp1_13
+  SUM_SUB               1,  2, 9 ; stp1_15, stp1_14
+
+  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
+  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
+
+  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
+  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
+  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
+  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
+
+  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               5,    4,  9
+  pmulhrsw             m5, m10  ; stp1_13
+  pmulhrsw             m4, m10  ; stp1_10
+  SUM_SUB               6,    7,  9
+  pmulhrsw             m6, m10  ; stp1_12
+  pmulhrsw             m7, m10  ; stp1_11
+%else
+  BUTTERFLY_4X       5,     4,  11585,  11585,  m8,  9,  10 ; stp1_10, stp1_13
+  SWAP  5, 4
+  BUTTERFLY_4X       6,     7,  11585,  11585,  m8,  9,  10 ; stp1_11, stp1_12
+  SWAP  6, 7
+%endif
+  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova [stp + %2 +  idx8], m0   ; park block C results; block D stage 7 and the final stages reload idx8-15
+  mova [stp + %2 +  idx9], m2
+  mova [stp + %2 + idx10], m4
+  mova [stp + %2 + idx11], m7
+  mova [stp + %2 + idx12], m6
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m3
+  mova [stp + %2 + idx15], m1
+
+  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                m11, [rsp + transposed_in + 16 *  4]  ; block D consumes transposed_in rows 4, 28, 12, 20, 0, 16, 8, 24
+  mova                m12, [rsp + transposed_in + 16 * 28]
+  BUTTERFLY_4X         11,    12,   3196, 16069,  m8,  9,  10 ; stp1_4, stp1_7
+
+  mova                m13, [rsp + transposed_in + 16 * 12]
+  mova                m14, [rsp + transposed_in + 16 * 20]
+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_5, stp1_6
+
+  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  0]
+  mova                 m1, [rsp + transposed_in + 16 * 16]
+
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               0,    1,  9
+  pmulhrsw             m0, m10  ; stp1_1
+  pmulhrsw             m1, m10  ; stp1_0
+%else
+  BUTTERFLY_4X          0,     1,  11585, 11585,  m8,  9,  10 ; stp1_1, stp1_0
+  SWAP  0, 1
+%endif
+  mova                 m2, [rsp + transposed_in + 16 *  8]
+  mova                 m3, [rsp + transposed_in + 16 * 24]
+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_2, stp1_3
+
+  mova                m10, [pw_11585x2]  ; NOTE(review): only read explicitly by the disabled %if 0 path of stage 5; 10 is also given to BUTTERFLY_4X as its last argument -- confirm whether this load is still needed
+  SUM_SUB              11, 14, 9 ;  stp1_4, stp1_5
+  SUM_SUB              12, 13, 9 ;  stp1_7, stp1_6
+
+  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  SUM_SUB              13,   14,  9
+  pmulhrsw            m13, m10  ; stp1_6
+  pmulhrsw            m14, m10  ; stp1_5
+%else
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
+  SWAP 13, 14
+%endif
+  SUM_SUB               0,  3, 9 ;  stp1_0, stp1_3
+  SUM_SUB               1,  2, 9 ;  stp1_1, stp1_2
+
+  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
+  SUM_SUB               1, 13, 9 ;  stp1_1, stp1_6
+  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
+  SUM_SUB               3, 11, 9 ;  stp1_3, stp1_4
+
+  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %2 + idx12]  ; reload the block C results parked earlier
+  mova                 m5, [stp + %2 + idx13]
+  mova                 m6, [stp + %2 + idx14]
+  mova                 m7, [stp + %2 + idx15]
+  SUM_SUB               0,  7, 9 ;  stp1_0, stp1_15
+  SUM_SUB               1,  6, 9 ;  stp1_1, stp1_14
+  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
+  SUM_SUB               3,  4, 9 ;  stp1_3, stp1_12
+
+  ; 0-3, 28-31 final stage
+  mova                m10, [stp + %4 + idx31]
+  mova                m15, [stp + %4 + idx30]
+  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
+  SUM_SUB               1, 15, 9 ;  stp1_1, stp1_30
+  mova [stp + %1 +  idx0], m0
+  mova [stp + %1 +  idx1], m1
+  mova [stp + %4 + idx31], m10
+  mova [stp + %4 + idx30], m15
+  mova                 m0, [stp + %4 + idx29]  ; m0/m1 were just stored, so they are reused as the idx29/idx28 operands
+  mova                 m1, [stp + %4 + idx28]
+  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
+  SUM_SUB               3,  1, 9 ;  stp1_3, stp1_28
+  mova [stp + %1 +  idx2], m2
+  mova [stp + %1 +  idx3], m3
+  mova [stp + %4 + idx29], m0
+  mova [stp + %4 + idx28], m1
+
+  ; 12-15, 16-19 final stage
+  mova                 m0, [stp + %3 + idx16]
+  mova                 m1, [stp + %3 + idx17]
+  mova                 m2, [stp + %3 + idx18]
+  mova                 m3, [stp + %3 + idx19]
+  SUM_SUB               7,  0, 9 ;  stp1_15, stp1_16
+  SUM_SUB               6,  1, 9 ;  stp1_14, stp1_17
+  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
+  SUM_SUB               4,  3, 9 ;  stp1_12, stp1_19
+  mova [stp + %2 + idx12], m4
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m6
+  mova [stp + %2 + idx15], m7
+  mova [stp + %3 + idx16], m0
+  mova [stp + %3 + idx17], m1
+  mova [stp + %3 + idx18], m2
+  mova [stp + %3 + idx19], m3
+
+  mova                 m4, [stp + %2 +  idx8]
+  mova                 m5, [stp + %2 +  idx9]
+  mova                 m6, [stp + %2 + idx10]
+  mova                 m7, [stp + %2 + idx11]
+  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
+  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
+  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
+  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
+
+  ; 4-7, 24-27 final stage
+  mova                 m3, [stp + %4 + idx24]
+  mova                 m2, [stp + %4 + idx25]
+  mova                 m1, [stp + %4 + idx26]
+  mova                 m0, [stp + %4 + idx27]
+  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
+  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
+  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
+  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
+  mova [stp + %4 + idx24], m3
+  mova [stp + %4 + idx25], m2
+  mova [stp + %4 + idx26], m1
+  mova [stp + %4 + idx27], m0
+  mova [stp + %1 +  idx4], m11
+  mova [stp + %1 +  idx5], m14
+  mova [stp + %1 +  idx6], m13
+  mova [stp + %1 +  idx7], m12
+
+  ; 8-11, 20-23 final stage
+  mova                 m0, [stp + %3 + idx20]
+  mova                 m1, [stp + %3 + idx21]
+  mova                 m2, [stp + %3 + idx22]
+  mova                 m3, [stp + %3 + idx23]
+  SUM_SUB               7,  0, 9 ;  stp1_11, stp_20
+  SUM_SUB               6,  1, 9 ;  stp1_10, stp_21
+  SUM_SUB               5,  2, 9 ;   stp1_9, stp_22
+  SUM_SUB               4,  3, 9 ;   stp1_8, stp_23
+  mova [stp + %2 +  idx8], m4
+  mova [stp + %2 +  idx9], m5
+  mova [stp + %2 + idx10], m6
+  mova [stp + %2 + idx11], m7
+  mova [stp + %3 + idx20], m0
+  mova [stp + %3 + idx21], m1
+  mova [stp + %3 + idx22], m2
+  mova [stp + %3 + idx23], m3
+%endmacro
+
+INIT_XMM ssse3
+cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
+  mova            m8, [pd_8192]                ; m8 stays live: IDCT32X32_1024 passes it to every BUTTERFLY_4X
+  mov             r6, 4                        ; first pass runs 4 outer iterations
+  lea            stp, [rsp + pass_one_start]   ; stp = base address the macro stores its results to
+  ; ---- first pass: source is inputq, results are written relative to stp ----
+idct32x32_1024:
+  mov             r3, inputq                   ; r3 = read cursor for this iteration
+  lea             r4, [rsp + transposed_in]    ; r4 = write cursor in the transposed_in scratch area
+  mov             r7, 4                        ; four 8x8 transposes fill all 32 rows the macro reads
+
+idct32x32_1024_transpose:
+%if CONFIG_VP9_HIGHBITDEPTH  ; two 16-byte loads per register, packed dword->word with signed saturation
+  mova            m0, [r3 +       0]
+  packssdw        m0, [r3 +      16]
+  mova            m1, [r3 + 32 *  4]
+  packssdw        m1, [r3 + 32 *  4 + 16]
+  mova            m2, [r3 + 32 *  8]
+  packssdw        m2, [r3 + 32 *  8 + 16]
+  mova            m3, [r3 + 32 * 12]
+  packssdw        m3, [r3 + 32 * 12 + 16]
+  mova            m4, [r3 + 32 * 16]
+  packssdw        m4, [r3 + 32 * 16 + 16]
+  mova            m5, [r3 + 32 * 20]
+  packssdw        m5, [r3 + 32 * 20 + 16]
+  mova            m6, [r3 + 32 * 24]
+  packssdw        m6, [r3 + 32 * 24 + 16]
+  mova            m7, [r3 + 32 * 28]
+  packssdw        m7, [r3 + 32 * 28 + 16]
+%else
+  mova            m0, [r3 +       0]           ; one 16-byte load per register, sources 16 * 4 bytes apart
+  mova            m1, [r3 + 16 *  4]
+  mova            m2, [r3 + 16 *  8]
+  mova            m3, [r3 + 16 * 12]
+  mova            m4, [r3 + 16 * 16]
+  mova            m5, [r3 + 16 * 20]
+  mova            m6, [r3 + 16 * 24]
+  mova            m7, [r3 + 16 * 28]
+%endif
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9      ; macro defined outside this view; last argument presumably a scratch register -- confirm
+
+  mova [r4 +      0], m0                       ; the 8 results go to 8 consecutive 16-byte rows
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+%if CONFIG_VP9_HIGHBITDEPTH
+  add             r3, 32                       ; skip the 32 source bytes consumed per row (before packing)
+%else
+  add             r3, 16                       ; skip the 16 source bytes consumed per row
+%endif
+  add             r4, 16 * 8                   ; next 8 rows of transposed_in
+  dec             r7
+  jne idct32x32_1024_transpose
+
+  IDCT32X32_1024 16*0, 16*32, 16*64, 16*96     ; offsets from stp for outputs idx0-7, idx8-15, idx16-23, idx24-31
+
+  lea            stp, [stp + 16 * 8]           ; next iteration stores 8 rows further on
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea         inputq, [inputq + 32 * 32]       ; advance the source by 32 * 32 bytes
+%else
+  lea         inputq, [inputq + 16 * 32]       ; advance the source by 16 * 32 bytes
+%endif
+  dec             r6
+  jnz idct32x32_1024
+
+  mov             r6, 4                        ; second pass also runs 4 outer iterations
+  lea            stp, [rsp + pass_one_start]   ; store base is reset to the start of the first-pass area
+  lea             r9, [rsp + pass_one_start]   ; r9 = second-pass read base (same starting address)
+
+idct32x32_1024_2:
+  lea             r4, [rsp + transposed_in]
+  mov             r3, r9
+  mov             r7, 4                        ; again four 8x8 transposes per iteration
+
+idct32x32_1024_transpose_2:
+  mova            m0, [r3 +      0]            ; second pass reads 8 consecutive 16-byte rows
+  mova            m1, [r3 + 16 * 1]
+  mova            m2, [r3 + 16 * 2]
+  mova            m3, [r3 + 16 * 3]
+  mova            m4, [r3 + 16 * 4]
+  mova            m5, [r3 + 16 * 5]
+  mova            m6, [r3 + 16 * 6]
+  mova            m7, [r3 + 16 * 7]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova [r4 +      0], m0
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+
+  add             r3, 16 * 8
+  add             r4, 16 * 8
+  dec             r7
+  jne idct32x32_1024_transpose_2
+
+  IDCT32X32_1024 16*0, 16*8, 16*16, 16*24      ; second pass uses an 8-row spacing between the four output groups
+
+  lea            stp, [stp + 16 * 32]          ; store base and read base both move on by 32 rows
+  add             r9, 16 * 32
+  dec             r6
+  jnz idct32x32_1024_2
+
+  RECON_AND_STORE pass_two_start               ; macro defined outside this view; presumably adds the result to the output block -- confirm
+
+  RET
+%endif
diff --git a/libs/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/libs/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
new file mode 100644
index 0000000000..fbbcd76bd7
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
@@ -0,0 +1,109 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro REORDER_INPUTS 0  ; rename m1-m3 so the four vectors sit in the order TRANSFORM_COLS expects
+  ; on entry m0-m3 hold  a c d b;  after the SWAP they hold  a b c d
+  SWAP 1, 3, 2  ; SWAP is not defined in this file (x86inc.asm is included above); presumably a pure register renaming -- confirm
+%endmacro
+
+%macro TRANSFORM_COLS 0  ; one pass of the 4-point transform on m0-m3 (a, b, c, d), results left in m0-m3
+  ; input:
+  ; m0 a
+  ; m1 b
+  ; m2 c
+  ; m3 d
+  paddw           m0,        m2             ; a += c
+  psubw           m3,        m1             ; d -= b
+
+  ; wide subtract: e = (a - d) >> 1, computed in 32 bits on the low four words
+  punpcklwd       m4,        m0             ; a's words land in the upper 16 bits of each dword (lower 16 bits are stale m4 data)
+  punpcklwd       m5,        m3             ; same for d
+  psrad           m4,        16             ; arithmetic shift discards the stale halves and sign-extends a
+  psrad           m5,        16             ; ... and d
+  psubd           m4,        m5
+  psrad           m4,        1
+  packssdw        m4,        m4             ; e
+
+  psubw           m5,        m4,        m1  ; b  (= e - old b; 3-operand form, non-destructive on m4)
+  psubw           m4,        m2             ; c  (= e - old c)
+  psubw           m0,        m5             ; a -= new b
+  paddw           m3,        m4             ; d += new c
+                                ; m0 a
+  SWAP            1,         5  ; m1 b
+  SWAP            2,         4  ; m2 c
+                                ; m3 d
+%endmacro
+
+%macro TRANSPOSE_4X4 0  ; transpose the 4x4 words held in the low halves of m0-m3; result again in the low halves of m0-m3
+  punpcklwd       m0,        m2             ; interleave the low words of m0 and m2
+  punpcklwd       m1,        m3             ; interleave the low words of m1 and m3
+  mova            m2,        m0             ; keep a copy before m0 is overwritten
+  punpcklwd       m0,        m1             ; m0 = output vectors 0 (low half) and 1 (high half)
+  punpckhwd       m2,        m1             ; m2 = output vectors 2 (low half) and 3 (high half)
+  pshufd          m1,        m0, 0x0e       ; 0x0e copies the high 64 bits of m0 into the low 64 bits of m1
+  pshufd          m3,        m2, 0x0e       ; likewise high half of m2 -> low half of m3
+%endmacro
+
+; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0  ; input: 8 words in m0 and 8 words in m1 (two matrix rows per register)
+  mova            m3, m0                    ; copy m0 so both its halves can be interleaved with m1
+  punpcklwd       m0, m1                    ; low words of m0 interleaved with low words of m1
+  punpckhwd       m3, m1                    ; high words of m0 interleaved with high words of m1
+  mova            m2, m0                    ; keep a copy before m0 is overwritten
+  punpcklwd       m0, m3                    ; m0 = output vectors 0 (low half) and 1 (high half)
+  punpckhwd       m2, m3                    ; m2 = output vectors 2 (low half) and 3 (high half)
+  pshufd          m1, m0, 0x0e              ; 0x0e copies the high 64 bits of m0 into the low 64 bits of m1
+  pshufd          m3, m2, 0x0e              ; likewise high half of m2 -> low half of m3
+%endmacro
+
+%macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero
+  movd            m%3,       [outputq]             ; load 4 bytes from the row at outputq
+  movd            m%4,       [outputq + strideq]   ; load 4 bytes from the following row
+  punpcklbw       m%3,       m%5                   ; widen bytes to words by interleaving with the zero register
+  punpcklbw       m%4,       m%5
+  paddw           m%1,       m%3                   ; add the loaded bytes to src1 / src2
+  paddw           m%2,       m%4
+  packuswb        m%1,       m%5                   ; narrow back to bytes with unsigned saturation
+  packuswb        m%2,       m%5
+  movd            [outputq], m%1                   ; write 4 bytes back to each of the two rows
+  movd            [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+%if CONFIG_VP9_HIGHBITDEPTH  ; 64 input bytes: pairs of 16-byte loads packed dword->word with signed saturation
+  mova            m0,        [inputq +  0]
+  packssdw        m0,        [inputq + 16]
+  mova            m1,        [inputq + 32]
+  packssdw        m1,        [inputq + 48]
+%else
+  mova            m0,        [inputq +  0]  ; 32 input bytes loaded directly as words
+  mova            m1,        [inputq + 16]
+%endif
+  psraw           m0,        2              ; arithmetic right shift of every word by 2 before the transform
+  psraw           m1,        2
+  ; first pass: transpose m0/m1 into m0-m3, reorder, transform
+  TRANSPOSE_4X4_WIDE
+  REORDER_INPUTS
+  TRANSFORM_COLS
+  TRANSPOSE_4X4                             ; second pass: transpose the intermediate result, reorder, transform
+  REORDER_INPUTS
+  TRANSFORM_COLS
+
+  pxor            m4, m4                    ; m4 = 0, used as the 'zero' operand of ADD_STORE_4P_2X
+  ADD_STORE_4P_2X  0, 1, 5, 6, 4            ; add m0/m1 to the first two output rows (m5/m6 are temporaries)
+  lea             outputq, [outputq + 2 * strideq]  ; move down two rows
+  ADD_STORE_4P_2X  2, 3, 5, 6, 4            ; add m2/m3 to the next two output rows
+
+  RET
diff --git a/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c
new file mode 100644
index 0000000000..23a97dd05f
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c
@@ -0,0 +1,986 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  /* AVX2 */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+// Loop-filters 8 pixel columns across the horizontal edge between rows s - p and s (p = row stride): reads rows s - 8*p .. s + 7*p, writes rows s - 7*p .. s + 6*p.
+static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
+        const unsigned char *_blimit, const unsigned char *_limit,
+        const unsigned char *_thresh) {
+    __m128i mask, hev, flat, flat2;
+    const __m128i zero = _mm_set1_epi16(0);
+    const __m128i one = _mm_set1_epi8(1);
+    __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+    __m128i abs_p1p0;
+    // Splat the first byte of each threshold array across all 16 byte lanes.
+    const __m128i thresh = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _thresh[0]));
+    const __m128i limit = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _limit[0]));
+    const __m128i blimit = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _blimit[0]));
+    // Each qNpN holds row pN (s - (N + 1) * p) in its low 8 bytes and row qN (s + N * p) in its high 8 bytes.
+    q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
+    q4p4 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
+    q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
+    q3p3 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
+    q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
+    q2p2 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
+    q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
+    q1p1 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
+    p1q1 = _mm_shuffle_epi32(q1p1, 78);  // 78 = 0x4e swaps the 64-bit halves: q1 low, p1 high
+    q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
+    q0p0 = _mm_castps_si128(
+            _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
+    p0q0 = _mm_shuffle_epi32(q0p0, 78);  // q0 low, p0 high
+    // Build hev and the filter mask; only their low 8 lanes combine both the p and q sides.
+    {
+        __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+        abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+                _mm_subs_epu8(q0p0, q1p1));  // low: |p1 - p0|, high: |q1 - q0|
+        abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);  // |q1 - q0| moved to the low half
+        fe = _mm_set1_epi8(0xfe);
+        ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);  // all bits set
+        abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+                _mm_subs_epu8(p0q0, q0p0));  // |p0 - q0| in both halves
+        abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+                _mm_subs_epu8(p1q1, q1p1));  // |p1 - q1| in both halves
+        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+        hev = _mm_subs_epu8(flat, thresh);
+        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);  // 0xff where flat > thresh
+
+        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);  // 2 * |p0 - q0|, saturating
+        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);  // |p1 - q1| / 2; "& fe" keeps the 16-bit shift from leaking a bit into the neighbouring byte
+        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);  // 0xff where the sum exceeds blimit
+        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+        mask = _mm_max_epu8(abs_p1p0, mask);
+        // mask |= (abs(p1 - p0) > limit) * -1;
+        // mask |= (abs(q1 - q0) > limit) * -1;
+
+        work = _mm_max_epu8(
+                _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+                        _mm_subs_epu8(q1p1, q2p2)),
+                _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+                        _mm_subs_epu8(q2p2, q3p3)));
+        mask = _mm_max_epu8(work, mask);
+        mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));  // fold the q-side maxima into the low 8 lanes
+        mask = _mm_subs_epu8(mask, limit);
+        mask = _mm_cmpeq_epi8(mask, zero);  // 0xff where the running maximum is <= limit
+    }
+
+    // lp filter
+    {
+        const __m128i t4 = _mm_set1_epi8(4);
+        const __m128i t3 = _mm_set1_epi8(3);
+        const __m128i t80 = _mm_set1_epi8(0x80);
+        const __m128i t1 = _mm_set1_epi16(0x1);
+        __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);  // xor with 0x80 before the signed saturating ops; undone by the xor after filtering
+        __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+        __m128i qs0 = _mm_xor_si128(p0q0, t80);  // low half: q0
+        __m128i qs1 = _mm_xor_si128(p1q1, t80);  // low half: q1
+        __m128i filt;
+        __m128i work_a;
+        __m128i filter1, filter2;
+        __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+        __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+        filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);  // low half: (ps1 - qs1) & hev
+        work_a = _mm_subs_epi8(qs0, qs0ps0);  // low half: qs0 - ps0
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
+        filt = _mm_and_si128(filt, mask);
+
+        filter1 = _mm_adds_epi8(filt, t4);
+        filter2 = _mm_adds_epi8(filt, t3);
+
+        filter1 = _mm_unpacklo_epi8(zero, filter1);  // low 8 bytes -> high byte of each 16-bit lane
+        filter1 = _mm_srai_epi16(filter1, 0xB);  // >> 11 (8 + 3): sign-extended (filt + 4) >> 3
+        filter2 = _mm_unpacklo_epi8(zero, filter2);
+        filter2 = _mm_srai_epi16(filter2, 0xB);  // sign-extended (filt + 3) >> 3
+
+        /* Filter1 >> 3 */
+        filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));  // low: +filter2 (for p0), high: -filter1 (for q0)
+        qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);  // apply, then xor 0x80 again
+
+        /* filt >> 1 */
+        filt = _mm_adds_epi16(filter1, t1);
+        filt = _mm_srai_epi16(filt, 1);  // (filter1 + 1) >> 1 per 16-bit lane
+        filt = _mm_andnot_si128(
+                _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);  // zeroed where hev is set (hev bytes sign-extended to words)
+        filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));  // low: +filt (for p1), high: -filt (for q1)
+        qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+        // loopfilter done
+
+        {
+            __m128i work;
+            flat = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+                            _mm_subs_epu8(q0p0, q2p2)),
+                    _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+                            _mm_subs_epu8(q0p0, q3p3)));
+            flat = _mm_max_epu8(abs_p1p0, flat);
+            flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));  // fold the q side into the low 8 lanes
+            flat = _mm_subs_epu8(flat, one);
+            flat = _mm_cmpeq_epi8(flat, zero);  // 0xff where rows 1..3 differ from row 0 by <= 1 on both sides
+            flat = _mm_and_si128(flat, mask);
+
+            q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
+            q5p5 = _mm_castps_si128(
+                    _mm_loadh_pi(_mm_castsi128_ps(q5p5),
+                            (__m64 *) (s + 5 * p)));
+
+            q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
+            q6p6 = _mm_castps_si128(
+                    _mm_loadh_pi(_mm_castsi128_ps(q6p6),
+                            (__m64 *) (s + 6 * p)));
+
+            flat2 = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
+                            _mm_subs_epu8(q0p0, q4p4)),
+                    _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
+                            _mm_subs_epu8(q0p0, q5p5)));
+
+            q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
+            q7p7 = _mm_castps_si128(
+                    _mm_loadh_pi(_mm_castsi128_ps(q7p7),
+                            (__m64 *) (s + 7 * p)));
+
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
+                            _mm_subs_epu8(q0p0, q6p6)),
+                    _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
+                            _mm_subs_epu8(q0p0, q7p7)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+            flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));  // fold the q side into the low 8 lanes
+            flat2 = _mm_subs_epu8(flat2, one);
+            flat2 = _mm_cmpeq_epi8(flat2, zero);  // 0xff where rows 4..7 differ from row 0 by <= 1 on both sides
+            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+        }
+
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+        // flat and wide flat calculations
+        {
+            const __m128i eight = _mm_set1_epi16(8);
+            const __m128i four = _mm_set1_epi16(4);
+            __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+            __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+            __m128i pixelFilter_p, pixelFilter_q;
+            __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+            __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+            p7_16 = _mm_unpacklo_epi8(q7p7, zero);  // p rows (low halves) zero-extended to 16 bits
+            p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+            p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+            p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+            p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+            p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+            p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+            p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+            q0_16 = _mm_unpackhi_epi8(q0p0, zero);  // q rows (high halves) zero-extended to 16 bits
+            q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+            q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+            q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+            q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+            q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+            q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+            q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+            pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+                    _mm_add_epi16(p4_16, p3_16));
+            pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+                    _mm_add_epi16(q4_16, q3_16));
+
+            pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
+                    _mm_add_epi16(p2_16, p1_16));
+            pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+            pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
+                    _mm_add_epi16(q2_16, q1_16));
+            pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+            pixelFilter_p = _mm_add_epi16(eight,
+                    _mm_add_epi16(pixelFilter_p, pixelFilter_q));  // 8 + (p6 + ... + p0) + (q0 + ... + q6)
+            pixetFilter_p2p1p0 = _mm_add_epi16(four,
+                    _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));  // 4 + p2 + p1 + p0 + q0 + q1 + q2
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
+                    4);
+            flat2_q0p0 = _mm_packus_epi16(res_p, res_q);  // low: wide-filtered p0, high: wide-filtered q0
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_p2p1p0,
+                            _mm_add_epi16(p3_16, p0_16)), 3);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_p2p1p0,
+                            _mm_add_epi16(q3_16, q0_16)), 3);
+
+            flat_q0p0 = _mm_packus_epi16(res_p, res_q);  // low: flat-filtered p0, high: flat-filtered q0
+            // For the remaining rows: add one more p7/q7 (wide) or p3/q3 (flat) and drop the farthest sample of the opposite side.
+            sum_p7 = _mm_add_epi16(p7_16, p7_16);
+            sum_q7 = _mm_add_epi16(q7_16, q7_16);
+            sum_p3 = _mm_add_epi16(p3_16, p3_16);
+            sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);  // running sum for the q outputs (drops p6)
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);  // running sum for the p outputs (drops q6)
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
+                    4);
+            flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);  // running sum for the q outputs (drops p2)
+            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);  // running sum for the p outputs (drops q2)
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_p2p1p0,
+                            _mm_add_epi16(sum_p3, p1_16)), 3);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_q2q1q0,
+                            _mm_add_epi16(sum_q3, q1_16)), 3);
+            flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+            sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
+                    4);
+            flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_p2p1p0,
+                            _mm_add_epi16(sum_p3, p2_16)), 3);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixetFilter_q2q1q0,
+                            _mm_add_epi16(sum_q3, q2_16)), 3);
+            flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
+                    4);
+            flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
+                    4);
+            flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
+                    4);
+            flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+            res_p = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
+                    4);
+            res_q = _mm_srli_epi16(
+                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
+                    4);
+            flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+        }
+        // wide flat
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+        flat = _mm_shuffle_epi32(flat, 68);  // 68 = 0x44 copies the low 64 bits into both halves, so one mask covers the p and q rows
+        flat2 = _mm_shuffle_epi32(flat2, 68);
+
+        q2p2 = _mm_andnot_si128(flat, q2p2);  // per lane: flat ? flat_q2p2 : original q2p2
+        flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+        q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+        qs1ps1 = _mm_andnot_si128(flat, qs1ps1);  // per lane: flat ? flat_q1p1 : narrow-filter result
+        flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+        q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+        qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+        flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+        q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+        q6p6 = _mm_andnot_si128(flat2, q6p6);  // per lane: flat2 ? flat2_q6p6 : q6p6 (same blend for every row below)
+        flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+        q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+        _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);  // low half -> row p6
+        _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));  // high half -> row q6
+
+        q5p5 = _mm_andnot_si128(flat2, q5p5);
+        flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+        q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+        _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
+        _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));
+
+        q4p4 = _mm_andnot_si128(flat2, q4p4);
+        flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+        q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+        _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
+        _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));
+
+        q3p3 = _mm_andnot_si128(flat2, q3p3);
+        flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+        q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+        _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
+        _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));
+
+        q2p2 = _mm_andnot_si128(flat2, q2p2);  // q2p2/q1p1/q0p0 already hold the flat-or-narrow blend from above
+        flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+        q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+        _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
+        _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));
+
+        q1p1 = _mm_andnot_si128(flat2, q1p1);
+        flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+        q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+        _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
+        _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));
+
+        q0p0 = _mm_andnot_si128(flat2, q0p0);
+        flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+        q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+        _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
+        _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
+    }
+}
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {  // byte-shuffle control passed to _mm256_shuffle_epi8
+  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,  // low lane: bytes 0-7 -> 16-bit (128 has bit 7 set, which yields a zero byte)
+  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128  // high lane: bytes 8-15 -> 16-bit
+};
+
+static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
+        const unsigned char *_blimit, const unsigned char *_limit,
+        const unsigned char *_thresh) {
+    __m128i mask, hev, flat, flat2;
+    const __m128i zero = _mm_set1_epi16(0);
+    const __m128i one = _mm_set1_epi8(1);
+    __m128i p7, p6, p5;
+    __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+    __m128i q5, q6, q7;
+    __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+            q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+            p256_0, q256_0;
+
+    const __m128i thresh = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _thresh[0]));
+    const __m128i limit = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _limit[0]));
+    const __m128i blimit = _mm_broadcastb_epi8(
+            _mm_cvtsi32_si128((int) _blimit[0]));
+
+    p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 5 * p)));
+    p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 4 * p)));
+    p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 3 * p)));
+    p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 2 * p)));
+    p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 1 * p)));
+    q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 0 * p)));
+    q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 1 * p)));
+    q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 2 * p)));
+    q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 3 * p)));
+    q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 4 * p)));
+
+    p4 = _mm256_castsi256_si128(p256_4);
+    p3 = _mm256_castsi256_si128(p256_3);
+    p2 = _mm256_castsi256_si128(p256_2);
+    p1 = _mm256_castsi256_si128(p256_1);
+    p0 = _mm256_castsi256_si128(p256_0);
+    q0 = _mm256_castsi256_si128(q256_0);
+    q1 = _mm256_castsi256_si128(q256_1);
+    q2 = _mm256_castsi256_si128(q256_2);
+    q3 = _mm256_castsi256_si128(q256_3);
+    q4 = _mm256_castsi256_si128(q256_4);
+
+    {
+        const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+                _mm_subs_epu8(p0, p1));
+        const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+                _mm_subs_epu8(q0, q1));
+        const __m128i fe = _mm_set1_epi8(0xfe);
+        const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+        __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+                _mm_subs_epu8(q0, p0));
+        __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+                _mm_subs_epu8(q1, p1));
+        __m128i work;
+        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+        hev = _mm_subs_epu8(flat, thresh);
+        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+        mask = _mm_max_epu8(flat, mask);
+        // mask |= (abs(p1 - p0) > limit) * -1;
+        // mask |= (abs(q1 - q0) > limit) * -1;
+        work = _mm_max_epu8(
+                _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+                _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+        mask = _mm_max_epu8(work, mask);
+        work = _mm_max_epu8(
+                _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+                _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+        mask = _mm_max_epu8(work, mask);
+        mask = _mm_subs_epu8(mask, limit);
+        mask = _mm_cmpeq_epi8(mask, zero);
+    }
+
+    // lp filter
+    {
+        const __m128i t4 = _mm_set1_epi8(4);
+        const __m128i t3 = _mm_set1_epi8(3);
+        const __m128i t80 = _mm_set1_epi8(0x80);
+        const __m128i te0 = _mm_set1_epi8(0xe0);
+        const __m128i t1f = _mm_set1_epi8(0x1f);
+        const __m128i t1 = _mm_set1_epi8(0x1);
+        const __m128i t7f = _mm_set1_epi8(0x7f);
+
+        __m128i ps1 = _mm_xor_si128(p1, t80);
+        __m128i ps0 = _mm_xor_si128(p0, t80);
+        __m128i qs0 = _mm_xor_si128(q0, t80);
+        __m128i qs1 = _mm_xor_si128(q1, t80);
+        __m128i filt;
+        __m128i work_a;
+        __m128i filter1, filter2;
+        __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+                flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
+                flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
+                flat_q2;
+
+        filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+        work_a = _mm_subs_epi8(qs0, ps0);
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        filt = _mm_adds_epi8(filt, work_a);
+        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
+        filt = _mm_and_si128(filt, mask);
+
+        filter1 = _mm_adds_epi8(filt, t4);
+        filter2 = _mm_adds_epi8(filt, t3);
+
+        /* Filter1 >> 3 */
+        work_a = _mm_cmpgt_epi8(zero, filter1);
+        filter1 = _mm_srli_epi16(filter1, 3);
+        work_a = _mm_and_si128(work_a, te0);
+        filter1 = _mm_and_si128(filter1, t1f);
+        filter1 = _mm_or_si128(filter1, work_a);
+        qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+        /* Filter2 >> 3 */
+        work_a = _mm_cmpgt_epi8(zero, filter2);
+        filter2 = _mm_srli_epi16(filter2, 3);
+        work_a = _mm_and_si128(work_a, te0);
+        filter2 = _mm_and_si128(filter2, t1f);
+        filter2 = _mm_or_si128(filter2, work_a);
+        ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+        /* filt >> 1 */
+        filt = _mm_adds_epi8(filter1, t1);
+        work_a = _mm_cmpgt_epi8(zero, filt);
+        filt = _mm_srli_epi16(filt, 1);
+        work_a = _mm_and_si128(work_a, t80);
+        filt = _mm_and_si128(filt, t7f);
+        filt = _mm_or_si128(filt, work_a);
+        filt = _mm_andnot_si128(hev, filt);
+        ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+        qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+        // loopfilter done
+
+        {
+            __m128i work;
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+                    _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+            flat = _mm_max_epu8(work, flat);
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+                    _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+            flat = _mm_max_epu8(work, flat);
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+                    _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+            flat = _mm_subs_epu8(flat, one);
+            flat = _mm_cmpeq_epi8(flat, zero);
+            flat = _mm_and_si128(flat, mask);
+
+            p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 6 * p)));
+            q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 5 * p)));
+            p5 = _mm256_castsi256_si128(p256_5);
+            q5 = _mm256_castsi256_si128(q256_5);
+            flat2 = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+                    _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+            p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 7 * p)));
+            q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 6 * p)));
+            p6 = _mm256_castsi256_si128(p256_6);
+            q6 = _mm256_castsi256_si128(q256_6);
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+                    _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+
+            p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s - 8 * p)));
+            q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                        (__m128d const *)(s + 7 * p)));
+            p7 = _mm256_castsi256_si128(p256_7);
+            q7 = _mm256_castsi256_si128(q256_7);
+            work = _mm_max_epu8(
+                    _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+                    _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+            flat2 = _mm_max_epu8(work, flat2);
+            flat2 = _mm_subs_epu8(flat2, one);
+            flat2 = _mm_cmpeq_epi8(flat2, zero);
+            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+        }
+
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+        // flat and wide flat calculations
+        {
+            const __m256i eight = _mm256_set1_epi16(8);
+            const __m256i four = _mm256_set1_epi16(4);
+            __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+                    pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
+                    res_q;
+
+            const __m256i filter = _mm256_load_si256(
+                                  (__m256i const *)filt_loopfilter_avx2);
+            p256_7 = _mm256_shuffle_epi8(p256_7, filter);
+            p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+            p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+            p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+            p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+            p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+            p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+            p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+            q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+            q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+            q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+            q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+            q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+            q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+            q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+            q256_7 = _mm256_shuffle_epi8(q256_7, filter);
+
+            pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+                    _mm256_add_epi16(p256_4, p256_3));
+            pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+                    _mm256_add_epi16(q256_4, q256_3));
+
+            pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
+                    _mm256_add_epi16(p256_2, p256_1));
+            pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+            pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
+                    _mm256_add_epi16(q256_2, q256_1));
+            pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+
+            pixelFilter_p = _mm256_add_epi16(eight,
+                    _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+
+            pixetFilter_p2p1p0 = _mm256_add_epi16(four,
+                    _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(p256_7, p256_0)), 4);
+
+            flat2_p0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(q256_7, q256_0)), 4);
+
+            flat2_q0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_p2p1p0,
+                            _mm256_add_epi16(p256_3, p256_0)), 3);
+
+            flat_p0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_p2p1p0,
+                            _mm256_add_epi16(q256_3, q256_0)), 3);
+
+            flat_q0 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+
+            sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+
+            sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_1)), 4);
+
+            flat2_p1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_1)), 4);
+
+            flat2_q1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
+
+            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_p2p1p0,
+                            _mm256_add_epi16(sum_p3, p256_1)), 3);
+
+            flat_p1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_q2q1q0,
+                            _mm256_add_epi16(sum_q3, q256_1)), 3);
+
+            flat_q1 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+
+            sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_2)), 4);
+
+            flat2_p2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_2)), 4);
+
+            flat2_q2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
+
+            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_p2p1p0,
+                            _mm256_add_epi16(sum_p3, p256_2)), 3);
+
+            flat_p2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixetFilter_q2q1q0,
+                            _mm256_add_epi16(sum_q3, q256_2)), 3);
+
+            flat_q2 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_3)), 4);
+
+            flat2_p3 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_3)), 4);
+
+            flat2_q3 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_4)), 4);
+
+            flat2_p4 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_4)), 4);
+
+            flat2_q4 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_5)), 4);
+
+            flat2_p5 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_5)), 4);
+
+            flat2_q5 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+
+            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+
+            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+
+            res_p = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_p,
+                            _mm256_add_epi16(sum_p7, p256_6)), 4);
+
+            flat2_p6 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+                            168));
+
+            res_q = _mm256_srli_epi16(
+                    _mm256_add_epi16(pixelFilter_q,
+                            _mm256_add_epi16(sum_q7, q256_6)), 4);
+
+            flat2_q6 = _mm256_castsi256_si128(
+                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+                            168));
+        }
+
+        // wide flat
+        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+        p2 = _mm_andnot_si128(flat, p2);
+        flat_p2 = _mm_and_si128(flat, flat_p2);
+        p2 = _mm_or_si128(flat_p2, p2);
+
+        p1 = _mm_andnot_si128(flat, ps1);
+        flat_p1 = _mm_and_si128(flat, flat_p1);
+        p1 = _mm_or_si128(flat_p1, p1);
+
+        p0 = _mm_andnot_si128(flat, ps0);
+        flat_p0 = _mm_and_si128(flat, flat_p0);
+        p0 = _mm_or_si128(flat_p0, p0);
+
+        q0 = _mm_andnot_si128(flat, qs0);
+        flat_q0 = _mm_and_si128(flat, flat_q0);
+        q0 = _mm_or_si128(flat_q0, q0);
+
+        q1 = _mm_andnot_si128(flat, qs1);
+        flat_q1 = _mm_and_si128(flat, flat_q1);
+        q1 = _mm_or_si128(flat_q1, q1);
+
+        q2 = _mm_andnot_si128(flat, q2);
+        flat_q2 = _mm_and_si128(flat, flat_q2);
+        q2 = _mm_or_si128(flat_q2, q2);
+
+        p6 = _mm_andnot_si128(flat2, p6);
+        flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+        p6 = _mm_or_si128(flat2_p6, p6);
+        _mm_storeu_si128((__m128i *) (s - 7 * p), p6);
+
+        p5 = _mm_andnot_si128(flat2, p5);
+        flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+        p5 = _mm_or_si128(flat2_p5, p5);
+        _mm_storeu_si128((__m128i *) (s - 6 * p), p5);
+
+        p4 = _mm_andnot_si128(flat2, p4);
+        flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+        p4 = _mm_or_si128(flat2_p4, p4);
+        _mm_storeu_si128((__m128i *) (s - 5 * p), p4);
+
+        p3 = _mm_andnot_si128(flat2, p3);
+        flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+        p3 = _mm_or_si128(flat2_p3, p3);
+        _mm_storeu_si128((__m128i *) (s - 4 * p), p3);
+
+        p2 = _mm_andnot_si128(flat2, p2);
+        flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+        p2 = _mm_or_si128(flat2_p2, p2);
+        _mm_storeu_si128((__m128i *) (s - 3 * p), p2);
+
+        p1 = _mm_andnot_si128(flat2, p1);
+        flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+        p1 = _mm_or_si128(flat2_p1, p1);
+        _mm_storeu_si128((__m128i *) (s - 2 * p), p1);
+
+        p0 = _mm_andnot_si128(flat2, p0);
+        flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+        p0 = _mm_or_si128(flat2_p0, p0);
+        _mm_storeu_si128((__m128i *) (s - 1 * p), p0);
+
+        q0 = _mm_andnot_si128(flat2, q0);
+        flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+        q0 = _mm_or_si128(flat2_q0, q0);
+        _mm_storeu_si128((__m128i *) (s - 0 * p), q0);
+
+        q1 = _mm_andnot_si128(flat2, q1);
+        flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+        q1 = _mm_or_si128(flat2_q1, q1);
+        _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
+
+        q2 = _mm_andnot_si128(flat2, q2);
+        flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+        q2 = _mm_or_si128(flat2_q2, q2);
+        _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
+
+        q3 = _mm_andnot_si128(flat2, q3);
+        flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+        q3 = _mm_or_si128(flat2_q3, q3);
+        _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
+
+        q4 = _mm_andnot_si128(flat2, q4);
+        flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+        q4 = _mm_or_si128(flat2_q4, q4);
+        _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
+
+        q5 = _mm_andnot_si128(flat2, q5);
+        flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+        q5 = _mm_or_si128(flat2_q5, q5);
+        _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
+
+        q6 = _mm_andnot_si128(flat2, q6);
+        flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+        q6 = _mm_or_si128(flat2_q6, q6);
+        _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
+    }
+}
+
+// Forwards (s, p, _blimit, _limit, _thresh) to one of two edge filters:
+// mb_lpf_horizontal_edge_w_avx2_8 when count is exactly 1, and
+// mb_lpf_horizontal_edge_w_avx2_16 for every other value of count.
+void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
+        const unsigned char *_blimit, const unsigned char *_limit,
+        const unsigned char *_thresh, int count) {
+    switch (count) {
+    case 1:
+        mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
+        break;
+    default:
+        mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
+        break;
+    }
+}
diff --git a/libs/libvpx/vpx_dsp/x86/loopfilter_mmx.asm b/libs/libvpx/vpx_dsp/x86/loopfilter_mmx.asm
new file mode 100644
index 0000000000..b9c18b680f
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/loopfilter_mmx.asm
@@ -0,0 +1,611 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vpx_lpf_horizontal_4_mmx
+;(
+;    unsigned char *src_ptr,
+;    int src_pixel_step,
+;    const char *blimit,
+;    const char *limit,
+;    const char *thresh,
+;    int  count
+;)
+;
+; Filters across a horizontal edge, 8 pixels per loop iteration, for
+; `count` iterations (src_ptr advances by 8 bytes each time). Each
+; iteration reads rows p3..p0 above src_ptr and rows q0..q3 starting at
+; src_ptr, builds a filter mask from limit/blimit and a high edge variance
+; (hev) mask from thresh, then rewrites rows p1, p0, q0 and q1.
+; 8 bytes are loaded from each of blimit, limit and thresh.
+; The loop body runs before count is tested, so count is expected to be >= 1.
+global sym(vpx_lpf_horizontal_4_mmx) PRIVATE
+sym(vpx_lpf_horizontal_4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; NOTE(review): ALIGN_STACK is defined outside this view; the matching
+    ; `pop rsp` in the epilog presumably restores the rsp it saved - confirm.
+    ALIGN_STACK 16, rax
+    sub         rsp, 32                         ; reserve 32 bytes
+    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];  abs(q0-q1) scratch
+    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];  abs(p1-p0) scratch
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step, sign-extended; used as the row stride
+
+        movsxd      rcx, dword ptr arg(5) ;count
+.next8_h:
+        mov         rdx, arg(3) ;limit
+        movq        mm7, [rdx]            ; mm7 = limit
+        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+
+        ; calculate breakout conditions
+        ; mm1 accumulates (abs difference -sat limit) for every tested pair;
+        ; a byte stays zero only if all differences are within the limit
+        movq        mm2, [rdi+2*rax]      ; q3
+        movq        mm1, [rsi+2*rax]      ; q2
+        movq        mm6, mm1              ; q2
+        psubusb     mm1, mm2              ; q2-=q3
+        psubusb     mm2, mm6              ; q3-=q2
+        por         mm1, mm2              ; abs(q3-q2)
+        psubusb     mm1, mm7              ; non-zero where abs(q3-q2) > limit
+
+
+        movq        mm4, [rsi+rax]        ; q1
+        movq        mm3, mm4              ; q1
+        psubusb     mm4, mm6              ; q1-=q2
+        psubusb     mm6, mm3              ; q2-=q1
+        por         mm4, mm6              ; abs(q2-q1)
+
+        psubusb     mm4, mm7              ; non-zero where abs(q2-q1) > limit
+        por         mm1, mm4
+
+        movq        mm4, [rsi]            ; q0
+        movq        mm0, mm4              ; q0 (mm0 keeps q0 for the filter stage)
+        psubusb     mm4, mm3              ; q0-=q1
+        psubusb     mm3, mm0              ; q1-=q0
+        por         mm4, mm3              ; abs(q0-q1)
+        movq        t0, mm4               ; save to t0
+        psubusb     mm4, mm7              ; non-zero where abs(q0-q1) > limit
+        por         mm1, mm4
+
+
+        neg         rax                   ; negate pitch to deal with above border
+
+        movq        mm2, [rsi+4*rax]      ; p3
+        movq        mm4, [rdi+4*rax]      ; p2
+        movq        mm5, mm4              ; p2
+        psubusb     mm4, mm2              ; p2-=p3
+        psubusb     mm2, mm5              ; p3-=p2
+        por         mm4, mm2              ; abs(p3 - p2)
+        psubusb     mm4, mm7              ; non-zero where abs(p3-p2) > limit
+        por         mm1, mm4
+
+
+        movq        mm4, [rsi+2*rax]      ; p1
+        movq        mm3, mm4              ; p1
+        psubusb     mm4, mm5              ; p1-=p2
+        psubusb     mm5, mm3              ; p2-=p1
+        por         mm4, mm5              ; abs(p2 - p1)
+        psubusb     mm4, mm7              ; non-zero where abs(p2-p1) > limit
+        por         mm1, mm4
+
+        movq        mm2, mm3              ; p1
+
+        movq        mm4, [rsi+rax]        ; p0
+        movq        mm5, mm4              ; p0
+        psubusb     mm4, mm3              ; p0-=p1
+        psubusb     mm3, mm5              ; p1-=p0
+        por         mm4, mm3              ; abs(p1 - p0)
+        movq        t1, mm4               ; save to t1
+        psubusb     mm4, mm7              ; non-zero where abs(p1-p0) > limit
+        por         mm1, mm4
+
+        movq        mm3, [rdi]            ; q1
+        movq        mm4, mm3              ; q1
+        psubusb     mm3, mm2              ; q1-=p1
+        psubusb     mm2, mm4              ; p1-=q1
+        por         mm2, mm3              ; abs(p1-q1)
+        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
+        psrlw       mm2, 1                ; abs(p1-q1)/2
+
+        movq        mm6, mm5              ; p0 (mm6 keeps p0 for the filter stage)
+        movq        mm3, [rsi]            ; q0
+        psubusb     mm5, mm3              ; p0-=q0
+        psubusb     mm3, mm6              ; q0-=p0
+        por         mm5, mm3              ; abs(p0 - q0)
+        paddusb     mm5, mm5              ; abs(p0-q0)*2
+        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm7, [rdx]            ; blimit
+
+        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,    mm5
+        pxor        mm5,    mm5
+        pcmpeqb     mm1,    mm5           ; mask mm1: 0xff where every check passed
+
+        ; calculate high edge variance
+        mov         rdx, arg(4) ;thresh           ; get thresh
+        movq        mm7, [rdx]            ; mm7 = thresh
+        movq        mm4, t0               ; get abs (q1 - q0)
+        psubusb     mm4, mm7
+        movq        mm3, t1               ; get abs (p1 - p0)
+        psubusb     mm3, mm7
+        ; Combine with por, not paddb: a wrapping byte add of two non-zero
+        ; excesses that sum to 256 would give 0 and wrongly clear the hev byte.
+        por         mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+        pcmpeqb     mm4,        mm5       ; 0xff where neither difference exceeds thresh
+
+        pcmpeqb     mm5,        mm5       ; all ones
+        pxor        mm4,        mm5       ; invert: mm4 = hev mask
+
+
+        ; start work on filters
+        movq        mm2, [rsi+2*rax]      ; p1
+        movq        mm7, [rdi]            ; q1
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
+        pxor        mm6, [GLOBAL(t80)]    ; p0: offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; q0: offset to convert to signed values
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand        mm1, mm2                  ; mask filter values we don't care about
+        movq        mm2, mm1
+        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+        ; Signed per-byte >> 3: place each byte in the high half of a word,
+        ; arithmetic-shift the word by 8 + 3, then pack back to bytes.
+        pxor        mm0, mm0             ; 0
+        pxor        mm5, mm5             ; 0
+        punpcklbw   mm0, mm2            ; low 4 bytes -> high halves of words
+        punpckhbw   mm5, mm2            ; high 4 bytes -> high halves of words
+        psraw       mm0, 11             ; sign extended shift right by 3
+        psraw       mm5, 11             ; sign extended shift right by 3
+        packsswb    mm0, mm5
+        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        pxor        mm0, mm0              ; 0
+        movq        mm5, mm1              ; abcdefgh
+        punpcklbw   mm0, mm1              ; e0f0g0h0
+        psraw       mm0, 11               ; sign extended shift right by 3
+        pxor        mm1, mm1              ; 0
+        punpckhbw   mm1, mm5              ; a0b0c0d0
+        psraw       mm1, 11               ; sign extended shift right by 3
+        movq        mm5, mm0              ; save results
+
+        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      mm5, [GLOBAL(ones)]
+        paddsw      mm1, [GLOBAL(ones)]
+        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
+        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
+        packsswb    mm5, mm1              ; (((3* (q0 - p0) + hvm(p1 - q1) + 4) >>3) + ones) >>1
+        pandn       mm4, mm5              ; high edge variance additive (kept only where hev is clear)
+
+        paddsb      mm6, mm2              ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+rax], mm6        ; write back
+
+        movq        mm6, [rsi+2*rax]      ; p1
+        pxor        mm6, [GLOBAL(t80)]    ; reoffset
+        paddsb      mm6, mm4              ; p1+= p1 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+2*rax], mm6      ; write back
+
+        psubsb      mm3, mm0              ; q0-= q0 add
+        pxor        mm3, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi], mm3            ; write back
+
+        psubsb      mm7, mm4              ; q1-= q1 add
+        pxor        mm7, [GLOBAL(t80)]    ; unoffset
+        movq        [rdi], mm7            ; write back
+
+        add         rsi,8                 ; advance to the next 8 pixels of the edge
+        neg         rax                   ; restore the positive pitch for the next pass
+        dec         rcx
+        jnz         .next8_h
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vpx_lpf_vertical_4_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit,
+;    const char *limit,
+;    const char *thresh,
+;    int count
+;)
+global sym(vpx_lpf_vertical_4_mmx) PRIVATE
+sym(vpx_lpf_vertical_4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 64      ; reserve 64 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?
+
+        lea         rsi,        [rsi + rax*4 - 4]
+
+        movsxd      rcx,        dword ptr arg(5) ;count
+.next8_v:
+        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
+        add         rdi,        rax
+
+
+        ;transpose
+        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
+        movq        mm7,        mm6                         ; 77 76 75 74 73 72 71 70
+
+        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
+        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
+
+        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
+        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
+
+        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
+        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40
+
+        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
+        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
+
+        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
+        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
+
+        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
+        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
+
+        neg         rax
+        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
+
+        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
+        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 36 35 25 34 24
+
+        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
+        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
+
+        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
+        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
+
+        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
+        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
+
+        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
+        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
+
+        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
+
+        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
+        psubusb     mm5,        mm7                         ; q2-q3
+
+        psubusb     mm7,        mm6                         ; q3-q2
+        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
+
+        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
+        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
+
+        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 15 04 = q0
+        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
+
+        psubusb     mm3,        mm6                         ; q1-q2
+        psubusb     mm6,        mm5                         ; q2-q1
+
+        por         mm6,        mm3                         ; mm6=abs(q2-q1)
+        lea         rdx,        srct
+
+        movq        [rdx+24],   mm5                         ; save q1
+        movq        [rdx+16],   mm0                         ; save q0
+
+        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
+        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
+
+        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
+        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
+
+        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
+        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
+
+        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
+        psubusb     mm2,        mm0                         ; p2-p3
+
+        psubusb     mm0,        mm1                         ; p3-p2
+        por         mm0,        mm2                         ; mm0=abs(p3-p2)
+
+        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
+        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
+
+        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
+        movq        [rdx+8],    mm3                         ; save p0
+
+        movq        [rdx],      mm2                         ; save p1
+        movq        mm5,        mm2                         ; mm5 = p1
+
+        psubusb     mm2,        mm1                         ; p1-p2
+        psubusb     mm1,        mm5                         ; p2-p1
+
+        por         mm1,        mm2                         ; mm1=abs(p2-p1)
+        mov         rdx,        arg(3) ;limit
+
+        movq        mm4,        [rdx]                       ; mm4 = limit
+        psubusb     mm7,        mm4
+
+        psubusb     mm0,        mm4
+        psubusb     mm1,        mm4
+
+        psubusb     mm6,        mm4
+        por         mm7,        mm6
+
+        por         mm0,        mm1
+        por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+        movq        mm1,        mm5                         ; p1
+
+        movq        mm7,        mm3                         ; mm3=mm7=p0
+        psubusb     mm7,        mm5                         ; p0 - p1
+
+        psubusb     mm5,        mm3                         ; p1 - p0
+        por         mm5,        mm7                         ; abs(p1-p0)
+
+        movq        t0,         mm5                         ; save abs(p1-p0)
+        lea         rdx,        srct
+
+        psubusb     mm5,        mm4
+        por         mm0,        mm5                         ; mm0=mask
+
+        movq        mm5,        [rdx+16]                    ; mm5=q0
+        movq        mm7,        [rdx+24]                    ; mm7=q1
+
+        movq        mm6,        mm5                         ; mm6=q0
+        movq        mm2,        mm7                         ; q1
+        psubusb     mm5,        mm7                         ; q0-q1
+
+        psubusb     mm7,        mm6                         ; q1-q0
+        por         mm7,        mm5                         ; abs(q1-q0)
+
+        movq        t1,         mm7                         ; save abs(q1-q0)
+        psubusb     mm7,        mm4
+
+        por         mm0,        mm7                         ; mask
+
+        movq        mm5,        mm2                         ; q1
+        psubusb     mm5,        mm1                         ; q1-=p1
+        psubusb     mm1,        mm2                         ; p1-=q1
+        por         mm5,        mm1                         ; abs(p1-q1)
+        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
+        psrlw       mm5,        1                           ; abs(p1-q1)/2
+
+        mov         rdx,        arg(2) ;blimit                      ;
+
+        movq        mm4,        [rdx]                       ;blimit
+        movq        mm1,        mm3                         ; mm1=mm3=p0
+
+        movq        mm7,        mm6                         ; mm7=mm6=q0
+        psubusb     mm1,        mm7                         ; p0-q0
+
+        psubusb     mm7,        mm3                         ; q0-p0
+        por         mm1,        mm7                         ; abs(q0-p0)
+        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
+        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,        mm0;                        ; mask
+
+        pxor        mm0,        mm0
+        pcmpeqb     mm1,        mm0
+
+        ; calculate high edge variance
+        mov         rdx,        arg(4) ;thresh            ; get thresh
+        movq        mm7,        [rdx]
+        ;
+        movq        mm4,        t0              ; get abs (q1 - q0)
+        psubusb     mm4,        mm7
+
+        movq        mm3,        t1              ; get abs (p1 - p0)
+        psubusb     mm3,        mm7
+
+        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+        pcmpeqb     mm4,        mm0
+
+        pcmpeqb     mm0,        mm0
+        pxor        mm4,        mm0
+
+
+
+        ; start work on filters
+        lea         rdx,        srct
+
+        movq        mm2,        [rdx]           ; p1
+        movq        mm7,        [rdx+24]        ; q1
+
+        movq        mm6,        [rdx+8]         ; p0
+        movq        mm0,        [rdx+16]        ; q0
+
+        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
+        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values
+
+        psubsb      mm2,        mm7             ; p1 - q1
+        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)
+
+        pxor        mm6,        [GLOBAL(t80)]   ; offset to convert to signed values
+        pxor        mm0,        [GLOBAL(t80)]   ; offset to convert to signed values
+
+        movq        mm3,        mm0             ; q0
+        psubsb      mm0,        mm6             ; q0 - p0
+
+        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand       mm1,        mm2              ; mask filter values we don't care about
+
+        movq        mm2,        mm1
+        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+        pxor        mm0,        mm0          ;
+
+        pxor        mm5,        mm5
+        punpcklbw   mm0,        mm2         ;
+
+        punpckhbw   mm5,        mm2         ;
+        psraw       mm0,        11              ;
+
+        psraw       mm5,        11
+        packsswb    mm0,        mm5
+
+        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        pxor        mm0,        mm0           ; 0
+        movq        mm5,        mm1           ; abcdefgh
+
+        punpcklbw   mm0,        mm1           ; e0f0g0h0
+        psraw       mm0,        11                ; sign extended shift right by 3
+
+        pxor        mm1,        mm1           ; 0
+        punpckhbw   mm1,        mm5           ; a0b0c0d0
+
+        psraw       mm1,        11                ; sign extended shift right by 3
+        movq        mm5,        mm0              ; save results
+
+        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      mm5,        [GLOBAL(ones)]
+
+        paddsw      mm1,        [GLOBAL(ones)]
+        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap
+
+        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
+        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+        pandn       mm4,        mm5             ; high edge variance additive
+
+        paddsb      mm6,        mm2             ; p0+= p0 add
+        pxor        mm6,        [GLOBAL(t80)]   ; unoffset
+
+        ; mm6=p0                               ;
+        movq        mm1,        [rdx]           ; p1
+        pxor        mm1,        [GLOBAL(t80)]   ; reoffset
+
+        paddsb      mm1,        mm4                 ; p1+= p1 add
+        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
+        ; mm6 = p0 mm1 = p1
+
+        psubsb      mm3,        mm0                 ; q0-= q0 add
+        pxor        mm3,        [GLOBAL(t80)]       ; unoffset
+
+        ; mm3 = q0
+        psubsb      mm7,        mm4                 ; q1-= q1 add
+        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
+        ; mm7 = q1
+
+        ; transpose and write back
+        ; mm1 =    72 62 52 42 32 22 12 02
+        ; mm6 =    73 63 53 43 33 23 13 03
+        ; mm3 =    74 64 54 44 34 24 14 04
+        ; mm7 =    75 65 55 45 35 25 15 05
+
+        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
+        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02
+
+        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
+        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42
+
+        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
+        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44
+
+        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
+        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02
+
+        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
+        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42
+
+        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
+        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62
+
+
+        ; mm2 = 15 14 13 12 05 04 03 02
+        ; mm6 = 35 34 33 32 25 24 23 22
+        ; mm5 = 55 54 53 52 45 44 43 42
+        ; mm1 = 75 74 73 72 65 64 63 62
+
+
+
+        movd        [rsi+rax*4+2], mm2
+        psrlq       mm2,        32
+
+        movd        [rdi+rax*4+2], mm2
+        movd        [rsi+rax*2+2], mm6
+
+        psrlq       mm6,        32
+        movd        [rsi+rax+2],mm6
+
+        movd        [rsi+2],    mm1
+        psrlq       mm1,        32
+
+        movd        [rdi+2],    mm1
+        neg         rax
+
+        movd        [rdi+rax+2],mm5
+        psrlq       mm5,        32
+
+        movd        [rdi+rax*2+2], mm5
+
+        lea         rsi,        [rsi+rax*8]
+        dec         rcx
+        jnz         .next8_v
+
+    add rsp, 64
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+tfe:                            ; eight 0xfe bytes; presumably clears bit 0 of each byte ahead of a >>1 -- confirm at the use site
+    times 8 db 0xfe
+align 16
+t80:                            ; eight 0x80 bytes; pxor'ed with pixel bytes to flip the sign bit (offset to/from signed values)
+    times 8 db 0x80
+align 16
+t3:                             ; eight 0x03 bytes; added with paddsb to the filter value before its >>3
+    times 8 db 0x03
+align 16
+t4:                             ; eight 0x04 bytes; added with paddsb to the filter value before its >>3
+    times 8 db 0x04
+align 16
+ones:                           ; four 0x0001 words; added with paddsw before the extra psraw by 1 (second tap)
+    times 4 dw 0x0001
diff --git a/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c
new file mode 100644
index 0000000000..ed10127367
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c
@@ -0,0 +1,1587 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+static INLINE __m128i abs_diff(__m128i a, __m128i b) {  // per-byte unsigned |a - b|
+  return _mm_or_si128(_mm_subs_epu8(b, a), _mm_subs_epu8(a, b));  // one side is 0
+}
+
+static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,  // row q0, the first row below the edge
+                                            int p,  // byte step between consecutive rows
+                                            const unsigned char *_blimit,  // read with an aligned 16-byte load
+                                            const unsigned char *_limit,  // read with an aligned 16-byte load
+                                            const unsigned char *_thresh) {  // read with an aligned 16-byte load
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i mask, hev, flat, flat2;
+  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+  __m128i abs_p1p0;
+  // Filters 8 columns in place across the edge between rows s - p and s: reads rows s - 8p .. s + 7p, writes rows s - 7p .. s + 6p. Each qNpN holds row pN in its low 8 bytes and row qN in its high 8 bytes.
+  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));  // low half <- row p4
+  q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
+                                       (__m64 *)(s + 4 * p)));  // high half <- row q4
+  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+  q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
+                                       (__m64 *)(s + 3 * p)));
+  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
+                                       (__m64 *)(s + 2 * p)));
+  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
+                                       (__m64 *)(s + 1 * p)));
+  p1q1 = _mm_shuffle_epi32(q1p1, 78);  // swap the 64-bit halves: q1 low, p1 high
+  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
+                                       (__m64 *)(s - 0 * p)));
+  p0q0 = _mm_shuffle_epi32(q0p0, 78);  // swap the 64-bit halves: q0 low, p0 high
+  // Build hev and the filter mask; later code only consumes the low 8 bytes of each.
+  {
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+    abs_p1p0 = abs_diff(q1p1, q0p0);  // low: |p1 - p0|, high: |q1 - q0|
+    abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);  // |q1 - q0| moved down into the low half
+    fe = _mm_set1_epi8(0xfe);
+    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);  // all bits set
+    abs_p0q0 = abs_diff(q0p0, p0q0);  // |p0 - q0| in both halves
+    abs_p1q1 = abs_diff(q1p1, p1q1);  // |p1 - q1| in both halves
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);  // flat is only scratch here; low: max(|p1 - p0|, |q1 - q0|)
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);  // 0xff where that maximum is > thresh
+
+    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);  // |p0 - q0| * 2, saturating
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);  // |p1 - q1| / 2; fe keeps the 16-bit shift from leaking a bit across bytes
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(abs_p1p0, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+
+    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
+                        abs_diff(q3p3, q2p2));  // low: max(|p2 - p1|, |p3 - p2|), high: same for q
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));  // fold the q-side maxima into the low half
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);  // 0xff where the running maximum is <= limit
+  }
+
+  // lp filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);  // XOR with this flips the sign bit (unsigned <-> signed bytes)
+    const __m128i t1 = _mm_set1_epi16(0x1);
+    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+    __m128i qs0 = _mm_xor_si128(p0q0, t80);  // signed q0 in the low half
+    __m128i qs1 = _mm_xor_si128(p1q1, t80);  // signed q1 in the low half
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);  // low: (ps1 - qs1) & hev
+    work_a = _mm_subs_epi8(qs0, qs0ps0);  // low: qs0 - ps0
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    filter1 = _mm_unpacklo_epi8(zero, filter1);  // low 8 bytes become the high byte of each 16-bit lane
+    filter1 = _mm_srai_epi16(filter1, 0xB);  // 8 + 3: sign-extending >> 3 of the original byte
+    filter2 = _mm_unpacklo_epi8(zero, filter2);
+    filter2 = _mm_srai_epi16(filter2, 0xB);
+
+    // p0 += Filter2 >> 3 (low half), q0 -= Filter1 >> 3 (high half)
+    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);  // apply, then back to unsigned
+
+    // filt >> 1
+    filt = _mm_adds_epi16(filter1, t1);
+    filt = _mm_srai_epi16(filt, 1);  // (filter1 + 1) >> 1 per 16-bit lane
+    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
+                            filt);  // zero the lanes where hev is set
+    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));  // low: +filt for p1, high: -filt for q1
+    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);  // apply, then back to unsigned
+    // loopfilter done
+
+    {
+      __m128i work;
+      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+      flat = _mm_max_epu8(abs_p1p0, flat);
+      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));  // fold the q side into the low half
+      flat = _mm_subs_epu8(flat, one);
+      flat = _mm_cmpeq_epi8(flat, zero);  // 0xff where every difference above is <= 1
+      flat = _mm_and_si128(flat, mask);
+
+      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+      q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
+                                           (__m64 *)(s + 5 * p)));
+
+      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+      q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
+                                           (__m64 *)(s + 6 * p)));
+      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
+
+      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+      q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
+                                           (__m64 *)(s + 7 * p)));
+      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
+      flat2 = _mm_max_epu8(work, flat2);
+      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));  // fold the q side into the low half
+      flat2 = _mm_subs_epu8(flat2, one);
+      flat2 = _mm_cmpeq_epi8(flat2, zero);  // 0xff where rows 4..7 differ from row 0 by <= 1
+      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+    }
+
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // flat and wide flat calculations
+    {
+      const __m128i eight = _mm_set1_epi16(8);  // rounding term for the >> 4 results
+      const __m128i four = _mm_set1_epi16(4);  // rounding term for the >> 3 results
+      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+      __m128i pixelFilter_p, pixelFilter_q;
+      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+      p7_16 = _mm_unpacklo_epi8(q7p7, zero);;  // low half (p rows) zero-extended to 16 bits
+      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+      q0_16 = _mm_unpackhi_epi8(q0p0, zero);  // high half (q rows) zero-extended to 16 bits
+      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+      q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+                                    _mm_add_epi16(p4_16, p3_16));
+      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+                                    _mm_add_epi16(q4_16, q3_16));
+
+      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+      pixelFilter_p =  _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);  // p6 + ... + p0
+
+      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+      pixelFilter_q =  _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);  // q0 + ... + q6
+      pixelFilter_p =  _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
+                                                         pixelFilter_q));  // 8 + (p6..p0) + (q0..q6)
+      pixetFilter_p2p1p0 =   _mm_add_epi16(four,
+                                           _mm_add_epi16(pixetFilter_p2p1p0,
+                                                         pixetFilter_q2q1q0));  // 4 + (p2..p0) + (q0..q2)
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                           _mm_add_epi16(p7_16, p0_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                                           _mm_add_epi16(q7_16, q0_16)), 4);
+      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);  // wide (>> 4) candidates for p0 / q0
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                           _mm_add_epi16(p3_16, p0_16)), 3);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                           _mm_add_epi16(q3_16, q0_16)), 3);
+
+      flat_q0p0 = _mm_packus_epi16(res_p, res_q);  // narrow (>> 3) candidates for p0 / q0
+
+      sum_p7 = _mm_add_epi16(p7_16, p7_16);  // 2 * p7; grows by one p7 per step below
+      sum_q7 = _mm_add_epi16(q7_16, q7_16);
+      sum_p3 = _mm_add_epi16(p3_16, p3_16);  // 2 * p3; grows by one p3 per step below
+      sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);  // q-side running sum: drop the farthest p row
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);  // p-side running sum: drop the farthest q row
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p1_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q1_16)), 4);
+      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);  // narrow sums get the same treatment
+      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                             _mm_add_epi16(sum_p3, p1_16)), 3);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                             _mm_add_epi16(sum_q3, q1_16)), 3);
+      flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);  // 3 * p7
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      sum_p3 = _mm_add_epi16(sum_p3, p3_16);  // 3 * p3
+      sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p2_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q2_16)), 4);
+      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+                                           _mm_add_epi16(sum_p3, p2_16)), 3);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+                                           _mm_add_epi16(sum_q3, q2_16)), 3);
+      flat_q2p2 = _mm_packus_epi16(res_p, res_q);  // last narrow result; only the wide sums continue
+
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);  // 4 * p7
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p3_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q3_16)), 4);
+      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);  // 5 * p7
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p4_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q4_16)), 4);
+      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);  // 6 * p7
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p5_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q5_16)), 4);
+      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+      sum_p7 = _mm_add_epi16(sum_p7, p7_16);  // 7 * p7
+      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+                             _mm_add_epi16(sum_p7, p6_16)), 4);
+      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+                             _mm_add_epi16(sum_q7, q6_16)), 4);
+      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+    }
+    // wide flat
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    flat = _mm_shuffle_epi32(flat, 68);  // copy the low 64 bits into both halves so p and q rows share one mask
+    flat2 = _mm_shuffle_epi32(flat2, 68);
+
+    q2p2 = _mm_andnot_si128(flat, q2p2);
+    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+    q2p2 = _mm_or_si128(q2p2, flat_q2p2);  // flat ? narrow result : original p2/q2
+
+    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);  // flat ? narrow result : lp-filtered p1/q1
+
+    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);  // flat ? narrow result : lp-filtered p0/q0
+
+    q6p6 = _mm_andnot_si128(flat2, q6p6);
+    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);  // flat2 ? wide result : original p6/q6
+    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);  // low half -> row p6
+    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));  // high half -> row q6
+
+    q5p5 = _mm_andnot_si128(flat2, q5p5);
+    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+
+    q4p4 = _mm_andnot_si128(flat2, q4p4);
+    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+
+    q3p3 = _mm_andnot_si128(flat2, q3p3);
+    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);  // flat2 ? wide result : original p3/q3
+    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+
+    q2p2 = _mm_andnot_si128(flat2, q2p2);
+    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);  // flat2 ? wide result : the flat-blended value from above
+    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+
+    q1p1 = _mm_andnot_si128(flat2, q1p1);
+    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+
+    q0p0 = _mm_andnot_si128(flat2, q0p0);
+    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+    _mm_storeh_pi((__m64 *)(s - 0 * p),  _mm_castsi128_ps(q0p0));  // rows p7/q7 are read above but never written
+  }
+}
+
+static INLINE __m128i filter_add2_sub2(const __m128i *const total,  // running sum
+                                       const __m128i *const a1,  // term to add
+                                       const __m128i *const a2,  // term to add
+                                       const __m128i *const s1,  // term to remove
+                                       const __m128i *const s2) {  // term to remove
+  const __m128i leaving = _mm_add_epi16(*s1, *s2);
+  const __m128i grown = _mm_add_epi16(*a1, *total);
+  return _mm_add_epi16(_mm_sub_epi16(grown, leaving), *a2);  // 16-bit lanes, wrapping
+}
+
+static INLINE __m128i filter8_mask(const __m128i *const flat,  // byte select mask
+                                   const __m128i *const other_filt,  // used where flat is 0
+                                   const __m128i *const f8_lo,  // 16-bit sums for bytes 0-7
+                                   const __m128i *const f8_hi) {  // 16-bit sums for bytes 8-15
+  const __m128i lo = _mm_srli_epi16(*f8_lo, 3);  // sum >> 3
+  const __m128i hi = _mm_srli_epi16(*f8_hi, 3);
+  const __m128i kept = _mm_andnot_si128(*flat, *other_filt);
+  return _mm_or_si128(kept, _mm_and_si128(*flat, _mm_packus_epi16(lo, hi)));
+}
+
+static INLINE __m128i filter16_mask(const __m128i *const flat,  // byte select mask
+                                    const __m128i *const other_filt,  // used where flat is 0
+                                    const __m128i *const f_lo,  // 16-bit sums for bytes 0-7
+                                    const __m128i *const f_hi) {  // 16-bit sums for bytes 8-15
+  const __m128i lo = _mm_srli_epi16(*f_lo, 4);  // sum >> 4
+  const __m128i hi = _mm_srli_epi16(*f_hi, 4);
+  const __m128i kept = _mm_andnot_si128(*flat, *other_filt);
+  return _mm_or_si128(kept, _mm_and_si128(*flat, _mm_packus_epi16(lo, hi)));
+}
+
+static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
+                                             int p,
+                                             const unsigned char *_blimit,
+                                             const unsigned char *_limit,
+                                             const unsigned char *_thresh) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i mask, hev, flat, flat2;
+  __m128i p7, p6, p5;
+  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+  __m128i q5, q6, q7;
+
+  __m128i op2, op1, op0, oq0, oq1, oq2;
+
+  __m128i max_abs_p1p0q1q0;
+
+  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
+  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
+  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+
+  {
+    const __m128i abs_p1p0 = abs_diff(p1, p0);
+    const __m128i abs_q1q0 = abs_diff(q1, q0);
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+    __m128i abs_p0q0 = abs_diff(p0, q0);
+    __m128i abs_p1q1 = abs_diff(p1, q1);
+    __m128i work;
+    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
+    mask = _mm_max_epu8(work, mask);
+    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
+
+  {
+    __m128i work;
+    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    flat2 = _mm_subs_epu8(flat2, one);
+    flat2 = _mm_cmpeq_epi8(flat2, zero);
+    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  }
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // filter4
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i te0 = _mm_set1_epi8(0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    op1 = _mm_xor_si128(p1, t80);
+    op0 = _mm_xor_si128(p0, t80);
+    oq0 = _mm_xor_si128(q0, t80);
+    oq1 = _mm_xor_si128(q1, t80);
+
+    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+    work_a = _mm_subs_epi8(oq0, op0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
+    filt = _mm_and_si128(filt, mask);
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    // Filter1 >> 3
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+    // Filter2 >> 3
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+    // filt >> 1
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+    filt = _mm_andnot_si128(hev, filt);
+    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+    // loopfilter done
+
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // filter8
+    {
+      const __m128i four = _mm_set1_epi16(4);
+      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+      __m128i f8_lo, f8_hi;
+
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+                            _mm_add_epi16(p3_lo, p2_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+                            _mm_add_epi16(p2_lo, p1_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+                            _mm_add_epi16(p3_hi, p2_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+                            _mm_add_epi16(p2_hi, p1_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
+    }
+
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // wide flat calculations
+    {
+      const __m128i eight = _mm_set1_epi16(8);
+      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
+      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
+      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
+      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
+      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
+      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
+      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
+      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
+
+      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
+      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
+      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
+      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
+      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
+      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
+      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
+      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
+
+      __m128i f_lo;
+      __m128i f_hi;
+
+      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
+      f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1),
+                           _mm_add_epi16(p4_lo, f_lo));
+      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
+                           _mm_add_epi16(p2_lo, p1_lo));
+      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
+      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
+
+      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
+      f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1),
+                           _mm_add_epi16(p4_hi, f_hi));
+      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
+                           _mm_add_epi16(p2_hi, p1_hi));
+      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
+      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
+
+      p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+
+      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
+      p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+
+      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
+      p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+
+      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
+      p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+
+      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
+      op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+
+      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
+      op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
+      op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
+      oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
+      oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
+      oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
+      q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
+      q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
+      q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
+      q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+    }
+    // wide flat
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  }
+}
+
+// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
+void vpx_lpf_horizontal_16_sse2(
+    unsigned char *s, int p, const unsigned char *_blimit,
+    const unsigned char *_limit, const unsigned char *_thresh, int count) {
+  // Only count == 1 takes the _8 kernel; every other value takes the _16 one.
+  if (count != 1)
+    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
+  else
+    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
+}
+
+// Loop filter across a horizontal edge, 8 pixels wide (SSE2).  Reads rows
+// s - 4 * p .. s + 3 * p and rewrites rows s - 3 * p .. s + 2 * p, 8 bytes
+// each.  _blimit/_limit/_thresh must be 16-byte aligned; count is unused.
+void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
+                               const unsigned char *_blimit,
+                               const unsigned char *_limit,
+                               const unsigned char *_thresh, int count) {
+  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);  // flat-path results
+  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i mask, hev, flat;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
+  (void)count;
+  // qNpN holds row pN in its low 64 bits and row qN in its high 64 bits.
+  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+  p1q1 = _mm_shuffle_epi32(q1p1, 78);  // 78 swaps the two 64-bit halves
+  p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+  {
+    // filter_mask and hev_mask
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+    abs_p1p0 = abs_diff(q1p1, q0p0);
+    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);  // bring the q half down
+
+    abs_p0q0 = abs_diff(q0p0, p0q0);
+    abs_p1q1 = abs_diff(q1p1, p1q1);
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(abs_p1p0, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+
+    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
+                        abs_diff(q3p3, q2p2));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));  // fold q onto p half
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+
+    // flat_mask4
+    flat = _mm_max_epu8(abs_diff(q2p2, q0p0),
+                        abs_diff(q3p3, q0p0));
+    flat = _mm_max_epu8(abs_p1p0, flat);
+    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+  }
+
+  {
+    const __m128i four = _mm_set1_epi16(4);
+    {
+      __m128i workp_a, workp_b, workp_shft;
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s - 4 * p)), zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s - 3 * p)), zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s - 2 * p)), zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s - 1 * p)), zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s - 0 * p)), zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 1 * p)), zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p)), zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 3 * p)), zero);
+      // op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
+      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op2[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+      // op1 = (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op1[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+      // op0 = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op0[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+      // oq0 = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq0[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+      // oq1 = (p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq1[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+      // oq2 = (p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4) >> 3
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq2[0],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+    }
+  }
+  // lp filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                                      t80);
+    const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                                      t80);
+    const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
+                                      t80);
+    const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
+                                      t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    // Filter1 >> 3
+    filter1 = _mm_unpacklo_epi8(zero, filter1);
+    filter1 = _mm_srai_epi16(filter1, 11);
+    filter1 = _mm_packs_epi16(filter1, filter1);
+
+    // Filter2 >> 3
+    filter2 = _mm_unpacklo_epi8(zero, filter2);
+    filter2 = _mm_srai_epi16(filter2, 11);
+    filter2 = _mm_packs_epi16(filter2, zero);
+
+    // filt >> 1
+    filt = _mm_adds_epi8(filter1, t1);
+    filt = _mm_unpacklo_epi8(zero, filt);
+    filt = _mm_srai_epi16(filt, 9);
+    filt = _mm_packs_epi16(filt, zero);
+
+    filt = _mm_andnot_si128(hev, filt);
+    // Per byte: the flat_* scratch value where flat is set, else work_a.
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q0 = _mm_and_si128(flat, q0);
+    q0 = _mm_or_si128(work_a, q0);
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q1 = _mm_and_si128(flat, q1);
+    q1 = _mm_or_si128(work_a, q1);
+    // q2: keep the source row unless flat; only 8 bytes belong to this edge.
+    work_a = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q2 = _mm_and_si128(flat, q2);
+    q2 = _mm_or_si128(work_a, q2);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p0 = _mm_and_si128(flat, p0);
+    p0 = _mm_or_si128(work_a, p0);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p1 = _mm_and_si128(flat, p1);
+    p1 = _mm_or_si128(work_a, p1);
+    // p2: same as q2, an 8-byte load of the row that is passed through.
+    work_a = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p2 = _mm_and_si128(flat, p2);
+    p2 = _mm_or_si128(work_a, p2);
+
+    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
+    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
+    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
+    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
+    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+  }
+}
+
+// Loop filter across a horizontal edge for two adjacent 8-pixel segments in
+// one pass (SSE2).  Bytes 0-7 of each row use the *0 thresholds and bytes
+// 8-15 the *1 thresholds; every threshold pointer is read with a 16-byte
+// aligned load.  Rows s - 4 * p .. s + 3 * p are read and rows s - 3 * p ..
+// s + 2 * p are rewritten, 16 bytes each.
+void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
+                                    const uint8_t *_blimit0,
+                                    const uint8_t *_limit0,
+                                    const uint8_t *_thresh0,
+                                    const uint8_t *_blimit1,
+                                    const uint8_t *_limit1,
+                                    const uint8_t *_thresh1) {
+  // Scratch rows for the flat-path results; the loop below fills bytes 0-7
+  // on its first pass and bytes 8-15 on its second.
+  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+  const __m128i zero = _mm_set1_epi16(0);
+  // Low 64 bits: thresholds for the first segment; high 64 bits: the second.
+  const __m128i blimit = _mm_unpacklo_epi64(
+      _mm_load_si128((const __m128i *)_blimit0),
+      _mm_load_si128((const __m128i *)_blimit1));
+  const __m128i limit = _mm_unpacklo_epi64(
+      _mm_load_si128((const __m128i *)_limit0),
+      _mm_load_si128((const __m128i *)_limit1));
+  const __m128i thresh = _mm_unpacklo_epi64(
+      _mm_load_si128((const __m128i *)_thresh0),
+      _mm_load_si128((const __m128i *)_thresh1));
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+  __m128i mask, hev, flat;
+
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+
+  {
+    // |a - b| per byte, built from two unsigned saturating subtractions.
+    const __m128i d_p1p0 =
+        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+    const __m128i d_q1q0 =
+        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+    const __m128i d_p0q0 =
+        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+    const __m128i d_p1q1 =
+        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(d_p1p0, d_p1p0);
+    __m128i tmp;
+
+    // filter_mask and hev_mask
+    flat = _mm_max_epu8(d_p1p0, d_q1q0);
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    // mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    tmp = _mm_adds_epu8(_mm_adds_epu8(d_p0q0, d_p0q0),
+                        _mm_srli_epi16(_mm_and_si128(d_p1q1, fe), 1));
+    mask = _mm_subs_epu8(tmp, blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // Take the running maximum of that flag and every neighbour difference;
+    // a lane passes only if the maximum does not exceed limit.
+    mask = _mm_max_epu8(flat, mask);
+    tmp = _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2));
+    mask = _mm_max_epu8(mask, tmp);
+    tmp = _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3));
+    mask = _mm_max_epu8(mask, tmp);
+    tmp = _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2));
+    mask = _mm_max_epu8(mask, tmp);
+    tmp = _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3));
+    mask = _mm_max_epu8(mask, tmp);
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+
+    // flat_mask4
+    tmp = _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2));
+    flat = _mm_max_epu8(flat, tmp);
+    tmp = _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2));
+    flat = _mm_max_epu8(flat, tmp);
+    tmp = _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3));
+    flat = _mm_max_epu8(flat, tmp);
+    tmp = _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3));
+    flat = _mm_max_epu8(flat, tmp);
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+  }
+
+  {
+    const __m128i four = _mm_set1_epi16(4);
+    int i;
+
+    // Two passes of 8 pixels each: widen the rows to 16-bit lanes, then form
+    // each rounded sum by updating the two running partial sums.
+    for (i = 0; i < 2; ++i) {
+      unsigned char *col = s + 8 * i;
+      __m128i sum_a, sum_b, avg;
+
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col - 4 * p)), zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col - 3 * p)), zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col - 2 * p)), zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col - 1 * p)), zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col + 0 * p)), zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col + 1 * p)), zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col + 2 * p)), zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col + 3 * p)), zero);
+
+      // op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
+      sum_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+      sum_a = _mm_add_epi16(_mm_add_epi16(sum_a, four), p0);
+      sum_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+      avg = _mm_srli_epi16(_mm_add_epi16(sum_a, sum_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op2[8 * i],
+                       _mm_packus_epi16(avg, avg));
+
+      // op1 = (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3
+      sum_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+      avg = _mm_srli_epi16(_mm_add_epi16(sum_a, sum_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op1[8 * i],
+                       _mm_packus_epi16(avg, avg));
+
+      // op0 = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3
+      sum_a = _mm_add_epi16(_mm_sub_epi16(sum_a, p3), q2);
+      sum_b = _mm_add_epi16(_mm_sub_epi16(sum_b, p1), p0);
+      avg = _mm_srli_epi16(_mm_add_epi16(sum_a, sum_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op0[8 * i],
+                       _mm_packus_epi16(avg, avg));
+
+      // oq0 = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3
+      sum_a = _mm_add_epi16(_mm_sub_epi16(sum_a, p3), q3);
+      sum_b = _mm_add_epi16(_mm_sub_epi16(sum_b, p0), q0);
+      avg = _mm_srli_epi16(_mm_add_epi16(sum_a, sum_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq0[8 * i],
+                       _mm_packus_epi16(avg, avg));
+
+      // oq1 = (p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3
+      sum_a = _mm_add_epi16(_mm_sub_epi16(sum_a, p2), q3);
+      sum_b = _mm_add_epi16(_mm_sub_epi16(sum_b, q0), q1);
+      avg = _mm_srli_epi16(_mm_add_epi16(sum_a, sum_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq1[8 * i],
+                       _mm_packus_epi16(avg, avg));
+
+      // oq2 = (p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4) >> 3
+      sum_a = _mm_add_epi16(_mm_sub_epi16(sum_a, p1), q3);
+      sum_b = _mm_add_epi16(_mm_sub_epi16(sum_b, q1), q2);
+      avg = _mm_srli_epi16(_mm_add_epi16(sum_a, sum_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq2[8 * i],
+                       _mm_packus_epi16(avg, avg));
+    }
+  }
+
+  // lp filter
+  {
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i te0 = _mm_set1_epi8(0xe0);
+
+    const __m128i ps1 =
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+    const __m128i ps0 =
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+    const __m128i qs0 =
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+    const __m128i qs1 =
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+    __m128i filt, filter1, filter2, diff, sign, wide;
+
+    // filt = (((ps1 - qs1) & hev) + 3 * (qs0 - ps0)) & mask, saturating
+    diff = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    filt = _mm_adds_epi8(filt, diff);
+    filt = _mm_adds_epi8(filt, diff);
+    filt = _mm_adds_epi8(filt, diff);
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    // SSE2 has no arithmetic shift for bytes, so each >> below is a 16-bit
+    // logical shift, a mask of the kept bits, then an OR of the sign bits.
+    // Filter1 >> 3
+    sign = _mm_and_si128(_mm_cmpgt_epi8(zero, filter1), te0);
+    filter1 = _mm_and_si128(_mm_srli_epi16(filter1, 3), t1f);
+    filter1 = _mm_or_si128(filter1, sign);
+
+    // Filter2 >> 3
+    sign = _mm_and_si128(_mm_cmpgt_epi8(zero, filter2), te0);
+    filter2 = _mm_and_si128(_mm_srli_epi16(filter2, 3), t1f);
+    filter2 = _mm_or_si128(filter2, sign);
+
+    // filt >> 1
+    filt = _mm_adds_epi8(filter1, t1);
+    sign = _mm_and_si128(_mm_cmpgt_epi8(zero, filt), t80);
+    filt = _mm_and_si128(_mm_srli_epi16(filt, 1), t7f);
+    filt = _mm_or_si128(filt, sign);
+    filt = _mm_andnot_si128(hev, filt);
+
+    // Per byte: take the flat_* scratch value where flat is set, otherwise
+    // the value adjusted by filter1/filter2/filt (t80 undoes the sign flip).
+    wide = _mm_load_si128((__m128i *)flat_oq0);
+    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    q0 = _mm_or_si128(_mm_andnot_si128(flat, q0), _mm_and_si128(flat, wide));
+
+    wide = _mm_load_si128((__m128i *)flat_oq1);
+    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    q1 = _mm_or_si128(_mm_andnot_si128(flat, q1), _mm_and_si128(flat, wide));
+
+    // q2 is left as loaded from the frame wherever flat is clear.
+    wide = _mm_load_si128((__m128i *)flat_oq2);
+    q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    q2 = _mm_or_si128(_mm_andnot_si128(flat, q2), _mm_and_si128(flat, wide));
+
+    wide = _mm_load_si128((__m128i *)flat_op0);
+    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    p0 = _mm_or_si128(_mm_andnot_si128(flat, p0), _mm_and_si128(flat, wide));
+
+    wide = _mm_load_si128((__m128i *)flat_op1);
+    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+    p1 = _mm_or_si128(_mm_andnot_si128(flat, p1), _mm_and_si128(flat, wide));
+
+    // p2 is left as loaded from the frame wherever flat is clear.
+    wide = _mm_load_si128((__m128i *)flat_op2);
+    p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    p2 = _mm_or_si128(_mm_andnot_si128(flat, p2), _mm_and_si128(flat, wide));
+
+    // Every load from the frame is done before the first row is written back.
+    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+  }
+}
+
+// Loop filter across a horizontal edge for two adjacent 8-pixel segments in
+// one pass (SSE2); only rows s - 2 * p .. s + 1 * p are rewritten.  Bytes 0-7
+// of each row use the *0 thresholds and bytes 8-15 the *1 thresholds; every
+// threshold pointer is read with a 16-byte aligned load.
+void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
+                                    const unsigned char *_blimit0,
+                                    const unsigned char *_limit0,
+                                    const unsigned char *_thresh0,
+                                    const unsigned char *_blimit1,
+                                    const unsigned char *_limit1,
+                                    const unsigned char *_thresh1) {
+  const __m128i zero = _mm_set1_epi16(0);
+  // Low 64 bits: thresholds for the first segment; high 64 bits: the second.
+  const __m128i blimit = _mm_unpacklo_epi64(
+      _mm_load_si128((const __m128i *)_blimit0),
+      _mm_load_si128((const __m128i *)_blimit1));
+  const __m128i limit = _mm_unpacklo_epi64(
+      _mm_load_si128((const __m128i *)_limit0),
+      _mm_load_si128((const __m128i *)_limit1));
+  const __m128i thresh = _mm_unpacklo_epi64(
+      _mm_load_si128((const __m128i *)_thresh0),
+      _mm_load_si128((const __m128i *)_thresh1));
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+  __m128i mask, hev;
+
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+
+  // filter_mask and hev_mask
+  {
+    // |a - b| per byte, built from two unsigned saturating subtractions.
+    const __m128i d_p1p0 =
+        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+    const __m128i d_q1q0 =
+        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+    const __m128i d_p0q0 =
+        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+    const __m128i d_p1q1 =
+        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(d_p1p0, d_p1p0);
+    __m128i d_max, tmp;
+
+    d_max = _mm_max_epu8(d_p1p0, d_q1q0);
+    hev = _mm_subs_epu8(d_max, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    // mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    tmp = _mm_adds_epu8(_mm_adds_epu8(d_p0q0, d_p0q0),
+                        _mm_srli_epi16(_mm_and_si128(d_p1q1, fe), 1));
+    mask = _mm_subs_epu8(tmp, blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // Take the running maximum of that flag and every neighbour difference;
+    // a lane passes only if the maximum does not exceed limit.
+    mask = _mm_max_epu8(d_max, mask);
+    tmp = _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2));
+    mask = _mm_max_epu8(mask, tmp);
+    tmp = _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3));
+    mask = _mm_max_epu8(mask, tmp);
+    tmp = _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2));
+    mask = _mm_max_epu8(mask, tmp);
+    tmp = _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3));
+    mask = _mm_max_epu8(mask, tmp);
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+  }
+
+  // filter4
+  {
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i te0 = _mm_set1_epi8(0xe0);
+
+    const __m128i ps1 =
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+    const __m128i ps0 =
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+    const __m128i qs0 =
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+    const __m128i qs1 =
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+    __m128i filt, filter1, filter2, diff, sign;
+
+    // filt = (((ps1 - qs1) & hev) + 3 * (qs0 - ps0)) & mask, saturating
+    diff = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    filt = _mm_adds_epi8(filt, diff);
+    filt = _mm_adds_epi8(filt, diff);
+    filt = _mm_adds_epi8(filt, diff);
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    // SSE2 has no arithmetic shift for bytes, so each >> below is a 16-bit
+    // logical shift, a mask of the kept bits, then an OR of the sign bits.
+    // Filter1 >> 3
+    sign = _mm_and_si128(_mm_cmpgt_epi8(zero, filter1), te0);
+    filter1 = _mm_and_si128(_mm_srli_epi16(filter1, 3), t1f);
+    filter1 = _mm_or_si128(filter1, sign);
+
+    // Filter2 >> 3
+    sign = _mm_and_si128(_mm_cmpgt_epi8(zero, filter2), te0);
+    filter2 = _mm_and_si128(_mm_srli_epi16(filter2, 3), t1f);
+    filter2 = _mm_or_si128(filter2, sign);
+
+    // filt >> 1
+    filt = _mm_adds_epi8(filter1, t1);
+    sign = _mm_and_si128(_mm_cmpgt_epi8(zero, filt), t80);
+    filt = _mm_and_si128(_mm_srli_epi16(filt, 1), t7f);
+    filt = _mm_or_si128(filt, sign);
+    filt = _mm_andnot_si128(hev, filt);
+
+    // Apply the adjustments with signed saturation; the xor with t80 undoes
+    // the sign flip that was applied when ps1/ps0/qs0/qs1 were loaded.
+    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+
+    // Every load from the frame is done before the first row is written back.
+    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+  }
+}
+
+static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
+                                 int in_p, unsigned char *out, int out_p) {
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i x8, x9, x10, x11, x12, x13, x14, x15;
+  // Reads 8 bytes from 8 rows each of in0/in1; writes 8 rows of 16 bytes.
+  // 2-way interleave w/hoisting of unpacks
+  x0 = _mm_loadl_epi64((__m128i *)in0);  // 1
+  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
+  x0 = _mm_unpacklo_epi8(x0, x1);  // 1
+
+  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
+  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));  // 7
+  x1 = _mm_unpacklo_epi8(x2, x3);  // 2
+
+  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));  // 9
+  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));  // 11
+  x2 = _mm_unpacklo_epi8(x4, x5);  // 3
+
+  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));  // 13
+  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));  // 15
+  x3 = _mm_unpacklo_epi8(x6, x7);  // 4
+  x4 = _mm_unpacklo_epi16(x0, x1);  // 9
+
+  x8 = _mm_loadl_epi64((__m128i *)in1);  // 2
+  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
+  x8 = _mm_unpacklo_epi8(x8, x9);  // 5
+  x5 = _mm_unpacklo_epi16(x2, x3);  // 10
+
+  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
+  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));  // 8
+  x9 = _mm_unpacklo_epi8(x10, x11);  // 6
+
+  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));  // 10
+  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));  // 12
+  x10 = _mm_unpacklo_epi8(x12, x13);  // 7
+  x12 = _mm_unpacklo_epi16(x8, x9);  // 11
+
+  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));  // 14
+  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));  // 16
+  x11 = _mm_unpacklo_epi8(x14, x15);  // 8
+  x13 = _mm_unpacklo_epi16(x10, x11);  // 12
+  // After the 32-bit interleave each 64-bit half is one input column x 8 rows.
+  x6 = _mm_unpacklo_epi32(x4, x5);  // 13
+  x7 = _mm_unpackhi_epi32(x4, x5);  // 14
+  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
+  x15 = _mm_unpackhi_epi32(x12, x13);  // 16
+
+  // Store first 4-line result
+  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
+  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
+  // The upper halves of the byte interleaves hold input columns 4..7.
+  x4 = _mm_unpackhi_epi16(x0, x1);
+  x5 = _mm_unpackhi_epi16(x2, x3);
+  x12 = _mm_unpackhi_epi16(x8, x9);
+  x13 = _mm_unpackhi_epi16(x10, x11);
+
+  x6 = _mm_unpacklo_epi32(x4, x5);
+  x7 = _mm_unpackhi_epi32(x4, x5);
+  x14 = _mm_unpacklo_epi32(x12, x13);
+  x15 = _mm_unpackhi_epi32(x12, x13);
+
+  // Store second 4-line result
+  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
+  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
+}
+
+static INLINE void transpose(unsigned char *src[], int in_p,
+                             unsigned char *dst[], int out_p,
+                             int num_8x8_to_transpose) {
+  int idx8x8 = 0;  // index into src[]/dst[] of the 8x8 block being processed
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  do {
+    unsigned char *in = src[idx8x8];
+    unsigned char *out = dst[idx8x8];
+    // Transpose the 8x8 byte block at in (stride in_p) to out (stride out_p).
+    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
+    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
+    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+    x0 = _mm_unpacklo_epi8(x0, x1);
+
+    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
+    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+    x1 = _mm_unpacklo_epi8(x2, x3);
+
+    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
+    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+    x2 = _mm_unpacklo_epi8(x4, x5);
+
+    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
+    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+    x3 = _mm_unpacklo_epi8(x6, x7);
+
+    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    x4 = _mm_unpacklo_epi16(x0, x1);
+    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+    x5 = _mm_unpacklo_epi16(x2, x3);
+    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    _mm_storel_pd((double *)(out + 0*out_p),
+                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
+    _mm_storeh_pd((double *)(out + 1*out_p),
+                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
+    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi32(x4, x5);
+    _mm_storel_pd((double *)(out + 2*out_p),
+                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
+    _mm_storeh_pd((double *)(out + 3*out_p),
+                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
+    // Input columns 4..7 come from the upper halves of x0..x3.
+    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+    x4 = _mm_unpackhi_epi16(x0, x1);
+    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+    x5 = _mm_unpackhi_epi16(x2, x3);
+    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    _mm_storel_pd((double *)(out + 4*out_p),
+                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
+    _mm_storeh_pd((double *)(out + 5*out_p),
+                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
+    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
+    _mm_storel_pd((double *)(out + 6*out_p),
+                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
+    _mm_storeh_pd((double *)(out + 7*out_p),
+                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
+  } while (++idx8x8 < num_8x8_to_transpose);  // body always runs at least once
+}
+
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0,
+                                  const uint8_t *blimit1,
+                                  const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  // Filter a vertical edge over 16 rows by working on a transposed copy.
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+  unsigned char *frame_blocks[2];
+  unsigned char *scratch_blocks[2];
+
+  frame_blocks[0] = s - 4;          // rows 0..7, starting 4 pixels left of s
+  frame_blocks[1] = s - 4 + p * 8;  // rows 8..15
+  scratch_blocks[0] = t_dst;
+  scratch_blocks[1] = t_dst + 8;
+
+  // Frame columns become 16-byte scratch rows; row 4 is the column at s.
+  transpose8x16(frame_blocks[0], frame_blocks[1], p, t_dst, 16);
+  vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+                                 blimit1, limit1, thresh1);
+
+  // Undo the transpose, one 8x8 block per group of 8 frame rows.
+  transpose(scratch_blocks, 16, frame_blocks, p, 2);
+}
+
+void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
+                             const unsigned char *blimit,
+                             const unsigned char *limit,
+                             const unsigned char *thresh, int count) {
+  // Filter a vertical edge over 8 rows by working on a transposed copy of
+  // the 8x8 block around it.
+  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
+  unsigned char *frame_block[1];
+  unsigned char *scratch_block[1];
+  (void)count;  // not used by this implementation
+
+  frame_block[0] = s - 4;  // block starts 4 pixels left of s
+  scratch_block[0] = t_dst;
+
+  // Frame columns become 8-byte scratch rows.
+  transpose(frame_block, p, scratch_block, 8, 1);
+
+  // Scratch row 4 holds the frame column at s; run the horizontal filter
+  // there with a stride of 8.
+  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
+
+  // Undo the transpose to write the filtered block back into the frame.
+  transpose(scratch_block, 8, frame_block, p, 1);
+}
+
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0,
+                                  const uint8_t *blimit1,
+                                  const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  // Filter a vertical edge over 16 rows by working on a transposed copy.
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+  unsigned char *frame_blocks[2];
+  unsigned char *scratch_blocks[2];
+
+  frame_blocks[0] = s - 4;          // rows 0..7, starting 4 pixels left of s
+  frame_blocks[1] = s - 4 + p * 8;  // rows 8..15
+  scratch_blocks[0] = t_dst;
+  scratch_blocks[1] = t_dst + 8;
+
+  // Frame columns become 16-byte scratch rows; row 4 is the column at s,
+  // which is where the horizontal filter is applied.
+  transpose8x16(frame_blocks[0], frame_blocks[1], p, t_dst, 16);
+  vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+                                 blimit1, limit1, thresh1);
+
+  // Undo the transpose, one 8x8 block per group of 8 frame rows.
+  transpose(scratch_blocks, 16, frame_blocks, p, 2);
+}
+
+void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
+                              const unsigned char *blimit,
+                              const unsigned char *limit,
+                              const unsigned char *thresh) {
+  // Filter a vertical edge over 8 rows, using 8 pixels on each side of s,
+  // by working on a transposed copy.
+  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
+  unsigned char *frame_blocks[2];
+  unsigned char *scratch_blocks[2];
+
+  frame_blocks[0] = s - 8;            // 8x8 block ending just before s
+  frame_blocks[1] = s;                // 8x8 block starting at s
+  scratch_blocks[0] = t_dst;          // scratch rows 0..7
+  scratch_blocks[1] = t_dst + 8 * 8;  // scratch rows 8..15
+
+  // Frame columns become 8-byte scratch rows (16 rows in total).
+  transpose(frame_blocks, p, scratch_blocks, 8, 2);
+
+  // Scratch row 8 holds the frame column at s; run the wide horizontal
+  // filter there with a stride of 8.
+  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);
+
+  // Undo the transpose to write both filtered blocks back into the frame,
+  // reusing the same pointer tables with their roles swapped.
+  transpose(scratch_blocks, 8, frame_blocks, p, 2);
+}
+
+
+void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
+                                   const uint8_t *blimit, const uint8_t *limit,
+                                   const uint8_t *thresh) {
+  // Filter a vertical edge over 16 rows using a transposed 16x16 scratch.
+  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
+  unsigned char *const left = s - 8;                // 8 pixels left of s
+  unsigned char *const edge_rows = t_dst + 8 * 16;  // scratch rows 8..15
+  const int lower = 8 * p;                          // offset to frame row 8
+
+  // Frame columns become 16-byte scratch rows; row 8 is the column at s.
+  transpose8x16(left, left + lower, p, t_dst, 16);
+  transpose8x16(s, s + lower, p, edge_rows, 16);
+  mb_lpf_horizontal_edge_w_sse2_16(edge_rows, 16, blimit, limit, thresh);
+  // Undo the transpose: frame rows 0..7 first, then rows 8..15.
+  transpose8x16(t_dst, edge_rows, 16, left, p);
+  transpose8x16(t_dst + 8, edge_rows + 8, 16, left + lower, p);
+}
diff --git a/libs/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm b/libs/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm
new file mode 100644
index 0000000000..01c41291be
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm
@@ -0,0 +1,544 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, \
+                                eob, scan, iscan
+  ; %1 names the variant (instantiated as b and b_32x32); %2 goes to cglobal.
+  vzeroupper
+
+  ; If we can skip this block, then just zero the output
+  cmp                         skipmp, 0
+  jne .blank
+
+%ifnidn %1, b_32x32
+  ; (this fast path is assembled for every variant except b_32x32)
+  ; Special case for ncoeff == 16, as it is frequent and we can save on
+  ; not setting up a loop.
+  cmp                       ncoeffmp, 16
+  jne .generic
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Special case of ncoeff == 16
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+.single:
+
+  movifnidn                   coeffq, coeffmp
+  movifnidn                    zbinq, zbinmp
+  mova                            m0, [zbinq]              ; m0 = zbin
+
+  ; Get DC and first 15 AC coeffs - in this special case, that is all.
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; coeff stored as 32bit numbers but we process them as 16 bit numbers
+  mova                            m9, [coeffq]
+  packssdw                        m9, [coeffq+16]          ; m9 = c[i]
+  mova                           m10, [coeffq+32]
+  packssdw                       m10, [coeffq+48]          ; m10 = c[i]
+%else
+  mova                            m9, [coeffq]             ; m9 = c[i]
+  mova                           m10, [coeffq+16]          ; m10 = c[i]
+%endif
+
+  mov                             r0, eobmp                ; Output pointer
+  mov                             r1, qcoeffmp             ; Output pointer
+  mov                             r2, dqcoeffmp            ; Output pointer
+
+  pxor                            m5, m5                   ; m5 = dedicated zero
+
+  pcmpeqw                         m4, m4                   ; All word lanes -1
+  paddw                           m0, m4                   ; m0 = zbin - 1
+
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = abs(c[i]) >= zbin
+  punpckhqdq                      m0, m0                   ; high qword of m0 in both halves
+  pcmpgtw                        m12, m11, m0              ; m12 = abs(c[i]) >= zbin
+
+  ; Check if all coeffs are less than zbin. If yes, we just write zeros
+  ; to the outputs and we are done.
+  por                            m14, m7, m12
+  ptest                          m14, m14
+  jnz .single_nonzero
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova                       [r1   ], ymm5
+  mova                       [r1+32], ymm5
+  mova                       [r2   ], ymm5
+  mova                       [r2+32], ymm5
+%else
+  mova                          [r1], ymm5                 ; 32-byte store: 16 zeroed words
+  mova                          [r2], ymm5                 ; 32-byte store: 16 zeroed words
+%endif
+  mov                           [r0], word 0               ; *eob = 0
+
+  vzeroupper
+  RET
+
+.single_nonzero:
+
+  ; Actual quantization of size 16 block - setup pointers, rounders, etc.
+  movifnidn                       r4, roundmp
+  movifnidn                       r5, quantmp
+  mov                             r3, dequantmp
+  mov                             r6, shiftmp
+  mova                            m1, [r4]              ; m1 = round
+  mova                            m2, [r5]              ; m2 = quant
+  mova                            m3, [r3]              ; m3 = dequant
+  mova                            m4, [r6]              ; m4 = shift
+
+  mov                             r3, iscanmp
+
+  DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
+  ; r0-r3 were loaded with eob/qcoeff/dqcoeff/iscan above; name them.
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                           m8, m6                   ; m8 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  punpckhqdq                      m4, m4
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                            m8, m7
+  pand                           m13, m12
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  pcmpgtw                         m6, m5, m8
+  punpckhwd                       m6, m8, m6
+  pmovsxwd                       m11, m8
+  mova                  [qcoeffq   ], m11
+  mova                  [qcoeffq+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova                  [qcoeffq+32], m11
+  mova                  [qcoeffq+48], m6
+%else
+  mova                  [qcoeffq   ], m8
+  mova                  [qcoeffq+16], m13
+%endif
+
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  pcmpgtw                         m6, m5, m8
+  punpckhwd                       m6, m8, m6
+  pmovsxwd                       m11, m8
+  mova                 [dqcoeffq   ], m11
+  mova                 [dqcoeffq+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova                 [dqcoeffq+32], m11
+  mova                 [dqcoeffq+48], m6
+%else
+  mova                 [dqcoeffq   ], m8
+  mova                 [dqcoeffq+16], m13
+%endif
+
+  mova                            m6, [iscanq]            ; m6 = scan[i]
+  mova                           m11, [iscanq+16]         ; m11 = scan[i]
+
+  pcmpeqw                         m8,  m8,  m5            ; m8 = dqc[i] == 0
+  pcmpeqw                        m13, m13,  m5            ; m13 = dqc[i] == 0
+  psubw                           m6,  m6,  m7            ; m6 = scan[i] + 1
+  psubw                          m11, m11, m12            ; m11 = scan[i] + 1
+  pandn                           m8,  m8,  m6            ; m8 = max(eob)
+  pandn                          m13, m13, m11            ; m13 = max(eob)
+  pmaxsw                          m8,  m8, m13
+
+  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  movq                           rax, m8
+  mov                         [eobq], ax
+
+  vzeroupper
+  RET
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; Generic case of ncoeff != 16
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+.generic:
+
+%endif ; %ifnidn %1, b_32x32
+
+DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
+            qcoeff, dqcoeff, dequant, eob, scan, iscan
+
+  ; Actual quantization loop - setup pointers, rounders, etc.
+  movifnidn                   coeffq, coeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, dequantmp
+  movifnidn                    zbinq, zbinmp
+  movifnidn                   roundq, roundmp
+  movifnidn                   quantq, quantmp
+  mova                            m0, [zbinq]              ; m0 = zbin
+  mova                            m1, [roundq]             ; m1 = round
+  mova                            m2, [quantq]             ; m2 = quant
+  mova                            m3, [r2]                 ; m3 = dequant
+  pcmpeqw                         m4, m4                   ; All lanes -1
+%ifidn %1, b_32x32
+  psubw                           m0, m4
+  psubw                           m1, m4
+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
+  paddw                           m0, m4                   ; m0 = m0 - 1 (m4 is all -1)
+
+  mov                             r2, shiftmp
+  mov                             r3, qcoeffmp
+  mova                            m4, [r2]                 ; m4 = shift
+  mov                             r4, dqcoeffmp
+  mov                             r5, iscanmp
+%ifidn %1, b_32x32
+  psllw                           m4, 1
+%endif
+  pxor                            m5, m5                   ; m5 = dedicated zero
+
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+  ; r3/r4/r5 were loaded with qcoeff/dqcoeff/iscan above; name them.
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea                         coeffq, [  coeffq+ncoeffq*4]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
+%else
+  lea                         coeffq, [  coeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+%endif
+  lea                         iscanq, [  iscanq+ncoeffq*2]
+  neg                        ncoeffq                       ; pointers are at array ends; index runs up to 0
+
+  ; get DC and first 15 AC coeffs
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; coeff stored as 32bit numbers & require 16bit numbers
+  mova                            m9, [coeffq+ncoeffq*4+ 0]
+  packssdw                        m9, [coeffq+ncoeffq*4+16]
+  mova                           m10, [coeffq+ncoeffq*4+32]
+  packssdw                       m10, [coeffq+ncoeffq*4+48]
+%else
+  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  punpckhqdq                      m0, m0
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+
+  ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
+  por                            m14, m7, m12
+  ptest                          m14, m14
+  jnz .first_nonzero
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova        [qcoeffq+ncoeffq*4   ], ymm5
+  mova        [qcoeffq+ncoeffq*4+32], ymm5
+  mova       [dqcoeffq+ncoeffq*4   ], ymm5
+  mova       [dqcoeffq+ncoeffq*4+32], ymm5
+%else
+  mova           [qcoeffq+ncoeffq*2], ymm5
+  mova          [dqcoeffq+ncoeffq*2], ymm5
+%endif
+
+  add                        ncoeffq, mmsize
+
+  punpckhqdq                      m1, m1
+  punpckhqdq                      m2, m2
+  punpckhqdq                      m3, m3
+  punpckhqdq                      m4, m4
+  pxor                            m8, m8                   ; m8 = running eob maximum, starts at 0
+
+  jmp .ac_only_loop
+
+.first_nonzero:
+  ; Quantize the first 16 coefficients (reached when any of them >= zbin).
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                           m8, m6                   ; m8 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  punpckhqdq                      m4, m4
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                            m8, m7
+  pand                           m13, m12
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  pcmpgtw                         m6, m5, m8
+  punpckhwd                       m6, m8, m6
+  pmovsxwd                       m11, m8
+  mova        [qcoeffq+ncoeffq*4+ 0], m11
+  mova        [qcoeffq+ncoeffq*4+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova        [qcoeffq+ncoeffq*4+32], m11
+  mova        [qcoeffq+ncoeffq*4+48], m6
+%else
+  mova        [qcoeffq+ncoeffq*2+ 0], m8
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%endif
+
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  pcmpgtw                         m6, m5, m8
+  punpckhwd                       m6, m8, m6
+  pmovsxwd                       m11, m8
+  mova       [dqcoeffq+ncoeffq*4+ 0], m11
+  mova       [dqcoeffq+ncoeffq*4+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova       [dqcoeffq+ncoeffq*4+32], m11
+  mova       [dqcoeffq+ncoeffq*4+48], m6
+%else
+  mova       [dqcoeffq+ncoeffq*2+ 0], m8
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+%endif
+
+  pcmpeqw                         m8, m5                    ; m8 = dqc[i] == 0
+  pcmpeqw                        m13, m5                    ; m13 = dqc[i] == 0
+  mova                            m6, [iscanq+ncoeffq*2]    ; m6 = scan[i]
+  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                    ; m6 = scan[i] + 1
+  psubw                          m11, m12                   ; m11 = scan[i] + 1
+  pandn                           m8, m6                    ; m8 = max(eob)
+  pandn                          m13, m11                   ; m13 = max(eob)
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+
+.ac_only_loop:
+  ; One pass per 16 coefficients; ncoeffq is a negative index rising to 0.
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; pack coeff from 32bit to 16bit array
+  mova                            m9, [coeffq+ncoeffq*4+ 0]
+  packssdw                        m9, [coeffq+ncoeffq*4+16]
+  mova                           m10, [coeffq+ncoeffq*4+32]
+  packssdw                       m10, [coeffq+ncoeffq*4+48]
+%else
+  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+
+  ; Check if all coeffs are less than zbin. If yes, skip this iteration
+  ; and just write zeros, as that is what the result would be.
+  por                            m14, m7, m12
+  ptest                          m14, m14
+  jnz .rest_nonzero
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova        [qcoeffq+ncoeffq*4+ 0], ymm5
+  mova        [qcoeffq+ncoeffq*4+32], ymm5
+  mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
+  mova       [dqcoeffq+ncoeffq*4+32], ymm5
+%else
+  mova        [qcoeffq+ncoeffq*2+ 0], ymm5
+  mova       [dqcoeffq+ncoeffq*2+ 0], ymm5
+%endif
+  add                        ncoeffq, mmsize
+  jnz .ac_only_loop
+
+  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  movq                           rax, m8
+  mov                           [r2], ax
+  vzeroupper
+  RET
+
+.rest_nonzero:
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                          m14, m6                   ; m14 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                         m14, m9                   ; m14 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                           m14, m7
+  pand                           m13, m12
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  pcmpgtw                         m6, m5, m14
+  punpckhwd                       m6, m14, m6
+  pmovsxwd                       m11, m14
+  mova        [qcoeffq+ncoeffq*4+ 0], m11
+  mova        [qcoeffq+ncoeffq*4+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova        [qcoeffq+ncoeffq*4+32], m11
+  mova        [qcoeffq+ncoeffq*4+48], m6
+%else
+  mova        [qcoeffq+ncoeffq*2+ 0], m14
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%endif
+
+%ifidn %1, b_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  pcmpgtw                         m6, m5, m14
+  punpckhwd                       m6, m14, m6
+  pmovsxwd                       m11, m14
+  mova       [dqcoeffq+ncoeffq*4+ 0], m11
+  mova       [dqcoeffq+ncoeffq*4+16], m6
+  pcmpgtw                         m6, m5, m13
+  punpckhwd                       m6, m13, m6
+  pmovsxwd                       m11, m13
+  mova       [dqcoeffq+ncoeffq*4+32], m11
+  mova       [dqcoeffq+ncoeffq*4+48], m6
+%else
+  mova       [dqcoeffq+ncoeffq*2+ 0], m14
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+%endif
+
+  pcmpeqw                        m14, m5                    ; m14 = dqc[i] == 0
+  pcmpeqw                        m13, m5                    ; m13 = dqc[i] == 0
+  mova                            m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                    ; m6 = scan[i] + 1
+  psubw                          m11, m12                   ; m11 = scan[i] + 1
+  pandn                          m14, m6                    ; m14 = max(eob)
+  pandn                          m13, m11                   ; m13 = max(eob)
+  pmaxsw                          m8, m14
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jnz .ac_only_loop
+
+  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  movq                           rax, m8
+  mov                           [r2], ax
+  vzeroupper
+  RET
+
+  ; Skip-block, i.e. just write all zeroes
+.blank:
+
+DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
+            qcoeff, dqcoeff, dequant, eob, scan, iscan
+
+  mov                             r0, dqcoeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, qcoeffmp
+  mov                             r3, eobmp
+
+DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+  ; r0/r2/r3 were loaded with dqcoeff/qcoeff/eob above; name them.
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
+%else
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+%endif
+
+  neg                        ncoeffq                       ; pointers are at array ends; index runs up to 0
+  pxor                            m7, m7                   ; m7 = zero
+
+.blank_loop:
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova       [dqcoeffq+ncoeffq*4+ 0], ymm7
+  mova       [dqcoeffq+ncoeffq*4+32], ymm7
+  mova        [qcoeffq+ncoeffq*4+ 0], ymm7
+  mova        [qcoeffq+ncoeffq*4+32], ymm7
+%else
+  mova       [dqcoeffq+ncoeffq*2+ 0], ymm7
+  mova        [qcoeffq+ncoeffq*2+ 0], ymm7
+%endif
+  add                        ncoeffq, mmsize
+  jl .blank_loop
+
+  mov                         [eobq], word 0               ; *eob = 0
+
+  vzeroupper
+  RET
+%endmacro
+
+INIT_XMM avx            ; select the instruction set used by the macros below
+QUANTIZE_FN b, 7        ; emits quantize_b (includes the ncoeff == 16 fast path)
+QUANTIZE_FN b_32x32, 7  ; emits quantize_b_32x32 (generic path only)
+
+END
diff --git a/libs/libvpx/vpx_dsp/x86/quantize_sse2.c b/libs/libvpx/vpx_dsp/x86/quantize_sse2.c
new file mode 100644
index 0000000000..8aa4568d67
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -0,0 +1,248 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH  // gather eight values, each cast down to int16_t
+  return _mm_set_epi16((int16_t)coeff_ptr[7], (int16_t)coeff_ptr[6],
+      (int16_t)coeff_ptr[5], (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[3],
+      (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[1], (int16_t)coeff_ptr[0]);
+#else  // the values are taken as stored, with one aligned 128-bit load
+  return _mm_load_si128((const __m128i *)coeff_ptr);
+#endif
+}
+
+static INLINE void store_coefficients(__m128i coeff_vals,
+                                      tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // Sign-extend the eight 16-bit lanes to 32 bits by pairing every value with
+  // a word holding its replicated sign bit, then write both halves.
+  const __m128i sign_words = _mm_srai_epi16(coeff_vals, 15);
+  const __m128i first_four = _mm_unpacklo_epi16(coeff_vals, sign_words);
+  const __m128i last_four = _mm_unpackhi_epi16(coeff_vals, sign_words);
+  _mm_store_si128((__m128i*)(coeff_ptr), first_four);
+  _mm_store_si128((__m128i*)(coeff_ptr + 4), last_four);
+#else
+  _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals);
+#endif
+}
+
+void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs,
+                         int skip_block, const int16_t* zbin_ptr,
+                         const int16_t* round_ptr, const int16_t* quant_ptr,
+                         const int16_t* quant_shift_ptr, tran_low_t* qcoeff_ptr,
+                         tran_low_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                         uint16_t* eob_ptr,
+                         const int16_t* scan_ptr,
+                         const int16_t* iscan_ptr) {
+  __m128i zero;
+  (void)scan_ptr;  // unused: only the iscan_ptr table is read below
+  // Point past the end of each array; n_coeffs then counts up from -n to 0.
+  coeff_ptr += n_coeffs;
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+  zero = _mm_setzero_si128();
+  if (!skip_block) {
+    __m128i eob;
+    __m128i zbin;
+    __m128i round, quant, dequant, shift;
+    {
+      __m128i coeff0, coeff1;
+
+      // Set up the block-wide tables (aligned 128-bit loads, 8 int16 each)
+      {
+        __m128i pw_1;
+        zbin = _mm_load_si128((const __m128i*)zbin_ptr);
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        pw_1 = _mm_set1_epi16(1);
+        zbin = _mm_sub_epi16(zbin, pw_1);  // lets cmpgt below act as ">= zbin"
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        __m128i cmp_mask0, cmp_mask1;
+        // Do DC and first 15 AC
+        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
+        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
+
+        // Poor man's abs(): sign = x >> 15, then |x| = (x ^ sign) - sign
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+        // With t = |c| + round: q = ((((t * quant) >> 16) + t) * shift) >> 16
+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);  // upper half -> both halves
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);  // upper half -> both halves
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+        shift = _mm_unpackhi_epi64(shift, shift);  // upper half -> both halves
+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        // Zero the coefficients that did not pass the zbin compare
+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
+        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);  // upper half -> both
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
+        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
+      }
+
+      {
+        // Scan for eob (coeff0/coeff1 now hold the dequantized values)
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Subtracting the -1 mask adds one: converts indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // AC only loop: 16 coefficients per pass, all tables already hold AC values
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        __m128i cmp_mask0, cmp_mask1;
+
+        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
+        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
+
+        // Poor man's abs(): sign = x >> 15, then |x| = (x ^ sign) - sign
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+        // With t = |c| + round: q = ((((t * quant) >> 16) + t) * shift) >> 16
+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        // Zero the coefficients that did not pass the zbin compare
+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
+        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
+        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
+      }
+
+      {
+        // Scan for eob (coeff0/coeff1 now hold the dequantized values)
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Subtracting the -1 mask adds one: converts indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // Accumulate EOB: horizontal max over the eight 16-bit lanes
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);  // lanes 0 and 1 both hold the max
+    }
+  } else {  // skip_block: zero both outputs and report an eob of 0
+    do {
+      store_coefficients(zero, dqcoeff_ptr + n_coeffs);
+      store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
+      store_coefficients(zero, qcoeff_ptr + n_coeffs);
+      store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}
diff --git a/libs/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/libs/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000000..ca21539173
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
@@ -0,0 +1,346 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+; TODO(yunqingwang): fix quantize_b code for skip=1 case.
+%macro QUANTIZE_FN 2 ; args: name suffix (b or b_32x32), count handed to cglobal
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, \
+                                eob, scan, iscan
+  cmp                    dword skipm, 0
+  jne .blank
+
+  ; actual quantize loop - setup pointers, rounders, etc.
+  movifnidn                   coeffq, coeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, dequantmp
+  movifnidn                    zbinq, zbinmp
+  movifnidn                   roundq, roundmp
+  movifnidn                   quantq, quantmp
+  mova                            m0, [zbinq]              ; m0 = zbin
+  mova                            m1, [roundq]             ; m1 = round
+  mova                            m2, [quantq]             ; m2 = quant
+%ifidn %1, b_32x32
+  pcmpeqw                         m5, m5
+  psrlw                           m5, 15                   ; m5 = 1 in every word
+  paddw                           m0, m5
+  paddw                           m1, m5
+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
+  mova                            m3, [r2q]                ; m3 = dequant
+  psubw                           m0, [pw_1]               ; m0 = zbin - 1, so pcmpgtw below tests >= zbin
+  mov                             r2, shiftmp
+  mov                             r3, qcoeffmp
+  mova                            m4, [r2]                 ; m4 = shift
+  mov                             r4, dqcoeffmp
+  mov                             r5, iscanmp
+%ifidn %1, b_32x32
+  psllw                           m4, 1                    ; m4 = shift * 2
+%endif
+  pxor                            m5, m5                   ; m5 = dedicated zero
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea                         coeffq, [  coeffq+ncoeffq*4]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
+%else
+  lea                         coeffq, [  coeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+%endif
+  lea                         iscanq, [  iscanq+ncoeffq*2]
+  neg                        ncoeffq                       ; pointers sit at the array ends; index counts up to 0
+
+  ; get DC and first 15 AC coeffs
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; coeff stored as 32bit numbers & require 16bit numbers (packssdw saturates)
+  mova                            m9, [  coeffq+ncoeffq*4+ 0]
+  packssdw                        m9, [  coeffq+ncoeffq*4+16]
+  mova                           m10, [  coeffq+ncoeffq*4+32]
+  packssdw                       m10, [  coeffq+ncoeffq*4+48]
+%else
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  punpckhqdq                      m0, m0                   ; high qword -> both halves (DC to AC)
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1                   ; high qword -> both halves
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2                   ; high qword -> both halves
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                           m8, m6                   ; m8 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  punpckhqdq                      m4, m4                   ; high qword -> both halves
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                            m8, m7                   ; zero lanes below zbin
+  pand                           m13, m12                  ; zero lanes below zbin
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  mova                           m11, m8
+  mova                            m6, m8
+  pcmpgtw                         m5, m8                   ; m5 = (0 > m8): sign words for widening
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova        [qcoeffq+ncoeffq*4+ 0], m11
+  mova        [qcoeffq+ncoeffq*4+16], m6
+  pxor                            m5, m5
+  mova                           m11, m13
+  mova                            m6, m13
+  pcmpgtw                         m5, m13
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova        [qcoeffq+ncoeffq*4+32], m11
+  mova        [qcoeffq+ncoeffq*4+48], m6
+  pxor                            m5, m5             ; reset m5 to zero register
+%else
+  mova        [qcoeffq+ncoeffq*2+ 0], m8
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%endif
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3                   ; high qword -> both halves
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  mova                            m11, m8
+  mova                            m6, m8
+  pcmpgtw                         m5, m8
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova       [dqcoeffq+ncoeffq*4+ 0], m11
+  mova       [dqcoeffq+ncoeffq*4+16], m6
+  pxor                            m5, m5
+  mova                           m11, m13
+  mova                            m6, m13
+  pcmpgtw                         m5, m13
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova       [dqcoeffq+ncoeffq*4+32], m11
+  mova       [dqcoeffq+ncoeffq*4+48], m6
+  pxor                            m5, m5             ; reset m5 to zero register
+%else
+  mova       [dqcoeffq+ncoeffq*2+ 0], m8
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+%endif
+  pcmpeqw                         m8, m5                   ; m8 = dqc[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
+  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where c[i] >= zbin
+  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where c[i] >= zbin
+  pandn                           m8, m6                   ; m8 = m6 where dqc[i] != 0, else 0
+  pandn                          m13, m11                  ; m13 = m11 where dqc[i] != 0, else 0
+  pmaxsw                          m8, m13                  ; m8 = per-lane eob candidates
+  add                        ncoeffq, mmsize
+  jz .accumulate_eob
+
+.ac_only_loop:
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; pack coeff from 32bit to 16bit array (packssdw saturates)
+  mova                            m9, [  coeffq+ncoeffq*4+ 0]
+  packssdw                        m9, [  coeffq+ncoeffq*4+16]
+  mova                           m10, [  coeffq+ncoeffq*4+32]
+  packssdw                       m10, [  coeffq+ncoeffq*4+48]
+%else
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+  pmovmskb                       r6d, m7
+  pmovmskb                       r2d, m12
+  or                              r6, r2                   ; zero when no lane passed zbin
+  jz .skip_iter
+%endif
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                          m14, m6                   ; m14 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                         m14, m9                   ; m14 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                           m14, m7                   ; zero lanes below zbin
+  pand                           m13, m12                  ; zero lanes below zbin
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  pxor                           m11, m11                  ; redundant: m11 is overwritten on the next line
+  mova                           m11, m14
+  mova                            m6, m14
+  pcmpgtw                         m5, m14                  ; m5 = (0 > m14): sign words for widening
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova        [qcoeffq+ncoeffq*4+ 0], m11
+  mova        [qcoeffq+ncoeffq*4+16], m6
+  pxor                            m5, m5
+  mova                           m11, m13
+  mova                            m6, m13
+  pcmpgtw                         m5, m13
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova        [qcoeffq+ncoeffq*4+32], m11
+  mova        [qcoeffq+ncoeffq*4+48], m6
+  pxor                            m5, m5             ; reset m5 to zero register
+%else
+  mova        [qcoeffq+ncoeffq*2+ 0], m14
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%endif
+%ifidn %1, b_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  mova                           m11, m14
+  mova                            m6, m14
+  pcmpgtw                         m5, m14
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova       [dqcoeffq+ncoeffq*4+ 0], m11
+  mova       [dqcoeffq+ncoeffq*4+16], m6
+  pxor                            m5, m5
+  mova                           m11, m13
+  mova                            m6, m13
+  pcmpgtw                         m5, m13
+  punpcklwd                      m11, m5
+  punpckhwd                       m6, m5
+  mova       [dqcoeffq+ncoeffq*4+32], m11
+  mova       [dqcoeffq+ncoeffq*4+48], m6
+  pxor                            m5, m5             ; reset m5 to zero register
+%else
+  mova       [dqcoeffq+ncoeffq*2+ 0], m14
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+%endif
+  pcmpeqw                        m14, m5                   ; m14 = dqc[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
+  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where c[i] >= zbin
+  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where c[i] >= zbin
+  pandn                          m14, m6                   ; m14 = m6 where dqc[i] != 0, else 0
+  pandn                          m13, m11                  ; m13 = m11 where dqc[i] != 0, else 0
+  pmaxsw                          m8, m14                  ; fold into the running eob candidates
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+
+%ifidn %1, b_32x32
+  jmp .accumulate_eob                                      ; loop done: do not fall into .skip_iter
+.skip_iter:                                                ; reached when no lane of this group passed zbin
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova        [qcoeffq+ncoeffq*4+ 0], m5                   ; m5 is the zero register
+  mova        [qcoeffq+ncoeffq*4+16], m5
+  mova        [qcoeffq+ncoeffq*4+32], m5
+  mova        [qcoeffq+ncoeffq*4+48], m5
+  mova       [dqcoeffq+ncoeffq*4+ 0], m5
+  mova       [dqcoeffq+ncoeffq*4+16], m5
+  mova       [dqcoeffq+ncoeffq*4+32], m5
+  mova       [dqcoeffq+ncoeffq*4+48], m5
+%else
+  mova        [qcoeffq+ncoeffq*2+ 0], m5                   ; m5 is the zero register
+  mova        [qcoeffq+ncoeffq*2+16], m5
+  mova       [dqcoeffq+ncoeffq*2+ 0], m5
+  mova       [dqcoeffq+ncoeffq*2+16], m5
+%endif
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop                                         ; otherwise fall through to .accumulate_eob
+%endif
+
+.accumulate_eob:
+  ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe              ; bring the high qword down
+  pmaxsw                          m8, m7                   ; words 0-3 = max(low, high qword)
+  pshuflw                         m7, m8, 0xe              ; bring words 2,3 down to 0,1
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1              ; swap words 0 and 1
+  pmaxsw                          m8, m7                   ; word 0 = max of all eight lanes
+  pextrw                          r6, m8, 0
+  mov                           [r2], r6w                  ; eob is 16 bits wide: store only the low word
+  RET
+
+  ; skip-block, i.e. just write all zeroes to qcoeff/dqcoeff and 0 to eob
+.blank:
+  mov                             r0, dqcoeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, qcoeffmp
+  mov                             r3, eobmp
+  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
+%else
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+%endif
+  neg                        ncoeffq                       ; pointers sit at the array ends; index counts up to 0
+  pxor                            m7, m7                   ; m7 = zero
+.blank_loop:
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova       [dqcoeffq+ncoeffq*4+ 0], m7
+  mova       [dqcoeffq+ncoeffq*4+16], m7
+  mova       [dqcoeffq+ncoeffq*4+32], m7
+  mova       [dqcoeffq+ncoeffq*4+48], m7
+  mova        [qcoeffq+ncoeffq*4+ 0], m7
+  mova        [qcoeffq+ncoeffq*4+16], m7
+  mova        [qcoeffq+ncoeffq*4+32], m7
+  mova        [qcoeffq+ncoeffq*4+48], m7
+%else
+  mova       [dqcoeffq+ncoeffq*2+ 0], m7
+  mova       [dqcoeffq+ncoeffq*2+16], m7
+  mova        [qcoeffq+ncoeffq*2+ 0], m7
+  mova        [qcoeffq+ncoeffq*2+16], m7
+%endif
+  add                        ncoeffq, mmsize               ; mmsize is used as a coefficient count here
+  jl .blank_loop
+  mov                    word [eobq], 0
+  RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b, 7
+QUANTIZE_FN b_32x32, 7
diff --git a/libs/libvpx/vpx_dsp/x86/sad4d_avx2.c b/libs/libvpx/vpx_dsp/x86/sad4d_avx2.c
new file mode 100644
index 0000000000..793658f9ea
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/sad4d_avx2.c
@@ -0,0 +1,168 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>  // AVX2
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_sad32x32x4d_avx2(const uint8_t *src,
+                          int src_stride,
+                          const uint8_t *const ref[4],
+                          int ref_stride,
+                          uint32_t res[4]) {
+  // Sum of absolute differences of the 32x32 block at src against each of
+  // the four blocks ref[0..3]; res[k] receives the total for ref[k].
+  //   src, src_stride: top-left byte of the source block and its row pitch
+  //   ref, ref_stride: top-left bytes of the four candidates, shared pitch
+  //   res:             output, four 32-bit totals
+  const uint8_t *src_row = src;
+  const uint8_t *row0 = ref[0];
+  const uint8_t *row1 = ref[1];
+  const uint8_t *row2 = ref[2];
+  const uint8_t *row3 = ref[3];
+
+  // One accumulator per reference; each 64-bit lane keeps a partial sum.
+  __m256i acc0 = _mm256_setzero_si256();
+  __m256i acc1 = _mm256_setzero_si256();
+  __m256i acc2 = _mm256_setzero_si256();
+  __m256i acc3 = _mm256_setzero_si256();
+  __m256i pairs01, pairs23;
+  __m256i even_lanes, odd_lanes, totals;
+  __m128i packed;
+  int rows_left;
+
+  for (rows_left = 32; rows_left > 0; --rows_left) {
+    // Unaligned loads: one 32-byte row of the source and of every reference.
+    const __m256i s = _mm256_loadu_si256((const __m256i *)src_row);
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)row0);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)row1);
+    const __m256i r2 = _mm256_loadu_si256((const __m256i *)row2);
+    const __m256i r3 = _mm256_loadu_si256((const __m256i *)row3);
+
+    // Absolute-difference sums of this row, folded into the running totals.
+    acc0 = _mm256_add_epi32(acc0, _mm256_sad_epu8(r0, s));
+    acc1 = _mm256_add_epi32(acc1, _mm256_sad_epu8(r1, s));
+    acc2 = _mm256_add_epi32(acc2, _mm256_sad_epu8(r2, s));
+    acc3 = _mm256_add_epi32(acc3, _mm256_sad_epu8(r3, s));
+
+    // Step every pointer down one row.
+    src_row += src_stride;
+    row0 += ref_stride;
+    row1 += ref_stride;
+    row2 += ref_stride;
+    row3 += ref_stride;
+  }
+
+  // Only the low dword of each 64-bit lane is populated, so moving the odd
+  // references up by 4 bytes lets them share a register with the even ones:
+  // each 128-bit half becomes {ref0, ref1, ref0, ref1} / {ref2, ref3, ...}.
+  pairs01 = _mm256_or_si256(acc0, _mm256_slli_si256(acc1, 4));
+  pairs23 = _mm256_or_si256(acc2, _mm256_slli_si256(acc3, 4));
+
+  // Regroup by quadword so that both resulting registers are ordered
+  // {ref0, ref1, ref2, ref3} within every 128-bit half.
+  even_lanes = _mm256_unpacklo_epi64(pairs01, pairs23);
+  odd_lanes = _mm256_unpackhi_epi64(pairs01, pairs23);
+
+  // Fold the two partial sums held inside each 128-bit half ...
+  totals = _mm256_add_epi32(even_lanes, odd_lanes);
+
+  // ... and then fold the upper half onto the lower half.
+  packed = _mm_add_epi32(_mm256_castsi256_si128(totals),
+                         _mm256_extractf128_si256(totals, 1));
+
+  // res[0..3] are written with a single unaligned 128-bit store.
+  _mm_storeu_si128((__m128i *)(res), packed);
+}
+
+void vpx_sad64x64x4d_avx2(const uint8_t *src,
+                          int src_stride,
+                          const uint8_t *const ref[4],
+                          int ref_stride,
+                          uint32_t res[4]) {
+  // Sum of absolute differences of the 64x64 block at src against each of
+  // the four blocks ref[0..3]; res[k] receives the total for ref[k].
+  // A row is 64 bytes wide, so it takes two 32-byte vectors per block.
+  //   src, src_stride: top-left byte of the source block and its row pitch
+  //   ref, ref_stride: top-left bytes of the four candidates, shared pitch
+  //   res:             output, four 32-bit totals
+
+  const uint8_t *src_row = src;
+  const uint8_t *row0 = ref[0];
+  const uint8_t *row1 = ref[1];
+  const uint8_t *row2 = ref[2];
+  const uint8_t *row3 = ref[3];
+
+  // One accumulator per reference; each 64-bit lane keeps a partial sum.
+  __m256i acc0 = _mm256_setzero_si256();
+  __m256i acc1 = _mm256_setzero_si256();
+  __m256i acc2 = _mm256_setzero_si256();
+  __m256i acc3 = _mm256_setzero_si256();
+  __m256i pairs01, pairs23;
+  __m256i even_lanes, odd_lanes, totals;
+  __m128i packed;
+  int rows_left;
+
+  for (rows_left = 64; rows_left > 0; --rows_left) {
+    // Every 64-byte row is read as two unaligned 32-byte halves, for the
+    // source and for each reference.
+    const __m256i s_a = _mm256_loadu_si256((const __m256i *)src_row);
+    const __m256i s_b = _mm256_loadu_si256((const __m256i *)(src_row + 32));
+
+    const __m256i r0_a = _mm256_loadu_si256((const __m256i *)row0);
+    const __m256i r0_b = _mm256_loadu_si256((const __m256i *)(row0 + 32));
+    const __m256i r1_a = _mm256_loadu_si256((const __m256i *)row1);
+    const __m256i r1_b = _mm256_loadu_si256((const __m256i *)(row1 + 32));
+    const __m256i r2_a = _mm256_loadu_si256((const __m256i *)row2);
+    const __m256i r2_b = _mm256_loadu_si256((const __m256i *)(row2 + 32));
+    const __m256i r3_a = _mm256_loadu_si256((const __m256i *)row3);
+    const __m256i r3_b = _mm256_loadu_si256((const __m256i *)(row3 + 32));
+
+    // Absolute-difference sums of the first halves ...
+    acc0 = _mm256_add_epi32(acc0, _mm256_sad_epu8(r0_a, s_a));
+    acc1 = _mm256_add_epi32(acc1, _mm256_sad_epu8(r1_a, s_a));
+    acc2 = _mm256_add_epi32(acc2, _mm256_sad_epu8(r2_a, s_a));
+    acc3 = _mm256_add_epi32(acc3, _mm256_sad_epu8(r3_a, s_a));
+
+    // ... followed by the second halves.
+    acc0 = _mm256_add_epi32(acc0, _mm256_sad_epu8(r0_b, s_b));
+    acc1 = _mm256_add_epi32(acc1, _mm256_sad_epu8(r1_b, s_b));
+    acc2 = _mm256_add_epi32(acc2, _mm256_sad_epu8(r2_b, s_b));
+    acc3 = _mm256_add_epi32(acc3, _mm256_sad_epu8(r3_b, s_b));
+
+    // Step every pointer down one row.
+    src_row += src_stride;
+    row0 += ref_stride;
+    row1 += ref_stride;
+    row2 += ref_stride;
+    row3 += ref_stride;
+  }
+
+  // Only the low dword of each 64-bit lane is populated, so moving the odd
+  // references up by 4 bytes lets them share a register with the even ones:
+  // each 128-bit half becomes {ref0, ref1, ref0, ref1} / {ref2, ref3, ...}.
+  pairs01 = _mm256_or_si256(acc0, _mm256_slli_si256(acc1, 4));
+  pairs23 = _mm256_or_si256(acc2, _mm256_slli_si256(acc3, 4));
+
+  // Regroup by quadword so that both resulting registers are ordered
+  // {ref0, ref1, ref2, ref3} within every 128-bit half.
+  even_lanes = _mm256_unpacklo_epi64(pairs01, pairs23);
+  odd_lanes = _mm256_unpackhi_epi64(pairs01, pairs23);
+
+  // Fold the two partial sums held inside each 128-bit half ...
+  totals = _mm256_add_epi32(even_lanes, odd_lanes);
+
+  // ... and then fold the upper half onto the lower half.
+  packed = _mm_add_epi32(_mm256_castsi256_si128(totals),
+                         _mm256_extractf128_si256(totals, 1));
+
+  // res[0..3] are written with a single unaligned 128-bit store, so res
+  // needs no particular alignment.
+  _mm_storeu_si128((__m128i *)(res), packed);
+}
diff --git a/libs/libvpx/vpx_dsp/x86/sad4d_sse2.asm b/libs/libvpx/vpx_dsp/x86/sad4d_sse2.asm
new file mode 100644
index 0000000000..3f6e55ce9a
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/sad4d_sse2.asm
@@ -0,0 +1,239 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- SAD of two 4-px rows against 4 refs
+%macro PROCESS_4x2x4 5-6 0  ; %1 first-call flag, %2/%3 src/ref offset of row 0, %4/%5 of row 1, %6 advance (default 0)
+  movd                  m0, [srcq +%2]  ; src row 0
+%if %1 == 1
+  movd                  m6, [ref1q+%3]  ; first call: m6/m7 are written, not accumulated
+  movd                  m4, [ref2q+%3]
+  movd                  m7, [ref3q+%3]
+  movd                  m5, [ref4q+%3]
+  movd                  m1, [srcq +%4]  ; src row 1
+  movd                  m2, [ref1q+%5]
+  punpckldq             m0, m1  ; m0 low qword = src row 0 | src row 1
+  punpckldq             m6, m2  ; m6 low qword = both ref1 rows
+  movd                  m1, [ref2q+%5]
+  movd                  m2, [ref3q+%5]
+  movd                  m3, [ref4q+%5]
+  punpckldq             m4, m1
+  punpckldq             m7, m2
+  punpckldq             m5, m3
+  movlhps               m0, m0  ; duplicate the two src rows into the high qword
+  movlhps               m6, m4  ; m6 = ref1 rows (low qword) | ref2 rows (high qword)
+  movlhps               m7, m5  ; m7 = ref3 rows (low qword) | ref4 rows (high qword)
+  psadbw                m6, m0  ; m6 = SAD(ref1) | SAD(ref2)
+  psadbw                m7, m0  ; m7 = SAD(ref3) | SAD(ref4)
+%else
+  movd                  m1, [ref1q+%3]
+  movd                  m5, [ref1q+%5]
+  movd                  m2, [ref2q+%3]
+  movd                  m4, [ref2q+%5]
+  punpckldq             m1, m5
+  punpckldq             m2, m4
+  movd                  m3, [ref3q+%3]
+  movd                  m5, [ref3q+%5]
+  punpckldq             m3, m5
+  movd                  m4, [ref4q+%3]
+  movd                  m5, [ref4q+%5]
+  punpckldq             m4, m5
+  movd                  m5, [srcq +%4]  ; src row 1
+  punpckldq             m0, m5
+  movlhps               m0, m0
+  movlhps               m1, m2  ; m1 = ref1 rows | ref2 rows
+  movlhps               m3, m4  ; m3 = ref3 rows | ref4 rows
+  psadbw                m1, m0
+  psadbw                m3, m0
+  paddd                 m6, m1  ; accumulate SAD(ref1) | SAD(ref2)
+  paddd                 m7, m3  ; accumulate SAD(ref3) | SAD(ref4)
+%endif
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*2]  ; step src and all four refs down two rows
+  lea                ref1q, [ref1q+ref_strideq*2]
+  lea                ref2q, [ref2q+ref_strideq*2]
+  lea                ref3q, [ref3q+ref_strideq*2]
+  lea                ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- SAD of two 8-px rows against 4 refs
+%macro PROCESS_8x2x4 5-6 0  ; %1 first-call flag, %2/%3 src/ref offset of row 0, %4/%5 of row 1, %6 advance (default 0)
+  movh                  m0, [srcq +%2]  ; src row 0 into the low half of m0
+%if %1 == 1
+  movh                  m4, [ref1q+%3]  ; first call: m4..m7 are written, not accumulated
+  movh                  m5, [ref2q+%3]
+  movh                  m6, [ref3q+%3]
+  movh                  m7, [ref4q+%3]
+  movhps                m0, [srcq +%4]  ; src row 1 into the high half of m0
+  movhps                m4, [ref1q+%5]
+  movhps                m5, [ref2q+%5]
+  movhps                m6, [ref3q+%5]
+  movhps                m7, [ref4q+%5]
+  psadbw                m4, m0  ; m4 = SAD vs ref1 (one partial sum per row)
+  psadbw                m5, m0  ; m5 = SAD vs ref2
+  psadbw                m6, m0  ; m6 = SAD vs ref3
+  psadbw                m7, m0  ; m7 = SAD vs ref4
+%else
+  movh                  m1, [ref1q+%3]
+  movh                  m2, [ref2q+%3]
+  movh                  m3, [ref3q+%3]
+  movhps                m0, [srcq +%4]
+  movhps                m1, [ref1q+%5]
+  movhps                m2, [ref2q+%5]
+  movhps                m3, [ref3q+%5]
+  psadbw                m1, m0
+  psadbw                m2, m0
+  psadbw                m3, m0
+  paddd                 m4, m1  ; accumulate ref1
+  movh                  m1, [ref4q+%3]  ; m1 is reused as the ref4 temporary
+  movhps                m1, [ref4q+%5]
+  paddd                 m5, m2  ; accumulate ref2
+  paddd                 m6, m3  ; accumulate ref3
+  psadbw                m1, m0
+  paddd                 m7, m1  ; accumulate ref4
+%endif
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*2]  ; step src and all four refs down two rows
+  lea                ref1q, [ref1q+ref_strideq*2]
+  lea                ref2q, [ref2q+ref_strideq*2]
+  lea                ref3q, [ref3q+ref_strideq*2]
+  lea                ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- SAD of two 16-px chunks against 4 refs
+%macro PROCESS_16x2x4 5-6 0  ; %1 first-call flag, %2/%3 src/ref offset of chunk 1, %4/%5 of chunk 2, %6 advance (default 0)
+  ; 1st 16 px
+  mova                  m0, [srcq +%2]  ; src is loaded with mova, the refs with movu
+%if %1 == 1
+  movu                  m4, [ref1q+%3]  ; first call: m4..m7 are written, not accumulated
+  movu                  m5, [ref2q+%3]
+  movu                  m6, [ref3q+%3]
+  movu                  m7, [ref4q+%3]
+  psadbw                m4, m0  ; m4 = SAD vs ref1
+  psadbw                m5, m0  ; m5 = SAD vs ref2
+  psadbw                m6, m0  ; m6 = SAD vs ref3
+  psadbw                m7, m0  ; m7 = SAD vs ref4
+%else
+  movu                  m1, [ref1q+%3]
+  movu                  m2, [ref2q+%3]
+  movu                  m3, [ref3q+%3]
+  psadbw                m1, m0
+  psadbw                m2, m0
+  psadbw                m3, m0
+  paddd                 m4, m1  ; accumulate ref1
+  movu                  m1, [ref4q+%3]  ; m1 is reused as the ref4 temporary
+  paddd                 m5, m2  ; accumulate ref2
+  paddd                 m6, m3  ; accumulate ref3
+  psadbw                m1, m0
+  paddd                 m7, m1  ; accumulate ref4
+%endif
+
+  ; 2nd 16 px
+  mova                  m0, [srcq +%4]
+  movu                  m1, [ref1q+%5]
+  movu                  m2, [ref2q+%5]
+  movu                  m3, [ref3q+%5]
+  psadbw                m1, m0
+  psadbw                m2, m0
+  psadbw                m3, m0
+  paddd                 m4, m1
+  movu                  m1, [ref4q+%5]  ; ref4 is loaded before the pointers are advanced below
+  paddd                 m5, m2
+  paddd                 m6, m3
+%if %6 == 1
+  lea                 srcq, [srcq +src_strideq*2]  ; step src and all four refs down two rows
+  lea                ref1q, [ref1q+ref_strideq*2]
+  lea                ref2q, [ref2q+ref_strideq*2]
+  lea                ref3q, [ref3q+ref_strideq*2]
+  lea                ref4q, [ref4q+ref_strideq*2]
+%endif
+  psadbw                m1, m0  ; finish ref4 using the registers loaded above
+  paddd                 m7, m1
+%endmacro
+
+; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- two 32-px rows, built from two 16x2x4 calls
+%macro PROCESS_32x2x4 5-6 0
+  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16  ; row 0: bytes 0-15 and 16-31, no pointer advance
+  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6  ; row 1: always accumulates; advances only if %6 == 1
+%endmacro
+
+; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- two 64-px rows, built from two 32x2x4 calls
+%macro PROCESS_64x2x4 5-6 0
+  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32  ; row 0: bytes 0-31 and 32-63, no pointer advance
+  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6  ; row 1: always accumulates; advances only if %6 == 1
+%endmacro
+
+; void vpx_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
+;                         uint8_t *ref[4], int ref_stride,
+;                         uint32_t res[4]);
+; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
+%macro SADNXN4D 2  ; %1 = block width, %2 = block height
+%if UNIX64
+cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+  mov                ref2q, [ref1q+gprsize*1]  ; ref1q initially holds the ref[] array pointer
+  mov                ref3q, [ref1q+gprsize*2]
+  mov                ref4q, [ref1q+gprsize*3]
+  mov                ref1q, [ref1q+gprsize*0]  ; loaded last because it overwrites the array pointer
+
+  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1  ; first row pair initialises the accumulators
+%rep (%2-4)/2
+  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1  ; middle row pairs
+%endrep
+  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0  ; last row pair: no pointer advance needed
+
+%if %1 > 4
+  pslldq                m5, 4  ; move the ref2 sums into the odd dwords
+  pslldq                m7, 4  ; move the ref4 sums into the odd dwords
+  por                   m4, m5  ; m4 = ref1/ref2 partial sums interleaved
+  por                   m6, m7  ; m6 = ref3/ref4 partial sums interleaved
+  mova                  m5, m4
+  mova                  m7, m6
+  punpcklqdq            m4, m6  ; low-qword partial sums of ref1..ref4
+  punpckhqdq            m5, m7  ; high-qword partial sums of ref1..ref4
+  movifnidn             r4, r4mp
+  paddd                 m4, m5  ; m4 = four final 32-bit SADs
+  movu                [r4], m4  ; store res[0..3]
+  RET
+%else
+  movifnidn             r4, r4mp
+  pshufd            m6, m6, 0x08  ; pack dwords 0 and 2 (ref1, ref2 sums) into the low qword
+  pshufd            m7, m7, 0x08  ; pack dwords 0 and 2 (ref3, ref4 sums) into the low qword
+  movq              [r4+0], m6  ; store res[0..1]
+  movq              [r4+8], m7  ; store res[2..3]
+  RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+SADNXN4D 64, 64 ; sad64x64x4d_sse2
+SADNXN4D 64, 32 ; sad64x32x4d_sse2
+SADNXN4D 32, 64 ; sad32x64x4d_sse2
+SADNXN4D 32, 32 ; sad32x32x4d_sse2
+SADNXN4D 32, 16 ; sad32x16x4d_sse2
+SADNXN4D 16, 32 ; sad16x32x4d_sse2
+SADNXN4D 16, 16 ; sad16x16x4d_sse2
+SADNXN4D 16,  8 ; sad16x8x4d_sse2
+SADNXN4D  8, 16 ; sad8x16x4d_sse2
+SADNXN4D  8,  8 ; sad8x8x4d_sse2
+SADNXN4D  8,  4 ; sad8x4x4d_sse2
+SADNXN4D  4,  8 ; sad4x8x4d_sse2
+SADNXN4D  4,  4 ; sad4x4x4d_sse2
diff --git a/libs/libvpx/vpx_dsp/x86/sad_avx2.c b/libs/libvpx/vpx_dsp/x86/sad_avx2.c
new file mode 100644
index 0000000000..ce9ad8f780
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/sad_avx2.c
@@ -0,0 +1,181 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+#define FSAD64_H(h) /* defines vpx_sad64x<h>_avx2: SAD of a 64-wide, h-row block */ \
+unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, \
+                                  int src_stride, \
+                                  const uint8_t *ref_ptr, \
+                                  int ref_stride) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  for (i = 0 ; i < h ; i++) { /* one row per iteration, as two 32-byte halves */ \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr+= ref_stride; \
+    src_ptr+= src_stride; \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); /* high qword of each 128-bit lane */ \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); /* fold it onto the low qword */ \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); /* upper 128-bit lane */ \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); /* total is in the lowest 32 bits */ \
+  return res; \
+}
+
+#define FSAD32_H(h) /* defines vpx_sad32x<h>_avx2: SAD of a 32-wide, h-row block */ \
+unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, \
+                                  int src_stride, \
+                                  const uint8_t *ref_ptr, \
+                                  int ref_stride) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  int ref2_stride = ref_stride << 1; \
+  int src2_stride = src_stride << 1; \
+  int max = h >> 1; /* two rows are handled per iteration */ \
+  for (i = 0 ; i < max ; i++) { \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr+= ref2_stride; \
+    src_ptr+= src2_stride; \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); /* high qword of each 128-bit lane */ \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); /* fold it onto the low qword */ \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); /* upper 128-bit lane */ \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); /* total is in the lowest 32 bits */ \
+  return res; \
+}
+
+#define FSAD64 /* instantiates the 64-wide SAD functions */ \
+FSAD64_H(64); \
+FSAD64_H(32);
+
+#define FSAD32 /* instantiates the 32-wide SAD functions */ \
+FSAD32_H(64); \
+FSAD32_H(32); \
+FSAD32_H(16);
+
+FSAD64;  /* vpx_sad64x64_avx2, vpx_sad64x32_avx2 */
+FSAD32;  /* vpx_sad32x64_avx2, vpx_sad32x32_avx2, vpx_sad32x16_avx2 */
+
+#undef FSAD64  /* the generator macros are not needed past this point */
+#undef FSAD32
+#undef FSAD64_H
+#undef FSAD32_H
+
+#define FSADAVG64_H(h) /* defines vpx_sad64x<h>_avg_avx2: SAD of src vs avg(ref, second_pred) */ \
+unsigned int vpx_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
+                                      int src_stride, \
+                                      const uint8_t *ref_ptr, \
+                                      int  ref_stride, \
+                                      const uint8_t *second_pred) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  for (i = 0 ; i < h ; i++) { /* one row per iteration, as two 32-byte halves */ \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+    ref1_reg = _mm256_avg_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)second_pred)); \
+    ref2_reg = _mm256_avg_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(second_pred +32))); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr+= ref_stride; \
+    src_ptr+= src_stride; \
+    second_pred+= 64; /* second_pred is read as a packed buffer: 64 bytes per row, no stride */ \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); /* high qword of each 128-bit lane */ \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); /* fold it onto the low qword */ \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); /* upper 128-bit lane */ \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); /* total is in the lowest 32 bits */ \
+  return res; \
+}
+
+#define FSADAVG32_H(h) /* defines vpx_sad32x<h>_avg_avx2: SAD of src vs avg(ref, second_pred) */ \
+unsigned int vpx_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
+                                      int src_stride, \
+                                      const uint8_t *ref_ptr, \
+                                      int  ref_stride, \
+                                      const uint8_t *second_pred) { \
+  int i, res; \
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+  __m256i sum_sad = _mm256_setzero_si256(); \
+  __m256i sum_sad_h; \
+  __m128i sum_sad128; \
+  int ref2_stride = ref_stride << 1; \
+  int src2_stride = src_stride << 1; \
+  int max = h >> 1; /* two rows are handled per iteration */ \
+  for (i = 0 ; i < max ; i++) { \
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+    ref1_reg = _mm256_avg_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)second_pred)); \
+    ref2_reg = _mm256_avg_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(second_pred +32))); \
+    sad1_reg = _mm256_sad_epu8(ref1_reg, \
+               _mm256_loadu_si256((__m256i const *)src_ptr)); \
+    sad2_reg = _mm256_sad_epu8(ref2_reg, \
+               _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+    sum_sad = _mm256_add_epi32(sum_sad, \
+              _mm256_add_epi32(sad1_reg, sad2_reg)); \
+    ref_ptr+= ref2_stride; \
+    src_ptr+= src2_stride; \
+    second_pred+= 64; /* second_pred is read as a packed buffer: 2 rows x 32 bytes, no stride */ \
+  } \
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8); /* high qword of each 128-bit lane */ \
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); /* fold it onto the low qword */ \
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); /* upper 128-bit lane */ \
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+  res = _mm_cvtsi128_si32(sum_sad128); /* total is in the lowest 32 bits */ \
+  return res; \
+}
+
+#define FSADAVG64 /* instantiates the 64-wide averaging SAD functions */ \
+FSADAVG64_H(64); \
+FSADAVG64_H(32);
+
+#define FSADAVG32 /* instantiates the 32-wide averaging SAD functions */ \
+FSADAVG32_H(64); \
+FSADAVG32_H(32); \
+FSADAVG32_H(16);
+
+FSADAVG64;  /* vpx_sad64x64_avg_avx2, vpx_sad64x32_avg_avx2 */
+FSADAVG32;  /* vpx_sad32x64_avg_avx2, vpx_sad32x32_avg_avx2, vpx_sad32x16_avg_avx2 */
+
+#undef FSADAVG64  /* the generator macros are not needed past this point */
+#undef FSADAVG32
+#undef FSADAVG64_H
+#undef FSADAVG32_H
diff --git a/libs/libvpx/vpx_dsp/x86/sad_mmx.asm b/libs/libvpx/vpx_dsp/x86/sad_mmx.asm
new file mode 100644
index 0000000000..9968992bd1
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/sad_mmx.asm
@@ -0,0 +1,427 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vpx_sad16x16_mmx) PRIVATE
+global sym(vpx_sad8x16_mmx) PRIVATE
+global sym(vpx_sad8x8_mmx) PRIVATE
+global sym(vpx_sad4x4_mmx) PRIVATE
+global sym(vpx_sad16x8_mmx) PRIVATE
+
+;unsigned int vpx_sad16x16_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vpx_sad16x16_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+
+        lea             rcx,        [rcx+rax*8]       ; rcx = src_ptr + 16*src_stride (loop end)
+        pxor            mm7,        mm7               ; mm7 = running sum, four 16-bit lanes
+
+        pxor            mm6,        mm6               ; mm6 = 0, used to zero-extend when unpacking
+
+.x16x16sad_mmx_loop:                                  ; one 16-pixel row per iteration
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm2,        QWORD PTR [rsi+8]
+
+        movq            mm1,        QWORD PTR [rdi]
+        movq            mm3,        QWORD PTR [rdi+8]
+
+        movq            mm4,        mm0
+        movq            mm5,        mm2
+
+        psubusb         mm0,        mm1               ; saturating src - ref
+        psubusb         mm1,        mm4               ; saturating ref - src
+
+        psubusb         mm2,        mm3
+        psubusb         mm3,        mm5
+
+        por             mm0,        mm1               ; |src - ref| for bytes 0-7
+        por             mm2,        mm3               ; |src - ref| for bytes 8-15
+
+        movq            mm1,        mm0
+        movq            mm3,        mm2
+
+        punpcklbw       mm0,        mm6               ; widen the byte differences to words
+        punpcklbw       mm2,        mm6
+
+        punpckhbw       mm1,        mm6
+        punpckhbw       mm3,        mm6
+
+        paddw           mm0,        mm2
+        paddw           mm1,        mm3
+
+
+        lea             rsi,        [rsi+rax]         ; next src row
+        add             rdi,        rdx               ; next ref row
+
+        paddw           mm7,        mm0
+        paddw           mm7,        mm1
+
+        cmp             rsi,        rcx
+        jne             .x16x16sad_mmx_loop
+
+
+        movq            mm0,        mm7               ; horizontal reduction of the four word lanes
+
+        punpcklwd       mm0,        mm6
+        punpckhwd       mm7,        mm6
+
+        paddw           mm0,        mm7
+        movq            mm7,        mm0
+
+
+        psrlq           mm0,        32
+        paddw           mm7,        mm0               ; total is now in the low dword of mm7
+
+        movq            rax,        mm7               ; return value
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vpx_sad8x16_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vpx_sad8x16_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+
+        lea             rcx,        [rcx+rax*8]       ; rcx = src_ptr + 16*src_stride (loop end)
+        pxor            mm7,        mm7               ; mm7 = running sum, four 16-bit lanes
+
+        pxor            mm6,        mm6               ; mm6 = 0, used to zero-extend when unpacking
+
+.x8x16sad_mmx_loop:                                   ; one 8-pixel row per iteration
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        movq            mm2,        mm0
+        psubusb         mm0,        mm1               ; saturating src - ref
+
+        psubusb         mm1,        mm2               ; saturating ref - src
+        por             mm0,        mm1               ; |src - ref| per byte
+
+        movq            mm2,        mm0
+        punpcklbw       mm0,        mm6               ; widen the byte differences to words
+
+        punpckhbw       mm2,        mm6
+        lea             rsi,        [rsi+rax]         ; next src row
+
+        add             rdi,        rdx               ; next ref row
+        paddw           mm7,        mm0
+
+        paddw           mm7,        mm2
+        cmp             rsi,        rcx
+
+        jne             .x8x16sad_mmx_loop
+
+        movq            mm0,        mm7               ; horizontal reduction of the four word lanes
+        punpcklwd       mm0,        mm6
+
+        punpckhwd       mm7,        mm6
+        paddw           mm0,        mm7
+
+        movq            mm7,        mm0
+        psrlq           mm0,        32
+
+        paddw           mm7,        mm0               ; total is now in the low dword of mm7
+        movq            rax,        mm7               ; return value
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vpx_sad8x8_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vpx_sad8x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]       ; rcx = src_ptr + 8*src_stride (loop end)
+        pxor            mm7,        mm7               ; mm7 = running sum, four 16-bit lanes
+
+        pxor            mm6,        mm6               ; mm6 = 0, used to zero-extend when unpacking
+
+.x8x8sad_mmx_loop:                                    ; one 8-pixel row per iteration
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        movq            mm2,        mm0
+        psubusb         mm0,        mm1               ; saturating src - ref
+
+        psubusb         mm1,        mm2               ; saturating ref - src
+        por             mm0,        mm1               ; |src - ref| per byte
+
+        movq            mm2,        mm0
+        punpcklbw       mm0,        mm6               ; widen the byte differences to words
+
+        punpckhbw       mm2,        mm6
+        paddw           mm0,        mm2
+
+        lea             rsi,       [rsi+rax]          ; next src row
+        add             rdi,        rdx               ; next ref row
+
+        paddw           mm7,       mm0
+        cmp             rsi,        rcx
+
+        jne             .x8x8sad_mmx_loop
+
+        movq            mm0,        mm7               ; horizontal reduction of the four word lanes
+        punpcklwd       mm0,        mm6
+
+        punpckhwd       mm7,        mm6
+        paddw           mm0,        mm7
+
+        movq            mm7,        mm0
+        psrlq           mm0,        32
+
+        paddw           mm7,        mm0               ; total is now in the low dword of mm7
+        movq            rax,        mm7               ; return value
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vpx_sad4x4_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vpx_sad4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        movd            mm0,        DWORD PTR [rsi]       ; src row 0
+        movd            mm1,        DWORD PTR [rdi]       ; ref row 0
+
+        movd            mm2,        DWORD PTR [rsi+rax]   ; src row 1
+        movd            mm3,        DWORD PTR [rdi+rdx]   ; ref row 1
+
+        punpcklbw       mm0,        mm2               ; interleave src rows 0 and 1 (8 bytes)
+        punpcklbw       mm1,        mm3               ; interleave ref rows 0 and 1 the same way
+
+        movq            mm2,        mm0
+        psubusb         mm0,        mm1               ; saturating src - ref
+
+        psubusb         mm1,        mm2               ; saturating ref - src
+        por             mm0,        mm1               ; |src - ref| per byte
+
+        movq            mm2,        mm0
+        pxor            mm3,        mm3               ; mm3 = 0, used to zero-extend when unpacking
+
+        punpcklbw       mm0,        mm3
+        punpckhbw       mm2,        mm3
+
+        paddw           mm0,        mm2               ; word sums for rows 0-1
+
+        lea             rsi,        [rsi+rax*2]       ; advance to rows 2-3
+        lea             rdi,        [rdi+rdx*2]
+
+        movd            mm4,        DWORD PTR [rsi]       ; src row 2
+        movd            mm5,        DWORD PTR [rdi]       ; ref row 2
+
+        movd            mm6,        DWORD PTR [rsi+rax]   ; src row 3
+        movd            mm7,        DWORD PTR [rdi+rdx]   ; ref row 3
+
+        punpcklbw       mm4,        mm6
+        punpcklbw       mm5,        mm7
+
+        movq            mm6,        mm4
+        psubusb         mm4,        mm5
+
+        psubusb         mm5,        mm6
+        por             mm4,        mm5               ; |src - ref| per byte for rows 2-3
+
+        movq            mm5,        mm4
+        punpcklbw       mm4,        mm3
+
+        punpckhbw       mm5,        mm3
+        paddw           mm4,        mm5               ; word sums for rows 2-3
+
+        paddw           mm0,        mm4               ; word sums for all four rows
+        movq            mm1,        mm0               ; horizontal reduction of the four word lanes
+
+        punpcklwd       mm0,        mm3
+        punpckhwd       mm1,        mm3
+
+        paddw           mm0,        mm1
+        movq            mm1,        mm0
+
+        psrlq           mm0,        32
+        paddw           mm0,        mm1               ; total is now in the low dword of mm0
+
+        movq            rax,        mm0               ; return value
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vpx_sad16x8_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vpx_sad16x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]       ; rcx = src_ptr + 8*src_stride (loop end)
+        pxor            mm7,        mm7               ; mm7 = running sum, four 16-bit lanes
+
+        pxor            mm6,        mm6               ; mm6 = 0, used to zero-extend when unpacking
+
+.x16x8sad_mmx_loop:                                   ; one 16-pixel row per iteration
+
+        movq            mm0,       [rsi]
+        movq            mm1,       [rdi]
+
+        movq            mm2,        [rsi+8]
+        movq            mm3,        [rdi+8]
+
+        movq            mm4,        mm0
+        movq            mm5,        mm2
+
+        psubusb         mm0,        mm1               ; saturating src - ref
+        psubusb         mm1,        mm4               ; saturating ref - src
+
+        psubusb         mm2,        mm3
+        psubusb         mm3,        mm5
+
+        por             mm0,        mm1               ; |src - ref| for bytes 0-7
+        por             mm2,        mm3               ; |src - ref| for bytes 8-15
+
+        movq            mm1,        mm0
+        movq            mm3,        mm2
+
+        punpcklbw       mm0,        mm6               ; widen the byte differences to words
+        punpckhbw       mm1,        mm6
+
+        punpcklbw       mm2,        mm6
+        punpckhbw       mm3,        mm6
+
+
+        paddw           mm0,        mm2
+        paddw           mm1,        mm3
+
+        paddw           mm0,        mm1
+        lea             rsi,        [rsi+rax]         ; next src row
+
+        add             rdi,        rdx               ; next ref row
+        paddw           mm7,        mm0
+
+        cmp             rsi,        rcx
+        jne             .x16x8sad_mmx_loop
+
+        movq            mm0,        mm7               ; horizontal reduction of the four word lanes
+        punpcklwd       mm0,        mm6
+
+        punpckhwd       mm7,        mm6
+        paddw           mm0,        mm7
+
+        movq            mm7,        mm0
+        psrlq           mm0,        32
+
+        paddw           mm7,        mm0               ; total is now in the low dword of mm7
+        movq            rax,        mm7               ; return value
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vpx_dsp/x86/sad_sse2.asm b/libs/libvpx/vpx_dsp/x86/sad_sse2.asm
new file mode 100644
index 0000000000..1ec906c236
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/sad_sse2.asm
@@ -0,0 +1,268 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro SAD_FN 4  ; %1 width, %2 height, %3 register-count selector (5 or 7), %4 nonzero for the _avg variant
+%if %4 == 0
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+                                    second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
+                                              ref, ref_stride, \
+                                              second_pred, \
+                                              src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+  lea         src_stride3q, [src_strideq*3]  ; precomputed 3*stride, used to address the 4th row
+  lea         ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
+
+; unsigned int vpx_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
+;                                     uint8_t *ref, int ref_stride);
+%macro SAD64XN 1-2 0  ; %1 = height, %2 = 1 emits the _avg variant (default 0)
+  SAD_FN 64, %1, 5, %2
+  mov              n_rowsd, %1  ; one 64-pixel row per iteration
+  pxor                  m0, m0  ; m0 = running sum
+.loop:
+  movu                  m1, [refq]
+  movu                  m2, [refq+16]
+  movu                  m3, [refq+32]
+  movu                  m4, [refq+48]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]  ; average ref with second_pred before the SAD
+  pavgb                 m2, [second_predq+mmsize*1]
+  pavgb                 m3, [second_predq+mmsize*2]
+  pavgb                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]  ; second_pred is consumed linearly, no stride
+%endif
+  psadbw                m1, [srcq]
+  psadbw                m2, [srcq+16]
+  psadbw                m3, [srcq+32]
+  psadbw                m4, [srcq+48]
+  paddd                 m1, m2
+  paddd                 m3, m4
+  add                 refq, ref_strideq
+  paddd                 m0, m1
+  add                 srcq, src_strideq
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0  ; fold the high-qword partial sum onto the low one
+  paddd                 m0, m1
+  movd                 eax, m0  ; return value
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD64XN 64 ; sad64x64_sse2
+SAD64XN 32 ; sad64x32_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
+
+; unsigned int vpx_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
+;                                        uint8_t *ref, int ref_stride);
+%macro SAD32XN 1-2 0  ; %1 = height, %2 = 1 emits the _avg variant (default 0)
+  SAD_FN 32, %1, 5, %2
+  mov              n_rowsd, %1/2  ; two 32-pixel rows per iteration
+  pxor                  m0, m0  ; m0 = running sum
+.loop:
+  movu                  m1, [refq]
+  movu                  m2, [refq+16]
+  movu                  m3, [refq+ref_strideq]
+  movu                  m4, [refq+ref_strideq+16]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]  ; average ref with second_pred before the SAD
+  pavgb                 m2, [second_predq+mmsize*1]
+  pavgb                 m3, [second_predq+mmsize*2]
+  pavgb                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]  ; second_pred is consumed linearly, no stride
+%endif
+  psadbw                m1, [srcq]
+  psadbw                m2, [srcq+16]
+  psadbw                m3, [srcq+src_strideq]
+  psadbw                m4, [srcq+src_strideq+16]
+  paddd                 m1, m2
+  paddd                 m3, m4
+  lea                 refq, [refq+ref_strideq*2]
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*2]
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0  ; fold the high-qword partial sum onto the low one
+  paddd                 m0, m1
+  movd                 eax, m0  ; return value
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD32XN 64 ; sad32x64_sse2
+SAD32XN 32 ; sad32x32_sse2
+SAD32XN 16 ; sad32x16_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
+
+; unsigned int vpx_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
+;                                       uint8_t *ref, int ref_stride);
+%macro SAD16XN 1-2 0  ; %1 = height, %2 = 1 emits the _avg variant (default 0)
+  SAD_FN 16, %1, 7, %2
+  mov              n_rowsd, %1/4  ; four 16-pixel rows per iteration
+  pxor                  m0, m0  ; m0 = running sum
+
+.loop:
+  movu                  m1, [refq]
+  movu                  m2, [refq+ref_strideq]
+  movu                  m3, [refq+ref_strideq*2]
+  movu                  m4, [refq+ref_stride3q]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]  ; average ref with second_pred before the SAD
+  pavgb                 m2, [second_predq+mmsize*1]
+  pavgb                 m3, [second_predq+mmsize*2]
+  pavgb                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]  ; second_pred is consumed linearly, no stride
+%endif
+  psadbw                m1, [srcq]
+  psadbw                m2, [srcq+src_strideq]
+  psadbw                m3, [srcq+src_strideq*2]
+  psadbw                m4, [srcq+src_stride3q]
+  paddd                 m1, m2
+  paddd                 m3, m4
+  lea                 refq, [refq+ref_strideq*4]
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*4]
+  paddd                 m0, m3
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0  ; fold the high-qword partial sum onto the low one
+  paddd                 m0, m1
+  movd                 eax, m0  ; return value
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD16XN 32 ; sad16x32_sse2
+SAD16XN 16 ; sad16x16_sse2
+SAD16XN  8 ; sad16x8_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN  8, 1 ; sad16x8_avg_sse2
+
+; unsigned int vpx_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
+;                                     uint8_t *ref, int ref_stride);
+%macro SAD8XN 1-2 0  ; %1 = height, %2 = 1 emits the _avg variant (default 0)
+  SAD_FN 8, %1, 7, %2
+  mov              n_rowsd, %1/4  ; four 8-pixel rows per iteration, two rows per register
+  pxor                  m0, m0  ; m0 = running sum
+
+.loop:
+  movh                  m1, [refq]  ; m1 = ref rows 0 and 1
+  movhps                m1, [refq+ref_strideq]
+  movh                  m2, [refq+ref_strideq*2]  ; m2 = ref rows 2 and 3
+  movhps                m2, [refq+ref_stride3q]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]  ; average ref with second_pred before the SAD
+  pavgb                 m2, [second_predq+mmsize*1]
+  lea         second_predq, [second_predq+mmsize*2]  ; second_pred is consumed linearly, no stride
+%endif
+  movh                  m3, [srcq]  ; m3 = src rows 0 and 1
+  movhps                m3, [srcq+src_strideq]
+  movh                  m4, [srcq+src_strideq*2]  ; m4 = src rows 2 and 3
+  movhps                m4, [srcq+src_stride3q]
+  psadbw                m1, m3
+  psadbw                m2, m4
+  lea                 refq, [refq+ref_strideq*4]
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*4]
+  paddd                 m0, m2
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0  ; fold the high-qword partial sum onto the low one
+  paddd                 m0, m1
+  movd                 eax, m0  ; return value
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD8XN 16 ; sad8x16_sse2
+SAD8XN  8 ; sad8x8_sse2
+SAD8XN  4 ; sad8x4_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN  8, 1 ; sad8x8_avg_sse2
+SAD8XN  4, 1 ; sad8x4_avg_sse2
+
+; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
+;                                   uint8_t *ref, int ref_stride);
+%macro SAD4XN 1-2 0  ; %1 = height, %2 = 1 emits the _avg variant (default 0)
+  SAD_FN 4, %1, 7, %2
+  mov              n_rowsd, %1/4  ; four 4-pixel rows per iteration, packed into one register
+  pxor                  m0, m0  ; m0 = running sum
+
+.loop:
+  movd                  m1, [refq]
+  movd                  m2, [refq+ref_strideq]
+  movd                  m3, [refq+ref_strideq*2]
+  movd                  m4, [refq+ref_stride3q]
+  punpckldq             m1, m2  ; ref rows 0 and 1
+  punpckldq             m3, m4  ; ref rows 2 and 3
+  movlhps               m1, m3  ; m1 = ref rows 0..3
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]  ; average ref with second_pred before the SAD
+  lea         second_predq, [second_predq+mmsize*1]  ; second_pred is consumed linearly, no stride
+%endif
+  movd                  m2, [srcq]
+  movd                  m5, [srcq+src_strideq]
+  movd                  m4, [srcq+src_strideq*2]
+  movd                  m3, [srcq+src_stride3q]
+  punpckldq             m2, m5  ; src rows 0 and 1
+  punpckldq             m4, m3  ; src rows 2 and 3
+  movlhps               m2, m4  ; m2 = src rows 0..3
+  psadbw                m1, m2
+  lea                 refq, [refq+ref_strideq*4]
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*4]
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0  ; fold the high-qword partial sum onto the low one
+  paddd                 m0, m1
+  movd                 eax, m0  ; return value
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD4XN  8 ; sad4x8_sse2
+SAD4XN  4 ; sad4x4_sse2
+SAD4XN  8, 1 ; sad4x8_avg_sse2
+SAD4XN  4, 1 ; sad4x4_avg_sse2
diff --git a/libs/libvpx/vpx_dsp/x86/sad_sse3.asm b/libs/libvpx/vpx_dsp/x86/sad_sse3.asm
new file mode 100644
index 0000000000..18279bdb9d
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/sad_sse3.asm
@@ -0,0 +1,374 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE_X3 0  ; prologue for the x3 SAD functions: binds src/ref/stride/result names per ABI
+%if ABI_IS_32BIT
+  %define     src_ptr       rsi
+  %define     src_stride    rax
+  %define     ref_ptr       rdi
+  %define     ref_stride    rdx
+  %define     end_ptr       rcx
+  %define     ret_var       rbx
+  %define     result_ptr    arg(4)
+  %define     height        dword ptr arg(4)
+    push        rbp                             ; frame pointer; arg(n) presumably addresses stack args through it - confirm
+    mov         rbp,        rsp
+    push        rsi                             ; saved because it becomes src_ptr
+    push        rdi                             ; saved because it becomes ref_ptr
+    push        rbx                             ; saved because it is bound to ret_var
+
+    mov         rsi,        arg(0)              ; src_ptr
+    mov         rdi,        arg(2)              ; ref_ptr
+
+    movsxd      rax,        dword ptr arg(1)    ; src_stride
+    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
+%else
+  %if LIBVPX_YASM_WIN64
+    SAVE_XMM 7, u                               ; helper defined elsewhere; presumably preserves xmm6-xmm7 on Win64 - confirm
+    %define     src_ptr     rcx
+    %define     src_stride  rdx
+    %define     ref_ptr     r8
+    %define     ref_stride  r9
+    %define     end_ptr     r10
+    %define     ret_var     r11
+    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
+    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
+  %else
+    %define     src_ptr     rdi
+    %define     src_stride  rsi
+    %define     ref_ptr     rdx
+    %define     ref_stride  rcx
+    %define     end_ptr     r9
+    %define     ret_var     r10
+    %define     result_ptr  r8
+    %define     height      r8
+  %endif
+%endif
+; NOTE(review): both 64-bit paths use the int stride registers as-is (no movsxd, unlike the 32-bit path) - confirm callers sign-extend
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0  ; epilogue matching STACK_FRAME_CREATE_X3: clears the name bindings, restores registers, returns
+  %define     src_ptr
+  %define     src_stride
+  %define     ref_ptr
+  %define     ref_stride
+  %define     end_ptr
+  %define     ret_var
+  %define     result_ptr
+  %define     height
+
+%if ABI_IS_32BIT
+    pop         rbx  ; pops mirror the pushes in STACK_FRAME_CREATE_X3, in reverse order
+    pop         rdi
+    pop         rsi
+    pop         rbp
+%else
+  %if LIBVPX_YASM_WIN64
+    RESTORE_XMM  ; counterpart of the SAVE_XMM issued in the prologue
+  %endif
+%endif
+    ret  ; the macro itself ends the function
+%endmacro
+
+%macro PROCESS_16X2X3 5  ; args: mode (0 first, 1 middle, 2 last), src, ref, src stride, ref stride; 2 rows of 16 pixels at ref+0/+1/+2
+%if %1==0
+        movdqa          xmm0,       XMMWORD PTR [%2]    ; aligned load of the source row
+        lddqu           xmm5,       XMMWORD PTR [%3]    ; ref row at offset +0
+        lddqu           xmm6,       XMMWORD PTR [%3+1]  ; ref row at offset +1
+        lddqu           xmm7,       XMMWORD PTR [%3+2]  ; ref row at offset +2
+
+        psadbw          xmm5,       xmm0                ; mode 0 initialises the accumulators xmm5/xmm6/xmm7
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       XMMWORD PTR [%2]
+        lddqu           xmm1,       XMMWORD PTR [%3]
+        lddqu           xmm2,       XMMWORD PTR [%3+1]
+        lddqu           xmm3,       XMMWORD PTR [%3+2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1                ; other modes add onto the existing accumulators
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endif
+        movdqa          xmm0,       XMMWORD PTR [%2+%4]    ; second source row
+        lddqu           xmm1,       XMMWORD PTR [%3+%5]    ; second ref row, offsets +0/+1/+2
+        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
+        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+        lea             %2,         [%2+%4*2]  ; advance both pointers by two rows; skipped in mode 2 (last pair)
+        lea             %3,         [%3+%5*2]
+%endif
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1  ; xmm5/xmm6/xmm7 hold per-64-bit-half partial SADs for offsets +0/+1/+2
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endmacro
+
+%macro PROCESS_8X2X3 5  ; args: mode (0 first, 1 middle, 2 last), src, ref, src stride, ref stride; 2 rows of 8 pixels at ref+0/+1/+2
+%if %1==0
+        movq            mm0,       QWORD PTR [%2]    ; source row (8 bytes)
+        movq            mm5,       QWORD PTR [%3]    ; ref row at offset +0
+        movq            mm6,       QWORD PTR [%3+1]  ; ref row at offset +1
+        movq            mm7,       QWORD PTR [%3+2]  ; ref row at offset +2
+
+        psadbw          mm5,       mm0               ; mode 0 initialises the accumulators mm5/mm6/mm7
+        psadbw          mm6,       mm0
+        psadbw          mm7,       mm0
+%else
+        movq            mm0,       QWORD PTR [%2]
+        movq            mm1,       QWORD PTR [%3]
+        movq            mm2,       QWORD PTR [%3+1]
+        movq            mm3,       QWORD PTR [%3+2]
+
+        psadbw          mm1,       mm0
+        psadbw          mm2,       mm0
+        psadbw          mm3,       mm0
+
+        paddw           mm5,       mm1               ; other modes add onto the existing accumulators
+        paddw           mm6,       mm2
+        paddw           mm7,       mm3
+%endif
+        movq            mm0,       QWORD PTR [%2+%4]    ; second source row
+        movq            mm1,       QWORD PTR [%3+%5]    ; second ref row, offsets +0/+1/+2
+        movq            mm2,       QWORD PTR [%3+%5+1]
+        movq            mm3,       QWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+        lea             %2,        [%2+%4*2]  ; advance both pointers by two rows; skipped in mode 2 (last pair)
+        lea             %3,        [%3+%5*2]
+%endif
+
+        psadbw          mm1,       mm0
+        psadbw          mm2,       mm0
+        psadbw          mm3,       mm0
+
+        paddw           mm5,       mm1  ; mm5/mm6/mm7 hold the running SADs for offsets +0/+1/+2
+        paddw           mm6,       mm2
+        paddw           mm7,       mm3
+%endmacro
+
+;void vpx_sad16x16x3_sse3(
+;    unsigned char *src_ptr,   - 16x16 source block; rows are read with movdqa
+;    int  src_stride,
+;    unsigned char *ref_ptr,   - reference block; SADs are taken at ref_ptr+0, +1 and +2
+;    int  ref_stride,
+;    int  *results)            - out: three 32-bit SADs, one per reference offset
+global sym(vpx_sad16x16x3_sse3) PRIVATE
+sym(vpx_sad16x16x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride  ; rows 0-1, initialises xmm5/xmm6/xmm7
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride  ; rows 14-15, pointers not advanced
+
+        mov             rcx,        result_ptr  ; rcx is free to reuse now that all rows are processed
+
+        movq            xmm0,       xmm5        ; low-half partial SAD for offset +0
+        psrldq          xmm5,       8           ; bring the high-half partial SAD down
+
+        paddw           xmm0,       xmm5        ; total SAD for offset +0
+        movd            [rcx],      xmm0
+;-
+        movq            xmm0,       xmm6        ; same reduction for offset +1
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rcx+4],    xmm0
+;-
+        movq            xmm0,       xmm7        ; same reduction for offset +2
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rcx+8],    xmm0
+
+    STACK_FRAME_DESTROY_X3
+
+;void vpx_sad16x8x3_sse3(
+;    unsigned char *src_ptr,   - 16x8 source block; rows are read with movdqa
+;    int  src_stride,
+;    unsigned char *ref_ptr,   - reference block; SADs are taken at ref_ptr+0, +1 and +2
+;    int  ref_stride,
+;    int  *results)            - out: three 32-bit SADs, one per reference offset
+global sym(vpx_sad16x8x3_sse3) PRIVATE
+sym(vpx_sad16x8x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride  ; rows 0-1, initialises xmm5/xmm6/xmm7
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride  ; rows 6-7, pointers not advanced
+
+        mov             rcx,        result_ptr  ; rcx is free to reuse now that all rows are processed
+
+        movq            xmm0,       xmm5        ; low-half partial SAD for offset +0
+        psrldq          xmm5,       8           ; bring the high-half partial SAD down
+
+        paddw           xmm0,       xmm5        ; total SAD for offset +0
+        movd            [rcx],      xmm0
+;-
+        movq            xmm0,       xmm6        ; same reduction for offset +1
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rcx+4],    xmm0
+;-
+        movq            xmm0,       xmm7        ; same reduction for offset +2
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rcx+8],    xmm0
+
+    STACK_FRAME_DESTROY_X3
+
+;void vpx_sad8x16x3_sse3(
+;    unsigned char *src_ptr,   - 8x16 source block
+;    int  src_stride,
+;    unsigned char *ref_ptr,   - reference block; SADs are taken at ref_ptr+0, +1 and +2
+;    int  ref_stride,
+;    int  *results)            - out: three 32-bit SADs, one per reference offset
+global sym(vpx_sad8x16x3_sse3) PRIVATE
+sym(vpx_sad8x16x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride  ; rows 0-1, initialises mm5/mm6/mm7
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride  ; rows 14-15, pointers not advanced
+
+        mov             rcx,        result_ptr  ; rcx is free to reuse now that all rows are processed
+
+        punpckldq       mm5,        mm6         ; pack SAD(+0) and SAD(+1) into one qword
+
+        movq            [rcx],      mm5         ; results[0], results[1]
+        movd            [rcx+8],    mm7         ; results[2]; NOTE(review): no emms is issued here - presumably the caller clears MMX state
+
+    STACK_FRAME_DESTROY_X3
+
+;void vpx_sad8x8x3_sse3(
+;    unsigned char *src_ptr,   - 8x8 source block
+;    int  src_stride,
+;    unsigned char *ref_ptr,   - reference block; SADs are taken at ref_ptr+0, +1 and +2
+;    int  ref_stride,
+;    int  *results)            - out: three 32-bit SADs, one per reference offset
+global sym(vpx_sad8x8x3_sse3) PRIVATE
+sym(vpx_sad8x8x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride  ; rows 0-1, initialises mm5/mm6/mm7
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride  ; rows 6-7, pointers not advanced
+
+        mov             rcx,        result_ptr  ; rcx is free to reuse now that all rows are processed
+
+        punpckldq       mm5,        mm6         ; pack SAD(+0) and SAD(+1) into one qword
+
+        movq            [rcx],      mm5         ; results[0], results[1]
+        movd            [rcx+8],    mm7         ; results[2]; NOTE(review): no emms is issued here - presumably the caller clears MMX state
+
+    STACK_FRAME_DESTROY_X3
+
+;void vpx_sad4x4x3_sse3(
+;    unsigned char *src_ptr,   - 4x4 source block
+;    int  src_stride,
+;    unsigned char *ref_ptr,   - reference block; SADs are taken at ref_ptr+0, +1 and +2
+;    int  ref_stride,
+;    int  *results)            - out: three 32-bit SADs, one per reference offset
+global sym(vpx_sad4x4x3_sse3) PRIVATE
+sym(vpx_sad4x4x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        movd            mm0,        DWORD PTR [src_ptr]              ; src row 0
+        movd            mm1,        DWORD PTR [ref_ptr]              ; ref row 0, offset +0
+
+        movd            mm2,        DWORD PTR [src_ptr+src_stride]   ; src row 1
+        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]   ; ref row 1, offset +0
+
+        punpcklbw       mm0,        mm2     ; interleave src rows 0,1 so one psadbw covers both rows
+        punpcklbw       mm1,        mm3     ; ref rows 0,1 interleaved the same way
+
+        movd            mm4,        DWORD PTR [ref_ptr+1]            ; ref row 0, offset +1
+        movd            mm5,        DWORD PTR [ref_ptr+2]            ; ref row 0, offset +2
+
+        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1] ; ref row 1, offset +1
+        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2] ; ref row 1, offset +2
+
+        psadbw          mm1,        mm0     ; rows 0-1 SAD, offset +0
+
+        punpcklbw       mm4,        mm2
+        punpcklbw       mm5,        mm3
+
+        psadbw          mm4,        mm0     ; rows 0-1 SAD, offset +1
+        psadbw          mm5,        mm0     ; rows 0-1 SAD, offset +2
+
+        lea             src_ptr,    [src_ptr+src_stride*2]  ; move on to rows 2-3
+        lea             ref_ptr,    [ref_ptr+ref_stride*2]
+
+        movd            mm0,        DWORD PTR [src_ptr]
+        movd            mm2,        DWORD PTR [ref_ptr]
+
+        movd            mm3,        DWORD PTR [src_ptr+src_stride]
+        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
+
+        punpcklbw       mm0,        mm3     ; src rows 2,3 interleaved
+        punpcklbw       mm2,        mm6     ; ref rows 2,3 interleaved, offset +0
+
+        movd            mm3,        DWORD PTR [ref_ptr+1]
+        movd            mm7,        DWORD PTR [ref_ptr+2]
+
+        psadbw          mm2,        mm0     ; rows 2-3 SAD, offset +0
+
+        paddw           mm1,        mm2     ; mm1 = full 4x4 SAD, offset +0
+
+        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
+        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
+
+        punpcklbw       mm3,        mm2
+        punpcklbw       mm7,        mm6
+
+        psadbw          mm3,        mm0     ; rows 2-3 SAD, offset +1
+        psadbw          mm7,        mm0     ; rows 2-3 SAD, offset +2
+
+        paddw           mm3,        mm4     ; mm3 = full 4x4 SAD, offset +1
+        paddw           mm7,        mm5     ; mm7 = full 4x4 SAD, offset +2
+
+        mov             rcx,        result_ptr
+
+        punpckldq       mm1,        mm3     ; pack SAD(+0) and SAD(+1) into one qword
+
+        movq            [rcx],      mm1     ; results[0], results[1]
+        movd            [rcx+8],    mm7     ; results[2]; NOTE(review): no emms is issued here - presumably the caller clears MMX state
+
+    STACK_FRAME_DESTROY_X3
diff --git a/libs/libvpx/vpx_dsp/x86/sad_sse4.asm b/libs/libvpx/vpx_dsp/x86/sad_sse4.asm
new file mode 100644
index 0000000000..bc67447971
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/sad_sse4.asm
@@ -0,0 +1,359 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X8 1  ; arg: 1 = first call (initialises xmm1), 0 = accumulate; 2 rows of 16 pixels against 8 ref offsets
+%if %1
+        movdqa          xmm0,       XMMWORD PTR [rsi]       ; 16 source bytes (aligned load)
+        movq            xmm1,       MMWORD PTR [rdi]        ; ref bytes 0-7
+        movq            xmm3,       MMWORD PTR [rdi+8]      ; ref bytes 8-15
+        movq            xmm2,       MMWORD PTR [rdi+16]     ; ref bytes 16-23
+        punpcklqdq      xmm1,       xmm3                    ; xmm1 = ref bytes 0-15
+        punpcklqdq      xmm3,       xmm2                    ; xmm3 = ref bytes 8-23
+
+        movdqa          xmm2,       xmm1
+        mpsadbw         xmm1,       xmm0,  0x0              ; src bytes 0-3 against 8 sliding ref positions
+        mpsadbw         xmm2,       xmm0,  0x5              ; src bytes 4-7, ref window shifted by 4
+
+        psrldq          xmm0,       8                       ; bring src bytes 8-15 into the low half
+
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0              ; src bytes 8-11
+        mpsadbw         xmm4,       xmm0,  0x5              ; src bytes 12-15
+
+        paddw           xmm1,       xmm2                    ; xmm1 = eight 16-bit row SADs, one per ref offset
+        paddw           xmm1,       xmm3
+        paddw           xmm1,       xmm4
+%else
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        movq            xmm5,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        movq            xmm2,       MMWORD PTR [rdi+16]
+        punpcklqdq      xmm5,       xmm3
+        punpcklqdq      xmm3,       xmm2
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+
+        psrldq          xmm0,       8
+
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0
+        mpsadbw         xmm4,       xmm0,  0x5
+
+        paddw           xmm5,       xmm2
+        paddw           xmm5,       xmm3
+        paddw           xmm5,       xmm4
+
+        paddw           xmm1,       xmm5                    ; add this row onto the running totals in xmm1
+%endif
+        movdqa          xmm0,       XMMWORD PTR [rsi + rax] ; second source row (rax = src stride)
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]   ; second ref row (rdx = ref stride)
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
+        punpcklqdq      xmm5,       xmm3
+        punpcklqdq      xmm3,       xmm2
+
+        lea             rsi,        [rsi+rax*2]             ; advance both pointers by two rows
+        lea             rdi,        [rdi+rdx*2]
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+
+        psrldq          xmm0,       8
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0
+        mpsadbw         xmm4,       xmm0,  0x5
+
+        paddw           xmm5,       xmm2
+        paddw           xmm5,       xmm3
+        paddw           xmm5,       xmm4
+
+        paddw           xmm1,       xmm5                    ; second row added to the running totals
+%endmacro
+
+%macro PROCESS_8X2X8 1  ; arg: 1 = first call (initialises xmm1), 0 = accumulate; 2 rows of 8 pixels against 8 ref offsets
+%if %1
+        movq            xmm0,       MMWORD PTR [rsi]        ; 8 source bytes
+        movq            xmm1,       MMWORD PTR [rdi]        ; ref bytes 0-7
+        movq            xmm3,       MMWORD PTR [rdi+8]      ; ref bytes 8-15
+        punpcklqdq      xmm1,       xmm3                    ; xmm1 = ref bytes 0-15
+
+        movdqa          xmm2,       xmm1
+        mpsadbw         xmm1,       xmm0,  0x0              ; src bytes 0-3 against 8 sliding ref positions
+        mpsadbw         xmm2,       xmm0,  0x5              ; src bytes 4-7, ref window shifted by 4
+        paddw           xmm1,       xmm2                    ; xmm1 = eight 16-bit row SADs
+%else
+        movq            xmm0,       MMWORD PTR [rsi]
+        movq            xmm5,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm5,       xmm3
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+        paddw           xmm5,       xmm2
+
+        paddw           xmm1,       xmm5                    ; add this row onto the running totals in xmm1
+%endif
+        movq            xmm0,       MMWORD PTR [rsi + rax]  ; second source row (rax = src stride)
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]   ; second ref row (rdx = ref stride)
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        punpcklqdq      xmm5,       xmm3
+
+        lea             rsi,        [rsi+rax*2]             ; advance both pointers by two rows
+        lea             rdi,        [rdi+rdx*2]
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+        paddw           xmm5,       xmm2
+
+        paddw           xmm1,       xmm5                    ; second row added to the running totals
+%endmacro
+
+%macro PROCESS_4X2X8 1  ; arg: 1 = first call (initialises xmm1), 0 = accumulate; 2 rows of 4 pixels against 8 ref offsets
+%if %1
+        movd            xmm0,       [rsi]                   ; 4 source bytes
+        movq            xmm1,       MMWORD PTR [rdi]        ; ref bytes 0-7
+        movq            xmm3,       MMWORD PTR [rdi+8]      ; ref bytes 8-15
+        punpcklqdq      xmm1,       xmm3                    ; xmm1 = ref bytes 0-15
+
+        mpsadbw         xmm1,       xmm0,  0x0              ; the 4 src bytes against 8 sliding ref positions
+%else
+        movd            xmm0,       [rsi]
+        movq            xmm5,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm5,       xmm3
+
+        mpsadbw         xmm5,       xmm0,  0x0
+
+        paddw           xmm1,       xmm5                    ; add this row onto the running totals in xmm1
+%endif
+        movd            xmm0,       [rsi + rax]             ; second source row (rax = src stride)
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]   ; second ref row (rdx = ref stride)
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        punpcklqdq      xmm5,       xmm3
+
+        lea             rsi,        [rsi+rax*2]             ; advance both pointers by two rows
+        lea             rdi,        [rdi+rdx*2]
+
+        mpsadbw         xmm5,       xmm0,  0x0
+
+        paddw           xmm1,       xmm5                    ; second row added to the running totals
+%endmacro
+
+%macro WRITE_AS_INTS 0  ; widens the eight 16-bit SADs in xmm1 to 32 bits and stores them to the results array
+    mov             rdi,        arg(4)           ;Results
+    pxor            xmm0, xmm0                   ; zero register used for the zero-extension
+    movdqa          xmm2, xmm1
+    punpcklwd       xmm1, xmm0                   ; SADs 0-3 as dwords
+    punpckhwd       xmm2, xmm0                   ; SADs 4-7 as dwords
+
+    movdqa          [rdi],    xmm1               ; aligned stores: the results array must be 16-byte aligned
+    movdqa          [rdi + 16],    xmm2
+%endmacro
+
+;void vpx_sad16x16x8_sse4_1(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned int *sad_array);  - out: eight 32-bit SADs (ref_ptr+0 .. +7), written by WRITE_AS_INTS
+global sym(vpx_sad16x16x8_sse4_1) PRIVATE
+sym(vpx_sad16x16x8_sse4_1):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
+
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+    PROCESS_16X2X8 1  ; rows 0-1, initialises the totals in xmm1
+    PROCESS_16X2X8 0  ; each further call adds two more rows
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0  ; rows 14-15
+
+    WRITE_AS_INTS
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vpx_sad16x8x8_sse4_1(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned int *sad_array    - out: eight 32-bit SADs (ref_ptr+0 .. +7), written by WRITE_AS_INTS
+;);
+global sym(vpx_sad16x8x8_sse4_1) PRIVATE
+sym(vpx_sad16x8x8_sse4_1):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
+
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+    PROCESS_16X2X8 1  ; rows 0-1, initialises the totals in xmm1
+    PROCESS_16X2X8 0  ; each further call adds two more rows
+    PROCESS_16X2X8 0
+    PROCESS_16X2X8 0  ; rows 6-7
+
+    WRITE_AS_INTS
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vpx_sad8x8x8_sse4_1(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned int *sad_array    - out: eight 32-bit SADs (ref_ptr+0 .. +7), written by WRITE_AS_INTS
+;);
+global sym(vpx_sad8x8x8_sse4_1) PRIVATE
+sym(vpx_sad8x8x8_sse4_1):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
+
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+    PROCESS_8X2X8 1  ; rows 0-1, initialises the totals in xmm1
+    PROCESS_8X2X8 0  ; each further call adds two more rows
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0  ; rows 6-7
+
+    WRITE_AS_INTS
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vpx_sad8x16x8_sse4_1(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned int *sad_array    - out: eight 32-bit SADs (ref_ptr+0 .. +7), written by WRITE_AS_INTS
+;);
+global sym(vpx_sad8x16x8_sse4_1) PRIVATE
+sym(vpx_sad8x16x8_sse4_1):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
+
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+    PROCESS_8X2X8 1  ; rows 0-1, initialises the totals in xmm1
+    PROCESS_8X2X8 0  ; each further call adds two more rows
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0
+    PROCESS_8X2X8 0  ; rows 14-15
+
+    WRITE_AS_INTS
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vpx_sad4x4x8_sse4_1(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned int *sad_array    - out: eight 32-bit SADs (ref_ptr+0 .. +7), written by WRITE_AS_INTS
+;);
+global sym(vpx_sad4x4x8_sse4_1) PRIVATE
+sym(vpx_sad4x4x8_sse4_1):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0)           ;src_ptr
+    mov             rdi,        arg(2)           ;ref_ptr
+
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+    PROCESS_4X2X8 1  ; rows 0-1, initialises the totals in xmm1
+    PROCESS_4X2X8 0  ; rows 2-3
+
+    WRITE_AS_INTS
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+
diff --git a/libs/libvpx/vpx_dsp/x86/sad_ssse3.asm b/libs/libvpx/vpx_dsp/x86/sad_ssse3.asm
new file mode 100644
index 0000000000..49f204fa04
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/sad_ssse3.asm
@@ -0,0 +1,370 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X3 1  ; arg: 1 = first call (initialises xmm5/xmm6/xmm7), 0 = accumulate; 2 rows, unaligned ref loads
+%if %1
+        movdqa          xmm0,       XMMWORD PTR [rsi]       ; aligned load of the source row
+        lddqu           xmm5,       XMMWORD PTR [rdi]       ; ref row at offset +0
+        lddqu           xmm6,       XMMWORD PTR [rdi+1]     ; ref row at offset +1
+        lddqu           xmm7,       XMMWORD PTR [rdi+2]     ; ref row at offset +2
+
+        psadbw          xmm5,       xmm0
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm1,       XMMWORD PTR [rdi]
+        lddqu           xmm2,       XMMWORD PTR [rdi+1]
+        lddqu           xmm3,       XMMWORD PTR [rdi+2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1                    ; add onto the existing accumulators
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endif
+        movdqa          xmm0,       XMMWORD PTR [rsi+rax]   ; second source row (rax = src stride)
+        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]   ; second ref row (rdx = ref stride)
+        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
+        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
+
+        lea             rsi,        [rsi+rax*2]             ; advance both pointers by two rows
+        lea             rdi,        [rdi+rdx*2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1                    ; xmm5/xmm6/xmm7 = partial SADs for offsets +0/+1/+2
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endmacro
+
+%macro PROCESS_16X2X3_OFFSET 2  ; args: first-call flag, byte offset of ref within its aligned 16-byte line; rdi must already be aligned
+%if %1
+        movdqa          xmm0,       XMMWORD PTR [rsi]       ; source row
+        movdqa          xmm4,       XMMWORD PTR [rdi]       ; aligned ref bytes 0-15
+        movdqa          xmm7,       XMMWORD PTR [rdi+16]    ; aligned ref bytes 16-31
+
+        movdqa          xmm5,       xmm7
+        palignr         xmm5,       xmm4,       %2          ; 16 ref bytes starting at the offset (position +0)
+
+        movdqa          xmm6,       xmm7
+        palignr         xmm6,       xmm4,       (%2+1)      ; position +1
+
+        palignr         xmm7,       xmm4,       (%2+2)      ; position +2
+
+        psadbw          xmm5,       xmm0                    ; first call initialises xmm5/xmm6/xmm7
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        movdqa          xmm4,       XMMWORD PTR [rdi]
+        movdqa          xmm3,       XMMWORD PTR [rdi+16]
+
+        movdqa          xmm1,       xmm3
+        palignr         xmm1,       xmm4,       %2
+
+        movdqa          xmm2,       xmm3
+        palignr         xmm2,       xmm4,       (%2+1)
+
+        palignr         xmm3,       xmm4,       (%2+2)
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1                    ; add onto the existing accumulators
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endif
+        movdqa          xmm0,       XMMWORD PTR [rsi+rax]   ; second source row (rax = src stride)
+        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]   ; second ref row; NOTE(review): stays aligned only if ref stride is a multiple of 16 - confirm
+        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
+
+        movdqa          xmm1,       xmm3
+        palignr         xmm1,       xmm4,       %2
+
+        movdqa          xmm2,       xmm3
+        palignr         xmm2,       xmm4,       (%2+1)
+
+        palignr         xmm3,       xmm4,       (%2+2)
+
+        lea             rsi,        [rsi+rax*2]             ; advance both pointers by two rows
+        lea             rdi,        [rdi+rdx*2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endmacro
+
+%macro PROCESS_16X16X3_OFFSET 2  ; args: alignment offset, label prefix; emits the 16-row body for one misalignment value
+%2_aligned_by_%1:
+
+        sub             rdi,        %1  ; step ref back to its 16-byte boundary so the aligned loads are legal
+
+        PROCESS_16X2X3_OFFSET 1, %1  ; rows 0-1, initialises the accumulators
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1  ; rows 14-15
+
+        jmp             %2_store_off  ; shared result-store code at the end of the function
+
+%endmacro
+
+%macro PROCESS_16X8X3_OFFSET 2  ; args: alignment offset, label prefix; emits the 8-row body for one misalignment value
+%2_aligned_by_%1:
+
+        sub             rdi,        %1  ; step ref back to its 16-byte boundary so the aligned loads are legal
+
+        PROCESS_16X2X3_OFFSET 1, %1  ; rows 0-1, initialises the accumulators
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1  ; rows 6-7
+
+        jmp             %2_store_off  ; shared result-store code at the end of the function
+
+%endmacro
+
+;void vpx_sad16x16x3_ssse3(
+;    unsigned char *src_ptr,   - 16x16 source block; rows are read with movdqa
+;    int  src_stride,
+;    unsigned char *ref_ptr,   - reference block; SADs are taken at ref_ptr+0, +1 and +2
+;    int  ref_stride,
+;    int  *results)            - out: three 32-bit SADs, one per reference offset
+global sym(vpx_sad16x16x3_ssse3) PRIVATE
+sym(vpx_sad16x16x3_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rcx
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        mov             rdx,        0xf
+        and             rdx,        rdi    ; rdx = ref_ptr misalignment within 16 bytes (0..15), the jump table index
+
+        jmp .vpx_sad16x16x3_ssse3_skiptable  ; jump over the table, which is data embedded in the code
+.vpx_sad16x16x3_ssse3_jumptable:
+        dd .vpx_sad16x16x3_ssse3_aligned_by_0  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_1  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_2  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_3  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_4  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_5  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_6  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_7  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_8  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_9  - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
+        dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
+.vpx_sad16x16x3_ssse3_skiptable:
+
+        call .vpx_sad16x16x3_ssse3_do_jump  ; call-next-instruction: pushes the address of do_jump (position-independent)
+.vpx_sad16x16x3_ssse3_do_jump:
+        pop             rcx                         ; get the address of do_jump
+        mov             rax,  .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
+        add             rax,  rcx  ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
+
+        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
+        add             rcx,        rax             ; rcx = absolute address of the matching aligned_by_N body
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        jmp             rcx
+
+        PROCESS_16X16X3_OFFSET 0,  .vpx_sad16x16x3_ssse3  ; offsets 0..14 use aligned loads plus palignr
+        PROCESS_16X16X3_OFFSET 1,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 2,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 3,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 4,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 5,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 6,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 7,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 8,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 9,  .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
+
+.vpx_sad16x16x3_ssse3_aligned_by_15:
+        PROCESS_16X2X3 1  ; offset 15 uses unaligned lddqu loads: position +2 would need bytes beyond the two aligned lines
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0  ; falls through into the store code below
+
+.vpx_sad16x16x3_ssse3_store_off:
+        mov             rdi,        arg(4) ;Results
+
+        movq            xmm0,       xmm5        ; low-half partial SAD for offset +0
+        psrldq          xmm5,       8           ; bring the high-half partial SAD down
+
+        paddw           xmm0,       xmm5        ; total SAD for offset +0
+        movd            [rdi],      xmm0
+;-
+        movq            xmm0,       xmm6        ; same reduction for offset +1
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rdi+4],    xmm0
+;-
+        movq            xmm0,       xmm7        ; same reduction for offset +2
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rdi+8],    xmm0
+
+    ; begin epilog
+    pop         rcx
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+; void vpx_sad16x8x3_ssse3(unsigned char *src_ptr, int src_stride,
+;                          unsigned char *ref_ptr, int ref_stride,
+;                          int *results)
+; Selects one of 16 code paths from the low four bits of ref_ptr, then
+; writes the totals accumulated in xmm5, xmm6 and xmm7 to results[0],
+; results[1] and results[2].
+global sym(vpx_sad16x8x3_ssse3) PRIVATE
+sym(vpx_sad16x8x3_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rcx
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        mov             rdx,        0xf
+        and             rdx,        rdi     ; rdx = ref_ptr & 15, used below as the jump-table index
+
+        jmp .vpx_sad16x8x3_ssse3_skiptable  ; step over the table data embedded in the code stream
+.vpx_sad16x8x3_ssse3_jumptable:            ; 16 dword entries, each an offset relative to do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_0  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_1  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_2  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_3  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_4  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_5  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_6  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_7  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_8  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_9  - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
+        dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
+.vpx_sad16x8x3_ssse3_skiptable:
+
+        call .vpx_sad16x8x3_ssse3_do_jump   ; call to the next instruction: leaves its own address on the stack
+.vpx_sad16x8x3_ssse3_do_jump:
+        pop             rcx                         ; get the address of do_jump
+        mov             rax,  .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
+        add             rax,  rcx  ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
+
+        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
+        add             rcx,        rax             ; rcx = do_jump + offset = selected aligned_by_N label
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        jmp             rcx                         ; enter the path chosen by ref_ptr & 15
+        ; NOTE(review): PROCESS_* macros are defined outside this view; presumably they emit aligned_by_0..14 and accumulate into xmm5-xmm7 - confirm
+        PROCESS_16X8X3_OFFSET 0,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 1,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 2,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 3,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 4,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 5,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 6,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 7,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 8,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 9,  .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
+
+.vpx_sad16x8x3_ssse3_aligned_by_15:        ; last table entry; falls through into store_off below
+
+        PROCESS_16X2X3 1
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+
+.vpx_sad16x8x3_ssse3_store_off:
+        mov             rdi,        arg(4) ;Results
+
+        movq            xmm0,       xmm5            ; xmm0 = low qword of xmm5, upper qword cleared
+        psrldq          xmm5,       8               ; move the high qword of xmm5 down to the low half
+
+        paddw           xmm0,       xmm5            ; add the two halves word by word
+        movd            [rdi],      xmm0            ; results[0] = low 32 bits
+;-
+        movq            xmm0,       xmm6            ; same two-half reduction for xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rdi+4],    xmm0            ; results[1]
+;-
+        movq            xmm0,       xmm7            ; same two-half reduction for xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rdi+8],    xmm0            ; results[2]
+
+    ; begin epilog
+    pop         rcx
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm b/libs/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
new file mode 100644
index 0000000000..6d58321e03
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
@@ -0,0 +1,216 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; TABULATE_SSIM: fold s (xmm3) and r (xmm4) words into totals xmm15..xmm11; clobbers xmm1-xmm3
+%macro TABULATE_SSIM 0
+        movdqa          xmm1, xmm3   ; working copy of s
+        movdqa          xmm2, xmm4   ; working copy of r
+        paddusw         xmm15, xmm3  ; sum_s (saturating word add)
+        paddusw         xmm14, xmm4  ; sum_r (saturating word add)
+        pmaddwd         xmm1, xmm1   ; s*s, adjacent products added into dwords
+        pmaddwd         xmm2, xmm2   ; r*r, adjacent products added into dwords
+        pmaddwd         xmm3, xmm4   ; s*r, adjacent products added into dwords
+        paddd           xmm13, xmm1  ; sum_sq_s
+        paddd           xmm12, xmm2  ; sum_sq_r
+        paddd           xmm11, xmm3  ; sum_sxr
+%endmacro
+
+; Sum the four dwords of %1 into its low qword (high qword ends up 0); needs xmm0 == 0, clobbers xmm2
+%macro SUM_ACROSS_Q 1
+        movdqa          xmm2,%1      ; keep a copy for the upper two dwords
+        punpckldq       %1,xmm0      ; dwords 0,1 zero-extended to two qwords
+        punpckhdq       xmm2,xmm0    ; dwords 2,3 zero-extended to two qwords
+        paddq           %1,xmm2      ; two qword partial sums
+        movdqa          xmm2,%1      ; copy so the high qword can be isolated
+        punpcklqdq      %1,xmm0      ; %1 = [low partial, 0]
+        punpckhqdq      xmm2,xmm0    ; xmm2 = [high partial, 0]
+        paddq           %1,xmm2      ; low qword = total of all four dwords
+%endmacro
+
+; Sum the eight unsigned words of %1 into its low qword; needs xmm0 == 0, clobbers xmm1 and xmm2
+%macro SUM_ACROSS_W 1
+        movdqa          xmm1, %1     ; keep a copy for the upper four words
+        punpcklwd       %1,xmm0      ; words 0-3 zero-extended to dwords
+        punpckhwd       xmm1,xmm0    ; words 4-7 zero-extended to dwords
+        paddd           %1, xmm1     ; four dword partial sums
+        SUM_ACROSS_Q    %1
+%endmacro
+;void vpx_ssim_parms_16x16_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    uint32_t *sum_s,
+;    uint32_t *sum_r,
+;    uint32_t *sum_sq_s,
+;    uint32_t *sum_sq_r,
+;    uint32_t *sum_sxr);
+; Sums s, r, s*s, r*r and s*r over a 16x16 block and stores the five totals.
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vpx_ssim_parms_16x16_sse2) PRIVATE
+sym(vpx_ssim_parms_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    movsxd          rcx,        dword ptr arg(1) ;sp (int: sign-extend, upper 32 bits of the slot are undefined)
+    mov             rdi,        arg(2) ;r
+    movsxd          rax,        dword ptr arg(3) ;rp (int: sign-extend, upper 32 bits of the slot are undefined)
+
+    pxor            xmm0, xmm0   ;zero register used by the unpacks and SUM_ACROSS_* macros
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+    ; each word lane of sum_s/sum_r receives 2 values per row * 16 rows, each <= 255, so paddusw never saturates
+    mov             rdx, 16      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movdqu          xmm5, [rsi]  ; 16 source bytes (unaligned load)
+    movdqu          xmm6, [rdi]  ; 16 reference bytes (unaligned load)
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpckhbw       xmm3, xmm0 ; high_s
+    punpckhbw       xmm4, xmm0 ; high_r
+
+    TABULATE_SSIM
+
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movd            [rdi], xmm15 ; *sum_s    = low 32 bits of the total
+    mov             rdi,arg(5)
+    movd            [rdi], xmm14 ; *sum_r
+    mov             rdi,arg(6)
+    movd            [rdi], xmm13 ; *sum_sq_s
+    mov             rdi,arg(7)
+    movd            [rdi], xmm12 ; *sum_sq_r
+    mov             rdi,arg(8)
+    movd            [rdi], xmm11 ; *sum_sxr
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_ssim_parms_8x8_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    uint32_t *sum_s,
+;    uint32_t *sum_r,
+;    uint32_t *sum_sq_s,
+;    uint32_t *sum_sq_r,
+;    uint32_t *sum_sxr);
+; Sums s, r, s*s, r*r and s*r over an 8x8 block and stores the five totals.
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 8x8 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vpx_ssim_parms_8x8_sse2) PRIVATE
+sym(vpx_ssim_parms_8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    movsxd          rcx,        dword ptr arg(1) ;sp (int: sign-extend, upper 32 bits of the slot are undefined)
+    mov             rdi,        arg(2) ;r
+    movsxd          rax,        dword ptr arg(3) ;rp (int: sign-extend, upper 32 bits of the slot are undefined)
+
+    pxor            xmm0, xmm0   ;zero register used by the unpacks and SUM_ACROSS_* macros
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+    ; each word lane of sum_s/sum_r receives 1 value per row * 8 rows, each <= 255, so paddusw never saturates
+    mov             rdx, 8      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movq            xmm3, [rsi] ; 8 source bytes
+    movq            xmm4, [rdi] ; 8 reference bytes
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movd            [rdi], xmm15 ; *sum_s    = low 32 bits of the total
+    mov             rdi,arg(5)
+    movd            [rdi], xmm14 ; *sum_r
+    mov             rdi,arg(6)
+    movd            [rdi], xmm13 ; *sum_sq_s
+    mov             rdi,arg(7)
+    movd            [rdi], xmm12 ; *sum_sq_r
+    mov             rdi,arg(8)
+    movd            [rdi], xmm11 ; *sum_sxr
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm b/libs/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm
new file mode 100644
index 0000000000..c655e4b346
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -0,0 +1,1400 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times  8 dw  8                    ; rounding term added before the ">> 4" of the bilinear filter
+bilin_filter_m_sse2: times  8 dw 16     ; entry 0: tap a = 16 (8 words) ...
+                     times  8 dw  0     ;          tap b = 0  (8 words); each entry is 32 bytes
+                     times  8 dw 14     ; entry 1: a = 14
+                     times  8 dw  2     ;          b = 2
+                     times  8 dw 12     ; entry 2: a = 12
+                     times  8 dw  4     ;          b = 4
+                     times  8 dw 10     ; entry 3: a = 10
+                     times  8 dw  6     ;          b = 6
+                     times 16 dw  8     ; entry 4: a = b = 8, both vectors written as one run
+                     times  8 dw  6     ; entry 5: a = 6
+                     times  8 dw 10     ;          b = 10
+                     times  8 dw  4     ; entry 6: a = 4
+                     times  8 dw 12     ;          b = 12
+                     times  8 dw  2     ; entry 7: a = 2
+                     times  8 dw 14     ;          b = 14
+
+bilin_filter_m_ssse3: times  8 db 16,  0   ; entry 0: byte pairs (a, b) = (16, 0); each entry is 16 bytes
+                      times  8 db 14,  2   ; entry 1: (14, 2)
+                      times  8 db 12,  4   ; entry 2: (12, 4)
+                      times  8 db 10,  6   ; entry 3: (10, 6)
+                      times 16 db  8       ; entry 4: (8, 8)
+                      times  8 db  6, 10   ; entry 5: (6, 10)
+                      times  8 db  4, 12   ; entry 6: (4, 12)
+                      times  8 db  2, 14   ; entry 7: (2, 14)
+
+SECTION .text
+
+; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+;                               int x_offset, int y_offset,
+;                               const uint8_t *dst, ptrdiff_t dst_stride,
+;                               int height, unsigned int *sse);
+;
+; This function returns the SE and stores SSE in the given pointer.
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+  psubw                %1, %2           ; diff1 = src1 - dst1 (word lanes)
+  psubw                %3, %4           ; diff2 = src2 - dst2
+  paddw                %5, %1           ; sum += diff1
+  paddw                %5, %3           ; sum += diff2
+  pmaddwd              %1, %1           ; diff1 squared, adjacent pairs added into dwords
+  pmaddwd              %3, %3           ; diff2 squared, adjacent pairs added into dwords
+  paddd                %6, %1           ; sse += diff1 squares
+  paddd                %6, %3           ; sse += diff2 squares
+%endmacro
+
+%macro STORE_AND_RET 0 ; in: m5 = 0, m6 = sum (words), m7 = sse (dwords)
+%if mmsize == 16
+  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+  ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+  pcmpgtw              m5, m6           ; mask for 0 > x
+  movhlps              m3, m7           ; m3 low half = upper two sse dwords
+  punpcklwd            m4, m6, m5       ; m4 = low four sum words sign-extended to dwords
+  punpckhwd            m6, m5           ; sign-extend m6 word->dword
+  paddd                m7, m3           ; sse: four dwords folded to two
+  paddd                m6, m4           ; sum: eight words folded to four dwords
+  pshufd               m3, m7, 0x1      ; m3[0] = m7[1]
+  movhlps              m4, m6           ; m4 low half = upper two sum dwords
+  paddd                m7, m3           ; m7[0] = total sse
+  paddd                m6, m4           ; sum: four dwords folded to two
+  mov                  r1, ssem         ; r1 = unsigned int *sse
+  pshufd               m4, m6, 0x1      ; m4[0] = m6[1]
+  movd               [r1], m7           ; store sse
+  paddd                m6, m4           ; m6[0] = total sum
+  movd               raxd, m6           ; store sum as return value
+%else ; mmsize == 8
+  pshufw               m4, m6, 0xe      ; m4 low two words = m6 words 2 and 3
+  pshufw               m3, m7, 0xe      ; m3 low dword = m7 high dword
+  paddw                m6, m4           ; sum: four words folded to two
+  paddd                m7, m3           ; m7[0] = total sse
+  pcmpgtw              m5, m6           ; mask for 0 > x
+  mov                  r1, ssem         ; r1 = unsigned int *sse
+  punpcklwd            m6, m5           ; sign-extend m6 word->dword
+  movd               [r1], m7           ; store sse
+  pshufw               m4, m6, 0xe      ; m4 low dword = m6 high dword
+  paddd                m6, m4           ; m6[0] = total sum
+  movd               raxd, m6           ; store sum as return value
+%endif
+  RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE  0 ; advance srcq by one source row
+%if ARCH_X86=1 && CONFIG_PIC=1
+  add                srcq, src_stridemp ; 32-bit PIC build: presumably the stride is read from its stack slot - confirm against x86inc
+%else
+  add                srcq, src_strideq  ; otherwise the stride is taken from its register
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%if cpuflag(ssse3)
+%define bilin_filter_m bilin_filter_m_ssse3
+%define filter_idx_shift 4
+%else
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+%endif
+; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
+; 11, not 13, if the registers are ordered correctly. May make a minor speed
+; difference on Win64
+
+%ifdef PIC    ; 64bit PIC
+  %if %2 == 1 ; avg
+    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+                                      x_offset, y_offset, \
+                                      dst, dst_stride, \
+                                      sec, sec_stride, height, sse
+    %define sec_str sec_strideq
+  %else
+    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+                                  y_offset, dst, dst_stride, height, sse
+  %endif
+  %define block_height heightd
+  %define bilin_filter sseq
+%else
+  %if ARCH_X86=1 && CONFIG_PIC=1
+    %if %2 == 1 ; avg
+      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                  x_offset, y_offset, \
+                                  dst, dst_stride, \
+                                  sec, sec_stride, \
+                                  height, sse, g_bilin_filter, g_pw_8
+      %define block_height dword heightm
+      %define sec_str sec_stridemp
+
+      ;Store bilin_filter and pw_8 location in stack
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %else
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+                                y_offset, dst, dst_stride, height, sse, \
+                                g_bilin_filter, g_pw_8
+      %define block_height heightd
+
+      ;Store bilin_filter and pw_8 location in stack
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %endif
+  %else
+    %if %2 == 1 ; avg
+      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+                                             x_offset, y_offset, \
+                                             dst, dst_stride, \
+                                             sec, sec_stride, \
+                                             height, sse
+      %if ARCH_X86_64
+      %define block_height heightd
+      %define sec_str sec_strideq
+      %else
+      %define block_height dword heightm
+      %define sec_str sec_stridemp
+      %endif
+    %else
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+                              y_offset, dst, dst_stride, height, sse
+      %define block_height heightd
+    %endif
+
+    %define bilin_filter bilin_filter_m
+  %endif
+%endif
+
+  ASSERT               %1 <= 16         ; m6 overflows if w > 16
+  pxor                 m6, m6           ; sum
+  pxor                 m7, m7           ; sse
+  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
+  ; could perhaps use it for something more productive then
+  pxor                 m5, m5           ; dedicated zero register
+%if %1 < 16
+  sar                   block_height, 1
+%if %2 == 1 ; avg
+  shl             sec_str, 1
+%endif
+%endif
+
+  ; FIXME(rbultje) replace by jumptable?
+  test          x_offsetd, x_offsetd
+  jnz .x_nonzero
+  ; x_offset == 0
+  test          y_offsetd, y_offsetd
+  jnz .x_zero_y_nonzero
+
+  ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  mova                 m1, [dstq]
+%if %2 == 1 ; avg
+  pavgb                m0, [secq]
+  punpckhbw            m3, m1, m5
+  punpcklbw            m1, m5
+%endif
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%if %2 == 0 ; !avg
+  punpckhbw            m3, m1, m5
+  punpcklbw            m1, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+  movhps               m0, [srcq+src_strideq]
+%else ; mmsize == 8
+  punpckldq            m0, [srcq+src_strideq]
+%endif
+%else ; !avg
+  movh                 m2, [srcq+src_strideq]
+%endif
+  movh                 m1, [dstq]
+  movh                 m3, [dstq+dst_strideq]
+%if %2 == 1 ; avg
+  pavgb                m0, [secq]
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else ; !avg
+  punpcklbw            m0, m5
+  punpcklbw            m2, m5
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   block_height
+  jg .x_zero_y_zero_loop
+  STORE_AND_RET
+
+.x_zero_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_zero_y_nonhalf
+
+  ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m4, [srcq+src_strideq]
+  mova                 m1, [dstq]
+  pavgb                m0, m4
+  punpckhbw            m3, m1, m5
+%if %2 == 1 ; avg
+  pavgb                m0, [secq]
+%endif
+  punpcklbw            m1, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+  movhps               m2, [srcq+src_strideq*2]
+%else ; mmsize == 8
+%if %1 == 4
+  movh                 m1, [srcq+src_strideq*2]
+  punpckldq            m2, m1
+%else
+  punpckldq            m2, [srcq+src_strideq*2]
+%endif
+%endif
+  movh                 m1, [dstq]
+%if mmsize == 16
+  movlhps              m0, m2
+%else ; mmsize == 8
+  punpckldq            m0, m2
+%endif
+  movh                 m3, [dstq+dst_strideq]
+  pavgb                m0, m2
+  punpcklbw            m1, m5
+  pavgb                m0, [secq]
+  punpcklbw            m3, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else ; !avg
+  movh                 m4, [srcq+src_strideq*2]
+  movh                 m1, [dstq]
+  pavgb                m0, m2
+  movh                 m3, [dstq+dst_strideq]
+  pavgb                m2, m4
+  punpcklbw            m0, m5
+  punpcklbw            m2, m5
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   block_height
+  jg .x_zero_y_half_loop
+  STORE_AND_RET
+
+.x_zero_y_nonhalf:
+  ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+  mova                 m9, [bilin_filter+y_offsetq+16]
+%endif
+  mova                m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m4, [srcq+src_strideq]
+  mova                 m1, [dstq]
+%if cpuflag(ssse3)
+  punpckhbw            m2, m0, m4
+  punpcklbw            m0, m4
+  pmaddubsw            m2, filter_y_a
+  pmaddubsw            m0, filter_y_a
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+%else
+  punpckhbw            m2, m0, m5
+  punpckhbw            m3, m4, m5
+  punpcklbw            m0, m5
+  punpcklbw            m4, m5
+  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+  ; slightly faster because of pmullw latency. It would also cut our rodata
+  ; tables in half for this function, and save 1-2 registers on x86-64.
+  pmullw               m2, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m2, filter_rnd
+  pmullw               m0, filter_y_a
+  pmullw               m4, filter_y_b
+  paddw                m0, filter_rnd
+  paddw                m2, m3
+  paddw                m0, m4
+%endif
+  psraw                m2, 4
+  psraw                m0, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpckhbw            m3, m1, m5
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m2, [srcq+src_strideq]
+  movh                 m4, [srcq+src_strideq*2]
+  movh                 m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+  movh                 m1, [dstq]
+  punpcklbw            m0, m2
+  punpcklbw            m2, m4
+  pmaddubsw            m0, filter_y_a
+  pmaddubsw            m2, filter_y_a
+  punpcklbw            m3, m5
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+%else
+  punpcklbw            m0, m5
+  punpcklbw            m2, m5
+  punpcklbw            m4, m5
+  pmullw               m0, filter_y_a
+  pmullw               m1, m2, filter_y_b
+  punpcklbw            m3, m5
+  paddw                m0, filter_rnd
+  pmullw               m2, filter_y_a
+  pmullw               m4, filter_y_b
+  paddw                m0, m1
+  paddw                m2, filter_rnd
+  movh                 m1, [dstq]
+  paddw                m2, m4
+%endif
+  psraw                m0, 4
+  psraw                m2, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   block_height
+  jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonzero:
+  cmp           x_offsetd, 8
+  jne .x_nonhalf
+  ; x_offset == 0.5
+  test          y_offsetd, y_offsetd
+  jnz .x_half_y_nonzero
+
+  ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m4, [srcq+1]
+  mova                 m1, [dstq]
+  pavgb                m0, m4
+  punpckhbw            m3, m1, m5
+%if %2 == 1 ; avg
+  pavgb                m0, [secq]
+%endif
+  punpcklbw            m1, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m4, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+  movhps               m0, [srcq+src_strideq]
+  movhps               m4, [srcq+src_strideq+1]
+%else ; mmsize == 8
+  punpckldq            m0, [srcq+src_strideq]
+  punpckldq            m4, [srcq+src_strideq+1]
+%endif
+  movh                 m1, [dstq]
+  movh                 m3, [dstq+dst_strideq]
+  pavgb                m0, m4
+  punpcklbw            m3, m5
+  pavgb                m0, [secq]
+  punpcklbw            m1, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else ; !avg
+  movh                 m2, [srcq+src_strideq]
+  movh                 m1, [dstq]
+  pavgb                m0, m4
+  movh                 m4, [srcq+src_strideq+1]
+  movh                 m3, [dstq+dst_strideq]
+  pavgb                m2, m4
+  punpcklbw            m0, m5
+  punpcklbw            m2, m5
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   block_height
+  jg .x_half_y_zero_loop
+  STORE_AND_RET
+
+.x_half_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_half_y_nonhalf
+
+  ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m3, [srcq+1]
+  add                srcq, src_strideq
+  pavgb                m0, m3
+.x_half_y_half_loop:
+  movu                 m4, [srcq]
+  movu                 m3, [srcq+1]
+  mova                 m1, [dstq]
+  pavgb                m4, m3
+  punpckhbw            m3, m1, m5
+  pavgb                m0, m4
+%if %2 == 1 ; avg
+  punpcklbw            m1, m5
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+  punpcklbw            m1, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m3, [srcq+1]
+  add                srcq, src_strideq
+  pavgb                m0, m3
+.x_half_y_half_loop:
+  movh                 m2, [srcq]
+  movh                 m3, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+  movhps               m2, [srcq+src_strideq]
+  movhps               m3, [srcq+src_strideq+1]
+%else
+%if %1 == 4
+  movh                 m1, [srcq+src_strideq]
+  punpckldq            m2, m1
+  movh                 m1, [srcq+src_strideq+1]
+  punpckldq            m3, m1
+%else
+  punpckldq            m2, [srcq+src_strideq]
+  punpckldq            m3, [srcq+src_strideq+1]
+%endif
+%endif
+  pavgb                m2, m3
+%if mmsize == 16
+  movlhps              m0, m2
+  movhlps              m4, m2
+%else ; mmsize == 8
+  punpckldq            m0, m2
+  pshufw               m4, m2, 0xe
+%endif
+  movh                 m1, [dstq]
+  pavgb                m0, m2
+  movh                 m3, [dstq+dst_strideq]
+  pavgb                m0, [secq]
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else ; !avg
+  movh                 m4, [srcq+src_strideq]
+  movh                 m1, [srcq+src_strideq+1]
+  pavgb                m2, m3
+  pavgb                m4, m1
+  pavgb                m0, m2
+  pavgb                m2, m4
+  movh                 m1, [dstq]
+  movh                 m3, [dstq+dst_strideq]
+  punpcklbw            m0, m5
+  punpcklbw            m2, m5
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   block_height
+  jg .x_half_y_half_loop
+  STORE_AND_RET
+
+.x_half_y_nonhalf:
+  ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+  mova                 m9, [bilin_filter+y_offsetq+16]
+%endif
+  mova                m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else  ;x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m3, [srcq+1]
+  add                srcq, src_strideq
+  pavgb                m0, m3
+.x_half_y_other_loop:
+  movu                 m4, [srcq]
+  movu                 m2, [srcq+1]
+  mova                 m1, [dstq]
+  pavgb                m4, m2
+%if cpuflag(ssse3)
+  punpckhbw            m2, m0, m4
+  punpcklbw            m0, m4
+  pmaddubsw            m2, filter_y_a
+  pmaddubsw            m0, filter_y_a
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+  psraw                m2, 4
+%else
+  punpckhbw            m2, m0, m5
+  punpckhbw            m3, m4, m5
+  pmullw               m2, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m2, filter_rnd
+  punpcklbw            m0, m5
+  paddw                m2, m3
+  punpcklbw            m3, m4, m5
+  pmullw               m0, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m0, filter_rnd
+  psraw                m2, 4
+  paddw                m0, m3
+%endif
+  punpckhbw            m3, m1, m5
+  psraw                m0, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m3, [srcq+1]
+  add                srcq, src_strideq
+  pavgb                m0, m3
+%if notcpuflag(ssse3)
+  punpcklbw            m0, m5
+%endif
+.x_half_y_other_loop:
+  movh                 m2, [srcq]
+  movh                 m1, [srcq+1]
+  movh                 m4, [srcq+src_strideq]
+  movh                 m3, [srcq+src_strideq+1]
+  pavgb                m2, m1
+  pavgb                m4, m3
+  movh                 m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+  movh                 m1, [dstq]
+  punpcklbw            m0, m2
+  punpcklbw            m2, m4
+  pmaddubsw            m0, filter_y_a
+  pmaddubsw            m2, filter_y_a
+  punpcklbw            m3, m5
+  paddw                m0, filter_rnd
+  paddw                m2, filter_rnd
+%else
+  punpcklbw            m2, m5
+  punpcklbw            m4, m5
+  pmullw               m0, filter_y_a
+  pmullw               m1, m2, filter_y_b
+  punpcklbw            m3, m5
+  paddw                m0, filter_rnd
+  pmullw               m2, filter_y_a
+  paddw                m0, m1
+  pmullw               m1, m4, filter_y_b
+  paddw                m2, filter_rnd
+  paddw                m2, m1
+  movh                 m1, [dstq]
+%endif
+  psraw                m0, 4
+  psraw                m2, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   block_height
+  jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf:
+  test          y_offsetd, y_offsetd
+  jnz .x_nonhalf_y_nonzero
+
+  ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+  mova                 m9, [bilin_filter+x_offsetq+16]
+%endif
+  mova                m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+;y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m4, [srcq+1]
+  mova                 m1, [dstq]
+%if cpuflag(ssse3)
+  punpckhbw            m2, m0, m4
+  punpcklbw            m0, m4
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m0, filter_x_a
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+%else
+  punpckhbw            m2, m0, m5
+  punpckhbw            m3, m4, m5
+  punpcklbw            m0, m5
+  punpcklbw            m4, m5
+  pmullw               m2, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m0, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m2, m3
+  paddw                m0, m4
+%endif
+  psraw                m2, 4
+  psraw                m0, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpckhbw            m3, m1, m5
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m1, [srcq+1]
+  movh                 m2, [srcq+src_strideq]
+  movh                 m4, [srcq+src_strideq+1]
+  movh                 m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+  punpcklbw            m0, m1
+  movh                 m1, [dstq]
+  punpcklbw            m2, m4
+  pmaddubsw            m0, filter_x_a
+  pmaddubsw            m2, filter_x_a
+  punpcklbw            m3, m5
+  paddw                m0, filter_rnd
+  paddw                m2, filter_rnd
+%else
+  punpcklbw            m0, m5
+  punpcklbw            m1, m5
+  punpcklbw            m2, m5
+  punpcklbw            m4, m5
+  pmullw               m0, filter_x_a
+  pmullw               m1, filter_x_b
+  punpcklbw            m3, m5
+  paddw                m0, filter_rnd
+  pmullw               m2, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m0, m1
+  paddw                m2, filter_rnd
+  movh                 m1, [dstq]
+  paddw                m2, m4
+%endif
+  psraw                m0, 4
+  psraw                m2, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   block_height
+  jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+  cmp           y_offsetd, 8
+  jne .x_nonhalf_y_nonhalf
+
+  ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+  mova                 m9, [bilin_filter+x_offsetq+16]
+%endif
+  mova                m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+1]
+%if cpuflag(ssse3)
+  punpckhbw            m2, m0, m1
+  punpcklbw            m0, m1
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m0, filter_x_a
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+%else
+  punpckhbw            m2, m0, m5
+  punpckhbw            m3, m1, m5
+  punpcklbw            m0, m5
+  punpcklbw            m1, m5
+  pmullw               m0, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m0, filter_rnd
+  pmullw               m2, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m2, filter_rnd
+  paddw                m0, m1
+  paddw                m2, m3
+%endif
+  psraw                m0, 4
+  psraw                m2, 4
+  add                srcq, src_strideq
+  packuswb             m0, m2
+.x_other_y_half_loop:
+  movu                 m4, [srcq]
+  movu                 m3, [srcq+1]
+%if cpuflag(ssse3)
+  mova                 m1, [dstq]
+  punpckhbw            m2, m4, m3
+  punpcklbw            m4, m3
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m4, filter_x_a
+  paddw                m2, filter_rnd
+  paddw                m4, filter_rnd
+  psraw                m2, 4
+  psraw                m4, 4
+  packuswb             m4, m2
+  pavgb                m0, m4
+  punpckhbw            m3, m1, m5
+  punpcklbw            m1, m5
+%else
+  punpckhbw            m2, m4, m5
+  punpckhbw            m1, m3, m5
+  punpcklbw            m4, m5
+  punpcklbw            m3, m5
+  pmullw               m4, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m4, filter_rnd
+  pmullw               m2, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m2, filter_rnd
+  paddw                m4, m3
+  paddw                m2, m1
+  mova                 m1, [dstq]
+  psraw                m4, 4
+  psraw                m2, 4
+  punpckhbw            m3, m1, m5
+  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
+  ; have a 1-register shortage to be able to store the backup of the bilin
+  ; filtered second line as words as cache for the next line. Packing into
+  ; a byte costs 1 pack and 2 unpacks, but saves a register.
+  packuswb             m4, m2
+  punpcklbw            m1, m5
+  pavgb                m0, m4
+%endif
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  pavgb                m0, [secq]
+%endif
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  add                srcq, src_strideq
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m1, [srcq+1]
+%if cpuflag(ssse3)
+  punpcklbw            m0, m1
+  pmaddubsw            m0, filter_x_a
+  paddw                m0, filter_rnd
+%else
+  punpcklbw            m0, m5
+  punpcklbw            m1, m5
+  pmullw               m0, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m0, m1
+%endif
+  add                srcq, src_strideq
+  psraw                m0, 4
+.x_other_y_half_loop:
+  movh                 m2, [srcq]
+  movh                 m1, [srcq+1]
+  movh                 m4, [srcq+src_strideq]
+  movh                 m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+  punpcklbw            m2, m1
+  punpcklbw            m4, m3
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m4, filter_x_a
+  movh                 m1, [dstq]
+  movh                 m3, [dstq+dst_strideq]
+  paddw                m2, filter_rnd
+  paddw                m4, filter_rnd
+%else
+  punpcklbw            m2, m5
+  punpcklbw            m1, m5
+  punpcklbw            m4, m5
+  punpcklbw            m3, m5
+  pmullw               m2, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m4, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m4, filter_rnd
+  paddw                m2, m1
+  movh                 m1, [dstq]
+  paddw                m4, m3
+  movh                 m3, [dstq+dst_strideq]
+%endif
+  psraw                m2, 4
+  psraw                m4, 4
+  pavgw                m0, m2
+  pavgw                m2, m4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline - also consider going to bytes here
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  lea                srcq, [srcq+src_strideq*2]
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   block_height
+  jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+  STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+%ifdef PIC
+  lea        bilin_filter, [bilin_filter_m]
+%endif
+  shl           x_offsetd, filter_idx_shift
+  shl           y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+  mova                 m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+  mova                 m9, [bilin_filter+x_offsetq+16]
+%endif
+  mova                m10, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+  mova                m11, [bilin_filter+y_offsetq+16]
+%endif
+  mova                m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else   ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case, there is NO unused register. Used src_stride register. Later,
+; src_stride has to be loaded from stack when it is needed.
+%define tempq src_strideq
+  mov tempq, g_bilin_filterm
+  add           x_offsetq, tempq
+  add           y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+  add           x_offsetq, bilin_filter
+  add           y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+  ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+1]
+%if cpuflag(ssse3)
+  punpckhbw            m2, m0, m1
+  punpcklbw            m0, m1
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m0, filter_x_a
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+%else
+  punpckhbw            m2, m0, m5
+  punpckhbw            m3, m1, m5
+  punpcklbw            m0, m5
+  punpcklbw            m1, m5
+  pmullw               m0, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m0, filter_rnd
+  pmullw               m2, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m2, filter_rnd
+  paddw                m0, m1
+  paddw                m2, m3
+%endif
+  psraw                m0, 4
+  psraw                m2, 4
+
+  INC_SRC_BY_SRC_STRIDE
+
+  packuswb             m0, m2
+.x_other_y_other_loop:
+%if cpuflag(ssse3)
+  movu                 m4, [srcq]
+  movu                 m3, [srcq+1]
+  mova                 m1, [dstq]
+  punpckhbw            m2, m4, m3
+  punpcklbw            m4, m3
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m4, filter_x_a
+  punpckhbw            m3, m1, m5
+  paddw                m2, filter_rnd
+  paddw                m4, filter_rnd
+  psraw                m2, 4
+  psraw                m4, 4
+  packuswb             m4, m2
+  punpckhbw            m2, m0, m4
+  punpcklbw            m0, m4
+  pmaddubsw            m2, filter_y_a
+  pmaddubsw            m0, filter_y_a
+  punpcklbw            m1, m5
+  paddw                m2, filter_rnd
+  paddw                m0, filter_rnd
+  psraw                m2, 4
+  psraw                m0, 4
+%else
+  movu                 m3, [srcq]
+  movu                 m4, [srcq+1]
+  punpckhbw            m1, m3, m5
+  punpckhbw            m2, m4, m5
+  punpcklbw            m3, m5
+  punpcklbw            m4, m5
+  pmullw               m3, filter_x_a
+  pmullw               m4, filter_x_b
+  paddw                m3, filter_rnd
+  pmullw               m1, filter_x_a
+  pmullw               m2, filter_x_b
+  paddw                m1, filter_rnd
+  paddw                m3, m4
+  paddw                m1, m2
+  psraw                m3, 4
+  psraw                m1, 4
+  packuswb             m4, m3, m1
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+  pmullw               m2, filter_y_a
+  pmullw               m1, filter_y_b
+  paddw                m2, filter_rnd
+  pmullw               m0, filter_y_a
+  pmullw               m3, filter_y_b
+  paddw                m2, m1
+  mova                 m1, [dstq]
+  paddw                m0, filter_rnd
+  psraw                m2, 4
+  paddw                m0, m3
+  punpckhbw            m3, m1, m5
+  psraw                m0, 4
+  punpcklbw            m1, m5
+%endif
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  INC_SRC_BY_SRC_STRIDE
+  add                dstq, dst_strideq
+%else ; %1 < 16
+  movh                 m0, [srcq]
+  movh                 m1, [srcq+1]
+%if cpuflag(ssse3)
+  punpcklbw            m0, m1
+  pmaddubsw            m0, filter_x_a
+  paddw                m0, filter_rnd
+%else
+  punpcklbw            m0, m5
+  punpcklbw            m1, m5
+  pmullw               m0, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m0, filter_rnd
+  paddw                m0, m1
+%endif
+  psraw                m0, 4
+%if cpuflag(ssse3)
+  packuswb             m0, m0
+%endif
+
+  INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+  movh                 m2, [srcq]
+  movh                 m1, [srcq+1]
+
+  INC_SRC_BY_SRC_STRIDE
+  movh                 m4, [srcq]
+  movh                 m3, [srcq+1]
+
+%if cpuflag(ssse3)
+  punpcklbw            m2, m1
+  punpcklbw            m4, m3
+  pmaddubsw            m2, filter_x_a
+  pmaddubsw            m4, filter_x_a
+  movh                 m3, [dstq+dst_strideq]
+  movh                 m1, [dstq]
+  paddw                m2, filter_rnd
+  paddw                m4, filter_rnd
+  psraw                m2, 4
+  psraw                m4, 4
+  packuswb             m2, m2
+  packuswb             m4, m4
+  punpcklbw            m0, m2
+  punpcklbw            m2, m4
+  pmaddubsw            m0, filter_y_a
+  pmaddubsw            m2, filter_y_a
+  punpcklbw            m3, m5
+  paddw                m0, filter_rnd
+  paddw                m2, filter_rnd
+  psraw                m0, 4
+  psraw                m2, 4
+  punpcklbw            m1, m5
+%else
+  punpcklbw            m2, m5
+  punpcklbw            m1, m5
+  punpcklbw            m4, m5
+  punpcklbw            m3, m5
+  pmullw               m2, filter_x_a
+  pmullw               m1, filter_x_b
+  paddw                m2, filter_rnd
+  pmullw               m4, filter_x_a
+  pmullw               m3, filter_x_b
+  paddw                m4, filter_rnd
+  paddw                m2, m1
+  paddw                m4, m3
+  psraw                m2, 4
+  psraw                m4, 4
+  pmullw               m0, filter_y_a
+  pmullw               m3, m2, filter_y_b
+  paddw                m0, filter_rnd
+  pmullw               m2, filter_y_a
+  pmullw               m1, m4, filter_y_b
+  paddw                m2, filter_rnd
+  paddw                m0, m3
+  movh                 m3, [dstq+dst_strideq]
+  paddw                m2, m1
+  movh                 m1, [dstq]
+  psraw                m0, 4
+  psraw                m2, 4
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+%endif
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
+  SUM_SSE              m0, m1, m2, m3, m6, m7
+  mova                 m0, m4
+
+  INC_SRC_BY_SRC_STRIDE
+  lea                dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   block_height
+  jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+  STORE_AND_RET
+%endmacro
+
+; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
+; between the ssse3 and non-ssse3 version. It may make sense to merge their
+; code in the sense that the ssse3 version would jump to the appropriate
+; location in the sse/2 version, rather than duplicating that code in the
+; binary.
+; Instantiations: first argument 4 under INIT_MMX, 8 and 16 under INIT_XMM.
+INIT_MMX sse
+SUBPEL_VARIANCE  4
+INIT_XMM sse2
+SUBPEL_VARIANCE  8
+SUBPEL_VARIANCE 16
+
+INIT_MMX ssse3
+SUBPEL_VARIANCE  4
+INIT_XMM ssse3
+SUBPEL_VARIANCE  8
+SUBPEL_VARIANCE 16
+; Same set again with second argument 1 (the "%if %2 == 1 ; avg" paths).
+INIT_MMX sse
+SUBPEL_VARIANCE  4, 1
+INIT_XMM sse2
+SUBPEL_VARIANCE  8, 1
+SUBPEL_VARIANCE 16, 1
+
+INIT_MMX ssse3
+SUBPEL_VARIANCE  4, 1
+INIT_XMM ssse3
+SUBPEL_VARIANCE  8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/libs/libvpx/vpx_dsp/x86/subtract_sse2.asm b/libs/libvpx/vpx_dsp/x86/subtract_sse2.asm
new file mode 100644
index 0000000000..4273efb854
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/subtract_sse2.asm
@@ -0,0 +1,145 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void vpx_subtract_block(int rows, int cols,
+;                         int16_t *diff, ptrdiff_t diff_stride,
+;                         const uint8_t *src, ptrdiff_t src_stride,
+;                         const uint8_t *pred, ptrdiff_t pred_stride)
+;
+; Stores diff = src - pred as 16-bit words: bytes are zero-extended against
+; the zero register m7 and subtracted with psubw. cols selects the loop
+; (4, 8, 16 or 32); any other value falls through to the 64-wide loop.
+; Every loop body runs at least once, and the 4/8/16 loops consume two rows
+; per iteration, so rows is assumed positive (and even for those widths).
+
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+                        rows, cols, diff, diff_stride, src, src_stride, \
+                        pred, pred_stride
+; cols is only read by the dispatch below, so its register is reused for
+; pred_stride, which each case loads from pred_stridemp.
+%define pred_str colsq
+  pxor                  m7, m7         ; dedicated zero register
+  cmp                colsd, 4
+  je .case_4
+  cmp                colsd, 8
+  je .case_8
+  cmp                colsd, 16
+  je .case_16
+  cmp                colsd, 32
+  je .case_32
+
+; loop16 src_off0, src_off1, pred_off0, pred_off1, diff_off0, diff_off1
+; Subtracts two 16-pixel groups (mmsize == 16 here) and stores 32 words of
+; diff; all loads and stores use mova.
+%macro loop16 6
+  mova                  m0, [srcq+%1]
+  mova                  m4, [srcq+%2]
+  mova                  m1, [predq+%3]
+  mova                  m5, [predq+%4]
+  punpckhbw             m2, m0, m7
+  punpckhbw             m3, m1, m7
+  punpcklbw             m0, m7
+  punpcklbw             m1, m7
+  psubw                 m2, m3
+  psubw                 m0, m1
+  punpckhbw             m1, m4, m7
+  punpckhbw             m3, m5, m7
+  punpcklbw             m4, m7
+  punpcklbw             m5, m7
+  psubw                 m1, m3
+  psubw                 m4, m5
+  mova [diffq+mmsize*0+%5], m0
+  mova [diffq+mmsize*1+%5], m2
+  mova [diffq+mmsize*0+%6], m4
+  mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+  mov             pred_str, pred_stridemp
+.loop_64:
+  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+  lea                diffq, [diffq+diff_strideq*2]
+  add                predq, pred_str
+  add                 srcq, src_strideq
+  dec                rowsd
+  jg .loop_64
+  RET
+
+.case_32:
+  mov             pred_str, pred_stridemp
+.loop_32:
+  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+  lea                diffq, [diffq+diff_strideq*2]
+  add                predq, pred_str
+  add                 srcq, src_strideq
+  dec                rowsd
+  jg .loop_32
+  RET
+
+.case_16:
+  mov             pred_str, pred_stridemp
+  ; two rows per iteration: the second group of each pair is one stride down
+.loop_16:
+  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                predq, [predq+pred_str*2]
+  lea                 srcq, [srcq+src_strideq*2]
+  sub                rowsd, 2
+  jg .loop_16
+  RET
+
+; loop_h: two rows of mmsize/2 pixels each (movh loads, mova stores), i.e.
+; 8 pixels per row under INIT_XMM and 4 under INIT_MMX.
+%macro loop_h 0
+  movh                  m0, [srcq]
+  movh                  m2, [srcq+src_strideq]
+  movh                  m1, [predq]
+  movh                  m3, [predq+pred_str]
+  punpcklbw             m0, m7
+  punpcklbw             m1, m7
+  punpcklbw             m2, m7
+  punpcklbw             m3, m7
+  psubw                 m0, m1
+  psubw                 m2, m3
+  mova             [diffq], m0
+  mova [diffq+diff_strideq*2], m2
+%endmacro
+
+.case_8:
+  mov             pred_str, pred_stridemp
+.loop_8:
+  loop_h
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                 srcq, [srcq+src_strideq*2]
+  lea                predq, [predq+pred_str*2]
+  sub                rowsd, 2
+  jg .loop_8
+  RET
+
+INIT_MMX
+.case_4:
+  ; INIT_MMX remaps m7 to mm7, which the xmm7 pxor at function entry did not
+  ; clear, so zero it here before loop_h unpacks against it.
+  pxor                  m7, m7
+  mov             pred_str, pred_stridemp
+.loop_4:
+  loop_h
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                 srcq, [srcq+src_strideq*2]
+  lea                predq, [predq+pred_str*2]
+  sub                rowsd, 2
+  jg .loop_4
+  emms                                 ; leave MMX state before returning
+  RET
diff --git a/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h
new file mode 100644
index 0000000000..536b206876
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_TXFM_COMMON_SSE2_H_
+#define VPX_DSP_X86_TXFM_COMMON_SSE2_H_
+
+#include <emmintrin.h>
+#include "vpx/vpx_integer.h"
+
+// Builds an __m128i of eight int16_t lanes alternating a, b, a, b, ... with a
+// in the lowest lane (_mm_set_epi16 takes its arguments highest lane first).
+#define pair_set_epi16(a, b) \
+  _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+// Builds an __m128i whose four low int16_t lanes hold a and four high lanes b.
+#define dual_set_epi16(a, b) \
+  _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
+                (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
+
+// Builds an __m128i from eight int16_t values given lowest lane first (a..h).
+#define octa_set_epi16(a, b, c, d, e, f, g, h) \
+  _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
+                 (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+
+#endif  // VPX_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/libs/libvpx/vpx_dsp/x86/variance_avx2.c b/libs/libvpx/vpx_dsp/x86/variance_avx2.c
new file mode 100644
index 0000000000..7851a98b14
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/variance_avx2.c
@@ -0,0 +1,183 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_dsp_rtcd.h"
+
+typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride,
+                             unsigned int *sse, int *sum);  // both outputs
+
+void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum);
+
+// Sums var_fn's sse/sum outputs over a w x h area: one call per step of
+// block_size columns and 16 rows. *sse and *sum receive the totals.
+static void variance_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          int w, int h, unsigned int *sse, int *sum,
+                          get_var_avx2 var_fn, int block_size) {
+  unsigned int sse_total = 0, tile_sse;
+  int sum_total = 0, tile_sum, row, col;
+
+  for (row = 0; row < h; row += 16) {
+    for (col = 0; col < w; col += block_size) {
+      var_fn(src + src_stride * row + col, src_stride,
+             ref + ref_stride * row + col, ref_stride, &tile_sse, &tile_sum);
+      sse_total += tile_sse;
+      sum_total += tile_sum;
+    }
+  }
+  *sse = sse_total;
+  *sum = sum_total;
+}
+
+
+unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;  // written by variance_avx2 together with *sse
+  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
+                sse, &sum, vpx_get16x16var_avx2, 16);
+  return *sse - (((int64_t)sum * sum) >> 8);  // 64-bit square, as in siblings
+}
+
+unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  int ignored_sum;  // vpx_get16x16var_avx2 needs a sum slot; only *sse is used
+  vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &ignored_sum);
+  return *sse;
+}
+
+unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int total;  // written by variance_avx2 together with *sse
+  variance_avx2(src, src_stride, ref, ref_stride, /*w=*/32, /*h=*/16, sse,
+                &total, vpx_get32x32var_avx2, /*block_size=*/32);
+  return *sse - (((int64_t)total * total) >> 9);  // >> 9 divides by 32 * 16
+}
+
+unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int total;  // written by variance_avx2 together with *sse
+  variance_avx2(src, src_stride, ref, ref_stride, /*w=*/32, /*h=*/32, sse,
+                &total, vpx_get32x32var_avx2, /*block_size=*/32);
+  return *sse - (((int64_t)total * total) >> 10);  // >> 10 divides by 32 * 32
+}
+
+unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int total;  // written by variance_avx2 together with *sse
+  variance_avx2(src, src_stride, ref, ref_stride, /*w=*/64, /*h=*/64, sse,
+                &total, vpx_get32x32var_avx2, /*block_size=*/32);
+  return *sse - (((int64_t)total * total) >> 12);  // >> 12 divides by 64 * 64
+}
+
+unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int total;  // written by variance_avx2 together with *sse
+  variance_avx2(src, src_stride, ref, ref_stride, /*w=*/64, /*h=*/32, sse,
+                &total, vpx_get32x32var_avx2, /*block_size=*/32);
+  return *sse - (((int64_t)total * total) >> 11);  // >> 11 divides by 64 * 32
+}
+
+unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+                                             int x_offset, int y_offset,
+                                             const uint8_t *dst, int dst_stride,
+                                             int height,  // rows to process
+                                             unsigned int *sse);  // out: SSE
+
+unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+                                                 int src_stride,
+                                                 int x_offset,
+                                                 int y_offset,
+                                                 const uint8_t *dst,
+                                                 int dst_stride,
+                                                 const uint8_t *sec,
+                                                 int sec_stride,
+                                                 int height,  // rows to process
+                                                 unsigned int *sseptr);  // out: SSE
+
+unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  // Run the 32-wide kernel on the left and right halves and combine them.
+  unsigned int sse_left, sse_right;
+  int sum_left, sum_right, sum_total;
+  sum_left = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                             y_offset, dst, dst_stride, 64,
+                                             &sse_left);
+  sum_right = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset,
+                                              y_offset, dst + 32, dst_stride,
+                                              64, &sse_right);
+  sum_total = sum_left + sum_right;
+  *sse = sse_left + sse_right;
+  return *sse - (((int64_t)sum_total * sum_total) >> 12);  // 64 * 64 pixels
+}
+
+unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  // A single 32-wide kernel call covers the block and stores the SSE in *sse.
+  const int diff_sum = vpx_sub_pixel_variance32xh_avx2(
+      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
+  return *sse - (((int64_t)diff_sum * diff_sum) >> 10);  // 32 * 32 pixels
+}
+
+unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sse,
+                                                  const uint8_t *sec) {
+  // Left and right 32-wide halves are processed separately; the second
+  // predictor is read with a stride of 64 in both calls.
+  unsigned int sse_left, sse_right;
+  int sum_left, sum_right, sum_total;
+
+  sum_left = vpx_sub_pixel_avg_variance32xh_avx2(
+      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64,
+      &sse_left);
+  sum_right = vpx_sub_pixel_avg_variance32xh_avx2(
+      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,
+      sec + 32, 64, 64, &sse_right);
+  sum_total = sum_left + sum_right;
+  *sse = sse_left + sse_right;
+  return *sse - (((int64_t)sum_total * sum_total) >> 12);  // 64 * 64 pixels
+}
+
+unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sse,
+                                                  const uint8_t *sec) {
+  // A single 32-wide kernel call covers the block; sec is read at stride 32.
+  const int diff_sum = vpx_sub_pixel_avg_variance32xh_avx2(
+      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32,
+      sse);
+  return *sse - (((int64_t)diff_sum * diff_sum) >> 10);  // 32 * 32 pixels
+}
diff --git a/libs/libvpx/vpx_dsp/x86/variance_impl_avx2.c b/libs/libvpx/vpx_dsp/x86/variance_impl_avx2.c
new file mode 100644
index 0000000000..b289e9a0c7
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/variance_impl_avx2.c
@@ -0,0 +1,727 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,  // offset 0
+  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,  // offset 1
+  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,  // offset 2
+  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,  // offset 3
+  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,  // offset 4 (equal taps)
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,  // offset 5
+  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,  // offset 6
+  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,  // offset 7
+  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+};  // indexed as (offset << 5); only the first 256 bytes are non-zero
+
+
+void vpx_get16x16var_avx2(const unsigned char *src_ptr,
+                          int source_stride,  // bytes between source rows
+                          const unsigned char *ref_ptr,
+                          int recon_stride,  // bytes between reference rows
+                          unsigned int *SSE,  // out: sum of squared differences
+                          int *Sum) {  // out: signed sum of (src - ref)
+    __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
+    __m256i ref_expand_high, madd_low, madd_high;
+    int i, src_2strides, ref_2strides;  // signed: strides may be negative
+    __m256i zero_reg = _mm256_set1_epi16(0);
+    __m256i sum_ref_src = _mm256_set1_epi16(0);
+    __m256i madd_ref_src = _mm256_set1_epi16(0);
+
+    // two rows are packed into one 256 bit register per iteration, so the
+    // 16 rows take 8 iterations (half as many as the sse2 code)
+    src_2strides = source_stride * 2;
+    ref_2strides = recon_stride * 2;
+    for (i = 0; i < 8; i++) {
+        src = _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i const *) (src_ptr)));
+        src = _mm256_inserti128_si256(src,
+              _mm_loadu_si128((__m128i const *)(src_ptr+source_stride)), 1);
+
+        ref =_mm256_castsi128_si256(
+             _mm_loadu_si128((__m128i const *) (ref_ptr)));
+        ref = _mm256_inserti128_si256(ref,
+              _mm_loadu_si128((__m128i const *)(ref_ptr+recon_stride)), 1);
+
+        // zero-extend every byte to a 16 bit lane
+        src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
+        src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
+
+        ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
+        ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
+
+        // src-ref
+        src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
+        src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
+
+        // madd low (src - ref): squares the differences, adds adjacent pairs
+        madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
+
+        // add high to low (the squares of the low half were taken above)
+        src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
+
+        // madd high (src - ref)
+        madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
+
+        sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
+
+        // accumulate the squared differences in 32 bit lanes
+        madd_ref_src = _mm256_add_epi32(madd_ref_src,
+                       _mm256_add_epi32(madd_low, madd_high));
+
+        src_ptr+= src_2strides;
+        ref_ptr+= ref_2strides;
+    }
+
+    {
+        __m128i sum_res, madd_res;
+        __m128i expand_sum_low, expand_sum_high, expand_sum;
+        __m128i expand_madd_low, expand_madd_high, expand_madd;
+        __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
+
+        // extract the low lane and add it to the high lane
+        sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
+                                _mm256_extractf128_si256(sum_ref_src, 1));
+
+        madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
+                                 _mm256_extractf128_si256(madd_ref_src, 1));
+
+        // place each 16 bit sum in the upper half of a 32 bit lane
+        expand_sum_low = _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg),
+                                            sum_res);
+        expand_sum_high = _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg),
+                                             sum_res);
+
+        // arithmetic shift right by 16 sign-extends each sum to 32 bits
+        expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
+        expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
+
+        expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
+
+        // expand each 32 bits of the madd result to 64 bits
+        expand_madd_low = _mm_unpacklo_epi32(madd_res,
+                          _mm256_castsi256_si128(zero_reg));
+        expand_madd_high = _mm_unpackhi_epi32(madd_res,
+                           _mm256_castsi256_si128(zero_reg));
+
+        expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
+
+        ex_expand_sum_low = _mm_unpacklo_epi32(expand_sum,
+                            _mm256_castsi256_si128(zero_reg));
+        ex_expand_sum_high = _mm_unpackhi_epi32(expand_sum,
+                             _mm256_castsi256_si128(zero_reg));
+
+        ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
+
+        // shift right by 8 bytes to fold the upper half onto the lower half
+        madd_res = _mm_srli_si128(expand_madd, 8);
+        sum_res = _mm_srli_si128(ex_expand_sum, 8);
+
+        madd_res = _mm_add_epi32(madd_res, expand_madd);
+        sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
+
+        *((int*)SSE)= _mm_cvtsi128_si32(madd_res);
+
+        *((int*)Sum)= _mm_cvtsi128_si32(sum_res);
+    }
+}
+
+void vpx_get32x32var_avx2(const unsigned char *src_ptr,
+                          int source_stride,  // bytes between source rows
+                          const unsigned char *ref_ptr,
+                          int recon_stride,  // bytes between reference rows
+                          unsigned int *SSE,  // out: sum of squared differences
+                          int *Sum) {  // out: signed sum of (src - ref)
+    __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
+    __m256i ref_expand_high, madd_low, madd_high;
+    unsigned int i;
+    __m256i zero_reg = _mm256_set1_epi16(0);
+    __m256i sum_ref_src = _mm256_set1_epi16(0);
+    __m256i madd_ref_src = _mm256_set1_epi16(0);
+
+    // 32 pixels per row are processed in parallel; note that one call covers
+    for (i = 0; i < 16; i++) {  // 16 rows, not 32, despite the function name
+       src = _mm256_loadu_si256((__m256i const *) (src_ptr));
+
+       ref = _mm256_loadu_si256((__m256i const *) (ref_ptr));
+
+       // zero-extend every byte to a 16 bit lane
+       src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
+       src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
+
+       ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
+       ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
+
+       // src-ref
+       src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
+       src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
+
+       // madd low (src - ref): squares the differences, adds adjacent pairs
+       madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
+
+       // add high to low (the squares of the low half were taken above)
+       src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
+
+       // madd high (src - ref)
+       madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
+
+       sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
+
+       // accumulate the squared differences in 32 bit lanes
+       madd_ref_src = _mm256_add_epi32(madd_ref_src,
+                      _mm256_add_epi32(madd_low, madd_high));
+
+       src_ptr+= source_stride;
+       ref_ptr+= recon_stride;
+    }
+
+    {
+      __m256i expand_sum_low, expand_sum_high, expand_sum;
+      __m256i expand_madd_low, expand_madd_high, expand_madd;
+      __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
+
+      // place each 16 bit sum in the upper half of a 32 bit lane
+      expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
+      expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
+
+      // arithmetic shift right by 16 sign-extends each sum to 32 bits
+      expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
+      expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
+
+      expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
+
+      // expand each 32 bits of the madd result to 64 bits
+      expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
+      expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
+
+      expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
+
+      ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
+      ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
+
+      ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
+
+      // shift each 128 bit lane right by 8 bytes to fold its upper half down
+      madd_ref_src = _mm256_srli_si256(expand_madd, 8);
+      sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
+
+      madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
+      sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
+
+      // extract the low lane and the high lane and add the results
+      *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
+      _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
+
+      *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
+      _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
+    }
+}
+
+#define FILTER_SRC(filter) \
+  /* apply the 2-tap filter to the interleaved byte pairs in exp_src_* */ \
+  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+  \
+  /* add the rounding constant 8 (pw8 is a local of the calling scope) */ \
+  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+  \
+  /* divide source by 16, the sum of the two filter taps */ \
+  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define MERGE_WITH_SRC(src_reg, reg) /* interleave bytes of src_reg, reg */ \
+  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
+  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
+
+#define LOAD_SRC_DST \
+  /* unaligned 32 byte loads from the current src and dst pointers */ \
+  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
+  dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+
+#define AVG_NEXT_SRC(src_reg, size_stride) \
+  src_next_reg = _mm256_loadu_si256((__m256i const *) \
+                                   (src + size_stride)); \
+  /* rounded average of src_reg and the data size_stride bytes further on */ \
+  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC(src_reg, size_stride) /* pair src with src + stride */ \
+  src_next_reg = _mm256_loadu_si256((__m256i const *) \
+                                   (src + size_stride)); \
+  MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+  /* expand each byte to 2 bytes */ \
+  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+  /* source - dest */ \
+  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum (kept in 16 bit lanes of sum_reg) */ \
+  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
+  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
+  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+  /* calculate sse (kept in 32 bit lanes of sse_reg) */ \
+  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
+  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+// final reduction: stores the total SSE in *sse and the signed sum in 'sum'
+#define CALC_SUM_AND_SSE \
+  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); /* sign mask of sums */ \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); /* sign-extend */ \
+  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+  \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+  \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+  *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+                _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+
+unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
+                                             int src_stride,
+                                             int x_offset,  // filter index
+                                             int y_offset,  // filter index
+                                             const uint8_t *dst,
+                                             int dst_stride,
+                                             int height,  // rows to process
+                                             unsigned int *sse) {  // out: SSE
+  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+  __m256i zero_reg;
+  int i, sum;
+  sum_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_set1_epi16(0);
+  zero_reg = _mm256_set1_epi16(0);
+
+  // offsets index the 8-entry bilinear_filters_avx2 table (expected 0..7);
+  // offset 0 needs no filtering and offset 4 ({8, 8}) is a plain average
+  if (x_offset == 0) {
+    if (y_offset == 0) {
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 0 and y_offset = 4 (half-pel: average with the next row)
+    } else if (y_offset == 4) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, src_stride)
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 0 and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg;
+
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, src_stride)
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = 4 (half-pel: average with the next pixel) and y_offset = 0
+  } else if (x_offset == 4) {
+    if (y_offset == 0) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 4  and y_offset = 4
+    } else if (y_offset == 4) {
+      __m256i src_next_reg, src_avg;
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        // average between previous average to current average
+        src_avg = _mm256_avg_epu8(src_avg, src_reg);
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        // src_reg keeps this row's average for the next iteration
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    // x_offset = 4  and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg, src_avg;
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        // save current source average
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        MERGE_WITH_SRC(src_avg, src_reg)
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = bilin interpolation and y_offset = 0
+  } else {
+    if (y_offset == 0) {
+      __m256i filter, pw8, src_next_reg;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = 4
+    } else if (y_offset == 4) {
+      __m256i filter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+      FILTER_SRC(filter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average between previous pack to the current
+        src_pack = _mm256_avg_epu8(src_pack, src_reg);
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src_pack = src_reg;
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = bilin interpolation
+    } else {
+      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      xfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + x_offset));
+      y_offset <<= 5;
+      yfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+
+      FILTER_SRC(xfilter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(xfilter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // merge previous pack to current pack source
+        MERGE_WITH_SRC(src_pack, src_reg)
+        // filter the source
+        FILTER_SRC(yfilter)
+        src_pack = src_reg;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  }
+  CALC_SUM_AND_SSE
+  return sum;  // signed sum of differences; the SSE was stored through sse
+}
+
+unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+                                             int src_stride,
+                                             int x_offset,  // filter index
+                                             int y_offset,  // filter index
+                                             const uint8_t *dst,
+                                             int dst_stride,
+                                             const uint8_t *sec,  // 2nd pred
+                                             int sec_stride,
+                                             int height,  // rows to process
+                                             unsigned int *sse) {  // out: SSE
+  __m256i sec_reg;
+  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+  __m256i zero_reg;
+  int i, sum;
+  sum_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_set1_epi16(0);
+  zero_reg = _mm256_set1_epi16(0);
+
+  // offsets index the 8-entry bilinear_filters_avx2 table (expected 0..7);
+  if (x_offset == 0) {  // offset 0: no filter, offset 4: plain average
+    if (y_offset == 0) {
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    } else if (y_offset == 4) {  // half-pel: average with the next row
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, src_stride)
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 0 and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg;
+
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+                 (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, src_stride)
+        FILTER_SRC(filter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = 4 (half-pel: average with the next pixel) and y_offset = 0
+  } else if (x_offset == 4) {
+    if (y_offset == 0) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 4  and y_offset = 4
+    } else if (y_offset == 4) {
+      __m256i src_next_reg, src_avg;
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        // save current source average
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        // average between previous average to current average
+        src_avg = _mm256_avg_epu8(src_avg, src_reg);
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+        sec+= sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    // x_offset = 4  and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg, src_avg;
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        // save current source average
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        MERGE_WITH_SRC(src_avg, src_reg)
+        FILTER_SRC(filter)
+        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        sec+= sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = bilin interpolation and y_offset = 0
+  } else {
+    if (y_offset == 0) {
+      __m256i filter, pw8, src_next_reg;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        sec+= sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = 4
+    } else if (y_offset == 4) {
+      __m256i filter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+      FILTER_SRC(filter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average between previous pack to the current
+        src_pack = _mm256_avg_epu8(src_pack, src_reg);
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+        sec+= sec_stride;
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        src_pack = src_reg;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = bilin interpolation
+    } else {
+      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      xfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + x_offset));
+      y_offset <<= 5;
+      yfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+
+      FILTER_SRC(xfilter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(xfilter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // merge previous pack to current pack source
+        MERGE_WITH_SRC(src_pack, src_reg)
+        // filter the source
+        FILTER_SRC(yfilter)
+        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        src_pack = src_reg;
+        sec+= sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  }
+  CALC_SUM_AND_SSE
+  return sum;  // signed sum of differences; the SSE was stored through sse
+}
diff --git a/libs/libvpx/vpx_dsp/x86/variance_impl_mmx.asm b/libs/libvpx/vpx_dsp/x86/variance_impl_mmx.asm
new file mode 100644
index 0000000000..b8ba79b65e
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/variance_impl_mmx.asm
@@ -0,0 +1,744 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define mmx_filter_shift            7
+
+;unsigned int vpx_get_mb_ss_mmx( short *src_ptr ) ; returns the sum of squares of 256 int16 values
+global sym(vpx_get_mb_ss_mmx) PRIVATE
+sym(vpx_get_mb_ss_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 8                          ; 8 bytes of scratch to spill mm4
+    ; end prolog
+
+        mov         rax, arg(0) ;src_ptr
+        mov         rcx, 16                     ; 16 iterations of 16 shorts each
+        pxor        mm4, mm4                    ; mm4 = two dword partial sums, start at 0
+
+.NEXTROW:
+        movq        mm0, [rax]                  ; load 4 shorts per register (32 bytes total)
+        movq        mm1, [rax+8]
+        movq        mm2, [rax+16]
+        movq        mm3, [rax+24]
+        pmaddwd     mm0, mm0                    ; square each short, add adjacent pairs -> 2 dwords
+        pmaddwd     mm1, mm1
+        pmaddwd     mm2, mm2
+        pmaddwd     mm3, mm3
+
+        paddd       mm4, mm0                    ; accumulate the squared pairs
+        paddd       mm4, mm1
+        paddd       mm4, mm2
+        paddd       mm4, mm3
+
+        add         rax, 32                     ; advance to the next 16 shorts
+        dec         rcx
+        jnz         .NEXTROW                    ; dec does not write CF, so test ZF only (was: ja)
+        movq        QWORD PTR [rsp], mm4        ; spill both partial sums to the stack
+
+        ;return sum[0]+sum[1];
+        movsxd      rax, dword ptr [rsp]
+        movsxd      rcx, dword ptr [rsp+4]
+        add         rax, rcx
+
+    ; begin epilog
+    add rsp, 8
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_get8x8var_mmx  -- sum and sum of squares of (src - ref) over 8 rows of 8 bytes
+;(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *SSE,      ; out: sum of squared differences
+;    int *Sum                ; out: sum of differences
+;)
+global sym(vpx_get8x8var_mmx) PRIVATE
+sym(vpx_get8x8var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push rsi
+    push rdi
+    push rbx
+    sub         rsp, 16
+    ; end prolog
+
+        pxor        mm5, mm5                    ; mm5 = 0: word accumulator for the differences
+        pxor        mm6, mm6                    ; mm6 = 0: zero source for the byte->word unpacks
+        pxor        mm7, mm7                    ; mm7 = 0: dword accumulator for the squared differences
+
+        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
+        mov         rbx, arg(2) ;[ref_ptr]
+        movsxd      rcx, dword ptr arg(1) ;[source_stride]
+        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
+
+        ; Row 1
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Preload the next ref row into mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 2
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Preload the next ref row into mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 3
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Preload the next ref row into mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 4
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Preload the next ref row into mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 5
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Preload the next ref row into mm1
+        ;              movq        mm4, [rbx + rdx]
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 6
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Preload the next ref row into mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 7
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Preload the next ref row into mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 8
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data (not read again below)
+        add         rax,rcx                     ; Inc pointer into the new data (not read again below)
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Now accumulate the final results.
+        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
+        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
+        movsx       rdx, WORD PTR [rsp+8]       ; sign-extend the four 16-bit partial sums of mm5
+        movsx       rcx, WORD PTR [rsp+10]
+        movsx       rbx, WORD PTR [rsp+12]
+        movsx       rax, WORD PTR [rsp+14]
+        add         rdx, rcx
+        add         rbx, rax
+        add         rdx, rbx    ;XSum
+        movsxd      rax, DWORD PTR [rsp]        ; the two 32-bit partial sums of mm7
+        movsxd      rcx, DWORD PTR [rsp+4]
+        add         rax, rcx    ;XXSum
+        mov         rsi, arg(4) ;SSE
+        mov         rdi, arg(5) ;Sum
+        mov         dword ptr [rsi], eax        ; *SSE = XXSum
+        mov         dword ptr [rdi], edx        ; *Sum = XSum
+        xor         rax, rax    ; return 0
+
+    ; begin epilog
+    add rsp, 16
+    pop rbx
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void
+;vpx_get4x4var_mmx  -- sum and sum of squares of (src - ref) over 4 rows of 4 bytes
+;(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *SSE,      ; out: sum of squared differences
+;    int *Sum                ; out: sum of differences
+;)
+global sym(vpx_get4x4var_mmx) PRIVATE
+sym(vpx_get4x4var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push rsi
+    push rdi
+    push rbx
+    sub         rsp, 16
+    ; end prolog
+
+        pxor        mm5, mm5                    ; mm5 = 0: word accumulator for the differences
+        pxor        mm6, mm6                    ; mm6 = 0: zero source for the byte->word unpacks
+        pxor        mm7, mm7                    ; mm7 = 0: dword accumulator for the squared differences
+
+        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
+        mov         rbx, arg(2) ;[ref_ptr]
+        movsxd      rcx, dword ptr arg(1) ;[source_stride]
+        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
+
+        ; Row 1
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Preload the next ref row into mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 2
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Preload the next ref row into mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 3
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Preload the next ref row into mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 4
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Now accumulate the final results.
+        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
+        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
+        movsx       rdx, WORD PTR [rsp+8]       ; sign-extend the four 16-bit partial sums of mm5
+        movsx       rcx, WORD PTR [rsp+10]
+        movsx       rbx, WORD PTR [rsp+12]
+        movsx       rax, WORD PTR [rsp+14]
+        add         rdx, rcx
+        add         rbx, rax
+        add         rdx, rbx    ;XSum
+        movsxd      rax, DWORD PTR [rsp]        ; the two 32-bit partial sums of mm7
+        movsxd      rcx, DWORD PTR [rsp+4]
+        add         rax, rcx    ;XXSum
+        mov         rsi, arg(4) ;SSE
+        mov         rdi, arg(5) ;Sum
+        mov         dword ptr [rsi], eax        ; *SSE = XXSum
+        mov         dword ptr [rdi], edx        ; *Sum = XSum
+        xor         rax, rax    ; return 0
+
+    ; begin epilog
+    add rsp, 16
+    pop rbx
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_filter_block2d_bil4x4_var_mmx  -- 2-tap H+V filter of ref (4 wide, 4 rows), then sum/sum-of-squares of (filtered - src)
+;(
+;    unsigned char *ref_ptr,            ; rows that get filtered (reads 5 rows, bytes [0..4] of each)
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,            ; rows subtracted from the filtered result
+;    int src_pixels_per_line,
+;    unsigned short *HFilter,           ; two taps, 4 words each, at [HFilter] and [HFilter+8]
+;    unsigned short *VFilter,           ; two taps, 4 words each, at [VFilter] and [VFilter+8]
+;    int *sum,                          ; out: sum of differences
+;    unsigned int *sumsquared           ; out: sum of squared differences
+;)
+global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
+sym(vpx_filter_block2d_bil4x4_var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        pxor            mm6,            mm6                 ; mm6 = 0: word accumulator for the differences
+        pxor            mm7,            mm7                 ; mm7 = 0: dword accumulator for the squared differences
+
+        mov             rax,            arg(4) ;HFilter
+        mov             rdx,            arg(5) ;VFilter
+
+        mov             rsi,            arg(0) ;ref_ptr
+        mov             rdi,            arg(2) ;src_ptr
+
+        mov             rcx,            4                   ; loop counter: 4 output rows
+        pxor            mm0,            mm0                 ; mm0 = 0: zero source for the byte->word unpacks
+
+        movd            mm1,            [rsi]               ; first ref row, pixels 0..3
+        movd            mm3,            [rsi+1]             ; first ref row, pixels 1..4
+
+        punpcklbw       mm1,            mm0                 ; bytes -> words
+        pmullw          mm1,            [rax]               ; * first horizontal tap
+
+        punpcklbw       mm3,            mm0                 ; bytes -> words
+        pmullw          mm3,            [rax+8]             ; * second horizontal tap
+
+        paddw           mm1,            mm3                 ; sum of both taps
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add rounding term
+
+        psraw           mm1,            mmx_filter_shift    ; scale back down
+        movq            mm5,            mm1                 ; mm5 = horizontally filtered previous row
+
+%if ABI_IS_32BIT
+        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
+%else
+        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
+        add             rsi, r8
+%endif
+
+.filter_block2d_bil4x4_var_mmx_loop:
+
+        movd            mm1,            [rsi]               ; current ref row, pixels 0..3
+        movd            mm3,            [rsi+1]             ; current ref row, pixels 1..4
+
+        punpcklbw       mm1,            mm0                 ; bytes -> words
+        pmullw          mm1,            [rax]               ; * first horizontal tap
+
+        punpcklbw       mm3,            mm0                 ; bytes -> words
+        pmullw          mm3,            [rax+8]             ; * second horizontal tap
+
+        paddw           mm1,            mm3                 ; sum of both taps
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add rounding term
+
+        psraw           mm1,            mmx_filter_shift    ; mm1 = horizontally filtered current row
+        movq            mm3,            mm5                 ; mm3 = horizontally filtered previous row
+
+        movq            mm5,            mm1                 ; keep current row for the next iteration
+        pmullw          mm3,            [rdx]               ; previous row * first vertical tap
+
+        pmullw          mm1,            [rdx+8]             ; current row * second vertical tap
+        paddw           mm1,            mm3                 ; sum of both taps
+
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add rounding term
+        psraw           mm1,            mmx_filter_shift    ; mm1 = fully filtered row
+
+        movd            mm3,            [rdi]               ; four src bytes
+        punpcklbw       mm3,            mm0                 ; bytes -> words
+
+        psubw           mm1,            mm3                 ; difference = filtered - src
+        paddw           mm6,            mm1                 ; accumulate differences in mm6
+
+        pmaddwd         mm1,            mm1                 ; square, adding adjacent pairs into dwords
+        paddd           mm7,            mm1                 ; accumulate squared differences in mm7
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+        add             rsi,            r8
+        add             rdi,            r9
+%endif
+        sub             rcx,            1                   ; one row done
+        jnz             .filter_block2d_bil4x4_var_mmx_loop       ; loop until all 4 rows are processed
+
+        pxor            mm3,            mm3                 ; clear mm3 and mm2 so the unpacks
+        pxor            mm2,            mm2                 ; leave zeros in the low half of each dword
+
+        punpcklwd       mm2,            mm6                 ; low two sums -> upper 16 bits of each dword
+        punpckhwd       mm3,            mm6                 ; high two sums -> upper 16 bits of each dword
+
+        paddd           mm2,            mm3                 ; add the two pairs
+        movq            mm6,            mm2                 ; copy to fold the high dword down
+
+        psrlq           mm6,            32                  ; high dword -> low dword
+        paddd           mm2,            mm6                 ; low dword = total of all four sums (<< 16)
+
+        psrad           mm2,            16                  ; arithmetic shift restores the signed sum
+        movq            mm4,            mm7                 ; copy the squared-difference accumulator
+
+        psrlq           mm4,            32                  ; high dword -> low dword
+        paddd           mm4,            mm7                 ; low dword = total of both partial sums
+
+        mov             rdi,            arg(6) ;sum
+        mov             rsi,            arg(7) ;sumsquared
+
+        movd            dword ptr [rdi],          mm2                 ; *sum = sum of differences
+        movd            dword ptr [rsi],          mm4                 ; *sumsquared = sum of squared differences
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_filter_block2d_bil_var_mmx  -- 2-tap H+V filter of ref (8 wide, Height rows), then sum/sum-of-squares of (filtered - src)
+;(
+;    unsigned char *ref_ptr,            ; rows that get filtered (reads Height+1 rows, bytes [0..8] of each)
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,            ; rows subtracted from the filtered result
+;    int src_pixels_per_line,
+;    unsigned int Height,               ; number of output rows (loop count)
+;    unsigned short *HFilter,           ; two taps, 4 words each, at [HFilter] and [HFilter+8]
+;    unsigned short *VFilter,           ; two taps, 4 words each, at [VFilter] and [VFilter+8]
+;    int *sum,                          ; out: sum of differences
+;    unsigned int *sumsquared           ; out: sum of squared differences
+;)
+global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
+sym(vpx_filter_block2d_bil_var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        pxor            mm6,            mm6                 ; mm6 = 0: word accumulator for the differences
+        pxor            mm7,            mm7                 ; mm7 = 0: dword accumulator for the squared differences
+        mov             rax,            arg(5) ;HFilter
+
+        mov             rdx,            arg(6) ;VFilter
+        mov             rsi,            arg(0) ;ref_ptr
+
+        mov             rdi,            arg(2) ;src_ptr
+        movsxd          rcx,            dword ptr arg(4) ;Height (loop counter)
+
+        pxor            mm0,            mm0                 ; mm0 = 0: zero source for the byte->word unpacks
+        movq            mm1,            [rsi]               ; first ref row, pixels 0..7
+
+        movq            mm3,            [rsi+1]             ; first ref row, pixels 1..8
+        movq            mm2,            mm1                 ; copy for the high half
+
+        movq            mm4,            mm3                 ; copy for the high half
+        punpcklbw       mm1,            mm0                 ; pixels 0..3 -> words
+
+        punpckhbw       mm2,            mm0                 ; pixels 4..7 -> words
+        pmullw          mm1,            [rax]               ; low half * first horizontal tap
+
+        pmullw          mm2,            [rax]               ; high half * first horizontal tap
+        punpcklbw       mm3,            mm0                 ; pixels 1..4 -> words
+
+        punpckhbw       mm4,            mm0                 ; pixels 5..8 -> words
+        pmullw          mm3,            [rax+8]             ; low half * second horizontal tap
+
+        pmullw          mm4,            [rax+8]             ; high half * second horizontal tap
+        paddw           mm1,            mm3                 ; low half: sum of both taps
+
+        paddw           mm2,            mm4                 ; high half: sum of both taps
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add rounding term
+
+        psraw           mm1,            mmx_filter_shift    ; scale back down
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ; add rounding term
+
+        psraw           mm2,            mmx_filter_shift    ; scale back down
+        movq            mm5,            mm1
+
+        packuswb        mm5,            mm2                 ; mm5 = horizontally filtered previous row, packed to bytes
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
+        add             rsi,            r8
+%endif
+
+.filter_block2d_bil_var_mmx_loop:
+
+        movq            mm1,            [rsi]               ; current ref row, pixels 0..7
+        movq            mm3,            [rsi+1]             ; current ref row, pixels 1..8
+
+        movq            mm2,            mm1                 ; copy for the high half
+        movq            mm4,            mm3                 ; copy for the high half
+
+        punpcklbw       mm1,            mm0                 ; pixels 0..3 -> words
+        punpckhbw       mm2,            mm0                 ; pixels 4..7 -> words
+
+        pmullw          mm1,            [rax]               ; low half * first horizontal tap
+        pmullw          mm2,            [rax]               ; high half * first horizontal tap
+
+        punpcklbw       mm3,            mm0                 ; pixels 1..4 -> words
+        punpckhbw       mm4,            mm0                 ; pixels 5..8 -> words
+
+        pmullw          mm3,            [rax+8]             ; low half * second horizontal tap
+        pmullw          mm4,            [rax+8]             ; high half * second horizontal tap
+
+        paddw           mm1,            mm3                 ; low half: sum of both taps
+        paddw           mm2,            mm4                 ; high half: sum of both taps
+
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add rounding term
+        psraw           mm1,            mmx_filter_shift    ; mm1 = horizontally filtered current row, low half
+
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ; add rounding term
+        psraw           mm2,            mmx_filter_shift    ; mm2 = horizontally filtered current row, high half
+
+        movq            mm3,            mm5                 ; previous filtered row (packed bytes)
+        movq            mm4,            mm5                 ; second copy for the high half
+
+        punpcklbw       mm3,            mm0                 ; previous row, low half -> words
+        punpckhbw       mm4,            mm0                 ; previous row, high half -> words
+
+        movq            mm5,            mm1                 ; keep the current row
+        packuswb        mm5,            mm2                 ; packed to bytes, for the next iteration
+
+        pmullw          mm3,            [rdx]               ; previous row low * first vertical tap
+        pmullw          mm4,            [rdx]               ; previous row high * first vertical tap
+
+        pmullw          mm1,            [rdx+8]             ; current row low * second vertical tap
+        pmullw          mm2,            [rdx+8]             ; current row high * second vertical tap
+
+        paddw           mm1,            mm3                 ; low half: sum of both taps
+        paddw           mm2,            mm4                 ; high half: sum of both taps
+
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add rounding term
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ; add rounding term
+
+        psraw           mm1,            mmx_filter_shift    ; mm1 = fully filtered row, low half
+        psraw           mm2,            mmx_filter_shift    ; mm2 = fully filtered row, high half
+
+        movq            mm3,            [rdi]               ; eight src bytes
+        movq            mm4,            mm3                 ; copy for the high half
+
+        punpcklbw       mm3,            mm0                 ; src pixels 0..3 -> words
+        punpckhbw       mm4,            mm0                 ; src pixels 4..7 -> words
+
+        psubw           mm1,            mm3                 ; low half difference = filtered - src
+        psubw           mm2,            mm4                 ; high half difference = filtered - src
+
+        paddw           mm6,            mm1                 ; accumulate low-half differences in mm6
+        pmaddwd         mm1,            mm1                 ; square, adding adjacent pairs into dwords
+
+        paddw           mm6,            mm2                 ; accumulate high-half differences in mm6
+        pmaddwd         mm2,            mm2                 ; square, adding adjacent pairs into dwords
+
+        paddd           mm7,            mm1                 ; accumulate squared differences in mm7
+        paddd           mm7,            mm2                 ; accumulate squared differences in mm7
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
+        add             rsi,            r8
+        add             rdi,            r9
+%endif
+        sub             rcx,            1                   ; one row done
+        jnz             .filter_block2d_bil_var_mmx_loop       ; loop until Height rows are processed
+
+        pxor            mm3,            mm3                 ; clear mm3 and mm2 so the unpacks
+        pxor            mm2,            mm2                 ; leave zeros in the low half of each dword
+
+        punpcklwd       mm2,            mm6                 ; low two sums -> upper 16 bits of each dword
+        punpckhwd       mm3,            mm6                 ; high two sums -> upper 16 bits of each dword
+
+        paddd           mm2,            mm3                 ; add the two pairs
+        movq            mm6,            mm2                 ; copy to fold the high dword down
+
+        psrlq           mm6,            32                  ; high dword -> low dword
+        paddd           mm2,            mm6                 ; low dword = total of all four sums (<< 16)
+
+        psrad           mm2,            16                  ; arithmetic shift restores the signed sum
+        movq            mm4,            mm7                 ; copy the squared-difference accumulator
+
+        psrlq           mm4,            32                  ; high dword -> low dword
+        paddd           mm4,            mm7                 ; low dword = total of both partial sums
+
+        mov             rdi,            arg(7) ;sum
+        mov             rsi,            arg(8) ;sumsquared
+
+        movd            dword ptr [rdi],          mm2                 ; *sum = sum of differences
+        movd            dword ptr [rsi],          mm4                 ; *sumsquared = sum of squared differences
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};  rounding term (half of 1 << mmx_filter_shift) added before each psraw
+align 16
+mmx_bi_rd:
+    times 4 dw 64
diff --git a/libs/libvpx/vpx_dsp/x86/variance_mmx.c b/libs/libvpx/vpx_dsp/x86/variance_mmx.c
new file mode 100644
index 0000000000..f04f4e2c8f
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/variance_mmx.c
@@ -0,0 +1,249 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+/* Indexed by xoffset/yoffset below; row k holds (128 - 16k) x4 then (16k) x4. */
+DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
+  { 128, 128, 128, 128,   0,   0,   0,   0 },  /* k = 0: second weight is 0 */
+  { 112, 112, 112, 112,  16,  16,  16,  16 },
+  {  96,  96,  96,  96,  32,  32,  32,  32 },
+  {  80,  80,  80,  80,  48,  48,  48,  48 },
+  {  64,  64,  64,  64,  64,  64,  64,  64 },  /* k = 4: equal weights */
+  {  48,  48,  48,  48,  80,  80,  80,  80 },
+  {  32,  32,  32,  32,  96,  96,  96,  96 },
+  {  16,  16,  16,  16, 112, 112, 112, 112 }   /* each row's pair sums to 128 */
+};
+/* Defined outside this file; callers below read *sse and *sum after the call. */
+extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
+                              const uint8_t *b, int b_stride,
+                              unsigned int *sse, int *sum);
+/* Callers below pass rows of bilinear_filters_mmx as HFilter and VFilter. */
+extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
+                                              int ref_pixels_per_line,
+                                              const unsigned char *src_ptr,
+                                              int src_pixels_per_line,
+                                              const int16_t *HFilter,
+                                              const int16_t *VFilter,
+                                              int *sum,
+                                              unsigned int *sumsquared);
+/* Same outputs with an explicit Height; callers below pass 8 or 16. */
+extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
+                                           int ref_pixels_per_line,
+                                           const unsigned char *src_ptr,
+                                           int src_pixels_per_line,
+                                           unsigned int Height,
+                                           const int16_t *HFilter,
+                                           const int16_t *VFilter,
+                                           int *sum,
+                                           unsigned int *sumsquared);
+
+
+/* 4x4 variance: stores the raw SSE in *sse and returns SSE - sum^2 / 16. */
+unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
+                                 const unsigned char *b, int b_stride,
+                                 unsigned int *sse) {
+    int diff_sum;
+    unsigned int sq_err;
+    vpx_get4x4var_mmx(a, a_stride, b, b_stride, &sq_err, &diff_sum);
+    *sse = sq_err;
+    return sq_err - (((unsigned int)diff_sum * diff_sum) >> 4);
+}
+
+/* 8x8 variance: stores the raw SSE in *sse and returns SSE - sum^2 / 64. */
+unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
+                                 const unsigned char *b, int b_stride,
+                                 unsigned int *sse) {
+    int diff_sum;
+    unsigned int sq_err, mean_sq;
+    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sq_err, &diff_sum);
+    mean_sq = ((unsigned int)diff_sum * diff_sum) >> 6;
+    *sse = sq_err;
+    return sq_err - mean_sq;
+}
+
+/* 16x16 SSE: adds the SSE of the four 8x8 quadrants; their sums are unused. */
+unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
+                              const unsigned char *b, int b_stride,
+                              unsigned int *sse) {
+    unsigned int quad_sse[4];
+    int quad_sum[4], q;
+    for (q = 0; q < 4; ++q) {
+        /* q & 1 selects the right half, q >> 1 the bottom half. */
+        const int col = (q & 1) * 8;
+        const int row = (q >> 1) * 8;
+        vpx_get8x8var_mmx(a + row * a_stride + col, a_stride,
+                          b + row * b_stride + col, b_stride,
+                          &quad_sse[q], &quad_sum[q]);
+    }
+    *sse = quad_sse[0] + quad_sse[1] + quad_sse[2] + quad_sse[3];
+    return *sse;
+}
+
+/* 16x16 variance from four 8x8 quadrants: returns SSE - sum^2 / 256. */
+unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
+                                   const unsigned char *b, int b_stride,
+                                   unsigned int *sse) {
+    unsigned int total_sse = 0, part_sse;
+    int total_sum = 0, part_sum, q;
+    for (q = 0; q < 4; ++q) {
+        const int col = (q & 1) * 8;  /* right half when q is odd */
+        const int row = (q >> 1) * 8; /* bottom half for q = 2, 3 */
+        vpx_get8x8var_mmx(a + row * a_stride + col, a_stride,
+                          b + row * b_stride + col, b_stride,
+                          &part_sse, &part_sum);
+        total_sse += part_sse;
+        total_sum += part_sum;
+    }
+    *sse = total_sse;
+    return total_sse - (((unsigned int)total_sum * total_sum) >> 8);
+}
+
+/* 16x8 variance from two side-by-side 8x8 blocks: SSE - sum^2 / 128. */
+unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
+                                  const unsigned char *b, int b_stride,
+                                  unsigned int *sse) {
+    unsigned int left_sse, right_sse;
+    int left_sum, right_sum, total;
+
+    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &left_sse, &left_sum);
+    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride,
+                      &right_sse, &right_sum);
+    total = left_sum + right_sum;
+    *sse = left_sse + right_sse;
+    return *sse - (((unsigned int)total * total) >> 7);
+}
+
+/* 8x16 variance from two stacked 8x8 blocks: returns SSE - sum^2 / 128. */
+unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
+                                  const unsigned char *b, int b_stride,
+                                  unsigned int *sse) {
+    const unsigned char *const a_low = a + 8 * a_stride;
+    const unsigned char *const b_low = b + 8 * b_stride;
+    unsigned int top_sse, low_sse;
+    int top_sum, low_sum, total;
+
+    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &top_sse, &top_sum);
+    vpx_get8x8var_mmx(a_low, a_stride, b_low, b_stride, &low_sse, &low_sum);
+    total = top_sum + low_sum;
+    *sse = top_sse + low_sse;
+
+    return *sse - (((unsigned int)total * total) >> 7);
+}
+
+/* 4x4 sub-pixel variance: stores the SSE in *sse, returns SSE - sum^2 / 16. */
+uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
+    int xoffset, int yoffset, const uint8_t *b, int b_stride,
+    uint32_t *sse) {
+    const int16_t *const h_taps = bilinear_filters_mmx[xoffset];
+    const int16_t *const v_taps = bilinear_filters_mmx[yoffset];
+    int diff_sum;
+    unsigned int sq_err;
+    vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride,
+                                      h_taps, v_taps, &diff_sum, &sq_err);
+    *sse = sq_err;
+    return sq_err - (((unsigned int)diff_sum * diff_sum) >> 4);
+}
+
+
+/* 8x8 sub-pixel variance: stores the SSE in *sse, returns SSE - sum^2 / 64. */
+uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride,
+    int xoffset, int yoffset, const uint8_t *b, int b_stride,
+    uint32_t *sse) {
+    const int16_t *const h_taps = bilinear_filters_mmx[xoffset];
+    const int16_t *const v_taps = bilinear_filters_mmx[yoffset];
+    int diff_sum;
+    uint32_t sq_err;
+    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
+                                   h_taps, v_taps, &diff_sum, &sq_err);
+    *sse = sq_err;
+    return sq_err - (((uint32_t)diff_sum * diff_sum) >> 6);
+}
+
+/* 16x16 sub-pixel variance built from two 8-wide, 16-row halves:
+ * stores the combined SSE in *sse and returns SSE - sum^2 / 256. */
+uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
+                                         int xoffset, int yoffset,
+                                         const uint8_t *b, int b_stride,
+                                         uint32_t *sse) {
+    const int16_t *const h_taps = bilinear_filters_mmx[xoffset];
+    const int16_t *const v_taps = bilinear_filters_mmx[yoffset];
+    unsigned int left_sse, right_sse, total_sse;
+    int left_sum, right_sum, total_sum;
+
+    /* Left half first, then the half starting 8 pixels to the right. */
+    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
+                                   h_taps, v_taps, &left_sum, &left_sse);
+    vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16,
+                                   h_taps, v_taps, &right_sum, &right_sse);
+
+    total_sum = left_sum + right_sum;
+    total_sse = left_sse + right_sse;
+
+    *sse = total_sse;
+    return total_sse - (((uint32_t)total_sum * total_sum) >> 8);
+}
+
+/* 16x8 sub-pixel variance built from two 8-wide, 8-row halves:
+ * stores the combined SSE in *sse and returns SSE - sum^2 / 128. */
+uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride,
+                                        int xoffset, int yoffset,
+                                        const uint8_t *b, int b_stride,
+                                        uint32_t *sse) {
+    const int16_t *const h_taps = bilinear_filters_mmx[xoffset];
+    const int16_t *const v_taps = bilinear_filters_mmx[yoffset];
+    unsigned int left_sse, right_sse, total_sse;
+    int left_sum, right_sum, total_sum;
+
+    /* Left half first, then the half starting 8 pixels to the right. */
+    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
+                                   h_taps, v_taps, &left_sum, &left_sse);
+    vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8,
+                                   h_taps, v_taps, &right_sum, &right_sse);
+
+    total_sum = left_sum + right_sum;
+    total_sse = left_sse + right_sse;
+
+    *sse = total_sse;
+    return total_sse - (((uint32_t)total_sum * total_sum) >> 7);
+}
+
+/* 8x16 sub-pixel variance: stores the SSE in *sse, returns SSE - sum^2 / 128. */
+uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride,
+    int xoffset, int yoffset, const uint8_t *b, int b_stride,
+    uint32_t *sse) {
+    const int16_t *const h_taps = bilinear_filters_mmx[xoffset];
+    const int16_t *const v_taps = bilinear_filters_mmx[yoffset];
+    int diff_sum;
+    unsigned int sq_err;
+    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
+                                   h_taps, v_taps, &diff_sum, &sq_err);
+    *sse = sq_err;
+    return sq_err - (((uint32_t)diff_sum * diff_sum) >> 7);
+}
+
+/* 16x16 variance at xoffset 4, yoffset 0 (table row 4 has equal weights). */
+uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride,
+    const uint8_t *b, int b_stride, uint32_t *sse) {
+  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse);
+}
+
+/* 16x16 variance at xoffset 0, yoffset 4 (table row 4 has equal weights). */
+uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride,
+    const uint8_t *b, int b_stride, uint32_t *sse) {
+  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse);
+}
+
+/* 16x16 variance at xoffset 4, yoffset 4 (table row 4 has equal weights). */
+uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride,
+    const uint8_t *b, int b_stride, uint32_t *sse) {
+  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse);
+}
diff --git a/libs/libvpx/vpx_dsp/x86/variance_sse2.c b/libs/libvpx/vpx_dsp/x86/variance_sse2.c
new file mode 100644
index 0000000000..e6c9365ab4
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/variance_sse2.c
@@ -0,0 +1,477 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+// Signature of the per-tile kernels that variance_sse2() calls through var_fn.
+typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride,
+                                const unsigned char *ref, int ref_stride,
+                                unsigned int *sse, int *sum);
+
+/* Sum of squares of 256 int16 values (32 unaligned loads of 8 lanes each). */
+unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
+  const int16_t *const end = src + 32 * 8;
+  __m128i acc = _mm_setzero_si128();
+
+  for (; src != end; src += 8) {
+    const __m128i lanes = _mm_loadu_si128((const __m128i *)src);
+    acc = _mm_add_epi32(acc, _mm_madd_epi16(lanes, lanes));
+  }
+  /* Fold the four 32-bit partial sums down into lane 0. */
+  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
+  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
+  return _mm_cvtsi128_si32(acc);
+}
+/* Byte-interleaves the 4-byte rows i and i + 1 of p; every argument is parenthesised. */
+#define READ64(p, stride, i) _mm_unpacklo_epi8( \
+  _mm_cvtsi32_si128(*(const uint32_t *)((p) + (i) * (stride))), \
+  _mm_cvtsi32_si128(*(const uint32_t *)((p) + ((i) + 1) * (stride))))
+
+/* One 4x4 block: *sum = sum of (src - ref), *sse = sum of (src - ref)^2. */
+static void get4x4var_sse2(const uint8_t *src, int src_stride,
+                           const uint8_t *ref, int ref_stride,
+                           unsigned int *sse, int *sum) {
+  const __m128i kZero = _mm_setzero_si128();
+  /* Row pairs 0-1 and 2-3, byte-interleaved and widened to 16 bits. */
+  const __m128i s01 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), kZero);
+  const __m128i s23 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), kZero);
+  const __m128i r01 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), kZero);
+  const __m128i r23 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), kZero);
+  const __m128i d01 = _mm_sub_epi16(s01, r01);
+  const __m128i d23 = _mm_sub_epi16(s23, r23);
+  __m128i sum16 = _mm_add_epi16(d01, d23);
+  __m128i sq32 = _mm_add_epi32(_mm_madd_epi16(d01, d01),
+                               _mm_madd_epi16(d23, d23));
+  /* Horizontal add of the eight 16-bit lanes; lane 0 ends up with the total. */
+  sum16 = _mm_add_epi16(sum16, _mm_srli_si128(sum16, 8));
+  sum16 = _mm_add_epi16(sum16, _mm_srli_si128(sum16, 4));
+  sum16 = _mm_add_epi16(sum16, _mm_srli_si128(sum16, 2));
+  *sum = (int16_t)_mm_extract_epi16(sum16, 0);
+  /* Horizontal add of the four 32-bit lanes. */
+  sq32 = _mm_add_epi32(sq32, _mm_srli_si128(sq32, 8));
+  sq32 = _mm_add_epi32(sq32, _mm_srli_si128(sq32, 4));
+  *sse = _mm_cvtsi128_si32(sq32);
+}
+
+/* Sum and sum of squares of (src - ref) over one 8x8 block.
+ *   src, ref               - top-left pixels of the two blocks
+ *   src_stride, ref_stride - distance between successive rows
+ *   sse                    - receives the sum of squared differences
+ *   sum                    - receives the signed sum of differences
+ * The 64 differences have magnitude <= 255, so their total fits the
+ * 16-bit lanes used for the sum. */
+void vpx_get8x8var_sse2(const uint8_t *src, int src_stride,
+                        const uint8_t *ref, int ref_stride,
+                        unsigned int *sse, int *sum) {
+  const __m128i kZero = _mm_setzero_si128();
+  __m128i sum16 = _mm_setzero_si128();
+  __m128i sq32 = _mm_setzero_si128();
+  int row;
+
+  for (row = 0; row < 8; ++row) {
+    /* Load 8 pixels of this row from each image and widen to 16 bits. */
+    const __m128i s = _mm_unpacklo_epi8(
+        _mm_loadl_epi64((const __m128i *)(src + row * src_stride)), kZero);
+    const __m128i r = _mm_unpacklo_epi8(
+        _mm_loadl_epi64((const __m128i *)(ref + row * ref_stride)), kZero);
+    const __m128i d = _mm_sub_epi16(s, r);
+
+    sum16 = _mm_add_epi16(sum16, d);
+    sq32 = _mm_add_epi32(sq32, _mm_madd_epi16(d, d));
+  }
+
+  /* Horizontal add of the eight 16-bit lanes; lane 0 ends up with the total. */
+  sum16 = _mm_add_epi16(sum16, _mm_srli_si128(sum16, 8));
+  sum16 = _mm_add_epi16(sum16, _mm_srli_si128(sum16, 4));
+  sum16 = _mm_add_epi16(sum16, _mm_srli_si128(sum16, 2));
+  *sum = (int16_t)_mm_extract_epi16(sum16, 0);
+
+  /* Horizontal add of the four 32-bit lanes. */
+  sq32 = _mm_add_epi32(sq32, _mm_srli_si128(sq32, 8));
+  sq32 = _mm_add_epi32(sq32, _mm_srli_si128(sq32, 4));
+  *sse = _mm_cvtsi128_si32(sq32);
+}
+
+/* Sum and sum of squares of (src - ref) over one 16x16 block.
+ *   src, ref               - top-left pixels of the two blocks
+ *   src_stride, ref_stride - distance between successive rows
+ *   sse                    - receives the sum of squared differences
+ *   sum                    - receives the signed sum of differences */
+void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum16 = _mm_setzero_si128();
+  __m128i sq32 = _mm_setzero_si128();
+  int row;
+
+  for (row = 0; row < 16; ++row, src += src_stride, ref += ref_stride) {
+    const __m128i s = _mm_loadu_si128((const __m128i *)src);
+    const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+    /* Differences of the low and the high 8 pixels, widened to 16 bits. */
+    const __m128i d_lo =
+        _mm_sub_epi16(_mm_unpacklo_epi8(s, zero), _mm_unpacklo_epi8(r, zero));
+    const __m128i d_hi =
+        _mm_sub_epi16(_mm_unpackhi_epi8(s, zero), _mm_unpackhi_epi8(r, zero));
+
+    sum16 = _mm_add_epi16(sum16, d_lo);
+    sum16 = _mm_add_epi16(sum16, d_hi);
+    sq32 = _mm_add_epi32(sq32, _mm_madd_epi16(d_lo, d_lo));
+    sq32 = _mm_add_epi32(sq32, _mm_madd_epi16(d_hi, d_hi));
+  }
+
+  /* Fold the eight 16-bit lanes down to two, then add those two as ints:
+   * the full 256-term total can exceed the int16 range. */
+  sum16 = _mm_add_epi16(sum16, _mm_srli_si128(sum16, 8));
+  sum16 = _mm_add_epi16(sum16, _mm_srli_si128(sum16, 4));
+  *sum = (int16_t)_mm_extract_epi16(sum16, 0) +
+         (int16_t)_mm_extract_epi16(sum16, 1);
+
+  /* Horizontal add of the four 32-bit lanes. */
+  sq32 = _mm_add_epi32(sq32, _mm_srli_si128(sq32, 8));
+  sq32 = _mm_add_epi32(sq32, _mm_srli_si128(sq32, 4));
+  *sse = _mm_cvtsi128_si32(sq32);
+}
+
+
+/* Sums var_fn's sse/sum outputs over every block_size tile of a w x h area. */
+static void variance_sse2(const unsigned char *src, int src_stride,
+                          const unsigned char *ref, int ref_stride,
+                          int w, int h, unsigned int *sse, int *sum,
+                          getNxMvar_fn_t var_fn, int block_size) {
+  unsigned int tile_sse;
+  int row, col, tile_sum;
+  *sse = 0;
+  *sum = 0;
+  for (row = 0; row < h; row += block_size) {
+    const unsigned char *const src_row = src + src_stride * row;
+    const unsigned char *const ref_row = ref + ref_stride * row;
+    for (col = 0; col < w; col += block_size) {
+      var_fn(src_row + col, src_stride, ref_row + col, ref_stride,
+             &tile_sse, &tile_sum);
+      *sse += tile_sse;
+      *sum += tile_sum;
+    }
+  }
+}
+
+/* 4x4 variance: stores the SSE in *sse and returns SSE - sum^2 / 16. */
+unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
+    const unsigned char *ref, int ref_stride, unsigned int *sse) {
+  int diff_total;
+  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &diff_total);
+  return *sse - (((unsigned int)diff_total * diff_total) >> 4);
+}
+
+/* 8x4 variance from two 4x4 tiles: returns SSE - sum^2 / 32. */
+unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
+    const uint8_t *ref, int ref_stride, unsigned int *sse) {
+  int diff_total;
+  variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &diff_total,
+                get4x4var_sse2, 4);
+  return *sse - (((unsigned int)diff_total * diff_total) >> 5);
+}
+
+/* 4x8 variance from two 4x4 tiles: returns SSE - sum^2 / 32. */
+unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
+    const uint8_t *ref, int ref_stride, unsigned int *sse) {
+  int diff_total;
+  variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &diff_total,
+                get4x4var_sse2, 4);
+  return *sse - (((unsigned int)diff_total * diff_total) >> 5);
+}
+
+/* 8x8 variance: stores the SSE in *sse and returns SSE - sum^2 / 64. */
+unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
+    const unsigned char *ref, int ref_stride, unsigned int *sse) {
+  int diff_total;
+  vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &diff_total);
+  return *sse - (((unsigned int)diff_total * diff_total) >> 6);
+}
+
+/* 16x8 variance from two 8x8 tiles: returns SSE - sum^2 / 128. */
+unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
+    const unsigned char *ref, int ref_stride, unsigned int *sse) {
+  int diff_total;
+  variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &diff_total,
+                vpx_get8x8var_sse2, 8);
+  return *sse - (((unsigned int)diff_total * diff_total) >> 7);
+}
+
+/* 8x16 variance from two 8x8 tiles: returns SSE - sum^2 / 128. */
+unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
+    const unsigned char *ref, int ref_stride, unsigned int *sse) {
+  int diff_total;
+  variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &diff_total,
+                vpx_get8x8var_sse2, 8);
+  return *sse - (((unsigned int)diff_total * diff_total) >> 7);
+}
+
+/* 16x16 variance: stores the SSE in *sse and returns SSE - sum^2 / 256. */
+unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
+    const unsigned char *ref, int ref_stride, unsigned int *sse) {
+  int diff_total;
+  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &diff_total);
+  return *sse - (((unsigned int)diff_total * diff_total) >> 8);
+}
+
+/* 32x32 variance from four 16x16 tiles: returns SSE - sum^2 / 1024. */
+unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
+    const uint8_t *ref, int ref_stride, unsigned int *sse) {
+  int diff_total;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &diff_total,
+                vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)diff_total * diff_total) >> 10);
+}
+
+/* 32x16 variance from two 16x16 tiles: returns SSE - sum^2 / 512. */
+unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
+    const uint8_t *ref, int ref_stride, unsigned int *sse) {
+  int diff_total;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &diff_total,
+                vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)diff_total * diff_total) >> 9);
+}
+
+/* 16x32 variance from two 16x16 tiles: returns SSE - sum^2 / 512. */
+unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
+    const uint8_t *ref, int ref_stride, unsigned int *sse) {
+  int diff_total;
+  variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &diff_total,
+                vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)diff_total * diff_total) >> 9);
+}
+
+/* 64x64 variance from sixteen 16x16 tiles: returns SSE - sum^2 / 4096. */
+unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
+    const uint8_t *ref, int ref_stride, unsigned int *sse) {
+  int diff_total;
+  variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &diff_total,
+                vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)diff_total * diff_total) >> 12);
+}
+
+/* 64x32 variance from eight 16x16 tiles: returns SSE - sum^2 / 2048. */
+unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
+    const uint8_t *ref, int ref_stride, unsigned int *sse) {
+  int diff_total;
+  variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &diff_total,
+                vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)diff_total * diff_total) >> 11);
+}
+
+/* 32x64 variance from eight 16x16 tiles: returns SSE - sum^2 / 2048. */
+unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
+    const uint8_t *ref, int ref_stride, unsigned int *sse) {
+  int diff_total;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &diff_total,
+                vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)diff_total * diff_total) >> 11);
+}
+
+/* 8x8: runs the variance only to fill *sse, then returns that raw SSE. */
+unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
+    const uint8_t *ref, int ref_stride, unsigned int *sse) {
+  (void)vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+/* 8x16: runs the variance only to fill *sse, then returns that raw SSE. */
+unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
+    const uint8_t *ref, int ref_stride, unsigned int *sse) {
+  (void)vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+/* 16x8: runs the variance only to fill *sse, then returns that raw SSE. */
+unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
+    const uint8_t *ref, int ref_stride, unsigned int *sse) {
+  (void)vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+/* 16x16: runs the variance only to fill *sse, then returns that raw SSE. */
+unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
+    const uint8_t *ref, int ref_stride, unsigned int *sse) {
+  (void)vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+#if CONFIG_USE_X86INC
+// Prototypes for the kernels defined in subpel_variance.asm. The 2 trailing
+// unused parameters are place holders for PIC enabled build (NULL is passed).
+#define DECL(w, opt) \
+  int vpx_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
+                                          ptrdiff_t src_stride, \
+                                          int x_offset, int y_offset, \
+                                          const uint8_t *dst, \
+                                          ptrdiff_t dst_stride, \
+                                          int height, unsigned int *sse, \
+                                          void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+  DECL(4, opt2); \
+  DECL(8, opt1); \
+  DECL(16, opt1)
+// DECLS(opt1, opt2): the 4-wide kernel takes suffix opt2, 8- and 16-wide opt1.
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+// FN defines vpx_sub_pixel_variance<w>x<h>_<opt> on top of the <wf>-wide kernel.
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
+                                                     int src_stride, \
+                                                     int x_offset, \
+                                                     int y_offset, \
+                                                     const uint8_t *dst, \
+                                                     int dst_stride, \
+                                                     unsigned int *sse_ptr) { \
+  unsigned int sse; \
+  int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
+                                                y_offset, dst, dst_stride, \
+                                                h, &sse, NULL, NULL); \
+  if (w > wf) { /* add the column that starts at offset 16 */ \
+    unsigned int sse2; \
+    int se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
+                                                   x_offset, y_offset, \
+                                                   dst + 16, dst_stride, \
+                                                   h, &sse2, NULL, NULL); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { /* add the columns at offsets 32 and 48 */ \
+      se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                 x_offset, y_offset, \
+                                                 dst + 32, dst_stride, \
+                                                 h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                 x_offset, y_offset, \
+                                                 dst + 48, dst_stride, \
+                                                 h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); /* cast widens se * se */ \
+}
+// One FN per block size: (w, h, kernel width, log2 w, log2 h, suffix, cast).
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
+FN(16,  8, 16, 4, 3, opt1, (uint32_t)); \
+FN(8,  16,  8, 3, 4, opt1, (uint32_t)); \
+FN(8,   8,  8, 3, 3, opt1, (uint32_t)); \
+FN(8,   4,  8, 3, 2, opt1, (uint32_t)); \
+FN(4,   8,  4, 2, 3, opt2, (uint32_t)); \
+FN(4,   4,  4, 2, 2, opt2, (uint32_t))
+// The cast is int64_t for sizes of 512 pixels or more, uint32_t otherwise.
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+// Prototypes of the "avg" kernels, which take an extra sec/sec_stride input.
+// The 2 unused parameters are place holders for PIC enabled build.
+#define DECL(w, opt) \
+int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
+                                            ptrdiff_t src_stride, \
+                                            int x_offset, int y_offset, \
+                                            const uint8_t *dst, \
+                                            ptrdiff_t dst_stride, \
+                                            const uint8_t *sec, \
+                                            ptrdiff_t sec_stride, \
+                                            int height, unsigned int *sse, \
+                                            void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
+// DECLS(opt1, opt2): the 4-wide kernel takes suffix opt2, 8- and 16-wide opt1.
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECL
+#undef DECLS
+// FN defines vpx_sub_pixel_avg_variance<w>x<h>_<opt>; w is passed as sec's stride.
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
+                                                         int src_stride, \
+                                                         int x_offset, \
+                                                         int y_offset, \
+                                                         const uint8_t *dst, \
+                                                         int dst_stride, \
+                                                         unsigned int *sseptr, \
+                                                         const uint8_t *sec) { \
+  unsigned int sse; \
+  int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
+                                                    y_offset, dst, dst_stride, \
+                                                    sec, w, h, &sse, NULL, \
+                                                    NULL); \
+  if (w > wf) { /* add the column that starts at offset 16 */ \
+    unsigned int sse2; \
+    int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst + 16, dst_stride, \
+                                                       sec + 16, w, h, &sse2, \
+                                                       NULL, NULL); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { /* add the columns at offsets 32 and 48 */ \
+      se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                     x_offset, y_offset, \
+                                                     dst + 32, dst_stride, \
+                                                     sec + 32, w, h, &sse2, \
+                                                     NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                     x_offset, y_offset, \
+                                                     dst + 48, dst_stride, \
+                                                     sec + 48, w, h, &sse2, \
+                                                     NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sseptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); /* cast widens se * se */ \
+}
+// One FN per block size: (w, h, kernel width, log2 w, log2 h, suffix, cast).
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
+FN(16,  8, 16, 4, 3, opt1, (uint32_t)); \
+FN(8,  16,  8, 3, 4, opt1, (uint32_t)); \
+FN(8,   8,  8, 3, 3, opt1, (uint32_t)); \
+FN(8,   4,  8, 3, 2, opt1, (uint32_t)); \
+FN(4,   8,  4, 2, 3, opt2, (uint32_t)); \
+FN(4,   4,  4, 2, 2, opt2, (uint32_t))
+// The cast is int64_t for sizes of 512 pixels or more, uint32_t otherwise.
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+#endif  // CONFIG_USE_X86INC
diff --git a/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c
new file mode 100644
index 0000000000..422b0fc422
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c
@@ -0,0 +1,162 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
+
+#if HAVE_SSE2
+filter8_1dfunction vpx_filter_block1d16_v8_sse2;  // declarations only: none of these kernels is defined in this file
+filter8_1dfunction vpx_filter_block1d16_h8_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_sse2;
+filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;
+// Matching *_v2 / *_h2 declarations (NOTE(review): presumably the 2-tap kernels -- confirm).
+filter8_1dfunction vpx_filter_block1d16_v2_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_sse2;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
+
+// Prototypes of the functions the FUN_CONV_1D instantiations below are expected to define:
+// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const int16_t *filter_x, int x_step_q4,
+//                              const int16_t *filter_y, int y_step_q4,
+//                              int w, int h);
+// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const int16_t *filter_x, int x_step_q4,
+//                                   const int16_t *filter_y, int y_step_q4,
+//                                   int w, int h);
+// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                  uint8_t *dst, ptrdiff_t dst_stride,
+//                                  const int16_t *filter_x, int x_step_q4,
+//                                  const int16_t *filter_y, int y_step_q4,
+//                                  int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);  // source expression starts 3 rows above src
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);  // same 3-row offset, avg_ kernels
+
+// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                         uint8_t *dst, ptrdiff_t dst_stride,
+//                         const int16_t *filter_x, int x_step_q4,
+//                         const int16_t *filter_y, int y_step_q4,
+//                         int w, int h);
+// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                             uint8_t *dst, ptrdiff_t dst_stride,
+//                             const int16_t *filter_x, int x_step_q4,
+//                             const int16_t *filter_y, int y_step_q4,
+//                             int w, int h);
+FUN_CONV_2D(, sse2);
+FUN_CONV_2D(avg_ , sse2);
+
+#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;  // declarations only: no definitions in this file
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
+// Matching *_v2 / *_h2 declarations (NOTE(review): presumably the 2-tap kernels -- confirm).
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
+
+// Prototypes of the functions the HIGH_FUN_CONV_1D instantiations below are expected to define:
+// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
+//                                      ptrdiff_t src_stride,
+//                                      uint8_t *dst,
+//                                      ptrdiff_t dst_stride,
+//                                      const int16_t *filter_x,
+//                                      int x_step_q4,
+//                                      const int16_t *filter_y,
+//                                      int y_step_q4,
+//                                      int w, int h, int bd);
+// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
+//                                     ptrdiff_t src_stride,
+//                                     uint8_t *dst,
+//                                     ptrdiff_t dst_stride,
+//                                     const int16_t *filter_x,
+//                                     int x_step_q4,
+//                                     const int16_t *filter_y,
+//                                     int y_step_q4,
+//                                     int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
+//                                          ptrdiff_t src_stride,
+//                                          uint8_t *dst,
+//                                          ptrdiff_t dst_stride,
+//                                          const int16_t *filter_x,
+//                                          int x_step_q4,
+//                                          const int16_t *filter_y,
+//                                          int y_step_q4,
+//                                          int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
+//                                         ptrdiff_t src_stride,
+//                                         uint8_t *dst,
+//                                         ptrdiff_t dst_stride,
+//                                         const int16_t *filter_x,
+//                                         int x_step_q4,
+//                                         const int16_t *filter_y,
+//                                         int y_step_q4,
+//                                         int w, int h, int bd);
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);  // source expression starts 3 rows above src
+HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+                 sse2);  // same 3-row offset, avg_ kernels
+
+// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const int16_t *filter_x, int x_step_q4,
+//                                    const int16_t *filter_y, int y_step_q4,
+//                                    int w, int h, int bd);
+HIGH_FUN_CONV_2D(, sse2);
+HIGH_FUN_CONV_2D(avg_ , sse2);
+#endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
+#endif  // HAVE_SSE2
diff --git a/libs/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
new file mode 100644
index 0000000000..abc0270655
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -0,0 +1,228 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro convolve_fn 1-2                  ; %1: copy | avg, %2: highbd (optional)
+%ifidn %1, avg
+%define AUX_XMM_REGS 4
+%else
+%define AUX_XMM_REGS 0
+%endif
+%ifidn %2, highbd
+%define pavg pavgw
+cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+                                              dst, dst_stride, \
+                                              fx, fxs, fy, fys, w, h, bd
+%else
+%define pavg pavgb
+cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+                                           dst, dst_stride, \
+                                           fx, fxs, fy, fys, w, h
+%endif
+  mov r4d, dword wm                     ; r4d = w
+%ifidn %2, highbd
+  shl r4d, 1                            ; r4d = 2 * w, compared as a byte width below
+  shl srcq, 1                           ; NOTE(review): src/dst pointers are doubled too; confirm caller convention
+  shl src_strideq, 1                    ; stride doubled to match
+  shl dstq, 1
+  shl dst_strideq, 1
+%else
+  cmp r4d, 4
+  je .w4
+%endif
+  cmp r4d, 8                            ; .wN labels are keyed on the value in r4d (doubled for highbd)
+  je .w8
+  cmp r4d, 16
+  je .w16
+  cmp r4d, 32
+  je .w32
+%ifidn %2, highbd
+  cmp r4d, 64
+  je .w64
+
+  mov                    r4d, dword hm  ; highbd only: nothing narrower matched, r4d = h
+.loop128:                               ; one 128-byte row per iteration, in two 64-byte halves
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+32]
+  movu                    m3, [srcq+48]
+%ifidn %1, avg
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+16]
+  pavg                    m2, [dstq+32]
+  pavg                    m3, [dstq+48]
+%endif
+  mova             [dstq   ], m0
+  mova             [dstq+16], m1
+  mova             [dstq+32], m2
+  mova             [dstq+48], m3
+  movu                    m0, [srcq+64]
+  movu                    m1, [srcq+80]
+  movu                    m2, [srcq+96]
+  movu                    m3, [srcq+112]
+  add                   srcq, src_strideq
+%ifidn %1, avg
+  pavg                    m0, [dstq+64]
+  pavg                    m1, [dstq+80]
+  pavg                    m2, [dstq+96]
+  pavg                    m3, [dstq+112]
+%endif
+  mova             [dstq+64], m0
+  mova             [dstq+80], m1
+  mova             [dstq+96], m2
+  mova            [dstq+112], m3
+  add                   dstq, dst_strideq
+  dec                    r4d
+  jnz .loop128
+  RET
+%endif
+
+.w64:                                   ; also the fall-through when no compare above matched (non-highbd)
+  mov                    r4d, dword hm
+.loop64:                                ; one 64-byte row per iteration
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+32]
+  movu                    m3, [srcq+48]
+  add                   srcq, src_strideq
+%ifidn %1, avg
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+16]
+  pavg                    m2, [dstq+32]
+  pavg                    m3, [dstq+48]
+%endif
+  mova             [dstq   ], m0        ; aligned store: dst rows must be 16-byte aligned
+  mova             [dstq+16], m1
+  mova             [dstq+32], m2
+  mova             [dstq+48], m3
+  add                   dstq, dst_strideq
+  dec                    r4d
+  jnz .loop64
+  RET
+
+.w32:
+  mov                    r4d, dword hm
+.loop32:                                ; two 32-byte rows per iteration
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+src_strideq]
+  movu                    m3, [srcq+src_strideq+16]
+  lea                   srcq, [srcq+src_strideq*2]
+%ifidn %1, avg
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq            +16]
+  pavg                    m2, [dstq+dst_strideq]
+  pavg                    m3, [dstq+dst_strideq+16]
+%endif
+  mova [dstq               ], m0
+  mova [dstq            +16], m1
+  mova [dstq+dst_strideq   ], m2
+  mova [dstq+dst_strideq+16], m3
+  lea                   dstq, [dstq+dst_strideq*2]
+  sub                    r4d, 2         ; loop ends only when the count hits exactly 0
+  jnz .loop32
+  RET
+
+.w16:
+  mov                    r4d, dword hm
+  lea                    r5q, [src_strideq*3]
+  lea                    r6q, [dst_strideq*3]
+.loop16:                                ; four 16-byte rows per iteration
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+src_strideq]
+  movu                    m2, [srcq+src_strideq*2]
+  movu                    m3, [srcq+r5q]
+  lea                   srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+dst_strideq]
+  pavg                    m2, [dstq+dst_strideq*2]
+  pavg                    m3, [dstq+r6q]
+%endif
+  mova  [dstq              ], m0
+  mova  [dstq+dst_strideq  ], m1
+  mova  [dstq+dst_strideq*2], m2
+  mova  [dstq+r6q          ], m3
+  lea                   dstq, [dstq+dst_strideq*4]
+  sub                    r4d, 4         ; loop ends only when the count hits exactly 0
+  jnz .loop16
+  RET
+
+.w8:
+  mov                    r4d, dword hm
+  lea                    r5q, [src_strideq*3]
+  lea                    r6q, [dst_strideq*3]
+.loop8:                                 ; four 8-byte rows per iteration
+  movh                    m0, [srcq]
+  movh                    m1, [srcq+src_strideq]
+  movh                    m2, [srcq+src_strideq*2]
+  movh                    m3, [srcq+r5q]
+  lea                   srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+  movh                    m4, [dstq]    ; m4-m7 are the AUX_XMM_REGS requested for avg
+  movh                    m5, [dstq+dst_strideq]
+  movh                    m6, [dstq+dst_strideq*2]
+  movh                    m7, [dstq+r6q]
+  pavg                    m0, m4
+  pavg                    m1, m5
+  pavg                    m2, m6
+  pavg                    m3, m7
+%endif
+  movh  [dstq              ], m0
+  movh  [dstq+dst_strideq  ], m1
+  movh  [dstq+dst_strideq*2], m2
+  movh  [dstq+r6q          ], m3
+  lea                   dstq, [dstq+dst_strideq*4]
+  sub                    r4d, 4
+  jnz .loop8
+  RET
+
+%ifnidn %2, highbd
+.w4:
+  mov                    r4d, dword hm
+  lea                    r5q, [src_strideq*3]
+  lea                    r6q, [dst_strideq*3]
+.loop4:                                 ; four 4-byte rows per iteration
+  movd                    m0, [srcq]
+  movd                    m1, [srcq+src_strideq]
+  movd                    m2, [srcq+src_strideq*2]
+  movd                    m3, [srcq+r5q]
+  lea                   srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+  movd                    m4, [dstq]
+  movd                    m5, [dstq+dst_strideq]
+  movd                    m6, [dstq+dst_strideq*2]
+  movd                    m7, [dstq+r6q]
+  pavg                    m0, m4
+  pavg                    m1, m5
+  pavg                    m2, m6
+  pavg                    m3, m7
+%endif
+  movd  [dstq              ], m0
+  movd  [dstq+dst_strideq  ], m1
+  movd  [dstq+dst_strideq*2], m2
+  movd  [dstq+r6q          ], m3
+  lea                   dstq, [dstq+dst_strideq*4]
+  sub                    r4d, 4
+  jnz .loop4
+  RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+convolve_fn copy                        ; 8-bit copy
+convolve_fn avg                         ; 8-bit average with dst (pavgb)
+%if CONFIG_VP9_HIGHBITDEPTH
+convolve_fn copy, highbd                ; highbd copy
+convolve_fn avg, highbd                 ; highbd average with dst (pavgw)
+%endif
diff --git a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
new file mode 100644
index 0000000000..bfc816f235
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
@@ -0,0 +1,962 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro HIGH_GET_FILTERS_4 0                 ;in: arg(5), arg(6); writes k0k6/k1k7/k2k5/k3k4, krd, max, min
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rcx, 0x00000040             ;64: rounding term, stored to krd below
+
+    movdqa      xmm7, [rdx]                 ;load filters (aligned load: 16-byte aligned ptr needed)
+    pshuflw     xmm0, xmm7, 0b              ;k0
+    pshuflw     xmm1, xmm7, 01010101b       ;k1
+    pshuflw     xmm2, xmm7, 10101010b       ;k2
+    pshuflw     xmm3, xmm7, 11111111b       ;k3
+    psrldq      xmm7, 8                     ;bring k4..k7 down into the low qword
+    pshuflw     xmm4, xmm7, 0b              ;k4
+    pshuflw     xmm5, xmm7, 01010101b       ;k5
+    pshuflw     xmm6, xmm7, 10101010b       ;k6
+    pshuflw     xmm7, xmm7, 11111111b       ;k7
+
+    punpcklwd   xmm0, xmm6                  ;k0 k6 word pairs (operand layout for pmaddwd)
+    punpcklwd   xmm2, xmm5                  ;k2 k5
+    punpcklwd   xmm3, xmm4                  ;k3 k4
+    punpcklwd   xmm1, xmm7                  ;k1 k7
+
+    movdqa      k0k6, xmm0                  ;store to the slots the invoking function %defines
+    movdqa      k2k5, xmm2
+    movdqa      k3k4, xmm3
+    movdqa      k1k7, xmm1
+
+    movq        xmm6, rcx
+    pshufd      xmm6, xmm6, 0               ;64 in every dword
+    movdqa      krd, xmm6
+
+    ;Compute max and min values of a pixel
+    mov         rdx, 0x00010001
+    movsxd      rcx, DWORD PTR arg(6)      ;bps
+    movq        xmm0, rdx
+    movq        xmm1, rcx
+    pshufd      xmm0, xmm0, 0b             ;1 in every word
+    movdqa      xmm2, xmm0
+    psllw       xmm0, xmm1                 ;1 << bps
+    psubw       xmm0, xmm2                 ;(1 << bps) - 1
+    pxor        xmm1, xmm1
+    movdqa      max, xmm0                  ;max value (for clamping)
+    movdqa      min, xmm1                  ;min value (for clamping)
+
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1                ;%1: nonzero = average the result with [rdi]
+    punpcklwd   xmm0, xmm6                  ;two row in one register
+    punpcklwd   xmm1, xmm7
+    punpcklwd   xmm2, xmm5
+    punpcklwd   xmm3, xmm4
+
+    pmaddwd     xmm0, k0k6                  ;multiply the filter factors
+    pmaddwd     xmm1, k1k7
+    pmaddwd     xmm2, k2k5
+    pmaddwd     xmm3, k3k4
+
+    paddd       xmm0, xmm1                  ;sum
+    paddd       xmm0, xmm2
+    paddd       xmm0, xmm3
+
+    paddd       xmm0, krd                   ;rounding
+    psrad       xmm0, 7                     ;shift
+    packssdw    xmm0, xmm0                  ;pack to word
+
+    ;clamp the values
+    pminsw      xmm0, max
+    pmaxsw      xmm0, min
+
+%if %1
+    movq        xmm1, [rdi]                 ;current 4 output words
+    pavgw       xmm0, xmm1
+%endif
+    movq        [rdi], xmm0                 ;store 4 output words (8 bytes)
+%endm
+
+%macro HIGH_GET_FILTERS 0                   ;sets rsi/rdi; writes k0k1/k6k7/k2k5/k3k4, krd, max, min
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x00000040             ;64: rounding term, stored to krd below
+
+    movdqa      xmm7, [rdx]                 ;load filters (aligned load: 16-byte aligned ptr needed)
+    pshuflw     xmm0, xmm7, 0b              ;k0
+    pshuflw     xmm1, xmm7, 01010101b       ;k1
+    pshuflw     xmm2, xmm7, 10101010b       ;k2
+    pshuflw     xmm3, xmm7, 11111111b       ;k3
+    pshufhw     xmm4, xmm7, 0b              ;k4
+    pshufhw     xmm5, xmm7, 01010101b       ;k5
+    pshufhw     xmm6, xmm7, 10101010b       ;k6
+    pshufhw     xmm7, xmm7, 11111111b       ;k7
+    punpcklqdq  xmm2, xmm2                  ;copy k2 into the high qword too
+    punpcklqdq  xmm3, xmm3                  ;copy k3 into the high qword too
+    punpcklwd   xmm0, xmm1                  ;k0 k1 word pairs
+    punpckhwd   xmm6, xmm7                  ;k6 k7
+    punpckhwd   xmm2, xmm5                  ;k2 k5
+    punpckhwd   xmm3, xmm4                  ;k3 k4
+
+    movdqa      k0k1, xmm0                  ;store filter factors on stack
+    movdqa      k6k7, xmm6
+    movdqa      k2k5, xmm2
+    movdqa      k3k4, xmm3
+
+    movq        xmm6, rcx
+    pshufd      xmm6, xmm6, 0               ;64 in every dword
+    movdqa      krd, xmm6                   ;rounding
+
+    ;Compute max and min values of a pixel
+    mov         rdx, 0x00010001
+    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movq        xmm0, rdx
+    movq        xmm1, rcx
+    pshufd      xmm0, xmm0, 0b              ;1 in every word
+    movdqa      xmm2, xmm0
+    psllw       xmm0, xmm1                  ;1 << bps
+    psubw       xmm0, xmm2                  ;(1 << bps) - 1
+    pxor        xmm1, xmm1
+    movdqa      max, xmm0                  ;max value (for clamping)
+    movdqa      min, xmm1                  ;min value (for clamping)
+%endm
+
+%macro LOAD_VERT_8 1                        ;%1: byte offset in row; expects rax = pitch, rdx = 3 * pitch
+    movdqu      xmm0, [rsi + %1]            ;0
+    movdqu      xmm1, [rsi + rax + %1]      ;1
+    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;6
+    lea         rsi,  [rsi + rax]           ;rsi moves down one row and is left there on exit
+    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;7
+    movdqu      xmm2, [rsi + rax + %1]      ;2
+    movdqu      xmm3, [rsi + rax * 2 + %1]  ;3
+    movdqu      xmm4, [rsi + rdx + %1]      ;4
+    movdqu      xmm5, [rsi + rax * 4 + %1]  ;5
+%endm
+
+%macro HIGH_APPLY_FILTER_8 2                ;%1: nonzero = average with dst; %2: byte offset into [rdi]
+    movdqu      temp, xmm4                  ;spill row 4 so xmm4 can hold the high half of rows 0/1
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm1                  ;rows 0/1 interleaved, low 4 words
+    punpckhwd   xmm4, xmm1                  ;rows 0/1 interleaved, high 4 words
+    movdqa      xmm1, xmm6
+    punpcklwd   xmm6, xmm7                  ;rows 6/7, low
+    punpckhwd   xmm1, xmm7                  ;rows 6/7, high
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm2, xmm5                  ;rows 2/5, low
+    punpckhwd   xmm7, xmm5                  ;rows 2/5, high
+
+    movdqu      xmm5, temp                  ;reload row 4
+    movdqu      temp, xmm4                  ;spill rows 0/1 high
+    movdqa      xmm4, xmm3
+    punpcklwd   xmm3, xmm5                  ;rows 3/4, low
+    punpckhwd   xmm4, xmm5                  ;rows 3/4, high
+    movdqu      xmm5, temp                  ;rows 0/1 high now in xmm5
+
+    pmaddwd     xmm0, k0k1
+    pmaddwd     xmm5, k0k1
+    pmaddwd     xmm6, k6k7
+    pmaddwd     xmm1, k6k7
+    pmaddwd     xmm2, k2k5
+    pmaddwd     xmm7, k2k5
+    pmaddwd     xmm3, k3k4
+    pmaddwd     xmm4, k3k4
+
+    paddd       xmm0, xmm6                  ;low-half sums accumulate in xmm0
+    paddd       xmm0, xmm2
+    paddd       xmm0, xmm3
+    paddd       xmm5, xmm1                  ;high-half sums accumulate in xmm5
+    paddd       xmm5, xmm7
+    paddd       xmm5, xmm4
+
+    paddd       xmm0, krd                   ;rounding
+    paddd       xmm5, krd
+    psrad       xmm0, 7                     ;shift
+    psrad       xmm5, 7
+    packssdw    xmm0, xmm5                  ;pack back to word
+
+    ;clamp the values
+    pminsw      xmm0, max
+    pmaxsw      xmm0, min
+
+%if %1
+    movdqu      xmm1, [rdi + %2]            ;current 8 output words
+    pavgw       xmm0, xmm1
+%endif
+    movdqu      [rdi + %2], xmm0            ;store 8 output words (16 bytes)
+%endm
+
+;void vpx_highbd_filter_block1d4_v8_sse2    (prototype inferred from the code below -- confirm)
+;(
+;    unsigned short *src_ptr,
+;    unsigned int    src_pitch,
+;    unsigned short *output_ptr,
+;    unsigned int    out_pitch,
+;    unsigned int    output_height,
+;    short *filter, int bd
+;)
+global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 7                 ;seven 16-byte slots, named below
+    %define k0k6 [rsp + 16 * 0]
+    %define k2k5 [rsp + 16 * 1]
+    %define k3k4 [rsp + 16 * 2]
+    %define k1k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define max [rsp + 16 * 5]
+    %define min [rsp + 16 * 6]
+
+    HIGH_GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]            ;out_pitch in bytes
+    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movq        xmm0, [rsi]                 ;load src: row 0
+    movq        xmm1, [rsi + rax]           ;1
+    movq        xmm6, [rsi + rdx * 2]       ;6
+    lea         rsi,  [rsi + rax]           ;advance one row; also the per-iteration step
+    movq        xmm7, [rsi + rdx * 2]       ;7
+    movq        xmm2, [rsi + rax]           ;2
+    movq        xmm3, [rsi + rax * 2]       ;3
+    movq        xmm4, [rsi + rdx]           ;4
+    movq        xmm5, [rsi + rax * 4]       ;5
+
+    HIGH_APPLY_FILTER_4 0
+
+    lea         rdi, [rdi + rbx]            ;next output row
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 7
+    pop rsp                                 ;pops the value left on the stack by ALIGN_STACK
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_highbd_filter_block1d8_v8_sse2    (prototype inferred from the code below -- confirm)
+;(
+;    unsigned short *src_ptr,
+;    unsigned int    src_pitch,
+;    unsigned short *output_ptr,
+;    unsigned int    out_pitch,
+;    unsigned int    output_height,
+;    short *filter, int bd
+;)
+global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8                 ;eight 16-byte slots, named below
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]            ;out_pitch in bytes
+    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    HIGH_APPLY_FILTER_8 0, 0
+
+    lea         rdi, [rdi + rbx]            ;next output row (rsi already advanced in LOAD_VERT_8)
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp                                 ;pops the value left on the stack by ALIGN_STACK
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_highbd_filter_block1d16_v8_sse2   (prototype inferred from the code below -- confirm)
+;(
+;    unsigned short *src_ptr,
+;    unsigned int    src_pitch,
+;    unsigned short *output_ptr,
+;    unsigned int    out_pitch,
+;    unsigned int    output_height,
+;    short *filter, int bd
+;)
+global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8                 ;eight 16-byte slots, named below
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]            ;out_pitch in bytes
+    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    HIGH_APPLY_FILTER_8 0, 0
+    sub         rsi, rax                    ;undo LOAD_VERT_8's row advance before the second half
+
+    LOAD_VERT_8 16
+    HIGH_APPLY_FILTER_8 0, 16
+    add         rdi, rbx                    ;next output row
+
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp                                 ;pops the value left on the stack by ALIGN_STACK
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v8_avg_sse2):    ;reads arg(0)..arg(6); result is pavgw'd with the existing output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 7                 ;seven 16-byte slots, named below
+    %define k0k6 [rsp + 16 * 0]
+    %define k2k5 [rsp + 16 * 1]
+    %define k3k4 [rsp + 16 * 2]
+    %define k1k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define max [rsp + 16 * 5]
+    %define min [rsp + 16 * 6]
+
+    HIGH_GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]            ;out_pitch in bytes
+    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movq        xmm0, [rsi]                 ;load src: row 0
+    movq        xmm1, [rsi + rax]           ;1
+    movq        xmm6, [rsi + rdx * 2]       ;6
+    lea         rsi,  [rsi + rax]           ;advance one row; also the per-iteration step
+    movq        xmm7, [rsi + rdx * 2]       ;7
+    movq        xmm2, [rsi + rax]           ;2
+    movq        xmm3, [rsi + rax * 2]       ;3
+    movq        xmm4, [rsi + rdx]           ;4
+    movq        xmm5, [rsi + rax * 4]       ;5
+
+    HIGH_APPLY_FILTER_4 1
+
+    lea         rdi, [rdi + rbx]            ;next output row
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 7
+    pop rsp                                 ;pops the value left on the stack by ALIGN_STACK
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v8_avg_sse2):    ;reads arg(0)..arg(6); result is pavgw'd with the existing output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8                 ;eight 16-byte slots, named below
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]            ;out_pitch in bytes
+    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+.loop:
+    LOAD_VERT_8 0
+    HIGH_APPLY_FILTER_8 1, 0
+
+    lea         rdi, [rdi + rbx]            ;next output row (rsi already advanced in LOAD_VERT_8)
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp                                 ;pops the value left on the stack by ALIGN_STACK
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v8_avg_sse2):   ;reads arg(0)..arg(6); result is pavgw'd with the existing output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8                 ;eight 16-byte slots, named below
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rbx, [rbx + rbx]            ;out_pitch in bytes
+    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+.loop:
+    LOAD_VERT_8 0
+    HIGH_APPLY_FILTER_8 1, 0
+    sub         rsi, rax                    ;undo LOAD_VERT_8's row advance before the second half
+
+    LOAD_VERT_8 16
+    HIGH_APPLY_FILTER_8 1, 16
+    add         rdi, rbx                    ;next output row
+
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 8
+    pop rsp                                 ;pops the value left on the stack by ALIGN_STACK
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_highbd_filter_block1d4_h8_sse2    (prototype inferred from the code below -- confirm)
+;(
+;    unsigned short *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned short *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter, int bd
+;)
+global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 7                 ;seven 16-byte slots, named below
+    %define k0k6 [rsp + 16 * 0]
+    %define k2k5 [rsp + 16 * 1]
+    %define k3k4 [rsp + 16 * 2]
+    %define k1k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define max [rsp + 16 * 5]
+    %define min [rsp + 16 * 6]
+
+    HIGH_GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]            ;out_pitch in bytes
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src: 8 words starting 3 words before rsi
+    movdqu      xmm4,   [rsi + 2]           ;8 words starting 1 word after rsi
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm4
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm5, xmm4
+
+    psrldq      xmm1, 2                     ;window shifted by 1 word
+    psrldq      xmm6, 4                     ;second load shifted by 2 words
+    psrldq      xmm7, 6                     ;second load shifted by 3 words
+    psrldq      xmm2, 4                     ;window shifted by 2 words
+    psrldq      xmm3, 6                     ;window shifted by 3 words
+    psrldq      xmm5, 2                     ;second load shifted by 1 word
+
+    HIGH_APPLY_FILTER_4 0
+
+    lea         rsi, [rsi + rax]            ;next source row
+    lea         rdi, [rdi + rdx]            ;next output row
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 7
+    pop rsp                                 ;pops the value left on the stack by ALIGN_STACK
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_highbd_filter_block1d8_h8_sse2
+;(
+;    unsigned short *src_ptr,             2-byte samples (see the loads in the loop)
+;    unsigned int    src_pixels_per_line, in samples; doubled to bytes in the body
+;    unsigned short *output_ptr,
+;    unsigned int    output_pitch,        in samples; doubled to bytes in the body
+;    unsigned int    output_height,
+;    short *filter                        plus a 7th shadowed argument that this body does not read directly
+;)
+global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8                 ;eight 16-byte scratch slots, named below
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+    ; NOTE(review): rsi/rdi are used below but never loaded in this body - presumably HIGH_GET_FILTERS sets them; confirm
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]            ;output bytes per line
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src: eight unaligned loads, one sample (2 bytes) apart,
+    movdqu      xmm1,   [rsi - 4]           ;from 3 samples left of rsi to 4 samples right of it
+    movdqu      xmm2,   [rsi - 2]
+    movdqu      xmm3,   [rsi]
+    movdqu      xmm4,   [rsi + 2]
+    movdqu      xmm5,   [rsi + 4]
+    movdqu      xmm6,   [rsi + 6]
+    movdqu      xmm7,   [rsi + 8]
+
+    HIGH_APPLY_FILTER_8 0, 0
+
+    lea         rsi, [rsi + rax]            ;next source line
+    lea         rdi, [rdi + rdx]            ;next output line
+    dec         rcx
+    jnz         .loop                       ;one iteration per output line; the body runs at least once
+
+    add rsp, 16 * 8                         ;release the scratch slots
+    pop rsp                                 ;NOTE(review): presumably the pre-alignment rsp saved by ALIGN_STACK - confirm
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_highbd_filter_block1d16_h8_sse2
+;(
+;    unsigned short *src_ptr,             2-byte samples (see the loads in the loop)
+;    unsigned int    src_pixels_per_line, in samples; doubled to bytes in the body
+;    unsigned short *output_ptr,
+;    unsigned int    output_pitch,        in samples; doubled to bytes in the body
+;    unsigned int    output_height,
+;    short *filter                        plus a 7th shadowed argument that this body does not read directly
+;)
+global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8                 ;eight 16-byte scratch slots, named below
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+    ; NOTE(review): rsi/rdi are used below but never loaded in this body - presumably HIGH_GET_FILTERS sets them; confirm
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]            ;output bytes per line
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src: first half, eight loads one sample (2 bytes) apart
+    movdqu      xmm1,   [rsi - 4]
+    movdqu      xmm2,   [rsi - 2]
+    movdqu      xmm3,   [rsi]
+    movdqu      xmm4,   [rsi + 2]
+    movdqu      xmm5,   [rsi + 4]
+    movdqu      xmm6,   [rsi + 6]
+    movdqu      xmm7,   [rsi + 8]
+
+    HIGH_APPLY_FILTER_8 0, 0
+
+    movdqu      xmm0,   [rsi + 10]           ;load src: second half, the same pattern 16 bytes (8 samples) further on
+    movdqu      xmm1,   [rsi + 12]
+    movdqu      xmm2,   [rsi + 14]
+    movdqu      xmm3,   [rsi + 16]
+    movdqu      xmm4,   [rsi + 18]
+    movdqu      xmm5,   [rsi + 20]
+    movdqu      xmm6,   [rsi + 22]
+    movdqu      xmm7,   [rsi + 24]
+
+    HIGH_APPLY_FILTER_8 0, 16
+
+    lea         rsi, [rsi + rax]            ;next source line
+    lea         rdi, [rdi + rdx]            ;next output line
+    dec         rcx
+    jnz         .loop                       ;one iteration per output line; the body runs at least once
+
+    add rsp, 16 * 8                         ;release the scratch slots
+    pop rsp                                 ;NOTE(review): presumably the pre-alignment rsp saved by ALIGN_STACK - confirm
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h8_avg_sse2):       ;horizontal 8-tap "avg" kernel, 4 samples per row; same loop as the non-avg 4-wide h8 kernel but invokes HIGH_APPLY_FILTER_4 with 1
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 7                 ;seven 16-byte scratch slots, named below
+    %define k0k6 [rsp + 16 * 0]
+    %define k2k5 [rsp + 16 * 1]
+    %define k3k4 [rsp + 16 * 2]
+    %define k1k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define max [rsp + 16 * 5]
+    %define min [rsp + 16 * 6]
+
+    HIGH_GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]            ;output bytes per line
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src, starting 3 samples left of rsi
+    movdqu      xmm4,   [rsi + 2]           ;8 samples starting 1 sample right of rsi
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm4
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm5, xmm4
+
+    psrldq      xmm1, 2                     ;xmm1 starts at sample -2
+    psrldq      xmm6, 4                     ;xmm6 starts at sample +3
+    psrldq      xmm7, 6                     ;xmm7 starts at sample +4
+    psrldq      xmm2, 4                     ;xmm2 starts at sample -1
+    psrldq      xmm3, 6                     ;xmm3 starts at sample  0
+    psrldq      xmm5, 2                     ;xmm5 starts at sample +2
+
+    HIGH_APPLY_FILTER_4 1
+
+    lea         rsi, [rsi + rax]            ;next source line
+    lea         rdi, [rdi + rdx]            ;next output line
+    dec         rcx
+    jnz         .loop                       ;one iteration per output line; the body runs at least once
+
+    add rsp, 16 * 7                         ;release the scratch slots
+    pop rsp                                 ;NOTE(review): presumably the pre-alignment rsp saved by ALIGN_STACK - confirm
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h8_avg_sse2):       ;horizontal 8-tap "avg" kernel, 8 samples per row; same loop as the non-avg 8-wide h8 kernel but invokes HIGH_APPLY_FILTER_8 with 1
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8                 ;eight 16-byte scratch slots, named below
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+    ; NOTE(review): rsi/rdi are used below but never loaded in this body - presumably HIGH_GET_FILTERS sets them; confirm
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]            ;output bytes per line
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src: eight unaligned loads, one sample (2 bytes) apart,
+    movdqu      xmm1,   [rsi - 4]           ;from 3 samples left of rsi to 4 samples right of it
+    movdqu      xmm2,   [rsi - 2]
+    movdqu      xmm3,   [rsi]
+    movdqu      xmm4,   [rsi + 2]
+    movdqu      xmm5,   [rsi + 4]
+    movdqu      xmm6,   [rsi + 6]
+    movdqu      xmm7,   [rsi + 8]
+
+    HIGH_APPLY_FILTER_8 1, 0
+
+    lea         rsi, [rsi + rax]            ;next source line
+    lea         rdi, [rdi + rdx]            ;next output line
+    dec         rcx
+    jnz         .loop                       ;one iteration per output line; the body runs at least once
+
+    add rsp, 16 * 8                         ;release the scratch slots
+    pop rsp                                 ;NOTE(review): presumably the pre-alignment rsp saved by ALIGN_STACK - confirm
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h8_avg_sse2):      ;horizontal 8-tap "avg" kernel, 16 samples per row as two 8-sample halves; invokes HIGH_APPLY_FILTER_8 with 1
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 8                 ;eight 16-byte scratch slots, named below
+    %define k0k1 [rsp + 16 * 0]
+    %define k6k7 [rsp + 16 * 1]
+    %define k2k5 [rsp + 16 * 2]
+    %define k3k4 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define temp [rsp + 16 * 5]
+    %define max [rsp + 16 * 6]
+    %define min [rsp + 16 * 7]
+
+    HIGH_GET_FILTERS
+    ; NOTE(review): rsi/rdi are used below but never loaded in this body - presumably HIGH_GET_FILTERS sets them; confirm
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    lea         rax, [rax + rax]            ;bytes per line
+    lea         rdx, [rdx + rdx]            ;output bytes per line
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 6]           ;load src: first half, eight loads one sample (2 bytes) apart
+    movdqu      xmm1,   [rsi - 4]
+    movdqu      xmm2,   [rsi - 2]
+    movdqu      xmm3,   [rsi]
+    movdqu      xmm4,   [rsi + 2]
+    movdqu      xmm5,   [rsi + 4]
+    movdqu      xmm6,   [rsi + 6]
+    movdqu      xmm7,   [rsi + 8]
+
+    HIGH_APPLY_FILTER_8 1, 0
+
+    movdqu      xmm0,   [rsi + 10]           ;load src: second half, the same pattern 16 bytes (8 samples) further on
+    movdqu      xmm1,   [rsi + 12]
+    movdqu      xmm2,   [rsi + 14]
+    movdqu      xmm3,   [rsi + 16]
+    movdqu      xmm4,   [rsi + 18]
+    movdqu      xmm5,   [rsi + 20]
+    movdqu      xmm6,   [rsi + 22]
+    movdqu      xmm7,   [rsi + 24]
+
+    HIGH_APPLY_FILTER_8 1, 16
+
+    lea         rsi, [rsi + rax]            ;next source line
+    lea         rdi, [rdi + rdx]            ;next output line
+    dec         rcx
+    jnz         .loop                       ;one iteration per output line; the body runs at least once
+
+    add rsp, 16 * 8                         ;release the scratch slots
+    pop rsp                                 ;NOTE(review): presumably the pre-alignment rsp saved by ALIGN_STACK - confirm
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
new file mode 100644
index 0000000000..72f2ff71da
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
@@ -0,0 +1,494 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+; HIGH_GET_PARAM_4: argument setup for the 4-wide kernels. Out: rsi = src, rdi = dst, xmm4 = k3/k4 word pairs, xmm3 = rounding, xmm5 = clamp max, xmm2 = clamp min (0), rax/rdx = pitches, rcx = row count.
+%macro HIGH_GET_PARAM_4 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x00000040             ;64: rounding term for the later >> 7
+
+    movdqa      xmm3, [rdx]                 ;load filters (aligned 16-byte load: 8 words)
+    pshuflw     xmm4, xmm3, 11111111b       ;k3
+    psrldq      xmm3, 8                     ;bring words 4..7 down to the low half
+    pshuflw     xmm3, xmm3, 0b              ;k4
+    punpcklwd   xmm4, xmm3                  ;k3k4
+
+    movq        xmm3, rcx                   ;rounding
+    pshufd      xmm3, xmm3, 0               ;64 in every dword
+
+    mov         rdx, 0x00010001             ;the word pair (1, 1)
+    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movq        xmm5, rdx
+    movq        xmm2, rcx                   ;shift count = bps
+    pshufd      xmm5, xmm5, 0b              ;1 in every word
+    movdqa      xmm1, xmm5                  ;keep a copy of the ones
+    psllw       xmm5, xmm2                  ;1 << bps in every word
+    psubw       xmm5, xmm1                  ;max value (for clamping)
+    pxor        xmm2, xmm2                  ;min value (for clamping)
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+    ; In: xmm0/xmm1 = the two inputs of the 2-tap filter. %1 != 0: average the result with what is already at [rdi]. Ends with "dec rcx", whose ZF the callers test with jnz.
+    punpcklwd   xmm0, xmm1                  ;interleave the two inputs word by word (low 4 words of each)
+    pmaddwd     xmm0, xmm4                  ;multiply the filter factors and add each pair
+
+    paddd       xmm0, xmm3                  ;rounding
+    psrad       xmm0, 7                     ;shift
+    packssdw    xmm0, xmm0                  ;pack to word
+
+    ;clamp the values to [min, max] = [xmm2, xmm5]
+    pminsw      xmm0, xmm5
+    pmaxsw      xmm0, xmm2
+
+%if %1
+    movq        xmm1, [rdi]                 ;the 4 samples currently in the output
+    pavgw       xmm0, xmm1                  ;rounded average of new and existing samples
+%endif
+
+    movq        [rdi], xmm0                 ;store 4 samples (8 bytes)
+    lea         rsi, [rsi + 2*rax]          ;next source line (rax is in 2-byte samples)
+    lea         rdi, [rdi + 2*rdx]          ;next output line (rdx is in 2-byte samples)
+    dec         rcx                         ;sets ZF when the last line is done; lea above leaves flags alone
+%endm
+
+%if ARCH_X86_64
+%macro HIGH_GET_PARAM 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x00000040             ;64: rounding term for the later >> 7
+    ; Setup for the 8- and 16-wide kernels; the code below leaves xmm7 = k3/k4 word pairs, xmm4 = rounding, xmm8 = clamp max, xmm5 = clamp min (0).
+    movdqa      xmm6, [rdx]                 ;load filters (aligned 16-byte load: 8 words)
+
+    pshuflw     xmm7, xmm6, 11111111b       ;k3
+    pshufhw     xmm6, xmm6, 0b              ;k4
+    psrldq      xmm6, 8                     ;move the k4 copies down to the low half
+    punpcklwd   xmm7, xmm6                  ;k3k4k3k4k3k4k3k4
+
+    movq        xmm4, rcx                   ;rounding
+    pshufd      xmm4, xmm4, 0               ;64 in every dword
+
+    mov         rdx, 0x00010001             ;the word pair (1, 1)
+    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movq        xmm8, rdx
+    movq        xmm5, rcx                   ;shift count = bps
+    pshufd      xmm8, xmm8, 0b              ;1 in every word
+    movdqa      xmm1, xmm8                  ;keep a copy of the ones
+    psllw       xmm8, xmm5                  ;1 << bps in every word
+    psubw       xmm8, xmm1                  ;max value (for clamping)
+    pxor        xmm5, xmm5                  ;min value (for clamping)
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+; HIGH_APPLY_FILTER_8: 2-tap filter of 8 samples. In: xmm0/xmm1 = the two inputs; %1 != 0 averages with [rdi]. Uses xmm6 as scratch; ends with "dec rcx" for the caller's jnz.
+%macro HIGH_APPLY_FILTER_8 1
+    movdqa      xmm6, xmm0
+    punpckhwd   xmm6, xmm1                  ;interleave the high 4 words of both inputs
+    punpcklwd   xmm0, xmm1                  ;interleave the low 4 words of both inputs
+    pmaddwd     xmm6, xmm7                  ;multiply by k3/k4 and add each pair
+    pmaddwd     xmm0, xmm7
+
+    paddd       xmm6, xmm4                  ;rounding
+    paddd       xmm0, xmm4                  ;rounding
+    psrad       xmm6, 7                     ;shift
+    psrad       xmm0, 7                     ;shift
+    packssdw    xmm0, xmm6                  ;pack back to word
+
+    ;clamp the values to [min, max] = [xmm5, xmm8]
+    pminsw      xmm0, xmm8
+    pmaxsw      xmm0, xmm5
+
+%if %1
+    movdqu      xmm1, [rdi]                 ;the 8 samples currently in the output
+    pavgw       xmm0, xmm1                  ;rounded average of new and existing samples
+%endif
+    movdqu      [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + 2*rax]          ;next source line (rax is in 2-byte samples)
+    lea         rdi, [rdi + 2*rdx]          ;next output line (rdx is in 2-byte samples)
+    dec         rcx                         ;sets ZF when the last line is done; lea above leaves flags alone
+%endm
+; HIGH_APPLY_FILTER_16: 2-tap filter of 16 samples. In: xmm0/xmm1 = inputs for samples 0-7, xmm2/xmm3 = inputs for samples 8-15; %1 != 0 averages with [rdi]. Uses xmm6 and xmm9 as scratch.
+%macro HIGH_APPLY_FILTER_16 1
+    movdqa      xmm9, xmm0
+    movdqa      xmm6, xmm2
+    punpckhwd   xmm9, xmm1                  ;interleave the high 4 words of each input pair
+    punpckhwd   xmm6, xmm3
+    punpcklwd   xmm0, xmm1                  ;interleave the low 4 words of each input pair
+    punpcklwd   xmm2, xmm3
+
+    pmaddwd     xmm9, xmm7                  ;multiply by k3/k4 and add each pair
+    pmaddwd     xmm6, xmm7
+    pmaddwd     xmm0, xmm7
+    pmaddwd     xmm2, xmm7
+
+    paddd       xmm9, xmm4                  ;rounding
+    paddd       xmm6, xmm4
+    paddd       xmm0, xmm4
+    paddd       xmm2, xmm4
+
+    psrad       xmm9, 7                     ;shift
+    psrad       xmm6, 7
+    psrad       xmm0, 7
+    psrad       xmm2, 7
+
+    packssdw    xmm0, xmm9                  ;pack back to word
+    packssdw    xmm2, xmm6                  ;pack back to word
+
+    ;clamp the values to [min, max] = [xmm5, xmm8]
+    pminsw      xmm0, xmm8
+    pmaxsw      xmm0, xmm5
+    pminsw      xmm2, xmm8
+    pmaxsw      xmm2, xmm5
+
+%if %1
+    movdqu      xmm1, [rdi]                 ;the 16 samples currently in the output
+    movdqu      xmm3, [rdi + 16]
+    pavgw       xmm0, xmm1                  ;rounded average of new and existing samples
+    pavgw       xmm2, xmm3
+%endif
+    movdqu      [rdi], xmm0               ;store the result
+    movdqu      [rdi + 16], xmm2          ;store the result
+
+    lea         rsi, [rsi + 2*rax]          ;next source line (rax is in 2-byte samples)
+    lea         rdi, [rdi + 2*rdx]          ;next output line (rdx is in 2-byte samples)
+    dec         rcx                         ;sets ZF when the last line is done; lea above leaves flags alone
+%endm
+%endif
+
+global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v2_sse2):           ;vertical 2-tap (bilinear) filter, 4 samples per row
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM_4
+.loop:
+    movq        xmm0, [rsi]                 ;load src
+    movq        xmm1, [rsi + 2*rax]         ;the same 4 samples one line down
+
+    HIGH_APPLY_FILTER_4 0
+    jnz         .loop                       ;ZF comes from the macro's final "dec rcx"
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%if ARCH_X86_64
+global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v2_sse2):           ;vertical 2-tap (bilinear) filter, 8 samples per row; x86-64 only (HIGH_GET_PARAM uses xmm8)
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 8
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;0
+    movdqu      xmm1, [rsi + 2*rax]         ;1
+
+    HIGH_APPLY_FILTER_8 0
+    jnz         .loop                       ;ZF comes from the macro's final "dec rcx"
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v2_sse2):          ;vertical 2-tap (bilinear) filter, 16 samples per row; x86-64 only (uses xmm8/xmm9)
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;0
+    movdqu        xmm2, [rsi + 16]          ;line 0, samples 8-15
+    movdqu        xmm1, [rsi + 2*rax]       ;1
+    movdqu        xmm3, [rsi + 2*rax + 16]  ;line 1, samples 8-15
+
+    HIGH_APPLY_FILTER_16 0
+    jnz         .loop                       ;ZF comes from the macro's final "dec rcx"
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endif
+
+global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v2_avg_sse2):       ;vertical 2-tap (bilinear) filter, 4 samples per row, averaged with the existing output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM_4
+.loop:
+    movq        xmm0, [rsi]                 ;load src
+    movq        xmm1, [rsi + 2*rax]         ;the same 4 samples one line down
+
+    HIGH_APPLY_FILTER_4 1
+    jnz         .loop                       ;ZF comes from the macro's final "dec rcx"
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%if ARCH_X86_64
+global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v2_avg_sse2):       ;vertical 2-tap (bilinear) filter, 8 samples per row, averaged with the existing output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 8
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;0
+    movdqu      xmm1, [rsi + 2*rax]         ;1
+
+    HIGH_APPLY_FILTER_8 1
+    jnz         .loop                       ;ZF comes from the macro's final "dec rcx"
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v2_avg_sse2):      ;vertical 2-tap (bilinear) filter, 16 samples per row, averaged with the existing output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;0
+    movdqu        xmm1, [rsi + 2*rax]       ;1
+    movdqu        xmm2, [rsi + 16]          ;line 0, samples 8-15
+    movdqu        xmm3, [rsi + 2*rax + 16]  ;line 1, samples 8-15
+
+    HIGH_APPLY_FILTER_16 1
+    jnz         .loop                       ;ZF comes from the macro's final "dec rcx"
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endif
+
+global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h2_sse2):           ;horizontal 2-tap (bilinear) filter, 4 samples per row
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 2                     ;shift by one 2-byte sample: xmm1 = right-hand neighbours
+
+    HIGH_APPLY_FILTER_4 0
+    jnz         .loop                       ;ZF comes from the macro's final "dec rcx"
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%if ARCH_X86_64
+global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h2_sse2):           ;horizontal 2-tap (bilinear) filter, 8 samples per row; x86-64 only (HIGH_GET_PARAM uses xmm8)
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 8
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqu      xmm1, [rsi + 2]             ;the same row, one sample (2 bytes) to the right
+
+    HIGH_APPLY_FILTER_8 0
+    jnz         .loop                       ;ZF comes from the macro's final "dec rcx"
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h2_sse2):          ;horizontal 2-tap (bilinear) filter, 16 samples per row; x86-64 only (uses xmm8/xmm9)
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 2]           ;samples 0-7 shifted one sample right
+    movdqu      xmm2,   [rsi + 16]          ;samples 8-15
+    movdqu      xmm3,   [rsi + 18]          ;samples 8-15 shifted one sample right
+
+    HIGH_APPLY_FILTER_16 0
+    jnz         .loop                       ;ZF comes from the macro's final "dec rcx"
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endif
+
+global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h2_avg_sse2):       ;horizontal 2-tap (bilinear) filter, 4 samples per row, averaged with the existing output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 2                     ;shift by one 2-byte sample: xmm1 = right-hand neighbours
+
+    HIGH_APPLY_FILTER_4 1
+    jnz         .loop                       ;ZF comes from the macro's final "dec rcx"
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%if ARCH_X86_64
+global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h2_avg_sse2):       ;horizontal 2-tap (bilinear) filter, 8 samples per row, averaged with the existing output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 8
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src
+    movdqu      xmm1, [rsi + 2]             ;the same row, one sample (2 bytes) to the right
+
+    HIGH_APPLY_FILTER_8 1
+    jnz         .loop                       ;ZF comes from the macro's final "dec rcx"
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h2_avg_sse2):      ;horizontal 2-tap (bilinear) filter, 16 samples per row, averaged with the existing output
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    HIGH_GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 2]           ;samples 0-7 shifted one sample right
+    movdqu      xmm2,   [rsi + 16]          ;samples 8-15
+    movdqu      xmm3,   [rsi + 18]          ;samples 8-15 shifted one sample right
+
+    HIGH_APPLY_FILTER_16 1
+    jnz         .loop                       ;ZF comes from the macro's final "dec rcx"
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endif
diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
new file mode 100644
index 0000000000..b718678537
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -0,0 +1,605 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Due to a header conflict between math.h and intrinsics includes with ceil()
+// in certain configurations under vs9 this include needs to precede
+// immintrin.h.
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_ports/mem.h"
+
+// byte-shuffle control masks (fed to _mm256_shuffle_epi8) for 16_h8 and 16_v8
+DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,  // byte pairs (i, i+1), i = 0..7: multiplied by taps 0-1
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8   // same mask repeated for the upper 128-bit lane
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,  // byte pairs (i+2, i+3): multiplied by taps 2-3
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10   // same mask repeated for the upper 128-bit lane
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,  // byte pairs (i+4, i+5): multiplied by taps 4-5
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12   // same mask repeated for the upper 128-bit lane
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,  // byte pairs (i+6, i+7): multiplied by taps 6-7
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14   // same mask repeated for the upper 128-bit lane
+};
+
+#if defined(__clang__)
+# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
+    (defined(__APPLE__) && \
+        ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
+            (__clang_major__ == 5 && __clang_minor__ == 0)))
+// MM256_BROADCASTSI128_SI256(x) wraps the 128->256-bit broadcast; these clang versions use the _mm_ spelling and pass a pointer.
+#  define MM256_BROADCASTSI128_SI256(x) \
+       _mm_broadcastsi128_si256((__m128i const *)&(x))
+# else  // clang > 3.3, and not 5.0 on macosx.
+#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+# endif  // clang <= 3.3
+#elif defined(__GNUC__)
+# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)  // gcc <= 4.6: _mm_ spelling, pointer argument
+#  define MM256_BROADCASTSI128_SI256(x) \
+       _mm_broadcastsi128_si256((__m128i const *)&(x))
+# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7  // gcc 4.7: _mm_ spelling, value argument
+#  define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
+# else  // gcc > 4.7
+#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+# endif  // gcc version checks
+#else  // !(gcc || clang)
+# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif  // __clang__
+
+static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pixels_per_line,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t output_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+  __m256i srcReg32b1, srcReg32b2, filtersReg32;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 256 bit register
+  firstFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and forth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32,
+                  _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x504u));
+  // duplicate only the forth 16 bits (seventh and eighth byte)
+  // across 256 bit register
+  forthFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x706u));
+
+  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+  // multiple the size of the source and destination stride by two
+  src_stride = src_pixels_per_line << 1;
+  dst_stride = output_pitch << 1;
+  for (i = output_height; i > 1; i-=2) {
+    // load the 2 strides of source
+    srcReg32b1 = _mm256_castsi128_si256(
+                 _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
+    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
+                 _mm_loadu_si128((const __m128i *)
+                 (src_ptr+src_pixels_per_line-3)), 1);
+
+    // filter the source buffer
+    srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+    // filter the source buffer
+    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
+                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+    // reading 2 strides of the next 16 bytes
+    // (part of it was being read by earlier read)
+    srcReg32b2 = _mm256_castsi128_si256(
+                 _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
+    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
+                 _mm_loadu_si128((const __m128i *)
+                 (src_ptr+src_pixels_per_line+5)), 1);
+
+    // add and saturate the results together
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
+                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+    // filter the source buffer
+    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
+
+    // filter the source buffer
+    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
+                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
+                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
+
+    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
+    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
+                                           srcRegFilt32b2_1);
+
+    src_ptr+=src_stride;
+
+    // save 16 bytes
+    _mm_store_si128((__m128i*)output_ptr,
+    _mm256_castsi256_si128(srcRegFilt32b1_1));
+
+    // save the next 16 bits
+    _mm_store_si128((__m128i*)(output_ptr+output_pitch),
+    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+    output_ptr+=dst_stride;
+  }
+
+  // if the number of strides is odd.
+  // process only 16 bytes
+  if (i > 0) {
+    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
+    __m128i srcRegFilt2, srcRegFilt3;
+
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
+                    _mm256_castsi256_si128(filt1Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
+                  _mm256_castsi256_si128(filt4Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
+                    _mm256_castsi256_si128(firstFilters));
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+                  _mm256_castsi256_si128(forthFilters));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3= _mm_shuffle_epi8(srcReg1,
+                 _mm256_castsi256_si128(filt2Reg));
+    srcRegFilt2= _mm_shuffle_epi8(srcReg1,
+                 _mm256_castsi256_si128(filt3Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
+                  _mm256_castsi256_si128(secondFilters));
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+                  _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+
+    // reading the next 16 bytes
+    // (part of it was being read by earlier read)
+    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+    // add and saturate the results together
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+    // filter the source buffer
+    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
+                    _mm256_castsi256_si128(filt1Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
+                  _mm256_castsi256_si128(filt4Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
+                    _mm256_castsi256_si128(firstFilters));
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+                  _mm256_castsi256_si128(forthFilters));
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+    // filter the source buffer
+    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
+                  _mm256_castsi256_si128(filt2Reg));
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
+                  _mm256_castsi256_si128(filt3Reg));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
+                  _mm256_castsi256_si128(secondFilters));
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
+                  _mm256_castsi256_si128(thirdFilters));
+
+    // add and saturate the results together
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
+
+
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
+                    _mm256_castsi256_si128(addFilterReg64));
+
+    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
+                    _mm256_castsi256_si128(addFilterReg64));
+
+    // shift by 7 bit each 16 bit
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+    // save 16 bytes
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
+  }
+}
+
+static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pitch,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t out_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg64;
+  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+  __m256i srcReg32b11, srcReg32b12, filtersReg32;
+  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+  // 8-tap vertical filter on a 16-byte-wide column, two output rows per pass.
+  // Rounding term: 64 in every 16-bit lane, added before the >> 7 below.
+  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // narrow the eight 16-bit taps to signed bytes (with saturation); both
+  // 64-bit halves of the 128-bit register end up with the same 8 bytes.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 256 bit register
+  firstFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across 256 bit register
+  secondFilters = _mm256_shuffle_epi8(filtersReg32,
+                  _mm256_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 256 bit register
+  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across 256 bit register
+  forthFilters = _mm256_shuffle_epi8(filtersReg32,
+                 _mm256_set1_epi16(0x706u));
+
+  // multiply the source and destination strides by two (two rows per pass)
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  // load 16 bytes from each of the first 7 rows (src_pitch apart)
+  srcReg32b1 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr)));
+  srcReg32b2 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
+  srcReg32b3 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
+  srcReg32b4 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
+  srcReg32b5 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+  srcReg32b6 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+  srcReg32b7 = _mm256_castsi128_si256(
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+
+  // pair consecutive rows: row n in the low lane, row n+1 in the high lane
+  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
+               _mm256_castsi256_si128(srcReg32b2), 1);
+  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
+               _mm256_castsi256_si128(srcReg32b3), 1);
+  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
+               _mm256_castsi256_si128(srcReg32b4), 1);
+  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
+               _mm256_castsi256_si128(srcReg32b5), 1);
+  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
+               _mm256_castsi256_si128(srcReg32b6), 1);
+  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
+               _mm256_castsi256_si128(srcReg32b7), 1);
+
+  // interleave the bytes of every two consecutive registers except the last
+  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
+
+  // rows 2,3 interleaved (high lane: rows 3,4), columns 0-7
+  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+
+  // rows 2,3 interleaved (high lane: rows 3,4), columns 8-15
+  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
+
+  // rows 4,5 interleaved (high lane: rows 5,6), columns 0-7
+  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
+  // rows 4,5 interleaved (high lane: rows 5,6), columns 8-15
+  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+
+
+  for (i = output_height; i > 1; i-=2) {
+     // load rows 7 and 8 (16 bytes each) and pair them so that
+     // srcReg32b7 holds rows 6|7 and srcReg32b8 holds rows 7|8
+     srcReg32b8 = _mm256_castsi128_si256(
+     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
+     srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+     _mm256_castsi256_si128(srcReg32b8), 1);
+     srcReg32b9 = _mm256_castsi128_si256(
+     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
+     srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+     _mm256_castsi256_si128(srcReg32b9), 1);
+
+     // interleave rows 6,7 (high lane: rows 7,8):
+     // srcReg32b4 gets columns 0-7, srcReg32b7 gets columns 8-15
+     srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+     srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
+
+     // columns 0-7: multiply adjacent bytes by the tap pairs and add them
+     srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+     // add and saturate the results together
+     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+     // saturating add of the two middle terms: the smaller one first
+     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
+     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+                   _mm256_max_epi16(srcReg32b8, srcReg32b12));
+
+     // columns 8-15: multiply adjacent bytes by the tap pairs and add them
+     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+     // multiply 2 adjacent elements with the filter and add the result
+     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+     // saturating add of the two middle terms: the smaller one first
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+                  _mm256_max_epi16(srcReg32b8, srcReg32b12));
+
+     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
+     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
+
+     // arithmetic shift right by 7 bits in each 16-bit lane
+     srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
+     srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
+
+     // pack each 16-bit value to 8 bits with unsigned saturation; the low
+     // lane then contains the first output row and the high lane contains
+     // the second output row (16 bytes each)
+     srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+
+     src_ptr+=src_stride;
+
+     // save 16 bytes (aligned store: output_ptr must be 16-byte aligned)
+     _mm_store_si128((__m128i*)output_ptr,
+     _mm256_castsi256_si128(srcReg32b1));
+
+     // save the next 16 bytes (second output row, also an aligned store)
+     _mm_store_si128((__m128i*)(output_ptr+out_pitch),
+     _mm256_extractf128_si256(srcReg32b1, 1));
+
+     output_ptr+=dst_stride;
+
+     // slide the interleaved row pairs down by two rows for the next pass
+     srcReg32b10 = srcReg32b11;
+     srcReg32b1 = srcReg32b3;
+     srcReg32b11 = srcReg32b2;
+     srcReg32b3 = srcReg32b5;
+     srcReg32b2 = srcReg32b4;
+     srcReg32b5 = srcReg32b7;
+     srcReg32b7 = srcReg32b9;
+  }
+  if (i > 0) {
+    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
+    // odd output_height: one row is left; load its last (8th) source row
+    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+    // interleave rows 6 and 7: columns 0-7, then columns 8-15
+    srcRegFilt4 = _mm_unpacklo_epi8(
+                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+    srcRegFilt7 = _mm_unpackhi_epi8(
+                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+                  _mm256_castsi256_si128(firstFilters));
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
+                  _mm256_castsi256_si128(forthFilters));
+    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
+                  _mm256_castsi256_si128(firstFilters));
+    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
+                  _mm256_castsi256_si128(forthFilters));
+
+    // add and saturate the results together
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
+
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+                  _mm256_castsi256_si128(secondFilters));
+    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
+                  _mm256_castsi256_si128(secondFilters));
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+                  _mm256_castsi256_si128(thirdFilters));
+    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
+                  _mm256_castsi256_si128(thirdFilters));
+
+    // saturating add of the smaller of the two middle terms first
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                  _mm_min_epi16(srcRegFilt4, srcRegFilt6));
+    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+                  _mm_min_epi16(srcRegFilt5, srcRegFilt7));
+
+    // then the saturating add of the larger of the two middle terms
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                  _mm_max_epi16(srcRegFilt4, srcRegFilt6));
+    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+                  _mm_max_epi16(srcRegFilt5, srcRegFilt7));
+
+
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
+                  _mm256_castsi256_si128(addFilterReg64));
+    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
+                  _mm256_castsi256_si128(addFilterReg64));
+
+    // arithmetic shift right by 7 bits in each 16-bit lane
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
+
+    // pack each 16-bit value to 8 bits with unsigned saturation: the low
+    // 8 bytes come from srcRegFilt1 (columns 0-7) and the high 8 bytes
+    // come from srcRegFilt3 (columns 8-15)
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+
+    // save 16 bytes (aligned store: output_ptr must be 16-byte aligned)
+    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
+  }
+}
+
+#if HAVE_AVX2 && HAVE_SSSE3
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+#if ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
+#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
+#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
+#else  // ARCH_X86
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
+#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
+#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
+#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
+#endif  // ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
+#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
+#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
+#define vpx_filter_block1d8_v2_avx2  vpx_filter_block1d8_v2_ssse3
+#define vpx_filter_block1d8_h2_avx2  vpx_filter_block1d8_h2_ssse3
+#define vpx_filter_block1d4_v2_avx2  vpx_filter_block1d4_v2_ssse3
+#define vpx_filter_block1d4_h2_avx2  vpx_filter_block1d4_h2_ssse3
+// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+
+// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                          uint8_t *dst, ptrdiff_t dst_stride,
+//                          const int16_t *filter_x, int x_step_q4,
+//                          const int16_t *filter_y, int y_step_q4,
+//                          int w, int h);
+FUN_CONV_2D(, avx2);
+#endif  // HAVE_AVX2 && HAVE_SSSE3
diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 0000000000..6fd52087c7
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,915 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Due to a header conflict between math.h and intrinsics includes with ceil()
+// in certain configurations under vs9 this include needs to precede
+// tmmintrin.h. NOTE(review): nothing precedes tmmintrin.h below -- confirm.
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+// byte-shuffle masks (not tap values) used only by the 4_h8 convolution
+DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+// byte-shuffle masks (adjacent source byte pairs) for 8_h8 and 16_h8
+DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+// Non-static on purpose: these are reused by the avx2 intrinsics.
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+
+void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pixels_per_line,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t output_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
+  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, srcReg, minReg;
+  unsigned int i;
+  // 8-tap horizontal filter: 4 output bytes per row, output_height rows.
+  // Rounding term: 64 in every 16-bit lane, added before the >> 7 below.
+  addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // narrow the eight 16-bit taps to signed bytes (with saturation); both
+  // 64-bit halves of the 128-bit register end up with the same 8 bytes.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter into the low 64 bits
+  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+  // duplicate only the third 16 bits in the filter into the low 64 bits
+  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+  // duplicate only the second 16 bits in the filter into the high 64 bits
+  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
+  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+  // duplicate only the fourth 16 bits in the filter into the high 64 bits
+  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
+  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+  // load the byte-shuffle masks for the 4-wide case (aligned loads)
+  shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
+  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // gather the adjacent source byte pairs that each tap pair multiplies
+    srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // move the upper 8 bytes (k2k3 / k6k7 products) down to the low half
+    srcRegFilt3 =  _mm_srli_si128(srcRegFilt1, 8);
+    srcRegFilt4 =  _mm_srli_si128(srcRegFilt2, 8);
+
+    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+    // saturating adds; the two middle terms go in as min first, then max
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // arithmetic shift right by 7 bits in each 16-bit lane
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // pack each 16-bit value to 8 bits with unsigned saturation
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+    src_ptr+=src_pixels_per_line;
+
+    // save only the low 4 bytes (written through an int pointer)
+    *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pixels_per_line,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t output_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, minReg;
+  unsigned int i;
+  // 8-tap horizontal filter: 8 output bytes per row, output_height rows.
+  // Rounding term: 64 in every 16-bit lane, added before the >> 7 below.
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // narrow the eight 16-bit taps to signed bytes (with saturation); both
+  // 64-bit halves of the 128-bit register end up with the same 8 bytes.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across 128 bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across 128 bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 128 bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across 128 bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // gather the adjacent source byte pairs for the first two tap pairs
+    srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
+    srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // gather the adjacent source byte pairs for the last two tap pairs
+    srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
+    srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+    // saturating adds; the two middle terms go in as min first, then max
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+    srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // arithmetic shift right by 7 bits in each 16-bit lane
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // pack each 16-bit value to 8 bits with unsigned saturation
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr+=src_pixels_per_line;
+
+    // save only the low 8 bytes
+    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+    output_ptr+=output_pitch;
+  }
+}
+
+void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pitch,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t out_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, minReg;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+  __m128i srcReg8;
+  unsigned int i;
+  // 8-tap vertical filter: 8 output bytes per row, output_height rows.
+  // Rounding term: 64 in every 16-bit lane, added before the >> 7 below.
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // narrow the eight 16-bit taps to signed bytes (with saturation); both
+  // 64-bit halves of the 128-bit register end up with the same 8 bytes.
+  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits in the filter
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits in the filter
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits in the filter
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  // load the first 7 rows of 8 bytes
+  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+  for (i = 0; i < output_height; i++) {
+    // load 8 bytes of the 8th (last) row of the current 8-row window
+    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+    // interleave the bytes of rows 0,1 and of rows 2,3
+    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+    // interleave the bytes of rows 4,5 and of rows 6,7
+    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+    // saturating adds; the two middle terms go in as min first, then max
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // arithmetic shift right by 7 bits in each 16-bit lane
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // pack each 16-bit value to 8 bits with unsigned saturation
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr+=src_pitch;
+
+    // slide the 8-row window down by one row
+    srcReg1 = srcReg2;
+    srcReg2 = srcReg3;
+    srcReg3 = srcReg4;
+    srcReg4 = srcReg5;
+    srcReg5 = srcReg6;
+    srcReg6 = srcReg7;
+    srcReg7 = srcReg8;
+
+    // save only the low 8 bytes of the convolve result
+    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
+
+    output_ptr+=out_pitch;
+  }
+}
+
+filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
+
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
+// Prototypes presumably emitted by the FUN_CONV_1D expansions below (confirm):
+// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const int16_t *filter_x, int x_step_q4,
+//                                    const int16_t *filter_y, int y_step_q4,
+//                                    int w, int h);
+// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const int16_t *filter_x, int x_step_q4,
+//                                   const int16_t *filter_y, int y_step_q4,
+//                                   int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+            ssse3);
+
+// Byte-transposes the low halves of in0..in7; outN = column N (in both halves).
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,           \
+                      out0, out1, out2, out3, out4, out5, out6, out7)   \
+  do { /* do/while (0): one statement when followed by ';' */           \
+    const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                  \
+    const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                  \
+    const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                  \
+    const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                  \
+    const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);             \
+    const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);             \
+    const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);             \
+    const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);             \
+    const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);             \
+    const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);             \
+    const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);             \
+    const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);             \
+    /* every input has been read by now, so outN may alias inN */       \
+    out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                            \
+    out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                            \
+    out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                            \
+    out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                            \
+    out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                            \
+    out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                            \
+    out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                            \
+    out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                            \
+  } while (0)
+// Filters 8 rows at one x position; dst[i] receives the result for row i.
+static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
+                                  uint8_t *dst, const int16_t *x_filter) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
+  // aligned load above; repeat the low bytes of each tap pair in every lane
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
+  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
+  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
+  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
+  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
+  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
+  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
+  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
+  // (row, column digits) 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
+  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
+  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
+  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
+  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
+  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
+  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
+  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
+  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
+  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
+  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
+  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
+  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
+  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
+  // maddubs: unsigned pixel pair times signed tap pair, summed into 16 bits
+  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+  // saturating adds; of x1 and x2 the smaller is added before the larger
+  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, min_x2x1);
+  temp = _mm_adds_epi16(temp, max_x2x1);
+  // mulhrs by 256 == (temp + 64) >> 7: round and shift each 16-bit lane
+  temp = _mm_mulhrs_epi16(temp, k_256);
+  // pack to 8 bits with unsigned saturation
+  temp = _mm_packus_epi16(temp, temp);
+  // store the low 8 bytes: one filtered pixel per source row
+  _mm_storel_epi64((__m128i*)dst, temp);
+}
+
+// Writes the transpose of the 8x8 byte block at src to the 8x8 block at dst.
+// Rows are src_stride bytes apart in src and dst_stride bytes apart in dst;
+// exactly 8 bytes of each row are read or written, using loads and stores
+// that do not require alignment.
+static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride) {
+  __m128i r[8];
+  int i;
+
+  // Load the eight source rows into the low halves of r[0..7].
+  for (i = 0; i < 8; ++i) {
+    r[i] = _mm_loadl_epi64((const __m128i *)(src + src_stride * i));
+  }
+
+  // Transpose in place. The macro reads all of its inputs before it assigns
+  // any output, so passing the same registers for both is safe.
+  TRANSPOSE_8X8(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7],
+                r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]);
+  // Afterwards r[i] holds source column i.
+
+  // Store the low 8 bytes of each register as one destination row.
+  for (i = 0; i < 8; ++i) {
+    _mm_storel_epi64((__m128i *)(dst + dst_stride * i), r[i]);
+  }
+}
+
+// Horizontal pass of the scaled convolve for w >= 8. Works on 8x8 tiles: eight
+// output columns are filtered (or copied, at full-pel positions) into a tile
+// as rows, and the tile is then transposed into dst.
+static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *x_filters,
+                                    int x0_q4, int x_step_q4, int w, int h) {
+  DECLARE_ALIGNED(16, uint8_t, tile[8 * 8]);
+  int col, k, rows_left;
+
+  // Step back to the first tap of the filter window.
+  src -= SUBPEL_TAPS / 2 - 1;
+  // Rows are consumed eight at a time, so extend h to a multiple of 8. Note
+  // that a further 8 rows are added when h is already a multiple of 8.
+  rows_left = h + (8 - (h & 0x7));
+  do {
+    int x_q4 = x0_q4;
+    for (col = 0; col < w; col += 8) {
+      // Each of the 8 output columns becomes one 8-byte row of tile.
+      for (k = 0; k < 8; ++k, x_q4 += x_step_q4) {
+        const int phase = x_q4 & SUBPEL_MASK;
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        uint8_t *const out = tile + k * 8;
+        if (phase) {
+          filter_horiz_w8_ssse3(src_x, src_stride, out, x_filters[phase]);
+        } else {
+          // Full-pel position: plain copy from offset 3 of the tap window.
+          int i;
+          for (i = 0; i < 8; ++i) out[i] = src_x[i * src_stride + 3];
+        }
+      }
+      transpose8x8_to_dst(tile, 8, dst + col, dst_stride);
+    }
+    src += src_stride * 8;
+    dst += dst_stride * 8;
+    rows_left -= 8;
+  } while (rows_left != 0);
+}
+
+// Filters 4 rows at one x position with an 8-tap kernel; dst[i] receives the
+// result for row i (4 bytes are written). filter is read with an aligned
+// 16-byte load and only the low byte of each tap is used.
+static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                  uint8_t *dst, const int16_t *filter) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
+  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  // TRANSPOSE...
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  // TO
+  // 00 10 20 30
+  // 01 11 21 31
+  // 02 12 22 32
+  // 03 13 23 33
+  // 04 14 24 34
+  // 05 15 25 35
+  // 06 16 26 36
+  // 07 17 27 37
+  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
+  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
+  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
+  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
+  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
+  const __m128i s1s0  = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
+  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  // 02 03 12 13 22 23 32 33
+  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
+  // 06 07 16 17 26 27 36 37
+  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+  // add and saturate the results together (smaller of x1/x2 first)
+  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+  const __m128i sum03 = _mm_adds_epi16(x0, x3);
+  const __m128i sum_min = _mm_adds_epi16(sum03, min_x2x1);
+  const __m128i sum = _mm_adds_epi16(sum_min, max_x2x1);
+  // round and shift by 7 bit each 16 bit
+  const __m128i rounded = _mm_mulhrs_epi16(sum, k_256);
+  // shrink to 8 bit each 16 bits and save only 4 bytes; memcpy() replaces the
+  // former *(int *)dst store, which type-punned the byte buffer
+  const int out = _mm_cvtsi128_si32(_mm_packus_epi16(rounded, rounded));
+  memcpy(dst, &out, sizeof(out));
+}
+
+// Transposes the 4x4 byte block at src into the 4x4 byte block at dst.
+static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride) {
+  __m128i lo, hi, cols;
+  int r[4], i;
+  // Move each 4-byte row through memcpy() instead of an *(int *) cast, which
+  // assumed 4-byte alignment and broke the C aliasing rules.
+  for (i = 0; i < 4; ++i) memcpy(&r[i], src + i * src_stride, sizeof(r[i]));
+  // 00 10 01 11 02 12 03 13
+  lo = _mm_unpacklo_epi8(_mm_cvtsi32_si128(r[0]), _mm_cvtsi32_si128(r[1]));
+  // 20 30 21 31 22 32 23 33
+  hi = _mm_unpacklo_epi8(_mm_cvtsi32_si128(r[2]), _mm_cvtsi32_si128(r[3]));
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  cols = _mm_unpacklo_epi16(lo, hi);
+  for (i = 0; i < 4; ++i) {
+    // the low 32 bits hold source column i, which is destination row i
+    const int out = _mm_cvtsi128_si32(cols);
+    memcpy(dst + i * dst_stride, &out, sizeof(out));
+    cols = _mm_srli_si128(cols, 4);
+  }
+}
+
+// Horizontal pass of the scaled convolve for w < 8, working on 4x4 tiles: four
+// output columns are filtered (or copied, at full-pel positions) into a tile
+// as rows and the tile is then transposed into dst.
+static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *x_filters,
+                                    int x0_q4, int x_step_q4, int w, int h) {
+  DECLARE_ALIGNED(16, uint8_t, tile[4 * 4]);
+  int row, col, k;
+
+  // Step back to the first tap of the filter window.
+  src -= SUBPEL_TAPS / 2 - 1;
+  for (row = 0; row < h; row += 4) {
+    int x_q4 = x0_q4;
+    for (col = 0; col < w; col += 4) {
+      // Each of the 4 output columns becomes one 4-byte row of tile.
+      for (k = 0; k < 4; ++k, x_q4 += x_step_q4) {
+        const int phase = x_q4 & SUBPEL_MASK;
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        uint8_t *const out = tile + k * 4;
+        if (phase) {
+          filter_horiz_w4_ssse3(src_x, src_stride, out, x_filters[phase]);
+        } else {
+          // Full-pel position: plain copy from offset 3 of the tap window.
+          int i;
+          for (i = 0; i < 4; ++i) out[i] = src_x[i * src_stride + 3];
+        }
+      }
+      transpose4x4_to_dst(tile, 4, dst + col, dst_stride);
+    }
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+  }
+}
+
+// Applies an 8-tap vertical filter to 4 adjacent pixels: reads 4 bytes from
+// each of 8 consecutive rows starting at src_ptr (src_pitch bytes apart) and
+// writes 4 filtered bytes to dst. filter is read with an aligned 16-byte load
+// and only the low byte of each tap is used.
+static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                 uint8_t *dst, const int16_t *filter) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  __m128i px[8], x0, x1, x2, x3, sum;
+  int i, out;
+
+  // Fetch the 4-byte rows with memcpy(): src_ptr and dst carry no alignment
+  // guarantee, so dereferencing them through an int pointer is undefined.
+  for (i = 0; i < 8; ++i) {
+    int row;
+    memcpy(&row, src_ptr + i * src_pitch, sizeof(row));
+    px[i] = _mm_cvtsi32_si128(row);
+  }
+  // interleave vertically adjacent rows, then multiply each pixel pair with
+  // its tap pair and add the two products
+  x0 = _mm_maddubs_epi16(_mm_unpacklo_epi8(px[0], px[1]), f1f0);
+  x1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(px[2], px[3]), f3f2);
+  x2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(px[4], px[5]), f5f4);
+  x3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(px[6], px[7]), f7f6);
+  // add and saturate the results together (smaller of x1/x2 first)
+  sum = _mm_adds_epi16(x0, x3);
+  sum = _mm_adds_epi16(sum, _mm_min_epi16(x2, x1));
+  sum = _mm_adds_epi16(sum, _mm_max_epi16(x2, x1));
+  // round and shift by 7 bit each 16 bit
+  sum = _mm_mulhrs_epi16(sum, k_256);
+  // shrink to 8 bit each 16 bits and save only 4 bytes
+  out = _mm_cvtsi128_si32(_mm_packus_epi16(sum, sum));
+  memcpy(dst, &out, sizeof(out));
+}
+
+// Vertical pass of the scaled convolve for 4-wide output: each dst row is
+// filtered at fractional y positions and copied at full-pel positions.
+static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *y_filters,
+                                   int y0_q4, int y_step_q4, int w, int h) {
+  int row, y_q4;
+  // Step back to the first tap of the filter window.
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (row = 0, y_q4 = y0_q4; row < h; ++row, y_q4 += y_step_q4) {
+    const int phase = y_q4 & SUBPEL_MASK;
+    const uint8_t *const src_y = src + (y_q4 >> SUBPEL_BITS) * src_stride;
+    uint8_t *const dst_row = dst + row * dst_stride;
+    if (phase == 0) {
+      // Full-pel position: copy w bytes from row 3 of the tap window.
+      memcpy(dst_row, src_y + 3 * src_stride, w);
+    } else {
+      filter_vert_w4_ssse3(src_y, src_stride, dst_row, y_filters[phase]);
+    }
+  }
+}
+// 8-tap vertical filter of 8 adjacent pixels; reads 8 rows, writes 8 bytes.
+static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                 uint8_t *dst, const int16_t *filter) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // aligned load above; repeat the low bytes of each tap pair in every lane
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
+  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
+  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
+  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
+  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
+  // maddubs: unsigned pixel pair times signed tap pair, summed into 16 bits
+  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+  // saturating adds; of x1 and x2 the smaller is added before the larger
+  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, min_x2x1);
+  temp = _mm_adds_epi16(temp, max_x2x1);
+  // mulhrs by 256 == (temp + 64) >> 7: round and shift each 16-bit lane
+  temp = _mm_mulhrs_epi16(temp, k_256);
+  // pack to 8 bits with unsigned saturation
+  temp = _mm_packus_epi16(temp, temp);
+  // store the low 8 bytes: the 8 filtered pixels
+  _mm_storel_epi64((__m128i*)dst, temp);
+}
+
+// Vertical pass of the scaled convolve for 8-wide output: each dst row is
+// either filtered (fractional y) or copied (full-pel y) from the source.
+static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *y_filters,
+                                   int y0_q4, int y_step_q4, int w, int h) {
+  int row;
+  int y_q4 = y0_q4;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);  // first tap of the window
+  for (row = 0; row < h; ++row, y_q4 += y_step_q4, dst += dst_stride) {
+    const int phase = y_q4 & SUBPEL_MASK;
+    const uint8_t *const src_y = src + (y_q4 >> SUBPEL_BITS) * src_stride;
+    if (phase) {
+      filter_vert_w8_ssse3(src_y, src_stride, dst, y_filters[phase]);
+    } else {
+      memcpy(dst, src_y + 3 * src_stride, w);
+    }
+  }
+}
+// 8-tap vertical filter of one row, 16 pixels per iteration (aligned stores).
+static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                  uint8_t *dst, const int16_t *filter, int w) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // aligned load above; repeat the low bytes of each tap pair in every lane
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  int i;
+
+  for (i = 0; i < w; i += 16) {
+    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
+    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+    const __m128i C =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+    const __m128i D =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+    const __m128i E =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+    const __m128i F =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+    const __m128i G =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+    const __m128i H =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+    // interleave rows 0/1 and rows 6/7 byte-wise (low and high 8 pixels)
+    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
+    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
+    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
+    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
+    // maddubs: unsigned pixel pair times signed tap pair, summed into 16 bits
+    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
+    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
+    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
+    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
+    // saturating add of the outer tap pairs (taps 0/1 and 6/7)
+    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
+    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
+    // interleave rows 2/3 byte-wise
+    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
+    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
+    // multiply 2 adjacent elements with the filter and add the result
+    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
+    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
+    // interleave rows 4/5 byte-wise
+    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
+    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
+    // multiply 2 adjacent elements with the filter and add the result
+    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
+    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
+    // saturating add of the smaller of x1 and x2 first ...
+    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
+    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
+
+    // ... and of the larger one afterwards
+    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
+    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
+    // mulhrs by 256 == (temp + 64) >> 7: round and shift each 16-bit lane
+    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
+    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
+    // pack to 8 bits with unsigned saturation: bytes 0-7 of the result come
+    // from temp_lo (pixels 0-7) and bytes 8-15 from temp_hi (pixels 8-15);
+    // the packed vector reuses the name temp_hi
+    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
+    src_ptr += 16;
+    // aligned 16-byte store: &dst[i] has to be 16-byte aligned
+    _mm_store_si128((__m128i*)&dst[i], temp_hi);
+  }
+}
+
+// Vertical pass of the scaled convolve for w >= 16: every dst row is either
+// filtered (fractional y) or copied (full-pel y) from the source.
+static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *y_filters,
+                                    int y0_q4, int y_step_q4, int w, int h) {
+  int y_q4 = y0_q4, row;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);  // first tap of the window
+  for (row = 0; row < h; ++row) {
+    const int phase = y_q4 & SUBPEL_MASK;
+    const uint8_t *const src_y = src + (y_q4 >> SUBPEL_BITS) * src_stride;
+    uint8_t *const dst_row = dst + row * dst_stride;
+    y_q4 += y_step_q4;
+    if (phase == 0) {
+      memcpy(dst_row, src_y + 3 * src_stride, w);
+    } else {
+      filter_vert_w16_ssse3(src_y, src_stride, dst_row, y_filters[phase], w);
+    }
+  }
+}
+
+// Scales/filters a w x h block in two passes: horizontally into the on-stack
+// buffer temp (64-byte pitch), then vertically from temp into dst.
+static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *const x_filters,
+                             int x0_q4, int x_step_q4,
+                             const InterpKernel *const y_filters,
+                             int y0_q4, int y_step_q4,
+                             int w, int h) {
+  // The fixed size of temp limits the parameters; its row budget is:
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 output rows span (64 - 1) * 32 in the source, in 1/16th pixel units,
+  //   plus up to 15 for a block that starts at a sub-pixel position.
+  // --SUBPEL_TAPS more rows are needed for the 8-tap filter tails.
+  // --(((64 - 1) * 32 + 15) >> 4) + 8 = 134; temp budgets 135 rows for this.
+  // --8 further rows absorb the row round-up of the w8 horizontal pass.
+  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+  // The horizontal pass starts SUBPEL_TAPS / 2 - 1 rows above the block ...
+  const uint8_t *const src_top = src - src_stride * (SUBPEL_TAPS / 2 - 1);
+  // ... and the vertical pass is handed the matching row inside temp.
+  const uint8_t *const temp_mid = temp + 64 * (SUBPEL_TAPS / 2 - 1);
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  // Pass 1: fill intermediate_height rows of temp.
+  if (w < 8) {
+    scaledconvolve_horiz_w4(src_top, src_stride, temp, 64, x_filters, x0_q4,
+                            x_step_q4, w, intermediate_height);
+  } else {
+    scaledconvolve_horiz_w8(src_top, src_stride, temp, 64, x_filters, x0_q4,
+                            x_step_q4, w, intermediate_height);
+  }
+  // Pass 2: pick the vertical kernel by width.
+  if (w >= 16) {
+    scaledconvolve_vert_w16(temp_mid, 64, dst, dst_stride, y_filters, y0_q4,
+                            y_step_q4, w, h);
+  } else if (w == 8) {
+    scaledconvolve_vert_w8(temp_mid, 64, dst, dst_stride, y_filters, y0_q4,
+                           y_step_q4, w, h);
+  } else {
+    scaledconvolve_vert_w4(temp_mid, 64, dst, dst_stride, y_filters, y0_q4,
+                           y_step_q4, w, h);
+  }
+}
+// Returns filter with the low 8 bits of its address cleared.
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+  // NOTE: This assumes that the filter table is 256-byte aligned.
+  // TODO(agrange) Modify to make independent of table alignment.
+  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+}
+// Returns how many InterpKernel elements f lies past base (the kernel index).
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+  return (int)((const InterpKernel *)(intptr_t)f - base);
+}
+
+// Entry point: recovers table bases and start phases, then runs the 2-D scaler.
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  // filter_x/filter_y point at one kernel inside a table; split each pointer
+  // into the table base and the kernel index, which is the initial q4 phase.
+  const InterpKernel *const x_table = get_filter_base(filter_x);
+  const InterpKernel *const y_table = get_filter_base(filter_y);
+  const int x_phase = get_filter_offset(filter_x, x_table);
+  const int y_phase = get_filter_offset(filter_y, y_table);
+  scaledconvolve2d(src, src_stride, dst, dst_stride, x_table, x_phase,
+                   x_step_q4, y_table, y_phase, y_step_q4, w, h);
+}
+// 2-D entry points presumably emitted by the FUN_CONV_2D() expansions below:
+// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                          uint8_t *dst, ptrdiff_t dst_stride,
+//                          const int16_t *filter_x, int x_step_q4,
+//                          const int16_t *filter_y, int y_step_q4,
+//                          int w, int h);
+// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const int16_t *filter_x, int x_step_q4,
+//                              const int16_t *filter_y, int y_step_q4,
+//                              int w, int h);
+FUN_CONV_2D(, ssse3);
+FUN_CONV_2D(avg_ , ssse3);
diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
new file mode 100644
index 0000000000..08f3d6a6cf
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
@@ -0,0 +1,987 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro GET_FILTERS_4 0                      ; build paired tap vectors + constants for the 4-wide filters
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rcx, 0x0400040              ;two 16-bit words, each 64 (rounding term)
+
+    movdqa      xmm7, [rdx]                 ;load filters (eight 16-bit taps, aligned load)
+    pshuflw     xmm0, xmm7, 0b              ;k0
+    pshuflw     xmm1, xmm7, 01010101b       ;k1
+    pshuflw     xmm2, xmm7, 10101010b       ;k2
+    pshuflw     xmm3, xmm7, 11111111b       ;k3
+    psrldq      xmm7, 8                     ;bring taps 4..7 down into the low qword
+    pshuflw     xmm4, xmm7, 0b              ;k4
+    pshuflw     xmm5, xmm7, 01010101b       ;k5
+    pshuflw     xmm6, xmm7, 10101010b       ;k6
+    pshuflw     xmm7, xmm7, 11111111b       ;k7
+
+    punpcklqdq  xmm0, xmm1                  ;low qword = k0 x4, high qword = k1 x4
+    punpcklqdq  xmm2, xmm3                  ;k2 x4 | k3 x4
+    punpcklqdq  xmm5, xmm4                  ;k5 x4 | k4 x4 (k5 in the low half)
+    punpcklqdq  xmm6, xmm7                  ;k6 x4 | k7 x4
+
+    movdqa      k0k1, xmm0                  ;k0k1..zero are stack slots %define'd by the caller
+    movdqa      k2k3, xmm2
+    movdqa      k5k4, xmm5
+    movdqa      k6k7, xmm6
+
+    movq        xmm6, rcx
+    pshufd      xmm6, xmm6, 0               ;broadcast low dword -> eight words of 64
+    movdqa      krd, xmm6
+
+    pxor        xmm7, xmm7
+    movdqa      zero, xmm7                  ;all-zero vector used for byte->word unpacking
+%endm
+
+%macro APPLY_FILTER_4 1                     ; in: xmm0..xmm7 = 4 pixels for taps 0..7; %1 != 0 averages with dst
+    punpckldq   xmm0, xmm1                  ;two rows in one register (tap 0 | tap 1)
+    punpckldq   xmm6, xmm7                  ;tap 6 | tap 7
+    punpckldq   xmm2, xmm3                  ;tap 2 | tap 3
+    punpckldq   xmm5, xmm4                  ;tap 5 | tap 4, matching the k5k4 layout
+
+    punpcklbw   xmm0, zero                  ;unpack to word
+    punpcklbw   xmm6, zero
+    punpcklbw   xmm2, zero
+    punpcklbw   xmm5, zero
+
+    pmullw      xmm0, k0k1                  ;multiply the filter factors
+    pmullw      xmm6, k6k7
+    pmullw      xmm2, k2k3
+    pmullw      xmm5, k5k4
+
+    paddsw      xmm0, xmm6                  ;sum (saturating): low = t0+t6, high = t1+t7
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 8
+    paddsw      xmm0, xmm1                  ;low 4 words = t0+t1+t6+t7
+    paddsw      xmm0, xmm2                  ;+ t2
+    psrldq      xmm2, 8                     ;t3 now in the low qword
+    paddsw      xmm0, xmm5                  ;+ t5
+    psrldq      xmm5, 8                     ;t4 now in the low qword
+    paddsw      xmm0, xmm2                  ;+ t3 (taps 3 and 4 are added last)
+    paddsw      xmm0, xmm5                  ;+ t4
+
+    paddsw      xmm0, krd                   ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack to byte
+
+%if %1
+    movd        xmm1, [rdi]                 ;averaging variant: fetch the 4 existing dst bytes
+    pavgb       xmm0, xmm1
+%endif
+    movd        [rdi], xmm0                 ;store 4 output bytes
+%endm
+
+%macro GET_FILTERS 0                        ; load rsi/rdi and splat each tap into its own stack vector
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;two 16-bit words, each 64 (rounding term)
+
+    movdqa      xmm7, [rdx]                 ;load filters (eight 16-bit taps, aligned load)
+    pshuflw     xmm0, xmm7, 0b              ;k0
+    pshuflw     xmm1, xmm7, 01010101b       ;k1
+    pshuflw     xmm2, xmm7, 10101010b       ;k2
+    pshuflw     xmm3, xmm7, 11111111b       ;k3
+    pshufhw     xmm4, xmm7, 0b              ;k4
+    pshufhw     xmm5, xmm7, 01010101b       ;k5
+    pshufhw     xmm6, xmm7, 10101010b       ;k6
+    pshufhw     xmm7, xmm7, 11111111b       ;k7
+
+    punpcklwd   xmm0, xmm0                  ;low-half taps: widen 4 copies to 8
+    punpcklwd   xmm1, xmm1
+    punpcklwd   xmm2, xmm2
+    punpcklwd   xmm3, xmm3
+    punpckhwd   xmm4, xmm4                  ;high-half taps: widen 4 copies to 8
+    punpckhwd   xmm5, xmm5
+    punpckhwd   xmm6, xmm6
+    punpckhwd   xmm7, xmm7
+
+    movdqa      k0,   xmm0                  ;store filter factors on stack
+    movdqa      k1,   xmm1
+    movdqa      k2,   xmm2
+    movdqa      k3,   xmm3
+    movdqa      k4,   xmm4
+    movdqa      k5,   xmm5
+    movdqa      k6,   xmm6
+    movdqa      k7,   xmm7
+
+    movq        xmm6, rcx
+    pshufd      xmm6, xmm6, 0               ;broadcast low dword -> eight words of 64
+    movdqa      krd, xmm6                   ;rounding
+
+    pxor        xmm7, xmm7
+    movdqa      zero, xmm7                  ;all-zero vector used for byte->word unpacking
+%endm
+
+%macro LOAD_VERT_8 1                        ; xmm0..xmm7 = 8 bytes from rows 0..7 at column offset %1
+    movq        xmm0, [rsi + %1]            ;0
+    movq        xmm1, [rsi + rax + %1]      ;1
+    movq        xmm6, [rsi + rdx * 2 + %1]  ;6
+    lea         rsi,  [rsi + rax]           ;rsi += one row (callers set rax = pitch, rdx = 3 * pitch)
+    movq        xmm7, [rsi + rdx * 2 + %1]  ;7
+    movq        xmm2, [rsi + rax + %1]      ;2
+    movq        xmm3, [rsi + rax * 2 + %1]  ;3
+    movq        xmm4, [rsi + rdx + %1]      ;4
+    movq        xmm5, [rsi + rax * 4 + %1]  ;5
+%endm
+
+%macro APPLY_FILTER_8 2                     ; in: xmm0..xmm7 = 8 pixels for taps 0..7; %1 != 0 averages; %2 = dst byte offset
+    punpcklbw   xmm0, zero                  ;widen the 8 low bytes of each input to words
+    punpcklbw   xmm1, zero
+    punpcklbw   xmm6, zero
+    punpcklbw   xmm7, zero
+    punpcklbw   xmm2, zero
+    punpcklbw   xmm5, zero
+    punpcklbw   xmm3, zero
+    punpcklbw   xmm4, zero
+
+    pmullw      xmm0, k0                    ;multiply each input by its tap
+    pmullw      xmm1, k1
+    pmullw      xmm6, k6
+    pmullw      xmm7, k7
+    pmullw      xmm2, k2
+    pmullw      xmm5, k5
+    pmullw      xmm3, k3
+    pmullw      xmm4, k4
+
+    paddsw      xmm0, xmm1                  ;saturating sum in the order 0,1,6,7,2,5,3,4
+    paddsw      xmm0, xmm6
+    paddsw      xmm0, xmm7
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, xmm5
+    paddsw      xmm0, xmm3                  ;taps 3 and 4 are added last
+    paddsw      xmm0, xmm4
+
+    paddsw      xmm0, krd                   ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack back to byte
+%if %1
+    movq        xmm1, [rdi + %2]            ;averaging variant: fetch the 8 existing dst bytes
+    pavgb       xmm0, xmm1
+%endif
+    movq        [rdi + %2], xmm0            ;store 8 output bytes
+%endm
+
+;void vpx_filter_block1d4_v8_sse2           -- vertical 8-tap filter, 4 pixels per output row
+;(
+;    unsigned char *src_ptr,                ; first of the 8 source rows read for output row 0
+;    unsigned int   src_pitch,              ; byte step between source rows
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,              ; byte step between output rows
+;    unsigned int   output_height,          ; rows to write; the dec/jnz loop assumes >= 1
+;    short *filter                          ; eight 16-bit taps, read with movdqa
+;)
+global sym(vpx_filter_block1d4_v8_sse2) PRIVATE
+sym(vpx_filter_block1d4_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 6                 ;six 16-byte scratch slots, named below
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]        ;rdx = 3 * source pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movd        xmm0, [rsi]                 ;load src: row 0
+    movd        xmm1, [rsi + rax]           ;1
+    movd        xmm6, [rsi + rdx * 2]       ;6
+    lea         rsi,  [rsi + rax]           ;advance one row; offsets below are from the new rsi
+    movd        xmm7, [rsi + rdx * 2]       ;7
+    movd        xmm2, [rsi + rax]           ;2
+    movd        xmm3, [rsi + rax * 2]       ;3
+    movd        xmm4, [rsi + rdx]           ;4
+    movd        xmm5, [rsi + rax * 4]       ;5
+
+    APPLY_FILTER_4 0
+
+    lea         rdi, [rdi + rbx]            ;next output row
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 6
+    pop rsp                                 ;presumably the pre-alignment rsp saved by ALIGN_STACK
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_filter_block1d8_v8_sse2           -- vertical 8-tap filter, 8 pixels per output row
+;(
+;    unsigned char *src_ptr,                ; first of the 8 source rows read for output row 0
+;    unsigned int   src_pitch,              ; byte step between source rows
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,              ; byte step between output rows
+;    unsigned int   output_height,          ; rows to write; the dec/jnz loop assumes >= 1
+;    short *filter                          ; eight 16-bit taps, read with movdqa
+;)
+global sym(vpx_filter_block1d8_v8_sse2) PRIVATE
+sym(vpx_filter_block1d8_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]        ;rdx = 3 * source pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 0, 0
+
+    lea         rdi, [rdi + rbx]            ;next output row (LOAD_VERT_8 already advanced rsi)
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp                                 ;presumably the pre-alignment rsp saved by ALIGN_STACK
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_filter_block1d16_v8_sse2          -- vertical 8-tap filter, 16 pixels per output row
+;(
+;    unsigned char *src_ptr,                ; first of the 8 source rows read for output row 0
+;    unsigned int   src_pitch,              ; byte step between source rows
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,              ; byte step between output rows
+;    unsigned int   output_height,          ; rows to write; the dec/jnz loop assumes >= 1
+;    short *filter                          ; eight 16-bit taps, read with movdqa
+;)
+global sym(vpx_filter_block1d16_v8_sse2) PRIVATE
+sym(vpx_filter_block1d16_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]        ;rdx = 3 * source pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 0, 0
+    sub         rsi, rax                    ;undo LOAD_VERT_8's row advance before the right half
+
+    LOAD_VERT_8 8
+    APPLY_FILTER_8 0, 8
+    add         rdi, rbx                    ;next output row (rsi is now one row further)
+
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp                                 ;presumably the pre-alignment rsp saved by ALIGN_STACK
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_v8_avg_sse2):       ; vertical 8-tap, 4 wide; result is pavgb-averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 6                 ;six 16-byte scratch slots, named below
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]        ;rdx = 3 * source pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movd        xmm0, [rsi]                 ;load src: row 0
+    movd        xmm1, [rsi + rax]           ;1
+    movd        xmm6, [rsi + rdx * 2]       ;6
+    lea         rsi,  [rsi + rax]           ;advance one row; offsets below are from the new rsi
+    movd        xmm7, [rsi + rdx * 2]       ;7
+    movd        xmm2, [rsi + rax]           ;2
+    movd        xmm3, [rsi + rax * 2]       ;3
+    movd        xmm4, [rsi + rdx]           ;4
+    movd        xmm5, [rsi + rax * 4]       ;5
+
+    APPLY_FILTER_4 1
+
+    lea         rdi, [rdi + rbx]            ;next output row
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 6
+    pop rsp                                 ;presumably the pre-alignment rsp saved by ALIGN_STACK
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_v8_avg_sse2):       ; vertical 8-tap, 8 wide; result is pavgb-averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]        ;rdx = 3 * source pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 1, 0
+
+    lea         rdi, [rdi + rbx]            ;next output row (LOAD_VERT_8 already advanced rsi)
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp                                 ;presumably the pre-alignment rsp saved by ALIGN_STACK
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_v8_avg_sse2):      ; vertical 8-tap, 16 wide; result is pavgb-averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]        ;rdx = 3 * source pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 1, 0
+    sub         rsi, rax                    ;undo LOAD_VERT_8's row advance before the right half
+
+    LOAD_VERT_8 8
+    APPLY_FILTER_8 1, 8
+    add         rdi, rbx                    ;next output row (rsi is now one row further)
+
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp                                 ;presumably the pre-alignment rsp saved by ALIGN_STACK
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_filter_block1d4_h8_sse2           -- horizontal 8-tap filter, 4 pixels per output row
+;(
+;    unsigned char  *src_ptr,               ; each row is read as 16 bytes from src_ptr - 3
+;    unsigned int    src_pixels_per_line,   ; byte step between source rows
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,          ; byte step between output rows
+;    unsigned int    output_height,         ; rows to write; the dec/jnz loop assumes >= 1
+;    short *filter                          ; eight 16-bit taps, read with movdqa
+;)
+global sym(vpx_filter_block1d4_h8_sse2) PRIVATE
+sym(vpx_filter_block1d4_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 6                 ;six 16-byte scratch slots, named below
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src (window starts 3 pixels to the left)
+
+    movdqa      xmm1, xmm0                  ;one copy of the window per remaining tap
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1                     ;xmmN >>= N bytes, so its low bytes line up with tap N
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm3, 3
+    psrldq      xmm5, 5
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_4 0
+
+    lea         rsi, [rsi + rax]            ;next source row
+    lea         rdi, [rdi + rdx]            ;next output row
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 6
+    pop rsp                                 ;presumably the pre-alignment rsp saved by ALIGN_STACK
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_filter_block1d8_h8_sse2           -- horizontal 8-tap filter, 8 pixels per output row
+;(
+;    unsigned char  *src_ptr,               ; each row is read as 16 bytes from src_ptr - 3
+;    unsigned int    src_pixels_per_line,   ; byte step between source rows
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,          ; byte step between output rows
+;    unsigned int    output_height,         ; rows to write; the dec/jnz loop assumes >= 1
+;    short *filter                          ; eight 16-bit taps, read with movdqa
+;)
+global sym(vpx_filter_block1d8_h8_sse2) PRIVATE
+sym(vpx_filter_block1d8_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src (window starts 3 pixels to the left)
+
+    movdqa      xmm1, xmm0                  ;one copy of the window per remaining tap
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1                     ;xmmN >>= N bytes, so its low bytes line up with tap N
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 0, 0
+
+    lea         rsi, [rsi + rax]            ;next source row
+    lea         rdi, [rdi + rdx]            ;next output row
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp                                 ;presumably the pre-alignment rsp saved by ALIGN_STACK
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_filter_block1d16_h8_sse2          -- horizontal 8-tap filter, 16 pixels per output row
+;(
+;    unsigned char  *src_ptr,               ; each row is read as 16 bytes from src_ptr - 3 and from src_ptr + 5
+;    unsigned int    src_pixels_per_line,   ; byte step between source rows
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,          ; byte step between output rows
+;    unsigned int    output_height,         ; rows to write; the dec/jnz loop assumes >= 1
+;    short *filter                          ; eight 16-bit taps, read with movdqa
+;)
+global sym(vpx_filter_block1d16_h8_sse2) PRIVATE
+sym(vpx_filter_block1d16_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src (left half: window starts 3 pixels to the left)
+
+    movdqa      xmm1, xmm0                  ;one copy of the window per remaining tap
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1                     ;xmmN >>= N bytes, so its low bytes line up with tap N
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 0, 0
+
+    movdqu      xmm0,   [rsi + 5]           ;load src (right half: the same window moved 8 bytes right)
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 0, 8
+
+    lea         rsi, [rsi + rax]            ;next source row
+    lea         rdi, [rdi + rdx]            ;next output row
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp                                 ;presumably the pre-alignment rsp saved by ALIGN_STACK
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_h8_avg_sse2):       ; horizontal 8-tap, 4 wide; result is pavgb-averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 6                 ;six 16-byte scratch slots, named below
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src (window starts 3 pixels to the left)
+
+    movdqa      xmm1, xmm0                  ;one copy of the window per remaining tap
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1                     ;xmmN >>= N bytes, so its low bytes line up with tap N
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm3, 3
+    psrldq      xmm5, 5
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_4 1
+
+    lea         rsi, [rsi + rax]            ;next source row
+    lea         rdi, [rdi + rdx]            ;next output row
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 6
+    pop rsp                                 ;presumably the pre-alignment rsp saved by ALIGN_STACK
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_h8_avg_sse2):       ; horizontal 8-tap, 8 wide; result is pavgb-averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src (window starts 3 pixels to the left)
+
+    movdqa      xmm1, xmm0                  ;one copy of the window per remaining tap
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1                     ;xmmN >>= N bytes, so its low bytes line up with tap N
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 1, 0
+
+    lea         rsi, [rsi + rax]            ;next source row
+    lea         rdi, [rdi + rdx]            ;next output row
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp                                 ;presumably the pre-alignment rsp saved by ALIGN_STACK
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_h8_avg_sse2):      ; horizontal 8-tap, 16 wide; result is pavgb-averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src (left half: window starts 3 pixels to the left)
+
+    movdqa      xmm1, xmm0                  ;one copy of the window per remaining tap
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1                     ;xmmN >>= N bytes, so its low bytes line up with tap N
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 1, 0
+
+    movdqu      xmm0,   [rsi + 5]           ;load src (right half: the same window moved 8 bytes right)
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 1, 8
+
+    lea         rsi, [rsi + rax]            ;next source row
+    lea         rdi, [rdi + rdx]            ;next output row
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp                                 ;presumably the pre-alignment rsp saved by ALIGN_STACK
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
new file mode 100644
index 0000000000..3fbaa274cd
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -0,0 +1,669 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_64:    times 8 dw 64                  ; eight words of 64: rounding term added before the >> 7
+
+; %define USE_PMULHRSW
+; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
+; when using this instruction.
+
+SECTION .text
+%if ARCH_X86_64 ; four 16-byte stack slots (paired taps); krd/tmp are registers in SETUP_LOCAL_VARS
+  %define LOCAL_VARS_SIZE 16*4
+%else ; two more slots, which SETUP_LOCAL_VARS uses for tmp and krd
+  %define LOCAL_VARS_SIZE 16*6
+%endif
+
+%macro SETUP_LOCAL_VARS 0                ; in: m4 = eight 16-bit taps; defines k0k1..k6k7, krd, tmp
+    ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
+    ; pmaddubsw has a higher latency on some platforms, this might be eased by
+    ; interleaving the instructions.
+    %define    k0k1  [rsp + 16*0]
+    %define    k2k3  [rsp + 16*1]
+    %define    k4k5  [rsp + 16*2]
+    %define    k6k7  [rsp + 16*3]
+    packsswb     m4, m4                  ; narrow the taps to signed bytes (with saturation)
+    ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
+    ; some platforms.
+    pshuflw      m0, m4, 0b              ;k0_k1
+    pshuflw      m1, m4, 01010101b       ;k2_k3
+    pshuflw      m2, m4, 10101010b       ;k4_k5
+    pshuflw      m3, m4, 11111111b       ;k6_k7
+    punpcklqdq   m0, m0                  ; copy the low qword up: the byte pair fills all 8 words
+    punpcklqdq   m1, m1
+    punpcklqdq   m2, m2
+    punpcklqdq   m3, m3
+    mova       k0k1, m0
+    mova       k2k3, m1
+    mova       k4k5, m2
+    mova       k6k7, m3
+%if ARCH_X86_64
+    %define     krd  m12
+    %define     tmp  m13
+    mova        krd, [GLOBAL(pw_64)]     ; rounding constant kept in a register
+%else
+    %define     tmp  [rsp + 16*4]
+    %define     krd  [rsp + 16*5]
+%if CONFIG_PIC=0
+    mova         m6, [GLOBAL(pw_64)]
+%else
+    ; build constants without accessing global memory
+    pcmpeqb      m6, m6                  ;all ones
+    psrlw        m6, 15
+    psllw        m6, 6                   ;aka pw_64
+%endif
+    mova        krd, m6                  ; rounding constant kept on the stack; m6 is clobbered
+%endif
+%endm
+
+%macro HORIZx4_ROW 2                     ; in: %1 = 16 source bytes; out: 4 filtered bytes in %2; clobbers %1, m3-m6
+    mova      %2, %1
+    punpcklbw %1, %1                     ; low 8 source bytes, each duplicated
+    punpckhbw %2, %2                     ; high 8 source bytes, each duplicated
+
+    mova      m3, %2
+    palignr   %2, %1, 1                  ; adjacent-pixel byte pairs starting at pixel 0
+    palignr   m3, %1, 5                  ; adjacent-pixel byte pairs starting at pixel 2
+
+    pmaddubsw %2, k0k1k4k5               ; low qword: k0/k1 partial sums, high qword: k4/k5
+    pmaddubsw m3, k2k3k6k7               ; low qword: k2/k3 partial sums, high qword: k6/k7
+
+    mova      m4, %2
+    mova      m5, m3
+    psrldq    %2, 8                      ; k4/k5 sums down to the low qword
+    psrldq    m3, 8                      ; k6/k7 sums down to the low qword
+    mova      m6, m5
+
+    paddsw    m4, m3                     ; outer terms: k0k1 + k6k7
+    pmaxsw    m5, %2                     ; max(k2k3, k4k5)
+    pminsw    %2, m6                     ; min(k2k3, k4k5)
+    paddsw    %2, m4                     ; add the smaller middle term first ...
+    paddsw    %2, m5                     ; ... then the larger one (saturating adds)
+    paddsw    %2, krd                    ; rounding
+    psraw     %2, 7
+    packuswb  %2, %2                     ; pack to bytes; low dword holds the 4 outputs
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER4 1                 ; %1 = name suffix (tested against h8_avg); emits filter_block1d4_%1
+cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
+                            src, sstride, dst, dstride, height, filter
+    mova                m4, [filterq]
+    packsswb            m4, m4                  ; taps narrowed to signed bytes (with saturation)
+%if ARCH_X86_64
+    %define       k0k1k4k5 m8
+    %define       k2k3k6k7 m9
+    %define            krd m10
+    %define    orig_height r7d
+    mova               krd, [GLOBAL(pw_64)]
+    pshuflw       k0k1k4k5, m4, 0b              ;k0_k1
+    pshufhw       k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
+    pshuflw       k2k3k6k7, m4, 01010101b       ;k2_k3
+    pshufhw       k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
+%else
+    %define       k0k1k4k5 [rsp + 16*0]
+    %define       k2k3k6k7 [rsp + 16*1]
+    %define            krd [rsp + 16*2]
+    %define    orig_height [rsp + 16*3]
+    pshuflw             m6, m4, 0b              ;k0_k1
+    pshufhw             m6, m6, 10101010b       ;k0_k1_k4_k5
+    pshuflw             m7, m4, 01010101b       ;k2_k3
+    pshufhw             m7, m7, 11111111b       ;k2_k3_k6_k7
+%if CONFIG_PIC=0
+    mova                m1, [GLOBAL(pw_64)]
+%else
+    ; build constants without accessing global memory
+    pcmpeqb             m1, m1                  ;all ones
+    psrlw               m1, 15
+    psllw               m1, 6                   ;aka pw_64
+%endif
+    mova          k0k1k4k5, m6
+    mova          k2k3k6k7, m7
+    mova               krd, m1
+%endif
+    mov        orig_height, heightd             ; kept for the odd-row check after the loop
+    shr            heightd, 1                   ; row pairs; NOTE(review): loop below assumes height >= 2
+.loop:
+    ;Do two rows at once
+    movh                m0, [srcq - 3]          ; row 0: 16 bytes from src - 3, as two 8-byte loads
+    movh                m1, [srcq + 5]
+    punpcklqdq          m0, m1
+    mova                m1, m0
+    movh                m2, [srcq + sstrideq - 3] ; row 1, same window
+    movh                m3, [srcq + sstrideq + 5]
+    punpcklqdq          m2, m3
+    mova                m3, m2
+    punpcklbw           m0, m0                  ; duplicate every byte so palignr can form pixel pairs
+    punpckhbw           m1, m1
+    punpcklbw           m2, m2
+    punpckhbw           m3, m3
+    mova                m4, m1
+    palignr             m4, m0,  1
+    pmaddubsw           m4, k0k1k4k5            ; row 0: k0/k1 sums (low), k4/k5 sums (high)
+    palignr             m1, m0,  5
+    pmaddubsw           m1, k2k3k6k7            ; row 0: k2/k3 sums (low), k6/k7 sums (high)
+    mova                m7, m3
+    palignr             m7, m2,  1
+    pmaddubsw           m7, k0k1k4k5            ; row 1: k0/k1 sums (low), k4/k5 sums (high)
+    palignr             m3, m2,  5
+    pmaddubsw           m3, k2k3k6k7            ; row 1: k2/k3 sums (low), k6/k7 sums (high)
+    mova                m0, m4
+    mova                m5, m1
+    mova                m2, m7
+    psrldq              m4, 8                   ; row 0 k4/k5 sums to the low qword
+    psrldq              m1, 8                   ; row 0 k6/k7 sums to the low qword
+    mova                m6, m5
+    paddsw              m0, m1                  ; row 0: k0k1 + k6k7
+    mova                m1, m3
+    psrldq              m7, 8                   ; row 1 k4/k5 sums to the low qword
+    psrldq              m3, 8                   ; row 1 k6/k7 sums to the low qword
+    paddsw              m2, m3                  ; row 1: k0k1 + k6k7
+    mova                m3, m1
+    pmaxsw              m5, m4                  ; row 0: max(k2k3, k4k5)
+    pminsw              m4, m6                  ; row 0: min(k2k3, k4k5)
+    paddsw              m4, m0                  ; add the smaller middle term first ...
+    paddsw              m4, m5                  ; ... then the larger one
+    pmaxsw              m1, m7                  ; row 1: max(k2k3, k4k5)
+    pminsw              m7, m3                  ; row 1: min(k2k3, k4k5)
+    paddsw              m7, m2
+    paddsw              m7, m1
+
+    paddsw              m4, krd                 ; round, shift and pack row 0
+    psraw               m4, 7
+    packuswb            m4, m4
+    paddsw              m7, krd                 ; round, shift and pack row 1
+    psraw               m7, 7
+    packuswb            m7, m7
+
+%ifidn %1, h8_avg
+    movd                m0, [dstq]              ; averaging variant: blend with the existing dst bytes
+    pavgb               m4, m0
+    movd                m2, [dstq + dstrideq]
+    pavgb               m7, m2
+%endif
+    movd            [dstq], m4                  ; 4 output bytes per row
+    movd [dstq + dstrideq], m7
+
+    lea               srcq, [srcq + sstrideq        ]
+    prefetcht0              [srcq + 4 * sstrideq - 3]
+    lea               srcq, [srcq + sstrideq        ]
+    lea               dstq, [dstq + 2 * dstrideq    ]
+    prefetcht0              [srcq + 2 * sstrideq - 3]
+
+    dec            heightd
+    jnz              .loop
+
+    ; Do last row if output_height is odd
+    mov            heightd, orig_height
+    and            heightd, 1
+    je               .done
+
+    movh                m0, [srcq - 3]    ; load src
+    movh                m1, [srcq + 5]
+    punpcklqdq          m0, m1
+
+    HORIZx4_ROW         m0, m1
+%ifidn %1, h8_avg
+    movd                m0, [dstq]
+    pavgb               m1, m0
+%endif
+    movd            [dstq], m1
+.done:
+    RET
+%endm
+
+%macro HORIZx8_ROW 5 ; arg1: 16 source bytes in, 8 filtered pixels out; arg2-arg5: scratch registers
+    mova        %2, %1
+    punpcklbw   %1, %1                  ; low 8 source bytes, each byte duplicated
+    punpckhbw   %2, %2                  ; high 8 source bytes, each byte duplicated
+    ; arg3-arg5 start as copies of the duplicated high half, one copy per tap pair
+    mova        %3, %2
+    mova        %4, %2
+    mova        %5, %2
+    ; byte-shift the duplicated row so every word holds two adjacent source bytes
+    palignr     %2, %1, 1               ; pairs (x+0, x+1)
+    palignr     %3, %1, 5               ; pairs (x+2, x+3)
+    palignr     %4, %1, 9               ; pairs (x+4, x+5)
+    palignr     %5, %1, 13              ; pairs (x+6, x+7)
+    ; k0k1..k6k7 and krd are defined outside this macro; presumably tap pairs and the rounding term
+    pmaddubsw   %2, k0k1
+    pmaddubsw   %3, k2k3
+    pmaddubsw   %4, k4k5
+    pmaddubsw   %5, k6k7
+    ; sum the four partial products with saturating adds: outer pairs, then min, then max of the middle pairs
+    paddsw      %2, %5
+    mova        %1, %3
+    pminsw      %3, %4
+    pmaxsw      %1, %4
+    paddsw      %2, %3
+    paddsw      %1, %2
+    paddsw      %1, krd
+    psraw       %1, 7                   ; arithmetic shift right by 7 after adding krd
+    packuswb    %1, %1                  ; saturate to unsigned bytes; the 8 results sit in the low 8 bytes
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER8 1 ; arg1: h8 (store) or h8_avg (average with dst); 8-wide horizontal 8-tap filter
+cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \
+                            src, sstride, dst, dstride, height, filter
+    mova                 m4, [filterq]
+    SETUP_LOCAL_VARS
+%if ARCH_X86_64
+    %define     orig_height r7d
+%else
+    %define     orig_height heightmp
+%endif
+    mov         orig_height, heightd    ; keep the full height for the odd-row check after the loop
+    shr             heightd, 1          ; the loop handles two rows per iteration
+    ; NOTE(review): the loop is entered unconditionally, so height is assumed to be >= 2 - confirm with callers
+.loop:
+    movh                 m0, [srcq - 3]             ; row 0, bytes -3..4
+    movh                 m3, [srcq + 5]             ; row 0, bytes 5..12
+    movh                 m4, [srcq + sstrideq - 3]  ; row 1, bytes -3..4
+    movh                 m7, [srcq + sstrideq + 5]  ; row 1, bytes 5..12
+    punpcklqdq           m0, m3                     ; m0 = 16 bytes of row 0
+    mova                 m1, m0
+    punpcklbw            m0, m0                     ; row 0 low half, bytes duplicated
+    punpckhbw            m1, m1                     ; row 0 high half, bytes duplicated
+    mova                 m5, m1
+    palignr              m5, m0, 13                 ; row 0 pairs (x+6, x+7)
+    pmaddubsw            m5, k6k7
+    mova                 m2, m1
+    mova                 m3, m1
+    palignr              m1, m0, 1                  ; row 0 pairs (x+0, x+1)
+    pmaddubsw            m1, k0k1
+    punpcklqdq           m4, m7                     ; m4 = 16 bytes of row 1
+    mova                 m6, m4
+    punpcklbw            m4, m4                     ; row 1 low half, bytes duplicated
+    palignr              m2, m0, 5                  ; row 0 pairs (x+2, x+3)
+    punpckhbw            m6, m6                     ; row 1 high half, bytes duplicated
+    palignr              m3, m0, 9                  ; row 0 pairs (x+4, x+5)
+    mova                 m7, m6
+    pmaddubsw            m2, k2k3
+    pmaddubsw            m3, k4k5
+    ; from here the row 0 summation is interleaved with the row 1 multiplies
+    palignr              m7, m4, 13                 ; row 1 pairs (x+6, x+7)
+    paddsw               m1, m5                     ; row 0: outer pairs summed
+    mova                 m5, m6
+    mova                 m0, m2
+    palignr              m5, m4, 5                  ; row 1 pairs (x+2, x+3)
+    pminsw               m2, m3                     ; row 0: smaller middle product
+    pmaddubsw            m7, k6k7
+    pmaxsw               m3, m0                     ; row 0: larger middle product
+    paddsw               m1, m2
+    mova                 m0, m6
+    palignr              m6, m4, 1                  ; row 1 pairs (x+0, x+1)
+    pmaddubsw            m5, k2k3
+    paddsw               m1, m3
+    pmaddubsw            m6, k0k1
+    palignr              m0, m4, 9                  ; row 1 pairs (x+4, x+5)
+    paddsw               m1, krd
+    pmaddubsw            m0, k4k5
+    mova                 m4, m5
+    psraw                m1, 7
+    pminsw               m5, m0                     ; row 1: smaller middle product
+    paddsw               m6, m7                     ; row 1: outer pairs summed
+    packuswb             m1, m1                     ; row 0 result as 8 bytes
+    ; finish row 1 the same way: min, then max, then krd, shift and pack
+    paddsw               m6, m5
+    pmaxsw               m0, m4
+    paddsw               m6, m0
+    paddsw               m6, krd
+    psraw                m6, 7
+    packuswb             m6, m6                     ; row 1 result as 8 bytes
+
+%ifidn %1, h8_avg
+    movh                 m0, [dstq]
+    movh                 m2, [dstq + dstrideq]
+    pavgb                m1, m0                     ; avg variant: byte-average with the existing dst rows
+    pavgb                m6, m2
+%endif
+    movh             [dstq], m1
+    movh  [dstq + dstrideq], m6
+    ; advance src and dst by two rows, prefetching upcoming source rows on the way
+    lea                srcq, [srcq + sstrideq        ]
+    prefetcht0               [srcq + 4 * sstrideq - 3]
+    lea                srcq, [srcq + sstrideq        ]
+    lea                dstq, [dstq + 2 * dstrideq    ]
+    prefetcht0               [srcq + 2 * sstrideq - 3]
+    dec             heightd
+    jnz             .loop
+
+    ;Do last row if output_height is odd
+    mov             heightd, orig_height
+    and             heightd, 1
+    je                .done                         ; even height: nothing left to do
+
+    movh                 m0, [srcq - 3]
+    movh                 m3, [srcq + 5]
+    punpcklqdq           m0, m3                     ; 16 source bytes of the final row
+
+    HORIZx8_ROW          m0, m1, m2, m3, m4
+
+%ifidn %1, h8_avg
+    movh                 m1, [dstq]
+    pavgb                m0, m1
+%endif
+    movh             [dstq], m0
+.done:
+    RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER16 1 ; arg1: h8 (store) or h8_avg (average with dst); 16-wide horizontal 8-tap filter
+cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
+                             src, sstride, dst, dstride, height, filter
+    mova          m4, [filterq]
+    SETUP_LOCAL_VARS
+.loop:
+    prefetcht0        [srcq + 2 * sstrideq -3]
+    ; one row per iteration: m0 ends up with output pixels 0..7, m4 with pixels 8..15
+    movh          m0, [srcq -  3]       ; bytes -3..4
+    movh          m4, [srcq +  5]       ; bytes 5..12
+    movh          m6, [srcq + 13]       ; bytes 13..20
+    punpcklqdq    m0, m4                ; m0 = bytes -3..12 (input for the left 8 pixels)
+    mova          m7, m0
+    punpckhbw     m0, m0                ; left input, high half duplicated
+    mova          m1, m0
+    punpcklqdq    m4, m6                ; m4 = bytes 5..20 (input for the right 8 pixels)
+    mova          m3, m0
+    punpcklbw     m7, m7                ; left input, low half duplicated
+    ; left half: form the four byte-pair vectors and multiply them by the tap pairs
+    palignr       m3, m7, 13
+    mova          m2, m0
+    pmaddubsw     m3, k6k7
+    palignr       m0, m7, 1
+    pmaddubsw     m0, k0k1
+    palignr       m1, m7, 5
+    pmaddubsw     m1, k2k3
+    palignr       m2, m7, 9
+    pmaddubsw     m2, k4k5
+    paddsw        m0, m3                ; left: outer pairs summed
+    mova          m3, m4
+    punpckhbw     m4, m4                ; right input, high half duplicated
+    mova          m5, m4
+    punpcklbw     m3, m3                ; right input, low half duplicated
+    mova          m7, m4
+    palignr       m5, m3, 5
+    mova          m6, m4
+    palignr       m4, m3, 1
+    pmaddubsw     m4, k0k1
+    pmaddubsw     m5, k2k3
+    palignr       m6, m3, 9
+    pmaddubsw     m6, k4k5
+    palignr       m7, m3, 13
+    pmaddubsw     m7, k6k7
+    ; both halves: add the smaller middle product, then the larger, then krd; shift and pack
+    mova          m3, m1
+    pmaxsw        m1, m2
+    pminsw        m2, m3
+    paddsw        m0, m2
+    paddsw        m0, m1
+    paddsw        m4, m7                ; right: outer pairs summed
+    mova          m7, m5
+    pmaxsw        m5, m6
+    pminsw        m6, m7
+    paddsw        m4, m6
+    paddsw        m4, m5
+    paddsw        m0, krd
+    paddsw        m4, krd
+    psraw         m0, 7
+    psraw         m4, 7
+    packuswb      m0, m4                ; 16 output bytes: left half low, right half high
+%ifidn %1, h8_avg
+    mova          m1, [dstq]
+    pavgb         m0, m1                ; avg variant: byte-average with the existing dst row
+%endif
+    lea         srcq, [srcq + sstrideq]
+    mova      [dstq], m0                ; NOTE(review): mova presumably needs a 16-byte aligned dst - confirm
+    lea         dstq, [dstq + dstrideq]
+    dec      heightd
+    jnz        .loop
+    RET
+%endm
+
+INIT_XMM ssse3 ; the macros below are expanded for XMM registers with the ssse3 suffix
+SUBPIX_HFILTER16 h8     ; 16-wide horizontal filter, plain store
+SUBPIX_HFILTER16 h8_avg ; 16-wide horizontal filter, averaged with dst
+SUBPIX_HFILTER8  h8     ; 8-wide horizontal filter, plain store
+SUBPIX_HFILTER8  h8_avg ; 8-wide horizontal filter, averaged with dst
+SUBPIX_HFILTER4  h8     ; 4-wide horizontal filter, plain store
+SUBPIX_HFILTER4  h8_avg ; 4-wide horizontal filter, averaged with dst
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_VFILTER 2 ; arg1: v8 (store) or v8_avg (average with dst); arg2: block width, 8 or 4
+cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
+                             src, sstride, dst, dstride, height, filter
+    mova          m4, [filterq]
+    SETUP_LOCAL_VARS
+%if ARCH_X86_64
+    %define      src1q r7
+    %define  sstride6q r8
+    %define dst_stride dstrideq
+%else
+    %define      src1q filterq
+    %define  sstride6q dstrideq
+    %define dst_stride dstridemp
+%endif
+    mov       src1q, srcq
+    add       src1q, sstrideq                   ;src1 = src + pitch, base for rows B, D, F, H
+    lea   sstride6q, [sstrideq + sstrideq * 4]
+    add   sstride6q, sstrideq                   ;pitch * 6
+    ; movx loads one row of the block: movh for width 8, movd for width 4
+%ifidn %2, 8
+    %define movx movh
+%else
+    %define movx movd
+%endif
+.loop:
+    movx         m0, [srcq                ]     ;A
+    movx         m1, [srcq + sstrideq     ]     ;B
+    punpcklbw    m0, m1                         ;A B
+    movx         m2, [srcq + sstrideq * 2 ]     ;C
+    pmaddubsw    m0, k0k1
+    mova         m6, m2
+    movx         m3, [src1q + sstrideq * 2]     ;D
+    punpcklbw    m2, m3                         ;C D
+    pmaddubsw    m2, k2k3
+    movx         m4, [srcq + sstrideq * 4 ]     ;E
+    mova         m7, m4
+    movx         m5, [src1q + sstrideq * 4]     ;F
+    punpcklbw    m4, m5                         ;E F
+    pmaddubsw    m4, k4k5
+    punpcklbw    m1, m6                         ;A B next iter
+    movx         m6, [srcq + sstride6q    ]     ;G
+    punpcklbw    m5, m6                         ;E F next iter
+    punpcklbw    m3, m7                         ;C D next iter
+    pmaddubsw    m5, k4k5
+    movx         m7, [src1q + sstride6q   ]     ;H
+    punpcklbw    m6, m7                         ;G H
+    pmaddubsw    m6, k6k7
+    mova        tmp, m2                         ;save the C D product for the min below
+    pmaddubsw    m3, k2k3
+    pmaddubsw    m1, k0k1
+    pmaxsw       m2, m4                         ;row 0: larger middle product
+    paddsw       m0, m6                         ;row 0: outer pairs summed
+    movx         m6, [srcq + sstrideq * 8 ]     ;H next iter
+    punpcklbw    m7, m6
+    pmaddubsw    m7, k6k7
+    pminsw       m4, tmp                        ;row 0: smaller middle product
+    paddsw       m0, m4
+    mova         m4, m3
+    paddsw       m0, m2
+    pminsw       m3, m5                         ;row 1: smaller middle product
+    pmaxsw       m5, m4                         ;row 1: larger middle product
+    paddsw       m0, krd
+    psraw        m0, 7
+    paddsw       m1, m7                         ;row 1: outer pairs summed
+    packuswb     m0, m0                         ;row 0 result as bytes
+
+    paddsw       m1, m3
+    paddsw       m1, m5
+    paddsw       m1, krd
+    psraw        m1, 7
+    lea        srcq, [srcq + sstrideq * 2 ]     ;advance both source bases by two rows
+    lea       src1q, [src1q + sstrideq * 2]
+    packuswb     m1, m1                         ;row 1 result as bytes
+
+%ifidn %1, v8_avg
+    movx         m2, [dstq]
+    pavgb        m0, m2
+%endif
+    movx     [dstq], m0
+    add        dstq, dst_stride
+%ifidn %1, v8_avg
+    movx         m3, [dstq]
+    pavgb        m1, m3
+%endif
+    movx     [dstq], m1
+    add        dstq, dst_stride
+    sub     heightd, 2
+    cmp     heightd, 1
+    jg        .loop                             ;keep going while at least two rows remain
+    ; NOTE(review): the loop runs before any height check, so height is assumed to be >= 2 - confirm
+    cmp     heightd, 0
+    je        .done                             ;even height: no leftover row
+    ; odd height: filter the single remaining row (rows A..H)
+    movx         m0, [srcq                ]     ;A
+    movx         m1, [srcq + sstrideq     ]     ;B
+    movx         m6, [srcq + sstride6q    ]     ;G
+    punpcklbw    m0, m1                         ;A B
+    movx         m7, [src1q + sstride6q   ]     ;H (fix: was a hard-coded rax, which this macro never sets)
+    pmaddubsw    m0, k0k1
+    movx         m2, [srcq + sstrideq * 2 ]     ;C
+    punpcklbw    m6, m7                         ;G H
+    movx         m3, [src1q + sstrideq * 2]     ;D (fix: was rax; src1q is the src + pitch base used for row F)
+    pmaddubsw    m6, k6k7
+    movx         m4, [srcq + sstrideq * 4 ]     ;E
+    punpcklbw    m2, m3                         ;C D
+    movx         m5, [src1q + sstrideq * 4]     ;F
+    punpcklbw    m4, m5                         ;E F
+    pmaddubsw    m2, k2k3
+    pmaddubsw    m4, k4k5
+    paddsw       m0, m6                         ;outer pairs summed
+    mova         m1, m2
+    pmaxsw       m2, m4                         ;larger middle product
+    pminsw       m4, m1                         ;smaller middle product
+    paddsw       m0, m4
+    paddsw       m0, m2
+    paddsw       m0, krd
+    psraw        m0, 7
+    packuswb     m0, m0
+%ifidn %1, v8_avg
+    movx         m1, [dstq]
+    pavgb        m0, m1
+%endif
+    movx     [dstq], m0
+.done:
+    RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_VFILTER16 1 ; arg1: v8 (store) or v8_avg (average with dst); 16-wide vertical 8-tap filter
+cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
+                             src, sstride, dst, dstride, height, filter
+
+    mova          m4, [filterq]
+    SETUP_LOCAL_VARS
+%if ARCH_X86_64
+    %define      src1q r7
+    %define  sstride6q r8
+    %define dst_stride dstrideq
+%else
+    %define      src1q filterq
+    %define  sstride6q dstrideq
+    %define dst_stride dstridemp
+%endif
+    mov        src1q, srcq
+    add        src1q, sstrideq                   ;src1 = src + pitch, base for rows B, D, F, H
+    lea    sstride6q, [sstrideq + sstrideq * 4]
+    add    sstride6q, sstrideq                   ;pitch * 6
+    ; one output row per iteration: columns 0..7 first, columns 8..15 interleaved behind them
+.loop:
+    movh          m0, [srcq                ]     ;A
+    movh          m1, [srcq + sstrideq     ]     ;B
+    movh          m2, [srcq + sstrideq * 2 ]     ;C
+    movh          m3, [src1q + sstrideq * 2]     ;D
+    movh          m4, [srcq + sstrideq * 4 ]     ;E
+    movh          m5, [src1q + sstrideq * 4]     ;F
+
+    punpcklbw     m0, m1                         ;A B
+    movh          m6, [srcq + sstride6q]         ;G
+    punpcklbw     m2, m3                         ;C D
+    movh          m7, [src1q + sstride6q]        ;H
+    punpcklbw     m4, m5                         ;E F
+    pmaddubsw     m0, k0k1
+    movh          m3, [srcq + 8]                 ;A (columns 8..15)
+    pmaddubsw     m2, k2k3
+    punpcklbw     m6, m7                         ;G H
+    movh          m5, [srcq + sstrideq + 8]      ;B (columns 8..15)
+    pmaddubsw     m4, k4k5
+    punpcklbw     m3, m5                         ;A B
+    movh          m7, [srcq + sstrideq * 2 + 8]  ;C (columns 8..15)
+    pmaddubsw     m6, k6k7
+    mova          m1, m2
+    movh          m5, [src1q + sstrideq * 2 + 8] ;D (columns 8..15)
+    pmaxsw        m2, m4                         ;left: larger middle product
+    punpcklbw     m7, m5                         ;C D
+    pminsw        m4, m1                         ;left: smaller middle product
+    paddsw        m0, m6                         ;left: outer pairs summed
+    pmaddubsw     m3, k0k1
+    movh          m1, [srcq + sstrideq * 4 + 8]  ;E (columns 8..15)
+    paddsw        m0, m4
+    pmaddubsw     m7, k2k3
+    movh          m6, [src1q + sstrideq * 4 + 8] ;F (columns 8..15)
+    punpcklbw     m1, m6                         ;E F
+    paddsw        m0, m2
+    paddsw        m0, krd
+    movh          m2, [srcq + sstride6q + 8]     ;G (columns 8..15)
+    pmaddubsw     m1, k4k5
+    movh          m5, [src1q + sstride6q + 8]    ;H (columns 8..15)
+    psraw         m0, 7
+    punpcklbw     m2, m5                         ;G H
+    packuswb      m0, m0                         ;left 8 result bytes
+    pmaddubsw     m2, k6k7
+%ifidn %1, v8_avg
+    movh          m4, [dstq]
+    pavgb         m0, m4
+%endif
+    movh      [dstq], m0
+    mova          m6, m7
+    pmaxsw        m7, m1                         ;right: larger middle product
+    pminsw        m1, m6                         ;right: smaller middle product
+    paddsw        m3, m2                         ;right: outer pairs summed
+    paddsw        m3, m1
+    paddsw        m3, m7
+    paddsw        m3, krd
+    psraw         m3, 7
+    packuswb      m3, m3                         ;right 8 result bytes
+
+    add         srcq, sstrideq                   ;advance both source bases by one row
+    add        src1q, sstrideq
+%ifidn %1, v8_avg
+    movh          m1, [dstq + 8]
+    pavgb         m3, m1
+%endif
+    movh  [dstq + 8], m3
+    add         dstq, dst_stride
+    dec      heightd
+    jnz        .loop
+    RET
+%endm
+
+INIT_XMM ssse3 ; the macros below are expanded for XMM registers with the ssse3 suffix
+SUBPIX_VFILTER16     v8    ; 16-wide vertical filter, plain store
+SUBPIX_VFILTER16 v8_avg    ; 16-wide vertical filter, averaged with dst
+SUBPIX_VFILTER       v8, 8 ; 8-wide vertical filter, plain store
+SUBPIX_VFILTER   v8_avg, 8 ; 8-wide vertical filter, averaged with dst
+SUBPIX_VFILTER       v8, 4 ; 4-wide vertical filter, plain store
+SUBPIX_VFILTER   v8_avg, 4 ; 4-wide vertical filter, averaged with dst
diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
new file mode 100644
index 0000000000..a378dd0402
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
@@ -0,0 +1,448 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0 ; load arguments and constants for the 4-wide filters (SSE2)
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;two 16-bit words of 64, the rounding term for the >> 7
+
+    movdqa      xmm3, [rdx]                 ;load filters (aligned load of 8 words)
+    pshuflw     xmm4, xmm3, 11111111b       ;k3 in the low 4 words
+    psrldq      xmm3, 8                     ;bring words 4..7 down to the low half
+    pshuflw     xmm3, xmm3, 0b              ;k4 in the low 4 words
+    punpcklqdq  xmm4, xmm3                  ;k3k4: low 4 words k3, high 4 words k4
+
+    movq        xmm3, rcx                   ;rounding
+    pshufd      xmm3, xmm3, 0               ;rounding term broadcast to all 8 words
+
+    pxor        xmm2, xmm2                  ;zero register used to widen bytes to words
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1 ; arg1: 1 to average with dst, 0 to store; inputs xmm0 (first tap) and xmm1 (second tap)
+    ; expects the register setup done by GET_PARAM_4: xmm4 = k3k4, xmm3 = rounding, xmm2 = 0
+    punpckldq   xmm0, xmm1                  ;two row in one register
+    punpcklbw   xmm0, xmm2                  ;unpack to word
+    pmullw      xmm0, xmm4                  ;multiply the filter factors
+
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 8                     ;second-tap products moved down to the low half
+    paddsw      xmm0, xmm1                  ;sum of the two taps for 4 pixels
+
+    paddsw      xmm0, xmm3                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack to byte
+
+%if %1
+    movd        xmm1, [rdi]
+    pavgb       xmm0, xmm1                  ;average with the 4 bytes already in dst
+%endif
+
+    movd        [rdi], xmm0                 ;store 4 output bytes
+    lea         rsi, [rsi + rax]            ;next source row
+    lea         rdi, [rdi + rdx]            ;next output row
+    dec         rcx                         ;row counter; leaves ZF for the caller's jnz
+%endm
+
+%macro GET_PARAM 0 ; load arguments and constants for the 8- and 16-wide filters (SSE2)
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;two 16-bit words of 64, the rounding term for the >> 7
+
+    movdqa      xmm7, [rdx]                 ;load filters
+
+    pshuflw     xmm6, xmm7, 11111111b       ;k3
+    pshufhw     xmm7, xmm7, 0b              ;k4
+    punpcklwd   xmm6, xmm6                  ;xmm6 = k3 in all 8 words
+    punpckhwd   xmm7, xmm7                  ;xmm7 = k4 in all 8 words
+
+    movq        xmm4, rcx                   ;rounding
+    pshufd      xmm4, xmm4, 0               ;rounding term broadcast to all 8 words
+
+    pxor        xmm5, xmm5                  ;zero register used to widen bytes to words
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1 ; arg1: 1 to average with dst, 0 to store; inputs xmm0 (first tap) and xmm1 (second tap)
+    punpcklbw   xmm0, xmm5                  ;widen first-tap bytes to words
+    punpcklbw   xmm1, xmm5                  ;widen second-tap bytes to words
+
+    pmullw      xmm0, xmm6                  ;first tap * k3
+    pmullw      xmm1, xmm7                  ;second tap * k4
+    paddsw      xmm0, xmm1
+    paddsw      xmm0, xmm4                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack back to byte
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1                  ;average with the 8 bytes already in dst
+%endif
+    movq        [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]            ;next source row
+    lea         rdi, [rdi + rdx]            ;next output row
+    dec         rcx                         ;row counter; leaves ZF for the caller's jnz
+%endm
+
+%macro APPLY_FILTER_16 1 ; arg1: 1 to average with dst, 0 to store; xmm0/xmm2 = first tap, xmm1/xmm3 = second tap
+    punpcklbw   xmm0, xmm5                  ;first tap, low 8 bytes to words
+    punpcklbw   xmm1, xmm5                  ;second tap, low 8 bytes to words
+    punpckhbw   xmm2, xmm5                  ;first tap, high 8 bytes to words
+    punpckhbw   xmm3, xmm5                  ;second tap, high 8 bytes to words
+
+    pmullw      xmm0, xmm6                  ;first tap * k3
+    pmullw      xmm1, xmm7                  ;second tap * k4
+    pmullw      xmm2, xmm6
+    pmullw      xmm3, xmm7
+
+    paddsw      xmm0, xmm1                  ;pixels 0..7
+    paddsw      xmm2, xmm3                  ;pixels 8..15
+
+    paddsw      xmm0, xmm4                  ;rounding
+    paddsw      xmm2, xmm4
+    psraw       xmm0, 7                     ;shift
+    psraw       xmm2, 7
+    packuswb    xmm0, xmm2                  ;pack back to byte
+%if %1
+    movdqu      xmm1, [rdi]
+    pavgb       xmm0, xmm1                  ;average with the 16 bytes already in dst
+%endif
+    movdqu      [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]            ;next source row
+    lea         rdi, [rdi + rdx]            ;next output row
+    dec         rcx                         ;row counter; leaves ZF for the caller's jnz
+%endm
+
+global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
+sym(vpx_filter_block1d4_v2_sse2):          ; vertical 2-tap bilinear filter, 4 pixels wide, plain store
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]           ;same columns, next source row
+
+    APPLY_FILTER_4 0
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_4
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
+sym(vpx_filter_block1d8_v2_sse2):          ; vertical 2-tap bilinear filter, 8 pixels wide, plain store
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;current source row
+    movq        xmm1, [rsi + rax]           ;next source row
+
+    APPLY_FILTER_8 0
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_8
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
+sym(vpx_filter_block1d16_v2_sse2):         ; vertical 2-tap bilinear filter, 16 pixels wide, plain store
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;current source row
+    movdqu        xmm1, [rsi + rax]         ;next source row
+    movdqa        xmm2, xmm0                ;copies for the high 8 bytes of each row
+    movdqa        xmm3, xmm1
+
+    APPLY_FILTER_16 0
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_16
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_v2_avg_sse2):      ; vertical 2-tap bilinear filter, 4 pixels wide, averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]           ;same columns, next source row
+
+    APPLY_FILTER_4 1
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_4
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_v2_avg_sse2):      ; vertical 2-tap bilinear filter, 8 pixels wide, averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;current source row
+    movq        xmm1, [rsi + rax]           ;next source row
+
+    APPLY_FILTER_8 1
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_8
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_v2_avg_sse2):     ; vertical 2-tap bilinear filter, 16 pixels wide, averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;current source row
+    movdqu        xmm1, [rsi + rax]         ;next source row
+    movdqa        xmm2, xmm0                ;copies for the high 8 bytes of each row
+    movdqa        xmm3, xmm1
+
+    APPLY_FILTER_16 1
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_16
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d4_h2_sse2) PRIVATE
+sym(vpx_filter_block1d4_h2_sse2):          ; horizontal 2-tap bilinear filter, 4 pixels wide, plain store
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16 bytes read; only bytes 0..4 are used)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;same row shifted by one pixel: the second tap
+
+    APPLY_FILTER_4 0
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_4
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d8_h2_sse2) PRIVATE
+sym(vpx_filter_block1d8_h2_sse2):          ; horizontal 2-tap bilinear filter, 8 pixels wide, plain store
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16 bytes read; only bytes 0..8 are used)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;same row shifted by one pixel: the second tap
+
+    APPLY_FILTER_8 0
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_8
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d16_h2_sse2) PRIVATE
+sym(vpx_filter_block1d16_h2_sse2):         ; horizontal 2-tap bilinear filter, 16 pixels wide, plain store
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 1]           ;same row one pixel to the right: the second tap
+    movdqa      xmm2, xmm0                  ;copies for the high 8 bytes of each tap
+    movdqa      xmm3, xmm1
+
+    APPLY_FILTER_16 0
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_16
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_h2_avg_sse2):      ; horizontal 2-tap bilinear filter, 4 pixels wide, averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16 bytes read; only bytes 0..4 are used)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;same row shifted by one pixel: the second tap
+
+    APPLY_FILTER_4 1
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_4
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_h2_avg_sse2):      ; horizontal 2-tap bilinear filter, 8 pixels wide, averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16 bytes read; only bytes 0..8 are used)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;same row shifted by one pixel: the second tap
+
+    APPLY_FILTER_8 1
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_8
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_h2_avg_sse2):     ; horizontal 2-tap bilinear filter, 16 pixels wide, averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 1]           ;same row one pixel to the right: the second tap
+    movdqa      xmm2, xmm0                  ;copies for the high 8 bytes of each tap
+    movdqa      xmm3, xmm1
+
+    APPLY_FILTER_16 1
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_16
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
new file mode 100644
index 0000000000..3c8cfd2253
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
@@ -0,0 +1,422 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0 ; load arguments and constants for the 4-wide filters (SSSE3)
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;two 16-bit words of 64, the rounding term for the >> 7
+
+    movdqa      xmm3, [rdx]                 ;load filters
+    psrldq      xmm3, 6                     ;drop words 0..2 so k3 and k4 become words 0 and 1
+    packsswb    xmm3, xmm3                  ;signed-saturate words to bytes: byte 0 = k3, byte 1 = k4
+    pshuflw     xmm3, xmm3, 0b              ;k3_k4
+
+    movq        xmm2, rcx                   ;rounding
+    pshufd      xmm2, xmm2, 0               ;rounding term broadcast to all 8 words
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1 ; arg1: 1 to average with dst, 0 to store; inputs xmm0 (first tap) and xmm1 (second tap)
+    punpcklbw   xmm0, xmm1                  ;interleave the two taps' bytes pixel by pixel
+    pmaddubsw   xmm0, xmm3                  ;first * k3 + second * k4 per pixel, as words
+
+    paddsw      xmm0, xmm2                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack to byte
+
+%if %1
+    movd        xmm1, [rdi]
+    pavgb       xmm0, xmm1                  ;average with the 4 bytes already in dst
+%endif
+    movd        [rdi], xmm0                 ;store 4 output bytes
+    lea         rsi, [rsi + rax]            ;next source row
+    lea         rdi, [rdi + rdx]            ;next output row
+    dec         rcx                         ;row counter; leaves ZF for the caller's jnz
+%endm
+
+%macro GET_PARAM 0 ; load arguments and constants for the 8- and 16-wide filters (SSSE3)
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040              ;two 16-bit words of 64, the rounding term for the >> 7
+
+    movdqa      xmm7, [rdx]                 ;load filters
+    psrldq      xmm7, 6                     ;drop words 0..2 so k3 and k4 become words 0 and 1
+    packsswb    xmm7, xmm7                  ;signed-saturate words to bytes: byte 0 = k3, byte 1 = k4
+    pshuflw     xmm7, xmm7, 0b              ;k3_k4
+    punpcklwd   xmm7, xmm7                  ;k3_k4 byte pair repeated in all 8 words
+
+    movq        xmm6, rcx                   ;rounding
+    pshufd      xmm6, xmm6, 0               ;rounding term broadcast to all 8 words
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1 ; arg1: 1 to average with dst, 0 to store; inputs xmm0 (first tap) and xmm1 (second tap)
+    punpcklbw   xmm0, xmm1                  ;interleave the two taps' bytes pixel by pixel
+    pmaddubsw   xmm0, xmm7                  ;first * k3 + second * k4 per pixel, as words
+
+    paddsw      xmm0, xmm6                  ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack back to byte
+
+%if %1
+    movq        xmm1, [rdi]
+    pavgb       xmm0, xmm1                  ;average with the 8 bytes already in dst
+%endif
+    movq        [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]            ;next source row
+    lea         rdi, [rdi + rdx]            ;next output row
+    dec         rcx                         ;row counter; leaves ZF for the caller's jnz
+%endm
+
+%macro APPLY_FILTER_16 1 ; arg1: 1 to average with dst, 0 to store; xmm0 and xmm2 = first tap, xmm1 = second tap
+    punpcklbw   xmm0, xmm1                  ;low 8 pixels: taps interleaved byte by byte
+    punpckhbw   xmm2, xmm1                  ;high 8 pixels: taps interleaved byte by byte
+    pmaddubsw   xmm0, xmm7                  ;first * k3 + second * k4 per pixel, as words
+    pmaddubsw   xmm2, xmm7
+
+    paddsw      xmm0, xmm6                  ;rounding
+    paddsw      xmm2, xmm6
+    psraw       xmm0, 7                     ;shift
+    psraw       xmm2, 7
+    packuswb    xmm0, xmm2                  ;pack back to byte
+
+%if %1
+    movdqu      xmm1, [rdi]
+    pavgb       xmm0, xmm1                  ;average with the 16 bytes already in dst
+%endif
+    movdqu      [rdi], xmm0                 ;store the result
+
+    lea         rsi, [rsi + rax]            ;next source row
+    lea         rdi, [rdi + rdx]            ;next output row
+    dec         rcx                         ;row counter; leaves ZF for the caller's jnz
+%endm
+
+global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v2_ssse3):         ; vertical 2-tap bilinear filter, 4 pixels wide, plain store
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]           ;same columns, next source row
+
+    APPLY_FILTER_4 0
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_4
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v2_ssse3):         ; vertical 2-tap bilinear filter, 8 pixels wide, plain store
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;current source row
+    movq        xmm1, [rsi + rax]           ;next source row
+
+    APPLY_FILTER_8 0
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_8
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v2_ssse3):        ; vertical 2-tap bilinear filter, 16 pixels wide, plain store
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;current source row
+    movdqu        xmm1, [rsi + rax]         ;next source row
+    movdqa        xmm2, xmm0                ;copy of the current row for the high 8 pixels
+
+    APPLY_FILTER_16 0
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_16
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v2_avg_ssse3):     ; vertical 2-tap bilinear filter, 4 pixels wide, averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movd        xmm0, [rsi]                 ;load src
+    movd        xmm1, [rsi + rax]           ;same columns, next source row
+
+    APPLY_FILTER_4 1
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_4
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v2_avg_ssse3):     ; vertical 2-tap bilinear filter, 8 pixels wide, averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movq        xmm0, [rsi]                 ;current source row
+    movq        xmm1, [rsi + rax]           ;next source row
+
+    APPLY_FILTER_8 1
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_8
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v2_avg_ssse3):    ; vertical 2-tap bilinear filter, 16 pixels wide, averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu        xmm0, [rsi]               ;current source row
+    movdqu        xmm1, [rsi + rax]         ;next source row
+    movdqa        xmm2, xmm0                ;copy of the current row for the high 8 pixels
+
+    APPLY_FILTER_16 1
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_16
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h2_ssse3):         ; horizontal 2-tap bilinear filter, 4 pixels wide, plain store
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16 bytes read; only the first few are stored)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;same row shifted by one pixel: the second tap
+
+    APPLY_FILTER_4 0
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_4
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h2_ssse3):         ; horizontal 2-tap bilinear filter, 8 pixels wide, plain store
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16 bytes read; only bytes 0..8 are used)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;same row shifted by one pixel: the second tap
+
+    APPLY_FILTER_8 0
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_8
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h2_ssse3):        ; horizontal 2-tap bilinear filter, 16 pixels wide, plain store
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 1]           ;same row one pixel to the right: the second tap
+    movdqa      xmm2, xmm0                  ;copy of the first tap for the high 8 pixels
+
+    APPLY_FILTER_16 0
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_16
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h2_avg_ssse3):     ; horizontal 2-tap bilinear filter, 4 pixels wide, averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM_4
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16 bytes read; only the first few are stored)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;same row shifted by one pixel: the second tap
+
+    APPLY_FILTER_4 1
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_4
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h2_avg_ssse3):     ; horizontal 2-tap bilinear filter, 8 pixels wide, averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]                 ;load src (16 bytes read; only bytes 0..8 are used)
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1                     ;same row shifted by one pixel: the second tap
+
+    APPLY_FILTER_8 1
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_8
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h2_avg_ssse3):    ; horizontal 2-tap bilinear filter, 16 pixels wide, averaged with dst
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0,   [rsi]               ;load src
+    movdqu      xmm1,   [rsi + 1]           ;same row one pixel to the right: the second tap
+    movdqa      xmm2, xmm0                  ;copy of the first tap for the high 8 pixels
+
+    APPLY_FILTER_16 1
+    jnz         .loop                       ;ZF comes from the dec rcx that ends APPLY_FILTER_16
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libvpx/vpx_mem/include/vpx_mem_intrnl.h b/libs/libvpx/vpx_mem/include/vpx_mem_intrnl.h
new file mode 100644
index 0000000000..c4dd78550f
--- /dev/null
+++ b/libs/libvpx/vpx_mem/include/vpx_mem_intrnl.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_
+#define VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_
+#include "./vpx_config.h"
+
+#define ADDRESS_STORAGE_SIZE      sizeof(size_t)  /* room for the saved raw malloc address */
+
+#ifndef DEFAULT_ALIGNMENT
+# if defined(VXWORKS)
+#  define DEFAULT_ALIGNMENT        32        /* default address alignment used
+                                              * by the vpx_* allocators other
+                                              * than vpx_memalign() */
+# else
+#  define DEFAULT_ALIGNMENT        (2 * sizeof(void*))  /* NOLINT */
+# endif
+#endif
+
+/*returns addr rounded up to a multiple of align (a power of two); the mask is formed in size_t so a 32-bit unsigned align cannot clear the upper address bits*/
+#define align_addr(addr,align) ((void*)(((size_t)(addr) + ((size_t)(align) - 1)) & ~((size_t)(align) - 1)))
+
+#endif  // VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_
diff --git a/libs/libvpx/vpx_mem/vpx_mem.c b/libs/libvpx/vpx_mem/vpx_mem.c
new file mode 100644
index 0000000000..b261fc0da1
--- /dev/null
+++ b/libs/libvpx/vpx_mem/vpx_mem.c
@@ -0,0 +1,100 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_mem.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "include/vpx_mem_intrnl.h"
+#include "vpx/vpx_integer.h"
+/* Returns size bytes aligned to align (assumed a power of two), or NULL. */
+void *vpx_memalign(size_t align, size_t size) {
+  const size_t overhead = align - 1 + ADDRESS_STORAGE_SIZE;
+  void *addr, *x = NULL;
+
+  /* Reject sizes whose padded total would wrap around size_t. */
+  if (size > (size_t)-1 - overhead) return NULL;
+  addr = malloc(size + overhead);
+  if (addr) {
+    x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align);
+    /* stash the real malloc address just below x for vpx_free() */
+    ((size_t *)x)[-1] = (size_t)addr;
+  }
+  return x;
+}
+/* Allocates size bytes aligned to DEFAULT_ALIGNMENT; release with vpx_free(). */
+void *vpx_malloc(size_t size) {
+  return vpx_memalign(DEFAULT_ALIGNMENT, size);
+}
+/* Allocates a zero-filled array of num elements of size bytes each, or NULL. */
+void *vpx_calloc(size_t num, size_t size) {
+  void *x;
+
+  /* num * size must not wrap around size_t, or the block would be too short. */
+  if (size != 0 && num > (size_t)-1 / size) return NULL;
+  x = vpx_memalign(DEFAULT_ALIGNMENT, num * size);
+  if (x) memset(x, 0, num * size);
+
+  return x;
+}
+/* realloc() counterpart for blocks returned by vpx_malloc()/vpx_calloc(). */
+void *vpx_realloc(void *memblk, size_t size) {
+  void *addr, *new_addr = NULL;
+  int align = DEFAULT_ALIGNMENT;
+
+  /*
+  The realloc() function changes the size of the object pointed to by
+  ptr to the size specified by size, and returns a pointer to the
+  possibly moved block. The contents are unchanged up to the lesser
+  of the new and old sizes. If ptr is null, realloc() behaves like
+  malloc() for the specified size. If size is zero (0) and ptr is
+  not a null pointer, the object pointed to is freed.
+  */
+  if (!memblk)
+    new_addr = vpx_malloc(size);
+  else if (!size)
+    vpx_free(memblk);
+  else if (size <= (size_t)-1 - align - ADDRESS_STORAGE_SIZE) {
+    /* guarded above: the padded request below cannot wrap around size_t;
+     * an oversized request returns NULL and leaves memblk valid */
+    addr = (void *)(((size_t *)memblk)[-1]);
+    new_addr = realloc(addr, size + align + ADDRESS_STORAGE_SIZE);
+    if (new_addr) {
+      addr = new_addr;
+      /* NOTE: the payload is not moved, so this relies on realloc() returning
+       * a block with the same address modulo align as the old one */
+      new_addr = (void *)(((size_t)
+                           ((unsigned char *)new_addr + ADDRESS_STORAGE_SIZE) + (align - 1)) &
+                          (size_t) - align);
+      /* save the actual malloc address */
+      ((size_t *)new_addr)[-1] = (size_t)addr;
+    }
+  }
+
+  return new_addr;
+}
+/* Releases a block returned by the vpx_* allocators; NULL is ignored. */
+void vpx_free(void *memblk) {
+  if (!memblk) return;
+
+  /* the word just below memblk holds the pointer malloc() really returned */
+  free((void *)(((size_t *)memblk)[-1]));
+}
+/* Stores val (narrowed to 16 bits) into length consecutive uint16_t slots. */
+#if CONFIG_VP9_HIGHBITDEPTH
+void *vpx_memset16(void *dest, int val, size_t length) {
+  uint16_t *const out = (uint16_t *)dest;
+  size_t idx = 0;
+
+  while (idx < length) out[idx++] = val;
+  return dest;  /* same pointer that was passed in, like memset() */
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libs/libvpx/vpx_mem/vpx_mem.h b/libs/libvpx/vpx_mem/vpx_mem.h
new file mode 100644
index 0000000000..a006e0f00b
--- /dev/null
+++ b/libs/libvpx/vpx_mem/vpx_mem.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_MEM_VPX_MEM_H_
+#define VPX_MEM_VPX_MEM_H_
+
+#include "vpx_config.h"
+#if defined(__uClinux__)
+# include <lddk.h>
+#endif
+
+#include <stdlib.h>
+#include <stddef.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+  void *vpx_memalign(size_t align, size_t size);
+  void *vpx_malloc(size_t size);
+  void *vpx_calloc(size_t num, size_t size);
+  void *vpx_realloc(void *memblk, size_t size);
+  void vpx_free(void *memblk);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  void *vpx_memset16(void *dest, int val, size_t length);
+#endif
+
+#include <string.h>
+
+#ifdef VPX_MEM_PLTFRM
+# include VPX_MEM_PLTFRM
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif  // VPX_MEM_VPX_MEM_H_
diff --git a/libs/libvpx/vpx_mem/vpx_mem.mk b/libs/libvpx/vpx_mem/vpx_mem.mk
new file mode 100644
index 0000000000..7f275eabf9
--- /dev/null
+++ b/libs/libvpx/vpx_mem/vpx_mem.mk
@@ -0,0 +1,4 @@
+MEM_SRCS-yes += vpx_mem.mk
+MEM_SRCS-yes += vpx_mem.c
+MEM_SRCS-yes += vpx_mem.h
+MEM_SRCS-yes += include/vpx_mem_intrnl.h
diff --git a/libs/libvpx/vpx_ports/arm.h b/libs/libvpx/vpx_ports/arm.h
new file mode 100644
index 0000000000..42c98f5a83
--- /dev/null
+++ b/libs/libvpx/vpx_ports/arm.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_PORTS_ARM_H_
+#define VPX_PORTS_ARM_H_
+#include <stdlib.h>
+#include "vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*ARMv5TE "Enhanced DSP" instructions.*/
+#define HAS_EDSP  0x01
+/*ARMv6 "Parallel" or "Media" instructions.*/
+#define HAS_MEDIA 0x02
+/*ARMv7 optional NEON instructions.*/
+#define HAS_NEON  0x04
+
+int arm_cpu_caps(void);
+
+// Earlier gcc compilers have issues with some neon intrinsics
+#if !defined(__clang__) && defined(__GNUC__) && \
+    __GNUC__ == 4 && __GNUC_MINOR__ <= 6
+#define VPX_INCOMPATIBLE_GCC
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_PORTS_ARM_H_
+
diff --git a/libs/libvpx/vpx_ports/arm_cpudetect.c b/libs/libvpx/vpx_ports/arm_cpudetect.c
new file mode 100644
index 0000000000..8a4b8af964
--- /dev/null
+++ b/libs/libvpx/vpx_ports/arm_cpudetect.c
@@ -0,0 +1,175 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "vpx_ports/arm.h"
+#include "./vpx_config.h"
+
+#ifdef WINAPI_FAMILY
+#include <winapifamily.h>
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define getenv(x) NULL
+#endif
+#endif
+/* Reads VPX_SIMD_CAPS into *flags: returns 0 if set, else -1 with *flags = 0. */
+static int arm_cpu_env_flags(int *flags) {
+  const char *const caps = getenv("VPX_SIMD_CAPS");
+  if (caps == NULL || *caps == '\0') {
+    *flags = 0;
+    return -1;
+  }
+  /* base 0 lets strtol accept decimal, 0x-hex and 0-octal spellings */
+  *flags = (int)strtol(caps, NULL, 0);
+  return 0;
+}
+/* Returns the VPX_SIMD_CAPS_MASK value, or all bits set when unset or empty. */
+static int arm_cpu_env_mask(void) {
+  const char *const text = getenv("VPX_SIMD_CAPS_MASK");
+  if (text == NULL || *text == '\0') return ~0;
+  return (int)strtol(text, NULL, 0);
+}
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+/* Build-time variant: reports the features this build was configured with. */
+int arm_cpu_caps(void) {
+  /* This function should actually be a no-op. There is no way to adjust any of
+   * these because the RTCD tables do not exist: the functions are called
+   * statically */
+  int caps;
+  int allowed;
+
+  /* a VPX_SIMD_CAPS override is returned as-is, without applying the mask */
+  if (arm_cpu_env_flags(&caps) == 0) return caps;
+  allowed = arm_cpu_env_mask();
+#if HAVE_MEDIA
+  caps |= HAS_MEDIA;
+#endif /* HAVE_MEDIA */
+#if HAVE_NEON || HAVE_NEON_ASM
+  caps |= HAS_NEON;
+#endif /* HAVE_NEON  || HAVE_NEON_ASM */
+  return caps & allowed;
+}
+
+#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+#define WIN32_LEAN_AND_MEAN
+#define WIN32_EXTRA_LEAN
+#include <windows.h>
+/* MSVC variant: probes each feature by executing one instruction under SEH. */
+int arm_cpu_caps(void) {
+  int flags;
+  int mask;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+  /* MSVC has no inline __asm support for ARM, but it does let you __emit
+   *  instructions via their assembled hex code.
+   * All of these instructions should be essentially nops.
+   */
+#if HAVE_MEDIA
+  if (mask & HAS_MEDIA) { /* brace is required: it pairs with the one before #endif */
+    __try {
+      /*SHADD8 r3,r3,r3*/
+      __emit(0xE6333F93);
+      flags |= HAS_MEDIA;
+    } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
+      /*Ignore exception.*/
+    }
+  }
+#endif /* HAVE_MEDIA */
+#if HAVE_NEON || HAVE_NEON_ASM
+  if (mask & HAS_NEON) {
+    __try {
+      /*VORR q0,q0,q0*/
+      __emit(0xF2200150);
+      flags |= HAS_NEON;
+    } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
+      /*Ignore exception.*/
+    }
+  }
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
+  return flags & mask;
+}
+
+#elif defined(__ANDROID__) /* end _MSC_VER */
+#include <cpu-features.h>
+/* Android variant: asks android_getCpuFeatures() whether NEON is present. */
+int arm_cpu_caps(void) {
+  int caps;
+  int allowed;
+  uint64_t cpu_features;
+
+  /* a VPX_SIMD_CAPS override is returned as-is, without applying the mask */
+  if (arm_cpu_env_flags(&caps) == 0) return caps;
+  allowed = arm_cpu_env_mask();
+  cpu_features = android_getCpuFeatures();
+#if HAVE_MEDIA
+  caps |= HAS_MEDIA;  /* not probed: taken from the build configuration */
+#endif /* HAVE_MEDIA */
+#if HAVE_NEON || HAVE_NEON_ASM
+  if ((cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) != 0) {
+    caps |= HAS_NEON;
+  }
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
+  return caps & allowed;
+}
+
+#elif defined(__linux__) /* end __ANDROID__ */
+
+#include <stdio.h>
+/* Linux variant: detects features by scanning /proc/cpuinfo line by line. */
+int arm_cpu_caps(void) {
+  FILE *fin;
+  int flags;
+  int mask;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+  /* Reading /proc/self/auxv would be easier, but that doesn't work reliably
+   *  on Android.
+   * This also means that detection will fail in Scratchbox.
+   */
+  fin = fopen("/proc/cpuinfo", "r");
+  if (fin != NULL) {
+    /* 512 should be enough for anybody (it's even enough for all the flags
+     * that x86 has accumulated... so far).
+     */
+    char buf[512];
+    while (fgets(buf, 511, fin) != NULL) {
+#if HAVE_NEON || HAVE_NEON_ASM
+      if (strncmp(buf, "Features", 8) == 0) { /* stops at NUL on short lines */
+        char *p;
+        p = strstr(buf, " neon");
+        if (p != NULL && (p[5] == ' ' || p[5] == '\n' || p[5] == '\0')) {
+          flags |= HAS_NEON;
+        }
+      }
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
+#if HAVE_MEDIA
+      if (strncmp(buf, "CPU architecture:", 17) == 0) {
+        int version;
+        version = atoi(buf + 17);
+        if (version >= 6) {
+          flags |= HAS_MEDIA;
+        }
+      }
+#endif /* HAVE_MEDIA */
+    }
+    fclose(fin);
+  }
+  return flags & mask;
+}
+#else /* end __linux__ */
+#error "--enable-runtime-cpu-detect selected, but no CPU detection method " \
+"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
+#endif
diff --git a/libs/libvpx/vpx_ports/bitops.h b/libs/libvpx/vpx_ports/bitops.h
new file mode 100644
index 0000000000..84ff3659fe
--- /dev/null
+++ b/libs/libvpx/vpx_ports/bitops.h
@@ -0,0 +1,76 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_BITOPS_H_
+#define VPX_PORTS_BITOPS_H_
+
+#include <assert.h>
+
+#include "vpx_ports/msvc.h"
+
+#ifdef _MSC_VER
+# include <math.h>  // the ceil() definition must precede intrin.h
+# if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86))
+#  include <intrin.h>
+#  define USE_MSC_INTRINSICS
+# endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// These versions of get_msb() are only valid when n != 0 because all
+// of the optimized versions are undefined when n == 0:
+// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static INLINE int get_msb(unsigned int n) {
+  assert(n != 0);  // __builtin_clz(0) is undefined
+  return 31 - __builtin_clz(n);  // leading-zero count -> index of top set bit
+}
+#elif defined(USE_MSC_INTRINSICS)
+#pragma intrinsic(_BitScanReverse)
+// MSVC variant: _BitScanReverse reports the index of the highest set bit.
+static INLINE int get_msb(unsigned int n) {
+  unsigned long first_set_bit;
+  assert(n != 0);
+  _BitScanReverse(&first_set_bit, n);
+  return first_set_bit;
+}
+#undef USE_MSC_INTRINSICS
+#else
+// Portable fallback. Returns (int)floor(log2(n)). n must be > 0.
+static INLINE int get_msb(unsigned int n) {
+  unsigned int rest = n;
+  int msb = 0;
+  int shift;
+
+  assert(n != 0);
+
+  // Binary search over shift widths 16, 8, 4, 2, 1: whenever bits remain
+  // above the current width, discard the low part and credit the width.
+  for (shift = 16; shift > 0; shift >>= 1) {
+    if ((rest >> shift) != 0) {
+      rest >>= shift;
+      msb += shift;
+    }
+  }
+  return msb;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_PORTS_BITOPS_H_
diff --git a/libs/libvpx/vpx_ports/config.h b/libs/libvpx/vpx_ports/config.h
new file mode 100644
index 0000000000..3c1ab99f4a
--- /dev/null
+++ b/libs/libvpx/vpx_ports/config.h
@@ -0,0 +1,16 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_CONFIG_H_
+#define VPX_PORTS_CONFIG_H_
+
+#include "vpx_config.h"
+
+#endif  // VPX_PORTS_CONFIG_H_
diff --git a/libs/libvpx/vpx_ports/emmintrin_compat.h b/libs/libvpx/vpx_ports/emmintrin_compat.h
new file mode 100644
index 0000000000..16176383d2
--- /dev/null
+++ b/libs/libvpx/vpx_ports/emmintrin_compat.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_EMMINTRIN_COMPAT_H_
+#define VPX_PORTS_EMMINTRIN_COMPAT_H_
+
+#if defined(__GNUC__) && __GNUC__ < 4
+/* From emmintrin.h (gcc 4.5.3) */
+/* Casts between various SP, DP, INT vector types.  Note that these do no
+   conversion of values, they just change the type.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_ps(__m128d __A)
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_si128(__m128d __A)
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_pd(__m128 __A)
+{
+  return (__m128d) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_si128(__m128 __A)
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_ps(__m128i __A)
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_pd(__m128i __A)
+{
+  return (__m128d) __A;
+}
+#endif
+
+#endif  // VPX_PORTS_EMMINTRIN_COMPAT_H_
diff --git a/libs/libvpx/vpx_ports/emms.asm b/libs/libvpx/vpx_ports/emms.asm
new file mode 100644
index 0000000000..db8da28737
--- /dev/null
+++ b/libs/libvpx/vpx_ports/emms.asm
@@ -0,0 +1,38 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+section .text
+global sym(vpx_reset_mmx_state) PRIVATE
+sym(vpx_reset_mmx_state):
+    emms                        ; leave MMX state so the x87 register stack is usable again
+    ret
+
+
+%if LIBVPX_YASM_WIN64
+global sym(vpx_winx64_fldcw) PRIVATE
+sym(vpx_winx64_fldcw):
+    sub   rsp, 8                ; scratch slot: fldcw only takes a memory operand
+    mov   [rsp], rcx ; win x64 specific: the first argument arrives in rcx
+    fldcw [rsp]                 ; load the x87 control word from the low 16 bits of the slot
+    add   rsp, 8
+    ret
+
+; Returns the current x87 control word; only the low 16 bits of rax are meaningful.
+global sym(vpx_winx64_fstcw) PRIVATE
+sym(vpx_winx64_fstcw):
+    sub   rsp, 8
+    fstcw [rsp]                 ; writes just 2 bytes of the slot
+    mov   rax, [rsp]            ; 8-byte load: the upper 48 bits are whatever was on the stack
+    add   rsp, 8
+    ret
+%endif
diff --git a/libs/libvpx/vpx_ports/mem.h b/libs/libvpx/vpx_ports/mem.h
new file mode 100644
index 0000000000..7502f90632
--- /dev/null
+++ b/libs/libvpx/vpx_ports/mem.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_PORTS_MEM_H_
+#define VPX_PORTS_MEM_H_
+
+#include "vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C)
+#define DECLARE_ALIGNED(n,typ,val)  typ val __attribute__ ((aligned (n)))
+#elif defined(_MSC_VER)
+#define DECLARE_ALIGNED(n,typ,val)  __declspec(align(n)) typ val
+#else
+#warning No alignment directives known for this compiler.
+#define DECLARE_ALIGNED(n,typ,val)  typ val
+#endif
+
+/* Indicates that the usage of the specified variable has been audited to assure
+ * that it's safe to use uninitialized. Silences 'may be used uninitialized'
+ * warnings on gcc.
+ */
+#if defined(__GNUC__) && __GNUC__
+#define UNINITIALIZED_IS_SAFE(x) x=x
+#else
+#define UNINITIALIZED_IS_SAFE(x) x
+#endif
+
+#if HAVE_NEON && defined(_MSC_VER)
+#define __builtin_prefetch(x)
+#endif
+
+/* Shift down with rounding: adds half of 2^n before shifting right by n (n >= 1). */
+#define ROUND_POWER_OF_TWO(value, n) \
+    (((value) + (1 << ((n) - 1))) >> (n))
+/* Rounds value up to the next multiple of 2^n. */
+#define ALIGN_POWER_OF_TWO(value, n) \
+    (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
+/* High-bitdepth pointer tagging: SHORTPTR doubles the address, BYTEPTR halves it back. The argument is parenthesised so an expression such as p + 1 is converted as a whole. */
+#if CONFIG_VP9_HIGHBITDEPTH
+#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)(x)) << 1))
+#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)(x)) >> 1))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // VPX_PORTS_MEM_H_
diff --git a/libs/libvpx/vpx_ports/mem_ops.h b/libs/libvpx/vpx_ports/mem_ops.h
new file mode 100644
index 0000000000..d4a3d773f3
--- /dev/null
+++ b/libs/libvpx/vpx_ports/mem_ops.h
@@ -0,0 +1,226 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_MEM_OPS_H_
+#define VPX_PORTS_MEM_OPS_H_
+
+/* \file
+ * \brief Provides portable memory access primitives
+ *
+ * This file provides portable primitives for getting and setting of
+ * signed and unsigned integers in 16, 24, and 32 bit sizes. The operations
+ * can be performed on unaligned data regardless of hardware support for
+ * unaligned accesses.
+ *
+ * The type used to pass the integral values may be changed by defining
+ * MEM_VALUE_T with the appropriate type. The type given must be an integral
+ * numeric type.
+ *
+ * The actual functions instantiated have the MEM_VALUE_T type name pasted
+ * on to the symbol name. This allows the developer to instantiate these
+ * operations for multiple types within the same translation unit. This is
+ * of somewhat questionable utility, but the capability exists nonetheless.
+ * Users not making use of this functionality should call the functions
+ * without the type name appended, and the preprocessor will take care of
+ * it.
+ *
+ * NOTE: This code is not supported on platforms where char > 1 octet ATM.
+ */
+
+#ifndef MAU_T
+/* Minimum Access Unit for this target */
+#define MAU_T unsigned char
+#endif
+
+#ifndef MEM_VALUE_T
+#define MEM_VALUE_T int
+#endif
+
+#undef MEM_VALUE_T_SZ_BITS
+#define MEM_VALUE_T_SZ_BITS (sizeof(MEM_VALUE_T) << 3)
+
+#undef  mem_ops_wrap_symbol
+#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T)
+#undef  mem_ops_wrap_symbol2
+#define mem_ops_wrap_symbol2(fn,typ) mem_ops_wrap_symbol3(fn,typ)
+#undef  mem_ops_wrap_symbol3
+#define mem_ops_wrap_symbol3(fn,typ) fn##_as_##typ
+
+/*
+ * Include aligned access routines
+ */
+#define INCLUDED_BY_MEM_OPS_H
+#include "mem_ops_aligned.h"
+#undef  INCLUDED_BY_MEM_OPS_H
+
+#undef  mem_get_be16
+#define mem_get_be16 mem_ops_wrap_symbol(mem_get_be16)
+/* Reads an unsigned 16-bit big-endian value from the two bytes at vmem;
+ * vmem needs no particular alignment. */
+static unsigned MEM_VALUE_T mem_get_be16(const void *vmem) {
+  const MAU_T *const bytes = (const MAU_T *)vmem;
+  const unsigned MEM_VALUE_T hi = bytes[0] << 8;
+
+  return hi | bytes[1];
+}
+
+#undef  mem_get_be24
+#define mem_get_be24 mem_ops_wrap_symbol(mem_get_be24)
+/* Reads an unsigned 24-bit big-endian value from the three bytes at vmem;
+ * vmem needs no particular alignment. */
+static unsigned MEM_VALUE_T mem_get_be24(const void *vmem) {
+  const MAU_T *const bytes = (const MAU_T *)vmem;
+  unsigned MEM_VALUE_T acc = bytes[0] << 16;
+
+  acc |= bytes[1] << 8;
+  return acc | bytes[2];
+}
+
+#undef  mem_get_be32
+#define mem_get_be32 mem_ops_wrap_symbol(mem_get_be32)
+/* Reads an unsigned 32-bit big-endian value from the four bytes at vmem.
+ * Each byte is widened to the unsigned result type before shifting: a
+ * promoted-int byte >= 0x80 shifted left by 24 overflows signed int. */
+static unsigned MEM_VALUE_T mem_get_be32(const void *vmem) {
+  const MAU_T *mem = (const MAU_T *)vmem;
+  unsigned MEM_VALUE_T val = (unsigned MEM_VALUE_T)mem[0] << 24;
+  val |= (unsigned MEM_VALUE_T)mem[1] << 16;
+  val |= (unsigned MEM_VALUE_T)mem[2] << 8;
+  return val | mem[3];
+}
+
+#undef  mem_get_le16
+#define mem_get_le16 mem_ops_wrap_symbol(mem_get_le16)
+/* Reads an unsigned 16-bit little-endian value from the two bytes at vmem;
+ * vmem needs no particular alignment. */
+static unsigned MEM_VALUE_T mem_get_le16(const void *vmem) {
+  const MAU_T *const bytes = (const MAU_T *)vmem;
+  const unsigned MEM_VALUE_T hi = bytes[1] << 8;
+
+  return hi | bytes[0];
+}
+
+#undef  mem_get_le24
+#define mem_get_le24 mem_ops_wrap_symbol(mem_get_le24)
+/* Reads an unsigned 24-bit little-endian value from the three bytes at vmem;
+ * vmem needs no particular alignment. */
+static unsigned MEM_VALUE_T mem_get_le24(const void *vmem) {
+  const MAU_T *const bytes = (const MAU_T *)vmem;
+  unsigned MEM_VALUE_T acc = bytes[2] << 16;
+
+  acc |= bytes[1] << 8;
+  return acc | bytes[0];
+}
+
+#undef  mem_get_le32
+#define mem_get_le32 mem_ops_wrap_symbol(mem_get_le32)
+/* Reads an unsigned 32-bit little-endian value from the four bytes at vmem.
+ * Each byte is widened to the unsigned result type before shifting: a
+ * promoted-int byte >= 0x80 shifted left by 24 overflows signed int. */
+static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) {
+  const MAU_T *mem = (const MAU_T *)vmem;
+  unsigned MEM_VALUE_T val = (unsigned MEM_VALUE_T)mem[3] << 24;
+  val |= (unsigned MEM_VALUE_T)mem[2] << 16;
+  val |= (unsigned MEM_VALUE_T)mem[1] << 8;
+  return val | mem[0];
+}
+/* Defines mem_get_s<end><sz>(): reads through mem_get_<end><sz>() and sign-extends from bit sz-1 by shifting the value to the top of MEM_VALUE_T and back down (relies on >> of a negative signed value being arithmetic). */
+#define mem_get_s_generic(end,sz) \
+  static VPX_INLINE signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) {\
+    const MAU_T *mem = (const MAU_T*)vmem;\
+    signed MEM_VALUE_T val = mem_get_##end##sz(mem);\
+    return (val << (MEM_VALUE_T_SZ_BITS - sz)) >> (MEM_VALUE_T_SZ_BITS - sz);\
+  }
+
+#undef  mem_get_sbe16
+#define mem_get_sbe16 mem_ops_wrap_symbol(mem_get_sbe16)
+mem_get_s_generic(be, 16)
+
+#undef  mem_get_sbe24
+#define mem_get_sbe24 mem_ops_wrap_symbol(mem_get_sbe24)
+mem_get_s_generic(be, 24)
+
+#undef  mem_get_sbe32
+#define mem_get_sbe32 mem_ops_wrap_symbol(mem_get_sbe32)
+mem_get_s_generic(be, 32)
+
+#undef  mem_get_sle16
+#define mem_get_sle16 mem_ops_wrap_symbol(mem_get_sle16)
+mem_get_s_generic(le, 16)
+
+#undef  mem_get_sle24
+#define mem_get_sle24 mem_ops_wrap_symbol(mem_get_sle24)
+mem_get_s_generic(le, 24)
+
+#undef  mem_get_sle32
+#define mem_get_sle32 mem_ops_wrap_symbol(mem_get_sle32)
+mem_get_s_generic(le, 32)
+
+#undef  mem_put_be16
+#define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16)
+/* Stores the low 16 bits of val at vmem, most significant byte first. */
+static VPX_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) {
+  MAU_T *const out = (MAU_T *)vmem;
+  out[1] = val & 0xff;
+  out[0] = (val >> 8) & 0xff;
+}
+
+#undef  mem_put_be24
+#define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24)
+/* Stores the low 24 bits of val at vmem, most significant byte first. */
+static VPX_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) {
+  MAU_T *const out = (MAU_T *)vmem;
+  out[2] = val & 0xff;
+  out[1] = (val >> 8) & 0xff;
+  out[0] = (val >> 16) & 0xff;
+}
+
+#undef  mem_put_be32
+#define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32)
+/* Stores the low 32 bits of val at vmem, most significant byte first. */
+static VPX_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) {
+  MAU_T *const out = (MAU_T *)vmem;
+  out[3] = val & 0xff;
+  out[2] = (val >> 8) & 0xff;
+  out[1] = (val >> 16) & 0xff;
+  out[0] = (val >> 24) & 0xff;
+}
+
+#undef  mem_put_le16
+#define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16)
+/* Stores the low 16 bits of val at vmem, least significant byte first. */
+static VPX_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) {
+  MAU_T *const out = (MAU_T *)vmem;
+  out[1] = (val >> 8) & 0xff;
+  out[0] = val & 0xff;
+}
+
+#undef  mem_put_le24
+#define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24)
+/* Stores the low 24 bits of val at vmem, least significant byte first. */
+static VPX_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) {
+  MAU_T *const out = (MAU_T *)vmem;
+  out[2] = (val >> 16) & 0xff;
+  out[1] = (val >> 8) & 0xff;
+  out[0] = val & 0xff;
+}
+
+#undef  mem_put_le32
+#define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32)
+/* Stores the low 32 bits of val at vmem, least significant byte first. */
+static VPX_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) {
+  MAU_T *const out = (MAU_T *)vmem;
+  out[3] = (val >> 24) & 0xff;
+  out[2] = (val >> 16) & 0xff;
+  out[1] = (val >> 8) & 0xff;
+  out[0] = val & 0xff;
+}
+
+#endif  // VPX_PORTS_MEM_OPS_H_
diff --git a/libs/libvpx/vpx_ports/mem_ops_aligned.h b/libs/libvpx/vpx_ports/mem_ops_aligned.h
new file mode 100644
index 0000000000..c16111fec5
--- /dev/null
+++ b/libs/libvpx/vpx_ports/mem_ops_aligned.h
@@ -0,0 +1,169 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_MEM_OPS_ALIGNED_H_
+#define VPX_PORTS_MEM_OPS_ALIGNED_H_
+
+#include "vpx/vpx_integer.h"
+
+/* \file
+ * \brief Provides portable memory access primitives for operating on aligned
+ *        data
+ *
+ * This file is split from mem_ops.h for easier maintenance. See mem_ops.h
+ * for a more detailed description of these primitives.
+ */
+#ifndef INCLUDED_BY_MEM_OPS_H
+#error Include mem_ops.h, not mem_ops_aligned.h directly.
+#endif
+
+/* Byte-swap helpers; architectures with a byte-swap instruction could
+ * redefine them. The _se forms sign-extend the swapped value; the 16-bit one
+ * uses xor/subtract because shifting an unsigned val right never would. */
+#define swap_endian_16(val,raw) do {\
+    val = (((raw)>>8) & 0x00ff) \
+          | (((raw)<<8) & 0xff00);\
+  } while(0)
+#define swap_endian_32(val,raw) do {\
+    val = (((raw)>>24) & 0x000000ff) \
+          | (((raw)>>8)  & 0x0000ff00) \
+          | (((raw)<<8)  & 0x00ff0000) \
+          | (((raw)<<24) & 0xff000000); \
+  } while(0)
+#define swap_endian_16_se(val,raw) do {\
+    swap_endian_16(val,raw);\
+    val = (((val) ^ 0x8000) - 0x8000);\
+  } while(0)
+#define swap_endian_32_se(val,raw) swap_endian_32(val,raw)
+
+#define mem_get_ne_aligned_generic(end,sz) \
+  static VPX_INLINE unsigned MEM_VALUE_T \
+    mem_get_##end##sz##_aligned(const void *vmem) {\
+    const uint##sz##_t *mem = (const uint##sz##_t *)vmem;\
+    return *mem;\
+  }
+
+#define mem_get_sne_aligned_generic(end,sz) \
+  static VPX_INLINE signed MEM_VALUE_T \
+    mem_get_s##end##sz##_aligned(const void *vmem) {\
+    const int##sz##_t *mem = (const int##sz##_t *)vmem;\
+    return *mem;\
+  }
+
+#define mem_get_se_aligned_generic(end,sz) \
+  static VPX_INLINE unsigned MEM_VALUE_T \
+    mem_get_##end##sz##_aligned(const void *vmem) {\
+    const uint##sz##_t *mem = (const uint##sz##_t *)vmem;\
+    unsigned MEM_VALUE_T val, raw = *mem;\
+    swap_endian_##sz(val,raw);\
+    return val;\
+  }
+
+#define mem_get_sse_aligned_generic(end,sz) \
+  static VPX_INLINE signed MEM_VALUE_T \
+    mem_get_s##end##sz##_aligned(const void *vmem) {\
+    const int##sz##_t *mem = (const int##sz##_t *)vmem;\
+    unsigned MEM_VALUE_T val, raw = *mem;\
+    swap_endian_##sz##_se(val,raw);\
+    return val;\
+  }
+
+#define mem_put_ne_aligned_generic(end,sz) \
+  static VPX_INLINE void \
+    mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\
+    uint##sz##_t *mem = (uint##sz##_t *)vmem;\
+    *mem = (uint##sz##_t)val;\
+  }
+
+#define mem_put_se_aligned_generic(end,sz) \
+  static VPX_INLINE void \
+    mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\
+    uint##sz##_t *mem = (uint##sz##_t *)vmem, raw;\
+    swap_endian_##sz(raw,val);\
+    *mem = (uint##sz##_t)raw;\
+  }
+
+#include "vpx_config.h"
+#if CONFIG_BIG_ENDIAN
+#define mem_get_be_aligned_generic(sz)  mem_get_ne_aligned_generic(be,sz)
+#define mem_get_sbe_aligned_generic(sz) mem_get_sne_aligned_generic(be,sz)
+#define mem_get_le_aligned_generic(sz)  mem_get_se_aligned_generic(le,sz)
+#define mem_get_sle_aligned_generic(sz) mem_get_sse_aligned_generic(le,sz)
+#define mem_put_be_aligned_generic(sz)  mem_put_ne_aligned_generic(be,sz)
+#define mem_put_le_aligned_generic(sz)  mem_put_se_aligned_generic(le,sz)
+#else
+#define mem_get_be_aligned_generic(sz)  mem_get_se_aligned_generic(be,sz)
+#define mem_get_sbe_aligned_generic(sz) mem_get_sse_aligned_generic(be,sz)
+#define mem_get_le_aligned_generic(sz)  mem_get_ne_aligned_generic(le,sz)
+#define mem_get_sle_aligned_generic(sz) mem_get_sne_aligned_generic(le,sz)
+#define mem_put_be_aligned_generic(sz)  mem_put_se_aligned_generic(be,sz)
+#define mem_put_le_aligned_generic(sz)  mem_put_ne_aligned_generic(le,sz)
+#endif
+
+#undef  mem_get_be16_aligned
+#define mem_get_be16_aligned mem_ops_wrap_symbol(mem_get_be16_aligned)
+mem_get_be_aligned_generic(16)
+
+#undef  mem_get_be32_aligned
+#define mem_get_be32_aligned mem_ops_wrap_symbol(mem_get_be32_aligned)
+mem_get_be_aligned_generic(32)
+
+#undef  mem_get_le16_aligned
+#define mem_get_le16_aligned mem_ops_wrap_symbol(mem_get_le16_aligned)
+mem_get_le_aligned_generic(16)
+
+#undef  mem_get_le32_aligned
+#define mem_get_le32_aligned mem_ops_wrap_symbol(mem_get_le32_aligned)
+mem_get_le_aligned_generic(32)
+
+#undef  mem_get_sbe16_aligned
+#define mem_get_sbe16_aligned mem_ops_wrap_symbol(mem_get_sbe16_aligned)
+mem_get_sbe_aligned_generic(16)
+
+#undef  mem_get_sbe32_aligned
+#define mem_get_sbe32_aligned mem_ops_wrap_symbol(mem_get_sbe32_aligned)
+mem_get_sbe_aligned_generic(32)
+
+#undef  mem_get_sle16_aligned
+#define mem_get_sle16_aligned mem_ops_wrap_symbol(mem_get_sle16_aligned)
+mem_get_sle_aligned_generic(16)
+
+#undef  mem_get_sle32_aligned
+#define mem_get_sle32_aligned mem_ops_wrap_symbol(mem_get_sle32_aligned)
+mem_get_sle_aligned_generic(32)
+
+#undef  mem_put_be16_aligned
+#define mem_put_be16_aligned mem_ops_wrap_symbol(mem_put_be16_aligned)
+mem_put_be_aligned_generic(16)
+
+#undef  mem_put_be32_aligned
+#define mem_put_be32_aligned mem_ops_wrap_symbol(mem_put_be32_aligned)
+mem_put_be_aligned_generic(32)
+
+#undef  mem_put_le16_aligned
+#define mem_put_le16_aligned mem_ops_wrap_symbol(mem_put_le16_aligned)
+mem_put_le_aligned_generic(16)
+
+#undef  mem_put_le32_aligned
+#define mem_put_le32_aligned mem_ops_wrap_symbol(mem_put_le32_aligned)
+mem_put_le_aligned_generic(32)
+
+#undef mem_get_ne_aligned_generic
+#undef mem_get_se_aligned_generic
+#undef mem_get_sne_aligned_generic
+#undef mem_get_sse_aligned_generic
+#undef mem_put_ne_aligned_generic
+#undef mem_put_se_aligned_generic
+#undef swap_endian_16
+#undef swap_endian_32
+#undef swap_endian_16_se
+#undef swap_endian_32_se
+
+#endif  // VPX_PORTS_MEM_OPS_ALIGNED_H_
diff --git a/libs/libvpx/vpx_ports/msvc.h b/libs/libvpx/vpx_ports/msvc.h
new file mode 100644
index 0000000000..cab77405f4
--- /dev/null
+++ b/libs/libvpx/vpx_ports/msvc.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_MSVC_H_
+#define VPX_PORTS_MSVC_H_
+#ifdef _MSC_VER
+
+#include "./vpx_config.h"
+
+# if _MSC_VER < 1900  // VS2015 provides snprintf
+#  define snprintf _snprintf
+# endif  // _MSC_VER < 1900
+
+#if _MSC_VER < 1800  // VS2013 provides round
+#include <math.h>
+static INLINE double round(double x) {
+  if (x < 0)
+    return ceil(x - 0.5);
+  else
+    return floor(x + 0.5);
+}
+#endif  // _MSC_VER < 1800
+
+#endif  // _MSC_VER
+#endif  // VPX_PORTS_MSVC_H_
diff --git a/libs/libvpx/vpx_ports/system_state.h b/libs/libvpx/vpx_ports/system_state.h
new file mode 100644
index 0000000000..086c64681f
--- /dev/null
+++ b/libs/libvpx/vpx_ports/system_state.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_SYSTEM_STATE_H_
+#define VPX_PORTS_SYSTEM_STATE_H_
+
+#include "./vpx_config.h"
+
+#if ARCH_X86 || ARCH_X86_64
+void vpx_reset_mmx_state(void);
+#define vpx_clear_system_state() vpx_reset_mmx_state()
+#else
+#define vpx_clear_system_state()
+#endif  // ARCH_X86 || ARCH_X86_64
+#endif  // VPX_PORTS_SYSTEM_STATE_H_
diff --git a/libs/libvpx/vpx_ports/vpx_once.h b/libs/libvpx/vpx_ports/vpx_once.h
new file mode 100644
index 0000000000..da04db4590
--- /dev/null
+++ b/libs/libvpx/vpx_ports/vpx_once.h
@@ -0,0 +1,150 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_VPX_ONCE_H_
+#define VPX_PORTS_VPX_ONCE_H_
+
+#include "vpx_config.h"
+
+/* Implement a function wrapper to guarantee initialization
+ * thread-safety for library singletons.
+ *
+ * NOTE: These functions use static locks, and can only be
+ * used with one common argument per compilation unit. So
+ *
+ * file1.c:
+ *   vpx_once(foo);
+ *   ...
+ *   vpx_once(foo);
+ *
+ *   file2.c:
+ *     vpx_once(bar);
+ *
+ * will ensure foo() and bar() are each called only once, but in
+ *
+ * file1.c:
+ *   vpx_once(foo);
+ *   vpx_once(bar);
+ *
+ * bar() will never be called because the lock is used up
+ * by the call to foo().
+ */
+
+#if CONFIG_MULTITHREAD && defined(_WIN32)
+#include <windows.h>
+#include <stdlib.h>
+/* Declare a per-compilation-unit state variable to track the progress
+ * of calling func() only once. This must be at global scope because
+ * local initializers are not thread-safe in MSVC prior to Visual
+ * Studio 2015.
+ *
+ * As a static, once_state will be zero-initialized at program start.
+ */
+static LONG once_state;
+static void once(void (*func)(void))
+{
+    /* Try to advance once_state from its initial value of 0 to 1.
+     * Only one thread can succeed in doing so.
+     */
+    if (InterlockedCompareExchange(&once_state, 1, 0) == 0) {
+        /* We're the winning thread, having set once_state to 1.
+         * Call our function. */
+        func();
+        /* Now advance once_state to 2, unblocking any other threads. */
+        InterlockedIncrement(&once_state);
+        return;
+    }
+
+    /* We weren't the winning thread, but we want to block on
+     * the state variable so we don't return before func()
+     * has finished executing elsewhere.
+     *
+     * Try to advance once_state from 2 to 2, which is only possible
+     * after the winning thread advances it from 1 to 2.
+     */
+    while (InterlockedCompareExchange(&once_state, 2, 2) != 2) {
+        /* State isn't yet 2. Try again.
+         *
+         * We are used for singleton initialization functions,
+         * which should complete quickly. Contention will likewise
+         * be rare, so it's worthwhile to use a simple but cpu-
+         * intensive busy-wait instead of successive backoff,
+         * waiting on a kernel object, or another heavier-weight scheme.
+         *
+         * We can at least yield our timeslice.
+         */
+        Sleep(0);
+    }
+
+    /* We've seen once_state advance to 2, so we know func()
+     * has been called. And we've left once_state as we found it,
+     * so other threads will have the same experience.
+     *
+     * It's safe to return now.
+     */
+    return;
+}
+
+
+#elif CONFIG_MULTITHREAD && defined(__OS2__)
+#define INCL_DOS
+#include <os2.h>
+static void once(void (*func)(void))
+{
+    static int done;
+
+    /* If the initialization is complete, return early. */
+    if(done)
+        return;
+
+    /* Causes all other threads in the process to block themselves
+     * and give up their time slice.
+     */
+    DosEnterCritSec();
+
+    if (!done)
+    {
+        func();
+        done = 1;
+    }
+
+    /* Restores normal thread dispatching for the current process. */
+    DosExitCritSec();
+}
+
+
+#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
+#include <pthread.h>
+static void once(void (*func)(void))
+{
+    static pthread_once_t lock = PTHREAD_ONCE_INIT;
+    pthread_once(&lock, func);
+}
+
+
+#else
+/* No-op version that performs no synchronization. *_rtcd() is idempotent,
+ * so as long as your platform provides atomic loads/stores of pointers
+ * no synchronization is strictly necessary.
+ */
+
+static void once(void (*func)(void))
+{
+    static int done;
+
+    if(!done)
+    {
+        func();
+        done = 1;
+    }
+}
+#endif
+
+#endif  // VPX_PORTS_VPX_ONCE_H_
diff --git a/libs/libvpx/vpx_ports/vpx_ports.mk b/libs/libvpx/vpx_ports/vpx_ports.mk
new file mode 100644
index 0000000000..36b14936df
--- /dev/null
+++ b/libs/libvpx/vpx_ports/vpx_ports.mk
@@ -0,0 +1,27 @@
+##
+##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+PORTS_SRCS-yes += vpx_ports.mk
+
+PORTS_SRCS-yes += bitops.h
+PORTS_SRCS-yes += mem.h
+PORTS_SRCS-yes += msvc.h
+PORTS_SRCS-yes += system_state.h
+PORTS_SRCS-yes += vpx_timer.h
+
+ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
+PORTS_SRCS-yes += emms.asm
+PORTS_SRCS-yes += x86.h
+PORTS_SRCS-yes += x86_abi_support.asm
+endif
+
+PORTS_SRCS-$(ARCH_ARM) += arm_cpudetect.c
+PORTS_SRCS-$(ARCH_ARM) += arm.h
diff --git a/libs/libvpx/vpx_ports/vpx_timer.h b/libs/libvpx/vpx_ports/vpx_timer.h
new file mode 100644
index 0000000000..dd98e291c2
--- /dev/null
+++ b/libs/libvpx/vpx_ports/vpx_timer.h
@@ -0,0 +1,120 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_PORTS_VPX_TIMER_H_
+#define VPX_PORTS_VPX_TIMER_H_
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#if CONFIG_OS_SUPPORT
+
+#if defined(_WIN32)
+/*
+ * Win32 specific includes
+ */
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#else
+/*
+ * POSIX specific includes
+ */
+#include <sys/time.h>
+
+/* timersub is not provided by msys at this time. */
+#ifndef timersub
+#define timersub(a, b, result) \
+  do { \
+    (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \
+    (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
+    if ((result)->tv_usec < 0) { \
+      --(result)->tv_sec; \
+      (result)->tv_usec += 1000000; \
+    } \
+  } while (0)
+#endif
+#endif
+
+
+struct vpx_usec_timer {
+#if defined(_WIN32)
+  LARGE_INTEGER  begin, end;
+#else
+  struct timeval begin, end;
+#endif
+};
+
+
+static INLINE void
+vpx_usec_timer_start(struct vpx_usec_timer *t) {
+#if defined(_WIN32)
+  QueryPerformanceCounter(&t->begin);
+#else
+  gettimeofday(&t->begin, NULL);
+#endif
+}
+
+
+static INLINE void
+vpx_usec_timer_mark(struct vpx_usec_timer *t) {
+#if defined(_WIN32)
+  QueryPerformanceCounter(&t->end);
+#else
+  gettimeofday(&t->end, NULL);
+#endif
+}
+
+
+/* Returns the microseconds elapsed between the start and mark calls. */
+static INLINE int64_t vpx_usec_timer_elapsed(struct vpx_usec_timer *t) {
+#if defined(_WIN32)
+  LARGE_INTEGER freq, diff;
+  /* Performance-counter ticks between begin and end. */
+  diff.QuadPart = t->end.QuadPart - t->begin.QuadPart;
+  /* Scale ticks to microseconds using the counter frequency. */
+  QueryPerformanceFrequency(&freq);
+  return diff.QuadPart * 1000000 / freq.QuadPart;
+#else
+  struct timeval diff;
+  /* Widen tv_sec first: a 32-bit time_t would overflow past ~2147 s. */
+  timersub(&t->end, &t->begin, &diff);
+  return (int64_t)diff.tv_sec * 1000000 + diff.tv_usec;
+#endif
+}
+
+#else /* CONFIG_OS_SUPPORT = 0*/
+
+/* Empty timer functions if CONFIG_OS_SUPPORT = 0 */
+#ifndef timersub
+#define timersub(a, b, result)
+#endif
+
+struct vpx_usec_timer {
+  void *dummy;
+};
+
+static INLINE void
+vpx_usec_timer_start(struct vpx_usec_timer *t) { }
+
+static INLINE void
+vpx_usec_timer_mark(struct vpx_usec_timer *t) { }
+
+/* Stub used when CONFIG_OS_SUPPORT is 0: always reports 0 elapsed. */
+static INLINE int64_t vpx_usec_timer_elapsed(struct vpx_usec_timer *t) {
+  return 0;
+}
+
+#endif /* CONFIG_OS_SUPPORT */
+
+#endif  // VPX_PORTS_VPX_TIMER_H_
diff --git a/libs/libvpx/vpx_ports/x86.h b/libs/libvpx/vpx_ports/x86.h
new file mode 100644
index 0000000000..5da346e58f
--- /dev/null
+++ b/libs/libvpx/vpx_ports/x86.h
@@ -0,0 +1,310 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_PORTS_X86_H_
+#define VPX_PORTS_X86_H_
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+  VPX_CPU_UNKNOWN = -1,
+  VPX_CPU_AMD,
+  VPX_CPU_AMD_OLD,
+  VPX_CPU_CENTAUR,
+  VPX_CPU_CYRIX,
+  VPX_CPU_INTEL,
+  VPX_CPU_NEXGEN,
+  VPX_CPU_NSC,
+  VPX_CPU_RISE,
+  VPX_CPU_SIS,
+  VPX_CPU_TRANSMETA,
+  VPX_CPU_TRANSMETA_OLD,
+  VPX_CPU_UMC,
+  VPX_CPU_VIA,
+
+  VPX_CPU_LAST
+}  vpx_cpu_t;
+
+#if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__)
+#if ARCH_X86_64
+#define cpuid(func, func2, ax, bx, cx, dx)\
+  __asm__ __volatile__ (\
+                        "cpuid           \n\t" \
+                        : "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) \
+                        : "a" (func), "c" (func2));
+#else
+#define cpuid(func, func2, ax, bx, cx, dx)\
+  __asm__ __volatile__ (\
+                        "mov %%ebx, %%edi   \n\t" \
+                        "cpuid              \n\t" \
+                        "xchg %%edi, %%ebx  \n\t" \
+                        : "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
+                        : "a" (func), "c" (func2));
+#endif
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
+#if ARCH_X86_64
+#define cpuid(func, func2, ax, bx, cx, dx)\
+  asm volatile (\
+                "xchg %rsi, %rbx \n\t" \
+                "cpuid           \n\t" \
+                "movl %ebx, %edi \n\t" \
+                "xchg %rsi, %rbx \n\t" \
+                : "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
+                : "a" (func), "c" (func2));
+#else
+#define cpuid(func, func2, ax, bx, cx, dx)\
+  asm volatile (\
+                "pushl %ebx       \n\t" \
+                "cpuid            \n\t" \
+                "movl %ebx, %edi  \n\t" \
+                "popl %ebx        \n\t" \
+                : "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
+                : "a" (func), "c" (func2));
+#endif
+#else /* end __SUNPRO__ */
+#if ARCH_X86_64
+#if defined(_MSC_VER) && _MSC_VER > 1500
+void __cpuidex(int CPUInfo[4], int info_type, int ecxvalue);
+#pragma intrinsic(__cpuidex)
+#define cpuid(func, func2, a, b, c, d) do {\
+    int regs[4];\
+    __cpuidex(regs, func, func2); \
+    a = regs[0];  b = regs[1];  c = regs[2];  d = regs[3];\
+  } while(0)
+#else
+void __cpuid(int CPUInfo[4], int info_type);
+#pragma intrinsic(__cpuid)
+#define cpuid(func, func2, a, b, c, d) do {\
+    int regs[4];\
+    __cpuid(regs, func); \
+    a = regs[0];  b = regs[1];  c = regs[2];  d = regs[3];\
+  } while (0)
+#endif
+#else
+#define cpuid(func, func2, a, b, c, d)\
+  __asm mov eax, func\
+  __asm mov ecx, func2\
+  __asm cpuid\
+  __asm mov a, eax\
+  __asm mov b, ebx\
+  __asm mov c, ecx\
+  __asm mov d, edx
+#endif
+#endif /* end others */
+
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static INLINE uint64_t xgetbv(void) {
+  const uint32_t ecx = 0;
+  uint32_t eax, edx;
+  // Use the raw opcode for xgetbv for compatibility with older toolchains.
+  __asm__ volatile (
+    ".byte 0x0f, 0x01, 0xd0\n"
+    : "=a"(eax), "=d"(edx) : "c" (ecx));
+  return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static INLINE uint64_t xgetbv(void) {
+  uint32_t eax_, edx_;
+  __asm {
+    xor ecx, ecx  // ecx = 0
+    // Use the raw opcode for xgetbv for compatibility with older toolchains.
+    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+    mov eax_, eax
+    mov edx_, edx
+  }
+  return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1700
+#include <windows.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
+#define getenv(x) NULL
+#endif
+#endif
+
+#define HAS_MMX     0x01
+#define HAS_SSE     0x02
+#define HAS_SSE2    0x04
+#define HAS_SSE3    0x08
+#define HAS_SSSE3   0x10
+#define HAS_SSE4_1  0x20
+#define HAS_AVX     0x40
+#define HAS_AVX2    0x80
+#ifndef BIT  /* keep any definition supplied by an earlier header */
+#define BIT(n) (1 << (n))  /* n parenthesized so expression arguments bind as a unit */
+#endif
+
+static INLINE int
+x86_simd_caps(void) {
+  unsigned int flags = 0;
+  unsigned int mask = ~0;
+  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
+  char *env;
+  (void)reg_ebx;
+
+  /* See if the CPU capabilities are being overridden by the environment */
+  env = getenv("VPX_SIMD_CAPS");
+
+  if (env && *env)
+    return (int)strtol(env, NULL, 0);
+
+  env = getenv("VPX_SIMD_CAPS_MASK");
+
+  if (env && *env)
+    mask = strtol(env, NULL, 0);
+
+  /* Ensure that the CPUID instruction supports extended features */
+  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
+
+  if (max_cpuid_val < 1)
+    return 0;
+
+  /* Get the standard feature flags */
+  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+
+  if (reg_edx & BIT(23)) flags |= HAS_MMX;
+
+  if (reg_edx & BIT(25)) flags |= HAS_SSE; /* aka xmm */
+
+  if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */
+
+  if (reg_ecx & BIT(0)) flags |= HAS_SSE3;
+
+  if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
+
+  if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
+
+  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+  if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+    if ((xgetbv() & 0x6) == 0x6) {
+      flags |= HAS_AVX;
+
+      if (max_cpuid_val >= 7) {
+        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
+        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+
+        if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+      }
+    }
+  }
+
+  return flags & mask;
+}
+
+#if ARCH_X86_64 && defined(_MSC_VER)
+unsigned __int64 __rdtsc(void);
+#pragma intrinsic(__rdtsc)
+#endif
+/* Returns the low 32 bits of the CPU time-stamp counter. */
+static INLINE unsigned int x86_readtsc(void) {
+#if defined(__GNUC__) && __GNUC__
+  unsigned int tsc;
+  __asm__ __volatile__("rdtsc\n\t":"=a"(tsc)::"edx");  /* rdtsc also writes edx */
+  return tsc;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+  unsigned int tsc;
+  asm volatile("rdtsc\n\t":"=a"(tsc)::"edx");  /* rdtsc also writes edx */
+  return tsc;
+#else
+#if ARCH_X86_64
+  return (unsigned int)__rdtsc();
+#else
+  __asm  rdtsc;
+#endif
+#endif
+}
+
+
+#if defined(__GNUC__) && __GNUC__
+#define x86_pause_hint()\
+  __asm__ __volatile__ ("pause \n\t")
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+#define x86_pause_hint()\
+  asm volatile ("pause \n\t")
+#else
+#if ARCH_X86_64
+#define x86_pause_hint()\
+  _mm_pause();
+#else
+#define x86_pause_hint()\
+  __asm pause
+#endif
+#endif
+
+#if defined(__GNUC__) && __GNUC__
+static void
+x87_set_control_word(unsigned short mode) {
+  __asm__ __volatile__("fldcw %0" : : "m"(*&mode));
+}
+static unsigned short
+x87_get_control_word(void) {
+  unsigned short mode;
+  __asm__ __volatile__("fstcw %0\n\t":"=m"(*&mode):);
+    return mode;
+}
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+static void
+x87_set_control_word(unsigned short mode) {
+  asm volatile("fldcw %0" : : "m"(*&mode));
+}
+static unsigned short
+x87_get_control_word(void) {
+  unsigned short mode;
+  asm volatile("fstcw %0\n\t":"=m"(*&mode):);
+  return mode;
+}
+#elif ARCH_X86_64
+/* No fldcw intrinsics on Windows x64, punt to external asm */
+extern void           vpx_winx64_fldcw(unsigned short mode);
+extern unsigned short vpx_winx64_fstcw(void);
+#define x87_set_control_word vpx_winx64_fldcw
+#define x87_get_control_word vpx_winx64_fstcw
+#else
+static void
+x87_set_control_word(unsigned short mode) {
+  __asm { fldcw mode }
+}
+static unsigned short
+x87_get_control_word(void) {
+  unsigned short mode;
+  __asm { fstcw mode }
+  return mode;
+}
+#endif
+
+static INLINE unsigned int
+x87_set_double_precision(void) {
+  unsigned int mode = x87_get_control_word();
+  x87_set_control_word((mode&~0x300) | 0x200);
+  return mode;
+}
+
+
+extern void vpx_reset_mmx_state(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_PORTS_X86_H_
diff --git a/libs/libvpx/vpx_ports/x86_abi_support.asm b/libs/libvpx/vpx_ports/x86_abi_support.asm
new file mode 100644
index 0000000000..708fa101c5
--- /dev/null
+++ b/libs/libvpx/vpx_ports/x86_abi_support.asm
@@ -0,0 +1,404 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_config.asm"
+
+; 32/64 bit compatibility macros
+;
+; In general, we make the source use 64 bit syntax, then twiddle with it using
+; the preprocessor to get the 32 bit syntax on 32 bit platforms.
+;
+%ifidn __OUTPUT_FORMAT__,elf32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,macho32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,win32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,aout
+%define ABI_IS_32BIT 1
+%else
+%define ABI_IS_32BIT 0
+%endif
+
+%if ABI_IS_32BIT
+%define rax eax
+%define rbx ebx
+%define rcx ecx
+%define rdx edx
+%define rsi esi
+%define rdi edi
+%define rsp esp
+%define rbp ebp
+%define movsxd mov
+%macro movq 2
+  %ifidn %1,eax
+    movd %1,%2
+  %elifidn %2,eax
+    movd %1,%2
+  %elifidn %1,ebx
+    movd %1,%2
+  %elifidn %2,ebx
+    movd %1,%2
+  %elifidn %1,ecx
+    movd %1,%2
+  %elifidn %2,ecx
+    movd %1,%2
+  %elifidn %1,edx
+    movd %1,%2
+  %elifidn %2,edx
+    movd %1,%2
+  %elifidn %1,esi
+    movd %1,%2
+  %elifidn %2,esi
+    movd %1,%2
+  %elifidn %1,edi
+    movd %1,%2
+  %elifidn %2,edi
+    movd %1,%2
+  %elifidn %1,esp
+    movd %1,%2
+  %elifidn %2,esp
+    movd %1,%2
+  %elifidn %1,ebp
+    movd %1,%2
+  %elifidn %2,ebp
+    movd %1,%2
+  %else
+    movq %1,%2
+  %endif
+%endmacro
+%endif
+
+
+; LIBVPX_YASM_WIN64
+; Set LIBVPX_YASM_WIN64 if output is Windows 64bit so the code will work if x64
+; or win64 is defined on the Yasm command line.
+%ifidn __OUTPUT_FORMAT__,win64
+%define LIBVPX_YASM_WIN64 1
+%elifidn __OUTPUT_FORMAT__,x64
+%define LIBVPX_YASM_WIN64 1
+%else
+%define LIBVPX_YASM_WIN64 0
+%endif
+
+; sym()
+; Return the proper symbol name for the target ABI.
+;
+; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols
+; with C linkage be prefixed with an underscore.
+;
+%ifidn   __OUTPUT_FORMAT__,elf32
+%define sym(x) x
+%elifidn __OUTPUT_FORMAT__,elf64
+%define sym(x) x
+%elifidn __OUTPUT_FORMAT__,elfx32
+%define sym(x) x
+%elif LIBVPX_YASM_WIN64
+%define sym(x) x
+%else
+%define sym(x) _ %+ x
+%endif
+
+;  PRIVATE
+;  Macro for the attribute to hide a global symbol for the target ABI.
+;  This is only active if CHROMIUM is defined.
+;
+;  Chromium doesn't like exported global symbols due to symbol clashing with
+;  plugins among other things.
+;
+;  Requires Chromium's patched copy of yasm:
+;    http://src.chromium.org/viewvc/chrome?view=rev&revision=73761
+;    http://www.tortall.net/projects/yasm/ticket/236
+;
+%ifdef CHROMIUM
+  %ifidn   __OUTPUT_FORMAT__,elf32
+    %define PRIVATE :hidden
+  %elifidn __OUTPUT_FORMAT__,elf64
+    %define PRIVATE :hidden
+  %elifidn __OUTPUT_FORMAT__,elfx32
+    %define PRIVATE :hidden
+  %elif LIBVPX_YASM_WIN64
+    %define PRIVATE
+  %else
+    %define PRIVATE :private_extern
+  %endif
+%else
+  %define PRIVATE
+%endif
+
+; arg()
+; Return the address specification of the given argument
+;
+%if ABI_IS_32BIT
+  %define arg(x) [ebp+8+4*x]
+%else
+  ; 64 bit ABI passes arguments in registers. This is a workaround to get up
+  ; and running quickly. Relies on SHADOW_ARGS_TO_STACK
+  %if LIBVPX_YASM_WIN64
+    %define arg(x) [rbp+16+8*x]
+  %else
+    %define arg(x) [rbp-8-8*x]
+  %endif
+%endif
+
+; REG_SZ_BYTES, REG_SZ_BITS
+; Size of a register
+%if ABI_IS_32BIT
+%define REG_SZ_BYTES 4
+%define REG_SZ_BITS  32
+%else
+%define REG_SZ_BYTES 8
+%define REG_SZ_BITS  64
+%endif
+
+
+; ALIGN_STACK <alignment> <register>
+; This macro aligns the stack to the given alignment (in bytes). The stack
+; is left such that the previous value of the stack pointer is the first
+; argument on the stack (ie, the inverse of this macro is 'pop rsp.')
+; This macro uses one temporary register, which is not preserved, and thus
+; must be specified as an argument.
+%macro ALIGN_STACK 2
+    mov         %2, rsp
+    and         rsp, -%1
+    lea         rsp, [rsp - (%1 - REG_SZ_BYTES)]
+    push        %2
+%endmacro
+
+
+;
+; The Microsoft assembler tries to impose a certain amount of type safety in
+; its register usage. YASM doesn't recognize these directives, so we just
+; %define them away to maintain as much compatibility as possible with the
+; original inline assembler we're porting from.
+;
+%idefine PTR
+%idefine XMMWORD
+%idefine MMWORD
+
+; PIC macros
+;
+%if ABI_IS_32BIT
+  %if CONFIG_PIC=1
+  %ifidn __OUTPUT_FORMAT__,elf32
+    %define WRT_PLT wrt ..plt
+    %macro GET_GOT 1
+      extern _GLOBAL_OFFSET_TABLE_
+      push %1
+      call %%get_got
+      %%sub_offset:
+      jmp %%exitGG
+      %%get_got:
+      mov %1, [esp]
+      add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
+      ret
+      %%exitGG:
+      %undef GLOBAL
+      %define GLOBAL(x) x + %1 wrt ..gotoff
+      %undef RESTORE_GOT
+      %define RESTORE_GOT pop %1
+    %endmacro
+  %elifidn __OUTPUT_FORMAT__,macho32
+    %macro GET_GOT 1
+      push %1
+      call %%get_got
+      %%get_got:
+      pop  %1
+      %undef GLOBAL
+      %define GLOBAL(x) x + %1 - %%get_got
+      %undef RESTORE_GOT
+      %define RESTORE_GOT pop %1
+    %endmacro
+  %endif
+  %endif
+
+  %ifdef CHROMIUM
+    %ifidn __OUTPUT_FORMAT__,macho32
+      %define HIDDEN_DATA(x) x:private_extern
+    %else
+      %define HIDDEN_DATA(x) x
+    %endif
+  %else
+    %define HIDDEN_DATA(x) x
+  %endif
+%else
+  %macro GET_GOT 1
+  %endmacro
+  %define GLOBAL(x) rel x
+  %ifidn __OUTPUT_FORMAT__,elf64
+    %define WRT_PLT wrt ..plt
+    %define HIDDEN_DATA(x) x:data hidden
+  %elifidn __OUTPUT_FORMAT__,elfx32
+    %define WRT_PLT wrt ..plt
+    %define HIDDEN_DATA(x) x:data hidden
+  %elifidn __OUTPUT_FORMAT__,macho64
+    %ifdef CHROMIUM
+      %define HIDDEN_DATA(x) x:private_extern
+    %else
+      %define HIDDEN_DATA(x) x
+    %endif
+  %else
+    %define HIDDEN_DATA(x) x
+  %endif
+%endif
+%ifnmacro GET_GOT
+    %macro GET_GOT 1
+    %endmacro
+    %define GLOBAL(x) x
+%endif
+%ifndef RESTORE_GOT
+%define RESTORE_GOT
+%endif
+%ifndef WRT_PLT
+%define WRT_PLT
+%endif
+
+%if ABI_IS_32BIT
+  %macro SHADOW_ARGS_TO_STACK 1
+  %endm
+  %define UNSHADOW_ARGS
+%else
+%if LIBVPX_YASM_WIN64
+  %macro SHADOW_ARGS_TO_STACK 1 ; argc
+    %if %1 > 0
+        mov arg(0),rcx
+    %endif
+    %if %1 > 1
+        mov arg(1),rdx
+    %endif
+    %if %1 > 2
+        mov arg(2),r8
+    %endif
+    %if %1 > 3
+        mov arg(3),r9
+    %endif
+  %endm
+%else
+  %macro SHADOW_ARGS_TO_STACK 1 ; argc
+    %if %1 > 0
+        push rdi
+    %endif
+    %if %1 > 1
+        push rsi
+    %endif
+    %if %1 > 2
+        push rdx
+    %endif
+    %if %1 > 3
+        push rcx
+    %endif
+    %if %1 > 4
+        push r8
+    %endif
+    %if %1 > 5
+        push r9
+    %endif
+    %if %1 > 6
+      %assign i %1-6
+      %assign off 16
+      %rep i
+        mov rax,[rbp+off]
+        push rax
+        %assign off off+8
+      %endrep
+    %endif
+  %endm
+%endif
+  %define UNSHADOW_ARGS mov rsp, rbp
+%endif
+
+; Win64 ABI requires that XMM6:XMM15 are callee saved
+; SAVE_XMM n, [u]
+; store registers 6-n on the stack
+; if u is specified, use unaligned movs.
+; Win64 ABI requires 16 byte stack alignment, but then pushes an 8 byte return
+; value. Typically we follow this up with 'push rbp' - re-aligning the stack -
+; but in some cases this is not done and unaligned movs must be used.
+%if LIBVPX_YASM_WIN64
+%macro SAVE_XMM 1-2 a
+  %if %1 < 6
+    %error Only xmm registers 6-15 must be preserved
+  %else
+    %assign last_xmm %1
+    %define movxmm movdq %+ %2
+    %assign xmm_stack_space ((last_xmm - 5) * 16)
+    sub rsp, xmm_stack_space
+    %assign i 6
+    %rep (last_xmm - 5)
+      movxmm [rsp + ((i - 6) * 16)], xmm %+ i
+      %assign i i+1
+    %endrep
+  %endif
+%endmacro
+%macro RESTORE_XMM 0
+  %ifndef last_xmm
+    %error RESTORE_XMM must be paired with SAVE_XMM n
+  %else
+    %assign i last_xmm
+    %rep (last_xmm - 5)
+      movxmm xmm %+ i, [rsp +((i - 6) * 16)]
+      %assign i i-1
+    %endrep
+    add rsp, xmm_stack_space
+    ; there are a couple functions which return from multiple places.
+    ; otherwise, we could uncomment these:
+    ; %undef last_xmm
+    ; %undef xmm_stack_space
+    ; %undef movxmm
+  %endif
+%endmacro
+%else
+%macro SAVE_XMM 1-2
+%endmacro
+%macro RESTORE_XMM 0
+%endmacro
+%endif
+
+; Name of the rodata section
+;
+; .rodata seems to be an elf-ism, as it doesn't work on OSX.
+;
+%ifidn __OUTPUT_FORMAT__,macho64
+%define SECTION_RODATA section .text
+%elifidn __OUTPUT_FORMAT__,macho32
+%macro SECTION_RODATA 0
+section .text
+%endmacro
+%elifidn __OUTPUT_FORMAT__,aout
+%define SECTION_RODATA section .data
+%else
+%define SECTION_RODATA section .rodata
+%endif
+
+
+; Tell GNU ld that we don't require an executable stack.
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%elifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%elifidn __OUTPUT_FORMAT__,elfx32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
+
+; On Android platforms use lrand48 when building postproc routines. Prior to L
+; rand() was not available.
+%if CONFIG_POSTPROC=1 || CONFIG_VP9_POSTPROC=1
+%ifdef __ANDROID__
+extern sym(lrand48)
+%define LIBVPX_RAND lrand48
+%else
+extern sym(rand)
+%define LIBVPX_RAND rand
+%endif
+%endif ; CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/libs/libvpx/vpx_scale/generic/gen_scalers.c b/libs/libvpx/vpx_scale/generic/gen_scalers.c
new file mode 100644
index 0000000000..dab324edfc
--- /dev/null
+++ b/libs/libvpx/vpx_scale/generic/gen_scalers.c
@@ -0,0 +1,240 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_scale_rtcd.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_mem/vpx_mem.h"
+/****************************************************************************
+*  Imports
+****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_horizontal_line_5_4_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Number of source pixels,
+ *                                                 consumed 5 at a time.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Width of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling down by 5 to 4.
+ *
+ ****************************************************************************/
+void vp8_horizontal_line_5_4_scale_c(const unsigned char *source,
+                                     unsigned int source_width,
+                                     unsigned char *dest,
+                                     unsigned int dest_width) {
+  const unsigned char *in = source;
+  unsigned char *out = dest;
+  unsigned int consumed = 0;
+
+  (void) dest_width;
+
+  while (consumed < source_width) {
+    const unsigned int p0 = in[0];
+    const unsigned int p1 = in[1];
+    const unsigned int p2 = in[2];
+    const unsigned int p3 = in[3];
+    const unsigned int p4 = in[4];
+    /* Weights are in 1/256 units; adding 128 rounds to nearest. */
+    out[0] = (unsigned char) p0;
+    out[1] = (unsigned char)((p1 * 192 + p2 * 64 + 128) >> 8);
+    out[2] = (unsigned char)((p2 * 128 + p3 * 128 + 128) >> 8);
+    out[3] = (unsigned char)((p3 * 64 + p4 * 192 + 128) >> 8);
+
+    in += 5;
+    out += 4;
+    consumed += 5;
+  }
+}
+
+
+
+
+/*
+ * Scales a band of 5 source rows down to 4 destination rows, one column at a
+ * time, for dest_width columns. Consecutive rows are src_pitch bytes apart
+ * in source and dest_pitch bytes apart in dest.
+ */
+void vp8_vertical_band_5_4_scale_c(unsigned char *source,
+                                   unsigned int src_pitch,
+                                   unsigned char *dest,
+                                   unsigned int dest_pitch,
+                                   unsigned int dest_width) {
+  unsigned int col;
+
+  for (col = 0; col < dest_width; col++) {
+    const unsigned char *in = source + col;
+    unsigned char *out = dest + col;
+    const unsigned int r0 = in[0 * src_pitch];
+    const unsigned int r1 = in[1 * src_pitch];
+    const unsigned int r2 = in[2 * src_pitch];
+    const unsigned int r3 = in[3 * src_pitch];
+    const unsigned int r4 = in[4 * src_pitch];
+
+    /* Weights are in 1/256 units; adding 128 rounds to nearest. */
+    out[0 * dest_pitch] = (unsigned char) r0;
+    out[1 * dest_pitch] = (unsigned char)((r1 * 192 + r2 * 64 + 128) >> 8);
+    out[2 * dest_pitch] = (unsigned char)((r2 * 128 + r3 * 128 + 128) >> 8);
+    out[3 * dest_pitch] = (unsigned char)((r3 * 64 + r4 * 192 + 128) >> 8);
+  }
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_horizontal_line_5_3_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Number of source pixels,
+ *                                                 consumed 5 at a time.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Width of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling down by 5 to 3.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp8_horizontal_line_5_3_scale_c(const unsigned char *source,
+                                     unsigned int source_width,
+                                     unsigned char *dest,
+                                     unsigned int dest_width) {
+  const unsigned char *in = source;
+  unsigned char *out = dest;
+  unsigned int consumed = 0;
+
+  (void) dest_width;
+
+  while (consumed < source_width) {
+    const unsigned int p0 = in[0];
+    const unsigned int p1 = in[1];
+    const unsigned int p2 = in[2];
+    const unsigned int p3 = in[3];
+    const unsigned int p4 = in[4];
+    /* Weights 85 and 171 are roughly 1/3 and 2/3 in 1/256 units; adding 128
+     * rounds to nearest. */
+    out[0] = (unsigned char) p0;
+    out[1] = (unsigned char)((p1 * 85  + p2 * 171 + 128) >> 8);
+    out[2] = (unsigned char)((p3 * 171 + p4 * 85 + 128) >> 8);
+
+    in += 5;
+    out += 3;
+    consumed += 5;
+  }
+}
+
+/*
+ * Scales a band of 5 source rows down to 3 destination rows, one column at a
+ * time, for dest_width columns. Consecutive rows are src_pitch bytes apart
+ * in source and dest_pitch bytes apart in dest.
+ */
+void vp8_vertical_band_5_3_scale_c(unsigned char *source,
+                                   unsigned int src_pitch,
+                                   unsigned char *dest,
+                                   unsigned int dest_pitch,
+                                   unsigned int dest_width) {
+  unsigned int col;
+
+  for (col = 0; col < dest_width; col++) {
+    const unsigned char *in = source + col;
+    unsigned char *out = dest + col;
+    const unsigned int r0 = in[0 * src_pitch];
+    const unsigned int r1 = in[1 * src_pitch];
+    const unsigned int r2 = in[2 * src_pitch];
+    const unsigned int r3 = in[3 * src_pitch];
+    const unsigned int r4 = in[4 * src_pitch];
+
+    /* Weights are in 1/256 units; adding 128 rounds to nearest. */
+    out[0 * dest_pitch] = (unsigned char) r0;
+    out[1 * dest_pitch] = (unsigned char)((r1 * 85 + r2 * 171 + 128) >> 8);
+    out[2 * dest_pitch] = (unsigned char)((r3 * 171 + r4 * 85 + 128) >> 8);
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vp8_horizontal_line_2_1_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width    : Number of source pixels.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width      : Width of destination (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination scaling down by 2 to 1.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp8_horizontal_line_2_1_scale_c(const unsigned char *source,
+                                     unsigned int source_width,
+                                     unsigned char *dest,
+                                     unsigned int dest_width) {
+  unsigned int in_pos;
+  unsigned char *out = dest;
+
+  (void) dest_width;
+
+  /*
+   * Point sampling: every even-indexed source pixel is copied and every
+   * odd-indexed one is dropped.
+   */
+  for (in_pos = 0; in_pos < source_width; in_pos += 2) {
+    *out = source[in_pos];
+    out++;
+  }
+}
+
+/* 2-to-1 vertical point sampling: the output row is a plain copy of the
+ * first dest_width bytes found at source; both pitches are unused. */
+void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch,
+                                   unsigned char *dest, unsigned int dest_pitch,
+                                   unsigned int dest_width) {
+  (void) src_pitch;
+  (void) dest_pitch;
+  memcpy(dest, source, dest_width);
+}
+
+/* 2-to-1 vertical scaling with a 3:10:3 filter over three source rows. */
+void vp8_vertical_band_2_1_scale_i_c(unsigned char *source,
+                                     unsigned int src_pitch,
+                                     unsigned char *dest,
+                                     unsigned int dest_pitch,
+                                     unsigned int dest_width) {
+  const int width = dest_width;
+  int col;
+  (void) dest_pitch;
+
+  for (col = 0; col < width; col++) {
+    /* Rows above, at and below source; the weights sum to 16, so adding 8
+     * and shifting right by 4 rounds to nearest. */
+    const int sum = 8
+                    + source[col - (int)src_pitch] * 3
+                    + source[col] * 10
+                    + source[col + src_pitch] * 3;
+    dest[col] = (unsigned char)(sum >> 4);
+  }
+}
diff --git a/libs/libvpx/vpx_scale/generic/vpx_scale.c b/libs/libvpx/vpx_scale/generic/vpx_scale.c
new file mode 100644
index 0000000000..15e4ba87e7
--- /dev/null
+++ b/libs/libvpx/vpx_scale/generic/vpx_scale.c
@@ -0,0 +1,531 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ *   Module Title :     vpx_scale.c
+ *
+ *   Description  :     Image scaling functions.
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "./vpx_scale_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_scale/yv12config.h"
+
+typedef struct {  /* Bundle of frame-scaling parameters. */
+  int     expanded_frame_width;
+  int     expanded_frame_height;
+  /* Horizontal and vertical scale/ratio pairs -- presumably the same quantities vpx_scale_frame takes; TODO confirm. */
+  int HScale;
+  int HRatio;
+  int VScale;
+  int VRatio;
+  /* Source and destination frame descriptors. */
+  YV12_BUFFER_CONFIG *src_yuv_config;
+  YV12_BUFFER_CONFIG *dst_yuv_config;
+  /* NOTE(review): no function in this file references SCALE_VARS. */
+} SCALE_VARS;
+
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_2t1_i
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step              : Number of pixels to step on in source.
+ *                  unsigned int source_scale    : Scale for source (UNUSED).
+ *                  unsigned int source_length   : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step                : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale      : Scale for destination (UNUSED).
+ *                  unsigned int dest_length     : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-to-1 interpolated scaling.
+ *
+ *  SPECIAL NOTES : Output 0 is a plain copy of source[0]; every later output
+ *                  is a rounded 3:10:3 blend of three neighbouring samples.
+ ****************************************************************************/
+static
+void scale1d_2t1_i
+(
+  const unsigned char *source,
+  int source_step,
+  unsigned int source_scale,
+  unsigned int source_length,
+  unsigned char *dest,
+  int dest_step,
+  unsigned int dest_scale,
+  unsigned int dest_length
+) {
+  unsigned int i, j;
+  unsigned int temp;
+  int source_pitch = source_step;
+  (void) source_length;
+  (void) source_scale;
+  (void) dest_scale;
+  /* Keep the single-sample pitch for the filter taps; advance two samples per output. */
+  source_step *= 2;
+  dest[0] = source[0];
+  /* The weights sum to 16, so adding 8 and shifting right by 4 rounds to nearest. */
+  for (i = dest_step, j = source_step; i < dest_length * dest_step; i += dest_step, j += source_step) {
+    temp = 8;
+    temp += 3 * source[j - source_pitch];
+    temp += 10 * source[j];
+    temp += 3 * source[j + source_pitch];
+    temp >>= 4;
+    dest[i] = (unsigned char)(temp);  /* narrow straight to the pixel type, not via plain char */
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_2t1_ps
+ *
+ *  INPUTS        : const unsigned char *source : Data to be scaled.
+ *                  int source_step              : Distance between source samples.
+ *                  unsigned int source_scale    : UNUSED.
+ *                  unsigned int source_length   : UNUSED.
+ *                  unsigned char *dest         : Output data array.
+ *                  int dest_step                : Distance between output samples.
+ *                  unsigned int dest_scale      : UNUSED.
+ *                  unsigned int dest_length     : Number of output samples.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-to-1 point subsampled scaling: every second
+ *                  source sample is copied to the output unchanged.
+ *
+ *  SPECIAL NOTES : None.
+ ****************************************************************************/
+static
+void scale1d_2t1_ps
+(
+  const unsigned char *source,
+  int source_step,
+  unsigned int source_scale,
+  unsigned int source_length,
+  unsigned char *dest,
+  int dest_step,
+  unsigned int dest_scale,
+  unsigned int dest_length
+) {
+  unsigned int out_pos = 0, in_pos = 0;
+  (void) source_scale;
+  (void) source_length;
+  (void) dest_scale;
+
+  source_step *= 2;  /* take every second source sample */
+  while (out_pos < dest_length * dest_step) {
+    dest[out_pos] = source[in_pos];
+    out_pos += dest_step;
+    in_pos += source_step;
+  }
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step              : Number of pixels to step on in source.
+ *                  unsigned int source_scale    : Scale for source.
+ *                  unsigned int source_length   : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step                : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale      : Scale for destination.
+ *                  unsigned int dest_length     : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs linear interpolation in one dimension.
+ *
+ *  SPECIAL NOTES : The sample one source_step past the current one is always
+ *                  read, so it must be readable for every position reached.
+ ****************************************************************************/
+static
+void scale1d_c
+(
+  const unsigned char *source,
+  int source_step,
+  unsigned int source_scale,
+  unsigned int source_length,
+  unsigned char *dest,
+  int dest_step,
+  unsigned int dest_scale,
+  unsigned int dest_length
+) {
+  unsigned int i;
+  unsigned int round_value = dest_scale / 2;
+  unsigned int left_modifier = dest_scale;
+  unsigned int right_modifier = 0;
+  unsigned char left_pixel = *source;
+  unsigned char right_pixel = *(source + source_step);
+
+  (void) source_length;
+
+  /* These asserts are needed if there are boundary issues... */
+  /*assert ( dest_scale > source_scale );*/
+  /*assert ( (source_length-1) * dest_scale >= (dest_length-1) * source_scale );*/
+
+  for (i = 0; i < dest_length * dest_step; i += dest_step) {
+    dest[i] = (unsigned char)((left_modifier * left_pixel + right_modifier * right_pixel + round_value) / dest_scale);
+    /* Move the sampling position on by one output step. */
+    right_modifier += source_scale;
+    /* Past the right-hand sample: slide the pixel pair along the source. */
+    while (right_modifier > dest_scale) {
+      right_modifier -= dest_scale;
+      source += source_step;
+      left_pixel = *source;
+      right_pixel = *(source + source_step);
+    }
+    /* The two weights always sum to dest_scale. */
+    left_modifier = dest_scale - right_modifier;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : Scale2D
+ *
+ *  INPUTS        : const unsigned char *source  : Pointer to data to be scaled.
+ *                  int source_pitch              : Stride of source image.
+ *                  unsigned int source_width     : Width of input image.
+ *                  unsigned int source_height    : Height of input image.
+ *                  unsigned char *dest          : Pointer to output data array.
+ *                  int dest_pitch                : Stride of destination image.
+ *                  unsigned int dest_width       : Width of destination image.
+ *                  unsigned int dest_height      : Height of destination image.
+ *                  unsigned char *temp_area      : Pointer to temp work area.
+ *                  unsigned char temp_area_height : Height of temp work area.
+ *                  unsigned int hscale          : Horizontal divisor: dest is source * hratio / hscale.
+ *                  unsigned int hratio          : Horizontal multiplier.
+ *                  unsigned int vscale          : Vertical divisor: dest is source * vratio / vscale.
+ *                  unsigned int vratio          : Vertical multiplier.
+ *                  unsigned int interlaced      : Non-zero selects point sampling for 2-to-1 vertical scaling.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales one image plane. The fixed 5-to-4, 5-to-3 and 2-to-1
+ *                  scalers are used when both directions match one of those ratios;
+ *                  otherwise 1-D scaling is applied to rows, then to columns.
+ *  SPECIAL NOTES : Scaling is performed one band at a time to help with
+ *                  caching; temp_area rows are dest_pitch bytes apart.
+ ****************************************************************************/
+static
+void Scale2D
+(
+  /*const*/
+  unsigned char *source,
+  int source_pitch,
+  unsigned int source_width,
+  unsigned int source_height,
+  unsigned char *dest,
+  int dest_pitch,
+  unsigned int dest_width,
+  unsigned int dest_height,
+  unsigned char *temp_area,
+  unsigned char temp_area_height,
+  unsigned int hscale,
+  unsigned int hratio,
+  unsigned int vscale,
+  unsigned int vratio,
+  unsigned int interlaced
+) {
+  /*unsigned*/
+  int i, j, k;
+  int bands;
+  int dest_band_height;
+  int source_band_height;
+
+  typedef void (*Scale1D)(const unsigned char * source, int source_step, unsigned int source_scale, unsigned int source_length,
+                          unsigned char * dest, int dest_step, unsigned int dest_scale, unsigned int dest_length);
+  /* General-purpose 1-D scalers; replaced below for exact 2-to-1 ratios. */
+  Scale1D Scale1Dv = scale1d_c;
+  Scale1D Scale1Dh = scale1d_c;
+  /* Fixed-ratio scalers; stay NULL unless the ratio switches below pick one. */
+  void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, unsigned int) = NULL;
+  void (*vert_band_scale)(unsigned char *, unsigned int, unsigned char *, unsigned int, unsigned int) = NULL;
+
+  int ratio_scalable = 1;
+  int interpolation = 0;
+
+  unsigned char *source_base; /* = (unsigned char *) ((source_pitch >= 0) ? source : (source + ((source_height-1) * source_pitch))); */
+  unsigned char *line_src;
+
+
+  source_base = (unsigned char *)source;
+  /* With a negative pitch, source_base is moved to the row (source_height - 1) pitches away. */
+  if (source_pitch < 0) {
+    int offset;
+
+    offset = (source_height - 1);
+    offset *= source_pitch;
+
+    source_base += offset;
+  }
+
+  /* find out the ratio for each direction */
+  switch (hratio * 10 / hscale) {
+    case 8:
+      /* 4-5 Scale in Width direction */
+      horiz_line_scale = vp8_horizontal_line_5_4_scale;
+      break;
+    case 6:
+      /* 3-5 Scale in Width direction */
+      horiz_line_scale = vp8_horizontal_line_5_3_scale;
+      break;
+    case 5:
+      /* 1-2 Scale in Width direction */
+      horiz_line_scale = vp8_horizontal_line_2_1_scale;
+      break;
+    default:
+      /* The ratio is not acceptable now */
+      /* throw("The ratio is not acceptable for now!"); */
+      ratio_scalable = 0;
+      break;
+  }
+
+  switch (vratio * 10 / vscale) {
+    case 8:
+      /* 4-5 Scale in vertical direction */
+      vert_band_scale     = vp8_vertical_band_5_4_scale;
+      source_band_height  = 5;
+      dest_band_height    = 4;
+      break;
+    case 6:
+      /* 3-5 Scale in vertical direction */
+      vert_band_scale     = vp8_vertical_band_5_3_scale;
+      source_band_height  = 5;
+      dest_band_height    = 3;
+      break;
+    case 5:
+      /* 1-2 Scale in vertical direction */
+
+      if (interlaced) {
+        /* if the content is interlaced, point sampling is used */
+        vert_band_scale     = vp8_vertical_band_2_1_scale;
+      } else {
+
+        interpolation = 1;
+        /* if the content is progressive, interpolate */
+        vert_band_scale     = vp8_vertical_band_2_1_scale_i;
+
+      }
+
+      source_band_height  = 2;
+      dest_band_height    = 1;
+      break;
+    default:
+      /* The ratio is not acceptable now */
+      /* throw("The ratio is not acceptable for now!"); */
+      ratio_scalable = 0;
+      break;
+  }
+  /* Fast path: both directions matched one of the fixed ratios above. */
+  if (ratio_scalable) {
+    if (source_height == dest_height) {
+      /* for each band of the image */
+      for (k = 0; k < (int)dest_height; k++) {
+        horiz_line_scale(source, source_width, dest, dest_width);
+        source += source_pitch;
+        dest   += dest_pitch;
+      }
+
+      return;
+    }
+    /* Interpolating case: fill temp_area row 0, the row just before the band passed to vert_band_scale. */
+    if (interpolation) {
+      if (source < source_base)
+        source = source_base;
+
+      horiz_line_scale(source, source_width, temp_area, dest_width);
+    }
+
+    for (k = 0; k < (int)(dest_height + dest_band_height - 1) / dest_band_height; k++) {
+      /* scale one band horizontally */
+      for (i = 0; i < source_band_height; i++) {
+        /* Trap case where we could read off the base of the source buffer */
+
+        line_src = (unsigned char *)source + i * source_pitch;
+
+        if (line_src < source_base)
+          line_src = source_base;
+
+        horiz_line_scale(line_src, source_width,
+                         temp_area + (i + 1)*dest_pitch, dest_width);
+      }
+
+      /* Vertical scaling is in place */
+      vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch, dest_width);
+      /* Carry the band's last scaled row into temp_area row 0 for the next band. */
+      if (interpolation)
+        memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_width);
+
+      /* Next band... */
+      source += (unsigned long) source_band_height  * source_pitch;
+      dest   += (unsigned long) dest_band_height * dest_pitch;
+    }
+
+    return;
+  }
+  /* General path: arbitrary ratios go through the 1-D scalers. */
+  if (hscale == 2 && hratio == 1)
+    Scale1Dh = scale1d_2t1_ps;
+
+  if (vscale == 2 && vratio == 1) {
+    if (interlaced)
+      Scale1Dv = scale1d_2t1_ps;
+    else
+      Scale1Dv = scale1d_2t1_i;
+  }
+  /* Equal heights: only horizontal scaling is performed, row by row. */
+  if (source_height == dest_height) {
+    /* for each band of the image */
+    for (k = 0; k < (int)dest_height; k++) {
+      Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, dest_width);
+      source += source_pitch;
+      dest   += dest_pitch;
+    }
+
+    return;
+  }
+  /* Size the bands from temp_area_height - 1; temp_area row 0 holds the carried-over row. */
+  if (dest_height > source_height) {
+    dest_band_height   = temp_area_height - 1;
+    source_band_height = dest_band_height * source_height / dest_height;
+  } else {
+    source_band_height = temp_area_height - 1;
+    dest_band_height   = source_band_height * vratio / vscale;
+  }
+
+  /* first row needs to be done so that we can stay one row ahead for vertical zoom */
+  Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, dest_width);
+
+  /* for each band of the image */
+  bands = (dest_height + dest_band_height - 1) / dest_band_height;
+
+  for (k = 0; k < bands; k++) {
+    /* scale one band horizontally */
+    for (i = 1; i < source_band_height + 1; i++) {
+      if (k * source_band_height + i < (int) source_height) {
+        Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
+                 temp_area + i * dest_pitch, 1, hratio, dest_width);
+      } else { /*  Duplicate the last row */
+        /* copy the previous temp_area row into this one */
+        memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
+      }
+    }
+
+    /* scale one band vertically */
+    for (j = 0; j < (int)dest_width; j++) {
+      Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
+               &dest[j], dest_pitch, vratio, dest_band_height);
+    }
+
+    /* copy temp_area row 0 over from last row in the past */
+    memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
+
+    /* move to the next band */
+    source += source_band_height * source_pitch;
+    dest   += dest_band_height * dest_pitch;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vpx_scale_frame
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src       : Pointer to frame to be scaled.
+ *                  YV12_BUFFER_CONFIG *dst       : Pointer to buffer to hold scaled frame.
+ *                  unsigned char *temp_area      : Pointer to temp work area.
+ *                  unsigned char temp_height     : Height of temp work area.
+ *                  unsigned int hscale          : Horizontal divisor: scaled width is y_width * hratio / hscale, rounded up.
+ *                  unsigned int hratio          : Horizontal multiplier.
+ *                  unsigned int vscale          : Vertical divisor: scaled height is y_height * vratio / vscale, rounded up.
+ *                  unsigned int vratio          : Vertical multiplier.
+ *                  unsigned int interlaced      : Interlace flag.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Scales the Y, U and V planes of src into dst with Scale2D,
+ *                  then fills the part of each dst plane beyond the scaled size.
+ *  SPECIAL NOTES : The chroma planes are scaled to half the scaled luma
+ *                  width and height (dw / 2 by dh / 2).
+ *
+ ****************************************************************************/
+void vpx_scale_frame
+(
+  YV12_BUFFER_CONFIG *src,
+  YV12_BUFFER_CONFIG *dst,
+  unsigned char *temp_area,
+  unsigned char temp_height,
+  unsigned int hscale,
+  unsigned int hratio,
+  unsigned int vscale,
+  unsigned int vratio,
+  unsigned int interlaced
+) {
+  int i;
+  int dw = (hscale - 1 + src->y_width * hratio) / hscale;  /* scaled luma width, rounded up */
+  int dh = (vscale - 1 + src->y_height * vratio) / vscale;  /* scaled luma height, rounded up */
+
+  /* call our internal scaling routines!! */
+  Scale2D((unsigned char *) src->y_buffer, src->y_stride, src->y_width, src->y_height,
+          (unsigned char *) dst->y_buffer, dst->y_stride, dw, dh,
+          temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+  /* Fill each scaled luma row from column dw - 1 onwards with the pixel at column dw - 2. */
+  if (dw < (int)dst->y_width)
+    for (i = 0; i < dh; i++)
+      memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i * dst->y_stride + dw - 2], dst->y_width - dw + 1);
+  /* Fill luma rows from dh - 1 downwards with copies of row dh - 2. */
+  if (dh < (int)dst->y_height)
+    for (i = dh - 1; i < (int)dst->y_height; i++)
+      memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
+  /* U plane: scaled to dw / 2 by dh / 2, then filled out to the dst chroma size. */
+  Scale2D((unsigned char *) src->u_buffer, src->uv_stride, src->uv_width, src->uv_height,
+          (unsigned char *) dst->u_buffer, dst->uv_stride, dw / 2, dh / 2,
+          temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+
+  if (dw / 2 < (int)dst->uv_width)
+    for (i = 0; i < dst->uv_height; i++)
+      memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
+
+  if (dh / 2 < (int)dst->uv_height)
+    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+      memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+  /* V plane: handled exactly like the U plane. */
+  Scale2D((unsigned char *) src->v_buffer, src->uv_stride, src->uv_width, src->uv_height,
+          (unsigned char *) dst->v_buffer, dst->uv_stride, dw / 2, dh / 2,
+          temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+
+  if (dw / 2 < (int)dst->uv_width)
+    for (i = 0; i < dst->uv_height; i++)
+      memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
+
+  if (dh / 2 < (int) dst->uv_height)
+    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+      memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+}
diff --git a/libs/libvpx/vpx_scale/generic/yv12config.c b/libs/libvpx/vpx_scale/generic/yv12config.c
new file mode 100644
index 0000000000..e8fee528e7
--- /dev/null
+++ b/libs/libvpx/vpx_scale/generic/yv12config.c
@@ -0,0 +1,287 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+/****************************************************************************
+*  Exports
+****************************************************************************/
+
+/****************************************************************************
+ * Round addr up to a multiple of align (the mask math needs a power of 2).
+ ****************************************************************************/
+#define yv12_align_addr(addr, align) \
+    (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))
+
+int
+vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+  /* Release the pixel storage owned by ybf (if any) and zero the whole
+   * descriptor.  Returns 0 on success, -1 when ybf is NULL. */
+  if (!ybf)
+    return -1;
+
+  /* A positive buffer_alloc_sz marks memory allocated by this module;
+   * when frame buffer callbacks are in use it is expected to stay 0, so
+   * the externally provided memory is not freed here. */
+  if (ybf->buffer_alloc_sz > 0)
+    vpx_free(ybf->buffer_alloc);
+
+  /* y_buffer, u_buffer and v_buffer all point into buffer_alloc; wiping
+   * the struct keeps stale pointers from being used after the free. */
+  memset(ybf, 0, sizeof(*ybf));
+
+  return 0;
+}
+
+int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                  int width, int height, int border) {
+  if (ybf) {  /* lay out a YV12 frame in ybf; returns 0, or < 0 on error */
+    int aligned_width = (width + 15) & ~15;    /* round up to 16 */
+    int aligned_height = (height + 15) & ~15;  /* round up to 16 */
+    int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
+    int yplane_size = (aligned_height + 2 * border) * y_stride;
+    int uv_width = aligned_width >> 1;
+    int uv_height = aligned_height >> 1;
+    /** There is currently a bunch of code which assumes
+      *  uv_stride == y_stride/2, so enforce this here. */
+    int uv_stride = y_stride >> 1;
+    int uvplane_size = (uv_height + border) * uv_stride;  /* 2 * border/2 */
+    const int frame_size = yplane_size + 2 * uvplane_size;
+    /* Allocate only when no buffer is attached; otherwise reuse it. */
+    if (!ybf->buffer_alloc) {
+      ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size);
+      ybf->buffer_alloc_sz = frame_size;
+    }
+    /* -1: allocation failed or the attached buffer is too small. */
+    if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size)
+      return -1;
+
+    /* Only support allocating buffers that have a border that's a multiple
+     * of 32. The border restriction is required to get 16-byte alignment of
+     * the start of the chroma rows without introducing an arbitrary gap
+     * between planes, which would break the semantics of things like
+     * vpx_img_set_rect(). */
+    if (border & 0x1f)
+      return -3;
+
+    ybf->y_crop_width = width;
+    ybf->y_crop_height = height;
+    ybf->y_width  = aligned_width;
+    ybf->y_height = aligned_height;
+    ybf->y_stride = y_stride;
+
+    ybf->uv_crop_width = (width + 1) / 2;
+    ybf->uv_crop_height = (height + 1) / 2;
+    ybf->uv_width = uv_width;
+    ybf->uv_height = uv_height;
+    ybf->uv_stride = uv_stride;
+
+    ybf->alpha_width = 0;
+    ybf->alpha_height = 0;
+    ybf->alpha_stride = 0;
+
+    ybf->border = border;
+    ybf->frame_size = frame_size;
+    /* Plane pointers skip the top border rows and the left border columns. */
+    ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
+    ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2  * uv_stride) + border / 2;
+    ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2  * uv_stride) + border / 2;
+    ybf->alpha_buffer = NULL;
+
+    ybf->corrupted = 0; /* assume not corrupted by errors */
+    return 0;
+  }
+  return -2;  /* ybf is NULL */
+}
+
+int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                int width, int height, int border) {
+  /* Discard whatever ybf held, then lay out a new frame; -2 if ybf is NULL. */
+  if (!ybf)
+    return -2;
+  vp8_yv12_de_alloc_frame_buffer(ybf);
+  return vp8_yv12_realloc_frame_buffer(ybf, width, height, border);
+}
+
+#if CONFIG_VP9 || CONFIG_VP10
+// TODO(jkoleszar): Maybe replace this with struct vpx_image
+
+int vpx_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+  /* Free storage owned by ybf and reset the descriptor.
+   * Returns 0 on success or -1 if ybf is NULL. */
+  if (ybf == NULL)
+    return -1;
+
+  /* Only memory recorded with a positive buffer_alloc_sz is released. */
+  if (ybf->buffer_alloc_sz > 0)
+    vpx_free(ybf->buffer_alloc);
+
+  /* The y/u/v plane pointers alias buffer_alloc, so clear every field to
+   * avoid leaving dangling pointers behind. */
+  memset(ybf, 0, sizeof(*ybf));
+
+  return 0;
+}
+
+int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                             int width, int height,
+                             int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                             int use_highbitdepth,
+#endif
+                             int border,
+                             int byte_alignment,
+                             vpx_codec_frame_buffer_t *fb,
+                             vpx_get_frame_buffer_cb_fn_t cb,
+                             void *cb_priv) {
+  if (ybf) {  /* size ybf for width x height; returns 0, or < 0 on error */
+    const int vp9_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
+    const int aligned_width = (width + 7) & ~7;
+    const int aligned_height = (height + 7) & ~7;
+    const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
+    const uint64_t yplane_size = (aligned_height + 2 * border) *
+                                 (uint64_t)y_stride + byte_alignment;
+    const int uv_width = aligned_width >> ss_x;
+    const int uv_height = aligned_height >> ss_y;
+    const int uv_stride = y_stride >> ss_x;
+    const int uv_border_w = border >> ss_x;
+    const int uv_border_h = border >> ss_y;
+    const uint64_t uvplane_size = (uv_height + 2 * uv_border_h) *
+                                  (uint64_t)uv_stride + byte_alignment;
+    /* (1 + use_highbitdepth) scales the byte count for high bit depth. */
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint64_t frame_size =
+        (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
+#else
+    const uint64_t frame_size = yplane_size + 2 * uvplane_size;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    uint8_t *buf = NULL;
+    /* With a callback, storage comes from cb/fb; otherwise grow our own. */
+    if (cb != NULL) {
+      const int align_addr_extra_size = 31;
+      const uint64_t external_frame_size = frame_size + align_addr_extra_size;
+
+      assert(fb != NULL);
+      /* Reject sizes that do not survive the round trip through size_t. */
+      if (external_frame_size != (size_t)external_frame_size)
+        return -1;
+
+      // Allocation to hold larger frame, or first allocation.
+      if (cb(cb_priv, (size_t)external_frame_size, fb) < 0)
+        return -1;
+
+      if (fb->data == NULL || fb->size < external_frame_size)
+        return -1;
+      /* The 31 spare bytes leave room to round fb->data up to 32 bytes. */
+      ybf->buffer_alloc = (uint8_t *)yv12_align_addr(fb->data, 32);
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+      // This memset is needed for fixing the issue of using uninitialized
+      // value in msan test. It will cause a perf loss, so only do this for
+      // msan test.
+      memset(ybf->buffer_alloc, 0, (int)frame_size);
+#endif
+#endif
+    } else if (frame_size > (size_t)ybf->buffer_alloc_sz) {
+      // Allocation to hold larger frame, or first allocation.
+      vpx_free(ybf->buffer_alloc);
+      ybf->buffer_alloc = NULL;
+
+      if (frame_size != (size_t)frame_size)
+        return -1;
+
+      ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size);
+      if (!ybf->buffer_alloc)
+        return -1;
+
+      ybf->buffer_alloc_sz = (int)frame_size;
+
+      // This memset is needed for fixing valgrind error from C loop filter
+      // due to access uninitialized memory in frame border. It could be
+      // removed if border is totally removed.
+      memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz);
+    }
+
+    /* Only support allocating buffers that have a border that's a multiple
+     * of 32. The border restriction is required to get 16-byte alignment of
+     * the start of the chroma rows without introducing an arbitrary gap
+     * between planes, which would break the semantics of things like
+     * vpx_img_set_rect(). */
+    if (border & 0x1f)
+      return -3;
+
+    ybf->y_crop_width = width;
+    ybf->y_crop_height = height;
+    ybf->y_width  = aligned_width;
+    ybf->y_height = aligned_height;
+    ybf->y_stride = y_stride;
+
+    ybf->uv_crop_width = (width + ss_x) >> ss_x;
+    ybf->uv_crop_height = (height + ss_y) >> ss_y;
+    ybf->uv_width = uv_width;
+    ybf->uv_height = uv_height;
+    ybf->uv_stride = uv_stride;
+
+    ybf->border = border;
+    ybf->frame_size = (int)frame_size;
+    ybf->subsampling_x = ss_x;
+    ybf->subsampling_y = ss_y;
+
+    buf = ybf->buffer_alloc;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (use_highbitdepth) {
+      // Store uint16 addresses when using 16bit framebuffers
+      buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
+      ybf->flags = YV12_FLAG_HIGHBITDEPTH;
+    } else {
+      ybf->flags = 0;
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    /* Each plane start is additionally rounded up to vp9_byte_align. */
+    ybf->y_buffer = (uint8_t *)yv12_align_addr(
+        buf + (border * y_stride) + border, vp9_byte_align);
+    ybf->u_buffer = (uint8_t *)yv12_align_addr(
+        buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w,
+        vp9_byte_align);
+    ybf->v_buffer = (uint8_t *)yv12_align_addr(
+        buf + yplane_size + uvplane_size + (uv_border_h * uv_stride) +
+        uv_border_w, vp9_byte_align);
+
+    ybf->corrupted = 0; /* assume not corrupted by errors */
+    return 0;
+  }
+  return -2;  /* ybf is NULL */
+}
+
+int vpx_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                           int width, int height,
+                           int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                           int use_highbitdepth,
+#endif
+                           int border,
+                           int byte_alignment) {
+  /* Release any old storage, then allocate internally (no callback). */
+  if (!ybf)
+    return -2;
+  vpx_free_frame_buffer(ybf);
+  return vpx_realloc_frame_buffer(ybf, width, height, ss_x, ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                  use_highbitdepth,
+#endif
+                                  border, byte_alignment, NULL, NULL, NULL);
+}
+#endif
diff --git a/libs/libvpx/vpx_scale/generic/yv12extend.c b/libs/libvpx/vpx_scale/generic/yv12extend.c
new file mode 100644
index 0000000000..670144bc10
--- /dev/null
+++ b/libs/libvpx/vpx_scale/generic/yv12extend.c
@@ -0,0 +1,324 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_scale/yv12config.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
+
+static void extend_plane(uint8_t *const src, int src_stride,
+                         int width, int height,
+                         int extend_top, int extend_left,
+                         int extend_bottom, int extend_right) {
+  /* Replicate the edge pixels of a width x height plane outward into the
+   * surrounding border: extend_left/extend_right columns on every row,
+   * then extend_top/extend_bottom full-width rows above and below. */
+  const int padded_width = extend_left + width + extend_right;
+  uint8_t *row = src;
+  uint8_t *first_row;
+  uint8_t *last_row;
+  uint8_t *fill;
+  int n;
+
+  /* Sides: smear each row's first and last pixel into its margins. */
+  for (n = 0; n < height; ++n) {
+    memset(row - extend_left, row[0], extend_left);
+    memset(row + width, row[width - 1], extend_right);
+    row += src_stride;
+  }
+
+  /* The first and last rows, including the side margins filled above,
+   * are the templates for the top and bottom borders. */
+  first_row = src - extend_left;
+  last_row = src + src_stride * (height - 1) - extend_left;
+
+  /* Top: start extend_top rows above the plane and walk downward. */
+  fill = first_row + src_stride * -extend_top;
+  for (n = 0; n < extend_top; ++n) {
+    memcpy(fill, first_row, padded_width);
+    fill += src_stride;
+  }
+
+  /* Bottom: start on the row right after the plane. */
+  fill = last_row + src_stride;
+  for (n = 0; n < extend_bottom; ++n) {
+    memcpy(fill, last_row, padded_width);
+    fill += src_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void extend_plane_high(uint8_t *const src8, int src_stride,
+                              int width, int height,
+                              int extend_top, int extend_left,
+                              int extend_bottom, int extend_right) {
+  /* 16-bit counterpart of extend_plane(): src8 is converted with
+   * CONVERT_TO_SHORTPTR and widths, strides and extents are applied to
+   * uint16_t pointers, i.e. they count samples rather than bytes. */
+  const int padded_width = extend_left + width + extend_right;
+  uint16_t *const plane = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *row = plane;
+  uint16_t *first_row;
+  uint16_t *last_row;
+  uint16_t *fill;
+  int n;
+
+  /* Sides: smear each row's first and last sample into its margins. */
+  for (n = 0; n < height; ++n) {
+    vpx_memset16(row - extend_left, row[0], extend_left);
+    vpx_memset16(row + width, row[width - 1], extend_right);
+    row += src_stride;
+  }
+
+  /* The first and last rows, including the side margins filled above,
+   * are the templates for the top and bottom borders. */
+  first_row = plane - extend_left;
+  last_row = plane + src_stride * (height - 1) - extend_left;
+
+  /* Top: start extend_top rows above the plane and walk downward. */
+  fill = first_row + src_stride * -extend_top;
+  for (n = 0; n < extend_top; ++n) {
+    memcpy(fill, first_row, padded_width * sizeof(uint16_t));
+    fill += src_stride;
+  }
+
+  /* Bottom: start on the row right after the plane. */
+  fill = last_row + src_stride;
+  for (n = 0; n < extend_bottom; ++n) {
+    memcpy(fill, last_row, padded_width * sizeof(uint16_t));
+    fill += src_stride;
+  }
+}
+#endif
+
+void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
+  const int uv_border = ybf->border / 2;  /* chroma gets half the border */
+  /* Fill the borders of the Y, U and V planes by replicating edge pixels. */
+  assert(ybf->border % 2 == 0);
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+  /* Bottom/right extents also cover the crop-to-aligned gap (0..15). */
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    extend_plane_high(
+        ybf->y_buffer, ybf->y_stride,
+        ybf->y_crop_width, ybf->y_crop_height,
+        ybf->border, ybf->border,
+        ybf->border + ybf->y_height - ybf->y_crop_height,
+        ybf->border + ybf->y_width - ybf->y_crop_width);
+
+    extend_plane_high(
+        ybf->u_buffer, ybf->uv_stride,
+        ybf->uv_crop_width, ybf->uv_crop_height,
+        uv_border, uv_border,
+        uv_border + ybf->uv_height - ybf->uv_crop_height,
+        uv_border + ybf->uv_width - ybf->uv_crop_width);
+
+    extend_plane_high(
+        ybf->v_buffer, ybf->uv_stride,
+        ybf->uv_crop_width, ybf->uv_crop_height,
+        uv_border, uv_border,
+        uv_border + ybf->uv_height - ybf->uv_crop_height,
+        uv_border + ybf->uv_width - ybf->uv_crop_width);
+    return;
+  }
+#endif
+  extend_plane(ybf->y_buffer, ybf->y_stride,
+               ybf->y_crop_width, ybf->y_crop_height,
+               ybf->border, ybf->border,
+               ybf->border + ybf->y_height - ybf->y_crop_height,
+               ybf->border + ybf->y_width - ybf->y_crop_width);
+
+  extend_plane(ybf->u_buffer, ybf->uv_stride,
+               ybf->uv_crop_width, ybf->uv_crop_height,
+               uv_border, uv_border,
+               uv_border + ybf->uv_height - ybf->uv_crop_height,
+               uv_border + ybf->uv_width - ybf->uv_crop_width);
+
+  extend_plane(ybf->v_buffer, ybf->uv_stride,
+               ybf->uv_crop_width, ybf->uv_crop_height,
+               uv_border, uv_border,
+               uv_border + ybf->uv_height - ybf->uv_crop_height,
+               uv_border + ybf->uv_width - ybf->uv_crop_width);
+}
+
+#if CONFIG_VP9 || CONFIG_VP10
+static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
+  /* Chroma counts as subsampled in a direction when its plane is smaller
+   * than luma there; its padding is then ext_size shifted right by one. */
+  const int ss_x = ybf->uv_width < ybf->y_width;
+  const int ss_y = ybf->uv_height < ybf->y_height;
+  /* Luma padding: bottom/right also cover the crop-to-aligned gap. */
+  const int y_eb = ext_size + ybf->y_height - ybf->y_crop_height;
+  const int y_er = ext_size + ybf->y_width - ybf->y_crop_width;
+  /* Chroma padding: top, left, bottom, right. */
+  const int c_et = ext_size >> ss_y;
+  const int c_el = ext_size >> ss_x;
+  const int c_eb = c_et + ybf->uv_height - ybf->uv_crop_height;
+  const int c_er = c_el + ybf->uv_width - ybf->uv_crop_width;
+
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    extend_plane_high(ybf->y_buffer, ybf->y_stride,
+                      ybf->y_crop_width, ybf->y_crop_height,
+                      ext_size, ext_size, y_eb, y_er);
+    extend_plane_high(ybf->u_buffer, ybf->uv_stride,
+                      ybf->uv_crop_width, ybf->uv_crop_height,
+                      c_et, c_el, c_eb, c_er);
+    extend_plane_high(ybf->v_buffer, ybf->uv_stride,
+                      ybf->uv_crop_width, ybf->uv_crop_height,
+                      c_et, c_el, c_eb, c_er);
+    return;
+  }
+#endif
+  extend_plane(ybf->y_buffer, ybf->y_stride,
+               ybf->y_crop_width, ybf->y_crop_height,
+               ext_size, ext_size, y_eb, y_er);
+  extend_plane(ybf->u_buffer, ybf->uv_stride, ybf->uv_crop_width,
+               ybf->uv_crop_height, c_et, c_el, c_eb, c_er);
+  extend_plane(ybf->v_buffer, ybf->uv_stride, ybf->uv_crop_width,
+               ybf->uv_crop_height, c_et, c_el, c_eb, c_er);
+}
+
+void vpx_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
+  extend_frame(ybf, ybf->border);  /* pad by the frame's full border */
+}
+
+void vpx_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) {
+  int inner_bw = ybf->border;  /* capped at VP9INNERBORDERINPIXELS below */
+  if (inner_bw > VP9INNERBORDERINPIXELS) inner_bw = VP9INNERBORDERINPIXELS;
+  extend_frame(ybf, inner_bw);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
+  /* Copy num uint16_t samples; both ends go through CONVERT_TO_SHORTPTR. */
+  const uint16_t *const from = CONVERT_TO_SHORTPTR(src8);
+  memcpy(CONVERT_TO_SHORTPTR(dst8), from, num * sizeof(uint16_t));
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_VP9 || CONFIG_VP10
+
+// Copies the source image into the destination image and updates the
+// destination's UMV borders.
+// Note: The frames are assumed to be identical in size.
+void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
+                           YV12_BUFFER_CONFIG *dst_ybc) {
+  int row;
+  const uint8_t *src = src_ybc->y_buffer;
+  uint8_t *dst = dst_ybc->y_buffer;
+  /* Rows are sized from src's dimensions; dst only supplies its strides. */
+#if 0
+  /* These assertions are valid in the codec, but the libvpx-tester uses
+   * this code slightly differently.
+   */
+  assert(src_ybc->y_width == dst_ybc->y_width);
+  assert(src_ybc->y_height == dst_ybc->y_height);
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    assert(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH);
+    for (row = 0; row < src_ybc->y_height; ++row) {
+      memcpy_short_addr(dst, src, src_ybc->y_width);
+      src += src_ybc->y_stride;
+      dst += dst_ybc->y_stride;
+    }
+
+    src = src_ybc->u_buffer;
+    dst = dst_ybc->u_buffer;
+
+    for (row = 0; row < src_ybc->uv_height; ++row) {
+      memcpy_short_addr(dst, src, src_ybc->uv_width);
+      src += src_ybc->uv_stride;
+      dst += dst_ybc->uv_stride;
+    }
+
+    src = src_ybc->v_buffer;
+    dst = dst_ybc->v_buffer;
+
+    for (row = 0; row < src_ybc->uv_height; ++row) {
+      memcpy_short_addr(dst, src, src_ybc->uv_width);
+      src += src_ybc->uv_stride;
+      dst += dst_ybc->uv_stride;
+    }
+
+    vp8_yv12_extend_frame_borders_c(dst_ybc);
+    return;
+  } else {
+    assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH));
+  }
+#endif
+  /* 8-bit path: copy Y, then U, then V, one row at a time. */
+  for (row = 0; row < src_ybc->y_height; ++row) {
+    memcpy(dst, src, src_ybc->y_width);
+    src += src_ybc->y_stride;
+    dst += dst_ybc->y_stride;
+  }
+
+  src = src_ybc->u_buffer;
+  dst = dst_ybc->u_buffer;
+
+  for (row = 0; row < src_ybc->uv_height; ++row) {
+    memcpy(dst, src, src_ybc->uv_width);
+    src += src_ybc->uv_stride;
+    dst += dst_ybc->uv_stride;
+  }
+
+  src = src_ybc->v_buffer;
+  dst = dst_ybc->v_buffer;
+
+  for (row = 0; row < src_ybc->uv_height; ++row) {
+    memcpy(dst, src, src_ybc->uv_width);
+    src += src_ybc->uv_stride;
+    dst += dst_ybc->uv_stride;
+  }
+  /* Rebuild dst's borders from the pixels that were just copied. */
+  vp8_yv12_extend_frame_borders_c(dst_ybc);
+}
+
+void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
+                       YV12_BUFFER_CONFIG *dst_ybc) {
+  /* Copy only the luma plane, row by row, using src's width and height
+   * and each buffer's own stride.  No border extension is performed
+   * here. */
+  const int rows = src_ybc->y_height;
+  const int cols = src_ybc->y_width;
+  const uint8_t *from = src_ybc->y_buffer;
+  uint8_t *to = dst_ybc->y_buffer;
+  int r;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *from16 = CONVERT_TO_SHORTPTR(from);
+    uint16_t *to16 = CONVERT_TO_SHORTPTR(to);
+    for (r = 0; r < rows; ++r)
+      memcpy(to16 + r * dst_ybc->y_stride, from16 + r * src_ybc->y_stride,
+             cols * sizeof(uint16_t));
+    return;
+  }
+#endif
+
+  for (r = 0; r < rows; ++r)
+    memcpy(to + r * dst_ybc->y_stride, from + r * src_ybc->y_stride, cols);
+}
diff --git a/libs/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c b/libs/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c
new file mode 100644
index 0000000000..aab478539a
--- /dev/null
+++ b/libs/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c
@@ -0,0 +1,144 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpx_scale.h"
+
+#if HAVE_DSPR2
+static void extend_plane(uint8_t *const src, int src_stride,
+                         int width, int height,
+                         int extend_top, int extend_left,
+                         int extend_bottom, int extend_right) {
+  /* Replicate the plane's edge pixels into its border.  The left margin
+   * uses DSPr2 stores; the right margin uses memset() so that its length
+   * is extend_right (the old shared loops were sized by extend_left only)
+   * and so that src + width needs no particular alignment. */
+  int       i, j;
+  uint8_t   *left_src, *right_src;
+  uint8_t   *left_dst_start, *right_dst_start;
+  uint8_t   *left_dst;
+  uint8_t   *top_src, *bot_src;
+  uint8_t   *top_dst, *bot_dst;
+  uint32_t  left_pix;
+  uint32_t  linesize;
+
+  /* copy the left and right most columns out */
+  left_src  = src;
+  right_src = src + width - 1;
+  left_dst_start = src - extend_left;
+  right_dst_start = src + width;
+
+  for (i = height; i--; ) {
+    left_dst  = left_dst_start;
+
+    /* load the leftmost pixel and replicate it into all four bytes */
+    __asm__ __volatile__ (
+        "lb        %[left_pix],     0(%[left_src])      \n\t"
+        "replv.qb  %[left_pix],     %[left_pix]         \n\t"
+        : [left_pix] "=&r" (left_pix)
+        : [left_src] "r" (left_src)
+        : "memory"
+    );
+
+    for (j = extend_left/4; j--; ) {
+      __asm__ __volatile__ (
+        "sw     %[left_pix],    0(%[left_dst])     \n\t"
+
+        :
+        : [left_dst] "r" (left_dst), [left_pix] "r" (left_pix)
+        : "memory"  /* the store is invisible to the compiler otherwise */
+      );
+
+      left_dst += 4;
+    }
+
+    for (j = extend_left%4; j--; ) {
+      __asm__ __volatile__ (
+        "sb     %[left_pix],    0(%[left_dst])     \n\t"
+
+        :
+        : [left_dst] "r" (left_dst), [left_pix] "r" (left_pix)
+        : "memory"
+      );
+
+      left_dst += 1;
+    }
+
+    /* right margin: extend_right copies of the rightmost pixel */
+    memset(right_dst_start, right_src[0], extend_right);
+
+    left_src  += src_stride;
+    right_src += src_stride;
+    left_dst_start += src_stride;
+    right_dst_start += src_stride;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  top_src = src - extend_left;
+  bot_src = src + src_stride * (height - 1) - extend_left;
+  top_dst = src + src_stride * (-extend_top) - extend_left;
+  bot_dst = src + src_stride * (height) - extend_left;
+  linesize = extend_left + extend_right + width;
+
+  for (i = 0; i < extend_top; i++) {
+    memcpy(top_dst, top_src, linesize);
+    top_dst += src_stride;
+  }
+
+  for (i = 0; i < extend_bottom; i++) {
+    memcpy(bot_dst, bot_src, linesize);
+    bot_dst += src_stride;
+  }
+}
+
+static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
+  /* ss_x/ss_y are 1 when chroma is smaller than luma in that direction;
+   * the chroma padding is then ext_size shifted right by one. */
+  const int ss_x = ybf->uv_width < ybf->y_width;
+  const int ss_y = ybf->uv_height < ybf->y_height;
+  const int y_eb = ext_size + ybf->y_height - ybf->y_crop_height;
+  const int y_er = ext_size + ybf->y_width - ybf->y_crop_width;
+  const int c_et = ext_size >> ss_y;
+  const int c_el = ext_size >> ss_x;
+  const int c_eb = c_et + ybf->uv_height - ybf->uv_crop_height;
+  const int c_er = c_el + ybf->uv_width - ybf->uv_crop_width;
+
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+  extend_plane(ybf->y_buffer, ybf->y_stride,
+               ybf->y_crop_width, ybf->y_crop_height,
+               ext_size, ext_size, y_eb, y_er);
+  extend_plane(ybf->u_buffer, ybf->uv_stride,
+               ybf->uv_crop_width, ybf->uv_crop_height,
+               c_et, c_el, c_eb, c_er);
+  extend_plane(ybf->v_buffer, ybf->uv_stride,
+               ybf->uv_crop_width, ybf->uv_crop_height,
+               c_et, c_el, c_eb, c_er);
+}
+
+void vpx_extend_frame_borders_dspr2(YV12_BUFFER_CONFIG *ybf) {
+  extend_frame(ybf, ybf->border);  /* pad by the frame's full border */
+}
+
+void vpx_extend_frame_inner_borders_dspr2(YV12_BUFFER_CONFIG *ybf) {
+  int inner_bw = ybf->border;  /* capped at VP9INNERBORDERINPIXELS below */
+  if (inner_bw > VP9INNERBORDERINPIXELS) inner_bw = VP9INNERBORDERINPIXELS;
+  extend_frame(ybf, inner_bw);
+}
+#endif
diff --git a/libs/libvpx/vpx_scale/vpx_scale.h b/libs/libvpx/vpx_scale/vpx_scale.h
new file mode 100644
index 0000000000..43fcf9d66e
--- /dev/null
+++ b/libs/libvpx/vpx_scale/vpx_scale.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_SCALE_VPX_SCALE_H_
+#define VPX_SCALE_VPX_SCALE_H_
+
+#include "vpx_scale/yv12config.h"
+
+extern void vpx_scale_frame(YV12_BUFFER_CONFIG *src,   /* frame to scale */
+                            YV12_BUFFER_CONFIG *dst,   /* receives output */
+                            unsigned char *temp_area,  /* presumably scratch */
+                            unsigned char temp_height,
+                            unsigned int hscale,  /* hscale/hratio: presumably */
+                            unsigned int hratio,  /* the horizontal factor */
+                            unsigned int vscale,  /* vscale/vratio: presumably */
+                            unsigned int vratio,  /* the vertical factor */
+                            unsigned int interlaced);
+
+#endif  // VPX_SCALE_VPX_SCALE_H_
diff --git a/libs/libvpx/vpx_scale/vpx_scale.mk b/libs/libvpx/vpx_scale/vpx_scale.mk
new file mode 100644
index 0000000000..a49abf3b4b
--- /dev/null
+++ b/libs/libvpx/vpx_scale/vpx_scale.mk
@@ -0,0 +1,16 @@
+SCALE_SRCS-yes += vpx_scale.mk
+SCALE_SRCS-yes += yv12config.h
+SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += vpx_scale.h
+SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/vpx_scale.c
+SCALE_SRCS-yes += generic/yv12config.c
+SCALE_SRCS-yes += generic/yv12extend.c
+SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c
+SCALE_SRCS-yes += vpx_scale_rtcd.c
+SCALE_SRCS-yes += vpx_scale_rtcd.pl
+# A file keyed by a variable is appended to SCALE_SRCS-<that variable's value>.
+# MIPS DSPr2 border extension; joins SCALE_SRCS-yes only if HAVE_DSPR2 is yes.
+SCALE_SRCS-$(HAVE_DSPR2)  += mips/dspr2/yv12extend_dspr2.c
+# Anything named in SCALE_SRCS_REMOVE-yes is appended to the -no list.
+SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes)
+# rtcd_h_template is defined outside this file; presumably it emits vpx_scale_rtcd.h.
+$(eval $(call rtcd_h_template,vpx_scale_rtcd,vpx_scale/vpx_scale_rtcd.pl))
diff --git a/libs/libvpx/vpx_scale/vpx_scale_rtcd.c b/libs/libvpx/vpx_scale/vpx_scale_rtcd.c
new file mode 100644
index 0000000000..bea603fd10
--- /dev/null
+++ b/libs/libvpx/vpx_scale/vpx_scale_rtcd.c
@@ -0,0 +1,18 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vpx_scale_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+void vpx_scale_rtcd(void)
+{
+    once(setup_rtcd_internal);  /* RTCD setup, guarded by vpx_once.h's once() */
+}
diff --git a/libs/libvpx/vpx_scale/vpx_scale_rtcd.pl b/libs/libvpx/vpx_scale/vpx_scale_rtcd.pl
new file mode 100644
index 0000000000..56b952ba35
--- /dev/null
+++ b/libs/libvpx/vpx_scale/vpx_scale_rtcd.pl
@@ -0,0 +1,32 @@
+sub vpx_scale_forward_decls() {
+print <<EOF
+struct yv12_buffer_config;
+EOF
+}
+forward_decls qw/vpx_scale_forward_decls/;
+# The sub above prints a forward declaration of struct yv12_buffer_config.
+# Scaler functions
+if (vpx_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") {
+    add_proto qw/void vp8_horizontal_line_5_4_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
+    add_proto qw/void vp8_vertical_band_5_4_scale/, "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width";
+    add_proto qw/void vp8_horizontal_line_5_3_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
+    add_proto qw/void vp8_vertical_band_5_3_scale/, "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width";
+    add_proto qw/void vp8_horizontal_line_2_1_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
+    add_proto qw/void vp8_vertical_band_2_1_scale/, "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width";
+    add_proto qw/void vp8_vertical_band_2_1_scale_i/, "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width";
+}
+# Border extension and frame copy helpers, declared for every configuration
+add_proto qw/void vp8_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf";
+
+add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
+
+add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
+# VP9/VP10-only entry points; a dspr2 specialization is requested for each
+if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes")) {
+    add_proto qw/void vpx_extend_frame_borders/, "struct yv12_buffer_config *ybf";
+    specialize qw/vpx_extend_frame_borders dspr2/;
+
+    add_proto qw/void vpx_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf";
+    specialize qw/vpx_extend_frame_inner_borders dspr2/;
+}
+1;
diff --git a/libs/libvpx/vpx_scale/yv12config.h b/libs/libvpx/vpx_scale/yv12config.h
new file mode 100644
index 0000000000..37b255d4d3
--- /dev/null
+++ b/libs/libvpx/vpx_scale/yv12config.h
@@ -0,0 +1,105 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_SCALE_YV12CONFIG_H_
+#define VPX_SCALE_YV12CONFIG_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_frame_buffer.h"
+#include "vpx/vpx_integer.h"
+
+#define VP8BORDERINPIXELS           32
+#define VP9INNERBORDERINPIXELS      96
+#define VP9_INTERP_EXTEND           4
+#define VP9_ENC_BORDER_IN_PIXELS    160
+#define VP9_DEC_BORDER_IN_PIXELS    32
+
+typedef struct yv12_buffer_config {
+  int   y_width;
+  int   y_height;
+  int   y_crop_width;
+  int   y_crop_height;
+  int   y_stride;
+
+  int   uv_width;
+  int   uv_height;
+  int   uv_crop_width;
+  int   uv_crop_height;
+  int   uv_stride;
+
+  int   alpha_width;
+  int   alpha_height;
+  int   alpha_stride;
+
+  uint8_t *y_buffer;
+  uint8_t *u_buffer;
+  uint8_t *v_buffer;
+  uint8_t *alpha_buffer;
+
+  uint8_t *buffer_alloc;
+  int buffer_alloc_sz;
+  int border;
+  int frame_size;
+  int subsampling_x;
+  int subsampling_y;
+  unsigned int bit_depth;
+  vpx_color_space_t color_space;
+  vpx_color_range_t color_range;
+  int render_width;
+  int render_height;
+
+  int corrupted;
+  int flags;
+} YV12_BUFFER_CONFIG;
+
+#define YV12_FLAG_HIGHBITDEPTH 8
+
+int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                int width, int height, int border);
+int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                  int width, int height, int border);
+int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+int vpx_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                           int width, int height, int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                           int use_highbitdepth,
+#endif
+                           int border, int byte_alignment);
+
+// Updates the yv12 buffer config with the frame buffer. |byte_alignment| must
+// be a power of 2, from 32 to 1024. 0 sets legacy alignment. If cb is not
+// NULL, then libvpx is using the frame buffer callbacks to handle memory.
+// If cb is not NULL, libvpx will call cb with minimum size in bytes needed
+// to decode the current frame. If cb is NULL, libvpx will allocate memory
+// internally to decode the current frame. Returns 0 on success. Returns < 0
+// on failure.
+int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                             int width, int height, int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                             int use_highbitdepth,
+#endif
+                             int border,
+                             int byte_alignment,
+                             vpx_codec_frame_buffer_t *fb,
+                             vpx_get_frame_buffer_cb_fn_t cb,
+                             void *cb_priv);
+int vpx_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VPX_SCALE_YV12CONFIG_H_
diff --git a/libs/libvpx/vpx_util/endian_inl.h b/libs/libvpx/vpx_util/endian_inl.h
new file mode 100644
index 0000000000..37bdce1ccd
--- /dev/null
+++ b/libs/libvpx/vpx_util/endian_inl.h
@@ -0,0 +1,120 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Endian related functions.
+
+#ifndef VPX_UTIL_ENDIAN_INL_H_
+#define VPX_UTIL_ENDIAN_INL_H_
+
+#include <stdlib.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#if defined(__GNUC__)
+# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+# define LOCAL_GCC_PREREQ(maj, min) \
+    (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GCC_VERSION 0
+# define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+// handle clang compatibility
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) && \
+    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+#if defined(WORDS_BIGENDIAN)
+#define HToLE32 BSwap32
+#define HToLE16 BSwap16
+#define HToBE64(x) (x)
+#define HToBE32(x) (x)
+#else
+#define HToLE32(x) (x)
+#define HToLE16(x) (x)
+#define HToBE64(X) BSwap64(X)
+#define HToBE32(X) BSwap32(X)
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
+#define HAVE_BUILTIN_BSWAP32
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+#define HAVE_BUILTIN_BSWAP64
+#endif
+
+#if HAVE_MIPS32 && defined(__mips__) && !defined(__mips64) && \
+    defined(__mips_isa_rev) && (__mips_isa_rev >= 2) && (__mips_isa_rev < 6)
+#define VPX_USE_MIPS32_R2
+#endif
+
+static INLINE uint16_t BSwap16(uint16_t x) {
+#ifdef HAVE_BUILTIN_BSWAP16
+  return __builtin_bswap16(x);  // compiler intrinsic
+#elif defined(_MSC_VER)
+  return _byteswap_ushort(x);   // MSVC CRT routine
+#else
+  // Portable byte exchange; gcc will recognize a 'rorw $8, ...' here:
+  return ((x & 0xff) << 8) | (x >> 8);
+#endif  // HAVE_BUILTIN_BSWAP16
+}
+
+static INLINE uint32_t BSwap32(uint32_t x) {  // byte-reverses a 32-bit value
+#if defined(VPX_USE_MIPS32_R2)  // checked first: two-instruction inline asm
+  uint32_t ret;
+  __asm__ volatile (
+    "wsbh   %[ret], %[x]          \n\t"
+    "rotr   %[ret], %[ret],  16   \n\t"
+    : [ret]"=r"(ret)
+    : [x]"r"(x)
+  );
+  return ret;
+#elif defined(HAVE_BUILTIN_BSWAP32)  // compiler intrinsic
+  return __builtin_bswap32(x);
+#elif defined(__i386__) || defined(__x86_64__)  // x86 inline asm
+  uint32_t swapped_bytes;
+  __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x));
+  return swapped_bytes;
+#elif defined(_MSC_VER)  // MSVC CRT routine
+  return (uint32_t)_byteswap_ulong(x);
+#else  // portable shift-and-mask fallback
+  return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24);
+#endif  // HAVE_BUILTIN_BSWAP32
+}
+
+static INLINE uint64_t BSwap64(uint64_t x) {  // byte-reverses a 64-bit value
+#if defined(HAVE_BUILTIN_BSWAP64)  // compiler intrinsic
+  return __builtin_bswap64(x);
+#elif defined(__x86_64__)  // x86-64 inline asm
+  uint64_t swapped_bytes;
+  __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x));
+  return swapped_bytes;
+#elif defined(_MSC_VER)  // MSVC CRT routine
+  return (uint64_t)_byteswap_uint64(x);
+#else  // generic code for swapping 64-bit values (suggested by bdb@)
+  x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32);
+  x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16);
+  x = ((x & 0xff00ff00ff00ff00ull) >>  8) | ((x & 0x00ff00ff00ff00ffull) <<  8);
+  return x;
+#endif  // HAVE_BUILTIN_BSWAP64
+}
+
+#endif  // VPX_UTIL_ENDIAN_INL_H_
diff --git a/libs/libvpx/vpx_util/vpx_thread.c b/libs/libvpx/vpx_util/vpx_thread.c
new file mode 100644
index 0000000000..0bb0125bd4
--- /dev/null
+++ b/libs/libvpx/vpx_util/vpx_thread.c
@@ -0,0 +1,184 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Multi-threaded worker
+//
+// Original source:
+//  http://git.chromium.org/webm/libwebp.git
+//  100644 blob 264210ba2807e4da47eb5d18c04cf869d89b9784  src/utils/thread.c
+
+#include <assert.h>
+#include <string.h>   // for memset()
+#include "./vpx_thread.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if CONFIG_MULTITHREAD
+
+struct VPxWorkerImpl {
+  pthread_mutex_t mutex_;
+  pthread_cond_t  condition_;
+  pthread_t       thread_;
+};
+
+//------------------------------------------------------------------------------
+
+static void execute(VPxWorker *const worker);  // Forward declaration.
+
+static THREADFN thread_loop(void *ptr) {
+  VPxWorker *const w = (VPxWorker*)ptr;
+  VPxWorkerImpl *const impl = w->impl_;
+  int running = 1;
+  do {
+    pthread_mutex_lock(&impl->mutex_);
+    // Sleep on the condition for as long as the worker is idle (OK).
+    while (w->status_ == OK)
+      pthread_cond_wait(&impl->condition_, &impl->mutex_);
+    switch (w->status_) {
+      case WORK: execute(w); w->status_ = OK; break;  // run the job, go idle
+      case NOT_OK: running = 0; break;                // told to terminate
+      default: break;
+    }
+    // Wake whoever waits on the condition for the state change (see sync()).
+    pthread_cond_signal(&impl->condition_);
+    pthread_mutex_unlock(&impl->mutex_);
+  } while (running);
+  return THREAD_RETURN(NULL);  // thread exits here
+}
+
+// main thread state control
+static void change_state(VPxWorker *const worker,
+                         VPxWorkerStatus new_status) {
+  VPxWorkerImpl *const impl = worker->impl_;
+  // No impl_ means the thread never came up: nothing to do. impl_ is tested
+  // (not status_) because reading status_ unlocked would be a data race.
+  if (impl == NULL) return;
+
+  pthread_mutex_lock(&impl->mutex_);
+  if (worker->status_ >= OK) {
+    // Let the thread finish whatever job is still in flight.
+    while (worker->status_ != OK) {
+      pthread_cond_wait(&impl->condition_, &impl->mutex_);
+    }
+    // Publish the new state and wake the thread; OK means "just wait".
+    if (new_status != OK) {
+      worker->status_ = new_status;
+      pthread_cond_signal(&impl->condition_);
+    }
+  }
+  pthread_mutex_unlock(&impl->mutex_);
+}
+
+#endif  // CONFIG_MULTITHREAD
+
+//------------------------------------------------------------------------------
+
+static void init(VPxWorker *const worker) {
+  memset(worker, 0, sizeof(VPxWorker));  // zero every field, impl_ included
+  worker->status_ = NOT_OK;              // stays unusable until reset()
+}
+
+static int sync(VPxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+  change_state(worker, OK);  // blocks while the thread is still in WORK
+#endif
+  assert(worker->status_ <= OK);  // i.e. not WORK
+  return worker->had_error == 0;  // 1 when no hook call has failed
+}
+
+static int reset(VPxWorker *const worker) {  // returns 1 on success, 0 on error
+  int ok = 1;
+  worker->had_error = 0;  // forget errors from earlier jobs
+  if (worker->status_ < OK) {  // not started yet: bring the worker up
+#if CONFIG_MULTITHREAD
+    worker->impl_ = (VPxWorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_));
+    if (worker->impl_ == NULL) {
+      return 0;
+    }
+    if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
+      goto Error;  // nothing to destroy yet, only impl_ to free
+    }
+    if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
+      pthread_mutex_destroy(&worker->impl_->mutex_);
+      goto Error;
+    }
+    pthread_mutex_lock(&worker->impl_->mutex_);  // held while status_ is set
+    ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
+    if (ok) worker->status_ = OK;
+    pthread_mutex_unlock(&worker->impl_->mutex_);
+    if (!ok) {
+      pthread_mutex_destroy(&worker->impl_->mutex_);
+      pthread_cond_destroy(&worker->impl_->condition_);
+ Error:  // shared cleanup; also reached by the gotos above
+      vpx_free(worker->impl_);
+      worker->impl_ = NULL;
+      return 0;
+    }
+#else
+    worker->status_ = OK;  // no thread to create in this configuration
+#endif
+  } else if (worker->status_ > OK) {  // busy: wait for the running job
+    ok = sync(worker);
+  }
+  assert(!ok || (worker->status_ == OK));
+  return ok;
+}
+
+static void execute(VPxWorker *const worker) {
+  if (worker->hook == NULL) return;  // no job installed
+  // A false (zero) result from the hook latches had_error to non-zero.
+  if (!worker->hook(worker->data1, worker->data2)) worker->had_error |= 1;
+}
+
+static void launch(VPxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+  change_state(worker, WORK);  // hand the job to the worker thread
+#else
+  execute(worker);  // no worker thread: run the hook in the calling thread
+#endif
+}
+
+static void end(VPxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+  if (worker->impl_ != NULL) {  // NULL: no live thread to shut down
+    change_state(worker, NOT_OK);  // makes thread_loop leave its loop
+    pthread_join(worker->impl_->thread_, NULL);
+    pthread_mutex_destroy(&worker->impl_->mutex_);
+    pthread_cond_destroy(&worker->impl_->condition_);
+    vpx_free(worker->impl_);
+    worker->impl_ = NULL;
+  }
+#else
+  worker->status_ = NOT_OK;
+  assert(worker->impl_ == NULL);  // reset() never allocates impl_ in this build
+#endif
+  assert(worker->status_ == NOT_OK);
+}
+
+//------------------------------------------------------------------------------
+
+static VPxWorkerInterface g_worker_interface = {
+  init, reset, sync, launch, execute, end
+};
+
+int vpx_set_worker_interface(const VPxWorkerInterface* const winterface) {
+  // Refuse a missing table or one with any entry point left NULL.
+  if (winterface == NULL) return 0;
+  if (winterface->init == NULL || winterface->reset == NULL) return 0;
+  if (winterface->sync == NULL || winterface->launch == NULL) return 0;
+  if (winterface->execute == NULL || winterface->end == NULL) return 0;
+  // Copied by value, so the caller's struct need not outlive this call.
+  g_worker_interface = *winterface;
+  return 1;
+}
+
+const VPxWorkerInterface *vpx_get_worker_interface(void) {
+  return &g_worker_interface;
+}
+
+//------------------------------------------------------------------------------
diff --git a/libs/libvpx/vpx_util/vpx_thread.h b/libs/libvpx/vpx_util/vpx_thread.h
new file mode 100644
index 0000000000..de63c4da00
--- /dev/null
+++ b/libs/libvpx/vpx_util/vpx_thread.h
@@ -0,0 +1,223 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Multi-threaded worker
+//
+// Original source:
+//  http://git.chromium.org/webm/libwebp.git
+//  100644 blob 7bd451b124ae3b81596abfbcc823e3cb129d3a38  src/utils/thread.h
+
+#ifndef VPX_THREAD_H_
+#define VPX_THREAD_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Set maximum decode threads to be 8 due to the limit of frame buffers
+// and not enough semaphores in the emulation layer on windows.
+#define MAX_DECODE_THREADS 8
+
+#if CONFIG_MULTITHREAD
+
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+#include <errno.h>  // NOLINT
+#include <process.h>  // NOLINT
+#include <windows.h>  // NOLINT
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+typedef struct {
+  HANDLE waiting_sem_;
+  HANDLE received_sem_;
+  HANDLE signal_event_;
+} pthread_cond_t;
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#define THREADFN unsigned int __stdcall
+#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
+
+static INLINE int pthread_create(pthread_t* const thread, const void* attr,
+                                 unsigned int (__stdcall *start)(void*),
+                                 void* arg) {
+  (void)attr;
+  *thread = (pthread_t)_beginthreadex(NULL,   /* void *security */
+                                      0,      /* unsigned stack_size */
+                                      start,
+                                      arg,
+                                      0,      /* unsigned initflag */
+                                      NULL);  /* unsigned *thrdaddr */
+  if (*thread == NULL) return 1;
+  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+  return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void** value_ptr) {
+  (void)value_ptr;  // the thread's exit value is not reported
+  if (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0) return 1;
+  return CloseHandle(thread) == 0;  // 1 if the handle could not be closed
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     void* mutexattr) {
+  (void)mutexattr;
+  InitializeCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+  return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+  EnterCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+  LeaveCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+  DeleteCriticalSection(mutex);
+  return 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+  // Attempt every close even if an earlier one fails; return 1 on any failure.
+  const int closed_waiting = (CloseHandle(condition->waiting_sem_) != 0);
+  const int closed_received = (CloseHandle(condition->received_sem_) != 0);
+  const int closed_event = (CloseHandle(condition->signal_event_) != 0);
+  return !(closed_waiting & closed_received & closed_event);
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+                                    void* cond_attr) {
+  (void)cond_attr;
+  condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
+  condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
+  condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
+  if (condition->waiting_sem_ == NULL ||
+      condition->received_sem_ == NULL ||
+      condition->signal_event_ == NULL) {
+    pthread_cond_destroy(condition);
+    return 1;
+  }
+  return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+  int ok = 1;  // returns 0 on success (or when nobody waits), 1 on failure
+  if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
+    // a thread is waiting in pthread_cond_wait: allow it to be notified
+    ok = (SetEvent(condition->signal_event_) != 0);
+    // Block until the waiter acknowledges, so this thread cannot consume the
+    // event in its own pthread_cond_wait. WAIT_OBJECT_0 means success.
+    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) ==
+           WAIT_OBJECT_0);
+  }
+  return !ok;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+                                    pthread_mutex_t *const mutex) {
+  int ok;
+  // Returns 0 on success, 1 on failure. First announce a waiter so that the
+  // signal isn't dropped in pthread_cond_signal.
+  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1;
+  // now unlock the mutex so pthread_cond_signal may be issued
+  pthread_mutex_unlock(mutex);
+  ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
+        WAIT_OBJECT_0);
+  // BOOL is only guaranteed non-zero on success: normalize before the AND.
+  ok &= (ReleaseSemaphore(condition->received_sem_, 1, NULL) != 0);
+  pthread_mutex_lock(mutex);
+  return !ok;
+}
+#else  // _WIN32
+#include <pthread.h> // NOLINT
+# define THREADFN void*
+# define THREAD_RETURN(val) val
+#endif
+
+#endif  // CONFIG_MULTITHREAD
+
+// State of the worker thread object
+typedef enum {
+  NOT_OK = 0,   // object is unusable
+  OK,           // ready to work
+  WORK          // busy finishing the current task
+} VPxWorkerStatus;
+
+// Function to be called by the worker thread. Takes two opaque pointers as
+// arguments (data1 and data2), and should return false in case of error.
+typedef int (*VPxWorkerHook)(void*, void*);
+
+// Platform-dependent implementation details for the worker.
+typedef struct VPxWorkerImpl VPxWorkerImpl;
+
+// Synchronization object used to launch job in the worker thread
+typedef struct {
+  VPxWorkerImpl *impl_;   // platform-dependent state (see VPxWorkerImpl)
+  VPxWorkerStatus status_;
+  VPxWorkerHook hook;     // hook to call
+  void *data1;            // first argument passed to 'hook'
+  void *data2;            // second argument passed to 'hook'
+  int had_error;          // set when 'hook' returns false; cleared by reset()
+} VPxWorker;
+
+// The interface for all thread-worker related functions. All these functions
+// must be implemented.
+typedef struct {
+  // Must be called first, before any other method.
+  void (*init)(VPxWorker *const worker);
+  // Must be called to initialize the object and spawn the thread. Re-entrant.
+  // Will potentially launch the thread. Returns false in case of error.
+  int (*reset)(VPxWorker *const worker);
+  // Makes sure the previous work is finished. Returns true if worker->had_error
+  // was not set and no error condition was triggered by the working thread.
+  int (*sync)(VPxWorker *const worker);
+  // Triggers the thread to call hook() with data1 and data2 arguments. These
+  // hook/data1/data2 values can be changed at any time before calling this
+  // function, but must not be changed afterward until the next call to sync().
+  void (*launch)(VPxWorker *const worker);
+  // This function is similar to launch() except that it calls the
+  // hook directly instead of using a thread. Convenient to bypass the thread
+  // mechanism while still using the VPxWorker structs. sync() must
+  // still be called afterward (for error reporting).
+  void (*execute)(VPxWorker *const worker);
+  // Kill the thread and terminate the object. To use the object again, one
+  // must call reset() again.
+  void (*end)(VPxWorker *const worker);
+} VPxWorkerInterface;
+
+// Install a new set of threading functions, overriding the defaults. This
+// should be done before any workers are started, i.e., before any encoding or
+// decoding takes place. The contents of the interface struct are copied, it
+// is safe to free the corresponding memory after this call. This function is
+// not thread-safe. Return false in case of invalid pointer or methods.
+int vpx_set_worker_interface(const VPxWorkerInterface *const winterface);
+
+// Retrieve the currently set thread worker interface.
+const VPxWorkerInterface *vpx_get_worker_interface(void);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // VPX_THREAD_H_
diff --git a/libs/libvpx/vpx_util/vpx_util.mk b/libs/libvpx/vpx_util/vpx_util.mk
new file mode 100644
index 0000000000..c0ef8d3362
--- /dev/null
+++ b/libs/libvpx/vpx_util/vpx_util.mk
@@ -0,0 +1,14 @@
+##
+## Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+UTIL_SRCS-yes += vpx_util.mk
+UTIL_SRCS-yes += vpx_thread.c
+UTIL_SRCS-yes += vpx_thread.h
+UTIL_SRCS-yes += endian_inl.h
diff --git a/libs/libvpx/vpxdec.c b/libs/libvpx/vpxdec.c
new file mode 100644
index 0000000000..285d58e1e7
--- /dev/null
+++ b/libs/libvpx/vpxdec.c
@@ -0,0 +1,1160 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <limits.h>
+
+#include "./vpx_config.h"
+
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
+#include "./args.h"
+#include "./ivfdec.h"
+
+#include "vpx/vpx_decoder.h"
+#include "vpx_ports/mem_ops.h"
+#include "vpx_ports/vpx_timer.h"
+
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
+#include "vpx/vp8dx.h"
+#endif
+
+#include "./md5_utils.h"
+
+#include "./tools_common.h"
+#if CONFIG_WEBM_IO
+#include "./webmdec.h"
+#endif
+#include "./y4menc.h"
+
+static const char *exec_name;
+
+struct VpxDecInputContext {
+  struct VpxInputContext *vpx_input_ctx;
+  struct WebmInputContext *webm_ctx;
+};
+
+static const arg_def_t looparg = ARG_DEF(
+    NULL, "loops", 1, "Number of times to decode the file");
+static const arg_def_t codecarg = ARG_DEF(
+    NULL, "codec", 1, "Codec to use");
+static const arg_def_t use_yv12 = ARG_DEF(
+    NULL, "yv12", 0, "Output raw YV12 frames");
+static const arg_def_t use_i420 = ARG_DEF(
+    NULL, "i420", 0, "Output raw I420 frames");
+static const arg_def_t flipuvarg = ARG_DEF(
+    NULL, "flipuv", 0, "Flip the chroma planes in the output");
+static const arg_def_t rawvideo = ARG_DEF(
+    NULL, "rawvideo", 0, "Output raw YUV frames");
+static const arg_def_t noblitarg = ARG_DEF(
+    NULL, "noblit", 0, "Don't process the decoded frames");
+static const arg_def_t progressarg = ARG_DEF(
+    NULL, "progress", 0, "Show progress after each frame decodes");
+static const arg_def_t limitarg = ARG_DEF(
+    NULL, "limit", 1, "Stop decoding after n frames");
+static const arg_def_t skiparg = ARG_DEF(
+    NULL, "skip", 1, "Skip the first n input frames");
+static const arg_def_t postprocarg = ARG_DEF(
+    NULL, "postproc", 0, "Postprocess decoded frames");
+static const arg_def_t summaryarg = ARG_DEF(
+    NULL, "summary", 0, "Show timing summary");
+static const arg_def_t outputfile = ARG_DEF(
+    "o", "output", 1, "Output file name pattern (see below)");
+static const arg_def_t threadsarg = ARG_DEF(
+    "t", "threads", 1, "Max threads to use");
+static const arg_def_t frameparallelarg = ARG_DEF(
+    NULL, "frame-parallel", 0, "Frame parallel decode");
+static const arg_def_t verbosearg = ARG_DEF(
+    "v", "verbose", 0, "Show version string");
+static const arg_def_t error_concealment = ARG_DEF(
+    NULL, "error-concealment", 0, "Enable decoder error-concealment");
+static const arg_def_t scalearg = ARG_DEF(
+    "S", "scale", 0, "Scale output frames uniformly");
+static const arg_def_t continuearg = ARG_DEF(
+    "k", "keep-going", 0, "(debug) Continue decoding after error");
+static const arg_def_t fb_arg = ARG_DEF(
+    NULL, "frame-buffers", 1, "Number of frame buffers to use");
+static const arg_def_t md5arg = ARG_DEF(
+    NULL, "md5", 0, "Compute the MD5 sum of the decoded frame");
+#if CONFIG_VP9_HIGHBITDEPTH
+static const arg_def_t outbitdeptharg = ARG_DEF(
+    NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames");
+#endif
+
+static const arg_def_t *all_args[] = {
+  &codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, &noblitarg,
+  &progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile,
+  &threadsarg, &frameparallelarg, &verbosearg, &scalearg, &fb_arg,
+  &md5arg, &error_concealment, &continuearg,
+#if CONFIG_VP9_HIGHBITDEPTH
+  &outbitdeptharg,
+#endif
+  NULL
+};
+
+#if CONFIG_VP8_DECODER
+static const arg_def_t addnoise_level = ARG_DEF(
+    NULL, "noise-level", 1, "Enable VP8 postproc add noise");
+static const arg_def_t deblock = ARG_DEF(
+    NULL, "deblock", 0, "Enable VP8 deblocking");
+static const arg_def_t demacroblock_level = ARG_DEF(
+    NULL, "demacroblock-level", 1, "Enable VP8 demacroblocking, w/ level");
+static const arg_def_t pp_debug_info = ARG_DEF(
+    NULL, "pp-debug-info", 1, "Enable VP8 visible debug info");
+static const arg_def_t pp_disp_ref_frame = ARG_DEF(
+    NULL, "pp-dbg-ref-frame", 1,
+    "Display only selected reference frame per macro block");
+static const arg_def_t pp_disp_mb_modes = ARG_DEF(
+    NULL, "pp-dbg-mb-modes", 1, "Display only selected macro block modes");
+static const arg_def_t pp_disp_b_modes = ARG_DEF(
+    NULL, "pp-dbg-b-modes", 1, "Display only selected block modes");
+static const arg_def_t pp_disp_mvs = ARG_DEF(
+    NULL, "pp-dbg-mvs", 1, "Draw only selected motion vectors");
+static const arg_def_t mfqe = ARG_DEF(
+    NULL, "mfqe", 0, "Enable multiframe quality enhancement");
+
+static const arg_def_t *vp8_pp_args[] = {
+  &addnoise_level, &deblock, &demacroblock_level, &pp_debug_info,
+  &pp_disp_ref_frame, &pp_disp_mb_modes, &pp_disp_b_modes, &pp_disp_mvs, &mfqe,
+  NULL
+};
+#endif
+
+#if CONFIG_LIBYUV
+static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst,
+                                  FilterModeEnum mode) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (src->fmt == VPX_IMG_FMT_I42016) {
+    assert(dst->fmt == VPX_IMG_FMT_I42016);
+    return I420Scale_16((uint16_t*)src->planes[VPX_PLANE_Y],
+                        src->stride[VPX_PLANE_Y]/2,
+                        (uint16_t*)src->planes[VPX_PLANE_U],
+                        src->stride[VPX_PLANE_U]/2,
+                        (uint16_t*)src->planes[VPX_PLANE_V],
+                        src->stride[VPX_PLANE_V]/2,
+                        src->d_w, src->d_h,
+                        (uint16_t*)dst->planes[VPX_PLANE_Y],
+                        dst->stride[VPX_PLANE_Y]/2,
+                        (uint16_t*)dst->planes[VPX_PLANE_U],
+                        dst->stride[VPX_PLANE_U]/2,
+                        (uint16_t*)dst->planes[VPX_PLANE_V],
+                        dst->stride[VPX_PLANE_V]/2,
+                        dst->d_w, dst->d_h,
+                        mode);
+  }
+#endif
+  assert(src->fmt == VPX_IMG_FMT_I420);
+  assert(dst->fmt == VPX_IMG_FMT_I420);
+  return I420Scale(src->planes[VPX_PLANE_Y], src->stride[VPX_PLANE_Y],
+                   src->planes[VPX_PLANE_U], src->stride[VPX_PLANE_U],
+                   src->planes[VPX_PLANE_V], src->stride[VPX_PLANE_V],
+                   src->d_w, src->d_h,
+                   dst->planes[VPX_PLANE_Y], dst->stride[VPX_PLANE_Y],
+                   dst->planes[VPX_PLANE_U], dst->stride[VPX_PLANE_U],
+                   dst->planes[VPX_PLANE_V], dst->stride[VPX_PLANE_V],
+                   dst->d_w, dst->d_h,
+                   mode);
+}
+#endif
+
+void usage_exit(void) {
+  int i;
+
+  fprintf(stderr, "Usage: %s <options> filename\n\n"
+          "Options:\n", exec_name);
+  arg_show_usage(stderr, all_args);
+#if CONFIG_VP8_DECODER
+  fprintf(stderr, "\nVP8 Postprocessing Options:\n");
+  arg_show_usage(stderr, vp8_pp_args);
+#endif
+  fprintf(stderr,
+          "\nOutput File Patterns:\n\n"
+          "  The -o argument specifies the name of the file(s) to "
+          "write to. If the\n  argument does not include any escape "
+          "characters, the output will be\n  written to a single file. "
+          "Otherwise, the filename will be calculated by\n  expanding "
+          "the following escape characters:\n");
+  fprintf(stderr,
+          "\n\t%%w   - Frame width"
+          "\n\t%%h   - Frame height"
+          "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)"
+          "\n\n  Pattern arguments are only supported in conjunction "
+          "with the --yv12 and\n  --i420 options. If the -o option is "
+          "not specified, the output will be\n  directed to stdout.\n"
+         );
+  fprintf(stderr, "\nIncluded decoders:\n\n");
+
+  for (i = 0; i < get_vpx_decoder_count(); ++i) {
+    const VpxInterface *const decoder = get_vpx_decoder_by_index(i);
+    fprintf(stderr, "    %-6s - %s\n",
+            decoder->name, vpx_codec_iface_name(decoder->codec_interface()));
+  }
+
+  exit(EXIT_FAILURE);
+}
+
+static int raw_read_frame(FILE *infile, uint8_t **buffer,
+                          size_t *bytes_read, size_t *buffer_size) {
+  char raw_hdr[RAW_FRAME_HDR_SZ];
+  size_t frame_size = 0;
+
+  // Returns 0 and sets *bytes_read on success; returns 1 at end of file or on
+  // any read failure so the caller never treats a stale buffer as a new frame.
+  if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) {
+    if (!feof(infile))
+      warn("Failed to read RAW frame size\n");
+    return 1;
+  } else {
+    const size_t kCorruptFrameThreshold = 256 * 1024 * 1024;
+    const size_t kFrameTooSmallThreshold = 256 * 1024;
+    frame_size = mem_get_le32(raw_hdr);
+
+    if (frame_size > kCorruptFrameThreshold) {
+      warn("Read invalid frame size (%u)\n", (unsigned int)frame_size);
+      frame_size = 0;
+    }
+
+    if (frame_size < kFrameTooSmallThreshold) {
+      warn("Warning: Read invalid frame size (%u) - not a raw file?\n",
+           (unsigned int)frame_size);
+    }
+
+    if (frame_size > *buffer_size) {
+      uint8_t *new_buf = realloc(*buffer, 2 * frame_size);
+      if (new_buf) {
+        *buffer = new_buf;
+        *buffer_size = 2 * frame_size;
+      } else {
+        warn("Failed to allocate compressed data buffer\n");
+        frame_size = 0;
+      }
+    }
+  }
+
+  if (fread(*buffer, 1, frame_size, infile) != frame_size) {
+    warn("Failed to read full frame\n");
+    return 1;
+  }
+  *bytes_read = frame_size;
+  return 0;
+}
+
+/* Reads the next compressed frame with the reader matching the input's file
+ * type and returns that reader's result; any other file type yields 1. */
+static int read_frame(struct VpxDecInputContext *input, uint8_t **buf,
+                      size_t *bytes_in_buffer, size_t *buffer_size) {
+  const struct VpxInputContext *const in_ctx = input->vpx_input_ctx;
+
+#if CONFIG_WEBM_IO
+  if (in_ctx->file_type == FILE_TYPE_WEBM)
+    return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer, buffer_size);
+#endif
+  if (in_ctx->file_type == FILE_TYPE_RAW)
+    return raw_read_frame(in_ctx->file, buf, bytes_in_buffer, buffer_size);
+  if (in_ctx->file_type == FILE_TYPE_IVF)
+    return ivf_read_frame(in_ctx->file, buf, bytes_in_buffer, buffer_size);
+
+  /* Unknown type, or WebM in a build without CONFIG_WEBM_IO. */
+  return 1;
+}
+
+/* Feeds the three planes listed in |planes|, in that order, to |md5|. Only
+ * the first plane-width samples of every row are hashed. */
+static void update_image_md5(const vpx_image_t *img, const int planes[3],
+                             MD5Context *md5) {
+  const int sample_bytes = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+  int idx;
+
+  for (idx = 0; idx < 3; ++idx) {
+    const int plane = planes[idx];
+    const int row_bytes = vpx_img_plane_width(img, plane) * sample_bytes;
+    const unsigned char *row = img->planes[plane];
+    int rows_left = vpx_img_plane_height(img, plane);
+
+    /* Consecutive rows start |stride| bytes apart. */
+    for (; rows_left > 0; --rows_left, row += img->stride[plane])
+      MD5Update(md5, row, row_bytes);
+  }
+}
+
+/* Writes the three planes listed in |planes|, in that order, to |file| one
+ * row at a time; each row contributes plane-width samples. */
+static void write_image_file(const vpx_image_t *img, const int planes[3],
+                             FILE *file) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+#else
+  const int bytes_per_sample = 1;
+#endif
+  int idx;
+
+  for (idx = 0; idx < 3; ++idx) {
+    const int plane = planes[idx];
+    const int width = vpx_img_plane_width(img, plane);
+    const int height = vpx_img_plane_height(img, plane);
+    const unsigned char *row = img->planes[plane];
+    int r;
+
+    for (r = 0; r < height; ++r, row += img->stride[plane])
+      fwrite(row, bytes_per_sample, width, file);
+  }
+}
+
+/* Probes |input->file| for a raw stream: a leading 32-bit size field below
+ * 256 MiB followed by data that one of the built-in decoders accepts as a
+ * stream header. On a match the decoder's fourcc and the frame size are stored
+ * in |input| with a 30/1 frame rate. Rewinds the file; returns 1 on a match. */
+static int file_is_raw(struct VpxInputContext *input) {
+  uint8_t probe[32];
+  vpx_codec_stream_info_t si;
+  int found = 0;
+  int idx;
+
+  si.sz = sizeof(si);
+  if (fread(probe, 1, sizeof(probe), input->file) == sizeof(probe) &&
+      mem_get_le32(probe) < 256 * 1024 * 1024) {
+    for (idx = 0; !found && idx < get_vpx_decoder_count(); ++idx) {
+      const VpxInterface *const decoder = get_vpx_decoder_by_index(idx);
+      if (vpx_codec_peek_stream_info(decoder->codec_interface(),
+                                     probe + 4, sizeof(probe) - 4, &si))
+        continue;  /* not a stream this decoder recognises */
+      found = 1;
+      input->fourcc = decoder->fourcc;
+      input->width = si.w;
+      input->height = si.h;
+      input->framerate.numerator = 30;
+      input->framerate.denominator = 1;
+    }
+  }
+
+  rewind(input->file);
+  return found;
+}
+
+/* Prints decode progress on stderr; the rate is 0 if no time was measured. */
+static void show_progress(int frame_in, int frame_out, uint64_t dx_time) {
+  const double fps = dx_time ? frame_out * 1000000.0 / (double)dx_time : 0.0;
+  fprintf(stderr, "%d decoded frames/%d showed frames in %" PRIu64
+          " us (%.2f fps)\r", frame_in, frame_out, dx_time, fps);
+}
+
+struct ExternalFrameBuffer {  /* one frame buffer lent to the decoder */
+  uint8_t* data;  /* heap block passed to libvpx as the frame buffer */
+  size_t size;    /* size of |data| in bytes */
+  int in_use;     /* set when handed out, cleared by the release callback */
+};
+
+struct ExternalFrameBufferList {  /* pool searched by get_vp9_frame_buffer() */
+  int num_external_frame_buffers;  /* number of entries in |ext_fb| */
+  struct ExternalFrameBuffer *ext_fb;  /* array of pool entries */
+};
+
+// Callback used by libvpx to request an external frame buffer. |cb_priv| is
+// the application private data passed into the set function, |min_size| the
+// minimum size in bytes needed to decode the next frame and |fb| the frame
+// buffer to fill in. Returns 0 on success and -1 if no buffer can be provided.
+static int get_vp9_frame_buffer(void *cb_priv, size_t min_size,
+                                vpx_codec_frame_buffer_t *fb) {
+  struct ExternalFrameBufferList *const ext_fb_list = cb_priv;
+  struct ExternalFrameBuffer *slot;
+  int i;
+
+  if (ext_fb_list == NULL)
+    return -1;
+
+  // Find a free frame buffer.
+  for (i = 0; i < ext_fb_list->num_external_frame_buffers; ++i) {
+    if (!ext_fb_list->ext_fb[i].in_use)
+      break;
+  }
+  if (i == ext_fb_list->num_external_frame_buffers)
+    return -1;  // every buffer is currently marked in use
+  slot = &ext_fb_list->ext_fb[i];
+
+  if (slot->size < min_size) {
+    free(slot->data);
+    slot->size = 0;  // keeps |size| consistent with |data| if calloc() fails
+    slot->data = (uint8_t *)calloc(min_size, sizeof(uint8_t));
+    if (!slot->data)
+      return -1;
+    slot->size = min_size;
+  }
+
+  fb->data = slot->data;
+  fb->size = slot->size;
+  // Set the frame buffer's private data to point at the external frame buffer.
+  fb->priv = slot;
+  slot->in_use = 1;
+  return 0;
+}
+
+// Callback used by libvpx once nothing references the frame buffer any more.
+// |cb_priv| is the user private data passed into the set function (unused)
+// and |fb| the frame buffer being handed back. Always returns 0.
+static int release_vp9_frame_buffer(void *cb_priv,
+                                    vpx_codec_frame_buffer_t *fb) {
+  struct ExternalFrameBuffer *released;
+  (void)cb_priv;
+  released = (struct ExternalFrameBuffer *)fb->priv;
+  released->in_use = 0;  // free for get_vp9_frame_buffer() to hand out again
+  return 0;
+}
+
+/* Expands |pattern| into |out|, a buffer of |q_len| bytes, as a C string.
+ *
+ * Escapes:  %w        frame width |d_w|
+ *           %h        frame height |d_h|
+ *           %1 .. %9  frame number |frame_in|, zero padded to that many places
+ * Everything else is copied verbatim. An unrecognized escape, or a result
+ * that does not fit in |out|, is reported through die().
+ * |q_len| includes the terminating NUL and one further byte is held in
+ * reserve, so at most |q_len| - 2 characters are produced.
+ */
+static void generate_filename(const char *pattern, char *out, size_t q_len,
+                              unsigned int d_w, unsigned int d_h,
+                              unsigned int frame_in) {
+  const char *p = pattern;
+  char *q = out;
+
+  do {
+    char *next_pat = strchr(p, '%');
+
+    if (p == next_pat) {
+      const char spec = p[1];
+      int written = 0;
+
+      /* parse the pattern */
+      q[q_len - 1] = '\0';
+      switch (spec) {
+        case 'w':
+          written = snprintf(q, q_len - 1, "%u", d_w);
+          break;
+        case 'h':
+          written = snprintf(q, q_len - 1, "%u", d_h);
+          break;
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9':
+          /* The digit is the zero-padded field width: %4 acts like "%04u". */
+          written = snprintf(q, q_len - 1, "%0*u", spec - '0', frame_in);
+          break;
+        default:
+          /* '%' followed by anything else, including the end of the string. */
+          /* Presumably die() does not return; |written| stays 0 if it does. */
+          die("Unrecognized pattern %%%c\n", spec);
+          break;
+      }
+
+      /* snprintf() returns the length the expansion needs (negative on
+       * error), so comparing it with the space offered detects truncation.
+       * Measuring strlen(q) afterwards cannot: a cut-off string always fits. */
+      if (written < 0 || (size_t)written >= q_len - 1)
+        die("Output filename too long.\n");
+      q += written;
+      p += 2;
+      q_len -= (size_t)written;
+    } else {
+      size_t copy_len;
+
+      /* copy the next segment */
+      if (!next_pat)
+        copy_len = strlen(p);
+      else
+        copy_len = next_pat - p;
+
+      if (copy_len >= q_len - 1)
+        die("Output filename too long.\n");
+
+      memcpy(q, p, copy_len);
+      q[copy_len] = '\0';
+      q += copy_len;
+      p += copy_len;
+      q_len -= copy_len;
+    }
+  } while (*p);
+}
+
+/* Returns 0 if |outfile_pattern| contains a frame-number escape (%1 .. %9),
+ * so that the expanded name changes from frame to frame, and 1 otherwise. */
+static int is_single_file(const char *outfile_pattern) {
+  const char *percent;
+
+  for (percent = strchr(outfile_pattern, '%'); percent != NULL;
+       percent = strchr(percent + 1, '%')) {
+    const char next = percent[1];
+    if (next >= '1' && next <= '9')
+      return 0;  // pattern contains sequence number, so it's not unique
+  }
+  return 1;
+}
+
+/* Prints |digest| as 32 hex digits, then two spaces and |filename|. */
+static void print_md5(unsigned char digest[16], const char *filename) {
+  const unsigned char *byte = digest;
+  while (byte < digest + 16)
+    printf("%02x", *byte++);
+  printf("  %s\n", filename);
+}
+
+/* Returns stdout, switched to binary mode, when |name| is "-"; otherwise opens
+ * |name| for binary writing and reports a failure through fatal(). */
+static FILE *open_outfile(const char *name) {
+  FILE *file = stdout;
+  if (strcmp("-", name) == 0)
+    set_binary_mode(stdout);
+  else if (!(file = fopen(name, "wb")))
+    fatal("Failed to open output file '%s'", name);
+  return file;
+}
+
+
+#if CONFIG_VP9_HIGHBITDEPTH
+/* Non-zero if |shifted| differs from |img| in d_w/d_h or from |required_fmt|. */
+static int img_shifted_realloc_required(const vpx_image_t *img,
+                                        const vpx_image_t *shifted,
+                                        vpx_img_fmt_t required_fmt) {
+  const int same_size = img->d_w == shifted->d_w && img->d_h == shifted->d_h;
+  return !(same_size && required_fmt == shifted->fmt);
+}
+#endif
+
+static int main_loop(int argc, const char **argv_) {  // one complete decode pass
+  vpx_codec_ctx_t       decoder;
+  char                  *fn = NULL;
+  int                    i;
+  uint8_t               *buf = NULL;
+  size_t                 bytes_in_buffer = 0, buffer_size = 0;
+  FILE                  *infile;
+  int                    frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0;
+  int                    do_md5 = 0, progress = 0, frame_parallel = 0;
+  int                    stop_after = 0, postproc = 0, summary = 0, quiet = 1;
+  int                    arg_skip = 0;
+  int                    ec_enabled = 0;
+  int                    keep_going = 0;
+  const VpxInterface *interface = NULL;
+  const VpxInterface *fourcc_interface = NULL;
+  uint64_t dx_time = 0;  // accumulated decode time in microseconds
+  struct arg               arg;
+  char                   **argv, **argi, **argj;
+
+  int                     single_file;
+  int                     use_y4m = 1;
+  int                     opt_yv12 = 0;
+  int                     opt_i420 = 0;
+  vpx_codec_dec_cfg_t     cfg = {0, 0, 0};
+#if CONFIG_VP9_HIGHBITDEPTH
+  unsigned int            output_bit_depth = 0;
+#endif
+#if CONFIG_VP8_DECODER
+  vp8_postproc_cfg_t      vp8_pp_cfg = {0};
+  int                     vp8_dbg_color_ref_frame = 0;
+  int                     vp8_dbg_color_mb_modes = 0;
+  int                     vp8_dbg_color_b_modes = 0;
+  int                     vp8_dbg_display_mv = 0;
+#endif
+  int                     frames_corrupted = 0;
+  int                     dec_flags = 0;
+  int                     do_scale = 0;
+  vpx_image_t             *scaled_img = NULL;
+#if CONFIG_VP9_HIGHBITDEPTH
+  vpx_image_t             *img_shifted = NULL;
+#endif
+  int                     frame_avail, got_data, flush_decoder = 0;
+  int                     num_external_frame_buffers = 0;
+  struct ExternalFrameBufferList ext_fb_list = {0, NULL};
+
+  const char *outfile_pattern = NULL;
+  char outfile_name[PATH_MAX] = {0};
+  FILE *outfile = NULL;
+
+  MD5Context md5_ctx;
+  unsigned char md5_digest[16];
+
+  struct VpxDecInputContext input = {NULL, NULL};
+  struct VpxInputContext vpx_input_ctx;
+#if CONFIG_WEBM_IO
+  struct WebmInputContext webm_ctx;
+  memset(&(webm_ctx), 0, sizeof(webm_ctx));
+  input.webm_ctx = &webm_ctx;
+#endif
+  input.vpx_input_ctx = &vpx_input_ctx;
+
+  /* Parse command line; arguments that match no option are kept in argv */
+  exec_name = argv_[0];
+  argv = argv_dup(argc - 1, argv_ + 1);
+
+  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+    memset(&arg, 0, sizeof(arg));
+    arg.argv_step = 1;
+
+    if (arg_match(&arg, &codecarg, argi)) {
+      interface = get_vpx_decoder_by_name(arg.val);
+      if (!interface)
+        die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
+    } else if (arg_match(&arg, &looparg, argi)) {
+      // no-op here; main() reads this option to set the loop count
+    } else if (arg_match(&arg, &outputfile, argi))
+      outfile_pattern = arg.val;
+    else if (arg_match(&arg, &use_yv12, argi)) {
+      use_y4m = 0;
+      flipuv = 1;
+      opt_yv12 = 1;
+    } else if (arg_match(&arg, &use_i420, argi)) {
+      use_y4m = 0;
+      flipuv = 0;
+      opt_i420 = 1;
+    } else if (arg_match(&arg, &rawvideo, argi)) {
+      use_y4m = 0;
+    } else if (arg_match(&arg, &flipuvarg, argi))
+      flipuv = 1;
+    else if (arg_match(&arg, &noblitarg, argi))
+      noblit = 1;
+    else if (arg_match(&arg, &progressarg, argi))
+      progress = 1;
+    else if (arg_match(&arg, &limitarg, argi))
+      stop_after = arg_parse_uint(&arg);
+    else if (arg_match(&arg, &skiparg, argi))
+      arg_skip = arg_parse_uint(&arg);
+    else if (arg_match(&arg, &postprocarg, argi))
+      postproc = 1;
+    else if (arg_match(&arg, &md5arg, argi))
+      do_md5 = 1;
+    else if (arg_match(&arg, &summaryarg, argi))
+      summary = 1;
+    else if (arg_match(&arg, &threadsarg, argi))
+      cfg.threads = arg_parse_uint(&arg);
+#if CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
+    else if (arg_match(&arg, &frameparallelarg, argi))
+      frame_parallel = 1;
+#endif
+    else if (arg_match(&arg, &verbosearg, argi))
+      quiet = 0;
+    else if (arg_match(&arg, &scalearg, argi))
+      do_scale = 1;
+    else if (arg_match(&arg, &fb_arg, argi))
+      num_external_frame_buffers = arg_parse_uint(&arg);
+    else if (arg_match(&arg, &continuearg, argi))
+      keep_going = 1;
+#if CONFIG_VP9_HIGHBITDEPTH
+    else if (arg_match(&arg, &outbitdeptharg, argi)) {
+      output_bit_depth = arg_parse_uint(&arg);
+    }
+#endif
+#if CONFIG_VP8_DECODER
+    else if (arg_match(&arg, &addnoise_level, argi)) {
+      postproc = 1;
+      vp8_pp_cfg.post_proc_flag |= VP8_ADDNOISE;
+      vp8_pp_cfg.noise_level = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &demacroblock_level, argi)) {
+      postproc = 1;
+      vp8_pp_cfg.post_proc_flag |= VP8_DEMACROBLOCK;
+      vp8_pp_cfg.deblocking_level = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &deblock, argi)) {
+      postproc = 1;
+      vp8_pp_cfg.post_proc_flag |= VP8_DEBLOCK;
+    } else if (arg_match(&arg, &mfqe, argi)) {
+      postproc = 1;
+      vp8_pp_cfg.post_proc_flag |= VP8_MFQE;
+    } else if (arg_match(&arg, &pp_debug_info, argi)) {
+      unsigned int level = arg_parse_uint(&arg);
+
+      postproc = 1;
+      vp8_pp_cfg.post_proc_flag &= ~0x7;
+
+      if (level)
+        vp8_pp_cfg.post_proc_flag |= level;
+    } else if (arg_match(&arg, &pp_disp_ref_frame, argi)) {
+      unsigned int flags = arg_parse_int(&arg);
+      if (flags) {
+        postproc = 1;
+        vp8_dbg_color_ref_frame = flags;
+      }
+    } else if (arg_match(&arg, &pp_disp_mb_modes, argi)) {
+      unsigned int flags = arg_parse_int(&arg);
+      if (flags) {
+        postproc = 1;
+        vp8_dbg_color_mb_modes = flags;
+      }
+    } else if (arg_match(&arg, &pp_disp_b_modes, argi)) {
+      unsigned int flags = arg_parse_int(&arg);
+      if (flags) {
+        postproc = 1;
+        vp8_dbg_color_b_modes = flags;
+      }
+    } else if (arg_match(&arg, &pp_disp_mvs, argi)) {
+      unsigned int flags = arg_parse_int(&arg);
+      if (flags) {
+        postproc = 1;
+        vp8_dbg_display_mv = flags;
+      }
+    } else if (arg_match(&arg, &error_concealment, argi)) {
+      ec_enabled = 1;
+    }
+#endif  // CONFIG_VP8_DECODER
+    else
+      argj++;
+  }
+
+  /* Check for unrecognized options */
+  for (argi = argv; *argi; argi++)
+    if (argi[0][0] == '-' && strlen(argi[0]) > 1)
+      die("Error: Unrecognized option %s\n", *argi);
+
+  /* Handle non-option arguments */
+  fn = argv[0];  // first argument left after option parsing is the input
+
+  if (!fn) {
+    free(argv);
+    usage_exit();
+  }
+  /* Open file */
+  infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin);
+
+  if (!infile) {
+    fatal("Failed to open input file '%s'", strcmp(fn, "-") ? fn : "stdin");
+  }
+#if CONFIG_OS_SUPPORT
+  /* Make sure we don't dump to the terminal, unless forced to with -o - */
+  if (!outfile_pattern && isatty(fileno(stdout)) && !do_md5 && !noblit) {
+    fprintf(stderr,
+            "Not dumping raw video to your terminal. Use '-o -' to "
+            "override.\n");
+    return EXIT_FAILURE;  // NOTE(review): skips the cleanup after fail:
+  }
+#endif
+  input.vpx_input_ctx->file = infile;
+  if (file_is_ivf(input.vpx_input_ctx))
+    input.vpx_input_ctx->file_type = FILE_TYPE_IVF;
+#if CONFIG_WEBM_IO
+  else if (file_is_webm(input.webm_ctx, input.vpx_input_ctx))
+    input.vpx_input_ctx->file_type = FILE_TYPE_WEBM;
+#endif
+  else if (file_is_raw(input.vpx_input_ctx))
+    input.vpx_input_ctx->file_type = FILE_TYPE_RAW;
+  else {
+    fprintf(stderr, "Unrecognized input file type.\n");
+#if !CONFIG_WEBM_IO
+    fprintf(stderr, "vpxdec was built without WebM container support.\n");
+#endif
+    return EXIT_FAILURE;
+  }
+
+  outfile_pattern = outfile_pattern ? outfile_pattern : "-";  // "-" is stdout
+  single_file = is_single_file(outfile_pattern);
+
+  if (!noblit && single_file) {
+    generate_filename(outfile_pattern, outfile_name, PATH_MAX,
+                      vpx_input_ctx.width, vpx_input_ctx.height, 0);
+    if (do_md5)
+      MD5Init(&md5_ctx);
+    else
+      outfile = open_outfile(outfile_name);
+  }
+
+  if (use_y4m && !noblit) {
+    if (!single_file) {
+      fprintf(stderr, "YUV4MPEG2 not supported with output patterns,"
+              " try --i420 or --yv12 or --rawvideo.\n");
+      return EXIT_FAILURE;
+    }
+
+#if CONFIG_WEBM_IO
+    if (vpx_input_ctx.file_type == FILE_TYPE_WEBM) {
+      if (webm_guess_framerate(input.webm_ctx, input.vpx_input_ctx)) {
+        fprintf(stderr, "Failed to guess framerate -- error parsing "
+                "webm file?\n");
+        return EXIT_FAILURE;
+      }
+    }
+#endif
+  }
+  // NOTE(review): an unknown fourcc also discards a codec picked by codecarg.
+  fourcc_interface = get_vpx_decoder_by_fourcc(vpx_input_ctx.fourcc);
+  if (interface && fourcc_interface && interface != fourcc_interface)
+    warn("Header indicates codec: %s\n", fourcc_interface->name);
+  else
+    interface = fourcc_interface;
+
+  if (!interface)
+    interface = get_vpx_decoder_by_index(0);
+
+  dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) |
+              (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0) |
+              (frame_parallel ? VPX_CODEC_USE_FRAME_THREADING : 0);
+  if (vpx_codec_dec_init(&decoder, interface->codec_interface(),
+                         &cfg, dec_flags)) {
+    fprintf(stderr, "Failed to initialize decoder: %s\n",
+            vpx_codec_error(&decoder));
+    return EXIT_FAILURE;
+  }
+
+  if (!quiet)
+    fprintf(stderr, "%s\n", decoder.name);
+
+#if CONFIG_VP8_DECODER
+  if (vp8_pp_cfg.post_proc_flag
+      && vpx_codec_control(&decoder, VP8_SET_POSTPROC, &vp8_pp_cfg)) {
+    fprintf(stderr, "Failed to configure postproc: %s\n",
+            vpx_codec_error(&decoder));
+    return EXIT_FAILURE;
+  }
+
+  if (vp8_dbg_color_ref_frame
+      && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_REF_FRAME,
+                           vp8_dbg_color_ref_frame)) {
+    fprintf(stderr, "Failed to configure reference block visualizer: %s\n",
+            vpx_codec_error(&decoder));
+    return EXIT_FAILURE;
+  }
+
+  if (vp8_dbg_color_mb_modes
+      && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_MB_MODES,
+                           vp8_dbg_color_mb_modes)) {
+    fprintf(stderr, "Failed to configure macro block visualizer: %s\n",
+            vpx_codec_error(&decoder));
+    return EXIT_FAILURE;
+  }
+
+  if (vp8_dbg_color_b_modes
+      && vpx_codec_control(&decoder, VP8_SET_DBG_COLOR_B_MODES,
+                           vp8_dbg_color_b_modes)) {
+    fprintf(stderr, "Failed to configure block visualizer: %s\n",
+            vpx_codec_error(&decoder));
+    return EXIT_FAILURE;
+  }
+
+  if (vp8_dbg_display_mv
+      && vpx_codec_control(&decoder, VP8_SET_DBG_DISPLAY_MV,
+                           vp8_dbg_display_mv)) {
+    fprintf(stderr, "Failed to configure motion vector visualizer: %s\n",
+            vpx_codec_error(&decoder));
+    return EXIT_FAILURE;
+  }
+#endif
+
+
+  if (arg_skip)
+    fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
+  while (arg_skip) {
+    if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size))
+      break;
+    arg_skip--;
+  }
+
+  if (num_external_frame_buffers > 0) {
+    ext_fb_list.num_external_frame_buffers = num_external_frame_buffers;
+    ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc(
+        num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb));  // NOTE(review): result not NULL-checked
+    if (vpx_codec_set_frame_buffer_functions(
+            &decoder, get_vp9_frame_buffer, release_vp9_frame_buffer,
+            &ext_fb_list)) {
+      fprintf(stderr, "Failed to configure external frame buffers: %s\n",
+              vpx_codec_error(&decoder));
+      return EXIT_FAILURE;
+    }
+  }
+  // frame_avail: a frame was read this pass; got_data: the decoder gave an image.
+  frame_avail = 1;
+  got_data = 0;
+
+  /* Decode file */
+  while (frame_avail || got_data) {  // until input and decoder are both drained
+    vpx_codec_iter_t  iter = NULL;
+    vpx_image_t    *img;
+    struct vpx_usec_timer timer;
+    int                   corrupted = 0;
+
+    frame_avail = 0;
+    if (!stop_after || frame_in < stop_after) {
+      if (!read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) {
+        frame_avail = 1;
+        frame_in++;
+
+        vpx_usec_timer_start(&timer);
+
+        if (vpx_codec_decode(&decoder, buf, (unsigned int)bytes_in_buffer,
+                             NULL, 0)) {
+          const char *detail = vpx_codec_error_detail(&decoder);
+          warn("Failed to decode frame %d: %s",
+               frame_in, vpx_codec_error(&decoder));
+
+          if (detail)
+            warn("Additional information: %s", detail);
+          if (!keep_going)
+            goto fail;
+        }
+
+        vpx_usec_timer_mark(&timer);
+        dx_time += vpx_usec_timer_elapsed(&timer);
+      } else {
+        flush_decoder = 1;
+      }
+    } else {
+      flush_decoder = 1;
+    }
+
+    vpx_usec_timer_start(&timer);
+
+    if (flush_decoder) {
+      // Flush the decoder in frame parallel decode.
+      if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0)) {
+        warn("Failed to flush decoder: %s", vpx_codec_error(&decoder));
+      }
+    }
+
+    got_data = 0;
+    if ((img = vpx_codec_get_frame(&decoder, &iter))) {
+      ++frame_out;
+      got_data = 1;
+    }
+
+    vpx_usec_timer_mark(&timer);
+    dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);  // NOTE(review): cast only here, not above
+
+    if (!frame_parallel &&
+        vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
+      warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder));
+      if (!keep_going)
+        goto fail;
+    }
+    frames_corrupted += corrupted;
+
+    if (progress)
+      show_progress(frame_in, frame_out, dx_time);
+
+    if (!noblit && img) {
+      const int PLANES_YUV[] = {VPX_PLANE_Y, VPX_PLANE_U, VPX_PLANE_V};
+      const int PLANES_YVU[] = {VPX_PLANE_Y, VPX_PLANE_V, VPX_PLANE_U};
+      const int *planes = flipuv ? PLANES_YVU : PLANES_YUV;
+
+      if (do_scale) {
+        if (frame_out == 1) {
+          // If the output frames are to be scaled to a fixed display size then
+          // use the width and height specified in the container. If either of
+          // these is set to 0, use the display size set in the first frame
+          // header. If that is unavailable, use the raw decoded size of the
+          // first decoded frame.
+          int render_width = vpx_input_ctx.width;
+          int render_height = vpx_input_ctx.height;
+          if (!render_width || !render_height) {
+            int render_size[2];
+            if (vpx_codec_control(&decoder, VP9D_GET_DISPLAY_SIZE,
+                                  render_size)) {
+              // As last resort use size of first frame as display size.
+              render_width = img->d_w;
+              render_height = img->d_h;
+            } else {
+              render_width = render_size[0];
+              render_height = render_size[1];
+            }
+          }
+          scaled_img = vpx_img_alloc(NULL, img->fmt, render_width,
+                                     render_height, 16);  // NOTE(review): result not NULL-checked
+          scaled_img->bit_depth = img->bit_depth;
+        }
+
+        if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
+#if CONFIG_LIBYUV
+          libyuv_scale(img, scaled_img, kFilterBox);
+          img = scaled_img;
+#else
+          fprintf(stderr, "Failed  to scale output frame: %s.\n"
+                  "Scaling is disabled in this configuration. "
+                  "To enable scaling, configure with --enable-libyuv\n",
+                  vpx_codec_error(&decoder));
+          return EXIT_FAILURE;
+#endif
+        }
+      }
+#if CONFIG_VP9_HIGHBITDEPTH
+      // Default to codec bit depth if output bit depth not set
+      if (!output_bit_depth && single_file && !do_md5) {
+        output_bit_depth = img->bit_depth;
+      }
+      // Shift up or down if necessary
+      if (output_bit_depth != 0 && output_bit_depth != img->bit_depth) {
+        const vpx_img_fmt_t shifted_fmt = output_bit_depth == 8 ?
+            img->fmt ^ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) :
+            img->fmt | VPX_IMG_FMT_HIGHBITDEPTH;
+        if (img_shifted &&
+            img_shifted_realloc_required(img, img_shifted, shifted_fmt)) {
+          vpx_img_free(img_shifted);
+          img_shifted = NULL;
+        }
+        if (!img_shifted) {
+          img_shifted = vpx_img_alloc(NULL, shifted_fmt,
+                                      img->d_w, img->d_h, 16);
+          img_shifted->bit_depth = output_bit_depth;
+        }
+        if (output_bit_depth > img->bit_depth) {
+          vpx_img_upshift(img_shifted, img,
+                          output_bit_depth - img->bit_depth);
+        } else {
+          vpx_img_downshift(img_shifted, img,
+                            img->bit_depth - output_bit_depth);
+        }
+        img = img_shifted;
+      }
+#endif
+
+      if (single_file) {
+        if (use_y4m) {
+          char buf[Y4M_BUFFER_SIZE] = {0};  // shadows the outer |buf|
+          size_t len = 0;
+          if (img->fmt == VPX_IMG_FMT_I440 || img->fmt == VPX_IMG_FMT_I44016) {
+            fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n");
+            goto fail;
+          }
+          if (frame_out == 1) {
+            // Y4M file header
+            len = y4m_write_file_header(buf, sizeof(buf),
+                                        vpx_input_ctx.width,
+                                        vpx_input_ctx.height,
+                                        &vpx_input_ctx.framerate,
+                                        img->fmt, img->bit_depth);
+            if (do_md5) {
+              MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
+            } else {
+              fputs(buf, outfile);
+            }
+          }
+
+          // Y4M frame header
+          len = y4m_write_frame_header(buf, sizeof(buf));
+          if (do_md5) {
+            MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
+          } else {
+            fputs(buf, outfile);
+          }
+        } else {
+          if (frame_out == 1) {
+            // Check if --yv12 or --i420 options are consistent with the
+            // bit-stream decoded
+            if (opt_i420) {
+              if (img->fmt != VPX_IMG_FMT_I420 &&
+                  img->fmt != VPX_IMG_FMT_I42016) {
+                fprintf(stderr, "Cannot produce i420 output for bit-stream.\n");
+                goto fail;
+              }
+            }
+            if (opt_yv12) {
+              if ((img->fmt != VPX_IMG_FMT_I420 &&
+                   img->fmt != VPX_IMG_FMT_YV12) || img->bit_depth != 8) {
+                fprintf(stderr, "Cannot produce yv12 output for bit-stream.\n");
+                goto fail;
+              }
+            }
+          }
+        }
+
+        if (do_md5) {
+          update_image_md5(img, planes, &md5_ctx);
+        } else {
+          write_image_file(img, planes, outfile);
+        }
+      } else {
+        generate_filename(outfile_pattern, outfile_name, PATH_MAX,
+                          img->d_w, img->d_h, frame_in);
+        if (do_md5) {
+          MD5Init(&md5_ctx);
+          update_image_md5(img, planes, &md5_ctx);
+          MD5Final(md5_digest, &md5_ctx);
+          print_md5(md5_digest, outfile_name);
+        } else {
+          outfile = open_outfile(outfile_name);
+          write_image_file(img, planes, outfile);
+          fclose(outfile);
+        }
+      }
+    }
+  }
+
+  if (summary || progress) {
+    show_progress(frame_in, frame_out, dx_time);
+    fprintf(stderr, "\n");
+  }
+
+  if (frames_corrupted)
+    fprintf(stderr, "WARNING: %d frames corrupted.\n", frames_corrupted);
+
+fail:
+  // Cleanup; reached on normal completion as well as through "goto fail".
+  if (vpx_codec_destroy(&decoder)) {
+    fprintf(stderr, "Failed to destroy decoder: %s\n",
+            vpx_codec_error(&decoder));
+    return EXIT_FAILURE;
+  }
+
+  if (!noblit && single_file) {
+    if (do_md5) {
+      MD5Final(md5_digest, &md5_ctx);
+      print_md5(md5_digest, outfile_name);
+    } else {
+      fclose(outfile);
+    }
+  }
+
+#if CONFIG_WEBM_IO
+  if (input.vpx_input_ctx->file_type == FILE_TYPE_WEBM)
+    webm_free(input.webm_ctx);
+#endif
+  // NOTE(review): presumably webm_free() releases |buf| for WebM - confirm.
+  if (input.vpx_input_ctx->file_type != FILE_TYPE_WEBM)
+    free(buf);
+
+  if (scaled_img) vpx_img_free(scaled_img);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (img_shifted) vpx_img_free(img_shifted);
+#endif
+
+  for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) {
+    free(ext_fb_list.ext_fb[i].data);
+  }
+  free(ext_fb_list.ext_fb);
+
+  fclose(infile);
+  free(argv);
+
+  return frames_corrupted ? EXIT_FAILURE : EXIT_SUCCESS;  // any corrupted frame fails the run
+}
+
+/* Runs main_loop() once, or as often as looparg requests, until a run fails. */
+int main(int argc, const char **argv_) {
+  struct arg arg;
+  char **argv, **argi, **argj;
+  unsigned int loops = 1, pass;
+  int error = 0;
+
+  argv = argv_dup(argc - 1, argv_ + 1);
+  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+    memset(&arg, 0, sizeof(arg));
+    arg.argv_step = 1;
+    if (!arg_match(&arg, &looparg, argi))
+      continue;  /* some other argument; main_loop() deals with it */
+    loops = arg_parse_uint(&arg);
+    break;
+  }
+  free(argv);  /* only a scratch copy; main_loop() gets the original argv_ */
+  for (pass = 0; !error && pass < loops; ++pass)
+    error = main_loop(argc, argv_);
+  return error;
+}
diff --git a/libs/libvpx/vpxenc.c b/libs/libvpx/vpxenc.c
new file mode 100644
index 0000000000..f14470a6f2
--- /dev/null
+++ b/libs/libvpx/vpxenc.c
@@ -0,0 +1,2340 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpxenc.h"
+#include "./vpx_config.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
+#include "vpx/vpx_encoder.h"
+#if CONFIG_DECODERS
+#include "vpx/vpx_decoder.h"
+#endif
+
+#include "./args.h"
+#include "./ivfenc.h"
+#include "./tools_common.h"
+
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+#include "vpx/vp8cx.h"
+#endif
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
+#include "vpx/vp8dx.h"
+#endif
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem_ops.h"
+#include "vpx_ports/vpx_timer.h"
+#include "./rate_hist.h"
+#include "./vpxstats.h"
+#include "./warnings.h"
+#if CONFIG_WEBM_IO
+#include "./webmenc.h"
+#endif
+#include "./y4minput.h"
+
+/* Swallow warnings about unused results of fread/fwrite */
+static size_t wrap_fread(void *ptr, size_t size, size_t nmemb,
+                         FILE *stream) {
+  return fread(ptr, size, nmemb, stream);
+}
+#define fread wrap_fread
+
+static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,
+                          FILE *stream) {
+  return fwrite(ptr, size, nmemb, stream);
+}
+#define fwrite wrap_fwrite
+
+
+static const char *exec_name;
+
+static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal,
+                                   const char *s, va_list ap) {
+  if (ctx->err) {  /* nothing is printed or done unless ctx holds an error */
+    const char *detail = vpx_codec_error_detail(ctx);
+    /* Caller's printf-style message first, then the codec's error string. */
+    vfprintf(stderr, s, ap);
+    fprintf(stderr, ": %s\n", vpx_codec_error(ctx));
+    /* The detail string is optional; it is printed only when non-NULL. */
+    if (detail)
+      fprintf(stderr, "    %s\n", detail);
+    /* A non-zero 'fatal' ends the process; otherwise this is only a warning. */
+    if (fatal)
+      exit(EXIT_FAILURE);
+  }
+}
+
+static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s, ...) {
+  va_list ap;
+
+  va_start(ap, s);
+  warn_or_exit_on_errorv(ctx, 1, s, ap);
+  va_end(ap);
+}
+
+static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal,
+                                  const char *s, ...) {
+  va_list ap;
+
+  va_start(ap, s);
+  warn_or_exit_on_errorv(ctx, fatal, s, ap);
+  va_end(ap);
+}
+
+static int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) {
+  /* Returns 1 when a frame was read into img and 0 when none was obtained. */
+  if (input_ctx->file_type == FILE_TYPE_Y4M) {
+    /* Y4M input: a fetch result below 1 means no frame was obtained. */
+    const int fetched =
+        y4m_input_fetch_frame(&input_ctx->y4m, input_ctx->file, img);
+
+    return fetched >= 1;
+  }
+
+  /* Every other input type goes through read_yuv_frame(); a non-zero */
+  /* result from it is reported to the caller as failure. */
+  return !read_yuv_frame(input_ctx, img);
+}
+
+static int file_is_y4m(const char detect[4]) {
+  /* 1 when the four probe bytes are exactly "YUV4", 0 otherwise. */
+  static const char y4m_magic[4] = { 'Y', 'U', 'V', '4' };
+
+  return memcmp(detect, y4m_magic, sizeof(y4m_magic)) == 0;
+}
+
+static int fourcc_is_ivf(const char detect[4]) {
+  /* 1 when the four probe bytes are exactly "DKIF", 0 otherwise. */
+  static const char ivf_magic[4] = { 'D', 'K', 'I', 'F' };
+
+  return memcmp(detect, ivf_magic, sizeof(ivf_magic)) == 0;
+}
+
+static const arg_def_t debugmode = ARG_DEF(
+    "D", "debug", 0, "Debug mode (makes output deterministic)");
+static const arg_def_t outputfile = ARG_DEF(
+    "o", "output", 1, "Output filename");
+static const arg_def_t use_yv12 = ARG_DEF(
+    NULL, "yv12", 0, "Input file is YV12 ");
+static const arg_def_t use_i420 = ARG_DEF(
+    NULL, "i420", 0, "Input file is I420 (default)");
+static const arg_def_t use_i422 = ARG_DEF(
+    NULL, "i422", 0, "Input file is I422");
+static const arg_def_t use_i444 = ARG_DEF(
+    NULL, "i444", 0, "Input file is I444");
+static const arg_def_t use_i440 = ARG_DEF(
+    NULL, "i440", 0, "Input file is I440");
+static const arg_def_t codecarg = ARG_DEF(
+    NULL, "codec", 1, "Codec to use");
+static const arg_def_t passes = ARG_DEF(
+    "p", "passes", 1, "Number of passes (1/2)");
+static const arg_def_t pass_arg = ARG_DEF(
+    NULL, "pass", 1, "Pass to execute (1/2)");
+static const arg_def_t fpf_name = ARG_DEF(
+    NULL, "fpf", 1, "First pass statistics file name");
+#if CONFIG_FP_MB_STATS
+static const arg_def_t fpmbf_name = ARG_DEF(
+    NULL, "fpmbf", 1, "First pass block statistics file name");
+#endif
+static const arg_def_t limit = ARG_DEF(
+    NULL, "limit", 1, "Stop encoding after n input frames");
+static const arg_def_t skip = ARG_DEF(
+    NULL, "skip", 1, "Skip the first n input frames");
+static const arg_def_t deadline = ARG_DEF(
+    "d", "deadline", 1, "Deadline per frame (usec)");
+static const arg_def_t best_dl = ARG_DEF(
+    NULL, "best", 0, "Use Best Quality Deadline");
+static const arg_def_t good_dl = ARG_DEF(
+    NULL, "good", 0, "Use Good Quality Deadline");
+static const arg_def_t rt_dl = ARG_DEF(
+    NULL, "rt", 0, "Use Realtime Quality Deadline");
+static const arg_def_t quietarg = ARG_DEF(
+    "q", "quiet", 0, "Do not print encode progress");
+static const arg_def_t verbosearg = ARG_DEF(
+    "v", "verbose", 0, "Show encoder parameters");
+static const arg_def_t psnrarg = ARG_DEF(
+    NULL, "psnr", 0, "Show PSNR in status line");
+
+static const struct arg_enum_list test_decode_enum[] = {
+  {"off",   TEST_DECODE_OFF},
+  {"fatal", TEST_DECODE_FATAL},
+  {"warn",  TEST_DECODE_WARN},
+  {NULL, 0}
+};
+static const arg_def_t recontest = ARG_DEF_ENUM(
+    NULL, "test-decode", 1, "Test encode/decode mismatch", test_decode_enum);
+static const arg_def_t framerate = ARG_DEF(
+    NULL, "fps", 1, "Stream frame rate (rate/scale)");
+static const arg_def_t use_webm = ARG_DEF(
+    NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)");
+static const arg_def_t use_ivf = ARG_DEF(
+    NULL, "ivf", 0, "Output IVF");
+static const arg_def_t out_part = ARG_DEF(
+    "P", "output-partitions", 0,
+    "Makes encoder output partitions. Requires IVF output!");
+static const arg_def_t q_hist_n = ARG_DEF(
+    NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)");
+static const arg_def_t rate_hist_n = ARG_DEF(
+    NULL, "rate-hist", 1, "Show rate histogram (n-buckets)");
+static const arg_def_t disable_warnings = ARG_DEF(
+    NULL, "disable-warnings", 0,
+    "Disable warnings about potentially incorrect encode settings.");
+static const arg_def_t disable_warning_prompt = ARG_DEF(
+    "y", "disable-warning-prompt", 0,
+    "Display warnings, but do not prompt user to continue.");
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static const arg_def_t test16bitinternalarg = ARG_DEF(
+    NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer");
+#endif
+
+static const arg_def_t *main_args[] = {
+  &debugmode,
+  &outputfile, &codecarg, &passes, &pass_arg, &fpf_name, &limit, &skip,
+  &deadline, &best_dl, &good_dl, &rt_dl,
+  &quietarg, &verbosearg, &psnrarg, &use_webm, &use_ivf, &out_part, &q_hist_n,
+  &rate_hist_n, &disable_warnings, &disable_warning_prompt, &recontest,
+  NULL
+};
+
+static const arg_def_t usage = ARG_DEF(
+    "u", "usage", 1, "Usage profile number to use");
+static const arg_def_t threads = ARG_DEF(
+    "t", "threads", 1, "Max number of threads to use");
+static const arg_def_t profile = ARG_DEF(
+    NULL, "profile", 1, "Bitstream profile number to use");
+static const arg_def_t width = ARG_DEF("w", "width", 1, "Frame width");
+static const arg_def_t height = ARG_DEF("h", "height", 1, "Frame height");
+#if CONFIG_WEBM_IO
+static const struct arg_enum_list stereo_mode_enum[] = {
+  {"mono", STEREO_FORMAT_MONO},
+  {"left-right", STEREO_FORMAT_LEFT_RIGHT},
+  {"bottom-top", STEREO_FORMAT_BOTTOM_TOP},
+  {"top-bottom", STEREO_FORMAT_TOP_BOTTOM},
+  {"right-left", STEREO_FORMAT_RIGHT_LEFT},
+  {NULL, 0}
+};
+static const arg_def_t stereo_mode = ARG_DEF_ENUM(
+    NULL, "stereo-mode", 1, "Stereo 3D video format", stereo_mode_enum);
+#endif
+static const arg_def_t timebase = ARG_DEF(
+    NULL, "timebase", 1, "Output timestamp precision (fractional seconds)");
+static const arg_def_t error_resilient = ARG_DEF(
+    NULL, "error-resilient", 1, "Enable error resiliency features");
+static const arg_def_t lag_in_frames = ARG_DEF(
+    NULL, "lag-in-frames", 1, "Max number of frames to lag");
+
+static const arg_def_t *global_args[] = {
+  &use_yv12, &use_i420, &use_i422, &use_i444, &use_i440,
+  &usage, &threads, &profile,
+  &width, &height,
+#if CONFIG_WEBM_IO
+  &stereo_mode,
+#endif
+  &timebase, &framerate,
+  &error_resilient,
+#if CONFIG_VP9_HIGHBITDEPTH
+  &test16bitinternalarg,
+#endif
+  &lag_in_frames, NULL
+};
+
+static const arg_def_t dropframe_thresh = ARG_DEF(
+    NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)");
+static const arg_def_t resize_allowed = ARG_DEF(
+    NULL, "resize-allowed", 1, "Spatial resampling enabled (bool)");
+static const arg_def_t resize_width = ARG_DEF(
+    NULL, "resize-width", 1, "Width of encoded frame");
+static const arg_def_t resize_height = ARG_DEF(
+    NULL, "resize-height", 1, "Height of encoded frame");
+static const arg_def_t resize_up_thresh = ARG_DEF(
+    NULL, "resize-up", 1, "Upscale threshold (buf %)");
+static const arg_def_t resize_down_thresh = ARG_DEF(
+    NULL, "resize-down", 1, "Downscale threshold (buf %)");
+static const struct arg_enum_list end_usage_enum[] = {
+  {"vbr", VPX_VBR},
+  {"cbr", VPX_CBR},
+  {"cq",  VPX_CQ},
+  {"q",   VPX_Q},
+  {NULL, 0}
+};
+static const arg_def_t end_usage = ARG_DEF_ENUM(
+    NULL, "end-usage", 1, "Rate control mode", end_usage_enum);
+static const arg_def_t target_bitrate = ARG_DEF(
+    NULL, "target-bitrate", 1, "Bitrate (kbps)");
+static const arg_def_t min_quantizer = ARG_DEF(
+    NULL, "min-q", 1, "Minimum (best) quantizer");
+static const arg_def_t max_quantizer = ARG_DEF(
+    NULL, "max-q", 1, "Maximum (worst) quantizer");
+static const arg_def_t undershoot_pct = ARG_DEF(
+    NULL, "undershoot-pct", 1, "Datarate undershoot (min) target (%)");
+static const arg_def_t overshoot_pct = ARG_DEF(
+    NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)");
+static const arg_def_t buf_sz = ARG_DEF(
+    NULL, "buf-sz", 1, "Client buffer size (ms)");
+static const arg_def_t buf_initial_sz = ARG_DEF(
+    NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)");
+static const arg_def_t buf_optimal_sz = ARG_DEF(
+    NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)");
+static const arg_def_t *rc_args[] = {
+  &dropframe_thresh, &resize_allowed, &resize_width, &resize_height,
+  &resize_up_thresh, &resize_down_thresh, &end_usage, &target_bitrate,
+  &min_quantizer, &max_quantizer, &undershoot_pct, &overshoot_pct, &buf_sz,
+  &buf_initial_sz, &buf_optimal_sz, NULL
+};
+
+
+static const arg_def_t bias_pct = ARG_DEF(
+    NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)");
+static const arg_def_t minsection_pct = ARG_DEF(
+    NULL, "minsection-pct", 1, "GOP min bitrate (% of target)");
+static const arg_def_t maxsection_pct = ARG_DEF(
+    NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)");
+static const arg_def_t *rc_twopass_args[] = {
+  &bias_pct, &minsection_pct, &maxsection_pct, NULL
+};
+
+
+static const arg_def_t kf_min_dist = ARG_DEF(
+    NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)");
+static const arg_def_t kf_max_dist = ARG_DEF(
+    NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)");
+static const arg_def_t kf_disabled = ARG_DEF(
+    NULL, "disable-kf", 0, "Disable keyframe placement");
+static const arg_def_t *kf_args[] = {
+  &kf_min_dist, &kf_max_dist, &kf_disabled, NULL
+};
+
+
+static const arg_def_t noise_sens = ARG_DEF(
+    NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)");
+static const arg_def_t sharpness = ARG_DEF(
+    NULL, "sharpness", 1, "Loop filter sharpness (0..7)");
+static const arg_def_t static_thresh = ARG_DEF(
+    NULL, "static-thresh", 1, "Motion detection threshold");
+static const arg_def_t auto_altref = ARG_DEF(
+    NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames");
+static const arg_def_t arnr_maxframes = ARG_DEF(
+    NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)");
+static const arg_def_t arnr_strength = ARG_DEF(
+    NULL, "arnr-strength", 1, "AltRef filter strength (0..6)");
+static const arg_def_t arnr_type = ARG_DEF(
+    NULL, "arnr-type", 1, "AltRef type");
+static const struct arg_enum_list tuning_enum[] = {
+  {"psnr", VP8_TUNE_PSNR},
+  {"ssim", VP8_TUNE_SSIM},
+  {NULL, 0}
+};
+static const arg_def_t tune_ssim = ARG_DEF_ENUM(
+    NULL, "tune", 1, "Material to favor", tuning_enum);
+static const arg_def_t cq_level = ARG_DEF(
+    NULL, "cq-level", 1, "Constant/Constrained Quality level");
+static const arg_def_t max_intra_rate_pct = ARG_DEF(
+    NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)");
+
+#if CONFIG_VP8_ENCODER
+static const arg_def_t cpu_used_vp8 = ARG_DEF(
+    NULL, "cpu-used", 1, "CPU Used (-16..16)");
+static const arg_def_t token_parts = ARG_DEF(
+    NULL, "token-parts", 1, "Number of token partitions to use, log2");
+static const arg_def_t screen_content_mode = ARG_DEF(
+    NULL, "screen-content-mode", 1, "Screen content mode");
+static const arg_def_t *vp8_args[] = {
+  &cpu_used_vp8, &auto_altref, &noise_sens, &sharpness, &static_thresh,
+  &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type,
+  &tune_ssim, &cq_level, &max_intra_rate_pct, &screen_content_mode,
+  NULL
+};
+static const int vp8_arg_ctrl_map[] = {
+  VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF,
+  VP8E_SET_NOISE_SENSITIVITY, VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD,
+  VP8E_SET_TOKEN_PARTITIONS,
+  VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
+  VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+  VP8E_SET_SCREEN_CONTENT_MODE,
+  0
+};
+#endif
+
+#if CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+static const arg_def_t cpu_used_vp9 = ARG_DEF(
+    NULL, "cpu-used", 1, "CPU Used (-8..8)");
+static const arg_def_t tile_cols = ARG_DEF(
+    NULL, "tile-columns", 1, "Number of tile columns to use, log2");
+static const arg_def_t tile_rows = ARG_DEF(
+    NULL, "tile-rows", 1, "Number of tile rows to use, log2");
+static const arg_def_t lossless = ARG_DEF(
+    NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)");
+static const arg_def_t frame_parallel_decoding = ARG_DEF(
+    NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
+static const arg_def_t aq_mode = ARG_DEF(
+    NULL, "aq-mode", 1,
+    "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, "
+    "3: cyclic refresh, 4: equator360)");
+static const arg_def_t frame_periodic_boost = ARG_DEF(
+    NULL, "frame-boost", 1,
+    "Enable frame periodic boost (0: off (default), 1: on)");
+static const arg_def_t gf_cbr_boost_pct = ARG_DEF(
+    NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)");
+static const arg_def_t max_inter_rate_pct = ARG_DEF(
+    NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)");
+static const arg_def_t min_gf_interval = ARG_DEF(
+    NULL, "min-gf-interval", 1,
+    "min gf/arf frame interval (default 0, indicating in-built behavior)");
+static const arg_def_t max_gf_interval = ARG_DEF(
+    NULL, "max-gf-interval", 1,
+    "max gf/arf frame interval (default 0, indicating in-built behavior)");
+
+static const struct arg_enum_list color_space_enum[] = {
+  { "unknown", VPX_CS_UNKNOWN },
+  { "bt601", VPX_CS_BT_601 },
+  { "bt709", VPX_CS_BT_709 },
+  { "smpte170", VPX_CS_SMPTE_170 },
+  { "smpte240", VPX_CS_SMPTE_240 },
+  { "bt2020", VPX_CS_BT_2020 },
+  { "reserved", VPX_CS_RESERVED },
+  { "sRGB", VPX_CS_SRGB },
+  { NULL, 0 }
+};
+
+static const arg_def_t input_color_space = ARG_DEF_ENUM(
+    NULL, "color-space", 1,
+    "The color space of input content:", color_space_enum);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static const struct arg_enum_list bitdepth_enum[] = {
+  {"8",  VPX_BITS_8},
+  {"10", VPX_BITS_10},
+  {"12", VPX_BITS_12},
+  {NULL, 0}
+};
+
+static const arg_def_t bitdeptharg = ARG_DEF_ENUM(
+    "b", "bit-depth", 1,
+    "Bit depth for codec (8 for version <=1, 10 or 12 for version 2)",
+    bitdepth_enum);
+static const arg_def_t inbitdeptharg = ARG_DEF(
+    NULL, "input-bit-depth", 1, "Bit depth of input");
+#endif
+
+static const struct arg_enum_list tune_content_enum[] = {
+  {"default", VP9E_CONTENT_DEFAULT},
+  {"screen", VP9E_CONTENT_SCREEN},
+  {NULL, 0}
+};
+
+static const arg_def_t tune_content = ARG_DEF_ENUM(
+    NULL, "tune-content", 1, "Tune content type", tune_content_enum);
+#endif
+
+#if CONFIG_VP9_ENCODER
+static const arg_def_t *vp9_args[] = {
+  &cpu_used_vp9, &auto_altref, &sharpness, &static_thresh,
+  &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
+  &tune_ssim, &cq_level, &max_intra_rate_pct, &max_inter_rate_pct,
+  &gf_cbr_boost_pct, &lossless,
+  &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
+  &noise_sens, &tune_content, &input_color_space,
+  &min_gf_interval, &max_gf_interval,
+#if CONFIG_VP9_HIGHBITDEPTH
+  &bitdeptharg, &inbitdeptharg,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  NULL
+};
+static const int vp9_arg_ctrl_map[] = {
+  VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF,
+  VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD,
+  VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS,
+  VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
+  VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+  VP9E_SET_MAX_INTER_BITRATE_PCT, VP9E_SET_GF_CBR_BOOST_PCT,
+  VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
+  VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
+  VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
+  VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL,
+  0
+};
+#endif
+
+#if CONFIG_VP10_ENCODER
+static const arg_def_t *vp10_args[] = {
+  &cpu_used_vp9, &auto_altref, &sharpness, &static_thresh,
+  &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
+  &tune_ssim, &cq_level, &max_intra_rate_pct, &max_inter_rate_pct,
+  &gf_cbr_boost_pct, &lossless,
+  &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
+  &noise_sens, &tune_content, &input_color_space,
+  &min_gf_interval, &max_gf_interval,
+#if CONFIG_VP9_HIGHBITDEPTH
+  &bitdeptharg, &inbitdeptharg,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  NULL
+};
+static const int vp10_arg_ctrl_map[] = {
+  VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF,
+  VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD,
+  VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS,
+  VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
+  VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+  VP9E_SET_MAX_INTER_BITRATE_PCT, VP9E_SET_GF_CBR_BOOST_PCT,
+  VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
+  VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
+  VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
+  VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL,
+  0
+};
+#endif
+
+static const arg_def_t *no_args[] = { NULL };
+
+void usage_exit(void) {
+  const int encoder_count = get_vpx_encoder_count();
+  int idx;
+  /* Synopsis first, then one help section per option table, all on stderr. */
+  fprintf(stderr, "Usage: %s <options> -o dst_filename src_filename \n",
+          exec_name);
+
+  fputs("\nOptions:\n", stderr);
+  arg_show_usage(stderr, main_args);
+  fputs("\nEncoder Global Options:\n", stderr);
+  arg_show_usage(stderr, global_args);
+  fputs("\nRate Control Options:\n", stderr);
+  arg_show_usage(stderr, rc_args);
+  fputs("\nTwopass Rate Control Options:\n", stderr);
+  arg_show_usage(stderr, rc_twopass_args);
+  fputs("\nKeyframe Placement Options:\n", stderr);
+  arg_show_usage(stderr, kf_args);
+#if CONFIG_VP8_ENCODER
+  fputs("\nVP8 Specific Options:\n", stderr);
+  arg_show_usage(stderr, vp8_args);
+#endif
+#if CONFIG_VP9_ENCODER
+  fputs("\nVP9 Specific Options:\n", stderr);
+  arg_show_usage(stderr, vp9_args);
+#endif
+#if CONFIG_VP10_ENCODER
+  fputs("\nVP10 Specific Options:\n", stderr);
+  arg_show_usage(stderr, vp10_args);
+#endif
+  fputs("\nStream timebase (--timebase):\n"
+        "  The desired precision of timestamps in the output, expressed\n"
+        "  in fractional seconds. Default is 1/1000.\n", stderr);
+  fputs("\nIncluded encoders:\n\n", stderr);
+  /* The encoder with the highest index is the one labelled "(default)". */
+  for (idx = 0; idx < encoder_count; ++idx) {
+    const VpxInterface *const encoder = get_vpx_encoder_by_index(idx);
+    const int is_last = (idx == encoder_count - 1);
+    fprintf(stderr, "    %-6s - %s %s\n", encoder->name,
+            vpx_codec_iface_name(encoder->codec_interface()),
+            is_last ? "(default)" : "");
+  }
+  fputs("\n        ", stderr);
+  fputs("Use --codec to switch to a non-default encoder.\n\n", stderr);
+  /* Showing the usage text always ends the process with a failure status. */
+  exit(EXIT_FAILURE);
+}
+
+#define mmin(a, b)  ((a) < (b) ? (a) : (b))
+/* Per plane, report the first mismatch as {row, col, val1, val2}, else -1s. */
+#if CONFIG_VP9_HIGHBITDEPTH
+static void find_mismatch_high(const vpx_image_t *const img1,
+                               const vpx_image_t *const img2,
+                               int yloc[4], int uloc[4], int vloc[4]) {
+  uint16_t *plane1, *plane2;
+  uint32_t stride1, stride2;
+  const uint32_t bsize = 64;
+  const uint32_t bsizey = bsize >> img1->y_chroma_shift;
+  const uint32_t bsizex = bsize >> img1->x_chroma_shift;
+  const uint32_t c_w =
+      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  int match = 1;
+  uint32_t i, j;
+  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
+  plane1 = (uint16_t*)img1->planes[VPX_PLANE_Y];
+  plane2 = (uint16_t*)img2->planes[VPX_PLANE_Y];
+  stride1 = img1->stride[VPX_PLANE_Y]/2;  /* halved: indexed as uint16_t */
+  stride2 = img2->stride[VPX_PLANE_Y]/2;
+  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
+    for (j = 0; match && j < img1->d_w; j += bsize) {
+      int k, l;
+      const int si = mmin(i + bsize, img1->d_h) - i;
+      const int sj = mmin(j + bsize, img1->d_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            yloc[0] = i + k;
+            yloc[1] = j + l;
+            yloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            yloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+  /* U plane: each block is clipped to the plane edge, exactly as for Y. */
+  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
+  plane1 = (uint16_t*)img1->planes[VPX_PLANE_U];
+  plane2 = (uint16_t*)img2->planes[VPX_PLANE_U];
+  stride1 = img1->stride[VPX_PLANE_U]/2;
+  stride2 = img2->stride[VPX_PLANE_U]/2;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h) - i;
+      const int sj = mmin(j + bsizex, c_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            uloc[0] = i + k;
+            uloc[1] = j + l;
+            uloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            uloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+  /* V plane: same block walk and clipping as the U plane. */
+  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
+  plane1 = (uint16_t*)img1->planes[VPX_PLANE_V];
+  plane2 = (uint16_t*)img2->planes[VPX_PLANE_V];
+  stride1 = img1->stride[VPX_PLANE_V]/2;
+  stride2 = img2->stride[VPX_PLANE_V]/2;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h) - i;
+      const int sj = mmin(j + bsizex, c_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            vloc[0] = i + k;
+            vloc[1] = j + l;
+            vloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            vloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+}
+#endif
+/* Per plane, report the first mismatch as {row, col, val1, val2}, else -1s. */
+static void find_mismatch(const vpx_image_t *const img1,
+                          const vpx_image_t *const img2,
+                          int yloc[4], int uloc[4], int vloc[4]) {
+  const uint32_t bsize = 64;
+  const uint32_t bsizey = bsize >> img1->y_chroma_shift;
+  const uint32_t bsizex = bsize >> img1->x_chroma_shift;
+  const uint32_t c_w =
+      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  int match = 1;
+  uint32_t i, j;
+  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
+  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
+    for (j = 0; match && j < img1->d_w; j += bsize) {
+      int k, l;
+      const int si = mmin(i + bsize, img1->d_h) - i;
+      const int sj = mmin(j + bsize, img1->d_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_Y] +
+                (i + k) * img1->stride[VPX_PLANE_Y] + j + l) !=
+              *(img2->planes[VPX_PLANE_Y] +
+                (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) {
+            yloc[0] = i + k;
+            yloc[1] = j + l;
+            yloc[2] = *(img1->planes[VPX_PLANE_Y] +
+                        (i + k) * img1->stride[VPX_PLANE_Y] + j + l);
+            yloc[3] = *(img2->planes[VPX_PLANE_Y] +
+                        (i + k) * img2->stride[VPX_PLANE_Y] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+  /* U then V: each chroma block is clipped to the plane edge, as for Y. */
+  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h) - i;
+      const int sj = mmin(j + bsizex, c_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_U] +
+                (i + k) * img1->stride[VPX_PLANE_U] + j + l) !=
+              *(img2->planes[VPX_PLANE_U] +
+                (i + k) * img2->stride[VPX_PLANE_U] + j + l)) {
+            uloc[0] = i + k;
+            uloc[1] = j + l;
+            uloc[2] = *(img1->planes[VPX_PLANE_U] +
+                        (i + k) * img1->stride[VPX_PLANE_U] + j + l);
+            uloc[3] = *(img2->planes[VPX_PLANE_U] +
+                        (i + k) * img2->stride[VPX_PLANE_U] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h) - i;
+      const int sj = mmin(j + bsizex, c_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_V] +
+                (i + k) * img1->stride[VPX_PLANE_V] + j + l) !=
+              *(img2->planes[VPX_PLANE_V] +
+                (i + k) * img2->stride[VPX_PLANE_V] + j + l)) {
+            vloc[0] = i + k;
+            vloc[1] = j + l;
+            vloc[2] = *(img1->planes[VPX_PLANE_V] +
+                        (i + k) * img1->stride[VPX_PLANE_V] + j + l);
+            vloc[3] = *(img2->planes[VPX_PLANE_V] +
+                        (i + k) * img2->stride[VPX_PLANE_V] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+}
+
+static int compare_img(const vpx_image_t *const img1,
+                       const vpx_image_t *const img2) {
+  uint32_t l_w = img1->d_w;
+  uint32_t c_w =
+      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  uint32_t i;
+  int match = 1;
+  /* Returns 1 only if format, display size and all plane rows are equal. */
+  match &= (img1->fmt == img2->fmt);
+  match &= (img1->d_w == img2->d_w);
+  match &= (img1->d_h == img2->d_h);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    l_w *= 2;  /* row lengths below are memcmp byte counts */
+    c_w *= 2;
+  }
+#endif
+  /* 'match' gates every loop: rows of mismatched images are never read. */
+  for (i = 0; match && i < img1->d_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
+                     img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
+                     l_w) == 0);
+  /* U plane: c_h rows of c_w bytes each. */
+  for (i = 0; match && i < c_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
+                     img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
+                     c_w) == 0);
+  /* V plane: same extent as the U plane. */
+  for (i = 0; match && i < c_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
+                     img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
+                     c_w) == 0);
+
+  return match;
+}
+
+
+#define NELEMENTS(x) (sizeof(x)/sizeof(x[0]))
+#if CONFIG_VP10_ENCODER
+#define ARG_CTRL_CNT_MAX NELEMENTS(vp10_arg_ctrl_map)
+#elif CONFIG_VP9_ENCODER
+#define ARG_CTRL_CNT_MAX NELEMENTS(vp9_arg_ctrl_map)
+#else
+#define ARG_CTRL_CNT_MAX NELEMENTS(vp8_arg_ctrl_map)
+#endif
+
+#if !CONFIG_WEBM_IO
+typedef int stereo_format_t;
+struct EbmlGlobal { int debug; };
+#endif
+
+/* Per-stream configuration */
+struct stream_config {
+  struct vpx_codec_enc_cfg  cfg;
+  const char               *out_fn;
+  const char               *stats_fn;
+#if CONFIG_FP_MB_STATS
+  const char               *fpmb_stats_fn;
+#endif
+  stereo_format_t           stereo_fmt;
+  int                       arg_ctrls[ARG_CTRL_CNT_MAX][2];
+  int                       arg_ctrl_cnt;
+  int                       write_webm;
+  int                       have_kf_max_dist;
+#if CONFIG_VP9_HIGHBITDEPTH
+  // whether to use 16bit internal buffers
+  int                       use_16bit_internal;
+#endif
+};
+
+
+struct stream_state {
+  int                       index;
+  struct stream_state      *next;
+  struct stream_config      config;
+  FILE                     *file;
+  struct rate_hist         *rate_hist;
+  struct EbmlGlobal         ebml;
+  uint64_t                  psnr_sse_total;
+  uint64_t                  psnr_samples_total;
+  double                    psnr_totals[4];
+  int                       psnr_count;
+  int                       counts[64];
+  vpx_codec_ctx_t           encoder;
+  unsigned int              frames_out;
+  uint64_t                  cx_time;
+  size_t                    nbytes;
+  stats_io_t                stats;
+#if CONFIG_FP_MB_STATS
+  stats_io_t                fpmb_stats;
+#endif
+  struct vpx_image         *img;
+  vpx_codec_ctx_t           decoder;
+  int                       mismatch_seen;
+};
+
+/* Makes rat->den non-negative, then die()s on a negative or x/0 value. */
+static void validate_positive_rational(const char          *msg,
+                                       struct vpx_rational *rat) {
+  if (rat->den < 0) {
+    rat->num *= -1;  /* flip both signs so the denominator is not negative */
+    rat->den *= -1;
+  }
+  /* A numerator of 0 passes this check; only negative values are rejected. */
+  if (rat->num < 0)
+    die("Error: %s must be positive\n", msg);
+  /* A zero denominator is rejected separately, with its own message. */
+  if (!rat->den)
+    die("Error: %s has zero denominator\n", msg);
+}
+
+
+/* Parse the options that apply to the whole encode (codec, passes, deadline,
+ * input color type, verbosity, ...) into *global.
+ *
+ * argv is compacted in place: every recognized option is removed, every
+ * unrecognized argument is kept (in order) for later parsing stages.
+ * Invalid values terminate the program through die().
+ */
+static void parse_global_config(struct VpxEncoderConfig *global, char **argv) {
+  char       **argi, **argj;
+  struct arg   arg;
+  const int num_encoder = get_vpx_encoder_count();
+
+  if (num_encoder < 1)
+    die("Error: no valid encoder available\n");
+
+  /* Initialize default parameters */
+  memset(global, 0, sizeof(*global));
+  /* The default codec is the last one in the encoder table. */
+  global->codec = get_vpx_encoder_by_index(num_encoder - 1);
+  global->passes = 0;
+  global->color_type = I420;
+  /* Assign default deadline to good quality */
+  global->deadline = VPX_DL_GOOD_QUALITY;
+
+  /* argi reads, argj writes: *argj is overwritten with *argi on every
+   * iteration and argj only advances for arguments that are not consumed.
+   */
+  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+    arg.argv_step = 1;
+
+    if (arg_match(&arg, &codecarg, argi)) {
+      global->codec = get_vpx_encoder_by_name(arg.val);
+      if (!global->codec)
+        die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
+    } else if (arg_match(&arg, &passes, argi)) {
+      global->passes = arg_parse_uint(&arg);
+
+      if (global->passes < 1 || global->passes > 2)
+        die("Error: Invalid number of passes (%d)\n", global->passes);
+    } else if (arg_match(&arg, &pass_arg, argi)) {
+      global->pass = arg_parse_uint(&arg);
+
+      if (global->pass < 1 || global->pass > 2)
+        die("Error: Invalid pass selected (%d)\n",
+            global->pass);
+    } else if (arg_match(&arg, &usage, argi))
+      global->usage = arg_parse_uint(&arg);
+    else if (arg_match(&arg, &deadline, argi))
+      global->deadline = arg_parse_uint(&arg);
+    else if (arg_match(&arg, &best_dl, argi))
+      global->deadline = VPX_DL_BEST_QUALITY;
+    else if (arg_match(&arg, &good_dl, argi))
+      global->deadline = VPX_DL_GOOD_QUALITY;
+    else if (arg_match(&arg, &rt_dl, argi))
+      global->deadline = VPX_DL_REALTIME;
+    else if (arg_match(&arg, &use_yv12, argi))
+      global->color_type = YV12;
+    else if (arg_match(&arg, &use_i420, argi))
+      global->color_type = I420;
+    else if (arg_match(&arg, &use_i422, argi))
+      global->color_type = I422;
+    else if (arg_match(&arg, &use_i444, argi))
+      global->color_type = I444;
+    else if (arg_match(&arg, &use_i440, argi))
+      global->color_type = I440;
+    else if (arg_match(&arg, &quietarg, argi))
+      global->quiet = 1;
+    else if (arg_match(&arg, &verbosearg, argi))
+      global->verbose = 1;
+    else if (arg_match(&arg, &limit, argi))
+      global->limit = arg_parse_uint(&arg);
+    else if (arg_match(&arg, &skip, argi))
+      global->skip_frames = arg_parse_uint(&arg);
+    else if (arg_match(&arg, &psnrarg, argi))
+      global->show_psnr = 1;
+    else if (arg_match(&arg, &recontest, argi))
+      global->test_decode = arg_parse_enum_or_int(&arg);
+    else if (arg_match(&arg, &framerate, argi)) {
+      global->framerate = arg_parse_rational(&arg);
+      validate_positive_rational(arg.name, &global->framerate);
+      /* Remember that the rate was given explicitly on the command line. */
+      global->have_framerate = 1;
+    } else if (arg_match(&arg, &out_part, argi))
+      global->out_part = 1;
+    else if (arg_match(&arg, &debugmode, argi))
+      global->debug = 1;
+    else if (arg_match(&arg, &q_hist_n, argi))
+      global->show_q_hist_buckets = arg_parse_uint(&arg);
+    else if (arg_match(&arg, &rate_hist_n, argi))
+      global->show_rate_hist_buckets = arg_parse_uint(&arg);
+    else if (arg_match(&arg, &disable_warnings, argi))
+      global->disable_warnings = 1;
+    else if (arg_match(&arg, &disable_warning_prompt, argi))
+      global->disable_warning_prompt = 1;
+    else
+      argj++;  /* not a global option: keep it in argv */
+  }
+
+  if (global->pass) {
+    /* DWIM: Assume the user meant passes=2 if pass=2 is specified */
+    if (global->pass > global->passes) {
+      warn("Assuming --pass=%d implies --passes=%d\n",
+           global->pass, global->pass);
+      global->passes = global->pass;
+    }
+  }
+  /* Validate global config */
+  if (global->passes == 0) {
+#if CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+    // Make default VP9 passes = 2 until there is a better quality 1-pass
+    // encoder
+    // NOTE(review): passes stays 0 here if codec or codec->name is NULL.
+    if (global->codec != NULL && global->codec->name != NULL)
+      global->passes = (strcmp(global->codec->name, "vp9") == 0 &&
+                        global->deadline != VPX_DL_REALTIME) ? 2 : 1;
+#else
+    global->passes = 1;
+#endif
+  }
+
+  /* Realtime mode always encodes in a single pass. */
+  if (global->deadline == VPX_DL_REALTIME &&
+      global->passes > 1) {
+    warn("Enforcing one-pass encoding in realtime mode\n");
+    global->passes = 1;
+  }
+}
+
+
+/* Open the input named by input->filename ("-" selects stdin in binary
+ * mode), record its length when it is seekable, and sniff the first four
+ * bytes to classify it as Y4M or RAW. IVF input and unsupported Y4M streams
+ * are reported through fatal().
+ */
+static void open_input_file(struct VpxInputContext *input) {
+  int have_magic;
+
+  /* Parse certain options from the input file, if possible */
+  if (strcmp(input->filename, "-") == 0)
+    input->file = set_binary_mode(stdin);
+  else
+    input->file = fopen(input->filename, "rb");
+
+  if (input->file == NULL)
+    fatal("Failed to open input file");
+
+  /* A successful seek to the end means the input is seekable, so its length
+   * can be measured and used for progress reporting.
+   */
+  if (fseeko(input->file, 0, SEEK_END) == 0) {
+    input->length = ftello(input->file);
+    rewind(input->file);
+  }
+
+  /* Start from a 1:1 pixel aspect ratio. */
+  input->pixel_aspect_ratio.numerator = 1;
+  input->pixel_aspect_ratio.denominator = 1;
+
+  /* The sniffed bytes are kept in input->detect; for RAW sources they are
+   * applied to the first frame in read_frame().
+   */
+  input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file);
+  input->detect.position = 0;
+  have_magic = (input->detect.buf_read == 4);
+
+  if (have_magic && file_is_y4m(input->detect.buf)) {
+    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4,
+                       input->only_i420) < 0) {
+      fatal("Unsupported Y4M stream.");
+    } else {
+      /* Take the stream parameters from the Y4M header. */
+      input->file_type = FILE_TYPE_Y4M;
+      input->width = input->y4m.pic_w;
+      input->height = input->y4m.pic_h;
+      input->pixel_aspect_ratio.numerator = input->y4m.par_n;
+      input->pixel_aspect_ratio.denominator = input->y4m.par_d;
+      input->framerate.numerator = input->y4m.fps_n;
+      input->framerate.denominator = input->y4m.fps_d;
+      input->fmt = input->y4m.vpx_fmt;
+      input->bit_depth = input->y4m.bit_depth;
+    }
+  } else if (have_magic && fourcc_is_ivf(input->detect.buf)) {
+    fatal("IVF is not supported as input.");
+  } else {
+    input->file_type = FILE_TYPE_RAW;
+  }
+}
+
+
+/* Close the input file and, for Y4M input, release the Y4M reader state. */
+static void close_input_file(struct VpxInputContext *input) {
+  const int was_y4m = (input->file_type == FILE_TYPE_Y4M);
+
+  fclose(input->file);
+  if (was_y4m) {
+    y4m_input_close(&input->y4m);
+  }
+}
+
+/* Allocate a stream and append it after `prev`.
+ *
+ * With a previous stream, the new one starts as a byte-wise copy of it with
+ * the index incremented. Without one, it is initialized from the codec's
+ * default encoder configuration. In both cases the output file name is
+ * cleared and `next` is NULL on return.
+ */
+static struct stream_state *new_stream(struct VpxEncoderConfig *global,
+                                       struct stream_state *prev) {
+  struct stream_state *const fresh = calloc(1, sizeof(*fresh));
+
+  if (!fresh) {
+    fatal("Failed to allocate new stream.");
+  }
+
+  if (!prev) {
+    struct vpx_codec_enc_cfg *const cfg = &fresh->config.cfg;
+    /* Populate encoder configuration */
+    const vpx_codec_err_t status =
+        vpx_codec_enc_config_default(global->codec->codec_interface(),
+                                     cfg,
+                                     global->usage);
+
+    if (status)
+      fatal("Failed to get config: %s\n", vpx_codec_err_to_string(status));
+
+    /* Use a timebase denominator large enough that the encoder always
+     * produces strictly increasing timestamps.
+     */
+    cfg->g_timebase.den = 1000;
+
+    /* The library's default resolution is never used: it has to come from
+     * the input file or the command line.
+     */
+    cfg->g_w = 0;
+    cfg->g_h = 0;
+
+    /* Initialize remaining stream parameters */
+    fresh->config.write_webm = 1;
+#if CONFIG_WEBM_IO
+    fresh->config.stereo_fmt = STEREO_FORMAT_MONO;
+    fresh->ebml.last_pts_ns = -1;
+    fresh->ebml.writer = NULL;
+    fresh->ebml.segment = NULL;
+#endif
+
+    /* Allows removal of the application version from the EBML tags */
+    fresh->ebml.debug = global->debug;
+
+    /* Realtime mode defaults to no lagged frames. */
+    if (global->deadline == VPX_DL_REALTIME)
+      cfg->g_lag_in_frames = 0;
+  } else {
+    /* Inherit everything from the previous stream and link after it. */
+    memcpy(fresh, prev, sizeof(*fresh));
+    fresh->index++;
+    prev->next = fresh;
+  }
+
+  /* Every stream needs its own output file, so never inherit the name. */
+  fresh->config.out_fn = NULL;
+
+  fresh->next = NULL;
+  return fresh;
+}
+
+
+/* Parse the per-stream options in argv into stream->config.
+ *
+ * argv is compacted in place: recognized options are removed, everything
+ * else is kept in order. Parsing stops consuming at the first "--" marker
+ * (which is itself removed); the arguments after it are only shifted down.
+ *
+ * Returns non-zero when an end-of-stream marker ("--") was found.
+ */
+static int parse_stream_params(struct VpxEncoderConfig *global,
+                               struct stream_state  *stream,
+                               char **argv) {
+  char                   **argi, **argj;
+  struct arg               arg;
+  /* Static: the codec-specific tables persist across calls. */
+  static const arg_def_t **ctrl_args = no_args;
+  static const int        *ctrl_args_map = NULL;
+  struct stream_config    *config = &stream->config;
+  int                      eos_mark_found = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int                      test_16bit_internal = 0;
+#endif
+
+  // Handle codec specific options
+  // The leading "if (0)" lets each #if section contribute an "else if" arm.
+  if (0) {
+#if CONFIG_VP8_ENCODER
+  } else if (strcmp(global->codec->name, "vp8") == 0) {
+    ctrl_args = vp8_args;
+    ctrl_args_map = vp8_arg_ctrl_map;
+#endif
+#if CONFIG_VP9_ENCODER
+  } else if (strcmp(global->codec->name, "vp9") == 0) {
+    ctrl_args = vp9_args;
+    ctrl_args_map = vp9_arg_ctrl_map;
+#endif
+#if CONFIG_VP10_ENCODER
+  } else if (strcmp(global->codec->name, "vp10") == 0) {
+    // TODO(jingning): Reuse VP9 specific encoder configuration parameters.
+    // Consider to expand this set for VP10 encoder control.
+    ctrl_args = vp10_args;
+    ctrl_args_map = vp10_arg_ctrl_map;
+#endif
+  }
+
+  /* argi reads, argj writes; argj only advances for arguments that stay. */
+  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+    arg.argv_step = 1;
+
+    /* Once we've found an end-of-stream marker (--) we want to continue
+     * shifting arguments but not consuming them.
+     */
+    if (eos_mark_found) {
+      argj++;
+      continue;
+    } else if (!strcmp(*argj, "--")) {
+      /* argj is not advanced, so the marker itself is dropped. */
+      eos_mark_found = 1;
+      continue;
+    }
+
+    if (arg_match(&arg, &outputfile, argi)) {
+      config->out_fn = arg.val;
+    } else if (arg_match(&arg, &fpf_name, argi)) {
+      config->stats_fn = arg.val;
+#if CONFIG_FP_MB_STATS
+    } else if (arg_match(&arg, &fpmbf_name, argi)) {
+      config->fpmb_stats_fn = arg.val;
+#endif
+    } else if (arg_match(&arg, &use_webm, argi)) {
+#if CONFIG_WEBM_IO
+      config->write_webm = 1;
+#else
+      die("Error: --webm specified but webm is disabled.");
+#endif
+    } else if (arg_match(&arg, &use_ivf, argi)) {
+      config->write_webm = 0;
+    } else if (arg_match(&arg, &threads, argi)) {
+      config->cfg.g_threads = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &profile, argi)) {
+      config->cfg.g_profile = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &width, argi)) {
+      config->cfg.g_w = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &height, argi)) {
+      config->cfg.g_h = arg_parse_uint(&arg);
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else if (arg_match(&arg, &bitdeptharg, argi)) {
+      config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg);
+    } else if (arg_match(&arg, &inbitdeptharg, argi)) {
+      config->cfg.g_input_bit_depth = arg_parse_uint(&arg);
+#endif
+#if CONFIG_WEBM_IO
+    } else if (arg_match(&arg, &stereo_mode, argi)) {
+      config->stereo_fmt = arg_parse_enum_or_int(&arg);
+#endif
+    } else if (arg_match(&arg, &timebase, argi)) {
+      config->cfg.g_timebase = arg_parse_rational(&arg);
+      validate_positive_rational(arg.name, &config->cfg.g_timebase);
+    } else if (arg_match(&arg, &error_resilient, argi)) {
+      config->cfg.g_error_resilient = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &lag_in_frames, argi)) {
+      config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
+      /* Realtime mode forces lag_in_frames back to zero. */
+      if (global->deadline == VPX_DL_REALTIME &&
+          config->cfg.g_lag_in_frames != 0) {
+        warn("non-zero %s option ignored in realtime mode.\n", arg.name);
+        config->cfg.g_lag_in_frames = 0;
+      }
+    } else if (arg_match(&arg, &dropframe_thresh, argi)) {
+      config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &resize_allowed, argi)) {
+      config->cfg.rc_resize_allowed = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &resize_width, argi)) {
+      config->cfg.rc_scaled_width = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &resize_height, argi)) {
+      config->cfg.rc_scaled_height = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &resize_up_thresh, argi)) {
+      config->cfg.rc_resize_up_thresh = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &resize_down_thresh, argi)) {
+      config->cfg.rc_resize_down_thresh = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &end_usage, argi)) {
+      config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg);
+    } else if (arg_match(&arg, &target_bitrate, argi)) {
+      config->cfg.rc_target_bitrate = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &min_quantizer, argi)) {
+      config->cfg.rc_min_quantizer = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &max_quantizer, argi)) {
+      config->cfg.rc_max_quantizer = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &undershoot_pct, argi)) {
+      config->cfg.rc_undershoot_pct = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &overshoot_pct, argi)) {
+      config->cfg.rc_overshoot_pct = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &buf_sz, argi)) {
+      config->cfg.rc_buf_sz = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &buf_initial_sz, argi)) {
+      config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &buf_optimal_sz, argi)) {
+      config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &bias_pct, argi)) {
+        config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg);
+      /* The two-pass options are still stored, only a warning is issued. */
+      if (global->passes < 2)
+        warn("option %s ignored in one-pass mode.\n", arg.name);
+    } else if (arg_match(&arg, &minsection_pct, argi)) {
+      config->cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg);
+
+      if (global->passes < 2)
+        warn("option %s ignored in one-pass mode.\n", arg.name);
+    } else if (arg_match(&arg, &maxsection_pct, argi)) {
+      config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg);
+
+      if (global->passes < 2)
+        warn("option %s ignored in one-pass mode.\n", arg.name);
+    } else if (arg_match(&arg, &kf_min_dist, argi)) {
+      config->cfg.kf_min_dist = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &kf_max_dist, argi)) {
+      config->cfg.kf_max_dist = arg_parse_uint(&arg);
+      /* Suppresses the default interval in set_default_kf_interval(). */
+      config->have_kf_max_dist = 1;
+    } else if (arg_match(&arg, &kf_disabled, argi)) {
+      config->cfg.kf_mode = VPX_KF_DISABLED;
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else if (arg_match(&arg, &test16bitinternalarg, argi)) {
+      if (strcmp(global->codec->name, "vp9") == 0 ||
+          strcmp(global->codec->name, "vp10") == 0) {
+        test_16bit_internal = 1;
+      }
+#endif
+    } else {
+      /* Not a generic stream option: try the codec-specific controls. */
+      int i, match = 0;
+      for (i = 0; ctrl_args[i]; i++) {
+        if (arg_match(&arg, ctrl_args[i], argi)) {
+          int j;
+          match = 1;
+
+          /* Point either to the next free element or the first
+          * instance of this control.
+          */
+          for (j = 0; j < config->arg_ctrl_cnt; j++)
+            if (ctrl_args_map != NULL &&
+                config->arg_ctrls[j][0] == ctrl_args_map[i])
+              break;
+
+          /* Update/insert */
+          /* The runtime bound check below also covers builds where the
+           * assert is compiled out.
+           */
+          assert(j < (int)ARG_CTRL_CNT_MAX);
+          if (ctrl_args_map != NULL && j < (int)ARG_CTRL_CNT_MAX) {
+            config->arg_ctrls[j][0] = ctrl_args_map[i];
+            config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg);
+            if (j == config->arg_ctrl_cnt)
+              config->arg_ctrl_cnt++;
+          }
+        }
+      }
+      if (!match)
+        argj++;  /* unknown argument: keep it in argv */
+    }
+  }
+#if CONFIG_VP9_HIGHBITDEPTH
+  /* 16-bit internal buffers are used when requested for testing or when the
+   * selected profile is greater than 1.
+   */
+  if (strcmp(global->codec->name, "vp9") == 0 ||
+      strcmp(global->codec->name, "vp10") == 0) {
+    config->use_16bit_internal = test_16bit_internal |
+                                 (config->cfg.g_profile > 1);
+  }
+#endif
+  return eos_mark_found;
+}
+
+
+/* Run `func` once for every stream in the list headed by `streams`.
+ *
+ * The macro expects a variable named `streams` in the enclosing scope and
+ * declares its own loop variable `stream`, which `func` may reference.
+ * `func` is pasted verbatim as a statement; the do/while(0) wrapper makes
+ * the expansion behave like a single statement.
+ */
+#define FOREACH_STREAM(func) \
+  do { \
+    struct stream_state *stream; \
+    for (stream = streams; stream; stream = stream->next) { \
+      func; \
+    } \
+  } while (0)
+
+
+/* Sanity-check one stream's configuration and compare it with every stream
+ * that follows it in the list.
+ *
+ * Reports through fatal(): missing dimensions, an input bit depth larger
+ * than the codec bit depth, a missing output file on this or any later
+ * stream, and output/stats files shared between this stream and a later one.
+ * `global` is unused.
+ */
+static void validate_stream_config(const struct stream_state *stream,
+                                   const struct VpxEncoderConfig *global) {
+  const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg;
+  const struct stream_state *other;
+  (void)global;
+
+  if (!(cfg->g_w && cfg->g_h))
+    fatal("Stream %d: Specify stream dimensions with --width (-w) "
+          " and --height (-h)", stream->index);
+
+  // The input bit depth must not exceed the codec bit depth.
+  if (cfg->g_input_bit_depth > (unsigned int)cfg->g_bit_depth) {
+    fatal("Stream %d: codec bit depth (%d) less than input bit depth (%d)",
+          stream->index, (int)cfg->g_bit_depth,
+          cfg->g_input_bit_depth);
+  }
+
+  for (other = stream; other != NULL; other = other->next) {
+    /* All streams require output files */
+    if (other->config.out_fn == NULL)
+      fatal("Stream %d: Output file is required (specify with -o)",
+            other->index);
+
+    /* The remaining checks compare two distinct streams. */
+    if (other == stream)
+      continue;
+
+    /* Two streams may only share an output when it is a null device. */
+    {
+      const char *mine = stream->config.out_fn;
+      const char *theirs = other->config.out_fn;
+      if (strcmp(mine, theirs) == 0 &&
+          strcmp(mine, "/dev/null") != 0 &&
+          strcmp(mine, ":nul") != 0)
+        fatal("Stream %d: duplicate output file (from stream %d)",
+              other->index, stream->index);
+    }
+
+    /* Stats files must not be shared. */
+    {
+      const char *mine = stream->config.stats_fn;
+      const char *theirs = other->config.stats_fn;
+      if (mine != NULL && theirs != NULL && strcmp(mine, theirs) == 0)
+        fatal("Stream %d: duplicate stats file (from stream %d)",
+              other->index, stream->index);
+    }
+
+#if CONFIG_FP_MB_STATS
+    /* Macroblock stats files must not be shared either. */
+    {
+      const char *mine = stream->config.fpmb_stats_fn;
+      const char *theirs = other->config.fpmb_stats_fn;
+      if (mine != NULL && theirs != NULL && strcmp(mine, theirs) == 0)
+        fatal("Stream %d: duplicate mb stats file (from stream %d)",
+              other->index, stream->index);
+    }
+#endif
+  }
+}
+
+
+/* Fill in whichever stream dimensions were not given on the command line.
+ *
+ * stream: stream whose cfg.g_w / cfg.g_h may be zero (meaning "unset").
+ * w, h:   dimensions of the input.
+ *
+ * With neither dimension set, the input size is used. With only one set,
+ * the other is derived so that the input aspect ratio is kept (integer
+ * division, truncating). A dimension that cannot be derived because the
+ * matching input dimension is zero is left at zero instead of dividing by
+ * zero, so that later validation can report it.
+ */
+static void set_stream_dimensions(struct stream_state *stream,
+                                  unsigned int w,
+                                  unsigned int h) {
+  struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg;
+
+  if (!cfg->g_w) {
+    if (!cfg->g_h)
+      cfg->g_w = w;
+    else if (h)
+      /* 64-bit intermediate: the product can exceed 32 bits. */
+      cfg->g_w = (unsigned int)((uint64_t)w * cfg->g_h / h);
+  }
+  if (!cfg->g_h && w) {
+    cfg->g_h = (unsigned int)((uint64_t)h * cfg->g_w / w);
+  }
+}
+
+
+/* Give the stream a default maximum keyframe distance of five seconds'
+ * worth of frames, unless one was set explicitly (have_kf_max_dist) or the
+ * global frame rate does not evaluate to a positive value.
+ */
+static void set_default_kf_interval(struct stream_state *stream,
+                                    struct VpxEncoderConfig *global) {
+  double fps;
+
+  if (stream->config.have_kf_max_dist)
+    return;
+
+  fps = (double)global->framerate.num / global->framerate.den;
+  if (fps > 0.0) {
+    stream->config.cfg.kf_max_dist = (unsigned int)(5.0 * fps);
+  }
+}
+
+/* Return a printable name for an input file type ("Other" if unknown). */
+static const char* file_type_to_string(enum VideoFileType t) {
+  if (t == FILE_TYPE_RAW)
+    return "RAW";
+  if (t == FILE_TYPE_Y4M)
+    return "Y4M";
+  return "Other";
+}
+
+/* Return a printable name for an image format ("Other" if unknown). */
+static const char* image_format_to_string(vpx_img_fmt_t f) {
+  static const struct {
+    vpx_img_fmt_t  fmt;
+    const char    *label;
+  } known_formats[] = {
+    { VPX_IMG_FMT_I420,   "I420"   },
+    { VPX_IMG_FMT_I422,   "I422"   },
+    { VPX_IMG_FMT_I444,   "I444"   },
+    { VPX_IMG_FMT_I440,   "I440"   },
+    { VPX_IMG_FMT_YV12,   "YV12"   },
+    { VPX_IMG_FMT_I42016, "I42016" },
+    { VPX_IMG_FMT_I42216, "I42216" },
+    { VPX_IMG_FMT_I44416, "I44416" },
+    { VPX_IMG_FMT_I44016, "I44016" },
+  };
+  size_t k;
+
+  for (k = 0; k < sizeof(known_formats) / sizeof(known_formats[0]); k++) {
+    if (known_formats[k].fmt == f)
+      return known_formats[k].label;
+  }
+  return "Other";
+}
+
+/* Print a stream's encoder configuration to stderr.
+ *
+ * For the first stream (index 0) the codec name and the source file, file
+ * type and image format are printed as well. A "Stream Index" line is
+ * printed whenever more than one stream exists.
+ */
+static void show_stream_config(struct stream_state *stream,
+                               struct VpxEncoderConfig *global,
+                               struct VpxInputContext *input) {
+
+/* Print one field of stream->config.cfg as "name = value" using %d.
+ * NOTE(review): the macro is not #undef'd in this function, so it stays
+ * defined for the rest of the translation unit.
+ */
+#define SHOW(field) \
+  fprintf(stderr, "    %-28s = %d\n", #field, stream->config.cfg.field)
+
+  if (stream->index == 0) {
+    fprintf(stderr, "Codec: %s\n",
+            vpx_codec_iface_name(global->codec->codec_interface()));
+    fprintf(stderr, "Source file: %s File Type: %s Format: %s\n",
+            input->filename,
+            file_type_to_string(input->file_type),
+            image_format_to_string(input->fmt));
+  }
+  /* Only label the stream when there is more than one. */
+  if (stream->next || stream->index)
+    fprintf(stderr, "\nStream Index: %d\n", stream->index);
+  fprintf(stderr, "Destination file: %s\n", stream->config.out_fn);
+  fprintf(stderr, "Encoder parameters:\n");
+
+  SHOW(g_usage);
+  SHOW(g_threads);
+  SHOW(g_profile);
+  SHOW(g_w);
+  SHOW(g_h);
+  SHOW(g_bit_depth);
+  SHOW(g_input_bit_depth);
+  SHOW(g_timebase.num);
+  SHOW(g_timebase.den);
+  SHOW(g_error_resilient);
+  SHOW(g_pass);
+  SHOW(g_lag_in_frames);
+  SHOW(rc_dropframe_thresh);
+  SHOW(rc_resize_allowed);
+  SHOW(rc_scaled_width);
+  SHOW(rc_scaled_height);
+  SHOW(rc_resize_up_thresh);
+  SHOW(rc_resize_down_thresh);
+  SHOW(rc_end_usage);
+  SHOW(rc_target_bitrate);
+  SHOW(rc_min_quantizer);
+  SHOW(rc_max_quantizer);
+  SHOW(rc_undershoot_pct);
+  SHOW(rc_overshoot_pct);
+  SHOW(rc_buf_sz);
+  SHOW(rc_buf_initial_sz);
+  SHOW(rc_buf_optimal_sz);
+  SHOW(rc_2pass_vbr_bias_pct);
+  SHOW(rc_2pass_vbr_minsection_pct);
+  SHOW(rc_2pass_vbr_maxsection_pct);
+  SHOW(kf_mode);
+  SHOW(kf_min_dist);
+  SHOW(kf_max_dist);
+}
+
+
+/* Open the stream's output file ("-" selects stdout in binary mode) and
+ * write the container header: WebM when config.write_webm is set, IVF
+ * otherwise. Nothing is opened during a first pass. WebM output requires a
+ * seekable file.
+ */
+static void open_output_file(struct stream_state *stream,
+                             struct VpxEncoderConfig *global,
+                             const struct VpxRational *pixel_aspect_ratio) {
+  const struct vpx_codec_enc_cfg *const enc_cfg = &stream->config.cfg;
+  const char *const path = stream->config.out_fn;
+
+  /* A first pass produces no output file. */
+  if (enc_cfg->g_pass == VPX_RC_FIRST_PASS)
+    return;
+
+  if (strcmp(path, "-") == 0)
+    stream->file = set_binary_mode(stdout);
+  else
+    stream->file = fopen(path, "wb");
+
+  if (stream->file == NULL)
+    fatal("Failed to open output file");
+
+  /* A failing no-op seek identifies a pipe. */
+  if (stream->config.write_webm) {
+    if (fseek(stream->file, 0, SEEK_CUR))
+      fatal("WebM output to pipes not supported.");
+  }
+
+#if CONFIG_WEBM_IO
+  if (stream->config.write_webm) {
+    stream->ebml.stream = stream->file;
+    write_webm_file_header(&stream->ebml, enc_cfg,
+                           &global->framerate,
+                           stream->config.stereo_fmt,
+                           global->codec->fourcc,
+                           pixel_aspect_ratio);
+  }
+#else
+  (void)pixel_aspect_ratio;
+#endif
+
+  /* IVF header with a frame count of 0 for now. */
+  if (!stream->config.write_webm) {
+    ivf_write_file_header(stream->file, enc_cfg, global->codec->fourcc, 0);
+  }
+}
+
+
+/* Finish and close the stream's output file. WebM output gets its footer;
+ * IVF output gets its file header rewritten with the final frame count when
+ * the file can be rewound. Does nothing during a first pass.
+ */
+static void close_output_file(struct stream_state *stream,
+                              unsigned int fourcc) {
+  if (stream->config.cfg.g_pass == VPX_RC_FIRST_PASS)
+    return;
+
+#if CONFIG_WEBM_IO
+  if (stream->config.write_webm)
+    write_webm_file_footer(&stream->ebml);
+#endif
+
+  /* Rewrite the IVF header now that frames_out is known. */
+  if (!stream->config.write_webm &&
+      fseek(stream->file, 0, SEEK_SET) == 0) {
+    ivf_write_file_header(stream->file, &stream->config.cfg,
+                          fourcc,
+                          stream->frames_out);
+  }
+
+  fclose(stream->file);
+}
+
+
+/* Prepare a stream for encoding pass number `pass` (0-based).
+ *
+ * Opens the statistics store(s) (file-backed when a file name was given,
+ * in-memory otherwise), selects the encoder pass mode, hands the collected
+ * statistics to the encoder for any pass after the first, and resets the
+ * per-pass counters.
+ */
+static void setup_pass(struct stream_state *stream,
+                       struct VpxEncoderConfig *global,
+                       int pass) {
+  int store_ok;
+
+  store_ok = stream->config.stats_fn
+      ? stats_open_file(&stream->stats, stream->config.stats_fn, pass)
+      : stats_open_mem(&stream->stats, pass);
+  if (!store_ok)
+    fatal("Failed to open statistics store");
+
+#if CONFIG_FP_MB_STATS
+  store_ok = stream->config.fpmb_stats_fn
+      ? stats_open_file(&stream->fpmb_stats,
+                        stream->config.fpmb_stats_fn, pass)
+      : stats_open_mem(&stream->fpmb_stats, pass);
+  if (!store_ok)
+    fatal("Failed to open mb statistics store");
+#endif
+
+  /* One-pass unless exactly two passes were requested. */
+  if (global->passes != 2)
+    stream->config.cfg.g_pass = VPX_RC_ONE_PASS;
+  else if (pass)
+    stream->config.cfg.g_pass = VPX_RC_LAST_PASS;
+  else
+    stream->config.cfg.g_pass = VPX_RC_FIRST_PASS;
+
+  if (pass != 0) {
+    stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats);
+#if CONFIG_FP_MB_STATS
+    stream->config.cfg.rc_firstpass_mb_stats_in =
+        stats_get(&stream->fpmb_stats);
+#endif
+  }
+
+  /* Per-pass counters start from zero. */
+  stream->frames_out = 0;
+  stream->nbytes = 0;
+  stream->cx_time = 0;
+}
+
+
+/* Create the stream's encoder context, apply the codec controls collected
+ * from the command line and, when test decoding is requested, create the
+ * matching decoder context.
+ *
+ * Exits through ctx_exit_on_error() if the encoder cannot be initialized or
+ * a control fails, and through fatal() if test decoding is requested but no
+ * decoder exists for the selected codec.
+ */
+static void initialize_encoder(struct stream_state *stream,
+                               struct VpxEncoderConfig *global) {
+  int i;
+  int flags = 0;
+
+  flags |= global->show_psnr ? VPX_CODEC_USE_PSNR : 0;
+  flags |= global->out_part ? VPX_CODEC_USE_OUTPUT_PARTITION : 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+  flags |= stream->config.use_16bit_internal ? VPX_CODEC_USE_HIGHBITDEPTH : 0;
+#endif
+
+  /* Construct Encoder Context */
+  vpx_codec_enc_init(&stream->encoder, global->codec->codec_interface(),
+                     &stream->config.cfg, flags);
+  ctx_exit_on_error(&stream->encoder, "Failed to initialize encoder");
+
+  /* Note that we bypass the vpx_codec_control wrapper macro because
+   * we're being clever to store the control IDs in an array. Real
+   * applications will want to make use of the enumerations directly
+   */
+  for (i = 0; i < stream->config.arg_ctrl_cnt; i++) {
+    int ctrl = stream->config.arg_ctrls[i][0];
+    int value = stream->config.arg_ctrls[i][1];
+    if (vpx_codec_control_(&stream->encoder, ctrl, value))
+      fprintf(stderr, "Error: Tried to set control %d = %d\n",
+              ctrl, value);
+
+    ctx_exit_on_error(&stream->encoder, "Failed to control codec");
+  }
+
+#if CONFIG_DECODERS
+  if (global->test_decode != TEST_DECODE_OFF) {
+    const VpxInterface *decoder = get_vpx_decoder_by_name(global->codec->name);
+    /* The lookup can fail, e.g. when the build has the encoder for this
+     * codec but not the decoder; report it instead of dereferencing NULL.
+     */
+    if (decoder == NULL)
+      fatal("No decoder available for codec %s; cannot test decode",
+            global->codec->name);
+    vpx_codec_dec_init(&stream->decoder, decoder->codec_interface(), NULL, 0);
+  }
+#endif
+}
+
+
+/* Encode one input frame (or flush the encoder when img is NULL).
+ *
+ * stream:    stream to encode into.
+ * global:    global settings (frame rate, deadline).
+ * img:       input frame, or NULL.
+ * frames_in: 1-based count of frames read so far, used for timestamps.
+ *
+ * The presentation timestamp and duration are derived from frames_in, the
+ * global frame rate and the stream timebase. If the frame size differs from
+ * the configured stream size, the frame is scaled into stream->img first
+ * (4:2:0 only, requires libyuv). The time spent in vpx_codec_encode() is
+ * added to stream->cx_time.
+ */
+static void encode_frame(struct stream_state *stream,
+                         struct VpxEncoderConfig *global,
+                         struct vpx_image *img,
+                         unsigned int frames_in) {
+  vpx_codec_pts_t frame_start, next_frame_start;
+  struct vpx_codec_enc_cfg *cfg = &stream->config.cfg;
+  struct vpx_usec_timer timer;
+
+  /* Start times of this frame and the next one, in timebase units.
+   * NOTE(review): divides by g_timebase.num and framerate.num; both must be
+   * non-zero here.
+   */
+  frame_start = (cfg->g_timebase.den * (int64_t)(frames_in - 1)
+                 * global->framerate.den)
+                / cfg->g_timebase.num / global->framerate.num;
+  next_frame_start = (cfg->g_timebase.den * (int64_t)(frames_in)
+                      * global->framerate.den)
+                     / cfg->g_timebase.num / global->framerate.num;
+
+  /* Scale if necessary */
+#if CONFIG_VP9_HIGHBITDEPTH
+  /* High-bit-depth path: 16-bit samples, so strides are halved to express
+   * them in samples rather than bytes.
+   */
+  if (img) {
+    if ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) &&
+        (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+      if (img->fmt != VPX_IMG_FMT_I42016) {
+        fprintf(stderr, "%s can only scale 4:2:0 inputs\n", exec_name);
+        exit(EXIT_FAILURE);
+      }
+#if CONFIG_LIBYUV
+      /* The scaled buffer is allocated once and reused for later frames.
+       * NOTE(review): the vpx_img_alloc() result is not checked.
+       */
+      if (!stream->img) {
+        stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I42016,
+                                    cfg->g_w, cfg->g_h, 16);
+      }
+      I420Scale_16((uint16*)img->planes[VPX_PLANE_Y],
+                   img->stride[VPX_PLANE_Y]/2,
+                   (uint16*)img->planes[VPX_PLANE_U],
+                   img->stride[VPX_PLANE_U]/2,
+                   (uint16*)img->planes[VPX_PLANE_V],
+                   img->stride[VPX_PLANE_V]/2,
+                   img->d_w, img->d_h,
+                   (uint16*)stream->img->planes[VPX_PLANE_Y],
+                   stream->img->stride[VPX_PLANE_Y]/2,
+                   (uint16*)stream->img->planes[VPX_PLANE_U],
+                   stream->img->stride[VPX_PLANE_U]/2,
+                   (uint16*)stream->img->planes[VPX_PLANE_V],
+                   stream->img->stride[VPX_PLANE_V]/2,
+                   stream->img->d_w, stream->img->d_h,
+                   kFilterBox);
+      /* From here on the scaled copy is the frame to encode; its size now
+       * matches cfg, so the 8-bit scaling path below is skipped.
+       */
+      img = stream->img;
+#else
+    /* No libyuv: flag an error on the context and bail out. */
+    stream->encoder.err = 1;
+    ctx_exit_on_error(&stream->encoder,
+                      "Stream %d: Failed to encode frame.\n"
+                      "Scaling disabled in this configuration. \n"
+                      "To enable, configure with --enable-libyuv\n",
+                      stream->index);
+#endif
+    }
+  }
+#endif
+  /* 8-bit path. */
+  if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+    if (img->fmt != VPX_IMG_FMT_I420 && img->fmt != VPX_IMG_FMT_YV12) {
+      fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name);
+      exit(EXIT_FAILURE);
+    }
+#if CONFIG_LIBYUV
+    /* NOTE(review): the vpx_img_alloc() result is not checked. */
+    if (!stream->img)
+      stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420,
+                                  cfg->g_w, cfg->g_h, 16);
+    I420Scale(img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y],
+              img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U],
+              img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V],
+              img->d_w, img->d_h,
+              stream->img->planes[VPX_PLANE_Y],
+              stream->img->stride[VPX_PLANE_Y],
+              stream->img->planes[VPX_PLANE_U],
+              stream->img->stride[VPX_PLANE_U],
+              stream->img->planes[VPX_PLANE_V],
+              stream->img->stride[VPX_PLANE_V],
+              stream->img->d_w, stream->img->d_h,
+              kFilterBox);
+    img = stream->img;
+#else
+    /* No libyuv: flag an error on the context and bail out. */
+    stream->encoder.err = 1;
+    ctx_exit_on_error(&stream->encoder,
+                      "Stream %d: Failed to encode frame.\n"
+                      "Scaling disabled in this configuration. \n"
+                      "To enable, configure with --enable-libyuv\n",
+                      stream->index);
+#endif
+  }
+
+  /* Time only the encode call itself; the duration passed to the encoder is
+   * the distance to the next frame's start time.
+   */
+  vpx_usec_timer_start(&timer);
+  vpx_codec_encode(&stream->encoder, img, frame_start,
+                   (unsigned long)(next_frame_start - frame_start),
+                   0, global->deadline);
+  vpx_usec_timer_mark(&timer);
+  stream->cx_time += vpx_usec_timer_elapsed(&timer);
+  ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode frame",
+                    stream->index);
+}
+
+
+/* Count the quantizer used for the last frame in stream->counts[].
+ * Nothing is recorded during a first pass.
+ */
+static void update_quantizer_histogram(struct stream_state *stream) {
+  int q;
+
+  if (stream->config.cfg.g_pass == VPX_RC_FIRST_PASS)
+    return;
+
+  vpx_codec_control(&stream->encoder, VP8E_GET_LAST_QUANTIZER_64, &q);
+  ctx_exit_on_error(&stream->encoder, "Failed to read quantizer");
+  stream->counts[q]++;
+}
+
+
+/* Drain all pending packets from the stream's encoder.
+ *
+ * Frame packets are written to the output container (WebM or IVF), counted,
+ * and optionally fed to the test decoder. Stats packets go to the
+ * statistics store(s). PSNR packets are accumulated when global->show_psnr
+ * is set.
+ *
+ * *got_data is set to 1 if at least one frame packet was received and to 0
+ * otherwise.
+ */
+static void get_cx_data(struct stream_state *stream,
+                        struct VpxEncoderConfig *global,
+                        int *got_data) {
+  const vpx_codec_cx_pkt_t *pkt;
+  const struct vpx_codec_enc_cfg *cfg = &stream->config.cfg;
+  vpx_codec_iter_t iter = NULL;
+
+  *got_data = 0;
+  while ((pkt = vpx_codec_get_cx_data(&stream->encoder, &iter))) {
+    /* Running size and header position of the IVF frame being assembled
+     * from partitions.
+     * NOTE(review): these are static, so they persist across calls and are
+     * shared by all streams.
+     */
+    static size_t fsize = 0;
+    static int64_t ivf_header_pos = 0;
+
+    switch (pkt->kind) {
+      case VPX_CODEC_CX_FRAME_PKT:
+        /* A fragment is not a complete frame, so it is not counted. */
+        if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) {
+          stream->frames_out++;
+        }
+        if (!global->quiet)
+          fprintf(stderr, " %6luF", (unsigned long)pkt->data.frame.sz);
+
+        update_rate_histogram(stream->rate_hist, cfg, pkt);
+#if CONFIG_WEBM_IO
+        if (stream->config.write_webm) {
+          write_webm_block(&stream->ebml, cfg, pkt);
+        }
+#endif
+        if (!stream->config.write_webm) {
+          if (pkt->data.frame.partition_id <= 0) {
+            /* First (or only) partition: start a new IVF frame and remember
+             * where its header lives so the size can be patched later.
+             */
+            ivf_header_pos = ftello(stream->file);
+            fsize = pkt->data.frame.sz;
+
+            ivf_write_frame_header(stream->file, pkt->data.frame.pts, fsize);
+          } else {
+            fsize += pkt->data.frame.sz;
+
+            /* Last partition of the frame: go back and write the total
+             * frame size, then return to the current position.
+             */
+            if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) {
+              const int64_t currpos = ftello(stream->file);
+              fseeko(stream->file, ivf_header_pos, SEEK_SET);
+              ivf_write_frame_size(stream->file, fsize);
+              fseeko(stream->file, currpos, SEEK_SET);
+            }
+          }
+
+          /* NOTE(review): the fwrite() result is deliberately discarded. */
+          (void) fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                        stream->file);
+        }
+        stream->nbytes += pkt->data.raw.sz;
+
+        *got_data = 1;
+#if CONFIG_DECODERS
+        /* Feed the frame to the test decoder until the first failure. */
+        if (global->test_decode != TEST_DECODE_OFF && !stream->mismatch_seen) {
+          vpx_codec_decode(&stream->decoder, pkt->data.frame.buf,
+                           (unsigned int)pkt->data.frame.sz, NULL, 0);
+          if (stream->decoder.err) {
+            warn_or_exit_on_error(&stream->decoder,
+                                  global->test_decode == TEST_DECODE_FATAL,
+                                  "Failed to decode frame %d in stream %d",
+                                  stream->frames_out + 1, stream->index);
+            stream->mismatch_seen = stream->frames_out + 1;
+          }
+        }
+#endif
+        break;
+      case VPX_CODEC_STATS_PKT:
+        /* First-pass statistics: one packet is counted as one frame. */
+        stream->frames_out++;
+        stats_write(&stream->stats,
+                    pkt->data.twopass_stats.buf,
+                    pkt->data.twopass_stats.sz);
+        stream->nbytes += pkt->data.raw.sz;
+        break;
+#if CONFIG_FP_MB_STATS
+      case VPX_CODEC_FPMB_STATS_PKT:
+        stats_write(&stream->fpmb_stats,
+                    pkt->data.firstpass_mb_stats.buf,
+                    pkt->data.firstpass_mb_stats.sz);
+        stream->nbytes += pkt->data.raw.sz;
+        break;
+#endif
+      case VPX_CODEC_PSNR_PKT:
+
+        if (global->show_psnr) {
+          int i;
+
+          /* Index 0 of sse/samples feeds the overall totals; all four psnr
+           * entries are summed for later averaging.
+           */
+          stream->psnr_sse_total += pkt->data.psnr.sse[0];
+          stream->psnr_samples_total += pkt->data.psnr.samples[0];
+          for (i = 0; i < 4; i++) {
+            if (!global->quiet)
+              fprintf(stderr, "%.3f ", pkt->data.psnr.psnr[i]);
+            stream->psnr_totals[i] += pkt->data.psnr.psnr[i];
+          }
+          stream->psnr_count++;
+        }
+
+        break;
+      default:
+        /* Other packet kinds are ignored. */
+        break;
+    }
+  }
+}
+
+
+static void show_psnr(struct stream_state  *stream, double peak) {
+  /* Prints nothing unless at least one PSNR packet was accumulated. */
+  double overall;
+  int plane;
+
+  if (stream->psnr_count == 0)
+    return;
+
+  fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
+  overall = sse_to_psnr((double)stream->psnr_samples_total, peak,
+                        (double)stream->psnr_sse_total);
+  fprintf(stderr, " %.3f", overall);
+  for (plane = 0; plane < 4; plane++) {
+    fprintf(stderr, " %.3f", stream->psnr_totals[plane] / stream->psnr_count);
+  }
+  fprintf(stderr, "\n");
+}
+
+
+static float usec_to_fps(uint64_t usec, unsigned int frames) {  /* 0 if no time */
+  return usec == 0 ? 0.0f : (float)(frames * 1000000.0 / (float)usec);
+}
+
+static void test_decode(struct stream_state  *stream,
+                        enum TestDecodeFatality fatal,
+                        const VpxInterface *codec) {
+  vpx_image_t enc_img, dec_img;
+  /* A stream that already recorded a mismatch is not checked again. */
+  if (stream->mismatch_seen)
+    return;
+
+  /* Get the internal reference frame */
+  if (strcmp(codec->name, "vp8") == 0) {
+    struct vpx_ref_frame ref_enc, ref_dec;
+    int width, height;
+    /* Round both dimensions up to a multiple of 16 for the reference copies. */
+    width = (stream->config.cfg.g_w + 15) & ~15;
+    height = (stream->config.cfg.g_h + 15) & ~15;
+    vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, width, height, 1);
+    enc_img = ref_enc.img;
+    vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, width, height, 1);
+    dec_img = ref_dec.img;
+
+    ref_enc.frame_type = VP8_LAST_FRAME;
+    ref_dec.frame_type = VP8_LAST_FRAME;
+    vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &ref_enc);
+    vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &ref_dec);
+  } else {
+    struct vp9_ref_frame ref_enc, ref_dec;
+    /* Every codec other than "vp8" takes this path: fetch reference index 0. */
+    ref_enc.idx = 0;
+    ref_dec.idx = 0;
+    vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref_enc);
+    enc_img = ref_enc.img;
+    vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref_dec);
+    dec_img = ref_dec.img;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) !=
+        (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) {
+      if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+        vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+                      enc_img.d_w, enc_img.d_h, 16);
+        vpx_img_truncate_16_to_8(&enc_img, &ref_enc.img);
+      }
+      if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+        vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+                      dec_img.d_w, dec_img.d_h, 16);
+        vpx_img_truncate_16_to_8(&dec_img, &ref_dec.img);
+      }
+    }
+#endif
+  }
+  ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame");
+  ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame");
+  /* On a mismatch, y/u/v receive the four values printed per plane below. */
+  if (!compare_img(&enc_img, &dec_img)) {
+    int y[4], u[4], v[4];
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+      find_mismatch_high(&enc_img, &dec_img, y, u, v);
+    } else {
+      find_mismatch(&enc_img, &dec_img, y, u, v);
+    }
+#else
+    find_mismatch(&enc_img, &dec_img, y, u, v);
+#endif
+    stream->decoder.err = 1;
+    warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL,
+                          "Stream %d: Encode/decode mismatch on frame %d at"
+                          " Y[%d, %d] {%d/%d},"
+                          " U[%d, %d] {%d/%d},"
+                          " V[%d, %d] {%d/%d}",
+                          stream->index, stream->frames_out,
+                          y[0], y[1], y[2], y[3],
+                          u[0], u[1], u[2], u[3],
+                          v[0], v[1], v[2], v[3]);
+    stream->mismatch_seen = stream->frames_out;
+  }
+
+  vpx_img_free(&enc_img);
+  vpx_img_free(&dec_img);
+}
+
+
+static void print_time(const char *label, int64_t etl) {
+  /* Writes "[label h:mm:ss] " to stderr; a negative etl prints "unknown". */
+  int64_t hours;
+  int64_t mins;
+  int64_t secs;
+
+  if (etl < 0) {
+    fprintf(stderr, "[%3s  unknown] ", label);
+    return;
+  }
+
+  hours = etl / 3600;
+  mins = (etl % 3600) / 60;
+  secs = etl % 60;
+
+  fprintf(stderr, "[%3s %2"PRId64":%02"PRId64":%02"PRId64"] ",
+          label, hours, mins, secs);
+}
+
+
+int main(int argc, const char **argv_) {
+  int pass;
+  vpx_image_t raw;
+#if CONFIG_VP9_HIGHBITDEPTH
+  vpx_image_t raw_shift;
+  int allocated_raw_shift = 0;
+  int use_16bit_internal = 0;
+  int input_shift = 0;
+#endif
+  int frame_avail, got_data;
+
+  struct VpxInputContext input;
+  struct VpxEncoderConfig global;
+  struct stream_state *streams = NULL;
+  char **argv, **argi;
+  uint64_t cx_time = 0;
+  int stream_cnt = 0;
+  int res = 0;
+  /* Entry point: parse the command line, then run each requested pass. */
+  memset(&input, 0, sizeof(input));
+  exec_name = argv_[0];
+
+  if (argc < 3)
+    usage_exit();
+
+  /* Setup default input stream settings */
+  input.framerate.numerator = 30;
+  input.framerate.denominator = 1;
+  input.only_i420 = 1;
+  input.bit_depth = 0;
+
+  /* First parse the global configuration values, because we want to apply
+   * other parameters on top of the default configuration provided by the
+   * codec.
+   */
+  argv = argv_dup(argc - 1, argv_ + 1);
+  parse_global_config(&global, argv);
+
+  switch (global.color_type) {
+    case I420:
+      input.fmt = VPX_IMG_FMT_I420;
+      break;
+    case I422:
+      input.fmt = VPX_IMG_FMT_I422;
+      break;
+    case I444:
+      input.fmt = VPX_IMG_FMT_I444;
+      break;
+    case I440:
+      input.fmt = VPX_IMG_FMT_I440;
+      break;
+    case YV12:
+      input.fmt = VPX_IMG_FMT_YV12;
+      break;
+  }
+
+  {
+    /* Now parse each stream's parameters. Using a local scope here
+     * due to the use of 'stream' as loop variable in FOREACH_STREAM
+     * loops
+     */
+    struct stream_state *stream = NULL;
+
+    do {
+      stream = new_stream(&global, stream);
+      stream_cnt++;
+      if (!streams)
+        streams = stream;
+    } while (parse_stream_params(&global, stream, argv));
+  }
+
+  /* Check for unrecognized options */
+  for (argi = argv; *argi; argi++)
+    if (argi[0][0] == '-' && argi[0][1])
+      die("Error: Unrecognized option %s\n", *argi);
+
+  FOREACH_STREAM(check_encoder_config(global.disable_warning_prompt,
+                                      &global, &stream->config.cfg););
+
+  /* Handle non-option arguments */
+  input.filename = argv[0];
+
+  if (!input.filename)
+    usage_exit();
+
+  /* Decide if other chroma subsamplings than 4:2:0 are supported */
+  if (global.codec->fourcc == VP9_FOURCC || global.codec->fourcc == VP10_FOURCC)
+    input.only_i420 = 0;
+  /* With --pass=N only that pass runs: start there, break after one round. */
+  for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
+    int frames_in = 0, seen_frames = 0;
+    int64_t estimated_time_left = -1;
+    int64_t average_rate = -1;
+    int64_t lagged_count = 0;
+
+    open_input_file(&input);
+
+    /* If the input file doesn't specify its w/h (raw files), try to get
+     * the data from the first stream's configuration.
+     */
+    if (!input.width || !input.height) {
+      FOREACH_STREAM({
+        if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
+          input.width = stream->config.cfg.g_w;
+          input.height = stream->config.cfg.g_h;
+          break;
+        }
+      });
+    }
+
+    /* Update stream configurations from the input file's parameters */
+    if (!input.width || !input.height)
+      fatal("Specify stream dimensions with --width (-w) "
+            " and --height (-h)");
+
+    /* If input file does not specify bit-depth but input-bit-depth parameter
+     * exists, assume that to be the input bit-depth. However, if the
+     * input-bit-depth parameter does not exist, assume the input bit-depth
+     * to be the same as the codec bit-depth.
+     */
+    if (!input.bit_depth) {
+      FOREACH_STREAM({
+        if (stream->config.cfg.g_input_bit_depth)
+          input.bit_depth = stream->config.cfg.g_input_bit_depth;
+        else
+          input.bit_depth = stream->config.cfg.g_input_bit_depth =
+              (int)stream->config.cfg.g_bit_depth;
+      });
+      if (input.bit_depth > 8) input.fmt |= VPX_IMG_FMT_HIGHBITDEPTH;
+    } else {
+      FOREACH_STREAM({
+        stream->config.cfg.g_input_bit_depth = input.bit_depth;
+      });
+    }
+
+    FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height));
+    FOREACH_STREAM(validate_stream_config(stream, &global));
+
+    /* Ensure that --passes and --pass are consistent. If --pass is set and
+     * --passes=2, ensure --fpf was set.
+     */
+    if (global.pass && global.passes == 2)
+      FOREACH_STREAM( {
+      if (!stream->config.stats_fn)
+        die("Stream %d: Must specify --fpf when --pass=%d"
+        " and --passes=2\n", stream->index, global.pass);
+    });
+
+#if !CONFIG_WEBM_IO
+    FOREACH_STREAM({
+      if (stream->config.write_webm) {
+        stream->config.write_webm = 0;
+        warn("vpxenc was compiled without WebM container support."
+             " Producing IVF output");
+      }
+    });
+#endif
+
+    /* Use the frame rate from the file only if none was specified
+     * on the command-line.
+     */
+    if (!global.have_framerate) {
+      global.framerate.num = input.framerate.numerator;
+      global.framerate.den = input.framerate.denominator;
+      FOREACH_STREAM(stream->config.cfg.g_timebase.den = global.framerate.num;
+                     stream->config.cfg.g_timebase.num = global.framerate.den);
+    }
+
+    FOREACH_STREAM(set_default_kf_interval(stream, &global));
+
+    /* Show configuration */
+    if (global.verbose && pass == 0)
+      FOREACH_STREAM(show_stream_config(stream, &global, &input));
+
+    if (pass == (global.pass ? global.pass - 1 : 0)) {
+      if (input.file_type == FILE_TYPE_Y4M)
+        /*The Y4M reader does its own allocation.
+          Just initialize this here to avoid problems if we never read any
+           frames.*/
+        memset(&raw, 0, sizeof(raw));
+      else
+        vpx_img_alloc(&raw, input.fmt, input.width, input.height, 32);
+
+      FOREACH_STREAM(stream->rate_hist =
+                         init_rate_histogram(&stream->config.cfg,
+                                             &global.framerate));
+    }
+
+    FOREACH_STREAM(setup_pass(stream, &global, pass));
+    FOREACH_STREAM(open_output_file(stream, &global,
+                                    &input.pixel_aspect_ratio));
+    FOREACH_STREAM(initialize_encoder(stream, &global));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (strcmp(global.codec->name, "vp9") == 0 ||
+        strcmp(global.codec->name, "vp10") == 0) {
+      // Check to see if at least one stream uses 16 bit internal.
+      // Currently assume that the bit_depths for all streams using
+      // highbitdepth are the same.
+      FOREACH_STREAM({
+        if (stream->config.use_16bit_internal) {
+          use_16bit_internal = 1;
+        }
+        if (stream->config.cfg.g_profile == 0) {
+          input_shift = 0;
+        } else {
+          input_shift = (int)stream->config.cfg.g_bit_depth -
+              stream->config.cfg.g_input_bit_depth;
+        }
+      });
+    }
+#endif
+
+    frame_avail = 1;
+    got_data = 0;
+    /* Loop until no input frame is left and get_cx_data() reports no data. */
+    while (frame_avail || got_data) {
+      struct vpx_usec_timer timer;
+
+      if (!global.limit || frames_in < global.limit) {
+        frame_avail = read_frame(&input, &raw);
+
+        if (frame_avail)
+          frames_in++;
+        seen_frames = frames_in > global.skip_frames ?
+                          frames_in - global.skip_frames : 0;
+
+        if (!global.quiet) {
+          float fps = usec_to_fps(cx_time, seen_frames);
+          fprintf(stderr, "\rPass %d/%d ", pass + 1, global.passes);
+
+          if (stream_cnt == 1)
+            fprintf(stderr,
+                    "frame %4d/%-4d %7"PRId64"B ",
+                    frames_in, streams->frames_out, (int64_t)streams->nbytes);
+          else
+            fprintf(stderr, "frame %4d ", frames_in);
+
+          fprintf(stderr, "%7"PRId64" %s %.2f %s ",
+                  cx_time > 9999999 ? cx_time / 1000 : cx_time,
+                  cx_time > 9999999 ? "ms" : "us",
+                  fps >= 1.0 ? fps : fps * 60,
+                  fps >= 1.0 ? "fps" : "fpm");
+          print_time("ETA", estimated_time_left);
+        }
+
+      } else
+        frame_avail = 0;
+
+      if (frames_in > global.skip_frames) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        vpx_image_t *frame_to_encode;
+        if (input_shift || (use_16bit_internal && input.bit_depth == 8)) {
+          assert(use_16bit_internal);
+          // Input bit depth and stream bit depth do not match, so up
+          // shift frame to stream bit depth
+          if (!allocated_raw_shift) {
+            vpx_img_alloc(&raw_shift, raw.fmt | VPX_IMG_FMT_HIGHBITDEPTH,
+                          input.width, input.height, 32);
+            allocated_raw_shift = 1;
+          }
+          vpx_img_upshift(&raw_shift, &raw, input_shift);
+          frame_to_encode = &raw_shift;
+        } else {
+          frame_to_encode = &raw;
+        }
+        vpx_usec_timer_start(&timer);
+        if (use_16bit_internal) {
+          assert(frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH);
+          FOREACH_STREAM({
+            if (stream->config.use_16bit_internal)
+              encode_frame(stream, &global,
+                           frame_avail ? frame_to_encode : NULL,
+                           frames_in);
+            else
+              assert(0);
+          });
+        } else {
+          assert((frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH) == 0);
+          FOREACH_STREAM(encode_frame(stream, &global,
+                                      frame_avail ? frame_to_encode : NULL,
+                                      frames_in));
+        }
+#else
+        vpx_usec_timer_start(&timer);
+        FOREACH_STREAM(encode_frame(stream, &global,
+                                    frame_avail ? &raw : NULL,
+                                    frames_in));
+#endif
+        vpx_usec_timer_mark(&timer);
+        cx_time += vpx_usec_timer_elapsed(&timer);
+
+        FOREACH_STREAM(update_quantizer_histogram(stream));
+
+        got_data = 0;
+        FOREACH_STREAM(get_cx_data(stream, &global, &got_data));
+        /* The ETA bookkeeping below only runs when input.length is nonzero. */
+        if (!got_data && input.length && streams != NULL &&
+            !streams->frames_out) {
+          lagged_count = global.limit ? seen_frames : ftello(input.file);
+        } else if (input.length) {
+          int64_t remaining;
+          int64_t rate;
+
+          if (global.limit) {
+            const int64_t frame_in_lagged = (seen_frames - lagged_count) * 1000;
+
+            rate = cx_time ? frame_in_lagged * (int64_t)1000000 / cx_time : 0;
+            remaining = 1000 * (global.limit - global.skip_frames
+                                - seen_frames + lagged_count);
+          } else {
+            const int64_t input_pos = ftello(input.file);
+            const int64_t input_pos_lagged = input_pos - lagged_count;
+            const int64_t limit = input.length;
+
+            rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0;
+            remaining = limit - input_pos + lagged_count;
+          }
+
+          average_rate = (average_rate <= 0)
+              ? rate
+              : (average_rate * 7 + rate) / 8;
+          estimated_time_left = average_rate ? remaining / average_rate : -1;
+        }
+
+        if (got_data && global.test_decode != TEST_DECODE_OFF)
+          FOREACH_STREAM(test_decode(stream, global.test_decode, global.codec));
+      }
+
+      fflush(stdout);
+      if (!global.quiet)
+        fprintf(stderr, "\033[K");
+    }
+
+    if (stream_cnt > 1)
+      fprintf(stderr, "\n");
+
+    if (!global.quiet) {
+      FOREACH_STREAM(fprintf(stderr,
+          "\rPass %d/%d frame %4d/%-4d %7"PRId64"B %7"PRId64"b/f %7"PRId64"b/s"
+          " %7"PRId64" %s (%.2f fps)\033[K\n",
+          pass + 1,
+          global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes,
+          seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0,
+          seen_frames ? (int64_t)stream->nbytes * 8 *
+              (int64_t)global.framerate.num / global.framerate.den /
+              seen_frames : 0,
+          stream->cx_time > 9999999 ? stream->cx_time / 1000 : stream->cx_time,
+          stream->cx_time > 9999999 ? "ms" : "us",
+          usec_to_fps(stream->cx_time, seen_frames)));
+    }
+
+    if (global.show_psnr) {
+      if (global.codec->fourcc == VP9_FOURCC) {
+        FOREACH_STREAM(
+            show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1));
+      } else {
+        FOREACH_STREAM(show_psnr(stream, 255.0));
+      }
+    }
+
+    FOREACH_STREAM(vpx_codec_destroy(&stream->encoder));
+
+    if (global.test_decode != TEST_DECODE_OFF) {
+      FOREACH_STREAM(vpx_codec_destroy(&stream->decoder));
+    }
+
+    close_input_file(&input);
+
+    if (global.test_decode == TEST_DECODE_FATAL) {
+      FOREACH_STREAM(res |= stream->mismatch_seen);
+    }
+    FOREACH_STREAM(close_output_file(stream, global.codec->fourcc));
+
+    FOREACH_STREAM(stats_close(&stream->stats, global.passes - 1));
+
+#if CONFIG_FP_MB_STATS
+    FOREACH_STREAM(stats_close(&stream->fpmb_stats, global.passes - 1));
+#endif
+
+    if (global.pass)
+      break;
+  }
+
+  if (global.show_q_hist_buckets)
+    FOREACH_STREAM(show_q_histogram(stream->counts,
+                                    global.show_q_hist_buckets));
+
+  if (global.show_rate_hist_buckets)
+    FOREACH_STREAM(show_rate_histogram(stream->rate_hist,
+                                       &stream->config.cfg,
+                                       global.show_rate_hist_buckets));
+  FOREACH_STREAM(destroy_rate_histogram(stream->rate_hist));
+
+#if CONFIG_INTERNAL_STATS
+  /* TODO(jkoleszar): This doesn't belong in this executable. Do it for now,
+   * to match some existing utilities.
+   */
+  if (!(global.pass == 1 && global.passes == 2))
+    FOREACH_STREAM({
+      FILE *f = fopen("opsnr.stt", "a");
+      if (f != NULL && stream->mismatch_seen) {
+        fprintf(f, "First mismatch occurred in frame %d\n",
+                stream->mismatch_seen);
+      } else if (f != NULL) {
+        fprintf(f, "No mismatch detected in recon buffers\n");
+      }
+      if (f != NULL) fclose(f);  /* fopen may fail; the note is best effort */
+    });
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (allocated_raw_shift)
+    vpx_img_free(&raw_shift);
+#endif
+  vpx_img_free(&raw);
+  free(argv);
+  free(streams);
+  return res ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/libs/libvpx/vpxenc.h b/libs/libvpx/vpxenc.h
new file mode 100644
index 0000000000..d867e9d954
--- /dev/null
+++ b/libs/libvpx/vpxenc.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPXENC_H_
+#define VPXENC_H_
+
+#include "vpx/vpx_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum TestDecodeFatality {
+  TEST_DECODE_OFF,    // vpxenc skips the decoder round-trip checks
+  TEST_DECODE_FATAL,  // checks run; vpxenc passes the "fatal" flag on mismatch
+  TEST_DECODE_WARN,   // checks run; the "fatal" flag is not set on mismatch
+};
+
+typedef enum {
+  I420,  // 4:2:0 8+ bit-depth
+  I422,  // 4:2:2 8+ bit-depth
+  I444,  // 4:4:4 8+ bit-depth
+  I440,  // 4:4:0 8+ bit-depth
+  YV12,  // 4:2:0 with uv flipped, only 8-bit depth
+} ColorInputType;  // vpxenc's main() maps each value to a VPX_IMG_FMT_* format
+
+struct VpxInterface;
+
+/* Configuration elements common to all streams. */
+struct VpxEncoderConfig {
+  const struct VpxInterface *codec;
+  int passes;  // vpxenc's pass loop runs while pass < passes
+  int pass;  // nonzero: run only pass number `pass` (1-based); 0: run them all
+  int usage;
+  int deadline;  // compared against VPX_DL_REALTIME by check_encoder_config()
+  ColorInputType color_type;  // selects the raw input image format
+  int quiet;  // nonzero suppresses vpxenc's progress output on stderr
+  int verbose;  // nonzero prints each stream's configuration on pass 0
+  int limit;  // nonzero: stop reading input after this many frames
+  int skip_frames;  // input frames read before encoding starts
+  int show_psnr;  // nonzero accumulates and prints PSNR figures
+  enum TestDecodeFatality test_decode;
+  int have_framerate;  // zero: vpxenc takes the frame rate from the input file
+  struct vpx_rational framerate;
+  int out_part;
+  int debug;
+  int show_q_hist_buckets;  // nonzero: print the quantizer histogram
+  int show_rate_hist_buckets;  // nonzero: print the rate histogram
+  int disable_warnings;
+  int disable_warning_prompt;  // passed as check_encoder_config()'s first arg
+  int experimental_bitstream;
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPXENC_H_
diff --git a/libs/libvpx/vpxstats.c b/libs/libvpx/vpxstats.c
new file mode 100644
index 0000000000..16728ce096
--- /dev/null
+++ b/libs/libvpx/vpxstats.c
@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpxstats.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./tools_common.h"
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
+  int res;  /* nonzero on success */
+  stats->pass = pass;
+  /* Pass 0 creates fpf for writing; any other pass loads it into memory. */
+  if (pass == 0) {
+    stats->file = fopen(fpf, "wb");
+    stats->buf.sz = 0;
+    stats->buf.buf = NULL;
+    res = (stats->file != NULL);
+  } else {
+    size_t nbytes;
+    long file_sz;
+    stats->file = fopen(fpf, "rb");
+
+    if (stats->file == NULL)
+      fatal("First-pass stats file does not exist!");
+
+    if (fseek(stats->file, 0, SEEK_END))
+      fatal("First-pass stats file must be seekable!");
+    file_sz = ftell(stats->file);  /* -1L on failure; never use it as a size */
+    if (file_sz < 0) fatal("First-pass stats file must be seekable!");
+    stats->buf.sz = stats->buf_alloc_sz = (size_t)file_sz;
+    rewind(stats->file);
+    stats->buf.buf = malloc(stats->buf_alloc_sz);
+
+    if (!stats->buf.buf)
+      fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
+            (unsigned long)stats->buf_alloc_sz);
+
+    nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
+    res = (nbytes == stats->buf.sz);
+  }
+
+  return res;
+}
+
+int stats_open_mem(stats_io_t *stats, int pass) {
+  /* Pass 0 allocates the initial 64 KiB buffer; later passes reuse whatever
+   * buffer the stats object already holds. Returns nonzero on success. */
+  stats->pass = pass;
+  if (pass == 0) {
+    stats->buf_alloc_sz = 64 * 1024;
+    stats->buf.buf = malloc(stats->buf_alloc_sz);
+    stats->buf.sz = 0;
+  }
+
+  /* Rewind the cursor; there is nothing to use when the buffer is NULL. */
+  stats->buf_ptr = stats->buf.buf;
+  return stats->buf.buf != NULL;
+}
+
+void stats_close(stats_io_t *stats, int last_pass) {
+  /* The stats buffer is released only when the final pass is closed,
+   * whether or not the stats are file-backed.
+   */
+  if (stats->pass == last_pass)
+    free(stats->buf.buf);
+
+  /* File-backed stats additionally close and forget their stream. */
+  if (stats->file) {
+    fclose(stats->file);
+    stats->file = NULL;
+  }
+}
+
+void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
+  if (stats->file) {  /* file-backed: append straight to the file */
+    (void) fwrite(pkt, 1, len, stats->file);
+  } else {  /* memory-backed: append at buf_ptr, growing the buffer if needed */
+    if (stats->buf.sz + len > stats->buf_alloc_sz) {
+      size_t  new_sz = stats->buf.sz + len + 64 * 1024;  /* always fits len */
+      char   *new_ptr = realloc(stats->buf.buf, new_sz);
+
+      if (new_ptr) {
+        stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
+        stats->buf.buf = new_ptr;
+        stats->buf_alloc_sz = new_sz;
+      } else {
+        fatal("Failed to realloc firstpass stats buffer.");
+      }
+    }
+    /* Here the buffer has room for len more bytes after the current data. */
+    memcpy(stats->buf_ptr, pkt, len);
+    stats->buf.sz += len;
+    stats->buf_ptr += len;
+  }
+}
+
+vpx_fixed_buf_t stats_get(stats_io_t *stats) {
+  return stats->buf;  /* struct returned by value; no stats data is copied */
+}
diff --git a/libs/libvpx/vpxstats.h b/libs/libvpx/vpxstats.h
new file mode 100644
index 0000000000..5c9ea34f71
--- /dev/null
+++ b/libs/libvpx/vpxstats.h
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPXSTATS_H_
+#define VPXSTATS_H_
+
+#include <stdio.h>
+
+#include "vpx/vpx_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This structure is used to abstract the different ways of handling
+ * first pass statistics
+ */
+typedef struct {
+  vpx_fixed_buf_t buf;  /* stats data pointer (buf) and bytes held (sz) */
+  int pass;             /* pass index given to stats_open_file/_mem() */
+  FILE *file;           /* when non-NULL, stats_write() writes here instead */
+  char *buf_ptr;        /* cursor inside buf.buf used by in-memory writes */
+  size_t buf_alloc_sz;  /* bytes currently allocated for buf.buf */
+} stats_io_t;
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass);
+int stats_open_mem(stats_io_t *stats, int pass);
+void stats_close(stats_io_t *stats, int last_pass);
+void stats_write(stats_io_t *stats, const void *pkt, size_t len);
+vpx_fixed_buf_t stats_get(stats_io_t *stats);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPXSTATS_H_
diff --git a/libs/libvpx/warnings.c b/libs/libvpx/warnings.c
new file mode 100644
index 0000000000..7ac678ab4a
--- /dev/null
+++ b/libs/libvpx/warnings.c
@@ -0,0 +1,115 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./warnings.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vpx_encoder.h"
+
+#include "./tools_common.h"
+#include "./vpxenc.h"
+
+static const char quantizer_warning_string[] =
+    "Bad quantizer values. Quantizer values should not be equal, and should "
+    "differ by at least 8.";
+static const char lag_in_frames_with_realtime[] =
+    "Lag in frames is ignored when deadline is set to realtime.";
+
+struct WarningListNode {
+  const char *warning_string;  /* borrowed text; not freed with the node */
+  struct WarningListNode *next_warning;  /* NULL on the last node */
+};
+
+struct WarningList {
+  struct WarningListNode *warning_node;  /* first node; NULL when empty */
+};
+
+static void add_warning(const char *warning_string,
+                        struct WarningList *warning_list) {
+  /* Appends at the tail; the node borrows warning_string (no copy is made). */
+  struct WarningListNode **link;
+  struct WarningListNode *const fresh = malloc(sizeof(*fresh));
+  if (!fresh) {
+    fatal("Unable to allocate warning node.");
+  }
+
+  fresh->next_warning = NULL;
+  fresh->warning_string = warning_string;
+  /* Walk to the first NULL link so warnings keep their insertion order. */
+  link = &warning_list->warning_node;
+  while (*link)
+    link = &(*link)->next_warning;
+  *link = fresh;
+}
+
+static void free_warning_list(struct WarningList *warning_list) {
+  struct WarningListNode *head;  /* pop and free nodes until the list is empty */
+  while ((head = warning_list->warning_node) != NULL) {
+    warning_list->warning_node = head->next_warning;
+    free(head);
+  }
+}
+
+static int continue_prompt(int num_warnings) {
+  /* Only a lowercase 'y' as the first character read from stdin continues;
+   * anything else, including EOF, counts as a refusal. */
+  fprintf(stderr,
+          "%d encoder configuration warning(s). Continue? (y to continue) ",
+          num_warnings);
+  return getchar() == 'y';
+}
+
+static void check_quantizer(int min_q, int max_q,
+                            struct WarningList *warning_list) {
+  const int gap = abs(max_q - min_q);  /* a gap of 0 covers min_q == max_q */
+  if ((min_q != 0 || max_q != 0) && gap < 8)  /* 0/0 (lossless) is exempt */
+    add_warning(quantizer_warning_string, warning_list);
+}
+
+static void check_lag_in_frames_realtime_deadline(
+    int lag_in_frames,
+    int deadline,
+    struct WarningList *warning_list) {
+  if (lag_in_frames && deadline == VPX_DL_REALTIME)  /* warn: lag is ignored */
+    add_warning(lag_in_frames_with_realtime, warning_list);
+}
+
+void check_encoder_config(int disable_prompt,
+                          const struct VpxEncoderConfig *global_config,
+                          const struct vpx_codec_enc_cfg *stream_config) {
+  int num_warnings = 0;
+  struct WarningListNode *warning = NULL;
+  struct WarningList warning_list = {0};
+  /* Run every check; each one appends to warning_list when it fires. */
+  check_quantizer(stream_config->rc_min_quantizer,
+                  stream_config->rc_max_quantizer,
+                  &warning_list);
+  check_lag_in_frames_realtime_deadline(stream_config->g_lag_in_frames,
+                                        global_config->deadline,
+                                        &warning_list);
+  /* Count and print warnings. */
+  for (warning = warning_list.warning_node;
+       warning != NULL;
+       warning = warning->next_warning,
+       ++num_warnings) {
+    warn("%s", warning->warning_string);  /* text is data, never the format */
+  }
+
+  free_warning_list(&warning_list);
+  /* With warnings, exit unless the prompt is disabled or answered with 'y'. */
+  if (num_warnings) {
+    if (!disable_prompt && !continue_prompt(num_warnings))
+      exit(EXIT_FAILURE);
+  }
+}
diff --git a/libs/libvpx/warnings.h b/libs/libvpx/warnings.h
new file mode 100644
index 0000000000..6b8ae6796f
--- /dev/null
+++ b/libs/libvpx/warnings.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef WARNINGS_H_
+#define WARNINGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct vpx_codec_enc_cfg;
+struct VpxEncoderConfig;
+
+/*
+ * Checks config for improperly used settings. Warns user upon encountering
+ * settings that will lead to poor output quality. Prompts user to continue
+ * when warnings are issued.
+ */
+void check_encoder_config(int disable_prompt,
+                          const struct VpxEncoderConfig *global_config,
+                          const struct vpx_codec_enc_cfg *stream_config);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WARNINGS_H_
diff --git a/libs/libvpx/webmdec.cc b/libs/libvpx/webmdec.cc
new file mode 100644
index 0000000000..f541cfecc1
--- /dev/null
+++ b/libs/libvpx/webmdec.cc
@@ -0,0 +1,231 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./webmdec.h"
+
+#include <cstring>
+#include <cstdio>
+
+#include "third_party/libwebm/mkvparser.hpp"
+#include "third_party/libwebm/mkvreader.hpp"
+
+namespace {
+
+// Frees the reader, segment and frame buffer owned by |webm_ctx| and returns
+// its parsing state to zero/NULL. Called on every file_is_webm() failure path
+// and by webm_free(). |reached_eos| is the one field left untouched.
+void reset(struct WebmInputContext *const webm_ctx) {
+  // The context stores the parser objects as void*, so restore their real
+  // types before deleting. Deleting a null pointer is a no-op, which makes
+  // explicit NULL guards unnecessary.
+  delete static_cast<mkvparser::MkvReader*>(webm_ctx->reader);
+  delete static_cast<mkvparser::Segment*>(webm_ctx->segment);
+  delete[] webm_ctx->buffer;
+
+  // Owned resources.
+  webm_ctx->reader = NULL;
+  webm_ctx->segment = NULL;
+  webm_ctx->buffer = NULL;
+
+  // Read cursor and per-frame metadata.
+  webm_ctx->cluster = NULL;
+  webm_ctx->block_entry = NULL;
+  webm_ctx->block = NULL;
+  webm_ctx->block_frame_index = 0;
+  webm_ctx->video_track_index = 0;
+  webm_ctx->timestamp_ns = 0;
+  webm_ctx->is_key_frame = false;
+}
+
+// Points the context's cluster cursor at the first cluster of its segment.
+void get_first_cluster(struct WebmInputContext *const webm_ctx) {
+  mkvparser::Segment *const parsed_segment =
+      static_cast<mkvparser::Segment*>(webm_ctx->segment);
+  webm_ctx->cluster = parsed_segment->GetFirst();
+}
+
+void rewind_and_reset(struct WebmInputContext *const webm_ctx,
+                      struct VpxInputContext *const vpx_ctx) {
+  rewind(vpx_ctx->file);  // Seek the input file back to its first byte.
+  reset(webm_ctx);  // Then free and clear the WebM parsing state.
+}
+
+}  // namespace
+
+int file_is_webm(struct WebmInputContext *webm_ctx,
+                 struct VpxInputContext *vpx_ctx) {  // Returns 1 for WebM, else 0.
+  mkvparser::MkvReader *const reader = new mkvparser::MkvReader(vpx_ctx->file);
+  webm_ctx->reader = reader;
+  webm_ctx->reached_eos = 0;  // Other fields are assumed pre-zeroed - confirm.
+  // NOTE(review): failure paths delete webm_ctx->segment/buffer via reset().
+  mkvparser::EBMLHeader header;
+  long long pos = 0;
+  if (header.Parse(reader, pos) < 0) {  // No parsable EBML header: not WebM.
+    rewind_and_reset(webm_ctx, vpx_ctx);
+    return 0;
+  }
+  // Create and load the segment; a failure of either step rejects the file.
+  mkvparser::Segment* segment;
+  if (mkvparser::Segment::CreateInstance(reader, pos, segment)) {
+    rewind_and_reset(webm_ctx, vpx_ctx);
+    return 0;
+  }
+  webm_ctx->segment = segment;  // From here on reset() deletes the segment.
+  if (segment->Load() < 0) {
+    rewind_and_reset(webm_ctx, vpx_ctx);
+    return 0;
+  }
+  // Select the first track of type kVideo and remember its track number.
+  const mkvparser::Tracks *const tracks = segment->GetTracks();
+  const mkvparser::VideoTrack* video_track = NULL;
+  for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) {
+    const mkvparser::Track* const track = tracks->GetTrackByIndex(i);
+    if (track->GetType() == mkvparser::Track::kVideo) {
+      video_track = static_cast<const mkvparser::VideoTrack*>(track);
+      webm_ctx->video_track_index = track->GetNumber();
+      break;
+    }
+  }
+  // Without a video track that carries a codec id the file is rejected.
+  if (video_track == NULL || video_track->GetCodecId() == NULL) {
+    rewind_and_reset(webm_ctx, vpx_ctx);
+    return 0;
+  }
+  // Translate the codec id prefix into the matching fourcc.
+  if (!strncmp(video_track->GetCodecId(), "V_VP8", 5)) {
+    vpx_ctx->fourcc = VP8_FOURCC;
+  } else if (!strncmp(video_track->GetCodecId(), "V_VP9", 5)) {
+    vpx_ctx->fourcc = VP9_FOURCC;
+  } else if (!strncmp(video_track->GetCodecId(), "V_VP10", 6)) {
+    vpx_ctx->fourcc = VP10_FOURCC;
+  } else {
+    rewind_and_reset(webm_ctx, vpx_ctx);
+    return 0;
+  }
+  // The frame rate is left at 0/0 here; only the dimensions are filled in.
+  vpx_ctx->framerate.denominator = 0;
+  vpx_ctx->framerate.numerator = 0;
+  vpx_ctx->width = static_cast<uint32_t>(video_track->GetWidth());
+  vpx_ctx->height = static_cast<uint32_t>(video_track->GetHeight());
+  // Start reading at the segment's first cluster.
+  get_first_cluster(webm_ctx);
+
+  return 1;
+}
+
+// Reads the next frame of the selected video track into |*buffer|, growing the
+// buffer when needed. Returns 0 on success, 1 at end of stream, -1 on error.
+int webm_read_frame(struct WebmInputContext *webm_ctx,
+                    uint8_t **buffer,
+                    size_t *bytes_in_buffer,
+                    size_t *buffer_size) {
+  // This check is needed for frame parallel decoding, in which case this
+  // function could be called even after it has reached end of input stream.
+  if (webm_ctx->reached_eos) return 1;
+  mkvparser::Segment *const segment =
+      reinterpret_cast<mkvparser::Segment*>(webm_ctx->segment);
+  const mkvparser::Cluster* cluster =
+      reinterpret_cast<const mkvparser::Cluster*>(webm_ctx->cluster);
+  const mkvparser::Block *block =
+      reinterpret_cast<const mkvparser::Block*>(webm_ctx->block);
+  const mkvparser::BlockEntry *block_entry =
+      reinterpret_cast<const mkvparser::BlockEntry*>(webm_ctx->block_entry);
+  bool block_entry_eos = false;
+  do {
+    long status = 0;
+    bool get_new_block = false;
+    if (block_entry == NULL && !block_entry_eos) {
+      // No current entry yet: start at the head of the current cluster.
+      status = cluster->GetFirst(block_entry);
+      get_new_block = true;
+    } else if (block_entry_eos || block_entry->EOS()) {
+      // The current cluster is exhausted: move on to the next one.
+      cluster = segment->GetNext(cluster);
+      if (cluster == NULL || cluster->EOS()) {
+        *bytes_in_buffer = 0;
+        webm_ctx->reached_eos = 1;
+        return 1;
+      }
+      status = cluster->GetFirst(block_entry);
+      block_entry_eos = false;
+      get_new_block = true;
+    } else if (block == NULL ||
+               webm_ctx->block_frame_index == block->GetFrameCount() ||
+               block->GetTrackNumber() != webm_ctx->video_track_index) {
+      // Block fully consumed or from another track: advance in the cluster.
+      status = cluster->GetNext(block_entry, block_entry);
+      if (block_entry == NULL || block_entry->EOS()) {
+        block_entry_eos = true;
+        continue;
+      }
+      get_new_block = true;
+    }
+    // A zero status alone does not prove the entry/block pointers are usable.
+    if (status || block_entry == NULL) return -1;
+    if (get_new_block) {
+      block = block_entry->GetBlock();
+      if (block == NULL) return -1;
+      webm_ctx->block_frame_index = 0;
+    }
+  } while (block_entry_eos ||
+           block->GetTrackNumber() != webm_ctx->video_track_index);
+
+  // Persist the cursor so the next call resumes from here.
+  webm_ctx->cluster = cluster;
+  webm_ctx->block_entry = block_entry;
+  webm_ctx->block = block;
+  const mkvparser::Block::Frame& frame =
+      block->GetFrame(webm_ctx->block_frame_index);
+  ++webm_ctx->block_frame_index;
+  if (frame.len > static_cast<long>(*buffer_size)) {
+    delete[] *buffer;
+    *buffer = new uint8_t[frame.len];
+    if (*buffer == NULL) return -1;
+    *buffer_size = frame.len;
+    webm_ctx->buffer = *buffer;  // Tracked so reset() can free it later.
+  }
+  *bytes_in_buffer = frame.len;
+  webm_ctx->timestamp_ns = block->GetTime(cluster);
+  webm_ctx->is_key_frame = block->IsKey();
+  mkvparser::MkvReader *const reader =
+      reinterpret_cast<mkvparser::MkvReader*>(webm_ctx->reader);
+  return frame.Read(reader, *buffer) ? -1 : 0;
+}
+
+// Estimates the frame rate from the timestamps of up to 50 frames (stopping
+// early once one second is covered), then rewinds the read cursor. Returns 0.
+int webm_guess_framerate(struct WebmInputContext *webm_ctx,
+                         struct VpxInputContext *vpx_ctx) {
+  uint32_t i = 0;
+  uint8_t *buffer = NULL;
+  size_t bytes_in_buffer = 0, buffer_size = 0;
+  while (webm_ctx->timestamp_ns < 1000000000 && i < 50 &&
+         !webm_read_frame(webm_ctx, &buffer, &bytes_in_buffer, &buffer_size)) {
+    ++i;
+  }
+  vpx_ctx->framerate.numerator = (i - 1) * 1000000;  // NOTE: wraps if i == 0.
+  vpx_ctx->framerate.denominator =
+      static_cast<int>(webm_ctx->timestamp_ns / 1000);
+  // webm_read_frame() stored this scratch buffer in webm_ctx->buffer as well;
+  // drop that alias before freeing so webm_free() cannot delete[] it again.
+  if (webm_ctx->buffer == buffer) webm_ctx->buffer = NULL;
+  delete[] buffer;
+  get_first_cluster(webm_ctx);
+  webm_ctx->block = NULL;
+  webm_ctx->block_entry = NULL;
+  webm_ctx->block_frame_index = 0;
+  webm_ctx->timestamp_ns = 0;
+  webm_ctx->reached_eos = 0;
+  return 0;
+}
+
+void webm_free(struct WebmInputContext *webm_ctx) {
+  reset(webm_ctx);  // Deletes the reader, segment and buffer; clears state.
+}
diff --git a/libs/libvpx/webmdec.h b/libs/libvpx/webmdec.h
new file mode 100644
index 0000000000..7d16380355
--- /dev/null
+++ b/libs/libvpx/webmdec.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef WEBMDEC_H_
+#define WEBMDEC_H_
+
+#include "./tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VpxInputContext;
+
+struct WebmInputContext {  // Parser state kept as void* by webmdec.cc.
+  void *reader;  // mkvparser::MkvReader; deleted by reset().
+  void *segment;  // mkvparser::Segment; deleted by reset().
+  uint8_t *buffer;  // Last buffer allocated by webm_read_frame(); reset() frees.
+  const void *cluster;  // Current mkvparser::Cluster being read.
+  const void *block_entry;  // Current mkvparser::BlockEntry in that cluster.
+  const void *block;  // Current mkvparser::Block.
+  int block_frame_index;  // Index of the next frame to read within |block|.
+  int video_track_index;  // Track number of the selected video track.
+  uint64_t timestamp_ns;  // block->GetTime() of the last frame read.
+  int is_key_frame;  // block->IsKey() of the last frame read.
+  int reached_eos;  // Set to 1 once webm_read_frame() hits end of stream.
+};
+
+// Checks if the input is a WebM file. If so, initializes WebmInputContext so
+// that webm_read_frame can be called to retrieve a video frame.
+// Returns 1 on success and 0 on failure or input is not WebM file.
+// TODO(vigneshv): Refactor this function into two smaller functions specific
+// to their task.
+int file_is_webm(struct WebmInputContext *webm_ctx,
+                 struct VpxInputContext *vpx_ctx);
+
+// Reads a WebM Video Frame. Memory for the buffer is created, owned and managed
+// by this function. For the first call, |*buffer| should be NULL and
+// |*buffer_size| should be 0. Once all the frames are read and used,
+// webm_free() should be called, otherwise there will be a leak.
+// Parameters:
+//      webm_ctx - WebmInputContext object
+//      buffer - in/out: frame buffer; reallocated when a frame is larger.
+//      bytes_in_buffer - out: size in bytes of the frame that was read.
+//      buffer_size - in/out: allocated capacity of |*buffer| in bytes.
+// Return values:
+//      0 - Success
+//      1 - End of Stream
+//     -1 - Error
+// TODO(vigneshv): Make the return values consistent across all functions in
+// this file.
+int webm_read_frame(struct WebmInputContext *webm_ctx,
+                    uint8_t **buffer,
+                    size_t *bytes_in_buffer,
+                    size_t *buffer_size);
+
+// Guesses the frame rate of the input file based on the container timestamps.
+int webm_guess_framerate(struct WebmInputContext *webm_ctx,
+                         struct VpxInputContext *vpx_ctx);
+
+// Frees the resources held by the WebmInputContext and resets it.
+void webm_free(struct WebmInputContext *webm_ctx);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBMDEC_H_
diff --git a/libs/libvpx/webmenc.cc b/libs/libvpx/webmenc.cc
new file mode 100644
index 0000000000..d41e700443
--- /dev/null
+++ b/libs/libvpx/webmenc.cc
@@ -0,0 +1,112 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./webmenc.h"
+
+#include <string>
+
+#include "third_party/libwebm/mkvmuxer.hpp"
+#include "third_party/libwebm/mkvmuxerutil.hpp"
+#include "third_party/libwebm/mkvwriter.hpp"
+
+namespace {
+const uint64_t kDebugTrackUid = 0xDEADBEEF;  // Track UID forced in debug mode.
+const int kVideoTrackNumber = 1;  // Track number used for the video track.
+}  // namespace
+
+// Starts a WebM file on |glob->stream|: creates the muxer writer and segment,
+// adds one video track configured from |cfg|, |stereo_fmt|, |fourcc| and
+// |par|, and stores both muxer objects in |glob| for write_webm_block() and
+// write_webm_file_footer(). |fps| is not used.
+void write_webm_file_header(struct EbmlGlobal *glob,
+                            const vpx_codec_enc_cfg_t *cfg,
+                            const struct vpx_rational *fps,
+                            stereo_format_t stereo_fmt,
+                            unsigned int fourcc,
+                            const struct VpxRational *par) {
+  mkvmuxer::MkvWriter *const mkv_writer =
+      new mkvmuxer::MkvWriter(glob->stream);
+  mkvmuxer::Segment *const mkv_segment = new mkvmuxer::Segment();
+  mkv_segment->Init(mkv_writer);
+  mkv_segment->set_mode(mkvmuxer::Segment::kFile);
+  mkv_segment->OutputCues(true);
+
+  // Segment info: timecode scale plus the writing-app string. The library
+  // version is appended to the app name only outside debug mode.
+  mkvmuxer::SegmentInfo *const seg_info = mkv_segment->GetSegmentInfo();
+  const uint64_t kTimecodeScale = 1000000;
+  seg_info->set_timecode_scale(kTimecodeScale);
+  std::string app_name = "vpxenc";
+  if (!glob->debug) {
+    app_name += " ";
+    app_name += vpx_codec_version_str();
+  }
+  seg_info->set_writing_app(app_name.c_str());
+
+  // Add the video track, then fetch it back by number to configure it.
+  const uint64_t track_id = mkv_segment->AddVideoTrack(
+      static_cast<int>(cfg->g_w), static_cast<int>(cfg->g_h),
+      kVideoTrackNumber);
+  mkvmuxer::VideoTrack *const track = static_cast<mkvmuxer::VideoTrack*>(
+      mkv_segment->GetTrackByNumber(track_id));
+  track->SetStereoMode(stereo_fmt);
+
+  // Any fourcc other than VP8/VP9 is written with the VP10 codec id.
+  const char *const codec_id = (fourcc == VP8_FOURCC) ? "V_VP8" :
+                               (fourcc == VP9_FOURCC) ? "V_VP9" : "V_VP10";
+  track->set_codec_id(codec_id);
+
+  // When either aspect-ratio term exceeds 1, store a scaled display size.
+  if (par->numerator > 1 || par->denominator > 1) {
+    // TODO(fgalligan): Add support of DisplayUnit, Display Aspect Ratio type
+    // to WebM format.
+    const uint64_t scaled_width = static_cast<uint64_t>(
+        ((cfg->g_w * par->numerator * 1.0) / par->denominator) + .5);
+    track->set_display_width(scaled_width);
+    track->set_display_height(cfg->g_h);
+  }
+
+  // Debug mode pins the track UID to a fixed constant.
+  if (glob->debug) {
+    track->set_uid(kDebugTrackUid);
+  }
+
+  glob->writer = mkv_writer;
+  glob->segment = mkv_segment;
+}
+
+// Muxes one compressed frame packet into the segment, converting its pts to
+// nanoseconds and bumping any non-advancing timestamp past the previous one.
+void write_webm_block(struct EbmlGlobal *glob,
+                      const vpx_codec_enc_cfg_t *cfg,
+                      const vpx_codec_cx_pkt_t *pkt) {
+  mkvmuxer::Segment *const mkv_segment =
+      static_cast<mkvmuxer::Segment*>(glob->segment);
+  const int64_t scaled_pts_ns = pkt->data.frame.pts * 1000000000ll *
+                                cfg->g_timebase.num / cfg->g_timebase.den;
+  const int64_t pts_ns = (scaled_pts_ns <= glob->last_pts_ns) ?
+      glob->last_pts_ns + 1000000 : scaled_pts_ns;
+  glob->last_pts_ns = pts_ns;
+  mkv_segment->AddFrame(static_cast<uint8_t*>(pkt->data.frame.buf),
+                        pkt->data.frame.sz, kVideoTrackNumber, pts_ns,
+                        pkt->data.frame.flags & VPX_FRAME_IS_KEY);
+}
+
+// Finalizes the segment, then destroys the segment and writer created by
+// write_webm_file_header() and clears the stale pointers in |glob|.
+void write_webm_file_footer(struct EbmlGlobal *glob) {
+  mkvmuxer::Segment *const mkv_segment =
+      static_cast<mkvmuxer::Segment*>(glob->segment);
+  mkv_segment->Finalize();
+  delete mkv_segment;
+  delete static_cast<mkvmuxer::MkvWriter*>(glob->writer);
+  glob->writer = NULL;
+  glob->segment = NULL;
+}
diff --git a/libs/libvpx/webmenc.h b/libs/libvpx/webmenc.h
new file mode 100644
index 0000000000..c255d3de66
--- /dev/null
+++ b/libs/libvpx/webmenc.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef WEBMENC_H_
+#define WEBMENC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "tools_common.h"
+#include "vpx/vpx_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* TODO(vigneshv): Rename this struct */
+struct EbmlGlobal {
+  int debug;  /* Non-zero: fixed track UID, no library version in app name. */
+  FILE *stream;  /* Output file handed to the mkvmuxer::MkvWriter. */
+  int64_t last_pts_ns;  /* Timestamp (ns) given to the last block written. */
+  void *writer;  /* mkvmuxer::MkvWriter, set by write_webm_file_header(). */
+  void *segment;  /* mkvmuxer::Segment, set by write_webm_file_header(). */
+};
+
+/* Stereo 3D packed frame format; forwarded to VideoTrack::SetStereoMode(). */
+typedef enum stereo_format {
+  STEREO_FORMAT_MONO = 0,
+  STEREO_FORMAT_LEFT_RIGHT = 1,
+  STEREO_FORMAT_BOTTOM_TOP = 2,
+  STEREO_FORMAT_TOP_BOTTOM = 3,
+  STEREO_FORMAT_RIGHT_LEFT = 11  /* Numbering jumps from 3 to 11 here. */
+} stereo_format_t;
+
+void write_webm_file_header(struct EbmlGlobal *glob,
+                            const vpx_codec_enc_cfg_t *cfg,
+                            const struct vpx_rational *fps,
+                            stereo_format_t stereo_fmt,
+                            unsigned int fourcc,
+                            const struct VpxRational *par);
+
+void write_webm_block(struct EbmlGlobal *glob,
+                      const vpx_codec_enc_cfg_t *cfg,
+                      const vpx_codec_cx_pkt_t *pkt);
+
+void write_webm_file_footer(struct EbmlGlobal *glob);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBMENC_H_
diff --git a/libs/libvpx/y4menc.c b/libs/libvpx/y4menc.c
new file mode 100644
index 0000000000..b647e8dcc5
--- /dev/null
+++ b/libs/libvpx/y4menc.c
@@ -0,0 +1,60 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./y4menc.h"
+
+int y4m_write_file_header(char *buf, size_t len, int width, int height,
+                          const struct VpxRational *framerate,
+                          vpx_img_fmt_t fmt, unsigned int bit_depth) {
+  const char *color;  /* Chroma tag that ends the header line. */
+  switch (bit_depth) {  /* The tag depends on bit depth first, then on fmt. */
+    case 8:
+      color = fmt == VPX_IMG_FMT_444A ? "C444alpha\n" :
+              fmt == VPX_IMG_FMT_I444 ? "C444\n" :
+              fmt == VPX_IMG_FMT_I422 ? "C422\n" :
+              "C420jpeg\n";  /* Used for every other 8-bit format. */
+      break;
+    case 9:
+      color = fmt == VPX_IMG_FMT_I44416 ? "C444p9 XYSCSS=444P9\n" :
+              fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" :
+              "C420p9 XYSCSS=420P9\n";
+      break;
+    case 10:
+      color = fmt == VPX_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10\n" :
+              fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" :
+              "C420p10 XYSCSS=420P10\n";
+      break;
+    case 12:
+      color = fmt == VPX_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12\n" :
+              fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" :
+              "C420p12 XYSCSS=420P12\n";
+      break;
+    case 14:
+      color = fmt == VPX_IMG_FMT_I44416 ? "C444p14 XYSCSS=444P14\n" :
+              fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" :
+              "C420p14 XYSCSS=420P14\n";
+      break;
+    case 16:
+      color = fmt == VPX_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16\n" :
+              fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" :
+              "C420p16 XYSCSS=420P16\n";
+      break;
+    default:
+      color = NULL;  /* NOTE(review): reaches %s below if NDEBUG drops assert. */
+      assert(0);  /* Any other bit depth is treated as a programming error. */
+  }
+  return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s", width, height,
+                  framerate->numerator, framerate->denominator, 'p', color);
+}
+
+int y4m_write_frame_header(char *buf, size_t len) {
+  return snprintf(buf, len, "FRAME\n");  /* Returns snprintf()'s result. */
+}
diff --git a/libs/libvpx/y4menc.h b/libs/libvpx/y4menc.h
new file mode 100644
index 0000000000..69d590413e
--- /dev/null
+++ b/libs/libvpx/y4menc.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef Y4MENC_H_
+#define Y4MENC_H_
+
+#include "./tools_common.h"
+
+#include "vpx/vpx_decoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define Y4M_BUFFER_SIZE 128
+
+int y4m_write_file_header(char *buf, size_t len, int width, int height,
+                          const struct VpxRational *framerate,
+                          vpx_img_fmt_t fmt, unsigned int bit_depth);
+int y4m_write_frame_header(char *buf, size_t len);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // Y4MENC_H_
diff --git a/libs/libvpx/y4minput.c b/libs/libvpx/y4minput.c
new file mode 100644
index 0000000000..34ea96d9d5
--- /dev/null
+++ b/libs/libvpx/y4minput.c
@@ -0,0 +1,1052 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  Based on code from the OggTheora software codec source code,
+ *  Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
+ */
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vpx_integer.h"
+#include "y4minput.h"
+
+// Reads 'size' bytes from 'file' into 'buf', making at most kMaxRetries
+// fread() attempts. Returns 1 if all 'size' bytes were read, otherwise 0.
+static int file_read(void *buf, size_t size, FILE *file) {
+  const int kMaxRetries = 5;
+  int retry_count = 0;
+  int file_error;
+  size_t len = 0;  // Bytes read so far.
+  do {
+    const size_t n = fread((uint8_t*)buf + len, 1, size - len, file);
+    len += n;
+    file_error = ferror(file);
+    if (file_error) {
+      if (errno == EINTR || errno == EAGAIN) {
+        clearerr(file);  // Transient error: clear it and try again.
+        continue;  // Jumps to the loop condition, so the attempt is counted.
+      } else {
+        fprintf(stderr, "Error reading file: %u of %u bytes read, %d: %s\n",
+                (uint32_t)len, (uint32_t)size, errno, strerror(errno));
+        return 0;
+      }
+    }
+  } while (!feof(file) && len < size && ++retry_count < kMaxRetries);
+  // A short read that was not caused by end of file is reported on stderr.
+  if (!feof(file) && len != size) {
+    fprintf(stderr, "Error reading file: %u of %u bytes read,"
+                    " error: %d, retries: %d, %d: %s\n",
+            (uint32_t)len, (uint32_t)size, file_error, retry_count,
+            errno, strerror(errno));
+  }
+  return len == size;
+}
+
+/*Parses the space-separated header tags in _tags into _y4m.
+  W, H and F are mandatory; I, A and C default to '?', 0:0 and "420".
+  Returns 0 on success, -1 on a malformed or incomplete tag list.*/
+static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
+  int   seen_w = 0;
+  int   seen_h = 0;
+  int   seen_fps = 0;
+  int   seen_interlace = 0;
+  int   seen_par = 0;
+  int   seen_chroma = 0;
+  char *tag = _tags;
+  for (;;) {
+    char *tag_end;
+    /*Skip the blanks in front of the next tag; stop at the terminator.*/
+    while (*tag == ' ') tag++;
+    if (*tag == '\0') break;
+    /*A tag runs up to the next blank or the end of the string.*/
+    tag_end = tag + 1;
+    while (*tag_end != '\0' && *tag_end != ' ') tag_end++;
+    switch (*tag) {
+      case 'W':
+        if (sscanf(tag + 1, "%d", &_y4m->pic_w) != 1) return -1;
+        seen_w = 1;
+        break;
+      case 'H':
+        if (sscanf(tag + 1, "%d", &_y4m->pic_h) != 1) return -1;
+        seen_h = 1;
+        break;
+      case 'F':
+        if (sscanf(tag + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) {
+          return -1;
+        }
+        seen_fps = 1;
+        break;
+      case 'I':
+        _y4m->interlace = tag[1];
+        seen_interlace = 1;
+        break;
+      case 'A':
+        if (sscanf(tag + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) {
+          return -1;
+        }
+        seen_par = 1;
+        break;
+      case 'C': {
+        const size_t value_len = (size_t)(tag_end - tag) - 1;
+        /*Reject values longer than 15 characters.*/
+        if (value_len > 15) return -1;
+        memcpy(_y4m->chroma_type, tag + 1, value_len);
+        _y4m->chroma_type[value_len] = '\0';
+        seen_chroma = 1;
+        break;
+      }
+      default:
+        break;  /*Ignore unknown tags.*/
+    }
+    tag = tag_end;
+  }
+  if (!seen_w || !seen_h || !seen_fps) return -1;
+  if (!seen_interlace) _y4m->interlace = '?';
+  if (!seen_par) _y4m->par_n = _y4m->par_d = 0;
+  /*Chroma-type is not specified in older files, e.g., those generated by
+     mplayer.*/
+  if (!seen_chroma) strcpy(_y4m->chroma_type, "420");
+  return 0;
+}
+
+
+
+/*All anti-aliasing filters in the following conversion functions are based on
+   one of two window functions:
+  The 6-tap Lanczos window (for down-sampling and shifts):
+   sinc(\pi*t)*sinc(\pi*t/3), |t|<3  (sinc(t)==sin(t)/t)
+   0,                         |t|>=3
+  The 4-tap Mitchell window (for up-sampling):
+   7|t|^3-12|t|^2+16/3,             |t|<1
+   -(7/3)|t|^3+12|t|^2-20|t|+32/3,  1<=|t|<2
+   0,                               |t|>=2
+  The number of taps is intentionally kept small to reduce computational
+   overhead and limit ringing.
+
+  The taps from these filters are scaled so that their sum is 1, and the result
+   is scaled by 128 and rounded to integers to create a filter whose
+   intermediate values fit inside 16 bits.
+  Coefficients are rounded in such a way as to ensure their sum is still 128,
+   which is usually equivalent to normal rounding.
+
+  Conversions which require both horizontal and vertical filtering could
+   have these steps pipelined, for less memory consumption and better cache
+   performance, but we do them separately for simplicity.*/
+
+#define OC_MINI(_a,_b)      ((_a)>(_b)?(_b):(_a))  /*Smaller of _a and _b.*/
+#define OC_MAXI(_a,_b)      ((_a)<(_b)?(_b):(_a))  /*Larger of _a and _b.*/
+#define OC_CLAMPI(_a,_b,_c) (OC_MAXI(_a,OC_MINI(_b,_c)))  /*_b limited to [_a,_c].*/
+
+/*420jpeg chroma samples are sited like:
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  420mpeg2 chroma samples are sited like:
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  BR      |       BR      |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  BR      |       BR      |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  We use a resampling filter to shift the site locations one quarter pixel (at
+   the chroma plane's resolution) to the right.
+  The 4:2:2 modes look exactly the same, except there are twice as many chroma
+   lines, and they are vertically co-sited with the luma samples in both the
+   mpeg2 and jpeg cases (thus requiring no vertical resampling).*/
+static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst,
+                                        const unsigned char *_src, int _c_w, int _c_h) {
+  int y;
+  int x;
+  for (y = 0; y < _c_h; y++) {  /*Filter _c_h rows of _c_w samples, _src -> _dst.*/
+    /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos
+       window.*/
+    for (x = 0; x < OC_MINI(_c_w, 2); x++) {  /*Left edge: taps clamped to the row.*/
+      _dst[x] = (unsigned char)OC_CLAMPI(0, (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] +
+                                             114 * _src[x] + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - 9 * _src[OC_MINI(x + 2, _c_w - 1)] +
+                                             _src[OC_MINI(x + 3, _c_w - 1)] + 64) >> 7, 255);
+    }
+    for (; x < _c_w - 3; x++) {  /*Interior: all six taps are read unclamped.*/
+      _dst[x] = (unsigned char)OC_CLAMPI(0, (4 * _src[x - 2] - 17 * _src[x - 1] +
+                                             114 * _src[x] + 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> 7, 255);
+    }
+    for (; x < _c_w; x++) {  /*Right edge: taps clamped to the last column.*/
+      _dst[x] = (unsigned char)OC_CLAMPI(0, (4 * _src[x - 2] - 17 * _src[x - 1] +
+                                             114 * _src[x] + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - 9 * _src[OC_MINI(x + 2, _c_w - 1)] +
+                                             _src[_c_w - 1] + 64) >> 7, 255);
+    }
+    _dst += _c_w;  /*Advance both pointers to the next row.*/
+    _src += _c_w;
+  }
+}
+
+/*Handles both 422 and 420mpeg2 to 422jpeg and 420jpeg, respectively.
+  Only the two chroma planes in _aux are resampled; they are written to _dst
+   directly after the luma plane.*/
+static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
+                                         unsigned char *_aux) {
+  /*Chroma plane dimensions, rounding up for odd picture sizes.*/
+  const int c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+  const int c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  const int c_sz = c_w * c_h;
+  /*Output chroma starts right after the pic_w*pic_h luma samples.*/
+  unsigned char *chroma_out = _dst + _y4m->pic_w * _y4m->pic_h;
+  unsigned char *chroma_in = _aux;
+  int plane;
+  for (plane = 0; plane < 2; plane++) {
+    y4m_42xmpeg2_42xjpeg_helper(chroma_out, chroma_in, c_w, c_h);
+    chroma_out += c_sz;
+    chroma_in += c_sz;
+  }
+}
+
+/*This format is only used for interlaced content, but is included for
+   completeness.
+
+  420jpeg chroma samples are sited like:
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  420paldv chroma samples are sited like:
+  YR------Y-------YR------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YB------Y-------YB------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YR------Y-------YR------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YB------Y-------YB------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  We use a resampling filter to shift the site locations one quarter pixel (at
+   the chroma plane's resolution) to the right.
+  Then we use another filter to move the C_r location down one quarter pixel,
+   and the C_b location up one quarter pixel.*/
+static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
+                                         unsigned char *_aux) {
+  unsigned char *tmp;  /*Scratch plane holding the horizontally filtered data.*/
+  int            c_w;
+  int            c_h;
+  int            c_sz;
+  int            pli;
+  int            y;
+  int            x;
+  /*Skip past the luma data.*/
+  _dst += _y4m->pic_w * _y4m->pic_h;
+  /*Compute the size of each chroma plane (NOTE(review): c_h uses dst_c_dec_h).*/
+  c_w = (_y4m->pic_w + 1) / 2;
+  c_h = (_y4m->pic_h + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+  c_sz = c_w * c_h;
+  tmp = _aux + 2 * c_sz;  /*Scratch begins after the two source chroma planes.*/
+  for (pli = 1; pli < 3; pli++) {
+    /*First do the horizontal re-sampling.
+      This is the same as the mpeg2 case, except that after the horizontal
+       case, we need to apply a second vertical filter.*/
+    y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h);
+    _aux += c_sz;
+    switch (pli) {
+      case 1: {
+        /*Slide C_b up a quarter-pel.
+          This is the same filter used above, but in the other order.*/
+        for (x = 0; x < c_w; x++) {
+          for (y = 0; y < OC_MINI(c_h, 3); y++) {
+            _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (tmp[0]
+                                                         - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + 35 * tmp[OC_MAXI(y - 1, 0) * c_w]
+                                                         + 114 * tmp[y * c_w] - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w]
+                                                         + 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> 7, 255);
+          }
+          for (; y < c_h - 2; y++) {
+            _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (tmp[(y - 3) * c_w]
+                                                         - 9 * tmp[(y - 2) * c_w] + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w]
+                                                         - 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> 7, 255);
+          }
+          for (; y < c_h; y++) {
+            _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (tmp[(y - 3) * c_w]
+                                                         - 9 * tmp[(y - 2) * c_w] + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w]
+                                                         - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + 4 * tmp[(c_h - 1) * c_w] + 64) >> 7, 255);
+          }
+          _dst++;
+          tmp++;
+        }
+        _dst += c_sz - c_w;  /*The x loop advanced _dst by c_w: go to next plane.*/
+        tmp -= c_w;  /*Rewind the scratch pointer for the next plane.*/
+      }
+      break;
+      case 2: {
+        /*Slide C_r down a quarter-pel.
+          This is the same as the horizontal filter.*/
+        for (x = 0; x < c_w; x++) {
+          for (y = 0; y < OC_MINI(c_h, 2); y++) {
+            _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (4 * tmp[0]
+                                                         - 17 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w]
+                                                         + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w]
+                                                         + tmp[OC_MINI(y + 3, c_h - 1) * c_w] + 64) >> 7, 255);
+          }
+          for (; y < c_h - 3; y++) {
+            _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (4 * tmp[(y - 2) * c_w]
+                                                         - 17 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w]
+                                                         - 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> 7, 255);
+          }
+          for (; y < c_h; y++) {
+            _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (4 * tmp[(y - 2) * c_w]
+                                                         - 17 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w]
+                                                         - 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + tmp[(c_h - 1) * c_w] + 64) >> 7, 255);
+          }
+          _dst++;
+          tmp++;
+        }
+      }
+      break;
+    }
+    /*For actual interlaced material, this would have to be done separately on
+       each field, and the shift amounts would be different.
+      C_r moves down 1/8, C_b up 3/8 in the top field, and C_r moves down 3/8,
+       C_b up 1/8 in the bottom field.
+      The corresponding filters would be:
+       Down 1/8 (reverse order for up): [3 -11 125 15 -4 0]/128
+       Down 3/8 (reverse order for up): [4 -19 98 56 -13 2]/128*/
+  }
+}
+
+/*Perform vertical filtering to reduce a single plane from 4:2:2 to 4:2:0.
+  This is used as a helper by several conversion routines.*/
+static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst,
+                                       const unsigned char *_src, int _c_w, int _c_h) {
+  int y;
+  int x;
+  /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/
+  for (x = 0; x < _c_w; x++) {  /*One column per pass; rows are _c_w apart.*/
+    for (y = 0; y < OC_MINI(_c_h, 2); y += 2) {  /*Top edge: the taps at and above row 0 merge into 64 * _src[0].*/
+      _dst[(y >> 1)*_c_w] = OC_CLAMPI(0, (64 * _src[0]
+                                          + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w]
+                                          - 17 * _src[OC_MINI(2, _c_h - 1) * _c_w]
+                                          + 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> 7, 255);
+    }
+    for (; y < _c_h - 3; y += 2) {  /*Interior: all six taps are in range.*/
+      _dst[(y >> 1)*_c_w] = OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w])
+                                          - 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w])
+                                          + 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> 7, 255);
+    }
+    for (; y < _c_h; y += 2) {  /*Bottom edge: taps below the plane clamp to row _c_h - 1.*/
+      _dst[(y >> 1)*_c_w] = OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w]
+                                               + _src[(_c_h - 1) * _c_w]) - 17 * (_src[(y - 1) * _c_w]
+                                                                                  + _src[OC_MINI(y + 2, _c_h - 1) * _c_w])
+                                          + 78 * (_src[y * _c_w] + _src[OC_MINI(y + 1, _c_h - 1) * _c_w]) + 64) >> 7, 255);
+    }
+    _src++;  /*Advance both pointers to the next column.*/
+    _dst++;
+  }
+}
+
+/*420jpeg chroma samples are sited like:
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  422jpeg chroma samples are sited like:
+  Y---BR--Y-------Y---BR--Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y---BR--Y-------Y---BR--Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y---BR--Y-------Y---BR--Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y---BR--Y-------Y---BR--Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  We use a resampling filter to decimate the chroma planes by two in the
+   vertical direction.*/
+static void y4m_convert_422jpeg_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+                                        unsigned char *_aux) {
+  /*Convert both chroma planes held in _aux from 4:2:2 (JPEG siting) to 4:2:0,
+     writing them into _dst.*/
+  /*Source chroma planes keep the full picture height; only the width is reduced.*/
+  const int src_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h;
+  const int src_h = _y4m->pic_h;
+  /*Output chroma plane dimensions, rounded up.*/
+  const int out_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+  const int out_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  const int src_plane_sz = src_w * src_h;
+  const int out_plane_sz = out_w * out_h;
+  /*Skip past the luma data: chroma output begins right after it in _dst.*/
+  unsigned char *out = _dst + _y4m->pic_w * _y4m->pic_h;
+  const unsigned char *in = _aux;
+  int remaining;
+  /*The two chroma planes are stored back to back in both buffers; decimate
+     each one vertically by two.*/
+  for (remaining = 2; remaining > 0; remaining--) {
+    y4m_422jpeg_420jpeg_helper(out, in, src_w, src_h);
+    in += src_plane_sz;
+    out += out_plane_sz;
+  }
+}
+
+/*420jpeg chroma samples are sited like:
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  422 chroma samples are sited like:
+  YBR-----Y-------YBR-----Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------YBR-----Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------YBR-----Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------YBR-----Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  We use a resampling filter to shift the original site locations one quarter
+   pixel (at the original chroma resolution) to the right.
+  Then we use a second resampling filter to decimate the chroma planes by two
+   in the vertical direction.*/
+static void y4m_convert_422_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+                                    unsigned char *_aux) {
+  /*Convert both 4:2:2 chroma planes in _aux to 420jpeg in _dst using two filter
+     passes per plane: a horizontal re-siting into scratch space, then a
+     vertical decimation by two.*/
+  /*Both source planes share one geometry: decimated width, full picture height.*/
+  const int plane_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h;
+  const int plane_h = _y4m->pic_h;
+  const int plane_sz = plane_w * plane_h;
+  /*The vertical pass shrinks only the height, so the output keeps plane_w.*/
+  const int out_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  const int out_sz = plane_w * out_h;
+  /*Scratch plane for the horizontally filtered data, located after the two
+     source chroma planes in the aux buffer.*/
+  unsigned char *const scratch = _aux + 2 * plane_sz;
+  unsigned char *in = _aux;
+  /*Skip past the luma data: chroma output starts immediately after it.*/
+  unsigned char *out = _dst + _y4m->pic_w * _y4m->pic_h;
+  int left;
+  /*The two passes are kept separate for simplicity; pipelining them would use
+     less memory and give better cache performance.*/
+  for (left = 2; left > 0; left--) {
+    /*Pass 1, horizontal filtering (convert to 422jpeg).*/
+    y4m_42xmpeg2_42xjpeg_helper(scratch, in, plane_w, plane_h);
+    /*Pass 2, vertical filtering into the destination.*/
+    y4m_422jpeg_420jpeg_helper(out, scratch, plane_w, plane_h);
+    in += plane_sz;
+    out += out_sz;
+  }
+}
+
+/*420jpeg chroma samples are sited like:
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  411 chroma samples are sited like:
+  YBR-----Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  We use a filter to resample at site locations one eighth pixel (at the source
+   chroma plane's horizontal resolution) and five eighths of a pixel to the
+   right.
+  Then we use another filter to decimate the planes by 2 in the vertical
+   direction.*/
+static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+                                    unsigned char *_aux) {
+  unsigned char *tmp;
+  int            c_w;
+  int            c_h;
+  int            c_sz;
+  int            dst_c_w;
+  int            dst_c_h;
+  int            dst_c_sz;
+  int            tmp_sz;
+  int            pli;
+  int            y;
+  int            x;
+  /*Skip past the luma data.*/
+  _dst += _y4m->pic_w * _y4m->pic_h;
+  /*Compute the size of each chroma plane.*/
+  c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h;
+  c_h = _y4m->pic_h;
+  dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+  dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  c_sz = c_w * c_h;
+  dst_c_sz = dst_c_w * dst_c_h;
+  tmp_sz = dst_c_w * c_h;  /*Intermediate plane: destination width, source height.*/
+  tmp = _aux + 2 * c_sz;  /*Scratch space follows the two source chroma planes.*/
+  for (pli = 1; pli < 3; pli++) {
+    /*In reality, the horizontal and vertical steps could be pipelined, for
+       less memory consumption and better cache performance, but we do them
+       separately for simplicity.*/
+    /*First do horizontal filtering (convert to 422jpeg)*/
+    for (y = 0; y < c_h; y++) {
+      /*Filters: [1 110 18 -1]/128 and [-3 50 86 -5]/128, both derived from a
+         4-tap Mitchell window.*/
+      for (x = 0; x < OC_MINI(c_w, 1); x++) {  /*Left edge: the x - 1 tap merges into _aux[0] (1 + 110, -3 + 50).*/
+        tmp[x << 1] = (unsigned char)OC_CLAMPI(0, (111 * _aux[0]
+                                                   + 18 * _aux[OC_MINI(1, c_w - 1)] - _aux[OC_MINI(2, c_w - 1)] + 64) >> 7, 255);
+        tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(0, (47 * _aux[0]
+                                                       + 86 * _aux[OC_MINI(1, c_w - 1)] - 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> 7, 255);
+      }
+      for (; x < c_w - 2; x++) {  /*Interior: all four taps are in range.*/
+        tmp[x << 1] = (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x]
+                                                   + 18 * _aux[x + 1] - _aux[x + 2] + 64) >> 7, 255);
+        tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(0, (-3 * _aux[x - 1] + 50 * _aux[x]
+                                                       + 86 * _aux[x + 1] - 5 * _aux[x + 2] + 64) >> 7, 255);
+      }
+      for (; x < c_w; x++) {  /*Right edge: taps past the row clamp to _aux[c_w - 1].*/
+        tmp[x << 1] = (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x]
+                                                   + 18 * _aux[OC_MINI(x + 1, c_w - 1)] - _aux[c_w - 1] + 64) >> 7, 255);
+        if ((x << 1 | 1) < dst_c_w) {  /*The odd output sample is written only if it lies inside the row.*/
+          tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(0, (-3 * _aux[x - 1] + 50 * _aux[x]
+                                                         + 86 * _aux[OC_MINI(x + 1, c_w - 1)] - 5 * _aux[c_w - 1] + 64) >> 7, 255);
+        }
+      }
+      tmp += dst_c_w;
+      _aux += c_w;
+    }
+    tmp -= tmp_sz;  /*Rewind to the start of the scratch plane.*/
+    /*Now do the vertical filtering.*/
+    y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h);
+    _dst += dst_c_sz;
+  }
+}
+
+/*Convert 444 to 420jpeg: decimate each chroma plane by two horizontally, then vertically.*/
+static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+                                    unsigned char *_aux) {
+  unsigned char *tmp;
+  int            c_w;
+  int            c_h;
+  int            c_sz;
+  int            dst_c_w;
+  int            dst_c_h;
+  int            dst_c_sz;
+  int            tmp_sz;
+  int            pli;
+  int            y;
+  int            x;
+  /*Skip past the luma data.*/
+  _dst += _y4m->pic_w * _y4m->pic_h;
+  /*Compute the size of each chroma plane.*/
+  c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h;
+  c_h = _y4m->pic_h;
+  dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+  dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  c_sz = c_w * c_h;
+  dst_c_sz = dst_c_w * dst_c_h;
+  tmp_sz = dst_c_w * c_h;  /*Intermediate plane: destination width, source height.*/
+  tmp = _aux + 2 * c_sz;  /*Scratch space starts two source planes into the aux buffer.*/
+  for (pli = 1; pli < 3; pli++) {
+    /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/
+    for (y = 0; y < c_h; y++) {
+      for (x = 0; x < OC_MINI(c_w, 2); x += 2) {  /*Left edge: the taps at and left of column 0 merge into 64 * _aux[0].*/
+        tmp[x >> 1] = OC_CLAMPI(0, (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)]
+                                    - 17 * _aux[OC_MINI(2, c_w - 1)]
+                                    + 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> 7, 255);
+      }
+      for (; x < c_w - 3; x += 2) {  /*Interior: all six taps are in range.*/
+        tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[x + 3])
+                                    - 17 * (_aux[x - 1] + _aux[x + 2]) + 78 * (_aux[x] + _aux[x + 1]) + 64) >> 7, 255);
+      }
+      for (; x < c_w; x += 2) {  /*Right edge: taps past the row clamp to _aux[c_w - 1].*/
+        tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[c_w - 1]) -
+                                    17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) +
+                                    78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> 7, 255);
+      }
+      tmp += dst_c_w;
+      _aux += c_w;
+    }
+    tmp -= tmp_sz;  /*Rewind to the start of the scratch plane.*/
+    /*Now do the vertical filtering.*/
+    y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h);
+    _dst += dst_c_sz;
+  }
+}
+
+/*Mono input has no chroma: both 4:2:0 chroma planes are filled with the constant 128.*/
+static void y4m_convert_mono_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+                                     unsigned char *_aux) {
+  const int chroma_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+  const int chroma_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  unsigned char *const chroma = _dst + _y4m->pic_w * _y4m->pic_h;
+  (void)_aux;
+  /*The two planes follow the luma plane contiguously, so one memset covers both.*/
+  memset(chroma, 128, chroma_w * chroma_h * 2);
+}
+
+/*Conversion callback for formats that need no conversion: deliberately a no-op.*/
+static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst,
+                             unsigned char *_aux) {
+  /*Nothing to convert; the casts only mark the parameters as intentionally
+     unused.*/
+  (void)_aux, (void)_dst, (void)_y4m;
+}
+
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+                   int only_420) {
+  /*Parse the Y4M header, choose the chroma converter and allocate buffers; returns 0, or <0 on error.*/
+  char buffer[80] = {0};
+  int  ret, i;
+  /*Read until newline, or 80 cols, whichever happens first.*/
+  for (i = 0; i < 79; i++) {
+    if (_nskip > 0) {
+      buffer[i] = *_skip++;
+      _nskip--;
+    } else {
+      if (!file_read(buffer + i, 1, _fin)) return -1;
+    }
+    if (buffer[i] == '\n')break;
+  }
+  /*We skipped too much header data.*/
+  if (_nskip > 0)return -1;
+  if (i == 79) {
+    fprintf(stderr, "Error parsing header; not a YUV4MPEG2 file?\n");
+    return -1;
+  }
+  buffer[i] = '\0';
+  if (memcmp(buffer, "YUV4MPEG", 8)) {
+    fprintf(stderr, "Incomplete magic for YUV4MPEG file.\n");
+    return -1;
+  }
+  if (buffer[8] != '2') {
+    fprintf(stderr, "Incorrect YUV input file version; YUV4MPEG2 required.\n");
+  }
+  ret = y4m_parse_tags(_y4m, buffer + 5);
+  if (ret < 0) {
+    fprintf(stderr, "Error parsing YUV4MPEG2 header.\n");
+    return ret;
+  }
+  if (_y4m->interlace == '?') {
+    fprintf(stderr, "Warning: Input video interlacing format unknown; "
+            "assuming progressive scan.\n");
+  } else if (_y4m->interlace != 'p') {
+    fprintf(stderr, "Input video is interlaced; "
+            "Only progressive scan handled.\n");
+    return -1;
+  }
+  _y4m->vpx_fmt = VPX_IMG_FMT_I420;
+  _y4m->bps = 12;
+  _y4m->bit_depth = 8;
+  if (strcmp(_y4m->chroma_type, "420") == 0 ||
+      strcmp(_y4m->chroma_type, "420jpeg") == 0) {
+    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2;
+    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h
+                            + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
+    /* Natively supported: no conversion required. */
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(_y4m->chroma_type, "420p10") == 0) {
+    _y4m->src_c_dec_h = 2;
+    _y4m->dst_c_dec_h = 2;
+    _y4m->src_c_dec_v = 2;
+    _y4m->dst_c_dec_v = 2;
+    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
+                                 2 * ((_y4m->pic_w + 1) / 2) *
+                                 ((_y4m->pic_h + 1) / 2));
+    /* Natively supported: no conversion required. */
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_null;
+    _y4m->bit_depth = 10;
+    _y4m->bps = 15;
+    _y4m->vpx_fmt = VPX_IMG_FMT_I42016;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(_y4m->chroma_type, "420p12") == 0) {
+    _y4m->src_c_dec_h = 2;
+    _y4m->dst_c_dec_h = 2;
+    _y4m->src_c_dec_v = 2;
+    _y4m->dst_c_dec_v = 2;
+    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
+                                 2 * ((_y4m->pic_w + 1) / 2) *
+                                 ((_y4m->pic_h + 1) / 2));
+    /* Natively supported: no conversion required. */
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_null;
+    _y4m->bit_depth = 12;
+    _y4m->bps = 18;
+    _y4m->vpx_fmt = VPX_IMG_FMT_I42016;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) {
+    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2;
+    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+    /*Chroma filter required: read into the aux buf first.*/
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz =
+                         2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
+    _y4m->convert = y4m_convert_42xmpeg2_42xjpeg;
+  } else if (strcmp(_y4m->chroma_type, "420paldv") == 0) {
+    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2;
+    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+    /*Chroma filter required: read into the aux buf first.
+      We need to make two filter passes, so we need some extra space in the
+       aux buffer.*/
+    _y4m->aux_buf_sz = 3 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
+    _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
+    _y4m->convert = y4m_convert_42xpaldv_42xjpeg;
+  } else if (strcmp(_y4m->chroma_type, "422jpeg") == 0) {
+    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2;
+    _y4m->src_c_dec_v = 1;
+    _y4m->dst_c_dec_v = 2;
+    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+    /*Chroma filter required: read into the aux buf first.*/
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+    _y4m->convert = y4m_convert_422jpeg_420jpeg;
+  } else if (strcmp(_y4m->chroma_type, "422") == 0) {
+    _y4m->src_c_dec_h = 2;
+    _y4m->src_c_dec_v = 1;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz +
+          ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->convert = y4m_convert_422_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_I422;
+      _y4m->bps = 16;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h
+                              + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
+  } else if (strcmp(_y4m->chroma_type, "422p10") == 0) {
+    _y4m->src_c_dec_h = 2;
+    _y4m->src_c_dec_v = 1;
+    _y4m->vpx_fmt = VPX_IMG_FMT_I42216;
+    _y4m->bps = 20;
+    _y4m->bit_depth = 10;
+    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
+                                 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_null;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(_y4m->chroma_type, "422p12") == 0) {
+    _y4m->src_c_dec_h = 2;
+    _y4m->src_c_dec_v = 1;
+    _y4m->vpx_fmt = VPX_IMG_FMT_I42216;
+    _y4m->bps = 24;
+    _y4m->bit_depth = 12;
+    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
+                                 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_null;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(_y4m->chroma_type, "411") == 0) {
+    _y4m->src_c_dec_h = 4;
+    _y4m->dst_c_dec_h = 2;
+    _y4m->src_c_dec_v = 1;
+    _y4m->dst_c_dec_v = 2;
+    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+    /*Chroma filter required: read into the aux buf first.
+      We need to make two filter passes, so we need some extra space in the
+       aux buffer.*/
+    _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 3) / 4) * _y4m->pic_h;
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+    _y4m->convert = y4m_convert_411_420jpeg;
+  } else if (strcmp(_y4m->chroma_type, "444") == 0) {
+    _y4m->src_c_dec_h = 1;
+    _y4m->src_c_dec_v = 1;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz +
+          ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->convert = y4m_convert_444_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_I444;
+      _y4m->bps = 24;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
+  } else if (strcmp(_y4m->chroma_type, "444p10") == 0) {
+    _y4m->src_c_dec_h = 1;
+    _y4m->src_c_dec_v = 1;
+    _y4m->vpx_fmt = VPX_IMG_FMT_I44416;
+    _y4m->bps = 30;
+    _y4m->bit_depth = 10;
+    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+    _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_null;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(_y4m->chroma_type, "444p12") == 0) {
+    _y4m->src_c_dec_h = 1;
+    _y4m->src_c_dec_v = 1;
+    _y4m->vpx_fmt = VPX_IMG_FMT_I44416;
+    _y4m->bps = 36;
+    _y4m->bit_depth = 12;
+    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+    _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_null;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) {
+    _y4m->src_c_dec_h = 1;
+    _y4m->src_c_dec_v = 1;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.
+        The extra plane also gets read into the aux buf.
+        It will be discarded.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      _y4m->convert = y4m_convert_444_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_444A;
+      _y4m->bps = 32;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = 4 * _y4m->pic_w * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
+  } else if (strcmp(_y4m->chroma_type, "mono") == 0) {
+    _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0;
+    _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2;
+    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+    /*No extra space required, but we need to clear the chroma planes.*/
+    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+    _y4m->convert = y4m_convert_mono_420jpeg;
+  } else {
+    fprintf(stderr, "Unknown chroma sampling type: %s\n", _y4m->chroma_type);
+    return -1;
+  }
+  /*The size of the final frame buffers is always computed from the
+     destination chroma decimation type.*/
+  _y4m->dst_buf_sz = _y4m->pic_w * _y4m->pic_h
+                     + 2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) *
+                     ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v);
+  if (_y4m->bit_depth == 8)
+    _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz);
+  else
+    _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz);
+  /*aux_buf must be NULL when unused: y4m_input_close() always free()s it.*/
+  _y4m->aux_buf = _y4m->aux_buf_sz > 0 ?
+                  (unsigned char *)malloc(_y4m->aux_buf_sz) : NULL;
+  return 0;
+}
+
+void y4m_input_close(y4m_input *_y4m) {
+  free(_y4m->aux_buf);  /*Staging buffer for planes that need conversion.*/
+  free(_y4m->dst_buf);  /*Frame buffer exposed by y4m_input_fetch_frame().*/
+}
+
+int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) {
+  const int sample_bytes = _y4m->bit_depth > 8 ? 2 : 1;
+  char      marker[6];
+  int       luma_sz;
+  int       chroma_stride;
+  int       chroma_rows;
+  int       chroma_sz;
+  /*Read and check the frame marker.
+    A failed read here is reported as 0 rather than as an error.*/
+  if (!file_read(marker, 6, _fin)) return 0;
+  if (memcmp(marker, "FRAME", 5) != 0) {
+    fprintf(stderr, "Loss of framing in Y4M input data\n");
+    return -1;
+  }
+  /*Skip the rest of the frame header line, giving up after 79 bytes.*/
+  if (marker[5] != '\n') {
+    int  skipped = 0;
+    char ch;
+    while (skipped < 79 && file_read(&ch, 1, _fin) && ch != '\n') skipped++;
+    if (skipped == 79) {
+      fprintf(stderr, "Error parsing Y4M frame header\n");
+      return -1;
+    }
+  }
+  /*Data that needs no conversion goes straight into the frame buffer.*/
+  if (!file_read(_y4m->dst_buf, _y4m->dst_buf_read_sz, _fin)) {
+    fprintf(stderr, "Error reading Y4M frame data.\n");
+    return -1;
+  }
+  /*Data that does need conversion is staged in the auxiliary buffer.*/
+  if (!file_read(_y4m->aux_buf, _y4m->aux_buf_read_sz, _fin)) {
+    fprintf(stderr, "Error reading Y4M frame data.\n");
+    return -1;
+  }
+  /*Run the converter selected by y4m_input_open() on the frame just read.*/
+  _y4m->convert(_y4m, _y4m->dst_buf, _y4m->aux_buf);
+  /*Describe the frame buffer to the caller.
+    We don't use vpx_img_wrap() because it forces padding for odd picture
+     sizes, which would require a separate fread call for every row.*/
+  memset(_img, 0, sizeof(*_img));
+  /*Y4M has the planes in Y'CbCr order, which libvpx calls Y, U, and V.*/
+  _img->fmt = _y4m->vpx_fmt;
+  _img->bps = _y4m->bps;
+  _img->w = _img->d_w = _y4m->pic_w;
+  _img->h = _img->d_h = _y4m->pic_h;
+  _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
+  _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
+  /*Plane pointers are laid out as Y, U, V and then the alpha slot, with
+     strides equal to the row widths in bytes (no padding).*/
+  luma_sz = _y4m->pic_w * _y4m->pic_h * sample_bytes;
+  chroma_stride = ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * sample_bytes;
+  chroma_rows = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  chroma_sz = chroma_stride * chroma_rows;
+  _img->stride[VPX_PLANE_Y] = _y4m->pic_w * sample_bytes;
+  _img->stride[VPX_PLANE_ALPHA] = _y4m->pic_w * sample_bytes;
+  _img->stride[VPX_PLANE_U] = _img->stride[VPX_PLANE_V] = chroma_stride;
+  _img->planes[VPX_PLANE_Y] = _y4m->dst_buf;
+  _img->planes[VPX_PLANE_U] = _y4m->dst_buf + luma_sz;
+  _img->planes[VPX_PLANE_V] = _y4m->dst_buf + luma_sz + chroma_sz;
+  _img->planes[VPX_PLANE_ALPHA] = _y4m->dst_buf + luma_sz + 2 * chroma_sz;
+  return 1;
+}
diff --git a/libs/libvpx/y4minput.h b/libs/libvpx/y4minput.h
new file mode 100644
index 0000000000..356cebbcf0
--- /dev/null
+++ b/libs/libvpx/y4minput.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  Based on code from the OggTheora software codec source code,
+ *  Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
+ */
+
+#ifndef Y4MINPUT_H_
+#define Y4MINPUT_H_
+
+# include <stdio.h>
+# include "vpx/vpx_image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+typedef struct y4m_input y4m_input;
+
+
+
+/*The function used to perform chroma conversion.*/
+typedef void (*y4m_convert_func)(y4m_input *_y4m,
+                                 unsigned char *_dst, unsigned char *_src);
+
+
+
+struct y4m_input {
+  int               pic_w;  /*Picture width; reported as vpx_image_t.w.*/
+  int               pic_h;  /*Picture height; reported as vpx_image_t.h.*/
+  int               fps_n;  /*Presumably frame rate numerator (set by the header parser; confirm).*/
+  int               fps_d;  /*Presumably frame rate denominator (confirm).*/
+  int               par_n;  /*Presumably pixel aspect ratio numerator (confirm).*/
+  int               par_d;  /*Presumably pixel aspect ratio denominator (confirm).*/
+  char              interlace;  /*Interlace tag; y4m_input_open() accepts only 'p' and '?'.*/
+  int               src_c_dec_h;  /*Source chroma horizontal decimation factor.*/
+  int               src_c_dec_v;  /*Source chroma vertical decimation factor.*/
+  int               dst_c_dec_h;  /*Output chroma horizontal decimation factor.*/
+  int               dst_c_dec_v;  /*Output chroma vertical decimation factor.*/
+  char              chroma_type[16];  /*Chroma sampling tag, e.g. "420jpeg" or "444".*/
+  /*The size of each converted frame buffer.*/
+  size_t            dst_buf_sz;
+  /*The amount to read directly into the converted frame buffer.*/
+  size_t            dst_buf_read_sz;
+  /*The size of the auxiliary buffer.*/
+  size_t            aux_buf_sz;
+  /*The amount to read into the auxiliary buffer.*/
+  size_t            aux_buf_read_sz;
+  y4m_convert_func  convert;  /*Chroma conversion routine run on every fetched frame.*/
+  unsigned char    *dst_buf;  /*Frame buffer that the returned vpx_image_t planes point into.*/
+  unsigned char    *aux_buf;  /*Staging buffer for data that needs conversion.*/
+  enum vpx_img_fmt  vpx_fmt;  /*Image format copied to vpx_image_t.fmt.*/
+  int               bps;  /*Value copied to vpx_image_t.bps (12 for 8-bit 4:2:0).*/
+  unsigned int      bit_depth;  /*Bits per sample: 8, 10 or 12.*/
+};
+
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+                   int only_420);
+void y4m_input_close(y4m_input *_y4m);
+int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // Y4MINPUT_H_
diff --git a/libs/libyuv/AUTHORS b/libs/libyuv/AUTHORS
new file mode 100644
index 0000000000..9686ac13eb
--- /dev/null
+++ b/libs/libyuv/AUTHORS
@@ -0,0 +1,4 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
diff --git a/libs/libyuv/Android.mk b/libs/libyuv/Android.mk
new file mode 100644
index 0000000000..4d2092acf5
--- /dev/null
+++ b/libs/libyuv/Android.mk
@@ -0,0 +1,63 @@
+# This is the Android makefile for libyuv for both platform and NDK.
+LOCAL_PATH:= $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_CPP_EXTENSION := .cc
+# Sources listed unconditionally; the armeabi-v7a block below appends the NEON files.
+LOCAL_SRC_FILES := \
+    source/compare.cc           \
+    source/compare_common.cc    \
+    source/compare_neon64.cc    \
+    source/compare_gcc.cc       \
+    source/convert.cc           \
+    source/convert_argb.cc      \
+    source/convert_from.cc      \
+    source/convert_from_argb.cc \
+    source/convert_to_argb.cc   \
+    source/convert_to_i420.cc   \
+    source/cpu_id.cc            \
+    source/planar_functions.cc  \
+    source/rotate.cc            \
+    source/rotate_any.cc        \
+    source/rotate_argb.cc       \
+    source/rotate_common.cc     \
+    source/rotate_mips.cc       \
+    source/rotate_neon64.cc     \
+    source/rotate_gcc.cc        \
+    source/row_any.cc           \
+    source/row_common.cc        \
+    source/row_mips.cc          \
+    source/row_neon64.cc        \
+    source/row_gcc.cc	        \
+    source/scale.cc             \
+    source/scale_any.cc         \
+    source/scale_argb.cc        \
+    source/scale_common.cc      \
+    source/scale_mips.cc        \
+    source/scale_neon64.cc      \
+    source/scale_gcc.cc         \
+    source/video_common.cc
+
+# TODO(fbarchard): Enable mjpeg encoder.
+#   source/mjpeg_decoder.cc
+#   source/convert_jpeg.cc
+#   source/mjpeg_validate.cc
+
+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)  # NEON build: extra define plus the .neon sources
+    LOCAL_CFLAGS += -DLIBYUV_NEON
+    LOCAL_SRC_FILES += \
+        source/compare_neon.cc.neon    \
+        source/rotate_neon.cc.neon     \
+        source/row_neon.cc.neon        \
+        source/scale_neon.cc.neon
+endif
+# The include/ directory is used for this module and also set in LOCAL_EXPORT_C_INCLUDES.
+LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/include
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/include
+
+LOCAL_MODULE := libyuv_static
+LOCAL_MODULE_TAGS := optional
+# Hand the module definition above to the static-library build rules.
+include $(BUILD_STATIC_LIBRARY)
+
diff --git a/libs/libyuv/BUILD.gn b/libs/libyuv/BUILD.gn
new file mode 100644
index 0000000000..d1e55de16c
--- /dev/null
+++ b/libs/libyuv/BUILD.gn
@@ -0,0 +1,133 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import("//build/config/arm.gni")
+import("//build/config/sanitizers/sanitizers.gni")
+
+config("libyuv_config") {
+  include_dirs = [
+    ".",
+    "include",
+  ]
+}
+
+use_neon = current_cpu == "arm64" || (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon))
+
+source_set("libyuv") {
+  sources = [
+    "include/libyuv.h",
+    "include/libyuv/basic_types.h",
+    "include/libyuv/compare.h",
+    "include/libyuv/convert.h",
+    "include/libyuv/convert_argb.h",
+    "include/libyuv/convert_from.h",
+    "include/libyuv/convert_from_argb.h",
+    "include/libyuv/cpu_id.h",
+    "include/libyuv/mjpeg_decoder.h",
+    "include/libyuv/planar_functions.h",
+    "include/libyuv/rotate.h",
+    "include/libyuv/rotate_argb.h",
+    "include/libyuv/rotate_row.h",
+    "include/libyuv/row.h",
+    "include/libyuv/scale.h",
+    "include/libyuv/scale_argb.h",
+    "include/libyuv/scale_row.h",
+    "include/libyuv/version.h",
+    "include/libyuv/video_common.h",
+
+    # sources.
+    "source/compare.cc",
+    "source/compare_common.cc",
+    "source/compare_gcc.cc",
+    "source/compare_win.cc",
+    "source/convert.cc",
+    "source/convert_argb.cc",
+    "source/convert_from.cc",
+    "source/convert_from_argb.cc",
+    "source/convert_jpeg.cc",
+    "source/convert_to_argb.cc",
+    "source/convert_to_i420.cc",
+    "source/cpu_id.cc",
+    "source/mjpeg_decoder.cc",
+    "source/mjpeg_validate.cc",
+    "source/planar_functions.cc",
+    "source/rotate.cc",
+    "source/rotate_any.cc",
+    "source/rotate_argb.cc",
+    "source/rotate_common.cc",
+    "source/rotate_mips.cc",
+    "source/rotate_gcc.cc",
+    "source/rotate_win.cc",
+    "source/row_any.cc",
+    "source/row_common.cc",
+    "source/row_mips.cc",
+    "source/row_gcc.cc",
+    "source/row_win.cc",
+    "source/scale.cc",
+    "source/scale_any.cc",
+    "source/scale_argb.cc",
+    "source/scale_common.cc",
+    "source/scale_mips.cc",
+    "source/scale_gcc.cc",
+    "source/scale_win.cc",
+    "source/video_common.cc",
+  ]
+
+  configs -= [ "//build/config/compiler:chromium_code" ]
+  configs += [ "//build/config/compiler:no_chromium_code" ]
+
+  public_configs = [ ":libyuv_config" ]
+
+  defines = []
+
+  if (!is_ios) {
+    defines += [ "HAVE_JPEG" ]
+  }
+
+  if (is_msan) {
+    # MemorySanitizer does not support assembly code yet.
+    # http://crbug.com/344505
+    defines += [ "LIBYUV_DISABLE_X86" ]
+  }
+
+  deps = [
+    "//third_party:jpeg",
+  ]
+
+  if (use_neon) {
+    deps += [ ":libyuv_neon" ]
+  }
+
+  if (is_nacl) {
+    # Always enable optimization under NaCl to workaround crbug.com/538243 .
+    configs -= [ "//build/config/compiler:default_optimization" ]
+    configs += [ "//build/config/compiler:optimize_max" ]
+  }
+}
+
+if (use_neon) {
+  static_library("libyuv_neon") {
+    sources = [
+      "source/compare_neon.cc",
+      "source/compare_neon64.cc",
+      "source/rotate_neon.cc",
+      "source/rotate_neon64.cc",
+      "source/row_neon.cc",
+      "source/row_neon64.cc",
+      "source/scale_neon.cc",
+      "source/scale_neon64.cc",
+    ]
+
+    public_configs = [ ":libyuv_config" ]
+    # Non-arm64 builds swap the compiler_arm_fpu config for an explicit -mfpu=neon.
+    if (current_cpu != "arm64") {
+      configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
+      cflags = [ "-mfpu=neon" ]
+    }
+  }
+}
diff --git a/libs/libyuv/CMakeLists.txt b/libs/libyuv/CMakeLists.txt
new file mode 100644
index 0000000000..f74c05f6b7
--- /dev/null
+++ b/libs/libyuv/CMakeLists.txt
@@ -0,0 +1,141 @@
+cmake_minimum_required(VERSION 2.8)
+
+# CMakeLists for libyuv
+# Originally created for "roxlu build system" to compile libyuv on windows
+# Run with -DTEST=ON to build unit tests
+option(TEST "Built unit tests" OFF)
+
+set(ly_base_dir ${CMAKE_CURRENT_LIST_DIR})
+set(ly_src_dir ${ly_base_dir}/source/)
+set(ly_inc_dir ${ly_base_dir}/include)
+set(ly_lib_name "yuv")
+
+set(ly_source_files
+  ${ly_src_dir}/compare.cc
+  ${ly_src_dir}/compare_common.cc
+  ${ly_src_dir}/compare_neon.cc
+  ${ly_src_dir}/compare_neon64.cc
+  ${ly_src_dir}/compare_gcc.cc
+  ${ly_src_dir}/compare_win.cc
+  ${ly_src_dir}/convert.cc
+  ${ly_src_dir}/convert_argb.cc
+  ${ly_src_dir}/convert_from.cc
+  ${ly_src_dir}/convert_from_argb.cc
+  ${ly_src_dir}/convert_jpeg.cc
+  ${ly_src_dir}/convert_to_argb.cc
+  ${ly_src_dir}/convert_to_i420.cc
+  ${ly_src_dir}/cpu_id.cc
+  ${ly_src_dir}/mjpeg_decoder.cc
+  ${ly_src_dir}/mjpeg_validate.cc
+  ${ly_src_dir}/planar_functions.cc
+  ${ly_src_dir}/rotate.cc
+  ${ly_src_dir}/rotate_any.cc
+  ${ly_src_dir}/rotate_argb.cc
+  ${ly_src_dir}/rotate_common.cc
+  ${ly_src_dir}/rotate_mips.cc
+  ${ly_src_dir}/rotate_neon.cc
+  ${ly_src_dir}/rotate_neon64.cc
+  ${ly_src_dir}/rotate_gcc.cc
+  ${ly_src_dir}/rotate_win.cc
+  ${ly_src_dir}/row_any.cc
+  ${ly_src_dir}/row_common.cc
+  ${ly_src_dir}/row_mips.cc
+  ${ly_src_dir}/row_neon.cc
+  ${ly_src_dir}/row_neon64.cc
+  ${ly_src_dir}/row_gcc.cc
+  ${ly_src_dir}/row_win.cc
+  ${ly_src_dir}/scale.cc
+  ${ly_src_dir}/scale_any.cc
+  ${ly_src_dir}/scale_argb.cc
+  ${ly_src_dir}/scale_common.cc
+  ${ly_src_dir}/scale_mips.cc
+  ${ly_src_dir}/scale_neon.cc
+  ${ly_src_dir}/scale_neon64.cc
+  ${ly_src_dir}/scale_gcc.cc
+  ${ly_src_dir}/scale_win.cc
+  ${ly_src_dir}/video_common.cc
+)
+
+set(ly_unittest_sources
+  ${ly_base_dir}/unit_test/basictypes_test.cc
+  ${ly_base_dir}/unit_test/color_test.cc
+  ${ly_base_dir}/unit_test/compare_test.cc
+  ${ly_base_dir}/unit_test/convert_test.cc
+  ${ly_base_dir}/unit_test/cpu_test.cc
+  ${ly_base_dir}/unit_test/math_test.cc
+  ${ly_base_dir}/unit_test/planar_test.cc
+  ${ly_base_dir}/unit_test/rotate_argb_test.cc
+  ${ly_base_dir}/unit_test/rotate_test.cc
+  ${ly_base_dir}/unit_test/scale_argb_test.cc
+  ${ly_base_dir}/unit_test/scale_test.cc
+  ${ly_base_dir}/unit_test/unit_test.cc
+  ${ly_base_dir}/unit_test/video_common_test.cc
+)
+
+set(ly_header_files
+  ${ly_inc_dir}/libyuv/basic_types.h
+  ${ly_inc_dir}/libyuv/compare.h
+  ${ly_inc_dir}/libyuv/convert.h
+  ${ly_inc_dir}/libyuv/convert_argb.h
+  ${ly_inc_dir}/libyuv/convert_from.h
+  ${ly_inc_dir}/libyuv/convert_from_argb.h
+  ${ly_inc_dir}/libyuv/cpu_id.h
+  ${ly_inc_dir}/libyuv/planar_functions.h
+  ${ly_inc_dir}/libyuv/rotate.h
+  ${ly_inc_dir}/libyuv/rotate_argb.h
+  ${ly_inc_dir}/libyuv/rotate_row.h
+  ${ly_inc_dir}/libyuv/row.h
+  ${ly_inc_dir}/libyuv/scale.h
+  ${ly_inc_dir}/libyuv/scale_argb.h
+  ${ly_inc_dir}/libyuv/scale_row.h
+  ${ly_inc_dir}/libyuv/version.h
+  ${ly_inc_dir}/libyuv/video_common.h
+  ${ly_inc_dir}/libyuv/mjpeg_decoder.h
+)
+
+include_directories(${ly_inc_dir})
+
+add_library(${ly_lib_name} STATIC ${ly_source_files})
+
+add_executable(convert ${ly_base_dir}/util/convert.cc)
+target_link_libraries(convert ${ly_lib_name})
+
+include(FindJPEG)
+if (JPEG_FOUND)
+  include_directories(${JPEG_INCLUDE_DIR})
+  target_link_libraries(convert ${JPEG_LIBRARY})
+  add_definitions(-DHAVE_JPEG)
+endif()
+
+if(TEST)
+  find_library(GTEST_LIBRARY gtest)
+  if(GTEST_LIBRARY STREQUAL "GTEST_LIBRARY-NOTFOUND")
+    set(GTEST_SRC_DIR /usr/src/gtest CACHE STRING "Location of gtest sources")
+    if(EXISTS ${GTEST_SRC_DIR}/src/gtest-all.cc)
+      message(STATUS "building gtest from sources in ${GTEST_SRC_DIR}")
+      set(gtest_sources ${GTEST_SRC_DIR}/src/gtest-all.cc)
+      add_library(gtest STATIC ${gtest_sources})
+      include_directories(${GTEST_SRC_DIR})
+      include_directories(${GTEST_SRC_DIR}/include)
+      set(GTEST_LIBRARY gtest)
+    else()
+      message(FATAL_ERROR "TEST is set but unable to find gtest library")
+    endif()
+  endif()
+
+  add_executable(libyuv_unittest ${ly_unittest_sources})
+  target_link_libraries(libyuv_unittest ${ly_lib_name} ${GTEST_LIBRARY} pthread)
+  if (JPEG_FOUND)
+    target_link_libraries(libyuv_unittest ${JPEG_LIBRARY})
+  endif()
+
+  # libyuv_unittest exists only when TEST is ON; linking it outside this block fails.
+  if(NACL AND NACL_LIBC STREQUAL "newlib")
+    target_link_libraries(libyuv_unittest glibc-compat)
+  endif()
+  target_link_libraries(libyuv_unittest gflags)
+endif()
+
+install(TARGETS ${ly_lib_name} DESTINATION lib)
+install(FILES ${ly_header_files} DESTINATION include/libyuv)
+install(FILES ${ly_inc_dir}/libyuv.h DESTINATION include/)
diff --git a/libs/libyuv/DEPS b/libs/libyuv/DEPS
new file mode 100644
index 0000000000..f53bb6be1a
--- /dev/null
+++ b/libs/libyuv/DEPS
@@ -0,0 +1,42 @@
+vars = {
+  # Override root_dir in your .gclient's custom_vars to specify a custom root
+  # folder name.
+  'root_dir': 'libyuv',
+  'extra_gyp_flag': '-Dextra_gyp_flag=0',
+  'chromium_git': 'https://chromium.googlesource.com',
+
+  # Roll the Chromium Git hash to pick up newer versions of all the
+  # dependencies and tools linked to in setup_links.py.
+  'chromium_revision': '3c455872750c9d0f74266b04f97701a516ac9075',
+}
+
+# NOTE: Prefer revision numbers to tags for svn deps. Use http rather than
+# https; the latter can cause problems for users behind proxies.
+deps = {
+  Var('root_dir') + '/third_party/gflags/src':
+    Var('chromium_git') + '/external/gflags/src@e7390f9185c75f8d902c05ed7d20bb94eb914d0c', # from svn revision 82
+}
+
+# Define rules for which include paths are allowed in our source.
+include_rules = [ '+gflags' ]
+
+hooks = [
+  {
+    # Clone chromium and its deps.
+    'name': 'sync chromium',
+    'pattern': '.',
+    'action': ['python', '-u', Var('root_dir') + '/sync_chromium.py',
+               '--target-revision', Var('chromium_revision')],
+  },
+  {
+    # Create links to shared dependencies in Chromium.
+    'name': 'setup_links',
+    'pattern': '.',
+    'action': ['python', Var('root_dir') + '/setup_links.py'],
+  },
+  {
+    # A change to a .gyp, .gypi, or to GYP itself should run the generator.
+    'pattern': '.',
+    'action': ['python', Var('root_dir') + '/gyp_libyuv'],
+  },
+]
diff --git a/libs/libyuv/LICENSE b/libs/libyuv/LICENSE
new file mode 100644
index 0000000000..c911747a6b
--- /dev/null
+++ b/libs/libyuv/LICENSE
@@ -0,0 +1,29 @@
+Copyright 2011 The LibYuv Project Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+
+  * Neither the name of Google nor the names of its contributors may
+    be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/libs/libyuv/LICENSE_THIRD_PARTY b/libs/libyuv/LICENSE_THIRD_PARTY
new file mode 100644
index 0000000000..a71591e771
--- /dev/null
+++ b/libs/libyuv/LICENSE_THIRD_PARTY
@@ -0,0 +1,8 @@
+This source tree contains third party source code which is governed by third
+party licenses. This file contains references to files which are under other
+licenses than the one provided in the LICENSE file in the root of the source
+tree.
+
+Files governed by third party licenses:
+source/x86inc.asm
+
diff --git a/libs/libyuv/OWNERS b/libs/libyuv/OWNERS
new file mode 100644
index 0000000000..2db52d3079
--- /dev/null
+++ b/libs/libyuv/OWNERS
@@ -0,0 +1,13 @@
+fbarchard@chromium.org
+magjed@chromium.org
+torbjorng@chromium.org
+
+per-file *.gyp=kjellander@chromium.org
+per-file *.gn=kjellander@chromium.org
+per-file .gitignore=*
+per-file AUTHORS=*
+per-file DEPS=*
+per-file PRESUBMIT.py=kjellander@chromium.org
+per-file gyp_libyuv.py=kjellander@chromium.org
+per-file setup_links.py=*
+per-file sync_chromium.py=kjellander@chromium.org
diff --git a/libs/libyuv/PATENTS b/libs/libyuv/PATENTS
new file mode 100644
index 0000000000..64aa5c90d8
--- /dev/null
+++ b/libs/libyuv/PATENTS
@@ -0,0 +1,24 @@
+Additional IP Rights Grant (Patents)
+
+"This implementation" means the copyrightable works distributed by
+Google as part of the LibYuv code package.
+
+Google hereby grants to you a perpetual, worldwide, non-exclusive,
+no-charge, irrevocable (except as stated in this section) patent
+license to make, have made, use, offer to sell, sell, import,
+transfer, and otherwise run, modify and propagate the contents of this
+implementation of the LibYuv code package, where such license applies
+only to those patent claims, both currently owned by Google and
+acquired in the future, licensable by Google that are necessarily
+infringed by this implementation of the LibYuv code package. This
+grant does not include claims that would be infringed only as a
+consequence of further modification of this implementation. If you or
+your agent or exclusive licensee institute or order or agree to the
+institution of patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that this
+implementation of the LibYuv code package or any code incorporated
+within this implementation of the LibYuv code package constitutes
+direct or contributory patent infringement, or inducement of patent
+infringement, then any patent rights granted to you under this License
+for this implementation of the LibYuv code package shall terminate as
+of the date such litigation is filed.
\ No newline at end of file
diff --git a/libs/libyuv/PRESUBMIT.py b/libs/libyuv/PRESUBMIT.py
new file mode 100755
index 0000000000..61d92aea17
--- /dev/null
+++ b/libs/libyuv/PRESUBMIT.py
@@ -0,0 +1,51 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import re
+import sys
+
+
+def GetDefaultTryConfigs(bots=None):
+  """Returns {'tryserver.libyuv': {bot: []}} with one entry per bot in [bots].
+
+  Every bot maps to an empty list of tests, since we want to run all tests by
+  default on all our trybots. A missing or empty [bots] yields no bot entries.
+  """
+  return { 'tryserver.libyuv': dict((bot, []) for bot in (bots or []))}
+
+
+# pylint: disable=W0613
+def GetPreferredTryMasters(project, change):
+  """Returns every libyuv trybot unless no path, or only OWNERS paths, changed."""
+  touched = change.LocalPaths()
+  if not touched or all(re.search(r'[\\/]OWNERS$', f) for f in touched):
+    return {}
+  return GetDefaultTryConfigs([
+    'win',
+    'win_rel',
+    'win_x64_rel',
+    'mac',
+    'mac_rel',
+    'ios',
+    'ios_rel',
+    'ios_arm64',
+    'ios_arm64_rel',
+    'mac_asan',
+    'linux',
+    'linux_rel',
+    'linux_memcheck',
+    'linux_tsan2',
+    'linux_asan',
+    'linux_msan',
+    'linux_ubsan',
+    'linux_ubsan_vptr',
+    'android',
+    'android_rel',
+    'android_clang',
+    'android_arm64',
+  ])
diff --git a/libs/libyuv/README.chromium b/libs/libyuv/README.chromium
new file mode 100644
index 0000000000..d5401f6b14
--- /dev/null
+++ b/libs/libyuv/README.chromium
@@ -0,0 +1,8 @@
+Name: libyuv
+URL: http://code.google.com/p/libyuv/
+Version: 1577
+License: BSD
+License File: LICENSE
+
+Description:
+libyuv is an open source project that includes YUV conversion and scaling functionality.
diff --git a/libs/libyuv/README.md b/libs/libyuv/README.md
new file mode 100644
index 0000000000..7b11325d37
--- /dev/null
+++ b/libs/libyuv/README.md
@@ -0,0 +1,18 @@
+**libyuv** is an open source project that includes YUV scaling and conversion functionality.
+
+* Scale YUV to prepare content for compression, with point, bilinear or box filter.
+* Convert to YUV from webcam formats.
+* Convert from YUV to formats for rendering/effects.
+* Rotate by 90/180/270 degrees to adjust for mobile devices in portrait mode.
+* Optimized for SSE2/SSSE3/AVX2 on x86/x64.
+* Optimized for Neon on Arm.
+* Optimized for DSP R2 on Mips.
+
+### Development
+
+See [Getting started] [1] for instructions on how to get started developing.
+
+You can also browse the [docs directory] [2] for more documentation.
+
+[1]: docs/getting_started.md
+[2]: docs/
diff --git a/libs/libyuv/all.gyp b/libs/libyuv/all.gyp
new file mode 100644
index 0000000000..88a7484271
--- /dev/null
+++ b/libs/libyuv/all.gyp
@@ -0,0 +1,21 @@
+# Copyright 2013 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# all.gyp and All target are for benefit of android gyp build.
+{
+  'targets': [
+    {
+      'target_name': 'All',
+      'type': 'none',
+      'dependencies': [
+        'libyuv.gyp:*',
+        'libyuv_test.gyp:*',
+      ],
+    },
+  ],
+}
diff --git a/libs/libyuv/chromium/.gclient b/libs/libyuv/chromium/.gclient
new file mode 100644
index 0000000000..330ce6365e
--- /dev/null
+++ b/libs/libyuv/chromium/.gclient
@@ -0,0 +1,24 @@
+solutions = [{
+  'name': 'src',
+  'url': 'https://chromium.googlesource.com/chromium/src.git',
+  'deps_file': '.DEPS.git',
+  'managed': False,
+  'custom_deps': {
+    # Skip syncing some large dependencies Libyuv will never need.
+    'src/chrome/tools/test/reference_build/chrome_linux': None,
+    'src/chrome/tools/test/reference_build/chrome_mac': None,
+    'src/chrome/tools/test/reference_build/chrome_win': None,
+    'src/native_client': None,
+    'src/third_party/cld_2/src': None,
+    'src/third_party/ffmpeg': None,
+    'src/third_party/hunspell_dictionaries': None,
+    'src/third_party/liblouis/src': None,
+    'src/third_party/pdfium': None,
+    'src/third_party/skia': None,
+    'src/third_party/trace-viewer': None,
+    'src/third_party/webrtc': None,
+  },
+  'safesync_url': ''
+}]
+
+cache_dir = None
diff --git a/libs/libyuv/chromium/README b/libs/libyuv/chromium/README
new file mode 100644
index 0000000000..127f4b520f
--- /dev/null
+++ b/libs/libyuv/chromium/README
@@ -0,0 +1,5 @@
+This .gclient file is used to download a copy of Chromium.
+Libyuv uses the Chromium build toolchain and a number of shared
+dependencies by creating symlinks to folders in this checkout,
+using the ../setup_links.py script.
+
diff --git a/libs/libyuv/codereview.settings b/libs/libyuv/codereview.settings
new file mode 100644
index 0000000000..9b5380694e
--- /dev/null
+++ b/libs/libyuv/codereview.settings
@@ -0,0 +1,12 @@
+# This file is used by gcl to get repository specific information.
+CODE_REVIEW_SERVER: codereview.chromium.org
+#CC_LIST:
+VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/
+#STATUS:
+FORCE_HTTPS_COMMIT_URL: True
+PROJECT: libyuv
+TRY_ON_UPLOAD: False
+TRYSERVER_ROOT: src
+TRYSERVER_SVN_URL: svn://svn.chromium.org/chrome-try/try-libyuv
+#GITCL_PREUPLOAD:
+#GITCL_PREDCOMMIT:
diff --git a/libs/libyuv/docs/environment_variables.md b/libs/libyuv/docs/environment_variables.md
new file mode 100644
index 0000000000..bc5e2f6fdb
--- /dev/null
+++ b/libs/libyuv/docs/environment_variables.md
@@ -0,0 +1,32 @@
+# Introduction
+
+For test purposes, environment variables can be set to control libyuv behavior.  These should only be used for testing, to narrow down bugs or to test performance.
+
+# CPU
+
+By default the cpu is detected and the most advanced form of SIMD is used.  But you can disable instruction sets selectively, or completely, falling back on C code.  Set the variable to 1 to disable the specified instruction set.
+
+    LIBYUV_DISABLE_ASM
+    LIBYUV_DISABLE_X86
+    LIBYUV_DISABLE_SSE2
+    LIBYUV_DISABLE_SSSE3
+    LIBYUV_DISABLE_SSE41
+    LIBYUV_DISABLE_SSE42
+    LIBYUV_DISABLE_AVX
+    LIBYUV_DISABLE_AVX2
+    LIBYUV_DISABLE_AVX3
+    LIBYUV_DISABLE_ERMS
+    LIBYUV_DISABLE_FMA3
+    LIBYUV_DISABLE_DSPR2
+    LIBYUV_DISABLE_NEON
+
+# Test Width/Height/Repeat
+
+The unittests default to a small image (32x18) to run fast.  This can be set by environment variable to test a specific resolution.
+You can also repeat the test a specified number of iterations, allowing benchmarking and profiling.
+
+    set LIBYUV_WIDTH=1280
+    set LIBYUV_HEIGHT=720
+    set LIBYUV_REPEAT=999
+    set LIBYUV_FLAGS=-1
+    set LIBYUV_CPU_INFO=-1
diff --git a/libs/libyuv/docs/filtering.md b/libs/libyuv/docs/filtering.md
new file mode 100644
index 0000000000..8696976e8a
--- /dev/null
+++ b/libs/libyuv/docs/filtering.md
@@ -0,0 +1,196 @@
+# Introduction
+
+This document discusses the current state of filtering in libyuv. The emphasis is on maximum performance while avoiding memory exceptions, with a minimal amount of code/complexity.  See future work at end.
+
+# LibYuv Filter Subsampling
+
+There are 2 challenges with subsampling
+
+1. centering of samples, which involves clamping on edges
+2. clipping a source region
+
+Centering depends on scale factor and filter mode.
+
+# Down Sampling
+
+If scaling down, the stepping rate is always src_width / dst_width.
+
+    dx = src_width / dst_width;
+
+e.g. If scaling from 1280x720 to 640x360, the step thru the source will be 2.0, stepping over 2 pixels of source for each pixel of destination.
+
+Centering, depends on filter mode.
+
+**Point** downsampling takes the middle pixel.
+
+    x = dx >> 1;
+
+For odd scale factors (e.g. 3x down) this is exactly the middle.  For even scale factors, this rounds up and takes the pixel to the right of center.  e.g. scale of 4x down will take pixel 2.
+
+**Bilinear** filter, uses the 2x2 pixels in the middle.
+
+    x = dx / 2 - 0.5;
+
+For odd scale factors (e.g. 3x down) this is exactly the middle, and point sampling is used.
+For even scale factors, this evenly filters the middle 2x2 pixels.  e.g. 4x down will filter pixels 1,2 at 50% in both directions.
+
+**Box** filter averages the entire box so sampling starts at 0.
+
+    x = 0;
+
+For a scale factor of 2x down, this is equivalent to bilinear.
+
+# Up Sampling
+
+**Point** upsampling use stepping rate of src_width / dst_width and a starting coordinate of 0.
+
+    x = 0;
+    dx = src_width / dst_width;
+
+e.g. If scaling from 640x360 to 1280x720 the step thru the source will be 0.5, stepping half a pixel of source for each pixel of destination. Each pixel is replicated by the scale factor.
+
+**Bilinear** filter stretches such that the first pixel of source maps to the first pixel of destination, and the last pixel of source maps to the last pixel of destination.
+
+    x = 0;
+    dx = (src_width - 1) / (dst_width - 1);
+
+This method is not technically correct, and will likely change in the future.
+
+* It is inconsistent with the bilinear down sampler.  The same method could be used for down sampling, and then it would be more reversible, but that would prevent specialized 2x down sampling.
+* Although centered, the image is slightly magnified.
+* The filtering was changed in early 2013 - previously it used:
+
+        x = 0;
+        dx = src_width / dst_width;
+
+Which is the correct scale factor, but shifted the image left, and extruded the last pixel.  The reason for the change was to remove the extruding code from the low level row functions, allowing 3 functions to share the same row functions - ARGBScale, I420Scale, and ARGBInterpolate.  Then the one function was ported to many cpu variations: SSE2, SSSE3, AVX2, Neon and 'Any' version for any number of pixels and alignment.  The function is also specialized for 0,25,50,75%.
+
+The above still has the potential to read the last pixel 100% and last pixel + 1 0%, which may cause a memory exception.  So the left pixel goes to a fraction less than the last pixel, but filters in the minimum amount of it, and the maximum of the last pixel.
+
+    dx = FixedDiv((src_width << 16) - 0x00010001, (dst << 16) - 0x00010000);
+
+**Box** filter for upsampling switches over to Bilinear.
+
+# Scale snippet:
+
+    #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
+    #define FIXEDDIV1(src, dst) FixedDiv((src << 16) - 0x00010001, \
+                                         (dst << 16) - 0x00010000);
+
+    // Compute the starting coordinates (x, y) and per-pixel steps (dx, dy).
+    void ScaleSlope(int src_width, int src_height,
+                    int dst_width, int dst_height,
+                    FilterMode filtering,
+                    int* x, int* y, int* dx, int* dy) {
+      assert(x != NULL);
+      assert(y != NULL);
+      assert(dx != NULL);
+      assert(dy != NULL);
+      assert(src_width != 0);
+      assert(src_height != 0);
+      assert(dst_width > 0);
+      assert(dst_height > 0);
+      if (filtering == kFilterBox) {
+        // Box filter: plain src / dst step, and sampling starts at 0.
+        *dx = FixedDiv(Abs(src_width), dst_width);
+        *dy = FixedDiv(src_height, dst_height);
+        *x = 0;
+        *y = 0;
+      } else if (filtering == kFilterBilinear) {
+        // Scale step for bilinear sampling renders last pixel once for upsample.
+        if (dst_width <= Abs(src_width)) {
+          *dx = FixedDiv(Abs(src_width), dst_width);
+          *x = CENTERSTART(*dx, -32768);
+        } else if (dst_width > 1) {
+          *dx = FIXEDDIV1(Abs(src_width), dst_width);
+          *x = 0;
+        }
+        if (dst_height <= src_height) {
+          *dy = FixedDiv(src_height,  dst_height);
+          *y = CENTERSTART(*dy, -32768);  // -32768 = -0.5 to center bilinear.
+        } else if (dst_height > 1) {
+          *dy = FIXEDDIV1(src_height, dst_height);
+          *y = 0;
+        }
+      } else if (filtering == kFilterLinear) {
+        // Linear: horizontal setup matches bilinear; vertical start is dy >> 1.
+        if (dst_width <= Abs(src_width)) {
+          *dx = FixedDiv(Abs(src_width), dst_width);
+          *x = CENTERSTART(*dx, -32768);
+        } else if (dst_width > 1) {
+          *dx = FIXEDDIV1(Abs(src_width), dst_width);
+          *x = 0;
+        }
+        *dy = FixedDiv(src_height, dst_height);
+        *y = *dy >> 1;
+      } else {
+        // Scale step for point sampling duplicates all pixels equally.
+        *dx = FixedDiv(Abs(src_width), dst_width);
+        *dy = FixedDiv(src_height, dst_height);
+        *x = CENTERSTART(*dx, 0);
+        *y = CENTERSTART(*dy, 0);
+      }
+      // Negative src_width means horizontally mirror.
+      if (src_width < 0) {
+        *x += (dst_width - 1) * *dx;
+        *dx = -*dx;
+        src_width = -src_width;
+      }
+    }
+
+# Future Work
+
+Point sampling should ideally be the same as bilinear, but pixel by pixel, round to nearest neighbor.  But as is, it is reversible and exactly matches ffmpeg at all scale factors, both up and down.  The scale factor is
+
+    dx = src_width / dst_width;
+
+The step value is centered for down sample:
+
+    x = dx / 2;
+
+Or starts at 0 for upsample.
+
+    x = 0;
+
+Bilinear filtering is currently correct for down sampling, but not for upsampling.
+Upsampling is stretching the first and last pixel of source, to the first and last pixel of destination.
+
+    dx = (src_width - 1) / (dst_width - 1);
+    x = 0;
+
+It should be stretching such that the first pixel is centered in the middle of the scale factor, to match the pixel that would be sampled for down sampling by the same amount.  And same on last pixel.
+
+    dx = src_width / dst_width;
+    x = dx / 2 - 0.5;
+
+This would start at -0.5 and go to last pixel + 0.5, sampling 50% from last pixel + 1.
+Then clamping would be needed.  On GPUs there are numerous ways to clamp.
+
+1. Clamp the coordinate to the edge of the texture, duplicating the first and last pixel.
+2. Blend with a constant color, such as transparent black.  Typically best for fonts.
+3. Mirror the UV coordinate, which is similar to clamping.  Good for continuous tone images.
+4. Wrap the coordinate, for texture tiling.
+5. Allow the coordinate to index beyond the image, which may be the correct data if sampling a subimage.
+6. Extrapolate the edge based on the previous pixel.  pixel -0.5 is computed from slope of pixel 0 and 1.
+
+Some of these are computational, even for a GPU, which is one reason textures are sometimes limited to power of 2 sizes.
+We do care about the clipping case, where allowing coordinates to become negative and index pixels before the image is the correct data.  But normally for simple scaling, we want to clamp to the edge pixel.  For example, if bilinear scaling from 3x3 to 30x30, we’d essentially want 10 pixels of each of the original 3 pixels.  But we want the original pixels to land in the middle of each 10 pixels, at offsets 5, 15 and 25.  There would be filtering between 5 and 15 between the original pixels 0 and 1.  And filtering between 15 and 25 from original pixels 1 and 2.  The first 5 pixels are clamped to pixel 0 and the last 5 pixels are clamped to pixel 2.
+The easiest way to implement this is to copy the original 3 pixels to a buffer, and duplicate the first and last pixels.  0,1,2 becomes 0, 0,1,2, 2.  Then implement a filtering without clamping.  We call this source extruding.  It's only necessary on up sampling, since down sampler will always have valid surrounding pixels.
+Extruding is practical when the image is already copied to a temporary buffer.   It could be done to the original image, as long as the original memory is restored, but valgrind and/or memory protection would disallow this, so it requires a memcpy to a temporary buffer, which may hurt performance.  The memcpy has a performance advantage, from a cache point of view, that can actually make this technique faster, depending on hardware characteristics.
+Vertical extrusion can be done with a memcpy of the first/last row, or clamping a pointer.
+
+
+The other way to implement clamping is to handle the edges with a memset.  e.g. Read first source pixel and memset the first 5 pixels.  Filter pixels 0,1,2 to 5 to 25.  Read last pixel and memset the last 5 pixels.  Blur is implemented this way, with 3 loops per row - left, middle and right.
+
+Box filter is only used for 2x down sample or more.  It's based on integer sized boxes.  Technically it should be filtered edges, but that's substantially slower (roughly 100x), and at that point you may as well do a cubic filter which is more correct.
+
+Box filter currently sums rows into a row buffer.  It does this with
+
+Mirroring will use the same slope as normal, but with a negative.
+The starting coordinate needs to consider the scale factor and filter.  e.g. box filter of 30x30 to 3x3 with mirroring would use -10 for step, but x = 20.  width (30) - dx.
+
+Step needs to be accurate, so it uses an integer divide.  This is as much as 5% of the profile.  An approximated divide is substantially faster, but the inaccuracy causes stepping beyond the original image boundaries.  3 general solutions:
+
+1. copy image to buffer with padding.  allows for small errors in stepping.
+2. hash the divide, so common values are quickly found.
+3. change api so caller provides the slope.
diff --git a/libs/libyuv/docs/formats.md b/libs/libyuv/docs/formats.md
new file mode 100644
index 0000000000..a7cfed8218
--- /dev/null
+++ b/libs/libyuv/docs/formats.md
@@ -0,0 +1,133 @@
+# Introduction
+
+Formats (FOURCC) supported by libyuv are detailed here.
+
+# Core Formats
+
+There are 2 core formats supported by libyuv - I420 and ARGB.  All YUV formats can be converted to/from I420.  All RGB formats can be converted to/from ARGB.
+
+Filtering functions such as scaling and planar functions work on I420 and/or ARGB.
+
+# OSX Core Media Pixel Formats
+
+This is how OSX formats map to libyuv
+
+    enum {
+      kCMPixelFormat_32ARGB          = 32,      FOURCC_BGRA
+      kCMPixelFormat_32BGRA          = 'BGRA',  FOURCC_ARGB
+      kCMPixelFormat_24RGB           = 24,      FOURCC_RAW
+      kCMPixelFormat_16BE555         = 16,      Not supported.
+      kCMPixelFormat_16BE565         = 'B565',  Not supported.
+      kCMPixelFormat_16LE555         = 'L555',  FOURCC_RGBO
+      kCMPixelFormat_16LE565         = 'L565',  FOURCC_RGBP
+      kCMPixelFormat_16LE5551        = '5551',  FOURCC_RGBO
+      kCMPixelFormat_422YpCbCr8      = '2vuy',  FOURCC_UYVY
+      kCMPixelFormat_422YpCbCr8_yuvs = 'yuvs',  FOURCC_YUY2
+      kCMPixelFormat_444YpCbCr8      = 'v308',  FOURCC_I444 ?
+      kCMPixelFormat_4444YpCbCrA8    = 'v408',  Not supported.
+      kCMPixelFormat_422YpCbCr16     = 'v216',  Not supported.
+      kCMPixelFormat_422YpCbCr10     = 'v210',  FOURCC_V210 previously.  Removed now.
+      kCMPixelFormat_444YpCbCr10     = 'v410',  Not supported.
+      kCMPixelFormat_8IndexedGray_WhiteIsZero = 0x00000028,  Not supported.
+    };
+
+
+# FOURCC (Four Character Code) List
+
+The following is extracted from video_common.h as a complete list of formats supported by libyuv.
+
+    enum FourCC {
+      // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+      FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+      FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+      FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+      FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+      FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+      FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+      FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+      FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+      FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+      // 2 Secondary YUV formats: row biplanar.
+      FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+      FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+
+      // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+      FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+      FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+      FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+      FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+      FOURCC_RAW  = FOURCC('r', 'a', 'w', ' '),
+      FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+      FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
+      FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
+      FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
+
+      // 4 Secondary RGB formats: 4 Bayer Patterns.
+      FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+      FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+      FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+      FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+      // 1 Primary Compressed YUV format.
+      FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+      // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+      FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+      FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+      FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+      FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
+      FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+      FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+
+      // 14 Auxiliary aliases.  CanonicalFourCC() maps these to canonical fourcc.
+      FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
+      FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'),  // Alias for I422.
+      FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'),  // Alias for I444.
+      FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2.
+      FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac.
+      FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY.
+      FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY on Mac.
+      FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG.
+      FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'),  // Alias for MJPG on Mac.
+      FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR.
+      FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW.
+      FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG.
+      FOURCC_CM32 = FOURCC(0, 0, 0, 32),  // Alias for BGRA kCMPixelFormat_32ARGB
+      FOURCC_CM24 = FOURCC(0, 0, 0, 24),  // Alias for RAW kCMPixelFormat_24RGB
+      FOURCC_L555 = FOURCC('L', '5', '5', '5'),  // Alias for RGBO.
+      FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
+      FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
+
+      // 1 Auxiliary compressed YUV format set aside for capturer.
+      FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+# The ARGB FOURCC
+
+There are 4 ARGB layouts - ARGB, BGRA, ABGR and RGBA.  ARGB is most common by far, used for screen formats, and windows webcam drivers.
+
+The fourcc describes the order of channels in a ***register***.
+
+A fourcc provided by a capturer can be thought of as a string, e.g. "ARGB".
+
+On little endian machines, as an int, this would have 'A' in the lowest byte.  The FOURCC macro reverses the order:
+
+    #define FOURCC(a, b, c, d) (((uint32)(a)) | ((uint32)(b) << 8) | ((uint32)(c) << 16) | ((uint32)(d) << 24))
+
+So the "ARGB" string, read as an uint32, is
+
+    FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B')
+
+If you were to read ARGB pixels as uint32's, the alpha would be in the high byte, and the blue in the lowest byte.  In memory, these are stored little endian, so 'B' is first, then 'G', 'R' and 'A' last.
+
+When calling conversion functions, the names match the FOURCC, so in this case it would be I420ToARGB().
+
+All formats can be converted to/from ARGB.
+
+Most 'planar_functions' work on ARGB (e.g. ARGBBlend).
+
+Some are channel order agnostic (e.g. ARGBScale).
+
+Some functions are symmetric (e.g. ARGBToBGRA is the same as BGRAToARGB, so it's a macro).
+
+ARGBBlend expects preattenuated ARGB. The R,G,B are premultiplied by alpha.  Other functions don't care.
diff --git a/libs/libyuv/docs/getting_started.md b/libs/libyuv/docs/getting_started.md
new file mode 100644
index 0000000000..68e6d49432
--- /dev/null
+++ b/libs/libyuv/docs/getting_started.md
@@ -0,0 +1,420 @@
+# Getting Started
+
+How to get and build the libyuv code.
+
+## Pre-requisites
+
+You'll need to have depot tools installed: https://www.chromium.org/developers/how-tos/install-depot-tools
+Refer to chromium instructions for each platform for other prerequisites.
+
+## Getting the Code
+
+Create a working directory, enter it, and run:
+
+    gclient config https://chromium.googlesource.com/libyuv/libyuv
+    gclient sync
+
+
+Then you'll get a .gclient file like:
+
+    solutions = [
+      { "name"        : "libyuv",
+        "url"         : "https://chromium.googlesource.com/libyuv/libyuv",
+        "deps_file"   : "DEPS",
+        "managed"     : True,
+        "custom_deps" : {
+        },
+        "safesync_url": "",
+      },
+    ];
+
+
+For iOS add `;target_os=['ios'];` to your OSX .gclient and run `GYP_DEFINES="OS=ios" gclient sync`.
+
+Browse the Git repository: https://chromium.googlesource.com/libyuv/libyuv/+/master
+
+### Android
+For Android add `;target_os=['android'];` to your Linux .gclient
+
+
+    solutions = [
+      { "name"        : "libyuv",
+        "url"         : "https://chromium.googlesource.com/libyuv/libyuv",
+        "deps_file"   : "DEPS",
+        "managed"     : True,
+        "custom_deps" : {
+        },
+        "safesync_url": "",
+      },
+    ];
+    target_os = ["android", "unix"];
+
+Then run:
+
+    export GYP_DEFINES="OS=android"
+    gclient sync
+
+Caveat: There's an error with Google Play services updates.  If you get the error "Your version of the Google Play services library is not up to date", run the following:
+    cd chromium/src
+    ./build/android/play_services/update.py download
+    cd ../..
+
+For Windows the gclient sync must be done from an Administrator command prompt.
+
+The sync will generate native build files for your environment using gyp (Windows: Visual Studio, OSX: XCode, Linux: make). This generation can also be forced manually: `gclient runhooks`
+
+To get just the source (not buildable):
+    git clone https://chromium.googlesource.com/libyuv/libyuv
+
+
+## Building the Library and Unittests
+
+### Windows
+
+    set GYP_DEFINES=target_arch=ia32
+    call python gyp_libyuv -fninja -G msvs_version=2013
+    ninja -j7 -C out\Release
+    ninja -j7 -C out\Debug
+
+    set GYP_DEFINES=target_arch=x64
+    call python gyp_libyuv -fninja -G msvs_version=2013
+    ninja -C out\Debug_x64
+    ninja -C out\Release_x64
+
+#### Building with clangcl
+    set GYP_DEFINES=clang=1 target_arch=ia32 libyuv_enable_svn=1
+    set LLVM_REPO_URL=svn://svn.chromium.org/llvm-project
+    call python tools\clang\scripts\update.py
+    call python gyp_libyuv -fninja libyuv_test.gyp
+    ninja -C out\Debug
+    ninja -C out\Release
+
+### OSX
+
+Clang 64 bit shown. Remove `clang=1` for GCC and change x64 to ia32 for 32 bit.
+
+    GYP_DEFINES="clang=1 target_arch=x64" ./gyp_libyuv
+    ninja -j7 -C out/Debug
+    ninja -j7 -C out/Release
+
+    GYP_DEFINES="clang=1 target_arch=ia32" ./gyp_libyuv
+    ninja -j7 -C out/Debug
+    ninja -j7 -C out/Release
+
+### iOS
+http://www.chromium.org/developers/how-tos/build-instructions-ios
+
+Add to .gclient last line: `target_os=['ios'];`
+
+armv7
+
+    GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+    ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+    ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+arm64
+
+    GYP_DEFINES="OS=ios target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+    ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+    ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+both armv7 and arm64 (fat)
+
+    GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=both" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv
+    ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest
+    ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest
+
+simulator
+
+    GYP_DEFINES="OS=ios target_arch=ia32 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_sim" ./gyp_libyuv
+    ninja -j7 -C out_sim/Debug-iphonesimulator libyuv_unittest
+    ninja -j7 -C out_sim/Release-iphonesimulator libyuv_unittest
+
+### Android
+https://code.google.com/p/chromium/wiki/AndroidBuildInstructions
+
+Add to .gclient last line: `target_os=['android'];`
+
+armv7
+
+    GYP_DEFINES="OS=android" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+    ninja -j7 -C out/Release libyuv_unittest_apk
+
+arm64
+
+    GYP_DEFINES="OS=android target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+    ninja -j7 -C out/Release libyuv_unittest_apk
+
+ia32
+
+    GYP_DEFINES="OS=android target_arch=ia32" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+    ninja -j7 -C out/Release libyuv_unittest_apk
+
+    GYP_DEFINES="OS=android target_arch=ia32 android_full_debug=1" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+
+mipsel
+
+    GYP_DEFINES="OS=android target_arch=mipsel" GYP_CROSSCOMPILE=1 ./gyp_libyuv
+    ninja -j7 -C out/Debug libyuv_unittest_apk
+    ninja -j7 -C out/Release libyuv_unittest_apk
+
+arm64 disassembly:
+
+    third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o
+
+Running tests:
+
+    util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
+
+Running test as benchmark:
+
+    util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1"
+
+Running test with C code:
+
+    util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=0 --libyuv_cpu_info=0"
+
+#### Building with GN
+
+    call gn gen out/Release "--args=is_debug=false target_cpu=\"x86\""
+    call gn gen out/Debug "--args=is_debug=true target_cpu=\"x86\""
+    ninja -C out/Release
+    ninja -C out/Debug
+
+### Linux
+
+    GYP_DEFINES="target_arch=x64" ./gyp_libyuv
+    ninja -j7 -C out/Debug
+    ninja -j7 -C out/Release
+
+    GYP_DEFINES="target_arch=ia32" ./gyp_libyuv
+    ninja -j7 -C out/Debug
+    ninja -j7 -C out/Release
+
+#### CentOS
+
+On CentOS 32 bit the following work around allows a sync:
+
+    export GYP_DEFINES="host_arch=ia32"
+    gclient sync
+
+### Windows Shared Library
+
+Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'.
+
+    gclient runhooks
+
+After this command, follow the library build instructions above.
+
+If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows
+
+
+### Build targets
+
+    ninja -C out/Debug libyuv
+    ninja -C out/Debug libyuv_unittest
+    ninja -C out/Debug compare
+    ninja -C out/Debug convert
+    ninja -C out/Debug psnr
+    ninja -C out/Debug cpuid
+
+
+## Building the Library with make
+
+### Linux
+
+    make -j7 V=1 -f linux.mk
+    make -j7 V=1 -f linux.mk clean
+    make -j7 V=1 -f linux.mk CXX=clang++
+
+## Building the Library with cmake
+
+Install cmake: http://www.cmake.org/
+
+Default debug build:
+
+    mkdir out
+    cd out
+    cmake ..
+    cmake --build .
+
+Release build/install
+
+    mkdir out
+    cd out
+    cmake -DCMAKE_INSTALL_PREFIX="/usr/lib" -DCMAKE_BUILD_TYPE="Release" ..
+    cmake --build . --config Release
+    sudo cmake --build . --target install --config Release
+
+### Windows 8 Phone
+
+Pre-requisite:
+
+* Install Visual Studio 2012 and Arm to your environment.<br>
+
+Then:
+
+    call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+
+or with Visual Studio 2013:
+
+    call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+    nmake /f winarm.mk clean
+    nmake /f winarm.mk
+
+### Windows Shared Library
+
+Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'. Then run this.
+
+    gclient runhooks
+
+After this command, follow the library build instructions above.
+
+If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows
+
+### 64 bit Windows
+
+    set GYP_DEFINES=target_arch=x64
+    gclient runhooks V=1
+
+### ARM Linux
+
+    export GYP_DEFINES="target_arch=arm"
+    export CROSSTOOL=<path>/arm-none-linux-gnueabi
+    export CXX=$CROSSTOOL-g++
+    export CC=$CROSSTOOL-gcc
+    export AR=$CROSSTOOL-ar
+    export AS=$CROSSTOOL-as
+    export RANLIB=$CROSSTOOL-ranlib
+    gclient runhooks
+
+## Running Unittests
+
+### Windows
+
+    out\Release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter="*"
+
+### OSX
+
+    out/Release/libyuv_unittest --gtest_filter="*"
+
+### Linux
+
+    out/Release/libyuv_unittest --gtest_filter="*"
+
+Replace --gtest_filter="*" with specific unittest to run.  May include wildcards. e.g.
+
+    out/Release/libyuv_unittest --gtest_filter=libyuvTest.I420ToARGB_Opt
+
+## CPU Emulator tools
+
+### Intel SDE (Software Development Emulator)
+
+Pre-requisite: Install IntelSDE for Windows: http://software.intel.com/en-us/articles/intel-software-development-emulator
+
+Then run:
+
+    c:\intelsde\sde -hsw -- out\release\libyuv_unittest.exe --gtest_filter=*
+
+
+## Memory tools
+
+### Running Dr Memory memcheck for Windows
+
+Pre-requisite: Install Dr Memory for Windows and add it to your path: http://www.drmemory.org/docs/page_install_windows.html
+
+    set GYP_DEFINES=build_for_tool=drmemory target_arch=ia32
+    call python gyp_libyuv -fninja -G msvs_version=2013
+    ninja -C out\Debug
+    drmemory out\Debug\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*
+
+### Running UBSan
+
+See Chromium instructions for sanitizers: https://www.chromium.org/developers/testing/undefinedbehaviorsanitizer
+
+Sanitizers available: TSan, MSan, ASan, UBSan, LSan
+
+    GYP_DEFINES='ubsan=1' gclient runhooks
+    ninja -C out/Release
+
+### Running Valgrind memcheck
+
+Memory errors and race conditions can be found by running tests under special memory tools. [Valgrind] [1] is an instrumentation framework for building dynamic analysis tools. Various tests and profilers are built upon it to find memory handling errors and memory leaks, for instance.
+
+[1]: http://valgrind.org
+
+    solutions = [
+      { "name"        : "libyuv",
+        "url"         : "https://chromium.googlesource.com/libyuv/libyuv",
+        "deps_file"   : "DEPS",
+        "managed"     : True,
+        "custom_deps" : {
+           "libyuv/chromium/src/third_party/valgrind": "https://chromium.googlesource.com/chromium/deps/valgrind/binaries",
+        },
+        "safesync_url": "",
+      },
+    ]
+
+Then run:
+
+    GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=memcheck" python gyp_libyuv
+    ninja -C out/Debug
+    valgrind out/Debug/libyuv_unittest
+
+
+For more information, see http://www.chromium.org/developers/how-tos/using-valgrind
+
+### Running Thread Sanitizer (TSan)
+
+    GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=tsan" python gyp_libyuv
+    ninja -C out/Debug
+    valgrind out/Debug/libyuv_unittest
+
+For more info, see http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer
+
+### Running Address Sanitizer (ASan)
+
+    GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=asan" python gyp_libyuv
+    ninja -C out/Debug
+    valgrind out/Debug/libyuv_unittest
+
+For more info, see http://dev.chromium.org/developers/testing/addresssanitizer
+
+## Benchmarking
+
+The unittests can be used to benchmark.
+
+### Windows
+
+    set LIBYUV_WIDTH=1280
+    set LIBYUV_HEIGHT=720
+    set LIBYUV_REPEAT=999
+    set LIBYUV_FLAGS=-1
+    out\Release\libyuv_unittest.exe --gtest_filter=*I420ToARGB_Opt
+
+
+### Linux and Mac
+
+    LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=1000 out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt
+
+    libyuvTest.I420ToARGB_Opt (547 ms)
+
+Indicates 0.547 ms/frame for 1280 x 720.
+
+## Making a change
+
+    gclient sync
+    git checkout -b mycl -t origin/master
+    git pull
+    <edit files>
+    git add -u
+    git commit -m "my change"
+    git cl lint
+    git cl try
+    git cl upload -r a-reviewer@chromium.org -s
+    <once approved..>
+    git cl land
diff --git a/libs/libyuv/docs/rotation.md b/libs/libyuv/docs/rotation.md
new file mode 100644
index 0000000000..fb84fce5a9
--- /dev/null
+++ b/libs/libyuv/docs/rotation.md
@@ -0,0 +1,103 @@
+# Introduction
+
+Rotation by multiples of 90 degrees allows mobile devices to rotate webcams from landscape to portrait.  The higher level functions ConvertToI420 and ConvertToARGB allow rotation of any format.  Optimized functionality is supported for I420, ARGB, NV12 and NV21.
+
+# ConvertToI420
+
+    int ConvertToI420(const uint8* src_frame, size_t src_size,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int crop_x, int crop_y,
+                      int src_width, int src_height,
+                      int crop_width, int crop_height,
+                      enum RotationMode rotation,
+                      uint32 format);
+
+This function crops, converts, and rotates.  You should think of it in that order.
+  * Crops the original image, which is src_width x src_height, to crop_width x crop_height.  At this point the image is still not rotated.
+  * Converts the cropped region to I420.  Supports inverted source for src_height negative.
+  * Rotates by 90, 180 or 270 degrees.
+The buffer the caller provides should account for rotation.  It is especially important to get the stride of the destination correct.
+
+e.g.
+640 x 480 NV12 captured<br>
+Crop to 640 x 360<br>
+Rotate by 90 degrees to 360 x 640.<br>
+Caller passes stride of 360 for Y and 360 / 2 for U and V.<br>
+Caller passes crop_width of 640, crop_height of 360.<br>
+
+# ConvertToARGB
+
+    int ConvertToARGB(const uint8* src_frame, size_t src_size,
+                      uint8* dst_argb, int dst_stride_argb,
+                      int crop_x, int crop_y,
+                      int src_width, int src_height,
+                      int crop_width, int crop_height,
+                      enum RotationMode rotation,
+                      uint32 format);
+
+Same as I420, but implementation is less optimized - reads columns and writes rows, 16 bytes at a time.
+
+# I420Rotate
+
+    int I420Rotate(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int src_width, int src_height, enum RotationMode mode);
+
+Destination is rotated, so pass dst_stride_y etc that consider rotation.<br>
+Rotate by 180 can be done in place, but 90 and 270 can not.
+
+Implementation (Neon/SSE2) uses 8 x 8 block transpose, so best efficiency is with sizes and pointers that are aligned to 8.
+
+Cropping can be achieved by adjusting the src_y/u/v pointers and src_width, src_height.
+
+Lower level plane functions are provided, allowing other planar formats to be rotated.  (e.g. I444)
+
+For other planar YUV formats (I444, I422, I411, I400, NV16, NV24), the planar functions are exposed and can be called directly
+
+
+    // Rotate a plane by 0, 90, 180, or 270.
+    int RotatePlane(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int src_width, int src_height, enum RotationMode mode);
+
+# ARGBRotate
+
+    LIBYUV_API
+    int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int src_width, int src_height, enum RotationMode mode);
+
+Same as I420, but implementation is less optimized - reads columns and writes rows.
+
+Rotate by 90, or any angle, can be achieved using ARGBAffine.
+
+# Mirror - Horizontal Flip
+
+Mirror functions for horizontally flipping an image, which can be useful for 'self view' of a webcam.
+
+    int I420Mirror(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height);
+    int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height);
+
+Mirror functionality can also be achieved with the I420Scale and ARGBScale functions by passing negative width and/or height.
+
+# Invert - Vertical Flip
+
+Inverting can be achieved with almost any libyuv function by passing a negative source height.
+
+I420Mirror and ARGBMirror can also be used to rotate by 180 degrees by passing a negative height.
+
+
diff --git a/libs/libyuv/download_vs_toolchain.py b/libs/libyuv/download_vs_toolchain.py
new file mode 100644
index 0000000000..4b3457899f
--- /dev/null
+++ b/libs/libyuv/download_vs_toolchain.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This script is used to run the vs_toolchain.py script to download the
+# Visual Studio toolchain. It's just a temporary measure while waiting for the
+# Chrome team to move find_depot_tools into src/build to get rid of these
+# workarounds (similar one in gyp_libyuv).
+
+import os
+import sys
+
+
+checkout_root = os.path.dirname(os.path.realpath(__file__))
+sys.path.insert(0, os.path.join(checkout_root, 'build'))
+sys.path.insert(0, os.path.join(checkout_root, 'tools', 'find_depot_tools'))
+
+# Imported after the sys.path setup above (presumably found via build/).
+import vs_toolchain
+
+# When run as a script, exit with whatever vs_toolchain.main() returns.
+if __name__ == '__main__':
+  sys.exit(vs_toolchain.main())
diff --git a/libs/libyuv/gyp_libyuv b/libs/libyuv/gyp_libyuv
new file mode 100755
index 0000000000..645d3ad45c
--- /dev/null
+++ b/libs/libyuv/gyp_libyuv
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This script is used to run GYP for libyuv. It contains selected parts of the
+# main function from the src/build/gyp_chromium file.
+
+import glob
+import os
+import shlex
+import sys
+
+checkout_root = os.path.dirname(os.path.realpath(__file__))
+
+sys.path.insert(0, os.path.join(checkout_root, 'build'))
+import gyp_chromium
+import gyp_helper
+import vs_toolchain
+
+sys.path.insert(0, os.path.join(checkout_root, 'tools', 'gyp', 'pylib'))
+import gyp
+
+def GetSupplementalFiles():
+  """Returns the paths matching <checkout_root>/*/supplement.gypi, i.e. the
+  supplemental files that are included in all GYP sources."""
+  # Can't use the one in gyp_chromium since the directory location of the root
+  # is different.  NOTE(review): __main__ still calls gyp_chromium's; confirm.
+  return glob.glob(os.path.join(checkout_root, '*', 'supplement.gypi'))
+
+
+if __name__ == '__main__':
+  args = sys.argv[1:]
+
+  # This could give false positives since it doesn't actually do real option
+  # parsing.  Oh well.
+  gyp_file_specified = False
+  for arg in args:
+    if arg.endswith('.gyp'):
+      gyp_file_specified = True
+      break
+
+  # If we didn't get a file, assume 'all.gyp' in the root of the checkout.
+  if not gyp_file_specified:
+    # Because of a bug in gyp, simply adding the abspath to all.gyp doesn't
+    # work, but chdir'ing and adding the relative path does. Spooky :/
+    os.chdir(checkout_root)
+    args.append('all.gyp')
+
+  # .gyp files shouldn't have circular dependencies, so skip gyp's check.
+  args.append('--no-circular-check')
+
+  # Default to ninja unless GYP_GENERATORS is set.
+  if not os.environ.get('GYP_GENERATORS'):
+    os.environ['GYP_GENERATORS'] = 'ninja'
+
+  vs2013_runtime_dll_dirs = None
+  if int(os.environ.get('DEPOT_TOOLS_WIN_TOOLCHAIN', '1')):
+    vs2013_runtime_dll_dirs = vs_toolchain.SetEnvironmentAndGetRuntimeDllDirs()
+
+  # Enforce gyp syntax checking. This adds about 20% execution time.
+  args.append('--check')
+
+  supplemental_includes = gyp_chromium.GetSupplementalFiles()
+  gyp_vars_dict = gyp_chromium.GetGypVars(supplemental_includes)
+
+  # Automatically turn on crosscompile support for platforms that need it.
+  if all(('ninja' in os.environ.get('GYP_GENERATORS', ''),
+          gyp_vars_dict.get('OS') in ['android', 'ios'],
+          'GYP_CROSSCOMPILE' not in os.environ)):
+    os.environ['GYP_CROSSCOMPILE'] = '1'
+
+  args.extend(['-I' + i for i in
+               gyp_chromium.additional_include_files(supplemental_includes,
+                                                     args)])
+
+  # Set the gyp depth variable to the root of the checkout.
+  args.append('--depth=' + os.path.relpath(checkout_root))
+
+  print 'Updating projects from gyp files...'
+  sys.stdout.flush()
+
+  # Off we go...
+  gyp_rc = gyp.main(args)
+  # NOTE(review): the tuple is unpacked x64-first but passed on x86-first.
+  if vs2013_runtime_dll_dirs:
+    x64_runtime, x86_runtime = vs2013_runtime_dll_dirs
+    vs_toolchain.CopyVsRuntimeDlls(
+        os.path.join(checkout_root, gyp_chromium.GetOutputDirectory()),
+        (x86_runtime, x64_runtime))
+
+  sys.exit(gyp_rc)
diff --git a/libs/libyuv/gyp_libyuv.py b/libs/libyuv/gyp_libyuv.py
new file mode 100644
index 0000000000..ac42038df3
--- /dev/null
+++ b/libs/libyuv/gyp_libyuv.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+
+# This script is a modified copy of the src/build/gyp_chromium.py file. 
+# It is needed for parallel processing.
+
+# This file is (possibly, depending on python version) imported by
+# gyp_libyuv when GYP_PARALLEL=1 and it creates sub-processes
+# through the multiprocessing library.
+
+# Importing in Python 2.6 (fixed in 2.7) on Windows doesn't search for
+# imports that don't end in .py (and aren't directories with an
+# __init__.py). This wrapper makes "import gyp_libyuv" work with
+# those old versions and makes it possible to execute gyp_libyuv.py
+# directly on Windows where the extension is useful.
+
+import os
+# Execute the extensionless 'gyp_libyuv' script located next to this file.
+path = os.path.abspath(os.path.split(__file__)[0])
+execfile(os.path.join(path, 'gyp_libyuv'))
diff --git a/libs/libyuv/include/libyuv.h b/libs/libyuv/include/libyuv.h
new file mode 100644
index 0000000000..de652836e0
--- /dev/null
+++ b/libs/libyuv/include/libyuv.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_H_  // NOLINT
+#define INCLUDE_LIBYUV_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/mjpeg_decoder.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/scale.h"
+#include "libyuv/scale_argb.h"
+#include "libyuv/scale_row.h"
+#include "libyuv/version.h"
+#include "libyuv/video_common.h"
+
+#endif  // INCLUDE_LIBYUV_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/basic_types.h b/libs/libyuv/include/libyuv/basic_types.h
new file mode 100644
index 0000000000..beb750ba65
--- /dev/null
+++ b/libs/libyuv/include/libyuv/basic_types.h
@@ -0,0 +1,118 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_  // NOLINT
+#define INCLUDE_LIBYUV_BASIC_TYPES_H_
+
+#include <stddef.h>  // for NULL, size_t
+
+#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
+#include <sys/types.h>  // for uintptr_t on x86
+#else
+#include <stdint.h>  // for uintptr_t
+#endif
+
+#ifndef GG_LONGLONG
+#ifndef INT_TYPES_DEFINED
+#define INT_TYPES_DEFINED
+#ifdef COMPILER_MSVC
+typedef unsigned __int64 uint64;
+typedef __int64 int64;
+#ifndef INT64_C
+#define INT64_C(x) x ## I64
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UI64
+#endif
+#define INT64_F "I64"
+#else  // COMPILER_MSVC
+#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long uint64;  // NOLINT
+typedef long int64;  // NOLINT
+#ifndef INT64_C
+#define INT64_C(x) x ## L
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UL
+#endif
+#define INT64_F "l"
+#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long long uint64;  // NOLINT
+typedef long long int64;  // NOLINT
+#ifndef INT64_C
+#define INT64_C(x) x ## LL
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## ULL
+#endif
+#define INT64_F "ll"
+#endif  // __LP64__
+#endif  // COMPILER_MSVC
+typedef unsigned int uint32;
+typedef int int32;
+typedef unsigned short uint16;  // NOLINT
+typedef short int16;  // NOLINT
+typedef unsigned char uint8;
+typedef signed char int8;
+#endif  // INT_TYPES_DEFINED
+#endif  // GG_LONGLONG
+
+// Detect whether the compiler is targeting x86 or x64.
+#if defined(__x86_64__) || defined(_M_X64) || \
+    defined(__i386__) || defined(_M_IX86)
+#define CPU_X86 1
+#endif
+// Detect whether the compiler is targeting ARM.
+#if defined(__arm__) || defined(_M_ARM)
+#define CPU_ARM 1
+#endif
+
+#ifndef ALIGNP
+#ifdef __cplusplus
+#define ALIGNP(p, t) \
+    (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
+    ((t) - 1)) & ~((t) - 1))))
+#else
+#define ALIGNP(p, t) \
+    ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1))))  /* NOLINT */
+#endif
+#endif
+
+#if !defined(LIBYUV_API)
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllexport)
+#elif defined(LIBYUV_USING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllimport)
+#else
+#define LIBYUV_API
+#endif  // LIBYUV_BUILDING_SHARED_LIBRARY
+#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
+    (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
+    defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__ ((visibility ("default")))
+#else
+#define LIBYUV_API
+#endif  // __GNUC__
+#endif  // LIBYUV_API
+
+#define LIBYUV_BOOL int
+#define LIBYUV_FALSE 0
+#define LIBYUV_TRUE 1
+
+// x86/x64 or ARM (Visual C or GCC), or any GCC little endian target.
+#if defined(__x86_64__) || defined(_M_X64) || \
+  defined(__i386__) || defined(_M_IX86) || \
+  defined(__arm__) || defined(_M_ARM) || \
+  (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LIBYUV_LITTLE_ENDIAN
+#endif
+
+#endif  // INCLUDE_LIBYUV_BASIC_TYPES_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/compare.h b/libs/libyuv/include/libyuv/compare.h
new file mode 100644
index 0000000000..08b2bb2ecf
--- /dev/null
+++ b/libs/libyuv/include/libyuv/compare.h
@@ -0,0 +1,78 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_H_  // NOLINT
+#define INCLUDE_LIBYUV_COMPARE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Compute a hash for specified memory. Seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
+
+// Sum Square Error - used to compute Mean Square Error or PSNR.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a,
+                             const uint8* src_b, int count);
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+                                  const uint8* src_b, int stride_b,
+                                  int width, int height);
+
+static const int kMaxPsnr = 128;
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count);
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height);
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height);
+
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height);
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_COMPARE_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/compare_row.h b/libs/libyuv/include/libyuv/compare_row.h
new file mode 100644
index 0000000000..f5836da11d
--- /dev/null
+++ b/libs/libyuv/include/libyuv/compare_row.h
@@ -0,0 +1,78 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_COMPARE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+    defined(_M_IX86) && (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_HASHDJB2_AVX2
+#endif  // NOTE: the 32 bit AVX2 block further down defines this macro again.
+
+// The following are available for Visual C and GCC:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) || defined(_M_IX86)))
+#define HAS_HASHDJB2_SSE41
+#define HAS_SUMSQUAREERROR_SSE2
+#endif
+
+// The following are available for Visual C and clangcl 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_HASHDJB2_AVX2
+#define HAS_SUMSQUAREERROR_AVX2
+#endif
+
+// The following are available for Neon:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SUMSQUAREERROR_NEON
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_COMPARE_ROW_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/convert.h b/libs/libyuv/include/libyuv/convert.h
new file mode 100644
index 0000000000..a8d3fa07ac
--- /dev/null
+++ b/libs/libyuv/include/libyuv/convert.h
@@ -0,0 +1,245 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_H_  // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert I444 to I420.
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert I422 to I420.
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert I411 to I420.
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Copy I420 to I420.
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height);
+
+// Convert I400 (grey) to I420.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+#define J400ToJ420 I400ToI420
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert NV21 to I420.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
+int ARGBToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// BGRA little endian (argb in memory) to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// RGBA little endian (abgr in memory) to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// RGB little endian (bgr in memory) to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
+                uint8* dst_y, int dst_stride_y,
+                uint8* dst_u, int dst_stride_u,
+                uint8* dst_v, int dst_stride_v,
+                int width, int height);
+
+// RGB big endian (rgb in memory) to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_frame, int src_stride_frame,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
+                 uint8* dst_y, int dst_stride_y,
+                 uint8* dst_u, int dst_stride_u,
+                 uint8* dst_v, int dst_stride_v,
+                 int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height);
+
+// RGB12 (R444 fourcc) little endian to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture.
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToI420(const uint8* sample, size_t sample_size,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int src_width, int src_height,
+               int dst_width, int dst_height);
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+             int* width, int* height);
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_y" number of bytes in a row of the dst_y plane.
+//   Normally this would be the same as dst_width, with recommended alignment
+//   to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected. The caller should
+//   allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+//   Normally this would be the same as (dst_width + 1) / 2, with
+//   recommended alignment to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+//   To center, crop_x = (src_width - dst_width) / 2
+//              crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+//   "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+//    Must be less than or equal to src_width/src_height
+//    Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 on success; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToI420(const uint8* src_frame, size_t src_size,
+                  uint8* dst_y, int dst_stride_y,
+                  uint8* dst_u, int dst_stride_u,
+                  uint8* dst_v, int dst_stride_v,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 format);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CONVERT_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/convert_argb.h b/libs/libyuv/include/libyuv/convert_argb.h
new file mode 100644
index 0000000000..ce4e3d0751
--- /dev/null
+++ b/libs/libyuv/include/libyuv/convert_argb.h
@@ -0,0 +1,313 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+// TODO(fbarchard): This set of functions should exactly match convert.h
+// TODO(fbarchard): Add tests. Create random content of right size and convert
+// with C vs Opt and or to I420 and compare.
+// TODO(fbarchard): Some of these functions lack parameter setting.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Alias.
+#define ARGBToARGB ARGBCopy
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int width, int height);
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I420 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    const uint8* src_a, int src_stride_a,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height, int attenuate);
+
+// Convert I420 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    const uint8* src_a, int src_stride_a,
+                    uint8* dst_abgr, int dst_stride_abgr,
+                    int width, int height, int attenuate);
+
+// Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert J400 (jpeg grey) to ARGB.
+LIBYUV_API
+int J400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Alias.
+#define YToARGB I400ToARGB
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// BGRA little endian (argb in memory) to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// ABGR little endian (rgba in memory) to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// RGBA little endian (abgr in memory) to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Deprecated function name.
+#define BG24ToARGB RGB24ToARGB
+
+// RGB little endian (bgr in memory) to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height);
+
+// RGB big endian (rgb in memory) to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_frame, int src_stride_frame,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height);
+
+// RGB12 (R444 fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+               uint8* dst_argb, int dst_stride_argb,
+               int src_width, int src_height,
+               int dst_width, int dst_height);
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
+//   Normally this would be the same as dst_width * 4 (ARGB is 4 bytes per
+//   pixel), with recommended alignment to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected. The caller should
+//   allocate the ARGB buffer according to rotation.
+// Note: unlike ConvertToI420 (convert.h), there is no "dst_stride_u" or
+//   "dst_stride_v" here; the output is the single dst_argb plane, so
+//   dst_stride_argb is the only destination stride the caller provides.
+//   If rotation of 90 or 270 is used, dst_stride_argb is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+//   To center, crop_x = (src_width - dst_width) / 2
+//              crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+//   "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+//    Must be less than or equal to src_width/src_height
+//    Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 on success; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToARGB(const uint8* src_frame, size_t src_size,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 format);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CONVERT_ARGB_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/convert_from.h b/libs/libyuv/include/libyuv/convert_from.h
new file mode 100644
index 0000000000..9fd8d4de5f
--- /dev/null
+++ b/libs/libyuv/include/libyuv/convert_from.h
@@ -0,0 +1,181 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_  // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// See Also convert.h for conversions from formats to I420.
+
+// I420Copy in convert to I420ToI420.
+
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+             uint8* dst_y, int dst_stride_y,
+             int width, int height);
+
+// TODO(fbarchard): I420ToM420
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height);
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height);
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+                const uint8* src_u, int src_stride_u,
+                const uint8* src_v, int src_stride_v,
+                uint8* dst_frame, int dst_stride_frame,
+                int width, int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              uint8* dst_frame, int dst_stride_frame,
+              int width, int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_u, int src_stride_u,
+                 const uint8* src_v, int src_stride_v,
+                 uint8* dst_frame, int dst_stride_frame,
+                 int width, int height);
+
+// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint8* dst_frame, int dst_stride_frame,
+                       const uint8* dither4x4, int width, int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_frame, int dst_stride_frame,
+                   int width, int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_frame, int dst_stride_frame,
+                   int width, int height);
+
+// Convert I420 to specified format.
+// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
+//    buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+                    const uint8* u, int u_stride,
+                    const uint8* v, int v_stride,
+                    uint8* dst_sample, int dst_sample_stride,
+                    int width, int height,
+                    uint32 format);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/convert_from_argb.h b/libs/libyuv/include/libyuv/convert_from_argb.h
new file mode 100644
index 0000000000..1df53200dd
--- /dev/null
+++ b/libs/libyuv/include/libyuv/convert_from_argb.h
@@ -0,0 +1,190 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB to ARGB.
+#define ARGBToARGB ARGBCopy
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int width, int height);
+
+// Convert ARGB To BGRA.
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height);
+
+// Convert ARGB To ABGR.
+LIBYUV_API
+int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert ARGB To RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height);
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_rgb24, int dst_stride_rgb24,
+                int width, int height);
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_rgb, int dst_stride_rgb,
+              int width, int height);
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height);
+
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
+// const uint8(*dither)[4][4];
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height);
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb1555, int dst_stride_argb1555,
+                   int width, int height);
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb4444, int dst_stride_argb4444,
+                   int width, int height);
+
+// Convert ARGB To I444.
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I422.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I420. (also in convert.h)
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB to J422.
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I411.
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB to J400. (JPeg full range).
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               int width, int height);
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
+LIBYUV_API
+int ARGBToG(const uint8* src_argb, int src_stride_argb,
+            uint8* dst_g, int dst_stride_g,
+            int width, int height);
+
+// Convert ARGB To NV12.
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height);
+
+// Convert ARGB To NV21. NOTE(review): repeats the declaration just above.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height);
+
+// Convert ARGB To YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height);
+
+// Convert ARGB To UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/cpu_id.h b/libs/libyuv/include/libyuv/cpu_id.h
new file mode 100644
index 0000000000..2ccc3e7dd3
--- /dev/null
+++ b/libs/libyuv/include/libyuv/cpu_id.h
@@ -0,0 +1,80 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_  // NOLINT
+#define INCLUDE_LIBYUV_CPU_ID_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Internal flag to indicate cpuid requires initialization.
+static const int kCpuInitialized = 0x1;
+
+// These flags are only valid on ARM processors.
+static const int kCpuHasARM = 0x2;
+static const int kCpuHasNEON = 0x4;
+// 0x8 reserved for future ARM flag.
+
+// These flags are only valid on x86 processors.
+static const int kCpuHasX86 = 0x10;
+static const int kCpuHasSSE2 = 0x20;
+static const int kCpuHasSSSE3 = 0x40;
+static const int kCpuHasSSE41 = 0x80;
+static const int kCpuHasSSE42 = 0x100;
+static const int kCpuHasAVX = 0x200;
+static const int kCpuHasAVX2 = 0x400;
+static const int kCpuHasERMS = 0x800;
+static const int kCpuHasFMA3 = 0x1000;
+static const int kCpuHasAVX3 = 0x2000;
+// 0x4000, 0x8000 reserved for future X86 flags.
+
+// These flags are only valid on MIPS processors.
+static const int kCpuHasMIPS = 0x10000;
+static const int kCpuHasDSPR2 = 0x20000;
+
+// Internal function used to auto-init.
+LIBYUV_API
+int InitCpuFlags(void);
+
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name);
+
+// Detect CPU has SSE2 etc. Calls InitCpuFlags() when cpu_info_ is still zero.
+// test_flag parameter should be one of the kCpuHas constants above.
+// Returns non-zero if the instruction set is detected.
+static __inline int TestCpuFlag(int test_flag) {
+  LIBYUV_API extern int cpu_info_;
+  return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;
+}
+
+// For testing, allow CPU flags to be disabled.
+// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
+// MaskCpuFlags(-1) to enable all cpu specific optimizations.
+// MaskCpuFlags(0) to disable all cpu specific optimizations.
+LIBYUV_API
+void MaskCpuFlags(int enable_flags);
+
+// Low level cpuid for X86. Returns zeros on other CPUs.
+// eax is the info type that you want.
+// ecx is typically the cpu number, and should normally be zero.
+LIBYUV_API
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_CPU_ID_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/mjpeg_decoder.h b/libs/libyuv/include/libyuv/mjpeg_decoder.h
new file mode 100644
index 0000000000..8423121d11
--- /dev/null
+++ b/libs/libyuv/include/libyuv/mjpeg_decoder.h
@@ -0,0 +1,192 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_  // NOLINT
+#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+// NOTE: For a simplified public API use convert.h MJPGToI420().
+
+struct jpeg_common_struct;
+struct jpeg_decompress_struct;
+struct jpeg_source_mgr;
+
+namespace libyuv {
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+static const uint32 kUnknownDataSize = 0xFFFFFFFF;
+
+enum JpegSubsamplingType {
+  kJpegYuv420,
+  kJpegYuv422,
+  kJpegYuv411,
+  kJpegYuv444,
+  kJpegYuv400,
+  kJpegUnknown
+};
+
+struct Buffer {
+  const uint8* data;
+  int len;
+};
+
+struct BufferVector {
+  Buffer* buffers;
+  int len;
+  int pos;
+};
+
+struct SetJmpErrorMgr;
+
+// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
+// simply independent JPEG images with a fixed huffman table (which is omitted).
+// It is rarely used in video transmission, but is common as a camera capture
+// format, especially in Logitech devices. This class implements a decoder for
+// MJPEG frames.
+//
+// See http://tools.ietf.org/html/rfc2435
+class LIBYUV_API MJpegDecoder {
+ public:
+  typedef void (*CallbackFunction)(void* opaque,
+                                   const uint8* const* data,
+                                   const int* strides,
+                                   int rows);
+
+  static const int kColorSpaceUnknown;
+  static const int kColorSpaceGrayscale;
+  static const int kColorSpaceRgb;
+  static const int kColorSpaceYCbCr;
+  static const int kColorSpaceCMYK;
+  static const int kColorSpaceYCCK;
+
+  MJpegDecoder();
+  ~MJpegDecoder();
+
+  // Loads a new frame, reads its headers, and determines the uncompressed
+  // image format.
+  // Returns LIBYUV_TRUE if image looks valid and format is supported.
+  // If return value is LIBYUV_TRUE, then the values for all the following
+  // getters are populated.
+  // src_len is the size of the compressed mjpeg frame in bytes.
+  LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len);
+
+  // Returns width of the last loaded frame in pixels.
+  int GetWidth();
+
+  // Returns height of the last loaded frame in pixels.
+  int GetHeight();
+
+  // Returns format of the last loaded frame. The return value is one of the
+  // kColorSpace* constants.
+  int GetColorSpace();
+
+  // Number of color components in the color space.
+  int GetNumComponents();
+
+  // Sample factors of the n-th component.
+  int GetHorizSampFactor(int component);
+
+  int GetVertSampFactor(int component);
+
+  int GetHorizSubSampFactor(int component);
+
+  int GetVertSubSampFactor(int component);
+
+  // Public for testability.
+  int GetImageScanlinesPerImcuRow();
+
+  // Public for testability.
+  int GetComponentScanlinesPerImcuRow(int component);
+
+  // Width of a component in bytes.
+  int GetComponentWidth(int component);
+
+  // Height of a component.
+  int GetComponentHeight(int component);
+
+  // Width of a component in bytes with padding for DCTSIZE. Public for testing.
+  int GetComponentStride(int component);
+
+  // Size of a component in bytes.
+  int GetComponentSize(int component);
+
+  // Call this after LoadFrame() if you decide you don't want to decode it
+  // after all.
+  LIBYUV_BOOL UnloadFrame();
+
+  // Decodes the entire image into a one-buffer-per-color-component format.
+  // dst_width must match exactly. dst_height must be <= to image height; if
+  // less, the image is cropped. "planes" must have size equal to at least
+  // GetNumComponents() and they must point to non-overlapping buffers of size
+  // at least GetComponentSize(i). The pointers in planes are incremented
+  // to point to after the end of the written data.
+  // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+  LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+
+  // Decodes the entire image and passes the data via repeated calls to a
+  // callback function. Each call will get the data for a whole number of
+  // image scanlines.
+  // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+  LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
+                        int dst_width, int dst_height);
+
+  // The helper function which recognizes the jpeg sub-sampling type.
+  static JpegSubsamplingType JpegSubsamplingTypeHelper(
+     int* subsample_x, int* subsample_y, int number_of_components);
+
+ private:
+  void AllocOutputBuffers(int num_outbufs);
+  void DestroyOutputBuffers();
+
+  LIBYUV_BOOL StartDecode();
+  LIBYUV_BOOL FinishDecode();
+
+  void SetScanlinePointers(uint8** data);
+  LIBYUV_BOOL DecodeImcuRow();
+
+  int GetComponentScanlinePadding(int component);
+
+  // A buffer holding the input data for a frame.
+  Buffer buf_;
+  BufferVector buf_vec_;
+
+  jpeg_decompress_struct* decompress_struct_;
+  jpeg_source_mgr* source_mgr_;
+  SetJmpErrorMgr* error_mgr_;
+
+  // LIBYUV_TRUE iff at least one component has scanline padding. (i.e.,
+  // GetComponentScanlinePadding() != 0.)
+  LIBYUV_BOOL has_scanline_padding_;
+
+  // Temporaries used to point to scanline outputs.
+  int num_outbufs_;  // Outermost size of all arrays below.
+  uint8*** scanlines_;
+  int* scanlines_sizes_;
+  // Temporary buffer used for decoding when we can't decode directly to the
+  // output buffers. Large enough for just one iMCU row.
+  uint8** databuf_;
+  int* databuf_strides_;
+};
+
+}  // namespace libyuv
+
+#endif  //  __cplusplus
+#endif  // INCLUDE_LIBYUV_MJPEG_DECODER_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/planar_functions.h b/libs/libyuv/include/libyuv/planar_functions.h
new file mode 100644
index 0000000000..95870b9aea
--- /dev/null
+++ b/libs/libyuv/include/libyuv/planar_functions.h
@@ -0,0 +1,495 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  // NOLINT
+#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+
+#include "libyuv/basic_types.h"
+
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data.
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+                  uint16* dst_y, int dst_stride_y,
+                  int width, int height);
+
+// Set a plane of data to a 32 bit value.
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+              int width, int height,
+              uint32 value);
+
+// Copy I400.  Supports inverting.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+#define J400ToJ400 I400ToI400
+
+// Copy I422 to I422.
+#define I422ToI422 I422Copy
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height);
+
+// Copy I444 to I444.
+#define I444ToI444 I444Copy
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height);
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+// Convert I420 to I400. (calls CopyPlane ignoring u/v).
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Alias
+#define J420ToJ400 I420ToI400
+#define I420ToI420Mirror I420Mirror
+
+// I420 mirror.
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Alias
+#define I400ToI400Mirror I400Mirror
+
+// I400 mirror.  A single plane is mirrored horizontally.
+// Pass negative height to achieve 180 degree rotation.
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Alias
+#define ARGBToARGBMirror ARGBMirror
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_uv, int src_stride_uv,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height);
+
+// I422ToARGB is in convert_argb.h
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height);
+
+// Alias
+#define RGB24ToRAW RAWToRGB24
+
+LIBYUV_API
+int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
+               uint8* dst_rgb24, int dst_stride_rgb24,
+               int width, int height);
+
+// Draw a rectangle into I420.
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int x, int y, int width, int height,
+             int value_y, int value_u, int value_v);
+
+// Draw a rectangle into ARGB.
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+             int x, int y, int width, int height, uint32 value);
+
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+             int x, int y, int width, int height);
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+              int x, int y, int width, int height);
+
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The next 4 coefficients apply to B, G, R, A and produce R of the output.
+// The last 4 coefficients apply to B, G, R, A and produce A of the output.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    const int8* matrix_argb,
+                    int width, int height);
+
+// Deprecated. Use ARGBColorMatrix instead.
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_rgb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The last 4 coefficients apply to B, G, R, A and produce R of the output.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                   const int8* matrix_rgb,
+                   int x, int y, int width, int height);
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                   const uint8* table_argb,
+                   int x, int y, int width, int height);
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                  const uint8* table_argb,
+                  int x, int y, int width, int height);
+
+// Apply a luma/color table to each ARGB pixel but preserve destination alpha.
+// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from
+// RGB (YJ style) and C is an 8 bit color component (R, G or B).
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_argb, int dst_stride_argb,
+                       const uint8* luma_rgb_table,
+                       int width, int height);
+
+// Apply a 3 term polynomial to ARGB values.
+// poly points to a 4x4 matrix.  The first row is constants.  The 2nd row is
+// coefficients for b, g, r and a.  The 3rd row is coefficients for b squared,
+// g squared, r squared and a squared.  The 4th row is coefficients for b to
+// the 3, g to the 3, r to the 3 and a to the 3.  The values are summed and
+// result clamped to 0 to 255.
+// A polynomial approximation can be derived using software such as 'R'.
+
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   const float* poly,
+                   int width, int height);
+
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+                 int scale, int interval_size, int interval_offset,
+                 int x, int y, int width, int height);
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int width, int height);
+
+// Copy Alpha channel of ARGB to alpha of ARGB.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height);
+
+// Copy Y channel to Alpha of ARGB.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+                     uint8* dst_argb, int dst_stride_argb,
+                     int width, int height);
+
+typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
+                             uint8* dst_argb, int width);
+
+// Get the row function that alpha blends ARGB pixels and stores to destination.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend(void);  // (void): a true prototype when built as C.
+
+// Alpha Blend ARGB images and store to destination.
+// Source is pre-multiplied by alpha using ARGBAttenuate.
+// Alpha of destination is set to 255.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+              const uint8* src_argb1, int src_stride_argb1,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);
+
+// Alpha Blend plane and store to destination.
+// Source is not pre-multiplied by alpha.
+LIBYUV_API
+int BlendPlane(const uint8* src_y0, int src_stride_y0,
+               const uint8* src_y1, int src_stride_y1,
+               const uint8* alpha, int alpha_stride,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Alpha Blend YUV images and store to destination.
+// Source is not pre-multiplied by alpha.
+// Alpha is full width x height and subsampled to half size to apply to UV.
+LIBYUV_API
+int I420Blend(const uint8* src_y0, int src_stride_y0,
+              const uint8* src_u0, int src_stride_u0,
+              const uint8* src_v0, int src_stride_v0,
+              const uint8* src_y1, int src_stride_y1,
+              const uint8* src_u1, int src_stride_u1,
+              const uint8* src_v1, int src_stride_v1,
+              const uint8* alpha, int alpha_stride,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height);
+
+// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height);
+
+// Add ARGB image with ARGB image. Saturates to 255.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+            const uint8* src_argb1, int src_stride_argb1,
+            uint8* dst_argb, int dst_stride_argb,
+            int width, int height);
+
+// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height);
+
+// Convert I422 to YUY2.
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height);
+
+// Convert I422 to UYVY.
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height);
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height);
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height);
+
+// Internal function - do not call directly.
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+                             int32* dst_cumsum, int dst_stride32_cumsum,
+                             int width, int height);
+
+// Blur ARGB image.
+// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
+//   16 byte boundary.
+// dst_stride32_cumsum is number of ints in a row (width * 4).
+// radius is number of pixels around the center.  e.g. 1 = 3x3. 2=5x5.
+// Blur is optimized for radius of 5 (11x11) or less.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int32* dst_cumsum, int dst_stride32_cumsum,
+             int width, int height, int radius);
+
+// Multiply ARGB image by ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height, uint32 value);
+
+// Interpolate between two images using specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
+// and 255 means 1% src0 and 99% src1.
+LIBYUV_API
+int InterpolatePlane(const uint8* src0, int src_stride0,
+                     const uint8* src1, int src_stride1,
+                     uint8* dst, int dst_stride,
+                     int width, int height, int interpolation);
+
+// Interpolate between two ARGB images using specified amount of interpolation
+// Internally calls InterpolatePlane with width * 4 (bpp).
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+                    const uint8* src_argb1, int src_stride_argb1,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height, int interpolation);
+
+// Interpolate between two YUV images using specified amount of interpolation
+// Internally calls InterpolatePlane on each plane where the U and V planes
+// are half width and half height.
+LIBYUV_API
+int I420Interpolate(const uint8* src0_y, int src0_stride_y,
+                    const uint8* src0_u, int src0_stride_u,
+                    const uint8* src0_v, int src0_stride_v,
+                    const uint8* src1_y, int src1_stride_y,
+                    const uint8* src1_u, int src1_stride_u,
+                    const uint8* src1_v, int src1_stride_v,
+                    uint8* dst_y, int dst_stride_y,
+                    uint8* dst_u, int dst_stride_u,
+                    uint8* dst_v, int dst_stride_v,
+                    int width, int height, int interpolation);
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_ARGBAFFINEROW_SSE2
+#endif
+
+// Row function for copying pixels from a source with a slope to a row
+// of destination. Useful for scaling, rotation, mirror, texture mapping.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+                     uint8* dst_argb, const float* uv_dudv, int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width);
+
+// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
+// shuffler is 16 bytes and must be aligned.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+                uint8* dst_argb, int dst_stride_argb,
+                const uint8* shuffler, int width, int height);
+
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+                     uint8* dst_y, int dst_stride_y,
+                     int width, int height);
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);
+
+// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/rotate.h b/libs/libyuv/include/libyuv/rotate.h
new file mode 100644
index 0000000000..8af60b8955
--- /dev/null
+++ b/libs/libyuv/include/libyuv/rotate.h
@@ -0,0 +1,117 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported rotation modes. Each value is the clockwise angle in degrees.
+typedef enum RotationMode {
+  kRotate0 = 0,  // No rotation.
+  kRotate90 = 90,  // Rotate 90 degrees clockwise.
+  kRotate180 = 180,  // Rotate 180 degrees.
+  kRotate270 = 270,  // Rotate 270 degrees clockwise.
+
+  // Deprecated aliases for kRotate0, kRotate90 and kRotate270.
+  kRotateNone = 0,
+  kRotateClockwise = 90,
+  kRotateCounterClockwise = 270  // No trailing comma: valid in C89/C++03 too.
+} RotationModeEnum;
+
+// Rotate I420 frame.
+LIBYUV_API
+int I420Rotate(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int src_width, int src_height, enum RotationMode mode);
+
+// Rotate NV12 input and store in I420.
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+                     const uint8* src_uv, int src_stride_uv,
+                     uint8* dst_y, int dst_stride_y,
+                     uint8* dst_u, int dst_stride_u,
+                     uint8* dst_v, int dst_stride_v,
+                     int src_width, int src_height, enum RotationMode mode);
+
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+                uint8* dst, int dst_stride,
+                int src_width, int src_height, enum RotationMode mode);
+
+// Rotate planes by 90, 180, 270. Deprecated.
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride,
+                   int width, int height);
+
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height);
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height);
+
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+                uint8* dst_a, int dst_stride_a,
+                uint8* dst_b, int dst_stride_b,
+                int width, int height);
+
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them. Deprecated.
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height);
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height);
+
+// The 90 and 270 functions are based on transposes.
+// Doing a transpose with reversing the read/write
+// order will result in a rotation by +- 90 degrees.
+// Deprecated.
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height);
+
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROTATE_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/rotate_argb.h b/libs/libyuv/include/libyuv/rotate_argb.h
new file mode 100644
index 0000000000..660ff5573e
--- /dev/null
+++ b/libs/libyuv/include/libyuv/rotate_argb.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"  // For RotationMode.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Rotate ARGB frame
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int src_width, int src_height, enum RotationMode mode);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROTATE_ARGB_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/rotate_row.h b/libs/libyuv/include/libyuv/rotate_row.h
new file mode 100644
index 0000000000..d9f4d07928
--- /dev/null
+++ b/libs/libyuv/include/libyuv/rotate_row.h
@@ -0,0 +1,116 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+
+// The following are available for Visual C and clangcl 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+#define HAS_TRANSPOSEWX8_SSSE3
+#endif
+
+// The following are available for 64 bit GCC but not NaCL:
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+    defined(__x86_64__)
+#define HAS_TRANSPOSEWX8_FAST_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_TRANSPOSEWX8_NEON
+#define HAS_TRANSPOSEUVWX8_NEON
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+    defined(__mips__) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_TRANSPOSEWX8_DSPR2
+#define HAS_TRANSPOSEUVWX8_DSPR2
+#endif  // defined(__mips__)
+
+void TransposeWxH_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width, int height);
+
+void TransposeWx8_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width);
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width);
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width);
+void TransposeWx8_DSPR2(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width);
+
+void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
+                           uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
+                            uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
+                                 uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride,
+                            uint8* dst, int dst_stride, int width);
+
+void TransposeUVWxH_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b,
+                      int width, int height);
+
+void TransposeUVWx8_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
+                          uint8* dst_a, int dst_stride_a,
+                          uint8* dst_b, int dst_stride_b, int width);
+
+void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride,
+                             uint8* dst_a, int dst_stride_a,
+                             uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride,
+                             uint8* dst_a, int dst_stride_a,
+                             uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride,
+                              uint8* dst_a, int dst_stride_a,
+                              uint8* dst_b, int dst_stride_b, int width);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROTATE_ROW_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/row.h b/libs/libyuv/include/libyuv/row.h
new file mode 100644
index 0000000000..6011524487
--- /dev/null
+++ b/libs/libyuv/include/libyuv/row.h
@@ -0,0 +1,1923 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROW_H_
+
+#include <stdlib.h>  // For malloc.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
+
+#ifdef __cplusplus
+#define align_buffer_64(var, size)                                             \
+  uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63));            \
+  uint8* var = reinterpret_cast<uint8*>                                        \
+      ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63)
+#else
+#define align_buffer_64(var, size)                                             \
+  uint8* var##_mem = (uint8*)(malloc((size) + 63));               /* NOLINT */ \
+  uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63)       /* NOLINT */
+#endif
+
+#define free_aligned_buffer_64(var) \
+  free(var##_mem);  \
+  var = 0
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// True if compiling for SSSE3 as a requirement.
+#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
+#define LIBYUV_SSSE3_ONLY
+#endif
+
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
+#define LIBYUV_DISABLE_NEON
+#endif  // clang >= 3.5
+#endif  // __clang__
+
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif  // GNUC >= 4.7
+#endif  // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+// Conversions:
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_ARGB1555TOARGBROW_SSE2
+#define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBSETROW_X86
+#define HAS_ARGBSHUFFLEROW_SSE2
+#define HAS_ARGBSHUFFLEROW_SSSE3
+#define HAS_ARGBTOARGB1555ROW_SSE2
+#define HAS_ARGBTOARGB4444ROW_SSE2
+#define HAS_ARGBTORAWROW_SSSE3
+#define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORGB565DITHERROW_SSE2
+#define HAS_ARGBTORGB565ROW_SSE2
+#define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ARGBTOYJROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_COPYROW_ERMS
+#define HAS_COPYROW_SSE2
+#define HAS_H422TOARGBROW_SSSE3
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I422TOARGB1555ROW_SSSE3
+#define HAS_I422TOARGB4444ROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
+#define HAS_I422TORGB24ROW_SSSE3
+#define HAS_I422TORGB565ROW_SSSE3
+#define HAS_I422TORGBAROW_SSSE3
+#define HAS_I422TOUYVYROW_SSE2
+#define HAS_I422TOYUY2ROW_SSE2
+#define HAS_I444TOARGBROW_SSSE3
+#define HAS_J400TOARGBROW_SSE2
+#define HAS_J422TOARGBROW_SSSE3
+#define HAS_MERGEUVROW_SSE2
+#define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORUVROW_SSSE3
+#define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV12TORGB565ROW_SSSE3
+#define HAS_NV21TOARGBROW_SSSE3
+#define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RAWTORGB24ROW_SSSE3
+#define HAS_RAWTOYROW_SSSE3
+#define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RGB565TOARGBROW_SSE2
+#define HAS_RGBATOUVROW_SSSE3
+#define HAS_RGBATOYROW_SSSE3
+#define HAS_SETROW_ERMS
+#define HAS_SETROW_X86
+#define HAS_SPLITUVROW_SSE2
+#define HAS_UYVYTOARGBROW_SSSE3
+#define HAS_UYVYTOUV422ROW_SSE2
+#define HAS_UYVYTOUVROW_SSE2
+#define HAS_UYVYTOYROW_SSE2
+#define HAS_YUY2TOARGBROW_SSSE3
+#define HAS_YUY2TOUV422ROW_SSE2
+#define HAS_YUY2TOUVROW_SSE2
+#define HAS_YUY2TOYROW_SSE2
+
+// Effects:
+#define HAS_ARGBADDROW_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYYTOALPHAROW_SSE2
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSE2
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBPOLYNOMIALROW_SSE2
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_BLENDPLANEROW_SSSE3
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_RGBCOLORTABLEROW_X86
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELTOPLANEROW_SSE2
+#define HAS_SOBELXROW_SSE2
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSE2
+
+// The following functions fail on gcc/clang 32 bit with fpic and framepointer.
+// caveat: clangcl uses row_win.cc which works.
+#if defined(NDEBUG) || !(defined(_DEBUG) && defined(__i386__)) || \
+    !defined(__i386__) || defined(_MSC_VER)
+// TODO(fbarchard): fix build error on x86 debug
+// https://code.google.com/p/libyuv/issues/detail?id=524
+#define HAS_I411TOARGBROW_SSSE3
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_SSSE3
+#endif
+#endif
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
+    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_ARGBCOPYALPHAROW_AVX2
+#define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBPOLYNOMIALROW_AVX2
+#define HAS_ARGBSHUFFLEROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_AVX2
+#define HAS_ARGBTOUVJROW_AVX2
+#define HAS_ARGBTOUVROW_AVX2
+#define HAS_ARGBTOYJROW_AVX2
+#define HAS_ARGBTOYROW_AVX2
+#define HAS_COPYROW_AVX
+#define HAS_H422TOARGBROW_AVX2
+#define HAS_I400TOARGBROW_AVX2
+#if !(defined(_DEBUG) && defined(__i386__))
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_AVX2
+#endif
+#define HAS_I411TOARGBROW_AVX2
+#define HAS_I422TOARGB1555ROW_AVX2
+#define HAS_I422TOARGB4444ROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TORGB24ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
+#define HAS_I422TORGBAROW_AVX2
+#define HAS_I444TOARGBROW_AVX2
+#define HAS_INTERPOLATEROW_AVX2
+#define HAS_J422TOARGBROW_AVX2
+#define HAS_MERGEUVROW_AVX2
+#define HAS_MIRRORROW_AVX2
+#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
+#define HAS_SPLITUVROW_AVX2
+#define HAS_UYVYTOARGBROW_AVX2
+#define HAS_UYVYTOUV422ROW_AVX2
+#define HAS_UYVYTOUVROW_AVX2
+#define HAS_UYVYTOYROW_AVX2
+#define HAS_YUY2TOARGBROW_AVX2
+#define HAS_YUY2TOUV422ROW_AVX2
+#define HAS_YUY2TOUVROW_AVX2
+#define HAS_YUY2TOYROW_AVX2
+
+// Effects:
+#define HAS_ARGBADDROW_AVX2
+#define HAS_ARGBATTENUATEROW_AVX2
+#define HAS_ARGBMULTIPLYROW_AVX2
+#define HAS_ARGBSUBTRACTROW_AVX2
+#define HAS_ARGBUNATTENUATEROW_AVX2
+#define HAS_BLENDPLANEROW_AVX2
+#endif
+
+// The following are available for AVX2 Visual C and clangcl 32 bit:
+// TODO(fbarchard): Port to gcc.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_ARGB1555TOARGBROW_AVX2
+#define HAS_ARGB4444TOARGBROW_AVX2
+#define HAS_ARGBTOARGB1555ROW_AVX2
+#define HAS_ARGBTOARGB4444ROW_AVX2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_J400TOARGBROW_AVX2
+#define HAS_RGB565TOARGBROW_AVX2
+#endif
+
+// The following are also available on x64 Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
+    (!defined(__clang__) || defined(__SSSE3__))
+#define HAS_I422ALPHATOARGBROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
+#endif
+
+// The following are available on Neon platforms (unless LIBYUV_DISABLE_NEON):
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_ABGRTOUVROW_NEON
+#define HAS_ABGRTOYROW_NEON
+#define HAS_ARGB1555TOARGBROW_NEON
+#define HAS_ARGB1555TOUVROW_NEON
+#define HAS_ARGB1555TOYROW_NEON
+#define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGB4444TOUVROW_NEON
+#define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBSETROW_NEON
+#define HAS_ARGBTOARGB1555ROW_NEON
+#define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGB565DITHERROW_NEON
+#define HAS_ARGBTORGB565ROW_NEON
+#define HAS_ARGBTOUV411ROW_NEON
+#define HAS_ARGBTOUV444ROW_NEON
+#define HAS_ARGBTOUVJROW_NEON
+#define HAS_ARGBTOUVROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_BGRATOUVROW_NEON
+#define HAS_BGRATOYROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_I400TOARGBROW_NEON
+#define HAS_I411TOARGBROW_NEON
+#define HAS_I422ALPHATOARGBROW_NEON
+#define HAS_I422TOARGB1555ROW_NEON
+#define HAS_I422TOARGB4444ROW_NEON
+#define HAS_I422TOARGBROW_NEON
+#define HAS_I422TORGB24ROW_NEON
+#define HAS_I422TORGB565ROW_NEON
+#define HAS_I422TORGBAROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I444TOARGBROW_NEON
+#define HAS_J400TOARGBROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV12TORGB565ROW_NEON
+#define HAS_NV21TOARGBROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+#define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYROW_NEON
+#define HAS_RGB565TOARGBROW_NEON
+#define HAS_RGB565TOUVROW_NEON
+#define HAS_RGB565TOYROW_NEON
+#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_UYVYTOARGBROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOARGBROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_YUY2TOYROW_NEON
+
+// Effects:
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
+#define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBGRAYROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBQUANTIZEROW_NEON
+#define HAS_ARGBSEPIAROW_NEON
+#define HAS_ARGBSHADEROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_INTERPOLATEROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELYROW_NEON
+#endif  // !LIBYUV_DISABLE_NEON && (__aarch64__ || __ARM_NEON__ || LIBYUV_NEON)
+
+// The following are available on MIPS platforms (unless LIBYUV_DISABLE_MIPS):
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+#define HAS_COPYROW_MIPS
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)  // DSPR2 variants need rev >= 2
+#define HAS_I422TOARGBROW_DSPR2
+#define HAS_INTERPOLATEROW_DSPR2
+#define HAS_MIRRORROW_DSPR2
+#define HAS_MIRRORUVROW_DSPR2
+#define HAS_SPLITUVROW_DSPR2
+#endif  // __mips_dsp && __mips_dsp_rev >= 2
+#endif  // !LIBYUV_DISABLE_MIPS && __mips__ && _MIPS_SIM_ABI32 && __mips_isa_rev < 6
+
+#if defined(_MSC_VER) && !defined(__CLR_VER)  // MSVC: aligned array typedefs
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#define SIMD_ALIGNED32(var) __declspec(align(64)) var  // NOTE(review): aligns to 64, not 32 -- confirm
+typedef __declspec(align(16)) int16 vec16[8];
+typedef __declspec(align(16)) int32 vec32[4];
+typedef __declspec(align(16)) int8 vec8[16];
+typedef __declspec(align(16)) uint16 uvec16[8];
+typedef __declspec(align(16)) uint32 uvec32[4];
+typedef __declspec(align(16)) uint8 uvec8[16];
+typedef __declspec(align(32)) int16 lvec16[16];
+typedef __declspec(align(32)) int32 lvec32[8];
+typedef __declspec(align(32)) int8 lvec8[32];
+typedef __declspec(align(32)) uint16 ulvec16[16];
+typedef __declspec(align(32)) uint32 ulvec32[8];
+typedef __declspec(align(32)) uint8 ulvec8[32];
+#elif defined(__GNUC__) && !defined(__pnacl__)  // GNU C: vector_size typedefs
+// Caveat: GCC 4.2 to 4.7 have a known issue using vectors with const.
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#define SIMD_ALIGNED32(var) var __attribute__((aligned(64)))  // NOTE(review): aligns to 64, not 32 -- confirm
+typedef int16 __attribute__((vector_size(16))) vec16;
+typedef int32 __attribute__((vector_size(16))) vec32;
+typedef int8 __attribute__((vector_size(16))) vec8;
+typedef uint16 __attribute__((vector_size(16))) uvec16;
+typedef uint32 __attribute__((vector_size(16))) uvec32;
+typedef uint8 __attribute__((vector_size(16))) uvec8;
+typedef int16 __attribute__((vector_size(32))) lvec16;
+typedef int32 __attribute__((vector_size(32))) lvec32;
+typedef int8 __attribute__((vector_size(32))) lvec8;
+typedef uint16 __attribute__((vector_size(32))) ulvec16;
+typedef uint32 __attribute__((vector_size(32))) ulvec32;
+typedef uint8 __attribute__((vector_size(32))) ulvec8;
+#else  // Fallback: plain arrays; SIMD_ALIGNED* add no alignment attribute
+#define SIMD_ALIGNED(var) var
+#define SIMD_ALIGNED32(var) var
+typedef int16 vec16[8];
+typedef int32 vec32[4];
+typedef int8 vec8[16];
+typedef uint16 uvec16[8];
+typedef uint32 uvec32[4];
+typedef uint8 uvec8[16];
+typedef int16 lvec16[16];
+typedef int32 lvec32[8];
+typedef int8 lvec8[32];
+typedef uint16 ulvec16[16];
+typedef uint32 ulvec32[8];
+typedef uint8 ulvec8[32];
+#endif  // SIMD_ALIGNED / vec* / lvec* definitions
+
+#if defined(__aarch64__)
+// This struct is for Arm64 color conversion.
+struct YuvConstants {
+  uvec16 kUVToRB;
+  uvec16 kUVToRB2;
+  uvec16 kUVToG;
+  uvec16 kUVToG2;
+  vec16 kUVBiasBGR;
+  vec32 kYToRgb;
+};
+#elif defined(__arm__)
+// This struct is for ArmV7 color conversion.
+struct YuvConstants {
+  uvec8 kUVToRB;
+  uvec8 kUVToG;
+  vec16 kUVBiasBGR;
+  vec32 kYToRgb;
+};
+#else
+// This struct is for Intel color conversion (also used by any other non-Arm target).
+struct YuvConstants {
+  lvec8 kUVToB;
+  lvec8 kUVToG;
+  lvec8 kUVToR;
+  lvec16 kUVBiasB;
+  lvec16 kUVBiasG;
+  lvec16 kUVBiasR;
+  lvec16 kYToRgb;
+};
+
+// Offsets into the YuvConstants structure above, one per member in declaration order.
+#define KUVTOB   0
+#define KUVTOG   32
+#define KUVTOR   64
+#define KUVBIASB 96
+#define KUVBIASG 128
+#define KUVBIASR 160
+#define KYTORGB  192
+#endif  // __aarch64__ / __arm__ / other
+
+// Conversion matrices for YUV to RGB (defined elsewhere)
+extern const struct YuvConstants kYuvI601Constants;  // BT.601
+extern const struct YuvConstants kYuvJPEGConstants;  // JPEG color space
+extern const struct YuvConstants kYuvH709Constants;  // BT.709
+
+// Conversion matrices for YVU to BGR (defined elsewhere)
+extern const struct YuvConstants kYvuI601Constants;  // BT.601
+extern const struct YuvConstants kYvuJPEGConstants;  // JPEG color space
+extern const struct YuvConstants kYvuH709Constants;  // BT.709
+
+#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
+#define OMITFP
+#else  // Elsewhere OMITFP expands to the optimize("omit-frame-pointer") attribute
+#define OMITFP __attribute__((optimize("omit-frame-pointer")))
+#endif  // OMITFP
+
+// NaCl macros for GCC x86 and x64.
+#if defined(__native_client__)
+#define LABELALIGN ".p2align 5\n"
+#else  // Not NaCl: LABELALIGN expands to nothing
+#define LABELALIGN
+#endif  // __native_client__
+#if defined(__native_client__) && defined(__x86_64__)
+// r14 is used by the MEMOP* macros below (each one does a lea into %%r14d).
+#define NACL_R14 "r14",
+#define BUNDLELOCK ".bundle_lock\n"
+#define BUNDLEUNLOCK ".bundle_unlock\n"
+#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
+#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
+#define MEMLEA(offset, base) #offset "(%q" #base ")"
+#define MEMLEA3(offset, index, scale) \
+    #offset "(,%q" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+    #offset "(%q" #base ",%q" #index "," #scale ")"
+#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
+#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%%" #reg "\n" \
+    BUNDLEUNLOCK
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " %%" #reg ",(%%r15,%%r14)\n" \
+    BUNDLEUNLOCK
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%" #arg "\n" \
+    BUNDLEUNLOCK
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \
+    BUNDLEUNLOCK
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \
+    BUNDLEUNLOCK
+#else  // defined(__native_client__) && defined(__x86_64__)
+#define NACL_R14
+#define BUNDLEALIGN  // NOTE(review): NaCl branch defines BUNDLELOCK/BUNDLEUNLOCK instead -- confirm still used
+#define MEMACCESS(base) "(%" #base ")"
+#define MEMACCESS2(offset, base) #offset "(%" #base ")"
+#define MEMLEA(offset, base) #offset "(%" #base ")"
+#define MEMLEA3(offset, index, scale) \
+    #offset "(,%" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+    #offset "(%" #base ",%" #index "," #scale ")"
+#define MEMMOVESTRING(s, d)
+#define MEMSTORESTRING(reg, d)
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+    #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \
+    #reg2 "\n"
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+    #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+#endif  // defined(__native_client__) && defined(__x86_64__)
+
+#if defined(__arm__) || defined(__aarch64__)  // Arm: replace the MEMACCESS defined above
+#undef MEMACCESS
+#if defined(__native_client__)
+#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
+#else  // Not NaCl: MEMACCESS expands to nothing
+#define MEMACCESS(base)
+#endif  // __native_client__
+#endif  // __arm__ || __aarch64__
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422AlphaToARGBRow_NEON(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             const uint8* a_buf,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGBRow_NEON(const uint8* src_y,  // NOTE: duplicate prototype (declared above)
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I411ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToRGBARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_rgba,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToRGB24Row_NEON(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV12ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_vu,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int width);
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int width);
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+                      uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int width);
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int width);
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+                          uint8* dst_u, uint8* dst_v, int width);
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width);
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width);
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width);
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width);
+void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int width);
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width);
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width);
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width);
+void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
+void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
+void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
+void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
+void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width);
+void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int width);
+void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width);
+void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
+void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width);
+void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
+void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y,
+                             int width);
+void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y,
+                             int width);
+
+void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_AVX2(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+                       uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+                            uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+                           uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                             int width);
+void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                             int width);
+void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+                           uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
+                          uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
+                          uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                           uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
+                         uint8* dst_u, uint8* dst_v, int width);
+void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                            uint8* dst_u, uint8* dst_v, int width);
+void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
+                              int src_stride_argb1555,
+                              uint8* dst_u, uint8* dst_v, int width);
+void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
+                              int src_stride_argb4444,
+                              uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
+                   uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
+                   uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
+                   uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
+                    uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
+                  uint8* dst_u, uint8* dst_v, int width);
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+                     uint8* dst_u, uint8* dst_v, int width);
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+                       uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_SSSE3(const uint8* src_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
+                              uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV411Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width);
+void MirrorRow_C(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+
+void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                       int width);
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width);
+void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                       int width);
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width);
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width);
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width);
+void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width);
+void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                         int width);
+void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                         int width);
+void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                         int width);
+void SplitUVRow_Any_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                          int width);
+
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width);
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width);
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width);
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width);
+void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                         int width);
+void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                         int width);
+void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                         int width);
+
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
+void CopyRow_NEON(const uint8* src, uint8* dst, int count);
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
+void CopyRow_C(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
+
+void CopyRow_16_C(const uint16* src, uint16* dst, int count);
+
+void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+
+void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, uint8* dst_argb,
+                                  int width);
+void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, uint8* dst_argb,
+                                  int width);
+
+void SetRow_C(uint8* dst, uint8 v8, int count);
+void SetRow_X86(uint8* dst, uint8 v8, int count);
+void SetRow_ERMS(uint8* dst, uint8 v8, int count);
+void SetRow_NEON(uint8* dst, uint8 v8, int count);
+void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
+void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
+
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
+
+// ARGBShufflers for BGRAToARGB etc.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+                      const uint8* shuffler, int width);
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width);
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int width);
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width);
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width);
+void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int width);
+void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const uint8* shuffler, int width);
+void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int width);
+void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int width);
+
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                            int width);
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+                            int width);
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                            int width);
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                            int width);
+// NEON, C and _Any_ variants below use the same (source, dest, width) shape.
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                            int width);
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                            int width);
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width);
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width);
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, int width);
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb, int width);
+void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb,
+                              int width);
+void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
+
+void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+                              int width);
+void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                                int width);
+void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+                                int width);
+void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+                              int width);
+void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                                int width);
+void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                                int width);
+
+void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb,
+                             int width);
+void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width);
+void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
+                              int width);
+void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                                int width);
+void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                                int width);
+// ARGBTo<format>Row variants: (src_argb, dst_rgb, width); Dither adds dither4.
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+                             const uint32 dither4, int width);
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width);
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width);
+
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width);
+
+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+// J400ToARGBRow variants, all declared as (src_y, dst_argb, width).
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+
+// YUV to RGB row functions, _C variants.
+// Parameter order is always: source pointer(s), destination pointer, a const
+// struct YuvConstants*, then width. Sources are three pointers for Y, U and V
+// (I422Alpha adds a_buf), src_y plus src_uv or src_vu (NV12/NV21), or a single
+// src_yuy2 or src_uyvy pointer.
+// NOTE(review): width is presumably in pixels - confirm in the definitions.
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void I422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void I422AlphaToARGBRow_C(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          const uint8* a_buf,
+                          uint8* dst_argb,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void I411ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void NV12ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_uv,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void NV12ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_uv,
+                       uint8* dst_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void NV21ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_vu,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+                     uint8* dst_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void I422ToRGBARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_rgba,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void I422ToRGB24Row_C(const uint8* src_y,
+                      const uint8* src_u,
+                      const uint8* src_v,
+                      uint8* dst_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width);
+void I422ToARGB4444Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb4444,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToARGB1555Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb1555,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+// YUV to RGB row functions, _SSSE3 and _AVX2 variants.
+// Parameter order matches the _C declarations: source pointer(s), destination
+// pointer, const struct YuvConstants*, width.
+// NOTE(review): how a variant is picked at run time is not visible in these
+// declarations; presumably the callers test CPU features - confirm.
+// I422 and I444 sources (src_y, src_u, src_v) to ARGB or RGBA:
+void I422ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToRGBARow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_rgba,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I444ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I444ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+
+// I422Alpha: same as I422ToARGBRow plus a fourth source pointer, a_buf,
+// placed after the three YUV source pointers.
+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              const uint8* a_buf,
+                              uint8* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             const uint8* a_buf,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+
+// I411ToARGBRow: three source pointers (src_y, src_u, src_v), as for I422
+// and I444 above.
+void I411ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I411ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+
+// NV12/NV21: two source pointers, src_y plus src_uv (NV12) or src_vu (NV21).
+// NV12ToRGB565Row takes dst_rgb565 as its destination instead of dst_argb.
+void NV12ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_uv,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV12ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_uv,
+                           uint8* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void NV12ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void NV21ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_vu,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV21ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_vu,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+
+// YUY2/UYVY: a single source pointer (src_yuy2 or src_uyvy), no separate
+// U/V pointers.
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+
+// I422 to destination formats other than ARGB; each destination parameter is
+// named after the format in the function name.
+void I422ToRGBARow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgba,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+
+// ARGB4444, ARGB1555, RGB565 and RGB24 destinations, SSSE3 and AVX2 each.
+// NOTE(review): bytes per output pixel are not shown here - see definitions.
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb4444,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb1555,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void I422ToRGB24Row_SSSE3(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToARGBRow_Any_AVX2(const uint8* src_y,  // _Any_ SSSE3/AVX2 variants
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGBARow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_rgba,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I444ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  const uint8* a_buf,
+                                  uint8* dst_argb,
+                                  const struct YuvConstants* yuvconstants,
+                                  int width);
+void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 const uint8* a_buf,
+                                 uint8* dst_argb,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I411ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_uv,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_uv,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_vu,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_vu,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
+                               const uint8* src_uv,
+                               uint8* dst_rgb565,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
+                              const uint8* src_uv,
+                              uint8* dst_rgb565,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_rgba,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
+                                 const uint8* src_u,
+                                 const uint8* src_v,
+                                 uint8* dst_argb4444,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_argb4444,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
+                                 const uint8* src_u,
+                                 const uint8* src_v,
+                                 uint8* dst_argb1555,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_argb1555,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
+                               const uint8* src_u,
+                               const uint8* src_v,
+                               uint8* dst_rgb565,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_rgb565,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_rgb24,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_rgb24,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+// I400ToARGBRow variants, all declared as (src_y, dst_argb, width).
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+
+// ARGB preattenuated alpha blend of src_argb and src_argb1 into dst_argb.
+void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
+                        uint8* dst_argb, int width);
+void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
+                    uint8* dst_argb, int width);
+
+// Unattenuated planar alpha blend of src0 and src1, using alpha, into dst.
+void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
+                         const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1,
+                             const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
+                        const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1,
+                            const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
+                     const uint8* alpha, uint8* dst, int width);
+
+// ARGB multiply images. Same parameter list as ARGBBlendRow, but these require
+// pointer and width alignment for SSE2.
+void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+
+// ARGB add images: src_argb and src_argb1 are the inputs, dst_argb the output.
+void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
+                  uint8* dst_argb, int width);
+void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+
+// ARGB subtract images. Same parameter list as ARGBBlendRow, but these require
+// pointer and width alignment for SSE2.
+void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+// _Any_ variants of the ARGBTo<format>Row functions; same parameter lists.
+void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                int width);
+void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                int width);
+
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int width);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int width);
+
+void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                int width);
+void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                int width);
+
+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                int width);
+void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                int width);
+void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int width);
+// _Any_NEON variants of the YUV to RGB row functions.
+void I444ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422AlphaToARGBRow_Any_NEON(const uint8* src_y,
+                                 const uint8* src_u,
+                                 const uint8* src_v,
+                                 const uint8* src_a,
+                                 uint8* dst_argb,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void I411ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGBARow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_rgba,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB24Row_Any_NEON(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_rgb24,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_argb4444,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_argb1555,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToRGB565Row_Any_NEON(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_rgb565,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void NV12ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_uv,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV21ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_vu,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
+                              const uint8* src_uv,
+                              uint8* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
+                            uint8* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToARGBRow_DSPR2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToARGBRow_DSPR2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
+                   uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
+                             uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
+                             uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
+                             uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width);
+
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
+                   uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int width);
+
+void I422ToYUY2Row_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_yuy2, int width);
+void I422ToUYVYRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_uyvy, int width);
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width);
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_uyvy, int width);
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width);
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_uyvy, int width);
+
+// Effects related row functions.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                int width);
+void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+
+// Inverse table for unattenuate, shared by C and SSE2.
+extern const uint32 fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                                 int width);
+void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                                 int width);
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBSepiaRow_C(uint8* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
+
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+                          const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width);
+
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+                       int interval_offset, int width);
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width);
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width);
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                    uint32 value);
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value);
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value);
+
+// Used for blur.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+                                    int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+                                  const int32* previous_cumsum, int width);
+
+void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
+                                 int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+                               const int32* previous_cumsum, int width);
+
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+                     uint8* dst_argb, const float* uv_dudv, int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width);
+
+// Used for I420Scale, ARGBScale, and ARGBInterpolate.
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+                      ptrdiff_t src_stride_ptr,
+                      int width, int source_y_fraction);
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride_ptr, int width,
+                          int source_y_fraction);
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride_ptr, int width,
+                         int source_y_fraction);
+void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride_ptr, int width,
+                         int source_y_fraction);
+void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride_ptr, int width,
+                          int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                             ptrdiff_t src_stride_ptr, int width,
+                             int source_y_fraction);
+void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                              ptrdiff_t src_stride_ptr, int width,
+                              int source_y_fraction);
+void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                             ptrdiff_t src_stride_ptr, int width,
+                             int source_y_fraction);
+void InterpolateRow_Any_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                              ptrdiff_t src_stride_ptr, int width,
+                              int source_y_fraction);
+
+void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                         ptrdiff_t src_stride_ptr,
+                         int width, int source_y_fraction);
+
+// Sobel images.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+                 uint8* dst_sobelx, int width);
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+                 uint8* dst_sobely, int width);
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width);
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width);
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                uint8* dst_argb, int width);
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width);
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width);
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_y, int width);
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width);
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width);
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                  uint8* dst_argb, int width);
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width);
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width);
+void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width);
+void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width);
+void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                              uint8* dst_y, int width);
+void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                              uint8* dst_y, int width);
+void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                         uint8* dst_argb, int width);
+void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                         uint8* dst_argb, int width);
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+                         uint8* dst_argb, const float* poly,
+                         int width);
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width);
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width);
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                             const uint8* luma, uint32 lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                 int width,
+                                 const uint8* luma, uint32 lumacoeff);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROW_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/scale.h b/libs/libyuv/include/libyuv/scale.h
new file mode 100644
index 0000000000..102158d1ab
--- /dev/null
+++ b/libs/libyuv/include/libyuv/scale.h
@@ -0,0 +1,103 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_H_  // NOLINT
+#define INCLUDE_LIBYUV_SCALE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported filtering.
+typedef enum FilterMode {
+  kFilterNone = 0,  // Point sample; Fastest.
+  kFilterLinear = 1,  // Filter horizontally only.
+  kFilterBilinear = 2,  // Faster than box, but lower quality scaling down.
+  kFilterBox = 3  // Highest quality.
+} FilterModeEnum;
+
+// Scale a YUV plane.
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+                int src_width, int src_height,
+                uint8* dst, int dst_stride,
+                int dst_width, int dst_height,
+                enum FilterMode filtering);
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+                   int src_width, int src_height,
+                   uint16* dst, int dst_stride,
+                   int dst_width, int dst_height,
+                   enum FilterMode filtering);
+
+// Scales a YUV 4:2:0 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I420Scale(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              int src_width, int src_height,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int dst_width, int dst_height,
+              enum FilterMode filtering);
+
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+                 const uint16* src_u, int src_stride_u,
+                 const uint16* src_v, int src_stride_v,
+                 int src_width, int src_height,
+                 uint16* dst_y, int dst_stride_y,
+                 uint16* dst_u, int dst_stride_u,
+                 uint16* dst_v, int dst_stride_v,
+                 int dst_width, int dst_height,
+                 enum FilterMode filtering);
+
+#ifdef __cplusplus
+// Legacy API.  Deprecated.
+LIBYUV_API
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+          int src_stride_y, int src_stride_u, int src_stride_v,
+          int src_width, int src_height,
+          uint8* dst_y, uint8* dst_u, uint8* dst_v,
+          int dst_stride_y, int dst_stride_u, int dst_stride_v,
+          int dst_width, int dst_height,
+          LIBYUV_BOOL interpolate);
+
+// Legacy API.  Deprecated.
+LIBYUV_API
+int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
+                uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
+                LIBYUV_BOOL interpolate);
+
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
+void SetUseReferenceImpl(LIBYUV_BOOL use);
+#endif  // __cplusplus
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/scale_argb.h b/libs/libyuv/include/libyuv/scale_argb.h
new file mode 100644
index 0000000000..b56cf52099
--- /dev/null
+++ b/libs/libyuv/include/libyuv/scale_argb.h
@@ -0,0 +1,56 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_  // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"  // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+              int src_width, int src_height,
+              uint8* dst_argb, int dst_stride_argb,
+              int dst_width, int dst_height,
+              enum FilterMode filtering);
+
+// Clipped scale takes destination rectangle coordinates for clip values.
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+                  int src_width, int src_height,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int dst_width, int dst_height,
+                  int clip_x, int clip_y, int clip_width, int clip_height,
+                  enum FilterMode filtering);
+
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint32 src_fourcc,
+                       int src_width, int src_height,
+                       uint8* dst_argb, int dst_stride_argb,
+                       uint32 dst_fourcc,
+                       int dst_width, int dst_height,
+                       int clip_x, int clip_y, int clip_width, int clip_height,
+                       enum FilterMode filtering);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/scale_row.h b/libs/libyuv/include/libyuv/scale_row.h
new file mode 100644
index 0000000000..a3b3ede60e
--- /dev/null
+++ b/libs/libyuv/include/libyuv/scale_row.h
@@ -0,0 +1,497 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ROW_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif  // GNUC >= 4.7
+#endif  // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_FIXEDDIV1_X86
+#define HAS_FIXEDDIV_X86
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
+#define HAS_SCALEARGBROWDOWN2_SSE2
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALEROWDOWN2_SSSE3
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEROWDOWN4_SSSE3
+#define HAS_SCALEADDROW_SSE2
+#endif
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCl but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
+    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_SCALEADDROW_AVX2
+#define HAS_SCALEROWDOWN2_AVX2
+#define HAS_SCALEROWDOWN4_AVX2
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEFILTERCOLS_NEON
+#define HAS_SCALEROWDOWN2_NEON
+#define HAS_SCALEROWDOWN34_NEON
+#define HAS_SCALEROWDOWN38_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+    defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_SCALEROWDOWN2_DSPR2
+#define HAS_SCALEROWDOWN4_DSPR2
+#define HAS_SCALEROWDOWN34_DSPR2
+#define HAS_SCALEROWDOWN38_DSPR2
+#endif
+
+// Scale a plane vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+                        int dst_width, int dst_height,
+                        int src_stride, int dst_stride,
+                        const uint8* src_argb, uint8* dst_argb,
+                        int x, int y, int dy,
+                        int bpp, enum FilterMode filtering);
+
+void ScalePlaneVertical_16(int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint16* src_argb, uint16* dst_argb,
+                           int x, int y, int dy,
+                           int wpp, enum FilterMode filtering);
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  enum FilterMode filtering);
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div);
+int FixedDiv_X86(int num, int div);
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div);
+int FixedDiv1_X86(int num, int div);
+#ifdef HAS_FIXEDDIV_X86
+#define FixedDiv FixedDiv_X86
+#define FixedDiv1 FixedDiv1_X86
+#else
+#define FixedDiv FixedDiv_C
+#define FixedDiv1 FixedDiv1_C
+#endif
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width, int src_height,
+                int dst_width, int dst_height,
+                enum FilterMode filtering,
+                int* x, int* y, int* dx, int* dy);
+
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width);
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width);
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                              uint16* dst, int dst_width);
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width);
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width);
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width);
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width);
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width);
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width);
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                 int dst_width, int x, int dx);
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                    int dst_width, int x, int dx);
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+                    int dst_width, int, int);
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int, int);
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx);
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                          int dst_width, int x, int dx);
+void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
+                         int dst_width, int x, int dx);
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                            int dst_width, int x, int dx);
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width);
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width);
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width);
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+                         ptrdiff_t src_stride,
+                         uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            int src_stepx,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+                     int dst_width, int x, int dx);
+void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
+                       int dst_width, int x, int dx);
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int, int);
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx);
+void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
+                             int dst_width, int x, int dx);
+
+// Specialized scalers for x86.
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx);
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx);
+
+
+// ARGB Column functions
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx);
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx);
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx);
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx);
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                                  int dst_width, int x, int dx);
+void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                            int dst_width, int x, int dx);
+
+// ARGB Row functions
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst, int dst_width);
+
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+
+// ScaleRowDown2Box also used by planar functions
+// NEON downscalers with interpolation.
+
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width);
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+
+// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
+//  to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst, int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+// 32 -> 12
+void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx);
+
+void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                              int dst_width, int x, int dx);
+
+void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst, int dst_width);
+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst, int dst_width);
+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst, int dst_width);
+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* d, int dst_width);
+void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst, int dst_width);
+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_ROW_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/version.h b/libs/libyuv/include/libyuv/version.h
new file mode 100644
index 0000000000..c7800d56a5
--- /dev/null
+++ b/libs/libyuv/include/libyuv/version.h
@@ -0,0 +1,16 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
+#define INCLUDE_LIBYUV_VERSION_H_
+
+#define LIBYUV_VERSION 1577
+
+#endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/libs/libyuv/include/libyuv/video_common.h b/libs/libyuv/include/libyuv/video_common.h
new file mode 100644
index 0000000000..ad934e4241
--- /dev/null
+++ b/libs/libyuv/include/libyuv/video_common.h
@@ -0,0 +1,184 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Common definitions for video, including fourcc and VideoFormat.
+
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_  // NOLINT
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+
+// Convert four characters to a FourCC code.
+// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#ifdef __cplusplus
+#define FOURCC(a, b, c, d) ( \
+    (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
+    (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#else
+#define FOURCC(a, b, c, d) ( \
+    ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
+    ((uint32)(c) << 16) | ((uint32)(d) << 24))  /* NOLINT */
+#endif
+
+// Some pages discussing FourCC codes:
+//   http://www.fourcc.org/yuv.php
+//   http://v4l2spec.bytesex.org/spec/book1.htm
+//   http://developer.apple.com/quicktime/icefloe/dispatch020.html
+//   http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+//   http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+
+// FourCC codes grouped according to implementation efficiency.
+// Primary formats should convert in 1 efficient step.
+// Secondary formats are converted in 2 steps.
+// Auxiliary formats call primary converters.
+enum FourCC {
+  // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+  FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+  FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+  FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+  FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+  FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+  FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+  FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+  FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+  // 2 Secondary YUV formats: row biplanar.
+  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),  // deprecated.
+
+  // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+  FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+  FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+  FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+  FOURCC_RAW  = FOURCC('r', 'a', 'w', ' '),
+  FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+  FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
+  FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
+  FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
+
+  // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
+  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+  // 1 Primary Compressed YUV format.
+  FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+  // 7 Auxiliary YUV variations, including 3 with U and V planes swapped and 1 alias.
+  FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+  FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+  FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+  FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
+  FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+  FOURCC_J400 = FOURCC('J', '4', '0', '0'),  // unofficial fourcc
+  FOURCC_H420 = FOURCC('H', '4', '2', '0'),  // unofficial fourcc
+
+  // 17 Auxiliary aliases.  CanonicalFourCC() maps these to canonical fourcc.
+  FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
+  FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'),  // Alias for I422.
+  FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'),  // Alias for I444.
+  FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2.
+  FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac.
+  FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY.
+  FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY on Mac.
+  FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG.
+  FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'),  // Alias for MJPG on Mac.
+  FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR.
+  FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW.
+  FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG.
+  FOURCC_CM32 = FOURCC(0, 0, 0, 32),  // Alias for BGRA kCMPixelFormat_32ARGB
+  FOURCC_CM24 = FOURCC(0, 0, 0, 24),  // Alias for RAW kCMPixelFormat_24RGB
+  FOURCC_L555 = FOURCC('L', '5', '5', '5'),  // Alias for RGBO.
+  FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
+  FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
+
+  // 1 Auxiliary compressed YUV format set aside for capturer.
+  FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+  // Match any fourcc.
+  FOURCC_ANY = -1,
+};
+
+enum FourCCBpp {
+  // Canonical fourcc codes used in our code.
+  FOURCC_BPP_I420 = 12,
+  FOURCC_BPP_I422 = 16,
+  FOURCC_BPP_I444 = 24,
+  FOURCC_BPP_I411 = 12,
+  FOURCC_BPP_I400 = 8,
+  FOURCC_BPP_NV21 = 12,
+  FOURCC_BPP_NV12 = 12,
+  FOURCC_BPP_YUY2 = 16,
+  FOURCC_BPP_UYVY = 16,
+  FOURCC_BPP_M420 = 12,
+  FOURCC_BPP_Q420 = 12,
+  FOURCC_BPP_ARGB = 32,
+  FOURCC_BPP_BGRA = 32,
+  FOURCC_BPP_ABGR = 32,
+  FOURCC_BPP_RGBA = 32,
+  FOURCC_BPP_24BG = 24,
+  FOURCC_BPP_RAW  = 24,
+  FOURCC_BPP_RGBP = 16,
+  FOURCC_BPP_RGBO = 16,
+  FOURCC_BPP_R444 = 16,
+  FOURCC_BPP_RGGB = 8,
+  FOURCC_BPP_BGGR = 8,
+  FOURCC_BPP_GRBG = 8,
+  FOURCC_BPP_GBRG = 8,
+  FOURCC_BPP_YV12 = 12,
+  FOURCC_BPP_YV16 = 16,
+  FOURCC_BPP_YV24 = 24,
+  FOURCC_BPP_YU12 = 12,
+  FOURCC_BPP_J420 = 12,
+  FOURCC_BPP_J400 = 8,
+  FOURCC_BPP_H420 = 12,
+  FOURCC_BPP_MJPG = 0,  // 0 means unknown.
+  FOURCC_BPP_H264 = 0,
+  FOURCC_BPP_IYUV = 12,
+  FOURCC_BPP_YU16 = 16,
+  FOURCC_BPP_YU24 = 24,
+  FOURCC_BPP_YUYV = 16,
+  FOURCC_BPP_YUVS = 16,
+  FOURCC_BPP_HDYC = 16,
+  FOURCC_BPP_2VUY = 16,
+  FOURCC_BPP_JPEG = 1,
+  FOURCC_BPP_DMB1 = 1,
+  FOURCC_BPP_BA81 = 8,
+  FOURCC_BPP_RGB3 = 24,
+  FOURCC_BPP_BGR3 = 24,
+  FOURCC_BPP_CM32 = 32,
+  FOURCC_BPP_CM24 = 24,
+
+  // Match any fourcc.
+  FOURCC_BPP_ANY  = 0,  // 0 means unknown.
+};
+
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_  NOLINT
diff --git a/libs/libyuv/libyuv.gyp b/libs/libyuv/libyuv.gyp
new file mode 100644
index 0000000000..44dec09eec
--- /dev/null
+++ b/libs/libyuv/libyuv.gyp
@@ -0,0 +1,149 @@
+# Copyright 2011 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+  'includes': [
+    'libyuv.gypi',
+  ],
+  # Make sure that if we are being compiled to an xcodeproj, nothing tries to
+  # include a .pch.
+  'xcode_settings': {
+    'GCC_PREFIX_HEADER': '',
+    'GCC_PRECOMPILE_PREFIX_HEADER': 'NO',
+  },
+  'variables': {
+    'use_system_libjpeg%': 0,
+    'libyuv_disable_jpeg%': 0,
+    # 'chromium_code' treats libyuv as internal and increases warning level.
+    'chromium_code': 1,
+    # clang compiler default variable usable by other apps that include libyuv.
+    'clang%': 0,
+    # Link-Time Optimizations.
+    'use_lto%': 0,
+    'build_neon': 0,
+    'conditions': [
+       ['(target_arch == "armv7" or target_arch == "armv7s" or \
+       (target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
+       and (arm_neon == 1 or arm_neon_optional == 1)',
+       {
+         'build_neon': 1,
+       }],
+    ],
+  },
+
+  'targets': [
+    {
+      'target_name': 'libyuv',
+      # Change type to 'shared_library' to build .so or .dll files.
+      'type': 'static_library',
+      'variables': {
+        'optimize': 'max',  # enable O2 and ltcg.
+      },
+      # Allows libyuv.a redistributable library without external dependencies.
+      'standalone_static_library': 1,
+      'conditions': [
+        ['build_neon != 0', {
+          'defines': [
+            'LIBYUV_NEON',
+          ],
+          'cflags!': [
+            '-mfpu=vfp',
+            '-mfpu=vfpv3',
+            '-mfpu=vfpv3-d16',
+          ],
+          'conditions': [
+            # Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
+            ['clang == 0 and use_lto == 1', {
+              'cflags!': [
+                '-flto',
+                '-ffat-lto-objects',
+              ],
+            }],
+            # arm64 does not need -mfpu=neon option as neon is not optional
+            ['target_arch != "arm64"', {
+              'cflags': [
+                '-mfpu=neon',
+              ],
+            }],
+          ],
+        }],
+        ['OS != "ios" and libyuv_disable_jpeg != 1', {
+          'defines': [
+            'HAVE_JPEG'
+          ],
+          'conditions': [
+            # Caveat system jpeg support may not support motion jpeg
+            [ 'use_system_libjpeg == 1', {
+              'dependencies': [
+                 '<(DEPTH)/third_party/libjpeg/libjpeg.gyp:libjpeg',
+              ],
+            }, {
+              'dependencies': [
+                 '<(DEPTH)/third_party/libjpeg_turbo/libjpeg.gyp:libjpeg',
+              ],
+            }],
+            [ 'use_system_libjpeg == 1', {
+              'link_settings': {
+                'libraries': [
+                  '-ljpeg',
+                ],
+              }
+            }],
+          ],
+        }],
+        # MemorySanitizer does not support assembly code yet.
+        # http://crbug.com/344505
+        [ 'msan == 1', {
+          'defines': [
+            'LIBYUV_DISABLE_X86',
+          ],
+        }],
+      ], #conditions
+      'defines': [
+        # Enable the following 3 macros to turn off assembly for specified CPU.
+        # 'LIBYUV_DISABLE_X86',
+        # 'LIBYUV_DISABLE_NEON',
+        # 'LIBYUV_DISABLE_MIPS',
+        # Enable the following macro to build libyuv as a shared library (dll).
+        # 'LIBYUV_USING_SHARED_LIBRARY',
+        # TODO(fbarchard): Make these into gyp defines.
+      ],
+      'include_dirs': [
+        'include',
+        '.',
+      ],
+      'direct_dependent_settings': {
+        'include_dirs': [
+          'include',
+          '.',
+        ],
+        'conditions': [
+          ['OS == "android" and target_arch == "arm64"', {
+            'ldflags': [
+              '-Wl,--dynamic-linker,/system/bin/linker64',
+            ],
+          }],
+          ['OS == "android" and target_arch != "arm64"', {
+            'ldflags': [
+              '-Wl,--dynamic-linker,/system/bin/linker',
+            ],
+          }],
+        ], #conditions
+      },
+      'sources': [
+        '<@(libyuv_sources)',
+      ],
+    },
+  ], # targets.
+}
+
+# Local Variables:
+# tab-width:2
+# indent-tabs-mode:nil
+# End:
+# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/libs/libyuv/libyuv.gypi b/libs/libyuv/libyuv.gypi
new file mode 100644
index 0000000000..73fdec0a9f
--- /dev/null
+++ b/libs/libyuv/libyuv.gypi
@@ -0,0 +1,79 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+  'variables': {
+    'libyuv_sources': [
+      # includes.
+      'include/libyuv.h',
+      'include/libyuv/basic_types.h',
+      'include/libyuv/compare.h',
+      'include/libyuv/convert.h',
+      'include/libyuv/convert_argb.h',
+      'include/libyuv/convert_from.h',
+      'include/libyuv/convert_from_argb.h',
+      'include/libyuv/cpu_id.h',
+      'include/libyuv/mjpeg_decoder.h',
+      'include/libyuv/planar_functions.h',
+      'include/libyuv/rotate.h',
+      'include/libyuv/rotate_argb.h',
+      'include/libyuv/rotate_row.h',
+      'include/libyuv/row.h',
+      'include/libyuv/scale.h',
+      'include/libyuv/scale_argb.h',
+      'include/libyuv/scale_row.h',
+      'include/libyuv/version.h',
+      'include/libyuv/video_common.h',
+
+      # sources.
+      'source/compare.cc',
+      'source/compare_common.cc',
+      'source/compare_gcc.cc',
+      'source/compare_neon.cc',
+      'source/compare_neon64.cc',
+      'source/compare_win.cc',
+      'source/convert.cc',
+      'source/convert_argb.cc',
+      'source/convert_from.cc',
+      'source/convert_from_argb.cc',
+      'source/convert_jpeg.cc',
+      'source/convert_to_argb.cc',
+      'source/convert_to_i420.cc',
+      'source/cpu_id.cc',
+      'source/mjpeg_decoder.cc',
+      'source/mjpeg_validate.cc',
+      'source/planar_functions.cc',
+      'source/rotate.cc',
+      'source/rotate_any.cc',
+      'source/rotate_argb.cc',
+      'source/rotate_common.cc',
+      'source/rotate_gcc.cc',
+      'source/rotate_mips.cc',
+      'source/rotate_neon.cc',
+      'source/rotate_neon64.cc',
+      'source/rotate_win.cc',
+      'source/row_any.cc',
+      'source/row_common.cc',
+      'source/row_gcc.cc',
+      'source/row_mips.cc',
+      'source/row_neon.cc',
+      'source/row_neon64.cc',
+      'source/row_win.cc',
+      'source/scale.cc',
+      'source/scale_any.cc',
+      'source/scale_argb.cc',
+      'source/scale_common.cc',
+      'source/scale_gcc.cc',
+      'source/scale_mips.cc',
+      'source/scale_neon.cc',
+      'source/scale_neon64.cc',
+      'source/scale_win.cc',
+      'source/video_common.cc',
+    ],
+  }
+}
diff --git a/libs/libyuv/libyuv_nacl.gyp b/libs/libyuv/libyuv_nacl.gyp
new file mode 100644
index 0000000000..b8fe57ee3e
--- /dev/null
+++ b/libs/libyuv/libyuv_nacl.gyp
@@ -0,0 +1,37 @@
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+  'includes': [
+    'libyuv.gypi',
+    '../../native_client/build/untrusted.gypi',
+  ],
+  'targets': [
+    {
+      'target_name': 'libyuv_nacl',
+      'type': 'none',
+      'variables': {
+        'nlib_target': 'libyuv_nacl.a',
+        'build_glibc': 0,
+        'build_newlib': 0,
+        'build_pnacl_newlib': 1,
+      },
+      'include_dirs': [
+        'include',
+      ],
+      'direct_dependent_settings': {
+        'include_dirs': [
+          'include',
+        ],
+      },
+      'sources': [
+        '<@(libyuv_sources)',
+      ],
+    },  # target libyuv_nacl
+  ]
+}
diff --git a/libs/libyuv/libyuv_test.gyp b/libs/libyuv/libyuv_test.gyp
new file mode 100644
index 0000000000..0b1c825aae
--- /dev/null
+++ b/libs/libyuv/libyuv_test.gyp
@@ -0,0 +1,227 @@
+# Copyright 2011 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+  'variables': {
+    'libyuv_disable_jpeg%': 0,
+  },
+  'targets': [
+    {
+      'target_name': 'libyuv_unittest',
+      'type': '<(gtest_target_type)',
+      'dependencies': [
+        'libyuv.gyp:libyuv',
+        'testing/gtest.gyp:gtest',
+        'third_party/gflags/gflags.gyp:gflags',
+      ],
+      'direct_dependent_settings': {
+        'defines': [
+          'GTEST_RELATIVE_PATH',
+        ],
+      },
+      'export_dependent_settings': [
+        '<(DEPTH)/testing/gtest.gyp:gtest',
+      ],
+      'sources': [
+        # headers
+        'unit_test/unit_test.h',
+
+        # sources
+        'unit_test/basictypes_test.cc',
+        'unit_test/compare_test.cc',
+        'unit_test/color_test.cc',
+        'unit_test/convert_test.cc',
+        'unit_test/cpu_test.cc',
+        'unit_test/math_test.cc',
+        'unit_test/planar_test.cc',
+        'unit_test/rotate_argb_test.cc',
+        'unit_test/rotate_test.cc',
+        'unit_test/scale_argb_test.cc',
+        'unit_test/scale_test.cc',
+        'unit_test/unit_test.cc',
+        'unit_test/video_common_test.cc',
+      ],
+      'conditions': [
+        ['OS=="linux"', {
+          'cflags': [
+            '-fexceptions',
+          ],
+        }],
+        [ 'OS == "ios" and target_subarch == 64', {
+          'defines': [
+            'LIBYUV_DISABLE_NEON'
+          ],
+        }],
+        [ 'OS == "ios"', {
+          'xcode_settings': {
+            'DEBUGGING_SYMBOLS': 'YES',
+            'DEBUG_INFORMATION_FORMAT' : 'dwarf-with-dsym',
+            # Work around compile issue with isosim.mm, see
+            # https://code.google.com/p/libyuv/issues/detail?id=548 for details.
+            'WARNING_CFLAGS': [
+              '-Wno-sometimes-uninitialized',
+            ],
+          },
+          'cflags': [
+            '-Wno-sometimes-uninitialized',
+          ],
+        }],
+        [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
+          'defines': [
+            'HAVE_JPEG',
+          ],
+        }],
+        ['OS=="android"', {
+          'dependencies': [
+            '<(DEPTH)/testing/android/native_test.gyp:native_test_native_code',
+          ],
+        }],
+        # TODO(YangZhang): These lines can be removed when high accuracy
+        # YUV to RGB to Neon is ported.
+        [ '(target_arch == "armv7" or target_arch == "armv7s" \
+          or (target_arch == "arm" and arm_version >= 7) \
+          or target_arch == "arm64") \
+          and (arm_neon == 1 or arm_neon_optional == 1)', {
+          'defines': [
+            'LIBYUV_NEON'
+          ],
+        }],
+        # MemorySanitizer does not support assembly code yet.
+        # http://crbug.com/344505
+        [ 'msan == 1', {
+          'defines': [
+            'LIBYUV_DISABLE_X86',
+          ],
+        }],
+      ], # conditions
+      'defines': [
+        # Enable the following 3 macros to turn off assembly for specified CPU.
+        # 'LIBYUV_DISABLE_X86',
+        # 'LIBYUV_DISABLE_NEON',
+        # 'LIBYUV_DISABLE_MIPS',
+        # Enable the following macro to build libyuv as a shared library (dll).
+        # 'LIBYUV_USING_SHARED_LIBRARY',
+      ],
+    },
+    {
+      'target_name': 'compare',
+      'type': 'executable',
+      'dependencies': [
+        'libyuv.gyp:libyuv',
+      ],
+      'sources': [
+        # sources
+        'util/compare.cc',
+      ],
+      'conditions': [
+        ['OS=="linux"', {
+          'cflags': [
+            '-fexceptions',
+          ],
+        }],
+      ], # conditions
+    },
+    {
+      'target_name': 'convert',
+      'type': 'executable',
+      'dependencies': [
+        'libyuv.gyp:libyuv',
+      ],
+      'sources': [
+        # sources
+        'util/convert.cc',
+      ],
+      'conditions': [
+        ['OS=="linux"', {
+          'cflags': [
+            '-fexceptions',
+          ],
+        }],
+      ], # conditions
+    },
+    # TODO(fbarchard): Enable SSE2 and OpenMP for better performance.
+    {
+      'target_name': 'psnr',
+      'type': 'executable',
+      'sources': [
+        # sources
+        'util/psnr_main.cc',
+        'util/psnr.cc',
+        'util/ssim.cc',
+      ],
+      'dependencies': [
+        'libyuv.gyp:libyuv',
+      ],
+      'conditions': [
+        [ 'OS == "ios" and target_subarch == 64', {
+          'defines': [
+            'LIBYUV_DISABLE_NEON'
+          ],
+        }],
+
+        [ 'OS != "ios" and libyuv_disable_jpeg != 1', {
+          'defines': [
+            'HAVE_JPEG',
+          ],
+        }],
+      ], # conditions
+    },
+
+    {
+      'target_name': 'cpuid',
+      'type': 'executable',
+      'sources': [
+        # sources
+        'util/cpuid.c',
+      ],
+      'dependencies': [
+        'libyuv.gyp:libyuv',
+      ],
+    },
+  ], # targets
+  'conditions': [
+    ['OS=="android"', {
+      'targets': [
+        {
+          # TODO(kjellander): Figure out what to change in build/apk_test.gypi
+          # so it can be used instead of the copied code below. Using it in its
+          # current version was not possible, since the target starts with 'lib',
+          # which somewhere confuses the variables.
+          'target_name': 'libyuv_unittest_apk',
+          'type': 'none',
+          'variables': {
+            # These are used to configure java_apk.gypi included below.
+            'test_type': 'gtest',
+            'apk_name': 'libyuv_unittest',
+            'intermediate_dir': '<(PRODUCT_DIR)/libyuv_unittest_apk',
+            'final_apk_path': '<(intermediate_dir)/libyuv_unittest-debug.apk',
+            'java_in_dir': '<(DEPTH)/testing/android/native_test/java',
+            'native_lib_target': 'libyuv_unittest',
+            'gyp_managed_install': 0,
+          },
+          'includes': [ 'build/java_apk.gypi' ],
+          'dependencies': [
+            '<(DEPTH)/base/base.gyp:base_java',
+            '<(DEPTH)/build/android/pylib/device/commands/commands.gyp:chromium_commands',
+            '<(DEPTH)/build/android/pylib/remote/device/dummy/dummy.gyp:remote_device_dummy_apk',
+            '<(DEPTH)/testing/android/appurify_support.gyp:appurify_support_java',
+            '<(DEPTH)/testing/android/on_device_instrumentation.gyp:reporter_java',
+            '<(DEPTH)/tools/android/android_tools.gyp:android_tools',
+            'libyuv_unittest',
+          ],
+        },
+      ],
+    }],
+  ],
+}
+
+# Local Variables:
+# tab-width:2
+# indent-tabs-mode:nil
+# End:
+# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/libs/libyuv/linux.mk b/libs/libyuv/linux.mk
new file mode 100644
index 0000000000..563f8ef45d
--- /dev/null
+++ b/libs/libyuv/linux.mk
@@ -0,0 +1,52 @@
+# This is a generic makefile for libyuv for gcc.
+# make -f linux.mk CXX=clang++
+# -Iinclude/ is appended with `override` so command-line CXXFLAGS keep it.
+CXX?=g++
+CXXFLAGS?=-O2 -fomit-frame-pointer
+override CXXFLAGS+=-Iinclude/
+
+LOCAL_OBJ_FILES := \
+    source/compare.o           \
+    source/compare_common.o    \
+    source/compare_gcc.o       \
+    source/convert.o           \
+    source/convert_argb.o      \
+    source/convert_from.o      \
+    source/convert_from_argb.o \
+    source/convert_to_argb.o   \
+    source/convert_to_i420.o   \
+    source/cpu_id.o            \
+    source/planar_functions.o  \
+    source/rotate.o            \
+    source/rotate_any.o        \
+    source/rotate_argb.o       \
+    source/rotate_common.o     \
+    source/rotate_gcc.o        \
+    source/rotate_mips.o       \
+    source/row_any.o           \
+    source/row_common.o        \
+    source/row_mips.o          \
+    source/row_gcc.o           \
+    source/scale.o             \
+    source/scale_any.o         \
+    source/scale_argb.o        \
+    source/scale_common.o      \
+    source/scale_gcc.o         \
+    source/scale_mips.o        \
+    source/video_common.o
+
+.cc.o:
+	$(CXX) -c $(CXXFLAGS) $< -o $@
+
+.PHONY: all clean
+all: libyuv.a convert
+
+libyuv.a: $(LOCAL_OBJ_FILES)
+	$(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES)
+
+# A test utility that uses libyuv conversion.
+convert: util/convert.cc libyuv.a
+	$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/convert.cc libyuv.a
+
+clean:
+	/bin/rm -f source/*.o *.ii *.s libyuv.a convert
diff --git a/libs/libyuv/public.mk b/libs/libyuv/public.mk
new file mode 100644
index 0000000000..090d8cb659
--- /dev/null
+++ b/libs/libyuv/public.mk
@@ -0,0 +1,13 @@
+# This file contains all the common make variables which are useful for
+# anyone depending on this library.
+# Note that dependencies on NDK are not directly listed since NDK auto adds
+# them.
+
+LIBYUV_INCLUDES := $(LIBYUV_PATH)/include
+
+LIBYUV_C_FLAGS :=
+
+LIBYUV_CPP_FLAGS :=
+
+LIBYUV_LDLIBS :=
+LIBYUV_DEP_MODULES :=
diff --git a/libs/libyuv/setup_links.py b/libs/libyuv/setup_links.py
new file mode 100755
index 0000000000..975ef90471
--- /dev/null
+++ b/libs/libyuv/setup_links.py
@@ -0,0 +1,519 @@
+#!/usr/bin/env python
+# Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""Setup links to a Chromium checkout for WebRTC.
+
+WebRTC standalone shares a lot of dependencies and build tools with Chromium.
+To do this, many of the paths of a Chromium checkout are emulated by creating
+symlinks to files and directories. This script handles the setup of symlinks to
+achieve this.
+
+It also handles cleanup of the legacy Subversion-based approach that was used
+before Chrome switched over their master repo from Subversion to Git.
+"""
+
+
+import ctypes
+import errno
+import logging
+import optparse
+import os
+import shelve
+import shutil
+import subprocess
+import sys
+import textwrap
+
+
+DIRECTORIES = [
+  'build',
+  'buildtools',
+  'google_apis',  # Needed by build/common.gypi.
+  'net',
+  'testing',
+  'third_party/binutils',
+  'third_party/boringssl',
+  'third_party/colorama',
+  'third_party/drmemory',
+  'third_party/expat',
+  'third_party/icu',
+  'third_party/instrumented_libraries',
+  'third_party/jsoncpp',
+  'third_party/libjpeg',
+  'third_party/libjpeg_turbo',
+  'third_party/libsrtp',
+  'third_party/libudev',
+  'third_party/libvpx_new',
+  'third_party/libyuv',
+  'third_party/llvm-build',
+  'third_party/lss',
+  'third_party/nss',
+  'third_party/ocmock',
+  'third_party/openmax_dl',
+  'third_party/opus',
+  'third_party/proguard',
+  'third_party/protobuf',
+  'third_party/sqlite',
+  'third_party/syzygy',
+  'third_party/usrsctp',
+  'third_party/yasm',
+  'third_party/zlib',
+  'tools/clang',
+  'tools/generate_library_loader',
+  'tools/gn',
+  'tools/gyp',
+  'tools/memory',
+  'tools/protoc_wrapper',
+  'tools/python',
+  'tools/swarming_client',
+  'tools/valgrind',
+  'tools/vim',
+  'tools/win',
+]
+
+from sync_chromium import get_target_os_list
+target_os = get_target_os_list()
+if 'android' in target_os:
+  DIRECTORIES += [
+    'base',
+    'third_party/android_platform',
+    'third_party/android_testrunner',
+    'third_party/android_tools',
+    'third_party/appurify-python',
+    'third_party/ashmem',
+    'third_party/catapult',
+    'third_party/ijar',
+    'third_party/jsr-305',
+    'third_party/junit',
+    'third_party/libevent',
+    'third_party/libxml',
+    'third_party/mockito',
+    'third_party/modp_b64',
+    'third_party/requests',
+    'third_party/robolectric',
+    'tools/android',
+    'tools/grit',
+    'tools/relocation_packer',
+    'tools/telemetry',
+  ]
+if 'ios' in target_os:
+  DIRECTORIES.append('third_party/class-dump')
+
+FILES = {
+  'tools/find_depot_tools.py': None,
+  'tools/isolate_driver.py': None,
+  'third_party/BUILD.gn': None,
+}
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+CHROMIUM_CHECKOUT = os.path.join('chromium', 'src')
+LINKS_DB = 'links'
+
+# Version management to make future upgrades/downgrades easier to support.
+SCHEMA_VERSION = 1
+
+
+def query_yes_no(question, default=False):
+  """Ask a yes/no question via raw_input() and return their answer.
+
+  Returns True for yes, False for no, or `default` for an empty reply. With
+  default=None there is no default, so an empty reply re-prompts.
+  Modified from http://stackoverflow.com/a/3041990.
+  """
+  # The capitalised letter in the prompt marks the default choice.
+  prompt = " [%s/%s]: " % ('Y' if default is True else 'y',
+                           'N' if default is False else 'n')
+
+  while True:
+    sys.stdout.write(question + prompt)
+    choice = raw_input().lower()
+    if not choice:
+      # 'yes'.startswith('') is True, so never prefix-match an empty reply.
+      if default is not None:
+        return default
+    elif 'yes'.startswith(choice):
+      return True
+    elif 'no'.startswith(choice):
+      return False
+
+    print "Please respond with 'yes' or 'no' (or 'y' or 'n')."
+
+
+# Actions
+class Action(object):
+  """Base class for one planned filesystem step; subclasses override hooks."""
+  def __init__(self, dangerous):
+    self.dangerous = dangerous
+
+  def announce(self, planning):
+    """Log a description of this action (does nothing in the base class).
+
+    Args:
+      planning: True while the plan is being presented, False when the
+                action is about to be executed by doit().
+    """
+
+  def doit(self, links_db):
+    """Run the action, recording the result in links_db when relevant."""
+    return None
+
+
+class Remove(Action):
+  """Deletes one path with os.remove(); announced as a file or a link."""
+  def __init__(self, path, dangerous):
+    super(Remove, self).__init__(dangerous)
+    self._path = path
+    self._priority = 0
+
+  def announce(self, planning):
+    # Dangerous removals are reported as files at warning level, safe ones
+    # as links at info level.
+    if self.dangerous:
+      emit, kind = logging.warn, 'file'
+    else:
+      emit, kind = logging.info, 'link'
+    template = 'Planning to remove %s: %s' if planning else 'Removing %s: %s'
+    emit(template, kind, self._path)
+
+  def doit(self, _):
+    os.remove(self._path)
+
+
+class Rmtree(Action):
+  """Recursively deletes a directory; always marked dangerous."""
+  def __init__(self, path):
+    super(Rmtree, self).__init__(dangerous=True)
+    self._path = path
+    self._priority = 0
+
+  def announce(self, planning):
+    message = 'Removing directory: %s'
+    if planning:
+      message = 'Planning to remove directory: %s'
+    logging.warn(message, self._path)
+
+  def doit(self, _):
+    if not sys.platform.startswith('win'):
+      return shutil.rmtree(self._path)
+    # shutil.rmtree() doesn't work on Windows if any of the directories are
+    # read-only, which svn repositories are, so shell out to `rd` instead.
+    subprocess.check_call(['rd', '/q', '/s', self._path], shell=True)
+
+
+class Makedirs(Action):
+  def __init__(self, path):
+    super(Makedirs, self).__init__(dangerous=False)
+    self._priority, self._path = 1, path
+
+  def doit(self, _):
+    """Create the directory tree; an already-existing path is not an error."""
+    try:
+      os.makedirs(self._path)
+    except OSError as error:
+      if error.errno != errno.EEXIST:
+        raise
+
+
+class Symlink(Action):
+  def __init__(self, source_path, link_path):
+    super(Symlink, self).__init__(dangerous=False)
+    self._priority = 2
+    self._source_path = source_path
+    self._link_path = link_path
+
+  def announce(self, planning):
+    if planning:
+      logging.info(
+          'Planning to create link from %s to %s', self._link_path,
+          self._source_path)
+    else:
+      logging.debug(
+          'Linking from %s to %s', self._link_path, self._source_path)
+
+  def doit(self, links_db):
+    """Create the link and record source path -> link path in links_db."""
+    # On Windows, use absolute paths since NTFS doesn't seem to support
+    # relative paths for symlinks. Elsewhere always link relative to the
+    # link's own directory, so source_path can never be left unbound.
+    if sys.platform.startswith('win'):
+      source_path = os.path.abspath(self._source_path)
+    else:
+      source_path = os.path.relpath(self._source_path,
+                                    os.path.dirname(self._link_path))
+
+    os.symlink(source_path, os.path.abspath(self._link_path))
+    links_db[self._source_path] = self._link_path
+
+
+class LinkError(IOError):
+  """Raised when a link cannot be planned or created."""
+  pass
+
+
+# On Windows, replace os.symlink with a CreateSymbolicLinkW-based shim.
+if sys.platform.startswith('win'):
+  def symlink(source_path, link_path):
+    flag = 1 if os.path.isdir(source_path) else 0
+    if not ctypes.windll.kernel32.CreateSymbolicLinkW(
+        unicode(link_path), unicode(source_path), flag):
+      raise OSError('Failed to create symlink to %s. Notice that only NTFS '
+                    'version 5.0 and up has all the needed APIs for '
+                    'creating symlinks.' % source_path)
+  os.symlink = symlink
+
+
+class WebRTCLinkSetup(object):
+  def __init__(self, links_db, force=False, dry_run=False, prompt=False):
+    self._links_db = links_db
+    self._prompt = prompt
+    self._dry_run = dry_run
+    self._force = force
+
+  def CreateLinks(self, on_bot):
+    """Plan, confirm and execute the actions that (re)create all links."""
+    logging.debug('CreateLinks')
+    # First, make a plan of action
+    actions = []
+
+    for source_path, link_path in FILES.iteritems():
+      actions += self._ActionForPath(
+          source_path, link_path, check_fn=os.path.isfile, check_msg='files')
+    for source_dir in DIRECTORIES:
+      actions += self._ActionForPath(
+          source_dir, None, check_fn=os.path.isdir, check_msg='directories')
+
+    if not on_bot and self._force:
+      # When making the manual switch from legacy SVN checkouts to the new
+      # Git-based Chromium DEPS, the .gclient_entries file that contains cached
+      # URLs for all DEPS entries must be removed to avoid future sync problems.
+      entries_file = os.path.join(os.path.dirname(ROOT_DIR), '.gclient_entries')
+      if os.path.exists(entries_file):
+        actions.append(Remove(entries_file, dangerous=True))
+    # Order by declared priority: removals (0), makedirs (1), symlinks (2).
+    actions.sort(key=lambda action: action._priority)
+
+    if self._dry_run:
+      for action in actions:
+        action.announce(planning=True)
+      logging.info('Not doing anything because dry-run was specified.')
+      sys.exit(0)
+
+    if any(a.dangerous for a in actions):
+      logging.warn('Dangerous actions:')
+      for action in (a for a in actions if a.dangerous):
+        action.announce(planning=True)
+      print
+
+      if not self._force:
+        logging.error(textwrap.dedent("""\
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+                              A C T I O N     R E Q I R E D
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        Because chromium/src is transitioning to Git (from SVN), we needed to
+        change the way that the WebRTC standalone checkout works. Instead of
+        individually syncing subdirectories of Chromium in SVN, we're now
+        syncing Chromium (and all of its DEPS, as defined by its own DEPS file),
+        into the `chromium/src` directory.
+
+        As such, all Chromium directories which are currently pulled by DEPS are
+        now replaced with a symlink into the full Chromium checkout.
+
+        To avoid disrupting developers, we've chosen to not delete your
+        directories forcibly, in case you have some work in progress in one of
+        them :).
+
+        ACTION REQUIRED:
+        Before running `gclient sync|runhooks` again, you must run:
+        %s%s --force
+
+        Which will replace all directories which now must be symlinks, after
+        prompting with a summary of the work-to-be-done.
+        """), 'python ' if sys.platform.startswith('win') else '', sys.argv[0])
+        sys.exit(1)
+      elif self._prompt:
+        if not query_yes_no('Would you like to perform the above plan?'):
+          sys.exit(1)
+
+    for action in actions:
+      action.announce(planning=False)
+      action.doit(self._links_db)
+
+    if not on_bot and self._force:
+      logging.info('Completed!\n\nNow run `gclient sync|runhooks` again to '
+                   'let the remaining hooks (that probably were interrupted) '
+                   'execute.')
+
+  def CleanupLinks(self):
+    """Delete recorded links (dangling ones too) and their db entries."""
+    logging.debug('CleanupLinks')
+    for source, link_path in self._links_db.items():  # snapshot: we delete
+      if source == 'SCHEMA_VERSION':
+        continue
+      if os.path.islink(link_path) or sys.platform.startswith('win'):
+        # islink() is always False on Windows: http://bugs.python.org/issue13143
+        logging.debug('Removing link to %s at %s', source, link_path)
+        if not self._dry_run:
+          if os.path.lexists(link_path):
+            if sys.platform.startswith('win') and os.path.isdir(link_path):
+              subprocess.check_call(['rmdir', '/q', '/s', link_path],
+                                    shell=True)
+            else:
+              os.remove(link_path)
+          del self._links_db[source]
+
+  @staticmethod
+  def _ActionForPath(source_path, link_path=None, check_fn=None,
+                     check_msg=None):
+    """Create zero or more Actions to link to a file or directory.
+
+    This will be a symlink on POSIX platforms. On Windows this requires
+    that NTFS is version 5.0 or higher (Vista or newer).
+
+    Args:
+      source_path: Path relative to the Chromium checkout root.
+        For readability, the path may contain slashes, which will
+        automatically be converted to the right path delimiter on Windows.
+      link_path: The location for the link to create. If omitted it will be the
+        same path as source_path.
+      check_fn: A function called with the source path; it returns true if the
+        type of filesystem object is correct. Otherwise a LinkError naming
+        check_msg is raised.
+      check_msg: String used to inform the user of an invalid attempt to create
+        a file.
+    Returns:
+      A list of Action objects.
+    """
+    def fix_separators(path):
+      if sys.platform.startswith('win'):
+        return path.replace(os.altsep, os.sep)
+      else:
+        return path
+
+    assert check_fn
+    assert check_msg
+    link_path = link_path or source_path
+    link_path = fix_separators(link_path)
+
+    source_path = fix_separators(source_path)
+    source_path = os.path.join(CHROMIUM_CHECKOUT, source_path)
+    if os.path.exists(source_path) and not check_fn(source_path):
+      raise LinkError('_LinkChromiumPath can only be used to link to %s: '
+                      'Tried to link to: %s' % (check_msg, source_path))
+
+    if not os.path.exists(source_path):
+      logging.debug('Silently ignoring missing source: %s. This is to avoid '
+                    'errors on platform-specific dependencies.', source_path)
+      return []
+
+    actions = []
+
+    if os.path.exists(link_path) or os.path.islink(link_path):
+      if os.path.islink(link_path):
+        actions.append(Remove(link_path, dangerous=False))
+      elif os.path.isfile(link_path):
+        actions.append(Remove(link_path, dangerous=True))
+      elif os.path.isdir(link_path):
+        actions.append(Rmtree(link_path))
+      else:
+        raise LinkError('Don\'t know how to plan: %s' % link_path)
+
+    # Create parent directories to the target link if needed.
+    target_parent_dirs = os.path.dirname(link_path)
+    if (target_parent_dirs and
+        target_parent_dirs != link_path and
+        not os.path.exists(target_parent_dirs)):
+      actions.append(Makedirs(target_parent_dirs))
+
+    actions.append(Symlink(source_path, link_path))
+
+    return actions
+
+def _initialize_database(filename):
+  # Open the shelve db at |filename|, stamp it with SCHEMA_VERSION, return it.
+  links_database = shelve.open(filename)
+  # Wipe the database if it was written with any schema version other than
+  # the one this script supports (older or newer), just to be sure.
+  version = links_database.get('SCHEMA_VERSION')
+  if version and version != SCHEMA_VERSION:
+    logging.info('Found database with schema version %s while this script only '
+                 'supports %s. Wiping previous database contents.', version,
+                 SCHEMA_VERSION)
+    links_database.clear()
+  links_database['SCHEMA_VERSION'] = SCHEMA_VERSION
+  return links_database
+
+
+def main():
+  on_bot = os.environ.get('CHROME_HEADLESS') == '1'
+  # Bots (CHROME_HEADLESS=1) default to --force and to not prompting.
+  parser = optparse.OptionParser()
+  parser.add_option('-d', '--dry-run', action='store_true', default=False,
+                    help='Print what would be done, but don\'t perform any '
+                         'operations. This will automatically set logging to '
+                         'verbose.')
+  parser.add_option('-c', '--clean-only', action='store_true', default=False,
+                    help='Only clean previously created links, don\'t create '
+                         'new ones. This will automatically set logging to '
+                         'verbose.')
+  parser.add_option('-f', '--force', action='store_true', default=on_bot,
+                    help='Force link creation. CAUTION: This deletes existing '
+                         'folders and files in the locations where links are '
+                         'about to be created.')
+  parser.add_option('-n', '--no-prompt', action='store_false', dest='prompt',
+                    default=(not on_bot),
+                    help='Prompt if we\'re planning to do a dangerous action')
+  parser.add_option('-v', '--verbose', action='store_const',
+                    const=logging.DEBUG, default=logging.INFO,
+                    help='Print verbose output for debugging.')
+  options, _ = parser.parse_args()
+  # --dry-run, --force and --clean-only all imply debug-level logging.
+  if options.dry_run or options.force or options.clean_only:
+    options.verbose = logging.DEBUG
+  logging.basicConfig(format='%(message)s', level=options.verbose)
+
+  # Work from the root directory of the checkout.
+  script_dir = os.path.dirname(os.path.abspath(__file__))
+  os.chdir(script_dir)
+
+  if sys.platform.startswith('win'):
+    def is_admin():
+      try:
+        return os.getuid() == 0
+      except AttributeError:  # No os.getuid here; ask the Windows shell API.
+        return ctypes.windll.shell32.IsUserAnAdmin() != 0
+    if not is_admin():
+      logging.error('On Windows, you now need to have administrator '
+                    'privileges for the shell running %s (or '
+                    '`gclient sync|runhooks`).\nPlease start another command '
+                    'prompt as Administrator and try again.', sys.argv[0])
+      return 1
+  # Exit codes: 1 = not admin (Windows), 2 = no checkout, 3 = LinkError.
+  if not os.path.exists(CHROMIUM_CHECKOUT):
+    logging.error('Cannot find a Chromium checkout at %s. Did you run "gclient '
+                  'sync" before running this script?', CHROMIUM_CHECKOUT)
+    return 2
+
+  links_database = _initialize_database(LINKS_DB)
+  try:
+    symlink_creator = WebRTCLinkSetup(links_database, options.force,
+                                      options.dry_run, options.prompt)
+    symlink_creator.CleanupLinks()  # Always runs, even without --clean-only.
+    if not options.clean_only:
+      symlink_creator.CreateLinks(on_bot)
+  except LinkError as e:
+    print >> sys.stderr, e.message
+    return 3
+  finally:
+    links_database.close()  # Closed on every path out of the try block.
+  return 0
+
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/libs/libyuv/source/compare.cc b/libs/libyuv/source/compare.cc
new file mode 100644
index 0000000000..e3846bdfdd
--- /dev/null
+++ b/libs/libyuv/source/compare.cc
@@ -0,0 +1,340 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/compare.h"
+
+#include <float.h>
+#include <math.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare_row.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Hash |count| bytes of |src|, starting from |seed|.
+// hash seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+  const int kBlockSize = 1 << 15;  // 32768;
+  int tail;
+  uint32 (*hash_row)(const uint8* src, int count, uint32 seed) =
+      HashDjb2_C;
+#if defined(HAS_HASHDJB2_SSE41)
+  if (TestCpuFlag(kCpuHasSSE41)) {
+    hash_row = HashDjb2_SSE41;
+  }
+#endif
+#if defined(HAS_HASHDJB2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    hash_row = HashDjb2_AVX2;
+  }
+#endif
+  // Full blocks first; the row functions take an int sized count.
+  for (; count >= (uint64)(kBlockSize); count -= kBlockSize) {
+    seed = hash_row(src, kBlockSize, seed);
+    src += kBlockSize;
+  }
+  tail = (int)(count) & ~15;  // Largest multiple of 16 still left.
+  if (tail != 0) {
+    seed = hash_row(src, tail, seed);
+    src += tail;
+    count -= tail;
+  }
+  tail = (int)(count) & 15;  // Last 0..15 bytes always use the C version.
+  if (tail == 0) {
+    return seed;
+  }
+  return HashDjb2_C(src, tail, seed);
+}
+
+// Returns a fourcc at the first byte 0 / byte 3 that is not 255, else 0.
+static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+  int x;
+  int p;
+  // Walk the row two pixels at a time.
+  for (x = 0; x < width - 1; x += 2) {
+    for (p = 0; p < 2; ++p) {
+      if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
+        return FOURCC_BGRA;
+      }
+      if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
+        return FOURCC_ARGB;
+      }
+      argb += 4;
+    }
+  }
+  // Odd width: one more pixel to inspect.
+  if (width & 1) {
+    if (argb[0] != 255) {
+      return FOURCC_BGRA;
+    }
+    if (argb[3] != 255) {
+      return FOURCC_ARGB;
+    }
+  }
+  return 0;  // Both candidate bytes were 255 in every pixel: undecided.
+}
+
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
+  uint32 detected = 0;
+  int row = 0;
+  // Rows that are contiguous in memory are scanned as one long row.
+  if (stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    stride_argb = 0;
+  }
+  while (row < height && detected == 0) {
+    detected = ARGBDetectRow_C(argb, width);
+    argb += stride_argb;
+    ++row;
+  }
+  return detected;
+}
+
+// TODO(fbarchard): Refactor into row function.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
+                             int count) {
+  // Each squared byte difference is in the range 0 to 65025 (255 * 255).
+  // Up to 65536 of those can be summed and remain within a uint32.
+  // After each block of 65536 pixels, accumulate into a uint64.
+  const int kBlockSize = 65536;
+  int remainder = count & (kBlockSize - 1) & ~31;  // Tail, rounded down to 32.
+  uint64 sse = 0;
+  int i;
+  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+      SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SumSquareError = SumSquareError_NEON;
+  }
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    // Note only used for multiples of 16 so count is not checked.
+    SumSquareError = SumSquareError_SSE2;
+  }
+#endif
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    // Note only used for multiples of 32 so count is not checked.
+    SumSquareError = SumSquareError_AVX2;
+  }
+#endif
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {  // Full blocks.
+    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+  }
+  src_a += count & ~(kBlockSize - 1);  // Advance past the full blocks.
+  src_b += count & ~(kBlockSize - 1);
+  if (remainder) {
+    sse += SumSquareError(src_a, src_b, remainder);
+    src_a += remainder;
+    src_b += remainder;
+  }
+  remainder = count & 31;  // Last 0..31 bytes always use the C version.
+  if (remainder) {
+    sse += SumSquareError_C(src_a, src_b, remainder);
+  }
+  return sse;
+}
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+                                  const uint8* src_b, int stride_b,
+                                  int width, int height) {
+  uint64 total = 0;
+  int row;
+  // When both planes are contiguous, treat them as one long row.
+  if (stride_a == width && stride_b == width) {
+    width *= height;
+    height = 1;
+    stride_a = 0;
+    stride_b = 0;
+  }
+  for (row = height; row > 0; --row) {
+    total += ComputeSumSquareError(src_a, src_b, width);
+    src_a += stride_a;
+    src_b += stride_b;
+  }
+  return total;
+}
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+  // 10 * log10(255^2 * count / sse), never above kMaxPsnr.
+  double result = kMaxPsnr;  // Also the answer for sse == 0 (no divide by 0).
+
+  if (sse > 0) {
+    const double inv_mse = (double)(count) / (double)(sse);
+    const double db = 10.0 * log10(255.0 * 255.0 * inv_mse);
+    if (!(db > kMaxPsnr)) {  // Only values above the cap are replaced.
+      result = db;
+    }
+  }
+
+  return result;
+}
+
+LIBYUV_API  // PSNR of one width x height plane of |src_a| against |src_b|.
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height) {
+  // Widen before multiplying so the sample count cannot overflow int.
+  const uint64 samples = (uint64)(width) * (uint64)(height);
+  const uint64 sse = ComputeSumSquareErrorPlane(
+      src_a, stride_a, src_b, stride_b, width, height);
+  return SumSquareErrorToPsnr(sse, samples);
+}
+
+LIBYUV_API  // Combined PSNR over the Y, U and V planes of two I420 frames.
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height) {
+  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
+                                                  src_y_b, stride_y_b,
+                                                  width, height);
+  // Chroma planes are half size in each dimension, rounded up.
+  const int width_uv = (width + 1) >> 1;
+  const int height_uv = (height + 1) >> 1;
+  const uint64 sse_u = ComputeSumSquareErrorPlane(
+      src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv);
+  const uint64 sse_v = ComputeSumSquareErrorPlane(
+      src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv);
+  // Widen before multiplying so large frames cannot overflow int.
+  const uint64 samples = (uint64)(width) * (uint64)(height) +
+      2 * ((uint64)(width_uv) * (uint64)(height_uv));
+  return SumSquareErrorToPsnr(sse_y + sse_u + sse_v, samples);
+}
+
+static const int64 cc1 =  26634;  // 64^2 * (.01 * 255)^2
+static const int64 cc2 = 239708;  // 64^2 * (.03 * 255)^2
+
+// Scores one 8x8 window of |src_a| against the same window of |src_b|.
+// Returns DBL_MAX when the denominator of the ratio is zero.
+static double Ssim8x8_C(const uint8* src_a, int stride_a,
+                        const uint8* src_b, int stride_b) {
+  // Raw sums over the 64 pixels of each window.
+  int64 sum_a = 0, sum_b = 0;
+  int64 sum_sq_a = 0, sum_sq_b = 0;
+  int64 sum_axb = 0;
+  int row;
+  int col;
+
+  for (row = 0; row < 8; ++row) {
+    for (col = 0; col < 8; ++col) {
+      const int a = src_a[col];
+      const int b = src_b[col];
+      sum_a += a;
+      sum_b += b;
+      sum_sq_a += a * a;
+      sum_sq_b += b * b;
+      sum_axb += a * b;
+    }
+    src_a += stride_a;
+    src_b += stride_b;
+  }
+
+  {
+    const int64 count = 64;
+    // scale the constants by number of pixels
+    const int64 c1 = (cc1 * count * count) >> 12;
+    const int64 c2 = (cc2 * count * count) >> 12;
+    const int64 cross = sum_a * sum_b;
+    const int64 sq_a = sum_a * sum_a;
+    const int64 sq_b = sum_b * sum_b;
+    // Numerator of the ratio.
+    const int64 ssim_n = (2 * cross + c1) *
+                         (2 * count * sum_axb - 2 * cross + c2);
+    // Denominator of the ratio.
+    const int64 ssim_d = (sq_a + sq_b + c1) *
+                         (count * sum_sq_a - sq_a +
+                          count * sum_sq_b - sq_b + c2);
+
+    if (ssim_d == 0.0) {
+      return DBL_MAX;
+    }
+    return ssim_n * 1.0 / ssim_d;
+  }
+}
+
+// We are using a 8x8 moving window with starting location of each 8x8 window
+// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
+// block boundaries to penalize blocking artifacts.
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height) {
+  double (*window_ssim)(const uint8* src_a, int stride_a,
+                        const uint8* src_b, int stride_b) = Ssim8x8_C;
+  double sum = 0;
+  int windows = 0;
+  int y;
+  int x;
+
+  // sample point start with each 4x4 location
+  for (y = 0; y < height - 8; y += 4) {
+    for (x = 0; x < width - 8; x += 4) {
+      sum += window_ssim(src_a + x, stride_a, src_b + x, stride_b);
+      ++windows;
+    }
+    src_a += stride_a * 4;
+    src_b += stride_b * 4;
+  }
+
+  // NOTE(review): windows stays 0 when width or height is <= 8, so the
+  // division below is then 0.0 / 0 -- confirm callers never pass such sizes.
+  return sum / windows;
+}
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height) {
+  // Chroma planes are half size in each dimension, rounded up.
+  const int chroma_w = (width + 1) >> 1;
+  const int chroma_h = (height + 1) >> 1;
+  const double y_score = CalcFrameSsim(src_y_a, stride_y_a,
+                                       src_y_b, stride_y_b, width, height);
+  const double u_score = CalcFrameSsim(
+      src_u_a, stride_u_a, src_u_b, stride_u_b, chroma_w, chroma_h);
+  const double v_score = CalcFrameSsim(
+      src_v_a, stride_v_a, src_v_b, stride_v_b, chroma_w, chroma_h);
+  // The Y score is weighted 0.8, the U and V scores 0.1 each.
+  return y_score * 0.8 + 0.1 * (u_score + v_score);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/compare_common.cc b/libs/libyuv/source/compare_common.cc
new file mode 100644
index 0000000000..42fc589354
--- /dev/null
+++ b/libs/libyuv/source/compare_common.cc
@@ -0,0 +1,44 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Sum of squared byte differences over |count| bytes (wraps in uint32).
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 acc = 0u;
+  for (; count > 0; --count, ++src_a, ++src_b) {
+    const int delta = *src_a - *src_b;
+    acc += (uint32)(delta * delta);
+  }
+  return acc;
+}
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+  const uint8* p = src;
+  // hash = hash * 33 + byte, wrapping modulo 2^32.
+  for (; count > 0; --count) {
+    seed = seed * 33u + *p++;
+  }
+  return seed;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/compare_gcc.cc b/libs/libyuv/source/compare_gcc.cc
new file mode 100644
index 0000000000..1b83edb166
--- /dev/null
+++ b/libs/libyuv/source/compare_gcc.cc
@@ -0,0 +1,151 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 sse;  // Receives the low dword of xmm0 after the final reduction.
+  asm volatile (
+    "pxor      %%xmm0,%%xmm0                   \n"  // xmm0 = running sum
+    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0 for unpck
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"  // 16 bytes of src_a
+    "lea       " MEMLEA(0x10, 0) ",%0          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"  // 16 bytes of src_b
+    "lea       " MEMLEA(0x10, 1) ",%1          \n"
+    "movdqa    %%xmm1,%%xmm3                   \n"  // abs trick
+    "psubusb   %%xmm2,%%xmm1                   \n"
+    "psubusb   %%xmm3,%%xmm2                   \n"
+    "por       %%xmm2,%%xmm1                   \n"  // |a - b| per byte
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"  // widen bytes to words
+    "punpckhbw %%xmm5,%%xmm2                   \n"
+    "pmaddwd   %%xmm1,%%xmm1                   \n"  // square + pairwise add
+    "pmaddwd   %%xmm2,%%xmm2                   \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "paddd     %%xmm2,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"  // 16 bytes per iteration
+    "jg        1b                              \n"
+
+    "pshufd    $0xee,%%xmm0,%%xmm1             \n"  // fold upper 2 dwords
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "pshufd    $0x1,%%xmm0,%%xmm1              \n"  // fold dword 1 into 0
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "movd      %%xmm0,%3                       \n"
+
+  : "+r"(src_a),      // %0
+    "+r"(src_b),      // %1
+    "+r"(count),      // %2
+    "=g"(sse)         // %3
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+  return sse;
+}
+
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16 (mod 2^32)
+static uvec32 kHashMul0 = {  // Powers of 33 below are stored mod 2^32.
+  0x0c3525e1,  // 33 ^ 15
+  0xa3476dc1,  // 33 ^ 14
+  0x3b4039a1,  // 33 ^ 13
+  0x4f5f0981,  // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+  0x30f35d61,  // 33 ^ 11
+  0x855cb541,  // 33 ^ 10
+  0x040a9121,  // 33 ^ 9
+  0x747c7101,  // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+  0xec41d4e1,  // 33 ^ 7
+  0x4cfa3cc1,  // 33 ^ 6
+  0x025528a1,  // 33 ^ 5
+  0x00121881,  // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+  0x00008c61,  // 33 ^ 3
+  0x00000441,  // 33 ^ 2
+  0x00000021,  // 33 ^ 1
+  0x00000001,  // 33 ^ 0
+};
+
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+  uint32 hash;
+  asm volatile (
+    "movd      %2,%%xmm0                       \n"  // seed
+    "pxor      %%xmm7,%%xmm7                   \n"  // constant 0 for unpck
+    "movdqa    %4,%%xmm6                       \n"  // kHash16x33
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"  // src[0-15]
+    "lea       " MEMLEA(0x10, 0) ",%0          \n"
+    "pmulld    %%xmm6,%%xmm0                   \n"  // hash *= 33 ^ 16
+    "movdqa    %5,%%xmm5                       \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm7,%%xmm2                   \n"  // src[0-7]
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklwd %%xmm7,%%xmm3                   \n"  // src[0-3]
+    "pmulld    %%xmm5,%%xmm3                   \n"
+    "movdqa    %6,%%xmm5                       \n"
+    "movdqa    %%xmm2,%%xmm4                   \n"
+    "punpckhwd %%xmm7,%%xmm4                   \n"  // src[4-7]
+    "pmulld    %%xmm5,%%xmm4                   \n"
+    "movdqa    %7,%%xmm5                       \n"
+    "punpckhbw %%xmm7,%%xmm1                   \n"  // src[8-15]
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklwd %%xmm7,%%xmm2                   \n"  // src[8-11]
+    "pmulld    %%xmm5,%%xmm2                   \n"
+    "movdqa    %8,%%xmm5                       \n"
+    "punpckhwd %%xmm7,%%xmm1                   \n"  // src[12-15]
+    "pmulld    %%xmm5,%%xmm1                   \n"
+    "paddd     %%xmm4,%%xmm3                   \n"  // add 16 results
+    "paddd     %%xmm2,%%xmm1                   \n"
+    "paddd     %%xmm3,%%xmm1                   \n"
+    "pshufd    $0xe,%%xmm1,%%xmm2              \n"  // upper 2 dwords
+    "paddd     %%xmm2,%%xmm1                   \n"
+    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
+    "paddd     %%xmm2,%%xmm1                   \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%1                        \n"  // 16 bytes per iteration
+    "jg        1b                              \n"
+    "movd      %%xmm0,%3                       \n"  // return hash
+  : "+r"(src),        // %0
+    "+r"(count),      // %1
+    "+rm"(seed),      // %2
+    "=g"(hash)        // %3
+  : "m"(kHash16x33),  // %4
+    "m"(kHashMul0),   // %5
+    "m"(kHashMul1),   // %6
+    "m"(kHashMul2),   // %7
+    "m"(kHashMul3)    // %8
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+  return hash;
+}
+#endif  // !LIBYUV_DISABLE_X86 && (__x86_64__ || (__i386__ && !_MSC_VER))
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/libs/libyuv/source/compare_neon.cc b/libs/libyuv/source/compare_neon.cc
new file mode 100644
index 0000000000..49aa3b4eef
--- /dev/null
+++ b/libs/libyuv/source/compare_neon.cc
@@ -0,0 +1,66 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "vmov.u8    q8, #0                         \n"  // q8-q11: accumulators
+    "vmov.u8    q10, #0                        \n"
+    "vmov.u8    q9, #0                         \n"
+    "vmov.u8    q11, #0                        \n"
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // 16 bytes of src_a
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"  // 16 bytes of src_b
+    "subs       %2, %2, #16                    \n"  // 16 bytes per iteration
+    "vsubl.u8   q2, d0, d2                     \n"  // widening a - b
+    "vsubl.u8   q3, d1, d3                     \n"
+    "vmlal.s16  q8, d4, d4                     \n"  // accumulate squares
+    "vmlal.s16  q9, d6, d6                     \n"
+    "vmlal.s16  q10, d5, d5                    \n"
+    "vmlal.s16  q11, d7, d7                    \n"
+    "bgt        1b                             \n"
+
+    "vadd.u32   q8, q8, q9                     \n"  // fold the accumulators
+    "vadd.u32   q10, q10, q11                  \n"
+    "vadd.u32   q11, q8, q10                   \n"
+    "vpaddl.u32 q1, q11                        \n"
+    "vadd.u64   d0, d2, d3                     \n"
+    "vmov.32    %3, d0[0]                      \n"  // low 32 bits -> sse
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+  return sse;
+}
+
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/compare_neon64.cc b/libs/libyuv/source/compare_neon64.cc
new file mode 100644
index 0000000000..f9c7df98c8
--- /dev/null
+++ b/libs/libyuv/source/compare_neon64.cc
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "eor        v16.16b, v16.16b, v16.16b      \n"  // v16-v19: accumulators
+    "eor        v18.16b, v18.16b, v18.16b      \n"
+    "eor        v17.16b, v17.16b, v17.16b      \n"
+    "eor        v19.16b, v19.16b, v19.16b      \n"
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // 16 bytes of src_a
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"  // 16 bytes of src_b
+    "subs       %w2, %w2, #16                  \n"  // 16 bytes per iteration
+    "usubl      v2.8h, v0.8b, v1.8b            \n"  // widening a - b
+    "usubl2     v3.8h, v0.16b, v1.16b          \n"
+    "smlal      v16.4s, v2.4h, v2.4h           \n"  // accumulate squares
+    "smlal      v17.4s, v3.4h, v3.4h           \n"
+    "smlal2     v18.4s, v2.8h, v2.8h           \n"
+    "smlal2     v19.4s, v3.8h, v3.8h           \n"
+    "b.gt       1b                             \n"
+
+    "add        v16.4s, v16.4s, v17.4s         \n"  // fold the accumulators
+    "add        v18.4s, v18.4s, v19.4s         \n"
+    "add        v19.4s, v16.4s, v18.4s         \n"
+    "addv       s0, v19.4s                     \n"
+    "fmov       %w3, s0                        \n"  // 32-bit sum -> sse
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :  // The asm reads src_a/src_b via pointers, hence the "memory" clobber.
+    : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+  return sse;
+}
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/compare_win.cc b/libs/libyuv/source/compare_win.cc
new file mode 100644
index 0000000000..dc86fe25b1
--- /dev/null
+++ b/libs/libyuv/source/compare_win.cc
@@ -0,0 +1,222 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+__declspec(naked)
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+  __asm {
+    mov        eax, [esp + 4]    // src_a
+    mov        edx, [esp + 8]    // src_b
+    mov        ecx, [esp + 12]   // count
+    pxor       xmm0, xmm0        // sum
+    pxor       xmm5, xmm5        // constant 0 for unpck
+
+  wloop:
+    movdqu     xmm1, [eax]
+    lea        eax,  [eax + 16]
+    movdqu     xmm2, [edx]
+    lea        edx,  [edx + 16]
+    movdqa     xmm3, xmm1  // abs trick
+    psubusb    xmm1, xmm2
+    psubusb    xmm2, xmm3
+    por        xmm1, xmm2
+    movdqa     xmm2, xmm1
+    punpcklbw  xmm1, xmm5  // widen bytes to words
+    punpckhbw  xmm2, xmm5
+    pmaddwd    xmm1, xmm1  // square + hadd to u32.
+    pmaddwd    xmm2, xmm2
+    paddd      xmm0, xmm1
+    paddd      xmm0, xmm2
+    sub        ecx, 16     // 16 bytes per iteration
+    jg         wloop
+
+    pshufd     xmm1, xmm0, 0xee  // 3, 2 + 1, 0
+    paddd      xmm0, xmm1
+    pshufd     xmm1, xmm0, 0x01  // 1 + 0
+    paddd      xmm0, xmm1
+    movd       eax, xmm0         // return sum
+    ret
+  }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
+#pragma warning(disable: 4752)
+__declspec(naked)
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+  __asm {
+    mov        eax, [esp + 4]    // src_a
+    mov        edx, [esp + 8]    // src_b
+    mov        ecx, [esp + 12]   // count
+    vpxor      ymm0, ymm0, ymm0  // sum
+    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
+    sub        edx, eax          // edx = src_b - src_a, so [eax + edx] is src_b
+
+  wloop:
+    vmovdqu    ymm1, [eax]
+    vmovdqu    ymm2, [eax + edx]
+    lea        eax,  [eax + 32]
+    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
+    vpsubusb   ymm2, ymm2, ymm1
+    vpor       ymm1, ymm2, ymm3
+    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
+    vpunpckhbw ymm1, ymm1, ymm5
+    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
+    vpmaddwd   ymm1, ymm1, ymm1
+    vpaddd     ymm0, ymm0, ymm1
+    vpaddd     ymm0, ymm0, ymm2
+    sub        ecx, 32           // 32 bytes per iteration
+    jg         wloop
+
+    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
+    vpaddd     ymm0, ymm0, ymm1
+    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
+    vpaddd     ymm0, ymm0, ymm1
+    vpermq     ymm1, ymm0, 0x02  // high + low lane.
+    vpaddd     ymm0, ymm0, ymm1
+    vmovd      eax, xmm0         // return sum
+    vzeroupper
+    ret
+  }
+}
+#endif  // _MSC_VER >= 1700
+
+uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16 (mod 2^32)
+uvec32 kHashMul0 = {  // Powers of 33 below are stored mod 2^32.
+  0x0c3525e1,  // 33 ^ 15
+  0xa3476dc1,  // 33 ^ 14
+  0x3b4039a1,  // 33 ^ 13
+  0x4f5f0981,  // 33 ^ 12
+};
+uvec32 kHashMul1 = {
+  0x30f35d61,  // 33 ^ 11
+  0x855cb541,  // 33 ^ 10
+  0x040a9121,  // 33 ^ 9
+  0x747c7101,  // 33 ^ 8
+};
+uvec32 kHashMul2 = {
+  0xec41d4e1,  // 33 ^ 7
+  0x4cfa3cc1,  // 33 ^ 6
+  0x025528a1,  // 33 ^ 5
+  0x00121881,  // 33 ^ 4
+};
+uvec32 kHashMul3 = {
+  0x00008c61,  // 33 ^ 3
+  0x00000441,  // 33 ^ 2
+  0x00000021,  // 33 ^ 1
+  0x00000001,  // 33 ^ 0
+};
+
+__declspec(naked)
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+  __asm {
+    mov        eax, [esp + 4]    // src
+    mov        ecx, [esp + 8]    // count
+    movd       xmm0, [esp + 12]  // seed
+
+    pxor       xmm7, xmm7        // constant 0 for unpck
+    movdqa     xmm6, xmmword ptr kHash16x33
+
+  wloop:
+    movdqu     xmm1, [eax]       // src[0-15]
+    lea        eax, [eax + 16]
+    pmulld     xmm0, xmm6        // hash *= 33 ^ 16
+    movdqa     xmm5, xmmword ptr kHashMul0
+    movdqa     xmm2, xmm1
+    punpcklbw  xmm2, xmm7        // src[0-7]
+    movdqa     xmm3, xmm2
+    punpcklwd  xmm3, xmm7        // src[0-3]
+    pmulld     xmm3, xmm5
+    movdqa     xmm5, xmmword ptr kHashMul1
+    movdqa     xmm4, xmm2
+    punpckhwd  xmm4, xmm7        // src[4-7]
+    pmulld     xmm4, xmm5
+    movdqa     xmm5, xmmword ptr kHashMul2
+    punpckhbw  xmm1, xmm7        // src[8-15]
+    movdqa     xmm2, xmm1
+    punpcklwd  xmm2, xmm7        // src[8-11]
+    pmulld     xmm2, xmm5
+    movdqa     xmm5, xmmword ptr kHashMul3
+    punpckhwd  xmm1, xmm7        // src[12-15]
+    pmulld     xmm1, xmm5
+    paddd      xmm3, xmm4        // add 16 results
+    paddd      xmm1, xmm2
+    paddd      xmm1, xmm3
+
+    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
+    paddd      xmm1, xmm2
+    pshufd     xmm2, xmm1, 0x01  // dword 1 onto dword 0
+    paddd      xmm1, xmm2
+    paddd      xmm0, xmm1        // hash += the 16 weighted bytes
+    sub        ecx, 16           // 16 bytes per iteration
+    jg         wloop
+
+    movd       eax, xmm0         // return hash
+    ret
+  }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+__declspec(naked)
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+  __asm {
+    mov        eax, [esp + 4]    // src
+    mov        ecx, [esp + 8]    // count
+    vmovd      xmm0, [esp + 12]  // seed
+
+  wloop:
+    vpmovzxbd  xmm3, [eax]  // src[0-3]
+    vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16
+    vpmovzxbd  xmm4, [eax + 4]  // src[4-7]
+    vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
+    vpmovzxbd  xmm2, [eax + 8]  // src[8-11]
+    vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
+    vpmovzxbd  xmm1, [eax + 12]  // src[12-15]
+    vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
+    lea        eax, [eax + 16]
+    vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
+    vpaddd     xmm3, xmm3, xmm4        // add 16 results
+    vpaddd     xmm1, xmm1, xmm2
+    vpaddd     xmm1, xmm1, xmm3
+    vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
+    vpaddd     xmm1, xmm1,xmm2
+    vpshufd    xmm2, xmm1, 0x01  // dword 1 onto dword 0
+    vpaddd     xmm1, xmm1, xmm2
+    vpaddd     xmm0, xmm0, xmm1  // hash += the 16 weighted bytes
+    sub        ecx, 16           // 16 bytes per iteration
+    jg         wloop
+
+    vmovd      eax, xmm0         // return hash
+    vzeroupper
+    ret
+  }
+}
+#endif  // _MSC_VER >= 1700
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/convert.cc b/libs/libyuv/source/convert.cc
new file mode 100644
index 0000000000..e332bc505c
--- /dev/null
+++ b/libs/libyuv/source/convert.cc
@@ -0,0 +1,1389 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// sign(v) * ((|v| + a) >> s); fully parenthesized. Evaluates v twice.
+#define SUBSAMPLE(v, a, s) \
+  (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))
+static __inline int Abs(int v) { return v >= 0 ? v : -v; }  // |v|
+
+// Shared I4xx to I420 path: rescales every plane to I420 geometry.
+// Output luma is |src_y_width| x |src_y_height|; output chroma is half of
+// that, rounded up. Source sizes keep their sign when passed to ScalePlane.
+// Returns 0 on success, or -1 if any source dimension is zero.
+static int I4xxToI420(const uint8* src_y, int src_stride_y,
+                      const uint8* src_u, int src_stride_u,
+                      const uint8* src_v, int src_stride_v,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int src_y_width, int src_y_height,
+                      int src_uv_width, int src_uv_height) {
+  const int luma_w = Abs(src_y_width);
+  const int luma_h = Abs(src_y_height);
+  const int chroma_w = SUBSAMPLE(luma_w, 1, 1);
+  const int chroma_h = SUBSAMPLE(luma_h, 1, 1);
+  if (!(src_y_width && src_y_height && src_uv_width && src_uv_height)) {
+    return -1;
+  }
+  // All three planes are scaled with bilinear filtering.
+  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+             dst_y, dst_stride_y, luma_w, luma_h, kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+             dst_u, dst_stride_u, chroma_w, chroma_h, kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+             dst_v, dst_stride_v, chroma_w, chroma_h, kFilterBilinear);
+  return 0;
+}
+
+// Copy an I420 image plane by plane; a negative height flips it vertically.
+// Returns 0 on success, or -1 for a NULL plane, width <= 0 or height == 0.
+// TODO(fbarchard): Use ScalePlane, which supports mirroring, but ensure it
+// does row coalescing.
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  const int uv_width = (width + 1) >> 1;
+  int uv_height = (height + 1) >> 1;
+  if (!(src_y && src_u && src_v && dst_y && dst_u && dst_v) ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: start at the last row of each plane, step backwards.
+    height = -height;
+    uv_height = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_u += (uv_height - 1) * src_stride_u;
+    src_v += (uv_height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  // dst_y is already known to be non-NULL here.
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  // Chroma planes are half size in each direction, rounded up.
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, uv_width, uv_height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, uv_width, uv_height);
+  return 0;
+}
+
+// Convert I422 to I420.
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+// so chroma keeps its width and halves in height.
+// Delegates to I4xxToI420 and returns its result.
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Source chroma plane: SUBSAMPLE(width, 1, 1) wide, full luma height.
+  const int chroma_width = SUBSAMPLE(width, 1, 1);
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u,
+                    src_v, src_stride_v, dst_y, dst_stride_y,
+                    dst_u, dst_stride_u, dst_v, dst_stride_v,
+                    width, height, chroma_width, height);
+}
+
+// Convert I444 to I420.
+// 444 chroma is 1x width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+// so chroma halves in both width and height.
+// Delegates to I4xxToI420 and returns its result.
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // The source chroma planes have the same dimensions as the luma plane.
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u,
+                    src_v, src_stride_v, dst_y, dst_stride_y,
+                    dst_u, dst_stride_u, dst_v, dst_stride_v,
+                    width, height, width, height);
+}
+
+// Convert I411 to I420.
+// 411 chroma is 1/4 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+// so chroma doubles in width and halves in height.
+// Delegates to I4xxToI420 and returns its result.
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Source chroma plane: SUBSAMPLE(width, 3, 2) wide, full luma height.
+  const int chroma_width = SUBSAMPLE(width, 3, 2);
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u,
+                    src_v, src_stride_v, dst_y, dst_stride_y,
+                    dst_u, dst_stride_u, dst_v, dst_stride_v,
+                    width, height, chroma_width, height);
+}
+
+// Convert I400 (greyscale, typically used in MJPG) to I420: copies Y and
+// fills U and V with 128. Negative height flips. Returns 0, or -1 on bad args.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int uv_width = (width + 1) >> 1;
+  int uv_height = (height + 1) >> 1;
+  if (!(src_y && dst_y && dst_u && dst_v) || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: read the source from its last row backwards.
+    height = -height;
+    uv_height = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  SetPlane(dst_u, dst_stride_u, uv_width, uv_height, 128);
+  SetPlane(dst_v, dst_stride_v, uv_width, uv_height, 128);
+  return 0;
+}
+
+// Copy a plane whose source advances by src_stride_0 after each even row
+// and by src_stride_1 after each odd row; dst uses a single stride.
+static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
+                       uint8* dst, int dst_stride,
+                       int width, int height) {
+  int y;
+  void (*copy_row)(const uint8*, uint8*, int) = CopyRow_C;
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    copy_row = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    copy_row = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    copy_row = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    copy_row = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    copy_row = CopyRow_MIPS;
+  }
+#endif
+  for (y = height; y > 1; y -= 2) {  // One pair of rows per pass.
+    copy_row(src, dst, width);
+    copy_row(src + src_stride_0, dst + dst_stride, width);
+    src += src_stride_0 + src_stride_1;
+    dst += dst_stride * 2;
+  }
+  if (height & 1) {
+    copy_row(src, dst, width);
+  }
+}
+
+// Shared NV12/NV21/M420 to I420 path: copies Y and splits UV into U and V.
+// M420 is useful for bandwidth constrained transports like USB 1.0 and 2.0
+// and for easy conversion to I420.
+// M420 format description:
+// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
+// Chroma is half width / half height. (420)
+// src_stride_m420 is row planar. Normally this will be the width in pixels.
+//   The UV plane is half width, but 2 values, so src_stride_m420 applies to
+//   this as well as the two Y planes.
+static int X420ToI420(const uint8* src_y,
+                      int src_stride_y0, int src_stride_y1,
+                      const uint8* src_uv, int src_stride_uv,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) = SplitUVRow_C;
+  if (!src_y || !src_uv ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;  // Missing plane or empty image.
+  }
+  // Negative height means invert the image; here the destination is flipped.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+  // Coalesce rows: contiguous Y rows are handled as one long row.
+  if (src_stride_y0 == width &&
+      src_stride_y1 == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
+  }
+  // Coalesce rows: contiguous UV rows are handled as one long row.
+  if (src_stride_uv == halfwidth * 2 &&
+      dst_stride_u == halfwidth &&
+      dst_stride_v == halfwidth) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_uv = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif  // HAS_SPLITUVROW_SSE2
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif  // HAS_SPLITUVROW_AVX2
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif  // HAS_SPLITUVROW_NEON
+#if defined(HAS_SPLITUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
+      IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
+      IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
+    SplitUVRow = SplitUVRow_Any_DSPR2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      SplitUVRow = SplitUVRow_DSPR2;
+    }
+  }
+#endif  // HAS_SPLITUVROW_DSPR2
+
+  if (dst_y) {  // Always true here: dst_y was validated above.
+    if (src_stride_y0 == src_stride_y1) {
+      CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
+    } else {  // Y rows alternate between two strides.
+      CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
+                 width, height);
+    }
+  }
+
+  for (y = 0; y < halfheight; ++y) {
+    // Split one row of UV into one row of U and one row of V.
+    SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+    src_uv += src_stride_uv;
+  }
+  return 0;
+}
+
+// Convert NV12 to I420.
+// Takes a Y plane and a combined UV plane.
+// Thin wrapper: forwards to X420ToI420 and returns its result.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Every Y row uses src_stride_y, so both Y strides are the same.
+  return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv,
+                    dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height);
+}
+
+// Convert NV21 to I420.  Same as NV12 but u and v pointers swapped.
+// Takes a Y plane and a combined VU plane.
+// Thin wrapper: forwards to X420ToI420 and returns its result.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // dst_v and dst_u are passed in each other's position.
+  return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu,
+                    dst_y, dst_stride_y, dst_v, dst_stride_v,
+                    dst_u, dst_stride_u, width, height);
+}
+
+// Convert M420 to I420.
+// Each 3-row group is 2 rows of Y then 1 row of UV, src_stride_m420 apart.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const uint8* src_uv = src_m420 + src_stride_m420 * 2;
+  // Y strides alternate: +1 row inside a pair, +2 rows to skip the UV row.
+  return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
+                    src_uv, src_stride_m420 * 3, dst_y, dst_stride_y,
+                    dst_u, dst_stride_u, dst_v, dst_stride_v, width, height);
+}
+
+// Convert YUY2 to I420. A negative height flips the image vertically.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or
+// height == 0 (the same checks the other *ToI420 converters make).
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
+      uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C;
+  void (*YUY2ToYRow)(const uint8* src_yuy2,
+      uint8* dst_y, int width) = YUY2ToYRow_C;
+  // Validate first so no row function sees a NULL plane or a bad width.
+  if (!src_yuy2 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    YUY2ToUVRow = aligned ? YUY2ToUVRow_SSE2 : YUY2ToUVRow_Any_SSE2;
+    YUY2ToYRow = aligned ? YUY2ToYRow_SSE2 : YUY2ToYRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    const int aligned = IS_ALIGNED(width, 32);
+    YUY2ToUVRow = aligned ? YUY2ToUVRow_AVX2 : YUY2ToUVRow_Any_AVX2;
+    YUY2ToYRow = aligned ? YUY2ToYRow_AVX2 : YUY2ToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    YUY2ToYRow = aligned ? YUY2ToYRow_NEON : YUY2ToYRow_Any_NEON;
+    YUY2ToUVRow = aligned ? YUY2ToUVRow_NEON : YUY2ToUVRow_Any_NEON;
+  }
+#endif
+
+  // Each pass consumes two YUY2 rows: one U/V row and two Y rows.
+  for (y = 0; y < height - 1; y += 2) {
+    YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+    src_yuy2 += src_stride_yuy2 * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    // Odd height: the last row has no partner, so UV is taken with stride 0.
+    YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
+    YUY2ToYRow(src_yuy2, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert UYVY to I420. A negative height flips the image vertically.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or
+// height == 0 (the same checks the other *ToI420 converters make).
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
+      uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C;
+  void (*UYVYToYRow)(const uint8* src_uyvy,
+      uint8* dst_y, int width) = UYVYToYRow_C;
+  // Validate first so no row function sees a NULL plane or a bad width.
+  if (!src_uyvy || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+#if defined(HAS_UYVYTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    UYVYToUVRow = aligned ? UYVYToUVRow_SSE2 : UYVYToUVRow_Any_SSE2;
+    UYVYToYRow = aligned ? UYVYToYRow_SSE2 : UYVYToYRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    const int aligned = IS_ALIGNED(width, 32);
+    UYVYToUVRow = aligned ? UYVYToUVRow_AVX2 : UYVYToUVRow_Any_AVX2;
+    UYVYToYRow = aligned ? UYVYToYRow_AVX2 : UYVYToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    UYVYToYRow = aligned ? UYVYToYRow_NEON : UYVYToYRow_Any_NEON;
+    UYVYToUVRow = aligned ? UYVYToUVRow_NEON : UYVYToUVRow_Any_NEON;
+  }
+#endif
+
+  // Each pass consumes two UYVY rows: one U/V row and two Y rows.
+  for (y = 0; y < height - 1; y += 2) {
+    UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+    UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
+    src_uyvy += src_stride_uyvy * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    // Odd height: the last row has no partner, so UV is taken with stride 0.
+    UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert ARGB to I420.
+// Writes one Y row per source row and one U and one V row per pair of
+// source rows. A negative height flips the image vertically.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  // Row kernels; the C versions stay in place unless a SIMD one is chosen.
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  // Reject missing planes and empty images.
+  if (!(src_argb && dst_y && dst_u && dst_v) ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: start at the last source row and step backwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    // Exact kernels only when width is a multiple of 16.
+    const int aligned = IS_ALIGNED(width, 16);
+    ARGBToUVRow = aligned ? ARGBToUVRow_SSSE3 : ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = aligned ? ARGBToYRow_SSSE3 : ARGBToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    // Exact kernels only when width is a multiple of 32.
+    const int aligned = IS_ALIGNED(width, 32);
+    ARGBToUVRow = aligned ? ARGBToUVRow_AVX2 : ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = aligned ? ARGBToYRow_AVX2 : ARGBToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    // Y kernel: the exact version needs width to be a multiple of 8.
+    ARGBToYRow = IS_ALIGNED(width, 8) ? ARGBToYRow_NEON : ARGBToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    // The NEON UV kernel is chosen separately, with its own alignment of 16.
+    ARGBToUVRow =
+        IS_ALIGNED(width, 16) ? ARGBToUVRow_NEON : ARGBToUVRow_Any_NEON;
+  }
+#endif
+
+  // Two source rows per pass: one U/V row and two Y rows.
+  for (y = height; y > 1; y -= 2) {
+    ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+    src_argb += src_stride_argb * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    // Odd height: the last row has no partner, so UV is taken with stride 0.
+    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert BGRA to I420.
+// Writes one Y row per source row and one U and one V row per pair of
+// source rows. A negative height flips the image vertically.
+// Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
+      uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
+  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =
+      BGRAToYRow_C;
+  // Reject missing planes and empty images.
+  if (!(src_bgra && dst_y && dst_u && dst_v) ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: start at the last source row and step backwards.
+    height = -height;
+    src_bgra += (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
+  }
+#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    // Exact kernels only when width is a multiple of 16.
+    const int aligned = IS_ALIGNED(width, 16);
+    BGRAToUVRow = aligned ? BGRAToUVRow_SSSE3 : BGRAToUVRow_Any_SSSE3;
+    BGRAToYRow = aligned ? BGRAToYRow_SSSE3 : BGRAToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_BGRATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    // Y kernel: the exact version needs width to be a multiple of 8.
+    BGRAToYRow = IS_ALIGNED(width, 8) ? BGRAToYRow_NEON : BGRAToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    // The NEON UV kernel is chosen separately, with its own alignment of 16.
+    BGRAToUVRow =
+        IS_ALIGNED(width, 16) ? BGRAToUVRow_NEON : BGRAToUVRow_Any_NEON;
+  }
+#endif
+
+  // Two source rows per pass: one U/V row and two Y rows.
+  for (y = height; y > 1; y -= 2) {
+    BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
+    BGRAToYRow(src_bgra, dst_y, width);
+    BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
+    src_bgra += src_stride_bgra * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    // Odd height: the last row has no partner, so UV is taken with stride 0.
+    BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
+    BGRAToYRow(src_bgra, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert ABGR to I420.
+// Writes one Y row per source row and one U and one V row per pair of
+// source rows. A negative height flips the image vertically.
+// Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
+      uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =
+      ABGRToYRow_C;
+  // Reject missing planes and empty images.
+  if (!(src_abgr && dst_y && dst_u && dst_v) ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: start at the last source row and step backwards.
+    height = -height;
+    src_abgr += (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    // Exact kernels only when width is a multiple of 16.
+    const int aligned = IS_ALIGNED(width, 16);
+    ABGRToUVRow = aligned ? ABGRToUVRow_SSSE3 : ABGRToUVRow_Any_SSSE3;
+    ABGRToYRow = aligned ? ABGRToYRow_SSSE3 : ABGRToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    // Y kernel: the exact version needs width to be a multiple of 8.
+    ABGRToYRow = IS_ALIGNED(width, 8) ? ABGRToYRow_NEON : ABGRToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    // The NEON UV kernel is chosen separately, with its own alignment of 16.
+    ABGRToUVRow =
+        IS_ALIGNED(width, 16) ? ABGRToUVRow_NEON : ABGRToUVRow_Any_NEON;
+  }
+#endif
+
+  // Two source rows per pass: one U/V row and two Y rows.
+  for (y = height; y > 1; y -= 2) {
+    ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
+    ABGRToYRow(src_abgr, dst_y, width);
+    ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+    src_abgr += src_stride_abgr * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    // Odd height: the last row has no partner, so UV is taken with stride 0.
+    ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
+    ABGRToYRow(src_abgr, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert RGBA to I420.
+// Writes one Y row per source row and one U and one V row per pair of
+// source rows. A negative height flips the image vertically.
+// Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
+      uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
+  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =
+      RGBAToYRow_C;
+  // Reject missing planes and empty images.
+  if (!(src_rgba && dst_y && dst_u && dst_v) ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: start at the last source row and step backwards.
+    height = -height;
+    src_rgba += (height - 1) * src_stride_rgba;
+    src_stride_rgba = -src_stride_rgba;
+  }
+#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    // Exact kernels only when width is a multiple of 16.
+    const int aligned = IS_ALIGNED(width, 16);
+    RGBAToUVRow = aligned ? RGBAToUVRow_SSSE3 : RGBAToUVRow_Any_SSSE3;
+    RGBAToYRow = aligned ? RGBAToYRow_SSSE3 : RGBAToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    // Y kernel: the exact version needs width to be a multiple of 8.
+    RGBAToYRow = IS_ALIGNED(width, 8) ? RGBAToYRow_NEON : RGBAToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    // The NEON UV kernel is chosen separately, with its own alignment of 16.
+    RGBAToUVRow =
+        IS_ALIGNED(width, 16) ? RGBAToUVRow_NEON : RGBAToUVRow_Any_NEON;
+  }
+#endif
+
+  // Two source rows per pass: one U/V row and two Y rows.
+  for (y = height; y > 1; y -= 2) {
+    RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+    RGBAToYRow(src_rgba, dst_y, width);
+    RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+    src_rgba += src_stride_rgba * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    // Odd height: the last row has no partner, so UV is taken with stride 0.
+    RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
+    RGBAToYRow(src_rgba, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert RGB24 to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
+                uint8* dst_y, int dst_stride_y,
+                uint8* dst_u, int dst_stride_u,
+                uint8* dst_v, int dst_stride_v,
+                int width, int height) {
+  int y;
+#if defined(HAS_RGB24TOYROW_NEON)
+  void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
+      uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
+  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =
+      RGB24ToYRow_C;
+#else
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      RGB24ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+#endif  // HAS_RGB24TOYROW_NEON
+  if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;  // Missing plane or empty image.
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+    RGB24ToYRow = RGB24ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToYRow = RGB24ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGB24ToUVRow = RGB24ToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
+  }
+#endif  // HAS_RGB24TOARGBROW_SSSE3
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif  // HAS_ARGBTOYROW_SSSE3 && HAS_ARGBTOUVROW_SSSE3
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif  // HAS_ARGBTOYROW_AVX2 && HAS_ARGBTOUVROW_AVX2
+  {  // Scope of the temporary ARGB rows; closed after they are freed.
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 31) & ~31;  // Rounded up to 32 bytes.
+    align_buffer_64(row, kRowSize * 2);
+#endif  // HAS_RGB24TOYROW_NEON
+
+    for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYROW_NEON)
+      RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+      RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else  // Convert both rows to ARGB, then run the ARGB row functions.
+      RGB24ToARGBRow(src_rgb24, row, width);
+      RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_rgb24 += src_stride_rgb24 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_RGB24TOYROW_NEON)
+      RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+      RGB24ToARGBRow(src_rgb24, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);  // Lone row: stride 0.
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_RGB24TOYROW_NEON)
+    free_aligned_buffer_64(row);
+  }
+#endif  // !HAS_RGB24TOYROW_NEON
+  return 0;
+}
+
+// Convert RAW to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int RAWToI420(const uint8* src_raw, int src_stride_raw,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height) {
+  int y;
+#if defined(HAS_RAWTOYROW_NEON)
+  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
+      uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
+  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =
+      RAWToYRow_C;
+#else
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      RAWToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+#endif  // HAS_RAWTOYROW_NEON
+  if (!src_raw || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;  // Missing plane or empty image.
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+
+// Neon version does direct RAW to YUV.
+#if defined(HAS_RAWTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToUVRow = RAWToUVRow_Any_NEON;
+    RAWToYRow = RAWToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToYRow = RAWToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RAWToUVRow = RAWToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_SSSE3;
+    }
+  }
+#endif  // HAS_RAWTOARGBROW_SSSE3
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif  // HAS_ARGBTOYROW_SSSE3 && HAS_ARGBTOUVROW_SSSE3
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif  // HAS_ARGBTOYROW_AVX2 && HAS_ARGBTOUVROW_AVX2
+  {  // Scope of the temporary ARGB rows; closed after they are freed.
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (width * 4 + 31) & ~31;  // Rounded up to 32 bytes.
+    align_buffer_64(row, kRowSize * 2);
+#endif  // HAS_RAWTOYROW_NEON
+
+    for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYROW_NEON)
+      RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+      RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else  // Convert both rows to ARGB, then run the ARGB row functions.
+      RAWToARGBRow(src_raw, row, width);
+      RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_raw += src_stride_raw * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+#if defined(HAS_RAWTOYROW_NEON)
+      RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+#else
+      RAWToARGBRow(src_raw, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);  // Lone row: stride 0.
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_RAWTOYROW_NEON)
+    free_aligned_buffer_64(row);
+  }
+#endif  // !HAS_RAWTOYROW_NEON
+  return 0;
+}
+
+// Convert RGB565 to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
+                 uint8* dst_y, int dst_stride_y,
+                 uint8* dst_u, int dst_stride_u,
+                 uint8* dst_v, int dst_stride_v,
+                 int width, int height) {
+  int y;
+#if defined(HAS_RGB565TOYROW_NEON)
+  void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
+      uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
+  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =
+      RGB565ToYRow_C;
+#else
+  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      RGB565ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+#endif
+  if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image (source is read bottom-up).
+  if (height < 0) {
+    height = -height;
+    src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+    src_stride_rgb565 = -src_stride_rgb565;
+  }
+
+// Neon version does direct RGB565 to YUV.
+#if defined(HAS_RGB565TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+    RGB565ToYRow = RGB565ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToYRow = RGB565ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGB565ToUVRow = RGB565ToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from RGB565 to ARGB.
+#else
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+  {
+    // Allocate 2 rows of ARGB, each padded to a multiple of 32 bytes.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {  // Row pairs share one U/V row.
+#if defined(HAS_RGB565TOYROW_NEON)
+      RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
+      RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+      RGB565ToARGBRow(src_rgb565, row, width);
+      RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_rgb565 += src_stride_rgb565 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {  // Odd height: convert the last row with UV stride 0.
+#if defined(HAS_RGB565TOYROW_NEON)
+      RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+      RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+      RGB565ToARGBRow(src_rgb565, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_RGB565TOYROW_NEON)
+    free_aligned_buffer_64(row);
+  }
+#endif
+  return 0;
+}
+
+// Convert ARGB1555 to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height) {
+  int y;
+#if defined(HAS_ARGB1555TOYROW_NEON)
+  void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
+      uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
+  void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =
+      ARGB1555ToYRow_C;
+#else
+  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      ARGB1555ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+#endif
+  if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image (source is read bottom-up).
+  if (height < 0) {
+    height = -height;
+    src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+    src_stride_argb1555 = -src_stride_argb1555;
+  }
+
+// Neon version does direct ARGB1555 to YUV.
+#if defined(HAS_ARGB1555TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+    ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToYRow = ARGB1555ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from ARGB1555 to ARGB.
+#else
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+  {
+    // Allocate 2 rows of ARGB, each padded to a multiple of 32 bytes.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {  // Row pairs share one U/V row.
+#if defined(HAS_ARGB1555TOYROW_NEON)
+      ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
+      ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+                     width);
+#else
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_argb1555 += src_stride_argb1555 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {  // Odd height: convert the last row with UV stride 0.
+#if defined(HAS_ARGB1555TOYROW_NEON)
+      ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+      ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+      ARGB1555ToARGBRow(src_argb1555, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_ARGB1555TOYROW_NEON)
+    free_aligned_buffer_64(row);
+  }
+#endif
+  return 0;
+}
+
+// Convert ARGB4444 to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height) {
+  int y;
+#if defined(HAS_ARGB4444TOYROW_NEON)
+  void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
+      uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
+  void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =
+      ARGB4444ToYRow_C;
+#else
+  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      ARGB4444ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+#endif
+  if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image (source is read bottom-up).
+  if (height < 0) {
+    height = -height;
+    src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+    src_stride_argb4444 = -src_stride_argb4444;
+  }
+
+// Neon version does direct ARGB4444 to YUV.
+#if defined(HAS_ARGB4444TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+    ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToYRow = ARGB4444ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
+      }
+    }
+  }
+// Other platforms do intermediate conversion from ARGB4444 to ARGB.
+#else
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+  {
+    // Allocate 2 rows of ARGB, each padded to a multiple of 32 bytes.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {  // Row pairs share one U/V row.
+#if defined(HAS_ARGB4444TOYROW_NEON)
+      ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
+      ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+                     width);
+#else
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+                        width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_argb4444 += src_stride_argb4444 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {  // Odd height: convert the last row with UV stride 0.
+#if defined(HAS_ARGB4444TOYROW_NEON)
+      ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+      ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+      ARGB4444ToARGBRow(src_argb4444, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+    free_aligned_buffer_64(row);
+  }
+#endif
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/convert_argb.cc b/libs/libyuv/source/convert_argb.cc
new file mode 100644
index 0000000000..e586f7043c
--- /dev/null
+++ b/libs/libyuv/source/convert_argb.cc
@@ -0,0 +1,1455 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB with optional vertical flipping. Returns 0, or -1 on bad args.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int width, int height) {
+  if (!src_argb || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image (source is read bottom-up).
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+
+  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+            width * 4, height);  // Plane width passed as width * 4.
+  return 0;
+}
+
+// Convert I420 to ARGB with matrix. Returns 0, or -1 on invalid arguments.
+static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
+                            const uint8* src_u, int src_stride_u,
+                            const uint8* src_v, int src_stride_v,
+                            uint8* dst_argb, int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image (dst is written bottom-up).
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {  // U/V rows advance after every second Y row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to ARGB using kYuvI601Constants.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvI601Constants,
+                          width, height);
+}
+
+// Convert I420 to ABGR: ARGB path with U/V swapped and kYvuI601Constants.
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert J420 to ARGB using kYuvJPEGConstants.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants,
+                          width, height);
+}
+
+// Convert J420 to ABGR: ARGB path with U/V swapped and kYvuJPEGConstants.
+LIBYUV_API
+int J420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuJPEGConstants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert H420 to ARGB using kYuvH709Constants.
+LIBYUV_API
+int H420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvH709Constants,
+                          width, height);
+}
+
+// Convert H420 to ABGR: ARGB path with U/V swapped and kYvuH709Constants.
+LIBYUV_API
+int H420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuH709Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert I422 to ARGB with matrix. Returns 0, or -1 on invalid arguments.
+static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
+                            const uint8* src_u, int src_stride_u,
+                            const uint8* src_v, int src_stride_v,
+                            uint8* dst_argb, int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image (dst is written bottom-up).
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows: with no row padding, treat the image as one long row.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // Every Y row has its own U/V row.
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I422 to ARGB using kYuvI601Constants.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvI601Constants,
+                          width, height);
+}
+
+// Convert I422 to ABGR: ARGB path with U/V swapped and kYvuI601Constants.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert J422 to ARGB using kYuvJPEGConstants.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants,
+                          width, height);
+}
+
+// Convert J422 to ABGR: ARGB path with U/V swapped and kYvuJPEGConstants.
+LIBYUV_API
+int J422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuJPEGConstants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert H422 to ARGB using kYuvH709Constants.
+LIBYUV_API
+int H422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvH709Constants,
+                          width, height);
+}
+
+// Convert H422 to ABGR: ARGB path with U/V swapped and kYvuH709Constants.
+LIBYUV_API
+int H422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuH709Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert I444 to ARGB with matrix. Returns 0, or -1 on invalid arguments.
+static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
+                            const uint8* src_u, int src_stride_u,
+                            const uint8* src_v, int src_stride_v,
+                            uint8* dst_argb, int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width, int height) {
+  int y;
+  void (*I444ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = I444ToARGBRow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image (dst is written bottom-up).
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows: with no row padding, treat the image as one long row.
+  if (src_stride_y == width &&
+      src_stride_u == width &&
+      src_stride_v == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I444ToARGBRow = I444ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I444ToARGBRow = I444ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // Every Y row has its own U/V row.
+    I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I444 to ARGB using kYuvI601Constants.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvI601Constants,
+                          width, height);
+}
+
+// Convert I444 to ABGR: ARGB path with U/V swapped and kYvuI601Constants.
+LIBYUV_API
+int I444ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert J444 to ARGB using kYuvJPEGConstants.
+LIBYUV_API
+int J444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants,
+                          width, height);
+}
+
+// Convert I411 to ARGB using kYuvI601Constants. Returns 0, or -1 on bad args.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*I411ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = I411ToARGBRow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image (dst is written bottom-up).
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows: with no row padding, treat the image as one long row.
+  if (src_stride_y == width &&
+      src_stride_u * 4 == width &&
+      src_stride_v * 4 == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+  }
+#if defined(HAS_I411TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I411ToARGBRow = I411ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I411TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I411ToARGBRow = I411ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I411ToARGBRow = I411ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I411TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I411ToARGBRow = I411ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I411ToARGBRow = I411ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // Every Y row has its own U/V row.
+    I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// I420AlphaToARGBMatrix: converts I420 plus a separate alpha plane to ARGB.
+//   src_y/u/v/a  - source planes, each followed by its row stride. The U and
+//                  V pointers advance once every two rows; Y and A every row.
+//   dst_argb     - destination ARGB buffer and its row stride.
+//   yuvconstants - conversion constants forwarded to the row function.
+//   width/height - image size; a negative height writes rows bottom-up.
+//   attenuate    - when non-zero, ARGBAttenuateRow is run in place on each
+//                  converted row.
+// Returns -1 if any plane pointer is NULL, width <= 0 or height == 0; else 0.
+static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
+                                 const uint8* src_u, int src_stride_u,
+                                 const uint8* src_v, int src_stride_v,
+                                 const uint8* src_a, int src_stride_a,
+                                 uint8* dst_argb, int dst_stride_argb,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width, int height, int attenuate) {
+  int y;
+  void (*I422AlphaToARGBRow)(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             const uint8* a_buf,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) = I422AlphaToARGBRow_C;
+  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                           int width) = ARGBAttenuateRow_C;
+  if (!src_y || !src_u || !src_v || !src_a || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Pick the YUVA->ARGB row worker; later checks override earlier ones.
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422AlphaToARGBRow = IS_ALIGNED(width, 8) ? I422AlphaToARGBRow_SSSE3
+                                              : I422AlphaToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422AlphaToARGBRow = IS_ALIGNED(width, 16) ? I422AlphaToARGBRow_AVX2
+                                               : I422AlphaToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422AlphaToARGBRow = IS_ALIGNED(width, 8) ? I422AlphaToARGBRow_NEON
+                                              : I422AlphaToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
+  }
+#endif
+  // Pick the attenuation row worker the same way.
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBAttenuateRow = IS_ALIGNED(width, 4) ? ARGBAttenuateRow_SSSE3
+                                            : ARGBAttenuateRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBAttenuateRow = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_AVX2
+                                            : ARGBAttenuateRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBAttenuateRow = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_NEON
+                                            : ARGBAttenuateRow_Any_NEON;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+                       width);
+    if (attenuate) {
+      // Attenuation is applied in place to the row that was just written.
+      ARGBAttenuateRow(dst_argb, dst_argb, width);
+    }
+    dst_argb += dst_stride_argb;
+    src_a += src_stride_a;
+    src_y += src_stride_y;
+    // U and V rows are shared by two consecutive Y rows.
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// I420AlphaToARGB: forwards all arguments unchanged to I420AlphaToARGBMatrix
+// and selects kYuvI601Constants as the conversion constants.
+LIBYUV_API
+int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    const uint8* src_a, int src_stride_a,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height, int attenuate) {
+  const struct YuvConstants* const constants = &kYuvI601Constants;
+  return I420AlphaToARGBMatrix(src_y, src_stride_y,
+                               src_u, src_stride_u,
+                               src_v, src_stride_v,
+                               src_a, src_stride_a, dst_argb, dst_stride_argb,
+                               constants, width, height, attenuate);
+}
+
+// I420AlphaToABGR: produces ABGR through I420AlphaToARGBMatrix by passing the
+// V plane in the U position (and U in the V position) with kYvuI601Constants.
+LIBYUV_API
+int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
+                    const uint8* src_u, int src_stride_u,
+                    const uint8* src_v, int src_stride_v,
+                    const uint8* src_a, int src_stride_a,
+                    uint8* dst_abgr, int dst_stride_abgr,
+                    int width, int height, int attenuate) {
+  const struct YuvConstants* const constants = &kYvuI601Constants;
+  return I420AlphaToARGBMatrix(src_y, src_stride_y,
+                               src_v, src_stride_v,  // V goes in the U slot
+                               src_u, src_stride_u,  // U goes in the V slot
+                               src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+                               constants, width, height, attenuate);
+}
+
+// I400ToARGB: converts an I400 image (a single Y plane) to ARGB.
+//   src_y, dst_argb - image base pointers, each followed by its row stride.
+//   width, height   - image size; a negative height flips the image by
+//                     writing the ARGB rows bottom-up.
+// Returns -1 if a pointer is NULL, width <= 0 or height == 0; otherwise 0.
+// Rows are converted by a worker that defaults to the C version and is
+// replaced by an SSE2/AVX2/NEON variant when that variant is compiled in and
+// TestCpuFlag() reports the CPU feature.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  void (*I400ToARGBRow)(const uint8* y_buf,
+                        uint8* rgb_buf,
+                        int width) = I400ToARGBRow_C;
+  int y;
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip vertically: start at the last ARGB row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
+    // Coalesce: no gaps between rows, so convert the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
+  }
+  // Later checks override earlier ones, so the last supported variant wins.
+#if defined(HAS_I400TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I400ToARGBRow = IS_ALIGNED(width, 8) ? I400ToARGBRow_SSE2
+                                         : I400ToARGBRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I400ToARGBRow = IS_ALIGNED(width, 16) ? I400ToARGBRow_AVX2
+                                          : I400ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I400ToARGBRow = IS_ALIGNED(width, 8) ? I400ToARGBRow_NEON
+                                         : I400ToARGBRow_Any_NEON;
+  }
+#endif
+
+  for (y = height; y > 0; --y) {
+    I400ToARGBRow(src_y, dst_argb, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// J400ToARGB: converts a J400 image (a single Y plane) to ARGB.
+//   src_y, dst_argb - image base pointers, each followed by its row stride.
+//   width, height   - image size; a negative height flips the image by
+//                     reading the source rows bottom-up.
+// Returns -1 if a pointer is NULL, width <= 0 or height == 0; otherwise 0.
+// Rows are converted by a worker that defaults to the C version and is
+// replaced by an SSE2/AVX2/NEON variant when that variant is compiled in and
+// TestCpuFlag() reports the CPU feature.
+LIBYUV_API
+int J400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =
+      J400ToARGBRow_C;
+  int y;
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip vertically: start at the last source row and step backwards.
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
+    // Coalesce: no gaps between rows, so convert the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
+  }
+  // Later checks override earlier ones, so the last supported variant wins.
+#if defined(HAS_J400TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    J400ToARGBRow = IS_ALIGNED(width, 8) ? J400ToARGBRow_SSE2
+                                         : J400ToARGBRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    J400ToARGBRow = IS_ALIGNED(width, 16) ? J400ToARGBRow_AVX2
+                                          : J400ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    J400ToARGBRow = IS_ALIGNED(width, 8) ? J400ToARGBRow_NEON
+                                         : J400ToARGBRow_Any_NEON;
+  }
+#endif
+  for (y = height; y > 0; --y) {
+    J400ToARGBRow(src_y, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+  }
+  return 0;
+}
+
+// Read-only shuffle table for BGRA -> ARGB (indices 3,2,1,0 per 4 bytes).
+static const uvec8 kShuffleMaskBGRAToARGB = {
+  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Read-only shuffle table for ABGR -> ARGB (indices 2,1,0,3 per 4 bytes).
+static const uvec8 kShuffleMaskABGRToARGB = {
+  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Read-only shuffle table for RGBA -> ARGB (indices 1,2,3,0 per 4 bytes).
+static const uvec8 kShuffleMaskRGBAToARGB = {
+  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
+
+// BGRAToARGB: reorders the bytes of each pixel by running ARGBShuffle with
+// the kShuffleMaskBGRAToARGB table; returns whatever ARGBShuffle returns.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const uint8* const shuffler = (const uint8*)(&kShuffleMaskBGRAToARGB);
+  return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+                     shuffler, width, height);
+}
+
+// ARGBToBGRA: same byte shuffle as BGRAToARGB (kShuffleMaskBGRAToARGB), so
+// here src_bgra holds the ARGB input and dst_argb receives the BGRA output.
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const uint8* const shuffler = (const uint8*)(&kShuffleMaskBGRAToARGB);
+  return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+                     shuffler, width, height);
+}
+
+// ABGRToARGB: reorders the bytes of each pixel by running ARGBShuffle with
+// the kShuffleMaskABGRToARGB table; returns whatever ARGBShuffle returns.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const uint8* const shuffler = (const uint8*)(&kShuffleMaskABGRToARGB);
+  return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+                     shuffler, width, height);
+}
+
+// ARGBToABGR: same byte shuffle as ABGRToARGB (kShuffleMaskABGRToARGB), so
+// here src_abgr holds the ARGB input and dst_argb receives the ABGR output.
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const uint8* const shuffler = (const uint8*)(&kShuffleMaskABGRToARGB);
+  return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+                     shuffler, width, height);
+}
+
+// RGBAToARGB: reorders the bytes of each pixel by running ARGBShuffle with
+// the kShuffleMaskRGBAToARGB table; returns whatever ARGBShuffle returns.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const uint8* const shuffler = (const uint8*)(&kShuffleMaskRGBAToARGB);
+  return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
+                     shuffler, width, height);
+}
+
+// RGB24ToARGB: converts a packed RGB24 image (3 bytes per pixel) to ARGB.
+// Returns -1 if a pointer is NULL, width <= 0 or height == 0; otherwise 0.
+// A negative height flips the image: source rows are read bottom-up.
+// Each row is converted by a worker that defaults to the C version and is
+// replaced by an SSSE3 or NEON variant when that variant is compiled in and
+// TestCpuFlag() reports the CPU feature.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height) {
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      RGB24ToARGBRow_C;
+  int y;
+  if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip vertically: start at the last source row and step backwards.
+    height = -height;
+    src_rgb24 += (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+  if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
+    // Coalesce: no gaps between rows, so convert the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_rgb24 = dst_stride_argb = 0;
+  }
+  // Later checks override earlier ones, so the last supported variant wins.
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24ToARGBRow = IS_ALIGNED(width, 16) ? RGB24ToARGBRow_SSSE3
+                                           : RGB24ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24ToARGBRow = IS_ALIGNED(width, 8) ? RGB24ToARGBRow_NEON
+                                          : RGB24ToARGBRow_Any_NEON;
+  }
+#endif
+
+  for (y = height; y > 0; --y) {
+    RGB24ToARGBRow(src_rgb24, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_rgb24 += src_stride_rgb24;
+  }
+  return 0;
+}
+
+// RAWToARGB: converts a packed RAW image (3 bytes per pixel) to ARGB.
+// Returns -1 if a pointer is NULL, width <= 0 or height == 0; otherwise 0.
+// A negative height flips the image: source rows are read bottom-up.
+// Each row is converted by a worker that defaults to the C version and is
+// replaced by an SSSE3 or NEON variant when that variant is compiled in and
+// TestCpuFlag() reports the CPU feature.
+LIBYUV_API
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+      RAWToARGBRow_C;
+  int y;
+  if (!src_raw || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip vertically: start at the last source row and step backwards.
+    height = -height;
+    src_raw += (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) {
+    // Coalesce: no gaps between rows, so convert the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_raw = dst_stride_argb = 0;
+  }
+  // Later checks override earlier ones, so the last supported variant wins.
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RAWToARGBRow = IS_ALIGNED(width, 16) ? RAWToARGBRow_SSSE3
+                                         : RAWToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToARGBRow = IS_ALIGNED(width, 8) ? RAWToARGBRow_NEON
+                                        : RAWToARGBRow_Any_NEON;
+  }
+#endif
+
+  for (y = height; y > 0; --y) {
+    RAWToARGBRow(src_raw, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_raw += src_stride_raw;
+  }
+  return 0;
+}
+
+// RGB565ToARGB: converts a packed RGB565 image (2 bytes per pixel) to ARGB.
+//   src_rgb565, dst_argb - image base pointers, each followed by its stride.
+//   width, height        - image size; a negative height flips the image by
+//                          reading the source rows bottom-up.
+// Returns -1 if a pointer is NULL, width <= 0 or height == 0; otherwise 0.
+// Rows are converted by a worker that defaults to the C version and is
+// replaced by an SSE2/AVX2/NEON variant when that variant is compiled in and
+// TestCpuFlag() reports the CPU feature.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height) {
+  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =
+      RGB565ToARGBRow_C;
+  int y;
+  if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip vertically: start at the last source row and step backwards.
+    height = -height;
+    src_rgb565 += (height - 1) * src_stride_rgb565;
+    src_stride_rgb565 = -src_stride_rgb565;
+  }
+  if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) {
+    // Coalesce: no gaps between rows, so convert the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_rgb565 = dst_stride_argb = 0;
+  }
+  // Later checks override earlier ones, so the last supported variant wins.
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    RGB565ToARGBRow = IS_ALIGNED(width, 8) ? RGB565ToARGBRow_SSE2
+                                           : RGB565ToARGBRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RGB565ToARGBRow = IS_ALIGNED(width, 16) ? RGB565ToARGBRow_AVX2
+                                            : RGB565ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB565ToARGBRow = IS_ALIGNED(width, 8) ? RGB565ToARGBRow_NEON
+                                           : RGB565ToARGBRow_Any_NEON;
+  }
+#endif
+
+  for (y = height; y > 0; --y) {
+    RGB565ToARGBRow(src_rgb565, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_rgb565 += src_stride_rgb565;
+  }
+  return 0;
+}
+
+// ARGB1555ToARGB: converts a packed ARGB1555 image (2 bytes per pixel) to
+// ARGB.
+//   width, height - image size; a negative height flips the image by reading
+//                   the source rows bottom-up.
+// Returns -1 if a pointer is NULL, width <= 0 or height == 0; otherwise 0.
+// Rows are converted by a worker that defaults to the C version and is
+// replaced by an SSE2/AVX2/NEON variant when that variant is compiled in and
+// TestCpuFlag() reports the CPU feature.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height) {
+  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+                            int width) = ARGB1555ToARGBRow_C;
+  int y;
+  if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip vertically: start at the last source row and step backwards.
+    height = -height;
+    src_argb1555 += (height - 1) * src_stride_argb1555;
+    src_stride_argb1555 = -src_stride_argb1555;
+  }
+  if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) {
+    // Coalesce: no gaps between rows, so convert the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb1555 = dst_stride_argb = 0;
+  }
+  // Later checks override earlier ones, so the last supported variant wins.
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB1555ToARGBRow = IS_ALIGNED(width, 8) ? ARGB1555ToARGBRow_SSE2
+                                             : ARGB1555ToARGBRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB1555ToARGBRow = IS_ALIGNED(width, 16) ? ARGB1555ToARGBRow_AVX2
+                                              : ARGB1555ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB1555ToARGBRow = IS_ALIGNED(width, 8) ? ARGB1555ToARGBRow_NEON
+                                             : ARGB1555ToARGBRow_Any_NEON;
+  }
+#endif
+
+  for (y = height; y > 0; --y) {
+    ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_argb1555 += src_stride_argb1555;
+  }
+  return 0;
+}
+
+// ARGB4444ToARGB: converts a packed ARGB4444 image (2 bytes per pixel) to
+// ARGB.
+//   width, height - image size; a negative height flips the image by reading
+//                   the source rows bottom-up.
+// Returns -1 if a pointer is NULL, width <= 0 or height == 0; otherwise 0.
+// Rows are converted by a worker that defaults to the C version and is
+// replaced by an SSE2/AVX2/NEON variant when that variant is compiled in and
+// TestCpuFlag() reports the CPU feature.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
+                   uint8* dst_argb, int dst_stride_argb,
+                   int width, int height) {
+  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+                            int width) = ARGB4444ToARGBRow_C;
+  int y;
+  if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip vertically: start at the last source row and step backwards.
+    height = -height;
+    src_argb4444 += (height - 1) * src_stride_argb4444;
+    src_stride_argb4444 = -src_stride_argb4444;
+  }
+  if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) {
+    // Coalesce: no gaps between rows, so convert the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb4444 = dst_stride_argb = 0;
+  }
+  // Later checks override earlier ones, so the last supported variant wins.
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB4444ToARGBRow = IS_ALIGNED(width, 8) ? ARGB4444ToARGBRow_SSE2
+                                             : ARGB4444ToARGBRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGB4444ToARGBRow = IS_ALIGNED(width, 16) ? ARGB4444ToARGBRow_AVX2
+                                              : ARGB4444ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGB4444ToARGBRow = IS_ALIGNED(width, 8) ? ARGB4444ToARGBRow_NEON
+                                             : ARGB4444ToARGBRow_Any_NEON;
+  }
+#endif
+
+  for (y = height; y > 0; --y) {
+    ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_argb4444 += src_stride_argb4444;
+  }
+  return 0;
+}
+
+// NV12ToARGB: converts an NV12 image (a Y plane plus one UV plane) to ARGB.
+// Returns -1 if a pointer is NULL, width <= 0 or height == 0; otherwise 0.
+// A negative height flips the image: ARGB rows are written bottom-up.
+// Rows are converted with kYuvI601Constants by a worker that defaults to the
+// C version and is replaced by an SSSE3/AVX2/NEON variant when that variant
+// is compiled in and TestCpuFlag() reports the CPU feature.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  void (*NV12ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = NV12ToARGBRow_C;
+  int y;
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip vertically: start at the last ARGB row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Later checks override earlier ones, so the last supported variant wins.
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV12ToARGBRow = IS_ALIGNED(width, 8) ? NV12ToARGBRow_SSSE3
+                                         : NV12ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToARGBRow = IS_ALIGNED(width, 16) ? NV12ToARGBRow_AVX2
+                                          : NV12ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV12ToARGBRow = IS_ALIGNED(width, 8) ? NV12ToARGBRow_NEON
+                                         : NV12ToARGBRow_Any_NEON;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+    // One UV row is shared by two consecutive Y rows.
+    if ((y & 1) != 0) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// NV21ToARGB: converts an NV21 image (a Y plane plus one chroma plane, passed
+// as src_uv) to ARGB. Returns -1 if a pointer is NULL, width <= 0 or
+// height == 0; otherwise 0. A negative height writes ARGB rows bottom-up.
+// Rows are converted with kYuvI601Constants by a worker that defaults to the
+// C version and is replaced by an SSSE3/AVX2/NEON variant when that variant
+// is compiled in and TestCpuFlag() reports the CPU feature.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  void (*NV21ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = NV21ToARGBRow_C;
+  int y;
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip vertically: start at the last ARGB row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Later checks override earlier ones, so the last supported variant wins.
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV21ToARGBRow = IS_ALIGNED(width, 8) ? NV21ToARGBRow_SSSE3
+                                         : NV21ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV21ToARGBRow = IS_ALIGNED(width, 16) ? NV21ToARGBRow_AVX2
+                                          : NV21ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV21ToARGBRow = IS_ALIGNED(width, 8) ? NV21ToARGBRow_NEON
+                                         : NV21ToARGBRow_Any_NEON;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+    // One chroma row is shared by two consecutive Y rows.
+    if ((y & 1) != 0) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// M420ToARGB: converts an M420 image (Y and UV rows in one buffer) to ARGB.
+// Returns -1 if a pointer is NULL, width <= 0 or height == 0; otherwise 0.
+// A negative height flips the image: ARGB rows are written bottom-up.
+// Rows are converted with kYuvI601Constants by the NV12 row worker, which
+// defaults to the C version and is replaced by an SSSE3/AVX2/NEON variant
+// when that variant is compiled in and TestCpuFlag() reports the feature.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  void (*NV12ToARGBRow)(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = NV12ToARGBRow_C;
+  int y;
+  if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip vertically: start at the last ARGB row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Later checks override earlier ones, so the last supported variant wins.
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV12ToARGBRow = IS_ALIGNED(width, 8) ? NV12ToARGBRow_SSSE3
+                                         : NV12ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToARGBRow = IS_ALIGNED(width, 16) ? NV12ToARGBRow_AVX2
+                                          : NV12ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV12ToARGBRow = IS_ALIGNED(width, 8) ? NV12ToARGBRow_NEON
+                                         : NV12ToARGBRow_Any_NEON;
+  }
+#endif
+
+  // Each group of three source rows is two Y rows followed by their UV row.
+  for (y = 0; y < height - 1; y += 2) {
+    const uint8* const uv_row = src_m420 + src_stride_m420 * 2;
+    NV12ToARGBRow(src_m420, uv_row, dst_argb, &kYuvI601Constants, width);
+    NV12ToARGBRow(src_m420 + src_stride_m420, uv_row,
+                  dst_argb + dst_stride_argb, &kYuvI601Constants, width);
+    src_m420 += src_stride_m420 * 3;
+    dst_argb += dst_stride_argb * 2;
+  }
+  if ((height & 1) != 0) {
+    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
+                  &kYuvI601Constants, width);
+  }
+  return 0;
+}
+
+// YUY2ToARGB: converts a packed YUY2 image (2 bytes per pixel) to ARGB.
+//   src_yuy2, dst_argb - image base pointers, each followed by its stride.
+//   width, height      - image size; a negative height flips the image by
+//                        reading the source rows bottom-up.
+// Returns -1 if a pointer is NULL, width <= 0 or height == 0; otherwise 0.
+// Rows are converted with kYuvI601Constants by a worker that defaults to the
+// C version and is replaced by an SSSE3/AVX2/NEON variant when that variant
+// is compiled in and TestCpuFlag() reports the CPU feature.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  void (*YUY2ToARGBRow)(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = YUY2ToARGBRow_C;
+  int y;
+  if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip vertically: start at the last source row and step backwards.
+    height = -height;
+    src_yuy2 += (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
+    // Coalesce: no gaps between rows, so convert the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_argb = 0;
+  }
+  // Later checks override earlier ones, so the last supported variant wins.
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    YUY2ToARGBRow = IS_ALIGNED(width, 16) ? YUY2ToARGBRow_SSSE3
+                                          : YUY2ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YUY2ToARGBRow = IS_ALIGNED(width, 32) ? YUY2ToARGBRow_AVX2
+                                          : YUY2ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    YUY2ToARGBRow = IS_ALIGNED(width, 8) ? YUY2ToARGBRow_NEON
+                                         : YUY2ToARGBRow_Any_NEON;
+  }
+#endif
+
+  for (y = height; y > 0; --y) {
+    YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
+    dst_argb += dst_stride_argb;
+    src_yuy2 += src_stride_yuy2;
+  }
+  return 0;
+}
+
+// UYVYToARGB: converts a packed UYVY image (2 bytes per pixel) to ARGB.
+//   src_uyvy, dst_argb - image base pointers, each followed by its stride.
+//   width, height      - image size; a negative height flips the image by
+//                        reading the source rows bottom-up.
+// Returns -1 if a pointer is NULL, width <= 0 or height == 0; otherwise 0.
+// Rows are converted with kYuvI601Constants by a worker that defaults to the
+// C version and is replaced by an SSSE3/AVX2/NEON variant when that variant
+// is compiled in and TestCpuFlag() reports the CPU feature.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  void (*UYVYToARGBRow)(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = UYVYToARGBRow_C;
+  int y;
+  if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip vertically: start at the last source row and step backwards.
+    height = -height;
+    src_uyvy += (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+  if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
+    // Coalesce: no gaps between rows, so convert the image as one long row.
+    width *= height;
+    height = 1;
+    src_stride_uyvy = dst_stride_argb = 0;
+  }
+  // Later checks override earlier ones, so the last supported variant wins.
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    UYVYToARGBRow = IS_ALIGNED(width, 16) ? UYVYToARGBRow_SSSE3
+                                          : UYVYToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    UYVYToARGBRow = IS_ALIGNED(width, 32) ? UYVYToARGBRow_AVX2
+                                          : UYVYToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    UYVYToARGBRow = IS_ALIGNED(width, 8) ? UYVYToARGBRow_NEON
+                                         : UYVYToARGBRow_Any_NEON;
+  }
+#endif
+
+  for (y = height; y > 0; --y) {
+    UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
+    dst_argb += dst_stride_argb;
+    src_uyvy += src_stride_uyvy;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/convert_from.cc b/libs/libyuv/source/convert_from.cc
new file mode 100644
index 0000000000..3bc9eb1be4
--- /dev/null
+++ b/libs/libyuv/source/convert_from.cc
@@ -0,0 +1,1167 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h"  // For I420Copy
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// SUBSAMPLE: ((|v| + a) >> s) with the sign of v kept. Fully parenthesized.
+#define SUBSAMPLE(v, a, s) \
+  (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))
+static __inline int Abs(int v) { return v >= 0 ? v : -v; }  // |v|
+
+// I420 To any I4xx YUV format with mirroring. Returns 0, or -1 on bad sizes.
+static int I420ToI4xx(const uint8* src_y, int src_stride_y,
+                      const uint8* src_u, int src_stride_u,
+                      const uint8* src_v, int src_stride_v,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int src_y_width, int src_y_height,
+                      int dst_uv_width, int dst_uv_height) {
+  const int dst_y_width = Abs(src_y_width);  // Output luma size is |input|.
+  const int dst_y_height = Abs(src_y_height);
+  const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);  // Half, sign kept.
+  const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
+  if (src_y_width == 0 || src_y_height == 0 ||
+      dst_uv_width <= 0 || dst_uv_height <= 0) {
+    return -1;
+  }
+  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,  // Y plane.
+             dst_y, dst_stride_y, dst_y_width, dst_y_height,
+             kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,  // U plane.
+             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,  // V plane.
+             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  return 0;
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 422 chroma is 1/2 width, 1x height. Returns the result of I420ToI4xx.
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int dst_uv_width = (Abs(width) + 1) >> 1;  // Half width, rounded up.
+  const int dst_uv_height = Abs(height);  // Full height.
+  return I420ToI4xx(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    dst_uv_width, dst_uv_height);
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 444 chroma is 1x width, 1x height. Returns the result of I420ToI4xx.
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int dst_uv_width = Abs(width);  // Full width.
+  const int dst_uv_height = Abs(height);  // Full height.
+  return I420ToI4xx(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    dst_uv_width, dst_uv_height);
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 411 chroma is 1/4 width, 1x height. Returns the result of I420ToI4xx.
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int dst_uv_width = (Abs(width) + 3) >> 2;  // Quarter width, rounded up.
+  const int dst_uv_height = Abs(height);  // Full height.
+  return I420ToI4xx(src_y, src_stride_y,
+                    src_u, src_stride_u,
+                    src_v, src_stride_v,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height,
+                    dst_uv_width, dst_uv_height);
+}
+
+// Copy to I400. Source can be I420,422,444,400,NV12,NV21. Returns 0 or -1.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+             uint8* dst_y, int dst_stride_y,
+             int width, int height) {
+  if (!src_y || !dst_y ||
+      width <= 0 || height == 0) {
+    return -1;  // Invalid arguments.
+  }
+  // Negative height means invert the image: read the source bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+LIBYUV_API  // Convert I422 to YUY2. Returns 0, or -1 on invalid arguments.
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height) {
+  int y;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
+      I422ToYUY2Row_C;  // Default row function; may be replaced below.
+  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write the destination bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+  // Coalesce rows: with no row padding, process the image as one long row.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_yuy2 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
+  }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // Every Y row has its own U and V row.
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_yuy2 += dst_stride_yuy2;
+  }
+  return 0;
+}
+
+LIBYUV_API  // Convert I420 to YUY2. Returns 0, or -1 on invalid arguments.
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height) {
+  int y;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
+      I422ToYUY2Row_C;  // Default row function; may be replaced below.
+  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write the destination bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {  // Two Y rows share one U/V row.
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+    I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
+                  dst_yuy2 + dst_stride_yuy2, width);
+    src_y += src_stride_y * 2;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_yuy2 += dst_stride_yuy2 * 2;
+  }
+  if (height & 1) {  // Odd height: convert the remaining last row.
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+  }
+  return 0;
+}
+
+LIBYUV_API  // Convert I422 to UYVY. Returns 0, or -1 on invalid arguments.
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height) {
+  int y;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
+      I422ToUYVYRow_C;  // Default row function; may be replaced below.
+  if (!src_y || !src_u || !src_v || !dst_uyvy ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write the destination bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+  // Coalesce rows: with no row padding, process the image as one long row.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_uyvy == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
+  }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // Every Y row has its own U and V row.
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uyvy += dst_stride_uyvy;
+  }
+  return 0;
+}
+
+LIBYUV_API  // Convert I420 to UYVY. Returns 0, or -1 on invalid arguments.
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height) {
+  int y;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
+      I422ToUYVYRow_C;  // Default row function; may be replaced below.
+  if (!src_y || !src_u || !src_v || !dst_uyvy ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write the destination bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {  // Two Y rows share one U/V row.
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+    I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+                  dst_uyvy + dst_stride_uyvy, width);
+    src_y += src_stride_y * 2;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uyvy += dst_stride_uyvy * 2;
+  }
+  if (height & 1) {  // Odd height: convert the remaining last row.
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+  }
+  return 0;
+}
+
+LIBYUV_API  // Convert I420 to NV12. Returns 0, or -1 on invalid arguments.
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+      int width) = MergeUVRow_C;
+  // Chroma plane size: half the luma size, rounded up.
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write both planes bottom-up.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;  // Recompute from the positive height.
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_uv = -dst_stride_uv;
+  }
+  if (src_stride_y == width &&  // Coalesce Y rows when there is no padding.
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Coalesce UV rows when there is no padding.
+  if (src_stride_u == halfwidth &&
+      src_stride_v == halfwidth &&
+      dst_stride_uv == halfwidth * 2) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_u = src_stride_v = dst_stride_uv = 0;
+  }
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;  // Overrides the SSE2 choice.
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  for (y = 0; y < halfheight; ++y) {
+    // Merge a row of U and V into a row of UV.
+    MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += dst_stride_uv;
+  }
+  return 0;
+}
+
+LIBYUV_API  // Convert I420 to NV21 by calling I420ToNV12 with U and V swapped.
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height) {
+  return I420ToNV12(src_y, src_stride_y,
+                    src_v, src_stride_v,  // V is passed in the U position.
+                    src_u, src_stride_u,  // U is passed in the V position.
+                    dst_y, dst_stride_y,
+                    dst_vu, dst_stride_vu,
+                    width, height);
+}
+
+// Convert I420 to RGBA with matrix. Returns 0, or -1 on invalid arguments.
+static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
+                            const uint8* src_u, int src_stride_u,
+                            const uint8* src_v, int src_stride_v,
+                            uint8* dst_rgba, int dst_stride_rgba,
+                            const struct YuvConstants* yuvconstants,
+                            int width, int height) {
+  int y;
+  void (*I422ToRGBARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write the destination bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+    dst_stride_rgba = -dst_stride_rgba;
+  }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGBARow = I422ToRGBARow_Any_AVX2;  // Overrides the SSSE3 choice.
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGBARow = I422ToRGBARow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGBARow = I422ToRGBARow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
+    I422ToRGBARow = I422ToRGBARow_DSPR2;  // Needs every check above to pass.
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+    dst_rgba += dst_stride_rgba;
+    src_y += src_stride_y;
+    if (y & 1) {  // Advance chroma only after every second row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGBA. Returns the result of I420ToRGBAMatrix.
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height) {
+  return I420ToRGBAMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_rgba, dst_stride_rgba,
+                          &kYuvI601Constants,  // Use Yuv matrix
+                          width, height);
+}
+
+// Convert I420 to BGRA. Returns the result of I420ToRGBAMatrix.
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height) {
+  return I420ToRGBAMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_bgra, dst_stride_bgra,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert I420 to RGB24 with matrix. Returns 0, or -1 on invalid arguments.
+static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
+                             const uint8* src_u, int src_stride_u,
+                             const uint8* src_v, int src_stride_v,
+                             uint8* dst_rgb24, int dst_stride_rgb24,
+                             const struct YuvConstants* yuvconstants,
+                             int width, int height) {
+  int y;
+  void (*I422ToRGB24Row)(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         const struct YuvConstants* yuvconstants,
+                         int width) = I422ToRGB24Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb24 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write the destination bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
+  }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;  // Overrides the SSSE3 choice.
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB24Row = I422ToRGB24Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB24Row = I422ToRGB24Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    if (y & 1) {  // Advance chroma only after every second row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB24. Returns the result of I420ToRGB24Matrix.
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+                const uint8* src_u, int src_stride_u,
+                const uint8* src_v, int src_stride_v,
+                uint8* dst_rgb24, int dst_stride_rgb24,
+                int width, int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y,
+                           src_u, src_stride_u,
+                           src_v, src_stride_v,
+                           dst_rgb24, dst_stride_rgb24,
+                           &kYuvI601Constants,  // Use Yuv matrix
+                           width, height);
+}
+
+// Convert I420 to RAW. Returns the result of I420ToRGB24Matrix.
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              uint8* dst_raw, int dst_stride_raw,
+              int width, int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y,
+                           src_v, src_stride_v,  // Swap U and V
+                           src_u, src_stride_u,
+                           dst_raw, dst_stride_raw,
+                           &kYvuI601Constants,  // Use Yvu matrix
+                           width, height);
+}
+
+// Convert I420 to ARGB1555. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_argb1555, int dst_stride_argb1555,
+                   int width, int height) {
+  int y;
+  void (*I422ToARGB1555Row)(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            const struct YuvConstants* yuvconstants,
+                            int width) = I422ToARGB1555Row_C;
+  if (!src_y || !src_u || !src_v || !dst_argb1555 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write the destination bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+    dst_stride_argb1555 = -dst_stride_argb1555;
+  }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;  // Overrides SSSE3.
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
+                      width);
+    dst_argb1555 += dst_stride_argb1555;
+    src_y += src_stride_y;
+    if (y & 1) {  // Advance chroma only after every second row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+
+// Convert I420 to ARGB4444. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+                   const uint8* src_u, int src_stride_u,
+                   const uint8* src_v, int src_stride_v,
+                   uint8* dst_argb4444, int dst_stride_argb4444,
+                   int width, int height) {
+  int y;
+  void (*I422ToARGB4444Row)(const uint8* y_buf,
+                            const uint8* u_buf,
+                            const uint8* v_buf,
+                            uint8* rgb_buf,
+                            const struct YuvConstants* yuvconstants,
+                            int width) = I422ToARGB4444Row_C;
+  if (!src_y || !src_u || !src_v || !dst_argb4444 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write the destination bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+    dst_stride_argb4444 = -dst_stride_argb4444;
+  }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;  // Overrides SSSE3.
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
+                      width);
+    dst_argb4444 += dst_stride_argb4444;
+    src_y += src_stride_y;
+    if (y & 1) {  // Advance chroma only after every second row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB565. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_u, int src_stride_u,
+                 const uint8* src_v, int src_stride_v,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int y;
+  void (*I422ToRGB565Row)(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          const struct YuvConstants* yuvconstants,
+                          int width) = I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write the destination bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;  // Overrides SSSE3.
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB565Row = I422ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {  // Advance chroma only after every second row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Ordered 4x4 dither for 888 to 565.  Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+  0, 4, 1, 5,
+  6, 2, 7, 3,
+  1, 5, 0, 4,
+  7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        const struct YuvConstants* yuvconstants,
+                        int width) = I422ToARGBRow_C;
+  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: write the destination bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+  if (!dither4x4) {  // NULL selects the built-in kDither565_4x4 table.
+    dither4x4 = kDither565_4x4;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
+    I422ToARGBRow = I422ToARGBRow_DSPR2;
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+    }
+  }
+#endif
+  {
+    // Allocate a row of argb. NOTE(review): the result is not checked here.
+    align_buffer_64(row_argb, width * 4);
+    for (y = 0; y < height; ++y) {  // Table row (y & 3) is read as one uint32.
+      I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+      ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+                            *(const uint32*)(dither4x4 + ((y & 3) << 2)), width);
+      dst_rgb565 += dst_stride_rgb565;
+      src_y += src_stride_y;
+      if (y & 1) {  // Advance chroma only after every second row.
+        src_u += src_stride_u;
+        src_v += src_stride_v;
+      }
+    }
+    free_aligned_buffer_64(row_argb);
+  }
+  return 0;
+}
+
+// Convert I420 planes to the format selected by fourcc, writing into dst_sample. Returns -1 for invalid arguments or an unsupported fourcc, otherwise the selected converter's result.
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+                    const uint8* u, int u_stride,
+                    const uint8* v, int v_stride,
+                    uint8* dst_sample, int dst_sample_stride,
+                    int width, int height,
+                    uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);  // Value the switch below dispatches on.
+  int r = 0;  // Result of the selected converter.
+  if (!y || !u|| !v || !dst_sample ||
+      width <= 0 || height == 0) {
+    return -1;  // Null pointer, non-positive width, or zero height.
+  }
+  switch (format) {
+    // Single plane formats. A dst_sample_stride of 0 is replaced by a width-based default.
+    case FOURCC_YUY2:
+      r = I420ToYUY2(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2,
+                     width, height);
+      break;
+    case FOURCC_UYVY:
+      r = I420ToUYVY(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2,
+                     width, height);
+      break;
+    case FOURCC_RGBP:
+      r = I420ToRGB565(y, y_stride,
+                       u, u_stride,
+                       v, v_stride,
+                       dst_sample,
+                       dst_sample_stride ? dst_sample_stride : width * 2,
+                       width, height);
+      break;
+    case FOURCC_RGBO:
+      r = I420ToARGB1555(y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         dst_sample,
+                         dst_sample_stride ? dst_sample_stride : width * 2,
+                         width, height);
+      break;
+    case FOURCC_R444:
+      r = I420ToARGB4444(y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         dst_sample,
+                         dst_sample_stride ? dst_sample_stride : width * 2,
+                         width, height);
+      break;
+    case FOURCC_24BG:
+      r = I420ToRGB24(y, y_stride,
+                      u, u_stride,
+                      v, v_stride,
+                      dst_sample,
+                      dst_sample_stride ? dst_sample_stride : width * 3,
+                      width, height);
+      break;
+    case FOURCC_RAW:
+      r = I420ToRAW(y, y_stride,
+                    u, u_stride,
+                    v, v_stride,
+                    dst_sample,
+                    dst_sample_stride ? dst_sample_stride : width * 3,
+                    width, height);
+      break;
+    case FOURCC_ARGB:
+      r = I420ToARGB(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
+      break;
+    case FOURCC_BGRA:
+      r = I420ToBGRA(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
+      break;
+    case FOURCC_ABGR:
+      r = I420ToABGR(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
+      break;
+    case FOURCC_RGBA:
+      r = I420ToRGBA(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4,
+                     width, height);
+      break;
+    case FOURCC_I400:
+      r = I400Copy(y, y_stride,
+                   dst_sample,
+                   dst_sample_stride ? dst_sample_stride : width,
+                   width, height);
+      break;
+    case FOURCC_NV12: {
+      uint8* dst_uv = dst_sample + width * height;  // Second plane starts width * height bytes into dst_sample.
+      r = I420ToNV12(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     dst_uv,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     width, height);
+      break;
+    }
+    case FOURCC_NV21: {
+      uint8* dst_vu = dst_sample + width * height;  // Second plane starts width * height bytes into dst_sample.
+      r = I420ToNV21(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     dst_vu,
+                     dst_sample_stride ? dst_sample_stride : width,
+                     width, height);
+      break;
+    }
+    // TODO(fbarchard): Add M420.
+    // Triplanar formats. Destination planes are laid out back to back; dst_sample_stride is not used.
+    // TODO(fbarchard): halfstride instead of halfwidth
+    case FOURCC_I420:
+    case FOURCC_YU12:
+    case FOURCC_YV12: {
+      int halfwidth = (width + 1) / 2;  // Rounded up for odd widths.
+      int halfheight = (height + 1) / 2;  // Rounded up for odd heights.
+      uint8* dst_u;
+      uint8* dst_v;
+      if (format == FOURCC_YV12) {  // YV12: the v plane is placed before the u plane.
+        dst_v = dst_sample + width * height;
+        dst_u = dst_v + halfwidth * halfheight;
+      } else {
+        dst_u = dst_sample + width * height;
+        dst_v = dst_u + halfwidth * halfheight;
+      }
+      r = I420Copy(y, y_stride,
+                   u, u_stride,
+                   v, v_stride,
+                   dst_sample, width,
+                   dst_u, halfwidth,
+                   dst_v, halfwidth,
+                   width, height);
+      break;
+    }
+    case FOURCC_I422:
+    case FOURCC_YV16: {
+      int halfwidth = (width + 1) / 2;  // Rounded up for odd widths.
+      uint8* dst_u;
+      uint8* dst_v;
+      if (format == FOURCC_YV16) {  // YV16: the v plane is placed before the u plane.
+        dst_v = dst_sample + width * height;
+        dst_u = dst_v + halfwidth * height;
+      } else {
+        dst_u = dst_sample + width * height;
+        dst_v = dst_u + halfwidth * height;
+      }
+      r = I420ToI422(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample, width,
+                     dst_u, halfwidth,
+                     dst_v, halfwidth,
+                     width, height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      uint8* dst_u;
+      uint8* dst_v;
+      if (format == FOURCC_YV24) {  // YV24: the v plane is placed before the u plane.
+        dst_v = dst_sample + width * height;
+        dst_u = dst_v + width * height;
+      } else {
+        dst_u = dst_sample + width * height;
+        dst_v = dst_u + width * height;
+      }
+      r = I420ToI444(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample, width,
+                     dst_u, width,
+                     dst_v, width,
+                     width, height);
+      break;
+    }
+    case FOURCC_I411: {
+      int quarterwidth = (width + 3) / 4;  // Rounded up when width is not a multiple of 4.
+      uint8* dst_u = dst_sample + width * height;
+      uint8* dst_v = dst_u + quarterwidth * height;
+      r = I420ToI411(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample, width,
+                     dst_u, quarterwidth,
+                     dst_v, quarterwidth,
+                     width, height);
+      break;
+    }
+
+    // Formats not supported - MJPG, biplanar, some rgb formats.
+    default:
+      return -1;  // unknown fourcc - return failure code.
+  }
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/convert_from_argb.cc b/libs/libyuv/source/convert_from_argb.cc
new file mode 100644
index 0000000000..2a8682b7eb
--- /dev/null
+++ b/libs/libyuv/source/convert_from_argb.cc
@@ -0,0 +1,1286 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from_argb.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGB little endian (bgra in memory) to I444. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;  // C row function is the default; may be replaced below.
+  void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int width) = ARGBToUV444Row_C;  // C row function is the default; may be replaced below.
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {  // Negative height: start at the last source row and step backwards.
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows: when every stride equals its row size, process the image as one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width &&
+      dst_stride_u == width &&
+      dst_stride_v == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOUV444ROW_SSSE3)
+    if (TestCpuFlag(kCpuHasSSSE3)) {
+      ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+      }
+  }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToUV444Row = ARGBToUV444Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSSE3, so AVX2 overrides it when both are present.
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // One source row per iteration; every pointer advances by its stride.
+    ARGBToUV444Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// ARGB little endian (bgra in memory) to I422. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;  // C default; may be replaced below.
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;  // C default; may be replaced below.
+  if (!src_argb ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last source row and step backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows: when every stride equals its row size, process the image as one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSSE3, so AVX2 overrides it when both are present.
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // One source row per iteration; every pointer advances by its stride.
+    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);  // A stride argument of 0 is passed for every row.
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// ARGB little endian (bgra in memory) to I411. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+      int width) = ARGBToUV411Row_C;  // C default; may be replaced below.
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;  // C default; may be replaced below.
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {  // Negative height: start at the last source row and step backwards.
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows: when every stride equals its row size, process the image as one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width &&
+      dst_stride_u * 4 == width &&
+      dst_stride_v * 4 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSSE3, so AVX2 overrides it when both are present.
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV411ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUV411Row = ARGBToUV411Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // One source row per iteration; every pointer advances by its stride.
+    ARGBToUV411Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+LIBYUV_API  // Convert ARGB to a y plane (dst_y) plus a dst_uv plane written by MergeUVRow_. Returns 0 on success, -1 on invalid arguments.
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;  // Half of width, rounded up.
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  if (!src_argb ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last source row and step backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSSE3, so AVX2 overrides it when both are present.
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+  {
+    // Allocate one buffer holding a row of u followed by a row of v, each rounded up to a multiple of 32 bytes.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+
+    for (y = 0; y < height - 1; y += 2) {  // Two source rows per iteration: one uv row and two y rows.
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {  // Odd height: the last row is handled alone, with a stride argument of 0.
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Same as NV12 but U and V swapped. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;  // Half of width, rounded up.
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  if (!src_argb ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last source row and step backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSSE3, so AVX2 overrides it when both are present.
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+  {
+    // Allocate one buffer holding a row of u followed by a row of v, each rounded up to a multiple of 32 bytes.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+
+    for (y = 0; y < height - 1; y += 2) {  // Two source rows per iteration: one merged row and two y rows.
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);  // row_v is passed as the first source, row_u as the second.
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {  // Odd height: the last row is handled alone, with a stride argument of 0.
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);  // row_v is passed as the first source, row_u as the second.
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Convert ARGB to YUY2. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+      const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
+
+  if (!src_argb || !dst_yuy2 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image. Here the destination is walked backwards from its last row.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+  // Coalesce rows: when both strides equal their row size, process the image as one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_yuy2 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yuy2 = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSSE3, so AVX2 overrides it when both are present.
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+
+  {
+    // Allocate one buffer for a row of y, u and v; width is rounded up to a multiple of 64 and v starts half a padded row after u.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+    for (y = 0; y < height; ++y) {  // Per row: fill row_u/row_v and row_y, then hand them to I422ToYUY2Row.
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+      src_argb += src_stride_argb;
+      dst_yuy2 += dst_stride_yuy2;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to UYVY. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
+      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+      const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+
+  if (!src_argb || !dst_uyvy ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image. Here the destination is walked backwards from its last row.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+  // Coalesce rows: when both strides equal their row size, process the image as one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_uyvy == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_uyvy = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSSE3, so AVX2 overrides it when both are present.
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  {
+    // Allocate one buffer for a row of y, u and v; width is rounded up to a multiple of 64 and v starts half a padded row after u.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+    for (y = 0; y < height; ++y) {  // Per row: fill row_u/row_v and row_y, then hand them to I422ToUYVYRow.
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+      src_argb += src_stride_argb;
+      dst_uyvy += dst_stride_uyvy;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to I400 (only a y plane is written). Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int y;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+      ARGBToYRow_C;  // C default; may be replaced below.
+  if (!src_argb || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {  // Negative height: start at the last source row and step backwards.
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows: when both strides equal their row size, process the image as one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSSE3, so AVX2 overrides it when both are present.
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // One source row per iteration; both pointers advance by their stride.
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+// Shuffle table for converting ARGB to RGBA: each group of four entries is (3, 0, 1, 2) plus the group's base index.
+static uvec8 kShuffleMaskARGBToRGBA = {
+  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+};
+
+// Convert ARGB to RGBA by calling ARGBShuffle with kShuffleMaskARGBToRGBA; returns ARGBShuffle's result.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height) {
+  return ARGBShuffle(src_argb, src_stride_argb,
+                     dst_rgba, dst_stride_rgba,
+                     (const uint8*)(&kShuffleMaskARGBToRGBA),
+                     width, height);
+}
+
+// Convert ARGB To RGB24. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_rgb24, int dst_stride_rgb24,
+                int width, int height) {
+  int y;
+  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+      ARGBToRGB24Row_C;  // C default; may be replaced below.
+  if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {  // Negative height: start at the last source row and step backwards.
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows: when both strides equal their row size, process the image as one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_rgb24 == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_rgb24 = 0;
+  }
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // One source row per iteration; both pointers advance by their stride.
+    ARGBToRGB24Row(src_argb, dst_rgb24, width);
+    src_argb += src_stride_argb;
+    dst_rgb24 += dst_stride_rgb24;
+  }
+  return 0;
+}
+
+// Convert ARGB To RAW. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_raw, int dst_stride_raw,
+              int width, int height) {
+  int y;
+  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
+      ARGBToRAWRow_C;  // C default; may be replaced below.
+  if (!src_argb || !dst_raw || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {  // Negative height: start at the last source row and step backwards.
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows: when both strides equal their row size, process the image as one long row.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_raw == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_raw = 0;
+  }
+#if defined(HAS_ARGBTORAWROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRAWRow = ARGBToRAWRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // One source row per iteration; both pointers advance by their stride.
+    ARGBToRAWRow(src_argb, dst_raw, width);
+    src_argb += src_stride_argb;
+    dst_raw += dst_stride_raw;
+  }
+  return 0;
+}
+
+// Ordered 4x4 dither for 888 to 565.  Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+  0, 4, 1, 5,
+  6, 2, 7, 3,
+  1, 5, 0, 4,
+  7, 3, 6, 2,
+};
+
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height) {
+  int y;
+  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
+  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (!dither4x4) {
+    dither4x4 = kDither565_4x4;
+  }
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBToRGB565DitherRow(src_argb, dst_rgb565,
+                          *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+    src_argb += src_stride_argb;
+    dst_rgb565 += dst_stride_rgb565;
+  }
+  return 0;
+}
+
+// Convert ARGB To RGB565.
+// TODO(fbarchard): Consider using dither function low level with zeros.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int y;
+  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+      ARGBToRGB565Row_C;
+  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_rgb565 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_rgb565 = 0;
+  }
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToRGB565Row(src_argb, dst_rgb565, width);
+    src_argb += src_stride_argb;
+    dst_rgb565 += dst_stride_rgb565;
+  }
+  return 0;
+}
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb1555, int dst_stride_argb1555,
+                   int width, int height) {
+  int y;
+  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+      ARGBToARGB1555Row_C;
+  if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb1555 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb1555 = 0;
+  }
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToARGB1555Row(src_argb, dst_argb1555, width);
+    src_argb += src_stride_argb;
+    dst_argb1555 += dst_stride_argb1555;
+  }
+  return 0;
+}
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb4444, int dst_stride_argb4444,
+                   int width, int height) {
+  int y;
+  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+      ARGBToARGB4444Row_C;
+  if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb4444 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb4444 = 0;
+  }
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToARGB4444Row(src_argb, dst_argb4444, width);
+    src_argb += src_stride_argb;
+    dst_argb4444 += dst_stride_argb4444;
+  }
+  return 0;
+}
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+      ARGBToYJRow_C;
+  if (!src_argb ||
+      !dst_yj || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+    ARGBToYJRow(src_argb, dst_yj, width);
+    ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
+    src_argb += src_stride_argb * 2;
+    dst_yj += dst_stride_yj * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+    ARGBToYJRow(src_argb, dst_yj, width);
+  }
+  return 0;
+}
+
+// Convert ARGB to J422. (JPeg full range I422).
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+      ARGBToYJRow_C;
+  if (!src_argb ||
+      !dst_yj || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_yj == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+    ARGBToYJRow(src_argb, dst_yj, width);
+    src_argb += src_stride_argb;
+    dst_yj += dst_stride_yj;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Convert ARGB to J400.
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yj, int dst_stride_yj,
+               int width, int height) {
+  int y;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+      ARGBToYJRow_C;
+  if (!src_argb || !dst_yj || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_yj == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yj = 0;
+  }
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToYJRow(src_argb, dst_yj, width);
+    src_argb += src_stride_argb;
+    dst_yj += dst_stride_yj;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/convert_jpeg.cc b/libs/libyuv/source/convert_jpeg.cc
new file mode 100644
index 0000000000..bcb980f7f1
--- /dev/null
+++ b/libs/libyuv/source/convert_jpeg.cc
@@ -0,0 +1,392 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAVE_JPEG
+struct I420Buffers {
+  uint8* y;
+  int y_stride;
+  uint8* u;
+  int u_stride;
+  uint8* v;
+  int v_stride;
+  int w;
+  int h;
+};
+
+static void JpegCopyI420(void* opaque,
+                         const uint8* const* data,
+                         const int* strides,
+                         int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I420Copy(data[0], strides[0],
+           data[1], strides[1],
+           data[2], strides[2],
+           dest->y, dest->y_stride,
+           dest->u, dest->u_stride,
+           dest->v, dest->v_stride,
+           dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI422ToI420(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I422ToI420(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI444ToI420(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I444ToI420(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI411ToI420(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I411ToI420(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+static void JpegI400ToI420(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  I420Buffers* dest = (I420Buffers*)(opaque);
+  I400ToI420(data[0], strides[0],
+             dest->y, dest->y_stride,
+             dest->u, dest->u_stride,
+             dest->v, dest->v_stride,
+             dest->w, rows);
+  dest->y += rows * dest->y_stride;
+  dest->u += ((rows + 1) >> 1) * dest->u_stride;
+  dest->v += ((rows + 1) >> 1) * dest->v_stride;
+  dest->h -= rows;
+}
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+             int* width, int* height) {
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  if (ret) {
+    *width = mjpeg_decoder.GetWidth();
+    *height = mjpeg_decoder.GetHeight();
+  }
+  mjpeg_decoder.UnloadFrame();
+  return ret ? 0 : -1;  // -1 for runtime failure.
+}
+
+// MJPG (Motion JPeg) to I420
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToI420(const uint8* sample,
+               size_t sample_size,
+               uint8* y, int y_stride,
+               uint8* u, int u_stride,
+               uint8* v, int v_stride,
+               int w, int h,
+               int dw, int dh) {
+  if (sample_size == kUnknownDataSize) {
+    // ERROR: MJPEG frame size unknown
+    return -1;
+  }
+
+  // TODO(fbarchard): Port MJpeg to C.
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  if (ret && (mjpeg_decoder.GetWidth() != w ||
+              mjpeg_decoder.GetHeight() != h)) {
+    // ERROR: MJPEG frame has unexpected dimensions
+    mjpeg_decoder.UnloadFrame();
+    return 1;  // runtime failure
+  }
+  if (ret) {
+    I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+    // YUV420
+    if (mjpeg_decoder.GetColorSpace() ==
+            MJpegDecoder::kColorSpaceYCbCr &&
+        mjpeg_decoder.GetNumComponents() == 3 &&
+        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
+    // YUV422
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
+    // YUV444
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
+    // YUV411
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
+    // YUV400
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceGrayscale &&
+               mjpeg_decoder.GetNumComponents() == 1 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+    } else {
+      // TODO(fbarchard): Implement conversion for any other colorspace/sample
+      // factors that occur in practice. 411 is supported by libjpeg
+      // ERROR: Unable to convert MJPEG frame because format is not supported
+      mjpeg_decoder.UnloadFrame();
+      return 1;
+    }
+  }
+  return ret ? 0 : 1;
+}
+
+#ifdef HAVE_JPEG
+struct ARGBBuffers {
+  uint8* argb;
+  int argb_stride;
+  int w;
+  int h;
+};
+
+static void JpegI420ToARGB(void* opaque,
+                         const uint8* const* data,
+                         const int* strides,
+                         int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I420ToARGB(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+static void JpegI422ToARGB(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I422ToARGB(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+static void JpegI444ToARGB(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I444ToARGB(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+static void JpegI411ToARGB(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I411ToARGB(data[0], strides[0],
+             data[1], strides[1],
+             data[2], strides[2],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+static void JpegI400ToARGB(void* opaque,
+                           const uint8* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+  I400ToARGB(data[0], strides[0],
+             dest->argb, dest->argb_stride,
+             dest->w, rows);
+  dest->argb += rows * dest->argb_stride;
+  dest->h -= rows;
+}
+
+// MJPG (Motion JPeg) to ARGB
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample,
+               size_t sample_size,
+               uint8* argb, int argb_stride,
+               int w, int h,
+               int dw, int dh) {
+  if (sample_size == kUnknownDataSize) {
+    // ERROR: MJPEG frame size unknown
+    return -1;
+  }
+
+  // TODO(fbarchard): Port MJpeg to C.
+  MJpegDecoder mjpeg_decoder;
+  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+  if (ret && (mjpeg_decoder.GetWidth() != w ||
+              mjpeg_decoder.GetHeight() != h)) {
+    // ERROR: MJPEG frame has unexpected dimensions
+    mjpeg_decoder.UnloadFrame();
+    return 1;  // runtime failure
+  }
+  if (ret) {
+    ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+    // YUV420
+    if (mjpeg_decoder.GetColorSpace() ==
+            MJpegDecoder::kColorSpaceYCbCr &&
+        mjpeg_decoder.GetNumComponents() == 3 &&
+        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
+    // YUV422
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
+    // YUV444
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
+    // YUV411
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceYCbCr &&
+               mjpeg_decoder.GetNumComponents() == 3 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
+    // YUV400
+    } else if (mjpeg_decoder.GetColorSpace() ==
+                   MJpegDecoder::kColorSpaceGrayscale &&
+               mjpeg_decoder.GetNumComponents() == 1 &&
+               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+    } else {
+      // TODO(fbarchard): Implement conversion for any other colorspace/sample
+      // factors that occur in practice. 411 is supported by libjpeg
+      // ERROR: Unable to convert MJPEG frame because format is not supported
+      mjpeg_decoder.UnloadFrame();
+      return 1;
+    }
+  }
+  return ret ? 0 : 1;
+}
+#endif
+
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/convert_to_argb.cc b/libs/libyuv/source/convert_to_argb.cc
new file mode 100644
index 0000000000..af829fbd32
--- /dev/null
+++ b/libs/libyuv/source/convert_to_argb.cc
@@ -0,0 +1,306 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// src_width is used for source stride computation; src_height locates the
+// planes and a negative value indicates inversion. sample_size is the frame
+// size in bytes (for MJPEG, the compressed size). Returns -1 for bad arguments
+// or an unknown fourcc, 1 when out of memory, else the converter's result.
+LIBYUV_API
+int ConvertToARGB(const uint8* sample, size_t sample_size,
+                  uint8* crop_argb, int argb_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+
+  // None of the conversions below rotate, so a rotated request converts into
+  // a temporary ARGB buffer (with optional vertical flipping) and ARGBRotate
+  // then writes the final destination buffer.
+  // For in-place conversion, if destination crop_argb is same as source sample,
+  // also enable temporary buffer.
+  // ARGB input is included too; its case below performs no rotation itself.
+  LIBYUV_BOOL need_buf = rotation || crop_argb == sample;
+  uint8* tmp_argb = crop_argb;
+  int tmp_argb_stride = argb_stride;
+  uint8* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (crop_argb == NULL || sample == NULL ||
+      src_width <= 0 || crop_width <= 0 ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  if (src_height < 0) {
+    inv_crop_height = -inv_crop_height;
+  }
+
+  if (need_buf) {
+    int argb_size = crop_width * abs_crop_height * 4;
+    rotate_buffer = (uint8*)malloc(argb_size);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+    }
+    crop_argb = rotate_buffer;
+    argb_stride = crop_width * 4;  // Stride is in bytes; ARGB is 4 bytes/pixel.
+  }
+
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = YUY2ToARGB(src, aligned_src_width * 2,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_UYVY:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = UYVYToARGB(src, aligned_src_width * 2,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_24BG:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RGB24ToARGB(src, src_width * 3,
+                      crop_argb, argb_stride,
+                      crop_width, inv_crop_height);
+      break;
+    case FOURCC_RAW:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RAWToARGB(src, src_width * 3,
+                    crop_argb, argb_stride,
+                    crop_width, inv_crop_height);
+      break;
+    case FOURCC_ARGB:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ARGBToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_BGRA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = BGRAToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ABGRToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = RGBAToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBP:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = RGB565ToARGB(src, src_width * 2,
+                       crop_argb, argb_stride,
+                       crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBO:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB1555ToARGB(src, src_width * 2,
+                         crop_argb, argb_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_R444:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB4444ToARGB(src, src_width * 2,
+                         crop_argb, argb_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_I400:
+      src = sample + src_width * crop_y + crop_x;
+      r = I400ToARGB(src, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+
+    // Biplanar formats
+    case FOURCC_NV12:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
+      r = NV12ToARGB(src, src_width,
+                     src_uv, aligned_src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_NV21:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
+      // NV21 has its own converter; the pointers are computed as for NV12.
+      r = NV21ToARGB(src, src_width,
+                     src_uv, aligned_src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_M420:
+      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+      r = M420ToARGB(src, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    // Triplanar formats
+    case FOURCC_I420:
+    case FOURCC_YU12:
+    case FOURCC_YV12: {
+      const uint8* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      if (format == FOURCC_YV12) {
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (crop_y / 2) + crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (crop_y / 2) + crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      }
+      r = I420ToARGB(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_J420: {
+      const uint8* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      src_u = sample + src_width * abs_src_height +
+          halfwidth * (crop_y / 2) + crop_x / 2;
+      src_v = sample + src_width * abs_src_height +
+          halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = J420ToARGB(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_I422:
+    case FOURCC_YV16: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      if (format == FOURCC_YV16) {
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      }
+      r = I422ToARGB(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      if (format == FOURCC_YV24) {
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      } else {
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      }
+      r = I444ToARGB(src_y, src_width,
+                     src_u, src_width,
+                     src_v, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I411: {
+      int quarterwidth = (src_width + 3) / 4;
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u = sample + src_width * abs_src_height +
+          quarterwidth * crop_y + crop_x / 4;
+      const uint8* src_v = sample + src_width * abs_src_height +
+          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+      r = I411ToARGB(src_y, src_width,
+                     src_u, quarterwidth,
+                     src_v, quarterwidth,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToARGB(sample, sample_size,
+                     crop_argb, argb_stride,
+                     src_width, abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = ARGBRotate(crop_argb, argb_stride,
+                     tmp_argb, tmp_argb_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/convert_to_i420.cc b/libs/libyuv/source/convert_to_i420.cc
new file mode 100644
index 0000000000..5e75369b55
--- /dev/null
+++ b/libs/libyuv/source/convert_to_i420.cc
@@ -0,0 +1,339 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/convert.h"
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation; src_height locates the
+// planes and a negative value indicates inversion. sample_size is the frame
+// size in bytes (for MJPEG, the compressed size). Returns -1 for bad arguments
+// or an unknown fourcc, 1 when out of memory, else the converter's result.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+                  size_t sample_size,
+                  uint8* y, int y_stride,
+                  uint8* u, int u_stride,
+                  uint8* v, int v_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
+      format != FOURCC_NV12 && format != FOURCC_NV21 &&
+      format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
+  uint8* tmp_y = y;
+  uint8* tmp_u = u;
+  uint8* tmp_v = v;
+  int tmp_y_stride = y_stride;
+  int tmp_u_stride = u_stride;
+  int tmp_v_stride = v_stride;
+  uint8* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (!y || !u || !v || !sample ||
+      src_width <= 0 || crop_width <= 0  ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  if (src_height < 0) {
+    inv_crop_height = -inv_crop_height;
+  }
+
+  // One pass rotation is available for some formats. The rest convert to a
+  // temporary I420 buffer (with optional vertical flipping) that is rotated
+  // into the destination afterwards. In-place conversion (y == sample) also
+  // uses the temporary buffer. NOTE(review): an in-place one pass format looks
+  // like it gets rotated twice (in the switch and again at the end) - confirm.
+  if (need_buf) {
+    int y_size = crop_width * abs_crop_height;
+    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+    }
+    y = rotate_buffer;
+    u = y + y_size;
+    v = u + uv_size;
+    y_stride = crop_width;
+    u_stride = v_stride = ((crop_width + 1) / 2);
+  }
+
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = YUY2ToI420(src, aligned_src_width * 2,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_UYVY:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = UYVYToI420(src, aligned_src_width * 2,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBP:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = RGB565ToI420(src, src_width * 2,
+                       y, y_stride,
+                       u, u_stride,
+                       v, v_stride,
+                       crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBO:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB1555ToI420(src, src_width * 2,
+                         y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_R444:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB4444ToI420(src, src_width * 2,
+                         y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_24BG:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RGB24ToI420(src, src_width * 3,
+                      y, y_stride,
+                      u, u_stride,
+                      v, v_stride,
+                      crop_width, inv_crop_height);
+      break;
+    case FOURCC_RAW:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RAWToI420(src, src_width * 3,
+                    y, y_stride,
+                    u, u_stride,
+                    v, v_stride,
+                    crop_width, inv_crop_height);
+      break;
+    case FOURCC_ARGB:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ARGBToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_BGRA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = BGRAToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ABGRToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = RGBAToI420(src, src_width * 4,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_I400:
+      src = sample + src_width * crop_y + crop_x;
+      r = I400ToI420(src, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    // Biplanar formats
+    case FOURCC_NV12:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * abs_src_height) +
+        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      r = NV12ToI420Rotate(src, src_width,
+                           src_uv, aligned_src_width,
+                           y, y_stride,
+                           u, u_stride,
+                           v, v_stride,
+                           crop_width, inv_crop_height, rotation);
+      break;
+    case FOURCC_NV21:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * abs_src_height) +
+        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      // Call NV12 but with u and v parameters swapped.
+      r = NV12ToI420Rotate(src, src_width,
+                           src_uv, aligned_src_width,
+                           y, y_stride,
+                           v, v_stride,
+                           u, u_stride,
+                           crop_width, inv_crop_height, rotation);
+      break;
+    case FOURCC_M420:
+      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+      r = M420ToI420(src, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    // Triplanar formats
+    case FOURCC_I420:
+    case FOURCC_YU12:
+    case FOURCC_YV12: {
+      const uint8* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      if (format == FOURCC_YV12) {
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (crop_y / 2) + crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (crop_y / 2) + crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      }
+      r = I420Rotate(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height, rotation);
+      break;
+    }
+    case FOURCC_I422:
+    case FOURCC_YV16: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      if (format == FOURCC_YV16) {
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height +
+            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      }
+      r = I422ToI420(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      if (format == FOURCC_YV24) {
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      } else {
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      }
+      r = I444ToI420(src_y, src_width,
+                     src_u, src_width,
+                     src_v, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I411: {
+      int quarterwidth = (src_width + 3) / 4;
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u = sample + src_width * abs_src_height +
+          quarterwidth * crop_y + crop_x / 4;
+      const uint8* src_v = sample + src_width * abs_src_height +
+          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+      r = I411ToI420(src_y, src_width,
+                     src_u, quarterwidth,
+                     src_v, quarterwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToI420(sample, sample_size,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     src_width, abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = I420Rotate(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     tmp_y, tmp_y_stride,
+                     tmp_u, tmp_u_stride,
+                     tmp_v, tmp_v_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/cpu_id.cc b/libs/libyuv/source/cpu_id.cc
new file mode 100644
index 0000000000..8d2c3a4957
--- /dev/null
+++ b/libs/libyuv/source/cpu_id.cc
@@ -0,0 +1,299 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/cpu_id.h"
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>  // For __cpuidex()
+#endif
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
+    defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h>  // For _xgetbv()
+#endif
+
+#if !defined(__native_client__)
+#include <stdlib.h>  // For getenv()
+#endif
+
+// For ArmCpuCaps() but unittested on all platforms
+#include <stdio.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h"  // For CPU_X86
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid additional check.
+#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else
+#define SAFEBUFFERS
+#endif
+
+// Low level cpuid for X86. Fills cpu_info[0..3] with eax, ebx, ecx, edx.
+#if (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER)
+LIBYUV_API
+void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
+#if defined(_MSC_VER) && !defined(__clang__)
+// Visual C version uses intrinsic or inline x86 assembly.
+#if (_MSC_FULL_VER >= 160040219)
+  __cpuidex((int*)(cpu_info), info_eax, info_ecx);  // Leaf eax, subleaf ecx.
+#elif defined(_M_IX86)  // Older 32 bit Visual C: inline assembly.
+  __asm {
+    mov        eax, info_eax
+    mov        ecx, info_ecx
+    mov        edi, cpu_info
+    cpuid
+    mov        [edi], eax
+    mov        [edi + 4], ebx
+    mov        [edi + 8], ecx
+    mov        [edi + 12], edx
+  }
+#else  // Visual C but not x86: __cpuid() is called with the leaf only.
+  if (info_ecx == 0) {
+    __cpuid((int*)(cpu_info), info_eax);
+  } else {
+    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;  // Unsupported.
+  }
+#endif
+// GCC version uses inline x86 assembly.
+#else  // defined(_MSC_VER) && !defined(__clang__)
+  uint32 info_ebx, info_edx;
+  asm volatile (
+#if defined( __i386__) && defined(__PIC__)
+    // Preserve ebx for fpic 32 bit: save it in edi, then swap the result out.
+    "mov %%ebx, %%edi                          \n"
+    "cpuid                                     \n"
+    "xchg %%edi, %%ebx                         \n"
+    : "=D" (info_ebx),
+#else
+    "cpuid                                     \n"
+    : "=b" (info_ebx),
+#endif  //  defined( __i386__) && defined(__PIC__)
+      "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+  cpu_info[0] = info_eax;
+  cpu_info[1] = info_ebx;
+  cpu_info[2] = info_ecx;
+  cpu_info[3] = info_edx;
+#endif  // defined(_MSC_VER) && !defined(__clang__)
+}
+#else  // (defined(_M_IX86) || defined(_M_X64) ...: stub reports all zeros.
+LIBYUV_API
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
+  cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
+}
+#endif
+
+// For VS2010 and earlier emit can be used:
+//   _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
+//  __asm {
+//    xor        ecx, ecx    // xcr 0
+//    xgetbv
+//    mov        xcr0, eax
+//  }
+// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
+// https://code.google.com/p/libyuv/issues/detail?id=529
+#if defined(_M_IX86) && (_MSC_VER < 1900)
+#pragma optimize("g", off)
+#endif
+#if (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
+#define HAS_XGETBV
+// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
+int GetXCR0() {  // Returns the low 32 bits of XCR0; 0 if neither branch builds.
+  uint32 xcr0 = 0u;
+#if (_MSC_FULL_VER >= 160040219)
+  xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
+#elif defined(__i386__) || defined(__x86_64__)
+  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");  // xgetbv
+#endif  // defined(__i386__) || defined(__x86_64__)
+  return xcr0;
+}
+#endif  // defined(_M_IX86) || defined(_M_X64) ..
+// Return optimization to previous setting.
+#if defined(_M_IX86) && (_MSC_VER < 1900)
+#pragma optimize("g", on)
+#endif
+
+// based on libvpx arm_cpudetect.c
+// For Arm, but public to allow testing on any CPU
+LIBYUV_API SAFEBUFFERS
+int ArmCpuCaps(const char* cpuinfo_name) {
+  // Reports kCpuHasNEON when a "Features" line of the given cpuinfo style
+  // text file lists " neon" or " asimd" followed by a space or newline.
+  int caps = 0;
+  char line[512];
+  FILE* file = fopen(cpuinfo_name, "r");
+  if (file == NULL) {
+    // Assume Neon if /proc/cpuinfo is unavailable.
+    // This will occur for Chrome sandbox for Pepper or Render process.
+    return kCpuHasNEON;
+  }
+  while (!caps && fgets(line, sizeof(line) - 1, file) != NULL) {
+    if (memcmp(line, "Features", 8) != 0) {
+      continue;  // Only "Features" lines are examined.
+    }
+    const char* neon = strstr(line, " neon");
+    // aarch64 uses asimd for Neon.
+    const char* asimd = strstr(line, " asimd");
+    if ((neon && (neon[5] == ' ' || neon[5] == '\n')) ||
+        (asimd && (asimd[6] == ' ' || asimd[6] == '\n'))) {
+      caps = kCpuHasNEON;
+    }
+  }
+  fclose(file);
+  return caps;
+}
+
+// CPU detect function for SIMD instruction sets.
+LIBYUV_API
+int cpu_info_ = 0;  // cpu_info is not initialized yet.
+
+// Test environment variable for disabling CPU features. Any non-zero value
+// to disable. Zero ignored to make it easy to set the variable on/off.
+#if !defined(__native_client__) && !defined(_M_ARM)
+
+// Returns LIBYUV_TRUE when environment variable |name| is set and its first
+// character is not '0'; returns LIBYUV_FALSE otherwise.
+static LIBYUV_BOOL TestEnv(const char* name) {
+  const char* value = getenv(name);
+  if (value == NULL || value[0] == '0') {
+    return LIBYUV_FALSE;
+  }
+  return LIBYUV_TRUE;
+}
+#else  // nacl does not support getenv().
+static LIBYUV_BOOL TestEnv(const char*) {
+  return LIBYUV_FALSE;  // The environment cannot be read in this build.
+}
+#endif
+
+LIBYUV_API SAFEBUFFERS
+int InitCpuFlags(void) {  // Detects CPU features and caches them in cpu_info_.
+  // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
+  int cpu_info = 0;
+#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
+  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
+  uint32 cpu_info1[4] = { 0, 0, 0, 0 };
+  uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+  CpuId(0, 0, cpu_info0);
+  CpuId(1, 0, cpu_info1);
+  if (cpu_info0[0] >= 7) {  // Leaf 7 is queried only when leaf 0 reports it.
+    CpuId(7, 0, cpu_info7);
+  }
+  cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+             ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+             ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+             ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
+             ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+             kCpuHasX86;
+
+#ifdef HAS_XGETBV
+  // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
+  if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) &&  // AVX and OSXSave
+      ((GetXCR0() & 6) == 6)) {  // Test OS saves YMM registers
+    cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
+
+    // Detect AVX512bw
+    if ((GetXCR0() & 0xe0) == 0xe0) {
+      cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
+    }
+  }
+#endif
+
+  // Environment variable overrides for testing.
+  if (TestEnv("LIBYUV_DISABLE_X86")) {
+    cpu_info &= ~kCpuHasX86;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSE2")) {
+    cpu_info &= ~kCpuHasSSE2;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
+    cpu_info &= ~kCpuHasSSSE3;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSE41")) {
+    cpu_info &= ~kCpuHasSSE41;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSE42")) {
+    cpu_info &= ~kCpuHasSSE42;
+  }
+  if (TestEnv("LIBYUV_DISABLE_AVX")) {
+    cpu_info &= ~kCpuHasAVX;
+  }
+  if (TestEnv("LIBYUV_DISABLE_AVX2")) {
+    cpu_info &= ~kCpuHasAVX2;
+  }
+  if (TestEnv("LIBYUV_DISABLE_ERMS")) {
+    cpu_info &= ~kCpuHasERMS;
+  }
+  if (TestEnv("LIBYUV_DISABLE_FMA3")) {
+    cpu_info &= ~kCpuHasFMA3;
+  }
+  if (TestEnv("LIBYUV_DISABLE_AVX3")) {
+    cpu_info &= ~kCpuHasAVX3;
+  }
+#endif
+#if defined(__mips__) && defined(__linux__)
+#if defined(__mips_dspr2)
+  cpu_info |= kCpuHasDSPR2;
+#endif
+  cpu_info |= kCpuHasMIPS;
+  if (TestEnv("LIBYUV_DISABLE_DSPR2")) {  // Same rule as the overrides above.
+    cpu_info &= ~kCpuHasDSPR2;
+  }
+#endif
+#if defined(__arm__) || defined(__aarch64__)
+// gcc -mfpu=neon defines __ARM_NEON__
+// __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
+// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
+#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
+  cpu_info = kCpuHasNEON;
+// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
+// flag in it.
+// So for aarch64, neon enabling is hard coded here.
+#endif
+#if defined(__aarch64__)
+  cpu_info = kCpuHasNEON;
+#else
+  // Parse the text file for neon detect; OR keeps the Neon assumed above.
+  cpu_info |= ArmCpuCaps("/proc/cpuinfo");
+#endif
+  cpu_info |= kCpuHasARM;
+  if (TestEnv("LIBYUV_DISABLE_NEON")) {
+    cpu_info &= ~kCpuHasNEON;
+  }
+#endif  // __arm__
+  if (TestEnv("LIBYUV_DISABLE_ASM")) {
+    cpu_info = 0;  // Drops every detected feature bit.
+  }
+  cpu_info  |= kCpuInitialized;
+  cpu_info_ = cpu_info;
+  return cpu_info;
+}
+
+// Note that use of this function is not thread safe.
+LIBYUV_API void MaskCpuFlags(int enable_flags) {
+  const int detected_flags = InitCpuFlags();  // Also refreshes cpu_info_.
+  cpu_info_ = detected_flags & enable_flags;  // Keep only the enabled bits.
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/mjpeg_decoder.cc b/libs/libyuv/source/mjpeg_decoder.cc
new file mode 100644
index 0000000000..50818418a6
--- /dev/null
+++ b/libs/libyuv/source/mjpeg_decoder.cc
@@ -0,0 +1,570 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include <assert.h>
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
+#include <setjmp.h>
+#define HAVE_SETJMP
+
+#if defined(_MSC_VER)
+// disable warning 4324: structure was padded due to __declspec(align())
+#pragma warning(disable:4324)
+#endif
+
+#endif
+struct FILE;  // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jpeglib.h>
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h"  // For CopyPlane().
+
+namespace libyuv {
+
+#ifdef HAVE_SETJMP
+struct SetJmpErrorMgr {
+  jpeg_error_mgr base;  // Must be at the top
+  jmp_buf setjmp_buffer;
+};
+#endif
+
+const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
+const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
+const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
+const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
+const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
+const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+
+// Methods that are passed to jpeglib.
+boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
+void init_source(jpeg_decompress_struct* cinfo);
+void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes);  // NOLINT
+void term_source(jpeg_decompress_struct* cinfo);
+void ErrorHandler(jpeg_common_struct* cinfo);
+
+MJpegDecoder::MJpegDecoder()
+    : has_scanline_padding_(LIBYUV_FALSE),
+      num_outbufs_(0),
+      scanlines_(NULL),
+      scanlines_sizes_(NULL),
+      databuf_(NULL),
+      databuf_strides_(NULL) {
+  decompress_struct_ = new jpeg_decompress_struct;
+  source_mgr_ = new jpeg_source_mgr;
+#ifdef HAVE_SETJMP
+  error_mgr_ = new SetJmpErrorMgr;
+  decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
+  // Replace the exit()-based error handler with the longjmp-based ErrorHandler.
+  error_mgr_->base.error_exit = &ErrorHandler;
+#endif
+  decompress_struct_->client_data = NULL;  // Pointed at buf_vec_ by LoadFrame().
+  source_mgr_->init_source = &init_source;
+  source_mgr_->fill_input_buffer = &fill_input_buffer;
+  source_mgr_->skip_input_data = &skip_input_data;
+  source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
+  source_mgr_->term_source = &term_source;
+  jpeg_create_decompress(decompress_struct_);
+  decompress_struct_->src = source_mgr_;  // Installed only after the create call.
+  buf_vec_.buffers = &buf_;
+  buf_vec_.len = 1;  // Exactly one input buffer (buf_) is handed to jpeglib.
+}
+
+MJpegDecoder::~MJpegDecoder() {
+  jpeg_destroy_decompress(decompress_struct_);  // Before the struct is freed.
+  delete decompress_struct_;
+  delete source_mgr_;
+#ifdef HAVE_SETJMP
+  delete error_mgr_;
+#endif
+  DestroyOutputBuffers();  // Frees scanlines_/databuf_ and their size arrays.
+}
+
+// Validates a JPEG frame, parses its header and sizes the per-component
+// iMCU-row buffers for it. Returns LIBYUV_FALSE when the frame is rejected.
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+  if (!ValidateJpeg(src, src_len)) {
+    return LIBYUV_FALSE;
+  }
+  buf_.data = src;
+  buf_.len = static_cast<int>(src_len);
+  buf_vec_.pos = 0;
+  decompress_struct_->client_data = &buf_vec_;
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called jpeg_read_header, it experienced an error, and we called
+    // longjmp() and rewound the stack to here. Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
+    // ERROR: Bad MJPEG header
+    return LIBYUV_FALSE;
+  }
+  AllocOutputBuffers(GetNumComponents());
+  for (int i = 0; i < num_outbufs_; ++i) {
+    const int scanlines_size = GetComponentScanlinesPerImcuRow(i);
+    const bool rows_changed = scanlines_sizes_[i] != scanlines_size;
+    if (rows_changed) {
+      // Allocated with new[], so release with delete [] (NULL is a no-op).
+      delete [] scanlines_[i];
+      scanlines_[i] = new uint8* [scanlines_size];
+      scanlines_sizes_[i] = scanlines_size;
+    }
+
+    // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
+    // to avoid memory errors, since jpeglib only reads full MCUs blocks. For
+    // the preceding scanlines, the padding is not needed/wanted because the
+    // following addresses will already be valid (they are the initial bytes of
+    // the next scanline) and will be overwritten when jpeglib writes out that
+    // next scanline.
+    const int databuf_stride = GetComponentStride(i);
+    // The buffer holds scanlines_size rows of databuf_stride bytes, so it must
+    // be rebuilt when either dimension changes, not only the stride.
+    if (rows_changed || databuf_strides_[i] != databuf_stride) {
+      delete [] databuf_[i];
+      databuf_[i] = new uint8[scanlines_size * databuf_stride];
+      databuf_strides_[i] = databuf_stride;
+    }
+
+    if (databuf_stride != GetComponentWidth(i)) {
+      has_scanline_padding_ = LIBYUV_TRUE;
+    }
+  }
+  return LIBYUV_TRUE;
+}
+
+static int DivideAndRoundUp(int numerator, int denominator) {
+  return (numerator + denominator - 1) / denominator;  // Ceiling, n >= 0, d > 0.
+}
+
+static int DivideAndRoundDown(int numerator, int denominator) {
+  return numerator / denominator;  // Plain truncating integer division.
+}
+
+// Returns the image_width field jpeglib holds for the last loaded frame.
+int MJpegDecoder::GetWidth() {
+  return decompress_struct_->image_width;
+}
+
+// Returns the image_height field jpeglib holds for the last loaded frame.
+int MJpegDecoder::GetHeight() {
+  return decompress_struct_->image_height;
+}
+
+// Returns the color space of the last loaded frame. The value is jpeglib's
+// jpeg_color_space, which the kColorSpace* constants mirror.
+int MJpegDecoder::GetColorSpace() {
+  return decompress_struct_->jpeg_color_space;
+}
+
+// Number of color components jpeglib reports for the loaded frame.
+int MJpegDecoder::GetNumComponents() {
+  return decompress_struct_->num_components;
+}
+
+// Horizontal sampling factor of the given component (no bounds check on index).
+int MJpegDecoder::GetHorizSampFactor(int component) {
+  return decompress_struct_->comp_info[component].h_samp_factor;
+}
+
+int MJpegDecoder::GetVertSampFactor(int component) {  // Vertical counterpart.
+  return decompress_struct_->comp_info[component].v_samp_factor;
+}
+
+int MJpegDecoder::GetHorizSubSampFactor(int component) {
+  const int max_factor = decompress_struct_->max_h_samp_factor;
+  return max_factor / GetHorizSampFactor(component);  // Ratio to the maximum.
+}
+
+int MJpegDecoder::GetVertSubSampFactor(int component) {
+  const int max_factor = decompress_struct_->max_v_samp_factor;
+  return max_factor / GetVertSampFactor(component);  // Ratio to the maximum.
+}
+
+int MJpegDecoder::GetImageScanlinesPerImcuRow() {  // Image rows per iMCU row.
+  return decompress_struct_->max_v_samp_factor * DCTSIZE;
+}
+
+int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
+  const int image_rows = GetImageScanlinesPerImcuRow();  // max_v_samp * DCTSIZE
+  return DivideAndRoundUp(image_rows, GetVertSubSampFactor(component));
+}
+
+int MJpegDecoder::GetComponentWidth(int component) {
+  const int image_width = GetWidth();  // Divided by the subsampling, rounded up.
+  return DivideAndRoundUp(image_width, GetHorizSubSampFactor(component));
+}
+
+int MJpegDecoder::GetComponentHeight(int component) {
+  const int image_height = GetHeight();  // Divided by subsampling, rounded up.
+  return DivideAndRoundUp(image_height, GetVertSubSampFactor(component));
+}
+
+// Component width rounded up to a multiple of DCTSIZE (mask needs a power of 2).
+int MJpegDecoder::GetComponentStride(int component) {
+  return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
+}
+
+int MJpegDecoder::GetComponentSize(int component) {  // Unpadded width * height.
+  return GetComponentWidth(component) * GetComponentHeight(component);
+}
+
+LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // jpeg_abort_decompress (below) hit an error and ErrorHandler longjmp()ed
+    // back to this setjmp, rewinding the stack. Report the failure.
+    return LIBYUV_FALSE;
+  }
+#endif
+  jpeg_abort_decompress(decompress_struct_);
+  return LIBYUV_TRUE;
+}
+
+// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
+    uint8** planes, int dst_width, int dst_height) {
+  if (dst_width != GetWidth() ||
+      dst_height > GetHeight()) {
+    // ERROR: Bad dimensions (width must match; height may only be cropped)
+    return LIBYUV_FALSE;
+  }
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called into jpeglib, it experienced an error sometime during this
+    // function call, and we called longjmp() and rewound the stack to here.
+    // Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  if (!StartDecode()) {
+    return LIBYUV_FALSE;
+  }
+  SetScanlinePointers(databuf_);
+  int lines_left = dst_height;
+  // Compute amount of lines to skip to implement vertical crop.
+  // TODO(fbarchard): Ensure skip is a multiple of maximum component
+  // subsample. ie 2
+  int skip = (GetHeight() - dst_height) / 2;
+  if (skip > 0) {
+    // There is no API to skip lines in the output data, so we read them
+    // into the temp buffer.
+    while (skip >= GetImageScanlinesPerImcuRow()) {
+      if (!DecodeImcuRow()) {
+        FinishDecode();
+        return LIBYUV_FALSE;
+      }
+      skip -= GetImageScanlinesPerImcuRow();
+    }
+    if (skip > 0) {
+      // Have a partial iMCU row left over to skip. Must read it and then
+      // copy the parts we want into the destination.
+      if (!DecodeImcuRow()) {
+        FinishDecode();
+        return LIBYUV_FALSE;
+      }
+      for (int i = 0; i < num_outbufs_; ++i) {
+        // TODO(fbarchard): Compute skip to avoid this
+        assert(skip % GetVertSubSampFactor(i) == 0);
+        int rows_to_skip =
+            DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
+                                rows_to_skip;
+        int data_to_skip = rows_to_skip * GetComponentStride(i);
+        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
+                  planes[i], GetComponentWidth(i),
+                  GetComponentWidth(i), scanlines_to_copy);
+        planes[i] += scanlines_to_copy * GetComponentWidth(i);
+      }
+      lines_left -= (GetImageScanlinesPerImcuRow() - skip);
+    }
+  }
+
+  // Decode whole iMCU rows, dropping the stride padding while copying out.
+  for (; lines_left > GetImageScanlinesPerImcuRow();
+         lines_left -= GetImageScanlinesPerImcuRow()) {
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    for (int i = 0; i < num_outbufs_; ++i) {
+      int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
+      CopyPlane(databuf_[i], GetComponentStride(i),
+                planes[i], GetComponentWidth(i),
+                GetComponentWidth(i), scanlines_to_copy);
+      planes[i] += scanlines_to_copy * GetComponentWidth(i);
+    }
+  }
+
+  if (lines_left > 0) {
+    // Final iMCU row: full or partial, since the loop above stops at '>'.
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    for (int i = 0; i < num_outbufs_; ++i) {
+      int scanlines_to_copy =
+          DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
+      CopyPlane(databuf_[i], GetComponentStride(i),
+                planes[i], GetComponentWidth(i),
+                GetComponentWidth(i), scanlines_to_copy);
+      planes[i] += scanlines_to_copy * GetComponentWidth(i);
+    }
+  }
+  return FinishDecode();
+}
+
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
+    int dst_width, int dst_height) {
+  if (dst_width != GetWidth() ||
+      dst_height > GetHeight()) {
+    // ERROR: Bad dimensions (width must match; height may only be cropped)
+    return LIBYUV_FALSE;
+  }
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called into jpeglib, it experienced an error sometime during this
+    // function call, and we called longjmp() and rewound the stack to here.
+    // Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  if (!StartDecode()) {
+    return LIBYUV_FALSE;
+  }
+  SetScanlinePointers(databuf_);
+  int lines_left = dst_height;
+  // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop
+  int skip = (GetHeight() - dst_height) / 2;
+  if (skip > 0) {
+    while (skip >= GetImageScanlinesPerImcuRow()) {
+      if (!DecodeImcuRow()) {
+        FinishDecode();
+        return LIBYUV_FALSE;
+      }
+      skip -= GetImageScanlinesPerImcuRow();
+    }
+    if (skip > 0) {
+      // Have a partial iMCU row left over to skip.
+      if (!DecodeImcuRow()) {
+        FinishDecode();
+        return LIBYUV_FALSE;
+      }
+      for (int i = 0; i < num_outbufs_; ++i) {
+        // TODO(fbarchard): Compute skip to avoid this
+        assert(skip % GetVertSubSampFactor(i) == 0);
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int data_to_skip = rows_to_skip * GetComponentStride(i);
+        // Change our own data buffer pointers so we can pass them to the
+        // callback.
+        databuf_[i] += data_to_skip;
+      }
+      int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
+      (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
+      // Now change them back (same offsets as computed in the loop above).
+      for (int i = 0; i < num_outbufs_; ++i) {
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int data_to_skip = rows_to_skip * GetComponentStride(i);
+        databuf_[i] -= data_to_skip;
+      }
+      lines_left -= scanlines_to_copy;
+    }
+  }
+  // Hand whole iMCU rows to the callback until fewer than one row remains.
+  for (; lines_left >= GetImageScanlinesPerImcuRow();
+         lines_left -= GetImageScanlinesPerImcuRow()) {
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
+  }
+  if (lines_left > 0) {
+    // Have a partial iMCU row left over to decode.
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    (*fn)(opaque, databuf_, databuf_strides_, lines_left);
+  }
+  return FinishDecode();
+}
+
+void init_source(j_decompress_ptr cinfo) { fill_input_buffer(cinfo); }
+
+boolean fill_input_buffer(j_decompress_ptr cinfo) {
+  BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
+  if (buf_vec->pos >= buf_vec->len) {
+    assert(0 && "No more data");
+    return FALSE;  // ERROR: No more data
+  }
+  cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
+  cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
+  ++buf_vec->pos;
+  return TRUE;
+}
+
+// Skips input while keeping bytes_in_buffer in step; never passes buffer end.
+void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {  // NOLINT
+  if (num_bytes <= 0) return;  // Non-positive skip requests are no-ops.
+  size_t skip = static_cast<size_t>(num_bytes);
+  if (skip > cinfo->src->bytes_in_buffer) skip = cinfo->src->bytes_in_buffer;
+  cinfo->src->next_input_byte += skip;
+  cinfo->src->bytes_in_buffer -= skip;
+}
+
+void term_source(j_decompress_ptr cinfo) {}  // Nothing to do.
+
+#ifdef HAVE_SETJMP
+void ErrorHandler(j_common_ptr cinfo) {
+  // This is called when a jpeglib command experiences an error. Unfortunately
+  // jpeglib's error handling model is not very flexible, because it expects the
+  // error handler to not return--i.e., it wants the program to terminate. To
+  // recover from errors we use setjmp() as shown in their example. setjmp() is
+  // C's implementation for the "call with current continuation" functionality
+  // seen in some functional programming languages.
+  // A formatted message can be output, but is unsafe for release.
+#ifdef DEBUG
+  char buf[JMSG_LENGTH_MAX];
+  (*cinfo->err->format_message)(cinfo, buf);
+  // ERROR: Error in jpeglib: buf (formatted above but not emitted anywhere)
+#endif
+
+  SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
+  // The cast relies on 'base' being the first member of SetJmpErrorMgr. The
+  // longjmp() makes the matching setjmp() return a second time, with value 1.
+  longjmp(mgr->setjmp_buffer, 1);
+}
+#endif
+
+// Resizes the per-component bookkeeping arrays when the count changes.
+void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
+  if (num_outbufs == num_outbufs_) {
+    return;  // Arrays already have the requested length.
+  }
+  // We could perhaps optimize this case to resize the output buffers without
+  // necessarily having to delete and recreate each one, but it's not worth it.
+  DestroyOutputBuffers();
+
+  scanlines_ = new uint8** [num_outbufs];
+  scanlines_sizes_ = new int[num_outbufs];
+  databuf_ = new uint8* [num_outbufs];
+  databuf_strides_ = new int[num_outbufs];
+
+  for (int c = 0; c < num_outbufs; ++c) {
+    scanlines_[c] = NULL;
+    scanlines_sizes_[c] = 0;
+    databuf_[c] = NULL;
+    databuf_strides_[c] = 0;
+  }
+  num_outbufs_ = num_outbufs;
+}
+
+void MJpegDecoder::DestroyOutputBuffers() {
+  while (num_outbufs_ > 0) {  // Per-component buffers first; count ends at 0.
+    --num_outbufs_;
+    delete [] scanlines_[num_outbufs_];
+    delete [] databuf_[num_outbufs_];
+  }
+  delete [] scanlines_;
+  scanlines_ = NULL;
+  delete [] databuf_;
+  databuf_ = NULL;
+  delete [] scanlines_sizes_;
+  scanlines_sizes_ = NULL;
+  delete [] databuf_strides_;
+  databuf_strides_ = NULL;
+}
+
+// Sets fast raw-output options (JDCT_IFAST, no block smoothing), then starts.
+LIBYUV_BOOL MJpegDecoder::StartDecode() {
+  decompress_struct_->raw_data_out = TRUE;
+  decompress_struct_->dct_method = JDCT_IFAST;  // JDCT_ISLOW is default
+  decompress_struct_->dither_mode = JDITHER_NONE;
+  // Not applicable to 'raw':
+  decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
+  // Only for buffered mode:
+  decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
+  // Blocky but fast:
+  decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
+
+  if (!jpeg_start_decompress(decompress_struct_)) {
+    // ERROR: Couldn't start JPEG decompressor
+    return LIBYUV_FALSE;
+  }
+  return LIBYUV_TRUE;
+}
+
+LIBYUV_BOOL MJpegDecoder::FinishDecode() {
+  // jpeglib considers it an error if we finish without decoding the whole
+  // image, so we call "abort" rather than "finish".
+  jpeg_abort_decompress(decompress_struct_);
+  return LIBYUV_TRUE;  // Always reports success.
+}
+
+void MJpegDecoder::SetScanlinePointers(uint8** data) {
+  for (int i = 0; i < num_outbufs_; ++i) {
+    const int stride = GetComponentStride(i);  // Same for every row of plane i.
+    uint8* row = data[i];
+    for (int j = 0; j < scanlines_sizes_[i]; ++j, row += stride) {
+      scanlines_[i][j] = row;
+    }
+  }
+}
+
+inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
+  // Requests one full iMCU row; the read succeeds only if jpeglib reports
+  // exactly that many scanlines.
+  const unsigned int want = (unsigned int)(GetImageScanlinesPerImcuRow());
+  return want == jpeg_read_raw_data(decompress_struct_, scanlines_, want);
+}
+
+// Maps per-component subsampling factors onto a JpegSubsamplingType.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+    int* subsample_x, int* subsample_y, int number_of_components) {
+  if (number_of_components == 1) {  // Grey-scale images.
+    const bool full_res = subsample_x[0] == 1 && subsample_y[0] == 1;
+    return full_res ? kJpegYuv400 : kJpegUnknown;
+  }
+  if (number_of_components != 3) {
+    return kJpegUnknown;
+  }
+  // Color images: full-res luma, identical subsampling on both chroma planes.
+  if (subsample_x[0] != 1 || subsample_y[0] != 1 ||
+      subsample_x[1] != subsample_x[2] || subsample_y[1] != subsample_y[2]) {
+    return kJpegUnknown;
+  }
+  if (subsample_x[1] == 2 && subsample_y[1] == 2) {
+    return kJpegYuv420;
+  } else if (subsample_x[1] == 2 && subsample_y[1] == 1) {
+    return kJpegYuv422;
+  } else if (subsample_x[1] == 1 && subsample_y[1] == 1) {
+    return kJpegYuv444;
+  }
+  return kJpegUnknown;
+}
+
+}  // namespace libyuv
+#endif  // HAVE_JPEG
+
diff --git a/libs/libyuv/source/mjpeg_validate.cc b/libs/libyuv/source/mjpeg_validate.cc
new file mode 100644
index 0000000000..9c48832045
--- /dev/null
+++ b/libs/libyuv/source/mjpeg_validate.cc
@@ -0,0 +1,71 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#include <string.h>  // For memchr.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Returns LIBYUV_TRUE if an EOI marker (0xff 0xd9) occurs anywhere in sample.
+static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+  if (sample_size >= 2) {
+    const uint8* end = sample + sample_size - 1;  // Last byte: it[1] is in range.
+    const uint8* it = sample;
+    while (it < end) {
+      // TODO(fbarchard): scan for 0xd9 instead.
+      it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
+      if (it == NULL) {
+        break;
+      }
+      if (it[1] == 0xd9) {
+        return LIBYUV_TRUE;  // Success: Valid jpeg.
+      }
+      ++it;  // Skip over current 0xff.
+    }
+  }
+  // ERROR: Invalid jpeg end code not found. Size sample_size
+  return LIBYUV_FALSE;
+}
+
+// Checks size limits, the SOI marker, and that an EOI marker is present.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+  // Maximum size that ValidateJpeg will consider valid.
+  const size_t kMaxJpegSize = 0x7fffffffull;
+  const size_t kBackSearchSize = 1024;  // Tail bytes searched first for EOI.
+  if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
+    // ERROR: Invalid jpeg size: sample_size
+    return LIBYUV_FALSE;
+  }
+  if (sample[0] != 0xff || sample[1] != 0xd8) {  // SOI marker
+    // ERROR: Invalid jpeg initial start code
+    return LIBYUV_FALSE;
+  }
+
+  // Look for the End Of Image (EOI) marker near the end of the buffer.
+  if (sample_size > kBackSearchSize) {
+    if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
+    }
+    // Tail already searched; keep 1 byte of overlap for a straddling marker.
+    sample_size = sample_size - kBackSearchSize + 1;
+  }
+  // Step over SOI marker and scan for EOI.
+  return ScanEOI(sample + 2, sample_size - 2);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/libs/libyuv/source/planar_functions.cc b/libs/libyuv/source/planar_functions.cc
new file mode 100644
index 0000000000..851c0fea91
--- /dev/null
+++ b/libs/libyuv/source/planar_functions.cc
@@ -0,0 +1,2629 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/planar_functions.h"
+
+#include <string.h>  // for memset()
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"  // for ScaleRowDown2
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copies a width x height plane of bytes row by row, honouring both strides.
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int y;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  // Coalesce rows: a contiguous plane is copied as one long row.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Nothing to do: source and destination are the same plane.
+  if (src_y == dst_y && src_stride_y == dst_stride_y) {
+    return;
+  }
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+
+  // Copy plane with the last row function selected above.
+  for (y = 0; y < height; ++y) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+                  uint16* dst_y, int dst_stride_y,
+                  int width, int height) {
+  int y;
+  void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
+  // Coalesce rows: a contiguous plane is copied as one long row.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_COPYROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_16_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_16_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_16_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_16_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_16_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_16_MIPS;
+  }
+#endif
+
+  // Copy plane of 16-bit samples; strides advance uint16 pointers (elements).
+  for (y = 0; y < height; ++y) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Copies the Y, U and V planes of an I422 image; 0 on success, -1 on bad args.
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  if (src_y == NULL || src_u == NULL || src_v == NULL ||
+      dst_y == NULL || dst_u == NULL || dst_v == NULL ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  const int chroma_width = (width + 1) >> 1;  // Half width, rounded up.
+  if (height < 0) {
+    // Negative height means invert the image: read the source bottom-up.
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+    src_u += (height - 1) * src_stride_u;
+    src_stride_u = -src_stride_u;
+    src_v += (height - 1) * src_stride_v;
+    src_stride_v = -src_stride_v;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, chroma_width, height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, chroma_width, height);
+  return 0;
+}
+
+// Copies the Y, U and V planes of an I444 image; 0 on success, -1 on bad args.
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  if (src_y == NULL || src_u == NULL || src_v == NULL ||
+      dst_y == NULL || dst_u == NULL || dst_v == NULL ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height means invert the image: read the source bottom-up.
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+    src_u += (height - 1) * src_stride_u;
+    src_stride_u = -src_stride_u;
+    src_v += (height - 1) * src_stride_v;
+    src_stride_v = -src_stride_v;
+  }
+  // All three planes are full resolution in I444.
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+  return 0;
+}
+
+// Copies a single Y plane; returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (src_y == NULL || dst_y == NULL || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height means invert the image: read the source bottom-up.
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Converts I420 to I400 by copying only the Y plane; U and V are ignored.
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (src_y == NULL || dst_y == NULL || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height means invert the image: read the source bottom-up.
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Mirrors a plane one row at a time with MirrorRow; no argument validation.
+void MirrorPlane(const uint8* src_y, int src_stride_y,
+                 uint8* dst_y, int dst_stride_y,
+                 int width, int height) {
+  int y;
+  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+  // Negative height means invert the image: read the source bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+#if defined(HAS_MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorRow = MirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorRow = MirrorRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorRow = MirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorRow = MirrorRow_AVX2;
+    }
+  }
+#endif
+// TODO(fbarchard): Mirror on mips handle unaligned memory.
+#if defined(HAS_MIRRORROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
+    MirrorRow = MirrorRow_DSPR2;
+  }
+#endif
+
+  // Mirror plane with the last row function selected above.
+  for (y = 0; y < height; ++y) {
+    MirrorRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Convert YUY2 to I422. Always returns 0; arguments are not validated here.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width) =
+      YUY2ToUV422Row_C;
+  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+      YUY2ToYRow_C;
+  // Negative height means invert the image: read the source bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  // Coalesce rows: contiguous source and planes are processed as one long row.
+  if (src_stride_yuy2 == width * 2 &&
+      dst_stride_y == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+      YUY2ToYRow = YUY2ToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (width >= 16) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+      YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {  // One packed source row fills all 3 planes.
+    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Convert UYVY to I422.
+// Every source row is handed to two row functions: one writes the Y samples
+// to dst_y, the other writes the U and V samples to dst_u and dst_v.
+// A negative height reads the source from its last row upward (inverted image).
+// Returns 0 on success, or -1 when a pointer is NULL, width is not positive
+// or height is zero.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*UYVYToUV422Row)(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width) =
+      UYVYToUV422Row_C;
+  void (*UYVYToYRow)(const uint8* src_uyvy,
+                     uint8* dst_y, int width) = UYVYToYRow_C;
+  // Validate arguments the same way the other planar functions in this file
+  // do, instead of handing NULL planes or a bad width to the row functions.
+  if (!src_uyvy || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+  // Coalesce rows: when every plane is tightly packed the whole image is
+  // processed as one long row.
+  if (src_stride_uyvy == width * 2 &&
+      dst_stride_y == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+  // Pick row functions for the CPU. Later blocks override earlier ones.
+#if defined(HAS_UYVYTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+    UYVYToYRow = UYVYToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToUV422Row = UYVYToUV422Row_SSE2;
+      UYVYToYRow = UYVYToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
+    UYVYToYRow = UYVYToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToUV422Row = UYVYToUV422Row_AVX2;
+      UYVYToYRow = UYVYToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    // The Any_NEON UV variant is only selected for rows of 16+ pixels;
+    // narrower rows keep the C UV function.
+    if (width >= 16) {
+      UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_NEON;
+      UYVYToUV422Row = UYVYToUV422Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+    src_uyvy += src_stride_uyvy;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Mirror I400 (a single Y plane) with optional flipping.
+// The work is delegated to MirrorPlane. A negative height makes the source be
+// read from its last row upward.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (!src_y || !dst_y) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: point at the last source row and step backward.
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+
+  MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Mirror I420 with optional flipping.
+// The Y plane is mirrored at full size; the U and V planes are mirrored at
+// (width + 1) / 2 by (height + 1) / 2. A negative height makes all three
+// source planes be read from their last row upward.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int halfwidth;
+  int halfheight;
+  int flip;
+  if (!src_y || !src_u || !src_v) {
+    return -1;
+  }
+  if (!dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  flip = height < 0;
+  if (flip) {
+    height = -height;
+  }
+  // Chroma dimensions, rounded up, derived from the now positive height.
+  halfwidth = (width + 1) >> 1;
+  halfheight = (height + 1) >> 1;
+  if (flip) {
+    // Start each source plane at its last row and walk upward.
+    src_y += (height - 1) * src_stride_y;
+    src_u += (halfheight - 1) * src_stride_u;
+    src_v += (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  // dst_y was already verified to be non-NULL above.
+  MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+  MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+  return 0;
+}
+
+// ARGB mirror.
+// Each source row is passed through an ARGBMirrorRow function chosen for the
+// CPU and width. A negative height reads the source from its last row upward.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int rows_left;
+  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = ARGBMirrorRow_C;
+  if (!src_argb || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last source row and step backward.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Row function selection; a later matching block overrides an earlier one.
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorRow = IS_ALIGNED(width, 4) ? ARGBMirrorRow_NEON
+                                     : ARGBMirrorRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MirrorRow = IS_ALIGNED(width, 4) ? ARGBMirrorRow_SSE2
+                                     : ARGBMirrorRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorRow = IS_ALIGNED(width, 8) ? ARGBMirrorRow_AVX2
+                                     : ARGBMirrorRow_Any_AVX2;
+  }
+#endif
+
+  // Mirror the plane one row at a time.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    MirrorRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Get a blend row function suited to the current CPU.
+// SSSE3 is preferred when compiled in and detected, then NEON; otherwise the
+// C implementation is returned. Callers should try to reuse the returned
+// function for all pixels if possible.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend() {
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    return ARGBBlendRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    return ARGBBlendRow_NEON;
+  }
+#endif
+  return ARGBBlendRow_C;
+}
+
+// Alpha Blend 2 ARGB images and store to destination.
+// The row blender is obtained from GetARGBBlend(). A negative height writes
+// the destination from its last row upward.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+              const uint8* src_argb1, int src_stride_argb1,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  int rows_left;
+  int packed_stride;
+  void (*BlendRow)(const uint8* src_argb, const uint8* src_argb1,
+                   uint8* dst_argb, int width) = GetARGBBlend();
+  if (!src_argb0 || !src_argb1 || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: write the destination bottom-up.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // When all three images are tightly packed, treat them as one long row.
+  packed_stride = width * 4;
+  if (src_stride_argb0 == packed_stride &&
+      src_stride_argb1 == packed_stride &&
+      dst_stride_argb == packed_stride) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = 0;
+    src_stride_argb1 = 0;
+    dst_stride_argb = 0;
+  }
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    BlendRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Alpha Blend plane and store to destination.
+// Blends src_y0 and src_y1 into dst_y row by row, using the matching row of
+// the alpha plane. A negative height writes the destination bottom-up.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int BlendPlane(const uint8* src_y0, int src_stride_y0,
+               const uint8* src_y1, int src_stride_y1,
+               const uint8* alpha, int alpha_stride,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int rows_left;
+  void (*BlendRow)(const uint8* src0, const uint8* src1,
+                   const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+  if (!src_y0 || !src_y1 || !alpha || !dst_y) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: write the destination from its last row upward.
+    height = -height;
+    dst_y += (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+
+  // When all four planes are tightly packed, process them as one long row.
+  if (src_stride_y0 == width &&
+      src_stride_y1 == width &&
+      alpha_stride == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y0 = 0;
+    src_stride_y1 = 0;
+    alpha_stride = 0;
+    dst_stride_y = 0;
+  }
+
+  // Row function selection; AVX2, when available, overrides SSSE3.
+#if defined(HAS_BLENDPLANEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    BlendRow = IS_ALIGNED(width, 8) ? BlendPlaneRow_SSSE3
+                                    : BlendPlaneRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    BlendRow = IS_ALIGNED(width, 32) ? BlendPlaneRow_AVX2
+                                     : BlendPlaneRow_Any_AVX2;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    BlendRow(src_y0, src_y1, alpha, dst_y, width);
+    src_y0 += src_stride_y0;
+    src_y1 += src_stride_y1;
+    alpha += alpha_stride;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+#define MAXTWIDTH 2048
+// Alpha Blend YUV images and store to destination.
+// The Y planes are blended by BlendPlane using the full-size alpha plane.
+// For the U and V planes, every pair of alpha rows is reduced by
+// ScaleRowDown2 into one half-width row (halfalpha), which is then used to
+// blend one U row and one V row.
+// A negative height writes all three destination planes bottom-up.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int I420Blend(const uint8* src_y0, int src_stride_y0,
+              const uint8* src_u0, int src_stride_u0,
+              const uint8* src_v0, int src_stride_v0,
+              const uint8* src_y1, int src_stride_y1,
+              const uint8* src_u1, int src_stride_u1,
+              const uint8* src_v1, int src_stride_v1,
+              const uint8* alpha, int alpha_stride,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height) {
+  int y;
+  // Half width for UV, rounded up.
+  int halfwidth = (width + 1) >> 1;
+  void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
+      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+  if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
+      !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    int halfheight;
+    height = -height;
+    // Number of chroma rows the loop below writes (it steps y by 2).
+    halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+    // Flip the chroma destinations as well. Flipping only dst_y would leave
+    // the output with inverted luma but upright chroma.
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+
+  // Blend Y plane. height is positive here; the flip, if any, is carried by
+  // the negative dst_stride_y.
+  BlendPlane(src_y0, src_stride_y0,
+             src_y1, src_stride_y1,
+             alpha, alpha_stride,
+             dst_y, dst_stride_y,
+             width, height);
+
+  // Row blender for the half-width chroma rows.
+#if defined(HAS_BLENDPLANEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      BlendPlaneRow = BlendPlaneRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      BlendPlaneRow = BlendPlaneRow_AVX2;
+    }
+  }
+#endif
+  // Alpha downscaler: odd widths use the _Odd_ variants.
+  if (!IS_ALIGNED(width, 2)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
+  }
+#if defined(HAS_SCALEROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_NEON;
+      if (IS_ALIGNED(halfwidth, 16)) {
+        ScaleRowDown2 = ScaleRowDown2Box_NEON;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;
+      if (IS_ALIGNED(halfwidth, 16)) {
+        ScaleRowDown2 = ScaleRowDown2Box_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2;
+      if (IS_ALIGNED(halfwidth, 32)) {
+        ScaleRowDown2 = ScaleRowDown2Box_AVX2;
+      }
+    }
+  }
+#endif
+
+  // Row buffer for intermediate alpha pixels.
+  align_buffer_64(halfalpha, halfwidth);
+  for (y = 0; y < height; y += 2) {
+    // last row of odd height image use 1 row of alpha instead of 2.
+    if (y == (height - 1)) {
+      alpha_stride = 0;
+    }
+    // Subsample 2 rows of alpha to one half-width row.
+    ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth);
+    alpha += alpha_stride * 2;
+    BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth);
+    BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth);
+    src_u0 += src_stride_u0;
+    src_u1 += src_stride_u1;
+    dst_u += dst_stride_u;
+    src_v0 += src_stride_v0;
+    src_v1 += src_stride_v1;
+    dst_v += dst_stride_v;
+  }
+  free_aligned_buffer_64(halfalpha);
+  return 0;
+}
+
+// Multiply 2 ARGB images and store to destination.
+// Rows of src_argb0 and src_argb1 are combined by an ARGBMultiplyRow function
+// chosen for the CPU and width. A negative height writes the destination
+// bottom-up.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height) {
+  int rows_left;
+  int packed_stride;
+  void (*MultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
+                      int width) = ARGBMultiplyRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: write the destination from its last row upward.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // When all three images are tightly packed, treat them as one long row.
+  packed_stride = width * 4;
+  if (src_stride_argb0 == packed_stride &&
+      src_stride_argb1 == packed_stride &&
+      dst_stride_argb == packed_stride) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = 0;
+    src_stride_argb1 = 0;
+    dst_stride_argb = 0;
+  }
+  // Row function selection; a later matching block overrides an earlier one.
+#if defined(HAS_ARGBMULTIPLYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MultiplyRow = IS_ALIGNED(width, 4) ? ARGBMultiplyRow_SSE2
+                                       : ARGBMultiplyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MultiplyRow = IS_ALIGNED(width, 8) ? ARGBMultiplyRow_AVX2
+                                       : ARGBMultiplyRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MultiplyRow = IS_ALIGNED(width, 8) ? ARGBMultiplyRow_NEON
+                                       : ARGBMultiplyRow_Any_NEON;
+  }
+#endif
+
+  // Multiply the plane one row at a time.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    MultiplyRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Add 2 ARGB images and store to destination.
+// Rows of src_argb0 and src_argb1 are combined by an ARGBAddRow function
+// chosen for the CPU and width. A negative height writes the destination
+// bottom-up.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+            const uint8* src_argb1, int src_stride_argb1,
+            uint8* dst_argb, int dst_stride_argb,
+            int width, int height) {
+  int rows_left;
+  int packed_stride;
+  void (*AddRow)(const uint8* src0, const uint8* src1, uint8* dst,
+                 int width) = ARGBAddRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: write the destination from its last row upward.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // When all three images are tightly packed, treat them as one long row.
+  packed_stride = width * 4;
+  if (src_stride_argb0 == packed_stride &&
+      src_stride_argb1 == packed_stride &&
+      dst_stride_argb == packed_stride) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = 0;
+    src_stride_argb1 = 0;
+    dst_stride_argb = 0;
+  }
+  // Row function selection; a later matching block overrides an earlier one.
+#if defined(HAS_ARGBADDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+#if defined(_MSC_VER) && !defined(__clang__)
+    // MSVC (non-clang) builds use the SSE2 row for every width.
+    AddRow = ARGBAddRow_SSE2;
+#else
+    AddRow = IS_ALIGNED(width, 4) ? ARGBAddRow_SSE2 : ARGBAddRow_Any_SSE2;
+#endif
+  }
+#endif
+#if defined(HAS_ARGBADDROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    AddRow = IS_ALIGNED(width, 8) ? ARGBAddRow_AVX2 : ARGBAddRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBADDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    AddRow = IS_ALIGNED(width, 8) ? ARGBAddRow_NEON : ARGBAddRow_Any_NEON;
+  }
+#endif
+
+  // Add the plane one row at a time.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    AddRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Subtract 2 ARGB images and store to destination.
+// Rows of src_argb0 and src_argb1 are combined by an ARGBSubtractRow function
+// chosen for the CPU and width. A negative height writes the destination
+// bottom-up.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height) {
+  int rows_left;
+  int packed_stride;
+  void (*SubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
+                      int width) = ARGBSubtractRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: write the destination from its last row upward.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // When all three images are tightly packed, treat them as one long row.
+  packed_stride = width * 4;
+  if (src_stride_argb0 == packed_stride &&
+      src_stride_argb1 == packed_stride &&
+      dst_stride_argb == packed_stride) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = 0;
+    src_stride_argb1 = 0;
+    dst_stride_argb = 0;
+  }
+  // Row function selection; a later matching block overrides an earlier one.
+#if defined(HAS_ARGBSUBTRACTROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SubtractRow = IS_ALIGNED(width, 4) ? ARGBSubtractRow_SSE2
+                                       : ARGBSubtractRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SubtractRow = IS_ALIGNED(width, 8) ? ARGBSubtractRow_AVX2
+                                       : ARGBSubtractRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SubtractRow = IS_ALIGNED(width, 8) ? ARGBSubtractRow_NEON
+                                       : ARGBSubtractRow_Any_NEON;
+  }
+#endif
+
+  // Subtract the plane one row at a time.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    SubtractRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+// Convert I422 to RGBA with matrix.
+// Each row of Y, U and V is converted by an I422ToRGBARow function chosen for
+// the CPU and width, using the caller supplied yuvconstants. A negative
+// height writes the destination from its last row upward.
+// Returns 0 on success, -1 when a plane pointer is NULL, width <= 0 or
+// height == 0.
+static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
+                            const uint8* src_u, int src_stride_u,
+                            const uint8* src_v, int src_stride_v,
+                            uint8* dst_rgba, int dst_stride_rgba,
+                            const struct YuvConstants* yuvconstants,
+                            int width, int height) {
+  int rows_left;
+  void (*ConvertRow)(const uint8* y_buf,
+                     const uint8* u_buf,
+                     const uint8* v_buf,
+                     uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) = I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: write the destination bottom-up.
+    height = -height;
+    dst_rgba += (height - 1) * dst_stride_rgba;
+    dst_stride_rgba = -dst_stride_rgba;
+  }
+  // Row function selection; a later matching block overrides an earlier one.
+#if defined(HAS_I422TORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? I422ToRGBARow_SSSE3
+                                      : I422ToRGBARow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow = IS_ALIGNED(width, 16) ? I422ToRGBARow_AVX2
+                                       : I422ToRGBARow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? I422ToRGBARow_NEON
+                                      : I422ToRGBARow_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_DSPR2)
+  // The DSPR2 row is only used when width, pointers and strides all meet its
+  // alignment requirements.
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
+    ConvertRow = I422ToRGBARow_DSPR2;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ConvertRow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_rgba += dst_stride_rgba;
+  }
+  return 0;
+}
+
+// Convert I422 to RGBA.
+// Thin wrapper: forwards every argument unchanged to I422ToRGBAMatrix together
+// with the kYuvI601Constants matrix, and returns that function's result
+// (0 on success, -1 on invalid arguments).
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_rgba, dst_stride_rgba,
+                          &kYuvI601Constants,
+                          width, height);
+}
+
+// Convert I422 to BGRA.
+// Implemented on top of I422ToRGBAMatrix: the V plane is passed in the U
+// slot and the U plane in the V slot, and the kYvuI601Constants matrix is
+// used instead of kYuvI601Constants. Returns I422ToRGBAMatrix's result
+// (0 on success, -1 on invalid arguments).
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // Swap U and V
+                          src_u, src_stride_u,
+                          dst_bgra, dst_stride_bgra,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert NV12 to RGB565.
+// Each Y row is converted together with a UV row using kYuvI601Constants; the
+// UV pointer advances only after every second Y row, so two consecutive Y
+// rows share one UV row. A negative height writes the destination bottom-up.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_uv, int src_stride_uv,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int row;
+  void (*ConvertRow)(const uint8* y_buf,
+                     const uint8* uv_buf,
+                     uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) = NV12ToRGB565Row_C;
+  if (!src_y || !src_uv || !dst_rgb565) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: write the destination from its last row upward.
+    height = -height;
+    dst_rgb565 += (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+  // Row function selection; a later matching block overrides an earlier one.
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? NV12ToRGB565Row_SSSE3
+                                      : NV12ToRGB565Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow = IS_ALIGNED(width, 16) ? NV12ToRGB565Row_AVX2
+                                       : NV12ToRGB565Row_Any_AVX2;
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? NV12ToRGB565Row_NEON
+                                      : NV12ToRGB565Row_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ConvertRow(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
+    src_y += src_stride_y;
+    dst_rgb565 += dst_stride_rgb565;
+    // Step to the next UV row only after an odd numbered Y row.
+    src_uv += (row & 1) ? src_stride_uv : 0;
+  }
+  return 0;
+}
+
+// Convert RAW to RGB24.
+// Each source row is passed through a RAWToRGB24Row function chosen for the
+// CPU and width. A negative height reads the source from its last row upward.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
+               uint8* dst_rgb24, int dst_stride_rgb24,
+               int width, int height) {
+  int rows_left;
+  int packed_stride;
+  void (*ConvertRow)(const uint8* src_rgb, uint8* dst_rgb24, int width) =
+      RAWToRGB24Row_C;
+  if (!src_raw || !dst_rgb24) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last source row and step backward.
+    height = -height;
+    src_raw += (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  // When both images are tightly packed (3 bytes per pixel), treat them as
+  // one long row.
+  packed_stride = width * 3;
+  if (src_stride_raw == packed_stride &&
+      dst_stride_rgb24 == packed_stride) {
+    width *= height;
+    height = 1;
+    src_stride_raw = 0;
+    dst_stride_rgb24 = 0;
+  }
+  // Row function selection; a later matching block overrides an earlier one.
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? RAWToRGB24Row_SSSE3
+                                      : RAWToRGB24Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_RAWTORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ConvertRow = IS_ALIGNED(width, 8) ? RAWToRGB24Row_NEON
+                                      : RAWToRGB24Row_Any_NEON;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ConvertRow(src_raw, dst_rgb24, width);
+    src_raw += src_stride_raw;
+    dst_rgb24 += dst_stride_rgb24;
+  }
+  return 0;
+}
+
+// Fill a plane with a constant byte.
+// Writes value to width bytes on each of |height| rows of dst_y. The SetRow
+// functions take a uint8, so the uint32 value is narrowed when it is passed.
+// A negative height fills the same rows starting from the last one.
+// Does nothing when dst_y is NULL, width is not positive or height is zero.
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+              int width, int height,
+              uint32 value) {
+  int y;
+  void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C;
+  // Nothing to fill. Returning here keeps a non-positive width away from the
+  // row functions and avoids writing through a NULL destination.
+  if (!dst_y || width <= 0 || height == 0) {
+    return;
+  }
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows: a tightly packed plane is filled as one long row.
+  if (dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    dst_stride_y = 0;
+  }
+  // Row function selection; a later matching block overrides an earlier one.
+#if defined(HAS_SETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SetRow = SetRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SetRow = SetRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SETROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    SetRow = SetRow_Any_X86;
+    if (IS_ALIGNED(width, 4)) {
+      SetRow = SetRow_X86;
+    }
+  }
+#endif
+#if defined(HAS_SETROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    SetRow = SetRow_ERMS;
+  }
+#endif
+
+  // Set plane
+  for (y = 0; y < height; ++y) {
+    SetRow(dst_y, value, width);
+    dst_y += dst_stride_y;
+  }
+}
+
+// Draw a rectangle into I420.
+// Fills width x |height| luma bytes starting at (x, y) with value_y, and the
+// corresponding (width + 1) / 2 by (|height| + 1) / 2 chroma bytes starting
+// at (x / 2, y / 2) with value_u and value_v, using SetPlane.
+// Returns 0 on success, or -1 when a pointer is NULL, width <= 0,
+// height == 0, x or y is negative, or a value is outside 0..255.
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int x, int y,
+             int width, int height,
+             int value_y, int value_u, int value_v) {
+  int halfwidth;
+  int halfheight;
+  uint8* start_y;
+  uint8* start_u;
+  uint8* start_v;
+  if (!dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0 ||
+      x < 0 || y < 0 ||
+      value_y < 0 || value_y > 255 ||
+      value_u < 0 || value_u > 255 ||
+      value_v < 0 || value_v > 255) {
+    return -1;
+  }
+  // A constant fill does not depend on row order, so a negative height covers
+  // the same rows as its absolute value. Normalize before halving:
+  // (height + 1) >> 1 on a negative odd height yields one chroma row too few
+  // (e.g. -3 gives -1, and -1 gives 0).
+  if (height < 0) {
+    height = -height;
+  }
+  halfwidth = (width + 1) >> 1;
+  halfheight = (height + 1) >> 1;
+  // Offsets are computed only after the arguments have been validated.
+  start_y = dst_y + y * dst_stride_y + x;
+  start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+  start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+
+  SetPlane(start_y, dst_stride_y, width, height, value_y);
+  SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
+  SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
+  return 0;
+}
+
+// Draw a rectangle into ARGB.
+// Fills width x |height| pixels of dst_argb, starting at pixel
+// (dst_x, dst_y), with the 32 bit value. A negative height fills the same
+// rows, starting from the last one.
+// Returns 0 on success, or -1 when dst_argb is NULL, width <= 0,
+// height == 0, or dst_x or dst_y is negative.
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+             int dst_x, int dst_y,
+             int width, int height,
+             uint32 value) {
+  int y;
+  void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C;
+  if (!dst_argb ||
+      width <= 0 || height == 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Move to the rectangle's first pixel while the stride still has its
+  // original sign. Applying this offset after the flip below would step
+  // dst_y rows in the opposite direction and write outside the rectangle.
+  dst_argb += dst_y * dst_stride_argb + dst_x * 4;
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+
+  // Row function selection; X86, when available, overrides NEON.
+#if defined(HAS_ARGBSETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBSetRow = ARGBSetRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSETROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBSetRow = ARGBSetRow_X86;
+  }
+#endif
+
+  // Set plane
+  for (y = 0; y < height; ++y) {
+    ARGBSetRow(dst_argb, value, width);
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+// An unattenuated ARGB alpha blend uses the formula
+//   p = a * f + (1 - a) * b
+// where
+//   p is the output pixel
+//   f is the foreground pixel
+//   b is the background pixel
+//   a is the alpha value from the foreground pixel
+// A preattenuated ARGB alpha blend uses the formula
+//   p = f + (1 - a) * b
+// where
+//   f is the foreground pixel premultiplied by alpha
+//
+// A negative height reads the source from its last row upward.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height) {
+  int rows_left;
+  int packed_stride;
+  void (*AttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                       int width) = ARGBAttenuateRow_C;
+  if (!src_argb || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last source row and step backward.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When both images are tightly packed, treat them as one long row.
+  packed_stride = width * 4;
+  if (src_stride_argb == packed_stride &&
+      dst_stride_argb == packed_stride) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_argb = 0;
+  }
+  // Row function selection; a later matching block overrides an earlier one.
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    AttenuateRow = IS_ALIGNED(width, 4) ? ARGBAttenuateRow_SSSE3
+                                        : ARGBAttenuateRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    AttenuateRow = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_AVX2
+                                        : ARGBAttenuateRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    AttenuateRow = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_NEON
+                                        : ARGBAttenuateRow_Any_NEON;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    AttenuateRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+// Each source row is passed through an ARGBUnattenuateRow function chosen for
+// the CPU and width. A negative height reads the source from its last row
+// upward.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height) {
+  int rows_left;
+  int packed_stride;
+  void (*UnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                         int width) = ARGBUnattenuateRow_C;
+  if (!src_argb || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last source row and step backward.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When both images are tightly packed, treat them as one long row.
+  packed_stride = width * 4;
+  if (src_stride_argb == packed_stride &&
+      dst_stride_argb == packed_stride) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_argb = 0;
+  }
+  // Row function selection; AVX2, when available, overrides SSE2.
+#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    UnattenuateRow = IS_ALIGNED(width, 4) ? ARGBUnattenuateRow_SSE2
+                                          : ARGBUnattenuateRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    UnattenuateRow = IS_ALIGNED(width, 8) ? ARGBUnattenuateRow_AVX2
+                                          : ARGBUnattenuateRow_Any_AVX2;
+  }
+#endif
+// TODO(fbarchard): Neon version.
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    UnattenuateRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Write a gray-scale copy of the source ARGB image into dst_argb.
+// Negative height flips vertically. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  void (*row_fn)(const uint8* src_argb, uint8* dst_argb, int width) =
+      ARGBGrayRow_C;
+  int rows_left;
+  if (!(src_argb && dst_argb) || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Start at the last source row and step backwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    // Both images are contiguous, so treat them as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    row_fn = ARGBGrayRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    row_fn = ARGBGrayRow_NEON;
+  }
+#endif
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    row_fn(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert a rectangle of an ARGB image to gray scale in place.
+// (dst_x, dst_y) is the top-left pixel. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+             int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBGrayRow)(const uint8*, uint8*, int) = ARGBGrayRow_C;
+  uint8* dst;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Offset only after validation; arithmetic on a null pointer is undefined.
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (dst_stride_argb == width * 4) {  // Coalesce rows.
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBGrayRow(dst, dst, width);  // Same row is both source and destination.
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Sepia-tone a rectangle of an ARGB image in place. Returns 0, or -1 on error.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+              int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+  uint8* dst;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+    return -1;  // Invalid arguments.
+  }
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;  // After null check.
+  if (dst_stride_argb == width * 4) {  // Coalesce rows.
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSEPIAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBSEPIAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBSepiaRow(dst, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply the 4x4 matrix in matrix_argb to each ARGB pixel.
+// Note: Normally for shading, but can be used to swizzle or invert.
+// Negative height flips vertically. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    const int8* matrix_argb,
+                    int width, int height) {
+  void (*row_fn)(const uint8* src_argb, uint8* dst_argb,
+                 const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
+  int rows_left;
+  if (!(src_argb && dst_argb && matrix_argb) || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    // Both images are contiguous, so treat them as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    row_fn = ARGBColorMatrixRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    row_fn = ARGBColorMatrixRow_NEON;
+  }
+#endif
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    row_fn(src_argb, dst_argb, matrix_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a 4x3 matrix to each ARGB pixel of a rectangle, in place.
+// Deprecated.
+// (dst_x, dst_y) is the top-left pixel of the rectangle.
+// The 12 coefficients of matrix_rgb are halved into the first 12 entries of a
+// 4x4 matrix; entries 12..14 are 0 and entry 15 is 64 (1.0).
+// Returns -1 on invalid arguments, otherwise the result of ARGBColorMatrix.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                   const int8* matrix_rgb,
+                   int dst_x, int dst_y, int width, int height) {
+  SIMD_ALIGNED(int8 matrix_argb[16]);
+  uint8* dst;
+  int i;
+  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Offset only after validation; arithmetic on a null pointer is undefined.
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+
+  // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
+  // Integer division by 2 truncates each coefficient toward zero.
+  for (i = 0; i < 12; ++i) {
+    matrix_argb[i] = matrix_rgb[i] / 2;
+  }
+  matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
+  matrix_argb[15] = 64;  // 1.0
+
+  // The rectangle is both source and destination of the matrix pass.
+  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
+                         dst, dst_stride_argb,
+                         &matrix_argb[0], width, height);
+}
+
+// Apply a color table to each ARGB pixel of a rectangle, in place.
+// Table contains 256 ARGB values. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                   const uint8* table_argb,
+                   int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBColorTableRow)(uint8*, const uint8*, int) = ARGBColorTableRow_C;
+  uint8* dst;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Offset only after validation; arithmetic on a null pointer is undefined.
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (dst_stride_argb == width * 4) {  // Coalesce rows.
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBColorTableRow = ARGBColorTableRow_X86;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                  const uint8* table_argb,
+                  int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*RGBColorTableRow)(uint8*, const uint8*, int) = RGBColorTableRow_C;
+  uint8* dst;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Offset only after validation; arithmetic on a null pointer is undefined.
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (dst_stride_argb == width * 4) {  // Coalesce rows.
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_RGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    RGBColorTableRow = RGBColorTableRow_X86;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    RGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// ARGBQuantize is used to posterize art, in place on a rectangle of ARGB.
+// e.g. rgb / qvalue * qvalue + qvalue / 2
+// The low levels take 3 parameters so they can serve other operations:
+// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+// where scale is 1 / interval_size as a fixed point value, so the divide
+// is replaced with a multiply by the fixed point reciprocal.
+// Caveat - although SSE2 saturates, the C function does not and should be used
+// with care if doing anything but quantization.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+                 int scale, int interval_size, int interval_offset,
+                 int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) = ARGBQuantizeRow_C;
+  uint8* dst;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
+      interval_size < 1 || interval_size > 255) {
+    return -1;  // Invalid arguments, or interval_size outside 1..255.
+  }
+  // Offset only after validation; arithmetic on a null pointer is undefined.
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (dst_stride_argb == width * 4) {  // Coalesce rows.
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBQUANTIZEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBQUANTIZEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Fill dst_cumsum with the cumulative sum table of an ARGB image, where each
+// value is the sum of all values above and to the left. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+                             int32* dst_cumsum, int dst_stride32_cumsum,
+                             int width, int height) {
+  void (*sum_row)(const uint8* row, int32* cumsum,
+      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+  int32* above = dst_cumsum;  // Table row passed in as the previous row.
+  int rows_left;
+  if (!(dst_cumsum && src_argb) || width <= 0 || height <= 0) {
+    return -1;
+  }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    sum_row = ComputeCumulativeSumRow_SSE2;
+  }
+#endif
+  memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4);  // 4 int per pixel.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    sum_row(src_argb, dst_cumsum, above, width);
+    above = dst_cumsum;
+    src_argb += src_stride_argb;
+    dst_cumsum += dst_stride32_cumsum;
+  }
+  return 0;
+}
+
+// Blur ARGB image. Returns 0, or -1 on invalid arguments or a radius <= 0.
+// Caller should allocate CumulativeSum table of width * height * 16 bytes
+// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
+// as the buffer is treated as circular.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int32* dst_cumsum, int dst_stride32_cumsum,
+             int width, int height, int radius) {
+  int y;
+  void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
+      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+  void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
+      int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
+  int32* cumsum_bot_row;
+  int32* max_cumsum_bot_row;
+  int32* cumsum_top_row;
+  // dst_cumsum is mandatory scratch space: reject null before it is indexed.
+  if (!src_argb || !dst_argb || !dst_cumsum || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (radius > height) {  // Clamp radius to the image height.
+    radius = height;
+  }
+  if (radius > (width / 2 - 1)) {  // Clamp radius to width / 2 - 1.
+    radius = width / 2 - 1;
+  }
+  if (radius <= 0) {  // Nothing to blur after clamping.
+    return -1;
+  }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+    CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
+  }
+#endif
+  // Compute enough CumulativeSum for first row to be blurred. After this
+  // one row of CumulativeSum is updated at a time.
+  ARGBComputeCumulativeSum(src_argb, src_stride_argb,
+                           dst_cumsum, dst_stride32_cumsum,
+                           width, radius);
+
+  src_argb = src_argb + radius * src_stride_argb;
+  cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
+
+  // Wrap limit of the circular buffer, which holds radius * 2 + 2 rows.
+  max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
+  cumsum_top_row = &dst_cumsum[0];
+
+  for (y = 0; y < height; ++y) {
+    int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
+    int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
+    int area = radius * (bot_y - top_y);
+    int boxwidth = radius * 4;
+    int x;
+    int n;
+
+    // Increment cumsum_top_row pointer with circular buffer wrap around.
+    if (top_y) {
+      cumsum_top_row += dst_stride32_cumsum;
+      if (cumsum_top_row >= max_cumsum_bot_row) {
+        cumsum_top_row = dst_cumsum;
+      }
+    }
+    // Increment cumsum_bot_row pointer with circular buffer wrap around and
+    // then fill in a row of CumulativeSum.
+    if ((y + radius) < height) {
+      const int32* prev_cumsum_bot_row = cumsum_bot_row;
+      cumsum_bot_row += dst_stride32_cumsum;
+      if (cumsum_bot_row >= max_cumsum_bot_row) {
+        cumsum_bot_row = dst_cumsum;
+      }
+      ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row,
+                              width);
+      src_argb += src_stride_argb;
+    }
+
+    // Left clipped.
+    for (x = 0; x < radius + 1; ++x) {
+      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+                                boxwidth, area, &dst_argb[x * 4], 1);
+      area += (bot_y - top_y);
+      boxwidth += 4;
+    }
+
+    // Middle unclipped.
+    n = (width - 1) - radius - x + 1;
+    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+                              boxwidth, area, &dst_argb[x * 4], n);
+
+    // Right clipped.
+    for (x += n; x <= width - 1; ++x) {
+      area -= (bot_y - top_y);
+      boxwidth -= 4;
+      CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
+                                cumsum_bot_row + (x - radius - 1) * 4,
+                                boxwidth, area, &dst_argb[x * 4], 1);
+    }
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Multiply each pixel of an ARGB image by a specified ARGB value.
+// Negative height flips vertically. Returns 0, or -1 on invalid arguments;
+// a value of 0 is rejected as invalid.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height, uint32 value) {
+  void (*row_fn)(const uint8* src_argb, uint8* dst_argb, int width,
+                 uint32 value) = ARGBShadeRow_C;
+  int rows_left;
+  if (!(src_argb && dst_argb) || width <= 0 || height == 0 || value == 0u) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    // Both images are contiguous, so treat them as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSHADEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+    row_fn = ARGBShadeRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBSHADEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    row_fn = ARGBShadeRow_NEON;
+  }
+#endif
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    row_fn(src_argb, dst_argb, width, value);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Interpolate 2 planes by specified amount (0 to 255).
+// Row y of dst is produced by the row function from row y of src0 and src1.
+// 'interpolation' is passed unchanged to the row function.
+// A negative height writes the destination bottom-up, inverting the image.
+// Returns 0 on success, or -1 for a null pointer, width <= 0 or height == 0.
+LIBYUV_API
+int InterpolatePlane(const uint8* src0, int src_stride0,
+                     const uint8* src1, int src_stride1,
+                     uint8* dst, int dst_stride,
+                     int width, int height, int interpolation) {
+  void (*row_fn)(uint8* dst_ptr, const uint8* src_ptr,
+                 ptrdiff_t src_stride, int dst_width,
+                 int source_y_fraction) = InterpolateRow_C;
+  int rows_left;
+  if (!(src0 && src1 && dst) || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Invert by starting at the last destination row and stepping backwards.
+    height = -height;
+    dst += (height - 1) * dst_stride;
+    dst_stride = -dst_stride;
+  }
+  if (src_stride0 == width && src_stride1 == width && dst_stride == width) {
+    // All three planes are contiguous, so treat them as one long row.
+    width *= height;
+    height = 1;
+    src_stride0 = src_stride1 = dst_stride = 0;
+  }
+  // Choose the row function. Each later match overrides the earlier ones; an
+  // _Any_ variant is selected when width is not a multiple of the alignment.
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    row_fn = IS_ALIGNED(width, 16) ? InterpolateRow_SSSE3
+                                   : InterpolateRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 32) ? InterpolateRow_AVX2
+                                   : InterpolateRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 16) ? InterpolateRow_NEON
+                                   : InterpolateRow_Any_NEON;
+  }
+#endif
+  // DSPR2 is used only when every pointer, every stride and the width are
+  // multiples of 4.
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) &&
+      IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) &&
+      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) &&
+      IS_ALIGNED(width, 4)) {
+    row_fn = InterpolateRow_DSPR2;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    // The second source row is addressed as an offset from the first.
+    row_fn(dst, src0, src1 - src0, width, interpolation);
+    src0 += src_stride0;
+    src1 += src_stride1;
+    dst += dst_stride;
+  }
+  return 0;
+}
+
+// Interpolate 2 ARGB images by specified amount (0 to 255).
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+                    const uint8* src_argb1, int src_stride_argb1,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height, int interpolation) {
+  const int byte_width = width * 4;  // Each ARGB pixel occupies 4 bytes.
+  return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1,
+                          src_stride_argb1, dst_argb, dst_stride_argb,
+                          byte_width, height, interpolation);
+}
+
+// Interpolate 2 YUV images by specified amount (0 to 255).
+// The Y plane is full size; U and V are interpolated at half width and half
+// height, both rounded up. A negative height is forwarded to
+// InterpolatePlane, which inverts the output.
+// Returns 0 on success, or -1 for a null plane, width <= 0 or height == 0.
+LIBYUV_API
+int I420Interpolate(const uint8* src0_y, int src0_stride_y,
+                    const uint8* src0_u, int src0_stride_u,
+                    const uint8* src0_v, int src0_stride_v,
+                    const uint8* src1_y, int src1_stride_y,
+                    const uint8* src1_u, int src1_stride_u,
+                    const uint8* src1_v, int src1_stride_v,
+                    uint8* dst_y, int dst_stride_y,
+                    uint8* dst_u, int dst_stride_u,
+                    uint8* dst_v, int dst_stride_v,
+                    int width, int height, int interpolation) {
+  int halfwidth = (width + 1) >> 1;
+  // Round the magnitude up and keep the sign: (height + 1) >> 1 would drop
+  // the last chroma row when height is negative and odd.
+  int halfheight = (height < 0) ? -((1 - height) >> 1) : ((height + 1) >> 1);
+  if (!src0_y || !src0_u || !src0_v ||
+      !src1_y || !src1_u || !src1_v ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y,
+                   dst_y, dst_stride_y, width, height, interpolation);
+  InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u,
+                   dst_u, dst_stride_u, halfwidth, halfheight, interpolation);
+  InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v,
+                   dst_v, dst_stride_v, halfwidth, halfheight, interpolation);
+  return 0;
+}
+
+// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
+// 'shuffler' is passed unchanged to the row function and is not null-checked.
+// Rows are processed by ARGBShuffleRow_C or a SIMD variant selected below.
+// Negative height flips vertically. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+                uint8* dst_argb, int dst_stride_argb,
+                const uint8* shuffler, int width, int height) {
+  void (*row_fn)(const uint8* src_bgra, uint8* dst_argb,
+                 const uint8* shuffler, int width) = ARGBShuffleRow_C;
+  int rows_left;
+  if (!(src_bgra && dst_argb) || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Start at the last source row and step backwards.
+    height = -height;
+    src_bgra += (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
+  }
+  if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) {
+    // Both images are contiguous, so treat them as one long row.
+    width *= height;
+    height = 1;
+    src_stride_bgra = dst_stride_argb = 0;
+  }
+  // Choose the row function. Each later match overrides the earlier ones; an
+  // _Any_ variant is selected when width is not a multiple of the alignment.
+#if defined(HAS_ARGBSHUFFLEROW_SSE2)
+  // SSE2: the exact-width row needs width to be a multiple of 4.
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    row_fn = IS_ALIGNED(width, 4) ? ARGBShuffleRow_SSE2
+                                  : ARGBShuffleRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+  // SSSE3: the exact-width row needs width to be a multiple of 8.
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    row_fn = IS_ALIGNED(width, 8) ? ARGBShuffleRow_SSSE3
+                                  : ARGBShuffleRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+  // AVX2: the exact-width row needs width to be a multiple of 16.
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 16) ? ARGBShuffleRow_AVX2
+                                   : ARGBShuffleRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+  // NEON: the exact-width row needs width to be a multiple of 4.
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 4) ? ARGBShuffleRow_NEON
+                                  : ARGBShuffleRow_Any_NEON;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    // One source row in, one destination row out.
+    row_fn(src_bgra, dst_argb, shuffler, width);
+    src_bgra += src_stride_bgra;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Sobel driver: gray rows -> SobelX/SobelY rows -> SobelRow writes each dst row.
+static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
+                        uint8* dst_argb, int dst_stride_argb,
+                        int width, int height,
+                        void (*SobelRow)(const uint8* src_sobelx,
+                                         const uint8* src_sobely,
+                                         uint8* dst, int width)) {
+  int y;
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) =
+      ARGBToYJRow_C;
+  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) = SobelYRow_C;
+  void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobely, int width) =
+      SobelXRow_C;
+  const int kEdge = 16;  // Extra pixels at start of row for extrude/align.
+  if (!src_argb  || !dst_argb || width <= 0 || height == 0) {
+    return -1;  // Invalid arguments.
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Pick the ARGB-to-gray row function; later matches override earlier ones.
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYJRow = ARGBToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+  // Pick the Sobel Y and Sobel X row functions.
+#if defined(HAS_SOBELYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelYRow = SobelYRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelYRow = SobelYRow_NEON;
+  }
+#endif
+#if defined(HAS_SOBELXROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelXRow = SobelXRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelXRow = SobelXRow_NEON;
+  }
+#endif
+  {
+    // 3 rows with edges before/after.
+    const int kRowSize = (width + kEdge + 31) & ~31;
+    align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
+    uint8* row_sobelx = rows;
+    uint8* row_sobely = rows + kRowSize;
+    uint8* row_y = rows + kRowSize * 2;
+    // The 3 gray rows start kEdge bytes into row_y and are kRowSize apart.
+    // Convert first row.
+    uint8* row_y0 = row_y + kEdge;
+    uint8* row_y1 = row_y0 + kRowSize;
+    uint8* row_y2 = row_y1 + kRowSize;
+    ARGBToYJRow(src_argb, row_y0, width);
+    row_y0[-1] = row_y0[0];  // Replicate the left edge pixel.
+    memset(row_y0 + width, row_y0[width - 1], 16);  // Extrude 16 for valgrind.
+    ARGBToYJRow(src_argb, row_y1, width);  // Same first row, converted again.
+    row_y1[-1] = row_y1[0];
+    memset(row_y1 + width, row_y1[width - 1], 16);
+    memset(row_y2 + width, 0, 16);
+    // row_y0/row_y1/row_y2 act as the rows above, at and below the output row.
+    for (y = 0; y < height; ++y) {
+      // Convert next row of ARGB to G.
+      if (y < (height - 1)) {  // On the last row the source is not advanced.
+        src_argb += src_stride_argb;
+      }
+      ARGBToYJRow(src_argb, row_y2, width);
+      row_y2[-1] = row_y2[0];  // Replicate the left edge pixel.
+      row_y2[width] = row_y2[width - 1];  // Replicate the right edge pixel.
+      // SobelX reads all 3 rows, SobelY only row_y0 and row_y2; each at - 1.
+      SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
+      SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
+      SobelRow(row_sobelx, row_sobely, dst_argb, width);
+
+      // Cycle thru circular queue of 3 row_y buffers.
+      {
+        uint8* row_yt = row_y0;
+        row_y0 = row_y1;
+        row_y1 = row_y2;
+        row_y2 = row_yt;
+      }
+
+      dst_argb += dst_stride_argb;
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
+// Sobel ARGB effect.
+// Selects the row function that takes the Sobel X and Sobel Y rows and writes
+// dst_argb, then returns the result of ARGBSobelize.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  void (*row_fn)(const uint8* src_sobelx, const uint8* src_sobely,
+                 uint8* dst_argb, int width) = SobelRow_C;
+  // An _Any_ variant is selected when width is not a multiple of the
+  // alignment; a later match overrides an earlier one.
+#if defined(HAS_SOBELROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    row_fn = IS_ALIGNED(width, 16) ? SobelRow_SSE2
+                                   : SobelRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 8) ? SobelRow_NEON
+                                  : SobelRow_Any_NEON;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                      width, height, row_fn);
+}
+
+// Sobel ARGB effect with planar output.
+// Selects the row function that takes the Sobel X and Sobel Y rows and writes
+// dst_y, then returns the result of ARGBSobelize.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+                     uint8* dst_y, int dst_stride_y,
+                     int width, int height) {
+  void (*row_fn)(const uint8* src_sobelx, const uint8* src_sobely,
+                 uint8* dst_, int width) = SobelToPlaneRow_C;
+  // An _Any_ variant is selected when width is not a multiple of 16;
+  // a later match overrides an earlier one.
+#if defined(HAS_SOBELTOPLANEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    row_fn = IS_ALIGNED(width, 16) ? SobelToPlaneRow_SSE2
+                                   : SobelToPlaneRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 16) ? SobelToPlaneRow_NEON
+                                   : SobelToPlaneRow_Any_NEON;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
+                      width, height, row_fn);
+}
+
+// SobelXY ARGB effect.
+// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B.  G = Sobel.
+// Selects the SobelXY row function for the current CPU and width, then
+// returns the result of ARGBSobelize.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height) {
+  void (*row_fn)(const uint8* src_sobelx, const uint8* src_sobely,
+                 uint8* dst_argb, int width) = SobelXYRow_C;
+  // An _Any_ variant is selected when width is not a multiple of the
+  // alignment; a later match overrides an earlier one.
+#if defined(HAS_SOBELXYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    row_fn = IS_ALIGNED(width, 16) ? SobelXYRow_SSE2
+                                   : SobelXYRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 8) ? SobelXYRow_NEON
+                                  : SobelXYRow_Any_NEON;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                      width, height, row_fn);
+}
+
+// Apply a 4x4 polynomial to each ARGB pixel.
+// Negative height flips vertically. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   const float* poly,
+                   int width, int height) {
+  void (*row_fn)(const uint8* src_argb, uint8* dst_argb,
+                 const float* poly, int width) = ARGBPolynomialRow_C;
+  int rows_left;
+  if (!(src_argb && dst_argb && poly) || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Start at the last source row and step backwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    // Both images are contiguous, so treat them as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+  // The SIMD rows are selected only when width is even.
+#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
+    row_fn = ARGBPolynomialRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
+      IS_ALIGNED(width, 2)) {
+    row_fn = ARGBPolynomialRow_AVX2;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    row_fn(src_argb, dst_argb, poly, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a lumacolortable to each ARGB pixel.
+// Negative height flips vertically. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_argb, int dst_stride_argb,
+                       const uint8* luma,
+                       int width, int height) {
+  void (*row_fn)(const uint8* src_argb, uint8* dst_argb, int width,
+                 const uint8* luma, const uint32 lumacoeff) =
+      ARGBLumaColorTableRow_C;
+  int rows_left;
+  if (!(src_argb && dst_argb && luma) || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Start at the last source row and step backwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    // Both images are contiguous, so treat them as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
+    row_fn = ARGBLumaColorTableRow_SSSE3;
+  }
+#endif
+  // 0x00264b0f is the fixed lumacoeff argument given to every row call.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    row_fn(src_argb, dst_argb, width, luma, 0x00264b0f);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Copy Alpha from one ARGB image to another.
+// Rows are handed to ARGBCopyAlphaRow_C or a SIMD variant selected below.
+// Negative height flips vertically. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height) {
+  void (*row_fn)(const uint8* src_argb, uint8* dst_argb, int width) =
+      ARGBCopyAlphaRow_C;
+  int rows_left;
+  if (!(src_argb && dst_argb) || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Start at the last source row and step backwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    // Both images are contiguous, so treat them as one long row.
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+  // An _Any_ variant is selected when width is not a multiple of the
+  // alignment; a later match overrides an earlier one.
+#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    row_fn = IS_ALIGNED(width, 8) ? ARGBCopyAlphaRow_SSE2
+                                  : ARGBCopyAlphaRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 16) ? ARGBCopyAlphaRow_AVX2
+                                   : ARGBCopyAlphaRow_Any_AVX2;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    // One source row in, one destination row out.
+    row_fn(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Copy a planar Y channel to the alpha channel of a destination ARGB image.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+                     uint8* dst_argb, int dst_stride_argb,
+                     int width, int height) {
+  int y;
+  void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
+      ARGBCopyYToAlphaRow_C;
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
+    return -1;  // Invalid arguments.
+  }
+  // Negative height means invert the image: read the source bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows: contiguous images are handled as a single long row.
+  if (src_stride_y == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Overrides the SSE2 choice above.
+    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
+    }
+  }
+#endif
+  // Process one row per iteration with the row function chosen above.
+  for (y = 0; y < height; ++y) {
+    ARGBCopyYToAlphaRow(src_y, dst_argb, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// TODO(fbarchard): Consider if width is even Y channel can be split
+// directly. A SplitUVRow_Odd function could copy the remaining chroma.
+
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) = SplitUVRow_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  if (!src_yuy2 ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;  // Invalid arguments.
+  }
+  // Negative height means invert the image: read the source bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+  // Convert two source rows at a time: 2 rows of Y and 1 blended row of UV.
+  {
+    int awidth = halfwidth * 2;  // width rounded up to an even value.
+    // Scratch buffer: a row of Y followed by 2 rows of UV.
+    align_buffer_64(rows, awidth * 3);
+    // The Y scratch row is reused; each UV row has its own scratch slot.
+    for (y = 0; y < height - 1; y += 2) {
+      // Split Y from UV.
+      SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
+      memcpy(dst_y, rows, width);
+      SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);
+      memcpy(dst_y + dst_stride_y, rows, width);
+      InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
+      src_yuy2 += src_stride_yuy2 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      // Odd last row: split Y from UV, writing its UV out unblended.
+      SplitUVRow(src_yuy2, rows, dst_uv, awidth);
+      memcpy(dst_y, rows, width);
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) = SplitUVRow_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  if (!src_uyvy ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;  // Invalid arguments.
+  }
+  // Negative height means invert the image: read the source bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+  // Convert two source rows at a time: 2 rows of Y and 1 blended row of UV.
+  {
+    int awidth = halfwidth * 2;  // width rounded up to an even value.
+    // Scratch buffer: a row of Y followed by 2 rows of UV.
+    align_buffer_64(rows, awidth * 3);
+    // Here the first SplitUVRow output is UV and the second output is Y.
+    for (y = 0; y < height - 1; y += 2) {
+      // Split Y from UV.
+      SplitUVRow(src_uyvy, rows + awidth, rows, awidth);
+      memcpy(dst_y, rows, width);
+      SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth);
+      memcpy(dst_y + dst_stride_y, rows, width);
+      InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
+      src_uyvy += src_stride_uyvy * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      // Odd last row: split Y from UV, writing its UV out unblended.
+      SplitUVRow(src_uyvy, dst_uv, rows, awidth);
+      memcpy(dst_y, rows, width);
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/rotate.cc b/libs/libyuv/source/rotate.cc
new file mode 100644
index 0000000000..01ea5c4074
--- /dev/null
+++ b/libs/libyuv/source/rotate.cc
@@ -0,0 +1,491 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  int i = height;  // Source rows still to be transposed.
+  void (*TransposeWx8)(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width) = TransposeWx8_C;
+#if defined(HAS_TRANSPOSEWX8_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    TransposeWx8 = TransposeWx8_NEON;
+  }
+#endif
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    TransposeWx8 = TransposeWx8_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeWx8 = TransposeWx8_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {  // Overrides the SSSE3 choice above.
+    TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      TransposeWx8 = TransposeWx8_Fast_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_TRANSPOSEWX8_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    if (IS_ALIGNED(width, 4) &&
+        IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+      TransposeWx8 = TransposeWx8_Fast_DSPR2;
+    } else {
+      TransposeWx8 = TransposeWx8_DSPR2;
+    }
+  }
+#endif
+  // Groups of 8 source rows use TransposeWx8; leftover rows use the C WxH.
+  // Work across the source in 8x8 tiles
+  while (i >= 8) {
+    TransposeWx8(src, src_stride, dst, dst_stride, width);
+    src += 8 * src_stride;    // Go down 8 rows.
+    dst += 8;                 // Move over 8 columns.
+    i -= 8;
+  }
+  // Fewer than 8 source rows remain.
+  if (i > 0) {
+    TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+  }
+}
+
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride,
+                   int width, int height) {
+  // A 90 degree rotation is a transpose whose source rows are consumed
+  // bottom-to-top: hand TransposePlane the last source row together
+  // with the negated source stride.
+  const uint8* last_row = src + src_stride * (height - 1);
+
+  TransposePlane(last_row, -src_stride, dst, dst_stride, width, height);
+}
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  // A 270 degree rotation is a transpose whose destination rows are
+  // produced bottom-to-top: hand TransposePlane the last destination
+  // row together with the negated destination stride.
+  uint8* last_row = dst + dst_stride * (width - 1);
+
+  TransposePlane(src, src_stride, last_row, -dst_stride, width, height);
+}
+
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride,
+                    int width, int height) {
+  // Swap first and last row and mirror the content. Uses a temporary row.
+  align_buffer_64(row, width);
+  const uint8* src_bot = src + src_stride * (height - 1);
+  uint8* dst_bot = dst + dst_stride * (height - 1);
+  int half_height = (height + 1) >> 1;
+  int y;
+  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorRow = MirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorRow = MirrorRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Overrides the SSSE3 choice above.
+    MirrorRow = MirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorRow = MirrorRow_AVX2;
+    }
+  }
+#endif
+// TODO(fbarchard): Mirror on mips handle unaligned memory.
+#if defined(HAS_MIRRORROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
+    MirrorRow = MirrorRow_DSPR2;
+  }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+  // Walk inward from both ends: src/dst move down, src_bot/dst_bot move up.
+  // Odd height will harmlessly mirror the middle row twice.
+  for (y = 0; y < half_height; ++y) {
+    MirrorRow(src, row, width);  // Mirror first row into a buffer
+    src += src_stride;
+    MirrorRow(src_bot, dst, width);  // Mirror last row into first row
+    dst += dst_stride;
+    CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
+    src_bot -= src_stride;
+    dst_bot -= dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  int i = height;  // Source rows still to be transposed.
+  void (*TransposeUVWx8)(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width) = TransposeUVWx8_C;
+#if defined(HAS_TRANSPOSEUVWX8_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    TransposeUVWx8 = TransposeUVWx8_NEON;
+  }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeUVWx8 = TransposeUVWx8_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
+      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+    TransposeUVWx8 = TransposeUVWx8_DSPR2;
+  }
+#endif
+  // Groups of 8 source rows use TransposeUVWx8; leftover rows use C WxH.
+  // Work through the source in 8x8 tiles.
+  while (i >= 8) {
+    TransposeUVWx8(src, src_stride,
+                   dst_a, dst_stride_a,
+                   dst_b, dst_stride_b,
+                   width);
+    src += 8 * src_stride;    // Go down 8 rows.
+    dst_a += 8;               // Move over 8 columns.
+    dst_b += 8;               // Move over 8 columns.
+    i -= 8;
+  }
+  // Fewer than 8 source rows remain.
+  if (i > 0) {
+    TransposeUVWxH_C(src, src_stride,
+                     dst_a, dst_stride_a,
+                     dst_b, dst_stride_b,
+                     width, i);
+  }
+}
+
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+                uint8* dst_a, int dst_stride_a,
+                uint8* dst_b, int dst_stride_b,
+                int width, int height) {
+  // A 90 degree rotation is a TransposeUV whose source rows are consumed
+  // bottom-to-top: pass the last source row and the negated stride.
+  const uint8* last_row = src + src_stride * (height - 1);
+  TransposeUV(last_row, -src_stride,
+              dst_a, dst_stride_a,
+              dst_b, dst_stride_b,
+              width, height);
+}
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  // A 270 degree rotation is a TransposeUV that writes both destination
+  // planes bottom-to-top: pass each last row and its negated stride.
+  uint8* last_row_a = dst_a + dst_stride_a * (width - 1);
+  uint8* last_row_b = dst_b + dst_stride_b * (width - 1);
+
+  TransposeUV(src, src_stride,
+              last_row_a, -dst_stride_a,
+              last_row_b, -dst_stride_b,
+              width, height);
+}
+
+// A 180 degree rotation flips the image horizontally and vertically.
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  int rows_left;
+  void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
+      MirrorUVRow_C;
+#if defined(HAS_MIRRORUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    MirrorUVRow = MirrorUVRow_NEON;
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+    MirrorUVRow = MirrorUVRow_SSSE3;
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+    MirrorUVRow = MirrorUVRow_DSPR2;
+  }
+#endif
+
+  // Source rows are read top-down; destination rows are written bottom-up.
+  dst_a += dst_stride_a * (height - 1);
+  dst_b += dst_stride_b * (height - 1);
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    MirrorUVRow(src, dst_a, dst_b, width);
+    src += src_stride;
+    dst_a -= dst_stride_a;
+    dst_b -= dst_stride_b;
+  }
+}
+
+LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+                uint8* dst, int dst_stride,
+                int width, int height,
+                enum RotationMode mode) {
+  // Rotates one plane according to |mode|. Returns 0 on success and -1
+  // for a null pointer, non-positive width, zero height or unknown mode.
+  if (!src || !dst || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // A negative height asks for a vertically inverted source: use its
+  // magnitude and read the source rows from the last one upward.
+  if (height < 0) {
+    height = -height;
+    src += (height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+
+  if (mode == kRotate0) {
+    // No rotation requested: plain copy of the plane.
+    CopyPlane(src, src_stride, dst, dst_stride, width, height);
+    return 0;
+  }
+
+  if (mode == kRotate90) {
+    RotatePlane90(src, src_stride, dst, dst_stride, width, height);
+    return 0;
+  }
+
+  if (mode == kRotate270) {
+    RotatePlane270(src, src_stride, dst, dst_stride, width, height);
+    return 0;
+  }
+
+  if (mode == kRotate180) {
+    RotatePlane180(src, src_stride, dst, dst_stride, width, height);
+    return 0;
+  }
+
+  // Unrecognized rotation mode.
+  return -1;
+}
+
+LIBYUV_API
+int I420Rotate(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height,
+               enum RotationMode mode) {
+  // Rotates an image made of separate Y, U and V planes by |mode|.
+  // Returns -1 for a null plane pointer, a non-positive width, a zero
+  // height or an unknown mode. kRotate0 returns whatever I420Copy
+  // returns; every other supported mode returns 0.
+  // The U and V planes are half size in each dimension, rounded up.
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // A negative height asks for a vertically inverted source: use its
+  // magnitude and read every source plane from its last row upward.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_u += (halfheight - 1) * src_stride_u;
+    src_v += (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  if (mode == kRotate0) {
+    // No rotation requested: plain copy of all three planes.
+    return I420Copy(src_y, src_stride_y, src_u, src_stride_u,
+                    src_v, src_stride_v, dst_y, dst_stride_y,
+                    dst_u, dst_stride_u, dst_v, dst_stride_v,
+                    width, height);
+  }
+
+  if (mode == kRotate90) {
+    // Y at full size, then U and V at half size.
+    RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+    RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u,
+                  halfwidth, halfheight);
+    RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v,
+                  halfwidth, halfheight);
+    return 0;
+  }
+
+  if (mode == kRotate270) {
+    // Y at full size, then U and V at half size.
+    RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+    RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u,
+                   halfwidth, halfheight);
+    RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v,
+                   halfwidth, halfheight);
+    return 0;
+  }
+
+  if (mode == kRotate180) {
+    // Y at full size, then U and V at half size.
+    RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+    RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u,
+                   halfwidth, halfheight);
+    RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v,
+                   halfwidth, halfheight);
+    return 0;
+  }
+
+  // Unrecognized rotation mode.
+  return -1;
+}
+
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+                     const uint8* src_uv, int src_stride_uv,
+                     uint8* dst_y, int dst_stride_y,
+                     uint8* dst_u, int dst_stride_u,
+                     uint8* dst_v, int dst_stride_v,
+                     int width, int height,
+                     enum RotationMode mode) {
+  // Rotates a Y plane plus a combined UV plane by |mode| into separate
+  // Y, U and V planes. Returns -1 for a null pointer, a non-positive
+  // width, a zero height or an unknown mode. kRotate0 returns whatever
+  // NV12ToI420 returns; every other supported mode returns 0.
+  // The chroma planes are half size in each dimension, rounded up.
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_uv || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // A negative height asks for a vertically inverted source: use its
+  // magnitude and read both source planes from their last row upward.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_uv += (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+
+  if (mode == kRotate0) {
+    // No rotation requested: format conversion only.
+    return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv,
+                      dst_y, dst_stride_y, dst_u, dst_stride_u,
+                      dst_v, dst_stride_v,
+                      width, height);
+  }
+
+  if (mode == kRotate90) {
+    // Y at full size, then UV split into U and V at half size.
+    RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+    RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u,
+               dst_v, dst_stride_v, halfwidth, halfheight);
+    return 0;
+  }
+
+  if (mode == kRotate270) {
+    // Y at full size, then UV split into U and V at half size.
+    RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+    RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u,
+                dst_v, dst_stride_v, halfwidth, halfheight);
+    return 0;
+  }
+
+  if (mode == kRotate180) {
+    // Y at full size, then UV split into U and V at half size.
+    RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+    RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u,
+                dst_v, dst_stride_v, halfwidth, halfheight);
+    return 0;
+  }
+
+  // Unrecognized rotation mode.
+  return -1;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/rotate_any.cc b/libs/libyuv/source/rotate_any.cc
new file mode 100644
index 0000000000..31a74c3155
--- /dev/null
+++ b/libs/libyuv/source/rotate_any.cc
@@ -0,0 +1,80 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define TANY(NAMEANY, TPOS_SIMD, MASK)                                         \
+    void NAMEANY(const uint8* src, int src_stride,                             \
+                 uint8* dst, int dst_stride, int width) {                      \
+      int r = width & MASK;  /* Columns left over for the C version. */       \
+      int n = width - r;  /* Columns handled by the SIMD version. */           \
+      if (n > 0) {                                                             \
+        TPOS_SIMD(src, src_stride, dst, dst_stride, n);                        \
+      }                                                                        \
+      TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
+    }
+
+#ifdef HAS_TRANSPOSEWX8_NEON
+TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_SSSE3
+TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
+TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
+#endif
+#ifdef HAS_TRANSPOSEWX8_DSPR2
+TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
+#endif
+#undef TANY  // The generator macro is private to this file section.
+
+#define TUVANY(NAMEANY, TPOS_SIMD, MASK)                                       \
+    void NAMEANY(const uint8* src, int src_stride,                             \
+                uint8* dst_a, int dst_stride_a,                                \
+                uint8* dst_b, int dst_stride_b, int width) {                   \
+      int r = width & MASK;  /* Columns left over for the C version. */       \
+      int n = width - r;  /* Columns handled by the SIMD version. */           \
+      if (n > 0) {                                                             \
+        TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,   \
+                  n);                                                          \
+      }                                                                        \
+      TransposeUVWx8_C(src + n * 2, src_stride,                                \
+                       dst_a + n * dst_stride_a, dst_stride_a,                 \
+                       dst_b + n * dst_stride_b, dst_stride_b, r);             \
+    }
+
+#ifdef HAS_TRANSPOSEUVWX8_NEON
+TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_SSE2
+TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_DSPR2
+TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
+#endif
+#undef TUVANY  // The generator macro is private to this file section.
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+
+
+
+
diff --git a/libs/libyuv/source/rotate_argb.cc b/libs/libyuv/source/rotate_argb.cc
new file mode 100644
index 0000000000..787c0ad1be
--- /dev/null
+++ b/libs/libyuv/source/rotate_argb.cc
@@ -0,0 +1,205 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGBScale has a function to copy pixels to a row, striding each source
+// pixel by a constant. It is declared here for use by ARGBTranspose.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || \
+    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+                               int src_stepx, uint8* dst_ptr, int dst_width);
+#endif
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
+                               int src_stepx, uint8* dst_ptr, int dst_width);
+#endif
+
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
+                            int src_stepx, uint8* dst_ptr, int dst_width);
+
+static void ARGBTranspose(const uint8* src, int src_stride,
+                          uint8* dst, int dst_stride, int width, int height) {
+  int columns_left;
+  int step_in_pixels = src_stride >> 2;  // Source stride in 4 byte pixels.
+  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) {  // Width of dest.
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) {  // Width of dest.
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+  }
+#endif
+  // Each source column becomes one destination row of |height| pixels.
+  for (columns_left = width; columns_left > 0; --columns_left) {
+    ScaleARGBRowDownEven(src, 0, step_in_pixels, dst, height);
+    src += 4;
+    dst += dst_stride;
+  }
+}
+
+void ARGBRotate90(const uint8* src, int src_stride,
+                  uint8* dst, int dst_stride, int width, int height) {
+  // A 90 degree rotation is an ARGBTranspose whose source rows are
+  // consumed bottom-to-top: pass the last source row together with
+  // the negated source stride.
+  const uint8* last_row = src + src_stride * (height - 1);
+
+  ARGBTranspose(last_row, -src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate270(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride, int width, int height) {
+  // A 270 degree rotation is an ARGBTranspose whose destination rows
+  // are produced bottom-to-top: pass the last destination row together
+  // with the negated destination stride.
+  uint8* last_row = dst + dst_stride * (width - 1);
+
+  ARGBTranspose(src, src_stride, last_row, -dst_stride, width, height);
+}
+
+void ARGBRotate180(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride, int width, int height) {
+  // Swap first and last row and mirror the content. Uses a temporary row.
+  align_buffer_64(row, width * 4);
+  const uint8* src_bot = src + src_stride * (height - 1);
+  uint8* dst_bot = dst + dst_stride * (height - 1);
+  int half_height = (height + 1) >> 1;
+  int y;
+  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+      ARGBMirrorRow_C;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Overrides the SSE2 choice above.
+    ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMirrorRow = ARGBMirrorRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+  // Walk inward from both ends: src/dst move down, src_bot/dst_bot move up.
+  // Odd height will harmlessly mirror the middle row twice.
+  for (y = 0; y < half_height; ++y) {
+    ARGBMirrorRow(src, row, width);  // Mirror first row into a buffer
+    ARGBMirrorRow(src_bot, dst, width);  // Mirror last row into first row
+    CopyRow(row, dst_bot, width * 4);  // Copy first mirrored row into last
+    src += src_stride;
+    dst += dst_stride;
+    src_bot -= src_stride;
+    dst_bot -= dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb, int width, int height,
+               enum RotationMode mode) {
+  // Rotates an ARGB image by |mode|. Returns -1 for a null pointer, a
+  // non-positive width, a zero height or an unknown mode.
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // A negative height asks for a vertically inverted source: use its
+  // magnitude and read the source rows from the last one upward.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+
+  if (mode == kRotate0) {
+    // No rotation requested: plain copy, reporting ARGBCopy's result.
+    return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                    width, height);
+  }
+  if (mode == kRotate90) {
+    ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                 width, height);
+    return 0;
+  }
+  if (mode == kRotate270) {
+    ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                  width, height);
+    return 0;
+  }
+  if (mode == kRotate180) {
+    ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                  width, height);
+    return 0;
+  }
+  // Unrecognized rotation mode.
+  return -1;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/rotate_common.cc b/libs/libyuv/source/rotate_common.cc
new file mode 100644
index 0000000000..b33a9a0c6e
--- /dev/null
+++ b/libs/libyuv/source/rotate_common.cc
@@ -0,0 +1,92 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Transposes a width x 8 byte tile.  src supplies 8 rows (src_stride apart)
+// of width bytes; each source column becomes 8 contiguous bytes of one
+// destination row, and destination rows are dst_stride apart.
+void TransposeWx8_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    int row;
+    for (row = 0; row < 8; ++row) {
+      // Walk down the source column while writing along the output row.
+      dst[row] = src[row * src_stride];
+    }
+    src += 1;           // Next source column.
+    dst += dst_stride;  // Next destination row.
+  }
+}
+
+// Transposes a width x 8 tile of interleaved byte pairs and de-interleaves
+// it on the fly: the first byte of every pair goes to dst_a, the second to
+// dst_b.  src supplies 8 rows (src_stride apart) of width pairs; dst_a and
+// dst_b each receive width rows of 8 bytes.
+void TransposeUVWx8_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b, int width) {
+  int remaining = width;
+  // One pass per source pair column.
+  while (remaining > 0) {
+    int row;
+    for (row = 0; row < 8; ++row) {
+      // Offset of this column's pair within source row |row|.
+      const int offset = row * src_stride;
+      dst_a[row] = src[offset + 0];
+      dst_b[row] = src[offset + 1];
+    }
+
+    // Step to the next pair in the source (two bytes) and to the next
+    // output row in each destination plane.
+    src += 2;
+    dst_a += dst_stride_a;
+    dst_b += dst_stride_b;
+    --remaining;
+  }
+}
+
+// Generic byte transpose: dst row x, column y takes src row y, column x.
+void TransposeWxH_C(const uint8* src, int src_stride,
+                    uint8* dst, int dst_stride, int width, int height) {
+  int x, y;
+  for (x = 0; x < width; ++x) {
+    const int dst_row = x * dst_stride;  // Start of destination row x.
+    for (y = 0; y < height; ++y) {
+      dst[dst_row + y] = src[y * src_stride + x];
+    }
+  }
+}
+
+// Generic transpose of interleaved byte pairs, split into dst_a and dst_b.
+void TransposeUVWxH_C(const uint8* src, int src_stride,
+                      uint8* dst_a, int dst_stride_a,
+                      uint8* dst_b, int dst_stride_b, int width, int height) {
+  int x, y;
+  for (x = 0; x < width; ++x) {  // x counts pairs; its bytes are 2x, 2x + 1.
+    for (y = 0; y < height; ++y) {
+      const int first = 2 * x + y * src_stride;  // First byte of the pair.
+      dst_a[y + x * dst_stride_a] = src[first];
+      dst_b[y + x * dst_stride_b] = src[first + 1];
+    }
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/rotate_gcc.cc b/libs/libyuv/source/rotate_gcc.cc
new file mode 100644
index 0000000000..cbe870caa7
--- /dev/null
+++ b/libs/libyuv/source/rotate_gcc.cc
@@ -0,0 +1,368 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+// Transpose Wx8 in 8x8 byte blocks. 32 or 64 bit, but not NaCL for 64 bit.
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
+  asm volatile (
+    // Round 1: load 8 bytes from each of 8 source rows (%3 bytes apart) and
+    // interleave row pairs bytewise; palignr $0x8 on itself swaps the halves.
+    LABELALIGN
+  "1:                                            \n"
+    "movq       (%0),%%xmm0                      \n"
+    "movq       (%0,%3),%%xmm1                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm1,%%xmm0                    \n"
+    "movq       (%0),%%xmm2                      \n"
+    "movdqa     %%xmm0,%%xmm1                    \n"
+    "palignr    $0x8,%%xmm1,%%xmm1               \n"
+    "movq       (%0,%3),%%xmm3                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm3,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm3                    \n"
+    "movq       (%0),%%xmm4                      \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "movq       (%0,%3),%%xmm5                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm5,%%xmm4                    \n"
+    "movdqa     %%xmm4,%%xmm5                    \n"
+    "movq       (%0),%%xmm6                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       (%0,%3),%%xmm7                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "punpcklbw  %%xmm7,%%xmm6                    \n"
+    "neg        %3                               \n"  // Negate stride for the lea below.
+    "movdqa     %%xmm6,%%xmm7                    \n"
+    "lea        0x8(%0,%3,8),%0                  \n"  // src = src - 8 * stride + 8.
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "neg        %3                               \n"  // Restore the stride.
+    // Round 2: interleave 16-bit units (punpcklwd).
+    "punpcklwd  %%xmm2,%%xmm0                    \n"
+    "punpcklwd  %%xmm3,%%xmm1                    \n"
+    "movdqa     %%xmm0,%%xmm2                    \n"
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "palignr    $0x8,%%xmm2,%%xmm2               \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "punpcklwd  %%xmm6,%%xmm4                    \n"
+    "punpcklwd  %%xmm7,%%xmm5                    \n"
+    "movdqa     %%xmm4,%%xmm6                    \n"
+    "movdqa     %%xmm5,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    // Round 3: interleave 32-bit units (punpckldq), then store each 8-byte
+    // result as one destination row (rows are %4 bytes apart).
+    "punpckldq  %%xmm4,%%xmm0                    \n"
+    "movq       %%xmm0,(%1)                      \n"
+    "movdqa     %%xmm0,%%xmm4                    \n"
+    "palignr    $0x8,%%xmm4,%%xmm4               \n"
+    "movq       %%xmm4,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm6,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm6                    \n"
+    "movq       %%xmm2,(%1)                      \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "punpckldq  %%xmm5,%%xmm1                    \n"
+    "movq       %%xmm6,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "movdqa     %%xmm1,%%xmm5                    \n"
+    "movq       %%xmm1,(%1)                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       %%xmm5,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm7,%%xmm3                    \n"
+    "movq       %%xmm3,(%1)                      \n"
+    "movdqa     %%xmm3,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "sub        $0x8,%2                          \n"  // 8 columns done; sets flags for jg.
+    "movq       %%xmm7,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "jg         1b                               \n"  // Loop while width > 0.
+    : "+r"(src),    // %0
+      "+r"(dst),    // %1
+      "+r"(width)   // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "r"((intptr_t)(dst_stride))   // %4
+    : "memory", "cc",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)
+
+// Transpose Wx8 in 16x8 byte blocks (uses xmm8-xmm15). 64 bit
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width) {
+  asm volatile (
+    // Round 1: load 16 bytes from each of 8 source rows and interleave row
+    // pairs bytewise: columns 0-7 go to xmm0-7, columns 8-15 to xmm8-15.
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu     (%0),%%xmm0                      \n"
+    "movdqu     (%0,%3),%%xmm1                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "movdqa     %%xmm0,%%xmm8                    \n"
+    "punpcklbw  %%xmm1,%%xmm0                    \n"
+    "punpckhbw  %%xmm1,%%xmm8                    \n"
+    "movdqu     (%0),%%xmm2                      \n"
+    "movdqa     %%xmm0,%%xmm1                    \n"
+    "movdqa     %%xmm8,%%xmm9                    \n"
+    "palignr    $0x8,%%xmm1,%%xmm1               \n"
+    "palignr    $0x8,%%xmm9,%%xmm9               \n"
+    "movdqu     (%0,%3),%%xmm3                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "movdqa     %%xmm2,%%xmm10                   \n"
+    "punpcklbw  %%xmm3,%%xmm2                    \n"
+    "punpckhbw  %%xmm3,%%xmm10                   \n"
+    "movdqa     %%xmm2,%%xmm3                    \n"
+    "movdqa     %%xmm10,%%xmm11                  \n"
+    "movdqu     (%0),%%xmm4                      \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "palignr    $0x8,%%xmm11,%%xmm11             \n"
+    "movdqu     (%0,%3),%%xmm5                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "movdqa     %%xmm4,%%xmm12                   \n"
+    "punpcklbw  %%xmm5,%%xmm4                    \n"
+    "punpckhbw  %%xmm5,%%xmm12                   \n"
+    "movdqa     %%xmm4,%%xmm5                    \n"
+    "movdqa     %%xmm12,%%xmm13                  \n"
+    "movdqu     (%0),%%xmm6                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "palignr    $0x8,%%xmm13,%%xmm13             \n"
+    "movdqu     (%0,%3),%%xmm7                   \n"
+    "lea        (%0,%3,2),%0                     \n"
+    "movdqa     %%xmm6,%%xmm14                   \n"
+    "punpcklbw  %%xmm7,%%xmm6                    \n"
+    "punpckhbw  %%xmm7,%%xmm14                   \n"
+    "neg        %3                               \n"  // Negate stride for the lea below.
+    "movdqa     %%xmm6,%%xmm7                    \n"
+    "movdqa     %%xmm14,%%xmm15                  \n"
+    "lea        0x10(%0,%3,8),%0                 \n"  // src = src - 8 * stride + 16.
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "palignr    $0x8,%%xmm15,%%xmm15             \n"
+    "neg        %3                               \n"  // Restore the stride.
+    // Round 2: interleave 16-bit units (punpcklwd) in both register groups.
+    "punpcklwd  %%xmm2,%%xmm0                    \n"
+    "punpcklwd  %%xmm3,%%xmm1                    \n"
+    "movdqa     %%xmm0,%%xmm2                    \n"
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "palignr    $0x8,%%xmm2,%%xmm2               \n"
+    "palignr    $0x8,%%xmm3,%%xmm3               \n"
+    "punpcklwd  %%xmm6,%%xmm4                    \n"
+    "punpcklwd  %%xmm7,%%xmm5                    \n"
+    "movdqa     %%xmm4,%%xmm6                    \n"
+    "movdqa     %%xmm5,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "punpcklwd  %%xmm10,%%xmm8                   \n"
+    "punpcklwd  %%xmm11,%%xmm9                   \n"
+    "movdqa     %%xmm8,%%xmm10                   \n"
+    "movdqa     %%xmm9,%%xmm11                   \n"
+    "palignr    $0x8,%%xmm10,%%xmm10             \n"
+    "palignr    $0x8,%%xmm11,%%xmm11             \n"
+    "punpcklwd  %%xmm14,%%xmm12                  \n"
+    "punpcklwd  %%xmm15,%%xmm13                  \n"
+    "movdqa     %%xmm12,%%xmm14                  \n"
+    "movdqa     %%xmm13,%%xmm15                  \n"
+    "palignr    $0x8,%%xmm14,%%xmm14             \n"
+    "palignr    $0x8,%%xmm15,%%xmm15             \n"
+    // Round 3: interleave 32-bit units (punpckldq), then store 16 rows of
+    // 8 bytes: first the xmm0-7 group, then the xmm8-15 group.
+    "punpckldq  %%xmm4,%%xmm0                    \n"
+    "movq       %%xmm0,(%1)                      \n"
+    "movdqa     %%xmm0,%%xmm4                    \n"
+    "palignr    $0x8,%%xmm4,%%xmm4               \n"
+    "movq       %%xmm4,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm6,%%xmm2                    \n"
+    "movdqa     %%xmm2,%%xmm6                    \n"
+    "movq       %%xmm2,(%1)                      \n"
+    "palignr    $0x8,%%xmm6,%%xmm6               \n"
+    "punpckldq  %%xmm5,%%xmm1                    \n"
+    "movq       %%xmm6,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "movdqa     %%xmm1,%%xmm5                    \n"
+    "movq       %%xmm1,(%1)                      \n"
+    "palignr    $0x8,%%xmm5,%%xmm5               \n"
+    "movq       %%xmm5,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm7,%%xmm3                    \n"
+    "movq       %%xmm3,(%1)                      \n"
+    "movdqa     %%xmm3,%%xmm7                    \n"
+    "palignr    $0x8,%%xmm7,%%xmm7               \n"
+    "movq       %%xmm7,(%1,%4)                   \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm12,%%xmm8                   \n"
+    "movq       %%xmm8,(%1)                      \n"
+    "movdqa     %%xmm8,%%xmm12                   \n"
+    "palignr    $0x8,%%xmm12,%%xmm12             \n"
+    "movq       %%xmm12,(%1,%4)                  \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm14,%%xmm10                  \n"
+    "movdqa     %%xmm10,%%xmm14                  \n"
+    "movq       %%xmm10,(%1)                     \n"
+    "palignr    $0x8,%%xmm14,%%xmm14             \n"
+    "punpckldq  %%xmm13,%%xmm9                   \n"
+    "movq       %%xmm14,(%1,%4)                  \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "movdqa     %%xmm9,%%xmm13                   \n"
+    "movq       %%xmm9,(%1)                      \n"
+    "palignr    $0x8,%%xmm13,%%xmm13             \n"
+    "movq       %%xmm13,(%1,%4)                  \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "punpckldq  %%xmm15,%%xmm11                  \n"
+    "movq       %%xmm11,(%1)                     \n"
+    "movdqa     %%xmm11,%%xmm15                  \n"
+    "palignr    $0x8,%%xmm15,%%xmm15             \n"
+    "sub        $0x10,%2                         \n"  // 16 columns done; sets flags for jg.
+    "movq       %%xmm15,(%1,%4)                  \n"
+    "lea        (%1,%4,2),%1                     \n"
+    "jg         1b                               \n"  // Loop while width > 0.
+    : "+r"(src),    // %0
+      "+r"(dst),    // %1
+      "+r"(width)   // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "r"((intptr_t)(dst_stride))   // %4
+    : "memory", "cc",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
+  );
+}
+#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+
+// Transpose UV Wx8 in blocks of 8 pairs x 8 rows, splitting U and V.  64 bit.
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b, int width) {
+  asm volatile (
+    // Round 1: load 16 bytes (8 UV pairs) from each of 8 source rows and
+    // interleave row pairs bytewise; xmm8 is scratch for the high halves.
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu     (%0),%%xmm0                      \n"
+    "movdqu     (%0,%4),%%xmm1                   \n"
+    "lea        (%0,%4,2),%0                     \n"
+    "movdqa     %%xmm0,%%xmm8                    \n"
+    "punpcklbw  %%xmm1,%%xmm0                    \n"
+    "punpckhbw  %%xmm1,%%xmm8                    \n"
+    "movdqa     %%xmm8,%%xmm1                    \n"
+    "movdqu     (%0),%%xmm2                      \n"
+    "movdqu     (%0,%4),%%xmm3                   \n"
+    "lea        (%0,%4,2),%0                     \n"
+    "movdqa     %%xmm2,%%xmm8                    \n"
+    "punpcklbw  %%xmm3,%%xmm2                    \n"
+    "punpckhbw  %%xmm3,%%xmm8                    \n"
+    "movdqa     %%xmm8,%%xmm3                    \n"
+    "movdqu     (%0),%%xmm4                      \n"
+    "movdqu     (%0,%4),%%xmm5                   \n"
+    "lea        (%0,%4,2),%0                     \n"
+    "movdqa     %%xmm4,%%xmm8                    \n"
+    "punpcklbw  %%xmm5,%%xmm4                    \n"
+    "punpckhbw  %%xmm5,%%xmm8                    \n"
+    "movdqa     %%xmm8,%%xmm5                    \n"
+    "movdqu     (%0),%%xmm6                      \n"
+    "movdqu     (%0,%4),%%xmm7                   \n"
+    "lea        (%0,%4,2),%0                     \n"
+    "movdqa     %%xmm6,%%xmm8                    \n"
+    "punpcklbw  %%xmm7,%%xmm6                    \n"
+    "neg        %4                               \n"  // Negate stride for the lea below.
+    "lea        0x10(%0,%4,8),%0                 \n"  // src = src - 8 * stride + 16.
+    "punpckhbw  %%xmm7,%%xmm8                    \n"
+    "movdqa     %%xmm8,%%xmm7                    \n"
+    "neg        %4                               \n"  // Restore the stride.
+    // Round 2: interleave 16-bit units; xmm8 and xmm9 are scratch.
+    "movdqa     %%xmm0,%%xmm8                    \n"
+    "movdqa     %%xmm1,%%xmm9                    \n"
+    "punpckhwd  %%xmm2,%%xmm8                    \n"
+    "punpckhwd  %%xmm3,%%xmm9                    \n"
+    "punpcklwd  %%xmm2,%%xmm0                    \n"
+    "punpcklwd  %%xmm3,%%xmm1                    \n"
+    "movdqa     %%xmm8,%%xmm2                    \n"
+    "movdqa     %%xmm9,%%xmm3                    \n"
+    "movdqa     %%xmm4,%%xmm8                    \n"
+    "movdqa     %%xmm5,%%xmm9                    \n"
+    "punpckhwd  %%xmm6,%%xmm8                    \n"
+    "punpckhwd  %%xmm7,%%xmm9                    \n"
+    "punpcklwd  %%xmm6,%%xmm4                    \n"
+    "punpcklwd  %%xmm7,%%xmm5                    \n"
+    "movdqa     %%xmm8,%%xmm6                    \n"
+    "movdqa     %%xmm9,%%xmm7                    \n"
+    // Round 3: interleave 32-bit units, then store the low 8 bytes of each
+    // result to dst_a (%1) and the high 8 bytes to dst_b (%2).
+    "movdqa     %%xmm0,%%xmm8                    \n"
+    "punpckldq  %%xmm4,%%xmm0                    \n"
+    "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
+    "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
+    "punpckhdq  %%xmm4,%%xmm8                    \n"
+    "movlpd     %%xmm8,(%1,%5)                   \n"
+    "lea        (%1,%5,2),%1                     \n"
+    "movhpd     %%xmm8,(%2,%6)                   \n"
+    "lea        (%2,%6,2),%2                     \n"
+    "movdqa     %%xmm2,%%xmm8                    \n"
+    "punpckldq  %%xmm6,%%xmm2                    \n"
+    "movlpd     %%xmm2,(%1)                      \n"
+    "movhpd     %%xmm2,(%2)                      \n"
+    "punpckhdq  %%xmm6,%%xmm8                    \n"
+    "movlpd     %%xmm8,(%1,%5)                   \n"
+    "lea        (%1,%5,2),%1                     \n"
+    "movhpd     %%xmm8,(%2,%6)                   \n"
+    "lea        (%2,%6,2),%2                     \n"
+    "movdqa     %%xmm1,%%xmm8                    \n"
+    "punpckldq  %%xmm5,%%xmm1                    \n"
+    "movlpd     %%xmm1,(%1)                      \n"
+    "movhpd     %%xmm1,(%2)                      \n"
+    "punpckhdq  %%xmm5,%%xmm8                    \n"
+    "movlpd     %%xmm8,(%1,%5)                   \n"
+    "lea        (%1,%5,2),%1                     \n"
+    "movhpd     %%xmm8,(%2,%6)                   \n"
+    "lea        (%2,%6,2),%2                     \n"
+    "movdqa     %%xmm3,%%xmm8                    \n"
+    "punpckldq  %%xmm7,%%xmm3                    \n"
+    "movlpd     %%xmm3,(%1)                      \n"
+    "movhpd     %%xmm3,(%2)                      \n"
+    "punpckhdq  %%xmm7,%%xmm8                    \n"
+    "sub        $0x8,%3                          \n"  // 8 pairs done; sets flags for jg.
+    "movlpd     %%xmm8,(%1,%5)                   \n"
+    "lea        (%1,%5,2),%1                     \n"
+    "movhpd     %%xmm8,(%2,%6)                   \n"
+    "lea        (%2,%6,2),%2                     \n"
+    "jg         1b                               \n"  // Loop while width > 0.
+    : "+r"(src),    // %0
+      "+r"(dst_a),  // %1
+      "+r"(dst_b),  // %2
+      "+r"(width)   // %3
+    : "r"((intptr_t)(src_stride)),    // %4
+      "r"((intptr_t)(dst_stride_a)),  // %5
+      "r"((intptr_t)(dst_stride_b))   // %6
+    : "memory", "cc",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9"
+  );
+}
+#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/rotate_mips.cc b/libs/libyuv/source/rotate_mips.cc
new file mode 100644
index 0000000000..23e89fbad4
--- /dev/null
+++ b/libs/libyuv/source/rotate_mips.cc
@@ -0,0 +1,484 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void TransposeWx8_DSPR2(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride, int width) {
+   __asm__ __volatile__ (  // Wx8 byte transpose; one source column per pass.
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "sll              $t2, %[src_stride], 0x1          \n" // src_stride x 2
+      "sll              $t4, %[src_stride], 0x2          \n" // src_stride x 4
+      "sll              $t9, %[src_stride], 0x3          \n" // src_stride x 8
+      "addu             $t3, $t2, %[src_stride]          \n" // src_stride x 3
+      "addu             $t5, $t4, %[src_stride]          \n" // src_stride x 5
+      "addu             $t6, $t2, $t4                    \n" // src_stride x 6
+      "andi             $t0, %[dst], 0x3                 \n"
+      "andi             $t1, %[dst_stride], 0x3          \n"
+      "or               $t0, $t0, $t1                    \n"
+      "bnez             $t0, 11f                         \n" // unaligned path
+      " subu            $t7, $t9, %[src_stride]          \n" // src_stride x 7
+// Fast path: dst and dst_stride are both word aligned, so plain sw is used.
+    "1:                                                  \n"
+      "lbu              $t0, 0(%[src])                   \n"
+      "lbux             $t1, %[src_stride](%[src])       \n"
+      "lbux             $t8, $t2(%[src])                 \n"
+      "lbux             $t9, $t3(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s0, $t8, $t0                    \n" // pack rows 0-3
+      "lbux             $t0, $t4(%[src])                 \n"
+      "lbux             $t1, $t5(%[src])                 \n"
+      "lbux             $t8, $t6(%[src])                 \n"
+      "lbux             $t9, $t7(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s1, $t8, $t0                    \n" // pack rows 4-7
+      "sw               $s0, 0(%[dst])                   \n"
+      "addiu            %[width], -1                     \n" // one column done
+      "addiu            %[src], 1                        \n" // next src column
+      "sw               $s1, 4(%[dst])                   \n"
+      "bnez             %[width], 1b                     \n" // NOTE(review): no width == 0 guard on entry
+      " addu            %[dst], %[dst], %[dst_stride]    \n" // next dst row
+      "b                2f                               \n" // NOTE(review): lbu at 11: appears to run in this delay slot
+// Slow path: dst or dst_stride is not word aligned, so swr/swl pairs are used.
+   "11:                                                  \n"
+      "lbu              $t0, 0(%[src])                   \n"
+      "lbux             $t1, %[src_stride](%[src])       \n"
+      "lbux             $t8, $t2(%[src])                 \n"
+      "lbux             $t9, $t3(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s0, $t8, $t0                    \n" // pack rows 0-3
+      "lbux             $t0, $t4(%[src])                 \n"
+      "lbux             $t1, $t5(%[src])                 \n"
+      "lbux             $t8, $t6(%[src])                 \n"
+      "lbux             $t9, $t7(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s1, $t8, $t0                    \n" // pack rows 4-7
+      "swr              $s0, 0(%[dst])                   \n"
+      "swl              $s0, 3(%[dst])                   \n"
+      "addiu            %[width], -1                     \n" // one column done
+      "addiu            %[src], 1                        \n" // next src column
+      "swr              $s1, 4(%[dst])                   \n"
+      "swl              $s1, 7(%[dst])                   \n"
+      "bnez             %[width], 11b                    \n"
+       "addu             %[dst], %[dst], %[dst_stride]   \n" // next dst row
+    "2:                                                  \n"
+      ".set pop                                          \n"
+      :[src] "+r" (src),
+       [dst] "+r" (dst),
+       [width] "+r" (width)
+      :[src_stride] "r" (src_stride),
+       [dst_stride] "r" (dst_stride)
+      : "t0", "t1",  "t2", "t3", "t4", "t5",
+        "t6", "t7", "t8", "t9",
+        "s0", "s1", "memory"  // the asm stores through dst
+  );
+}
+
+void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
+                                  uint8* dst, int dst_stride, int width) {
+  __asm__ __volatile__ (
+      ".set noat                                         \n"
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "beqz             %[width], 2f                     \n"
+      " sll             $t2, %[src_stride], 0x1          \n"  // src_stride x 2
+      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
+      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
+      "addu             $t3, $t2, %[src_stride]          \n"
+      "addu             $t5, $t4, %[src_stride]          \n"
+      "addu             $t6, $t2, $t4                    \n"
+
+      "srl              $AT, %[width], 0x2               \n"
+      "andi             $t0, %[dst], 0x3                 \n"
+      "andi             $t1, %[dst_stride], 0x3          \n"
+      "or               $t0, $t0, $t1                    \n"
+      "bnez             $t0, 11f                         \n"
+      " subu            $t7, $t9, %[src_stride]          \n"
+//dst + dst_stride word aligned
+      "1:                                                \n"
+      "lw               $t0, 0(%[src])                   \n"
+      "lwx              $t1, %[src_stride](%[src])       \n"
+      "lwx              $t8, $t2(%[src])                 \n"
+      "lwx              $t9, $t3(%[src])                 \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 21 | 01 | 20 | 00 |
+  // s1 = | 23 | 03 | 22 | 02 |
+  // s2 = | 31 | 11 | 30 | 10 |
+  // s3 = | 33 | 13 | 32 | 12 |
+
+      "precr.qb.ph     $s4, $s1, $s0                     \n"
+      "precrq.qb.ph    $s5, $s1, $s0                     \n"
+      "precr.qb.ph     $s6, $s3, $s2                     \n"
+      "precrq.qb.ph    $s7, $s3, $s2                     \n"
+
+  // s4 = | 03 | 02 | 01 | 00 |
+  // s5 = | 23 | 22 | 21 | 20 |
+  // s6 = | 13 | 12 | 11 | 10 |
+  // s7 = | 33 | 32 | 31 | 30 |
+
+      "lwx              $t0, $t4(%[src])                 \n"
+      "lwx              $t1, $t5(%[src])                 \n"
+      "lwx              $t8, $t6(%[src])                 \n"
+      "lwx              $t9, $t7(%[src])                 \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 25 | 05 | 24 | 04 |
+  // s1 = | 27 | 07 | 26 | 06 |
+  // s2 = | 35 | 15 | 34 | 14 |
+  // s3 = | 37 | 17 | 36 | 16 |
+
+      "precr.qb.ph     $t0, $s1, $s0                     \n"
+      "precrq.qb.ph    $t1, $s1, $s0                     \n"
+      "precr.qb.ph     $t8, $s3, $s2                     \n"
+      "precrq.qb.ph    $t9, $s3, $s2                     \n"
+
+  // t0 = | 07 | 06 | 05 | 04 |
+  // t1 = | 27 | 26 | 25 | 24 |
+  // t8 = | 17 | 16 | 15 | 14 |
+  // t9 = | 37 | 36 | 35 | 34 |
+
+      "addu            $s0, %[dst], %[dst_stride]        \n"
+      "addu            $s1, $s0, %[dst_stride]           \n"
+      "addu            $s2, $s1, %[dst_stride]           \n"
+
+      "sw              $s4, 0(%[dst])                    \n"
+      "sw              $t0, 4(%[dst])                    \n"
+      "sw              $s6, 0($s0)                       \n"
+      "sw              $t8, 4($s0)                       \n"
+      "sw              $s5, 0($s1)                       \n"
+      "sw              $t1, 4($s1)                       \n"
+      "sw              $s7, 0($s2)                       \n"
+      "sw              $t9, 4($s2)                       \n"
+
+      "addiu            $AT, -1                          \n"
+      "addiu            %[src], 4                        \n"
+
+      "bnez             $AT, 1b                          \n"
+      " addu            %[dst], $s2, %[dst_stride]       \n"
+      "b                2f                               \n"
+//dst + dst_stride unaligned
+      "11:                                               \n"
+      "lw               $t0, 0(%[src])                   \n"
+      "lwx              $t1, %[src_stride](%[src])       \n"
+      "lwx              $t8, $t2(%[src])                 \n"
+      "lwx              $t9, $t3(%[src])                 \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 21 | 01 | 20 | 00 |
+  // s1 = | 23 | 03 | 22 | 02 |
+  // s2 = | 31 | 11 | 30 | 10 |
+  // s3 = | 33 | 13 | 32 | 12 |
+
+      "precr.qb.ph     $s4, $s1, $s0                     \n"
+      "precrq.qb.ph    $s5, $s1, $s0                     \n"
+      "precr.qb.ph     $s6, $s3, $s2                     \n"
+      "precrq.qb.ph    $s7, $s3, $s2                     \n"
+
+  // s4 = | 03 | 02 | 01 | 00 |
+  // s5 = | 23 | 22 | 21 | 20 |
+  // s6 = | 13 | 12 | 11 | 10 |
+  // s7 = | 33 | 32 | 31 | 30 |
+
+      "lwx              $t0, $t4(%[src])                 \n"
+      "lwx              $t1, $t5(%[src])                 \n"
+      "lwx              $t8, $t6(%[src])                 \n"
+      "lwx              $t9, $t7(%[src])                 \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 25 | 05 | 24 | 04 |
+  // s1 = | 27 | 07 | 26 | 06 |
+  // s2 = | 35 | 15 | 34 | 14 |
+  // s3 = | 37 | 17 | 36 | 16 |
+
+      "precr.qb.ph     $t0, $s1, $s0                     \n"
+      "precrq.qb.ph    $t1, $s1, $s0                     \n"
+      "precr.qb.ph     $t8, $s3, $s2                     \n"
+      "precrq.qb.ph    $t9, $s3, $s2                     \n"
+
+  // t0 = | 07 | 06 | 05 | 04 |
+  // t1 = | 27 | 26 | 25 | 24 |
+  // t8 = | 17 | 16 | 15 | 14 |
+  // t9 = | 37 | 36 | 35 | 34 |
+
+      "addu            $s0, %[dst], %[dst_stride]        \n"
+      "addu            $s1, $s0, %[dst_stride]           \n"
+      "addu            $s2, $s1, %[dst_stride]           \n"
+
+      "swr              $s4, 0(%[dst])                   \n"
+      "swl              $s4, 3(%[dst])                   \n"
+      "swr              $t0, 4(%[dst])                   \n"
+      "swl              $t0, 7(%[dst])                   \n"
+      "swr              $s6, 0($s0)                      \n"
+      "swl              $s6, 3($s0)                      \n"
+      "swr              $t8, 4($s0)                      \n"
+      "swl              $t8, 7($s0)                      \n"
+      "swr              $s5, 0($s1)                      \n"
+      "swl              $s5, 3($s1)                      \n"
+      "swr              $t1, 4($s1)                      \n"
+      "swl              $t1, 7($s1)                      \n"
+      "swr              $s7, 0($s2)                      \n"
+      "swl              $s7, 3($s2)                      \n"
+      "swr              $t9, 4($s2)                      \n"
+      "swl              $t9, 7($s2)                      \n"
+
+      "addiu            $AT, -1                          \n"
+      "addiu            %[src], 4                        \n"
+
+      "bnez             $AT, 11b                         \n"
+      " addu            %[dst], $s2, %[dst_stride]       \n"
+      "2:                                                \n"
+      ".set pop                                          \n"
+      ".set at                                           \n"
+      :[src] "+r" (src),
+       [dst] "+r" (dst),
+       [width] "+r" (width)
+      :[src_stride] "r" (src_stride),
+       [dst_stride] "r" (dst_stride)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
+        "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
+  );
+}
+
+void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,  // 8 rows of a,b byte pairs
+                               uint8* dst_a, int dst_stride_a,  // receives the a bytes
+                               uint8* dst_b, int dst_stride_b,  // receives the b bytes
+                               int width) {  // pairs per src row, two per loop pass
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "srl             $t1, %[width], 1                  \n" // loop count, tested first
+      "beqz            $t1, 2f                           \n" // width < 2: counter must not wrap
+      " sll            $t2, %[src_stride], 0x1           \n" // src_stride x 2
+      "sll             $t4, %[src_stride], 0x2           \n" // src_stride x 4
+      "sll             $t9, %[src_stride], 0x3           \n" // src_stride x 8
+      "addu            $t3, $t2, %[src_stride]           \n" // src_stride x 3
+      "addu            $t5, $t4, %[src_stride]           \n" // src_stride x 5
+      "addu            $t6, $t2, $t4                     \n" // src_stride x 6
+      "subu            $t7, $t9, %[src_stride]           \n" // src_stride x 7
+
+// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
+      "andi            $t0, %[dst_a], 0x3                \n"
+      "andi            $t8, %[dst_b], 0x3                \n"
+      "or              $t0, $t0, $t8                     \n"
+      "andi            $t8, %[dst_stride_a], 0x3         \n"
+      "andi            $s5, %[dst_stride_b], 0x3         \n"
+      "or              $t8, $t8, $s5                     \n"
+      "or              $t0, $t0, $t8                     \n"
+      "bnez            $t0, 11f                          \n" // any low bit set: use swr/swl path
+      " nop                                              \n"
+// dst + dst_stride word aligned (both, a & b dst addresses)
+    "1:                                                  \n"
+      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
+      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
+      "addu            $s5, %[dst_a], %[dst_stride_a]    \n" // second dst_a row
+      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
+      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
+      "addu            $s6, %[dst_b], %[dst_stride_b]    \n" // second dst_b row
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
+
+      "sw              $s3, 0($s5)                       \n"
+      "sw              $s4, 0($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
+
+      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
+      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
+      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
+      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
+      "sw              $s3, 0(%[dst_a])                  \n"
+      "sw              $s4, 0(%[dst_b])                  \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B7|A7|B6|A6|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
+      "sw              $s3, 4($s5)                       \n"
+      "sw              $s4, 4($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
+
+      "addiu           %[src], 4                         \n" // next two a,b pairs
+      "addiu           $t1, -1                           \n"
+      "sll             $t0, %[dst_stride_a], 1           \n" // two dst_a rows written per pass
+      "sll             $t8, %[dst_stride_b], 1           \n" // two dst_b rows written per pass
+      "sw              $s3, 4(%[dst_a])                  \n"
+      "sw              $s4, 4(%[dst_b])                  \n"
+      "addu            %[dst_a], %[dst_a], $t0           \n"
+      "bnez            $t1, 1b                           \n"
+      " addu           %[dst_b], %[dst_b], $t8           \n"
+      "b               2f                                \n"
+      " nop                                              \n"
+
+// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
+   "11:                                                  \n"
+      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
+      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
+      "addu            $s5, %[dst_a], %[dst_stride_a]    \n" // second dst_a row
+      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
+      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
+      "addu            $s6, %[dst_b], %[dst_stride_b]    \n" // second dst_b row
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
+
+      "swr             $s3, 0($s5)                       \n"
+      "swl             $s3, 3($s5)                       \n"
+      "swr             $s4, 0($s6)                       \n"
+      "swl             $s4, 3($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
+
+      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
+      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
+      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
+      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
+      "swr             $s3, 0(%[dst_a])                  \n"
+      "swl             $s3, 3(%[dst_a])                  \n"
+      "swr             $s4, 0(%[dst_b])                  \n"
+      "swl             $s4, 3(%[dst_b])                  \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B7|A7|B6|A6|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
+
+      "swr             $s3, 4($s5)                       \n"
+      "swl             $s3, 7($s5)                       \n"
+      "swr             $s4, 4($s6)                       \n"
+      "swl             $s4, 7($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
+
+      "addiu           %[src], 4                         \n" // next two a,b pairs
+      "addiu           $t1, -1                           \n"
+      "sll             $t0, %[dst_stride_a], 1           \n" // two dst_a rows written per pass
+      "sll             $t8, %[dst_stride_b], 1           \n" // two dst_b rows written per pass
+      "swr             $s3, 4(%[dst_a])                  \n"
+      "swl             $s3, 7(%[dst_a])                  \n"
+      "swr             $s4, 4(%[dst_b])                  \n"
+      "swl             $s4, 7(%[dst_b])                  \n"
+      "addu            %[dst_a], %[dst_a], $t0           \n"
+      "bnez            $t1, 11b                          \n"
+      " addu           %[dst_b], %[dst_b], $t8           \n"
+
+      "2:                                                \n"
+      ".set pop                                          \n"
+      : [src] "+r" (src),
+        [dst_a] "+r" (dst_a),
+        [dst_b] "+r" (dst_b),
+        [width] "+r" (width),
+        [src_stride] "+r" (src_stride)
+      : [dst_stride_a] "r" (dst_stride_a),
+        [dst_stride_b] "r" (dst_stride_b)
+      : "t0", "t1",  "t2", "t3",  "t4", "t5",
+        "t6", "t7", "t8", "t9",
+        "s0", "s1", "s2", "s3",
+        "s4", "s5", "s6", "memory"  // the asm stores through dst_a / dst_b
+  );
+}
+
+#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/rotate_neon.cc b/libs/libyuv/source/rotate_neon.cc
new file mode 100644
index 0000000000..9e4ecd80d9
--- /dev/null
+++ b/libs/libyuv/source/rotate_neon.cc
@@ -0,0 +1,533 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
+
+static uvec8 kVTbl4x4Transpose =  // vtbl.8 indices: every 4th byte, i.e. a 4x4 byte transpose
+  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,  // 8 source rows
+                       uint8* dst, int dst_stride,  // one 8-byte row per src column
+                       int width) {  // number of source columns
+  const uint8* src_temp = NULL;  // scratch cursor: walks src rows, then dst rows
+  asm volatile (
+    // Columns are taken 8 at a time. The counter is biased by -8 so the
+    // loop can test its sign; with fewer than 8 columns the 8x8 loop is
+    // skipped entirely and only the residual code after 5: runs.
+    "subs        %5, #8                        \n"
+    "blt         5f                            \n"
+    // handle 8x8 blocks. this should be the majority of the plane
+    "1:                                        \n"
+      "mov         %0, %1                      \n"  // src_temp = src
+
+      MEMACCESS(0)
+      "vld1.8      {d0}, [%0], %2              \n"  // 8 bytes, then += src_stride
+      MEMACCESS(0)
+      "vld1.8      {d1}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d2}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d3}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d4}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d5}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d6}, [%0], %2              \n"
+      MEMACCESS(0)
+      "vld1.8      {d7}, [%0]                  \n"
+
+      "vtrn.8      d1, d0                      \n"
+      "vtrn.8      d3, d2                      \n"
+      "vtrn.8      d5, d4                      \n"
+      "vtrn.8      d7, d6                      \n"
+
+      "vtrn.16     d1, d3                      \n"
+      "vtrn.16     d0, d2                      \n"
+      "vtrn.16     d5, d7                      \n"
+      "vtrn.16     d4, d6                      \n"
+
+      "vtrn.32     d1, d5                      \n"
+      "vtrn.32     d0, d4                      \n"
+      "vtrn.32     d3, d7                      \n"
+      "vtrn.32     d2, d6                      \n"
+
+      "vrev16.8    q0, q0                      \n"
+      "vrev16.8    q1, q1                      \n"
+      "vrev16.8    q2, q2                      \n"
+      "vrev16.8    q3, q3                      \n"
+
+      "mov         %0, %3                      \n"  // src_temp = dst
+
+    MEMACCESS(0)
+      "vst1.8      {d1}, [%0], %4              \n"  // 8 bytes, then += dst_stride
+    MEMACCESS(0)
+      "vst1.8      {d0}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d3}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d2}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d5}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d4}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d7}, [%0], %4              \n"
+    MEMACCESS(0)
+      "vst1.8      {d6}, [%0]                  \n"
+
+      "add         %1, #8                      \n"  // src += 8
+      "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
+      "subs        %5,  #8                     \n"  // w   -= 8
+      "bge         1b                          \n"
+    "5:                                        \n"
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds        %5, #8                        \n"
+    "beq         4f                            \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp         %5, #2                        \n"
+    "blt         3f                            \n"
+
+    "cmp         %5, #4                        \n"
+    "blt         2f                            \n"
+
+    // 4x8 block
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.32     {d0[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d0[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d1[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d1[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d2[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d2[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d3[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.32     {d3[1]}, [%0]                 \n"
+
+    "mov         %0, %3                        \n"
+
+    MEMACCESS(6)
+    "vld1.8      {q3}, [%6]                    \n"  // q3 = kVTbl4x4Transpose
+
+    "vtbl.8      d4, {d0, d1}, d6              \n"
+    "vtbl.8      d5, {d0, d1}, d7              \n"
+    "vtbl.8      d0, {d2, d3}, d6              \n"
+    "vtbl.8      d1, {d2, d3}, d7              \n"
+
+    // TODO(frkoenig): Rework shuffle above to
+    // write out with 4 instead of 8 writes.
+    MEMACCESS(0)
+    "vst1.32     {d4[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d4[1]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d5[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d5[1]}, [%0]                 \n"
+
+    "add         %0, %3, #4                    \n"  // second half of each dst row
+    MEMACCESS(0)
+    "vst1.32     {d0[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d0[1]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d1[0]}, [%0], %4             \n"
+    MEMACCESS(0)
+    "vst1.32     {d1[1]}, [%0]                 \n"
+
+    "add         %1, #4                        \n"  // src += 4
+    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
+    "subs        %5,  #4                       \n"  // w   -= 4
+    "beq         4f                            \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp         %5, #2                        \n"
+    "blt         3f                            \n"
+
+    // 2x8 block
+    "2:                                        \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[0]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[1]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[2]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[2]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d0[3]}, [%0], %2             \n"
+    MEMACCESS(0)
+    "vld1.16     {d1[3]}, [%0]                 \n"
+
+    "vtrn.8      d0, d1                        \n"
+
+    "mov         %0, %3                        \n"
+
+    MEMACCESS(0)
+    "vst1.64     {d0}, [%0], %4                \n"
+    MEMACCESS(0)
+    "vst1.64     {d1}, [%0]                    \n"
+
+    "add         %1, #2                        \n"  // src += 2
+    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
+    "subs        %5,  #2                       \n"  // w   -= 2
+    "beq         4f                            \n"
+
+    // 1x8 block
+    "3:                                        \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[0]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[1]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[2]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[3]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[4]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[5]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[6]}, [%1], %2             \n"
+    MEMACCESS(1)
+    "vld1.8      {d0[7]}, [%1]                 \n"
+
+    MEMACCESS(3)
+    "vst1.64     {d0}, [%3]                    \n"
+
+    "4:                                        \n"
+
+    : "+r"(src_temp),          // %0
+      "+r"(src),               // %1
+      "+r"(src_stride),        // %2
+      "+r"(dst),               // %3
+      "+r"(dst_stride),        // %4
+      "+r"(width)              // %5
+    : "r"(&kVTbl4x4Transpose)  // %6
+    : "memory", "cc", "q0", "q1", "q2", "q3"
+  );
+}
+
+static uvec8 kVTbl4x4TransposeDi =  // vtbl.8 indices: interleave bytes 0-7 with bytes 8-15
+  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,  // 8 rows of a,b byte pairs
+                         uint8* dst_a, int dst_stride_a,  // receives the a bytes
+                         uint8* dst_b, int dst_stride_b,  // receives the b bytes
+                         int width) {  // number of a,b pairs per source row
+  const uint8* src_temp = NULL;  // scratch cursor: walks src rows, then dst rows
+  asm volatile (
+    // Pairs are taken 8 at a time. The counter is biased by -8 so the
+    // loop can test its sign; with fewer than 8 pairs the 8x8 loop is
+    // skipped entirely and only the residual code after 5: runs.
+    "subs        %7, #8                        \n"
+    "blt         5f                            \n"
+    // handle 8x8 blocks. this should be the majority of the plane
+    "1:                                        \n"
+      "mov         %0, %1                      \n"  // src_temp = src
+
+      MEMACCESS(0)
+      "vld2.8      {d0,  d1},  [%0], %2        \n"  // de-interleave: a -> d0, b -> d1
+      MEMACCESS(0)
+      "vld2.8      {d2,  d3},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d4,  d5},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d6,  d7},  [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d16, d17}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d18, d19}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d20, d21}, [%0], %2        \n"
+      MEMACCESS(0)
+      "vld2.8      {d22, d23}, [%0]            \n"
+
+      "vtrn.8      q1, q0                      \n"
+      "vtrn.8      q3, q2                      \n"
+      "vtrn.8      q9, q8                      \n"
+      "vtrn.8      q11, q10                    \n"
+
+      "vtrn.16     q1, q3                      \n"
+      "vtrn.16     q0, q2                      \n"
+      "vtrn.16     q9, q11                     \n"
+      "vtrn.16     q8, q10                     \n"
+
+      "vtrn.32     q1, q9                      \n"
+      "vtrn.32     q0, q8                      \n"
+      "vtrn.32     q3, q11                     \n"
+      "vtrn.32     q2, q10                     \n"
+
+      "vrev16.8    q0, q0                      \n"
+      "vrev16.8    q1, q1                      \n"
+      "vrev16.8    q2, q2                      \n"
+      "vrev16.8    q3, q3                      \n"
+      "vrev16.8    q8, q8                      \n"
+      "vrev16.8    q9, q9                      \n"
+      "vrev16.8    q10, q10                    \n"
+      "vrev16.8    q11, q11                    \n"
+
+      "mov         %0, %3                      \n"  // src_temp = dst_a
+
+    MEMACCESS(0)
+      "vst1.8      {d2},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d0},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d6},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d4},  [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d18}, [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d16}, [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d22}, [%0], %4             \n"
+    MEMACCESS(0)
+      "vst1.8      {d20}, [%0]                 \n"
+
+      "mov         %0, %5                      \n"  // src_temp = dst_b
+
+    MEMACCESS(0)
+      "vst1.8      {d3},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d1},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d7},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d5},  [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d19}, [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d17}, [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d23}, [%0], %6             \n"
+    MEMACCESS(0)
+      "vst1.8      {d21}, [%0]                 \n"
+
+      "add         %1, #8*2                    \n"  // src   += 8*2
+      "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
+      "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
+      "subs        %7,  #8                     \n"  // w     -= 8
+      "bge         1b                          \n"
+    "5:                                        \n"
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds        %7, #8                        \n"
+    "beq         4f                            \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp         %7, #2                        \n"
+    "blt         3f                            \n"
+
+    "cmp         %7, #4                        \n"
+    "blt         2f                            \n"
+
+    // TODO(frkoenig): Clean this up
+    // 4x8 block
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld1.64     {d0}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d1}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d2}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d3}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d4}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d5}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d6}, [%0], %2                \n"
+    MEMACCESS(0)
+    "vld1.64     {d7}, [%0]                    \n"
+
+    MEMACCESS(8)
+    "vld1.8      {q15}, [%8]                   \n"  // q15 = kVTbl4x4TransposeDi
+
+    "vtrn.8      q0, q1                        \n"
+    "vtrn.8      q2, q3                        \n"
+
+    "vtbl.8      d16, {d0, d1}, d30            \n"
+    "vtbl.8      d17, {d0, d1}, d31            \n"
+    "vtbl.8      d18, {d2, d3}, d30            \n"
+    "vtbl.8      d19, {d2, d3}, d31            \n"
+    "vtbl.8      d20, {d4, d5}, d30            \n"
+    "vtbl.8      d21, {d4, d5}, d31            \n"
+    "vtbl.8      d22, {d6, d7}, d30            \n"
+    "vtbl.8      d23, {d6, d7}, d31            \n"
+
+    "mov         %0, %3                        \n"
+
+    MEMACCESS(0)
+    "vst1.32     {d16[0]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d16[1]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d17[0]},  [%0], %4           \n"
+    MEMACCESS(0)
+    "vst1.32     {d17[1]},  [%0], %4           \n"
+
+    "add         %0, %3, #4                    \n"  // second half of each dst_a row
+    MEMACCESS(0)
+    "vst1.32     {d20[0]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d20[1]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d21[0]}, [%0], %4            \n"
+    MEMACCESS(0)
+    "vst1.32     {d21[1]}, [%0]                \n"
+
+    "mov         %0, %5                        \n"
+
+    MEMACCESS(0)
+    "vst1.32     {d18[0]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d18[1]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d19[0]}, [%0], %6            \n"
+    MEMACCESS(0)
+    "vst1.32     {d19[1]}, [%0], %6            \n"
+
+    "add         %0, %5, #4                    \n"  // second half of each dst_b row
+    MEMACCESS(0)
+    "vst1.32     {d22[0]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d22[1]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d23[0]},  [%0], %6           \n"
+    MEMACCESS(0)
+    "vst1.32     {d23[1]},  [%0]               \n"
+
+    "add         %1, #4*2                      \n"  // src   += 4 * 2
+    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
+    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
+    "subs        %7,  #4                       \n"  // w     -= 4
+    "beq         4f                            \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp         %7, #2                        \n"
+    "blt         3f                            \n"
+
+    // 2x8 block
+    "2:                                        \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
+    MEMACCESS(0)
+    "vld2.16     {d1[3], d3[3]}, [%0]          \n"
+
+    "vtrn.8      d0, d1                        \n"
+    "vtrn.8      d2, d3                        \n"
+
+    "mov         %0, %3                        \n"
+
+    MEMACCESS(0)
+    "vst1.64     {d0}, [%0], %4                \n"
+    MEMACCESS(0)
+    "vst1.64     {d2}, [%0]                    \n"
+
+    "mov         %0, %5                        \n"
+
+    MEMACCESS(0)
+    "vst1.64     {d1}, [%0], %6                \n"
+    MEMACCESS(0)
+    "vst1.64     {d3}, [%0]                    \n"
+
+    "add         %1, #2*2                      \n"  // src   += 2 * 2
+    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
+    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
+    "subs        %7,  #2                       \n"  // w     -= 2
+    "beq         4f                            \n"
+
+    // 1x8 block
+    "3:                                        \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
+    MEMACCESS(1)
+    "vld2.8      {d0[7], d1[7]}, [%1]          \n"
+
+    MEMACCESS(3)
+    "vst1.64     {d0}, [%3]                    \n"
+    MEMACCESS(5)
+    "vst1.64     {d1}, [%5]                    \n"
+
+    "4:                                        \n"
+
+    : "+r"(src_temp),            // %0
+      "+r"(src),                 // %1
+      "+r"(src_stride),          // %2
+      "+r"(dst_a),               // %3
+      "+r"(dst_stride_a),        // %4
+      "+r"(dst_b),               // %5
+      "+r"(dst_stride_b),        // %6
+      "+r"(width)                // %7
+    : "r"(&kVTbl4x4TransposeDi)  // %8
+    : "memory", "cc",
+      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q15"  // q15: 4x8 table
+  );
+}
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/rotate_neon64.cc b/libs/libyuv/source/rotate_neon64.cc
new file mode 100644
index 0000000000..f52c082b3f
--- /dev/null
+++ b/libs/libyuv/source/rotate_neon64.cc
@@ -0,0 +1,543 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+static uvec8 kVTbl4x4Transpose =
+  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+// Transposes 8 source rows of `width` bytes into `width` dst rows of 8 bytes.
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride, int width) {
+  const uint8* src_temp = NULL;
+  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  asm volatile (
+    // Columns are consumed 8 at a time. The counter is biased by -8 so the
+    // loop can test its sign after each pass. NOTE(review): the first pass
+    // runs before any test, so width >= 8 appears to be assumed - confirm.
+    "sub         %3, %3, #8                      \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    "1:                                          \n"
+      "mov         %0, %1                        \n"
+      // Load 8 rows of 8 bytes, stepping by src_stride (%5).
+      MEMACCESS(0)
+      "ld1        {v0.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v1.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v2.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v3.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v4.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v5.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v6.8b}, [%0], %5              \n"
+      MEMACCESS(0)
+      "ld1        {v7.8b}, [%0]                  \n"
+      // Three trn1/trn2 rounds (bytes, halfwords, words) transpose the tile.
+      "trn2     v16.8b, v0.8b, v1.8b             \n"
+      "trn1     v17.8b, v0.8b, v1.8b             \n"
+      "trn2     v18.8b, v2.8b, v3.8b             \n"
+      "trn1     v19.8b, v2.8b, v3.8b             \n"
+      "trn2     v20.8b, v4.8b, v5.8b             \n"
+      "trn1     v21.8b, v4.8b, v5.8b             \n"
+      "trn2     v22.8b, v6.8b, v7.8b             \n"
+      "trn1     v23.8b, v6.8b, v7.8b             \n"
+
+      "trn2     v3.4h, v17.4h, v19.4h            \n"
+      "trn1     v1.4h, v17.4h, v19.4h            \n"
+      "trn2     v2.4h, v16.4h, v18.4h            \n"
+      "trn1     v0.4h, v16.4h, v18.4h            \n"
+      "trn2     v7.4h, v21.4h, v23.4h            \n"
+      "trn1     v5.4h, v21.4h, v23.4h            \n"
+      "trn2     v6.4h, v20.4h, v22.4h            \n"
+      "trn1     v4.4h, v20.4h, v22.4h            \n"
+
+      "trn2     v21.2s, v1.2s, v5.2s             \n"
+      "trn1     v17.2s, v1.2s, v5.2s             \n"
+      "trn2     v20.2s, v0.2s, v4.2s             \n"
+      "trn1     v16.2s, v0.2s, v4.2s             \n"
+      "trn2     v23.2s, v3.2s, v7.2s             \n"
+      "trn1     v19.2s, v3.2s, v7.2s             \n"
+      "trn2     v22.2s, v2.2s, v6.2s             \n"
+      "trn1     v18.2s, v2.2s, v6.2s             \n"
+
+      "mov         %0, %2                        \n"
+      // Store 8 rows of 8 bytes, stepping by dst_stride (%6).
+    MEMACCESS(0)
+      "st1      {v17.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v16.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v19.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v18.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v21.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v20.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v23.8b}, [%0], %6               \n"
+    MEMACCESS(0)
+      "st1      {v22.8b}, [%0]                   \n"
+
+      "add         %1, %1, #8                    \n"  // src += 8
+      "add         %2, %2, %6, lsl #3            \n"  // dst += 8 * dst_stride
+      "subs        %3, %3, #8                    \n"  // w   -= 8
+      "b.ge        1b                            \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds        %3, %3, #8                      \n"
+    "b.eq        4f                              \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp         %3, #2                          \n"
+    "b.lt        3f                              \n"
+
+    "cmp         %3, #4                          \n"
+    "b.lt        2f                              \n"
+
+    // 4x8 block: v0 takes 4 bytes from rows 0-3, v1 from rows 4-7.
+    "mov         %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.s}[3], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.s}[3], [%0]                     \n"
+
+    "mov         %0, %2                          \n"
+
+    MEMACCESS(4)
+    "ld1      {v2.16b}, [%4]                     \n"
+    // The table (%4) makes each tbl a 4x4 byte transpose of one register.
+    "tbl      v3.16b, {v0.16b}, v2.16b           \n"
+    "tbl      v0.16b, {v1.16b}, v2.16b           \n"
+
+    // TODO(frkoenig): Rework shuffle above to
+    // write out with 4 instead of 8 writes.
+    MEMACCESS(0)
+    "st1 {v3.s}[0], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[1], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[2], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[3], [%0]                         \n"
+    // v0 (source rows 4-7) fills bytes 4-7 of the same four dst rows.
+    "add         %0, %2, #4                      \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[0], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[1], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[2], [%0], %6                     \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[3], [%0]                         \n"
+
+    "add         %1, %1, #4                      \n"  // src += 4
+    "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
+    "subs        %3, %3, #4                      \n"  // w   -= 4
+    "b.eq        4f                              \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp         %3, #2                          \n"
+    "b.lt        3f                              \n"
+
+    // 2x8 block: v0 takes 2 bytes from even rows, v1 from odd rows.
+    "2:                                          \n"
+    "mov         %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[0], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[1], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[2], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[3], [%0], %5                 \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[3], [%0]                     \n"
+
+    "trn2    v2.8b, v0.8b, v1.8b                 \n"
+    "trn1    v3.8b, v0.8b, v1.8b                 \n"
+
+    "mov         %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1     {v3.8b}, [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1     {v2.8b}, [%0]                       \n"
+
+    "add         %1, %1, #2                      \n"  // src += 2
+    "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
+    "subs        %3, %3,  #2                     \n"  // w   -= 2
+    "b.eq        4f                              \n"
+
+    // 1x8 block: gather one byte per row straight through src (%1).
+    "3:                                          \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[0], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[1], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[2], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[3], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[4], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[5], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[6], [%1], %5             \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[7], [%1]                 \n"
+
+    MEMACCESS(2)
+    "st1         {v0.8b}, [%2]                   \n"
+
+    "4:                                          \n"
+
+    : "+r"(src_temp),                             // %0
+      "+r"(src),                                  // %1
+      "+r"(dst),                                  // %2
+      "+r"(width64)                               // %3
+    : "r"(&kVTbl4x4Transpose),                    // %4
+      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
+    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+  );
+}
+
+static uint8 kVTbl4x4TransposeDi[32] =
+  { 0,  16, 32, 48,  2, 18, 34, 50,  4, 20, 36, 52,  6, 22, 38, 54,
+    1,  17, 33, 49,  3, 19, 35, 51,  5, 21, 37, 53,  7, 23, 39, 55};
+// Transposes 8 rows of byte pairs: first bytes go to dst_a, second to dst_b.
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width) {
+  const uint8* src_temp = NULL;
+  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  asm volatile (
+    // Pairs are consumed 8 at a time. The counter is biased by -8 so the
+    // loop can test its sign after each pass. NOTE(review): the first pass
+    // runs before any test, so width >= 8 appears to be assumed - confirm.
+    "sub       %4, %4, #8                      \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    "1:                                        \n"
+    "mov       %0, %1                          \n"
+    // Load 8 rows of 16 bytes (8 pairs), stepping by src_stride (%5).
+    MEMACCESS(0)
+    "ld1       {v0.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v1.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v2.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v3.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v4.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v5.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v6.16b}, [%0], %5              \n"
+    MEMACCESS(0)
+    "ld1       {v7.16b}, [%0]                  \n"
+    // Three trn1/trn2 rounds (bytes, halfwords, words) transpose the tile.
+    "trn1      v16.16b, v0.16b, v1.16b         \n"
+    "trn2      v17.16b, v0.16b, v1.16b         \n"
+    "trn1      v18.16b, v2.16b, v3.16b         \n"
+    "trn2      v19.16b, v2.16b, v3.16b         \n"
+    "trn1      v20.16b, v4.16b, v5.16b         \n"
+    "trn2      v21.16b, v4.16b, v5.16b         \n"
+    "trn1      v22.16b, v6.16b, v7.16b         \n"
+    "trn2      v23.16b, v6.16b, v7.16b         \n"
+
+    "trn1      v0.8h, v16.8h, v18.8h           \n"
+    "trn2      v1.8h, v16.8h, v18.8h           \n"
+    "trn1      v2.8h, v20.8h, v22.8h           \n"
+    "trn2      v3.8h, v20.8h, v22.8h           \n"
+    "trn1      v4.8h, v17.8h, v19.8h           \n"
+    "trn2      v5.8h, v17.8h, v19.8h           \n"
+    "trn1      v6.8h, v21.8h, v23.8h           \n"
+    "trn2      v7.8h, v21.8h, v23.8h           \n"
+
+    "trn1      v16.4s, v0.4s, v2.4s            \n"
+    "trn2      v17.4s, v0.4s, v2.4s            \n"
+    "trn1      v18.4s, v1.4s, v3.4s            \n"
+    "trn2      v19.4s, v1.4s, v3.4s            \n"
+    "trn1      v20.4s, v4.4s, v6.4s            \n"
+    "trn2      v21.4s, v4.4s, v6.4s            \n"
+    "trn1      v22.4s, v5.4s, v7.4s            \n"
+    "trn2      v23.4s, v5.4s, v7.4s            \n"
+
+    "mov       %0, %2                          \n"
+    // v16-v19 carry the dst_a rows, stepping by dst_stride_a (%6).
+    MEMACCESS(0)
+    "st1       {v16.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v17.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v19.d}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v16.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v17.d}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v19.d}[1], [%0]                \n"
+
+    "mov       %0, %3                          \n"
+    // v20-v23 carry the dst_b rows, stepping by dst_stride_b (%7).
+    MEMACCESS(0)
+    "st1       {v20.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v22.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v21.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v23.d}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v20.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v22.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v21.d}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v23.d}[1], [%0]                \n"
+
+    "add       %1, %1, #16                     \n"  // src   += 8*2
+    "add       %2, %2, %6, lsl #3              \n"  // dst_a += 8 * dst_stride_a
+    "add       %3, %3, %7, lsl #3              \n"  // dst_b += 8 * dst_stride_b
+    "subs      %4, %4,  #8                     \n"  // w     -= 8
+    "b.ge      1b                              \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
+    "adds      %4, %4, #8                      \n"
+    "b.eq      4f                              \n"
+
+    // some residual, so between 1 and 7 lines left to transpose
+    "cmp       %4, #2                          \n"
+    "b.lt      3f                              \n"
+
+    "cmp       %4, #4                          \n"
+    "b.lt      2f                              \n"
+
+    // TODO(frkoenig): Clean this up
+    // 4x8 block: 8 rows of 4 pairs (8 bytes) each, one row per register.
+    "mov       %0, %1                          \n"
+    MEMACCESS(0)
+    "ld1       {v0.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v1.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v2.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v3.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v4.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v5.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v6.8b}, [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1       {v7.8b}, [%0]                   \n"
+
+    // One two-register load, no writeback: %8 is an input-only operand.
+    MEMACCESS(8)
+    "ld1       {v30.16b, v31.16b}, [%8]        \n"
+    // v30 picks the first byte of each pair, v31 the second (4x4 transposed).
+    "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
+    "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
+    "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
+    "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v16.s}[0],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[1],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[2],  [%0], %6           \n"
+    MEMACCESS(0)
+    "st1       {v16.s}[3],  [%0], %6           \n"
+
+    "add       %0, %2, #4                      \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[0], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[1], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[2], [%0], %6            \n"
+    MEMACCESS(0)
+    "st1       {v18.s}[3], [%0]                \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v17.s}[0], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[1], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[2], [%0], %7            \n"
+    MEMACCESS(0)
+    "st1       {v17.s}[3], [%0], %7            \n"
+
+    "add       %0, %3, #4                      \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[0],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[1],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[2],  [%0], %7           \n"
+    MEMACCESS(0)
+    "st1       {v19.s}[3],  [%0]               \n"
+
+    "add       %1, %1, #8                      \n"  // src   += 4 * 2
+    "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 * dst_stride_a
+    "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 * dst_stride_b
+    "subs      %4,  %4,  #4                    \n"  // w     -= 4
+    "b.eq      4f                              \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp       %4, #2                          \n"
+    "b.lt      3f                              \n"
+
+    // 2x8 block: ld2 splits each row's 2 pairs into first/second bytes.
+    "2:                                        \n"
+    "mov       %0, %1                          \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
+    MEMACCESS(0)
+    "ld2       {v2.h, v3.h}[3], [%0]           \n"
+
+    "trn1      v4.8b, v0.8b, v2.8b             \n"
+    "trn2      v5.8b, v0.8b, v2.8b             \n"
+    "trn1      v6.8b, v1.8b, v3.8b             \n"
+    "trn2      v7.8b, v1.8b, v3.8b             \n"
+
+    "mov       %0, %2                          \n"
+
+    MEMACCESS(0)
+    "st1       {v4.d}[0], [%0], %6             \n"
+    MEMACCESS(0)
+    "st1       {v6.d}[0], [%0]                 \n"
+
+    "mov       %0, %3                          \n"
+
+    MEMACCESS(0)
+    "st1       {v5.d}[0], [%0], %7             \n"
+    MEMACCESS(0)
+    "st1       {v7.d}[0], [%0]                 \n"
+
+    "add       %1, %1, #4                      \n"  // src   += 2 * 2
+    "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 * dst_stride_a
+    "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 * dst_stride_b
+    "subs      %4,  %4,  #2                    \n"  // w     -= 2
+    "b.eq      4f                              \n"
+
+    // 1x8 block: ld2 sends each row's first byte to v0 and second to v1.
+    "3:                                        \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
+    MEMACCESS(1)
+    "ld2       {v0.b, v1.b}[7], [%1]           \n"
+
+    MEMACCESS(2)
+    "st1       {v0.d}[0], [%2]                 \n"
+    MEMACCESS(3)
+    "st1       {v1.d}[0], [%3]                 \n"
+
+    "4:                                        \n"
+
+    : "+r"(src_temp),                             // %0
+      "+r"(src),                                  // %1
+      "+r"(dst_a),                                // %2
+      "+r"(dst_b),                                // %3
+      "+r"(width64)                               // %4
+    : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
+      "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
+      "r"(&kVTbl4x4TransposeDi)                   // %8
+    : "memory", "cc",
+      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+      "v30", "v31"
+  );
+}
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/rotate_win.cc b/libs/libyuv/source/rotate_win.cc
new file mode 100644
index 0000000000..1300fc0feb
--- /dev/null
+++ b/libs/libyuv/source/rotate_win.cc
@@ -0,0 +1,247 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+// Transposes 8 source rows of `width` bytes into `width` dst rows of 8 bytes.
+__declspec(naked)
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
+  __asm {
+    push      edi
+    push      esi
+    push      ebp
+    mov       eax, [esp + 12 + 4]   // src (12 = pushed regs, 4 = return addr)
+    mov       edi, [esp + 12 + 8]   // src_stride
+    mov       edx, [esp + 12 + 12]  // dst
+    mov       esi, [esp + 12 + 16]  // dst_stride
+    mov       ecx, [esp + 12 + 20]  // width
+    // Each pass transposes a full 8x8 tile, even if fewer columns remain.
+    // Read in the data from the source pointer.
+    // First round: interleave bytes of adjacent rows.
+    align      4
+ convertloop:
+    movq      xmm0, qword ptr [eax]
+    lea       ebp, [eax + 8]
+    movq      xmm1, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm0, xmm1
+    movq      xmm2, qword ptr [eax]
+    movdqa    xmm1, xmm0
+    palignr   xmm1, xmm1, 8
+    movq      xmm3, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm2, xmm3
+    movdqa    xmm3, xmm2
+    movq      xmm4, qword ptr [eax]
+    palignr   xmm3, xmm3, 8
+    movq      xmm5, qword ptr [eax + edi]
+    punpcklbw xmm4, xmm5
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm5, xmm4
+    movq      xmm6, qword ptr [eax]
+    palignr   xmm5, xmm5, 8
+    movq      xmm7, qword ptr [eax + edi]
+    punpcklbw xmm6, xmm7
+    mov       eax, ebp
+    movdqa    xmm7, xmm6
+    palignr   xmm7, xmm7, 8
+    // Second round: interleave 16-bit words.
+    punpcklwd xmm0, xmm2
+    punpcklwd xmm1, xmm3
+    movdqa    xmm2, xmm0
+    movdqa    xmm3, xmm1
+    palignr   xmm2, xmm2, 8
+    palignr   xmm3, xmm3, 8
+    punpcklwd xmm4, xmm6
+    punpcklwd xmm5, xmm7
+    movdqa    xmm6, xmm4
+    movdqa    xmm7, xmm5
+    palignr   xmm6, xmm6, 8
+    palignr   xmm7, xmm7, 8
+    // Third round: interleave 32-bit dwords.
+    // Write to the destination pointer.
+    punpckldq xmm0, xmm4
+    movq      qword ptr [edx], xmm0
+    movdqa    xmm4, xmm0
+    palignr   xmm4, xmm4, 8
+    movq      qword ptr [edx + esi], xmm4
+    lea       edx, [edx + 2 * esi]
+    punpckldq xmm2, xmm6
+    movdqa    xmm6, xmm2
+    palignr   xmm6, xmm6, 8
+    movq      qword ptr [edx], xmm2
+    punpckldq xmm1, xmm5
+    movq      qword ptr [edx + esi], xmm6
+    lea       edx, [edx + 2 * esi]
+    movdqa    xmm5, xmm1
+    movq      qword ptr [edx], xmm1
+    palignr   xmm5, xmm5, 8
+    punpckldq xmm3, xmm7
+    movq      qword ptr [edx + esi], xmm5
+    lea       edx, [edx + 2 * esi]
+    movq      qword ptr [edx], xmm3
+    movdqa    xmm7, xmm3
+    palignr   xmm7, xmm7, 8
+    sub       ecx, 8  // flags survive the movq/lea below until jg
+    movq      qword ptr [edx + esi], xmm7
+    lea       edx, [edx + 2 * esi]
+    jg        convertloop
+    // Restore the registers pushed on entry.
+    pop       ebp
+    pop       esi
+    pop       edi
+    ret
+  }
+}
+// Transposes 8 rows of byte pairs: first bytes go to dst_a, second to dst_b.
+__declspec(naked)
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int w) {
+  __asm {
+    push      ebx
+    push      esi
+    push      edi
+    push      ebp
+    mov       eax, [esp + 16 + 4]   // src (16 = pushed regs, 4 = return addr)
+    mov       edi, [esp + 16 + 8]   // src_stride
+    mov       edx, [esp + 16 + 12]  // dst_a
+    mov       esi, [esp + 16 + 16]  // dst_stride_a
+    mov       ebx, [esp + 16 + 20]  // dst_b
+    mov       ebp, [esp + 16 + 24]  // dst_stride_b
+    mov       ecx, esp
+    sub       esp, 4 + 16
+    and       esp, ~15
+    mov       [esp + 16], ecx
+    mov       ecx, [ecx + 16 + 28]  // w
+    // [esp]: 16-byte aligned spill slot; [esp + 16]: saved original esp.
+    align      4
+ convertloop:
+    // Read in the data from the source pointer.
+    // First round: interleave bytes of adjacent rows (low and high halves).
+    movdqu    xmm0, [eax]
+    movdqu    xmm1, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm0  // use xmm7 as temp register.
+    punpcklbw xmm0, xmm1
+    punpckhbw xmm7, xmm1
+    movdqa    xmm1, xmm7
+    movdqu    xmm2, [eax]
+    movdqu    xmm3, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm2
+    punpcklbw xmm2, xmm3
+    punpckhbw xmm7, xmm3
+    movdqa    xmm3, xmm7
+    movdqu    xmm4, [eax]
+    movdqu    xmm5, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm7, xmm4
+    punpcklbw xmm4, xmm5
+    punpckhbw xmm7, xmm5
+    movdqa    xmm5, xmm7
+    movdqu    xmm6, [eax]
+    movdqu    xmm7, [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    movdqu    [esp], xmm5  // backup xmm5
+    neg       edi
+    movdqa    xmm5, xmm6   // use xmm5 as temp register.
+    punpcklbw xmm6, xmm7
+    punpckhbw xmm5, xmm7
+    movdqa    xmm7, xmm5
+    lea       eax, [eax + 8 * edi + 16]  // back up 8 rows, advance 16 bytes
+    neg       edi
+    // Second round: interleave 16-bit words.
+    movdqa    xmm5, xmm0
+    punpcklwd xmm0, xmm2
+    punpckhwd xmm5, xmm2
+    movdqa    xmm2, xmm5
+    movdqa    xmm5, xmm1
+    punpcklwd xmm1, xmm3
+    punpckhwd xmm5, xmm3
+    movdqa    xmm3, xmm5
+    movdqa    xmm5, xmm4
+    punpcklwd xmm4, xmm6
+    punpckhwd xmm5, xmm6
+    movdqa    xmm6, xmm5
+    movdqu    xmm5, [esp]  // restore xmm5
+    movdqu    [esp], xmm6  // backup xmm6
+    movdqa    xmm6, xmm5    // use xmm6 as temp register.
+    punpcklwd xmm5, xmm7
+    punpckhwd xmm6, xmm7
+    movdqa    xmm7, xmm6
+    // Third round: interleave 32-bit dwords.
+    // Write low 8 bytes to dst_a (edx) and high 8 bytes to dst_b (ebx).
+    movdqa    xmm6, xmm0
+    punpckldq xmm0, xmm4
+    punpckhdq xmm6, xmm4
+    movdqa    xmm4, xmm6
+    movdqu    xmm6, [esp]  // restore xmm6
+    movlpd    qword ptr [edx], xmm0
+    movhpd    qword ptr [ebx], xmm0
+    movlpd    qword ptr [edx + esi], xmm4
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm4
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
+    punpckldq xmm2, xmm6
+    movlpd    qword ptr [edx], xmm2
+    movhpd    qword ptr [ebx], xmm2
+    punpckhdq xmm0, xmm6
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
+    punpckldq xmm1, xmm5
+    movlpd    qword ptr [edx], xmm1
+    movhpd    qword ptr [ebx], xmm1
+    punpckhdq xmm0, xmm5
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    punpckldq xmm3, xmm7
+    movlpd    qword ptr [edx], xmm3
+    movhpd    qword ptr [ebx], xmm3
+    punpckhdq xmm0, xmm7
+    sub       ecx, 8  // flags survive the stores/lea below until jg
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    jg        convertloop
+    // Reload the original esp saved at [esp + 16], then pop the saved regs.
+    mov       esp, [esp + 16]
+    pop       ebp
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+  }
+}
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/row_any.cc b/libs/libyuv/source/row_any.cc
new file mode 100644
index 0000000000..29b7a343d5
--- /dev/null
+++ b/libs/libyuv/source/row_any.cc
@@ -0,0 +1,818 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h>  // For memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Subsampled size: width divided by (1 << shift), rounded up if not exact.
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
+
+// Any 4 planes (y, u, v, a) to 1 with yuvconstants.
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
+    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
+                 const uint8* a_buf, uint8* dst_ptr,                           \
+                 const struct YuvConstants* yuvconstants,  int width) {        \
+      SIMD_ALIGNED(uint8 temp[64 * 5]);  /* 4 in + 1 out, 64 each */           \
+      memset(temp, 0, 64 * 4);  /* for msan */                                 \
+      int r = width & MASK;  /* leftover pixels */                             \
+      int n = width & ~MASK;  /* SIMD-sized part */                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n);        \
+      }                                                                        \
+      memcpy(temp, y_buf + n, r);  /* stage leftovers in temp */               \
+      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
+      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
+      memcpy(temp + 192, a_buf + n, r);                                        \
+      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256,            \
+               yuvconstants, MASK + 1);  /* one full SIMD group */             \
+      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256,                      \
+             SS(r, DUVSHIFT) * BPP);  /* copy out only valid bytes */          \
+    }
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
+ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_NEON
+ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
+#endif
+#undef ANY41C
+
+// Any 3 planes to 1.  BPP is bytes per (1 << DUVSHIFT) output pixels.
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                 \
+    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
+                 uint8* dst_ptr, int width) {                                  \
+      SIMD_ALIGNED(uint8 temp[64 * 4]);  /* 3 in + 1 out, 64 each */           \
+      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
+      int r = width & MASK;  /* leftover pixels */                             \
+      int n = width & ~MASK;  /* SIMD-sized part */                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                             \
+      }                                                                        \
+      memcpy(temp, y_buf + n, r);  /* stage leftovers in temp */               \
+      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
+      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
+      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);             \
+      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
+             SS(r, DUVSHIFT) * BPP);  /* copy out only valid bytes */          \
+    }
+#ifdef HAS_I422TOYUY2ROW_SSE2  // NOTE(review): also gates the UYVY wrapper.
+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_NEON
+ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOUYVYROW_NEON
+ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_BLENDPLANEROW_AVX2
+ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
+#endif
+#ifdef HAS_BLENDPLANEROW_SSSE3
+ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
+#endif
+#undef ANY31
+
+// Note that odd width replication includes 444 due to implementation
+// on arm that subsamples 444 to 422 internally.
+// Any 3 planes to 1 with yuvconstants
+#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
+    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
+                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \
+                 int width) {                                                  \
+      SIMD_ALIGNED(uint8 temp[64 * 4]);  /* 3 in + 1 out, 64 each */           \
+      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
+      int r = width & MASK;  /* leftover pixels */                             \
+      int n = width & ~MASK;  /* SIMD-sized part */                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);               \
+      }                                                                        \
+      memcpy(temp, y_buf + n, r);  /* stage leftovers in temp */               \
+      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
+      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
+      if (width & 1) {  /* odd width: repeat last u and v */                   \
+        temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1];             \
+        temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1];           \
+      }                                                                        \
+      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192,                        \
+               yuvconstants, MASK + 1);  /* one full SIMD group */             \
+      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
+             SS(r, DUVSHIFT) * BPP);  /* copy out only valid bytes */          \
+    }
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I411TOARGBROW_SSSE3
+ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
+#endif
+#ifdef HAS_I444TOARGBROW_SSSE3  // NOTE(review): also gates I422 wrappers.
+ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+#endif  // HAS_I444TOARGBROW_SSSE3
+#ifdef HAS_I422TORGB24ROW_AVX2
+ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
+#endif
+#ifdef HAS_I422TOARGBROW_AVX2
+ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TORGBAROW_AVX2
+ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_AVX2
+ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_I411TOARGBROW_AVX2
+ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_AVX2
+ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_AVX2
+ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TORGB565ROW_AVX2
+ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_NEON  // NOTE(review): gates all NEON wrappers below.
+ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
+ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
+ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
+#endif
+#undef ANY31C
+
+// Any 2 planes to 1.  SBPP/SBPP2 are source bytes per unit, BPP is dest.
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)              \
+    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
+                 uint8* dst_ptr, int width) {                                  \
+      SIMD_ALIGNED(uint8 temp[64 * 3]);  /* 2 in + 1 out, 64 each */           \
+      memset(temp, 0, 64 * 2);  /* for msan */                                 \
+      int r = width & MASK;  /* leftover pixels */                             \
+      int n = width & ~MASK;  /* SIMD-sized part */                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, uv_buf, dst_ptr, n);                                   \
+      }                                                                        \
+      memcpy(temp, y_buf + n * SBPP, r * SBPP);  /* stage leftovers in temp */ \
+      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
+             SS(r, UVSHIFT) * SBPP2);                                          \
+      ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1);                         \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);  /* valid bytes only */  \
+    }
+
+// Merge functions.
+#ifdef HAS_MERGEUVROW_SSE2
+ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_AVX2
+ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
+#endif
+#ifdef HAS_MERGEUVROW_NEON
+ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
+#endif
+
+// Math functions.
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_SSE2
+ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_AVX2
+ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_NEON
+ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_SOBELROW_SSE2
+ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELROW_NEON
+ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_NEON
+ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELXYROW_SSE2
+ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELXYROW_NEON
+ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#undef ANY21
+
+// Any 2 planes to 1 with yuvconstants
+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)             \
+    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
+                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \
+                 int width) {                                                  \
+      SIMD_ALIGNED(uint8 temp[64 * 3]);  /* 2 in + 1 out, 64 each */           \
+      memset(temp, 0, 64 * 2);  /* for msan */                                 \
+      int r = width & MASK;  /* leftover pixels */                             \
+      int n = width & ~MASK;  /* SIMD-sized part */                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n);                     \
+      }                                                                        \
+      memcpy(temp, y_buf + n * SBPP, r * SBPP);  /* stage leftovers in temp */ \
+      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
+             SS(r, UVSHIFT) * SBPP2);                                          \
+      ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1);           \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);  /* valid bytes only */  \
+    }
+
+// Biplanar to RGB.
+#ifdef HAS_NV12TOARGBROW_SSSE3
+ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_AVX2
+ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV12TOARGBROW_NEON
+ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_SSSE3
+ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_AVX2
+ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV21TOARGBROW_NEON
+ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_AVX2
+ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+#endif
+#ifdef HAS_NV12TORGB565ROW_NEON
+ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+#endif
+#undef ANY21C
+
+// Any 1 to 1.  SBPP is bytes per (1 << UVSHIFT) source pixels.
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                     \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
+      SIMD_ALIGNED(uint8 temp[128 * 2]);  /* 1 in + 1 out, 128 each */         \
+      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
+      int r = width & MASK;  /* leftover pixels */                             \
+      int n = width & ~MASK;  /* SIMD-sized part */                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
+      }                                                                        \
+      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
+      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);  /* valid bytes only */  \
+    }
+
+#ifdef HAS_COPYROW_AVX
+ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
+#endif
+#ifdef HAS_COPYROW_SSE2
+ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
+#endif
+#ifdef HAS_COPYROW_NEON
+ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)  // NOTE(review): also gates SSE2 ones.
+ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)  // NOTE(review): also gates 1555.
+ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_SSE2)
+ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_RGB24TOARGBROW_SSSE3)  // NOTE(review): also gates SSE2 ones.
+ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
+#endif
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
+ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)  // NOTE(review): gates 7 NEON wrappers.
+ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
+ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
+#endif
+#if defined(HAS_RAWTORGB24ROW_NEON)
+ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
+#endif
+#ifdef HAS_ARGBTOYROW_AVX2
+ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_AVX2
+ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_AVX2  // NOTE(review): SSE2 wrapper passes 1, 4 here.
+ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_AVX2
+ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYROW_SSSE3
+ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_SSSE3  // NOTE(review): also gates the 4 wrappers below.
+ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
+ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_SSSE3
+ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_NEON
+ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYJROW_NEON
+ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_BGRATOYROW_NEON
+ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_NEON
+ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_NEON
+ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGB24TOYROW_NEON
+ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RAWTOYROW_NEON
+ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RGB565TOYROW_NEON
+ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB1555TOYROW_NEON
+ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB4444TOYROW_NEON
+ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
+ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON  // NOTE(review): YUY2 wrapper passes 1, 4 here.
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
+ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
+ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
+ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
+#endif
+#undef ANY11
+
+// Any 1 to 1 with yuvconstants
+#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
+                 const struct YuvConstants* yuvconstants, int width) {         \
+      SIMD_ALIGNED(uint8 temp[128 * 2]);  /* 1 in + 1 out, 128 each */         \
+      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
+      int r = width & MASK;  /* leftover pixels */                             \
+      int n = width & ~MASK;  /* SIMD-sized part */                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n);                           \
+      }                                                                        \
+      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
+      ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1);                      \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);  /* valid bytes only */  \
+    }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)  // NOTE(review): also gates UYVY.
+ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)  // NOTE(review): also gates UYVY.
+ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
+ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)  // NOTE(review): also gates UYVY.
+ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 blended.  The destination is read as well as written.
+#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
+      SIMD_ALIGNED(uint8 temp[128 * 2]);  /* src copy + dst copy, 128 each */  \
+      memset(temp, 0, 128 * 2);  /* for YUY2 and msan */                       \
+      int r = width & MASK;  /* leftover pixels */                             \
+      int n = width & ~MASK;  /* SIMD-sized part */                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
+      }                                                                        \
+      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
+      memcpy(temp + 128, dst_ptr + n * BPP, r * BPP);  /* dst is also input */ \
+      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);  /* valid bytes only */  \
+    }
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
+#endif
+#undef ANY11B
+
+// Any 1 to 1 with parameter.  T is the type of the extra argument.
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                          \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
+                 T shuffler, int width) {                                      \
+      SIMD_ALIGNED(uint8 temp[64 * 2]);  /* 1 in + 1 out, 64 each */           \
+      memset(temp, 0, 64);  /* for msan */                                     \
+      int r = width & MASK;  /* leftover pixels */                             \
+      int n = width & ~MASK;  /* SIMD-sized part */                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                               \
+      }                                                                        \
+      memcpy(temp, src_ptr + n * SBPP, r * SBPP);  /* stage leftovers */       \
+      ANY_SIMD(temp, temp + 64, shuffler, MASK + 1);                           \
+      memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);  /* valid bytes only */   \
+    }
+
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
+       const uint32, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
+       const uint32, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
+       const uint32, 4, 2, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
+#endif
+#undef ANY11P
+
+// Any 1 to 1 interpolate.  Takes 2 rows of source via stride.
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                             \
+    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
+                 ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \
+      SIMD_ALIGNED(uint8 temp[64 * 3]);  /* 2 rows in + 1 out, 64 each */      \
+      memset(temp, 0, 64 * 2);  /* for msan */                                 \
+      int r = width & MASK;  /* leftover pixels */                             \
+      int n = width & ~MASK;  /* SIMD-sized part */                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction);      \
+      }                                                                        \
+      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
+      if (source_y_fraction)  /* row 2 is only read when blending */           \
+        memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP);      \
+      ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction);             \
+      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);  /* valid bytes only */  \
+    }
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
+#endif
+#ifdef HAS_INTERPOLATEROW_SSSE3
+ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_NEON
+ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_DSPR2
+ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
+#endif
+#undef ANY11T
+
+// Any 1 to 1 mirror.  BPP is bytes per pixel for both source and dest.
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                                   \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
+      SIMD_ALIGNED(uint8 temp[64 * 2]);  /* 1 in + 1 out, 64 each */           \
+      memset(temp, 0, 64);  /* for msan */                                     \
+      int r = width & MASK;  /* leftover pixels */                             \
+      int n = width & ~MASK;  /* SIMD-sized part */                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);  /* src tail -> dst head */   \
+      }                                                                        \
+      memcpy(temp, src_ptr, r * BPP);  /* first r pixels */                    \
+      ANY_SIMD(temp, temp + 64, MASK + 1);                                     \
+      memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP);    \
+    }
+
+#ifdef HAS_MIRRORROW_AVX2
+ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_SSSE3
+ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_NEON
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
+#endif
+#ifdef HAS_ARGBMIRRORROW_AVX2
+ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_SSE2
+ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
+#endif
+#ifdef HAS_ARGBMIRRORROW_NEON
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
+#endif
+#undef ANY11M
+
+// Any 1 plane. (memset)  T is the type of the fill value v32.
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)                                  \
+    void NAMEANY(uint8* dst_ptr, T v32, int width) {                           \
+      SIMD_ALIGNED(uint8 temp[64]);  /* scratch destination */                 \
+      int r = width & MASK;  /* leftover pixels */                             \
+      int n = width & ~MASK;  /* SIMD-sized part */                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(dst_ptr, v32, n);                                             \
+      }                                                                        \
+      ANY_SIMD(temp, v32, MASK + 1);                                           \
+      memcpy(dst_ptr + n * BPP, temp, r * BPP);  /* valid bytes only */        \
+    }
+
+#ifdef HAS_SETROW_X86
+ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
+#endif
+#ifdef HAS_SETROW_NEON
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
+#endif
+#ifdef HAS_ARGBSETROW_NEON
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
+#endif
+#undef ANY1
+
+// Any 1 to 2.  Outputs UV planes.  BPP is bytes per (1 << UVSHIFT) pixels.
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)                 \
+    void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
+      SIMD_ALIGNED(uint8 temp[128 * 3]);  /* 1 in + 2 out, 128 each */         \
+      memset(temp, 0, 128);  /* for msan */                                    \
+      int r = width & MASK;  /* leftover pixels */                             \
+      int n = width & ~MASK;  /* SIMD-sized part */                            \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_u, dst_v, n);                                    \
+      }                                                                        \
+      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
+      /* repeat last 4 bytes for 422 subsampler */                             \
+      if ((width & 1) && BPP == 4 && DUVSHIFT == 1) {                          \
+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
+      }                                                                        \
+      /* repeat last 4 - 12 bytes for 411 subsampler */                        \
+      if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) {                   \
+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
+        memcpy(temp + SS(r, UVSHIFT) * BPP + BPP,                              \
+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2);                    \
+      }                                                                        \
+      if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) {                   \
+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
+               temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2);                \
+      }                                                                        \
+      if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) {                   \
+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
+      }                                                                        \
+      ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                        \
+      memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));            \
+      memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));            \
+    }
+
+#ifdef HAS_SPLITUVROW_SSE2
+ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_DSPR2
+ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
+ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
+ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
+#endif
+#undef ANY12
+
+// Any 1 to 2 with source stride (2 rows of source).  Outputs UV planes.
+// 128 byte row allows for 32 avx ARGB pixels; temp rows are 128 bytes apart.
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                          \
+    void NAMEANY(const uint8* src_ptr, int src_stride_ptr,                     \
+                 uint8* dst_u, uint8* dst_v, int width) {                      \
+      SIMD_ALIGNED(uint8 temp[128 * 4]);                                       \
+      memset(temp, 0, 128 * 2);  /* for msan */                                \
+      int r = width & MASK;  /* pixels left over after whole groups */         \
+      int n = width & ~MASK;  /* pixels handled by full SIMD groups */         \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);                    \
+      }                                                                        \
+      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
+      memcpy(temp + 128, src_ptr  + src_stride_ptr + (n >> UVSHIFT) * BPP,     \
+             SS(r, UVSHIFT) * BPP);                                            \
+      if ((width & 1) && UVSHIFT == 0) {  /* repeat last pixel for subsample */\
+        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
+               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
+        memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
+               temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
+      }                                                                        \
+      ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);  /* tail */       \
+      memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
+      memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
+    }
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_AVX2
+ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
+ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
+ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_AVX2
+ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
+ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_SSE2
+ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
+ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_NEON
+ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_NEON
+ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_NEON
+ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_NEON
+ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_NEON
+ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_NEON
+ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_NEON
+ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_NEON
+ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_NEON
+ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB4444TOUVROW_NEON
+ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
+ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
+ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
+#endif
+#undef ANY12S
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/row_common.cc b/libs/libyuv/source/row_common.cc
new file mode 100644
index 0000000000..2b80d074ce
--- /dev/null
+++ b/libs/libyuv/source/row_common.cc
@@ -0,0 +1,2614 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h>  // For memcpy and memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// llvm x86 generates poor code for ternaries, so use branchless min/max.
+
+#define USE_BRANCHLESS 1
+#if USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+  return v & (-v >> 31);  // With an arithmetic shift the mask is set for v > 0.
+}
+
+static __inline int32 clamp255(int32 v) {
+  return (v | ((255 - v) >> 31)) & 255;  // Saturates to 255 once v > 255.
+}
+
+static __inline uint32 Clamp(int32 val) {
+  // Floor at 0 first, then cap at 255.
+  return (uint32)(clamp255(clamp0(val)));
+}
+
+static __inline uint32 Abs(int32 v) {
+  const int sign = v >> 31;  // -1 for negative v under an arithmetic shift.
+  return (v + sign) ^ sign;
+}
+#else  // USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+  return (v >= 0) ? v : 0;
+}
+
+static __inline int32 clamp255(int32 v) {
+  return (v <= 255) ? v : 255;
+}
+
+static __inline uint32 Clamp(int32 val) {
+  // Floor at 0 first, then cap at 255.
+  return (uint32)(clamp255(clamp0(val)));
+}
+
+static __inline uint32 Abs(int32 v) {
+  return (v >= 0) ? v : -v;
+}
+#endif  // USE_BRANCHLESS
+
+#ifdef LIBYUV_LITTLE_ENDIAN  // Direct 32-bit store.
+#define WRITEWORD(p, v) *(uint32*)(p) = v
+#else  // Byte-wise store, least-significant byte first.
+static inline void WRITEWORD(uint8* p, uint32 v) {
+  int i;
+  for (i = 0; i < 4; ++i) {
+    p[i] = (uint8)((v >> (i * 8)) & 255);
+  }
+}
+#endif
+
+// Expands 3-byte B,G,R pixels to 4-byte B,G,R,A pixels with alpha set to 255.
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8 blue = src_rgb24[0];
+    const uint8 green = src_rgb24[1];
+    const uint8 red = src_rgb24[2];
+    *dst_argb++ = blue;
+    *dst_argb++ = green;
+    *dst_argb++ = red;
+    *dst_argb++ = 255u;
+    src_rgb24 += 3;
+  }
+}
+
+// Expands 3-byte R,G,B pixels to 4-byte B,G,R,A pixels with alpha set to 255.
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+  for (; width > 0; --width) {
+    const uint8 red = src_raw[0];
+    const uint8 green = src_raw[1];
+    const uint8 blue = src_raw[2];
+    dst_argb[0] = blue;
+    dst_argb[1] = green;
+    dst_argb[2] = red;
+    dst_argb[3] = 255u;
+    src_raw += 3;
+    dst_argb += 4;
+  }
+}
+
+// Swaps the first and third byte of every 3-byte pixel (R,G,B -> B,G,R).
+void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8 red = src_raw[0];
+    const uint8 green = src_raw[1];
+    const uint8 blue = src_raw[2];
+    *dst_rgb24++ = blue;
+    *dst_rgb24++ = green;
+    *dst_rgb24++ = red;
+    src_raw += 3;
+  }
+}
+
+// Widens 5:6:5 pixels to 8-bit B,G,R,A by replicating each field's top bits.
+void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+  for (; width > 0; --width, src_rgb565 += 2, dst_argb += 4) {
+    const uint8 lo = src_rgb565[0];
+    const uint8 hi = src_rgb565[1];
+    const uint8 b5 = lo & 0x1f;
+    const uint8 g6 = (lo >> 5) | ((hi & 0x07) << 3);
+    const uint8 r5 = hi >> 3;
+    dst_argb[0] = (b5 << 3) | (b5 >> 2);
+    dst_argb[1] = (g6 << 2) | (g6 >> 4);
+    dst_argb[2] = (r5 << 3) | (r5 >> 2);
+    dst_argb[3] = 255u;
+  }
+}
+
+// Widens 1:5:5:5 pixels to 8-bit B,G,R,A; the alpha bit becomes 0 or 255.
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+                         int width) {
+  for (; width > 0; --width, src_argb1555 += 2, dst_argb += 4) {
+    const uint8 lo = src_argb1555[0];
+    const uint8 hi = src_argb1555[1];
+    const uint8 b5 = lo & 0x1f;
+    const uint8 g5 = (lo >> 5) | ((hi & 0x03) << 3);
+    const uint8 r5 = (hi & 0x7c) >> 2;
+    const uint8 a1 = hi >> 7;
+    dst_argb[0] = (b5 << 3) | (b5 >> 2);
+    dst_argb[1] = (g5 << 3) | (g5 >> 2);
+    dst_argb[2] = (r5 << 3) | (r5 >> 2);
+    dst_argb[3] = -a1;
+  }
+}
+
+// Widens 4:4:4:4 pixels to 8-bit B,G,R,A by duplicating each nibble.
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+                         int width) {
+  for (; width > 0; --width, src_argb4444 += 2, dst_argb += 4) {
+    const uint8 lo = src_argb4444[0];
+    const uint8 hi = src_argb4444[1];
+    const uint8 b4 = lo & 0x0f;
+    const uint8 g4 = lo >> 4;
+    const uint8 r4 = hi & 0x0f;
+    const uint8 a4 = hi >> 4;
+    dst_argb[0] = (b4 << 4) | b4;
+    dst_argb[1] = (g4 << 4) | g4;
+    dst_argb[2] = (r4 << 4) | r4;
+    dst_argb[3] = (a4 << 4) | a4;
+  }
+}
+
+// Drops the fourth byte: copies the first three bytes of each 4-byte pixel.
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8 blue = src_argb[0];
+    const uint8 green = src_argb[1];
+    const uint8 red = src_argb[2];
+    *dst_rgb++ = blue;
+    *dst_rgb++ = green;
+    *dst_rgb++ = red;
+    src_argb += 4;
+  }
+}
+
+// Writes each 4-byte B,G,R,A pixel as 3 bytes in R,G,B order (A is dropped).
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  for (; width > 0; --width) {
+    const uint8 blue = src_argb[0];
+    const uint8 green = src_argb[1];
+    const uint8 red = src_argb[2];
+    dst_rgb[0] = red;
+    dst_rgb[1] = green;
+    dst_rgb[2] = blue;
+    src_argb += 4;
+    dst_rgb += 3;
+  }
+}
+
+// Packs 8-bit B,G,R,A pixels into 5:6:5 words, handling two pixels per step.
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2, src_argb += 8, dst_rgb += 4) {
+    const uint8 b0 = src_argb[0] >> 3;
+    const uint8 g0 = src_argb[1] >> 2;
+    const uint8 r0 = src_argb[2] >> 3;
+    const uint8 b1 = src_argb[4] >> 3;
+    const uint8 g1 = src_argb[5] >> 2;
+    const uint8 r1 = src_argb[6] >> 3;
+    // First pixel occupies bits 0-15 of the word, second pixel bits 16-31.
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+              (b1 << 16) | (g1 << 21) | (r1 << 27));
+  }
+  if (width & 1) {  // Odd width: one unpaired pixel is stored as 16 bits.
+    const uint8 b0 = src_argb[0] >> 3;
+    const uint8 g0 = src_argb[1] >> 2;
+    const uint8 r0 = src_argb[2] >> 3;
+    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
+
+// dither4 is a row of 4 values from 4x4 dither matrix.
+// The 4x4 matrix contains values to increase RGB.  When converting to
+// fewer bits (565) this provides an ordered dither.
+// The order in the 4x4 matrix in first byte is upper left.
+// The 4 values are passed as an int, then referenced as an array, so
+// endian will not affect order of the original matrix.  But the dither4
+// will contain the first pixel in the lower byte for little endian
+// or the upper byte for big endian.
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+                             const uint32 dither4, int width) {
+  // View the packed dither row as four bytes indexed by pixel column.
+  const unsigned char* const dither = (const unsigned char*)(&dither4);
+  int i;
+  for (i = 0; i < width - 1; i += 2, src_argb += 8, dst_rgb += 4) {
+    const int d0 = dither[i & 3];
+    const int d1 = dither[(i + 1) & 3];
+    const uint8 b0 = clamp255(src_argb[0] + d0) >> 3;
+    const uint8 g0 = clamp255(src_argb[1] + d0) >> 2;
+    const uint8 r0 = clamp255(src_argb[2] + d0) >> 3;
+    const uint8 b1 = clamp255(src_argb[4] + d1) >> 3;
+    const uint8 g1 = clamp255(src_argb[5] + d1) >> 2;
+    const uint8 r1 = clamp255(src_argb[6] + d1) >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+              (b1 << 16) | (g1 << 21) | (r1 << 27));
+  }
+  if (width & 1) {
+    const int d0 = dither[(width - 1) & 3];
+    const uint8 b0 = clamp255(src_argb[0] + d0) >> 3;
+    const uint8 g0 = clamp255(src_argb[1] + d0) >> 2;
+    const uint8 r0 = clamp255(src_argb[2] + d0) >> 3;
+    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
+
+// Packs 8-bit B,G,R,A into 1:5:5:5; alpha keeps only its top bit.
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2, src_argb += 8, dst_rgb += 4) {
+    const uint8 b0 = src_argb[0] >> 3;
+    const uint8 g0 = src_argb[1] >> 3;
+    const uint8 r0 = src_argb[2] >> 3;
+    const uint8 a0 = src_argb[3] >> 7;
+    const uint8 b1 = src_argb[4] >> 3;
+    const uint8 g1 = src_argb[5] >> 3;
+    const uint8 r1 = src_argb[6] >> 3;
+    const uint8 a1 = src_argb[7] >> 7;
+    // First pixel occupies bits 0-15 of the word, second pixel bits 16-31.
+    *(uint32*)(dst_rgb) =
+        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+        (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+  }
+  if (width & 1) {
+    const uint8 b0 = src_argb[0] >> 3;
+    const uint8 g0 = src_argb[1] >> 3;
+    const uint8 r0 = src_argb[2] >> 3;
+    const uint8 a0 = src_argb[3] >> 7;
+    *(uint16*)(dst_rgb) =
+        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+  }
+}
+
+// Packs 8-bit B,G,R,A into 4:4:4:4 by keeping the high nibble of each byte.
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2, src_argb += 8, dst_rgb += 4) {
+    const uint8 b0 = src_argb[0] >> 4;
+    const uint8 g0 = src_argb[1] >> 4;
+    const uint8 r0 = src_argb[2] >> 4;
+    const uint8 a0 = src_argb[3] >> 4;
+    const uint8 b1 = src_argb[4] >> 4;
+    const uint8 g1 = src_argb[5] >> 4;
+    const uint8 r1 = src_argb[6] >> 4;
+    const uint8 a1 = src_argb[7] >> 4;
+    // First pixel occupies bits 0-15 of the word, second pixel bits 16-31.
+    *(uint32*)(dst_rgb) =
+        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
+        (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+  }
+  if (width & 1) {
+    const uint8 b0 = src_argb[0] >> 4;
+    const uint8 g0 = src_argb[1] >> 4;
+    const uint8 r0 = src_argb[2] >> 4;
+    const uint8 a0 = src_argb[3] >> 4;
+    *(uint16*)(dst_rgb) =
+        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+  }
+}
+
+// 8.8 fixed point.  Biases: 0x1080 = 16.5 * 256 and 0x8080 = 128.5 * 256.
+static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+  return (0x1080 + 66 * r + 129 * g + 25 * b) >> 8;
+}
+static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+  return (0x8080 - 38 * r - 74 * g + 112 * b) >> 8;
+}
+static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+  return (0x8080 + 112 * r - 94 * g - 18 * b) >> 8;
+}
+// MAKEROWY emits NAME##ToYRow_C and NAME##ToUVRow_C for one pixel layout.
+#define MAKEROWY(NAME, R, G, B, BPP) \
+void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
+  int x;                                                                       \
+  for (x = 0; x < width; ++x) {                                                \
+    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
+    src_argb0 += BPP;                                                          \
+    dst_y += 1;                                                                \
+  }                                                                            \
+}                                                                              \
+void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
+                       uint8* dst_u, uint8* dst_v, int width) {                \
+  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;  /* second row */         \
+  int x;                                                                       \
+  for (x = 0; x < width - 1; x += 2) {  /* one UV per 2x2 block */             \
+    uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
+               src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
+    uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
+               src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
+    uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
+               src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
+    dst_u[0] = RGBToU(ar, ag, ab);                                             \
+    dst_v[0] = RGBToV(ar, ag, ab);                                             \
+    src_rgb0 += BPP * 2;                                                       \
+    src_rgb1 += BPP * 2;                                                       \
+    dst_u += 1;                                                                \
+    dst_v += 1;                                                                \
+  }                                                                            \
+  if (width & 1) {  /* odd width: last column is 1x2 */                        \
+    uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
+    uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
+    uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
+    dst_u[0] = RGBToU(ar, ag, ab);                                             \
+    dst_v[0] = RGBToV(ar, ag, ab);                                             \
+  }                                                                            \
+}
+
+MAKEROWY(ARGB, 2, 1, 0, 4)  // Args: R, G, B byte offsets, then bytes per pixel.
+MAKEROWY(BGRA, 1, 2, 3, 4)
+MAKEROWY(ABGR, 0, 1, 2, 4)
+MAKEROWY(RGBA, 3, 2, 1, 4)
+MAKEROWY(RGB24, 2, 1, 0, 3)
+MAKEROWY(RAW, 0, 1, 2, 3)
+#undef MAKEROWY
+
+// JPeg uses a variation on BT.601-1 full range
+// y =  0.29900 * r + 0.58700 * g + 0.11400 * b
+// u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
+// v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
+// BT.601 Mpeg range uses:
+// b 0.1016 * 255 = 25.908 = 25
+// g 0.5078 * 255 = 129.489 = 129
+// r 0.2578 * 255 = 65.739 = 66
+// JPeg 8 bit Y (not used):
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
+// JPeg 7 bit Y:
+// b 0.11400 * 128 = 14.592 = 15
+// g 0.58700 * 128 = 75.136 = 75
+// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit U:
+// b  0.50000 * 255 = 127.5 = 127
+// g -0.33126 * 255 = -84.4713 = -84
+// r -0.16874 * 255 = -43.0287 = -43
+// JPeg 8 bit V:
+// b -0.08131 * 255 = -20.73405 = -20
+// g -0.41869 * 255 = -106.76595 = -107
+// r  0.50000 * 255 = 127.5 = 127
+
+// JPeg variants: luma uses 7-bit weights (>> 7), chroma 8-bit weights (>> 8).
+static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
+  return (64 + 38 * r + 75 * g + 15 * b) >> 7;
+}
+static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+  return (0x8080 - 43 * r - 84 * g + 127 * b) >> 8;
+}
+static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+  return (0x8080 + 127 * r - 107 * g - 20 * b) >> 8;
+}
+
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)  // Average of two, rounding up.
+// MAKEROWYJ emits NAME##ToYJRow_C and NAME##ToUVJRow_C for one pixel layout.
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
+  int x;                                                                       \
+  for (x = 0; x < width; ++x) {                                                \
+    dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
+    src_argb0 += BPP;                                                          \
+    dst_y += 1;                                                                \
+  }                                                                            \
+}                                                                              \
+void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
+                        uint8* dst_u, uint8* dst_v, int width) {               \
+  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;  /* second row */         \
+  int x;                                                                       \
+  for (x = 0; x < width - 1; x += 2) {  /* one UV per 2x2 block */             \
+    uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
+                    AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
+    uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
+                    AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
+    uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
+                    AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
+    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
+    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
+    src_rgb0 += BPP * 2;                                                       \
+    src_rgb1 += BPP * 2;                                                       \
+    dst_u += 1;                                                                \
+    dst_v += 1;                                                                \
+  }                                                                            \
+  if (width & 1) {  /* odd width: last column is 1x2 */                        \
+    uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
+    uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
+    uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
+    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
+    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
+  }                                                                            \
+}
+
+MAKEROWYJ(ARGB, 2, 1, 0, 4)
+#undef MAKEROWYJ
+
+// Writes one luma byte per 5:6:5 pixel, widening each field to 8 bits first.
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8 b5 = src_rgb565[0] & 0x1f;
+    const uint8 g6 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    const uint8 r5 = src_rgb565[1] >> 3;
+    const uint8 b8 = (b5 << 3) | (b5 >> 2);
+    const uint8 g8 = (g6 << 2) | (g6 >> 4);
+    const uint8 r8 = (r5 << 3) | (r5 >> 2);
+    *dst_y++ = RGBToY(r8, g8, b8);
+    src_rgb565 += 2;
+  }
+}
+
+// Writes one luma byte per 1:5:5:5 pixel; the alpha bit is ignored.
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
+  for (; width > 0; --width, src_argb1555 += 2) {
+    const uint8 lo = src_argb1555[0];
+    const uint8 hi = src_argb1555[1];
+    const uint8 b5 = lo & 0x1f;
+    const uint8 g5 = (lo >> 5) | ((hi & 0x03) << 3);
+    const uint8 r5 = (hi & 0x7c) >> 2;
+    const uint8 b8 = (b5 << 3) | (b5 >> 2);
+    const uint8 g8 = (g5 << 3) | (g5 >> 2);
+    const uint8 r8 = (r5 << 3) | (r5 >> 2);
+    *dst_y++ = RGBToY(r8, g8, b8);
+  }
+}
+
+// Writes one luma byte per 4:4:4:4 pixel; the alpha nibble is ignored.
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8 b4 = src_argb4444[0] & 0x0f;
+    const uint8 g4 = src_argb4444[0] >> 4;
+    const uint8 r4 = src_argb4444[1] & 0x0f;
+    const uint8 b8 = (b4 << 4) | b4;
+    const uint8 g8 = (g4 << 4) | g4;
+    const uint8 r8 = (r4 << 4) | r4;
+    dst_y[i] = RGBToY(r8, g8, b8);
+    src_argb4444 += 2;
+  }
+}
+
+// Converts 5:6:5 pixels from two rows into one U and one V per 2x2 block.
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+                     uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* row0 = src_rgb565;
+  const uint8* row1 = src_rgb565 + src_stride_rgb565;
+  int i;
+  for (i = 0; i < width - 1; i += 2, row0 += 4, row1 += 4) {
+    // Two neighbouring pixels of the first row...
+    const uint8 b00 = row0[0] & 0x1f;
+    const uint8 g00 = (row0[0] >> 5) | ((row0[1] & 0x07) << 3);
+    const uint8 r00 = row0[1] >> 3;
+    const uint8 b01 = row0[2] & 0x1f;
+    const uint8 g01 = (row0[2] >> 5) | ((row0[3] & 0x07) << 3);
+    const uint8 r01 = row0[3] >> 3;
+    // ...and the pixels at the same columns of the second row.
+    const uint8 b10 = row1[0] & 0x1f;
+    const uint8 g10 = (row1[0] >> 5) | ((row1[1] & 0x07) << 3);
+    const uint8 r10 = row1[1] >> 3;
+    const uint8 b11 = row1[2] & 0x1f;
+    const uint8 g11 = (row1[2] >> 5) | ((row1[3] & 0x07) << 3);
+    const uint8 r11 = row1[3] >> 3;
+    uint8 sum_b = (b00 + b01 + b10 + b11);  // 565 * 4 = 787.
+    uint8 sum_g = (g00 + g01 + g10 + g11);
+    uint8 sum_r = (r00 + r01 + r10 + r11);
+    sum_b = (sum_b << 1) | (sum_b >> 6);  // 787 -> 888.
+    sum_r = (sum_r << 1) | (sum_r >> 6);
+    *dst_u++ = RGBToU(sum_r, sum_g, sum_b);
+    *dst_v++ = RGBToV(sum_r, sum_g, sum_b);
+  }
+  if (width & 1) {  // Odd width: the last column has only its two rows.
+    const uint8 b00 = row0[0] & 0x1f;
+    const uint8 g00 = (row0[0] >> 5) | ((row0[1] & 0x07) << 3);
+    const uint8 r00 = row0[1] >> 3;
+    const uint8 b10 = row1[0] & 0x1f;
+    const uint8 g10 = (row1[0] >> 5) | ((row1[1] & 0x07) << 3);
+    const uint8 r10 = row1[1] >> 3;
+    uint8 sum_b = (b00 + b10);  // 565 * 2 = 676.
+    uint8 sum_g = (g00 + g10);
+    uint8 sum_r = (r00 + r10);
+    sum_b = (sum_b << 2) | (sum_b >> 4);  // 676 -> 888
+    sum_g = (sum_g << 1) | (sum_g >> 6);
+    sum_r = (sum_r << 2) | (sum_r >> 4);
+    dst_u[0] = RGBToU(sum_r, sum_g, sum_b);
+    dst_v[0] = RGBToV(sum_r, sum_g, sum_b);
+  }
+}
+
+// Converts 1:5:5:5 pixels from two rows into one U and one V per 2x2 block.
+// For an odd width the last column is summed over its two rows only.
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
+  int x;
+  for (x = 0; x < width - 1; x += 2, src_argb1555 += 4, next_argb1555 += 4) {
+    uint8 b0 = src_argb1555[0] & 0x1f;
+    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 b1 = src_argb1555[2] & 0x1f;
+    uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
+    uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
+    uint8 b2 = next_argb1555[0] & 0x1f;
+    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
+    uint8 b3 = next_argb1555[2] & 0x1f;
+    uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
+    uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
+    uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
+    uint8 g = (g0 + g1 + g2 + g3);
+    uint8 r = (r0 + r1 + r2 + r3);
+    b = (b << 1) | (b >> 6);  // 777 -> 888.
+    g = (g << 1) | (g >> 6);
+    r = (r << 1) | (r >> 6);
+    *dst_u++ = RGBToU(r, g, b);
+    *dst_v++ = RGBToV(r, g, b);
+  }
+  if (width & 1) {
+    uint8 b0 = src_argb1555[0] & 0x1f;
+    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 b2 = next_argb1555[0] & 0x1f;
+    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    // Red is bits 2-6 of the high byte, so mask and shift by 2 as for r0.
+    // A plain >> 3 would pull in the alpha bit and misalign the field.
+    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
+    uint8 b = (b0 + b2);  // 555 * 2 = 666.
+    uint8 g = (g0 + g2);
+    uint8 r = (r0 + r2);
+    b = (b << 2) | (b >> 4);  // 666 -> 888.
+    g = (g << 2) | (g >> 4);
+    r = (r << 2) | (r >> 4);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+  }
+}
+
+// Converts 4:4:4:4 pixels from two rows into one U and one V per 2x2 block.
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* row0 = src_argb4444;
+  const uint8* row1 = src_argb4444 + src_stride_argb4444;
+  int i;
+  for (i = 0; i < width - 1; i += 2, row0 += 4, row1 += 4) {
+    // Two neighbouring pixels of the first row...
+    const uint8 b00 = row0[0] & 0x0f;
+    const uint8 g00 = row0[0] >> 4;
+    const uint8 r00 = row0[1] & 0x0f;
+    const uint8 b01 = row0[2] & 0x0f;
+    const uint8 g01 = row0[2] >> 4;
+    const uint8 r01 = row0[3] & 0x0f;
+    // ...and the pixels at the same columns of the second row.
+    const uint8 b10 = row1[0] & 0x0f;
+    const uint8 g10 = row1[0] >> 4;
+    const uint8 r10 = row1[1] & 0x0f;
+    const uint8 b11 = row1[2] & 0x0f;
+    const uint8 g11 = row1[2] >> 4;
+    const uint8 r11 = row1[3] & 0x0f;
+    uint8 sum_b = (b00 + b01 + b10 + b11);  // 444 * 4 = 666.
+    uint8 sum_g = (g00 + g01 + g10 + g11);
+    uint8 sum_r = (r00 + r01 + r10 + r11);
+    sum_b = (sum_b << 2) | (sum_b >> 4);  // 666 -> 888.
+    sum_g = (sum_g << 2) | (sum_g >> 4);
+    sum_r = (sum_r << 2) | (sum_r >> 4);
+    *dst_u++ = RGBToU(sum_r, sum_g, sum_b);
+    *dst_v++ = RGBToV(sum_r, sum_g, sum_b);
+  }
+  if (width & 1) {  // Odd width: the last column has only its two rows.
+    const uint8 b00 = row0[0] & 0x0f;
+    const uint8 g00 = row0[0] >> 4;
+    const uint8 r00 = row0[1] & 0x0f;
+    const uint8 b10 = row1[0] & 0x0f;
+    const uint8 g10 = row1[0] >> 4;
+    const uint8 r10 = row1[1] & 0x0f;
+    uint8 sum_b = (b00 + b10);  // 444 * 2 = 555.
+    uint8 sum_g = (g00 + g10);
+    uint8 sum_r = (r00 + r10);
+    sum_b = (sum_b << 3) | (sum_b >> 2);  // 555 -> 888.
+    sum_g = (sum_g << 3) | (sum_g >> 2);
+    sum_r = (sum_r << 3) | (sum_r >> 2);
+    dst_u[0] = RGBToU(sum_r, sum_g, sum_b);
+    dst_v[0] = RGBToV(sum_r, sum_g, sum_b);
+  }
+}
+
+// Produces one U and one V sample for every 4-byte B,G,R,A pixel, with no
+// horizontal or vertical averaging.
+void ARGBToUV444Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8 blue = src_argb[0];
+    const uint8 green = src_argb[1];
+    const uint8 red = src_argb[2];
+    src_argb += 4;
+    *dst_u++ = RGBToU(red, green, blue);
+    *dst_v++ = RGBToV(red, green, blue);
+  }
+}
+
+// Produces one U and one V sample for every four horizontal B,G,R,A pixels.
+// A final group of one to three pixels yields one extra U,V pair.
+void ARGBToUV411Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  uint8 ab, ag, ar;
+  const int tail = width & 3;
+  int i;
+  for (i = 0; i < width - 3; i += 4, src_argb += 16) {
+    ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
+    ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
+    ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
+    *dst_u++ = RGBToU(ar, ag, ab);
+    *dst_v++ = RGBToV(ar, ag, ab);
+  }
+  if (tail == 0) {
+    return;
+  }
+  // Odd width handling mimics 'any' function which replicates last pixel.
+  if (tail == 3) {
+    ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2;
+    ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2;
+    ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2;
+  } else if (tail == 2) {
+    ab = (src_argb[0] + src_argb[4]) >> 1;
+    ag = (src_argb[1] + src_argb[5]) >> 1;
+    ar = (src_argb[2] + src_argb[6]) >> 1;
+  } else {
+    ab = src_argb[0];
+    ag = src_argb[1];
+    ar = src_argb[2];
+  }
+  dst_u[0] = RGBToU(ar, ag, ab);
+  dst_v[0] = RGBToV(ar, ag, ab);
+}
+
+// Replaces B, G and R with the pixel's RGBToYJ luma; the 4th byte is copied.
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  for (; width > 0; --width, src_argb += 4, dst_argb += 4) {
+    const uint8 luma = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
+    dst_argb[0] = luma;
+    dst_argb[1] = luma;
+    dst_argb[2] = luma;
+    dst_argb[3] = src_argb[3];
+  }
+}
+
+// Convert a row of ARGB pixels to sepia tone, in place.
+void ARGBSepiaRow_C(uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i, dst_argb += 4) {
+    const int blue = dst_argb[0];
+    const int green = dst_argb[1];
+    const int red = dst_argb[2];
+    // Weighted sums in 7 bit fixed point. The blue weights total 120 (< 128),
+    // so blue cannot exceed 255 and is stored unclamped. Alpha is not written.
+    const int sepia_b = (blue * 17 + green * 68 + red * 35) >> 7;
+    const int sepia_g = (blue * 22 + green * 88 + red * 45) >> 7;
+    const int sepia_r = (blue * 24 + green * 98 + red * 50) >> 7;
+    dst_argb[0] = sepia_b;
+    dst_argb[1] = clamp255(sepia_g);
+    dst_argb[2] = clamp255(sepia_r);
+  }
+}
+
+// Apply color matrix to a row of image. Matrix is signed.
+// matrix_argb holds 16 coefficients: entries 0-3 produce B, 4-7 produce G,
+// 8-11 produce R and 12-15 produce A, each weighting source B, G, R, A in
+// that order. Every sum is shifted right by 6 and passed through Clamp().
+// TODO(fbarchard): Consider adding rounding (+32).
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+                          const int8* matrix_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    // All 4 source channels are read before any output byte is written.
+    const int b = src_argb[0];
+    const int g = src_argb[1];
+    const int r = src_argb[2];
+    const int a = src_argb[3];
+    int ch;
+    for (ch = 0; ch < 4; ++ch) {
+      // Row ch of the matrix produces output channel ch.
+      const int8* coeff = matrix_argb + ch * 4;
+      const int sum = b * coeff[0] + g * coeff[1] +
+                      r * coeff[2] + a * coeff[3];
+      dst_argb[ch] = Clamp(sum >> 6);
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+// Apply color table to a row of image, in place.
+// table_argb is read at index value * 4 + channel, i.e. 4 interleaved
+// per-channel lookup columns in B, G, R, A order.
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    // Capture the 4 original channel values before overwriting them.
+    const int in[4] = { dst_argb[0], dst_argb[1], dst_argb[2], dst_argb[3] };
+    int ch;
+    for (ch = 0; ch < 4; ++ch) {
+      dst_argb[ch] = table_argb[in[ch] * 4 + ch];
+    }
+    dst_argb += 4;
+  }
+}
+
+// Apply color table to the B, G and R channels of a row of image, in place.
+// table_argb is read at index value * 4 + channel; alpha is not modified.
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const int in[3] = { dst_argb[0], dst_argb[1], dst_argb[2] };
+    int ch;
+    for (ch = 0; ch < 3; ++ch) {
+      dst_argb[ch] = table_argb[in[ch] * 4 + ch];
+    }
+    dst_argb += 4;
+  }
+}
+
+// Quantize the B, G and R channels of a row of ARGB in place. Each value v
+// becomes (v * scale >> 16) * interval_size + interval_offset.
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+                       int interval_offset, int width) {
+  int x;
+  for (x = 0; x < width; ++x, dst_argb += 4) {
+    int ch;
+    for (ch = 0; ch < 3; ++ch) {  // Byte 3 (alpha) is not modified.
+      const int level = dst_argb[ch] * scale >> 16;
+      dst_argb[ch] = level * interval_size + interval_offset;
+    }
+  }
+}
+
+// Scale each channel of a row of ARGB by the matching byte of 'value' (bits
+// 0-7: B, 8-15: G, 16-23: R, 24-31: A). Bytes are widened to 16 bits by
+// replication (v | v << 8); bits 24 and up of the 32 bit product are stored.
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v * f >> 24
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                    uint32 value) {
+  uint32 scale[4];
+  int ch, i;
+  for (ch = 0; ch < 4; ++ch) {
+    scale[ch] = REPEAT8((value >> (ch * 8)) & 0xff);
+  }
+  for (i = 0; i < width; ++i, src_argb += 4, dst_argb += 4) {
+    uint32 pixel[4];
+    // Read the whole source pixel before writing any output byte.
+    for (ch = 0; ch < 4; ++ch) {
+      pixel[ch] = REPEAT8(src_argb[ch]);
+    }
+    for (ch = 0; ch < 4; ++ch) {
+      dst_argb[ch] = SHADE(pixel[ch], scale[ch]);
+    }
+  }
+}
+#undef REPEAT8
+#undef SHADE
+
+// Multiply two rows of ARGB channel by channel. Each src_argb0 byte is
+// widened to 16 bits by replication (v | v << 8) and multiplied by the
+// matching src_argb1 byte; bits 16 and up of the product are stored.
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v * f >> 16
+
+void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    uint32 wide[4], scale[4];
+    int ch;
+    for (ch = 0; ch < 4; ++ch) {
+      wide[ch] = REPEAT8(src_argb0[ch]);
+      scale[ch] = src_argb1[ch];
+    }
+    for (ch = 0; ch < 4; ++ch) {
+      dst_argb[ch] = SHADE(wide[ch], scale[ch]);
+    }
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+#undef REPEAT8
+#undef SHADE
+
+// Add two rows of ARGB channel by channel; each sum goes through clamp255().
+#define SHADE(f, v) clamp255(v + f)
+
+void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                  uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int lhs[4];
+    int rhs[4];
+    int ch;
+    // Read both source pixels completely before writing the result.
+    for (ch = 0; ch < 4; ++ch) {
+      lhs[ch] = src_argb0[ch];
+      rhs[ch] = src_argb1[ch];
+    }
+    for (ch = 0; ch < 4; ++ch) {
+      dst_argb[ch] = SHADE(lhs[ch], rhs[ch]);
+    }
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+#undef SHADE
+
+// Subtract src_argb1 from src_argb0 channel by channel, through clamp0().
+#define SHADE(f, v) clamp0(f - v)
+
+void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int lhs[4];
+    int rhs[4];
+    int ch;
+    // Read both source pixels completely before writing the result.
+    for (ch = 0; ch < 4; ++ch) {
+      lhs[ch] = src_argb0[ch];
+      rhs[ch] = src_argb1[ch];
+    }
+    for (ch = 0; ch < 4; ++ch) {
+      dst_argb[ch] = SHADE(lhs[ch], rhs[ch]);
+    }
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+#undef SHADE
+
+// Sobel functions which mimic SSSE3.
+// SobelX: for each column i, column i + 2 is subtracted from column i in
+// each of 3 rows, the middle row is weighted by 2, and the absolute value of
+// the total is stored after clamp255().
+// Each source row is read up to index width + 1.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+                 uint8* dst_sobelx, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    // Column i minus column i + 2, per row.
+    const int top = src_y0[i] - src_y0[i + 2];
+    const int middle = src_y1[i] - src_y1[i + 2];
+    const int bottom = src_y2[i] - src_y2[i + 2];
+    // The middle row counts double.
+    const int gradient = Abs(top + middle * 2 + bottom);
+    dst_sobelx[i] = (uint8)(clamp255(gradient));
+  }
+}
+
+// SobelY: for each column i, src_y1 is subtracted from src_y0 at columns
+// i, i + 1 and i + 2, the center column is weighted by 2, and the absolute
+// value of the total is stored after clamp255().
+// Each source row is read up to index width + 1.
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+                 uint8* dst_sobely, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    // First row minus second row, per column.
+    const int left = src_y0[i + 0] - src_y1[i + 0];
+    const int center = src_y0[i + 1] - src_y1[i + 1];
+    const int right = src_y0[i + 2] - src_y1[i + 2];
+    // The center column counts double.
+    const int gradient = Abs(left + center * 2 + right);
+    dst_sobely[i] = (uint8)(clamp255(gradient));
+  }
+}
+
+// Combine Sobel X and Y planes into ARGB: B, G and R all receive
+// clamp255(sobelx + sobely) and alpha is set to 255.
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8 magnitude = (uint8)(clamp255(src_sobelx[i] + src_sobely[i]));
+    dst_argb[0] = magnitude;
+    dst_argb[1] = magnitude;
+    dst_argb[2] = magnitude;
+    dst_argb[3] = (uint8)(255u);
+    dst_argb += 4;
+  }
+}
+
+// Combine Sobel X and Y planes into one plane: dst_y[i] receives
+// clamp255(src_sobelx[i] + src_sobely[i]).
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_y, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int sum = src_sobelx[i] + src_sobely[i];
+    dst_y[i] = (uint8)(clamp255(sum));
+  }
+}
+
+// Interleave Sobel X and Y planes into ARGB: B = sobely, R = sobelx,
+// G = clamp255(sobelx + sobely), A = 255.
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                  uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i, dst_argb += 4) {
+    const int gx = src_sobelx[i];
+    const int gy = src_sobely[i];
+    dst_argb[0] = (uint8)(gy);
+    dst_argb[1] = (uint8)(clamp255(gx + gy));
+    dst_argb[2] = (uint8)(gx);
+    dst_argb[3] = (uint8)(255u);
+  }
+}
+
+// Expand a row of gray samples to ARGB: Y is copied to B, G, R; alpha is 255.
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x, ++src_y, dst_argb += 4) {
+    const uint8 gray = *src_y;
+    dst_argb[0] = gray;
+    dst_argb[1] = gray;
+    dst_argb[2] = gray;
+    dst_argb[3] = 255u;
+  }
+}
+
+// TODO(fbarchard): Unify these structures to be platform independent.
+// TODO(fbarchard): Generate SIMD structures from float matrix.
+
+// BT.601 YUV to RGB reference (macros below: coefficients scaled by 64).
+//  R = (Y - 16) * 1.164              - V * -1.596
+//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
+//  B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B. UB is limited to -128 (see max() below).
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128            + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR            (VR * 128 + YGB)
+
+#if defined(__aarch64__)  // UB and VR are stored negated; YG is times 0x0101.
+const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+  { UG, VG, UG, VG, UG, VG, UG, VG },
+  { UG, VG, UG, VG, UG, VG, UG, VG },
+  { BB, BG, BR, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {  // U and V exchanged.
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+#elif defined(__arm__)  // UB and VR negated; 4 copies of each coefficient.
+const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { BB, BG, BR, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {  // U and V exchanged.
+  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+#else  // One row per constant, read by the #else branch of YuvPixel.
+const YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+const YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {  // U and V exchanged.
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// JPEG YUV to RGB reference (macros below: coefficients scaled by 64).
+// *  R = Y                - V * -1.40200
+// *  G = Y - U *  0.34414 - V *  0.71414
+// *  B = Y - U * -1.77200
+
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGB 32  /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -113 /* round(-1.77200 * 64) */
+#define UG 22 /* round(0.34414 * 64) */
+#define VG 46 /* round(0.71414  * 64) */
+#define VR -90 /* round(-1.40200 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128            + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR            (VR * 128 + YGB)
+
+#if defined(__aarch64__)  // UB and VR are stored negated; YG is times 0x0101.
+const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+  { UG, VG, UG, VG, UG, VG, UG, VG },
+  { UG, VG, UG, VG, UG, VG, UG, VG },
+  { BB, BG, BR, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {  // U and V exchanged.
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+#elif defined(__arm__)  // UB and VR negated; 4 copies of each coefficient.
+const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { BB, BG, BR, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {  // U and V exchanged.
+  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+#else  // One row per constant, read by the #else branch of YuvPixel.
+const YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+const YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {  // U and V exchanged.
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// BT.709 YUV to RGB reference (macros below: coefficients scaled by 64).
+// *  R = Y                - V * -1.28033
+// *  G = Y - U *  0.21482 - V *  0.38059
+// *  B = Y - U * -2.12798
+
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGB 32  /* 64 / 2 */
+
+// TODO(fbarchard): Find way to express 2.12 instead of 2.0.
+// U and V contributions to R,G,B. UB is limited to -128 (see max() below).
+#define UB -128 /* max(-128, round(-2.12798 * 64)) */
+#define UG 14 /* round(0.21482 * 64) */
+#define VG 24 /* round(0.38059  * 64) */
+#define VR -82 /* round(-1.28033 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128            + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR            (VR * 128 + YGB)
+
+#if defined(__aarch64__)  // UB and VR are stored negated; YG is times 0x0101.
+const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
+  { UG, VG, UG, VG, UG, VG, UG, VG },
+  { UG, VG, UG, VG, UG, VG, UG, VG },
+  { BB, BG, BR, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {  // U and V exchanged.
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { VG, UG, VG, UG, VG, UG, VG, UG },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+#elif defined(__arm__)  // UB and VR negated; 4 copies of each coefficient.
+const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { BB, BG, BR, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {  // U and V exchanged.
+  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { BR, BG, BB, 0, 0, 0, 0, 0 },
+  { 0x0101 * YG, 0, 0, 0 }
+};
+#else  // One row per constant, read by the #else branch of YuvPixel.
+const YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+const YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {  // U and V exchanged.
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+#endif
+
+#undef BB
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// C reference code that mimics the YUV assembly.
+// Converts one Y,U,V sample to B,G,R with the constants in yuvconstants.
+static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
+                              uint8* b, uint8* g, uint8* r,
+                              const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__) || defined(__arm__)
+  // ARM tables store the U to B and V to R coefficients negated, and the Y
+  // scale multiplied by 0x0101. The V entries are at index 1 on AArch64 and
+  // at index 4 on 32 bit ARM.
+#if defined(__aarch64__)
+  const int v_index = 1;
+#else
+  const int v_index = 4;
+#endif
+  const int ub = -yuvconstants->kUVToRB[0];
+  const int ug = yuvconstants->kUVToG[0];
+  const int vg = yuvconstants->kUVToG[v_index];
+  const int vr = -yuvconstants->kUVToRB[v_index];
+  const int bb = yuvconstants->kUVBiasBGR[0];
+  const int bg = yuvconstants->kUVBiasBGR[1];
+  const int br = yuvconstants->kUVBiasBGR[2];
+  const int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#else
+  const int ub = yuvconstants->kUVToB[0];
+  const int ug = yuvconstants->kUVToG[0];
+  const int vg = yuvconstants->kUVToG[1];
+  const int vr = yuvconstants->kUVToR[1];
+  const int bb = yuvconstants->kUVBiasB[0];
+  const int bg = yuvconstants->kUVBiasG[0];
+  const int br = yuvconstants->kUVBiasR[0];
+  const int yg = yuvconstants->kYToRgb[0];
+#endif
+  // Widen Y to 16 bits (y * 0x0101), scale by yg and keep bits 16 and up.
+  const uint32 luma = (uint32)(y * 0x0101 * yg) >> 16;
+  *b = Clamp((int32)(luma + bb - u * ub) >> 6);
+  *g = Clamp((int32)(luma + bg - (u * ug + v * vg)) >> 6);
+  *r = Clamp((int32)(luma + br - v * vr) >> 6);
+}
+
+// Y contribution to R,G,B.  Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// C reference code that mimics the YUV assembly.
+// Converts one Y sample to gray: B, G and R all receive the same value.
+static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
+  const uint32 luma = (uint32)(y * 0x0101 * YG) >> 16;
+  const uint8 gray = Clamp((int32)(luma + YGB) >> 6);
+  *b = *g = *r = gray;
+}
+
+#undef YG
+#undef YGB
+
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
+// C mimic assembly: U and V are averaged (rounding up) over each pair of
+// pixels and the average is used for both pixels. Alpha is set to 255.
+// TODO(fbarchard): Remove subsampling from Neon.
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    const uint8 u_avg = (src_u[0] + src_u[1] + 1) >> 1;
+    const uint8 v_avg = (src_v[0] + src_v[1] + 1) >> 1;
+    int k;
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + k * 4;
+      YuvPixel(src_y[k], u_avg, v_avg, px, px + 1, px + 2, yuvconstants);
+      px[3] = 255;
+    }
+    src_y += 2;
+    src_u += 2;
+    src_v += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: the last pixel uses its own U,V without averaging.
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+#else
+// Convert a row of I444 to ARGB, one U,V pair per pixel. Alpha is set to 255.
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    YuvPixel(src_y[x], src_u[x], src_v[x],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    rgb_buf += 4;  // Advance 1 pixel.
+  }
+}
+#endif
+
+// Also used for 420
+// Convert a row of I422 to ARGB. Each U,V sample is shared by a pair of
+// horizontally adjacent pixels; alpha is set to 255.
+void I422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  int k;
+  for (x = 0; x < width - 1; x += 2, ++src_u, ++src_v) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + k * 4;
+      YuvPixel(src_y[k], src_u[0], src_v[0], px, px + 1, px + 2, yuvconstants);
+      px[3] = 255;
+    }
+    src_y += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {  // Odd width: one last pixel.
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Convert a row of I422 with an alpha plane to ARGB. Each U,V sample is
+// shared by a pair of pixels; alpha is copied from src_a.
+void I422AlphaToARGBRow_C(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          const uint8* src_a,
+                          uint8* rgb_buf,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  int x;
+  int k;
+  for (x = 0; x < width - 1; x += 2, ++src_u, ++src_v) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + k * 4;
+      YuvPixel(src_y[k], src_u[0], src_v[0], px, px + 1, px + 2, yuvconstants);
+      px[3] = src_a[k];
+    }
+    src_y += 2;
+    src_a += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {  // Odd width: one last pixel.
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = src_a[0];
+  }
+}
+
+// Convert a row of I422 to RGB24 (3 bytes per pixel, no alpha byte).
+void I422ToRGB24Row_C(const uint8* src_y,
+                      const uint8* src_u,
+                      const uint8* src_v,
+                      uint8* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width) {
+  int x;
+  int k;
+  for (x = 0; x < width - 1; x += 2, ++src_u, ++src_v) {
+    for (k = 0; k < 2; ++k) {  // Both pixels share one U,V sample.
+      uint8* px = rgb_buf + k * 3;
+      YuvPixel(src_y[k], src_u[0], src_v[0], px, px + 1, px + 2, yuvconstants);
+    }
+    src_y += 2;
+    rgb_buf += 6;  // Advance 2 pixels.
+  }
+  if (width & 1) {  // Odd width: one last pixel.
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+  }
+}
+
+// Convert a row of I422 to ARGB4444. Each 16 bit pixel holds the top 4 bits
+// of B in bits 0-3, G in bits 4-7, R in bits 8-11, and alpha 0xf in bits
+// 12-15. Each U,V sample is shared by a pair of pixels, and each pair is
+// written with a single 32 bit store; an odd last pixel is written with a
+// 16 bit store.
+void I422ToARGB4444Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb4444,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  uint8 b;
+  uint8 g;
+  uint8 r;
+  uint32 pixel[2];  // The 16 bit values of the current pair.
+  int k;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    for (k = 0; k < 2; ++k) {
+      // Both pixels of the pair share one U,V sample.
+      YuvPixel(src_y[k], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+      // Keep the top 4 bits of each channel; alpha is 0xf.
+      pixel[k] = (b >> 4) | ((g >> 4) << 4) | ((r >> 4) << 8) | 0xf000;
+    }
+    // Pixel 0 occupies the low 16 bits of the stored 32 bit value.
+    *(uint32*)(dst_argb4444) = pixel[0] | (pixel[1] << 16);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_argb4444 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: the last pixel is written with a 16 bit store.
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    *(uint16*)(dst_argb4444) = (b >> 4) | ((g >> 4) << 4) | ((r >> 4) << 8) |
+        0xf000;
+  }
+}
+
+// Convert a row of I422 to ARGB1555. Each 16 bit pixel holds the top 5 bits
+// of B in bits 0-4, G in bits 5-9, R in bits 10-14, and alpha 1 in bit 15.
+// Each U,V sample is shared by a pair of pixels, and each pair is written
+// with a single 32 bit store; an odd last pixel is written with a 16 bit
+// store.
+void I422ToARGB1555Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb1555,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  uint8 b;
+  uint8 g;
+  uint8 r;
+  uint32 pixel[2];  // The 16 bit values of the current pair.
+  int k;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    for (k = 0; k < 2; ++k) {
+      // Both pixels of the pair share one U,V sample.
+      YuvPixel(src_y[k], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+      // Keep the top 5 bits of each channel; the alpha bit is set.
+      pixel[k] = (b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10) | 0x8000;
+    }
+    // Pixel 0 occupies the low 16 bits of the stored 32 bit value.
+    *(uint32*)(dst_argb1555) = pixel[0] | (pixel[1] << 16);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_argb1555 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: the last pixel is written with a 16 bit store.
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    *(uint16*)(dst_argb1555) = (b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10) |
+        0x8000;
+  }
+}
+
+// Convert a row of I422 to RGB565. Each 16 bit pixel holds the top 5 bits
+// of B in bits 0-4, the top 6 bits of G in bits 5-10 and the top 5 bits of R
+// in bits 11-15. Each pair of pixels shares one U,V sample and is written
+// with one 32 bit store. The pair is combined in uint32, because shifting
+// the second pixel's red into bits 27-31 as a signed int can overflow.
+void I422ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  uint8 b;
+  uint8 g;
+  uint8 r;
+  uint32 pixel[2];  // The 16 bit values of the current pair.
+  int k;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    for (k = 0; k < 2; ++k) {
+      // Both pixels of the pair share one U,V sample.
+      YuvPixel(src_y[k], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+      // At most 0xffff, so this cannot overflow int.
+      pixel[k] = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11);
+    }
+    // Unsigned shift: pixel 1 goes in the high 16 bits without overflow.
+    *(uint32*)(dst_rgb565) = pixel[0] | (pixel[1] << 16);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: the last pixel is written with a 16 bit store.
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    *(uint16*)(dst_rgb565) = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11);
+  }
+}
+
+// Convert a row of I411 to ARGB. Each U,V sample is shared by 4
+// horizontally adjacent pixels; alpha is set to 255.
+// A tail of 1 to 3 pixels (width not a multiple of 4) uses the U,V sample
+// that follows the last complete group.
+void I411ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  int k;
+  for (x = 0; x < width - 3; x += 4) {
+    for (k = 0; k < 4; ++k) {
+      uint8* px = rgb_buf + k * 4;
+      YuvPixel(src_y[k], src_u[0], src_v[0],
+               px + 0, px + 1, px + 2, yuvconstants);
+      px[3] = 255;
+    }
+    src_y += 4;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 16;  // Advance 4 pixels.
+  }
+  // Tail: 2 pixels if bit 1 of width is set, then 1 pixel if bit 0 is set.
+  if (width & 2) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + k * 4;
+      YuvPixel(src_y[k], src_u[0], src_v[0],
+               px + 0, px + 1, px + 2, yuvconstants);
+      px[3] = 255;
+    }
+    src_y += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Convert a row of NV12 to ARGB. src_uv holds interleaved U,V byte pairs
+// (U first); each pair is shared by 2 pixels. Alpha is set to 255.
+void NV12ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_uv,
+                     uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x, k;
+  for (x = 0; x < width - 1; x += 2, src_y += 2, src_uv += 2) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + k * 4;
+      YuvPixel(src_y[k], src_uv[0], src_uv[1],
+               px + 0, px + 1, px + 2, yuvconstants);
+      px[3] = 255;
+    }
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {  // Odd width: one last pixel.
+    YuvPixel(src_y[0], src_uv[0], src_uv[1],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Convert a row of NV21 to ARGB. src_vu holds interleaved V,U byte pairs
+// (V first); each pair is shared by 2 pixels. Alpha is set to 255.
+void NV21ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_vu,
+                     uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x, k;
+  for (x = 0; x < width - 1; x += 2, src_y += 2, src_vu += 2) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + k * 4;
+      YuvPixel(src_y[k], src_vu[1], src_vu[0],
+               px + 0, px + 1, px + 2, yuvconstants);
+      px[3] = 255;
+    }
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {  // Odd width: one last pixel.
+    YuvPixel(src_y[0], src_vu[1], src_vu[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Convert a row of NV12 to RGB565. Each 16 bit pixel holds the top 5 bits
+// of B in bits 0-4, the top 6 bits of G in bits 5-10 and the top 5 bits of R
+// in bits 11-15. Each pair of pixels shares one U,V byte pair and is written
+// with one 32 bit store. The pair is combined in uint32, because shifting
+// the second pixel's red into bits 27-31 as a signed int can overflow.
+void NV12ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_uv,
+                       uint8* dst_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  uint8 b;
+  uint8 g;
+  uint8 r;
+  uint32 pixel[2];  // The 16 bit values of the current pair.
+  int k;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    for (k = 0; k < 2; ++k) {
+      // Both pixels of the pair share src_uv[0] (U) and src_uv[1] (V).
+      YuvPixel(src_y[k], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+      // At most 0xffff, so this cannot overflow int.
+      pixel[k] = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11);
+    }
+    // Unsigned shift: pixel 1 goes in the high 16 bits without overflow.
+    *(uint32*)(dst_rgb565) = pixel[0] | (pixel[1] << 16);
+    src_y += 2;
+    src_uv += 2;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: the last pixel is written with a 16 bit store.
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+    *(uint16*)(dst_rgb565) = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11);
+  }
+}
+
+// Converts a row of packed YUY2 (bytes Y0 U Y1 V) to 4-byte ARGB pixels.
+void YUY2ToARGBRow_C(const uint8* src_yuy2, uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int pairs;
+  for (pairs = width / 2; pairs > 0; --pairs) {
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
+             &rgb_buf[4], &rgb_buf[5], &rgb_buf[6], yuvconstants);
+    rgb_buf[7] = 255;
+    src_yuy2 += 4;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {  // Odd width: one more pixel remains.
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of packed UYVY (bytes U Y0 V Y1) to 4-byte ARGB pixels.
+void UYVYToARGBRow_C(const uint8* src_uyvy, uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int pairs;
+  for (pairs = width / 2; pairs > 0; --pairs) {
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
+             &rgb_buf[4], &rgb_buf[5], &rgb_buf[6], yuvconstants);
+    rgb_buf[7] = 255;
+    src_uyvy += 4;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {  // Odd width: one more pixel remains.
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+             &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of I422 to RGBA: byte 0 of each 4-byte pixel is set to 255.
+void I422ToRGBARow_C(const uint8* src_y, const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int pairs;
+  for (pairs = width / 2; pairs > 0; --pairs) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             &rgb_buf[1], &rgb_buf[2], &rgb_buf[3], yuvconstants);
+    rgb_buf[0] = 255;
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             &rgb_buf[5], &rgb_buf[6], &rgb_buf[7], yuvconstants);
+    rgb_buf[4] = 255;
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {  // Odd width: one more pixel remains.
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             &rgb_buf[1], &rgb_buf[2], &rgb_buf[3], yuvconstants);
+    rgb_buf[0] = 255;
+  }
+}
+
+void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
+  int pairs;  // Number of complete 2-pixel groups left to convert.
+  for (pairs = width / 2; pairs > 0; --pairs) {
+    YPixel(src_y[0], &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+    YPixel(src_y[1], &rgb_buf[4], &rgb_buf[5], &rgb_buf[6]);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {  // Odd width: one more pixel remains.
+    YPixel(src_y[0], &rgb_buf[0], &rgb_buf[1], &rgb_buf[2]);
+    rgb_buf[3] = 255;
+  }
+}
+
+void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+  const uint8* tail = src + (width - 1);  // Last source byte; walks backward.
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst[x] = tail[0];
+    dst[x + 1] = tail[-1];
+    tail -= 2;
+  }
+  if (width & 1) {  // Odd width: the first source byte lands last.
+    dst[width - 1] = tail[0];
+  }
+}
+
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* tail = src_uv + ((width - 1) << 1);  // Last 2-byte UV pair.
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_u[x] = tail[0];
+    dst_u[x + 1] = tail[-2];
+    dst_v[x] = tail[1];
+    dst_v[x + 1] = tail[-1];
+    tail -= 4;  // Step back 2 UV pairs.
+  }
+  if (width & 1) {
+    dst_u[width - 1] = tail[0];
+    dst_v[width - 1] = tail[1];
+  }
+}
+
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
+  // Each 4-byte pixel is moved as one uint32, so channel order is preserved.
+  uint32* out = (uint32*)(dst);
+  const uint32* in = (const uint32*)(src) + (width - 1);
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    out[x] = in[0];
+    out[x + 1] = in[-1];
+    in -= 2;
+  }
+  if (width & 1) {
+    out[width - 1] = in[0];
+  }
+}
+
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  // Deinterleave two 2-byte UV pairs per pass.
+  for (x = 0; x < width - 1; x += 2, src_uv += 4) {
+    dst_u[x] = src_uv[0];
+    dst_u[x + 1] = src_uv[2];
+    dst_v[x] = src_uv[1];
+    dst_v[x + 1] = src_uv[3];
+  }
+  if (width & 1) {
+    dst_u[width - 1] = src_uv[0];
+    dst_v[width - 1] = src_uv[1];
+  }
+}
+
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) {
+  int x;
+  // Interleave two U bytes and two V bytes per pass.
+  for (x = 0; x < width - 1; x += 2, dst_uv += 4) {
+    dst_uv[0] = src_u[x];
+    dst_uv[1] = src_v[x];
+    dst_uv[2] = src_u[x + 1];
+    dst_uv[3] = src_v[x + 1];
+  }
+  if (width & 1) {
+    dst_uv[0] = src_u[width - 1];
+    dst_uv[1] = src_v[width - 1];
+  }
+}
+
+void CopyRow_C(const uint8* src, uint8* dst, int count) {
+  memcpy(dst, src, count);  // count is used directly as the byte count.
+}
+
+void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
+  memcpy(dst, src, count * 2);  // count 16-bit elements, 2 bytes each.
+}
+
+void SetRow_C(uint8* dst, uint8 v8, int width) {
+  memset(dst, v8, width);  // Fill width bytes with the value v8.
+}
+
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
+  uint32* pixel = (uint32*)(dst_argb);  // One uint32 store per pixel.
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    *pixel++ = v32;
+  }
+}
+
+// Average the chroma of 2 YUY2 rows (422) into one U and one V row (420).
+void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
+                   uint8* dst_u, uint8* dst_v, int width) {
+  // The second row starts src_stride_yuy2 bytes after the first.
+  const uint8* next_row = src_yuy2 + src_stride_yuy2;
+  int x;
+  for (x = 0; x < width; x += 2) {
+    *dst_u++ = (src_yuy2[1] + next_row[1] + 1) >> 1;
+    *dst_v++ = (src_yuy2[3] + next_row[3] + 1) >> 1;
+    src_yuy2 += 4;
+    next_row += 4;
+  }
+}
+
+// Extract the U and V bytes of one YUY2 row (422) into U and V rows (422).
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  // Every 4 source bytes (2 pixels) yield one U byte and one V byte.
+  int x;
+  for (x = 0; x < width; x += 2) {
+    *dst_u = src_yuy2[1];
+    *dst_v = src_yuy2[3];
+    src_yuy2 += 4;
+    ++dst_u;
+    ++dst_v;
+  }
+}
+
+// Extract the Y bytes of one YUY2 row (422) into a Y row (420/422).
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
+  int x;
+  // Y bytes sit at offsets 0 and 2 of each 4-byte, 2-pixel group.
+  for (x = 0; x < width - 1; x += 2, src_yuy2 += 4) {
+    dst_y[x] = src_yuy2[0];
+    dst_y[x + 1] = src_yuy2[2];
+  }
+  if (width & 1) {
+    // Odd width: src_yuy2 now points at the group holding the last Y.
+    dst_y[width - 1] = src_yuy2[0];
+  }
+}
+
+// Average the chroma of 2 UYVY rows (422) into one U and one V row (420).
+void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
+                   uint8* dst_u, uint8* dst_v, int width) {
+  // The second row starts src_stride_uyvy bytes after the first.
+  const uint8* next_row = src_uyvy + src_stride_uyvy;
+  int x;
+  for (x = 0; x < width; x += 2) {
+    *dst_u++ = (src_uyvy[0] + next_row[0] + 1) >> 1;
+    *dst_v++ = (src_uyvy[2] + next_row[2] + 1) >> 1;
+    src_uyvy += 4;
+    next_row += 4;
+  }
+}
+
+// Extract the U and V bytes of one UYVY row (422) into U and V rows (422).
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  // Every 4 source bytes (2 pixels) yield one U byte and one V byte.
+  int x;
+  for (x = 0; x < width; x += 2) {
+    *dst_u = src_uyvy[0];
+    *dst_v = src_uyvy[2];
+    src_uyvy += 4;
+    ++dst_u;
+    ++dst_v;
+  }
+}
+
+// Extract the Y bytes of one UYVY row (422) into a Y row (420/422).
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+  int x;
+  // Y bytes sit at offsets 1 and 3 of each 4-byte, 2-pixel group.
+  for (x = 0; x < width - 1; x += 2, src_uyvy += 4) {
+    dst_y[x] = src_uyvy[1];
+    dst_y[x + 1] = src_uyvy[3];
+  }
+  if (width & 1) {
+    // Odd width: src_uyvy now points at the group holding the last Y.
+    dst_y[width - 1] = src_uyvy[1];
+  }
+}
+
+#define BLEND(f, b, a) ((((256 - (a)) * (b)) >> 8) + (f))
+
+// Blend src_argb0 over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+// This code mimics the SSSE3 version for better testability.
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                    uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint32 fb = src_argb0[0];  // Foreground bytes 0..2 and its alpha.
+    uint32 fg = src_argb0[1];
+    uint32 fr = src_argb0[2];
+    uint32 a = src_argb0[3];
+    uint32 bb = src_argb1[0];  // Background bytes 0..2; its alpha is unused.
+    uint32 bg = src_argb1[1];
+    uint32 br = src_argb1[2];
+    dst_argb[0] = BLEND(fb, bb, a);  // Stored unclamped: low 8 bits are kept.
+    dst_argb[1] = BLEND(fg, bg, a);
+    dst_argb[2] = BLEND(fr, br, a);
+    dst_argb[3] = 255u;  // Output alpha is always opaque.
+
+    fb = src_argb0[4 + 0];  // Second pixel of the pair.
+    fg = src_argb0[4 + 1];
+    fr = src_argb0[4 + 2];
+    a = src_argb0[4 + 3];
+    bb = src_argb1[4 + 0];
+    bg = src_argb1[4 + 1];
+    br = src_argb1[4 + 2];
+    dst_argb[4 + 0] = BLEND(fb, bb, a);
+    dst_argb[4 + 1] = BLEND(fg, bg, a);
+    dst_argb[4 + 2] = BLEND(fr, br, a);
+    dst_argb[4 + 3] = 255u;
+    src_argb0 += 8;
+    src_argb1 += 8;
+    dst_argb += 8;
+  }
+
+  if (width & 1) {  // Odd width: blend the one remaining pixel.
+    uint32 fb = src_argb0[0];
+    uint32 fg = src_argb0[1];
+    uint32 fr = src_argb0[2];
+    uint32 a = src_argb0[3];
+    uint32 bb = src_argb1[0];
+    uint32 bg = src_argb1[1];
+    uint32 br = src_argb1[2];
+    dst_argb[0] = BLEND(fb, bb, a);
+    dst_argb[1] = BLEND(fg, bg, a);
+    dst_argb[2] = BLEND(fr, br, a);
+    dst_argb[3] = 255u;
+  }
+}
+#undef BLEND
+
+#define UBLEND(f, b, a) ((((a) * (f)) + ((255 - (a)) * (b)) + 255) >> 8)
+void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
+                     const uint8* alpha, uint8* dst, int width) {
+  int x;  // Each byte: (alpha * src0 + (255 - alpha) * src1 + 255) >> 8.
+  for (x = 0; x < width - 1; x += 2) {
+    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
+    dst[1] = UBLEND(src0[1], src1[1], alpha[1]);
+    src0 += 2;
+    src1 += 2;
+    alpha += 2;
+    dst += 2;
+  }
+  if (width & 1) {  // Odd width: blend the one remaining byte.
+    dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
+  }
+}
+#undef UBLEND
+
+#define ATTENUATE(f, a) ((((a) | ((a) << 8)) * ((f) | ((f) << 8))) >> 24)
+
+// Multiply source RGB by alpha and store to destination.
+// This code mimics the SSSE3 version for better testability.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  int i;  // ATTENUATE widens both 8-bit inputs to 16 bits (v | v << 8).
+  for (i = 0; i < width - 1; i += 2) {
+    uint32 b = src_argb[0];
+    uint32 g = src_argb[1];
+    uint32 r = src_argb[2];
+    uint32 a = src_argb[3];
+    dst_argb[0] = ATTENUATE(b, a);
+    dst_argb[1] = ATTENUATE(g, a);
+    dst_argb[2] = ATTENUATE(r, a);
+    dst_argb[3] = a;  // Alpha itself is copied unchanged.
+    b = src_argb[4];  // Second pixel of the pair.
+    g = src_argb[5];
+    r = src_argb[6];
+    a = src_argb[7];
+    dst_argb[4] = ATTENUATE(b, a);
+    dst_argb[5] = ATTENUATE(g, a);
+    dst_argb[6] = ATTENUATE(r, a);
+    dst_argb[7] = a;
+    src_argb += 8;
+    dst_argb += 8;
+  }
+
+  if (width & 1) {  // Odd width: attenuate the one remaining pixel.
+    const uint32 b = src_argb[0];
+    const uint32 g = src_argb[1];
+    const uint32 r = src_argb[2];
+    const uint32 a = src_argb[3];
+    dst_argb[0] = ATTENUATE(b, a);
+    dst_argb[1] = ATTENUATE(g, a);
+    dst_argb[2] = ATTENUATE(r, a);
+    dst_argb[3] = a;
+  }
+}
+#undef ATTENUATE
+
+// Divide source RGB by alpha and store to destination.
+// b = (b * 255 + (a / 2)) / a;
+// g = (g * 255 + (a / 2)) / a;
+// r = (r * 255 + (a / 2)) / a;
+// Reciprocal method is off by 1 on some values, e.g. 125.
+// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
+#define T(a) (0x01000000 + (0x10000 / (a)))
+const uint32 fixed_invtbl8[256] = {
+  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
+  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
+  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
+  T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
+  T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
+  T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
+  T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
+  T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
+  T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
+  T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
+  T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
+  T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
+  T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
+  T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
+  T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
+  T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
+  T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
+  T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
+  T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
+  T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
+  T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
+  T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
+  T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
+  T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
+  T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
+  T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
+  T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
+  T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
+  T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
+  T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
+  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
+#undef T
+
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  int i;
+  // Scale bytes 0..2 of every pixel by the table reciprocal of its alpha.
+  for (i = 0; i < width; ++i) {
+    const uint32 alpha = src_argb[3];
+    // Low 16 bits of the table entry: 1 / alpha in 8.8 fixed point.
+    const uint32 inv_alpha = fixed_invtbl8[alpha] & 0xffff;
+    // All three channels are read before any store.
+    const uint32 blue = (src_argb[0] * inv_alpha) >> 8;
+    const uint32 green = (src_argb[1] * inv_alpha) >> 8;
+    const uint32 red = (src_argb[2] * inv_alpha) >> 8;
+    // Clamping should not be necessary but is free in assembly.
+    dst_argb[0] = clamp255(blue);
+    dst_argb[1] = clamp255(green);
+    dst_argb[2] = clamp255(red);
+    dst_argb[3] = alpha;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+                               const int32* previous_cumsum, int width) {
+  // Running totals for this row, one per byte position of a 4-byte pixel.
+  int32 row_sum[4] = {0, 0, 0, 0};
+  int x;
+  for (x = 0; x < width; ++x) {
+    int c;
+    for (c = 0; c < 4; ++c) {
+      row_sum[c] += row[x * 4 + c];
+    }
+    for (c = 0; c < 4; ++c) {
+      cumsum[x * 4 + c] = row_sum[c] + previous_cumsum[x * 4 + c];
+    }
+  }
+}
+
+void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
+                                int w, int area, uint8* dst, int count) {
+  const float ooa = 1.0f / area;  // Reciprocal of area, computed once.
+  int i;
+  for (i = 0; i < count; ++i, dst += 4, tl += 4, bl += 4) {
+    int c;
+    // Per channel: bl[w + c] + tl[c] - bl[c] - tl[w + c], an integer sum
+    // that is then scaled by 1 / area in float and truncated to uint8.
+    // NOTE(review): tl/bl look like top/bottom cumulative-sum rows - confirm.
+    for (c = 0; c < 4; ++c) {
+      dst[c] = (uint8)((bl[w + c] + tl[c] - bl[c] - tl[w + c]) * ooa);
+    }
+  }
+}
+
+// Copy pixels from rotated source to destination row with a slope.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+                     uint8* dst_argb, const float* uv_dudv, int width) {
+  int i;
+  // Current source coordinate; starts at (uv_dudv[0], uv_dudv[1]).
+  float uv[2];
+  uv[0] = uv_dudv[0];
+  uv[1] = uv_dudv[1];
+  for (i = 0; i < width; ++i) {
+    const int x = (int)(uv[0]);
+    const int y = (int)(uv[1]);
+    const uint8* src_pixel = src_argb + y * src_argb_stride + x * 4;
+    // One 32-bit load and store moves the whole 4-byte pixel.
+    *(uint32*)(dst_argb) = *(const uint32*)(src_pixel);
+    dst_argb += 4;
+    uv[0] += uv_dudv[2];
+    uv[1] += uv_dudv[3];
+  }
+}
+
+// Average 2 rows (src_uv and the row src_uv_stride bytes later) into 1.
+static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+                      uint8* dst_uv, int width) {
+  const uint8* next_row = src_uv + src_uv_stride;
+  for (; width > 0; --width) {
+    *dst_uv++ = (*src_uv++ + *next_row++ + 1) >> 1;
+  }
+}
+
+static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
+                         uint16* dst_uv, int width) {
+  const uint16* next_row = src_uv + src_uv_stride;  // Stride in elements.
+  for (; width > 0; --width) {
+    *dst_uv++ = (*src_uv++ + *next_row++ + 1) >> 1;
+  }
+}
+
+// C version 2x2 -> 2x1.
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+                      ptrdiff_t src_stride,
+                      int width, int source_y_fraction) {
+  const int row1_weight = source_y_fraction;  // Out of 256, second row.
+  const int row0_weight = 256 - row1_weight;  // Out of 256, first row.
+  const uint8* src_next = src_ptr + src_stride;
+  int pairs;
+  if (row1_weight == 0) {  // First row only: plain copy.
+    memcpy(dst_ptr, src_ptr, width);
+    return;
+  }
+  if (row1_weight == 128) {  // Equal weights: use the 2-row average.
+    HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
+    return;
+  }
+  for (pairs = width / 2; pairs > 0; --pairs) {
+    dst_ptr[0] =
+        (src_ptr[0] * row0_weight + src_next[0] * row1_weight + 128) >> 8;
+    dst_ptr[1] =
+        (src_ptr[1] * row0_weight + src_next[1] * row1_weight + 128) >> 8;
+    src_ptr += 2;
+    src_next += 2;
+    dst_ptr += 2;
+  }
+  if (width & 1) {
+    dst_ptr[0] =
+        (src_ptr[0] * row0_weight + src_next[0] * row1_weight + 128) >> 8;
+  }
+}
+
+void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                         ptrdiff_t src_stride,
+                         int width, int source_y_fraction) {
+  const int w1 = source_y_fraction;  // Weight (of 256) of the second row.
+  const int w0 = 256 - w1;           // Weight (of 256) of the first row.
+  const uint16* src_next = src_ptr + src_stride;  // Stride in elements.
+  int pairs;
+  if (source_y_fraction == 0) {
+    memcpy(dst_ptr, src_ptr, width * 2);  // 2 bytes per uint16 sample.
+    return;
+  }
+  if (source_y_fraction == 128) {
+    HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width);
+    return;
+  }
+  for (pairs = width / 2; pairs > 0; --pairs) {
+    dst_ptr[0] = (src_ptr[0] * w0 + src_next[0] * w1) >> 8;  // No rounding.
+    dst_ptr[1] = (src_ptr[1] * w0 + src_next[1] * w1) >> 8;
+    src_ptr += 2;
+    src_next += 2;
+    dst_ptr += 2;
+  }
+  if (width & 1) {
+    dst_ptr[0] = (src_ptr[0] * w0 + src_next[0] * w1) >> 8;
+  }
+}
+
+// Use first 4 shuffler values to reorder ARGB channels.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+                      const uint8* shuffler, int width) {
+  const int from0 = shuffler[0];
+  const int from1 = shuffler[1];
+  const int from2 = shuffler[2];
+  const int from3 = shuffler[3];
+  // fromN is the source byte offset that feeds destination byte N.
+  int x;
+  for (x = 0; x < width; ++x) {
+    // Load all 4 bytes before storing any, to support in-place conversion.
+    const uint8 out0 = src_argb[from0];
+    const uint8 out1 = src_argb[from1];
+    const uint8 out2 = src_argb[from2];
+    const uint8 out3 = src_argb[from3];
+    dst_argb[0] = out0;
+    dst_argb[1] = out1;
+    dst_argb[2] = out2;
+    dst_argb[3] = out3;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+// Pack planar Y, U, V into YUY2: bytes Y0 U Y1 V for every 2 pixels.
+void I422ToYUY2Row_C(const uint8* src_y, const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_frame, int width) {
+  int pairs;
+  for (pairs = width / 2; pairs > 0; --pairs) {
+    dst_frame[0] = src_y[0];
+    dst_frame[1] = src_u[0];
+    dst_frame[2] = src_y[1];
+    dst_frame[3] = src_v[0];
+    dst_frame += 4;
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+  }
+  if (width & 1) {  // Odd width: the absent second Y is written as 0.
+    dst_frame[0] = src_y[0];
+    dst_frame[1] = src_u[0];
+    dst_frame[2] = 0;
+    dst_frame[3] = src_v[0];
+  }
+}
+
+// Pack planar Y, U, V into UYVY: bytes U Y0 V Y1 for every 2 pixels.
+void I422ToUYVYRow_C(const uint8* src_y, const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_frame, int width) {
+  int pairs;
+  for (pairs = width / 2; pairs > 0; --pairs) {
+    dst_frame[0] = src_u[0];
+    dst_frame[1] = src_y[0];
+    dst_frame[2] = src_v[0];
+    dst_frame[3] = src_y[1];
+    dst_frame += 4;
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+  }
+  if (width & 1) {  // Odd width: the absent second Y is written as 0.
+    dst_frame[0] = src_u[0];
+    dst_frame[1] = src_y[0];
+    dst_frame[2] = src_v[0];
+    dst_frame[3] = 0;
+  }
+}
+
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+                         uint8* dst_argb,
+                         const float* poly,  // 16 coefficients, 4 per power.
+                         int width) {
+  int i;  // Each byte of a 4-byte pixel gets its own cubic polynomial.
+  for (i = 0; i < width; ++i) {
+    float b = (float)(src_argb[0]);
+    float g = (float)(src_argb[1]);
+    float r = (float)(src_argb[2]);
+    float a = (float)(src_argb[3]);
+    float b2 = b * b;  // Squares of the inputs.
+    float g2 = g * g;
+    float r2 = r * r;
+    float a2 = a * a;
+    float db = poly[0] + poly[4] * b;  // poly[0..3] constant, [4..7] linear.
+    float dg = poly[1] + poly[5] * g;
+    float dr = poly[2] + poly[6] * r;
+    float da = poly[3] + poly[7] * a;
+    float b3 = b2 * b;  // Cubes of the inputs.
+    float g3 = g2 * g;
+    float r3 = r2 * r;
+    float a3 = a2 * a;
+    db += poly[8] * b2;  // poly[8..11] scale the squares.
+    dg += poly[9] * g2;
+    dr += poly[10] * r2;
+    da += poly[11] * a2;
+    db += poly[12] * b3;  // poly[12..15] scale the cubes.
+    dg += poly[13] * g3;
+    dr += poly[14] * r3;
+    da += poly[15] * a3;
+
+    dst_argb[0] = Clamp((int32)(db));  // Float result is truncated to int32.
+    dst_argb[1] = Clamp((int32)(dg));
+    dst_argb[2] = Clamp((int32)(dr));
+    dst_argb[3] = Clamp((int32)(da));
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                             const uint8* luma, uint32 lumacoeff) {
+  const uint32 blue_w = lumacoeff & 0xff;          // Bits 0-7.
+  const uint32 green_w = (lumacoeff >> 8) & 0xff;  // Bits 8-15.
+  const uint32 red_w = (lumacoeff >> 16) & 0xff;   // Bits 16-23.
+
+  int pairs;
+  for (pairs = width / 2; pairs > 0; --pairs) {
+    // Luminance in rows, color values in columns.
+    const uint8* row0 = luma + ((src_argb[0] * blue_w + src_argb[1] * green_w +
+                                 src_argb[2] * red_w) & 0x7F00u);
+    const uint8* row1;
+    dst_argb[0] = row0[src_argb[0]];
+    dst_argb[1] = row0[src_argb[1]];
+    dst_argb[2] = row0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+    row1 = luma + ((src_argb[4] * blue_w + src_argb[5] * green_w +
+                    src_argb[6] * red_w) & 0x7F00u);
+    dst_argb[4] = row1[src_argb[4]];
+    dst_argb[5] = row1[src_argb[5]];
+    dst_argb[6] = row1[src_argb[6]];
+    dst_argb[7] = src_argb[7];
+    src_argb += 8;
+    dst_argb += 8;
+  }
+  if (width & 1) {
+    // Luminance in rows, color values in columns.
+    const uint8* row0 = luma + ((src_argb[0] * blue_w + src_argb[1] * green_w +
+                                 src_argb[2] * red_w) & 0x7F00u);
+    dst_argb[0] = row0[src_argb[0]];
+    dst_argb[1] = row0[src_argb[1]];
+    dst_argb[2] = row0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+  }
+}
+
+void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int pairs;  // Complete 2-pixel groups still to process.
+  for (pairs = width / 2; pairs > 0; --pairs) {
+    dst[3] = src[3];  // Only byte 3 of each 4-byte pixel is copied.
+    dst[7] = src[7];
+    src += 8;
+    dst += 8;
+  }
+  if (width & 1) {
+    dst[3] = src[3];
+  }
+}
+
+void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int pairs;  // Complete 2-pixel groups still to process.
+  for (pairs = width / 2; pairs > 0; --pairs) {
+    dst[3] = src[0];  // One source byte fills byte 3 of a 4-byte pixel.
+    dst[7] = src[1];
+    src += 2;
+    dst += 8;
+  }
+  if (width & 1) {
+    dst[3] = src[0];
+  }
+}
+
+// Maximum temporary width for wrappers to process at a time, in pixels.
+#define MAXTWIDTH 2048
+
+#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
+    defined(HAS_I422TORGB565ROW_SSSE3)
+// row_win.cc has asm version, but GCC uses 2 step wrapper.
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
+  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);  // Intermediate ARGB chunk.
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
+    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;
+    src_v += chunk / 2;
+    dst_rgb565 += chunk * 2;
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb1555,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  // Two steps per chunk of at most MAXTWIDTH pixels: to ARGB, then repack.
+  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
+    ARGBToARGB1555Row_SSE2(argb_tmp, dst_argb1555, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;
+    src_v += chunk / 2;
+    dst_argb1555 += chunk * 2;
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb4444,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  // Two steps per chunk of at most MAXTWIDTH pixels: to ARGB, then repack.
+  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
+    ARGBToARGB4444Row_SSE2(argb_tmp, dst_argb4444, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;
+    src_v += chunk / 2;
+    dst_argb4444 += chunk * 2;
+  }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_uv,
+                           uint8* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
+  // Two steps per chunk of at most MAXTWIDTH pixels: to ARGB, then repack.
+  SIMD_ALIGNED(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV12ToARGBRow_SSSE3(src_y, src_uv, argb_tmp, yuvconstants, chunk);
+    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
+    src_y += chunk;
+    src_uv += chunk;
+    dst_rgb565 += chunk * 2;
+  }
+}
+#endif
+
+#if defined(HAS_I422TORGB565ROW_AVX2)
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);  // Intermediate ARGB chunk.
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+    ARGBToRGB565Row_AVX2(argb_tmp, dst_rgb565, chunk);
+#else
+    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
+#endif
+    src_y += chunk;
+    src_u += chunk / 2;
+    src_v += chunk / 2;
+    dst_rgb565 += chunk * 2;
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  // Two steps per chunk of at most MAXTWIDTH pixels: to ARGB, then repack.
+  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+    ARGBToARGB1555Row_AVX2(argb_tmp, dst_argb1555, chunk);
+#else
+    ARGBToARGB1555Row_SSE2(argb_tmp, dst_argb1555, chunk);
+#endif
+    src_y += chunk;
+    src_u += chunk / 2;
+    src_v += chunk / 2;
+    dst_argb1555 += chunk * 2;
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  // Two steps per chunk of at most MAXTWIDTH pixels: to ARGB, then repack.
+  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+    ARGBToARGB4444Row_AVX2(argb_tmp, dst_argb4444, chunk);
+#else
+    ARGBToARGB4444Row_SSE2(argb_tmp, dst_argb4444, chunk);
+#endif
+    src_y += chunk;
+    src_u += chunk / 2;
+    src_v += chunk / 2;
+    dst_argb4444 += chunk * 2;
+  }
+}
+#endif
+
+#if defined(HAS_I422TORGB24ROW_AVX2)
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  // Two steps per chunk of at most MAXTWIDTH pixels: to ARGB, then repack.
+  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, argb_tmp, yuvconstants, chunk);
+    // TODO(fbarchard): ARGBToRGB24Row_AVX2
+    ARGBToRGB24Row_SSSE3(argb_tmp, dst_rgb24, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;
+    src_v += chunk / 2;
+    dst_rgb24 += chunk * 3;  // 3 output bytes per pixel.
+  }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+void NV12ToRGB565Row_AVX2(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Two steps per chunk of at most MAXTWIDTH pixels: to ARGB, then repack.
+  SIMD_ALIGNED32(uint8 argb_tmp[MAXTWIDTH * 4]);
+  int chunk;
+  for (; width > 0; width -= chunk) {
+    chunk = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV12ToARGBRow_AVX2(src_y, src_uv, argb_tmp, yuvconstants, chunk);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+    ARGBToRGB565Row_AVX2(argb_tmp, dst_rgb565, chunk);
+#else
+    ARGBToRGB565Row_SSE2(argb_tmp, dst_rgb565, chunk);
+#endif
+    src_y += chunk;
+    src_uv += chunk;
+    dst_rgb565 += chunk * 2;
+  }
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/row_gcc.cc b/libs/libyuv/source/row_gcc.cc
new file mode 100644
index 0000000000..d5174516e7
--- /dev/null
+++ b/libs/libyuv/source/row_gcc.cc
@@ -0,0 +1,5507 @@
+// VERSION 2
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// ARGB -> Y weights: one signed byte per pixel byte, repeated for 4 pixels.
+static const vec8 kARGBToY = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+// JPEG full range ARGB -> Y weights; same layout as kARGBToY.
+static const vec8 kARGBToYJ = {
+  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+static const vec8 kARGBToU = {
+  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static const vec8 kARGBToUJ = {
+  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+static const vec8 kARGBToV = {
+  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
+};
+
+static const vec8 kARGBToVJ = {
+  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// Constants for BGRA (kARGBTo* weights in reversed per-pixel byte order).
+static const vec8 kBGRAToY = {
+  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+static const vec8 kBGRAToU = {
+  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+static const vec8 kBGRAToV = {
+  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR (kARGBTo* weights with bytes 0 and 2 swapped).
+static const vec8 kABGRToY = {
+  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+static const vec8 kABGRToU = {
+  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+static const vec8 kABGRToV = {
+  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA (kARGBTo* weights shifted up one byte, zero first).
+static const vec8 kRGBAToY = {
+  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static const vec8 kRGBAToU = {
+  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static const vec8 kRGBAToV = {
+  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static const uvec8 kAddY16 = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+// 7 bit fixed point 0.5 (rounding term added before a shift right by 7).
+static const vec16 kAddYJ64 = {
+  64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static const uvec8 kAddUV128 = {
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+static const uvec16 kAddUVJ128 = {
+  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+
+// Shuffle table for converting RGB24 to ARGB (4th byte is ORed to 0xff later).
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB (as above, bytes 0 and 2 swapped).
+static const uvec8 kShuffleMaskRAWToARGB = {
+  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting RAW to RGB24.  First 8.
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting RAW to RGB24.  Middle 8.
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting RAW to RGB24.  Last 8.
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RGB24 (128 zeroes the output byte).
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW (128 zeroes the output byte).
+static const uvec8 kShuffleMaskARGBToRAW = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// YUY2 shuf 16 Y to 32 Y.
+static const lvec8 kShuffleYUY2Y = {
+  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
+  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
+};
+
+// YUY2 shuf 8 UV to 16 UV.
+static const lvec8 kShuffleYUY2UV = {
+  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
+  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
+};
+
+// UYVY shuf 16 Y to 32 Y.
+static const lvec8 kShuffleUYVYY = {
+  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
+  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
+};
+
+// UYVY shuf 8 UV to 16 UV.
+static const lvec8 kShuffleUYVYUV = {
+  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
+  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
+};
+
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6
+};
+#endif  // HAS_RGB24TOARGBROW_SSSE3
+
+#ifdef HAS_J400TOARGBROW_SSE2
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
+  asm volatile (  // Each source byte fills 3 bytes of a pixel; 4th is 0xff.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
+    "pslld     $0x18,%%xmm5                    \n"  // xmm5 = 0xff000000 mask
+    LABELALIGN
+  "1:                                          \n"  // 8 bytes in, 32 bytes out
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // every byte doubled
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm0,%%xmm0                   \n"  // bytes 0-3, each x4
+    "punpckhwd %%xmm1,%%xmm1                   \n"  // bytes 4-7, each x4
+    "por       %%xmm5,%%xmm0                   \n"  // top byte of dword = 0xff
+    "por       %%xmm5,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"  // repeat while width > 0
+  : "+r"(src_y),     // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_J400TOARGBROW_SSE2
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  asm volatile (  // Widens 3-byte pixels to 4 bytes, setting the 4th to 0xff.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
+    "pslld     $0x18,%%xmm5                    \n"
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = shuffle table
+    LABELALIGN
+  "1:                                          \n"  // 48 bytes in, 64 bytes out
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x30,0) ",%0           \n"
+    "movdqa    %%xmm3,%%xmm2                   \n"
+    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // xmm2 = source bytes 24-39
+    "pshufb    %%xmm4,%%xmm2                   \n"
+    "por       %%xmm5,%%xmm2                   \n"
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // xmm1 = source bytes 12-27
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "por       %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm4,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // rotate: bytes 36-47 first
+    "pshufb    %%xmm4,%%xmm3                   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
+    "jg        1b                              \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  : "m"(kShuffleMaskRGB24ToARGB)  // %3
+  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
+  asm volatile (  // Same flow as RGB24ToARGBRow_SSSE3 with the RAW table.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
+    "pslld     $0x18,%%xmm5                    \n"
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = shuffle table
+    LABELALIGN
+  "1:                                          \n"  // 48 bytes in, 64 bytes out
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x30,0) ",%0           \n"
+    "movdqa    %%xmm3,%%xmm2                   \n"
+    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // xmm2 = source bytes 24-39
+    "pshufb    %%xmm4,%%xmm2                   \n"
+    "por       %%xmm5,%%xmm2                   \n"
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // xmm1 = source bytes 12-27
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "por       %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm4,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // rotate: bytes 36-47 first
+    "pshufb    %%xmm4,%%xmm3                   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
+    "jg        1b                              \n"
+  : "+r"(src_raw),   // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  : "m"(kShuffleMaskRAWToARGB)  // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
+  asm volatile (  // Swaps bytes 0 and 2 of every 3-byte pixel.
+   "movdqa     %3,%%xmm3                       \n"  // table for output 0-7
+   "movdqa     %4,%%xmm4                       \n"  // table for output 8-15
+   "movdqa     %5,%%xmm5                       \n"  // table for output 16-23
+    LABELALIGN
+  "1:                                          \n"  // 24 bytes in, 24 bytes out
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x4,0) ",%%xmm1    \n"  // overlapping 16-byte loads
+    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm2    \n"
+    "lea       " MEMLEA(0x18,0) ",%0           \n"
+    "pshufb    %%xmm3,%%xmm0                   \n"
+    "pshufb    %%xmm4,%%xmm1                   \n"
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // only low 8 bytes stored
+    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
+    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x18,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"  // 8 pixels per iteration
+    "jg        1b                              \n"
+  : "+r"(src_raw),    // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(width)       // %2
+  : "m"(kShuffleMaskRAWToRGB24_0),  // %3
+    "m"(kShuffleMaskRAWToRGB24_1),  // %4
+    "m"(kShuffleMaskRAWToRGB24_2)   // %5
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Expands 16-bit 5:6:5 pixels to 4 bytes each, 4th = 0xff.
+    "mov       $0x1080108,%%eax                \n"
+    "movd      %%eax,%%xmm5                    \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // xmm5 = 0x0108 per word
+    "mov       $0x20802080,%%eax               \n"
+    "movd      %%eax,%%xmm6                    \n"
+    "pshufd    $0x0,%%xmm6,%%xmm6              \n"  // xmm6 = 0x2080 per word
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "psllw     $0xb,%%xmm3                     \n"  // xmm3 = 0xf800 per word
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psllw     $0xa,%%xmm4                     \n"
+    "psrlw     $0x5,%%xmm4                     \n"  // xmm4 = 0x07e0 per word
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psllw     $0x8,%%xmm7                     \n"  // xmm7 = 0xff00 per word
+    "sub       %0,%1                           \n"
+    "sub       %0,%1                           \n"  // %1 = dst - 2 * src
+    LABELALIGN
+  "1:                                          \n"  // 16 bytes in, 32 bytes out
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm3,%%xmm1                   \n"  // keep bits 11-15
+    "psllw     $0xb,%%xmm2                     \n"  // move bits 0-4 to the top
+    "pmulhuw   %%xmm5,%%xmm1                   \n"  // scale 5 bits to 8 bits
+    "pmulhuw   %%xmm5,%%xmm2                   \n"
+    "psllw     $0x8,%%xmm1                     \n"
+    "por       %%xmm2,%%xmm1                   \n"  // word = output bytes 2, 0
+    "pand      %%xmm4,%%xmm0                   \n"  // keep bits 5-10
+    "pmulhuw   %%xmm6,%%xmm0                   \n"  // scale 6 bits to 8 bits
+    "por       %%xmm7,%%xmm0                   \n"  // word = 0xff, output byte 1
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave into 4-byte px
+    "punpckhbw %%xmm0,%%xmm2                   \n"
+    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
+    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "sub       $0x8,%2                         \n"  // 8 pixels per iteration
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  :
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Expands 16-bit 1:5:5:5 pixels to 4 bytes each.
+    "mov       $0x1080108,%%eax                \n"
+    "movd      %%eax,%%xmm5                    \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // xmm5 = 0x0108 per word
+    "mov       $0x42004200,%%eax               \n"
+    "movd      %%eax,%%xmm6                    \n"
+    "pshufd    $0x0,%%xmm6,%%xmm6              \n"  // xmm6 = 0x4200 per word
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "psllw     $0xb,%%xmm3                     \n"  // xmm3 = 0xf800 per word
+    "movdqa    %%xmm3,%%xmm4                   \n"
+    "psrlw     $0x6,%%xmm4                     \n"  // xmm4 = 0x03e0 per word
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psllw     $0x8,%%xmm7                     \n"  // xmm7 = 0xff00 per word
+    "sub       %0,%1                           \n"
+    "sub       %0,%1                           \n"  // %1 = dst - 2 * src
+    LABELALIGN
+  "1:                                          \n"  // 16 bytes in, 32 bytes out
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psllw     $0x1,%%xmm1                     \n"  // move bits 10-14 to the top
+    "psllw     $0xb,%%xmm2                     \n"  // move bits 0-4 to the top
+    "pand      %%xmm3,%%xmm1                   \n"
+    "pmulhuw   %%xmm5,%%xmm2                   \n"  // scale 5 bits to 8 bits
+    "pmulhuw   %%xmm5,%%xmm1                   \n"
+    "psllw     $0x8,%%xmm1                     \n"
+    "por       %%xmm2,%%xmm1                   \n"  // word = output bytes 2, 0
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm4,%%xmm0                   \n"  // keep bits 5-9
+    "psraw     $0x8,%%xmm2                     \n"  // smear bit 15 over high byte
+    "pmulhuw   %%xmm6,%%xmm0                   \n"  // scale 5 bits to 8 bits
+    "pand      %%xmm7,%%xmm2                   \n"  // high byte: 0xff or 0x00
+    "por       %%xmm2,%%xmm0                   \n"  // word = output bytes 3, 1
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave into 4-byte px
+    "punpckhbw %%xmm0,%%xmm2                   \n"
+    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
+    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "sub       $0x8,%2                         \n"  // 8 pixels per iteration
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  :
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Expands every 4-bit nibble to a byte by duplicating it.
+    "mov       $0xf0f0f0f,%%eax                \n"
+    "movd      %%eax,%%xmm4                    \n"
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"  // xmm4 = 0x0f in every byte
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "pslld     $0x4,%%xmm5                     \n"  // xmm5 = 0xf0 in every byte
+    "sub       %0,%1                           \n"
+    "sub       %0,%1                           \n"  // %1 = dst - 2 * src
+    LABELALIGN
+  "1:                                          \n"  // 16 bytes in, 32 bytes out
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm4,%%xmm0                   \n"  // low nibbles
+    "pand      %%xmm5,%%xmm2                   \n"  // high nibbles
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "psllw     $0x4,%%xmm1                     \n"
+    "psrlw     $0x4,%%xmm3                     \n"
+    "por       %%xmm1,%%xmm0                   \n"  // low nibble in both halves
+    "por       %%xmm3,%%xmm2                   \n"  // high nibble in both halves
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm2,%%xmm0                   \n"  // interleave the two sets
+    "punpckhbw %%xmm2,%%xmm1                   \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
+    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "sub       $0x8,%2                         \n"  // 8 pixels per iteration
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  :
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Drops the 4th byte of every 4-byte pixel.
+    "movdqa    %3,%%xmm6                       \n"  // xmm6 = shuffle table
+    LABELALIGN
+  "1:                                          \n"  // 64 bytes in, 48 bytes out
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "pshufb    %%xmm6,%%xmm0                   \n"  // 12 packed bytes + 4 zero
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "pshufb    %%xmm6,%%xmm2                   \n"
+    "pshufb    %%xmm6,%%xmm3                   \n"
+    "movdqa    %%xmm1,%%xmm4                   \n"
+    "psrldq    $0x4,%%xmm1                     \n"
+    "pslldq    $0xc,%%xmm4                     \n"
+    "movdqa    %%xmm2,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm0                   \n"  // 12 of xmm0 + 4 of xmm1
+    "pslldq    $0x8,%%xmm5                     \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "por       %%xmm5,%%xmm1                   \n"  // 8 of xmm1 + 8 of xmm2
+    "psrldq    $0x8,%%xmm2                     \n"
+    "pslldq    $0x4,%%xmm3                     \n"
+    "por       %%xmm3,%%xmm2                   \n"  // 4 of xmm2 + 12 of xmm3
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x30,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  : "m"(kShuffleMaskARGBToRGB24)  // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Same flow as ARGBToRGB24Row_SSSE3 with the RAW table.
+    "movdqa    %3,%%xmm6                       \n"  // xmm6 = shuffle table
+    LABELALIGN
+  "1:                                          \n"  // 64 bytes in, 48 bytes out
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "pshufb    %%xmm6,%%xmm0                   \n"  // 12 packed bytes + 4 zero
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "pshufb    %%xmm6,%%xmm2                   \n"
+    "pshufb    %%xmm6,%%xmm3                   \n"
+    "movdqa    %%xmm1,%%xmm4                   \n"
+    "psrldq    $0x4,%%xmm1                     \n"
+    "pslldq    $0xc,%%xmm4                     \n"
+    "movdqa    %%xmm2,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm0                   \n"  // 12 of xmm0 + 4 of xmm1
+    "pslldq    $0x8,%%xmm5                     \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "por       %%xmm5,%%xmm1                   \n"  // 8 of xmm1 + 8 of xmm2
+    "psrldq    $0x8,%%xmm2                     \n"
+    "pslldq    $0x4,%%xmm3                     \n"
+    "por       %%xmm3,%%xmm2                   \n"  // 4 of xmm2 + 12 of xmm3
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x30,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  : "m"(kShuffleMaskARGBToRAW)  // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Packs 4-byte pixels into 16-bit 5:6:5 words.
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "psrld     $0x1b,%%xmm3                    \n"  // xmm3 = 0x0000001f
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrld     $0x1a,%%xmm4                    \n"
+    "pslld     $0x5,%%xmm4                     \n"  // xmm4 = 0x000007e0
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pslld     $0xb,%%xmm5                     \n"  // xmm5 = 0xfffff800
+    LABELALIGN
+  "1:                                          \n"  // 16 bytes in, 8 bytes out
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pslld     $0x8,%%xmm0                     \n"  // discard byte 3
+    "psrld     $0x3,%%xmm1                     \n"  // byte 0 top 5 -> bits 0-4
+    "psrld     $0x5,%%xmm2                     \n"  // byte 1 top 6 -> bits 5-10
+    "psrad     $0x10,%%xmm0                    \n"  // byte 2 -> 8-15, sign ext.
+    "pand      %%xmm3,%%xmm1                   \n"
+    "pand      %%xmm4,%%xmm2                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "por       %%xmm2,%%xmm1                   \n"
+    "por       %%xmm1,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"  // sign ext. avoids saturation
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x4,%2                         \n"  // 4 pixels per iteration
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
+                                const uint32 dither4, int width) {
+  asm volatile (  // ARGBToRGB565Row_SSE2 plus a saturating per-pixel dither add.
+    "movd       %3,%%xmm6                      \n"  // 4 dither bytes
+    "punpcklbw  %%xmm6,%%xmm6                  \n"
+    "movdqa     %%xmm6,%%xmm7                  \n"
+    "punpcklwd  %%xmm6,%%xmm6                  \n"  // dither byte i x4 in pixel i
+    "punpckhwd  %%xmm7,%%xmm7                  \n"  // xmm7 is not read by the loop
+    "pcmpeqb    %%xmm3,%%xmm3                  \n"
+    "psrld      $0x1b,%%xmm3                   \n"  // xmm3 = 0x0000001f
+    "pcmpeqb    %%xmm4,%%xmm4                  \n"
+    "psrld      $0x1a,%%xmm4                   \n"
+    "pslld      $0x5,%%xmm4                    \n"  // xmm4 = 0x000007e0
+    "pcmpeqb    %%xmm5,%%xmm5                  \n"
+    "pslld      $0xb,%%xmm5                    \n"  // xmm5 = 0xfffff800
+
+    LABELALIGN
+  "1:                                          \n"  // 16 bytes in, 8 bytes out
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // macros, as in sibling rows
+    "paddusb    %%xmm6,%%xmm0                  \n"  // add dither, clamp at 255
+    "movdqa     %%xmm0,%%xmm1                  \n"
+    "movdqa     %%xmm0,%%xmm2                  \n"
+    "pslld      $0x8,%%xmm0                    \n"
+    "psrld      $0x3,%%xmm1                    \n"
+    "psrld      $0x5,%%xmm2                    \n"
+    "psrad      $0x10,%%xmm0                   \n"
+    "pand       %%xmm3,%%xmm1                  \n"
+    "pand       %%xmm4,%%xmm2                  \n"
+    "pand       %%xmm5,%%xmm0                  \n"
+    "por        %%xmm2,%%xmm1                  \n"
+    "por        %%xmm1,%%xmm0                  \n"
+    "packssdw   %%xmm0,%%xmm0                  \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub        $0x4,%2                        \n"  // 4 pixels per iteration
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  : "m"(dither4) // %3
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
+                                const uint32 dither4, int width) {
+  asm volatile (  // Dithered 4-byte pixel -> 5:6:5 word, 8 pixels per pass.
+    "vbroadcastss %3,%%xmm6                    \n"  // 4 dither bytes, repeated
+    "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
+    "vpermq     $0xd8,%%ymm6,%%ymm6            \n"  // one qword per 128-bit lane
+    "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"  // dither byte i x4 in pixel i
+    "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
+    "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"  // ymm3 = 0x0000001f
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
+    "vpslld     $0x5,%%ymm4,%%ymm4             \n"  // ymm4 = 0x000007e0
+    "vpslld     $0xb,%%ymm3,%%ymm5             \n"  // ymm5 = 0x0000f800
+
+    LABELALIGN
+  "1:                                          \n"  // 32 bytes in, 16 bytes out
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // macros, as in sibling rows
+    "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"  // add dither, clamp at 255
+    "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
+    "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
+    "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
+    "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
+    "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // gather both lanes' results
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
+    "lea        " MEMLEA(0x10,1) ",%1          \n"
+    "sub        $0x8,%2                        \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  : "m"(dither4) // %3
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
+
+
+void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Packs 4-byte pixels into 16-bit 1:5:5:5 words.
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrld     $0x1b,%%xmm4                    \n"  // xmm4 = 0x0000001f
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "pslld     $0x5,%%xmm5                     \n"  // xmm5 = 0x000003e0
+    "movdqa    %%xmm4,%%xmm6                   \n"
+    "pslld     $0xa,%%xmm6                     \n"  // xmm6 = 0x00007c00
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "pslld     $0xf,%%xmm7                     \n"  // xmm7 = 0xffff8000
+    LABELALIGN
+  "1:                                          \n"  // 16 bytes in, 8 bytes out
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "psrad     $0x10,%%xmm0                    \n"  // byte 3 top bit -> bit 15
+    "psrld     $0x3,%%xmm1                     \n"  // byte 0 top 5 -> bits 0-4
+    "psrld     $0x6,%%xmm2                     \n"  // byte 1 top 5 -> bits 5-9
+    "psrld     $0x9,%%xmm3                     \n"  // byte 2 top 5 -> bits 10-14
+    "pand      %%xmm7,%%xmm0                   \n"
+    "pand      %%xmm4,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm2                   \n"
+    "pand      %%xmm6,%%xmm3                   \n"
+    "por       %%xmm1,%%xmm0                   \n"
+    "por       %%xmm3,%%xmm2                   \n"
+    "por       %%xmm2,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"  // sign ext. avoids saturation
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x4,%2                         \n"  // 4 pixels per iteration
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  :: "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Keeps the top nibble of every byte; 2 nibbles per byte.
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psllw     $0xc,%%xmm4                     \n"  // xmm4 = 0xf000 per word
+    "movdqa    %%xmm4,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm3                     \n"  // xmm3 = 0x00f0 per word
+    LABELALIGN
+  "1:                                          \n"  // 16 bytes in, 8 bytes out
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm3,%%xmm0                   \n"  // top nibble of even bytes
+    "pand      %%xmm4,%%xmm1                   \n"  // top nibble of odd bytes
+    "psrlq     $0x4,%%xmm0                     \n"  // -> bits 0-3 of each word
+    "psrlq     $0x8,%%xmm1                     \n"  // -> bits 4-7 of each word
+    "por       %%xmm1,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"  // one byte per word
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x4,%2                         \n"  // 4 pixels per iteration
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+  );
+}
+#endif  // HAS_RGB24TOARGBROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (  // Y = (weighted sum of the pixel's bytes >> 7) + 16.
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = per-byte weights
+    "movdqa    %4,%%xmm5                       \n"  // xmm5 = 16 in every byte
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // byte * weight, pairs summed
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"  // one 16-bit sum per pixel
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 results as bytes
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kARGBToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_ARGBTOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (  // YJ = (weighted sum of the pixel's bytes + 64) >> 7.
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = per-byte weights
+    "movdqa    %4,%%xmm5                       \n"  // xmm5 = 64 in every word
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // byte * weight, pairs summed
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"  // one 16-bit sum per pixel
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "paddw     %%xmm5,%%xmm0                   \n"  // rounding term before shift
+    "paddw     %%xmm5,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 results as bytes
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kARGBToYJ),  // %3
+    "m"(kAddYJ64)    // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBTOYJROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd dword indices that undo the per-128-bit-lane interleave left by vphaddw + vpackuswb.
+static const lvec32 kPermdARGBToY_AVX = {
+  0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values per loop pass.
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm4                  \n"  // ymm4 = kARGBToY in both 128-bit lanes.
+    "vbroadcastf128 %4,%%ymm5                  \n"  // ymm5 = kAddY16 in both 128-bit lanes.
+    "vmovdqu    %5,%%ymm6                      \n"  // ymm6 = kPermdARGBToY_AVX indices.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // load 4 x 32 source bytes (unaligned).
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // unsigned pixel bytes * signed coefficients, adjacent pairs summed to words.
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "lea       " MEMLEA(0x80,0) ",%0           \n"  // src_argb += 128.
+    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates: works per 128-bit lane, so pixel order is interleaved.
+    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
+    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"  // logical shift right by 7.
+    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
+    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates: packs per 128-bit lane.
+    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate: reorder dwords with the ymm6 indices.
+    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 Y bytes.
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_y += 32.
+    "sub       $0x20,%2                        \n"  // width -= 32.
+    "jg        1b                              \n"  // loop while width > 0; the body always runs at least once.
+    "vzeroupper                                \n"  // zero the upper ymm halves before returning to non-VEX code.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kARGBToY),   // %3
+    "m"(kAddY16),    // %4
+    "m"(kPermdARGBToY_AVX)  // %5
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"  // ymm0-ymm6 listed under their xmm names.
+  );
+}
+#endif  // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 YJ values: kARGBToYJ coefficients, kAddYJ64 rounding before the shift, no byte bias.
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm4                  \n"  // ymm4 = kARGBToYJ in both 128-bit lanes.
+    "vbroadcastf128 %4,%%ymm5                  \n"  // ymm5 = kAddYJ64 in both 128-bit lanes.
+    "vmovdqu    %5,%%ymm6                      \n"  // ymm6 = kPermdARGBToY_AVX indices.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // load 4 x 32 source bytes (unaligned).
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // unsigned pixel bytes * signed coefficients, adjacent pairs summed to words.
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "lea       " MEMLEA(0x80,0) ",%0           \n"  // src_argb += 128.
+    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
+    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
+    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
+    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
+    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"  // logical shift right by 7.
+    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
+    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
+    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 YJ bytes.
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_y += 32.
+    "sub       $0x20,%2                        \n"  // width -= 32.
+    "jg        1b                              \n"  // loop while width > 0; the body always runs at least once.
+    "vzeroupper                                \n"  // zero the upper ymm halves before returning to non-VEX code.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kARGBToYJ),   // %3
+    "m"(kAddYJ64),    // %4
+    "m"(kPermdARGBToY_AVX)  // %5
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"  // ymm0-ymm6 listed under their xmm names.
+  );
+}
+#endif  // HAS_ARGBTOYJROW_AVX2
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,  // 16 ARGB pixels from 2 rows -> 8 U + 8 V bytes per loop pass.
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"  // xmm3 = kARGBToV coefficients.
+    "movdqa    %6,%%xmm4                       \n"  // xmm4 = kARGBToU coefficients.
+    "movdqa    %7,%%xmm5                       \n"  // xmm5 = kAddUV128 bias.
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u, so (%1,%2,1) addresses V.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average with the row at +src_stride_argb.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb0 += 64.
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even-indexed pixels (dwords 0,2 of each register).
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd-indexed pixels (dwords 1,3 of each register).
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average horizontal neighbours: 8 pixels -> 4.
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"  // keep copies for the V pass.
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U partial sums.
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V partial sums.
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"  // 8 U words.
+    "phaddw    %%xmm6,%%xmm1                   \n"  // 8 V words.
+    "psraw     $0x8,%%xmm0                     \n"  // arithmetic shift right by 8 (sums are signed).
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 bytes = U, high 8 bytes = V.
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddUV128 to every byte.
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes at dst_u.
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (V follows through the fixed offset).
+    "sub       $0x10,%3                        \n"  // width -= 16.
+    "jg        1b                              \n"  // loop while width > 0; the body always runs at least once.
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kARGBToV),  // %5
+    "m"(kARGBToU),  // %6
+    "m"(kAddUV128)  // %7
+  : "memory", "cc", NACL_R14  // xmm3-xmm5 are written above and must be declared clobbered too.
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTOUVROW_SSSE3
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,  // 32 ARGB pixels from 2 rows -> 16 U + 16 V bytes per loop pass.
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "vbroadcastf128 %5,%%ymm5                  \n"  // ymm5 = kAddUV128 in both lanes.
+    "vbroadcastf128 %6,%%ymm6                  \n"  // ymm6 = kARGBToV in both lanes.
+    "vbroadcastf128 %7,%%ymm7                  \n"  // ymm7 = kARGBToU in both lanes.
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u, so (%1,%2,1) addresses V.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // load 4 x 32 source bytes (unaligned).
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)     // average with the row at +src_stride_argb.
+    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
+    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
+    "lea       " MEMLEA(0x80,0) ",%0           \n"  // src_argb0 += 128.
+    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"  // even-indexed pixels.
+    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"  // odd-indexed pixels.
+    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"  // average horizontal neighbours.
+    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
+    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
+    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
+
+    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"  // U partial sums.
+    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
+    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"  // V partial sums.
+    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
+    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"  // U words (lane-interleaved order).
+    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"  // V words (lane-interleaved order).
+    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"  // arithmetic shift right by 8 (sums are signed).
+    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"  // per lane: 8 U bytes then 8 V bytes.
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // qword order 0,2,1,3: U in the low 128 bits, V in the high.
+    "vpshufb    %8,%%ymm0,%%ymm0               \n"  // kShufARGBToUV_AVX restores byte-pair order within each lane.
+    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add kAddUV128 to every byte.
+
+    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"  // store 16 U bytes at dst_u.
+    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_u += 16 (V follows through the fixed offset).
+    "sub       $0x20,%3                        \n"  // width -= 32.
+    "jg        1b                              \n"  // loop while width > 0; the body always runs at least once.
+    "vzeroupper                                \n"  // zero the upper ymm halves before returning to non-VEX code.
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kAddUV128),  // %5
+    "m"(kARGBToV),   // %6
+    "m"(kARGBToU),   // %7
+    "m"(kShufARGBToUV_AVX)  // %8
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_AVX2
+void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,  // 32 ARGB pixels from 2 rows -> 16 UJ + 16 VJ bytes per loop pass.
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "vbroadcastf128 %5,%%ymm5                  \n"  // ymm5 = kAddUVJ128 in both lanes.
+    "vbroadcastf128 %6,%%ymm6                  \n"  // ymm6 = kARGBToVJ in both lanes.
+    "vbroadcastf128 %7,%%ymm7                  \n"  // ymm7 = kARGBToUJ in both lanes.
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u, so (%1,%2,1) addresses V.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // load 4 x 32 source bytes (unaligned).
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
+    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)     // average with the row at +src_stride_argb.
+    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
+    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
+    "lea       " MEMLEA(0x80,0) ",%0           \n"  // src_argb0 += 128.
+    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"  // even-indexed pixels.
+    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"  // odd-indexed pixels.
+    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"  // average horizontal neighbours.
+    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
+    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
+    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
+
+    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"  // U partial sums.
+    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
+    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"  // V partial sums.
+    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
+    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"  // U words (lane-interleaved order).
+    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"  // V words (lane-interleaved order).
+    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // add kAddUVJ128 to the 16-bit sums before the shift.
+    "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
+    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"  // arithmetic shift right by 8.
+    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"  // per lane: 8 U bytes then 8 V bytes.
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // qword order 0,2,1,3: U in the low 128 bits, V in the high.
+    "vpshufb    %8,%%ymm0,%%ymm0               \n"  // kShufARGBToUV_AVX restores byte-pair order within each lane.
+
+    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"  // store 16 U bytes at dst_u.
+    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_u += 16 (V follows through the fixed offset).
+    "sub       $0x20,%3                        \n"  // width -= 32.
+    "jg        1b                              \n"  // loop while width > 0; the body always runs at least once.
+    "vzeroupper                                \n"  // zero the upper ymm halves before returning to non-VEX code.
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kAddUVJ128),  // %5
+    "m"(kARGBToVJ),  // %6
+    "m"(kARGBToUJ),  // %7
+    "m"(kShufARGBToUV_AVX)  // %8
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTOUVJROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_SSSE3
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,  // 16 ARGB pixels from 2 rows -> 8 UJ + 8 VJ bytes per loop pass.
+                        uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"  // xmm3 = kARGBToVJ coefficients.
+    "movdqa    %6,%%xmm4                       \n"  // xmm4 = kARGBToUJ coefficients.
+    "movdqa    %7,%%xmm5                       \n"  // xmm5 = kAddUVJ128 (added at word precision below).
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u, so (%1,%2,1) addresses V.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average with the row at +src_stride_argb.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb0 += 64.
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even-indexed pixels (dwords 0,2 of each register).
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd-indexed pixels (dwords 1,3 of each register).
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average horizontal neighbours: 8 pixels -> 4.
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"  // keep copies for the V pass.
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U partial sums.
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V partial sums.
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"  // 8 U words.
+    "phaddw    %%xmm6,%%xmm1                   \n"  // 8 V words.
+    "paddw     %%xmm5,%%xmm0                   \n"  // add kAddUVJ128 to the 16-bit sums before the shift.
+    "paddw     %%xmm5,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"  // arithmetic shift right by 8.
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 bytes = U, high 8 bytes = V.
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes at dst_u.
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (V follows through the fixed offset).
+    "sub       $0x10,%3                        \n"  // width -= 16.
+    "jg        1b                              \n"  // loop while width > 0; the body always runs at least once.
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)), // %4
+    "m"(kARGBToVJ),  // %5
+    "m"(kARGBToUJ),  // %6
+    "m"(kAddUVJ128)  // %7
+  : "memory", "cc", NACL_R14  // xmm3-xmm5 are written above and must be declared clobbered too.
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBTOUVJROW_SSSE3
+
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,  // 16 ARGB pixels -> 16 U + 16 V bytes per loop pass (no subsampling).
+                          int width) {
+  asm volatile (
+    "movdqa    %4,%%xmm3                       \n"  // xmm3 = kARGBToV coefficients.
+    "movdqa    %5,%%xmm4                       \n"  // xmm4 = kARGBToU coefficients.
+    "movdqa    %6,%%xmm5                       \n"  // xmm5 = kAddUV128 bias.
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u, so (%1,%2,1) addresses V.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // first pass over the 64 source bytes: U.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // unsigned pixel bytes * signed U coefficients.
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm6                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"  // one 16-bit sum per pixel.
+    "phaddw    %%xmm6,%%xmm2                   \n"
+    "psraw     $0x8,%%xmm0                     \n"  // arithmetic shift right by 8 (sums are signed).
+    "psraw     $0x8,%%xmm2                     \n"
+    "packsswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 signed saturated bytes.
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddUV128 to every byte.
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 U bytes at dst_u.
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // second pass: reload the same 64 bytes for V.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    "pmaddubsw %%xmm3,%%xmm0                   \n"  // unsigned pixel bytes * signed V coefficients.
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm2                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm2                     \n"
+    "packsswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb += 64.
+    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_u += 16 (V follows through the fixed offset).
+    "sub       $0x10,%3                        \n"  // width -= 16.
+    "jg        1b                              \n"  // loop while width > 0; the body always runs at least once.
+  : "+r"(src_argb),        // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "m"(kARGBToV),  // %4
+    "m"(kARGBToU),  // %5
+    "m"(kAddUV128)  // %6
+  : "memory", "cc", NACL_R14  // xmm3-xmm5 are written above and must be declared clobbered too.
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBTOUV444ROW_SSSE3
+
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {  // 16 BGRA pixels (64 bytes) -> 16 Y bytes per loop pass.
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16 bias.
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kBGRAToY coefficients.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 x 16 source bytes (unaligned).
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // unsigned pixel bytes * signed coefficients, adjacent pairs summed to words.
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_bgra += 64.
+    "phaddw    %%xmm1,%%xmm0                   \n"  // one 16-bit sum per pixel.
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"  // logical shift right by 7.
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 unsigned saturated bytes.
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddY16 to every byte.
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y bytes.
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16.
+    "sub       $0x10,%2                        \n"  // width -= 16.
+    "jg        1b                              \n"  // loop while width > 0; the body always runs at least once.
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kBGRAToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,  // 16 BGRA pixels from 2 rows -> 8 U + 8 V bytes per loop pass.
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"  // xmm3 = kBGRAToV coefficients.
+    "movdqa    %6,%%xmm4                       \n"  // xmm4 = kBGRAToU coefficients.
+    "movdqa    %7,%%xmm5                       \n"  // xmm5 = kAddUV128 bias.
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u, so (%1,%2,1) addresses V.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average with the row at +src_stride_bgra.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_bgra0 += 64.
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even-indexed pixels (dwords 0,2 of each register).
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd-indexed pixels (dwords 1,3 of each register).
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average horizontal neighbours: 8 pixels -> 4.
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"  // keep copies for the V pass.
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U partial sums.
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V partial sums.
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"  // 8 U words.
+    "phaddw    %%xmm6,%%xmm1                   \n"  // 8 V words.
+    "psraw     $0x8,%%xmm0                     \n"  // arithmetic shift right by 8 (sums are signed).
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 bytes = U, high 8 bytes = V.
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddUV128 to every byte.
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes at dst_u.
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (V follows through the fixed offset).
+    "sub       $0x10,%3                        \n"  // width -= 16.
+    "jg        1b                              \n"  // loop while width > 0; the body always runs at least once.
+  : "+r"(src_bgra0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_bgra)), // %4
+    "m"(kBGRAToV),  // %5
+    "m"(kBGRAToU),  // %6
+    "m"(kAddUV128)  // %7
+  : "memory", "cc", NACL_R14  // xmm3-xmm5 are written above and must be declared clobbered too.
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {  // 16 ABGR pixels (64 bytes) -> 16 Y bytes per loop pass.
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16 bias.
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kABGRToY coefficients.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 x 16 source bytes (unaligned).
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // unsigned pixel bytes * signed coefficients, adjacent pairs summed to words.
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_abgr += 64.
+    "phaddw    %%xmm1,%%xmm0                   \n"  // one 16-bit sum per pixel.
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"  // logical shift right by 7.
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 unsigned saturated bytes.
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddY16 to every byte.
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y bytes.
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16.
+    "sub       $0x10,%2                        \n"  // width -= 16.
+    "jg        1b                              \n"  // loop while width > 0; the body always runs at least once.
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kABGRToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {  // 16 RGBA pixels (64 bytes) -> 16 Y bytes per loop pass.
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16 bias.
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kRGBAToY coefficients.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 x 16 source bytes (unaligned).
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // unsigned pixel bytes * signed coefficients, adjacent pairs summed to words.
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_rgba += 64.
+    "phaddw    %%xmm1,%%xmm0                   \n"  // one 16-bit sum per pixel.
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"  // logical shift right by 7.
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 unsigned saturated bytes.
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddY16 to every byte.
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y bytes.
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16.
+    "sub       $0x10,%2                        \n"  // width -= 16.
+    "jg        1b                              \n"  // loop while width > 0; the body always runs at least once.
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  : "m"(kRGBAToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,  // 16 ABGR pixels from 2 rows -> 8 U + 8 V bytes per loop pass.
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"  // xmm3 = kABGRToV coefficients.
+    "movdqa    %6,%%xmm4                       \n"  // xmm4 = kABGRToU coefficients.
+    "movdqa    %7,%%xmm5                       \n"  // xmm5 = kAddUV128 bias.
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u, so (%1,%2,1) addresses V.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average with the row at +src_stride_abgr.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_abgr0 += 64.
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even-indexed pixels (dwords 0,2 of each register).
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd-indexed pixels (dwords 1,3 of each register).
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average horizontal neighbours: 8 pixels -> 4.
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"  // keep copies for the V pass.
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U partial sums.
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V partial sums.
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"  // 8 U words.
+    "phaddw    %%xmm6,%%xmm1                   \n"  // 8 V words.
+    "psraw     $0x8,%%xmm0                     \n"  // arithmetic shift right by 8 (sums are signed).
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 bytes = U, high 8 bytes = V.
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddUV128 to every byte.
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes at dst_u.
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (V follows through the fixed offset).
+    "sub       $0x10,%3                        \n"  // width -= 16.
+    "jg        1b                              \n"  // loop while width > 0; the body always runs at least once.
+  : "+r"(src_abgr0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_abgr)), // %4
+    "m"(kABGRToV),  // %5
+    "m"(kABGRToU),  // %6
+    "m"(kAddUV128)  // %7
+  : "memory", "cc", NACL_R14  // xmm3-xmm5 are written above and must be declared clobbered too.
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,  // 16 RGBA pixels from 2 rows -> 8 U + 8 V bytes per loop pass.
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %5,%%xmm3                       \n"  // xmm3 = kRGBAToV coefficients.
+    "movdqa    %6,%%xmm4                       \n"  // xmm4 = kRGBAToU coefficients.
+    "movdqa    %7,%%xmm5                       \n"  // xmm5 = kAddUV128 bias.
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u, so (%1,%2,1) addresses V.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average with the row at +src_stride_rgba.
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_rgba0 += 64.
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even-indexed pixels (dwords 0,2 of each register).
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd-indexed pixels (dwords 1,3 of each register).
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average horizontal neighbours: 8 pixels -> 4.
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"  // keep copies for the V pass.
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U partial sums.
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V partial sums.
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"  // 8 U words.
+    "phaddw    %%xmm6,%%xmm1                   \n"  // 8 V words.
+    "psraw     $0x8,%%xmm0                     \n"  // arithmetic shift right by 8 (sums are signed).
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 bytes = U, high 8 bytes = V.
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddUV128 to every byte.
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes at dst_u.
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (V follows through the fixed offset).
+    "sub       $0x10,%3                        \n"  // width -= 16.
+    "jg        1b                              \n"  // loop while width > 0; the body always runs at least once.
+  : "+r"(src_rgba0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_rgba)), // %4
+    "m"(kRGBAToV),  // %5
+    "m"(kRGBAToU),  // %6
+    "m"(kAddUV128)  // %7
+  : "memory", "cc", NACL_R14  // xmm3-xmm5 are written above and must be declared clobbered too.
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
+
+// Read 8 U and 8 V from 444, interleaved as UV pairs into xmm0, plus 8 Y (each byte doubled into a 16-bit lane) into xmm4.  V is addressed as u_buf + v_buf, so callers first turn v_buf into the offset (v - u).  Advances u_buf and y_buf by 8; clobbers xmm1.
+#define READYUV444                                                             \
+    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+
+// Read 4 UV from 422, upsample to 8 UV in xmm0 (punpcklwd duplicates each UV pair), plus 8 Y (each byte doubled) into xmm4.  V is addressed as u_buf + v_buf, so v_buf must hold the offset (v - u).  Advances u_buf by 4 and y_buf by 8; clobbers xmm1.
+#define READYUV422                                                             \
+    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+
+// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.  Same loads as READYUV422 (UV in xmm0, doubled Y in xmm4), then 8 alpha bytes from a_buf into xmm5, which STOREARGB uses as the alpha channel.  Advances u_buf by 4, y_buf and a_buf by 8.
+#define READYUVA422                                                            \
+    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
+    "movq       " MEMACCESS([a_buf]) ",%%xmm5                   \n"            \
+    "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]               \n"
+
+// Read 2 UV from 411, upsample to 8 UV.  Each plane is loaded as exactly 2 bytes via movzwl into the %[temp] GPR, then moved to an XMM register.
+// Rejected alternative 1: a 4-byte movd load reads past the 2 needed bytes, which is an MSan violation:
+//    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"
+//    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
+// Rejected alternative 2: pinsrw from memory fails under DrMemory:
+//  __asm pinsrw     xmm0, [esi], 0        /* U */
+//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
+#define READYUV411_TEMP                                                        \
+    "movzwl     " MEMACCESS([u_buf]) ",%[temp]                  \n"            \
+    "movd       %[temp],%%xmm0                                  \n"            \
+    MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) "       \n"            \
+    "movd       %[temp],%%xmm1                                  \n"            \
+    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "punpckldq  %%xmm0,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+
+// Read 4 UV from NV12 (already interleaved in uv_buf), upsample to 8 UV in xmm0 by duplicating each 16-bit UV pair, plus 8 Y (each byte doubled) into xmm4.  Advances uv_buf and y_buf by 8.
+#define READNV12                                                               \
+    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
+    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+
+// Read 4 VU from NV21, upsample to 8 UV in xmm0 with a single pshufb through the kShuffleNV21 mask (defined elsewhere; presumably swaps V/U and duplicates each pair), plus 8 Y (each byte doubled) into xmm4.  Advances vu_buf and y_buf by 8.
+#define READNV21                                                               \
+    "movq       " MEMACCESS([vu_buf]) ",%%xmm0                  \n"            \
+    "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n"            \
+    "pshufb     %[kShuffleNV21], %%xmm0                         \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+
+// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.  The same 16 source bytes are loaded twice and shuffled with kShuffleYUY2Y into xmm4 (Y) and kShuffleYUY2UV into xmm0 (UV); both masks are defined elsewhere.  Advances yuy2_buf by 16.
+#define READYUY2                                                               \
+    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                \n"            \
+    "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n"            \
+    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                \n"            \
+    "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n"            \
+    "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]        \n"
+
+// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.  The same 16 source bytes are loaded twice and shuffled with kShuffleUYVYY into xmm4 (Y) and kShuffleUYVYUV into xmm0 (UV); both masks are defined elsewhere.  Advances uyvy_buf by 16.
+#define READUYVY                                                               \
+    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                \n"            \
+    "pshufb     %[kShuffleUYVYY], %%xmm4                        \n"            \
+    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                \n"            \
+    "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n"            \
+    "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]        \n"
+
+#if defined(__x86_64__)  // x86_64: YUVTORGB_SETUP caches the seven constant rows (byte offsets 0..192, step 32) in xmm8-xmm14 once, before the loop.
+#define YUVTORGB_SETUP(yuvconstants)                                           \
+    "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8            \n"            \
+    "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n"            \
+    "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n"            \
+    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n"            \
+    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm12     \n"            \
+    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n"            \
+    "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"
+// Convert 8 pixels: 8 UV (xmm0) and 8 Y (xmm4).  For each output k in xmm0/xmm1/xmm2: (bias_k - pmaddubsw(UV, coeff_k) + pmulhuw(Y, ycoeff)) >> 6, packed to unsigned bytes.  Clobbers xmm3 and xmm4.
+#define YUVTORGB(yuvconstants)                                                 \
+    "movdqa     %%xmm0,%%xmm1                                   \n"            \
+    "movdqa     %%xmm0,%%xmm2                                   \n"            \
+    "movdqa     %%xmm0,%%xmm3                                   \n"            \
+    "movdqa     %%xmm11,%%xmm0                                  \n"            \
+    "pmaddubsw  %%xmm8,%%xmm1                                   \n"            \
+    "psubw      %%xmm1,%%xmm0                                   \n"            \
+    "movdqa     %%xmm12,%%xmm1                                  \n"            \
+    "pmaddubsw  %%xmm9,%%xmm2                                   \n"            \
+    "psubw      %%xmm2,%%xmm1                                   \n"            \
+    "movdqa     %%xmm13,%%xmm2                                  \n"            \
+    "pmaddubsw  %%xmm10,%%xmm3                                  \n"            \
+    "psubw      %%xmm3,%%xmm2                                   \n"            \
+    "pmulhuw    %%xmm14,%%xmm4                                  \n"            \
+    "paddsw     %%xmm4,%%xmm0                                   \n"            \
+    "paddsw     %%xmm4,%%xmm1                                   \n"            \
+    "paddsw     %%xmm4,%%xmm2                                   \n"            \
+    "psraw      $0x6,%%xmm0                                     \n"            \
+    "psraw      $0x6,%%xmm1                                     \n"            \
+    "psraw      $0x6,%%xmm2                                     \n"            \
+    "packuswb   %%xmm0,%%xmm0                                   \n"            \
+    "packuswb   %%xmm1,%%xmm1                                   \n"            \
+    "packuswb   %%xmm2,%%xmm2                                   \n"
+#define YUVTORGB_REGS \
+    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+
+#else  // Not x86_64: nothing is cached (empty SETUP and REGS); each constant row is read from memory where it is used.
+#define YUVTORGB_SETUP(yuvconstants)
+// Convert 8 pixels: 8 UV (xmm0) and 8 Y (xmm4).  Same arithmetic as the x86_64 variant, with the rows at offsets 0/32/64 (coefficients), 96/128/160 (biases) and 192 (Y multiplier) used as memory operands.  Clobbers xmm3 and xmm4.
+#define YUVTORGB(yuvconstants)                                                 \
+    "movdqa     %%xmm0,%%xmm1                                   \n"            \
+    "movdqa     %%xmm0,%%xmm2                                   \n"            \
+    "movdqa     %%xmm0,%%xmm3                                   \n"            \
+    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0       \n"            \
+    "pmaddubsw  " MEMACCESS([yuvconstants]) ",%%xmm1            \n"            \
+    "psubw      %%xmm1,%%xmm0                                   \n"            \
+    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm1      \n"            \
+    "pmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%xmm2       \n"            \
+    "psubw      %%xmm2,%%xmm1                                   \n"            \
+    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2      \n"            \
+    "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3       \n"            \
+    "psubw      %%xmm3,%%xmm2                                   \n"            \
+    "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4      \n"            \
+    "paddsw     %%xmm4,%%xmm0                                   \n"            \
+    "paddsw     %%xmm4,%%xmm1                                   \n"            \
+    "paddsw     %%xmm4,%%xmm2                                   \n"            \
+    "psraw      $0x6,%%xmm0                                     \n"            \
+    "psraw      $0x6,%%xmm1                                     \n"            \
+    "psraw      $0x6,%%xmm2                                     \n"            \
+    "packuswb   %%xmm0,%%xmm0                                   \n"            \
+    "packuswb   %%xmm1,%%xmm1                                   \n"            \
+    "packuswb   %%xmm2,%%xmm2                                   \n"
+#define YUVTORGB_REGS
+#endif
+
+// Store 8 ARGB values (32 bytes).  Per pixel, the bytes written in increasing address order come from xmm0, xmm1, xmm2, xmm5; xmm5 is the alpha source the caller prepared.  Advances dst_argb by 0x20; clobbers xmm0-xmm2.
+#define STOREARGB                                                              \
+    "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
+    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
+    "movdqa     %%xmm0,%%xmm1                                    \n"           \
+    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
+    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
+    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
+    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
+    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
+
+// Store 8 RGBA values (32 bytes).  Sets xmm5 to all ones itself, so alpha is always 0xff.  Per pixel, the bytes written in increasing address order are 0xff, xmm0, xmm1, xmm2.  Advances dst_rgba by 0x20; clobbers xmm0, xmm1, xmm5.
+#define STORERGBA                                                              \
+    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
+    "punpcklbw %%xmm2,%%xmm1                                     \n"           \
+    "punpcklbw %%xmm0,%%xmm5                                     \n"           \
+    "movdqa    %%xmm5,%%xmm0                                     \n"           \
+    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
+    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
+    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
+    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
+    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (  // Converts one row of I444 (one U and one V per pixel) to ARGB, 8 pixels per loop iteration.
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes the offset (v - u) that READYUV444 indexes with.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: constant 0xff alpha for STOREARGB.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV444
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "subl      $0x8,%[width]                   \n"  // Explicit 'l' suffix: with "+rm", width may be a memory operand with no register to size it.
+    "jg        1b                              \n"  // Loop while pixels remain; every pass reads and writes a full group of 8.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* dst_rgb24,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width) {
+  asm volatile (  // Converts one row of I422 to 24-bit RGB: 8 pixels in, 24 bytes out per loop iteration.
+    YUVTORGB_SETUP(yuvconstants)
+    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"  // Shuffle mask applied to pixels 0-3 (mask defined elsewhere).
+    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"  // Shuffle mask applied to pixels 4-7 (mask defined elsewhere).
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes the offset (v - u) that READYUV422 indexes with.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(yuvconstants)
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // Interleave the xmm0 and xmm1 channels.
+    "punpcklbw %%xmm2,%%xmm2                   \n"  // Duplicate xmm2 into the 4th byte slot; no alpha register is set up here.
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"  // xmm0 = pixels 0-3, 4 bytes each.
+    "punpckhwd %%xmm2,%%xmm1                   \n"  // xmm1 = pixels 4-7, 4 bytes each.
+    "pshufb    %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // xmm1 = bytes 12..27 of the xmm1:xmm0 concatenation.
+    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"  // Output bytes 0-7.
+    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"  // Output bytes 8-23.
+    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
+    "subl      $0x8,%[width]                   \n"  // Explicit 'l' suffix because width may be a memory operand.
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
+#if defined(__i386__) && defined(__pic__)
+    [width]"+m"(width)     // %[width]
+#else
+    [width]"+rm"(width)    // %[width]
+#endif
+  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
+    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (  // Converts one row of I422 (one UV pair per 2 pixels) to ARGB, 8 pixels per loop iteration.
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes the offset (v - u) that READYUV422 indexes with.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: constant 0xff alpha for STOREARGB.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "subl      $0x8,%[width]                   \n"  // Explicit 'l' suffix: with "+rm", width may be a memory operand with no register to size it.
+    "jg        1b                              \n"  // Loop while pixels remain; every pass reads and writes a full group of 8.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+                                     const uint8* u_buf,
+                                     const uint8* v_buf,
+                                     const uint8* a_buf,
+                                     uint8* dst_argb,
+                                     const struct YuvConstants* yuvconstants,
+                                     int width) {
+  asm volatile (  // Converts one row of I422 plus a separate alpha plane to ARGB, 8 pixels per loop iteration.
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes the offset (v - u) that READYUVA422 indexes with.
+    LABELALIGN
+  "1:                                          \n"
+    READYUVA422  // Also loads 8 alpha bytes into xmm5, so no constant-alpha setup is done before the loop.
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "subl      $0x8,%[width]                   \n"  // Explicit 'l' suffix because width may be a memory operand.
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [a_buf]"+r"(a_buf),    // %[a_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+#if defined(__i386__) && defined(__pic__)
+    [width]"+m"(width)     // %[width]
+#else
+    [width]"+rm"(width)    // %[width]
+#endif
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I422ALPHATOARGBROW_SSSE3
+
+#ifdef HAS_I411TOARGBROW_SSSE3
+void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  int temp = 0;  // Scratch general-purpose register for READYUV411_TEMP's 2-byte U/V loads.
+  asm volatile (  // Converts one row of I411 (one UV pair per 4 pixels) to ARGB, 8 pixels per loop iteration.
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes the offset (v - u) that READYUV411_TEMP indexes with.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: constant 0xff alpha for STOREARGB.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV411_TEMP
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "subl      $0x8,%[width]                   \n"  // Explicit 'l' suffix because width may be a memory operand.
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [temp]"+r"(temp),       // %[temp]
+#if defined(__i386__) && defined(__pic__)
+    [width]"+m"(width)     // %[width]
+#else
+    [width]"+rm"(width)    // %[width]
+#endif
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* uv_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (  // Converts one row of NV12 (Y plane plus interleaved UV plane) to ARGB, 8 pixels per loop iteration.
+    YUVTORGB_SETUP(yuvconstants)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: constant 0xff alpha for STOREARGB.
+    LABELALIGN
+  "1:                                          \n"
+    READNV12
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "subl      $0x8,%[width]                   \n"  // Explicit 'l' suffix: with "+rm", width may be a memory operand with no register to size it.
+    "jg        1b                              \n"  // Loop while pixels remain; every pass reads and writes a full group of 8.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* vu_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (  // Converts one row of NV21 (Y plane plus interleaved VU plane) to ARGB, 8 pixels per loop iteration.
+    YUVTORGB_SETUP(yuvconstants)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: constant 0xff alpha for STOREARGB.
+    LABELALIGN
+  "1:                                          \n"
+    READNV21
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "subl      $0x8,%[width]                   \n"  // Explicit 'l' suffix: with "+rm", width may be a memory operand with no register to size it.
+    "jg        1b                              \n"  // Loop while pixels remain; every pass reads and writes a full group of 8.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleNV21]"m"(kShuffleNV21)
+    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (  // Converts one row of packed YUY2 to ARGB: 16 source bytes in, 8 pixels out per loop iteration.
+    YUVTORGB_SETUP(yuvconstants)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: constant 0xff alpha for STOREARGB.
+    LABELALIGN
+  "1:                                          \n"
+    READYUY2
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "subl      $0x8,%[width]                   \n"  // Explicit 'l' suffix: with "+rm", width may be a memory operand with no register to size it.
+    "jg        1b                              \n"  // Loop while pixels remain; every pass reads and writes a full group of 8.
+  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
+                                uint8* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (  // Converts one row of packed UYVY to ARGB: 16 source bytes in, 8 pixels out per loop iteration.
+    YUVTORGB_SETUP(yuvconstants)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones: constant 0xff alpha for STOREARGB.
+    LABELALIGN
+  "1:                                          \n"
+    READUYVY
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "subl      $0x8,%[width]                   \n"  // Explicit 'l' suffix: with "+rm", width may be a memory operand with no register to size it.
+    "jg        1b                              \n"  // Loop while pixels remain; every pass reads and writes a full group of 8.
+  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleUYVYY]"m"(kShuffleUYVYY),
+    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_rgba,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (  // Converts one row of I422 (one UV pair per 2 pixels) to RGBA, 8 pixels per loop iteration.
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes the offset (v - u) that READYUV422 indexes with.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones; STORERGBA also re-sets it on every pass.
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(yuvconstants)
+    STORERGBA
+    "subl      $0x8,%[width]                   \n"  // Explicit 'l' suffix: with "+rm", width may be a memory operand with no register to size it.
+    "jg        1b                              \n"  // Loop while pixels remain; every pass reads and writes a full group of 8.
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+#endif  // HAS_I422TOARGBROW_SSSE3 || HAS_I422TOARGBROW_AVX2
+
+// Read 16 U and 16 V from 444, interleaved as UV pairs into ymm0, plus 16 Y (each byte doubled) into ymm4.  vpermq $0xd8 first moves the second 8 bytes of each 16-byte load to the upper 128-bit lane, since vpunpcklbw works per lane.  V is addressed as u_buf + v_buf.  Advances u_buf and y_buf by 16.
+#define READYUV444_AVX2                                                        \
+    "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
+    MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)                         \
+    "lea        " MEMLEA(0x10, [u_buf]) ",%[u_buf]                  \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n"        \
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+
+// Read 8 UV from 422, upsample to 16 UV in ymm0 (interleave U/V, split across lanes with vpermq, then duplicate each UV pair), plus 16 Y (each byte doubled) into ymm4.  V is addressed as u_buf + v_buf.  Advances u_buf by 8 and y_buf by 16.
+#define READYUV422_AVX2                                                        \
+    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
+    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
+    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+
+// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.  Same loads as READYUV422_AVX2 (UV in ymm0, doubled Y in ymm4), then 16 alpha bytes from a_buf into ymm5, split across the two lanes with vpermq.  Advances u_buf by 8, y_buf and a_buf by 16.
+#define READYUVA422_AVX2                                                       \
+    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
+    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
+    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"        \
+    "vmovdqu    " MEMACCESS([a_buf]) ",%%xmm5                       \n"        \
+    "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n"        \
+    "lea        " MEMLEA(0x10, [a_buf]) ",%[a_buf]                  \n"
+
+// Read 4 UV from 411, upsample to 16 UV in ymm0: interleave U/V, duplicate each pair (vpunpcklwd), split across lanes (vpermq), duplicate again (vpunpckldq), so each UV pair covers 4 pixels.  Also reads 16 Y (each byte doubled) into ymm4.  Advances u_buf by 4 and y_buf by 16.
+#define READYUV411_AVX2                                                        \
+    "vmovd      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
+    MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
+    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]                   \n"        \
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpunpckldq %%ymm0,%%ymm0,%%ymm0                                \n"        \
+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+
+// Read 8 UV from NV12 (already interleaved in uv_buf), upsample to 16 UV in ymm0 by splitting across lanes (vpermq) and duplicating each 16-bit UV pair, plus 16 Y (each byte doubled) into ymm4.  Advances uv_buf and y_buf by 16.
+#define READNV12_AVX2                                                          \
+    "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                      \n"        \
+    "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+
+// Read 8 VU from NV21, upsample to 16 UV in ymm0: split across lanes (vpermq), then one vpshufb through the kShuffleNV21 mask (defined elsewhere; presumably swaps V/U and duplicates each pair).  Also reads 16 Y (each byte doubled) into ymm4.  Advances vu_buf and y_buf by 16.
+#define READNV21_AVX2                                                          \
+    "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                      \n"        \
+    "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \
+    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
+    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
+    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
+    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+
+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.  The same 32 source bytes are loaded twice and shuffled with kShuffleYUY2Y into ymm4 (Y) and kShuffleYUY2UV into ymm0 (UV); both masks are defined elsewhere.  Advances yuy2_buf by 32.
+#define READYUY2_AVX2                                                          \
+    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                    \n"        \
+    "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n"        \
+    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                    \n"        \
+    "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n"        \
+    "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]            \n"
+
+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.  The same 32 source bytes are loaded twice and shuffled with kShuffleUYVYY into ymm4 (Y) and kShuffleUYVYUV into ymm0 (UV); both masks are defined elsewhere.  Advances uyvy_buf by 32.
+#define READUYVY_AVX2                                                          \
+    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                   \n"        \
+    "vpshufb     %[kShuffleUYVYY], %%ymm4, %%ymm4                   \n"        \
+    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm0                   \n"        \
+    "vpshufb     %[kShuffleUYVYUV], %%ymm0, %%ymm0                  \n"        \
+    "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]            \n"
+
+#if defined(__x86_64__)  // 64-bit: the seven 32-byte rows of yuvconstants are preloaded into ymm8-ymm14 (vmovdqa, so the struct must be 32-byte aligned).
+#define YUVTORGB_SETUP_AVX2(yuvconstants)                                      \
+    "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8            \n"           \
+    "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \
+    "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \
+    "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \
+    "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \
+    "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \
+    "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"
+#define YUVTORGB_AVX2(yuvconstants)                                            \
+    "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n"        \
+    "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n"        \
+    "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n"        \
+    "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n"        \
+    "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n"        \
+    "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n"        \
+    "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n"        \
+    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
+    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
+    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
+    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
+    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
+    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
+    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
+    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
+    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
+#define YUVTORGB_REGS_AVX2 \
+    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+#else  // 32-bit: constants are read from memory on every use and ymm3 is scratch. Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_SETUP_AVX2(yuvconstants)  // nothing to preload in this variant
+#define YUVTORGB_AVX2(yuvconstants)                                            \
+    "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2   \n"        \
+    "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1   \n"        \
+    "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0        \n"        \
+    "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3         \n"        \
+    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
+    "vmovdqu     " MEMACCESS2(128, [yuvconstants]) ",%%ymm3         \n"        \
+    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
+    "vmovdqu     " MEMACCESS2(96, [yuvconstants]) ",%%ymm3          \n"        \
+    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
+    "vpmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4  \n"        \
+    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
+    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
+    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
+    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
+    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
+    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
+    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
+    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
+    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
+#define YUVTORGB_REGS_AVX2  // no extra clobbers: only ymm0-ymm4 are touched here
+#endif  // defined(__x86_64__)
+
+// Store 16 ARGB values: interleaves bytes from ymm0, ymm1, ymm2 and ymm5 (in that memory order), writes 64 bytes and advances dst_argb by 64.
+#define STOREARGB_AVX2                                                         \
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
+    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
+    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n"        \
+    "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n"        \
+    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n"        \
+    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n"        \
+    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "                    \n"        \
+    "vmovdqu    %%ymm0," MEMACCESS2(0x20, [dst_argb]) "             \n"        \
+    "lea       " MEMLEA(0x40, [dst_argb]) ", %[dst_argb]            \n"
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// Converts one row of separate Y, U and V planes to ARGB, 16 pixels per loop iteration.
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)  // x86-64: preload constants into ymm8-ymm14; empty on 32-bit
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes the offset (v_buf - u_buf)
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all 0xff; STOREARGB_AVX2 weaves it in as the 4th byte of each pixel
+    LABELALIGN
+  "1:                                          \n"
+    READYUV444_AVX2  // defined elsewhere; presumably loads UV into ymm0 and Y into ymm4 - confirm
+    YUVTORGB_AVX2(yuvconstants)  // leaves three packed 8-bit channels in ymm0, ymm1, ymm2
+    STOREARGB_AVX2  // writes 64 bytes and advances dst_argb
+    "sub       $0x10,%[width]                  \n"  // 16 pixels consumed
+    "jg        1b                              \n"  // loop while pixels remain; always works in whole groups of 16
+    "vzeroupper                                \n"  // clear upper ymm halves before returning
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I444TOARGBROW_AVX2
+
+#ifdef HAS_I411TOARGBROW_AVX2
+// Converts one row of separate Y, U and V planes to ARGB, 16 pixels per loop iteration.
+// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)  // x86-64: preload constants into ymm8-ymm14; empty on 32-bit
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes the offset (v_buf - u_buf)
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all 0xff; used by STOREARGB_AVX2 as the 4th byte of each pixel
+    LABELALIGN
+  "1:                                          \n"
+    READYUV411_AVX2  // defined elsewhere; presumably loads upsampled UV into ymm0 and Y into ymm4 - confirm
+    YUVTORGB_AVX2(yuvconstants)  // leaves three packed 8-bit channels in ymm0, ymm1, ymm2
+    STOREARGB_AVX2  // writes 64 bytes and advances dst_argb
+    "sub       $0x10,%[width]                  \n"  // 16 pixels consumed
+    "jg        1b                              \n"  // loop while pixels remain; always works in whole groups of 16
+    "vzeroupper                                \n"  // clear upper ymm halves before returning
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I411TOARGBROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_AVX2)
+// Converts one row of separate Y, U and V planes to ARGB, 16 pixels per loop iteration.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)  // x86-64: preload constants into ymm8-ymm14; empty on 32-bit
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes the offset (v_buf - u_buf)
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all 0xff; used by STOREARGB_AVX2 as the 4th byte of each pixel
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2  // defined elsewhere; presumably loads upsampled UV into ymm0 and Y into ymm4 - confirm
+    YUVTORGB_AVX2(yuvconstants)  // leaves three packed 8-bit channels in ymm0, ymm1, ymm2
+    STOREARGB_AVX2  // writes 64 bytes and advances dst_argb
+    "sub       $0x10,%[width]                  \n"  // 16 pixels consumed
+    "jg        1b                              \n"  // loop while pixels remain; always works in whole groups of 16
+    "vzeroupper                                \n"  // clear upper ymm halves before returning
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I422TOARGBROW_AVX2
+
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+// Converts one row of Y, U, V and A planes to ARGB, 16 pixels per loop iteration.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               const uint8* a_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)  // x86-64: preload constants into ymm8-ymm14; empty on 32-bit
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes the offset (v_buf - u_buf)
+    LABELALIGN
+  "1:                                          \n"
+    READYUVA422_AVX2  // defined elsewhere; presumably also loads alpha from a_buf into ymm5 (ymm5 is not set here) - confirm
+    YUVTORGB_AVX2(yuvconstants)  // leaves three packed 8-bit channels in ymm0, ymm1, ymm2
+    STOREARGB_AVX2  // weaves ymm5 in as the 4th byte; writes 64 bytes and advances dst_argb
+    "subl      $0x10,%[width]                  \n"  // explicit 32-bit size because width may be a memory operand
+    "jg        1b                              \n"  // loop while pixels remain; always works in whole groups of 16
+    "vzeroupper                                \n"  // clear upper ymm halves before returning
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [a_buf]"+r"(a_buf),    // %[a_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+#if defined(__i386__) && defined(__pic__)  // force width into memory; presumably too few free registers in 32-bit PIC builds
+    [width]"+m"(width)     // %[width]
+#else
+    [width]"+rm"(width)    // %[width]
+#endif
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I422ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I422TORGBAROW_AVX2)
+// Converts one row of separate Y, U and V planes to RGBA, 16 pixels per loop iteration.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)  // x86-64: preload constants into ymm8-ymm14; empty on 32-bit
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf becomes the offset (v_buf - u_buf)
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff, the constant first byte of each output pixel
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422_AVX2  // defined elsewhere; presumably loads upsampled UV into ymm0 and Y into ymm4 - confirm
+    YUVTORGB_AVX2(yuvconstants)  // leaves three packed 8-bit channels in ymm0, ymm1, ymm2
+
+    // Step 3: Weave into RGBA; per-pixel memory byte order is ymm5 (0xff), ymm0, ymm1, ymm2.
+    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"  // byte pairs (ymm1, ymm2)
+    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
+    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"  // byte pairs (0xff, ymm0)
+    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
+    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"  // first 8 pixels
+    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"  // last 8 pixels
+    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
+    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"  // dst_argb += 64
+    "sub       $0x10,%[width]                  \n"  // 16 pixels consumed
+    "jg        1b                              \n"  // loop while pixels remain; always works in whole groups of 16
+    "vzeroupper                                \n"  // clear upper ymm halves before returning
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I422TORGBAROW_AVX2
+
+#if defined(HAS_NV12TOARGBROW_AVX2)
+// Converts one row of a Y plane plus an interleaved UV plane to ARGB, 16 pixels per loop iteration.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* uv_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)  // x86-64: preload constants into ymm8-ymm14; empty on 32-bit
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff; used by STOREARGB_AVX2 as the 4th byte of each pixel
+    LABELALIGN
+  "1:                                          \n"
+    READNV12_AVX2  // defined elsewhere; presumably loads upsampled UV into ymm0 and Y into ymm4 - confirm
+    YUVTORGB_AVX2(yuvconstants)  // leaves three packed 8-bit channels in ymm0, ymm1, ymm2
+    STOREARGB_AVX2  // writes 64 bytes and advances dst_argb
+    "sub       $0x10,%[width]                  \n"  // 16 pixels consumed
+    "jg        1b                              \n"  // loop while pixels remain; always works in whole groups of 16
+    "vzeroupper                                \n"  // clear upper ymm halves before returning
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_NV12TOARGBROW_AVX2
+
+#if defined(HAS_NV21TOARGBROW_AVX2)
+// Converts one row of a Y plane plus an interleaved VU plane to ARGB, 16 pixels per loop iteration.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* vu_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)  // x86-64: preload constants into ymm8-ymm14; empty on 32-bit
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff; used by STOREARGB_AVX2 as the 4th byte of each pixel
+    LABELALIGN
+  "1:                                          \n"
+    READNV21_AVX2  // defined elsewhere; presumably uses %[kShuffleNV21] to reorder VU into UV in ymm0 - confirm
+    YUVTORGB_AVX2(yuvconstants)  // leaves three packed 8-bit channels in ymm0, ymm1, ymm2
+    STOREARGB_AVX2  // writes 64 bytes and advances dst_argb
+    "sub       $0x10,%[width]                  \n"  // 16 pixels consumed
+    "jg        1b                              \n"  // loop while pixels remain; always works in whole groups of 16
+    "vzeroupper                                \n"  // clear upper ymm halves before returning
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleNV21]"m"(kShuffleNV21)
+    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_NV21TOARGBROW_AVX2
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+// Converts one row of packed YUY2 to ARGB, 16 pixels (32 source bytes) per loop iteration.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)  // x86-64: preload constants into ymm8-ymm14; empty on 32-bit
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff; used by STOREARGB_AVX2 as the 4th byte of each pixel
+    LABELALIGN
+  "1:                                          \n"
+    READYUY2_AVX2  // shuffles 32 source bytes into Y (ymm4) and UV (ymm0), advances yuy2_buf by 32
+    YUVTORGB_AVX2(yuvconstants)  // leaves three packed 8-bit channels in ymm0, ymm1, ymm2
+    STOREARGB_AVX2  // writes 64 bytes and advances dst_argb
+    "sub       $0x10,%[width]                  \n"  // 16 pixels consumed
+    "jg        1b                              \n"  // loop while pixels remain; always works in whole groups of 16
+    "vzeroupper                                \n"  // clear upper ymm halves before returning
+  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_YUY2TOARGBROW_AVX2
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+// Converts one row of packed UYVY to ARGB, 16 pixels (32 source bytes) per loop iteration.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
+                               uint8* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)  // x86-64: preload constants into ymm8-ymm14; empty on 32-bit
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all 0xff; used by STOREARGB_AVX2 as the 4th byte of each pixel
+    LABELALIGN
+  "1:                                          \n"
+    READUYVY_AVX2  // shuffles 32 source bytes into Y (ymm4) and UV (ymm0), advances uyvy_buf by 32
+    YUVTORGB_AVX2(yuvconstants)  // leaves three packed 8-bit channels in ymm0, ymm1, ymm2
+    STOREARGB_AVX2  // writes 64 bytes and advances dst_argb
+    "sub       $0x10,%[width]                  \n"  // 16 pixels consumed
+    "jg        1b                              \n"  // loop while pixels remain; always works in whole groups of 16
+    "vzeroupper                                \n"  // clear upper ymm halves before returning
+  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+    [kShuffleUYVYY]"m"(kShuffleUYVYY),
+    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_UYVYTOARGBROW_AVX2
+
+#ifdef HAS_I400TOARGBROW_SSE2  // Expands a row of Y to ARGB: the scaled Y fills the three low bytes, the 4th byte is 0xff; 8 pixels per iteration.
+void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
+  asm volatile (
+    "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
+    "movd      %%eax,%%xmm2                    \n"
+    "pshufd    $0x0,%%xmm2,%%xmm2              \n"  // xmm2 = 0x4a35 in every 16-bit lane
+    "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
+    "movd      %%eax,%%xmm3                    \n"
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"  // xmm3 = 0x0488 in every 16-bit lane
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000 per dword: mask for the top byte
+    LABELALIGN
+  "1:                                          \n"
+    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 Y bytes
+    "lea       " MEMLEA(0x8,0) ",%0            \n"  // y_buf += 8
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // duplicate each byte into a 16-bit word (y * 0x0101)
+    "pmulhuw   %%xmm2,%%xmm0                   \n"  // high 16 bits of word * 0x4a35
+    "psubusw   %%xmm3,%%xmm0                   \n"  // subtract 0x0488 with unsigned saturation
+    "psrlw     $6, %%xmm0                      \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"  // back to 8-bit with saturation
+
+    // Step 2: Weave into ARGB
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // each value twice
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm0,%%xmm0                   \n"  // pixels 0-3: value in all four bytes
+    "punpckhwd %%xmm1,%%xmm1                   \n"  // pixels 4-7
+    "por       %%xmm4,%%xmm0                   \n"  // force the top byte to 0xff
+    "por       %%xmm4,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_argb += 32
+
+    "sub       $0x8,%2                         \n"  // 8 pixels consumed
+    "jg        1b                              \n"  // loop while pixels remain; always works in whole groups of 8
+  : "+r"(y_buf),     // %0
+    "+r"(dst_argb),  // %1
+    "+rm"(width)     // %2
+  :
+  : "memory", "cc", "eax"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+  );
+}
+#endif  // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+  asm volatile (
+    "mov        $0x4a354a35,%%eax              \n" // 4a35 = 18997 = 1.164
+    "vmovd      %%eax,%%xmm2                   \n"
+    "vbroadcastss %%xmm2,%%ymm2                \n"
+    "mov        $0x4880488,%%eax               \n" // 0488 = 1160 = 1.164 * 16
+    "vmovd      %%eax,%%xmm3                   \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpslld     $0x18,%%ymm4,%%ymm4            \n"  // ymm4 = 0xff000000 per dword: mask for the top byte
+
+    LABELALIGN
+  "1:                                          \n"
+    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+    "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"  // load 16 Y bytes
+    "lea        " MEMLEA(0x10,0) ",%0          \n"  // y_buf += 16
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // put 8 Y bytes in the low half of each 128-bit lane
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"  // duplicate each byte into a 16-bit word (y * 0x0101)
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"  // high 16 bits of word * 0x4a35
+    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"  // subtract 0x0488 with unsigned saturation
+    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
+    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"  // back to 8-bit with saturation
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"  // each value twice
+    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"  // regroup across lanes before the word unpack
+    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"  // value in all four bytes of a pixel
+    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
+    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"  // force the top byte to 0xff
+    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"  // dst_argb += 64
+    "sub        $0x10,%2                       \n"  // 16 pixels consumed
+    "jg        1b                              \n"  // loop while pixels remain; always works in whole groups of 16
+    "vzeroupper                                \n"  // clear upper ymm halves before returning
+  : "+r"(y_buf),     // %0
+    "+r"(dst_argb),  // %1
+    "+rm"(width)     // %2
+  :
+  : "memory", "cc", "eax"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+  );
+}
+#endif  // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static uvec8 kShuffleMirror = {
+  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {  // Writes src reversed into dst, 16 bytes per iteration.
+  intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy so it can serve as an index register
+  asm volatile (
+    "movdqa    %3,%%xmm5                       \n"  // xmm5 = byte-reversal table (aligned load)
+    LABELALIGN
+  "1:                                          \n"
+    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
+    "pshufb    %%xmm5,%%xmm0                   \n"  // reverse the 16 bytes
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst += 16
+    "sub       $0x10,%2                        \n"  // the read position walks backwards from the end of src
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  : "m"(kShuffleMirror) // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm5"
+  );
+}
+#endif  // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2  // Writes src reversed into dst, 32 bytes per iteration.
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy so it can serve as an index register
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm5                  \n"  // 16-byte reversal table copied into both 128-bit lanes
+    LABELALIGN
+  "1:                                          \n"
+    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
+    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"  // reverse bytes within each lane
+    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"  // swap the two lanes to finish the 32-byte reversal
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst += 32
+    "sub       $0x20,%2                        \n"  // the read position walks backwards from the end of src
+    "jg        1b                              \n"
+    "vzeroupper                                \n"  // clear upper ymm halves before returning
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  : "m"(kShuffleMirror) // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm5"
+  );
+}
+#endif  // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the bytes of UV channels: even source bytes reversed into the low 8, odd source bytes reversed into the high 8.
+static uvec8 kShuffleMirrorUV = {
+  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+                       int width) {  // Reverses an interleaved UV row and splits it into dst_u / dst_v, 8 pairs per iteration.
+  intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy so it can serve as an index register
+  asm volatile (
+    "movdqa    %4,%%xmm1                       \n"  // xmm1 = shuffle table (aligned load)
+    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"  // presumably -0x10(%0,%3,2): src -> last 16 bytes of the row - confirm macro
+    "sub       %1,%2                           \n"  // dst_v becomes the offset (dst_v - dst_u)
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 8 UV pairs
+    "lea       " MEMLEA(-0x10,0) ",%0          \n"  // src -= 16
+    "pshufb    %%xmm1,%%xmm0                   \n"  // low half = reversed even bytes, high half = reversed odd bytes
+    "movlpd    %%xmm0," MEMACCESS(1) "         \n"  // low 8 bytes -> dst_u
+    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (dst_v follows through the offset)
+    "sub       $8,%3                           \n"  // 8 pairs consumed
+    "jg        1b                              \n"
+  : "+r"(src),      // %0
+    "+r"(dst_u),    // %1
+    "+r"(dst_v),    // %2
+    "+r"(temp_width)  // %3
+  : "m"(kShuffleMirrorUV)  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSE2  // Reverses the order of 4-byte pixels in a row, 4 pixels per iteration.
+
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy so it can serve as an index register
+  asm volatile (
+    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"  // presumably -0x10(%0,%2,4): src -> last 4 pixels of the row - confirm macro
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 pixels
+    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"  // reverse the four dwords
+    "lea       " MEMLEA(-0x10,0) ",%0          \n"  // src -= 16
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst += 16
+    "sub       $0x4,%2                         \n"  // 4 pixels consumed
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0"
+  );
+}
+#endif  // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Dword index table for vpermd: reverses the order of the eight 32-bit elements.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {  // Reverses the order of 4-byte pixels in a row, 8 pixels per iteration.
+  intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy so it can serve as an index register
+  asm volatile (
+    "vmovdqu    %3,%%ymm5                      \n"  // ymm5 = dword index table
+    LABELALIGN
+  "1:                                          \n"
+    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"  // dst += 32
+    "sub        $0x8,%2                        \n"  // 8 pixels consumed; the read position walks backwards
+    "jg         1b                             \n"
+    "vzeroupper                                \n"  // clear upper ymm halves before returning
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  : "m"(kARGBShuffleMirror_AVX2) // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm5"
+  );
+}
+#endif  // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_AVX2  // De-interleaves a UV row: even bytes to dst_u, odd bytes to dst_v; 32 pairs per iteration.
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  asm volatile (
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
+    "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"  // ymm5 = 0x00ff per word: mask for the even bytes
+    "sub        %1,%2                            \n"  // dst_v becomes the offset (dst_v - dst_u)
+    LABELALIGN
+  "1:                                            \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
+    "lea        " MEMLEA(0x40,0) ",%0            \n"  // src_uv += 64
+    "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"  // odd bytes moved down into the low byte of each word
+    "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
+    "vpand      %%ymm5,%%ymm0,%%ymm0             \n"  // keep the even bytes
+    "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"  // 32 even bytes (per-lane order)
+    "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"  // 32 odd bytes (per-lane order)
+    "vpermq     $0xd8,%%ymm0,%%ymm0              \n"  // restore linear order across lanes
+    "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
+    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
+    "lea        " MEMLEA(0x20,1) ",%1            \n"  // dst_u += 32 (dst_v follows through the offset)
+    "sub        $0x20,%3                         \n"  // 32 pairs consumed
+    "jg         1b                               \n"
+    "vzeroupper                                  \n"  // clear upper ymm halves before returning
+  : "+r"(src_uv),     // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+r"(width)         // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2  // De-interleaves a UV row: even bytes to dst_u, odd bytes to dst_v; 16 pairs per iteration.
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  asm volatile (
+    "pcmpeqb    %%xmm5,%%xmm5                    \n"
+    "psrlw      $0x8,%%xmm5                      \n"  // xmm5 = 0x00ff per word: mask for the even bytes
+    "sub        %1,%2                            \n"  // dst_v becomes the offset (dst_v - dst_u)
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
+    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
+    "lea        " MEMLEA(0x20,0) ",%0            \n"  // src_uv += 32
+    "movdqa     %%xmm0,%%xmm2                    \n"
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "pand       %%xmm5,%%xmm0                    \n"  // keep the even bytes
+    "pand       %%xmm5,%%xmm1                    \n"
+    "packuswb   %%xmm1,%%xmm0                    \n"  // 16 even bytes
+    "psrlw      $0x8,%%xmm2                      \n"  // odd bytes moved down into the low byte of each word
+    "psrlw      $0x8,%%xmm3                      \n"
+    "packuswb   %%xmm3,%%xmm2                    \n"  // 16 odd bytes
+    "movdqu     %%xmm0," MEMACCESS(1) "          \n"
+    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
+    "lea        " MEMLEA(0x10,1) ",%1            \n"  // dst_u += 16 (dst_v follows through the offset)
+    "sub        $0x10,%3                         \n"  // 16 pairs consumed
+    "jg         1b                               \n"
+  : "+r"(src_uv),     // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+r"(width)         // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2  // Interleaves src_u and src_v bytes into dst_uv (U first), 32 pairs per iteration.
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  asm volatile (
+    "sub       %0,%1                             \n"  // src_v becomes the offset (src_v - src_u)
+    LABELALIGN
+  "1:                                            \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"  // 32 bytes from src_u
+    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
+    "lea       " MEMLEA(0x20,0) ",%0             \n"  // src_u += 32 (src_v follows through the offset)
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"  // interleave the low 8 bytes of each lane
+    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"  // interleave the high 8 bytes of each lane
+    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"  // the four 16-byte stores put the lanes back in linear order
+    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
+    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
+    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
+    "lea       " MEMLEA(0x40,2) ",%2             \n"  // dst_uv += 64
+    "sub       $0x20,%3                          \n"  // 32 pairs consumed
+    "jg        1b                                \n"
+    "vzeroupper                                  \n"  // clear upper ymm halves before returning
+  : "+r"(src_u),     // %0
+    "+r"(src_v),     // %1
+    "+r"(dst_uv),    // %2
+    "+r"(width)      // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2  // Interleaves src_u and src_v bytes into dst_uv (U first), 16 pairs per iteration.
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  asm volatile (
+    "sub       %0,%1                             \n"  // src_v becomes the offset (src_v - src_u)
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"  // 16 bytes from src_u
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0             \n"  // src_u += 16 (src_v follows through the offset)
+    "movdqa    %%xmm0,%%xmm2                     \n"
+    "punpcklbw %%xmm1,%%xmm0                     \n"  // first 8 pairs
+    "punpckhbw %%xmm1,%%xmm2                     \n"  // last 8 pairs
+    "movdqu    %%xmm0," MEMACCESS(2) "           \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
+    "lea       " MEMLEA(0x20,2) ",%2             \n"  // dst_uv += 32
+    "sub       $0x10,%3                          \n"  // 16 pairs consumed
+    "jg        1b                                \n"
+  : "+r"(src_u),     // %0
+    "+r"(src_v),     // %1
+    "+r"(dst_uv),    // %2
+    "+r"(width)      // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_COPYROW_SSE2  // Copies count bytes, 32 per iteration; aligned moves are used only when src and dst are both 16-byte aligned.
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+  asm volatile (
+    "test       $0xf,%0                        \n"  // src 16-byte aligned?
+    "jne        2f                             \n"
+    "test       $0xf,%1                        \n"  // dst 16-byte aligned?
+    "jne        2f                             \n"
+    LABELALIGN
+  "1:                                          \n"  // aligned loop (movdqa)
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "jmp       9f                              \n"  // skip the unaligned loop
+    LABELALIGN
+  "2:                                          \n"  // unaligned loop (movdqu)
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        2b                              \n"
+  "9:                                          \n"  // common exit
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX  // Copies count bytes with unaligned 256-bit moves, 64 bytes per iteration.
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src += 64
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"  // dst += 64
+    "sub       $0x40,%2                        \n"  // 64 bytes consumed
+    "jg        1b                              \n"  // loop while bytes remain; always works in whole groups of 64
+    "vzeroupper                                \n"  // clear upper ymm halves, as every other ymm routine here does, to avoid AVX/SSE transition stalls
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2
+  :
+  : "memory", "cc", "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_COPYROW_AVX
+
+#ifdef HAS_COPYROW_ERMS
+// Multiple of 1: rep movsb copies one byte at a time, so width needs no rounding.
+void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
+  size_t width_tmp = (size_t)(width);  // byte count, widened for the count-register operand
+  asm volatile (
+    "rep movsb " MEMMOVESTRING(0,1) "          \n"  // copy 'c' bytes from the 'S' pointer to the 'D' pointer
+  : "+S"(src),  // %0
+    "+D"(dst),  // %1
+    "+c"(width_tmp) // %2
+  :
+  : "memory", "cc"
+  );
+}
+#endif  // HAS_COPYROW_ERMS
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels (4 bytes each); 8 pixels are processed per loop pass.
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Replace the top byte of every 32-bit dst pixel with the top byte of the matching src pixel.
+    "pcmpeqb   %%xmm0,%%xmm0                   \n"  // xmm0 = all ones
+    "pslld     $0x18,%%xmm0                    \n"  // xmm0 = 0xff000000 per dword (top-byte mask)
+    "pcmpeqb   %%xmm1,%%xmm1                   \n"  // xmm1 = all ones
+    "psrld     $0x8,%%xmm1                     \n"  // xmm1 = 0x00ffffff per dword (low-3-bytes mask)
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // src pixels 0..3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"  // src pixels 4..7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src += 32
+    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"  // dst pixels 0..3
+    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"  // dst pixels 4..7
+    "pand      %%xmm0,%%xmm2                   \n"  // keep only src top bytes
+    "pand      %%xmm0,%%xmm3                   \n"
+    "pand      %%xmm1,%%xmm4                   \n"  // keep only dst low 3 bytes
+    "pand      %%xmm1,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm2                   \n"  // merge
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst += 32
+    "sub       $0x8,%2                         \n"  // width -= 8 pixels
+    "jg        1b                              \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels (4 bytes each); 16 pixels are processed per loop pass.
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Replace the top byte of every 32-bit dst pixel with the top byte of the matching src pixel.
+    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"  // ymm0 = all ones
+    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = 0x00ffffff per dword (blend selector)
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"  // src pixels 0..7
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"  // src pixels 8..15
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src += 64
+    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"  // selector byte set: take dst byte; clear (top byte): keep src byte
+    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
+    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"  // dst += 64
+    "sub       $0x10,%2                        \n"  // width -= 16 pixels
+    "jg        1b                              \n"
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels; each pass reads 8 src bytes and updates 8 dst pixels (32 bytes).
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Write each src byte into the top byte of the matching 32-bit dst pixel, keeping dst's low 3 bytes.
+    "pcmpeqb   %%xmm0,%%xmm0                   \n"  // xmm0 = all ones
+    "pslld     $0x18,%%xmm0                    \n"  // xmm0 = 0xff000000 per dword (top-byte mask)
+    "pcmpeqb   %%xmm1,%%xmm1                   \n"  // xmm1 = all ones
+    "psrld     $0x8,%%xmm1                     \n"  // xmm1 = 0x00ffffff per dword (low-3-bytes mask)
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm2         \n"  // 8 src bytes
+    "lea       " MEMLEA(0x8,0) ",%0            \n"  // src += 8
+    "punpcklbw %%xmm2,%%xmm2                   \n"  // each byte doubled into a word
+    "punpckhwd %%xmm2,%%xmm3                   \n"  // bytes 4..7 into the high word of each xmm3 dword; stale xmm3 data sits in the low words and is masked off below
+    "punpcklwd %%xmm2,%%xmm2                   \n"  // bytes 0..3 replicated across each xmm2 dword
+    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"  // dst pixels 0..3
+    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"  // dst pixels 4..7
+    "pand      %%xmm0,%%xmm2                   \n"  // keep only the top byte (the src value)
+    "pand      %%xmm0,%%xmm3                   \n"
+    "pand      %%xmm1,%%xmm4                   \n"  // keep only dst low 3 bytes
+    "pand      %%xmm1,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm2                   \n"  // merge
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst += 32
+    "sub       $0x8,%2                         \n"  // width -= 8 pixels
+    "jg        1b                              \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels; each pass reads 16 src bytes and updates 16 dst pixels (64 bytes).
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Write each src byte into the top byte of the matching 32-bit dst pixel, keeping dst's low 3 bytes.
+    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"  // ymm0 = all ones
+    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = 0x00ffffff per dword (blend selector)
+    LABELALIGN
+  "1:                                          \n"
+    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"  // src bytes 0..7 zero-extended to dwords
+    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"  // src bytes 8..15 zero-extended to dwords
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src += 16
+    "vpslld    $0x18,%%ymm1,%%ymm1             \n"  // move each value into the top byte of its dword
+    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
+    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"  // selector byte set: take dst byte; clear (top byte): keep shifted src value
+    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
+    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"  // dst += 64
+    "sub       $0x10,%2                        \n"  // width -= 16 pixels
+    "jg        1b                              \n"
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+void SetRow_X86(uint8* dst, uint8 v8, int width) {  // Fill dst with v8 using 4-byte stores.
+  size_t width_tmp = (size_t)(width >> 2);  // Number of 4-byte stores; any width % 4 trailing bytes are not written here.
+  const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
+  asm volatile (
+    "rep stosl " MEMSTORESTRING(eax,0) "       \n"  // store eax at the 'D' pointer, 'c' times, 4 bytes each
+    : "+D"(dst),       // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v32)         // %2
+    : "memory", "cc");
+}
+
+void SetRow_ERMS(uint8* dst, uint8 v8, int width) {  // Fill width bytes of dst with v8, one byte per store.
+  size_t width_tmp = (size_t)(width);  // byte count, widened for the count-register operand
+  asm volatile (
+    "rep stosb " MEMSTORESTRING(al,0) "        \n"  // store al at the 'D' pointer, 'c' times
+    : "+D"(dst),       // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v8)          // %2
+    : "memory", "cc");
+}
+
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {  // Fill dst_argb with width copies of the 32-bit value v32.
+  size_t width_tmp = (size_t)(width);  // count of 4-byte stores, widened for the count-register operand
+  asm volatile (
+    "rep stosl " MEMSTORESTRING(eax,0) "       \n"  // store eax at the 'D' pointer, 'c' times, 4 bytes each
+    : "+D"(dst_argb),  // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v32)         // %2
+    : "memory", "cc");
+}
+#endif  // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
+  asm volatile (  // Copy the even-numbered bytes of src_yuy2 to dst_y: 32 bytes in, 16 bytes out per pass.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
+    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word (low-byte mask)
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src += 32
+    "pand      %%xmm5,%%xmm0                   \n"  // keep the low (even) byte of each word
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 words -> 16 bytes
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16
+    "sub       $0x10,%2                        \n"  // width -= 16
+    "jg        1b                              \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Average two rows (src and src + stride), then split the odd-numbered bytes alternately into dst_u and dst_v; 8 bytes to each per pass.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
+    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word (low-byte mask)
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src += 32
+    "pavgb     %%xmm2,%%xmm0                   \n"  // rounded byte average of the two rows
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"  // keep the high (odd) byte of each word
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 interleaved bytes
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"  // even bytes of the interleave -> dst_u
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"  // odd bytes of the interleave -> dst_v
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // 8 bytes to dst_u
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (dst_v follows via the offset)
+    "sub       $0x10,%3                        \n"  // width -= 16
+    "jg        1b                              \n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  : "r"((intptr_t)(stride_yuy2))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Single-row variant: split the odd-numbered bytes of src_yuy2 alternately into dst_u and dst_v; 8 bytes to each per pass.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
+    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word (low-byte mask)
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src += 32
+    "psrlw     $0x8,%%xmm0                     \n"  // keep the high (odd) byte of each word
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 interleaved bytes
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"  // even bytes of the interleave -> dst_u
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"  // odd bytes of the interleave -> dst_v
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // 8 bytes to dst_u
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (dst_v follows via the offset)
+    "sub       $0x10,%3                        \n"  // width -= 16
+    "jg        1b                              \n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
+  asm volatile (  // Copy the odd-numbered bytes of src_uyvy to dst_y: 32 bytes in, 16 bytes out per pass.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src += 32
+    "psrlw     $0x8,%%xmm0                     \n"  // keep the high (odd) byte of each word
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 words -> 16 bytes
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16
+    "sub       $0x10,%2                        \n"  // width -= 16
+    "jg        1b                              \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Average two rows (src and src + stride), then split the even-numbered bytes alternately into dst_u and dst_v; 8 bytes to each per pass.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
+    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word (low-byte mask)
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src += 32
+    "pavgb     %%xmm2,%%xmm0                   \n"  // rounded byte average of the two rows
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"  // keep the low (even) byte of each word
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 interleaved bytes
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"  // even bytes of the interleave -> dst_u
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"  // odd bytes of the interleave -> dst_v
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // 8 bytes to dst_u
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (dst_v follows via the offset)
+    "sub       $0x10,%3                        \n"  // width -= 16
+    "jg        1b                              \n"
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  : "r"((intptr_t)(stride_uyvy))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Single-row variant: split the even-numbered bytes of src_uyvy alternately into dst_u and dst_v; 8 bytes to each per pass.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
+    "psrlw     $0x8,%%xmm5                     \n"  // xmm5 = 0x00ff per word (low-byte mask)
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src += 32
+    "pand      %%xmm5,%%xmm0                   \n"  // keep the low (even) byte of each word
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 interleaved bytes
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"  // even bytes of the interleave -> dst_u
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"  // odd bytes of the interleave -> dst_v
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // 8 bytes to dst_u
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8 (dst_v follows via the offset)
+    "sub       $0x10,%3                        \n"  // width -= 16
+    "jg        1b                              \n"
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_YUY2TOYROW_AVX2
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
+  asm volatile (  // Copy the even-numbered bytes of src_yuy2 to dst_y: 64 bytes in, 32 bytes out per pass.
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all ones
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff per word (low-byte mask)
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src += 64
+    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"  // keep the low (even) byte of each word
+    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // packs within each 128-bit lane
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // qword order 0,2,1,3: undo the per-lane interleave
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "lea      " MEMLEA(0x20,1) ",%1            \n"  // dst_y += 32
+    "sub       $0x20,%2                        \n"  // width -= 32
+    "jg        1b                              \n"
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Average two rows (src and src + stride), then split the odd-numbered bytes alternately into dst_u and dst_v; 16 bytes to each per pass.
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all ones
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff per word (low-byte mask)
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)     // presumably vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 (by analogy with the line above)
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src += 64
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // keep the high (odd) byte of each word
+    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // packs within each 128-bit lane
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // qword order 0,2,1,3: undo the per-lane interleave
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // ymm1 = even bytes of the interleave (-> dst_u)
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = odd bytes of the interleave (-> dst_v)
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"  // gather the 16 packed bytes into the low 128 bits
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"  // 16 bytes to dst_u
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+    "lea      " MEMLEA(0x10,1) ",%1            \n"  // dst_u += 16 (dst_v follows via the offset)
+    "sub       $0x20,%3                        \n"  // width -= 32
+    "jg        1b                              \n"
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  : "r"((intptr_t)(stride_yuy2))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Single-row variant: split the odd-numbered bytes of src_yuy2 alternately into dst_u and dst_v; 16 bytes to each per pass.
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all ones
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff per word (low-byte mask)
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src += 64
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // keep the high (odd) byte of each word
+    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // packs within each 128-bit lane
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // qword order 0,2,1,3: undo the per-lane interleave
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // ymm1 = even bytes of the interleave (-> dst_u)
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = odd bytes of the interleave (-> dst_v)
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"  // gather the 16 packed bytes into the low 128 bits
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"  // 16 bytes to dst_u
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+    "lea      " MEMLEA(0x10,1) ",%1            \n"  // dst_u += 16 (dst_v follows via the offset)
+    "sub       $0x20,%3                        \n"  // width -= 32
+    "jg        1b                              \n"
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
+  asm volatile (  // Copy the odd-numbered bytes of src_uyvy to dst_y: 64 bytes in, 32 bytes out per pass.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src += 64
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // keep the high (odd) byte of each word
+    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // packs within each 128-bit lane
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // qword order 0,2,1,3: undo the per-lane interleave
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "lea      " MEMLEA(0x20,1) ",%1            \n"  // dst_y += 32
+    "sub       $0x20,%2                        \n"  // width -= 32
+    "jg        1b                              \n"
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"  // xmm5 is declared clobbered although this routine never writes it
+  );
+}
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Average two rows (src and src + stride), then split the even-numbered bytes alternately into dst_u and dst_v; 16 bytes to each per pass.
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = all ones
+    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"  // ymm5 = 0x00ff per word (low-byte mask)
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)     // presumably vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 (by analogy with the line above)
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src += 64
+    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"  // keep the low (even) byte of each word
+    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // packs within each 128-bit lane
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // qword order 0,2,1,3: undo the per-lane interleave
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // ymm1 = even bytes of the interleave (-> dst_u)
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = odd bytes of the interleave (-> dst_v)
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"  // gather the 16 packed bytes into the low 128 bits
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"  // 16 bytes to dst_u
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+    "lea      " MEMLEA(0x10,1) ",%1            \n"  // dst_u += 16 (dst_v follows via the offset)
+    "sub       $0x20,%3                        \n"  // width -= 32
+    "jg        1b                              \n"
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  : "r"((intptr_t)(stride_uyvy))  // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Single-row variant: split the even-numbered bytes of src_uyvy alternately into dst_u and dst_v; 16 bytes to each per pass.
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones
+    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"  // ymm5 = 0x00ff per word (low-byte mask)
+    "sub       %1,%2                           \n"  // dst_v becomes an offset from dst_u
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src += 64
+    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"  // keep the low (even) byte of each word
+    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // packs within each 128-bit lane
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // qword order 0,2,1,3: undo the per-lane interleave
+    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"  // ymm1 = even bytes of the interleave (-> dst_u)
+    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // ymm0 = odd bytes of the interleave (-> dst_v)
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"  // gather the 16 packed bytes into the low 128 bits
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"  // 16 bytes to dst_u
+    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+    "lea      " MEMLEA(0x10,1) ",%1            \n"  // dst_u += 16 (dst_v follows via the offset)
+    "sub       $0x20,%3                        \n"  // width -= 32
+    "jg        1b                              \n"
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(width)          // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha: pshufb control that copies byte 3 of each 4-byte pixel into the low byte of both of that pixel's 16-bit words.
+static uvec8 kShuffleAlpha = {
+  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,  // pixels 0 and 1; 0x80 makes pshufb write a zero byte
+  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80  // pixels 2 and 3
+};
+
+// Blend 4 pixels at a time, then 1 pixel at a time for the remainder.
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width) {
+  asm volatile (  // dst = src_argb0 (top byte forced to 0xff) + ((src_argb1 * (256 - top byte of src_argb0)) >> 8), with saturating byte adds.
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = all ones
+    "psrlw     $0xf,%%xmm7                     \n"  // xmm7 = 0x0001 per word
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"  // xmm6 = all ones
+    "psrlw     $0x8,%%xmm6                     \n"  // xmm6 = 0x00ff per word (even-byte mask)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
+    "psllw     $0x8,%%xmm5                     \n"  // xmm5 = 0xff00 per word (odd-byte mask)
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = all ones
+    "pslld     $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000 per dword (top-byte mask)
+    "sub       $0x4,%3                         \n"  // width -= 4
+    "jl        49f                             \n"  // fewer than 4 pixels: go to the tail
+
+    // 4 pixel loop.
+    LABELALIGN
+  "40:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"  // 4 pixels of src_argb0
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_argb0 += 16
+    "movdqa    %%xmm3,%%xmm0                   \n"  // xmm0 = copy used as the additive term
+    "pxor      %%xmm4,%%xmm3                   \n"  // invert the top byte: 255 - a
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"  // 4 pixels of src_argb1
+    "pshufb    %4,%%xmm3                       \n"  // spread (255 - a) into both words of each pixel
+    "pand      %%xmm6,%%xmm2                   \n"  // src_argb1 even bytes as words
+    "paddw     %%xmm7,%%xmm3                   \n"  // 256 - a
+    "pmullw    %%xmm3,%%xmm2                   \n"  // even bytes * (256 - a)
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"  // reload src_argb1
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // src_argb1 += 16
+    "psrlw     $0x8,%%xmm1                     \n"  // src_argb1 odd bytes as words
+    "por       %%xmm4,%%xmm0                   \n"  // force the top byte of the src_argb0 copy to 0xff
+    "pmullw    %%xmm3,%%xmm1                   \n"  // odd bytes * (256 - a)
+    "psrlw     $0x8,%%xmm2                     \n"  // even products >> 8, back in even byte positions
+    "paddusb   %%xmm2,%%xmm0                   \n"  // saturating add
+    "pand      %%xmm5,%%xmm1                   \n"  // high byte of odd products, already in odd byte positions
+    "paddusb   %%xmm1,%%xmm0                   \n"  // saturating add
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"  // dst_argb += 16
+    "sub       $0x4,%3                         \n"  // width -= 4
+    "jge       40b                             \n"  // at least 4 more pixels remain
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"  // width = remaining pixels - 1
+    "jl        99f                             \n"  // nothing left
+
+    // 1 pixel loop.
+  "91:                                         \n"
+    "movd      " MEMACCESS(0) ",%%xmm3         \n"  // 1 pixel of src_argb0
+    "lea       " MEMLEA(0x4,0) ",%0            \n"  // src_argb0 += 4
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"  // invert the top byte: 255 - a
+    "movd      " MEMACCESS(1) ",%%xmm2         \n"  // 1 pixel of src_argb1
+    "pshufb    %4,%%xmm3                       \n"  // spread (255 - a) into both words of the pixel
+    "pand      %%xmm6,%%xmm2                   \n"  // src_argb1 even bytes as words
+    "paddw     %%xmm7,%%xmm3                   \n"  // 256 - a
+    "pmullw    %%xmm3,%%xmm2                   \n"  // even bytes * (256 - a)
+    "movd      " MEMACCESS(1) ",%%xmm1         \n"  // reload src_argb1
+    "lea       " MEMLEA(0x4,1) ",%1            \n"  // src_argb1 += 4
+    "psrlw     $0x8,%%xmm1                     \n"  // src_argb1 odd bytes as words
+    "por       %%xmm4,%%xmm0                   \n"  // force the top byte of the src_argb0 copy to 0xff
+    "pmullw    %%xmm3,%%xmm1                   \n"  // odd bytes * (256 - a)
+    "psrlw     $0x8,%%xmm2                     \n"  // even products >> 8
+    "paddusb   %%xmm2,%%xmm0                   \n"  // saturating add
+    "pand      %%xmm5,%%xmm1                   \n"  // high byte of odd products
+    "paddusb   %%xmm1,%%xmm0                   \n"  // saturating add
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x4,2) ",%2            \n"  // dst_argb += 4
+    "sub       $0x1,%3                         \n"  // width -= 1
+    "jge       91b                             \n"  // more pixels remain
+  "99:                                         \n"  // exit
+  : "+r"(src_argb0),    // %0
+    "+r"(src_argb1),    // %1
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3
+  : "m"(kShuffleAlpha)  // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_SSSE3
+// Blend 8 pixels at a time.
+// unsigned version of math (A2 = src0 byte, B2 = src1 byte, C2 = alpha byte)
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math (what the code computes; pmaddubsw needs one signed operand)
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
+                         const uint8* alpha, uint8* dst, int width) {
+  asm volatile (
+    "pcmpeqb    %%xmm5,%%xmm5                  \n"  // xmm5 = all ones
+    "psllw      $0x8,%%xmm5                    \n"  // xmm5 = 0xff00 per word
+    "mov        $0x80808080,%%eax              \n"
+    "movd       %%eax,%%xmm6                   \n"
+    "pshufd     $0x0,%%xmm6,%%xmm6             \n"  // xmm6 = 0x80 in every byte (the -128 bias)
+    "mov        $0x807f807f,%%eax              \n"
+    "movd       %%eax,%%xmm7                   \n"
+    "pshufd     $0x0,%%xmm7,%%xmm7             \n"  // xmm7 = 0x807f per word (32768 + 127)
+    "sub        %2,%0                          \n"  // src0 becomes an offset from alpha
+    "sub        %2,%1                          \n"  // src1 becomes an offset from alpha
+    "sub        %2,%3                          \n"  // dst becomes an offset from alpha
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movq       (%2),%%xmm0                    \n"  // 8 alpha bytes
+    "punpcklbw  %%xmm0,%%xmm0                  \n"  // each alpha doubled: (a, a) per word
+    "pxor       %%xmm5,%%xmm0                  \n"  // invert the high byte: (a, 255 - a)
+    "movq       (%0,%2,1),%%xmm1               \n"  // 8 src0 bytes
+    "movq       (%1,%2,1),%%xmm2               \n"  // 8 src1 bytes
+    "punpcklbw  %%xmm2,%%xmm1                  \n"  // interleave: (src0, src1) per word
+    "psubb      %%xmm6,%%xmm1                  \n"  // subtract 128 from every byte
+    "pmaddubsw  %%xmm1,%%xmm0                  \n"  // a*(src0-128) + (255-a)*(src1-128) per word
+    "paddw      %%xmm7,%%xmm0                  \n"  // + 32768 + 127
+    "psrlw      $0x8,%%xmm0                    \n"  // / 256
+    "packuswb   %%xmm0,%%xmm0                  \n"  // words -> bytes
+    "movq       %%xmm0,(%3,%2,1)               \n"  // 8 bytes to dst
+    "lea        0x8(%2),%2                     \n"  // alpha += 8 (moves src0, src1 and dst with it)
+    "sub        $0x8,%4                        \n"  // width -= 8
+    "jg        1b                              \n"
+  : "+r"(src0),       // %0
+    "+r"(src1),       // %1
+    "+r"(alpha),      // %2
+    "+r"(dst),        // %3
+    "+r"(width)       // %4
+  :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_BLENDPLANEROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time.
+// unsigned version of math (A2 = src0 byte, B2 = src1 byte, C2 = alpha byte)
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math (what the code computes; vpmaddubsw needs one signed operand)
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
+                        const uint8* alpha, uint8* dst, int width) {
+  asm volatile (
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones
+    "vpsllw     $0x8,%%ymm5,%%ymm5             \n"  // ymm5 = 0xff00 per word
+    "mov        $0x80808080,%%eax              \n"
+    "vmovd      %%eax,%%xmm6                   \n"
+    "vbroadcastss %%xmm6,%%ymm6                \n"  // ymm6 = 0x80 in every byte (the -128 bias)
+    "mov        $0x807f807f,%%eax              \n"
+    "vmovd      %%eax,%%xmm7                   \n"
+    "vbroadcastss %%xmm7,%%ymm7                \n"  // ymm7 = 0x807f per word (32768 + 127)
+    "sub        %2,%0                          \n"  // src0 becomes an offset from alpha
+    "sub        %2,%1                          \n"  // src1 becomes an offset from alpha
+    "sub        %2,%3                          \n"  // dst becomes an offset from alpha
+
+    // 32 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    (%2),%%ymm0                    \n"  // 32 alpha bytes
+    "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"  // high 8 bytes of each lane doubled: (a, a) per word
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"  // low 8 bytes of each lane doubled
+    "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"  // invert the high byte: (a, 255 - a)
+    "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
+    "vmovdqu    (%0,%2,1),%%ymm1               \n"  // 32 src0 bytes
+    "vmovdqu    (%1,%2,1),%%ymm2               \n"  // 32 src1 bytes
+    "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"  // interleave (src0, src1), high half of each lane
+    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"  // interleave (src0, src1), low half of each lane
+    "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"  // subtract 128 from every byte
+    "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"  // a*(src0-128) + (255-a)*(src1-128) per word
+    "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"  // + 32768 + 127
+    "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
+    "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"  // / 256
+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"  // per-lane pack mirrors the per-lane unpack, so byte order is restored
+    "vmovdqu    %%ymm0,(%3,%2,1)               \n"  // 32 bytes to dst
+    "lea        0x20(%2),%2                    \n"  // alpha += 32 (moves src0, src1 and dst with it)
+    "sub        $0x20,%4                       \n"  // width -= 32
+    "jg        1b                              \n"
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src0),       // %0
+    "+r"(src1),       // %1
+    "+r"(alpha),      // %2
+    "+r"(dst),        // %3
+    "+r"(width)       // %4
+  :: "memory", "cc", "eax",
+     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_BLENDPLANEROW_AVX2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle tables duplicating alpha.
+// Each table copies the alpha byte of a pixel into both bytes of that pixel's
+// B, G and R word lanes; index 128u (high bit set) makes pshufb write zero
+// into the alpha word lane.  The tables are only read, so they are const
+// like the AVX2 shuffle tables in this file.
+// kShuffleAlpha0 covers pixels 0-1 (alpha at byte offsets 3 and 7).
+static const uvec8 kShuffleAlpha0 = {
+  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
+};
+// kShuffleAlpha1 covers pixels 2-3 (alpha at byte offsets 11 and 15).
+static const uvec8 kShuffleAlpha1 = {
+  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
+};
+// Attenuate 4 pixels at a time.
+// Multiplies the first three bytes of every 4-byte pixel by that pixel's
+// alpha (byte 3) and keeps the alpha byte itself unchanged.
+//   src_argb: source pixels, read with unaligned loads.
+//   dst_argb: destination pixels, written with unaligned stores.
+//   width:    pixel count.  The counter is tested after the loop body and
+//             decremented by 4, so pixels are always processed in groups of
+//             4 and at least one group is processed.
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // xmm3 = 0xff000000 in every dword: mask selecting the alpha byte.
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "pslld     $0x18,%%xmm3                    \n"
+    // xmm4 / xmm5 = alpha shuffle tables for pixels 0-1 / 2-3.
+    "movdqa    %3,%%xmm4                       \n"
+    "movdqa    %4,%%xmm5                       \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    // Pixels 0-1: xmm0 = alpha replicated into the colour word lanes,
+    // xmm1 = every source byte duplicated into a word; keep the high 16 bits
+    // of the word products.
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "punpcklbw %%xmm1,%%xmm1                   \n"
+    "pmulhuw   %%xmm1,%%xmm0                   \n"
+    // Pixels 2-3: same computation on the upper 8 source bytes.
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "punpckhbw %%xmm2,%%xmm2                   \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    // xmm2 = original alpha bytes only (colour bytes masked off).
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "pand      %%xmm3,%%xmm2                   \n"
+    // Reduce the products to 8 bits, pack to bytes (alpha lane is zero
+    // here) and merge the original alpha back in.
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "por       %%xmm2,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)        // %2
+  : "m"(kShuffleAlpha0),  // %3
+    "m"(kShuffleAlpha1)  // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+// Applied to data whose bytes were already widened to words: it copies word 3
+// (bytes 6-7) into word lanes 0-2 and word 7 (bytes 14-15) into word lanes
+// 4-6.  Index 128u (high bit set) zeroes word lanes 3 and 7.
+static const uvec8 kShuffleAlpha_AVX2 = {
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+};
+// Attenuate 8 pixels at a time.
+// Multiplies the first three bytes of every 4-byte pixel by that pixel's
+// alpha (byte 3) and keeps the alpha byte itself unchanged.
+//   src_argb: source pixels, read with unaligned 32-byte loads.
+//   dst_argb: destination pixels, written with unaligned 32-byte stores.
+//   width:    pixel count.  The counter is tested after the loop body and
+//             decremented by 8, so at least one group of 8 is processed.
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // ymm4 = shuffle table broadcast to both 128-bit lanes.
+    "vbroadcastf128 %3,%%ymm4                  \n"
+    // ymm5 = 0xff000000 in every dword: mask selecting the alpha byte.
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+    "vpslld     $0x18,%%ymm5,%%ymm5            \n"
+    // Turn dst into an offset from src so only %0 needs advancing.
+    "sub        %0,%1                          \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
+    // ymm0 / ymm1 = every source byte duplicated into a word (low / high
+    // half of each 128-bit lane).
+    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
+    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
+    // ymm2 / ymm3 = alpha word replicated into the colour lanes, zero in the
+    // alpha lane.
+    "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
+    "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
+    // Keep the high 16 bits of colour * alpha.
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    // ymm6 = original alpha bytes only.
+    "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
+    // Reduce to 8 bits, pack to bytes and merge the original alpha back in.
+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
+    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub        $0x8,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)        // %2
+  : "m"(kShuffleAlpha_AVX2)  // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+// For every pixel, the alpha byte (byte 3) indexes the 4-byte-entry table
+// fixed_invtbl8 (defined elsewhere in this file).  The entry's low 16-bit
+// word multiplies the first three bytes of the pixel and its high word
+// multiplies the alpha byte; results saturate to 255 when packed.
+// NOTE(review): the table presumably holds fixed-point reciprocals of alpha
+// - confirm against its definition.
+//   src_argb: source pixels, read with unaligned loads.
+//   dst_argb: destination pixels, written with unaligned stores.
+//   width:    pixel count.  The counter is tested after the loop body and
+//             decremented by 4, so at least one group of 4 is processed.
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  // Scratch general-purpose register that receives each alpha byte.
+  uintptr_t alpha = 0;
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    // Pixels 0-1: xmm0 = source bytes duplicated into words.
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
+    // pshuflw $0x40: word lanes 0-2 = entry word 0, lane 3 = entry word 1.
+    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+    // xmm2 = multipliers for pixel 0 (low half) and pixel 1 (high half).
+    "movlhps   %%xmm3,%%xmm2                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    // Pixels 2-3: same computation on the upper 8 source bytes.
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
+    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+    "movlhps   %%xmm3,%%xmm2                   \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    // Pack the word products back to bytes with unsigned saturation.
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width),       // %2
+    "+r"(alpha)        // %3
+  : "r"(fixed_invtbl8)  // %4
+  // NACL_R14 is followed by a string literal without a comma, so the macro
+  // (defined elsewhere) is expected to supply its own separator.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+// Operates on 16-bit words within each 128-bit lane: word 0 is copied into
+// word lanes 0-2 and word 3 is kept in lane 3; likewise word 4 is copied into
+// lanes 4-6 and word 7 is kept in lane 7.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// Unattenuate 8 pixels at a time.
+// For every pixel, the alpha byte (byte 3) indexes the 4-byte-entry table
+// fixed_invtbl8 (defined elsewhere in this file).  The entry's low 16-bit
+// word multiplies the first three bytes of the pixel and its high word
+// multiplies the alpha byte; results saturate to 255 when packed.
+//   src_argb: source pixels, read with unaligned 32-byte loads.
+//   dst_argb: destination pixels, written with unaligned 32-byte stores.
+//   width:    pixel count.  The counter is tested after the loop body and
+//             decremented by 8, so at least one group of 8 is processed.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  // Scratch general-purpose register that receives each alpha byte.
+  uintptr_t alpha = 0;
+  asm volatile (
+    // Turn dst into an offset from src so only %0 needs advancing.
+    "sub        %0,%1                          \n"
+    // ymm5 = shuffle table broadcast to both 128-bit lanes.
+    "vbroadcastf128 %5,%%ymm5                  \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    // replace VPGATHER
+    // Manual gather: read the 8 alpha bytes (offsets 0x03, 0x07, ... 0x1f),
+    // load the table entry for each, and assemble the entries for pixels 0-3
+    // in the low lane and pixels 4-7 in the high lane of ymm3.
+    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
+    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
+    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
+    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
+    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
+    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
+    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
+    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
+    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
+    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
+    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
+    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
+    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
+    // end of VPGATHER
+
+    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
+    // ymm0 / ymm1 = every source byte duplicated into a word.
+    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
+    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
+    // Widen each table entry to four words, then shuffle so lanes 0-2 hold
+    // the entry's low word and lane 3 holds its high word.
+    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
+    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
+    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
+    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
+    // Keep the high 16 bits of each product, then pack to bytes with
+    // unsigned saturation.
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub        $0x8,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width),       // %2
+    "+r"(alpha)        // %3
+  : "r"(fixed_invtbl8),  // %4
+    "m"(kUnattenShuffleAlpha_AVX2)  // %5
+  // NACL_R14 is followed by a string literal without a comma, so the macro
+  // (defined elsewhere) is expected to supply its own separator.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+// Each output pixel has the same gray value in its first three bytes and the
+// source pixel's alpha in byte 3.  The gray value is the per-pixel weighted
+// sum produced with kARGBToYJ, plus kAddYJ64, shifted right by 7 (both
+// constants are defined elsewhere in this file).
+//   src_argb: source pixels, read with unaligned loads.
+//   dst_argb: destination pixels, written with unaligned stores.
+//   width:    pixel count.  The counter is tested after the loop body and
+//             decremented by 8, so at least one group of 8 is processed.
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // xmm4 = channel weights, xmm5 = rounding/bias term.
+    "movdqa    %3,%%xmm4                       \n"
+    "movdqa    %4,%%xmm5                       \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    // Weighted sum per pixel: multiply-add byte pairs, then add adjacent
+    // words horizontally to get one word per pixel.
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "paddw     %%xmm5,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    // xmm0 low 8 bytes = gray values of the 8 pixels.
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    // Extract the 8 alpha bytes: shift each pixel right by 24, pack twice.
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrld     $0x18,%%xmm2                    \n"
+    "psrld     $0x18,%%xmm3                    \n"
+    "packuswb  %%xmm3,%%xmm2                   \n"
+    "packuswb  %%xmm2,%%xmm2                   \n"
+    // Interleave into (gray, gray) and (gray, alpha) byte pairs, then into
+    // full gray,gray,gray,alpha pixels.
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "punpcklbw %%xmm2,%%xmm3                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm3,%%xmm0                   \n"
+    "punpckhwd %%xmm3,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  : "m"(kARGBToYJ),   // %3
+    "m"(kAddYJ64)     // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// Sepia tone formulas (results saturate to 255 when packed to bytes):
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+// Constants for ARGB color to sepia tone.  Each table repeats the four
+// per-byte weights (in memory byte order B, G, R, A; the alpha weight is 0)
+// for four pixels.
+static vec8 kARGBToSepiaB = {
+  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static vec8 kARGBToSepiaG = {
+  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static vec8 kARGBToSepiaR = {
+  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// Works in place: pixels are read from and written back to dst_argb.  The
+// alpha byte of each pixel is carried over unchanged.
+//   dst_argb: pixels to transform, accessed with unaligned loads/stores.
+//   width:    pixel count.  The counter is tested after the loop body and
+//             decremented by 8, so at least one group of 8 is processed.
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+  asm volatile (
+    // xmm2 / xmm3 / xmm4 = weights for the new B / G / R values.
+    "movdqa    %2,%%xmm2                       \n"
+    "movdqa    %3,%%xmm3                       \n"
+    "movdqa    %4,%%xmm4                       \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    // New B for 8 pixels -> low 8 bytes of xmm0.
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "pmaddubsw %%xmm2,%%xmm6                   \n"
+    "phaddw    %%xmm6,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    // New G for 8 pixels -> low 8 bytes of xmm5.
+    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm3,%%xmm5                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "phaddw    %%xmm1,%%xmm5                   \n"
+    "psrlw     $0x7,%%xmm5                     \n"
+    "packuswb  %%xmm5,%%xmm5                   \n"
+    // xmm0 = (B, G) byte pairs.
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    // New R for 8 pixels -> low 8 bytes of xmm5.
+    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm4,%%xmm5                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "phaddw    %%xmm1,%%xmm5                   \n"
+    "psrlw     $0x7,%%xmm5                     \n"
+    "packuswb  %%xmm5,%%xmm5                   \n"
+    // Original alpha for 8 pixels -> low 8 bytes of xmm6.
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "psrld     $0x18,%%xmm6                    \n"
+    "psrld     $0x18,%%xmm1                    \n"
+    "packuswb  %%xmm1,%%xmm6                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    // xmm5 = (R, A) byte pairs; interleave with (B, G) into whole pixels.
+    "punpcklbw %%xmm6,%%xmm5                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm5,%%xmm0                   \n"
+    "punpckhwd %%xmm5,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "sub       $0x8,%1                         \n"
+    "jg        1b                              \n"
+  : "+r"(dst_argb),      // %0
+    "+r"(width)          // %1
+  : "m"(kARGBToSepiaB),  // %2
+    "m"(kARGBToSepiaG),  // %3
+    "m"(kARGBToSepiaR)   // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+// Output byte k of each pixel is the signed dot product of the pixel's four
+// source bytes with coefficients matrix_argb[4*k .. 4*k+3], arithmetically
+// shifted right by 6 and saturated to 0..255.  Unlike Sepia, byte 3 (alpha)
+// is also computed from the matrix rather than copied.
+//   src_argb:    source pixels, read with unaligned loads.
+//   dst_argb:    destination pixels, written with unaligned stores.
+//   matrix_argb: 16 signed 8-bit coefficients, read with one unaligned load.
+//   width:       pixel count.  The counter is tested after the loop body and
+//                decremented by 8, so at least one group of 8 is processed.
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width) {
+  asm volatile (
+    // Broadcast each 4-coefficient row: xmm2 = row 0, xmm3 = row 1,
+    // xmm4 = row 2, xmm5 = row 3.
+    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
+    "pshufd    $0x00,%%xmm5,%%xmm2             \n"
+    "pshufd    $0x55,%%xmm5,%%xmm3             \n"
+    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
+    "pshufd    $0xff,%%xmm5,%%xmm5             \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    // Output bytes 0 (xmm0) and 1 (xmm6) for 8 pixels.
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "pmaddubsw %%xmm2,%%xmm7                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "phaddsw   %%xmm7,%%xmm0                   \n"
+    "phaddsw   %%xmm1,%%xmm6                   \n"
+    "psraw     $0x6,%%xmm0                     \n"
+    "psraw     $0x6,%%xmm6                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    // xmm0 = (byte 0, byte 1) pairs.
+    "punpcklbw %%xmm6,%%xmm0                   \n"
+    // Output bytes 2 (xmm1) and 3 (xmm6) for 8 pixels.
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm7                   \n"
+    "phaddsw   %%xmm7,%%xmm1                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "pmaddubsw %%xmm5,%%xmm6                   \n"
+    "pmaddubsw %%xmm5,%%xmm7                   \n"
+    "phaddsw   %%xmm7,%%xmm6                   \n"
+    "psraw     $0x6,%%xmm1                     \n"
+    "psraw     $0x6,%%xmm6                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    // xmm1 = (byte 2, byte 3) pairs; interleave with xmm0 into whole pixels.
+    "punpcklbw %%xmm6,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "punpcklwd %%xmm1,%%xmm0                   \n"
+    "punpckhwd %%xmm1,%%xmm6                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),      // %0
+    "+r"(dst_argb),      // %1
+    "+r"(width)          // %2
+  : "r"(matrix_argb)     // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+// Works in place.  For the first three bytes of every pixel:
+//   v = ((v * scale) >> 16) * interval_size + interval_offset
+// computed in 16-bit lanes and saturated to 0..255 when packed.  The original
+// alpha byte is OR-ed back into the result.
+// NOTE(review): the alpha lane uses the high 16 bits of each parameter as its
+// multiplier/addend, so alpha is preserved exactly only if those high halves
+// are zero - presumably the parameters fit in 16 bits; confirm with callers.
+//   dst_argb:        pixels to quantize, unaligned loads/stores.
+//   scale:           fixed-point multiplier (result uses the high 16 bits of
+//                    the product).
+//   interval_size:   multiplier applied after scaling.
+//   interval_offset: value added after multiplying.
+//   width:           pixel count.  The counter is tested after the loop body
+//                    and decremented by 4, so at least one group of 4 is
+//                    processed.
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  asm volatile (
+    "movd      %2,%%xmm2                       \n"
+    "movd      %3,%%xmm3                       \n"
+    "movd      %4,%%xmm4                       \n"
+    // For each parameter: pshuflw $0x40 puts its low word in word lanes 0-2
+    // and its high word in lane 3; pshufd $0x44 copies that pattern to the
+    // upper 64 bits so both pixels of a half-register see the same values.
+    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
+    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
+    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
+    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
+    // xmm5 = 0 (for zero-extending bytes); xmm6 = 0xff000000 alpha mask.
+    "pxor      %%xmm5,%%xmm5                   \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "pslld     $0x18,%%xmm6                    \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    // Pixels 0-1 in xmm0, pixels 2-3 in xmm1, widened to 16-bit lanes.
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "pmullw    %%xmm3,%%xmm0                   \n"
+    // xmm7 = original alpha bytes only.
+    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "pand      %%xmm6,%%xmm7                   \n"
+    "paddw     %%xmm4,%%xmm0                   \n"
+    "paddw     %%xmm4,%%xmm1                   \n"
+    // Pack back to bytes and merge the original alpha in.
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "por       %%xmm7,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "sub       $0x4,%1                         \n"
+    "jg        1b                              \n"
+  : "+r"(dst_argb),       // %0
+    "+r"(width)           // %1
+  : "r"(scale),           // %2
+    "r"(interval_size),   // %3
+    "r"(interval_offset)  // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+// Byte k of every pixel (alpha included) is multiplied by byte k of value:
+// both bytes are widened by duplication (x * 0x101), the high 16 bits of the
+// product are taken and shifted right by a further 8 bits.
+//   src_argb: source pixels, read with unaligned loads.
+//   dst_argb: destination pixels, written with unaligned stores.
+//   width:    pixel count.  The counter is tested after the loop body and
+//             decremented by 4, so at least one group of 4 is processed.
+//   value:    four per-byte scale factors packed in one 32-bit word.
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  asm volatile (
+    // xmm2 = value's bytes duplicated into words, repeated for 2 pixels.
+    "movd      %3,%%xmm2                       \n"
+    "punpcklbw %%xmm2,%%xmm2                   \n"
+    "punpcklqdq %%xmm2,%%xmm2                  \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    // Duplicate source bytes into words: pixels 0-1 in xmm0, 2-3 in xmm1.
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(value)       // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+// Every byte (alpha included) is computed as
+//   dst = ((src0 * 0x101) * src1) >> 16
+// i.e. src0 is widened by byte duplication, src1 by zero extension, and the
+// high 16 bits of the word product are kept.
+//   src_argb0: first source row, read with unaligned loads.
+//   src_argb1: second source row, read with unaligned loads.
+//   dst_argb:  destination row, written with unaligned stores.
+//   width:     pixel count.  The counter is tested after the loop body and
+//              decremented by 4, so at least one group of 4 is processed.
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // xmm5 = 0, used to zero-extend src_argb1 bytes to words.
+    "pxor      %%xmm5,%%xmm5                  \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    // Register-to-register copies so low and high halves can be widened
+    // separately.
+    "movdqu    %%xmm0,%%xmm1                   \n"
+    "movdqu    %%xmm2,%%xmm3                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "punpckhbw %%xmm5,%%xmm3                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "pmulhuw   %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+// Every byte (alpha included) is computed as
+//   dst = ((src0 * 0x101) * src1) >> 16
+// i.e. src0 is widened by byte duplication, src1 by zero extension, and the
+// high 16 bits of the word product are kept.
+//   src_argb0: first source row, read with unaligned 32-byte loads.
+//   src_argb1: second source row, read with unaligned 32-byte loads.
+//   dst_argb:  destination row, written with unaligned 32-byte stores.
+//   width:     pixel count.  The counter is tested after the loop body and
+//              decremented by 8, so at least one group of 8 is processed.
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // ymm5 = 0, used to zero-extend src_argb1 bytes to words.
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    // ymm0 / ymm1 = src0 bytes duplicated into words (low / high halves of
+    // each 128-bit lane); ymm2 / ymm3 = src1 bytes zero-extended to words.
+    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
+    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
+    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
+    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea       " MEMLEA(0x20,2) ",%2           \n"
+    "sub        $0x8,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  // The vector registers are overwritten regardless of the compiler's target
+  // flags, so they are always declared as clobbered (as the other AVX2 rows
+  // in this file do) instead of only when __AVX2__ is defined.
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+// Every byte (alpha included) is added with unsigned saturation, so sums
+// above 255 clamp to 255.
+//   src_argb0: first source row, read with unaligned loads.
+//   src_argb1: second source row, read with unaligned loads.
+//   dst_argb:  destination row, written with unaligned stores.
+//   width:     pixel count.  The counter is tested after the loop body and
+//              decremented by 4, so at least one group of 4 is processed.
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    // xmm0 = saturate(src0 + src1) per byte.
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+// Every byte (alpha included) is added with unsigned saturation, so sums
+// above 255 clamp to 255.
+//   src_argb0: first source row, read with unaligned 32-byte loads.
+//   src_argb1: second source row, used as a memory operand of vpaddusb.
+//   dst_argb:  destination row, written with unaligned 32-byte stores.
+//   width:     pixel count.  The counter is tested after the loop body and
+//              decremented by 8, so at least one group of 8 is processed.
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    // ymm0 = saturate(src0 + src1) per byte.
+    "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea        " MEMLEA(0x20,2) ",%2          \n"
+    "sub        $0x8,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0"
+  );
+}
+#endif  // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+// Every byte (alpha included) is computed as src_argb0 - src_argb1 with
+// unsigned saturation, so negative differences clamp to 0.
+//   src_argb0: minuend row, read with unaligned loads.
+//   src_argb1: subtrahend row, read with unaligned loads.
+//   dst_argb:  destination row, written with unaligned stores.
+//   width:     pixel count.  The counter is tested after the loop body and
+//              decremented by 4, so at least one group of 4 is processed.
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    // xmm0 = saturate(src0 - src1) per byte.
+    "psubusb   %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1"
+  );
+}
+#endif  // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+// Every byte (alpha included) is computed as src_argb0 - src_argb1 with
+// unsigned saturation, so negative differences clamp to 0.
+//   src_argb0: minuend row, read with unaligned 32-byte loads.
+//   src_argb1: subtrahend row, used as a memory operand of vpsubusb.
+//   dst_argb:  destination row, written with unaligned 32-byte stores.
+//   width:     pixel count.  The counter is tested after the loop body and
+//              decremented by 8, so at least one group of 8 is processed.
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    // ymm0 = saturate(src0 - src1) per byte.
+    "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea        " MEMLEA(0x20,2) ",%2          \n"
+    "sub        $0x8,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+    , "xmm0"
+  );
+}
+#endif  // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1  0  1
+// -2  0  2
+// -1  0  1
+// For each x the code computes
+//   (y0[x] - y0[x+2]) + 2 * (y1[x] - y1[x+2]) + (y2[x] - y2[x+2]),
+// takes the absolute value and saturates it to 255.
+//   src_y0, src_y1, src_y2: three input rows of single-byte samples.  Each
+//       iteration reads 8 bytes at offset 0 and 8 bytes at offset 2, i.e. 2
+//       bytes beyond the 8 samples being produced.
+//   dst_sobelx: output row, 8 bytes written per iteration.
+//   width:      sample count.  The counter is tested after the loop body and
+//               decremented by 8, so at least one group of 8 is processed.
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+    // Turn the other three pointers into offsets from src_y0 so only %0
+    // needs advancing.
+    "sub       %0,%1                           \n"
+    "sub       %0,%2                           \n"
+    "sub       %0,%3                           \n"
+    // xmm5 = 0, used to zero-extend bytes to words.
+    "pxor      %%xmm5,%%xmm5                   \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    // xmm0 = y0[x] - y0[x+2]
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "psubw     %%xmm1,%%xmm0                   \n"
+    // xmm1 = y1[x] - y1[x+2]
+    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
+    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "psubw     %%xmm2,%%xmm1                   \n"
+    // xmm2 = y2[x] - y2[x+2]
+    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
+    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm3                   \n"
+    "psubw     %%xmm3,%%xmm2                   \n"
+    // Sum with the middle row counted twice.
+    "paddw     %%xmm2,%%xmm0                   \n"
+    "paddw     %%xmm1,%%xmm0                   \n"
+    "paddw     %%xmm1,%%xmm0                   \n"
+    // Absolute value: max(v, 0 - v).
+    "pxor      %%xmm1,%%xmm1                   \n"
+    "psubw     %%xmm0,%%xmm1                   \n"
+    "pmaxsw    %%xmm1,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "sub       $0x8,%4                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  :
+  // NACL_R14 is followed by a string literal without a comma, so the macro
+  // (defined elsewhere) is expected to supply its own separator.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+// For each x the code computes
+//   (y0[x] - y1[x]) + 2 * (y0[x+1] - y1[x+1]) + (y0[x+2] - y1[x+2]),
+// takes the absolute value and saturates it to 255.
+//   src_y0, src_y1: the two input rows of single-byte samples.  Each
+//       iteration reads 8 bytes at offsets 0, 1 and 2, i.e. 2 bytes beyond
+//       the 8 samples being produced.
+//   dst_sobely: output row, 8 bytes written per iteration.
+//   width:      sample count.  The counter is tested after the loop body and
+//               decremented by 8, so at least one group of 8 is processed.
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  asm volatile (
+    // Turn src_y1 and dst_sobely into offsets from src_y0 so only %0 needs
+    // advancing.
+    "sub       %0,%1                           \n"
+    "sub       %0,%2                           \n"
+    // xmm5 = 0, used to zero-extend bytes to words.
+    "pxor      %%xmm5,%%xmm5                   \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    // xmm0 = y0[x] - y1[x]
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "psubw     %%xmm1,%%xmm0                   \n"
+    // xmm1 = y0[x+1] - y1[x+1]
+    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
+    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "psubw     %%xmm2,%%xmm1                   \n"
+    // xmm2 = y0[x+2] - y1[x+2]
+    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
+    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm3                   \n"
+    "psubw     %%xmm3,%%xmm2                   \n"
+    // Sum with the middle column counted twice.
+    "paddw     %%xmm2,%%xmm0                   \n"
+    "paddw     %%xmm1,%%xmm0                   \n"
+    "paddw     %%xmm1,%%xmm0                   \n"
+    // Absolute value: max(v, 0 - v).
+    "pxor      %%xmm1,%%xmm1                   \n"
+    "psubw     %%xmm0,%%xmm1                   \n"
+    "pmaxsw    %%xmm1,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "sub       $0x8,%3                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  :
+  // NACL_R14 is followed by a string literal without a comma, so the macro
+  // (defined elsewhere) is expected to supply its own separator.
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y (unsigned saturating) and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"  // %1 = src_sobely - src_sobelx
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
+    "pslld     $0x18,%%xmm5                    \n"  // xmm5 = 0xff000000 per dword (alpha)
+
+    // 16 pixel loop: 16 input bytes per source, 64 output bytes.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 bytes of sobelx
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"  // sobel = sat(sobelx + sobely)
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "punpcklbw %%xmm0,%%xmm2                   \n"  // low 8 values, each byte doubled
+    "punpckhbw %%xmm0,%%xmm0                   \n"  // high 8 values, each byte doubled
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm1                   \n"  // values 0..3 replicated to 4 bytes
+    "punpckhwd %%xmm2,%%xmm2                   \n"  // values 4..7 replicated to 4 bytes
+    "por       %%xmm5,%%xmm1                   \n"  // force top byte of each pixel to 0xff
+    "por       %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "punpcklwd %%xmm0,%%xmm3                   \n"  // values 8..11
+    "punpckhwd %%xmm0,%%xmm0                   \n"  // values 12..15
+    "por       %%xmm5,%%xmm3                   \n"
+    "por       %%xmm5,%%xmm0                   \n"
+    "movdqu    %%xmm1," MEMACCESS(2) "         \n"  // store 4 x 16 bytes = 16 pixels
+    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
+    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
+    "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "sub       $0x10,%3                        \n"  // 16 pixels consumed
+    "jg        1b                              \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif  // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y (unsigned saturating) and stores Sobel into a plane.
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"  // %1 = src_sobely - src_sobelx
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 is overwritten here but
+    "pslld     $0x18,%%xmm5                    \n"  // never read by the loop below.
+
+    // 16 pixel loop: 16 bytes in from each source, 16 bytes out.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 bytes of sobelx
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"  // sat(sobelx + sobely)
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x10,%3                        \n"  // 16 pixels consumed
+    "jg        1b                              \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_y),       // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"  // xmm5: modified by the prologue, so it must be declared
+  );
+}
+#endif  // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel (saturated X + Y) into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"  // %1 = src_sobely - src_sobelx
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xff bytes (alpha)
+
+    // 16 pixel loop: 16 input bytes per source, 64 output bytes.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // xmm0 = sobelx
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "paddusb   %%xmm1,%%xmm2                   \n"  // xmm2 = sat(sobelx + sobely)
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "punpcklbw %%xmm5,%%xmm3                   \n"  // (sobelx, 0xff) pairs, low 8
+    "punpckhbw %%xmm5,%%xmm0                   \n"  // (sobelx, 0xff) pairs, high 8
+    "movdqa    %%xmm1,%%xmm4                   \n"
+    "punpcklbw %%xmm2,%%xmm4                   \n"  // (sobely, sobel) pairs, low 8
+    "punpckhbw %%xmm2,%%xmm1                   \n"  // (sobely, sobel) pairs, high 8
+    "movdqa    %%xmm4,%%xmm6                   \n"
+    "punpcklwd %%xmm3,%%xmm6                   \n"  // pixels 0..3: y, sobel, x, 0xff
+    "punpckhwd %%xmm3,%%xmm4                   \n"  // pixels 4..7
+    "movdqa    %%xmm1,%%xmm7                   \n"
+    "punpcklwd %%xmm0,%%xmm7                   \n"  // pixels 8..11
+    "punpckhwd %%xmm0,%%xmm1                   \n"  // pixels 12..15
+    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
+    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
+    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
+    "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "sub       $0x10,%3                        \n"  // 16 pixels consumed
+    "jg        1b                              \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive; 4 int32 sums per 4-byte pixel.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+                                  const int32* previous_cumsum, int width) {
+  asm volatile (
+    "pxor      %%xmm0,%%xmm0                   \n"  // xmm0 = running sum for this row
+    "pxor      %%xmm1,%%xmm1                   \n"  // xmm1 = 0, used to widen bytes
+    "sub       $0x4,%3                         \n"
+    "jl        49f                             \n"  // fewer than 4 pixels: 1 pixel loop
+    "test      $0xf,%1                         \n"  // low 4 address bits of cumsum
+    "jne       49f                             \n"  // not 16-byte aligned: 1 pixel loop
+
+    // 4 pixel loop: 16 bytes of row in, 4 x 16 bytes of sums out.
+    LABELALIGN
+  "40:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // 4 pixels (16 bytes)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm2,%%xmm4                   \n"
+    "punpcklbw %%xmm1,%%xmm2                   \n"  // pixels 0,1 as words
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklwd %%xmm1,%%xmm2                   \n"  // xmm2 = pixel 0 as dwords
+    "punpckhwd %%xmm1,%%xmm3                   \n"  // xmm3 = pixel 1 as dwords
+    "punpckhbw %%xmm1,%%xmm4                   \n"  // pixels 2,3 as words
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "punpcklwd %%xmm1,%%xmm4                   \n"  // xmm4 = pixel 2 as dwords
+    "punpckhwd %%xmm1,%%xmm5                   \n"  // xmm5 = pixel 3 as dwords
+    "paddd     %%xmm2,%%xmm0                   \n"  // running sum += pixel 0
+    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"  // previous_cumsum entry 0
+    "paddd     %%xmm0,%%xmm2                   \n"  // out 0 = previous + running sum
+    "paddd     %%xmm3,%%xmm0                   \n"  // running sum += pixel 1
+    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
+    "paddd     %%xmm0,%%xmm3                   \n"  // out 1
+    "paddd     %%xmm4,%%xmm0                   \n"  // running sum += pixel 2
+    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
+    "paddd     %%xmm0,%%xmm4                   \n"  // out 2
+    "paddd     %%xmm5,%%xmm0                   \n"  // running sum += pixel 3
+    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
+    "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "paddd     %%xmm0,%%xmm5                   \n"  // out 3
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
+    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"  // %3 = remaining pixels - 1
+    "jl        19f                             \n"  // nothing left
+
+    // 1 pixel loop: 4 bytes of row in, 16 bytes of sums out.
+    LABELALIGN
+  "10:                                         \n"
+    "movd      " MEMACCESS(0) ",%%xmm2         \n"  // 1 pixel (4 bytes)
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "punpcklbw %%xmm1,%%xmm2                   \n"
+    "punpcklwd %%xmm1,%%xmm2                   \n"  // pixel as 4 dwords
+    "paddd     %%xmm2,%%xmm0                   \n"  // running sum += pixel
+    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "paddd     %%xmm0,%%xmm2                   \n"  // out = previous + running sum
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x1,%3                         \n"
+    "jge       10b                             \n"
+
+  "19:                                         \n"
+  : "+r"(row),  // %0
+    "+r"(cumsum),  // %1
+    "+r"(previous_cumsum),  // %2
+    "+r"(width)  // %3
+  :
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+                                    int width, int area, uint8* dst,  // width is used as an int32 index (scaled by 4 bytes)
+                                    int count) {  // count = number of 4-byte outputs written to dst
+  asm volatile (  // out = (topleft[i] - topleft[i+width] - botleft[i] + botleft[i+width]) * (1/area)
+    "movd      %5,%%xmm5                       \n"
+    "cvtdq2ps  %%xmm5,%%xmm5                   \n"  // xmm5 = (float)area
+    "rcpss     %%xmm5,%%xmm4                   \n"  // approximate 1 / area
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"  // broadcast to all 4 lanes
+    "sub       $0x4,%3                         \n"
+    "jl        49f                             \n"  // fewer than 4 outputs: 1 pixel loop
+    "cmpl      $0x80,%5                        \n"
+    "ja        40f                             \n"  // area > 128 (unsigned): float loop
+
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // broadcast (float)area
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrld     $0x10,%%xmm6                    \n"  // 0x0000ffff per dword
+    "cvtdq2ps  %%xmm6,%%xmm6                   \n"  // 65535.0f
+    "addps     %%xmm6,%%xmm5                   \n"  // area + 65535
+    "mulps     %%xmm4,%%xmm5                   \n"  // (area + 65535) * (1 / area)
+    "cvtps2dq  %%xmm5,%%xmm5                   \n"
+    "packssdw  %%xmm5,%%xmm5                   \n"  // 16-bit multiplier in every word
+
+    // 4 pixel small loop: scales with a 16-bit multiply (pmulhuw).
+    LABELALIGN
+  "4:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // topleft[i], 4 outputs worth
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
+    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
+    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "psubd     " MEMACCESS(1) ",%%xmm0         \n"  // - botleft[i]
+    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
+    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
+    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
+    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
+    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "packssdw  %%xmm1,%%xmm0                   \n"  // sums as 16-bit words
+    "packssdw  %%xmm3,%%xmm2                   \n"
+    "pmulhuw   %%xmm5,%%xmm0                   \n"  // high 16 bits of sum * multiplier
+    "pmulhuw   %%xmm5,%%xmm2                   \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"  // saturate to bytes
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"  // 16 output bytes
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       4b                              \n"
+    "jmp       49f                             \n"
+
+    // 4 pixel loop: scales in float with the reciprocal held in xmm4.
+    LABELALIGN
+  "40:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
+    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
+    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
+    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
+    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
+    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
+    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
+    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"  // sums to float
+    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
+    "mulps     %%xmm4,%%xmm0                   \n"  // * (1 / area)
+    "mulps     %%xmm4,%%xmm1                   \n"
+    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
+    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
+    "mulps     %%xmm4,%%xmm2                   \n"
+    "mulps     %%xmm4,%%xmm3                   \n"
+    "cvtps2dq  %%xmm0,%%xmm0                   \n"  // back to int32
+    "cvtps2dq  %%xmm1,%%xmm1                   \n"
+    "cvtps2dq  %%xmm2,%%xmm2                   \n"
+    "cvtps2dq  %%xmm3,%%xmm3                   \n"
+    "packssdw  %%xmm1,%%xmm0                   \n"
+    "packssdw  %%xmm3,%%xmm2                   \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"  // saturate to bytes
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"  // %3 = remaining outputs - 1
+    "jl        19f                             \n"  // nothing left
+
+    // 1 pixel loop: always uses the float path, writes 4 bytes per pass.
+    LABELALIGN
+  "10:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+    "mulps     %%xmm4,%%xmm0                   \n"
+    "cvtps2dq  %%xmm0,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0," MEMACCESS(2) "         \n"  // 4 output bytes
+    "lea       " MEMLEA(0x4,2) ",%2            \n"
+    "sub       $0x1,%3                         \n"
+    "jge       10b                             \n"
+  "19:                                         \n"
+  : "+r"(topleft),  // %0
+    "+r"(botleft),  // %1
+    "+r"(dst),      // %2
+    "+rm"(count)    // %3
+  : "r"((intptr_t)(width)),  // %4
+    "rm"(area)     // %5
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination; src_dudv holds the start (u, v) then the per-pixel step (du, dv).
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* src_dudv, int width) {
+  intptr_t src_argb_stride_temp = src_argb_stride;  // reused as offset scratch (%1) after setup
+  intptr_t temp = 0;  // second offset scratch register (%5)
+  asm volatile (
+    "movq      " MEMACCESS(3) ",%%xmm2         \n"  // xmm2 = src_dudv[0..1]: start u, v
+    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"  // xmm7 = src_dudv[2..3]: step du, dv
+    "shl       $0x10,%1                        \n"
+    "add       $0x4,%1                         \n"  // %1 = (stride << 16) + 4
+    "movd      %1,%%xmm5                       \n"  // word pair (4, stride) for pmaddwd
+    "sub       $0x4,%4                         \n"
+    "jl        49f                             \n"  // fewer than 4 pixels: 1 pixel loop
+
+    "pshufd    $0x44,%%xmm7,%%xmm7             \n"  // xmm7 = du, dv, du, dv
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // broadcast (4, stride) pair
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "addps     %%xmm7,%%xmm0                   \n"  // low pair = u + du, v + dv
+    "movlhps   %%xmm0,%%xmm2                   \n"  // xmm2 = coords of pixels 0 and 1
+    "movdqa    %%xmm7,%%xmm4                   \n"
+    "addps     %%xmm4,%%xmm4                   \n"  // 2 * step
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "addps     %%xmm4,%%xmm3                   \n"  // xmm3 = coords of pixels 2 and 3
+    "addps     %%xmm4,%%xmm4                   \n"  // xmm4 = 4 * step per loop pass
+
+    // 4 pixel loop: gathers four 4-byte pixels, writes 16 bytes.
+    LABELALIGN
+  "40:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
+    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
+    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
+    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
+    "movd      %%xmm0,%k1                      \n"  // offset of pixel 0
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"  // rotate: next offset into lane 0
+    "movd      %%xmm0,%k5                      \n"  // offset of pixel 1
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
+    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
+    "punpckldq %%xmm6,%%xmm1                   \n"  // pixels 0 and 1 side by side
+    "addps     %%xmm4,%%xmm2                   \n"  // advance coords of pixels 0, 1
+    "movq      %%xmm1," MEMACCESS(2) "         \n"
+    "movd      %%xmm0,%k1                      \n"  // offset of pixel 2
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+    "movd      %%xmm0,%k5                      \n"  // offset of pixel 3
+    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
+    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
+    "punpckldq %%xmm6,%%xmm0                   \n"  // pixels 2 and 3 side by side
+    "addps     %%xmm4,%%xmm3                   \n"  // advance coords of pixels 2, 3
+    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%4                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%4                         \n"  // %4 = remaining pixels - 1
+    "jl        19f                             \n"  // nothing left
+
+    // 1 pixel loop: uses the low (x, y) pair of xmm2, stepping by xmm7.
+    LABELALIGN
+  "10:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
+    "addps     %%xmm7,%%xmm2                   \n"  // advance by one step
+    "movd      %%xmm0,%k1                      \n"
+    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x04,2) ",%2           \n"
+    "sub       $0x1,%4                         \n"
+    "jge       10b                             \n"
+  "19:                                         \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_stride_temp),  // %1
+    "+r"(dst_argb),  // %2
+    "+r"(src_dudv),  // %3
+    "+rm"(width),    // %4
+    "+r"(temp)   // %5
+  :
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1: blends the row at src_ptr with the row at src_ptr + src_stride, weighted by source_y_fraction.
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) {
+  asm volatile (
+    "sub       %1,%0                           \n"  // %0 = dst_ptr - src_ptr; stores use (%1,%0)
+    "cmp       $0x0,%3                         \n"
+    "je        100f                            \n"  // fraction 0: copy first row
+    "cmp       $0x80,%3                        \n"
+    "je        50f                             \n"  // fraction 0x80: plain average
+
+    "movd      %3,%%xmm0                       \n"  // xmm0 = fraction (second row weight)
+    "neg       %3                              \n"
+    "add       $0x100,%3                       \n"  // %3 = 256 - fraction (first row weight)
+    "movd      %3,%%xmm5                       \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"  // byte pair (256 - fraction, fraction)
+    "punpcklwd %%xmm5,%%xmm5                   \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // weight pair in every word
+    "mov       $0x80808080,%%eax               \n"
+    "movd      %%eax,%%xmm4                    \n"
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"  // xmm4 = 0x80 in every byte
+
+    // General purpose row blend.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"  // 16 bytes of first row
+    MEMOPREG(movdqu,0x00,1,4,1,xmm2)  // movdqu (%1,%4,1),%%xmm2: second row
+    "movdqa     %%xmm0,%%xmm1                  \n"
+    "punpcklbw  %%xmm2,%%xmm0                  \n"  // (first, second) byte pairs, low 8
+    "punpckhbw  %%xmm2,%%xmm1                  \n"  // (first, second) byte pairs, high 8
+    "psubb      %%xmm4,%%xmm0                  \n"  // bias pixels by -0x80
+    "psubb      %%xmm4,%%xmm1                  \n"
+    "movdqa     %%xmm5,%%xmm2                  \n"
+    "movdqa     %%xmm5,%%xmm3                  \n"
+    "pmaddubsw  %%xmm0,%%xmm2                  \n"  // weights * biased pixels, summed per pair
+    "pmaddubsw  %%xmm1,%%xmm3                  \n"
+    "paddw      %%xmm4,%%xmm2                  \n"  // add 0x8080 to every word
+    "paddw      %%xmm4,%%xmm3                  \n"
+    "psrlw      $0x8,%%xmm2                    \n"  // keep the high byte of each word
+    "psrlw      $0x8,%%xmm3                    \n"
+    "packuswb   %%xmm3,%%xmm2                  \n"
+    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)  // movdqu %%xmm2,(%1,%0,1): store to dst
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"  // 16 bytes consumed
+    "jg        1b                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    LABELALIGN
+  "50:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm1)  // movdqu (%1,%4,1),%%xmm1: second row
+    "pavgb     %%xmm1,%%xmm0                   \n"  // rounded byte average
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)  // movdqu %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    LABELALIGN
+  "100:                                        \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)  // movdqu %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        100b                            \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),    // %0
+    "+r"(src_ptr),    // %1
+    "+r"(dst_width),  // %2
+    "+r"(source_y_fraction)  // %3
+  : "r"((intptr_t)(src_stride))  // %4
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_INTERPOLATEROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1: blends the row at src_ptr with the row at src_ptr + src_stride, weighted by source_y_fraction.
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) {
+  asm volatile (
+    "cmp       $0x0,%3                         \n"
+    "je        100f                            \n"  // fraction 0: copy path, taken before %0 is rebased
+    "sub       %1,%0                           \n"  // %0 = dst_ptr - src_ptr; stores use (%1,%0)
+    "cmp       $0x80,%3                        \n"
+    "je        50f                             \n"  // fraction 0x80: plain average
+
+    "vmovd      %3,%%xmm0                      \n"  // xmm0 = fraction (second row weight)
+    "neg        %3                             \n"
+    "add        $0x100,%3                      \n"  // %3 = 256 - fraction (first row weight)
+    "vmovd      %3,%%xmm5                      \n"
+    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"  // byte pair (256 - fraction, fraction)
+    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
+    "vbroadcastss %%xmm5,%%ymm5                \n"  // weight pair in every word
+    "mov        $0x80808080,%%eax              \n"
+    "vmovd      %%eax,%%xmm4                   \n"
+    "vbroadcastss %%xmm4,%%ymm4                \n"  // ymm4 = 0x80 in every byte
+
+    // General purpose row blend.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"  // 32 bytes of first row
+    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)  // vmovdqu (%1,%4,1),%%ymm2: second row
+    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"  // (first, second) byte pairs, high halves
+    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"  // (first, second) byte pairs, low halves
+    "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"  // bias pixels by -0x80
+    "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"  // weights * biased pixels, summed per pair
+    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
+    "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"  // add 0x8080 to every word
+    "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"  // keep the high byte of each word
+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)  // vmovdqu %%ymm0,(%1,%0,1): store to dst
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"  // 32 bytes consumed
+    "jg        1b                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    LABELALIGN
+  "50:                                         \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
+    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
+    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)  // vmovdqu %%ymm0,(%1,%0,1)
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    LABELALIGN
+  "100:                                        \n"
+    "rep movsb " MEMMOVESTRING(1,0) "          \n"  // string copy; operands are pinned to D/S/c below
+    "jmp       999f                            \n"  // no ymm register was touched on this path
+
+  "99:                                         \n"
+    "vzeroupper                                \n"
+  "999:                                        \n"
+  : "+D"(dst_ptr),    // %0
+    "+S"(src_ptr),    // %1
+    "+c"(dst_width),  // %2
+    "+r"(source_y_fraction)  // %3
+  : "r"((intptr_t)(src_stride))  // %4
+  : "memory", "cc", "eax", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_INTERPOLATEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA: applies the 16-byte pshufb control at shuffler to every 16 bytes.
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int width) {
+  asm volatile (
+    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"  // xmm5 = shuffle control (16 bytes)
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 32 source bytes per pass
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"  // reorder bytes within each 16-byte block
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"  // 8 pixels (32 bytes) consumed
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  : "r"(shuffler)    // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_SSSE3
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA: applies the 16-byte vpshufb control at shuffler to every 16 bytes.
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width) {
+  asm volatile (
+    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"  // same 16-byte control in both 128-bit lanes
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // 64 source bytes per pass
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"  // reorder bytes within each 16-byte lane
+    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"  // 16 pixels (64 bytes) consumed
+    "jg        1b                              \n"
+    "vzeroupper                                \n"  // clear upper ymm halves before returning
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  : "r"(shuffler)    // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width) {
+  uintptr_t pixel_temp = 0u;
+  asm volatile (
+    "pxor      %%xmm5,%%xmm5                   \n"
+    "mov       " MEMACCESS(4) ",%k2            \n"
+    "cmp       $0x3000102,%k2                  \n"
+    "je        3012f                           \n"
+    "cmp       $0x10203,%k2                    \n"
+    "je        123f                            \n"
+    "cmp       $0x30201,%k2                    \n"
+    "je        321f                            \n"
+    "cmp       $0x2010003,%k2                  \n"
+    "je        2103f                           \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movzb     " MEMACCESS(4) ",%2             \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS(1) "            \n"
+    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
+    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
+    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "lea       " MEMLEA(0x4,1) ",%1            \n"
+    "sub       $0x1,%3                         \n"
+    "jg        1b                              \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "123:                                        \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
+    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        123b                            \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "321:                                        \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
+    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        321b                            \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "2103:                                       \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
+    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        2103b                           \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "3012:                                       \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
+    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jg        3012b                           \n"
+
+  "99:                                         \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+d"(pixel_temp),  // %2
+    "+r"(width)         // %3
+  : "r"(shuffler)      // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm5"
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_SSE2
+
+#ifdef HAS_I422TOYUY2ROW_SSE2
+void I422ToYUY2Row_SSE2(const uint8* src_y,  // Interleaves planar Y/U/V rows into one packed row.
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {  // Output byte order: Y0 U0 Y1 V0 ...
+ asm volatile (
+    "sub       %1,%2                             \n"  // %2 = src_v - src_u, so (%1,%2,1) addresses V
+    LABELALIGN
+  "1:                                            \n"  // 16 pixels per pass
+    "movq      " MEMACCESS(1) ",%%xmm2           \n"  // 8 U bytes
+    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
+    "lea       " MEMLEA(0x8,1) ",%1              \n"  // src_u += 8 (V follows via the offset in %2)
+    "punpcklbw %%xmm3,%%xmm2                     \n"  // xmm2 = U0 V0 U1 V1 ...
+    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"  // 16 Y bytes
+    "lea       " MEMLEA(0x10,0) ",%0             \n"  // src_y += 16
+    "movdqa    %%xmm0,%%xmm1                     \n"  // second copy of Y for the high half
+    "punpcklbw %%xmm2,%%xmm0                     \n"  // Y0 U0 Y1 V0 ... from the low 8 Y bytes
+    "punpckhbw %%xmm2,%%xmm1                     \n"  // same interleave for the high 8 Y bytes
+    "movdqu    %%xmm0," MEMACCESS(3) "           \n"  // store first 16 output bytes
+    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"  // store next 16 output bytes
+    "lea       " MEMLEA(0x20,3) ",%3             \n"  // dst_frame += 32
+    "sub       $0x10,%4                          \n"  // width -= 16
+    "jg         1b                               \n"  // tested after the body, so the body runs at least once
+    : "+r"(src_y),  // %0
+      "+r"(src_u),  // %1
+      "+r"(src_v),  // %2
+      "+r"(dst_frame),  // %3
+      "+rm"(width)  // %4
+    :
+    : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+#endif  // HAS_I422TOYUY2ROW_SSE2
+
+#ifdef HAS_I422TOUYVYROW_SSE2
+void I422ToUYVYRow_SSE2(const uint8* src_y,  // Interleaves planar Y/U/V rows into one packed row.
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {  // Output byte order: U0 Y0 V0 Y1 ...
+ asm volatile (
+    "sub        %1,%2                            \n"  // %2 = src_v - src_u, so (%1,%2,1) addresses V
+    LABELALIGN
+  "1:                                            \n"  // 16 pixels per pass
+    "movq      " MEMACCESS(1) ",%%xmm2           \n"  // 8 U bytes
+    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
+    "lea       " MEMLEA(0x8,1) ",%1              \n"  // src_u += 8 (V follows via the offset in %2)
+    "punpcklbw %%xmm3,%%xmm2                     \n"  // xmm2 = U0 V0 U1 V1 ...
+    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"  // 16 Y bytes
+    "movdqa    %%xmm2,%%xmm1                     \n"  // second copy of UV for the low half
+    "lea       " MEMLEA(0x10,0) ",%0             \n"  // src_y += 16
+    "punpcklbw %%xmm0,%%xmm1                     \n"  // U0 Y0 V0 Y1 ... from the low 8 bytes of each
+    "punpckhbw %%xmm0,%%xmm2                     \n"  // same interleave for the high 8 bytes of each
+    "movdqu    %%xmm1," MEMACCESS(3) "           \n"  // store first 16 output bytes
+    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"  // store next 16 output bytes
+    "lea       " MEMLEA(0x20,3) ",%3             \n"  // dst_frame += 32
+    "sub       $0x10,%4                          \n"  // width -= 16
+    "jg         1b                               \n"  // tested after the body, so the body runs at least once
+    : "+r"(src_y),  // %0
+      "+r"(src_u),  // %1
+      "+r"(src_v),  // %2
+      "+r"(dst_frame),  // %3
+      "+rm"(width)  // %4
+    :
+    : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+#endif  // HAS_I422TOUYVYROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,  // Per byte lane X: C0 + C1*X + C2*X*X + C3*X*X*X.
+                            uint8* dst_argb, const float* poly,  // poly is read as 4 vectors of 4 floats: C0, C1, C2, C3.
+                            int width) {
+  asm volatile (
+    "pxor      %%xmm3,%%xmm3                   \n"  // xmm3 = 0, used to zero-extend
+
+    // 2 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 bytes (2 pixels)
+    "lea       " MEMLEA(0x8,0) ",%0            \n"  // src_argb += 8
+    "punpcklbw %%xmm3,%%xmm0                   \n"  // bytes -> words
+    "movdqa    %%xmm0,%%xmm4                   \n"
+    "punpcklwd %%xmm3,%%xmm0                   \n"  // first pixel -> 4 dwords
+    "punpckhwd %%xmm3,%%xmm4                   \n"  // second pixel -> 4 dwords
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"  // X as floats (first pixel)
+    "cvtdq2ps  %%xmm4,%%xmm4                   \n"  // X as floats (second pixel)
+    "movdqa    %%xmm0,%%xmm1                   \n"  // keep X for the power terms
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"  // C1 * X. NOTE(review): SSE memory operand, poly presumably must be 16-byte aligned - confirm
+    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
+    "addps     " MEMACCESS(3) ",%%xmm0         \n"  // C0 + C1 * X
+    "addps     " MEMACCESS(3) ",%%xmm4         \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "movdqa    %%xmm5,%%xmm6                   \n"
+    "mulps     %%xmm1,%%xmm2                   \n"  // X * X
+    "mulps     %%xmm5,%%xmm6                   \n"
+    "mulps     %%xmm2,%%xmm1                   \n"  // X * X * X
+    "mulps     %%xmm6,%%xmm5                   \n"
+    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"  // C2 * X * X
+    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
+    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"  // C3 * X * X * X
+    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
+    "addps     %%xmm2,%%xmm0                   \n"  // accumulate the quadratic term
+    "addps     %%xmm6,%%xmm4                   \n"
+    "addps     %%xmm1,%%xmm0                   \n"  // accumulate the cubic term
+    "addps     %%xmm5,%%xmm4                   \n"
+    "cvttps2dq %%xmm0,%%xmm0                   \n"  // float -> int32 with truncation
+    "cvttps2dq %%xmm4,%%xmm4                   \n"
+    "packuswb  %%xmm4,%%xmm0                   \n"  // first of two saturating word->byte packs
+    "packuswb  %%xmm0,%%xmm0                   \n"  // second pack leaves 8 result bytes in the low half
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 8 bytes (2 pixels)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_argb += 8
+    "sub       $0x2,%2                         \n"  // width -= 2
+    "jg        1b                              \n"  // tested after the body, so the body runs at least once
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(poly)        // %3
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,  // Per byte lane X: C0 + C1*X + C2*X*X + C3*X*X*X.
+                            uint8* dst_argb, const float* poly,  // poly is read as 4 vectors of 4 floats: C0, C1, C2, C3.
+                            int width) {
+  asm volatile (
+    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"  // C0 in both 128-bit lanes
+    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"  // C1
+    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"  // C2
+    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"  // C3
+
+    // 2 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
+    "lea         " MEMLEA(0x8,0) ",%0          \n"  // src_argb += 8
+    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
+    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
+    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
+    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
+    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
+    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
+    "vcvttps2dq  %%ymm0,%%ymm0                 \n"  // float -> int32 with truncation
+    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"  // dwords -> words, unsigned saturation, per lane
+    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // bring both pixels' words into the low 128 bits
+    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"  // words -> bytes, unsigned saturation
+    "vmovq       %%xmm0," MEMACCESS(1) "       \n"  // store 8 bytes (2 pixels)
+    "lea         " MEMLEA(0x8,1) ",%1          \n"  // dst_argb += 8
+    "sub         $0x2,%2                       \n"  // width -= 2
+    "jg          1b                            \n"  // tested after the body, so the body runs at least once
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(poly)        // %3
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels in place with color table: byte c (0..3) of value v becomes table_argb[v * 4 + c].
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+                           int width) {
+  uintptr_t pixel_temp = 0u;  // scratch register: holds a byte value, then its table entry
+  asm volatile (
+    // 1 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movzb     " MEMACCESS(0) ",%1             \n"  // load byte 0 of the pixel
+    "lea       " MEMLEA(0x4,0) ",%0            \n"  // dst_argb += 4; the pixel is now at offsets -4..-1
+    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"  // write byte 0 back
+    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"  // load byte 1
+    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"  // write byte 1 back
+    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"  // load byte 2
+    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"  // write byte 2 back
+    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"  // load byte 3
+    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"  // write byte 3 back
+    "dec       %2                              \n"  // width -= 1
+    "jg        1b                              \n"  // tested after the body, so the body runs at least once
+  : "+r"(dst_argb),   // %0
+    "+d"(pixel_temp), // %1
+    "+r"(width)       // %2
+  : "r"(table_argb)   // %3
+  : "memory", "cc");
+}
+#endif  // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels in place with color table; byte 3 of each 4-byte pixel is left untouched.
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+  uintptr_t pixel_temp = 0u;  // scratch register: holds a byte value, then its table entry
+  asm volatile (
+    // 1 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movzb     " MEMACCESS(0) ",%1             \n"  // load byte 0 of the pixel
+    "lea       " MEMLEA(0x4,0) ",%0            \n"  // dst_argb += 4; the pixel is now at offsets -4..-1
+    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"  // write byte 0 back
+    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"  // load byte 1
+    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"  // write byte 1 back
+    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"  // load byte 2
+    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"  // write byte 2 back; offset -1 is never read or written
+    "dec       %2                              \n"  // width -= 1
+    "jg        1b                              \n"  // tested after the body, so the body runs at least once
+  : "+r"(dst_argb),   // %0
+    "+d"(pixel_temp), // %1
+    "+r"(width)       // %2
+  : "r"(table_argb)   // %3
+  : "memory", "cc");
+}
+#endif  // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table: a per-pixel weighted sum selects a 256-byte table inside luma; byte 3 is copied as is.
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                 int width,
+                                 const uint8* luma, uint32 lumacoeff) {
+  uintptr_t pixel_temp = 0u;  // scratch register: holds a byte value, then its table entry
+  uintptr_t table_temp = 0u;  // scratch register: address of the table chosen for the current pixel
+  asm volatile (
+    "movd      %6,%%xmm3                       \n"  // lumacoeff -> low dword of xmm3
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"  // broadcast lumacoeff to all 4 dwords
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = all ones
+    "psllw     $0x8,%%xmm4                     \n"  // xmm4 = 0xff00 in every word
+    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0, used to zero-extend
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"  // load 4 pixels (16 bytes)
+    "pmaddubsw %%xmm3,%%xmm0                   \n"  // multiply pixel bytes by coefficient bytes, add adjacent pairs
+    "phaddw    %%xmm0,%%xmm0                   \n"  // add the two pair sums: one 16-bit sum per pixel
+    "pand      %%xmm4,%%xmm0                   \n"  // keep the high byte only: a multiple of 256
+    "punpcklwd %%xmm5,%%xmm0                   \n"  // widen to 4 dword offsets, one per pixel
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"  // %1 = luma + offset: table for the first pixel
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"  // rotate the next pixel's offset into the low dword
+
+    "movzb     " MEMACCESS(2) ",%0             \n"  // first pixel, byte 0
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS(3) "            \n"  // store looked-up byte 0
+    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"  // byte 1
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
+    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"  // byte 2
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
+    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"  // byte 3 is copied without a table lookup
+    "mov       %b0," MEMACCESS2(0x3,3) "       \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"  // table for the second pixel
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"  // rotate the next pixel's offset into the low dword
+
+    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"  // second pixel, byte 0
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
+    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
+    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
+    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"  // byte 3 is copied without a table lookup
+    "mov       %b0," MEMACCESS2(0x7,3) "       \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"  // table for the third pixel
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"  // rotate the next pixel's offset into the low dword
+
+    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"  // third pixel, byte 0
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
+    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
+    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
+    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"  // byte 3 is copied without a table lookup
+    "mov       %b0," MEMACCESS2(0xb,3) "       \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"  // table for the fourth pixel
+
+    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"  // fourth pixel, byte 0
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
+    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
+    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
+    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"  // byte 3 is copied without a table lookup
+    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"  // src_argb += 16
+    "lea       " MEMLEA(0x10,3) ",%3           \n"  // dst_argb += 16
+    "sub       $0x4,%4                         \n"  // width -= 4
+    "jg        1b                              \n"  // tested after the body, so the body runs at least once
+  : "+d"(pixel_temp),  // %0
+    "+a"(table_temp),  // %1
+    "+r"(src_argb),    // %2
+    "+r"(dst_argb),    // %3
+    "+rm"(width)       // %4
+  : "r"(luma),         // %5
+    "rm"(lumacoeff)    // %6
+  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/row_mips.cc b/libs/libyuv/source/row_mips.cc
new file mode 100644
index 0000000000..2c55b786b2
--- /dev/null
+++ b/libs/libyuv/source/row_mips.cc
@@ -0,0 +1,782 @@
+/*
+ *  Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+#ifdef HAS_COPYROW_MIPS
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
+  __asm__ __volatile__ (
+    ".set      noreorder                         \n"
+    ".set      noat                              \n"
+    "slti      $at, %[count], 8                  \n"
+    "bne       $at ,$zero, $last8                \n"
+    "xor       $t8, %[src], %[dst]               \n"
+    "andi      $t8, $t8, 0x3                     \n"
+
+    "bne       $t8, $zero, unaligned             \n"
+    "negu      $a3, %[dst]                       \n"
+    // make dst/src aligned
+    "andi      $a3, $a3, 0x3                     \n"
+    "beq       $a3, $zero, $chk16w               \n"
+    // word-aligned now count is the remining bytes count
+    "subu     %[count], %[count], $a3            \n"
+
+    "lwr       $t8, 0(%[src])                    \n"
+    "addu      %[src], %[src], $a3               \n"
+    "swr       $t8, 0(%[dst])                    \n"
+    "addu      %[dst], %[dst], $a3               \n"
+
+    // Now the dst/src are mutually word-aligned with word-aligned addresses
+    "$chk16w:                                    \n"
+    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
+    // t8 is the byte count after 64-byte chunks
+    "beq       %[count], $t8, chk8w              \n"
+    // There will be at most 1 32-byte chunk after it
+    "subu      $a3, %[count], $t8                \n"  // the reminder
+    // Here a3 counts bytes in 16w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // Now a3 is the final dst after 64-byte chunks
+    "addu      $t0, %[dst], %[count]             \n"
+    // t0 is the "past the end" address
+
+    // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
+    // the "t0-32" address
+    // This means: for x=128 the last "safe" a1 address is "t0-160"
+    // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
+    // we will use "pref 30,128(a1)", so "t0-160" is the limit
+    "subu      $t9, $t0, 160                     \n"
+    // t9 is the "last safe pref 30,128(a1)" address
+    "pref      0, 0(%[src])                      \n"  // first line of src
+    "pref      0, 32(%[src])                     \n"  // second line of src
+    "pref      0, 64(%[src])                     \n"
+    "pref      30, 32(%[dst])                    \n"
+    // In case the a1 > t9 don't use "pref 30" at all
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bgtz      $v1, $loop16w                     \n"
+    "nop                                         \n"
+    // otherwise, start with using pref30
+    "pref      30, 64(%[dst])                    \n"
+    "$loop16w:                                    \n"
+    "pref      0, 96(%[src])                     \n"
+    "lw        $t0, 0(%[src])                    \n"
+    "bgtz      $v1, $skip_pref30_96              \n"  // skip
+    "lw        $t1, 4(%[src])                    \n"
+    "pref      30, 96(%[dst])                    \n"  // continue
+    "$skip_pref30_96:                            \n"
+    "lw        $t2, 8(%[src])                    \n"
+    "lw        $t3, 12(%[src])                   \n"
+    "lw        $t4, 16(%[src])                   \n"
+    "lw        $t5, 20(%[src])                   \n"
+    "lw        $t6, 24(%[src])                   \n"
+    "lw        $t7, 28(%[src])                   \n"
+    "pref      0, 128(%[src])                    \n"
+    //  bring the next lines of src, addr 128
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "lw        $t0, 32(%[src])                   \n"
+    "bgtz      $v1, $skip_pref30_128             \n"  // skip pref 30,128(a1)
+    "lw        $t1, 36(%[src])                   \n"
+    "pref      30, 128(%[dst])                   \n"  // set dest, addr 128
+    "$skip_pref30_128:                           \n"
+    "lw        $t2, 40(%[src])                   \n"
+    "lw        $t3, 44(%[src])                   \n"
+    "lw        $t4, 48(%[src])                   \n"
+    "lw        $t5, 52(%[src])                   \n"
+    "lw        $t6, 56(%[src])                   \n"
+    "lw        $t7, 60(%[src])                   \n"
+    "pref      0, 160(%[src])                    \n"
+    // bring the next lines of src, addr 160
+    "sw        $t0, 32(%[dst])                   \n"
+    "sw        $t1, 36(%[dst])                   \n"
+    "sw        $t2, 40(%[dst])                   \n"
+    "sw        $t3, 44(%[dst])                   \n"
+    "sw        $t4, 48(%[dst])                   \n"
+    "sw        $t5, 52(%[dst])                   \n"
+    "sw        $t6, 56(%[dst])                   \n"
+    "sw        $t7, 60(%[dst])                   \n"
+
+    "addiu     %[dst], %[dst], 64                \n"  // adding 64 to dest
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bne       %[dst], $a3, $loop16w             \n"
+    " addiu    %[src], %[src], 64                \n"  // adding 64 to src
+    "move      %[count], $t8                     \n"
+
+    // Here we have src and dest word-aligned but less than 64-bytes to go
+
+    "chk8w:                                      \n"
+    "pref      0, 0x0(%[src])                    \n"
+    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
+    // the t8 is the reminder count past 32-bytes
+    "beq       %[count], $t8, chk1w              \n"
+    // count=t8,no 32-byte chunk
+    " nop                                        \n"
+
+    "lw        $t0, 0(%[src])                    \n"
+    "lw        $t1, 4(%[src])                    \n"
+    "lw        $t2, 8(%[src])                    \n"
+    "lw        $t3, 12(%[src])                   \n"
+    "lw        $t4, 16(%[src])                   \n"
+    "lw        $t5, 20(%[src])                   \n"
+    "lw        $t6, 24(%[src])                   \n"
+    "lw        $t7, 28(%[src])                   \n"
+    "addiu     %[src], %[src], 32                \n"
+
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "addiu     %[dst], %[dst], 32                \n"
+
+    "chk1w:                                      \n"
+    "andi      %[count], $t8, 0x3                \n"
+    // now count is the reminder past 1w chunks
+    "beq       %[count], $t8, $last8             \n"
+    " subu     $a3, $t8, %[count]                \n"
+    // a3 is count of bytes in 1w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // now a3 is the dst address past the 1w chunks
+    // copying in words (4-byte chunks)
+    "$wordCopy_loop:                             \n"
+    "lw        $t3, 0(%[src])                    \n"
+    // the first t3 may be equal t0 ... optimize?
+    "addiu     %[src], %[src],4                  \n"
+    "addiu     %[dst], %[dst],4                  \n"
+    "bne       %[dst], $a3,$wordCopy_loop        \n"
+    " sw       $t3, -4(%[dst])                   \n"
+
+    // For the last (<8) bytes
+    "$last8:                                     \n"
+    "blez      %[count], leave                   \n"
+    " addu     $a3, %[dst], %[count]             \n"  // a3 -last dst address
+    "$last8loop:                                 \n"
+    "lb        $v1, 0(%[src])                    \n"
+    "addiu     %[src], %[src], 1                 \n"
+    "addiu     %[dst], %[dst], 1                 \n"
+    "bne       %[dst], $a3, $last8loop           \n"
+    " sb       $v1, -1(%[dst])                   \n"
+
+    "leave:                                      \n"
+    "  j       $ra                               \n"
+    "  nop                                       \n"
+
+    //
+    // UNALIGNED case
+    //
+
+    "unaligned:                                  \n"
+    // got here with a3="negu a1"
+    "andi      $a3, $a3, 0x3                     \n"  // a1 is word aligned?
+    "beqz      $a3, $ua_chk16w                   \n"
+    " subu     %[count], %[count], $a3           \n"
+    // bytes left after initial a3 bytes
+    "lwr       $v1, 0(%[src])                    \n"
+    "lwl       $v1, 3(%[src])                    \n"
+    "addu      %[src], %[src], $a3               \n"  // a3 may be 1, 2 or 3
+    "swr       $v1, 0(%[dst])                    \n"
+    "addu      %[dst], %[dst], $a3               \n"
+    // below the dst will be word aligned (NOTE1)
+    "$ua_chk16w:                                 \n"
+    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
+    // t8 is the byte count after 64-byte chunks
+    "beq       %[count], $t8, ua_chk8w           \n"
+    // if a2==t8, no 64-byte chunks
+    // There will be at most 1 32-byte chunk after it
+    "subu      $a3, %[count], $t8                \n"  // the reminder
+    // Here a3 counts bytes in 16w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // Now a3 is the final dst after 64-byte chunks
+    "addu      $t0, %[dst], %[count]             \n"  // t0 "past the end"
+    "subu      $t9, $t0, 160                     \n"
+    // t9 is the "last safe pref 30,128(a1)" address
+    "pref      0, 0(%[src])                      \n"  // first line of src
+    "pref      0, 32(%[src])                     \n"  // second line  addr 32
+    "pref      0, 64(%[src])                     \n"
+    "pref      30, 32(%[dst])                    \n"
+    // safe, as we have at least 64 bytes ahead
+    // In case the a1 > t9 don't use "pref 30" at all
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bgtz      $v1, $ua_loop16w                  \n"
+    // skip "pref 30,64(a1)" for too short arrays
+    " nop                                        \n"
+    // otherwise, start with using pref30
+    "pref      30, 64(%[dst])                    \n"
+    "$ua_loop16w:                                \n"
+    "pref      0, 96(%[src])                     \n"
+    "lwr       $t0, 0(%[src])                    \n"
+    "lwl       $t0, 3(%[src])                    \n"
+    "lwr       $t1, 4(%[src])                    \n"
+    "bgtz      $v1, $ua_skip_pref30_96           \n"
+    " lwl      $t1, 7(%[src])                    \n"
+    "pref      30, 96(%[dst])                    \n"
+    // continue setting up the dest, addr 96
+    "$ua_skip_pref30_96:                         \n"
+    "lwr       $t2, 8(%[src])                    \n"
+    "lwl       $t2, 11(%[src])                   \n"
+    "lwr       $t3, 12(%[src])                   \n"
+    "lwl       $t3, 15(%[src])                   \n"
+    "lwr       $t4, 16(%[src])                   \n"
+    "lwl       $t4, 19(%[src])                   \n"
+    "lwr       $t5, 20(%[src])                   \n"
+    "lwl       $t5, 23(%[src])                   \n"
+    "lwr       $t6, 24(%[src])                   \n"
+    "lwl       $t6, 27(%[src])                   \n"
+    "lwr       $t7, 28(%[src])                   \n"
+    "lwl       $t7, 31(%[src])                   \n"
+    "pref      0, 128(%[src])                    \n"
+    // bring the next lines of src, addr 128
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "lwr       $t0, 32(%[src])                   \n"
+    "lwl       $t0, 35(%[src])                   \n"
+    "lwr       $t1, 36(%[src])                   \n"
+    "bgtz      $v1, ua_skip_pref30_128           \n"
+    " lwl      $t1, 39(%[src])                   \n"
+    "pref      30, 128(%[dst])                   \n"
+    // continue setting up the dest, addr 128
+    "ua_skip_pref30_128:                         \n"
+
+    "lwr       $t2, 40(%[src])                   \n"
+    "lwl       $t2, 43(%[src])                   \n"
+    "lwr       $t3, 44(%[src])                   \n"
+    "lwl       $t3, 47(%[src])                   \n"
+    "lwr       $t4, 48(%[src])                   \n"
+    "lwl       $t4, 51(%[src])                   \n"
+    "lwr       $t5, 52(%[src])                   \n"
+    "lwl       $t5, 55(%[src])                   \n"
+    "lwr       $t6, 56(%[src])                   \n"
+    "lwl       $t6, 59(%[src])                   \n"
+    "lwr       $t7, 60(%[src])                   \n"
+    "lwl       $t7, 63(%[src])                   \n"
+    "pref      0, 160(%[src])                    \n"
+    // bring the next lines of src, addr 160
+    "sw        $t0, 32(%[dst])                   \n"
+    "sw        $t1, 36(%[dst])                   \n"
+    "sw        $t2, 40(%[dst])                   \n"
+    "sw        $t3, 44(%[dst])                   \n"
+    "sw        $t4, 48(%[dst])                   \n"
+    "sw        $t5, 52(%[dst])                   \n"
+    "sw        $t6, 56(%[dst])                   \n"
+    "sw        $t7, 60(%[dst])                   \n"
+
+    "addiu     %[dst],%[dst],64                  \n"  // adding 64 to dest
+    "sgtu      $v1,%[dst],$t9                    \n"
+    "bne       %[dst],$a3,$ua_loop16w            \n"
+    " addiu    %[src],%[src],64                  \n"  // adding 64 to src
+    "move      %[count],$t8                      \n"
+
+    // Here we have src and dest word-aligned but less than 64-bytes to go
+
+    "ua_chk8w:                                   \n"
+    "pref      0, 0x0(%[src])                    \n"
+    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
+    // the t8 is the reminder count
+    "beq       %[count], $t8, $ua_chk1w          \n"
+    // when count==t8, no 32-byte chunk
+
+    "lwr       $t0, 0(%[src])                    \n"
+    "lwl       $t0, 3(%[src])                    \n"
+    "lwr       $t1, 4(%[src])                    \n"
+    "lwl       $t1, 7(%[src])                    \n"
+    "lwr       $t2, 8(%[src])                    \n"
+    "lwl       $t2, 11(%[src])                   \n"
+    "lwr       $t3, 12(%[src])                   \n"
+    "lwl       $t3, 15(%[src])                   \n"
+    "lwr       $t4, 16(%[src])                   \n"
+    "lwl       $t4, 19(%[src])                   \n"
+    "lwr       $t5, 20(%[src])                   \n"
+    "lwl       $t5, 23(%[src])                   \n"
+    "lwr       $t6, 24(%[src])                   \n"
+    "lwl       $t6, 27(%[src])                   \n"
+    "lwr       $t7, 28(%[src])                   \n"
+    "lwl       $t7, 31(%[src])                   \n"
+    "addiu     %[src], %[src], 32                \n"
+
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "addiu     %[dst], %[dst], 32                \n"
+
+    "$ua_chk1w:                                  \n"
+    "andi      %[count], $t8, 0x3                \n"
+    // now count is the reminder past 1w chunks
+    "beq       %[count], $t8, ua_smallCopy       \n"
+    "subu      $a3, $t8, %[count]                \n"
+    // a3 is count of bytes in 1w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // now a3 is the dst address past the 1w chunks
+
+    // copying in words (4-byte chunks)
+    "$ua_wordCopy_loop:                          \n"
+    "lwr       $v1, 0(%[src])                    \n"
+    "lwl       $v1, 3(%[src])                    \n"
+    "addiu     %[src], %[src], 4                 \n"
+    "addiu     %[dst], %[dst], 4                 \n"
+    // note: dst=a1 is word aligned here, see NOTE1
+    "bne       %[dst], $a3, $ua_wordCopy_loop    \n"
+    " sw       $v1,-4(%[dst])                    \n"
+
+    // Now less than 4 bytes (value in count) left to copy
+    "ua_smallCopy:                               \n"
+    "beqz      %[count], leave                   \n"
+    " addu     $a3, %[dst], %[count]             \n" // a3 = last dst address
+    "$ua_smallCopy_loop:                         \n"
+    "lb        $v1, 0(%[src])                    \n"
+    "addiu     %[src], %[src], 1                 \n"
+    "addiu     %[dst], %[dst], 1                 \n"
+    "bne       %[dst],$a3,$ua_smallCopy_loop     \n"
+    " sb       $v1, -1(%[dst])                   \n"
+
+    "j         $ra                               \n"
+    " nop                                        \n"
+    ".set      at                                \n"
+    ".set      reorder                           \n"
+       : [dst] "+r" (dst), [src] "+r" (src)
+       : [count] "r" (count)
+       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
+       "t8", "t9", "a3", "v1", "at"
+  );
+}
+#endif  // HAS_COPYROW_MIPS
+
+// DSPR2 functions
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
+    (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+
+void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                           int width) {  // de-interleave width UV byte pairs
+  __asm__ __volatile__ (
+    ".set push                                     \n"
+    ".set noreorder                                \n"
+    "srl             $t4, %[width], 4              \n"  // multiples of 16
+    "blez            $t4, 2f                       \n"
+    " andi           %[width], %[width], 0xf       \n"  // residual
+
+  "1:                                              \n"  // 16 pairs per pass
+    "addiu           $t4, $t4, -1                  \n"
+    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
+    "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
+    "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
+    "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
+    "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
+    "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 | U10
+    "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 | U12
+    "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 | U14
+    "addiu           %[src_uv], %[src_uv], 32      \n"
+    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
+    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
+    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
+    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
+    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
+    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
+    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
+    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
+    "sw              $t9, 0(%[dst_v])              \n"
+    "sw              $t0, 0(%[dst_u])              \n"
+    "sw              $t1, 4(%[dst_v])              \n"
+    "sw              $t2, 4(%[dst_u])              \n"
+    "sw              $t3, 8(%[dst_v])              \n"
+    "sw              $t5, 8(%[dst_u])              \n"
+    "sw              $t6, 12(%[dst_v])             \n"
+    "sw              $t7, 12(%[dst_u])             \n"
+    "addiu           %[dst_v], %[dst_v], 16        \n"
+    "bgtz            $t4, 1b                       \n"
+    " addiu          %[dst_u], %[dst_u], 16        \n"  // branch delay slot
+
+  "2:                                              \n"  // residual entry point
+    "beqz            %[width], 3f                  \n"  // also covers width == 0
+    " nop                                          \n"
+  "4:                                              \n"  // one pair per pass
+    "lbu             $t0, 0(%[src_uv])             \n"
+    "lbu             $t1, 1(%[src_uv])             \n"
+    "addiu           %[src_uv], %[src_uv], 2       \n"
+    "addiu           %[width], %[width], -1        \n"
+    "sb              $t0, 0(%[dst_u])              \n"
+    "sb              $t1, 0(%[dst_v])              \n"
+    "addiu           %[dst_u], %[dst_u], 1         \n"
+    "bgtz            %[width], 4b                  \n"
+    " addiu          %[dst_v], %[dst_v], 1         \n"  // branch delay slot
+
+  "3:                                              \n"
+    ".set pop                                      \n"
+     : [src_uv] "+r" (src_uv),
+       [width] "+r" (width),
+       [dst_u] "+r" (dst_u),
+       [dst_v] "+r" (dst_v)
+     :
+     : "t0", "t1", "t2", "t3",  // "memory": asm loads/stores via the pointers
+     "t4", "t5", "t6", "t7", "t8", "t9", "memory"
+  );
+}
+
+void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {  // reverse row
+  __asm__ __volatile__ (
+    ".set push                             \n"
+    ".set noreorder                        \n"
+
+    "srl       $t4, %[width], 4            \n"  // multiples of 16
+    "andi      $t5, %[width], 0xf          \n"  // residual bytes
+    "blez      $t4, 2f                     \n"
+    " addu     %[src], %[src], %[width]    \n"  // src += width
+
+   "1:                                     \n"  // 16 bytes per pass
+    "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|
+    "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|
+    "lw        $t2, -8(%[src])             \n"  // |11|10|9|8|
+    "lw        $t3, -4(%[src])             \n"  // |15|14|13|12|
+    "wsbh      $t0, $t0                    \n"  // |2|3|0|1|
+    "wsbh      $t1, $t1                    \n"  // |6|7|4|5|
+    "wsbh      $t2, $t2                    \n"  // |10|11|8|9|
+    "wsbh      $t3, $t3                    \n"  // |14|15|12|13|
+    "rotr      $t0, $t0, 16                \n"  // |0|1|2|3|
+    "rotr      $t1, $t1, 16                \n"  // |4|5|6|7|
+    "rotr      $t2, $t2, 16                \n"  // |8|9|10|11|
+    "rotr      $t3, $t3, 16                \n"  // |12|13|14|15|
+    "addiu     %[src], %[src], -16         \n"
+    "addiu     $t4, $t4, -1                \n"
+    "sw        $t3, 0(%[dst])              \n"  // |15|14|13|12|
+    "sw        $t2, 4(%[dst])              \n"  // |11|10|9|8|
+    "sw        $t1, 8(%[dst])              \n"  // |7|6|5|4|
+    "sw        $t0, 12(%[dst])             \n"  // |3|2|1|0|
+    "bgtz      $t4, 1b                     \n"
+    " addiu    %[dst], %[dst], 16          \n"  // branch delay slot
+   "2:                                     \n"  // residual entry point
+    "beqz      $t5, 3f                     \n"  // also covers width == 0
+    " nop                                  \n"
+   "4:                                     \n"  // one byte per pass
+    "lbu       $t0, -1(%[src])             \n"
+    "addiu     $t5, $t5, -1                \n"
+    "addiu     %[src], %[src], -1          \n"
+    "sb        $t0, 0(%[dst])              \n"
+    "bgtz      $t5, 4b                     \n"  // was bgez: ran one byte too far
+    " addiu    %[dst], %[dst], 1           \n"  // branch delay slot
+
+   "3:                                     \n"
+    ".set pop                              \n"
+      : [src] "+r" (src), [dst] "+r" (dst)
+      : [width] "r" (width)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "memory"  // asm touches *src/*dst
+  );
+}
+
+void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                            int width) {  // reverse UV pairs and split to U, V
+  int x = 0;  // asm scratch: number of 16-pair passes
+  int y = 0;  // asm scratch: residual pair count
+  __asm__ __volatile__ (
+    ".set push                                    \n"
+    ".set noreorder                               \n"
+
+    "addu            $t4, %[width], %[width]      \n"  // bytes = 2 * width
+    "srl             %[x], %[width], 4            \n"
+    "andi            %[y], %[width], 0xf          \n"
+    "blez            %[x], 2f                     \n"
+    " addu           %[src_uv], %[src_uv], $t4    \n"  // point past the row end
+
+   "1:                                            \n"
+    "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|
+    "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|
+    "lw              $t2, -24(%[src_uv])          \n"  // |11|10|9|8|
+    "lw              $t3, -20(%[src_uv])          \n"  // |15|14|13|12|
+    "lw              $t4, -16(%[src_uv])          \n"  // |19|18|17|16|
+    "lw              $t6, -12(%[src_uv])          \n"  // |23|22|21|20|
+    "lw              $t7, -8(%[src_uv])           \n"  // |27|26|25|24|
+    "lw              $t8, -4(%[src_uv])           \n"  // |31|30|29|28|
+
+    "rotr            $t0, $t0, 16                 \n"  // |1|0|3|2|
+    "rotr            $t1, $t1, 16                 \n"  // |5|4|7|6|
+    "rotr            $t2, $t2, 16                 \n"  // |9|8|11|10|
+    "rotr            $t3, $t3, 16                 \n"  // |13|12|15|14|
+    "rotr            $t4, $t4, 16                 \n"  // |17|16|19|18|
+    "rotr            $t6, $t6, 16                 \n"  // |21|20|23|22|
+    "rotr            $t7, $t7, 16                 \n"  // |25|24|27|26|
+    "rotr            $t8, $t8, 16                 \n"  // |29|28|31|30|
+    "precr.qb.ph     $t9, $t0, $t1                \n"  // |0|2|4|6|
+    "precrq.qb.ph    $t5, $t0, $t1                \n"  // |1|3|5|7|
+    "precr.qb.ph     $t0, $t2, $t3                \n"  // |8|10|12|14|
+    "precrq.qb.ph    $t1, $t2, $t3                \n"  // |9|11|13|15|
+    "precr.qb.ph     $t2, $t4, $t6                \n"  // |16|18|20|22|
+    "precrq.qb.ph    $t3, $t4, $t6                \n"  // |17|19|21|23|
+    "precr.qb.ph     $t4, $t7, $t8                \n"  // |24|26|28|30|
+    "precrq.qb.ph    $t6, $t7, $t8                \n"  // |25|27|29|31|
+    "addiu           %[src_uv], %[src_uv], -32    \n"
+    "addiu           %[x], %[x], -1               \n"
+    "swr             $t4, 0(%[dst_u])             \n"
+    "swl             $t4, 3(%[dst_u])             \n"  // |30|28|26|24|
+    "swr             $t6, 0(%[dst_v])             \n"
+    "swl             $t6, 3(%[dst_v])             \n"  // |31|29|27|25|
+    "swr             $t2, 4(%[dst_u])             \n"
+    "swl             $t2, 7(%[dst_u])             \n"  // |22|20|18|16|
+    "swr             $t3, 4(%[dst_v])             \n"
+    "swl             $t3, 7(%[dst_v])             \n"  // |23|21|19|17|
+    "swr             $t0, 8(%[dst_u])             \n"
+    "swl             $t0, 11(%[dst_u])            \n"  // |14|12|10|8|
+    "swr             $t1, 8(%[dst_v])             \n"
+    "swl             $t1, 11(%[dst_v])            \n"  // |15|13|11|9|
+    "swr             $t9, 12(%[dst_u])            \n"
+    "swl             $t9, 15(%[dst_u])            \n"  // |6|4|2|0|
+    "swr             $t5, 12(%[dst_v])            \n"
+    "swl             $t5, 15(%[dst_v])            \n"  // |7|5|3|1|
+    "addiu           %[dst_v], %[dst_v], 16       \n"
+    "bgtz            %[x], 1b                     \n"
+    " addiu          %[dst_u], %[dst_u], 16       \n"  // branch delay slot
+
+   "2:                                            \n"  // residual entry point
+    "beqz            %[y], 3f                     \n"  // also covers width == 0
+    " nop                                         \n"
+
+   "4:                                            \n"  // one pair per pass
+    "lbu             $t0, -2(%[src_uv])           \n"
+    "lbu             $t1, -1(%[src_uv])           \n"
+    "addiu           %[src_uv], %[src_uv], -2     \n"
+    "addiu           %[y], %[y], -1               \n"
+    "sb              $t0, 0(%[dst_u])             \n"
+    "sb              $t1, 0(%[dst_v])             \n"
+    "addiu           %[dst_u], %[dst_u], 1        \n"
+    "bgtz            %[y], 4b                     \n"
+    " addiu          %[dst_v], %[dst_v], 1        \n"  // branch delay slot
+
+   "3:                                            \n"
+    ".set pop                                     \n"
+      : [src_uv] "+r" (src_uv),
+        [dst_u] "+r" (dst_u),
+        [dst_v] "+r" (dst_v),
+        [x] "=&r" (x),
+        [y] "+r" (y)
+      : [width] "r" (width)
+      : "t0", "t1", "t2", "t3", "t4",  // t6 is written above, so it is listed
+      "t5", "t6", "t7", "t8", "t9", "memory"
+  );
+}
+
+// Convert 4 Y, 2 U and 2 V bytes (I422) to RGB. Reads constants from $s0-$s5,
+// which the caller must preload (see I422ToARGBRow_DSPR2). Advances u_buf and
+// v_buf by 2 but leaves y_buf untouched. Writes t0-t6, t8 and t9. Results:
+// t5 = | 0 | B0 | 0 | b0 |    t4 = | 0 | B1 | 0 | b1 |
+// t9 = | 0 | G0 | 0 | g0 |    t8 = | 0 | G1 | 0 | g1 |
+// t2 = | 0 | R0 | 0 | r0 |    t1 = | 0 | R1 | 0 | r1 |
+// The final subtract-128 / shll_s 8 / shra 8 / add-128 steps clamp to 0..255.
+#define YUVTORGB                                                               \
+      "lw                $t0, 0(%[y_buf])       \n"                            \
+      "lhu               $t1, 0(%[u_buf])       \n"                            \
+      "lhu               $t2, 0(%[v_buf])       \n"                            \
+      "preceu.ph.qbr     $t1, $t1               \n"                            \
+      "preceu.ph.qbr     $t2, $t2               \n"                            \
+      "preceu.ph.qbra    $t3, $t0               \n"                            \
+      "preceu.ph.qbla    $t0, $t0               \n"                            \
+      "subu.ph           $t1, $t1, $s5          \n"                            \
+      "subu.ph           $t2, $t2, $s5          \n"                            \
+      "subu.ph           $t3, $t3, $s4          \n"                            \
+      "subu.ph           $t0, $t0, $s4          \n"                            \
+      "mul.ph            $t3, $t3, $s0          \n"                            \
+      "mul.ph            $t0, $t0, $s0          \n"                            \
+      "shll.ph           $t4, $t1, 0x7          \n"                            \
+      "subu.ph           $t4, $t4, $t1          \n"                            \
+      "mul.ph            $t6, $t1, $s1          \n"                            \
+      "mul.ph            $t1, $t2, $s2          \n"                            \
+      "addq_s.ph         $t5, $t4, $t3          \n"                            \
+      "addq_s.ph         $t4, $t4, $t0          \n"                            \
+      "shra.ph           $t5, $t5, 6            \n"                            \
+      "shra.ph           $t4, $t4, 6            \n"                            \
+      "addiu             %[u_buf], 2            \n"                            \
+      "addiu             %[v_buf], 2            \n"                            \
+      "addu.ph           $t6, $t6, $t1          \n"                            \
+      "mul.ph            $t1, $t2, $s3          \n"                            \
+      "addu.ph           $t9, $t6, $t3          \n"                            \
+      "addu.ph           $t8, $t6, $t0          \n"                            \
+      "shra.ph           $t9, $t9, 6            \n"                            \
+      "shra.ph           $t8, $t8, 6            \n"                            \
+      "addu.ph           $t2, $t1, $t3          \n"                            \
+      "addu.ph           $t1, $t1, $t0          \n"                            \
+      "shra.ph           $t2, $t2, 6            \n"                            \
+      "shra.ph           $t1, $t1, 6            \n"                            \
+      "subu.ph           $t5, $t5, $s5          \n"                            \
+      "subu.ph           $t4, $t4, $s5          \n"                            \
+      "subu.ph           $t9, $t9, $s5          \n"                            \
+      "subu.ph           $t8, $t8, $s5          \n"                            \
+      "subu.ph           $t2, $t2, $s5          \n"                            \
+      "subu.ph           $t1, $t1, $s5          \n"                            \
+      "shll_s.ph         $t5, $t5, 8            \n"                            \
+      "shll_s.ph         $t4, $t4, 8            \n"                            \
+      "shll_s.ph         $t9, $t9, 8            \n"                            \
+      "shll_s.ph         $t8, $t8, 8            \n"                            \
+      "shll_s.ph         $t2, $t2, 8            \n"                            \
+      "shll_s.ph         $t1, $t1, 8            \n"                            \
+      "shra.ph           $t5, $t5, 8            \n"                            \
+      "shra.ph           $t4, $t4, 8            \n"                            \
+      "shra.ph           $t9, $t9, 8            \n"                            \
+      "shra.ph           $t8, $t8, 8            \n"                            \
+      "shra.ph           $t2, $t2, 8            \n"                            \
+      "shra.ph           $t1, $t1, 8            \n"                            \
+      "addu.ph           $t5, $t5, $s5          \n"                            \
+      "addu.ph           $t4, $t4, $s5          \n"                            \
+      "addu.ph           $t9, $t9, $s5          \n"                            \
+      "addu.ph           $t8, $t8, $s5          \n"                            \
+      "addu.ph           $t2, $t2, $s5          \n"                            \
+      "addu.ph           $t1, $t1, $s5          \n"
+
+// TODO(fbarchard): accept yuv conversion constants (yuvconstants is unused).
+void I422ToARGBRow_DSPR2(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              const struct YuvConstants* yuvconstants,
+                              int width) {  // 4 pixels are written per pass
+  __asm__ __volatile__ (
+    ".set push                                \n"
+    ".set noreorder                           \n"
+    "blez              %[width], 2f           \n"  // nothing to convert
+    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|
+    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
+    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
+    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
+    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
+    "repl.ph           $s5, 128               \n"  // |128|128| // clipping
+    "lui               $s6, 0xff00            \n"
+    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|
+
+   "1:                                        \n"
+      YUVTORGB
+// Arranging into argb format
+    "precr.qb.ph       $t4, $t8, $t4          \n"  // |G1|g1|B1|b1|
+    "precr.qb.ph       $t5, $t9, $t5          \n"  // |G0|g0|B0|b0|
+    "addiu             %[width], -4           \n"
+    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |G1|B1|G0|B0|
+    "precr.qb.ph       $t9, $t4, $t5          \n"  // |g1|b1|g0|b0|
+    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|
+
+    "addiu             %[y_buf], 4            \n"
+    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|
+    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|
+    "or                $t1, $t1, $s6          \n"  // |ff|R1|ff|R0|
+    "or                $t2, $t2, $s6          \n"  // |ff|r1|ff|r0|
+    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|r1|g1|b1|
+    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|R1|G1|B1|
+    "sll               $t9, $t9, 16           \n"
+    "sll               $t8, $t8, 16           \n"
+    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|r0|g0|b0|
+    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|R0|G0|B0|
+// Store results.
+    "sw                $t2, 0(%[rgb_buf])     \n"
+    "sw                $t0, 4(%[rgb_buf])     \n"
+    "sw                $t1, 8(%[rgb_buf])     \n"
+    "sw                $t3, 12(%[rgb_buf])    \n"
+    "bgtz              %[width], 1b           \n"  // was bnez: never ended if width % 4 != 0
+    " addiu            %[rgb_buf], 16         \n"  // branch delay slot
+   "2:                                        \n"
+    ".set pop                                 \n"
+      :[y_buf] "+r" (y_buf),
+       [u_buf] "+r" (u_buf),
+       [v_buf] "+r" (v_buf),
+       [width] "+r" (width),
+       [rgb_buf] "+r" (rgb_buf)
+      :
+      : "t0", "t1",  "t2", "t3",  "t4", "t5",
+      "t6", "t7", "t8", "t9",
+      "s0", "s1", "s2", "s3",
+      "s4", "s5", "s6", "memory"  // "memory": asm loads/stores via the pointers
+  );
+}
+
+// Bilinear filter 8x2 -> 8x1: blends two rows by source_y_fraction / 256.
+void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                               ptrdiff_t src_stride, int dst_width,
+                               int source_y_fraction) {
+  int y0_fraction = 256 - source_y_fraction;  // weight of the first row
+  const uint8* src_ptr1 = src_ptr + src_stride;  // second row
+
+  __asm__ __volatile__ (
+     ".set push                                           \n"
+     ".set noreorder                                      \n"
+     "blez              %[dst_width], 2f                  \n"  // nothing to do
+     " replv.ph         $t0, %[y0_fraction]               \n"  // delay slot
+     "replv.ph          $t1, %[source_y_fraction]         \n"
+
+   "1:                                                    \n"  // 8 bytes/pass
+     "lw                $t2, 0(%[src_ptr])                \n"
+     "lw                $t3, 0(%[src_ptr1])               \n"
+     "lw                $t4, 4(%[src_ptr])                \n"
+     "lw                $t5, 4(%[src_ptr1])               \n"
+     "muleu_s.ph.qbl    $t6, $t2, $t0                     \n"
+     "muleu_s.ph.qbr    $t7, $t2, $t0                     \n"
+     "muleu_s.ph.qbl    $t8, $t3, $t1                     \n"
+     "muleu_s.ph.qbr    $t9, $t3, $t1                     \n"
+     "muleu_s.ph.qbl    $t2, $t4, $t0                     \n"
+     "muleu_s.ph.qbr    $t3, $t4, $t0                     \n"
+     "muleu_s.ph.qbl    $t4, $t5, $t1                     \n"
+     "muleu_s.ph.qbr    $t5, $t5, $t1                     \n"
+     "addq.ph           $t6, $t6, $t8                     \n"
+     "addq.ph           $t7, $t7, $t9                     \n"
+     "addq.ph           $t2, $t2, $t4                     \n"
+     "addq.ph           $t3, $t3, $t5                     \n"
+     "shra.ph           $t6, $t6, 8                       \n"
+     "shra.ph           $t7, $t7, 8                       \n"
+     "shra.ph           $t2, $t2, 8                       \n"
+     "shra.ph           $t3, $t3, 8                       \n"
+     "precr.qb.ph       $t6, $t6, $t7                     \n"
+     "precr.qb.ph       $t2, $t2, $t3                     \n"
+     "addiu             %[src_ptr], %[src_ptr], 8         \n"
+     "addiu             %[src_ptr1], %[src_ptr1], 8       \n"
+     "addiu             %[dst_width], %[dst_width], -8    \n"
+     "sw                $t6, 0(%[dst_ptr])                \n"
+     "sw                $t2, 4(%[dst_ptr])                \n"
+     "bgtz              %[dst_width], 1b                  \n"
+     " addiu            %[dst_ptr], %[dst_ptr], 8         \n"  // delay slot
+   "2:                                                    \n"
+     ".set pop                                            \n"
+  : [dst_ptr] "+r" (dst_ptr),
+    [src_ptr1] "+r" (src_ptr1),
+    [src_ptr] "+r" (src_ptr),
+    [dst_width] "+r" (dst_width)
+  : [source_y_fraction] "r" (source_y_fraction),
+    [y0_fraction] "r" (y0_fraction),
+    [src_stride] "r" (src_stride)  // not referenced by the asm text
+  : "t0", "t1", "t2", "t3", "t4", "t5",
+    "t6", "t7", "t8", "t9", "memory"  // asm loads/stores via the pointers
+  );
+}
+#endif  // __mips_dsp_rev >= 2
+
+#endif  // defined(__mips__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/row_neon.cc b/libs/libyuv/source/row_neon.cc
new file mode 100644
index 0000000000..13fe95cb7c
--- /dev/null
+++ b/libs/libyuv/source/row_neon.cc
@@ -0,0 +1,2839 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422. Leaves Y in d0 and U/V packed in d2.
+#define READYUV422                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n" /* d0 = 8 Y, %0 += 8      */\
+    MEMACCESS(1)                                                               \
+    "vld1.32    {d2[0]}, [%1]!                 \n" /* d2 low  = 4 U, %1 += 4 */\
+    MEMACCESS(2)                                                               \
+    "vld1.32    {d2[1]}, [%2]!                 \n" /* d2 high = 4 V, %2 += 4 */
+
+// Read 8 Y, 2 U and 2 V from 411. Each U and V is doubled so d2 = 4 U, 4 V.
+#define READYUV411                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n" /* d0 = 8 Y, %0 += 8      */\
+    MEMACCESS(1)                                                               \
+    "vld1.16    {d2[0]}, [%1]!                 \n" /* d2 lane 0 = 2 U        */\
+    MEMACCESS(2)                                                               \
+    "vld1.16    {d2[1]}, [%2]!                 \n" /* d2 lane 1 = 2 V        */\
+    "vmov.u8    d3, d2                         \n" /* copy to zip with self  */\
+    "vzip.u8    d2, d3                         \n" /* d2 = U0U0U1U1 V0V0V1V1 */
+
+// Read 8 Y, 8 U and 8 V from 444, then average U/V pairs to 4 U and 4 V in d2.
+#define READYUV444                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n" /* d0 = 8 Y               */\
+    MEMACCESS(1)                                                               \
+    "vld1.8     {d2}, [%1]!                    \n" /* d2 = 8 U               */\
+    MEMACCESS(2)                                                               \
+    "vld1.8     {d3}, [%2]!                    \n" /* d3 = 8 V               */\
+    "vpaddl.u8  q1, q1                         \n" /* sum adjacent byte pairs*/\
+    "vrshrn.u16 d2, q1, #1                     \n" /* rounded /2: 4 U, 4 V   */
+
+// Read 8 Y, and set 4 U and 4 V to 128 (all eight bytes of d2 become 128)
+#define READYUV400                                                             \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n" /* d0 = 8 Y, %0 += 8      */\
+    "vmov.u8    d2, #128                       \n" /* chroma bytes = 128     */
+
+// Read 8 Y and 4 UV from NV12. Leaves Y in d0 and 4 U followed by 4 V in d2.
+#define READNV12                                                               \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n" /* d0 = 8 Y               */\
+    MEMACCESS(1)                                                               \
+    "vld1.8     {d2}, [%1]!                    \n" /* d2 = 4 interleaved UV  */\
+    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
+    "vuzp.u8    d2, d3                         \n" /* d2 = even (U), d3 = odd (V) */\
+    "vtrn.u32   d2, d3                         \n" /* d2 = 4 U then 4 V      */
+
+// Read 8 Y and 4 VU from NV21. Leaves Y in d0 and 4 U followed by 4 V in d2.
+#define READNV21                                                               \
+    MEMACCESS(0)                                                               \
+    "vld1.8     {d0}, [%0]!                    \n" /* d0 = 8 Y               */\
+    MEMACCESS(1)                                                               \
+    "vld1.8     {d2}, [%1]!                    \n" /* d2 = 4 interleaved VU  */\
+    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
+    "vuzp.u8    d3, d2                         \n" /* d3 = even (V), d2 = odd (U) */\
+    "vtrn.u32   d2, d3                         \n" /* d2 = 4 U then 4 V      */
+
+// Read 8 YUY2 pixels (16 bytes). Leaves Y in d0 and 4 U followed by 4 V in d2.
+#define READYUY2                                                               \
+    MEMACCESS(0)                                                               \
+    "vld2.8     {d0, d2}, [%0]!                \n" /* d0 = even bytes (Y), d2 = odd bytes (UV) */\
+    "vmov.u8    d3, d2                         \n" /* copy before unzip      */\
+    "vuzp.u8    d2, d3                         \n" /* d2 = even (U), d3 = odd (V) */\
+    "vtrn.u32   d2, d3                         \n" /* d2 = 4 U then 4 V      */
+
+// Read 8 UYVY pixels (16 bytes). Leaves Y in d0 and 4 U followed by 4 V in d2.
+#define READUYVY                                                               \
+    MEMACCESS(0)                                                               \
+    "vld2.8     {d2, d3}, [%0]!                \n" /* d2 = even bytes (UV), d3 = odd bytes (Y) */\
+    "vmov.u8    d0, d3                         \n" /* d0 = 8 Y               */\
+    "vmov.u8    d3, d2                         \n" /* copy before unzip      */\
+    "vuzp.u8    d2, d3                         \n" /* d2 = even (U), d3 = odd (V) */\
+    "vtrn.u32   d2, d3                         \n" /* d2 = 4 U then 4 V      */
+
+#define YUVTORGB_SETUP /* Load the constants YUVTORGB expects in d24, d25, q13, q4, q14 and q15. */ \
+    MEMACCESS([kUVToRB])                                                       \
+    "vld1.8     {d24}, [%[kUVToRB]]            \n" /* d24 = 8 bytes of kUVToRB */\
+    MEMACCESS([kUVToG])                                                        \
+    "vld1.8     {d25}, [%[kUVToG]]             \n" /* d25 = 8 bytes of kUVToG  */\
+    MEMACCESS([kUVBiasBGR]) /* NOTE(review): the '!' loads below write back to %[kUVBiasBGR]; confirm callers may bind it as an input-only operand */ \
+    "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n" /* q13 = 1st 16-bit bias in all lanes */\
+    MEMACCESS([kUVBiasBGR])                                                    \
+    "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n" /* q4 = 2nd 16-bit bias in all lanes */\
+    MEMACCESS([kUVBiasBGR])                                                    \
+    "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n" /* q14 = 3rd 16-bit bias in all lanes */\
+    MEMACCESS([kYToRgb])                                                       \
+    "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n" /* q15 = 32-bit kYToRgb in all lanes */
+
+#define YUVTORGB /* In: d0 = 8 Y, d2 = 4 U + 4 V. Out: d20 = B, d21 = G, d22 = R. */ \
+    "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */\
+    "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */\
+    "vmovl.u8   q0, d0                         \n" /* Y                      */\
+    "vmovl.s16  q10, d1                        \n" /* Y 4..7 widened to 32b  */\
+    "vmovl.s16  q0, d0                         \n" /* Y 0..3 widened to 32b  */\
+    "vmul.s32   q10, q10, q15                  \n" /* Y 4..7 * q15           */\
+    "vmul.s32   q0, q0, q15                    \n" /* Y 0..3 * q15           */\
+    "vqshrun.s32 d0, q0, #16                   \n" /* >> 16, saturate to u16 */\
+    "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */\
+    "vadd.s16   d18, d19                       \n" /* d18 = u*UG + v*VG      */\
+    "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */\
+    "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */\
+    "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/\
+    "vaddw.u16  q1, q1, d16                    \n" /* value now in both 16b halves */\
+    "vaddw.u16  q10, q10, d17                  \n" /* value now in both 16b halves */\
+    "vaddw.u16  q3, q3, d18                    \n" /* value now in both 16b halves */\
+    "vqadd.s16  q8, q0, q13                    \n" /* B */                     \
+    "vqadd.s16  q9, q0, q14                    \n" /* R */                     \
+    "vqadd.s16  q0, q0, q4                     \n" /* G */                     \
+    "vqadd.s16  q8, q8, q1                     \n" /* B */                     \
+    "vqadd.s16  q9, q9, q10                    \n" /* R */                     \
+    "vqsub.s16  q0, q0, q3                     \n" /* G */                     \
+    "vqshrun.s16 d20, q8, #6                   \n" /* B */                     \
+    "vqshrun.s16 d22, q9, #6                   \n" /* R */                     \
+    "vqshrun.s16 d21, q0, #6                   \n" /* G */
+
+void I444ToARGBRow_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                        const uint8* src_u,  // U row, 8 bytes read per loop
+                        const uint8* src_v,  // V row, 8 bytes read per loop
+                        uint8* dst_argb,  // output row, 32 bytes written per loop
+                        const struct YuvConstants* yuvconstants,  // conversion constants
+                        int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+    "vmov.u8    d23, #255                      \n"  // 4th stored channel (alpha) = 255
+  "1:                                          \n"
+    READYUV444  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %4, %4, #8                     \n"  // width -= 8; sets flags for bgt
+    MEMACCESS(3)
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"  // store 8 interleaved pixels
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToARGBRow_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                        const uint8* src_u,  // U row, 4 bytes read per loop
+                        const uint8* src_v,  // V row, 4 bytes read per loop
+                        uint8* dst_argb,  // output row, 32 bytes written per loop
+                        const struct YuvConstants* yuvconstants,  // conversion constants
+                        int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+    "vmov.u8    d23, #255                      \n"  // 4th stored channel (alpha) = 255
+  "1:                                          \n"
+    READYUV422  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %4, %4, #8                     \n"  // width -= 8; sets flags for bgt
+    MEMACCESS(3)
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"  // store 8 interleaved pixels
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422AlphaToARGBRow_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                             const uint8* src_u,  // U row, 4 bytes read per loop
+                             const uint8* src_v,  // V row, 4 bytes read per loop
+                             const uint8* src_a,  // alpha row, 8 bytes read per loop
+                             uint8* dst_argb,  // output row, 32 bytes written per loop
+                             const struct YuvConstants* yuvconstants,  // conversion constants
+                             int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+  "1:                                          \n"
+    READYUV422  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %5, %5, #8                     \n"  // width -= 8; sets flags for bgt
+    MEMACCESS(3)
+    "vld1.8     {d23}, [%3]!                   \n"  // d23 = 8 alpha bytes from src_a
+    MEMACCESS(4)
+    "vst4.8     {d20, d21, d22, d23}, [%4]!    \n"  // store 8 interleaved pixels
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(src_a),     // %3
+      "+r"(dst_argb),  // %4
+      "+r"(width)      // %5
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I411ToARGBRow_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                        const uint8* src_u,  // U row, 2 bytes read per loop
+                        const uint8* src_v,  // V row, 2 bytes read per loop
+                        uint8* dst_argb,  // output row, 32 bytes written per loop
+                        const struct YuvConstants* yuvconstants,  // conversion constants
+                        int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+    "vmov.u8    d23, #255                      \n"  // 4th stored channel (alpha) = 255
+  "1:                                          \n"
+    READYUV411  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %4, %4, #8                     \n"  // width -= 8; sets flags for bgt
+    MEMACCESS(3)
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"  // store 8 interleaved pixels
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToRGBARow_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                        const uint8* src_u,  // U row, 4 bytes read per loop
+                        const uint8* src_v,  // V row, 4 bytes read per loop
+                        uint8* dst_rgba,  // output row, 32 bytes written per loop
+                        const struct YuvConstants* yuvconstants,  // conversion constants
+                        int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+  "1:                                          \n"
+    READYUV422  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %4, %4, #8                     \n"  // width -= 8; sets flags for bgt
+    "vmov.u8    d19, #255                      \n"  // d19 modified by YUVTORGB
+    MEMACCESS(3)
+    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"  // bytes per pixel: 255, B, G, R
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_rgba),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToRGB24Row_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                         const uint8* src_u,  // U row, 4 bytes read per loop
+                         const uint8* src_v,  // V row, 4 bytes read per loop
+                         uint8* dst_rgb24,  // output row, 24 bytes written per loop
+                         const struct YuvConstants* yuvconstants,  // conversion constants
+                         int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+  "1:                                          \n"
+    READYUV422  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %4, %4, #8                     \n"  // width -= 8; sets flags for bgt
+    MEMACCESS(3)
+    "vst3.8     {d20, d21, d22}, [%3]!         \n"  // store 8 pixels as B, G, R bytes
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),      // %0
+      "+r"(src_u),      // %1
+      "+r"(src_v),      // %2
+      "+r"(dst_rgb24),  // %3
+      "+r"(width)       // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+#define ARGBTORGB565 /* In: d20 = B, d21 = G, d22 = R. Out: q0 = 8 16-bit pixels. */ \
+    "vshr.u8    d20, d20, #3                   \n"  /* B: keep top 5 bits   */ \
+    "vshr.u8    d21, d21, #2                   \n"  /* G: keep top 6 bits   */ \
+    "vshr.u8    d22, d22, #3                   \n"  /* R: keep top 5 bits   */ \
+    "vmovl.u8   q8, d20                        \n"  /* B widened to 16 bits */ \
+    "vmovl.u8   q9, d21                        \n"  /* G widened to 16 bits */ \
+    "vmovl.u8   q10, d22                       \n"  /* R widened to 16 bits */ \
+    "vshl.u16   q9, q9, #5                     \n"  /* G to bits 5..10      */ \
+    "vshl.u16   q10, q10, #11                  \n"  /* R to bits 11..15     */ \
+    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
+    "vorr       q0, q0, q10                    \n"  /* BGR                  */
+
+void I422ToRGB565Row_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                          const uint8* src_u,  // U row, 4 bytes read per loop
+                          const uint8* src_v,  // V row, 4 bytes read per loop
+                          uint8* dst_rgb565,  // output row, 16 bytes written per loop
+                          const struct YuvConstants* yuvconstants,  // conversion constants
+                          int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+  "1:                                          \n"
+    READYUV422  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %4, %4, #8                     \n"  // width -= 8; sets flags for bgt
+    ARGBTORGB565  // pack B, G, R into 16-bit pixels in q0
+    MEMACCESS(3)
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_rgb565),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+#define ARGBTOARGB1555 /* In: d20 = B, d21 = G, d22 = R, d23 = A. Out: q0 = 8 16-bit pixels. */ \
+    "vshr.u8    q10, q10, #3                   \n"  /* B and G (d20, d21)   */ \
+    "vshr.u8    d22, d22, #3                   \n"  /* R: keep top 5 bits   */ \
+    "vshr.u8    d23, d23, #7                   \n"  /* A: keep top bit      */ \
+    "vmovl.u8   q8, d20                        \n"  /* B widened to 16 bits */ \
+    "vmovl.u8   q9, d21                        \n"  /* G widened to 16 bits */ \
+    "vmovl.u8   q10, d22                       \n"  /* R widened to 16 bits */ \
+    "vmovl.u8   q11, d23                       \n"  /* A widened to 16 bits */ \
+    "vshl.u16   q9, q9, #5                     \n"  /* G to bits 5..9       */ \
+    "vshl.u16   q10, q10, #10                  \n"  /* R to bits 10..14     */ \
+    "vshl.u16   q11, q11, #15                  \n"  /* A to bit 15          */ \
+    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
+    "vorr       q1, q10, q11                   \n"  /* RA                   */ \
+    "vorr       q0, q0, q1                     \n"  /* BGRA                 */
+
+void I422ToARGB1555Row_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                            const uint8* src_u,  // U row, 4 bytes read per loop
+                            const uint8* src_v,  // V row, 4 bytes read per loop
+                            uint8* dst_argb1555,  // output row, 16 bytes written per loop
+                            const struct YuvConstants* yuvconstants,  // conversion constants
+                            int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+  "1:                                          \n"
+    READYUV422  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %4, %4, #8                     \n"  // width -= 8; sets flags for bgt
+    "vmov.u8    d23, #255                      \n"  // alpha = 255; reset each loop since the macro below overwrites q11
+    ARGBTOARGB1555  // pack B, G, R, A into 16-bit pixels in q0
+    MEMACCESS(3)
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb1555),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+#define ARGBTOARGB4444 /* In: d20..d23 = B, G, R, A; d4 = bit mask to clear. Out: q0. */ \
+    "vshr.u8    d20, d20, #4                   \n"  /* B: high nibble to low*/ \
+    "vbic.32    d21, d21, d4                   \n"  /* G: clear bits in d4  */ \
+    "vshr.u8    d22, d22, #4                   \n"  /* R: high nibble to low*/ \
+    "vbic.32    d23, d23, d4                   \n"  /* A: clear bits in d4  */ \
+    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
+    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
+    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
+
+void I422ToARGB4444Row_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                            const uint8* src_u,  // U row, 4 bytes read per loop
+                            const uint8* src_v,  // V row, 4 bytes read per loop
+                            uint8* dst_argb4444,  // output row, 16 bytes written per loop
+                            const struct YuvConstants* yuvconstants,  // conversion constants
+                            int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
+  "1:                                          \n"
+    READYUV422  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %4, %4, #8                     \n"  // width -= 8; sets flags for bgt
+    "vmov.u8    d23, #255                      \n"  // alpha = 255; reset each loop since the macro below modifies d23
+    ARGBTOARGB4444  // pack B, G, R, A nibbles into q0
+    MEMACCESS(3)
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb4444),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I400ToARGBRow_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                        uint8* dst_argb,  // output row, 32 bytes written per loop
+                        int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // constants come from kYuvI601Constants (see inputs below)
+    "vmov.u8    d23, #255                      \n"  // 4th stored channel (alpha) = 255
+  "1:                                          \n"
+    READYUV400  // d0 = Y, chroma bytes in d2 fixed at 128
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %2, %2, #8                     \n"  // width -= 8; sets flags for bgt
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 interleaved pixels
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
+      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
+      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
+      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void J400ToARGBRow_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                        uint8* dst_argb,  // output row, 32 bytes written per loop
+                        int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    "vmov.u8    d23, #255                      \n"  // 4th stored channel (alpha) = 255
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d20}, [%0]!                   \n"  // d20 = 8 Y bytes
+    "vmov       d21, d20                       \n"  // copy Y unchanged to 2nd channel
+    "vmov       d22, d20                       \n"  // copy Y unchanged to 3rd channel
+    "subs       %2, %2, #8                     \n"  // width -= 8; sets flags for bgt
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 pixels as Y, Y, Y, 255
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    :
+    : "cc", "memory", "d20", "d21", "d22", "d23"
+  );
+}
+
+void NV12ToARGBRow_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                        const uint8* src_uv,  // interleaved UV row, 8 bytes read per loop
+                        uint8* dst_argb,  // output row, 32 bytes written per loop
+                        const struct YuvConstants* yuvconstants,  // conversion constants
+                        int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+    "vmov.u8    d23, #255                      \n"  // 4th stored channel (alpha) = 255
+  "1:                                          \n"
+    READNV12  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %3, %3, #8                     \n"  // width -= 8; sets flags for bgt
+    MEMACCESS(2)
+    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"  // store 8 interleaved pixels
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void NV21ToARGBRow_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                        const uint8* src_vu,  // interleaved VU row, 8 bytes read per loop
+                        uint8* dst_argb,  // output row, 32 bytes written per loop
+                        const struct YuvConstants* yuvconstants,  // conversion constants
+                        int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+    "vmov.u8    d23, #255                      \n"  // 4th stored channel (alpha) = 255
+  "1:                                          \n"
+    READNV21  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %3, %3, #8                     \n"  // width -= 8; sets flags for bgt
+    MEMACCESS(2)
+    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"  // store 8 interleaved pixels
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),     // %0
+      "+r"(src_vu),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void NV12ToRGB565Row_NEON(const uint8* src_y,  // Y row, 8 bytes read per loop
+                          const uint8* src_uv,  // interleaved UV row, 8 bytes read per loop
+                          uint8* dst_rgb565,  // output row, 16 bytes written per loop
+                          const struct YuvConstants* yuvconstants,  // conversion constants
+                          int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+  "1:                                          \n"
+    READNV12  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %3, %3, #8                     \n"  // width -= 8; sets flags for bgt
+    ARGBTORGB565  // pack B, G, R into 16-bit pixels in q0
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,  // packed YUY2 row, 16 bytes read per loop
+                        uint8* dst_argb,  // output row, 32 bytes written per loop
+                        const struct YuvConstants* yuvconstants,  // conversion constants
+                        int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+    "vmov.u8    d23, #255                      \n"  // 4th stored channel (alpha) = 255
+  "1:                                          \n"
+    READYUY2  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %2, %2, #8                     \n"  // width -= 8; sets flags for bgt
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 interleaved pixels
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_yuy2),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,  // packed UYVY row, 16 bytes read per loop
+                        uint8* dst_argb,  // output row, 32 bytes written per loop
+                        const struct YuvConstants* yuvconstants,  // conversion constants
+                        int width) {  // pixel count; the loop consumes 8 at a time
+  asm volatile (
+    YUVTORGB_SETUP  // load the conversion constants once, outside the loop
+    "vmov.u8    d23, #255                      \n"  // 4th stored channel (alpha) = 255
+  "1:                                          \n"
+    READUYVY  // d0 = Y, d2 = chroma
+    YUVTORGB  // d20 = B, d21 = G, d22 = R
+    "subs       %2, %2, #8                     \n"  // width -= 8; sets flags for bgt
+    MEMACCESS(1)
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 interleaved pixels
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_uyvy),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// Reads 16 pairs of UV per loop and writes even bytes to dst_u, odd to dst_v.
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {  // width counts UV pairs; 16 consumed per loop
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store U
+    MEMACCESS(2)
+    "vst1.8     {q1}, [%2]!                    \n"  // store V
+    "bgt        1b                             \n"  // repeat while width > 0
+    : "+r"(src_uv),  // %0
+      "+r"(dst_u),   // %1
+      "+r"(dst_v),   // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// Reads 16 U's and 16 V's per loop and writes out 16 interleaved pairs of UV.
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {  // width counts UV pairs; 16 consumed per loop
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load U
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"  // load V
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    MEMACCESS(2)
+    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
+    "bgt        1b                             \n"  // repeat while width > 0
+    :
+      "+r"(src_u),   // %0
+      "+r"(src_v),   // %1
+      "+r"(dst_uv),  // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// Copy multiple of 32 bytes, 32 per loop, using vld1.8/vst1.8 (this code does not use vld4.8).
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
+    "subs       %2, %2, #32                    \n"  // 32 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
+    "bgt        1b                             \n"  // repeat while count > 0
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2  // Output registers
+  :                     // Input registers
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// SetRow writes 'count' bytes using an 8 bit value repeated, 16 bytes per loop.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+  asm volatile (
+    "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
+  "1:                                          \n"
+    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
+    MEMACCESS(0)
+    "vst1.8    {q0}, [%0]!                     \n"  // store
+    "bgt       1b                              \n"  // repeat while count > 0
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v8)      // %2
+  : "cc", "memory", "q0"
+  );
+}
+
+// ARGBSetRow writes 'count' pixels using a 32 bit value repeated, 4 per loop.
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+  asm volatile (
+    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
+  "1:                                          \n"
+    "subs      %1, %1, #4                      \n"  // 4 pixels per loop
+    MEMACCESS(0)
+    "vst1.8    {q0}, [%0]!                     \n"  // store
+    "bgt       1b                              \n"  // repeat while count > 0
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v32)     // %2
+  : "cc", "memory", "q0"
+  );
+}
+
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {  // byte-reverse a row, 16 bytes per loop
+  asm volatile (
+    // Start at end of source row.
+    "mov        r3, #-16                       \n"  // r3 = post-index step of -16
+    "add        %0, %0, %2                     \n"  // src += width
+    "sub        %0, #16                        \n"  // src -= 16: the last 16 bytes
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+    "subs       %2, #16                        \n"  // 16 pixels per loop.
+    "vrev64.8   q0, q0                         \n"  // reverse bytes within each d half
+    MEMACCESS(1)
+    "vst1.8     {d1}, [%1]!                    \n"  // high half first; dst += 8
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // then low half; dst += 8
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "cc", "memory", "r3", "q0"
+  );
+}
+
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width) {
+  asm volatile (  // Reverses 'width' interleaved byte pairs, splitting them into dst_u and dst_v.
+    // Start at end of source row.
+    "mov        r12, #-16                      \n"  // post-index step: walk src backwards
+    "add        %0, %0, %3, lsl #1             \n"  // src_uv += width * 2
+    "sub        %0, #16                        \n"  // back up to the last 8 pairs
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {d0, d1}, [%0], r12            \n"  // de-interleave 8 pairs; src -= 16
+    "subs       %3, #8                         \n"  // 8 pixels per loop.
+    "vrev64.8   q0, q0                         \n"  // reverse bytes within d0 and within d1
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // even bytes to dst_u; dst += 8
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // odd bytes to dst_v; dst += 8
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_uv),  // %0
+    "+r"(dst_u),   // %1
+    "+r"(dst_v),   // %2
+    "+r"(width)    // %3
+  :
+  : "cc", "memory", "r12", "q0"
+  );
+}
+
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  asm volatile (  // Reverses the order of 'width' 4-byte pixels from src into dst, 4 per pass.
+    // Start at end of source row.
+    "mov        r3, #-16                       \n"  // post-index step: walk src backwards
+    "add        %0, %0, %2, lsl #2             \n"  // src += width * 4
+    "sub        %0, #16                        \n"  // back up to the last 4 pixels
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+    "subs       %2, #4                         \n"  // 4 pixels per loop.
+    "vrev64.32  q0, q0                         \n"  // swap the two 32 bit pixels in d0 and in d1
+    MEMACCESS(1)
+    "vst1.8     {d1}, [%1]!                    \n"  // high half first; both stores: dst += 16
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // then the low half
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "cc", "memory", "r3", "q0"
+  );
+}
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  asm volatile (  // Expands 3-byte pixels to 4 bytes by appending a 255 alpha byte.
+    "vmov.u8    d4, #255                       \n"  // Alpha: 255 in all 8 lanes, set once
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(1)
+    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)         // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
+  asm volatile (  // Swaps bytes 0 and 2 of each 3-byte pixel and appends a 255 alpha byte.
+    "vmov.u8    d4, #255                       \n"  // Alpha: 255 in all 8 lanes, set once
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
+    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_raw),   // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
+  asm volatile (  // Swaps bytes 0 and 2 of each 3-byte pixel; 8 pixels per pass.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
+    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_raw),    // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3"  // Clobber List
+  );
+}
+
+#define RGB565TOARGB  /* in: q0 = 8 RGB565 pixels; out: d0 = B, d1 = G, d2 = R; scratch: q2, d6 */ \
+    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
+    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB d1 RRRRRxxx */ \
+    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
+    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
+    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
+    "vorr.u8    d0, d0, d4                     \n"  /* B = upper 5 | lower 3 */ \
+    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
+    "vorr.u8    d2, d1, d5                     \n"  /* R = upper 5 | lower 3 */ \
+    "vorr.u8    d1, d4, d6                     \n"  /* G = upper 6 | lower 2 */
+
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
+  asm volatile (  // Expands 16 bit RGB565 pixels to 4 bytes each with alpha 255.
+    "vmov.u8    d3, #255                       \n"  // Alpha; RGB565TOARGB leaves d3 untouched
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    RGB565TOARGB
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+#define ARGB1555TOARGB  /* in: q0 = 8 ARGB1555 pixels; out: d0 = B, d1 = G, d2 = R, d3 = A; scratch: q2, q3 */ \
+    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
+    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
+    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
+    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
+    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
+    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
+    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
+    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
+    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
+    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
+    "vorr.u8    q0, q0, q2                     \n"  /* B,G; last line, no continuation */
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha: q0 in, d0 = B, d1 = G, d2 = R out, d3 untouched.
+#define RGB555TOARGB                                                           \
+    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
+    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB d1 xRRRRRxx */ \
+    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
+    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
+    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
+    "vorr.u8    d0, d0, d4                     \n"  /* B = upper 5 | lower 3 */ \
+    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
+    "vorr.u8    d2, d1, d5                     \n"  /* R = upper 5 | lower 3 */ \
+    "vorr.u8    d1, d4, d6                     \n"  /* G = upper 5 | lower 3 */
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                            int width) {
+  asm volatile (  // Expands 16 bit ARGB1555 pixels to 4 bytes each; 8 pixels per pass.
+    "vmov.u8    d3, #255                       \n"  // Alpha; overwritten by ARGB1555TOARGB (q1 = R,A)
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB1555TOARGB
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+#define ARGB4444TOARGB  /* in: q0 = 8 ARGB4444 pixels; out: d0 = B, d1 = G, d2 = R, d3 = A; scratch: q2 */ \
+    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
+    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
+    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
+    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
+    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
+    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
+    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
+    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
+
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                            int width) {
+  asm volatile (  // Expands 16 bit ARGB4444 pixels to 4 bytes each; 8 pixels per pass.
+    "vmov.u8    d3, #255                       \n"  // Alpha; overwritten by ARGB4444TOARGB (q1 = G,A)
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB4444TOARGB
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  );
+}
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
+  asm volatile (  // Drops the 4th byte of each 4-byte pixel; 8 pixels per pass.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    MEMACCESS(1)
+    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),   // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(width)         // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
+  asm volatile (  // Drops the 4th byte of each pixel and swaps bytes 0 and 2; 8 pixels per pass.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d1, d3                         \n"  // swap R, B
+    MEMACCESS(1)
+    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_raw),   // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
+  asm volatile (  // Copies the even-indexed bytes (Y) of the source to dst_y; 16 per pass.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2: q0 = even bytes, q1 = odd bytes.
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
+  asm volatile (  // Copies the odd-indexed bytes (Y) of the source to dst_y; 16 per pass.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY: q0 = even bytes, q1 = odd bytes.
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (  // Extracts bytes 1 (U) and 3 (V) of every 4-byte group into dst_u / dst_v.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
+    MEMACCESS(2)
+    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (  // Extracts bytes 0 (U) and 2 (V) of every 4-byte group into dst_u / dst_v.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
+    MEMACCESS(2)
+    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Averages U and V of two rows (src and src + stride) into dst_u / dst_v.
+    "add        %1, %0, %1                     \n"  // stride + src_yuy2: %1 now addresses the next row
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
+    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U (rounding halving add)
+    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V (rounding halving add)
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
+    MEMACCESS(3)
+    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_yuy2),     // %0
+    "+r"(stride_yuy2),  // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(width)           // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  );
+}
+
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Averages U and V of two rows (src and src + stride) into dst_u / dst_v.
+    "add        %1, %0, %1                     \n"  // stride + src_uyvy: %1 now addresses the next row
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
+    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U (rounding halving add)
+    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V (rounding halving add)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
+    MEMACCESS(3)
+    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_uyvy),     // %0
+    "+r"(stride_uyvy),  // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(width)           // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA: permutes each 16 byte group via a 16 byte index table.
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {q2}, [%3]                     \n"  // shuffler: 16 table indices, loaded once
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
+    "subs       %2, %2, #4                     \n"  // 4 processed per loop
+    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
+    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  : "r"(shuffler)    // %3
+  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  );
+}
+
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width) {
+  asm volatile (  // Interleaves planar Y, U, V into Y0 U Y1 V byte groups; 16 pixels per pass.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys: even Ys in d0, odd Ys in d2
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
+    MEMACCESS(2)
+    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
+    "subs       %4, %4, #16                    \n"  // 16 pixels
+    MEMACCESS(3)
+    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_yuy2),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"
+  );
+}
+
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width) {
+  asm volatile (  // Interleaves planar Y, U, V into U Y0 V Y1 byte groups; 16 pixels per pass.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys: even Ys in d1, odd Ys in d3
+    MEMACCESS(1)
+    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
+    MEMACCESS(2)
+    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
+    "subs       %4, %4, #16                    \n"  // 16 pixels
+    MEMACCESS(3)
+    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_uyvy),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"
+  );
+}
+
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
+  asm volatile (  // Packs 4-byte pixels into 16 bit pixels, 8 per pass.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTORGB565  // defined elsewhere; presumably packs d20-d22 into q0 - TODO confirm
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_rgb565),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  );
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width) {
+  asm volatile (  // Adds the 4 dither bytes (repeated) to B, G, R with saturation, then packs to 16 bit.
+    "vdup.32    d2, %3                         \n"  // dither4 repeated twice: one byte per pixel lane
+  "1:                                          \n"
+    MEMACCESS(1)
+    "vld4.8     {d20, d21, d22, d23}, [%1]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d20, d20, d2                   \n"  // saturating add of dither to first channel
+    "vqadd.u8   d21, d21, d2                   \n"  // ... second channel
+    "vqadd.u8   d22, d22, d2                   \n"  // ... third channel (d23 is left as loaded)
+    ARGBTORGB565  // defined elsewhere; presumably packs d20-d22 into q0 - TODO confirm
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(dst_rgb),   // %0
+    "+r"(src_argb),  // %1  read-write: post-incremented by vld4
+    "+r"(width)      // %2  read-write: decremented by subs
+  : "r"(dither4)     // %3  the only operand the asm never modifies
+  : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
+  );
+}
+
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+                            int width) {
+  asm volatile (  // Packs 4-byte pixels into 16 bit pixels, 8 per pass.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTOARGB1555  // defined elsewhere; presumably packs d20-d23 into q0 - TODO confirm
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb1555),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  );
+}
+
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+                            int width) {
+  asm volatile (  // Packs 4-byte pixels into 16 bit pixels, 8 per pass.
+    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTOARGB4444  // defined elsewhere; presumably packs d20-d23 into q0 using the d4 mask - TODO confirm
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),      // %0
+    "+r"(dst_argb4444),  // %1
+    "+r"(width)            // %2
+  :
+  : "cc", "memory", "q0", "q2", "q8", "q9", "q10", "q11"  // q2 covers d4, written by the vmov above
+  );
+}
+
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (  // Y = ((13 * B + 65 * G + 33 * R) >> 7, rounded) + 16, saturated; 8 pixels per pass.
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"  // saturating + 16
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (  // Y = (15 * B + 75 * G + 38 * R) >> 7, rounded, no offset; 8 pixels per pass.
+    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+
+// 8x1 pixels: one U byte and one V byte are produced for every input pixel.
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (
+    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
+    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
+    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
+    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
+    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlsl.u8   q2, d1, d25                    \n"  // G
+    "vmlsl.u8   q2, d2, d26                    \n"  // R
+    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
+
+    "vmull.u8   q3, d2, d24                    \n"  // R
+    "vmlsl.u8   q3, d1, d28                    \n"  // G
+    "vmlsl.u8   q3, d0, d27                    \n"  // B
+    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
+
+    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
+
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.  Each U/V byte comes from 4 adjacent pixels.
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(0)
+    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
+    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
+    "vpadd.u16  d1, d8, d9                     \n"  // B
+    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
+    "vpadd.u16  d3, d10, d11                   \n"  // G
+    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
+    "vpadd.u16  d5, d12, d13                   \n"  // R
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average (sum of 4 pixels, halved with rounding)
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
+    "vmul.s16   q8, q0, q10                    \n"  // B
+    "vmls.s16   q8, q1, q11                    \n"  // G
+    "vmls.s16   q8, q2, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q2, q10                    \n"  // R
+    "vmls.s16   q9, q1, q14                    \n"  // G
+    "vmls.s16   q9, q0, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR)  /* in: 8 shorts each of B, G, R; coefficients in q10-q14, bias in q15; out: d0 = U, d1 = V; scratch: q8, q9 */ \
+    "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \
+    "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \
+    "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \
+    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
+    "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \
+    "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \
+    "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \
+    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
+    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
+    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Each U/V byte comes from a 2x2 pixel block spanning this row and the next.
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb: %1 now addresses the next row
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 ARGB pixels of the next row.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load next 8 ARGB pixels of the next row.
+    "vpadal.u8  q0, q4                         \n"  // B add next row pairs into the 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G add next row pairs into the 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // R add next row pairs into the 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average (sum of 4 pixels, halved with rounding)
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 pixels of width (32 pixels over 2 rows) per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(src_stride_argb),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Per loop: 16x2 ARGB pixels -> 8 U bytes + 8 V bytes.
+    "add        %1, %0, %1                     \n"  // %1 = src_argb + stride (row 2)
+    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
+    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
+    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
+    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
+    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 bias, 8.8 fixed point
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B pair sums: 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G pair sums: 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R pair sums: 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // row 2: load 8 more ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // row 2: load last 8 ARGB pixels.
+    "vpadal.u8  q0, q4                         \n"  // B += row 2 pair sums.
+    "vpadal.u8  q1, q5                         \n"  // G += row 2 pair sums.
+    "vpadal.u8  q2, q6                         \n"  // R += row 2 pair sums.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average; hence coeffs are / 2
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 columns (32 pixels) per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_argb),  // %0
+    "+r"(src_stride_argb),  // %1: stride on entry, row 2 pointer after "add"
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Per loop: 16x2 BGRA pixels -> 8 U bytes + 8 V bytes.
+    "add        %1, %0, %1                     \n"  // %1 = src_bgra + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR: 112 / 256 = 0.4375
+    "vmov.s16   q11, #74 / 2                   \n"  // UG: -74 / 256 = -0.2891
+    "vmov.s16   q12, #38 / 2                   \n"  // UR: -38 / 256 = -0.1484
+    "vmov.s16   q13, #18 / 2                   \n"  // VB: -18 / 256 = -0.0703
+    "vmov.s16   q14, #94 / 2                   \n"  // VG: -94 / 256 = -0.3672
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 bias, 8.8 fixed point
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
+    "vpaddl.u8  q3, q3                         \n"  // B pair sums: 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // G pair sums: 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // R pair sums: 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // row 2: load 8 more BGRA pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // row 2: load last 8 BGRA pixels.
+    "vpadal.u8  q3, q7                         \n"  // B += row 2 pair sums.
+    "vpadal.u8  q2, q6                         \n"  // G += row 2 pair sums.
+    "vpadal.u8  q1, q5                         \n"  // R += row 2 pair sums.
+
+    "vrshr.u16  q1, q1, #1                     \n"  // 2x average; hence coeffs are / 2
+    "vrshr.u16  q2, q2, #1                     \n"
+    "vrshr.u16  q3, q3, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 columns (32 pixels) per loop.
+    RGBTOUV(q3, q2, q1)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_bgra),  // %0
+    "+r"(src_stride_bgra),  // %1: stride on entry, row 2 pointer after "add"
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Per loop: 16x2 ABGR pixels -> 8 U bytes + 8 V bytes.
+    "add        %1, %0, %1                     \n"  // %1 = src_abgr + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR: 112 / 256 = 0.4375
+    "vmov.s16   q11, #74 / 2                   \n"  // UG: -74 / 256 = -0.2891
+    "vmov.s16   q12, #38 / 2                   \n"  // UR: -38 / 256 = -0.1484
+    "vmov.s16   q13, #18 / 2                   \n"  // VB: -18 / 256 = -0.0703
+    "vmov.s16   q14, #94 / 2                   \n"  // VG: -94 / 256 = -0.3672
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 bias, 8.8 fixed point
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
+    "vpaddl.u8  q2, q2                         \n"  // B pair sums: 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G pair sums: 16 bytes -> 8 shorts.
+    "vpaddl.u8  q0, q0                         \n"  // R pair sums: 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // row 2: load 8 more ABGR pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // row 2: load last 8 ABGR pixels.
+    "vpadal.u8  q2, q6                         \n"  // B += row 2 pair sums.
+    "vpadal.u8  q1, q5                         \n"  // G += row 2 pair sums.
+    "vpadal.u8  q0, q4                         \n"  // R += row 2 pair sums.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average; hence coeffs are / 2
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 columns (32 pixels) per loop.
+    RGBTOUV(q2, q1, q0)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_abgr),  // %0
+    "+r"(src_stride_abgr),  // %1: stride on entry, row 2 pointer after "add"
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Per loop: 16x2 RGBA pixels -> 8 U bytes + 8 V bytes.
+    "add        %1, %0, %1                     \n"  // %1 = src_rgba + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR: 112 / 256 = 0.4375
+    "vmov.s16   q11, #74 / 2                   \n"  // UG: -74 / 256 = -0.2891
+    "vmov.s16   q12, #38 / 2                   \n"  // UR: -38 / 256 = -0.1484
+    "vmov.s16   q13, #18 / 2                   \n"  // VB: -18 / 256 = -0.0703
+    "vmov.s16   q14, #94 / 2                   \n"  // VG: -94 / 256 = -0.3672
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 bias, 8.8 fixed point
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
+    "vpaddl.u8  q0, q1                         \n"  // q0 = B pair sums (from q1).
+    "vpaddl.u8  q1, q2                         \n"  // q1 = G pair sums (from q2).
+    "vpaddl.u8  q2, q3                         \n"  // q2 = R pair sums (from q3).
+    MEMACCESS(1)
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // row 2: load 8 more RGBA pixels.
+    MEMACCESS(1)
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // row 2: load last 8 RGBA pixels.
+    "vpadal.u8  q0, q5                         \n"  // B += row 2 pair sums.
+    "vpadal.u8  q1, q6                         \n"  // G += row 2 pair sums.
+    "vpadal.u8  q2, q7                         \n"  // R += row 2 pair sums.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average; hence coeffs are / 2
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 columns (32 pixels) per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_rgba),  // %0
+    "+r"(src_stride_rgba),  // %1: stride on entry, row 2 pointer after "add"
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Per loop: 16x2 RGB24 pixels -> 8 U bytes + 8 V bytes.
+    "add        %1, %0, %1                     \n"  // %1 = src_rgb24 + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR: 112 / 256 = 0.4375
+    "vmov.s16   q11, #74 / 2                   \n"  // UG: -74 / 256 = -0.2891
+    "vmov.s16   q12, #38 / 2                   \n"  // UR: -38 / 256 = -0.1484
+    "vmov.s16   q13, #18 / 2                   \n"  // VB: -18 / 256 = -0.0703
+    "vmov.s16   q14, #94 / 2                   \n"  // VG: -94 / 256 = -0.3672
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 bias, 8.8 fixed point
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
+    MEMACCESS(0)
+    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B pair sums: 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G pair sums: 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R pair sums: 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // row 2: load 8 more RGB24 pixels.
+    MEMACCESS(1)
+    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // row 2: load last 8 RGB24 pixels.
+    "vpadal.u8  q0, q4                         \n"  // B += row 2 pair sums.
+    "vpadal.u8  q1, q5                         \n"  // G += row 2 pair sums.
+    "vpadal.u8  q2, q6                         \n"  // R += row 2 pair sums.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average; hence coeffs are / 2
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 columns (32 pixels) per loop.
+    RGBTOUV(q0, q1, q2)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_rgb24),  // %0
+    "+r"(src_stride_rgb24),  // %1: stride on entry, row 2 pointer after "add"
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Per loop: 16x2 RAW pixels -> 8 U bytes + 8 V bytes.
+    "add        %1, %0, %1                     \n"  // %1 = src_raw + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR: 112 / 256 = 0.4375
+    "vmov.s16   q11, #74 / 2                   \n"  // UG: -74 / 256 = -0.2891
+    "vmov.s16   q12, #38 / 2                   \n"  // UR: -38 / 256 = -0.1484
+    "vmov.s16   q13, #18 / 2                   \n"  // VB: -18 / 256 = -0.0703
+    "vmov.s16   q14, #94 / 2                   \n"  // VG: -94 / 256 = -0.3672
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 bias, 8.8 fixed point
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
+    MEMACCESS(0)
+    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
+    "vpaddl.u8  q2, q2                         \n"  // B pair sums: 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G pair sums: 16 bytes -> 8 shorts.
+    "vpaddl.u8  q0, q0                         \n"  // R pair sums: 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // row 2: load 8 more RAW pixels.
+    MEMACCESS(1)
+    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // row 2: load last 8 RAW pixels.
+    "vpadal.u8  q2, q6                         \n"  // B += row 2 pair sums.
+    "vpadal.u8  q1, q5                         \n"  // G += row 2 pair sums.
+    "vpadal.u8  q0, q4                         \n"  // R += row 2 pair sums.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average; hence coeffs are / 2
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 columns (32 pixels) per loop.
+    RGBTOUV(q2, q1, q0)
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_raw),  // %0
+    "+r"(src_stride_raw),  // %1: stride on entry, row 2 pointer after "add"
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 pixels -> 8x1.  width is number of rgb565 pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // %1 = src_rgb565 + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR: 112 / 256 = 0.4375
+    "vmov.s16   q11, #74 / 2                   \n"  // UG: -74 / 256 = -0.2891
+    "vmov.s16   q12, #38 / 2                   \n"  // UR: -38 / 256 = -0.1484
+    "vmov.s16   q13, #18 / 2                   \n"  // VB: -18 / 256 = -0.0703
+    "vmov.s16   q14, #94 / 2                   \n"  // VG: -94 / 256 = -0.3672
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 bias, 8.8 fixed point
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB  // macro defined elsewhere; B, G, R expected in d0, d1, d2.
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // row 2: load 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B += row 2 pair sums.
+    "vpadal.u8  d10, d1                        \n"  // G += row 2 pair sums.
+    "vpadal.u8  d12, d2                        \n"  // R += row 2 pair sums.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // row 2: next 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B += row 2 pair sums.
+    "vpadal.u8  d11, d1                        \n"  // G += row 2 pair sums.
+    "vpadal.u8  d13, d2                        \n"  // R += row 2 pair sums.
+
+    "vrshr.u16  q4, q4, #1                     \n"  // 2x average; hence coeffs are / 2
+    "vrshr.u16  q5, q5, #1                     \n"
+    "vrshr.u16  q6, q6, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 columns (32 pixels) per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B (same math as RGBTOUV(q4, q5, q6))
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_rgb565),  // %0
+    "+r"(src_stride_rgb565),  // %1: stride on entry, row 2 pointer after "add"
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb1555 pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // %1 = src_argb1555 + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR: 112 / 256 = 0.4375
+    "vmov.s16   q11, #74 / 2                   \n"  // UG: -74 / 256 = -0.2891
+    "vmov.s16   q12, #38 / 2                   \n"  // UR: -38 / 256 = -0.1484
+    "vmov.s16   q13, #18 / 2                   \n"  // VB: -18 / 256 = -0.0703
+    "vmov.s16   q14, #94 / 2                   \n"  // VG: -94 / 256 = -0.3672
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 bias, 8.8 fixed point
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    RGB555TOARGB  // macro defined elsewhere; B, G, R expected in d0, d1, d2.
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // row 2: load 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B += row 2 pair sums.
+    "vpadal.u8  d10, d1                        \n"  // G += row 2 pair sums.
+    "vpadal.u8  d12, d2                        \n"  // R += row 2 pair sums.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // row 2: next 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B += row 2 pair sums.
+    "vpadal.u8  d11, d1                        \n"  // G += row 2 pair sums.
+    "vpadal.u8  d13, d2                        \n"  // R += row 2 pair sums.
+
+    "vrshr.u16  q4, q4, #1                     \n"  // 2x average; hence coeffs are / 2
+    "vrshr.u16  q5, q5, #1                     \n"
+    "vrshr.u16  q6, q6, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 columns (32 pixels) per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B (same math as RGBTOUV(q4, q5, q6))
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_argb1555),  // %0
+    "+r"(src_stride_argb1555),  // %1: stride on entry, row 2 pointer after "add"
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 pixels -> 8x1.  width is number of argb4444 pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // %1 = src_argb4444 + stride (row 2)
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR: 112 / 256 = 0.4375
+    "vmov.s16   q11, #74 / 2                   \n"  // UG: -74 / 256 = -0.2891
+    "vmov.s16   q12, #38 / 2                   \n"  // UR: -38 / 256 = -0.1484
+    "vmov.s16   q13, #18 / 2                   \n"  // VB: -18 / 256 = -0.0703
+    "vmov.s16   q14, #94 / 2                   \n"  // VG: -94 / 256 = -0.3672
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5 bias, 8.8 fixed point
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    ARGB4444TOARGB  // macro defined elsewhere; B, G, R expected in d0, d1, d2.
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // row 2: load 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B += row 2 pair sums.
+    "vpadal.u8  d10, d1                        \n"  // G += row 2 pair sums.
+    "vpadal.u8  d12, d2                        \n"  // R += row 2 pair sums.
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // row 2: next 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B += row 2 pair sums.
+    "vpadal.u8  d11, d1                        \n"  // G += row 2 pair sums.
+    "vpadal.u8  d13, d2                        \n"  // R += row 2 pair sums.
+
+    "vrshr.u16  q4, q4, #1                     \n"  // 2x average; hence coeffs are / 2
+    "vrshr.u16  q5, q5, #1                     \n"
+    "vrshr.u16  q6, q6, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 columns (32 pixels) per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B (same math as RGBTOUV(q4, q5, q6))
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_argb4444),  // %0
+    "+r"(src_stride_argb4444),  // %1: stride on entry, row 2 pointer after "add"
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
+  asm volatile (  // Per loop: 8 RGB565 pixels -> 8 Y bytes.
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    RGB565TOARGB  // macro defined elsewhere; B, G, R expected in d0, d1, d2.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // rounded >> 7, 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"  // + 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_y),       // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
+  asm volatile (  // Per loop: 8 ARGB1555 pixels -> 8 Y bytes.
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB1555TOARGB  // macro defined elsewhere; B, G, R expected in d0, d1, d2.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // rounded >> 7, 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"  // + 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(width)            // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
+  asm volatile (  // Per loop: 8 ARGB4444 pixels -> 8 Y bytes.
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB4444TOARGB  // macro defined elsewhere; B, G, R expected in d0, d1, d2.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // rounded >> 7, 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"  // + 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(width)            // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
+  asm volatile (  // Per loop: 8 BGRA pixels -> 8 Y bytes.
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 BGRA pixels: d1=R d2=G d3=B.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d1, d4                     \n"  // R
+    "vmlal.u8   q8, d2, d5                     \n"  // G
+    "vmlal.u8   q8, d3, d6                     \n"  // B
+    "vqrshrun.s16 d0, q8, #7                   \n"  // rounded >> 7, 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"  // + 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while width > 0.
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
+  asm volatile (  // Y = ((33 * R + 65 * G + 13 * B + 64) >> 7) + 16
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // R
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // B
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"  // Y += 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
+  asm volatile (  // Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16
+    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d1, d4                     \n"  // B
+    "vmlal.u8   q8, d2, d5                     \n"  // G
+    "vmlal.u8   q8, d3, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"  // Y += 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
+  asm volatile (  // Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16
+    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // B
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"  // Y += 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
+  asm volatile (  // Y = ((33 * R + 65 * G + 13 * B + 64) >> 7) + 16
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // R (d0 * R coefficient)
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // B (d2 * B coefficient)
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"  // Y += 16, saturating
+    MEMACCESS(1)
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_raw),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8* dst_ptr,
+                         const uint8* src_ptr, ptrdiff_t src_stride,
+                         int dst_width, int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  asm volatile (
+    "cmp        %4, #0                         \n"  // fraction == 0?
+    "beq        100f                           \n"  // yes: copy first row only
+    "add        %2, %1                         \n"  // %2 = src_ptr + src_stride
+    "cmp        %4, #128                       \n"  // fraction == 128?
+    "beq        50f                            \n"  // yes: 50 / 50 average
+
+    "vdup.8     d5, %4                         \n"  // d5 = second row weight
+    "rsb        %4, #256                       \n"  // %4 = 256 - fraction
+    "vdup.8     d4, %4                         \n"  // d4 = first row weight
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // 16 bytes of first row
+    MEMACCESS(2)
+    "vld1.8     {q1}, [%2]!                    \n"  // 16 bytes of second row
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+    "vmull.u8   q13, d0, d4                    \n"  // row0 * (256 - fraction)
+    "vmull.u8   q14, d1, d4                    \n"
+    "vmlal.u8   q13, d2, d5                    \n"  // + row1 * fraction
+    "vmlal.u8   q14, d3, d5                    \n"
+    "vrshrn.u16 d0, q13, #8                    \n"  // rounding >> 8
+    "vrshrn.u16 d1, q14, #8                    \n"
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"  // store 16 blended bytes
+    "bgt        1b                             \n"  // loop while dst_width > 0
+    "b          99f                            \n"  // done
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // 16 bytes of first row
+    MEMACCESS(2)
+    "vld1.8     {q1}, [%2]!                    \n"  // 16 bytes of second row
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+    "vrhadd.u8  q0, q1                         \n"  // rounding average of rows
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"  // store 16 averaged bytes
+    "bgt        50b                            \n"  // loop while dst_width > 0
+    "b          99f                            \n"  // done
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "vld1.8     {q0}, [%1]!                    \n"  // 16 bytes of first row
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+    MEMACCESS(0)
+    "vst1.8     {q0}, [%0]!                    \n"  // store them unchanged
+    "bgt        100b                           \n"  // loop while dst_width > 0
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(y1_fraction)       // %4
+  :
+  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
+  );
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  asm volatile (
+    "subs       %3, #8                         \n"  // at least 8 pixels?
+    "blt        89f                            \n"  // no: go to 1 pixel loop
+    // Blend 8 pixels.
+  "8:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q10, d4, d3                    \n"  // db * a
+    "vmull.u8   q11, d5, d3                    \n"  // dg * a
+    "vmull.u8   q12, d6, d3                    \n"  // dr * a
+    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
+    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
+    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
+    "vqadd.u8   d2, d2, d6                     \n"  // + sr
+    "vmov.u8    d3, #255                       \n"  // a = 255
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
+    "bge        8b                             \n"  // loop while 8 more remain
+
+  "89:                                         \n"
+    "adds       %3, #8-1                       \n"  // %3 = pixels left - 1
+    "blt        99f                            \n"  // none left: done
+
+    // Blend 1 pixel.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
+    MEMACCESS(1)
+    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
+    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
+    "vmull.u8   q10, d4, d3                    \n"  // db * a
+    "vmull.u8   q11, d5, d3                    \n"  // dg * a
+    "vmull.u8   q12, d6, d3                    \n"  // dr * a
+    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
+    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
+    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
+    "vqadd.u8   d2, d2, d6                     \n"  // + sr
+    "vmov.u8    d3, #255                       \n"  // a = 255
+    MEMACCESS(2)
+    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
+    "bge        1b                             \n"  // loop while pixels remain
+
+  "99:                                         \n"
+
+  : "+r"(src_argb0),    // %0
+    "+r"(src_argb1),    // %1
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
+  );
+}
+
+// Attenuate 8 pixels at a time: b, g, r *= a (rounding >> 8); a unchanged.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // Attenuate 8 pixels.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q10, d0, d3                    \n"  // b * a
+    "vmull.u8   q11, d1, d3                    \n"  // g * a
+    "vmull.u8   q12, d2, d3                    \n"  // r * a
+    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
+    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
+    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+  );
+}
+
+// Quantize 8 ARGB pixels (32 bytes) in place; alpha is written back unchanged.
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  asm volatile (
+    "vdup.u16   q8, %2                         \n"  // scale in every u16 lane
+    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
+    "vdup.u16   q9, %3                         \n"  // interval multiply.
+    "vdup.u16   q10, %4                        \n"  // interval add
+
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
+    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
+    "vmovl.u8   q1, d2                         \n"  // g
+    "vmovl.u8   q2, d4                         \n"  // r
+    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
+    "vqdmulh.s16 q1, q1, q8                    \n"  // g
+    "vqdmulh.s16 q2, q2, q8                    \n"  // r
+    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
+    "vmul.u16   q1, q1, q9                     \n"  // g
+    "vmul.u16   q2, q2, q9                     \n"  // r
+    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
+    "vadd.u16   q1, q1, q10                    \n"  // g
+    "vadd.u16   q2, q2, q10                    \n"  // r
+    "vqmovn.u16 d0, q0                         \n"  // saturate b to 8 bit
+    "vqmovn.u16 d2, q1                         \n"  // saturate g to 8 bit
+    "vqmovn.u16 d4, q2                         \n"  // saturate r to 8 bit
+    MEMACCESS(0)
+    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(dst_argb),       // %0
+    "+r"(width)           // %1
+  : "r"(scale),           // %2
+    "r"(interval_size),   // %3
+    "r"(interval_offset)  // %4
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+  );
+}
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  asm volatile (
+    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
+    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
+    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
+
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
+    "vmovl.u8   q11, d22                       \n"  // g
+    "vmovl.u8   q12, d24                       \n"  // r
+    "vmovl.u8   q13, d26                       \n"  // a
+    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
+    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
+    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
+    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
+    "vqmovn.u16 d20, q10                       \n"  // saturate b to 8 bit
+    "vqmovn.u16 d22, q11                       \n"  // saturate g to 8 bit
+    "vqmovn.u16 d24, q12                       \n"  // saturate r to 8 bit
+    "vqmovn.u16 d26, q13                       \n"  // saturate a to 8 bit
+    MEMACCESS(1)
+    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_argb),       // %0
+    "+r"(dst_argb),       // %1
+    "+r"(width)           // %2
+  : "r"(value)            // %3
+  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+  );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
+    "vmov       d1, d0                         \n"  // G
+    "vmov       d2, d0                         \n"  // R
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+  asm volatile (  // in place; alpha (d3) is written back unchanged.
+    "vmov.u8    d20, #17                       \n"  // BB coefficient
+    "vmov.u8    d21, #68                       \n"  // BG coefficient
+    "vmov.u8    d22, #35                       \n"  // BR coefficient
+    "vmov.u8    d24, #22                       \n"  // GB coefficient
+    "vmov.u8    d25, #88                       \n"  // GG coefficient
+    "vmov.u8    d26, #45                       \n"  // GR coefficient
+    "vmov.u8    d28, #24                       \n"  // RB coefficient
+    "vmov.u8    d29, #98                       \n"  // RG coefficient
+    "vmov.u8    d30, #50                       \n"  // RR coefficient
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
+    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
+    "vmlal.u8   q2, d1, d21                    \n"  // G
+    "vmlal.u8   q2, d2, d22                    \n"  // R
+    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
+    "vmlal.u8   q3, d1, d25                    \n"  // G
+    "vmlal.u8   q3, d2, d26                    \n"  // R
+    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
+    "vmlal.u8   q8, d1, d29                    \n"  // G
+    "vmlal.u8   q8, d2, d30                    \n"  // R
+    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
+    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
+    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
+    MEMACCESS(0)
+    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(dst_argb),  // %0
+    "+r"(width)      // %1
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q8",  // q8: Sepia R accumulator
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
+// needs to saturate.  Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {q2}, [%3]                     \n"  // load 16 int8 coefficients.
+    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
+    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
+    "vmovl.u8   q9, d18                        \n"  // g
+    "vmovl.u8   q10, d20                       \n"  // r
+    "vmovl.u8   q11, d22                       \n"  // a
+    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
+    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
+    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
+    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
+    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
+    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
+    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
+    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
+    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
+    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
+    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
+    "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
+    "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
+    "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
+    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
+    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
+    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
+    MEMACCESS(1)
+    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  : "r"(matrix_argb)  // %3
+  : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q0, d0, d1                     \n"  // multiply B
+    "vmull.u8   q1, d2, d3                     \n"  // multiply G
+    "vmull.u8   q2, d4, d5                     \n"  // multiply R
+    "vmull.u8   q3, d6, d7                     \n"  // multiply A
+    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
+    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
+    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
+    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while width > 0
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_NEON
+
+// Add 2 rows of ARGB pixels together (saturating), 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
+    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while width > 0
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+
+// Subtract 2 rows of ARGB pixels (saturating), 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
+    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while width > 0
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // alpha
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d0, d0, d1                     \n"  // B = saturating add
+    "vmov.u8    d1, d0                         \n"  // G = Sobel
+    "vmov.u8    d2, d0                         \n"  // R = Sobel
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  asm volatile (
+    // 16 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
+    MEMACCESS(1)
+    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+    "vqadd.u8   q0, q0, q1                     \n"  // saturating add
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_y),       // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // alpha
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx (R).
+    MEMACCESS(1)
+    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely (B).
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d1, d0, d2                     \n"  // G = saturating add
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
+
+// SobelX as a matrix is
+// -1  0  1
+// -2  0  2
+// -1  0  1
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0],%5                  \n"  // top
+    MEMACCESS(0)
+    "vld1.8     {d1}, [%0],%6                  \n"  // top, 2 bytes further on
+    "vsubl.u8   q0, d0, d1                     \n"  // top[i] - top[i + 2]
+    MEMACCESS(1)
+    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1],%6                  \n"  // center, 2 bytes further on
+    "vsubl.u8   q1, d2, d3                     \n"  // center[i] - center[i + 2]
+    "vadd.s16   q0, q0, q1                     \n"  // add the center difference
+    "vadd.s16   q0, q0, q1                     \n"  // ... twice (weight 2)
+    MEMACCESS(2)
+    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
+    MEMACCESS(2)
+    "vld1.8     {d3}, [%2],%6                  \n"  // bottom, 2 bytes further on
+    "subs       %4, %4, #8                     \n"  // 8 pixels
+    "vsubl.u8   q1, d2, d3                     \n"  // bottom[i] - bottom[i + 2]
+    "vadd.s16   q0, q0, q1                     \n"  // add the bottom difference
+    "vabs.s16   q0, q0                         \n"  // absolute value
+    "vqmovn.u16 d0, q0                         \n"  // saturate to 8 bit
+    MEMACCESS(3)
+    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  : "r"(2),            // %5
+    "r"(6)             // %6
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0],%4                  \n"  // left
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1],%4                  \n"  // left of the other row
+    "vsubl.u8   q0, d0, d1                     \n"  // y0[i] - y1[i]
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1],%4                  \n"  // center of the other row
+    "vsubl.u8   q1, d2, d3                     \n"  // y0[i + 1] - y1[i + 1]
+    "vadd.s16   q0, q0, q1                     \n"  // add the center difference
+    "vadd.s16   q0, q0, q1                     \n"  // ... twice (weight 2)
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0],%5                  \n"  // right
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1],%5                  \n"  // right of the other row
+    "subs       %3, %3, #8                     \n"  // 8 pixels
+    "vsubl.u8   q1, d2, d3                     \n"  // y0[i + 2] - y1[i + 2]
+    "vadd.s16   q0, q0, q1                     \n"  // add the right difference
+    "vabs.s16   q0, q0                         \n"  // absolute value
+    "vqmovn.u16 d0, q0                         \n"  // saturate to 8 bit
+    MEMACCESS(2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  : "r"(1),            // %4
+    "r"(6)             // %5
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/row_neon64.cc b/libs/libyuv/source/row_neon64.cc
new file mode 100644
index 0000000000..0b5ca05bfb
--- /dev/null
+++ b/libs/libyuv/source/row_neon64.cc
@@ -0,0 +1,2961 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422; advances %0 by 8 and %1, %2 by 4 bytes.
+#define READYUV422                                                             \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y -> v0             */ \
+    MEMACCESS(1)                                                               \
+    "ld1        {v1.s}[0], [%1], #4            \n" /* 4 U -> v1.s[0]        */ \
+    MEMACCESS(2)                                                               \
+    "ld1        {v1.s}[1], [%2], #4            \n"  /* 4 V -> v1.s[1] */
+
+// Read 8 Y, 2 U and 2 V from 411; each chroma byte is doubled to 4 U and 4 V.
+#define READYUV411                                                             \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y -> v0             */ \
+    MEMACCESS(1)                                                               \
+    "ld1        {v2.h}[0], [%1], #2            \n" /* 2 U -> v2.h[0]        */ \
+    MEMACCESS(2)                                                               \
+    "ld1        {v2.h}[1], [%2], #2            \n" /* 2 V -> v2.h[1]        */ \
+    "zip1       v1.8b, v2.8b, v2.8b            \n"  /* v1 = U0 U0 U1 U1 V0 V0 V1 V1 */
+
+// Read 8 Y, 8 U and 8 V from 444; adjacent chroma pairs are averaged to 4 + 4.
+#define READYUV444                                                             \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y -> v0             */ \
+    MEMACCESS(1)                                                               \
+    "ld1        {v1.d}[0], [%1], #8            \n" /* 8 U -> v1.d[0]        */ \
+    MEMACCESS(2)                                                               \
+    "ld1        {v1.d}[1], [%2], #8            \n" /* 8 V -> v1.d[1]        */ \
+    "uaddlp     v1.8h, v1.16b                  \n" /* sum adjacent pairs    */ \
+    "rshrn      v1.8b, v1.8h, #1               \n"  /* rounded average: 4 U, 4 V */
+
+// Read 8 Y, and set 4 U and 4 V to 128 (all 8 bytes of v1 become 128).
+#define READYUV400                                                             \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y -> v0             */ \
+    "movi       v1.8b , #128                   \n"  /* U = V = 128 */
+
+// Read 8 Y and 4 UV from NV12; de-interleaves UV so v1 holds 4 U then 4 V.
+#define READNV12                                                               \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y -> v0             */ \
+    MEMACCESS(1)                                                               \
+    "ld1        {v2.8b}, [%1], #8              \n" /* 4 UV pairs -> v2      */ \
+    "uzp1       v1.8b, v2.8b, v2.8b            \n" /* even bytes (U) -> v1  */ \
+    "uzp2       v3.8b, v2.8b, v2.8b            \n" /* odd bytes (V) -> v3   */ \
+    "ins        v1.s[1], v3.s[0]               \n"  /* v1 = 4 U then 4 V */
+
+// Read 8 Y and 4 VU from NV21; de-interleaves VU so v1 holds 4 U then 4 V.
+#define READNV21                                                               \
+    MEMACCESS(0)                                                               \
+    "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y -> v0             */ \
+    MEMACCESS(1)                                                               \
+    "ld1        {v2.8b}, [%1], #8              \n" /* 4 VU pairs -> v2      */ \
+    "uzp1       v3.8b, v2.8b, v2.8b            \n" /* even bytes (V) -> v3  */ \
+    "uzp2       v1.8b, v2.8b, v2.8b            \n" /* odd bytes (U) -> v1   */ \
+    "ins        v1.s[1], v3.s[0]               \n"  /* v1 = 4 U then 4 V */
+
+// Read 8 YUY2 pixels (16 bytes): Y -> v0, then 4 U and 4 V are packed into v1.
+#define READYUY2                                                               \
+    MEMACCESS(0)                                                               \
+    "ld2        {v0.8b, v1.8b}, [%0], #16      \n" /* Y -> v0, UV -> v1     */ \
+    "uzp2       v3.8b, v1.8b, v1.8b            \n" /* odd bytes (V) -> v3   */ \
+    "uzp1       v1.8b, v1.8b, v1.8b            \n" /* even bytes (U) -> v1  */ \
+    "ins        v1.s[1], v3.s[0]               \n"  /* v1 = 4 U then 4 V */
+
+// Read 8 UYVY pixels (16 bytes): Y -> v0, then 4 U and 4 V are packed into v1.
+#define READUYVY                                                               \
+    MEMACCESS(0)                                                               \
+    "ld2        {v2.8b, v3.8b}, [%0], #16      \n" /* UV -> v2, Y -> v3     */ \
+    "orr        v0.8b, v3.8b, v3.8b            \n" /* copy Y to v0          */ \
+    "uzp1       v1.8b, v2.8b, v2.8b            \n" /* even bytes (U) -> v1  */ \
+    "uzp2       v3.8b, v2.8b, v2.8b            \n" /* odd bytes (V) -> v3   */ \
+    "ins        v1.s[1], v3.s[0]               \n"  /* v1 = 4 U then 4 V */
+
+#define YUVTORGB_SETUP                                                         \
+    /* One ld3r replicates the three consecutive 16 bit biases; unlike a    */ \
+    /* post-indexed ld1r it never writes to the input-only base register.   */ \
+    "ld3r       {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" /* B,G,R bias */ \
+    "ld1r       {v31.4s}, [%[kYToRgb]]         \n" /* Y scale, 32 bit lanes */ \
+    "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n" /* U->B, V->R factors    */ \
+    "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"  /* U->G, V->G factors */
+
+#define YUVTORGB(vR, vG, vB)                                                   \
+    "uxtl       v0.8h, v0.8b                   \n" /* widen Y to 16 bit     */ \
+    "shll       v2.8h, v1.8b, #8               \n" /* UV << 8               */ \
+    "ushll2     v3.4s, v0.8h, #0               \n" /* high 4 Y to 32 bit    */ \
+    "ushll      v0.4s, v0.4h, #0               \n" /* low 4 Y to 32 bit     */ \
+    "mul        v3.4s, v3.4s, v31.4s           \n" /* Y * kYToRgb (high)    */ \
+    "mul        v0.4s, v0.4s, v31.4s           \n" /* Y * kYToRgb (low)     */ \
+    "sqshrun    v0.4h, v0.4s, #16              \n" /* >> 16, saturate       */ \
+    "sqshrun2   v0.8h, v3.4s, #16              \n" /* v0.8h = scaled Y      */ \
+    "uaddw      v1.8h, v2.8h, v1.8b            \n" /* double each UV byte   */ \
+    "mov        v2.d[0], v1.d[1]               \n" /* V bytes -> v2 low     */ \
+    "uxtl       v2.8h, v2.8b                   \n" /* widen V to 16 bit     */ \
+    "uxtl       v1.8h, v1.8b                   \n" /* widen U to 16 bit     */ \
+    "mul        v3.8h, v1.8h, v27.8h           \n" /* U term of B           */ \
+    "mul        v5.8h, v1.8h, v29.8h           \n" /* U term of G           */ \
+    "mul        v6.8h, v2.8h, v30.8h           \n" /* V term of G           */ \
+    "mul        v7.8h, v2.8h, v28.8h           \n" /* V term of R           */ \
+    "sqadd      v6.8h, v6.8h, v5.8h            \n" /* G chroma term         */ \
+    "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B = bias + Y          */ \
+    "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G = bias + Y          */ \
+    "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R = bias + Y          */ \
+    "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B += U term           */ \
+    "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G -= chroma term      */ \
+    "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R += V term           */ \
+    "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B >> 6, sat to u8     */ \
+    "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G >> 6, sat to u8     */ \
+    "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R >> 6, sat to u8     */ \
+
+#ifdef HAS_I444TOARGBROW_NEON
+void I444ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (  // I444 -> ARGB, 8 pixels per pass, repeats while width > 0.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+    "movi       v23.8b, #255                   \n" /* A */
+  "1:                                          \n"
+    READYUV444
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    "subs       %w4, %w4, #8                   \n"  // 8 pixels done
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"  // B,G,R,A bytes
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I444TOARGBROW_NEON
+
+#ifdef HAS_I422TOARGBROW_NEON
+void I422ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (  // I422 -> ARGB, 8 pixels per pass, repeats while width > 0.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+    "movi       v23.8b, #255                   \n" /* A */
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    "subs       %w4, %w4, #8                   \n"  // 8 pixels done
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"  // B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TOARGBROW_NEON
+
+#ifdef HAS_I422ALPHATOARGBROW_NEON
+void I422AlphaToARGBRow_NEON(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             const uint8* src_a,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  asm volatile (  // I422 + alpha plane -> ARGB, 8 pixels per pass.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    MEMACCESS(3)
+    "ld1        {v23.8b}, [%3], #8             \n"  // 8 alpha bytes from src_a
+    "subs       %w5, %w5, #8                   \n"  // 8 pixels done
+    MEMACCESS(4)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32     \n"  // B,G,R,A
+    "b.gt       1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(src_a),     // %3
+      "+r"(dst_argb),  // %4
+      "+r"(width)      // %5
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422ALPHATOARGBROW_NEON
+
+#ifdef HAS_I411TOARGBROW_NEON
+void I411ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (  // I411 -> ARGB, 8 pixels per pass, repeats while width > 0.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+    "movi       v23.8b, #255                   \n" /* A */
+  "1:                                          \n"
+    READYUV411
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    "subs       %w4, %w4, #8                   \n"  // 8 pixels done
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"  // B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I411TOARGBROW_NEON
+
+#ifdef HAS_I422TORGBAROW_NEON
+void I422ToRGBARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_rgba,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (  // I422 -> RGBA, 8 pixels per pass, repeats while width > 0.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+    "movi       v20.8b, #255                   \n" /* A */
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v23, v22, v21)  // R -> v23, G -> v22, B -> v21
+    "subs       %w4, %w4, #8                   \n"  // 8 pixels done
+    MEMACCESS(3)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"  // A,B,G,R
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_rgba),  // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TORGBAROW_NEON
+
+#ifdef HAS_I422TORGB24ROW_NEON
+void I422ToRGB24Row_NEON(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  asm volatile (  // I422 -> RGB24, 8 pixels per pass, repeats while width > 0.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    "subs       %w4, %w4, #8                   \n"  // 8 pixels done
+    MEMACCESS(3)
+    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"  // B,G,R bytes
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_rgb24), // %3
+      "+r"(width)      // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TORGB24ROW_NEON
+
+#define ARGBTORGB565                                                           \
+    "shll       v0.8h,  v22.8b, #8             \n"  /* R << 8 (16 bit lane) */ \
+    "shll       v20.8h, v20.8b, #8             \n"  /* B << 8               */ \
+    "shll       v21.8h, v21.8b, #8             \n"  /* G << 8               */ \
+    "sri        v0.8h,  v21.8h, #5             \n"  /* RG: keep top 5 of R  */ \
+    "sri        v0.8h,  v20.8h, #11            \n"  /* RGB: keep top 11     */
+
+#ifdef HAS_I422TORGB565ROW_NEON
+void I422ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  asm volatile (  // I422 -> RGB565, 8 pixels per pass, repeats while width > 0.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    "subs       %w4, %w4, #8                   \n"  // 8 pixels done
+    ARGBTORGB565  // packs v22/v21/v20 into 16 bit lanes of v0
+    MEMACCESS(3)
+    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_rgb565),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TORGB565ROW_NEON
+
+#define ARGBTOARGB1555                                                         \
+    "shll       v0.8h,  v23.8b, #8             \n"  /* A << 8 (16 bit lane) */ \
+    "shll       v22.8h, v22.8b, #8             \n"  /* R << 8               */ \
+    "shll       v20.8h, v20.8b, #8             \n"  /* B << 8               */ \
+    "shll       v21.8h, v21.8b, #8             \n"  /* G << 8               */ \
+    "sri        v0.8h,  v22.8h, #1             \n"  /* AR: keep top 1 bit   */ \
+    "sri        v0.8h,  v21.8h, #6             \n"  /* ARG: keep top 6      */ \
+    "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB: keep top 11    */
+
+#ifdef HAS_I422TOARGB1555ROW_NEON
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  asm volatile (  // I422 -> ARGB1555, 8 pixels per pass while width > 0.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+    "movi       v23.8b, #255                   \n"  // A; not modified in loop
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    "subs       %w4, %w4, #8                   \n"  // 8 pixels done
+    ARGBTOARGB1555  // packs v23/v22/v21/v20 into 16 bit lanes of v0
+    MEMACCESS(3)
+    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB1555.
+    "b.gt       1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb1555),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TOARGB1555ROW_NEON
+
+#define ARGBTOARGB4444                                                         \
+    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
+    "ushr       v20.8b, v20.8b, #4             \n"  /* B >> 4 (low nibble)  */ \
+    "bic        v21.8b, v21.8b, v4.8b          \n"  /* G high nibble kept   */ \
+    "ushr       v22.8b, v22.8b, #4             \n"  /* R >> 4 (low nibble)  */ \
+    "bic        v23.8b, v23.8b, v4.8b          \n"  /* A high nibble kept   */ \
+    "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG byte              */ \
+    "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA byte              */ \
+    "zip1       v0.16b, v0.16b, v1.16b         \n"  /* interleave BG, RA    */
+
+#ifdef HAS_I422TOARGB4444ROW_NEON
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  asm volatile (  // I422 -> ARGB4444, 8 pixels per pass while width > 0.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    "subs       %w4, %w4, #8                   \n"  // 8 pixels done
+    "movi       v23.8b, #255                   \n"  // A; reset as macro edits it
+    ARGBTOARGB4444
+    MEMACCESS(3)
+    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
+    "b.gt       1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb4444),  // %3
+      "+r"(width)     // %4
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I422TOARGB4444ROW_NEON
+
+#ifdef HAS_I400TOARGBROW_NEON
+void I400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  int64 width64 = (int64)(width);  // the asm only uses its low 32 bits (%w2)
+  asm volatile (  // Y only -> ARGB using kYuvI601Constants, 8 pixels per pass.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+    "movi       v23.8b, #255                   \n"  // A
+  "1:                                          \n"
+    READYUV400
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    "subs       %w2, %w2, #8                   \n"  // 8 pixels done
+    MEMACCESS(1)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"  // B,G,R,A
+    "b.gt       1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width64)    // %2
+    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
+      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
+      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
+      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_I400TOARGBROW_NEON
+
+#ifdef HAS_J400TOARGBROW_NEON
+void J400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // Copies Y unchanged into B, G and R; 8 pixels per pass.
+    "movi       v23.8b, #255                   \n"  // A
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v20.8b}, [%0], #8             \n"  // 8 Y -> B
+    "orr        v21.8b, v20.8b, v20.8b         \n"  // G = Y
+    "orr        v22.8b, v20.8b, v20.8b         \n"  // R = Y
+    "subs       %w2, %w2, #8                   \n"  // 8 pixels done
+    MEMACCESS(1)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"  // B,G,R,A
+    "b.gt       1b                             \n"  // repeat while width > 0
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    :
+    : "cc", "memory", "v20", "v21", "v22", "v23"
+  );
+}
+#endif  // HAS_J400TOARGBROW_NEON
+
+#ifdef HAS_NV12TOARGBROW_NEON
+void NV12ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (  // NV12 -> ARGB, 8 pixels per pass, repeats while width > 0.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+    "movi       v23.8b, #255                   \n"  // A
+  "1:                                          \n"
+    READNV12
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    "subs       %w3, %w3, #8                   \n"  // 8 pixels done
+    MEMACCESS(2)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"  // B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_NV12TOARGBROW_NEON
+
+#ifdef HAS_NV12TOARGBROW_NEON  // NOTE(review): NV21 is gated on the NV12 macro; confirm intended.
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_vu,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  asm volatile (  // NV21 -> ARGB, 8 pixels per pass, repeats while width > 0.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+    "movi       v23.8b, #255                   \n"  // A
+  "1:                                          \n"
+    READNV21
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    "subs       %w3, %w3, #8                   \n"  // 8 pixels done
+    MEMACCESS(2)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"  // B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_vu),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_NV12TOARGBROW_NEON
+
+#ifdef HAS_NV12TORGB565ROW_NEON
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  asm volatile (  // NV12 -> RGB565, 8 pixels per pass, repeats while width > 0.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+  "1:                                          \n"
+    READNV12
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    "subs       %w3, %w3, #8                   \n"  // 8 pixels done
+    ARGBTORGB565  // packs v22/v21/v20 into 16 bit lanes of v0
+    MEMACCESS(2)
+    "st1        {v0.8h}, [%2], #16             \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_NV12TORGB565ROW_NEON
+
+#ifdef HAS_YUY2TOARGBROW_NEON
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  int64 width64 = (int64)(width);  // the asm only uses its low 32 bits (%w2)
+  asm volatile (  // YUY2 -> ARGB, 8 pixels per pass, repeats while width > 0.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+    "movi       v23.8b, #255                   \n"  // A
+  "1:                                          \n"
+    READYUY2
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    "subs       %w2, %w2, #8                   \n"  // 8 pixels done
+    MEMACCESS(1)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"  // B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_yuy2),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width64)    // %2
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_YUY2TOARGBROW_NEON
+
+#ifdef HAS_UYVYTOARGBROW_NEON
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  int64 width64 = (int64)(width);  // the asm only uses its low 32 bits (%w2)
+  asm volatile (  // UYVY -> ARGB, 8 pixels per pass, repeats while width > 0.
+    YUVTORGB_SETUP  // loads v24-v31; all of them must be listed as clobbers
+    "movi       v23.8b, #255                   \n"  // A
+  "1:                                          \n"
+    READUYVY
+    YUVTORGB(v22, v21, v20)  // R -> v22, G -> v21, B -> v20
+    "subs       %w2, %w2, #8                   \n"  // 8 pixels done
+    MEMACCESS(1)
+    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"  // B,G,R,A
+    "b.gt       1b                             \n"
+    : "+r"(src_uyvy),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width64)    // %2
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+  );
+}
+#endif  // HAS_UYVYTOARGBROW_NEON
+
+// Reads 16 pairs of UV and writes even bytes to dst_u and odd bytes to dst_v.
+#ifdef HAS_SPLITUVROW_NEON
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  asm volatile (  // 16 pairs per pass; repeats while the remaining width > 0.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store U
+    MEMACCESS(2)
+    "st1        {v1.16b}, [%2], #16            \n"  // store V
+    "b.gt       1b                             \n"
+    : "+r"(src_uv),  // %0
+      "+r"(dst_u),   // %1
+      "+r"(dst_v),   // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+#endif  // HAS_SPLITUVROW_NEON
+
+// Reads 16 U's and V's and writes out 16 interleaved pairs of UV per pass.
+#ifdef HAS_MERGEUVROW_NEON
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  asm volatile (  // 16 pairs per pass; repeats while the remaining width > 0.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load U
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"  // load V
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+    MEMACCESS(2)
+    "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
+    "b.gt       1b                             \n"
+    :
+      "+r"(src_u),   // %0
+      "+r"(src_v),   // %1
+      "+r"(dst_uv),  // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+#endif  // HAS_MERGEUVROW_NEON
+
+// Copy multiple of 32 bytes: each pass moves 32 bytes with a 4 register ld1/st1.
+#ifdef HAS_COPYROW_NEON
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+  asm volatile (  // Runs at least once; repeats while the remaining count > 0.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
+    "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
+    MEMACCESS(1)
+    "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
+    "b.gt       1b                             \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2  // Output registers
+  :                     // Input registers
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#endif  // HAS_COPYROW_NEON
+
+// SetRow fills dst with the byte v8, 16 bytes per pass (count is rounded up to 16).
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+  asm volatile (
+    "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
+  "1:                                          \n"
+    "subs      %w1, %w1, #16                   \n"  // 16 bytes per loop
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"  // store
+    "b.gt      1b                              \n"  // repeat while count > 0
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v8)      // %2
+  : "cc", "memory", "v0"
+  );
+}
+
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+  asm volatile (  // Fills dst with the 32-bit value v32, 4 copies per iteration.
+    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
+  "1:                                          \n"
+    "subs      %w1, %w1, #4                    \n"  // 4 ints per loop
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"  // store
+    "b.gt      1b                              \n"  // repeat while count > 0
+  : "+r"(dst),   // %0  // advanced by the post-indexed store
+    "+r"(count)  // %1  // counted in 32-bit values, not bytes
+  : "r"(v32)     // %2  // fill value, read only
+  : "cc", "memory", "v0"
+  );
+}
+
+#ifdef HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  int64 width64 = (int64) width;  // widened: added to the 64-bit src pointer
+  asm volatile (  // Writes src bytes to dst in reverse order, 16 per iteration.
+    // Start at end of source row.
+    "add        %0, %0, %2                     \n"  // src += width
+    "sub        %0, %0, #16                    \n"  // back up one 16-byte chunk
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %2, %2, #16                   \n"  // 16 pixels per loop.
+    "rev64      v0.16b, v0.16b                 \n"  // reverse bytes per 8-byte half
+    MEMACCESS(1)
+    "st1        {v0.D}[1], [%1], #8            \n"  // high half first; dst += 16
+    MEMACCESS(1)
+    "st1        {v0.D}[0], [%1], #8            \n"  // then low half
+    "b.gt       1b                             \n"  // repeat while width64 > 0
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width64)  // %2
+  : "r"((ptrdiff_t)-16)    // %3  // post-index step that walks src backwards
+  : "cc", "memory", "v0"
+  );
+}
+#endif  // HAS_MIRRORROW_NEON
+
+#ifdef HAS_MIRRORUVROW_NEON
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width) {
+  int64 width64 = (int64) width;  // widened: added to the 64-bit src pointer
+  asm volatile (  // Splits interleaved UV into reversed U and V, 8 pairs/loop.
+    // Start at end of source row.
+    "add        %0, %0, %3, lsl #1             \n"  // src_uv += width * 2
+    "sub        %0, %0, #16                    \n"  // back up 8 UV pairs
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
+    "subs       %3, %3, #8                     \n"  // 8 pixels per loop.
+    "rev64      v0.8b, v0.8b                   \n"  // reverse the 8 U bytes
+    "rev64      v1.8b, v1.8b                   \n"  // reverse the 8 V bytes
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
+    MEMACCESS(2)
+    "st1        {v1.8b}, [%2], #8              \n"  // dst_v += 8
+    "b.gt       1b                             \n"  // repeat while width64 > 0
+  : "+r"(src_uv),  // %0
+    "+r"(dst_u),   // %1
+    "+r"(dst_v),   // %2
+    "+r"(width64)    // %3
+  : "r"((ptrdiff_t)-16)      // %4  // post-index step that walks src backwards
+  : "cc", "memory", "v0", "v1"
+  );
+}
+#endif  // HAS_MIRRORUVROW_NEON
+
+#ifdef HAS_ARGBMIRRORROW_NEON
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  int64 width64 = (int64) width;  // widened: added to the 64-bit src pointer
+  asm volatile (  // Reverses the order of 4-byte pixels, 4 pixels per iteration.
+    // Start at end of source row.
+    "add        %0, %0, %2, lsl #2             \n"  // src += width * 4
+    "sub        %0, %0, #16                    \n"  // back up 4 pixels
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    "rev64      v0.4s, v0.4s                   \n"  // swap the 2 pixels per half
+    MEMACCESS(1)
+    "st1        {v0.D}[1], [%1], #8            \n"  // high half first; dst += 16
+    MEMACCESS(1)
+    "st1        {v0.D}[0], [%1], #8            \n"  // then low half
+    "b.gt       1b                             \n"  // repeat while width64 > 0
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width64)  // %2
+  : "r"((ptrdiff_t)-16)    // %3  // post-index step that walks src backwards
+  : "cc", "memory", "v0"
+  );
+}
+#endif  // HAS_ARGBMIRRORROW_NEON
+
+#ifdef HAS_RGB24TOARGBROW_NEON
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  asm volatile (  // Expands 3-byte pixels to 4 bytes by appending 255.
+    "movi       v4.8b, #255                    \n"  // Alpha
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    MEMACCESS(1)
+    "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)         // %2
+  :
+  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+#endif  // HAS_RGB24TOARGBROW_NEON
+
+#ifdef HAS_RAWTOARGBROW_NEON
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
+  asm volatile (  // Reverses each 3-byte pixel's byte order and appends 255.
+    "movi       v5.8b, #255                    \n"  // Alpha
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
+    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
+    MEMACCESS(1)
+    "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_raw),   // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
+  );
+}
+#endif  // HAS_RAWTOARGBROW_NEON
+
+void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
+  asm volatile (  // Swaps the 1st and 3rd byte of every 3-byte pixel, 8 per loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
+    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
+    MEMACCESS(1)
+    "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_raw),    // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+
+#define RGB565TOARGB  /* in: v0.8h RGB565; out: v0=B v1=G v2=R; tmp v4,v6 */   \
+    "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \
+    "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \
+    "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \
+    "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \
+    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
+    "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \
+    "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \
+    "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \
+    "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \
+    "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B: B low, R high   */ \
+    "dup        v2.2D, v0.D[1]                 \n"  /* R                    */
+
+#ifdef HAS_RGB565TOARGBROW_NEON
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
+  asm volatile (  // Expands 16-bit RGB565 pixels to 4 bytes each, alpha = 255.
+    "movi       v3.8b, #255                    \n"  // Alpha
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    RGB565TOARGB  // v0 = B, v1 = G, v2 = R; uses v4 and v6 as scratch
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
+  );
+}
+#endif  // HAS_RGB565TOARGBROW_NEON
+
+#define ARGB1555TOARGB  /* in: v0.8h; out (low 8 bytes): v0=B v1=G v2=R v3=A */ \
+    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
+    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
+    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \
+                                                                               \
+    "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA (bit 15)  */ \
+    "xtn2       v3.16b, v2.8h                  \n"  /* A into upper half    */ \
+                                                                               \
+    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
+    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
+                                                                               \
+    "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \
+    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
+    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
+                                                                               \
+    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
+    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \
+    "dup        v1.2D, v0.D[1]                 \n"  /* G                    */ \
+    "dup        v3.2D, v2.D[1]                 \n"  /* A                    */
+
+// RGB555TOARGB: ARGB1555TOARGB minus alpha. v0.8h in; v0=B v1=G v2=R; v3 tmp.
+#define RGB555TOARGB                                                           \
+    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
+    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
+    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \
+                                                                               \
+    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
+    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
+                                                                               \
+    "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \
+    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
+    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
+                                                                               \
+    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
+    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \
+    "dup        v1.2D, v0.D[1]                 \n"  /* G; last line: no '\' */
+
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                            int width) {
+  asm volatile (  // Expands 16-bit ARGB1555 pixels to 4 bytes each, 8 per loop.
+    "movi       v3.8b, #255                    \n"  // Alpha (macro rewrites v3)
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGB1555TOARGB  // v0 = B, v1 = G, v2 = R, v3 = A taken from bit 15
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#endif  // HAS_ARGB1555TOARGBROW_NEON
+
+#define ARGB4444TOARGB  /* in: v0.8h; out (low 8 bytes): v0=B v1=G v2=R v3=A */ \
+    "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \
+    "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \
+    "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \
+    "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \
+    "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \
+    "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \
+    "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \
+    "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \
+    "dup        v0.2D, v2.D[1]                 \n"  /* B from upper half    */ \
+    "dup        v1.2D, v3.D[1]                 \n"  /* G from upper half    */
+
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                            int width) {
+  asm volatile (  // Expands 16-bit ARGB4444 pixels to 4 bytes each, 8 per loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGB4444TOARGB  // each nibble n becomes byte (n << 4) | n
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+#endif  // HAS_ARGB4444TOARGBROW_NEON
+
+#ifdef HAS_ARGBTORGB24ROW_NEON
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
+  asm volatile (  // Drops the 4th byte of every 4-byte pixel, 8 pixels per loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    MEMACCESS(1)
+    "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),   // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(width)         // %2
+  :
+  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTORGB24ROW_NEON
+
+#ifdef HAS_ARGBTORAWROW_NEON
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
+  asm volatile (  // Drops alpha and reverses b g r to r g b, 8 pixels per loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g (overwrites a)
+    "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
+    MEMACCESS(1)
+    "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_raw),   // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTORAWROW_NEON
+
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
+  asm volatile (  // Keeps the even-indexed source bytes (Y in YUY2).
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+#endif  // HAS_YUY2TOYROW_NEON
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
+  asm volatile (  // Keeps the odd-indexed source bytes (Y in UYVY).
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
+    MEMACCESS(1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1"  // Clobber List
+  );
+}
+#endif  // HAS_UYVYTOYROW_NEON
+
+#ifdef HAS_YUY2TOUV422ROW_NEON
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (  // ld4 splits Y0 U Y1 V into v0..v3; only U and V are stored.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
+    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
+    MEMACCESS(2)
+    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#endif  // HAS_YUY2TOUV422ROW_NEON
+
+#ifdef HAS_UYVYTOUV422ROW_NEON
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (  // ld4 splits U Y0 V Y1 into v0..v3; only U and V are stored.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
+    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
+    MEMACCESS(2)
+    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#endif  // HAS_UYVYTOUV422ROW_NEON
+
+#ifdef HAS_YUY2TOUVROW_NEON
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_yuy2b = src_yuy2 + stride_yuy2;  // second row to average in
+  asm volatile (  // Stores the rounded average of two rows' U and V bytes.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
+    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
+    MEMACCESS(2)
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
+    MEMACCESS(3)
+    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_yuy2),     // %0
+    "+r"(src_yuy2b),    // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(width)           // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+    "v5", "v6", "v7"  // Clobber List
+  );
+}
+#endif  // HAS_YUY2TOUVROW_NEON
+
+#ifdef HAS_UYVYTOUVROW_NEON
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_uyvyb = src_uyvy + stride_uyvy;  // second row to average in
+  asm volatile (  // Stores the rounded average of two rows' U and V bytes.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
+    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
+    MEMACCESS(3)
+    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_uyvy),     // %0
+    "+r"(src_uyvyb),    // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(width)           // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+    "v5", "v6", "v7"  // Clobber List
+  );
+}
+#endif  // HAS_UYVYTOUVROW_NEON
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width) {
+  asm volatile (  // Permutes the bytes of each 16-byte group via 'shuffler'.
+    MEMACCESS(3)
+    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
+    "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
+    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
+    MEMACCESS(1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)        // %2
+  : "r"(shuffler)    // %3  // 16 byte indices, loaded once before the loop
+  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_NEON
+
+#ifdef HAS_I422TOYUY2ROW_NEON
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width) {
+  asm volatile (  // Interleaves planar Y, U, V into Y0 U Y1 V, 16 pixels/loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
+    "orr        v2.8b, v1.8b, v1.8b            \n"  // odd Ys to v2; frees v1
+    MEMACCESS(1)
+    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
+    MEMACCESS(2)
+    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels
+    MEMACCESS(3)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_yuy2),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"
+  );
+}
+#endif  // HAS_I422TOYUY2ROW_NEON
+
+#ifdef HAS_I422TOUYVYROW_NEON
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width) {
+  asm volatile (  // Interleaves planar Y, U, V into U Y0 V Y1, 16 pixels/loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
+    "orr        v3.8b, v2.8b, v2.8b            \n"  // odd Ys to v3; frees v2
+    MEMACCESS(1)
+    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
+    MEMACCESS(2)
+    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
+    "subs       %w4, %w4, #16                  \n"  // 16 pixels
+    MEMACCESS(3)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_uyvy),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"
+  );
+}
+#endif  // HAS_I422TOUYVYROW_NEON
+
+#ifdef HAS_ARGBTORGB565ROW_NEON
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
+  asm volatile (  // Reads 32 bytes and writes 16 bytes per 8-pixel iteration.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGBTORGB565  // defined elsewhere; presumably packs v20-v22 into v0 - confirm
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_rgb565),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
+  );
+}
+#endif  // HAS_ARGBTORGB565ROW_NEON
+
+#ifdef HAS_ARGBTORGB565DITHERROW_NEON
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width) {
+  asm volatile (  // Saturating-adds dither4's bytes to 3 channels, then packs.
+    "dup        v1.4s, %w3                     \n"  // dither4
+  "1:                                          \n"
+    MEMACCESS(1)
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uqadd      v20.8b, v20.8b, v1.8b          \n"  // pixel i gets byte i % 4
+    "uqadd      v21.8b, v21.8b, v1.8b          \n"
+    "uqadd      v22.8b, v22.8b, v1.8b          \n"
+    ARGBTORGB565  // defined elsewhere; presumably packs v20-v22 into v0 - confirm
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(dst_rgb),   // %0  // read-write: post-incremented by st1
+    "+r"(src_argb),  // %1  // read-write: post-incremented by ld4
+    "+r"(width)      // %2  // read-write: decremented by subs
+  : "r"(dither4)     // %3  // input only; never written by the asm
+  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
+  );
+}
+#endif  // HAS_ARGBTORGB565DITHERROW_NEON
+
+#ifdef HAS_ARGBTOARGB1555ROW_NEON
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+                            int width) {
+  asm volatile (  // Reads 32 bytes and writes 16 bytes per 8-pixel iteration.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGBTOARGB1555  // defined elsewhere; presumably packs v20-v23 into v0
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb1555),  // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
+  );
+}
+#endif  // HAS_ARGBTOARGB1555ROW_NEON
+
+#ifdef HAS_ARGBTOARGB4444ROW_NEON
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+                            int width) {
+  asm volatile (  // Reads 32 bytes and writes 16 bytes per 8-pixel iteration.
+    "movi       v4.16b, #0x0f                  \n"  // low-nibble mask (see macro)
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGBTOARGB4444  // defined elsewhere; presumably uses v4 and v1 - confirm
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),      // %0
+    "+r"(dst_argb4444),  // %1
+    "+r"(width)            // %2
+  :
+  : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
+  );
+}
+#endif  // HAS_ARGBTOARGB4444ROW_NEON
+
+#ifdef HAS_ARGBTOYROW_NEON
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (  // Y = sat(((13*B + 65*G + 33*R) >> 7, rounded) + 16).
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+#endif  // HAS_ARGBTOYROW_NEON
+
+#ifdef HAS_ARGBTOYJROW_NEON
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+  asm volatile (  // Y = (15*B + 75*G + 38*R) >> 7, rounded; no +16 offset.
+    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
+    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
+    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+  );
+}
+#endif  // HAS_ARGBTOYJROW_NEON
+
+// 8x1 pixels.  One U and one V byte are produced per input pixel.
+#ifdef HAS_ARGBTOUV444ROW_NEON
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (  // U = (112B - 74G - 38R + 0x8080) >> 8; V likewise from R.
+    "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
+    "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
+    "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
+    "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
+    "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
+    "movi       v29.16b,#0x80                  \n"  // 128.5 (0x8080 per 16 bits)
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
+    "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
+    "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
+    "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
+
+    "umull      v3.8h, v2.8b, v24.8b           \n"  // R
+    "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
+    "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
+    "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
+
+    "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+    "v24", "v25", "v26", "v27", "v28", "v29"
+  );
+}
+#endif  // HAS_ARGBTOUV444ROW_NEON
+
+#define RGBTOUV_SETUP_REG  /* loads 16-bit U/V constants into v20..v25 */      \
+    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
+    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
+    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
+    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
+    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
+    "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */
+
+// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.
+#ifdef HAS_ARGBTOUV411ROW_NEON
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int width) {
+  asm volatile (  // Each pass: 32 ARGB pixels of one row -> 8 U + 8 V bytes.
+    RGBTOUV_SETUP_REG  // v20-v24 = coefficients, v25 = bias.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B: add adjacent pairs -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G: add adjacent pairs -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R: add adjacent pairs -> 8 shorts.
+    MEMACCESS(0)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
+    "uaddlp     v4.8h, v4.16b                  \n"  // B: add adjacent pairs -> 8 shorts.
+    "uaddlp     v5.8h, v5.16b                  \n"  // G: add adjacent pairs -> 8 shorts.
+    "uaddlp     v6.8h, v6.16b                  \n"  // R: add adjacent pairs -> 8 shorts.
+
+    "addp       v0.8h, v0.8h, v4.8h            \n"  // B: 8 sums of 4 pixels each.
+    "addp       v1.8h, v1.8h, v5.8h            \n"  // G: 8 sums of 4 pixels each.
+    "addp       v2.8h, v2.8h, v6.8h            \n"  // R: 8 sums of 4 pixels each.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // B: rounded sum / 2 = 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"  // G: 2x average
+    "urshr      v2.8h, v2.8h, #1               \n"  // R: 2x average
+
+    "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
+    "mul        v3.8h, v0.8h, v20.8h           \n"  // U  = 56 * B
+    "mls        v3.8h, v1.8h, v21.8h           \n"  // U -= 37 * G
+    "mls        v3.8h, v2.8h, v22.8h           \n"  // U -= 19 * R
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +0x8080 bias -> unsigned
+    "mul        v4.8h, v2.8h, v20.8h           \n"  // V  = 56 * R
+    "mls        v4.8h, v1.8h, v24.8h           \n"  // V -= 47 * G
+    "mls        v4.8h, v0.8h, v23.8h           \n"  // V -= 9 * B
+    "add        v4.8h, v4.8h, v25.8h           \n"  // +0x8080 bias -> unsigned
+    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
+    MEMACCESS(2)
+    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_ARGBTOUV411ROW_NEON
+
+// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) /* B,G,R .8h in -> U in v0.8b, V in v1.8b; uses v3,v4 */ \
+    "mul        v3.8h, " #QB ",v20.8h          \n"  /* U  = QB * v20        */ \
+    "mul        v4.8h, " #QR ",v20.8h          \n"  /* V  = QR * v20        */ \
+    "mls        v3.8h, " #QG ",v21.8h          \n"  /* U -= QG * v21        */ \
+    "mls        v4.8h, " #QG ",v24.8h          \n"  /* V -= QG * v24        */ \
+    "mls        v3.8h, " #QR ",v22.8h          \n"  /* U -= QR * v22        */ \
+    "mls        v4.8h, " #QB ",v23.8h          \n"  /* V -= QB * v23        */ \
+    "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
+    "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
+    "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \
+    "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
+
+#ifdef HAS_ARGBTOUVROW_NEON
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_argb_1 = src_argb + src_stride_argb;  // second source row.
+  asm volatile (  // Each pass: 16 pixels from each of 2 rows -> 8 U + 8 V bytes.
+    RGBTOUV_SETUP_REG  // v20-v24 = coefficients, v25 = bias.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B: add adjacent pairs -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G: add adjacent pairs -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R: add adjacent pairs -> 8 shorts.
+
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 from second row.
+    "uadalp     v0.8h, v4.16b                  \n"  // B: accumulate second-row pairs.
+    "uadalp     v1.8h, v5.16b                  \n"  // G: accumulate second-row pairs.
+    "uadalp     v2.8h, v6.16b                  \n"  // R: accumulate second-row pairs.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // B: rounded 2x2 sum / 2 = 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"  // G: 2x average
+    "urshr      v2.8h, v2.8h, #1               \n"  // R: 2x average
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)  // U -> v0.8b, V -> v1.8b.
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_ARGBTOUVROW_NEON
+
+// TODO(fbarchard): Subsample match C code.
+#ifdef HAS_ARGBTOUVJROW_NEON
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_argb_1 = src_argb + src_stride_argb;  // second source row.
+  asm volatile (  // Each pass: 16 pixels from each of 2 rows -> 8 U + 8 V bytes.
+    "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
+    "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
+    "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
+    "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
+    "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
+    "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B: add adjacent pairs -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G: add adjacent pairs -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R: add adjacent pairs -> 8 shorts.
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load 16 from second row.
+    "uadalp     v0.8h, v4.16b                  \n"  // B: accumulate second-row pairs.
+    "uadalp     v1.8h, v5.16b                  \n"  // G: accumulate second-row pairs.
+    "uadalp     v2.8h, v6.16b                  \n"  // R: accumulate second-row pairs.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // B: rounded 2x2 sum / 2 = 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"  // G: 2x average
+    "urshr      v2.8h, v2.8h, #1               \n"  // R: 2x average
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)  // U -> v0.8b, V -> v1.8b (uses v20-v25 set above).
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_ARGBTOUVJROW_NEON
+
+#ifdef HAS_BGRATOUVROW_NEON
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_bgra_1 = src_bgra + src_stride_bgra;  // second source row.
+  asm volatile (  // Each pass: 16 pixels from each of 2 rows -> 8 U + 8 V bytes.
+    RGBTOUV_SETUP_REG  // v20-v24 = coefficients, v25 = bias.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v3.16b                  \n"  // B (4th byte): pairs -> 8 shorts in v0.
+    "uaddlp     v3.8h, v2.16b                  \n"  // G (3rd byte): pairs -> 8 shorts in v3.
+    "uaddlp     v2.8h, v1.16b                  \n"  // R (2nd byte): pairs -> 8 shorts in v2.
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 from second row.
+    "uadalp     v0.8h, v7.16b                  \n"  // B: accumulate second-row pairs.
+    "uadalp     v3.8h, v6.16b                  \n"  // G: accumulate second-row pairs.
+    "uadalp     v2.8h, v5.16b                  \n"  // R: accumulate second-row pairs.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // B: 2x average
+    "urshr      v1.8h, v3.8h, #1               \n"  // G: 2x average, moved to v1
+    "urshr      v2.8h, v2.8h, #1               \n"  // R: 2x average
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)  // U -> v0.8b, V -> v1.8b.
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_bgra),  // %0
+    "+r"(src_bgra_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_BGRATOUVROW_NEON
+
+#ifdef HAS_ABGRTOUVROW_NEON
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;  // second source row.
+  asm volatile (  // Each pass: 16 pixels from each of 2 rows -> 8 U + 8 V bytes.
+    RGBTOUV_SETUP_REG  // v20-v24 = coefficients, v25 = bias.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v3.8h, v2.16b                  \n"  // B (3rd byte): pairs -> 8 shorts in v3.
+    "uaddlp     v2.8h, v1.16b                  \n"  // G (2nd byte): pairs -> 8 shorts in v2.
+    "uaddlp     v1.8h, v0.16b                  \n"  // R (1st byte): pairs -> 8 shorts in v1.
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 from second row.
+    "uadalp     v3.8h, v6.16b                  \n"  // B: accumulate second-row pairs.
+    "uadalp     v2.8h, v5.16b                  \n"  // G: accumulate second-row pairs.
+    "uadalp     v1.8h, v4.16b                  \n"  // R: accumulate second-row pairs.
+
+    "urshr      v0.8h, v3.8h, #1               \n"  // B: 2x average, moved to v0
+    "urshr      v2.8h, v2.8h, #1               \n"  // G: 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"  // R: 2x average
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    RGBTOUV(v0.8h, v2.8h, v1.8h)  // B = v0, G = v2, R = v1; U -> v0.8b, V -> v1.8b.
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_abgr),  // %0
+    "+r"(src_abgr_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_ABGRTOUVROW_NEON
+
+#ifdef HAS_RGBATOUVROW_NEON
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;  // second source row.
+  asm volatile (  // Each pass: 16 pixels from each of 2 rows -> 8 U + 8 V bytes.
+    RGBTOUV_SETUP_REG  // v20-v24 = coefficients, v25 = bias.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v1.16b                  \n"  // B (2nd byte): pairs -> 8 shorts in v0.
+    "uaddlp     v1.8h, v2.16b                  \n"  // G (3rd byte): pairs -> 8 shorts in v1.
+    "uaddlp     v2.8h, v3.16b                  \n"  // R (4th byte): pairs -> 8 shorts in v2.
+    MEMACCESS(1)
+    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 from second row.
+    "uadalp     v0.8h, v5.16b                  \n"  // B: accumulate second-row pairs.
+    "uadalp     v1.8h, v6.16b                  \n"  // G: accumulate second-row pairs.
+    "uadalp     v2.8h, v7.16b                  \n"  // R: accumulate second-row pairs.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // B: 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"  // G: 2x average
+    "urshr      v2.8h, v2.8h, #1               \n"  // R: 2x average
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)  // U -> v0.8b, V -> v1.8b.
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_rgba),  // %0
+    "+r"(src_rgba_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_RGBATOUVROW_NEON
+
+#ifdef HAS_RGB24TOUVROW_NEON
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;  // second source row.
+  asm volatile (  // Each pass: 16 pixels from each of 2 rows -> 8 U + 8 V bytes.
+    RGBTOUV_SETUP_REG  // v20-v24 = coefficients, v25 = bias.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B: add adjacent pairs -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G: add adjacent pairs -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R: add adjacent pairs -> 8 shorts.
+    MEMACCESS(1)
+    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 from second row.
+    "uadalp     v0.8h, v4.16b                  \n"  // B: accumulate second-row pairs.
+    "uadalp     v1.8h, v5.16b                  \n"  // G: accumulate second-row pairs.
+    "uadalp     v2.8h, v6.16b                  \n"  // R: accumulate second-row pairs.
+
+    "urshr      v0.8h, v0.8h, #1               \n"  // B: 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"  // G: 2x average
+    "urshr      v2.8h, v2.8h, #1               \n"  // R: 2x average
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    RGBTOUV(v0.8h, v1.8h, v2.8h)  // U -> v0.8b, V -> v1.8b.
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_rgb24),  // %0
+    "+r"(src_rgb24_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_RGB24TOUVROW_NEON
+
+#ifdef HAS_RAWTOUVROW_NEON
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_raw_1 = src_raw + src_stride_raw;  // second source row.
+  asm volatile (  // Each pass: 16 pixels from each of 2 rows -> 8 U + 8 V bytes.
+    RGBTOUV_SETUP_REG  // v20-v24 = coefficients, v25 = bias.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
+    "uaddlp     v2.8h, v2.16b                  \n"  // B (3rd byte): pairs -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G (2nd byte): pairs -> 8 shorts.
+    "uaddlp     v0.8h, v0.16b                  \n"  // R (1st byte): pairs -> 8 shorts.
+    MEMACCESS(1)
+    "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 RAW pixels from second row
+    "uadalp     v2.8h, v6.16b                  \n"  // B: accumulate second-row pairs.
+    "uadalp     v1.8h, v5.16b                  \n"  // G: accumulate second-row pairs.
+    "uadalp     v0.8h, v4.16b                  \n"  // R: accumulate second-row pairs.
+
+    "urshr      v2.8h, v2.8h, #1               \n"  // B: 2x average
+    "urshr      v1.8h, v1.8h, #1               \n"  // G: 2x average
+    "urshr      v0.8h, v0.8h, #1               \n"  // R: 2x average
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    RGBTOUV(v2.8h, v1.8h, v0.8h)  // B = v2, G = v1, R = v0; U -> v0.8b, V -> v1.8b.
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_raw),  // %0
+    "+r"(src_raw_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_RAWTOUVROW_NEON
+
+// 16x2 pixels -> 8x1.  width is number of rgb565 pixels. e.g. 16.
+#ifdef HAS_RGB565TOUVROW_NEON
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;  // second source row.
+  asm volatile (  // Each pass: 16 pixels from each of 2 rows -> 8 U + 8 V bytes.
+    "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
+    "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
+    "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
+    "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
+    "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
+    "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB  // macro defined elsewhere; B, G, R are read from v0, v1, v2 below.
+    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels from second row.
+    RGB565TOARGB
+    "uadalp     v16.4h, v0.8b                  \n"  // B: accumulate second-row pairs.
+    "uadalp     v18.4h, v1.8b                  \n"  // G: accumulate second-row pairs.
+    "uadalp     v20.4h, v2.8b                  \n"  // R: accumulate second-row pairs.
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels from second row.
+    RGB565TOARGB
+    "uadalp     v17.4h, v0.8b                  \n"  // B: accumulate second-row pairs.
+    "uadalp     v19.4h, v1.8b                  \n"  // G: accumulate second-row pairs.
+    "uadalp     v21.4h, v2.8b                  \n"  // R: accumulate second-row pairs.
+
+    "ins        v16.D[1], v17.D[0]             \n"  // B: second group of 4 sums -> upper half.
+    "ins        v18.D[1], v19.D[0]             \n"  // G: second group of 4 sums -> upper half.
+    "ins        v20.D[1], v21.D[0]             \n"  // R: second group of 4 sums -> upper half.
+
+    "urshr      v4.8h, v16.8h, #1              \n"  // B: 2x average
+    "urshr      v5.8h, v18.8h, #1              \n"  // G: 2x average
+    "urshr      v6.8h, v20.8h, #1              \n"  // R: 2x average
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    "mul        v16.8h, v4.8h, v22.8h          \n"  // U  = 56 * B
+    "mls        v16.8h, v5.8h, v23.8h          \n"  // U -= 37 * G
+    "mls        v16.8h, v6.8h, v24.8h          \n"  // U -= 19 * R
+    "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
+    "mul        v17.8h, v6.8h, v22.8h          \n"  // V  = 56 * R
+    "mls        v17.8h, v5.8h, v26.8h          \n"  // V -= 47 * G
+    "mls        v17.8h, v4.8h, v25.8h          \n"  // V -= 9 * B
+    "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_rgb565),  // %0
+    "+r"(src_rgb565_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+    "v25", "v26", "v27"
+  );
+}
+#endif  // HAS_RGB565TOUVROW_NEON
+
+// 16x2 pixels -> 8x1.  width is number of argb1555 pixels. e.g. 16.
+#ifdef HAS_ARGB1555TOUVROW_NEON
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;  // second source row.
+  asm volatile (  // Each pass: 16 pixels from each of 2 rows -> 8 U + 8 V bytes.
+    RGBTOUV_SETUP_REG  // v20-v24 = coefficients, v25 = bias.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+    RGB555TOARGB  // macro defined elsewhere; B, G, R are read from v0, v1, v2 below.
+    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels from second row.
+    RGB555TOARGB
+    "uadalp     v16.4h, v0.8b                  \n"  // B: accumulate second-row pairs.
+    "uadalp     v17.4h, v1.8b                  \n"  // G: accumulate second-row pairs.
+    "uadalp     v18.4h, v2.8b                  \n"  // R: accumulate second-row pairs.
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels from second row.
+    RGB555TOARGB
+    "uadalp     v26.4h, v0.8b                  \n"  // B: accumulate second-row pairs.
+    "uadalp     v27.4h, v1.8b                  \n"  // G: accumulate second-row pairs.
+    "uadalp     v28.4h, v2.8b                  \n"  // R: accumulate second-row pairs.
+
+    "ins        v16.D[1], v26.D[0]             \n"  // B: second group of 4 sums -> upper half.
+    "ins        v17.D[1], v27.D[0]             \n"  // G: second group of 4 sums -> upper half.
+    "ins        v18.D[1], v28.D[0]             \n"  // R: second group of 4 sums -> upper half.
+
+    "urshr      v4.8h, v16.8h, #1              \n"  // B: 2x average
+    "urshr      v5.8h, v17.8h, #1              \n"  // G: 2x average
+    "urshr      v6.8h, v18.8h, #1              \n"  // R: 2x average
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    "mul        v2.8h, v4.8h, v20.8h           \n"  // U  = 56 * B
+    "mls        v2.8h, v5.8h, v21.8h           \n"  // U -= 37 * G
+    "mls        v2.8h, v6.8h, v22.8h           \n"  // U -= 19 * R
+    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
+    "mul        v3.8h, v6.8h, v20.8h           \n"  // V  = 56 * R
+    "mls        v3.8h, v5.8h, v24.8h           \n"  // V -= 47 * G
+    "mls        v3.8h, v4.8h, v23.8h           \n"  // V -= 9 * B
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_argb1555),  // %0
+    "+r"(src_argb1555_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+    "v26", "v27", "v28"
+  );
+}
+#endif  // HAS_ARGB1555TOUVROW_NEON
+
+// 16x2 pixels -> 8x1.  width is number of argb4444 pixels. e.g. 16.
+#ifdef HAS_ARGB4444TOUVROW_NEON
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;  // second source row.
+  asm volatile (  // Each pass: 16 pixels from each of 2 rows -> 8 U + 8 V bytes.
+    RGBTOUV_SETUP_REG  // v20-v24 = coefficients, v25 = bias.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+    ARGB4444TOARGB  // macro defined elsewhere; B, G, R are read from v0, v1, v2 below.
+    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels from second row.
+    ARGB4444TOARGB
+    "uadalp     v16.4h, v0.8b                  \n"  // B: accumulate second-row pairs.
+    "uadalp     v17.4h, v1.8b                  \n"  // G: accumulate second-row pairs.
+    "uadalp     v18.4h, v2.8b                  \n"  // R: accumulate second-row pairs.
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels from second row.
+    ARGB4444TOARGB
+    "uadalp     v26.4h, v0.8b                  \n"  // B: accumulate second-row pairs.
+    "uadalp     v27.4h, v1.8b                  \n"  // G: accumulate second-row pairs.
+    "uadalp     v28.4h, v2.8b                  \n"  // R: accumulate second-row pairs.
+
+    "ins        v16.D[1], v26.D[0]             \n"  // B: second group of 4 sums -> upper half.
+    "ins        v17.D[1], v27.D[0]             \n"  // G: second group of 4 sums -> upper half.
+    "ins        v18.D[1], v28.D[0]             \n"  // R: second group of 4 sums -> upper half.
+
+    "urshr      v4.8h, v16.8h, #1              \n"  // B: 2x average
+    "urshr      v5.8h, v17.8h, #1              \n"  // G: 2x average
+    "urshr      v6.8h, v18.8h, #1              \n"  // R: 2x average
+
+    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+    "mul        v2.8h, v4.8h, v20.8h           \n"  // U  = 56 * B
+    "mls        v2.8h, v5.8h, v21.8h           \n"  // U -= 37 * G
+    "mls        v2.8h, v6.8h, v22.8h           \n"  // U -= 19 * R
+    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
+    "mul        v3.8h, v6.8h, v20.8h           \n"  // V  = 56 * R
+    "mls        v3.8h, v5.8h, v24.8h           \n"  // V -= 47 * G
+    "mls        v3.8h, v4.8h, v23.8h           \n"  // V -= 9 * B
+    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
+    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+    MEMACCESS(3)
+    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_argb4444),  // %0
+    "+r"(src_argb4444_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+    "v26", "v27", "v28"
+
+  );
+}
+#endif  // HAS_ARGB4444TOUVROW_NEON
+
+#ifdef HAS_RGB565TOYROW_NEON
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
+  asm volatile (  // Each pass: 8 RGB565 pixels (16 bytes) -> 8 Y bytes.
+    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
+    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
+    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
+    "movi       v27.8b, #16                    \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    RGB565TOARGB  // macro defined elsewhere; B, G, R are read from v0, v1, v2 below.
+    "umull      v3.8h, v0.8b, v24.8b           \n"  // Y  = 13 * B
+    "umlal      v3.8h, v1.8b, v25.8b           \n"  // Y += 65 * G
+    "umlal      v3.8h, v2.8b, v26.8b           \n"  // Y += 33 * R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y (rounded >> 7)
+    "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating.
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_y),       // %1
+    "+r"(width)          // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",  // NOTE(review): v4/v6 presumably RGB565TOARGB scratch - confirm.
+    "v24", "v25", "v26", "v27"
+  );
+}
+#endif  // HAS_RGB565TOYROW_NEON
+
+#ifdef HAS_ARGB1555TOYROW_NEON
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
+  asm volatile (  // Each pass: 8 ARGB1555 pixels (16 bytes) -> 8 Y bytes.
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGB1555TOARGB  // macro defined elsewhere; B, G, R are read from v0, v1, v2 below.
+    "umull      v3.8h, v0.8b, v4.8b            \n"  // Y  = 13 * B
+    "umlal      v3.8h, v1.8b, v5.8b            \n"  // Y += 65 * G
+    "umlal      v3.8h, v2.8b, v6.8b            \n"  // Y += 33 * R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y (rounded >> 7)
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(width)            // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+#endif  // HAS_ARGB1555TOYROW_NEON
+
+#ifdef HAS_ARGB4444TOYROW_NEON
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
+  asm volatile (  // Each pass: 8 ARGB4444 pixels (16 bytes) -> 8 Y bytes.
+    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
+    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
+    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
+    "movi       v27.8b, #16                    \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    ARGB4444TOARGB  // macro defined elsewhere; B, G, R are read from v0, v1, v2 below.
+    "umull      v3.8h, v0.8b, v24.8b           \n"  // Y  = 13 * B
+    "umlal      v3.8h, v1.8b, v25.8b           \n"  // Y += 65 * G
+    "umlal      v3.8h, v2.8b, v26.8b           \n"  // Y += 33 * R
+    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y (rounded >> 7)
+    "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating.
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(width)            // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
+  );
+}
+#endif  // HAS_ARGB4444TOYROW_NEON
+
+#ifdef HAS_BGRATOYROW_NEON
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
+  asm volatile (  // Each pass: 8 pixels (32 bytes) -> 8 Y bytes.
+    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels; v0 (1st byte) is unused.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v1.8b, v4.8b           \n"  // Y  = 33 * R (2nd byte)
+    "umlal      v16.8h, v2.8b, v5.8b           \n"  // Y += 65 * G (3rd byte)
+    "umlal      v16.8h, v3.8b, v6.8b           \n"  // Y += 13 * B (4th byte)
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y (rounded >> 7)
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating.
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // repeat while width > 0 (flags from subs).
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+#endif  // HAS_BGRATOYROW_NEON
+
+#ifdef HAS_ABGRTOYROW_NEON
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
+  asm volatile (  // Y = ((33*R + 65*G + 13*B + 64) >> 7) + 16 per pixel.
+    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels; byte 3 (v3) unused.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v0.8b, v4.8b           \n"  // R
+    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // Y += 16 (saturating).
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+#endif  // HAS_ABGRTOYROW_NEON
+
+#ifdef HAS_RGBATOYROW_NEON
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
+  asm volatile (  // Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16 per pixel.
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels; byte 0 (v0) unused.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v1.8b, v4.8b           \n"  // B
+    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
+    "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // Y += 16 (saturating).
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(width)        // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+#endif  // HAS_RGBATOYROW_NEON
+
+#ifdef HAS_RGB24TOYROW_NEON
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
+  asm volatile (  // Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16 per pixel.
+    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
+    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // Y += 16 (saturating).
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_y),      // %1
+    "+r"(width)         // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+#endif  // HAS_RGB24TOYROW_NEON
+
+#ifdef HAS_RAWTOYROW_NEON
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
+  asm volatile (  // Y = ((33*R + 65*G + 13*B + 64) >> 7) + 16 per pixel.
+    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+    "movi       v7.8b, #16                     \n"  // Add 16 constant
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v0.8b, v4.8b           \n"  // R (byte 0 * 33)
+    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G (byte 1 * 65)
+    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B (byte 2 * 13)
+    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // Y += 16 (saturating).
+    MEMACCESS(1)
+    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(src_raw),  // %0
+    "+r"(dst_y),    // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+#endif  // HAS_RAWTOYROW_NEON
+
+// Bilinear filter 16x2 -> 16x1
+#ifdef HAS_INTERPOLATEROW_NEON
+void InterpolateRow_NEON(uint8* dst_ptr,
+                         const uint8* src_ptr, ptrdiff_t src_stride,
+                         int dst_width, int source_y_fraction) {
+  int y1_fraction = source_y_fraction;  // weight (of 256) of the second row.
+  int y0_fraction = 256 - y1_fraction;  // weight (of 256) of the first row.
+  const uint8* src_ptr1 = src_ptr + src_stride;  // second source row.
+  asm volatile (  // 16 bytes per pass; every path runs at least one pass.
+    "cmp        %w4, #0                        \n"
+    "b.eq       100f                           \n"  // fraction 0: copy row.
+    "cmp        %w4, #128                      \n"
+    "b.eq       50f                            \n"  // fraction 128: average.
+    // Byte weights: y0 = 256 (fraction 0) would not fit, hence the copy path.
+    "dup        v5.16b, %w4                    \n"  // y1 weight in every byte.
+    "dup        v4.16b, %w5                    \n"  // y0 weight in every byte.
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"  // 16 bytes of row 0.
+    MEMACCESS(2)
+    "ld1        {v1.16b}, [%2], #16            \n"  // 16 bytes of row 1.
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
+    "umull      v2.8h, v0.8b,  v4.8b           \n"  // row0 * y0 (low half)
+    "umull2     v3.8h, v0.16b, v4.16b          \n"  // row0 * y0 (high half)
+    "umlal      v2.8h, v1.8b,  v5.8b           \n"  // += row1 * y1 (low)
+    "umlal2     v3.8h, v1.16b, v5.16b          \n"  // += row1 * y1 (high)
+    "rshrn      v0.8b,  v2.8h, #8              \n"  // (sum + 128) >> 8
+    "rshrn2     v0.16b, v3.8h, #8              \n"
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       1b                             \n"  // flags from subs.
+    "b          99f                            \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"
+    MEMACCESS(2)
+    "ld1        {v1.16b}, [%2], #16            \n"
+    "subs       %w3, %w3, #16                  \n"
+    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // (a + b + 1) >> 1
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       50b                            \n"
+    "b          99f                            \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "ld1        {v0.16b}, [%1], #16            \n"
+    "subs       %w3, %w3, #16                  \n"
+    MEMACCESS(0)
+    "st1        {v0.16b}, [%0], #16            \n"
+    "b.gt       100b                           \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_ptr1),         // %2
+    "+r"(dst_width),        // %3
+    "+r"(y1_fraction),      // %4
+    "+r"(y0_fraction)       // %5
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // v2 is written too.
+  );
+}
+#endif  // HAS_INTERPOLATEROW_NEON
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+#ifdef HAS_ARGBBLENDROW_NEON
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  asm volatile (  // dst = argb0 + argb1 - argb1 * a0 / 256; dst alpha = 255.
+    "subs       %w3, %w3, #8                   \n"  // width -= 8 up front.
+    "b.lt       89f                            \n"  // fewer than 8: go to tail.
+    // Blend 8 pixels.
+  "8:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
+    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
+    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
+    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
+    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
+    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
+    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
+    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
+    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
+    "movi       v3.8b, #255                    \n"  // a = 255
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.ge       8b                             \n"  // at least 8 more remain.
+
+  "89:                                         \n"
+    "adds       %w3, %w3, #8-1                 \n"  // width = remaining - 1.
+    "b.lt       99f                            \n"  // nothing left.
+
+    // Blend 1 pixels.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
+    MEMACCESS(1)
+    "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
+    "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
+    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
+    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
+    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
+    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
+    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
+    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
+    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
+    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
+    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
+    "movi       v3.8b, #255                    \n"  // a = 255
+    MEMACCESS(2)
+    "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
+    "b.ge       1b                             \n"  // more single pixels remain.
+
+  "99:                                         \n"
+
+  : "+r"(src_argb0),    // %0
+    "+r"(src_argb1),    // %1
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v16", "v17", "v18"
+  );
+}
+#endif  // HAS_ARGBBLENDROW_NEON
+
+// Attenuate 8 pixels at a time.
+#ifdef HAS_ARGBATTENUATEROW_NEON
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (  // b,g,r = (channel * a + 128) >> 8; alpha is copied as is.
+    // Attenuate 8 pixels.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
+    "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
+    "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
+    "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
+    "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
+    "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+  );
+}
+#endif  // HAS_ARGBATTENUATEROW_NEON
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+#ifdef HAS_ARGBQUANTIZEROW_NEON
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  asm volatile (  // In-place on dst_argb; alpha (v3) is stored back unchanged.
+    "dup        v4.8h, %w2                     \n"  // scale in every 16 bit lane.
+    "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
+    "dup        v5.8h, %w3                     \n"  // interval multiply.
+    "dup        v6.8h, %w4                     \n"  // interval add
+
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
+    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
+    "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
+    "uxtl       v1.8h, v1.8b                   \n"  // g
+    "uxtl       v2.8h, v2.8b                   \n"  // r
+    "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
+    "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
+    "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
+    "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
+    "mul        v1.8h, v1.8h, v5.8h            \n"  // g
+    "mul        v2.8h, v2.8h, v5.8h            \n"  // r
+    "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
+    "add        v1.8h, v1.8h, v6.8h            \n"  // g
+    "add        v2.8h, v2.8h, v6.8h            \n"  // r
+    "uqxtn      v0.8b, v0.8h                   \n"  // narrow to 8 bit, saturating.
+    "uqxtn      v1.8b, v1.8h                   \n"
+    "uqxtn      v2.8b, v2.8h                   \n"
+    MEMACCESS(0)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(dst_argb),       // %0
+    "+r"(width)           // %1
+  : "r"(scale),           // %2
+    "r"(interval_size),   // %3
+    "r"(interval_offset)  // %4
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+  );
+}
+#endif  // HAS_ARGBQUANTIZEROW_NEON
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+#ifdef HAS_ARGBSHADEROW_NEON
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  asm volatile (  // Scales each channel by the matching byte of value.
+    "dup        v0.4s, %w3                     \n"  // duplicate scale value.
+    "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
+    "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
+
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
+    "uxtl       v5.8h, v5.8b                   \n"  // g
+    "uxtl       v6.8h, v6.8b                   \n"  // r
+    "uxtl       v7.8h, v7.8b                   \n"  // a
+    "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
+    "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
+    "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
+    "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
+    "uqxtn      v4.8b, v4.8h                   \n"  // narrow to 8 bit, saturating.
+    "uqxtn      v5.8b, v5.8h                   \n"
+    "uqxtn      v6.8b, v6.8h                   \n"
+    "uqxtn      v7.8b, v7.8h                   \n"
+    MEMACCESS(1)
+    "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(src_argb),       // %0
+    "+r"(dst_argb),       // %1
+    "+r"(width)           // %2
+  : "r"(value)            // %3
+  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
+  );
+}
+#endif  // HAS_ARGBSHADEROW_NEON
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+#ifdef HAS_ARGBGRAYROW_NEON
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (  // Gray value goes to B, G and R; alpha (v3) is copied as is.
+    "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
+    "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
+    "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
+    "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
+    "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
+    "orr        v1.8b, v0.8b, v0.8b            \n"  // G
+    "orr        v2.8b, v0.8b, v0.8b            \n"  // R
+    MEMACCESS(1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
+  );
+}
+#endif  // HAS_ARGBGRAYROW_NEON
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+
+#ifdef HAS_ARGBSEPIAROW_NEON
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+  asm volatile (  // In-place on dst_argb; alpha (v3) is stored back unchanged.
+    "movi       v20.8b, #17                    \n"  // BB coefficient
+    "movi       v21.8b, #68                    \n"  // BG coefficient
+    "movi       v22.8b, #35                    \n"  // BR coefficient
+    "movi       v24.8b, #22                    \n"  // GB coefficient
+    "movi       v25.8b, #88                    \n"  // GG coefficient
+    "movi       v26.8b, #45                    \n"  // GR coefficient
+    "movi       v28.8b, #24                    \n"  // RB coefficient
+    "movi       v29.8b, #98                    \n"  // RG coefficient
+    "movi       v30.8b, #50                    \n"  // RR coefficient
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
+    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
+    "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
+    "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
+    "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
+    "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
+    "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
+    "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
+    "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
+    "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
+    "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
+    "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
+    "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
+    "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
+    MEMACCESS(0)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(dst_argb),  // %0
+    "+r"(width)      // %1
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
+  );
+}
+#endif  // HAS_ARGBSEPIAROW_NEON
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
+// needs to saturate.  Consider doing a non-saturating version.
+#ifdef HAS_ARGBCOLORMATRIXROW_NEON
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width) {
+  asm volatile (  // out = sat8((b*m0 + g*m1 + r*m2 + a*m3) >> 6) per channel.
+    MEMACCESS(3)
+    "ld1        {v2.16b}, [%3]                 \n"  // load 16 int8 coefficients.
+    "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
+    "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
+    "uxtl       v17.8h, v17.8b                 \n"  // g
+    "uxtl       v18.8h, v18.8b                 \n"  // r
+    "uxtl       v19.8h, v19.8b                 \n"  // a
+    "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
+    "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
+    "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
+    "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
+    "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
+    "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
+    "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
+    "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
+    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+    "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
+    "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
+    "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
+    "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
+    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+    "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
+    "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
+    "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
+    "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
+    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+    "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
+    "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
+    "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
+    "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
+    MEMACCESS(1)
+    "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  : "r"(matrix_argb)  // %3
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+    "v18", "v19", "v22", "v23", "v24", "v25"
+  );
+}
+#endif  // HAS_ARGBCOLORMATRIXROW_NEON
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (  // dst = (argb0 * argb1 + 128) >> 8 for all four channels.
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
+    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
+    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
+    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
+    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
+    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
+    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
+    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_NEON
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBADDROW_NEON
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (  // dst = min(argb0 + argb1, 255) for all four channels.
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // B, saturating
+    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // G
+    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // R
+    "uqadd      v3.8b, v3.8b, v7.8b            \n"  // A
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+#endif  // HAS_ARGBADDROW_NEON
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (  // dst = max(argb0 - argb1, 0) for all four channels.
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
+    MEMACCESS(1)
+    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqsub      v0.8b, v0.8b, v4.8b            \n"  // B, saturating
+    "uqsub      v1.8b, v1.8b, v5.8b            \n"  // G
+    "uqsub      v2.8b, v2.8b, v6.8b            \n"  // R
+    "uqsub      v3.8b, v3.8b, v7.8b            \n"  // A
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+  );
+}
+#endif  // HAS_ARGBSUBTRACTROW_NEON
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+#ifdef HAS_SOBELROW_NEON
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (  // B = G = R = min(sobelx + sobely, 255); A = 255.
+    "movi       v3.8b, #255                    \n"  // alpha
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
+    MEMACCESS(1)
+    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
+    "orr        v1.8b, v0.8b, v0.8b            \n"  // copy sum to G.
+    "orr        v2.8b, v0.8b, v0.8b            \n"  // copy sum to R.
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"
+  );
+}
+#endif  // HAS_SOBELROW_NEON
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+#ifdef HAS_SOBELTOPLANEROW_NEON
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  asm volatile (  // dst_y = min(sobelx + sobely, 255), 16 bytes per pass.
+    // 16 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
+    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
+    MEMACCESS(2)
+    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_y),       // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1"
+  );
+}
+#endif  // HAS_SOBELTOPLANEROW_NEON
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+#ifdef HAS_SOBELXYROW_NEON
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (  // Byte 0 = sobely, 1 = saturated sum, 2 = sobelx, 3 = 255.
+    "movi       v3.8b, #255                    \n"  // alpha
+    // 8 pixel loop.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
+    MEMACCESS(1)
+    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
+    MEMACCESS(2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3"
+  );
+}
+#endif  // HAS_SOBELXYROW_NEON
+
+// SobelX as a matrix is
+// -1  0  1
+// -2  0  2
+// -1  0  1
+#ifdef HAS_SOBELXROW_NEON
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (  // Each row: column x minus column x+2; pointers advance 8.
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld1        {v0.8b}, [%0],%5               \n"  // top
+    MEMACCESS(0)
+    "ld1        {v1.8b}, [%0],%6               \n"  // top, 2 bytes further on.
+    "usubl      v0.8h, v0.8b, v1.8b            \n"  // widening difference.
+    MEMACCESS(1)
+    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
+    MEMACCESS(1)
+    "ld1        {v3.8b}, [%1],%6               \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"  // center is added twice.
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    MEMACCESS(2)
+    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
+    MEMACCESS(2)
+    "ld1        {v3.8b}, [%2],%6               \n"
+    "subs       %w4, %w4, #8                   \n"  // 8 pixels
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "abs        v0.8h, v0.8h                   \n"  // magnitude.
+    "uqxtn      v0.8b, v0.8h                   \n"  // narrow, saturating at 255.
+    MEMACCESS(3)
+    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  : "r"(2LL),          // %5
+    "r"(6LL)           // %6
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#endif  // HAS_SOBELXROW_NEON
+
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+#ifdef HAS_SOBELYROW_NEON
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  asm volatile (  // src_y0 minus src_y1 at columns x, x+1 (twice) and x+2.
+  "1:                                          \n"  // 8 pixel loop.
+    MEMACCESS(0)
+    "ld1        {v0.8b}, [%0],%4               \n"  // left
+    MEMACCESS(1)
+    "ld1        {v1.8b}, [%1],%4               \n"
+    "usubl      v0.8h, v0.8b, v1.8b            \n"  // widening difference.
+    MEMACCESS(0)
+    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
+    MEMACCESS(1)
+    "ld1        {v3.8b}, [%1],%4               \n"
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"  // center is added twice.
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    MEMACCESS(0)
+    "ld1        {v2.8b}, [%0],%5               \n"  // right
+    MEMACCESS(1)
+    "ld1        {v3.8b}, [%1],%5               \n"  // 1 + 1 + 6: pointers advance 8.
+    "subs       %w3, %w3, #8                   \n"  // 8 pixels
+    "usubl      v1.8h, v2.8b, v3.8b            \n"
+    "add        v0.8h, v0.8h, v1.8h            \n"
+    "abs        v0.8h, v0.8h                   \n"  // magnitude.
+    "uqxtn      v0.8b, v0.8h                   \n"  // narrow, saturating at 255.
+    MEMACCESS(2)
+    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
+    "b.gt       1b                             \n"  // flags from subs; no tail.
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  : "r"(1LL),          // %4
+    "r"(6LL)           // %5
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#endif  // HAS_SOBELYROW_NEON
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/row_win.cc b/libs/libyuv/source/row_win.cc
new file mode 100644
index 0000000000..a8c16c3c1e
--- /dev/null
+++ b/libs/libyuv/source/row_win.cc
@@ -0,0 +1,6241 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
+    defined(_MSC_VER) && !defined(__clang__)
+#include <emmintrin.h>
+#include <tmmintrin.h>  // For _mm_maddubs_epi16
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C 32/64 bit and clangcl 32 bit
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+
+// 64 bit
+#if defined(_M_X64)
+
+// Read 4 U and 4 V (v = u_buf + offset) from 422, upsample to 8 UV; read 8 Y.
+#define READYUV422                                                             \
+    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
+    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
+    u_buf += 4;                                                                \
+    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
+    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
+    y_buf += 8;
+
+// Read 4 U and 4 V (v = u_buf + offset), upsample to 8 UV; read 8 Y and 8 Alpha (xmm5).
+#define READYUVA422                                                            \
+    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
+    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
+    u_buf += 4;                                                                \
+    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
+    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
+    y_buf += 8;                                                                \
+    xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                                   \
+    a_buf += 8;
+
+// Convert 8 pixels: 8 UV (xmm0) and 8 Y (xmm4) to packed B (xmm0), G (xmm1), R (xmm2) bytes.
+#define YUVTORGB(yuvconstants)                                                 \
+    xmm1 = _mm_loadu_si128(&xmm0);                                             \
+    xmm2 = _mm_loadu_si128(&xmm0);                                             \
+    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB);           \
+    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG);           \
+    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR);           \
+    xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);             \
+    xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);             \
+    xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);             \
+    xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);            \
+    xmm0 = _mm_adds_epi16(xmm0, xmm4);                                         \
+    xmm1 = _mm_adds_epi16(xmm1, xmm4);                                         \
+    xmm2 = _mm_adds_epi16(xmm2, xmm4);                                         \
+    xmm0 = _mm_srai_epi16(xmm0, 6);                                            \
+    xmm1 = _mm_srai_epi16(xmm1, 6);                                            \
+    xmm2 = _mm_srai_epi16(xmm2, 6);                                            \
+    xmm0 = _mm_packus_epi16(xmm0, xmm0);                                       \
+    xmm1 = _mm_packus_epi16(xmm1, xmm1);                                       \
+    xmm2 = _mm_packus_epi16(xmm2, xmm2);
+
+// Store 8 ARGB values: interleave xmm0, xmm1, xmm2, xmm5 bytes; dst_argb += 32.
+#define STOREARGB                                                              \
+    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);                                      \
+    xmm1 = _mm_loadu_si128(&xmm0);                                             \
+    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);                                     \
+    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);                                     \
+    _mm_storeu_si128((__m128i *)dst_argb, xmm0);                               \
+    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);                        \
+    dst_argb += 32;
+
+
+#if defined(HAS_I422TOARGBROW_SSSE3)
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  // Byte distance from the U row to the V row; READYUV422 adds it to u_buf.
+  const ptrdiff_t offset = v_buf - u_buf;
+  const __m128i xmm5 = _mm_set1_epi8(-1);  // All ones: alpha bytes for STOREARGB.
+  __m128i xmm0, xmm1, xmm2, xmm4;  // Working registers referenced by the macros.
+  for (; width > 0; width -= 8) {  // Each pass converts and stores 8 pixels.
+    READYUV422
+    YUVTORGB(yuvconstants)
+    STOREARGB
+  }
+}
+#endif
+
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              const uint8* a_buf,
+                              uint8* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width) {
+  // Byte distance from the U row to the V row; READYUVA422 adds it to u_buf.
+  const ptrdiff_t offset = v_buf - u_buf;
+  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;  // xmm5 receives alpha from a_buf.
+  for (; width > 0; width -= 8) {  // Each pass converts and stores 8 pixels.
+    READYUVA422
+    YUVTORGB(yuvconstants)
+    STOREARGB
+  }
+}
+#endif
+
+// 32 bit
+#else  // defined(_M_X64)
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Constants for ARGB.  Each table repeats one 4 byte group (one weight per pixel byte) 4 times.
+static const vec8 kARGBToY = {  // Y weights; 4th byte of each pixel has weight 0.
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+// JPeg full range.
+static const vec8 kARGBToYJ = {  // Y weights for the JPeg (J) variant.
+  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+
+static const vec8 kARGBToU = {  // U weights (signed).
+  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static const vec8 kARGBToUJ = {  // U weights for the JPeg (J) variant.
+  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+static const vec8 kARGBToV = {  // V weights (signed).
+  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static const vec8 kARGBToVJ = {  // V weights for the JPeg (J) variant.
+  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// vpshufb for vphaddw + vpackuswb packed to shorts.  Same 16 byte pattern in both lanes.
+static const lvec8 kShufARGBToUV_AVX = {
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+
+// Constants for BGRA.  Same weights as the ARGB tables with the 4 byte group reversed.
+static const vec8 kBGRAToY = {
+  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+static const vec8 kBGRAToU = {
+  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+static const vec8 kBGRAToV = {
+  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR.  Same weights as ARGB with the first and third bytes swapped.
+static const vec8 kABGRToY = {
+  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+static const vec8 kABGRToU = {
+  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+static const vec8 kABGRToV = {
+  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA.  Same weights as ARGB shifted by one byte; zero weight comes first.
+static const vec8 kRGBAToY = {
+  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static const vec8 kRGBAToU = {
+  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static const vec8 kRGBAToV = {
+  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static const uvec8 kAddY16 = {  // 16 in every byte; presumably the Y offset (users not in view).
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+// 7 bit fixed point 0.5.  (64 = 0.5 * 128, stored in each 16 bit lane.)
+static const vec16 kAddYJ64 = {
+  64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static const uvec8 kAddUV128 = {  // 128 in every byte; presumably the UV bias (users not in view).
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+static const uvec16 kAddUVJ128 = {  // 0x8080 in every 16 bit lane.
+  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+
+// Shuffle table for converting RGB24 to ARGB.  Indices 12..15 fill the alpha slots (ORed to 0xff later).
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.  As above, with each 3 byte group reversed.
+static const uvec8 kShuffleMaskRAWToARGB = {
+  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting RAW to RGB24.  First 8.  (Applied to a load at src + 0.)
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting RAW to RGB24.  Middle 8.  (Applied to a load at src + 4.)
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting RAW to RGB24.  Last 8.  (Applied to a load at src + 8.)
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RGB24.  Drops every 4th byte; 128u zeroes the last 4 bytes.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.  As above, with each 3 byte group reversed.
+static const uvec8 kShuffleMaskARGBToRAW = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// YUY2 shuf 16 Y to 32 Y.  Selects the even bytes, each one twice.
+static const lvec8 kShuffleYUY2Y = {
+  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
+  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
+};
+
+// YUY2 shuf 8 UV to 16 UV.  Selects the odd bytes as pairs, each pair twice.
+static const lvec8 kShuffleYUY2UV = {
+  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
+  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
+};
+
+// UYVY shuf 16 Y to 32 Y.  Selects the odd bytes, each one twice.
+static const lvec8 kShuffleUYVYY = {
+  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
+  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
+};
+
+// UYVY shuf 8 UV to 16 UV.  Selects the even bytes as pairs, each pair twice.
+static const lvec8 kShuffleUYVYUV = {
+  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
+  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
+};
+
+// NV21 shuf 8 VU to 16 UV.  Swaps the two bytes of each pair and emits each pair twice.
+static const lvec8 kShuffleNV21 = {
+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+};
+
+// Duplicates gray value 3 times and fills in alpha opaque.  8 pixels per loop pass.
+__declspec(naked)
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
+  __asm {  // naked: arguments are read straight from the stack
+    mov        eax, [esp + 4]        // src_y
+    mov        edx, [esp + 8]        // dst_argb
+    mov        ecx, [esp + 12]       // width
+    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
+    pslld      xmm5, 24
+
+  convertloop:
+    movq       xmm0, qword ptr [eax]  // load 8 gray bytes
+    lea        eax,  [eax + 8]
+    punpcklbw  xmm0, xmm0             // each gray byte doubled
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm0             // gray x 4 for pixels 0..3
+    punpckhwd  xmm1, xmm1             // gray x 4 for pixels 4..7
+    por        xmm0, xmm5             // overwrite top byte of each pixel with 0xff
+    por        xmm1, xmm5
+    movdqu     [edx], xmm0            // store 4 pixels of ARGB
+    movdqu     [edx + 16], xmm1       // store next 4 pixels of ARGB
+    lea        edx, [edx + 32]
+    sub        ecx, 8                 // 8 pixels done
+    jg         convertloop            // loop while width > 0 (body runs at least once)
+    ret
+  }
+}
+
+#ifdef HAS_J400TOARGBROW_AVX2
+// Duplicates gray value 3 times and fills in alpha opaque.  16 pixels per loop pass.
+__declspec(naked)
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
+  __asm {  // naked: arguments are read straight from the stack
+    mov         eax, [esp + 4]        // src_y
+    mov         edx, [esp + 8]        // dst_argb
+    mov         ecx, [esp + 12]       // width
+    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
+    vpslld      ymm5, ymm5, 24
+
+  convertloop:
+    vmovdqu     xmm0, [eax]           // load 16 gray bytes
+    lea         eax,  [eax + 16]
+    vpermq      ymm0, ymm0, 0xd8      // reorder quadwords 0,2,1,3 ahead of the in-lane unpack
+    vpunpcklbw  ymm0, ymm0, ymm0      // each gray byte doubled
+    vpermq      ymm0, ymm0, 0xd8      // reorder again ahead of the word unpacks
+    vpunpckhwd  ymm1, ymm0, ymm0      // gray x 4 for pixels 8..15
+    vpunpcklwd  ymm0, ymm0, ymm0      // gray x 4 for pixels 0..7
+    vpor        ymm0, ymm0, ymm5      // overwrite top byte of each pixel with 0xff
+    vpor        ymm1, ymm1, ymm5
+    vmovdqu     [edx], ymm0           // store 8 pixels of ARGB
+    vmovdqu     [edx + 32], ymm1      // store next 8 pixels of ARGB
+    lea         edx, [edx + 64]
+    sub         ecx, 16               // 16 pixels done
+    jg          convertloop           // loop while width > 0 (body runs at least once)
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_J400TOARGBROW_AVX2
+
+__declspec(naked)  // Expands 16 RGB24 pixels (48 bytes) per loop pass to ARGB (64 bytes), alpha 0xff.
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src_rgb24
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // width
+    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pslld     xmm5, 24
+    movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
+
+ convertloop:
+    movdqu    xmm0, [eax]      // source bytes 0..15
+    movdqu    xmm1, [eax + 16] // source bytes 16..31
+    movdqu    xmm3, [eax + 32] // source bytes 32..47
+    lea       eax, [eax + 48]
+    movdqa    xmm2, xmm3
+    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    pshufb    xmm2, xmm4       // spread 12 bytes into 4 pixels
+    por       xmm2, xmm5       // set alpha to 0xff
+    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
+    pshufb    xmm0, xmm4
+    movdqu    [edx + 32], xmm2 // pixels 8..11
+    por       xmm0, xmm5
+    pshufb    xmm1, xmm4
+    movdqu    [edx], xmm0      // pixels 0..3
+    por       xmm1, xmm5
+    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    pshufb    xmm3, xmm4
+    movdqu    [edx + 16], xmm1 // pixels 4..7
+    por       xmm3, xmm5
+    movdqu    [edx + 48], xmm3 // pixels 12..15
+    lea       edx, [edx + 64]
+    sub       ecx, 16          // 16 pixels done
+    jg        convertloop      // loop while width > 0 (body runs at least once)
+    ret
+  }
+}
+
+__declspec(naked)  // Expands 16 RAW pixels (48 bytes) per loop pass to ARGB (64 bytes), alpha 0xff.
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
+                        int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src_raw
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // width
+    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pslld     xmm5, 24
+    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB
+
+ convertloop:
+    movdqu    xmm0, [eax]      // source bytes 0..15
+    movdqu    xmm1, [eax + 16] // source bytes 16..31
+    movdqu    xmm3, [eax + 32] // source bytes 32..47
+    lea       eax, [eax + 48]
+    movdqa    xmm2, xmm3
+    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    pshufb    xmm2, xmm4       // reverse each 3 byte group and spread into 4 pixels
+    por       xmm2, xmm5       // set alpha to 0xff
+    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
+    pshufb    xmm0, xmm4
+    movdqu    [edx + 32], xmm2 // pixels 8..11
+    por       xmm0, xmm5
+    pshufb    xmm1, xmm4
+    movdqu    [edx], xmm0      // pixels 0..3
+    por       xmm1, xmm5
+    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    pshufb    xmm3, xmm4
+    movdqu    [edx + 16], xmm1 // pixels 4..7
+    por       xmm3, xmm5
+    movdqu    [edx + 48], xmm3 // pixels 12..15
+    lea       edx, [edx + 64]
+    sub       ecx, 16          // 16 pixels done
+    jg        convertloop      // loop while width > 0 (body runs at least once)
+    ret
+  }
+}
+
+__declspec(naked)  // Reverses the byte order of each 3 byte pixel; 8 pixels (24 bytes) per loop pass.
+void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src_raw
+    mov       edx, [esp + 8]   // dst_rgb24
+    mov       ecx, [esp + 12]  // width
+    movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
+    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
+    movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
+
+ convertloop:
+    movdqu    xmm0, [eax]      // three overlapping 16 byte loads that together
+    movdqu    xmm1, [eax + 4]  // cover source bytes 0..23
+    movdqu    xmm2, [eax + 8]
+    lea       eax, [eax + 24]
+    pshufb    xmm0, xmm3       // output bytes 0..7 in the low half
+    pshufb    xmm1, xmm4       // output bytes 8..15 in the low half
+    pshufb    xmm2, xmm5       // output bytes 16..23 in the low half
+    movq      qword ptr [edx], xmm0
+    movq      qword ptr [edx + 8], xmm1
+    movq      qword ptr [edx + 16], xmm2
+    lea       edx, [edx + 24]
+    sub       ecx, 8           // 8 pixels done
+    jg        convertloop      // loop while width > 0 (body runs at least once)
+    ret
+  }
+}
+
+// pmul method to replicate bits.  Converts 8 RGB565 pixels per loop pass to ARGB.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+// 20 instructions.
+__declspec(naked)
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+                          int width) {
+  __asm {
+    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
+    movd      xmm5, eax
+    pshufd    xmm5, xmm5, 0
+    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
+    movd      xmm6, eax
+    pshufd    xmm6, xmm6, 0
+    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    psllw     xmm3, 11
+    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
+    psllw     xmm4, 10
+    psrlw     xmm4, 5
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    psllw     xmm7, 8
+
+    mov       eax, [esp + 4]   // src_rgb565
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // width
+    sub       edx, eax         // edx = dst - 2 * src, so that
+    sub       edx, eax         // [eax * 2 + edx] addresses the destination
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
+    movdqa    xmm1, xmm0
+    movdqa    xmm2, xmm0
+    pand      xmm1, xmm3    // R in upper 5 bits
+    psllw     xmm2, 11      // B in upper 5 bits
+    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    psllw     xmm1, 8       // R to the high byte of each word
+    por       xmm1, xmm2    // RB
+    pand      xmm0, xmm4    // G in middle 6 bits
+    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
+    por       xmm0, xmm7    // AG
+    movdqa    xmm2, xmm1
+    punpcklbw xmm1, xmm0    // interleave RB with AG bytes: pixels 0..3
+    punpckhbw xmm2, xmm0    // pixels 4..7
+    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]
+    sub       ecx, 8        // 8 pixels done
+    jg        convertloop   // loop while width > 0 (body runs at least once)
+    ret
+  }
+}
+
+#ifdef HAS_RGB565TOARGBROW_AVX2
+// pmul method to replicate bits.  Converts 16 RGB565 pixels per loop pass to ARGB.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+__declspec(naked)
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+                          int width) {
+  __asm {
+    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
+    vmovd      xmm5, eax
+    vbroadcastss ymm5, xmm5
+    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
+    vmovd      xmm6, eax
+    vbroadcastss ymm6, xmm6
+    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
+    vpsllw     ymm3, ymm3, 11
+    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
+    vpsllw     ymm4, ymm4, 10
+    vpsrlw     ymm4, ymm4, 5
+    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
+    vpsllw     ymm7, ymm7, 8
+
+    mov        eax, [esp + 4]   // src_rgb565
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax         // edx = dst - 2 * src, so that
+    sub        edx, eax         // [eax * 2 + edx] addresses the destination
+
+ convertloop:
+    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
+    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
+    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
+    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
+    vpsllw     ymm1, ymm1, 8
+    vpor       ymm1, ymm1, ymm2    // RB
+    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
+    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
+    vpor       ymm0, ymm0, ymm7    // AG
+    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpckhbw ymm2, ymm1, ymm0
+    vpunpcklbw ymm1, ymm1, ymm0
+    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
+    lea       eax, [eax + 32]
+    sub       ecx, 16          // 16 pixels done
+    jg        convertloop      // loop while width > 0 (body runs at least once)
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_RGB565TOARGBROW_AVX2
+
+#ifdef HAS_ARGB1555TOARGBROW_AVX2
+__declspec(naked)  // Converts 16 ARGB1555 pixels (32 bytes) per loop pass to ARGB (64 bytes).
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                            int width) {
+  __asm {
+    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
+    vmovd      xmm5, eax
+    vbroadcastss ymm5, xmm5
+    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
+    vmovd      xmm6, eax
+    vbroadcastss ymm6, xmm6
+    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+    vpsllw     ymm3, ymm3, 11
+    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
+    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+    vpsllw     ymm7, ymm7, 8
+
+    mov        eax,  [esp + 4]   // src_argb1555
+    mov        edx,  [esp + 8]   // dst_argb
+    mov        ecx,  [esp + 12]  // width
+    sub        edx,  eax         // edx = dst - 2 * src, so that
+    sub        edx,  eax         // [eax * 2 + edx] addresses the destination
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
+    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
+    vpand      ymm1, ymm1, ymm3
+    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
+    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
+    vpsllw     ymm1, ymm1, 8
+    vpor       ymm1, ymm1, ymm2    // RB
+    vpsraw     ymm2, ymm0, 8       // A
+    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
+    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
+    vpand      ymm2, ymm2, ymm7    // keep the sign-filled high byte: 0xff or 0x00
+    vpor       ymm0, ymm0, ymm2    // AG
+    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpckhbw ymm2, ymm1, ymm0
+    vpunpcklbw ymm1, ymm1, ymm0
+    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
+    lea       eax, [eax + 32]
+    sub       ecx, 16          // 16 pixels done
+    jg        convertloop      // loop while width > 0 (body runs at least once)
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGB1555TOARGBROW_AVX2
+
+#ifdef HAS_ARGB4444TOARGBROW_AVX2
+__declspec(naked)  // Converts 16 ARGB4444 pixels (32 bytes) per loop pass to ARGB (64 bytes).
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                            int width) {
+  __asm {
+    mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
+    vmovd     xmm4, eax
+    vbroadcastss ymm4, xmm4
+    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
+    mov       eax,  [esp + 4]   // src_argb4444
+    mov       edx,  [esp + 8]   // dst_argb
+    mov       ecx,  [esp + 12]  // width
+    sub       edx,  eax         // edx = dst - 2 * src, so that
+    sub       edx,  eax         // [eax * 2 + edx] addresses the destination
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
+    vpand      ymm2, ymm0, ymm5    // mask high nibbles
+    vpand      ymm0, ymm0, ymm4    // mask low nibbles
+    vpsrlw     ymm3, ymm2, 4       // high nibble copied down
+    vpsllw     ymm1, ymm0, 4       // low nibble copied up
+    vpor       ymm2, ymm2, ymm3    // high nibble replicated into a full byte
+    vpor       ymm0, ymm0, ymm1    // low nibble replicated into a full byte
+    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm2, ymm2, 0xd8
+    vpunpckhbw ymm1, ymm0, ymm2
+    vpunpcklbw ymm0, ymm0, ymm2
+    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
+    lea       eax, [eax + 32]
+    sub       ecx, 16          // 16 pixels done
+    jg        convertloop      // loop while width > 0 (body runs at least once)
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGB4444TOARGBROW_AVX2
+
+// 24 instructions.  Converts 8 ARGB1555 pixels (16 bytes) per loop pass to ARGB (32 bytes).
+__declspec(naked)
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                            int width) {
+  __asm {
+    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
+    movd      xmm5, eax
+    pshufd    xmm5, xmm5, 0
+    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
+    movd      xmm6, eax
+    pshufd    xmm6, xmm6, 0
+    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    psllw     xmm3, 11
+    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
+    psrlw     xmm4, 6
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    psllw     xmm7, 8
+
+    mov       eax, [esp + 4]   // src_argb1555
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // width
+    sub       edx, eax         // edx = dst - 2 * src, so that
+    sub       edx, eax         // [eax * 2 + edx] addresses the destination
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
+    movdqa    xmm1, xmm0
+    movdqa    xmm2, xmm0
+    psllw     xmm1, 1       // R in upper 5 bits
+    psllw     xmm2, 11      // B in upper 5 bits
+    pand      xmm1, xmm3
+    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    psllw     xmm1, 8
+    por       xmm1, xmm2    // RB
+    movdqa    xmm2, xmm0
+    pand      xmm0, xmm4    // G in middle 5 bits
+    psraw     xmm2, 8       // A
+    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
+    pand      xmm2, xmm7    // keep the sign-filled high byte: 0xff or 0x00
+    por       xmm0, xmm2    // AG
+    movdqa    xmm2, xmm1
+    punpcklbw xmm1, xmm0    // interleave RB with AG bytes: pixels 0..3
+    punpckhbw xmm2, xmm0    // pixels 4..7
+    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]
+    sub       ecx, 8        // 8 pixels done
+    jg        convertloop   // loop while width > 0 (body runs at least once)
+    ret
+  }
+}
+
+// 18 instructions.  Converts 8 ARGB4444 pixels (16 bytes) per loop pass to ARGB (32 bytes).
+__declspec(naked)
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+                            int width) {
+  __asm {
+    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
+    movd      xmm4, eax
+    pshufd    xmm4, xmm4, 0
+    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
+    pslld     xmm5, 4
+    mov       eax, [esp + 4]   // src_argb4444
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // width
+    sub       edx, eax         // edx = dst - 2 * src, so that
+    sub       edx, eax         // [eax * 2 + edx] addresses the destination
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
+    movdqa    xmm2, xmm0
+    pand      xmm0, xmm4    // mask low nibbles
+    pand      xmm2, xmm5    // mask high nibbles
+    movdqa    xmm1, xmm0
+    movdqa    xmm3, xmm2
+    psllw     xmm1, 4       // low nibble copied up
+    psrlw     xmm3, 4       // high nibble copied down
+    por       xmm0, xmm1    // low nibble replicated into a full byte
+    por       xmm2, xmm3    // high nibble replicated into a full byte
+    movdqa    xmm1, xmm0
+    punpcklbw xmm0, xmm2    // interleave the two byte streams: pixels 0..3
+    punpckhbw xmm1, xmm2    // pixels 4..7
+    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
+    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]
+    sub       ecx, 8        // 8 pixels done
+    jg        convertloop   // loop while width > 0 (body runs at least once)
+    ret
+  }
+}
+
+__declspec(naked)  // Packs 16 ARGB pixels (64 bytes) per loop pass into RGB24 (48 bytes).
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // width
+    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm1, [eax + 16]
+    movdqu    xmm2, [eax + 32]
+    movdqu    xmm3, [eax + 48]
+    lea       eax, [eax + 64]
+    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm1, xmm6
+    pshufb    xmm2, xmm6
+    pshufb    xmm3, xmm6
+    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
+    psrldq    xmm1, 4      // 8 bytes from 1
+    pslldq    xmm4, 12     // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
+    por       xmm0, xmm4   // 4 bytes from 1 for 0
+    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqu    [edx], xmm0  // store 0
+    por       xmm1, xmm5   // 8 bytes from 2 for 1
+    psrldq    xmm2, 8      // 4 bytes from 2
+    pslldq    xmm3, 4      // 12 bytes from 3 for 2
+    por       xmm2, xmm3   // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1   // store 1
+    movdqu    [edx + 32], xmm2   // store 2
+    lea       edx, [edx + 48]
+    sub       ecx, 16      // 16 pixels done
+    jg        convertloop  // loop while width > 0 (body runs at least once)
+    ret
+  }
+}
+
+__declspec(naked)  // Packs 16 ARGB pixels (64 bytes) per loop pass into RAW (48 bytes).
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // width
+    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm1, [eax + 16]
+    movdqu    xmm2, [eax + 32]
+    movdqu    xmm3, [eax + 48]
+    lea       eax, [eax + 64]
+    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm1, xmm6
+    pshufb    xmm2, xmm6
+    pshufb    xmm3, xmm6
+    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
+    psrldq    xmm1, 4      // 8 bytes from 1
+    pslldq    xmm4, 12     // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
+    por       xmm0, xmm4   // 4 bytes from 1 for 0
+    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqu    [edx], xmm0  // store 0
+    por       xmm1, xmm5   // 8 bytes from 2 for 1
+    psrldq    xmm2, 8      // 4 bytes from 2
+    pslldq    xmm3, 4      // 12 bytes from 3 for 2
+    por       xmm2, xmm3   // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1   // store 1
+    movdqu    [edx + 32], xmm2   // store 2
+    lea       edx, [edx + 48]
+    sub       ecx, 16      // 16 pixels done
+    jg        convertloop  // loop while width > 0 (body runs at least once)
+    ret
+  }
+}
+
+__declspec(naked)  // Packs 4 ARGB pixels (16 bytes) per loop pass into RGB565 (8 bytes).
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // width
+    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    psrld     xmm3, 27
+    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    psrld     xmm4, 26
+    pslld     xmm4, 5
+    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pslld     xmm5, 11
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0    // B
+    movdqa    xmm2, xmm0    // G
+    pslld     xmm0, 8       // R
+    psrld     xmm1, 3       // B
+    psrld     xmm2, 5       // G
+    psrad     xmm0, 16      // R
+    pand      xmm1, xmm3    // B
+    pand      xmm2, xmm4    // G
+    pand      xmm0, xmm5    // R
+    por       xmm1, xmm2    // BG
+    por       xmm0, xmm1    // BGR
+    packssdw  xmm0, xmm0    // signed pack of each dword to a 16 bit pixel
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
+    lea       edx, [edx + 8]
+    sub       ecx, 4        // 4 pixels done
+    jg        convertloop   // loop while width > 0 (body runs at least once)
+    ret
+  }
+}
+
+__declspec(naked)  // Adds a dither byte to each ARGB pixel (saturating), then packs 4 pixels to RGB565.
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width) {
+  __asm {
+
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    movd      xmm6, [esp + 12] // dither4
+    mov       ecx, [esp + 16]  // width
+    punpcklbw xmm6, xmm6       // make dither 16 bytes
+    movdqa    xmm7, xmm6       // NOTE(review): xmm7 is never read by the loop below
+    punpcklwd xmm6, xmm6       // each dither byte repeated 4 times: one byte value per pixel
+    punpckhwd xmm7, xmm7
+    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    psrld     xmm3, 27
+    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    psrld     xmm4, 26
+    pslld     xmm4, 5
+    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pslld     xmm5, 11
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    paddusb   xmm0, xmm6    // add dither
+    movdqa    xmm1, xmm0    // B
+    movdqa    xmm2, xmm0    // G
+    pslld     xmm0, 8       // R
+    psrld     xmm1, 3       // B
+    psrld     xmm2, 5       // G
+    psrad     xmm0, 16      // R
+    pand      xmm1, xmm3    // B
+    pand      xmm2, xmm4    // G
+    pand      xmm0, xmm5    // R
+    por       xmm1, xmm2    // BG
+    por       xmm0, xmm1    // BGR
+    packssdw  xmm0, xmm0    // signed pack of each dword to a 16 bit pixel
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
+    lea       edx, [edx + 8]
+    sub       ecx, 4        // 4 pixels done
+    jg        convertloop   // loop while width > 0 (body runs at least once)
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+__declspec(naked)  // Adds a dither byte to each ARGB pixel (saturating), then packs 8 pixels to RGB565.
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    vbroadcastss xmm6, [esp + 12]  // dither4
+    mov        ecx, [esp + 16]     // width
+    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
+    vpermq     ymm6, ymm6, 0xd8
+    vpunpcklwd ymm6, ymm6, ymm6
+    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    vpsrld     ymm3, ymm3, 27
+    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpsrld     ymm4, ymm4, 26
+    vpslld     ymm4, ymm4, 5
+    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpaddusb   ymm0, ymm0, ymm6    // add dither
+    vpsrld     ymm2, ymm0, 5       // G
+    vpsrld     ymm1, ymm0, 3       // B
+    vpsrld     ymm0, ymm0, 8       // R
+    vpand      ymm2, ymm2, ymm4    // G
+    vpand      ymm1, ymm1, ymm3    // B
+    vpand      ymm0, ymm0, ymm5    // R
+    vpor       ymm1, ymm1, ymm2    // BG
+    vpor       ymm0, ymm0, ymm1    // BGR
+    vpackusdw  ymm0, ymm0, ymm0    // unsigned pack of each dword to a 16 bit pixel
+    vpermq     ymm0, ymm0, 0xd8    // gather both lanes' results into the low 128 bits
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    lea        edx, [edx + 16]
+    sub        ecx, 8              // 8 pixels done
+    jg         convertloop         // loop while width > 0 (body runs at least once)
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
+
+// TODO(fbarchard): Improve sign extension/packing.
+__declspec(naked)  // ARGB -> ARGB1555 (1 alpha bit, 5 bits per color), 4 px/loop.
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // width
+    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
+    psrld     xmm4, 27
+    movdqa    xmm5, xmm4       // generate mask 0x000003e0
+    pslld     xmm5, 5
+    movdqa    xmm6, xmm4       // generate mask 0x00007c00
+    pslld     xmm6, 10
+    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
+    pslld     xmm7, 15
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0    // B
+    movdqa    xmm2, xmm0    // G
+    movdqa    xmm3, xmm0    // R
+    psrad     xmm0, 16      // A: top alpha bit lands on bit 15, sign-filled above
+    psrld     xmm1, 3       // B
+    psrld     xmm2, 6       // G
+    psrld     xmm3, 9       // R
+    pand      xmm0, xmm7    // A
+    pand      xmm1, xmm4    // B
+    pand      xmm2, xmm5    // G
+    pand      xmm3, xmm6    // R
+    por       xmm0, xmm1    // BA
+    por       xmm2, xmm3    // GR
+    por       xmm0, xmm2    // BGRA
+    packssdw  xmm0, xmm0    // signed pack; sign-filled dwords pack unchanged
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    jg        convertloop
+    ret
+  }
+}
+
+__declspec(naked)  // ARGB -> ARGB4444 (top 4 bits of each channel), 4 px/loop.
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // width
+    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
+    psllw     xmm4, 12
+    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
+    psrlw     xmm3, 8
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0
+    pand      xmm0, xmm3    // low nibble (top 4 bits of bytes 0 and 2)
+    pand      xmm1, xmm4    // high nibble (top 4 bits of bytes 1 and 3)
+    psrld     xmm0, 4
+    psrld     xmm1, 8
+    por       xmm0, xmm1    // each 16-bit word now holds one output byte
+    packuswb  xmm0, xmm0    // words -> bytes; values fit, so no saturation
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    jg        convertloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTORGB565ROW_AVX2
+__declspec(naked)  // ARGB -> RGB565 (5/6/5 bits, alpha dropped), 8 px per loop.
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    mov        ecx, [esp + 12]     // width
+    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    vpsrld     ymm3, ymm3, 27
+    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpsrld     ymm4, ymm4, 26
+    vpslld     ymm4, ymm4, 5
+    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpsrld     ymm2, ymm0, 5       // G
+    vpsrld     ymm1, ymm0, 3       // B
+    vpsrld     ymm0, ymm0, 8       // R
+    vpand      ymm2, ymm2, ymm4    // G
+    vpand      ymm1, ymm1, ymm3    // B
+    vpand      ymm0, ymm0, ymm5    // R
+    vpor       ymm1, ymm1, ymm2    // BG
+    vpor       ymm0, ymm0, ymm1    // BGR
+    vpackusdw  ymm0, ymm0, ymm0    // dwords -> words within each 128-bit lane
+    vpermq     ymm0, ymm0, 0xd8    // gather both lanes' results into xmm0
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper                     // zero the upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_ARGBTORGB565ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB1555ROW_AVX2
+__declspec(naked)  // ARGB -> ARGB1555 (1 alpha bit, 5 bits per color), 8 px/loop.
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    mov        ecx, [esp + 12]     // width
+    vpcmpeqb   ymm4, ymm4, ymm4
+    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
+    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
+    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
+    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
+    vpslld     ymm7, ymm7, 15
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpsrld     ymm3, ymm0, 9       // R
+    vpsrld     ymm2, ymm0, 6       // G
+    vpsrld     ymm1, ymm0, 3       // B
+    vpsrad     ymm0, ymm0, 16      // A: arithmetic shift sign-fills bits 16..31
+    vpand      ymm3, ymm3, ymm6    // R
+    vpand      ymm2, ymm2, ymm5    // G
+    vpand      ymm1, ymm1, ymm4    // B
+    vpand      ymm0, ymm0, ymm7    // A
+    vpor       ymm0, ymm0, ymm1    // BA
+    vpor       ymm2, ymm2, ymm3    // GR
+    vpor       ymm0, ymm0, ymm2    // BGRA
+    vpackssdw  ymm0, ymm0, ymm0    // signed pack; sign-filled dwords unchanged
+    vpermq     ymm0, ymm0, 0xd8    // gather both lanes' results into xmm0
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper                     // zero the upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_ARGBTOARGB1555ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB4444ROW_AVX2
+__declspec(naked)  // ARGB -> ARGB4444 (top 4 bits of each channel), 8 px/loop.
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_rgb
+    mov        ecx, [esp + 12]  // width
+    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
+    vpsllw     ymm4, ymm4, 12
+    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpand      ymm1, ymm0, ymm4    // high nibble
+    vpand      ymm0, ymm0, ymm3    // low nibble
+    vpsrld     ymm1, ymm1, 8
+    vpsrld     ymm0, ymm0, 4
+    vpor       ymm0, ymm0, ymm1    // each 16-bit word now holds one output byte
+    vpackuswb  ymm0, ymm0, ymm0    // words -> bytes within each 128-bit lane
+    vpermq     ymm0, ymm0, 0xd8    // gather both lanes' results into xmm0
+    lea        eax, [eax + 32]
+    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    vzeroupper                     // zero the upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_ARGBTOARGB4444ROW_AVX2
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+__declspec(naked)  // Per pixel: (kARGBToY-weighted byte sum >> 7) + kAddY16.
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* width */
+    movdqa     xmm4, xmmword ptr kARGBToY  // aligned load of coefficients
+    movdqa     xmm5, xmmword ptr kAddY16   // aligned load of byte offset
+
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4       // unsigned pixel bytes x signed coefficients
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1       // add word pairs: one 16-bit sum per pixel
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2       // 16 words -> 16 bytes, unsigned saturation
+    paddb      xmm0, xmm5       // add kAddY16 bytes (wrapping byte add)
+    movdqu     [edx], xmm0      // store 16 Y bytes
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+    ret
+  }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+__declspec(naked)  // Per pixel: (kARGBToYJ-weighted sum + kAddYJ64 word) >> 7.
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* width */
+    movdqa     xmm4, xmmword ptr kARGBToYJ  // aligned load of coefficients
+    movdqa     xmm5, xmmword ptr kAddYJ64   // aligned load of rounding words
+
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4       // unsigned pixel bytes x signed coefficients
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1       // add word pairs: one 16-bit sum per pixel
+    phaddw     xmm2, xmm3
+    paddw      xmm0, xmm5  // Add .5 for rounding.
+    paddw      xmm2, xmm5
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2       // 16 words -> 16 bytes, unsigned saturation
+    movdqu     [edx], xmm0      // store 16 YJ bytes
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd dword indices that undo the lane interleave of vphaddw + vpackuswb.
+static const lvec32 kPermdARGBToY_AVX = {
+  0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked)  // Per pixel: (kARGBToY-weighted byte sum >> 7) + kAddY16.
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* width */
+    vbroadcastf128 ymm4, xmmword ptr kARGBToY  // same 16 bytes in both lanes
+    vbroadcastf128 ymm5, xmmword ptr kAddY16
+    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
+
+ convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4  // unsigned pixel bytes x signed coefficients
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1  // mutates.
+    vphaddw    ymm2, ymm2, ymm3
+    vpsrlw     ymm0, ymm0, 7
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2  // mutates.
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
+    vmovdqu    [edx], ymm0       // store 32 Y bytes
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+    vzeroupper                   // zero the upper ymm halves before returning
+    ret
+  }
+}
+#endif  //  HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
+__declspec(naked)  // Per pixel: (kARGBToYJ-weighted sum + kAddYJ64 word) >> 7.
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* width */
+    vbroadcastf128 ymm4, xmmword ptr kARGBToYJ  // same 16 bytes in both lanes
+    vbroadcastf128 ymm5, xmmword ptr kAddYJ64
+    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX  // dword order fix-up table
+
+ convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4  // unsigned pixel bytes x signed coefficients
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1  // mutates.
+    vphaddw    ymm2, ymm2, ymm3
+    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
+    vpaddw     ymm2, ymm2, ymm5
+    vpsrlw     ymm0, ymm0, 7
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2  // mutates.
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    vmovdqu    [edx], ymm0       // store 32 YJ bytes
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+
+    vzeroupper                   // zero the upper ymm halves before returning
+    ret
+  }
+}
+#endif  //  HAS_ARGBTOYJROW_AVX2
+
+__declspec(naked)  // 16 BGRA pixels (64 bytes) -> 16 Y bytes per loop (kBGRAToY).
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* width */
+    movdqa     xmm4, xmmword ptr kBGRAToY  // aligned load of coefficients
+    movdqa     xmm5, xmmword ptr kAddY16   // aligned load of byte offset
+
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4       // unsigned pixel bytes x signed coefficients
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1       // add word pairs: one 16-bit sum per pixel
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2       // 16 words -> 16 bytes, unsigned saturation
+    paddb      xmm0, xmm5       // add kAddY16 bytes (wrapping byte add)
+    movdqu     [edx], xmm0      // store 16 Y bytes
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked)  // 16 ABGR pixels (64 bytes) -> 16 Y bytes per loop (kABGRToY).
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* width */
+    movdqa     xmm4, xmmword ptr kABGRToY  // aligned load of coefficients
+    movdqa     xmm5, xmmword ptr kAddY16   // aligned load of byte offset
+
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4       // unsigned pixel bytes x signed coefficients
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1       // add word pairs: one 16-bit sum per pixel
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2       // 16 words -> 16 bytes, unsigned saturation
+    paddb      xmm0, xmm5       // add kAddY16 bytes (wrapping byte add)
+    movdqu     [edx], xmm0      // store 16 Y bytes
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked)  // 16 RGBA pixels (64 bytes) -> 16 Y bytes per loop (kRGBAToY).
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* width */
+    movdqa     xmm4, xmmword ptr kRGBAToY  // aligned load of coefficients
+    movdqa     xmm5, xmmword ptr kAddY16   // aligned load of byte offset
+
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4       // unsigned pixel bytes x signed coefficients
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1       // add word pairs: one 16-bit sum per pixel
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2       // 16 words -> 16 bytes, unsigned saturation
+    paddb      xmm0, xmm5       // add kAddY16 bytes (wrapping byte add)
+    movdqu     [edx], xmm0      // store 16 Y bytes
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked)  // ARGB -> U,V with 2x2 averaging: 16x2 px in, 8 U + 8 V out.
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi                  // two pushes: args now start at esp + 8 + 4
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kARGBToV
+    movdqa     xmm7, xmmword ptr kARGBToU
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]    // same columns, row at src + stride
+    pavgb      xmm0, xmm4           // vertical average
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88     // even-indexed pixels
+    shufps     xmm4, xmm1, 0xdd     // odd-indexed pixels
+    pavgb      xmm0, xmm4           // horizontal average
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8     // arithmetic shift: sums are signed
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1  // signed pack: U in low 8 bytes, V in high 8 bytes
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // ARGB -> UJ,VJ with 2x2 averaging: 16x2 px in, 8 U + 8 V out.
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi                  // two pushes: args now start at esp + 8 + 4
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUVJ128
+    movdqa     xmm6, xmmword ptr kARGBToVJ
+    movdqa     xmm7, xmmword ptr kARGBToUJ
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]    // same columns, row at src + stride
+    pavgb      xmm0, xmm4           // vertical average
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88     // even-indexed pixels
+    shufps     xmm4, xmm1, 0xdd     // odd-indexed pixels
+    pavgb      xmm0, xmm4           // horizontal average
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
+    paddw      xmm1, xmm5
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1  // signed pack: U in low 8 bytes, V in high 8 bytes
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked)  // ARGB -> U,V with 2x2 averaging: 32x2 px in, 16 U + 16 V out.
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi                  // two pushes: args now start at esp + 8 + 4
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    vbroadcastf128 ymm5, xmmword ptr kAddUV128
+    vbroadcastf128 ymm6, xmmword ptr kARGBToV
+    vbroadcastf128 ymm7, xmmword ptr kARGBToU
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpavgb     ymm0, ymm0, [eax + esi]  // vertical average with row at + stride
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vpavgb     ymm2, ymm2, [eax + esi + 64]
+    vpavgb     ymm3, ymm3, [eax + esi + 96]
+    lea        eax,  [eax + 128]
+    vshufps    ymm4, ymm0, ymm1, 0x88   // even-indexed pixels of each lane
+    vshufps    ymm0, ymm0, ymm1, 0xdd   // odd-indexed pixels of each lane
+    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
+    vshufps    ymm4, ymm2, ymm3, 0x88
+    vshufps    ymm2, ymm2, ymm3, 0xdd
+    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 32 different pixels, its 16 pixels of U and 16 of V
+    vpmaddubsw ymm1, ymm0, ymm7  // U
+    vpmaddubsw ymm3, ymm2, ymm7
+    vpmaddubsw ymm0, ymm0, ymm6  // V
+    vpmaddubsw ymm2, ymm2, ymm6
+    vphaddw    ymm1, ymm1, ymm3  // mutates
+    vphaddw    ymm0, ymm0, ymm2
+    vpsraw     ymm1, ymm1, 8
+    vpsraw     ymm0, ymm0, 8
+    vpacksswb  ymm0, ymm1, ymm0  // mutates
+    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
+    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
+    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
+
+    // step 3 - store 16 U and 16 V values
+    vextractf128 [edx], ymm0, 0 // U
+    vextractf128 [edx + edi], ymm0, 1 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper                   // zero the upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_AVX2
+__declspec(naked)  // ARGB -> UJ,VJ with 2x2 averaging: 32x2 px in, 16 U + 16 V.
+void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi                  // two pushes: args now start at esp + 8 + 4
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    vbroadcastf128 ymm5, xmmword ptr kAddUVJ128  // J tables, matching the
+    vbroadcastf128 ymm6, xmmword ptr kARGBToVJ   // ARGBToUVJRow_SSSE3 path
+    vbroadcastf128 ymm7, xmmword ptr kARGBToUJ   // (not the non-J UV tables)
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpavgb     ymm0, ymm0, [eax + esi]  // vertical average with row at + stride
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vpavgb     ymm2, ymm2, [eax + esi + 64]
+    vpavgb     ymm3, ymm3, [eax + esi + 96]
+    lea        eax,  [eax + 128]
+    vshufps    ymm4, ymm0, ymm1, 0x88   // even-indexed pixels of each lane
+    vshufps    ymm0, ymm0, ymm1, 0xdd   // odd-indexed pixels of each lane
+    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
+    vshufps    ymm4, ymm2, ymm3, 0x88
+    vshufps    ymm2, ymm2, ymm3, 0xdd
+    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 32 different pixels, its 16 pixels of U and 16 of V
+    vpmaddubsw ymm1, ymm0, ymm7  // U
+    vpmaddubsw ymm3, ymm2, ymm7
+    vpmaddubsw ymm0, ymm0, ymm6  // V
+    vpmaddubsw ymm2, ymm2, ymm6
+    vphaddw    ymm1, ymm1, ymm3  // mutates
+    vphaddw    ymm0, ymm0, ymm2
+    vpaddw     ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
+    vpaddw     ymm0, ymm0, ymm5
+    vpsraw     ymm1, ymm1, 8
+    vpsraw     ymm0, ymm0, 8
+    vpacksswb  ymm0, ymm1, ymm0  // mutates
+    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
+    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
+
+    // step 3 - store 16 U and 16 V values
+    vextractf128 [edx], ymm0, 0 // U
+    vextractf128 [edx + edi], ymm0, 1 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper                   // zero the upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_ARGBTOUVJROW_AVX2
+
+__declspec(naked)  // ARGB -> U,V without subsampling: 16 px in, 16 U + 16 V out.
+void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi                  // one push: args now start at esp + 4 + 4
+    mov        eax, [esp + 4 + 4]   // src_argb
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kARGBToV
+    movdqa     xmm7, xmmword ptr kARGBToU
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* convert to U and V */
+    movdqu     xmm0, [eax]          // U
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm7
+    pmaddubsw  xmm1, xmm7
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm3, xmm7
+    phaddw     xmm0, xmm1           // one signed 16-bit sum per pixel
+    phaddw     xmm2, xmm3
+    psraw      xmm0, 8
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2           // 16 words -> 16 bytes, signed saturation
+    paddb      xmm0, xmm5           // add kAddUV128 bytes
+    movdqu     [edx], xmm0          // store 16 U bytes
+
+    movdqu     xmm0, [eax]          // V
+    movdqu     xmm1, [eax + 16]     // same 64 source bytes are loaded again
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm6
+    pmaddubsw  xmm1, xmm6
+    pmaddubsw  xmm2, xmm6
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psraw      xmm0, 8
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    lea        eax,  [eax + 64]
+    movdqu     [edx + edi], xmm0    // store 16 V bytes
+    lea        edx,  [edx + 16]
+    sub        ecx,  16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked)  // BGRA -> U,V with 2x2 averaging: 16x2 px in, 8 U + 8 V out.
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi                  // two pushes: args now start at esp + 8 + 4
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kBGRAToV
+    movdqa     xmm7, xmmword ptr kBGRAToU
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]    // same columns, row at src + stride
+    pavgb      xmm0, xmm4           // vertical average
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88     // even-indexed pixels
+    shufps     xmm4, xmm1, 0xdd     // odd-indexed pixels
+    pavgb      xmm0, xmm4           // horizontal average
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8     // arithmetic shift: sums are signed
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1  // signed pack: U in low 8 bytes, V in high 8 bytes
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // ABGR -> U,V with 2x2 averaging: 16x2 px in, 8 U + 8 V out.
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi                  // two pushes: args now start at esp + 8 + 4
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kABGRToV
+    movdqa     xmm7, xmmword ptr kABGRToU
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]    // same columns, row at src + stride
+    pavgb      xmm0, xmm4           // vertical average
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88     // even-indexed pixels
+    shufps     xmm4, xmm1, 0xdd     // odd-indexed pixels
+    pavgb      xmm0, xmm4           // horizontal average
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8     // arithmetic shift: sums are signed
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1  // signed pack: U in low 8 bytes, V in high 8 bytes
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // RGBA -> U,V with 2x2 averaging: 16x2 px in, 8 U + 8 V out.
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi                  // two pushes: args now start at esp + 8 + 4
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    movdqa     xmm5, xmmword ptr kAddUV128
+    movdqa     xmm6, xmmword ptr kRGBAToV
+    movdqa     xmm7, xmmword ptr kRGBAToU
+    sub        edi, edx             // stride from u to v
+
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]    // same columns, row at src + stride
+    pavgb      xmm0, xmm4           // vertical average
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88     // even-indexed pixels
+    shufps     xmm4, xmm1, 0xdd     // odd-indexed pixels
+    pavgb      xmm0, xmm4           // horizontal average
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8     // arithmetic shift: sums are signed
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1  // signed pack: U in low 8 bytes, V in high 8 bytes
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBTOYROW_SSSE3
+
+// Read 16 UV from 444 into ymm0 (UV pairs) and 16 Y into ymm4 (doubled).
+#define READYUV444_AVX2 __asm {                                                \
+    __asm vmovdqu    xmm0, [esi]                  /* U */                      \
+    __asm vmovdqu    xmm1, [esi + edi]            /* V */                      \
+    __asm lea        esi,  [esi + 16]             /* advance U/V by 16 */      \
+    __asm vpermq     ymm0, ymm0, 0xd8             /* swap middle qwords */     \
+    __asm vpermq     ymm1, ymm1, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklbw ymm4, ymm4, ymm4             /* YY: duplicate each Y */   \
+    __asm lea        eax, [eax + 16]              /* advance Y by 16 */        \
+  }
+
+// Read 8 UV from 422, upsample to 16 UV in ymm0; read 16 Y doubled into ymm4.
+#define READYUV422_AVX2 __asm {                                                \
+    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
+    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
+    __asm lea        esi,  [esi + 8]              /* advance U/V by 8 */       \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpermq     ymm0, ymm0, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklbw ymm4, ymm4, ymm4             /* YY: duplicate each Y */   \
+    __asm lea        eax, [eax + 16]              /* advance Y by 16 */        \
+  }
+
+// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha (ebp) in ymm5.
+#define READYUVA422_AVX2 __asm {                                               \
+    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
+    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
+    __asm lea        esi,  [esi + 8]              /* advance U/V by 8 */       \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpermq     ymm0, ymm0, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklbw ymm4, ymm4, ymm4             /* YY: duplicate each Y */   \
+    __asm lea        eax, [eax + 16]              /* advance Y by 16 */        \
+    __asm vmovdqu    xmm5, [ebp]                  /* A */                      \
+    __asm vpermq     ymm5, ymm5, 0xd8             /* swap middle qwords */     \
+    __asm lea        ebp, [ebp + 16]              /* advance A by 16 */        \
+  }
+
+// Read 4 UV from 411, upsample to 16 UV in ymm0; read 16 Y doubled into ymm4.
+#define READYUV411_AVX2 __asm {                                                \
+    __asm vmovd      xmm0, dword ptr [esi]        /* U */                      \
+    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */                      \
+    __asm lea        esi,  [esi + 4]              /* advance U/V by 4 */       \
+    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+    __asm vpermq     ymm0, ymm0, 0xd8             /* swap middle qwords */     \
+    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklbw ymm4, ymm4, ymm4             /* YY: duplicate each Y */   \
+    __asm lea        eax, [eax + 16]              /* advance Y by 16 */        \
+  }
+
+// Read 8 UV from NV12, upsample to 16 UV in ymm0; read 16 Y doubled into ymm4.
+#define READNV12_AVX2 __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
+    __asm lea        esi,  [esi + 16]             /* advance UV by 16 */       \
+    __asm vpermq     ymm0, ymm0, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklbw ymm4, ymm4, ymm4             /* YY: duplicate each Y */   \
+    __asm lea        eax, [eax + 16]              /* advance Y by 16 */        \
+  }
+
+// Read 8 VU from NV21, upsample to 16 UV in ymm0; read 16 Y doubled into ymm4.
+#define READNV21_AVX2 __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi]                  /* VU */                     \
+    __asm lea        esi,  [esi + 16]             /* advance VU by 16 */       \
+    __asm vpermq     ymm0, ymm0, 0xd8             /* swap middle qwords */     \
+    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpermq     ymm4, ymm4, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklbw ymm4, ymm4, ymm4             /* YY: duplicate each Y */   \
+    __asm lea        eax, [eax + 16]              /* advance Y by 16 */        \
+  }
+
+// Read 8 YUY2 (32 bytes) with 16 Y (ymm4) and upsample 8 UV to 16 UV (ymm0).
+#define READYUY2_AVX2 __asm {                                                  \
+    __asm vmovdqu    ymm4, [eax]          /* YUY2 */                           \
+    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
+    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
+    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
+    __asm lea        eax, [eax + 32]              /* advance src by 32 */      \
+  }
+
+// Read 8 UYVY (32 bytes) with 16 Y (ymm4) and upsample 8 UV to 16 UV (ymm0).
+#define READUYVY_AVX2 __asm {                                                  \
+    __asm vmovdqu    ymm4, [eax]          /* UYVY */                           \
+    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
+    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
+    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
+    __asm lea        eax, [eax + 32]              /* advance src by 32 */      \
+  }
+
+// Convert 16 pixels: UV in ymm0, Y in ymm4 -> B, G, R in ymm0, ymm1, ymm2.
+#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
+    __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
+    __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
+    __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
+    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \
+    __asm vpsubw     ymm2, ymm3, ymm2             /* R = bias - R UV */        \
+    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
+    __asm vpsubw     ymm1, ymm3, ymm1             /* G = bias - G UV */        \
+    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
+    __asm vpsubw     ymm0, ymm3, ymm0             /* B = bias - B UV */        \
+    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
+    __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
+    __asm vpaddsw    ymm0, ymm0, ymm4           /* B += Y */                   \
+    __asm vpaddsw    ymm1, ymm1, ymm4           /* G += Y */                   \
+    __asm vpaddsw    ymm2, ymm2, ymm4           /* R += Y */                   \
+    __asm vpsraw     ymm0, ymm0, 6                /* B >>= 6 (arithmetic) */   \
+    __asm vpsraw     ymm1, ymm1, 6                /* G >>= 6 (arithmetic) */   \
+    __asm vpsraw     ymm2, ymm2, 6                /* R >>= 6 (arithmetic) */   \
+    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
+    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
+    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
+  }
+
+// Store 16 ARGB values (64 bytes) to edx from B/G/R/A in ymm0/1/2/5.
+#define STOREARGB_AVX2 __asm {                                                 \
+    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
+    __asm vpermq     ymm0, ymm0, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
+    __asm vpermq     ymm2, ymm2, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
+    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
+    __asm vmovdqu    0[edx], ymm1                                              \
+    __asm vmovdqu    32[edx], ymm0                                             \
+    __asm lea        edx,  [edx + 64]             /* advance dst by 64 */      \
+  }
+
+// Store 16 RGBA values (64 bytes) to edx from B/G/R/A in ymm0/1/2/5.
+#define STORERGBA_AVX2 __asm {                                                 \
+    __asm vpunpcklbw ymm1, ymm1, ymm2           /* GR */                       \
+    __asm vpermq     ymm1, ymm1, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklbw ymm2, ymm5, ymm0           /* AB */                       \
+    __asm vpermq     ymm2, ymm2, 0xd8             /* swap middle qwords */     \
+    __asm vpunpcklwd ymm0, ymm2, ymm1           /* ABGR first 8 pixels */      \
+    __asm vpunpckhwd ymm1, ymm2, ymm1           /* ABGR next 8 pixels */       \
+    __asm vmovdqu    [edx], ymm0                                               \
+    __asm vmovdqu    [edx + 32], ymm1                                          \
+    __asm lea        edx,  [edx + 64]             /* advance dst by 64 */      \
+  }
+
+#ifdef HAS_I422TOARGBROW_AVX2
+// 16 pixels per loop iteration; width is consumed in steps of 16.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I422ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    push       ebx                   // 3 pushes = 12 bytes, hence esp + 12 + n
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
+    sub        edi, esi              // edi = V - U, so [esi + edi] reads V
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV422_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub        ecx, 16               // 16 pixels converted
+    jg         convertloop           // repeat while width > 0
+
+    pop        ebx
+    pop        edi
+    pop        esi
+    vzeroupper                       // zero upper halves of the ymm registers
+    ret
+  }
+}
+#endif  // HAS_I422TOARGBROW_AVX2
+
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
+// 16 pixels per loop iteration; width is consumed in steps of 16.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+__declspec(naked)
+void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             const uint8* a_buf,
+                             uint8* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  __asm {
+    push       esi
+    push       edi
+    push       ebx
+    push       ebp                   // 4 pushes = 16 bytes, hence esp + 16 + n
+    mov        eax, [esp + 16 + 4]   // Y
+    mov        esi, [esp + 16 + 8]   // U
+    mov        edi, [esp + 16 + 12]  // V
+    mov        ebp, [esp + 16 + 16]  // A
+    mov        edx, [esp + 16 + 20]  // argb
+    mov        ebx, [esp + 16 + 24]  // yuvconstants
+    mov        ecx, [esp + 16 + 28]  // width
+    sub        edi, esi              // edi = V - U, so [esi + edi] reads V
+
+ convertloop:
+    READYUVA422_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub        ecx, 16               // 16 pixels converted
+    jg         convertloop           // repeat while width > 0
+
+    pop        ebp
+    pop        ebx
+    pop        edi
+    pop        esi
+    vzeroupper                       // zero upper halves of the ymm registers
+    ret
+  }
+}
+#endif  // HAS_I422ALPHATOARGBROW_AVX2
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels per loop iteration; width is consumed in steps of 16.
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I444ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    push       ebx                   // 3 pushes = 12 bytes, hence esp + 12 + n
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
+    sub        edi, esi              // edi = V - U, so [esi + edi] reads V
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+ convertloop:
+    READYUV444_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub        ecx, 16               // 16 pixels converted
+    jg         convertloop           // repeat while width > 0
+
+    pop        ebx
+    pop        edi
+    pop        esi
+    vzeroupper                       // zero upper halves of the ymm registers
+    ret
+  }
+}
+#endif  // HAS_I444TOARGBROW_AVX2
+
+#ifdef HAS_I411TOARGBROW_AVX2
+// 16 pixels per loop iteration; width is consumed in steps of 16.
+// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I411ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    push       ebx                   // 3 pushes = 12 bytes, hence esp + 12 + n
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
+    sub        edi, esi              // edi = V - U, so [esi + edi] reads V
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV411_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub        ecx, 16               // 16 pixels converted
+    jg         convertloop           // repeat while width > 0
+
+    pop        ebx
+    pop        edi
+    pop        esi
+    vzeroupper                       // zero upper halves of the ymm registers
+    ret
+  }
+}
+#endif  // HAS_I411TOARGBROW_AVX2
+
+#ifdef HAS_NV12TOARGBROW_AVX2
+// 16 pixels per loop iteration; width is consumed in steps of 16.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void NV12ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  __asm {
+    push       esi
+    push       ebx                  // 2 pushes = 8 bytes, hence esp + 8 + n
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // UV
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READNV12_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub        ecx, 16              // 16 pixels converted
+    jg         convertloop          // repeat while width > 0
+
+    pop        ebx
+    pop        esi
+    vzeroupper                      // zero upper halves of the ymm registers
+    ret
+  }
+}
+#endif  // HAS_NV12TOARGBROW_AVX2
+
+#ifdef HAS_NV21TOARGBROW_AVX2
+// 16 pixels per loop iteration; width is consumed in steps of 16.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void NV21ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* vu_buf,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  __asm {
+    push       esi
+    push       ebx                  // 2 pushes = 8 bytes, hence esp + 8 + n
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // VU
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READNV21_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub        ecx, 16              // 16 pixels converted
+    jg         convertloop          // repeat while width > 0
+
+    pop        ebx
+    pop        esi
+    vzeroupper                      // zero upper halves of the ymm registers
+    ret
+  }
+}
+#endif  // HAS_NV21TOARGBROW_AVX2
+
+#ifdef HAS_YUY2TOARGBROW_AVX2
+// 16 pixels per loop iteration; width is consumed in steps of 16.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+__declspec(naked)
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  __asm {
+    push       ebx                  // 1 push = 4 bytes, hence esp + 4 + n
+    mov        eax, [esp + 4 + 4]   // yuy2
+    mov        edx, [esp + 4 + 8]   // argb
+    mov        ebx, [esp + 4 + 12]  // yuvconstants
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUY2_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub        ecx, 16              // 16 pixels converted
+    jg         convertloop          // repeat while width > 0
+
+    pop        ebx
+    vzeroupper                      // zero upper halves of the ymm registers
+    ret
+  }
+}
+#endif  // HAS_YUY2TOARGBROW_AVX2
+
+#ifdef HAS_UYVYTOARGBROW_AVX2
+// 16 pixels per loop iteration; width is consumed in steps of 16.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+__declspec(naked)
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  __asm {
+    push       ebx                  // 1 push = 4 bytes, hence esp + 4 + n
+    mov        eax, [esp + 4 + 4]   // uyvy
+    mov        edx, [esp + 4 + 8]   // argb
+    mov        ebx, [esp + 4 + 12]  // yuvconstants
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READUYVY_AVX2
+    YUVTORGB_AVX2(ebx)
+    STOREARGB_AVX2
+
+    sub        ecx, 16              // 16 pixels converted
+    jg         convertloop          // repeat while width > 0
+
+    pop        ebx
+    vzeroupper                      // zero upper halves of the ymm registers
+    ret
+  }
+}
+#endif  // HAS_UYVYTOARGBROW_AVX2
+
+#ifdef HAS_I422TORGBAROW_AVX2
+// 16 pixels per loop iteration; width is consumed in steps of 16.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+__declspec(naked)
+void I422ToRGBARow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    push       ebx                   // 3 pushes = 12 bytes, hence esp + 12 + n
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // rgba (dst_argb)
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
+    sub        edi, esi              // edi = V - U, so [esi + edi] reads V
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+    READYUV422_AVX2
+    YUVTORGB_AVX2(ebx)
+    STORERGBA_AVX2
+
+    sub        ecx, 16               // 16 pixels converted
+    jg         convertloop           // repeat while width > 0
+
+    pop        ebx
+    pop        edi
+    pop        esi
+    vzeroupper                       // zero upper halves of the ymm registers
+    ret
+  }
+}
+#endif  // HAS_I422TORGBAROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_SSSE3)
+// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+// Allows a conversion with half size scaling.
+
+// Read 8 UV from 444 into xmm0 (UV pairs) and 8 Y into xmm4 (doubled).
+#define READYUV444 __asm {                                                     \
+    __asm movq       xmm0, qword ptr [esi] /* U */                             \
+    __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
+    __asm lea        esi,  [esi + 8]      /* advance U/V by 8 */               \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
+    __asm punpcklbw  xmm4, xmm4           /* YY: duplicate each Y */           \
+    __asm lea        eax, [eax + 8]       /* advance Y by 8 */                 \
+  }
+
+// Read 4 UV from 422, upsample to 8 UV in xmm0; read 8 Y doubled into xmm4.
+#define READYUV422 __asm {                                                     \
+    __asm movd       xmm0, [esi]          /* U */                              \
+    __asm movd       xmm1, [esi + edi]    /* V */                              \
+    __asm lea        esi,  [esi + 4]      /* advance U/V by 4 */               \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
+    __asm punpcklbw  xmm4, xmm4           /* YY: duplicate each Y */           \
+    __asm lea        eax, [eax + 8]       /* advance Y by 8 */                 \
+  }
+
+// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha (ebp) in xmm5.
+#define READYUVA422 __asm {                                                    \
+    __asm movd       xmm0, [esi]          /* U */                              \
+    __asm movd       xmm1, [esi + edi]    /* V */                              \
+    __asm lea        esi,  [esi + 4]      /* advance U/V by 4 */               \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
+    __asm punpcklbw  xmm4, xmm4           /* YY: duplicate each Y */           \
+    __asm lea        eax, [eax + 8]       /* advance Y by 8 */                 \
+    __asm movq       xmm5, qword ptr [ebp]   /* A */                           \
+    __asm lea        ebp, [ebp + 8]       /* advance A by 8 */                 \
+  }
+
+// Read 2 UV from 411, upsample to 8 UV.  Uses ebx as a scratch register.
+// drmemory fails with memory fault if pinsrw used. libyuv bug: 525
+//  __asm pinsrw     xmm0, [esi], 0        /* U */
+//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
+#define READYUV411_EBX __asm {                                                 \
+    __asm movzx      ebx, word ptr [esi]        /* U */                        \
+    __asm movd       xmm0, ebx                                                 \
+    __asm movzx      ebx, word ptr [esi + edi]  /* V */                        \
+    __asm movd       xmm1, ebx                                                 \
+    __asm lea        esi,  [esi + 2]      /* advance U/V by 2 */               \
+    __asm punpcklbw  xmm0, xmm1            /* UV */                            \
+    __asm punpcklwd  xmm0, xmm0            /* UVUV (upsample) */               \
+    __asm punpckldq  xmm0, xmm0            /* UVUVUVUV (upsample) */           \
+    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
+    __asm punpcklbw  xmm4, xmm4           /* YY: duplicate each Y */           \
+    __asm lea        eax, [eax + 8]       /* advance Y by 8 */                 \
+  }
+
+// Read 4 UV from NV12, upsample to 8 UV in xmm0; read 8 Y doubled into xmm4.
+#define READNV12 __asm {                                                       \
+    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
+    __asm lea        esi,  [esi + 8]      /* advance UV by 8 */                \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
+    __asm punpcklbw  xmm4, xmm4           /* YY: duplicate each Y */           \
+    __asm lea        eax, [eax + 8]       /* advance Y by 8 */                 \
+  }
+
+// Read 4 VU from NV21, upsample to 8 UV in xmm0; read 8 Y doubled into xmm4.
+#define READNV21 __asm {                                                       \
+    __asm movq       xmm0, qword ptr [esi] /* VU */                            \
+    __asm lea        esi,  [esi + 8]      /* advance VU by 8 */                \
+    __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
+    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
+    __asm punpcklbw  xmm4, xmm4           /* YY: duplicate each Y */           \
+    __asm lea        eax, [eax + 8]       /* advance Y by 8 */                 \
+  }
+
+// Read 4 YUY2 (16 bytes) with 8 Y (xmm4) and upsample 4 UV to 8 UV (xmm0).
+#define READYUY2 __asm {                                                       \
+    __asm movdqu     xmm4, [eax]          /* YUY2 */                           \
+    __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
+    __asm movdqu     xmm0, [eax]          /* UV */                             \
+    __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
+    __asm lea        eax, [eax + 16]      /* advance src by 16 */              \
+  }
+
+// Read 4 UYVY (16 bytes) with 8 Y (xmm4) and upsample 4 UV to 8 UV (xmm0).
+#define READUYVY __asm {                                                       \
+    __asm movdqu     xmm4, [eax]          /* UYVY */                           \
+    __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
+    __asm movdqu     xmm0, [eax]          /* UV */                             \
+    __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
+    __asm lea        eax, [eax + 16]      /* advance src by 16 */              \
+  }
+
+// Convert 8 pixels: UV in xmm0, Y in xmm4 -> B, G, R in xmm0, xmm1, xmm2.
+#define YUVTORGB(YuvConstants) __asm {                                         \
+    __asm movdqa     xmm1, xmm0           /* copy UV for B */                  \
+    __asm movdqa     xmm2, xmm0           /* copy UV for G */                  \
+    __asm movdqa     xmm3, xmm0           /* copy UV for R */                  \
+    __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVBIASB]               \
+    __asm pmaddubsw  xmm1, xmmword ptr [YuvConstants + KUVTOB]                 \
+    __asm psubw      xmm0, xmm1           /* B = bias - B UV */                \
+    __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVBIASG]               \
+    __asm pmaddubsw  xmm2, xmmword ptr [YuvConstants + KUVTOG]                 \
+    __asm psubw      xmm1, xmm2           /* G = bias - G UV */                \
+    __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \
+    __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
+    __asm psubw      xmm2, xmm3           /* R = bias - R UV */                \
+    __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
+    __asm paddsw     xmm0, xmm4           /* B += Y */                         \
+    __asm paddsw     xmm1, xmm4           /* G += Y */                         \
+    __asm paddsw     xmm2, xmm4           /* R += Y */                         \
+    __asm psraw      xmm0, 6              /* B >>= 6 (arithmetic) */           \
+    __asm psraw      xmm1, 6              /* G >>= 6 (arithmetic) */           \
+    __asm psraw      xmm2, 6              /* R >>= 6 (arithmetic) */           \
+    __asm packuswb   xmm0, xmm0           /* B */                              \
+    __asm packuswb   xmm1, xmm1           /* G */                              \
+    __asm packuswb   xmm2, xmm2           /* R */                              \
+  }
+
+// Store 8 ARGB values (32 bytes) to edx from B/G/R/A in xmm0/1/2/5.
+#define STOREARGB __asm {                                                      \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
+    __asm movdqa     xmm1, xmm0           /* copy BG for high half */          \
+    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm0                                              \
+    __asm movdqu     16[edx], xmm1                                             \
+    __asm lea        edx,  [edx + 32]     /* advance dst by 32 */              \
+  }
+
+// Store 8 BGRA values (32 bytes) to edx; alpha is set to 0xff via xmm5.
+#define STOREBGRA __asm {                                                      \
+    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
+    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
+    __asm movdqa     xmm0, xmm5           /* copy AR for high half */          \
+    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm5                                              \
+    __asm movdqu     16[edx], xmm0                                             \
+    __asm lea        edx,  [edx + 32]     /* advance dst by 32 */              \
+  }
+
+// Store 8 RGBA values (32 bytes) to edx; alpha is set to 0xff via xmm5.
+#define STORERGBA __asm {                                                      \
+    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
+    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
+    __asm movdqa     xmm0, xmm5           /* copy AB for high half */          \
+    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
+    __asm movdqu     0[edx], xmm5                                              \
+    __asm movdqu     16[edx], xmm0                                             \
+    __asm lea        edx,  [edx + 32]     /* advance dst by 32 */              \
+  }
+
+// Store 8 RGB24 values (24 bytes) to edx; xmm5/xmm6 must hold pshufb masks.
+#define STORERGB24 __asm {                                                     \
+    /* Weave into RRGB */                                                      \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+    __asm movdqa     xmm1, xmm0           /* copy BG for high half */          \
+    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
+    /* RRGB -> RGB24 */                                                        \
+    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
+    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
+    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
+    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
+    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
+    __asm lea        edx,  [edx + 24]     /* advance dst by 24 */              \
+  }
+
+// Store 8 RGB565 pixels (16 bytes) at [edx]; masks expected in xmm5/6/7.
+#define STORERGB565 __asm {                                                    \
+    /* Weave into BGRR */                                                      \
+    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
+    /* BGRR -> RGB565 */                                                       \
+    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
+    __asm movdqa     xmm2, xmm0    /* G */                                     \
+    __asm pslld      xmm0, 8       /* R */                                     \
+    __asm psrld      xmm3, 3       /* B */                                     \
+    __asm psrld      xmm2, 5       /* G */                                     \
+    __asm psrad      xmm0, 16      /* R */                                     \
+    __asm pand       xmm3, xmm5    /* B */                                     \
+    __asm pand       xmm2, xmm6    /* G */                                     \
+    __asm pand       xmm0, xmm7    /* R */                                     \
+    __asm por        xmm3, xmm2    /* BG */                                    \
+    __asm por        xmm0, xmm3    /* BGR */                                   \
+    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
+    __asm movdqa     xmm2, xmm1    /* G */                                     \
+    __asm pslld      xmm1, 8       /* R */                                     \
+    __asm psrld      xmm3, 3       /* B */                                     \
+    __asm psrld      xmm2, 5       /* G */                                     \
+    __asm psrad      xmm1, 16      /* R */                                     \
+    __asm pand       xmm3, xmm5    /* B */                                     \
+    __asm pand       xmm2, xmm6    /* G */                                     \
+    __asm pand       xmm1, xmm7    /* R */                                     \
+    __asm por        xmm3, xmm2    /* BG */                                    \
+    __asm por        xmm1, xmm3    /* BGR */                                   \
+    __asm packssdw   xmm0, xmm1    /* narrow 8 dwords to 8 words */            \
+    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
+    __asm lea        edx, [edx + 16]  /* advance dst by 8 pixels */            \
+  }
+
+// Converts one row of I444 to ARGB, 8 pixels per loop iteration.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void I444ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  __asm {
+    push       esi                   // saved here, restored before ret
+    push       edi
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y (12 = 3 pushes, +4 skips return addr)
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
+    sub        edi, esi              // edi = v_buf - u_buf (V offset from U)
+    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
+
+ convertloop:
+    READYUV444
+    YUVTORGB(ebx)
+    STOREARGB
+
+    sub        ecx, 8                // 8 pixels consumed per iteration
+    jg         convertloop           // do-while: body runs at least once
+
+    pop        ebx
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Converts one row of I422 to RGB24, 8 pixels per loop iteration.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
+__declspec(naked)
+void I422ToRGB24Row_SSSE3(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  __asm {
+    push       esi                   // saved here, restored before ret
+    push       edi
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y (12 = 3 pushes, +4 skips return addr)
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // rgb24
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
+    sub        edi, esi              // edi = v_buf - u_buf (V offset from U)
+    movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0  // pshufb mask applied to xmm0 in STORERGB24
+    movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24  // pshufb mask applied to xmm1 in STORERGB24
+
+ convertloop:
+    READYUV422
+    YUVTORGB(ebx)
+    STORERGB24
+
+    sub        ecx, 8                // 8 pixels consumed per iteration
+    jg         convertloop           // do-while: body runs at least once
+
+    pop        ebx
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Converts one row of I422 to RGB565, 8 pixels per loop iteration.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
+__declspec(naked)
+void I422ToRGB565Row_SSSE3(const uint8* y_buf,
+                           const uint8* u_buf,
+                           const uint8* v_buf,
+                           uint8* rgb565_buf,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
+  __asm {
+    push       esi                   // saved here, restored before ret
+    push       edi
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y (12 = 3 pushes, +4 skips return addr)
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // rgb565
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
+    sub        edi, esi              // edi = v_buf - u_buf (V offset from U)
+    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f (B field, STORERGB565)
+    psrld      xmm5, 27
+    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0 (G field, STORERGB565)
+    psrld      xmm6, 26
+    pslld      xmm6, 5
+    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800 (R field, STORERGB565)
+    pslld      xmm7, 11
+
+ convertloop:
+    READYUV422
+    YUVTORGB(ebx)
+    STORERGB565
+
+    sub        ecx, 8           // 8 pixels consumed per iteration
+    jg         convertloop      // do-while: body runs at least once
+
+    pop        ebx
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Converts one row of I422 to ARGB, 8 pixels per loop iteration.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  __asm {
+    push       esi                   // saved here, restored before ret
+    push       edi
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y (12 = 3 pushes, +4 skips return addr)
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
+    sub        edi, esi             // edi = v_buf - u_buf (V offset from U)
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+
+ convertloop:
+    READYUV422
+    YUVTORGB(ebx)
+    STOREARGB
+
+    sub        ecx, 8               // 8 pixels consumed per iteration
+    jg         convertloop          // do-while: body runs at least once
+
+    pop        ebx
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Converts one row of I422 plus an alpha plane to ARGB, 8 pixels per iteration.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
+__declspec(naked)
+void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              const uint8* a_buf,
+                              uint8* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width) {
+  __asm {
+    push       esi                   // saved here, restored before ret
+    push       edi
+    push       ebx
+    push       ebp                   // ebp carries the alpha pointer below
+    mov        eax, [esp + 16 + 4]   // Y (16 = 4 pushes, +4 skips return addr)
+    mov        esi, [esp + 16 + 8]   // U
+    mov        edi, [esp + 16 + 12]  // V
+    mov        ebp, [esp + 16 + 16]  // A
+    mov        edx, [esp + 16 + 20]  // argb
+    mov        ebx, [esp + 16 + 24]  // yuvconstants
+    mov        ecx, [esp + 16 + 28]  // width
+    sub        edi, esi              // edi = v_buf - u_buf (V offset from U)
+
+ convertloop:
+    READYUVA422                      // presumably also loads 8 A via ebp - confirm in macro
+    YUVTORGB(ebx)
+    STOREARGB
+
+    sub        ecx, 8                // 8 pixels consumed per iteration
+    jg         convertloop           // do-while: body runs at least once
+
+    pop        ebp
+    pop        ebx
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Converts one row of I411 to ARGB, 8 pixels per loop iteration.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicate UV once more.
+__declspec(naked)
+void I411ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  __asm {
+    push       esi                   // saved here, restored before ret
+    push       edi
+    push       ebx                   // never loaded here; presumably scratch for READYUV411_EBX - confirm
+    push       ebp                   // ebp carries yuvconstants below
+    mov        eax, [esp + 16 + 4]   // Y (16 = 4 pushes, +4 skips return addr)
+    mov        esi, [esp + 16 + 8]   // U
+    mov        edi, [esp + 16 + 12]  // V
+    mov        edx, [esp + 16 + 16]  // argb
+    mov        ebp, [esp + 16 + 20]  // yuvconstants
+    mov        ecx, [esp + 16 + 24]  // width
+    sub        edi, esi              // edi = v_buf - u_buf (V offset from U)
+    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
+
+ convertloop:
+    READYUV411_EBX
+    YUVTORGB(ebp)
+    STOREARGB
+
+    sub        ecx, 8                // 8 pixels consumed per iteration
+    jg         convertloop           // do-while: body runs at least once
+
+    pop        ebp
+    pop        ebx
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Converts one row of NV12 (Y plane + interleaved UV) to ARGB, 8 pixels per loop.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void NV12ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* uv_buf,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    push       ebx
+    mov        eax, [esp + 8 + 4]   // Y (8 = 2 pushes, +4 skips return addr)
+    mov        esi, [esp + 8 + 8]   // UV
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
+    mov        ecx, [esp + 8 + 20]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+
+ convertloop:
+    READNV12
+    YUVTORGB(ebx)
+    STOREARGB
+
+    sub        ecx, 8               // 8 pixels consumed per iteration
+    jg         convertloop          // do-while: body runs at least once
+
+    pop        ebx
+    pop        esi
+    ret
+  }
+}
+
+// Converts one row of NV21 (Y plane + interleaved VU) to ARGB, 8 pixels per loop.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* vu_buf,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  __asm {
+    push       esi                  // saved here, restored before ret
+    push       ebx
+    mov        eax, [esp + 8 + 4]   // Y (8 = 2 pushes, +4 skips return addr)
+    mov        esi, [esp + 8 + 8]   // VU
+    mov        edx, [esp + 8 + 12]  // argb
+    mov        ebx, [esp + 8 + 16]  // yuvconstants
+    mov        ecx, [esp + 8 + 20]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+
+ convertloop:
+    READNV21
+    YUVTORGB(ebx)
+    STOREARGB
+
+    sub        ecx, 8               // 8 pixels consumed per iteration
+    jg         convertloop          // do-while: body runs at least once
+
+    pop        ebx
+    pop        esi
+    ret
+  }
+}
+
+// Converts one row of packed YUY2 to ARGB, 8 pixels per loop iteration.
+// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
+__declspec(naked)
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  __asm {
+    push       ebx                  // saved here, restored before ret
+    mov        eax, [esp + 4 + 4]   // yuy2 (4 = 1 push, +4 skips return addr)
+    mov        edx, [esp + 4 + 8]   // argb
+    mov        ebx, [esp + 4 + 12]  // yuvconstants
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+
+ convertloop:
+    READYUY2
+    YUVTORGB(ebx)
+    STOREARGB
+
+    sub        ecx, 8               // 8 pixels consumed per iteration
+    jg         convertloop          // do-while: body runs at least once
+
+    pop        ebx
+    ret
+  }
+}
+
+// Converts one row of packed UYVY to ARGB, 8 pixels per loop iteration.
+// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
+__declspec(naked)
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+                         uint8* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  __asm {
+    push       ebx                  // saved here, restored before ret
+    mov        eax, [esp + 4 + 4]   // uyvy (4 = 1 push, +4 skips return addr)
+    mov        edx, [esp + 4 + 8]   // argb
+    mov        ebx, [esp + 4 + 12]  // yuvconstants
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+
+ convertloop:
+    READUYVY
+    YUVTORGB(ebx)
+    STOREARGB
+
+    sub        ecx, 8               // 8 pixels consumed per iteration
+    jg         convertloop          // do-while: body runs at least once
+
+    pop        ebx
+    ret
+  }
+}
+
+__declspec(naked)  // Converts one row of I422 to RGBA, 8 pixels per loop iteration.
+void I422ToRGBARow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_rgba,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  __asm {
+    push       esi                   // saved here, restored before ret
+    push       edi
+    push       ebx
+    mov        eax, [esp + 12 + 4]   // Y (12 = 3 pushes, +4 skips return addr)
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // rgba
+    mov        ebx, [esp + 12 + 20]  // yuvconstants
+    mov        ecx, [esp + 12 + 24]  // width
+    sub        edi, esi              // edi = v_buf - u_buf (V offset from U)
+
+ convertloop:
+    READYUV422
+    YUVTORGB(ebx)
+    STORERGBA
+
+    sub        ecx, 8                // 8 pixels consumed per iteration
+    jg         convertloop           // do-while: body runs at least once
+
+    pop        ebx
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_I400TOARGBROW_SSE2
+// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes) per loop iteration.
+__declspec(naked)
+void I400ToARGBRow_SSE2(const uint8* y_buf,
+                        uint8* rgb_buf,
+                        int width) {
+  __asm {
+    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
+    movd       xmm2, eax
+    pshufd     xmm2, xmm2,0         // broadcast to all 8 words
+    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16) - 32 (0.5 rounding bias)
+    movd       xmm3, eax
+    pshufd     xmm3, xmm3, 0        // broadcast to all 8 words
+    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
+    pslld      xmm4, 24
+
+    mov        eax, [esp + 4]       // Y
+    mov        edx, [esp + 8]       // rgb
+    mov        ecx, [esp + 12]      // width
+
+ convertloop:
+    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+    movq       xmm0, qword ptr [eax]
+    lea        eax, [eax + 8]
+    punpcklbw  xmm0, xmm0           // Y.Y (each word = Y * 257)
+    pmulhuw    xmm0, xmm2           // high 16 bits of the unsigned product
+    psubusw    xmm0, xmm3           // saturating subtract clamps at 0
+    psrlw      xmm0, 6              // drop the 6 fraction bits
+    packuswb   xmm0, xmm0           // G
+
+    // Step 2: Weave into ARGB
+    punpcklbw  xmm0, xmm0           // GG
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm0           // GGGG first 4 pixels (alpha OR'd in below)
+    punpckhwd  xmm1, xmm1           // GGGG next 4 pixels (alpha OR'd in below)
+    por        xmm0, xmm4           // force alpha byte to 0xff
+    por        xmm1, xmm4
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8               // 8 pixels consumed per iteration
+    jg         convertloop          // do-while: body runs at least once
+    ret
+  }
+}
+#endif  // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes) per loop iteration.
+// note: vpunpcklbw mutates (works per 128-bit lane) and vpackuswb unmutates.
+__declspec(naked)
+void I400ToARGBRow_AVX2(const uint8* y_buf,
+                        uint8* rgb_buf,
+                        int width) {
+  __asm {
+    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
+    vmovd      xmm2, eax
+    vbroadcastss ymm2, xmm2
+    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16) - 32 (0.5 rounding bias)
+    vmovd      xmm3, eax
+    vbroadcastss ymm3, xmm3
+    vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
+    vpslld     ymm4, ymm4, 24
+
+    mov        eax, [esp + 4]       // Y
+    mov        edx, [esp + 8]       // rgb
+    mov        ecx, [esp + 12]      // width
+
+ convertloop:
+    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+    vmovdqu    xmm0, [eax]
+    lea        eax, [eax + 16]
+    vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
+    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
+    vpmulhuw   ymm0, ymm0, ymm2
+    vpsubusw   ymm0, ymm0, ymm3
+    vpsrlw     ymm0, ymm0, 6
+    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
+
+    // TODO(fbarchard): Weave alpha with unpack.
+    // Step 2: Weave into ARGB
+    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
+    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
+    vpor       ymm0, ymm0, ymm4           // force alpha byte to 0xff
+    vpor       ymm1, ymm1, ymm4
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx,  [edx + 64]
+    sub        ecx, 16                    // 16 pixels consumed per iteration
+    jg         convertloop                // do-while: body runs at least once
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table (pshufb control) for reversing the 16 bytes of a register.
+static const uvec8 kShuffleMirror = {
+  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+// TODO(fbarchard): Replace lea with -16 offset.
+__declspec(naked)
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    movdqa    xmm5, xmmword ptr kShuffleMirror
+
+ convertloop:
+    movdqu    xmm0, [eax - 16 + ecx]  // last 16 bytes of the remaining source
+    pshufb    xmm0, xmm5              // reverse byte order
+    movdqu    [edx], xmm0
+    lea       edx, [edx + 16]
+    sub       ecx, 16                 // 16 bytes consumed per iteration
+    jg        convertloop             // do-while: body runs at least once
+    ret
+  }
+}
+#endif  // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+__declspec(naked)  // Reverses a row of bytes, 32 bytes per loop iteration.
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    vbroadcastf128 ymm5, xmmword ptr kShuffleMirror  // same 16-byte mask in both lanes
+
+ convertloop:
+    vmovdqu   ymm0, [eax - 32 + ecx]  // last 32 bytes of the remaining source
+    vpshufb   ymm0, ymm0, ymm5  // reverse bytes within each 128-bit lane
+    vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
+    vmovdqu   [edx], ymm0
+    lea       edx, [edx + 32]
+    sub       ecx, 32           // 32 bytes consumed per iteration
+    jg        convertloop       // do-while: body runs at least once
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing UV pairs: reversed even bytes, then reversed odd.
+static const uvec8 kShuffleMirrorUV = {
+  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+
+__declspec(naked)
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+                       int width) {
+  __asm {
+    push      edi                  // saved here, restored before ret
+    mov       eax, [esp + 4 + 4]   // src
+    mov       edx, [esp + 4 + 8]   // dst_u
+    mov       edi, [esp + 4 + 12]  // dst_v
+    mov       ecx, [esp + 4 + 16]  // width
+    movdqa    xmm1, xmmword ptr kShuffleMirrorUV
+    lea       eax, [eax + ecx * 2 - 16]  // last 16 source bytes (2 bytes per width unit)
+    sub       edi, edx             // edi = dst_v - dst_u, so [edx + edi] is dst_v
+
+ convertloop:
+    movdqu    xmm0, [eax]
+    lea       eax, [eax - 16]      // walk the source backwards
+    pshufb    xmm0, xmm1           // low 8 bytes: even bytes reversed; high 8: odd bytes reversed
+    movlpd    qword ptr [edx], xmm0        // 8 bytes to dst_u
+    movhpd    qword ptr [edx + edi], xmm0  // 8 bytes to dst_v
+    lea       edx, [edx + 8]
+    sub       ecx, 8               // 8 UV pairs consumed per iteration
+    jg        convertloop          // do-while: body runs at least once
+
+    pop       edi
+    ret
+  }
+}
+#endif  // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+__declspec(naked)  // Reverses a row of 4-byte pixels, 4 pixels per loop iteration.
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
+
+ convertloop:
+    movdqu    xmm0, [eax]
+    lea       eax, [eax - 16]      // walk the source backwards
+    pshufd    xmm0, xmm0, 0x1b     // reverse the order of the 4 dwords
+    movdqu    [edx], xmm0
+    lea       edx, [edx + 16]
+    sub       ecx, 4               // 4 pixels consumed per iteration
+    jg        convertloop          // do-while: body runs at least once
+    ret
+  }
+}
+#endif  // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Permute table (vpermd indices) for reversing the order of 8 dwords.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked)
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
+
+ convertloop:
+    vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
+    vmovdqu   [edx], ymm0
+    lea       edx, [edx + 32]
+    sub       ecx, 8           // 8 pixels (32 bytes) consumed per iteration
+    jg        convertloop      // do-while: body runs at least once
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+__declspec(naked)  // De-interleaves UV pairs into separate U and V rows, 16 pairs per loop.
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  __asm {
+    push       edi                   // saved here, restored before ret
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] is dst_v
+
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm0
+    movdqa     xmm3, xmm1
+    pand       xmm0, xmm5   // even bytes
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1   // 16 even bytes (stored to dst_u)
+    psrlw      xmm2, 8      // odd bytes
+    psrlw      xmm3, 8
+    packuswb   xmm2, xmm3   // 16 odd bytes (stored to dst_v)
+    movdqu     [edx], xmm0
+    movdqu     [edx + edi], xmm2
+    lea        edx, [edx + 16]
+    sub        ecx, 16      // 16 UV pairs consumed per iteration
+    jg         convertloop  // do-while: body runs at least once
+
+    pop        edi
+    ret
+  }
+}
+
+#endif  // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_SPLITUVROW_AVX2
+__declspec(naked)  // De-interleaves UV pairs into separate U and V rows, 32 pairs per loop.
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  __asm {
+    push       edi                   // saved here, restored before ret
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // width
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] is dst_v
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm2, ymm0, 8      // odd bytes
+    vpsrlw     ymm3, ymm1, 8
+    vpand      ymm0, ymm0, ymm5   // even bytes
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // packs per 128-bit lane (qwords out of order)
+    vpackuswb  ymm2, ymm2, ymm3
+    vpermq     ymm0, ymm0, 0xd8   // restore linear qword order
+    vpermq     ymm2, ymm2, 0xd8
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + edi], ymm2
+    lea        edx, [edx + 32]
+    sub        ecx, 32            // 32 UV pairs consumed per iteration
+    jg         convertloop        // do-while: body runs at least once
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+__declspec(naked)  // Interleaves a U row and a V row into UV pairs, 16 pairs per loop.
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  __asm {
+    push       edi                   // saved here, restored before ret
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax              // edx = src_v - src_u, so [eax + edx] is src_v
+
+  convertloop:
+    movdqu     xmm0, [eax]      // read 16 U's
+    movdqu     xmm1, [eax + edx]  // and 16 V's
+    lea        eax,  [eax + 16]
+    movdqa     xmm2, xmm0
+    punpcklbw  xmm0, xmm1       // first 8 UV pairs
+    punpckhbw  xmm2, xmm1       // next 8 UV pairs
+    movdqu     [edi], xmm0
+    movdqu     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16          // 16 UV pairs produced per iteration
+    jg         convertloop      // do-while: body runs at least once
+
+    pop        edi
+    ret
+  }
+}
+#endif  //  HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+__declspec(naked)  // Interleaves a U row and a V row into UV pairs, 32 pairs per loop.
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  __asm {
+    push       edi                   // saved here, restored before ret
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax              // edx = src_v - src_u, so [eax + edx] is src_v
+
+  convertloop:
+    vmovdqu    ymm0, [eax]           // read 32 U's
+    vmovdqu    ymm1, [eax + edx]     // and 32 V's
+    lea        eax,  [eax + 32]
+    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
+    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
+    vextractf128 [edi], ymm2, 0       // bytes 0..15
+    vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
+    vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
+    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
+    lea        edi, [edi + 64]
+    sub        ecx, 32               // 32 UV pairs produced per iteration
+    jg         convertloop           // do-while: body runs at least once
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+#endif  //  HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at time.
+__declspec(naked)
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    test       eax, 15          // src not 16-byte aligned -> unaligned loop
+    jne        convertloopu
+    test       edx, 15          // dst not 16-byte aligned -> unaligned loop
+    jne        convertloopu
+
+  convertloopa:
+    movdqa     xmm0, [eax]      // aligned loads/stores (both pointers aligned)
+    movdqa     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 32          // 32 bytes copied per iteration
+    jg         convertloopa     // do-while: body runs at least once
+    ret
+
+  convertloopu:
+    movdqu     xmm0, [eax]      // unaligned loads/stores
+    movdqu     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 32          // 32 bytes copied per iteration
+    jg         convertloopu     // do-while: body runs at least once
+    ret
+  }
+}
+#endif  // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+// CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at time.
+__declspec(naked)
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax, [eax + 64]
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
+    sub        ecx, 64          // 64 bytes copied per iteration
+    jg         convertloop      // do-while: body runs at least once
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_COPYROW_AVX
+
+// Copies 'count' bytes with rep movsb; count can be any multiple of 1.
+__declspec(naked)
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov        eax, esi         // stash esi/edi in eax/edx instead of the stack
+    mov        edx, edi
+    mov        esi, [esp + 4]   // src
+    mov        edi, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    rep movsb                   // copy ecx bytes from [esi] to [edi]
+    mov        edi, edx         // restore edi/esi
+    mov        esi, eax
+    ret
+  }
+}
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// Copies only the alpha byte of each src pixel into dst; width in pixels.
+__declspec(naked)
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    pslld      xmm0, 24
+    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    psrld      xmm1, 8
+
+  convertloop:
+    movdqu     xmm2, [eax]      // 8 src pixels
+    movdqu     xmm3, [eax + 16]
+    lea        eax, [eax + 32]
+    movdqu     xmm4, [edx]      // 8 dst pixels
+    movdqu     xmm5, [edx + 16]
+    pand       xmm2, xmm0       // keep top byte (alpha) of src
+    pand       xmm3, xmm0
+    pand       xmm4, xmm1       // keep low 3 bytes of dst
+    pand       xmm5, xmm1
+    por        xmm2, xmm4       // combine
+    por        xmm3, xmm5
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
+    lea        edx, [edx + 32]
+    sub        ecx, 8           // 8 pixels consumed per iteration
+    jg         convertloop      // do-while: body runs at least once
+
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// Copies only the alpha byte of each src pixel into dst; width in pixels.
+__declspec(naked)
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // width
+    vpcmpeqb   ymm0, ymm0, ymm0
+    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+
+  convertloop:
+    vmovdqu    ymm1, [eax]      // 16 src pixels
+    vmovdqu    ymm2, [eax + 32]
+    lea        eax, [eax + 64]
+    vpblendvb  ymm1, ymm1, [edx], ymm0  // low 3 bytes from dst, top byte from src
+    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
+    vmovdqu    [edx], ymm1
+    vmovdqu    [edx + 32], ymm2
+    lea        edx, [edx + 64]
+    sub        ecx, 16          // 16 pixels consumed per iteration
+    jg         convertloop      // do-while: body runs at least once
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// Writes each src byte into the alpha (top) byte of a dst pixel; width in pixels.
+__declspec(naked)
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    pslld      xmm0, 24
+    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    psrld      xmm1, 8
+
+  convertloop:
+    movq       xmm2, qword ptr [eax]  // 8 Y's
+    lea        eax, [eax + 8]
+    punpcklbw  xmm2, xmm2       // each Y duplicated into a word
+    punpckhwd  xmm3, xmm2       // Y4..Y7 in high words; stale xmm3 low words are masked off below
+    punpcklwd  xmm2, xmm2       // Y0..Y3 replicated across each dword
+    movdqu     xmm4, [edx]      // 8 dst pixels
+    movdqu     xmm5, [edx + 16]
+    pand       xmm2, xmm0       // keep only the top byte (new alpha)
+    pand       xmm3, xmm0
+    pand       xmm4, xmm1       // keep low 3 bytes of dst
+    pand       xmm5, xmm1
+    por        xmm2, xmm4       // combine
+    por        xmm3, xmm5
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
+    lea        edx, [edx + 32]
+    sub        ecx, 8           // 8 pixels consumed per iteration
+    jg         convertloop      // do-while: body runs at least once
+
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// Writes each src byte into the alpha (top) byte of a dst pixel; width in pixels.
+__declspec(naked)
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // width
+    vpcmpeqb   ymm0, ymm0, ymm0
+    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+
+  convertloop:
+    vpmovzxbd  ymm1, qword ptr [eax]      // 8 src bytes zero-extended to dwords
+    vpmovzxbd  ymm2, qword ptr [eax + 8]
+    lea        eax, [eax + 16]
+    vpslld     ymm1, ymm1, 24             // move each byte to the top (alpha) position
+    vpslld     ymm2, ymm2, 24
+    vpblendvb  ymm1, ymm1, [edx], ymm0    // low 3 bytes from dst, top byte from src
+    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
+    vmovdqu    [edx], ymm1
+    vmovdqu    [edx + 32], ymm2
+    lea        edx, [edx + 64]
+    sub        ecx, 16                    // 16 pixels consumed per iteration
+    jg         convertloop                // do-while: body runs at least once
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+// Write 'count' bytes using an 8 bit value repeated.
+// Count should be multiple of 4: count / 4 dwords are stored, remainder dropped.
+__declspec(naked)
+void SetRow_X86(uint8* dst, uint8 v8, int count) {
+  __asm {
+    movzx      eax, byte ptr [esp + 8]    // v8
+    mov        edx, 0x01010101  // Duplicate byte to all bytes.
+    mul        edx              // overwrites edx with upper part of result.
+    mov        edx, edi         // stash edi in edx instead of the stack
+    mov        edi, [esp + 4]   // dst
+    mov        ecx, [esp + 12]  // count
+    shr        ecx, 2           // bytes -> dwords
+    rep stosd                   // store eax to [edi], ecx times
+    mov        edi, edx         // restore edi
+    ret
+  }
+}
+
+// Write 'count' bytes using an 8 bit value repeated (rep stosb, any count).
+__declspec(naked)
+void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
+  __asm {
+    mov        edx, edi         // stash edi in edx instead of the stack
+    mov        edi, [esp + 4]   // dst
+    mov        eax, [esp + 8]   // v8 (only al is used by stosb)
+    mov        ecx, [esp + 12]  // count
+    rep stosb                   // store al to [edi], ecx times
+    mov        edi, edx         // restore edi
+    ret
+  }
+}
+
+// Write 'count' 32 bit values (v32 repeated) starting at dst_argb.
+__declspec(naked)
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+  __asm {
+    mov        edx, edi         // stash edi in edx instead of the stack
+    mov        edi, [esp + 4]   // dst
+    mov        eax, [esp + 8]   // v32
+    mov        ecx, [esp + 12]  // count
+    rep stosd                   // store eax to [edi], ecx times
+    mov        edi, edx         // restore edi
+    ret
+  }
+}
+#endif  // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_AVX2
+__declspec(naked)  // Extracts the Y bytes (even bytes) of a YUY2 row, 32 pixels per loop.
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
+  __asm {
+    mov        eax, [esp + 4]    // src_yuy2
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpand      ymm0, ymm0, ymm5   // even bytes are Y
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8   // restore linear qword order
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32            // 32 pixels consumed per iteration
+    jg         convertloop        // do-while: body runs at least once
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked)  // Split YUY2 chroma into dst_u/dst_v, averaging the row with the row at +stride_yuy2.
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi                   // two pushes: arguments now start at [esp + 8 + 4].
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_yuy2
+    mov        esi, [esp + 8 + 8]    // stride_yuy2
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // width
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v.
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vpavgb     ymm0, ymm0, [eax + esi]       // average with the row one stride away.
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8   // reorder qwords 0,2,1,3 to undo the lane mutation.
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8  // gathers the 16 valid bytes into the low 128 bits.
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]   // 16 U and 16 V bytes written per iteration.
+    sub        ecx, 32           // 32 pixels consumed per iteration.
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked)  // Split YUY2 chroma of a single row into dst_u/dst_v (no vertical averaging).
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi                   // one push: arguments now start at [esp + 4 + 4].
+    mov        eax, [esp + 4 + 4]    // src_yuy2
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // width
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v.
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8   // reorder qwords 0,2,1,3 to undo the lane mutation.
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8  // gathers the 16 valid bytes into the low 128 bits.
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]   // 16 U and 16 V bytes written per iteration.
+    sub        ecx, 32           // 32 pixels consumed per iteration.
+    jg         convertloop
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked)  // Copy the Y bytes of a UYVY row to dst_y; 64 source bytes -> 32 Y bytes per loop.
+void UYVYToYRow_AVX2(const uint8* src_uyvy,
+                     uint8* dst_y, int width) {
+  __asm {
+    mov        eax, [esp + 4]    // src_uyvy
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // width
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1   // mutates: packs within each 128 bit lane.
+    vpermq     ymm0, ymm0, 0xd8   // reorder qwords 0,2,1,3 to undo the lane mutation.
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32            // 32 pixels done; loop body always runs at least once.
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked)  // Split UYVY chroma into dst_u/dst_v, averaging the row with the row at +stride_uyvy.
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi                   // two pushes: arguments now start at [esp + 8 + 4].
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_uyvy
+    mov        esi, [esp + 8 + 8]    // stride_uyvy
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // width
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v.
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vpavgb     ymm0, ymm0, [eax + esi]       // average with the row one stride away.
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    lea        eax,  [eax + 64]
+    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8   // reorder qwords 0,2,1,3 to undo the lane mutation.
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8  // gathers the 16 valid bytes into the low 128 bits.
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]   // 16 U and 16 V bytes written per iteration.
+    sub        ecx, 32           // 32 pixels consumed per iteration.
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked)  // Split UYVY chroma of a single row into dst_u/dst_v (no vertical averaging).
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi                   // one push: arguments now start at [esp + 4 + 4].
+    mov        eax, [esp + 4 + 4]    // src_uyvy
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // width
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v.
+
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8   // reorder qwords 0,2,1,3 to undo the lane mutation.
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8  // gathers the 16 valid bytes into the low 128 bits.
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]   // 16 U and 16 V bytes written per iteration.
+    sub        ecx, 32           // 32 pixels consumed per iteration.
+    jg         convertloop
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_YUY2TOYROW_SSE2
+__declspec(naked)  // Copy the Y bytes of a YUY2 row to dst_y; 32 source bytes -> 16 Y bytes per loop.
+void YUY2ToYRow_SSE2(const uint8* src_yuy2,
+                     uint8* dst_y, int width) {
+  __asm {
+    mov        eax, [esp + 4]    // src_yuy2
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // width
+    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm5   // even bytes are Y
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1   // 16 words -> 16 Y bytes.
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16      // 16 pixels done; loop body always runs at least once.
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked)  // Split YUY2 chroma into dst_u/dst_v, averaging the row with the row at +stride_yuy2.
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi                   // two pushes: arguments now start at [esp + 8 + 4].
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_yuy2
+    mov        esi, [esp + 8 + 8]    // stride_yuy2
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v.
+
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]         // same columns, one stride away.
+    movdqu     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2                // average the two rows.
+    pavgb      xmm1, xmm3
+    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + edi], xmm1
+    lea        edx, [edx + 8]    // 8 U and 8 V bytes written per iteration.
+    sub        ecx, 16           // 16 pixels consumed per iteration.
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // Split YUY2 chroma of a single row into dst_u/dst_v (no vertical averaging).
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi                   // one push: arguments now start at [esp + 4 + 4].
+    mov        eax, [esp + 4 + 4]    // src_yuy2
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v.
+
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + edi], xmm1
+    lea        edx, [edx + 8]    // 8 U and 8 V bytes written per iteration.
+    sub        ecx, 16           // 16 pixels consumed per iteration.
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked)  // Copy the Y bytes of a UYVY row to dst_y; 32 source bytes -> 16 Y bytes per loop.
+void UYVYToYRow_SSE2(const uint8* src_uyvy,
+                     uint8* dst_y, int width) {
+  __asm {
+    mov        eax, [esp + 4]    // src_uyvy
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // width
+
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8    // odd bytes are Y
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1 // 16 words -> 16 Y bytes.
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16    // 16 pixels done; loop body always runs at least once.
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked)  // Split UYVY chroma into dst_u/dst_v, averaging the row with the row at +stride_uyvy.
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi                   // two pushes: arguments now start at [esp + 8 + 4].
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_uyvy
+    mov        esi, [esp + 8 + 8]    // stride_uyvy
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v.
+
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]         // same columns, one stride away.
+    movdqu     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2                // average the two rows.
+    pavgb      xmm1, xmm3
+    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + edi], xmm1
+    lea        edx, [edx + 8]    // 8 U and 8 V bytes written per iteration.
+    sub        ecx, 16           // 16 pixels consumed per iteration.
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)  // Split UYVY chroma of a single row into dst_u/dst_v (no vertical averaging).
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi                   // one push: arguments now start at [esp + 4 + 4].
+    mov        eax, [esp + 4 + 4]    // src_uyvy
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v.
+
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + edi], xmm1
+    lea        edx, [edx + 8]    // 8 U and 8 V bytes written per iteration.
+    sub        ecx, 16           // 16 pixels consumed per iteration.
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+#endif  // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_BLENDPLANEROW_SSSE3
+// Blend 8 pixels at a time. In the formulas: A2 = src0, B2 = src1, C2 = alpha.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+__declspec(naked)
+void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
+                         const uint8* alpha, uint8* dst, int width) {
+  __asm {
+    push       esi              // two pushes: arguments now start at [esp + 8 + 4].
+    push       edi
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    psllw      xmm5, 8
+    mov        eax, 0x80808080  // 128 for biasing image to signed.
+    movd       xmm6, eax
+    pshufd     xmm6, xmm6, 0x00 // broadcast to all 4 dwords.
+
+    mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
+    movd       xmm7, eax
+    pshufd     xmm7, xmm7, 0x00 // broadcast to all 4 dwords.
+    mov        eax, [esp + 8 + 4]   // src0
+    mov        edx, [esp + 8 + 8]   // src1
+    mov        esi, [esp + 8 + 12]  // alpha
+    mov        edi, [esp + 8 + 16]  // dst
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        eax, esi         // src0, src1 and dst become offsets from alpha,
+    sub        edx, esi         // so advancing esi alone advances all four pointers.
+    sub        edi, esi
+
+    // 8 pixel loop.
+  convertloop8:
+    movq       xmm0, qword ptr [esi]        // alpha
+    punpcklbw  xmm0, xmm0         // duplicate each alpha byte into a byte pair.
+    pxor       xmm0, xmm5         // a, 255-a
+    movq       xmm1, qword ptr [eax + esi]  // src0
+    movq       xmm2, qword ptr [edx + esi]  // src1
+    punpcklbw  xmm1, xmm2         // interleave src0, src1 to line up with a, 255-a.
+    psubb      xmm1, xmm6         // bias src0/1 - 128
+    pmaddubsw  xmm0, xmm1         // a * (src0 - 128) + (255 - a) * (src1 - 128) per word.
+    paddw      xmm0, xmm7         // unbias result - 32768 and round.
+    psrlw      xmm0, 8
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edi + esi], xmm0
+    lea        esi, [esi + 8]
+    sub        ecx, 8
+    jg         convertloop8
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_BLENDPLANEROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time. In the formulas: A2 = src0, B2 = src1, C2 = alpha.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+__declspec(naked)
+void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
+                         const uint8* alpha, uint8* dst, int width) {
+  __asm {
+    push        esi              // two pushes: arguments now start at [esp + 8 + 4].
+    push        edi
+    vpcmpeqb    ymm5, ymm5, ymm5       // generate mask 0xff00ff00
+    vpsllw      ymm5, ymm5, 8
+    mov         eax, 0x80808080  // 128 for biasing image to signed.
+    vmovd       xmm6, eax
+    vbroadcastss ymm6, xmm6      // broadcast to all 8 dwords.
+    mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
+    vmovd       xmm7, eax
+    vbroadcastss ymm7, xmm7      // broadcast to all 8 dwords.
+    mov         eax, [esp + 8 + 4]   // src0
+    mov         edx, [esp + 8 + 8]   // src1
+    mov         esi, [esp + 8 + 12]  // alpha
+    mov         edi, [esp + 8 + 16]  // dst
+    mov         ecx, [esp + 8 + 20]  // width
+    sub         eax, esi         // src0, src1 and dst become offsets from alpha,
+    sub         edx, esi         // so advancing esi alone advances all four pointers.
+    sub         edi, esi
+
+    // 32 pixel loop.
+  convertloop32:
+    vmovdqu     ymm0, [esi]        // alpha
+    vpunpckhbw  ymm3, ymm0, ymm0   // 8..15, 24..31
+    vpunpcklbw  ymm0, ymm0, ymm0   // 0..7, 16..23
+    vpxor       ymm3, ymm3, ymm5   // a, 255-a
+    vpxor       ymm0, ymm0, ymm5   // a, 255-a
+    vmovdqu     ymm1, [eax + esi]  // src0
+    vmovdqu     ymm2, [edx + esi]  // src1
+    vpunpckhbw  ymm4, ymm1, ymm2   // interleave src0, src1 (pixels 8..15, 24..31).
+    vpunpcklbw  ymm1, ymm1, ymm2   // interleave src0, src1 (pixels 0..7, 16..23).
+    vpsubb      ymm4, ymm4, ymm6   // bias src0/1 - 128
+    vpsubb      ymm1, ymm1, ymm6   // bias src0/1 - 128
+    vpmaddubsw  ymm3, ymm3, ymm4   // a * (src0 - 128) + (255 - a) * (src1 - 128) per word.
+    vpmaddubsw  ymm0, ymm0, ymm1
+    vpaddw      ymm3, ymm3, ymm7   // unbias result - 32768 and round.
+    vpaddw      ymm0, ymm0, ymm7   // unbias result - 32768 and round.
+    vpsrlw      ymm3, ymm3, 8
+    vpsrlw      ymm0, ymm0, 8
+    vpackuswb   ymm0, ymm0, ymm3   // per-lane pack undoes the per-lane unpack order.
+    vmovdqu     [edi + esi], ymm0
+    lea         esi, [esi + 32]
+    sub         ecx, 32
+    jg          convertloop32
+
+    pop         edi
+    pop         esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_BLENDPLANEROW_AVX2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha: copies each pixel's byte 3 into the low byte of two words.
+static const uvec8 kShuffleAlpha = {  // 0x80 entries make pshufb write a zero byte.
+  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+
+// Blend 4 pixels at a time, then 1 pixel at a time for any remainder.
+__declspec(naked)
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width) {
+  __asm {
+    push       esi              // one push: arguments now start at [esp + 4 + 4].
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
+    psrlw      xmm7, 15
+    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
+    psrlw      xmm6, 8
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    psllw      xmm5, 8
+    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pslld      xmm4, 24
+    sub        ecx, 4
+    jl         convertloop4b    // less than 4 pixels?
+
+    // 4 pixel loop.
+  convertloop4:
+    movdqu     xmm3, [eax]      // src argb
+    lea        eax, [eax + 16]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movdqu     xmm2, [esi]      // _r_b
+    pshufb     xmm3, xmmword ptr kShuffleAlpha // 255 - alpha in every word
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
+    movdqu     xmm1, [esi]      // _a_g
+    lea        esi, [esi + 16]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jge        convertloop4
+
+  convertloop4b:
+    add        ecx, 4 - 1       // ecx = remaining pixels - 1; negative means none left.
+    jl         convertloop1b
+
+    // 1 pixel loop.
+  convertloop1:
+    movd       xmm3, [eax]      // src argb
+    lea        eax, [eax + 4]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movd       xmm2, [esi]      // _r_b
+    pshufb     xmm3, xmmword ptr kShuffleAlpha // 255 - alpha in every word
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
+    movd       xmm1, [esi]      // _a_g
+    lea        esi, [esi + 4]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    sub        ecx, 1
+    jge        convertloop1
+
+  convertloop1b:
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle tables duplicating alpha into the 3 color words of each pixel; the alpha word is zeroed.
+static const uvec8 kShuffleAlpha0 = {  // pixels 0 and 1 (alpha at bytes 3 and 7).
+  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+static const uvec8 kShuffleAlpha1 = {  // pixels 2 and 3 (alpha at bytes 11 and 15).
+  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+__declspec(naked)  // Multiply the B, G, R of each ARGB pixel by its alpha, keeping alpha; 4 pixels per loop.
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
+    pslld      xmm3, 24
+    movdqa     xmm4, xmmword ptr kShuffleAlpha0
+    movdqa     xmm5, xmmword ptr kShuffleAlpha1
+
+ convertloop:
+    movdqu     xmm0, [eax]      // read 4 pixels
+    pshufb     xmm0, xmm4       // isolate first 2 alphas
+    movdqu     xmm1, [eax]      // read 4 pixels
+    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
+    pmulhuw    xmm0, xmm1       // rgb * a
+    movdqu     xmm1, [eax]      // read 4 pixels
+    pshufb     xmm1, xmm5       // isolate next 2 alphas
+    movdqu     xmm2, [eax]      // read 4 pixels
+    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
+    pmulhuw    xmm1, xmm2       // rgb * a
+    movdqu     xmm2, [eax]      // mask original alpha
+    lea        eax, [eax + 16]
+    pand       xmm2, xmm3
+    psrlw      xmm0, 8          // keep the high byte of each 16 bit product.
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    por        xmm0, xmm2       // copy original alpha
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha: copies word 3 of each unpacked pixel to words 0..2, zeroes word 3.
+static const uvec8 kShuffleAlpha_AVX2 = {
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+};
+__declspec(naked)  // Multiply the B, G, R of each ARGB pixel by its alpha, keeping alpha; 8 pixels per loop.
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax         // edx = dst - src, so [eax + edx] addresses dst.
+    vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
+    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
+    vpslld     ymm5, ymm5, 24
+
+ convertloop:
+    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
+    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
+    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
+    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
+    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
+    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
+    vpand      ymm6, ymm6, ymm5  // isolate alpha
+    vpsrlw     ymm0, ymm0, 8     // keep the high byte of each 16 bit product.
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vpor       ymm0, ymm0, ymm6  // copy original alpha
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time, using one fixed_invtbl8 entry per pixel, indexed by its alpha.
+__declspec(naked)
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  __asm {
+    push       ebx                   // three pushes: arguments now start at [esp + 12 + 4].
+    push       esi
+    push       edi
+    mov        eax, [esp + 12 + 4]   // src_argb
+    mov        edx, [esp + 12 + 8]   // dst_argb
+    mov        ecx, [esp + 12 + 12]  // width
+    lea        ebx, fixed_invtbl8    // table of dword entries, indexed by alpha.
+
+ convertloop:
+    movdqu     xmm0, [eax]      // read 4 pixels
+    movzx      esi, byte ptr [eax + 3]  // first alpha
+    movzx      edi, byte ptr [eax + 7]  // second alpha
+    punpcklbw  xmm0, xmm0       // first 2
+    movd       xmm2, dword ptr [ebx + esi * 4]
+    movd       xmm3, dword ptr [ebx + edi * 4]
+    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
+    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    movlhps    xmm2, xmm3       // combine: pixel 0 in low qword, pixel 1 in high qword.
+    pmulhuw    xmm0, xmm2       // rgb * a
+
+    movdqu     xmm1, [eax]      // read 4 pixels
+    movzx      esi, byte ptr [eax + 11]  // third alpha
+    movzx      edi, byte ptr [eax + 15]  // fourth alpha
+    punpckhbw  xmm1, xmm1       // next 2
+    movd       xmm2, dword ptr [ebx + esi * 4]
+    movd       xmm3, dword ptr [ebx + edi * 4]
+    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
+    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    movlhps    xmm2, xmm3       // combine: pixel 2 in low qword, pixel 3 in high qword.
+    pmulhuw    xmm1, xmm2       // rgb * a
+    lea        eax, [eax + 16]
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+#endif  // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha: copies word 0 of each 8 byte group to words 0..2, keeps word 3.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
+// USE_GATHER is not on by default, due to being a slow instruction.
+#ifdef USE_GATHER
+__declspec(naked)  // USE_GATHER variant: unattenuate 8 pixels per loop, fetching table entries with vpgatherdd.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax         // edx = dst - src, so [eax + edx] addresses dst.
+    vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
+
+ convertloop:
+    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather (redone each loop: gather clears it).
+    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
+    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
+    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
+    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
+    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
+    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
+    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
+    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
+    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
+    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
+    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#else  // USE_GATHER
+__declspec(naked)  // Non-gather variant: unattenuate 8 pixels per loop, fetching table entries with scalar loads.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  __asm {
+
+    push       ebx                   // three pushes: arguments now start at [esp + 12 + 4].
+    push       esi
+    push       edi
+    mov        eax, [esp + 12 + 4]   // src_argb
+    mov        edx, [esp + 12 + 8]   // dst_argb
+    mov        ecx, [esp + 12 + 12]  // width
+    sub        edx, eax              // edx = dst - src, so [eax + edx] addresses dst.
+    lea        ebx, fixed_invtbl8    // table of dword entries, indexed by alpha.
+    vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
+
+ convertloop:
+    // replace VPGATHER
+    movzx      esi, byte ptr [eax + 3]                 // alpha0
+    movzx      edi, byte ptr [eax + 7]                 // alpha1
+    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
+    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
+    movzx      esi, byte ptr [eax + 11]                // alpha2
+    movzx      edi, byte ptr [eax + 15]                // alpha3
+    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
+    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
+    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
+    movzx      esi, byte ptr [eax + 19]                // alpha4
+    movzx      edi, byte ptr [eax + 23]                // alpha5
+    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
+    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
+    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
+    movzx      esi, byte ptr [eax + 27]                // alpha6
+    movzx      edi, byte ptr [eax + 31]                // alpha7
+    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
+    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
+    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
+    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
+    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
+    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
+    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+    // end of VPGATHER
+
+    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
+    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
+    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
+    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
+    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
+    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
+    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
+    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
+    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    vzeroupper
+    ret
+  }
+}
+#endif  // USE_GATHER
+#endif  // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+__declspec(naked)
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_argb */
+    mov        ecx, [esp + 12]  /* width */
+    movdqa     xmm4, xmmword ptr kARGBToYJ
+    movdqa     xmm5, xmmword ptr kAddYJ64
+
+ convertloop:
+    movdqu     xmm0, [eax]  // G
+    movdqu     xmm1, [eax + 16]
+    pmaddubsw  xmm0, xmm4   // weight adjacent channel pairs by kARGBToYJ.
+    pmaddubsw  xmm1, xmm4
+    phaddw     xmm0, xmm1   // sum the pairs: one 16 bit value per pixel.
+    paddw      xmm0, xmm5  // Add .5 for rounding.
+    psrlw      xmm0, 7
+    packuswb   xmm0, xmm0   // 8 G bytes
+    movdqu     xmm2, [eax]  // A
+    movdqu     xmm3, [eax + 16]
+    lea        eax, [eax + 32]
+    psrld      xmm2, 24     // alpha moved to the low byte of each dword.
+    psrld      xmm3, 24
+    packuswb   xmm2, xmm3
+    packuswb   xmm2, xmm2   // 8 A bytes
+    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
+    punpcklbw  xmm0, xmm0   // 8 GG words
+    punpcklbw  xmm3, xmm2   // 8 GA words
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm3   // GGGA first 4
+    punpckhwd  xmm1, xmm3   // GGGA next 4
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+// Constants for ARGB color to sepia tone; each table holds B, G, R, A weights repeated for 4 pixels.
+static const vec8 kARGBToSepiaB = {
+  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static const vec8 kARGBToSepiaG = {
+  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static const vec8 kARGBToSepiaR = {
+  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place in dst_argb.
+__declspec(naked)
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* dst_argb */
+    mov        ecx, [esp + 8]   /* width */
+    movdqa     xmm2, xmmword ptr kARGBToSepiaB
+    movdqa     xmm3, xmmword ptr kARGBToSepiaG
+    movdqa     xmm4, xmmword ptr kARGBToSepiaR
+
+ convertloop:
+    movdqu     xmm0, [eax]  // B
+    movdqu     xmm6, [eax + 16]
+    pmaddubsw  xmm0, xmm2   // weight adjacent channel pairs.
+    pmaddubsw  xmm6, xmm2
+    phaddw     xmm0, xmm6   // sum the pairs: one 16 bit value per pixel.
+    psrlw      xmm0, 7
+    packuswb   xmm0, xmm0   // 8 B values
+    movdqu     xmm5, [eax]  // G
+    movdqu     xmm1, [eax + 16]
+    pmaddubsw  xmm5, xmm3
+    pmaddubsw  xmm1, xmm3
+    phaddw     xmm5, xmm1
+    psrlw      xmm5, 7
+    packuswb   xmm5, xmm5   // 8 G values
+    punpcklbw  xmm0, xmm5   // 8 BG values
+    movdqu     xmm5, [eax]  // R
+    movdqu     xmm1, [eax + 16]
+    pmaddubsw  xmm5, xmm4
+    pmaddubsw  xmm1, xmm4
+    phaddw     xmm5, xmm1
+    psrlw      xmm5, 7
+    packuswb   xmm5, xmm5   // 8 R values
+    movdqu     xmm6, [eax]  // A
+    movdqu     xmm1, [eax + 16]
+    psrld      xmm6, 24     // alpha moved to the low byte of each dword.
+    psrld      xmm1, 24
+    packuswb   xmm6, xmm1
+    packuswb   xmm6, xmm6   // 8 A values
+    punpcklbw  xmm5, xmm6   // 8 RA values
+    movdqa     xmm1, xmm0   // Weave BG, RA together
+    punpcklwd  xmm0, xmm5   // BGRA first 4
+    punpckhwd  xmm1, xmm5   // BGRA next 4
+    movdqu     [eax], xmm0  // written back over the source pixels.
+    movdqu     [eax + 16], xmm1
+    lea        eax, [eax + 32]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) per loop pass with a color matrix of 16 signed bytes: 4 weights each for the new B, G, R and A; sums are shifted right by 6.
+// Same as Sepia except matrix is provided.
+// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
+// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
+__declspec(naked)
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_argb */
+    mov        ecx, [esp + 12]  /* matrix_argb */
+    movdqu     xmm5, [ecx]      // all 16 matrix bytes
+    pshufd     xmm2, xmm5, 0x00  // broadcast bytes 0..3: weights for B
+    pshufd     xmm3, xmm5, 0x55  // broadcast bytes 4..7: weights for G
+    pshufd     xmm4, xmm5, 0xaa  // broadcast bytes 8..11: weights for R
+    pshufd     xmm5, xmm5, 0xff  // broadcast bytes 12..15: weights for A
+    mov        ecx, [esp + 16]  /* width */
+
+ convertloop:
+    movdqu     xmm0, [eax]  // B
+    movdqu     xmm7, [eax + 16]
+    pmaddubsw  xmm0, xmm2   // unsigned pixel byte * signed weight, pairs added
+    pmaddubsw  xmm7, xmm2
+    movdqu     xmm6, [eax]  // G
+    movdqu     xmm1, [eax + 16]
+    pmaddubsw  xmm6, xmm3
+    pmaddubsw  xmm1, xmm3
+    phaddsw    xmm0, xmm7   // B
+    phaddsw    xmm6, xmm1   // G
+    psraw      xmm0, 6      // B
+    psraw      xmm6, 6      // G
+    packuswb   xmm0, xmm0   // 8 B values
+    packuswb   xmm6, xmm6   // 8 G values
+    punpcklbw  xmm0, xmm6   // 8 BG values
+    movdqu     xmm1, [eax]  // R
+    movdqu     xmm7, [eax + 16]
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm7, xmm4
+    phaddsw    xmm1, xmm7   // R
+    movdqu     xmm6, [eax]  // A
+    movdqu     xmm7, [eax + 16]
+    pmaddubsw  xmm6, xmm5
+    pmaddubsw  xmm7, xmm5
+    phaddsw    xmm6, xmm7   // A
+    psraw      xmm1, 6      // R
+    psraw      xmm6, 6      // A
+    packuswb   xmm1, xmm1   // 8 R values
+    packuswb   xmm6, xmm6   // 8 A values
+    punpcklbw  xmm1, xmm6   // 8 RA values
+    movdqa     xmm6, xmm0   // Weave BG, RA together
+    punpcklwd  xmm0, xmm1   // BGRA first 4
+    punpckhwd  xmm6, xmm1   // BGRA next 4
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm6
+    lea        eax, [eax + 32]
+    lea        edx, [edx + 32]
+    sub        ecx, 8       // 8 pixels consumed per pass
+    jg         convertloop  // repeat while pixels remain
+    ret
+  }
+}
+#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes) per loop pass, in place: color bytes become (v * scale >> 16) * interval_size + interval_offset; the original alpha is OR-ed back in.
+__declspec(naked)
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  __asm {
+    mov        eax, [esp + 4]    /* dst_argb */
+    movd       xmm2, [esp + 8]   /* scale */
+    movd       xmm3, [esp + 12]  /* interval_size */
+    movd       xmm4, [esp + 16]  /* interval_offset */
+    mov        ecx, [esp + 20]   /* width */
+    pshuflw    xmm2, xmm2, 040h  // words 0..2 = low word of scale, word 3 = its high word
+    pshufd     xmm2, xmm2, 044h  // copy the low 4 words to the high 4
+    pshuflw    xmm3, xmm3, 040h  // same layout for interval_size
+    pshufd     xmm3, xmm3, 044h
+    pshuflw    xmm4, xmm4, 040h  // same layout for interval_offset
+    pshufd     xmm4, xmm4, 044h
+    pxor       xmm5, xmm5  // constant 0
+    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
+    pslld      xmm6, 24
+
+ convertloop:
+    movdqu     xmm0, [eax]  // read 4 pixels
+    punpcklbw  xmm0, xmm5   // first 2 pixels
+    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
+    movdqu     xmm1, [eax]  // read 4 pixels
+    punpckhbw  xmm1, xmm5   // next 2 pixels
+    pmulhuw    xmm1, xmm2
+    pmullw     xmm0, xmm3   // * interval_size
+    movdqu     xmm7, [eax]  // read 4 pixels
+    pmullw     xmm1, xmm3
+    pand       xmm7, xmm6   // mask alpha
+    paddw      xmm0, xmm4   // + interval_offset
+    paddw      xmm1, xmm4
+    packuswb   xmm0, xmm1   // back to bytes with unsigned saturation
+    por        xmm0, xmm7   // merge the saved alpha bytes
+    movdqu     [eax], xmm0
+    lea        eax, [eax + 16]
+    sub        ecx, 4       // 4 pixels consumed per pass
+    jg         convertloop  // repeat while pixels remain
+    ret
+  }
+}
+#endif  // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value: each byte of a pixel is scaled by the corresponding byte of the 32 bit value.
+__declspec(naked)
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    movd       xmm2, [esp + 16]  // value
+    punpcklbw  xmm2, xmm2       // duplicate each value byte into a 16 bit word
+    punpcklqdq xmm2, xmm2       // repeat the 4 words for the second pixel
+
+ convertloop:
+    movdqu     xmm0, [eax]      // read 4 pixels
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm0       // first 2
+    punpckhbw  xmm1, xmm1       // next 2
+    pmulhuw    xmm0, xmm2       // argb * value
+    pmulhuw    xmm1, xmm2       // argb * value
+    psrlw      xmm0, 8          // scale the 16 bit product back to 8 bits
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4           // 4 pixels consumed per pass
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time; every byte (alpha included) is multiplied with its counterpart and scaled back to 8 bits.
+__declspec(naked)
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pxor       xmm5, xmm5  // constant 0
+
+ convertloop:
+    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm1, xmm0         // register copies for the high halves
+    movdqu     xmm3, xmm2
+    punpcklbw  xmm0, xmm0         // first 2
+    punpckhbw  xmm1, xmm1         // next 2
+    punpcklbw  xmm2, xmm5         // first 2
+    punpckhbw  xmm3, xmm5         // next 2
+    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
+    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
+    lea        eax, [eax + 16]
+    lea        esi, [esi + 16]
+    packuswb   xmm0, xmm1         // back to 4 pixels of bytes
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4             // 4 pixels consumed per pass
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together with unsigned saturation, 4 pixels at a time, then 1 pixel at a time for any remainder.
+// TODO(fbarchard): Port this to posix, neon and other math functions.
+__declspec(naked)
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+    sub        ecx, 4
+    jl         convertloop49      // fewer than 4 pixels: skip the 4 pixel loop
+
+ convertloop4:
+    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    lea        eax, [eax + 16]
+    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    lea        esi, [esi + 16]
+    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jge        convertloop4       // loop while at least 4 pixels remain
+
+ convertloop49:
+    add        ecx, 4 - 1         // ecx = remaining pixels - 1
+    jl         convertloop19      // nothing left
+
+ convertloop1:
+    movd       xmm0, [eax]        // read 1 pixel from src_argb0
+    lea        eax, [eax + 4]
+    movd       xmm1, [esi]        // read 1 pixel from src_argb1
+    lea        esi, [esi + 4]
+    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    sub        ecx, 1
+    jge        convertloop1
+
+ convertloop19:
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1, unsigned saturating per byte), 4 pixels at a time.
+__declspec(naked)
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+ convertloop:
+    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    lea        eax, [eax + 16]
+    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    lea        esi, [esi + 16]
+    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4             // 4 pixels consumed per pass
+    jg         convertloop        // repeat while pixels remain
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time; every byte is multiplied with its counterpart and scaled back to 8 bits.
+__declspec(naked)
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    vpxor      ymm5, ymm5, ymm5     // constant 0
+
+ convertloop:
+    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
+    lea        eax, [eax + 32]
+    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
+    lea        esi, [esi + 32]
+    vpunpcklbw ymm0, ymm1, ymm1   // low 4
+    vpunpckhbw ymm1, ymm1, ymm1   // high 4
+    vpunpcklbw ymm2, ymm3, ymm5   // low 4
+    vpunpckhbw ymm3, ymm3, ymm5   // high 4
+    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
+    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
+    vpackuswb  ymm0, ymm0, ymm1   // back to 8 pixels of bytes
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 8             // 8 pixels consumed per pass
+    jg         convertloop
+
+    pop        esi
+    vzeroupper                    // clear upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together with unsigned saturation per byte, 8 pixels at a time.
+__declspec(naked)
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+ convertloop:
+    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    lea        eax, [eax + 32]
+    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
+    lea        esi, [esi + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 8                   // 8 pixels consumed per pass
+    jg         convertloop              // repeat while pixels remain
+
+    pop        esi
+    vzeroupper                          // clear upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1, unsigned saturating per byte), 8 pixels at a time.
+__declspec(naked)
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+ convertloop:
+    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    lea        eax, [eax + 32]
+    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
+    lea        esi, [esi + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 8                   // 8 pixels consumed per pass
+    jg         convertloop              // repeat while pixels remain
+
+    pop        esi
+    vzeroupper                          // clear upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is (applied to rows src_y0, src_y1, src_y2; 8 results per loop pass, stored as saturated absolute values)
+// -1  0  1
+// -2  0  2
+// -1  0  1
+__declspec(naked)
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_y0
+    mov        esi, [esp + 8 + 8]   // src_y1
+    mov        edi, [esp + 8 + 12]  // src_y2
+    mov        edx, [esp + 8 + 16]  // dst_sobelx
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        esi, eax             // turn the other pointers into offsets from src_y0
+    sub        edi, eax
+    sub        edx, eax
+    pxor       xmm5, xmm5  // constant 0
+
+ convertloop:
+    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    punpcklbw  xmm0, xmm5                       // widen bytes to 16 bit words
+    punpcklbw  xmm1, xmm5
+    psubw      xmm0, xmm1                       // row 0 difference
+    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
+    punpcklbw  xmm1, xmm5
+    punpcklbw  xmm2, xmm5
+    psubw      xmm1, xmm2                       // row 1 difference
+    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
+    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
+    punpcklbw  xmm2, xmm5
+    punpcklbw  xmm3, xmm5
+    psubw      xmm2, xmm3                       // row 2 difference
+    paddw      xmm0, xmm2                       // row 0 + row 2
+    paddw      xmm0, xmm1                       // + row 1, added twice for the weight of 2
+    paddw      xmm0, xmm1
+    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    psubw      xmm1, xmm0
+    pmaxsw     xmm0, xmm1
+    packuswb   xmm0, xmm0                       // saturate to 8 bytes
+    movq       qword ptr [eax + edx], xmm0      // eax + edx addresses dst_sobelx
+    lea        eax, [eax + 8]
+    sub        ecx, 8                           // 8 pixels consumed per pass
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is (applied to rows src_y0 and src_y1; 8 results per loop pass, stored as saturated absolute values)
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+__declspec(naked)
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_y0
+    mov        esi, [esp + 4 + 8]   // src_y1
+    mov        edx, [esp + 4 + 12]  // dst_sobely
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax             // turn the other pointers into offsets from src_y0
+    sub        edx, eax
+    pxor       xmm5, xmm5  // constant 0
+
+ convertloop:
+    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    punpcklbw  xmm0, xmm5                       // widen bytes to 16 bit words
+    punpcklbw  xmm1, xmm5
+    psubw      xmm0, xmm1                       // column 0 difference
+    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
+    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
+    punpcklbw  xmm1, xmm5
+    punpcklbw  xmm2, xmm5
+    psubw      xmm1, xmm2                       // column 1 difference
+    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
+    punpcklbw  xmm2, xmm5
+    punpcklbw  xmm3, xmm5
+    psubw      xmm2, xmm3                       // column 2 difference
+    paddw      xmm0, xmm2                       // column 0 + column 2
+    paddw      xmm0, xmm1                       // + column 1, added twice for the weight of 2
+    paddw      xmm0, xmm1
+    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    psubw      xmm1, xmm0
+    pmaxsw     xmm0, xmm1
+    packuswb   xmm0, xmm0                       // saturate to 8 bytes
+    movq       qword ptr [eax + edx], xmm0      // eax + edx addresses dst_sobely
+    lea        eax, [eax + 8]
+    sub        ecx, 8                           // 8 pixels consumed per pass
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into ARGB, 16 pixels per loop pass.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+__declspec(naked)
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_sobelx
+    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax             // src_sobely as an offset from src_sobelx
+    pcmpeqb    xmm5, xmm5           // alpha 255
+    pslld      xmm5, 24             // 0xff000000
+
+ convertloop:
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    lea        eax, [eax + 16]
+    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
+    movdqa     xmm2, xmm0             // GG
+    punpcklbw  xmm2, xmm0             // First 8
+    punpckhbw  xmm0, xmm0             // Next 8
+    movdqa     xmm1, xmm2             // GGGG
+    punpcklwd  xmm1, xmm2             // First 4
+    punpckhwd  xmm2, xmm2             // Next 4
+    por        xmm1, xmm5             // GGGA
+    por        xmm2, xmm5
+    movdqa     xmm3, xmm0             // GGGG
+    punpcklwd  xmm3, xmm0             // Next 4
+    punpckhwd  xmm0, xmm0             // Last 4
+    por        xmm3, xmm5             // GGGA
+    por        xmm0, xmm5
+    movdqu     [edx], xmm1            // 16 ARGB pixels = 64 bytes written
+    movdqu     [edx + 16], xmm2
+    movdqu     [edx + 32], xmm3
+    movdqu     [edx + 48], xmm0
+    lea        edx, [edx + 64]
+    sub        ecx, 16                // 16 pixels consumed per pass
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into a plane, 16 pixels per loop pass.
+__declspec(naked)
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_sobelx
+    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        edx, [esp + 4 + 12]  // dst_y
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax             // src_sobely as an offset from src_sobelx
+
+ convertloop:
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    lea        eax, [eax + 16]
+    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16                // 16 pixels consumed per pass
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel (their saturating sum) into ARGB, 16 pixels per loop pass.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+__declspec(naked)
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_sobelx
+    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax             // src_sobely as an offset from src_sobelx
+    pcmpeqb    xmm5, xmm5           // alpha 255
+
+ convertloop:
+    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    lea        eax, [eax + 16]
+    movdqa     xmm2, xmm0
+    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
+    movdqa     xmm3, xmm0             // XA
+    punpcklbw  xmm3, xmm5             // interleave X with alpha, first 8
+    punpckhbw  xmm0, xmm5             // interleave X with alpha, next 8
+    movdqa     xmm4, xmm1             // YS
+    punpcklbw  xmm4, xmm2             // interleave Y with Sobel, first 8
+    punpckhbw  xmm1, xmm2             // interleave Y with Sobel, next 8
+    movdqa     xmm6, xmm4             // YSXA
+    punpcklwd  xmm6, xmm3             // First 4
+    punpckhwd  xmm4, xmm3             // Next 4
+    movdqa     xmm7, xmm1             // YSXA
+    punpcklwd  xmm7, xmm0             // Next 4
+    punpckhwd  xmm1, xmm0             // Last 4
+    movdqu     [edx], xmm6            // 16 ARGB pixels = 64 bytes written
+    movdqu     [edx + 16], xmm4
+    movdqu     [edx + 32], xmm7
+    movdqu     [edx + 48], xmm1
+    lea        edx, [edx + 64]
+    sub        ecx, 16                // 16 pixels consumed per pass
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+// Design note: consider float CumulativeSum.
+// Design note: consider calling CumulativeSum one row at time as needed.
+// Design note: consider circular CumulativeSum buffer of radius * 2 + 1 height.
+// Convert cumulative sum for an area to an average for 1 pixel.
+// topleft is pointer to top left of CumulativeSum buffer for area.
+// botleft is pointer to bottom left of CumulativeSum buffer.
+// width is offset from left to right of area in CumulativeSum buffer measured
+//   in number of ints.
+// area is the number of pixels in the area being averaged.
+// dst points to pixel to store result to.
+// count is number of averaged pixels to produce.
+// Does 4 pixels at a time, then 1 pixel at a time for the remainder; each pixel is 4 int32 sums in, 4 bytes out.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+                                    int width, int area, uint8* dst,
+                                    int count) {
+  __asm {
+    mov        eax, topleft  // eax topleft
+    mov        esi, botleft  // esi botleft
+    mov        edx, width
+    movd       xmm5, area
+    mov        edi, dst
+    mov        ecx, count
+    cvtdq2ps   xmm5, xmm5  // area as float
+    rcpss      xmm4, xmm5  // 1.0f / area (rcpss is an approximate reciprocal)
+    pshufd     xmm4, xmm4, 0  // broadcast the reciprocal to all 4 lanes
+    sub        ecx, 4
+    jl         l4b         // fewer than 4 pixels: go to the 1 pixel loop
+
+    cmp        area, 128  // 128 pixels will not overflow 15 bits.
+    ja         l4         // larger areas take the float path
+
+    pshufd     xmm5, xmm5, 0        // area
+    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
+    psrld      xmm6, 16
+    cvtdq2ps   xmm6, xmm6
+    addps      xmm5, xmm6           // (65536.0 + area - 1)
+    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
+    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
+    packssdw   xmm5, xmm5           // 16 bit shorts
+
+    // 4 pixel loop small blocks.
+  s4:
+    // top left
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+
+    // - top right
+    psubd      xmm0, [eax + edx * 4]
+    psubd      xmm1, [eax + edx * 4 + 16]
+    psubd      xmm2, [eax + edx * 4 + 32]
+    psubd      xmm3, [eax + edx * 4 + 48]
+    lea        eax, [eax + 64]
+
+    // - bottom left
+    psubd      xmm0, [esi]
+    psubd      xmm1, [esi + 16]
+    psubd      xmm2, [esi + 32]
+    psubd      xmm3, [esi + 48]
+
+    // + bottom right
+    paddd      xmm0, [esi + edx * 4]
+    paddd      xmm1, [esi + edx * 4 + 16]
+    paddd      xmm2, [esi + edx * 4 + 32]
+    paddd      xmm3, [esi + edx * 4 + 48]
+    lea        esi, [esi + 64]
+
+    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
+    packssdw   xmm2, xmm3
+
+    pmulhuw    xmm0, xmm5  // sum * fixed point reciprocal, keep high 16 bits
+    pmulhuw    xmm2, xmm5
+
+    packuswb   xmm0, xmm2  // 4 pixels of bytes
+    movdqu     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 4
+    jge        s4          // loop while at least 4 pixels remain
+
+    jmp        l4b
+
+    // 4 pixel loop
+  l4:
+    // top left
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+
+    // - top right
+    psubd      xmm0, [eax + edx * 4]
+    psubd      xmm1, [eax + edx * 4 + 16]
+    psubd      xmm2, [eax + edx * 4 + 32]
+    psubd      xmm3, [eax + edx * 4 + 48]
+    lea        eax, [eax + 64]
+
+    // - bottom left
+    psubd      xmm0, [esi]
+    psubd      xmm1, [esi + 16]
+    psubd      xmm2, [esi + 32]
+    psubd      xmm3, [esi + 48]
+
+    // + bottom right
+    paddd      xmm0, [esi + edx * 4]
+    paddd      xmm1, [esi + edx * 4 + 16]
+    paddd      xmm2, [esi + edx * 4 + 32]
+    paddd      xmm3, [esi + edx * 4 + 48]
+    lea        esi, [esi + 64]
+
+    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
+    cvtdq2ps   xmm1, xmm1
+    mulps      xmm0, xmm4
+    mulps      xmm1, xmm4
+    cvtdq2ps   xmm2, xmm2
+    cvtdq2ps   xmm3, xmm3
+    mulps      xmm2, xmm4
+    mulps      xmm3, xmm4
+    cvtps2dq   xmm0, xmm0   // back to 32 bit integers
+    cvtps2dq   xmm1, xmm1
+    cvtps2dq   xmm2, xmm2
+    cvtps2dq   xmm3, xmm3
+    packssdw   xmm0, xmm1   // narrow to 16 bit, then to bytes
+    packssdw   xmm2, xmm3
+    packuswb   xmm0, xmm2
+    movdqu     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 4
+    jge        l4           // loop while at least 4 pixels remain
+
+  l4b:
+    add        ecx, 4 - 1   // ecx = remaining pixels - 1
+    jl         l1b          // nothing left
+
+    // 1 pixel loop
+  l1:
+    movdqu     xmm0, [eax]            // top left
+    psubd      xmm0, [eax + edx * 4]  // - top right
+    lea        eax, [eax + 16]
+    psubd      xmm0, [esi]            // - bottom left
+    paddd      xmm0, [esi + edx * 4]  // + bottom right
+    lea        esi, [esi + 16]
+    cvtdq2ps   xmm0, xmm0
+    mulps      xmm0, xmm4             // sum * 1 / area
+    cvtps2dq   xmm0, xmm0
+    packssdw   xmm0, xmm0
+    packuswb   xmm0, xmm0
+    movd       dword ptr [edi], xmm0  // store 1 pixel (4 bytes)
+    lea        edi, [edi + 4]
+    sub        ecx, 1
+    jge        l1
+  l1b:
+  }
+}
+#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value. Each 4 byte pixel of row yields 4 int32 sums: the running row total plus previous_cumsum.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+                                  const int32* previous_cumsum, int width) {
+  __asm {
+    mov        eax, row
+    mov        edx, cumsum
+    mov        esi, previous_cumsum
+    mov        ecx, width
+    pxor       xmm0, xmm0   // running per channel sum for this row
+    pxor       xmm1, xmm1   // constant 0 for widening
+
+    sub        ecx, 4
+    jl         l4b          // fewer than 4 pixels: go to the 1 pixel loop
+    test       edx, 15
+    jne        l4b          // cumsum not 16 byte aligned: use the 1 pixel loop only
+
+    // 4 pixel loop
+  l4:
+    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
+    lea        eax, [eax + 16]
+    movdqa     xmm4, xmm2
+
+    punpcklbw  xmm2, xmm1   // widen pixels 0 and 1 to words
+    movdqa     xmm3, xmm2
+    punpcklwd  xmm2, xmm1   // pixel 0 as 4 dwords
+    punpckhwd  xmm3, xmm1   // pixel 1 as 4 dwords
+
+    punpckhbw  xmm4, xmm1   // widen pixels 2 and 3 to words
+    movdqa     xmm5, xmm4
+    punpcklwd  xmm4, xmm1   // pixel 2 as 4 dwords
+    punpckhwd  xmm5, xmm1   // pixel 3 as 4 dwords
+
+    paddd      xmm0, xmm2   // running sum through pixel 0
+    movdqu     xmm2, [esi]  // previous row above.
+    paddd      xmm2, xmm0
+
+    paddd      xmm0, xmm3   // running sum through pixel 1
+    movdqu     xmm3, [esi + 16]
+    paddd      xmm3, xmm0
+
+    paddd      xmm0, xmm4   // running sum through pixel 2
+    movdqu     xmm4, [esi + 32]
+    paddd      xmm4, xmm0
+
+    paddd      xmm0, xmm5   // running sum through pixel 3
+    movdqu     xmm5, [esi + 48]
+    lea        esi, [esi + 64]
+    paddd      xmm5, xmm0
+
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm3
+    movdqu     [edx + 32], xmm4
+    movdqu     [edx + 48], xmm5
+
+    lea        edx, [edx + 64]
+    sub        ecx, 4
+    jge        l4           // loop while at least 4 pixels remain
+
+  l4b:
+    add        ecx, 4 - 1   // ecx = remaining pixels - 1
+    jl         l1b          // nothing left
+
+    // 1 pixel loop
+  l1:
+    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
+    lea        eax, [eax + 4]
+    punpcklbw  xmm2, xmm1   // widen the 4 bytes to 4 dwords
+    punpcklwd  xmm2, xmm1
+    paddd      xmm0, xmm2   // add to the running sum
+    movdqu     xmm2, [esi]  // previous row above
+    lea        esi, [esi + 16]
+    paddd      xmm2, xmm0
+    movdqu     [edx], xmm2
+    lea        edx, [edx + 16]
+    sub        ecx, 1
+    jge        l1
+
+ l1b:
+  }
+}
+#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination. uv_dudv holds 4 floats: the starting (u, v) followed by the per pixel step (du, dv); coordinates are truncated to integers.
+__declspec(naked)
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 12]  // src_argb
+    mov        esi, [esp + 16]  // stride
+    mov        edx, [esp + 20]  // dst_argb
+    mov        ecx, [esp + 24]  // pointer to uv_dudv
+    movq       xmm2, qword ptr [ecx]  // uv
+    movq       xmm7, qword ptr [ecx + 8]  // dudv
+    mov        ecx, [esp + 28]  // width
+    shl        esi, 16          // 4, stride
+    add        esi, 4           // low word = 4 (bytes per pixel), high word = stride
+    movd       xmm5, esi
+    sub        ecx, 4
+    jl         l4b              // fewer than 4 pixels: go to the 1 pixel loop
+
+    // setup for 4 pixel loop
+    pshufd     xmm7, xmm7, 0x44  // dup dudv
+    pshufd     xmm5, xmm5, 0  // dup 4, stride
+    movdqa     xmm0, xmm2    // x0, y0, x1, y1
+    addps      xmm0, xmm7
+    movlhps    xmm2, xmm0
+    movdqa     xmm4, xmm7
+    addps      xmm4, xmm4    // dudv *= 2
+    movdqa     xmm3, xmm2    // x2, y2, x3, y3
+    addps      xmm3, xmm4
+    addps      xmm4, xmm4    // dudv *= 4
+
+    // 4 pixel loop
+  l4:
+    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
+    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
+    packssdw   xmm0, xmm1    // x, y as 8 shorts
+    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
+    movd       esi, xmm0     // byte offset of pixel 0
+    pshufd     xmm0, xmm0, 0x39  // shift right
+    movd       edi, xmm0     // byte offset of pixel 1
+    pshufd     xmm0, xmm0, 0x39  // shift right
+    movd       xmm1, [eax + esi]  // read pixel 0
+    movd       xmm6, [eax + edi]  // read pixel 1
+    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
+    addps      xmm2, xmm4    // x, y += dx, dy first 2
+    movq       qword ptr [edx], xmm1
+    movd       esi, xmm0     // byte offset of pixel 2
+    pshufd     xmm0, xmm0, 0x39  // shift right
+    movd       edi, xmm0     // byte offset of pixel 3
+    movd       xmm6, [eax + esi]  // read pixel 2
+    movd       xmm0, [eax + edi]  // read pixel 3
+    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
+    addps      xmm3, xmm4    // x, y += dx, dy next 2
+    movq       qword ptr 8[edx], xmm6
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jge        l4            // loop while at least 4 pixels remain
+
+  l4b:
+    add        ecx, 4 - 1    // ecx = remaining pixels - 1
+    jl         l1b           // nothing left
+
+    // 1 pixel loop
+  l1:
+    cvttps2dq  xmm0, xmm2    // x, y float to int
+    packssdw   xmm0, xmm0    // x, y as shorts
+    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
+    addps      xmm2, xmm7    // x, y += dx, dy
+    movd       esi, xmm0
+    movd       xmm0, [eax + esi]  // copy a pixel
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    sub        ecx, 1
+    jge        l1
+  l1b:
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1: blends the row at src_ptr with the row at src_ptr + src_stride, weighting the second row by source_y_fraction / 256; 32 bytes per loop pass.
+__declspec(naked)
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    // Dispatch to specialized filters if applicable.
+    cmp        eax, 0
+    je         xloop100  // 0 / 256.  Blend 100 / 0.
+    sub        edi, esi  // dst as an offset from src (the copy path above needs the real pointer)
+    cmp        eax, 128
+    je         xloop50   // 128 /256 is 0.50.  Blend 50 / 50.
+
+    vmovd      xmm0, eax  // high fraction 0..255
+    neg        eax
+    add        eax, 256
+    vmovd      xmm5, eax  // low fraction 255..1 (256 - fraction; fraction is nonzero here)
+    vpunpcklbw xmm5, xmm5, xmm0   // byte pair (low fraction, high fraction)
+    vpunpcklwd xmm5, xmm5, xmm5
+    vbroadcastss ymm5, xmm5       // pair repeated across the register
+
+    mov        eax, 0x80808080  // 128b for bias and rounding.
+    vmovd      xmm4, eax
+    vbroadcastss ymm4, xmm4
+
+  xloop:
+    vmovdqu    ymm0, [esi]        // 32 bytes of the first row
+    vmovdqu    ymm2, [esi + edx]  // 32 bytes of the second row
+    vpunpckhbw ymm1, ymm0, ymm2  // mutates
+    vpunpcklbw ymm0, ymm0, ymm2
+    vpsubb     ymm1, ymm1, ymm4  // bias to signed image
+    vpsubb     ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm5, ymm1  // unsigned fractions * signed pixels, pairs added
+    vpmaddubsw ymm0, ymm5, ymm0
+    vpaddw     ymm1, ymm1, ymm4  // unbias and round
+    vpaddw     ymm0, ymm0, ymm4
+    vpsrlw     ymm1, ymm1, 8     // / 256
+    vpsrlw     ymm0, ymm0, 8
+    vpackuswb  ymm0, ymm0, ymm1  // unmutates
+    vmovdqu    [esi + edi], ymm0  // esi + edi addresses dst
+    lea        esi, [esi + 32]
+    sub        ecx, 32
+    jg         xloop
+    jmp        xloop99
+
+   // Blend 50 / 50.
+ xloop50:
+   vmovdqu    ymm0, [esi]
+   vpavgb     ymm0, ymm0, [esi + edx]  // rounded byte average of the two rows
+   vmovdqu    [esi + edi], ymm0
+   lea        esi, [esi + 32]
+   sub        ecx, 32
+   jg         xloop50
+   jmp        xloop99
+
+   // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+   rep movsb  // copy ecx (dst_width) bytes from [esi] to [edi]
+
+  xloop99:
+    pop        edi
+    pop        esi
+    vzeroupper  // clear upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_INTERPOLATEROW_AVX2
+
+// Bilinear filter 16x2 -> 16x1: blends the row at src_ptr with the row at src_ptr + src_stride, weighting the second row by source_y_fraction / 256; 16 bytes per loop pass.
+// TODO(fbarchard): Consider allowing 256 using memcpy.
+__declspec(naked)
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi             // dst as an offset from src
+    // Dispatch to specialized filters if applicable.
+    cmp        eax, 0
+    je         xloop100  // 0 /256.  Blend 100 / 0.
+    cmp        eax, 128
+    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
+
+    movd       xmm0, eax  // high fraction 0..255
+    neg        eax
+    add        eax, 256
+    movd       xmm5, eax  // low fraction 255..1
+    punpcklbw  xmm5, xmm0           // byte pair (low fraction, high fraction)
+    punpcklwd  xmm5, xmm5
+    pshufd     xmm5, xmm5, 0        // pair repeated across the register
+    mov        eax, 0x80808080  // 128 for biasing image to signed.
+    movd       xmm4, eax
+    pshufd     xmm4, xmm4, 0x00
+
+  xloop:
+    movdqu     xmm0, [esi]          // 16 bytes of the first row
+    movdqu     xmm2, [esi + edx]    // 16 bytes of the second row
+    movdqu     xmm1, xmm0
+    punpcklbw  xmm0, xmm2           // interleave the two rows, low 8 bytes
+    punpckhbw  xmm1, xmm2           // interleave the two rows, high 8 bytes
+    psubb      xmm0, xmm4  // bias image by -128
+    psubb      xmm1, xmm4
+    movdqa     xmm2, xmm5
+    movdqa     xmm3, xmm5
+    pmaddubsw  xmm2, xmm0           // unsigned fractions * signed pixels, pairs added
+    pmaddubsw  xmm3, xmm1
+    paddw      xmm2, xmm4           // add 0x8080 to each word: unbias and round
+    paddw      xmm3, xmm4
+    psrlw      xmm2, 8              // / 256
+    psrlw      xmm3, 8
+    packuswb   xmm2, xmm3
+    movdqu     [esi + edi], xmm2    // esi + edi addresses dst
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop
+    jmp        xloop99
+
+    // Blend 50 / 50.
+  xloop50:
+    movdqu     xmm0, [esi]
+    movdqu     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1           // rounded byte average of the two rows
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+  xloop100:
+    movdqu     xmm0, [esi]
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    sub        ecx, 16
+    jg         xloop100
+
+  xloop99:
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+__declspec(naked)
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int width) {
+  __asm {
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_argb
+    mov        ecx, [esp + 12]   // shuffler
+    movdqu     xmm5, [ecx]       // load the 16 byte pshufb control mask
+    mov        ecx, [esp + 16]   // width
+
+  wloop:
+    movdqu     xmm0, [eax]       // 32 source bytes are read per iteration
+    movdqu     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    pshufb     xmm0, xmm5        // reorder bytes as directed by the shuffler
+    pshufb     xmm1, xmm5
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8            // width counts down by 8 per 32 bytes
+    jg         wloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+__declspec(naked)
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width) {
+  __asm {
+    mov        eax, [esp + 4]     // src_argb
+    mov        edx, [esp + 8]     // dst_argb
+    mov        ecx, [esp + 12]    // shuffler
+    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
+    mov        ecx, [esp + 16]    // width
+
+  wloop:
+    vmovdqu    ymm0, [eax]        // 64 source bytes are read per iteration
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax, [eax + 64]
+    vpshufb    ymm0, ymm0, ymm5   // reorder bytes as directed by the shuffler
+    vpshufb    ymm1, ymm1, ymm5
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
+    sub        ecx, 16            // width counts down by 16 per 64 bytes
+    jg         wloop
+
+    vzeroupper                    // clear upper ymm halves before returning
+    ret
+  }
+}
+#endif  // HAS_ARGBSHUFFLEROW_AVX2
+
+__declspec(naked)
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int width) {
+  __asm {
+    push       ebx  // preserve ebx/esi; both are restored at shuf99
+    push       esi
+    mov        eax, [esp + 8 + 4]    // src_argb
+    mov        edx, [esp + 8 + 8]    // dst_argb
+    mov        esi, [esp + 8 + 12]   // shuffler
+    mov        ecx, [esp + 8 + 16]   // width
+    pxor       xmm5, xmm5  // zero, used to unpack bytes to words
+
+    mov        ebx, [esi]   // first 4 shuffler bytes select a fast path below
+    cmp        ebx, 0x03000102
+    je         shuf_3012
+    cmp        ebx, 0x00010203
+    je         shuf_0123
+    cmp        ebx, 0x00030201
+    je         shuf_0321
+    cmp        ebx, 0x02010003
+    je         shuf_2103
+
+  // TODO(fbarchard): Use one source pointer and 3 offsets.
+  shuf_any1:  // fallback: one pixel per iteration, byte by byte
+    movzx      ebx, byte ptr [esi]  // shuffler[0] is the source byte index
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx], bl
+    movzx      ebx, byte ptr [esi + 1]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx + 1], bl
+    movzx      ebx, byte ptr [esi + 2]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx + 2], bl
+    movzx      ebx, byte ptr [esi + 3]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx + 3], bl
+    lea        eax, [eax + 4]
+    lea        edx, [edx + 4]
+    sub        ecx, 1
+    jg         shuf_any1
+    jmp        shuf99
+
+  shuf_0123:  // 16 bytes per iteration; width counts down by 4
+    movdqu     xmm0, [eax]
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5  // widen bytes to words so word shuffles can be used
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
+    pshuflw    xmm0, xmm0, 01Bh
+    pshufhw    xmm1, xmm1, 01Bh
+    pshuflw    xmm1, xmm1, 01Bh
+    packuswb   xmm0, xmm1  // narrow the words back to 16 bytes
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         shuf_0123
+    jmp        shuf99
+
+  shuf_0321:  // same structure as shuf_0123 with a different word order
+    movdqu     xmm0, [eax]
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
+    pshuflw    xmm0, xmm0, 039h
+    pshufhw    xmm1, xmm1, 039h
+    pshuflw    xmm1, xmm1, 039h
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         shuf_0321
+    jmp        shuf99
+
+  shuf_2103:  // same structure as shuf_0123 with a different word order
+    movdqu     xmm0, [eax]
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
+    pshuflw    xmm0, xmm0, 093h
+    pshufhw    xmm1, xmm1, 093h
+    pshuflw    xmm1, xmm1, 093h
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         shuf_2103
+    jmp        shuf99
+
+  shuf_3012:  // same structure as shuf_0123 with a different word order
+    movdqu     xmm0, [eax]
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
+    pshuflw    xmm0, xmm0, 0C6h
+    pshufhw    xmm1, xmm1, 0C6h
+    pshuflw    xmm1, xmm1, 0C6h
+    packuswb   xmm0, xmm1
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 4
+    jg         shuf_3012
+
+  shuf99:  // common exit for every path
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+// YUY2 - Macro-pixel = 2 image pixels
+// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
+
+// UYVY - Macro-pixel = 2 image pixels
+// U0Y0V0Y1
+
+__declspec(naked)
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {
+  __asm {
+    push       esi  // preserve esi/edi; both are restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_y
+    mov        esi, [esp + 8 + 8]    // src_u
+    mov        edx, [esp + 8 + 12]   // src_v
+    mov        edi, [esp + 8 + 16]   // dst_frame
+    mov        ecx, [esp + 8 + 20]   // width
+    sub        edx, esi  // edx = src_v - src_u, so [esi + edx] addresses V
+
+  convertloop:  // 16 Y + 8 U + 8 V in, 32 bytes out per iteration
+    movq       xmm2, qword ptr [esi] // U
+    movq       xmm3, qword ptr [esi + edx] // V
+    lea        esi, [esi + 8]
+    punpcklbw  xmm2, xmm3 // UV
+    movdqu     xmm0, [eax] // Y
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm2 // YUYV
+    punpckhbw  xmm1, xmm2  // YUYV for the upper 8 Y bytes
+    movdqu     [edi], xmm0
+    movdqu     [edi + 16], xmm1
+    lea        edi, [edi + 32]
+    sub        ecx, 16  // width counts down by the 16 Y bytes consumed
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked)
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {
+  __asm {
+    push       esi  // preserve esi/edi; both are restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_y
+    mov        esi, [esp + 8 + 8]    // src_u
+    mov        edx, [esp + 8 + 12]   // src_v
+    mov        edi, [esp + 8 + 16]   // dst_frame
+    mov        ecx, [esp + 8 + 20]   // width
+    sub        edx, esi  // edx = src_v - src_u, so [esi + edx] addresses V
+
+  convertloop:  // 16 Y + 8 U + 8 V in, 32 bytes out per iteration
+    movq       xmm2, qword ptr [esi] // U
+    movq       xmm3, qword ptr [esi + edx] // V
+    lea        esi, [esi + 8]
+    punpcklbw  xmm2, xmm3 // UV
+    movdqu     xmm0, [eax] // Y
+    movdqa     xmm1, xmm2
+    lea        eax, [eax + 16]
+    punpcklbw  xmm1, xmm0 // UYVY
+    punpckhbw  xmm2, xmm0  // UYVY for the upper 8 Y bytes
+    movdqu     [edi], xmm1
+    movdqu     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16  // width counts down by the 16 Y bytes consumed
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+__declspec(naked)
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) {
+  __asm {
+    push       esi  // preserve esi; restored before ret
+    mov        eax, [esp + 4 + 4]   /* src_argb */
+    mov        edx, [esp + 4 + 8]   /* dst_argb */
+    mov        esi, [esp + 4 + 12]  /* poly */
+    mov        ecx, [esp + 4 + 16]  /* width */
+    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
+
+    // 2 pixel loop. NOTE(review): mulps/addps below read poly as a memory operand, which legacy SSE requires to be 16-byte aligned - confirm callers.
+ convertloop:
+//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
+//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
+    movq       xmm0, qword ptr [eax]  // BGRABGRA
+    lea        eax, [eax + 8]
+    punpcklbw  xmm0, xmm3  // bytes -> words
+    movdqa     xmm4, xmm0
+    punpcklwd  xmm0, xmm3  // pixel 0
+    punpckhwd  xmm4, xmm3  // pixel 1
+    cvtdq2ps   xmm0, xmm0  // 4 floats
+    cvtdq2ps   xmm4, xmm4
+    movdqa     xmm1, xmm0  // X
+    movdqa     xmm5, xmm4
+    mulps      xmm0, [esi + 16]  // C1 * X
+    mulps      xmm4, [esi + 16]
+    addps      xmm0, [esi]  // result = C0 + C1 * X
+    addps      xmm4, [esi]
+    movdqa     xmm2, xmm1
+    movdqa     xmm6, xmm5
+    mulps      xmm2, xmm1  // X * X
+    mulps      xmm6, xmm5
+    mulps      xmm1, xmm2  // X * X * X
+    mulps      xmm5, xmm6
+    mulps      xmm2, [esi + 32]  // C2 * X * X
+    mulps      xmm6, [esi + 32]
+    mulps      xmm1, [esi + 48]  // C3 * X * X * X
+    mulps      xmm5, [esi + 48]
+    addps      xmm0, xmm2  // result += C2 * X * X
+    addps      xmm4, xmm6
+    addps      xmm0, xmm1  // result += C3 * X * X * X
+    addps      xmm4, xmm5
+    cvttps2dq  xmm0, xmm0  // truncate the float results to 32 bit integers
+    cvttps2dq  xmm4, xmm4
+    packuswb   xmm0, xmm4  // first of two saturating packs toward bytes
+    packuswb   xmm0, xmm0  // second pack leaves 8 result bytes in the low qword
+    movq       qword ptr [edx], xmm0  // store 2 pixels
+    lea        edx, [edx + 8]
+    sub        ecx, 2
+    jg         convertloop
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+__declspec(naked)
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_argb */
+    mov        ecx, [esp + 12]   /* poly */
+    vbroadcastf128 ymm4, [ecx]       // C0
+    vbroadcastf128 ymm5, [ecx + 16]  // C1
+    vbroadcastf128 ymm6, [ecx + 32]  // C2
+    vbroadcastf128 ymm7, [ecx + 48]  // C3
+    mov        ecx, [esp + 16]  /* width */
+
+    // 2 pixel loop. NOTE(review): the vfmadd* instructions below belong to the FMA extension - confirm the CPU check used by callers covers FMA.
+ convertloop:
+    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
+    lea         eax, [eax + 8]
+    vcvtdq2ps   ymm0, ymm0        // X 8 floats
+    vmulps      ymm2, ymm0, ymm0  // X * X
+    vmulps      ymm3, ymm0, ymm7  // C3 * X
+    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
+    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
+    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
+    vcvttps2dq  ymm0, ymm0        // truncate the float results to 32 bit integers
+    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
+    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
+    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
+    vmovq       qword ptr [edx], xmm0  // store 2 pixels
+    lea         edx, [edx + 8]
+    sub         ecx, 2
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels in place with a color table (4 table bytes per entry).
+__declspec(naked)
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+                           int width) {
+  __asm {
+    push       esi  // preserve esi; restored before ret
+    mov        eax, [esp + 4 + 4]   /* dst_argb */
+    mov        esi, [esp + 4 + 8]   /* table_argb */
+    mov        ecx, [esp + 4 + 12]  /* width */
+
+    // 1 pixel loop.
+  convertloop:
+    movzx      edx, byte ptr [eax]  // channel 0 value is the table index
+    lea        eax, [eax + 4]  // advance early; the pixel is now at [eax - 4]
+    movzx      edx, byte ptr [esi + edx * 4]  // table[index * 4 + 0]
+    mov        byte ptr [eax - 4], dl
+    movzx      edx, byte ptr [eax - 4 + 1]
+    movzx      edx, byte ptr [esi + edx * 4 + 1]  // table[index * 4 + 1]
+    mov        byte ptr [eax - 4 + 1], dl
+    movzx      edx, byte ptr [eax - 4 + 2]
+    movzx      edx, byte ptr [esi + edx * 4 + 2]  // table[index * 4 + 2]
+    mov        byte ptr [eax - 4 + 2], dl
+    movzx      edx, byte ptr [eax - 4 + 3]
+    movzx      edx, byte ptr [esi + edx * 4 + 3]  // table[index * 4 + 3]
+    mov        byte ptr [eax - 4 + 3], dl
+    dec        ecx
+    jg         convertloop
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels in place with a color table; byte 3 of each pixel is left untouched.
+__declspec(naked)
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+  __asm {
+    push       esi  // preserve esi; restored before ret
+    mov        eax, [esp + 4 + 4]   /* dst_argb */
+    mov        esi, [esp + 4 + 8]   /* table_argb */
+    mov        ecx, [esp + 4 + 12]  /* width */
+
+    // 1 pixel loop.
+  convertloop:
+    movzx      edx, byte ptr [eax]  // channel 0 value is the table index
+    lea        eax, [eax + 4]  // advance early; the pixel is now at [eax - 4]
+    movzx      edx, byte ptr [esi + edx * 4]  // table[index * 4 + 0]
+    mov        byte ptr [eax - 4], dl
+    movzx      edx, byte ptr [eax - 4 + 1]
+    movzx      edx, byte ptr [esi + edx * 4 + 1]  // table[index * 4 + 1]
+    mov        byte ptr [eax - 4 + 1], dl
+    movzx      edx, byte ptr [eax - 4 + 2]
+    movzx      edx, byte ptr [esi + edx * 4 + 2]  // table[index * 4 + 2]
+    mov        byte ptr [eax - 4 + 2], dl
+    dec        ecx
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table; alpha bytes are copied unchanged.
+__declspec(naked)
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                 int width,
+                                 const uint8* luma, uint32 lumacoeff) {
+  __asm {
+    push       esi  // preserve esi/edi; both are restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]   /* src_argb */
+    mov        edi, [esp + 8 + 8]   /* dst_argb */
+    mov        ecx, [esp + 8 + 12]  /* width */
+    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
+    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
+    pshufd     xmm2, xmm2, 0  // broadcast the table base address to 4 dwords
+    pshufd     xmm3, xmm3, 0  // broadcast lumacoeff to 4 dwords
+    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
+    psllw      xmm4, 8
+    pxor       xmm5, xmm5  // zero, used to widen words to dwords
+
+    // 4 pixel loop.
+  convertloop:
+    movdqu     xmm0, xmmword ptr [eax]      // generate luma ptr
+    pmaddubsw  xmm0, xmm3  // weight pixel bytes by lumacoeff, summing byte pairs
+    phaddw     xmm0, xmm0  // add the two partial sums of each pixel
+    pand       xmm0, xmm4  // mask out low bits
+    punpcklwd  xmm0, xmm5  // widen the 4 per-pixel offsets to dwords
+    paddd      xmm0, xmm2  // add table base
+    movd       esi, xmm0  // esi = table pointer for pixel 0
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax]  // source byte indexes the selected table
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi], dl
+    movzx      edx, byte ptr [eax + 1]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 1], dl
+    movzx      edx, byte ptr [eax + 2]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 2], dl
+    movzx      edx, byte ptr [eax + 3]  // copy alpha.
+    mov        byte ptr [edi + 3], dl
+
+    movd       esi, xmm0  // esi = table pointer for pixel 1
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax + 4]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 4], dl
+    movzx      edx, byte ptr [eax + 5]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 5], dl
+    movzx      edx, byte ptr [eax + 6]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 6], dl
+    movzx      edx, byte ptr [eax + 7]  // copy alpha.
+    mov        byte ptr [edi + 7], dl
+
+    movd       esi, xmm0  // esi = table pointer for pixel 2
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax + 8]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 8], dl
+    movzx      edx, byte ptr [eax + 9]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 9], dl
+    movzx      edx, byte ptr [eax + 10]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 10], dl
+    movzx      edx, byte ptr [eax + 11]  // copy alpha.
+    mov        byte ptr [edi + 11], dl
+
+    movd       esi, xmm0  // esi = table pointer for pixel 3
+
+    movzx      edx, byte ptr [eax + 12]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 12], dl
+    movzx      edx, byte ptr [eax + 13]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 13], dl
+    movzx      edx, byte ptr [eax + 14]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 14], dl
+    movzx      edx, byte ptr [eax + 15]  // copy alpha.
+    mov        byte ptr [edi + 15], dl
+
+    lea        eax, [eax + 16]
+    lea        edi, [edi + 16]
+    sub        ecx, 4  // 4 pixels (16 bytes) handled per iteration
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif  // defined(_M_X64)
+#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/scale.cc b/libs/libyuv/source/scale.cc
new file mode 100644
index 0000000000..36e3fe5281
--- /dev/null
+++ b/libs/libyuv/source/scale.cc
@@ -0,0 +1,1672 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyPlane
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+  return (v < 0) ? -v : v;  // magnitude of v; non-negative input passes through
+}
+
+#define SUBSAMPLE(v, a, s) (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))  /* (|v| + a) >> s with the sign of v; fully parenthesized so expression arguments and surrounding operators bind correctly */
+
+// Scale plane, 1/2
+// This is an optimized version for scaling down a plane to 1/2 of
+// its original size.  src_width and src_height are not read by the body.
+
+static void ScalePlaneDown2(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) =
+      filtering == kFilterNone ? ScaleRowDown2_C :
+      (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
+  int row_stride = src_stride << 1;  // two source rows are consumed per output row
+  if (!filtering) {
+    src_ptr += src_stride;  // Point to odd rows.
+    src_stride = 0;
+  }
+
+#if defined(HAS_SCALEROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {  // replaces the C default chosen above
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
+        ScaleRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 16)) {  // aligned width: use the non-Any variant
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
+          ScaleRowDown2Box_NEON);
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {  // replaces any earlier selection
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
+        ScaleRowDown2Box_Any_SSSE3);
+    if (IS_ALIGNED(dst_width, 16)) {  // aligned width: use the non-Any variant
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
+          ScaleRowDown2Box_SSSE3);
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // checked after SSSE3, so it wins when both are set
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
+        ScaleRowDown2Box_Any_AVX2);
+    if (IS_ALIGNED(dst_width, 32)) {  // aligned width: use the non-Any variant
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
+          ScaleRowDown2Box_AVX2);
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
+      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown2 = filtering ?  // any filter mode maps to the box variant here
+        ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;  // linear filtering hands stride 0 to the row function
+  }
+  // TODO(fbarchard): Loop through source height to allow odd height.
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void ScalePlaneDown2_16(int src_width, int src_height,  // 16 bit 1/2 scaler; src_width/src_height are not read
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr,
+                               enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst_ptr, int dst_width) =
+    filtering == kFilterNone ? ScaleRowDown2_16_C :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
+        ScaleRowDown2Box_16_C);
+  int row_stride = src_stride << 1;  // two source rows are consumed per output row
+  if (!filtering) {
+    src_ptr += src_stride;  // Point to odd rows.
+    src_stride = 0;
+  }
+
+#if defined(HAS_SCALEROWDOWN2_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
+    ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :  // linear also uses box here
+        ScaleRowDown2_16_NEON;
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {  // replaces any earlier selection
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
+        ScaleRowDown2Box_16_SSE2);
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
+      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown2 = filtering ?  // any filter mode maps to the box variant here
+        ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;  // linear filtering hands stride 0 to the row function
+  }
+  // TODO(fbarchard): Loop through source height to allow odd height.
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+// Scale plane, 1/4
+// This is an optimized version for scaling down a plane to 1/4 of
+// its original size.  src_width and src_height are not read by the body.
+
+static void ScalePlaneDown4(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
+  int row_stride = src_stride << 2;  // four source rows are consumed per output row
+  if (!filtering) {
+    src_ptr += src_stride * 2;  // Point to row 2.
+    src_stride = 0;
+  }
+#if defined(HAS_SCALEROWDOWN4_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {  // replaces the C default chosen above
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {  // aligned width: use the non-Any variant
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN4_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {  // replaces any earlier selection
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 8)) {  // aligned width: use the non-Any variant
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN4_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // checked after SSSE3, so it wins when both are set
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 16)) {  // aligned width: use the non-Any variant
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN4_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;  // linear filtering hands stride 0 to the box row function
+  }
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void ScalePlaneDown4_16(int src_width, int src_height,  // 16 bit 1/4 scaler; src_width/src_height are not read
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr,
+                               enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
+  int row_stride = src_stride << 2;  // four source rows are consumed per output row
+  if (!filtering) {
+    src_ptr += src_stride * 2;  // Point to row 2.
+    src_stride = 0;
+  }
+#if defined(HAS_SCALEROWDOWN4_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {  // replaces the C default
+    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
+        ScaleRowDown4_16_NEON;
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {  // replaces any earlier selection
+    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
+        ScaleRowDown4_16_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;  // linear filtering hands stride 0 to the box row function
+  }
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+// Scale plane down, 3/4.  src_width and src_height are not read by the body.
+
+static void ScalePlaneDown34(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint8* src_ptr, uint8* dst_ptr,
+                             enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;  // stride handed to the row functions
+  assert(dst_width % 3 == 0);
+  if (!filtering) {  // unfiltered: both row slots use the same point sampler
+    ScaleRowDown34_0 = ScaleRowDown34_C;
+    ScaleRowDown34_1 = ScaleRowDown34_C;
+  } else {
+    ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
+    ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
+  }
+#if defined(HAS_SCALEROWDOWN34_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {  // replaces the C defaults chosen above
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_Any_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_Any_NEON;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON;
+    }
+    if (dst_width % 24 == 0) {  // width is a multiple of 24: use the non-Any variants
+      if (!filtering) {
+        ScaleRowDown34_0 = ScaleRowDown34_NEON;
+        ScaleRowDown34_1 = ScaleRowDown34_NEON;
+      } else {
+        ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
+        ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN34_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {  // replaces any earlier selection
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3;
+    }
+    if (dst_width % 24 == 0) {  // width is a multiple of 24: use the non-Any variants
+      if (!filtering) {
+        ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+        ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+      } else {
+        ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
+        ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN34_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_DSPR2;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2;
+    }
+  }
+#endif
+
+  for (y = 0; y < dst_height - 2; y += 3) {  // each pass: 4 source rows in, 3 output rows out
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,  // start one row lower, negated stride
+                     dst_ptr, dst_width);
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride;
+  }
+
+  // Remainder 1 or 2 rows with last row vertically unfiltered
+  if ((dst_height % 3) == 2) {
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);  // stride 0 for the final row
+  } else if ((dst_height % 3) == 1) {
+    ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);  // stride 0 for the final row
+  }
+}
+
+// Scales a 16-bit plane to 3/4 of its size: each pass of the main loop
+// consumes four source rows and emits three destination rows. The row kernels
+// are chosen once, from the filter mode and the CPU features reported by
+// TestCpuFlag(). src_width and src_height are accepted but not read here.
+static void ScalePlaneDown34_16(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint16* src_ptr, uint16* dst_ptr,
+                                enum FilterMode filtering) {
+  // row_fn0 produces the first and third output row of every group of three,
+  // row_fn1 the middle one. Both start out as the portable C kernels.
+  void (*row_fn0)(const uint16* src_ptr, ptrdiff_t src_stride,
+                  uint16* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown34_0_Box_16_C : ScaleRowDown34_16_C;
+  void (*row_fn1)(const uint16* src_ptr, ptrdiff_t src_stride,
+                  uint16* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown34_1_Box_16_C : ScaleRowDown34_16_C;
+  // kFilterLinear hands a zero stride to the row kernels.
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  int y = 0;
+  assert(dst_width % 3 == 0);
+
+#if defined(HAS_SCALEROWDOWN34_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
+    row_fn0 = filtering ? ScaleRowDown34_0_Box_16_NEON
+                        : ScaleRowDown34_16_NEON;
+    row_fn1 = filtering ? ScaleRowDown34_1_Box_16_NEON
+                        : ScaleRowDown34_16_NEON;
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
+    row_fn0 = filtering ? ScaleRowDown34_0_Box_16_SSSE3
+                        : ScaleRowDown34_16_SSSE3;
+    row_fn1 = filtering ? ScaleRowDown34_1_Box_16_SSSE3
+                        : ScaleRowDown34_16_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    row_fn0 = filtering ? ScaleRowDown34_0_Box_16_DSPR2
+                        : ScaleRowDown34_16_DSPR2;
+    row_fn1 = filtering ? ScaleRowDown34_1_Box_16_DSPR2
+                        : ScaleRowDown34_16_DSPR2;
+  }
+#endif
+
+  // Full groups of three output rows.
+  while (y < dst_height - 2) {
+    row_fn0(src_ptr, filter_stride, dst_ptr, dst_width);
+    row_fn1(src_ptr + src_stride, filter_stride,
+            dst_ptr + dst_stride, dst_width);
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride * 2;
+    // The third output row starts from the fourth source row of the group and
+    // is given a negated stride.
+    row_fn0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride;
+    y += 3;
+  }
+
+  // Remainder 1 or 2 rows; the last row gets a zero stride (no vertical
+  // filtering).
+  switch (dst_height % 3) {
+    case 2:
+      row_fn0(src_ptr, filter_stride, dst_ptr, dst_width);
+      row_fn1(src_ptr + src_stride, 0, dst_ptr + dst_stride, dst_width);
+      break;
+    case 1:
+      row_fn0(src_ptr, 0, dst_ptr, dst_width);
+      break;
+    default:
+      break;
+  }
+}
+
+
+// Scale plane, 3/8
+// This is an optimized version for scaling down a plane to 3/8
+// of its original size.
+//
+// Uses box filters arranged like this
+// aaabbbcc -> abc
+// aaabbbcc    def
+// aaabbbcc    ghi
+// dddeeeff
+// dddeeeff
+// dddeeeff
+// ggghhhii
+// ggghhhii
+// Boxes are 3x3, 2x3, 3x2 and 2x2
+
+// Scales an 8-bit plane to 3/8 of its size: each pass of the main loop
+// consumes eight source rows (3 + 3 + 2) and emits three destination rows.
+// The row kernels are chosen once, from the filter mode and the CPU features
+// reported by TestCpuFlag(). src_width and src_height are accepted but not
+// read here.
+static void ScalePlaneDown38(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint8* src_ptr, uint8* dst_ptr,
+                             enum FilterMode filtering) {
+  // row_fn3 is used where the source advances by three rows, row_fn2 where it
+  // advances by two. Both start out as the portable C kernels.
+  void (*row_fn3)(const uint8* src_ptr, ptrdiff_t src_stride,
+                  uint8* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown38_3_Box_C : ScaleRowDown38_C;
+  void (*row_fn2)(const uint8* src_ptr, ptrdiff_t src_stride,
+                  uint8* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown38_2_Box_C : ScaleRowDown38_C;
+  // kFilterLinear hands a zero stride to the row kernels.
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  int y = 0;
+  assert(dst_width % 3 == 0);
+
+#if defined(HAS_SCALEROWDOWN38_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    // The plain kernels need dst_width to be a multiple of 12; otherwise the
+    // _Any_ variants are used.
+    const int whole = (dst_width % 12 == 0);
+    if (filtering) {
+      row_fn3 = whole ? ScaleRowDown38_3_Box_NEON
+                      : ScaleRowDown38_3_Box_Any_NEON;
+      row_fn2 = whole ? ScaleRowDown38_2_Box_NEON
+                      : ScaleRowDown38_2_Box_Any_NEON;
+    } else {
+      row_fn3 = whole ? ScaleRowDown38_NEON : ScaleRowDown38_Any_NEON;
+      row_fn2 = row_fn3;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN38_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    if (filtering) {
+      // Box kernels: multiple of 6 for the plain variant.
+      const int whole = (dst_width % 6 == 0);
+      row_fn3 = whole ? ScaleRowDown38_3_Box_SSSE3
+                      : ScaleRowDown38_3_Box_Any_SSSE3;
+      row_fn2 = whole ? ScaleRowDown38_2_Box_SSSE3
+                      : ScaleRowDown38_2_Box_Any_SSSE3;
+    } else {
+      // Point-sampling kernel: multiple of 12 for the plain variant.
+      row_fn3 = (dst_width % 12 == 0) ? ScaleRowDown38_SSSE3
+                                      : ScaleRowDown38_Any_SSSE3;
+      row_fn2 = row_fn3;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN38_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    row_fn3 = filtering ? ScaleRowDown38_3_Box_DSPR2 : ScaleRowDown38_DSPR2;
+    row_fn2 = filtering ? ScaleRowDown38_2_Box_DSPR2 : ScaleRowDown38_DSPR2;
+  }
+#endif
+
+  // Full groups of three output rows.
+  while (y < dst_height - 2) {
+    row_fn3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    row_fn3(src_ptr, filter_stride, dst_ptr + dst_stride, dst_width);
+    src_ptr += src_stride * 3;
+    row_fn2(src_ptr, filter_stride, dst_ptr + dst_stride * 2, dst_width);
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride * 3;
+    y += 3;
+  }
+
+  // Remainder 1 or 2 rows; the last row gets a zero stride (no vertical
+  // filtering).
+  switch (dst_height % 3) {
+    case 2:
+      row_fn3(src_ptr, filter_stride, dst_ptr, dst_width);
+      row_fn3(src_ptr + src_stride * 3, 0, dst_ptr + dst_stride, dst_width);
+      break;
+    case 1:
+      row_fn3(src_ptr, 0, dst_ptr, dst_width);
+      break;
+    default:
+      break;
+  }
+}
+
+// 16-bit version of the 3/8 plane scaler: each pass of the main loop consumes
+// eight source rows (3 + 3 + 2) and emits three destination rows. The row
+// kernels are chosen once, from the filter mode and the CPU features reported
+// by TestCpuFlag(). src_width and src_height are accepted but not read here.
+static void ScalePlaneDown38_16(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint16* src_ptr, uint16* dst_ptr,
+                                enum FilterMode filtering) {
+  // row_fn3 is used where the source advances by three rows, row_fn2 where it
+  // advances by two. Both start out as the portable C kernels.
+  void (*row_fn3)(const uint16* src_ptr, ptrdiff_t src_stride,
+                  uint16* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown38_3_Box_16_C : ScaleRowDown38_16_C;
+  void (*row_fn2)(const uint16* src_ptr, ptrdiff_t src_stride,
+                  uint16* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown38_2_Box_16_C : ScaleRowDown38_16_C;
+  // kFilterLinear hands a zero stride to the row kernels.
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  int y = 0;
+  assert(dst_width % 3 == 0);
+
+#if defined(HAS_SCALEROWDOWN38_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+    row_fn3 = filtering ? ScaleRowDown38_3_Box_16_NEON
+                        : ScaleRowDown38_16_NEON;
+    row_fn2 = filtering ? ScaleRowDown38_2_Box_16_NEON
+                        : ScaleRowDown38_16_NEON;
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_SSSE3)
+  // Note: this path requires a multiple of 24, unlike the NEON/DSPR2 paths.
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
+    row_fn3 = filtering ? ScaleRowDown38_3_Box_16_SSSE3
+                        : ScaleRowDown38_16_SSSE3;
+    row_fn2 = filtering ? ScaleRowDown38_2_Box_16_SSSE3
+                        : ScaleRowDown38_16_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    row_fn3 = filtering ? ScaleRowDown38_3_Box_16_DSPR2
+                        : ScaleRowDown38_16_DSPR2;
+    row_fn2 = filtering ? ScaleRowDown38_2_Box_16_DSPR2
+                        : ScaleRowDown38_16_DSPR2;
+  }
+#endif
+
+  // Full groups of three output rows.
+  while (y < dst_height - 2) {
+    row_fn3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    row_fn3(src_ptr, filter_stride, dst_ptr + dst_stride, dst_width);
+    src_ptr += src_stride * 3;
+    row_fn2(src_ptr, filter_stride, dst_ptr + dst_stride * 2, dst_width);
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride * 3;
+    y += 3;
+  }
+
+  // Remainder 1 or 2 rows; the last row gets a zero stride (no vertical
+  // filtering).
+  switch (dst_height % 3) {
+    case 2:
+      row_fn3(src_ptr, filter_stride, dst_ptr, dst_width);
+      row_fn3(src_ptr + src_stride * 3, 0, dst_ptr + dst_stride, dst_width);
+      break;
+    case 1:
+      row_fn3(src_ptr, 0, dst_ptr, dst_width);
+      break;
+    default:
+      break;
+  }
+}
+
+// Clamps x to a minimum of 1. The argument is evaluated twice, so it must not
+// have side effects.
+#define MIN1(x) ((x) >= 1 ? (x) : 1)
+
+// Returns the sum of the first iboxwidth uint16 values at src_ptr.
+static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
+  uint32 total = 0u;
+  int remaining;
+  assert(iboxwidth > 0);
+  for (remaining = iboxwidth; remaining > 0; --remaining) {
+    total += *src_ptr;
+    ++src_ptr;
+  }
+  return total;
+}
+
+// Returns the sum of the first iboxwidth uint32 values at src_ptr. The total
+// is accumulated in a uint32.
+static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
+  uint32 total = 0u;
+  int remaining;
+  assert(iboxwidth > 0);
+  for (remaining = iboxwidth; remaining > 0; --remaining) {
+    total += *src_ptr;
+    ++src_ptr;
+  }
+  return total;
+}
+
+// Horizontal box reduction for a fractional step: writes dst_width 8-bit
+// pixels, each the sum of a run of column sums from src_ptr multiplied by a
+// 16.16 reciprocal of (box width * boxheight). x and dx are 16.16 fixed point.
+static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  const int minboxwidth = dx >> 16;
+  // Reciprocals for box widths minboxwidth and minboxwidth + 1; the table is
+  // indexed by (boxwidth - minboxwidth).
+  int scaletbl[2];
+  int remaining;
+  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+  for (remaining = dst_width; remaining > 0; --remaining) {
+    const int left = x >> 16;  // First source column of this box.
+    int boxwidth;
+    x += dx;
+    boxwidth = MIN1((x >> 16) - left);
+    *dst_ptr = SumPixels(boxwidth, src_ptr + left) *
+        scaletbl[boxwidth - minboxwidth] >> 16;
+    ++dst_ptr;
+  }
+}
+
+// 16-bit output version of the fractional-step horizontal box reduction:
+// writes dst_width uint16 pixels, each the sum of a run of uint32 column sums
+// multiplied by a 16.16 reciprocal of (box width * boxheight). x and dx are
+// 16.16 fixed point.
+static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
+                               const uint32* src_ptr, uint16* dst_ptr) {
+  const int minboxwidth = dx >> 16;
+  // Reciprocals for box widths minboxwidth and minboxwidth + 1; the table is
+  // indexed by (boxwidth - minboxwidth).
+  int scaletbl[2];
+  int remaining;
+  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+  for (remaining = dst_width; remaining > 0; --remaining) {
+    const int left = x >> 16;  // First source column of this box.
+    int boxwidth;
+    x += dx;
+    boxwidth = MIN1((x >> 16) - left);
+    *dst_ptr = SumPixels_16(boxwidth, src_ptr + left) *
+        scaletbl[boxwidth - minboxwidth] >> 16;
+    ++dst_ptr;
+  }
+}
+
+// Horizontal pass with no box summing: each output pixel is one column sum
+// multiplied by the 16.16 reciprocal of boxheight. The integer part of x is
+// the first column read; the unnamed dx parameter is ignored.
+static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  const int scaleval = 65536 / boxheight;
+  const uint16* column = src_ptr + (x >> 16);
+  int remaining;
+  for (remaining = dst_width; remaining > 0; --remaining) {
+    *dst_ptr = (*column) * scaleval >> 16;
+    ++dst_ptr;
+    ++column;
+  }
+}
+
+// Horizontal box reduction with one fixed box width, MIN1(dx >> 16): each
+// output pixel is the sum of boxwidth adjacent column sums multiplied by the
+// 16.16 reciprocal of (boxwidth * boxheight). The integer part of x is the
+// first column read.
+static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  const int boxwidth = MIN1(dx >> 16);
+  const int scaleval = 65536 / (boxwidth * boxheight);
+  const uint16* box = src_ptr + (x >> 16);
+  int remaining;
+  for (remaining = dst_width; remaining > 0; --remaining) {
+    *dst_ptr = SumPixels(boxwidth, box) * scaleval >> 16;
+    ++dst_ptr;
+    box += boxwidth;
+  }
+}
+
+// 16-bit output version of the fixed-width horizontal box reduction.
+//
+// dst_width: number of output pixels to write to dst_ptr.
+// boxheight: number of source rows already summed into src_ptr.
+// x:         starting source position, 16.16 fixed point.
+// dx:        horizontal step, 16.16 fixed point; the box width is
+//            MIN1(dx >> 16).
+// src_ptr:   uint32 column sums.
+// dst_ptr:   output row; each pixel is the sum of boxwidth adjacent column
+//            sums multiplied by the 16.16 reciprocal of
+//            (boxwidth * boxheight).
+static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
+                               const uint32* src_ptr, uint16* dst_ptr) {
+  int boxwidth = MIN1(dx >> 16);
+  int scaleval = 65536 / (boxwidth * boxheight);
+  int i;
+  // x arrives as 16.16 fixed point. Keep only the integer column index, as
+  // ScaleAddCols1_C and ScaleAddCols2_16_C do; using the raw value as an
+  // index would read far outside the row for any non-zero x.
+  x >>= 16;
+  for (i = 0; i < dst_width; ++i) {
+    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16;
+    x += boxwidth;
+  }
+}
+
+// Scale plane down to any dimensions, with interpolation (box filter).
+//
+// Same method as SimpleScale: the source is stepped through in 16.16 fixed
+// point. For each destination row, the source rows covered by the box are
+// added column-wise into a uint16 row buffer, and ScaleAddCols then reduces
+// that buffer horizontally into 8-bit destination pixels.
+static void ScalePlaneBox(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          int src_stride, int dst_stride,
+                          const uint8* src_ptr, uint8* dst_ptr) {
+  int out_row, k;
+  // Source position (x, y) and step (dx, dy), all 16.16 fixed point; filled
+  // in by ScaleSlope.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height << 16);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+  {
+    // Row buffer holding one uint16 sum per source column.
+    align_buffer_64(row16, src_width * 2);
+    void (*add_cols)(int dst_width, int boxheight, int x, int dx,
+        const uint16* src_ptr, uint8* dst_ptr) = ScaleAddCols2_C;
+    void (*add_row)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
+        ScaleAddRow_C;
+    // A step with no fractional part uses the fixed-width reducer, and a step
+    // of exactly 1.0 the reducer that does no horizontal summing.
+    if ((dx & 0xffff) == 0) {
+      add_cols = (dx == 0x10000) ? ScaleAddCols0_C : ScaleAddCols1_C;
+    }
+#if defined(HAS_SCALEADDROW_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2)) {
+      add_row = IS_ALIGNED(src_width, 16) ? ScaleAddRow_SSE2
+                                          : ScaleAddRow_Any_SSE2;
+    }
+#endif
+#if defined(HAS_SCALEADDROW_AVX2)
+    if (TestCpuFlag(kCpuHasAVX2)) {
+      add_row = IS_ALIGNED(src_width, 32) ? ScaleAddRow_AVX2
+                                          : ScaleAddRow_Any_AVX2;
+    }
+#endif
+#if defined(HAS_SCALEADDROW_NEON)
+    if (TestCpuFlag(kCpuHasNEON)) {
+      add_row = IS_ALIGNED(src_width, 16) ? ScaleAddRow_NEON
+                                          : ScaleAddRow_Any_NEON;
+    }
+#endif
+
+    for (out_row = 0; out_row < dst_height; ++out_row) {
+      const int top = y >> 16;  // First source row of this box.
+      const uint8* src = src_ptr + top * src_stride;
+      int boxheight;
+      // Advance to the next box, clamped to the bottom of the source.
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
+      boxheight = MIN1((y >> 16) - top);
+      // Accumulate boxheight source rows into the cleared row buffer.
+      memset(row16, 0, src_width * 2);
+      for (k = boxheight; k > 0; --k) {
+        add_row(src, (uint16 *)(row16), src_width);
+        src += src_stride;
+      }
+      add_cols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
+      dst_ptr += dst_stride;
+    }
+    free_aligned_buffer_64(row16);
+  }
+}
+
+// 16-bit version of the box-filter plane scaler. For each destination row,
+// the source rows covered by the box are added column-wise into a uint32 row
+// buffer, and ScaleAddCols then reduces that buffer horizontally into uint16
+// destination pixels.
+static void ScalePlaneBox_16(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint16* src_ptr, uint16* dst_ptr) {
+  int out_row, k;
+  // Source position (x, y) and step (dx, dy), all 16.16 fixed point; filled
+  // in by ScaleSlope.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height << 16);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+  {
+    // Row buffer holding one uint32 sum per source column.
+    align_buffer_64(row32, src_width * 4);
+    void (*add_cols)(int dst_width, int boxheight, int x, int dx,
+        const uint32* src_ptr, uint16* dst_ptr) = ScaleAddCols2_16_C;
+    void (*add_row)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
+        ScaleAddRow_16_C;
+    // A step with no fractional part uses the fixed-width reducer.
+    if ((dx & 0xffff) == 0) {
+      add_cols = ScaleAddCols1_16_C;
+    }
+
+#if defined(HAS_SCALEADDROW_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
+      add_row = ScaleAddRow_16_SSE2;
+    }
+#endif
+
+    for (out_row = 0; out_row < dst_height; ++out_row) {
+      const int top = y >> 16;  // First source row of this box.
+      const uint16* src = src_ptr + top * src_stride;
+      int boxheight;
+      // Advance to the next box, clamped to the bottom of the source.
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
+      boxheight = MIN1((y >> 16) - top);
+      // Accumulate boxheight source rows into the cleared row buffer.
+      memset(row32, 0, src_width * 4);
+      for (k = boxheight; k > 0; --k) {
+        add_row(src, (uint32 *)(row32), src_width);
+        src += src_stride;
+      }
+      add_cols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
+      dst_ptr += dst_stride;
+    }
+    free_aligned_buffer_64(row32);
+  }
+}
+
+// Scale plane down with bilinear interpolation.
+//
+// For every destination row, the source row at the current 16.16 y position
+// is either column-filtered directly (kFilterLinear) or first blended with
+// the following source row into a temporary row buffer by InterpolateRow and
+// then column-filtered by ScaleFilterCols.
+//
+// src_width/src_height: source dimensions. src_width is normalized with
+//     Abs() after ScaleSlope(), so a negative value is tolerated here
+//     (presumably a mirroring request - confirm against ScaleSlope).
+// dst_width/dst_height: destination dimensions.
+// src_stride/dst_stride: row strides, in uint8 elements.
+// filtering: kFilterLinear skips the vertical blend; any other mode blends.
+void ScalePlaneBilinearDown(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row buffer. It is sized from the magnitude of src_width,
+  // because src_width has not been normalized yet at this point and
+  // InterpolateRow below writes Abs(src_width) bytes into the buffer.
+  align_buffer_64(row, Abs(src_width));
+
+  const int max_y = (src_height - 1) << 16;
+  int j;
+  // The 64-bit column filter is selected by magnitude as well, matching the
+  // src_width < 32768 tests applied to the SIMD variants below.
+  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+      int dst_width, int x, int dx) =
+      (Abs(src_width) >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    InterpolateRow = InterpolateRow_Any_DSPR2;
+    if (IS_ALIGNED(src_width, 4)) {
+      InterpolateRow = InterpolateRow_DSPR2;
+    }
+  }
+#endif
+
+
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleFilterCols_NEON;
+    }
+  }
+#endif
+  // Keep the starting row inside the source.
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    int yi = y >> 16;
+    const uint8* src = src_ptr + yi * src_stride;
+    if (filtering == kFilterLinear) {
+      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+    } else {
+      // Blend this row with the next one using the top 8 fraction bits of y,
+      // then filter the blended row horizontally.
+      int yf = (y >> 8) & 255;
+      InterpolateRow(row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
+    }
+    dst_ptr += dst_stride;
+    y += dy;
+    if (y > max_y) {
+      y = max_y;
+    }
+  }
+  free_aligned_buffer_64(row);
+}
+
+// Scale a 16-bit plane down with bilinear interpolation.
+//
+// For every destination row, the source row at the current 16.16 y position
+// is either column-filtered directly (kFilterLinear) or first blended with
+// the following source row into a temporary row buffer by InterpolateRow and
+// then column-filtered by ScaleFilterCols.
+//
+// src_width/src_height: source dimensions. src_width is normalized with
+//     Abs() after ScaleSlope(), so a negative value is tolerated here
+//     (presumably a mirroring request - confirm against ScaleSlope).
+// dst_width/dst_height: destination dimensions.
+// src_stride/dst_stride: row strides, in uint16 elements.
+// filtering: kFilterLinear skips the vertical blend; any other mode blends.
+void ScalePlaneBilinearDown_16(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr,
+                               enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row buffer of Abs(src_width) uint16 values. The magnitude is
+  // used because src_width has not been normalized yet at this point and
+  // InterpolateRow below writes Abs(src_width) elements into the buffer.
+  align_buffer_64(row, Abs(src_width) * 2);
+
+  const int max_y = (src_height - 1) << 16;
+  int j;
+  // The 64-bit column filter is selected by magnitude as well, matching the
+  // src_width < 32768 test applied to the SIMD variant below.
+  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+      int dst_width, int x, int dx) =
+      (Abs(src_width) >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
+  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    InterpolateRow = InterpolateRow_Any_16_DSPR2;
+    if (IS_ALIGNED(src_width, 4)) {
+      InterpolateRow = InterpolateRow_16_DSPR2;
+    }
+  }
+#endif
+
+
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+  }
+#endif
+  // Keep the starting row inside the source.
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    int yi = y >> 16;
+    const uint16* src = src_ptr + yi * src_stride;
+    if (filtering == kFilterLinear) {
+      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+    } else {
+      // Blend this row with the next one using the top 8 fraction bits of y,
+      // then filter the blended row horizontally.
+      int yf = (y >> 8) & 255;
+      InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
+    }
+    dst_ptr += dst_stride;
+    y += dy;
+    if (y > max_y) {
+      y = max_y;
+    }
+  }
+  free_aligned_buffer_64(row);
+}
+
+// Scale plane up with bilinear interpolation.
+//
+// Source rows are first scaled horizontally into a pair of temporary rows by
+// ScaleFilterCols; each destination row is then produced from that pair by
+// InterpolateRow. The pair is refreshed only when the integer source row
+// changes.
+void ScalePlaneBilinearUp(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          int src_stride, int dst_stride,
+                          const uint8* src_ptr, uint8* dst_ptr,
+                          enum FilterMode filtering) {
+  int j;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // Largest y (16.16) that still addresses a row inside the source.
+  const int max_y = (src_height - 1) << 16;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  // Without filtering, columns are scaled by ScaleCols_C instead of the
+  // filtering column scaler.
+  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleFilterCols_C : ScaleCols_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+  // InterpolateRow works on already-widened rows, so the alignment tests
+  // below use dst_width.
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    InterpolateRow = InterpolateRow_Any_DSPR2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_DSPR2;
+    }
+  }
+#endif
+
+  // Sources 32768 pixels wide or more use the 64-bit column filter; the SIMD
+  // column filters below are only chosen for narrower sources.
+  if (filtering && src_width >= 32768) {
+    ScaleFilterCols = ScaleFilterCols64_C;
+  }
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleFilterCols_NEON;
+    }
+  }
+#endif
+  // Unfiltered exact 2x horizontal upscale gets a dedicated column routine.
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleFilterCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleColsUp2_SSE2;
+    }
+#endif
+  }
+
+  // Keep the starting row inside the source.
+  if (y > max_y) {
+    y = max_y;
+  }
+  {
+    int yi = y >> 16;
+    const uint8* src = src_ptr + yi * src_stride;
+
+    // Allocate 2 row buffers.
+    // Each row is dst_width rounded up to a multiple of 32 bytes.
+    const int kRowSize = (dst_width + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+
+    // rowptr is the row for source line yi; rowptr + rowstride is the row for
+    // the following source line. rowstride flips sign as the two swap roles.
+    uint8* rowptr = row;
+    int rowstride = kRowSize;
+    int lasty = yi;
+
+    // Prime both rows. With a single-row source the same row is used twice.
+    ScaleFilterCols(rowptr, src, dst_width, x, dx);
+    if (src_height > 1) {
+      src += src_stride;
+    }
+    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+    src += src_stride;
+
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        // Clamp to the last source row and re-derive the source pointer.
+        if (y > max_y) {
+          y = max_y;
+          yi = y >> 16;
+          src = src_ptr + yi * src_stride;
+        }
+        if (yi != lasty) {
+          // Overwrite the older row with the next source row, then swap the
+          // roles of the two rows by stepping rowptr and negating rowstride.
+          ScaleFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;
+          rowstride = -rowstride;
+          lasty = yi;
+          src += src_stride;
+        }
+      }
+      if (filtering == kFilterLinear) {
+        // Stride 0 and fraction 0: only the current row contributes.
+        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+      } else {
+        // Blend the two rows using the top 8 fraction bits of y.
+        int yf = (y >> 8) & 255;
+        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+      }
+      dst_ptr += dst_stride;
+      y += dy;
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+// Scale a 16-bit plane up with bilinear interpolation.
+//
+// Source rows are first scaled horizontally into a pair of temporary rows by
+// ScaleFilterCols; each destination row is then produced from that pair by
+// InterpolateRow. The pair is refreshed only when the integer source row
+// changes.
+void ScalePlaneBilinearUp_16(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint16* src_ptr, uint16* dst_ptr,
+                             enum FilterMode filtering) {
+  int j;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // Largest y (16.16) that still addresses a row inside the source.
+  const int max_y = (src_height - 1) << 16;
+  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  // Without filtering, columns are scaled by ScaleCols_16_C instead of the
+  // filtering column scaler.
+  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+  // InterpolateRow works on already-widened rows, so the alignment tests
+  // below use dst_width.
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2)) {
+    InterpolateRow = InterpolateRow_Any_16_DSPR2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_16_DSPR2;
+    }
+  }
+#endif
+
+  // Sources 32768 pixels wide or more use the 64-bit column filter; the SIMD
+  // column filter below is only chosen for narrower sources.
+  if (filtering && src_width >= 32768) {
+    ScaleFilterCols = ScaleFilterCols64_16_C;
+  }
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+  }
+#endif
+  // Unfiltered exact 2x horizontal upscale gets a dedicated column routine.
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleFilterCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleColsUp2_16_SSE2;
+    }
+#endif
+  }
+
+  // Keep the starting row inside the source.
+  if (y > max_y) {
+    y = max_y;
+  }
+  {
+    int yi = y >> 16;
+    const uint16* src = src_ptr + yi * src_stride;
+
+    // Allocate 2 row buffers.
+    // Each row is kRowSize uint16 elements (dst_width rounded up to a
+    // multiple of 32), hence kRowSize * 4 bytes for the pair.
+    const int kRowSize = (dst_width + 31) & ~31;
+    align_buffer_64(row, kRowSize * 4);
+
+    // rowptr is the row for source line yi; rowptr + rowstride is the row for
+    // the following source line. rowstride flips sign as the two swap roles.
+    uint16* rowptr = (uint16*)row;
+    int rowstride = kRowSize;
+    int lasty = yi;
+
+    // Prime both rows. With a single-row source the same row is used twice.
+    ScaleFilterCols(rowptr, src, dst_width, x, dx);
+    if (src_height > 1) {
+      src += src_stride;
+    }
+    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+    src += src_stride;
+
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        // Clamp to the last source row and re-derive the source pointer.
+        if (y > max_y) {
+          y = max_y;
+          yi = y >> 16;
+          src = src_ptr + yi * src_stride;
+        }
+        if (yi != lasty) {
+          // Overwrite the older row with the next source row, then swap the
+          // roles of the two rows by stepping rowptr and negating rowstride.
+          ScaleFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;
+          rowstride = -rowstride;
+          lasty = yi;
+          src += src_stride;
+        }
+      }
+      if (filtering == kFilterLinear) {
+        // Stride 0 and fraction 0: only the current row contributes.
+        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+      } else {
+        // Blend the two rows using the top 8 fraction bits of y.
+        int yf = (y >> 8) & 255;
+        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+      }
+      dst_ptr += dst_stride;
+      y += dy;
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+// Scale plane to/from any dimensions, without interpolation.
+// Positions are 16.16 fixed point for speed: the upper 16 bits of x and dx
+// hold the integer part of the source position, the lower 16 bits the
+// fraction.
+
+static void ScalePlaneSimple(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint8* src_ptr, uint8* dst_ptr) {
+  // Source position (x, y) and step (dx, dy), all 16.16 fixed point; filled
+  // in by ScaleSlope.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  int rows_left;
+  void (*cols_fn)(uint8* dst_ptr, const uint8* src_ptr,
+      int dst_width, int x, int dx) = ScaleCols_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+  // Exact 2x horizontal upscale gets a dedicated column routine.
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    cols_fn = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      cols_fn = ScaleColsUp2_SSE2;
+    }
+#endif
+  }
+
+  // One destination row per pass; the source row is the integer part of y.
+  for (rows_left = dst_height; rows_left > 0; --rows_left) {
+    const uint8* src_row = src_ptr + (y >> 16) * src_stride;
+    cols_fn(dst_ptr, src_row, dst_width, x, dx);
+    dst_ptr += dst_stride;
+    y += dy;
+  }
+}
+
+// 16-bit version of the non-interpolating plane scaler. Each destination row
+// is produced from the single source row addressed by the integer part of
+// the 16.16 fixed-point y position.
+static void ScalePlaneSimple_16(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint16* src_ptr, uint16* dst_ptr) {
+  // Source position (x, y) and step (dx, dy), all 16.16 fixed point; filled
+  // in by ScaleSlope.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  int rows_left;
+  void (*cols_fn)(uint16* dst_ptr, const uint16* src_ptr,
+      int dst_width, int x, int dx) = ScaleCols_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+  // Exact 2x horizontal upscale gets a dedicated column routine.
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    cols_fn = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      cols_fn = ScaleColsUp2_16_SSE2;
+    }
+#endif
+  }
+
+  // One destination row per pass; the source row is the integer part of y.
+  for (rows_left = dst_height; rows_left > 0; --rows_left) {
+    const uint16* src_row = src_ptr + (y >> 16) * src_stride;
+    cols_fn(dst_ptr, src_row, dst_width, x, dx);
+    dst_ptr += dst_stride;
+    y += dy;
+  }
+}
+
+// Scale a plane of uint8 samples from src to dst.
+// Dispatches to a specialized scaler based on scale factor and filtering.
+
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+                int src_width, int src_height,
+                uint8* dst, int dst_stride,
+                int dst_width, int dst_height,
+                enum FilterMode filtering) {
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height, filtering);
+
+  // Negative height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;  // Start at the last row.
+    src_stride = -src_stride;  // Walk the source rows bottom-up.
+  }
+
+  // Use specialized scales to improve performance for common resolutions.
+  // For example, all the 1/2 scalings will use ScalePlaneDown2()
+  if (dst_width == src_width && dst_height == src_height) {
+    // Straight copy.
+    CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
+    return;
+  }
+  if (dst_width == src_width && filtering != kFilterBox) {
+    int dy = FixedDiv(src_height, dst_height);  // Vertical step per dst row.
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical(src_height,
+                       dst_width, dst_height,
+                       src_stride, dst_stride, src, dst,
+                       0, 0, dy, 1, filtering);
+    return;
+  }
+  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+    // Scale down.
+    if (4 * dst_width == 3 * src_width &&
+        4 * dst_height == 3 * src_height) {
+      // optimized, 3/4
+      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+      // optimized, 1/2
+      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    // 3/8 rounded up for odd sized chroma height.
+    if (8 * dst_width == 3 * src_width &&
+        dst_height == ((src_height * 3 + 7) / 8)) {
+      // optimized, 3/8
+      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+        (filtering == kFilterBox || filtering == kFilterNone)) {
+      // optimized, 1/4
+      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+  }
+  if (filtering == kFilterBox && dst_height * 2 < src_height) {  // > 2x down.
+    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+                  src_stride, dst_stride, src, dst);
+    return;
+  }
+  if (filtering && dst_height > src_height) {  // Filtered, taller output.
+    ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  if (filtering) {  // Any remaining filtered case.
+    ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+                           src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  ScalePlaneSimple(src_width, src_height, dst_width, dst_height,  // Unfiltered.
+                   src_stride, dst_stride, src, dst);
+}
+
+LIBYUV_API  // uint16 counterpart of ScalePlane: dispatches on scale factor.
+void ScalePlane_16(const uint16* src, int src_stride,
+                  int src_width, int src_height,
+                  uint16* dst, int dst_stride,
+                  int dst_width, int dst_height,
+                  enum FilterMode filtering) {
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height, filtering);
+
+  // Negative height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;  // Start at the last row.
+    src_stride = -src_stride;  // Walk the source rows bottom-up.
+  }
+
+  // Use specialized scales to improve performance for common resolutions.
+  // For example, all the 1/2 scalings will use ScalePlaneDown2_16()
+  if (dst_width == src_width && dst_height == src_height) {
+    // Straight copy.
+    CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
+    return;
+  }
+  if (dst_width == src_width && filtering != kFilterBox) {  // As ScalePlane.
+    int dy = FixedDiv(src_height, dst_height);  // Vertical step per dst row.
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical_16(src_height,
+                          dst_width, dst_height,
+                          src_stride, dst_stride, src, dst,
+                          0, 0, dy, 1, filtering);
+    return;
+  }
+  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+    // Scale down.
+    if (4 * dst_width == 3 * src_width &&
+        4 * dst_height == 3 * src_height) {
+      // optimized, 3/4
+      ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
+                          src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+      // optimized, 1/2
+      ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    // 3/8 rounded up for odd sized chroma height.
+    if (8 * dst_width == 3 * src_width &&
+        dst_height == ((src_height * 3 + 7) / 8)) {
+      // optimized, 3/8
+      ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
+                          src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+        (filtering == kFilterBox || filtering == kFilterNone)) {
+      // optimized, 1/4 (box or unfiltered only, matching ScalePlane)
+      ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+  }
+  if (filtering == kFilterBox && dst_height * 2 < src_height) {  // > 2x down.
+    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
+                     src_stride, dst_stride, src, dst);
+    return;
+  }
+  if (filtering && dst_height > src_height) {  // Filtered, taller output.
+    ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
+                            src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  if (filtering) {  // Any remaining filtered case.
+    ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
+                              src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst);  // Unfiltered.
+}
+
+// Scale an I420 image (separate Y, U and V planes of uint8 samples).
+// Validates the arguments, then calls ScalePlane once for each plane.
+
+LIBYUV_API
+int I420Scale(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              int src_width, int src_height,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int dst_width, int dst_height,
+              enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);  // U/V source width.
+  int src_halfheight = SUBSAMPLE(src_height, 1, 1);  // U/V source height.
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);  // U/V destination width.
+  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);  // U/V destination height.
+  if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 ||
+      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+    return -1;  // Rejected arguments; negative source sizes are not rejected.
+  }
+
+  ScalePlane(src_y, src_stride_y, src_width, src_height,
+             dst_y, dst_stride_y, dst_width, dst_height,
+             filtering);
+  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+             filtering);
+  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+             filtering);
+  return 0;  // All three planes were scaled.
+}
+
+LIBYUV_API  // uint16 counterpart of I420Scale; calls ScalePlane_16 per plane.
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+                 const uint16* src_u, int src_stride_u,
+                 const uint16* src_v, int src_stride_v,
+                 int src_width, int src_height,
+                 uint16* dst_y, int dst_stride_y,
+                 uint16* dst_u, int dst_stride_u,
+                 uint16* dst_v, int dst_stride_v,
+                 int dst_width, int dst_height,
+                 enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);  // U/V source width.
+  int src_halfheight = SUBSAMPLE(src_height, 1, 1);  // U/V source height.
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);  // U/V destination width.
+  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);  // U/V destination height.
+  if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 ||
+      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+    return -1;  // Rejected arguments; negative source sizes are not rejected.
+  }
+
+  ScalePlane_16(src_y, src_stride_y, src_width, src_height,
+                dst_y, dst_stride_y, dst_width, dst_height,
+                filtering);
+  ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
+                dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+                filtering);
+  ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
+                dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+                filtering);
+  return 0;  // All three planes were scaled.
+}
+
+// Deprecated api: forwards to I420Scale with kFilterBox or kFilterNone.
+LIBYUV_API
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+          int src_stride_y, int src_stride_u, int src_stride_v,
+          int src_width, int src_height,
+          uint8* dst_y, uint8* dst_u, uint8* dst_v,
+          int dst_stride_y, int dst_stride_u, int dst_stride_v,
+          int dst_width, int dst_height,
+          LIBYUV_BOOL interpolate) {
+  return I420Scale(src_y, src_stride_y,  // Arguments regrouped per plane.
+                   src_u, src_stride_u,
+                   src_v, src_stride_v,
+                   src_width, src_height,
+                   dst_y, dst_stride_y,
+                   dst_u, dst_stride_u,
+                   dst_v, dst_stride_v,
+                   dst_width, dst_height,
+                   interpolate ? kFilterBox : kFilterNone);
+}
+
+// Deprecated api: scales contiguous I420 src into dst at a row offset.
+LIBYUV_API
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+                LIBYUV_BOOL interpolate) {
+  // Chroma requires offset to multiple of 2.
+  int dst_yoffset_even = dst_yoffset & ~1;
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);  // U/V source width.
+  int src_halfheight = SUBSAMPLE(src_height, 1, 1);  // U/V source height.
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);  // U/V destination width.
+  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);  // U/V destination height.
+  int aheight = dst_height - dst_yoffset_even * 2;  // actual output height
+  const uint8* src_y = src;  // Y plane is first; U then V follow it.
+  const uint8* src_u = src + src_width * src_height;
+  const uint8* src_v = src + src_width * src_height +
+                             src_halfwidth * src_halfheight;
+  uint8* dst_y = dst + dst_yoffset_even * dst_width;  // Skip the offset rows.
+  uint8* dst_u = dst + dst_width * dst_height +
+                 (dst_yoffset_even >> 1) * dst_halfwidth;  // Half the rows.
+  uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
+                 (dst_yoffset_even >> 1) * dst_halfwidth;
+  if (!src || src_width <= 0 || src_height <= 0 ||
+      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
+      dst_yoffset_even >= dst_height) {
+    return -1;  // Rejected arguments; the pointers above are not used.
+  }
+  return I420Scale(src_y, src_width,  // Each stride equals its plane width.
+                   src_u, src_halfwidth,
+                   src_v, src_halfwidth,
+                   src_width, src_height,
+                   dst_y, dst_width,
+                   dst_u, dst_halfwidth,
+                   dst_v, dst_halfwidth,
+                   dst_width, aheight,
+                   interpolate ? kFilterBox : kFilterNone);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/scale_any.cc b/libs/libyuv/source/scale_any.cc
new file mode 100644
index 0000000000..ed76a9e4c0
--- /dev/null
+++ b/libs/libyuv/source/scale_any.cc
@@ -0,0 +1,221 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Any-width ScaleFilterCols/ScaleARGBCols/ScaleARGBFilterCols: SIMD + C tail.
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
+    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
+                 int dst_width, int x, int dx) {                               \
+      int n = dst_width & ~MASK;                                               \
+      if (n > 0) {                                                             \
+        TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                 \
+      }                                                                        \
+      TERP_C(dst_ptr + n * BPP, src_ptr,                                       \
+             dst_width & MASK, x + n * dx, dx);                                \
+    }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif  // HAS_SCALEFILTERCOLS_NEON
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif  // HAS_SCALEARGBCOLS_NEON
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
+     ScaleARGBFilterCols_C, 4, 3)
+#endif  // HAS_SCALEARGBFILTERCOLS_NEON
+#undef CANY  // Instantiated above only; arguments are (BPP, MASK) last.
+
+// Fixed scale down, any width: SIMD for whole (MASK + 1) groups, C for rest.
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
+                 uint8* dst_ptr, int dst_width) {                              \
+      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
+      int n = dst_width - r;                                                   \
+      if (n > 0) {                                                             \
+        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
+      }                                                                        \
+      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
+                     dst_ptr + n * BPP, r);                                    \
+    }
+
+// Fixed scale down for odd source width.  Used by I420Blend subsampling.
+// Since dst_width is (width + 1) / 2, SIMD handles whole (MASK + 1) groups
+// of the first dst_width - 1 pixels; the C row does the rest and last pixel.
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
+                 uint8* dst_ptr, int dst_width) {                              \
+      int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1));               \
+      int n = (dst_width - 1) - r;                                             \
+      if (n > 0) {                                                             \
+        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
+      }                                                                        \
+      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
+                     dst_ptr + n * BPP, r + 1);                                \
+    }
+
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
+      ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
+      2, 1, 15)
+SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
+      ScaleRowDown2Box_Odd_C, 2, 1, 15)
+#endif  // HAS_SCALEROWDOWN2_SSSE3
+#ifdef HAS_SCALEROWDOWN2_AVX2
+SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
+      ScaleRowDown2Linear_C, 2, 1, 31)
+SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
+      2, 1, 31)
+SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
+      2, 1, 31)
+#endif  // HAS_SCALEROWDOWN2_AVX2
+#ifdef HAS_SCALEROWDOWN2_NEON
+SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
+      ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_C, 2, 1, 15)
+SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_Odd_C, 2, 1, 15)
+#endif  // HAS_SCALEROWDOWN2_NEON
+#ifdef HAS_SCALEROWDOWN4_SSSE3
+SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
+      4, 1, 7)
+#endif  // HAS_SCALEROWDOWN4_SSSE3
+#ifdef HAS_SCALEROWDOWN4_AVX2
+SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
+      4, 1, 15)
+#endif  // HAS_SCALEROWDOWN4_AVX2
+#ifdef HAS_SCALEROWDOWN4_NEON
+SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
+      4, 1, 7)
+#endif  // HAS_SCALEROWDOWN4_NEON
+#ifdef HAS_SCALEROWDOWN34_SSSE3
+SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
+      ScaleRowDown34_C, 4 / 3, 1, 23)  // FACTOR expands to (n * 4 / 3).
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
+      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
+      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif  // HAS_SCALEROWDOWN34_SSSE3
+#ifdef HAS_SCALEROWDOWN34_NEON
+SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
+      ScaleRowDown34_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
+      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
+      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif  // HAS_SCALEROWDOWN34_NEON
+#ifdef HAS_SCALEROWDOWN38_SSSE3
+SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
+      ScaleRowDown38_C, 8 / 3, 1, 11)  // FACTOR expands to (n * 8 / 3).
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
+      ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
+      ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
+#endif  // HAS_SCALEROWDOWN38_SSSE3
+#ifdef HAS_SCALEROWDOWN38_NEON
+SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
+      ScaleRowDown38_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
+      ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
+      ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
+#endif  // HAS_SCALEROWDOWN38_NEON
+
+#ifdef HAS_SCALEARGBROWDOWN2_SSE2
+SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
+      ScaleARGBRowDown2_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
+      ScaleARGBRowDown2Linear_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
+      ScaleARGBRowDown2Box_C, 2, 4, 3)
+#endif  // HAS_SCALEARGBROWDOWN2_SSE2
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
+      ScaleARGBRowDown2_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
+      ScaleARGBRowDown2Linear_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
+      ScaleARGBRowDown2Box_C, 2, 4, 7)
+#endif  // HAS_SCALEARGBROWDOWN2_NEON
+#undef SDANY  // Note: SDODD is left defined.
+
+// Scale down by even factor, any width; src_stepx = src pixels per dst pixel.
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)          \
+    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx,    \
+                 uint8* dst_ptr, int dst_width) {                              \
+      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
+      int n = dst_width - r;                                                   \
+      if (n > 0) {                                                             \
+        SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);         \
+      }                                                                        \
+      SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride,              \
+                     src_stepx, dst_ptr + n * BPP, r);                         \
+    }
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
+SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
+       ScaleARGBRowDownEven_C, 4, 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
+       ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif  // HAS_SCALEARGBROWDOWNEVEN_SSE2
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
+       ScaleARGBRowDownEven_C, 4, 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
+       ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif  // HAS_SCALEARGBROWDOWNEVEN_NEON
+
+// Add rows for box filter scale down, any width: SIMD part then C remainder.
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)                  \
+  void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) {         \
+      int n = src_width & ~MASK;                                               \
+      if (n > 0) {                                                             \
+        SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                                 \
+      }                                                                        \
+      SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);               \
+    }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
+#endif  // HAS_SCALEADDROW_SSE2
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
+#endif  // HAS_SCALEADDROW_AVX2
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
+#endif  // HAS_SCALEADDROW_NEON
+#undef SAANY  // Last argument above is MASK (SIMD group size minus 1).
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+
+
+
+
diff --git a/libs/libyuv/source/scale_argb.cc b/libs/libyuv/source/scale_argb.cc
new file mode 100644
index 0000000000..17f51ae9bf
--- /dev/null
+++ b/libs/libyuv/source/scale_argb.cc
@@ -0,0 +1,859 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {  // Absolute value of v.
+  return v >= 0 ? v : -v;  // NOTE: -v overflows when v is INT_MIN.
+}
+
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down an ARGB image to 1/2 of
+// its original width; dy must be a multiple of 2.0 in 16.16 (see asserts).
+static void ScaleARGBDown2(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint8* src_argb, uint8* dst_argb,
+                           int x, int dx, int y, int dy,
+                           enum FilterMode filtering) {
+  int j;  // Destination row counter.
+  int row_stride = src_stride * (dy >> 16);  // Source bytes per output row.
+  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) =
+    filtering == kFilterNone ? ScaleARGBRowDown2_C :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+        ScaleARGBRowDown2Box_C);
+  assert(dx == 65536 * 2);  // Test scale factor of 2.
+  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
+  // Advance to odd row, even column.
+  if (filtering == kFilterBilinear) {
+    src_argb += (y >> 16) * src_stride + (x >> 16) * 4;  // 4 bytes per pixel.
+  } else {
+    // NOTE(review): starts one pixel left of x >> 16 here; confirm intent.
+    src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+  }
+
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
+        ScaleARGBRowDown2Box_Any_SSE2);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+          ScaleARGBRowDown2Box_SSE2);
+    }
+  }
+#endif  // HAS_SCALEARGBROWDOWN2_SSE2
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
+        ScaleARGBRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+          ScaleARGBRowDown2Box_NEON);
+    }
+  }
+#endif  // HAS_SCALEARGBROWDOWN2_NEON
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;  // Row kernel gets stride 0; row_stride is unaffected.
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+}
+
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down an ARGB image to 1/4 of
+// its original width, built from three calls to the 1/2 box row kernel.
+static void ScaleARGBDown4Box(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_argb, uint8* dst_argb,
+                              int x, int dx, int y, int dy) {
+  int j;  // Destination row counter.
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;  // Rounded up to 32.
+  align_buffer_64(row, kRowSize * 2);
+  int row_stride = src_stride * (dy >> 16);  // Source bytes per output row.
+  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+    uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+  // Advance to odd row, even column.
+  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  assert(dx == 65536 * 4);  // Test scale factor of 4.
+  assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+    }
+  }
+#endif  // HAS_SCALEARGBROWDOWN2_SSE2
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+    }
+  }
+#endif  // HAS_SCALEARGBROWDOWN2_NEON
+
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);  // First half.
+    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
+                      row + kRowSize, dst_width * 2);  // Second temp row.
+    ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);  // Temp rows to dst.
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+// ScaleARGB ARGB Even
+// This is an optimized version for scaling down an ARGB image by an even
+// factor; the row kernel steps col_step (dx >> 16) source pixels per output.
+static void ScaleARGBDownEven(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_argb, uint8* dst_argb,
+                              int x, int dx, int y, int dy,
+                              enum FilterMode filtering) {
+  int j;  // Destination row counter.
+  int col_step = dx >> 16;  // Integer part of the horizontal step.
+  int row_stride = (dy >> 16) * src_stride;  // Source bytes per output row.
+  void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_step, uint8* dst_argb, int dst_width) =
+      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+  assert(IS_ALIGNED(src_width, 2));
+  assert(IS_ALIGNED(src_height, 2));
+  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;  // 4 bytes per pixel.
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
+        ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+          ScaleARGBRowDownEven_SSE2;
+    }
+  }
+#endif  // HAS_SCALEARGBROWDOWNEVEN_SSE2
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
+        ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+          ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif  // HAS_SCALEARGBROWDOWNEVEN_NEON
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;  // Row kernel gets stride 0; row_stride is unaffected.
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+}
+
+// Scale ARGB down with bilinear interpolation (row blend, then column filter).
+static void ScaleARGBBilinearDown(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  int src_stride, int dst_stride,
+                                  const uint8* src_argb, uint8* dst_argb,
+                                  int x, int dx, int y, int dy,
+                                  enum FilterMode filtering) {
+  int j;  // Destination row counter.
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+  int64 xlast = x + (int64)(dst_width - 1) * dx;  // x of the last dst pixel.
+  int64 xl = (dx >= 0) ? x : xlast;  // Smaller of the two end positions.
+  int64 xr = (dx >= 0) ? xlast : x;  // Larger of the two end positions.
+  int clip_src_width;
+  xl = (xl >> 16) & ~3;  // Left edge aligned.
+  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
+  xr = (xr + 1 + 3) & ~3;  // 1 beyond 4 pixel aligned right most pixel.
+  if (xr > src_width) {
+    xr = src_width;
+  }
+  clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.
+  src_argb += xl * 4;  // Skip the source pixels left of the clipped span.
+  x -= (int)(xl << 16);  // Make x relative to the clipped span.
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif  // HAS_INTERPOLATEROW_SSSE3
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(clip_src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif  // HAS_INTERPOLATEROW_AVX2
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif  // HAS_INTERPOLATEROW_NEON
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_DSPR2;
+    if (IS_ALIGNED(clip_src_width, 4)) {
+      InterpolateRow = InterpolateRow_DSPR2;
+    }
+  }
+#endif  // HAS_INTERPOLATEROW_DSPR2
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif  // HAS_SCALEARGBFILTERCOLS_SSSE3
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {  // NOTE(review): no src_width < 32768 check.
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif  // HAS_SCALEARGBFILTERCOLS_NEON
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row of ARGB.
+  {
+    // NOTE(review): clip_src_width is already in bytes; "* 4" looks oversized.
+    align_buffer_64(row, clip_src_width * 4);
+
+    const int max_y = (src_height - 1) << 16;  // Last source row in 16.16.
+    if (y > max_y) {
+      y = max_y;
+    }
+    for (j = 0; j < dst_height; ++j) {
+      int yi = y >> 16;  // Integer source row.
+      const uint8* src = src_argb + yi * src_stride;
+      if (filtering == kFilterLinear) {
+        ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);  // No row blend.
+      } else {
+        int yf = (y >> 8) & 255;  // Top 8 bits of the row fraction.
+        InterpolateRow(row, src, src_stride, clip_src_width, yf);
+        ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+      }
+      dst_argb += dst_stride;
+      y += dy;
+      if (y > max_y) {  // Clamp to the last source row.
+        y = max_y;
+      }
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+// Scale ARGB up with bilinear interpolation.  Two horizontally scaled source
+// rows are cached and blended vertically for every destination row.
+static void ScaleARGBBilinearUp(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint8* src_argb, uint8* dst_argb,
+                                int x, int dx, int y, int dy,
+                                enum FilterMode filtering) {
+  int j;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+  const int max_y = (src_height - 1) << 16;  // Last source row, as 16.16.
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_DSPR2;
+  }
+#endif
+  if (src_width >= 32768) {
+    ScaleARGBFilterCols = filtering ?
+        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+  if (y > max_y) {
+    y = max_y;
+  }
+  {
+    int yi = y >> 16;  // Integer source row.
+    const uint8* src = src_argb + yi * src_stride;
+    // Allocate 2 rows of ARGB, each padded to a multiple of 32 bytes.
+    const int kRowSize = (dst_width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+    uint8* rowptr = row;  // Cached row for source row lasty.
+    int rowstride = kRowSize;  // Offset to the other cached row; sign flips.
+    int lasty = yi;
+    ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+    if (src_height > 1) {  // A single-row source reuses row 0 below.
+      src += src_stride;
+    }
+    ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+    if (y < max_y - 65536) {  // Advance only while a row yi + 2 exists.
+      src += src_stride;
+    }
+
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        if (y > max_y) {  // Clamp to the last source row.
+          y = max_y;
+          yi = y >> 16;
+          src = src_argb + yi * src_stride;
+        }
+        if (yi != lasty) {
+          ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;  // Swap the roles of the two cached rows.
+          rowstride = -rowstride;
+          lasty = yi;
+          if (y < max_y - 65536) {  // Keep src inside the source image.
+            src += src_stride;
+          }
+        }
+      }
+      if (filtering == kFilterLinear) {  // No vertical blend requested.
+        InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+      } else {
+        int yf = (y >> 8) & 255;  // Top 8 bits of the row fraction.
+        InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+      }
+      dst_argb += dst_stride;
+      y += dy;
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+#ifdef YUVSCALEUP
+// Scale YUV to ARGB up with bilinear interpolation (built only if YUVSCALEUP).
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
+                                     int dst_width, int dst_height,
+                                     int src_stride_y,
+                                     int src_stride_u,
+                                     int src_stride_v,
+                                     int dst_stride_argb,
+                                     const uint8* src_y,
+                                     const uint8* src_u,
+                                     const uint8* src_v,
+                                     uint8* dst_argb,
+                                     int x, int dx, int y, int dy,
+                                     enum FilterMode filtering) {
+  int j;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(src_width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_DSPR2;
+  }
+#endif
+
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    InterpolateRow = InterpolateRow_DSPR2;
+  }
+#endif
+
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+  if (src_width >= 32768) {
+    ScaleARGBFilterCols = filtering ?
+        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  const int max_y = (src_height - 1) << 16;  // Last source row, as 16.16.
+  if (y > max_y) {
+    y = max_y;
+  }
+  const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
+  int yi = y >> 16;
+  int uv_yi = yi >> kYShift;
+  const uint8* src_row_y = src_y + yi * src_stride_y;
+  const uint8* src_row_u = src_u + uv_yi * src_stride_u;
+  const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+
+  // Allocate 2 rows of ARGB, each padded to a multiple of 32 bytes.
+  const int kRowSize = (dst_width * 4 + 31) & ~31;
+  align_buffer_64(row, kRowSize * 2);
+
+  // Allocate 1 row of ARGB for source conversion.
+  align_buffer_64(argb_row, src_width * 4);
+
+  uint8* rowptr = row;  // Cached row for source row lasty.
+  int rowstride = kRowSize;  // Offset to the other cached row; sign flips.
+  int lasty = yi;
+
+  // TODO(fbarchard): Convert first 2 rows of YUV to ARGB (raw Y is used here).
+  ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
+  if (src_height > 1) {
+    src_row_y += src_stride_y;
+    if (yi & 1) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+  ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
+  if (src_height > 2) {
+    src_row_y += src_stride_y;
+    if (!(yi & 1)) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lasty) {
+      if (y > max_y) {  // Clamp to the last source row.
+        y = max_y;
+        yi = y >> 16;
+        uv_yi = yi >> kYShift;
+        src_row_y = src_y + yi * src_stride_y;
+        src_row_u = src_u + uv_yi * src_stride_u;
+        src_row_v = src_v + uv_yi * src_stride_v;
+      }
+      if (yi != lasty) {
+        // TODO(fbarchard): Convert the clipped region of row.
+        I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+        ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+        rowptr += rowstride;  // Swap the roles of the two cached rows.
+        rowstride = -rowstride;
+        lasty = yi;
+        src_row_y += src_stride_y;  // NOTE(review): not bounded by src_height.
+        if (yi & 1) {
+          src_row_u += src_stride_u;
+          src_row_v += src_stride_v;
+        }
+      }
+    }
+    if (filtering == kFilterLinear) {  // No vertical blend requested.
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+    } else {
+      int yf = (y >> 8) & 255;  // Top 8 bits of the row fraction.
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  free_aligned_buffer_64(argb_row);
+}
+#endif
+
+// Scale ARGB to/from any dimensions by point sampling (no interpolation).
+// Positions and steps are 16.16 fixed point for speed: the high 16 bits of
+// x/dx (and y/dy) hold the integer source coordinate, the low 16 bits the
+// fraction.
+
+static void ScaleARGBSimple(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_argb, uint8* dst_argb,
+                            int x, int dx, int y, int dy) {
+  int row;
+  void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) = ScaleARGBCols_C;
+  if (src_width >= 32768) {
+    ScaleARGBCols = ScaleARGBCols64_C;
+  }
+  // CPU-specific column scalers override the C version; later matches win.
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBCols = IS_ALIGNED(dst_width, 8) ?
+        ScaleARGBCols_NEON : ScaleARGBCols_Any_NEON;
+  }
+#endif
+  if (src_width * 2 == dst_width && x < 0x8000) {  // Exact 2x widening.
+    ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  // Each destination row samples the source row at the integer part of y.
+  for (row = 0; row < dst_height; ++row, dst_argb += dst_stride, y += dy) {
+    const uint8* src_row = src_argb + (y >> 16) * src_stride;
+    ScaleARGBCols(dst_argb, src_row, dst_width, x, dx);
+  }
+}
+
+// Scale an ARGB image, writing only the clip rectangle of the destination.
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static void ScaleARGB(const uint8* src, int src_stride,
+                      int src_width, int src_height,
+                      uint8* dst, int dst_stride,
+                      int dst_width, int dst_height,
+                      int clip_x, int clip_y, int clip_width, int clip_height,
+                      enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // ARGB does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height,
+                                filtering);
+
+  // Negative src_height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+  if (clip_x) {  // Move both images to the clip rectangle's left edge.
+    int64 clipf = (int64)(clip_x) * dx;
+    x += (clipf & 0xffff);  // Fractional part stays in x.
+    src += (clipf >> 16) * 4;  // Whole source pixels, 4 bytes each.
+    dst += clip_x * 4;
+  }
+  if (clip_y) {  // Move both images to the clip rectangle's top edge.
+    int64 clipf = (int64)(clip_y) * dy;
+    y += (clipf & 0xffff);  // Fractional part stays in y.
+    src += (clipf >> 16) * src_stride;
+    dst += clip_y * dst_stride;
+  }
+
+  // Special case for integer step values.
+  if (((dx | dy) & 0xffff) == 0) {
+    if (!dx || !dy) {  // 1 pixel wide and/or tall.
+      filtering = kFilterNone;
+    } else {
+      // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+      if (!(dx & 0x10000) && !(dy & 0x10000)) {
+        if (dx == 0x20000) {
+          // Optimized 1/2 downsample.
+          ScaleARGBDown2(src_width, src_height,
+                         clip_width, clip_height,
+                         src_stride, dst_stride, src, dst,
+                         x, dx, y, dy, filtering);
+          return;
+        }
+        if (dx == 0x40000 && filtering == kFilterBox) {
+          // Optimized 1/4 box downsample.
+          ScaleARGBDown4Box(src_width, src_height,
+                            clip_width, clip_height,
+                            src_stride, dst_stride, src, dst,
+                            x, dx, y, dy);
+          return;
+        }
+        ScaleARGBDownEven(src_width, src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src, dst,
+                          x, dx, y, dy, filtering);
+        return;
+      }
+      // Optimized odd scale down. ie 3, 5, 7, 9x.
+      if ((dx & 0x10000) && (dy & 0x10000)) {
+        filtering = kFilterNone;
+        if (dx == 0x10000 && dy == 0x10000) {
+          // Straight copy.
+          ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
+                   dst, dst_stride, clip_width, clip_height);
+          return;
+        }
+      }
+    }
+  }
+  if (dx == 0x10000 && (x & 0xffff) == 0) {
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical(src_height,
+                       clip_width, clip_height,
+                       src_stride, dst_stride, src, dst,
+                       x, y, dy, 4, filtering);
+    return;
+  }
+  if (filtering && dy < 65536) {  // Filtered, stepping under one row: upscale.
+    ScaleARGBBilinearUp(src_width, src_height,
+                        clip_width, clip_height,
+                        src_stride, dst_stride, src, dst,
+                        x, dx, y, dy, filtering);
+    return;
+  }
+  if (filtering) {  // Filtered, stepping one row or more: downscale.
+    ScaleARGBBilinearDown(src_width, src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src, dst,
+                          x, dx, y, dy, filtering);
+    return;
+  }
+  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+                  src_stride, dst_stride, src, dst,
+                  x, dx, y, dy);
+}
+
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+                  int src_width, int src_height,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int dst_width, int dst_height,
+                  int clip_x, int clip_y, int clip_width, int clip_height,
+                  enum FilterMode filtering) {
+  if (!src_argb || !dst_argb || src_width == 0 || src_height == 0 ||
+      dst_width <= 0 || dst_height <= 0) {
+    return -1;  // Missing buffer or empty image.
+  }
+  if (clip_x < 0 || clip_y < 0 || clip_width > 32768 || clip_height > 32768 ||
+      clip_x + clip_width > dst_width || clip_y + clip_height > dst_height) {
+    return -1;  // Clip rectangle is oversized or leaves the destination.
+  }
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+            dst_argb, dst_stride_argb, dst_width, dst_height,
+            clip_x, clip_y, clip_width, clip_height, filtering);
+  return 0;
+}
+
+// Scale an ARGB image.  Returns 0 on success, -1 for rejected arguments.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+              int src_width, int src_height,
+              uint8* dst_argb, int dst_stride_argb,
+              int dst_width, int dst_height,
+              enum FilterMode filtering) {
+  const int ok = src_argb && dst_argb && src_width != 0 && src_height != 0 &&
+                 src_width <= 32768 && src_height <= 32768 &&
+                 dst_width > 0 && dst_height > 0;
+  if (ok) {
+    ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+              dst_argb, dst_stride_argb, dst_width, dst_height,
+              0, 0, dst_width, dst_height, filtering);
+  }
+  return ok ? 0 : -1;
+}
+
+// Scale with YUV conversion to ARGB and clipping (source is read as I420).
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint32 src_fourcc, int src_width, int src_height,
+                       uint8* dst_argb, int dst_stride_argb,
+                       uint32 dst_fourcc, int dst_width, int dst_height,
+                       int clip_x, int clip_y, int clip_width, int clip_height,
+                       enum FilterMode filtering) {
+  uint8* argb_buffer;
+  int r;
+  // Reject sizes whose int stride (limit INT_MAX / 4) or byte count overflows.
+  if (src_width <= 0 || src_height <= 0 || src_width > (int)(~0u >> 3) ||
+      (size_t)src_width * 4 > (size_t)-1 / (size_t)src_height) {
+    return -1;
+  }
+  argb_buffer = (uint8*)malloc((size_t)src_width * 4 * (size_t)src_height);
+  if (!argb_buffer) {
+    return -1;  // Out of memory.
+  }
+  I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+             argb_buffer, src_width * 4, src_width, src_height);
+  r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height,
+                    dst_argb, dst_stride_argb, dst_width, dst_height,
+                    clip_x, clip_y, clip_width, clip_height, filtering);
+  free(argb_buffer);
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/scale_common.cc b/libs/libyuv/source/scale_common.cc
new file mode 100644
index 0000000000..d3992df2e6
--- /dev/null
+++ b/libs/libyuv/source/scale_common.cc
@@ -0,0 +1,1151 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+  return (v < 0) ? -v : v;  // Absolute value of v.
+}
+
+// CPU agnostic row functions
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width) {
+  // 2:1 point sampling: keep the second pixel of every source pair.
+  int pairs;
+  for (pairs = dst_width / 2; pairs > 0; --pairs) {
+    *dst++ = src_ptr[1];
+    *dst++ = src_ptr[3];
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {  // Odd width: one output pixel is still owed.
+    *dst = src_ptr[1];
+  }
+}
+
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width) {
+  // 16-bit 2:1 point sampling: keep the second pixel of every source pair.
+  int pairs;
+  for (pairs = dst_width / 2; pairs > 0; --pairs) {
+    *dst++ = src_ptr[1];
+    *dst++ = src_ptr[3];
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {  // Odd width: one output pixel is still owed.
+    *dst = src_ptr[1];
+  }
+}
+
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  // 2:1 horizontal filter: each output pixel is the rounded mean of two
+  // neighbouring source pixels.  Only one source row is read.
+  int pairs;
+  for (pairs = dst_width / 2; pairs > 0; --pairs) {
+    *dst++ = (src_ptr[0] + src_ptr[1] + 1) >> 1;
+    *dst++ = (src_ptr[2] + src_ptr[3] + 1) >> 1;
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {  // Odd width: average one more source pair.
+    *dst = (src_ptr[0] + src_ptr[1] + 1) >> 1;
+  }
+}
+
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                              uint16* dst, int dst_width) {
+  // 16-bit 2:1 horizontal filter: each output pixel is the rounded mean of
+  // two neighbouring source pixels.  Only one source row is read.
+  int pairs;
+  for (pairs = dst_width / 2; pairs > 0; --pairs) {
+    *dst++ = (src_ptr[0] + src_ptr[1] + 1) >> 1;
+    *dst++ = (src_ptr[2] + src_ptr[3] + 1) >> 1;
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {  // Odd width: average one more source pair.
+    *dst = (src_ptr[0] + src_ptr[1] + 1) >> 1;
+  }
+}
+
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  // 2:1 box filter: each output pixel is the rounded mean of a 2x2 block
+  // taken from the row at src_ptr and the row src_stride elements after it.
+  const uint8* second_row = src_ptr + src_stride;
+  int pairs;
+  for (pairs = dst_width / 2; pairs > 0; --pairs) {
+    *dst++ = (src_ptr[0] + src_ptr[1] + second_row[0] + second_row[1] + 2) >> 2;
+    *dst++ = (src_ptr[2] + src_ptr[3] + second_row[2] + second_row[3] + 2) >> 2;
+    src_ptr += 4;
+    second_row += 4;
+  }
+  if (dst_width & 1) {  // Odd width: average one more 2x2 block.
+    *dst = (src_ptr[0] + src_ptr[1] + second_row[0] + second_row[1] + 2) >> 2;
+  }
+}
+
+void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  // 2:1 box filter whose last output pixel is built from a single source
+  // column; every other output pixel averages a full 2x2 block.
+  const uint8* second_row = src_ptr + src_stride;
+  const int full = dst_width - 1;  // Outputs backed by a full 2x2 block.
+  int pairs;
+  for (pairs = full / 2; pairs > 0; --pairs) {
+    *dst++ = (src_ptr[0] + src_ptr[1] + second_row[0] + second_row[1] + 2) >> 2;
+    *dst++ = (src_ptr[2] + src_ptr[3] + second_row[2] + second_row[3] + 2) >> 2;
+    src_ptr += 4;
+    second_row += 4;
+  }
+  if (full & 1) {
+    *dst++ = (src_ptr[0] + src_ptr[1] + second_row[0] + second_row[1] + 2) >> 2;
+    src_ptr += 2;
+    second_row += 2;
+  }
+  // Final pixel: average the one remaining column vertically.
+  *dst = (src_ptr[0] + second_row[0] + 1) >> 1;
+}
+
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width) {
+  // 16-bit 2:1 box filter: each output pixel is the rounded mean of a 2x2
+  // block from the row at src_ptr and the row src_stride elements after it.
+  const uint16* second_row = src_ptr + src_stride;
+  int pairs;
+  for (pairs = dst_width / 2; pairs > 0; --pairs) {
+    *dst++ = (src_ptr[0] + src_ptr[1] + second_row[0] + second_row[1] + 2) >> 2;
+    *dst++ = (src_ptr[2] + src_ptr[3] + second_row[2] + second_row[3] + 2) >> 2;
+    src_ptr += 4;
+    second_row += 4;
+  }
+  if (dst_width & 1) {  // Odd width: average one more 2x2 block.
+    *dst = (src_ptr[0] + src_ptr[1] + second_row[0] + second_row[1] + 2) >> 2;
+  }
+}
+
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width) {
+  // 4:1 point sampling: keep the third pixel of every group of four.
+  int pairs;
+  for (pairs = dst_width / 2; pairs > 0; --pairs) {
+    *dst++ = src_ptr[2];
+    *dst++ = src_ptr[6];
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {  // Odd width: one output pixel is still owed.
+    *dst = src_ptr[2];
+  }
+}
+
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width) {
+  // 16-bit 4:1 point sampling: keep the third pixel of every group of four.
+  int pairs;
+  for (pairs = dst_width / 2; pairs > 0; --pairs) {
+    *dst++ = src_ptr[2];
+    *dst++ = src_ptr[6];
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {  // Odd width: one output pixel is still owed.
+    *dst = src_ptr[2];
+  }
+}
+
+// Box-filter one row down by 4.
+// Every output pixel is the rounded average of a 4x4 block of source pixels.
+// src_ptr addresses the first of the four source rows, which are src_stride
+// elements apart; dst receives the averaged pixels.
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  // One pointer per source row contributing to each 4x4 block.
+  const uint8* r0 = src_ptr;
+  const uint8* r1 = r0 + src_stride;
+  const uint8* r2 = r1 + src_stride;
+  const uint8* r3 = r2 + src_stride;
+  int pairs;
+  // Two output pixels per pass; an odd width leaves one for the tail below.
+  for (pairs = dst_width / 2; pairs > 0; --pairs) {
+    *dst++ = (r0[0] + r0[1] + r0[2] + r0[3] +
+              r1[0] + r1[1] + r1[2] + r1[3] +
+              r2[0] + r2[1] + r2[2] + r2[3] +
+              r3[0] + r3[1] + r3[2] + r3[3] + 8) >> 4;
+    *dst++ = (r0[4] + r0[5] + r0[6] + r0[7] +
+              r1[4] + r1[5] + r1[6] + r1[7] +
+              r2[4] + r2[5] + r2[6] + r2[7] +
+              r3[4] + r3[5] + r3[6] + r3[7] + 8) >> 4;
+    r0 += 8;
+    r1 += 8;
+    r2 += 8;
+    r3 += 8;
+  }
+  if (dst_width & 1) {
+    // Odd width: average one more 4x4 block.
+    *dst = (r0[0] + r0[1] + r0[2] + r0[3] +
+            r1[0] + r1[1] + r1[2] + r1[3] +
+            r2[0] + r2[1] + r2[2] + r2[3] +
+            r3[0] + r3[1] + r3[2] + r3[3] + 8) >> 4;
+  }
+}
+
+// Box-filter one row of 16-bit pixels down by 4.
+// Every output pixel is the rounded average of a 4x4 block of source pixels.
+// src_ptr addresses the first of the four source rows, which are src_stride
+// elements apart; dst receives the averaged pixels.
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width) {
+  // One pointer per source row contributing to each 4x4 block.
+  const uint16* r0 = src_ptr;
+  const uint16* r1 = r0 + src_stride;
+  const uint16* r2 = r1 + src_stride;
+  const uint16* r3 = r2 + src_stride;
+  int pairs;
+  // Two output pixels per pass; an odd width leaves one for the tail below.
+  for (pairs = dst_width / 2; pairs > 0; --pairs) {
+    *dst++ = (r0[0] + r0[1] + r0[2] + r0[3] +
+              r1[0] + r1[1] + r1[2] + r1[3] +
+              r2[0] + r2[1] + r2[2] + r2[3] +
+              r3[0] + r3[1] + r3[2] + r3[3] + 8) >> 4;
+    *dst++ = (r0[4] + r0[5] + r0[6] + r0[7] +
+              r1[4] + r1[5] + r1[6] + r1[7] +
+              r2[4] + r2[5] + r2[6] + r2[7] +
+              r3[4] + r3[5] + r3[6] + r3[7] + 8) >> 4;
+    r0 += 8;
+    r1 += 8;
+    r2 += 8;
+    r3 += 8;
+  }
+  if (dst_width & 1) {
+    // Odd width: average one more 4x4 block.
+    *dst = (r0[0] + r0[1] + r0[2] + r0[3] +
+            r1[0] + r1[1] + r1[2] + r1[3] +
+            r2[0] + r2[1] + r2[2] + r2[3] +
+            r3[0] + r3[1] + r3[2] + r3[3] + 8) >> 4;
+  }
+}
+
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width) {
+  // 4:3 point sampling: of every four source pixels, skip the third.
+  int left;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (left = dst_width; left > 0; left -= 3) {
+    *dst++ = src_ptr[0];
+    *dst++ = src_ptr[1];
+    *dst++ = src_ptr[3];
+    src_ptr += 4;
+  }
+}
+
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width) {
+  // 16-bit 4:3 point sampling: of every four source pixels, skip the third.
+  int left;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (left = dst_width; left > 0; left -= 3) {
+    *dst++ = src_ptr[0];
+    *dst++ = src_ptr[1];
+    *dst++ = src_ptr[3];
+    src_ptr += 4;
+  }
+}
+
+// Filter rows 0 and 1 together, 3 : 1
+// (row 0 carries three times the weight of row 1 in each output pixel).
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width) {
+  const uint8* row0 = src_ptr;
+  const uint8* row1 = src_ptr + src_stride;
+  int left;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (left = dst_width; left > 0; left -= 3, row0 += 4, row1 += 4) {
+    // Horizontal pass: 4 pixels -> 3 with weights 3:1, 1:1 and 1:3.
+    const uint8 m0 = (row0[0] * 3 + row0[1] + 2) >> 2;
+    const uint8 m1 = (row0[1] + row0[2] + 1) >> 1;
+    const uint8 m2 = (row0[2] + row0[3] * 3 + 2) >> 2;
+    const uint8 n0 = (row1[0] * 3 + row1[1] + 2) >> 2;
+    const uint8 n1 = (row1[1] + row1[2] + 1) >> 1;
+    const uint8 n2 = (row1[2] + row1[3] * 3 + 2) >> 2;
+    // Vertical pass: blend the two filtered rows 3 : 1.
+    *d++ = (m0 * 3 + n0 + 2) >> 2;
+    *d++ = (m1 * 3 + n1 + 2) >> 2;
+    *d++ = (m2 * 3 + n2 + 2) >> 2;
+  }
+}
+
+// 16-bit variant: filter rows 0 and 1 together, 3 : 1.
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width) {
+  const uint16* row0 = src_ptr;
+  const uint16* row1 = src_ptr + src_stride;
+  int left;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (left = dst_width; left > 0; left -= 3, row0 += 4, row1 += 4) {
+    // Horizontal pass: 4 pixels -> 3 with weights 3:1, 1:1 and 1:3.
+    const uint16 m0 = (row0[0] * 3 + row0[1] + 2) >> 2;
+    const uint16 m1 = (row0[1] + row0[2] + 1) >> 1;
+    const uint16 m2 = (row0[2] + row0[3] * 3 + 2) >> 2;
+    const uint16 n0 = (row1[0] * 3 + row1[1] + 2) >> 2;
+    const uint16 n1 = (row1[1] + row1[2] + 1) >> 1;
+    const uint16 n2 = (row1[2] + row1[3] * 3 + 2) >> 2;
+    // Vertical pass: blend the two filtered rows 3 : 1.
+    *d++ = (m0 * 3 + n0 + 2) >> 2;
+    *d++ = (m1 * 3 + n1 + 2) >> 2;
+    *d++ = (m2 * 3 + n2 + 2) >> 2;
+  }
+}
+
+// Filter rows 1 and 2 together, 1 : 1
+// (both rows contribute equally to each output pixel).
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width) {
+  const uint8* row0 = src_ptr;
+  const uint8* row1 = src_ptr + src_stride;
+  int left;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (left = dst_width; left > 0; left -= 3, row0 += 4, row1 += 4) {
+    // Horizontal pass: 4 pixels -> 3 with weights 3:1, 1:1 and 1:3.
+    const uint8 m0 = (row0[0] * 3 + row0[1] + 2) >> 2;
+    const uint8 m1 = (row0[1] + row0[2] + 1) >> 1;
+    const uint8 m2 = (row0[2] + row0[3] * 3 + 2) >> 2;
+    const uint8 n0 = (row1[0] * 3 + row1[1] + 2) >> 2;
+    const uint8 n1 = (row1[1] + row1[2] + 1) >> 1;
+    const uint8 n2 = (row1[2] + row1[3] * 3 + 2) >> 2;
+    // Vertical pass: rounded mean of the two filtered rows.
+    *d++ = (m0 + n0 + 1) >> 1;
+    *d++ = (m1 + n1 + 1) >> 1;
+    *d++ = (m2 + n2 + 1) >> 1;
+  }
+}
+
+// 16 bit version: reduces every 4 source pixels to 3 per row with 3:1, 1:1
+// and 1:3 horizontal weights, then averages row 0 with row 1.
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width) {
+  const uint16* row0 = src_ptr;
+  const uint16* row1 = src_ptr + src_stride;
+  int remaining = dst_width;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (; remaining > 0; remaining -= 3, row0 += 4, row1 += 4, d += 3) {
+    // Horizontal pass for each row.
+    const uint16 top0 = (row0[0] * 3 + row0[1] + 2) >> 2;
+    const uint16 top1 = (row0[1] + row0[2] + 1) >> 1;
+    const uint16 top2 = (row0[2] + row0[3] * 3 + 2) >> 2;
+    const uint16 bot0 = (row1[0] * 3 + row1[1] + 2) >> 2;
+    const uint16 bot1 = (row1[1] + row1[2] + 1) >> 1;
+    const uint16 bot2 = (row1[2] + row1[3] * 3 + 2) >> 2;
+    d[0] = (top0 + bot0 + 1) >> 1;
+    d[1] = (top1 + bot1 + 1) >> 1;
+    d[2] = (top2 + bot2 + 1) >> 1;
+  }
+}
+
+// Point samples one row of 8 bit pixels. x is the 16.16 fixed point source
+// position of the first output pixel and dx is the step per output pixel.
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                 int dst_width, int x, int dx) {
+  int j = 0;
+  while (j < dst_width - 1) {  // Two output pixels per pass.
+    *dst_ptr++ = src_ptr[x >> 16];
+    x += dx;
+    *dst_ptr++ = src_ptr[x >> 16];
+    x += dx;
+    j += 2;
+  }
+  // Odd trailing pixel.
+  if (dst_width & 1) *dst_ptr = src_ptr[x >> 16];
+}
+
+// Point samples one row of 16 bit pixels; x and dx are 16.16 fixed point.
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                    int dst_width, int x, int dx) {
+  int j = 0;
+  while (j < dst_width - 1) {  // Two output pixels per pass.
+    *dst_ptr++ = src_ptr[x >> 16];
+    x += dx;
+    *dst_ptr++ = src_ptr[x >> 16];
+    x += dx;
+    j += 2;
+  }
+  // Odd trailing pixel.
+  if (dst_width & 1) *dst_ptr = src_ptr[x >> 16];
+}
+
+// Doubles a row horizontally with point sampling: every source pixel is
+// written twice. x and dx are not used.
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+                    int dst_width, int x, int dx) {
+  int j = 0;
+  for (; j < dst_width - 1; j += 2, dst_ptr += 2) {
+    const uint8 pixel = *src_ptr++;
+    dst_ptr[0] = pixel;
+    dst_ptr[1] = pixel;
+  }
+  // An odd width ends with a single copy of the next source pixel.
+  if (dst_width & 1) dst_ptr[0] = src_ptr[0];
+}
+
+// 16 bit 2x point upsample: each source pixel is written twice.
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int x, int dx) {
+  int j = 0;
+  for (; j < dst_width - 1; j += 2, dst_ptr += 2) {
+    const uint16 pixel = *src_ptr++;
+    dst_ptr[0] = pixel;
+    dst_ptr[1] = pixel;
+  }
+  // An odd width ends with a single copy of the next source pixel.
+  if (dst_width & 1) dst_ptr[0] = src_ptr[0];
+}
+
+// Linear blend of a and b by the 16 bit fraction f:
+// (1-f)a + fb can be replaced with a + f(b-a)
+#define BLENDER(a, b, f) (uint8)((int)(a) + \
+    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+
+// Scales one row of 8 bit pixels with linear filtering between horizontal
+// neighbors. x is the 16.16 fixed point source position of the first output
+// pixel and dx is the step per output pixel. Each output blends the source
+// pixel at the integer position with the pixel to its right, weighted by the
+// low 16 bits of the position.
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx) {
+  int j = 0;
+  while (j < dst_width - 1) {  // Two output pixels per pass.
+    const uint8* p = src_ptr + (x >> 16);
+    *dst_ptr++ = BLENDER(p[0], p[1], x & 0xffff);
+    x += dx;
+    p = src_ptr + (x >> 16);
+    *dst_ptr++ = BLENDER(p[0], p[1], x & 0xffff);
+    x += dx;
+    j += 2;
+  }
+  if (dst_width & 1) {  // Odd trailing pixel.
+    const uint8* p = src_ptr + (x >> 16);
+    dst_ptr[0] = BLENDER(p[0], p[1], x & 0xffff);
+  }
+}
+
+// Same filter as ScaleFilterCols_C, but the 16.16 source position is kept in
+// a 64 bit integer. x32 is the starting position and dx the step per output
+// pixel.
+void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
+                         int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);
+  int j = 0;
+  while (j < dst_width - 1) {  // Two output pixels per pass.
+    int64 xi = x >> 16;
+    int frac = (int)(x & 0xffff);
+    *dst_ptr++ = BLENDER(src_ptr[xi], src_ptr[xi + 1], frac);
+    x += dx;
+    xi = x >> 16;
+    frac = (int)(x & 0xffff);
+    *dst_ptr++ = BLENDER(src_ptr[xi], src_ptr[xi + 1], frac);
+    x += dx;
+    j += 2;
+  }
+  if (dst_width & 1) {  // Odd trailing pixel.
+    const int64 xi = x >> 16;
+    const int frac = (int)(x & 0xffff);
+    dst_ptr[0] = BLENDER(src_ptr[xi], src_ptr[xi + 1], frac);
+  }
+}
+#undef BLENDER
+
+// Linear blend of a and b by the 16 bit fraction f, a + f(b-a). The product
+// is formed in 64 bits: with full range 16 bit samples f * (b - a) can reach
+// 65535 * 65535, which does not fit in a 32 bit int.
+#define BLENDER(a, b, f) (uint16)((int)(a) + \
+    (int)(((int64)(f) * ((int)(b) - (int)(a))) >> 16))
+
+// Scales one row of 16 bit pixels with linear filtering between horizontal
+// neighbors. x is the 16.16 fixed point source position of the first output
+// pixel and dx is the step per output pixel.
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int x, int dx) {
+  int j = 0;
+  while (j < dst_width - 1) {  // Two output pixels per pass.
+    const uint16* p = src_ptr + (x >> 16);
+    *dst_ptr++ = BLENDER(p[0], p[1], x & 0xffff);
+    x += dx;
+    p = src_ptr + (x >> 16);
+    *dst_ptr++ = BLENDER(p[0], p[1], x & 0xffff);
+    x += dx;
+    j += 2;
+  }
+  if (dst_width & 1) {  // Odd trailing pixel.
+    const uint16* p = src_ptr + (x >> 16);
+    dst_ptr[0] = BLENDER(p[0], p[1], x & 0xffff);
+  }
+}
+
+// Same filter as ScaleFilterCols_16_C, but the 16.16 source position is kept
+// in a 64 bit integer. x32 is the starting position and dx the step per
+// output pixel.
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                         int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);
+  int j = 0;
+  while (j < dst_width - 1) {  // Two output pixels per pass.
+    int64 xi = x >> 16;
+    int frac = (int)(x & 0xffff);
+    *dst_ptr++ = BLENDER(src_ptr[xi], src_ptr[xi + 1], frac);
+    x += dx;
+    xi = x >> 16;
+    frac = (int)(x & 0xffff);
+    *dst_ptr++ = BLENDER(src_ptr[xi], src_ptr[xi + 1], frac);
+    x += dx;
+    j += 2;
+  }
+  if (dst_width & 1) {  // Odd trailing pixel.
+    const int64 xi = x >> 16;
+    const int frac = (int)(x & 0xffff);
+    dst_ptr[0] = BLENDER(src_ptr[xi], src_ptr[xi + 1], frac);
+  }
+}
+#undef BLENDER
+
+// Point samples a row down to 3/8 width: of every 8 source pixels, pixels
+// 0, 3 and 6 are copied. src_stride is unused.
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width) {
+  int remaining = dst_width;
+  assert(dst_width % 3 == 0);
+  for (; remaining > 0; remaining -= 3, src_ptr += 8) {
+    *dst++ = src_ptr[0];
+    *dst++ = src_ptr[3];
+    *dst++ = src_ptr[6];
+  }
+}
+
+// 16 bit point sampling down to 3/8 width: of every 8 source pixels, pixels
+// 0, 3 and 6 are copied. src_stride is unused.
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width) {
+  int remaining = dst_width;
+  assert(dst_width % 3 == 0);
+  for (; remaining > 0; remaining -= 3, src_ptr += 8) {
+    *dst++ = src_ptr[0];
+    *dst++ = src_ptr[3];
+    *dst++ = src_ptr[6];
+  }
+}
+
+// 8x3 -> 3x1
+// Box filters 3 rows of 8 source pixels into 3 output pixels: two 3x3 boxes
+// and one 2x3 box. Division by the box area is a multiply by 65536 / area
+// followed by a shift right by 16.
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
+  const uint8* r0 = src_ptr;
+  const uint8* r1 = r0 + src_stride;
+  const uint8* r2 = r1 + src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (r0[0] + r0[1] + r0[2] + r1[0] + r1[1] + r1[2] +
+                  r2[0] + r2[1] + r2[2]) * (65536 / 9) >> 16;
+    dst_ptr[1] = (r0[3] + r0[4] + r0[5] + r1[3] + r1[4] + r1[5] +
+                  r2[3] + r2[4] + r2[5]) * (65536 / 9) >> 16;
+    dst_ptr[2] = (r0[6] + r0[7] + r1[6] + r1[7] +
+                  r2[6] + r2[7]) * (65536 / 6) >> 16;
+    // Advance to the next group of 8 source and 3 destination pixels.
+    r0 += 8;
+    r1 += 8;
+    r2 += 8;
+    dst_ptr += 3;
+  }
+}
+
+// 16 bit version of the 8x3 -> 3x1 box filter. Each box sum is scaled in
+// unsigned 32 bit arithmetic: with full range 16 bit samples the product
+// sum * (65536 / area) exceeds INT_MAX, which is undefined for signed int,
+// but it always fits in 32 unsigned bits.
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width) {
+  const uint16* r0 = src_ptr;
+  const uint16* r1 = r0 + src_stride;
+  const uint16* r2 = r1 + src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (uint16)((uint32)(r0[0] + r0[1] + r0[2] + r1[0] + r1[1] +
+        r1[2] + r2[0] + r2[1] + r2[2]) * (65536u / 9u) >> 16);
+    dst_ptr[1] = (uint16)((uint32)(r0[3] + r0[4] + r0[5] + r1[3] + r1[4] +
+        r1[5] + r2[3] + r2[4] + r2[5]) * (65536u / 9u) >> 16);
+    dst_ptr[2] = (uint16)((uint32)(r0[6] + r0[7] + r1[6] + r1[7] +
+        r2[6] + r2[7]) * (65536u / 6u) >> 16);
+    r0 += 8;
+    r1 += 8;
+    r2 += 8;
+    dst_ptr += 3;
+  }
+}
+
+// 8x2 -> 3x1
+// Box filters 2 rows of 8 source pixels into 3 output pixels: two 3x2 boxes
+// and one 2x2 box, each divided by its area via multiply and shift.
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
+  const uint8* r0 = src_ptr;
+  const uint8* r1 = src_ptr + src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (r0[0] + r0[1] + r0[2] + r1[0] + r1[1] + r1[2]) *
+        (65536 / 6) >> 16;
+    dst_ptr[1] = (r0[3] + r0[4] + r0[5] + r1[3] + r1[4] + r1[5]) *
+        (65536 / 6) >> 16;
+    dst_ptr[2] = (r0[6] + r0[7] + r1[6] + r1[7]) * (65536 / 4) >> 16;
+    r0 += 8;
+    r1 += 8;
+    dst_ptr += 3;
+  }
+}
+
+// 16 bit version of the 8x2 -> 3x1 box filter. Each box sum is scaled in
+// unsigned 32 bit arithmetic: with full range 16 bit samples the product
+// sum * (65536 / area) exceeds INT_MAX, which is undefined for signed int,
+// but it always fits in 32 unsigned bits.
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width) {
+  const uint16* r0 = src_ptr;
+  const uint16* r1 = src_ptr + src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3, r0 += 8, r1 += 8, dst_ptr += 3) {
+    dst_ptr[0] = (uint16)((uint32)(r0[0] + r0[1] + r0[2] +
+        r1[0] + r1[1] + r1[2]) * (65536u / 6u) >> 16);
+    dst_ptr[1] = (uint16)((uint32)(r0[3] + r0[4] + r0[5] +
+        r1[3] + r1[4] + r1[5]) * (65536u / 6u) >> 16);
+    dst_ptr[2] = (uint16)((uint32)(r0[6] + r0[7] + r1[6] + r1[7]) *
+        (65536u / 4u) >> 16);
+  }
+}
+
+// Accumulates a row of 8 bit pixels into a row of 16 bit sums.
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  int x = 0;
+  assert(src_width > 0);
+  while (x < src_width - 1) {  // Two pixels per pass.
+    *dst_ptr++ += *src_ptr++;
+    *dst_ptr++ += *src_ptr++;
+    x += 2;
+  }
+  if (src_width & 1) {  // Odd trailing pixel.
+    *dst_ptr += *src_ptr;
+  }
+}
+
+// Accumulates a row of 16 bit pixels into a row of 32 bit sums.
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
+  int x = 0;
+  assert(src_width > 0);
+  while (x < src_width - 1) {  // Two pixels per pass.
+    *dst_ptr++ += *src_ptr++;
+    *dst_ptr++ += *src_ptr++;
+    x += 2;
+  }
+  if (src_width & 1) {  // Odd trailing pixel.
+    *dst_ptr += *src_ptr;
+  }
+}
+
+// Point samples an ARGB row to half width by keeping the second pixel of
+// every source pair. Pixels are moved as whole 32 bit words; src_stride is
+// unused.
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+                         ptrdiff_t src_stride,
+                         uint8* dst_argb, int dst_width) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int x = 0;
+  while (x < dst_width - 1) {  // Two output pixels per pass.
+    *dst++ = src[1];
+    *dst++ = src[3];
+    src += 4;
+    x += 2;
+  }
+  if (dst_width & 1) *dst = src[1];  // Odd trailing pixel.
+}
+
+// Halves an ARGB row by averaging each horizontal pair of pixels, channel by
+// channel, with rounding. src_stride is unused.
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; ++x, src_argb += 8, dst_argb += 4) {
+    int c;
+    for (c = 0; c < 4; ++c) {
+      dst_argb[c] = (src_argb[c] + src_argb[c + 4] + 1) >> 1;
+    }
+  }
+}
+
+// Halves an ARGB row using a 2x2 box: each output channel is the rounded
+// average of the matching channel in two horizontally adjacent pixels on this
+// row and on the row src_stride bytes further on.
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; ++x, src_argb += 8, dst_argb += 4) {
+    const uint8* next_row = src_argb + src_stride;
+    int c;
+    // One pass per channel of the 4 byte pixel.
+    for (c = 0; c < 4; ++c) {
+      dst_argb[c] = (src_argb[c] + src_argb[c + 4] +
+                     next_row[c] + next_row[c + 4] + 2) >> 2;
+    }
+  }
+}
+
+// Point samples an ARGB row by an integer step: output pixel n is source
+// pixel n * src_stepx. Pixels are moved as whole 32 bit words; src_stride is
+// unused.
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            int src_stepx,
+                            uint8* dst_argb, int dst_width) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int x = 0;
+  while (x < dst_width - 1) {  // Two output pixels per pass.
+    *dst++ = src[0];
+    *dst++ = src[src_stepx];
+    src += src_stepx * 2;
+    x += 2;
+  }
+  if (dst_width & 1) *dst = src[0];  // Odd trailing pixel.
+}
+
+// Averages a 2x2 block of ARGB pixels (this row and the row src_stride bytes
+// further on) per output pixel, advancing src_stepx source pixels each time.
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; ++x) {
+    const uint8* next_row = src_argb + src_stride;
+    int c;
+    for (c = 0; c < 4; ++c) {
+      dst_argb[c] = (src_argb[c] + src_argb[c + 4] +
+                     next_row[c] + next_row[c + 4] + 2) >> 2;
+    }
+    src_argb += src_stepx * 4;
+    dst_argb += 4;
+  }
+}
+
+// Scales a single row of pixels using point sampling.
+// Pixels are moved as whole 32 bit words. x is the 16.16 fixed point source
+// position of the first output pixel and dx is the step per output pixel.
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+                     int dst_width, int x, int dx) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j = 0;
+  while (j < dst_width - 1) {  // Two output pixels per pass.
+    *dst++ = src[x >> 16];
+    x += dx;
+    *dst++ = src[x >> 16];
+    x += dx;
+    j += 2;
+  }
+  if (dst_width & 1) *dst = src[x >> 16];  // Odd trailing pixel.
+}
+
+// Point samples a row of 32 bit pixels like ScaleARGBCols_C, but keeps the
+// 16.16 source position in a 64 bit integer. x32 is the starting position.
+void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
+                       int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j = 0;
+  while (j < dst_width - 1) {  // Two output pixels per pass.
+    *dst++ = src[x >> 16];
+    x += dx;
+    *dst++ = src[x >> 16];
+    x += dx;
+    j += 2;
+  }
+  if (dst_width & 1) *dst = src[x >> 16];  // Odd trailing pixel.
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+// Every 32 bit source pixel is written twice; x and dx are not used.
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j = 0;
+  for (; j < dst_width - 1; j += 2, dst += 2) {
+    const uint32 pixel = *src++;
+    dst[0] = pixel;
+    dst[1] = pixel;
+  }
+  // An odd width ends with a single copy of the next source pixel.
+  if (dst_width & 1) dst[0] = src[0];
+}
+
+// Mimics SSSE3 blender
+// Blends each 8 bit channel of the 32 bit pixels a and b with the 7 bit
+// fraction f: ((a * (0x7f ^ f) + b * f) >> 7), reassembled at shift s.
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
+#define BLENDERC(a, b, f, s) (uint32)( \
+    BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) \
+    BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
+    BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+
+// Scales one row of ARGB pixels with linear filtering between horizontal
+// neighbors. x is the 16.16 fixed point source position of the first output
+// pixel and dx is the step per output pixel. The blend fraction is the top 7
+// bits of the fractional part of the position.
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j = 0;
+  while (j < dst_width - 1) {  // Two output pixels per pass.
+    const uint32* p = src + (x >> 16);
+    int xf = (x >> 9) & 0x7f;
+    *dst++ = BLENDER(p[0], p[1], xf);
+    x += dx;
+    p = src + (x >> 16);
+    xf = (x >> 9) & 0x7f;
+    *dst++ = BLENDER(p[0], p[1], xf);
+    x += dx;
+    j += 2;
+  }
+  if (dst_width & 1) {  // Odd trailing pixel.
+    const uint32* p = src + (x >> 16);
+    const int xf = (x >> 9) & 0x7f;
+    dst[0] = BLENDER(p[0], p[1], xf);
+  }
+}
+
+// Same filter as ScaleARGBFilterCols_C, but the 16.16 source position is kept
+// in a 64 bit integer. x32 is the starting position and dx the step per
+// output pixel. The blend fraction is the top 7 bits of the fractional part
+// of the position.
+void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
+                             int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j = 0;
+  // Two output pixels per pass.
+  while (j < dst_width - 1) {
+    const uint32* p = src + (x >> 16);
+    int xf = (int)((x >> 9) & 0x7f);
+    *dst++ = BLENDER(p[0], p[1], xf);
+    x += dx;
+    p = src + (x >> 16);
+    xf = (int)((x >> 9) & 0x7f);
+    *dst++ = BLENDER(p[0], p[1], xf);
+    x += dx;
+    j += 2;
+  }
+  // Odd trailing pixel.
+  if (dst_width & 1) {
+    const uint32* p = src + (x >> 16);
+    const int xf = (int)((x >> 9) & 0x7f);
+    dst[0] = BLENDER(p[0], p[1], xf);
+  }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
+// Scale plane vertically with bilinear interpolation.
+// For each destination row, InterpolateRow is given the source row at y >> 16,
+// the source stride and an 8 bit fraction taken from bits 8..15 of y (0 when
+// filtering is off). y is 16.16 fixed point, advanced by dy per row and
+// clamped to max_y. x only selects the starting column.
+void ScalePlaneVertical(int src_height,
+                        int dst_width, int dst_height,
+                        int src_stride, int dst_stride,
+                        const uint8* src_argb, uint8* dst_argb,
+                        int x, int y, int dy,
+                        int bpp, enum FilterMode filtering) {
+  // TODO(fbarchard): Allow higher bpp.
+  int dst_width_bytes = dst_width * bpp;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(bpp >= 1 && bpp <= 4);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * bpp;
+  // Select a row function for the CPU. Later sections override earlier ones,
+  // and the _Any_ variant is chosen unless the row width in bytes meets the
+  // alignment checked below.
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = IS_ALIGNED(dst_width_bytes, 16) ?
+        InterpolateRow_SSSE3 : InterpolateRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = IS_ALIGNED(dst_width_bytes, 32) ?
+        InterpolateRow_AVX2 : InterpolateRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = IS_ALIGNED(dst_width_bytes, 16) ?
+        InterpolateRow_NEON : InterpolateRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = IS_ALIGNED(dst_width_bytes, 4) ?
+        InterpolateRow_DSPR2 : InterpolateRow_Any_DSPR2;
+  }
+#endif
+  // Produce one destination row per iteration.
+  for (j = 0; j < dst_height; ++j, dst_argb += dst_stride, y += dy) {
+    int yi;
+    int yf;
+    // Never step past the last usable source position.
+    if (y > max_y) {
+      y = max_y;
+    }
+    yi = y >> 16;
+    // The fraction is forced to 0 for unfiltered scaling.
+    yf = filtering ? ((y >> 8) & 255) : 0;
+    InterpolateRow(dst_argb, src_argb + yi * src_stride,
+                   src_stride, dst_width_bytes, yf);
+  }
+}
+// 16 bit version of ScalePlaneVertical. wpp is the number of 16 bit words per
+// pixel. For each destination row, InterpolateRow is given the source row at
+// y >> 16, the source stride and an 8 bit fraction taken from bits 8..15 of y
+// (0 when filtering is off). y is 16.16 fixed point, advanced by dy per row
+// and clamped to max_y.
+void ScalePlaneVertical_16(int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint16* src_argb, uint16* dst_argb,
+                           int x, int y, int dy,
+                           int wpp, enum FilterMode filtering) {
+  // TODO(fbarchard): Allow higher wpp.
+  int dst_width_words = dst_width * wpp;
+  void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(wpp >= 1 && wpp <= 2);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * wpp;
+  // Select a row function for the CPU. Later sections override earlier ones,
+  // and the _Any_ variant is chosen unless the row width meets the alignment
+  // checked below. The checks use dst_width_words, the only width variable
+  // declared here; the byte-width name used by the 8 bit version does not
+  // exist in this function and would not compile once a section is enabled.
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = IS_ALIGNED(dst_width_words, 16) ?
+        InterpolateRow_16_SSE2 : InterpolateRow_Any_16_SSE2;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = IS_ALIGNED(dst_width_words, 16) ?
+        InterpolateRow_16_SSSE3 : InterpolateRow_Any_16_SSSE3;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = IS_ALIGNED(dst_width_words, 32) ?
+        InterpolateRow_16_AVX2 : InterpolateRow_Any_16_AVX2;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = IS_ALIGNED(dst_width_words, 16) ?
+        InterpolateRow_16_NEON : InterpolateRow_Any_16_NEON;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_DSPR2)
+  if (TestCpuFlag(kCpuHasDSPR2) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = IS_ALIGNED(dst_width_words, 4) ?
+        InterpolateRow_16_DSPR2 : InterpolateRow_Any_16_DSPR2;
+  }
+#endif
+  // Produce one destination row per iteration.
+  for (j = 0; j < dst_height; ++j, dst_argb += dst_stride, y += dy) {
+    int yi;
+    int yf;
+    // Never step past the last usable source position.
+    if (y > max_y) {
+      y = max_y;
+    }
+    yi = y >> 16;
+    yf = filtering ? ((y >> 8) & 255) : 0;
+    InterpolateRow(dst_argb, src_argb + yi * src_stride,
+                   src_stride, dst_width_words, yf);
+  }
+}
+
+// Simplify the filtering based on scale factors.
+// Returns the filter mode to use for scaling src_width x src_height to
+// dst_width x dst_height: the requested mode, or a reduced one when a rule
+// below applies. Box can become Bilinear, Bilinear can become Linear or None,
+// and Linear can become None. The rules are applied in that order, so a
+// single call can reduce a mode more than once.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  enum FilterMode filtering) {
+  // Negative source dimensions are compared by magnitude.
+  src_width = (src_width < 0) ? -src_width : src_width;
+  src_height = (src_height < 0) ? -src_height : src_height;
+  // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
+  if (filtering == kFilterBox &&
+      dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+    filtering = kFilterBilinear;
+  }
+  if (filtering == kFilterBilinear) {
+    // A single source row, or a height that is unchanged or exactly one
+    // third of the source, reduces Bilinear to Linear.
+    // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
+    if (src_height == 1 || dst_height == src_height ||
+        dst_height * 3 == src_height) {
+      filtering = kFilterLinear;
+    }
+    // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to
+    // avoid reading 2 pixels horizontally that causes memory exception.
+    if (src_width == 1) {
+      filtering = kFilterNone;
+    }
+  }
+  // A single source column, or a width that is unchanged or exactly one
+  // third of the source, reduces Linear to None.
+  // TODO(fbarchard): Detect any odd scale factor and reduce to None.
+  if (filtering == kFilterLinear &&
+      (src_width == 1 || dst_width == src_width ||
+       dst_width * 3 == src_width)) {
+    filtering = kFilterNone;
+  }
+  return filtering;
+}
+
+// Returns num / div as a 16.16 fixed point value, truncated to int.
+int FixedDiv_C(int num, int div) {
+  return (int)((((int64)num) << 16) / (int64)div);
+}
+
+// Returns ((num << 16) - 0x00010001) / (div - 1), truncated to int.
+// div must not be 1, since the divisor is div - 1.
+int FixedDiv1_C(int num, int div) {
+  return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
+}
+
+// Start offset for a step dx plus a bias s: half the step plus s, negated as
+// a whole when dx is negative.
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
+
+// Compute slope values for stepping.
+// Outputs are 16.16 fixed point: *x and *y receive the starting source
+// position and *dx and *dy the step per destination pixel. A negative
+// src_width mirrors horizontally: *x starts at the far end and *dx is negated.
+void ScaleSlope(int src_width, int src_height,
+                int dst_width, int dst_height,
+                enum FilterMode filtering,
+                int* x, int* y, int* dx, int* dy) {
+  assert(x != NULL);
+  assert(y != NULL);
+  assert(dx != NULL);
+  assert(dy != NULL);
+  assert(src_width != 0);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  // Check for 1 pixel and avoid FixedDiv overflow.
+  if (dst_width == 1 && src_width >= 32768) {
+    dst_width = src_width;
+  }
+  if (dst_height == 1 && src_height >= 32768) {
+    dst_height = src_height;
+  }
+  // Bilinear and Linear share the horizontal setup; they differ vertically.
+  if (filtering == kFilterBilinear || filtering == kFilterLinear) {
+    // Scale step for bilinear sampling renders last pixel once for upsample.
+    if (dst_width <= Abs(src_width)) {
+      *dx = FixedDiv(Abs(src_width), dst_width);
+      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
+    } else if (dst_width > 1) {
+      *dx = FixedDiv1(Abs(src_width), dst_width);
+      *x = 0;
+    }
+    if (filtering == kFilterLinear) {
+      // Linear always uses the plain step and starts half a step in.
+      *dy = FixedDiv(src_height, dst_height);
+      *y = *dy >> 1;
+    } else if (dst_height <= src_height) {
+      *dy = FixedDiv(src_height, dst_height);
+      *y = CENTERSTART(*dy, -32768);  // Subtract 0.5 (32768) to center filter.
+    } else if (dst_height > 1) {
+      *dy = FixedDiv1(src_height, dst_height);
+      *y = 0;
+    }
+  } else {
+    // Scale step for point sampling duplicates all pixels equally.
+    *dx = FixedDiv(Abs(src_width), dst_width);
+    *dy = FixedDiv(src_height, dst_height);
+    if (filtering == kFilterBox) {
+      // Box starts at the origin.
+      *x = 0;
+      *y = 0;
+    } else {
+      // Any other mode starts half a step in.
+      *x = CENTERSTART(*dx, 0);
+      *y = CENTERSTART(*dy, 0);
+    }
+  }
+  // Negative src_width means horizontally mirror.
+  if (src_width < 0) {
+    *x += (dst_width - 1) * *dx;
+    *dx = -*dx;
+    // src_width = -src_width;   // Caller must do this.
+  }
+}
+#undef CENTERSTART
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/scale_gcc.cc b/libs/libyuv/source/scale_gcc.cc
new file mode 100644
index 0000000000..a1ae4e2773
--- /dev/null
+++ b/libs/libyuv/source/scale_gcc.cc
@@ -0,0 +1,1292 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf2 =
+  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
+static uvec8 kShuf11 =
+  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf21 =
+  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Rounding term (2) added to each weighted sum before the >> 2 in the 3/4 box filters.
+static vec16 kRound34 =
+  { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =  // 3/8 point sample: bytes 0,3,6,8,11,14 of the first 16 source bytes -> lanes 0..5.
+  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =  // Same picks from the second 16 source bytes -> lanes 6..11.
+  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,  // 2:1 point sample of one row; src_stride is not referenced.
+                         uint8* dst_ptr, int dst_width) {  // Each loop pass reads 32 source bytes and writes 16.
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // source bytes 0..15
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // source bytes 16..31
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_ptr += 32
+    "psrlw     $0x8,%%xmm0                     \n"  // keep the high (odd-index) byte of each pair
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // narrow 16 words to 16 bytes
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 output bytes
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_ptr += 16
+    "sub       $0x10,%2                        \n"  // dst_width -= 16
+    "jg        1b                              \n"  // repeat while the remaining count is > 0
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1"
+  );
+}
+
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,  // 2:1 horizontal average of one row; src_stride is not referenced.
+                               uint8* dst_ptr, int dst_width) {  // Each loop pass reads 32 source bytes and writes 16.
+  asm volatile (
+    "pcmpeqb    %%xmm4,%%xmm4                  \n"  // xmm4 = all ones
+    "psrlw      $0xf,%%xmm4                    \n"  // each word = 1
+    "packuswb   %%xmm4,%%xmm4                  \n"  // each byte = 1 (pmaddubsw weights)
+    "pxor       %%xmm5,%%xmm5                  \n"  // xmm5 = 0 (second operand of pavgw)
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // source bytes 0..15
+    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"  // source bytes 16..31
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_ptr += 32
+    "pmaddubsw  %%xmm4,%%xmm0                  \n"  // word = sum of each adjacent byte pair
+    "pmaddubsw  %%xmm4,%%xmm1                  \n"
+    "pavgw      %%xmm5,%%xmm0                  \n"  // (sum + 1) >> 1: rounded average
+    "pavgw      %%xmm5,%%xmm1                  \n"
+    "packuswb   %%xmm1,%%xmm0                  \n"  // narrow 16 words to 16 bytes
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 output bytes
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_ptr += 16
+    "sub       $0x10,%2                        \n"  // dst_width -= 16
+    "jg        1b                              \n"  // repeat while the remaining count is > 0
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+  );
+}
+
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,  // 2:1 box filter over two rows (src_ptr and src_ptr + src_stride).
+                            uint8* dst_ptr, int dst_width) {  // Each loop pass reads 32 bytes from each row and writes 16.
+  asm volatile (
+    "pcmpeqb    %%xmm4,%%xmm4                  \n"  // xmm4 = all ones
+    "psrlw      $0xf,%%xmm4                    \n"  // each word = 1
+    "packuswb   %%xmm4,%%xmm4                  \n"  // each byte = 1 (pmaddubsw weights)
+    "pxor       %%xmm5,%%xmm5                  \n"  // xmm5 = 0 (second operand of pavgw)
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // row 0, bytes 0..15
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // row 0, bytes 16..31
+    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_ptr += 32
+    "pmaddubsw  %%xmm4,%%xmm0                  \n"  // horizontal pair sums, row 0
+    "pmaddubsw  %%xmm4,%%xmm1                  \n"
+    "pmaddubsw  %%xmm4,%%xmm2                  \n"  // horizontal pair sums, row 1
+    "pmaddubsw  %%xmm4,%%xmm3                  \n"
+    "paddw      %%xmm2,%%xmm0                  \n"  // 2x2 sums
+    "paddw      %%xmm3,%%xmm1                  \n"
+    "psrlw      $0x1,%%xmm0                    \n"  // sum >> 1
+    "psrlw      $0x1,%%xmm1                    \n"
+    "pavgw      %%xmm5,%%xmm0                  \n"  // ((sum >> 1) + 1) >> 1
+    "pavgw      %%xmm5,%%xmm1                  \n"
+    "packuswb   %%xmm1,%%xmm0                  \n"  // narrow 16 words to 16 bytes
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 output bytes
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_ptr += 16
+    "sub       $0x10,%2                        \n"  // dst_width -= 16
+    "jg        1b                              \n"  // repeat while the remaining count is > 0
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,  // 2:1 point sample of one row; src_stride is not referenced.
+                        uint8* dst_ptr, int dst_width) {  // Each loop pass reads 64 source bytes and writes 32.
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // source bytes 0..31
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"  // source bytes 32..63
+    "lea        " MEMLEA(0x40,0) ",%0          \n"  // src_ptr += 64
+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"  // keep the high (odd-index) byte of each pair
+    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // narrow to bytes (packs within each 128-bit lane)
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // reorder quadwords 0,2,1,3 to restore linear order
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 output bytes
+    "lea        " MEMLEA(0x20,1) ",%1          \n"  // dst_ptr += 32
+    "sub        $0x20,%2                       \n"  // dst_width -= 32
+    "jg         1b                             \n"  // repeat while the remaining count is > 0
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1"
+  );
+}
+
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,  // 2:1 horizontal average of one row; src_stride is not referenced.
+                              uint8* dst_ptr, int dst_width) {  // Each loop pass reads 64 source bytes and writes 32.
+  asm volatile (
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"  // ymm4 = all ones
+    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"  // each word = 1
+    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"  // each byte = 1 (vpmaddubsw weights)
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = 0 (second operand of vpavgw)
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // source bytes 0..31
+    "vmovdqu    " MEMACCESS2(0x20, 0) ",%%ymm1 \n"  // source bytes 32..63
+    "lea        " MEMLEA(0x40,0) ",%0          \n"  // src_ptr += 64
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // word = sum of each adjacent byte pair
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"  // (sum + 1) >> 1: rounded average
+    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // narrow to bytes (packs within each 128-bit lane)
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // reorder quadwords 0,2,1,3 to restore linear order
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 output bytes
+    "lea        " MEMLEA(0x20,1) ",%1          \n"  // dst_ptr += 32
+    "sub        $0x20,%2                       \n"  // dst_width -= 32
+    "jg         1b                             \n"  // repeat while the remaining count is > 0
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+  );
+}
+
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,  // 2:1 box filter over two rows (src_ptr and src_ptr + src_stride).
+                           uint8* dst_ptr, int dst_width) {  // Each loop pass reads 64 bytes from each row and writes 32.
+  asm volatile (
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"  // ymm4 = all ones
+    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"  // each word = 1
+    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"  // each byte = 1 (vpmaddubsw weights)
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = 0 (second operand of vpavgw)
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // row 0, bytes 0..31
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"  // row 0, bytes 32..63
+    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
+    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
+    "lea        " MEMLEA(0x40,0) ",%0          \n"  // src_ptr += 64
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // horizontal pair sums, row 0
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"  // horizontal pair sums, row 1
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"  // 2x2 sums
+    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"  // sum >> 1
+    "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
+    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"  // ((sum >> 1) + 1) >> 1
+    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // narrow to bytes (packs within each 128-bit lane)
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // reorder quadwords 0,2,1,3 to restore linear order
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 output bytes
+    "lea        " MEMLEA(0x20,1) ",%1          \n"  // dst_ptr += 32
+    "sub        $0x20,%2                       \n"  // dst_width -= 32
+    "jg         1b                             \n"  // repeat while the remaining count is > 0
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_SCALEROWDOWN2_AVX2
+
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,  // 4:1 point sample of one row; src_stride is not referenced.
+                        uint8* dst_ptr, int dst_width) {  // Each loop pass reads 32 source bytes and writes 8.
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
+    "psrld     $0x18,%%xmm5                    \n"  // each dword = 0x000000ff
+    "pslld     $0x10,%%xmm5                    \n"  // each dword = 0x00ff0000: selects byte 2 of every 4
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // source bytes 0..15
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // source bytes 16..31
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_ptr += 32
+    "pand      %%xmm5,%%xmm0                   \n"  // keep only byte 2 of each 4-byte group
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // words -> bytes: kept value lands in odd byte lanes
+    "psrlw     $0x8,%%xmm0                     \n"  // move kept value to the low byte of each word
+    "packuswb  %%xmm0,%%xmm0                   \n"  // words -> bytes: 8 results in the low half
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 8 output bytes
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_ptr += 8
+    "sub       $0x8,%2                         \n"  // dst_width -= 8
+    "jg        1b                              \n"  // repeat while the remaining count is > 0
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,  // 4:1 box filter: averages 4x4 source blocks over four rows.
+                           uint8* dst_ptr, int dst_width) {  // Each loop pass reads 32 bytes from each of 4 rows and writes 8.
+  intptr_t stridex3 = 0;  // Scratch register operand; set to 3 * src_stride inside the asm.
+  asm volatile (
+    "pcmpeqb    %%xmm4,%%xmm4                  \n"  // xmm4 = all ones
+    "psrlw      $0xf,%%xmm4                    \n"  // each word = 1
+    "movdqa     %%xmm4,%%xmm5                  \n"
+    "packuswb   %%xmm4,%%xmm4                  \n"  // each byte = 1 (pmaddubsw weights)
+    "psllw      $0x3,%%xmm5                    \n"  // each word = 8 (rounding term for >> 4)
+    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"  // stridex3 = src_stride + src_stride * 2
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // row 0, bytes 0..15
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // row 0, bytes 16..31
+    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
+    "pmaddubsw  %%xmm4,%%xmm0                  \n"  // horizontal pair sums, row 0
+    "pmaddubsw  %%xmm4,%%xmm1                  \n"
+    "pmaddubsw  %%xmm4,%%xmm2                  \n"  // horizontal pair sums, row 1
+    "pmaddubsw  %%xmm4,%%xmm3                  \n"
+    "paddw      %%xmm2,%%xmm0                  \n"  // accumulate rows 0+1
+    "paddw      %%xmm3,%%xmm1                  \n"
+    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
+    "pmaddubsw  %%xmm4,%%xmm2                  \n"  // horizontal pair sums, row 2
+    "pmaddubsw  %%xmm4,%%xmm3                  \n"
+    "paddw      %%xmm2,%%xmm0                  \n"  // accumulate row 2
+    "paddw      %%xmm3,%%xmm1                  \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_ptr += 32
+    "pmaddubsw  %%xmm4,%%xmm2                  \n"  // horizontal pair sums, row 3
+    "pmaddubsw  %%xmm4,%%xmm3                  \n"
+    "paddw      %%xmm2,%%xmm0                  \n"  // accumulate row 3: 2x4 column sums
+    "paddw      %%xmm3,%%xmm1                  \n"
+    "phaddw     %%xmm1,%%xmm0                  \n"  // add adjacent words: 4x4 block sums
+    "paddw      %%xmm5,%%xmm0                  \n"  // + 8 for rounding
+    "psrlw      $0x4,%%xmm0                    \n"  // / 16
+    "packuswb   %%xmm0,%%xmm0                  \n"  // narrow 8 words to 8 bytes
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 8 output bytes
+    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_ptr += 8
+    "sub       $0x8,%2                         \n"  // dst_width -= 8
+    "jg        1b                              \n"  // repeat while the remaining count is > 0
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width),   // %2
+    "+r"(stridex3)     // %3
+  : "r"((intptr_t)(src_stride))    // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,  // 4:1 point sample of one row; src_stride is not referenced.
+                        uint8* dst_ptr, int dst_width) {  // Each loop pass reads 64 source bytes and writes 16.
+  asm volatile (
+    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = all ones
+    "vpsrld     $0x18,%%ymm5,%%ymm5            \n"  // each dword = 0x000000ff
+    "vpslld     $0x10,%%ymm5,%%ymm5            \n"  // each dword = 0x00ff0000: selects byte 2 of every 4
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // source bytes 0..31
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"  // source bytes 32..63
+    "lea        " MEMLEA(0x40,0) ",%0          \n"  // src_ptr += 64
+    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"  // keep only byte 2 of each 4-byte group
+    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // words -> bytes (packs within each 128-bit lane)
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // reorder quadwords 0,2,1,3 to restore linear order
+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"  // move kept value to the low byte of each word
+    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"  // words -> bytes again
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // gather the 16 results into the low 128 bits
+    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"  // store 16 output bytes
+    "lea        " MEMLEA(0x10,1) ",%1          \n"  // dst_ptr += 16
+    "sub        $0x10,%2                       \n"  // dst_width -= 16
+    "jg         1b                             \n"  // repeat while the remaining count is > 0
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+  );
+}
+
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,  // 4:1 box filter: averages 4x4 source blocks over four rows.
+                           uint8* dst_ptr, int dst_width) {  // Each loop pass reads 64 bytes from each of 4 rows and writes 16.
+  asm volatile (
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"  // ymm4 = all ones
+    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"  // each word = 1
+    "vpsllw     $0x3,%%ymm4,%%ymm5             \n"  // ymm5: each word = 8 (rounding term for >> 4)
+    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"  // each byte = 1 (vpmaddubsw weights)
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // row 0, bytes 0..31
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"  // row 0, bytes 32..63
+    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
+    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // horizontal pair sums, row 0
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"  // horizontal pair sums, row 1
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"  // accumulate rows 0+1
+    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)          //  vmovdqu  (%0,%3,2),%%ymm2
+    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)          //  vmovdqu  0x20(%0,%3,2),%%ymm3
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"  // horizontal pair sums, row 2
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"  // accumulate row 2
+    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)          //  vmovdqu  (%0,%4,1),%%ymm2
+    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)          //  vmovdqu  0x20(%0,%4,1),%%ymm3
+    "lea        " MEMLEA(0x40,0) ",%0          \n"  // src_ptr += 64
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"  // horizontal pair sums, row 3
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"  // accumulate row 3: 2x4 column sums
+    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // add adjacent words: 4x4 block sums (per 128-bit lane)
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // reorder quadwords 0,2,1,3 to restore linear order
+    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // + 8 for rounding
+    "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"  // / 16
+    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"  // narrow words to bytes
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // gather the 16 results into the low 128 bits
+    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"  // store 16 output bytes
+    "lea        " MEMLEA(0x10,1) ",%1          \n"  // dst_ptr += 16
+    "sub        $0x10,%2                       \n"  // dst_width -= 16
+    "jg         1b                             \n"  // repeat while the remaining count is > 0
+    "vzeroupper                                \n"  // clear upper ymm halves before returning to non-VEX code
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride)),  // %3
+    "r"((intptr_t)(src_stride * 3))   // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_SCALEROWDOWN4_AVX2
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,  // 4:3 point sample of one row; src_stride is not referenced.
+                          uint8* dst_ptr, int dst_width) {  // Each loop pass reads 32 source bytes and writes 24.
+  asm volatile (  // NOTE(review): no clobbers are declared here, yet xmm3-xmm5 must stay live into the next asm - confirm.
+    "movdqa    %0,%%xmm3                       \n"  // kShuf0
+    "movdqa    %1,%%xmm4                       \n"  // kShuf1
+    "movdqa    %2,%%xmm5                       \n"  // kShuf2
+  :
+  : "m"(kShuf0),  // %0
+    "m"(kShuf1),  // %1
+    "m"(kShuf2)   // %2
+  );
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // source bytes 0..15
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"  // source bytes 16..31
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_ptr += 32
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "palignr   $0x8,%%xmm0,%%xmm1              \n"  // xmm1 = source bytes 8..23
+    "pshufb    %%xmm3,%%xmm0                   \n"  // pick 8 bytes from each 16-byte window
+    "pshufb    %%xmm4,%%xmm1                   \n"
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // output bytes 0..7
+    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"  // output bytes 8..15
+    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"  // output bytes 16..23
+    "lea       " MEMLEA(0x18,1) ",%1           \n"  // dst_ptr += 24
+    "sub       $0x18,%2                        \n"  // dst_width -= 24
+    "jg        1b                              \n"  // repeat while the remaining count is > 0
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,  // 4:3 filtered downscale; the two source rows are blended 1:1 (pavgb).
+                                ptrdiff_t src_stride,  // Byte offset from src_ptr to the second source row.
+                                uint8* dst_ptr, int dst_width) {  // Each loop pass reads 32 bytes per row and writes 24.
+  asm volatile (  // NOTE(review): no clobbers are declared; the registers loaded here must stay live into the loop asm - confirm.
+    "movdqa    %0,%%xmm2                       \n"  // kShuf01
+    "movdqa    %1,%%xmm3                       \n"  // kShuf11
+    "movdqa    %2,%%xmm4                       \n"  // kShuf21
+  :
+  : "m"(kShuf01),  // %0
+    "m"(kShuf11),  // %1
+    "m"(kShuf21)   // %2
+  );
+  asm volatile (  // Same pattern: constants preloaded for the loop below, no clobbers declared.
+    "movdqa    %0,%%xmm5                       \n"  // kMadd01
+    "movdqa    %1,%%xmm0                       \n"  // kMadd11
+    "movdqa    %2,%%xmm1                       \n"  // kRound34
+  :
+  : "m"(kMadd01),  // %0
+    "m"(kMadd11),  // %1
+    "m"(kRound34)  // %2
+  );
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"  // row 0, bytes 0..15
+    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"  // rounded average of the two rows
+    "pshufb    %%xmm2,%%xmm6                   \n"  // pair up neighbouring bytes (kShuf01)
+    "pmaddubsw %%xmm5,%%xmm6                   \n"  // weighted pair sums (kMadd01)
+    "paddsw    %%xmm1,%%xmm6                   \n"  // + 2 for rounding
+    "psrlw     $0x2,%%xmm6                     \n"  // / 4
+    "packuswb  %%xmm6,%%xmm6                   \n"  // narrow 8 words to 8 bytes
+    "movq      %%xmm6," MEMACCESS(1) "         \n"  // output bytes 0..7
+    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"  // row 0, bytes 8..23
+    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"  // rounded average of the two rows
+    "pshufb    %%xmm3,%%xmm6                   \n"  // kShuf11
+    "pmaddubsw %%xmm0,%%xmm6                   \n"  // kMadd11
+    "paddsw    %%xmm1,%%xmm6                   \n"  // + 2 for rounding
+    "psrlw     $0x2,%%xmm6                     \n"  // / 4
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"  // output bytes 8..15
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"  // row 0, bytes 16..31
+    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_ptr += 32
+    "pavgb     %%xmm7,%%xmm6                   \n"  // rounded average of the two rows
+    "pshufb    %%xmm4,%%xmm6                   \n"  // kShuf21
+    "pmaddubsw %4,%%xmm6                       \n"  // kMadd21 read from memory (no xmm register left for it)
+    "paddsw    %%xmm1,%%xmm6                   \n"  // + 2 for rounding
+    "psrlw     $0x2,%%xmm6                     \n"  // / 4
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"  // output bytes 16..23
+    "lea       " MEMLEA(0x18,1) ",%1           \n"  // dst_ptr += 24
+    "sub       $0x18,%2                        \n"  // dst_width -= 24
+    "jg        1b                              \n"  // repeat while the remaining count is > 0
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  : "r"((intptr_t)(src_stride)),  // %3
+    "m"(kMadd21)     // %4
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,  // 4:3 filtered downscale; rows blended about 3:1 in favour of the first row.
+                                ptrdiff_t src_stride,  // Byte offset from src_ptr to the second source row.
+                                uint8* dst_ptr, int dst_width) {  // Each loop pass reads 32 bytes per row and writes 24.
+  asm volatile (  // NOTE(review): no clobbers are declared; the registers loaded here must stay live into the loop asm - confirm.
+    "movdqa    %0,%%xmm2                       \n"  // kShuf01
+    "movdqa    %1,%%xmm3                       \n"  // kShuf11
+    "movdqa    %2,%%xmm4                       \n"  // kShuf21
+  :
+  : "m"(kShuf01),  // %0
+    "m"(kShuf11),  // %1
+    "m"(kShuf21)   // %2
+  );
+  asm volatile (  // Same pattern: constants preloaded for the loop below, no clobbers declared.
+    "movdqa    %0,%%xmm5                       \n"  // kMadd01
+    "movdqa    %1,%%xmm0                       \n"  // kMadd11
+    "movdqa    %2,%%xmm1                       \n"  // kRound34
+  :
+  : "m"(kMadd01),  // %0
+    "m"(kMadd11),  // %1
+    "m"(kRound34)  // %2
+  );
+
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"  // row 0, bytes 0..15
+    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
+    "pavgb     %%xmm6,%%xmm7                   \n"  // xmm7 = avg(row0, row1)
+    "pavgb     %%xmm7,%%xmm6                   \n"  // xmm6 = avg(row0, avg(row0, row1))
+    "pshufb    %%xmm2,%%xmm6                   \n"  // pair up neighbouring bytes (kShuf01)
+    "pmaddubsw %%xmm5,%%xmm6                   \n"  // weighted pair sums (kMadd01)
+    "paddsw    %%xmm1,%%xmm6                   \n"  // + 2 for rounding
+    "psrlw     $0x2,%%xmm6                     \n"  // / 4
+    "packuswb  %%xmm6,%%xmm6                   \n"  // narrow 8 words to 8 bytes
+    "movq      %%xmm6," MEMACCESS(1) "         \n"  // output bytes 0..7
+    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"  // row 0, bytes 8..23
+    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
+    "pavgb     %%xmm6,%%xmm7                   \n"  // xmm7 = avg(row0, row1)
+    "pavgb     %%xmm7,%%xmm6                   \n"  // xmm6 = avg(row0, avg(row0, row1))
+    "pshufb    %%xmm3,%%xmm6                   \n"  // kShuf11
+    "pmaddubsw %%xmm0,%%xmm6                   \n"  // kMadd11
+    "paddsw    %%xmm1,%%xmm6                   \n"  // + 2 for rounding
+    "psrlw     $0x2,%%xmm6                     \n"  // / 4
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"  // output bytes 8..15
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"  // row 0, bytes 16..31
+    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_ptr += 32
+    "pavgb     %%xmm6,%%xmm7                   \n"  // xmm7 = avg(row0, row1)
+    "pavgb     %%xmm7,%%xmm6                   \n"  // xmm6 = avg(row0, avg(row0, row1))
+    "pshufb    %%xmm4,%%xmm6                   \n"  // kShuf21
+    "pmaddubsw %4,%%xmm6                       \n"  // kMadd21 read from memory (no xmm register left for it)
+    "paddsw    %%xmm1,%%xmm6                   \n"  // + 2 for rounding
+    "psrlw     $0x2,%%xmm6                     \n"  // / 4
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"  // output bytes 16..23
+    "lea       " MEMLEA(0x18,1) ",%1           \n"  // dst_ptr += 24
+    "sub       $0x18,%2                        \n"  // dst_width -= 24
+    "jg        1b                              \n"  // repeat while the remaining count is > 0
+    : "+r"(src_ptr),   // %0
+      "+r"(dst_ptr),   // %1
+      "+r"(dst_width)  // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "m"(kMadd21)     // %4
+    : "memory", "cc", NACL_R14
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,  // 8:3 point sample of one row; src_stride is not referenced.
+                          uint8* dst_ptr, int dst_width) {  // Each loop pass reads 32 source bytes and writes 12.
+  asm volatile (
+    "movdqa    %3,%%xmm4                       \n"  // kShuf38a
+    "movdqa    %4,%%xmm5                       \n"  // kShuf38b
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // source bytes 0..15
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // source bytes 16..31
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_ptr += 32
+    "pshufb    %%xmm4,%%xmm0                   \n"  // 6 picks into lanes 0..5, other lanes zeroed
+    "pshufb    %%xmm5,%%xmm1                   \n"  // 6 picks into lanes 6..11, other lanes zeroed
+    "paddusb   %%xmm1,%%xmm0                   \n"  // merge the two disjoint halves
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // output bytes 0..7
+    "movhlps   %%xmm0,%%xmm1                   \n"  // bring lanes 8..15 down to the low half of xmm1
+    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"  // output bytes 8..11
+    "lea       " MEMLEA(0xc,1) ",%1            \n"  // dst_ptr += 12
+    "sub       $0xc,%2                         \n"  // dst_width -= 12
+    "jg        1b                              \n"  // repeat while the remaining count is > 0
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  : "m"(kShuf38a),   // %3
+    "m"(kShuf38b)    // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+  );
+}
+
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,  // 8:3 box filter over two rows (3x2 and 2x2 boxes, per kScaleAb2).
+                                ptrdiff_t src_stride,  // Byte offset from src_ptr to the second source row.
+                                uint8* dst_ptr, int dst_width) {  // Each loop pass reads 16 bytes per row and writes 6.
+  asm volatile (  // NOTE(review): no clobbers are declared; xmm2-xmm5 must stay live into the loop asm - confirm.
+    "movdqa    %0,%%xmm2                       \n"  // kShufAb0
+    "movdqa    %1,%%xmm3                       \n"  // kShufAb1
+    "movdqa    %2,%%xmm4                       \n"  // kShufAb2
+    "movdqa    %3,%%xmm5                       \n"  // kScaleAb2
+  :
+  : "m"(kShufAb0),   // %0
+    "m"(kShufAb1),   // %1
+    "m"(kShufAb2),   // %2
+    "m"(kScaleAb2)   // %3
+  );
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // row 0, 16 bytes
+    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
+    "pavgb     %%xmm1,%%xmm0                   \n"  // rounded average of the two rows
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pshufb    %%xmm2,%%xmm1                   \n"  // first byte of each box, widened to words
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "pshufb    %%xmm3,%%xmm6                   \n"  // second byte of each box
+    "paddusw   %%xmm6,%%xmm1                   \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"  // third byte of each box (zero for the 2-wide boxes)
+    "paddusw   %%xmm0,%%xmm1                   \n"  // per-box horizontal sums
+    "pmulhuw   %%xmm5,%%xmm1                   \n"  // high half of sum * (65536/3 or 65536/2): divide by box width
+    "packuswb  %%xmm1,%%xmm1                   \n"  // narrow words to bytes
+    "movd      %%xmm1," MEMACCESS(1) "         \n"  // output bytes 0..3
+    "psrlq     $0x10,%%xmm1                    \n"  // shift right by 2 bytes
+    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"  // output bytes 2..5 (overlapping store)
+    "lea       " MEMLEA(0x6,1) ",%1            \n"  // dst_ptr += 6
+    "sub       $0x6,%2                         \n"  // dst_width -= 6
+    "jg        1b                              \n"  // repeat while the remaining count is > 0
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width)    // %2
+  : "r"((intptr_t)(src_stride))  // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,  // 8:3 box filter over three rows (3x3 and 2x3 boxes, per kScaleAc33).
+                                ptrdiff_t src_stride,  // Byte offset between consecutive source rows.
+                                uint8* dst_ptr, int dst_width) {  // Each loop pass reads 16 bytes per row and writes 6.
+  asm volatile (  // NOTE(review): no clobbers are declared; xmm2-xmm5 must stay live into the loop asm - confirm.
+    "movdqa    %0,%%xmm2                       \n"  // kShufAc
+    "movdqa    %1,%%xmm3                       \n"  // kShufAc3
+    "movdqa    %2,%%xmm4                       \n"  // kScaleAc33
+    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0 (for byte -> word unpack)
+  :
+  : "m"(kShufAc),    // %0
+    "m"(kShufAc3),   // %1
+    "m"(kScaleAc33)  // %2
+  );
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // row 0, 16 bytes
+    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
+    "movhlps   %%xmm0,%%xmm1                   \n"  // xmm1 low half = row 0 bytes 8..15
+    "movhlps   %%xmm6,%%xmm7                   \n"  // xmm7 low half = row 1 bytes 8..15
+    "punpcklbw %%xmm5,%%xmm0                   \n"  // widen low 8 bytes to words
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm6                   \n"
+    "punpcklbw %%xmm5,%%xmm7                   \n"
+    "paddusw   %%xmm6,%%xmm0                   \n"  // rows 0+1, pixels 0..7
+    "paddusw   %%xmm7,%%xmm1                   \n"  // rows 0+1, pixels 8..15
+    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
+    "movhlps   %%xmm6,%%xmm7                   \n"  // xmm7 low half = row 2 bytes 8..15
+    "punpcklbw %%xmm5,%%xmm6                   \n"
+    "punpcklbw %%xmm5,%%xmm7                   \n"
+    "paddusw   %%xmm6,%%xmm0                   \n"  // column sums over 3 rows, pixels 0..7
+    "paddusw   %%xmm7,%%xmm1                   \n"  // column sums over 3 rows, pixels 8..15
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "psrldq    $0x2,%%xmm0                     \n"  // shift down one word
+    "paddusw   %%xmm0,%%xmm6                   \n"
+    "psrldq    $0x2,%%xmm0                     \n"  // shift down another word
+    "paddusw   %%xmm0,%%xmm6                   \n"  // word i = columns i + (i+1) + (i+2)
+    "pshufb    %%xmm2,%%xmm6                   \n"  // keep words 0,3,6 as words 0,1,2
+    "movdqa    %%xmm1,%%xmm7                   \n"
+    "psrldq    $0x2,%%xmm1                     \n"
+    "paddusw   %%xmm1,%%xmm7                   \n"
+    "psrldq    $0x2,%%xmm1                     \n"
+    "paddusw   %%xmm1,%%xmm7                   \n"  // same 3-column sums for pixels 8..15
+    "pshufb    %%xmm3,%%xmm7                   \n"  // keep words 0,3,6 as words 3,4,5
+    "paddusw   %%xmm7,%%xmm6                   \n"  // merge: six box sums in words 0..5
+    "pmulhuw   %%xmm4,%%xmm6                   \n"  // high half of sum * (65536/9 or 65536/6): divide by box area
+    "packuswb  %%xmm6,%%xmm6                   \n"  // narrow words to bytes
+    "movd      %%xmm6," MEMACCESS(1) "         \n"  // output bytes 0..3
+    "psrlq     $0x10,%%xmm6                    \n"  // shift right by 2 bytes
+    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"  // output bytes 2..5 (overlapping store)
+    "lea       " MEMLEA(0x6,1) ",%1            \n"  // dst_ptr += 6
+    "sub       $0x6,%2                         \n"  // dst_width -= 6
+    "jg        1b                              \n"  // repeat while the remaining count is > 0
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+// Adds 16 source bytes into 16 uint16 sums at dst_ptr per loop iteration.
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  asm volatile (
+    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0, used to widen bytes
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"  // 16 source bytes
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"  // running sums 0..7
+    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm1   \n"  // running sums 8..15
+    "movdqa    %%xmm3,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"  // low 8 bytes -> 8 words
+    "punpckhbw %%xmm5,%%xmm3                   \n"  // high 8 bytes -> 8 words
+    "paddusw   %%xmm2,%%xmm0                   \n"  // unsigned saturating add
+    "paddusw   %%xmm3,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // write the sums back
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_ptr += 16 shorts
+    "sub       $0x10,%2                        \n"  // 16 columns done
+    "jg        1b                              \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(src_width)    // %2
+  :
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Adds 32 source bytes into 32 uint16 sums at dst_ptr per loop iteration.
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  asm volatile (
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = 0, used to widen bytes
+
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm3        \n"  // 32 source bytes
+    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 32
+    "vpermq     $0xd8,%%ymm3,%%ymm3            \n"  // qword order 0,2,1,3 so the in-lane unpacks stay sequential
+    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"  // bytes 0..15 -> words
+    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"  // bytes 16..31 -> words
+    "vpaddusw   " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"  // add sums 0..15 (saturating)
+    "vpaddusw   " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"  // add sums 16..31
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // write the sums back
+    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"  // dst_ptr += 32 shorts
+    "sub       $0x20,%2                        \n"  // 32 columns done
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(src_width)    // %2
+  :
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"  // NOTE(review): xmm names listed, ymm used above - confirm
+  );
+}
+#endif  // HAS_SCALEADDROW_AVX2
+
+// Bilinear column filtering, 2 output bytes per loop pass. SSSE3 version.
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx) {
+  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
+  asm volatile (
+    "movd      %6,%%xmm2                       \n"  // xmm2 = x
+    "movd      %7,%%xmm3                       \n"  // xmm3 = dx
+    "movl      $0x04040000,%k2                 \n"
+    "movd      %k2,%%xmm5                      \n"  // pshufb mask: low bytes 0,0,4,4
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x9,%%xmm6                     \n"  // xmm6 = 0x007f in every word
+    "pextrw    $0x1,%%xmm2,%k3                 \n"  // x0 = bits 16..31 of x
+    "subl      $0x2,%5                         \n"
+    "jl        29f                             \n"  // fewer than 2 outputs requested
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "paddd     %%xmm3,%%xmm0                   \n"  // x + dx
+    "punpckldq %%xmm0,%%xmm2                   \n"  // xmm2 = {x, x + dx}
+    "punpckldq %%xmm3,%%xmm3                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"  // xmm3 = {2 * dx, 2 * dx}
+    "pextrw    $0x3,%%xmm2,%k4                 \n"  // x1 = bits 16..31 of x + dx
+
+    LABELALIGN
+  "2:                                          \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"  // keep current positions for the fractions
+    "paddd     %%xmm3,%%xmm2                   \n"  // advance both positions by 2 * dx
+    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
+    "movd      %k2,%%xmm0                      \n"  // 2 source bytes at x0
+    "psrlw     $0x9,%%xmm1                     \n"  // top 7 bits of each 16-bit word
+    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
+    "movd      %k2,%%xmm4                      \n"  // 2 source bytes at x1
+    "pshufb    %%xmm5,%%xmm1                   \n"  // bytes f0,f0,f1,f1
+    "punpcklwd %%xmm4,%%xmm0                   \n"  // byte pairs for x0 then x1
+    "pxor      %%xmm6,%%xmm1                   \n"  // bytes f0^0x7f,f0,f1^0x7f,f1
+    "pmaddubsw %%xmm1,%%xmm0                   \n"  // weighted sum of each byte pair
+    "pextrw    $0x1,%%xmm2,%k3                 \n"  // next x0
+    "pextrw    $0x3,%%xmm2,%k4                 \n"  // next x1
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0,%k2                      \n"
+    "mov       %w2," MEMACCESS(0) "            \n"  // store 2 output bytes
+    "lea       " MEMLEA(0x2,0) ",%0            \n"
+    "sub       $0x2,%5                         \n"
+    "jge       2b                              \n"
+
+    LABELALIGN
+  "29:                                         \n"
+    "addl      $0x1,%5                         \n"
+    "jl        99f                             \n"  // no single output left
+    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
+    "movd      %k2,%%xmm0                      \n"
+    "psrlw     $0x9,%%xmm2                     \n"
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "pxor      %%xmm6,%%xmm2                   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0,%k2                      \n"
+    "mov       %b2," MEMACCESS(0) "            \n"  // store the last output byte
+  "99:                                         \n"
+  : "+r"(dst_ptr),     // %0
+    "+r"(src_ptr),     // %1
+    "+a"(temp_pixel),  // %2
+    "+r"(x0),          // %3
+    "+r"(x1),          // %4
+    "+rm"(dst_width)   // %5
+  : "rm"(x),           // %6
+    "rm"(dx)           // %7
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+// Reads 16 bytes, duplicates each byte and writes 32 bytes per loop iteration.
+// Loads and stores use movdqu, so no 16 byte alignment is required here.
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx) {  // x and dx are not used by the asm
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"  // 16 source bytes
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // src_ptr += 16
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // bytes 0..7, each doubled
+    "punpckhbw %%xmm1,%%xmm1                   \n"  // bytes 8..15, each doubled
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // dst_ptr += 32
+    "sub       $0x20,%2                         \n"  // 32 output bytes done
+    "jg        1b                              \n"
+
+  : "+r"(dst_ptr),     // %0
+    "+r"(src_ptr),     // %1
+    "+r"(dst_width)    // %2
+  :: "memory", "cc", "xmm0", "xmm1"
+  );
+}
+
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,  // 2:1 point sampling of 32-bit pixels
+                            ptrdiff_t src_stride,  // not used by the asm
+                            uint8* dst_argb, int dst_width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // source pixels 0..3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // source pixels 4..7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_argb += 32
+    "shufps    $0xdd,%%xmm1,%%xmm0             \n"  // keep dwords 1,3 of each: pixels 1,3,5,7
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_argb += 16
+    "sub       $0x4,%2                         \n"  // 4 output pixels done
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(dst_width)  // %2
+  :: "memory", "cc", "xmm0", "xmm1"
+  );
+}
+
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,  // 2:1 downscale averaging horizontal pixel pairs
+                                  ptrdiff_t src_stride,  // not used by the asm
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // source pixels 0..3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // source pixels 4..7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_argb += 32
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // dwords 0,2 of each: pixels 0,2,4,6
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // dwords 1,3 of each: pixels 1,3,5,7
+    "pavgb     %%xmm2,%%xmm0                   \n"  // rounded byte average of each pair
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_argb += 16
+    "sub       $0x4,%2                         \n"  // 4 output pixels done
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(dst_width)  // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm2"  // xmm2 is written above, so it must be listed
+  );
+}
+
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,  // 2:1 downscale averaging 2x2 pixel blocks
+                               ptrdiff_t src_stride,  // byte offset from this row to the second row
+                               uint8* dst_argb, int dst_width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // row 0, pixels 0..3
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // row 0, pixels 4..7
+    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu   (%0,%3,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu   0x10(%0,%3,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_argb += 32
+    "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average, pixels 0..3
+    "pavgb     %%xmm3,%%xmm1                   \n"  // vertical average, pixels 4..7
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // dwords 0,2 of each: pixels 0,2,4,6
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // dwords 1,3 of each: pixels 1,3,5,7
+    "pavgb     %%xmm2,%%xmm0                   \n"  // horizontal average of each pair
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_argb += 16
+    "sub       $0x4,%2                         \n"  // 4 output pixels done
+    "jg        1b                              \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+
+// Reads 4 strided pixels and writes them as 16 contiguous bytes per iteration.
+// The store uses movdqu, so dst_argb needs no 16 byte alignment here.
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width) {
+  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+  intptr_t src_stepx_x12 = 0;
+  asm volatile (
+    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"  // presumably %1 = src_stepx * 4, per its name
+    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"  // presumably %4 = %1 * 3, per its name
+    LABELALIGN
+  "1:                                          \n"
+    "movd      " MEMACCESS(0) ",%%xmm0         \n"  // first pixel
+    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
+    "punpckldq %%xmm1,%%xmm0                   \n"  // xmm0 = first, second pixel
+    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
+    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
+    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"  // presumably src_argb += 4 * %1
+    "punpckldq %%xmm3,%%xmm2                   \n"  // xmm2 = third, fourth pixel
+    "punpcklqdq %%xmm2,%%xmm0                  \n"  // all 4 pixels in xmm0
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,2) ",%2           \n"  // dst_argb += 16
+    "sub       $0x4,%3                         \n"  // 4 output pixels done
+    "jg        1b                              \n"
+  : "+r"(src_argb),      // %0
+    "+r"(src_stepx_x4),  // %1
+    "+r"(dst_argb),      // %2
+    "+r"(dst_width),     // %3
+    "+r"(src_stepx_x12)  // %4
+  :: "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+
+// Blends four 2x2 to 4x1.
+// The store uses movdqu, so dst_argb needs no 16 byte alignment here.
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride, int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+  intptr_t src_stepx_x12 = 0;
+  intptr_t row1 = (intptr_t)(src_stride);
+  asm volatile (
+    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"  // presumably %1 = src_stepx * 4, per its name
+    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"  // presumably %4 = %1 * 3, per its name
+    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"  // presumably row1 = src_argb + src_stride
+
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // row 0: first pixel pair
+    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
+    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
+    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
+    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"  // presumably src_argb += 4 * %1
+    "movq      " MEMACCESS(5) ",%%xmm2         \n"  // row 1: first pixel pair
+    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
+    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
+    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
+    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"  // presumably row1 += 4 * %1
+    "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average, pairs 0 and 1
+    "pavgb     %%xmm3,%%xmm1                   \n"  // vertical average, pairs 2 and 3
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // left pixel of each pair
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // right pixel of each pair
+    "pavgb     %%xmm2,%%xmm0                   \n"  // horizontal average
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,2) ",%2           \n"  // dst_argb += 16
+    "sub       $0x4,%3                         \n"  // 4 output pixels done
+    "jg        1b                              \n"
+  : "+r"(src_argb),       // %0
+    "+r"(src_stepx_x4),   // %1
+    "+r"(dst_argb),       // %2
+    "+rm"(dst_width),     // %3
+    "+r"(src_stepx_x12),  // %4
+    "+r"(row1)            // %5
+  :: "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3"
+  );
+}
+
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,  // point-samples 32-bit pixels at x, x + dx, ...
+                        int dst_width, int x, int dx) {
+  intptr_t x0 = 0, x1 = 0;
+  asm volatile (
+    "movd      %5,%%xmm2                       \n"  // xmm2 = x
+    "movd      %6,%%xmm3                       \n"  // xmm3 = dx
+    "pshufd    $0x0,%%xmm2,%%xmm2              \n"  // {x, x, x, x}
+    "pshufd    $0x11,%%xmm3,%%xmm0             \n"  // {0, dx, 0, dx}
+    "paddd     %%xmm0,%%xmm2                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"  // 2 * dx
+    "pshufd    $0x5,%%xmm3,%%xmm0              \n"  // {0, 0, 2 * dx, 2 * dx}
+    "paddd     %%xmm0,%%xmm2                   \n"  // {x, x + dx, x + 2 * dx, x + 3 * dx}
+    "paddd     %%xmm3,%%xmm3                   \n"  // 4 * dx
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"  // 4 * dx in every lane
+    "pextrw    $0x1,%%xmm2,%k0                 \n"  // x0 = bits 16..31 of lane 0
+    "pextrw    $0x3,%%xmm2,%k1                 \n"  // x1 = bits 16..31 of lane 1
+    "cmp       $0x0,%4                         \n"
+    "jl        99f                             \n"  // negative width: nothing to do
+    "sub       $0x4,%4                         \n"
+    "jl        49f                             \n"  // fewer than 4 pixels: tail only
+
+    LABELALIGN
+  "40:                                         \n"
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
+    "pextrw    $0x5,%%xmm2,%k0                 \n"  // x0 = index from lane 2
+    "pextrw    $0x7,%%xmm2,%k1                 \n"  // x1 = index from lane 3
+    "paddd     %%xmm3,%%xmm2                   \n"  // advance all 4 positions
+    "punpckldq %%xmm1,%%xmm0                   \n"  // pixels 0,1
+    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
+    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
+    "pextrw    $0x1,%%xmm2,%k0                 \n"  // indices for the next pass
+    "pextrw    $0x3,%%xmm2,%k1                 \n"
+    "punpckldq %%xmm4,%%xmm1                   \n"  // pixels 2,3
+    "punpcklqdq %%xmm1,%%xmm0                  \n"  // pixels 0..3
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"  // store 4 pixels
+    "lea       " MEMLEA(0x10,2) ",%2           \n"  // dst_argb += 16
+    "sub       $0x4,%4                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "test      $0x2,%4                         \n"  // low bits of the count still give the remainder
+    "je        29f                             \n"
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
+    "pextrw    $0x5,%%xmm2,%k0                 \n"  // x0 = index from lane 2, for the last pixel
+    "punpckldq %%xmm1,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(2) "         \n"  // store 2 pixels
+    "lea       " MEMLEA(0x8,2) ",%2            \n"  // dst_argb += 8
+  "29:                                         \n"
+    "test      $0x1,%4                         \n"
+    "je        99f                             \n"
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    "movd      %%xmm0," MEMACCESS(2) "         \n"  // store the last pixel
+  "99:                                         \n"
+  : "+a"(x0),          // %0
+    "+d"(x1),          // %1
+    "+r"(dst_argb),    // %2
+    "+r"(src_argb),    // %3
+    "+r"(dst_width)    // %4
+  : "rm"(x),           // %5
+    "rm"(dx)           // %6
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+  );
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Loads and stores use movdqu, so no 16 byte alignment is required here.
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {  // x and dx are not used by the asm
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"  // 4 source pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // src_argb += 16
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpckldq %%xmm0,%%xmm0                   \n"  // pixels 0,0,1,1
+    "punpckhdq %%xmm1,%%xmm1                   \n"  // pixels 2,2,3,3
+    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // dst_argb += 32
+    "sub       $0x8,%2                         \n"  // 8 output pixels done
+    "jg        1b                              \n"
+
+  : "+r"(dst_argb),    // %0
+    "+r"(src_argb),    // %1
+    "+r"(dst_width)    // %2
+  :: "memory", "cc", NACL_R14
+    "xmm0", "xmm1"
+  );
+}
+
+// pshufb table: interleaves the bytes of each adjacent pixel pair for pmaddubsw.
+static uvec8 kShuffleColARGB = {
+  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
+  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+};
+
+// pshufb table: copies source byte 0 into bytes 0..7 and source byte 4 into bytes 8..15.
+static uvec8 kShuffleFractions = {
+  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+// Bilinear column filtering of 32-bit pixels, 2 output pixels per loop pass. SSSE3 version
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx) {
+  intptr_t x0 = 0, x1 = 0;
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kShuffleColARGB
+    "movdqa    %1,%%xmm5                       \n"  // xmm5 = kShuffleFractions
+  :
+  : "m"(kShuffleColARGB),  // %0
+    "m"(kShuffleFractions)  // %1
+  );
+  // NOTE(review): xmm4/xmm5 must survive into the next asm statement, but nothing tells the compiler so.
+  asm volatile (
+    "movd      %5,%%xmm2                       \n"  // xmm2 = x
+    "movd      %6,%%xmm3                       \n"  // xmm3 = dx
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x9,%%xmm6                     \n"  // xmm6 = 0x007f in every word
+    "pextrw    $0x1,%%xmm2,%k3                 \n"  // x0 = bits 16..31 of x
+    "sub       $0x2,%2                         \n"
+    "jl        29f                             \n"  // fewer than 2 pixels requested
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "paddd     %%xmm3,%%xmm0                   \n"  // x + dx
+    "punpckldq %%xmm0,%%xmm2                   \n"  // xmm2 = {x, x + dx}
+    "punpckldq %%xmm3,%%xmm3                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"  // xmm3 = {2 * dx, 2 * dx}
+    "pextrw    $0x3,%%xmm2,%k4                 \n"  // x1 = bits 16..31 of x + dx
+
+    LABELALIGN
+  "2:                                          \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"  // keep current positions for the fractions
+    "paddd     %%xmm3,%%xmm2                   \n"  // advance both positions by 2 * dx
+    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
+    "psrlw     $0x9,%%xmm1                     \n"  // top 7 bits of each 16-bit word
+    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
+    "pshufb    %%xmm5,%%xmm1                   \n"  // spread each fraction over 8 bytes
+    "pshufb    %%xmm4,%%xmm0                   \n"  // interleave the bytes of each pixel pair
+    "pxor      %%xmm6,%%xmm1                   \n"  // even bytes become f ^ 0x7f
+    "pmaddubsw %%xmm1,%%xmm0                   \n"  // weighted sum of each byte pair
+    "psrlw     $0x7,%%xmm0                     \n"
+    "pextrw    $0x1,%%xmm2,%k3                 \n"  // next x0
+    "pextrw    $0x3,%%xmm2,%k4                 \n"  // next x1
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(0) "         \n"  // store 2 pixels
+    "lea       " MEMLEA(0x8,0) ",%0            \n"  // dst_argb += 8
+    "sub       $0x2,%2                         \n"
+    "jge       2b                              \n"
+
+    LABELALIGN
+  "29:                                         \n"
+    "add       $0x1,%2                         \n"
+    "jl        99f                             \n"  // no single pixel left
+    "psrlw     $0x9,%%xmm2                     \n"
+    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "pxor      %%xmm6,%%xmm2                   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0," MEMACCESS(0) "         \n"  // store the last pixel
+
+    LABELALIGN
+  "99:                                         \n"
+  : "+r"(dst_argb),    // %0
+    "+r"(src_argb),    // %1
+    "+rm"(dst_width),  // %2
+    "+r"(x0),          // %3
+    "+r"(x1)           // %4
+  : "rm"(x),           // %5
+    "rm"(dx)           // %6
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+  );
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_X86(int num, int div) {
+  asm volatile (
+    "cdq                                       \n"  // edx:eax = sign-extended num
+    "shld      $0x10,%%eax,%%edx               \n"  // 64-bit shift left by 16: high half
+    "shl       $0x10,%%eax                     \n"  // 64-bit shift left by 16: low half
+    "idiv      %1                              \n"  // eax = edx:eax / div
+    "mov       %0, %%eax                       \n"  // %0 is bound to eax, so this moves eax to itself
+    : "+a"(num)  // %0
+    : "c"(div)   // %1
+    : "memory", "cc", "edx"
+  );
+  return num;
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_X86(int num, int div) {
+  asm volatile (
+    "cdq                                       \n"  // edx:eax = sign-extended num
+    "shld      $0x10,%%eax,%%edx               \n"  // 64-bit shift left by 16: high half
+    "shl       $0x10,%%eax                     \n"  // 64-bit shift left by 16: low half
+    "sub       $0x10001,%%eax                  \n"  // 64-bit subtract of 0x10001: low half
+    "sbb       $0x0,%%edx                      \n"  // 64-bit subtract of 0x10001: borrow into high half
+    "sub       $0x1,%1                         \n"  // div - 1
+    "idiv      %1                              \n"  // eax = edx:eax / (div - 1)
+    "mov       %0, %%eax                       \n"  // %0 is bound to eax, so this moves eax to itself
+    : "+a"(num),  // %0
+      "+c"(div)   // %1 (in/out: the asm decrements it, so it cannot be input-only)
+    :: "memory", "cc", "edx"
+  );
+  return num;
+}
+
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/scale_mips.cc b/libs/libyuv/source/scale_mips.cc
new file mode 100644
index 0000000000..ae953073fa
--- /dev/null
+++ b/libs/libyuv/source/scale_mips.cc
@@ -0,0 +1,644 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC MIPS DSPR2
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+    (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,  // 2:1 point sampling; src_stride is not used
+                         uint8* dst, int dst_width) {
+  __asm__ __volatile__(
+    ".set push                                     \n"
+    ".set noreorder                                \n"  // branch delay slots are filled by hand below
+
+    "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
+    "beqz           $t9, 2f                        \n"  // fewer than 16 outputs: residue only
+    " nop                                          \n"  // delay slot
+
+  "1:                                              \n"
+    "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
+    "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
+    "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
+    "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
+    "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
+    "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
+    "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
+    "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
+    // TODO(fbarchard): Use odd pixels instead of even.
+    "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
+    "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
+    "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
+    "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
+    "addiu          %[src_ptr], %[src_ptr], 32     \n"  // 32 source bytes consumed
+    "addiu          $t9, $t9, -1                   \n"
+    "sw             $t8, 0(%[dst])                 \n"
+    "sw             $t0, 4(%[dst])                 \n"
+    "sw             $t1, 8(%[dst])                 \n"
+    "sw             $t2, 12(%[dst])                \n"
+    "bgtz           $t9, 1b                        \n"
+    " addiu         %[dst], %[dst], 16             \n"  // delay slot: 16 output bytes written
+
+  "2:                                              \n"
+    "andi           $t9, %[dst_width], 0xf         \n"  // residue
+    "beqz           $t9, 3f                        \n"
+    " nop                                          \n"  // delay slot
+
+  "21:                                             \n"
+    "lbu            $t0, 0(%[src_ptr])             \n"  // one source byte
+    "addiu          %[src_ptr], %[src_ptr], 2      \n"  // skip the next source byte
+    "addiu          $t9, $t9, -1                   \n"
+    "sb             $t0, 0(%[dst])                 \n"
+    "bgtz           $t9, 21b                       \n"
+    " addiu         %[dst], %[dst], 1              \n"  // delay slot
+
+  "3:                                              \n"
+    ".set pop                                      \n"
+  : [src_ptr] "+r" (src_ptr),
+    [dst] "+r" (dst)
+  : [dst_width] "r" (dst_width)
+  : "t0", "t1", "t2", "t3", "t4", "t5",
+    "t6", "t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,  // 2:1 downscale averaging 2x2 byte blocks
+                            uint8* dst, int dst_width) {
+  const uint8* t = src_ptr + src_stride;  // second source row
+
+  __asm__ __volatile__ (
+    ".set push                                    \n"
+    ".set noreorder                               \n"  // branch delay slots are filled by hand below
+
+    "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
+    "beqz           $t9, 2f                       \n"  // fewer than 8 outputs: residue only
+    " nop                                         \n"  // delay slot
+
+  "1:                                             \n"
+    "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
+    "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
+    "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
+    "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
+    "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
+    "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
+    "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
+    "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
+    "addiu          $t9, $t9, -1                  \n"
+    "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
+    "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
+    "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
+    "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
+    "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
+    "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
+    "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
+    "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
+    "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
+    "ins            $t5, $t8, 0, 16               \n"  // |23|22|7|6|
+    "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
+    "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
+    "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
+    "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
+    "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
+    "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
+    "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
+    "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
+    "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
+    "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
+    "shra_r.w       $t6, $t6, 2                   \n"  // |t6+2|>>2
+    "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
+    "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
+    "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
+    "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
+    "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
+    "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
+    "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
+    "addiu          %[src_ptr], %[src_ptr], 16    \n"  // 16 bytes consumed from each row
+    "addiu          %[t], %[t], 16                \n"
+    "sb             $t0, 0(%[dst])                \n"
+    "sb             $t4, 1(%[dst])                \n"
+    "sb             $t1, 2(%[dst])                \n"
+    "sb             $t5, 3(%[dst])                \n"
+    "sb             $t2, 4(%[dst])                \n"
+    "sb             $t6, 5(%[dst])                \n"
+    "sb             $t3, 6(%[dst])                \n"
+    "sb             $t7, 7(%[dst])                \n"
+    "bgtz           $t9, 1b                       \n"
+    " addiu         %[dst], %[dst], 8             \n"  // delay slot: 8 output bytes written
+
+  "2:                                             \n"
+    "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
+    "beqz           $t9, 3f                       \n"
+    " nop                                         \n"  // delay slot
+
+    "21:                                          \n"
+    "lwr            $t1, 0(%[src_ptr])            \n"  // unaligned load: 4 bytes of row 0
+    "lwl            $t1, 3(%[src_ptr])            \n"
+    "lwr            $t2, 0(%[t])                  \n"  // unaligned load: 4 bytes of row 1
+    "lwl            $t2, 3(%[t])                  \n"
+    "srl            $t8, $t1, 16                  \n"
+    "ins            $t1, $t2, 16, 16              \n"
+    "ins            $t2, $t8, 0, 16               \n"
+    "raddu.w.qb     $t1, $t1                      \n"
+    "raddu.w.qb     $t2, $t2                      \n"
+    "shra_r.w       $t1, $t1, 2                   \n"
+    "shra_r.w       $t2, $t2, 2                   \n"
+    "sb             $t1, 0(%[dst])                \n"
+    "sb             $t2, 1(%[dst])                \n"
+    "addiu          %[src_ptr], %[src_ptr], 4     \n"
+    "addiu          $t9, $t9, -2                  \n"  // 2 outputs per residue pass
+    "addiu          %[t], %[t], 4                 \n"
+    "bgtz           $t9, 21b                      \n"
+    " addiu         %[dst], %[dst], 2             \n"  // delay slot
+
+  "3:                                             \n"
+    ".set pop                                     \n"
+
+  : [src_ptr] "+r" (src_ptr),
+    [dst] "+r" (dst), [t] "+r" (t)
+  : [dst_width] "r" (dst_width)
+  : "t0", "t1", "t2", "t3", "t4", "t5",
+    "t6", "t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst, int dst_width) {
+  __asm__ __volatile__ (  // keeps 1 source byte of every 4; src_stride is unused
+      ".set push                                    \n"
+      ".set noreorder                               \n"  // delay slots are filled by hand
+
+      "srl            $t9, %[dst_width], 3          \n"  // $t9 = dst_width / 8
+      "beqz           $t9, 2f                       \n"  // no full group of 8 outputs
+      " nop                                         \n"  // branch delay slot
+
+     "1:                                            \n"  // 32 source bytes -> 8 dst bytes
+      "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
+      "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
+      "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
+      "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
+      "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
+      "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
+      "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
+      "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
+      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
+      "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
+      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
+      "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
+      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
+      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
+      "addiu          %[src_ptr], %[src_ptr], 32    \n"
+      "addiu          $t9, $t9, -1                  \n"  // one group of 8 done
+      "sw             $t1, 0(%[dst])                \n"  // NOTE(review): lw/sw presumably need 4-byte-aligned src_ptr/dst - confirm caller
+      "sw             $t5, 4(%[dst])                \n"
+      "bgtz           $t9, 1b                       \n"
+      " addiu         %[dst], %[dst], 8             \n"  // delay slot: dst += 8
+
+    "2:                                             \n"
+      "andi           $t9, %[dst_width], 7          \n"  // residue
+      "beqz           $t9, 3f                       \n"
+      " nop                                         \n"  // branch delay slot
+
+    "21:                                            \n"  // one output byte per pass
+      "lbu            $t1, 0(%[src_ptr])            \n"  // byte 0 of the next 4
+      "addiu          %[src_ptr], %[src_ptr], 4     \n"
+      "addiu          $t9, $t9, -1                  \n"
+      "sb             $t1, 0(%[dst])                \n"
+      "bgtz           $t9, 21b                      \n"
+      " addiu         %[dst], %[dst], 1             \n"  // delay slot: dst += 1
+
+    "3:                                             \n"
+      ".set pop                                     \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst)
+      : [dst_width] "r" (dst_width)
+      : "t1", "t2", "t3", "t4", "t5",
+        "t6", "t7", "t8", "t9", "memory"  // "memory": sw/sb store through dst
+  );
+}
+
+void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  intptr_t stride = src_stride;  // 4x4 box filter: each output averages 16 bytes
+  const uint8* s1 = src_ptr + stride;  // second source row
+  const uint8* s2 = s1 + stride;  // third source row
+  const uint8* s3 = s2 + stride;  // fourth source row
+
+  __asm__ __volatile__ (
+      ".set push                                  \n"
+      ".set noreorder                             \n"  // delay slots are filled by hand
+
+      "srl           $t9, %[dst_width], 1         \n"  // $t9 = number of output pairs
+      "andi          $t8, %[dst_width], 1         \n"  // $t8 = 1 if one output is left over
+
+     "1:                                          \n"  // NOTE(review): entered without testing $t9, so it runs once even if dst_width < 2 - confirm callers
+      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
+      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
+      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
+      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
+      "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
+      "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
+      "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
+      "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
+      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
+      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
+      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
+      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
+      "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
+      "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
+      "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
+      "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
+      "add           $t0, $t0, $t1                \n"
+      "add           $t1, $t2, $t3                \n"
+      "add           $t0, $t0, $t1                \n"  // sum of the first 4x4 block
+      "add           $t4, $t4, $t5                \n"
+      "add           $t6, $t6, $t7                \n"
+      "add           $t4, $t4, $t6                \n"  // sum of the second 4x4 block
+      "shra_r.w      $t0, $t0, 4                  \n"  // rounded sum / 16
+      "shra_r.w      $t4, $t4, 4                  \n"  // rounded sum / 16
+      "sb            $t0, 0(%[dst])               \n"
+      "sb            $t4, 1(%[dst])               \n"
+      "addiu         %[src_ptr], %[src_ptr], 8    \n"  // all four rows advance 8 bytes
+      "addiu         %[s1], %[s1], 8              \n"
+      "addiu         %[s2], %[s2], 8              \n"
+      "addiu         %[s3], %[s3], 8              \n"
+      "addiu         $t9, $t9, -1                 \n"
+      "bgtz          $t9, 1b                      \n"
+      " addiu        %[dst], %[dst], 2            \n"  // delay slot: dst += 2
+      "beqz          $t8, 2f                      \n"  // even dst_width: done
+      " nop                                       \n"  // branch delay slot
+
+      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
+      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
+      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
+      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
+      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
+      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
+      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
+      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
+      "add           $t0, $t0, $t1                \n"
+      "add           $t1, $t2, $t3                \n"
+      "add           $t0, $t0, $t1                \n"  // sum of the last 4x4 block
+      "shra_r.w      $t0, $t0, 4                  \n"  // rounded sum / 16
+      "sb            $t0, 0(%[dst])               \n"  // final odd output byte
+
+      "2:                                         \n"
+      ".set pop                                   \n"
+
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst),
+        [s1] "+r" (s1),
+        [s2] "+r" (s2),
+        [s3] "+r" (s3)
+      : [dst_width] "r" (dst_width)
+      : "t0", "t1", "t2", "t3", "t4", "t5",
+        "t6","t7", "t8", "t9", "memory"  // "memory": sb stores through dst
+  );
+}
+
+void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst, int dst_width) {
+  __asm__ __volatile__ (  // keeps bytes 0, 1, 3 of every 4; src_stride is unused
+      ".set push                                          \n"
+      ".set noreorder                                     \n"  // delay slots are filled by hand
+    "1:                                                   \n"  // 32 source bytes -> 24 dst bytes
+      "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
+      "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
+      "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
+      "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
+      "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
+      "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
+      "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
+      "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
+      "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
+      "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|29|
+      "addiu           %[dst_width], %[dst_width], -24    \n"  // 24 outputs per pass
+      "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
+      "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
+      "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
+      "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
+      "addiu           %[src_ptr], %[src_ptr], 32         \n"
+      "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
+      "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
+      "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
+      "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
+      "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
+      "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
+      "sw              $t1, 0(%[dst])                     \n"
+      "sw              $t0, 4(%[dst])                     \n"
+      "sw              $t3, 8(%[dst])                     \n"
+      "sw              $t5, 12(%[dst])                    \n"
+      "sw              $t9, 16(%[dst])                    \n"
+      "sw              $t7, 20(%[dst])                    \n"
+      "bnez            %[dst_width], 1b                   \n"  // NOTE(review): exits only at exactly 0; presumably dst_width is a positive multiple of 24 - confirm caller
+      " addiu          %[dst], %[dst], 24                 \n"  // delay slot: dst += 24
+      ".set pop                                           \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst),
+        [dst_width] "+r" (dst_width)
+      :
+      : "t0", "t1", "t2", "t3", "t4", "t5",
+        "t6","t7", "t8", "t9", "memory"  // "memory": sw stores through dst
+  );
+}
+
+void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* d, int dst_width) {
+  __asm__ __volatile__ (  // 4 src bytes x 2 rows -> 3 outputs; rows blended 3:1 (S:T)
+      ".set push                                         \n"
+      ".set noreorder                                    \n"  // delay slots are filled by hand
+      "repl.ph           $t3, 3                          \n"  // 0x00030003
+
+    "1:                                                  \n"
+      "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
+      "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
+      "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
+      "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
+      "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
+      "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
+      "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
+      "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
+      "raddu.w.qb        $t0, $t0                        \n"  // S2 + S1
+      "raddu.w.qb        $t1, $t1                        \n"  // T2 + T1
+      "shra_r.w          $t0, $t0, 1                     \n"  // middle S: rounded (S2+S1)/2
+      "shra_r.w          $t1, $t1, 1                     \n"  // middle T: rounded (T2+T1)/2
+      "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
+      "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
+      "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
+      "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
+      "addu.ph           $t2, $t2, $t4                   \n"  // |S1+S0*3|S2+S3*3|
+      "addu.ph           $t6, $t6, $t5                   \n"  // |T1+T0*3|T2+T3*3|
+      "sll               $t5, $t0, 1                     \n"  // middle S * 2
+      "add               $t0, $t5, $t0                   \n"  // middle S * 3
+      "shra_r.ph         $t2, $t2, 2                     \n"  // edge S pair, rounded /4
+      "shra_r.ph         $t6, $t6, 2                     \n"  // edge T pair, rounded /4
+      "shll.ph           $t4, $t2, 1                     \n"  // edge S pair * 2
+      "addq.ph           $t4, $t4, $t2                   \n"  // edge S pair * 3
+      "addu              $t0, $t0, $t1                   \n"  // middle: 3*S + T
+      "addiu             %[src_ptr], %[src_ptr], 4       \n"
+      "shra_r.w          $t0, $t0, 2                     \n"  // middle output, rounded /4
+      "addu.ph           $t6, $t6, $t4                   \n"  // edges: 3*S + T
+      "shra_r.ph         $t6, $t6, 2                     \n"  // edge outputs, rounded /4
+      "srl               $t1, $t6, 16                    \n"  // upper halfword = first output
+      "addiu             %[dst_width], %[dst_width], -3  \n"  // 3 outputs per pass
+      "sb                $t1, 0(%[d])                    \n"
+      "sb                $t0, 1(%[d])                    \n"
+      "sb                $t6, 2(%[d])                    \n"
+      "bgtz              %[dst_width], 1b                \n"  // loop while outputs remain
+      " addiu            %[d], %[d], 3                   \n"  // delay slot: d += 3
+    "3:                                                  \n"  // no branch in this block targets this label
+      ".set pop                                          \n"
+      : [src_ptr] "+r" (src_ptr),
+        [src_stride] "+r" (src_stride),
+        [d] "+r" (d),
+        [dst_width] "+r" (dst_width)
+      :
+      : "t0", "t1", "t2", "t3",
+        "t4", "t5", "t6", "memory"  // "memory": sb stores through d
+  );
+}
+
+void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* d, int dst_width) {
+  __asm__ __volatile__ (  // 4 src bytes x 2 rows -> 3 outputs; rows blended 1:1 (S:T)
+      ".set push                                           \n"
+      ".set noreorder                                      \n"  // delay slots are filled by hand
+      "repl.ph           $t2, 3                            \n"  // 0x00030003
+
+    "1:                                                    \n"
+      "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
+      "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
+      "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
+      "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
+      "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
+      "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
+      "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
+      "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
+      "raddu.w.qb        $t0, $t0                          \n"  // S2 + S1
+      "raddu.w.qb        $t1, $t1                          \n"  // T2 + T1
+      "shra_r.w          $t0, $t0, 1                       \n"  // middle S: rounded (S2+S1)/2
+      "shra_r.w          $t1, $t1, 1                       \n"  // middle T: rounded (T2+T1)/2
+      "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
+      "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
+      "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
+      "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
+      "addu.ph           $t4, $t4, $t3                     \n"  // |S1+S0*3|S2+S3*3|
+      "addu.ph           $t6, $t6, $t5                     \n"  // |T1+T0*3|T2+T3*3|
+      "shra_r.ph         $t6, $t6, 2                       \n"  // edge T pair, rounded /4
+      "shra_r.ph         $t4, $t4, 2                       \n"  // edge S pair, rounded /4
+      "addu.ph           $t6, $t6, $t4                     \n"  // edges: S + T
+      "addiu             %[src_ptr], %[src_ptr], 4         \n"
+      "shra_r.ph         $t6, $t6, 1                       \n"  // edge outputs, rounded /2
+      "addu              $t0, $t0, $t1                     \n"  // middle: S + T
+      "addiu             %[dst_width], %[dst_width], -3    \n"  // 3 outputs per pass
+      "shra_r.w          $t0, $t0, 1                       \n"  // middle output, rounded /2
+      "srl               $t1, $t6, 16                      \n"  // upper halfword = first output
+      "sb                $t1, 0(%[d])                      \n"
+      "sb                $t0, 1(%[d])                      \n"
+      "sb                $t6, 2(%[d])                      \n"
+      "bgtz              %[dst_width], 1b                  \n"  // loop while outputs remain
+      " addiu            %[d], %[d], 3                     \n"  // delay slot: d += 3
+    "3:                                                    \n"  // no branch in this block targets this label
+      ".set pop                                            \n"
+      : [src_ptr] "+r" (src_ptr),
+        [src_stride] "+r" (src_stride),
+        [d] "+r" (d),
+        [dst_width] "+r" (dst_width)
+      :
+      : "t0", "t1", "t2", "t3",
+        "t4", "t5", "t6", "memory"  // "memory": sb stores through d
+  );
+}
+
+void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst, int dst_width) {
+  __asm__ __volatile__ (  // picks 12 of every 32 source bytes; src_stride is unused
+      ".set push                                     \n"
+      ".set noreorder                                \n"  // delay slots are filled by hand
+
+    "1:                                              \n"  // 32 source bytes -> 12 dst bytes
+      "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
+      "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
+      "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
+      "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
+      "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
+      "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
+      "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
+      "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
+      "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
+      "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
+      "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
+      "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
+      "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
+      "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
+      "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
+      "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
+      "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
+      "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
+      "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
+      "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
+      "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
+      "addiu      %[src_ptr], %[src_ptr], 32         \n"
+      "addiu      %[dst_width], %[dst_width], -12    \n"  // 12 outputs per pass
+      "addiu      $t8,%[dst_width], -12              \n"  // $t8 = remaining - 12
+      "sw         $t1, 0(%[dst])                     \n"
+      "sw         $t4, 4(%[dst])                     \n"
+      "sw         $t6, 8(%[dst])                     \n"
+      "bgez       $t8, 1b                            \n"  // NOTE(review): runs once untested and skips a remainder < 12; presumably dst_width is a multiple of 12 - confirm caller
+      " addiu     %[dst], %[dst], 12                 \n"  // delay slot: dst += 12
+      ".set pop                                      \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst),
+        [dst_width] "+r" (dst_width)
+      :
+      : "t0", "t1", "t2", "t3", "t4",
+        "t5", "t6", "t7", "t8", "memory"  // "memory": sw stores through dst
+  );
+}
+
+void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;  // 8 src bytes x 2 rows -> 3 box-averaged outputs
+  const uint8* t = src_ptr + stride;  // second source row
+  const int c = 0x2AAA;  // ~65536/6: (sum * c) >> 16 approximates sum / 6
+
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"  // delay slots are filled by hand
+
+    "1:                                                  \n"
+      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
+      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
+      "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
+      "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
+      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
+      "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
+      "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
+      "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
+      "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
+      "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
+      "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
+      "srl             $t4, $t4, 2                       \n"  // t4 / 4
+      "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
+      "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
+      "addu            $t6, $t5, $t6                     \n"  // 6-byte sum: S3..S5 + T3..T5
+      "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
+      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
+      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
+      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
+      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
+      "addu            $t0, $t0, $t2                     \n"  // 6-byte sum: S0..S2 + T0..T2
+      "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
+      "addiu           %[src_ptr], %[src_ptr], 8         \n"
+      "addiu           %[t], %[t], 8                     \n"
+      "addiu           %[dst_width], %[dst_width], -3    \n"  // 3 outputs per pass
+      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"  // advanced early; stores use negative offsets
+      "srl             $t6, $t6, 16                      \n"  // >> 16 completes the /6
+      "srl             $t0, $t0, 16                      \n"  // >> 16 completes the /6
+      "sb              $t4, -1(%[dst_ptr])               \n"  // third output of the triple
+      "sb              $t6, -2(%[dst_ptr])               \n"  // second output of the triple
+      "bgtz            %[dst_width], 1b                  \n"  // loop while outputs remain
+      " sb             $t0, -3(%[dst_ptr])               \n"  // delay slot: first output of the triple
+      ".set pop                                          \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst_ptr] "+r" (dst_ptr),
+        [t] "+r" (t),
+        [dst_width] "+r" (dst_width)
+      : [c] "r" (c)  // NOTE(review): mul may leave HI/LO undefined on pre-R6 MIPS - confirm whether "hi"/"lo" clobbers are needed
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "memory"  // "memory": sb stores through dst_ptr
+  );
+}
+
+void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;  // 8 src bytes x 3 rows -> 3 box-averaged outputs
+  const uint8* s1 = src_ptr + stride;  // second source row
+  stride += stride;  // 2 * src_stride
+  const uint8* s2 = src_ptr + stride;  // third source row
+  const int c1 = 0x1C71;  // ~65536/9: (sum * c1) >> 16 approximates sum / 9
+  const int c2 = 0x2AAA;  // ~65536/6: (sum * c2) >> 16 approximates sum / 6
+
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"  // delay slots are filled by hand
+
+    "1:                                                  \n"
+      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
+      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
+      "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
+      "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
+      "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
+      "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
+      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
+      "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
+      "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
+      "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
+      "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
+      "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
+      "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
+      "addu            $t7, $t7, $t8                     \n"  // columns 4-5 of all three rows
+      "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
+      "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
+      "addu            $t6, $t6, $t8                     \n"  // 6-byte sum: columns 6-7, three rows
+      "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
+      "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
+      "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
+      "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
+      "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
+      "addu            $t7, $t7, $t8                     \n"  // 9-byte sum: columns 3-5, three rows
+      "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
+      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
+      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
+      "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
+      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0
+      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0
+      "raddu.w.qb      $t4, $t4                          \n"  // R2+R1+R0
+      "addu            $t0, $t0, $t2                     \n"
+      "addu            $t0, $t0, $t4                     \n"  // 9-byte sum: columns 0-2, three rows
+      "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
+      "addiu           %[src_ptr], %[src_ptr], 8         \n"
+      "addiu           %[s1], %[s1], 8                   \n"
+      "addiu           %[s2], %[s2], 8                   \n"
+      "addiu           %[dst_width], %[dst_width], -3    \n"  // 3 outputs per pass
+      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"  // advanced early; stores use negative offsets
+      "srl             $t6, $t6, 16                      \n"  // >> 16 completes the /6
+      "srl             $t7, $t7, 16                      \n"  // >> 16 completes the /9
+      "srl             $t0, $t0, 16                      \n"  // >> 16 completes the /9
+      "sb              $t6, -1(%[dst_ptr])               \n"  // third output of the triple
+      "sb              $t7, -2(%[dst_ptr])               \n"  // second output of the triple
+      "bgtz            %[dst_width], 1b                  \n"  // loop while outputs remain
+      " sb             $t0, -3(%[dst_ptr])               \n"  // delay slot: first output of the triple
+      ".set pop                                          \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst_ptr] "+r" (dst_ptr),
+        [s1] "+r" (s1),
+        [s2] "+r" (s2),
+        [dst_width] "+r" (dst_width)
+      : [c1] "r" (c1), [c2] "r" (c2)  // NOTE(review): mul may leave HI/LO undefined on pre-R6 MIPS - confirm whether "hi"/"lo" clobbers are needed
+      : "t0", "t1", "t2", "t3", "t4",
+        "t5", "t6", "t7", "t8", "memory"  // "memory": sb stores through dst_ptr
+  );
+}
+
+#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/libs/libyuv/source/scale_neon.cc b/libs/libyuv/source/scale_neon.cc
new file mode 100644
index 0000000000..10856cf847
--- /dev/null
+++ b/libs/libyuv/source/scale_neon.cc
@@ -0,0 +1,1017 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
+
+// NEON downscalers with interpolation.
+// Provided by Fritz Koenig
+
+// Point-sample 2:1: read 32 pixels, drop the even ones and write the 16 odd.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  asm volatile (  // src_stride is not referenced: only one row is read.
+  "1:                                          \n"
+    // load even pixels into q0, odd into q1
+    MEMACCESS(0)
+    "vld2.8     {q0, q1}, [%0]!                \n"
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1", "memory", "cc"  // vst1 writes *dst; subs updates the flags
+  );
+}
+
+// Read 32x1, average each horizontal pixel pair with rounding, write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (  // src_stride is not referenced: only one row is read.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // add adjacent
+    "vpaddl.u8  q1, q1                         \n"
+    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #1                     \n"
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1", "memory", "cc"  // vst1 writes *dst; subs updates the flags
+  );
+}
+
+// Read 32x2, average each 2x2 box of pixels with rounding, and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %0                         \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
+    MEMACCESS(1)
+    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
+    "vpaddl.u8  q1, q1                         \n"
+    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
+    "vpadal.u8  q1, q3                         \n"
+    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #2                     \n"
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "q0", "q1", "q2", "q3", "memory", "cc"  // vst1 writes *dst; subs sets flags
+  );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (  // 4:1 point sample of one row; src_stride is not referenced.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0: 32 bytes, de-interleaved by 4
+    "subs       %2, %2, #8                     \n" // 8 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {d2}, [%1]!                    \n" // keep byte 2 of every group of 4
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1", "memory", "cc"
+  );
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride;      // source row 1
+  const uint8* src_ptr2 = src_ptr + src_stride * 2;  // source row 2
+  const uint8* src_ptr3 = src_ptr + src_stride * 3;  // source row 3
+asm volatile (  // Averages each 4x4 box of source pixels into one output pixel.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
+    MEMACCESS(3)
+    "vld1.8     {q1}, [%3]!                    \n"
+    MEMACCESS(4)
+    "vld1.8     {q2}, [%4]!                    \n"
+    MEMACCESS(5)
+    "vld1.8     {q3}, [%5]!                    \n"
+    "subs       %2, %2, #4                     \n"   // 4 output pixels per loop
+    "vpaddl.u8  q0, q0                         \n"   // row 0: add adjacent pairs
+    "vpadal.u8  q0, q1                         \n"   // accumulate pairs of row 1
+    "vpadal.u8  q0, q2                         \n"   // accumulate pairs of row 2
+    "vpadal.u8  q0, q3                         \n"   // accumulate pairs of row 3
+    "vpaddl.u16 q0, q0                         \n"   // add adjacent sums: 4x4 totals
+    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
+    "vmovn.u16  d0, q0                         \n"   // narrow to bytes; low 4 are used
+    MEMACCESS(1)
+    "vst1.32    {d0[0]}, [%1]!                 \n"   // store 4 output pixels
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width), // %2
+    "+r"(src_ptr1),  // %3
+    "+r"(src_ptr2),  // %4
+    "+r"(src_ptr3)   // %5
+  :
+  : "q0", "q1", "q2", "q3", "memory", "cc"
+  );
+}
+
+// Downscale from 4 to 3 pixels. Uses the NEON multi-lane load to split every
+// group of 4 pixels across 4 registers, then stores 3 of them interleaved.
+// Point samples 32 pixels to 24 pixels; src_stride is not referenced.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    "subs       %2, %2, #24                  \n" // 24 output pixels per loop
+    "vmov       d2, d3                       \n" // drop lane 2: keep lanes 0, 1, 3
+    MEMACCESS(1)
+    "vst3.8     {d0, d1, d2}, [%1]!          \n"
+    "bgt        1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "d0", "d1", "d2", "d3", "memory", "cc"
+  );
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (  // 4 -> 3 downscale; rows are blended 3:1 (line 0 : line 1).
+    "vmov.u8    d24, #3                        \n"  // d24 = weight 3 in every lane
+    "add        %3, %0                         \n"  // %3 = pointer to src line 1
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
+    "subs         %2, %2, #24                  \n" // 24 output pixels per loop
+
+    // filter src line 0 with src line 1
+    // expand chars to shorts to allow for room
+    // when adding lines together
+    "vmovl.u8     q8, d4                       \n"
+    "vmovl.u8     q9, d5                       \n"
+    "vmovl.u8     q10, d6                      \n"
+    "vmovl.u8     q11, d7                      \n"
+
+    // 3 * line_0 + line_1
+    "vmlal.u8     q8, d0, d24                  \n"
+    "vmlal.u8     q9, d1, d24                  \n"
+    "vmlal.u8     q10, d2, d24                 \n"
+    "vmlal.u8     q11, d3, d24                 \n"
+
+    // (3 * line_0 + line_1) >> 2
+    "vqrshrn.u16  d0, q8, #2                   \n"
+    "vqrshrn.u16  d1, q9, #2                   \n"
+    "vqrshrn.u16  d2, q10, #2                  \n"
+    "vqrshrn.u16  d3, q11, #2                  \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "vmovl.u8     q8, d1                       \n"
+    "vmlal.u8     q8, d0, d24                  \n"
+    "vqrshrn.u16  d0, q8, #2                   \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "vrhadd.u8    d1, d1, d2                   \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "vmovl.u8     q8, d2                       \n"
+    "vmlal.u8     q8, d3, d24                  \n"
+    "vqrshrn.u16  d2, q8, #2                   \n"
+
+    MEMACCESS(1)
+    "vst3.8       {d0, d1, d2}, [%1]!          \n" // store a0, a1, a2 interleaved
+
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+  );
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (  // 4 -> 3 downscale; rows are blended 1:1 (line 0 : line 1).
+    "vmov.u8    d24, #3                        \n"  // d24 = weight 3 in every lane
+    "add        %3, %0                         \n"  // %3 = pointer to src line 1
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
+    "subs         %2, %2, #24                  \n" // 24 output pixels per loop
+    // average src line 0 with src line 1
+    "vrhadd.u8    q0, q0, q2                   \n"
+    "vrhadd.u8    q1, q1, q3                   \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "vmovl.u8     q3, d1                       \n"
+    "vmlal.u8     q3, d0, d24                  \n"
+    "vqrshrn.u16  d0, q3, #2                   \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "vrhadd.u8    d1, d1, d2                   \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "vmovl.u8     q3, d2                       \n"
+    "vmlal.u8     q3, d3, d24                  \n"
+    "vqrshrn.u16  d2, q3, #2                   \n"
+
+    MEMACCESS(1)
+    "vst3.8       {d0, d1, d2}, [%1]!          \n" // store a0, a1, a2 interleaved
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "q0", "q1", "q2", "q3", "d24", "memory", "cc"  // no core register scratch
+  );
+}
+
+#define HAS_SCALEROWDOWN38_NEON
+static uvec8 kShuf38 =  // vtbl indices: the 12 of 32 bytes ScaleRowDown38 keeps
+  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =  // vtbl indices used by the ScaleRowDown38 box filters
+  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =  // vqrdmulh doubles the product: 65536/12 => sum/6
+  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =  // vqrdmulh doubles the product: 65536/18 => sum/9
+  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12: point-samples one row with a table lookup; src_stride is unused.
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8     {q3}, [%3]                     \n"  // q3 (d6, d7) = kShuf38
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32 source pixels
+    "subs       %2, %2, #12                    \n"  // 12 processed per loop
+    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"  // pick output pixels 0..7
+    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"  // pick output pixels 8..11
+    MEMACCESS(1)
+    "vst1.8     {d4}, [%1]!                    \n"  // store 8 pixels
+    MEMACCESS(1)
+    "vst1.32    {d5[0]}, [%1]!                 \n"  // store 4 more pixels
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  : "r"(&kShuf38)           // %3
+  : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "memory", "cc"
+  );
+}
+
+// 32x3 -> 12x1: box-filters three source rows into one output row.
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride * 2;  // third source row
+
+  asm volatile (
+    MEMACCESS(5)
+    "vld1.16    {q13}, [%5]                    \n"  // q13 = kMult38_Div6
+    MEMACCESS(6)
+    "vld1.8     {q14}, [%6]                    \n"  // q14 (d28, d29) = kShuf38_2
+    MEMACCESS(7)
+    "vld1.8     {q15}, [%7]                    \n"  // q15 = kMult38_Div9
+    "add        %3, %0                         \n"  // %3 = second source row
+  "1:                                          \n"
+
+    // d0 = 00 40 01 41 02 42 03 43
+    // d1 = 10 50 11 51 12 52 13 53
+    // d2 = 20 60 21 61 22 62 23 63
+    // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+    MEMACCESS(4)
+    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
+    "subs         %2, %2, #12                  \n"
+
+    // Shuffle the input data around to align it
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // d0 = 00 10 01 11 02 12 03 13
+    // d1 = 40 50 41 51 42 52 43 53
+    "vtrn.u8      d0, d1                       \n"
+    "vtrn.u8      d4, d5                       \n"
+    "vtrn.u8      d16, d17                     \n"
+
+    // d2 = 20 30 21 31 22 32 23 33
+    // d3 = 60 70 61 71 62 72 63 73
+    "vtrn.u8      d2, d3                       \n"
+    "vtrn.u8      d6, d7                       \n"
+    "vtrn.u8      d18, d19                     \n"
+
+    // d0 = 00+10 01+11 02+12 03+13
+    // d1 = 40+50 41+51 42+52 43+53
+    "vpaddl.u8    q0, q0                       \n"
+    "vpaddl.u8    q2, q2                       \n"
+    "vpaddl.u8    q8, q8                       \n"
+
+    // d3 = 60+70 61+71 62+72 63+73
+    "vpaddl.u8    d3, d3                       \n"
+    "vpaddl.u8    d7, d7                       \n"
+    "vpaddl.u8    d19, d19                     \n"
+
+    // combine source lines
+    "vadd.u16     q0, q2                       \n"
+    "vadd.u16     q0, q8                       \n"
+    "vadd.u16     d4, d3, d7                   \n"
+    "vadd.u16     d4, d19                      \n"
+
+    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+    //             + s[6 + st * 1] + s[7 + st * 1]
+    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+    "vqrdmulh.s16 q2, q2, q13                  \n"
+    "vmovn.u16    d4, q2                       \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "vmovl.u8     q1, d2                       \n"
+    "vmovl.u8     q3, d6                       \n"
+    "vmovl.u8     q9, d18                      \n"
+
+    // combine source lines
+    "vadd.u16     q1, q3                       \n"
+    "vadd.u16     q1, q9                       \n"
+
+    // d2 = xx 20 xx 30 xx 22 xx 32
+    // d3 = xx 21 xx 31 xx 23 xx 33
+    "vtrn.u32     d2, d3                       \n"
+
+    // d2 = xx 20 xx 21 xx 22 xx 23
+    // d3 = xx 30 xx 31 xx 32 xx 33
+    "vtrn.u16     d2, d3                       \n"
+
+    // 0+1+2, 3+4+5
+    "vadd.u16     q0, q1                       \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "vqrdmulh.s16 q0, q0, q15                  \n"
+
+    // Align for table lookup, vtbl requires registers to
+    //  be adjacent
+    "vmov.u8      d2, d4                       \n"
+
+    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
+
+    MEMACCESS(1)
+    "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
+    "vst1.32      {d4[0]}, [%1]!               \n"
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride),       // %3
+    "+r"(src_ptr1)          // %4
+  : "r"(&kMult38_Div6),     // %5
+    "r"(&kShuf38_2),        // %6
+    "r"(&kMult38_Div9)      // %7
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
+  );
+}
+
+// 32x2 -> 12x1: box-filters two source rows into one output row.
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    MEMACCESS(4)
+    "vld1.16    {q13}, [%4]                    \n"  // q13 = kMult38_Div6
+    MEMACCESS(5)
+    "vld1.8     {q14}, [%5]                    \n"  // q14 (d28, d29) = kShuf38_2
+    "add        %3, %0                         \n"  // %3 = second source row
+  "1:                                          \n"
+
+    // d0 = 00 40 01 41 02 42 03 43
+    // d1 = 10 50 11 51 12 52 13 53
+    // d2 = 20 60 21 61 22 62 23 63
+    // d3 = 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    MEMACCESS(3)
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+    "subs         %2, %2, #12                  \n"
+
+    // Shuffle the input data around to align it
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // d0 = 00 10 01 11 02 12 03 13
+    // d1 = 40 50 41 51 42 52 43 53
+    "vtrn.u8      d0, d1                       \n"
+    "vtrn.u8      d4, d5                       \n"
+
+    // d2 = 20 30 21 31 22 32 23 33
+    // d3 = 60 70 61 71 62 72 63 73
+    "vtrn.u8      d2, d3                       \n"
+    "vtrn.u8      d6, d7                       \n"
+
+    // d0 = 00+10 01+11 02+12 03+13
+    // d1 = 40+50 41+51 42+52 43+53
+    "vpaddl.u8    q0, q0                       \n"
+    "vpaddl.u8    q2, q2                       \n"
+
+    // d3 = 60+70 61+71 62+72 63+73
+    "vpaddl.u8    d3, d3                       \n"
+    "vpaddl.u8    d7, d7                       \n"
+
+    // combine source lines
+    "vadd.u16     q0, q2                       \n"
+    "vadd.u16     d4, d3, d7                   \n"
+
+    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+    "vqrshrn.u16  d4, q2, #2                   \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "vmovl.u8     q1, d2                       \n"
+    "vmovl.u8     q3, d6                       \n"
+
+    // combine source lines
+    "vadd.u16     q1, q3                       \n"
+
+    // d2 = xx 20 xx 30 xx 22 xx 32
+    // d3 = xx 21 xx 31 xx 23 xx 33
+    "vtrn.u32     d2, d3                       \n"
+
+    // d2 = xx 20 xx 21 xx 22 xx 23
+    // d3 = xx 30 xx 31 xx 32 xx 33
+    "vtrn.u16     d2, d3                       \n"
+
+    // 0+1+2, 3+4+5
+    "vadd.u16     q0, q1                       \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "vqrdmulh.s16 q0, q0, q13                  \n"
+
+    // Align for table lookup, vtbl requires registers to
+    //  be adjacent
+    "vmov.u8      d2, d4                       \n"
+
+    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
+
+    MEMACCESS(1)
+    "vst1.8       {d3}, [%1]!                  \n"
+    MEMACCESS(1)
+    "vst1.32      {d4[0]}, [%1]!               \n"
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),       // %0
+    "+r"(dst_ptr),       // %1
+    "+r"(dst_width),     // %2
+    "+r"(src_stride)     // %3
+  : "r"(&kMult38_Div6),  // %4
+    "r"(&kShuf38_2)      // %5
+  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+  );
+}
+
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                    uint16* dst_ptr, int src_width, int src_height) {
+  const uint8* src_tmp = NULL;  // walks down one 16-pixel-wide column strip
+  asm volatile (  // Sums src_height rows column-wise into 16-bit totals.
+  "1:                                          \n"
+    "mov       %0, %1                          \n"  // restart at the top row
+    "mov       r12, %5                         \n"  // r12 = rows left to add
+    "veor      q2, q2, q2                      \n"  // clear the 16 accumulators
+    "veor      q3, q3, q3                      \n"
+  "2:                                          \n"
+    // load 16 pixels into q0
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], %3                 \n"  // then step down by src_stride
+    "vaddw.u8   q3, q3, d1                     \n"  // accumulate pixels 8..15
+    "vaddw.u8   q2, q2, d0                     \n"  // accumulate pixels 0..7
+    "subs       r12, r12, #1                   \n"
+    "bgt        2b                             \n"
+    MEMACCESS(2)
+    "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
+    "add        %1, %1, #16                    \n"  // move to the next 16 columns
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(src_tmp),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_ptr),          // %2
+    "+r"(src_stride),       // %3
+    "+r"(src_width),        // %4
+    "+r"(src_height)        // %5
+  :
+  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n)                                    \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5                     \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};  // lane multipliers for dx
+  int* tmp = dx_offset;             // read once, then reused as scratch (%5)
+  const uint8* src_tmp = src_ptr;   // scratch address register (%6)
+  asm volatile (  // x and dx are used as 16.16 fixed point (index = x >> 16).
+    "vdup.32    q0, %3                         \n"  // x
+    "vdup.32    q1, %4                         \n"  // dx
+    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
+    "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
+    "vmul.s32   q1, q1, q2                     \n"
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "vadd.s32   q1, q1, q0                     \n"
+    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+    "vadd.s32   q2, q1, q3                     \n"
+    "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
+  "1:                                          \n"
+    LOAD2_DATA8_LANE(0)
+    LOAD2_DATA8_LANE(1)
+    LOAD2_DATA8_LANE(2)
+    LOAD2_DATA8_LANE(3)
+    LOAD2_DATA8_LANE(4)
+    LOAD2_DATA8_LANE(5)
+    LOAD2_DATA8_LANE(6)
+    LOAD2_DATA8_LANE(7)
+    "vmov       q10, q1                        \n"
+    "vmov       q11, q2                        \n"
+    "vuzp.16    q10, q11                       \n"  // q10 = low 16 bits of each x
+    "vmovl.u8   q8, d6                         \n"  // left pixels, widened
+    "vmovl.u8   q9, d7                         \n"  // right pixels, widened
+    "vsubl.s16  q11, d18, d16                  \n"  // right - left
+    "vsubl.s16  q12, d19, d17                  \n"
+    "vmovl.u16  q13, d20                       \n"  // fractions for lanes 0..3
+    "vmovl.u16  q10, d21                       \n"  // fractions for lanes 4..7
+    "vmul.s32   q11, q11, q13                  \n"  // (right - left) * fraction
+    "vmul.s32   q12, q12, q10                  \n"
+    "vshrn.s32  d18, q11, #16                  \n"
+    "vshrn.s32  d19, q12, #16                  \n"
+    "vadd.s16   q8, q8, q9                     \n"  // left + scaled difference
+    "vmovn.s16  d6, q8                         \n"
+
+    MEMACCESS(0)
+    "vst1.8     {d6}, [%0]!                    \n"  // store pixels
+    "vadd.s32   q1, q1, q0                     \n"  // advance x vectors by 8 * dx
+    "vadd.s32   q2, q2, q0                     \n"
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3",
+    "q8", "q9", "q10", "q11", "q12", "q13"
+  );
+}
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1: blends two rows; source_y_fraction / 256 weights the 2nd row.
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+                          const uint8* src_ptr, ptrdiff_t src_stride,
+                          int dst_width, int source_y_fraction) {
+  asm volatile (
+    "cmp          %4, #0                       \n"
+    "beq          100f                         \n"  // fraction 0: copy first row
+    "add          %2, %1                       \n"  // %2 = pointer to second row
+    "cmp          %4, #64                      \n"
+    "beq          75f                          \n"
+    "cmp          %4, #128                     \n"
+    "beq          50f                          \n"
+    "cmp          %4, #192                     \n"
+    "beq          25f                          \n"
+
+    "vdup.8       d5, %4                       \n"  // d5 = weight of second row
+    "rsb          %4, #256                     \n"
+    "vdup.8       d4, %4                       \n"  // d4 = weight of first row
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vmull.u8     q13, d0, d4                  \n"
+    "vmull.u8     q14, d1, d4                  \n"
+    "vmlal.u8     q13, d2, d5                  \n"
+    "vmlal.u8     q14, d3, d5                  \n"
+    "vrshrn.u16   d0, q13, #8                  \n"
+    "vrshrn.u16   d1, q14, #8                  \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          1b                           \n"
+    "b            99f                          \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          25b                          \n"
+    "b            99f                          \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          50b                          \n"
+    "b            99f                          \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    MEMACCESS(1)
+    "vld1.8       {q1}, [%1]!                  \n"
+    MEMACCESS(2)
+    "vld1.8       {q0}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vrhadd.u8    q0, q1                       \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          75b                          \n"
+    "b            99f                          \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "vld1.8       {q0}, [%1]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    MEMACCESS(0)
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          100b                         \n"
+
+  "99:                                         \n"
+    MEMACCESS(0)
+    "vst1.8       {d1[7]}, [%0]                \n"  // repeat last byte after the row
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(source_y_fraction) // %4
+  :
+  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+  );
+}
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  asm volatile (  // 2:1 point sample of 32-bit pixels; src_stride is unused.
+  "1:                                          \n"
+    // load even pixels into q0, odd into q1
+    MEMACCESS(0)
+    "vld2.32    {q0, q1}, [%0]!                \n"
+    MEMACCESS(0)
+    "vld2.32    {q2, q3}, [%0]!                \n"  // next 8: even q2, odd q3
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    MEMACCESS(1)
+    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    MEMACCESS(1)
+    "vst1.8     {q3}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (  // Averages horizontal pixel pairs per channel; one row only.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #1                     \n"
+    "vrshrn.u16 d2, q2, #1                     \n"
+    "vrshrn.u16 d3, q3, #1                     \n"
+    MEMACCESS(1)
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // re-interleave and store 8
+    "bgt       1b                              \n"
+  : "+r"(src_argb),         // %0
+    "+r"(dst_argb),         // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3"     // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {
+  asm volatile (  // Averages each 2x2 box of pixels, channel by channel.
+    // change the stride to row 2 pointer
+    "add        %1, %1, %0                     \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    MEMACCESS(0)
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    MEMACCESS(1)
+    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
+    MEMACCESS(1)
+    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
+    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
+    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
+    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #2                     \n"
+    "vrshrn.u16 d2, q2, #2                     \n"
+    "vrshrn.u16 d3, q3, #2                     \n"
+    MEMACCESS(2)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // re-interleave and store 8
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+  );
+}
+
+// Point samples one ARGB pixel every src_stepx pixels; writes 4 pixels per loop.
+// Alignment requirement: src_argb 4 byte aligned. src_stride is not used here.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov        r12, %3, lsl #2                \n"  // r12 = src_stepx * 4 (step in bytes).
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.32    {d0[0]}, [%0], r12             \n"  // load 1 pixel, advance by r12.
+    MEMACCESS(0)
+    "vld1.32    {d0[1]}, [%0], r12             \n"
+    MEMACCESS(0)
+    "vld1.32    {d1[0]}, [%0], r12             \n"
+    MEMACCESS(0)
+    "vld1.32    {d1[1]}, [%0], r12             \n"
+    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"  // store the 4 gathered pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(dst_width)    // %2
+  : "r"(src_stepx)     // %3
+  : "memory", "cc", "r12", "q0"
+  );
+}
+
+// Averages a 2x2 ARGB block every src_stepx pixels; writes 4 pixels per loop.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov        r12, %4, lsl #2                \n"  // r12 = src_stepx * 4 (step in bytes).
+    "add        %1, %1, %0                     \n"  // %1 = pointer to row 2.
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
+    MEMACCESS(1)
+    "vld1.8     {d1}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d2}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d3}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d4}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d5}, [%1], r12                \n"
+    MEMACCESS(0)
+    "vld1.8     {d6}, [%0], r12                \n"
+    MEMACCESS(1)
+    "vld1.8     {d7}, [%1], r12                \n"
+    "vaddl.u8   q0, d0, d1                     \n"  // row 1 + row 2, widened to 16 bit.
+    "vaddl.u8   q1, d2, d3                     \n"
+    "vaddl.u8   q2, d4, d5                     \n"
+    "vaddl.u8   q3, d6, d7                     \n"
+    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
+    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
+    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
+    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
+    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
+    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
+    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
+    MEMACCESS(2)
+    "vst1.8     {q0}, [%2]!                    \n"  // store 4 averaged pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(src_stride),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(dst_width)    // %3
+  : "r"(src_stepx)     // %4
+  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+  );
+}
+
+// TODO(Yang Zhang): Investigate using fewer load instructions for
+// the x/dx stepping. The macro loads the pixel at index (x >> 16), then x += dx.
+#define LOAD1_DATA32_LANE(dn, n)                               \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5, lsl #2             \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "vld1.32    {"#dn"["#n"]}, [%6]            \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {  // Point samples ARGB columns; 8 pixels per loop.
+  int tmp = 0;  // scratch register: source pixel index (x >> 16).
+  const uint8* src_tmp = src_argb;  // scratch register: address of the pixel being loaded.
+  asm volatile (
+  "1:                                          \n"
+    LOAD1_DATA32_LANE(d0, 0)
+    LOAD1_DATA32_LANE(d0, 1)
+    LOAD1_DATA32_LANE(d1, 0)
+    LOAD1_DATA32_LANE(d1, 1)
+    LOAD1_DATA32_LANE(d2, 0)
+    LOAD1_DATA32_LANE(d2, 1)
+    LOAD1_DATA32_LANE(d3, 0)
+    LOAD1_DATA32_LANE(d3, 1)
+
+    MEMACCESS(0)
+    "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1"
+  );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate using fewer load instructions for
+// the x/dx stepping. The macro loads pixels (x >> 16) and (x >> 16) + 1, then x += dx.
+#define LOAD2_DATA32_LANE(dn1, dn2, n)                         \
+    "lsr        %5, %3, #16                           \n"      \
+    "add        %6, %1, %5, lsl #2                    \n"      \
+    "add        %3, %3, %4                            \n"      \
+    MEMACCESS(6)                                               \
+    "vld2.32    {"#dn1"["#n"], "#dn2"["#n"]}, [%6]    \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx) {  // Blends adjacent ARGB pixels by a 7 bit fraction of x; 4 pixels per loop.
+  int dx_offset[4] = {0, 1, 2, 3};  // per-lane multipliers for dx.
+  int* tmp = dx_offset;  // %5: table pointer first, then reused as scratch by the macro.
+  const uint8* src_tmp = src_argb;  // %6: scratch address of the pixel pair being loaded.
+  asm volatile (
+    "vdup.32    q0, %3                         \n"  // x
+    "vdup.32    q1, %4                         \n"  // dx
+    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
+    "vshl.i32   q9, q1, #2                     \n"  // 4 * dx
+    "vmul.s32   q1, q1, q2                     \n"  // 0, dx, 2 * dx, 3 * dx
+    "vmov.i8    q3, #0x7f                      \n"  // 0x7f in every byte (for 0x7f ^ f)
+    "vmov.i16   q15, #0x7f                     \n"  // 0x7f in every halfword (fraction mask)
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "vadd.s32   q8, q1, q0                     \n"
+  "1:                                          \n"
+    // d0, d1: a (pixel at the integer position)
+    // d2, d3: b (the pixel that follows a)
+    LOAD2_DATA32_LANE(d0, d2, 0)
+    LOAD2_DATA32_LANE(d0, d2, 1)
+    LOAD2_DATA32_LANE(d1, d3, 0)
+    LOAD2_DATA32_LANE(d1, d3, 1)
+    "vshrn.i32   d22, q8, #9                   \n"  // x >> 9, narrowed to 16 bits
+    "vand.16     d22, d22, d30                 \n"  // keep 7 bits: fraction f per pixel
+    "vdup.8      d24, d22[0]                   \n"  // splat f of pixel 0 across 8 bytes
+    "vdup.8      d25, d22[2]                   \n"  // pixel 1
+    "vdup.8      d26, d22[4]                   \n"  // pixel 2
+    "vdup.8      d27, d22[6]                   \n"  // pixel 3
+    "vext.8      d4, d24, d25, #4              \n"  // f0 x4, f1 x4
+    "vext.8      d5, d26, d27, #4              \n"  // f
+    "veor.8      q10, q2, q3                   \n"  // 0x7f ^ f
+    "vmull.u8    q11, d0, d20                  \n"  // a * (0x7f ^ f)
+    "vmull.u8    q12, d1, d21                  \n"
+    "vmull.u8    q13, d2, d4                   \n"  // b * f
+    "vmull.u8    q14, d3, d5                   \n"
+    "vadd.i16    q11, q11, q13                 \n"
+    "vadd.i16    q12, q12, q14                 \n"
+    "vshrn.i16   d0, q11, #7                   \n"  // >> 7 and narrow back to bytes
+    "vshrn.i16   d1, q12, #7                   \n"
+
+    MEMACCESS(0)
+    "vst1.32     {d0, d1}, [%0]!               \n"  // store pixels
+    "vadd.s32    q8, q8, q9                    \n"  // advance the 4 x positions by 4 * dx
+    "subs        %2, %2, #4                    \n"  // 4 processed per loop
+    "bgt         1b                            \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width),        // %2
+    "+r"(x),                // %3
+    "+r"(dx),               // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/scale_neon64.cc b/libs/libyuv/source/scale_neon64.cc
new file mode 100644
index 0000000000..1d55193579
--- /dev/null
+++ b/libs/libyuv/source/scale_neon64.cc
@@ -0,0 +1,1042 @@
+/*
+ *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 32x1, throw away even pixels, and write 16x1. src_stride is not used.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    // load even pixels into v0, odd into v1
+    MEMACCESS(0)
+    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+    MEMACCESS(1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "v0", "v1"  // st1 writes memory, subs sets the flags
+  );
+}
+
+// Read 32x1, average horizontal pairs, and write 16x1. src_stride is not used.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
+    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+    "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
+    "uaddlp     v1.8h, v1.16b                  \n"
+    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
+    "rshrn2     v0.16b, v1.8h, #1              \n"  // second half goes to the top of v0
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "v0", "v1"  // st1 writes memory, subs sets the flags
+  );
+}
+
+// Read 32x2, average each 2x2 block, and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %1, %0                     \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b,v1.16b}, [%0], #32    \n"  // load row 1 and post inc
+    MEMACCESS(1)
+    "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
+    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+    "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
+    "uaddlp     v1.8h, v1.16b                  \n"
+    "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1
+    "uadalp     v1.8h, v3.16b                  \n"
+    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
+    "rshrn2     v0.16b, v1.8h, #2              \n"
+    MEMACCESS(2)
+    "st1        {v0.16b}, [%2], #16            \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1 (row 2 pointer after the initial add)
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3"  // st1 writes memory, subs sets the flags
+  );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {  // Point samples 32x1 to 8x1; src_stride is not used.
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32          \n"  // src line 0
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    MEMACCESS(1)
+    "st1     {v2.8b}, [%1], #8                 \n"  // keep byte 2 of every group of 4
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {  // 4x4 box filter: 16x4 in, 4x1 out per loop.
+  const uint8* src_ptr1 = src_ptr + src_stride;      // row 1
+  const uint8* src_ptr2 = src_ptr + src_stride * 2;  // row 2
+  const uint8* src_ptr3 = src_ptr + src_stride * 3;  // row 3
+asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
+    MEMACCESS(2)
+    "ld1     {v1.16b}, [%2], #16               \n"
+    MEMACCESS(3)
+    "ld1     {v2.16b}, [%3], #16               \n"
+    MEMACCESS(4)
+    "ld1     {v3.16b}, [%4], #16               \n"
+    "subs    %w5, %w5, #4                      \n"   // 4 processed per loop
+    "uaddlp  v0.8h, v0.16b                     \n"   // row 0 add adjacent
+    "uadalp  v0.8h, v1.16b                     \n"   // accumulate rows 1..3
+    "uadalp  v0.8h, v2.16b                     \n"
+    "uadalp  v0.8h, v3.16b                     \n"
+    "addp    v0.8h, v0.8h, v0.8h               \n"   // add adjacent sums: 16 samples each
+    "rshrn   v0.8b, v0.8h, #4                  \n"   // divide by 16 w/rounding
+    MEMACCESS(1)
+    "st1    {v0.s}[0], [%1], #4                \n"   // store 4 pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(src_ptr1),  // %2
+    "+r"(src_ptr2),  // %3
+    "+r"(src_ptr3),  // %4
+    "+r"(dst_width)  // %5
+  :
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels. src_stride is not used.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    "subs      %w2, %w2, #24                           \n"  // 24 processed per loop
+    "orr       v2.16b, v3.16b, v3.16b                  \n"  // v2 = v3: keep samples 0, 1, 3 of every 4
+    MEMACCESS(1)
+    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {  // 32x2 -> 24x1; rows weighted 3:1 (line 0 : line 1).
+  asm volatile (
+    "movi      v20.8b, #3                              \n"  // constant weight 3
+    "add       %3, %3, %0                              \n"  // %3 = pointer to src line 1
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    MEMACCESS(3)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
+    "subs         %w2, %w2, #24                        \n"  // 24 processed per loop
+
+    // filter src line 0 with src line 1
+    // expand chars to shorts to allow for room
+    // when adding lines together
+    "ushll     v16.8h, v4.8b, #0                       \n"
+    "ushll     v17.8h, v5.8b, #0                       \n"
+    "ushll     v18.8h, v6.8b, #0                       \n"
+    "ushll     v19.8h, v7.8b, #0                       \n"
+
+    // 3 * line_0 + line_1
+    "umlal     v16.8h, v0.8b, v20.8b                   \n"
+    "umlal     v17.8h, v1.8b, v20.8b                   \n"
+    "umlal     v18.8h, v2.8b, v20.8b                   \n"
+    "umlal     v19.8h, v3.8b, v20.8b                   \n"
+
+    // (3 * line_0 + line_1) >> 2
+    "uqrshrn   v0.8b, v16.8h, #2                       \n"
+    "uqrshrn   v1.8b, v17.8h, #2                       \n"
+    "uqrshrn   v2.8b, v18.8h, #2                       \n"
+    "uqrshrn   v3.8b, v19.8h, #2                       \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "ushll     v16.8h, v1.8b, #0                       \n"
+    "umlal     v16.8h, v0.8b, v20.8b                   \n"
+    "uqrshrn   v0.8b, v16.8h, #2                       \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "ushll     v16.8h, v2.8b, #0                       \n"
+    "umlal     v16.8h, v3.8b, v20.8b                   \n"
+    "uqrshrn   v2.8b, v16.8h, #2                       \n"
+
+    MEMACCESS(1)
+    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
+
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
+    "v20", "memory", "cc"
+  );
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {  // 32x2 -> 24x1; the two rows are averaged 1:1.
+  asm volatile (
+    "movi      v20.8b, #3                              \n"  // constant weight 3
+    "add       %3, %3, %0                              \n"  // %3 = pointer to src line 1
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
+    MEMACCESS(3)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
+    "subs         %w2, %w2, #24                        \n"  // 24 processed per loop
+    // average src line 0 with src line 1
+    "urhadd    v0.8b, v0.8b, v4.8b                     \n"
+    "urhadd    v1.8b, v1.8b, v5.8b                     \n"
+    "urhadd    v2.8b, v2.8b, v6.8b                     \n"
+    "urhadd    v3.8b, v3.8b, v7.8b                     \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "ushll     v4.8h, v1.8b, #0                        \n"
+    "umlal     v4.8h, v0.8b, v20.8b                    \n"
+    "uqrshrn   v0.8b, v4.8h, #2                        \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "ushll     v4.8h, v2.8b, #0                        \n"
+    "umlal     v4.8h, v3.8b, v20.8b                    \n"
+    "uqrshrn   v2.8b, v4.8h, #2                        \n"
+
+    MEMACCESS(1)
+    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
+  );
+}
+
+static uvec8 kShuf38 =  // tbl byte indices for ScaleRowDown38_NEON: picks 12 of 32 source bytes.
+  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =  // tbl byte indices for the 38 box filters: gathers 12 bytes from 3 registers.
+  { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =  // sqrdmulh multiplier for 6-sample sums (doubling multiply-high, so 65536 / 12 divides by 6).
+  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =  // sqrdmulh multiplier for 9-sample sums (doubling multiply-high, so 65536 / 18 divides by 9).
+  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12: point samples via a byte table lookup. src_stride is not used.
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    MEMACCESS(3)
+    "ld1       {v3.16b}, [%3]                          \n"  // v3 = kShuf38 indices
+  "1:                                                  \n"
+    MEMACCESS(0)
+    "ld1       {v0.16b,v1.16b}, [%0], #32             \n"  // load 32 source bytes
+    "subs      %w2, %w2, #12                           \n"  // 12 processed per loop
+    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b        \n"  // gather the selected bytes
+    MEMACCESS(1)
+    "st1       {v2.8b}, [%1], #8                       \n"  // store first 8 bytes
+    MEMACCESS(1)
+    "st1       {v2.s}[2], [%1], #4                     \n"  // store remaining 4 bytes
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  : "r"(&kShuf38)           // %3
+  : "v0", "v1", "v2", "v3", "memory", "cc"
+  );
+}
+
+// 32x3 -> 12x1: box filters three source rows; writes 12 pixels per loop.
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  const uint8* src_ptr1 = src_ptr + src_stride * 2;  // third source row
+  ptrdiff_t tmp_src_stride = src_stride;  // turned into the second row pointer by the asm
+
+  asm volatile (
+    MEMACCESS(5)
+    "ld1       {v29.8h}, [%5]                          \n"  // kMult38_Div6
+    MEMACCESS(6)
+    "ld1       {v30.16b}, [%6]                         \n"  // kShuf38_2
+    MEMACCESS(7)
+    "ld1       {v31.8h}, [%7]                          \n"  // kMult38_Div9
+    "add       %2, %2, %0                              \n"  // %2 = pointer to second row
+  "1:                                                  \n"
+
+    // 00 40 01 41 02 42 03 43
+    // 10 50 11 51 12 52 13 53
+    // 20 60 21 61 22 62 23 63
+    // 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
+    MEMACCESS(2)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
+    MEMACCESS(3)
+    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n"
+    "subs      %w4, %w4, #12                           \n"  // 12 processed per loop
+
+    // Shuffle the input data around to align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // 00 10 01 11 02 12 03 13
+    // 40 50 41 51 42 52 43 53
+    "trn1      v20.8b, v0.8b, v1.8b                    \n"
+    "trn2      v21.8b, v0.8b, v1.8b                    \n"
+    "trn1      v22.8b, v4.8b, v5.8b                    \n"
+    "trn2      v23.8b, v4.8b, v5.8b                    \n"
+    "trn1      v24.8b, v16.8b, v17.8b                  \n"
+    "trn2      v25.8b, v16.8b, v17.8b                  \n"
+
+    // 20 30 21 31 22 32 23 33
+    // 60 70 61 71 62 72 63 73
+    "trn1      v0.8b, v2.8b, v3.8b                     \n"
+    "trn2      v1.8b, v2.8b, v3.8b                     \n"
+    "trn1      v4.8b, v6.8b, v7.8b                     \n"
+    "trn2      v5.8b, v6.8b, v7.8b                     \n"
+    "trn1      v16.8b, v18.8b, v19.8b                  \n"
+    "trn2      v17.8b, v18.8b, v19.8b                  \n"
+
+    // 00+10 01+11 02+12 03+13
+    // 40+50 41+51 42+52 43+53
+    "uaddlp    v20.4h, v20.8b                          \n"
+    "uaddlp    v21.4h, v21.8b                          \n"
+    "uaddlp    v22.4h, v22.8b                          \n"
+    "uaddlp    v23.4h, v23.8b                          \n"
+    "uaddlp    v24.4h, v24.8b                          \n"
+    "uaddlp    v25.4h, v25.8b                          \n"
+
+    // 60+70 61+71 62+72 63+73
+    "uaddlp    v1.4h, v1.8b                            \n"
+    "uaddlp    v5.4h, v5.8b                            \n"
+    "uaddlp    v17.4h, v17.8b                          \n"
+
+    // combine source lines
+    "add       v20.4h, v20.4h, v22.4h                  \n"
+    "add       v21.4h, v21.4h, v23.4h                  \n"
+    "add       v20.4h, v20.4h, v24.4h                  \n"
+    "add       v21.4h, v21.4h, v25.4h                  \n"
+    "add       v2.4h, v1.4h, v5.4h                     \n"
+    "add       v2.4h, v2.4h, v17.4h                    \n"
+
+    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+    //             + s[6 + st * 1] + s[7 + st * 1]
+    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+    "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
+    "xtn       v2.8b,  v2.8h                           \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "ushll     v16.8h, v16.8b, #0                      \n"
+    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
+
+    // combine source lines
+    "add       v0.8h, v0.8h, v16.8h                    \n"
+
+    // xx 20 xx 21 xx 22 xx 23
+    // xx 30 xx 31 xx 32 xx 33
+    "trn1      v1.8h, v0.8h, v0.8h                     \n"
+    "trn2      v4.8h, v0.8h, v0.8h                     \n"
+    "xtn       v0.4h, v1.4s                            \n"
+    "xtn       v4.4h, v4.4s                            \n"
+
+    // 0+1+2, 3+4+5
+    "add       v20.8h, v20.8h, v0.8h                   \n"
+    "add       v21.8h, v21.8h, v4.8h                   \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
+    "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"
+
+    // Align for table lookup, tbl requires registers to
+    //  be adjacent
+    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+
+    MEMACCESS(1)
+    "st1       {v3.8b}, [%1], #8                       \n"
+    MEMACCESS(1)
+    "st1       {v3.s}[2], [%1], #4                     \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(tmp_src_stride),   // %2
+    "+r"(src_ptr1),         // %3
+    "+r"(dst_width)         // %4
+  : "r"(&kMult38_Div6),     // %5
+    "r"(&kShuf38_2),        // %6
+    "r"(&kMult38_Div9)      // %7
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+    "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
+    "v30", "v31", "memory", "cc"
+  );
+}
+
+// 32x2 -> 12x1: box filters two source rows; writes 12 pixels per loop.
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  // TODO(fbarchard): use src_stride directly for clang 3.5+.
+  ptrdiff_t tmp_src_stride = src_stride;  // turned into the second row pointer by the asm
+  asm volatile (
+    MEMACCESS(4)
+    "ld1       {v30.8h}, [%4]                          \n"  // kMult38_Div6
+    MEMACCESS(5)
+    "ld1       {v31.16b}, [%5]                         \n"  // kShuf38_2
+    "add       %2, %2, %0                              \n"  // %2 = pointer to second row
+  "1:                                                  \n"
+
+    // 00 40 01 41 02 42 03 43
+    // 10 50 11 51 12 52 13 53
+    // 20 60 21 61 22 62 23 63
+    // 30 70 31 71 32 72 33 73
+    MEMACCESS(0)
+    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
+    MEMACCESS(2)
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
+    "subs      %w3, %w3, #12                           \n"  // 12 processed per loop
+
+    // Shuffle the input data around to align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // 00 10 01 11 02 12 03 13
+    // 40 50 41 51 42 52 43 53
+    "trn1      v16.8b, v0.8b, v1.8b                    \n"
+    "trn2      v17.8b, v0.8b, v1.8b                    \n"
+    "trn1      v18.8b, v4.8b, v5.8b                    \n"
+    "trn2      v19.8b, v4.8b, v5.8b                    \n"
+
+    // 20 30 21 31 22 32 23 33
+    // 60 70 61 71 62 72 63 73
+    "trn1      v0.8b, v2.8b, v3.8b                     \n"
+    "trn2      v1.8b, v2.8b, v3.8b                     \n"
+    "trn1      v4.8b, v6.8b, v7.8b                     \n"
+    "trn2      v5.8b, v6.8b, v7.8b                     \n"
+
+    // 00+10 01+11 02+12 03+13
+    // 40+50 41+51 42+52 43+53
+    "uaddlp    v16.4h, v16.8b                          \n"
+    "uaddlp    v17.4h, v17.8b                          \n"
+    "uaddlp    v18.4h, v18.8b                          \n"
+    "uaddlp    v19.4h, v19.8b                          \n"
+
+    // 60+70 61+71 62+72 63+73
+    "uaddlp    v1.4h, v1.8b                            \n"
+    "uaddlp    v5.4h, v5.8b                            \n"
+
+    // combine source lines
+    "add       v16.4h, v16.4h, v18.4h                  \n"
+    "add       v17.4h, v17.4h, v19.4h                  \n"
+    "add       v2.4h, v1.4h, v5.4h                     \n"
+
+    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+    "uqrshrn   v2.8b, v2.8h, #2                        \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+
+    // combine source lines
+    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
+
+    // xx 20 xx 21 xx 22 xx 23
+    // xx 30 xx 31 xx 32 xx 33
+    "trn1      v1.8h, v0.8h, v0.8h                     \n"
+    "trn2      v4.8h, v0.8h, v0.8h                     \n"
+    "xtn       v0.4h, v1.4s                            \n"
+    "xtn       v4.4h, v4.4s                            \n"
+
+    // 0+1+2, 3+4+5
+    "add       v16.8h, v16.8h, v0.8h                   \n"
+    "add       v17.8h, v17.8h, v4.8h                   \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
+    "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"
+
+    // Align for table lookup, tbl requires registers to
+    //  be adjacent
+
+    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+
+    MEMACCESS(1)
+    "st1       {v3.8b}, [%1], #8                       \n"
+    MEMACCESS(1)
+    "st1       {v3.s}[2], [%1], #4                     \n"
+    "b.gt      1b                                      \n"
+  : "+r"(src_ptr),         // %0
+    "+r"(dst_ptr),         // %1
+    "+r"(tmp_src_stride),  // %2
+    "+r"(dst_width)        // %3
+  : "r"(&kMult38_Div6),    // %4
+    "r"(&kShuf38_2)        // %5
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+    "v18", "v19", "v30", "v31", "memory", "cc"
+  );
+}
+
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                    uint16* dst_ptr, int src_width, int src_height) {  // Sums src_height rows into 16 bit totals, 16 columns per outer loop.
+  const uint8* src_tmp = NULL;  // walks down the rows of the current 16 pixel column group
+  asm volatile (
+  "1:                                          \n"
+    "mov       %0, %1                          \n"  // restart at the top row
+    "mov       w12, %w5                        \n"  // w12 = rows remaining
+    "eor       v2.16b, v2.16b, v2.16b          \n"  // clear the accumulators
+    "eor       v3.16b, v3.16b, v3.16b          \n"
+  "2:                                          \n"
+    // load 16 pixels into v0 and step to the next row
+    MEMACCESS(0)
+    "ld1       {v0.16b}, [%0], %3              \n"
+    "uaddw2    v3.8h, v3.8h, v0.16b            \n"  // accumulate upper 8 pixels
+    "uaddw     v2.8h, v2.8h, v0.8b             \n"  // accumulate lower 8 pixels
+    "subs      w12, w12, #1                    \n"
+    "b.gt      2b                              \n"
+    MEMACCESS(2)
+    "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
+    "add      %1, %1, #16                      \n"  // move to the next 16 columns
+    "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
+    "b.gt     1b                               \n"
+  : "+r"(src_tmp),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_ptr),          // %2
+    "+r"(src_stride),       // %3
+    "+r"(src_width),        // %4
+    "+r"(src_height)        // %5
+  :
+  : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+// Loads src_ptr[x >> 16] and the byte after it into lane n of v4 / v5, then advances x by dx.
+// TODO(Yang Zhang): Investigate fewer load instructions for the x/dx stepping.
+#define LOAD2_DATA8_LANE(n)                                    \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5                    \n"              \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "ld2        {v4.b, v5.b}["#n"], [%6]      \n"
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};  // Per-lane multipliers applied to dx.
+  int* tmp = dx_offset;  // %5: table pointer first, then scratch for x >> 16.
+  const uint8* src_tmp = src_ptr;  // %6: scratch address of the current sample.
+  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
+  int64 x64 = (int64) x;  // x >> 16 indexes the source; the low 16 bits weight the blend.
+  int64 dx64 = (int64) dx;  // Added to x after every sample.
+  asm volatile (
+    "dup        v0.4s, %w3                     \n"  // x
+    "dup        v1.4s, %w4                     \n"  // dx
+    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
+    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
+    "mul        v1.4s, v1.4s, v2.4s            \n"  // 0, dx, 2 * dx, 3 * dx
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "add        v1.4s, v1.4s, v0.4s            \n"
+    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+    "add        v2.4s, v1.4s, v3.4s            \n"
+    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
+  "1:                                          \n"
+    LOAD2_DATA8_LANE(0)
+    LOAD2_DATA8_LANE(1)
+    LOAD2_DATA8_LANE(2)
+    LOAD2_DATA8_LANE(3)
+    LOAD2_DATA8_LANE(4)
+    LOAD2_DATA8_LANE(5)
+    LOAD2_DATA8_LANE(6)
+    LOAD2_DATA8_LANE(7)
+    "mov       v6.16b, v1.16b                  \n"  // x values for pixels 0-3
+    "mov       v7.16b, v2.16b                  \n"  // x values for pixels 4-7
+    "uzp1      v6.8h, v6.8h, v7.8h             \n"  // low 16 bits (fraction) of each x
+    "ushll     v4.8h, v4.8b, #0                \n"  // a = first sample, widened to 16 bit
+    "ushll     v5.8h, v5.8b, #0                \n"  // b = next sample, widened to 16 bit
+    "ssubl     v16.4s, v5.4h, v4.4h            \n"  // b - a, pixels 0-3
+    "ssubl2    v17.4s, v5.8h, v4.8h            \n"  // b - a, pixels 4-7
+    "ushll     v7.4s, v6.4h, #0                \n"  // fractions 0-3 widened to 32 bit
+    "ushll2    v6.4s, v6.8h, #0                \n"  // fractions 4-7 widened to 32 bit
+    "mul       v16.4s, v16.4s, v7.4s           \n"  // (b - a) * fraction
+    "mul       v17.4s, v17.4s, v6.4s           \n"
+    "shrn      v6.4h, v16.4s, #16              \n"  // >> 16
+    "shrn2     v6.8h, v17.4s, #16              \n"
+    "add       v4.8h, v4.8h, v6.8h             \n"  // a + (((b - a) * fraction) >> 16)
+    "xtn       v4.8b, v4.8h                    \n"  // narrow back to bytes
+
+    MEMACCESS(0)
+    "st1       {v4.8b}, [%0], #8               \n"  // store pixels
+    "add       v1.4s, v1.4s, v0.4s             \n"  // step vector x by 8 * dx
+    "add       v2.4s, v2.4s, v0.4s             \n"
+    "subs      %w2, %w2, #8                    \n"  // 8 processed per loop
+    "b.gt      1b                              \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(dst_width64),      // %2
+    "+r"(x64),              // %3
+    "+r"(dx64),             // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3",
+    "v4", "v5", "v6", "v7", "v16", "v17"
+  );
+}
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1 row blend. General path: dst = (row0 * (256 - f) + row1 * f + 128) >> 8, f = source_y_fraction.
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+                          const uint8* src_ptr, ptrdiff_t src_stride,
+                          int dst_width, int source_y_fraction) {
+  int y_fraction = 256 - source_y_fraction;  // Weight of the first row (src_ptr).
+  asm volatile (
+    "cmp          %w4, #0                      \n"
+    "b.eq         100f                         \n"  // fraction 0: copy first row
+    "add          %2, %2, %1                   \n"  // %2 = src_ptr + src_stride (second row)
+    "cmp          %w4, #64                     \n"
+    "b.eq         75f                          \n"
+    "cmp          %w4, #128                    \n"
+    "b.eq         50f                          \n"
+    "cmp          %w4, #192                    \n"
+    "b.eq         25f                          \n"
+
+    "dup          v5.16b, %w4                  \n"  // second-row weight in all 16 lanes (umlal2 reads the upper half)
+    "dup          v4.16b, %w5                  \n"  // first-row weight in all 16 lanes (umull2 reads the upper half)
+    // General purpose row blend.
+  "1:                                          \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v1.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "umull        v6.8h, v0.8b, v4.8b          \n"  // row0 * y_fraction, pixels 0-7
+    "umull2       v7.8h, v0.16b, v4.16b        \n"  // row0 * y_fraction, pixels 8-15
+    "umlal        v6.8h, v1.8b, v5.8b          \n"  // += row1 * source_y_fraction
+    "umlal2       v7.8h, v1.16b, v5.16b        \n"
+    "rshrn        v0.8b, v6.8h, #8             \n"  // rounded >> 8, narrow to bytes
+    "rshrn2       v0.16b, v7.8h, #8            \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         1b                           \n"
+    "b            99f                          \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v1.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"  // avg(row0, row1)
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"  // avg(that, row1): 25% row0, 75% row1
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         25b                          \n"
+    "b            99f                          \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    MEMACCESS(2)
+    "ld1          {v1.16b}, [%2], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"  // rounded average of both rows
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         50b                          \n"
+    "b            99f                          \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    MEMACCESS(1)
+    "ld1          {v1.16b}, [%1], #16          \n"  // note: row0 goes to v1 here
+    MEMACCESS(2)
+    "ld1          {v0.16b}, [%2], #16          \n"  // and row1 to v0
+    "subs         %w3, %w3, #16                \n"
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"  // avg(row1, row0)
+    "urhadd       v0.16b, v0.16b, v1.16b       \n"  // avg(that, row0): 75% row0, 25% row1
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         75b                          \n"
+    "b            99f                          \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    MEMACCESS(1)
+    "ld1          {v0.16b}, [%1], #16          \n"
+    "subs         %w3, %w3, #16                \n"
+    MEMACCESS(0)
+    "st1          {v0.16b}, [%0], #16          \n"
+    "b.gt         100b                         \n"
+
+  "99:                                         \n"
+    MEMACCESS(0)
+    "st1          {v0.b}[15], [%0]             \n"  // repeat the last stored byte once more, just past the row
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(source_y_fraction),// %4
+    "+r"(y_fraction)        // %5
+  :
+  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
+  );
+}
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,  // src_stride is not used
+                            uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    // load even pixels into v0 / v2, odd pixels into v1 / v3
+    MEMACCESS (0)
+    "ld2        {v0.4s, v1.4s}, [%0], #32      \n"  // source pixels 0-7
+    MEMACCESS (0)
+    "ld2        {v2.4s, v3.4s}, [%0], #32      \n"  // source pixels 8-15
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    MEMACCESS (1)
+    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
+    MEMACCESS (1)
+    "st1        {v3.16b}, [%1], #16            \n"
+    "b.gt       1b                             \n"
+  : "+r" (src_ptr),          // %0
+    "+r" (dst),              // %1
+    "+r" (dst_width)         // %2
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,  // src_stride is not used
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS (0)
+    // load 16 ARGB pixels (64 bytes), de-interleaved into one register per channel.
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
+    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
+    "rshrn      v1.8b, v1.8h, #1               \n"
+    "rshrn      v2.8b, v2.8h, #1               \n"
+    "rshrn      v3.8b, v3.8h, #1               \n"
+    MEMACCESS (1)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32     \n"  // store 8 re-interleaved pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),         // %0
+    "+r"(dst_argb),         // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3"    // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %1, %0                     \n"
+  "1:                                          \n"
+    MEMACCESS (0)
+    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"  // load 16 ARGB pixels from row 1.
+    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
+    MEMACCESS (1)
+    "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 16 ARGB pixels from row 2.
+    "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts, accumulated.
+    "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts, accumulated.
+    "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts, accumulated.
+    "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts, accumulated.
+    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
+    "rshrn      v1.8b, v1.8h, #2               \n"
+    "rshrn      v2.8b, v2.8h, #2               \n"
+    "rshrn      v3.8b, v3.8h, #2               \n"
+    MEMACCESS (2)
+    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"  // store 8 re-interleaved pixels
+    "b.gt       1b                             \n"
+  : "+r" (src_ptr),          // %0
+    "+r" (src_stride),       // %1
+    "+r" (dst),              // %2
+    "+r" (dst_width)         // %3
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
+  );
+}
+
+// Reads 4 pixels at a time, one every src_stepx pixels; src_stride is not used.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[0], [%0], %3            \n"  // one 4-byte pixel, then skip ahead %3 bytes
+    MEMACCESS(0)
+    "ld1        {v0.s}[1], [%0], %3            \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[2], [%0], %3            \n"
+    MEMACCESS(0)
+    "ld1        {v0.s}[3], [%0], %3            \n"
+    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"  // store the 4 gathered pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(dst_width)    // %2
+  : "r"((int64)(src_stepx * 4)) // %3: source step in bytes
+  : "memory", "cc", "v0"
+  );
+}
+
+// Writes 4 pixels per loop; each is the rounded average of a 2x2 source block.
+// Alignment requirement: src_argb 4 byte aligned.
+// TODO(Yang Zhang): Might be worth another optimization pass in future.
+// It could be upgraded to 8 pixels at a time to start with.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "add        %1, %1, %0                     \n"  // %1 = pointer to the second row
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
+    MEMACCESS(1)
+    "ld1        {v1.8b}, [%1], %4              \n"
+    MEMACCESS(0)
+    "ld1        {v2.8b}, [%0], %4              \n"
+    MEMACCESS(1)
+    "ld1        {v3.8b}, [%1], %4              \n"
+    MEMACCESS(0)
+    "ld1        {v4.8b}, [%0], %4              \n"
+    MEMACCESS(1)
+    "ld1        {v5.8b}, [%1], %4              \n"
+    MEMACCESS(0)
+    "ld1        {v6.8b}, [%0], %4              \n"
+    MEMACCESS(1)
+    "ld1        {v7.8b}, [%1], %4              \n"
+    "uaddl      v0.8h, v0.8b, v1.8b            \n"  // row 1 + row 2, block 0
+    "uaddl      v2.8h, v2.8b, v3.8b            \n"  // row 1 + row 2, block 1
+    "uaddl      v4.8h, v4.8b, v5.8b            \n"  // row 1 + row 2, block 2
+    "uaddl      v6.8h, v6.8b, v7.8b            \n"  // row 1 + row 2, block 3
+    "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
+    "mov        v0.d[1], v2.d[0]               \n"
+    "mov        v2.d[0], v16.d[1]              \n"
+    "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
+    "mov        v4.d[1], v6.d[0]               \n"
+    "mov        v6.d[0], v16.d[1]              \n"
+    "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
+    "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
+    "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
+    "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
+    "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
+    MEMACCESS(2)
+    "st1     {v0.16b}, [%2], #16               \n"  // store 4 averaged pixels
+    "b.gt       1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(src_stride),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(dst_width)    // %3
+  : "r"((int64)(src_stepx * 4)) // %4: source step in bytes
+  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+  );
+}
+
+// Loads the 4-byte pixel at src_argb + 4 * (x >> 16) into lane n of vn, then advances x by dx.
+// TODO(Yang Zhang): Investigate fewer load instructions for the x/dx stepping.
+#define LOAD1_DATA32_LANE(vn, n)                               \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5, lsl #2             \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "ld1        {"#vn".s}["#n"], [%6]          \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  const uint8* src_tmp = src_argb;  // %6: scratch address of the current pixel.
+  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
+  int64 x64 = (int64) x;  // x >> 16 is the source pixel index.
+  int64 dx64 = (int64) dx;  // Added to x after every pixel.
+  int64 tmp64 = 0;  // %5: scratch for x >> 16.
+  asm volatile (
+  "1:                                          \n"
+    LOAD1_DATA32_LANE(v0, 0)
+    LOAD1_DATA32_LANE(v0, 1)
+    LOAD1_DATA32_LANE(v0, 2)
+    LOAD1_DATA32_LANE(v0, 3)
+    LOAD1_DATA32_LANE(v1, 0)
+    LOAD1_DATA32_LANE(v1, 1)
+    LOAD1_DATA32_LANE(v1, 2)
+    LOAD1_DATA32_LANE(v1, 3)
+
+    MEMACCESS(0)
+    "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
+    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+    "b.gt        1b                            \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width64),      // %2
+    "+r"(x64),              // %3
+    "+r"(dx64),             // %4
+    "+r"(tmp64),            // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "v0", "v1"
+  );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// Loads the pixel at src_argb + 4 * (x >> 16) and the one after it into lane n of vn1 / vn2, then advances x by dx.
+// TODO(Yang Zhang): Investigate fewer load instructions for the x/dx stepping.
+#define LOAD2_DATA32_LANE(vn1, vn2, n)                         \
+    "lsr        %5, %3, #16                           \n"      \
+    "add        %6, %1, %5, lsl #2                    \n"      \
+    "add        %3, %3, %4                            \n"      \
+    MEMACCESS(6)                                               \
+    "ld2        {"#vn1".s, "#vn2".s}["#n"], [%6]      \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};  // Per-lane multipliers applied to dx.
+  int* tmp = dx_offset;  // %5: table pointer first, then scratch for x >> 16.
+  const uint8* src_tmp = src_argb;  // %6: scratch address of the current pixel pair.
+  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
+  int64 x64 = (int64) x;  // x >> 16 is the source pixel index.
+  int64 dx64 = (int64) dx;  // Added to x after every pixel.
+  asm volatile (
+    "dup        v0.4s, %w3                     \n"  // x
+    "dup        v1.4s, %w4                     \n"  // dx
+    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
+    "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
+    "mul        v1.4s, v1.4s, v2.4s            \n"  // 0, dx, 2 * dx, 3 * dx
+    "movi       v3.16b, #0x7f                  \n"  // 0x7F
+    "movi       v4.8h, #0x7f                   \n"  // 0x7F
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "add        v5.4s, v1.4s, v0.4s            \n"
+  "1:                                          \n"
+    // v0: a (pixel at the sample index)
+    // v1: b (the following pixel)
+    LOAD2_DATA32_LANE(v0, v1, 0)
+    LOAD2_DATA32_LANE(v0, v1, 1)
+    LOAD2_DATA32_LANE(v0, v1, 2)
+    LOAD2_DATA32_LANE(v0, v1, 3)
+    "shrn       v2.4h, v5.4s, #9               \n"  // x >> 9, narrowed to 16 bit
+    "and        v2.8b, v2.8b, v4.8b            \n"  // keep 7 fraction bits per pixel
+    "dup        v16.8b, v2.b[0]                \n"  // broadcast fraction of pixel 0
+    "dup        v17.8b, v2.b[2]                \n"  // broadcast fraction of pixel 1
+    "dup        v18.8b, v2.b[4]                \n"  // broadcast fraction of pixel 2
+    "dup        v19.8b, v2.b[6]                \n"  // broadcast fraction of pixel 3
+    "ext        v2.8b, v16.8b, v17.8b, #4      \n"  // f0 x 4, f1 x 4
+    "ext        v17.8b, v18.8b, v19.8b, #4     \n"  // f2 x 4, f3 x 4
+    "ins        v2.d[1], v17.d[0]              \n"  // f
+    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
+    "umull      v16.8h, v0.8b, v7.8b           \n"  // a * (0x7f ^ f)
+    "umull2     v17.8h, v0.16b, v7.16b         \n"
+    "umull      v18.8h, v1.8b, v2.8b           \n"  // b * f
+    "umull2     v19.8h, v1.16b, v2.16b         \n"
+    "add        v16.8h, v16.8h, v18.8h         \n"  // sum of both weighted pixels
+    "add        v17.8h, v17.8h, v19.8h         \n"
+    "shrn       v0.8b, v16.8h, #7              \n"  // >> 7, narrow to bytes
+    "shrn2      v0.16b, v17.8h, #7             \n"
+
+    MEMACCESS(0)
+    "st1     {v0.4s}, [%0], #16                \n"  // store pixels
+    "add     v5.4s, v5.4s, v6.4s               \n"  // step vector x by 4 * dx
+    "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
+    "b.gt    1b                                \n"
+  : "+r"(dst_argb),         // %0
+    "+r"(src_argb),         // %1
+    "+r"(dst_width64),      // %2
+    "+r"(x64),              // %3
+    "+r"(dx64),             // %4
+    "+r"(tmp),              // %5
+    "+r"(src_tmp)           // %6
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+    "v6", "v7", "v16", "v17", "v18", "v19"
+  );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/scale_win.cc b/libs/libyuv/source/scale_win.cc
new file mode 100644
index 0000000000..21b1ed923f
--- /dev/null
+++ b/libs/libyuv/source/scale_win.cc
@@ -0,0 +1,1357 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf2 =
+  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
+static uvec8 kShuf11 =
+  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf21 =
+  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Rounding constant: 2 in each of the 8 lanes.
+static vec16 kRound34 =
+  { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =  // Source bytes 0,3,6,8,11,14 into output bytes 0-5.
+  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =  // Source bytes 0,3,6,8,11,14 into output bytes 6-11.
+  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// Reads 32 pixels, throws half away (keeps the odd-indexed bytes) and writes 16 pixels.
+__declspec(naked)
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]           // source bytes 0-15
+    movdqu     xmm1, [eax + 16]      // source bytes 16-31
+    lea        eax,  [eax + 32]      // advance source by 32 bytes
+    psrlw      xmm0, 8               // isolate odd pixels.
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1            // pack the 16 words into 16 bytes
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]       // advance destination by 16 bytes
+    sub        ecx, 16               // 16 pixels written per iteration
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x1 rectangle to 16x1: each output is the rounded average of 2 adjacent source bytes.
+__declspec(naked)
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    psrlw      xmm4, 15              // 0x0001 in every word
+    packuswb   xmm4, xmm4            // 0x01 in every byte
+    pxor       xmm5, xmm5            // constant 0
+
+  wloop:
+    movdqu     xmm0, [eax]           // source bytes 0-15
+    movdqu     xmm1, [eax + 16]      // source bytes 16-31
+    lea        eax,  [eax + 32]      // advance source by 32 bytes
+    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pavgw      xmm0, xmm5      // (x + 1) / 2
+    pavgw      xmm1, xmm5
+    packuswb   xmm0, xmm1      // pack the 16 words into 16 bytes
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]       // advance destination by 16 bytes
+    sub        ecx, 16               // 16 pixels written per iteration
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 16x1: each output is computed from a 2x2 box as ((sum >> 1) + 1) >> 1.
+__declspec(naked)
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // esi is saved here and restored before ret
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+
+    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    psrlw      xmm4, 15              // 0x0001 in every word
+    packuswb   xmm4, xmm4            // 0x01 in every byte
+    pxor       xmm5, xmm5            // constant 0
+
+  wloop:
+    movdqu     xmm0, [eax]             // row 0, bytes 0-15
+    movdqu     xmm1, [eax + 16]        // row 0, bytes 16-31
+    movdqu     xmm2, [eax + esi]       // row 1, bytes 0-15
+    movdqu     xmm3, [eax + esi + 16]  // row 1, bytes 16-31
+    lea        eax,  [eax + 32]        // advance source by 32 bytes
+    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2      // vertical add
+    paddw      xmm1, xmm3
+    psrlw      xmm0, 1         // sum / 2
+    psrlw      xmm1, 1
+    pavgw      xmm0, xmm5      // (x + 1) / 2
+    pavgw      xmm1, xmm5
+    packuswb   xmm0, xmm1      // pack the 16 words into 16 bytes
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]       // advance destination by 16 bytes
+    sub        ecx, 16               // 16 pixels written per iteration
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+// Reads 64 pixels, throws half away (keeps the odd-indexed bytes) and writes 32 pixels.
+__declspec(naked)
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    vmovdqu     ymm0, [eax]          // source bytes 0-31
+    vmovdqu     ymm1, [eax + 32]     // source bytes 32-63
+    lea         eax,  [eax + 64]     // advance source by 64 bytes
+    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
+    vpsrlw      ymm1, ymm1, 8
+    vpackuswb   ymm0, ymm0, ymm1     // pack words into bytes (per 128-bit lane)
+    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
+    vmovdqu     [edx], ymm0
+    lea         edx, [edx + 32]      // advance destination by 32 bytes
+    sub         ecx, 32              // 32 pixels written per iteration
+    jg          wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x1 rectangle to 32x1: each output is the rounded average of 2 adjacent source bytes.
+__declspec(naked)
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov         eax, [esp + 4]        // src_ptr
+                                      // src_stride ignored
+    mov         edx, [esp + 12]       // dst_ptr
+    mov         ecx, [esp + 16]       // dst_width
+
+    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpsrlw      ymm4, ymm4, 15        // 0x0001 in every word
+    vpackuswb   ymm4, ymm4, ymm4      // 0x01 in every byte
+    vpxor       ymm5, ymm5, ymm5      // constant 0
+
+  wloop:
+    vmovdqu     ymm0, [eax]           // source bytes 0-31
+    vmovdqu     ymm1, [eax + 32]      // source bytes 32-63
+    lea         eax,  [eax + 64]      // advance source by 64 bytes
+    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm1, ymm1, ymm4
+    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm1, ymm1, ymm5
+    vpackuswb   ymm0, ymm0, ymm1      // pack words into bytes (per 128-bit lane)
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vmovdqu     [edx], ymm0
+    lea         edx, [edx + 32]       // advance destination by 32 bytes
+    sub         ecx, 32               // 32 pixels written per iteration
+    jg          wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// For rounding, average = (sum + 2) / 4
+// becomes average((sum >> 1), 0)
+// Blends 64x2 rectangle to 32x1 (one output byte per 2x2 source box).
+__declspec(naked)
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push        esi                   // esi is saved here and restored before ret
+    mov         eax, [esp + 4 + 4]    // src_ptr
+    mov         esi, [esp + 4 + 8]    // src_stride
+    mov         edx, [esp + 4 + 12]   // dst_ptr
+    mov         ecx, [esp + 4 + 16]   // dst_width
+
+    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpsrlw      ymm4, ymm4, 15        // 0x0001 in every word
+    vpackuswb   ymm4, ymm4, ymm4      // 0x01 in every byte
+    vpxor       ymm5, ymm5, ymm5      // constant 0
+
+  wloop:
+    vmovdqu     ymm0, [eax]             // row 0, bytes 0-31
+    vmovdqu     ymm1, [eax + 32]        // row 0, bytes 32-63
+    vmovdqu     ymm2, [eax + esi]       // row 1, bytes 0-31
+    vmovdqu     ymm3, [eax + esi + 32]  // row 1, bytes 32-63
+    lea         eax,  [eax + 64]        // advance source by 64 bytes
+    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm1, ymm1, ymm4
+    vpmaddubsw  ymm2, ymm2, ymm4
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpaddw      ymm0, ymm0, ymm2      // vertical add
+    vpaddw      ymm1, ymm1, ymm3
+    vpsrlw      ymm0, ymm0, 1         // (x + 2) / 4 = (x / 2 + 1) / 2
+    vpsrlw      ymm1, ymm1, 1
+    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm1, ymm1, ymm5
+    vpackuswb   ymm0, ymm0, ymm1      // pack words into bytes (per 128-bit lane)
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vmovdqu     [edx], ymm0
+    lea         edx, [edx + 32]       // advance destination by 32 bytes
+    sub         ecx, 32               // 32 pixels written per iteration
+    jg          wloop
+
+    pop         esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEROWDOWN2_AVX2
+
+// Point samples 32 pixels to 8 pixels (keeps byte 2 of every 4 source bytes).
+__declspec(naked)
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
+    psrld      xmm5, 24              // 0x000000ff in every dword
+    pslld      xmm5, 16              // 0x00ff0000 in every dword
+
+  wloop:
+    movdqu     xmm0, [eax]           // source bytes 0-15
+    movdqu     xmm1, [eax + 16]      // source bytes 16-31
+    lea        eax,  [eax + 32]      // advance source by 32 bytes
+    pand       xmm0, xmm5            // keep byte 2 of each dword
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1            // kept byte lands in the high byte of each word
+    psrlw      xmm0, 8               // move it to the low byte
+    packuswb   xmm0, xmm0            // 8 words -> 8 bytes
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]        // advance destination by 8 bytes
+    sub        ecx, 8                // 8 pixels written per iteration
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x4 rectangle to 8x1: each output is (sum of a 4x4 box + 8) >> 4.
+__declspec(naked)
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // esi and edi are saved here and restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_ptr
+    mov        esi, [esp + 8 + 8]    // src_stride
+    mov        edx, [esp + 8 + 12]   // dst_ptr
+    mov        ecx, [esp + 8 + 16]   // dst_width
+    lea        edi, [esi + esi * 2]  // src_stride * 3
+    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    psrlw      xmm4, 15              // 0x0001 in every word
+    movdqa     xmm5, xmm4            // copy of the 0x0001 words
+    packuswb   xmm4, xmm4            // 0x01 in every byte
+    psllw      xmm5, 3               // constant 0x0008
+
+  wloop:
+    movdqu     xmm0, [eax]           // average rows
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]     // row 1
+    movdqu     xmm3, [eax + esi + 16]
+    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2      // vertical add rows 0, 1
+    paddw      xmm1, xmm3
+    movdqu     xmm2, [eax + esi * 2]       // row 2
+    movdqu     xmm3, [eax + esi * 2 + 16]
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2      // add row 2
+    paddw      xmm1, xmm3
+    movdqu     xmm2, [eax + edi]           // row 3
+    movdqu     xmm3, [eax + edi + 16]
+    lea        eax, [eax + 32]             // advance source by 32 bytes
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2      // add row 3
+    paddw      xmm1, xmm3
+    phaddw     xmm0, xmm1      // add adjacent words: 4 columns per sum
+    paddw      xmm0, xmm5      // + 8 for round
+    psrlw      xmm0, 4         // /16 for average of 4 * 4
+    packuswb   xmm0, xmm0      // 8 words -> 8 bytes
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]        // advance destination by 8 bytes
+    sub        ecx, 8                // 8 pixels written per iteration
+    jg         wloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+// Point samples 64 pixels to 16 pixels (keeps byte 2 of every 4 source bytes).
+__declspec(naked)
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov         eax, [esp + 4]        // src_ptr
+                                      // src_stride ignored
+    mov         edx, [esp + 12]       // dst_ptr
+    mov         ecx, [esp + 16]       // dst_width
+    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
+    vpsrld      ymm5, ymm5, 24        // each dword = 0x000000ff
+    vpslld      ymm5, ymm5, 16        // each dword = 0x00ff0000
+
+  wloop:
+    vmovdqu     ymm0, [eax]
+    vmovdqu     ymm1, [eax + 32]
+    lea         eax,  [eax + 64]
+    vpand       ymm0, ymm0, ymm5      // isolate the selected byte per dword
+    vpand       ymm1, ymm1, ymm5
+    vpackuswb   ymm0, ymm0, ymm1      // words -> bytes (per 128 bit lane)
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpsrlw      ymm0, ymm0, 8         // move kept byte to the low half of word
+    vpackuswb   ymm0, ymm0, ymm0
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vmovdqu     [edx], xmm0           // store 16 pixels (low 128 bits)
+    lea         edx, [edx + 16]
+    sub         ecx, 16               // 16 output pixels per pass
+    jg          wloop                 // body runs at least once
+
+    vzeroupper                        // zero the upper halves of the ymm regs
+    ret
+  }
+}
+
+// Blends 64x4 rectangle to 16x1 (rounded mean of each 4x4 box of bytes).
+__declspec(naked)
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push        esi                   // esi/edi are restored before ret
+    push        edi
+    mov         eax, [esp + 8 + 4]    // src_ptr (+8 skips the two pushes)
+    mov         esi, [esp + 8 + 8]    // src_stride
+    mov         edx, [esp + 8 + 12]   // dst_ptr
+    mov         ecx, [esp + 8 + 16]   // dst_width
+    lea         edi, [esi + esi * 2]  // src_stride * 3
+    vpcmpeqb    ymm4, ymm4, ymm4            // constant 0x0101
+    vpsrlw      ymm4, ymm4, 15              // each word = 0x0001
+    vpsllw      ymm5, ymm4, 3               // constant 0x0008
+    vpackuswb   ymm4, ymm4, ymm4            // each byte = 0x01
+
+  wloop:
+    vmovdqu     ymm0, [eax]           // average rows
+    vmovdqu     ymm1, [eax + 32]
+    vmovdqu     ymm2, [eax + esi]
+    vmovdqu     ymm3, [eax + esi + 32]
+    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm1, ymm1, ymm4
+    vpmaddubsw  ymm2, ymm2, ymm4
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpaddw      ymm0, ymm0, ymm2      // vertical add rows 0, 1
+    vpaddw      ymm1, ymm1, ymm3
+    vmovdqu     ymm2, [eax + esi * 2]
+    vmovdqu     ymm3, [eax + esi * 2 + 32]
+    vpmaddubsw  ymm2, ymm2, ymm4
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpaddw      ymm0, ymm0, ymm2      // add row 2
+    vpaddw      ymm1, ymm1, ymm3
+    vmovdqu     ymm2, [eax + edi]
+    vmovdqu     ymm3, [eax + edi + 32]
+    lea         eax,  [eax + 64]
+    vpmaddubsw  ymm2, ymm2, ymm4
+    vpmaddubsw  ymm3, ymm3, ymm4
+    vpaddw      ymm0, ymm0, ymm2      // add row 3
+    vpaddw      ymm1, ymm1, ymm3
+    vphaddw     ymm0, ymm0, ymm1      // mutates
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vphaddw
+    vpaddw      ymm0, ymm0, ymm5      // + 8 for round
+    vpsrlw      ymm0, ymm0, 4         // /16 for average of 4 * 4
+    vpackuswb   ymm0, ymm0, ymm0
+    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vmovdqu     [edx], xmm0           // store 16 pixels (low 128 bits)
+    lea         edx, [edx + 16]
+    sub         ecx, 16               // 16 output pixels per pass
+    jg          wloop                 // body runs at least once
+
+    pop        edi
+    pop        esi
+    vzeroupper                        // zero the upper halves of the ymm regs
+    ret
+  }
+}
+#endif  // HAS_SCALEROWDOWN4_AVX2
+
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling (kShuf0..2 are defined outside this function).
+
+__declspec(naked)
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    movdqa     xmm3, xmmword ptr kShuf0  // aligned loads of the 3 shuffles
+    movdqa     xmm4, xmmword ptr kShuf1
+    movdqa     xmm5, xmmword ptr kShuf2
+
+  wloop:
+    movdqu     xmm0, [eax]           // source bytes 0..15
+    movdqu     xmm1, [eax + 16]      // source bytes 16..31
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm1            // keep bytes 16..31 for the 3rd output
+    palignr    xmm1, xmm0, 8         // xmm1 = source bytes 8..23
+    pshufb     xmm0, xmm3
+    pshufb     xmm1, xmm4
+    pshufb     xmm2, xmm5
+    movq       qword ptr [edx], xmm0       // output pixels 0..7
+    movq       qword ptr [edx + 8], xmm1   // output pixels 8..15
+    movq       qword ptr [edx + 16], xmm2  // output pixels 16..23
+    lea        edx, [edx + 24]
+    sub        ecx, 24               // 24 output pixels per pass
+    jg         wloop                 // body runs at least once
+
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 24x1 (the two rows are averaged with equal weight).
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+// Register usage:
+// xmm0 src_row 0
+// xmm1 src_row 1 (also reused inside the loop to hold kMadd21)
+// xmm2 shuf 0
+// xmm3 shuf 1
+// xmm4 shuf 2
+// xmm5 madd 0
+// xmm6 madd 1
+// xmm7 kRound34
+
+// Note that movdqa+palign may be better than movdqu.
+__declspec(naked)
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // restored before ret
+    mov        eax, [esp + 4 + 4]    // src_ptr (+4 skips the pushed esi)
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, xmmword ptr kShuf01
+    movdqa     xmm3, xmmword ptr kShuf11
+    movdqa     xmm4, xmmword ptr kShuf21
+    movdqa     xmm5, xmmword ptr kMadd01
+    movdqa     xmm6, xmmword ptr kMadd11
+    movdqa     xmm7, xmmword ptr kRound34
+
+  wloop:
+    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm1, [eax + esi]
+    pavgb      xmm0, xmm1            // rounded average of row 0 and row 1
+    pshufb     xmm0, xmm2
+    pmaddubsw  xmm0, xmm5            // weighted sum of each shuffled byte pair
+    paddsw     xmm0, xmm7            // add kRound34
+    psrlw      xmm0, 2               // divide by 4
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm1, [eax + esi + 8]
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm3
+    pmaddubsw  xmm0, xmm6
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 8], xmm0
+    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm1, [eax + esi + 16]
+    lea        eax, [eax + 32]
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm4
+    movdqa     xmm1, xmmword ptr kMadd21  // reloaded each pass; xmm2-7 are busy
+    pmaddubsw  xmm0, xmm1
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 16], xmm0
+    lea        edx, [edx + 24]
+    sub        ecx, 24               // 24 output pixels per pass
+    jg         wloop                 // body runs at least once
+
+    pop        esi
+    ret
+  }
+}
+
+// Row 0 weighted: avg(row0, avg(row0, row1)). movdqa+palign may beat movdqu.
+__declspec(naked)
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // restored before ret
+    mov        eax, [esp + 4 + 4]    // src_ptr (+4 skips the pushed esi)
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, xmmword ptr kShuf01
+    movdqa     xmm3, xmmword ptr kShuf11
+    movdqa     xmm4, xmmword ptr kShuf21
+    movdqa     xmm5, xmmword ptr kMadd01
+    movdqa     xmm6, xmmword ptr kMadd11
+    movdqa     xmm7, xmmword ptr kRound34
+
+  wloop:
+    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm1, [eax + esi]
+    pavgb      xmm1, xmm0            // xmm1 = avg(row 0, row 1)
+    pavgb      xmm0, xmm1            // xmm0 = avg(row 0, xmm1)
+    pshufb     xmm0, xmm2
+    pmaddubsw  xmm0, xmm5            // weighted sum of each shuffled byte pair
+    paddsw     xmm0, xmm7            // add kRound34
+    psrlw      xmm0, 2               // divide by 4
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm1, [eax + esi + 8]
+    pavgb      xmm1, xmm0
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm3
+    pmaddubsw  xmm0, xmm6
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 8], xmm0
+    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm1, [eax + esi + 16]
+    lea        eax, [eax + 32]
+    pavgb      xmm1, xmm0
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm4
+    movdqa     xmm1, xmmword ptr kMadd21  // reloaded each pass; xmm2-7 are busy
+    pmaddubsw  xmm0, xmm1
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 16], xmm0
+    lea        edx, [edx+24]
+    sub        ecx, 24               // 24 output pixels per pass
+    jg         wloop                 // body runs at least once
+
+    pop        esi
+    ret
+  }
+}
+
+// 3/8 point sampler
+
+// Scale 32 pixels to 12 (6 taken from each 16 byte half of the input).
+__declspec(naked)
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    movdqa     xmm4, xmmword ptr kShuf38a
+    movdqa     xmm5, xmmword ptr kShuf38b
+
+  xloop:
+    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
+    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
+    lea        eax, [eax + 32]
+    pshufb     xmm0, xmm4
+    pshufb     xmm1, xmm5
+    paddusb    xmm0, xmm1            // merge halves (presumably disjoint bytes)
+
+    movq       qword ptr [edx], xmm0  // write 12 pixels
+    movhlps    xmm1, xmm0             // high qword of xmm0 -> low qword of xmm1
+    movd       [edx + 8], xmm1        // last 4 of the 12 bytes
+    lea        edx, [edx + 12]
+    sub        ecx, 12               // 12 output pixels per pass
+    jg         xloop                 // body runs at least once
+
+    ret
+  }
+}
+
+// Scale 16x3 pixels to 6x1 with interpolation
+__declspec(naked)
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // restored before ret
+    mov        eax, [esp + 4 + 4]    // src_ptr (+4 skips the pushed esi)
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, xmmword ptr kShufAc
+    movdqa     xmm3, xmmword ptr kShufAc3
+    movdqa     xmm4, xmmword ptr kScaleAc33
+    pxor       xmm5, xmm5            // zero, for widening bytes to words
+
+  xloop:
+    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
+    movdqu     xmm6, [eax + esi]
+    movhlps    xmm1, xmm0            // high 8 bytes of row 0
+    movhlps    xmm7, xmm6            // high 8 bytes of row 1
+    punpcklbw  xmm0, xmm5            // widen low 8 bytes to words
+    punpcklbw  xmm1, xmm5
+    punpcklbw  xmm6, xmm5
+    punpcklbw  xmm7, xmm5
+    paddusw    xmm0, xmm6
+    paddusw    xmm1, xmm7
+    movdqu     xmm6, [eax + esi * 2] // row 2
+    lea        eax, [eax + 16]
+    movhlps    xmm7, xmm6
+    punpcklbw  xmm6, xmm5
+    punpcklbw  xmm7, xmm5
+    paddusw    xmm0, xmm6
+    paddusw    xmm1, xmm7
+
+    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
+    psrldq     xmm0, 2               // shift right by one word
+    paddusw    xmm6, xmm0
+    psrldq     xmm0, 2
+    paddusw    xmm6, xmm0            // each word = sum of 3 adjacent columns
+    pshufb     xmm6, xmm2
+
+    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
+    psrldq     xmm1, 2
+    paddusw    xmm7, xmm1
+    psrldq     xmm1, 2
+    paddusw    xmm7, xmm1
+    pshufb     xmm7, xmm3
+    paddusw    xmm6, xmm7
+
+    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
+    packuswb   xmm6, xmm6
+
+    movd       [edx], xmm6           // write 6 pixels
+    psrlq      xmm6, 16              // drop bytes 0..1
+    movd       [edx + 2], xmm6       // bytes 2..5 (overlaps the first store)
+    lea        edx, [edx + 6]
+    sub        ecx, 6                // 6 output pixels per pass
+    jg         xloop                 // body runs at least once
+
+    pop        esi
+    ret
+  }
+}
+
+// Scale 16x2 pixels to 6x1 with interpolation
+__declspec(naked)
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // restored before ret
+    mov        eax, [esp + 4 + 4]    // src_ptr (+4 skips the pushed esi)
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, xmmword ptr kShufAb0
+    movdqa     xmm3, xmmword ptr kShufAb1
+    movdqa     xmm4, xmmword ptr kShufAb2
+    movdqa     xmm5, xmmword ptr kScaleAb2
+
+  xloop:
+    movdqu     xmm0, [eax]           // average 2 rows into xmm0
+    movdqu     xmm1, [eax + esi]
+    lea        eax, [eax + 16]
+    pavgb      xmm0, xmm1            // rounded byte average of the two rows
+
+    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
+    pshufb     xmm1, xmm2
+    movdqa     xmm6, xmm0
+    pshufb     xmm6, xmm3
+    paddusw    xmm1, xmm6
+    pshufb     xmm0, xmm4
+    paddusw    xmm1, xmm0            // sum of the three shuffled copies
+
+    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
+    packuswb   xmm1, xmm1
+
+    movd       [edx], xmm1           // write 6 pixels
+    psrlq      xmm1, 16              // drop bytes 0..1
+    movd       [edx + 2], xmm1       // bytes 2..5 (overlaps the first store)
+    lea        edx, [edx + 6]
+    sub        ecx, 6                // 6 output pixels per pass
+    jg         xloop                 // body runs at least once
+
+    pop        esi
+    ret
+  }
+}
+
+// Reads 16 bytes and accumulates to 16 shorts at a time (saturating add).
+__declspec(naked)
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_ptr
+    mov        edx, [esp + 8]   // dst_ptr
+    mov        ecx, [esp + 12]  // src_width
+    pxor       xmm5, xmm5       // zero, for widening bytes to words
+
+  // sum rows
+  xloop:
+    movdqu     xmm3, [eax]       // read 16 bytes
+    lea        eax, [eax + 16]
+    movdqu     xmm0, [edx]       // read 16 words from destination
+    movdqu     xmm1, [edx + 16]
+    movdqa     xmm2, xmm3
+    punpcklbw  xmm2, xmm5        // low 8 bytes -> 8 words
+    punpckhbw  xmm3, xmm5        // high 8 bytes -> 8 words
+    paddusw    xmm0, xmm2        // sum 16 words
+    paddusw    xmm1, xmm3
+    movdqu     [edx], xmm0       // write 16 words to destination
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 16           // 16 source bytes per pass
+    jg         xloop             // body runs at least once
+    ret
+  }
+}
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time (saturating add).
+__declspec(naked)
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  __asm {
+    mov         eax, [esp + 4]   // src_ptr
+    mov         edx, [esp + 8]   // dst_ptr
+    mov         ecx, [esp + 12]  // src_width
+    vpxor       ymm5, ymm5, ymm5 // zero, for widening bytes to words
+
+  // sum rows
+  xloop:
+    vmovdqu     ymm3, [eax]       // read 32 bytes
+    lea         eax, [eax + 32]
+    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
+    vpunpcklbw  ymm2, ymm3, ymm5
+    vpunpckhbw  ymm3, ymm3, ymm5
+    vpaddusw    ymm0, ymm2, [edx] // sum first 16 words
+    vpaddusw    ymm1, ymm3, [edx + 32]  // sum next 16 words
+    vmovdqu     [edx], ymm0       // write 32 words to destination
+    vmovdqu     [edx + 32], ymm1
+    lea         edx, [edx + 64]
+    sub         ecx, 32           // 32 source bytes per pass
+    jg          xloop             // body runs at least once
+
+    vzeroupper                    // zero the upper halves of the ymm regs
+    ret
+  }
+}
+#endif  // HAS_SCALEADDROW_AVX2
+
+// Bilinear column filtering. SSSE3 version. x and dx are used as 16.16 fixed.
+__declspec(naked)
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx) {
+  __asm {
+    push       ebx                    // ebx/esi/edi are restored before ret
+    push       esi
+    push       edi
+    mov        edi, [esp + 12 + 4]    // dst_ptr (+12 skips the three pushes)
+    mov        esi, [esp + 12 + 8]    // src_ptr
+    mov        ecx, [esp + 12 + 12]   // dst_width
+    movd       xmm2, [esp + 12 + 16]  // x
+    movd       xmm3, [esp + 12 + 20]  // dx
+    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
+    movd       xmm5, eax
+    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    psrlw      xmm6, 9
+    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    sub        ecx, 2               // bias count: sign tells if a pair remains
+    jl         xloop29              // fewer than 2 pixels: skip the pair loop
+
+    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    paddd      xmm0, xmm3
+    punpckldq  xmm2, xmm0           // x0 x1
+    punpckldq  xmm3, xmm3           // dx dx
+    paddd      xmm3, xmm3           // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+
+    // 2 Pixel loop.
+  xloop2:
+    movdqa     xmm1, xmm2           // x0, x1 fractions.
+    paddd      xmm2, xmm3           // x += dx
+    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
+    movd       xmm0, ebx
+    psrlw      xmm1, 9              // 7 bit fractions.
+    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
+    movd       xmm4, ebx
+    pshufb     xmm1, xmm5           // 0011
+    punpcklwd  xmm0, xmm4
+    pxor       xmm1, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
+    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
+    movd       ebx, xmm0
+    mov        [edi], bx            // store 2 pixels
+    lea        edi, [edi + 2]
+    sub        ecx, 2               // 2 pixels
+    jge        xloop2
+
+ xloop29:
+
+    add        ecx, 2 - 1           // undo bias; ecx >= 0 iff 1 pixel is left
+    jl         xloop99
+
+    // 1 pixel remainder
+    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
+    movd       xmm0, ebx
+    psrlw      xmm2, 9              // 7 bit fractions.
+    pshufb     xmm2, xmm5           // 0011
+    pxor       xmm2, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm2           // 16 bit
+    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0           // 8 bits
+    movd       ebx, xmm0
+    mov        [edi], bl            // store 1 pixel
+
+ xloop99:
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+// Reads 16 pixels, duplicates them and writes 32 pixels. x and dx are not read.
+__declspec(naked)
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx) {
+  __asm {
+    mov        edx, [esp + 4]    // dst_ptr
+    mov        eax, [esp + 8]    // src_ptr
+    mov        ecx, [esp + 12]   // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]
+    lea        eax,  [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm0        // double each of the low 8 bytes
+    punpckhbw  xmm1, xmm1        // double each of the high 8 bytes
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 32           // 32 output pixels per pass
+    jg         wloop             // body runs at least once
+
+    ret
+  }
+}
+
+// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)
+__declspec(naked)
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_argb
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_argb
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]           // pixels 0..3 (4 bytes each)
+    movdqu     xmm1, [eax + 16]      // pixels 4..7
+    lea        eax,  [eax + 32]
+    shufps     xmm0, xmm1, 0xdd      // odd pixels
+    movdqu     [edx], xmm0           // store 4 pixels
+    lea        edx, [edx + 16]
+    sub        ecx, 4                // 4 output pixels per pass
+    jg         wloop                 // body runs at least once
+
+    ret
+  }
+}
+
+// Blends 8x1 rectangle to 4x1 (rounded average of each horizontal pixel pair).
+__declspec(naked)
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_argb
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_argb
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]           // pixels 0..3 (4 bytes each)
+    movdqu     xmm1, [eax + 16]      // pixels 4..7
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm0
+    shufps     xmm0, xmm1, 0x88      // even pixels
+    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    pavgb      xmm0, xmm2            // per byte rounded average of even and odd
+    movdqu     [edx], xmm0           // store 4 pixels
+    lea        edx, [edx + 16]
+    sub        ecx, 4                // 4 output pixels per pass
+    jg         wloop                 // body runs at least once
+
+    ret
+  }
+}
+
+// Blends 8x2 rectangle to 4x1 (rows are averaged first, then column pairs).
+__declspec(naked)
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width) {
+  __asm {
+    push       esi                   // restored before ret
+    mov        eax, [esp + 4 + 4]    // src_argb (+4 skips the pushed esi)
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_argb
+    mov        ecx, [esp + 4 + 16]   // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]           // row 0, pixels 0..3
+    movdqu     xmm1, [eax + 16]      // row 0, pixels 4..7
+    movdqu     xmm2, [eax + esi]     // row 1, pixels 0..3
+    movdqu     xmm3, [eax + esi + 16]  // row 1, pixels 4..7
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88      // even pixels
+    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    pavgb      xmm0, xmm2
+    movdqu     [edx], xmm0           // store 4 pixels
+    lea        edx, [edx + 16]
+    sub        ecx, 4                // 4 output pixels per pass
+    jg         wloop                 // body runs at least once
+
+    pop        esi
+    ret
+  }
+}
+
+// Reads 4 pixels at a time, one every src_stepx pixels, and writes them packed.
+__declspec(naked)
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width) {
+  __asm {
+    push       ebx                   // ebx/edi are restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_argb (+8 skips the two pushes)
+                                     // src_stride ignored
+    mov        ebx, [esp + 8 + 12]   // src_stepx
+    mov        edx, [esp + 8 + 16]   // dst_argb
+    mov        ecx, [esp + 8 + 20]   // dst_width
+    lea        ebx, [ebx * 4]        // step in bytes (4 bytes per pixel)
+    lea        edi, [ebx + ebx * 2]  // 3 * step in bytes
+
+  wloop:
+    movd       xmm0, [eax]           // pixel at offset 0
+    movd       xmm1, [eax + ebx]     // pixel at 1 step
+    punpckldq  xmm0, xmm1
+    movd       xmm2, [eax + ebx * 2] // pixel at 2 steps
+    movd       xmm3, [eax + edi]     // pixel at 3 steps
+    lea        eax,  [eax + ebx * 4] // advance source by 4 steps
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm0, xmm2            // pack the 4 pixels into one register
+    movdqu     [edx], xmm0           // store 4 pixels
+    lea        edx, [edx + 16]
+    sub        ecx, 4                // 4 output pixels per pass
+    jg         wloop                 // body runs at least once
+
+    pop        edi
+    pop        ebx
+    ret
+  }
+}
+
+// Blends four 2x2 to 4x1. The 2x2 boxes start src_stepx pixels apart.
+__declspec(naked)
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  __asm {
+    push       ebx                    // ebx/esi/edi are restored before ret
+    push       esi
+    push       edi
+    mov        eax, [esp + 12 + 4]    // src_argb (+12 skips the three pushes)
+    mov        esi, [esp + 12 + 8]    // src_stride
+    mov        ebx, [esp + 12 + 12]   // src_stepx
+    mov        edx, [esp + 12 + 16]   // dst_argb
+    mov        ecx, [esp + 12 + 20]   // dst_width
+    lea        esi, [eax + esi]       // row1 pointer
+    lea        ebx, [ebx * 4]         // step in bytes (4 bytes per pixel)
+    lea        edi, [ebx + ebx * 2]   // 3 * step in bytes
+
+  wloop:
+    movq       xmm0, qword ptr [eax]  // row0 4 pairs
+    movhps     xmm0, qword ptr [eax + ebx]
+    movq       xmm1, qword ptr [eax + ebx * 2]
+    movhps     xmm1, qword ptr [eax + edi]
+    lea        eax,  [eax + ebx * 4]  // advance row0 by 4 steps
+    movq       xmm2, qword ptr [esi]  // row1 4 pairs
+    movhps     xmm2, qword ptr [esi + ebx]
+    movq       xmm3, qword ptr [esi + ebx * 2]
+    movhps     xmm3, qword ptr [esi + edi]
+    lea        esi,  [esi + ebx * 4]  // advance row1 by 4 steps
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88      // even pixels
+    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    pavgb      xmm0, xmm2
+    movdqu     [edx], xmm0           // store 4 pixels
+    lea        edx, [edx + 16]
+    sub        ecx, 4                // 4 output pixels per pass
+    jg         wloop                 // body runs at least once
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+// Column scaling unfiltered. SSE2 version. x and dx are used as 16.16 fixed.
+__declspec(naked)
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  __asm {
+    push       edi                   // edi/esi are restored before ret
+    push       esi
+    mov        edi, [esp + 8 + 4]    // dst_argb (+8 skips the two pushes)
+    mov        esi, [esp + 8 + 8]    // src_argb
+    mov        ecx, [esp + 8 + 12]   // dst_width
+    movd       xmm2, [esp + 8 + 16]  // x
+    movd       xmm3, [esp + 8 + 20]  // dx
+
+    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
+    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
+    paddd      xmm2, xmm0
+    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
+    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
+    paddd      xmm2, xmm0            // x3 x2 x1 x0
+    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
+    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4
+
+    pextrw     eax, xmm2, 1          // get x0 integer.
+    pextrw     edx, xmm2, 3          // get x1 integer.
+
+    cmp        ecx, 0
+    jle        xloop99               // dst_width <= 0: nothing to write
+    sub        ecx, 4                // bias count: sign tells if 4 remain
+    jl         xloop49               // fewer than 4 pixels: go to remainders
+
+    // 4 Pixel loop.
+ xloop4:
+    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
+    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
+    pextrw     eax, xmm2, 5           // get x2 integer.
+    pextrw     edx, xmm2, 7           // get x3 integer.
+    paddd      xmm2, xmm3             // x += dx
+    punpckldq  xmm0, xmm1             // x0 x1
+
+    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
+    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
+    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
+    punpckldq  xmm1, xmm4             // x2 x3
+    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
+    movdqu     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 4                 // 4 pixels
+    jge        xloop4
+
+ xloop49:
+    test       ecx, 2                 // ecx = left - 4; low 2 bits = left
+    je         xloop29
+
+    // 2 Pixels.
+    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
+    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
+    pextrw     eax, xmm2, 5           // get x2 integer.
+    punpckldq  xmm0, xmm1             // x0 x1
+
+    movq       qword ptr [edi], xmm0
+    lea        edi, [edi + 8]
+
+ xloop29:
+    test       ecx, 1                 // is one last pixel left?
+    je         xloop99
+
+    // 1 Pixels.
+    movd       xmm0, [esi + eax * 4]  // x2 if 2 were just written, else x0
+    movd       dword ptr [edi], xmm0
+ xloop99:
+
+    pop        esi
+    pop        edi
+    ret
+  }
+}
+
+// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
+// TODO(fbarchard): Port to Neon
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {  // pairs byte n with byte n + 4 (next pixel)
+  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
+  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {  // byte 0 -> bytes 0..7, byte 4 -> 8..15
+  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+__declspec(naked)  // Blends 2 adjacent ARGB pixels per output; x, dx are 16.16.
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx) {
+  __asm {
+    push       esi                   // esi/edi are restored before ret
+    push       edi
+    mov        edi, [esp + 8 + 4]    // dst_argb (+8 skips the two pushes)
+    mov        esi, [esp + 8 + 8]    // src_argb
+    mov        ecx, [esp + 8 + 12]   // dst_width
+    movd       xmm2, [esp + 8 + 16]  // x
+    movd       xmm3, [esp + 8 + 20]  // dx
+    movdqa     xmm4, xmmword ptr kShuffleColARGB
+    movdqa     xmm5, xmmword ptr kShuffleFractions
+    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    psrlw      xmm6, 9
+    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    sub        ecx, 2               // bias count: sign tells if a pair remains
+    jl         xloop29              // fewer than 2 pixels: skip the pair loop
+
+    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    paddd      xmm0, xmm3
+    punpckldq  xmm2, xmm0           // x0 x1
+    punpckldq  xmm3, xmm3           // dx dx
+    paddd      xmm3, xmm3           // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+
+    // 2 Pixel loop.
+  xloop2:
+    movdqa     xmm1, xmm2           // x0, x1 fractions.
+    paddd      xmm2, xmm3           // x += dx
+    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
+    psrlw      xmm1, 9              // 7 bit fractions.
+    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
+    pshufb     xmm1, xmm5           // 0000000011111111
+    pshufb     xmm0, xmm4           // arrange pixels into pairs
+    pxor       xmm1, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
+    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
+    movq       qword ptr [edi], xmm0  // store 2 pixels
+    lea        edi, [edi + 8]
+    sub        ecx, 2               // 2 pixels
+    jge        xloop2
+
+ xloop29:
+
+    add        ecx, 2 - 1           // undo bias; ecx >= 0 iff 1 pixel is left
+    jl         xloop99
+
+    // 1 pixel remainder
+    psrlw      xmm2, 9              // 7 bit fractions.
+    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
+    pshufb     xmm2, xmm5           // 00000000
+    pshufb     xmm0, xmm4           // arrange pixels into pairs
+    pxor       xmm2, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
+    psrlw      xmm0, 7
+    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
+    movd       [edi], xmm0          // store 1 pixel
+
+ xloop99:
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels. x and dx are not read.
+__declspec(naked)
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {
+  __asm {
+    mov        edx, [esp + 4]    // dst_argb
+    mov        eax, [esp + 8]    // src_argb
+    mov        ecx, [esp + 12]   // dst_width
+
+  wloop:
+    movdqu     xmm0, [eax]       // 4 source pixels (4 bytes each)
+    lea        eax,  [eax + 16]
+    movdqa     xmm1, xmm0
+    punpckldq  xmm0, xmm0        // pixels 0, 0, 1, 1
+    punpckhdq  xmm1, xmm1        // pixels 2, 2, 3, 3
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8            // 8 output pixels per pass
+    jg         wloop             // body runs at least once
+
+    ret
+  }
+}
+
+// Divide num by div and return as 16.16 fixed point result: (num << 16) / div.
+__declspec(naked)
+int FixedDiv_X86(int num, int div) {
+  __asm {
+    mov        eax, [esp + 4]    // num
+    cdq                          // extend num to 64 bits
+    shld       edx, eax, 16      // 32.16
+    shl        eax, 16           // edx:eax = num << 16 as a 64 bit value
+    idiv       dword ptr [esp + 8]  // eax = quotient; faults if div is 0
+    ret                          // result is returned in eax
+  }
+}
+
+// Returns ((num << 16) - 0x00010001) / (div - 1) as a 16.16 fixed point result.
+__declspec(naked)
+int FixedDiv1_X86(int num, int div) {
+  __asm {
+    mov        eax, [esp + 4]    // num
+    mov        ecx, [esp + 8]    // denom
+    cdq                          // extend num to 64 bits
+    shld       edx, eax, 16      // 32.16
+    shl        eax, 16           // edx:eax = num << 16 as a 64 bit value
+    sub        eax, 0x00010001   // 64 bit subtract, low half
+    sbb        edx, 0            // propagate the borrow into the high half
+    sub        ecx, 1            // denom - 1
+    idiv       ecx               // eax = quotient; faults if div - 1 is 0
+    ret                          // result is returned in eax
+  }
+}
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libyuv/source/video_common.cc b/libs/libyuv/source/video_common.cc
new file mode 100644
index 0000000000..379a0669ae
--- /dev/null
+++ b/libs/libyuv/source/video_common.cc
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
+
+struct FourCCAliasEntry {
+  uint32 alias;      // fourcc value that CanonicalFourCC() matches against
+  uint32 canonical;  // fourcc value returned when alias matches
+};
+
+static const struct FourCCAliasEntry kFourCCAliases[] = {  // {alias, canonical}
+  {FOURCC_IYUV, FOURCC_I420},
+  {FOURCC_YU16, FOURCC_I422},
+  {FOURCC_YU24, FOURCC_I444},
+  {FOURCC_YUYV, FOURCC_YUY2},
+  {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
+  {FOURCC_HDYC, FOURCC_UYVY},
+  {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
+  {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
+  {FOURCC_DMB1, FOURCC_MJPG},
+  {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
+  {FOURCC_RGB3, FOURCC_RAW },
+  {FOURCC_BGR3, FOURCC_24BG},
+  {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
+  {FOURCC_CM24, FOURCC_RAW },  // kCMPixelFormat_24RGB
+  {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
+  {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
+  {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
+};
+// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
+//  {FOURCC_BGRA, FOURCC_ARGB},  // kCMPixelFormat_32BGRA
+
+LIBYUV_API
+uint32 CanonicalFourCC(uint32 fourcc) {
+  int i;
+  for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {  // linear alias scan
+    if (kFourCCAliases[i].alias == fourcc) {
+      return kFourCCAliases[i].canonical;  // first matching entry wins
+    }
+  }
+  // Not an alias, so return it as-is.
+  return fourcc;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/libs/libyuv/sync_chromium.py b/libs/libyuv/sync_chromium.py
new file mode 100755
index 0000000000..53341a78a3
--- /dev/null
+++ b/libs/libyuv/sync_chromium.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""Script to download a Chromium checkout into the workspace.
+
+The script downloads a full Chromium Git clone and its DEPS.
+
+The following environment variable can be used to alter the behavior:
+* CHROMIUM_NO_HISTORY - If set to 1, a Git checkout with no history will be
+  downloaded. This consumes less bandwidth and disk space but is known to be
+  slower in general if you have a high-speed connection.
+
+After a successful sync has completed, a .last_sync_chromium file is written to
+the chromium directory. While it exists, no more gclient sync operations will be
+performed until the --target-revision changes or the SCRIPT_VERSION constant is
+incremented. The file can be removed manually to force a new sync.
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+
+# Bump this whenever the algorithm changes and you need bots/devs to re-sync,
+# ignoring the .last_sync_chromium file
+SCRIPT_VERSION = 4
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+CHROMIUM_NO_HISTORY = 'CHROMIUM_NO_HISTORY'
+
+def _parse_gclient_dict():
+  gclient_dict = {}  # namespace the .gclient file is executed into
+  try:
+    main_gclient = os.path.join(os.path.dirname(ROOT_DIR), '.gclient')
+    with open(main_gclient, 'rb') as deps_content:
+      exec(deps_content, gclient_dict)  # NOTE(review): runs .gclient as code
+  except Exception as e:
+    print >> sys.stderr, 'error while parsing .gclient:', e
+  return gclient_dict  # still returned after a logged failure
+
+
+def get_cache_dir():
+  return _parse_gclient_dict().get('cache_dir')  # None when not set
+
+
+def get_target_os_list():
+  return ','.join(_parse_gclient_dict().get('target_os', []))  # '' if unset
+
+
+def main():
+  CR_DIR = os.path.join(ROOT_DIR, 'chromium')  # default --chromium-dir
+
+  p = argparse.ArgumentParser()
+  p.add_argument('--target-revision', required=True,
+                 help='The target chromium git revision [REQUIRED]')
+  p.add_argument('--chromium-dir', default=CR_DIR,
+                 help=('The path to the chromium directory to sync '
+                       '(default: %(default)r)'))
+  opts = p.parse_args()
+  opts.chromium_dir = os.path.abspath(opts.chromium_dir)
+
+  target_os_list = get_target_os_list()
+
+  # Do a quick check to see if we were successful last time to make runhooks
+  # sooper fast.
+  flag_file = os.path.join(opts.chromium_dir, '.last_sync_chromium')
+  flag_file_content = '\n'.join([
+    str(SCRIPT_VERSION),
+    opts.target_revision,
+    repr(target_os_list),
+  ])
+  if (os.path.exists(os.path.join(opts.chromium_dir, 'src')) and
+      os.path.exists(flag_file)):
+    with open(flag_file, 'r') as f:
+      if f.read() == flag_file_content:
+        print 'Chromium already up to date: ', opts.target_revision
+        return 0
+    os.unlink(flag_file)  # stale flag: remove it and fall through to sync
+
+  env = os.environ.copy()
+
+  # Avoid downloading NaCl toolchain as part of the Chromium hooks.
+  env.setdefault('GYP_DEFINES', '')
+  env['GYP_DEFINES'] += ' disable_nacl=1'
+  env['GYP_CHROMIUM_NO_ACTION'] = '1'
+  gclient_cmd = 'gclient.bat' if sys.platform.startswith('win') else 'gclient'
+  args = [
+      gclient_cmd, 'sync', '--force', '--revision', 'src@'+opts.target_revision
+  ]
+
+  if os.environ.get('CHROME_HEADLESS') == '1':
+    # Running on a buildbot.
+    args.append('-vvv')
+
+    if sys.platform.startswith('win'):
+      cache_path = os.path.join(os.path.splitdrive(ROOT_DIR)[0] + os.path.sep,
+                                'b', 'git-cache')
+    else:
+      cache_path = '/b/git-cache'
+  else:
+    # Support developers setting the cache_dir in .gclient.
+    cache_path = get_cache_dir()
+
+  # Allow for users with poor internet connections to download a Git clone
+  # without history (saves several gigs but is generally slower and doesn't work
+  # with the Git cache).
+  if os.environ.get(CHROMIUM_NO_HISTORY) == '1':
+    if cache_path:
+      print >> sys.stderr, (
+          'You cannot use "no-history" mode for syncing Chrome (i.e. set the '
+          '%s environment variable to 1) when you have cache_dir configured in '
+          'your .gclient.' % CHROMIUM_NO_HISTORY)
+      return 1
+    args.append('--no-history')
+    gclient_entries_file = os.path.join(opts.chromium_dir, '.gclient_entries')
+  else:
+    # Write a temporary .gclient file that has the cache_dir variable added.
+    gclientfile = os.path.join(opts.chromium_dir, '.gclient')
+    with open(gclientfile, 'rb') as spec:
+      spec = spec.read().splitlines()
+      spec[-1] = 'cache_dir = %r' % (cache_path,)  # overwrites the last line
+    with open(gclientfile + '.tmp', 'wb') as f:
+      f.write('\n'.join(spec))
+
+    args += [
+      '--gclientfile', '.gclient.tmp',
+      '--delete_unversioned_trees', '--reset', '--upstream'
+    ]
+    gclient_entries_file = os.path.join(opts.chromium_dir,
+                                        '.gclient.tmp_entries')
+
+  # To avoid gclient sync problems when DEPS entries have been removed we must
+  # wipe the gclient's entries file that contains cached URLs for all DEPS.
+  if os.path.exists(gclient_entries_file):
+    os.unlink(gclient_entries_file)
+
+  if target_os_list:
+    args += ['--deps=' + target_os_list]
+
+  print 'Running "%s" in %s' % (' '.join(args), opts.chromium_dir)
+  ret = subprocess.call(args, cwd=opts.chromium_dir, env=env)
+  if ret == 0:
+    with open(flag_file, 'wb') as f:
+      f.write(flag_file_content)  # lets the next run exit early
+
+  return ret  # gclient's exit status
+
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/libs/libyuv/third_party/gflags/BUILD.gn b/libs/libyuv/third_party/gflags/BUILD.gn
new file mode 100644
index 0000000000..913c558754
--- /dev/null
+++ b/libs/libyuv/third_party/gflags/BUILD.gn
@@ -0,0 +1,68 @@
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This is a copy of WebRTC's BUILD.gn.
+
+if (is_win) {
+  gflags_gen_arch_root = "gen/win"
+} else {
+  gflags_gen_arch_root = "gen/posix"
+}
+
+config("gflags_config") {
+  include_dirs = [
+    "$gflags_gen_arch_root/include",  # For configured files.
+    "src",  # For everything else.
+  ]
+
+  defines = [
+    # These macros exist so flags and symbols are properly exported when
+    # building DLLs. Since we don't build DLLs, we need to disable them.
+    "GFLAGS_DLL_DECL=",
+    "GFLAGS_DLL_DECLARE_FLAG=",
+    "GFLAGS_DLL_DEFINE_FLAG=",
+  ]
+
+  # GN orders flags on a target before flags from configs. The default config
+  # adds -Wall, and this flag has to be after -Wall -- so it needs to
+  # come from a config and can't be on the target directly.
+  if (is_clang) {
+    cflags = [ "-Wno-unused-local-typedef" ]
+  }
+}
+
+source_set("gflags") {
+  sources = [
+    "src/gflags.cc",
+    "src/gflags_completions.cc",
+    "src/gflags_reporting.cc",
+  ]
+  if (is_win) {
+    sources += [ "src/windows/port.cc" ]
+
+    cflags = [
+      "/wd4005",  # WIN32_LEAN_AND_MEAN.
+      "/wd4267",  # Conversion from size_t to "type".
+    ]
+  }
+
+  include_dirs = [ "$gflags_gen_arch_root/include/private" ]  # For config.h
+
+  public_configs = [ ":gflags_config" ]
+
+  configs -= [ "//build/config/compiler:chromium_code" ]
+  configs += [ "//build/config/compiler:no_chromium_code" ]
+
+  if (is_clang) {
+    # TODO(andrew): Look into fixing this warning upstream:
+    # http://code.google.com/p/webrtc/issues/detail?id=760
+    configs -= [ "//build/config/clang:extra_warnings" ]
+  }
+}
+
diff --git a/libs/libyuv/third_party/gflags/LICENSE b/libs/libyuv/third_party/gflags/LICENSE
new file mode 100644
index 0000000000..d15b0c2413
--- /dev/null
+++ b/libs/libyuv/third_party/gflags/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2006, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/libs/libyuv/third_party/gflags/README.libyuv b/libs/libyuv/third_party/gflags/README.libyuv
new file mode 100644
index 0000000000..63d560c646
--- /dev/null
+++ b/libs/libyuv/third_party/gflags/README.libyuv
@@ -0,0 +1,28 @@
+URL: http://code.google.com/p/gflags/
+Version: 2.0
+License: New BSD
+License File: LICENSE
+
+Description:
+The gflags package contains a library that implements commandline
+flags processing. As such it's a replacement for getopt(). It has
+increased flexibility, including built-in support for C++ types like
+string, and the ability to define flags in the source file in which
+they're used.
+
+Local Modifications: None
+
+
+How to update platform configuration files:
+The gen/ directory contains pre-generated configuration header files.
+Historically, all operating systems and architectures have generated
+similar configurations except for Windows. This is why there's only
+posix and win directories below gen/.
+When rolling gflags to a newer version, it's a good idea to check if
+new configuration files need to be generated as well.
+Do this by running ./configure in the newly checked out version of
+gflags. Then diff the generated files with the ones below gen/.
+If you notice a diff, update the files with the updated ones.
+If you suspect platform-dependent changes other than Windows, you'll
+have to checkout gflags on the other platforms as well and run
+./configure there too.
diff --git a/libs/libyuv/third_party/gflags/gen/posix/include/gflags/gflags.h b/libs/libyuv/third_party/gflags/gen/posix/include/gflags/gflags.h
new file mode 100644
index 0000000000..5d07b30b90
--- /dev/null
+++ b/libs/libyuv/third_party/gflags/gen/posix/include/gflags/gflags.h
@@ -0,0 +1,592 @@
+// Copyright (c) 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Ray Sidney
+// Revamped and reorganized by Craig Silverstein
+//
+// This is the file that should be included by any file which declares
+// or defines a command line flag or wants to parse command line flags
+// or print a program usage message (which will include information about
+// flags).  Executive summary, in the form of an example foo.cc file:
+//
+//    #include "foo.h"         // foo.h has a line "DECLARE_int32(start);"
+//    #include "validators.h"  // hypothetical file defining ValidateIsFile()
+//
+//    DEFINE_int32(end, 1000, "The last record to read");
+//
+//    DEFINE_string(filename, "my_file.txt", "The file to read");
+//    // Crash if the specified file does not exist.
+//    static bool dummy = RegisterFlagValidator(&FLAGS_filename,
+//                                              &ValidateIsFile);
+//
+//    DECLARE_bool(verbose); // some other file has a DEFINE_bool(verbose, ...)
+//
+//    void MyFunc() {
+//      if (FLAGS_verbose) printf("Records %d-%d\n", FLAGS_start, FLAGS_end);
+//    }
+//
+// Then, at the command-line:
+//    ./foo --noverbose --start=5 --end=100
+//
+// For more details, see
+//    doc/gflags.html
+//
+// --- A note about thread-safety:
+//
+// We describe many functions in this routine as being thread-hostile,
+// thread-compatible, or thread-safe.  Here are the meanings we use:
+//
+// thread-safe: it is safe for multiple threads to call this routine
+//   (or, when referring to a class, methods of this class)
+//   concurrently.
+// thread-hostile: it is not safe for multiple threads to call this
+//   routine (or methods of this class) concurrently.  In gflags,
+//   most thread-hostile routines are intended to be called early in,
+//   or even before, main() -- that is, before threads are spawned.
+// thread-compatible: it is safe for multiple threads to read from
+//   this variable (when applied to variables), or to call const
+//   methods of this class (when applied to classes), as long as no
+//   other thread is writing to the variable or calling non-const
+//   methods of this class.
+
+#ifndef GOOGLE_GFLAGS_H_
+#define GOOGLE_GFLAGS_H_
+
+#include <string>
+#include <vector>
+
+// We care a lot about number of bits things take up.  Unfortunately,
+// systems define their bit-specific ints in a lot of different ways.
+// We use our own way, and have a typedef to get there.
+// Note: these commands below may look like "#if 1" or "#if 0", but
+// that's because they were constructed that way at ./configure time.
+// Look at gflags.h.in to see how they're calculated (based on your config).
+#if 1
+#include <stdint.h>             // the normal place uint16_t is defined
+#endif
+#if 1
+#include <sys/types.h>          // the normal place u_int16_t is defined
+#endif
+#if 1
+#include <inttypes.h>           // a third place for uint16_t or u_int16_t
+#endif
+
+namespace google {
+
+#if 1      // the C99 format
+typedef int32_t int32;
+typedef uint32_t uint32;
+typedef int64_t int64;
+typedef uint64_t uint64;
+#elif 1   // the BSD format
+typedef int32_t int32;
+typedef u_int32_t uint32;
+typedef int64_t int64;
+typedef u_int64_t uint64;
+#elif 0     // the windows (vc7) format
+typedef __int32 int32;
+typedef unsigned __int32 uint32;
+typedef __int64 int64;
+typedef unsigned __int64 uint64;
+#else
+#error Do not know how to define a 32-bit integer quantity on your system
+#endif
+
+// TODO(kjellander): update generated .h's for new gflags.
+// https://code.google.com/p/webrtc/issues/detail?id=2251
+extern const char* VersionString();
+extern void SetVersionString(const std::string& version);
+
+// --------------------------------------------------------------------
+// To actually define a flag in a file, use DEFINE_bool,
+// DEFINE_string, etc. at the bottom of this file.  You may also find
+// it useful to register a validator with the flag.  This ensures that
+// when the flag is parsed from the commandline, or is later set via
+// SetCommandLineOption, we call the validation function. It is _not_
+// called when you assign the value to the flag directly using the = operator.
+//
+// The validation function should return true if the flag value is valid, and
+// false otherwise. If the function returns false for the new setting of the
+// flag, the flag will retain its current value. If it returns false for the
+// default value, ParseCommandLineFlags() will die.
+//
+// This function is safe to call at global construct time (as in the
+// example below).
+//
+// Example use:
+//    static bool ValidatePort(const char* flagname, int32 value) {
+//       if (value > 0 && value < 32768)   // value is ok
+//         return true;
+//       printf("Invalid value for --%s: %d\n", flagname, (int)value);
+//       return false;
+//    }
+//    DEFINE_int32(port, 0, "What port to listen on");
+//    static bool dummy = RegisterFlagValidator(&FLAGS_port, &ValidatePort);
+
+// Returns true if successfully registered, false if not (because the
+// first argument doesn't point to a command-line flag, or because a
+// validator is already registered for this flag).
+bool RegisterFlagValidator(const bool* flag,
+                           bool (*validate_fn)(const char*, bool));
+bool RegisterFlagValidator(const int32* flag,
+                           bool (*validate_fn)(const char*, int32));
+bool RegisterFlagValidator(const int64* flag,
+                           bool (*validate_fn)(const char*, int64));
+bool RegisterFlagValidator(const uint64* flag,
+                           bool (*validate_fn)(const char*, uint64));
+bool RegisterFlagValidator(const double* flag,
+                           bool (*validate_fn)(const char*, double));
+bool RegisterFlagValidator(const std::string* flag,
+                           bool (*validate_fn)(const char*, const std::string&));
+
+
+// --------------------------------------------------------------------
+// These methods are the best way to get access to info about the
+// list of commandline flags.  Note that these routines are pretty slow.
+//   GetAllFlags: mostly-complete info about the list, sorted by file.
+//   ShowUsageWithFlags: pretty-prints the list to stdout (what --help does)
+//   ShowUsageWithFlagsRestrict: limit to filenames with restrict as a substr
+//
+// In addition to accessing flags, you can also access argv[0] (the program
+// name) and argv (the entire commandline), which we sock away a copy of.
+// These variables are static, so you should only set them once.
+
+struct CommandLineFlagInfo {
+  std::string name;           // the name of the flag
+  std::string type;           // the type of the flag: int32, etc
+  std::string description;    // the "help text" associated with the flag
+  std::string current_value;  // the current value, as a string
+  std::string default_value;  // the default value, as a string
+  std::string filename;       // 'cleaned' version of filename holding the flag
+  bool has_validator_fn;      // true if RegisterFlagValidator called on flag
+  bool is_default;            // true if the flag has the default value and
+                              // has not been set explicitly from the cmdline
+                              // or via SetCommandLineOption
+  const void* flag_ptr;       // presumably the flag's storage; TODO confirm
+
+};
+
+// Using this inside of a validator is a recipe for a deadlock.
+// TODO(wojtekm) Fix locking when validators are running, to make it safe to
+// call validators during ParseAllFlags.
+// Also make sure then to uncomment the corresponding unit test in
+// commandlineflags_unittest.sh
+extern void GetAllFlags(std::vector<CommandLineFlagInfo>* OUTPUT);
+// These two are actually defined in commandlineflags_reporting.cc.
+extern void ShowUsageWithFlags(const char *argv0);  // what --help does
+extern void ShowUsageWithFlagsRestrict(const char *argv0, const char *restrict);
+
+// Create a descriptive string for a flag.
+// Goes to some trouble to make pretty line breaks.
+extern std::string DescribeOneFlag(const CommandLineFlagInfo& flag);
+
+// Thread-hostile; meant to be called before any threads are spawned.
+extern void SetArgv(int argc, const char** argv);
+// The following functions are thread-safe as long as SetArgv() is
+// only called before any threads start.
+extern const std::vector<std::string>& GetArgvs();  // all of argv as a vector
+extern const char* GetArgv();                // all of argv as a string
+extern const char* GetArgv0();               // only argv0
+extern uint32 GetArgvSum();                  // simple checksum of argv
+extern const char* ProgramInvocationName();  // argv0, or "UNKNOWN" if not set
+extern const char* ProgramInvocationShortName();   // basename(argv0)
+// ProgramUsage() is thread-safe as long as SetUsageMessage() is only
+// called before any threads start.
+extern const char* ProgramUsage();           // string set by SetUsageMessage()
+
+
+// --------------------------------------------------------------------
+// Normally you access commandline flags by just saying "if (FLAGS_foo)"
+// or whatever, and set them by calling "FLAGS_foo = bar" (or, more
+// commonly, via the DEFINE_foo macro).  But if you need a bit more
+// control, we have programmatic ways to get/set the flags as well.
+// These programmatic ways to access flags are thread-safe, but direct
+// access is only thread-compatible.
+
+// Return true iff the flagname was found.
+// OUTPUT is set to the flag's value, or unchanged if we return false.
+extern bool GetCommandLineOption(const char* name, std::string* OUTPUT);
+
+// Return true iff the flagname was found. OUTPUT is set to the flag's
+// CommandLineFlagInfo or unchanged if we return false.
+extern bool GetCommandLineFlagInfo(const char* name,
+                                   CommandLineFlagInfo* OUTPUT);
+
+// Return the CommandLineFlagInfo of the flagname.  exit() if name not found.
+// Example usage, to check if a flag's value is currently the default value:
+//   if (GetCommandLineFlagInfoOrDie("foo").is_default) ...
+extern CommandLineFlagInfo GetCommandLineFlagInfoOrDie(const char* name);
+
+enum FlagSettingMode {
+  // update the flag's value (can call this multiple times).
+  SET_FLAGS_VALUE,
+  // update the flag's value, but *only if* it has not yet been updated
+  // with SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef".
+  SET_FLAG_IF_DEFAULT,
+  // set the flag's default value to this.  If the flag has not been updated
+  // yet (via SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef")
+  // change the flag's current value to the new default value as well.
+  SET_FLAGS_DEFAULT
+};
+
+// Set a particular flag ("command line option").  Returns a string
+// describing the new value that the option has been set to.  The
+// return value API is not well-specified, so basically just depend on
+// it to be empty if the setting failed for some reason -- the name is
+// not a valid flag name, or the value is not a valid value -- and
+// non-empty else.
+
+// SetCommandLineOption uses set_mode == SET_FLAGS_VALUE (the common case)
+extern std::string SetCommandLineOption(const char* name, const char* value);
+extern std::string SetCommandLineOptionWithMode(const char* name, const char* value,
+                                                FlagSettingMode set_mode);
+
+
+// --------------------------------------------------------------------
+// Saves the states (value, default value, whether the user has set
+// the flag, registered validators, etc) of all flags, and restores
+// them when the FlagSaver is destroyed.  This is very useful in
+// tests, say, when you want to let your tests change the flags, but
+// make sure that they get reverted to the original states when your
+// test is complete.
+//
+// Example usage:
+//   void TestFoo() {
+//     FlagSaver s1;
+//     FLAG_foo = false;
+//     FLAG_bar = "some value";
+//
+//     // test happens here.  You can return at any time
+//     // without worrying about restoring the FLAG values.
+//   }
+//
+// Note: This class is marked with __attribute__((unused)) because all the
+// work is done in the constructor and destructor, so in the standard
+// usage example above, the compiler would complain that it's an
+// unused variable.
+//
+// This class is thread-safe.
+
+class FlagSaver {
+ public:
+  FlagSaver();
+  ~FlagSaver();
+
+ private:
+  class FlagSaverImpl* impl_;   // we use pimpl here to keep API steady
+
+  FlagSaver(const FlagSaver&);  // no copying!
+  void operator=(const FlagSaver&);
+} __attribute__ ((unused));
+
+// --------------------------------------------------------------------
+// Some deprecated or hopefully-soon-to-be-deprecated functions.
+
+// This is often used for logging.  TODO(csilvers): figure out a better way
+extern std::string CommandlineFlagsIntoString();
+// Usually where this is used, a FlagSaver should be used instead.
+extern bool ReadFlagsFromString(const std::string& flagfilecontents,
+                                const char* prog_name,
+                                bool errors_are_fatal); // uses SET_FLAGS_VALUE
+
+// These let you manually implement --flagfile functionality.
+// DEPRECATED.
+extern bool AppendFlagsIntoFile(const std::string& filename, const char* prog_name);
+extern bool SaveCommandFlags();  // actually defined in google.cc !
+extern bool ReadFromFlagsFile(const std::string& filename, const char* prog_name,
+                              bool errors_are_fatal);   // uses SET_FLAGS_VALUE
+
+
+// --------------------------------------------------------------------
+// Useful routines for initializing flags from the environment.
+// In each case, if 'varname' does not exist in the environment
+// return defval.  If 'varname' does exist but is not valid
+// (e.g., not a number for an int32 flag), abort with an error.
+// Otherwise, return the value.  NOTE: for booleans, for true use
+// 't' or 'T' or 'true' or '1', for false 'f' or 'F' or 'false' or '0'.
+
+extern bool BoolFromEnv(const char *varname, bool defval);
+extern int32 Int32FromEnv(const char *varname, int32 defval);
+extern int64 Int64FromEnv(const char *varname, int64 defval);
+extern uint64 Uint64FromEnv(const char *varname, uint64 defval);
+extern double DoubleFromEnv(const char *varname, double defval);
+extern const char *StringFromEnv(const char *varname, const char *defval);
+
+
+// --------------------------------------------------------------------
+// The next two functions parse commandlineflags from main():
+
+// Set the "usage" message for this program.  For example:
+//   string usage("This program does nothing.  Sample usage:\n");
+//   usage += argv[0] + " <uselessarg1> <uselessarg2>";
+//   SetUsageMessage(usage);
+// Do not include commandline flags in the usage: we do that for you!
+// Thread-hostile; meant to be called before any threads are spawned.
+extern void SetUsageMessage(const std::string& usage);
+
+// Looks for flags in argv and parses them.  Rearranges argv to put
+// flags first, or removes them entirely if remove_flags is true.
+// If a flag is defined more than once in the command line or flag
+// file, the last definition is used.  Returns the index (into argv)
+// of the first non-flag argument.
+// See top-of-file for more details on this function.
+#ifndef SWIG   // In swig, use ParseCommandLineFlagsScript() instead.
+extern uint32 ParseCommandLineFlags(int *argc, char*** argv,
+                                    bool remove_flags);
+#endif
+
+
+// Calls to ParseCommandLineNonHelpFlags and then to
+// HandleCommandLineHelpFlags can be used instead of a call to
+// ParseCommandLineFlags during initialization, in order to allow for
+// changing default values for some FLAGS (via
+// e.g. SetCommandLineOptionWithMode calls) between the time of
+// command line parsing and the time of dumping help information for
+// the flags as a result of command line parsing.  If a flag is
+// defined more than once in the command line or flag file, the last
+// definition is used.  Returns the index (into argv) of the first
+// non-flag argument.  (If remove_flags is true, will always return 1.)
+extern uint32 ParseCommandLineNonHelpFlags(int *argc, char*** argv,
+                                           bool remove_flags);
+// This is actually defined in commandlineflags_reporting.cc.
+// This function is misnamed (it also handles --version, etc.), but
+// it's too late to change that now. :-(
+extern void HandleCommandLineHelpFlags();   // in commandlineflags_reporting.cc
+
+// Allow command line reparsing.  Disables the error normally
+// generated when an unknown flag is found, since it may be found in a
+// later parse.  Thread-hostile; meant to be called before any threads
+// are spawned.
+extern void AllowCommandLineReparsing();
+
+// Reparse the flags that have not yet been recognized.  Only flags
+// registered since the last parse will be recognized.  Any flag value
+// must be provided as part of the argument using "=", not as a
+// separate command line argument that follows the flag argument.
+// Intended for handling flags from dynamically loaded libraries,
+// since their flags are not registered until they are loaded.
+// NOTE: as declared here this function returns void and takes no
+// remove_flags argument, so no non-flag argument index is reported.
+extern void ReparseCommandLineNonHelpFlags();
+
+// Clean up memory allocated by flags.  This is only needed to reduce
+// the quantity of "potentially leaked" reports emitted by memory
+// debugging tools such as valgrind.  It is not required for normal
+// operation, or for the perftools heap-checker.  It must only be called
+// when the process is about to exit, and all threads that might
+// access flags are quiescent.  Referencing flags after this is called
+// will have unexpected consequences.  This is not safe to run when
+// multiple threads might be running: the function is thread-hostile.
+extern void ShutDownCommandLineFlags();
+
+
+// --------------------------------------------------------------------
+// Now come the command line flag declaration/definition macros that
+// will actually be used.  They're kind of hairy.  A major reason
+// for this is initialization: we want people to be able to access
+// variables in global constructors and have that not crash, even if
+// their global constructor runs before the global constructor here.
+// (Obviously, we can't guarantee the flags will have the correct
+// default value in that case, but at least accessing them is safe.)
+// The only way to do that is have flags point to a static buffer.
+// So we make one, using a union to ensure proper alignment, and
+// then use placement-new to actually set up the flag with the
+// correct default value.  In the same vein, we have to worry about
+// flag access in global destructors, so FlagRegisterer has to be
+// careful never to destroy the flag-values it constructs.
+//
+// Note that when we define a flag variable FLAGS_<name>, we also
+// preemptively define a junk variable, FLAGS_no<name>.  This is to
+// cause a link-time error if someone tries to define 2 flags with
+// names like "logging" and "nologging".  We do this because a bool
+// flag FLAG can be set from the command line to true with a "-FLAG"
+// argument, and to false with a "-noFLAG" argument, and so this can
+// potentially avert confusion.
+//
+// We also put flags into their own namespace.  It is purposefully
+// named in an opaque way that people should have trouble typing
+// directly.  The idea is that DEFINE puts the flag in the weird
+// namespace, and DECLARE imports the flag from there into the current
+// namespace.  The net result is to force people to use DECLARE to get
+// access to a flag, rather than saying "extern bool FLAGS_whatever;"
+// or some such instead.  We want this so we can put extra
+// functionality (like sanity-checking) in DECLARE if we want, and
+// make sure it is picked up everywhere.
+//
+// We also put the type of the variable in the namespace, so that
+// people can't DECLARE_int32 something that they DEFINE_bool'd
+// elsewhere.
+
+class FlagRegisterer {  // instantiated as a static object by the DEFINE_* macros below
+ public:
+  FlagRegisterer(const char* name, const char* type,  // flag name and type, both as strings
+                 const char* help, const char* filename,  // help text; defining file (macros pass __FILE__)
+                 void* current_storage, void* defvalue_storage);  // current-value and default-value storage
+};
+
+extern bool FlagsTypeWarn(const char *name);
+
+// If your application #defines STRIP_FLAG_HELP to a non-zero value
+// before #including this file, we remove the help message from the
+// binary file. This can reduce the size of the resulting binary
+// somewhat, and may also be useful for security reasons.
+
+extern const char kStrippedFlagHelp[];
+
+}
+
+#ifndef SWIG  // In swig, ignore the main flag declarations
+
+#if defined(STRIP_FLAG_HELP) && STRIP_FLAG_HELP > 0
+// Need this construct to avoid the 'defined but not used' warning.
+#define MAYBE_STRIPPED_HELP(txt) (false ? (txt) : ::google::kStrippedFlagHelp)
+#else
+#define MAYBE_STRIPPED_HELP(txt) txt
+#endif
+
+// Each command-line flag has two variables associated with it: one
+// with the current value, and one with the default value.  However,
+// we have a third variable, which is where value is assigned; it's a
+// constant.  This guarantees that FLAGS_##name is initialized at
+// static initialization time (e.g. before program-start) rather
+// than global construction time (which is after program-start but
+// before main), at least when 'value' is a compile-time constant.  We
+// use a small trick for the "default value" variable, and call it
+// FLAGS_no<name>.  This serves the second purpose of assuring a
+// compile error if someone tries to define a flag named no<name>
+// which is illegal (--foo and --nofoo both affect the "foo" flag).
+#define DEFINE_VARIABLE(type, shorttype, name, value, help) \
+  namespace fL##shorttype {                                     \
+    static const type FLAGS_nono##name = value;                 \
+    type FLAGS_##name = FLAGS_nono##name;                       \
+    type FLAGS_no##name = FLAGS_nono##name;                     \
+    static ::google::FlagRegisterer o_##name(      \
+      #name, #type, MAYBE_STRIPPED_HELP(help), __FILE__,        \
+      &FLAGS_##name, &FLAGS_no##name);                          \
+  }                                                             \
+  using fL##shorttype::FLAGS_##name
+
+#define DECLARE_VARIABLE(type, shorttype, name) \
+  namespace fL##shorttype {                     \
+    extern type FLAGS_##name;                   \
+  }                                             \
+  using fL##shorttype::FLAGS_##name  // pulls FLAGS_<name> from fL<shorttype> into the invoking scope
+
+// For DEFINE_bool, we want to do the extra check that the passed-in
+// value is actually a bool, and not a string or something that can be
+// coerced to a bool.  These declarations (no definition needed!) will
+// help us do that, and never evaluate From, which is important.
+// We'll use 'sizeof(IsBoolFlag(val))' to distinguish. This code requires
+// that the compiler have different sizes for bool & double. Since
+// this is not guaranteed by the standard, we check it with a
+// compile-time assert (msg[-1] will give a compile-time error).
+namespace fLB {
+struct CompileAssert {};
+typedef CompileAssert expected_sizeof_double_neq_sizeof_bool[
+                      (sizeof(double) != sizeof(bool)) ? 1 : -1];
+template<typename From> double IsBoolFlag(const From& from);  // selected for non-bool arguments (double-sized result)
+bool IsBoolFlag(bool from);  // selected when the argument really is a bool
+}  // namespace fLB
+
+#define DECLARE_bool(name)          DECLARE_VARIABLE(bool, B, name)
+#define DEFINE_bool(name, val, txt)                                       \
+  namespace fLB {                                                         \
+    typedef ::fLB::CompileAssert FLAG_##name##_value_is_not_a_bool[       \
+            (sizeof(::fLB::IsBoolFlag(val)) != sizeof(double)) ? 1 : -1]; \
+  }                                                                       \
+  DEFINE_VARIABLE(bool, B, name, val, txt)
+
+#define DECLARE_int32(name)         DECLARE_VARIABLE(::google::int32, I, name)  // flag lives in namespace fLI
+#define DEFINE_int32(name,val,txt)  DEFINE_VARIABLE(::google::int32, I, name, val, txt)
+
+#define DECLARE_int64(name)         DECLARE_VARIABLE(::google::int64, I64, name)  // flag lives in namespace fLI64
+#define DEFINE_int64(name,val,txt)  DEFINE_VARIABLE(::google::int64, I64, name, val, txt)
+
+#define DECLARE_uint64(name)        DECLARE_VARIABLE(::google::uint64, U64, name)  // flag lives in namespace fLU64
+#define DEFINE_uint64(name,val,txt) DEFINE_VARIABLE(::google::uint64, U64, name, val, txt)
+
+#define DECLARE_double(name)          DECLARE_VARIABLE(double, D, name)  // flag lives in namespace fLD
+#define DEFINE_double(name, val, txt) DEFINE_VARIABLE(double, D, name, val, txt)
+
+// Strings are trickier, because they're not a POD, so we can't
+// construct them at static-initialization time (instead they get
+// constructed at global-constructor time, which is much later).  To
+// try to avoid crashes in that case, we use a char buffer to store
+// the string, which we can static-initialize, and then placement-new
+// into it later.  It's not perfect, but it's the best we can do.
+
+namespace fLS {
+// The meaning of "string" might be different between now and when the
+// macros below get invoked (e.g., if someone is experimenting with
+// other string implementations that get defined after this file is
+// included).  Save the current meaning now and use it in the macros.
+typedef std::string clstring;
+
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           const char *value) {
+  return new(stringspot) clstring(value);  // placement-new into the caller-supplied buffer
+}
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           const clstring &value) {
+  return new(stringspot) clstring(value);  // same, copying from an existing string
+}
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           int value);  // no body here; presumably makes DEFINE_string(name, 0, ...) fail to build -- confirm
+}  // namespace fLS
+
+#define DECLARE_string(name)  namespace fLS { extern ::fLS::clstring& FLAGS_##name; } \
+                              using fLS::FLAGS_##name  // FLAGS_<name> is a reference declared in fLS
+
+// We need to define a var named FLAGS_no##name so people don't define
+// --string and --nostring.  And we need a temporary place to put val
+// so we don't have to evaluate it twice.  Two great needs that go
+// great together!
+// The weird 'using' + 'extern' inside the fLS namespace is to work around
+// an unknown compiler bug/issue with gcc 4.2.1 on SUSE 10.  See
+//    http://code.google.com/p/google-gflags/issues/detail?id=20
+#define DEFINE_string(name, val, txt)                                       \
+  namespace fLS {                                                           \
+    using ::fLS::clstring;                                                  \
+    static union { void* align; char s[sizeof(clstring)]; } s_##name[2];    \
+    clstring* const FLAGS_no##name = ::fLS::                                \
+                                   dont_pass0toDEFINE_string(s_##name[0].s, \
+                                                             val);          \
+    static ::google::FlagRegisterer o_##name(                  \
+        #name, "string", MAYBE_STRIPPED_HELP(txt), __FILE__,                \
+        s_##name[0].s, new (s_##name[1].s) clstring(*FLAGS_no##name));      \
+    extern clstring& FLAGS_##name;                                          \
+    using fLS::FLAGS_##name;                                                \
+    clstring& FLAGS_##name = *FLAGS_no##name;                               \
+  }                                                                         \
+  using fLS::FLAGS_##name
+
+#endif  // SWIG
+
+#endif  // GOOGLE_GFLAGS_H_
diff --git a/libs/libyuv/third_party/gflags/gen/posix/include/gflags/gflags_completions.h b/libs/libyuv/third_party/gflags/gen/posix/include/gflags/gflags_completions.h
new file mode 100644
index 0000000000..9d9ce7a5f7
--- /dev/null
+++ b/libs/libyuv/third_party/gflags/gen/posix/include/gflags/gflags_completions.h
@@ -0,0 +1,121 @@
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ---
+// Author: Dave Nicponski
+//
+// Implement helpful bash-style command line flag completions
+//
+// ** Functional API:
+// HandleCommandLineCompletions() should be called early during
+// program startup, but after command line flag code has been
+// initialized, such as the beginning of HandleCommandLineHelpFlags().
+// It checks the value of the flag --tab_completion_word.  If this
+// flag is empty, nothing happens here.  If it contains a string,
+// however, then HandleCommandLineCompletions() will hijack the
+// process, attempting to identify the intention behind this
+// completion.  Regardless of the outcome of this deduction, the
+// process will be terminated, similar to --helpshort flag
+// handling.
+//
+// ** Overview of Bash completions:
+// Bash can be told to programmatically determine completions for the
+// current 'cursor word'.  It does this by (in this case) invoking a
+// command with some additional arguments identifying the command
+// being executed, the word being completed, and the previous word
+// (if any).  Bash then expects a sequence of output lines to be
+// printed to stdout.  If these lines all contain a common prefix
+// longer than the cursor word, bash will replace the cursor word
+// with that common prefix, and display nothing.  If there isn't such
+// a common prefix, bash will display the lines in pages using 'more'.
+//
+// ** Strategy taken for command line completions:
+// If we can deduce either the exact flag intended, or a common flag
+// prefix, we'll output exactly that.  Otherwise, if information
+// must be displayed to the user, we'll take the opportunity to add
+// some helpful information beyond just the flag name (specifically,
+// we'll include the default flag value and as much of the flag's
+// description as can fit on a single terminal line width, as specified
+// by the flag --tab_completion_columns).  Furthermore, we'll try to
+// make bash order the output such that the most useful or relevant
+// flags are the most likely to be shown at the top.
+//
+// ** Additional features:
+// To assist in finding that one really useful flag, substring matching
+// was implemented.  Before pressing a <TAB> to get completion for the
+// current word, you can append one or more '?' to the flag to do
+// substring matching.  Here are the semantics:
+//   --foo<TAB>     Show me all flags with names prefixed by 'foo'
+//   --foo?<TAB>    Show me all flags with 'foo' somewhere in the name
+//   --foo??<TAB>   Same as prior case, but also search in module
+//                  definition path for 'foo'
+//   --foo???<TAB>  Same as prior case, but also search in flag
+//                  descriptions for 'foo'
+// Finally, we'll trim the output to a relatively small number of
+// flags to keep bash quiet about the verbosity of output.  If one
+// really wanted to see all possible matches, appending a '+' to the
+// search word will force the exhaustive list of matches to be printed.
+//
+// ** How to have bash accept completions from a binary:
+// Bash requires that it be informed about each command that programmatic
+// completion should be enabled for.  Example addition to a .bashrc
+// file would be (your path to gflags_completions.sh file may differ):
+
+/*
+$ complete -o bashdefault -o default -o nospace -C                        \
+ '/usr/local/bin/gflags_completions.sh --tab_completion_columns $COLUMNS' \
+  time  env  binary_name  another_binary  [...]
+*/
+
+// This would allow the following to work:
+//   $ /path/to/binary_name --vmodule<TAB>
+// Or:
+//   $ ./bin/path/another_binary --gfs_u<TAB>
+// (etc)
+//
+// Sadly, it appears that bash gives no easy way to force this behavior for
+// all commands.  That's where the "time" and "env" in the above example come in.
+// If you haven't specifically added a command to the list of completion
+// supported commands, you can still get completions by prefixing the
+// entire command with "env".
+//   $ env /some/brand/new/binary --vmod<TAB>
+// Assuming that "binary" is a newly compiled binary, this should still
+// produce the expected completion output.
+
+
+#ifndef GOOGLE_GFLAGS_COMPLETIONS_H_
+#define GOOGLE_GFLAGS_COMPLETIONS_H_
+
+namespace google {
+
+void HandleCommandLineCompletions(void);  // behavior is described under "Functional API" at the top of this file
+
+}  // namespace google
+
+#endif  // GOOGLE_GFLAGS_COMPLETIONS_H_
diff --git a/libs/libyuv/third_party/gflags/gen/posix/include/private/config.h b/libs/libyuv/third_party/gflags/gen/posix/include/private/config.h
new file mode 100644
index 0000000000..98d8e1abd1
--- /dev/null
+++ b/libs/libyuv/third_party/gflags/gen/posix/include/private/config.h
@@ -0,0 +1,110 @@
+/* src/config.h.  Generated from config.h.in by configure.  */
+/* src/config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* Always the empty-string on non-windows systems. On windows, should be
+   "__declspec(dllexport)". This way, when we compile the dll, we export our
+   functions/classes. It's safe to define this here because config.h is only
+   used internally, to compile the DLL, and every DLL source file #includes
+   "config.h" before anything else. */
+#define GFLAGS_DLL_DECL /**/
+
+/* Namespace for Google classes */
+#define GOOGLE_NAMESPACE ::google
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the <fnmatch.h> header file. */
+#define HAVE_FNMATCH_H 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* define if the compiler implements namespaces */
+#define HAVE_NAMESPACES 1
+
+/* Define if you have POSIX threads libraries and header files. */
+#define HAVE_PTHREAD 1
+
+/* Define to 1 if you have the `putenv' function. */
+#define HAVE_PUTENV 1
+
+/* Define to 1 if you have the `setenv' function. */
+#define HAVE_SETENV 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strtoll' function. */
+#define HAVE_STRTOLL 1
+
+/* Define to 1 if you have the `strtoq' function. */
+#define HAVE_STRTOQ 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* define if your compiler has __attribute__ */
+#define HAVE___ATTRIBUTE__ 1
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+   */
+#define LT_OBJDIR ".libs/"
+
+/* Name of package */
+#define PACKAGE "gflags"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "opensource@google.com"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "gflags"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "gflags 1.5"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "gflags"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL ""
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "1.5"
+
+/* Define to necessary symbol if this constant uses a non-standard name on
+   your system. */
+/* #undef PTHREAD_CREATE_JOINABLE */
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* the namespace where STL code like vector<> is defined */
+#define STL_NAMESPACE std
+
+/* Version number of package */
+#define VERSION "1.5"
+
+/* Stops putting the code inside the Google namespace */
+#define _END_GOOGLE_NAMESPACE_ }
+
+/* Puts following code inside the Google namespace */
+#define _START_GOOGLE_NAMESPACE_ namespace google {
diff --git a/libs/libyuv/third_party/gflags/gen/win/include/gflags/gflags.h b/libs/libyuv/third_party/gflags/gen/win/include/gflags/gflags.h
new file mode 100644
index 0000000000..6af969b353
--- /dev/null
+++ b/libs/libyuv/third_party/gflags/gen/win/include/gflags/gflags.h
@@ -0,0 +1,607 @@
+// Copyright (c) 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Ray Sidney
+// Revamped and reorganized by Craig Silverstein
+//
+// This is the file that should be included by any file which declares
+// or defines a command line flag or wants to parse command line flags
+// or print a program usage message (which will include information about
+// flags).  Executive summary, in the form of an example foo.cc file:
+//
+//    #include "foo.h"         // foo.h has a line "DECLARE_int32(start);"
+//    #include "validators.h"  // hypothetical file defining ValidateIsFile()
+//
+//    DEFINE_int32(end, 1000, "The last record to read");
+//
+//    DEFINE_string(filename, "my_file.txt", "The file to read");
+//    // Crash if the specified file does not exist.
+//    static bool dummy = RegisterFlagValidator(&FLAGS_filename,
+//                                              &ValidateIsFile);
+//
+//    DECLARE_bool(verbose); // some other file has a DEFINE_bool(verbose, ...)
+//
+//    void MyFunc() {
+//      if (FLAGS_verbose) printf("Records %d-%d\n", FLAGS_start, FLAGS_end);
+//    }
+//
+// Then, at the command-line:
+//    ./foo --noverbose --start=5 --end=100
+//
+// For more details, see
+//    doc/gflags.html
+//
+// --- A note about thread-safety:
+//
+// We describe many functions in this routine as being thread-hostile,
+// thread-compatible, or thread-safe.  Here are the meanings we use:
+//
+// thread-safe: it is safe for multiple threads to call this routine
+//   (or, when referring to a class, methods of this class)
+//   concurrently.
+// thread-hostile: it is not safe for multiple threads to call this
+//   routine (or methods of this class) concurrently.  In gflags,
+//   most thread-hostile routines are intended to be called early in,
+//   or even before, main() -- that is, before threads are spawned.
+// thread-compatible: it is safe for multiple threads to read from
+//   this variable (when applied to variables), or to call const
+//   methods of this class (when applied to classes), as long as no
+//   other thread is writing to the variable or calling non-const
+//   methods of this class.
+
+#ifndef GOOGLE_GFLAGS_H_
+#define GOOGLE_GFLAGS_H_
+
+#include <string>
+#include <vector>
+
+// We care a lot about number of bits things take up.  Unfortunately,
+// systems define their bit-specific ints in a lot of different ways.
+// We use our own way, and have a typedef to get there.
+// Note: these commands below may look like "#if 1" or "#if 0", but
+// that's because they were constructed that way at ./configure time.
+// Look at gflags.h.in to see how they're calculated (based on your config).
+#if 0
+#include <stdint.h>             // the normal place uint16_t is defined
+#endif
+#if 1
+#include <sys/types.h>          // the normal place u_int16_t is defined
+#endif
+#if 0
+#include <inttypes.h>           // a third place for uint16_t or u_int16_t
+#endif
+
+// Annoying stuff for windows -- makes sure clients can import these functions
+#if defined(_WIN32)
+# ifndef GFLAGS_DLL_DECL
+#   define GFLAGS_DLL_DECL  __declspec(dllimport)
+# endif
+# ifndef GFLAGS_DLL_DECLARE_FLAG
+#   define GFLAGS_DLL_DECLARE_FLAG  __declspec(dllimport)
+# endif
+# ifndef GFLAGS_DLL_DEFINE_FLAG
+#   define GFLAGS_DLL_DEFINE_FLAG   __declspec(dllexport)
+# endif
+#else  // !defined(_WIN32): the decorations expand to nothing
+# ifndef GFLAGS_DLL_DECL
+#   define GFLAGS_DLL_DECL
+# endif
+# ifndef GFLAGS_DLL_DECLARE_FLAG
+#   define GFLAGS_DLL_DECLARE_FLAG
+# endif
+# ifndef GFLAGS_DLL_DEFINE_FLAG
+#   define GFLAGS_DLL_DEFINE_FLAG
+# endif
+#endif  // defined(_WIN32)
+
+namespace google {
+
+#if 0      // the C99 format
+typedef int32_t int32;
+typedef uint32_t uint32;
+typedef int64_t int64;
+typedef uint64_t uint64;
+#elif 0   // the BSD format
+typedef int32_t int32;
+typedef u_int32_t uint32;
+typedef int64_t int64;
+typedef u_int64_t uint64;
+#elif 1     // the windows (vc7) format
+typedef __int32 int32;
+typedef unsigned __int32 uint32;
+typedef __int64 int64;
+typedef unsigned __int64 uint64;
+#else
+#error Do not know how to define a 32-bit integer quantity on your system
+#endif  // selection of the int32/uint32/int64/uint64 typedefs
+
+// TODO(kjellander): update generated .h's for new gflags.
+// https://code.google.com/p/webrtc/issues/detail?id=2251
+extern const char* VersionString();
+extern void SetVersionString(const std::string& version);
+
+// --------------------------------------------------------------------
+// To actually define a flag in a file, use DEFINE_bool,
+// DEFINE_string, etc. at the bottom of this file.  You may also find
+// it useful to register a validator with the flag.  This ensures that
+// when the flag is parsed from the commandline, or is later set via
+// SetCommandLineOption, we call the validation function. It is _not_
+// called when you assign the value to the flag directly using the = operator.
+//
+// The validation function should return true if the flag value is valid, and
+// false otherwise. If the function returns false for the new setting of the
+// flag, the flag will retain its current value. If it returns false for the
+// default value, ParseCommandLineFlags() will die.
+//
+// This function is safe to call at global construct time (as in the
+// example below).
+//
+// Example use:
+//    static bool ValidatePort(const char* flagname, int32 value) {
+//       if (value > 0 && value < 32768)   // value is ok
+//         return true;
+//       printf("Invalid value for --%s: %d\n", flagname, (int)value);
+//       return false;
+//    }
+//    DEFINE_int32(port, 0, "What port to listen on");
+//    static bool dummy = RegisterFlagValidator(&FLAGS_port, &ValidatePort);
+
+// Returns true if successfully registered, false if not (because the
+// first argument doesn't point to a command-line flag, or because a
+// validator is already registered for this flag).
+GFLAGS_DLL_DECL bool RegisterFlagValidator(const bool* flag,
+                           bool (*validate_fn)(const char*, bool));
+GFLAGS_DLL_DECL bool RegisterFlagValidator(const int32* flag,
+                           bool (*validate_fn)(const char*, int32));
+GFLAGS_DLL_DECL bool RegisterFlagValidator(const int64* flag,
+                           bool (*validate_fn)(const char*, int64));
+GFLAGS_DLL_DECL bool RegisterFlagValidator(const uint64* flag,
+                           bool (*validate_fn)(const char*, uint64));
+GFLAGS_DLL_DECL bool RegisterFlagValidator(const double* flag,
+                           bool (*validate_fn)(const char*, double));
+GFLAGS_DLL_DECL bool RegisterFlagValidator(const std::string* flag,
+                           bool (*validate_fn)(const char*, const std::string&));
+
+
+// --------------------------------------------------------------------
+// These methods are the best way to get access to info about the
+// list of commandline flags.  Note that these routines are pretty slow.
+//   GetAllFlags: mostly-complete info about the list, sorted by file.
+//   ShowUsageWithFlags: pretty-prints the list to stdout (what --help does)
+//   ShowUsageWithFlagsRestrict: limit to filenames with restrict as a substr
+//
+// In addition to accessing flags, you can also access argv[0] (the program
+// name) and argv (the entire commandline), which we sock away a copy of.
+// These variables are static, so you should only set them once.
+
+struct GFLAGS_DLL_DECL CommandLineFlagInfo {
+  std::string name;           // the name of the flag
+  std::string type;           // the type of the flag: int32, etc
+  std::string description;    // the "help text" associated with the flag
+  std::string current_value;  // the current value, as a string
+  std::string default_value;  // the default value, as a string
+  std::string filename;       // 'cleaned' version of filename holding the flag
+  bool has_validator_fn;      // true if RegisterFlagValidator called on flag
+  bool is_default;            // true if the flag has the default value and
+                              // has not been set explicitly from the cmdline
+                              // or via SetCommandLineOption
+  const void* flag_ptr;       // presumably the address of the flag's storage (e.g. &FLAGS_foo) -- confirm
+};
+
+// Using this inside of a validator is a recipe for a deadlock.
+// TODO(wojtekm) Fix locking when validators are running, to make it safe to
+// call validators during ParseAllFlags.
+// Also make sure then to uncomment the corresponding unit test in
+// commandlineflags_unittest.sh
+extern GFLAGS_DLL_DECL void GetAllFlags(std::vector<CommandLineFlagInfo>* OUTPUT);
+// These two are actually defined in commandlineflags_reporting.cc.
+extern GFLAGS_DLL_DECL void ShowUsageWithFlags(const char *argv0);  // what --help does
+extern GFLAGS_DLL_DECL void ShowUsageWithFlagsRestrict(const char *argv0, const char *restrict);
+
+// Create a descriptive string for a flag.
+// Goes to some trouble to make pretty line breaks.
+extern GFLAGS_DLL_DECL std::string DescribeOneFlag(const CommandLineFlagInfo& flag);
+
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetArgv(int argc, const char** argv);
+// The following functions are thread-safe as long as SetArgv() is
+// only called before any threads start.
+extern GFLAGS_DLL_DECL const std::vector<std::string>& GetArgvs();  // all of argv as a vector
+extern GFLAGS_DLL_DECL const char* GetArgv();               // all of argv as a string
+extern GFLAGS_DLL_DECL const char* GetArgv0();              // only argv0
+extern GFLAGS_DLL_DECL uint32 GetArgvSum();                 // simple checksum of argv
+extern GFLAGS_DLL_DECL const char* ProgramInvocationName(); // argv0, or "UNKNOWN" if not set
+extern GFLAGS_DLL_DECL const char* ProgramInvocationShortName();   // basename(argv0)
+// ProgramUsage() is thread-safe as long as SetUsageMessage() is only
+// called before any threads start.
+extern GFLAGS_DLL_DECL const char* ProgramUsage();          // string set by SetUsageMessage()
+
+
+// --------------------------------------------------------------------
+// Normally you access commandline flags by just saying "if (FLAGS_foo)"
+// or whatever, and set them by calling "FLAGS_foo = bar" (or, more
+// commonly, via the DEFINE_foo macro).  But if you need a bit more
+// control, we have programmatic ways to get/set the flags as well.
+// These programmatic ways to access flags are thread-safe, but direct
+// access is only thread-compatible.
+
+// Return true iff the flagname was found.
+// OUTPUT is set to the flag's value, or unchanged if we return false.
+extern GFLAGS_DLL_DECL bool GetCommandLineOption(const char* name, std::string* OUTPUT);
+
+// Return true iff the flagname was found. OUTPUT is set to the flag's
+// CommandLineFlagInfo or unchanged if we return false.
+extern GFLAGS_DLL_DECL bool GetCommandLineFlagInfo(const char* name,
+                                   CommandLineFlagInfo* OUTPUT);
+
+// Return the CommandLineFlagInfo of the flagname.  exit() if name not found.
+// Example usage, to check if a flag's value is currently the default value:
+//   if (GetCommandLineFlagInfoOrDie("foo").is_default) ...
+extern GFLAGS_DLL_DECL CommandLineFlagInfo GetCommandLineFlagInfoOrDie(const char* name);
+
+enum GFLAGS_DLL_DECL FlagSettingMode {
+  // update the flag's value (can call this multiple times).
+  SET_FLAGS_VALUE,
+  // update the flag's value, but *only if* it has not yet been updated
+  // with SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef".
+  SET_FLAG_IF_DEFAULT,
+  // set the flag's default value to this.  If the flag has not been updated
+  // yet (via SET_FLAGS_VALUE, SET_FLAG_IF_DEFAULT, or "FLAGS_xxx = nondef")
+  // change the flag's current value to the new default value as well.
+  SET_FLAGS_DEFAULT
+};
+
+// Set a particular flag ("command line option").  Returns a string
+// describing the new value that the option has been set to.  The
+// return value API is not well-specified, so basically just depend on
+// it to be empty if the setting failed for some reason -- the name is
+// not a valid flag name, or the value is not a valid value -- and
+// non-empty else.
+
+// SetCommandLineOption uses set_mode == SET_FLAGS_VALUE (the common case)
+extern GFLAGS_DLL_DECL std::string SetCommandLineOption(const char* name, const char* value);
+extern GFLAGS_DLL_DECL std::string SetCommandLineOptionWithMode(const char* name, const char* value,
+                                                FlagSettingMode set_mode);
+
+
+// --------------------------------------------------------------------
+// Saves the states (value, default value, whether the user has set
+// the flag, registered validators, etc) of all flags, and restores
+// them when the FlagSaver is destroyed.  This is very useful in
+// tests, say, when you want to let your tests change the flags, but
+// make sure that they get reverted to the original states when your
+// test is complete.
+//
+// Example usage:
+//   void TestFoo() {
+//     FlagSaver s1;
+//     FLAGS_foo = false;
+//     FLAGS_bar = "some value";
+//
+//     // test happens here.  You can return at any time
+//     // without worrying about restoring the FLAG values.
+//   }
+//
+// Note: All the work is done in the constructor and destructor, so in the
+// standard usage example above a compiler may complain that it's an
+// unused variable.  Some builds mark the class __attribute__((unused)) to
+// avoid that; the declaration in this header carries no such attribute.
+//
+// This class is thread-safe.
+
+class GFLAGS_DLL_DECL FlagSaver {
+ public:
+  FlagSaver();
+  ~FlagSaver();
+
+ private:
+  class FlagSaverImpl* impl_;   // we use pimpl here to keep API steady
+
+  FlagSaver(const FlagSaver&);  // no copying!
+  void operator=(const FlagSaver&);
+} ;
+
+// --------------------------------------------------------------------
+// Some deprecated or hopefully-soon-to-be-deprecated functions.
+
+// This is often used for logging.  TODO(csilvers): figure out a better way
+extern GFLAGS_DLL_DECL std::string CommandlineFlagsIntoString();
+// Usually where this is used, a FlagSaver should be used instead.
+extern GFLAGS_DLL_DECL bool ReadFlagsFromString(const std::string& flagfilecontents,
+                                const char* prog_name,
+                                bool errors_are_fatal); // uses SET_FLAGS_VALUE
+
+// These let you manually implement --flagfile functionality.
+// DEPRECATED.
+extern GFLAGS_DLL_DECL bool AppendFlagsIntoFile(const std::string& filename, const char* prog_name);
+extern GFLAGS_DLL_DECL bool SaveCommandFlags();  // actually defined in google.cc !
+extern GFLAGS_DLL_DECL bool ReadFromFlagsFile(const std::string& filename, const char* prog_name,
+                              bool errors_are_fatal);   // uses SET_FLAGS_VALUE
+
+
+// --------------------------------------------------------------------
+// Useful routines for initializing flags from the environment.
+// In each case, if 'varname' does not exist in the environment
+// return defval.  If 'varname' does exist but is not valid
+// (e.g., not a number for an int32 flag), abort with an error.
+// Otherwise, return the value.  NOTE: for booleans, for true use
+// 't' or 'T' or 'true' or '1', for false 'f' or 'F' or 'false' or '0'.
+
+extern GFLAGS_DLL_DECL bool BoolFromEnv(const char *varname, bool defval);
+extern GFLAGS_DLL_DECL int32 Int32FromEnv(const char *varname, int32 defval);
+extern GFLAGS_DLL_DECL int64 Int64FromEnv(const char *varname, int64 defval);
+extern GFLAGS_DLL_DECL uint64 Uint64FromEnv(const char *varname, uint64 defval);
+extern GFLAGS_DLL_DECL double DoubleFromEnv(const char *varname, double defval);
+extern GFLAGS_DLL_DECL const char *StringFromEnv(const char *varname, const char *defval);
+
+
+// --------------------------------------------------------------------
+// The next two functions parse commandlineflags from main():
+
+// Set the "usage" message for this program.  For example:
+//   std::string usage("This program does nothing.  Sample usage:\n");
+//   usage += std::string(argv[0]) + " <uselessarg1> <uselessarg2>";
+//   SetUsageMessage(usage);
+// Do not include commandline flags in the usage: we do that for you!
+// Thread-hostile; meant to be called before any threads are spawned.
+extern GFLAGS_DLL_DECL void SetUsageMessage(const std::string& usage);
+
+// Looks for flags in argv and parses them.  Rearranges argv to put
+// flags first, or removes them entirely if remove_flags is true.
+// If a flag is defined more than once in the command line or flag
+// file, the last definition is used.
+// See top-of-file for more details on this function.
+#ifndef SWIG   // In swig, use ParseCommandLineFlagsScript() instead.
+extern GFLAGS_DLL_DECL uint32 ParseCommandLineFlags(int *argc, char*** argv,
+                                    bool remove_flags);
+#endif
+
+
+// Calls to ParseCommandLineNonHelpFlags and then to
+// HandleCommandLineHelpFlags can be used instead of a call to
+// ParseCommandLineFlags during initialization, in order to allow for
+// changing default values for some FLAGS (via
+// e.g. SetCommandLineOptionWithMode calls) between the time of
+// command line parsing and the time of dumping help information for
+// the flags as a result of command line parsing.
+// If a flag is defined more than once in the command line or flag
+// file, the last definition is used.
+extern GFLAGS_DLL_DECL uint32 ParseCommandLineNonHelpFlags(int *argc, char*** argv,
+                                           bool remove_flags);
+// This is actually defined in commandlineflags_reporting.cc.
+// This function is misnamed (it also handles --version, etc.), but
+// it's too late to change that now. :-(
+extern GFLAGS_DLL_DECL void HandleCommandLineHelpFlags();   // in commandlineflags_reporting.cc
+
+// Allow command line reparsing.  Disables the error normally
+// generated when an unknown flag is found, since it may be found in a
+// later parse.  Thread-hostile; meant to be called before any threads
+// are spawned.
+extern GFLAGS_DLL_DECL void AllowCommandLineReparsing();
+
+// Reparse the flags that have not yet been recognized.
+// Only flags registered since the last parse will be recognized.
+// Any flag value must be provided as part of the argument using "=",
+// not as a separate command line argument that follows the flag argument.
+// Intended for handling flags from dynamically loaded libraries,
+// since their flags are not registered until they are loaded.
+extern GFLAGS_DLL_DECL void ReparseCommandLineNonHelpFlags();
+
+// Clean up memory allocated by flags.  This is only needed to reduce
+// the quantity of "potentially leaked" reports emitted by memory
+// debugging tools such as valgrind.  It is not required for normal
+// operation, or for the perftools heap-checker.  It must only be called
+// when the process is about to exit, and all threads that might
+// access flags are quiescent.  Referencing flags after this is called
+// will have unexpected consequences.  This is not safe to run when
+// multiple threads might be running: the function is thread-hostile.
+extern GFLAGS_DLL_DECL void ShutDownCommandLineFlags();
+
+
+// --------------------------------------------------------------------
+// Now come the command line flag declaration/definition macros that
+// will actually be used.  They're kind of hairy.  A major reason
+// for this is initialization: we want people to be able to access
+// variables in global constructors and have that not crash, even if
+// their global constructor runs before the global constructor here.
+// (Obviously, we can't guarantee the flags will have the correct
+// default value in that case, but at least accessing them is safe.)
+// The only way to do that is have flags point to a static buffer.
+// So we make one, using a union to ensure proper alignment, and
+// then use placement-new to actually set up the flag with the
+// correct default value.  In the same vein, we have to worry about
+// flag access in global destructors, so FlagRegisterer has to be
+// careful never to destroy the flag-values it constructs.
+//
+// Note that when we define a flag variable FLAGS_<name>, we also
+// preemptively define a junk variable, FLAGS_no<name>.  This is to
+// cause a link-time error if someone tries to define 2 flags with
+// names like "logging" and "nologging".  We do this because a bool
+// flag FLAG can be set from the command line to true with a "-FLAG"
+// argument, and to false with a "-noFLAG" argument, and so this can
+// potentially avert confusion.
+//
+// We also put flags into their own namespace.  It is purposefully
+// named in an opaque way that people should have trouble typing
+// directly.  The idea is that DEFINE puts the flag in the weird
+// namespace, and DECLARE imports the flag from there into the current
+// namespace.  The net result is to force people to use DECLARE to get
+// access to a flag, rather than saying "extern bool FLAGS_whatever;"
+// or some such instead.  We want this so we can put extra
+// functionality (like sanity-checking) in DECLARE if we want, and
+// make sure it is picked up everywhere.
+//
+// We also put the type of the variable in the namespace, so that
+// people can't DECLARE_int32 something that they DEFINE_bool'd
+// elsewhere.
+
+class GFLAGS_DLL_DECL FlagRegisterer {
+ public:
+  FlagRegisterer(const char* name, const char* type,
+                 const char* help, const char* filename,
+                 void* current_storage, void* defvalue_storage);
+};
+
+extern bool FlagsTypeWarn(const char *name);
+
+// If your application #defines STRIP_FLAG_HELP to a non-zero value
+// before #including this file, we remove the help message from the
+// binary file. This can reduce the size of the resulting binary
+// somewhat, and may also be useful for security reasons.
+
+extern const char kStrippedFlagHelp[];
+
+}
+
+#ifndef SWIG  // In swig, ignore the main flag declarations
+
+#if defined(STRIP_FLAG_HELP) && STRIP_FLAG_HELP > 0
+// Need this construct to avoid the 'defined but not used' warning.
+#define MAYBE_STRIPPED_HELP(txt) (false ? (txt) : kStrippedFlagHelp)
+#else
+#define MAYBE_STRIPPED_HELP(txt) txt
+#endif
+
+// Each command-line flag has two variables associated with it: one
+// with the current value, and one with the default value.  However,
+// we have a third variable, which is where value is assigned; it's a
+// constant.  This guarantees that FLAGS_##name is initialized at
+// static initialization time (e.g. before program-start) rather
+// than global construction time (which is after program-start but
+// before main), at least when 'value' is a compile-time constant.  We
+// use a small trick for the "default value" variable, and call it
+// FLAGS_no<name>.  This serves the second purpose of assuring a
+// compile error if someone tries to define a flag named no<name>
+// which is illegal (--foo and --nofoo both affect the "foo" flag).
+#define DEFINE_VARIABLE(type, shorttype, name, value, help) \
+  namespace fL##shorttype {                                     \
+    static const type FLAGS_nono##name = value;                 \
+    /* We always want to export defined variables, dll or no */ \
+    GFLAGS_DLL_DEFINE_FLAG type FLAGS_##name = FLAGS_nono##name; \
+    type FLAGS_no##name = FLAGS_nono##name;                     \
+    static ::google::FlagRegisterer o_##name(                   \
+      #name, #type, MAYBE_STRIPPED_HELP(help), __FILE__,        \
+      &FLAGS_##name, &FLAGS_no##name);                          \
+  }                                                             \
+  using fL##shorttype::FLAGS_##name
+
+#define DECLARE_VARIABLE(type, shorttype, name) \
+  namespace fL##shorttype {                     \
+    /* We always want to import declared variables, dll or no */ \
+    extern GFLAGS_DLL_DECLARE_FLAG type FLAGS_##name; \
+  }                                             \
+  using fL##shorttype::FLAGS_##name
+
+// For DEFINE_bool, we want to do the extra check that the passed-in
+// value is actually a bool, and not a string or something that can be
+// coerced to a bool.  These declarations (no definition needed!) will
+// help us do that, and never evaluate From, which is important.
+// We'll use 'sizeof(IsBoolFlag(val))' to distinguish. This code requires
+// that the compiler have different sizes for bool & double. Since
+// this is not guaranteed by the standard, we check it with a
+// compile-time assert (msg[-1] will give a compile-time error).
+namespace fLB {
+struct CompileAssert {};
+typedef CompileAssert expected_sizeof_double_neq_sizeof_bool[
+                      (sizeof(double) != sizeof(bool)) ? 1 : -1];
+template<typename From> GFLAGS_DLL_DECL double IsBoolFlag(const From& from);
+GFLAGS_DLL_DECL bool IsBoolFlag(bool from);
+}  // namespace fLB
+
+#define DECLARE_bool(name)          DECLARE_VARIABLE(bool, B, name)
+#define DEFINE_bool(name, val, txt)                                       \
+  namespace fLB {                                                         \
+    typedef ::fLB::CompileAssert FLAG_##name##_value_is_not_a_bool[       \
+            (sizeof(::fLB::IsBoolFlag(val)) != sizeof(double)) ? 1 : -1]; \
+  }                                                                       \
+  DEFINE_VARIABLE(bool, B, name, val, txt)
+
+#define DECLARE_int32(name)         DECLARE_VARIABLE(::google::int32, I, name)
+#define DEFINE_int32(name,val,txt)  DEFINE_VARIABLE(::google::int32, I, name, val, txt)
+
+#define DECLARE_int64(name)         DECLARE_VARIABLE(::google::int64, I64, name)
+#define DEFINE_int64(name,val,txt)  DEFINE_VARIABLE(::google::int64, I64, name, val, txt)
+
+#define DECLARE_uint64(name)        DECLARE_VARIABLE(::google::uint64, U64, name)
+#define DEFINE_uint64(name,val,txt) DEFINE_VARIABLE(::google::uint64, U64, name, val, txt)
+
+#define DECLARE_double(name)          DECLARE_VARIABLE(double, D, name)
+#define DEFINE_double(name, val, txt) DEFINE_VARIABLE(double, D, name, val, txt)
+
+// Strings are trickier, because they're not a POD, so we can't
+// construct them at static-initialization time (instead they get
+// constructed at global-constructor time, which is much later).  To
+// try to avoid crashes in that case, we use a char buffer to store
+// the string, which we can static-initialize, and then placement-new
+// into it later.  It's not perfect, but the best we can do.
+
+namespace fLS {
+// The meaning of "string" might be different between now and when the
+// macros below get invoked (e.g., if someone is experimenting with
+// other string implementations that get defined after this file is
+// included).  Save the current meaning now and use it in the macros.
+typedef std::string clstring;
+
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           const char *value) {
+  return new(stringspot) clstring(value);
+}
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           const clstring &value) {
+  return new(stringspot) clstring(value);
+}
+inline clstring* dont_pass0toDEFINE_string(char *stringspot,
+                                           int value);
+}  // namespace fLS
+
+#define DECLARE_string(name)  namespace fLS { extern GFLAGS_DLL_DECLARE_FLAG ::fLS::clstring& FLAGS_##name; } \
+                              using fLS::FLAGS_##name
+
+// We need to define a var named FLAGS_no##name so people don't define
+// --string and --nostring.  And we need a temporary place to put val
+// so we don't have to evaluate it twice.  Two great needs that go
+// great together!
+#define DEFINE_string(name, val, txt)                                       \
+  namespace fLS {                                                           \
+    using ::fLS::clstring;                                                  \
+    static union { void* align; char s[sizeof(clstring)]; } s_##name[2];    \
+    clstring* const FLAGS_no##name = ::fLS::                                \
+                                   dont_pass0toDEFINE_string(s_##name[0].s, \
+                                                             val);          \
+    static ::google::FlagRegisterer o_##name(                  \
+        #name, "string", MAYBE_STRIPPED_HELP(txt), __FILE__,                \
+        s_##name[0].s, new (s_##name[1].s) clstring(*FLAGS_no##name));      \
+    GFLAGS_DLL_DEFINE_FLAG clstring& FLAGS_##name = *FLAGS_no##name;        \
+  }                                                                         \
+  using fLS::FLAGS_##name
+
+#endif  // SWIG
+
+#endif  // GOOGLE_GFLAGS_H_
diff --git a/libs/libyuv/third_party/gflags/gen/win/include/gflags/gflags_completions.h b/libs/libyuv/third_party/gflags/gen/win/include/gflags/gflags_completions.h
new file mode 100644
index 0000000000..e97de5b3f6
--- /dev/null
+++ b/libs/libyuv/third_party/gflags/gen/win/include/gflags/gflags_completions.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ---
+// Author: Dave Nicponski
+//
+// Implement helpful bash-style command line flag completions
+//
+// ** Functional API:
+// HandleCommandLineCompletions() should be called early during
+// program startup, but after command line flag code has been
+// initialized, such as the beginning of HandleCommandLineHelpFlags().
+// It checks the value of the flag --tab_completion_word.  If this
+// flag is empty, nothing happens here.  If it contains a string,
+// however, then HandleCommandLineCompletions() will hijack the
+// process, attempting to identify the intention behind this
+// completion.  Regardless of the outcome of this deduction, the
+// process will be terminated, similar to --helpshort flag
+// handling.
+//
+// ** Overview of Bash completions:
+// Bash can be told to programmatically determine completions for the
+// current 'cursor word'.  It does this by (in this case) invoking a
+// command with some additional arguments identifying the command
+// being executed, the word being completed, and the previous word
+// (if any).  Bash then expects a sequence of output lines to be
+// printed to stdout.  If these lines all contain a common prefix
+// longer than the cursor word, bash will replace the cursor word
+// with that common prefix, and display nothing.  If there isn't such
+// a common prefix, bash will display the lines in pages using 'more'.
+//
+// ** Strategy taken for command line completions:
+// If we can deduce either the exact flag intended, or a common flag
+// prefix, we'll output exactly that.  Otherwise, if information
+// must be displayed to the user, we'll take the opportunity to add
+// some helpful information beyond just the flag name (specifically,
+// we'll include the default flag value and as much of the flag's
+// description as can fit on a single terminal line width, as specified
+// by the flag --tab_completion_columns).  Furthermore, we'll try to
+// make bash order the output such that the most useful or relevant
+// flags are the most likely to be shown at the top.
+//
+// ** Additional features:
+// To assist in finding that one really useful flag, substring matching
+// was implemented.  Before pressing a <TAB> to get completion for the
+// current word, you can append one or more '?' to the flag to do
+// substring matching.  Here are the semantics:
+//   --foo<TAB>     Show me all flags with names prefixed by 'foo'
+//   --foo?<TAB>    Show me all flags with 'foo' somewhere in the name
+//   --foo??<TAB>   Same as prior case, but also search in module
+//                  definition path for 'foo'
+//   --foo???<TAB>  Same as prior case, but also search in flag
+//                  descriptions for 'foo'
+// Finally, we'll trim the output to a relatively small number of
+// flags to keep bash quiet about the verbosity of output.  If one
+// really wanted to see all possible matches, appending a '+' to the
+// search word will force the exhaustive list of matches to be printed.
+//
+// ** How to have bash accept completions from a binary:
+// Bash requires that it be informed about each command that programmatic
+// completion should be enabled for.  Example addition to a .bashrc
+// file would be (your path to gflags_completions.sh file may differ):
+
+/*
+$ complete -o bashdefault -o default -o nospace -C                        \
+ '/usr/local/bin/gflags_completions.sh --tab_completion_columns $COLUMNS' \
+  time  env  binary_name  another_binary  [...]
+*/
+
+// This would allow the following to work:
+//   $ /path/to/binary_name --vmodule<TAB>
+// Or:
+//   $ ./bin/path/another_binary --gfs_u<TAB>
+// (etc)
+//
+// Sadly, it appears that bash gives no easy way to force this behavior for
+// all commands.  That's where the "time" and "env" in the above example come in.
+// If you haven't specifically added a command to the list of completion
+// supported commands, you can still get completions by prefixing the
+// entire command with "env".
+//   $ env /some/brand/new/binary --vmod<TAB>
+// Assuming that "binary" is a newly compiled binary, this should still
+// produce the expected completion output.
+
+
+#ifndef GOOGLE_GFLAGS_COMPLETIONS_H_
+#define GOOGLE_GFLAGS_COMPLETIONS_H_
+
+// Annoying stuff for windows -- makes sure clients can import these functions
+#ifndef GFLAGS_DLL_DECL
+# ifdef _WIN32
+#   define GFLAGS_DLL_DECL  __declspec(dllimport)
+# else
+#   define GFLAGS_DLL_DECL
+# endif
+#endif
+
+namespace google {
+
+GFLAGS_DLL_DECL void HandleCommandLineCompletions(void);
+
+}
+
+#endif  // GOOGLE_GFLAGS_COMPLETIONS_H_
diff --git a/libs/libyuv/third_party/gflags/gen/win/include/private/config.h b/libs/libyuv/third_party/gflags/gen/win/include/private/config.h
new file mode 100644
index 0000000000..dcca757e49
--- /dev/null
+++ b/libs/libyuv/third_party/gflags/gen/win/include/private/config.h
@@ -0,0 +1,139 @@
+/* src/config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* Sometimes we accidentally #include this config.h instead of the one
+   in .. -- this is particularly true for msys/mingw, which uses the
+   unix config.h but also runs code in the windows directory.
+   */
+#ifdef __MINGW32__
+#include "../config.h"
+#define GOOGLE_GFLAGS_WINDOWS_CONFIG_H_
+#endif
+
+#ifndef GOOGLE_GFLAGS_WINDOWS_CONFIG_H_
+#define GOOGLE_GFLAGS_WINDOWS_CONFIG_H_
+
+/* Always the empty-string on non-windows systems. On windows, should be
+   "__declspec(dllexport)". This way, when we compile the dll, we export our
+   functions/classes. It's safe to define this here because config.h is only
+   used internally, to compile the DLL, and every DLL source file #includes
+   "config.h" before anything else. */
+#ifndef GFLAGS_DLL_DECL
+# define GFLAGS_IS_A_DLL  1   /* not set if you're statically linking */
+# define GFLAGS_DLL_DECL  __declspec(dllexport)
+# define GFLAGS_DLL_DECL_FOR_UNITTESTS  __declspec(dllimport)
+#endif
+
+/* Namespace for Google classes */
+#define GOOGLE_NAMESPACE  ::google
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#undef HAVE_DLFCN_H
+
+/* Define to 1 if you have the <fnmatch.h> header file. */
+#undef HAVE_FNMATCH_H
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#undef HAVE_INTTYPES_H
+
+/* Define to 1 if you have the <memory.h> header file. */
+#undef HAVE_MEMORY_H
+
+/* define if the compiler implements namespaces */
+#define HAVE_NAMESPACES  1
+
+/* Define if you have POSIX threads libraries and header files. */
+#undef HAVE_PTHREAD
+
+/* Define to 1 if you have the `putenv' function. */
+#define HAVE_PUTENV  1
+
+/* Define to 1 if you have the `setenv' function. */
+#undef HAVE_SETENV
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#undef HAVE_STDINT_H
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#undef HAVE_STRINGS_H
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strtoll' function. */
+#define HAVE_STRTOLL  1
+
+/* Define to 1 if you have the `strtoq' function. */
+#define HAVE_STRTOQ  1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#undef HAVE_UNISTD_H
+
+/* define if your compiler has __attribute__ */
+#undef HAVE___ATTRIBUTE__
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+   */
+#undef LT_OBJDIR
+
+/* Name of package */
+#undef PACKAGE
+
+/* Define to the address where bug reports for this package should be sent. */
+#undef PACKAGE_BUGREPORT
+
+/* Define to the full name of this package. */
+#undef PACKAGE_NAME
+
+/* Define to the full name and version of this package. */
+#undef PACKAGE_STRING
+
+/* Define to the one symbol short name of this package. */
+#undef PACKAGE_TARNAME
+
+/* Define to the home page for this package. */
+#undef PACKAGE_URL
+
+/* Define to the version of this package. */
+#undef PACKAGE_VERSION
+
+/* Define to necessary symbol if this constant uses a non-standard name on
+   your system. */
+#undef PTHREAD_CREATE_JOINABLE
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS  1
+
+/* the namespace where STL code like vector<> is defined */
+#define STL_NAMESPACE  std
+
+/* Version number of package */
+#undef VERSION
+
+/* Stops putting the code inside the Google namespace */
+#define _END_GOOGLE_NAMESPACE_  }
+
+/* Puts following code inside the Google namespace */
+#define _START_GOOGLE_NAMESPACE_  namespace google {
+
+// ---------------------------------------------------------------------
+// Extra stuff not found in config.h.in
+
+// This must be defined before the windows.h is included.  It's needed
+// for mutex.h, to give access to the TryLock method.
+#ifndef _WIN32_WINNT
+# define _WIN32_WINNT 0x0400
+#endif
+
+// TODO(csilvers): include windows/port.h in every relevant source file instead?
+#include "windows/port.h"
+
+#endif  /* GOOGLE_GFLAGS_WINDOWS_CONFIG_H_ */
diff --git a/libs/libyuv/third_party/gflags/gflags.gyp b/libs/libyuv/third_party/gflags/gflags.gyp
new file mode 100644
index 0000000000..7ce3f80f6c
--- /dev/null
+++ b/libs/libyuv/third_party/gflags/gflags.gyp
@@ -0,0 +1,90 @@
+#
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This is a copy of WebRTC's gflags.gyp.
+
+{
+  'variables': {
+    'gflags_root': '<(DEPTH)/third_party/gflags',
+    'conditions': [
+      ['OS=="win"', {
+        'gflags_gen_arch_root': '<(gflags_root)/gen/win',
+      }, {
+        'gflags_gen_arch_root': '<(gflags_root)/gen/posix',
+      }],
+    ],
+  },
+  'targets': [
+    {
+      'target_name': 'gflags',
+      'type': 'static_library',
+      'include_dirs': [
+        '<(gflags_gen_arch_root)/include/private',  # For config.h
+        '<(gflags_gen_arch_root)/include',  # For configured files.
+        '<(gflags_root)/src',  # For everything else.
+      ],
+      'defines': [
+        # These macros exist so flags and symbols are properly
+        # exported when building DLLs. Since we don't build DLLs, we
+        # need to disable them.
+        'GFLAGS_DLL_DECL=',
+        'GFLAGS_DLL_DECLARE_FLAG=',
+        'GFLAGS_DLL_DEFINE_FLAG=',
+      ],
+      'direct_dependent_settings': {
+        'include_dirs': [
+          '<(gflags_gen_arch_root)/include',  # For configured files.
+          '<(gflags_root)/src',  # For everything else.
+        ],
+        'defines': [
+          'GFLAGS_DLL_DECL=',
+          'GFLAGS_DLL_DECLARE_FLAG=',
+          'GFLAGS_DLL_DEFINE_FLAG=',
+        ],
+      },
+      'sources': [
+        'src/gflags.cc',
+        'src/gflags_completions.cc',
+        'src/gflags_reporting.cc',
+      ],
+      'conditions': [
+        ['OS=="win"', {
+          'sources': [
+            'src/windows/port.cc',
+          ],
+          # Suppress warnings about WIN32_LEAN_AND_MEAN and size_t truncation.
+          'msvs_disabled_warnings': [4005, 4267],
+        }],
+        # TODO(andrew): Look into fixing this warning upstream:
+        # http://code.google.com/p/webrtc/issues/detail?id=760
+        ['OS=="win" and clang==1', {
+          'msvs_settings': {
+            'VCCLCompilerTool': {
+              'AdditionalOptions!': [
+                '-Wheader-hygiene',  # Suppress warning about using namespace.
+              ],
+              'AdditionalOptions': [
+                '-Wno-unused-local-typedef',  # Suppress unused private typedef.
+              ],
+            },
+          },
+        }],
+        ['clang==1', {
+          'cflags': ['-Wno-unused-local-typedef',],
+          'cflags!': ['-Wheader-hygiene',],
+          'xcode_settings': {
+            'WARNING_CFLAGS': ['-Wno-unused-local-typedef',],
+            'WARNING_CFLAGS!': ['-Wheader-hygiene',],
+          },
+        }],
+      ],
+    },
+  ],
+}
+
diff --git a/libs/libyuv/tools/OWNERS b/libs/libyuv/tools/OWNERS
new file mode 100644
index 0000000000..aca046d45e
--- /dev/null
+++ b/libs/libyuv/tools/OWNERS
@@ -0,0 +1 @@
+kjellander@chromium.org
diff --git a/libs/libyuv/tools/gritsettings/README b/libs/libyuv/tools/gritsettings/README
new file mode 100644
index 0000000000..ca9cb31f74
--- /dev/null
+++ b/libs/libyuv/tools/gritsettings/README
@@ -0,0 +1,7 @@
+This is a dummy configuration file to work around the assumption of the grit
+resource file being located here in the Chromium build toolchain.
+
+This is only needed for our Android native tests to be able to include the
+build/apk_test.gypi GYP file and depend on the
+<(DEPTH)/testing/android/native_test.gyp:native_test_native_code target in
+Chromium.
diff --git a/libs/libyuv/tools/gritsettings/resource_ids b/libs/libyuv/tools/gritsettings/resource_ids
new file mode 100644
index 0000000000..bfdfbbe90b
--- /dev/null
+++ b/libs/libyuv/tools/gritsettings/resource_ids
@@ -0,0 +1,15 @@
+# Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+  "SRCDIR": "../..",
+
+  "chromium/src/net/base/net_resources.grd": {
+    "includes": [4000],
+  },
+}
diff --git a/libs/libyuv/tools/msan/OWNERS b/libs/libyuv/tools/msan/OWNERS
new file mode 100644
index 0000000000..60351e7ea2
--- /dev/null
+++ b/libs/libyuv/tools/msan/OWNERS
@@ -0,0 +1,3 @@
+pbos@chromium.org
+kjellander@chromium.org
+
diff --git a/libs/libyuv/tools/msan/blacklist.txt b/libs/libyuv/tools/msan/blacklist.txt
new file mode 100644
index 0000000000..8b5e42a7b3
--- /dev/null
+++ b/libs/libyuv/tools/msan/blacklist.txt
@@ -0,0 +1,9 @@
+# The rules in this file are only applied at compile time.
+# Because the Chrome buildsystem does not automatically touch the files
+# mentioned here, changing this file requires clobbering all MSan bots.
+#
+# Please think twice before you add or remove these rules.
+
+# This is a stripped down copy of Chromium's blacklist.txt, to enable
+# adding libyuv-specific blacklist entries.
+
diff --git a/libs/libyuv/tools/ubsan/OWNERS b/libs/libyuv/tools/ubsan/OWNERS
new file mode 100644
index 0000000000..b608519abf
--- /dev/null
+++ b/libs/libyuv/tools/ubsan/OWNERS
@@ -0,0 +1,4 @@
+pbos@webrtc.org
+kjellander@webrtc.org
+fbarchard@chromium.org
+
diff --git a/libs/libyuv/tools/ubsan/blacklist.txt b/libs/libyuv/tools/ubsan/blacklist.txt
new file mode 100644
index 0000000000..8bcb29073b
--- /dev/null
+++ b/libs/libyuv/tools/ubsan/blacklist.txt
@@ -0,0 +1,15 @@
+#############################################################################
+# UBSan blacklist.
+# Please think twice before you add or remove these rules.
+
+# This is a stripped down copy of Chromium's blacklist.txt, to enable
+# adding libyuv-specific blacklist entries.
+
+#############################################################################
+# YASM does some funny things that UBsan doesn't like.
+# https://crbug.com/489901
+src:*/third_party/yasm/*
+
+#############################################################################
+# Ignore system libraries.
+src:*/usr/*
diff --git a/libs/libyuv/tools/ubsan/vptr_blacklist.txt b/libs/libyuv/tools/ubsan/vptr_blacklist.txt
new file mode 100644
index 0000000000..8ed070c05d
--- /dev/null
+++ b/libs/libyuv/tools/ubsan/vptr_blacklist.txt
@@ -0,0 +1,21 @@
+#############################################################################
+# UBSan vptr blacklist.
+# Function- and type-based blacklisting uses mangled names, and C++ types are
+# especially tricky to represent. For now, any variation introduced by name
+# mangling is simply covered with wildcard (regexp) expressions, so the rules
+# may blacklist more than intended.
+#
+# Please think twice before you add or remove these rules.
+#
+# This is a stripped down copy of Chromium's vptr_blacklist.txt, to enable
+# adding libyuv-specific blacklist entries.
+
+#############################################################################
+# Using raw pointer values.
+#
+# A raw pointer value (16) is used to infer the field offset by
+# GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET.
+
+# Example:
+# src:*/third_party/protobuf/src/google/protobuf/compiler/plugin.pb.cc
+
diff --git a/libs/libyuv/tools/valgrind-libyuv/libyuv_tests.bat b/libs/libyuv/tools/valgrind-libyuv/libyuv_tests.bat
new file mode 100644
index 0000000000..e37f09eb25
--- /dev/null
+++ b/libs/libyuv/tools/valgrind-libyuv/libyuv_tests.bat
@@ -0,0 +1,79 @@
+@echo off
+:: Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+::
+:: Use of this source code is governed by a BSD-style license
+:: that can be found in the LICENSE file in the root of the source
+:: tree. An additional intellectual property rights grant can be found
+:: in the file PATENTS.  All contributing project authors may
+:: be found in the AUTHORS file in the root of the source tree.
+
+:: This script is a copy of chrome_tests.bat with the following changes:
+:: - Invokes libyuv_tests.py instead of chrome_tests.py
+:: - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make
+::   it possible to execute the Python scripts properly.
+
+:: TODO(timurrrr): batch files 'export' all the variables to the parent shell
+set THISDIR=%~dp0
+set TOOL_NAME="unknown"
+
+:: Get the tool name and put it into TOOL_NAME {{{1
+:: NB: SHIFT command doesn't modify %*
+:PARSE_ARGS_LOOP
+  if "%~1" == "" GOTO:TOOLNAME_NOT_FOUND
+  if "%~1" == "--tool" GOTO:TOOLNAME_FOUND
+  SHIFT
+  goto :PARSE_ARGS_LOOP
+
+:TOOLNAME_NOT_FOUND
+echo "Please specify a tool (tsan or drmemory) by using --tool flag"
+exit /B 1
+
+:TOOLNAME_FOUND
+SHIFT
+set TOOL_NAME=%1
+:: }}}
+if "%TOOL_NAME%" == "drmemory"          GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_light"    GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_full"     GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_pattern"  GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "tsan"     GOTO :SETUP_TSAN
+echo "Unknown tool: `%TOOL_NAME%`! Only tsan and drmemory are supported."
+exit /B 1
+
+:SETUP_DRMEMORY
+if NOT "%DRMEMORY_COMMAND%"=="" GOTO :RUN_TESTS
+:: Set up DRMEMORY_COMMAND to invoke Dr. Memory (paths quoted: may hold spaces) {{{1
+set DRMEMORY_PATH=%THISDIR%..\..\third_party\drmemory
+set DRMEMORY_SFX=%DRMEMORY_PATH%\drmemory-windows-sfx.exe
+if EXIST "%DRMEMORY_SFX%" GOTO DRMEMORY_BINARY_OK
+echo "Can't find Dr. Memory executables."
+echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
+echo "for the instructions on how to get them."
+exit /B 1
+
+:DRMEMORY_BINARY_OK
+"%DRMEMORY_SFX%" -o"%DRMEMORY_PATH%\unpacked" -y
+set DRMEMORY_COMMAND=%DRMEMORY_PATH%\unpacked\bin\drmemory.exe
+:: }}}
+goto :RUN_TESTS
+
+:SETUP_TSAN
+:: Set up PIN_COMMAND to invoke TSan (paths quoted: may hold spaces) {{{1
+set TSAN_PATH=%THISDIR%..\..\third_party\tsan
+set TSAN_SFX=%TSAN_PATH%\tsan-x86-windows-sfx.exe
+if EXIST "%TSAN_SFX%" GOTO TSAN_BINARY_OK
+echo "Can't find ThreadSanitizer executables."
+echo "See http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer/threadsanitizer-on-windows"
+echo "for the instructions on how to get them."
+exit /B 1
+
+:TSAN_BINARY_OK
+"%TSAN_SFX%" -o"%TSAN_PATH%\unpacked" -y
+set PIN_COMMAND=%TSAN_PATH%\unpacked\tsan-x86-windows\tsan.bat
+:: }}}
+goto :RUN_TESTS
+
+:RUN_TESTS
+set PYTHONPATH=%THISDIR%..\python\google;%THISDIR%..\valgrind
+set RUNNING_ON_VALGRIND=yes
+python "%THISDIR%libyuv_tests.py" %*
diff --git a/libs/libyuv/tools/valgrind-libyuv/libyuv_tests.py b/libs/libyuv/tools/valgrind-libyuv/libyuv_tests.py
new file mode 100755
index 0000000000..bd27cd5ca6
--- /dev/null
+++ b/libs/libyuv/tools/valgrind-libyuv/libyuv_tests.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""Runs various libyuv tests through valgrind_test.py.
+
+This script inherits the chrome_tests.py in Chrome, but allows running any test
+instead of only the hard-coded ones. It uses the -t cmdline flag to do this, and
+only supports specifying a single test for each run.
+
+Suppression files:
+The Chrome valgrind directory we use as a DEPS dependency contains the following
+suppression files:
+  valgrind/memcheck/suppressions.txt
+  valgrind/memcheck/suppressions_mac.txt
+  valgrind/tsan/suppressions.txt
+  valgrind/tsan/suppressions_mac.txt
+  valgrind/tsan/suppressions_win32.txt
+Since they're referenced from the chrome_tests.py script, we have similar files
+below the directory of this script. When executing, this script will setup both
+Chrome's suppression files and our own, so we can easily maintain libyuv
+specific suppressions in our own files.
+"""
+
+import logging
+import optparse
+import os
+import sys
+
+import logging_utils
+import path_utils
+
+import chrome_tests
+
+
+class LibyuvTest(chrome_tests.ChromeTests):
+  """Class that handles setup of suppressions for libyuv.
+
+  Everything else is inherited from chrome_tests.ChromeTests.
+  """
+
+  def _DefaultCommand(self, tool, exe=None, valgrind_test_args=None):
+    """Override command-building method so we can add more suppressions."""
+    cmd = chrome_tests.ChromeTests._DefaultCommand(self, tool, exe,
+                                                   valgrind_test_args)
+    # When ChromeTests._DefaultCommand has executed, it has setup suppression
+    # files based on what's found in the memcheck/ or tsan/ subdirectories of
+    # this script's location. If Mac or Windows is executing, additional
+    # platform specific files have also been added.
+    # Since only the ones located below this directory is added, we must also
+    # add the ones maintained by Chrome, located in ../valgrind.
+
+    # The idea is to look for --suppression arguments in the cmd list and add a
+    # modified copy of each suppression file, for the corresponding file in
+    # ../valgrind. If we would simply replace 'valgrind-libyuv' with 'valgrind'
+    # we may produce invalid paths if other parts of the path contain that
+    # string. That's why the code below only replaces the end of the path.
+    script_dir = path_utils.ScriptDir()
+    old_base, _ = os.path.split(script_dir)
+    new_dir = os.path.join(old_base, 'valgrind')
+    add_suppressions = []
+    for token in cmd:
+      if '--suppressions' in token:
+        add_suppressions.append(token.replace(script_dir, new_dir))
+    return add_suppressions + cmd
+
+
+def main(_):  # The argument is ignored; parse_args() below reads sys.argv.
+  parser = optparse.OptionParser('usage: %prog -b <dir> -t <test> <test args>')
+  parser.disable_interspersed_args()
+  parser.add_option('-b', '--build-dir',
+                    help=('Location of the compiler output. Can only be used '
+                          'when the test argument does not contain this path.'))
+  parser.add_option("--target", help="Debug or Release")
+  parser.add_option('-t', '--test', help='Test to run.')
+  parser.add_option('', '--baseline', action='store_true', default=False,
+                    help='Generate baseline data instead of validating')
+  parser.add_option('', '--gtest_filter',
+                    help='Additional arguments to --gtest_filter')
+  parser.add_option('', '--gtest_repeat',
+                    help='Argument for --gtest_repeat')
+  parser.add_option("--gtest_shuffle", action="store_true", default=False,
+                    help="Randomize tests' orders on every iteration.")
+  parser.add_option("--gtest_break_on_failure", action="store_true",
+                    default=False,
+                    help="Drop in to debugger on assertion failure. Also "
+                         "useful for forcing tests to exit with a stack dump "
+                         "on the first assertion failure when running with "
+                         "--gtest_repeat=-1")
+  parser.add_option('-v', '--verbose', action='store_true', default=False,
+                    help='Verbose output - enable debug log messages')
+  parser.add_option('', '--tool', dest='valgrind_tool', default='memcheck',
+                    help='Specify a valgrind tool to run the tests under')
+  parser.add_option('', '--tool_flags', dest='valgrind_tool_flags', default='',
+                    help='Specify custom flags for the selected valgrind tool')
+  parser.add_option('', '--keep_logs', action='store_true', default=False,
+                    help=('Store memory tool logs in the <tool>.logs directory '
+                          'instead of /tmp.\nThis can be useful for tool '
+                          'developers/maintainers.\nPlease note that the <tool>'
+                          '.logs directory will be clobbered on tool startup.'))
+  parser.add_option("--test-launcher-bot-mode", action="store_true",
+                    help="run the tests with --test-launcher-bot-mode")
+  parser.add_option("--test-launcher-total-shards", type=int,
+                    help="run the tests with --test-launcher-total-shards")
+  parser.add_option("--test-launcher-shard-index", type=int,
+                    help="run the tests with --test-launcher-shard-index")
+  options, args = parser.parse_args()
+
+  if options.verbose:
+    logging_utils.config_root(logging.DEBUG)
+  else:
+    logging_utils.config_root()
+
+  if not options.test:
+    parser.error('--test not specified')
+
+  # Accept --build-dir with or without the --target name as its last component.
+  if (options.target and options.build_dir and
+      not options.build_dir.endswith(options.target)):
+    options.build_dir = os.path.join(options.build_dir, options.target)
+
+  # If --build-dir was given, prepend it to the test executable if needed.
+  test_executable = options.test
+  if options.build_dir and not test_executable.startswith(options.build_dir):
+    test_executable = os.path.join(options.build_dir, test_executable)
+  args = [test_executable] + args
+
+  test = LibyuvTest(options, args, 'cmdline')
+  return test.Run()
+
+if __name__ == '__main__':
+  return_code = main(sys.argv)
+  sys.exit(return_code)
diff --git a/libs/libyuv/tools/valgrind-libyuv/libyuv_tests.sh b/libs/libyuv/tools/valgrind-libyuv/libyuv_tests.sh
new file mode 100755
index 0000000000..4fee7daed6
--- /dev/null
+++ b/libs/libyuv/tools/valgrind-libyuv/libyuv_tests.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Set up some paths and re-direct the arguments to libyuv_tests.py
+
+# This script is a copy of the chrome_tests.sh wrapper script with the following
+# changes:
+# - The locate_valgrind.sh of Chromium's Valgrind scripts dir is used to locate
+#   the Valgrind framework install.
+# - libyuv_tests.py is invoked instead of chrome_tests.py.
+# - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make it
+#   possible to execute the Python scripts properly.
+
+export THISDIR="$(dirname "$0")"  # Quoted so a script path with spaces survives.
+ARGV_COPY="$@"  # NOTE: flattens the arguments into one space-joined string.
+
+# We need to set CHROME_VALGRIND iff using Memcheck or TSan-Valgrind:
+#   tools/valgrind-libyuv/libyuv_tests.sh --tool memcheck
+# or
+#   tools/valgrind-libyuv/libyuv_tests.sh --tool=memcheck
+# (same for "--tool=tsan")
+tool="memcheck"  # Default to memcheck.
+while (( "$#" ))
+do
+  if [[ "$1" == "--tool" ]]
+  then
+    tool="$2"
+    shift
+  elif [[ "$1" =~ ^--tool=(.*)$ ]]  # Anchored: the whole argument must match.
+  then
+    tool="${BASH_REMATCH[1]}"
+  fi
+  shift
+done
+
+NEEDS_VALGRIND=0
+NEEDS_DRMEMORY=0
+NEEDS_PIN=0  # Explicit default; otherwise an inherited env value would be used.
+case "$tool" in
+  "memcheck")
+    NEEDS_VALGRIND=1
+    ;;
+  "tsan" | "tsan_rv")
+    if [[ "$(uname -s)" == CYGWIN* ]]  # [[ ]] pattern-matches; [ ] would not.
+    then
+      NEEDS_PIN=1
+    else
+      NEEDS_VALGRIND=1
+    fi
+    ;;
+  "drmemory" | "drmemory_light" | "drmemory_full" | "drmemory_pattern")
+    NEEDS_DRMEMORY=1
+    ;;
+esac
+
+# For Libyuv, we'll use the locate_valgrind.sh script in Chromium's Valgrind
+# scripts dir to locate the Valgrind framework install
+CHROME_VALGRIND_SCRIPTS=$THISDIR/../valgrind
+
+if [ "$NEEDS_VALGRIND" == "1" ]
+then
+  CHROME_VALGRIND=`sh $CHROME_VALGRIND_SCRIPTS/locate_valgrind.sh`
+  if [ "$CHROME_VALGRIND" = "" ]
+  then
+    # locate_valgrind.sh failed
+    exit 1
+  fi
+  echo "Using valgrind binaries from ${CHROME_VALGRIND}"
+
+  PATH="${CHROME_VALGRIND}/bin:$PATH"
+  # We need to set these variables to override default lib paths hard-coded into
+  # Valgrind binary.
+  export VALGRIND_LIB="$CHROME_VALGRIND/lib/valgrind"
+  export VALGRIND_LIB_INNER="$CHROME_VALGRIND/lib/valgrind"
+
+  # Clean up some /tmp directories that might be stale due to interrupted
+  # chrome_tests.py execution.
+  # FYI:
+  #   -mtime +1  <- only print files modified more than 24h ago,
+  #   -print0/-0 are needed to handle possible newlines in the filenames.
+  echo "Cleanup /tmp from Valgrind stuff"
+  find /tmp -maxdepth 1 \(\
+        -name "vgdb-pipe-*" -or -name "vg_logs_*" -or -name "valgrind.*" \
+      \) -mtime +1 -print0 | xargs -0 rm -rf
+fi
+
+if [ "$NEEDS_DRMEMORY" == "1" ]
+then
+  if [ -z "$DRMEMORY_COMMAND" ]
+  then
+    DRMEMORY_PATH="$THISDIR/../../third_party/drmemory"
+    DRMEMORY_SFX="$DRMEMORY_PATH/drmemory-windows-sfx.exe"
+    if [ ! -f "$DRMEMORY_SFX" ]
+    then
+      echo "Can't find Dr. Memory executables."
+      echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
+      echo "for the instructions on how to get them."
+      exit 1
+    fi
+
+    chmod +x "$DRMEMORY_SFX"  # Cygwin won't run it without +x.
+    "$DRMEMORY_SFX" -o"$DRMEMORY_PATH/unpacked" -y
+    export DRMEMORY_COMMAND="$DRMEMORY_PATH/unpacked/bin/drmemory.exe"
+  fi
+fi
+
+if [ "$NEEDS_PIN" == "1" ]
+then
+  if [ -z "$PIN_COMMAND" ]
+  then
+    # Set up PIN_COMMAND to invoke TSan (unless the caller already set it).
+    TSAN_PATH="$THISDIR/../../third_party/tsan"
+    TSAN_SFX="$TSAN_PATH/tsan-x86-windows-sfx.exe"
+    echo "$TSAN_SFX"
+    if [ ! -f "$TSAN_SFX" ]  # Quoted: the path may contain spaces.
+    then
+      echo "Can't find ThreadSanitizer executables."
+      echo "See http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer/threadsanitizer-on-windows"
+      echo "for the instructions on how to get them."
+      exit 1
+    fi
+
+    chmod +x "$TSAN_SFX"  # Cygwin won't run it without +x.
+    "$TSAN_SFX" -o"$TSAN_PATH"/unpacked -y
+    export PIN_COMMAND="$TSAN_PATH/unpacked/tsan-x86-windows/tsan.bat"
+  fi
+fi
+
+# Put ../python/google and Chrome's Valgrind scripts dir on PYTHONPATH (for
+# this one command only), then hand the original arguments to libyuv_tests.py.
+PYTHONPATH=$THISDIR/../python/google:$CHROME_VALGRIND_SCRIPTS python \
+           "$THISDIR/libyuv_tests.py" $ARGV_COPY
diff --git a/libs/libyuv/tools/valgrind-libyuv/memcheck/OWNERS b/libs/libyuv/tools/valgrind-libyuv/memcheck/OWNERS
new file mode 100644
index 0000000000..72e8ffc0db
--- /dev/null
+++ b/libs/libyuv/tools/valgrind-libyuv/memcheck/OWNERS
@@ -0,0 +1 @@
+*
diff --git a/libs/libyuv/tools/valgrind-libyuv/memcheck/PRESUBMIT.py b/libs/libyuv/tools/valgrind-libyuv/memcheck/PRESUBMIT.py
new file mode 100644
index 0000000000..46ff4cfcf1
--- /dev/null
+++ b/libs/libyuv/tools/valgrind-libyuv/memcheck/PRESUBMIT.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""
+Copied from Chrome's src/tools/valgrind/memcheck/PRESUBMIT.py
+
+See http://dev.chromium.org/developers/how-tos/depottools/presubmit-scripts
+for more details on the presubmit API built into gcl.
+"""
+
+import os
+import re
+import sys
+
+def CheckChange(input_api, output_api):
+  """Checks the memcheck suppressions files for bad data."""
+
+  # Add the path to the Chrome valgrind dir to the import path:
+  tools_vg_path = os.path.join(input_api.PresubmitLocalPath(), '..', '..',
+                               'valgrind')
+  sys.path.append(tools_vg_path)
+  import suppressions
+
+  sup_regex = re.compile('suppressions.*\.txt$')
+  suppressions = {}
+  errors = []
+  check_for_memcheck = False
+  # skip_next_line has 3 possible values:
+  # - False: don't skip the next line.
+  # - 'skip_suppression_name': the next line is a suppression name, skip.
+  # - 'skip_param': the next line is a system call parameter error, skip.
+  skip_next_line = False
+  for f in filter(lambda x: sup_regex.search(x.LocalPath()),
+                  input_api.AffectedFiles()):
+    for line, line_num in zip(f.NewContents(),
+                              xrange(1, len(f.NewContents()) + 1)):
+      line = line.lstrip()
+      if line.startswith('#') or not line:
+        continue
+
+      if skip_next_line:
+        if skip_next_line == 'skip_suppression_name':
+          if 'insert_a_suppression_name_here' in line:
+            errors.append('"insert_a_suppression_name_here" is not a valid '
+                          'suppression name')
+          if suppressions.has_key(line):
+            if f.LocalPath() == suppressions[line][1]:
+              errors.append('suppression with name "%s" at %s line %s '
+                            'has already been defined at line %s' %
+                            (line, f.LocalPath(), line_num,
+                             suppressions[line][1]))
+            else:
+              errors.append('suppression with name "%s" at %s line %s '
+                            'has already been defined at %s line %s' %
+                            (line, f.LocalPath(), line_num,
+                             suppressions[line][0], suppressions[line][1]))
+          else:
+            suppressions[line] = (f, line_num)
+            check_for_memcheck = True;
+        skip_next_line = False
+        continue
+      if check_for_memcheck:
+        if not line.startswith('Memcheck:'):
+          errors.append('"%s" should be "Memcheck:..." in %s line %s' %
+                        (line, f.LocalPath(), line_num))
+        check_for_memcheck = False;
+      if line == '{':
+        skip_next_line = 'skip_suppression_name'
+        continue
+      if line == "Memcheck:Param":
+        skip_next_line = 'skip_param'
+        continue
+
+      if (line.startswith('fun:') or line.startswith('obj:') or
+          line.startswith('Memcheck:') or line == '}' or
+          line == '...'):
+        continue
+      errors.append('"%s" is probably wrong: %s line %s' % (line, f.LocalPath(),
+                                                            line_num))
+  if errors:
+    return [output_api.PresubmitError('\n'.join(errors))]
+  return []
+
+def CheckChangeOnUpload(input_api, output_api):
+  return CheckChange(input_api, output_api)
+
+def CheckChangeOnCommit(input_api, output_api):
+  return CheckChange(input_api, output_api)
+
+def GetPreferredTrySlaves():
+  # We don't have any memcheck slaves yet, so there's no use for this method.
+  # When we have, the slave name(s) should be put into this list.
+  return []
diff --git a/libs/libyuv/tools/valgrind-libyuv/memcheck/suppressions.txt b/libs/libyuv/tools/valgrind-libyuv/memcheck/suppressions.txt
new file mode 100644
index 0000000000..3ad0c8ccc5
--- /dev/null
+++ b/libs/libyuv/tools/valgrind-libyuv/memcheck/suppressions.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a place holder for future additions for this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/libs/libyuv/tools/valgrind-libyuv/memcheck/suppressions_mac.txt b/libs/libyuv/tools/valgrind-libyuv/memcheck/suppressions_mac.txt
new file mode 100644
index 0000000000..3ad0c8ccc5
--- /dev/null
+++ b/libs/libyuv/tools/valgrind-libyuv/memcheck/suppressions_mac.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a place holder for future additions for this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/libs/libyuv/tools/valgrind-libyuv/memcheck/suppressions_win32.txt b/libs/libyuv/tools/valgrind-libyuv/memcheck/suppressions_win32.txt
new file mode 100644
index 0000000000..3ad0c8ccc5
--- /dev/null
+++ b/libs/libyuv/tools/valgrind-libyuv/memcheck/suppressions_win32.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a place holder for future additions for this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/libs/libyuv/tools/valgrind-libyuv/tsan/OWNERS b/libs/libyuv/tools/valgrind-libyuv/tsan/OWNERS
new file mode 100644
index 0000000000..72e8ffc0db
--- /dev/null
+++ b/libs/libyuv/tools/valgrind-libyuv/tsan/OWNERS
@@ -0,0 +1 @@
+*
diff --git a/libs/libyuv/tools/valgrind-libyuv/tsan/PRESUBMIT.py b/libs/libyuv/tools/valgrind-libyuv/tsan/PRESUBMIT.py
new file mode 100644
index 0000000000..d25b6ebcee
--- /dev/null
+++ b/libs/libyuv/tools/valgrind-libyuv/tsan/PRESUBMIT.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import os
+import re
+import sys
+
+"""
+Copied from Chrome's src/tools/valgrind/tsan/PRESUBMIT.py
+
+See http://dev.chromium.org/developers/how-tos/depottools/presubmit-scripts
+for more details on the presubmit API built into gcl.
+"""
+
+def CheckChange(input_api, output_api):
+  """Checks the TSan suppressions files for bad suppressions."""
+
+  # Add the path to the Chrome valgrind dir to the import path:
+  tools_vg_path = os.path.join(input_api.PresubmitLocalPath(), '..', '..',
+                               'valgrind')
+  sys.path.append(tools_vg_path)
+  import suppressions
+
+  return suppressions.PresubmitCheck(input_api, output_api)
+
+def CheckChangeOnUpload(input_api, output_api):
+  return CheckChange(input_api, output_api)
+
+def CheckChangeOnCommit(input_api, output_api):
+  return CheckChange(input_api, output_api)
+
+def GetPreferredTrySlaves():
+  # We don't have any tsan slaves yet, so there's no use for this method.
+  # When we have, the slave name(s) should be put into this list.
+  return []
diff --git a/libs/libyuv/tools/valgrind-libyuv/tsan/suppressions.txt b/libs/libyuv/tools/valgrind-libyuv/tsan/suppressions.txt
new file mode 100644
index 0000000000..3ad0c8ccc5
--- /dev/null
+++ b/libs/libyuv/tools/valgrind-libyuv/tsan/suppressions.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a place holder for future additions for this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/libs/libyuv/tools/valgrind-libyuv/tsan/suppressions_mac.txt b/libs/libyuv/tools/valgrind-libyuv/tsan/suppressions_mac.txt
new file mode 100644
index 0000000000..3ad0c8ccc5
--- /dev/null
+++ b/libs/libyuv/tools/valgrind-libyuv/tsan/suppressions_mac.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a place holder for future additions for this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/libs/libyuv/tools/valgrind-libyuv/tsan/suppressions_win32.txt b/libs/libyuv/tools/valgrind-libyuv/tsan/suppressions_win32.txt
new file mode 100644
index 0000000000..3ad0c8ccc5
--- /dev/null
+++ b/libs/libyuv/tools/valgrind-libyuv/tsan/suppressions_win32.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a place holder for future additions for this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/libs/libyuv/unit_test/basictypes_test.cc b/libs/libyuv/unit_test/basictypes_test.cc
new file mode 100644
index 0000000000..89f7644d58
--- /dev/null
+++ b/libs/libyuv/unit_test/basictypes_test.cc
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+
+namespace libyuv {
+
+TEST_F(LibYUVBaseTest, Endian) {
+  uint16 probe = 0x1234u;  // High byte 0x12, low byte 0x34.
+  uint8 lowest_address_byte = reinterpret_cast<uint8*>(&probe)[0];
+#if defined(LIBYUV_LITTLE_ENDIAN)
+  EXPECT_EQ(0x34u, lowest_address_byte);
+#else
+  EXPECT_EQ(0x12u, lowest_address_byte);
+#endif
+}
+
+TEST_F(LibYUVBaseTest, SizeOfTypes) {  // Per type: byte width, then sign.
+  int8 i8 = -1;
+  EXPECT_EQ(1u, sizeof(i8));
+  EXPECT_GT(0, i8);
+  uint8 u8 = 1u;
+  EXPECT_EQ(1u, sizeof(u8));
+  EXPECT_LT(0u, u8);
+  int16 i16 = -1;
+  EXPECT_EQ(2u, sizeof(i16));
+  EXPECT_GT(0, i16);
+  uint16 u16 = 1u;
+  EXPECT_EQ(2u, sizeof(u16));
+  EXPECT_LT(0u, u16);
+  int32 i32 = -1;
+  EXPECT_EQ(4u, sizeof(i32));
+  EXPECT_GT(0, i32);
+  uint32 u32 = 1u;
+  EXPECT_EQ(4u, sizeof(u32));
+  EXPECT_LT(0u, u32);
+  int64 i64 = -1;
+  EXPECT_EQ(8u, sizeof(i64));
+  EXPECT_GT(0, i64);
+  uint64 u64 = 1u;
+  EXPECT_EQ(8u, sizeof(u64));
+  EXPECT_LT(0u, u64);
+}
+
+TEST_F(LibYUVBaseTest, SizeOfConstants) {  // 64-bit literal macros: 8 bytes.
+  EXPECT_EQ(8u, sizeof(INT64_C(0)));  // Zero, signed macro.
+  EXPECT_EQ(8u, sizeof(UINT64_C(0)));  // Zero, unsigned macro.
+  EXPECT_EQ(8u, sizeof(INT64_C(0x1234567887654321)));  // Wider than 32 bits.
+  EXPECT_EQ(8u, sizeof(UINT64_C(0x8765432112345678)));  // Top bit set.
+}
+
+}  // namespace libyuv
diff --git a/libs/libyuv/unit_test/color_test.cc b/libs/libyuv/unit_test/color_test.cc
new file mode 100644
index 0000000000..555413f79a
--- /dev/null
+++ b/libs/libyuv/unit_test/color_test.cc
@@ -0,0 +1,570 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"  // For Sobel
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+// TODO(fbarchard): Port high accuracy YUV to RGB to Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define ERROR_R 1  // EXPECT_NEAR tolerance for red vs. the float reference.
+#define ERROR_G 1  // Same, for green.
+#define ERROR_B 3  // Same, for blue.
+#define ERROR_FULL 6  // Round-trip tolerance given to TESTCS (I420, I422).
+#define ERROR_J420 5  // Round-trip tolerance given to TESTCS (J420).
+#else  // Non-Neon builds: tighter round-trip tolerances.
+#define ERROR_R 1  // EXPECT_NEAR tolerance for red vs. the float reference.
+#define ERROR_G 1  // Same, for green.
+#define ERROR_B 3  // Same, for blue.
+#define ERROR_FULL 5  // Round-trip tolerance given to TESTCS (I420, I422).
+#define ERROR_J420 3  // Round-trip tolerance given to TESTCS (J420).
+#endif
+
+#define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF)              \
+  TEST_F(LibYUVColorTest, TESTNAME) { /* YUV -> ARGB -> YUV -> ARGB check. */  \
+  const int kPixels = benchmark_width_ * benchmark_height_;                    \
+  const int kHalfPixels = ((benchmark_width_ + 1) / 2) *                       \
+      ((benchmark_height_ + HS1) / HS);                                        \
+  align_buffer_64(orig_y, kPixels);                                            \
+  align_buffer_64(orig_u, kHalfPixels);                                        \
+  align_buffer_64(orig_v, kHalfPixels);                                        \
+  align_buffer_64(orig_pixels, kPixels * 4);                                   \
+  align_buffer_64(temp_y, kPixels);                                            \
+  align_buffer_64(temp_u, kHalfPixels);                                        \
+  align_buffer_64(temp_v, kHalfPixels);                                        \
+  align_buffer_64(dst_pixels_opt, kPixels * 4);                                \
+  align_buffer_64(dst_pixels_c, kPixels * 4);                                  \
+  /* Fill every buffer with random bytes before it is used. */                 \
+  MemRandomize(orig_pixels, kPixels * 4);                                      \
+  MemRandomize(orig_y, kPixels);                                               \
+  MemRandomize(orig_u, kHalfPixels);                                           \
+  MemRandomize(orig_v, kHalfPixels);                                           \
+  MemRandomize(temp_y, kPixels);                                               \
+  MemRandomize(temp_u, kHalfPixels);                                           \
+  MemRandomize(temp_v, kHalfPixels);                                           \
+  MemRandomize(dst_pixels_opt, kPixels * 4);                                   \
+  MemRandomize(dst_pixels_c, kPixels * 4);                                     \
+  /* HS rows are filled per step; p[HN] is the matching pixel HN bytes on. */  \
+  /* The test is overall for color conversion matrix being reversible, so */   \
+  /* this initializes the pixel with 2x2 blocks to eliminate subsampling. */   \
+  uint8* p = orig_y;                                                           \
+  for (int y = 0; y < benchmark_height_ - HS1; y += HS) {                      \
+    for (int x = 0; x < benchmark_width_ - 1; x += 2) {                        \
+      uint8 r = static_cast<uint8>(fastrand());                                \
+      p[0] = r;                                                                \
+      p[1] = r;                                                                \
+      p[HN] = r;                                                               \
+      p[HN + 1] = r;                                                           \
+      p += 2;                                                                  \
+    }                                                                          \
+    if (benchmark_width_ & 1) {                                                \
+      uint8 r = static_cast<uint8>(fastrand());                                \
+      p[0] = r;                                                                \
+      p[HN] = r;                                                               \
+      p += 1;                                                                  \
+    }                                                                          \
+    p += HN;                                                                   \
+  }                                                                            \
+  if ((benchmark_height_ & 1) && HS == 2) {                                    \
+    for (int x = 0; x < benchmark_width_ - 1; x += 2) {                        \
+      uint8 r = static_cast<uint8>(fastrand());                                \
+      p[0] = r;                                                                \
+      p[1] = r;                                                                \
+      p += 2;                                                                  \
+    }                                                                          \
+    if (benchmark_width_ & 1) {                                                \
+      uint8 r = static_cast<uint8>(fastrand());                                \
+      p[0] = r;                                                                \
+      p += 1;                                                                  \
+    }                                                                          \
+  }                                                                            \
+  /* Start with YUV converted to ARGB. */                                      \
+  YUVTOARGB(orig_y, benchmark_width_,                                          \
+            orig_u, (benchmark_width_ + 1) / 2,                                \
+            orig_v, (benchmark_width_ + 1) / 2,                                \
+            orig_pixels, benchmark_width_ * 4,                                 \
+            benchmark_width_, benchmark_height_);                              \
+  /* Convert that ARGB image back into the temp_* YUV planes. */               \
+  ARGBTOYUV(orig_pixels, benchmark_width_ * 4,                                 \
+            temp_y, benchmark_width_,                                          \
+            temp_u, (benchmark_width_ + 1) / 2,                                \
+            temp_v, (benchmark_width_ + 1) / 2,                                \
+            benchmark_width_, benchmark_height_);                              \
+  /* Pass that fills dst_pixels_c, run under disable_cpu_flags_. */            \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  YUVTOARGB(temp_y, benchmark_width_,                                          \
+            temp_u, (benchmark_width_ + 1) / 2,                                \
+            temp_v, (benchmark_width_ + 1) / 2,                                \
+            dst_pixels_c, benchmark_width_ * 4,                                \
+            benchmark_width_, benchmark_height_);                              \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  /* Repeated pass that fills dst_pixels_opt under benchmark_cpu_info_. */     \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    YUVTOARGB(temp_y, benchmark_width_,                                        \
+              temp_u, (benchmark_width_ + 1) / 2,                              \
+              temp_v, (benchmark_width_ + 1) / 2,                              \
+              dst_pixels_opt, benchmark_width_ * 4,                            \
+              benchmark_width_, benchmark_height_);                            \
+  }                                                                            \
+  /* Test C and SIMD match. */                                                 \
+  for (int i = 0; i < kPixels * 4; ++i) {                                      \
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                             \
+  }                                                                            \
+  /* Test SIMD is close to original. */                                        \
+  for (int i = 0; i < kPixels * 4; ++i) {                                      \
+    EXPECT_NEAR(static_cast<int>(orig_pixels[i]),                              \
+                static_cast<int>(dst_pixels_opt[i]), DIFF);                    \
+  }                                                                            \
+                                                                               \
+  free_aligned_buffer_64(orig_pixels);                                         \
+  free_aligned_buffer_64(orig_y);                                              \
+  free_aligned_buffer_64(orig_u);                                              \
+  free_aligned_buffer_64(orig_v);                                              \
+  free_aligned_buffer_64(temp_y);                                              \
+  free_aligned_buffer_64(temp_u);                                              \
+  free_aligned_buffer_64(temp_v);                                              \
+  free_aligned_buffer_64(dst_pixels_opt);                                      \
+  free_aligned_buffer_64(dst_pixels_c);                                        \
+}                                                                              \
+
+TESTCS(TestI420, I420ToARGB, ARGBToI420, 1, 2, benchmark_width_, ERROR_FULL)  // Chroma rows: (h + 1) / 2.
+TESTCS(TestI422, I422ToARGB, ARGBToI422, 0, 1, 0, ERROR_FULL)  // Chroma rows: h.
+TESTCS(TestJ420, J420ToARGB, ARGBToJ420, 1, 2, benchmark_width_, ERROR_J420)  // Chroma rows: (h + 1) / 2.
+TESTCS(TestJ422, J422ToARGB, ARGBToJ422, 0, 1, 0, 3)  // Chroma rows: h; tolerance 3.
+
+static void YUVToRGB(int y, int u, int v, int* r, int* g, int* b) {
+  // Build a 16x1 I422 image in which every sample is the requested colour.
+  const int kWidth = 16;
+  const int kHeight = 1;
+  const int kChromaStride = (kWidth + 1) / 2;
+  const int kLumaBytes = kWidth * kHeight;
+  const int kChromaBytes = kChromaStride * ((kHeight + 1) / 2);
+  SIMD_ALIGNED(uint8 luma[16]);
+  SIMD_ALIGNED(uint8 chroma_u[8]);
+  SIMD_ALIGNED(uint8 chroma_v[8]);
+  SIMD_ALIGNED(uint8 argb[16 * 4]);
+  memset(luma, y, kLumaBytes);
+  memset(chroma_u, u, kChromaBytes);
+  memset(chroma_v, v, kChromaBytes);
+
+  I422ToARGB(luma, kWidth,
+             chroma_u, kChromaStride,
+             chroma_v, kChromaStride,
+             argb, kWidth * 4,
+             kWidth, kHeight);
+  // Hand back the first pixel, whose bytes are read as B, G, R.
+  *b = argb[0];
+  *g = argb[1];
+  *r = argb[2];
+}
+
+static void YUVJToRGB(int y, int u, int v, int* r, int* g, int* b) {
+  // Build a 16x1 J422 image in which every sample is the requested colour.
+  const int kWidth = 16;
+  const int kHeight = 1;
+  const int kChromaStride = (kWidth + 1) / 2;
+  const int kLumaBytes = kWidth * kHeight;
+  const int kChromaBytes = kChromaStride * ((kHeight + 1) / 2);
+  SIMD_ALIGNED(uint8 luma[16]);
+  SIMD_ALIGNED(uint8 chroma_u[8]);
+  SIMD_ALIGNED(uint8 chroma_v[8]);
+  SIMD_ALIGNED(uint8 argb[16 * 4]);
+  memset(luma, y, kLumaBytes);
+  memset(chroma_u, u, kChromaBytes);
+  memset(chroma_v, v, kChromaBytes);
+
+  J422ToARGB(luma, kWidth,
+             chroma_u, kChromaStride,
+             chroma_v, kChromaStride,
+             argb, kWidth * 4,
+             kWidth, kHeight);
+  // Hand back the first pixel, whose bytes are read as B, G, R.
+  *b = argb[0];
+  *g = argb[1];
+  *r = argb[2];
+}
+
+static void YToRGB(int y, int* r, int* g, int* b) {
+  // Build a 16x1 grey (I400) image in which every sample equals y.
+  const int kWidth = 16;
+  const int kHeight = 1;
+  const int kLumaBytes = kWidth * kHeight;
+  SIMD_ALIGNED(uint8 luma[16]);
+  SIMD_ALIGNED(uint8 argb[16 * 4]);
+  memset(luma, y, kLumaBytes);
+
+  I400ToARGB(luma, kWidth, argb, kWidth * 4, kWidth, kHeight);
+
+  // Hand back the first pixel, whose bytes are read as B, G, R.
+  *b = argb[0];
+  *g = argb[1];
+  *r = argb[2];
+}
+
+static void YJToRGB(int y, int* r, int* g, int* b) {
+  // Build a 16x1 grey (J400) image in which every sample equals y.
+  const int kWidth = 16;
+  const int kHeight = 1;
+  const int kLumaBytes = kWidth * kHeight;
+  SIMD_ALIGNED(uint8 luma[16]);
+  SIMD_ALIGNED(uint8 argb[16 * 4]);
+  memset(luma, y, kLumaBytes);
+
+  J400ToARGB(luma, kWidth, argb, kWidth * 4, kWidth, kHeight);
+
+  // Hand back the first pixel, whose bytes are read as B, G, R.
+  *b = argb[0];
+  *g = argb[1];
+  *r = argb[2];
+}
+
+// Pick a method for clamping.
+//  #define CLAMPMETHOD_IF 1
+//  #define CLAMPMETHOD_TABLE 1
+#define CLAMPMETHOD_TERNARY 1
+//  #define CLAMPMETHOD_MASK 1
+
+// Pick a method for rounding. Active: add 0.5f, then truncate toward zero.
+#define ROUND(f) static_cast<int>((f) + 0.5f)
+//  #define ROUND(f) lrintf(f)
+//  #define ROUND(f) static_cast<int>(round(f))
+//  #define ROUND(f) _mm_cvt_ss2si(_mm_load_ss(&f))
+
+#if defined(CLAMPMETHOD_IF)
+static int RoundToByte(float f) {  // Round, then clamp with two if tests.
+  int i =  ROUND(f);
+  if (i < 0) {  // Below the byte range: pin to 0.
+    i = 0;
+  }
+  if (i > 255) {  // Above the byte range: pin to 255.
+    i = 255;
+  }
+  return i;
+}
+#elif defined(CLAMPMETHOD_TABLE)
+static const unsigned char clamptable[811] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+  29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+  67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
+  86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+  104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
+  119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
+  134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
+  149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163,
+  164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178,
+  179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193,
+  194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
+  209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+  224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
+  239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,
+  254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+};
+
+static int RoundToByte(float f) {  // Clamp via lookup; index 276 holds 0.
+  return clamptable[ROUND(f) + 276];  // NOTE(review): no bounds check; index must stay within the 811 entries.
+}
+#elif defined(CLAMPMETHOD_TERNARY)
+static int RoundToByte(float f) {  // Round, then clamp into [0, 255].
+  const int rounded = ROUND(f);
+  return (rounded > 255) ? 255 : ((rounded < 0) ? 0 : rounded);
+}
+#elif defined(CLAMPMETHOD_MASK)
+static int RoundToByte(float f) {  // Clamp with shift/mask arithmetic only.
+  int i = ROUND(f);  // NOTE(review): the shifts below assume >> 31 on a negative int yields all ones - confirm.
+  i =  ((-(i) >> 31) & (i));  // clamp to 0.
+  return (((255 - (i)) >> 31) | (i)) & 255;  // clamp to 255.
+}
+#endif
+// Scrambles s: shift right one bit, XOR with 0xb8 when the low bit was set.
+#define RANDOM256(s) (((s) & 1) ? (((s) >> 1) ^ 0xb8) : ((s) >> 1))
+
+TEST_F(LibYUVColorTest, TestRoundToByte) {
+  const int steps_per_pass = benchmark_width_ * benchmark_height_;
+  int ored_results = 0;  // Bitwise OR of every RoundToByte() result.
+  for (int pass = 0; pass < benchmark_iterations_; ++pass) {
+    float f = (fastrand() & 255) * 3.14f - 260.f;
+    for (int step = 0; step < steps_per_pass; ++step) {
+      ored_results |= RoundToByte(f);
+      f += 0.91f;
+    }
+  }
+  // Every result must fit in a byte, so the OR of them must as well.
+  EXPECT_GE(ored_results, 0);
+  EXPECT_LE(ored_results, 255);
+}
+
+static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+  *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596);  // Red: Y and V.
+  *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813);  // Green: Y, U and V.
+  *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018);  // Blue: Y and U.
+}
+
+static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
+  *r = RoundToByte(y - (v - 128) * -1.40200);  // Red: Y (no offset) and V.
+  *g = RoundToByte(y - (u - 128) * 0.34414 - (v - 128) * 0.71414);  // Green.
+  *b = RoundToByte(y - (u - 128) * -1.77200);  // Blue: Y (no offset) and U.
+}
+
+TEST_F(LibYUVColorTest, TestYUV) {
+  int r0, g0, b0, r1, g1, b1;  // *0: float reference, *1: I422ToARGB result.
+
+  // cyan (less red)
+  YUVToRGBReference(240, 255, 0, &r0, &g0, &b0);
+  EXPECT_EQ(56, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(255, b0);
+
+  YUVToRGB(240, 255, 0, &r1, &g1, &b1);
+  EXPECT_EQ(57, r1);  // One above the reference value of 56.
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(255, b1);
+
+  // green (less red and blue)
+  YUVToRGBReference(240, 0, 0, &r0, &g0, &b0);
+  EXPECT_EQ(56, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(2, b0);
+
+  YUVToRGB(240, 0, 0, &r1, &g1, &b1);
+  EXPECT_EQ(57, r1);  // One above the reference value of 56.
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(5, b1);  // Three above the reference value of 2.
+
+  for (int i = 0; i < 256; ++i) {  // Sweep every Y for three chroma pairs.
+    YUVToRGBReference(i, 128, 128, &r0, &g0, &b0);
+    YUVToRGB(i, 128, 128, &r1, &g1, &b1);
+    EXPECT_NEAR(r0, r1, ERROR_R);
+    EXPECT_NEAR(g0, g1, ERROR_G);
+    EXPECT_NEAR(b0, b1, ERROR_B);
+
+    YUVToRGBReference(i, 0, 0, &r0, &g0, &b0);
+    YUVToRGB(i, 0, 0, &r1, &g1, &b1);
+    EXPECT_NEAR(r0, r1, ERROR_R);
+    EXPECT_NEAR(g0, g1, ERROR_G);
+    EXPECT_NEAR(b0, b1, ERROR_B);
+
+    YUVToRGBReference(i, 0, 255, &r0, &g0, &b0);
+    YUVToRGB(i, 0, 255, &r1, &g1, &b1);
+    EXPECT_NEAR(r0, r1, ERROR_R);
+    EXPECT_NEAR(g0, g1, ERROR_G);
+    EXPECT_NEAR(b0, b1, ERROR_B);
+  }
+}
+
+TEST_F(LibYUVColorTest, TestGreyYUV) {
+  int r0, g0, b0, r1, g1, b1, r2, g2, b2;  // 0: reference, 1: I422, 2: I400.
+
+  // black
+  YUVToRGBReference(16, 128, 128, &r0, &g0, &b0);
+  EXPECT_EQ(0, r0);
+  EXPECT_EQ(0, g0);
+  EXPECT_EQ(0, b0);
+
+  YUVToRGB(16, 128, 128, &r1, &g1, &b1);
+  EXPECT_EQ(0, r1);
+  EXPECT_EQ(0, g1);
+  EXPECT_EQ(0, b1);
+
+  // white
+  YUVToRGBReference(240, 128, 128, &r0, &g0, &b0);
+  EXPECT_EQ(255, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(255, b0);
+
+  YUVToRGB(240, 128, 128, &r1, &g1, &b1);
+  EXPECT_EQ(255, r1);
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(255, b1);
+
+  // grey
+  YUVToRGBReference(128, 128, 128, &r0, &g0, &b0);
+  EXPECT_EQ(130, r0);
+  EXPECT_EQ(130, g0);
+  EXPECT_EQ(130, b0);
+
+  YUVToRGB(128, 128, 128, &r1, &g1, &b1);
+  EXPECT_EQ(130, r1);
+  EXPECT_EQ(130, g1);
+  EXPECT_EQ(130, b1);
+
+  // Full grey ramp: reference, I422 and I400 results must match exactly.
+  for (int y = 0; y < 256; ++y) {
+    YUVToRGBReference(y, 128, 128, &r0, &g0, &b0);
+    YUVToRGB(y, 128, 128, &r1, &g1, &b1);
+    YToRGB(y, &r2, &g2, &b2);
+    EXPECT_EQ(r0, r1);
+    EXPECT_EQ(g0, g1);
+    EXPECT_EQ(b0, b1);
+    EXPECT_EQ(r0, r2);
+    EXPECT_EQ(g0, g2);
+    EXPECT_EQ(b0, b2);
+  }
+}
+
+static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
+  // Tab-separated table on stdout. A column is emitted only for bins that
+  // are non-zero in at least one channel, so all four rows stay aligned.
+  const int* const channel_counts[3] = { rh, gh, bh };
+  const char* const channel_labels[3] = { "\nred", "\ngreen", "\nblue" };
+
+  // Header row: the signed value each printed bin stands for (index - 128).
+  printf("hist");
+  for (int bin = 0; bin < 256; ++bin) {
+    const bool bin_used = rh[bin] || gh[bin] || bh[bin];
+    if (bin_used) {
+      printf("\t%8d", bin - 128);
+    }
+  }
+  // One row of counts per channel, in red, green, blue order.
+  for (int channel = 0; channel < 3; ++channel) {
+    printf("%s", channel_labels[channel]);
+    for (int bin = 0; bin < 256; ++bin) {
+      const bool bin_used = rh[bin] || gh[bin] || bh[bin];
+      if (bin_used) {
+        printf("\t%8d", channel_counts[channel][bin]);
+      }
+    }
+  }
+
+  // Terminate the final row.
+  printf("\n");
+}
+
+TEST_F(LibYUVColorTest, TestFullYUV) {
+  int r_hist[256] = { 0, }, g_hist[256] = { 0, }, b_hist[256] = { 0, };
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
+      for (int step = 0; step < 256; ++step) {
+        const int y = RANDOM256(step);  // Visit Y values in scrambled order.
+        int ref_r, ref_g, ref_b, out_r, out_g, out_b;
+        YUVToRGBReference(y, u, v, &ref_r, &ref_g, &ref_b);
+        YUVToRGB(y, u, v, &out_r, &out_g, &out_b);
+        EXPECT_NEAR(ref_r, out_r, ERROR_R);
+        EXPECT_NEAR(ref_g, out_g, ERROR_G);
+        EXPECT_NEAR(ref_b, out_b, ERROR_B);
+        ++r_hist[out_r - ref_r + 128];  // Bin index = signed error + 128.
+        ++g_hist[out_g - ref_g + 128];
+        ++b_hist[out_b - ref_b + 128];
+      }
+    }
+  }
+  PrintHistogram(r_hist, g_hist, b_hist);  // Dump the error distribution.
+}
+
+TEST_F(LibYUVColorTest, TestFullYUVJ) {
+  int r_hist[256] = { 0, }, g_hist[256] = { 0, }, b_hist[256] = { 0, };
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
+      for (int step = 0; step < 256; ++step) {
+        const int y = RANDOM256(step);  // Visit Y values in scrambled order.
+        int ref_r, ref_g, ref_b, out_r, out_g, out_b;
+        YUVJToRGBReference(y, u, v, &ref_r, &ref_g, &ref_b);
+        YUVJToRGB(y, u, v, &out_r, &out_g, &out_b);
+        EXPECT_NEAR(ref_r, out_r, 1);
+        EXPECT_NEAR(ref_g, out_g, 1);
+        EXPECT_NEAR(ref_b, out_b, 1);
+        ++r_hist[out_r - ref_r + 128];  // Bin index = signed error + 128.
+        ++g_hist[out_g - ref_g + 128];
+        ++b_hist[out_b - ref_b + 128];
+      }
+    }
+  }
+  PrintHistogram(r_hist, g_hist, b_hist);  // Dump the error distribution.
+}
+
+TEST_F(LibYUVColorTest, TestGreyYUVJ) {
+  int r0, g0, b0, r1, g1, b1, r2, g2, b2;  // 0: reference, 1: J422, 2: J400.
+
+  // black
+  YUVJToRGBReference(0, 128, 128, &r0, &g0, &b0);
+  EXPECT_EQ(0, r0);
+  EXPECT_EQ(0, g0);
+  EXPECT_EQ(0, b0);
+
+  YUVJToRGB(0, 128, 128, &r1, &g1, &b1);
+  EXPECT_EQ(0, r1);
+  EXPECT_EQ(0, g1);
+  EXPECT_EQ(0, b1);
+
+  // white
+  YUVJToRGBReference(255, 128, 128, &r0, &g0, &b0);
+  EXPECT_EQ(255, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(255, b0);
+
+  YUVJToRGB(255, 128, 128, &r1, &g1, &b1);
+  EXPECT_EQ(255, r1);
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(255, b1);
+
+  // grey
+  YUVJToRGBReference(128, 128, 128, &r0, &g0, &b0);
+  EXPECT_EQ(128, r0);
+  EXPECT_EQ(128, g0);
+  EXPECT_EQ(128, b0);
+
+  YUVJToRGB(128, 128, 128, &r1, &g1, &b1);
+  EXPECT_EQ(128, r1);
+  EXPECT_EQ(128, g1);
+  EXPECT_EQ(128, b1);
+
+  for (int y = 0; y < 256; ++y) {  // Full grey ramp: all three must match.
+    YUVJToRGBReference(y, 128, 128, &r0, &g0, &b0);
+    YUVJToRGB(y, 128, 128, &r1, &g1, &b1);
+    YJToRGB(y, &r2, &g2, &b2);
+    EXPECT_EQ(r0, r1);
+    EXPECT_EQ(g0, g1);
+    EXPECT_EQ(b0, b1);
+    EXPECT_EQ(r0, r2);
+    EXPECT_EQ(g0, g2);
+    EXPECT_EQ(b0, b2);
+  }
+}
+
+}  // namespace libyuv
diff --git a/libs/libyuv/unit_test/compare_test.cc b/libs/libyuv/unit_test/compare_test.cc
new file mode 100644
index 0000000000..572a0a0aab
--- /dev/null
+++ b/libs/libyuv/unit_test/compare_test.cc
@@ -0,0 +1,514 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+namespace libyuv {
+
+// Byte-at-a-time Djb2 hash used by the benchmarks to cross-check HashDjb2.
+// A seed of 5381 is recommended.
+static uint32 ReferenceHashDjb2(const uint8* src, uint64 count, uint32 seed) {
+  uint32 hash = seed;
+  // For each of the count bytes: hash = hash * 33 + byte.
+  for (uint64 index = 0; index < count; ++index) {
+    hash = hash * 33 + src[index];
+  }
+  return hash;
+}
+
+TEST_F(LibYUVBaseTest, Djb2_Test) {
+  const int kMaxTest = benchmark_width_ * benchmark_height_;
+  align_buffer_64(src_a, kMaxTest);
+  align_buffer_64(src_b, kMaxTest);
+  // Known-answer check; the length 131 covers the text plus its NUL.
+  const char* fox = "The quick brown fox jumps over the lazy dog"
+      " and feels as if he were in the seventh heaven of typography"
+      " together with Hermann Zapf";
+  uint32 foxhash = HashDjb2(reinterpret_cast<const uint8*>(fox), 131, 5381);
+  const uint32 kExpectedFoxHash = 2611006483u;
+  EXPECT_EQ(kExpectedFoxHash, foxhash);
+
+  for (int i = 0; i < kMaxTest; ++i) {
+    src_a[i] = (fastrand() & 0xff);
+    src_b[i] = (fastrand() & 0xff);
+  }
+  // Compare different buffers. Expect hash is different.
+  uint32 h1 = HashDjb2(src_a, kMaxTest, 5381);
+  uint32 h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make last half same. Expect hash is different.
+  memcpy(src_a + kMaxTest / 2, src_b + kMaxTest / 2, kMaxTest / 2);
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make first half same. Expect hash is different.
+  memcpy(src_a + kMaxTest / 2, src_a, kMaxTest / 2);  // Keep a's old start.
+  memcpy(src_b + kMaxTest / 2, src_b, kMaxTest / 2);  // Keep b's old start.
+  memcpy(src_a, src_b, kMaxTest / 2);
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make same. Expect hash is same.
+  memcpy(src_a, src_b, kMaxTest);
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_EQ(h1, h2);
+
+  // Make seed different. Expect hash is different.
+  memcpy(src_a, src_b, kMaxTest);
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 1234);
+  EXPECT_NE(h1, h2);
+
+  // Make one byte different in middle. Expect hash is different.
+  memcpy(src_a, src_b, kMaxTest);
+  ++src_b[kMaxTest / 2];
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make first byte different. Expect hash is different.
+  memcpy(src_a, src_b, kMaxTest);
+  ++src_b[0];
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make last byte different. Expect hash is different.
+  memcpy(src_a, src_b, kMaxTest);
+  ++src_b[kMaxTest - 1];
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_b, kMaxTest, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make src_a all zeros. Test different lengths. Expect hash is different.
+  memset(src_a, 0, kMaxTest);
+  h1 = HashDjb2(src_a, kMaxTest, 5381);
+  h2 = HashDjb2(src_a, kMaxTest / 2, 5381);
+  EXPECT_NE(h1, h2);
+
+  // Make src_a all zeros, seed zero. Test different lengths. Expect same hash.
+  memset(src_a, 0, kMaxTest);
+  h1 = HashDjb2(src_a, kMaxTest, 0);
+  h2 = HashDjb2(src_a, kMaxTest / 2, 0);
+  EXPECT_EQ(h1, h2);
+
+  free_aligned_buffer_64(src_a);
+  free_aligned_buffer_64(src_b);
+}
+
+TEST_F(LibYUVBaseTest, BenchmarkDjb2_Opt) {
+  const int kMaxTest = benchmark_width_ * benchmark_height_;
+  align_buffer_64(src_a, kMaxTest);
+  // Deterministic ramp pattern as hash input.
+  for (int i = 0; i < kMaxTest; ++i) {
+    src_a[i] = i;
+  }
+  uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);  // Expected value.
+  uint32 h1 = 0;  // Defined even if the loop below runs zero times.
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    h1 = HashDjb2(src_a, kMaxTest, 5381);
+  }
+  EXPECT_EQ(h1, h2);
+  free_aligned_buffer_64(src_a);
+}
+
+TEST_F(LibYUVBaseTest, BenchmarkDjb2_Unaligned) {
+  const int kMaxTest = benchmark_width_ * benchmark_height_;
+  align_buffer_64(src_a, kMaxTest + 1);  // One extra byte for the +1 offset.
+  for (int i = 0; i < kMaxTest; ++i) {
+    src_a[i + 1] = i;  // Ramp pattern starting one byte into the buffer.
+  }
+  uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381);  // Expected value.
+  uint32 h1 = 0;  // Defined even if the loop below runs zero times.
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
+  }
+  EXPECT_EQ(h1, h2);
+  free_aligned_buffer_64(src_a);
+}
+
+TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Opt) {
+  uint32 fourcc;
+  const int kMaxTest = benchmark_width_ * benchmark_height_ * 4;
+  align_buffer_64(src_a, kMaxTest);
+  for (int i = 0; i < kMaxTest; ++i) {
+    src_a[i] = 255;
+  }
+  // A zero in byte 0 of the first pixel is expected to be reported as BGRA.
+  src_a[0] = 0;
+  fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
+                      benchmark_width_, benchmark_height_);
+  EXPECT_EQ(libyuv::FOURCC_BGRA, fourcc);
+  src_a[0] = 255;
+  src_a[3] = 0;  // A zero in byte 3 is expected to be reported as ARGB.
+  fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
+                      benchmark_width_, benchmark_height_);
+  EXPECT_EQ(libyuv::FOURCC_ARGB, fourcc);
+  src_a[3] = 255;
+  // Timed loop over the all-255 image, for which 0 is expected.
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    fourcc = ARGBDetect(src_a, benchmark_width_ * 4,
+                        benchmark_width_, benchmark_height_);
+  }
+  EXPECT_EQ(0u, fourcc);  // Unsigned literal: fourcc is uint32.
+
+  free_aligned_buffer_64(src_a);
+}
+
+TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Unaligned) {
+  uint32 fourcc;
+  const int kMaxTest = benchmark_width_ * benchmark_height_ * 4 + 1;
+  align_buffer_64(src_a, kMaxTest);
+  for (int i = 1; i < kMaxTest; ++i) {  // Skip pad byte 0; stay in bounds.
+    src_a[i] = 255;
+  }
+  // The image starts at src_a + 1. A zero in its byte 0 should give BGRA.
+  src_a[0 + 1] = 0;
+  fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
+                      benchmark_width_, benchmark_height_);
+  EXPECT_EQ(libyuv::FOURCC_BGRA, fourcc);
+  src_a[0 + 1] = 255;
+  src_a[3 + 1] = 0;  // A zero in image byte 3 should give ARGB.
+  fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
+                      benchmark_width_, benchmark_height_);
+  EXPECT_EQ(libyuv::FOURCC_ARGB, fourcc);
+  src_a[3 + 1] = 255;
+  // Timed loop over the all-255 image, for which 0 is expected.
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4,
+                        benchmark_width_, benchmark_height_);
+  }
+  EXPECT_EQ(0u, fourcc);  // Unsigned literal: fourcc is uint32.
+
+  free_aligned_buffer_64(src_a);
+}
+TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) {
+  // Spot-checks ComputeSumSquareError, then calls it repeatedly on zeros.
+  const int kBlockBytes = 4096 * 3;
+  align_buffer_64(lhs, kBlockBytes);
+  align_buffer_64(rhs, kBlockBytes);
+  memset(lhs, 0, kBlockBytes);
+  memset(rhs, 0, kBlockBytes);
+  memcpy(lhs, "test0123test4567", 16);
+  memcpy(rhs, "tick0123tock4567", 16);
+  uint64 sse = ComputeSumSquareError(lhs, rhs, 16);
+  EXPECT_EQ(790u, sse);
+  for (int pos = 0; pos < kBlockBytes; ++pos) {
+    lhs[pos] = pos;
+    rhs[pos] = pos;
+  }
+  memset(lhs, 0, kBlockBytes);  // Overwrites the values stored just above.
+  memset(rhs, 0, kBlockBytes);
+
+  // Calls per iteration: one benchmark frame's worth of blocks, rounded up.
+  const int kBlocksPerFrame =
+      (benchmark_width_ * benchmark_height_ + kBlockBytes - 1) / kBlockBytes;
+  const int kCalls = benchmark_iterations_ * kBlocksPerFrame;
+  for (int call = 0; call < kCalls; ++call) {
+    sse = ComputeSumSquareError(lhs, rhs, kBlockBytes);
+  }
+  EXPECT_EQ(0, sse);
+
+  free_aligned_buffer_64(lhs);
+  free_aligned_buffer_64(rhs);
+}
+
+TEST_F(LibYUVBaseTest, SumSquareError) {
+  // Checks ComputeSumSquareError on constant-filled buffers with known totals,
+  // then requires equal results on random data under both MaskCpuFlags
+  // settings.
+  const int kCount = 4096 * 3;
+  align_buffer_64(lhs, kCount);
+  align_buffer_64(rhs, kCount);
+
+  // 0 against 0: no error.
+  memset(lhs, 0, kCount);
+  memset(rhs, 0, kCount);
+  uint64 sse = ComputeSumSquareError(lhs, rhs, kCount);
+  EXPECT_EQ(0, sse);
+
+  // 1 against 0: each byte contributes 1.
+  memset(lhs, 1, kCount);
+  sse = ComputeSumSquareError(lhs, rhs, kCount);
+  EXPECT_EQ(sse, kCount);
+
+  // 190 against 193: each byte contributes 3 * 3.
+  memset(lhs, 190, kCount);
+  memset(rhs, 193, kCount);
+  sse = ComputeSumSquareError(lhs, rhs, kCount);
+  EXPECT_EQ(kCount * 3 * 3, sse);
+
+  for (int pos = 0; pos < kCount; ++pos) {
+    lhs[pos] = (fastrand() & 0xff);
+    rhs[pos] = (fastrand() & 0xff);
+  }
+
+  MaskCpuFlags(disable_cpu_flags_);
+  const uint64 c_sse = ComputeSumSquareError(lhs, rhs, kCount);
+  MaskCpuFlags(benchmark_cpu_info_);
+  const uint64 opt_sse = ComputeSumSquareError(lhs, rhs, kCount);
+  EXPECT_EQ(c_sse, opt_sse);
+
+  free_aligned_buffer_64(lhs);
+  free_aligned_buffer_64(rhs);
+}
+
+TEST_F(LibYUVBaseTest, BenchmarkPsnr_Opt) {
+  // Times CalcFramePsnr on two identically filled planes and prints the
+  // elapsed time per iteration, scaled by 1e6.
+  const int kPlaneBytes = benchmark_width_ * benchmark_height_;
+  align_buffer_64(plane_a, kPlaneBytes);
+  align_buffer_64(plane_b, kPlaneBytes);
+  for (int pos = 0; pos < kPlaneBytes; ++pos) {
+    plane_a[pos] = pos;
+    plane_b[pos] = pos;
+  }
+
+  MaskCpuFlags(benchmark_cpu_info_);
+  const double start_time = get_time();
+  for (int iter = 0; iter < benchmark_iterations_; ++iter) {
+    CalcFramePsnr(plane_a, benchmark_width_, plane_b, benchmark_width_,
+                  benchmark_width_, benchmark_height_);
+  }
+  const double opt_time = (get_time() - start_time) / benchmark_iterations_;
+  printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
+
+  EXPECT_EQ(0, 0);  // Nothing is verified beyond running to completion.
+  free_aligned_buffer_64(plane_a);
+  free_aligned_buffer_64(plane_b);
+}
+
+TEST_F(LibYUVBaseTest, BenchmarkPsnr_Unaligned) {
+  // Times CalcFramePsnr with plane A starting one byte into its buffer.
+  align_buffer_64(src_a, benchmark_width_ * benchmark_height_ + 1);
+  align_buffer_64(src_b, benchmark_width_ * benchmark_height_);
+  for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+    src_a[i + 1] = i;
+    src_b[i] = i;
+  }
+
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  double opt_time = get_time();
+  for (int i = 0; i < benchmark_iterations_; ++i)
+    CalcFramePsnr(src_a + 1, benchmark_width_,
+                  src_b, benchmark_width_,
+                  benchmark_width_, benchmark_height_);
+
+  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+  printf("BenchmarkPsnr_Unaligned - %8.2f us opt\n", opt_time * 1e6);
+  EXPECT_EQ(0, 0);  // Nothing is verified beyond running to completion.
+
+  free_aligned_buffer_64(src_a);
+  free_aligned_buffer_64(src_b);
+}
+
+TEST_F(LibYUVBaseTest, Psnr) {
+  // Runs CalcFramePsnr on a benchmark-sized window that sits kBorder bytes in
+  // from every edge of two larger planes, using several fill patterns.
+  const int kWidth = benchmark_width_;
+  const int kHeight = benchmark_height_;
+  const int kBorder = 128;
+  const int kStride = kWidth + 2 * kBorder;
+  const int kPlaneBytes = kStride * (kHeight + 2 * kBorder);
+  // Offset of the window's first byte within each plane.
+  const int kOrigin = kStride * kBorder + kBorder;
+  align_buffer_64(img_a, kPlaneBytes);
+  align_buffer_64(img_b, kPlaneBytes);
+  double psnr;
+
+  // 0 against 0: the result must equal kMaxPsnr.
+  memset(img_a, 0, kPlaneBytes);
+  memset(img_b, 0, kPlaneBytes);
+  psnr = CalcFramePsnr(img_a + kOrigin, kStride, img_b + kOrigin, kStride,
+                       kWidth, kHeight);
+
+  EXPECT_EQ(psnr, kMaxPsnr);
+
+  // 255 against 0: the result must be exactly 0.
+  memset(img_a, 255, kPlaneBytes);
+  psnr = CalcFramePsnr(img_a + kOrigin, kStride, img_b + kOrigin, kStride,
+                       kWidth, kHeight);
+
+  EXPECT_EQ(psnr, 0.0);
+
+  // 1 against 0: the result must lie strictly between 48 and 49.
+  memset(img_a, 1, kPlaneBytes);
+  psnr = CalcFramePsnr(img_a + kOrigin, kStride, img_b + kOrigin, kStride,
+                       kWidth, kHeight);
+
+  EXPECT_GT(psnr, 48.0);
+  EXPECT_LT(psnr, 49.0);
+
+  // Index values in A against 0: above 2, and below 6 from 256 pixels up.
+  for (int pos = 0; pos < kPlaneBytes; ++pos) {
+    img_a[pos] = pos;
+  }
+  psnr = CalcFramePsnr(img_a + kOrigin, kStride, img_b + kOrigin, kStride,
+                       kWidth, kHeight);
+
+  EXPECT_GT(psnr, 2.0);
+  if (kWidth * kHeight >= 256) {
+    EXPECT_LT(psnr, 6.0);
+  }
+
+  // Random window over zeroed planes: the results under the two MaskCpuFlags
+  // settings must be identical.
+  memset(img_a, 0, kPlaneBytes);
+  memset(img_b, 0, kPlaneBytes);
+  for (int row = 0; row < kHeight; ++row) {
+    for (int col = 0; col < kWidth; ++col) {
+      const int at = kOrigin + row * kStride + col;
+      img_a[at] = (fastrand() & 0xff);
+      img_b[at] = (fastrand() & 0xff);
+    }
+  }
+
+  MaskCpuFlags(disable_cpu_flags_);
+  const double c_psnr = CalcFramePsnr(img_a + kOrigin, kStride,
+                                      img_b + kOrigin, kStride,
+                                      kWidth, kHeight);
+
+  MaskCpuFlags(benchmark_cpu_info_);
+  const double opt_psnr = CalcFramePsnr(img_a + kOrigin, kStride,
+                                        img_b + kOrigin, kStride,
+                                        kWidth, kHeight);
+
+  EXPECT_EQ(opt_psnr, c_psnr);
+
+  free_aligned_buffer_64(img_a);
+  free_aligned_buffer_64(img_b);
+}
+
+TEST_F(LibYUVBaseTest, DISABLED_BenchmarkSsim_Opt) {
+  // Times CalcFrameSsim on two identically filled planes and prints the
+  // elapsed time per iteration, scaled by 1e6.
+  const int kPlaneBytes = benchmark_width_ * benchmark_height_;
+  align_buffer_64(plane_a, kPlaneBytes);
+  align_buffer_64(plane_b, kPlaneBytes);
+  for (int pos = 0; pos < kPlaneBytes; ++pos) {
+    plane_a[pos] = pos;
+    plane_b[pos] = pos;
+  }
+
+  MaskCpuFlags(benchmark_cpu_info_);
+  const double start_time = get_time();
+  for (int iter = 0; iter < benchmark_iterations_; ++iter) {
+    CalcFrameSsim(plane_a, benchmark_width_, plane_b, benchmark_width_,
+                  benchmark_width_, benchmark_height_);
+  }
+  const double opt_time = (get_time() - start_time) / benchmark_iterations_;
+  printf("BenchmarkSsim_Opt - %8.2f us opt\n", opt_time * 1e6);
+
+  EXPECT_EQ(0, 0);  // Pass if we get this far.
+  free_aligned_buffer_64(plane_a);
+  free_aligned_buffer_64(plane_b);
+}
+
+TEST_F(LibYUVBaseTest, Ssim) {
+  // Runs CalcFrameSsim on a benchmark-sized window that sits kBorder bytes in
+  // from every edge of two larger planes. Values are asserted only when the
+  // window exceeds 8 in both dimensions; otherwise it only has to run.
+  const int kWidth = benchmark_width_;
+  const int kHeight = benchmark_height_;
+  const int kBorder = 128;
+  const int kStride = kWidth + 2 * kBorder;
+  const int kPlaneBytes = kStride * (kHeight + 2 * kBorder);
+  // Offset of the window's first byte within each plane.
+  const int kOrigin = kStride * kBorder + kBorder;
+  const bool kCheckValues = kWidth > 8 && kHeight > 8;
+
+  align_buffer_64(img_a, kPlaneBytes);
+  align_buffer_64(img_b, kPlaneBytes);
+  double ssim;
+
+  if (!kCheckValues) {
+    printf("warning - Ssim size too small.  Testing function executes.\n");
+  }
+
+  // 0 against 0: exactly 1.
+  memset(img_a, 0, kPlaneBytes);
+  memset(img_b, 0, kPlaneBytes);
+  ssim = CalcFrameSsim(img_a + kOrigin, kStride, img_b + kOrigin, kStride,
+                       kWidth, kHeight);
+  if (kCheckValues) {
+    EXPECT_EQ(ssim, 1.0);
+  }
+
+  // 255 against 0: below 0.0001.
+  memset(img_a, 255, kPlaneBytes);
+  ssim = CalcFrameSsim(img_a + kOrigin, kStride, img_b + kOrigin, kStride,
+                       kWidth, kHeight);
+  if (kCheckValues) {
+    EXPECT_LT(ssim, 0.0001);
+  }
+
+  // 1 against 0: between 0.0001 and 0.9.
+  memset(img_a, 1, kPlaneBytes);
+  ssim = CalcFrameSsim(img_a + kOrigin, kStride, img_b + kOrigin, kStride,
+                       kWidth, kHeight);
+  if (kCheckValues) {
+    EXPECT_GT(ssim, 0.0001);
+    EXPECT_LT(ssim, 0.9);
+  }
+
+  // Index values in A against 0: between 0 and 0.01.
+  for (int pos = 0; pos < kPlaneBytes; ++pos) {
+    img_a[pos] = pos;
+  }
+  ssim = CalcFrameSsim(img_a + kOrigin, kStride, img_b + kOrigin, kStride,
+                       kWidth, kHeight);
+  if (kCheckValues) {
+    EXPECT_GT(ssim, 0.0);
+    EXPECT_LT(ssim, 0.01);
+  }
+
+  // Random bytes inside the window (the border keeps its earlier contents);
+  // the results under the two MaskCpuFlags settings must be identical.
+  for (int row = 0; row < kHeight; ++row) {
+    for (int col = 0; col < kWidth; ++col) {
+      const int at = kOrigin + row * kStride + col;
+      img_a[at] = (fastrand() & 0xff);
+      img_b[at] = (fastrand() & 0xff);
+    }
+  }
+
+  MaskCpuFlags(disable_cpu_flags_);
+  const double c_ssim = CalcFrameSsim(img_a + kOrigin, kStride,
+                                      img_b + kOrigin, kStride,
+                                      kWidth, kHeight);
+
+  MaskCpuFlags(benchmark_cpu_info_);
+  const double opt_ssim = CalcFrameSsim(img_a + kOrigin, kStride,
+                                        img_b + kOrigin, kStride,
+                                        kWidth, kHeight);
+
+  if (kCheckValues) {
+    EXPECT_EQ(opt_ssim, c_ssim);
+  }
+
+  free_aligned_buffer_64(img_a);
+  free_aligned_buffer_64(img_b);
+}
+
+}  // namespace libyuv
diff --git a/libs/libyuv/unit_test/convert_test.cc b/libs/libyuv/unit_test/convert_test.cc
new file mode 100644
index 0000000000..c4d264a48e
--- /dev/null
+++ b/libs/libyuv/unit_test/convert_test.cc
@@ -0,0 +1,1859 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+#define SUBSAMPLE(v, a) (((v) + (a) - 1) / (a))  // v / a rounded up (v, a > 0).
+
+#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,           \
+                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF)   \
+TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) {                 \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;  /* Never less than 1. */    \
+  const int kHeight = benchmark_height_;                                       \
+  align_buffer_64(src_y, kWidth * kHeight + OFF);                              \
+  align_buffer_64(src_u,                                                       \
+                  SUBSAMPLE(kWidth, SRC_SUBSAMP_X) *                           \
+                  SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF);                    \
+  align_buffer_64(src_v,                                                       \
+                  SUBSAMPLE(kWidth, SRC_SUBSAMP_X) *                           \
+                  SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF);                    \
+  align_buffer_64(dst_y_c, kWidth * kHeight);                                  \
+  align_buffer_64(dst_u_c,                                                     \
+                  SUBSAMPLE(kWidth, SUBSAMP_X) *                               \
+                  SUBSAMPLE(kHeight, SUBSAMP_Y));                              \
+  align_buffer_64(dst_v_c,                                                     \
+                  SUBSAMPLE(kWidth, SUBSAMP_X) *                               \
+                  SUBSAMPLE(kHeight, SUBSAMP_Y));                              \
+  align_buffer_64(dst_y_opt, kWidth * kHeight);                                \
+  align_buffer_64(dst_u_opt,                                                   \
+                  SUBSAMPLE(kWidth, SUBSAMP_X) *                               \
+                  SUBSAMPLE(kHeight, SUBSAMP_Y));                              \
+  align_buffer_64(dst_v_opt,                                                   \
+                  SUBSAMPLE(kWidth, SUBSAMP_X) *                               \
+                  SUBSAMPLE(kHeight, SUBSAMP_Y));                              \
+  for (int i = 0; i < kHeight; ++i)  /* Random source pixels. */               \
+    for (int j = 0; j < kWidth; ++j)                                           \
+      src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                       \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) {                \
+    for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) {               \
+      src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] =                \
+          (fastrand() & 0xff);                                                 \
+      src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] =                \
+          (fastrand() & 0xff);                                                 \
+    }                                                                          \
+  }                                                                            \
+  memset(dst_y_c, 1, kWidth * kHeight);  /* Different fill per buffer. */      \
+  memset(dst_u_c, 2, SUBSAMPLE(kWidth, SUBSAMP_X) *                            \
+                     SUBSAMPLE(kHeight, SUBSAMP_Y));                           \
+  memset(dst_v_c, 3, SUBSAMPLE(kWidth, SUBSAMP_X) *                            \
+                     SUBSAMPLE(kHeight, SUBSAMP_Y));                           \
+  memset(dst_y_opt, 101, kWidth * kHeight);                                    \
+  memset(dst_u_opt, 102, SUBSAMPLE(kWidth, SUBSAMP_X) *                        \
+                         SUBSAMPLE(kHeight, SUBSAMP_Y));                       \
+  memset(dst_v_opt, 103, SUBSAMPLE(kWidth, SUBSAMP_X) *                        \
+                         SUBSAMPLE(kHeight, SUBSAMP_Y));                       \
+  MaskCpuFlags(disable_cpu_flags_);  /* Pass 1: results go to dst_*_c. */      \
+  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                          \
+                                 src_u + OFF,                                  \
+                                 SUBSAMPLE(kWidth, SRC_SUBSAMP_X),             \
+                                 src_v + OFF,                                  \
+                                 SUBSAMPLE(kWidth, SRC_SUBSAMP_X),             \
+                                 dst_y_c, kWidth,                              \
+                                 dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X),        \
+                                 dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X),        \
+                                 kWidth, NEG kHeight);                         \
+  MaskCpuFlags(benchmark_cpu_info_);  /* Pass 2: results go to dst_*_opt. */   \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                        \
+                                   src_u + OFF,                                \
+                                       SUBSAMPLE(kWidth, SRC_SUBSAMP_X),       \
+                                   src_v + OFF,                                \
+                                       SUBSAMPLE(kWidth, SRC_SUBSAMP_X),       \
+                                   dst_y_opt, kWidth,                          \
+                                   dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X),    \
+                                   dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X),    \
+                                   kWidth, NEG kHeight);                       \
+  }                                                                            \
+  int max_diff = 0;  /* Largest |c - opt| byte difference seen. */             \
+  for (int i = 0; i < kHeight; ++i) {                                          \
+    for (int j = 0; j < kWidth; ++j) {                                         \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_y_c[i * kWidth + j]) -                      \
+              static_cast<int>(dst_y_opt[i * kWidth + j]));                    \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_EQ(0, max_diff);  /* Y planes must be identical. */                   \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                    \
+    for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {                   \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_u_c[i *                                     \
+                               SUBSAMPLE(kWidth, SUBSAMP_X) + j]) -            \
+              static_cast<int>(dst_u_opt[i *                                   \
+                               SUBSAMPLE(kWidth, SUBSAMP_X) + j]));            \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 3);  /* U may differ by at most 3. */                    \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                    \
+    for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {                   \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_v_c[i *                                     \
+                               SUBSAMPLE(kWidth, SUBSAMP_X) + j]) -            \
+              static_cast<int>(dst_v_opt[i *                                   \
+                               SUBSAMPLE(kWidth, SUBSAMP_X) + j]));            \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 3);  /* V too; max_diff carries over. */                 \
+  free_aligned_buffer_64(dst_y_c);                                             \
+  free_aligned_buffer_64(dst_u_c);                                             \
+  free_aligned_buffer_64(dst_v_c);                                             \
+  free_aligned_buffer_64(dst_y_opt);                                           \
+  free_aligned_buffer_64(dst_u_opt);                                           \
+  free_aligned_buffer_64(dst_v_opt);                                           \
+  free_aligned_buffer_64(src_y);                                               \
+  free_aligned_buffer_64(src_u);                                               \
+  free_aligned_buffer_64(src_v);                                               \
+}
+
+#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,            \
+                      FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y)                        \
+    TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,               \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                           \
+                   benchmark_width_ - 4, _Any, +, 0)  /* Width minus 4. */     \
+    TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,               \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                           \
+                   benchmark_width_, _Unaligned, +, 1)  /* Sources at +1. */   \
+    TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,               \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                           \
+                   benchmark_width_, _Invert, -, 0)  /* Negated height. */     \
+    TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,               \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                           \
+                   benchmark_width_, _Opt, +, 0)  /* Full width, offset 0. */
+
+TESTPLANARTOP(I420, 2, 2, I420, 2, 2)  // I420ToI420; 4 tests per line.
+TESTPLANARTOP(I422, 2, 1, I420, 2, 2)  // I422ToI420
+TESTPLANARTOP(I444, 1, 1, I420, 2, 2)  // I444ToI420
+TESTPLANARTOP(I411, 4, 1, I420, 2, 2)  // I411ToI420
+TESTPLANARTOP(I420, 2, 2, I422, 2, 1)  // I420ToI422
+TESTPLANARTOP(I420, 2, 2, I444, 1, 1)  // I420ToI444
+TESTPLANARTOP(I420, 2, 2, I411, 4, 1)  // I420ToI411
+TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2)  // I420ToI420Mirror
+TESTPLANARTOP(I422, 2, 1, I422, 2, 1)  // I422ToI422
+TESTPLANARTOP(I444, 1, 1, I444, 1, 1)  // I444ToI444
+
+#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,          \
+                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF)   \
+TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) {                 \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;  /* Never less than 1. */    \
+  const int kHeight = benchmark_height_;                                       \
+  align_buffer_64(src_y, kWidth * kHeight + OFF);                              \
+  align_buffer_64(src_u,                                                       \
+                  SUBSAMPLE(kWidth, SRC_SUBSAMP_X) *                           \
+                  SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF);                    \
+  align_buffer_64(src_v,                                                       \
+                  SUBSAMPLE(kWidth, SRC_SUBSAMP_X) *                           \
+                  SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF);                    \
+  align_buffer_64(dst_y_c, kWidth * kHeight);                                  \
+  align_buffer_64(dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) *                 \
+                  SUBSAMPLE(kHeight, SUBSAMP_Y));  /* 2 bytes per U/V pair. */ \
+  align_buffer_64(dst_y_opt, kWidth * kHeight);                                \
+  align_buffer_64(dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) *               \
+                  SUBSAMPLE(kHeight, SUBSAMP_Y));                              \
+  for (int i = 0; i < kHeight; ++i)  /* Random source pixels. */               \
+    for (int j = 0; j < kWidth; ++j)                                           \
+      src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                       \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) {                \
+    for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) {               \
+      src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] =                \
+          (fastrand() & 0xff);                                                 \
+      src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] =                \
+          (fastrand() & 0xff);                                                 \
+    }                                                                          \
+  }                                                                            \
+  memset(dst_y_c, 1, kWidth * kHeight);  /* Different fill per buffer. */      \
+  memset(dst_uv_c, 2, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) *                       \
+                      SUBSAMPLE(kHeight, SUBSAMP_Y));                          \
+  memset(dst_y_opt, 101, kWidth * kHeight);                                    \
+  memset(dst_uv_opt, 102, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) *                   \
+                          SUBSAMPLE(kHeight, SUBSAMP_Y));                      \
+  MaskCpuFlags(disable_cpu_flags_);  /* Pass 1: results go to dst_*_c. */      \
+  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                          \
+                                 src_u + OFF,                                  \
+                                 SUBSAMPLE(kWidth, SRC_SUBSAMP_X),             \
+                                 src_v + OFF,                                  \
+                                 SUBSAMPLE(kWidth, SRC_SUBSAMP_X),             \
+                                 dst_y_c, kWidth,                              \
+                                 dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X),   \
+                                 kWidth, NEG kHeight);                         \
+  MaskCpuFlags(benchmark_cpu_info_);  /* Pass 2: results go to dst_*_opt. */   \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                        \
+                                   src_u + OFF,                                \
+                                   SUBSAMPLE(kWidth, SRC_SUBSAMP_X),           \
+                                   src_v + OFF,                                \
+                                   SUBSAMPLE(kWidth, SRC_SUBSAMP_X),           \
+                                   dst_y_opt, kWidth,                          \
+                                   dst_uv_opt,                                 \
+                                   2 * SUBSAMPLE(kWidth, SUBSAMP_X),           \
+                                   kWidth, NEG kHeight);                       \
+  }                                                                            \
+  int max_diff = 0;  /* Largest |c - opt| byte difference seen. */             \
+  for (int i = 0; i < kHeight; ++i) {                                          \
+    for (int j = 0; j < kWidth; ++j) {                                         \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_y_c[i * kWidth + j]) -                      \
+              static_cast<int>(dst_y_opt[i * kWidth + j]));                    \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 1);  /* Y may differ by at most 1. */                    \
+  for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                    \
+    for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {               \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_uv_c[i *                                    \
+                               2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) -        \
+              static_cast<int>(dst_uv_opt[i *                                  \
+                               2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j]));        \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 1);  /* UV too; max_diff carries over. */                \
+  free_aligned_buffer_64(dst_y_c);                                             \
+  free_aligned_buffer_64(dst_uv_c);                                            \
+  free_aligned_buffer_64(dst_y_opt);                                           \
+  free_aligned_buffer_64(dst_uv_opt);                                          \
+  free_aligned_buffer_64(src_y);                                               \
+  free_aligned_buffer_64(src_u);                                               \
+  free_aligned_buffer_64(src_v);                                               \
+}
+
+#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,           \
+                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y)                       \
+    TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,              \
+                    FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                          \
+                    benchmark_width_ - 4, _Any, +, 0)  /* Width minus 4. */    \
+    TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,              \
+                    FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                          \
+                    benchmark_width_, _Unaligned, +, 1)  /* Sources at +1. */  \
+    TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,              \
+                    FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                          \
+                    benchmark_width_, _Invert, -, 0)  /* Negated height. */    \
+    TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,              \
+                    FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                          \
+                    benchmark_width_, _Opt, +, 0)  /* Full width, offset 0. */
+
+TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)  // I420ToNV12; 4 tests per line.
+TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)  // I420ToNV21
+
+// Generates the gtest case SRC_FMT_PLANAR##To##FMT_PLANAR##N. It converts a
+// random two-plane image (a Y plane plus one combined UV plane) into separate
+// Y, U and V planes: once after MaskCpuFlags(disable_cpu_flags_) and then
+// benchmark_iterations_ times after MaskCpuFlags(benchmark_cpu_info_). The two
+// results may differ by at most 1 per byte.
+//   SRC_FMT_PLANAR, FMT_PLANAR    source and destination format names.
+//   SRC_SUBSAMP_X, SRC_SUBSAMP_Y  chroma subsampling factors of the source.
+//   SUBSAMP_X, SUBSAMP_Y          chroma subsampling factors of the output.
+//   W1280                         image width; values <= 0 are replaced by 1.
+//   N                             suffix appended to the test name.
+//   NEG                           sign token written before the height argument.
+//   OFF                           byte offset added to both source pointers.
+#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+                         FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+  const int kHeight = benchmark_height_; \
+  const int kSizeY = kWidth * kHeight; \
+  /* Rows of the combined UV plane are twice the subsampled width. */ \
+  const int kSrcStrideUV = 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
+  const int kSrcSizeUV = kSrcStrideUV * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+  /* The destination U and V planes have identical geometry. */ \
+  const int kDstStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+  const int kDstSizeUV = kDstStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+  /* Sources get OFF extra bytes because they are passed as (ptr + OFF). */ \
+  align_buffer_64(src_y, kSizeY + OFF); \
+  align_buffer_64(src_uv, kSrcSizeUV + OFF); \
+  /* *_c receives the first pass, *_opt the repeated second pass. */ \
+  align_buffer_64(dst_y_c, kSizeY); \
+  align_buffer_64(dst_u_c, kDstSizeUV); \
+  align_buffer_64(dst_v_c, kDstSizeUV); \
+  align_buffer_64(dst_y_opt, kSizeY); \
+  align_buffer_64(dst_u_opt, kDstSizeUV); \
+  align_buffer_64(dst_v_opt, kDstSizeUV); \
+  /* Random input bytes: the whole Y plane first, then the UV plane. */ \
+  for (int pos = 0; pos < kSizeY; ++pos) { \
+    src_y[pos + OFF] = (fastrand() & 0xff); \
+  } \
+  for (int pos = 0; pos < kSrcSizeUV; ++pos) { \
+    src_uv[pos + OFF] = (fastrand() & 0xff); \
+  } \
+  /* Each output starts with its own fill value (presumably so that bytes */ \
+  /* a conversion never writes show up as mismatches). */ \
+  memset(dst_y_c, 1, kSizeY); \
+  memset(dst_u_c, 2, kDstSizeUV); \
+  memset(dst_v_c, 3, kDstSizeUV); \
+  memset(dst_y_opt, 101, kSizeY); \
+  memset(dst_u_opt, 102, kDstSizeUV); \
+  memset(dst_v_opt, 103, kDstSizeUV); \
+  /* First pass: a single conversion with disable_cpu_flags_ applied. */ \
+  MaskCpuFlags(disable_cpu_flags_); \
+  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+                                 src_uv + OFF, kSrcStrideUV, \
+                                 dst_y_c, kWidth, \
+                                 dst_u_c, kDstStrideUV, \
+                                 dst_v_c, kDstStrideUV, \
+                                 kWidth, NEG kHeight); \
+  /* Second pass: repeated conversions with benchmark_cpu_info_ applied. */ \
+  MaskCpuFlags(benchmark_cpu_info_); \
+  for (int pass = 0; pass < benchmark_iterations_; ++pass) { \
+    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+                                   src_uv + OFF, kSrcStrideUV, \
+                                   dst_y_opt, kWidth, \
+                                   dst_u_opt, kDstStrideUV, \
+                                   dst_v_opt, kDstStrideUV, \
+                                   kWidth, NEG kHeight); \
+  } \
+  /* max_diff is not reset between planes, so each check also covers the */ \
+  /* planes compared before it. */ \
+  int max_diff = 0; \
+  /* Y plane. */ \
+  for (int pos = 0; pos < kSizeY; ++pos) { \
+    const int delta = abs(static_cast<int>(dst_y_c[pos]) - \
+                          static_cast<int>(dst_y_opt[pos])); \
+    if (delta > max_diff) { \
+      max_diff = delta; \
+    } \
+  } \
+  EXPECT_LE(max_diff, 1); \
+  /* U plane. */ \
+  for (int pos = 0; pos < kDstSizeUV; ++pos) { \
+    const int delta = abs(static_cast<int>(dst_u_c[pos]) - \
+                          static_cast<int>(dst_u_opt[pos])); \
+    if (delta > max_diff) { \
+      max_diff = delta; \
+    } \
+  } \
+  EXPECT_LE(max_diff, 1); \
+  /* V plane. */ \
+  for (int pos = 0; pos < kDstSizeUV; ++pos) { \
+    const int delta = abs(static_cast<int>(dst_v_c[pos]) - \
+                          static_cast<int>(dst_v_opt[pos])); \
+    if (delta > max_diff) { \
+      max_diff = delta; \
+    } \
+  } \
+  EXPECT_LE(max_diff, 1); \
+  free_aligned_buffer_64(dst_y_c); \
+  free_aligned_buffer_64(dst_u_c); \
+  free_aligned_buffer_64(dst_v_c); \
+  free_aligned_buffer_64(dst_y_opt); \
+  free_aligned_buffer_64(dst_u_opt); \
+  free_aligned_buffer_64(dst_v_opt); \
+  free_aligned_buffer_64(src_y); \
+  free_aligned_buffer_64(src_uv); \
+}
+
+// Instantiates TESTBIPLANARTOPI four times for one format pair:
+//   _Any: width benchmark_width_ - 4.   _Unaligned: source offset OFF = 1.
+//   _Invert: height passed negated.     _Opt: no offset, positive height.
+// Every variant other than _Any uses the full width benchmark_width_.
+#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+                        FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+  TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+                   SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \
+  TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+                   SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \
+  TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+                   SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
+  TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+                   SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
+
+TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)  // NV12ToI420{_Any,_Unaligned,_Invert,_Opt}
+TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)  // NV21ToI420{_Any,_Unaligned,_Invert,_Opt}
+
+#define ALIGNINT(V, ALIGN) ((((V) + (ALIGN) - 1) / (ALIGN)) * (ALIGN))  // V rounded up to a multiple of ALIGN (assumes integer V >= 0, ALIGN > 0).
+
+// Generates test FMT_PLANAR##To##FMT_B##N. Random Y/U/V planes are converted to
+// FMT_B once after MaskCpuFlags(disable_cpu_flags_) and benchmark_iterations_
+// times after MaskCpuFlags(benchmark_cpu_info_); both results are expanded with
+// FMT_B##To##FMT_C and may differ by at most DIFF per byte. BPP_B/BPP_C: bytes
+// per pixel. ALIGN/YALIGN: rounding of the FMT_B stride / the height. W1280:
+// width (min 1). N: name suffix. NEG: height sign. OFF: byte offset of buffers.
+#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                       YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
+TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+  const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+  const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+  const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+  const int kSizeB = kStrideB * kHeight; \
+  const int kStrideC = kWidth * BPP_C; \
+  const int kSizeC = kStrideC * kHeight; \
+  align_buffer_64(src_y, kWidth * kHeight + OFF); \
+  align_buffer_64(src_u, kSizeUV + OFF); \
+  align_buffer_64(src_v, kSizeUV + OFF); \
+  align_buffer_64(dst_argb_c, kSizeB + OFF); \
+  align_buffer_64(dst_argb_opt, kSizeB + OFF); \
+  for (int pos = 0; pos < kWidth * kHeight; ++pos) { \
+    src_y[pos + OFF] = (fastrand() & 0xff); \
+  } \
+  for (int pos = 0; pos < kSizeUV; ++pos) { \
+    src_u[pos + OFF] = (fastrand() & 0xff); \
+    src_v[pos + OFF] = (fastrand() & 0xff); \
+  } \
+  memset(dst_argb_c + OFF, 1, kSizeB); \
+  memset(dst_argb_opt + OFF, 101, kSizeB); \
+  MaskCpuFlags(disable_cpu_flags_); \
+  FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+                        src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \
+                        kWidth, NEG kHeight); \
+  MaskCpuFlags(benchmark_cpu_info_); \
+  for (int pass = 0; pass < benchmark_iterations_; ++pass) { \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+                          src_v + OFF, kStrideUV, dst_argb_opt + OFF, kStrideB, \
+                          kWidth, NEG kHeight); \
+  } \
+  /* Expand both results with FMT_B##To##FMT_C so e.g. 565 compares bytewise. */ \
+  align_buffer_64(dst_argb32_c, kSizeC); \
+  align_buffer_64(dst_argb32_opt, kSizeC); \
+  memset(dst_argb32_c, 2, kSizeC); \
+  memset(dst_argb32_opt, 102, kSizeC); \
+  FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kStrideC, \
+                   kWidth, kHeight); \
+  FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, kStrideC, \
+                   kWidth, kHeight); \
+  int max_diff = 0; \
+  for (int pos = 0; pos < kSizeC; ++pos) { \
+    const int delta = abs(static_cast<int>(dst_argb32_c[pos]) - \
+                          static_cast<int>(dst_argb32_opt[pos])); \
+    max_diff = (delta > max_diff) ? delta : max_diff; \
+  } \
+  EXPECT_LE(max_diff, DIFF); \
+  free_aligned_buffer_64(src_y); \
+  free_aligned_buffer_64(src_u); \
+  free_aligned_buffer_64(src_v); \
+  free_aligned_buffer_64(dst_argb_c); \
+  free_aligned_buffer_64(dst_argb_opt); \
+  free_aligned_buffer_64(dst_argb32_c); \
+  free_aligned_buffer_64(dst_argb32_opt); \
+}
+
+#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                      YALIGN, DIFF, FMT_C, BPP_C) /* one generated test per variant below */ \
+  TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+      YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) /* width reduced by 4 */ \
+  TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+      YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) /* OFF = 1 */ \
+  TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+      YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) /* negated height */ \
+  TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+      YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C) /* full width, OFF = 0 */
+
+TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)  // Args: FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, DIFF, FMT_C, BPP_C.
+TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(J420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)  // The three BPP_B = 2 RGB formats allow a larger DIFF (9, 9, 17).
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1, 9, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1, 17, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I411, 4, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1, 1, ARGB, 4)  // YUY2/UYVY: BPP_B = 2 with the stride rounded up to ALIGN = 4.
+TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1, 1, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1, 0, ARGB, 4)  // DIFF = 0 from here on: both results must match exactly.
+TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1, 0, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4)
+TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1, 0, ARGB, 4)
+
+// Generates test FMT_PLANAR##To##FMT_B##N for planar Y/U/V plus a full-size
+// alpha plane (src_a): one conversion after MaskCpuFlags(disable_cpu_flags_) is
+// compared, within DIFF per byte, to benchmark_iterations_ conversions after
+// MaskCpuFlags(benchmark_cpu_info_). ATTEN is forwarded as the last argument.
+// W1280: width (min 1). N: name suffix. NEG: height sign. OFF: buffer offset.
+#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                       YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \
+TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+  const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+  const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+  const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+  align_buffer_64(src_y, kWidth * kHeight + OFF); \
+  align_buffer_64(src_u, kSizeUV + OFF); \
+  align_buffer_64(src_v, kSizeUV + OFF); \
+  align_buffer_64(src_a, kWidth * kHeight + OFF); \
+  align_buffer_64(dst_argb_c, kStrideB * kHeight + OFF); \
+  align_buffer_64(dst_argb_opt, kStrideB * kHeight + OFF); \
+  for (int pos = 0; pos < kWidth * kHeight; ++pos) { \
+    src_y[pos + OFF] = (fastrand() & 0xff); \
+    src_a[pos + OFF] = (fastrand() & 0xff); \
+  } \
+  for (int pos = 0; pos < kSizeUV; ++pos) { \
+    src_u[pos + OFF] = (fastrand() & 0xff); \
+    src_v[pos + OFF] = (fastrand() & 0xff); \
+  } \
+  memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+  memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+  MaskCpuFlags(disable_cpu_flags_); \
+  FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+                        src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
+                        dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \
+  MaskCpuFlags(benchmark_cpu_info_); \
+  for (int pass = 0; pass < benchmark_iterations_; ++pass) { \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+                          src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
+                          dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \
+  } \
+  /* Compare row by row: only kWidth * BPP_B bytes of each kStrideB are pixels. */ \
+  int max_diff = 0; \
+  for (int row = 0; row < kHeight; ++row) { \
+    for (int col = 0; col < kWidth * BPP_B; ++col) { \
+      const int pos = row * kStrideB + col + OFF; \
+      const int delta = abs(static_cast<int>(dst_argb_c[pos]) - \
+                            static_cast<int>(dst_argb_opt[pos])); \
+      max_diff = (delta > max_diff) ? delta : max_diff; \
+    } \
+  } \
+  EXPECT_LE(max_diff, DIFF); \
+  free_aligned_buffer_64(src_y); \
+  free_aligned_buffer_64(src_u); \
+  free_aligned_buffer_64(src_v); \
+  free_aligned_buffer_64(src_a); \
+  free_aligned_buffer_64(dst_argb_c); \
+  free_aligned_buffer_64(dst_argb_opt); \
+}
+
+#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                       YALIGN, DIFF) /* one generated test per variant below */ \
+  TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+      YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) /* width reduced by 4 */ \
+  TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+      YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0) /* OFF = 1 */ \
+  TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+      YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) /* negated height */ \
+  TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+      YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0) /* full width, OFF = 0 */ \
+  TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+      YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1) /* ATTEN = 1 */
+
+TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2)  // I420AlphaToARGB{_Any,_Unaligned,_Invert,_Opt,_Premult}
+TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)  // I420AlphaToABGR{_Any,_Unaligned,_Invert,_Opt,_Premult}
+
+// Generates the gtest case FMT_PLANAR##To##FMT_B##N. A random two-plane image
+// (a Y plane plus one combined UV plane) is converted to FMT_B once after
+// MaskCpuFlags(disable_cpu_flags_) and then benchmark_iterations_ times after
+// MaskCpuFlags(benchmark_cpu_info_). Both results are expanded with
+// FMT_B##ToARGB and may then differ by at most DIFF per byte.
+// BPP_B: bytes per FMT_B pixel. W1280: width (values <= 0 become 1). N: name
+// suffix. NEG: sign token for the height. OFF: offset of the source pointers.
+#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+                         W1280, DIFF, N, NEG, OFF) \
+TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+  const int kHeight = benchmark_height_; \
+  const int kSizeY = kWidth * kHeight; \
+  const int kStrideB = kWidth * BPP_B; \
+  const int kSizeB = kStrideB * kHeight; \
+  const int kStrideUV = 2 * SUBSAMPLE(kWidth, SUBSAMP_X); \
+  const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+  const int kStrideARGB = kWidth * 4; \
+  const int kSizeARGB = kStrideARGB * kHeight; \
+  align_buffer_64(src_y, kSizeY + OFF); \
+  align_buffer_64(src_uv, kSizeUV + OFF); \
+  align_buffer_64(dst_argb_c, kSizeB); \
+  align_buffer_64(dst_argb_opt, kSizeB); \
+  for (int pos = 0; pos < kSizeY; ++pos) { \
+    src_y[pos + OFF] = (fastrand() & 0xff); \
+  } \
+  for (int pos = 0; pos < kSizeUV; ++pos) { \
+    src_uv[pos + OFF] = (fastrand() & 0xff); \
+  } \
+  memset(dst_argb_c, 1, kSizeB); \
+  memset(dst_argb_opt, 101, kSizeB); \
+  MaskCpuFlags(disable_cpu_flags_); \
+  FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV, \
+                        dst_argb_c, kStrideB, kWidth, NEG kHeight); \
+  MaskCpuFlags(benchmark_cpu_info_); \
+  for (int pass = 0; pass < benchmark_iterations_; ++pass) { \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV, \
+                          dst_argb_opt, kStrideB, kWidth, NEG kHeight); \
+  } \
+  /* Expand both results to ARGB so that e.g. 565 can be compared bytewise. */ \
+  align_buffer_64(dst_argb32_c, kSizeARGB); \
+  align_buffer_64(dst_argb32_opt, kSizeARGB); \
+  memset(dst_argb32_c, 2, kSizeARGB); \
+  memset(dst_argb32_opt, 102, kSizeARGB); \
+  FMT_B##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kStrideARGB, \
+                kWidth, kHeight); \
+  FMT_B##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kStrideARGB, \
+                kWidth, kHeight); \
+  int max_diff = 0; \
+  for (int pos = 0; pos < kSizeARGB; ++pos) { \
+    const int delta = abs(static_cast<int>(dst_argb32_c[pos]) - \
+                          static_cast<int>(dst_argb32_opt[pos])); \
+    if (delta > max_diff) { \
+      max_diff = delta; \
+    } \
+  } \
+  EXPECT_LE(max_diff, DIFF); \
+  free_aligned_buffer_64(src_y); \
+  free_aligned_buffer_64(src_uv); \
+  free_aligned_buffer_64(dst_argb_c); \
+  free_aligned_buffer_64(dst_argb_opt); \
+  free_aligned_buffer_64(dst_argb32_c); \
+  free_aligned_buffer_64(dst_argb32_opt); \
+}
+
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) /* one generated test per variant below */ \
+  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+      benchmark_width_ - 4, DIFF, _Any, +, 0) /* width reduced by 4 */ \
+  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+      benchmark_width_, DIFF, _Unaligned, +, 1) /* OFF = 1 */ \
+  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+      benchmark_width_, DIFF, _Invert, -, 0) /* negated height */ \
+  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+      benchmark_width_, DIFF, _Opt, +, 0) /* full width, OFF = 0 */
+
+TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)  // Args: FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF.
+TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
+TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)  // BPP_B = 2; results are compared after RGB565ToARGB, with DIFF = 9.
+
+// Generates the gtest case FMT_A##To##FMT_PLANAR##N. A random single-plane
+// FMT_A image is converted into Y, U and V planes: once after
+// MaskCpuFlags(disable_cpu_flags_) and then benchmark_iterations_ times after
+// MaskCpuFlags(benchmark_cpu_info_). Every output byte of the two results must
+// agree within DIFF (EXPECT_NEAR).
+//   FMT_A, FMT_PLANAR     source and destination format names.
+//   BPP_A                 multiplier used to derive the source stride in bytes.
+//   YALIGN                the height is benchmark_height_ rounded up to this.
+//   SUBSAMP_X, SUBSAMP_Y  chroma subsampling factors of the output planes.
+//   W1280, N              width (values <= 0 become 1) and test-name suffix.
+//   NEG, OFF              sign token for the height; source pointer offset.
+#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+                       W1280, DIFF, N, NEG, OFF) \
+TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+  const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+  const int kSizeY = kWidth * kHeight; \
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+  const int kHeightUV = SUBSAMPLE(kHeight, SUBSAMP_Y); \
+  const int kSizeUV = kStrideUV * kHeightUV; \
+  /* Source stride: (kStrideUV * SUBSAMP_X) pixels of BPP_A bytes, computed */ \
+  /* in bits and rounded up to whole bytes. */ \
+  const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
+  align_buffer_64(src_argb, kStride * kHeight + OFF); \
+  align_buffer_64(dst_y_c, kSizeY); \
+  align_buffer_64(dst_u_c, kSizeUV); \
+  align_buffer_64(dst_v_c, kSizeUV); \
+  align_buffer_64(dst_y_opt, kSizeY); \
+  align_buffer_64(dst_u_opt, kSizeUV); \
+  align_buffer_64(dst_v_opt, kSizeUV); \
+  /* Outputs are pre-filled with distinct values; the source is random. */ \
+  memset(dst_y_c, 1, kSizeY); \
+  memset(dst_u_c, 2, kSizeUV); \
+  memset(dst_v_c, 3, kSizeUV); \
+  memset(dst_y_opt, 101, kSizeY); \
+  memset(dst_u_opt, 102, kSizeUV); \
+  memset(dst_v_opt, 103, kSizeUV); \
+  for (int pos = 0; pos < kStride * kHeight; ++pos) { \
+    src_argb[pos + OFF] = (fastrand() & 0xff); \
+  } \
+  MaskCpuFlags(disable_cpu_flags_); \
+  FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, \
+                        dst_u_c, kStrideUV, dst_v_c, kStrideUV, \
+                        kWidth, NEG kHeight); \
+  MaskCpuFlags(benchmark_cpu_info_); \
+  for (int pass = 0; pass < benchmark_iterations_; ++pass) { \
+    FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \
+                          dst_u_opt, kStrideUV, dst_v_opt, kStrideUV, \
+                          kWidth, NEG kHeight); \
+  } \
+  /* Per-byte checks of the Y, U and V planes. The EXPECT_NEAR arguments are */ \
+  /* left as written because gtest reports them in failure messages. */ \
+  for (int i = 0; i < kHeight; ++i) { \
+    for (int j = 0; j < kWidth; ++j) { \
+      EXPECT_NEAR(static_cast<int>(dst_y_c[i * kWidth + j]), \
+                  static_cast<int>(dst_y_opt[i * kWidth + j]), DIFF); \
+    } \
+  } \
+  for (int i = 0; i < kHeightUV; ++i) { \
+    for (int j = 0; j < kStrideUV; ++j) { \
+      EXPECT_NEAR(static_cast<int>(dst_u_c[i * kStrideUV + j]), \
+                  static_cast<int>(dst_u_opt[i * kStrideUV + j]), DIFF); \
+    } \
+  } \
+  for (int i = 0; i < kHeightUV; ++i) { \
+    for (int j = 0; j < kStrideUV; ++j) { \
+      EXPECT_NEAR(static_cast<int>(dst_v_c[i * kStrideUV + j]), \
+                  static_cast<int>(dst_v_opt[i * kStrideUV + j]), DIFF); \
+    } \
+  } \
+  free_aligned_buffer_64(dst_y_c); \
+  free_aligned_buffer_64(dst_u_c); \
+  free_aligned_buffer_64(dst_v_c); \
+  free_aligned_buffer_64(dst_y_opt); \
+  free_aligned_buffer_64(dst_u_opt); \
+  free_aligned_buffer_64(dst_v_opt); \
+  free_aligned_buffer_64(src_argb); \
+}
+
+#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,  \
+                      DIFF)                                                    \
+  TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,       \
+                 benchmark_width_ - 4, DIFF, _Any, +, 0) /* width minus 4 */   \
+  TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,       \
+                 benchmark_width_, DIFF, _Unaligned, +, 1) /* source + 1 */    \
+  TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,       \
+                 benchmark_width_, DIFF, _Invert, -, 0) /* negated height */   \
+  TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,       \
+                 benchmark_width_, DIFF, _Opt, +, 0) /* full width, offset 0 */
+
+TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)  // Last argument is DIFF.
+#if defined(__arm__) || defined (__aarch64__)
+// The ARM version subsamples by summing 4 pixels, then multiplies by a matrix
+// with 4x smaller coefficients, which are rounded to the nearest integer.
+TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4)
+TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, 4)
+#else  // Other targets pass DIFF 0 here.
+TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0)
+TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, 0)
+#endif
+TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4)
+TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
+// TODO(fbarchard): Make ARGB1555 NEON match the C code, then reduce DIFF to 9.
+TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
+TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
+TESTATOPLANAR(ARGB, 4, 1, I411, 4, 1, 4)
+TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2)
+TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2)
+TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2)
+TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2)
+TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
+TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2)
+TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
+TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
+
+// Defines a test that runs FMT_A##To##FMT_PLANAR under two CPU-flag masks and
+// requires every output byte to agree within 4. W1280 is the width, N the
+// test-name suffix, NEG the sign applied to the height, OFF the source offset.
+#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,\
+                         W1280, N, NEG, OFF)                                   \
+TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) {                          \
+  /* Width is clamped to >= 1; the UV plane has stride 2 * kStrideUV. */       \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A;                        \
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                          \
+  const int kSizeY = kWidth * kHeight;                                         \
+  const int kSizeUV = kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y);           \
+  align_buffer_64(src_a, kStride * kHeight + OFF);                             \
+  align_buffer_64(y_c, kSizeY);                                                \
+  align_buffer_64(uv_c, kSizeUV);                                              \
+  align_buffer_64(y_opt, kSizeY);                                              \
+  align_buffer_64(uv_opt, kSizeUV);                                            \
+  /* Random source bytes, written in ascending address order. */               \
+  for (int i = 0; i < kStride * kHeight; ++i) {                                \
+    src_a[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  /* Distinct fill values: a byte neither call writes differs by 100. */       \
+  memset(y_c, 1, kSizeY);                                                      \
+  memset(uv_c, 2, kSizeUV);                                                    \
+  memset(y_opt, 101, kSizeY);                                                  \
+  memset(uv_opt, 102, kSizeUV);                                                \
+  /* First pass: one conversion under the disable_cpu_flags_ mask. */          \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_A##To##FMT_PLANAR(src_a + OFF, kStride, y_c, kWidth,                     \
+                        uv_c, kStrideUV * 2, kWidth, NEG kHeight);             \
+  /* Second pass: benchmark_iterations_ runs under benchmark_cpu_info_. */     \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_A##To##FMT_PLANAR(src_a + OFF, kStride, y_opt, kWidth,                 \
+                          uv_opt, kStrideUV * 2, kWidth, NEG kHeight);         \
+  }                                                                            \
+  /* Y plane: both buffers are contiguous, so scan them as flat arrays. */     \
+  int max_diff = 0;                                                            \
+  for (int i = 0; i < kSizeY; ++i) {                                           \
+    const int delta = abs(static_cast<int>(y_c[i]) -                           \
+                          static_cast<int>(y_opt[i]));                         \
+    max_diff = (delta > max_diff) ? delta : max_diff;                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 4);                                                      \
+  /* max_diff is not reset, so the UV check also covers the Y maximum. */      \
+  for (int i = 0; i < kSizeUV; ++i) {                                          \
+    const int delta = abs(static_cast<int>(uv_c[i]) -                          \
+                          static_cast<int>(uv_opt[i]));                        \
+    max_diff = (delta > max_diff) ? delta : max_diff;                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 4);                                                      \
+  free_aligned_buffer_64(y_c);                                                 \
+  free_aligned_buffer_64(uv_c);                                                \
+  free_aligned_buffer_64(y_opt);                                               \
+  free_aligned_buffer_64(uv_opt);                                              \
+  free_aligned_buffer_64(src_a);                                               \
+}
+
+#define TESTATOBIPLANAR(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+  TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,      \
+                   benchmark_width_ - 4, _Any, +, 0) /* width minus 4 */       \
+  TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,      \
+                   benchmark_width_, _Unaligned, +, 1) /* source offset 1 */   \
+  TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,      \
+                   benchmark_width_, _Invert, -, 0) /* negated height */       \
+  TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,      \
+                   benchmark_width_, _Opt, +, 0) /* full width, offset 0 */
+
+TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2)  // kStride = SUBSAMPLE(kWidth, 1) * 4
+TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2)
+TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2)  // kStride = SUBSAMPLE(kWidth, 2) * 4
+TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
+
+#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                            \
+                  FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                            \
+                  W1280, DIFF, N, NEG, OFF)                                    \
+TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) {                               \
+  /* Heights and strides are rounded up to HEIGHT_x / STRIDE_x multiples. */   \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;         \
+  const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;         \
+  const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;  \
+  const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;  \
+  const int kSizeA = kStrideA * kHeightA;                                      \
+  const int kSizeB = kStrideB * kHeightB;                                      \
+  align_buffer_64(src, kSizeA + OFF);                                          \
+  align_buffer_64(dst_c, kSizeB);                                              \
+  align_buffer_64(dst_opt, kSizeB);                                            \
+  for (int i = 0; i < kSizeA; ++i) {                                           \
+    src[i + OFF] = (fastrand() & 0xff);                                        \
+  }                                                                            \
+  /* Distinct fill values: a byte neither call writes differs by 100. */       \
+  memset(dst_c, 1, kSizeB);                                                    \
+  memset(dst_opt, 101, kSizeB);                                                \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_A##To##FMT_B(src + OFF, kStrideA, dst_c, kStrideB,                       \
+                   kWidth, NEG kHeight);                                       \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_A##To##FMT_B(src + OFF, kStrideA, dst_opt, kStrideB,                   \
+                     kWidth, NEG kHeight);                                     \
+  }                                                                            \
+  /* Largest per-byte difference over the whole destination buffer. */         \
+  int max_diff = 0;                                                            \
+  for (int i = 0; i < kSizeB; ++i) {                                           \
+    const int delta = abs(static_cast<int>(dst_c[i]) -                         \
+                          static_cast<int>(dst_opt[i]));                       \
+    max_diff = (delta > max_diff) ? delta : max_diff;                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, DIFF);                                                   \
+  free_aligned_buffer_64(src);                                                 \
+  free_aligned_buffer_64(dst_c);                                               \
+  free_aligned_buffer_64(dst_opt);                                             \
+}
+
+#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                       \
+                       FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)                 \
+TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) {                         \
+  /* Each iteration draws a fresh size: width 1..64, height 1..32. */          \
+  for (int times = 0; times < benchmark_iterations_; ++times) {                \
+    const int kWidth = (fastrand() & 63) + 1;                                  \
+    const int kHeight = (fastrand() & 31) + 1;                                 \
+    const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;       \
+    const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;       \
+    const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\
+    const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\
+    const int kSizeA = kStrideA * kHeightA;                                    \
+    const int kSizeB = kStrideB * kHeightB;                                    \
+    align_buffer_page_end(src, kSizeA);                                        \
+    align_buffer_page_end(dst_c, kSizeB);                                      \
+    align_buffer_page_end(dst_opt, kSizeB);                                    \
+    for (int i = 0; i < kSizeA; ++i) {                                         \
+      src[i] = (fastrand() & 0xff);                                            \
+    }                                                                          \
+    /* Both outputs start as 123, so bytes neither call writes still match. */ \
+    memset(dst_c, 123, kSizeB);                                                \
+    memset(dst_opt, 123, kSizeB);                                              \
+    /* First run: under the disable_cpu_flags_ mask. */                        \
+    MaskCpuFlags(disable_cpu_flags_);                                          \
+    FMT_A##To##FMT_B(src, kStrideA, dst_c, kStrideB, kWidth, kHeight);         \
+    /* Second run: under the benchmark_cpu_info_ mask. */                      \
+    MaskCpuFlags(benchmark_cpu_info_);                                         \
+    FMT_A##To##FMT_B(src, kStrideA, dst_opt, kStrideB, kWidth, kHeight);       \
+    /* Largest per-byte difference over the whole destination buffer. */       \
+    int max_diff = 0;                                                          \
+    for (int i = 0; i < kSizeB; ++i) {                                         \
+      const int delta = abs(static_cast<int>(dst_c[i]) -                       \
+                            static_cast<int>(dst_opt[i]));                     \
+      max_diff = (delta > max_diff) ? delta : max_diff;                        \
+    }                                                                          \
+    EXPECT_LE(max_diff, DIFF);                                                 \
+    free_aligned_buffer_page_end(src);                                         \
+    free_aligned_buffer_page_end(dst_c);                                       \
+    free_aligned_buffer_page_end(dst_opt);                                     \
+  }                                                                            \
+}
+
+// Instantiates every test variant for one FMT_A##To##FMT_B conversion; DIFF
+// is the largest per-byte difference tolerated between the two runs.
+#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                             \
+                 FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)                       \
+  /* Variants sized from benchmark_width_. */                                  \
+  TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, HEIGHT_B,\
+            benchmark_width_ - 4, DIFF, _Any, +, 0) /* width minus 4 */        \
+  TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, HEIGHT_B,\
+            benchmark_width_, DIFF, _Unaligned, +, 1) /* source offset 1 */    \
+  TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, HEIGHT_B,\
+            benchmark_width_, DIFF, _Invert, -, 0) /* negated height */        \
+  TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, HEIGHT_B,\
+            benchmark_width_, DIFF, _Opt, +, 0) /* full width, offset 0 */     \
+  /* Variant with randomly drawn sizes. */                                     \
+  TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                             \
+                 FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)
+
+TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)  // Last argument is DIFF.
+TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)  // kStrideB rounds up to 4 bytes.
+TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)  // kStrideB rounds up to 4 bytes.
+TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
+TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
+TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0)
+TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 4)  // kStrideA rounds up to 4 bytes.
+TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 4)  // kStrideA rounds up to 4 bytes.
+TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
+TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
+
+#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                           \
+                   FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                           \
+                   W1280, DIFF, N, NEG, OFF)                                   \
+TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) {                       \
+  /* Heights and strides are rounded up to HEIGHT_x / STRIDE_x multiples. */   \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;         \
+  const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;         \
+  const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;  \
+  const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;  \
+  const int kSizeA = kStrideA * kHeightA;                                      \
+  const int kSizeB = kStrideB * kHeightB;                                      \
+  align_buffer_64(src, kSizeA + OFF);                                          \
+  align_buffer_64(dst_c, kSizeB);                                              \
+  align_buffer_64(dst_opt, kSizeB);                                            \
+  for (int i = 0; i < kSizeA; ++i) {                                           \
+    src[i + OFF] = (fastrand() & 0xff);                                        \
+  }                                                                            \
+  /* Distinct fill values: a byte neither call writes differs by 100. */       \
+  memset(dst_c, 1, kSizeB);                                                    \
+  memset(dst_opt, 101, kSizeB);                                                \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_A##To##FMT_B##Dither(src + OFF, kStrideA, dst_c, kStrideB,               \
+                           NULL, kWidth, NEG kHeight);                         \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_A##To##FMT_B##Dither(src + OFF, kStrideA, dst_opt, kStrideB,           \
+                             NULL, kWidth, NEG kHeight);                       \
+  }                                                                            \
+  /* Largest per-byte difference over the whole destination buffer. */         \
+  int max_diff = 0;                                                            \
+  for (int i = 0; i < kSizeB; ++i) {                                           \
+    const int delta = abs(static_cast<int>(dst_c[i]) -                         \
+                          static_cast<int>(dst_opt[i]));                       \
+    max_diff = (delta > max_diff) ? delta : max_diff;                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, DIFF);                                                   \
+  free_aligned_buffer_64(src);                                                 \
+  free_aligned_buffer_64(dst_c);                                               \
+  free_aligned_buffer_64(dst_opt);                                             \
+}
+
+#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                      \
+                       FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)                 \
+TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) {                   \
+  /* Each iteration draws a fresh size: width 1..64, height 1..32. */          \
+  for (int times = 0; times < benchmark_iterations_; ++times) {                \
+    const int kWidth = (fastrand() & 63) + 1;                                  \
+    const int kHeight = (fastrand() & 31) + 1;                                 \
+    const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;       \
+    const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;       \
+    const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\
+    const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\
+    const int kSizeA = kStrideA * kHeightA;                                    \
+    const int kSizeB = kStrideB * kHeightB;                                    \
+    align_buffer_page_end(src, kSizeA);                                        \
+    align_buffer_page_end(dst_c, kSizeB);                                      \
+    align_buffer_page_end(dst_opt, kSizeB);                                    \
+    for (int i = 0; i < kSizeA; ++i) {                                         \
+      src[i] = (fastrand() & 0xff);                                            \
+    }                                                                          \
+    /* Both outputs start as 123, so bytes neither call writes still match. */ \
+    memset(dst_c, 123, kSizeB);                                                \
+    memset(dst_opt, 123, kSizeB);                                              \
+    MaskCpuFlags(disable_cpu_flags_);                                          \
+    FMT_A##To##FMT_B##Dither(src, kStrideA, dst_c, kStrideB,                   \
+                             NULL, kWidth, kHeight);                           \
+    MaskCpuFlags(benchmark_cpu_info_);                                         \
+    FMT_A##To##FMT_B##Dither(src, kStrideA, dst_opt, kStrideB,                 \
+                             NULL, kWidth, kHeight);                           \
+    /* Largest per-byte difference over the whole destination buffer. */       \
+    int max_diff = 0;                                                          \
+    for (int i = 0; i < kSizeB; ++i) {                                         \
+      const int delta = abs(static_cast<int>(dst_c[i]) -                       \
+                            static_cast<int>(dst_opt[i]));                     \
+      max_diff = (delta > max_diff) ? delta : max_diff;                        \
+    }                                                                          \
+    EXPECT_LE(max_diff, DIFF);                                                 \
+    free_aligned_buffer_page_end(src);                                         \
+    free_aligned_buffer_page_end(dst_c);                                       \
+    free_aligned_buffer_page_end(dst_opt);                                     \
+  }                                                                            \
+}
+
+#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                            \
+                  FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)                      \
+  /* Width reduced by 4. */                                                    \
+  TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B,         \
+             HEIGHT_B, benchmark_width_ - 4, DIFF, _Any, +, 0)                 \
+  /* Source pointer advanced by 1. */                                          \
+  TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B,         \
+             HEIGHT_B, benchmark_width_, DIFF, _Unaligned, +, 1)               \
+  /* Height negated. */                                                        \
+  TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B,         \
+             HEIGHT_B, benchmark_width_, DIFF, _Invert, -, 0)                  \
+  /* Full width, no source offset. */                                          \
+  TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B,         \
+             HEIGHT_B, benchmark_width_, DIFF, _Opt, +, 0)                     \
+  TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                            \
+                  FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)
+
+TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)  // DIFF 0: exact match required.
+
+#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A,                          \
+                 W1280, N, NEG, OFF)                                           \
+TEST_F(LibYUVConvertTest, FMT_ATOB##_Symetric##N) {                            \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;         \
+  const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;  \
+  const int kSizeA = kStrideA * kHeightA;                                      \
+  align_buffer_64(src_argb, kSizeA + OFF);                                     \
+  align_buffer_64(dst_argb_c, kSizeA);                                         \
+  align_buffer_64(dst_argb_opt, kSizeA);                                       \
+  for (int i = 0; i < kSizeA; ++i) {                                           \
+    src_argb[i + OFF] = (fastrand() & 0xff);                                   \
+  }                                                                            \
+  memset(dst_argb_c, 1, kSizeA);                                               \
+  memset(dst_argb_opt, 101, kSizeA);                                           \
+  /* Forward pass: source -> each destination, one per CPU-flag mask. */       \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_c, kStrideA,                     \
+           kWidth, NEG kHeight);                                               \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_opt, kStrideA,                 \
+             kWidth, NEG kHeight);                                             \
+  }                                                                            \
+  /* Second pass: convert each destination again, in place. */                 \
+  MaskCpuFlags(disable_cpu_flags_);                                            \
+  FMT_ATOB(dst_argb_c, kStrideA, dst_argb_c, kStrideA,                         \
+           kWidth, NEG kHeight);                                               \
+  MaskCpuFlags(benchmark_cpu_info_);                                           \
+  FMT_ATOB(dst_argb_opt, kStrideA, dst_argb_opt, kStrideA,                     \
+           kWidth, NEG kHeight);                                               \
+  /* Two passes must restore the source and agree across both masks. */        \
+  for (int i = 0; i < kSizeA; ++i) {                                           \
+    EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]);                             \
+    EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                                 \
+  }                                                                            \
+  free_aligned_buffer_64(src_argb);                                            \
+  free_aligned_buffer_64(dst_argb_c);                                          \
+  free_aligned_buffer_64(dst_argb_opt);                                        \
+}
+
+#define TESTSYM(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A)                           \
+  TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A,                                \
+           benchmark_width_ - 4, _Any, +, 0) /* width minus 4 */               \
+  TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A,                                \
+           benchmark_width_, _Unaligned, +, 1) /* source offset 1 */           \
+  TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A,                                \
+           benchmark_width_, _Opt, +, 0) /* full width, offset 0 */
+
+TESTSYM(ARGBToARGB, 4, 4, 1)  // Args: FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A.
+TESTSYM(ARGBToBGRA, 4, 4, 1)
+TESTSYM(ARGBToABGR, 4, 4, 1)
+TESTSYM(BGRAToARGB, 4, 4, 1)
+TESTSYM(ABGRToARGB, 4, 4, 1)
+
+TEST_F(LibYUVConvertTest, Test565) {
+  SIMD_ALIGNED(uint8 argb_pixels[256][4]);
+  SIMD_ALIGNED(uint8 rgb565_pixels[256][2]);
+
+  // Pixel i gets all four of its bytes set to i.
+  for (int i = 0; i < 256; ++i) {
+    memset(argb_pixels[i], i, sizeof(argb_pixels[i]));
+  }
+  // Convert the one 256-pixel row, then hash the output and check the hash.
+  ARGBToRGB565(argb_pixels[0], 0, rgb565_pixels[0], 0, 256, 1);
+  uint32 checksum = HashDjb2(rgb565_pixels[0], sizeof(rgb565_pixels), 5381);
+  EXPECT_EQ(610919429u, checksum);
+}
+
+#ifdef HAVE_JPEG
+TEST_F(LibYUVConvertTest, ValidateJpeg) {
+  const int kOff = 10;
+  const int kMinJpeg = 64;
+  const int kPixels = benchmark_width_ * benchmark_height_;
+  const int kImageSize = (kPixels >= kMinJpeg) ? kPixels : kMinJpeg;
+  const int kSize = kImageSize + kOff;
+  align_buffer_page_end(orig_pixels, kSize);
+
+  // All zeros: neither SOI nor EOI is present, so validation must fail.
+  memset(orig_pixels, 0, kSize);
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+  // All 0xff, the byte value that starts a marker: must still fail.
+  memset(orig_pixels, 0xff, kSize);
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+  // SOI at the start and EOI kOff bytes before the end: must pass.
+  orig_pixels[0] = 0xff;
+  orig_pixels[1] = 0xd8;  // SOI.
+  orig_pixels[kImageSize + 0] = 0xff;
+  orig_pixels[kImageSize + 1] = 0xd9;  // EOI.
+  for (int times = 0; times < benchmark_iterations_; ++times) {
+    EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize));
+  }
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, ValidateJpegLarge) {
+  const int kOff = 10;
+  const int kMinJpeg = 64;
+  const int kPixels = benchmark_width_ * benchmark_height_;
+  const int kImageSize = (kPixels >= kMinJpeg) ? kPixels : kMinJpeg;
+  const int kSize = kImageSize + kOff;
+  const int kMultiple = 10;
+  const int kBufSize = kImageSize * kMultiple + kOff;
+  align_buffer_page_end(orig_pixels, kBufSize);
+
+  // All zeros: neither SOI nor EOI is present, so validation must fail.
+  memset(orig_pixels, 0, kBufSize);
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kBufSize));
+
+  // SOI at byte 0 and EOI at kSize - kOff inside the larger buffer: must pass.
+  orig_pixels[0] = 0xff;
+  orig_pixels[1] = 0xd8;  // SOI.
+  orig_pixels[kSize - kOff + 0] = 0xff;
+  orig_pixels[kSize - kOff + 1] = 0xd9;  // EOI.
+  for (int times = 0; times < benchmark_iterations_; ++times) {
+    EXPECT_TRUE(ValidateJpeg(orig_pixels, kBufSize));
+  }
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, InvalidateJpeg) {
+  const int kOff = 10;  // EOI is written kOff bytes before the buffer end.
+  const int kMinJpeg = 64;  // Floor for the image size used below.
+  const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
+    benchmark_width_ * benchmark_height_ : kMinJpeg;
+  const int kSize = kImageSize + kOff;
+  align_buffer_page_end(orig_pixels, kSize);
+
+  // NULL pointer. Expect fail.
+  EXPECT_FALSE(ValidateJpeg(NULL, kSize));
+
+  // Negative size. Expect fail.
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, -1));
+
+  // Too large size. Expect fail.
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, 0xfb000000ull));
+
+  // No SOI or EOI. Expect fail.
+  memset(orig_pixels, 0, kSize);
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+  // SOI but no EOI. Expect fail.
+  orig_pixels[0] = 0xff;
+  orig_pixels[1] = 0xd8;  // SOI.
+  for (int times = 0; times < benchmark_iterations_; ++times) {
+    EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+  }
+
+  // EOI but no SOI. Expect fail.
+  orig_pixels[0] = 0;  // Clear the SOI written above.
+  orig_pixels[1] = 0;
+  orig_pixels[kSize - kOff + 0] = 0xff;
+  orig_pixels[kSize - kOff + 1] = 0xd9;  // EOI.
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, FuzzJpeg) {
+  // Random buffers that start with SOI; the return value is not asserted.
+  for (int times = 0; times < benchmark_iterations_; ++times) {
+    const int kSize = fastrand() % 5000 + 2;  // Size varies per iteration.
+    align_buffer_page_end(orig_pixels, kSize);
+    MemRandomize(orig_pixels, kSize);
+
+    // Add SOI so frame will be scanned.
+    orig_pixels[0] = 0xff;
+    orig_pixels[1] = 0xd8;  // SOI.
+    orig_pixels[kSize - 1] = 0xff;  // Buffer ends on a lone 0xff byte.
+    ValidateJpeg(orig_pixels, kSize);  // Failure normally expected.
+    free_aligned_buffer_page_end(orig_pixels);
+  }
+}
+
+TEST_F(LibYUVConvertTest, MJPGToI420) {
+  const int kOff = 10;  // EOI is written kOff bytes before the buffer end.
+  const int kMinJpeg = 64;  // Floor for the image size used below.
+  const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
+    benchmark_width_ * benchmark_height_ : kMinJpeg;
+  const int kSize = kImageSize + kOff;
+  align_buffer_page_end(orig_pixels, kSize);
+  align_buffer_page_end(dst_y_opt, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(dst_u_opt,
+                        SUBSAMPLE(benchmark_width_, 2) *
+                        SUBSAMPLE(benchmark_height_, 2));
+  align_buffer_page_end(dst_v_opt,
+                        SUBSAMPLE(benchmark_width_, 2) *
+                        SUBSAMPLE(benchmark_height_, 2));
+
+  // SOI and EOI around all-zero data so the MJPG appears valid.
+  memset(orig_pixels, 0, kSize);
+  orig_pixels[0] = 0xff;
+  orig_pixels[1] = 0xd8;  // SOI.
+  orig_pixels[kSize - kOff + 0] = 0xff;
+  orig_pixels[kSize - kOff + 1] = 0xd9;  // EOI.
+
+  for (int times = 0; times < benchmark_iterations_; ++times) {
+    int ret = MJPGToI420(orig_pixels, kSize,
+                         dst_y_opt, benchmark_width_,
+                         dst_u_opt, SUBSAMPLE(benchmark_width_, 2),
+                         dst_v_opt, SUBSAMPLE(benchmark_width_, 2),
+                         benchmark_width_, benchmark_height_,
+                         benchmark_width_, benchmark_height_);
+    // Expect failure because image is not really valid.
+    EXPECT_EQ(1, ret);
+  }
+
+  free_aligned_buffer_page_end(dst_y_opt);
+  free_aligned_buffer_page_end(dst_u_opt);
+  free_aligned_buffer_page_end(dst_v_opt);
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(LibYUVConvertTest, MJPGToARGB) {
+  const int kOff = 10;  // EOI is written kOff bytes before the buffer end.
+  const int kMinJpeg = 64;  // Floor for the image size used below.
+  const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
+    benchmark_width_ * benchmark_height_ : kMinJpeg;
+  const int kSize = kImageSize + kOff;
+  align_buffer_page_end(orig_pixels, kSize);
+  align_buffer_page_end(dst_argb_opt, benchmark_width_ * benchmark_height_ * 4);
+
+  // SOI and EOI around all-zero data so the MJPG appears valid.
+  memset(orig_pixels, 0, kSize);
+  orig_pixels[0] = 0xff;
+  orig_pixels[1] = 0xd8;  // SOI.
+  orig_pixels[kSize - kOff + 0] = 0xff;
+  orig_pixels[kSize - kOff + 1] = 0xd9;  // EOI.
+
+  for (int times = 0; times < benchmark_iterations_; ++times) {
+    int ret = MJPGToARGB(orig_pixels, kSize,
+                         dst_argb_opt, benchmark_width_ * 4,
+                         benchmark_width_, benchmark_height_,
+                         benchmark_width_, benchmark_height_);
+    // Expect failure because image is not really valid.
+    EXPECT_EQ(1, ret);
+  }
+
+  free_aligned_buffer_page_end(dst_argb_opt);
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
+#endif  // HAVE_JPEG
+
+TEST_F(LibYUVConvertTest, NV12Crop) {
+  const int SUBSAMP_X = 2;
+  const int SUBSAMP_Y = 2;
+  const int kWidth = benchmark_width_;
+  const int kHeight = benchmark_height_;
+  const int crop_y =  // Rows cropped at top and bottom, rounded up to even.
+    ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1;
+  const int kDestWidth = benchmark_width_;
+  const int kDestHeight = benchmark_height_ - crop_y * 2;
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);
+  const int sample_size = kWidth * kHeight +
+    kStrideUV *
+    SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;
+  align_buffer_64(src_y, sample_size);
+  uint8* src_uv = src_y + kWidth * kHeight;  // UV follows Y in one buffer.
+
+  align_buffer_64(dst_y, kDestWidth * kDestHeight);
+  align_buffer_64(dst_u,
+                  SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                  SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  align_buffer_64(dst_v,
+                  SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                  SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  // Second set of planes, filled by ConvertToI420 below.
+  align_buffer_64(dst_y_2, kDestWidth * kDestHeight);
+  align_buffer_64(dst_u_2,
+                  SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                  SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  align_buffer_64(dst_v_2,
+                  SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                  SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+  for (int i = 0; i < kHeight * kWidth; ++i) {
+    src_y[i] = (fastrand() & 0xff);
+  }
+  for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) *
+       kStrideUV) * 2; ++i) {
+    src_uv[i] = (fastrand() & 0xff);
+  }
+  memset(dst_y, 1, kDestWidth * kDestHeight);
+  memset(dst_u, 2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                   SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  memset(dst_v, 3, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                   SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  memset(dst_y_2, 1, kDestWidth * kDestHeight);
+  memset(dst_u_2, 2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                     SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  memset(dst_v_2, 3, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                     SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  // Path 1: ConvertToI420 is handed the crop offset (0, crop_y).
+  ConvertToI420(src_y, sample_size,
+                dst_y_2, kDestWidth,
+                dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X),
+                dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X),
+                0, crop_y,
+                kWidth, kHeight,
+                kDestWidth, kDestHeight,
+                libyuv::kRotate0, libyuv::FOURCC_NV12);
+  // Path 2: NV12ToI420 on source pointers advanced past the cropped rows.
+  NV12ToI420(src_y + crop_y * kWidth, kWidth,
+             src_uv + (crop_y / 2) * kStrideUV * 2,
+               kStrideUV * 2,
+             dst_y, kDestWidth,
+             dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X),
+             dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X),
+             kDestWidth, kDestHeight);
+  // Both paths must agree; index Y with the stride the planes were written.
+  for (int i = 0; i < kDestHeight; ++i) {
+    for (int j = 0; j < kDestWidth; ++j) {
+      EXPECT_EQ(dst_y[i * kDestWidth + j], dst_y_2[i * kDestWidth + j]);
+    }
+  }
+  for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+    for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+      EXPECT_EQ(dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
+                dst_u_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+    }
+  }
+  for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+    for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+      EXPECT_EQ(dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
+                dst_v_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+    }
+  }
+  free_aligned_buffer_64(dst_y);
+  free_aligned_buffer_64(dst_u);
+  free_aligned_buffer_64(dst_v);
+  free_aligned_buffer_64(dst_y_2);
+  free_aligned_buffer_64(dst_u_2);
+  free_aligned_buffer_64(dst_v_2);
+  free_aligned_buffer_64(src_y);
+}
+
+TEST_F(LibYUVConvertTest, TestYToARGB) {
+  uint8 y[32];
+  uint8 expectedg[32];
+  for (int i = 0; i < 32; ++i) {
+    y[i] = i * 5 + 17;  // Ramp 17, 22, ..., 172.
+    expectedg[i] = static_cast<int>((y[i] - 16) * 1.164f + 0.5f);
+  }
+  uint8 argb[32 * 4];
+  YToARGB(y, 0, argb, 0, 32, 1);
+  // Print input, expected value and all four output bytes of every pixel.
+  for (int i = 0; i < 32; ++i) {
+    printf("%2d %d: %d <-> %d,%d,%d,%d\n", i, y[i], expectedg[i],
+           argb[i * 4 + 0],
+           argb[i * 4 + 1],
+           argb[i * 4 + 2],
+           argb[i * 4 + 3]);
+  }
+  for (int i = 0; i < 32; ++i) {
+    EXPECT_EQ(expectedg[i], argb[i * 4 + 0]);  // Only byte 0 is checked.
+  }
+}
+
+static const uint8 kNoDither4x4[16] = {  // All-zero table for TestNoDither.
+  0, 0, 0, 0,
+  0, 0, 0, 0,
+  0, 0, 0, 0,
+  0, 0, 0, 0,
+};
+
+TEST_F(LibYUVConvertTest, TestNoDither) {
+  const int kPixels = benchmark_width_ * benchmark_height_;  // Pixel count.
+  align_buffer_64(src_argb, kPixels * 4);
+  align_buffer_64(dst_rgb565, kPixels * 2);
+  align_buffer_64(dst_rgb565dither, kPixels * 2);
+  MemRandomize(src_argb, kPixels * 4);
+  MemRandomize(dst_rgb565, kPixels * 2);
+  MemRandomize(dst_rgb565dither, kPixels * 2);
+  ARGBToRGB565(src_argb, benchmark_width_ * 4,
+               dst_rgb565, benchmark_width_ * 2,
+               benchmark_width_, benchmark_height_);
+  ARGBToRGB565Dither(src_argb, benchmark_width_ * 4,
+                     dst_rgb565dither, benchmark_width_ * 2,
+                     kNoDither4x4, benchmark_width_, benchmark_height_);
+  for (int n = 0; n < kPixels * 2; ++n) {  // Zero table: outputs must match.
+    EXPECT_EQ(dst_rgb565[n], dst_rgb565dither[n]);
+  }
+  free_aligned_buffer_64(src_argb);
+  free_aligned_buffer_64(dst_rgb565);
+  free_aligned_buffer_64(dst_rgb565dither);
+}
+
+// Ordered 4x4 dither table for 888 to 565 conversion; entries are 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+  0, 4, 1, 5,
+  6, 2, 7, 3,
+  1, 5, 0, 4,
+  7, 3, 6, 2,
+};
+
+TEST_F(LibYUVConvertTest, TestDither) {
+  const int kPixels = benchmark_width_ * benchmark_height_;  // Pixel count.
+  align_buffer_64(src_argb, kPixels * 4);
+  align_buffer_64(dst_rgb565, kPixels * 2);
+  align_buffer_64(dst_rgb565dither, kPixels * 2);
+  align_buffer_64(dst_argb, kPixels * 4);
+  align_buffer_64(dst_argbdither, kPixels * 4);
+  MemRandomize(src_argb, kPixels * 4);
+  MemRandomize(dst_rgb565, kPixels * 2);
+  MemRandomize(dst_rgb565dither, kPixels * 2);
+  MemRandomize(dst_argb, kPixels * 4);
+  MemRandomize(dst_argbdither, kPixels * 4);
+  ARGBToRGB565(src_argb, benchmark_width_ * 4,
+               dst_rgb565, benchmark_width_ * 2,
+               benchmark_width_, benchmark_height_);
+  ARGBToRGB565Dither(src_argb, benchmark_width_ * 4,
+                     dst_rgb565dither, benchmark_width_ * 2,
+                     kDither565_4x4, benchmark_width_, benchmark_height_);
+  RGB565ToARGB(dst_rgb565, benchmark_width_ * 2,
+               dst_argb, benchmark_width_ * 4,
+               benchmark_width_, benchmark_height_);
+  RGB565ToARGB(dst_rgb565dither, benchmark_width_ * 2,
+               dst_argbdither, benchmark_width_ * 4,
+               benchmark_width_, benchmark_height_);
+  for (int n = 0; n < kPixels * 4; ++n) {  // Compare after expanding to ARGB.
+    EXPECT_NEAR(dst_argb[n], dst_argbdither[n], 9);
+  }
+  free_aligned_buffer_64(src_argb);
+  free_aligned_buffer_64(dst_rgb565);
+  free_aligned_buffer_64(dst_rgb565dither);
+  free_aligned_buffer_64(dst_argb);
+  free_aligned_buffer_64(dst_argbdither);
+}
+
+#define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                       YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C)         \
+TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) {                  \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;  /* Clamp to >= 1. */        \
+  const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                     \
+  const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                        \
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                          \
+  const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);               \
+  align_buffer_64(src_y, kWidth * kHeight + OFF);                              \
+  align_buffer_64(src_u, kSizeUV + OFF);                                       \
+  align_buffer_64(src_v, kSizeUV + OFF);                                       \
+  align_buffer_64(dst_argb_c, kStrideB * kHeight + OFF);                       \
+  align_buffer_64(dst_argb_opt, kStrideB * kHeight + OFF);                     \
+  for (int i = 0; i < kWidth * kHeight; ++i) {                                 \
+    src_y[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  for (int i = 0; i < kSizeUV; ++i) {                                          \
+    src_u[i + OFF] = (fastrand() & 0xff);                                      \
+    src_v[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  memset(dst_argb_c + OFF, 1, kStrideB * kHeight);                             \
+  memset(dst_argb_opt + OFF, 101, kStrideB * kHeight);                         \
+  MaskCpuFlags(disable_cpu_flags_);  /* Pass 1 writes dst_argb_c. */           \
+  FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth,                           \
+                        src_u + OFF, kStrideUV,                                \
+                        src_v + OFF, kStrideUV,                                \
+                        dst_argb_c + OFF, kStrideB,                            \
+                        NULL, kWidth, NEG kHeight);                            \
+  MaskCpuFlags(benchmark_cpu_info_);  /* Pass 2 writes dst_argb_opt. */        \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth,                         \
+                          src_u + OFF, kStrideUV,                              \
+                          src_v + OFF, kStrideUV,                              \
+                          dst_argb_opt + OFF, kStrideB,                        \
+                          NULL, kWidth, NEG kHeight);                          \
+  }                                                                            \
+  int max_diff = 0;                                                            \
+  /* Expand both outputs to FMT_C so they can be compared byte by byte. */     \
+  align_buffer_64(dst_argb32_c, kWidth * BPP_C  * kHeight);                    \
+  align_buffer_64(dst_argb32_opt, kWidth * BPP_C  * kHeight);                  \
+  memset(dst_argb32_c, 2, kWidth * BPP_C  * kHeight);                          \
+  memset(dst_argb32_opt, 102, kWidth * BPP_C  * kHeight);                      \
+  FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB,                                 \
+                   dst_argb32_c, kWidth * BPP_C ,                              \
+                   kWidth, kHeight);                                           \
+  FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB,                               \
+                   dst_argb32_opt, kWidth * BPP_C ,                            \
+                   kWidth, kHeight);                                           \
+  for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) {                         \
+    int abs_diff =                                                             \
+        abs(static_cast<int>(dst_argb32_c[i]) -                                \
+            static_cast<int>(dst_argb32_opt[i]));                              \
+    if (abs_diff > max_diff) {                                                 \
+      max_diff = abs_diff;                                                     \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, DIFF);  /* Worst per-byte difference. */                 \
+  free_aligned_buffer_64(src_y);                                               \
+  free_aligned_buffer_64(src_u);                                               \
+  free_aligned_buffer_64(src_v);                                               \
+  free_aligned_buffer_64(dst_argb_c);                                          \
+  free_aligned_buffer_64(dst_argb_opt);                                        \
+  free_aligned_buffer_64(dst_argb32_c);                                        \
+  free_aligned_buffer_64(dst_argb32_opt);                                      \
+}
+
+#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,  \
+                      YALIGN, DIFF, FMT_C, BPP_C)                              \
+    TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C)          \
+    TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C)        \
+    TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C)           \
+    TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,     \
+        YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+// Generates I420ToRGB565Dither{_Any,_Unaligned,_Invert,_Opt}; DIFF is 9.
+TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)
+
+#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12)                                 \
+TEST_F(LibYUVConvertTest, NAME) {                                              \
+  const int kWidth = benchmark_width_;                                         \
+  const int kHeight = benchmark_height_;                                       \
+  /* Source buffer: 4 * SUBSAMPLE(kWidth, 2) bytes per row. */                 \
+  align_buffer_64(orig_uyvy,                                                   \
+                  4 * SUBSAMPLE(kWidth, 2) * kHeight);                         \
+  align_buffer_64(orig_y, kWidth * kHeight);                                   \
+  align_buffer_64(orig_u,                                                      \
+                  SUBSAMPLE(kWidth, 2) *                                       \
+                  SUBSAMPLE(kHeight, 2));                                      \
+  align_buffer_64(orig_v,                                                      \
+                  SUBSAMPLE(kWidth, 2) *                                       \
+                  SUBSAMPLE(kHeight, 2));                                      \
+  /* Reference NV12 planes produced by the 2-step path. */                     \
+  align_buffer_64(dst_y_orig, kWidth * kHeight);                               \
+  align_buffer_64(dst_uv_orig, 2 *                                             \
+                  SUBSAMPLE(kWidth, 2) *                                       \
+                  SUBSAMPLE(kHeight, 2));                                      \
+  /* NV12 planes written by the function under test. */                        \
+  align_buffer_64(dst_y, kWidth * kHeight);                                    \
+  align_buffer_64(dst_uv, 2 *                                                  \
+                  SUBSAMPLE(kWidth, 2) *                                       \
+                  SUBSAMPLE(kHeight, 2));                                      \
+                                                                               \
+  MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight);                 \
+                                                                               \
+  /* Convert source to NV12 in 2 steps (via I420) for reference */             \
+  libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2),                      \
+                     orig_y, kWidth,                                           \
+                     orig_u, SUBSAMPLE(kWidth, 2),                             \
+                     orig_v, SUBSAMPLE(kWidth, 2),                             \
+                     kWidth, kHeight);                                         \
+  libyuv::I420ToNV12(orig_y, kWidth,                                           \
+                     orig_u, SUBSAMPLE(kWidth, 2),                             \
+                     orig_v, SUBSAMPLE(kWidth, 2),                             \
+                     dst_y_orig, kWidth,                                       \
+                     dst_uv_orig, 2 * SUBSAMPLE(kWidth, 2),                    \
+                     kWidth, kHeight);                                         \
+                                                                               \
+  /* Convert to NV12 in 1 step; repeated benchmark_iterations_ times */        \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2),                    \
+                       dst_y, kWidth,                                          \
+                       dst_uv, 2 * SUBSAMPLE(kWidth, 2),                       \
+                       kWidth, kHeight);                                       \
+  }                                                                            \
+  /* Y must match both references; UV must match the 2-step NV12. */           \
+  for (int i = 0; i < kWidth * kHeight; ++i) {                                 \
+    EXPECT_EQ(orig_y[i], dst_y[i]);                                            \
+  }                                                                            \
+  for (int i = 0; i < kWidth * kHeight; ++i) {                                 \
+    EXPECT_EQ(dst_y_orig[i], dst_y[i]);                                        \
+  }                                                                            \
+  for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); ++i) { \
+    EXPECT_EQ(dst_uv_orig[i], dst_uv[i]);                                      \
+  }                                                                            \
+                                                                               \
+  free_aligned_buffer_64(orig_uyvy);                                           \
+  free_aligned_buffer_64(orig_y);                                              \
+  free_aligned_buffer_64(orig_u);                                              \
+  free_aligned_buffer_64(orig_v);                                              \
+  free_aligned_buffer_64(dst_y_orig);                                          \
+  free_aligned_buffer_64(dst_uv_orig);                                         \
+  free_aligned_buffer_64(dst_y);                                               \
+  free_aligned_buffer_64(dst_uv);                                              \
+}
+// Args: test name, source-to-I420 function, source-to-NV12 function.
+TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12)
+TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12)
+
+#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,  \
+                       W1280, N, NEG, OFF, FMT_C, BPP_C)                       \
+TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) {                \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;  /* Clamp to >= 1. */        \
+  const int kHeight = benchmark_height_;                                       \
+  const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                       \
+  const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                          \
+  const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);               \
+  align_buffer_64(src_y, kWidth * kHeight + OFF);                              \
+  align_buffer_64(src_u, kSizeUV + OFF);                                       \
+  align_buffer_64(src_v, kSizeUV + OFF);                                       \
+  align_buffer_64(dst_argb_b, kStrideB * kHeight + OFF);                       \
+  for (int i = 0; i < kWidth * kHeight; ++i) {                                 \
+    src_y[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  for (int i = 0; i < kSizeUV; ++i) {                                          \
+    src_u[i + OFF] = (fastrand() & 0xff);                                      \
+    src_v[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  memset(dst_argb_b + OFF, 1, kStrideB * kHeight);                             \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth,                                 \
+                          src_u + OFF, kStrideUV,                              \
+                          src_v + OFF, kStrideUV,                              \
+                          dst_argb_b + OFF, kStrideB,                          \
+                          kWidth, NEG kHeight);                                \
+  }                                                                            \
+  /* Convert to FMT_C in 1 step (direct) and 2 steps (via FMT_B); compare. */  \
+  const int kStrideC = kWidth * BPP_C;                                         \
+  align_buffer_64(dst_argb_c, kStrideC * kHeight + OFF);                       \
+  align_buffer_64(dst_argb_bc, kStrideC * kHeight + OFF);                      \
+  memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                             \
+  memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                            \
+  FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth,                                   \
+                        src_u + OFF, kStrideUV,                                \
+                        src_v + OFF, kStrideUV,                                \
+                        dst_argb_c + OFF, kStrideC,                            \
+                        kWidth, NEG kHeight);                                  \
+  /* Step 2: FMT_B to FMT_C; height is not negated again. */                   \
+  FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB,                                 \
+                   dst_argb_bc + OFF, kStrideC,                                \
+                   kWidth, kHeight);                                           \
+  for (int i = 0; i < kStrideC * kHeight; ++i) {                               \
+    EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]);                      \
+  }                                                                            \
+  free_aligned_buffer_64(src_y);                                               \
+  free_aligned_buffer_64(src_u);                                               \
+  free_aligned_buffer_64(src_v);                                               \
+  free_aligned_buffer_64(dst_argb_b);                                          \
+  free_aligned_buffer_64(dst_argb_c);                                          \
+  free_aligned_buffer_64(dst_argb_bc);                                         \
+}
+
+#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,   \
+                      FMT_C, BPP_C)                                            \
+    TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,      \
+        benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C)                        \
+    TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,      \
+        benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C)                      \
+    TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,      \
+        benchmark_width_, _Invert, -, 0, FMT_C, BPP_C)                         \
+    TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,      \
+        benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
+// Args: FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, FMT_C, BPP_C.
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4)
+TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3)
+TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3)
+TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
+TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
+TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
+TESTPLANARTOE(I411, 4, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4)
+TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
+TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
+
+// Checks that planar+alpha -> FMT_C equals planar+alpha -> FMT_B -> FMT_C.
+#define TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
+                       W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN)                \
+TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) {                \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                       \
+  const int kSizeUV =                                                          \
+    SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y);              \
+  align_buffer_64(src_y, kWidth * kHeight + OFF);                              \
+  align_buffer_64(src_u, kSizeUV + OFF);                                       \
+  align_buffer_64(src_v, kSizeUV + OFF);                                       \
+  align_buffer_64(src_a, kWidth * kHeight + OFF);                              \
+  align_buffer_64(dst_argb_b, kStrideB * kHeight + OFF);                       \
+  for (int i = 0; i < kWidth * kHeight; ++i) {                                 \
+    src_y[i + OFF] = (fastrand() & 0xff);                                      \
+    src_a[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  for (int i = 0; i < kSizeUV; ++i) {                                          \
+    src_u[i + OFF] = (fastrand() & 0xff);                                      \
+    src_v[i + OFF] = (fastrand() & 0xff);                                      \
+  }                                                                            \
+  memset(dst_argb_b + OFF, 1, kStrideB * kHeight);                             \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth,                                 \
+                          src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X),           \
+                          src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X),           \
+                          src_a + OFF, kWidth,                                 \
+                          dst_argb_b + OFF, kStrideB,                          \
+                          kWidth, NEG kHeight, ATTEN);                         \
+  }                                                                            \
+  /* Convert to a 3rd format in 1 step and 2 steps and compare  */             \
+  const int kStrideC = kWidth * BPP_C;                                         \
+  align_buffer_64(dst_argb_c, kStrideC * kHeight + OFF);                       \
+  align_buffer_64(dst_argb_bc, kStrideC * kHeight + OFF);                      \
+  memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                             \
+  memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                            \
+  FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth,                                   \
+                        src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X),             \
+                        src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X),             \
+                        src_a + OFF, kWidth,                                   \
+                        dst_argb_c + OFF, kStrideC,                            \
+                        kWidth, NEG kHeight, ATTEN);                           \
+  /* Two-step path: convert the intermediate B image to C. */                  \
+  FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB,                                 \
+                   dst_argb_bc + OFF, kStrideC,                                \
+                   kWidth, kHeight);                                           \
+  for (int i = 0; i < kStrideC * kHeight; ++i) {                               \
+    EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]);                      \
+  }                                                                            \
+  free_aligned_buffer_64(src_y);                                               \
+  free_aligned_buffer_64(src_u);                                               \
+  free_aligned_buffer_64(src_v);                                               \
+  free_aligned_buffer_64(src_a);                                               \
+  free_aligned_buffer_64(dst_argb_b);                                          \
+  free_aligned_buffer_64(dst_argb_c);                                          \
+  free_aligned_buffer_64(dst_argb_bc);                                         \
+}
+
+#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,  \
+                      FMT_C, BPP_C)                                            \
+    TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,     \
+        benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C, 0)  /* width - 4 */    \
+    TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,     \
+        benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C, 0)  /* OFF = 1 */    \
+    TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,     \
+        benchmark_width_, _Invert, -, 0, FMT_C, BPP_C, 0)  /* -kHeight */      \
+    TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,     \
+        benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0)  /* baseline */         \
+      TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B,   \
+          benchmark_width_, _Premult, +, 0, FMT_C, BPP_C, 1)  /* ATTEN = 1 */
+
+TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)  // Via ARGB, to ABGR.
+TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)  // Via ABGR, to ARGB.
+
+}  // namespace libyuv
diff --git a/libs/libyuv/unit_test/cpu_test.cc b/libs/libyuv/unit_test/cpu_test.cc
new file mode 100644
index 0000000000..5933ee442d
--- /dev/null
+++ b/libs/libyuv/unit_test/cpu_test.cc
@@ -0,0 +1,147 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"  // For HAS_ARGBSHUFFLEROW_AVX2.
+#include "libyuv/version.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+TEST_F(LibYUVBaseTest, TestCpuHas) {
+  // Print TestCpuFlag(-1) first, then each individual flag query result.
+  printf("Cpu Flags %x\n", TestCpuFlag(-1));
+  const int arm = TestCpuFlag(kCpuHasARM);
+  printf("Has ARM %x\n", arm);
+  const int neon = TestCpuFlag(kCpuHasNEON);
+  printf("Has NEON %x\n", neon);
+  const int x86 = TestCpuFlag(kCpuHasX86);
+  printf("Has X86 %x\n", x86);
+  const int sse2 = TestCpuFlag(kCpuHasSSE2);
+  printf("Has SSE2 %x\n", sse2);
+  const int ssse3 = TestCpuFlag(kCpuHasSSSE3);
+  printf("Has SSSE3 %x\n", ssse3);
+  const int sse41 = TestCpuFlag(kCpuHasSSE41);
+  printf("Has SSE4.1 %x\n", sse41);
+  const int sse42 = TestCpuFlag(kCpuHasSSE42);
+  printf("Has SSE4.2 %x\n", sse42);
+  const int avx = TestCpuFlag(kCpuHasAVX);
+  printf("Has AVX %x\n", avx);
+  const int avx2 = TestCpuFlag(kCpuHasAVX2);
+  printf("Has AVX2 %x\n", avx2);
+  const int erms = TestCpuFlag(kCpuHasERMS);
+  printf("Has ERMS %x\n", erms);
+  const int fma3 = TestCpuFlag(kCpuHasFMA3);
+  printf("Has FMA3 %x\n", fma3);
+  const int avx3 = TestCpuFlag(kCpuHasAVX3);
+  printf("Has AVX3 %x\n", avx3);
+  const int mips = TestCpuFlag(kCpuHasMIPS);
+  printf("Has MIPS %x\n", mips);
+  const int dspr2 = TestCpuFlag(kCpuHasDSPR2);
+  printf("Has DSPR2 %x\n", dspr2);
+}
+
+TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) {
+#if defined(__aarch64__)
+  printf("Arm64 build\n");
+#endif
+#if defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)
+  printf("Neon build enabled\n");
+#endif
+#if defined(__x86_64__) || defined(_M_X64)
+  printf("x64 build\n");
+#endif
+#if defined(_MSC_VER)
+  printf("_MSC_VER %d\n", _MSC_VER);
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && (defined(GCC_HAS_AVX2) || \
+    defined(CLANG_HAS_AVX2) || defined(VISUALC_HAS_AVX2))
+  // The compiler can build AVX2 code, so row.h must advertise the AVX2 row.
+  printf("Has AVX2 1\n");
+#ifndef HAS_ARGBSHUFFLEROW_AVX2
+  EXPECT_TRUE(0);  // HAS_ARGBSHUFFLEROW_AVX2 should have been defined.
+#endif
+#else
+  // No AVX2 capable compiler (or x86 disabled): the macro must be absent.
+  printf("Has AVX2 0\n");
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+  EXPECT_TRUE(0);  // HAS_ARGBSHUFFLEROW_AVX2 should not be defined.
+#endif
+#endif
+}
+
+#if defined(__i386__) || defined(__x86_64__) || \
+    defined(_M_IX86) || defined(_M_X64)
+TEST_F(LibYUVBaseTest, TestCpuId) {
+  int has_x86 = TestCpuFlag(kCpuHasX86);
+  if (has_x86) {
+    uint32 cpu_info[4];
+    // Vendor ID:
+    // AuthenticAMD AMD processor
+    // CentaurHauls Centaur processor
+    // CyrixInstead Cyrix processor
+    // GenuineIntel Intel processor
+    // GenuineTMx86 Transmeta processor
+    // Geode by NSC National Semiconductor processor
+    // NexGenDriven NexGen processor
+    // RiseRiseRise Rise Technology processor
+    // SiS SiS SiS  SiS processor
+    // UMC UMC UMC  UMC processor
+    CpuId(0, 0, cpu_info);
+    cpu_info[0] = cpu_info[1];  // Reorder output: words 1, 3, 2 form the text.
+    cpu_info[1] = cpu_info[3];
+    cpu_info[3] = 0;  // Terminates the 12 character vendor string.
+    printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast<char*>(&cpu_info[0]),
+           cpu_info[0], cpu_info[1], cpu_info[2]);
+    EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0])));  // size_t
+
+    // CPU Family and Model
+    // 3:0 - Stepping
+    // 7:4 - Model
+    // 11:8 - Family
+    // 13:12 - Processor Type
+    // 19:16 - Extended Model
+    // 27:20 - Extended Family
+    CpuId(1, 0, cpu_info);
+    int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
+    int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
+    printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
+           model, model);
+  }
+}
+#endif
+
+// Returns 1 if |file_name| can be opened for reading, otherwise 0.
+// The handle is closed again before returning.
+static int FileExists(const char* file_name) {
+  FILE* handle = fopen(file_name, "r");
+  const int found = handle ? 1 : 0;
+  if (found) fclose(handle);
+  return found;
+}
+
+TEST_F(LibYUVBaseTest, TestLinuxNeon) {
+  if (!FileExists("../../unit_test/testdata/arm_v7.txt")) {
+    printf("WARNING: unable to load \"../../unit_test/testdata/arm_v7.txt\"\n");
+  } else {  // Check ArmCpuCaps on sample files under unit_test/testdata.
+    EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
+    EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
+    EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/juno.txt"));
+  }
+#if defined(__linux__) && defined(__ARM_NEON__)
+  EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("/proc/cpuinfo"));  // The running system.
+#endif
+}
+
+}  // namespace libyuv
diff --git a/libs/libyuv/unit_test/math_test.cc b/libs/libyuv/unit_test/math_test.cc
new file mode 100644
index 0000000000..6297954232
--- /dev/null
+++ b/libs/libyuv/unit_test/math_test.cc
@@ -0,0 +1,156 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+TEST_F(LibYUVBaseTest, TestFixedDiv) {  // Exact and random FixedDiv checks.
+  int num[1280];
+  int div[1280];
+  int result_opt[1280];
+  int result_c[1280];
+
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(1, 1));  // 1 / 1 scaled by 65536.
+  EXPECT_EQ(0x7fff0000, libyuv::FixedDiv(0x7fff, 1));
+  // TODO(fbarchard): Avoid the following that throw exceptions.
+  // EXPECT_EQ(0x100000000, libyuv::FixedDiv(0x10000, 1));
+  // EXPECT_EQ(0x80000000, libyuv::FixedDiv(0x8000, 1));
+
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(640 * 2, 640));
+  EXPECT_EQ(0x30000, libyuv::FixedDiv(640 * 3, 640));
+  EXPECT_EQ(0x40000, libyuv::FixedDiv(640 * 4, 640));
+  EXPECT_EQ(0x50000, libyuv::FixedDiv(640 * 5, 640));
+  EXPECT_EQ(0x60000, libyuv::FixedDiv(640 * 6, 640));
+  EXPECT_EQ(0x70000, libyuv::FixedDiv(640 * 7, 640));
+  EXPECT_EQ(0x80000, libyuv::FixedDiv(640 * 8, 640));
+  EXPECT_EQ(0xa0000, libyuv::FixedDiv(640 * 10, 640));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(960 * 2, 960));
+  EXPECT_EQ(0x08000, libyuv::FixedDiv(640 / 2, 640));
+  EXPECT_EQ(0x04000, libyuv::FixedDiv(640 / 4, 640));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(1080 * 2, 1080));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(200000, 100000));
+  EXPECT_EQ(0x18000, libyuv::FixedDiv(150000, 100000));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(40000, 20000));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(-40000, -20000));  // Signs cancel.
+  EXPECT_EQ(-0x20000, libyuv::FixedDiv(40000, -20000));
+  EXPECT_EQ(-0x20000, libyuv::FixedDiv(-40000, 20000));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(4095, 4095));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(4096, 4096));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(4097, 4097));
+  EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
+
+  for (int i = 1; i < 4100; ++i) {  // Simple ratios for many divisors.
+    EXPECT_EQ(0x10000, libyuv::FixedDiv(i, i));
+    EXPECT_EQ(0x20000, libyuv::FixedDiv(i * 2, i));
+    EXPECT_EQ(0x30000, libyuv::FixedDiv(i * 3, i));
+    EXPECT_EQ(0x40000, libyuv::FixedDiv(i * 4, i));
+    EXPECT_EQ(0x08000, libyuv::FixedDiv(i, i * 2));
+    EXPECT_NEAR(16384 * 65536 / i, libyuv::FixedDiv(16384, i), 1);
+  }
+  EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
+
+  MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
+  MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
+  for (int j = 0; j < 1280; ++j) {
+    if (div[j] == 0) {
+      div[j] = 1280;  // Replace a zero divisor.
+    }
+    num[j] &= 0xffff;  // Clamp to avoid divide overflow.
+  }
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {  // Timing loop.
+    for (int j = 0; j < 1280; ++j) {
+      result_opt[j] = libyuv::FixedDiv(num[j], div[j]);
+    }
+  }
+  for (int j = 0; j < 1280; ++j) {  // Compare with FixedDiv_C.
+    result_c[j] = libyuv::FixedDiv_C(num[j], div[j]);
+    EXPECT_NEAR(result_c[j], result_opt[j], 1);
+  }
+}
+
+TEST_F(LibYUVBaseTest, TestFixedDiv_Opt) {
+  // Runs FixedDiv when the x86 flag is set (FixedDiv_C otherwise) on random
+  // 12 bit operands and expects results within 1 of FixedDiv_C.
+  const int kCount = 1280;
+  int numer[kCount];
+  int denom[kCount];
+  int result_opt[kCount];
+  int result_c[kCount];
+
+  MemRandomize(reinterpret_cast<uint8*>(numer), sizeof(numer));
+  MemRandomize(reinterpret_cast<uint8*>(denom), sizeof(denom));
+  for (int k = 0; k < kCount; ++k) {
+    numer[k] &= 0xfff;
+    denom[k] &= 0xfff;
+    denom[k] = (denom[k] == 0) ? kCount : denom[k];  // No divide by zero.
+  }
+  const bool use_opt = TestCpuFlag(kCpuHasX86) != 0;
+  for (int pass = 0; pass < benchmark_pixels_div1280_; ++pass) {
+    if (!use_opt) {
+      for (int k = 0; k < kCount; ++k) {
+        result_opt[k] = libyuv::FixedDiv_C(numer[k], denom[k]);
+      }
+    } else {
+      for (int k = 0; k < kCount; ++k) {
+        result_opt[k] = libyuv::FixedDiv(numer[k], denom[k]);
+      }
+    }
+  }
+  for (int j = 0; j < kCount; ++j) {
+    result_c[j] = libyuv::FixedDiv_C(numer[j], denom[j]);
+    EXPECT_NEAR(result_c[j], result_opt[j], 1);
+  }
+}
+
+TEST_F(LibYUVBaseTest, TestFixedDiv1_Opt) {
+  // Runs FixedDiv1 when the x86 flag is set (FixedDiv1_C otherwise) on random
+  // 12 bit operands and expects results within 1 of FixedDiv1_C.
+  const int kCount = 1280;
+  int numer[kCount];
+  int denom[kCount];
+  int result_opt[kCount];
+  int result_c[kCount];
+
+  MemRandomize(reinterpret_cast<uint8*>(numer), sizeof(numer));
+  MemRandomize(reinterpret_cast<uint8*>(denom), sizeof(denom));
+  for (int k = 0; k < kCount; ++k) {
+    numer[k] &= 0xfff;
+    denom[k] &= 0xfff;
+    denom[k] = (denom[k] <= 1) ? kCount : denom[k];  // Divisors 0, 1 replaced.
+  }
+  const bool use_opt = TestCpuFlag(kCpuHasX86) != 0;
+  for (int pass = 0; pass < benchmark_pixels_div1280_; ++pass) {
+    if (!use_opt) {
+      for (int k = 0; k < kCount; ++k) {
+        result_opt[k] = libyuv::FixedDiv1_C(numer[k], denom[k]);
+      }
+    } else {
+      for (int k = 0; k < kCount; ++k) {
+        result_opt[k] = libyuv::FixedDiv1(numer[k], denom[k]);
+      }
+    }
+  }
+  for (int j = 0; j < kCount; ++j) {
+    result_c[j] = libyuv::FixedDiv1_C(numer[j], denom[j]);
+    EXPECT_NEAR(result_c[j], result_opt[j], 1);
+  }
+}
+
+}  // namespace libyuv
diff --git a/libs/libyuv/unit_test/planar_test.cc b/libs/libyuv/unit_test/planar_test.cc
new file mode 100644
index 0000000000..9146c9a455
--- /dev/null
+++ b/libs/libyuv/unit_test/planar_test.cc
@@ -0,0 +1,2532 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/row.h"  // For Sobel
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+TEST_F(LibYUVPlanarTest, TestAttenuate) {  // Attenuate and unattenuate.
+  const int kSize = 1280 * 4;
+  align_buffer_64(orig_pixels, kSize);
+  align_buffer_64(atten_pixels, kSize);
+  align_buffer_64(unatten_pixels, kSize);
+  align_buffer_64(atten2_pixels, kSize);
+
+  // Test unattenuation clamps
+  orig_pixels[0 * 4 + 0] = 200u;
+  orig_pixels[0 * 4 + 1] = 129u;
+  orig_pixels[0 * 4 + 2] = 127u;
+  orig_pixels[0 * 4 + 3] = 128u;
+  // Test unattenuation transparent and opaque are unaffected
+  orig_pixels[1 * 4 + 0] = 16u;
+  orig_pixels[1 * 4 + 1] = 64u;
+  orig_pixels[1 * 4 + 2] = 192u;
+  orig_pixels[1 * 4 + 3] = 0u;
+  orig_pixels[2 * 4 + 0] = 16u;
+  orig_pixels[2 * 4 + 1] = 64u;
+  orig_pixels[2 * 4 + 2] = 192u;
+  orig_pixels[2 * 4 + 3] = 255u;
+  orig_pixels[3 * 4 + 0] = 16u;
+  orig_pixels[3 * 4 + 1] = 64u;
+  orig_pixels[3 * 4 + 2] = 192u;
+  orig_pixels[3 * 4 + 3] = 128u;
+  ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1);  // 4 wide, 1 high.
+  EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]);
+  EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]);
+  EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]);
+  EXPECT_EQ(128u, unatten_pixels[0 * 4 + 3]);
+  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 0]);
+  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]);
+  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]);
+  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]);
+  EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]);
+  EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]);
+  EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]);
+  EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]);
+  EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]);
+  EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]);
+  EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]);
+  EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]);
+
+  for (int i = 0; i < 1280; ++i) {  // Gradient test pattern.
+    orig_pixels[i * 4 + 0] = i;
+    orig_pixels[i * 4 + 1] = i / 2;
+    orig_pixels[i * 4 + 2] = i / 3;
+    orig_pixels[i * 4 + 3] = i;
+  }
+  ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 1280, 1);
+  ARGBUnattenuate(atten_pixels, 0, unatten_pixels, 0, 1280, 1);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {  // Timing loop.
+    ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1);
+  }
+  for (int i = 0; i < 1280; ++i) {  // Round trip must stay within 2.
+    EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2);
+    EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2);
+    EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2);
+    EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2);
+  }
+  // Make sure transparent, 50% and opaque are fully accurate.
+  EXPECT_EQ(0, atten_pixels[0 * 4 + 0]);
+  EXPECT_EQ(0, atten_pixels[0 * 4 + 1]);
+  EXPECT_EQ(0, atten_pixels[0 * 4 + 2]);
+  EXPECT_EQ(0, atten_pixels[0 * 4 + 3]);
+  EXPECT_EQ(64, atten_pixels[128 * 4 + 0]);
+  EXPECT_EQ(32, atten_pixels[128 * 4 + 1]);
+  EXPECT_EQ(21,  atten_pixels[128 * 4 + 2]);
+  EXPECT_EQ(128, atten_pixels[128 * 4 + 3]);
+  EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1);
+  EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1);
+  EXPECT_NEAR(85,  atten_pixels[255 * 4 + 2], 1);
+  EXPECT_EQ(255, atten_pixels[255 * 4 + 3]);
+
+  free_aligned_buffer_64(atten2_pixels);
+  free_aligned_buffer_64(unatten_pixels);
+  free_aligned_buffer_64(atten_pixels);
+  free_aligned_buffer_64(orig_pixels);
+}
+
+// Runs ARGBAttenuate on random ARGB once after MaskCpuFlags(disable_cpu_flags)
+// and |benchmark_iterations| times after MaskCpuFlags(benchmark_cpu_info).
+// Returns the largest absolute byte difference between the two outputs.
+static int TestAttenuateI(int width, int height, int benchmark_iterations,
+                          int disable_cpu_flags, int benchmark_cpu_info,
+                          int invert, int off) {
+  width = (width < 1) ? 1 : width;
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  const int kBytes = kStride * height;
+  align_buffer_64(src_argb, kBytes + off);
+  align_buffer_64(dst_argb_c, kBytes);
+  align_buffer_64(dst_argb_opt, kBytes);
+  for (int n = 0; n < kBytes; ++n) {
+    src_argb[n + off] = (fastrand() & 0xff);
+  }
+  memset(dst_argb_c, 0, kBytes);
+  memset(dst_argb_opt, 0, kBytes);
+
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBAttenuate(src_argb + off, kStride, dst_argb_c, kStride,
+                width, invert * height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int pass = 0; pass < benchmark_iterations; ++pass) {
+    ARGBAttenuate(src_argb + off, kStride, dst_argb_opt, kStride,
+                  width, invert * height);
+  }
+
+  // Scan both outputs for the largest absolute per byte difference.
+  int worst = 0;
+  for (int n = 0; n < kBytes; ++n) {
+    const int delta = abs(static_cast<int>(dst_argb_c[n]) -
+                          static_cast<int>(dst_argb_opt[n]));
+    worst = (delta > worst) ? delta : worst;
+  }
+
+  free_aligned_buffer_64(src_argb);
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+  return worst;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) {
+  // Width is benchmark_width_ - 1; both passes must agree to within 2.
+  int max_diff = TestAttenuateI(
+      benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) {
+  // off = 1 shifts the source by one byte; passes must agree to within 2.
+  int max_diff = TestAttenuateI(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) {
+  // invert = -1 passes a negative height; passes must agree to within 2.
+  int max_diff = TestAttenuateI(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) {
+  // Full width, no offset, positive height; passes must agree to within 2.
+  int max_diff = TestAttenuateI(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+// Attenuates random ARGB in place, then runs ARGBUnattenuate once after
+// MaskCpuFlags(disable_cpu_flags) and |benchmark_iterations| times after
+// MaskCpuFlags(benchmark_cpu_info). Returns the largest byte difference.
+static int TestUnattenuateI(int width, int height, int benchmark_iterations,
+                            int disable_cpu_flags, int benchmark_cpu_info,
+                            int invert, int off) {
+  width = (width < 1) ? 1 : width;
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  const int kBytes = kStride * height;
+  align_buffer_64(src_argb, kBytes + off);
+  align_buffer_64(dst_argb_c, kBytes);
+  align_buffer_64(dst_argb_opt, kBytes);
+  for (int n = 0; n < kBytes; ++n) {
+    src_argb[n + off] = (fastrand() & 0xff);
+  }
+  // Attenuate the random source in place before unattenuating it.
+  ARGBAttenuate(src_argb + off, kStride, src_argb + off, kStride,
+                width, height);
+  memset(dst_argb_c, 0, kBytes);
+  memset(dst_argb_opt, 0, kBytes);
+
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBUnattenuate(src_argb + off, kStride, dst_argb_c, kStride,
+                  width, invert * height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int pass = 0; pass < benchmark_iterations; ++pass) {
+    ARGBUnattenuate(src_argb + off, kStride, dst_argb_opt, kStride,
+                    width, invert * height);
+  }
+
+  // Scan both outputs for the largest absolute per byte difference.
+  int worst = 0;
+  for (int n = 0; n < kBytes; ++n) {
+    const int delta = abs(static_cast<int>(dst_argb_c[n]) -
+                          static_cast<int>(dst_argb_opt[n]));
+    worst = (delta > worst) ? delta : worst;
+  }
+
+  free_aligned_buffer_64(src_argb);
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+  return worst;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) {
+  // Width is benchmark_width_ - 1; both passes must agree to within 2.
+  int max_diff = TestUnattenuateI(
+      benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) {
+  // off = 1 shifts the source by one byte; passes must agree to within 2.
+  int max_diff = TestUnattenuateI(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) {
+  // invert = -1 passes a negative height; passes must agree to within 2.
+  int max_diff = TestUnattenuateI(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) {
+  // Full width, no offset, positive height; passes must agree to within 2.
+  int max_diff = TestUnattenuateI(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
+  // Every input pixel is {1, 2, 3, 255}; entry (x, y) of the output is
+  // expected to hold (x + 1) * (y + 1) times each channel value.
+  static const uint8 kPixel[4] = { 1u, 2u, 3u, 255u };
+  SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
+  SIMD_ALIGNED(int32 added_pixels[16][16][4]);
+
+  for (int row = 0; row < 16; ++row) {
+    for (int col = 0; col < 16; ++col) {
+      for (int ch = 0; ch < 4; ++ch) {
+        orig_pixels[row][col][ch] = kPixel[ch];
+      }
+    }
+  }
+  ARGBComputeCumulativeSum(&orig_pixels[0][0][0], 16 * 4,
+                           &added_pixels[0][0][0], 16 * 4, 16, 16);
+
+  for (int y = 0; y < 16; ++y) {
+    for (int x = 0; x < 16; ++x) {
+      EXPECT_EQ((x + 1) * (y + 1), added_pixels[y][x][0]);
+      EXPECT_EQ((x + 1) * (y + 1) * 2, added_pixels[y][x][1]);
+      EXPECT_EQ((x + 1) * (y + 1) * 3, added_pixels[y][x][2]);
+      EXPECT_EQ((x + 1) * (y + 1) * 255, added_pixels[y][x][3]);
+    }
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBGray) {  // ARGBGray on known pixels.
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  memset(orig_pixels, 0, sizeof(orig_pixels));
+
+  // Test blue
+  orig_pixels[0][0] = 255u;
+  orig_pixels[0][1] = 0u;
+  orig_pixels[0][2] = 0u;
+  orig_pixels[0][3] = 128u;
+  // Test green
+  orig_pixels[1][0] = 0u;
+  orig_pixels[1][1] = 255u;
+  orig_pixels[1][2] = 0u;
+  orig_pixels[1][3] = 0u;
+  // Test red
+  orig_pixels[2][0] = 0u;
+  orig_pixels[2][1] = 0u;
+  orig_pixels[2][2] = 255u;
+  orig_pixels[2][3] = 255u;
+  // Test black
+  orig_pixels[3][0] = 0u;
+  orig_pixels[3][1] = 0u;
+  orig_pixels[3][2] = 0u;
+  orig_pixels[3][3] = 255u;
+  // Test white
+  orig_pixels[4][0] = 255u;
+  orig_pixels[4][1] = 255u;
+  orig_pixels[4][2] = 255u;
+  orig_pixels[4][3] = 255u;
+  // Test color
+  orig_pixels[5][0] = 16u;
+  orig_pixels[5][1] = 64u;
+  orig_pixels[5][2] = 192u;
+  orig_pixels[5][3] = 224u;
+  // Do 16 to test asm version.
+  ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1);  // Rewrites orig_pixels.
+  EXPECT_EQ(30u, orig_pixels[0][0]);  // Channels 0..2 equal, 3 unchanged.
+  EXPECT_EQ(30u, orig_pixels[0][1]);
+  EXPECT_EQ(30u, orig_pixels[0][2]);
+  EXPECT_EQ(128u, orig_pixels[0][3]);
+  EXPECT_EQ(149u, orig_pixels[1][0]);
+  EXPECT_EQ(149u, orig_pixels[1][1]);
+  EXPECT_EQ(149u, orig_pixels[1][2]);
+  EXPECT_EQ(0u, orig_pixels[1][3]);
+  EXPECT_EQ(76u, orig_pixels[2][0]);
+  EXPECT_EQ(76u, orig_pixels[2][1]);
+  EXPECT_EQ(76u, orig_pixels[2][2]);
+  EXPECT_EQ(255u, orig_pixels[2][3]);
+  EXPECT_EQ(0u, orig_pixels[3][0]);
+  EXPECT_EQ(0u, orig_pixels[3][1]);
+  EXPECT_EQ(0u, orig_pixels[3][2]);
+  EXPECT_EQ(255u, orig_pixels[3][3]);
+  EXPECT_EQ(255u, orig_pixels[4][0]);
+  EXPECT_EQ(255u, orig_pixels[4][1]);
+  EXPECT_EQ(255u, orig_pixels[4][2]);
+  EXPECT_EQ(255u, orig_pixels[4][3]);
+  EXPECT_EQ(96u, orig_pixels[5][0]);
+  EXPECT_EQ(96u, orig_pixels[5][1]);
+  EXPECT_EQ(96u, orig_pixels[5][2]);
+  EXPECT_EQ(224u, orig_pixels[5][3]);
+  for (int i = 0; i < 1280; ++i) {  // Gradient input for the timing loop.
+    orig_pixels[i][0] = i;
+    orig_pixels[i][1] = i / 2;
+    orig_pixels[i][2] = i / 3;
+    orig_pixels[i][3] = i;
+  }
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {  // Timing only.
+    ARGBGray(&orig_pixels[0][0], 0, 0, 0, 1280, 1);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {  // ARGBGrayTo on known pixels.
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  SIMD_ALIGNED(uint8 gray_pixels[1280][4]);
+  memset(orig_pixels, 0, sizeof(orig_pixels));
+
+  // Test blue
+  orig_pixels[0][0] = 255u;
+  orig_pixels[0][1] = 0u;
+  orig_pixels[0][2] = 0u;
+  orig_pixels[0][3] = 128u;
+  // Test green
+  orig_pixels[1][0] = 0u;
+  orig_pixels[1][1] = 255u;
+  orig_pixels[1][2] = 0u;
+  orig_pixels[1][3] = 0u;
+  // Test red
+  orig_pixels[2][0] = 0u;
+  orig_pixels[2][1] = 0u;
+  orig_pixels[2][2] = 255u;
+  orig_pixels[2][3] = 255u;
+  // Test black
+  orig_pixels[3][0] = 0u;
+  orig_pixels[3][1] = 0u;
+  orig_pixels[3][2] = 0u;
+  orig_pixels[3][3] = 255u;
+  // Test white
+  orig_pixels[4][0] = 255u;
+  orig_pixels[4][1] = 255u;
+  orig_pixels[4][2] = 255u;
+  orig_pixels[4][3] = 255u;
+  // Test color
+  orig_pixels[5][0] = 16u;
+  orig_pixels[5][1] = 64u;
+  orig_pixels[5][2] = 192u;
+  orig_pixels[5][3] = 224u;
+  // Do 16 to test asm version.
+  ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1);
+  EXPECT_EQ(30u, gray_pixels[0][0]);  // Channels 0..2 equal, 3 as in source.
+  EXPECT_EQ(30u, gray_pixels[0][1]);
+  EXPECT_EQ(30u, gray_pixels[0][2]);
+  EXPECT_EQ(128u, gray_pixels[0][3]);
+  EXPECT_EQ(149u, gray_pixels[1][0]);
+  EXPECT_EQ(149u, gray_pixels[1][1]);
+  EXPECT_EQ(149u, gray_pixels[1][2]);
+  EXPECT_EQ(0u, gray_pixels[1][3]);
+  EXPECT_EQ(76u, gray_pixels[2][0]);
+  EXPECT_EQ(76u, gray_pixels[2][1]);
+  EXPECT_EQ(76u, gray_pixels[2][2]);
+  EXPECT_EQ(255u, gray_pixels[2][3]);
+  EXPECT_EQ(0u, gray_pixels[3][0]);
+  EXPECT_EQ(0u, gray_pixels[3][1]);
+  EXPECT_EQ(0u, gray_pixels[3][2]);
+  EXPECT_EQ(255u, gray_pixels[3][3]);
+  EXPECT_EQ(255u, gray_pixels[4][0]);
+  EXPECT_EQ(255u, gray_pixels[4][1]);
+  EXPECT_EQ(255u, gray_pixels[4][2]);
+  EXPECT_EQ(255u, gray_pixels[4][3]);
+  EXPECT_EQ(96u, gray_pixels[5][0]);
+  EXPECT_EQ(96u, gray_pixels[5][1]);
+  EXPECT_EQ(96u, gray_pixels[5][2]);
+  EXPECT_EQ(224u, gray_pixels[5][3]);
+  for (int i = 0; i < 1280; ++i) {  // Gradient input for the timing loop.
+    orig_pixels[i][0] = i;
+    orig_pixels[i][1] = i / 2;
+    orig_pixels[i][2] = i / 3;
+    orig_pixels[i][3] = i;
+  }
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {  // Timing only.
+    ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 1280, 1);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBSepia) {  // ARGBSepia on known pixels.
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  memset(orig_pixels, 0, sizeof(orig_pixels));
+
+  // Test blue
+  orig_pixels[0][0] = 255u;
+  orig_pixels[0][1] = 0u;
+  orig_pixels[0][2] = 0u;
+  orig_pixels[0][3] = 128u;
+  // Test green
+  orig_pixels[1][0] = 0u;
+  orig_pixels[1][1] = 255u;
+  orig_pixels[1][2] = 0u;
+  orig_pixels[1][3] = 0u;
+  // Test red
+  orig_pixels[2][0] = 0u;
+  orig_pixels[2][1] = 0u;
+  orig_pixels[2][2] = 255u;
+  orig_pixels[2][3] = 255u;
+  // Test black
+  orig_pixels[3][0] = 0u;
+  orig_pixels[3][1] = 0u;
+  orig_pixels[3][2] = 0u;
+  orig_pixels[3][3] = 255u;
+  // Test white
+  orig_pixels[4][0] = 255u;
+  orig_pixels[4][1] = 255u;
+  orig_pixels[4][2] = 255u;
+  orig_pixels[4][3] = 255u;
+  // Test color
+  orig_pixels[5][0] = 16u;
+  orig_pixels[5][1] = 64u;
+  orig_pixels[5][2] = 192u;
+  orig_pixels[5][3] = 224u;
+  // Do 16 to test asm version.
+  ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 16, 1);  // Rewrites orig_pixels.
+  EXPECT_EQ(33u, orig_pixels[0][0]);  // Channel 3 keeps its input value.
+  EXPECT_EQ(43u, orig_pixels[0][1]);
+  EXPECT_EQ(47u, orig_pixels[0][2]);
+  EXPECT_EQ(128u, orig_pixels[0][3]);
+  EXPECT_EQ(135u, orig_pixels[1][0]);
+  EXPECT_EQ(175u, orig_pixels[1][1]);
+  EXPECT_EQ(195u, orig_pixels[1][2]);
+  EXPECT_EQ(0u, orig_pixels[1][3]);
+  EXPECT_EQ(69u, orig_pixels[2][0]);
+  EXPECT_EQ(89u, orig_pixels[2][1]);
+  EXPECT_EQ(99u, orig_pixels[2][2]);
+  EXPECT_EQ(255u, orig_pixels[2][3]);
+  EXPECT_EQ(0u, orig_pixels[3][0]);
+  EXPECT_EQ(0u, orig_pixels[3][1]);
+  EXPECT_EQ(0u, orig_pixels[3][2]);
+  EXPECT_EQ(255u, orig_pixels[3][3]);
+  EXPECT_EQ(239u, orig_pixels[4][0]);
+  EXPECT_EQ(255u, orig_pixels[4][1]);
+  EXPECT_EQ(255u, orig_pixels[4][2]);
+  EXPECT_EQ(255u, orig_pixels[4][3]);
+  EXPECT_EQ(88u, orig_pixels[5][0]);
+  EXPECT_EQ(114u, orig_pixels[5][1]);
+  EXPECT_EQ(127u, orig_pixels[5][2]);
+  EXPECT_EQ(224u, orig_pixels[5][3]);
+
+  for (int i = 0; i < 1280; ++i) {  // Gradient input for the timing loop.
+    orig_pixels[i][0] = i;
+    orig_pixels[i][1] = i / 2;
+    orig_pixels[i][2] = i / 3;
+    orig_pixels[i][3] = i;
+  }
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {  // Timing only.
+    ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 1280, 1);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBColorMatrix) {
+  SIMD_ALIGNED(uint8 src[1280][4]);
+  SIMD_ALIGNED(uint8 out_opt[1280][4]);
+  SIMD_ALIGNED(uint8 out_c[1280][4]);
+
+  // Sepia coefficients, halved, with a last row that copies alpha.
+  SIMD_ALIGNED(static const int8 kSepia[]) = {
+    17 / 2, 68 / 2, 35 / 2, 0,
+    22 / 2, 88 / 2, 45 / 2, 0,
+    24 / 2, 98 / 2, 50 / 2, 0,
+    0, 0, 0, 64,  // Copy alpha.
+  };
+  memset(&src[0][0], 0, sizeof(src));
+
+  // Pixel 0: blue only, alpha 128.
+  src[0][0] = 255u;
+  src[0][1] = 0u;
+  src[0][2] = 0u;
+  src[0][3] = 128u;
+  // Pixel 1: green only, alpha 0.
+  src[1][0] = 0u;
+  src[1][1] = 255u;
+  src[1][2] = 0u;
+  src[1][3] = 0u;
+  // Pixel 2: red only, alpha 255.
+  src[2][0] = 0u;
+  src[2][1] = 0u;
+  src[2][2] = 255u;
+  src[2][3] = 255u;
+  // Pixel 3: a mixed color.
+  src[3][0] = 16u;
+  src[3][1] = 64u;
+  src[3][2] = 192u;
+  src[3][3] = 224u;
+  // A width of 16 is used so the asm version gets tested.
+  ARGBColorMatrix(&src[0][0], 0, &out_opt[0][0], 0,
+                  &kSepia[0], 16, 1);
+  EXPECT_EQ(31u, out_opt[0][0]);
+  EXPECT_EQ(43u, out_opt[0][1]);
+  EXPECT_EQ(47u, out_opt[0][2]);
+  EXPECT_EQ(128u, out_opt[0][3]);
+  EXPECT_EQ(135u, out_opt[1][0]);
+  EXPECT_EQ(175u, out_opt[1][1]);
+  EXPECT_EQ(195u, out_opt[1][2]);
+  EXPECT_EQ(0u, out_opt[1][3]);
+  EXPECT_EQ(67u, out_opt[2][0]);
+  EXPECT_EQ(87u, out_opt[2][1]);
+  EXPECT_EQ(99u, out_opt[2][2]);
+  EXPECT_EQ(255u, out_opt[2][3]);
+  EXPECT_EQ(87u, out_opt[3][0]);
+  EXPECT_EQ(112u, out_opt[3][1]);
+  EXPECT_EQ(127u, out_opt[3][2]);
+  EXPECT_EQ(224u, out_opt[3][3]);
+
+  for (int n = 0; n < 1280; ++n) {
+    src[n][0] = n;
+    src[n][1] = n / 2;
+    src[n][2] = n / 3;
+    src[n][3] = n;
+  }
+  MaskCpuFlags(disable_cpu_flags_);
+  ARGBColorMatrix(&src[0][0], 0, &out_c[0][0], 0,
+                  &kSepia[0], 1280, 1);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int rep = 0; rep < benchmark_pixels_div1280_; ++rep) {
+    ARGBColorMatrix(&src[0][0], 0, &out_opt[0][0], 0,
+                    &kSepia[0], 1280, 1);
+  }
+
+  // The two runs must agree on every byte of every pixel.
+  for (int n = 0; n < 1280; ++n) {
+    for (int ch = 0; ch < 4; ++ch) {
+      EXPECT_EQ(out_c[n][ch], out_opt[n][ch]);
+    }
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestRGBColorMatrix) {
+  SIMD_ALIGNED(uint8 argb[1280][4]);
+
+  // Sepia coefficients; RGBColorMatrix is applied to argb in place.
+  SIMD_ALIGNED(static const int8 kSepia[]) = {
+    17, 68, 35, 0,
+    22, 88, 45, 0,
+    24, 98, 50, 0,
+    0, 0, 0, 0,  // Padding row; brings the matrix to 16 bytes.
+  };
+  memset(&argb[0][0], 0, sizeof(argb));
+
+  // Pixel 0: blue only, alpha 128.
+  argb[0][0] = 255u;
+  argb[0][1] = 0u;
+  argb[0][2] = 0u;
+  argb[0][3] = 128u;
+  // Pixel 1: green only, alpha 0.
+  argb[1][0] = 0u;
+  argb[1][1] = 255u;
+  argb[1][2] = 0u;
+  argb[1][3] = 0u;
+  // Pixel 2: red only, alpha 255.
+  argb[2][0] = 0u;
+  argb[2][1] = 0u;
+  argb[2][2] = 255u;
+  argb[2][3] = 255u;
+  // Pixel 3: a mixed color.
+  argb[3][0] = 16u;
+  argb[3][1] = 64u;
+  argb[3][2] = 192u;
+  argb[3][3] = 224u;
+  // A width of 16 is used so the asm version gets tested.
+  RGBColorMatrix(&argb[0][0], 0, &kSepia[0], 0, 0, 16, 1);
+  EXPECT_EQ(31u, argb[0][0]);
+  EXPECT_EQ(43u, argb[0][1]);
+  EXPECT_EQ(47u, argb[0][2]);
+  EXPECT_EQ(128u, argb[0][3]);
+  EXPECT_EQ(135u, argb[1][0]);
+  EXPECT_EQ(175u, argb[1][1]);
+  EXPECT_EQ(195u, argb[1][2]);
+  EXPECT_EQ(0u, argb[1][3]);
+  EXPECT_EQ(67u, argb[2][0]);
+  EXPECT_EQ(87u, argb[2][1]);
+  EXPECT_EQ(99u, argb[2][2]);
+  EXPECT_EQ(255u, argb[2][3]);
+  EXPECT_EQ(87u, argb[3][0]);
+  EXPECT_EQ(112u, argb[3][1]);
+  EXPECT_EQ(127u, argb[3][2]);
+  EXPECT_EQ(224u, argb[3][3]);
+
+  for (int n = 0; n < 1280; ++n) {
+    argb[n][0] = n;
+    argb[n][1] = n / 2;
+    argb[n][2] = n / 3;
+    argb[n][3] = n;
+  }
+  for (int rep = 0; rep < benchmark_pixels_div1280_; ++rep) {
+    RGBColorMatrix(&argb[0][0], 0, &kSepia[0], 0, 0, 1280, 1);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBColorTable) {
+  SIMD_ALIGNED(uint8 argb[1280][4]);
+  memset(&argb[0][0], 0, sizeof(argb));
+
+  // Lookup table of 256 four-byte entries; only entries 0..3 are non-zero.
+  static const uint8 kTable[256 * 4] = {
+    1u, 2u, 3u, 4u,
+    5u, 6u, 7u, 8u,
+    9u, 10u, 11u, 12u,
+    13u, 14u, 15u, 16u,
+  };
+
+  argb[0][0] = 0u;
+  argb[0][1] = 0u;
+  argb[0][2] = 0u;
+  argb[0][3] = 0u;
+  argb[1][0] = 1u;
+  argb[1][1] = 1u;
+  argb[1][2] = 1u;
+  argb[1][3] = 1u;
+  argb[2][0] = 2u;
+  argb[2][1] = 2u;
+  argb[2][2] = 2u;
+  argb[2][3] = 2u;
+  argb[3][0] = 0u;
+  argb[3][1] = 1u;
+  argb[3][2] = 2u;
+  argb[3][3] = 3u;
+  // A width of 16 is used so the asm version gets tested.
+  ARGBColorTable(&argb[0][0], 0, &kTable[0], 0, 0, 16, 1);
+  EXPECT_EQ(1u, argb[0][0]);
+  EXPECT_EQ(2u, argb[0][1]);
+  EXPECT_EQ(3u, argb[0][2]);
+  EXPECT_EQ(4u, argb[0][3]);
+  EXPECT_EQ(5u, argb[1][0]);
+  EXPECT_EQ(6u, argb[1][1]);
+  EXPECT_EQ(7u, argb[1][2]);
+  EXPECT_EQ(8u, argb[1][3]);
+  EXPECT_EQ(9u, argb[2][0]);
+  EXPECT_EQ(10u, argb[2][1]);
+  EXPECT_EQ(11u, argb[2][2]);
+  EXPECT_EQ(12u, argb[2][3]);
+  EXPECT_EQ(1u, argb[3][0]);
+  EXPECT_EQ(6u, argb[3][1]);
+  EXPECT_EQ(11u, argb[3][2]);
+  EXPECT_EQ(16u, argb[3][3]);
+
+  for (int n = 0; n < 1280; ++n) {
+    argb[n][0] = n;
+    argb[n][1] = n / 2;
+    argb[n][2] = n / 3;
+    argb[n][3] = n;
+  }
+  for (int rep = 0; rep < benchmark_pixels_div1280_; ++rep) {
+    ARGBColorTable(&argb[0][0], 0, &kTable[0], 0, 0, 1280, 1);
+  }
+}
+
+// Mirrors TestARGBColorTable, but here the alpha bytes must stay untouched.
+TEST_F(LibYUVPlanarTest, TestRGBColorTable) {
+  SIMD_ALIGNED(uint8 argb[1280][4]);
+  memset(&argb[0][0], 0, sizeof(argb));
+
+  // Lookup table of 256 four-byte entries; only entries 0..3 are non-zero.
+  static const uint8 kTable[256 * 4] = {
+    1u, 2u, 3u, 4u,
+    5u, 6u, 7u, 8u,
+    9u, 10u, 11u, 12u,
+    13u, 14u, 15u, 16u,
+  };
+
+  argb[0][0] = 0u;
+  argb[0][1] = 0u;
+  argb[0][2] = 0u;
+  argb[0][3] = 0u;
+  argb[1][0] = 1u;
+  argb[1][1] = 1u;
+  argb[1][2] = 1u;
+  argb[1][3] = 1u;
+  argb[2][0] = 2u;
+  argb[2][1] = 2u;
+  argb[2][2] = 2u;
+  argb[2][3] = 2u;
+  argb[3][0] = 0u;
+  argb[3][1] = 1u;
+  argb[3][2] = 2u;
+  argb[3][3] = 3u;
+  // A width of 16 is used so the asm version gets tested.
+  RGBColorTable(&argb[0][0], 0, &kTable[0], 0, 0, 16, 1);
+  EXPECT_EQ(1u, argb[0][0]);
+  EXPECT_EQ(2u, argb[0][1]);
+  EXPECT_EQ(3u, argb[0][2]);
+  EXPECT_EQ(0u, argb[0][3]);  // Alpha keeps its input value.
+  EXPECT_EQ(5u, argb[1][0]);
+  EXPECT_EQ(6u, argb[1][1]);
+  EXPECT_EQ(7u, argb[1][2]);
+  EXPECT_EQ(1u, argb[1][3]);  // Alpha keeps its input value.
+  EXPECT_EQ(9u, argb[2][0]);
+  EXPECT_EQ(10u, argb[2][1]);
+  EXPECT_EQ(11u, argb[2][2]);
+  EXPECT_EQ(2u, argb[2][3]);  // Alpha keeps its input value.
+  EXPECT_EQ(1u, argb[3][0]);
+  EXPECT_EQ(6u, argb[3][1]);
+  EXPECT_EQ(11u, argb[3][2]);
+  EXPECT_EQ(3u, argb[3][3]);  // Alpha keeps its input value.
+
+  for (int n = 0; n < 1280; ++n) {
+    argb[n][0] = n;
+    argb[n][1] = n / 2;
+    argb[n][2] = n / 3;
+    argb[n][3] = n;
+  }
+  for (int rep = 0; rep < benchmark_pixels_div1280_; ++rep) {
+    RGBColorTable(&argb[0][0], 0, &kTable[0], 0, 0, 1280, 1);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
+  SIMD_ALIGNED(uint8 argb[1280][4]);
+  const int kStep = 8;  // Shared by the ARGBQuantize call and the checks.
+  for (int n = 0; n < 1280; ++n) {
+    argb[n][0] = n;
+    argb[n][1] = n / 2;
+    argb[n][2] = n / 3;
+    argb[n][3] = n;
+  }
+  ARGBQuantize(&argb[0][0], 0, (65536 + (kStep / 2)) / kStep, kStep,
+               kStep / 2, 0, 0, 1280, 1);
+  // Color bytes are expected at bucket start + kStep / 2; alpha is unchanged.
+  for (int n = 0; n < 1280; ++n) {
+    EXPECT_EQ((n / kStep * kStep + kStep / 2) & 255, argb[n][0]);
+    EXPECT_EQ((n / 2 / kStep * kStep + kStep / 2) & 255, argb[n][1]);
+    EXPECT_EQ((n / 3 / kStep * kStep + kStep / 2) & 255, argb[n][2]);
+    EXPECT_EQ(n & 255, argb[n][3]);
+  }
+  for (int rep = 0; rep < benchmark_pixels_div1280_; ++rep) {
+    ARGBQuantize(&argb[0][0], 0, (65536 + (kStep / 2)) / kStep, kStep,
+                 kStep / 2, 0, 0, 1280, 1);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBMirror) {
+  SIMD_ALIGNED(uint8 src[1280][4]);
+  SIMD_ALIGNED(uint8 flipped[1280][4]);
+  // Ramp input: output pixel (1279 - n) must equal input pixel n.
+  for (int n = 0; n < 1280; ++n) {
+    src[n][0] = n;
+    src[n][1] = n / 2;
+    src[n][2] = n / 3;
+    src[n][3] = n / 4;
+  }
+  ARGBMirror(&src[0][0], 0, &flipped[0][0], 0, 1280, 1);
+  for (int n = 0; n < 1280; ++n) {
+    const int m = 1280 - 1 - n;
+    EXPECT_EQ(n & 255, flipped[m][0]);
+    EXPECT_EQ((n / 2) & 255, flipped[m][1]);
+    EXPECT_EQ((n / 3) & 255, flipped[m][2]);
+    EXPECT_EQ((n / 4) & 255, flipped[m][3]);
+  }
+  for (int rep = 0; rep < benchmark_pixels_div1280_; ++rep) {
+    ARGBMirror(&src[0][0], 0, &flipped[0][0], 0, 1280, 1);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestShade) {
+  SIMD_ALIGNED(uint8 src[1280][4]);
+  SIMD_ALIGNED(uint8 shaded[1280][4]);
+  memset(&src[0][0], 0, sizeof(src));
+  // Four known pixels; ARGBShade is called with three different values.
+  src[0][0] = 10u;
+  src[0][1] = 20u;
+  src[0][2] = 40u;
+  src[0][3] = 80u;
+  src[1][0] = 0u;
+  src[1][1] = 0u;
+  src[1][2] = 0u;
+  src[1][3] = 255u;
+  src[2][0] = 0u;
+  src[2][1] = 0u;
+  src[2][2] = 0u;
+  src[2][3] = 0u;
+  src[3][0] = 0u;
+  src[3][1] = 0u;
+  src[3][2] = 0u;
+  src[3][3] = 0u;
+  // A width of 8 allows the opt version to be used.
+  ARGBShade(&src[0][0], 0, &shaded[0][0], 0, 8, 1, 0x80ffffff);
+  EXPECT_EQ(10u, shaded[0][0]);
+  EXPECT_EQ(20u, shaded[0][1]);
+  EXPECT_EQ(40u, shaded[0][2]);
+  EXPECT_EQ(40u, shaded[0][3]);
+  EXPECT_EQ(0u, shaded[1][0]);
+  EXPECT_EQ(0u, shaded[1][1]);
+  EXPECT_EQ(0u, shaded[1][2]);
+  EXPECT_EQ(128u, shaded[1][3]);
+  EXPECT_EQ(0u, shaded[2][0]);
+  EXPECT_EQ(0u, shaded[2][1]);
+  EXPECT_EQ(0u, shaded[2][2]);
+  EXPECT_EQ(0u, shaded[2][3]);
+  EXPECT_EQ(0u, shaded[3][0]);
+  EXPECT_EQ(0u, shaded[3][1]);
+  EXPECT_EQ(0u, shaded[3][2]);
+  EXPECT_EQ(0u, shaded[3][3]);
+
+  ARGBShade(&src[0][0], 0, &shaded[0][0], 0, 8, 1, 0x80808080);
+  EXPECT_EQ(5u, shaded[0][0]);
+  EXPECT_EQ(10u, shaded[0][1]);
+  EXPECT_EQ(20u, shaded[0][2]);
+  EXPECT_EQ(40u, shaded[0][3]);
+
+  ARGBShade(&src[0][0], 0, &shaded[0][0], 0, 8, 1, 0x10204080);
+  EXPECT_EQ(5u, shaded[0][0]);
+  EXPECT_EQ(5u, shaded[0][1]);
+  EXPECT_EQ(5u, shaded[0][2]);
+  EXPECT_EQ(5u, shaded[0][3]);
+
+  for (int rep = 0; rep < benchmark_pixels_div1280_; ++rep) {
+    ARGBShade(&src[0][0], 0, &shaded[0][0], 0, 1280, 1,
+              0x80808080);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBInterpolate) {
+  SIMD_ALIGNED(uint8 first[1280][4]);
+  SIMD_ALIGNED(uint8 second[1280][4]);
+  SIMD_ALIGNED(uint8 mixed[1280][4]);
+  memset(&first[0][0], 0, sizeof(first));
+  memset(&second[0][0], 0, sizeof(second));
+  // First source: one colored pixel, one alpha-only pixel, two zero pixels.
+  first[0][0] = 16u;
+  first[0][1] = 32u;
+  first[0][2] = 64u;
+  first[0][3] = 128u;
+  first[1][0] = 0u;
+  first[1][1] = 0u;
+  first[1][2] = 0u;
+  first[1][3] = 255u;
+  first[2][0] = 0u;
+  first[2][1] = 0u;
+  first[2][2] = 0u;
+  first[2][3] = 0u;
+  first[3][0] = 0u;
+  first[3][1] = 0u;
+  first[3][2] = 0u;
+  first[3][3] = 0u;
+  // Second source: three zero pixels followed by one all-255 pixel.
+  second[0][0] = 0u;
+  second[0][1] = 0u;
+  second[0][2] = 0u;
+  second[0][3] = 0u;
+  second[1][0] = 0u;
+  second[1][1] = 0u;
+  second[1][2] = 0u;
+  second[1][3] = 0u;
+  second[2][0] = 0u;
+  second[2][1] = 0u;
+  second[2][2] = 0u;
+  second[2][3] = 0u;
+  second[3][0] = 255u;
+  second[3][1] = 255u;
+  second[3][2] = 255u;
+  second[3][3] = 255u;
+  // Interpolation value 128.
+  ARGBInterpolate(&first[0][0], 0, &second[0][0], 0,
+                  &mixed[0][0], 0, 4, 1, 128);
+  EXPECT_EQ(8u, mixed[0][0]);
+  EXPECT_EQ(16u, mixed[0][1]);
+  EXPECT_EQ(32u, mixed[0][2]);
+  EXPECT_EQ(64u, mixed[0][3]);
+  EXPECT_EQ(0u, mixed[1][0]);
+  EXPECT_EQ(0u, mixed[1][1]);
+  EXPECT_EQ(0u, mixed[1][2]);
+  EXPECT_EQ(128u, mixed[1][3]);
+  EXPECT_EQ(0u, mixed[2][0]);
+  EXPECT_EQ(0u, mixed[2][1]);
+  EXPECT_EQ(0u, mixed[2][2]);
+  EXPECT_EQ(0u, mixed[2][3]);
+  EXPECT_EQ(128u, mixed[3][0]);
+  EXPECT_EQ(128u, mixed[3][1]);
+  EXPECT_EQ(128u, mixed[3][2]);
+  EXPECT_EQ(128u, mixed[3][3]);
+  // Interpolation value 0: pixel 0 must equal the first source.
+  ARGBInterpolate(&first[0][0], 0, &second[0][0], 0,
+                  &mixed[0][0], 0, 4, 1, 0);
+  EXPECT_EQ(16u, mixed[0][0]);
+  EXPECT_EQ(32u, mixed[0][1]);
+  EXPECT_EQ(64u, mixed[0][2]);
+  EXPECT_EQ(128u, mixed[0][3]);
+  // Interpolation value 192.
+  ARGBInterpolate(&first[0][0], 0, &second[0][0], 0,
+                  &mixed[0][0], 0, 4, 1, 192);
+
+  EXPECT_EQ(4u, mixed[0][0]);
+  EXPECT_EQ(8u, mixed[0][1]);
+  EXPECT_EQ(16u, mixed[0][2]);
+  EXPECT_EQ(32u, mixed[0][3]);
+
+  for (int rep = 0; rep < benchmark_pixels_div1280_; ++rep) {
+    ARGBInterpolate(&first[0][0], 0, &second[0][0], 0,
+                    &mixed[0][0], 0, 1280, 1, 128);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
+  SIMD_ALIGNED(uint8 first[1280]);
+  SIMD_ALIGNED(uint8 second[1280]);
+  SIMD_ALIGNED(uint8 mixed[1280]);
+  memset(first, 0, sizeof(first));
+  memset(second, 0, sizeof(second));
+  // First plane: sixteen known bytes.
+  first[0] = 16u;
+  first[1] = 32u;
+  first[2] = 64u;
+  first[3] = 128u;
+  first[4] = 0u;
+  first[5] = 0u;
+  first[6] = 0u;
+  first[7] = 255u;
+  first[8] = 0u;
+  first[9] = 0u;
+  first[10] = 0u;
+  first[11] = 0u;
+  first[12] = 0u;
+  first[13] = 0u;
+  first[14] = 0u;
+  first[15] = 0u;
+  // Second plane: twelve zero bytes followed by four 255 bytes.
+  second[0] = 0u;
+  second[1] = 0u;
+  second[2] = 0u;
+  second[3] = 0u;
+  second[4] = 0u;
+  second[5] = 0u;
+  second[6] = 0u;
+  second[7] = 0u;
+  second[8] = 0u;
+  second[9] = 0u;
+  second[10] = 0u;
+  second[11] = 0u;
+  second[12] = 255u;
+  second[13] = 255u;
+  second[14] = 255u;
+  second[15] = 255u;
+  // Interpolation value 128.
+  InterpolatePlane(&first[0], 0, &second[0], 0,
+                   &mixed[0], 0, 16, 1, 128);
+  EXPECT_EQ(8u, mixed[0]);
+  EXPECT_EQ(16u, mixed[1]);
+  EXPECT_EQ(32u, mixed[2]);
+  EXPECT_EQ(64u, mixed[3]);
+  EXPECT_EQ(0u, mixed[4]);
+  EXPECT_EQ(0u, mixed[5]);
+  EXPECT_EQ(0u, mixed[6]);
+  EXPECT_EQ(128u, mixed[7]);
+  EXPECT_EQ(0u, mixed[8]);
+  EXPECT_EQ(0u, mixed[9]);
+  EXPECT_EQ(0u, mixed[10]);
+  EXPECT_EQ(0u, mixed[11]);
+  EXPECT_EQ(128u, mixed[12]);
+  EXPECT_EQ(128u, mixed[13]);
+  EXPECT_EQ(128u, mixed[14]);
+  EXPECT_EQ(128u, mixed[15]);
+  // Interpolation value 0: the first four bytes must equal the first plane.
+  InterpolatePlane(&first[0], 0, &second[0], 0,
+                   &mixed[0], 0, 16, 1, 0);
+  EXPECT_EQ(16u, mixed[0]);
+  EXPECT_EQ(32u, mixed[1]);
+  EXPECT_EQ(64u, mixed[2]);
+  EXPECT_EQ(128u, mixed[3]);
+  // Interpolation value 192.
+  InterpolatePlane(&first[0], 0, &second[0], 0,
+                   &mixed[0], 0, 16, 1, 192);
+
+  EXPECT_EQ(4u, mixed[0]);
+  EXPECT_EQ(8u, mixed[1]);
+  EXPECT_EQ(16u, mixed[2]);
+  EXPECT_EQ(32u, mixed[3]);
+
+  for (int rep = 0; rep < benchmark_pixels_div1280_; ++rep) {
+    InterpolatePlane(&first[0], 0, &second[0], 0,
+                     &mixed[0], 0, 1280, 1, 123);
+  }
+}
+
+#define TESTTERP(FMT_A, BPP_A, STRIDE_A,                                       \
+                 FMT_B, BPP_B, STRIDE_B,                                       \
+                 W1280, TERP, N, NEG, OFF)                                     \
+TEST_F(LibYUVPlanarTest, ARGBInterpolate##TERP##N) {                           \
+  const int kCols = ((W1280) > 0) ? (W1280) : 1;  /* never below 1 */          \
+  const int kRows = benchmark_height_;                                         \
+  const int kPitchA = (kCols * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;    \
+  const int kPitchB = (kCols * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;    \
+  align_buffer_64(in_a, kPitchA * kRows + OFF);                                \
+  align_buffer_64(in_b, kPitchA * kRows + OFF);                                \
+  align_buffer_64(out_c, kPitchB * kRows);                                     \
+  align_buffer_64(out_opt, kPitchB * kRows);                                   \
+  for (int k = 0; k < kPitchA * kRows; ++k) {  /* random source bytes */       \
+    in_a[k + OFF] = (fastrand() & 0xff);                                       \
+    in_b[k + OFF] = (fastrand() & 0xff);                                       \
+  }                                                                            \
+  MaskCpuFlags(disable_cpu_flags_);  /* one run into out_c */                  \
+  ARGBInterpolate(in_a + OFF, kPitchA,                                         \
+                  in_b + OFF, kPitchA,                                         \
+                  out_c, kPitchB,                                              \
+                  kCols, NEG kRows, TERP);                                     \
+  MaskCpuFlags(benchmark_cpu_info_);  /* repeated runs into out_opt */         \
+  for (int rep = 0; rep < benchmark_iterations_; ++rep) {                      \
+    ARGBInterpolate(in_a + OFF, kPitchA,                                       \
+                    in_b + OFF, kPitchA,                                       \
+                    out_opt, kPitchB,                                          \
+                    kCols, NEG kRows, TERP);                                   \
+  }                                                                            \
+  for (int k = 0; k < kPitchB * kRows; ++k) {  /* byte-exact comparison */     \
+    EXPECT_EQ(out_c[k], out_opt[k]);                                           \
+  }                                                                            \
+  free_aligned_buffer_64(in_a);                                                \
+  free_aligned_buffer_64(in_b);                                                \
+  free_aligned_buffer_64(out_c);                                               \
+  free_aligned_buffer_64(out_opt);                                             \
+}
+
+#define TESTINTERPOLATE(TERP)                                                  \
+    TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ - 1, TERP, _Any, +, 0)   \
+    TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Unaligned, +, 1) \
+    TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Invert, -, 0)    \
+    TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Opt, +, 0)
+// One call per interpolation value; each defines _Any/_Unaligned/_Invert/_Opt.
+TESTINTERPOLATE(0)
+TESTINTERPOLATE(64)
+TESTINTERPOLATE(128)
+TESTINTERPOLATE(192)
+TESTINTERPOLATE(255)
+
+// Blends two random, attenuated ARGB images: once with disable_cpu_flags
+// masked in and benchmark_iterations times with benchmark_cpu_info.
+// Returns the largest absolute per-byte difference between both outputs.
+static int TestBlend(int width, int height, int benchmark_iterations,
+                     int disable_cpu_flags, int benchmark_cpu_info,
+                     int invert, int off) {
+  width = (width < 1) ? 1 : width;
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  const int kBytes = kStride * height;
+  const int kSignedHeight = invert * height;  // Negative when invert is -1.
+  align_buffer_64(in_a, kBytes + off);
+  align_buffer_64(in_b, kBytes + off);
+  align_buffer_64(out_c, kBytes);
+  align_buffer_64(out_opt, kBytes);
+  for (int k = 0; k < kBytes; ++k) {
+    in_a[k + off] = (fastrand() & 0xff);
+    in_b[k + off] = (fastrand() & 0xff);
+  }
+  // Attenuate both inputs in place before blending.
+  ARGBAttenuate(in_a + off, kStride, in_a + off, kStride, width, height);
+  ARGBAttenuate(in_b + off, kStride, in_b + off, kStride, width, height);
+  memset(out_c, 255, kBytes);
+  memset(out_opt, 255, kBytes);
+
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBBlend(in_a + off, kStride,
+            in_b + off, kStride,
+            out_c, kStride,
+            width, kSignedHeight);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int rep = 0; rep < benchmark_iterations; ++rep) {
+    ARGBBlend(in_a + off, kStride,
+              in_b + off, kStride,
+              out_opt, kStride,
+              width, kSignedHeight);
+  }
+  // Record the worst mismatch; the calling tests assert on it.
+  int worst = 0;
+  for (int k = 0; k < kBytes; ++k) {
+    const int delta = abs(static_cast<int>(out_c[k]) -
+                          static_cast<int>(out_opt[k]));
+    worst = (delta > worst) ? delta : worst;
+  }
+  free_aligned_buffer_64(in_a);
+  free_aligned_buffer_64(in_b);
+  free_aligned_buffer_64(out_c);
+  free_aligned_buffer_64(out_opt);
+  return worst;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Any) {
+  // Width is benchmark_width_ - 4; outputs may differ by at most 1.
+  const int worst = TestBlend(benchmark_width_ - 4, benchmark_height_,
+      benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(worst, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) {
+  // Source pointers are offset by 1 byte; outputs may differ by at most 1.
+  const int worst = TestBlend(benchmark_width_, benchmark_height_,
+      benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_LE(worst, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) {
+  // invert = -1, so ARGBBlend gets a negative height; tolerance is 1.
+  const int worst = TestBlend(benchmark_width_, benchmark_height_,
+      benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_LE(worst, 1);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) {
+  // Full width, no offset, positive height; outputs may differ by at most 1.
+  const int worst = TestBlend(benchmark_width_, benchmark_height_,
+      benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(worst, 1);
+}
+
+// Checks BlendPlane three ways: alpha 255 must reproduce src_argb_a, alpha 0
+// must reproduce src_argb_b, and on random planes the run made under
+// disable_cpu_flags must equal the runs made under benchmark_cpu_info.
+static void TestBlendPlane(int width, int height, int benchmark_iterations,
+                           int disable_cpu_flags, int benchmark_cpu_info,
+                           int invert, int off) {
+  width = (width < 1) ? 1 : width;  // Keep at least one pixel per row.
+  const int kBpp = 1;
+  const int kStride = width * kBpp;
+  align_buffer_64(src_argb_a, kStride * height + off);
+  align_buffer_64(src_argb_b, kStride * height + off);
+  align_buffer_64(src_argb_alpha, kStride * height + off);
+  align_buffer_64(dst_argb_c, kStride * height + off);
+  align_buffer_64(dst_argb_opt, kStride * height + off);
+  memset(dst_argb_c, 255, kStride * height + off);
+  memset(dst_argb_opt, 255, kStride * height + off);
+
+  // Test source is maintained exactly if alpha is 255.
+  for (int i = 0; i < width; ++i) {
+    src_argb_a[i + off] = i & 255;
+    src_argb_b[i + off] = 255 - (i & 255);
+  }
+  memset(src_argb_alpha + off, 255, width);
+  BlendPlane(src_argb_a + off, width,
+             src_argb_b + off, width,
+             src_argb_alpha + off, width,
+             dst_argb_opt + off, width,
+             width, 1);
+  for (int i = 0; i < width; ++i) {
+    EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i + off]);
+  }
+  // Test destination is maintained exactly if alpha is 0.
+  memset(src_argb_alpha + off, 0, width);
+  BlendPlane(src_argb_a + off, width,
+             src_argb_b + off, width,
+             src_argb_alpha + off, width,
+             dst_argb_opt + off, width,
+             width, 1);
+  for (int i = 0; i < width; ++i) {
+    EXPECT_EQ(src_argb_b[i + off], dst_argb_opt[i + off]);
+  }
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb_a[i + off] = (fastrand() & 0xff);
+    src_argb_b[i + off] = (fastrand() & 0xff);
+    src_argb_alpha[i + off] = (fastrand() & 0xff);
+  }
+  // Pass invert * height, as TestBlend does, so invert == -1 flips the sign.
+  MaskCpuFlags(disable_cpu_flags);
+  BlendPlane(src_argb_a + off, width,
+             src_argb_b + off, width,
+             src_argb_alpha + off, width,
+             dst_argb_c + off, width,
+             width, invert * height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    BlendPlane(src_argb_a + off, width,
+               src_argb_b + off, width,
+               src_argb_alpha + off, width,
+               dst_argb_opt + off, width,
+               width, invert * height);
+  }
+  for (int i = 0; i < kStride * height; ++i) {
+    EXPECT_EQ(dst_argb_c[i + off], dst_argb_opt[i + off]);
+  }
+  free_aligned_buffer_64(src_argb_a);
+  free_aligned_buffer_64(src_argb_b);
+  free_aligned_buffer_64(src_argb_alpha);
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+}
+
+TEST_F(LibYUVPlanarTest, BlendPlane_Opt) {  // invert = +1, off = 0.
+  TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+}
+TEST_F(LibYUVPlanarTest, BlendPlane_Unaligned) {  // invert = +1, off = 1.
+  TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+}
+TEST_F(LibYUVPlanarTest, BlendPlane_Any) {  // Width - 4, invert = +1, off = 1.
+  TestBlendPlane(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+}
+TEST_F(LibYUVPlanarTest, BlendPlane_Invert) {  // invert = -1, off = 1.
+  TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, -1, 1);
+}
+
+#define SUBSAMPLE(v, a) (((v) + (a) - 1) / (a))  // v / a, rounded up (v, a > 0).
+
+// Compares I420Blend run once under disable_cpu_flags with runs made under
+// benchmark_cpu_info on random planes; invert (+1/-1) scales the height.
+static void TestI420Blend(int width, int height, int benchmark_iterations,
+                          int disable_cpu_flags, int benchmark_cpu_info,
+                          int invert, int off) {
+  width = ((width) > 0) ? (width) : 1;
+  const int kStrideUV = SUBSAMPLE(width, 2);
+  const int kSizeUV = kStrideUV * SUBSAMPLE(height, 2);
+  align_buffer_64(src_y0, width * height + off);
+  align_buffer_64(src_u0, kSizeUV + off);
+  align_buffer_64(src_v0, kSizeUV + off);
+  align_buffer_64(src_y1, width * height + off);
+  align_buffer_64(src_u1, kSizeUV + off);
+  align_buffer_64(src_v1, kSizeUV + off);
+  align_buffer_64(src_a, width * height + off);
+  align_buffer_64(dst_y_c, width * height + off);
+  align_buffer_64(dst_u_c, kSizeUV + off);
+  align_buffer_64(dst_v_c, kSizeUV + off);
+  align_buffer_64(dst_y_opt, width * height + off);
+  align_buffer_64(dst_u_opt, kSizeUV + off);
+  align_buffer_64(dst_v_opt, kSizeUV + off);
+  MemRandomize(src_y0, width * height + off);
+  MemRandomize(src_u0, kSizeUV + off);
+  MemRandomize(src_v0, kSizeUV + off);
+  MemRandomize(src_y1, width * height + off);
+  MemRandomize(src_u1, kSizeUV + off);
+  MemRandomize(src_v1, kSizeUV + off);
+  MemRandomize(src_a, width * height + off);
+  memset(dst_y_c, 255, width * height + off);
+  memset(dst_u_c, 255, kSizeUV + off);
+  memset(dst_v_c, 255, kSizeUV + off);
+  memset(dst_y_opt, 255, width * height + off);
+  memset(dst_u_opt, 255, kSizeUV + off);
+  memset(dst_v_opt, 255, kSizeUV + off);
+  // Pass invert * height, as TestBlend does, so invert == -1 flips the sign.
+  MaskCpuFlags(disable_cpu_flags);
+  I420Blend(src_y0 + off, width,
+            src_u0 + off, kStrideUV,
+            src_v0 + off, kStrideUV,
+            src_y1 + off, width,
+            src_u1 + off, kStrideUV,
+            src_v1 + off, kStrideUV,
+            src_a + off, width,
+            dst_y_c + off, width,
+            dst_u_c + off, kStrideUV,
+            dst_v_c + off, kStrideUV,
+            width, invert * height);
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    I420Blend(src_y0 + off, width,
+              src_u0 + off, kStrideUV,
+              src_v0 + off, kStrideUV,
+              src_y1 + off, width,
+              src_u1 + off, kStrideUV,
+              src_v1 + off, kStrideUV,
+              src_a + off, width,
+              dst_y_opt + off, width,
+              dst_u_opt + off, kStrideUV,
+              dst_v_opt + off, kStrideUV,
+              width, invert * height);
+  }
+  for (int i = 0; i < width * height; ++i) {
+    EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]);
+  }
+  for (int i = 0; i < kSizeUV; ++i) {
+    EXPECT_EQ(dst_u_c[i + off], dst_u_opt[i + off]);
+    EXPECT_EQ(dst_v_c[i + off], dst_v_opt[i + off]);
+  }
+  free_aligned_buffer_64(src_y0);
+  free_aligned_buffer_64(src_u0);
+  free_aligned_buffer_64(src_v0);
+  free_aligned_buffer_64(src_y1);
+  free_aligned_buffer_64(src_u1);
+  free_aligned_buffer_64(src_v1);
+  free_aligned_buffer_64(src_a);
+  free_aligned_buffer_64(dst_y_c);
+  free_aligned_buffer_64(dst_u_c);
+  free_aligned_buffer_64(dst_v_c);
+  free_aligned_buffer_64(dst_y_opt);
+  free_aligned_buffer_64(dst_u_opt);
+  free_aligned_buffer_64(dst_v_opt);
+}
+
+TEST_F(LibYUVPlanarTest, I420Blend_Opt) {  // invert = +1, off = 0.
+  TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+}
+TEST_F(LibYUVPlanarTest, I420Blend_Unaligned) {  // invert = +1, off = 1.
+  TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+}
+
+// TODO(fbarchard): DISABLED because _Any uses C.  Avoid C and re-enable.
+TEST_F(LibYUVPlanarTest, DISABLED_I420Blend_Any) {  // Width - 4, off = 0.
+  TestI420Blend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+}
+TEST_F(LibYUVPlanarTest, I420Blend_Invert) {  // invert = -1, off = 0.
+  TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+}
+
+TEST_F(LibYUVPlanarTest, TestAffine) {
+  SIMD_ALIGNED(uint8 src[1280][4]);
+  SIMD_ALIGNED(uint8 out_c[1280][4]);
+  // Every byte of source pixel n is set to n (truncated to 8 bits).
+  for (int n = 0; n < 1280; ++n) {
+    for (int ch = 0; ch < 4; ++ch) {
+      src[n][ch] = n;
+    }
+  }
+
+  float step[4] = { 0.f, 0.f, 0.75f, 0.f };
+  // Spot checks: output 128 must hold value 96, output 255 value 191.
+  ARGBAffineRow_C(&src[0][0], 0, &out_c[0][0],
+                  step, 1280);
+  EXPECT_EQ(0u, out_c[0][0]);
+  EXPECT_EQ(96u, out_c[128][0]);
+  EXPECT_EQ(191u, out_c[255][3]);
+
+#if defined(HAS_ARGBAFFINEROW_SSE2)
+  SIMD_ALIGNED(uint8 out_opt[1280][4]);
+  ARGBAffineRow_SSE2(&src[0][0], 0, &out_opt[0][0],
+                     step, 1280);
+  EXPECT_EQ(0, memcmp(out_opt, out_c, 1280 * 4));
+
+  // The timing loop only runs when the CPU reports SSE2.
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    for (int rep = 0; rep < benchmark_pixels_div1280_; ++rep) {
+      ARGBAffineRow_SSE2(&src[0][0], 0, &out_opt[0][0],
+                         step, 1280);
+    }
+  }
+#endif
+}
+
+TEST_F(LibYUVPlanarTest, TestSobelX) {
+  SIMD_ALIGNED(uint8 orig_pixels_0[1280 + 2]);
+  SIMD_ALIGNED(uint8 orig_pixels_1[1280 + 2]);
+  SIMD_ALIGNED(uint8 orig_pixels_2[1280 + 2]);
+  SIMD_ALIGNED(uint8 sobel_pixels_c[1280]);
+  SIMD_ALIGNED(uint8 sobel_pixels_opt[1280]);
+  // Three input rows holding i, 2 * i and 3 * i, each stored into a uint8.
+  for (int i = 0; i < 1280 + 2; ++i) {
+    orig_pixels_0[i] = i;
+    orig_pixels_1[i] = i * 2;
+    orig_pixels_2[i] = i * 3;
+  }
+  // Reference output from the C row function, spot-checked below.
+  SobelXRow_C(orig_pixels_0, orig_pixels_1, orig_pixels_2,
+              sobel_pixels_c, 1280);
+
+  EXPECT_EQ(16u, sobel_pixels_c[0]);
+  EXPECT_EQ(16u, sobel_pixels_c[100]);
+  EXPECT_EQ(255u, sobel_pixels_c[255]);
+  // Start from the C row; switch to SSE2/NEON when built in and reported.
+  void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobely, int width) =
+      SobelXRow_C;
+#if defined(HAS_SOBELXROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelXRow = SobelXRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelXRow = SobelXRow_NEON;
+  }
+#endif
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {  // Repeated run.
+    SobelXRow(orig_pixels_0, orig_pixels_1, orig_pixels_2,
+              sobel_pixels_opt, 1280);
+  }
+  for (int i = 0; i < 1280; ++i) {  // Selected row must equal the C row.
+    EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestSobelY) {
+  SIMD_ALIGNED(uint8 orig_pixels_0[1280 + 2]);
+  SIMD_ALIGNED(uint8 orig_pixels_1[1280 + 2]);
+  SIMD_ALIGNED(uint8 sobel_pixels_c[1280]);
+  SIMD_ALIGNED(uint8 sobel_pixels_opt[1280]);
+  // Two input rows holding i and 2 * i, each stored into a uint8.
+  for (int i = 0; i < 1280 + 2; ++i) {
+    orig_pixels_0[i] = i;
+    orig_pixels_1[i] = i * 2;
+  }
+  // Reference output from the C row function, spot-checked below.
+  SobelYRow_C(orig_pixels_0, orig_pixels_1, sobel_pixels_c, 1280);
+
+  EXPECT_EQ(4u, sobel_pixels_c[0]);
+  EXPECT_EQ(255u, sobel_pixels_c[100]);
+  EXPECT_EQ(0u, sobel_pixels_c[255]);
+  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) = SobelYRow_C;
+#if defined(HAS_SOBELYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelYRow = SobelYRow_SSE2;  // Built with SSE2 and the flag is reported.
+  }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelYRow = SobelYRow_NEON;  // Built with NEON and the flag is reported.
+  }
+#endif
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {  // Repeated run.
+    SobelYRow(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 1280);
+  }
+  for (int i = 0; i < 1280; ++i) {  // Selected row must equal the C row.
+    EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestSobel) {
+  SIMD_ALIGNED(uint8 orig_sobelx[1280]);
+  SIMD_ALIGNED(uint8 orig_sobely[1280]);
+  SIMD_ALIGNED(uint8 sobel_pixels_c[1280 * 4]);
+  SIMD_ALIGNED(uint8 sobel_pixels_opt[1280 * 4]);
+  // Inputs: sobelx[i] = i and sobely[i] = 2 * i, each stored into a uint8.
+  for (int i = 0; i < 1280; ++i) {
+    orig_sobelx[i] = i;
+    orig_sobely[i] = i * 2;
+  }
+  // Reference output from the C row: 4 output bytes per input element.
+  SobelRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 1280);
+
+  EXPECT_EQ(0u, sobel_pixels_c[0]);
+  EXPECT_EQ(3u, sobel_pixels_c[4]);
+  EXPECT_EQ(3u, sobel_pixels_c[5]);
+  EXPECT_EQ(3u, sobel_pixels_c[6]);
+  EXPECT_EQ(255u, sobel_pixels_c[7]);
+  EXPECT_EQ(6u, sobel_pixels_c[8]);
+  EXPECT_EQ(6u, sobel_pixels_c[9]);
+  EXPECT_EQ(6u, sobel_pixels_c[10]);
+  EXPECT_EQ(255u, sobel_pixels_c[11]);  // Fourth byte of element 2.
+  EXPECT_EQ(255u, sobel_pixels_c[100 * 4 + 1]);
+  EXPECT_EQ(255u, sobel_pixels_c[255 * 4 + 1]);
+  void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) = SobelRow_C;
+#if defined(HAS_SOBELROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelRow = SobelRow_SSE2;  // Built with SSE2 and the flag is reported.
+  }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelRow = SobelRow_NEON;  // Built with NEON and the flag is reported.
+  }
+#endif
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {  // Repeated run.
+    SobelRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 1280);
+  }
+  for (int i = 0; i < 1280 * 4; ++i) {  // Selected row must equal the C row.
+    EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestSobelToPlane) {
+  SIMD_ALIGNED(uint8 orig_sobelx[1280]);
+  SIMD_ALIGNED(uint8 orig_sobely[1280]);
+  SIMD_ALIGNED(uint8 sobel_pixels_c[1280]);
+  SIMD_ALIGNED(uint8 sobel_pixels_opt[1280]);
+  // Inputs: sobelx[i] = i and sobely[i] = 2 * i, each stored into a uint8.
+  for (int i = 0; i < 1280; ++i) {
+    orig_sobelx[i] = i;
+    orig_sobely[i] = i * 2;
+  }
+  // Reference output from the C row: one output byte per input element.
+  SobelToPlaneRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 1280);
+
+  EXPECT_EQ(0u, sobel_pixels_c[0]);
+  EXPECT_EQ(3u, sobel_pixels_c[1]);
+  EXPECT_EQ(6u, sobel_pixels_c[2]);
+  EXPECT_EQ(99u, sobel_pixels_c[33]);
+  EXPECT_EQ(255u, sobel_pixels_c[100]);
+  void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) = SobelToPlaneRow_C;
+#if defined(HAS_SOBELTOPLANEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelToPlaneRow = SobelToPlaneRow_SSE2;  // Built in and flag reported.
+  }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelToPlaneRow = SobelToPlaneRow_NEON;  // Built in and flag reported.
+  }
+#endif
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {  // Repeated run.
+    SobelToPlaneRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 1280);
+  }
+  for (int i = 0; i < 1280; ++i) {  // Selected row must equal the C row.
+    EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestSobelXY) {
+  SIMD_ALIGNED(uint8 orig_sobelx[1280]);
+  SIMD_ALIGNED(uint8 orig_sobely[1280]);
+  SIMD_ALIGNED(uint8 sobel_pixels_c[1280 * 4]);
+  SIMD_ALIGNED(uint8 sobel_pixels_opt[1280 * 4]);
+  // Inputs: sobelx[i] = i and sobely[i] = 2 * i, each stored into a uint8.
+  for (int i = 0; i < 1280; ++i) {
+    orig_sobelx[i] = i;
+    orig_sobely[i] = i * 2;
+  }
+  // Reference output from the C row: 4 output bytes per input element.
+  SobelXYRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 1280);
+
+  EXPECT_EQ(0u, sobel_pixels_c[0]);
+  EXPECT_EQ(2u, sobel_pixels_c[4]);
+  EXPECT_EQ(3u, sobel_pixels_c[5]);
+  EXPECT_EQ(1u, sobel_pixels_c[6]);
+  EXPECT_EQ(255u, sobel_pixels_c[7]);
+  EXPECT_EQ(255u, sobel_pixels_c[100 * 4 + 1]);
+  EXPECT_EQ(255u, sobel_pixels_c[255 * 4 + 1]);
+  void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width) = SobelXYRow_C;
+#if defined(HAS_SOBELXYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelXYRow = SobelXYRow_SSE2;  // Built with SSE2 and flag reported.
+  }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelXYRow = SobelXYRow_NEON;  // Built with NEON and flag reported.
+  }
+#endif
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {  // Repeated run.
+    SobelXYRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 1280);
+  }
+  for (int i = 0; i < 1280 * 4; ++i) {  // Selected row must equal the C row.
+    EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestCopyPlane) {
+  int err = 0;
+  int yw = benchmark_width_;
+  int yh = benchmark_height_;
+  int b = 12;
+  int i, j;
+  // Buffers hold the yw x yh plane plus a border of b bytes on every side.
+  int y_plane_size = (yw + b * 2) * (yh + b * 2);
+  align_buffer_64(orig_y, y_plane_size);
+  align_buffer_64(dst_c, y_plane_size);
+  align_buffer_64(dst_opt, y_plane_size);
+
+  memset(orig_y, 0, y_plane_size);
+  memset(dst_c, 0, y_plane_size);
+  memset(dst_opt, 0, y_plane_size);
+
+  // Fill image buffers with random data.
+  for (i = b; i < (yh + b); ++i) {
+    for (j = b; j < (yw + b); ++j) {
+      orig_y[i * (yw + b * 2) + j] = fastrand() & 0xff;
+    }
+  }
+
+  // Fill destination buffers with random data.
+  for (i = 0; i < y_plane_size; ++i) {
+    uint8 random_number = fastrand() & 0x7f;
+    dst_c[i] = random_number;
+    dst_opt[i] = dst_c[i];
+  }
+  // Byte offset of the first interior (non-border) element.
+  int y_off = b * (yw + b * 2) + b;
+
+  int y_st = yw + b * 2;
+  int stride = 8;  // NOTE(review): dst stride is fixed, not tied to yw; confirm.
+
+  // Disable all optimizations.
+  MaskCpuFlags(disable_cpu_flags_);
+  double c_time = get_time();
+  for (j = 0; j < benchmark_iterations_; j++) {
+    CopyPlane(orig_y + y_off, y_st, dst_c + y_off, stride, yw, yh);
+  }
+  c_time = (get_time() - c_time) / benchmark_iterations_;  // Not read below.
+
+  // Enable optimizations.
+  MaskCpuFlags(benchmark_cpu_info_);
+  double opt_time = get_time();
+  for (j = 0; j < benchmark_iterations_; j++) {
+    CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh);
+  }
+  opt_time = (get_time() - opt_time) / benchmark_iterations_;  // Not read below.
+  // Count every byte where the two destination buffers disagree.
+  for (i = 0; i < y_plane_size; ++i) {
+    if (dst_c[i] != dst_opt[i])
+      ++err;
+  }
+
+  free_aligned_buffer_64(orig_y);
+  free_aligned_buffer_64(dst_c);
+  free_aligned_buffer_64(dst_opt);
+
+  EXPECT_EQ(0, err);
+}
+
+static int TestMultiply(int width, int height, int benchmark_iterations,
+                        int disable_cpu_flags, int benchmark_cpu_info,
+                        int invert, int off) {
+  // Returns the largest per-byte difference between one ARGBMultiply run
+  // under disable_cpu_flags and the runs under benchmark_cpu_info.
+  width = (width < 1) ? 1 : width;
+  const int kRowBytes = width * 4;  // 4 bytes per pixel.
+  const int kPlaneBytes = kRowBytes * height;
+  const int kSignedHeight = invert * height;
+  align_buffer_64(src_argb_a, kPlaneBytes + off);
+  align_buffer_64(src_argb_b, kPlaneBytes + off);
+  align_buffer_64(dst_argb_c, kPlaneBytes);
+  align_buffer_64(dst_argb_opt, kPlaneBytes);
+  memset(dst_argb_c, 0, kPlaneBytes);
+  memset(dst_argb_opt, 0, kPlaneBytes);
+  // Random source bytes, written starting `off` bytes into each buffer.
+  uint8* const in_a = src_argb_a + off;
+  uint8* const in_b = src_argb_b + off;
+  for (int n = 0; n < kPlaneBytes; ++n) {
+    in_a[n] = (fastrand() & 0xff);
+    in_b[n] = (fastrand() & 0xff);
+  }
+
+  // Single pass with the disable_cpu_flags mask.
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBMultiply(in_a, kRowBytes, in_b, kRowBytes, dst_argb_c, kRowBytes,
+               width, kSignedHeight);
+  // Repeated passes with the benchmark_cpu_info mask.
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int pass = 0; pass < benchmark_iterations; ++pass) {
+    ARGBMultiply(in_a, kRowBytes, in_b, kRowBytes, dst_argb_opt, kRowBytes,
+                 width, kSignedHeight);
+  }
+
+  int max_diff = 0;
+  for (int n = 0; n < kPlaneBytes; ++n) {
+    const int delta = static_cast<int>(dst_argb_c[n]) -
+                      static_cast<int>(dst_argb_opt[n]);
+    max_diff = (abs(delta) > max_diff) ? abs(delta) : max_diff;
+  }
+  free_aligned_buffer_64(src_argb_a);
+  free_aligned_buffer_64(src_argb_b);
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBMultiply_Any) {  // Width reduced by 1.
+  int max_diff = TestMultiply(benchmark_width_ - 1, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBMultiply_Unaligned) {  // off = 1.
+  int max_diff = TestMultiply(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBMultiply_Invert) {  // invert = -1.
+  int max_diff = TestMultiply(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBMultiply_Opt) {  // Full width, off = 0.
+  int max_diff = TestMultiply(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+static int TestAdd(int width, int height, int benchmark_iterations,
+                   int disable_cpu_flags, int benchmark_cpu_info,
+                   int invert, int off) {
+  // Returns the largest per-byte difference between one ARGBAdd run
+  // under disable_cpu_flags and the runs under benchmark_cpu_info.
+  width = (width < 1) ? 1 : width;
+  const int kRowBytes = width * 4;  // 4 bytes per pixel.
+  const int kPlaneBytes = kRowBytes * height;
+  const int kSignedHeight = invert * height;
+  align_buffer_64(src_argb_a, kPlaneBytes + off);
+  align_buffer_64(src_argb_b, kPlaneBytes + off);
+  align_buffer_64(dst_argb_c, kPlaneBytes);
+  align_buffer_64(dst_argb_opt, kPlaneBytes);
+  memset(dst_argb_c, 0, kPlaneBytes);
+  memset(dst_argb_opt, 0, kPlaneBytes);
+  // Random source bytes, written starting `off` bytes into each buffer.
+  uint8* const in_a = src_argb_a + off;
+  uint8* const in_b = src_argb_b + off;
+  for (int n = 0; n < kPlaneBytes; ++n) {
+    in_a[n] = (fastrand() & 0xff);
+    in_b[n] = (fastrand() & 0xff);
+  }
+
+  // Single pass with the disable_cpu_flags mask.
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBAdd(in_a, kRowBytes, in_b, kRowBytes, dst_argb_c, kRowBytes,
+          width, kSignedHeight);
+  // Repeated passes with the benchmark_cpu_info mask.
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int pass = 0; pass < benchmark_iterations; ++pass) {
+    ARGBAdd(in_a, kRowBytes, in_b, kRowBytes, dst_argb_opt, kRowBytes,
+            width, kSignedHeight);
+  }
+
+  int max_diff = 0;
+  for (int n = 0; n < kPlaneBytes; ++n) {
+    const int delta = static_cast<int>(dst_argb_c[n]) -
+                      static_cast<int>(dst_argb_opt[n]);
+    max_diff = (abs(delta) > max_diff) ? abs(delta) : max_diff;
+  }
+  free_aligned_buffer_64(src_argb_a);
+  free_aligned_buffer_64(src_argb_b);
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAdd_Any) {  // Width reduced by 1.
+  int max_diff = TestAdd(benchmark_width_ - 1, benchmark_height_,
+                         benchmark_iterations_,
+                         disable_cpu_flags_,  benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAdd_Unaligned) {  // off = 1.
+  int max_diff = TestAdd(benchmark_width_, benchmark_height_,
+                         benchmark_iterations_,
+                         disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAdd_Invert) {  // invert = -1.
+  int max_diff = TestAdd(benchmark_width_, benchmark_height_,
+                         benchmark_iterations_,
+                         disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBAdd_Opt) {  // Full width, off = 0.
+  int max_diff = TestAdd(benchmark_width_, benchmark_height_,
+                         benchmark_iterations_,
+                         disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+static int TestSubtract(int width, int height, int benchmark_iterations,
+                        int disable_cpu_flags, int benchmark_cpu_info,
+                        int invert, int off) {
+  // Returns the largest per-byte difference between one ARGBSubtract run
+  // under disable_cpu_flags and the runs under benchmark_cpu_info.
+  width = (width < 1) ? 1 : width;
+  const int kRowBytes = width * 4;  // 4 bytes per pixel.
+  const int kPlaneBytes = kRowBytes * height;
+  const int kSignedHeight = invert * height;
+  align_buffer_64(src_argb_a, kPlaneBytes + off);
+  align_buffer_64(src_argb_b, kPlaneBytes + off);
+  align_buffer_64(dst_argb_c, kPlaneBytes);
+  align_buffer_64(dst_argb_opt, kPlaneBytes);
+  memset(dst_argb_c, 0, kPlaneBytes);
+  memset(dst_argb_opt, 0, kPlaneBytes);
+  // Random source bytes, written starting `off` bytes into each buffer.
+  uint8* const in_a = src_argb_a + off;
+  uint8* const in_b = src_argb_b + off;
+  for (int n = 0; n < kPlaneBytes; ++n) {
+    in_a[n] = (fastrand() & 0xff);
+    in_b[n] = (fastrand() & 0xff);
+  }
+
+  // Single pass with the disable_cpu_flags mask.
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBSubtract(in_a, kRowBytes, in_b, kRowBytes, dst_argb_c, kRowBytes,
+               width, kSignedHeight);
+  // Repeated passes with the benchmark_cpu_info mask.
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int pass = 0; pass < benchmark_iterations; ++pass) {
+    ARGBSubtract(in_a, kRowBytes, in_b, kRowBytes, dst_argb_opt, kRowBytes,
+                 width, kSignedHeight);
+  }
+
+  int max_diff = 0;
+  for (int n = 0; n < kPlaneBytes; ++n) {
+    const int delta = static_cast<int>(dst_argb_c[n]) -
+                      static_cast<int>(dst_argb_opt[n]);
+    max_diff = (abs(delta) > max_diff) ? abs(delta) : max_diff;
+  }
+  free_aligned_buffer_64(src_argb_a);
+  free_aligned_buffer_64(src_argb_b);
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSubtract_Any) {  // Width reduced by 1.
+  int max_diff = TestSubtract(benchmark_width_ - 1, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSubtract_Unaligned) {  // off = 1.
+  int max_diff = TestSubtract(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSubtract_Invert) {  // invert = -1.
+  int max_diff = TestSubtract(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSubtract_Opt) {  // Full width, off = 0.
+  int max_diff = TestSubtract(benchmark_width_, benchmark_height_,
+                              benchmark_iterations_,
+                              disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+static int TestSobel(int width, int height, int benchmark_iterations,
+                     int disable_cpu_flags, int benchmark_cpu_info,
+                     int invert, int off) {
+  // Returns the largest per-byte difference between one ARGBSobel run
+  // under disable_cpu_flags and the runs under benchmark_cpu_info.
+  width = (width < 1) ? 1 : width;
+  const int kRowBytes = width * 4;  // 4 bytes per pixel.
+  const int kPlaneBytes = kRowBytes * height;
+  const int kSignedHeight = invert * height;
+  align_buffer_64(src_argb_a, kPlaneBytes + off);
+  align_buffer_64(dst_argb_c, kPlaneBytes);
+  align_buffer_64(dst_argb_opt, kPlaneBytes);
+  // Zero the whole source allocation, then randomize from `off` onward.
+  memset(src_argb_a, 0, kPlaneBytes + off);
+  uint8* const in_a = src_argb_a + off;
+  for (int n = 0; n < kPlaneBytes; ++n) {
+    in_a[n] = (fastrand() & 0xff);
+  }
+  memset(dst_argb_c, 0, kPlaneBytes);
+  memset(dst_argb_opt, 0, kPlaneBytes);
+
+  // Single pass with the disable_cpu_flags mask.
+  MaskCpuFlags(disable_cpu_flags);
+  ARGBSobel(in_a, kRowBytes, dst_argb_c, kRowBytes, width, kSignedHeight);
+  // Repeated passes with the benchmark_cpu_info mask.
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int pass = 0; pass < benchmark_iterations; ++pass) {
+    ARGBSobel(in_a, kRowBytes, dst_argb_opt, kRowBytes, width, kSignedHeight);
+  }
+
+  // Largest absolute per-byte difference between the two outputs.
+  int max_diff = 0;
+  for (int n = 0; n < kPlaneBytes; ++n) {
+    const int delta = static_cast<int>(dst_argb_c[n]) -
+                      static_cast<int>(dst_argb_opt[n]);
+    max_diff = (abs(delta) > max_diff) ? abs(delta) : max_diff;
+  }
+  free_aligned_buffer_64(src_argb_a);
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobel_Any) {  // Width reduced by 1.
+  int max_diff = TestSobel(benchmark_width_ - 1, benchmark_height_,
+                           benchmark_iterations_,
+                           disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_EQ(0, max_diff);  // Outputs must be identical.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobel_Unaligned) {  // off = 1.
+  int max_diff = TestSobel(benchmark_width_, benchmark_height_,
+                           benchmark_iterations_,
+                           disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_EQ(0, max_diff);  // Outputs must be identical.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobel_Invert) {  // invert = -1.
+  int max_diff = TestSobel(benchmark_width_, benchmark_height_,
+                           benchmark_iterations_,
+                           disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_EQ(0, max_diff);  // Outputs must be identical.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobel_Opt) {  // Full width, off = 0.
+  int max_diff = TestSobel(benchmark_width_, benchmark_height_,
+                           benchmark_iterations_,
+                           disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_EQ(0, max_diff);  // Outputs must be identical.
+}
+
+static int TestSobelToPlane(int width, int height, int benchmark_iterations,
+                            int disable_cpu_flags, int benchmark_cpu_info,
+                            int invert, int off) {
+  // Returns the largest per-byte difference between one ARGBSobelToPlane
+  // run under disable_cpu_flags and the runs under benchmark_cpu_info.
+  width = (width < 1) ? 1 : width;
+  // Strides are rounded up to a multiple of 16: 4 bytes/pixel in, 1 out.
+  const int kSrcStride = (width * 4 + 15) & ~15;
+  const int kDstStride = (width * 1 + 15) & ~15;
+  const int kSrcBytes = kSrcStride * height;
+  const int kDstBytes = kDstStride * height;
+  const int kSignedHeight = invert * height;
+  align_buffer_64(src_argb_a, kSrcBytes + off);
+  align_buffer_64(dst_argb_c, kDstBytes);
+  align_buffer_64(dst_argb_opt, kDstBytes);
+  // Zero the whole source allocation, then randomize from `off` onward.
+  memset(src_argb_a, 0, kSrcBytes + off);
+  uint8* const in_a = src_argb_a + off;
+  for (int n = 0; n < kSrcBytes; ++n) {
+    in_a[n] = (fastrand() & 0xff);
+  }
+  memset(dst_argb_c, 0, kDstBytes);
+  memset(dst_argb_opt, 0, kDstBytes);
+
+  MaskCpuFlags(disable_cpu_flags);  // Single pass under this mask.
+  ARGBSobelToPlane(in_a, kSrcStride, dst_argb_c, kDstStride,
+                   width, kSignedHeight);
+  MaskCpuFlags(benchmark_cpu_info);  // Repeated passes under this mask.
+  for (int pass = 0; pass < benchmark_iterations; ++pass) {
+    ARGBSobelToPlane(in_a, kSrcStride, dst_argb_opt, kDstStride,
+                     width, kSignedHeight);
+  }
+
+  int max_diff = 0;
+  for (int n = 0; n < kDstBytes; ++n) {
+    const int delta = static_cast<int>(dst_argb_c[n]) -
+                      static_cast<int>(dst_argb_opt[n]);
+    max_diff = (abs(delta) > max_diff) ? abs(delta) : max_diff;
+  }
+  free_aligned_buffer_64(src_argb_a);
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Any) {  // Width reduced by 1.
+  int max_diff = TestSobelToPlane(benchmark_width_ - 1, benchmark_height_,
+                                  benchmark_iterations_,
+                                  disable_cpu_flags_, benchmark_cpu_info_,
+                                  +1, 0);
+  EXPECT_EQ(0, max_diff);  // Outputs must be identical.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Unaligned) {  // off = 1.
+  int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
+                                  benchmark_iterations_,
+                                  disable_cpu_flags_, benchmark_cpu_info_,
+                                  +1, 1);
+  EXPECT_EQ(0, max_diff);  // Outputs must be identical.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Invert) {  // invert = -1.
+  int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
+                                  benchmark_iterations_,
+                                  disable_cpu_flags_, benchmark_cpu_info_,
+                                  -1, 0);
+  EXPECT_EQ(0, max_diff);  // Outputs must be identical.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Opt) {  // Full width, off = 0.
+  int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
+                                  benchmark_iterations_,
+                                  disable_cpu_flags_, benchmark_cpu_info_,
+                                  +1, 0);
+  EXPECT_EQ(0, max_diff);  // Outputs must be identical.
+}
+
+static int TestSobelXY(int width, int height, int benchmark_iterations,
+                       int disable_cpu_flags, int benchmark_cpu_info,
+                       int invert, int off) {
+  // Returns the largest per-byte difference between one ARGBSobelXY run
+  // under disable_cpu_flags and the runs under benchmark_cpu_info.
+  width = (width < 1) ? 1 : width;
+  const int kRowBytes = width * 4;  // 4 bytes per pixel.
+  const int kPlaneBytes = kRowBytes * height;
+  const int kSignedHeight = invert * height;
+  align_buffer_64(src_argb_a, kPlaneBytes + off);
+  align_buffer_64(dst_argb_c, kPlaneBytes);
+  align_buffer_64(dst_argb_opt, kPlaneBytes);
+  // Zero the whole source allocation, then randomize from `off` onward.
+  memset(src_argb_a, 0, kPlaneBytes + off);
+  uint8* const in_a = src_argb_a + off;
+  for (int n = 0; n < kPlaneBytes; ++n) {
+    in_a[n] = (fastrand() & 0xff);
+  }
+  memset(dst_argb_c, 0, kPlaneBytes);
+  memset(dst_argb_opt, 0, kPlaneBytes);
+
+  MaskCpuFlags(disable_cpu_flags);  // Single pass under this mask.
+  ARGBSobelXY(in_a, kRowBytes, dst_argb_c, kRowBytes,
+              width, kSignedHeight);
+  MaskCpuFlags(benchmark_cpu_info);  // Repeated passes under this mask.
+  for (int pass = 0; pass < benchmark_iterations; ++pass) {
+    ARGBSobelXY(in_a, kRowBytes, dst_argb_opt, kRowBytes,
+                width, kSignedHeight);
+  }
+
+  // Largest absolute per-byte difference between the two outputs.
+  int max_diff = 0;
+  for (int n = 0; n < kPlaneBytes; ++n) {
+    const int delta = static_cast<int>(dst_argb_c[n]) -
+                      static_cast<int>(dst_argb_opt[n]);
+    max_diff = (abs(delta) > max_diff) ? abs(delta) : max_diff;
+  }
+  free_aligned_buffer_64(src_argb_a);
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelXY_Any) {  // Width reduced by 1.
+  int max_diff = TestSobelXY(benchmark_width_ - 1, benchmark_height_,
+                             benchmark_iterations_,
+                             disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_EQ(0, max_diff);  // Outputs must be identical.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelXY_Unaligned) {  // off = 1.
+  int max_diff = TestSobelXY(benchmark_width_, benchmark_height_,
+                             benchmark_iterations_,
+                             disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
+  EXPECT_EQ(0, max_diff);  // Outputs must be identical.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelXY_Invert) {  // invert = -1.
+  int max_diff = TestSobelXY(benchmark_width_, benchmark_height_,
+                             benchmark_iterations_,
+                             disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
+  EXPECT_EQ(0, max_diff);  // Outputs must be identical.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBSobelXY_Opt) {  // Full width, off = 0.
+  int max_diff = TestSobelXY(benchmark_width_, benchmark_height_,
+                             benchmark_iterations_,
+                             disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
+  EXPECT_EQ(0, max_diff);  // Outputs must be identical.
+}
+
+static int TestBlur(int width, int height, int benchmark_iterations,
+                    int disable_cpu_flags, int benchmark_cpu_info,
+                    int invert, int off, int radius) {
+  // Returns the largest per-byte difference between one ARGBBlur run
+  // under disable_cpu_flags and the runs under benchmark_cpu_info.
+  width = (width < 1) ? 1 : width;
+  const int kRowBytes = width * 4;  // 4 bytes per pixel.
+  const int kPlaneBytes = kRowBytes * height;
+  const int kCumsumBytes = width * height * 16;
+  const int kSignedHeight = invert * height;
+  align_buffer_64(src_argb_a, kPlaneBytes + off);
+  align_buffer_64(dst_cumsum, kCumsumBytes);
+  align_buffer_64(dst_argb_c, kPlaneBytes);
+  align_buffer_64(dst_argb_opt, kPlaneBytes);
+  uint8* const in_a = src_argb_a + off;
+  for (int n = 0; n < kPlaneBytes; ++n) {
+    in_a[n] = (fastrand() & 0xff);
+  }
+  memset(dst_cumsum, 0, kCumsumBytes);
+  memset(dst_argb_c, 0, kPlaneBytes);
+  memset(dst_argb_opt, 0, kPlaneBytes);
+  // Scratch buffer viewed as int32 and passed with a stride of width * 4.
+  int32* const cumsum = reinterpret_cast<int32*>(dst_cumsum);
+
+  MaskCpuFlags(disable_cpu_flags);  // Single pass under this mask.
+  ARGBBlur(in_a, kRowBytes, dst_argb_c, kRowBytes, cumsum, width * 4,
+           width, kSignedHeight, radius);
+  MaskCpuFlags(benchmark_cpu_info);  // Repeated passes under this mask.
+  for (int pass = 0; pass < benchmark_iterations; ++pass) {
+    ARGBBlur(in_a, kRowBytes, dst_argb_opt, kRowBytes, cumsum, width * 4,
+             width, kSignedHeight, radius);
+  }
+
+  // Largest absolute per-byte difference between the two outputs.
+  int max_diff = 0;
+  for (int n = 0; n < kPlaneBytes; ++n) {
+    const int delta = static_cast<int>(dst_argb_c[n]) -
+                      static_cast<int>(dst_argb_opt[n]);
+    max_diff = (abs(delta) > max_diff) ? abs(delta) : max_diff;
+  }
+  free_aligned_buffer_64(src_argb_a);
+  free_aligned_buffer_64(dst_cumsum);
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+  return max_diff;
+}
+
+static const int kBlurSize = 55;  // radius argument for the ARGBBlur_* tests.
+TEST_F(LibYUVPlanarTest, ARGBBlur_Any) {  // Width reduced by 1.
+  int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          +1, 0, kBlurSize);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlur_Unaligned) {  // off = 1.
+  int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          +1, 1, kBlurSize);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlur_Invert) {  // invert = -1.
+  int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          -1, 0, kBlurSize);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlur_Opt) {  // Full width, off = 0.
+  int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          +1, 0, kBlurSize);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+static const int kBlurSmallSize = 5;  // radius for the ARGBBlurSmall_* tests.
+TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Any) {  // Width reduced by 1.
+  int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          +1, 0, kBlurSmallSize);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Unaligned) {  // off = 1.
+  int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          +1, 1, kBlurSmallSize);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Invert) {  // invert = -1.
+  int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          -1, 0, kBlurSmallSize);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Opt) {  // Full width, off = 0.
+  int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+                          benchmark_iterations_,
+                          disable_cpu_flags_, benchmark_cpu_info_,
+                          +1, 0, kBlurSmallSize);
+  EXPECT_LE(max_diff, 1);  // At most 1 apart per byte.
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
+  memset(orig_pixels, 0, sizeof(orig_pixels));
+  // 16 floats in four rows, labelled C0 .. C3 by the inline comments.
+  SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = {
+    0.94230f,  -3.03300f,    -2.92500f,  0.f,  // C0
+    0.584500f,  1.112000f,    1.535000f, 1.f,  // C1 x
+    0.001313f, -0.002503f,   -0.004496f, 0.f,  // C2 x * x
+    0.0f,       0.000006965f, 0.000008781f, 0.f,  // C3 x * x * x
+  };
+
+  // Test blue
+  orig_pixels[0][0] = 255u;
+  orig_pixels[0][1] = 0u;
+  orig_pixels[0][2] = 0u;
+  orig_pixels[0][3] = 128u;
+  // Test green
+  orig_pixels[1][0] = 0u;
+  orig_pixels[1][1] = 255u;
+  orig_pixels[1][2] = 0u;
+  orig_pixels[1][3] = 0u;
+  // Test red
+  orig_pixels[2][0] = 0u;
+  orig_pixels[2][1] = 0u;
+  orig_pixels[2][2] = 255u;
+  orig_pixels[2][3] = 255u;
+  // Test white
+  orig_pixels[3][0] = 255u;
+  orig_pixels[3][1] = 255u;
+  orig_pixels[3][2] = 255u;
+  orig_pixels[3][3] = 255u;
+  // Test color
+  orig_pixels[4][0] = 16u;
+  orig_pixels[4][1] = 64u;
+  orig_pixels[4][2] = 192u;
+  orig_pixels[4][3] = 224u;
+  // Do 16 to test asm version.
+  ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+                 &kWarmifyPolynomial[0], 16, 1);
+  EXPECT_EQ(235u, dst_pixels_opt[0][0]);
+  EXPECT_EQ(0u, dst_pixels_opt[0][1]);
+  EXPECT_EQ(0u, dst_pixels_opt[0][2]);
+  EXPECT_EQ(128u, dst_pixels_opt[0][3]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][0]);
+  EXPECT_EQ(233u, dst_pixels_opt[1][1]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][2]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][3]);
+  EXPECT_EQ(0u, dst_pixels_opt[2][0]);
+  EXPECT_EQ(0u, dst_pixels_opt[2][1]);
+  EXPECT_EQ(241u, dst_pixels_opt[2][2]);
+  EXPECT_EQ(255u, dst_pixels_opt[2][3]);
+  EXPECT_EQ(235u, dst_pixels_opt[3][0]);
+  EXPECT_EQ(233u, dst_pixels_opt[3][1]);
+  EXPECT_EQ(241u, dst_pixels_opt[3][2]);
+  EXPECT_EQ(255u, dst_pixels_opt[3][3]);
+  EXPECT_EQ(10u, dst_pixels_opt[4][0]);
+  EXPECT_EQ(59u, dst_pixels_opt[4][1]);
+  EXPECT_EQ(188u, dst_pixels_opt[4][2]);
+  EXPECT_EQ(224u, dst_pixels_opt[4][3]);
+  // Refill every pixel with i, i / 2, i / 3, i (each stored into a uint8).
+  for (int i = 0; i < 1280; ++i) {
+    orig_pixels[i][0] = i;
+    orig_pixels[i][1] = i / 2;
+    orig_pixels[i][2] = i / 3;
+    orig_pixels[i][3] = i;
+  }
+  // One pass under disable_cpu_flags_, written to dst_pixels_c.
+  MaskCpuFlags(disable_cpu_flags_);
+  ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0,
+                 &kWarmifyPolynomial[0], 1280, 1);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+                   &kWarmifyPolynomial[0], 1280, 1);
+  }
+  // Outputs from the two flag settings must match byte for byte.
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+    EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+    EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+    EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
+  memset(orig_pixels, 0, sizeof(orig_pixels));  // Unset pixels stay zero.
+
+  align_buffer_64(lumacolortable, 32768);
+  int v = 0;
+  for (int i = 0; i < 32768; ++i) {
+    lumacolortable[i] = v;  // v is i * 3 at this point.
+    v += 3;
+  }
+  // Pixel 0: test blue (byte 0 = 255), byte 3 = 128.
+  orig_pixels[0][0] = 255u;
+  orig_pixels[0][1] = 0u;
+  orig_pixels[0][2] = 0u;
+  orig_pixels[0][3] = 128u;
+  // Pixel 1: test green (byte 1 = 255), byte 3 = 0.
+  orig_pixels[1][0] = 0u;
+  orig_pixels[1][1] = 255u;
+  orig_pixels[1][2] = 0u;
+  orig_pixels[1][3] = 0u;
+  // Pixel 2: test red (byte 2 = 255), byte 3 = 255.
+  orig_pixels[2][0] = 0u;
+  orig_pixels[2][1] = 0u;
+  orig_pixels[2][2] = 255u;
+  orig_pixels[2][3] = 255u;
+  // Pixel 3: test a mixed color.
+  orig_pixels[3][0] = 16u;
+  orig_pixels[3][1] = 64u;
+  orig_pixels[3][2] = 192u;
+  orig_pixels[3][3] = 224u;
+  // Do 16 (4 set above, the remaining 12 zero) to test asm version.
+  ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+                     &lumacolortable[0], 16, 1);
+  EXPECT_EQ(253u, dst_pixels_opt[0][0]);
+  EXPECT_EQ(0u, dst_pixels_opt[0][1]);
+  EXPECT_EQ(0u, dst_pixels_opt[0][2]);
+  EXPECT_EQ(128u, dst_pixels_opt[0][3]);  // Byte 3 equals the input.
+  EXPECT_EQ(0u, dst_pixels_opt[1][0]);
+  EXPECT_EQ(253u, dst_pixels_opt[1][1]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][2]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][3]);  // Byte 3 equals the input.
+  EXPECT_EQ(0u, dst_pixels_opt[2][0]);
+  EXPECT_EQ(0u, dst_pixels_opt[2][1]);
+  EXPECT_EQ(253u, dst_pixels_opt[2][2]);
+  EXPECT_EQ(255u, dst_pixels_opt[2][3]);  // Byte 3 equals the input.
+  EXPECT_EQ(48u, dst_pixels_opt[3][0]);
+  EXPECT_EQ(192u, dst_pixels_opt[3][1]);
+  EXPECT_EQ(64u, dst_pixels_opt[3][2]);
+  EXPECT_EQ(224u, dst_pixels_opt[3][3]);  // Byte 3 equals the input.
+
+  for (int i = 0; i < 1280; ++i) {  // Refill all 1280 pixels with a ramp.
+    orig_pixels[i][0] = i;
+    orig_pixels[i][1] = i / 2;
+    orig_pixels[i][2] = i / 3;
+    orig_pixels[i][3] = i;
+  }
+
+  MaskCpuFlags(disable_cpu_flags_);  // Reference pass into dst_pixels_c.
+  ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0,
+                     lumacolortable, 1280, 1);
+  MaskCpuFlags(benchmark_cpu_info_);  // Flags used for the timed passes.
+
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+                       lumacolortable, 1280, 1);
+  }
+  for (int i = 0; i < 1280; ++i) {  // Both passes must agree on every byte.
+    EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+    EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+    EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+    EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
+  }
+
+  free_aligned_buffer_64(lumacolortable);
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBCopyAlpha) {
+  const int kStride = benchmark_width_ * 4;  // 4 bytes per pixel.
+  const int kSize = benchmark_width_ * benchmark_height_ * 4;
+  align_buffer_64(orig_pixels, kSize);
+  align_buffer_64(dst_pixels_opt, kSize);
+  align_buffer_64(dst_pixels_c, kSize);
+
+  // Both destinations start from the same random bytes.
+  MemRandomize(orig_pixels, kSize);
+  MemRandomize(dst_pixels_opt, kSize);
+  memcpy(dst_pixels_c, dst_pixels_opt, kSize);
+
+  MaskCpuFlags(disable_cpu_flags_);
+  ARGBCopyAlpha(orig_pixels, kStride, dst_pixels_c, kStride,
+                benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int iter = 0; iter < benchmark_iterations_; ++iter) {
+    ARGBCopyAlpha(orig_pixels, kStride, dst_pixels_opt, kStride,
+                  benchmark_width_, benchmark_height_);
+  }
+  for (int i = 0; i < kSize; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+
+  free_aligned_buffer_64(dst_pixels_c);
+  free_aligned_buffer_64(dst_pixels_opt);
+  free_aligned_buffer_64(orig_pixels);
+}
+
+TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
+  const int kPixels = benchmark_width_ * benchmark_height_;
+  const int kDstSize = kPixels * 4;  // Destination has 4 bytes per pixel.
+  const int kDstStride = benchmark_width_ * 4;
+  align_buffer_64(orig_pixels, kPixels);
+  align_buffer_64(dst_pixels_opt, kDstSize);
+  align_buffer_64(dst_pixels_c, kDstSize);
+
+  MemRandomize(orig_pixels, kPixels);
+  MemRandomize(dst_pixels_opt, kDstSize);
+  memcpy(dst_pixels_c, dst_pixels_opt, kDstSize);  // Same starting bytes.
+
+  MaskCpuFlags(disable_cpu_flags_);
+  ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c, kDstStride,
+                   benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int iter = 0; iter < benchmark_iterations_; ++iter) {
+    ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt, kDstStride,
+                     benchmark_width_, benchmark_height_);
+  }
+  for (int i = 0; i < kDstSize; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+
+  free_aligned_buffer_64(dst_pixels_c);
+  free_aligned_buffer_64(dst_pixels_opt);
+  free_aligned_buffer_64(orig_pixels);
+}
+
+static int TestARGBRect(int width, int height, int benchmark_iterations,
+                        int disable_cpu_flags, int benchmark_cpu_info,
+                        int invert, int off, int bpp) {
+  // Fills a rectangle once under disable_cpu_flags and benchmark_iterations
+  // times under benchmark_cpu_info, then returns the largest byte difference.
+  width = (width < 1) ? 1 : width;
+  const bool is_argb = (bpp == 4);  // Any other bpp exercises SetPlane.
+  const int kStride = width * bpp;
+  const int kSize = kStride * height;
+  const int signed_height = invert * height;
+  const uint32 v32 = fastrand() & (is_argb ? 0xffffffff : 0xff);
+
+  align_buffer_64(dst_argb_c, kSize + off);
+  align_buffer_64(dst_argb_opt, kSize + off);
+
+  MemRandomize(dst_argb_c + off, kSize);
+  memcpy(dst_argb_opt + off, dst_argb_c + off, kSize);
+
+  MaskCpuFlags(disable_cpu_flags);
+  if (is_argb) {
+    ARGBRect(dst_argb_c + off, kStride, 0, 0, width, signed_height, v32);
+  } else {
+    SetPlane(dst_argb_c + off, kStride, width, signed_height, v32);
+  }
+
+  MaskCpuFlags(benchmark_cpu_info);
+  for (int iter = 0; iter < benchmark_iterations; ++iter) {
+    if (is_argb) {
+      ARGBRect(dst_argb_opt + off, kStride, 0, 0, width, signed_height, v32);
+    } else {
+      SetPlane(dst_argb_opt + off, kStride, width, signed_height, v32);
+    }
+  }
+  // Only the kSize bytes that follow the off-byte prefix are compared.
+  int max_diff = 0;
+  for (int i = 0; i < kSize; ++i) {
+    const int delta = abs(static_cast<int>(dst_argb_c[i + off]) -
+                          static_cast<int>(dst_argb_opt[i + off]));
+    max_diff = (delta > max_diff) ? delta : max_diff;
+  }
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, ARGBRect_Any) {
+  // ARGB (bpp 4), width one less than the benchmark width, no buffer offset.
+  const int max_diff = TestARGBRect(
+      benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 4);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBRect_Unaligned) {
+  // ARGB (bpp 4) with the destination starting 1 byte into its buffer.
+  const int max_diff = TestARGBRect(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 4);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBRect_Invert) {
+  // ARGB (bpp 4) with invert = -1, so a negated height is passed down.
+  const int max_diff = TestARGBRect(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 4);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, ARGBRect_Opt) {
+  // ARGB (bpp 4) at the full benchmark size with no buffer offset.
+  const int max_diff = TestARGBRect(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 4);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, SetPlane_Any) {
+  // One byte per pixel (SetPlane), width one less than the benchmark width.
+  const int max_diff = TestARGBRect(
+      benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, SetPlane_Unaligned) {
+  // One byte per pixel (SetPlane), destination starts 1 byte into its buffer.
+  const int max_diff = TestARGBRect(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, SetPlane_Invert) {
+  // One byte per pixel (SetPlane) with invert = -1, i.e. a negated height.
+  const int max_diff = TestARGBRect(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(LibYUVPlanarTest, SetPlane_Opt) {
+  // One byte per pixel (SetPlane) at the full benchmark size, no offset.
+  const int max_diff = TestARGBRect(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
+  EXPECT_EQ(0, max_diff);
+}
+
+}  // namespace libyuv
diff --git a/libs/libyuv/unit_test/rotate_argb_test.cc b/libs/libyuv/unit_test/rotate_argb_test.cc
new file mode 100644
index 0000000000..24640800a1
--- /dev/null
+++ b/libs/libyuv/unit_test/rotate_argb_test.cc
@@ -0,0 +1,197 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+void TestRotateBpp(int src_width, int src_height,
+                   int dst_width, int dst_height,
+                   libyuv::RotationMode mode,
+                   int benchmark_iterations,
+                   int disable_cpu_flags,
+                   int benchmark_cpu_info,
+                   const int kBpp) {
+  // Rotates a random image once with disable_cpu_flags and
+  // benchmark_iterations times with benchmark_cpu_info, then expects the two
+  // destination buffers to be byte-identical. kBpp selects RotatePlane (1) or
+  // ARGBRotate (4); for any other value no rotation call is made.
+  src_width = (src_width < 1) ? 1 : src_width;
+  src_height = (src_height < 1) ? 1 : src_height;
+  dst_width = (dst_width < 1) ? 1 : dst_width;
+  dst_height = (dst_height < 1) ? 1 : dst_height;
+
+  // Source image: kBpp bytes per pixel, filled with random bytes.
+  const int src_stride_argb = src_width * kBpp;
+  const int src_argb_plane_size = src_stride_argb * abs(src_height);
+  align_buffer_64(src_argb, src_argb_plane_size);
+  for (int i = 0; i < src_argb_plane_size; ++i) {
+    src_argb[i] = fastrand() & 0xff;
+  }
+
+  // The two destinations start with distinct fill values (2 and 3).
+  const int dst_stride_argb = dst_width * kBpp;
+  const int dst_argb_plane_size = dst_stride_argb * dst_height;
+  align_buffer_64(dst_argb_c, dst_argb_plane_size);
+  align_buffer_64(dst_argb_opt, dst_argb_plane_size);
+  memset(dst_argb_c, 2, dst_argb_plane_size);
+  memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+  switch (kBpp) {
+    case 1:
+      MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
+      RotatePlane(src_argb, src_stride_argb, dst_argb_c, dst_stride_argb,
+                  src_width, src_height, mode);
+
+      MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
+      for (int iter = 0; iter < benchmark_iterations; ++iter) {
+        RotatePlane(src_argb, src_stride_argb, dst_argb_opt, dst_stride_argb,
+                    src_width, src_height, mode);
+      }
+      break;
+    case 4:
+      MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
+      ARGBRotate(src_argb, src_stride_argb, dst_argb_c, dst_stride_argb,
+                 src_width, src_height, mode);
+
+      MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
+      for (int iter = 0; iter < benchmark_iterations; ++iter) {
+        ARGBRotate(src_argb, src_stride_argb, dst_argb_opt, dst_stride_argb,
+                   src_width, src_height, mode);
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Rotation should be exact.
+  for (int i = 0; i < dst_argb_plane_size; ++i) {
+    EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);
+  }
+
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+  free_aligned_buffer_64(src_argb);
+}
+
+static void ARGBTestRotate(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           libyuv::RotationMode mode,
+                           int benchmark_iterations,
+                           int disable_cpu_flags,
+                           int benchmark_cpu_info) {
+  const int kBytesPerPixel = 4;  // Selects the ARGBRotate path.
+  TestRotateBpp(src_width, src_height, dst_width, dst_height, mode,
+                benchmark_iterations, disable_cpu_flags, benchmark_cpu_info,
+                kBytesPerPixel);
+}
+
+TEST_F(LibYUVRotateTest, ARGBRotate0_Opt) {
+  // kRotate0: destination has the same width and height as the source.
+  ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+                 benchmark_height_, kRotate0, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, ARGBRotate90_Opt) {
+  // kRotate90: destination width and height are swapped.
+  ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+                 benchmark_width_, kRotate90, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, ARGBRotate180_Opt) {
+  // kRotate180: destination has the same width and height as the source.
+  ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+                 benchmark_height_, kRotate180, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, ARGBRotate270_Opt) {
+  // kRotate270: destination width and height are swapped.
+  ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+                 benchmark_width_, kRotate270, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+static void TestRotatePlane(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            libyuv::RotationMode mode,
+                            int benchmark_iterations,
+                            int disable_cpu_flags,
+                            int benchmark_cpu_info) {
+  const int kBytesPerPixel = 1;  // Selects the RotatePlane path.
+  TestRotateBpp(src_width, src_height, dst_width, dst_height, mode,
+                benchmark_iterations, disable_cpu_flags, benchmark_cpu_info,
+                kBytesPerPixel);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane0_Opt) {
+  // kRotate0: destination has the same width and height as the source.
+  TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_width_,
+                  benchmark_height_, kRotate0, benchmark_iterations_,
+                  disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane90_Opt) {
+  // kRotate90: destination width and height are swapped.
+  TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_height_,
+                  benchmark_width_, kRotate90, benchmark_iterations_,
+                  disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane180_Opt) {
+  // kRotate180: destination has the same width and height as the source.
+  TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_width_,
+                  benchmark_height_, kRotate180, benchmark_iterations_,
+                  disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, RotatePlane270_Opt) {
+  // kRotate270: destination width and height are swapped.
+  TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_height_,
+                  benchmark_width_, kRotate270, benchmark_iterations_,
+                  disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_RotatePlane0_Odd) {
+  TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,  // Source.
+                  benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+                  benchmark_iterations_, disable_cpu_flags_,
+                  benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_RotatePlane90_Odd) {
+  TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,  // Source.
+                  benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+                  benchmark_iterations_, disable_cpu_flags_,
+                  benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_RotatePlane180_Odd) {
+  TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,  // Source.
+                  benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+                  benchmark_iterations_, disable_cpu_flags_,
+                  benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
+  TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,  // Source.
+                  benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+                  benchmark_iterations_, disable_cpu_flags_,
+                  benchmark_cpu_info_);
+}
+
+}  // namespace libyuv
diff --git a/libs/libyuv/unit_test/rotate_test.cc b/libs/libyuv/unit_test/rotate_test.cc
new file mode 100644
index 0000000000..1f5b86e95a
--- /dev/null
+++ b/libs/libyuv/unit_test/rotate_test.cc
@@ -0,0 +1,297 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/rotate.h"
+#include "libyuv/row.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+static void I420TestRotate(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           libyuv::RotationMode mode,
+                           int benchmark_iterations,
+                           int disable_cpu_flags, int benchmark_cpu_info) {
+  // Rotates a random I420 image once with disable_cpu_flags and
+  // benchmark_iterations times with benchmark_cpu_info, then expects the two
+  // destination images to be byte-identical. A zero source height becomes 1;
+  // a negative source height is passed through to I420Rotate unchanged.
+  src_width = (src_width < 1) ? 1 : src_width;
+  src_height = (src_height == 0) ? 1 : src_height;
+  dst_width = (dst_width < 1) ? 1 : dst_width;
+  dst_height = (dst_height < 1) ? 1 : dst_height;
+
+  // Source buffer: Y plane, then two chroma planes of rounded-up half size.
+  const int src_abs_height = Abs(src_height);
+  const int src_stride_uv = (src_width + 1) / 2;
+  const int src_i420_y_size = src_width * src_abs_height;
+  const int src_i420_uv_size = src_stride_uv * ((src_abs_height + 1) / 2);
+  const int src_i420_size = src_i420_y_size + src_i420_uv_size * 2;
+  align_buffer_64(src_i420, src_i420_size);
+  for (int i = 0; i < src_i420_size; ++i) {
+    src_i420[i] = fastrand() & 0xff;
+  }
+
+  // Destination buffers use the same three-plane layout.
+  const int dst_stride_uv = (dst_width + 1) / 2;
+  const int dst_i420_y_size = dst_width * dst_height;
+  const int dst_i420_uv_size = dst_stride_uv * ((dst_height + 1) / 2);
+  const int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
+  align_buffer_64(dst_i420_c, dst_i420_size);
+  align_buffer_64(dst_i420_opt, dst_i420_size);
+  // The two destinations start with distinct fill values (2 and 3).
+  memset(dst_i420_c, 2, dst_i420_size);
+  memset(dst_i420_opt, 3, dst_i420_size);
+
+  MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
+  I420Rotate(src_i420, src_width,
+             src_i420 + src_i420_y_size, src_stride_uv,
+             src_i420 + src_i420_y_size + src_i420_uv_size, src_stride_uv,
+             dst_i420_c, dst_width,
+             dst_i420_c + dst_i420_y_size, dst_stride_uv,
+             dst_i420_c + dst_i420_y_size + dst_i420_uv_size, dst_stride_uv,
+             src_width, src_height, mode);
+
+  MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
+  for (int iter = 0; iter < benchmark_iterations; ++iter) {
+    I420Rotate(src_i420, src_width,
+               src_i420 + src_i420_y_size, src_stride_uv,
+               src_i420 + src_i420_y_size + src_i420_uv_size, src_stride_uv,
+               dst_i420_opt, dst_width,
+               dst_i420_opt + dst_i420_y_size, dst_stride_uv,
+               dst_i420_opt + dst_i420_y_size + dst_i420_uv_size, dst_stride_uv,
+               src_width, src_height, mode);
+  }
+
+  // Rotation should be exact.
+  for (int i = 0; i < dst_i420_size; ++i) {
+    EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+  }
+
+  free_aligned_buffer_64(dst_i420_c);
+  free_aligned_buffer_64(dst_i420_opt);
+  free_aligned_buffer_64(src_i420);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate0_Opt) {
+  // kRotate0: destination has the same width and height as the source.
+  I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+                 benchmark_height_, kRotate0, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate90_Opt) {
+  // kRotate90: destination width and height are swapped.
+  I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+                 benchmark_width_, kRotate90, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate180_Opt) {
+  // kRotate180: destination has the same width and height as the source.
+  I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+                 benchmark_height_, kRotate180, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I420Rotate270_Opt) {
+  // kRotate270: destination width and height are swapped.
+  I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+                 benchmark_width_, kRotate270, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+// TODO(fbarchard): Remove odd width tests.
+// Odd width tests work but disabled because they use C code and can be
+// tested by passing an odd width command line or environment variable.
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) {
+  I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,  // Source.
+                 benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+                 benchmark_iterations_, disable_cpu_flags_,
+                 benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) {
+  I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,  // Source.
+                 benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+                 benchmark_iterations_, disable_cpu_flags_,
+                 benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) {
+  I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,  // Source.
+                 benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+                 benchmark_iterations_, disable_cpu_flags_,
+                 benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) {
+  I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,  // Source.
+                 benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+                 benchmark_iterations_, disable_cpu_flags_,
+                 benchmark_cpu_info_);
+}
+
+static void NV12TestRotate(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           libyuv::RotationMode mode,
+                           int benchmark_iterations,
+                           int disable_cpu_flags, int benchmark_cpu_info) {
+  // Converts random NV12 to rotated I420 with disable_cpu_flags, then with
+  // benchmark_cpu_info, and expects byte-identical I420 output.
+  src_width = (src_width < 1) ? 1 : src_width;
+  src_height = (src_height == 0) ? 1 : src_height;  // Negative: inversion test.
+  dst_width = (dst_width < 1) ? 1 : dst_width;
+  dst_height = (dst_height < 1) ? 1 : dst_height;
+
+  // Source buffer: Y plane followed by a single UV plane.
+  const int src_abs_height = Abs(src_height);
+  const int src_stride_uv = (src_width + 1) & ~1;
+  const int src_nv12_y_size = src_width * src_abs_height;
+  const int src_nv12_uv_size =
+      ((src_width + 1) / 2) * ((src_abs_height + 1) / 2) * 2;
+  const int src_nv12_size = src_nv12_y_size + src_nv12_uv_size;
+  align_buffer_64(src_nv12, src_nv12_size);
+  for (int i = 0; i < src_nv12_size; ++i) {
+    src_nv12[i] = fastrand() & 0xff;
+  }
+
+  // Destination buffers: Y plane, then two chroma planes; filled with 2 and 3.
+  const int dst_stride_uv = (dst_width + 1) / 2;
+  const int dst_i420_y_size = dst_width * dst_height;
+  const int dst_i420_uv_size = dst_stride_uv * ((dst_height + 1) / 2);
+  const int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
+  align_buffer_64(dst_i420_c, dst_i420_size);
+  align_buffer_64(dst_i420_opt, dst_i420_size);
+  memset(dst_i420_c, 2, dst_i420_size);
+  memset(dst_i420_opt, 3, dst_i420_size);
+
+  MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
+  NV12ToI420Rotate(src_nv12, src_width,
+                   src_nv12 + src_nv12_y_size, src_stride_uv,
+                   dst_i420_c, dst_width,
+                   dst_i420_c + dst_i420_y_size, dst_stride_uv,
+                   dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
+                   dst_stride_uv,
+                   src_width, src_height, mode);
+
+  MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
+  for (int iter = 0; iter < benchmark_iterations; ++iter) {
+    NV12ToI420Rotate(src_nv12, src_width,
+                     src_nv12 + src_nv12_y_size, src_stride_uv,
+                     dst_i420_opt, dst_width,
+                     dst_i420_opt + dst_i420_y_size, dst_stride_uv,
+                     dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
+                     dst_stride_uv,
+                     src_width, src_height, mode);
+  }
+
+  // Rotation should be exact.
+  for (int i = 0; i < dst_i420_size; ++i) {
+    EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+  }
+
+  free_aligned_buffer_64(dst_i420_c);
+  free_aligned_buffer_64(dst_i420_opt);
+  free_aligned_buffer_64(src_nv12);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate0_Opt) {
+  // kRotate0: destination has the same width and height as the source.
+  NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+                 benchmark_height_, kRotate0, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate90_Opt) {
+  // kRotate90: destination width and height are swapped.
+  NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+                 benchmark_width_, kRotate90, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate180_Opt) {
+  // kRotate180: destination has the same width and height as the source.
+  NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+                 benchmark_height_, kRotate180, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) {
+  // kRotate270: destination width and height are swapped.
+  NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+                 benchmark_width_, kRotate270, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) {
+  NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,  // Source.
+                 benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+                 benchmark_iterations_, disable_cpu_flags_,
+                 benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) {
+  NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,  // Source.
+                 benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+                 benchmark_iterations_, disable_cpu_flags_,
+                 benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) {
+  NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,  // Source.
+                 benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+                 benchmark_iterations_, disable_cpu_flags_,
+                 benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) {
+  NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,  // Source.
+                 benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+                 benchmark_iterations_, disable_cpu_flags_,
+                 benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate0_Invert) {
+  // Negated source height (inversion test); destination keeps the same size.
+  NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
+                 benchmark_height_, kRotate0, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate90_Invert) {
+  // Negated source height (inversion test); destination size is swapped.
+  NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
+                 benchmark_width_, kRotate90, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate180_Invert) {
+  // Negated source height (inversion test); destination keeps the same size.
+  NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_,
+                 benchmark_height_, kRotate180, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
+  // Negated source height (inversion test); destination size is swapped.
+  NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_,
+                 benchmark_width_, kRotate270, benchmark_iterations_,
+                 disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+
+
+
+
+}  // namespace libyuv
diff --git a/libs/libyuv/unit_test/scale_argb_test.cc b/libs/libyuv/unit_test/scale_argb_test.cc
new file mode 100644
index 0000000000..e85eb2a545
--- /dev/null
+++ b/libs/libyuv/unit_test/scale_argb_test.cc
@@ -0,0 +1,456 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/scale_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+#define STRINGIZE(line) #line
+#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+// Also returns 0, after printing a message, if a buffer allocation fails.
+static int ARGBTestFilter(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          FilterMode f, int benchmark_iterations,
+                          int disable_cpu_flags, int benchmark_cpu_info) {
+  int i, j;
+  const int b = 0;  // Border in pixels; 128 to test for padding/stride.
+  int64 src_argb_plane_size = (Abs(src_width) + b * 2) *
+      (Abs(src_height) + b * 2) * 4LL;
+  int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
+  align_buffer_page_end(src_argb, src_argb_plane_size);
+  if (!src_argb) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
+  }
+  MemRandomize(src_argb, src_argb_plane_size);
+
+  int64 dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4LL;
+  int dst_stride_argb = (b * 2 + dst_width) * 4;
+
+  align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
+  align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
+  if (!dst_argb_c || !dst_argb_opt) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
+  }
+  memset(dst_argb_c, 2, dst_argb_plane_size);
+  memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+  // Warm up both versions for consistent benchmarks.
+  MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
+  ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+            src_width, src_height,
+            dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+            dst_width, dst_height, f);
+  MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
+  ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+            src_width, src_height,
+            dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+            dst_width, dst_height, f);
+
+  MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
+  double c_time = get_time();
+  ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+            src_width, src_height,
+            dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+            dst_width, dst_height, f);
+  // One C call is timed; the Opt loop below is averaged per iteration.
+  c_time = (get_time() - c_time);
+
+  MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
+  double opt_time = get_time();
+  for (i = 0; i < benchmark_iterations; ++i) {
+    ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+              src_width, src_height,
+              dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+              dst_width, dst_height, f);
+  }
+  opt_time = (get_time() - opt_time) / benchmark_iterations;
+
+  // Report performance of C vs OPT
+  printf("filter %d - %8d us C - %8d us OPT\n",
+         f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+  // C version may be a little off from the optimized. Order of
+  //  operations may introduce rounding somewhere. So measure the largest
+  //  absolute byte difference between the two outputs; the caller decides
+  //  how much difference is acceptable.
+  int max_diff = 0;
+  for (i = b; i < (dst_height + b); ++i) {
+    for (j = b * 4; j < (dst_width + b) * 4; ++j) {
+      int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
+                         dst_argb_opt[(i * dst_stride_argb) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+    }
+  }
+
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  free_aligned_buffer_page_end(src_argb);
+  return max_diff;
+}
+
+static const int kTileX = 8;  // Tile width in destination pixels.
+static const int kTileY = 8;  // Tile height in destination pixels.
+// Scale tile by tile; returns the first non-zero ARGBScaleClip result, else 0.
+static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
+                         int src_width, int src_height,
+                         uint8* dst_argb, int dst_stride_argb,
+                         int dst_width, int dst_height,
+                         FilterMode filtering) {
+  for (int y = 0; y < dst_height; y += kTileY) {
+    for (int x = 0; x < dst_width; x += kTileX) {
+      int clip_width = kTileX;
+      if (x + clip_width > dst_width) {
+        clip_width = dst_width - x;  // Partial tile at the right edge.
+      }
+      int clip_height = kTileY;
+      if (y + clip_height > dst_height) {
+        clip_height = dst_height - y;  // Partial tile at the bottom edge.
+      }
+      int r = ARGBScaleClip(src_argb, src_stride_argb,
+                            src_width, src_height,
+                            dst_argb, dst_stride_argb,
+                            dst_width, dst_height,
+                            x, y, clip_width, clip_height, filtering);
+      if (r) {
+        return r;  // Stop at the first non-zero result.
+      }
+    }
+  }
+  return 0;
+}
+
+// Scale the full image with ARGBScale, then again one tile at a time with
+// ARGBScaleClip, and return the maximum byte difference. 0 = exact.
+static int ARGBClipTestFilter(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              FilterMode f, int benchmark_iterations) {
+  const int b = 128;  // Border in pixels around source and destination.
+  // 4LL does the bytes-per-pixel multiply in 64 bits, as ARGBTestFilter does.
+  int64 src_argb_plane_size = (Abs(src_width) + b * 2) *
+      (Abs(src_height) + b * 2) * 4LL;
+  int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
+
+  align_buffer_64(src_argb, src_argb_plane_size);
+  if (!src_argb) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
+  }
+  memset(src_argb, 1, src_argb_plane_size);
+
+  int64 dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4LL;
+  int dst_stride_argb = (b * 2 + dst_width) * 4;
+  // Randomize the image area only; j is a byte offset, so it starts at b * 4.
+  for (int i = b; i < (Abs(src_height) + b); ++i) {
+    for (int j = b * 4; j < (Abs(src_width) + b) * 4; ++j) {
+      src_argb[(i * src_stride_argb) + j] = (fastrand() & 0xff);
+    }
+  }
+
+  align_buffer_64(dst_argb_c, dst_argb_plane_size);
+  align_buffer_64(dst_argb_opt, dst_argb_plane_size);
+  if (!dst_argb_c || !dst_argb_opt) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
+  }
+  memset(dst_argb_c, 2, dst_argb_plane_size);
+  memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+  // Do full image, no clipping.
+  double c_time = get_time();
+  ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+            src_width, src_height,
+            dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+            dst_width, dst_height, f);
+  c_time = (get_time() - c_time);
+
+  // Do tiled image, clipping scale to a tile at a time.
+  double opt_time = get_time();
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    TileARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+                  src_width, src_height,
+                  dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+                  dst_width, dst_height, f);
+  }
+  opt_time = (get_time() - opt_time) / benchmark_iterations;
+  // Report performance of Full vs Tiled.
+  printf("filter %d - %8d us Full - %8d us Tiled\n",
+         f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+  // Compare full scaled image vs tiled image.
+  int max_diff = 0;
+  for (int i = b; i < (dst_height + b); ++i) {
+    for (int j = b * 4; j < (dst_width + b) * 4; ++j) {
+      int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
+                         dst_argb_opt[(i * dst_stride_argb) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+    }
+  }
+
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+  free_aligned_buffer_64(src_argb);
+  return max_diff;
+}
+
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
+#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
+
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff)                       \
+    TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) {                \
+      int diff = ARGBTestFilter(SX(benchmark_width_, nom, denom),              \
+                                SX(benchmark_height_, nom, denom),             \
+                                DX(benchmark_width_, nom, denom),              \
+                                DX(benchmark_height_, nom, denom),             \
+                                kFilter##filter, benchmark_iterations_,        \
+                                disable_cpu_flags_, benchmark_cpu_info_);      \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) {            \
+      int diff = ARGBClipTestFilter(SX(benchmark_width_, nom, denom),          \
+                                    SX(benchmark_height_, nom, denom),         \
+                                    DX(benchmark_width_, nom, denom),          \
+                                    DX(benchmark_height_, nom, denom),         \
+                                    kFilter##filter, benchmark_iterations_);   \
+      EXPECT_LE(diff, max_diff);                                               \
+    }
+
+// Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
+// allow 3 when filtering: SSSE3, Neon and C use different fixed point math.
+#define TEST_FACTOR(name, nom, denom)                                          \
+    TEST_FACTOR1(name, None, nom, denom, 0)                                    \
+    TEST_FACTOR1(name, Linear, nom, denom, 3)                                  \
+    TEST_FACTOR1(name, Bilinear, nom, denom, 3)                                \
+    TEST_FACTOR1(name, Box, nom, denom, 3)
+
+TEST_FACTOR(2, 1, 2)
+TEST_FACTOR(4, 1, 4)
+TEST_FACTOR(8, 1, 8)
+TEST_FACTOR(3by4, 3, 4)
+TEST_FACTOR(3by8, 3, 8)
+TEST_FACTOR(3, 1, 3)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
+
+#define TEST_SCALETO1(name, width, height, filter, max_diff)                   \
+    TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) {           \
+      int diff = ARGBTestFilter(benchmark_width_, benchmark_height_,           \
+                                width, height,                                 \
+                                kFilter##filter, benchmark_iterations_,        \
+                                disable_cpu_flags_, benchmark_cpu_info_);      \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) {         \
+      int diff = ARGBTestFilter(width, height,                                 \
+                                Abs(benchmark_width_), Abs(benchmark_height_), \
+                                kFilter##filter, benchmark_iterations_,        \
+                                disable_cpu_flags_, benchmark_cpu_info_);      \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest, name##ClipTo##width##x##height##_##filter) {       \
+      int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_,       \
+                                    width, height,                             \
+                                    kFilter##filter, benchmark_iterations_);   \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) {     \
+      int diff = ARGBClipTestFilter(width, height,                             \
+                                    Abs(benchmark_width_),                     \
+                                    Abs(benchmark_height_),                    \
+                                    kFilter##filter, benchmark_iterations_);   \
+      EXPECT_LE(diff, max_diff);                                               \
+    }
+
+// Test scale to a specified size with 3 filters (None, Linear, Bilinear).
+#define TEST_SCALETO(name, width, height)                                      \
+    TEST_SCALETO1(name, width, height, None, 0)                                \
+    TEST_SCALETO1(name, width, height, Linear, 3)                              \
+    TEST_SCALETO1(name, width, height, Bilinear, 3)
+
+TEST_SCALETO(ARGBScale, 1, 1)
+TEST_SCALETO(ARGBScale, 320, 240)
+TEST_SCALETO(ARGBScale, 352, 288)
+TEST_SCALETO(ARGBScale, 569, 480)
+TEST_SCALETO(ARGBScale, 640, 360)
+TEST_SCALETO(ARGBScale, 1280, 720)
+#undef TEST_SCALETO1
+#undef TEST_SCALETO
+
+// Reference for YUVToARGBScaleClip: convert I420 to a temporary ARGB image,
+// then scale that with clipping. src_fourcc and dst_fourcc are not used.
+// Returns the ARGBScaleClip result, or 1 if the temporary allocation fails.
+LIBYUV_API
+int YUVToARGBScaleReference2(const uint8* src_y, int src_stride_y,
+                             const uint8* src_u, int src_stride_u,
+                             const uint8* src_v, int src_stride_v,
+                             uint32 src_fourcc,
+                             int src_width, int src_height,
+                             uint8* dst_argb, int dst_stride_argb,
+                             uint32 dst_fourcc,
+                             int dst_width, int dst_height,
+                             int clip_x, int clip_y,
+                             int clip_width, int clip_height,
+                             enum FilterMode filtering) {
+  uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
+  if (!argb_buffer) {
+    return 1;  // Out of memory; do not convert into a null buffer.
+  }
+  I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+             argb_buffer, src_width * 4, src_width, src_height);
+
+  int r = ARGBScaleClip(argb_buffer, src_width * 4,
+                        src_width, src_height,
+                        dst_argb, dst_stride_argb,
+                        dst_width, dst_height,
+                        clip_x, clip_y, clip_width, clip_height,
+                        filtering);
+  free(argb_buffer);
+  return r;
+}
+
+// Fill a width x height plane with a ramp from v: +dx per pixel, +dy per row.
+static void FillRamp(uint8* buf, int width, int height, int v, int dx, int dy) {
+  for (int row = 0; row < height; ++row) {
+    int value = v;  // v always holds the first value of the current row.
+    for (int col = 0; col < width; ++col) {
+      *buf++ = value;
+      value += dx;
+      if (value < 0 || value > 255) {
+        dx = -dx;  // Left 0..255: reverse direction and undo the step.
+        value += dx;
+      }
+    }
+    v += dy;
+    if (v < 0 || v > 255) {
+      dy = -dy;  // Same bounce for the per-row start value.
+      v += dy;
+    }
+  }
+}
+
+// Compare YUVToARGBScaleClip with YUVToARGBScaleReference2 and return the
+// maximum byte difference. 0 = exact. The two CPU flag arguments are unused.
+static int YUVToARGBTestFilter(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               FilterMode f, int benchmark_iterations,
+                               int disable_cpu_flags, int benchmark_cpu_info) {
+  int64 src_y_plane_size = Abs(src_width) * Abs(src_height);
+  int64 src_uv_plane_size = ((Abs(src_width) + 1) / 2) *
+      ((Abs(src_height) + 1) / 2);
+  int src_stride_y = Abs(src_width);
+  int src_stride_uv = (Abs(src_width) + 1) / 2;
+
+  align_buffer_page_end(src_y, src_y_plane_size);
+  align_buffer_page_end(src_u, src_uv_plane_size);
+  align_buffer_page_end(src_v, src_uv_plane_size);
+  int64 dst_argb_plane_size = (dst_width) * (dst_height) * 4LL;
+  int dst_stride_argb = (dst_width) * 4;
+  align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
+  align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
+  if (!dst_argb_c || !dst_argb_opt || !src_y || !src_u || !src_v) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
+  }
+  // Fill YUV image with continuous ramp, which is less sensitive to
+  // subsampling and filtering differences for test purposes.
+  FillRamp(src_y, Abs(src_width), Abs(src_height), 128, 1, 1);
+  FillRamp(src_u, (Abs(src_width) + 1) / 2, (Abs(src_height) + 1) / 2, 3, 1, 1);
+  FillRamp(src_v, (Abs(src_width) + 1) / 2, (Abs(src_height) + 1) / 2, 4, 1, 1);
+  memset(dst_argb_c, 2, dst_argb_plane_size);
+  memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+  YUVToARGBScaleReference2(src_y, src_stride_y,
+                           src_u, src_stride_uv,
+                           src_v, src_stride_uv,
+                           libyuv::FOURCC_I420,
+                           src_width, src_height,
+                           dst_argb_c, dst_stride_argb,
+                           libyuv::FOURCC_I420,
+                           dst_width, dst_height,
+                           0, 0, dst_width, dst_height,
+                           f);
+
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    YUVToARGBScaleClip(src_y, src_stride_y,
+                       src_u, src_stride_uv,
+                       src_v, src_stride_uv,
+                       libyuv::FOURCC_I420,
+                       src_width, src_height,
+                       dst_argb_opt, dst_stride_argb,
+                       libyuv::FOURCC_I420,
+                       dst_width, dst_height,
+                       0, 0, dst_width, dst_height,
+                       f);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < dst_height; ++i) {
+    for (int j = 0; j < dst_width * 4; ++j) {
+      int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
+                         dst_argb_opt[(i * dst_stride_argb) + j]);
+      if (abs_diff > max_diff) {
+        printf("error %d at %d,%d c %d opt %d\n",
+               abs_diff,
+               j, i,
+               dst_argb_c[(i * dst_stride_argb) + j],
+               dst_argb_opt[(i * dst_stride_argb) + j]);
+        EXPECT_LE(abs_diff, 40);
+        max_diff = abs_diff;
+      }
+    }
+  }
+
+  free_aligned_buffer_page_end(dst_argb_c);
+  free_aligned_buffer_page_end(dst_argb_opt);
+  free_aligned_buffer_page_end(src_y);
+  free_aligned_buffer_page_end(src_u);
+  free_aligned_buffer_page_end(src_v);
+  return max_diff;
+}
+
+TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) {
+  int diff = YUVToARGBTestFilter(benchmark_width_, benchmark_height_,
+                                 benchmark_width_ * 3 / 2,
+                                 benchmark_height_ * 3 / 2,
+                                 libyuv::kFilterBilinear,
+                                 benchmark_iterations_,
+                                 disable_cpu_flags_, benchmark_cpu_info_);
+  EXPECT_LE(diff, 10);
+}
+// Scale down: the source is 3/2 of the benchmark size in each dimension.
+TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
+  int diff = YUVToARGBTestFilter(benchmark_width_ * 3 / 2,
+                                 benchmark_height_ * 3 / 2,
+                                 benchmark_width_, benchmark_height_,
+                                 libyuv::kFilterBilinear,
+                                 benchmark_iterations_,
+                                 disable_cpu_flags_, benchmark_cpu_info_);
+  EXPECT_LE(diff, 10);
+}
+
+
+}  // namespace libyuv
diff --git a/libs/libyuv/unit_test/scale_test.cc b/libs/libyuv/unit_test/scale_test.cc
new file mode 100644
index 0000000000..f31af80b31
--- /dev/null
+++ b/libs/libyuv/unit_test/scale_test.cc
@@ -0,0 +1,364 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale.h"
+#include "../unit_test/unit_test.h"
+
+#define STRINGIZE(line) #line
+#define FILELINESTR(file, line) file ":" STRINGIZE(line)
+
+namespace libyuv {
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+// Also returns 0, after printing a message, if a buffer allocation fails.
+static int TestFilter(int src_width, int src_height,
+                      int dst_width, int dst_height,
+                      FilterMode f, int benchmark_iterations,
+                      int disable_cpu_flags, int benchmark_cpu_info) {
+  int i, j;
+  const int b = 0;  // Border in pixels; 128 to test for padding/stride.
+  int src_width_uv = (Abs(src_width) + 1) >> 1;
+  int src_height_uv = (Abs(src_height) + 1) >> 1;
+  int64 src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2);
+  int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
+
+  int src_stride_y = b * 2 + Abs(src_width);
+  int src_stride_uv = b * 2 + src_width_uv;
+
+  align_buffer_page_end(src_y, src_y_plane_size)
+  align_buffer_page_end(src_u, src_uv_plane_size)
+  align_buffer_page_end(src_v, src_uv_plane_size)
+  if (!src_y || !src_u || !src_v) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
+  }
+  MemRandomize(src_y, src_y_plane_size);
+  MemRandomize(src_u, src_uv_plane_size);
+  MemRandomize(src_v, src_uv_plane_size);
+
+  int dst_width_uv = (dst_width + 1) >> 1;
+  int dst_height_uv = (dst_height + 1) >> 1;
+
+  int64 dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
+  int64 dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
+
+  int dst_stride_y = b * 2 + dst_width;
+  int dst_stride_uv = b * 2 + dst_width_uv;
+
+  align_buffer_page_end(dst_y_c, dst_y_plane_size)
+  align_buffer_page_end(dst_u_c, dst_uv_plane_size)
+  align_buffer_page_end(dst_v_c, dst_uv_plane_size)
+  align_buffer_page_end(dst_y_opt, dst_y_plane_size)
+  align_buffer_page_end(dst_u_opt, dst_uv_plane_size)
+  align_buffer_page_end(dst_v_opt, dst_uv_plane_size)
+  if (!dst_y_c || !dst_u_c || !dst_v_c ||
+      !dst_y_opt|| !dst_u_opt|| !dst_v_opt) {
+    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+    return 0;
+  }
+
+  MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
+  double c_time = get_time();
+  I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+            src_u + (src_stride_uv * b) + b, src_stride_uv,
+            src_v + (src_stride_uv * b) + b, src_stride_uv,
+            src_width, src_height,
+            dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
+            dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_width, dst_height, f);
+  c_time = (get_time() - c_time);
+
+  MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
+  double opt_time = get_time();
+  for (i = 0; i < benchmark_iterations; ++i) {
+    I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+              src_u + (src_stride_uv * b) + b, src_stride_uv,
+              src_v + (src_stride_uv * b) + b, src_stride_uv,
+              src_width, src_height,
+              dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
+              dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+              dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+              dst_width, dst_height, f);
+  }
+  opt_time = (get_time() - opt_time) / benchmark_iterations;
+  // Report performance of C vs OPT
+  printf("filter %d - %8d us C - %8d us OPT\n",
+         f,
+         static_cast<int>(c_time * 1e6),
+         static_cast<int>(opt_time * 1e6));
+
+  // C version may be a little off from the optimized. Order of
+  //  operations may introduce rounding somewhere. So measure the largest
+  //  absolute difference between the two outputs over all three planes;
+  //  the caller decides how much difference is acceptable.
+  int max_diff = 0;
+  for (i = b; i < (dst_height + b); ++i) {
+    for (j = b; j < (dst_width + b); ++j) {
+      int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] -
+                         dst_y_opt[(i * dst_stride_y) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+    }
+  }
+
+  for (i = b; i < (dst_height_uv + b); ++i) {
+    for (j = b; j < (dst_width_uv + b); ++j) {
+      int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] -
+                         dst_u_opt[(i * dst_stride_uv) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+      abs_diff = Abs(dst_v_c[(i * dst_stride_uv) + j] -
+                     dst_v_opt[(i * dst_stride_uv) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+    }
+  }
+
+  free_aligned_buffer_page_end(dst_y_c)
+  free_aligned_buffer_page_end(dst_u_c)
+  free_aligned_buffer_page_end(dst_v_c)
+  free_aligned_buffer_page_end(dst_y_opt)
+  free_aligned_buffer_page_end(dst_u_opt)
+  free_aligned_buffer_page_end(dst_v_opt)
+
+  free_aligned_buffer_page_end(src_y)
+  free_aligned_buffer_page_end(src_u)
+  free_aligned_buffer_page_end(src_v)
+
+  return max_diff;
+}
+
+// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
+// 0 = exact.
+static int TestFilter_16(int src_width, int src_height,
+                         int dst_width, int dst_height,
+                         FilterMode f, int benchmark_iterations) {
+  int i, j;
+  const int b = 0;  // 128 to test for padding/stride.
+  int src_width_uv = (Abs(src_width) + 1) >> 1;
+  int src_height_uv = (Abs(src_height) + 1) >> 1;
+
+  int64 src_y_plane_size = (Abs(src_width) + b * 2) *
+      (Abs(src_height) + b * 2);
+  int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
+
+  int src_stride_y = b * 2 + Abs(src_width);
+  int src_stride_uv = b * 2 + src_width_uv;
+
+  align_buffer_page_end(src_y, src_y_plane_size)
+  align_buffer_page_end(src_u, src_uv_plane_size)
+  align_buffer_page_end(src_v, src_uv_plane_size)
+  align_buffer_page_end(src_y_16, src_y_plane_size * 2)
+  align_buffer_page_end(src_u_16, src_uv_plane_size * 2)
+  align_buffer_page_end(src_v_16, src_uv_plane_size * 2)
+  uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16);
+  uint16* p_src_u_16 = reinterpret_cast<uint16*>(src_u_16);
+  uint16* p_src_v_16 = reinterpret_cast<uint16*>(src_v_16);
+
+  MemRandomize(src_y, src_y_plane_size);
+  MemRandomize(src_u, src_uv_plane_size);
+  MemRandomize(src_v, src_uv_plane_size);
+  // Widen the source planes to 16 bit; Abs() matches the plane sizes above.
+  for (i = b; i < Abs(src_height) + b; ++i) {
+    for (j = b; j < Abs(src_width) + b; ++j) {
+      p_src_y_16[(i * src_stride_y) + j] = src_y[(i * src_stride_y) + j];
+    }
+  }
+
+  for (i = b; i < (src_height_uv + b); ++i) {
+    for (j = b; j < (src_width_uv + b); ++j) {
+      p_src_u_16[(i * src_stride_uv) + j] = src_u[(i * src_stride_uv) + j];
+      p_src_v_16[(i * src_stride_uv) + j] = src_v[(i * src_stride_uv) + j];
+    }
+  }
+
+  int dst_width_uv = (dst_width + 1) >> 1;
+  int dst_height_uv = (dst_height + 1) >> 1;
+
+  int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
+  int dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
+
+  int dst_stride_y = b * 2 + dst_width;
+  int dst_stride_uv = b * 2 + dst_width_uv;
+
+  align_buffer_page_end(dst_y_8, dst_y_plane_size)
+  align_buffer_page_end(dst_u_8, dst_uv_plane_size)
+  align_buffer_page_end(dst_v_8, dst_uv_plane_size)
+  align_buffer_page_end(dst_y_16, dst_y_plane_size * 2)
+  align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2)
+  align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2)
+
+  uint16* p_dst_y_16 = reinterpret_cast<uint16*>(dst_y_16);
+  uint16* p_dst_u_16 = reinterpret_cast<uint16*>(dst_u_16);
+  uint16* p_dst_v_16 = reinterpret_cast<uint16*>(dst_v_16);
+  // 8 bit result, scaled once.
+  I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+            src_u + (src_stride_uv * b) + b, src_stride_uv,
+            src_v + (src_stride_uv * b) + b, src_stride_uv,
+            src_width, src_height,
+            dst_y_8 + (dst_stride_y * b) + b, dst_stride_y,
+            dst_u_8 + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_v_8 + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_width, dst_height, f);
+  // 16 bit result from the same source, scaled benchmark_iterations times.
+  for (i = 0; i < benchmark_iterations; ++i) {
+    I420Scale_16(p_src_y_16 + (src_stride_y * b) + b, src_stride_y,
+                 p_src_u_16 + (src_stride_uv * b) + b, src_stride_uv,
+                 p_src_v_16 + (src_stride_uv * b) + b, src_stride_uv,
+                 src_width, src_height,
+                 p_dst_y_16 + (dst_stride_y * b) + b, dst_stride_y,
+                 p_dst_u_16 + (dst_stride_uv * b) + b, dst_stride_uv,
+                 p_dst_v_16 + (dst_stride_uv * b) + b, dst_stride_uv,
+                 dst_width, dst_height, f);
+  }
+
+  // Expect an exact match
+  int max_diff = 0;
+  for (i = b; i < (dst_height + b); ++i) {
+    for (j = b; j < (dst_width + b); ++j) {
+      int abs_diff = Abs(dst_y_8[(i * dst_stride_y) + j] -
+                         p_dst_y_16[(i * dst_stride_y) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+    }
+  }
+
+  for (i = b; i < (dst_height_uv + b); ++i) {
+    for (j = b; j < (dst_width_uv + b); ++j) {
+      int abs_diff = Abs(dst_u_8[(i * dst_stride_uv) + j] -
+                         p_dst_u_16[(i * dst_stride_uv) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+      abs_diff = Abs(dst_v_8[(i * dst_stride_uv) + j] -
+                     p_dst_v_16[(i * dst_stride_uv) + j]);
+      if (abs_diff > max_diff) {
+        max_diff = abs_diff;
+      }
+    }
+  }
+
+  free_aligned_buffer_page_end(dst_y_8)
+  free_aligned_buffer_page_end(dst_u_8)
+  free_aligned_buffer_page_end(dst_v_8)
+  free_aligned_buffer_page_end(dst_y_16)
+  free_aligned_buffer_page_end(dst_u_16)
+  free_aligned_buffer_page_end(dst_v_16)
+
+  free_aligned_buffer_page_end(src_y)
+  free_aligned_buffer_page_end(src_u)
+  free_aligned_buffer_page_end(src_v)
+  free_aligned_buffer_page_end(src_y_16)
+  free_aligned_buffer_page_end(src_u_16)
+  free_aligned_buffer_page_end(src_v_16)
+
+  return max_diff;
+}
+
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+// 2 is chroma subsample
+#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom / 2) * nom * 2)
+#define SX(x, nom, denom) static_cast<int>((x / nom / 2) * denom * 2)
+
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff)                       \
+    TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) {                    \
+      int diff = TestFilter(SX(benchmark_width_, nom, denom),                  \
+                            SX(benchmark_height_, nom, denom),                 \
+                            DX(benchmark_width_, nom, denom),                  \
+                            DX(benchmark_height_, nom, denom),                 \
+                            kFilter##filter, benchmark_iterations_,            \
+                            disable_cpu_flags_, benchmark_cpu_info_);          \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest, DISABLED_ScaleDownBy##name##_##filter##_16) {      \
+      int diff = TestFilter_16(SX(benchmark_width_, nom, denom),               \
+                               SX(benchmark_height_, nom, denom),              \
+                               DX(benchmark_width_, nom, denom),               \
+                               DX(benchmark_height_, nom, denom),              \
+                               kFilter##filter, benchmark_iterations_);        \
+      EXPECT_LE(diff, max_diff);                                               \
+    }
+
+// Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
+// filtered output may differ: SSSE3, Neon and C use different fixed point.
+#define TEST_FACTOR(name, nom, denom, boxdiff)                                 \
+    TEST_FACTOR1(name, None, nom, denom, 0)                                    \
+    TEST_FACTOR1(name, Linear, nom, denom, 3)                                  \
+    TEST_FACTOR1(name, Bilinear, nom, denom, 3)                                \
+    TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+
+TEST_FACTOR(2, 1, 2, 0)
+TEST_FACTOR(4, 1, 4, 0)
+TEST_FACTOR(8, 1, 8, 3)
+TEST_FACTOR(3by4, 3, 4, 1)
+TEST_FACTOR(3by8, 3, 8, 1)
+TEST_FACTOR(3, 1, 3, 3)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
+
+#define TEST_SCALETO1(name, width, height, filter, max_diff)                   \
+    TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) {           \
+      int diff = TestFilter(benchmark_width_, benchmark_height_,               \
+                            width, height,                                     \
+                            kFilter##filter, benchmark_iterations_,            \
+                            disable_cpu_flags_, benchmark_cpu_info_);          \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) {         \
+      int diff = TestFilter(width, height,                                     \
+                            Abs(benchmark_width_), Abs(benchmark_height_),     \
+                            kFilter##filter, benchmark_iterations_,            \
+                            disable_cpu_flags_, benchmark_cpu_info_);          \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest,                                                    \
+        DISABLED_##name##To##width##x##height##_##filter##_16) {               \
+      int diff = TestFilter_16(benchmark_width_, benchmark_height_,            \
+                               width, height,                                  \
+                               kFilter##filter, benchmark_iterations_);        \
+      EXPECT_LE(diff, max_diff);                                               \
+    }                                                                          \
+    TEST_F(LibYUVScaleTest,                                                    \
+        DISABLED_##name##From##width##x##height##_##filter##_16) {             \
+      int diff = TestFilter_16(width, height,                                  \
+                               Abs(benchmark_width_), Abs(benchmark_height_),  \
+                               kFilter##filter, benchmark_iterations_);        \
+      EXPECT_LE(diff, max_diff);                                               \
+    }
+
+// Test scale to a specified size with all 4 filters.
+#define TEST_SCALETO(name, width, height)                                      \
+    TEST_SCALETO1(name, width, height, None, 0)                                \
+    TEST_SCALETO1(name, width, height, Linear, 3)                              \
+    TEST_SCALETO1(name, width, height, Bilinear, 3)                            \
+    TEST_SCALETO1(name, width, height, Box, 3)
+
+TEST_SCALETO(Scale, 1, 1)
+TEST_SCALETO(Scale, 320, 240)
+TEST_SCALETO(Scale, 352, 288)
+TEST_SCALETO(Scale, 569, 480)
+TEST_SCALETO(Scale, 640, 360)
+TEST_SCALETO(Scale, 1280, 720)
+#undef TEST_SCALETO1
+#undef TEST_SCALETO
+
+}  // namespace libyuv
diff --git a/libs/libyuv/unit_test/testdata/arm_v7.txt b/libs/libyuv/unit_test/testdata/arm_v7.txt
new file mode 100644
index 0000000000..5d7dbd0480
--- /dev/null
+++ b/libs/libyuv/unit_test/testdata/arm_v7.txt
@@ -0,0 +1,12 @@
+Processor	: ARMv7 Processor rev 5 (v7l)
+BogoMIPS	: 795.44
+Features	: swp half thumb fastmult vfp edsp iwmmxt thumbee vfpv3 vfpv3d16 
+CPU implementer	: 0x56
+CPU architecture: 7
+CPU variant	: 0x0
+CPU part	: 0x581
+CPU revision	: 5
+
+Hardware	: OLPC XO-1.75
+Revision	: 0000
+Serial		: 0000000000000000
diff --git a/libs/libyuv/unit_test/testdata/juno.txt b/libs/libyuv/unit_test/testdata/juno.txt
new file mode 100644
index 0000000000..dd465272b8
--- /dev/null
+++ b/libs/libyuv/unit_test/testdata/juno.txt
@@ -0,0 +1,15 @@
+Processor       : AArch64 Processor rev 0 (aarch64)
+processor       : 0
+processor       : 1
+processor       : 2
+processor       : 3
+processor       : 4
+processor       : 5
+Features        : fp asimd evtstrm aes pmull sha1 sha2 crc32
+CPU implementer : 0x41
+CPU architecture: AArch64
+CPU variant     : 0x0
+CPU part        : 0xd07
+CPU revision    : 0
+
+Hardware        : Juno
diff --git a/libs/libyuv/unit_test/testdata/tegra3.txt b/libs/libyuv/unit_test/testdata/tegra3.txt
new file mode 100644
index 0000000000..d1b09f6b77
--- /dev/null
+++ b/libs/libyuv/unit_test/testdata/tegra3.txt
@@ -0,0 +1,23 @@
+Processor       : ARMv7 Processor rev 9 (v7l)
+processor       : 0
+BogoMIPS        : 1992.29
+
+processor       : 1
+BogoMIPS        : 1992.29
+
+processor       : 2
+BogoMIPS        : 1992.29
+
+processor       : 3
+BogoMIPS        : 1992.29
+
+Features        : swp half thumb fastmult vfp edsp neon vfpv3
+CPU implementer : 0x41
+CPU architecture: 7
+CPU variant     : 0x2
+CPU part        : 0xc09
+CPU revision    : 9
+
+Hardware        : cardhu
+Revision        : 0000
+
diff --git a/libs/libyuv/unit_test/unit_test.cc b/libs/libyuv/unit_test/unit_test.cc
new file mode 100644
index 0000000000..c98c285cbf
--- /dev/null
+++ b/libs/libyuv/unit_test/unit_test.cc
@@ -0,0 +1,355 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "../unit_test/unit_test.h"
+
+#include <stdlib.h>  // For getenv()
+
+#include <cstring>
+
+#include "gflags/gflags.h"
+
+// Default iteration count; LIBYUV_REPEAT or --libyuv_repeat overrides it.
+#define BENCHMARK_ITERATIONS 1
+
+unsigned int fastrand_seed = 0xfb;
+
+// A flag left at its default of 0 is ignored, so the LIBYUV_* env vars apply.
+DEFINE_int32(libyuv_width, 0, "width of test image.");
+DEFINE_int32(libyuv_height, 0, "height of test image.");
+DEFINE_int32(libyuv_repeat, 0, "number of times to repeat test.");
+DEFINE_int32(libyuv_flags, 0, "cpu flags for reference code. 0 = C -1 = asm");
+DEFINE_int32(libyuv_cpu_info, 0,
+             "cpu flags for benchmark code. -1 = SIMD, 1 = C");
+
+// For quicker unittests, default is 128 x 72.  But when benchmarking,
+// default to 720p.  Allow size to specify.
+// Set flags to -1 for benchmarking to avoid slower C code.
+
+// Starts from built-in defaults, then applies LIBYUV_* env vars, then flags.
+LibYUVConvertTest::LibYUVConvertTest() :
+    benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(130),
+    benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+  const char* repeat = getenv("LIBYUV_REPEAT");
+  if (repeat) {
+    benchmark_iterations_ = atoi(repeat);  // NOLINT
+  }
+  if (FLAGS_libyuv_repeat) {
+    benchmark_iterations_ = FLAGS_libyuv_repeat;
+  }
+  // More than one iteration means benchmarking: default to 1280x720.
+  if (benchmark_iterations_ > 1) {
+    benchmark_width_ = 1280;
+    benchmark_height_ = 720;
+  }
+  const char* width = getenv("LIBYUV_WIDTH");
+  if (width) {
+    benchmark_width_ = atoi(width);  // NOLINT
+  }
+  if (FLAGS_libyuv_width) {
+    benchmark_width_ = FLAGS_libyuv_width;
+  }
+  const char* height = getenv("LIBYUV_HEIGHT");
+  if (height) {
+    benchmark_height_ = atoi(height);  // NOLINT
+  }
+  if (FLAGS_libyuv_height) {
+    benchmark_height_ = FLAGS_libyuv_height;
+  }
+  const char* cpu_flags = getenv("LIBYUV_FLAGS");
+  if (cpu_flags) {
+    disable_cpu_flags_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_flags) {
+    disable_cpu_flags_ = FLAGS_libyuv_flags;
+  }
+  const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+  if (cpu_info) {
+    benchmark_cpu_info_ = atoi(cpu_info);  // NOLINT
+  }
+  if (FLAGS_libyuv_cpu_info) {
+    benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+  }
+  // Total pixels over all iterations, in double; the quotients round up.
+  const double pixels = static_cast<double>(Abs(benchmark_width_)) *
+                        static_cast<double>(Abs(benchmark_height_)) *
+                        static_cast<double>(benchmark_iterations_);
+  benchmark_pixels_div256_ = static_cast<int>((pixels + 255.0) / 256.0);
+  benchmark_pixels_div1280_ = static_cast<int>((pixels + 1279.0) / 1280.0);
+}
+
+// Starts from built-in defaults, then applies LIBYUV_* env vars, then flags.
+LibYUVColorTest::LibYUVColorTest() :
+    benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
+    benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+  const char* repeat = getenv("LIBYUV_REPEAT");
+  if (repeat) {
+    benchmark_iterations_ = atoi(repeat);  // NOLINT
+  }
+  if (FLAGS_libyuv_repeat) {
+    benchmark_iterations_ = FLAGS_libyuv_repeat;
+  }
+  // More than one iteration means benchmarking: default to 1280x720.
+  if (benchmark_iterations_ > 1) {
+    benchmark_width_ = 1280;
+    benchmark_height_ = 720;
+  }
+  const char* width = getenv("LIBYUV_WIDTH");
+  if (width) {
+    benchmark_width_ = atoi(width);  // NOLINT
+  }
+  if (FLAGS_libyuv_width) {
+    benchmark_width_ = FLAGS_libyuv_width;
+  }
+  const char* height = getenv("LIBYUV_HEIGHT");
+  if (height) {
+    benchmark_height_ = atoi(height);  // NOLINT
+  }
+  if (FLAGS_libyuv_height) {
+    benchmark_height_ = FLAGS_libyuv_height;
+  }
+  const char* cpu_flags = getenv("LIBYUV_FLAGS");
+  if (cpu_flags) {
+    disable_cpu_flags_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_flags) {
+    disable_cpu_flags_ = FLAGS_libyuv_flags;
+  }
+  const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+  if (cpu_info) {
+    benchmark_cpu_info_ = atoi(cpu_info);  // NOLINT
+  }
+  if (FLAGS_libyuv_cpu_info) {
+    benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+  }
+  // Total pixels over all iterations, in double; the quotients round up.
+  const double pixels = static_cast<double>(Abs(benchmark_width_)) *
+                        static_cast<double>(Abs(benchmark_height_)) *
+                        static_cast<double>(benchmark_iterations_);
+  benchmark_pixels_div256_ = static_cast<int>((pixels + 255.0) / 256.0);
+  benchmark_pixels_div1280_ = static_cast<int>((pixels + 1279.0) / 1280.0);
+}
+
+// Starts from built-in defaults, then applies LIBYUV_* env vars, then flags.
+LibYUVScaleTest::LibYUVScaleTest() :
+    benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
+    benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+  const char* repeat = getenv("LIBYUV_REPEAT");
+  if (repeat) {
+    benchmark_iterations_ = atoi(repeat);  // NOLINT
+  }
+  if (FLAGS_libyuv_repeat) {
+    benchmark_iterations_ = FLAGS_libyuv_repeat;
+  }
+  // More than one iteration means benchmarking: default to 1280x720.
+  if (benchmark_iterations_ > 1) {
+    benchmark_width_ = 1280;
+    benchmark_height_ = 720;
+  }
+  const char* width = getenv("LIBYUV_WIDTH");
+  if (width) {
+    benchmark_width_ = atoi(width);  // NOLINT
+  }
+  if (FLAGS_libyuv_width) {
+    benchmark_width_ = FLAGS_libyuv_width;
+  }
+  const char* height = getenv("LIBYUV_HEIGHT");
+  if (height) {
+    benchmark_height_ = atoi(height);  // NOLINT
+  }
+  if (FLAGS_libyuv_height) {
+    benchmark_height_ = FLAGS_libyuv_height;
+  }
+  const char* cpu_flags = getenv("LIBYUV_FLAGS");
+  if (cpu_flags) {
+    disable_cpu_flags_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_flags) {
+    disable_cpu_flags_ = FLAGS_libyuv_flags;
+  }
+  const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+  if (cpu_info) {
+    benchmark_cpu_info_ = atoi(cpu_info);  // NOLINT
+  }
+  if (FLAGS_libyuv_cpu_info) {
+    benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+  }
+  // Total pixels over all iterations, in double; the quotients round up.
+  const double pixels = static_cast<double>(Abs(benchmark_width_)) *
+                        static_cast<double>(Abs(benchmark_height_)) *
+                        static_cast<double>(benchmark_iterations_);
+  benchmark_pixels_div256_ = static_cast<int>((pixels + 255.0) / 256.0);
+  benchmark_pixels_div1280_ = static_cast<int>((pixels + 1279.0) / 1280.0);
+}
+
+// Starts from built-in defaults, then applies LIBYUV_* env vars, then flags.
+LibYUVRotateTest::LibYUVRotateTest() :
+    benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
+    benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+  const char* repeat = getenv("LIBYUV_REPEAT");
+  if (repeat) {
+    benchmark_iterations_ = atoi(repeat);  // NOLINT
+  }
+  if (FLAGS_libyuv_repeat) {
+    benchmark_iterations_ = FLAGS_libyuv_repeat;
+  }
+  // More than one iteration means benchmarking: default to 1280x720.
+  if (benchmark_iterations_ > 1) {
+    benchmark_width_ = 1280;
+    benchmark_height_ = 720;
+  }
+  const char* width = getenv("LIBYUV_WIDTH");
+  if (width) {
+    benchmark_width_ = atoi(width);  // NOLINT
+  }
+  if (FLAGS_libyuv_width) {
+    benchmark_width_ = FLAGS_libyuv_width;
+  }
+  const char* height = getenv("LIBYUV_HEIGHT");
+  if (height) {
+    benchmark_height_ = atoi(height);  // NOLINT
+  }
+  if (FLAGS_libyuv_height) {
+    benchmark_height_ = FLAGS_libyuv_height;
+  }
+  const char* cpu_flags = getenv("LIBYUV_FLAGS");
+  if (cpu_flags) {
+    disable_cpu_flags_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_flags) {
+    disable_cpu_flags_ = FLAGS_libyuv_flags;
+  }
+  const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+  if (cpu_info) {
+    benchmark_cpu_info_ = atoi(cpu_info);  // NOLINT
+  }
+  if (FLAGS_libyuv_cpu_info) {
+    benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+  }
+  // Total pixels over all iterations, in double; the quotients round up.
+  const double pixels = static_cast<double>(Abs(benchmark_width_)) *
+                        static_cast<double>(Abs(benchmark_height_)) *
+                        static_cast<double>(benchmark_iterations_);
+  benchmark_pixels_div256_ = static_cast<int>((pixels + 255.0) / 256.0);
+  benchmark_pixels_div1280_ = static_cast<int>((pixels + 1279.0) / 1280.0);
+}
+
+// Starts from built-in defaults, then applies LIBYUV_* env vars, then flags.
+LibYUVPlanarTest::LibYUVPlanarTest() :
+    benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
+    benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+  const char* repeat = getenv("LIBYUV_REPEAT");
+  if (repeat) {
+    benchmark_iterations_ = atoi(repeat);  // NOLINT
+  }
+  if (FLAGS_libyuv_repeat) {
+    benchmark_iterations_ = FLAGS_libyuv_repeat;
+  }
+  // More than one iteration means benchmarking: default to 1280x720.
+  if (benchmark_iterations_ > 1) {
+    benchmark_width_ = 1280;
+    benchmark_height_ = 720;
+  }
+  const char* width = getenv("LIBYUV_WIDTH");
+  if (width) {
+    benchmark_width_ = atoi(width);  // NOLINT
+  }
+  if (FLAGS_libyuv_width) {
+    benchmark_width_ = FLAGS_libyuv_width;
+  }
+  const char* height = getenv("LIBYUV_HEIGHT");
+  if (height) {
+    benchmark_height_ = atoi(height);  // NOLINT
+  }
+  if (FLAGS_libyuv_height) {
+    benchmark_height_ = FLAGS_libyuv_height;
+  }
+  const char* cpu_flags = getenv("LIBYUV_FLAGS");
+  if (cpu_flags) {
+    disable_cpu_flags_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_flags) {
+    disable_cpu_flags_ = FLAGS_libyuv_flags;
+  }
+  const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+  if (cpu_info) {
+    benchmark_cpu_info_ = atoi(cpu_info);  // NOLINT
+  }
+  if (FLAGS_libyuv_cpu_info) {
+    benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+  }
+  // Total pixels over all iterations, in double; the quotients round up.
+  const double pixels = static_cast<double>(Abs(benchmark_width_)) *
+                        static_cast<double>(Abs(benchmark_height_)) *
+                        static_cast<double>(benchmark_iterations_);
+  benchmark_pixels_div256_ = static_cast<int>((pixels + 255.0) / 256.0);
+  benchmark_pixels_div1280_ = static_cast<int>((pixels + 1279.0) / 1280.0);
+}
+
+// Starts from built-in defaults, then applies LIBYUV_* env vars, then flags.
+LibYUVBaseTest::LibYUVBaseTest() :
+    benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
+    benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) {
+  const char* repeat = getenv("LIBYUV_REPEAT");
+  if (repeat) {
+    benchmark_iterations_ = atoi(repeat);  // NOLINT
+  }
+  if (FLAGS_libyuv_repeat) {
+    benchmark_iterations_ = FLAGS_libyuv_repeat;
+  }
+  // More than one iteration means benchmarking: default to 1280x720.
+  if (benchmark_iterations_ > 1) {
+    benchmark_width_ = 1280;
+    benchmark_height_ = 720;
+  }
+  const char* width = getenv("LIBYUV_WIDTH");
+  if (width) {
+    benchmark_width_ = atoi(width);  // NOLINT
+  }
+  if (FLAGS_libyuv_width) {
+    benchmark_width_ = FLAGS_libyuv_width;
+  }
+  const char* height = getenv("LIBYUV_HEIGHT");
+  if (height) {
+    benchmark_height_ = atoi(height);  // NOLINT
+  }
+  if (FLAGS_libyuv_height) {
+    benchmark_height_ = FLAGS_libyuv_height;
+  }
+  const char* cpu_flags = getenv("LIBYUV_FLAGS");
+  if (cpu_flags) {
+    disable_cpu_flags_ = atoi(cpu_flags);  // NOLINT
+  }
+  if (FLAGS_libyuv_flags) {
+    disable_cpu_flags_ = FLAGS_libyuv_flags;
+  }
+  const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+  if (cpu_info) {
+    benchmark_cpu_info_ = atoi(cpu_info);  // NOLINT
+  }
+  if (FLAGS_libyuv_cpu_info) {
+    benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+  }
+  // Total pixels over all iterations, in double; the quotients round up.
+  const double pixels = static_cast<double>(Abs(benchmark_width_)) *
+                        static_cast<double>(Abs(benchmark_height_)) *
+                        static_cast<double>(benchmark_iterations_);
+  benchmark_pixels_div256_ = static_cast<int>((pixels + 255.0) / 256.0);
+  benchmark_pixels_div1280_ = static_cast<int>((pixels + 1279.0) / 1280.0);
+}
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  // AllowCommandLineReparsing allows us to ignore flags passed on to us by
+  // Chromium build bots without having to explicitly disable them.
+  google::AllowCommandLineReparsing();
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  return RUN_ALL_TESTS();
+}
diff --git a/libs/libyuv/unit_test/unit_test.h b/libs/libyuv/unit_test/unit_test.h
new file mode 100644
index 0000000000..009ff62abf
--- /dev/null
+++ b/libs/libyuv/unit_test/unit_test.h
@@ -0,0 +1,153 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef UNIT_TEST_UNIT_TEST_H_  // NOLINT
+#define UNIT_TEST_UNIT_TEST_H_
+
+#ifdef WIN32
+#include <windows.h>
+#else
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+#include <gtest/gtest.h>
+
+#include "libyuv/basic_types.h"
+
+static __inline int Abs(int v) {  // Absolute value of v.
+  return v < 0 ? -v : v;
+}
+
+#define OFFBY 0
+// Declares var (and var_mem) so that var + size is the end of the malloc.
+#define align_buffer_page_end(var, size)                                       \
+  uint8* var;                                                                  \
+  uint8* var##_mem;                                                            \
+  var##_mem = reinterpret_cast<uint8*>(malloc((((size) + 4095) & ~4095) +      \
+      OFFBY));                                                                 \
+  var = var##_mem + (-(size) & 4095) + OFFBY;
+// Frees the block allocated by align_buffer_page_end() and zeroes var.
+#define free_aligned_buffer_page_end(var) \
+  free(var##_mem);  \
+  var = 0;
+
+#ifdef WIN32
+static inline double get_time() {  // Performance counter reading in seconds.
+  LARGE_INTEGER now, hz;
+  QueryPerformanceCounter(&now);
+  QueryPerformanceFrequency(&hz);
+  return static_cast<double>(now.QuadPart) / static_cast<double>(hz.QuadPart);
+}
+#else
+static inline double get_time() {  // gettimeofday() reading in seconds.
+  struct timeval now;
+  struct timezone zone;
+  gettimeofday(&now, &zone);
+  return now.tv_usec * 1e-6 + now.tv_sec;
+}
+#endif
+
+extern unsigned int fastrand_seed;
+inline int fastrand() {  // Advances the seed; returns bits 16..31 of it.
+  fastrand_seed = 2531011u + 214013u * fastrand_seed;
+  return static_cast<int>(0xffff & (fastrand_seed >> 16));
+}
+
+static inline void MemRandomize(uint8* dst, int64 len) {
+  int64 remaining = len;  // Fill 16 bits at a time, then any odd last byte.
+  for (; remaining > 1; remaining -= 2) {
+    *reinterpret_cast<uint16*>(dst) = fastrand();
+    dst += 2;
+  }
+  for (; remaining > 0; --remaining) {
+    *dst++ = fastrand();
+  }
+}
+
+class LibYUVColorTest : public ::testing::Test {
+ protected:
+  LibYUVColorTest();  // Reads LIBYUV_* env vars and --libyuv_* flags.
+
+  int benchmark_iterations_;  // Default 1. Use 1000 for benchmarking.
+  int benchmark_width_;  // Default 128; 1280 when benchmark_iterations_ > 1.
+  int benchmark_height_;  // Default 72; 720 when benchmark_iterations_ > 1.
+  int benchmark_pixels_div256_;  // Total pixels to benchmark / 256.
+  int benchmark_pixels_div1280_;  // Total pixels to benchmark / 1280.
+  int disable_cpu_flags_;  // Default 1.  Use -1 for benchmarking.
+  int benchmark_cpu_info_;  // Default -1.  Use 1 to disable SIMD.
+};
+
+class LibYUVConvertTest : public ::testing::Test {
+ protected:
+  LibYUVConvertTest();  // Reads LIBYUV_* env vars and --libyuv_* flags.
+
+  int benchmark_iterations_;  // Default 1. Use 1000 for benchmarking.
+  int benchmark_width_;  // Default 130; 1280 when benchmark_iterations_ > 1.
+  int benchmark_height_;  // Default 72; 720 when benchmark_iterations_ > 1.
+  int benchmark_pixels_div256_;  // Total pixels to benchmark / 256.
+  int benchmark_pixels_div1280_;  // Total pixels to benchmark / 1280.
+  int disable_cpu_flags_;  // Default 1.  Use -1 for benchmarking.
+  int benchmark_cpu_info_;  // Default -1.  Use 1 to disable SIMD.
+};
+
+class LibYUVScaleTest : public ::testing::Test {
+ protected:
+  LibYUVScaleTest();  // Reads LIBYUV_* env vars and --libyuv_* flags.
+
+  int benchmark_iterations_;  // Default 1. Use 1000 for benchmarking.
+  int benchmark_width_;  // Default 128; 1280 when benchmark_iterations_ > 1.
+  int benchmark_height_;  // Default 72; 720 when benchmark_iterations_ > 1.
+  int benchmark_pixels_div256_;  // Total pixels to benchmark / 256.
+  int benchmark_pixels_div1280_;  // Total pixels to benchmark / 1280.
+  int disable_cpu_flags_;  // Default 1.  Use -1 for benchmarking.
+  int benchmark_cpu_info_;  // Default -1.  Use 1 to disable SIMD.
+};
+
+class LibYUVRotateTest : public ::testing::Test {
+ protected:
+  LibYUVRotateTest();  // Reads LIBYUV_* env vars and --libyuv_* flags.
+
+  int benchmark_iterations_;  // Default 1. Use 1000 for benchmarking.
+  int benchmark_width_;  // Default 128; 1280 when benchmark_iterations_ > 1.
+  int benchmark_height_;  // Default 72; 720 when benchmark_iterations_ > 1.
+  int benchmark_pixels_div256_;  // Total pixels to benchmark / 256.
+  int benchmark_pixels_div1280_;  // Total pixels to benchmark / 1280.
+  int disable_cpu_flags_;  // Default 1.  Use -1 for benchmarking.
+  int benchmark_cpu_info_;  // Default -1.  Use 1 to disable SIMD.
+};
+
+class LibYUVPlanarTest : public ::testing::Test {
+ protected:
+  LibYUVPlanarTest();  // Reads LIBYUV_* env vars and --libyuv_* flags.
+
+  int benchmark_iterations_;  // Default 1. Use 1000 for benchmarking.
+  int benchmark_width_;  // Default 128; 1280 when benchmark_iterations_ > 1.
+  int benchmark_height_;  // Default 72; 720 when benchmark_iterations_ > 1.
+  int benchmark_pixels_div256_;  // Total pixels to benchmark / 256.
+  int benchmark_pixels_div1280_;  // Total pixels to benchmark / 1280.
+  int disable_cpu_flags_;  // Default 1.  Use -1 for benchmarking.
+  int benchmark_cpu_info_;  // Default -1.  Use 1 to disable SIMD.
+};
+
+class LibYUVBaseTest : public ::testing::Test {
+ protected:
+  LibYUVBaseTest();  // Reads LIBYUV_* env vars and --libyuv_* flags.
+
+  int benchmark_iterations_;  // Default 1. Use 1000 for benchmarking.
+  int benchmark_width_;  // Default 128; 1280 when benchmark_iterations_ > 1.
+  int benchmark_height_;  // Default 72; 720 when benchmark_iterations_ > 1.
+  int benchmark_pixels_div256_;  // Total pixels to benchmark / 256.
+  int benchmark_pixels_div1280_;  // Total pixels to benchmark / 1280.
+  int disable_cpu_flags_;  // Default 1.  Use -1 for benchmarking.
+  int benchmark_cpu_info_;  // Default -1.  Use 1 to disable SIMD.
+};
+
+#endif  // UNIT_TEST_UNIT_TEST_H_  NOLINT
diff --git a/libs/libyuv/unit_test/video_common_test.cc b/libs/libyuv/unit_test/video_common_test.cc
new file mode 100644
index 0000000000..e3b7fb82ae
--- /dev/null
+++ b/libs/libyuv/unit_test/video_common_test.cc
@@ -0,0 +1,106 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/video_common.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+// Tests FourCC codes in video common, which are used for ConvertToI420().
+
+// Returns true if onecc is a character the FourCC tests accept: an ASCII
+// digit or letter, a space, or 0xff.
+static bool TestValidChar(uint32 onecc) {
+  const bool digit = onecc >= '0' && onecc <= '9';
+  const bool letter = (onecc >= 'A' && onecc <= 'Z') ||
+                      (onecc >= 'a' && onecc <= 'z');
+  // NOTE(review): 0xff presumably covers FOURCC_ANY - confirm.
+  return digit || letter || onecc == ' ' || onecc == 0xff;
+}
+
+// Returns true if every byte of fourcc passes TestValidChar() and bpp is in
+// the range [0, 32].
+static bool TestValidFourCC(uint32 fourcc, int bpp) {
+  // Walk the four bytes from least to most significant.
+  for (int shift = 0; shift < 32; shift += 8) {
+    if (!TestValidChar((fourcc >> shift) & 0xff)) {
+      return false;
+    }
+  }
+  const bool bpp_in_range = bpp >= 0 && bpp <= 32;
+  return bpp_in_range;
+}
+
+TEST_F(LibYUVBaseTest, TestCanonicalFourCC) {  // Aliases -> canonical codes.
+  EXPECT_EQ(FOURCC_I420, CanonicalFourCC(FOURCC_IYUV));
+  EXPECT_EQ(FOURCC_I422, CanonicalFourCC(FOURCC_YU16));
+  EXPECT_EQ(FOURCC_I444, CanonicalFourCC(FOURCC_YU24));
+  EXPECT_EQ(FOURCC_YUY2, CanonicalFourCC(FOURCC_YUYV));
+  EXPECT_EQ(FOURCC_YUY2, CanonicalFourCC(FOURCC_YUVS));
+  EXPECT_EQ(FOURCC_UYVY, CanonicalFourCC(FOURCC_HDYC));
+  EXPECT_EQ(FOURCC_UYVY, CanonicalFourCC(FOURCC_2VUY));
+  EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_JPEG));
+  EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_DMB1));
+  EXPECT_EQ(FOURCC_RAW,  CanonicalFourCC(FOURCC_RGB3));
+  EXPECT_EQ(FOURCC_24BG, CanonicalFourCC(FOURCC_BGR3));
+  EXPECT_EQ(FOURCC_BGRA, CanonicalFourCC(FOURCC_CM32));
+  EXPECT_EQ(FOURCC_RAW,  CanonicalFourCC(FOURCC_CM24));
+  EXPECT_EQ(FOURCC_RGBO, CanonicalFourCC(FOURCC_L555));
+  EXPECT_EQ(FOURCC_RGBP, CanonicalFourCC(FOURCC_L565));
+  EXPECT_EQ(FOURCC_RGBO, CanonicalFourCC(FOURCC_5551));
+}
+
+TEST_F(LibYUVBaseTest, TestFourCC) {
+  // Each FourCC must use only valid characters and have a bpp in [0, 32].
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I422, FOURCC_BPP_I422));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I444, FOURCC_BPP_I444));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I411, FOURCC_BPP_I411));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I400, FOURCC_BPP_I400));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_NV21, FOURCC_BPP_NV21));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420));  // deprecated.
+  EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RAW,  FOURCC_BPP_RAW));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_R444, FOURCC_BPP_R444));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YV24, FOURCC_BPP_YV24));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YU12, FOURCC_BPP_YU12));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_IYUV, FOURCC_BPP_IYUV));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YU16, FOURCC_BPP_YU16));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YU24, FOURCC_BPP_YU24));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YUYV, FOURCC_BPP_YUYV));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YUVS, FOURCC_BPP_YUVS));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_HDYC, FOURCC_BPP_HDYC));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_2VUY, FOURCC_BPP_2VUY));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_JPEG, FOURCC_BPP_JPEG));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_DMB1, FOURCC_BPP_DMB1));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_BA81, FOURCC_BPP_BA81));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGB3, FOURCC_BPP_RGB3));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_BGR3, FOURCC_BPP_BGR3));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_H264, FOURCC_BPP_H264));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_ANY,  FOURCC_BPP_ANY));
+}
+
+}  // namespace libyuv
diff --git a/libs/libyuv/util/Makefile b/libs/libyuv/util/Makefile
new file mode 100644
index 0000000000..6044d2adf6
--- /dev/null
+++ b/libs/libyuv/util/Makefile
@@ -0,0 +1,6 @@
+psnr: psnr.cc ssim.cc psnr_main.cc
+ifeq ($(CXX),icl)
+	$(CXX) /arch:SSE2 /Ox /openmp $^
+else
+	$(CXX) -msse2 -O3 -fopenmp -static -o $@ $^ -Wl,--strip-all
+endif
diff --git a/libs/libyuv/util/android/test_runner.py b/libs/libyuv/util/android/test_runner.py
new file mode 100755
index 0000000000..8b06b7eabe
--- /dev/null
+++ b/libs/libyuv/util/android/test_runner.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+# Copyright 2014 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""
+Runs tests on Android devices.
+
+This script exists to avoid Libyuv being broken by changes in the Chrome Android
+test execution toolchain. It also conveniently sets the CHECKOUT_SOURCE_ROOT
+environment variable.
+"""
+
+import os
+import sys
+
+SCRIPT_DIR = os.path.dirname(__file__)
+ROOT_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir, os.pardir))
+CHROMIUM_BUILD_ANDROID_DIR = os.path.join(ROOT_DIR, 'build', 'android')
+sys.path.insert(0, CHROMIUM_BUILD_ANDROID_DIR)
+
+# Imported only after build/android has been put first on sys.path above.
+import test_runner  # pylint: disable=W0406
+
+def main():
+  """Point CHECKOUT_SOURCE_ROOT at this checkout, then run the test runner.
+
+  Our symlinking of the Chromium build toolchain otherwise hides the root."""
+  os.environ.update(CHECKOUT_SOURCE_ROOT=ROOT_DIR)
+  return test_runner.main()
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/libs/libyuv/util/compare.cc b/libs/libyuv/util/compare.cc
new file mode 100644
index 0000000000..c36c0fa5f3
--- /dev/null
+++ b/libs/libyuv/util/compare.cc
@@ -0,0 +1,63 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/version.h"
+
+// Prints the hash of file1; with file2 also prints its hash, the MSE and PSNR.
+int main(int argc, char** argv) {
+  FILE* fin1 = (argc > 1) ? fopen(argv[1], "rb") : NULL;
+  if (!fin1) {  // No file name was given, or file1 could not be opened.
+    printf("libyuv compare v%d\n", LIBYUV_VERSION);
+    printf("compare file1.yuv file2.yuv\n");
+    return -1;
+  }
+  FILE* fin2 = (argc > 2) ? fopen(argv[2], "rb") : NULL;
+
+  const int kBlockSize = 32768;
+  uint8 buf1[kBlockSize];
+  uint8 buf2[kBlockSize];
+  uint32 hash1 = 5381;
+  uint32 hash2 = 5381;
+  uint64 sum_square_err = 0;
+  uint64 size_min = 0;
+  int amt1 = 0;
+  int amt2 = 0;
+  // Read both files block by block until each is exhausted.
+  do {
+    amt1 = static_cast<int>(fread(buf1, 1, kBlockSize, fin1));
+    if (amt1 > 0) hash1 = libyuv::HashDjb2(buf1, amt1, hash1);
+    if (fin2) {
+      amt2 = static_cast<int>(fread(buf2, 1, kBlockSize, fin2));
+      if (amt2 > 0) hash2 = libyuv::HashDjb2(buf2, amt2, hash2);
+      int amt_min = (amt1 < amt2) ? amt1 : amt2;
+      size_min += amt_min;
+      sum_square_err += libyuv::ComputeSumSquareError(buf1, buf2, amt_min);
+    }
+  } while (amt1 > 0 || amt2 > 0);
+
+  printf("hash1 %x", hash1);
+  if (fin2) {
+    printf(", hash2 %x", hash2);
+    double mse = static_cast<double>(sum_square_err) /
+                 static_cast<double>(size_min);
+    printf(", mse %.2f", mse);
+    double psnr = libyuv::SumSquareErrorToPsnr(sum_square_err, size_min);
+    printf(", psnr %.2f\n", psnr);
+    fclose(fin2);
+  }
+  fclose(fin1);
+}
diff --git a/libs/libyuv/util/convert.cc b/libs/libyuv/util/convert.cc
new file mode 100644
index 0000000000..5f071416da
--- /dev/null
+++ b/libs/libyuv/util/convert.cc
@@ -0,0 +1,365 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Convert an ARGB image to YUV.
+// Usage: convert src_argb.raw dst_yuv.raw
+
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/scale_argb.h"
+
+// options
+bool verbose = false;
+bool attenuate = false;
+bool unattenuate = false;
+int image_width = 0, image_height = 0;  // original width and height
+int dst_width = 0, dst_height = 0;  // new width and height
+int fileindex_org = 0;  // argv argument contains the original file name.
+int fileindex_rec = 0;  // argv argument contains the reconstructed file name.
+int num_rec = 0;  // Number of reconstructed images.
+int num_skip_org = 0;  // Number of frames to skip in original.
+int num_frames = 0;  // Number of frames to convert.
+int filter = 1;  // Bilinear filter for scaling.
+
+static __inline uint32 Abs(int32 v) {  // |v|; negation is done unsigned so the most negative int32 is safe.
+  return v >= 0 ? static_cast<uint32>(v) : 0u - static_cast<uint32>(v);
+}
+
+// Parse PYUV format. ie name.1920x800_24Hz_P420.yuv
+bool ExtractResolutionFromFilename(const char* name,
+                                   int* width_ptr,
+                                   int* height_ptr) {
+  // Walk the name looking for a '.' or '_' separator directly followed by a
+  // digit; the text after such a separator is tried as "<width>x<height>".
+  for (const char* p = name; *p; ++p) {
+    const bool is_separator = (*p == '.' || *p == '_');
+    const bool digit_follows = (p[1] >= '0' && p[1] <= '9');
+    if (is_separator && digit_follows) {
+      if (sscanf(p + 1, "%dx%d", width_ptr, height_ptr) == 2) {  // NOLINT
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+void PrintHelp(const char * program) {  // Prints the usage text to stdout.
+  printf("%s [-options] src_argb.raw dst_yuv.raw\n", program);
+  printf(" -s <width> <height> .... specify source resolution.  "
+         "Optional if name contains\n"
+         "                          resolution (ie. "
+         "name.1920x800_24Hz_P420.yuv)\n"
+         "                          Negative value mirrors.\n");
+  printf(" -d <width> <height> .... specify destination resolution.\n");
+  printf(" -f <filter> ............ 0 = point, 1 = bilinear (default).\n");
+  printf(" -skip <src_argb> ....... Number of frame to skip of src_argb\n");
+  printf(" -frames <num> .......... Number of frames to convert\n");
+  printf(" -attenuate ............. Attenuate the ARGB image\n");
+  printf(" -unattenuate ........... Unattenuate the ARGB image\n");
+  printf(" -v ..................... verbose\n");
+  printf(" -h ..................... this help\n");
+  exit(0);  // Never returns: the process ends with status 0 for every caller.
+}
+
+void ParseOptions(int argc, const char* argv[]) {
+  if (argc <= 1) PrintHelp(argv[0]);  // No arguments: print usage and exit.
+  for (int c = 1; c < argc; ++c) {  // Fill the option globals from argv.
+    if (!strcmp(argv[c], "-v")) {
+      verbose = true;
+    } else if (!strcmp(argv[c], "-attenuate")) {
+      attenuate = true;
+    } else if (!strcmp(argv[c], "-unattenuate")) {
+      unattenuate = true;
+    } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
+      PrintHelp(argv[0]);
+    } else if (!strcmp(argv[c], "-s") && c + 2 < argc) {  // Two values follow.
+      image_width = atoi(argv[++c]);    // NOLINT
+      image_height = atoi(argv[++c]);   // NOLINT
+    } else if (!strcmp(argv[c], "-d") && c + 2 < argc) {  // Two values follow.
+      dst_width = atoi(argv[++c]);    // NOLINT
+      dst_height = atoi(argv[++c]);   // NOLINT
+    } else if (!strcmp(argv[c], "-skip") && c + 1 < argc) {
+      num_skip_org = atoi(argv[++c]);   // NOLINT
+    } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) {
+      num_frames = atoi(argv[++c]);     // NOLINT
+    } else if (!strcmp(argv[c], "-f") && c + 1 < argc) {
+      filter = atoi(argv[++c]);     // NOLINT
+    } else if (argv[c][0] == '-') {  // Also reached by options missing values.
+      fprintf(stderr, "Unknown option. %s\n", argv[c]);
+    } else if (fileindex_org == 0) {  // First non-option: the source file.
+      fileindex_org = c;
+    } else if (fileindex_rec == 0) {  // Second non-option: first output file.
+      fileindex_rec = c;
+      num_rec = 1;
+    } else {  // Further non-options are only counted as more outputs.
+      ++num_rec;
+    }
+  }
+  if (fileindex_org == 0 || fileindex_rec == 0) {  // Need source and output.
+    fprintf(stderr, "Missing filenames\n");
+    PrintHelp(argv[0]);
+  }
+  if (num_skip_org < 0) {
+    fprintf(stderr, "Skipped frames incorrect\n");
+    PrintHelp(argv[0]);
+  }
+  if (num_frames < 0) {
+    fprintf(stderr, "Number of frames incorrect\n");
+    PrintHelp(argv[0]);
+  }
+
+  int org_width, org_height;  // Only read when org_res_avail is true.
+  int rec_width, rec_height;  // Only read when rec_res_avail is true.
+  bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org],
+                                                     &org_width,
+                                                     &org_height);
+  bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec],
+                                                     &rec_width,
+                                                     &rec_height);
+  if (image_width == 0 || image_height == 0) {  // -s absent or zero.
+    if (org_res_avail) {  // Prefer the size embedded in the source name.
+      image_width = org_width;
+      image_height = org_height;
+    } else if (rec_res_avail) {  // Else the size in the first output name.
+      image_width = rec_width;
+      image_height = rec_height;
+    } else {
+      fprintf(stderr, "Missing dimensions.\n");
+      PrintHelp(argv[0]);
+    }
+  }
+  if (dst_width == 0 || dst_height == 0) {  // -d absent or zero.
+    if (rec_res_avail) {
+      dst_width = rec_width;
+      dst_height = rec_height;
+    } else {  // Default: magnitude of the source size (sign dropped).
+      dst_width = Abs(image_width);
+      dst_height = Abs(image_height);
+    }
+  }
+}
+
+static const int kTileX = 32;
+static const int kTileY = 32;
+
+static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
+                         int src_width, int src_height,
+                         uint8* dst_argb, int dst_stride_argb,
+                         int dst_width, int dst_height,
+                         libyuv::FilterMode filtering) {
+  // Scale the destination one kTileX x kTileY tile at a time; tiles on the
+  // right and bottom edges are clipped to the destination size.
+  for (int tile_y = 0; tile_y < dst_height; tile_y += kTileY) {
+    const int rows_left = dst_height - tile_y;
+    const int tile_h = (rows_left < kTileY) ? rows_left : kTileY;
+    for (int tile_x = 0; tile_x < dst_width; tile_x += kTileX) {
+      const int cols_left = dst_width - tile_x;
+      const int tile_w = (cols_left < kTileX) ? cols_left : kTileX;
+      const int status = libyuv::ARGBScaleClip(src_argb, src_stride_argb,
+                                               src_width, src_height,
+                                               dst_argb, dst_stride_argb,
+                                               dst_width, dst_height,
+                                               tile_x, tile_y, tile_w, tile_h,
+                                               filtering);
+      // Stop at the first tile that reports a non-zero status.
+      if (status != 0) {
+        return status;
+      }
+    }
+  }
+  return 0;
+}
+
+int main(int argc, const char* argv[]) {
+  ParseOptions(argc, argv);  // Sets the option globals; exits on bad usage.
+
+  // Open original file (first file argument)
+  FILE* const file_org = fopen(argv[fileindex_org], "rb");
+  if (file_org == NULL) {
+    fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]);
+    exit(1);
+  }
+
+  // Open all files to convert to
+  FILE** file_rec = new FILE* [num_rec];
+  memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "wb");
+    if (file_rec[cur_rec] == NULL) {
+      fprintf(stderr, "Cannot open %s\n", argv[fileindex_rec + cur_rec]);
+      fclose(file_org);
+      for (int i = 0; i < cur_rec; ++i) {
+        fclose(file_rec[i]);
+      }
+      delete[] file_rec;
+      exit(1);
+    }
+  }
+
+  bool org_is_yuv = strstr(argv[fileindex_org], "_P420.") != NULL;
+  bool org_is_argb = strstr(argv[fileindex_org], "_ARGB.") != NULL;
+  if (!org_is_yuv && !org_is_argb) {  // Format is taken from the file name.
+    fprintf(stderr, "Original format unknown %s\n", argv[fileindex_org]);
+    exit(1);
+  }
+  int org_size = Abs(image_width) * Abs(image_height) * 4;  // ARGB
+  // Input is YUV
+  if (org_is_yuv) {
+    const int y_size = Abs(image_width) * Abs(image_height);
+    const int uv_size = ((Abs(image_width) + 1) / 2) *
+        ((Abs(image_height) + 1) / 2);
+    org_size = y_size + 2 * uv_size;  // YUV original.
+  }
+
+  const int dst_size = dst_width * dst_height * 4;  // ARGB scaled
+  const int y_size = dst_width * dst_height;
+  const int uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+  const size_t total_size = y_size + 2 * uv_size;  // I420 output frame size.
+#if defined(_MSC_VER)
+  _fseeki64(file_org,
+            static_cast<__int64>(num_skip_org) *
+            static_cast<__int64>(org_size), SEEK_SET);
+#else  // Skip whole source frames (org_size bytes each), as the MSVC path does.
+  fseek(file_org, static_cast<long>(num_skip_org) * org_size, SEEK_SET);
+#endif
+
+  uint8* const ch_org = new uint8[org_size];
+  uint8* const ch_dst = new uint8[dst_size];
+  uint8* const ch_rec = new uint8[total_size];
+  if (ch_org == NULL || ch_dst == NULL || ch_rec == NULL) {
+    fprintf(stderr, "No memory available\n");
+    fclose(file_org);
+    for (int i = 0; i < num_rec; ++i) {
+      fclose(file_rec[i]);
+    }
+    delete[] ch_org;
+    delete[] ch_dst;
+    delete[] ch_rec;
+    delete[] file_rec;
+    exit(1);
+  }
+
+  if (verbose) {
+    printf("Size: %dx%d to %dx%d\n", image_width, image_height,
+           dst_width, dst_height);
+  }
+
+  int number_of_frames;
+  for (number_of_frames = 0; ; ++number_of_frames) {
+    if (num_frames && number_of_frames >= num_frames)  // 0 means no limit.
+      break;
+
+    // Load original YUV or ARGB frame.
+    size_t bytes_org = fread(ch_org, sizeof(uint8),
+                             static_cast<size_t>(org_size), file_org);
+    if (bytes_org < static_cast<size_t>(org_size))  // EOF or partial frame.
+      break;
+
+    // TODO(fbarchard): Attenuate doesn't need to know dimensions.
+    // ARGB attenuate frame
+    if (org_is_argb && attenuate) {
+      libyuv::ARGBAttenuate(ch_org, 0, ch_org, 0, org_size / 4, 1);
+    }
+    // ARGB unattenuate frame
+    if (org_is_argb && unattenuate) {
+      libyuv::ARGBUnattenuate(ch_org, 0, ch_org, 0, org_size / 4, 1);
+    }
+
+    for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+      // Scale YUV or ARGB frame.
+      if (org_is_yuv) {
+        int src_width = Abs(image_width);
+        int src_height = Abs(image_height);
+        int half_src_width = (src_width + 1) / 2;
+        int half_src_height = (src_height + 1) / 2;
+        int half_dst_width = (dst_width + 1) / 2;
+        int half_dst_height = (dst_height + 1) / 2;
+        I420Scale(ch_org, src_width,
+                  ch_org + src_width * src_height, half_src_width,
+                  ch_org + src_width * src_height +
+                      half_src_width * half_src_height,  half_src_width,
+                  image_width, image_height,
+                  ch_rec, dst_width,
+                  ch_rec + dst_width * dst_height, half_dst_width,
+                  ch_rec + dst_width * dst_height +
+                      half_dst_width * half_dst_height,  half_dst_width,
+                  dst_width, dst_height,
+                      static_cast<libyuv::FilterMode>(filter));
+      } else {
+        TileARGBScale(ch_org, Abs(image_width) * 4,
+                      image_width, image_height,
+                      ch_dst, dst_width * 4,
+                      dst_width, dst_height,
+                      static_cast<libyuv::FilterMode>(filter));
+      }
+      bool rec_is_yuv = strstr(argv[fileindex_rec + cur_rec], "_P420.") != NULL;
+      bool rec_is_argb =
+          strstr(argv[fileindex_rec + cur_rec], "_ARGB.") != NULL;
+      if (!rec_is_yuv && !rec_is_argb) {
+        fprintf(stderr, "Output format unknown %s\n",
+                argv[fileindex_rec + cur_rec]);
+        continue;  // Advance to next file.
+      }
+
+      // Convert ARGB to YUV.
+      if (!org_is_yuv && rec_is_yuv) {
+        int half_width = (dst_width + 1) / 2;
+        int half_height = (dst_height + 1) / 2;
+        libyuv::ARGBToI420(ch_dst, dst_width * 4,
+                           ch_rec, dst_width,
+                           ch_rec + dst_width * dst_height, half_width,
+                           ch_rec + dst_width * dst_height +
+                               half_width * half_height,  half_width,
+                           dst_width, dst_height);
+      }
+
+      // Output YUV or ARGB frame.
+      if (rec_is_yuv) {
+        size_t bytes_rec = fwrite(ch_rec, sizeof(uint8),
+                                  static_cast<size_t>(total_size),
+                                  file_rec[cur_rec]);
+        if (bytes_rec < static_cast<size_t>(total_size))
+          break;  // Short write: leaves only the per-output loop.
+      } else {
+        size_t bytes_rec = fwrite(ch_dst, sizeof(uint8),
+                                  static_cast<size_t>(dst_size),
+                                  file_rec[cur_rec]);
+        if (bytes_rec < static_cast<size_t>(dst_size))
+          break;  // Short write: leaves only the per-output loop.
+      }
+      if (verbose) {
+        printf("%5d", number_of_frames);
+      }
+      if (verbose) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+        printf("\n");
+      }
+    }
+  }
+
+  fclose(file_org);
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    fclose(file_rec[cur_rec]);
+  }
+  delete[] ch_org;
+  delete[] ch_dst;
+  delete[] ch_rec;
+  delete[] file_rec;
+  return 0;
+}
diff --git a/libs/libyuv/util/cpuid.c b/libs/libyuv/util/cpuid.c
new file mode 100644
index 0000000000..94e245b11d
--- /dev/null
+++ b/libs/libyuv/util/cpuid.c
@@ -0,0 +1,94 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define INCLUDE_LIBYUV_COMPARE_H_
+#include "libyuv.h"
+#include "./psnr.h"
+#include "./ssim.h"
+
+int main(int argc, const char* argv[]) {  // Prints the detected CPU features.
+  int cpu_flags = TestCpuFlag(-1);  // NOTE(review): -1 presumably selects every flag bit - confirm.
+  int has_arm = TestCpuFlag(kCpuHasARM);
+  int has_mips = TestCpuFlag(kCpuHasMIPS);
+  int has_x86 = TestCpuFlag(kCpuHasX86);
+#if defined(__i386__) || defined(__x86_64__) || \
+    defined(_M_IX86) || defined(_M_X64)
+  if (has_x86) {
+    uint32 family, model, cpu_info[4];
+    // Vendor ID:
+    // AuthenticAMD AMD processor
+    // CentaurHauls Centaur processor
+    // CyrixInstead Cyrix processor
+    // GenuineIntel Intel processor
+    // GenuineTMx86 Transmeta processor
+    // Geode by NSC National Semiconductor processor
+    // NexGenDriven NexGen processor
+    // RiseRiseRise Rise Technology processor
+    // SiS SiS SiS  SiS processor
+    // UMC UMC UMC  UMC processor
+    CpuId(0, 0, &cpu_info[0]);
+    cpu_info[0] = cpu_info[1];  // Reorder output
+    cpu_info[1] = cpu_info[3];
+    cpu_info[3] = 0;  // cpu_info[2] stays put; this zero word ends the string.
+    printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0]));
+
+    // CPU Family and Model
+    // 3:0 - Stepping
+    // 7:4 - Model
+    // 11:8 - Family
+    // 13:12 - Processor Type
+    // 19:16 - Extended Model
+    // 27:20 - Extended Family
+    CpuId(1, 0, &cpu_info[0]);
+    family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);  // Family | extended family << 4.
+    model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);  // Model | extended model << 4.
+    printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
+           model, model);
+  }
+#endif
+  printf("Cpu Flags %x\n", cpu_flags);
+  printf("Has ARM %x\n", has_arm);
+  printf("Has MIPS %x\n", has_mips);
+  printf("Has X86 %x\n", has_x86);
+  if (has_arm) {  // Architecture-specific flags are queried only when relevant.
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    printf("Has NEON %x\n", has_neon);
+  }
+  if (has_mips) {
+    int has_dspr2 = TestCpuFlag(kCpuHasDSPR2);
+    printf("Has DSPR2 %x\n", has_dspr2);
+  }
+  if (has_x86) {
+    int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+    int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+    int has_sse41 = TestCpuFlag(kCpuHasSSE41);
+    int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+    int has_avx = TestCpuFlag(kCpuHasAVX);
+    int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+    int has_avx3 = TestCpuFlag(kCpuHasAVX3);
+    int has_erms = TestCpuFlag(kCpuHasERMS);
+    int has_fma3 = TestCpuFlag(kCpuHasFMA3);
+    printf("Has SSE2 %x\n", has_sse2);
+    printf("Has SSSE3 %x\n", has_ssse3);
+    printf("Has SSE4.1 %x\n", has_sse41);
+    printf("Has SSE4.2 %x\n", has_sse42);
+    printf("Has AVX %x\n", has_avx);
+    printf("Has AVX2 %x\n", has_avx2);
+    printf("Has AVX3 %x\n", has_avx3);
+    printf("Has ERMS %x\n", has_erms);
+    printf("Has FMA3 %x\n", has_fma3);
+  }
+  return 0;
+}
+
diff --git a/libs/libyuv/util/psnr.cc b/libs/libyuv/util/psnr.cc
new file mode 100644
index 0000000000..52b04bd516
--- /dev/null
+++ b/libs/libyuv/util/psnr.cc
@@ -0,0 +1,288 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./psnr.h"  // NOLINT
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#ifdef _MSC_VER
+#include <intrin.h>  // For __cpuid()
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned int uint32;  // NOLINT
+#ifdef _MSC_VER
+typedef unsigned __int64 uint64;
+#else  // COMPILER_MSVC
+#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long uint64;  // NOLINT
+#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long long uint64;  // NOLINT
+#endif  // __LP64__
+#endif  // _MSC_VER
+
+// libyuv provides this function when linking library for jpeg support.
+#if !defined(HAVE_JPEG)
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+    !defined(__aarch64__)
+#define HAS_SUMSQUAREERROR_NEON
+static uint32 SumSquareError_NEON(const uint8* src_a,
+                                  const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "vmov.u8    q7, #0                         \n"  // Clear the accumulators.
+    "vmov.u8    q9, #0                         \n"
+    "vmov.u8    q8, #0                         \n"
+    "vmov.u8    q10, #0                        \n"
+
+  "1:                                          \n"  // 16 bytes per iteration.
+    "vld1.u8    {q0}, [%0]!                    \n"
+    "vld1.u8    {q1}, [%1]!                    \n"
+    "vsubl.u8   q2, d0, d2                     \n"  // Widening a - b.
+    "vsubl.u8   q3, d1, d3                     \n"
+    "vmlal.s16  q7, d4, d4                     \n"  // Accumulate squares.
+    "vmlal.s16  q8, d6, d6                     \n"
+    "vmlal.s16  q8, d5, d5                     \n"  // q8 is used twice; q9 stays zero.
+    "vmlal.s16  q10, d7, d7                    \n"
+    "subs       %2, %2, #16                    \n"
+    "bhi        1b                             \n"
+
+    "vadd.u32   q7, q7, q8                     \n"  // Reduce the accumulators.
+    "vadd.u32   q9, q9, q10                    \n"
+    "vadd.u32   q10, q7, q9                    \n"
+    "vpaddl.u32 q1, q10                        \n"
+    "vadd.u64   d0, d2, d3                     \n"
+    "vmov.32    %3, d0[0]                      \n"  // Low 32 bits into sse.
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
+  return sse;
+}
+#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_SUMSQUAREERROR_NEON
+static uint32 SumSquareError_NEON(const uint8* src_a,
+                                  const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "eor        v16.16b, v16.16b, v16.16b      \n"  // Clear the accumulators.
+    "eor        v18.16b, v18.16b, v18.16b      \n"
+    "eor        v17.16b, v17.16b, v17.16b      \n"
+    "eor        v19.16b, v19.16b, v19.16b      \n"
+
+  "1:                                          \n"  // 16 bytes per iteration.
+    "ld1        {v0.16b}, [%0], #16            \n"
+    "ld1        {v1.16b}, [%1], #16            \n"
+    "subs       %w2, %w2, #16                  \n"
+    "usubl      v2.8h, v0.8b, v1.8b            \n"  // Widening a - b.
+    "usubl2     v3.8h, v0.16b, v1.16b          \n"
+    "smlal      v16.4s, v2.4h, v2.4h           \n"  // Accumulate squares.
+    "smlal      v17.4s, v3.4h, v3.4h           \n"
+    "smlal2     v18.4s, v2.8h, v2.8h           \n"
+    "smlal2     v19.4s, v3.8h, v3.8h           \n"
+    "b.gt       1b                             \n"
+
+    "add        v16.4s, v16.4s, v17.4s         \n"  // Reduce the accumulators.
+    "add        v18.4s, v18.4s, v19.4s         \n"
+    "add        v19.4s, v16.4s, v18.4s         \n"
+    "addv       s0, v19.4s                     \n"
+    "fmov       %w3, s0                        \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    // "memory": the loads go through pointers that are not memory operands.
+    : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+  return sse;
+}
+#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#define HAS_SUMSQUAREERROR_SSE2
+__declspec(naked)
+static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
+                                  const uint8* /*src_b*/, int /*count*/) {
+  __asm {
+    mov        eax, [esp + 4]    // src_a
+    mov        edx, [esp + 8]    // src_b
+    mov        ecx, [esp + 12]   // count
+    pxor       xmm0, xmm0        // xmm0 = running sum
+    pxor       xmm5, xmm5        // xmm5 = zero, used for unpacking
+    sub        edx, eax          // edx = src_b - src_a, so [eax + edx] is src_b
+
+  wloop:
+    movdqu     xmm1, [eax]
+    movdqu     xmm2, [eax + edx]
+    lea        eax,  [eax + 16]
+    movdqu     xmm3, xmm1
+    psubusb    xmm1, xmm2        // saturating a - b
+    psubusb    xmm2, xmm3        // saturating b - a
+    por        xmm1, xmm2        // |a - b| per byte
+    movdqu     xmm2, xmm1
+    punpcklbw  xmm1, xmm5        // widen bytes to 16-bit words
+    punpckhbw  xmm2, xmm5
+    pmaddwd    xmm1, xmm1        // square and add adjacent pairs
+    pmaddwd    xmm2, xmm2
+    paddd      xmm0, xmm1
+    paddd      xmm0, xmm2
+    sub        ecx, 16           // 16 bytes consumed per iteration
+    ja         wloop
+
+    pshufd     xmm1, xmm0, 0EEh  // horizontal add of the four dwords
+    paddd      xmm0, xmm1
+    pshufd     xmm1, xmm0, 01h
+    paddd      xmm0, xmm1
+    movd       eax, xmm0         // result is left in eax
+    ret
+  }
+}
+#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_SUMSQUAREERROR_SSE2
+static uint32 SumSquareError_SSE2(const uint8* src_a,
+                                  const uint8* src_b, int count) {
+  uint32 sse;
+  asm volatile (  // NOLINT
+    "pxor      %%xmm0,%%xmm0                   \n"  // xmm0 = running sum.
+    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = zero for unpack.
+    "sub       %0,%1                           \n"  // %1 = src_b - src_a.
+
+  "1:                                          \n"  // 16 bytes per iteration.
+    "movdqu    (%0),%%xmm1                     \n"
+    "movdqu    (%0,%1,1),%%xmm2                \n"
+    "lea       0x10(%0),%0                     \n"
+    "movdqu    %%xmm1,%%xmm3                   \n"
+    "psubusb   %%xmm2,%%xmm1                   \n"  // Saturating a - b.
+    "psubusb   %%xmm3,%%xmm2                   \n"  // Saturating b - a.
+    "por       %%xmm2,%%xmm1                   \n"  // |a - b| per byte.
+    "movdqu    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"  // Widen to 16-bit words.
+    "punpckhbw %%xmm5,%%xmm2                   \n"
+    "pmaddwd   %%xmm1,%%xmm1                   \n"  // Square, add pairs.
+    "pmaddwd   %%xmm2,%%xmm2                   \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "paddd     %%xmm2,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "ja        1b                              \n"
+
+    "pshufd    $0xee,%%xmm0,%%xmm1             \n"  // Horizontal add.
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
+    "paddd     %%xmm1,%%xmm0                   \n"
+    "movd      %%xmm0,%3                       \n"
+
+  : "+r"(src_a),      // %0
+    "+r"(src_b),      // %1
+    "+r"(count),      // %2
+    "=g"(sse)         // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );  // NOLINT
+  return sse;
+}
+#endif  // LIBYUV_DISABLE_X86 etc
+
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+  asm volatile (  // NOLINT
+    "mov %%ebx, %%edi                          \n"  // Save EBX in EDI (presumably because PIC reserves EBX - confirm).
+    "cpuid                                     \n"
+    "xchg %%edi, %%ebx                         \n"  // EDI = CPUID's EBX; EBX restored.
+    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));
+}
+// For gcc/clang but not clangcl.
+#elif (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+  asm volatile (  // NOLINT
+    "cpuid                                     \n"  // Leaf = info_type (EAX); ECX is not set here.
+    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));  // cpu_info receives EAX, EBX, ECX, EDX in that order.
+}
+#endif
+
+static int CpuHasSSE2() {
+  int has_sse2 = 0;  // Reported as absent unless CPUID leaf 1 says otherwise.
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)
+  int regs[4];
+  __cpuid(regs, 1);
+  // Test mask 0x04000000 (bit 26) of the fourth word returned by __cpuid.
+  has_sse2 = (regs[3] & 0x04000000) != 0;
+#endif
+  return has_sse2;
+}
+#endif  // HAS_SUMSQUAREERROR_SSE2
+
+static uint32 SumSquareError_C(const uint8* src_a,
+                               const uint8* src_b, int count) {
+  uint32 total = 0u;  // Portable fallback: sum of squared byte differences.
+  for (int left = count; left > 0; --left, ++src_a, ++src_b) {
+    const int delta = static_cast<int>(*src_a) - static_cast<int>(*src_b);
+    total += static_cast<uint32>(delta * delta);
+  }
+  return total;
+}
+
+double ComputeSumSquareError(const uint8* src_a,
+                             const uint8* src_b, int count) {
+  uint32 (*SumSquareError)(const uint8* src_a,
+                           const uint8* src_b, int count) = SumSquareError_C;  // Default kernel.
+#if defined(HAS_SUMSQUAREERROR_NEON)
+  SumSquareError = SumSquareError_NEON;  // Used unconditionally when compiled in.
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+  if (CpuHasSSE2()) {  // The SSE2 kernel is chosen only after a runtime check.
+    SumSquareError = SumSquareError_SSE2;
+  }
+#endif
+  const int kBlockSize = 1 << 15;  // 32768 bytes; presumably sized so a block's sum fits in uint32 - confirm.
+  uint64 sse = 0;  // 64-bit total of the 32-bit per-call sums.
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {  // Full blocks only.
+    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+  }
+  src_a += count & ~(kBlockSize - 1);  // Step past the full blocks.
+  src_b += count & ~(kBlockSize - 1);
+  int remainder = count & (kBlockSize - 1) & ~15;  // Tail rounded down to a multiple of 16.
+  if (remainder) {
+    sse += SumSquareError(src_a, src_b, remainder);
+    src_a += remainder;
+    src_b += remainder;
+  }
+  remainder = count & 15;  // The last 0..15 bytes always use the C kernel.
+  if (remainder) {
+    sse += SumSquareError_C(src_a, src_b, remainder);
+  }
+  return static_cast<double>(sse);
+}
+#endif
+
+// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
+// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
+double ComputePSNR(double sse, double size) {
+  const double peak_energy = 255.0 * 255.0 * size;  // Peak signal^2 * size.
+  const double sse_floor = peak_energy / pow(10.0, kMaxPSNR / 10.0);
+  const double clamped = (sse <= sse_floor) ? sse_floor : sse;  // Caps PSNR.
+  return 10.0 * log10(peak_energy / clamped);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
diff --git a/libs/libyuv/util/psnr.h b/libs/libyuv/util/psnr.h
new file mode 100644
index 0000000000..0816b97600
--- /dev/null
+++ b/libs/libyuv/util/psnr.h
@@ -0,0 +1,45 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Get PSNR for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
+
+#ifndef UTIL_PSNR_H_  // NOLINT
+#define UTIL_PSNR_H_
+
+#include <math.h>  // For log10()
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED)
+typedef unsigned char uint8;
+#define UINT8_TYPE_DEFINED
+#endif
+
+static const double kMaxPSNR = 128.0;
+
+// libyuv provides this function when linking library for jpeg support.
+// TODO(fbarchard): make psnr lib compatible subset of libyuv.
+#if !defined(HAVE_JPEG)
+// Compute Sum of Squared Error (SSE).
+// Pass this to ComputePSNR for final result.
+double ComputeSumSquareError(const uint8* org, const uint8* rec, int size);
+#endif
+
+// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
+// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
+double ComputePSNR(double sse, double size);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // UTIL_PSNR_H_  // NOLINT
diff --git a/libs/libyuv/util/psnr_main.cc b/libs/libyuv/util/psnr_main.cc
new file mode 100644
index 0000000000..0518ab84e0
--- /dev/null
+++ b/libs/libyuv/util/psnr_main.cc
@@ -0,0 +1,648 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Get PSNR or SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
+// To build: g++ -O3 -o psnr psnr.cc ssim.cc psnr_main.cc
+// or VisualC: cl /Ox psnr.cc ssim.cc psnr_main.cc
+//
+// To enable OpenMP and SSE2
+// gcc: g++ -msse2 -O3 -fopenmp -o psnr psnr.cc ssim.cc psnr_main.cc
+// vc:  cl /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc
+//
+// Usage: psnr org_seq rec_seq -s width height [-skip skip_org skip_rec]
+
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "./psnr.h"
+#include "./ssim.h"
+#ifdef HAVE_JPEG
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#endif
+
+struct metric {
+  double y, u, v, all;  // one frame's scores, or their running sum over frames
+  double min_y, min_u, min_v, min_all;  // lowest per-frame score seen so far
+  double global_y, global_u, global_v, global_all;  // summed squared errors
+  int min_frame;  // frame index at which min_all was last lowered
+};
+
+// options (globals filled in by ParseOptions from the command line)
+bool verbose = false;     // -v: print a row of scores for every frame
+bool quiet = false;       // -q: omit the table header and the Avg/Min rows
+bool show_name = false;   // -n: append the rec file name to printed rows
+bool do_swap_uv = false;  // -swap: exchange the U and V planes of org
+bool do_psnr = false;     // -psnr: also forced on in main if no SSIM requested
+bool do_ssim = false;     // -ssim or -lssim
+bool do_mse = false;      // -mse: print the global mean square error
+bool do_lssim = false;    // -lssim: report SSIM through CalcLSSIM()
+int image_width = 0, image_height = 0;  // 0 until set by -s or a file name
+int fileindex_org = 0;  // argv argument contains the source file name.
+int fileindex_rec = 0;  // argv argument contains the destination file name.
+int num_rec = 0;        // number of rec files to compare against org
+int num_skip_org = 0;   // -skip: frames to skip at the start of org
+int num_skip_rec = 0;   // -skip: frames to skip at the start of each rec
+int num_frames = 0;     // -frames: frames to compare; 0 means no limit
+#ifdef _OPENMP
+int num_threads = 0;    // -t: OpenMP thread count; 0 leaves it unset
+#endif
+
+// Parse PYUV format. ie name.1920x800_24Hz_P420.yuv. Returns true on success.
+bool ExtractResolutionFromFilename(const char* name,
+                                   int* width_ptr,    // receives the width
+                                   int* height_ptr) { // receives the height
+  // Isolate the .widthxheight. section of the filename by searching for a
+  // dot or underscore followed by a digit.
+  for (int i = 0; name[i]; ++i) {
+    if ((name[i] == '.' || name[i] == '_') &&
+        name[i + 1] >= '0' && name[i + 1] <= '9') {
+      int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr);  // NOLINT
+      if (2 == n) {  // both numbers parsed; otherwise keep scanning the name
+        return true;
+      }
+    }
+  }
+
+#ifdef HAVE_JPEG
+  // No size in the name: read the whole file and try parsing it as a jpeg.
+  FILE* const file_org = fopen(name, "rb");
+  if (file_org == NULL) {
+    fprintf(stderr, "Cannot open %s\n", name);
+    return false;
+  }
+  fseek(file_org, 0, SEEK_END);
+  size_t total_size  = ftell(file_org);  // file length in bytes
+  fseek(file_org, 0, SEEK_SET);
+  uint8* const ch_org = new uint8[total_size];
+  memset(ch_org, 0, total_size);
+  size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
+  fclose(file_org);
+  if (bytes_org == total_size) {  // only trust a complete read
+    if (0 == libyuv::MJPGSize(ch_org, total_size, width_ptr, height_ptr)) {
+      delete[] ch_org;
+      return true;
+    }
+  }
+  delete[] ch_org;
+#endif  // HAVE_JPEG
+  return false;
+}
+
+// Stretches a Y sample from the 16..240 range to 0..255, clamping the result.
+// This can be useful when comparing codecs that are inconsistent about Y.
+uint8 ScaleY(uint8 y) {
+  const int stretched = (static_cast<int>(y) - 16) * 256 / 224;
+  const int floored = (stretched < 0) ? 0 : stretched;
+  const int clamped = (floored > 255) ? 255 : floored;
+  return static_cast<uint8>(clamped);
+}
+
+// MSE = Mean Square Error: sse (sum of squared errors) divided by size samples.
+double GetMSE(double sse, double size) {
+  return sse / size;  // no guard here: a zero size is divided by as-is
+}
+
+void PrintHelp(const char * program) {  // Prints usage to stdout, then exits.
+  printf("%s [-options] org_seq rec_seq [rec_seq2.. etc]\n", program);
+#ifdef HAVE_JPEG
+  printf("jpeg or raw YUV 420 supported.\n");
+#endif
+  printf("options:\n");
+  printf(" -s <width> <height> .... specify YUV size, mandatory if none of the "
+         "sequences have the\n");
+  printf("                          resolution embedded in their filename (ie. "
+         "name.1920x800_24Hz_P420.yuv)\n");
+  printf(" -psnr .................. compute PSNR (default)\n");
+  printf(" -ssim .................. compute SSIM\n");  // -lssim is not listed
+  printf(" -mse ................... compute MSE\n");
+  printf(" -swap .................. Swap U and V plane\n");
+  printf(" -skip <org> <rec> ...... Number of frame to skip of org and rec\n");
+  printf(" -frames <num> .......... Number of frames to compare\n");
+#ifdef _OPENMP
+  printf(" -t <num> ............... Number of threads\n");
+#endif
+  printf(" -n ..................... Show file name\n");
+  printf(" -v ..................... verbose++\n");
+  printf(" -q ..................... quiet\n");
+  printf(" -h ..................... this help\n");
+  exit(0);  // status 0 even when ParseOptions calls this after an error
+}
+
+void ParseOptions(int argc, const char* argv[]) {  // argv -> option globals
+  if (argc <= 1) PrintHelp(argv[0]);  // no arguments: show usage (exits)
+  for (int c = 1; c < argc; ++c) {
+    if (!strcmp(argv[c], "-v")) {
+      verbose = true;
+    } else if (!strcmp(argv[c], "-q")) {
+      quiet = true;
+    } else if (!strcmp(argv[c], "-n")) {
+      show_name = true;
+    } else if (!strcmp(argv[c], "-psnr")) {
+      do_psnr = true;
+    } else if (!strcmp(argv[c], "-mse")) {
+      do_mse = true;
+    } else if (!strcmp(argv[c], "-ssim")) {
+      do_ssim = true;
+    } else if (!strcmp(argv[c], "-lssim")) {
+      do_ssim = true;  // -lssim implies -ssim
+      do_lssim = true;
+    } else if (!strcmp(argv[c], "-swap")) {
+      do_swap_uv = true;
+    } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
+      PrintHelp(argv[0]);
+    } else if (!strcmp(argv[c], "-s") && c + 2 < argc) {  // two values follow
+      image_width = atoi(argv[++c]);    // NOLINT
+      image_height = atoi(argv[++c]);   // NOLINT
+    } else if (!strcmp(argv[c], "-skip") && c + 2 < argc) {  // two values
+      num_skip_org = atoi(argv[++c]);   // NOLINT
+      num_skip_rec = atoi(argv[++c]);   // NOLINT
+    } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) {  // one value
+      num_frames = atoi(argv[++c]);     // NOLINT
+#ifdef _OPENMP
+    } else if (!strcmp(argv[c], "-t") && c + 1 < argc) {
+      num_threads = atoi(argv[++c]);    // NOLINT
+#endif
+    } else if (argv[c][0] == '-') {  // also hit when an option lacks its values
+      fprintf(stderr, "Unknown option. %s\n", argv[c]);  // parsing continues
+    } else if (fileindex_org == 0) {  // first plain argument: the org file
+      fileindex_org = c;
+    } else if (fileindex_rec == 0) {  // second plain argument: first rec file
+      fileindex_rec = c;
+      num_rec = 1;
+    } else {  // later plain arguments; main opens argv[fileindex_rec + i]
+      ++num_rec;
+    }
+  }
+  if (fileindex_org == 0 || fileindex_rec == 0) {
+    fprintf(stderr, "Missing filenames\n");
+    PrintHelp(argv[0]);
+  }
+  if (num_skip_org < 0 || num_skip_rec < 0) {
+    fprintf(stderr, "Skipped frames incorrect\n");
+    PrintHelp(argv[0]);
+  }
+  if (num_frames < 0) {
+    fprintf(stderr, "Number of frames incorrect\n");
+    PrintHelp(argv[0]);
+  }
+  if (image_width == 0 || image_height == 0) {  // no usable -s: ask the files
+    int org_width, org_height;  // only meaningful when org_res_avail is true
+    int rec_width, rec_height;  // only meaningful when rec_res_avail is true
+    bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org],
+                                                       &org_width,
+                                                       &org_height);
+    bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec],
+                                                       &rec_width,
+                                                       &rec_height);
+    if (org_res_avail) {
+      if (rec_res_avail) {
+        if ((org_width == rec_width) && (org_height == rec_height)) {
+          image_width = org_width;
+          image_height = org_height;
+        } else {
+          fprintf(stderr, "Sequences have different resolutions.\n");
+          PrintHelp(argv[0]);
+        }
+      } else {  // only org carries a size
+        image_width = org_width;
+        image_height = org_height;
+      }
+    } else if (rec_res_avail) {  // only the first rec carries a size
+      image_width = rec_width;
+      image_height = rec_height;
+    } else {
+      fprintf(stderr, "Missing dimensions.\n");
+      PrintHelp(argv[0]);
+    }
+  }
+}
+
+bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,  // frames stored as Y, U, V
+                   const int y_size, const int uv_size, const size_t total_size,
+                   int number_of_frames,  // index of the frame being scored
+                   metric* cur_distortion_psnr,  // running totals (PSNR or SSIM)
+                   metric* distorted_frame, bool do_psnr) {  // out: this frame
+  const int uv_offset = (do_swap_uv ? uv_size : 0);  // -swap: exchange org U/V
+  const uint8* const u_org = ch_org + y_size + uv_offset;
+  const uint8* const u_rec = ch_rec + y_size;
+  const uint8* const v_org = ch_org + y_size + (uv_size - uv_offset);
+  const uint8* const v_rec = ch_rec + y_size + uv_size;
+  if (do_psnr) {  // the parameter, which shadows the global of the same name
+#ifdef HAVE_JPEG
+    double y_err = static_cast<double>(
+      libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size));
+    double u_err = static_cast<double>(
+      libyuv::ComputeSumSquareError(u_org, u_rec, uv_size));
+    double v_err = static_cast<double>(
+      libyuv::ComputeSumSquareError(v_org, v_rec, uv_size));
+#else
+    double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size);
+    double u_err = ComputeSumSquareError(u_org, u_rec, uv_size);
+    double v_err = ComputeSumSquareError(v_org, v_rec, uv_size);
+#endif
+    const double total_err = y_err + u_err + v_err;
+    cur_distortion_psnr->global_y += y_err;  // kept for the global PSNR / MSE
+    cur_distortion_psnr->global_u += u_err;
+    cur_distortion_psnr->global_v += v_err;
+    cur_distortion_psnr->global_all += total_err;
+    distorted_frame->y = ComputePSNR(y_err, static_cast<double>(y_size));
+    distorted_frame->u = ComputePSNR(u_err, static_cast<double>(uv_size));
+    distorted_frame->v = ComputePSNR(v_err, static_cast<double>(uv_size));
+    distorted_frame->all = ComputePSNR(total_err,
+                                       static_cast<double>(total_size));
+  } else {  // SSIM: CalcSSIM returns a sum over pixels, normalized below
+    distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height);
+    distorted_frame->u = CalcSSIM(u_org, u_rec, (image_width + 1) / 2,
+                                 (image_height + 1) / 2);
+    distorted_frame->v = CalcSSIM(v_org, v_rec, (image_width + 1) / 2,
+                                 (image_height + 1) / 2);
+    distorted_frame->all =
+      (distorted_frame->y + distorted_frame->u + distorted_frame->v)
+        / total_size;
+    distorted_frame->y /= y_size;  // per-plane means, after 'all' used the sums
+    distorted_frame->u /= uv_size;
+    distorted_frame->v /= uv_size;
+
+    if (do_lssim) {  // -lssim: convert each mean with CalcLSSIM()
+      distorted_frame->all = CalcLSSIM(distorted_frame->all);
+      distorted_frame->y = CalcLSSIM(distorted_frame->y);
+      distorted_frame->u = CalcLSSIM(distorted_frame->u);
+      distorted_frame->v = CalcLSSIM(distorted_frame->v);
+    }
+  }
+
+  cur_distortion_psnr->y += distorted_frame->y;  // sums; main averages them
+  cur_distortion_psnr->u += distorted_frame->u;
+  cur_distortion_psnr->v += distorted_frame->v;
+  cur_distortion_psnr->all += distorted_frame->all;
+
+  bool ismin = false;  // becomes true only when min_all is lowered
+  if (distorted_frame->y < cur_distortion_psnr->min_y)
+    cur_distortion_psnr->min_y = distorted_frame->y;
+  if (distorted_frame->u < cur_distortion_psnr->min_u)
+    cur_distortion_psnr->min_u = distorted_frame->u;
+  if (distorted_frame->v < cur_distortion_psnr->min_v)
+    cur_distortion_psnr->min_v = distorted_frame->v;
+  if (distorted_frame->all < cur_distortion_psnr->min_all) {
+    cur_distortion_psnr->min_all = distorted_frame->all;
+    cur_distortion_psnr->min_frame = number_of_frames;
+    ismin = true;
+  }
+  return ismin;
+}
+
+int main(int argc, const char* argv[]) {
+  ParseOptions(argc, argv);
+  if (!do_psnr && !do_ssim) {
+    do_psnr = true;
+  }
+
+#ifdef _OPENMP
+  if (num_threads) {
+    omp_set_num_threads(num_threads);
+  }
+  if (verbose) {
+    printf("OpenMP %d procs\n", omp_get_num_procs());
+  }
+#endif
+  // Open original file (first file argument)
+  FILE* const file_org = fopen(argv[fileindex_org], "rb");
+  if (file_org == NULL) {
+    fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]);
+    exit(1);
+  }
+
+  // Open all files to compare to
+  FILE** file_rec = new FILE* [num_rec];
+  memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "rb");
+    if (file_rec[cur_rec] == NULL) {
+      fprintf(stderr, "Cannot open %s\n", argv[fileindex_rec + cur_rec]);
+      fclose(file_org);
+      for (int i = 0; i < cur_rec; ++i) {
+        fclose(file_rec[i]);
+      }
+      delete[] file_rec;
+      exit(1);
+    }
+  }
+
+  const int y_size = image_width * image_height;
+  const int uv_size = ((image_width + 1) / 2) * ((image_height + 1) / 2);
+  const size_t total_size = y_size + 2 * uv_size;    // NOLINT
+#if defined(_MSC_VER)
+  _fseeki64(file_org,
+            static_cast<__int64>(num_skip_org) *
+            static_cast<__int64>(total_size), SEEK_SET);
+#else
+  fseek(file_org, num_skip_org * total_size, SEEK_SET);
+#endif
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+#if defined(_MSC_VER)
+    _fseeki64(file_rec[cur_rec],
+              static_cast<__int64>(num_skip_rec) *
+              static_cast<__int64>(total_size),
+              SEEK_SET);
+#else
+    fseek(file_rec[cur_rec], num_skip_rec * total_size, SEEK_SET);
+#endif
+  }
+
+  uint8* const ch_org = new uint8[total_size];
+  uint8* const ch_rec = new uint8[total_size];
+  if (ch_org == NULL || ch_rec == NULL) {
+    fprintf(stderr, "No memory available\n");
+    fclose(file_org);
+    for (int i = 0; i < num_rec; ++i) {
+      fclose(file_rec[i]);
+    }
+    delete[] ch_org;
+    delete[] ch_rec;
+    delete[] file_rec;
+    exit(1);
+  }
+
+  metric* const distortion_psnr = new metric[num_rec];  // one entry per rec file
+  metric* const distortion_ssim = new metric[num_rec];  // one entry per rec file
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
+    cur_distortion_psnr->y = 0.0;  // running sums start at zero
+    cur_distortion_psnr->u = 0.0;
+    cur_distortion_psnr->v = 0.0;
+    cur_distortion_psnr->all = 0.0;
+    cur_distortion_psnr->min_y = kMaxPSNR;  // minima start at the upper bound
+    cur_distortion_psnr->min_u = kMaxPSNR;
+    cur_distortion_psnr->min_v = kMaxPSNR;
+    cur_distortion_psnr->min_all = kMaxPSNR;
+    cur_distortion_psnr->min_frame = 0;
+    cur_distortion_psnr->global_y = 0.0;
+    cur_distortion_psnr->global_u = 0.0;
+    cur_distortion_psnr->global_v = 0.0;
+    cur_distortion_psnr->global_all = 0.0;
+    distortion_ssim[cur_rec] = *cur_distortion_psnr;  // same initial state
+  }
+
+  if (verbose) {
+    printf("Size: %dx%d\n", image_width, image_height);
+  }
+
+  if (!quiet) {
+    printf("Frame");
+    if (do_psnr) {
+      printf("\t PSNR-Y \t PSNR-U \t PSNR-V \t PSNR-All \t Frame");
+    }
+    if (do_ssim) {
+      printf("\t  SSIM-Y\t  SSIM-U\t  SSIM-V\t  SSIM-All\t Frame");
+    }
+    if (show_name) {
+      printf("\tName\n");
+    } else {
+      printf("\n");
+    }
+  }
+
+  int number_of_frames;
+  for (number_of_frames = 0; ; ++number_of_frames) {
+    if (num_frames && number_of_frames >= num_frames)
+      break;
+
+    size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
+    if (bytes_org < total_size) {
+#ifdef HAVE_JPEG
+      // Try parsing file as a jpeg.
+      uint8* const ch_jpeg = new uint8[bytes_org];
+      memcpy(ch_jpeg, ch_org, bytes_org);
+      memset(ch_org, 0, total_size);
+
+      if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org,
+                                  ch_org,
+                                  image_width,
+                                  ch_org + y_size,
+                                  (image_width + 1) / 2,
+                                  ch_org + y_size + uv_size,
+                                  (image_width + 1) / 2,
+                                  image_width,
+                                  image_height,
+                                  image_width,
+                                  image_height)) {
+        delete[] ch_jpeg;
+        break;
+      }
+      delete[] ch_jpeg;
+#else
+      break;
+#endif  // HAVE_JPEG
+    }
+
+    for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+      size_t bytes_rec = fread(ch_rec, sizeof(uint8),
+                               total_size, file_rec[cur_rec]);
+      if (bytes_rec < total_size) {
+#ifdef HAVE_JPEG
+        // Try parsing file as a jpeg.
+        uint8* const ch_jpeg = new uint8[bytes_rec];
+        memcpy(ch_jpeg, ch_rec, bytes_rec);
+        memset(ch_rec, 0, total_size);
+
+        if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec,
+                                    ch_rec,
+                                    image_width,
+                                    ch_rec + y_size,
+                                    (image_width + 1) / 2,
+                                    ch_rec + y_size + uv_size,
+                                    (image_width + 1) / 2,
+                                    image_width,
+                                    image_height,
+                                    image_width,
+                                    image_height)) {
+          delete[] ch_jpeg;
+          break;
+        }
+        delete[] ch_jpeg;
+#else
+        break;
+#endif  // HAVE_JPEG
+      }
+
+      if (verbose) {
+        printf("%5d", number_of_frames);
+      }
+      if (do_psnr) {
+        metric distorted_frame;
+        metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
+        bool ismin = UpdateMetrics(ch_org, ch_rec,
+                                   y_size, uv_size, total_size,
+                                   number_of_frames,
+                                   cur_distortion_psnr,
+                                   &distorted_frame, true);
+        if (verbose) {
+          printf("\t%10.6f", distorted_frame.y);
+          printf("\t%10.6f", distorted_frame.u);
+          printf("\t%10.6f", distorted_frame.v);
+          printf("\t%10.6f", distorted_frame.all);
+          printf("\t%5s", ismin ? "min" : "");
+        }
+      }
+      if (do_ssim) {
+        metric distorted_frame;
+        metric* cur_distortion_ssim = &distortion_ssim[cur_rec];
+        bool ismin = UpdateMetrics(ch_org, ch_rec,
+                                   y_size, uv_size, total_size,
+                                   number_of_frames,
+                                   cur_distortion_ssim,
+                                   &distorted_frame, false);
+        if (verbose) {
+          printf("\t%10.6f", distorted_frame.y);
+          printf("\t%10.6f", distorted_frame.u);
+          printf("\t%10.6f", distorted_frame.v);
+          printf("\t%10.6f", distorted_frame.all);
+          printf("\t%5s", ismin ? "min" : "");
+        }
+      }
+      if (verbose) {
+        if (show_name) {
+          printf("\t%s", argv[fileindex_rec + cur_rec]);
+        }
+        printf("\n");
+      }
+    }
+  }
+
+  // Final PSNR computation.
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
+    metric* cur_distortion_ssim = &distortion_ssim[cur_rec];
+    if (number_of_frames > 0) {
+      const double norm = 1. / static_cast<double>(number_of_frames);
+      cur_distortion_psnr->y *= norm;
+      cur_distortion_psnr->u *= norm;
+      cur_distortion_psnr->v *= norm;
+      cur_distortion_psnr->all *= norm;
+      cur_distortion_ssim->y *= norm;
+      cur_distortion_ssim->u *= norm;
+      cur_distortion_ssim->v *= norm;
+      cur_distortion_ssim->all *= norm;
+    }
+
+    if (do_psnr) {
+      const double global_psnr_y = ComputePSNR(
+          cur_distortion_psnr->global_y,
+          static_cast<double>(y_size) * number_of_frames);
+      const double global_psnr_u = ComputePSNR(
+          cur_distortion_psnr->global_u,
+          static_cast<double>(uv_size) * number_of_frames);
+      const double global_psnr_v = ComputePSNR(
+          cur_distortion_psnr->global_v,
+          static_cast<double>(uv_size) * number_of_frames);
+      const double global_psnr_all = ComputePSNR(
+          cur_distortion_psnr->global_all,
+          static_cast<double>(total_size) * number_of_frames);
+      printf("Global:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+          global_psnr_y,
+          global_psnr_u,
+          global_psnr_v,
+          global_psnr_all,
+          number_of_frames);
+      if (show_name) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+      }
+      printf("\n");
+    }
+
+    if (!quiet) {
+      printf("Avg:");
+      if (do_psnr) {
+        printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+             cur_distortion_psnr->y,
+             cur_distortion_psnr->u,
+             cur_distortion_psnr->v,
+             cur_distortion_psnr->all,
+             number_of_frames);
+      }
+      if (do_ssim) {
+        printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+             cur_distortion_ssim->y,
+             cur_distortion_ssim->u,
+             cur_distortion_ssim->v,
+             cur_distortion_ssim->all,
+             number_of_frames);
+      }
+      if (show_name) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+      }
+      printf("\n");
+    }
+    if (!quiet) {
+      printf("Min:");
+      if (do_psnr) {
+        printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+            cur_distortion_psnr->min_y,
+            cur_distortion_psnr->min_u,
+            cur_distortion_psnr->min_v,
+            cur_distortion_psnr->min_all,
+            cur_distortion_psnr->min_frame);
+      }
+      if (do_ssim) {
+        printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+            cur_distortion_ssim->min_y,
+            cur_distortion_ssim->min_u,
+            cur_distortion_ssim->min_v,
+            cur_distortion_ssim->min_all,
+            cur_distortion_ssim->min_frame);
+      }
+      if (show_name) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+      }
+      printf("\n");
+    }
+
+    if (do_mse) {
+      double global_mse_y = GetMSE(cur_distortion_psnr->global_y,
+        static_cast<double>(y_size) * number_of_frames);
+      double global_mse_u = GetMSE(cur_distortion_psnr->global_u,
+        static_cast<double>(uv_size) * number_of_frames);
+      double global_mse_v = GetMSE(cur_distortion_psnr->global_v,
+        static_cast<double>(uv_size) * number_of_frames);
+      double global_mse_all = GetMSE(cur_distortion_psnr->global_all,
+        static_cast<double>(total_size) * number_of_frames);
+      printf("MSE:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+          global_mse_y,
+          global_mse_u,
+          global_mse_v,
+          global_mse_all,
+          number_of_frames);
+      if (show_name) {
+        printf("\t%s", argv[fileindex_rec + cur_rec]);
+      }
+      printf("\n");
+    }
+  }
+  fclose(file_org);
+  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+    fclose(file_rec[cur_rec]);
+  }
+  delete[] distortion_psnr;
+  delete[] distortion_ssim;
+  delete[] ch_org;
+  delete[] ch_rec;
+  delete[] file_rec;
+  return 0;
+}
diff --git a/libs/libyuv/util/ssim.cc b/libs/libyuv/util/ssim.cc
new file mode 100644
index 0000000000..5a6399b782
--- /dev/null
+++ b/libs/libyuv/util/ssim.cc
@@ -0,0 +1,336 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "../util/ssim.h"  // NOLINT
+
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned int uint32;     // NOLINT
+typedef unsigned short uint16;   // NOLINT
+
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
+  (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
+#define __SSE2__
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+// SSIM kernel: radius KERNEL around the centre pixel, KERNEL_SIZE taps per axis.
+enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
+
+// Symmetric Gaussian kernel:  K[i] = ~11 * exp(-0.3 * i * i)
+// The maximum value (11 x 11) must be less than 128 to avoid sign
+// problems during the calls to _mm_mullo_epi16().
+static const int K[KERNEL_SIZE] = {
+  1, 3, 7, 11, 7, 3, 1    // ~11 * exp(-0.3 * i * i); the taps sum to 33
+};
+static const double kiW[KERNEL + 1 + 1] = {  // reciprocal total kernel weights
+  1. / 1089.,   // 1 / sum(i:0..6, j:0..6) K[i]*K[j]  = 1 / (33 * 33)
+  1. / 1089.,   // 1 / sum(i:0..6, j:0..6) K[i]*K[j]  = 1 / (33 * 33)
+  1. / 1056.,   // 1 / sum(i:0..5, j:0..6) K[i]*K[j]  = 1 / (32 * 33)
+  1. / 957.,    // 1 / sum(i:0..4, j:0..6) K[i]*K[j]  = 1 / (29 * 33)
+  1. / 726.,    // 1 / sum(i:0..3, j:0..6) K[i]*K[j]  = 1 / (22 * 33)
+};
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
+
+#define PWEIGHT(A, B)  static_cast<uint16>(K[(A)] * K[(B)])   // weight product
+#define MAKE_WEIGHT(L)                                               \
+  { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3),  \
+        PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }
+
+// We need this union trick to be able to initialize constant static __m128i
+// values. We can't call _mm_set_epi16() for static compile-time initialization.
+static const struct {
+  union {
+    uint16 i16_[8];  // K[L] * K[c] for columns c = 0..6, then one zero lane
+    __m128i m_;      // the same eight 16-bit lanes viewed as an SSE2 register
+  } values_;
+} W0 = MAKE_WEIGHT(0),  // weights for kernel rows 0 and 6
+  W1 = MAKE_WEIGHT(1),  // weights for kernel rows 1 and 5
+  W2 = MAKE_WEIGHT(2),  // weights for kernel rows 2 and 4
+  W3 = MAKE_WEIGHT(3);  // weights for the centre row
+  // ... the rest is symmetric.
+#undef MAKE_WEIGHT
+#undef PWEIGHT
+#endif
+
+// Turns the weighted sums into SSIM; iw is 1 / (total weight of the taps used).
+static double FinalizeSSIM(double iw, double xm, double ym,
+                           double xxm, double xym, double yym) {
+  static const double kLumaC = (0.01 * 0.01) * (255 * 255);
+  static const double kContrastC = (0.03 * 0.03) * (255 * 255);
+  static const double kStructC = (0.015 * 0.015) * (255 * 255);
+  const double mean_x = xm * iw;
+  const double mean_y = ym * iw;
+  const double raw_var_x = xxm * iw - mean_x * mean_x;
+  const double raw_var_y = yym * iw - mean_y * mean_y;
+  // Rounding can leave a variance slightly below zero; clamp those to zero.
+  const double var_x = (raw_var_x < 0.) ? 0. : raw_var_x;
+  const double var_y = (raw_var_y < 0.) ? 0. : raw_var_y;
+  const double sigma_xy = sqrt(var_x * var_y);
+  const double covar = xym * iw - mean_x * mean_y;
+  const double luminance = (2. * mean_x * mean_y + kLumaC) /
+                           (mean_x * mean_x + mean_y * mean_y + kLumaC);
+  const double contrast = (2. * sigma_xy + kContrastC) / (var_x + var_y + kContrastC);
+  return luminance * contrast * ((covar + kStructC) / (sigma_xy + kStructC));
+}
+
+// GetSSIM() does clipping.  GetSSIMFullKernel() does not
+
+// TODO(skal): use summed tables?
+// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
+// with a diff of 255, squared. The maximum error is thus 0x4388241,
+// which fits into 32 bits integers.
+double GetSSIM(const uint8 *org, const uint8 *rec,
+               int xo, int yo, int W, int H, int stride) {
+  uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
+  // Walk the kernel window centred on (xo, yo). Taps that fall outside the
+  // W x H image are skipped, so ws holds only the weight actually used.
+  for (int ky = 0; ky < KERNEL_SIZE; ++ky) {
+    const int row = yo - KERNEL + ky;
+    if (row < 0 || row >= H) continue;
+    const uint8* const org_row = org + row * stride;
+    const uint8* const rec_row = rec + row * stride;
+    for (int kx = 0; kx < KERNEL_SIZE; ++kx) {
+      const int col = xo - KERNEL + kx;
+      if (col < 0 || col >= W) continue;
+      const int weight = K[ky] * K[kx];
+      const int org_px = org_row[col];
+      const int rec_px = rec_row[col];
+      ws  += weight;
+      xm  += weight * org_px;
+      ym  += weight * rec_px;
+      xxm += weight * org_px * org_px;
+      xym += weight * org_px * rec_px;
+      yym += weight * rec_px * rec_px;
+    }
+  }
+  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
+}
+
+double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
+                         int xo, int yo, int stride,
+                         double area_weight) {  // passed to FinalizeSSIM as iw
+  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;  // weighted sums, no clipping
+
+#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)
+
+  org += yo * stride + xo;  // point at the centre pixel (xo, yo)
+  rec += yo * stride + xo;
+  for (int y = 1; y <= KERNEL; y++) {
+    const int dy1 = y * stride;  // row offset used for org
+    const int dy2 = y * stride;  // row offset used for rec (same value)
+    const int Wy = K[KERNEL + y];
+
+    for (int x = 1; x <= KERNEL; x++) {
+      // Compute the contributions of upper-left (ul), upper-right (ur)
+      // lower-left (ll) and lower-right (lr) points (see the diagram below).
+      // Symmetric Kernel will have same weight on those points.
+      //       -  -  -  -  -  -  -
+      //       -  ul -  -  -  ur -
+      //       -  -  -  -  -  -  -
+      //       -  -  -  0  -  -  -
+      //       -  -  -  -  -  -  -
+      //       -  ll -  -  -  lr -
+      //       -  -  -  -  -  -  -
+      const int Wxy = Wy * K[KERNEL + x];
+      const int ul1 = org[-dy1 - x];
+      const int ur1 = org[-dy1 + x];
+      const int ll1 = org[dy1 - x];
+      const int lr1 = org[dy1 + x];
+
+      const int ul2 = rec[-dy2 - x];
+      const int ur2 = rec[-dy2 + x];
+      const int ll2 = rec[dy2 - x];
+      const int lr2 = rec[dy2 + x];
+
+      xm  += Wxy * (ul1 + ur1 + ll1 + lr1);
+      ym  += Wxy * (ul2 + ur2 + ll2 + lr2);
+      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
+      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
+      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
+    }
+
+    // Compute the contributions of up (u), down (d), left (l) and right (r)
+    // points across the main axes (see the diagram below).
+    // Symmetric Kernel will have same weight on those points.
+    //       -  -  -  -  -  -  -
+    //       -  -  -  u  -  -  -
+    //       -  -  -  -  -  -  -
+    //       -  l  -  0  -  r  -
+    //       -  -  -  -  -  -  -
+    //       -  -  -  d  -  -  -
+    //       -  -  -  -  -  -  -
+    const int Wxy = Wy * K[KERNEL];
+    const int u1 = org[-dy1];
+    const int d1 = org[dy1];
+    const int l1 = org[-y];  // y doubles as the horizontal distance here
+    const int r1 = org[y];
+
+    const int u2 = rec[-dy2];
+    const int d2 = rec[dy2];
+    const int l2 = rec[-y];
+    const int r2 = rec[y];
+
+    xm  += Wxy * (u1 + d1 + l1 + r1);
+    ym  += Wxy * (u2 + d2 + l2 + r2);
+    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
+    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
+    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
+  }
+
+  // Lastly the contribution of (x0, y0) point.
+  const int Wxy = K[KERNEL] * K[KERNEL];
+  const int s1 = org[0];
+  const int s2 = rec[0];
+
+  xm  += Wxy * s1;
+  ym  += Wxy * s2;
+  xxm += Wxy * s1 * s1;
+  xym += Wxy * s1 * s2;
+  yym += Wxy * s2 * s2;
+
+#else   // __SSE2__
+
+  org += (yo - KERNEL) * stride + (xo - KERNEL);  // top-left tap of the window
+  rec += (yo - KERNEL) * stride + (xo - KERNEL);
+
+  const __m128i zero = _mm_setzero_si128();
+  __m128i x = zero;
+  __m128i y = zero;
+  __m128i xx = zero;
+  __m128i xy = zero;
+  __m128i yy = zero;
+
+// Read 8 pixels at line #L, and convert to 16bit, perform weighting
+// and accumulate.
+#define LOAD_LINE_PAIR(L, WEIGHT) do {                                       \
+  const __m128i v0 =                                                         \
+      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
+  const __m128i v1 =                                                         \
+      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
+  const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                            \
+  const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                            \
+  const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);              \
+  const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);              \
+  x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                       \
+  y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                       \
+  x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                       \
+  y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                       \
+  xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                           \
+  xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                           \
+  yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                           \
+} while (0)
+
+#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do {                                \
+  uint32 tmp[4];                                                             \
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M));                    \
+  (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];                                 \
+} while (0)
+
+  LOAD_LINE_PAIR(0, W0);  // rows 0..6 of the window; weights mirror around 3
+  LOAD_LINE_PAIR(1, W1);
+  LOAD_LINE_PAIR(2, W2);
+  LOAD_LINE_PAIR(3, W3);
+  LOAD_LINE_PAIR(4, W2);
+  LOAD_LINE_PAIR(5, W1);
+  LOAD_LINE_PAIR(6, W0);
+
+  ADD_AND_STORE_FOUR_EPI32(x, xm);  // fold the four 32-bit lanes into one sum
+  ADD_AND_STORE_FOUR_EPI32(y, ym);
+  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
+  ADD_AND_STORE_FOUR_EPI32(xy, xym);
+  ADD_AND_STORE_FOUR_EPI32(yy, yym);
+
+#undef LOAD_LINE_PAIR
+#undef ADD_AND_STORE_FOUR_EPI32
+#endif
+
+  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
+}
+
+static int start_max(int x, int y) { return (y < x) ? x : y; }  // larger value
+
+// Returns the sum of per-pixel SSIM over a tightly packed width x height plane.
+double CalcSSIM(const uint8 *org, const uint8 *rec,
+                const int image_width, const int image_height) {
+  const int kScratchWidth = 8;  // pixels per row read by the SSE2 full kernel
+  double SSIM = 0.;
+  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
+  // Narrow planes would read before each row in the scratch copy: clip them.
+  const int KERNEL_X = (image_width < kScratchWidth) ? image_width : KERNEL;
+  const int start_x = start_max(image_width - kScratchWidth + KERNEL_X,
+                                KERNEL_X);
+  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
+  const int stride = image_width;
+  // Top rows overlap the upper edge: use the clipping GetSSIM().
+  for (int j = 0; j < KERNEL_Y; ++j) {
+    for (int i = 0; i < image_width; ++i) {
+      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
+    }
+  }
+#ifdef _OPENMP
+  #pragma omp parallel for reduction(+: SSIM)
+#endif
+  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
+    for (int i = 0; i < KERNEL_X; ++i) {
+      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
+    }
+    for (int i = KERNEL_X; i < start_x; ++i) {
+      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
+    }
+    if (start_x < image_width) {
+      // GetSSIMFullKernel() reads 8 pixels per row (in SSE2), so the 8
+      // rightmost pixels are copied into a zero-padded scratch area. The
+      // padding adds nothing to the sums, but each column needs its own
+      // normalizing constant kiW[k]. This avoids the slower GetSSIM().
+      // NOTE: we could use similar method for the left-most pixels too.
+      const int kScratchStride = kScratchWidth + KERNEL + 1;
+      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };
+      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };
+      for (int k = 0; k < KERNEL_SIZE; ++k) {
+        const int offset =
+            (j - KERNEL + k) * stride + image_width - kScratchWidth;
+        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
+        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
+      }
+      for (int k = 0;  k <= KERNEL_X + 1; ++k) {
+        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,
+                                  KERNEL + k, KERNEL, kScratchStride, kiW[k]);
+      }
+    }
+  }
+  // Bottom rows overlap the lower edge: clip again.
+  for (int j = start_y; j < image_height; ++j) {
+    for (int i = 0; i < image_width; ++i) {
+      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
+    }
+  }
+  return SSIM;
+}
+
+double CalcLSSIM(double ssim) {  // maps an SSIM value onto a log10 scale
+  return -10.0 * log10(1.0 - ssim);  // grows without bound as ssim nears 1
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
diff --git a/libs/libyuv/util/ssim.h b/libs/libyuv/util/ssim.h
new file mode 100644
index 0000000000..430eb71c3d
--- /dev/null
+++ b/libs/libyuv/util/ssim.h
@@ -0,0 +1,36 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
+
+#ifndef UTIL_SSIM_H_  // NOLINT
+#define UTIL_SSIM_H_
+
+#include <math.h>  // For log10()
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED)
+typedef unsigned char uint8;
+#define UINT8_TYPE_DEFINED
+#endif
+
+double CalcSSIM(const uint8* org, const uint8* rec,
+                const int image_width, const int image_height);
+
+double CalcLSSIM(double ssim);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // UTIL_SSIM_H_  // NOLINT
diff --git a/libs/libyuv/winarm.mk b/libs/libyuv/winarm.mk
new file mode 100644
index 0000000000..c4307a431f
--- /dev/null
+++ b/libs/libyuv/winarm.mk
@@ -0,0 +1,46 @@
+# This is a generic makefile for libyuv for Windows Arm.
+# call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat"
+# nmake /f winarm.mk
+# make -f winarm.mk
+# nmake /f winarm.mk clean
+# consider /arch:ARMv7VE
+CC=cl
+CCFLAGS=/Ox /nologo /Iinclude /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP
+AR=lib
+ARFLAGS=/MACHINE:ARM /NOLOGO /SUBSYSTEM:NATIVE
+RM=cmd /c del
+
+LOCAL_OBJ_FILES = \
+	source/compare.o\
+	source/compare_common.o\
+	source/convert.o\
+	source/convert_argb.o\
+	source/convert_from.o\
+	source/convert_from_argb.o\
+	source/convert_to_argb.o\
+	source/convert_to_i420.o\
+	source/cpu_id.o\
+	source/planar_functions.o\
+	source/rotate.o\
+	source/rotate_any.o\
+	source/rotate_argb.o\
+	source/rotate_common.o\
+	source/row_any.o\
+	source/row_common.o\
+	source/scale.o\
+	source/scale_any.o\
+	source/scale_argb.o\
+	source/scale_common.o\
+	source/video_common.o
+
+.cc.o:
+	$(CC) /c $(CCFLAGS) $*.cc /Fo$@
+
+all: libyuv_arm.lib winarm.mk
+
+libyuv_arm.lib: $(LOCAL_OBJ_FILES) winarm.mk
+	$(AR) $(ARFLAGS) /OUT:$@ $(LOCAL_OBJ_FILES)
+
+clean:
+	$(RM) "source\*.o" libyuv_arm.lib
+
diff --git a/libs/sofia-sip/libsofia-sip-ua/sdp/sdp_parse.c b/libs/sofia-sip/libsofia-sip-ua/sdp/sdp_parse.c
index d56bc9d23c..4a79bf3b36 100644
--- a/libs/sofia-sip/libsofia-sip-ua/sdp/sdp_parse.c
+++ b/libs/sofia-sip/libsofia-sip-ua/sdp/sdp_parse.c
@@ -921,13 +921,13 @@ static void parse_bandwidth(sdp_parser_t *p, char *r, sdp_bandwidth_t **result)
   }
 
   if (su_casematch(name, "CT"))
-    modifier = sdp_bw_ct, name = NULL;
+    modifier = sdp_bw_ct, name = "CT";
   else if (su_casematch(name, "TIAS") == 1)
-    modifier = sdp_bw_tias, name = NULL;
+    modifier = sdp_bw_tias, name = "TIAS";
   else if (su_casematch(name, "AS") == 1)
-    modifier = sdp_bw_as, name = NULL;
+    modifier = sdp_bw_as, name = "AS";
   else
-    modifier = sdp_bw_x;
+	modifier = sdp_bw_x, name = "BW-X";
 
   if (STRICT(p))
     PARSE_CHECK_REST(p, r, "b");
diff --git a/libs/spandsp/configure.ac b/libs/spandsp/configure.ac
index 16d6d247a4..4ec8952558 100644
--- a/libs/spandsp/configure.ac
+++ b/libs/spandsp/configure.ac
@@ -37,6 +37,7 @@ m4_include(m4/ax_c99_features.m4)
 m4_include(m4/ax_check_export_capability.m4)
 m4_include(m4/ax_check_arm_neon.m4)
 m4_include(m4/ax_func_aligned_alloc.m4)
+m4_include(m4/memmove.m4)
 
 AC_CONFIG_SRCDIR([src/tone_generate.c])
 AC_CONFIG_AUX_DIR([config])
@@ -152,6 +153,7 @@ AC_FUNC_ERROR_AT_LINE
 AC_FUNC_VPRINTF
 AC_FUNC_MEMCMP
 AC_FUNC_MEMMOVE
+
 if test "${ax_cv_c_compiler_vendor}" = "gnu"
   then
     AC_FUNC_REALLOC
diff --git a/libs/spandsp/m4/memmove.m4 b/libs/spandsp/m4/memmove.m4
new file mode 100644
index 0000000000..f2301a07db
--- /dev/null
+++ b/libs/spandsp/m4/memmove.m4
@@ -0,0 +1,27 @@
+AC_DEFUN([AC_FUNC_MEMMOVE],
+[AC_CHECK_FUNCS(memmove)
+AC_MSG_CHECKING(for working memmove)
+AC_CACHE_VAL(ac_cv_have_working_memmove,
+[AC_TRY_RUN(
+[#include <stdlib.h>
+#include <string.h>
+int main(void)
+{
+    char buf[10];
+    strcpy (buf, "01234567");
+    memmove (buf, buf + 2, 3);      /* overlapping regions, destination below source */
+    if (strcmp (buf, "23434567"))
+        exit (1);
+    strcpy (buf, "01234567");
+    memmove (buf + 2, buf, 3);      /* overlapping regions, destination above source */
+    if (strcmp (buf, "01012567"))
+        exit (1);
+    exit (0);
+}], ac_cv_have_working_memmove=yes, ac_cv_have_working_memmove=no, ac_cv_have_working_memmove=cross)])
+AC_MSG_RESULT([$ac_cv_have_working_memmove])
+if test x$ac_cv_have_working_memmove != "xyes"; then
+  AC_LIBOBJ(memmove)
+  AC_MSG_WARN([Replacing missing/broken memmove.])
+  AC_DEFINE(PREFER_PORTABLE_MEMMOVE, 1, "enable replacement memmove if system memmove is broken or missing")
+fi])
+
diff --git a/libs/spandsp/spandsp/fax-tests.xml b/libs/spandsp/spandsp/fax-tests.xml
index 2f52468721..8c03d0e22f 100644
--- a/libs/spandsp/spandsp/fax-tests.xml
+++ b/libs/spandsp/spandsp/fax-tests.xml
@@ -589,5 +589,131 @@
         <step dir="R" type="CLEAR" timein="800" timeout="1200"/>
         <step type="STATUS" value="OK"/>
     </test>
+    <test name="Invalid-signalling-rate">
+        <step type="CALL"/>
+
+        <!--<step dir="T" type="CNG"/>-->
+
+        <step dir="R" type="CED"/>
+        <!-- Invalid signalling rate in DCS -->
+        <step dir="R" type="HDLC" modem="V.21" tag="DIS" value="FF C8 01 ..." timeout="60000"/>
+        <step dir="R" type="SILENCE"/>
+
+        <step type="WAIT" value="75"/>
+        <step dir="T" type="PREAMBLE" modem="V.21"/>
+        <step dir="T" type="HDLC" tag="DCS" value="FF C8 C1 00 49 1f 00"/>
+        <step dir="T" type="POSTAMBLE"/>
+
+        <step dir="R" type="HDLC" modem="V.21" tag="DCN" value="FF C8 5F"/>
+        <step dir="R" type="SILENCE"/>
+
+        <step dir="R" type="CLEAR" timein="800" timeout="1200"/>
+        <step type="STATUS" value="OK"/>
+    </test>
+    <test name="No-caller-response">
+        <step type="CALL"/>
+        <step dir="T" type="SET" tag="IDENT" value="+0123456789"/>
+
+        <!--<step dir="T" type="CNG"/>-->
+
+        <step dir="R" type="CED"/>
+        <!-- -->
+        <step dir="R" type="HDLC" modem="V.21" tag="DIS" value="FF C8 01 ..." timeout="60000"/>
+        <step dir="R" type="SILENCE"/>
+
+        <repeat min="5" max="5">
+            <step dir="R" type="HDLC" modem="V.21" tag="DIS" value="FF C8 01 ..."/>
+            <step dir="R" type="SILENCE"/>
+        </repeat>
+
+        <step dir="R" type="HDLC" modem="V.21" tag="DCN" value="FF C8 5F"/>
+        <step dir="R" type="SILENCE"/>
+
+        <step dir="R" type="CLEAR" timein="800" timeout="1200"/>
+        <step type="STATUS" value="T1_EXPIRED"/>
+    </test>
+    <test name="No-answerer-response">
+        <step type="ANSWER" value="etsi_300_242_a4_impress_white.tif"/>
+        <step dir="T" type="SET" tag="IDENT" value="+0123456789"/>
+
+        <step dir="R" type="CNG"/>
+
+        <step type="WAIT" value="75"/>
+
+        <step dir="R" type="CLEAR" timein="55000" timeout="65000"/>
+        <step type="STATUS" value="T0_EXPIRED"/>
+    </test>
+    <test name="T1-after-EOM">
+        <!-- After an EOM exchange the far end no longer responds. -->
+        <step type="CALL" value="etsi_300_242_a4_white.tif"/>
+
+        <step dir="R" type="CED"/>
+        <step dir="R" type="HDLC" modem="V.21" tag="DIS" value="FF C8 01 ..." timeout="60000"/>
+        <step dir="R" type="SILENCE"/>
+
+        <step type="WAIT" value="75"/>
+        <step dir="T" type="PREAMBLE" modem="V.21"/>
+        <step dir="T" type="HDLC" tag="DCS" value="FF C8 41 00 50 00"/>
+        <step dir="T" type="POSTAMBLE"/>
+        <step type="WAIT" value="75"/>
+        <step dir="T" type="TCF" modem="V.27ter/4800" value="900"/>
+
+        <step dir="R" type="HDLC" modem="V.21" tag="CFR" value="FF C8 21"/>
+        <step dir="R" type="SILENCE"/>
+
+        <step type="WAIT" value="75"/>
+        <step dir="T" type="MSG" modem="V.27ter/4800" value="etsi_300_242_a4_white.tif"/>
+        <step type="WAIT" value="75"/>
+        <step dir="T" type="PREAMBLE" modem="V.21"/>
+        <step dir="T" type="HDLC" tag="EOM" value="FF C8 71"/>
+        <step dir="T" type="POSTAMBLE"/>
+
+        <step dir="R" type="HDLC" modem="V.21" tag="MCF" value="FF C8 31"/>
+        <step dir="R" type="SILENCE"/>
+
+        <step dir="R" type="HDLC" modem="V.21" tag="DIS" value="FF C8 01 ..." timeout="60000"/>
+        <step dir="R" type="SILENCE"/>
+
+        <repeat min="5" max="5">
+            <step dir="R" type="HDLC" modem="V.21" tag="DIS" value="FF C8 01 ..."/>
+            <step dir="R" type="SILENCE"/>
+        </repeat>
+
+        <step dir="R" type="HDLC" modem="V.21" tag="DCN" value="FF C8 5F"/>
+        <step dir="R" type="SILENCE"/>
+
+        <step dir="R" type="CLEAR" timein="800" timeout="1200"/>
+        <step type="STATUS" value="T1_EXPIRED"/>
+    </test>
+    <test name="T1-after-EOMx">
+        <!-- After an EOM exchange the far end no longer responds. -->
+        <step type="ANSWER" value="etsi_300_242_a4_white.tif"/>
+
+        <step dir="R" type="CNG"/>
+
+        <step dir="T" type="CED"/>
+        <step type="WAIT" value="75"/>
+        <step dir="T" type="PREAMBLE" modem="V.21"/>
+        <step dir="T" type="HDLC" tag="DIS" value="FF C8 01 00 50 00"/>
+        <step dir="T" type="POSTAMBLE"/>
+
+        <step dir="R" type="HDLC" modem="V.21" tag="DCS+" value="FF C8 C1 00 ..."/>
+        <step dir="R" type="TCF" modem="V.27ter/4800" timeout="10000"/>
+
+        <step type="WAIT" value="75"/>
+        <step dir="T" type="PREAMBLE" modem="V.21"/>
+        <step dir="T" type="HDLC" tag="CFR" value="FF C8 21"/>
+        <step dir="T" type="POSTAMBLE"/>
+
+        <step dir="T" type="SET" tag="TXFILE" value="etsi_300_242_a4_white.tif"/>
+
+        <step dir="R" type="MSG" modem="V.27ter/4800" timeout="60000"/>
+        <step dir="R" type="HDLC" modem="V.21" tag="EOM" value="FF C8 F1"/>
+
+        <step type="WAIT" value="75"/>
+
+        <step dir="R" type="CLEAR" timein="55000" timeout="65000"/>
+        <step type="STATUS" value="T0_EXPIRED"/>
+    </test>
 </test-group>
 </fax-tests>
diff --git a/libs/spandsp/src/t30.c b/libs/spandsp/src/t30.c
index f13d4eb879..7bb81ddec1 100644
--- a/libs/spandsp/src/t30.c
+++ b/libs/spandsp/src/t30.c
@@ -439,9 +439,9 @@ static void start_final_pause(t30_state_t *s);
 static void decode_20digit_msg(t30_state_t *s, char *msg, const uint8_t *pkt, int len);
 static void decode_url_msg(t30_state_t *s, char *msg, const uint8_t *pkt, int len);
 static int decode_nsf_nss_nsc(t30_state_t *s, uint8_t *msg[], const uint8_t *pkt, int len);
-static void set_min_scan_time(t30_state_t *s);
 static int send_cfr_sequence(t30_state_t *s, int start);
 static int build_dcs(t30_state_t *s);
+static void set_min_scan_time(t30_state_t *s);
 static void timer_t2_start(t30_state_t *s);
 static void timer_t2_flagged_start(t30_state_t *s);
 static void timer_t2_dropped_start(t30_state_t *s);
@@ -2521,6 +2521,9 @@ static void send_dcn(t30_state_t *s)
 static void return_to_phase_b(t30_state_t *s, int with_fallback)
 {
     /* This is what we do after things like T30_EOM is exchanged. */
+    span_log(&s->logging, SPAN_LOG_PROTOCOL_WARNING, "Returning to phase B\n");
+    /* Run the T1 timer, like we do on first detecting the far end. */
+    s->timer_t0_t1 = ms_to_samples(DEFAULT_TIMER_T1);
     set_state(s, (s->calling_party)  ?  T30_STATE_T  :  T30_STATE_R);
 }
 /*- End of function --------------------------------------------------------*/
@@ -2712,6 +2715,7 @@ static int send_cfr_sequence(t30_state_t *s, int start)
         s->step++;
         if (send_csa_frame(s))
             break;
+        /*endif*/
         /* Fall through */
     case 1:
         s->step++;
@@ -3142,6 +3146,58 @@ static int process_rx_dcs(t30_state_t *s, const uint8_t *msg, int len)
 }
 /*- End of function --------------------------------------------------------*/
 
+static void assess_copy_quality(t30_state_t *s, uint8_t fcf)
+{
+    int quality;
+    
+    quality = copy_quality(s);
+    switch (quality)
+    {
+    case T30_COPY_QUALITY_PERFECT:
+    case T30_COPY_QUALITY_GOOD:
+        rx_end_page(s);
+        break;
+    case T30_COPY_QUALITY_POOR:
+        rx_end_page(s);
+        break;
+    case T30_COPY_QUALITY_BAD:
+        /* Some people want to keep even the bad pages */
+        if (s->keep_bad_pages)
+            rx_end_page(s);
+        /*endif*/
+        break;
+    }
+    /*endswitch*/
+
+    if (s->phase_d_handler)
+        s->phase_d_handler(s->phase_d_user_data, fcf);
+    /*endif*/
+    if (fcf == T30_EOP)
+        terminate_operation_in_progress(s);
+    else
+        rx_start_page(s);
+    /*endif*/
+
+    switch (quality)
+    {
+    case T30_COPY_QUALITY_PERFECT:
+    case T30_COPY_QUALITY_GOOD:
+        s->last_rx_page_result = T30_MCF;
+        break;
+    case T30_COPY_QUALITY_POOR:
+        s->last_rx_page_result = T30_RTP;
+        break;
+    case T30_COPY_QUALITY_BAD:
+    default:
+        s->last_rx_page_result = T30_RTN;
+        break;
+    }
+    /*endswitch*/
+    set_state(s, T30_STATE_III_Q);
+    send_simple_frame(s, s->last_rx_page_result);
+}
+/*- End of function --------------------------------------------------------*/
+
 static int send_response_to_pps(t30_state_t *s)
 {
     queue_phase(s, T30_PHASE_D_TX);
@@ -3289,6 +3345,7 @@ static int process_rx_pps(t30_state_t *s, const uint8_t *msg, int len)
                         /* Use the length of the first frame as our model for what the length should be */
                         if (s->ecm_len[frame_no] == 64)
                             expected_len = 64;
+                        /*endif*/
                         first = false;
                     }
                     /*endif*/
@@ -4082,58 +4139,6 @@ static void process_state_f_doc_non_ecm(t30_state_t *s, const uint8_t *msg, int
 }
 /*- End of function --------------------------------------------------------*/
 
-static void assess_copy_quality(t30_state_t *s, uint8_t fcf)
-{
-    int quality;
-    
-    quality = copy_quality(s);
-    switch (quality)
-    {
-    case T30_COPY_QUALITY_PERFECT:
-    case T30_COPY_QUALITY_GOOD:
-        rx_end_page(s);
-        break;
-    case T30_COPY_QUALITY_POOR:
-        rx_end_page(s);
-        break;
-    case T30_COPY_QUALITY_BAD:
-        /* Some people want to keep even the bad pages */
-        if (s->keep_bad_pages)
-            rx_end_page(s);
-        /*endif*/
-        break;
-    }
-    /*endswitch*/
-
-    if (s->phase_d_handler)
-        s->phase_d_handler(s->phase_d_user_data, fcf);
-    /*endif*/
-    if (fcf == T30_EOP)
-        terminate_operation_in_progress(s);
-    else
-        rx_start_page(s);
-    /*endif*/
-
-    switch (quality)
-    {
-    case T30_COPY_QUALITY_PERFECT:
-    case T30_COPY_QUALITY_GOOD:
-        s->last_rx_page_result = T30_MCF;
-        break;
-    case T30_COPY_QUALITY_POOR:
-        s->last_rx_page_result = T30_RTP;
-        break;
-    case T30_COPY_QUALITY_BAD:
-    default:
-        s->last_rx_page_result = T30_RTN;
-        break;
-    }
-    /*endswitch*/
-    set_state(s, T30_STATE_III_Q);
-    send_simple_frame(s, s->last_rx_page_result);
-}
-/*- End of function --------------------------------------------------------*/
-
 static void process_state_f_post_doc_non_ecm(t30_state_t *s, const uint8_t *msg, int len)
 {
     uint8_t fcf;
@@ -4724,6 +4729,7 @@ static void process_state_ii_q(t30_state_t *s, const uint8_t *msg, int len)
             t30_set_status(s, T30_ERR_TX_BADPG);
             break;
         }
+        /*endswitch*/
         terminate_call(s);
         break;
     case T30_CRP:
@@ -5721,6 +5727,7 @@ static void set_phase(t30_state_t *s, int phase)
     case T30_PHASE_D_TX:
         if (!s->far_end_detected  &&  s->timer_t0_t1 > 0)
         {
+            /* Switch from T0 to T1 */
             s->timer_t0_t1 = ms_to_samples(DEFAULT_TIMER_T1);
             s->far_end_detected = true;
         }
@@ -5822,7 +5829,10 @@ static void set_state(t30_state_t *s, int state)
 static void repeat_last_command(t30_state_t *s)
 {
     s->step = 0;
-    if (++s->retries >= MAX_COMMAND_TRIES)
+    /* If T0 or T1 are in progress we do not want to apply a limit to the maximum number of retries. We
+       let T0 or T1 terminate things if the far end doesn't communicate. */
+    s->retries++;
+    if (s->timer_t0_t1 == 0  &&  s->retries >= MAX_COMMAND_TRIES)
     {
         span_log(&s->logging, SPAN_LOG_FLOW, "Too many retries. Giving up.\n");
         switch (s->state)
@@ -6083,6 +6093,8 @@ static void timer_t2_expired(t30_state_t *s)
             /* We didn't receive a response to our T30_MCF after T30_EOM, so we must be OK
                to proceed to phase B, and pretty much act like its the beginning of a call. */
             span_log(&s->logging, SPAN_LOG_FLOW, "Returning to phase B after %s\n", t30_frametype(s->next_rx_step));
+            /* Run the T1 timer, like we do on first detecting the far end. */
+            s->timer_t0_t1 = ms_to_samples(DEFAULT_TIMER_T1);
             s->dis_received = false;
             set_phase(s, T30_PHASE_B_TX);
             timer_t2_start(s);
@@ -6667,6 +6679,7 @@ static void t30_hdlc_rx_status(void *user_data, int status)
     case SIG_STATUS_FRAMING_OK:
         if (!s->far_end_detected  &&  s->timer_t0_t1 > 0)
         {
+            /* Switch from T0 to T1 */
             s->timer_t0_t1 = ms_to_samples(DEFAULT_TIMER_T1);
             s->far_end_detected = true;
             if (s->phase == T30_PHASE_A_CED  ||  s->phase == T30_PHASE_A_CNG)
diff --git a/libs/spandsp/src/t30_api.c b/libs/spandsp/src/t30_api.c
index 724dcf9735..23c2d49375 100644
--- a/libs/spandsp/src/t30_api.c
+++ b/libs/spandsp/src/t30_api.c
@@ -120,26 +120,34 @@ SPAN_DECLARE(int) t33_sub_address_extract_field(uint8_t num[21], const uint8_t t
             {
                 type = T33_SST;
             }
+            /*endif*/
             while (t33[i])
             {
                 ch = t33[i++];
                 if (ch == '#')
                     break;
+                /*endif*/
                 num[j++] = ch;
                 if (j >= 20)
                     return -1;
+                /*endif*/
             }
+            /*endwhile*/
             num[j] = '\0';
             return type;
         }
+        /*endif*/
         /* Skip this field */
         i++;
         while (t33[i])
         {
             if (t33[i++] == '#')
                 break;
+            /*endif*/
         }
+        /*endwhile*/
     }
+    /*endfor*/
     return T33_NONE;
 }
 /*- End of function --------------------------------------------------------*/
@@ -148,8 +156,10 @@ SPAN_DECLARE(void) t33_sub_address_add_field(uint8_t t33[], const uint8_t field[
 {
     if (t33[0] != '\0')
         strcat((char *) t33, "#");
+    /*endif*/
     if (type == T33_SST)
         strcat((char *) t33, "#");
+    /*endif*/
     strcat((char *) t33, (const char *) field);
 }
 /*- End of function --------------------------------------------------------*/
@@ -161,8 +171,10 @@ SPAN_DECLARE(int) t30_set_tx_ident(t30_state_t *s, const char *id)
         s->tx_info.ident[0] = '\0';
         return 0;
     }
+    /*endif*/
     if (strlen(id) > T30_MAX_IDENT_LEN)
         return -1;
+    /*endif*/
     strcpy(s->tx_info.ident, id);
     t4_tx_set_local_ident(&s->t4.tx, s->tx_info.ident);
     return 0;
@@ -173,6 +185,7 @@ SPAN_DECLARE(const char *) t30_get_tx_ident(t30_state_t *s)
 {
     if (s->tx_info.ident[0] == '\0')
         return NULL;
+    /*endif*/
     return s->tx_info.ident;
 }
 /*- End of function --------------------------------------------------------*/
@@ -181,6 +194,7 @@ SPAN_DECLARE(const char *) t30_get_rx_ident(t30_state_t *s)
 {
     if (s->rx_info.ident[0] == '\0')
         return NULL;
+    /*endif*/
     return s->rx_info.ident;
 }
 /*- End of function --------------------------------------------------------*/
@@ -192,8 +206,10 @@ SPAN_DECLARE(int) t30_set_tx_sub_address(t30_state_t *s, const char *sub_address
         s->tx_info.sub_address[0] = '\0';
         return 0;
     }
+    /*endif*/
     if (strlen(sub_address) > T30_MAX_IDENT_LEN)
         return -1;
+    /*endif*/
     strcpy(s->tx_info.sub_address, sub_address);
     return 0;
 }
@@ -203,6 +219,7 @@ SPAN_DECLARE(const char *) t30_get_tx_sub_address(t30_state_t *s)
 {
     if (s->tx_info.sub_address[0] == '\0')
         return NULL;
+    /*endif*/
     return s->tx_info.sub_address;
 }
 /*- End of function --------------------------------------------------------*/
@@ -211,6 +228,7 @@ SPAN_DECLARE(const char *) t30_get_rx_sub_address(t30_state_t *s)
 {
     if (s->rx_info.sub_address[0] == '\0')
         return NULL;
+    /*endif*/
     return s->rx_info.sub_address;
 }
 /*- End of function --------------------------------------------------------*/
@@ -222,8 +240,10 @@ SPAN_DECLARE(int) t30_set_tx_selective_polling_address(t30_state_t *s, const cha
         s->tx_info.selective_polling_address[0] = '\0';
         return 0;
     }
+    /*endif*/
     if (strlen(selective_polling_address) > T30_MAX_IDENT_LEN)
         return -1;
+    /*endif*/
     strcpy(s->tx_info.selective_polling_address, selective_polling_address);
     return 0;
 }
@@ -233,6 +253,7 @@ SPAN_DECLARE(const char *) t30_get_tx_selective_polling_address(t30_state_t *s)
 {
     if (s->tx_info.selective_polling_address[0] == '\0')
         return NULL;
+    /*endif*/
     return s->tx_info.selective_polling_address;
 }
 /*- End of function --------------------------------------------------------*/
@@ -241,6 +262,7 @@ SPAN_DECLARE(const char *) t30_get_rx_selective_polling_address(t30_state_t *s)
 {
     if (s->rx_info.selective_polling_address[0] == '\0')
         return NULL;
+    /*endif*/
     return s->rx_info.selective_polling_address;
 }
 /*- End of function --------------------------------------------------------*/
@@ -252,8 +274,10 @@ SPAN_DECLARE(int) t30_set_tx_polled_sub_address(t30_state_t *s, const char *poll
         s->tx_info.polled_sub_address[0] = '\0';
         return 0;
     }
+    /*endif*/
     if (strlen(polled_sub_address) > T30_MAX_IDENT_LEN)
         return -1;
+    /*endif*/
     strcpy(s->tx_info.polled_sub_address, polled_sub_address);
     return 0;
 }
@@ -263,6 +287,7 @@ SPAN_DECLARE(const char *) t30_get_tx_polled_sub_address(t30_state_t *s)
 {
     if (s->tx_info.polled_sub_address[0] == '\0')
         return NULL;
+    /*endif*/
     return s->tx_info.polled_sub_address;
 }
 /*- End of function --------------------------------------------------------*/
@@ -271,6 +296,7 @@ SPAN_DECLARE(const char *) t30_get_rx_polled_sub_address(t30_state_t *s)
 {
     if (s->rx_info.polled_sub_address[0] == '\0')
         return NULL;
+    /*endif*/
     return s->rx_info.polled_sub_address;
 }
 /*- End of function --------------------------------------------------------*/
@@ -282,8 +308,10 @@ SPAN_DECLARE(int) t30_set_tx_sender_ident(t30_state_t *s, const char *sender_ide
         s->tx_info.sender_ident[0] = '\0';
         return 0;
     }
+    /*endif*/
     if (strlen(sender_ident) > T30_MAX_IDENT_LEN)
         return -1;
+    /*endif*/
     strcpy(s->tx_info.sender_ident, sender_ident);
     return 0;
 }
@@ -293,6 +321,7 @@ SPAN_DECLARE(const char *) t30_get_tx_sender_ident(t30_state_t *s)
 {
     if (s->tx_info.sender_ident[0] == '\0')
         return NULL;
+    /*endif*/
     return s->tx_info.sender_ident;
 }
 /*- End of function --------------------------------------------------------*/
@@ -301,6 +330,7 @@ SPAN_DECLARE(const char *) t30_get_rx_sender_ident(t30_state_t *s)
 {
     if (s->rx_info.sender_ident[0] == '\0')
         return NULL;
+    /*endif*/
     return s->rx_info.sender_ident;
 }
 /*- End of function --------------------------------------------------------*/
@@ -312,8 +342,10 @@ SPAN_DECLARE(int) t30_set_tx_password(t30_state_t *s, const char *password)
         s->tx_info.password[0] = '\0';
         return 0;
     }
+    /*endif*/
     if (strlen(password) > T30_MAX_IDENT_LEN)
         return -1;
+    /*endif*/
     strcpy(s->tx_info.password, password);
     return 0;
 }
@@ -323,6 +355,7 @@ SPAN_DECLARE(const char *) t30_get_tx_password(t30_state_t *s)
 {
     if (s->tx_info.password[0] == '\0')
         return NULL;
+    /*endif*/
     return s->tx_info.password;
 }
 /*- End of function --------------------------------------------------------*/
@@ -331,6 +364,7 @@ SPAN_DECLARE(const char *) t30_get_rx_password(t30_state_t *s)
 {
     if (s->rx_info.password[0] == '\0')
         return NULL;
+    /*endif*/
     return s->rx_info.password;
 }
 /*- End of function --------------------------------------------------------*/
@@ -339,6 +373,7 @@ SPAN_DECLARE(int) t30_set_tx_nsf(t30_state_t *s, const uint8_t *nsf, int len)
 {
     if (s->tx_info.nsf)
         span_free(s->tx_info.nsf);
+    /*endif*/
     if (nsf  &&  len > 0  &&  (s->tx_info.nsf = span_alloc(len + 3)))
     {
         memcpy(&s->tx_info.nsf[3], nsf, len);
@@ -349,6 +384,7 @@ SPAN_DECLARE(int) t30_set_tx_nsf(t30_state_t *s, const uint8_t *nsf, int len)
         s->tx_info.nsf = NULL;
         s->tx_info.nsf_len = 0;
     }
+    /*endif*/
     return 0;
 }
 /*- End of function --------------------------------------------------------*/
@@ -357,6 +393,7 @@ SPAN_DECLARE(size_t) t30_get_tx_nsf(t30_state_t *s, const uint8_t *nsf[])
 {
     if (nsf)
         *nsf = s->tx_info.nsf;
+    /*endif*/
     return s->tx_info.nsf_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -365,6 +402,7 @@ SPAN_DECLARE(size_t) t30_get_rx_nsf(t30_state_t *s, const uint8_t *nsf[])
 {
     if (nsf)
         *nsf = s->rx_info.nsf;
+    /*endif*/
     return s->rx_info.nsf_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -373,6 +411,7 @@ SPAN_DECLARE(int) t30_set_tx_nsc(t30_state_t *s, const uint8_t *nsc, int len)
 {
     if (s->tx_info.nsc)
         span_free(s->tx_info.nsc);
+    /*endif*/
     if (nsc  &&  len > 0  &&  (s->tx_info.nsc = span_alloc(len + 3)))
     {
         memcpy(&s->tx_info.nsc[3], nsc, len);
@@ -383,6 +422,7 @@ SPAN_DECLARE(int) t30_set_tx_nsc(t30_state_t *s, const uint8_t *nsc, int len)
         s->tx_info.nsc = NULL;
         s->tx_info.nsc_len = 0;
     }
+    /*endif*/
     return 0;
 }
 /*- End of function --------------------------------------------------------*/
@@ -391,6 +431,7 @@ SPAN_DECLARE(size_t) t30_get_tx_nsc(t30_state_t *s, const uint8_t *nsc[])
 {
     if (nsc)
         *nsc = s->tx_info.nsc;
+    /*endif*/
     return s->tx_info.nsc_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -399,6 +440,7 @@ SPAN_DECLARE(size_t) t30_get_rx_nsc(t30_state_t *s, const uint8_t *nsc[])
 {
     if (nsc)
         *nsc = s->rx_info.nsc;
+    /*endif*/
     return s->rx_info.nsc_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -407,6 +449,7 @@ SPAN_DECLARE(int) t30_set_tx_nss(t30_state_t *s, const uint8_t *nss, int len)
 {
     if (s->tx_info.nss)
         span_free(s->tx_info.nss);
+    /*endif*/
     if (nss  &&  len > 0  &&  (s->tx_info.nss = span_alloc(len + 3)))
     {
         memcpy(&s->tx_info.nss[3], nss, len);
@@ -417,6 +460,7 @@ SPAN_DECLARE(int) t30_set_tx_nss(t30_state_t *s, const uint8_t *nss, int len)
         s->tx_info.nss = NULL;
         s->tx_info.nss_len = 0;
     }
+    /*endif*/
     return 0;
 }
 /*- End of function --------------------------------------------------------*/
@@ -425,6 +469,7 @@ SPAN_DECLARE(size_t) t30_get_tx_nss(t30_state_t *s, const uint8_t *nss[])
 {
     if (nss)
         *nss = s->tx_info.nss;
+    /*endif*/
     return s->tx_info.nss_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -433,6 +478,7 @@ SPAN_DECLARE(size_t) t30_get_rx_nss(t30_state_t *s, const uint8_t *nss[])
 {
     if (nss)
         *nss = s->rx_info.nss;
+    /*endif*/
     return s->rx_info.nss_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -441,20 +487,24 @@ SPAN_DECLARE(int) t30_set_tx_tsa(t30_state_t *s, int type, const char *address,
 {
     if (s->tx_info.tsa)
         span_free(s->tx_info.tsa);
+    /*endif*/
     if (address == NULL  ||  len == 0)
     {
         s->tx_info.tsa = NULL;
         s->tx_info.tsa_len = 0;
         return 0;
     }
+    /*endif*/
     s->tx_info.tsa_type = type;
     if (len < 0)
         len = strlen(address);
+    /*endif*/
     if ((s->tx_info.tsa = span_alloc(len)))
     {
         memcpy(s->tx_info.tsa, address, len);
         s->tx_info.tsa_len = len;
     }
+    /*endif*/
     return 0;
 }
 /*- End of function --------------------------------------------------------*/
@@ -463,8 +513,10 @@ SPAN_DECLARE(size_t) t30_get_tx_tsa(t30_state_t *s, int *type, const char *addre
 {
     if (type)
         *type = s->tx_info.tsa_type;
+    /*endif*/
     if (address)
         *address = s->tx_info.tsa;
+    /*endif*/
     return s->tx_info.tsa_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -473,8 +525,10 @@ SPAN_DECLARE(size_t) t30_get_rx_tsa(t30_state_t *s, int *type, const char *addre
 {
     if (type)
         *type = s->rx_info.tsa_type;
+    /*endif*/
     if (address)
         *address = s->rx_info.tsa;
+    /*endif*/
     return s->rx_info.tsa_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -483,11 +537,13 @@ SPAN_DECLARE(int) t30_set_tx_ira(t30_state_t *s, int type, const char *address,
 {
     if (s->tx_info.ira)
         span_free(s->tx_info.ira);
+    /*endif*/
     if (address == NULL)
     {
         s->tx_info.ira = NULL;
         return 0;
     }
+    /*endif*/
     s->tx_info.ira = strdup(address);
     return 0;
 }
@@ -497,8 +553,10 @@ SPAN_DECLARE(size_t) t30_get_tx_ira(t30_state_t *s, int *type, const char *addre
 {
     if (type)
         *type = s->tx_info.ira_type;
+    /*endif*/
     if (address)
         *address = s->tx_info.ira;
+    /*endif*/
     return s->tx_info.ira_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -507,8 +565,10 @@ SPAN_DECLARE(size_t) t30_get_rx_ira(t30_state_t *s, int *type, const char *addre
 {
     if (type)
         *type = s->rx_info.ira_type;
+    /*endif*/
     if (address)
         *address = s->rx_info.ira;
+    /*endif*/
     return s->rx_info.ira_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -517,11 +577,13 @@ SPAN_DECLARE(int) t30_set_tx_cia(t30_state_t *s, int type, const char *address,
 {
     if (s->tx_info.cia)
         span_free(s->tx_info.cia);
+    /*endif*/
     if (address == NULL)
     {
         s->tx_info.cia = NULL;
         return 0;
     }
+    /*endif*/
     s->tx_info.cia = strdup(address);
     return 0;
 }
@@ -531,8 +593,10 @@ SPAN_DECLARE(size_t) t30_get_tx_cia(t30_state_t *s, int *type, const char *addre
 {
     if (type)
         *type = s->tx_info.cia_type;
+    /*endif*/
     if (address)
         *address = s->tx_info.cia;
+    /*endif*/
     return s->tx_info.cia_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -541,8 +605,10 @@ SPAN_DECLARE(size_t) t30_get_rx_cia(t30_state_t *s, int *type, const char *addre
 {
     if (type)
         *type = s->rx_info.cia_type;
+    /*endif*/
     if (address)
         *address = s->rx_info.cia;
+    /*endif*/
     return s->rx_info.cia_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -551,11 +617,13 @@ SPAN_DECLARE(int) t30_set_tx_isp(t30_state_t *s, int type, const char *address,
 {
     if (s->tx_info.isp)
         span_free(s->tx_info.isp);
+    /*endif*/
     if (address == NULL)
     {
         s->tx_info.isp = NULL;
         return 0;
     }
+    /*endif*/
     s->tx_info.isp = strdup(address);
     return 0;
 }
@@ -565,8 +633,10 @@ SPAN_DECLARE(size_t) t30_get_tx_isp(t30_state_t *s, int *type, const char *addre
 {
     if (type)
         *type = s->tx_info.isp_type;
+    /*endif*/
     if (address)
         *address = s->tx_info.isp;
+    /*endif*/
     return s->tx_info.isp_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -575,8 +645,10 @@ SPAN_DECLARE(size_t) t30_get_rx_isp(t30_state_t *s, int *type, const char *addre
 {
     if (type)
         *type = s->rx_info.isp_type;
+    /*endif*/
     if (address)
         *address = s->rx_info.isp;
+    /*endif*/
     return s->rx_info.isp_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -585,11 +657,13 @@ SPAN_DECLARE(int) t30_set_tx_csa(t30_state_t *s, int type, const char *address,
 {
     if (s->tx_info.csa)
         span_free(s->tx_info.csa);
+    /*endif*/
     if (address == NULL)
     {
         s->tx_info.csa = NULL;
         return 0;
     }
+    /*endif*/
     s->tx_info.csa = strdup(address);
     return 0;
 }
@@ -599,8 +673,10 @@ SPAN_DECLARE(size_t) t30_get_tx_csa(t30_state_t *s, int *type, const char *addre
 {
     if (type)
         *type = s->tx_info.csa_type;
+    /*endif*/
     if (address)
         *address = s->tx_info.csa;
+    /*endif*/
     return s->tx_info.csa_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -609,8 +685,10 @@ SPAN_DECLARE(size_t) t30_get_rx_csa(t30_state_t *s, int *type, const char *addre
 {
     if (type)
         *type = s->rx_info.csa_type;
+    /*endif*/
     if (address)
         *address = s->rx_info.csa;
+    /*endif*/
     return s->rx_info.csa_len;
 }
 /*- End of function --------------------------------------------------------*/
@@ -630,8 +708,10 @@ SPAN_DECLARE(int) t30_set_tx_page_header_info(t30_state_t *s, const char *info)
         s->header_info[0] = '\0';
         return 0;
     }
+    /*endif*/
     if (strlen(info) > T30_MAX_PAGE_HEADER_INFO)
         return -1;
+    /*endif*/
     strcpy(s->header_info, info);
     t4_tx_set_header_info(&s->t4.tx, s->header_info);
     return 0;
@@ -642,6 +722,7 @@ SPAN_DECLARE(size_t) t30_get_tx_page_header_info(t30_state_t *s, char *info)
 {
     if (info)
         strcpy(info, s->header_info);
+    /*endif*/
     return strlen(s->header_info);
 }
 /*- End of function --------------------------------------------------------*/
@@ -654,6 +735,7 @@ SPAN_DECLARE(int) t30_set_tx_page_header_tz(t30_state_t *s, const char *tzstring
         t4_tx_set_header_tz(&s->t4.tx, &s->tz);
         return 0;
     }
+    /*endif*/
     return -1;
 }
 /*- End of function --------------------------------------------------------*/
@@ -758,6 +840,7 @@ SPAN_DECLARE(int) t30_set_minimum_scan_line_time(t30_state_t *s, int min_time)
         s->local_min_scan_time_code = 4;
     else
         return -1;
+    /*endif*/
     t30_build_dis_or_dtc(s);
     return 0;
 }
@@ -856,8 +939,10 @@ SPAN_DECLARE(int) t30_set_supported_image_sizes(t30_state_t *s, int supported_im
     /* Force the sizes which depend on sizes which are supported */
     if ((supported_image_sizes & T4_SUPPORT_LENGTH_UNLIMITED))
         supported_image_sizes |= T4_SUPPORT_LENGTH_B4;
+    /*endif*/
     if ((supported_image_sizes & T4_SUPPORT_WIDTH_303MM))
         supported_image_sizes |= T4_SUPPORT_WIDTH_255MM;
+    /*endif*/
     s->supported_image_sizes = supported_image_sizes;
     t30_build_dis_or_dtc(s);
     return 0;
@@ -879,6 +964,7 @@ SPAN_DECLARE(void) t30_set_status(t30_state_t *s, int status)
         span_log(&s->logging, SPAN_LOG_FLOW, "Status changing to '%s'\n", t30_completion_code_to_str(status));
         s->current_status = status;
     }
+    /*endif*/
 }
 /*- End of function --------------------------------------------------------*/
 
diff --git a/libs/spandsp/src/t30_logging.c b/libs/spandsp/src/t30_logging.c
index 51739999d1..0ef46b9f31 100644
--- a/libs/spandsp/src/t30_logging.c
+++ b/libs/spandsp/src/t30_logging.c
@@ -497,12 +497,15 @@ static void octet_bit_field(logging_state_t *log,
     {
         if ((tag = yeah) == NULL)
             tag = "Set";
+        /*endif*/
     }
     else
     {
         if ((tag = neigh) == NULL)
             tag = "Not set";
+        /*endif*/
     }
+    /*endif*/
     /* Eh, voila! */
     span_log(log, SPAN_LOG_FLOW, "  %s= %s: %s\n", s, desc, tag);
 }
@@ -528,6 +531,7 @@ static void octet_field(logging_state_t *log,
     /* Edit the bit string for display. */
     for (i = start;  i < end;  i++)
         s[7 - i + ((i < 4)  ?  1  :  0)] = (uint8_t) ((octet >> i) & 1) + '0';
+    /*endfor*/
 
     /* Find the right tag to display. */
     octet = (uint8_t) ((octet >> start) & ((0xFF + (1 << (end - start))) & 0xFF));
@@ -539,7 +543,9 @@ static void octet_field(logging_state_t *log,
             tag = tags[i].str;
             break;
         }
+        /*endif*/
     }
+    /*endfor*/
     /* Eh, voila! */
     span_log(log, SPAN_LOG_FLOW, "  %s= %s: %s\n", s, desc, tag);
 }
@@ -670,6 +676,7 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
 
     if (!span_log_test(&s->logging, SPAN_LOG_FLOW))
         return;
+    /*endif*/
     frame_type = pkt[2] & 0xFE;
     log = &s->logging;
     if (len <= 2)
@@ -677,6 +684,7 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     span_log(log, SPAN_LOG_FLOW, "%s:\n", t30_frametype(pkt[2]));
     if (len <= 3)
@@ -684,6 +692,7 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
     octet_bit_field(log, pkt, 1, "Store and forward Internet fax (T.37)", NULL, NULL);
     octet_reserved_bit(log, pkt, 2, 0);
     octet_bit_field(log, pkt, 3, "Real-time Internet fax (T.38)", NULL, NULL);
@@ -699,12 +708,14 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         octet_bit_field(log, pkt, 6, "V.8 capabilities", NULL, NULL);
         octet_bit_field(log, pkt, 7, "Preferred octets", "64 octets", "256 octets");
     }
+    /*endif*/
     octet_reserved_bit(log, pkt, 8, 0);
     if (len <= 4)
     {
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     if (frame_type == T30_DCS)
     {
@@ -718,6 +729,7 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         octet_bit_field(log, pkt, 10, "Can receive fax", NULL, NULL);
         octet_field(log, pkt, 11, 14, "Supported data signalling rates", available_signalling_rate_tags);
     }
+    /*endif*/
     octet_bit_field(log, pkt, 15, "R8x7.7lines/mm and/or 200x200pels/25.4mm", NULL, NULL);
     octet_bit_field(log, pkt, 16, "2-D coding", NULL, NULL);
     if (len <= 5)
@@ -725,6 +737,7 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     if (frame_type == T30_DCS)
     {
@@ -738,14 +751,17 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         octet_field(log, pkt, 19, 20, "Recording length", available_recording_length_tags);
         octet_field(log, pkt, 21, 23, "Receiver's minimum scan line time", available_minimum_scan_line_time_tags);
     }
+    /*endif*/
     octet_bit_field(log, pkt, 24, "Extension indicator", NULL, NULL);
     if (!(pkt[5] & DISBIT8))
         return;
+    /*endif*/
     if (len <= 6)
     {
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     octet_reserved_bit(log, pkt, 25, 0);
     octet_bit_field(log, pkt, 26, "Compressed/uncompressed mode", "Uncompressed", "Compressed");
@@ -754,17 +770,20 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         octet_bit_field(log, pkt, 28, "Frame size", "64 octets", "256 octets");
     else
         octet_reserved_bit(log, pkt, 28, 0);
+    /*endif*/
     octet_reserved_bit(log, pkt, 29, 0);
     octet_reserved_bit(log, pkt, 30, 0);
     octet_bit_field(log, pkt, 31, "T.6 coding", NULL, NULL);
     octet_bit_field(log, pkt, 32, "Extension indicator", NULL, NULL);
     if (!(pkt[6] & DISBIT8))
         return;
+    /*endif*/
     if (len <= 7)
     {
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     octet_bit_field(log, pkt, 33, "\"Field not valid\" supported", NULL, NULL);
     if (frame_type == T30_DCS)
@@ -777,6 +796,7 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         octet_bit_field(log, pkt, 34, "Multiple selective polling", NULL, NULL);
         octet_bit_field(log, pkt, 35, "Polled sub-address", NULL, NULL);
     }
+    /*endif*/
     octet_bit_field(log, pkt, 36, "T.43 coding", NULL, NULL);
     octet_bit_field(log, pkt, 37, "Plane interleave", NULL, NULL);
     octet_bit_field(log, pkt, 38, "Voice coding with 32kbit/s ADPCM (Rec. G.726)", NULL, NULL);
@@ -784,11 +804,13 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
     octet_bit_field(log, pkt, 40, "Extension indicator", NULL, NULL);
     if (!(pkt[7] & DISBIT8))
         return;
+    /*endif*/
     if (len <= 8)
     {
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
     octet_bit_field(log, pkt, 41, "R8x15.4lines/mm", NULL, NULL);
     octet_bit_field(log, pkt, 42, "300x300pels/25.4mm", NULL, NULL);
     octet_bit_field(log, pkt, 43, "R16x15.4lines/mm and/or 400x400pels/25.4mm", NULL, NULL);
@@ -806,14 +828,17 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         octet_bit_field(log, pkt, 46, "Minimum scan line time for higher resolutions", "T15.4 = 1/2 T7.7", "T15.4 = T7.7");
         octet_bit_field(log, pkt, 47, "Selective polling", NULL, NULL);
     }
+    /*endif*/
     octet_bit_field(log, pkt, 48, "Extension indicator", NULL, NULL);
     if (!(pkt[8] & DISBIT8))
         return;
+    /*endif*/
     if (len <= 9)
     {
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     octet_bit_field(log, pkt, 49, "Sub-addressing", NULL, NULL);
     if (frame_type == T30_DCS)
@@ -826,6 +851,7 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         octet_bit_field(log, pkt, 50, "Password", NULL, NULL);
         octet_bit_field(log, pkt, 51, "Ready to transmit a data file (polling)", NULL, NULL);
     }
+    /*endif*/
     octet_reserved_bit(log, pkt, 52, 0);
     octet_bit_field(log, pkt, 53, "Binary file transfer (BFT)", NULL, NULL);
     octet_bit_field(log, pkt, 54, "Document transfer mode (DTM)", NULL, NULL);
@@ -833,11 +859,13 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
     octet_bit_field(log, pkt, 56, "Extension indicator", NULL, NULL);
     if (!(pkt[9] & DISBIT8))
         return;
+    /*endif*/
     if (len <= 10)
     {
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     octet_bit_field(log, pkt, 57, "Basic transfer mode (BTM)", NULL, NULL);
     octet_reserved_bit(log, pkt, 58, 0);
@@ -845,6 +873,7 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         octet_reserved_bit(log, pkt, 59, 0);
     else
         octet_bit_field(log, pkt, 59, "Ready to transfer a character or mixed mode document (polling)", NULL, NULL);
+    /*endif*/
     octet_bit_field(log, pkt, 60, "Character mode", NULL, NULL);
     octet_reserved_bit(log, pkt, 61, 0);
     octet_bit_field(log, pkt, 62, "Mixed mode (Annex E/T.4)", NULL, NULL);
@@ -852,11 +881,13 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
     octet_bit_field(log, pkt, 64, "Extension indicator", NULL, NULL);
     if (!(pkt[10] & DISBIT8))
         return;
+    /*endif*/
     if (len <= 11)
     {
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     octet_bit_field(log, pkt, 65, "Processable mode 26 (Rec. T.505)", NULL, NULL);
     octet_bit_field(log, pkt, 66, "Digital network capability", NULL, NULL);
@@ -865,20 +896,24 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         octet_bit_field(log, pkt, 68, "Full colour mode", NULL, NULL);
     else
         octet_bit_field(log, pkt, 68, "JPEG coding", NULL, NULL);
+    /*endif*/
     octet_bit_field(log, pkt, 69, "Full colour mode", NULL, NULL);
     if (frame_type == T30_DCS)
         octet_bit_field(log, pkt, 70, "Preferred Huffman tables", NULL, NULL);
     else
         octet_reserved_bit(log, pkt, 70, 0);
+    /*endif*/
     octet_bit_field(log, pkt, 71, "12bits/pel component", NULL, NULL);
     octet_bit_field(log, pkt, 72, "Extension indicator", NULL, NULL);
     if (!(pkt[11] & DISBIT8))
         return;
+    /*endif*/
     if (len <= 12)
     {
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     octet_bit_field(log, pkt, 73, "No subsampling (1:1:1)", NULL, NULL);
     octet_bit_field(log, pkt, 74, "Custom illuminant", NULL, NULL);
@@ -890,11 +925,13 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
     octet_bit_field(log, pkt, 80, "Extension indicator", NULL, NULL);
     if (!(pkt[12] & DISBIT8))
         return;
+    /*endif*/
     if (len <= 13)
     {
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     octet_bit_field(log, pkt, 81, "HKM key management", NULL, NULL);
     octet_bit_field(log, pkt, 82, "RSA key management", NULL, NULL);
@@ -906,11 +943,13 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
     octet_bit_field(log, pkt, 88, "Extension indicator", NULL, NULL);
     if (!(pkt[13] & DISBIT8))
         return;
+    /*endif*/
     if (len <= 14)
     {
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     octet_bit_field(log, pkt, 89, "Alternative hashing system 2", NULL, NULL);
     octet_bit_field(log, pkt, 90, "Alternative hashing system 3", NULL, NULL);
@@ -925,6 +964,7 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     octet_bit_field(log, pkt, 97, "Colour/gray-scale 300pels/25.4mm x 300lines/25.4mm or 400pels/25.4mm x 400lines/25.4mm resolution", NULL, NULL);
     octet_bit_field(log, pkt, 98, "100pels/25.4mm x 100lines/25.4mm for colour/gray scale", NULL, NULL);
@@ -939,16 +979,19 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         octet_bit_field(log, pkt, 100, "Extended BFT Negotiations capable", NULL, NULL);
         octet_bit_field(log, pkt, 101, "Internet Selective Polling address (ISP)", NULL, NULL);
     }
+    /*endif*/
     octet_bit_field(log, pkt, 102, "Internet Routing Address (IRA)", NULL, NULL);
     octet_reserved_bit(log, pkt, 103, 0);
     octet_bit_field(log, pkt, 104, "Extension indicator", NULL, NULL);
     if (!(pkt[15] & DISBIT8))
         return;
+    /*endif*/
     if (len <= 16)
     {
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     octet_bit_field(log, pkt, 105, "600pels/25.4mm x 600lines/25.4mm", NULL, NULL);
     octet_bit_field(log, pkt, 106, "1200pels/25.4mm x 1200lines/25.4mm", NULL, NULL);
@@ -960,11 +1003,13 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
     octet_bit_field(log, pkt, 112, "Extension indicator", NULL, NULL);
     if (!(pkt[16] & DISBIT8))
         return;
+    /*endif*/
     if (len <= 17)
     {
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     octet_bit_field(log, pkt, 113, "Double sided printing capability (alternate mode)", NULL, NULL);
     octet_bit_field(log, pkt, 114, "Double sided printing capability (continuous mode)", NULL, NULL);
@@ -972,17 +1017,20 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
         octet_bit_field(log, pkt, 115, "Black and white mixed raster content profile (MRCbw)", NULL, NULL);
     else
         octet_reserved_bit(log, pkt, 115, 0);
+    /*endif*/
     octet_bit_field(log, pkt, 116, "T.45 (run length colour encoded)", NULL, NULL);
     octet_field(log, pkt, 117, 118, "Shared memory", shared_data_memory_capacity_tags);
     octet_bit_field(log, pkt, 119, "T.44 colour space", NULL, NULL);
     octet_bit_field(log, pkt, 120, "Extension indicator", NULL, NULL);
     if (!(pkt[17] & DISBIT8))
         return;
+    /*endif*/
     if (len <= 18)
     {
         span_log(log, SPAN_LOG_FLOW, "  Frame is short\n");
         return;
     }
+    /*endif*/
 
     octet_bit_field(log, pkt, 121, "Flow control capability for T.38 communication", NULL, NULL);
     octet_bit_field(log, pkt, 122, "K>4", NULL, NULL);
@@ -992,6 +1040,7 @@ SPAN_DECLARE(void) t30_decode_dis_dtc_dcs(t30_state_t *s, const uint8_t *pkt, in
     octet_bit_field(log, pkt, 128, "Extension indicator", NULL, NULL);
     if (!(pkt[18] & DISBIT8))
         return;
+    /*endif*/
 
     span_log(log, SPAN_LOG_FLOW, "  Extended beyond the current T.30 specification!\n");
 }
diff --git a/libs/spandsp/tests/fax_tester.c b/libs/spandsp/tests/fax_tester.c
index 8885957972..419fb705fa 100644
--- a/libs/spandsp/tests/fax_tester.c
+++ b/libs/spandsp/tests/fax_tester.c
@@ -771,8 +771,6 @@ void faxtester_set_tx_type(void *user_data, int type, int bit_rate, int short_tr
     s = (faxtester_state_t *) user_data;
     t = &s->modems;
     span_log(&s->logging, SPAN_LOG_FLOW, "Set tx type %d\n", type);
-    if (s->current_tx_type == type)
-        return;
     if (use_hdlc)
     {
         get_bit_func = (get_bit_func_t) hdlc_tx_get_bit;
@@ -783,6 +781,12 @@ void faxtester_set_tx_type(void *user_data, int type, int bit_rate, int short_tr
         get_bit_func = non_ecm_get_bit;
         get_bit_user_data = (void *) s;
     }
+    if (type == s->current_tx_type)
+    {
+        if (type == T30_MODEM_PAUSE)
+            silence_gen_alter(&t->silence_gen, ms_to_samples(short_train));
+        return;
+    }
     switch (type)
     {
     case T30_MODEM_PAUSE:
@@ -1314,6 +1318,8 @@ SPAN_DECLARE(int) faxtester_next_step(faxtester_state_t *s)
         else if (strcasecmp((const char *) parms.type, "CLEAR") == 0)
         {
             span_log(&s->logging, SPAN_LOG_FLOW, "Far end should drop the call\n");
+            faxtester_set_rx_type(s, T30_MODEM_NONE, 0, false, false);
+            faxtester_set_tx_type(s, T30_MODEM_PAUSE, 0, s->timeout_x, false);
             s->test_for_call_clear = true;
             s->call_clear_timer = 0;
         }
@@ -1648,6 +1654,8 @@ SPAN_DECLARE(int) faxtester_next_step(faxtester_state_t *s)
         else if (strcasecmp((const char *) parms.type, "CLEAR") == 0)
         {
             span_log(&s->logging, SPAN_LOG_FLOW, "Time to drop the call\n");
+            faxtester_set_rx_type(s, T30_MODEM_NONE, 0, false, false);
+            faxtester_set_tx_type(s, T30_MODEM_PAUSE, 0, s->timeout_x, false);
             t30_terminate(s->far_t30);
             free_node_parms(&parms);
             return 0;
diff --git a/libs/spandsp/tests/tsb85_extra_tests.sh b/libs/spandsp/tests/tsb85_extra_tests.sh
index d7a3b08d40..7d4b10146b 100755
--- a/libs/spandsp/tests/tsb85_extra_tests.sh
+++ b/libs/spandsp/tests/tsb85_extra_tests.sh
@@ -28,7 +28,7 @@ run_tsb85_test()
     fi
 }
 
-for TEST in PPS-MPS-lost-PPS V17-12000-V29-9600 Phase-D-collision Modem-change-at-CTC ECM-DCN-clipped Non-ECM-DCN-clipped Tx-EOP-echo Tx-PPS-echo
+for TEST in PPS-MPS-lost-PPS V17-12000-V29-9600 Phase-D-collision Modem-change-at-CTC ECM-DCN-clipped Non-ECM-DCN-clipped Tx-EOP-echo Tx-PPS-echo Invalid-signalling-rate No-caller-response No-answerer-response T1-after-EOM T1-after-EOMx
 do
     run_tsb85_test
 done
diff --git a/libs/spandsp/tests/v18_tests.c b/libs/spandsp/tests/v18_tests.c
index a767eac263..866eac8027 100644
--- a/libs/spandsp/tests/v18_tests.c
+++ b/libs/spandsp/tests/v18_tests.c
@@ -45,6 +45,11 @@
 
 #define SAMPLES_PER_CHUNK   160
 
+#define CHUNKS_PER_SECOND   50
+
+#define TESTER              0
+#define TUT                 1
+
 int log_audio = false;
 SNDFILE *outhandle = NULL;
 char result[2][1024];
@@ -56,8 +61,19 @@ int good_message_received;
 
 both_ways_line_model_state_t *model;
 int rbs_pattern = 0;
-float noise_level = -70.0f;
 int line_model_no = 0;
+#if 0
+float echo_level_cpe1 = -15.0f;
+float echo_level_co1 = -15.0f;
+float echo_level_cpe2 = -15.0f;
+float echo_level_co2 = -15.0f;
+#else
+float echo_level_cpe1 = -99.0f;
+float echo_level_co1 = -99.0f;
+float echo_level_cpe2 = -99.0f;
+float echo_level_co2 = -99.0f;
+#endif
+float noise_level = -70.0f;
 int channel_codec = MUNGE_CODEC_NONE;
 v18_state_t *v18[2];
 
@@ -96,23 +112,23 @@ static void basic_tests(int mode)
     int j;
 
     printf("Testing %s\n", v18_mode_to_str(mode));
-    v18[0] = v18_init(NULL, true, mode, V18_AUTOMODING_GLOBAL, put_text_msg, NULL);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, mode, V18_AUTOMODING_GLOBAL, put_text_msg, NULL);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, mode, V18_AUTOMODING_GLOBAL, put_text_msg, NULL);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, mode, V18_AUTOMODING_GLOBAL, put_text_msg, NULL);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -123,19 +139,19 @@ static void basic_tests(int mode)
     /* Fake an OK condition for the first message test */
     good_message_received = true;
     push = 0;
-    if (v18_put(v18[0], qbf_tx, -1) != strlen(qbf_tx))
+    if (v18_put(v18[TESTER], qbf_tx, -1) != strlen(qbf_tx))
     {
         printf("V.18 put failed\n");
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         if (push == 0)
         {
-            if ((samples = v18_tx(v18[0], amp[0], SAMPLES_PER_CHUNK)) == 0)
+            if ((samples = v18_tx(v18[TESTER], amp[0], SAMPLES_PER_CHUNK)) == 0)
                 push = 10;
         }
         else
@@ -150,7 +166,7 @@ static void basic_tests(int mode)
                     exit(2);
                 }
                 good_message_received = false;
-                if (v18_put(v18[0], qbf_tx, -1) != strlen(qbf_tx))
+                if (v18_put(v18[TESTER], qbf_tx, -1) != strlen(qbf_tx))
                 {
                     printf("V.18 put failed\n");
                     exit(2);
@@ -162,7 +178,7 @@ static void basic_tests(int mode)
             vec_zeroi16(&amp[0][samples], SAMPLES_PER_CHUNK - samples);
             samples = SAMPLES_PER_CHUNK;
         }
-        if ((samples = v18_tx(v18[1], amp[1], SAMPLES_PER_CHUNK)) == 0)
+        if ((samples = v18_tx(v18[TUT], amp[1], SAMPLES_PER_CHUNK)) == 0)
             push = 10;
         if (samples < SAMPLES_PER_CHUNK)
         {
@@ -194,11 +210,11 @@ static void basic_tests(int mode)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
 }
 /*- End of function --------------------------------------------------------*/
 
@@ -229,23 +245,23 @@ static int test_misc_01(void)
                         TUT should continue to probe until the test is terminated.
         Comments:       This feature should also be verified by observation during the automoding tests.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_01_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_01_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_01_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_01_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -253,8 +269,8 @@ static int test_misc_01(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -292,12 +308,12 @@ static int test_misc_01(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -334,23 +350,23 @@ static int test_misc_02(void)
         Comments:       The TUT should indicate that carrier has been lost at some time after the 1650Hz
                         signal is lost.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_02_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_02_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_02_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_02_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -358,8 +374,8 @@ static int test_misc_02(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -397,12 +413,12 @@ static int test_misc_02(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -439,23 +455,23 @@ static int test_misc_03(void)
         Comments:       The TUT should indicate that carrier has been lost at some time after the carrier
                         signal is removed and not disconnect.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_03_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_03_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_03_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_03_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -463,8 +479,8 @@ static int test_misc_03(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -502,12 +518,12 @@ static int test_misc_03(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -543,23 +559,23 @@ static int test_misc_04(void)
                         automatically hang up when busy tone is detected. PABX busy tones may differ in
                         frequency and cadence from national parameters.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_04_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_04_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_04_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_04_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -567,8 +583,8 @@ static int test_misc_04(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -606,12 +622,12 @@ static int test_misc_04(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -644,23 +660,23 @@ static int test_misc_05(void)
         Pass criteria:  The RINGING condition should be visually indicated by the TUT.
         Comments:       This test should be repeated across a range of valid timings and ring voltages.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_05_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_05_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_05_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_05_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -668,8 +684,8 @@ static int test_misc_05(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -707,12 +723,12 @@ static int test_misc_05(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -748,23 +764,23 @@ static int test_misc_06(void)
                         mode. There may be other cases, e.g. where the V.18 DCE is used in a gateway,
                         when automatic disconnection is required.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_06_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_06_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_06_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_06_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -772,8 +788,8 @@ static int test_misc_06(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -811,12 +827,12 @@ static int test_misc_06(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -850,23 +866,23 @@ static int test_misc_07(void)
                         However, this may possibly not be indicated by the DTE.
         Comments:       The possible modes are: V.21, V.23, Baudot 45, Baudot 50, EDT, Bell 103, DTMF.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_07_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_07_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_07_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_07_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -874,8 +890,8 @@ static int test_misc_07(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -913,12 +929,12 @@ static int test_misc_07(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -953,23 +969,23 @@ static int test_misc_08(void)
         Comment:        The response times and signal level thresholds of Circuit 135 are not specified in
                         ITU-T V.18 or V.24 and therefore the pattern indicated may vary.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_08_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_08_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_08_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_08_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -977,8 +993,8 @@ static int test_misc_08(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -1016,12 +1032,12 @@ static int test_misc_08(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -1053,23 +1069,23 @@ static int test_misc_09(void)
         Pass criteria:  TBD
         Comment:        TBD
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_09_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_09_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_09_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, misc_09_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -1077,8 +1093,8 @@ static int test_misc_09(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -1116,12 +1132,12 @@ static int test_misc_09(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -1162,23 +1178,23 @@ static int test_org_01(void)
                         8) The whole sequence should be repeated until the call is cleared.
                         9) When V.18 to V.18, the XCI must not force V.23 or Minitel mode.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_01_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_01_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_01_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_01_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -1186,8 +1202,8 @@ static int test_org_01(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -1225,12 +1241,12 @@ static int test_org_01(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -1265,23 +1281,23 @@ static int test_org_02(void)
                         2) The TUT should reply with transmission of TXP as defined in 5.1.2.
                         3) Verify that TXP sequence has correct bit pattern.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_02_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_02_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_02_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_02_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -1289,8 +1305,8 @@ static int test_org_02(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -1328,12 +1344,12 @@ static int test_org_02(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -1366,23 +1382,23 @@ static int test_org_03(void)
         Pass criteria:  The TUT should stop sending TXP at the end of the current sequence when ANS
                         tone ceases.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_03_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_03_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_03_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_03_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -1390,8 +1406,8 @@ static int test_org_03(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -1429,12 +1445,12 @@ static int test_org_03(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -1471,23 +1487,23 @@ static int test_org_04(void)
                            with the V.18 operational requirements.
         Comments:       The TUT should indicate that V.18 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_04_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_04_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_04_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_04_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -1495,8 +1511,8 @@ static int test_org_04(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -1534,12 +1550,12 @@ static int test_org_04(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -1577,23 +1593,23 @@ static int test_org_05(void)
                         examination of TUT. If there is no visual indication, verify by use of ITU-T T.50 for
                         ITU-T V.21 as opposed to UTF-8 coded ISO 10646 character set for ITU-T V.18.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_05_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_05_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_05_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_05_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -1601,8 +1617,8 @@ static int test_org_05(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -1640,12 +1656,12 @@ static int test_org_05(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -1682,23 +1698,23 @@ static int test_org_06(void)
                            by the TUT to comply with Annex E.
         Comments:       The TUT should indicate that V.23 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_06_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_06_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_06_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_06_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -1706,8 +1722,8 @@ static int test_org_06(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -1745,12 +1761,12 @@ static int test_org_06(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -1787,23 +1803,23 @@ static int test_org_07(void)
                         literally. It may however, occur when connected to certain Swedish textphones if the
                         handset is lifted just after the start of an automatically answered incoming call.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_07_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_07_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_07_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_07_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -1811,8 +1827,8 @@ static int test_org_07(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -1850,12 +1866,12 @@ static int test_org_07(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -1888,23 +1904,23 @@ static int test_org_08(void)
                         2) Data should be transmitted and received at 300 bit/s to comply with Annex D.
         Comments:       The TUT should indicate that Bell 103 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_08_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_08_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_08_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_08_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -1912,8 +1928,8 @@ static int test_org_08(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -1951,12 +1967,12 @@ static int test_org_08(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -1989,23 +2005,23 @@ static int test_org_09(void)
                         2) Data should be transmitted and received at 300 bit/s to comply with Annex F.
         Comments:       The TUT should indicate that V.21 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_09_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_09_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_09_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_09_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -2013,8 +2029,8 @@ static int test_org_09(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -2052,12 +2068,12 @@ static int test_org_09(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -2091,23 +2107,23 @@ static int test_org_10(void)
                            by the TUT to comply with Annex E.
         Comments:       The TUT should indicate that V.23 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_10_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_10_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_10_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_10_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -2115,8 +2131,8 @@ static int test_org_10(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -2154,12 +2170,12 @@ static int test_org_10(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -2196,23 +2212,23 @@ static int test_org_11(void)
         Comments:       The TUT should indicate that V.23 mode has been selected at least 3s after
                         the start of the 390Hz tone.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_11_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_11_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_11_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_11_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -2220,8 +2236,8 @@ static int test_org_11(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -2259,12 +2275,12 @@ static int test_org_11(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -2306,23 +2322,23 @@ static int test_org_12(void)
                         automode answer state. The TUT may then select either 45.45 or 50 bit/s for the
                         transmission.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_12_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_12_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_12_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_12_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -2330,8 +2346,8 @@ static int test_org_12(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -2369,12 +2385,12 @@ static int test_org_12(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -2410,23 +2426,23 @@ static int test_org_13(void)
                         TUT should comply with ITU-T Q.24 for the Danish Administration while
                         receiving for best possible performance.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_13_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_13_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_13_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_13_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -2434,8 +2450,8 @@ static int test_org_13(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -2473,12 +2489,12 @@ static int test_org_13(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -2516,23 +2532,23 @@ static int test_org_14(void)
                         the number lost should be minimal. The data bits and parity are specified in
                         Annex C.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_14_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_14_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_14_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_14_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -2540,8 +2556,8 @@ static int test_org_14(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -2579,12 +2595,12 @@ static int test_org_14(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -2617,23 +2633,23 @@ static int test_org_15(void)
                         the CI signal.
         Comments:       Echoes of the CI sequences may be detected at 300 bit/s.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_15_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_15_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_15_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_15_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -2641,8 +2657,8 @@ static int test_org_15(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -2680,12 +2696,12 @@ static int test_org_15(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -2718,23 +2734,23 @@ static int test_org_16(void)
                         2) Data should be transmitted and received at 300 bit/s complying with Annex F.
         Comments:       The TUT should indicate that V.21 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_16_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_16_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_16_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_16_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -2742,8 +2758,8 @@ static int test_org_16(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -2781,12 +2797,12 @@ static int test_org_16(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -2817,23 +2833,23 @@ static int test_org_17(void)
         Pass criteria:  TUT should not respond to the 980Hz tone and resume sending CI signals after a
                         maximum of 2.4s from the end of the 980Hz tone.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_17_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_17_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_17_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_17_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -2841,8 +2857,8 @@ static int test_org_17(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -2880,12 +2896,12 @@ static int test_org_17(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -2918,23 +2934,23 @@ static int test_org_18(void)
         Comments:       This implies timer Tr has expired 2s after the start of the 980Hz tone and
                         then 1650Hz has been detected for 0.5s.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_18_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_18_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_18_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_18_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -2942,8 +2958,8 @@ static int test_org_18(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -2981,12 +2997,12 @@ static int test_org_18(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -3018,23 +3034,23 @@ static int test_org_19(void)
                         2) Data should be transmitted and received at 300 bit/s complying with Annex D.
         Comments:       The TUT should indicate that Bell 103 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_19_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_19_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_19_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_19_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -3042,8 +3058,8 @@ static int test_org_19(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -3081,12 +3097,12 @@ static int test_org_19(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -3125,23 +3141,23 @@ static int test_org_20(void)
                         presence and cadence of the tones for instance by a flashing light. The TUT may
                         disconnect on reception of tones indicating a failed call attempt.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_20_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_20_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_20_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_20_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -3149,8 +3165,8 @@ static int test_org_20(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -3188,12 +3204,12 @@ static int test_org_20(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -3228,23 +3244,23 @@ static int test_org_21(void)
         Comments:       Some high speed modems may fall back to a compatibility mode, e.g. V.21 or V.23
                         that should be correctly detected by the TUT.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_21_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_21_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_21_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_21_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -3252,8 +3268,8 @@ static int test_org_21(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -3291,12 +3307,12 @@ static int test_org_21(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -3330,23 +3346,23 @@ static int test_org_22(void)
         Comments:       Ideally the TUT should detect the presence of a fax machine and report it back to
                         the user.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_22_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_22_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_22_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_22_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -3354,8 +3370,8 @@ static int test_org_22(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -3393,12 +3409,12 @@ static int test_org_22(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -3432,23 +3448,23 @@ static int test_org_23(void)
         Comments:       Ideally the TUT should report the presence of speech back to the user, e.g. via
                         circuit 135.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_23_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_23_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_23_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_23_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -3456,8 +3472,8 @@ static int test_org_23(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -3495,12 +3511,12 @@ static int test_org_23(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -3536,23 +3552,23 @@ static int test_org_24(void)
                         2) The TUT should reply with transmission of CM as defined in 5.2.13.
                         3) Verify that CM sequence has correct bit pattern.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_24_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_24_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_24_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_24_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -3560,8 +3576,8 @@ static int test_org_24(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -3599,12 +3615,12 @@ static int test_org_24(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -3635,23 +3651,23 @@ static int test_org_25(void)
         Method:         The Test System waits for the TUT to start transmitting V.21 carrier (1).
         Pass criteria:  The TUT should connect by sending V.21 carrier (1).
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_25_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_25_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_25_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, org_25_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -3659,8 +3675,8 @@ static int test_org_25(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -3698,12 +3714,12 @@ static int test_org_25(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -3735,23 +3751,23 @@ static int test_ans_01(void)
                         answers the call. It will then monitor for any signal.
         Pass criteria:  The TUT should start probing 3s after answering the call.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_01_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_01_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_01_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_01_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -3759,8 +3775,8 @@ static int test_ans_01(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -3798,12 +3814,12 @@ static int test_ans_01(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -3837,23 +3853,23 @@ static int test_ans_02(void)
         Comments:       The ANSam tone is a modulated 2100Hz tone. It may have phase reversals. The
                         XCI signal is tested in a separate test.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_02_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_02_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_02_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_02_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -3861,8 +3877,8 @@ static int test_ans_02(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -3900,12 +3916,12 @@ static int test_ans_02(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -3943,23 +3959,23 @@ static int test_ans_03(void)
                            V.18 mode connection is completed.
         Comments:       The TUT should indicate V.18 mode.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_03_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_03_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_03_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_03_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -3967,8 +3983,8 @@ static int test_ans_03(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -4006,12 +4022,12 @@ static int test_ans_03(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -4043,23 +4059,23 @@ static int test_ans_04(void)
         Pass criteria:  The TUT should start probing 3s after ANSam disappears.
         Comments:       It is assumed that timer Ta is restarted on return to Monitor A.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_04_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_04_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_04_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_04_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -4067,8 +4083,8 @@ static int test_ans_04(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -4106,12 +4122,12 @@ static int test_ans_04(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -4144,23 +4160,23 @@ static int test_ans_05(void)
         Pass criteria:  TUT should respond with 1650Hz within 400+-100ms of start of 980Hz.
         Comments:       The TUT should indicate that V.21 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_05_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_05_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_05_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_05_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -4168,8 +4184,8 @@ static int test_ans_05(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -4207,12 +4223,12 @@ static int test_ans_05(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -4245,23 +4261,23 @@ static int test_ans_06(void)
         Pass criteria:  TUT should respond with 390Hz after 1.7(+0.2-0.0)s of start of 1300Hz.
         Comments:       The TUT should indicate that V.23 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_06_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_06_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_06_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_06_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -4269,8 +4285,8 @@ static int test_ans_06(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -4308,12 +4324,12 @@ static int test_ans_06(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -4346,23 +4362,23 @@ static int test_ans_07(void)
         Pass criteria:  TUT should respond with 980Hz within 400+-100ms of start of 1650Hz.
         Comments:       The TUT should indicate that V.21 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_07_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_07_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_07_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_07_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -4370,8 +4386,8 @@ static int test_ans_07(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -4409,12 +4425,12 @@ static int test_ans_07(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -4449,23 +4465,23 @@ static int test_ans_08(void)
         Comments:       The TUT should indicate a V.21 connection. The time for which each frequency is
                         transmitted is random and varies between 0.64 and 2.56s.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_08_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_08_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_08_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_08_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -4473,8 +4489,8 @@ static int test_ans_08(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -4512,12 +4528,12 @@ static int test_ans_08(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -4551,23 +4567,23 @@ static int test_ans_09(void)
                            700ms followed by 1s of silence.
         Comments:       The probe sent by the TUT will depend on the country setting.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_09_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_09_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_09_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_09_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -4575,8 +4591,8 @@ static int test_ans_09(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -4614,12 +4630,12 @@ static int test_ans_09(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -4651,23 +4667,23 @@ static int test_ans_10(void)
         Pass criteria:  The TUT should respond with a 1650Hz tone in 1.5+-0.1s.
         Comments:       The TUT should indicate that V.21 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_10_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_10_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_10_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_10_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -4675,8 +4691,8 @@ static int test_ans_10(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -4714,12 +4730,12 @@ static int test_ans_10(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -4755,23 +4771,23 @@ static int test_ans_11(void)
                         be lost during the detection process. However, the number lost should be minimal.
                         The data bits and parity are specified in Annex C.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_11_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_11_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_11_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_11_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -4779,8 +4795,8 @@ static int test_ans_11(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -4818,12 +4834,12 @@ static int test_ans_11(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -4860,23 +4876,23 @@ static int test_ans_12(void)
                         (1650Hz) probe. However, it is catered for in V.18. It is more likely that this is
                         where CI or TXP characters would be detected (see test ANS-02).
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_12_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_12_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_12_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_12_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -4884,8 +4900,8 @@ static int test_ans_12(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -4923,12 +4939,12 @@ static int test_ans_12(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -4964,23 +4980,23 @@ static int test_ans_13(void)
                         when timer Tr will start. It is assumed that timer Ta is restarted on re-entering the
                         Monitor A state.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_13_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_13_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_13_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_13_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -4988,8 +5004,8 @@ static int test_ans_13(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -5027,12 +5043,12 @@ static int test_ans_13(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -5065,23 +5081,23 @@ static int test_ans_14(void)
         Comments:       It is assumed that timer Ta (3s) is restarted on re-entering the Monitor A
                         state.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_14_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_14_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_14_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_14_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -5089,8 +5105,8 @@ static int test_ans_14(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -5128,12 +5144,12 @@ static int test_ans_14(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -5176,23 +5192,23 @@ static int test_ans_15(void)
                         automode answer state. The TUT may then select either 45.45 or 50 bit/s for the
                         transmission.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_15_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_15_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_15_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_15_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -5200,8 +5216,8 @@ static int test_ans_15(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -5239,12 +5255,12 @@ static int test_ans_15(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -5278,23 +5294,23 @@ static int test_ans_16(void)
         Comments:       The TUT should indicate that it has selected DTMF mode. The DTMF capabilities
                         of the TUT should comply with ITU-T Q.24 for the Danish Administration.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_16_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_16_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_16_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_16_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -5302,8 +5318,8 @@ static int test_ans_16(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -5341,12 +5357,12 @@ static int test_ans_16(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -5377,23 +5393,23 @@ static int test_ans_17(void)
         Pass criteria:  TUT should respond with 2225Hz tone after 0.7+-0.1s.
         Comments:       The TUT should indicate that Bell 103 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_17_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_17_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_17_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_17_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -5401,8 +5417,8 @@ static int test_ans_17(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -5440,12 +5456,12 @@ static int test_ans_17(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -5477,23 +5493,23 @@ static int test_ans_18(void)
         Comments:       The TUT should indicate that Bell 103 mode has been selected. Bell 103 modems
                         use 2225Hz as both answer tone and higher frequency of the upper channel.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_18_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_18_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_18_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_18_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -5501,8 +5517,8 @@ static int test_ans_18(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -5540,12 +5556,12 @@ static int test_ans_18(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -5576,23 +5592,23 @@ static int test_ans_19(void)
         Pass criteria:  The TUT should respond with 980Hz after 0.4+-0.2s.
         Comments:       The TUT should indicate that V.21 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_19_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_19_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_19_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_19_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -5600,8 +5616,8 @@ static int test_ans_19(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -5639,12 +5655,12 @@ static int test_ans_19(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -5678,23 +5694,23 @@ static int test_ans_20(void)
                            700ms followed by 1s of silence.
         Comments:       The probe sent by the TUT will depend on the country setting.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_20_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_20_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_20_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_20_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -5702,8 +5718,8 @@ static int test_ans_20(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -5741,12 +5757,12 @@ static int test_ans_20(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -5777,23 +5793,23 @@ static int test_ans_21(void)
                         Pass criteria: The TUT should respond with 390Hz after 1.7+-0.1s.
         Comments:       The TUT should indicate that V.23 mode has been selected.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_21_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_21_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_21_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_21_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -5801,8 +5817,8 @@ static int test_ans_21(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -5840,12 +5856,12 @@ static int test_ans_21(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -5876,23 +5892,23 @@ static int test_ans_22(void)
                         silent for 500ms then transmit the TXP signal in V.21 (1) mode.
         Pass criteria:  The TUT should respond with TXP using V.21 (2) and select V.18 mode.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_22_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_22_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_22_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_22_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -5900,8 +5916,8 @@ static int test_ans_22(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -5939,12 +5955,12 @@ static int test_ans_22(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -5978,23 +5994,23 @@ static int test_ans_23(void)
         Pass criteria:  The TUT should use the orders described in Appendix I.
         Comments:       The order of the probes is not mandatory.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_23_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_23_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_23_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_23_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -6002,8 +6018,8 @@ static int test_ans_23(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -6041,12 +6057,12 @@ static int test_ans_23(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -6080,23 +6096,23 @@ static int test_ans_24(void)
                         modes followed by a pause of Tm (default 3)s.
         Comments:       The carrierless modes are those described in Annexes A, B and C.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_24_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_24_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_24_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_24_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -6104,8 +6120,8 @@ static int test_ans_24(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -6143,12 +6159,12 @@ static int test_ans_24(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -6182,23 +6198,23 @@ static int test_ans_25(void)
         Pass criteria:  The TUT should transmit silence on detecting the 1270Hz tone and then continue
                         probing starting with the V.23 probe 20s after the end of the 1270Hz signal.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_25_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_25_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_25_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_25_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -6206,8 +6222,8 @@ static int test_ans_25(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -6245,12 +6261,12 @@ static int test_ans_25(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -6284,23 +6300,23 @@ static int test_ans_26(void)
                         75+-5ms and then the 1650Hz, 1300Hz and 2225Hz probes for time Tc.
         Comments:       The carrier modes are those described in Annexes D, E, and F.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_26_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_26_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_26_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_26_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -6308,8 +6324,8 @@ static int test_ans_26(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -6347,12 +6363,12 @@ static int test_ans_26(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -6390,23 +6406,23 @@ static int test_ans_27(void)
                         390Hz. When the 1300Hz probe is not being transmitted, a 390Hz tone may be
                         interpreted as a 400Hz network tone.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_27_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_27_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_27_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_27_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -6414,8 +6430,8 @@ static int test_ans_27(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -6453,12 +6469,12 @@ static int test_ans_27(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -6494,23 +6510,23 @@ static int test_ans_28(void)
         Comments:       It is most likely that the TUT will return to probing time Ta (3s) after the
                         1270Hz tone ceases. This condition needs further clarification.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_28_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_28_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_28_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_28_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -6518,8 +6534,8 @@ static int test_ans_28(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -6557,12 +6573,12 @@ static int test_ans_28(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -6597,23 +6613,23 @@ static int test_ans_29(void)
         Comments:       The TUT may not respond to any signals while a carrierless mode probe is being
                         sent since these modes are half duplex.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_29_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_29_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_29_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_29_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -6621,8 +6637,8 @@ static int test_ans_29(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -6660,12 +6676,12 @@ static int test_ans_29(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -6702,23 +6718,23 @@ static int test_ans_30(void)
                         tones may be ignored. Some devices may only provide a visual indication of the
                         presence and cadence of the tones for instance by a flashing light.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_30_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_30_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_30_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_30_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -6726,8 +6742,8 @@ static int test_ans_30(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -6765,12 +6781,12 @@ static int test_ans_30(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -6804,23 +6820,23 @@ static int test_ans_31(void)
         Comments:       This is an optional test as detection of the fax calling tone is not required by
                         ITU-T V.18.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_31_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_31_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_31_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_31_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -6828,8 +6844,8 @@ static int test_ans_31(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -6867,12 +6883,12 @@ static int test_ans_31(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -6906,23 +6922,23 @@ static int test_ans_32(void)
         Comments:       Ideally the TUT should report the presence of speech back to the user. This is an
                         optional test.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_32_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_32_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_32_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_32_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -6930,8 +6946,8 @@ static int test_ans_32(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -6969,12 +6985,12 @@ static int test_ans_32(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -7013,23 +7029,23 @@ static int test_ans_33(void)
                            V.18 mode connection is completed.
         Comments:       The TUT should indicate V.18 mode.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_33_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_33_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_33_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, ans_33_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -7037,8 +7053,8 @@ static int test_ans_33(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -7076,12 +7092,12 @@ static int test_ans_33(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -7125,23 +7141,23 @@ static int test_mon_21(void)
                         for 1 minute.
         Pass criteria:  The TUT should not start probing.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, mon_21_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, mon_21_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, mon_21_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, mon_21_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -7149,8 +7165,8 @@ static int test_mon_21(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -7188,12 +7204,12 @@ static int test_mon_21(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -7229,23 +7245,23 @@ static int test_mon_22(void)
         Comments:       In automode answer, the 1300Hz calling causes the DCE to start probing. In
                         monitor mode it should only report detection to the DTE.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, mon_22_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, mon_22_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, mon_22_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, mon_22_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -7253,8 +7269,8 @@ static int test_mon_22(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -7292,12 +7308,12 @@ static int test_mon_22(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -7333,23 +7349,23 @@ static int test_mon_23(void)
         Comments:       In automode answer, the 980Hz calling causes the DCE to start probing. In monitor
                         mode it should only report detection to the DTE.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, mon_23_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, mon_23_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, mon_23_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, mon_23_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -7357,8 +7373,8 @@ static int test_mon_23(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -7396,12 +7412,12 @@ static int test_mon_23(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -7411,9 +7427,9 @@ static void x_01_put_text_msg(void *user_data, const uint8_t *msg, int len)
 {
 printf("1-1 %d '%s'\n", len, msg);
     if (user_data == NULL)
-        strcat(result[1], (const char *) msg);
+        strcat(result[TUT], (const char *) msg);
     else
-        v18_put(v18[1], "abcdefghij", 10);
+        v18_put(v18[TUT], "abcdefghij", 10);
 }
 /*- End of function --------------------------------------------------------*/
 
@@ -7444,23 +7460,23 @@ static int test_x_01(void)
                         3) The tester will confirm that 1 start bit and at least 1.5 stop bits are used.
         Comments:       The carrier should be maintained during the 300ms after a character.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_5BIT_4545, V18_AUTOMODING_GLOBAL, x_01_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_5BIT_4545, V18_AUTOMODING_GLOBAL, x_01_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_5BIT_4545, V18_AUTOMODING_GLOBAL, x_01_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_5BIT_4545, V18_AUTOMODING_GLOBAL, x_01_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -7468,9 +7484,9 @@ static int test_x_01(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
-    v18_put(v18[0], "zabcdefghijklmnopq", -1);
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
+    v18_put(v18[TESTER], "zabcdefghijklmnopq", -1);
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -7508,16 +7524,16 @@ static int test_x_01(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     ref = "cdefghij";
-    printf("Result:\n%s\n", result[1]);
+    printf("Result:\n%s\n", result[TUT]);
     printf("Reference result:\n%s\n", ref);
-    if (unexpected_echo  ||  strcmp(result[1], ref) != 0)
+    if (unexpected_echo  ||  strcmp(result[TUT], ref) != 0)
         return -1;
     return 1;
 }
@@ -7549,23 +7565,23 @@ static int test_x_02(void)
                         transmit the string "abcdef" at each rate.
         Pass criteria:  The tester will measure the bit timings and confirm the rates.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_5BIT_4545, V18_AUTOMODING_GLOBAL, x_02_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_5BIT_4545, V18_AUTOMODING_GLOBAL, x_02_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_5BIT_4545, V18_AUTOMODING_GLOBAL, x_02_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_5BIT_4545, V18_AUTOMODING_GLOBAL, x_02_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -7573,8 +7589,8 @@ static int test_x_02(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -7612,12 +7628,12 @@ static int test_x_02(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -7652,23 +7668,23 @@ static int test_x_03(void)
         Comments:       The probe message must be long enough for the tester to establish the bit rate. "GA"
                         may not be sufficient.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_5BIT_4545, V18_AUTOMODING_USA, x_03_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_5BIT_4545, V18_AUTOMODING_USA, x_03_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_5BIT_4545, V18_AUTOMODING_USA, x_03_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_5BIT_4545, V18_AUTOMODING_USA, x_03_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -7676,8 +7692,8 @@ static int test_x_03(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -7715,12 +7731,12 @@ static int test_x_03(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 1;
 }
@@ -7730,16 +7746,16 @@ static void x_04_put_text_msg(void *user_data, const uint8_t *msg, int len)
 {
     if (user_data == NULL)
     {
-        strcat(result[0], (const char *) msg);
+        strcat(result[TESTER], (const char *) msg);
 printf("Unexpected ECHO received (%d) '%s'\n", len, msg);
         unexpected_echo = true;
     }
     else
     {
 printf("1-1 %d '%s'\n", len, msg);
-        strcat(result[1], (const char *) msg);
+        strcat(result[TUT], (const char *) msg);
         /* Echo each received character */
-        //v18_put(v18[1], msg, len);
+        //v18_put(v18[TUT], msg, len);
     }
 }
 /*- End of function --------------------------------------------------------*/
@@ -7775,23 +7791,23 @@ static int test_x_04(void)
                         assumed that the character conversion is the same for Baudot at 50 bit/s and any
                         other supported speed.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_5BIT_4545 | V18_MODE_REPETITIVE_SHIFTS_OPTION, V18_AUTOMODING_GLOBAL, x_04_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_5BIT_4545 | V18_MODE_REPETITIVE_SHIFTS_OPTION, V18_AUTOMODING_GLOBAL, x_04_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_5BIT_4545 | V18_MODE_REPETITIVE_SHIFTS_OPTION, V18_AUTOMODING_GLOBAL, x_04_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_5BIT_4545 | V18_MODE_REPETITIVE_SHIFTS_OPTION, V18_AUTOMODING_GLOBAL, x_04_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -7799,13 +7815,13 @@ static int test_x_04(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     unexpected_echo = false;
     for (i = 0;  i < 127;  i++)
         msg[i] = i + 1;
     msg[127] = '\0';
-    v18_put(v18[0], msg, 127);
+    v18_put(v18[TESTER], msg, 127);
 
     for (i = 0;  i < 2000;  i++)
     {
@@ -7843,16 +7859,16 @@ static int test_x_04(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
-    printf("Result:\n%s\n", result[0]);
-    printf("Result:\n%s\n", result[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
+    printf("Result:\n%s\n", result[TESTER]);
+    printf("Result:\n%s\n", result[TUT]);
     printf("Reference result:\n%s\n", full_baudot_rx);
-    if (unexpected_echo  ||  strcmp(result[1], full_baudot_rx) != 0)
+    if (unexpected_echo  ||  strcmp(result[TUT], full_baudot_rx) != 0)
         return -1;
     return 0;
 }
@@ -7864,14 +7880,14 @@ static void x_05_put_text_msg(void *user_data, const uint8_t *msg, int len)
     {
         /* Gather the received characters, which should be like the transmitted characters,
            but with the first three characters missing. */
-        strcat(result[0], (const char *) msg);
+        strcat(result[TESTER], (const char *) msg);
     }
     else
     {
         /* Receiving a character from the far end should block out its receiver
            for a while. If we send a stream of DTMF back, the first few characters
            (actually 3 for this particular text string) should be lost. */
-        v18_put(v18[1], "behknqtwz", 9);
+        v18_put(v18[TUT], "behknqtwz", 9);
     }
 }
 /*- End of function --------------------------------------------------------*/
@@ -7900,23 +7916,23 @@ static int test_x_05(void)
                         display will show when its receiver is re-enabled.
         Pass criteria:  The receiver should be re-enabled after 300ms.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_05_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_05_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_05_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_05_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -7924,10 +7940,10 @@ static int test_x_05(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     /* Sending a character should block out the receiver for a while */
-    v18_put(v18[0], "z", 1);
+    v18_put(v18[TESTER], "z", 1);
 
     for (i = 0;  i < 1000;  i++)
     {
@@ -7966,16 +7982,16 @@ static int test_x_05(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     ref = "knqtwz";
-    printf("Result:\n%s\n", result[0]);
+    printf("Result:\n%s\n", result[TESTER]);
     printf("Reference result:\n%s\n", ref);
-    if (strcmp(result[0], ref) != 0)
+    if (strcmp(result[TESTER], ref) != 0)
         return -1;
     return 0;
 }
@@ -7986,7 +8002,7 @@ static void x_06_put_text_msg(void *user_data, const uint8_t *msg, int len)
     if (user_data == NULL)
         ;
     else
-        strcat(result[1], (const char *) msg);
+        strcat(result[TUT], (const char *) msg);
 }
 /*- End of function --------------------------------------------------------*/
 
@@ -8018,23 +8034,23 @@ static int test_x_06(void)
                         receiving character from the TUT. It is assumed that the echo delay in the test
                         system is negligible.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_06_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_06_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_06_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_06_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -8042,12 +8058,12 @@ static int test_x_06(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 127;  i++)
         msg[i] = i + 1;
     msg[127] = '\0';
-    v18_put(v18[0], msg, 127);
+    v18_put(v18[TESTER], msg, 127);
 
     for (i = 0;  i < 10000;  i++)
     {
@@ -8086,19 +8102,19 @@ static int test_x_06(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
     ref = "\b \n\n\n?\n\n\n  !%+().+,-.0123456789:;(=)?"
           "XABCDEFGHIJKLMNOPQRSTUVWXYZ\xC6\xD8\xC5"
           " abcdefghijklmnopqrstuvwxyz\xE6\xF8\xE5 \b";
 
-    printf("Result:\n%s\n", result[0]);
+    printf("Result:\n%s\n", result[TESTER]);
     printf("Reference result:\n%s\n", ref);
-    v18_free(v18[0]);
-    v18_free(v18[1]);
-    if (strcmp(result[1], ref) != 0)
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
+    if (strcmp(result[TUT], ref) != 0)
         return -1;
     return 0;
 }
@@ -8135,23 +8151,23 @@ static int test_x_07(void)
                         3) The tester will confirm that 1 start bit and at least 1.5 stop bits are used.
         Comments:       The carrier should be maintained during the 300ms after a character.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_07_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_07_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_07_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_07_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -8159,8 +8175,8 @@ static int test_x_07(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -8198,12 +8214,12 @@ static int test_x_07(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 0;
 }
@@ -8236,23 +8252,23 @@ static int test_x_08(void)
                         2) The tester should confirm that 1 start bit, 7 data bits, 1 even parity bit and 2 stop
                            bits are used.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_08_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_08_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_08_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_08_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -8260,8 +8276,8 @@ static int test_x_08(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -8299,12 +8315,12 @@ static int test_x_08(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 0;
 }
@@ -8340,23 +8356,23 @@ static int test_x_09(void)
                            that there are no duplicate characters on the TUT display.
                         3) The received string should be correctly displayed despite the incorrect parity.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_09_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_09_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_09_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_09_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -8364,8 +8380,8 @@ static int test_x_09(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -8403,12 +8419,12 @@ static int test_x_09(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 0;
 }
@@ -8447,23 +8463,23 @@ static int test_x_10(void)
         Comments:       This test is only applicable to Minitel Dialogue terminals. Prestel and Minitel
                         Normal terminals cannot operate in this mode.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_10_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_10_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_10_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_10_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -8471,8 +8487,8 @@ static int test_x_10(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -8510,12 +8526,12 @@ static int test_x_10(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 0;
 }
@@ -8552,23 +8568,23 @@ static int test_x_11(void)
                         4) The last five characters on the TUT display should be "12345" (no "6")
                            correctly displayed despite the incorrect parity.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_11_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_11_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_11_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_11_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -8576,8 +8592,8 @@ static int test_x_11(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -8615,12 +8631,12 @@ static int test_x_11(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 0;
 }
@@ -8654,23 +8670,23 @@ static int test_x_12(void)
         Pass criteria:  The tester should confirm UTF8 encoded UNICODE characters are used with the
                         controls specified in ITU-T T.140.
      */
-    v18[0] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_12_put_text_msg, (void *) (intptr_t) 0);
-    logging = v18_get_logging_state(v18[0]);
+    v18[TESTER] = v18_init(NULL, true, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_12_put_text_msg, (void *) (intptr_t) 0);
+    logging = v18_get_logging_state(v18[TESTER]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "Tester");
-    v18[1] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_12_put_text_msg, (void *) (intptr_t) 1);
-    logging = v18_get_logging_state(v18[1]);
+    v18[TUT] = v18_init(NULL, false, V18_MODE_DTMF, V18_AUTOMODING_GLOBAL, x_12_put_text_msg, (void *) (intptr_t) 1);
+    logging = v18_get_logging_state(v18[TUT]);
     span_log_set_level(logging, SPAN_LOG_SHOW_SEVERITY | SPAN_LOG_SHOW_PROTOCOL | SPAN_LOG_FLOW);
     span_log_set_tag(logging, "TUT");
 
     if ((model = both_ways_line_model_init(line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe1,
+                                           echo_level_co1,
                                            line_model_no,
-                                           (float) noise_level,
-                                           -15.0f,
-                                           -15.0f,
+                                           noise_level,
+                                           echo_level_cpe2,
+                                           echo_level_co2,
                                            channel_codec,
                                            rbs_pattern)) == NULL)
     {
@@ -8678,8 +8694,8 @@ static int test_x_12(void)
         exit(2);
     }
 
-    result[0][0] =
-    result[1][0] = '\0';
+    result[TESTER][0] =
+    result[TUT][0] = '\0';
     for (i = 0;  i < 10000;  i++)
     {
         for (j = 0;  j < 2;  j++)
@@ -8717,12 +8733,12 @@ static int test_x_12(void)
         vec_copyi16(model_amp[0], amp[0], samples);
         vec_copyi16(model_amp[1], amp[1], samples);
 #endif
-        v18_rx(v18[0], model_amp[1], samples);
-        v18_rx(v18[1], model_amp[0], samples);
+        v18_rx(v18[TESTER], model_amp[1], samples);
+        v18_rx(v18[TUT], model_amp[0], samples);
     }
 
-    v18_free(v18[0]);
-    v18_free(v18[1]);
+    v18_free(v18[TESTER]);
+    v18_free(v18[TUT]);
     printf("Test not yet implemented\n");
     return 0;
 }
diff --git a/scripts/dialog-installer.sh b/scripts/dialog-installer.sh
new file mode 100644
index 0000000000..bf9d378367
--- /dev/null
+++ b/scripts/dialog-installer.sh
@@ -0,0 +1,273 @@
+#!/bin/sh
+# (C) 2016 Ken Rice <krice@freeswitch.org>
+# Licensed as per the MPL1.1
+#
+########################################################
+# TODO: FreeSWITCH AutoStart
+# TODO: Install on Raspbian
+# TODO: Allow Selection of Source or Package Install on Debian
+# dialog front-end (overridable through $DIALOG) and the scratch file the dialogs write their answers to
+DIALOG=${DIALOG=dialog}
+tempfile=`tempfile 2>/dev/null` || tempfile=/tmp/test$$
+trap "rm -f $tempfile" 0 1 2 5 15
+# NOTE(review): presumably supplies $ID, which the distro check near the end of the script tests -- confirm
+. /etc/os-release
+
+install_prereqs() {
+	# Install the tools this script drives (curl, dialog, git); stdout is discarded, errors stay on stderr.
+	echo "Making sure we have the prereqs for this script to run. Please Stand by..."
+	apt-get update >/dev/null
+	apt-get install -y curl dialog git >/dev/null
+}
+
+welcome_screen() {
+	# Intro message box describing what will be installed and which DNS and
+	# firewall prerequisites apply; pressing <ESC> aborts the installer.
+	$DIALOG --title "FreeSWITCH with LetsEncrypt AutoInstaller" --clear \
+		--msgbox "This Script will automatically Install FreeSWITCH \
+On your Debian 8 Jessie Machine, it will also install \
+Verto Communicator and use LetsEncrypt for the required \
+SSL Certificates needed for Proper WebRTC Communications.\n\n\
+Please keep in mind that you will need a proper DNS \
+Name pointed at this machine's public IP address along \
+with ports 80 and 443 opened on the firewall. \n\n\
+Additionally, you will need TCP ports 5060, 5061, 8081, \
+8082 and UDP ports 16384-32768 open on your firewall for \
+FreeSWITCH and Verto Communicator to function properly. \n\n\
+Press <Enter> to Continue or <ESC> to abort." 19 60
+
+	# dialog reports <ESC> as exit status 255; any other status continues.
+	case $? in
+		255) exit 1;;
+	esac
+}
+
+fs_ver_select() {
+	# Radiolist of branches; the chosen tag (1 or 2) is read back from
+	# $tempfile, which captures dialog's stderr. Sets FS_REV accordingly.
+	$DIALOG --backtitle "FreeSWITCH Version" \
+		--title "RADIOLIST BOX" --clear \
+		--radiolist "Which Version of FreeSWITCH are you installing? \n" 20 61 5 \
+		"1"  "FreeSWITCH 1.7" ON \
+		"2"  "FreeSWITCH 1.6" off 2> $tempfile
+	retval=$?
+	choice=$(cat $tempfile)
+
+	# Exit statuses 1 and 255 both abort the installer.
+	case $retval in
+		1|255) exit 1;;
+	esac
+
+	# Any other non-zero status leaves FS_REV untouched.
+	if [ "$retval" -eq 0 ]; then
+		case $choice in
+			1) FS_REV="master";;
+			2) FS_REV="1.6";;
+		esac
+	fi
+}
+
+get_network_settings() {
+	# Probe defaults from the host, let the user correct them in a form,
+	# and leave the confirmed values in FQDN, IPADDR and EMAIL.
+	FQDN=`hostname -f`
+	DOMAIN=`hostname -d`
+	IPADDR=`ifconfig | sed -En 's/127.0.0.1//;s/.*inet (addr:)?(([0-9]*\.){3}[0-9]*).*/\2/p'|tail -n 1`
+	EMAIL="hostmaster@$DOMAIN";
+
+	$DIALOG --title "System Setup Information" \
+		--form "\nVerify or correct the Fully Qualified Domain Name and IP Address of your machine.\nAlso enter a valid Email Address for system and LetsEncrypt email alerts" 25 60 16 \
+		"FQDN:" 1 1 "$FQDN" 1 25 25 40 \
+		"IP Address:" 2 1 "$IPADDR" 2 25 25 30 \
+		"Email Address:" 3 1 "$EMAIL" 3 25 25 40 \
+		2> $tempfile
+	retval=$?
+
+	# The status must be saved right after dialog: the substitutions below
+	# overwrite $?. Statuses 1 and 255 abort before the form output is read.
+	case $retval in
+		1|255) exit 1;;
+	esac
+
+	# The form output is read back one field per line, in display order.
+	FQDN=`head -n1 $tempfile`
+	IPADDR=`tail -n2 $tempfile|head -n1`
+	EMAIL=`tail -n1 $tempfile`
+}
+# Print $IPADDR when it starts with a 10., 192.168., 169.254. or 172.16.-172.31. prefix; print nothing otherwise.
+is_private_ip() {
+	PAT='^10\.|^192\.168\.|^169\.254\.|^172\.1[6-9]\.|^172\.2[0-9]\.|^172\.3[0-1]\.'
+	echo "$IPADDR" | grep -E "$PAT"
+}
+
+verify_ip_fqdn() {
+	# Compare $IPADDR with the public A record of $FQDN and set VIPFQDN:
+	#   0 = they match, 1 = continue without LetsEncrypt, 2 = abort.
+	# +short prints only the record data; the last line skips any CNAME hops.
+	DNSIP=`dig +short @4.2.2.2 "$FQDN" A | tail -n 1`
+
+	if [ "$IPADDR" = "$DNSIP" ]; then
+		VIPFQDN=0
+		return
+	fi
+
+	# Only a real mismatch gets the menu; a cancelled menu counts as abort.
+	$DIALOG --title "NO DNS For this FQDN" --clear \
+		--menu "The FQDN and IP Address do not match what is available in Public DNS Servers." 15 60 5 \
+	1 "Continue installation without LetsEncrypt." 2 "Abort Installation" 2> $tempfile
+	LE_CHOICE=`cat $tempfile`
+	if [ "x$LE_CHOICE" = "x1" ]; then VIPFQDN=1; else VIPFQDN=2; fi
+}
+# Add the FreeSWITCH archive key, then pick the apt repository and the git clone arguments ($REPO) for $FS_REV.
+config_fs_repos() {
+	curl https://files.freeswitch.org/repo/deb/debian/freeswitch_archive_g0.pub | apt-key add -
+	if [ "$FS_REV" = "master" ]; then
+		echo "deb http://files.freeswitch.org/repo/deb/debian-unstable/ jessie main" >/etc/apt/sources.list.d/freeswitch.list
+		REPO="https://freeswitch.org/stash/scm/fs/freeswitch.git"
+	elif [ "$FS_REV" = "1.6" ]; then
+		echo "deb http://files.freeswitch.org/repo/deb/freeswitch-1.6/ jessie main" > /etc/apt/sources.list.d/freeswitch.list
+		REPO="-b v1.6 https://freeswitch.org/stash/scm/fs/freeswitch.git"
+	fi
+	apt-get update >/dev/null
+}
+# Clone the FreeSWITCH tree into /usr/src/freeswitch.git, or hard-sync an existing checkout to its upstream branch.
+get_fs_source() {
+	if [ ! -d /usr/src/freeswitch.git ]; then
+		cd /usr/src
+		git clone $REPO freeswitch.git	# unquoted on purpose: $REPO may hold "-b <branch> <url>"
+	else
+		cd /usr/src/freeswitch.git
+		git clean -fdx
+		case $FS_REV in master) FS_BRANCH=master;; *) FS_BRANCH="v$FS_REV";; esac	# 1.6 is cloned from branch v1.6
+		git fetch origin && git reset --hard origin/$FS_BRANCH
+	fi
+}
+# Clone the letsencrypt client into /usr/src/letsencrypt, or clean and update the existing clone.
+get_letsencrypt() {
+	if [ ! -d /usr/src/letsencrypt ]; then
+		cd /usr/src
+		git clone https://github.com/letsencrypt/letsencrypt.git letsencrypt
+	else
+		cd /usr/src/letsencrypt || return 1	# never "git clean" in whatever directory the caller left us in
+		git clean -fdx && git pull
+	fi
+}
+
+install_certs() {
+	# Fetch or renew the LetsEncrypt certificate for $FQDN, then rebuild the
+	# combined wss.pem unless the existing certificate was left untouched.
+	get_letsencrypt
+	cd /usr/src/letsencrypt
+	NEED_CERTS_INSTALL=1
+
+	if [ ! -f /etc/letsencrypt/live/$FQDN/cert.pem ]; then
+		echo "Setting up LetsEncrypt and getting you some nice new Certs for this Server."
+		./letsencrypt-auto run -d $FQDN --email $EMAIL
+	elif openssl x509 -checkend 2592000 -noout -in /etc/letsencrypt/live/$FQDN/cert.pem; then
+		# -checkend 2592000: succeeds when the cert is still valid 30 days from now
+		echo "Skipping LetsEncrypt These Certs are good for atleast 30 days."
+		NEED_CERTS_INSTALL=0
+	else
+		echo "Renewing LetsEncrypt Certs as they will expire in the next 30 days."
+		./letsencrypt-auto renew
+	fi
+
+	# Nothing was issued or renewed: keep the installed wss.pem as it is.
+	if [ $NEED_CERTS_INSTALL -eq 0 ]; then
+		return 0
+	fi
+
+	# mkdir -p is a no-op when the certs directory already exists.
+	mkdir -p /usr/local/freeswitch/certs
+	# cert + private key + chain, concatenated into a single PEM file
+	cat /etc/letsencrypt/live/$FQDN/cert.pem /etc/letsencrypt/live/$FQDN/privkey.pem \
+		/etc/letsencrypt/live/$FQDN/chain.pem > /usr/local/freeswitch/certs/wss.pem
+}
+
+build_fs() {
+	get_fs_source
+
+	# Clear binaries left by an earlier source install (paths spelled out: plain sh has no brace expansion)
+	if [ -d /usr/local/freeswitch/bin ]; then
+		rm -rf /usr/local/freeswitch/bin/* /usr/local/freeswitch/mod/* /usr/local/freeswitch/lib/*
+	fi
+	cd /usr/src/freeswitch.git
+	./bootstrap.sh -j
+	./configure -C
+	make -j${JLIMIT:-$(nproc)} install	# a bare -j (JLIMIT unset) would run an unlimited number of jobs
+	make uhd-sounds-install
+	make uhd-moh-install
+}
+
+install_vc() {
+	# Build Verto Communicator from the FreeSWITCH source tree and publish it under /var/www/html/vc.
+	if [ ! -d /usr/src/freeswitch.git/html5/verto/verto_communicator ]; then
+		get_fs_source
+	fi
+	if [ ! -x /usr/sbin/apache2 ]; then
+		apt-get update >/dev/null
+		apt-get install -y apache2
+	fi
+
+	cd /usr/src/freeswitch.git/html5/verto/verto_communicator
+	apt-get update
+	apt-get install npm nodejs-legacy -y
+	npm install -g grunt grunt-cli bower
+	npm install
+	bower --allow-root install
+	grunt build
+	mkdir -p /var/www/html/vc && cp -a dist/. /var/www/html/vc/	# dist/. keeps re-runs from nesting vc/dist
+}
+
+# Install freeswitch-all, freeswitch-all-dbg and gdb; lines containing "Progress" are reduced to their digits and fed to a dialog gauge.
+freeswitch_debian_packages() {
+	apt-get install -o Dpkg::Progress=1 -y freeswitch-all freeswitch-all-dbg gdb 2>&1 | awk -W interactive '/Progress/ { print }'| \
+		sed -u 's/[^0-9]//g' | dialog --gauge "Please wait.\n Installing FreeSWITCH..." 10 70 0
+}
+# Install freeswitch-video-deps-most behind a dialog progress gauge, then build FreeSWITCH from source via build_fs.
+freeswitch_debian_source() {
+	apt-get install -o Dpkg::Progress=1 -y freeswitch-video-deps-most \
+		2>&1 | awk -W interactive '/Progress/ { print }'| sed -u 's/[^0-9]//g' | \
+		dialog --gauge "Please wait.\n Installing Build Requirements..." 10 70 0
+
+	build_fs
+}
+# Install the build packages listed below behind a dialog progress gauge. NOTE(review): never called in this script and does not run build_fs.
+freeswitch_raspbian_source() {
+	apt-get install -o Dpkg::Progress=1 -y autoconf automake devscripts gawk libjpeg-dev libncurses5-dev libtool-bin python-dev \
+		libtiff5-dev libperl-dev libgdbm-dev libdb-dev gettext libssl-dev libcurl4-openssl-dev libpcre3-dev libspeex-dev \
+		libspeexdsp-dev libsqlite3-dev libedit-dev libldns-dev libpq-dev libsndfile-dev libopus-dev liblua5.1-0-dev 2>&1 | \
+		awk -W interactive '/Progress/ { print }'| sed -u 's/[^0-9]//g' | dialog --gauge "Please wait.\n Installing Build Requirements..." 10 70 0
+
+}
+
+# install_prereqs  (disabled; NOTE(review): confirm curl, dialog and git are expected to be pre-installed)
+welcome_screen
+fs_ver_select
+get_network_settings
+
+if [ "$ID" = "debian" ]; then
+	config_fs_repos
+	freeswitch_debian_source
+elif [ "$ID" = "raspbian" ]; then
+	# TODO: call freeswitch_raspbian_source once the Raspbian build is wired up
+	JLIMIT="3"
+fi
+
+install_vc
+# is_private_ip echoes the address back only when it is private, so a mismatch means "public".
+PRIVIP=$(is_private_ip)
+if [ "x$PRIVIP" != "x$IPADDR" ]; then
+	verify_ip_fqdn
+	if [ $VIPFQDN -eq 2 ]; then
+		exit 1;
+	elif [ $VIPFQDN -eq 1 ]; then
+		echo "Skipping LetsEncrypt\n"
+	else
+		# install_certs fetches or updates the letsencrypt client itself (via get_letsencrypt)
+		install_certs
+	fi
+else
+	echo "Skipping LetsEncrypt. Since we are on Private IP Space";
+fi
+
diff --git a/scripts/perl/g729_activate b/scripts/perl/g729_activate
new file mode 100644
index 0000000000..d900a2e425
--- /dev/null
+++ b/scripts/perl/g729_activate
@@ -0,0 +1,39 @@
+#!/usr/bin/perl
+# Activate a G.729 licence code with /usr/bin/validator, unpack licences.zip and print the licence count.
+use IPC::Run qw( run );
+use ESL;
+
+if(!$ARGV[0]) {
+  print "$0 [licensecode]\n";
+  exit 0;
+}
+
+my $code = $ARGV[0];
+
+# The validator runs from here and licences.zip is looked up here, so a failed chdir is fatal.
+chdir("/data/tmp") or die "Cannot chdir to /data/tmp: $!\n";
+my $UNZIP;
+$UNZIP     = "/usr/bin/unzip"       if !(-x $UNZIP);
+$UNZIP     = "/usr/local/bin/unzip" if !(-x $UNZIP);
+
+# Input typed into the validator's pty: the code, an empty line, then "Y".
+my $in = "$code\n\nY\n";
+my ($out, $err, $output);
+
+my $activator = run ( ['/usr/bin/validator'],
+   	            '<pty<', \$in, '>pty>', \$output );
+
+if($output !~ m/Success/) {
+  print "Failed to activate.\n";
+  exit 1;
+} elsif(!(-f "licences.zip")) {
+  print "Activation reported Success but licences.zip was not found.\n";
+  exit 1;
+}
+run( [ $UNZIP, '-o', "-d", "/etc/freeswitch", 'licences.zip'], \$in, \$out, \$err );
+run( [ '/usr/bin/pkill', '-HUP', 'freeswitch_licence_server'], \$in, \$out, \$err );
+my $c = new ESL::ESLconnection("localhost", "8021", "ClueCon");
+my $e = $c->sendRecv('api g729_count');
+my $count = $e->getBody();
+print "Success, License count: $count\n";
+exit 0;
diff --git a/scripts/perl/timezone-gen.pl b/scripts/perl/timezone-gen.pl
index 6127005062..e812023ef0 100755
--- a/scripts/perl/timezone-gen.pl
+++ b/scripts/perl/timezone-gen.pl
@@ -2,6 +2,9 @@
 
 use strict;
 use Getopt::Long;
+use XML::Entities;
+use HTML::Entities;
+
 
 my $base   = "/usr/share/zoneinfo";
 my $output = "timezones.conf.xml";
@@ -70,7 +73,7 @@ print $out " " x 4, "<timezones>\n";
 
 my $lastprefix = "";
 foreach my $zone ( sort( keys(%zones) ) ) {
-    my $str = $zones{$zone};
+    my $str = encode_entities($zones{$zone});
     next if ( !$str );
 
     my $newprefix = $zone;
diff --git a/src/include/switch_core.h b/src/include/switch_core.h
index 16669672e8..7a0a0e49ad 100644
--- a/src/include/switch_core.h
+++ b/src/include/switch_core.h
@@ -1443,18 +1443,18 @@ SWITCH_DECLARE(void *) switch_core_hash_delete(_In_ switch_hash_t *hash, _In_z_
   \param hash the hash to delete from
   \param key the key from which to delete the data
   \param mutex optional mutex to lock
-  \return SWITCH_STATUS_SUCCESS if the data is deleted
+  \return a pointer to the deleted data
 */
-SWITCH_DECLARE(switch_status_t) switch_core_hash_delete_locked(_In_ switch_hash_t *hash, _In_z_ const char *key, _In_opt_ switch_mutex_t *mutex);
+SWITCH_DECLARE(void *) switch_core_hash_delete_locked(_In_ switch_hash_t *hash, _In_z_ const char *key, _In_opt_ switch_mutex_t *mutex);
 
 /*! 
   \brief Delete data from a hash based on desired key
   \param hash the hash to delete from
   \param key the key from which to delete the data
   \param mutex optional rwlock to wrlock
-  \return SWITCH_STATUS_SUCCESS if the data is deleted
+  \return a pointer to the deleted data
 */
-SWITCH_DECLARE(switch_status_t) switch_core_hash_delete_wrlock(_In_ switch_hash_t *hash, _In_z_ const char *key, _In_opt_ switch_thread_rwlock_t *rwlock);
+SWITCH_DECLARE(void *) switch_core_hash_delete_wrlock(_In_ switch_hash_t *hash, _In_z_ const char *key, _In_opt_ switch_thread_rwlock_t *rwlock);
 
 /*! 
   \brief Delete data from a hash based on callback function
@@ -1929,8 +1929,10 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_get_string(_In_ switch_file_han
 */
 SWITCH_DECLARE(switch_status_t) switch_core_file_close(_In_ switch_file_handle_t *fh);
 
+SWITCH_DECLARE(switch_status_t) switch_core_file_command(switch_file_handle_t *fh, switch_file_command_t command);
+
 SWITCH_DECLARE(switch_status_t) switch_core_file_truncate(switch_file_handle_t *fh, int64_t offset);
-SWITCH_DECLARE(switch_bool_t) switch_core_file_has_video(switch_file_handle_t *fh);
+SWITCH_DECLARE(switch_bool_t) switch_core_file_has_video(switch_file_handle_t *fh, switch_bool_t CHECK_OPEN);
 
 
 ///\}
@@ -2735,6 +2737,8 @@ SWITCH_DECLARE(const char *)switch_version_revision_human(void);
 SWITCH_DECLARE(const char *)switch_version_full(void);
 SWITCH_DECLARE(const char *)switch_version_full_human(void);
 
+SWITCH_DECLARE(void) switch_core_autobind_cpu(void);
+
 SWITCH_END_EXTERN_C
 #endif
 /* For Emacs:
diff --git a/src/include/switch_core_media.h b/src/include/switch_core_media.h
index c021d0a90e..ad5028b0b1 100644
--- a/src/include/switch_core_media.h
+++ b/src/include/switch_core_media.h
@@ -39,6 +39,7 @@
 SWITCH_BEGIN_EXTERN_C
 
 #define SWITCH_MAX_CAND_ACL 25
+#define SWITCH_NO_CRYPTO_TAG -1
 
 typedef enum {
 	DTMF_2833,
@@ -168,6 +169,8 @@ typedef struct switch_core_media_params_s {
 	uint32_t video_key_freq;
 	uint32_t video_key_first;
 
+	switch_thread_t *video_write_thread;
+
 } switch_core_media_params_t;
 
 static inline const char *switch_media_type2str(switch_media_type_t type)
@@ -253,6 +256,7 @@ SWITCH_DECLARE(void) switch_core_media_set_telephony_event(switch_core_session_t
 SWITCH_DECLARE(void) switch_core_media_set_telephony_recv_event(switch_core_session_t *session, switch_media_type_t type, switch_payload_t te);
 SWITCH_DECLARE(switch_rtp_stats_t *) switch_core_media_stats(switch_core_session_t *session, switch_media_type_t type, switch_memory_pool_t *pool);
 SWITCH_DECLARE(switch_status_t) switch_core_media_udptl_mode(switch_core_session_t *session, switch_media_type_t type);
+SWITCH_DECLARE(switch_bool_t) switch_core_media_check_udptl_mode(switch_core_session_t *session, switch_media_type_t type);
 
 SWITCH_DECLARE(void) switch_core_media_set_rtp_flag(switch_core_session_t *session, switch_media_type_t type, switch_rtp_flag_t flag);
 SWITCH_DECLARE(void) switch_core_media_clear_rtp_flag(switch_core_session_t *session, switch_media_type_t type, switch_rtp_flag_t flag);
@@ -337,12 +341,15 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_read_lock_unlock(switch_core_s
 SWITCH_DECLARE(void) switch_core_session_stop_media(switch_core_session_t *session);
 SWITCH_DECLARE(switch_media_flow_t) switch_core_session_media_flow(switch_core_session_t *session, switch_media_type_t type);
 SWITCH_DECLARE(switch_status_t) switch_core_media_get_vid_params(switch_core_session_t *session, switch_vid_params_t *vid_params);
+SWITCH_DECLARE(switch_status_t) switch_core_media_lock_video_file(switch_core_session_t *session, switch_rw_t rw);
+SWITCH_DECLARE(switch_status_t) switch_core_media_unlock_video_file(switch_core_session_t *session, switch_rw_t rw);
 SWITCH_DECLARE(switch_status_t) switch_core_media_set_video_file(switch_core_session_t *session, switch_file_handle_t *fh, switch_rw_t rw);
 SWITCH_DECLARE(switch_file_handle_t *) switch_core_media_get_video_file(switch_core_session_t *session, switch_rw_t rw);
 SWITCH_DECLARE(switch_bool_t) switch_core_session_in_video_thread(switch_core_session_t *session);
 SWITCH_DECLARE(switch_bool_t) switch_core_media_check_dtls(switch_core_session_t *session, switch_media_type_t type);
 SWITCH_DECLARE(switch_status_t) switch_core_media_set_outgoing_bitrate(switch_core_session_t *session, switch_media_type_t type, uint32_t bitrate);
 SWITCH_DECLARE(switch_status_t) switch_core_media_reset_jb(switch_core_session_t *session, switch_media_type_t type);
+SWITCH_DECLARE(switch_status_t) switch_core_session_wait_for_video_input_params(switch_core_session_t *session, uint32_t timeout_ms);
 																
 SWITCH_END_EXTERN_C
 #endif
diff --git a/src/include/switch_core_video.h b/src/include/switch_core_video.h
index 33413cd6f4..00b9de4175 100644
--- a/src/include/switch_core_video.h
+++ b/src/include/switch_core_video.h
@@ -330,16 +330,39 @@ SWITCH_DECLARE(void) switch_png_free(switch_png_t **pngP);
 * \param[in]    img       The small Image descriptor
 * \param[in]    x         Leftmost pos
 * \param[in]    y         Topmost pos
-* \param[in]    alpha     Alaha value from 0(completely transparent) to 255(opaque)
+* \param[in]    percent   Alpha value from 0(completely transparent) to 100(opaque)
 */
-SWITCH_DECLARE(void) switch_img_overlay(switch_image_t *IMG, switch_image_t *img, int x, int y, uint8_t alpha);
+SWITCH_DECLARE(void) switch_img_overlay(switch_image_t *IMG, switch_image_t *img, int x, int y, uint8_t percent);
 
 SWITCH_DECLARE(switch_status_t) switch_img_scale(switch_image_t *src, switch_image_t **destP, int width, int height);
 SWITCH_DECLARE(switch_status_t) switch_img_fit(switch_image_t **srcP, int width, int height, switch_img_fit_t fit);
 SWITCH_DECLARE(switch_img_position_t) parse_img_position(const char *name);
 SWITCH_DECLARE(switch_img_fit_t) parse_img_fit(const char *name);
 SWITCH_DECLARE(void) switch_img_find_position(switch_img_position_t pos, int sw, int sh, int iw, int ih, int *xP, int *yP);
-SWITCH_DECLARE(switch_status_t) switch_img_convert(switch_image_t *src, switch_convert_fmt_t fmt, void *dest, switch_size_t *size);
+
+/*!\brief convert img to raw format
+*
+* dest should be pre-allocated and big enough for the target fmt
+*
+* \param[in]    src       The image descriptor
+* \param[in]    dest      The target memory address
+* \param[in]    size      The size of target memory address used for bounds check
+* \param[in]    fmt       The target format
+*/
+SWITCH_DECLARE(switch_status_t) switch_img_to_raw(switch_image_t *src, void *dest, switch_size_t size, switch_img_fmt_t fmt);
+/*!\brief convert raw memory to switch_img_t
+*
+* if dest is NULL then a new img is created, user should destroy it later,
+* otherwise the dest img is re-used, and its size must match the src width and height;
+* width and height can be 0 in the latter case, and they are then taken from the dest img
+*
+* \param[in]    dest      The image descriptor
+* \param[in]    src       The raw data memory address
+* \param[in]    fmt       The raw data format
+* \param[in]    width     The raw data width
+* \param[in]    height    The raw data height
+*/
+SWITCH_DECLARE(switch_status_t) switch_img_from_raw(switch_image_t *dest, void *src, switch_img_fmt_t fmt, int width, int height);
 SWITCH_DECLARE(switch_image_t *) switch_img_write_text_img(int w, int h, switch_bool_t full, const char *text);
 
 SWITCH_DECLARE(switch_image_t *) switch_img_read_file(const char* file_name);
diff --git a/src/include/switch_event.h b/src/include/switch_event.h
index d13af7d471..fc713fafff 100644
--- a/src/include/switch_event.h
+++ b/src/include/switch_event.h
@@ -427,7 +427,7 @@ SWITCH_DECLARE(void) switch_json_add_presence_data_cols(switch_event_t *event, c
 
 SWITCH_DECLARE(void) switch_event_launch_dispatch_threads(uint32_t max);
 
-SWITCH_DECLARE(uint32_t) switch_event_channel_broadcast(const char *event_channel, cJSON **json, const char *key, switch_event_channel_id_t id);
+SWITCH_DECLARE(switch_status_t) switch_event_channel_broadcast(const char *event_channel, cJSON **json, const char *key, switch_event_channel_id_t id);
 SWITCH_DECLARE(uint32_t) switch_event_channel_unbind(const char *event_channel, switch_event_channel_func_t func);
 SWITCH_DECLARE(switch_status_t) switch_event_channel_bind(const char *event_channel, switch_event_channel_func_t func, switch_event_channel_id_t *id);
 														  
diff --git a/src/include/switch_image.h b/src/include/switch_image.h
index c06d35101c..7958c69806 100644
--- a/src/include/switch_image.h
+++ b/src/include/switch_image.h
@@ -28,7 +28,7 @@ extern "C" {
    * types, removing or reassigning enums, adding/removing/rearranging
    * fields to structures
    */
-#define VPX_IMAGE_ABI_VERSION (3) /**<\hideinitializer*/
+#define VPX_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/
 
 
 #define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format. */
@@ -78,10 +78,17 @@ extern "C" {
     VPX_CS_SRGB       = 7   /**< sRGB */
   } vpx_color_space_t; /**< alias for enum vpx_color_space */
 
+  /*!\brief List of supported color range */
+  typedef enum vpx_color_range {
+    VPX_CR_STUDIO_RANGE = 0,    /**< Y [16..235], UV [16..240] */
+    VPX_CR_FULL_RANGE   = 1     /**< YUV/RGB [0..255] */
+  } vpx_color_range_t; /**< alias for enum vpx_color_range */
+
   /**\brief Image Descriptor */
   typedef struct vpx_image {
     vpx_img_fmt_t fmt; /**< Image Format */
     vpx_color_space_t cs; /**< Color Space */
+    vpx_color_range_t range; /**< Color Range */
 
     /* Image storage dimensions */
     unsigned int  w;           /**< Stored image width */
@@ -92,6 +99,10 @@ extern "C" {
     unsigned int  d_w;   /**< Displayed image width */
     unsigned int  d_h;   /**< Displayed image height */
 
+    /* Image intended rendering dimensions */
+    unsigned int  r_w;   /**< Intended rendering image width */
+    unsigned int  r_h;   /**< Intended rendering image height */
+
     /* Chroma subsampling info */
     unsigned int  x_chroma_shift;   /**< subsampling order, X */
     unsigned int  y_chroma_shift;   /**< subsampling order, Y */
diff --git a/src/include/switch_module_interfaces.h b/src/include/switch_module_interfaces.h
index 1d31bc8b88..04054f6cd4 100644
--- a/src/include/switch_module_interfaces.h
+++ b/src/include/switch_module_interfaces.h
@@ -289,6 +289,8 @@ struct switch_file_interface {
 	switch_status_t (*file_set_string) (switch_file_handle_t *fh, switch_audio_col_t col, const char *string);
 	/*! function to get meta data */
 	switch_status_t (*file_get_string) (switch_file_handle_t *fh, switch_audio_col_t col, const char **string);
+	/*! function to control the underlying tech of the file  */
+	switch_status_t (*file_command) (switch_file_handle_t *fh, switch_file_command_t command);
 	/*! list of supported file extensions */
 	char **extens;
 	switch_thread_rwlock_t *rwlock;
@@ -299,10 +301,10 @@ struct switch_file_interface {
 };
 
 typedef enum {
-	SWITCH_VIDEO_ENCODE_SPEED_DEFAULT,
-	SWITCH_VIDEO_ENCODE_SPEED_SLOW,
+	SWITCH_VIDEO_ENCODE_SPEED_DEFAULT = 0,
+	SWITCH_VIDEO_ENCODE_SPEED_FAST = 0,
 	SWITCH_VIDEO_ENCODE_SPEED_MEDIUM,
-	SWITCH_VIDEO_ENCODE_SPEED_FAST
+	SWITCH_VIDEO_ENCODE_SPEED_SLOW
 } switch_video_encode_speed_t;
 
 typedef enum {
@@ -390,6 +392,7 @@ struct switch_file_handle {
 	char *stream_name;
 	char *modname;
 	switch_mm_t mm;
+	switch_mutex_t *flag_mutex;
 };
 
 /*! \brief Abstract interface to an asr module */
diff --git a/src/include/switch_rtp.h b/src/include/switch_rtp.h
index 3fdd1ed325..f222ce028c 100644
--- a/src/include/switch_rtp.h
+++ b/src/include/switch_rtp.h
@@ -51,7 +51,7 @@ SWITCH_BEGIN_EXTERN_C
 
 typedef struct {
 	switch_rtp_hdr_t header;
-	char body[SWITCH_RTP_MAX_BUF_LEN];
+	char body[SWITCH_RTP_MAX_BUF_LEN+4+sizeof(char *)];
 } switch_rtp_packet_t;
 
 typedef enum {
diff --git a/src/include/switch_types.h b/src/include/switch_types.h
index 1f2005dba9..f740b94932 100644
--- a/src/include/switch_types.h
+++ b/src/include/switch_types.h
@@ -1285,7 +1285,7 @@ typedef enum {
   \brief Channel States (these are the defaults, CS_SOFT_EXECUTE, CS_EXCHANGE_MEDIA, and CS_CONSUME_MEDIA are often overridden by specific apps)
 <pre>
 CS_NEW       - Channel is newly created.
-CS_INIT      - Channel has been initilized.
+CS_INIT      - Channel has been initialized.
 CS_ROUTING   - Channel is looking for an extension to execute.
 CS_SOFT_EXECUTE  - Channel is ready to execute from 3rd party control.
 CS_EXECUTE   - Channel is executing it's dialplan.
@@ -1296,7 +1296,7 @@ CS_HIBERNATE - Channel is in a sleep state.
 CS_RESET 	 - Channel is in a reset state.
 CS_HANGUP    - Channel is flagged for hangup and ready to end.
 CS_REPORTING - Channel is ready to collect call detail.
-CS_DESTROY      - Channel is ready to be destroyed and out of the state machine
+CS_DESTROY      - Channel is ready to be destroyed and out of the state machine.
 </pre>
  */
 typedef enum {
@@ -1491,6 +1491,7 @@ typedef enum {
 	CF_VIDEO_BITRATE_UNMANAGABLE,
 	CF_VIDEO_ECHO,
 	CF_VIDEO_BLANK,
+	CF_VIDEO_WRITING,
 	CF_SLA_INTERCEPT,
 	CF_VIDEO_BREAK,
 	CF_AUDIO_PAUSE,
@@ -1517,8 +1518,9 @@ typedef enum {
 } switch_channel_flag_t;
 
 typedef struct switch_vid_params_s {
-	int width;
-	int height;
+	uint32_t width;
+	uint32_t height;
+	uint32_t fps;
 } switch_vid_params_t;
 
 
@@ -1811,7 +1813,8 @@ typedef enum {
 	SWITCH_FILE_WRITE_OVER = (1 << 16),
 	SWITCH_FILE_NOMUX = (1 << 17),
 	SWITCH_FILE_BREAK_ON_CHANGE = (1 << 18),
-	SWITCH_FILE_FLAG_VIDEO = (1 << 19)
+	SWITCH_FILE_FLAG_VIDEO = (1 << 19),
+	SWITCH_FILE_FLAG_VIDEO_EOF = (1 << 20)
 } switch_file_flag_enum_t;
 typedef uint32_t switch_file_flag_t;
 
@@ -2595,6 +2598,10 @@ typedef enum {
 	SPY_DUAL_CROP
 } switch_vid_spy_fmt_t;
 
+typedef enum {
+	SCFC_FLUSH_AUDIO
+} switch_file_command_t;
+
 SWITCH_END_EXTERN_C
 #endif
 /* For Emacs:
diff --git a/src/include/switch_utils.h b/src/include/switch_utils.h
index 9b47911e10..691395f362 100644
--- a/src/include/switch_utils.h
+++ b/src/include/switch_utils.h
@@ -635,17 +635,17 @@ SWITCH_DECLARE(unsigned char) switch_char_to_rfc2833(char key);
   \param obj the object to set the flags on
   \param flag the or'd list of flags to set
 */
-#define switch_set_flag_locked(obj, flag) assert(obj->flag_mutex != NULL);\
-switch_mutex_lock(obj->flag_mutex);\
+#define switch_set_flag_locked(obj, flag) assert((obj)->flag_mutex != NULL); \
+switch_mutex_lock((obj)->flag_mutex);								\
 (obj)->flags |= (flag);\
-switch_mutex_unlock(obj->flag_mutex);
+switch_mutex_unlock((obj)->flag_mutex);
 
 /*!
   \brief Clear a flag on an arbitrary object
   \param obj the object to test
   \param flag the or'd list of flags to clear
 */
-#define switch_clear_flag_locked(obj, flag) switch_mutex_lock(obj->flag_mutex); (obj)->flags &= ~(flag); switch_mutex_unlock(obj->flag_mutex);
+#define switch_clear_flag_locked(obj, flag) switch_mutex_lock((obj)->flag_mutex); (obj)->flags &= ~(flag); switch_mutex_unlock((obj)->flag_mutex);
 
 /*!
   \brief Clear a flag on an arbitrary object while locked
diff --git a/src/include/switch_vpx.h b/src/include/switch_vpx.h
index a975775a7a..f2791678f7 100644
--- a/src/include/switch_vpx.h
+++ b/src/include/switch_vpx.h
@@ -59,9 +59,33 @@ SWITCH_BEGIN_EXTERN_C
 #define VPX_IMG_FMT_HIGH         0x800  /**< Image uses 16bit framebuffer */
 #endif
 
-#define SWITCH_IMG_FMT_HIGH      VPX_IMG_FMT_HIGH
-#define SWITCH_IMG_FMT_I420	     VPX_IMG_FMT_I420
-#define SWITCH_IMG_FMT_ARGB	     VPX_IMG_FMT_ARGB
+#define SWITCH_IMG_FMT_NONE      VPX_IMG_FMT_NONE
+#define SWITCH_IMG_FMT_RGB24     VPX_IMG_FMT_RGB24
+#define SWITCH_IMG_FMT_RGB32     VPX_IMG_FMT_RGB32
+#define SWITCH_IMG_FMT_RGB565    VPX_IMG_FMT_RGB565
+#define SWITCH_IMG_FMT_RGB555    VPX_IMG_FMT_RGB555
+#define SWITCH_IMG_FMT_UYVY      VPX_IMG_FMT_UYVY
+#define SWITCH_IMG_FMT_YUY2      VPX_IMG_FMT_YUY2
+#define SWITCH_IMG_FMT_YVYU      VPX_IMG_FMT_YVYU
+#define SWITCH_IMG_FMT_BGR24     VPX_IMG_FMT_BGR24
+#define SWITCH_IMG_FMT_RGB32_LE  VPX_IMG_FMT_RGB32_LE
+#define SWITCH_IMG_FMT_ARGB      VPX_IMG_FMT_ARGB
+#define SWITCH_IMG_FMT_ARGB_LE   VPX_IMG_FMT_ARGB_LE
+#define SWITCH_IMG_FMT_RGB565_LE VPX_IMG_FMT_RGB565_LE
+#define SWITCH_IMG_FMT_RGB555_LE VPX_IMG_FMT_RGB555_LE
+#define SWITCH_IMG_FMT_YV12      VPX_IMG_FMT_YV12
+#define SWITCH_IMG_FMT_I420      VPX_IMG_FMT_I420
+#define SWITCH_IMG_FMT_VPXYV12   VPX_IMG_FMT_VPXYV12
+#define SWITCH_IMG_FMT_VPXI420   VPX_IMG_FMT_VPXI420
+#define SWITCH_IMG_FMT_I422      VPX_IMG_FMT_I422
+#define SWITCH_IMG_FMT_I444      VPX_IMG_FMT_I444
+#define SWITCH_IMG_FMT_I440      VPX_IMG_FMT_I440
+#define SWITCH_IMG_FMT_444A      VPX_IMG_FMT_444A
+#define SWITCH_IMG_FMT_I42016    VPX_IMG_FMT_I42016
+#define SWITCH_IMG_FMT_I42216    VPX_IMG_FMT_I42216
+#define SWITCH_IMG_FMT_I44416    VPX_IMG_FMT_I44416
+#define SWITCH_IMG_FMT_I44016    VPX_IMG_FMT_I44016
+/* experimental */
 #define SWITCH_IMG_FMT_GD	     VPX_IMG_FMT_NONE
 
 typedef vpx_img_fmt_t switch_img_fmt_t;
diff --git a/src/mod/Makefile.am b/src/mod/Makefile.am
index f2c8dbeee5..36ed523311 100644
--- a/src/mod/Makefile.am
+++ b/src/mod/Makefile.am
@@ -5,37 +5,44 @@ clean: $(OUR_CLEAN_MODULES) $(OUR_DISABLED_CLEAN_MODULES)
 install: $(OUR_INSTALL_MODULES)
 uninstall: $(OUR_UNINSTALL_MODULES) $(OUR_DISABLED_UNINSTALL_MODULES)
 
+mod_skypopen-all: mod_gsmopen-all
+mod_gsmopen-all: mod_spandsp-all
+mod_unimrcp-all: mod_sofia-all
+
 $(OUR_MODULES) $(OUR_CLEAN_MODULES) $(OUR_INSTALL_MODULES) $(OUR_UNINSTALL_MODULES) $(OUR_DISABLED_MODULES) $(OUR_DISABLED_CLEAN_MODULES) $(OUR_DISABLED_INSTALL_MODULES) $(OUR_DISABLED_UNINSTALL_MODULES):
 	@set fnord $$MAKEFLAGS; amf=$$2; \
 	target=`echo $@ | sed -e 's|^.*-||'`; \
 	modname=`echo $@ | sed -e 's|-.*||' | sed -e 's|^.*/||'`; \
-	confmoddir=`cat $(switch_builddir)/modules.conf | sed -e 's| ||' | grep $$modname$$ | sed -e 's|#||' | head -n 1`; \
-	if test -z "$$confmoddir" ; then \
-		moddir=$@ ; \
-		buildmoddir=$(switch_builddir)/src/mod/$@ ;\
-	else  \
-		if test -d  "$(switch_srcdir)/src/mod/$$confmoddir" ; then \
-			moddir="$(switch_srcdir)/src/mod/$$confmoddir" ; \
-			buildmoddir="$(switch_builddir)/src/mod/$$confmoddir" ; \
-		else \
-			moddir="$$confmoddir" ; \
-			buildmoddir="$(switch_builddir)/src/mod/$$confmoddir" ; \
+	enabled=`echo $(CONF_MODULES) | grep -w $$modname`; \
+	if ! test -z "$$enabled"; then \
+		confmoddir=`cat $(switch_builddir)/modules.conf | sed -e 's| ||' | grep $$modname$$ | sed -e 's|#||' | head -n 1`; \
+		if test -z "$$confmoddir" ; then \
+			moddir=$@ ; \
+			buildmoddir=$(switch_builddir)/src/mod/$@ ;\
+		else  \
+			if test -d  "$(switch_srcdir)/src/mod/$$confmoddir" ; then \
+				moddir="$(switch_srcdir)/src/mod/$$confmoddir" ; \
+				buildmoddir="$(switch_builddir)/src/mod/$$confmoddir" ; \
+			else \
+				moddir="$$confmoddir" ; \
+				buildmoddir="$(switch_builddir)/src/mod/$$confmoddir" ; \
+			fi ; \
 		fi ; \
-	fi ; \
-	if test -z "$$target" ; then target="all" ; fi ; \
-	if ! test -f $$moddir/$$modname.c && ! test -f $$moddir/$$modname.cpp && test $$modname != "mod_com_g729" ;	\
-	then echo ; echo "WARNING $$modname is not a valid FreeSWITCH module dir, skipping it..." ; else \
-		echo ;\
-		echo making $$target $$modname ;\
-		test -d "$$buildmoddir" || mkdir -p $$buildmoddir ; \
-		(if test -f "$$moddir/Makefile" ; then \
-			test -f "$$buildmoddir/Makefile" || cp $$moddir/Makefile $$buildmoddir/Makefile ; \
-			cd $$buildmoddir && MODDIR=$$moddir MODNAME=$$modname BASE=$(switch_builddir) $(MAKE) $(AM_MAKEFLAGS) $$target; \
-		else\
-			cd $$buildmoddir && MODDIR=$$moddir MODNAME=$$modname BASE=$(switch_builddir) $(MAKE) $(AM_MAKEFLAGS) -f $(switch_builddir)/build/modmake.rules $$target ;\
-		fi;) || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
-	fi; \
-	test -z "$$fail" ;
+		if test -z "$$target" ; then target="all" ; fi ; \
+		if ! test -f $$moddir/$$modname.c && ! test -f $$moddir/$$modname.cpp && test $$modname != "mod_com_g729" ;	\
+		then echo ; echo "WARNING $$modname is not a valid FreeSWITCH module dir, skipping it..." ; else \
+			echo ;\
+			echo making $$target $$modname ;\
+			test -d "$$buildmoddir" || mkdir -p $$buildmoddir ; \
+			(if test -f "$$moddir/Makefile" ; then \
+				test -f "$$buildmoddir/Makefile" || cp $$moddir/Makefile $$buildmoddir/Makefile ; \
+				cd $$buildmoddir && MODDIR=$$moddir MODNAME=$$modname BASE=$(switch_builddir) $(MAKE) $(AM_MAKEFLAGS) $$target; \
+			else\
+				cd $$buildmoddir && MODDIR=$$moddir MODNAME=$$modname BASE=$(switch_builddir) $(MAKE) $(AM_MAKEFLAGS) -f $(switch_builddir)/build/modmake.rules $$target ;\
+			fi;) || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+		fi; \
+		test -z "$$fail" ; \
+	fi;
 
 mod_com_g729-activate:
 	cd $(switch_builddir)/src/mod/codecs/mod_com_g729 && $(MAKE) $(AM_MAKEFLAGS) activate
diff --git a/src/mod/applications/mod_av/Makefile.am b/src/mod/applications/mod_av/Makefile.am
index cd5ffdb107..261985c7a9 100644
--- a/src/mod/applications/mod_av/Makefile.am
+++ b/src/mod/applications/mod_av/Makefile.am
@@ -1,8 +1,6 @@
 include $(top_srcdir)/build/modmake.rulesam
 MODNAME=mod_av
 
-if HAVE_YUV
-if HAVE_VPX
 if HAVE_AVFORMAT
 
 mod_LTLIBRARIES = mod_av.la
@@ -17,19 +15,3 @@ all: error
 error:
 	$(error You must install libavformat-dev to build mod_av)
 endif
-
-else
-install: error
-all: error
-error:
-	$(error You must install libvpx2-dev to build mod_av)
-endif
-
-
-else
-install: error
-all: error
-error:
-	$(error You must install libyuv-dev to build mod_av)
-endif
-
diff --git a/src/mod/applications/mod_av/avcodec.c b/src/mod/applications/mod_av/avcodec.c
index 5349e11708..ac9634fdcf 100644
--- a/src/mod/applications/mod_av/avcodec.c
+++ b/src/mod/applications/mod_av/avcodec.c
@@ -766,8 +766,8 @@ static switch_status_t consume_nalu(h264_codec_context_t *context, switch_frame_
 			return SWITCH_STATUS_MORE_DATA;
 		}
 
-		frame->m = 1;
-		return SWITCH_STATUS_SUCCESS;
+		frame->m = context->nalus[context->nalu_current_index].len ? 0 : 1;
+		return frame->m ? SWITCH_STATUS_SUCCESS : SWITCH_STATUS_MORE_DATA;
 	} else {
 		uint8_t nalu_hdr = *(uint8_t *)(nalu->start);
 		uint8_t nri = nalu_hdr & 0x60;
@@ -843,17 +843,19 @@ static switch_status_t open_encoder(h264_codec_context_t *context, uint32_t widt
 	}
 
 	if (context->codec_settings.video.bandwidth) {
-		context->bandwidth = context->codec_settings.video.bandwidth * 8;
+		context->bandwidth = context->codec_settings.video.bandwidth;
 	} else {
-		context->bandwidth = switch_calc_bitrate(context->codec_settings.video.width, context->codec_settings.video.height, 1, 15) * 8;
+		context->bandwidth = switch_calc_bitrate(context->codec_settings.video.width, context->codec_settings.video.height, 1, 15);
 	}
 
 	sane = switch_calc_bitrate(1920, 1080, 2, 30);
 
-	if (context->bandwidth / 8 > sane) {
+	if (context->bandwidth > sane) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "BITRATE TRUNCATED TO %d\n", sane);
-		context->bandwidth = sane * 8;
+		context->bandwidth = sane;
 	}
+
+	context->bandwidth *= 3;
 	
 	//context->encoder_ctx->bit_rate = context->bandwidth * 1024;
 	context->encoder_ctx->width = context->codec_settings.video.width;
@@ -1098,7 +1100,8 @@ static switch_status_t switch_h264_encode(switch_codec_t *codec, switch_frame_t
 		}
 	}
 
-	if (*got_output) { // Could be more delayed frames
+#if 0
+	if (*got_output) { // TODO: Could be more delayed frames, flush when frame == NULL
 		ret = avcodec_encode_video2(avctx, pkt, NULL, got_output);
 		if (ret < 0) {
 			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Encoding Error %d\n", ret);
@@ -1110,6 +1113,7 @@ static switch_status_t switch_h264_encode(switch_codec_t *codec, switch_frame_t
 			goto process;
 		}
 	}
+#endif
 
 	fill_avframe(avframe, img);
 
@@ -1136,12 +1140,14 @@ static switch_status_t switch_h264_encode(switch_codec_t *codec, switch_frame_t
 		context->need_key_frame = 0;
 	}
 
-process:
+// process:
 
 	if (*got_output) {
 		const uint8_t *p = pkt->data;
 		int i = 0;
 
+		*got_output = 0;
+
 		if (context->av_codec_id == AV_CODEC_ID_H263) {
 			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG5, "Encoded frame %" SWITCH_INT64_T_FMT " (size=%5d) [0x%02x 0x%02x 0x%02x 0x%02x] got_output: %d slices: %d\n", context->pts, pkt->size, *((uint8_t *)pkt->data), *((uint8_t *)(pkt->data + 1)), *((uint8_t *)(pkt->data + 2)), *((uint8_t *)(pkt->data + 3)), *got_output, avctx->slices);
 
diff --git a/src/mod/applications/mod_av/avformat.c b/src/mod/applications/mod_av/avformat.c
index c0eb72eb41..4d08f70644 100644
--- a/src/mod/applications/mod_av/avformat.c
+++ b/src/mod/applications/mod_av/avformat.c
@@ -43,6 +43,58 @@
 #define SCALE_FLAGS SWS_BICUBIC
 #define DFT_RECORD_OFFSET 0
 
+
+#ifndef AVUTIL_TIMESTAMP_H
+#define AVUTIL_TIMESTAMP_H
+
+#define AV_TS_MAX_STRING_SIZE 32
+
+/**
+ * Fill the provided buffer with a string containing a timestamp
+ * representation.
+ *
+ * @param buf a buffer with size in bytes of at least AV_TS_MAX_STRING_SIZE
+ * @param ts the timestamp to represent
+ * @return the buffer in input
+ */
+static inline char *av_ts_make_string(char *buf, int64_t ts)
+{
+    if (ts == AV_NOPTS_VALUE) snprintf(buf, AV_TS_MAX_STRING_SIZE, "NOPTS");
+    else                      snprintf(buf, AV_TS_MAX_STRING_SIZE, "%"PRId64"", ts);
+    return buf;
+}
+
+/**
+ * Convenience macro, the return value should be used only directly in
+ * function arguments but never stand-alone.
+ */
+#define av_ts2str(ts) av_ts_make_string((char[AV_TS_MAX_STRING_SIZE]){0}, ts)
+
+/**
+ * Fill the provided buffer with a string containing a timestamp time
+ * representation.
+ *
+ * @param buf a buffer with size in bytes of at least AV_TS_MAX_STRING_SIZE
+ * @param ts the timestamp to represent
+ * @param tb the timebase of the timestamp
+ * @return the buffer in input
+ */
+static inline char *av_ts_make_time_string(char *buf, int64_t ts, AVRational *tb)
+{
+    if (ts == AV_NOPTS_VALUE) snprintf(buf, AV_TS_MAX_STRING_SIZE, "NOPTS");
+    else                      snprintf(buf, AV_TS_MAX_STRING_SIZE, "%.6g", av_q2d(*tb) * ts);
+    return buf;
+}
+
+/**
+ * Convenience macro, the return value should be used only directly in
+ * function arguments but never stand-alone.
+ */
+#define av_ts2timestr(ts, tb) av_ts_make_time_string((char[AV_TS_MAX_STRING_SIZE]){0}, ts, tb)
+
+#endif /* AVUTIL_TIMESTAMP_H */
+
+
 static switch_status_t av_file_close(switch_file_handle_t *handle);
 SWITCH_MODULE_LOAD_FUNCTION(mod_avformat_load);
 
@@ -143,17 +195,18 @@ typedef struct record_helper_s {
 	switch_queue_t *video_queue;
 	switch_thread_t *video_thread;
 	switch_mm_t *mm;
+	int finalize;
 } record_helper_t;
 
 static void log_packet(const AVFormatContext *fmt_ctx, const AVPacket *pkt)
 {
-	// AVRational *time_base = &fmt_ctx->streams[pkt->stream_index]->time_base;
+	AVRational *time_base = &fmt_ctx->streams[pkt->stream_index]->time_base;
 
-	// printf("pts:%s pts_time:%s dts:%s dts_time:%s duration:%s duration_time:%s stream_index:%d\n",
-	// 	   av_ts2str(pkt->pts), av_ts2timestr(pkt->pts, time_base),
-	// 	   av_ts2str(pkt->dts), av_ts2timestr(pkt->dts, time_base),
-	// 	   av_ts2str(pkt->duration), av_ts2timestr(pkt->duration, time_base),
-	// 	   pkt->stream_index);
+	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "pts:%s pts_time:%s dts:%s dts_time:%s duration:%s duration_time:%s stream_index:%d\n",
+		   av_ts2str(pkt->pts), av_ts2timestr(pkt->pts, time_base),
+		   av_ts2str(pkt->dts), av_ts2timestr(pkt->dts, time_base),
+		   av_ts2str(pkt->duration), av_ts2timestr(pkt->duration, time_base),
+		   pkt->stream_index); /* packet trace goes to the FreeSWITCH logger at debug level instead of raw stdout */
 }
 
 static int mod_avformat_alloc_output_context2(AVFormatContext **avctx, AVOutputFormat *oformat,
@@ -306,6 +359,14 @@ static switch_status_t add_stream(MediaStream *mst, AVFormatContext *fc, AVCodec
 		if (codec_id == AV_CODEC_ID_H264) {
 			c->ticks_per_frame = 2;
 
+
+			c->coder_type = 1;  // coder = 1
+			c->flags|=CODEC_FLAG_LOOP_FILTER;   // flags=+loop
+			c->me_cmp|= 1;  // cmp=+chroma, where CHROMA = 1
+			c->me_method=ME_HEX;    // me_method=hex
+			c->me_range = 16;   // me_range=16
+			c->max_b_frames = 3;    // bf=3
+			
 			switch (mm->vprofile) {
 			case SWITCH_VIDEO_PROFILE_BASELINE:
 				av_opt_set(c->priv_data, "profile", "baseline", 0);
@@ -314,10 +375,12 @@ static switch_status_t add_stream(MediaStream *mst, AVFormatContext *fc, AVCodec
 			case SWITCH_VIDEO_PROFILE_MAIN:
 				av_opt_set(c->priv_data, "profile", "main", 0);
 				av_opt_set(c->priv_data, "level", "5", 0);
+				c->level = 5;
 				break;
 			case SWITCH_VIDEO_PROFILE_HIGH:
 				av_opt_set(c->priv_data, "profile", "high", 0);
 				av_opt_set(c->priv_data, "level", "52", 0);
+				c->level = 52;
 				break;
 			}
 			
@@ -329,6 +392,7 @@ static switch_status_t add_stream(MediaStream *mst, AVFormatContext *fc, AVCodec
 				av_opt_set(c->priv_data, "preset", "medium", 0);
 				break;
 			case SWITCH_VIDEO_ENCODE_SPEED_FAST:
+				//av_opt_set(c->priv_data, "tune", "zerolatency", 0);
 				av_opt_set(c->priv_data, "preset", "veryfast", 0);
 				break;
 			default:
@@ -336,6 +400,18 @@ static switch_status_t add_stream(MediaStream *mst, AVFormatContext *fc, AVCodec
 			}
 		}
 
+		c->gop_size = 250;  // g=250
+		c->keyint_min = 25; // keyint_min=25
+		c->scenechange_threshold = 40;  // sc_threshold=40
+		c->i_quant_factor = 0.71; // i_qfactor=0.71
+		c->b_frame_strategy = 1;  // b_strategy=1
+		c->qcompress = 0.6; // qcomp=0.6
+		c->qmin = 10;   // qmin=10
+		c->qmax = 31;   // qmax=31
+		c->max_qdiff = 4;   // qdiff=4
+		av_opt_set(c->priv_data, "crf", "18", 0);
+
+
 		if (codec_id == AV_CODEC_ID_VP8) {
 			av_set_options_string(c, "quality=realtime", "=", ":");
 		}
@@ -400,6 +476,7 @@ static switch_status_t open_video(AVFormatContext *fc, AVCodec *codec, MediaStre
 	/* allocate and init a re-usable frame */
 	mst->frame = alloc_picture(c->pix_fmt, c->width, c->height);
 	switch_assert(mst->frame);
+	mst->frame->pts = 0;
 
 	// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "pix_fmt: %d\n", c->pix_fmt);
 	switch_assert(c->pix_fmt == AV_PIX_FMT_YUV420P); // always I420 for NOW
@@ -514,9 +591,10 @@ static void *SWITCH_THREAD_FUNC video_thread_run(switch_thread_t *thread, void *
 	switch_image_t *img = NULL, *tmp_img = NULL;
 	int d_w = eh->video_st->width, d_h = eh->video_st->height;
 	int size = 0, skip = 0, skip_freq = 0, skip_count = 0, skip_total = 0, skip_total_count = 0;
+	uint64_t hard_delta = 0, delta = 0, last_ts = 0;
 
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "video thread start\n");
-	
+
 	for(;;) {
 		AVPacket pkt = { 0 };
 		int got_packet;
@@ -524,6 +602,10 @@ static void *SWITCH_THREAD_FUNC video_thread_run(switch_thread_t *thread, void *
 
 	top:
 
+		if (eh->mm->fps) {
+			hard_delta = 1000 / eh->mm->fps;
+		}
+
 		if (switch_queue_pop(eh->video_queue, &pop) == SWITCH_STATUS_SUCCESS) {
             switch_img_free(&img);
 
@@ -551,13 +633,14 @@ static void *SWITCH_THREAD_FUNC video_thread_run(switch_thread_t *thread, void *
 				skip_total_count = skip_total;
 				skip_count = 0;
 				skip--;
+
 				goto top;
 			}
 		} else {
 		
 			size = switch_queue_size(eh->video_queue);
 			
-			if (size > 5) {
+			if (size > 5 && !eh->finalize) {
 				skip = size;
 
 				if (size > 10) {
@@ -580,19 +663,39 @@ static void *SWITCH_THREAD_FUNC video_thread_run(switch_thread_t *thread, void *
 			ret = av_frame_make_writable(eh->video_st->frame);
 		}
 
-		if (ret < 0) continue;
+		if (ret < 0) {
+			continue;
+		}
 
 		fill_avframe(eh->video_st->frame, img);
-		switch_core_timer_sync(eh->timer);
 		
-		if (eh->video_st->frame->pts == eh->timer->samplecount) {
-			// never use the same pts, or the encoder coughs
-			eh->video_st->frame->pts++;
-		} else {
-			eh->video_st->frame->pts = eh->timer->samplecount;
+		if (hard_delta) {
+			delta = hard_delta;
 		}
-		// eh->video_st->frame->pts = switch_time_now() / 1000 - eh->video_st->next_pts;
-		// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "pts: %lld\n", eh->video_st->frame->pts);
+
+		if ((eh->finalize && delta) || hard_delta) {
+			eh->video_st->frame->pts += delta;
+		} else {
+			switch_core_timer_sync(eh->timer);
+		
+			if (eh->video_st->frame->pts == eh->timer->samplecount) {
+				// never use the same pts, or the encoder coughs
+				eh->video_st->frame->pts++;
+			} else {
+				uint64_t delta_tmp = eh->timer->samplecount - last_ts;
+				
+				if (delta_tmp > 10) {
+					delta = delta_tmp;
+				}
+				
+				eh->video_st->frame->pts = eh->timer->samplecount;
+			}
+		}
+		
+		last_ts = eh->video_st->frame->pts;
+
+		//eh->video_st->frame->pts = switch_time_now() / 1000 - eh->video_st->next_pts;
+		//switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "pts: %ld\n", eh->video_st->frame->pts);
 
 		/* encode the image */
 		ret = avcodec_encode_video2(eh->video_st->st->codec, &pkt, eh->video_st->frame, &got_packet);
@@ -615,6 +718,26 @@ static void *SWITCH_THREAD_FUNC video_thread_run(switch_thread_t *thread, void *
 
  endfor:
 
+	for(;;) { /* drain packets the encoder is still holding after the last real frame */
+		AVPacket pkt = { 0 };
+		int got_packet = 0;
+		int ret = 0;
+
+		av_init_packet(&pkt);
+
+		ret = avcodec_encode_video2(eh->video_st->st->codec, &pkt, NULL, &got_packet); /* NULL frame = flush request, never re-encode the last frame */
+
+		if (ret < 0 || !got_packet) {
+			break; /* encoder error, or the encoder has nothing left to emit */
+		} else {
+			switch_mutex_lock(eh->mutex);
+			ret = write_frame(eh->fc, &eh->video_st->st->codec->time_base, eh->video_st->st, &pkt);
+			switch_mutex_unlock(eh->mutex);
+			av_free_packet(&pkt);
+			if (ret < 0) break;
+		}
+	}
+
 	while(switch_queue_trypop(eh->video_queue, &pop) == SWITCH_STATUS_SUCCESS) {
 		if (!pop) break;
 		img = (switch_image_t *) pop;
@@ -884,8 +1007,8 @@ SWITCH_STANDARD_APP(record_av_function)
 				switch_buffer_read(buffer, audio_st.frame->data[0], bytes);
 				/* convert to destination format */
 				ret = avresample_convert(audio_st.resample_ctx,
-						(uint8_t **)audio_st.frame->data, 0, out_samples,
-						audio_st.tmp_frame->data, 0, audio_st.frame->nb_samples);
+						audio_st.tmp_frame->data, 0, out_samples,
+						(uint8_t **)audio_st.frame->data, 0, audio_st.frame->nb_samples);
 
 				if (ret < 0) {
 					switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error while converting %d samples, error text: %s\n",
@@ -1104,6 +1227,7 @@ struct av_file_context {
 	int audio_start;
 	int vid_ready;
 	int audio_ready;
+	int closed;
 
 	MediaStream video_st;
 	MediaStream audio_st;
@@ -1120,6 +1244,7 @@ struct av_file_context {
 	switch_time_t video_start_time;
 	switch_image_t *last_img;
 	int read_fps;
+	switch_time_t last_vid_push;
 };
 
 typedef struct av_file_context av_file_context_t;
@@ -1250,6 +1375,9 @@ err:
 	return status;
 }
 
+//#define ALT_WAY
+#define AUDIO_BUF_SEC 5
+
 static void *SWITCH_THREAD_FUNC file_read_thread_run(switch_thread_t *thread, void *obj)
 {
 	av_file_context_t *context = (av_file_context_t *) obj;
@@ -1257,36 +1385,51 @@ static void *SWITCH_THREAD_FUNC file_read_thread_run(switch_thread_t *thread, vo
 	int got_data = 0;
 	int error;
 	int sync  = 0;
+	int eof = 0;
 
 	context->file_read_thread_running = 1;
 
-#define AUDIO_BUF_SEC 5
+	while (context->file_read_thread_running && !context->closed) {
+		int vid_frames = 0;
 
-	while (context->file_read_thread_running) {
-		if (switch_buffer_inuse(context->audio_buffer) > AUDIO_BUF_SEC * context->audio_st.sample_rate * context->audio_st.channels * 2) {
-			switch_yield(10000);
+		if (context->has_video) {
+			vid_frames = switch_queue_size(context->eh.video_queue);
+		}
+
+		if (switch_buffer_inuse(context->audio_buffer) > AUDIO_BUF_SEC * context->audio_st.sample_rate * context->audio_st.channels * 2 && 
+			(!context->has_video || vid_frames > 5)) {
+			switch_yield(context->has_video ? 1000 : 10000);
 			continue;
 		}
-		
+
 		av_init_packet(&pkt);
 		pkt.data = NULL;
 		pkt.size = 0;
 
-		if ((error = av_read_frame(context->fc, &pkt)) < 0) {
-			if (error == AVERROR_EOF) break;
-
-			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Could not read frame (error '%s')\n", get_error_text(error));
-			break;
+		if ((error = av_read_frame(context->fc, &pkt)) < 0) { /* always read: guarding the call on video_st.st starved audio-only files */
+			if (error == AVERROR_EOF && context->has_video && context->video_st.st) {
+				eof = 1;
+				/* at EOF hand an empty packet to the video stream so delayed frames in the decoder can be drained below */
+				pkt.data = NULL;
+				pkt.size = 0;
+				pkt.stream_index = context->video_st.st->index;
+			} else {
+				if (error != AVERROR_EOF) switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Could not read frame (error '%s')\n", get_error_text(error));
+				break;
+			}
 		}
 
 		// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "stream: %d, pkt size %d\n", pkt.stream_index, pkt.size);
 		if (context->has_video && pkt.stream_index == context->video_st.st->index) {
-			AVFrame *vframe = av_frame_alloc();
+			AVFrame *vframe;
 			switch_image_t *img;
 			if (!sync) {
 				switch_buffer_zero(context->audio_buffer);
 				sync = 1;
 			}
+
+again:
+			vframe = av_frame_alloc();
 			switch_assert(vframe);
 
 			if ((error = avcodec_decode_video2(context->video_st.st->codec, vframe, &got_data, &pkt)) < 0) {
@@ -1305,7 +1448,7 @@ static void *SWITCH_THREAD_FUNC file_read_thread_run(switch_thread_t *thread, vo
 			//	continue;
 			//}
 
-			if (got_data && error > 0) {
+			if (got_data && error >= 0) {
 				// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "got picture %dx%d fmt: %d pktpts:%lld pktdts:%lld\n", vframe->width, vframe->height, vframe->format, vframe->pkt_pts, vframe->pkt_dts);
 
 				if (vframe->format != AV_PIX_FMT_YUV420P) {
@@ -1355,14 +1498,39 @@ static void *SWITCH_THREAD_FUNC file_read_thread_run(switch_thread_t *thread, vo
 					uint64_t *pts = malloc(sizeof(uint64_t));
 
 					if (pts) {
+#ifdef ALT_WAY
+						int diff;
+						int sleep = 66000;
+#endif
 						*pts = vframe->pkt_pts;
 						avframe2img(vframe, img);
 						img->user_priv = pts;
+						
+#ifdef ALT_WAY
+						diff = sleep - (switch_time_now() - context->last_vid_push);
+						
+						if (diff > 0 && diff <= sleep) {
+							switch_core_timer_next(&context->video_timer);
+						} else {
+							switch_core_timer_sync(&context->video_timer);
+						}
+#endif			
+
+						context->vid_ready = 1;
 						switch_queue_push(context->eh.video_queue, img);
+						context->last_vid_push = switch_time_now();
 					}
 				}
 			}
 			av_frame_free(&vframe);
+
+			if (eof) {
+				if (got_data) {
+					goto again; // to get all delayed video frames in decoder
+				} else {
+					break;
+				}
+			}
 			continue;
 		} else if (context->has_audio && pkt.stream_index == context->audio_st.st->index) {
 			AVFrame in_frame = { { 0 } };
@@ -1483,7 +1651,7 @@ static switch_status_t av_file_open(switch_file_handle_t *handle, const char *pa
 		if (context->has_video) {
 			switch_queue_create(&context->eh.video_queue, SWITCH_CORE_QUEUE_LEN, handle->memory_pool);
 			switch_mutex_init(&context->eh.mutex, SWITCH_MUTEX_NESTED, handle->memory_pool);
-
+			switch_core_timer_init(&context->video_timer, "soft", 66, 1, context->pool);
 		}
 
 		{
@@ -1525,14 +1693,17 @@ static switch_status_t av_file_open(switch_file_handle_t *handle, const char *pa
 		handle->mm.ab = 128;
 	}
 
+	handle->mm.vb = switch_calc_bitrate(handle->mm.vw, handle->mm.vh, 1, handle->mm.fps);
+
 	if (fmt->video_codec != AV_CODEC_ID_NONE) {
 		const AVCodecDescriptor *desc;
 
-		if (handle->stream_name && (!strcasecmp(handle->stream_name, "rtmp") || !strcasecmp(handle->stream_name, "youtube"))) {
+		if ((handle->stream_name && (!strcasecmp(handle->stream_name, "rtmp") || !strcasecmp(handle->stream_name, "youtube")))) {
+			
 			if (fmt->video_codec != AV_CODEC_ID_H264 ) {
 				fmt->video_codec = AV_CODEC_ID_H264; // force H264
 			}
-			
+
 			fmt->audio_codec = AV_CODEC_ID_AAC;
 			handle->samplerate = 44100;
 			handle->mm.samplerate = 44100;
@@ -1558,12 +1729,12 @@ static switch_status_t av_file_open(switch_file_handle_t *handle, const char *pa
 					handle->mm.vb = 4500;
 					break;
 				default:
-					handle->mm.vb = (handle->mm.vw * handle->mm.vh) / 175;
+					handle->mm.vb = switch_calc_bitrate(handle->mm.vw, handle->mm.vh, 1, handle->mm.fps);
 					break;
 				}
 			}
 
-			if (handle->mm.fps > 0.0f) {
+			if (handle->stream_name && handle->mm.fps > 0.0f) {
 				handle->mm.keyint = (int) 2.0f * handle->mm.fps;
 			}
 		}
@@ -1578,6 +1749,7 @@ static switch_status_t av_file_open(switch_file_handle_t *handle, const char *pa
 		context->audio_st.sample_rate = handle->samplerate;
 
 		add_stream(&context->audio_st, context->fc, &context->audio_codec, fmt->audio_codec, &handle->mm);
+
 		if (open_audio(context->fc, context->audio_codec, &context->audio_st) != SWITCH_STATUS_SUCCESS) {
 			switch_goto_status(SWITCH_STATUS_GENERR, end);
 		}
@@ -1634,7 +1806,12 @@ static switch_status_t av_file_write(switch_file_handle_t *handle, void *data, s
 	uint32_t bytes;
 	int inuse;
 
+	if (!switch_test_flag(handle, SWITCH_FILE_FLAG_WRITE)) {
+		return SWITCH_STATUS_FALSE;
+	}
+
 	if (!context->vid_ready) {
+		switch_buffer_zero(context->audio_buffer);
 		return status;
 	}
 
@@ -1662,6 +1839,14 @@ static switch_status_t av_file_write(switch_file_handle_t *handle, void *data, s
 	//inuse = switch_buffer_inuse(context->audio_buffer);
 	//switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "inuse: %d samples: %d bytes: %d\n", inuse, context->audio_st.frame->nb_samples, bytes);
 
+	if (context->closed) {
+		inuse = switch_buffer_inuse(context->audio_buffer);
+		if (inuse < bytes) {
+			char buf[SWITCH_RECOMMENDED_BUFFER_SIZE] = {0};
+			switch_buffer_write(context->audio_buffer, buf, bytes - inuse);
+		}
+	}
+	
 	
 	while ((inuse = switch_buffer_inuse(context->audio_buffer)) >= bytes) {
 		AVPacket pkt = { 0 };
@@ -1672,14 +1857,13 @@ static switch_status_t av_file_write(switch_file_handle_t *handle, void *data, s
 		
 		if (context->audio_st.resample_ctx) { // need resample
 			int out_samples = avresample_get_out_samples(context->audio_st.resample_ctx, context->audio_st.frame->nb_samples);
-
 			av_frame_make_writable(context->audio_st.frame);
 			av_frame_make_writable(context->audio_st.tmp_frame);
 			switch_buffer_read(context->audio_buffer, context->audio_st.frame->data[0], bytes);
 			/* convert to destination format */
 			ret = avresample_convert(context->audio_st.resample_ctx,
-									 (uint8_t **)context->audio_st.frame->data, 0, out_samples,
-									 context->audio_st.tmp_frame->data, 0, context->audio_st.frame->nb_samples);
+									 context->audio_st.tmp_frame->data, 0, out_samples,
+									 (uint8_t **)context->audio_st.frame->data, 0, context->audio_st.frame->nb_samples);
 			
 			if (ret < 0) {
 				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error while converting %d samples, error text: %s\n",
@@ -1721,11 +1905,31 @@ static switch_status_t av_file_write(switch_file_handle_t *handle, void *data, s
 	return status;
 }
 
+static switch_status_t av_file_command(switch_file_handle_t *handle, switch_file_command_t command)
+{
+	av_file_context_t *context = (av_file_context_t *)handle->private_info;
+	/* File-interface command hook: only SCFC_FLUSH_AUDIO is acted on; every other command is a no-op. */
+	switch(command) {
+	case SCFC_FLUSH_AUDIO:
+		switch_mutex_lock(context->mutex);		/* same mutex av_file_read takes around its audio_buffer read */
+		switch_buffer_zero(context->audio_buffer);	/* presumably drops all buffered audio -- helper is defined outside this patch, confirm */
+		switch_mutex_unlock(context->mutex);		
+		break;
+	default:
+		break;
+	}
+	/* Success is reported unconditionally, including for commands that were ignored. */
+	return SWITCH_STATUS_SUCCESS;
+}
+
 static switch_status_t av_file_close(switch_file_handle_t *handle)
 {
 	av_file_context_t *context = (av_file_context_t *)handle->private_info;
 	switch_status_t status;
 
+	context->closed = 1;
+	context->eh.finalize = 1;
+
 	if (context->eh.video_queue) {
 		switch_queue_push(context->eh.video_queue, NULL);
 	}
@@ -1733,8 +1937,10 @@ static switch_status_t av_file_close(switch_file_handle_t *handle)
 	if (context->eh.video_thread) {
 		switch_thread_join(&status, context->eh.video_thread);
 	}
-	
-	av_file_write(handle, NULL, NULL);
+
+	if (switch_test_flag(handle, SWITCH_FILE_FLAG_WRITE)) {
+		av_file_write(handle, NULL, NULL);
+	}
 
 	if (context->file_read_thread_running && context->file_read_thread) {
 		context->file_read_thread_running = 0;
@@ -1784,10 +1990,12 @@ static switch_status_t av_file_read(switch_file_handle_t *handle, void *data, si
 		return SWITCH_STATUS_FALSE;
 	}
 
+	while (context->has_video && !context->vid_ready && !context->closed) {
+		switch_yield(1000);
+	}
+
 	switch_mutex_lock(context->mutex);
-	size = switch_buffer_inuse(context->audio_buffer);
-	if (size > *len * context->audio_st.channels * 2) size = *len * context->audio_st.channels * 2;
-	if (size) size = switch_buffer_read(context->audio_buffer, data, size);
+	size = switch_buffer_read(context->audio_buffer, data, need);
 	switch_mutex_unlock(context->mutex);
 
 	if (size == 0) {
@@ -1808,6 +2016,46 @@ static switch_status_t av_file_read(switch_file_handle_t *handle, void *data, si
 	return *len == 0 ? SWITCH_STATUS_FALSE : SWITCH_STATUS_SUCCESS;
 }
 
+
+#ifdef ALT_WAY
+static switch_status_t av_file_read_video(switch_file_handle_t *handle, switch_frame_t *frame, switch_video_read_flag_t flags)
+{
+	void *pop;
+	av_file_context_t *context = (av_file_context_t *)handle->private_info;
+	switch_status_t status;
+	/* Alternate reader, compiled only when ALT_WAY is defined: returns the next queued image without consulting timestamps. */
+
+	if (!context->has_video || context->closed) return SWITCH_STATUS_FALSE;
+
+	if ((flags & SVR_CHECK)) { /* a check-only call never dequeues anything */
+		return SWITCH_STATUS_BREAK;
+	}
+
+	if ((flags & SVR_FLUSH)) { /* flush first, then still attempt a pop below */
+		flush_video_queue(context->eh.video_queue, 1);
+	}
+	
+	if ((flags & SVR_BLOCK)) { /* blocking pop when SVR_BLOCK is set, non-blocking trypop otherwise */
+		status = switch_queue_pop(context->eh.video_queue, &pop);
+	} else {
+		status = switch_queue_trypop(context->eh.video_queue, &pop);
+	}
+
+	if (status == SWITCH_STATUS_SUCCESS) {
+		if (!pop) { /* NULL queue entry (av_file_close pushes one): report no frame */
+			return SWITCH_STATUS_FALSE;
+		}
+
+		context->vid_ready = 1; /* first delivered frame marks video as ready */
+
+		frame->img = (switch_image_t *) pop; /* popped pointer is handed to the caller via frame->img */
+		return SWITCH_STATUS_SUCCESS;
+	}
+	/* Nothing dequeued: a flush request maps to BREAK, otherwise the queue status is returned as-is. */
+	return (flags & SVR_FLUSH) ? SWITCH_STATUS_BREAK : status;
+}
+#else 
+
 static switch_status_t av_file_read_video(switch_file_handle_t *handle, switch_frame_t *frame, switch_video_read_flag_t flags)
 {
 	av_file_context_t *context = (av_file_context_t *)handle->private_info;
@@ -1815,7 +2063,7 @@ static switch_status_t av_file_read_video(switch_file_handle_t *handle, switch_f
 	MediaStream *mst = &context->video_st;
 	AVStream *st = mst->st;
 	int ticks = 0;
-	int max_delta = 1 * AV_TIME_BASE; // 1 second
+	int64_t max_delta = 1 * AV_TIME_BASE; // 1 second
 	switch_status_t status = SWITCH_STATUS_SUCCESS;
 	double fl_to = 0.02;
 	int do_fl = 0;
@@ -1829,7 +2077,7 @@ static switch_status_t av_file_read_video(switch_file_handle_t *handle, switch_f
 	fl_to = (1000 / context->read_fps) * 1000;
 	//printf("WTF %d (%f)\n",switch_queue_size(context->eh.video_queue), fl_to);
 	if (flags & SVR_FLUSH) {
-		max_delta = fl_to * AV_TIME_BASE;
+		max_delta = fl_to;
 		do_fl = 1;
 	}
 
@@ -1839,10 +2087,19 @@ static switch_status_t av_file_read_video(switch_file_handle_t *handle, switch_f
 		} else if (mst->next_pts && (switch_time_now() - mst->next_pts > -10000)) {
 			frame->img = context->last_img;
 			context->last_img = NULL;
+			context->vid_ready = 1;
 			return SWITCH_STATUS_SUCCESS;
 		}
 
-		if (!(flags & SVR_BLOCK) && !do_fl) return SWITCH_STATUS_BREAK;
+		if (!(flags & SVR_BLOCK) && !do_fl) {
+			if (!mst->next_pts) {
+				frame->img = context->last_img;
+				context->last_img = NULL;
+				context->vid_ready = 1;
+				return SWITCH_STATUS_SUCCESS;
+			}
+			return SWITCH_STATUS_BREAK;
+		}
 	}
 
 	if (!context->file_read_thread_running && switch_queue_size(context->eh.video_queue) == 0) {
@@ -1870,48 +2127,52 @@ static switch_status_t av_file_read_video(switch_file_handle_t *handle, switch_f
 
 	if (pop && status == SWITCH_STATUS_SUCCESS) {
 		switch_image_t *img = (switch_image_t *)pop;
-		uint64_t pts;
-		uint64_t now = switch_time_now();
+		int64_t pts;
+		int64_t now = switch_time_now();
 
 		pts = av_rescale_q(*((uint64_t *)img->user_priv), st->time_base, AV_TIME_BASE_Q);
+		// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "pkt_pts: %lld pts: %lld queue size: %u\n", *((uint64_t *)img->user_priv), pts, switch_queue_size(context->eh.video_queue));
 
 		if (!context->video_start_time) {
 			context->video_start_time = now - pts;
+			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "set start time: %" SWITCH_INT64_T_FMT " now: %" SWITCH_INT64_T_FMT " pts: %" SWITCH_INT64_T_FMT "\n", context->video_start_time, now, pts);
 		}
 
 		if (st->time_base.num == 0) {
 			mst->next_pts = 0;
 		} else {
-			//uint64_t last_pts = mst->next_pts;
+			// int64_t last_pts = mst->next_pts;
 			mst->next_pts = context->video_start_time + pts;
-			//switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "pts: %" SWITCH_INT64_T_FMT " last_pts: %" SWITCH_INT64_T_FMT " delta: %" SWITCH_INT64_T_FMT " frame_pts: %" SWITCH_INT64_T_FMT " nextpts: %" SWITCH_INT64_T_FMT ", num: %d, den:%d num:%d den:%d sleep: %" SWITCH_INT64_T_FMT "\n",
-			//pts, last_pts, mst->next_pts - last_pts, *((uint64_t *)img->user_priv), mst->next_pts, st->time_base.num, st->time_base.den, st->codec->time_base.num, st->codec->time_base.den, mst->next_pts - now);
+			// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "pts: %" SWITCH_INT64_T_FMT " last_pts: %" SWITCH_INT64_T_FMT " delta: %" SWITCH_INT64_T_FMT " frame_pts: %" SWITCH_INT64_T_FMT " nextpts: %" SWITCH_INT64_T_FMT ", num: %d, den:%d num:%d den:%d sleep: %" SWITCH_INT64_T_FMT "\n",
+			// pts, last_pts, mst->next_pts - last_pts, *((uint64_t *)img->user_priv), mst->next_pts, st->time_base.num, st->time_base.den, st->codec->time_base.num, st->codec->time_base.den, mst->next_pts - now);
 		}
 
-		if (pts == 0) mst->next_pts = 0;
+		if (pts == 0 || context->video_start_time == 0) mst->next_pts = 0;
 
-		if ((mst->next_pts && switch_time_now() - mst->next_pts > max_delta)) {
-			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG3, "picture is too late, off: %" SWITCH_INT64_T_FMT " queue size:%u\n", (int64_t)(switch_time_now() - mst->next_pts), switch_queue_size(context->eh.video_queue));
+		if ((mst->next_pts && (now - mst->next_pts) > max_delta)) {
+			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "picture is too late, off: %" SWITCH_INT64_T_FMT " max delta: %" SWITCH_INT64_T_FMT " queue size:%u\n", (int64_t)(now - mst->next_pts), max_delta, switch_queue_size(context->eh.video_queue));
 			switch_img_free(&img);
 			max_delta = AV_TIME_BASE;
 
 			if (switch_queue_size(context->eh.video_queue) > 0) {
+				// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "WTF again\n");
 				goto again;
 			} else if (!(flags & SVR_BLOCK) && !do_fl) {
 				mst->next_pts = 0;
+				context->video_start_time = 0;
 				return SWITCH_STATUS_BREAK;
 			}
 		}
 
 		if ((flags & SVR_BLOCK) || do_fl) {
-			while (switch_micro_time_now() - mst->next_pts < -10000 / 2) {
-				// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "yield\n");
-				switch_yield(10000);
+			while (switch_micro_time_now() - mst->next_pts < -10000) {
+				// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "yield, delta=%" SWITCH_INT64_T_FMT "\n", switch_micro_time_now() - mst->next_pts);
+				switch_yield(1000);
 			}
 			frame->img = img;
 			do_fl = 0;
 		} else {
-			if (switch_micro_time_now() - mst->next_pts > -10000 / 2) {
+			if (switch_micro_time_now() - mst->next_pts > -10000) {
 				frame->img = img;
 			} else {
 				context->last_img = img;
@@ -1923,8 +2184,13 @@ static switch_status_t av_file_read_video(switch_file_handle_t *handle, switch_f
 		return SWITCH_STATUS_BREAK;
 	}
 
+	if (frame->img) {
+		context->vid_ready = 1;
+	}
+
 	return frame->img ? SWITCH_STATUS_SUCCESS : SWITCH_STATUS_FALSE;
 }
+#endif
 
 static switch_status_t av_file_write_video(switch_file_handle_t *handle, switch_frame_t *frame)
 {
@@ -2061,6 +2327,7 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_avformat_load)
 	file_interface->file_seek = av_file_seek;
 	file_interface->file_set_string = av_file_set_string;
 	file_interface->file_get_string = av_file_get_string;
+	file_interface->file_command = av_file_command;
 
 	SWITCH_ADD_API(api_interface, "av_format", "av information", av_format_api_function, "");
 
diff --git a/src/mod/applications/mod_avmd/Makefile.am b/src/mod/applications/mod_avmd/Makefile.am
index 0387fee18f..ce877babf7 100644
--- a/src/mod/applications/mod_avmd/Makefile.am
+++ b/src/mod/applications/mod_avmd/Makefile.am
@@ -3,6 +3,6 @@ MODNAME=mod_avmd
 
 mod_LTLIBRARIES = mod_avmd.la
 mod_avmd_la_SOURCES  = mod_avmd.c amplitude.c buffer.c desa2.c goertzel.c fast_acosf.c
-mod_avmd_la_CFLAGS   = $(AM_CFLAGS)
+mod_avmd_la_CFLAGS   = $(AM_CFLAGS) $(AM_MOD_AVMD_CXXFLAGS)
 mod_avmd_la_LIBADD   = $(switch_builddir)/libfreeswitch.la
 mod_avmd_la_LDFLAGS  = -avoid-version -module -no-undefined -shared
diff --git a/src/mod/applications/mod_avmd/buffer.c b/src/mod/applications/mod_avmd/buffer.c
index 02d306fe14..f240f5b989 100644
--- a/src/mod/applications/mod_avmd/buffer.c
+++ b/src/mod/applications/mod_avmd/buffer.c
@@ -9,11 +9,11 @@ extern size_t next_power_of_2(size_t v)
 
     v++;
 
-    do{
+    do {
         prev = v;
         v &= ~tmp;
         tmp <<= 1;
-    }while(v != 0);
+    } while (v != 0);
 
     prev <<= 1;
 
diff --git a/src/mod/applications/mod_avmd/buffer.h b/src/mod/applications/mod_avmd/buffer.h
index b6676767e1..a92904ddbc 100644
--- a/src/mod/applications/mod_avmd/buffer.h
+++ b/src/mod/applications/mod_avmd/buffer.h
@@ -29,7 +29,7 @@ extern size_t next_power_of_2(size_t v);
 	(b)->pos++; \
 	(b)->pos &= (b)->mask; \
 	(b)->lpos++; \
-	if((b)->backlog < (b)->buf_len) (b)->backlog++; \
+	if ((b)->backlog < (b)->buf_len) (b)->backlog++; \
     }
 
 #define DEC_POS(b) \
@@ -37,27 +37,27 @@ extern size_t next_power_of_2(size_t v);
 	(b)->pos--; \
 	(b)->pos &= (b)->mask; \
 	(b)->lpos--; \
-	if(((b)->backlog - 1) < (b)->backlog) (b)->backlog--; \
+	if (((b)->backlog - 1) < (b)->backlog) (b)->backlog--; \
     }
 
 #define GET_SAMPLE(b, i) ((b)->buf[(i) & (b)->mask])
 #define SET_SAMPLE(b, i, v) ((b)->buf[(i) & (b)->mask] = (v))
 
 #define INSERT_FRAME(b, f, l) \
-    do{ \
-	for((b)->i = 0; (b)->i < (l); (b)->i++){ \
+    do { \
+	for ((b)->i = 0; (b)->i < (l); (b)->i++) { \
 	    SET_SAMPLE((b), ((b)->i + (b)->pos), (f)[(b)->i]); \
 	} \
 	(b)->pos += (l); \
 	(b)->lpos += (l); \
 	(b)->pos %= (b)->buf_len; \
 	(b)->backlog += (l); \
-	if((b)->backlog > (b)->buf_len) (b)->backlog = (b)->buf_len; \
-    }while(0)
+	if ((b)->backlog > (b)->buf_len) (b)->backlog = (b)->buf_len; \
+    } while (0)
 
 #define INSERT_INT16_FRAME(b, f, l) \
     { \
-	for((b)->i = 0; (b)->i < (l); (b)->i++){ \
+	for ((b)->i = 0; (b)->i < (l); (b)->i++) { \
 	    SET_SAMPLE( \
 		(b), \
 		((b)->i + (b)->pos), \
@@ -72,7 +72,7 @@ extern size_t next_power_of_2(size_t v);
 	(b)->lpos += (l); \
 	(b)->pos &= (b)->mask; \
 	(b)->backlog += (l); \
-	if((b)->backlog > (b)->buf_len) (b)->backlog = (b)->buf_len; \
+	if ((b)->backlog > (b)->buf_len) (b)->backlog = (b)->buf_len; \
     }
 
 
@@ -87,6 +87,7 @@ extern size_t next_power_of_2(size_t v);
 	(bf)->pos = 0; \
 	(bf)->lpos = 0; \
 	(bf)->backlog = 0; \
+	(bf)->i = 0; \
     }
 
 //#define DESTROY_CIRC_BUFFER(b) free((b)->buf)
@@ -95,10 +96,10 @@ extern size_t next_power_of_2(size_t v);
 #define GET_CURRENT_SAMPLE(b) GET_SAMPLE((b), GET_CURRENT_POS((b)))
 
 #define ADD_SAMPLE(b, s) \
-    do{ \
+    do { \
 	INC_POS((b)); \
 	SET_SAMPLE((b), GET_CURRENT_POS((b)), (s)); \
-    }while(0)
+    } while (0)
 
 #endif
 
diff --git a/src/mod/applications/mod_avmd/desa2.c b/src/mod/applications/mod_avmd/desa2.c
index 3998dc00c4..9550a211ce 100644
--- a/src/mod/applications/mod_avmd/desa2.c
+++ b/src/mod/applications/mod_avmd/desa2.c
@@ -35,7 +35,7 @@ extern double desa2(circ_buffer_t *b, size_t i)
     x2sq = x2 * x2;
 
     d = 2.0 * ((x2sq) - (x1 * x3));
-    if(d == 0.0) return 0.0;
+    if (d == 0.0) return 0.0;
 
     n = ((x2sq) - (x0 * x4)) - ((x1 * x1) - (x0 * x2)) - ((x3 * x3) - (x2 * x4));
 
@@ -46,14 +46,10 @@ extern double desa2(circ_buffer_t *b, size_t i)
     result = 0.5 * acos(n/d);
 #endif
 
-    if(ISNAN(result)){
-	result = 0.0;
-    }
-
+    if (ISNAN(result)) result = 0.0;
 
     return result;
 
 }
 
 #endif
-
diff --git a/src/mod/applications/mod_avmd/fast_acosf.c b/src/mod/applications/mod_avmd/fast_acosf.c
index 8f6fd0df88..6c990d2520 100644
--- a/src/mod/applications/mod_avmd/fast_acosf.c
+++ b/src/mod/applications/mod_avmd/fast_acosf.c
@@ -29,7 +29,7 @@
 #define VARIA_DATA_MASK (0x87FFFFF8)
 #define CONST_DATA_MASK (0x38000000)
 
-#define ACOS_TABLE_LENGTH (1<<25)
+#define ACOS_TABLE_LENGTH (1 << 25)
 #define ACOS_TABLE_FILENAME "/tmp/acos_table.dat"
 
 typedef union {
@@ -65,14 +65,12 @@ extern void compute_table(void)
 
     acos_table_file = fopen(ACOS_TABLE_FILENAME, "w");
 
-
-    for(i = 0; i < (1 << 25); i++){
-	f = acosf(float_from_index(i));
-	ret = fwrite(&f, sizeof(f), 1, acos_table_file);
-	assert(ret != 0);
+    for (i = 0; i < ACOS_TABLE_LENGTH; i++) {
+        f = acosf(float_from_index(i));
+        ret = fwrite(&f, sizeof(f), 1, acos_table_file);
+        assert(ret != 0);
     }
 
-
     ret = fclose(acos_table_file);
     assert(ret != EOF);
 }
@@ -82,13 +80,13 @@ extern void init_fast_acosf(void)
 {
     int ret;
 
-    if(acos_table == NULL){
-	ret = access(ACOS_TABLE_FILENAME, F_OK);
-	if(ret == 0) compute_table();
+    if (acos_table == NULL) {
+        ret = access(ACOS_TABLE_FILENAME, F_OK);
+        if (ret == 0) compute_table();
 
         acos_fd = open(ACOS_TABLE_FILENAME, O_RDONLY);
-	if(acos_fd == -1) perror("Could not open file " ACOS_TABLE_FILENAME);
-	assert(acos_fd != -1);
+        if (acos_fd == -1) perror("Could not open file " ACOS_TABLE_FILENAME);
+        assert(acos_fd != -1);
         acos_table = (float *)mmap(
             NULL,
             ACOS_TABLE_LENGTH * sizeof(float),
@@ -136,5 +134,3 @@ static float float_from_index(uint32_t d)
 
 
 #endif
-
-
diff --git a/src/mod/applications/mod_avmd/goertzel.c b/src/mod/applications/mod_avmd/goertzel.c
index 357af98360..c4edb42b54 100644
--- a/src/mod/applications/mod_avmd/goertzel.c
+++ b/src/mod/applications/mod_avmd/goertzel.c
@@ -21,11 +21,11 @@ extern double goertzel(circ_buffer_t *b, size_t pos, double f, size_t num)
 
     coeff = 2.0 * cos(2.0 * M_PI * f);
 
-    for(i = 0; i < num; i++){
-	/* TODO: optimize to avoid GET_SAMPLE when possible */
-	s = GET_SAMPLE(b, i + pos) + (coeff * p) - p2;
-	p2 = p;
-	p = s;
+    for (i = 0; i < num; i++) {
+	    /* TODO: optimize to avoid GET_SAMPLE when possible */
+	    s = GET_SAMPLE(b, i + pos) + (coeff * p) - p2;
+	    p2 = p;
+	    p = s;
     }
 
     return (p2 * p2) + (p * p) - (coeff * p2 * p);
diff --git a/src/mod/applications/mod_avmd/mod_avmd.c b/src/mod/applications/mod_avmd/mod_avmd.c
index a4a1c07815..14848dcb6b 100644
--- a/src/mod/applications/mod_avmd/mod_avmd.c
+++ b/src/mod/applications/mod_avmd/mod_avmd.c
@@ -128,10 +128,10 @@ static switch_bool_t avmd_callback(switch_media_bug_t * bug, void *user_data, sw
 static void init_avmd_session_data(avmd_session_t *avmd_session,  switch_core_session_t *fs_session);
 
 
-/*! \brief The avmd session data initialization function
+/*! \brief The avmd session data initialization function.
  * @author Eric des Courtis
- * @param avmd_session A reference to a avmd session
- * @param fs_session A reference to a FreeSWITCH session
+ * @param avmd_session A reference to an avmd session.
+ * @param fs_session A reference to a FreeSWITCH session.
  */
 static void init_avmd_session_data(avmd_session_t *avmd_session,  switch_core_session_t *fs_session)
 {
@@ -159,7 +159,7 @@ static void init_avmd_session_data(avmd_session_t *avmd_session,  switch_core_se
 }
 
 
-/*! \brief The callback function that is called when new audio data becomes available
+/*! \brief The callback function that is called when new audio data becomes available.
  *
  * @author Eric des Courtis
  * @param bug A reference to the media bug.
@@ -203,7 +203,7 @@ static switch_bool_t avmd_callback(switch_media_bug_t * bug, void *user_data, sw
 	return SWITCH_TRUE;
 }
 
-/*! \brief FreeSWITCH module loading function
+/*! \brief FreeSWITCH module loading function.
  *
  * @author Eric des Courtis
  * @return Load success or failure.
@@ -255,7 +255,7 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_avmd_load)
 }
 
 /*! \brief FreeSWITCH application handler function.
- *  This handles calls made from applications such as LUA and the dialplan
+ *  This handles calls made from applications such as LUA and the dialplan.
  *
  * @author Eric des Courtis
  * @return Success or failure of the function.
@@ -321,7 +321,7 @@ SWITCH_STANDARD_APP(avmd_start_function)
 	switch_channel_set_private(channel, "_avmd_", bug);
 }
 
-/*! \brief Called when the module shuts down
+/*! \brief Called when the module shuts down.
  *
  * @author Eric des Courtis
  * @return The success or failure of the function.
@@ -476,10 +476,10 @@ end:
 	return SWITCH_STATUS_SUCCESS;
 }
 
-/*! \brief Process one frame of data with avmd algorithm
+/*! \brief Process one frame of data with avmd algorithm.
  * @author Eric des Courtis
- * @param session An avmd session
- * @param frame A audio frame
+ * @param session An avmd session.
+ * @param frame An audio frame.
  */
 static void avmd_process(avmd_session_t *session, switch_frame_t *frame)
 {
@@ -506,9 +506,7 @@ static void avmd_process(avmd_session_t *session, switch_frame_t *frame)
 	b = &session->b;
 
 	/*! If beep has already been detected skip the CPU heavy stuff */
-	if(session->state.beep_state == BEEP_DETECTED){
-		return;
-	}
+	if (session->state.beep_state == BEEP_DETECTED) return;
 
 	/*! Precompute values used heavily in the inner loop */
 	sine_len_i = SINE_LEN(session->rate);
@@ -523,12 +521,12 @@ static void avmd_process(avmd_session_t *session, switch_frame_t *frame)
 	//switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session->session), SWITCH_LOG_INFO, "<<< AVMD sine_len_i=%d >>>\n", sine_len_i);
 
 	/*! INNER LOOP -- OPTIMIZATION TARGET */
-	for(pos = session->pos; pos < (GET_CURRENT_POS(b) - P); pos++){
+	for (pos = session->pos; pos < (GET_CURRENT_POS(b) - P); pos++) {
 		if ((pos % sine_len_i) == 0) {
 			/*! Get a desa2 frequency estimate every sine len */
 			f = desa2(b, pos);
 
-			if(f < MIN_FREQUENCY_R(session->rate) || f > MAX_FREQUENCY_R(session->rate)) {
+			if (f < MIN_FREQUENCY_R(session->rate) || f > MAX_FREQUENCY_R(session->rate)) {
 				v = 99999.0;
 				RESET_SMA_BUFFER(&session->sma_b);
 				RESET_SMA_BUFFER(&session->sqa_b);
@@ -543,24 +541,20 @@ static void avmd_process(avmd_session_t *session, switch_frame_t *frame)
 			}
 
 			/*! If variance is less than threshold then we have detection */
-			if(v < VARIANCE_THRESHOLD){
+			if (v < VARIANCE_THRESHOLD) {
 
 				switch_channel_set_variable_printf(channel, "avmd_total_time", "%d", (int)(switch_micro_time_now() - session->start_time) / 1000);
 				switch_channel_execute_on(channel, "execute_on_avmd_beep");
 
 				/*! Throw an event to FreeSWITCH */
 				status = switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, AVMD_EVENT_BEEP);
-				if(status != SWITCH_STATUS_SUCCESS) {
-					return;
-				}
+				if (status != SWITCH_STATUS_SUCCESS) return;
 
 				switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "Beep-Status", "stop");
 				switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "Unique-ID", switch_core_session_get_uuid(session->session));
 				switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "call-command", "avmd");
 
-				if ((switch_event_dup(&event_copy, event)) != SWITCH_STATUS_SUCCESS) {
-					return;
-				}
+				if ((switch_event_dup(&event_copy, event)) != SWITCH_STATUS_SUCCESS) return;
 
 				switch_core_session_queue_event(session->session, &event);
 				switch_event_fire(&event_copy);
diff --git a/src/mod/applications/mod_callcenter/mod_callcenter.c b/src/mod/applications/mod_callcenter/mod_callcenter.c
index 2b0e28a5e7..1df0afd557 100644
--- a/src/mod/applications/mod_callcenter/mod_callcenter.c
+++ b/src/mod/applications/mod_callcenter/mod_callcenter.c
@@ -1,6 +1,6 @@
 /* 
  * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
- * Copyright (C) 2005-2014, Anthony Minessale II <anthm@freeswitch.org>
+ * Copyright (C) 2005-2016, Anthony Minessale II <anthm@freeswitch.org>
  *
  * Version: MPL 1.1
  *
@@ -444,6 +444,8 @@ struct cc_queue {
 	uint32_t max_wait_time;
 	uint32_t max_wait_time_with_no_agent;
 	uint32_t max_wait_time_with_no_agent_time_reached;
+	uint32_t calls_answered;
+	uint32_t calls_abandoned;
 
 	switch_mutex_t *mutex;
 
@@ -560,12 +562,15 @@ cc_queue_t *queue_set_config(cc_queue_t *queue)
 
 static int cc_execute_sql_affected_rows(char *sql) {
 	switch_cache_db_handle_t *dbh = NULL;
+	int res = 0; /* affected-row count handed back to the caller; -1 is returned instead when no DB handle is available */
 	if (!(dbh = cc_get_db_handle())) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error Opening DB\n");
 		return -1;
 	}
 	switch_cache_db_execute_sql(dbh, sql, NULL);
-	return switch_cache_db_affected_rows(dbh);
+	res = switch_cache_db_affected_rows(dbh); /* read the count while dbh is still held */
+	switch_cache_db_release_db_handle(&dbh); /* release the handle obtained from cc_get_db_handle() above */
+	return res;
 }
 
 char *cc_execute_sql2str(cc_queue_t *queue, switch_mutex_t *mutex, char *sql, char *resbuf, size_t len)
@@ -719,6 +724,8 @@ static cc_queue_t *load_queue(const char *queue_name)
 
 		queue->last_agent_exist = 0;
 		queue->last_agent_exist_check = 0;
+		queue->calls_answered = 0;
+		queue->calls_abandoned = 0;
 
 		switch_mutex_init(&queue->mutex, SWITCH_MUTEX_NESTED, queue->pool);
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Added queue %s\n", queue->name);
@@ -2675,6 +2682,38 @@ SWITCH_STANDARD_APP(callcenter_function)
 		cc_base_score_int += ((long) local_epoch_time_now(NULL) - atol(start_epoch));
 	}
 
+	/* for xml_cdr needs */
+	switch_channel_set_variable_printf(member_channel, "cc_queue_joined_epoch", "%" SWITCH_TIME_T_FMT, local_epoch_time_now(NULL));
+	switch_channel_set_variable(member_channel, "cc_queue", queue_name);
+
+	/* We have a previous abandoned user, let's try to recover his place */
+	if (abandoned_epoch > 0) {
+		char res[256];
+
+		switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(member_session), SWITCH_LOG_DEBUG, "Member %s <%s> restoring it previous position in queue %s\n", switch_str_nil(switch_channel_get_variable(member_channel, "caller_id_name")), switch_str_nil(switch_channel_get_variable(member_channel, "caller_id_number")), queue_name);
+
+		/* Update abandoned member */
+		sql = switch_mprintf("UPDATE members SET session_uuid = '%q', state = '%q', rejoined_epoch = '%" SWITCH_TIME_T_FMT "' WHERE uuid = '%q' AND state = '%q'",
+				member_session_uuid, cc_member_state2str(CC_MEMBER_STATE_WAITING), local_epoch_time_now(NULL), member_uuid, cc_member_state2str(CC_MEMBER_STATE_ABANDONED));
+		cc_execute_sql(queue, sql, NULL);
+		switch_safe_free(sql);
+
+		/* Confirm we took that member in */
+		sql = switch_mprintf("SELECT abandoned_epoch FROM members WHERE uuid = '%q' AND session_uuid = '%q' AND state = '%q' AND queue = '%q'", member_uuid, member_session_uuid, cc_member_state2str(CC_MEMBER_STATE_WAITING), queue_name);
+		cc_execute_sql2str(NULL, NULL, sql, res, sizeof(res));
+		switch_safe_free(sql);
+		abandoned_epoch = atol(res);
+
+		if (abandoned_epoch == 0) {
+			/* Failed to get the member !!! */
+			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(member_session), SWITCH_LOG_ERROR, "Member %s <%s> restoring action failed in queue %s, joining again\n", switch_str_nil(switch_channel_get_variable(member_channel, "caller_id_name")), switch_str_nil(switch_channel_get_variable(member_channel, "caller_id_number")), queue_name);
+			//queue_rwunlock(queue);
+		} else {
+
+		}
+
+	}
+
 	if (switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, CALLCENTER_EVENT) == SWITCH_STATUS_SUCCESS) {
 		switch_channel_event_set_data(member_channel, event);
 		switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "CC-Queue", queue_name);
@@ -2685,9 +2724,7 @@ SWITCH_STANDARD_APP(callcenter_function)
 		switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "CC-Member-CID-Number", switch_str_nil(switch_channel_get_variable(member_channel, "caller_id_number")));
 		switch_event_fire(&event);
 	}
-	/* for xml_cdr needs */
-	switch_channel_set_variable_printf(member_channel, "cc_queue_joined_epoch", "%" SWITCH_TIME_T_FMT, local_epoch_time_now(NULL));
-	switch_channel_set_variable(member_channel, "cc_queue", queue_name);
+
 
 	if (abandoned_epoch == 0) {
 		/* Add the caller to the member queue */
@@ -2709,30 +2746,6 @@ SWITCH_STANDARD_APP(callcenter_function)
 				cc_member_state2str(CC_MEMBER_STATE_WAITING));
 		cc_execute_sql(queue, sql, NULL);
 		switch_safe_free(sql);
-	} else {
-		char res[256];
-
-		switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(member_session), SWITCH_LOG_DEBUG, "Member %s <%s> restoring it previous position in queue %s\n", switch_str_nil(switch_channel_get_variable(member_channel, "caller_id_name")), switch_str_nil(switch_channel_get_variable(member_channel, "caller_id_number")), queue_name);
-
-		/* Update abandoned member */
-		sql = switch_mprintf("UPDATE members SET session_uuid = '%q', state = '%q', rejoined_epoch = '%" SWITCH_TIME_T_FMT "' WHERE uuid = '%q' AND state = '%q'",
-				member_session_uuid, cc_member_state2str(CC_MEMBER_STATE_WAITING), local_epoch_time_now(NULL), member_uuid, cc_member_state2str(CC_MEMBER_STATE_ABANDONED)); 
-		cc_execute_sql(queue, sql, NULL);
-		switch_safe_free(sql);
-
-		/* Confirm we took that member in */
-		sql = switch_mprintf("SELECT abandoned_epoch FROM members WHERE uuid = '%q' AND session_uuid = '%q' AND state = '%q' AND queue = '%q'", member_uuid, member_session_uuid, cc_member_state2str(CC_MEMBER_STATE_WAITING), queue_name);
-		cc_execute_sql2str(NULL, NULL, sql, res, sizeof(res));
-		switch_safe_free(sql);
-
-		if (atol(res) == 0) {
-			/* Failed to get the member !!! */
-			/* TODO Loop back to just create a uuid and add the member as a new member */
-			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(member_session), SWITCH_LOG_ERROR, "Member %s <%s> restoring action failed in queue %s, exiting\n", switch_str_nil(switch_channel_get_variable(member_channel, "caller_id_name")), switch_str_nil(switch_channel_get_variable(member_channel, "caller_id_number")), queue_name);
-			queue_rwunlock(queue);
-			goto end;
-		}
-
 	}
 
 	/* Send Event with queue count */
@@ -2868,6 +2881,7 @@ SWITCH_STANDARD_APP(callcenter_function)
 						  switch_str_nil(switch_channel_get_variable(member_channel, "caller_id_name")),
 						  switch_str_nil(switch_channel_get_variable(member_channel, "caller_id_number")),
 						  queue_name, cc_member_cancel_reason2str(h->member_cancel_reason));
+		queue->calls_abandoned++;
 
 	} else {
 		switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(member_session), SWITCH_LOG_DEBUG, "Member %s <%s> is answered by an agent in queue %s\n", switch_str_nil(switch_channel_get_variable(member_channel, "caller_id_name")), switch_str_nil(switch_channel_get_variable(member_channel, "caller_id_number")), queue_name);
@@ -2880,6 +2894,7 @@ SWITCH_STANDARD_APP(callcenter_function)
 
 		/* Update some channel variables for xml_cdr needs */
 		switch_channel_set_variable_printf(member_channel, "cc_cause", "%s", "answered");
+		queue->calls_answered++;
 
 	}
 
@@ -3297,7 +3312,12 @@ SWITCH_STANDARD_API(cc_config_api_function)
 			/* queue list */
 			if (argc-initial_argc < 1) {
 				switch_hash_index_t *hi;
-				stream->write_function(stream, "%s", "name|strategy|moh_sound|time_base_score|tier_rules_apply|tier_rule_wait_second|tier_rule_wait_multiply_level|tier_rule_no_agent_no_wait|discard_abandoned_after|abandoned_resume_allowed|max_wait_time|max_wait_time_with_no_agent|max_wait_time_with_no_agent_time_reached|record_template\n");
+				stream->write_function(stream, "%s",
+				                       "name|strategy|moh_sound|time_base_score|tier_rules_apply|"\
+				                       "tier_rule_wait_second|tier_rule_wait_multiply_level|"\
+				                       "tier_rule_no_agent_no_wait|discard_abandoned_after|"\
+				                       "abandoned_resume_allowed|max_wait_time|max_wait_time_with_no_agent|"\
+				                       "max_wait_time_with_no_agent_time_reached|record_template|calls_answered|calls_abandoned\n");
 				switch_mutex_lock(globals.mutex);
 				for (hi = switch_core_hash_first(globals.queue_hash); hi; hi = switch_core_hash_next(&hi)) {
 					void *val = NULL;
@@ -3306,7 +3326,23 @@ SWITCH_STANDARD_API(cc_config_api_function)
 					cc_queue_t *queue;
 					switch_core_hash_this(hi, &key, &keylen, &val);
 					queue = (cc_queue_t *) val;
-					stream->write_function(stream, "%s|%s|%s|%s|%s|%d|%s|%s|%d|%s|%d|%d|%d|%s\n", queue->name, queue->strategy, queue->moh, queue->time_base_score, (queue->tier_rules_apply?"true":"false"), queue->tier_rule_wait_second, (queue->tier_rule_wait_multiply_level?"true":"false"), (queue->tier_rule_no_agent_no_wait?"true":"false"), queue->discard_abandoned_after, (queue->abandoned_resume_allowed?"true":"false"), queue->max_wait_time, queue->max_wait_time_with_no_agent, queue->max_wait_time_with_no_agent_time_reached, queue->record_template);
+					stream->write_function(stream, "%s|%s|%s|%s|%s|%d|%s|%s|%d|%s|%d|%d|%d|%s|%d|%d\n",
+					                       queue->name,
+					                       queue->strategy,
+					                       queue->moh,
+					                       queue->time_base_score,
+					                       (queue->tier_rules_apply?"true":"false"),
+					                       queue->tier_rule_wait_second,
+					                       (queue->tier_rule_wait_multiply_level?"true":"false"),
+					                       (queue->tier_rule_no_agent_no_wait?"true":"false"),
+					                       queue->discard_abandoned_after,
+					                       (queue->abandoned_resume_allowed?"true":"false"),
+					                       queue->max_wait_time,
+					                       queue->max_wait_time_with_no_agent,
+					                       queue->max_wait_time_with_no_agent_time_reached,
+					                       queue->record_template,
+					                       queue->calls_answered,
+					                       queue->calls_abandoned);
 					queue = NULL;
 				}
 				switch_mutex_unlock(globals.mutex);
diff --git a/src/mod/applications/mod_commands/mod_commands.c b/src/mod/applications/mod_commands/mod_commands.c
index b97042f838..35a4965172 100644
--- a/src/mod/applications/mod_commands/mod_commands.c
+++ b/src/mod/applications/mod_commands/mod_commands.c
@@ -7377,6 +7377,8 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_commands_load)
 	switch_console_set_complete("add uuid_loglevel ::console::list_uuid debug");
 	switch_console_set_complete("add uuid_media ::console::list_uuid");
 	switch_console_set_complete("add uuid_media off ::console::list_uuid");
+	switch_console_set_complete("add uuid_media_3p ::console::list_uuid");
+	switch_console_set_complete("add uuid_media_3p off ::console::list_uuid");
 	switch_console_set_complete("add uuid_park ::console::list_uuid");
 	switch_console_set_complete("add uuid_media_reneg ::console::list_uuid");
 	switch_console_set_complete("add uuid_phone_event ::console::list_uuid talk");
diff --git a/src/mod/applications/mod_conference/conference_api.c b/src/mod/applications/mod_conference/conference_api.c
index 72422436fc..a6b3357187 100644
--- a/src/mod/applications/mod_conference/conference_api.c
+++ b/src/mod/applications/mod_conference/conference_api.c
@@ -71,7 +71,7 @@ api_command_t conference_api_sub_commands[] = {
 	{"unvmute", (void_fn_t) & conference_api_sub_unvmute, CONF_API_SUB_MEMBER_TARGET, "unvmute", "<[member_id|all]|last|non_moderator> [<quiet>]"},
 	{"deaf", (void_fn_t) & conference_api_sub_deaf, CONF_API_SUB_MEMBER_TARGET, "deaf", "<[member_id|all]|last|non_moderator>"},
 	{"undeaf", (void_fn_t) & conference_api_sub_undeaf, CONF_API_SUB_MEMBER_TARGET, "undeaf", "<[member_id|all]|last|non_moderator>"},
-	{"relate", (void_fn_t) & conference_api_sub_relate, CONF_API_SUB_ARGS_SPLIT, "relate", "<member_id> <other_member_id> [nospeak|nohear|clear]"},
+	{"relate", (void_fn_t) & conference_api_sub_relate, CONF_API_SUB_ARGS_SPLIT, "relate", "<member_id>[,<member_id>] <other_member_id>[,<other_member_id>] [nospeak|nohear|clear]"},
 	{"lock", (void_fn_t) & conference_api_sub_lock, CONF_API_SUB_ARGS_SPLIT, "lock", ""},
 	{"unlock", (void_fn_t) & conference_api_sub_unlock, CONF_API_SUB_ARGS_SPLIT, "unlock", ""},
 	{"agc", (void_fn_t) & conference_api_sub_agc, CONF_API_SUB_ARGS_SPLIT, "agc", ""},
@@ -1013,6 +1013,10 @@ switch_status_t conference_api_sub_vid_personal(conference_obj_t *conference, sw
 	if (argv[2]) {
 		on = switch_true(argv[2]);
 		if (on) {
+			if (conference->record_count > 0) {
+				stream->write_function(stream, "-ERR conference is recording, not enabling vid-personal.\n");
+				return SWITCH_STATUS_SUCCESS;
+			}
 			conference_utils_set_flag(conference, CFLAG_PERSONAL_CANVAS);
 		} else {
 			conference_utils_clear_flag(conference, CFLAG_PERSONAL_CANVAS);
@@ -1937,35 +1941,134 @@ switch_status_t conference_api_sub_stop(conference_obj_t *conference, switch_str
 	return SWITCH_STATUS_SUCCESS;
 }
 
+/* Write one "<id> -> <other id> <flags>" line to stream per relationship; member_id 0 lists every member. */
+void _conference_api_sub_relate_show_member_relationships(conference_obj_t *conference, switch_stream_handle_t *stream, uint32_t member_id)
+{
+	conference_member_t *member;
+	for (member = conference->members; member; member = member->next) {
+		conference_relationship_t *rel;
+		if (member_id > 0 && member->id != member_id) continue;
+		/* ids are printed with %u like the other relate messages (member_id is a uint32_t) */
+		for (rel = member->relationships; rel; rel = rel->next) {
+			stream->write_function(stream, "%u -> %u %s%s%s\n", member->id, rel->id,
+								   (rel->flags & RFLAG_CAN_SPEAK) ? "SPEAK " : "NOSPEAK ",
+								   (rel->flags & RFLAG_CAN_HEAR) ? "HEAR " : "NOHEAR ",
+								   (rel->flags & RFLAG_CAN_SEND_VIDEO) ? "SENDVIDEO " : "NOSENDVIDEO ");
+		}
+	}
+}
+
+/* Drop the id -> oid relationship, clear MFLAG_RECEIVING_VIDEO on the target if set, and report on stream. */
+void _conference_api_sub_relate_clear_member_relationship(conference_obj_t *conference, switch_stream_handle_t *stream, uint32_t id, uint32_t oid)
+{
+	conference_member_t *source = conference_member_get(conference, id);
+	conference_member_t *target = NULL;
+	if (!source) {
+		stream->write_function(stream, "relationship %u->%u not found.\n", id, oid);
+		return;
+	}
+	conference_member_del_relationship(source, oid);
+	if ((target = conference_member_get(conference, oid))) {
+		if (conference_utils_member_test_flag(target, MFLAG_RECEIVING_VIDEO)) {
+			conference_utils_member_clear_flag(target, MFLAG_RECEIVING_VIDEO);
+			if (conference->floor_holder) {
+				switch_core_session_request_video_refresh(conference->floor_holder->session);
+			}
+		}
+		/* presumably conference_member_get() hands the member back with its rwlock held - confirm */
+		switch_thread_rwlock_unlock(target->rwlock);
+	}
+	stream->write_function(stream, "relationship %u->%u cleared.\n", id, oid);
+	switch_thread_rwlock_unlock(source->rwlock);
+}
+
+/* Create or reset the id -> oid relationship, apply nospeak/nohear/sendvideo, and report the result on stream. */
+void _conference_api_sub_relate_set_member_relationship(conference_obj_t *conference, switch_stream_handle_t *stream, uint32_t id, uint32_t oid, uint8_t nospeak, uint8_t nohear, uint8_t sendvideo, char *action)
+{
+	conference_member_t *member = NULL, *other_member = NULL;
+
+	if ((member = conference_member_get(conference, id))) {
+		other_member = conference_member_get(conference, oid);
+	}
+
+	if (member && other_member) {
+		conference_relationship_t *rel = NULL;
+		/* A pure sendvideo request is refused when the target is already flagged as receiving video; the reply ends in a newline so replies for several id pairs do not run together. */
+		if (sendvideo && conference_utils_member_test_flag(other_member, MFLAG_RECEIVING_VIDEO) && (! (nospeak || nohear))) {
+			stream->write_function(stream, "member %u already receiving video\n", oid);
+			goto skip;
+		}
+		/* Reuse an existing relationship with its flags wiped, otherwise add a new one. */
+		if ((rel = conference_member_get_relationship(member, other_member))) {
+			rel->flags = 0;
+		} else {
+			rel = conference_member_add_relationship(member, oid);
+		}
+		/* Start from speak+hear and strip whatever the requested action forbids. */
+		if (rel) {
+			switch_set_flag(rel, RFLAG_CAN_SPEAK | RFLAG_CAN_HEAR);
+			if (nospeak) {
+				switch_clear_flag(rel, RFLAG_CAN_SPEAK);
+				conference_utils_member_clear_flag_locked(member, MFLAG_TALKING);
+			}
+			if (nohear) {
+				switch_clear_flag(rel, RFLAG_CAN_HEAR);
+			}
+			if (sendvideo) {
+				switch_set_flag(rel, RFLAG_CAN_SEND_VIDEO);
+				conference_utils_member_set_flag(other_member, MFLAG_RECEIVING_VIDEO);
+				switch_core_session_request_video_refresh(member->session);
+			}
+
+			stream->write_function(stream, "ok %u->%u %s set\n", id, oid, action);
+		} else {
+			stream->write_function(stream, "error!\n");
+		}
+	} else {
+		stream->write_function(stream, "relationship %u->%u not found.\n", id, oid);
+	}
+
+skip:
+	if (member) {
+		switch_thread_rwlock_unlock(member->rwlock);
+	}
+
+	if (other_member) {
+		switch_thread_rwlock_unlock(other_member->rwlock);
+	}
+}
+
 switch_status_t conference_api_sub_relate(conference_obj_t *conference, switch_stream_handle_t *stream, int argc, char **argv)
 {
 	uint8_t nospeak = 0, nohear = 0, sendvideo = 0, clear = 0;
+	int members = 0;
+	int other_members = 0;
+	char *members_array[100] = { 0 };
+	char *other_members_array[100] = { 0 };
+	char *lbuf_members = NULL, *lbuf_other_members = NULL, *action = NULL;
 
 	switch_assert(conference != NULL);
 	switch_assert(stream != NULL);
 
 	if (argc <= 3) {
-		conference_member_t *member;
-
 		switch_mutex_lock(conference->mutex);
 
 		if (conference->relationship_total) {
-			uint32_t member_id = 0;
-
-			if (argc == 3) member_id = atoi(argv[2]);
-
-			for (member = conference->members; member; member = member->next) {
-				conference_relationship_t *rel;
-
-				if (member_id > 0 && member->id != member_id) continue;
-
-				for (rel = member->relationships; rel; rel = rel->next) {
-					stream->write_function(stream, "%d -> %d %s%s%s\n", member->id, rel->id,
-										   (rel->flags & RFLAG_CAN_SPEAK) ? "SPEAK " : "NOSPEAK ",
-										   (rel->flags & RFLAG_CAN_HEAR) ? "HEAR " : "NOHEAR ",
-										   (rel->flags & RFLAG_CAN_SEND_VIDEO) ? "SENDVIDEO " : "NOSENDVIDEO ");
+			if (argc == 3) {
+				/* argv[2] may be a comma separated member id list; split a private copy of it */
+				char *lbuf = strdup(argv[2]);
+				int i;
+
+				members = lbuf ? switch_separate_string(lbuf, ',', members_array, (sizeof(members_array) / sizeof(members_array[0]))) : 0;
+				for (i = 0; i < members && members_array[i]; i++) {
+					_conference_api_sub_relate_show_member_relationships(conference, stream, (uint32_t) atoi(members_array[i]));
 				}
+				switch_safe_free(lbuf);
+			} else {
+				/* No member id given: list the relationships of every member (id 0 selects all), as before this change. */
+				_conference_api_sub_relate_show_member_relationships(conference, stream, 0);
 			}
+
 		} else {
 			stream->write_function(stream, "No relationships\n");
 		}
@@ -1988,88 +2091,30 @@ switch_status_t conference_api_sub_relate(conference_obj_t *conference, switch_s
 		return SWITCH_STATUS_GENERR;
 	}
 
-	if (clear) {
-		conference_member_t *member = NULL, *other_member = NULL;
-		uint32_t id = atoi(argv[2]);
-		uint32_t oid = atoi(argv[3]);
-
-		if ((member = conference_member_get(conference, id))) {
-			conference_member_del_relationship(member, oid);
-			other_member = conference_member_get(conference, oid);
-
-			if (other_member) {
-				if (conference_utils_member_test_flag(other_member, MFLAG_RECEIVING_VIDEO)) {
-					conference_utils_member_clear_flag(other_member, MFLAG_RECEIVING_VIDEO);
-					if (conference->floor_holder) {
-						switch_core_session_request_video_refresh(conference->floor_holder->session);
-					}
+	lbuf_members = strdup(argv[2]);
+	lbuf_other_members = strdup(argv[3]);
+	action = strdup(argv[4]);
+	members = switch_separate_string(lbuf_members, ',', members_array, (sizeof(members_array) / sizeof(members_array[0])));
+	other_members = switch_separate_string(lbuf_other_members, ',', other_members_array, (sizeof(other_members_array) / sizeof(other_members_array[0])));
+	if (members && other_members) {
+		int i, i2;
+		uint32_t member_id, other_member_id;
+		for (i = 0; i < members && members_array[i]; i++) {
+			member_id = atoi(members_array[i]);
+			for (i2 = 0; i2 < other_members && other_members_array[i2]; i2++) {
+				other_member_id = atoi(other_members_array[i2]);
+				if (clear) {
+					_conference_api_sub_relate_clear_member_relationship(conference, stream, member_id, other_member_id);
 				}
-				switch_thread_rwlock_unlock(other_member->rwlock);
-			}
-
-			stream->write_function(stream, "relationship %u->%u cleared.\n", id, oid);
-			switch_thread_rwlock_unlock(member->rwlock);
-		} else {
-			stream->write_function(stream, "relationship %u->%u not found.\n", id, oid);
-		}
-		return SWITCH_STATUS_SUCCESS;
-	}
-
-	if (nospeak || nohear || sendvideo) {
-		conference_member_t *member = NULL, *other_member = NULL;
-		uint32_t id = atoi(argv[2]);
-		uint32_t oid = atoi(argv[3]);
-
-		if ((member = conference_member_get(conference, id))) {
-			other_member = conference_member_get(conference, oid);
-		}
-
-		if (member && other_member) {
-			conference_relationship_t *rel = NULL;
-
-			if (sendvideo && conference_utils_member_test_flag(other_member, MFLAG_RECEIVING_VIDEO) && (! (nospeak || nohear))) {
-				stream->write_function(stream, "member %d already receiving video", oid);
-				goto skip;
-			}
-
-			if ((rel = conference_member_get_relationship(member, other_member))) {
-				rel->flags = 0;
-			} else {
-				rel = conference_member_add_relationship(member, oid);
-			}
-
-			if (rel) {
-				switch_set_flag(rel, RFLAG_CAN_SPEAK | RFLAG_CAN_HEAR);
-				if (nospeak) {
-					switch_clear_flag(rel, RFLAG_CAN_SPEAK);
-					conference_utils_member_clear_flag_locked(member, MFLAG_TALKING);
+				if (nospeak || nohear || sendvideo) {
+					_conference_api_sub_relate_set_member_relationship(conference, stream, member_id, other_member_id, nospeak, nohear, sendvideo, action);
 				}
-				if (nohear) {
-					switch_clear_flag(rel, RFLAG_CAN_HEAR);
-				}
-				if (sendvideo) {
-					switch_set_flag(rel, RFLAG_CAN_SEND_VIDEO);
-					conference_utils_member_set_flag(other_member, MFLAG_RECEIVING_VIDEO);
-					switch_core_session_request_video_refresh(member->session);
-				}
-
-				stream->write_function(stream, "ok %u->%u %s set\n", id, oid, argv[4]);
-			} else {
-				stream->write_function(stream, "error!\n");
 			}
-		} else {
-			stream->write_function(stream, "relationship %u->%u not found.\n", id, oid);
-		}
-
-	skip:
-		if (member) {
-			switch_thread_rwlock_unlock(member->rwlock);
-		}
-
-		if (other_member) {
-			switch_thread_rwlock_unlock(other_member->rwlock);
 		}
 	}
+	switch_safe_free(lbuf_members);
+	switch_safe_free(lbuf_other_members);
+	switch_safe_free(action);
 
 	return SWITCH_STATUS_SUCCESS;
 }
@@ -2370,6 +2415,11 @@ switch_status_t conference_api_sub_record(conference_obj_t *conference, switch_s
 		return SWITCH_STATUS_GENERR;
 	}
 
+	if (conference_utils_test_flag(conference, CFLAG_PERSONAL_CANVAS)) {
+		stream->write_function(stream, "-ERR Personal Canvas enabled, recording not permitted.\n");
+		return SWITCH_STATUS_SUCCESS;
+	}
+
 	if (argv[3]) {
 
 		if (argv[3]) {
diff --git a/src/mod/applications/mod_conference/conference_file.c b/src/mod/applications/mod_conference/conference_file.c
index 8ca5b0d754..6cf27559a3 100644
--- a/src/mod/applications/mod_conference/conference_file.c
+++ b/src/mod/applications/mod_conference/conference_file.c
@@ -91,7 +91,7 @@ switch_status_t conference_file_close(conference_obj_t *conference, conference_f
 		conference_al_close(node->al);
 	}
 #endif
-	if (conference->playing_video_file && switch_core_file_has_video(&node->fh) && conference->canvases[0] && node->canvas_id > -1) {
+	if (conference->playing_video_file && switch_core_file_has_video(&node->fh, SWITCH_FALSE) && conference->canvases[0] && node->canvas_id > -1) {
 		if (conference->canvases[node->canvas_id]->timer.timer_interface) {
 			conference->canvases[node->canvas_id]->timer.interval = conference->video_fps.ms;
 			conference->canvases[node->canvas_id]->timer.samples = conference->video_fps.samples;
diff --git a/src/mod/applications/mod_conference/conference_member.c b/src/mod/applications/mod_conference/conference_member.c
index 827812d162..f0d1c2c7a7 100644
--- a/src/mod/applications/mod_conference/conference_member.c
+++ b/src/mod/applications/mod_conference/conference_member.c
@@ -682,7 +682,7 @@ switch_status_t conference_member_add(conference_obj_t *conference, conference_m
 	call_list_t *call_list = NULL;
 	switch_channel_t *channel;
 	const char *controls = NULL, *position = NULL, *var = NULL;
-
+	switch_bool_t has_video = switch_core_has_video();
 
 	switch_assert(conference != NULL);
 	switch_assert(member != NULL);
@@ -760,29 +760,31 @@ switch_status_t conference_member_add(conference_obj_t *conference, conference_m
 
 		conference_video_reset_member_codec_index(member);
 
-		if ((var = switch_channel_get_variable_dup(member->channel, "video_mute_png", SWITCH_FALSE, -1))) {
-			member->video_mute_png = switch_core_strdup(member->pool, var);
-			member->video_mute_img = switch_img_read_png(member->video_mute_png, SWITCH_IMG_FMT_I420);
-		}
+		if (has_video) {
+			if ((var = switch_channel_get_variable_dup(member->channel, "video_mute_png", SWITCH_FALSE, -1))) {
+				member->video_mute_png = switch_core_strdup(member->pool, var);
+				member->video_mute_img = switch_img_read_png(member->video_mute_png, SWITCH_IMG_FMT_I420);
+			}
 
-		if ((var = switch_channel_get_variable_dup(member->channel, "video_reservation_id", SWITCH_FALSE, -1))) {
-			member->video_reservation_id = switch_core_strdup(member->pool, var);
-		}
+			if ((var = switch_channel_get_variable_dup(member->channel, "video_reservation_id", SWITCH_FALSE, -1))) {
+				member->video_reservation_id = switch_core_strdup(member->pool, var);
+			}
 
-		if ((var = switch_channel_get_variable(channel, "video_use_dedicated_encoder")) && switch_true(var)) {
-			conference_utils_member_set_flag_locked(member, MFLAG_NO_MINIMIZE_ENCODING);
-		}
-
-		if ((var = switch_channel_get_variable(member->channel, "rtp_video_max_bandwidth_in"))) {
-			member->max_bw_in = switch_parse_bandwidth_string(var);
-		}
-		
-		if ((var = switch_channel_get_variable(member->channel, "rtp_video_max_bandwidth_out"))) {
-			member->max_bw_out = switch_parse_bandwidth_string(var);;
-
-			if (member->max_bw_out < conference->video_codec_settings.video.bandwidth) {
+			if ((var = switch_channel_get_variable(channel, "video_use_dedicated_encoder")) && switch_true(var)) {
 				conference_utils_member_set_flag_locked(member, MFLAG_NO_MINIMIZE_ENCODING);
-				switch_core_media_set_outgoing_bitrate(member->session, SWITCH_MEDIA_TYPE_VIDEO, member->max_bw_out);
+			}
+
+			if ((var = switch_channel_get_variable(member->channel, "rtp_video_max_bandwidth_in"))) {
+				member->max_bw_in = switch_parse_bandwidth_string(var);
+			}
+		
+			if ((var = switch_channel_get_variable(member->channel, "rtp_video_max_bandwidth_out"))) {
+				member->max_bw_out = switch_parse_bandwidth_string(var);
+
+				if (member->max_bw_out < conference->video_codec_settings.video.bandwidth) {
+					conference_utils_member_set_flag_locked(member, MFLAG_NO_MINIMIZE_ENCODING);
+					switch_core_media_set_outgoing_bitrate(member->session, SWITCH_MEDIA_TYPE_VIDEO, member->max_bw_out);
+				}
 			}
 		}
 		
@@ -1636,7 +1638,7 @@ int conference_member_setup_media(conference_member_t *member, conference_obj_t
 		switch_resample_destroy(&member->read_resampler);
 	}
 
-	switch_core_session_get_read_impl(member->session, &member->orig_read_impl);
+	switch_core_session_get_real_read_impl(member->session, &member->orig_read_impl);
 	member->native_rate = member->orig_read_impl.samples_per_second;
 
 	/* Setup a Signed Linear codec for reading audio. */
diff --git a/src/mod/applications/mod_conference/conference_record.c b/src/mod/applications/mod_conference/conference_record.c
index 37d11178c3..2ff3ae4dbd 100644
--- a/src/mod/applications/mod_conference/conference_record.c
+++ b/src/mod/applications/mod_conference/conference_record.c
@@ -60,6 +60,11 @@ void conference_record_launch_thread(conference_obj_t *conference, char *path, i
 		return;
 	}
 
+	if (conference_utils_test_flag(conference, CFLAG_PERSONAL_CANVAS)) {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Personal Canvas enabled, recording not permitted.\n");
+		return;
+	}
+
 	rec->conference = conference;
 	rec->path = switch_core_strdup(pool, path);
 	rec->pool = pool;
diff --git a/src/mod/applications/mod_conference/conference_video.c b/src/mod/applications/mod_conference/conference_video.c
index 031f8f3357..2eb91fb95c 100644
--- a/src/mod/applications/mod_conference/conference_video.c
+++ b/src/mod/applications/mod_conference/conference_video.c
@@ -449,6 +449,7 @@ void conference_video_scale_and_patch(mcu_layer_t *layer, switch_image_t *ximg,
 		
 		if (layer->banner_img && !layer->banner_patched) {
 			switch_img_fill(layer->canvas->img, layer->x_pos + layer->geometry.border, layer->y_pos + layer->geometry.border, layer->screen_w, layer->screen_h, &layer->canvas->letterbox_bgcolor);
+			switch_img_fit(&layer->banner_img, layer->screen_w, layer->screen_h, SWITCH_FIT_SIZE);
 			switch_img_patch(IMG, layer->banner_img, layer->x_pos + layer->geometry.border, layer->y_pos + (layer->screen_h - layer->banner_img->d_h) + layer->geometry.border);
 
 			if (!freeze) {
@@ -1183,7 +1184,7 @@ switch_status_t conference_video_init_canvas(conference_obj_t *conference, video
 	return SWITCH_STATUS_SUCCESS;
 }
 
-int conference_video_flush_queue(switch_queue_t *q)
+int conference_video_flush_queue(switch_queue_t *q, int min)
 {
 	switch_image_t *img;
 	void *pop;
@@ -1191,7 +1192,7 @@ int conference_video_flush_queue(switch_queue_t *q)
 
 	if (!q) return 0;
 
-	while (switch_queue_size(q) > 1 && switch_queue_trypop(q, &pop) == SWITCH_STATUS_SUCCESS && pop) {
+	while (switch_queue_size(q) > min && switch_queue_trypop(q, &pop) == SWITCH_STATUS_SUCCESS && pop) {
 		img = (switch_image_t *)pop;
 		switch_img_free(&img);
 		r++;
@@ -1207,7 +1208,7 @@ void conference_video_destroy_canvas(mcu_canvas_t **canvasP) {
 
 	switch_img_free(&canvas->img);
 	switch_img_free(&canvas->bgimg);
-	conference_video_flush_queue(canvas->video_queue);
+	conference_video_flush_queue(canvas->video_queue, 0);
 
 	for (i = 0; i < MCU_MAX_LAYERS; i++) {
 		switch_img_free(&canvas->layers[i].img);
@@ -1486,6 +1487,8 @@ void *SWITCH_THREAD_FUNC conference_video_muxing_write_thread_run(switch_thread_
 		return NULL;
 	}
 
+	switch_core_autobind_cpu();
+
 	while(conference_utils_member_test_flag(member, MFLAG_RUNNING)) {
 		if (switch_queue_pop(member->mux_out_queue, &pop) == SWITCH_STATUS_SUCCESS) {
 			mcu_layer_t *layer = NULL;
@@ -1584,7 +1587,7 @@ void conference_video_check_recording(conference_obj_t *conference, mcu_canvas_t
 			continue;
 		}
 
-		if (switch_test_flag((&imember->rec->fh), SWITCH_FILE_OPEN) && switch_core_file_has_video(&imember->rec->fh)) {
+		if (switch_test_flag((&imember->rec->fh), SWITCH_FILE_OPEN) && switch_core_file_has_video(&imember->rec->fh, SWITCH_TRUE)) {
 			switch_core_file_write_video(&imember->rec->fh, frame);
 		}
 	}
@@ -1658,7 +1661,7 @@ void conference_video_check_flush(conference_member_t *member)
 		return;
 	}
 
-	flushed = conference_video_flush_queue(member->video_queue);
+	flushed = conference_video_flush_queue(member->video_queue, 1);
 
 	if (flushed && member->auto_avatar) {
 		switch_channel_video_sync(member->channel);
@@ -1694,7 +1697,7 @@ void conference_video_patch_fnode(mcu_canvas_t *canvas, conference_file_node_t *
 void conference_video_fnode_check(conference_file_node_t *fnode, int canvas_id) {
 	mcu_canvas_t *canvas = NULL;
 	
-	if (switch_core_file_has_video(&fnode->fh) && switch_core_file_read_video(&fnode->fh, NULL, SVR_CHECK) == SWITCH_STATUS_BREAK) {
+	if (switch_core_file_has_video(&fnode->fh, SWITCH_TRUE) && switch_core_file_read_video(&fnode->fh, NULL, SVR_CHECK) == SWITCH_STATUS_BREAK) {
 		int full_screen = 0;
 		char *res_id = NULL;
 
@@ -1828,7 +1831,7 @@ void conference_video_pop_next_image(conference_member_t *member, switch_image_t
 				break;
 			}
 			size = switch_queue_size(member->video_queue);
-		} while(size > 1);
+		} while(size > 0);
 
 		if (conference_utils_member_test_flag(member, MFLAG_CAN_BE_SEEN) && member->video_layer_id > -1 && switch_core_session_media_flow(member->session, SWITCH_MEDIA_TYPE_VIDEO) != SWITCH_MEDIA_FLOW_SENDONLY) {
 			if (img) {
@@ -2218,13 +2221,13 @@ void *SWITCH_THREAD_FUNC conference_video_muxing_thread_run(switch_thread_t *thr
 			do_refresh = 100;
 		}
 
-		if (conference->async_fnode && switch_core_file_has_video(&conference->async_fnode->fh)) {
+		if (conference->async_fnode && switch_core_file_has_video(&conference->async_fnode->fh, SWITCH_TRUE)) {
 			check_async_file = 1;
 			file_count++;
 			files_playing = 1;
 		}
 
-		if (conference->fnode && switch_core_file_has_video(&conference->fnode->fh)) {
+		if (conference->fnode && switch_core_file_has_video(&conference->fnode->fh, SWITCH_TRUE)) {
 			check_file = 1;
 			file_count++;
 			files_playing = 1;
diff --git a/src/mod/applications/mod_conference/mod_conference.c b/src/mod/applications/mod_conference/mod_conference.c
index b62061967b..1e9902a048 100644
--- a/src/mod/applications/mod_conference/mod_conference.c
+++ b/src/mod/applications/mod_conference/mod_conference.c
@@ -3134,7 +3134,7 @@ conference_obj_t *conference_new(char *name, conference_xml_cfg_t cfg, switch_co
 
 	if (video_canvas_count < 1) video_canvas_count = 1;
 
-	if (conference_utils_test_flag(conference, CFLAG_PERSONAL_CANVAS) && video_canvas_count) {
+	if (conference_utils_test_flag(conference, CFLAG_PERSONAL_CANVAS) && video_canvas_count > 1) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Personal Canvas and Multi-Canvas modes are not compatable. 1 canvas will be used.\n");
 		video_canvas_count = 1;
 	}
diff --git a/src/mod/applications/mod_cv/Makefile.am b/src/mod/applications/mod_cv/Makefile.am
index 6386d45971..a01bc92569 100644
--- a/src/mod/applications/mod_cv/Makefile.am
+++ b/src/mod/applications/mod_cv/Makefile.am
@@ -1,8 +1,6 @@
 include $(top_srcdir)/build/modmake.rulesam
 MODNAME=mod_cv
 
-if HAVE_YUV
-if HAVE_VPX
 if HAVE_OPENCV
 
 mod_LTLIBRARIES = mod_cv.la
@@ -17,19 +15,3 @@ all: error
 error:
 	$(error You must install libopencv-dev to build mod_cv)
 endif
-
-else
-install: error
-all: error
-error:
-	$(error You must install libvpx2-dev to build mod_cv)
-endif
-
-
-else
-install: error
-all: error
-error:
-	$(error You must install libyuv-dev to build mod_cv)
-endif
-
diff --git a/src/mod/applications/mod_cv/mod_cv.cpp b/src/mod/applications/mod_cv/mod_cv.cpp
index 9f5d95dac0..5183adc40c 100644
--- a/src/mod/applications/mod_cv/mod_cv.cpp
+++ b/src/mod/applications/mod_cv/mod_cv.cpp
@@ -39,7 +39,6 @@ using namespace std;
 using namespace cv;
 
 #include <switch.h>
-#include <libyuv.h>
 
 #include <cv.h>
 #include "cvaux.h"
@@ -724,12 +723,7 @@ static switch_status_t video_thread_callback(switch_core_session_t *session, swi
             switch_assert(context->rawImage->width * 3 == context->rawImage->widthStep);
         }
 
-        libyuv::I420ToRGB24(frame->img->planes[0], frame->img->stride[0],
-                            frame->img->planes[1], frame->img->stride[1],
-                            frame->img->planes[2], frame->img->stride[2],
-                            (uint8_t *)context->rawImage->imageData, context->rawImage->widthStep,
-                            context->rawImage->width, context->rawImage->height);
-
+        switch_img_to_raw(frame->img, context->rawImage->imageData, context->rawImage->widthStep * context->h, SWITCH_IMG_FMT_RGB24);
         detectAndDraw(context);
 
         if (context->detected.simo_count > 20) {
@@ -824,11 +818,7 @@ static switch_status_t video_thread_callback(switch_core_session_t *session, swi
     }
 
     if (context->rawImage && (context->debug || !context->overlay_count)) {
-        libyuv::RGB24ToI420((uint8_t *)context->rawImage->imageData, context->w * 3,
-                            frame->img->planes[0], frame->img->stride[0],
-                            frame->img->planes[1], frame->img->stride[1],
-                            frame->img->planes[2], frame->img->stride[2],
-                            context->rawImage->width, context->rawImage->height);
+        switch_img_from_raw(frame->img, (uint8_t *)context->rawImage->imageData, SWITCH_IMG_FMT_RGB24, context->rawImage->width, context->rawImage->height);
     }
 
     int abs = 0;
diff --git a/src/mod/applications/mod_dptools/mod_dptools.c b/src/mod/applications/mod_dptools/mod_dptools.c
index cf31deea75..d071321ddb 100644
--- a/src/mod/applications/mod_dptools/mod_dptools.c
+++ b/src/mod/applications/mod_dptools/mod_dptools.c
@@ -1356,6 +1356,27 @@ SWITCH_STANDARD_APP(redirect_function)
 	switch_core_session_receive_message(session, &msg);
 }
 
+/* Dialplan app handler: "on" or "wait" sets CF_VIDEO_DECODED_READ on the channel, any other argument clears it. */
+SWITCH_STANDARD_APP(video_set_decode_function)
+{
+	switch_channel_t *chan = switch_core_session_get_channel(session);
+	const char *arg = (const char *) data;
+	int enable = 0, block = 0;
+
+	if (arg) {
+		block = (strcasecmp(arg, "wait") == 0);
+		enable = block || (strcasecmp(arg, "on") == 0);
+	}
+
+	if (!enable) {
+		switch_channel_clear_flag_recursive(chan, CF_VIDEO_DECODED_READ);
+		return;
+	}
+	switch_channel_set_flag_recursive(chan, CF_VIDEO_DECODED_READ);
+	/* "wait" also waits for the video input params; 10000 is presumably a timeout in ms - confirm */
+	if (block) switch_core_session_wait_for_video_input_params(session, 10000);
+}
+
 SWITCH_STANDARD_APP(video_refresh_function)
 {
 	switch_core_session_message_t msg = { 0 };
@@ -2808,6 +2829,7 @@ SWITCH_STANDARD_APP(playback_function)
 	switch_channel_set_variable(channel, SWITCH_PLAYBACK_TERMINATOR_USED, "");
 
 	status = switch_ivr_play_file(session, &fh, file, &args);
+	switch_assert(!(fh.flags & SWITCH_FILE_OPEN));
 
 	switch (status) {
 	case SWITCH_STATUS_SUCCESS:
@@ -4780,9 +4802,9 @@ static switch_status_t next_file(switch_file_handle_t *handle)
 
 
 	if (switch_test_flag((&context->fh), SWITCH_FILE_NATIVE)) {
-		switch_set_flag(handle, SWITCH_FILE_NATIVE);
+		switch_set_flag_locked(handle, SWITCH_FILE_NATIVE);
 	} else {
-		switch_clear_flag(handle, SWITCH_FILE_NATIVE);
+		switch_clear_flag_locked(handle, SWITCH_FILE_NATIVE);
 	}
 
 
@@ -5001,9 +5023,9 @@ static switch_status_t file_url_file_open(switch_file_handle_t *handle, const ch
 		handle->max_samples = 0;
 
 		if (switch_test_flag(fh, SWITCH_FILE_NATIVE)) {
-			switch_set_flag(handle, SWITCH_FILE_NATIVE);
+			switch_set_flag_locked(handle, SWITCH_FILE_NATIVE);
 		} else {
-			switch_clear_flag(handle, SWITCH_FILE_NATIVE);
+			switch_clear_flag_locked(handle, SWITCH_FILE_NATIVE);
 		}
 	}
 	return status;
@@ -6148,6 +6170,8 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_dptools_load)
 				   SAF_SUPPORT_NOMEDIA);
 	SWITCH_ADD_APP(app_interface, "video_refresh", "Send video refresh.", "Send video refresh.", video_refresh_function, "",
 				   SAF_SUPPORT_NOMEDIA);
+	SWITCH_ADD_APP(app_interface, "video_decode", "Set video decode.", "Set video decode.", video_set_decode_function, "[[on|wait]|off]",
+				   SAF_NONE);
 	SWITCH_ADD_APP(app_interface, "send_info", "Send info", "Send info", send_info_function, "<info>", SAF_SUPPORT_NOMEDIA);
 	SWITCH_ADD_APP(app_interface, "jitterbuffer", "Send session jitterbuffer", "Send a jitterbuffer message to a session.", 
 				   jitterbuffer_function, "<jitterbuffer_data>", SAF_SUPPORT_NOMEDIA);
diff --git a/src/mod/applications/mod_fsv/Makefile.am b/src/mod/applications/mod_fsv/Makefile.am
index 3b21b98405..dd75a2f081 100644
--- a/src/mod/applications/mod_fsv/Makefile.am
+++ b/src/mod/applications/mod_fsv/Makefile.am
@@ -1,28 +1,8 @@
 include $(top_srcdir)/build/modmake.rulesam
 MODNAME=mod_fsv
 
-if HAVE_YUV
-if HAVE_VPX
-
 mod_LTLIBRARIES = mod_fsv.la
 mod_fsv_la_SOURCES  = mod_fsv.c
 mod_fsv_la_CFLAGS   = $(AM_CFLAGS)
 mod_fsv_la_LIBADD   = $(switch_builddir)/libfreeswitch.la
 mod_fsv_la_LDFLAGS  = -avoid-version -module -no-undefined -shared
-
-else
-install: error
-all: error
-error:
-	$(error You must install libvpx2-dev to build mod_fsv)
-endif
-
-
-else
-install: error
-all: error
-error:
-	$(error You must install libyuv-dev to build mod_fsv)
-endif
-
-
diff --git a/src/mod/applications/mod_httapi/mod_httapi.c b/src/mod/applications/mod_httapi/mod_httapi.c
index 62055d736f..87f2203742 100644
--- a/src/mod/applications/mod_httapi/mod_httapi.c
+++ b/src/mod/applications/mod_httapi/mod_httapi.c
@@ -2986,9 +2986,9 @@ static switch_status_t file_open(switch_file_handle_t *handle, const char *path,
 	handle->flags |= SWITCH_FILE_NOMUX;
 
 	if (switch_test_flag((&context->fh), SWITCH_FILE_NATIVE)) {
-		switch_set_flag(handle, SWITCH_FILE_NATIVE);
+		switch_set_flag_locked(handle, SWITCH_FILE_NATIVE);
 	} else {
-		switch_clear_flag(handle, SWITCH_FILE_NATIVE);
+		switch_clear_flag_locked(handle, SWITCH_FILE_NATIVE);
 	}
 
 	return SWITCH_STATUS_SUCCESS;
diff --git a/src/mod/applications/mod_http_cache/mod_http_cache.c b/src/mod/applications/mod_http_cache/mod_http_cache.c
index 30e588a102..cd32a6ca69 100644
--- a/src/mod/applications/mod_http_cache/mod_http_cache.c
+++ b/src/mod/applications/mod_http_cache/mod_http_cache.c
@@ -1713,9 +1713,9 @@ static switch_status_t http_cache_file_open(switch_file_handle_t *handle, const
 	handle->flags |= SWITCH_FILE_NOMUX;
 
 	if (switch_test_flag((&context->fh), SWITCH_FILE_NATIVE)) {
-		switch_set_flag(handle, SWITCH_FILE_NATIVE);
+		switch_set_flag_locked(handle, SWITCH_FILE_NATIVE);
 	} else {
-		switch_clear_flag(handle, SWITCH_FILE_NATIVE);
+		switch_clear_flag_locked(handle, SWITCH_FILE_NATIVE);
 	}
 
 	return status;
diff --git a/src/mod/applications/mod_mp4v2/Makefile.am b/src/mod/applications/mod_mp4v2/Makefile.am
index 877dd45e12..f8b8bec05a 100644
--- a/src/mod/applications/mod_mp4v2/Makefile.am
+++ b/src/mod/applications/mod_mp4v2/Makefile.am
@@ -1,27 +1,8 @@
 include $(top_srcdir)/build/modmake.rulesam
 MODNAME=mod_mp4v2
 
-if HAVE_YUV
-if HAVE_VPX
-
 mod_LTLIBRARIES = mod_mp4v2.la
 mod_mp4v2_la_SOURCES  = mod_mp4v2.c
 mod_mp4v2_la_CFLAGS   = $(AM_CFLAGS)
 mod_mp4v2_la_LIBADD   = $(switch_builddir)/libfreeswitch.la
 mod_mp4v2_la_LDFLAGS  = -avoid-version -module -no-undefined -shared -lmp4v2
-
-else
-install: error
-all: error
-error:
-	$(error You must install libvpx2-dev to build mod_mp4v2)
-endif
-
-
-else
-install: error
-all: error
-error:
-	$(error You must install libyuv-dev to build mod_mp4v2)
-endif
-
diff --git a/src/mod/applications/mod_nibblebill/mod_nibblebill.c b/src/mod/applications/mod_nibblebill/mod_nibblebill.c
index a0e5bba2f8..0c8a136cbd 100644
--- a/src/mod/applications/mod_nibblebill/mod_nibblebill.c
+++ b/src/mod/applications/mod_nibblebill/mod_nibblebill.c
@@ -457,7 +457,7 @@ static switch_status_t do_billing(switch_core_session_t *session)
 	double lowbal_amt = globals.lowbal_amt;
 	double balance;
 	double minimum_charge = 0;
-	double rounding_factor = 1;
+	double rounding_factor = 0;
 	double excess = 0;
 	double rounded_billed = 0;
 	int billsecs = 0;
@@ -609,7 +609,7 @@ static switch_status_t do_billing(switch_core_session_t *session)
 			/* we're going to make an assumption that final billing is done here. So we'll see how this goes. */
 			/* round total billed up as required */
 
-			rounded_billed = ceilf((float)(nibble_data->total * rounding_factor)) / rounding_factor;
+			rounded_billed = rounding_factor > 0 ? ceilf((float)(nibble_data->total * rounding_factor)) / rounding_factor : nibble_data->total;
 
 			if (rounded_billed < minimum_charge)
 			{
diff --git a/src/mod/applications/mod_redis/credis.c b/src/mod/applications/mod_redis/credis.c
index 2ab33698ee..9b011a6c7d 100644
--- a/src/mod/applications/mod_redis/credis.c
+++ b/src/mod/applications/mod_redis/credis.c
@@ -45,6 +45,9 @@
 #include <sys/socket.h>
 #include <netdb.h>
 #include <netinet/tcp.h>
+#if defined(__FreeBSD__)
+#include <netinet/in.h>
+#endif
 #include <arpa/inet.h>
 #endif
 #include <fcntl.h>
diff --git a/src/mod/applications/mod_spandsp/Makefile.am b/src/mod/applications/mod_spandsp/Makefile.am
index aa1b65718a..ae6ab7590e 100644
--- a/src/mod/applications/mod_spandsp/Makefile.am
+++ b/src/mod/applications/mod_spandsp/Makefile.am
@@ -16,9 +16,9 @@ mod_spandsp_la_LIBADD   = $(switch_builddir)/libfreeswitch.la $(SPANDSP_LA) $(TI
 mod_spandsp_la_LDFLAGS  = -avoid-version -module -no-undefined -shared
 
 $(SPANDSP_LA): $(TIFF_LA) $(SPANDSP_DIR) $(SPANDSP_DIR)/.update
-	cd $(SPANDSP_BUILDDIR) && $(MAKE) -j1 CPPFLAGS="$(CPPFLAGS) -I$(TIFF_BUILDDIR)/libtiff -I$(TIFF_DIR)/libtiff" CFLAGS="$(CFLAGS)"
+	cd $(SPANDSP_BUILDDIR) && $(MAKE) CPPFLAGS="$(CPPFLAGS) -I$(TIFF_BUILDDIR)/libtiff -I$(TIFF_DIR)/libtiff" CFLAGS="$(CFLAGS)"
 	$(TOUCH_TARGET)
 
 $(TIFF_LA): $(TIFF_DIR) $(TIFF_DIR)/.update
-	cd $(TIFF_BUILDDIR) && $(MAKE) -j1
+	cd $(TIFF_BUILDDIR) && $(MAKE)
 	$(TOUCH_TARGET)
diff --git a/src/mod/applications/mod_voicemail/mod_voicemail.c b/src/mod/applications/mod_voicemail/mod_voicemail.c
index 18583bf181..43917d88e3 100644
--- a/src/mod/applications/mod_voicemail/mod_voicemail.c
+++ b/src/mod/applications/mod_voicemail/mod_voicemail.c
@@ -935,9 +935,9 @@ static switch_status_t control_playback(switch_core_session_t *session, void *in
 
 			if (dtmf->digit == *cc->profile->pause_key) {
 				if (switch_test_flag(fh, SWITCH_FILE_PAUSE)) {
-					switch_clear_flag(fh, SWITCH_FILE_PAUSE);
+					switch_clear_flag_locked(fh, SWITCH_FILE_PAUSE);
 				} else {
-					switch_set_flag(fh, SWITCH_FILE_PAUSE);
+					switch_set_flag_locked(fh, SWITCH_FILE_PAUSE);
 				}
 				return SWITCH_STATUS_SUCCESS;
 			}
diff --git a/src/mod/codecs/mod_amr/mod_amr.c b/src/mod/codecs/mod_amr/mod_amr.c
index ec9723781a..b8aeb1635c 100644
--- a/src/mod/codecs/mod_amr/mod_amr.c
+++ b/src/mod/codecs/mod_amr/mod_amr.c
@@ -25,6 +25,8 @@
  * 
  * Anthony Minessale II <anthm@freeswitch.org>
  * Brian K. West <brian@freeswitch.org>
+ * Dragos Oancea <dragos.oancea@athonet.com>
+ * Federico Favaro <federico.favaro@athonet.com>
  *
  * The amr codec itself is not distributed with this module.
  *
@@ -106,112 +108,90 @@ struct amr_codec_settings {
 	switch_byte_t ptime;
 	switch_byte_t channels;
 	switch_byte_t flags;
+	switch_byte_t enc_modes;
+	switch_byte_t enc_mode;
+
 };
 typedef struct amr_codec_settings amr_codec_settings_t;
 
-static amr_codec_settings_t default_codec_settings = {
-	/*.dtx_mode */ AMR_DTX_ENABLED,
-	/*.change_period */ 0,
-	/*.max_ptime */ 0,
-	/*.ptime */ 0,
-	/*.channels */ 0,
-	/*.flags */ 0,
-};
-
-
 struct amr_context {
 	void *encoder_state;
 	void *decoder_state;
 	switch_byte_t enc_modes;
 	switch_byte_t enc_mode;
+	amr_codec_settings_t codec_settings;
+	switch_byte_t flags;
+	int dtx_mode;
 };
 
 #define AMR_DEFAULT_BITRATE AMR_BITRATE_1220
 
 static struct {
 	switch_byte_t default_bitrate;
+	int debug;
 } globals;
 
-static switch_status_t switch_amr_fmtp_parse(const char *fmtp, switch_codec_fmtp_t *codec_fmtp)
+static const int switch_amr_frame_sizes[] = {12,13,15,17,19,20,26,31,5,0};
+
+#define AMR_OUT_MAX_SIZE 32
+
+static switch_bool_t switch_amr_unpack_oa(unsigned char *buf, uint8_t *tmp, int encoded_data_len)
 {
-	if (codec_fmtp) {
-		amr_codec_settings_t *codec_settings = NULL;
-		if (codec_fmtp->private_info) {
-			codec_settings = codec_fmtp->private_info;
-			memcpy(codec_settings, &default_codec_settings, sizeof(*codec_settings));
-		}
+	uint8_t *tocs;
+	int index;
+	int framesz;
 
-		if (fmtp) {
-			int x, argc;
-			char *argv[10];
-			char *fmtp_dup = strdup(fmtp);
-
-			switch_assert(fmtp_dup);
-
-			argc = switch_separate_string((char *) fmtp_dup, ';', argv, (sizeof(argv) / sizeof(argv[0])));
-			for (x = 0; x < argc; x++) {
-				char *data = argv[x];
-				char *arg;
-				switch_assert(data);
-				while (*data == ' ') {
-					data++;
-				}
-				if ((arg = strchr(data, '='))) {
-					*arg++ = '\0';
-					/*
-					   if (!strcasecmp(data, "bitrate")) {
-					   bit_rate = atoi(arg);
-					   }
-					 */
-					if (codec_settings) {
-						if (!strcasecmp(data, "octet-align")) {
-							if (atoi(arg)) {
-								switch_set_flag(codec_settings, AMR_OPT_OCTET_ALIGN);
-							}
-						} else if (!strcasecmp(data, "mode-change-neighbor")) {
-							if (atoi(arg)) {
-								switch_set_flag(codec_settings, AMR_OPT_MODE_CHANGE_NEIGHBOR);
-							}
-						} else if (!strcasecmp(data, "crc")) {
-							if (atoi(arg)) {
-								switch_set_flag(codec_settings, AMR_OPT_CRC);
-							}
-						} else if (!strcasecmp(data, "robust-sorting")) {
-							if (atoi(arg)) {
-								switch_set_flag(codec_settings, AMR_OPT_ROBUST_SORTING);
-							}
-						} else if (!strcasecmp(data, "interveaving")) {
-							if (atoi(arg)) {
-								switch_set_flag(codec_settings, AMR_OPT_INTERLEAVING);
-							}
-						} else if (!strcasecmp(data, "mode-change-period")) {
-							codec_settings->change_period = atoi(arg);
-						} else if (!strcasecmp(data, "ptime")) {
-							codec_settings->ptime = (switch_byte_t) atoi(arg);
-						} else if (!strcasecmp(data, "channels")) {
-							codec_settings->channels = (switch_byte_t) atoi(arg);
-						} else if (!strcasecmp(data, "maxptime")) {
-							codec_settings->max_ptime = (switch_byte_t) atoi(arg);
-						} else if (!strcasecmp(data, "mode-set")) {
-							int y, m_argc;
-							char *m_argv[7];
-							m_argc = switch_separate_string(arg, ',', m_argv, (sizeof(m_argv) / sizeof(m_argv[0])));
-							for (y = 0; y < m_argc; y++) {
-								codec_settings->enc_modes |= (1 << atoi(m_argv[y]));
-							}
-						} else if (!strcasecmp(data, "dtx")) {
-							codec_settings->dtx_mode = (atoi(arg)) ? AMR_DTX_ENABLED : AMR_DTX_DISABLED;
-						}
-					}
-
-				}
-			}
-			free(fmtp_dup);
-		}
-		//codec_fmtp->bits_per_second = bit_rate;
-		return SWITCH_STATUS_SUCCESS;
+	buf++; /* skip the CMR octet */
+	tocs = buf;
+	index = ((tocs[0]>>3) & 0xf); /* 4-bit frame type taken from the TOC octet */
+	buf++; /* point to the voice payload */
+	if (index > 9) { /* switch_amr_frame_sizes[] only has entries 0..9 */
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "AMR decoder (OA): Bad AMR TOC, index = %i\n", index);
+		return SWITCH_FALSE;
 	}
-	return SWITCH_STATUS_FALSE;
+	framesz = switch_amr_frame_sizes[index];
+	if (framesz > encoded_data_len - 2) { /* two octets (CMR + TOC) precede the payload */
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "AMR decoder (OA): Truncated AMR frame\n");
+		return SWITCH_FALSE;
+	}
+	tmp[0] = tocs[0]; /* the decoder input is the TOC octet followed by the payload */
+	memcpy(&tmp[1], buf, framesz);
+
+	return SWITCH_TRUE;
+}
+ 
+/* Debug helper: log the TOC fields (FT, Q, F) and the sizes of one octet-aligned AMR packet. */
+static switch_bool_t switch_amr_info(unsigned char *encoded_buf, int encoded_data_len, int payload_format, char *print_text) 
+{
+	uint8_t *tocs;
+	int framesz, index, not_last_frame, q, ft;
+
+	/* the CMR and TOC octets are both touched below, so at least two bytes are required */
+	if (!encoded_buf || encoded_data_len < 2) {
+		return SWITCH_FALSE;
+	}
+
+	/* payload format can be OA (octet-aligned) or BE (bandwidth-efficient) */
+	if (payload_format) {
+		/* OA */
+		encoded_buf++; /* skip the CMR octet */
+		tocs = encoded_buf;
+		index = (tocs[0] >> 3) & 0x0f;
+		/* switch_amr_frame_sizes[] has 10 entries; report -1 instead of reading past it */
+		framesz = (index < 10) ? switch_amr_frame_sizes[index] : -1;
+		not_last_frame = (tocs[0] >> 7) & 1;
+		q = (tocs[0] >> 2) & 1;
+		ft = index; /* frame type only: the four bits below the F bit */
+	} else {
+		/* BE mode not supported yet */
+		return SWITCH_FALSE;
+	}
+
+	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "%s (%s): FT: [0x%x] Q: [0x%x] Frame flag: [%d]\n", 
+													print_text, payload_format ? "OA":"BE", ft, q, not_last_frame);
+	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "%s (%s): AMR encoded voice payload sz: [%d] : | encoded_data_len: [%d]\n", 
+													print_text, payload_format ? "OA":"BE", framesz, encoded_data_len);
+	return SWITCH_TRUE;
 }
 
 #endif
@@ -228,7 +208,7 @@ static switch_status_t switch_amr_init(switch_codec_t *codec, switch_codec_flag_
 
 	struct amr_context *context = NULL;
 	switch_codec_fmtp_t codec_fmtp;
-	amr_codec_settings_t amr_codec_settings;
+	amr_codec_settings_t amr_codec_settings = { 0 };
 	int encoding, decoding;
 	int x, i, argc;
 	char *argv[10];
@@ -243,7 +223,57 @@ static switch_status_t switch_amr_init(switch_codec_t *codec, switch_codec_flag_
 
 		memset(&codec_fmtp, '\0', sizeof(struct switch_codec_fmtp));
 		codec_fmtp.private_info = &amr_codec_settings;
-		switch_amr_fmtp_parse(codec->fmtp_in, &codec_fmtp);
+		context->codec_settings = amr_codec_settings;
+
+		if (codec->fmtp_in) {
+			argc = switch_separate_string(codec->fmtp_in, ';', argv, (sizeof(argv) / sizeof(argv[0])));
+			for (x = 0; x < argc; x++) {
+				char *data = argv[x];
+				char *arg;
+				while (*data && *data == ' ') {
+					data++;
+				}
+				if ((arg = strchr(data, '='))) {
+					*arg++ = '\0';
+					if (!strcasecmp(data, "octet-align")) {
+						if (atoi(arg)) {
+							switch_set_flag(context, AMR_OPT_OCTET_ALIGN);
+						}
+					} else if (!strcasecmp(data, "mode-change-neighbor")) {
+						if (atoi(arg)) {
+							switch_set_flag(context, AMR_OPT_MODE_CHANGE_NEIGHBOR);
+						}
+					} else if (!strcasecmp(data, "crc")) {
+						if (atoi(arg)) {
+							switch_set_flag(context, AMR_OPT_CRC);
+						}
+					} else if (!strcasecmp(data, "robust-sorting")) {
+						if (atoi(arg)) {
+							switch_set_flag(context, AMR_OPT_ROBUST_SORTING);
+						}
+					} else if (!strcasecmp(data, "interleaving")) {
+						if (atoi(arg)) {
+							switch_set_flag(context, AMR_OPT_INTERLEAVING);
+						}
+					} else if (!strcasecmp(data, "mode-change-period")) {
+						context->codec_settings.change_period = atoi(arg);
+					} else if (!strcasecmp(data, "ptime")) {
+						context->codec_settings.ptime = (switch_byte_t) atoi(arg);
+					} else if (!strcasecmp(data, "channels")) {
+						context->codec_settings.channels = (switch_byte_t) atoi(arg);
+					} else if (!strcasecmp(data, "maxptime")) {
+						context->codec_settings.max_ptime = (switch_byte_t) atoi(arg);
+					} else if (!strcasecmp(data, "mode-set")) {
+						int y, m_argc, mode;
+						char *m_argv[8];
+						m_argc = switch_separate_string(arg, ',', m_argv, (sizeof(m_argv) / sizeof(m_argv[0])));
+						for (y = 0; y < m_argc; y++) { /* enc_modes is an 8-bit mask: skip modes outside 0..7 */
+							if ((mode = atoi(m_argv[y])) >= 0 && mode <= 7) context->enc_modes |= (switch_byte_t) (1 << mode);
+						}
+					}
+				}
+			}
+		}
 
 		if (context->enc_modes) {
 			for (i = 7; i > -1; i++) {
@@ -309,12 +339,32 @@ static switch_status_t switch_amr_encode(switch_codec_t *codec,
 	return SWITCH_STATUS_FALSE;
 #else
 	struct amr_context *context = codec->private_info;
+	int n;
+	unsigned char *shift_buf = encoded_data;
 
 	if (!context) {
 		return SWITCH_STATUS_FALSE;
 	}
 
-	*encoded_data_len = Encoder_Interface_Encode(context->encoder_state, context->enc_mode, (int16_t *) decoded_data, (switch_byte_t *) encoded_data, 0);
+	n = Encoder_Interface_Encode(context->encoder_state, context->enc_mode, (int16_t *) decoded_data, (switch_byte_t *) encoded_data + 1, 0);
+	if (n < 0) {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "AMR encoder: Encoder_Interface_Encode() ERROR!\n");
+		return SWITCH_STATUS_FALSE;
+	}
+
+	if (switch_test_flag(context, AMR_OPT_OCTET_ALIGN)) {
+		*(switch_byte_t *) encoded_data = 0xf0; /*CMR*/
+		*encoded_data_len = n + 1;
+	} else {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "AMR encoder: BE mode not supported!\n");
+		return SWITCH_STATUS_FALSE;
+	}
+
+#ifndef AMR_PASSTHROUGH
+	if (globals.debug) {
+			switch_amr_info(shift_buf, *encoded_data_len, switch_test_flag(context, AMR_OPT_OCTET_ALIGN) ? 1 : 0, "AMR encoder");
+	}
+#endif
 
 	return SWITCH_STATUS_SUCCESS;
 #endif
@@ -332,23 +382,64 @@ static switch_status_t switch_amr_decode(switch_codec_t *codec,
 	return SWITCH_STATUS_FALSE;
 #else
 	struct amr_context *context = codec->private_info;
+	unsigned char *buf = encoded_data;
+	uint8_t tmp[AMR_OUT_MAX_SIZE]; 
 
 	if (!context) {
 		return SWITCH_STATUS_FALSE;
 	}
 
-	Decoder_Interface_Decode(context->decoder_state, (unsigned char *) encoded_data, (int16_t *) decoded_data, 0);
+#ifndef AMR_PASSTHROUGH
+	if (globals.debug) {
+			switch_amr_info(buf, encoded_data_len, switch_test_flag(context, AMR_OPT_OCTET_ALIGN) ? 1 : 0, "AMR decoder");
+	}
+#endif
+	if (switch_test_flag(context, AMR_OPT_OCTET_ALIGN)) { 
+		/* Octet-aligned */
+		if (!switch_amr_unpack_oa(buf, tmp, encoded_data_len)) {
+			return SWITCH_STATUS_FALSE;
+		}
+	} else { 
+		/*"Bandwidth Efficient"*/
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "AMR decoder: BE mode not supported!\n");
+		return SWITCH_STATUS_FALSE;
+	}
+
+	Decoder_Interface_Decode(context->decoder_state, tmp, (int16_t *) decoded_data, 0);
 	*decoded_data_len = codec->implementation->decoded_bytes_per_packet;
 
 	return SWITCH_STATUS_SUCCESS;
 #endif
 }
 
+#ifndef AMR_PASSTHROUGH
+#define AMRWB_DEBUG_SYNTAX "<on|off>"
+SWITCH_STANDARD_API(mod_amr_debug)
+{
+		if (zstr(cmd)) {
+			stream->write_function(stream, "-USAGE: %s\n", AMRWB_DEBUG_SYNTAX);
+		} else {
+			if (!strcasecmp(cmd, "on")) {
+				globals.debug = 1;
+				stream->write_function(stream, "AMR Debug: on\n");
+			} else if (!strcasecmp(cmd, "off")) {
+				globals.debug = 0;
+				stream->write_function(stream, "AMR Debug: off\n");
+			} else {
+				stream->write_function(stream, "-USAGE: %s\n", AMRWB_DEBUG_SYNTAX);
+			}	
+		}
+	return SWITCH_STATUS_SUCCESS;
+}
+#endif
+
 /* Registration */
 SWITCH_MODULE_LOAD_FUNCTION(mod_amr_load)
 {
 	switch_codec_interface_t *codec_interface;
+
 #ifndef AMR_PASSTHROUGH
+	switch_api_interface_t *commands_api_interface;
 	char *cf = "amr.conf";
 	switch_xml_t cfg, xml, settings, param;
 
@@ -368,17 +459,20 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_amr_load)
 	}
 #endif
 
-	/* connect my internal structure to the blank pointer passed to me */
+/* connect my internal structure to the blank pointer passed to me */
 	*module_interface = switch_loadable_module_create_module_interface(pool, modname);
 
 	SWITCH_ADD_CODEC(codec_interface, "AMR");
 #ifndef AMR_PASSTHROUGH
-	codec_interface->parse_fmtp = switch_amr_fmtp_parse;
-#endif 
+	SWITCH_ADD_API(commands_api_interface, "amr_debug", "Set AMR Debug", mod_amr_debug, AMRWB_DEBUG_SYNTAX);
+
+	switch_console_set_complete("add amr_debug on");
+	switch_console_set_complete("add amr_debug off");
+#endif
 	switch_core_codec_add_implementation(pool, codec_interface, SWITCH_CODEC_TYPE_AUDIO,	/* enumeration defining the type of the codec */
 										 96,	/* the IANA code number */
 										 "AMR",	/* the IANA code name */
-										 "octet-align=0",	/* default fmtp to send (can be overridden by the init function) */
+										 "octet-align=1",	/* default fmtp to send (can be overridden by the init function) */
 										 8000,	/* samples transferred per second */
 										 8000,	/* actual samples transferred per second */
 										 12200,	/* bits transferred per second */
diff --git a/src/mod/codecs/mod_openh264/Makefile.am b/src/mod/codecs/mod_openh264/Makefile.am
index f7939ea368..c541e9563f 100644
--- a/src/mod/codecs/mod_openh264/Makefile.am
+++ b/src/mod/codecs/mod_openh264/Makefile.am
@@ -1,9 +1,6 @@
 include $(top_srcdir)/build/modmake.rulesam
 MODNAME=mod_openh264
 
-if HAVE_YUV
-if HAVE_VPX
-
 OPENH264_DIR=/usr/local/
 
 mod_LTLIBRARIES = mod_openh264.la
@@ -11,19 +8,3 @@ mod_openh264_la_SOURCES  = mod_openh264.cpp
 mod_openh264_la_CXXFLAGS   = $(AM_CXXFLAGS) -I$(OPENH264_DIR)/include/wels
 mod_openh264_la_LIBADD   = $(switch_builddir)/libfreeswitch.la
 mod_openh264_la_LDFLAGS  = -L$(OPENH264_DIR)/lib/ -lopenh264 -avoid-version -module -no-undefined -shared
-
-else
-install: error
-all: error
-error:
-	$(error You must install libvpx2-dev to build mod_openh264)
-endif
-
-
-else
-install: error
-all: error
-error:
-	$(error You must install libyuv-dev to build mod_openh264)
-endif
-
diff --git a/src/mod/codecs/mod_vpx/Makefile.am b/src/mod/codecs/mod_vpx/Makefile.am
deleted file mode 100644
index 780b9af018..0000000000
--- a/src/mod/codecs/mod_vpx/Makefile.am
+++ /dev/null
@@ -1,15 +0,0 @@
-include $(top_srcdir)/build/modmake.rulesam
-MODNAME=mod_vpx
-
-if HAVE_VPX
-mod_LTLIBRARIES = mod_vpx.la
-mod_vpx_la_SOURCES  = mod_vpx.c
-mod_vpx_la_LIBADD   = $(switch_builddir)/libfreeswitch.la
-mod_vpx_la_CFLAGS   = $(VPX_CFLAGS) $(SWITCH_AM_CFLAGS)
-mod_vpx_la_LDFLAGS  = $(VPX_LIBS) -avoid-version -module -no-undefined -shared
-else
-install: error
-all: error
-error:
-	$(error You must install your distros libvpx-dev to build mod_vpx)
-endif
diff --git a/src/mod/codecs/mod_vpx/README b/src/mod/codecs/mod_vpx/README
deleted file mode 100644
index d2ee1f88a8..0000000000
--- a/src/mod/codecs/mod_vpx/README
+++ /dev/null
@@ -1,9 +0,0 @@
-To build this module I used the following steps:
-
-apt-get install yasm
-git /usr/local/src/
-git clone https://chromium.googlesource.com/webm/libvpx
-cd libvpx/build/
-../configure --enable-pic --enable-shared
-make
-make install
diff --git a/src/mod/endpoints/mod_gsmopen/Makefile.am b/src/mod/endpoints/mod_gsmopen/Makefile.am
index c8ea26614d..ce31c910aa 100644
--- a/src/mod/endpoints/mod_gsmopen/Makefile.am
+++ b/src/mod/endpoints/mod_gsmopen/Makefile.am
@@ -19,9 +19,9 @@ mod_gsmopen_la_LDFLAGS  = -avoid-version -module -no-undefined -lctb-0.16 -lgsmm
 BUILT_SOURCES = $(TIFF_LA) $(SPANDSP_LA)
 
 $(SPANDSP_LA): $(TIFF_LA) $(SPANDSP_DIR) $(SPANDSP_DIR)/.update
-	cd $(SPANDSP_BUILDDIR) && $(MAKE) -j1 CPPFLAGS="$(CPPFLAGS) -I$(TIFF_BUILDDIR)/libtiff -I$(TIFF_DIR)/libtiff" CFLAGS="$(CFLAGS)"
+	cd $(SPANDSP_BUILDDIR) && $(MAKE) CPPFLAGS="$(CPPFLAGS) -I$(TIFF_BUILDDIR)/libtiff -I$(TIFF_DIR)/libtiff" CFLAGS="$(CFLAGS)"
 	$(TOUCH_TARGET)
 
 $(TIFF_LA): $(TIFF_DIR) $(TIFF_DIR)/.update
-	cd $(TIFF_BUILDDIR) && $(MAKE) -j1
+	cd $(TIFF_BUILDDIR) && $(MAKE)
 	$(TOUCH_TARGET)
diff --git a/src/mod/endpoints/mod_skypopen/Makefile.am b/src/mod/endpoints/mod_skypopen/Makefile.am
index 8f6cfdfe31..b5d1b91e4b 100644
--- a/src/mod/endpoints/mod_skypopen/Makefile.am
+++ b/src/mod/endpoints/mod_skypopen/Makefile.am
@@ -19,9 +19,9 @@ mod_skypopen_la_LDFLAGS  = -avoid-version -module -no-undefined -shared -lX11
 BUILT_SOURCES = $(TIFF_LA) $(SPANDSP_LA)
 
 $(SPANDSP_LA): $(TIFF_LA) $(SPANDSP_DIR) $(SPANDSP_DIR)/.update
-	cd $(SPANDSP_BUILDDIR) && $(MAKE) -j1 CFLAGS="$(CFLAGS)" CPPFLAGS="$(CPPFLAGS) -I$(TIFF_BUILDDIR)/libtiff -I$(TIFF_DIR)/libtiff"
+	cd $(SPANDSP_BUILDDIR) && $(MAKE) CFLAGS="$(CFLAGS)" CPPFLAGS="$(CPPFLAGS) -I$(TIFF_BUILDDIR)/libtiff -I$(TIFF_DIR)/libtiff"
 	$(TOUCH_TARGET)
 
 $(TIFF_LA): $(TIFF_DIR) $(TIFF_DIR)/.update
-	cd $(TIFF_BUILDDIR) && $(MAKE) -j1
+	cd $(TIFF_BUILDDIR) && $(MAKE)
 	$(TOUCH_TARGET)
diff --git a/src/mod/endpoints/mod_sofia/mod_sofia.c b/src/mod/endpoints/mod_sofia/mod_sofia.c
index d5d72012ad..9f5cd56af7 100644
--- a/src/mod/endpoints/mod_sofia/mod_sofia.c
+++ b/src/mod/endpoints/mod_sofia/mod_sofia.c
@@ -24,11 +24,12 @@
  * Contributor(s):
  *
  * Anthony Minessale II <anthm@freeswitch.org>
- * Ken Rice <krice at cometsig.com>
+ * Ken Rice <krice at freeswitch.org>
  * Paul D. Tinsley <pdt at jackhammer.org>
  * Bret McDanel <trixter AT 0xdecafbad.com>
  * Raymond Chandler <intralanman@freeswitch.org>
  * Emmanuel Schmidbauer <eschmidbauer@gmail.com>
+ * Kathleen King <kathleen.king@quentustech.com>
  *
  *
  * mod_sofia.c -- SOFIA SIP Endpoint
@@ -1457,6 +1458,9 @@ static switch_status_t sofia_receive_message(switch_core_session_t *session, swi
 		{
 			char *extra_headers = sofia_glue_get_extra_headers(channel, SOFIA_SIP_HEADER_PREFIX);
 
+			switch_channel_clear_flag(tech_pvt->channel, CF_MEDIA_ACK);
+			switch_channel_set_flag(tech_pvt->channel, CF_REQ_MEDIA);
+			
 			nua_invite(tech_pvt->nh, NUTAG_MEDIA_ENABLE(0),
 					   TAG_IF(msg->string_arg, SIPTAG_CONTENT_TYPE_STR("application/sdp")), 
 					   TAG_IF(msg->string_arg, SIPTAG_PAYLOAD_STR(msg->string_arg)), 
@@ -1468,6 +1472,12 @@ static switch_status_t sofia_receive_message(switch_core_session_t *session, swi
 	case SWITCH_MESSAGE_INDICATE_3P_MEDIA:
 		{
 			char *extra_headers = sofia_glue_get_extra_headers(channel, SOFIA_SIP_HEADER_PREFIX);
+
+			switch_channel_clear_flag(tech_pvt->channel, CF_MEDIA_ACK);
+			switch_channel_clear_flag(tech_pvt->channel, CF_MEDIA_SET);
+			switch_channel_set_flag(tech_pvt->channel, CF_REQ_MEDIA);
+
+
 			nua_invite(tech_pvt->nh, NUTAG_MEDIA_ENABLE(0), SIPTAG_PAYLOAD_STR(""), 
 					   TAG_IF(!zstr(extra_headers), SIPTAG_HEADER_STR(extra_headers)), TAG_END());
 
@@ -1767,8 +1777,11 @@ static switch_status_t sofia_receive_message(switch_core_session_t *session, swi
 									   TAG_IF(!zstr(tech_pvt->route_uri), NUTAG_PROXY(tech_pvt->route_uri)),
 									   TAG_IF(!zstr_buf(message), SIPTAG_HEADER_STR(message)),
 									   TAG_IF(!zstr(tech_pvt->user_via), SIPTAG_VIA_STR(tech_pvt->user_via)), TAG_END());
-						} else if ((ua && (switch_stristr("aastra", ua) && !switch_stristr("Intelligate", ua)))) {
-							snprintf(message, sizeof(message), "P-Asserted-Identity: \"%s\" <sip:%s@%s>", name, number, tech_pvt->profile->sipip);
+						} else if (ua && ((switch_stristr("aastra", ua) && !switch_stristr("Intelligate", ua)) ||
+										  (switch_stristr("cisco/spa50", ua) || switch_stristr("cisco/spa525", ua)) ||
+										  switch_stristr("Yealink", ua) ||
+										  switch_stristr("Panasonic", ua))) {
+							snprintf(message, sizeof(message), "P-Asserted-Identity: \"%s\" <sip:%s@%s>", name, number, tech_pvt->profile->printable_sipip);
 
 							sofia_set_flag_locked(tech_pvt, TFLAG_UPDATING_DISPLAY);
 							nua_update(tech_pvt->nh,
@@ -1778,28 +1791,6 @@ static switch_status_t sofia_receive_message(switch_core_session_t *session, swi
 									   TAG_IF(!zstr(tech_pvt->route_uri), NUTAG_PROXY(tech_pvt->route_uri)),
 									   TAG_IF(!zstr_buf(message), SIPTAG_HEADER_STR(message)),
 									   TAG_IF(!zstr(tech_pvt->user_via), SIPTAG_VIA_STR(tech_pvt->user_via)), TAG_END());
-						} else if ((ua && (switch_stristr("cisco/spa50", ua) || switch_stristr("cisco/spa525", ua)))) {
-							snprintf(message, sizeof(message), "P-Asserted-Identity: \"%s\" <sip:%s@%s>", name, number, tech_pvt->profile->sipip);
-
-							sofia_set_flag_locked(tech_pvt, TFLAG_UPDATING_DISPLAY);
-							nua_update(tech_pvt->nh,
-									   NUTAG_SESSION_TIMER(tech_pvt->session_timeout),
-									   NUTAG_SESSION_REFRESHER(tech_pvt->session_refresher),
-									   TAG_IF(call_info, SIPTAG_CALL_INFO_STR(call_info)),
-									   TAG_IF(!zstr(tech_pvt->route_uri), NUTAG_PROXY(tech_pvt->route_uri)),
-									   TAG_IF(!zstr_buf(message), SIPTAG_HEADER_STR(message)),
-									   TAG_IF(!zstr(tech_pvt->user_via), SIPTAG_VIA_STR(tech_pvt->user_via)), TAG_END());
-						} else if ((ua && (switch_stristr("Yealink", ua)))) {
-							snprintf(message, sizeof(message), "P-Asserted-Identity: \"%s\" <sip:%s@%s>", name, number, tech_pvt->profile->sipip);
-
-							sofia_set_flag_locked(tech_pvt, TFLAG_UPDATING_DISPLAY);
-							nua_update(tech_pvt->nh,
-									   NUTAG_SESSION_TIMER(tech_pvt->session_timeout),
-									   NUTAG_SESSION_REFRESHER(tech_pvt->session_refresher),
-						TAG_IF(call_info, SIPTAG_CALL_INFO_STR(call_info)),
-									   TAG_IF(!zstr(tech_pvt->route_uri), NUTAG_PROXY(tech_pvt->route_uri)),
-									   TAG_IF(!zstr_buf(message), SIPTAG_HEADER_STR(message)),
-									   TAG_IF(!zstr(tech_pvt->user_via), SIPTAG_VIA_STR(tech_pvt->user_via)), TAG_END());
 						}
 
 						tech_pvt->last_sent_callee_id_name = switch_core_session_strdup(tech_pvt->session, name);
@@ -3449,16 +3440,16 @@ static switch_status_t cmd_profile(char **argv, int argc, switch_stream_handle_t
 		goto done;
 	}
 
-		if (!strcasecmp(argv[1], "capture")) {
-			   if (argc > 2) {
-					   int value = switch_true(argv[2]);
-					   nua_set_params(profile->nua, TPTAG_CAPT(value ? mod_sofia_globals.capture_server : NULL), TAG_END());
-					   stream->write_function(stream, "%s sip capturing on %s", value ? "Enabled" : "Disabled", profile->name);
-			   } else {
-					   stream->write_function(stream, "Usage: sofia profile <name> capture <on/off>\n");
-			   }
-			   goto done;
+	if (!strcasecmp(argv[1], "capture")) {
+		if (argc > 2) {
+			int value = switch_true(argv[2]);
+			nua_set_params(profile->nua, TPTAG_CAPT(value ? mod_sofia_globals.capture_server : NULL), TAG_END());
+			stream->write_function(stream, "%s sip capturing on %s", value ? "Enabled" : "Disabled", profile->name);
+		} else {
+			stream->write_function(stream, "Usage: sofia profile <name> capture <on/off>\n");
 		}
+		goto done;
+	}
 
 	if (!strcasecmp(argv[1], "watchdog")) {
 		if (argc > 2) {
@@ -6003,50 +5994,29 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_sofia_load)
 
 
 	SWITCH_ADD_API(api_interface, "sofia", "Sofia Controls", sofia_function, "<cmd> <args>");
-	SWITCH_ADD_API(api_interface, "sofia_gateway_data", "Get data from a sofia gateway", sofia_gateway_data_function,
-				   "<gateway_name> [ivar|ovar|var] <name>");
-	switch_console_set_complete("add sofia help");
-	switch_console_set_complete("add sofia status");
-	switch_console_set_complete("add sofia xmlstatus");
+	SWITCH_ADD_API(api_interface, "sofia_gateway_data", "Get data from a sofia gateway", sofia_gateway_data_function, "<gateway_name> [ivar|ovar|var] <name>");
+	switch_console_set_complete("add sofia ::[help:status");
+	switch_console_set_complete("add sofia status profile ::sofia::list_profiles reg");
+	switch_console_set_complete("add sofia status gateway ::sofia::list_gateways");
 
 	switch_console_set_complete("add sofia loglevel ::[all:default:tport:iptsec:nea:nta:nth_client:nth_server:nua:soa:sresolv:stun ::[0:1:2:3:4:5:6:7:8:9");
 	switch_console_set_complete("add sofia tracelevel ::[console:alert:crit:err:warning:notice:info:debug");
 
-	switch_console_set_complete("add sofia global siptrace ::[on:off");
-	switch_console_set_complete("add sofia global standby ::[on:off");
-	switch_console_set_complete("add sofia global capture  ::[on:off");
-	switch_console_set_complete("add sofia global watchdog ::[on:off");
-
+	switch_console_set_complete("add sofia global ::[siptrace::standby::capture::watchdog ::[on:off");
 	switch_console_set_complete("add sofia global debug ::[presence:sla:none");
 
-
-	switch_console_set_complete("add sofia profile");
 	switch_console_set_complete("add sofia profile restart all");
-
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles start");
+	switch_console_set_complete("add sofia profile ::sofia::list_profiles ::[start:rescan:restart:check_sync");
 	switch_console_set_complete("add sofia profile ::sofia::list_profiles stop wait");
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles rescan");
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles restart");
+	switch_console_set_complete("add sofia profile ::sofia::list_profiles flush_inbound_reg reboot");
+	switch_console_set_complete("add sofia profile ::sofia::list_profiles ::[register:unregister all");
+	switch_console_set_complete("add sofia profile ::sofia::list_profiles ::[register:unregister:killgw ::sofia::list_profile_gateway");
+	switch_console_set_complete("add sofia profile ::sofia::list_profiles killgw _all_");
+	switch_console_set_complete("add sofia profile ::sofia::list_profiles ::[siptrace:capture:watchdog ::[on:off");
+	switch_console_set_complete("add sofia profile ::sofia::list_profiles gwlist ::[up:down");
 
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles flush_inbound_reg");
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles check_sync");
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles register ::sofia::list_profile_gateway");
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles unregister ::sofia::list_profile_gateway");
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles killgw ::sofia::list_profile_gateway");
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles siptrace on");
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles siptrace off");
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles capture on");
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles capture off");
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles watchdog on");
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles watchdog off");
+	switch_console_set_complete("add sofia recover flush");
 
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles gwlist up");
-	switch_console_set_complete("add sofia profile ::sofia::list_profiles gwlist down");
-
-	switch_console_set_complete("add sofia status profile ::sofia::list_profiles");
-	switch_console_set_complete("add sofia status profile ::sofia::list_profiles reg");
-	switch_console_set_complete("add sofia status gateway ::sofia::list_gateways");
-	switch_console_set_complete("add sofia xmlstatus profile ::sofia::list_profiles");
 	switch_console_set_complete("add sofia xmlstatus profile ::sofia::list_profiles reg");
 	switch_console_set_complete("add sofia xmlstatus gateway ::sofia::list_gateways");
 
diff --git a/src/mod/endpoints/mod_sofia/mod_sofia.h b/src/mod/endpoints/mod_sofia/mod_sofia.h
index 9038b8cf3f..582c69e534 100644
--- a/src/mod/endpoints/mod_sofia/mod_sofia.h
+++ b/src/mod/endpoints/mod_sofia/mod_sofia.h
@@ -291,6 +291,7 @@ typedef enum {
 	PFLAG_PROXY_REFER,
 	PFLAG_CHANNEL_XML_FETCH_ON_NIGHTMARE_TRANSFER,
 	PFLAG_FIRE_TRANFER_EVENTS,
+	PFLAG_BLIND_AUTH_ENFORCE_RESULT,
 
 	/* No new flags below this line */
 	PFLAG_MAX
@@ -597,6 +598,7 @@ struct sofia_profile {
 
 	char *sdp_username;
 	char *sipip;
+	char *printable_sipip;
 	char *extsipip;
 	char *url;
 	char *public_url;
diff --git a/src/mod/endpoints/mod_sofia/sofia.c b/src/mod/endpoints/mod_sofia/sofia.c
index 98067676be..c6032cbd26 100644
--- a/src/mod/endpoints/mod_sofia/sofia.c
+++ b/src/mod/endpoints/mod_sofia/sofia.c
@@ -1591,7 +1591,9 @@ static void our_sofia_event_callback(nua_event_t event,
 		sofia_handle_sip_i_info(nua, profile, nh, session, sip, de, tags);
 		break;
 	case nua_i_update:
-		sofia_update_callee_id(session, profile, sip, SWITCH_TRUE);
+		if (session) {
+			sofia_update_callee_id(session, profile, sip, SWITCH_TRUE);
+		}
 		break;
 	case nua_r_update:
 		if (session && tech_pvt && locked) {
@@ -4191,6 +4193,7 @@ switch_status_t config_sofia(sofia_config_t reload, char *profile_name)
 
 					sofia_clear_pflag(profile, PFLAG_CHANNEL_XML_FETCH_ON_NIGHTMARE_TRANSFER);
 					sofia_clear_pflag(profile, PFLAG_FIRE_TRANFER_EVENTS);
+					sofia_clear_pflag(profile, PFLAG_BLIND_AUTH_ENFORCE_RESULT);
 					profile->shutdown_type = "false";
 					profile->local_network = "localnet.auto";
 					sofia_set_flag(profile, TFLAG_ENABLE_SOA);
@@ -5469,6 +5472,12 @@ switch_status_t config_sofia(sofia_config_t reload, char *profile_name)
 						}  else {
 							sofia_clear_pflag(profile, PFLAG_FIRE_TRANFER_EVENTS);
 						}
+                                        } else if (!strcasecmp(var, "enforce-blind-auth-result")) {
+                                                if(switch_true(val)) {
+                                                        sofia_set_pflag(profile, PFLAG_BLIND_AUTH_ENFORCE_RESULT);
+                                                }  else {
+                                                        sofia_clear_pflag(profile, PFLAG_BLIND_AUTH_ENFORCE_RESULT);
+                                                }
 					}
 				}
 
@@ -6749,7 +6758,7 @@ static void sofia_handle_sip_i_state(switch_core_session_t *session, int status,
 	}
 
 	if (session) {
-		if (switch_channel_test_flag(channel, CF_ANSWERED) && (status == 180 || status == 183) && !r_sdp) {
+		if ((switch_channel_test_flag(channel, CF_ANSWERED) && (status == 180 || status == 183) && !r_sdp) || (ss_state == nua_callstate_ready && status >= 300)) {
 			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Channel %s skipping state [%s][%d]\n",
 							  switch_channel_get_name(channel), nua_callstate_name(ss_state), status);
 			goto done;
@@ -6969,7 +6978,7 @@ static void sofia_handle_sip_i_state(switch_core_session_t *session, int status,
 			}
 
 			if (switch_channel_test_flag(channel, CF_3P_NOMEDIA_REQUESTED)) {
-				switch_channel_clear_flag(channel, CF_3P_NOMEDIA_REQUESTED);
+				
 				if (switch_channel_test_flag(channel, CF_3P_NOMEDIA_REQUESTED_BLEG)) {
 					switch_core_session_t *other_session;
 					
@@ -7001,12 +7010,12 @@ static void sofia_handle_sip_i_state(switch_core_session_t *session, int status,
 					switch_channel_set_variable(channel, SWITCH_R_SDP_VARIABLE, r_sdp);
 				}
 				
+				switch_channel_clear_flag(channel, CF_3P_NOMEDIA_REQUESTED);
 				goto done;
 
 			} else if (switch_channel_test_flag(channel, CF_3P_MEDIA_REQUESTED)) {
 				uint8_t match = 0;
 				
-				switch_channel_clear_flag(channel, CF_3P_MEDIA_REQUESTED);
 				switch_channel_clear_flag(channel, CF_PROXY_MODE);
 				
 				switch_core_media_choose_port(tech_pvt->session, SWITCH_MEDIA_TYPE_AUDIO, 0);
@@ -7046,12 +7055,14 @@ static void sofia_handle_sip_i_state(switch_core_session_t *session, int status,
 							//TAG_IF(sofia_test_pflag(tech_pvt->profile, PFLAG_DISABLE_100REL), NUTAG_INCLUDE_EXTRA_SDP(1)),
 							TAG_END());
 					
+					switch_channel_clear_flag(channel, CF_3P_MEDIA_REQUESTED);
 					goto done;
 				}
 
 				switch_channel_set_variable(channel, SWITCH_ENDPOINT_DISPOSITION_VARIABLE, "NO CODECS");
 				switch_channel_hangup(channel, SWITCH_CAUSE_INCOMPATIBLE_DESTINATION);
 
+				switch_channel_clear_flag(channel, CF_3P_MEDIA_REQUESTED);
 				goto done;
 				//ss_state = nua_callstate_ready;
 				//goto state_process;
@@ -9146,7 +9157,14 @@ void sofia_handle_sip_i_reinvite(switch_core_session_t *session,
 				tech_pvt->mparams.last_sdp_str = tech_pvt->mparams.prev_sdp_str;
 			}
 		}
-		switch_channel_execute_on(channel, "execute_on_sip_reinvite");
+
+		if (switch_core_media_check_udptl_mode(session, SWITCH_MEDIA_TYPE_AUDIO)) {
+			/* Refuse all re-invites once we are doing T.38 */
+			nua_respond(nh, SIP_488_NOT_ACCEPTABLE, TAG_END());
+			switch_channel_hangup(channel, SWITCH_CAUSE_INCOMPATIBLE_DESTINATION);
+		} else {
+			switch_channel_execute_on(channel, "execute_on_sip_reinvite");
+		}
 	}
 
 }
@@ -9434,6 +9452,7 @@ void sofia_handle_sip_i_invite(switch_core_session_t *session, nua_t *nua, sofia
 
 	if (!is_auth && sofia_test_pflag(profile, PFLAG_AUTH_CALLS) && sofia_test_pflag(profile, PFLAG_BLIND_AUTH)) {
 		char *user;
+		switch_status_t blind_result = SWITCH_STATUS_FALSE;
 
 		if (!strcmp(network_ip, profile->sipip) && network_port == profile->sip_port) {
 			calling_myself++;
@@ -9441,10 +9460,11 @@ void sofia_handle_sip_i_invite(switch_core_session_t *session, nua_t *nua, sofia
 
 		if (sip && sip->sip_from) {
 			user = switch_core_session_sprintf(session, "%s@%s", sip->sip_from->a_url->url_user, sip->sip_from->a_url->url_host);
-			switch_ivr_set_user(session, user);
+			blind_result = switch_ivr_set_user(session, user);
+		}
+		if(!sofia_test_pflag(profile, PFLAG_BLIND_AUTH_ENFORCE_RESULT) || blind_result == SWITCH_STATUS_SUCCESS) {
+			is_auth++;
 		}
-
-		is_auth++;
 	}
 
 	if (!is_auth &&
diff --git a/src/mod/endpoints/mod_sofia/sofia_reg.c b/src/mod/endpoints/mod_sofia/sofia_reg.c
index 99dcf49349..c0705c9450 100644
--- a/src/mod/endpoints/mod_sofia/sofia_reg.c
+++ b/src/mod/endpoints/mod_sofia/sofia_reg.c
@@ -24,7 +24,7 @@
  * Contributor(s):
  * 
  * Anthony Minessale II <anthm@freeswitch.org>
- * Ken Rice, <krice at cometsig.com>  (work sponsored by Comet Signaling LLC, CopperCom, Inc and Asteria Solutions Group, Inc)
+ * Ken Rice, <krice at freeswitch.org>  (work sponsored by Comet Signaling LLC, CopperCom, Inc and Asteria Solutions Group, Inc)
  * Paul D. Tinsley <pdt at jackhammer.org>
  * Bret McDanel <trixter AT 0xdecafbad.com>
  * Marcel Barbulescu <marcelbarbulescu@gmail.com>
diff --git a/src/mod/endpoints/mod_verto/mcast/mcast.c b/src/mod/endpoints/mod_verto/mcast/mcast.c
index 915c831df4..f15ff6efc5 100644
--- a/src/mod/endpoints/mod_verto/mcast/mcast.c
+++ b/src/mod/endpoints/mod_verto/mcast/mcast.c
@@ -68,7 +68,7 @@ int mcast_socket_create(const char *host, int16_t port, mcast_handle_t *handle,
 		family = AF_INET6;
 	}
 	
-	if ((!(flags & MCAST_SEND) && !(flags & MCAST_RECV)) || (handle->sock = (mcast_socket_t)socket(family, SOCK_DGRAM, 0)) != mcast_sock_invalid ) {
+	if ((!(flags & MCAST_SEND) && !(flags & MCAST_RECV)) || (handle->sock = (mcast_socket_t)socket(family, SOCK_DGRAM, 0)) == mcast_sock_invalid ) {
 		return -1;
 	}
 
@@ -190,7 +190,8 @@ void mcast_socket_close(mcast_handle_t *handle)
 
 ssize_t mcast_socket_send(mcast_handle_t *handle, void *data, size_t datalen)
 {
-	if (handle->sock != mcast_sock_invalid) {
+	if (handle->sock == mcast_sock_invalid) {
+		errno = EINVAL;
 		return -1;
 	}
 
@@ -202,6 +203,7 @@ ssize_t mcast_socket_send(mcast_handle_t *handle, void *data, size_t datalen)
 	if (handle->family == AF_INET6) {
 		return sendto(handle->sock, data, (int)datalen, 0, (struct sockaddr *) &handle->send_addr6, sizeof(handle->send_addr6));
 	} else {
+		//printf("WTF %d %p %ld %s\n", handle->sock, (void *)data, datalen, inet_ntoa(handle->send_addr.sin_addr));
 		return sendto(handle->sock, data, (int)datalen, 0, (struct sockaddr *) &handle->send_addr, sizeof(handle->send_addr));
 	}
 }
@@ -210,6 +212,11 @@ ssize_t mcast_socket_recv(mcast_handle_t *handle, void *data, size_t datalen, in
 {
 	socklen_t addrlen = sizeof(handle->recv_addr);
 
+	if (handle->sock == mcast_sock_invalid) {
+		errno = EINVAL;
+		return -1;
+	}
+
 	if (data == NULL || datalen == 0) {
 		data = handle->buffer;
 		datalen = sizeof(handle->buffer);
diff --git a/src/mod/endpoints/mod_verto/mod_verto.c b/src/mod/endpoints/mod_verto/mod_verto.c
index ba1e78752f..58fc3499cd 100644
--- a/src/mod/endpoints/mod_verto/mod_verto.c
+++ b/src/mod/endpoints/mod_verto/mod_verto.c
@@ -1113,6 +1113,7 @@ static void attach_jsock(jsock_t *jsock)
 			switch_core_hash_delete(globals.jsock_hash, jsock->uuid_str);
 			ws_write_json(jp, &msg, SWITCH_TRUE);
 			cJSON_Delete(msg);
+			jp->nodelete = 1;
 			jp->drop = 1;
 		}
 	}
@@ -1126,6 +1127,10 @@ static void attach_jsock(jsock_t *jsock)
 
 static void detach_jsock(jsock_t *jsock)
 {
+	if (jsock->nodelete) {
+		return;
+	}
+
 	switch_mutex_lock(globals.jsock_mutex);
 	switch_core_hash_delete(globals.jsock_hash, jsock->uuid_str);
 	switch_mutex_unlock(globals.jsock_mutex);
@@ -3798,14 +3803,16 @@ static switch_bool_t verto__broadcast_func(const char *method, cJSON *params, js
 	if (jsock->profile->mcast_pub.sock != ws_sock_invalid) {
 		if ((json_text = cJSON_PrintUnformatted(params))) {
 
-			if ( mcast_socket_send(&jsock->profile->mcast_pub, json_text, strlen(json_text) + 1) < 0 ) {
-				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "multicast socket send error!\n");
+			if (mcast_socket_send(&jsock->profile->mcast_pub, json_text, strlen(json_text) + 1) <= 0) {
+				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "multicast socket send error! %s\n", strerror(errno));
+				r = SWITCH_FALSE;
+				cJSON_AddItemToObject(*response, "message", cJSON_CreateString("MCAST Data Send failure!"));
+			} else {
+				r = SWITCH_TRUE;
+				cJSON_AddItemToObject(*response, "message", cJSON_CreateString("MCAST Data Sent"));
 			}
-
 			free(json_text);
 			json_text = NULL;
-			r = SWITCH_TRUE;
-			cJSON_AddItemToObject(*response, "message", cJSON_CreateString("MCAST Data Sent"));
 		} else {
 			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "JSON ERROR!\n");
 		}
@@ -4481,6 +4488,10 @@ static switch_status_t parse_config(const char *cf)
 
 			profile->local_network = "localnet.auto";
 
+			profile->mcast_sub.sock = ws_sock_invalid;
+			profile->mcast_pub.sock = ws_sock_invalid;
+
+
 			for (param = switch_xml_child(xprofile, "param"); param; param = param->next) {
 				char *var = NULL;
 				char *val = NULL;
diff --git a/src/mod/endpoints/mod_verto/mod_verto.h b/src/mod/endpoints/mod_verto/mod_verto.h
index eccd095369..b144e70a29 100644
--- a/src/mod/endpoints/mod_verto/mod_verto.h
+++ b/src/mod/endpoints/mod_verto/mod_verto.h
@@ -112,7 +112,8 @@ struct jsock_s {
 	struct passwd pw;
 #endif
 
-	int drop;
+	uint8_t drop;
+	uint8_t nodelete;
 	ws_socket_t local_sock;
 	SSL *ssl;
 
diff --git a/src/mod/endpoints/mod_verto/ws.h b/src/mod/endpoints/mod_verto/ws.h
index 1d020d0b33..56522a23df 100644
--- a/src/mod/endpoints/mod_verto/ws.h
+++ b/src/mod/endpoints/mod_verto/ws.h
@@ -25,7 +25,7 @@
 //#include "sha1.h"
 #include <openssl/ssl.h>
 
-#if defined(_MSC_VER) || defined(__APPLE__) 
+#if defined(_MSC_VER) || defined(__APPLE__) || defined(__FreeBSD__) || (defined(__SVR4) && defined(__sun)) 
 #define __bswap_64(x) \
   x = (x>>56) | \
     ((x<<40) & 0x00FF000000000000) | \
diff --git a/src/mod/event_handlers/mod_amqp/Makefile.am b/src/mod/event_handlers/mod_amqp/Makefile.am
index 3a7ffd0f30..7bb93927b8 100644
--- a/src/mod/event_handlers/mod_amqp/Makefile.am
+++ b/src/mod/event_handlers/mod_amqp/Makefile.am
@@ -4,7 +4,7 @@ MODNAME=mod_amqp
 if HAVE_AMQP
 
 mod_LTLIBRARIES = mod_amqp.la
-mod_amqp_la_SOURCES  = mod_amqp_utils.c mod_amqp_connection.c mod_amqp_producer.c mod_amqp_command.c mod_amqp.c
+mod_amqp_la_SOURCES  = mod_amqp_utils.c mod_amqp_connection.c mod_amqp_producer.c mod_amqp_command.c mod_amqp_logging.c mod_amqp.c
 mod_amqp_la_CFLAGS   = $(AM_CFLAGS) $(AMQP_CFLAGS)
 mod_amqp_la_LIBADD   = $(switch_builddir)/libfreeswitch.la
 mod_amqp_la_LDFLAGS  = -avoid-version -module -no-undefined -shared $(AMQP_LIBS) $(SWITCH_AM_LDFLAGS)
diff --git a/src/mod/event_handlers/mod_amqp/mod_amqp.c b/src/mod/event_handlers/mod_amqp/mod_amqp.c
index a481064975..ce82d739e8 100644
--- a/src/mod/event_handlers/mod_amqp/mod_amqp.c
+++ b/src/mod/event_handlers/mod_amqp/mod_amqp.c
@@ -62,6 +62,7 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_amqp_load)
 	globals.pool = pool;
 	switch_core_hash_init(&(globals.producer_hash));
 	switch_core_hash_init(&(globals.command_hash));
+	switch_core_hash_init(&(globals.logging_hash));
 
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_apqp loading: Version %s\n", switch_version_full());
 
@@ -72,6 +73,8 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_amqp_load)
 
 	SWITCH_ADD_API(api_interface, "amqp", "amqp API", amqp_reload, "syntax");
 
+	switch_log_bind_logger(mod_amqp_logging_recv, SWITCH_LOG_DEBUG, SWITCH_FALSE);
+	
 	return SWITCH_STATUS_SUCCESS;
 }
 
@@ -84,6 +87,7 @@ SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_amqp_shutdown)
 	switch_hash_index_t *hi;
 	mod_amqp_producer_profile_t *producer;
 	mod_amqp_command_profile_t *command;
+	mod_amqp_logging_profile_t *logging;
 
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Mod starting shutting down\n");
 	switch_event_unbind_callback(mod_amqp_producer_event_handler);
@@ -98,6 +102,13 @@ SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_amqp_shutdown)
 		mod_amqp_command_destroy(&command);
 	}
 
+	switch_log_unbind_logger(mod_amqp_logging_recv);
+
+	while ((hi = switch_core_hash_first(globals.logging_hash))) {
+		switch_core_hash_this(hi, NULL, NULL, (void **)&logging);
+		mod_amqp_logging_destroy(&logging);
+	}
+
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Mod finished shutting down\n");
 	return SWITCH_STATUS_SUCCESS;
 }
diff --git a/src/mod/event_handlers/mod_amqp/mod_amqp.h b/src/mod/event_handlers/mod_amqp/mod_amqp.h
index f82a8c5a9c..238236a3b4 100644
--- a/src/mod/event_handlers/mod_amqp/mod_amqp.h
+++ b/src/mod/event_handlers/mod_amqp/mod_amqp.h
@@ -146,17 +146,46 @@ typedef struct {
   char *custom_attr;
 } mod_amqp_command_profile_t;
 
+typedef struct {
+  char *name; /* profile name; also the key used in globals.logging_hash */
+  
+  char *exchange; /* exchange that log messages are published to */
+  char *exchange_type; /* exchange type passed to amqp_exchange_declare() */
+  int exchange_durable; /* durable flag passed to amqp_exchange_declare() */
+  int exchange_auto_delete; /* NOTE(review): not referenced by the logging code visible here */
+
+  uint32_t log_level_mask; /* log levels this profile accepts, tested with switch_log_check_mask() */
+
+  /* Configured connections: conn_root heads the list (linked via ->next), conn_active is the one currently open or NULL. */
+  mod_amqp_connection_t *conn_root;
+  mod_amqp_connection_t *conn_active;
+  
+  int reconnect_interval_ms; /* sleep between reconnect attempts in the logging thread */
+
+  /* Logging thread */
+  switch_thread_t *logging_thread;
+  switch_queue_t *send_queue; /* bounded queue of mod_amqp_message_t* waiting to be published */
+  unsigned int send_queue_size; /* capacity used when creating send_queue */
+
+  switch_mutex_t *mutex; /* NOTE(review): never initialised or locked by the logging code visible here */
+  switch_bool_t running; /* cleared by mod_amqp_logging_destroy() to stop the logging thread */
+  char *custom_attr; /* handed to mod_amqp_connection_open(); NOTE(review): never assigned in mod_amqp_logging_create() */
+  switch_memory_pool_t *pool; /* per-profile pool; the profile itself is allocated from it */
+} mod_amqp_logging_profile_t;
+
 struct {
   switch_memory_pool_t *pool;
   
   switch_hash_t *producer_hash;
   switch_hash_t *command_hash;
+  switch_hash_t *logging_hash;
 } globals;
 
 /* utils */
 switch_status_t mod_amqp_do_config(switch_bool_t reload);
 int mod_amqp_log_if_amqp_error(amqp_rpc_reply_t x, char const *context);
 int mod_amqp_count_chars(const char* string, char ch);
+void mod_amqp_util_msg_destroy(mod_amqp_message_t **msg);
 
 /* connection */
 switch_status_t mod_amqp_connection_create(mod_amqp_connection_t **conn, switch_xml_t cfg, switch_memory_pool_t *pool);
@@ -179,5 +208,11 @@ void * SWITCH_THREAD_FUNC mod_amqp_producer_thread(switch_thread_t *thread, void
 
 char *amqp_util_encode(char *key, char *dest);
 
+/* logging */
+switch_status_t mod_amqp_logging_recv(const switch_log_node_t *node, switch_log_level_t level);
+switch_status_t mod_amqp_logging_create(char *name, switch_xml_t cfg);
+switch_status_t mod_amqp_logging_destroy(mod_amqp_logging_profile_t **prof);
+void * SWITCH_THREAD_FUNC mod_amqp_logging_thread(switch_thread_t *thread, void *data);
+
 #endif /* MOD_AMQP_H */
 
diff --git a/src/mod/event_handlers/mod_amqp/mod_amqp_logging.c b/src/mod/event_handlers/mod_amqp/mod_amqp_logging.c
new file mode 100644
index 0000000000..08427b8083
--- /dev/null
+++ b/src/mod/event_handlers/mod_amqp/mod_amqp_logging.c
@@ -0,0 +1,412 @@
+/*
+* FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
+* Copyright (C) 2005-2012, Anthony Minessale II <anthm@freeswitch.org>
+*
+* Version: MPL 1.1
+*
+* The contents of this file are subject to the Mozilla Public License Version
+* 1.1 (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+* http://www.mozilla.org/MPL/
+*
+* Software distributed under the License is distributed on an "AS IS" basis,
+* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+* for the specific language governing rights and limitations under the
+* License.
+*
+* The Original Code is FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
+*
+* The Initial Developer of the Original Code is
+* Anthony Minessale II <anthm@freeswitch.org>
+* Portions created by the Initial Developer are Copyright (C)
+* the Initial Developer. All Rights Reserved.
+*
+* Based on mod_skel by
+* Anthony Minessale II <anthm@freeswitch.org>
+*
+* Contributor(s):
+*
+* Daniel Bryars <danb@aeriandi.com>
+* Tim Brown <tim.brown@aeriandi.com>
+* Anthony Minessale II <anthm@freeswitch.org>
+* William King <william.king@quentustech.com>
+* Mike Jerris <mike@jerris.com>
+*
+* mod_amqp_logging.c -- Sends FreeSWITCH log messages to an AMQP broker
+*
+*/
+
+#include "mod_amqp.h"
+
+switch_status_t mod_amqp_logging_recv(const switch_log_node_t *node, switch_log_level_t level)
+{
+	switch_hash_index_t *hi = NULL;
+	mod_amqp_message_t *msg = NULL;
+	mod_amqp_logging_profile_t *logging = NULL;
+	char *json = NULL;
+
+	if (!strcmp(node->file, "mod_amqp_logging.c")) {
+		return SWITCH_STATUS_SUCCESS; /* skip log lines emitted by this file itself (presumably to avoid feeding back into this hook) */
+	}
+
+	/* Log hook: fan one log node out to every logging profile whose level mask accepts it.
+	  1. Loop through the logging hash of profiles and check each profile's log level mask.
+	  2. Render the node to JSON once, lazily, the first time a profile accepts it.
+	  3. Queue a private copy (JSON text + routing key) on that profile's send queue.
+	  4. Free the shared JSON text after all profiles were visited. Always returns SWITCH_STATUS_SUCCESS.
+	*/
+	for (hi = switch_core_hash_first(globals.logging_hash); hi; hi = switch_core_hash_next(&hi)) {
+		switch_core_hash_this(hi, NULL, NULL, (void **)&logging);
+
+		if ( logging && switch_log_check_mask(logging->log_level_mask, level) ) {
+			char file[128] = {0};
+			if ( !json ) {
+				cJSON *body = NULL;
+				char date[80] = "";
+				switch_time_exp_t tm;
+
+				switch_time_exp_lt(&tm, node->timestamp);
+				switch_snprintf(date, sizeof(date), "%0.4d-%0.2d-%0.2d %0.2d:%0.2d:%0.2d.%0.6d",
+								tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, tm.tm_usec);
+
+				/* Create cJSON body */
+				body = cJSON_CreateObject();
+
+				cJSON_AddItemToObject(body, "file", cJSON_CreateString((const char *) node->file));
+				cJSON_AddItemToObject(body, "function", cJSON_CreateString((const char *) node->func));
+				cJSON_AddItemToObject(body, "line", cJSON_CreateNumber((double) node->line));
+				cJSON_AddItemToObject(body, "level", cJSON_CreateString(switch_log_level2str(node->level)));
+				cJSON_AddItemToObject(body, "timestamp", cJSON_CreateString((const char *)date));
+				cJSON_AddItemToObject(body, "timestamp_epoch", cJSON_CreateNumber((double) node->timestamp / 1000000));
+				cJSON_AddItemToObject(body, "content", cJSON_CreateString(node->content ));
+
+				json = cJSON_Print(body);
+				cJSON_Delete(body);
+			}
+
+			/* Create message */
+			switch_malloc(msg, sizeof(mod_amqp_message_t));
+			msg->pjson = strdup(json);
+			switch_snprintf(file, sizeof(file), "%s", node->file); /* bounded copy: a long file name must not overflow file[] */
+			switch_replace_char(file, '.', '_', 0);
+
+			snprintf(msg->routing_key, sizeof(msg->routing_key), "%s.%s.%s.%s", switch_core_get_hostname(), node->userdata, switch_log_level2str(node->level), file);
+
+			if (switch_queue_trypush(logging->send_queue, msg) != SWITCH_STATUS_SUCCESS) {
+				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "AMQP logging message queue full. Messages will be dropped!\n");
+				mod_amqp_util_msg_destroy(&msg); /* the queue did not take the message, so free it here and try the remaining profiles */
+			}
+		}
+	}
+
+	/* json is shared by all profiles above (each message holds its own strdup), release it on every exit path */
+	switch_safe_free(json);
+	return SWITCH_STATUS_SUCCESS;
+}
+
+switch_status_t mod_amqp_logging_destroy(mod_amqp_logging_profile_t **prof)
+{
+	mod_amqp_message_t *msg = NULL;
+	switch_status_t status = SWITCH_STATUS_SUCCESS;
+	mod_amqp_connection_t *conn = NULL, *conn_next = NULL;
+	switch_memory_pool_t *pool;
+	mod_amqp_logging_profile_t *profile;
+	/* Stop and free a logging profile and set *prof to NULL; a NULL prof or *prof is a no-op. Always returns SWITCH_STATUS_SUCCESS. */
+	if (!prof || !*prof) {
+		return SWITCH_STATUS_SUCCESS;
+	}
+
+	profile = *prof;
+	pool = profile->pool;
+	/* Remove the profile from globals.logging_hash before tearing anything down */
+	if (profile->name) {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Profile[%s] shutting down...\n", profile->name);
+		switch_core_hash_delete(globals.logging_hash, profile->name);
+	}
+	/* The logging thread loops while profile->running is set; clear it, then wait for the thread to exit */
+	profile->running = 0;
+
+	if (profile->logging_thread) {
+		switch_thread_join(&status, profile->logging_thread);
+	}
+
+	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Profile[%s] closing AMQP socket...\n", profile->name);
+	/* Destroy every configured connection; the next pointer is saved before the node is destroyed */
+	for (conn = profile->conn_root; conn; conn = conn_next) {
+		conn_next = conn->next;
+		mod_amqp_connection_destroy(&conn);
+	}
+
+	profile->conn_active = NULL;
+	profile->conn_root = NULL;
+	/* Free any messages that were queued but never published */
+	while (profile->send_queue && switch_queue_trypop(profile->send_queue, (void**)&msg) == SWITCH_STATUS_SUCCESS) {
+		mod_amqp_util_msg_destroy(&msg);
+	}
+	/* mod_amqp_logging_create() allocates the profile from this pool, so profile must not be touched after this point */
+	if (pool) {
+		switch_core_destroy_memory_pool(&pool);
+	}
+
+	*prof = NULL;
+
+	return SWITCH_STATUS_SUCCESS;
+}
+
+switch_status_t mod_amqp_logging_create(char *name, switch_xml_t cfg)
+{
+	mod_amqp_logging_profile_t *profile = NULL;
+	switch_xml_t params, param, connections, connection;
+	switch_threadattr_t *thd_attr = NULL;
+	char *exchange = NULL, *exchange_type = NULL;
+	int exchange_durable = 1; /* durable */
+	switch_memory_pool_t *pool;
+	/* Build logging profile `name` from its XML node `cfg`, connect, start the sender thread and register it in globals.logging_hash. */
+	if (switch_core_new_memory_pool(&pool) != SWITCH_STATUS_SUCCESS) {
+		goto err;
+	}
+
+	profile = switch_core_alloc(pool, sizeof(mod_amqp_logging_profile_t));
+	profile->pool = pool;
+	profile->name = switch_core_strdup(profile->pool, name);
+	profile->running = 1;
+	profile->conn_root   = NULL;
+	profile->conn_active = NULL;
+	profile->log_level_mask = 0;
+	profile->send_queue_size = 5000;
+	profile->reconnect_interval_ms = 1000; /* default; with 0 the reconnect loop in the logging thread would sleep 0ms and spin */
+	if ((params = switch_xml_child(cfg, "params")) != NULL) {
+		for (param = switch_xml_child(params, "param"); param; param = param->next) {
+			char *var = (char *) switch_xml_attr_soft(param, "name");
+			char *val = (char *) switch_xml_attr_soft(param, "value");
+
+			if (!var) {
+				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Profile[%s] param missing 'name' attribute\n", profile->name);
+				continue;
+			}
+
+			if (!val) {
+				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Profile[%s] param[%s] missing 'value' attribute\n", profile->name, var);
+				continue;
+			}
+
+			if (!strncmp(var, "reconnect_interval_ms", 21)) {
+				int interval = atoi(val);
+				if ( interval && interval > 0 ) {
+					profile->reconnect_interval_ms = interval;
+				}
+			} else if (!strncmp(var, "send_queue_size", 15)) {
+				int interval = atoi(val);
+				if ( interval && interval > 0 ) {
+					profile->send_queue_size = interval;
+				}
+			} else if (!strncmp(var, "exchange-type", 13)) {
+				exchange_type = switch_core_strdup(profile->pool, val);
+			} else if (!strncmp(var, "exchange-name", 13)) {
+				exchange = switch_core_strdup(profile->pool, val);
+			} else if (!strncmp(var, "exchange-durable", 16)) {
+				exchange_durable = switch_true(val);
+			} else if (!strncmp(var, "log-levels", 10)) {
+				profile->log_level_mask = switch_log_str2mask(val);
+			}
+		} /* params for loop */
+	}
+
+	/* Handle defaults of string types */
+	profile->exchange = exchange ? exchange : switch_core_strdup(profile->pool, "TAP.Events");
+	profile->exchange_type = exchange_type ? exchange_type : switch_core_strdup(profile->pool, "topic");
+	profile->exchange_durable = exchange_durable;
+
+	if ((connections = switch_xml_child(cfg, "connections")) != NULL) {
+		for (connection = switch_xml_child(connections, "connection"); connection; connection = connection->next) {
+			if ( ! profile->conn_root ) { /* Handle first root node */
+				if (mod_amqp_connection_create(&(profile->conn_root), connection, profile->pool) != SWITCH_STATUS_SUCCESS) {
+					/* Handle connection create failure */
+					switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Profile[%s] failed to create connection\n", profile->name);
+					continue;
+				}
+				profile->conn_active = profile->conn_root;
+			} else {
+				if (mod_amqp_connection_create(&(profile->conn_active->next), connection, profile->pool) != SWITCH_STATUS_SUCCESS) {
+					/* Handle connection create failure */
+					switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Profile[%s] failed to create connection\n", profile->name);
+					continue;
+				}
+				profile->conn_active = profile->conn_active->next;
+			}
+		}
+	}
+	profile->conn_active = NULL; /* conn_active was only the list tail while building; the real active connection is chosen below */
+
+	if ( mod_amqp_connection_open(profile->conn_root, &(profile->conn_active), profile->name, profile->custom_attr) != SWITCH_STATUS_SUCCESS) {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Profile[%s] was unable to connect to any connection\n", profile->name);
+		goto err;
+	}
+
+	amqp_exchange_declare(profile->conn_active->state, 1,
+						  amqp_cstring_bytes(profile->exchange),
+						  amqp_cstring_bytes(profile->exchange_type),
+						  0, /* passive */
+						  profile->exchange_durable,
+						  amqp_empty_table);
+
+	if (mod_amqp_log_if_amqp_error(amqp_get_rpc_reply(profile->conn_active->state), "Declaring exchange")) {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Profile[%s] failed to create exchange\n", profile->name);
+		goto err;
+	}
+
+	/* Create a bounded FIFO queue for sending messages */
+	if (switch_queue_create(&(profile->send_queue), profile->send_queue_size, profile->pool) != SWITCH_STATUS_SUCCESS) {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Cannot create send queue of size %d!\n", profile->send_queue_size);
+		goto err;
+	}
+
+	/* Start the log sender thread; the connection is already open, the thread reconnects whenever conn_active is NULL */
+	switch_threadattr_create(&thd_attr, profile->pool);
+	switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE);
+	if (switch_thread_create(&profile->logging_thread, thd_attr, mod_amqp_logging_thread, profile, profile->pool)) {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Cannot create 'amqp event sender' thread!\n");
+		goto err;
+	}
+
+	if ( switch_core_hash_insert(globals.logging_hash, name, (void *) profile) != SWITCH_STATUS_SUCCESS) {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Failed to insert new profile [%s] into mod_amqp profile hash\n", name);
+		goto err;
+	}
+	/* A successful start is informational, not an error */
+	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Profile[%s] Successfully started\n", profile->name);
+	return SWITCH_STATUS_SUCCESS;
+
+ err:
+	/* Cleanup: destroy whatever was built so far (no-op when profile is still NULL) and report SWITCH_STATUS_GENERR */
+	mod_amqp_logging_destroy(&profile);
+	return SWITCH_STATUS_GENERR;
+
+}
+
+/* This should only be called in a single threaded context from the logging profile send thread */
+switch_status_t mod_amqp_logging_send(mod_amqp_logging_profile_t *profile, mod_amqp_message_t *msg)
+{
+	amqp_basic_properties_t props;
+	int status;
+	/* Publish msg->pjson to profile->exchange under msg->routing_key on channel 1; msg is never freed here. */
+	if (! profile->conn_active) {
+		/* No connection, so we can not send the message. */
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Profile[%s] not active\n", profile->name);
+		return SWITCH_STATUS_NOT_INITALIZED;
+	}
+	memset(&props, 0, sizeof(amqp_basic_properties_t));
+
+	props._flags = AMQP_BASIC_CONTENT_TYPE_FLAG;
+	props.content_type = amqp_cstring_bytes("application/json");
+
+	status = amqp_basic_publish(
+								profile->conn_active->state,
+								1,
+								amqp_cstring_bytes(profile->exchange),
+								amqp_cstring_bytes(msg->routing_key),
+								0,
+								0,
+								&props,
+								amqp_cstring_bytes(msg->pjson));
+
+	if (status < 0) {
+		const char *errstr = amqp_error_string2(status); /* takes the negative status as returned; negating it yields the wrong text */
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "Profile[%s] failed to send event on connection[%s]: %s\n",
+						  profile->name, profile->conn_active->name, errstr);
+
+		/* This is bad, we couldn't send the message. Clear up any connection */
+		mod_amqp_connection_close(profile->conn_active);
+		profile->conn_active = NULL;
+		return SWITCH_STATUS_SOCKERR;
+	}
+
+	return SWITCH_STATUS_SUCCESS;
+}
+
+
+
+void * SWITCH_THREAD_FUNC mod_amqp_logging_thread(switch_thread_t *thread, void *data)
+{
+  mod_amqp_message_t *msg = NULL;
+  switch_status_t status = SWITCH_STATUS_SUCCESS;
+  mod_amqp_logging_profile_t *profile = (mod_amqp_logging_profile_t *)data;
+  amqp_boolean_t passive = 0;
+  amqp_boolean_t durable = profile->exchange_durable; /* must match the declare in mod_amqp_logging_create() */
+  /* Sender loop: (re)connect when needed, then pop queued log messages and publish them until profile->running is cleared. */
+  while (profile->running) {
+    if (!profile->conn_active) {
+      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Amqp no connection- reconnecting...\n");
+
+      status = mod_amqp_connection_open(profile->conn_root, &(profile->conn_active), profile->name, profile->custom_attr);
+      if ( status == SWITCH_STATUS_SUCCESS ) {
+	// Ensure that the exchange exists, and is of the correct type
+	amqp_exchange_declare(profile->conn_active->state, 1,
+			      amqp_cstring_bytes(profile->exchange),
+			      amqp_cstring_bytes(profile->exchange_type),
+			      passive,
+			      durable,
+			      amqp_empty_table);
+
+	if (!mod_amqp_log_if_amqp_error(amqp_get_rpc_reply(profile->conn_active->state), "Declaring exchange")) {
+	  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Amqp reconnect successful- connected\n");
+	  continue;
+	}
+      }
+
+      switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Profile[%s] failed to connect with code(%d), sleeping for %dms\n",
+			profile->name, status, profile->reconnect_interval_ms);
+      switch_sleep(profile->reconnect_interval_ms * 1000);
+      continue;
+    }
+    /* Only pop a new message when no unsent one is pending; the 1s timeout lets the loop re-check profile->running */
+    if (!msg && switch_queue_pop_timeout(profile->send_queue, (void**)&msg, 1000000) != SWITCH_STATUS_SUCCESS) {
+      continue;
+    }
+
+    if (msg) {
+      switch (mod_amqp_logging_send(profile, msg)) {
+      case SWITCH_STATUS_SUCCESS:
+	/* Success: prepare for next message */
+	mod_amqp_util_msg_destroy(&msg);
+	break;
+
+      case SWITCH_STATUS_NOT_INITALIZED:
+	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Send failed with 'not initialised'\n");
+	break;
+
+      case SWITCH_STATUS_SOCKERR:
+	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Send failed with 'socket error'\n");
+	break;
+
+      default:
+	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Send failed with a generic error\n");
+
+	/* On any failure msg is kept so it is retried; a closed connection is reopened at the top of the loop.
+	 * NB: do we need a delay here to prevent a fast reconnect-send-fail loop? */
+	break;
+      }
+    }
+  }
+
+  /* Abort the current message */
+  mod_amqp_util_msg_destroy(&msg);
+
+  // Terminate the thread
+  switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Event sender thread stopped\n");
+  switch_thread_exit(thread, SWITCH_STATUS_SUCCESS);
+  return NULL;
+}
+
+
+
+/* For Emacs:
+ * Local Variables:
+ * mode:c
+ * indent-tabs-mode:t
+ * tab-width:4
+ * c-basic-offset:4
+ * End:
+ * For VIM:
+ * vim:set softtabstop=4 shiftwidth=4 tabstop=4
+ */
diff --git a/src/mod/event_handlers/mod_amqp/mod_amqp_producer.c b/src/mod/event_handlers/mod_amqp/mod_amqp_producer.c
index 980cad8a05..39e6d5e5e1 100644
--- a/src/mod/event_handlers/mod_amqp/mod_amqp_producer.c
+++ b/src/mod/event_handlers/mod_amqp/mod_amqp_producer.c
@@ -38,13 +38,6 @@
 
 #include "mod_amqp.h"
 
-void mod_amqp_producer_msg_destroy(mod_amqp_message_t **msg)
-{
-	if (!msg || !*msg) return;
-	switch_safe_free((*msg)->pjson);
-	switch_safe_free(*msg);
-}
-
 switch_status_t mod_amqp_producer_routing_key(mod_amqp_producer_profile_t *profile, char routingKey[MAX_AMQP_ROUTING_KEY_LENGTH],
 											  switch_event_t* evt, mod_amqp_keypart_t routingKeyEventHeaderNames[])
 {
@@ -115,7 +108,7 @@ void mod_amqp_producer_event_handler(switch_event_t* evt)
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "AMQP message queue full. Messages will be dropped for %.1fs! (Queue capacity %d)",
 						  profile->circuit_breaker_ms / 1000.0, queue_size);
 
-		mod_amqp_producer_msg_destroy(&amqp_message);
+		mod_amqp_util_msg_destroy(&amqp_message);
 	}
 }
 
@@ -155,7 +148,7 @@ switch_status_t mod_amqp_producer_destroy(mod_amqp_producer_profile_t **prof) {
 	profile->conn_root = NULL;
 
 	while (profile->send_queue && switch_queue_trypop(profile->send_queue, (void**)&msg) == SWITCH_STATUS_SUCCESS) {
-		mod_amqp_producer_msg_destroy(&msg);
+		mod_amqp_util_msg_destroy(&msg);
 	}
 
 	if (pool) {
@@ -497,7 +490,7 @@ void * SWITCH_THREAD_FUNC mod_amqp_producer_thread(switch_thread_t *thread, void
 			switch (mod_amqp_producer_send(profile, msg)) {
 			case SWITCH_STATUS_SUCCESS:
 				/* Success: prepare for next message */
-				mod_amqp_producer_msg_destroy(&msg);
+				mod_amqp_util_msg_destroy(&msg);
 				break;
 
 			case SWITCH_STATUS_NOT_INITALIZED:
@@ -541,7 +534,7 @@ void * SWITCH_THREAD_FUNC mod_amqp_producer_thread(switch_thread_t *thread, void
 	}
 
 	/* Abort the current message */
-	mod_amqp_producer_msg_destroy(&msg);
+	mod_amqp_util_msg_destroy(&msg);
 
 	// Terminate the thread
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Event sender thread stopped\n");
diff --git a/src/mod/event_handlers/mod_amqp/mod_amqp_utils.c b/src/mod/event_handlers/mod_amqp/mod_amqp_utils.c
index 5c59cba1f7..eda879d310 100644
--- a/src/mod/event_handlers/mod_amqp/mod_amqp_utils.c
+++ b/src/mod/event_handlers/mod_amqp/mod_amqp_utils.c
@@ -144,6 +144,30 @@ switch_status_t mod_amqp_do_config(switch_bool_t reload)
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Unable to locate commands section for mod_amqp\n" );
 	}
 
+	if ((profiles = switch_xml_child(cfg, "logging"))) {
+		if ((profile = switch_xml_child(profiles, "profile"))) {
+			for (; profile; profile = profile->next)	{
+				char *name = (char *) switch_xml_attr_soft(profile, "name");
+
+				if (zstr(name)) {
+					switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Failed to load mod_amqp profile. Check configs missing name attr\n");
+					continue;
+				}
+				name = switch_core_strdup(globals.pool, name);
+
+				if ( mod_amqp_logging_create(name, profile) != SWITCH_STATUS_SUCCESS) {
+					switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Failed to load mod_amqp profile [%s]. Check configs\n", name);
+				} else {
+					switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Loaded mod_amqp profile [%s] successfully\n", name);
+				}
+			}
+		} else {
+			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Unable to locate a profile for mod_amqp\n" );
+		}
+	} else {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Unable to locate logging section for mod_amqp\n" );
+	}
+
 	return SWITCH_STATUS_SUCCESS;
 }
 
@@ -183,6 +207,14 @@ char *amqp_util_encode(char *key, char *dest) {
 	return dest;
 }
 
+/* Release an AMQP message and its JSON payload; a NULL msg or NULL *msg is ignored. */
+void mod_amqp_util_msg_destroy(mod_amqp_message_t **msg)
+{
+	if (msg && *msg) {
+		switch_safe_free((*msg)->pjson);
+		switch_safe_free(*msg);
+	}
+}
 
 /* For Emacs:
  * Local Variables:
diff --git a/src/mod/event_handlers/mod_erlang_event/mod_erlang_event.c b/src/mod/event_handlers/mod_erlang_event/mod_erlang_event.c
index f3e8e7cd87..2196cadfc7 100644
--- a/src/mod/event_handlers/mod_erlang_event/mod_erlang_event.c
+++ b/src/mod/event_handlers/mod_erlang_event/mod_erlang_event.c
@@ -382,6 +382,7 @@ static void remove_session_elem_from_listener(listener_t *listener, session_elem
 static void destroy_session_elem(session_elem_t *session_element)
 {
 	switch_core_session_t *session;
+	void *pop;
 
 	/* wait for readers */
 	switch_thread_rwlock_wrlock(session_element->rwlock);
@@ -395,6 +396,13 @@ static void destroy_session_elem(session_elem_t *session_element)
 		switch_core_session_soft_unlock(session);
 		switch_core_session_rwunlock(session);
 	}
+
+	while (switch_queue_trypop(session_element->event_queue, &pop) == SWITCH_STATUS_SUCCESS) {
+		switch_event_t *event = (switch_event_t *) pop;
+		switch_event_destroy(&event);
+	}
+
+	switch_core_hash_destroy(&session_element->event_hash);
 	switch_core_destroy_memory_pool(&session_element->pool);
 }
 
diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_utils.c b/src/mod/event_handlers/mod_kazoo/kazoo_utils.c
index f23182ff44..1284118daf 100644
--- a/src/mod/event_handlers/mod_kazoo/kazoo_utils.c
+++ b/src/mod/event_handlers/mod_kazoo/kazoo_utils.c
@@ -369,6 +369,9 @@ switch_hash_t *create_default_filter() {
 	switch_core_hash_insert(filter, "Caller-Callee-ID-Number", "1");
 	switch_core_hash_insert(filter, "Caller-Caller-ID-Name", "1");
 	switch_core_hash_insert(filter, "Caller-Caller-ID-Number", "1");
+	switch_core_hash_insert(filter, "Caller-Screen-Bit", "1");
+	switch_core_hash_insert(filter, "Caller-Privacy-Hide-Name", "1");
+	switch_core_hash_insert(filter, "Caller-Privacy-Hide-Number", "1");
 	switch_core_hash_insert(filter, "Caller-Context", "1");
 	switch_core_hash_insert(filter, "Caller-Controls", "1");
 	switch_core_hash_insert(filter, "Caller-Destination-Number", "1");
@@ -514,6 +517,7 @@ switch_hash_t *create_default_filter() {
 	switch_core_hash_insert(filter, "variable_sip_user_agent", "1");
 	switch_core_hash_insert(filter, "variable_duration", "1");
 	switch_core_hash_insert(filter, "variable_billsec", "1");
+	switch_core_hash_insert(filter, "variable_billmsec", "1");
 	switch_core_hash_insert(filter, "variable_progresssec", "1");
 	switch_core_hash_insert(filter, "variable_progress_uepoch", "1");
 	switch_core_hash_insert(filter, "variable_progress_media_uepoch", "1");
@@ -568,36 +572,40 @@ switch_hash_t *create_default_filter() {
 	switch_core_hash_insert(filter, "variable_fax_doc_database", "1");
 
 	/* Secure headers */
-	/*
-	  switch_core_hash_insert(filter, "variable_sdp_secure_savp_only", "1");
-	  switch_core_hash_insert(filter, "variable_rtp_has_crypto", "1");
-	  switch_core_hash_insert(filter, "variable_rtp_secure_media", "1");
-	  switch_core_hash_insert(filter, "variable_rtp_secure_media_confirmed", "1");
-	  switch_core_hash_insert(filter, "variable_rtp_secure_media_confirmed_audio", "1");
-	  switch_core_hash_insert(filter, "variable_rtp_secure_media_confirmed_video", "1");
-	  switch_core_hash_insert(filter, "variable_zrtp_secure_media", "1");
-	  switch_core_hash_insert(filter, "variable_zrtp_secure_media_confirmed", "1");
-	  switch_core_hash_insert(filter, "variable_zrtp_secure_media_confirmed_audio", "1");
-	  switch_core_hash_insert(filter, "variable_zrtp_secure_media_confirmed_video", "1");
-	  switch_core_hash_insert(filter, "sdp_secure_savp_only", "1");
-	  switch_core_hash_insert(filter, "rtp_has_crypto", "1");
-	  switch_core_hash_insert(filter, "rtp_secure_media", "1");
-	  switch_core_hash_insert(filter, "rtp_secure_media_confirmed", "1");
-	  switch_core_hash_insert(filter, "rtp_secure_media_confirmed_audio", "1");
-	  switch_core_hash_insert(filter, "rtp_secure_media_confirmed_video", "1");
-	  switch_core_hash_insert(filter, "zrtp_secure_media", "1");
-	  switch_core_hash_insert(filter, "zrtp_secure_media_confirmed", "1");
-	  switch_core_hash_insert(filter, "zrtp_secure_media_confirmed_audio", "1");
-	  switch_core_hash_insert(filter, "zrtp_secure_media_confirmed_video", "1");
-	*/
+	switch_core_hash_insert(filter, "variable_sdp_secure_savp_only", "1");
+	switch_core_hash_insert(filter, "variable_rtp_has_crypto", "1");
+	switch_core_hash_insert(filter, "variable_rtp_secure_media", "1");
+	switch_core_hash_insert(filter, "variable_rtp_secure_media_confirmed", "1");
+	switch_core_hash_insert(filter, "variable_rtp_secure_media_confirmed_audio", "1");
+	switch_core_hash_insert(filter, "variable_rtp_secure_media_confirmed_video", "1");
+	switch_core_hash_insert(filter, "variable_zrtp_secure_media", "1");
+	switch_core_hash_insert(filter, "variable_zrtp_secure_media_confirmed", "1");
+	switch_core_hash_insert(filter, "variable_zrtp_secure_media_confirmed_audio", "1");
+	switch_core_hash_insert(filter, "variable_zrtp_secure_media_confirmed_video", "1");
+	switch_core_hash_insert(filter, "sdp_secure_savp_only", "1");
+	switch_core_hash_insert(filter, "rtp_has_crypto", "1");
+	switch_core_hash_insert(filter, "rtp_secure_media", "1");
+	switch_core_hash_insert(filter, "rtp_secure_media_confirmed", "1");
+	switch_core_hash_insert(filter, "rtp_secure_media_confirmed_audio", "1");
+	switch_core_hash_insert(filter, "rtp_secure_media_confirmed_video", "1");
+	switch_core_hash_insert(filter, "zrtp_secure_media", "1");
+	switch_core_hash_insert(filter, "zrtp_secure_media_confirmed", "1");
+	switch_core_hash_insert(filter, "zrtp_secure_media_confirmed_audio", "1");
+	switch_core_hash_insert(filter, "zrtp_secure_media_confirmed_video", "1");
 
 	/* Device Redirect headers */
-	/*
-	  switch_core_hash_insert(filter, "variable_last_bridge_hangup_cause", "1");
-	  switch_core_hash_insert(filter, "variable_sip_redirected_by", "1");
-	*/
-
+	switch_core_hash_insert(filter, "variable_last_bridge_hangup_cause", "1");
+	switch_core_hash_insert(filter, "variable_sip_redirected_by", "1");
 	switch_core_hash_insert(filter, "intercepted_by", "1");
+	switch_core_hash_insert(filter, "variable_bridge_uuid", "1");
+	switch_core_hash_insert(filter, "Record-File-Path", "1");
+
+	/* Loopback headers */
+	switch_core_hash_insert(filter, "variable_loopback_bowout_on_execute", "1");
+	switch_core_hash_insert(filter, "variable_loopback_bowout", "1");
+	switch_core_hash_insert(filter, "variable_other_loopback_leg_uuid", "1");
+	switch_core_hash_insert(filter, "variable_loopback_leg", "1");
+	switch_core_hash_insert(filter, "variable_is_loopback", "1");
 
 	// SMS
 	switch_core_hash_insert(filter, "Message-ID", "1");
diff --git a/src/mod/event_handlers/mod_rayo/rayo_output_component.c b/src/mod/event_handlers/mod_rayo/rayo_output_component.c
index 932b3aed47..32f2c7d754 100644
--- a/src/mod/event_handlers/mod_rayo/rayo_output_component.c
+++ b/src/mod/event_handlers/mod_rayo/rayo_output_component.c
@@ -1,6 +1,6 @@
 /*
  * mod_rayo for FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
- * Copyright (C) 2013-2015, Grasshopper
+ * Copyright (C) 2013-2016, Grasshopper
  *
  * Version: MPL 1.1
  *
@@ -245,15 +245,29 @@ static iks *start_mixer_output_component(struct rayo_actor *mixer, struct rayo_m
 static iks *stop_output_component(struct rayo_actor *component, struct rayo_message *msg, void *data)
 {
 	iks *iq = msg->payload;
+	iks *result = NULL;
+	switch_core_session_t *session = NULL;
 	switch_stream_handle_t stream = { 0 };
 	char *command = switch_mprintf("%s stop", RAYO_JID(component));
 	SWITCH_STANDARD_STREAM(stream);
 	OUTPUT_COMPONENT(component)->stop = 1;
+	if (!strcmp(RAYO_ACTOR(component)->type, RAT_CALL_COMPONENT)) {
+		session = (switch_core_session_t *)data;
+	}
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s stopping\n", RAYO_JID(component));
 	switch_api_execute("fileman", command, NULL, &stream);
+	if (!zstr((char *)stream.data) && !strncmp((char *)stream.data, "+OK", 3)) {
+		result = iks_new_iq_result(iq);
+	} else if (session && switch_channel_get_state(switch_core_session_get_channel(session)) >= CS_HANGUP) {
+		result = iks_new_error_detailed(iq, STANZA_ERROR_UNEXPECTED_REQUEST, "call has ended");
+	} else if (!zstr((char *)stream.data)) {
+		result = iks_new_error_detailed_printf(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR, "%s", (char *)stream.data);
+	} else {
+		result = iks_new_error(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR);
+	}
 	switch_safe_free(stream.data);
 	switch_safe_free(command);
-	return iks_new_iq_result(iq);
+	return result;
 }
 
 /**
@@ -262,14 +276,28 @@ static iks *stop_output_component(struct rayo_actor *component, struct rayo_mess
 static iks *pause_output_component(struct rayo_actor *component, struct rayo_message *msg, void *data)
 {
 	iks *iq = msg->payload;
+	iks *result = NULL;
+	switch_core_session_t *session = NULL;
 	switch_stream_handle_t stream = { 0 };
 	char *command = switch_mprintf("%s pause", RAYO_JID(component));
 	SWITCH_STANDARD_STREAM(stream);
+	if (!strcmp(RAYO_ACTOR(component)->type, RAT_CALL_COMPONENT)) {
+		session = (switch_core_session_t *)data;
+	}
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s pausing\n", RAYO_JID(component));
 	switch_api_execute("fileman", command, NULL, &stream);
+	if (!zstr((char *)stream.data) && !strncmp((char *)stream.data, "+OK", 3)) {
+		result = iks_new_iq_result(iq);
+	} else if (session && switch_channel_get_state(switch_core_session_get_channel(session)) >= CS_HANGUP) {
+		result = iks_new_error_detailed(iq, STANZA_ERROR_UNEXPECTED_REQUEST, "call has ended");
+	} else if (!zstr((char *)stream.data)) {
+		result = iks_new_error_detailed_printf(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR, "%s", (char *)stream.data);
+	} else {
+		result = iks_new_error(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR);
+	}
 	switch_safe_free(stream.data);
 	switch_safe_free(command);
-	return iks_new_iq_result(iq);
+	return result;
 }
 
 /**
@@ -278,14 +306,28 @@ static iks *pause_output_component(struct rayo_actor *component, struct rayo_mes
 static iks *resume_output_component(struct rayo_actor *component, struct rayo_message *msg, void *data)
 {
 	iks *iq = msg->payload;
+	iks *result = NULL;
+	switch_core_session_t *session = NULL;
 	switch_stream_handle_t stream = { 0 };
 	char *command = switch_mprintf("%s resume", RAYO_JID(component));
 	SWITCH_STANDARD_STREAM(stream);
+	if (!strcmp(RAYO_ACTOR(component)->type, RAT_CALL_COMPONENT)) {
+		session = (switch_core_session_t *)data;
+	}
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s resuming\n", RAYO_JID(component));
 	switch_api_execute("fileman", command, NULL, &stream);
+	if (!zstr((char *)stream.data) && !strncmp((char *)stream.data, "+OK", 3)) {
+		result = iks_new_iq_result(iq);
+	} else if (session && switch_channel_get_state(switch_core_session_get_channel(session)) >= CS_HANGUP) {
+		result = iks_new_error_detailed(iq, STANZA_ERROR_UNEXPECTED_REQUEST, "call has ended");
+	} else if (!zstr((char *)stream.data)) {
+		result = iks_new_error_detailed_printf(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR, "%s", (char *)stream.data);
+	} else {
+		result = iks_new_error(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR);
+	}
 	switch_safe_free(stream.data);
 	switch_safe_free(command);
-	return iks_new_iq_result(iq);
+	return result;
 }
 
 /**
@@ -294,14 +336,28 @@ static iks *resume_output_component(struct rayo_actor *component, struct rayo_me
 static iks *speed_up_output_component(struct rayo_actor *component, struct rayo_message *msg, void *data)
 {
 	iks *iq = msg->payload;
+	iks *result = NULL;
+	switch_core_session_t *session = NULL;
 	switch_stream_handle_t stream = { 0 };
 	char *command = switch_mprintf("%s speed:+", RAYO_JID(component));
 	SWITCH_STANDARD_STREAM(stream);
+	if (!strcmp(RAYO_ACTOR(component)->type, RAT_CALL_COMPONENT)) {
+		session = (switch_core_session_t *)data;
+	}
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s speeding up\n", RAYO_JID(component));
 	switch_api_execute("fileman", command, NULL, &stream);
+	if (!zstr((char *)stream.data) && !strncmp((char *)stream.data, "+OK", 3)) {
+		result = iks_new_iq_result(iq);
+	} else if (session && switch_channel_get_state(switch_core_session_get_channel(session)) >= CS_HANGUP) {
+		result = iks_new_error_detailed(iq, STANZA_ERROR_UNEXPECTED_REQUEST, "call has ended");
+	} else if (!zstr((char *)stream.data)) {
+		result = iks_new_error_detailed_printf(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR, "%s", (char *)stream.data);
+	} else {
+		result = iks_new_error(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR);
+	}
 	switch_safe_free(stream.data);
 	switch_safe_free(command);
-	return iks_new_iq_result(iq);
+	return result;
 }
 
 /**
@@ -310,14 +366,28 @@ static iks *speed_up_output_component(struct rayo_actor *component, struct rayo_
 static iks *speed_down_output_component(struct rayo_actor *component, struct rayo_message *msg, void *data)
 {
 	iks *iq = msg->payload;
+	iks *result = NULL;
+	switch_core_session_t *session = NULL;
 	switch_stream_handle_t stream = { 0 };
 	char *command = switch_mprintf("%s speed:-", RAYO_JID(component));
 	SWITCH_STANDARD_STREAM(stream);
+	if (!strcmp(RAYO_ACTOR(component)->type, RAT_CALL_COMPONENT)) {
+		session = (switch_core_session_t *)data;
+	}
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s slowing down\n", RAYO_JID(component));
 	switch_api_execute("fileman", command, NULL, &stream);
+	if (!zstr((char *)stream.data) && !strncmp((char *)stream.data, "+OK", 3)) {
+		result = iks_new_iq_result(iq);
+	} else if (session && switch_channel_get_state(switch_core_session_get_channel(session)) >= CS_HANGUP) {
+		result = iks_new_error_detailed(iq, STANZA_ERROR_UNEXPECTED_REQUEST, "call has ended");
+	} else if (!zstr((char *)stream.data)) {
+		result = iks_new_error_detailed_printf(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR, "%s", (char *)stream.data);
+	} else {
+		result = iks_new_error(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR);
+	}
 	switch_safe_free(stream.data);
 	switch_safe_free(command);
-	return iks_new_iq_result(iq);
+	return result;
 }
 
 /**
@@ -326,14 +396,28 @@ static iks *speed_down_output_component(struct rayo_actor *component, struct ray
 static iks *volume_up_output_component(struct rayo_actor *component, struct rayo_message *msg, void *data)
 {
 	iks *iq = msg->payload;
+	iks *result = NULL;
+	switch_core_session_t *session = NULL;
 	switch_stream_handle_t stream = { 0 };
 	char *command = switch_mprintf("%s volume:+", RAYO_JID(component));
 	SWITCH_STANDARD_STREAM(stream);
+	if (!strcmp(RAYO_ACTOR(component)->type, RAT_CALL_COMPONENT)) {
+		session = (switch_core_session_t *)data;
+	}
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s increasing volume\n", RAYO_JID(component));
 	switch_api_execute("fileman", command, NULL, &stream);
+	if (!zstr((char *)stream.data) && !strncmp((char *)stream.data, "+OK", 3)) {
+		result = iks_new_iq_result(iq);
+	} else if (session && switch_channel_get_state(switch_core_session_get_channel(session)) >= CS_HANGUP) {
+		result = iks_new_error_detailed(iq, STANZA_ERROR_UNEXPECTED_REQUEST, "call has ended");
+	} else if (!zstr((char *)stream.data)) {
+		result = iks_new_error_detailed_printf(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR, "%s", (char *)stream.data);
+	} else {
+		result = iks_new_error(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR);
+	}
 	switch_safe_free(stream.data);
 	switch_safe_free(command);
-	return iks_new_iq_result(iq);
+	return result;
 }
 
 /**
@@ -342,14 +426,28 @@ static iks *volume_up_output_component(struct rayo_actor *component, struct rayo
 static iks *volume_down_output_component(struct rayo_actor *component, struct rayo_message *msg, void *data)
 {
 	iks *iq = msg->payload;
+	iks *result = NULL;
+	switch_core_session_t *session = NULL;
 	switch_stream_handle_t stream = { 0 };
 	char *command = switch_mprintf("%s volume:-", RAYO_JID(component));
 	SWITCH_STANDARD_STREAM(stream);
+	if (!strcmp(RAYO_ACTOR(component)->type, RAT_CALL_COMPONENT)) {
+		session = (switch_core_session_t *)data;
+	}
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "%s lowering volume\n", RAYO_JID(component));
 	switch_api_execute("fileman", command, NULL, &stream);
+	if (!zstr((char *)stream.data) && !strncmp((char *)stream.data, "+OK", 3)) {
+		result = iks_new_iq_result(iq);
+	} else if (session && switch_channel_get_state(switch_core_session_get_channel(session)) >= CS_HANGUP) {
+		result = iks_new_error_detailed(iq, STANZA_ERROR_UNEXPECTED_REQUEST, "call has ended");
+	} else if (!zstr((char *)stream.data)) {
+		result = iks_new_error_detailed_printf(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR, "%s", (char *)stream.data);
+	} else {
+		result = iks_new_error(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR);
+	}
 	switch_safe_free(stream.data);
 	switch_safe_free(command);
-	return iks_new_iq_result(iq);
+	return result;
 }
 
 /**
@@ -361,19 +459,32 @@ static iks *seek_output_component(struct rayo_actor *component, struct rayo_mess
 	iks *seek = iks_find(iq, "seek");
 
 	if (VALIDATE_RAYO_OUTPUT_SEEK(seek)) {
+		iks *result = NULL;
+		switch_core_session_t *session = NULL;
 		int is_forward = !strcmp("forward", iks_find_attrib(seek, "direction"));
 		int amount_ms = iks_find_int_attrib(seek, "amount");
 		char *command = switch_mprintf("%s seek:%s%i", RAYO_JID(component),
 			is_forward ? "+" : "-", amount_ms);
 		switch_stream_handle_t stream = { 0 };
 		SWITCH_STANDARD_STREAM(stream);
+		if (!strcmp(RAYO_ACTOR(component)->type, RAT_CALL_COMPONENT)) {
+			session = (switch_core_session_t *)data;
+		}
 
 		switch_api_execute("fileman", command, NULL, &stream);
-
+		if (!zstr((char *)stream.data) && !strncmp((char *)stream.data, "+OK", 3)) {
+			result = iks_new_iq_result(iq);
+		} else if (session && switch_channel_get_state(switch_core_session_get_channel(session)) >= CS_HANGUP) {
+			result = iks_new_error_detailed(iq, STANZA_ERROR_UNEXPECTED_REQUEST, "call has ended");
+		} else if (!zstr((char *)stream.data)) {
+			result = iks_new_error_detailed_printf(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR, "%s", (char *)stream.data);
+		} else {
+			result = iks_new_error(iq, STANZA_ERROR_INTERNAL_SERVER_ERROR);
+		}
 		switch_safe_free(stream.data);
 		switch_safe_free(command);
 
-		return iks_new_iq_result(iq);
+		return result;
 	}
 	return iks_new_error(iq, STANZA_ERROR_BAD_REQUEST);
 }
@@ -515,9 +626,9 @@ static switch_status_t next_file(switch_file_handle_t *handle)
 	handle->interval = context->fh.interval;
 
 	if (switch_test_flag((&context->fh), SWITCH_FILE_NATIVE)) {
-		switch_set_flag(handle, SWITCH_FILE_NATIVE);
+		switch_set_flag_locked(handle, SWITCH_FILE_NATIVE);
 	} else {
-		switch_clear_flag(handle, SWITCH_FILE_NATIVE);
+		switch_clear_flag_locked(handle, SWITCH_FILE_NATIVE);
 	}
 
 	return SWITCH_STATUS_SUCCESS;
@@ -571,27 +682,27 @@ static switch_status_t rayo_file_close(switch_file_handle_t *handle)
 		struct output_component *output = OUTPUT_COMPONENT(context->component);
 
 		/* send completion and destroy */
-		if (output->stop) {
+		if (!strcmp(RAYO_ACTOR(context->component)->type, RAT_CALL_COMPONENT)) {
+			/* call output... check for hangup */
+			switch_core_session_t *session = switch_core_session_locate(RAYO_ACTOR(context->component)->parent->id);
+			if (session) {
+				if (switch_channel_get_state(switch_core_session_get_channel(session)) >= CS_HANGUP) {
+					rayo_component_send_complete(context->component, COMPONENT_COMPLETE_HANGUP);
+				} else if (output->stop) {
+					rayo_component_send_complete(context->component, COMPONENT_COMPLETE_STOP);
+				} else {
+					rayo_component_send_complete(context->component, OUTPUT_FINISH);
+				}
+				switch_core_session_rwunlock(session);
+			} else {
+				/* session is gone */
+				rayo_component_send_complete(context->component, COMPONENT_COMPLETE_HANGUP);
+			}
+		} else if (output->stop) {
 			rayo_component_send_complete(context->component, COMPONENT_COMPLETE_STOP);
 		} else {
-			if (!strcmp(RAYO_ACTOR(context->component)->type, RAT_CALL_COMPONENT)) {
-				/* call output... check for hangup */
-				switch_core_session_t *session = switch_core_session_locate(RAYO_ACTOR(context->component)->parent->id);
-				if (session) {
-					if (switch_channel_get_state(switch_core_session_get_channel(session)) >= CS_HANGUP) {
-						rayo_component_send_complete(context->component, COMPONENT_COMPLETE_HANGUP);
-					} else {
-						rayo_component_send_complete(context->component, OUTPUT_FINISH);
-					}
-					switch_core_session_rwunlock(session);
-				} else {
-					/* session is gone */
-					rayo_component_send_complete(context->component, COMPONENT_COMPLETE_HANGUP);
-				}
-			} else {
-				/* mixer output... finished */
-				rayo_component_send_complete(context->component, OUTPUT_FINISH);
-			}
+			/* mixer output... finished */
+			rayo_component_send_complete(context->component, OUTPUT_FINISH);
 		}
 		/* TODO timed out */
 
@@ -693,6 +804,8 @@ struct fileman_file_context {
 	const char *uuid;
 	/** fileman control ID */
 	const char *id;
+	/** done flag */
+	int done;
 };
 
 /**
@@ -765,13 +878,13 @@ static switch_status_t fileman_file_open(switch_file_handle_t *handle, const cha
 	handle->interval = context->fh.interval;
 
 	if (switch_test_flag((&context->fh), SWITCH_FILE_NATIVE)) {
-		switch_set_flag(handle, SWITCH_FILE_NATIVE);
+		switch_set_flag_locked(handle, SWITCH_FILE_NATIVE);
 	} else {
-		switch_clear_flag(handle, SWITCH_FILE_NATIVE);
+		switch_clear_flag_locked(handle, SWITCH_FILE_NATIVE);
 	}
 
 	if (handle->params && switch_true(switch_event_get_header(handle->params, "pause"))) {
-		switch_set_flag(handle, SWITCH_FILE_PAUSE);
+		switch_set_flag_locked(handle, SWITCH_FILE_PAUSE);
 	}
 
 	if (handle->seekable && start_offset_ms) {
@@ -861,7 +974,11 @@ static switch_status_t fileman_file_read(switch_file_handle_t *handle, void *dat
 		int do_speed = 1;
 		size_t read_bytes = 0;
 
-		if (switch_test_flag(handle, SWITCH_FILE_PAUSE)) {
+		if (context->done) {
+			/* done with this file */
+			status = SWITCH_STATUS_FALSE;
+			goto done;
+		} else if (switch_test_flag(handle, SWITCH_FILE_PAUSE)) {
 			//switch_log_printf(SWITCH_CHANNEL_UUID_LOG(context->uuid), SWITCH_LOG_DEBUG, "Read pause frame\n");
 			memset(context->abuf, 255, *len * 2);
 			do_speed = 0;
@@ -954,7 +1071,7 @@ static switch_status_t fileman_file_read(switch_file_handle_t *handle, void *dat
 		if (switch_test_flag(fh, SWITCH_FILE_SEEK)) {
 			/* file position has changed flush the buffer */
 			switch_buffer_zero(fh->audio_buffer);
-			switch_clear_flag(fh, SWITCH_FILE_SEEK);
+			switch_clear_flag_locked(fh, SWITCH_FILE_SEEK);
 		}
 
 		/* generate speed frames */
@@ -1099,14 +1216,16 @@ static switch_status_t fileman_process_cmd(const char *cmd, switch_file_handle_t
 
 			return SWITCH_STATUS_FALSE;
 		} else if (!strcasecmp(cmd, "pause")) {
-			switch_set_flag(fhp, SWITCH_FILE_PAUSE);
+			switch_set_flag_locked(fhp, SWITCH_FILE_PAUSE);
 			return SWITCH_STATUS_SUCCESS;
 		} else if (!strcasecmp(cmd, "resume")) {
-			switch_clear_flag(fhp, SWITCH_FILE_PAUSE);
+			switch_clear_flag_locked(fhp, SWITCH_FILE_PAUSE);
 			return SWITCH_STATUS_SUCCESS;
 		} else if (!strcasecmp(cmd, "stop")) {
-			switch_set_flag(fhp, SWITCH_FILE_DONE);
-			return SWITCH_STATUS_FALSE;
+			switch_log_printf(SWITCH_CHANNEL_UUID_LOG(context->uuid), SWITCH_LOG_DEBUG, "Stopping file\n");
+			context->done = 1;
+			switch_set_flag_locked(fhp, SWITCH_FILE_DONE);
+			return SWITCH_STATUS_SUCCESS;
 		} else if (!strcasecmp(cmd, "truncate")) {
 			switch_core_file_truncate(fhp, 0);
 		} else if (!strcasecmp(cmd, "restart")) {
@@ -1175,12 +1294,17 @@ SWITCH_STANDARD_API(fileman_api)
 			switch_mutex_lock(fileman_globals.mutex);
 			fh = (switch_file_handle_t *)switch_core_hash_find(fileman_globals.hash, id);
 			if (fh) {
-				fileman_process_cmd(cmd, fh);
+				if (fileman_process_cmd(cmd, fh) == SWITCH_STATUS_SUCCESS) { /* command accepted by the file handle */
+					stream->write_function(stream, "+OK\n");
+				} else { /* command rejected: log which file it was for, then report the failure */
+					switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "fileman API failed for file %s\n", zstr(fh->file_path) ? "<null>" : fh->file_path);
+					stream->write_function(stream, "-ERR API call failed\n"); /* newline-terminated like the other replies here */
+				}
 				switch_mutex_unlock(fileman_globals.mutex);
-				stream->write_function(stream, "+OK\n");
 			} else {
+				switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "fileman API failed for ID %s\n", zstr(id) ? "<null>" : id);
 				switch_mutex_unlock(fileman_globals.mutex);
-				stream->write_function(stream, "-ERR No file handle!\n");
+				stream->write_function(stream, "-ERR file handle not found\n");
 			}
 			goto done;
 		}
diff --git a/src/mod/event_handlers/mod_smpp/mod_smpp.c b/src/mod/event_handlers/mod_smpp/mod_smpp.c
index d6b76ca74f..c18761b158 100644
--- a/src/mod/event_handlers/mod_smpp/mod_smpp.c
+++ b/src/mod/event_handlers/mod_smpp/mod_smpp.c
@@ -138,8 +138,8 @@ SWITCH_STANDARD_API(mod_smpp_send_api)
 		switch_goto_status(SWITCH_STATUS_GENERR, done);
 	}
 	
-	switch_event_add_header_string(message, SWITCH_STACK_BOTTOM, "destination_addr", argv[1]);
-	switch_event_add_header_string(message, SWITCH_STACK_BOTTOM, "source_addr", argv[2]);
+	switch_event_add_header_string(message, SWITCH_STACK_BOTTOM, "to_user", argv[1]);
+	switch_event_add_header_string(message, SWITCH_STACK_BOTTOM, "from_user", argv[2]);
 	switch_event_set_body(message, argv[3]);
 
 	if (mod_smpp_gateway_send_message(gateway, message) != SWITCH_STATUS_SUCCESS) {
diff --git a/src/mod/formats/mod_imagick/Makefile.am b/src/mod/formats/mod_imagick/Makefile.am
index c960d33d78..19270b6003 100644
--- a/src/mod/formats/mod_imagick/Makefile.am
+++ b/src/mod/formats/mod_imagick/Makefile.am
@@ -1,8 +1,6 @@
 include $(top_srcdir)/build/modmake.rulesam
 MODNAME=mod_imagick
 
-if HAVE_YUV
-if HAVE_VPX
 if HAVE_MAGICK
 
 mod_LTLIBRARIES = mod_imagick.la
@@ -17,19 +15,3 @@ all: error
 error:
 	$(error You must install libmagickcore-dev to build mod_imagick)
 endif
-
-else
-install: error
-all: error
-error:
-	$(error You must install libvpx2-dev to build mod_imagick)
-endif
-
-
-else
-install: error
-all: error
-error:
-	$(error You must install libyuv-dev to build mod_imagick)
-endif
-
diff --git a/src/mod/formats/mod_imagick/mod_imagick.c b/src/mod/formats/mod_imagick/mod_imagick.c
index 4c04b91cdf..060889f16a 100644
--- a/src/mod/formats/mod_imagick/mod_imagick.c
+++ b/src/mod/formats/mod_imagick/mod_imagick.c
@@ -34,8 +34,6 @@
 
 
 #include <switch.h>
-#include <libyuv.h>
-
 
 #if defined(__clang__)
 /* the imagemagick header files are very badly broken on clang.  They really should be fixing this, in the mean time, this dirty hack works */
@@ -263,12 +261,7 @@ static switch_status_t read_page(pdf_file_context_t *context)
 			return SWITCH_STATUS_FALSE;
 		}
 
-		RAWToI420(storage, w * 3,
-			context->img->planes[0], context->img->stride[0],
-			context->img->planes[1], context->img->stride[1],
-			context->img->planes[2], context->img->stride[2],
-			context->img->d_w, context->img->d_h);
-
+		switch_img_from_raw(context->img, storage, SWITCH_IMG_FMT_BGR24, w, h);
 		free(storage);
 	} else {
 		switch_image_t *img = switch_img_alloc(NULL, SWITCH_IMG_FMT_ARGB, image->columns, image->rows, 0);
diff --git a/src/mod/formats/mod_local_stream/mod_local_stream.c b/src/mod/formats/mod_local_stream/mod_local_stream.c
index 68fc8eb8ed..b314132f95 100644
--- a/src/mod/formats/mod_local_stream/mod_local_stream.c
+++ b/src/mod/formats/mod_local_stream/mod_local_stream.c
@@ -68,10 +68,13 @@ struct local_stream_context {
 	int sent_png;
 	int last_w;
 	int last_h;
+	int newres;
 	int serno;
 	int pop_count;
+	switch_size_t blank;
 	switch_image_t *banner_img;
 	switch_time_t banner_timeout;
+	switch_memory_pool_t *pool;
 	struct local_stream_context *next;
 };
 
@@ -111,13 +114,37 @@ struct local_stream_source {
 	switch_queue_t *video_q;
 	int has_video;
 	switch_image_t *blank_img;
+	switch_image_t *logo_img;
 	switch_image_t *cover_art;
 	char *banner_txt;
 	int serno;
+	switch_size_t abuflen;
+	switch_byte_t *abuf;
+	switch_timer_t timer;
+	int logo_always;
+	switch_img_position_t logo_pos;
+	uint8_t logo_opacity;
+	uint8_t text_opacity;
 };
 
 typedef struct local_stream_source local_stream_source_t;
 
+/* Find the stream source stored under `path` in globals.source_hash (under globals.mutex).
+ * Returns the source after a successful tryrdlock on its rwlock, or NULL when it is
+ * missing, stopped, the module is not RUNNING, or the read lock could not be taken. */
+local_stream_source_t *get_source(const char *path)
+{
+	local_stream_source_t *found;
+
+	switch_mutex_lock(globals.mutex);
+	found = switch_core_hash_find(globals.source_hash, path);
+	if (found && (!RUNNING || found->stopped || switch_thread_rwlock_tryrdlock(found->rwlock) != SWITCH_STATUS_SUCCESS)) {
+		found = NULL;
+	}
+	switch_mutex_unlock(globals.mutex);
+	return found;
+}
+
 switch_status_t list_streams_full(const char *line, const char *cursor, switch_console_callback_match_t **matches, switch_bool_t show_aliases)
 {
 	local_stream_source_t *source;
@@ -155,39 +182,42 @@ switch_status_t list_streams(const char *line, const char *cursor, switch_consol
 
 static int do_rand(uint32_t count)
 {
-	double r;
-	int index;
+	int r = 0;
 
-	if (count < 3) return 0;
+	if (count == 0) return 0;
 
-	r = ((double) rand() / ((double) (RAND_MAX) + (double) (1)));
-	index = (int) (r * count) + 1;
+	switch_mutex_lock(globals.mutex);
+	r = (rand() % count) + 1;
+	switch_mutex_unlock(globals.mutex);
 
-	return index;
+	return r;
 }
 
 static void flush_video_queue(switch_queue_t *q)
 {
-	void *pop;
+	void *pop = NULL;
 
 	if (switch_queue_size(q) == 0) {
 		return;
 	}
 
 	while (switch_queue_trypop(q, &pop) == SWITCH_STATUS_SUCCESS) {
-		switch_image_t *img = (switch_image_t *) pop;
-		switch_img_free(&img);
+		if (pop) {
+			switch_image_t *img = (switch_image_t *) pop;
+			switch_img_free(&img);
+		} else {
+			break;
+		}
 	}
 
 }
 
 static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void *obj)
 {
-	local_stream_source_t *source = obj;
+	volatile local_stream_source_t *s = (local_stream_source_t *) obj;
+	local_stream_source_t *source = (local_stream_source_t *) s;
 	switch_file_handle_t fh = { 0 };
-	local_stream_context_t *cp;
-	char file_buf[128] = "", path_buf[512] = "", last_path[512], png_buf[512] = "", tmp_buf[512] = "";
-	switch_timer_t timer = { 0 };
+	char file_buf[128] = "", path_buf[512] = "", last_path[512] = "", png_buf[512] = "", tmp_buf[512] = "";
 	int fd = -1;
 	switch_buffer_t *audio_buffer;
 	switch_byte_t *dist_buf;
@@ -209,21 +239,35 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 		do_shuffle = 1;
 	}
 
+	if (source->prebuf < source->abuflen) {
+		source->prebuf = source->abuflen;
+	}
+
 	switch_queue_create(&source->video_q, 500, source->pool);
 	switch_buffer_create_dynamic(&audio_buffer, 1024, source->prebuf + 10, 0);
 	dist_buf = switch_core_alloc(source->pool, source->prebuf + 10);
 
 	switch_thread_rwlock_create(&source->rwlock, source->pool);
 
+	if (switch_core_timer_init(&source->timer, source->timer_name, source->interval, (int)source->samples, source->pool) != SWITCH_STATUS_SUCCESS) {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CONSOLE, "Can't start timer.\n");
+		RUNNING = 0;
+	}
+
 	if (RUNNING) {
+		source->ready = 1;
 		switch_mutex_lock(globals.mutex);
 		switch_core_hash_insert(globals.source_hash, source->name, source);
 		switch_mutex_unlock(globals.mutex);
-		source->ready = 1;
 	}
 
-	while (RUNNING && !source->stopped) {
+	while (RUNNING && !source->stopped && source->ready) {
 		const char *fname;
+		
+		if (source->dir_handle) {
+			switch_dir_close(source->dir_handle);
+			source->dir_handle = NULL;
+		}
 
 		if (temp_pool) {
 			switch_core_destroy_memory_pool(&temp_pool);
@@ -258,8 +302,8 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 
 		while (RUNNING && !source->stopped) {
 			switch_size_t olen;
-			uint8_t abuf[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 };
 			const char *artist = NULL, *title = NULL;
+			char tmp_space[128] = "";
 
 			if (fd > -1) {
 				char *pb;
@@ -288,6 +332,7 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 				}
 			}
 
+
 			if (dir_count > 1 && !strcmp(last_path, path_buf)) {
 				continue;
 			}
@@ -311,25 +356,19 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 				continue;
 			}
 
-			if (switch_core_file_has_video(&fh)) {
-				flush_video_queue(source->video_q);
-			}
-
 			switch_buffer_zero(audio_buffer);
-			
-			if (switch_core_timer_init(&timer, source->timer_name, source->interval, (int)source->samples, temp_pool) != SWITCH_STATUS_SUCCESS) {
-				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CONSOLE, "Can't start timer.\n");
-				switch_dir_close(source->dir_handle);
-				source->dir_handle = NULL;
-				goto done;
+
+			if (switch_core_file_has_video(&fh, SWITCH_FALSE)) {
+				flush_video_queue(source->video_q);
 			}
 
 			switch_img_free(&source->cover_art);
 			switch_set_string(tmp_buf, path_buf);
+
 			if ((p = strrchr(tmp_buf, '/'))) {
 				*p++ = '\0';
 				switch_snprintf(png_buf, sizeof(png_buf), "%s/art/%s.png", tmp_buf, p);				
-				if (switch_file_exists(png_buf, source->pool) == SWITCH_STATUS_SUCCESS) {
+				if (switch_file_exists(png_buf, temp_pool) == SWITCH_STATUS_SUCCESS) {
 					source->cover_art = switch_img_read_png(png_buf, SWITCH_IMG_FMT_I420);
 				}
 			}
@@ -340,22 +379,68 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 			
 			switch_core_file_get_string(&fh, SWITCH_AUDIO_COL_STR_ARTIST, &artist);
 			switch_core_file_get_string(&fh, SWITCH_AUDIO_COL_STR_TITLE, &title);
-			
-			if (title && (source->cover_art || switch_core_file_has_video(&fh))) {
-				const char *format = "#cccccc:#333333:FreeSans.ttf:3%:";
 
+			if (!title && !artist) {
+				char *e, *p, *args[3];
+				int argc;
+					
+				switch_set_string(tmp_space, path_buf);
+				p = tmp_space;
+					
+				while((e = strchr(p, '/'))) {
+					*e = '\0';
+					p = e+1;
+				}
+
+				argc = switch_split(p, '-', args);
+					
+				if (argc > 0) {
+					while(*args[0] == ' ') {
+						args[0]++;
+					}
+
+					while(end_of(args[0]) == ' ') {
+						end_of(args[0]) = '\0';
+					}
+					
+					artist = args[0];
+
+					if (argc > 1) {
+						while(*args[1] == ' ') {
+							args[1]++;
+						}
+						while(end_of(args[1]) == ' ') {
+							end_of(args[1]) = '\0';
+						}
+						title = args[1];
+					}
+
+					if (!title) {
+						title = artist;
+						artist = NULL;
+					}
+				} else {
+					title = p;
+					artist = NULL;
+				}
+			}
+			
+			if (title && (source->cover_art || switch_core_file_has_video(&fh, SWITCH_TRUE))) {
+				const char *format = "#cccccc:#333333:FreeSans.ttf:3%:";
+				
 				if (artist) {
 					source->banner_txt = switch_mprintf("%s%s (%s)", format, title, artist);
 				} else {
 					source->banner_txt = switch_mprintf("%s%s", format, title);
 				}
 			}
+			
 
 			while (RUNNING && !source->stopped) {
 				int is_open;
 				switch_file_handle_t *use_fh = &fh;
 
-				switch_core_timer_next(&timer);
+				switch_core_timer_next(&source->timer);
 				olen = source->samples;
 				
 				if (source->chime_total) {
@@ -381,7 +466,7 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 						}
 
 
-						if (switch_core_file_has_video(&source->chime_fh)) {
+						if (switch_core_file_has_video(&source->chime_fh, SWITCH_FALSE)) {
 							flush_video_queue(source->video_q);
 						}
 
@@ -394,7 +479,7 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 
 			retry:
 
-				source->has_video = switch_core_file_has_video(use_fh) || source->cover_art || source->banner_txt;
+				source->has_video = switch_core_file_has_video(use_fh, SWITCH_TRUE) || source->cover_art || source->banner_txt;
 
 				is_open = switch_test_flag(use_fh, SWITCH_FILE_OPEN);
 
@@ -402,9 +487,10 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 					source->hup = 0;
 					if (is_open) {
 						is_open = 0;
-
+						
 						switch_core_file_close(use_fh);
 						flush_video_queue(source->video_q);
+						switch_buffer_zero(audio_buffer);
 						if (use_fh == &source->chime_fh) {
 							source->chime_counter = source->rate * source->chime_freq;
 							switch_core_file_close(&fh);
@@ -414,16 +500,13 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 					}
 				}
 				
-				if (!is_open) {
-					switch_buffer_zero(audio_buffer);
-					break;
-				} else {
+				if (is_open) {
 					int svr = 0;
 
-					if (switch_core_has_video() && switch_core_file_has_video(use_fh)) {
+					if (switch_core_has_video() && switch_core_file_has_video(use_fh, SWITCH_TRUE)) {
 						switch_frame_t vid_frame = { 0 };
 
-						if (use_fh == &source->chime_fh && switch_core_file_has_video(&fh)) {
+						if (use_fh == &source->chime_fh && switch_core_file_has_video(&fh, SWITCH_TRUE)) {
 							if (switch_core_file_read_video(&fh, &vid_frame, svr) == SWITCH_STATUS_SUCCESS) {
 								switch_img_free(&vid_frame.img);
 							}
@@ -452,11 +535,13 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 
 					if (use_fh == &source->chime_fh) {
 						olen = source->samples;
-						switch_core_file_read(&fh, abuf, &olen);
+						switch_core_file_read(&fh, source->abuf, &olen);
 						olen = source->samples;
 					}
 					
-					if (switch_core_file_read(use_fh, abuf, &olen) != SWITCH_STATUS_SUCCESS || !olen) {
+					switch_assert(source->abuflen >= olen * 2 * source->channels);
+
+					if (switch_core_file_read(use_fh, source->abuf, &olen) != SWITCH_STATUS_SUCCESS || !olen) {
 						switch_core_file_close(use_fh);
 						flush_video_queue(source->video_q);
 
@@ -480,7 +565,7 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 						}
 						
 						if (source->total) {
-							switch_buffer_write(audio_buffer, abuf, olen * 2 * source->channels);
+							switch_buffer_write(audio_buffer, source->abuf, olen * 2 * source->channels);
 						} else {
 							switch_buffer_zero(audio_buffer);
 						}
@@ -493,35 +578,35 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 					break;
 				}
 
-				source->prebuf = (uint32_t)(source->samples * 2 * source->channels);
-
 				if (!source->total) {
 					flush_video_queue(source->video_q);
 					switch_buffer_zero(audio_buffer);
-				} else if (used > source->samples * 2 * source->channels) {
-					//if (!is_open || used >= source->prebuf || (source->total && used > source->samples * 2 * source->channels)) {
+				} else if (used && (!is_open || used >= source->abuflen)) {
 					void *pop;
-					uint32_t bused;
+					uint32_t bused = 0;
+					local_stream_context_t *cp = NULL;
+				
+					switch_assert(source->abuflen <= source->prebuf);
+					used = switch_buffer_read(audio_buffer, dist_buf, source->abuflen);
 					
-					used = switch_buffer_read(audio_buffer, dist_buf, source->samples * 2 * source->channels);
-
-					bused = 0;
-
 					switch_mutex_lock(source->mutex);
 					for (cp = source->context_list; cp && RUNNING; cp = cp->next) {
-							
-						if (source->has_video) {
-							switch_set_flag(cp->handle, SWITCH_FILE_FLAG_VIDEO);
-						} else {
-							switch_clear_flag(cp->handle, SWITCH_FILE_FLAG_VIDEO);
-						}
-							
-						if (switch_test_flag(cp->handle, SWITCH_FILE_CALLBACK)) {
+
+						if (!cp->ready) {
 							continue;
 						}
-							
+						
 						switch_mutex_lock(cp->audio_mutex);
+
+						if (switch_test_flag(cp->handle, SWITCH_FILE_OPEN)) {
+							if (switch_test_flag(cp->handle, SWITCH_FILE_CALLBACK)) {
+								switch_mutex_unlock(cp->audio_mutex);
+								continue;
+							}
+						}
+
 						bused = (uint32_t)switch_buffer_inuse(cp->audio_buffer);
+
 						if (bused > source->samples * 768) {
 							switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "Flushing Stream Handle Buffer [%s() %s:%d] size: %u samples: %ld\n", 
 											  cp->func, cp->file, cp->line, bused, (long)source->samples);
@@ -535,15 +620,24 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 
 						
 					while (switch_queue_trypop(source->video_q, &pop) == SWITCH_STATUS_SUCCESS) {
-						switch_image_t *img = (switch_image_t *) pop;
+						switch_image_t *img;
 						switch_image_t *imgcp = NULL;
 
-						if (source->total == 1) {
-							switch_queue_push(source->context_list->video_q, img);
-						} else {
-							if (source->context_list) {
-								switch_mutex_lock(source->mutex);
+						if (!pop) break;
+
+						img = (switch_image_t *) pop;
+
+						switch_mutex_lock(source->mutex);
+						if (source->context_list) {
+							if (source->total == 1) {
+								switch_queue_push(source->context_list->video_q, img);
+							} else {
 								for (cp = source->context_list; cp && RUNNING; cp = cp->next) {
+
+									if (!cp->ready) {
+										continue;
+									}
+
 									if (cp->video_q) {
 										imgcp = NULL;
 										switch_img_copy(img, &imgcp);
@@ -553,24 +647,20 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 											}
 										}
 									}
-								}
-								switch_mutex_unlock(source->mutex);
+								}						
+								switch_img_free(&img);
 							}
-							switch_img_free(&img);
 						}
+						switch_mutex_unlock(source->mutex);
 					}
 				}
 			}
 
-			switch_core_timer_destroy(&timer);
 			if (RUNNING && source->shuffle) {
 				skip = do_rand(dir_count);
 			}
 		}
 
-		switch_dir_close(source->dir_handle);
-		source->dir_handle = NULL;
-
 		if (source->full_reload) {
 			if (source->rwlock && switch_thread_rwlock_trywrlock(source->rwlock) != SWITCH_STATUS_SUCCESS) {
 				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Cannot stop local_stream://%s because it is in use.\n",source->name);
@@ -618,9 +708,13 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 					}
 					switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "local_stream://%s partially reloaded.\n",source->name);
 					source->part_reload = 0;
+					if (source->timer.interval) {
+						switch_core_timer_destroy(&source->timer);
+					}
 				}
 			} else {
 				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "local_stream://%s fully reloaded.\n",source->name);
+				switch_thread_rwlock_unlock(source->rwlock);
 				launch_streams(source->name);
 				goto done;
 			}
@@ -629,6 +723,15 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 
   done:
 
+	if (source->dir_handle) {
+		switch_dir_close(source->dir_handle);
+		source->dir_handle = NULL;
+	}
+
+	if (source->timer.interval) {
+		switch_core_timer_destroy(&source->timer);
+	}
+
 	switch_safe_free(source->banner_txt);
 	
 	if (switch_test_flag((&fh), SWITCH_FILE_OPEN)) {
@@ -639,6 +742,9 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void
 		switch_core_file_close(&source->chime_fh);
 	}
 
+	switch_img_free(&source->blank_img);
+	switch_img_free(&source->logo_img);
+
 	source->ready = 0;
 	switch_mutex_lock(globals.mutex);
 	switch_core_hash_delete(globals.source_hash, source->name);
@@ -674,6 +780,7 @@ static switch_status_t local_stream_file_open(switch_file_handle_t *handle, cons
 	local_stream_source_t *source;
 	char *alt_path = NULL;
 	switch_status_t status = SWITCH_STATUS_SUCCESS;
+	switch_memory_pool_t *pool;
 
 	/* already buffering a step back, so always disable it */
 	handle->pre_buffer_datalen = 0;
@@ -683,22 +790,18 @@ static switch_status_t local_stream_file_open(switch_file_handle_t *handle, cons
 		return SWITCH_STATUS_FALSE;
 	}
 
-	switch_mutex_lock(globals.mutex);
-
   top:
 
 	alt_path = switch_mprintf("%s/%d", path, handle->samplerate);
 
-	if ((source = switch_core_hash_find(globals.source_hash, alt_path))) {
+	if ((source = get_source(alt_path))) {
 		path = alt_path;
 	} else {
-		source = switch_core_hash_find(globals.source_hash, path);
+		source = get_source(path);
 	}
-	if (source) {
-		if (switch_thread_rwlock_tryrdlock(source->rwlock) != SWITCH_STATUS_SUCCESS) {
-			source = NULL;
-		}
-	} else {
+
+
+	if (!source) {
 		if (!switch_stristr("default", alt_path) && !switch_stristr("default", path)) {
 			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Unknown source %s, trying 'default'\n", path);
 			free(alt_path);
@@ -706,7 +809,6 @@ static switch_status_t local_stream_file_open(switch_file_handle_t *handle, cons
 			goto top;
 		}
 	}
-	switch_mutex_unlock(globals.mutex);
 
 	if (!source) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unknown source %s\n", path);
@@ -714,12 +816,20 @@ static switch_status_t local_stream_file_open(switch_file_handle_t *handle, cons
 		goto end;
 	}
 
-	if ((context = switch_core_alloc(handle->memory_pool, sizeof(*context))) == 0) {
-		status = SWITCH_STATUS_MEMERR;
-		goto end;
+	//if (switch_core_new_memory_pool(&pool) != SWITCH_STATUS_SUCCESS) {
+	//	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "OH OH no pool\n");
+	//	abort();
+	//}
+
+	pool = handle->memory_pool;
+	
+	if ((context = switch_core_alloc(pool, sizeof(*context))) == 0) {
+		abort();
 	}
 
-	switch_queue_create(&context->video_q, 500, handle->memory_pool);
+	context->pool = pool;
+
+	switch_queue_create(&context->video_q, 500, context->pool);
 
 	handle->samples = 0;
 	handle->samplerate = source->rate;
@@ -732,7 +842,7 @@ static switch_status_t local_stream_file_open(switch_file_handle_t *handle, cons
 	handle->interval = source->interval;
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Opening Stream [%s] %dhz\n", path, handle->samplerate);
 
-	switch_mutex_init(&context->audio_mutex, SWITCH_MUTEX_NESTED, handle->memory_pool);
+	switch_mutex_init(&context->audio_mutex, SWITCH_MUTEX_NESTED, context->pool);
 	if (switch_buffer_create_dynamic(&context->audio_buffer, 512, 1024, 0) != SWITCH_STATUS_SUCCESS) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Memory Error!\n");
 		status = SWITCH_STATUS_MEMERR;
@@ -741,7 +851,7 @@ static switch_status_t local_stream_file_open(switch_file_handle_t *handle, cons
 
 	if (!switch_core_has_video() || 
 		(switch_test_flag(handle, SWITCH_FILE_FLAG_VIDEO) && !source->has_video && !source->blank_img && !source->cover_art && !source->banner_txt)) {
-		switch_clear_flag(handle, SWITCH_FILE_FLAG_VIDEO);
+		switch_clear_flag_locked(handle, SWITCH_FILE_FLAG_VIDEO);
 	}
 
 	context->source = source;
@@ -760,42 +870,59 @@ static switch_status_t local_stream_file_open(switch_file_handle_t *handle, cons
 	switch_mutex_unlock(source->mutex);
 
   end:
+
 	switch_safe_free(alt_path);
 	return status;
 }
 
 static switch_status_t local_stream_file_close(switch_file_handle_t *handle)
 {
-	local_stream_context_t *cp, *last = NULL, *context = handle->private_info;
+	local_stream_context_t *context = NULL, *last = NULL, *cp = NULL;
+	local_stream_source_t *source;
 
+	context = handle->private_info;
+	switch_assert(context);
+
+	//pool = context->pool;
+	source = context->source;
+
+	switch_mutex_lock(source->mutex);
+	switch_clear_flag_locked(handle, SWITCH_FILE_OPEN);
 	context->ready = 0;
 
-	switch_mutex_lock(context->source->mutex);
-	for (cp = context->source->context_list; cp; cp = cp->next) {
+	for (cp = source->context_list; cp; cp = cp->next) {
 		if (cp == context) {
 			if (last) {
 				last->next = cp->next;
 			} else {
-				context->source->context_list = cp->next;
-			}
+				source->context_list = cp->next;
+			}			
 			break;
 		}
 		last = cp;
 	}
+
+	switch_mutex_lock(context->audio_mutex);
 	
-	if (context->video_q) {
+	if (source->has_video) {
 		flush_video_queue(context->video_q);
 		switch_queue_trypush(context->video_q, NULL);
 		switch_queue_interrupt_all(context->video_q);
 		flush_video_queue(context->video_q);
 	}
+							
+	source->total--;
 
 	switch_img_free(&context->banner_img);
-	
-	context->source->total--;
-	switch_mutex_unlock(context->source->mutex);
 	switch_buffer_destroy(&context->audio_buffer);
-	switch_thread_rwlock_unlock(context->source->rwlock);
+	switch_mutex_unlock(context->audio_mutex);
+	//switch_core_destroy_memory_pool(&pool);
+
+	context->handle = NULL;
+	handle->private_info = NULL;
+	switch_mutex_unlock(source->mutex);
+
+	switch_thread_rwlock_unlock(source->rwlock);
 
 	return SWITCH_STATUS_SUCCESS;
 }
@@ -857,7 +984,7 @@ static switch_status_t local_stream_file_read_video(switch_file_handle_t *handle
 		return SWITCH_STATUS_FALSE;
 	}
 	
-	while (switch_queue_size(context->video_q) < 5) {
+	while (!(flags & SVR_BLOCK) && switch_queue_size(context->video_q) < 5) {
 		return SWITCH_STATUS_BREAK;
 	}
 
@@ -874,6 +1001,9 @@ static switch_status_t local_stream_file_read_video(switch_file_handle_t *handle
 
 		frame->img = (switch_image_t *) pop;
 		context->sent_png = 0;
+		if (frame->img->d_w != context->last_w || frame->img->d_h != context->last_h) {
+			context->newres = 1;
+		}
 		context->last_w = frame->img->d_w;
 		context->last_h = frame->img->d_h;
 		goto got_img;
@@ -907,6 +1037,10 @@ static switch_status_t local_stream_file_read_video(switch_file_handle_t *handle
 	
 	if (context->source->banner_txt) {
 		if ((!context->banner_timeout || context->banner_timeout >= now)) {
+			if (context->newres) {
+				switch_img_free(&context->banner_img);
+				context->newres = 0;
+			}
 			if (!context->banner_img) {
 				context->banner_img = switch_img_write_text_img(context->last_w, context->last_h, SWITCH_TRUE, context->source->banner_txt);
 				context->banner_timeout = now + 5000000;
@@ -920,8 +1054,23 @@ static switch_status_t local_stream_file_read_video(switch_file_handle_t *handle
 	}
 
 	if (frame->img && context->banner_img && frame->img->d_w >= context->banner_img->d_w) {
-		//switch_img_overlay(frame->img, context->banner_img, 0, frame->img->d_h - context->banner_img->d_h, 100);
-		switch_img_patch(frame->img, context->banner_img, 0, frame->img->d_h - context->banner_img->d_h);
+		switch_img_overlay(frame->img, context->banner_img, 0, frame->img->d_h - context->banner_img->d_h, context->source->text_opacity);
+	}
+
+	if (frame->img && context->source->logo_img && 
+		(context->source->logo_always || context->banner_img) && frame->img->d_w >= context->source->logo_img->d_w) {
+		int x = 0, y = 0;
+		
+		switch_img_find_position(context->source->logo_pos,
+								 frame->img->d_w, frame->img->d_h, 
+								 context->source->logo_img->d_w, context->source->logo_img->d_h,
+								 &x, &y);
+
+		if (context->banner_img) {
+			y -= context->banner_img->d_h;
+		}
+
+		switch_img_overlay(frame->img, context->source->logo_img, x, y, context->source->logo_opacity);
 	}
 
 	return SWITCH_STATUS_SUCCESS;
@@ -931,29 +1080,46 @@ static switch_status_t local_stream_file_read(switch_file_handle_t *handle, void
 {
 	local_stream_context_t *context = handle->private_info;
 	switch_size_t bytes = 0;
-	size_t need = *len * 2 * handle->real_channels;
+	size_t need;
 
-	if (!context->source->ready) {
+	if (!(context->ready && context->source->ready)) {
 		*len = 0;
 		return SWITCH_STATUS_FALSE;
 	}
+	
+	if (context->source->has_video)  {
+		if (!switch_test_flag(handle, SWITCH_FILE_FLAG_VIDEO)) {
+			switch_set_flag_locked(handle, SWITCH_FILE_FLAG_VIDEO);
+		}
+	} else {
+		if (switch_test_flag(handle, SWITCH_FILE_FLAG_VIDEO)) {
+			switch_clear_flag_locked(handle, SWITCH_FILE_FLAG_VIDEO);
+		}
+	}
 
 	switch_mutex_lock(context->audio_mutex);
+	need = *len * 2 * context->source->channels;
+
 	if ((bytes = switch_buffer_read(context->audio_buffer, data, need))) {
-		*len = bytes / 2 / handle->real_channels;
+		*len = bytes / 2 / context->source->channels;
 	} else {
-		size_t blank = (handle->samplerate / 20) * 2 * handle->real_channels;
+		size_t blank;
+		
+		switch_assert(handle->samplerate <= 48000);
+		switch_assert(handle->real_channels <= 2);
+
+		blank = (handle->samplerate / 4) * 2 * handle->real_channels;
 
 		if (need > blank) {
 			need = blank;
 		}
+
 		memset(data, 0, need);
-		*len = need / 2 / handle->real_channels;
+		*len = need / 2 / context->source->channels;
 	}
-
-
 	switch_mutex_unlock(context->audio_mutex);
 	handle->sample_count += *len;
+
 	return SWITCH_STATUS_SUCCESS;
 }
 
@@ -987,6 +1153,8 @@ static void launch_thread(const char *name, const char *path, switch_xml_t direc
 	source->stopped = 0;
 	source->hup = 0;
 	source->chime_freq = 30;
+	source->logo_opacity = source->text_opacity = 100;
+
 	for (param = switch_xml_child(directory, "param"); param; param = param->next) {
 		char *var = (char *) switch_xml_attr_soft(param, "name");
 		char *val = (char *) switch_xml_attr_soft(param, "value");
@@ -1034,6 +1202,22 @@ static void launch_thread(const char *name, const char *path, switch_xml_t direc
 			source->timer_name = switch_core_strdup(source->pool, val);
 		} else if (!strcasecmp(var, "blank-img") && !zstr(val)) {
 			source->blank_img = switch_img_read_png(val, SWITCH_IMG_FMT_I420);
+		} else if (!strcasecmp(var, "logo-img") && !zstr(val)) {
+			source->logo_img = switch_img_read_png(val, SWITCH_IMG_FMT_ARGB);
+		} else if (!strcasecmp(var, "logo-always") && !zstr(val)) {
+			source->logo_always = switch_true(val);
+		} else if (!strcasecmp(var, "logo-position") && !zstr(val)) {
+			source->logo_pos = parse_img_position(val);
+		} else if (!strcasecmp(var, "logo-opacity") && !zstr(val)) {
+			source->logo_opacity = atoi(val);
+			if (source->logo_opacity < 0 && source->logo_opacity > 100) {
+				source->logo_opacity = 0;
+			}
+		} else if (!strcasecmp(var, "text-opacity") && !zstr(val)) {
+			source->text_opacity = atoi(val);
+			if (source->text_opacity < 0 && source->text_opacity > 100) {
+				source->text_opacity = 0;
+			}
 		}
 	}
 
@@ -1046,6 +1230,8 @@ static void launch_thread(const char *name, const char *path, switch_xml_t direc
 	}
 
 	source->samples = switch_samples_per_packet(source->rate, source->interval);
+	source->abuflen = (source->samples * 2 * source->channels);
+	source->abuf = switch_core_alloc(source->pool, source->abuflen + 1024);
 	switch_mutex_init(&source->mutex, SWITCH_MUTEX_NESTED, source->pool);
 	switch_threadattr_create(&thd_attr, source->pool);
 	switch_threadattr_detach_set(thd_attr, 1);
@@ -1108,55 +1294,37 @@ SWITCH_STANDARD_API(local_stream_function)
 
 	local_stream_name = argv[1];
 
+
 	if (!strcasecmp(argv[0], "hup") && local_stream_name) {
-		switch_mutex_lock(globals.mutex);
-		source = switch_core_hash_find(globals.source_hash, local_stream_name);
-		switch_mutex_unlock(globals.mutex);
-		
-		if (source) {
+		if ((source = get_source(local_stream_name))) {
 			source->hup = 1;
 			stream->write_function(stream, "+OK hup stream: %s", source->name);
-			goto done;
+			switch_thread_rwlock_unlock(source->rwlock);
 		}
 	} else if (!strcasecmp(argv[0], "stop") && local_stream_name) {
-		switch_mutex_lock(globals.mutex); 
-		source = switch_core_hash_find(globals.source_hash, local_stream_name);
-		switch_mutex_unlock(globals.mutex); 
-
-		if (!source) {
+		if ((source = get_source(local_stream_name))) {
+			source->stopped = 1;
+			stream->write_function(stream, "+OK");
+			switch_thread_rwlock_unlock(source->rwlock);
+		} else {
 			stream->write_function(stream, "-ERR Cannot locate local_stream %s!\n", local_stream_name);
-			goto done;
 		}
-		
-		source->stopped = 1;
-		stream->write_function(stream, "+OK");
 	} else if (!strcasecmp(argv[0], "reload") && local_stream_name) {
-		switch_mutex_lock(globals.mutex);
-		source = switch_core_hash_find(globals.source_hash, local_stream_name);
-		switch_mutex_unlock(globals.mutex);
-		
-		if (!source) {
+		if ((source = get_source(local_stream_name))) {
+			source->full_reload = 1;
+			source->part_reload = 1;
+			stream->write_function(stream, "+OK");
+		} else {
 			stream->write_function(stream, "-ERR Cannot locate local_stream %s!\n", local_stream_name);
-			goto done;
 		}
-
-		source->full_reload = 1;
-		source->part_reload = 1;
-		stream->write_function(stream, "+OK");
 	} else if (!strcasecmp(argv[0], "start") && local_stream_name) {
-		switch_mutex_lock(globals.mutex);
-		source = switch_core_hash_find(globals.source_hash, local_stream_name);
-		switch_mutex_unlock(globals.mutex);
-
-		if (source) {
+		if ((source = get_source(local_stream_name))) {
 			source->stopped = 0;
 			stream->write_function(stream, "+OK stream: %s", source->name);
-			goto done;
-		}
-		
-		if ((ok = launch_streams(local_stream_name))) {
-			stream->write_function(stream, "+OK stream: %s", local_stream_name);
-			goto done;
+		} else {
+			if ((ok = launch_streams(local_stream_name))) {
+				stream->write_function(stream, "+OK stream: %s", local_stream_name);
+			}
 		}
 		
 	} else if (!strcasecmp(argv[0], "show")) {
@@ -1165,22 +1333,21 @@ SWITCH_STANDARD_API(local_stream_function)
 		void *val;
 		switch_bool_t xml = SWITCH_FALSE;
 
-		switch_mutex_lock(globals.mutex);
 		if (argc == 1) {
+			switch_mutex_lock(globals.mutex);
 			for (hi = switch_core_hash_first(globals.source_hash); hi; hi = switch_core_hash_next(&hi)) {
 				switch_core_hash_this(hi, &var, NULL, &val);
 				if ((source = (local_stream_source_t *) val)) {
 					stream->write_function(stream, "%s,%s\n", source->name, source->location);
 				}
 			}
+			switch_mutex_unlock(globals.mutex);
 		} else {
 			if (argc == 4 && !strcasecmp("xml", argv[3])) {
 				xml = SWITCH_TRUE;
 			}
 
-			source = switch_core_hash_find(globals.source_hash, local_stream_name);
-
-			if (source) {
+			if ((source = get_source(local_stream_name))) {
 				if (xml) {
 					stream->write_function(stream, "<?xml version=\"1.0\"?>\n<local_stream name=\"%s\">\n", source->name);
 					stream->write_function(stream, "  <location>%s</location>\n", source->location);
@@ -1210,13 +1377,11 @@ SWITCH_STANDARD_API(local_stream_function)
 					stream->write_function(stream, "  stopped:  %s\n", (source->stopped) ? "true" : "false");
 					stream->write_function(stream, "  reloading: %s\n", (source->full_reload) ? "true" : "false");
 				}
+				switch_thread_rwlock_unlock(source->rwlock);
 			} else {
 				stream->write_function(stream, "-ERR Cannot locate local_stream %s!\n", local_stream_name);
 			}
 		}
-		switch_mutex_unlock(globals.mutex);
-
-		goto done;
 	}
 	
 	goto done;
diff --git a/src/mod/formats/mod_shout/Makefile.am b/src/mod/formats/mod_shout/Makefile.am
index 362561d3b7..7fd8e01ab3 100644
--- a/src/mod/formats/mod_shout/Makefile.am
+++ b/src/mod/formats/mod_shout/Makefile.am
@@ -3,6 +3,7 @@ MODNAME=mod_shout
 
 if HAVE_SHOUT
 if HAVE_MPG123
+if HAVE_MP3LAME
 
 mod_LTLIBRARIES = mod_shout.la
 mod_shout_la_SOURCES  = mod_shout.c
@@ -11,10 +12,11 @@ mod_shout_la_CPPFLAGS = $(CURL_CFLAGS) $(AM_CPPFLAGS) $(SHOUT_CFLAGS) $(MP3LAME_
 mod_shout_la_LIBADD   = $(switch_builddir)/libfreeswitch.la
 mod_shout_la_LDFLAGS  = $(CURL_LIBS) -avoid-version -module -no-undefined -shared $(SHOUT_LIBS) $(MP3LAME_LIBS) $(MPG123_LIBS)
 
-
-if !HAVE_MP3LAME
-mod_shout_la_LDFLAGS += -lmp3lame
-mod_shout_la_CFLAGS += -I/usr/include/lame
+else
+install: error
+all: error
+error:
+       $(error You must install libmp3lame-dev to build mod_shout)
 endif
 
 else
diff --git a/src/mod/formats/mod_shout/mod_shout.c b/src/mod/formats/mod_shout/mod_shout.c
index 7e5cc2fef5..60275ff3e7 100644
--- a/src/mod/formats/mod_shout/mod_shout.c
+++ b/src/mod/formats/mod_shout/mod_shout.c
@@ -35,7 +35,7 @@
 #include <switch.h>
 #include "mpg123.h"
 #include <shout/shout.h>
-#include <lame.h>
+#include <lame/lame.h>
 #include <switch_curl.h>
 
 #define OUTSCALE 8192 * 2
diff --git a/src/mod/formats/mod_ssml/mod_ssml.c b/src/mod/formats/mod_ssml/mod_ssml.c
index f99b50c1a3..6c83e42bd2 100644
--- a/src/mod/formats/mod_ssml/mod_ssml.c
+++ b/src/mod/formats/mod_ssml/mod_ssml.c
@@ -451,9 +451,9 @@ static switch_status_t next_file(switch_file_handle_t *handle)
 	handle->interval = context->fh.interval;
 
 	if (switch_test_flag((&context->fh), SWITCH_FILE_NATIVE)) {
-		switch_set_flag(handle, SWITCH_FILE_NATIVE);
+		switch_set_flag_locked(handle, SWITCH_FILE_NATIVE);
 	} else {
-		switch_clear_flag(handle, SWITCH_FILE_NATIVE);
+		switch_clear_flag_locked(handle, SWITCH_FILE_NATIVE);
 	}
 
 	return SWITCH_STATUS_SUCCESS;
diff --git a/src/mod/formats/mod_vlc/Makefile.am b/src/mod/formats/mod_vlc/Makefile.am
index 16bda68ad5..328fed54b9 100644
--- a/src/mod/formats/mod_vlc/Makefile.am
+++ b/src/mod/formats/mod_vlc/Makefile.am
@@ -1,8 +1,6 @@
 include $(top_srcdir)/build/modmake.rulesam
 MODNAME=mod_vlc
 
-if HAVE_YUV
-if HAVE_VPX
 if HAVE_VLC
 
 mod_LTLIBRARIES = mod_vlc.la
@@ -17,18 +15,3 @@ all: error
 error:
 	$(error You must install libvlc-dev to build mod_vlc)
 endif
-
-else
-install: error
-all: error
-error:
-	$(error You must install libvpx2-dev to build mod_vlc)
-endif
-
-else
-install: error
-all: error
-error:
-	$(error You must install libyuv-dev to build mod_vlc)
-endif
-
diff --git a/src/mod/formats/mod_vlc/mod_vlc.c b/src/mod/formats/mod_vlc/mod_vlc.c
index 32798099be..dbacf4e513 100644
--- a/src/mod/formats/mod_vlc/mod_vlc.c
+++ b/src/mod/formats/mod_vlc/mod_vlc.c
@@ -1340,6 +1340,23 @@ static switch_status_t vlc_file_av_close(switch_file_handle_t *handle)
 	return SWITCH_STATUS_SUCCESS;
 }
 
+static switch_status_t vlc_file_command(switch_file_handle_t *handle, switch_file_command_t command) /* file_command hook: apply a control command to a VLC-backed file handle */
+{
+	vlc_file_context_t *context = handle->private_info; /* per-handle VLC context kept in the handle's private_info */
+
+	switch(command) {
+	case SCFC_FLUSH_AUDIO: /* zero context->audio_buffer while holding context->audio_mutex */
+		switch_mutex_lock(context->audio_mutex);		
+		switch_buffer_zero(context->audio_buffer);
+		switch_mutex_unlock(context->audio_mutex);		
+		break;
+	default: /* any other command is ignored */
+		break;
+	}
+
+	return SWITCH_STATUS_SUCCESS; /* returned for every command, whether handled or ignored */
+}
+
 static switch_status_t vlc_file_close(switch_file_handle_t *handle)
 {
 	vlc_file_context_t *context = handle->private_info;
@@ -1642,8 +1659,7 @@ int  vlc_write_video_imem_get_callback(void *data, const char *cookie, int64_t *
 		}
 		
 		*output = context->video_frame_buffer;
-		*size = 0;
-		switch_img_convert(img, SWITCH_CONVERT_FMT_YUYV, *output, size);
+		switch_img_to_raw(img, *output, *size, SWITCH_IMG_FMT_YUY2);
 		switch_img_free(&img);
 		return 0;
 	}
@@ -2609,6 +2625,7 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_vlc_load)
 	file_interface->extens = vlc_file_supported_formats;
 	file_interface->file_open = vlc_file_open;
 	file_interface->file_close = vlc_file_close;
+	file_interface->file_command = vlc_file_command;
 	file_interface->file_read = vlc_file_read;
 	file_interface->file_write = vlc_file_write;
 	file_interface->file_read_video = vlc_file_read_video;
diff --git a/src/mod/languages/mod_managed/freeswitch_wrap.cxx b/src/mod/languages/mod_managed/freeswitch_wrap.cxx
index 31f7a0ed92..76a7f4035f 100644
--- a/src/mod/languages/mod_managed/freeswitch_wrap.cxx
+++ b/src/mod/languages/mod_managed/freeswitch_wrap.cxx
@@ -1820,6 +1820,16 @@ SWIGEXPORT char * SWIGSTDCALL CSharp_SWITCH_PARK_AFTER_BRIDGE_VARIABLE_get() {
 }
 
 
+SWIGEXPORT char * SWIGSTDCALL CSharp_SWITCH_PARK_AFTER_EARLY_BRIDGE_VARIABLE_get() {
+  char * jresult ;
+  char *result = 0 ;
+  
+  result = (char *)("park_after_early_bridge");
+  jresult = SWIG_csharp_string_callback((const char *)result); 
+  return jresult;
+}
+
+
 SWIGEXPORT char * SWIGSTDCALL CSharp_SWITCH_TRANSFER_AFTER_BRIDGE_VARIABLE_get() {
   char * jresult ;
   char *result = 0 ;
@@ -1830,6 +1840,16 @@ SWIGEXPORT char * SWIGSTDCALL CSharp_SWITCH_TRANSFER_AFTER_BRIDGE_VARIABLE_get()
 }
 
 
+SWIGEXPORT char * SWIGSTDCALL CSharp_SWITCH_TRANSFER_AFTER_EARLY_BRIDGE_VARIABLE_get() {
+  char * jresult ;
+  char *result = 0 ;
+  
+  result = (char *)("transfer_after_early_bridge");
+  jresult = SWIG_csharp_string_callback((const char *)result); 
+  return jresult;
+}
+
+
 SWIGEXPORT char * SWIGSTDCALL CSharp_SWITCH_EXEC_AFTER_BRIDGE_APP_VARIABLE_get() {
   char * jresult ;
   char *result = 0 ;
@@ -3252,6 +3272,90 @@ SWIGEXPORT int SWIGSTDCALL CSharp_SWITCH_MAX_MANAGEMENT_BUFFER_LEN_get() {
 }
 
 
+SWIGEXPORT void SWIGSTDCALL CSharp_switch_error_period_t_start_set(void * jarg1, long long jarg2) {
+  error_period *arg1 = (error_period *) 0 ;
+  int64_t arg2 ;
+  
+  arg1 = (error_period *)jarg1; 
+  arg2 = (int64_t)jarg2; 
+  if (arg1) (arg1)->start = arg2;
+}
+
+
+SWIGEXPORT long long SWIGSTDCALL CSharp_switch_error_period_t_start_get(void * jarg1) {
+  long long jresult ;
+  error_period *arg1 = (error_period *) 0 ;
+  int64_t result;
+  
+  arg1 = (error_period *)jarg1; 
+  result = (int64_t) ((arg1)->start);
+  jresult = result; 
+  return jresult;
+}
+
+
+SWIGEXPORT void SWIGSTDCALL CSharp_switch_error_period_t_stop_set(void * jarg1, long long jarg2) {
+  error_period *arg1 = (error_period *) 0 ;
+  int64_t arg2 ;
+  
+  arg1 = (error_period *)jarg1; 
+  arg2 = (int64_t)jarg2; 
+  if (arg1) (arg1)->stop = arg2;
+}
+
+
+SWIGEXPORT long long SWIGSTDCALL CSharp_switch_error_period_t_stop_get(void * jarg1) {
+  long long jresult ;
+  error_period *arg1 = (error_period *) 0 ;
+  int64_t result;
+  
+  arg1 = (error_period *)jarg1; 
+  result = (int64_t) ((arg1)->stop);
+  jresult = result; 
+  return jresult;
+}
+
+
+SWIGEXPORT void SWIGSTDCALL CSharp_switch_error_period_t_next_set(void * jarg1, void * jarg2) {
+  error_period *arg1 = (error_period *) 0 ;
+  error_period *arg2 = (error_period *) 0 ;
+  
+  arg1 = (error_period *)jarg1; 
+  arg2 = (error_period *)jarg2; 
+  if (arg1) (arg1)->next = arg2;
+}
+
+
+SWIGEXPORT void * SWIGSTDCALL CSharp_switch_error_period_t_next_get(void * jarg1) {
+  void * jresult ;
+  error_period *arg1 = (error_period *) 0 ;
+  error_period *result = 0 ;
+  
+  arg1 = (error_period *)jarg1; 
+  result = (error_period *) ((arg1)->next);
+  jresult = (void *)result; 
+  return jresult;
+}
+
+
+SWIGEXPORT void * SWIGSTDCALL CSharp_new_switch_error_period_t() {
+  void * jresult ;
+  error_period *result = 0 ;
+  
+  result = (error_period *)new error_period();
+  jresult = (void *)result; 
+  return jresult;
+}
+
+
+SWIGEXPORT void SWIGSTDCALL CSharp_delete_switch_error_period_t(void * jarg1) {
+  error_period *arg1 = (error_period *) 0 ;
+  
+  arg1 = (error_period *)jarg1; 
+  delete arg1;
+}
+
+
 SWIGEXPORT void SWIGSTDCALL CSharp_switch_rtp_numbers_t_raw_bytes_set(void * jarg1, void * jarg2) {
   switch_rtp_numbers_t *arg1 = (switch_rtp_numbers_t *) 0 ;
   switch_size_t arg2 ;
@@ -3994,6 +4098,28 @@ SWIGEXPORT double SWIGSTDCALL CSharp_switch_rtp_numbers_t_mos_get(void * jarg1)
 }
 
 
+SWIGEXPORT void SWIGSTDCALL CSharp_switch_rtp_numbers_t_error_log_set(void * jarg1, void * jarg2) {
+  switch_rtp_numbers_t *arg1 = (switch_rtp_numbers_t *) 0 ;
+  error_period *arg2 = (error_period *) 0 ;
+  
+  arg1 = (switch_rtp_numbers_t *)jarg1; 
+  arg2 = (error_period *)jarg2; 
+  if (arg1) (arg1)->error_log = arg2;
+}
+
+
+SWIGEXPORT void * SWIGSTDCALL CSharp_switch_rtp_numbers_t_error_log_get(void * jarg1) {
+  void * jresult ;
+  switch_rtp_numbers_t *arg1 = (switch_rtp_numbers_t *) 0 ;
+  error_period *result = 0 ;
+  
+  arg1 = (switch_rtp_numbers_t *)jarg1; 
+  result = (error_period *) ((arg1)->error_log);
+  jresult = (void *)result; 
+  return jresult;
+}
+
+
 SWIGEXPORT void * SWIGSTDCALL CSharp_new_switch_rtp_numbers_t() {
   void * jresult ;
   switch_rtp_numbers_t *result = 0 ;
@@ -5575,46 +5701,68 @@ SWIGEXPORT void SWIGSTDCALL CSharp_delete_switch_t38_options_t(void * jarg1) {
 }
 
 
-SWIGEXPORT void SWIGSTDCALL CSharp_switch_vid_params_t_width_set(void * jarg1, int jarg2) {
+SWIGEXPORT void SWIGSTDCALL CSharp_switch_vid_params_t_width_set(void * jarg1, unsigned long jarg2) {
   switch_vid_params_s *arg1 = (switch_vid_params_s *) 0 ;
-  int arg2 ;
+  uint32_t arg2 ;
   
   arg1 = (switch_vid_params_s *)jarg1; 
-  arg2 = (int)jarg2; 
+  arg2 = (uint32_t)jarg2; 
   if (arg1) (arg1)->width = arg2;
 }
 
 
-SWIGEXPORT int SWIGSTDCALL CSharp_switch_vid_params_t_width_get(void * jarg1) {
-  int jresult ;
+SWIGEXPORT unsigned long SWIGSTDCALL CSharp_switch_vid_params_t_width_get(void * jarg1) {
+  unsigned long jresult ;
   switch_vid_params_s *arg1 = (switch_vid_params_s *) 0 ;
-  int result;
+  uint32_t result;
   
   arg1 = (switch_vid_params_s *)jarg1; 
-  result = (int) ((arg1)->width);
-  jresult = result; 
+  result = (uint32_t) ((arg1)->width);
+  jresult = (unsigned long)result; 
   return jresult;
 }
 
 
-SWIGEXPORT void SWIGSTDCALL CSharp_switch_vid_params_t_height_set(void * jarg1, int jarg2) {
+SWIGEXPORT void SWIGSTDCALL CSharp_switch_vid_params_t_height_set(void * jarg1, unsigned long jarg2) {
   switch_vid_params_s *arg1 = (switch_vid_params_s *) 0 ;
-  int arg2 ;
+  uint32_t arg2 ;
   
   arg1 = (switch_vid_params_s *)jarg1; 
-  arg2 = (int)jarg2; 
+  arg2 = (uint32_t)jarg2; 
   if (arg1) (arg1)->height = arg2;
 }
 
 
-SWIGEXPORT int SWIGSTDCALL CSharp_switch_vid_params_t_height_get(void * jarg1) {
-  int jresult ;
+SWIGEXPORT unsigned long SWIGSTDCALL CSharp_switch_vid_params_t_height_get(void * jarg1) {
+  unsigned long jresult ;
   switch_vid_params_s *arg1 = (switch_vid_params_s *) 0 ;
-  int result;
+  uint32_t result;
   
   arg1 = (switch_vid_params_s *)jarg1; 
-  result = (int) ((arg1)->height);
-  jresult = result; 
+  result = (uint32_t) ((arg1)->height);
+  jresult = (unsigned long)result; 
+  return jresult;
+}
+
+
+SWIGEXPORT void SWIGSTDCALL CSharp_switch_vid_params_t_fps_set(void * jarg1, unsigned long jarg2) {
+  switch_vid_params_s *arg1 = (switch_vid_params_s *) 0 ;
+  uint32_t arg2 ;
+  
+  arg1 = (switch_vid_params_s *)jarg1; 
+  arg2 = (uint32_t)jarg2; 
+  if (arg1) (arg1)->fps = arg2;
+}
+
+
+SWIGEXPORT unsigned long SWIGSTDCALL CSharp_switch_vid_params_t_fps_get(void * jarg1) {
+  unsigned long jresult ;
+  switch_vid_params_s *arg1 = (switch_vid_params_s *) 0 ;
+  uint32_t result;
+  
+  arg1 = (switch_vid_params_s *)jarg1; 
+  result = (uint32_t) ((arg1)->fps);
+  jresult = (unsigned long)result; 
   return jresult;
 }
 
@@ -10533,6 +10681,44 @@ SWIGEXPORT void SWIGSTDCALL CSharp_delete_switch_core_thread_session(void * jarg
 }
 
 
+SWIGEXPORT void * SWIGSTDCALL CSharp_switch_must_malloc(unsigned long jarg1) {
+  void * jresult ;
+  size_t arg1 ;
+  void *result = 0 ;
+  
+  arg1 = (size_t)jarg1; 
+  result = (void *)switch_must_malloc(arg1);
+  jresult = (void *)result; 
+  return jresult;
+}
+
+
+SWIGEXPORT void * SWIGSTDCALL CSharp_switch_must_realloc(void * jarg1, unsigned long jarg2) {
+  void * jresult ;
+  void *arg1 = (void *) 0 ;
+  size_t arg2 ;
+  void *result = 0 ;
+  
+  arg1 = (void *)jarg1; 
+  arg2 = (size_t)jarg2; 
+  result = (void *)switch_must_realloc(arg1,arg2);
+  jresult = (void *)result; 
+  return jresult;
+}
+
+
+SWIGEXPORT char * SWIGSTDCALL CSharp_switch_must_strdup(char * jarg1) {
+  char * jresult ;
+  char *arg1 = (char *) 0 ;
+  char *result = 0 ;
+  
+  arg1 = (char *)jarg1; 
+  result = (char *)switch_must_strdup((char const *)arg1);
+  jresult = SWIG_csharp_string_callback((const char *)result); 
+  return jresult;
+}
+
+
 SWIGEXPORT void SWIGSTDCALL CSharp_switch_core_screen_size(void * jarg1, void * jarg2) {
   int *arg1 = (int *) 0 ;
   int *arg2 = (int *) 0 ;
@@ -13057,34 +13243,34 @@ SWIGEXPORT void * SWIGSTDCALL CSharp_switch_core_hash_delete(void * jarg1, char
 }
 
 
-SWIGEXPORT int SWIGSTDCALL CSharp_switch_core_hash_delete_locked(void * jarg1, char * jarg2, void * jarg3) {
-  int jresult ;
+SWIGEXPORT void * SWIGSTDCALL CSharp_switch_core_hash_delete_locked(void * jarg1, char * jarg2, void * jarg3) {
+  void * jresult ;
   switch_hash_t *arg1 = (switch_hash_t *) 0 ;
   char *arg2 = (char *) 0 ;
   switch_mutex_t *arg3 = (switch_mutex_t *) 0 ;
-  switch_status_t result;
+  void *result = 0 ;
   
   arg1 = (switch_hash_t *)jarg1; 
   arg2 = (char *)jarg2; 
   arg3 = (switch_mutex_t *)jarg3; 
-  result = (switch_status_t)switch_core_hash_delete_locked(arg1,(char const *)arg2,arg3);
-  jresult = result; 
+  result = (void *)switch_core_hash_delete_locked(arg1,(char const *)arg2,arg3);
+  jresult = (void *)result; 
   return jresult;
 }
 
 
-SWIGEXPORT int SWIGSTDCALL CSharp_switch_core_hash_delete_wrlock(void * jarg1, char * jarg2, void * jarg3) {
-  int jresult ;
+SWIGEXPORT void * SWIGSTDCALL CSharp_switch_core_hash_delete_wrlock(void * jarg1, char * jarg2, void * jarg3) {
+  void * jresult ;
   switch_hash_t *arg1 = (switch_hash_t *) 0 ;
   char *arg2 = (char *) 0 ;
   switch_thread_rwlock_t *arg3 = (switch_thread_rwlock_t *) 0 ;
-  switch_status_t result;
+  void *result = 0 ;
   
   arg1 = (switch_hash_t *)jarg1; 
   arg2 = (char *)jarg2; 
   arg3 = (switch_thread_rwlock_t *)jarg3; 
-  result = (switch_status_t)switch_core_hash_delete_wrlock(arg1,(char const *)arg2,arg3);
-  jresult = result; 
+  result = (void *)switch_core_hash_delete_wrlock(arg1,(char const *)arg2,arg3);
+  jresult = (void *)result; 
   return jresult;
 }
 
@@ -14040,6 +14226,20 @@ SWIGEXPORT int SWIGSTDCALL CSharp_switch_core_file_close(void * jarg1) {
 }
 
 
+SWIGEXPORT int SWIGSTDCALL CSharp_switch_core_file_command(void * jarg1, int jarg2) {
+  int jresult ;
+  switch_file_handle_t *arg1 = (switch_file_handle_t *) 0 ;
+  switch_file_command_t arg2 ;
+  switch_status_t result;
+  
+  arg1 = (switch_file_handle_t *)jarg1; 
+  arg2 = (switch_file_command_t)jarg2; 
+  result = (switch_status_t)switch_core_file_command(arg1,arg2);
+  jresult = result; 
+  return jresult;
+}
+
+
 SWIGEXPORT int SWIGSTDCALL CSharp_switch_core_file_truncate(void * jarg1, long long jarg2) {
   int jresult ;
   switch_file_handle_t *arg1 = (switch_file_handle_t *) 0 ;
@@ -14054,13 +14254,15 @@ SWIGEXPORT int SWIGSTDCALL CSharp_switch_core_file_truncate(void * jarg1, long l
 }
 
 
-SWIGEXPORT int SWIGSTDCALL CSharp_switch_core_file_has_video(void * jarg1) {
+SWIGEXPORT int SWIGSTDCALL CSharp_switch_core_file_has_video(void * jarg1, int jarg2) {
   int jresult ;
   switch_file_handle_t *arg1 = (switch_file_handle_t *) 0 ;
+  switch_bool_t arg2 ;
   switch_bool_t result;
   
   arg1 = (switch_file_handle_t *)jarg1; 
-  result = (switch_bool_t)switch_core_file_has_video(arg1);
+  arg2 = (switch_bool_t)jarg2; 
+  result = (switch_bool_t)switch_core_file_has_video(arg1,arg2);
   jresult = result; 
   return jresult;
 }
@@ -14608,6 +14810,16 @@ SWIGEXPORT int SWIGSTDCALL CSharp_switch_core_management_exec(char * jarg1, int
 }
 
 
+SWIGEXPORT int SWIGSTDCALL CSharp_switch_core_set_process_privileges() {
+  int jresult ;
+  int32_t result;
+  
+  result = (int32_t)switch_core_set_process_privileges();
+  jresult = result; 
+  return jresult;
+}
+
+
 SWIGEXPORT int SWIGSTDCALL CSharp_set_normal_priority() {
   int jresult ;
   int32_t result;
@@ -16682,6 +16894,11 @@ SWIGEXPORT char * SWIGSTDCALL CSharp_switch_version_full_human() {
 }
 
 
+SWIGEXPORT void SWIGSTDCALL CSharp_switch_core_autobind_cpu() {
+  switch_core_autobind_cpu();
+}
+
+
 SWIGEXPORT void SWIGSTDCALL CSharp_switch_loadable_module_interface_module_name_set(void * jarg1, char * jarg2) {
   switch_loadable_module_interface *arg1 = (switch_loadable_module_interface *) 0 ;
   char *arg2 = (char *) 0 ;
@@ -19347,18 +19564,18 @@ SWIGEXPORT char * SWIGSTDCALL CSharp_switch_util_quote_shell_arg_pool(char * jar
 }
 
 
-SWIGEXPORT int SWIGSTDCALL CSharp_switch_calc_bitrate(int jarg1, int jarg2, int jarg3, int jarg4) {
+SWIGEXPORT int SWIGSTDCALL CSharp_switch_calc_bitrate(int jarg1, int jarg2, int jarg3, double jarg4) {
   int jresult ;
   int arg1 ;
   int arg2 ;
   int arg3 ;
-  int arg4 ;
+  double arg4 ;
   int32_t result;
   
   arg1 = (int)jarg1; 
   arg2 = (int)jarg2; 
   arg3 = (int)jarg3; 
-  arg4 = (int)jarg4; 
+  arg4 = (double)jarg4; 
   result = (int32_t)switch_calc_bitrate(arg1,arg2,arg3,arg4);
   jresult = result; 
   return jresult;
@@ -24795,6 +25012,28 @@ SWIGEXPORT void * SWIGSTDCALL CSharp_switch_file_interface_file_get_string_get(v
 }
 
 
+SWIGEXPORT void SWIGSTDCALL CSharp_switch_file_interface_file_command_set(void * jarg1, void * jarg2) {
+  switch_file_interface *arg1 = (switch_file_interface *) 0 ;
+  switch_status_t (*arg2)(switch_file_handle_t *,switch_file_command_t) = (switch_status_t (*)(switch_file_handle_t *,switch_file_command_t)) 0 ;
+  
+  arg1 = (switch_file_interface *)jarg1; 
+  arg2 = (switch_status_t (*)(switch_file_handle_t *,switch_file_command_t))jarg2; 
+  if (arg1) (arg1)->file_command = arg2;
+}
+
+
+SWIGEXPORT void * SWIGSTDCALL CSharp_switch_file_interface_file_command_get(void * jarg1) {
+  void * jresult ;
+  switch_file_interface *arg1 = (switch_file_interface *) 0 ;
+  switch_status_t (*result)(switch_file_handle_t *,switch_file_command_t) = 0 ;
+  
+  arg1 = (switch_file_interface *)jarg1; 
+  result = (switch_status_t (*)(switch_file_handle_t *,switch_file_command_t)) ((arg1)->file_command);
+  jresult = (void *)result; 
+  return jresult;
+}
+
+
 SWIGEXPORT void SWIGSTDCALL CSharp_switch_file_interface_extens_set(void * jarg1, void * jarg2) {
   switch_file_interface *arg1 = (switch_file_interface *) 0 ;
   char **arg2 = (char **) 0 ;
@@ -25121,6 +25360,28 @@ SWIGEXPORT float SWIGSTDCALL CSharp_switch_mm_t_fps_get(void * jarg1) {
 }
 
 
+SWIGEXPORT void SWIGSTDCALL CSharp_switch_mm_t_source_fps_set(void * jarg1, float jarg2) {
+  switch_mm_s *arg1 = (switch_mm_s *) 0 ;
+  float arg2 ;
+  
+  arg1 = (switch_mm_s *)jarg1; 
+  arg2 = (float)jarg2; 
+  if (arg1) (arg1)->source_fps = arg2;
+}
+
+
+SWIGEXPORT float SWIGSTDCALL CSharp_switch_mm_t_source_fps_get(void * jarg1) {
+  float jresult ;
+  switch_mm_s *arg1 = (switch_mm_s *) 0 ;
+  float result;
+  
+  arg1 = (switch_mm_s *)jarg1; 
+  result = (float) ((arg1)->source_fps);
+  jresult = result; 
+  return jresult;
+}
+
+
 SWIGEXPORT void SWIGSTDCALL CSharp_switch_mm_t_vbuf_set(void * jarg1, int jarg2) {
   switch_mm_s *arg1 = (switch_mm_s *) 0 ;
   int arg2 ;
@@ -25143,6 +25404,28 @@ SWIGEXPORT int SWIGSTDCALL CSharp_switch_mm_t_vbuf_get(void * jarg1) {
 }
 
 
+SWIGEXPORT void SWIGSTDCALL CSharp_switch_mm_t_vprofile_set(void * jarg1, int jarg2) {
+  switch_mm_s *arg1 = (switch_mm_s *) 0 ;
+  switch_video_profile_t arg2 ;
+  
+  arg1 = (switch_mm_s *)jarg1; 
+  arg2 = (switch_video_profile_t)jarg2; 
+  if (arg1) (arg1)->vprofile = arg2;
+}
+
+
+SWIGEXPORT int SWIGSTDCALL CSharp_switch_mm_t_vprofile_get(void * jarg1) {
+  int jresult ;
+  switch_mm_s *arg1 = (switch_mm_s *) 0 ;
+  switch_video_profile_t result;
+  
+  arg1 = (switch_mm_s *)jarg1; 
+  result = (switch_video_profile_t) ((arg1)->vprofile);
+  jresult = result; 
+  return jresult;
+}
+
+
 SWIGEXPORT void SWIGSTDCALL CSharp_switch_mm_t_vencspd_set(void * jarg1, int jarg2) {
   switch_mm_s *arg1 = (switch_mm_s *) 0 ;
   switch_video_encode_speed_t arg2 ;
@@ -26308,6 +26591,28 @@ SWIGEXPORT void * SWIGSTDCALL CSharp_switch_file_handle_mm_get(void * jarg1) {
 }
 
 
+SWIGEXPORT void SWIGSTDCALL CSharp_switch_file_handle_flag_mutex_set(void * jarg1, void * jarg2) {
+  switch_file_handle *arg1 = (switch_file_handle *) 0 ;
+  switch_mutex_t *arg2 = (switch_mutex_t *) 0 ;
+  
+  arg1 = (switch_file_handle *)jarg1; 
+  arg2 = (switch_mutex_t *)jarg2; 
+  if (arg1) (arg1)->flag_mutex = arg2;
+}
+
+
+SWIGEXPORT void * SWIGSTDCALL CSharp_switch_file_handle_flag_mutex_get(void * jarg1) {
+  void * jresult ;
+  switch_file_handle *arg1 = (switch_file_handle *) 0 ;
+  switch_mutex_t *result = 0 ;
+  
+  arg1 = (switch_file_handle *)jarg1; 
+  result = (switch_mutex_t *) ((arg1)->flag_mutex);
+  jresult = (void *)result; 
+  return jresult;
+}
+
+
 SWIGEXPORT void * SWIGSTDCALL CSharp_new_switch_file_handle() {
   void * jresult ;
   switch_file_handle *result = 0 ;
@@ -35666,6 +35971,18 @@ SWIGEXPORT int SWIGSTDCALL CSharp_switch_event_bind(char * jarg1, int jarg2, cha
 }
 
 
+SWIGEXPORT int SWIGSTDCALL CSharp_switch_event_get_custom_events(void * jarg1) {
+  int jresult ;
+  switch_console_callback_match_t **arg1 = (switch_console_callback_match_t **) 0 ;
+  switch_status_t result;
+  
+  arg1 = (switch_console_callback_match_t **)jarg1; 
+  result = (switch_status_t)switch_event_get_custom_events(arg1);
+  jresult = result; 
+  return jresult;
+}
+
+
 SWIGEXPORT int SWIGSTDCALL CSharp_switch_event_bind_removable(char * jarg1, int jarg2, char * jarg3, void * jarg4, void * jarg5, void * jarg6) {
   int jresult ;
   char *arg1 = (char *) 0 ;
@@ -36074,20 +36391,20 @@ SWIGEXPORT void SWIGSTDCALL CSharp_switch_event_launch_dispatch_threads(unsigned
 }
 
 
-SWIGEXPORT unsigned long SWIGSTDCALL CSharp_switch_event_channel_broadcast(char * jarg1, void * jarg2, char * jarg3, unsigned long jarg4) {
-  unsigned long jresult ;
+SWIGEXPORT int SWIGSTDCALL CSharp_switch_event_channel_broadcast(char * jarg1, void * jarg2, char * jarg3, unsigned long jarg4) {
+  int jresult ;
   char *arg1 = (char *) 0 ;
   cJSON **arg2 = (cJSON **) 0 ;
   char *arg3 = (char *) 0 ;
   switch_event_channel_id_t arg4 ;
-  uint32_t result;
+  switch_status_t result;
   
   arg1 = (char *)jarg1; 
   arg2 = (cJSON **)jarg2; 
   arg3 = (char *)jarg3; 
   arg4 = (switch_event_channel_id_t)jarg4; 
-  result = (uint32_t)switch_event_channel_broadcast((char const *)arg1,arg2,(char const *)arg3,arg4);
-  jresult = (unsigned long)result; 
+  result = (switch_status_t)switch_event_channel_broadcast((char const *)arg1,arg2,(char const *)arg3,arg4);
+  jresult = result; 
   return jresult;
 }
 
@@ -37430,6 +37747,22 @@ SWIGEXPORT int SWIGSTDCALL CSharp_switch_ivr_parse_all_signal_data(void * jarg1)
 }
 
 
+SWIGEXPORT int SWIGSTDCALL CSharp_switch_ivr_parse_signal_data(void * jarg1, int jarg2, int jarg3) {
+  int jresult ;
+  switch_core_session_t *arg1 = (switch_core_session_t *) 0 ;
+  switch_bool_t arg2 ;
+  switch_bool_t arg3 ;
+  switch_status_t result;
+  
+  arg1 = (switch_core_session_t *)jarg1; 
+  arg2 = (switch_bool_t)jarg2; 
+  arg3 = (switch_bool_t)jarg3; 
+  result = (switch_status_t)switch_ivr_parse_signal_data(arg1,arg2,arg3);
+  jresult = result; 
+  return jresult;
+}
+
+
 SWIGEXPORT int SWIGSTDCALL CSharp_switch_ivr_parse_next_signal_data(void * jarg1) {
   int jresult ;
   switch_core_session_t *arg1 = (switch_core_session_t *) 0 ;
@@ -39774,8 +40107,8 @@ SWIGEXPORT void SWIGSTDCALL CSharp_switch_rtp_packet_t_body_set(void * jarg1, ch
   arg2 = (char *)jarg2; 
   {
     if(arg2) {
-      strncpy((char*)arg1->body, (const char *)arg2, 16384-1);
-      arg1->body[16384-1] = 0;
+      strncpy((char*)arg1->body, (const char *)arg2, 16384+4+sizeof(char *)-1);
+      arg1->body[16384+4+sizeof(char *)-1] = 0;
     } else {
       arg1->body[0] = 0;
     }
@@ -40816,6 +41149,14 @@ SWIGEXPORT int SWIGSTDCALL CSharp_switch_rtp_set_remote_address(void * jarg1, ch
 }
 
 
+SWIGEXPORT void SWIGSTDCALL CSharp_switch_rtp_reset_jb(void * jarg1) {
+  switch_rtp_t *arg1 = (switch_rtp_t *) 0 ;
+  
+  arg1 = (switch_rtp_t *)jarg1; 
+  switch_rtp_reset_jb(arg1);
+}
+
+
 SWIGEXPORT char * SWIGSTDCALL CSharp_switch_rtp_get_remote_host(void * jarg1) {
   char * jresult ;
   switch_rtp_t *arg1 = (switch_rtp_t *) 0 ;
diff --git a/src/mod/languages/mod_managed/managed/swig.cs b/src/mod/languages/mod_managed/managed/swig.cs
index f94de19823..26bb96f482 100644
--- a/src/mod/languages/mod_managed/managed/swig.cs
+++ b/src/mod/languages/mod_managed/managed/swig.cs
@@ -1211,6 +1211,23 @@ public class freeswitch {
     freeswitchPINVOKE.switch_regex_set_event_header_callback(var, val, SWIGTYPE_p_void.getCPtr(user_data));
   }
 
+  public static SWIGTYPE_p_void switch_must_malloc(uint _b) {
+    IntPtr cPtr = freeswitchPINVOKE.switch_must_malloc(_b);
+    SWIGTYPE_p_void ret = (cPtr == IntPtr.Zero) ? null : new SWIGTYPE_p_void(cPtr, false);
+    return ret;
+  }
+
+  public static SWIGTYPE_p_void switch_must_realloc(SWIGTYPE_p_void _b, uint _z) {
+    IntPtr cPtr = freeswitchPINVOKE.switch_must_realloc(SWIGTYPE_p_void.getCPtr(_b), _z);
+    SWIGTYPE_p_void ret = (cPtr == IntPtr.Zero) ? null : new SWIGTYPE_p_void(cPtr, false);
+    return ret;
+  }
+
+  public static string switch_must_strdup(string _s) {
+    string ret = freeswitchPINVOKE.switch_must_strdup(_s);
+    return ret;
+  }
+
   public static void switch_core_screen_size(SWIGTYPE_p_int x, SWIGTYPE_p_int y) {
     freeswitchPINVOKE.switch_core_screen_size(SWIGTYPE_p_int.getCPtr(x), SWIGTYPE_p_int.getCPtr(y));
   }
@@ -2143,13 +2160,15 @@ public class freeswitch {
     return ret;
   }
 
-  public static switch_status_t switch_core_hash_delete_locked(SWIGTYPE_p_switch_hashtable hash, string key, SWIGTYPE_p_switch_mutex_t mutex) {
-    switch_status_t ret = (switch_status_t)freeswitchPINVOKE.switch_core_hash_delete_locked(SWIGTYPE_p_switch_hashtable.getCPtr(hash), key, SWIGTYPE_p_switch_mutex_t.getCPtr(mutex));
+  public static SWIGTYPE_p_void switch_core_hash_delete_locked(SWIGTYPE_p_switch_hashtable hash, string key, SWIGTYPE_p_switch_mutex_t mutex) {
+    IntPtr cPtr = freeswitchPINVOKE.switch_core_hash_delete_locked(SWIGTYPE_p_switch_hashtable.getCPtr(hash), key, SWIGTYPE_p_switch_mutex_t.getCPtr(mutex));
+    SWIGTYPE_p_void ret = (cPtr == IntPtr.Zero) ? null : new SWIGTYPE_p_void(cPtr, false);
     return ret;
   }
 
-  public static switch_status_t switch_core_hash_delete_wrlock(SWIGTYPE_p_switch_hashtable hash, string key, SWIGTYPE_p_switch_thread_rwlock_t rwlock) {
-    switch_status_t ret = (switch_status_t)freeswitchPINVOKE.switch_core_hash_delete_wrlock(SWIGTYPE_p_switch_hashtable.getCPtr(hash), key, SWIGTYPE_p_switch_thread_rwlock_t.getCPtr(rwlock));
+  public static SWIGTYPE_p_void switch_core_hash_delete_wrlock(SWIGTYPE_p_switch_hashtable hash, string key, SWIGTYPE_p_switch_thread_rwlock_t rwlock) {
+    IntPtr cPtr = freeswitchPINVOKE.switch_core_hash_delete_wrlock(SWIGTYPE_p_switch_hashtable.getCPtr(hash), key, SWIGTYPE_p_switch_thread_rwlock_t.getCPtr(rwlock));
+    SWIGTYPE_p_void ret = (cPtr == IntPtr.Zero) ? null : new SWIGTYPE_p_void(cPtr, false);
     return ret;
   }
 
@@ -2487,13 +2506,18 @@ public class freeswitch {
     return ret;
   }
 
+  public static switch_status_t switch_core_file_command(switch_file_handle fh, switch_file_command_t command) {
+    switch_status_t ret = (switch_status_t)freeswitchPINVOKE.switch_core_file_command(switch_file_handle.getCPtr(fh), (int)command);
+    return ret;
+  }
+
   public static switch_status_t switch_core_file_truncate(switch_file_handle fh, long offset) {
     switch_status_t ret = (switch_status_t)freeswitchPINVOKE.switch_core_file_truncate(switch_file_handle.getCPtr(fh), offset);
     return ret;
   }
 
-  public static switch_bool_t switch_core_file_has_video(switch_file_handle fh) {
-    switch_bool_t ret = (switch_bool_t)freeswitchPINVOKE.switch_core_file_has_video(switch_file_handle.getCPtr(fh));
+  public static switch_bool_t switch_core_file_has_video(switch_file_handle fh, switch_bool_t CHECK_OPEN) {
+    switch_bool_t ret = (switch_bool_t)freeswitchPINVOKE.switch_core_file_has_video(switch_file_handle.getCPtr(fh), (int)CHECK_OPEN);
     return ret;
   }
 
@@ -2682,6 +2706,11 @@ public class freeswitch {
     return ret;
   }
 
+  public static int switch_core_set_process_privileges() {
+    int ret = freeswitchPINVOKE.switch_core_set_process_privileges();
+    return ret;
+  }
+
   public static int set_normal_priority() {
     int ret = freeswitchPINVOKE.set_normal_priority();
     return ret;
@@ -3335,6 +3364,10 @@ public class freeswitch {
     return ret;
   }
 
+  public static void switch_core_autobind_cpu() {
+    freeswitchPINVOKE.switch_core_autobind_cpu();
+  }
+
   public static switch_status_t switch_loadable_module_init(switch_bool_t autoload) {
     switch_status_t ret = (switch_status_t)freeswitchPINVOKE.switch_loadable_module_init((int)autoload);
     return ret;
@@ -4074,7 +4107,7 @@ public class freeswitch {
     return ret;
   }
 
-  public static int switch_calc_bitrate(int w, int h, int quality, int fps) {
+  public static int switch_calc_bitrate(int w, int h, int quality, double fps) {
     int ret = freeswitchPINVOKE.switch_calc_bitrate(w, h, quality, fps);
     return ret;
   }
@@ -5257,6 +5290,11 @@ public class freeswitch {
     return ret;
   }
 
+  public static switch_status_t switch_event_get_custom_events(SWIGTYPE_p_p_switch_console_callback_match matches) {
+    switch_status_t ret = (switch_status_t)freeswitchPINVOKE.switch_event_get_custom_events(SWIGTYPE_p_p_switch_console_callback_match.getCPtr(matches));
+    return ret;
+  }
+
   public static switch_status_t switch_event_bind_removable(string id, switch_event_types_t arg1, string subclass_name, SWIGTYPE_p_f_p_switch_event__void callback, SWIGTYPE_p_void user_data, SWIGTYPE_p_p_switch_event_node node) {
     switch_status_t ret = (switch_status_t)freeswitchPINVOKE.switch_event_bind_removable(id, (int)arg1, subclass_name, SWIGTYPE_p_f_p_switch_event__void.getCPtr(callback), SWIGTYPE_p_void.getCPtr(user_data), SWIGTYPE_p_p_switch_event_node.getCPtr(node));
     return ret;
@@ -5384,8 +5422,8 @@ public class freeswitch {
     freeswitchPINVOKE.switch_event_launch_dispatch_threads(max);
   }
 
-  public static uint switch_event_channel_broadcast(string event_channel, SWIGTYPE_p_p_cJSON json, string key, uint id) {
-    uint ret = freeswitchPINVOKE.switch_event_channel_broadcast(event_channel, SWIGTYPE_p_p_cJSON.getCPtr(json), key, id);
+  public static switch_status_t switch_event_channel_broadcast(string event_channel, SWIGTYPE_p_p_cJSON json, string key, uint id) {
+    switch_status_t ret = (switch_status_t)freeswitchPINVOKE.switch_event_channel_broadcast(event_channel, SWIGTYPE_p_p_cJSON.getCPtr(json), key, id);
     return ret;
   }
 
@@ -5615,6 +5653,11 @@ public class freeswitch {
     return ret;
   }
 
+  public static switch_status_t switch_ivr_parse_signal_data(SWIGTYPE_p_switch_core_session session, switch_bool_t all, switch_bool_t only_session_thread) {
+    switch_status_t ret = (switch_status_t)freeswitchPINVOKE.switch_ivr_parse_signal_data(SWIGTYPE_p_switch_core_session.getCPtr(session), (int)all, (int)only_session_thread);
+    return ret;
+  }
+
   public static switch_status_t switch_ivr_parse_next_signal_data(SWIGTYPE_p_switch_core_session session) {
     switch_status_t ret = (switch_status_t)freeswitchPINVOKE.switch_ivr_parse_next_signal_data(SWIGTYPE_p_switch_core_session.getCPtr(session));
     return ret;
@@ -6377,6 +6420,10 @@ public class freeswitch {
     return ret;
   }
 
+  public static void switch_rtp_reset_jb(SWIGTYPE_p_switch_rtp rtp_session) {
+    freeswitchPINVOKE.switch_rtp_reset_jb(SWIGTYPE_p_switch_rtp.getCPtr(rtp_session));
+  }
+
   public static string switch_rtp_get_remote_host(SWIGTYPE_p_switch_rtp rtp_session) {
     string ret = freeswitchPINVOKE.switch_rtp_get_remote_host(SWIGTYPE_p_switch_rtp.getCPtr(rtp_session));
     return ret;
@@ -7391,7 +7438,9 @@ public class freeswitch {
   public static readonly string SWITCH_LOCAL_VIDEO_PORT_VARIABLE = freeswitchPINVOKE.SWITCH_LOCAL_VIDEO_PORT_VARIABLE_get();
   public static readonly string SWITCH_HANGUP_AFTER_BRIDGE_VARIABLE = freeswitchPINVOKE.SWITCH_HANGUP_AFTER_BRIDGE_VARIABLE_get();
   public static readonly string SWITCH_PARK_AFTER_BRIDGE_VARIABLE = freeswitchPINVOKE.SWITCH_PARK_AFTER_BRIDGE_VARIABLE_get();
+  public static readonly string SWITCH_PARK_AFTER_EARLY_BRIDGE_VARIABLE = freeswitchPINVOKE.SWITCH_PARK_AFTER_EARLY_BRIDGE_VARIABLE_get();
   public static readonly string SWITCH_TRANSFER_AFTER_BRIDGE_VARIABLE = freeswitchPINVOKE.SWITCH_TRANSFER_AFTER_BRIDGE_VARIABLE_get();
+  public static readonly string SWITCH_TRANSFER_AFTER_EARLY_BRIDGE_VARIABLE = freeswitchPINVOKE.SWITCH_TRANSFER_AFTER_EARLY_BRIDGE_VARIABLE_get();
   public static readonly string SWITCH_EXEC_AFTER_BRIDGE_APP_VARIABLE = freeswitchPINVOKE.SWITCH_EXEC_AFTER_BRIDGE_APP_VARIABLE_get();
   public static readonly string SWITCH_EXEC_AFTER_BRIDGE_ARG_VARIABLE = freeswitchPINVOKE.SWITCH_EXEC_AFTER_BRIDGE_ARG_VARIABLE_get();
   public static readonly string SWITCH_MAX_FORWARDS_VARIABLE = freeswitchPINVOKE.SWITCH_MAX_FORWARDS_VARIABLE_get();
@@ -8127,9 +8176,15 @@ class freeswitchPINVOKE {
   [DllImport("mod_managed", EntryPoint="CSharp_SWITCH_PARK_AFTER_BRIDGE_VARIABLE_get")]
   public static extern string SWITCH_PARK_AFTER_BRIDGE_VARIABLE_get();
 
+  [DllImport("mod_managed", EntryPoint="CSharp_SWITCH_PARK_AFTER_EARLY_BRIDGE_VARIABLE_get")]
+  public static extern string SWITCH_PARK_AFTER_EARLY_BRIDGE_VARIABLE_get();
+
   [DllImport("mod_managed", EntryPoint="CSharp_SWITCH_TRANSFER_AFTER_BRIDGE_VARIABLE_get")]
   public static extern string SWITCH_TRANSFER_AFTER_BRIDGE_VARIABLE_get();
 
+  [DllImport("mod_managed", EntryPoint="CSharp_SWITCH_TRANSFER_AFTER_EARLY_BRIDGE_VARIABLE_get")]
+  public static extern string SWITCH_TRANSFER_AFTER_EARLY_BRIDGE_VARIABLE_get();
+
   [DllImport("mod_managed", EntryPoint="CSharp_SWITCH_EXEC_AFTER_BRIDGE_APP_VARIABLE_get")]
   public static extern string SWITCH_EXEC_AFTER_BRIDGE_APP_VARIABLE_get();
 
@@ -8487,6 +8542,30 @@ class freeswitchPINVOKE {
   [DllImport("mod_managed", EntryPoint="CSharp_SWITCH_MAX_MANAGEMENT_BUFFER_LEN_get")]
   public static extern int SWITCH_MAX_MANAGEMENT_BUFFER_LEN_get();
 
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_error_period_t_start_set")]
+  public static extern void switch_error_period_t_start_set(HandleRef jarg1, long jarg2);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_error_period_t_start_get")]
+  public static extern long switch_error_period_t_start_get(HandleRef jarg1);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_error_period_t_stop_set")]
+  public static extern void switch_error_period_t_stop_set(HandleRef jarg1, long jarg2);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_error_period_t_stop_get")]
+  public static extern long switch_error_period_t_stop_get(HandleRef jarg1);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_error_period_t_next_set")]
+  public static extern void switch_error_period_t_next_set(HandleRef jarg1, HandleRef jarg2);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_error_period_t_next_get")]
+  public static extern IntPtr switch_error_period_t_next_get(HandleRef jarg1);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_new_switch_error_period_t")]
+  public static extern IntPtr new_switch_error_period_t();
+
+  [DllImport("mod_managed", EntryPoint="CSharp_delete_switch_error_period_t")]
+  public static extern void delete_switch_error_period_t(HandleRef jarg1);
+
   [DllImport("mod_managed", EntryPoint="CSharp_switch_rtp_numbers_t_raw_bytes_set")]
   public static extern void switch_rtp_numbers_t_raw_bytes_set(HandleRef jarg1, HandleRef jarg2);
 
@@ -8667,6 +8746,12 @@ class freeswitchPINVOKE {
   [DllImport("mod_managed", EntryPoint="CSharp_switch_rtp_numbers_t_mos_get")]
   public static extern double switch_rtp_numbers_t_mos_get(HandleRef jarg1);
 
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_rtp_numbers_t_error_log_set")]
+  public static extern void switch_rtp_numbers_t_error_log_set(HandleRef jarg1, HandleRef jarg2);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_rtp_numbers_t_error_log_get")]
+  public static extern IntPtr switch_rtp_numbers_t_error_log_get(HandleRef jarg1);
+
   [DllImport("mod_managed", EntryPoint="CSharp_new_switch_rtp_numbers_t")]
   public static extern IntPtr new_switch_rtp_numbers_t();
 
@@ -9097,16 +9182,22 @@ class freeswitchPINVOKE {
   public static extern void delete_switch_t38_options_t(HandleRef jarg1);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_vid_params_t_width_set")]
-  public static extern void switch_vid_params_t_width_set(HandleRef jarg1, int jarg2);
+  public static extern void switch_vid_params_t_width_set(HandleRef jarg1, uint jarg2);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_vid_params_t_width_get")]
-  public static extern int switch_vid_params_t_width_get(HandleRef jarg1);
+  public static extern uint switch_vid_params_t_width_get(HandleRef jarg1);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_vid_params_t_height_set")]
-  public static extern void switch_vid_params_t_height_set(HandleRef jarg1, int jarg2);
+  public static extern void switch_vid_params_t_height_set(HandleRef jarg1, uint jarg2);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_vid_params_t_height_get")]
-  public static extern int switch_vid_params_t_height_get(HandleRef jarg1);
+  public static extern uint switch_vid_params_t_height_get(HandleRef jarg1);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_vid_params_t_fps_set")]
+  public static extern void switch_vid_params_t_fps_set(HandleRef jarg1, uint jarg2);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_vid_params_t_fps_get")]
+  public static extern uint switch_vid_params_t_fps_get(HandleRef jarg1);
 
   [DllImport("mod_managed", EntryPoint="CSharp_new_switch_vid_params_t")]
   public static extern IntPtr new_switch_vid_params_t();
@@ -10368,6 +10459,15 @@ class freeswitchPINVOKE {
   [DllImport("mod_managed", EntryPoint="CSharp_delete_switch_core_thread_session")]
   public static extern void delete_switch_core_thread_session(HandleRef jarg1);
 
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_must_malloc")]
+  public static extern IntPtr switch_must_malloc(uint jarg1);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_must_realloc")]
+  public static extern IntPtr switch_must_realloc(HandleRef jarg1, uint jarg2);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_must_strdup")]
+  public static extern string switch_must_strdup(string jarg1);
+
   [DllImport("mod_managed", EntryPoint="CSharp_switch_core_screen_size")]
   public static extern void switch_core_screen_size(HandleRef jarg1, HandleRef jarg2);
 
@@ -10933,10 +11033,10 @@ class freeswitchPINVOKE {
   public static extern IntPtr switch_core_hash_delete(HandleRef jarg1, string jarg2);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_core_hash_delete_locked")]
-  public static extern int switch_core_hash_delete_locked(HandleRef jarg1, string jarg2, HandleRef jarg3);
+  public static extern IntPtr switch_core_hash_delete_locked(HandleRef jarg1, string jarg2, HandleRef jarg3);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_core_hash_delete_wrlock")]
-  public static extern int switch_core_hash_delete_wrlock(HandleRef jarg1, string jarg2, HandleRef jarg3);
+  public static extern IntPtr switch_core_hash_delete_wrlock(HandleRef jarg1, string jarg2, HandleRef jarg3);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_core_hash_delete_multi")]
   public static extern int switch_core_hash_delete_multi(HandleRef jarg1, HandleRef jarg2, HandleRef jarg3);
@@ -11136,11 +11236,14 @@ class freeswitchPINVOKE {
   [DllImport("mod_managed", EntryPoint="CSharp_switch_core_file_close")]
   public static extern int switch_core_file_close(HandleRef jarg1);
 
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_core_file_command")]
+  public static extern int switch_core_file_command(HandleRef jarg1, int jarg2);
+
   [DllImport("mod_managed", EntryPoint="CSharp_switch_core_file_truncate")]
   public static extern int switch_core_file_truncate(HandleRef jarg1, long jarg2);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_core_file_has_video")]
-  public static extern int switch_core_file_has_video(HandleRef jarg1);
+  public static extern int switch_core_file_has_video(HandleRef jarg1, int jarg2);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_core_speech_open")]
   public static extern int switch_core_speech_open(HandleRef jarg1, string jarg2, string jarg3, uint jarg4, uint jarg5, uint jarg6, HandleRef jarg7, HandleRef jarg8);
@@ -11256,6 +11359,9 @@ class freeswitchPINVOKE {
   [DllImport("mod_managed", EntryPoint="CSharp_switch_core_management_exec")]
   public static extern int switch_core_management_exec(string jarg1, int jarg2, string jarg3, HandleRef jarg4);
 
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_core_set_process_privileges")]
+  public static extern int switch_core_set_process_privileges();
+
   [DllImport("mod_managed", EntryPoint="CSharp_set_normal_priority")]
   public static extern int set_normal_priority();
 
@@ -11766,6 +11872,9 @@ class freeswitchPINVOKE {
   [DllImport("mod_managed", EntryPoint="CSharp_switch_version_full_human")]
   public static extern string switch_version_full_human();
 
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_core_autobind_cpu")]
+  public static extern void switch_core_autobind_cpu();
+
   [DllImport("mod_managed", EntryPoint="CSharp_switch_loadable_module_interface_module_name_set")]
   public static extern void switch_loadable_module_interface_module_name_set(HandleRef jarg1, string jarg2);
 
@@ -12355,7 +12464,7 @@ class freeswitchPINVOKE {
   public static extern string switch_util_quote_shell_arg_pool(string jarg1, HandleRef jarg2);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_calc_bitrate")]
-  public static extern int switch_calc_bitrate(int jarg1, int jarg2, int jarg3, int jarg4);
+  public static extern int switch_calc_bitrate(int jarg1, int jarg2, int jarg3, double jarg4);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_parse_bandwidth_string")]
   public static extern int switch_parse_bandwidth_string(string jarg1);
@@ -13698,6 +13807,12 @@ class freeswitchPINVOKE {
   [DllImport("mod_managed", EntryPoint="CSharp_switch_file_interface_file_get_string_get")]
   public static extern IntPtr switch_file_interface_file_get_string_get(HandleRef jarg1);
 
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_file_interface_file_command_set")]
+  public static extern void switch_file_interface_file_command_set(HandleRef jarg1, HandleRef jarg2);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_file_interface_file_command_get")]
+  public static extern IntPtr switch_file_interface_file_command_get(HandleRef jarg1);
+
   [DllImport("mod_managed", EntryPoint="CSharp_switch_file_interface_extens_set")]
   public static extern void switch_file_interface_extens_set(HandleRef jarg1, ref string jarg2);
 
@@ -13788,12 +13903,24 @@ class freeswitchPINVOKE {
   [DllImport("mod_managed", EntryPoint="CSharp_switch_mm_t_fps_get")]
   public static extern float switch_mm_t_fps_get(HandleRef jarg1);
 
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_mm_t_source_fps_set")]
+  public static extern void switch_mm_t_source_fps_set(HandleRef jarg1, float jarg2);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_mm_t_source_fps_get")]
+  public static extern float switch_mm_t_source_fps_get(HandleRef jarg1);
+
   [DllImport("mod_managed", EntryPoint="CSharp_switch_mm_t_vbuf_set")]
   public static extern void switch_mm_t_vbuf_set(HandleRef jarg1, int jarg2);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_mm_t_vbuf_get")]
   public static extern int switch_mm_t_vbuf_get(HandleRef jarg1);
 
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_mm_t_vprofile_set")]
+  public static extern void switch_mm_t_vprofile_set(HandleRef jarg1, int jarg2);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_mm_t_vprofile_get")]
+  public static extern int switch_mm_t_vprofile_get(HandleRef jarg1);
+
   [DllImport("mod_managed", EntryPoint="CSharp_switch_mm_t_vencspd_set")]
   public static extern void switch_mm_t_vencspd_set(HandleRef jarg1, int jarg2);
 
@@ -14088,6 +14215,12 @@ class freeswitchPINVOKE {
   [DllImport("mod_managed", EntryPoint="CSharp_switch_file_handle_mm_get")]
   public static extern IntPtr switch_file_handle_mm_get(HandleRef jarg1);
 
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_file_handle_flag_mutex_set")]
+  public static extern void switch_file_handle_flag_mutex_set(HandleRef jarg1, HandleRef jarg2);
+
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_file_handle_flag_mutex_get")]
+  public static extern IntPtr switch_file_handle_flag_mutex_get(HandleRef jarg1);
+
   [DllImport("mod_managed", EntryPoint="CSharp_new_switch_file_handle")]
   public static extern IntPtr new_switch_file_handle();
 
@@ -16422,6 +16555,9 @@ class freeswitchPINVOKE {
   [DllImport("mod_managed", EntryPoint="CSharp_switch_event_bind")]
   public static extern int switch_event_bind(string jarg1, int jarg2, string jarg3, HandleRef jarg4, HandleRef jarg5);
 
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_event_get_custom_events")]
+  public static extern int switch_event_get_custom_events(HandleRef jarg1);
+
   [DllImport("mod_managed", EntryPoint="CSharp_switch_event_bind_removable")]
   public static extern int switch_event_bind_removable(string jarg1, int jarg2, string jarg3, HandleRef jarg4, HandleRef jarg5, HandleRef jarg6);
 
@@ -16501,7 +16637,7 @@ class freeswitchPINVOKE {
   public static extern void switch_event_launch_dispatch_threads(uint jarg1);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_event_channel_broadcast")]
-  public static extern uint switch_event_channel_broadcast(string jarg1, HandleRef jarg2, string jarg3, uint jarg4);
+  public static extern int switch_event_channel_broadcast(string jarg1, HandleRef jarg2, string jarg3, uint jarg4);
 
   [DllImport("mod_managed", EntryPoint="CSharp_switch_event_channel_unbind")]
   public static extern uint switch_event_channel_unbind(string jarg1, HandleRef jarg2);
@@ -16821,6 +16957,9 @@ class freeswitchPINVOKE {
   [DllImport("mod_managed", EntryPoint="CSharp_switch_ivr_parse_all_signal_data")]
   public static extern int switch_ivr_parse_all_signal_data(HandleRef jarg1);
 
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_ivr_parse_signal_data")]
+  public static extern int switch_ivr_parse_signal_data(HandleRef jarg1, int jarg2, int jarg3);
+
   [DllImport("mod_managed", EntryPoint="CSharp_switch_ivr_parse_next_signal_data")]
   public static extern int switch_ivr_parse_next_signal_data(HandleRef jarg1);
 
@@ -17496,6 +17635,9 @@ class freeswitchPINVOKE {
   [DllImport("mod_managed", EntryPoint="CSharp_switch_rtp_set_remote_address")]
   public static extern int switch_rtp_set_remote_address(HandleRef jarg1, string jarg2, ushort jarg3, ushort jarg4, int jarg5, ref string jarg6);
 
+  [DllImport("mod_managed", EntryPoint="CSharp_switch_rtp_reset_jb")]
+  public static extern void switch_rtp_reset_jb(HandleRef jarg1);
+
   [DllImport("mod_managed", EntryPoint="CSharp_switch_rtp_get_remote_host")]
   public static extern string switch_rtp_get_remote_host(HandleRef jarg1);
 
@@ -22172,6 +22314,36 @@ namespace FreeSWITCH.Native {
 using System;
 using System.Runtime.InteropServices;
 
+public class SWIGTYPE_p_f_p_switch_file_handle_enum_switch_file_command_t__switch_status_t {
+  private HandleRef swigCPtr;
+
+  internal SWIGTYPE_p_f_p_switch_file_handle_enum_switch_file_command_t__switch_status_t(IntPtr cPtr, bool futureUse) {
+    swigCPtr = new HandleRef(this, cPtr);
+  }
+
+  protected SWIGTYPE_p_f_p_switch_file_handle_enum_switch_file_command_t__switch_status_t() {
+    swigCPtr = new HandleRef(null, IntPtr.Zero);
+  }
+
+  internal static HandleRef getCPtr(SWIGTYPE_p_f_p_switch_file_handle_enum_switch_file_command_t__switch_status_t obj) {
+    return (obj == null) ? new HandleRef(null, IntPtr.Zero) : obj.swigCPtr;
+  }
+}
+
+}
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 2.0.12
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+namespace FreeSWITCH.Native {
+
+using System;
+using System.Runtime.InteropServices;
+
 public class SWIGTYPE_p_f_p_switch_file_handle_long_long__switch_status_t {
   private HandleRef swigCPtr;
 
@@ -29181,6 +29353,7 @@ public enum switch_channel_flag_t {
   CF_VIDEO_BITRATE_UNMANAGABLE,
   CF_VIDEO_ECHO,
   CF_VIDEO_BLANK,
+  CF_VIDEO_WRITING,
   CF_SLA_INTERCEPT,
   CF_VIDEO_BREAK,
   CF_AUDIO_PAUSE,
@@ -33304,6 +33477,86 @@ namespace FreeSWITCH.Native {
 using System;
 using System.Runtime.InteropServices;
 
+public class switch_error_period_t : IDisposable {
+  private HandleRef swigCPtr;
+  protected bool swigCMemOwn;
+
+  internal switch_error_period_t(IntPtr cPtr, bool cMemoryOwn) {
+    swigCMemOwn = cMemoryOwn;
+    swigCPtr = new HandleRef(this, cPtr);
+  }
+
+  internal static HandleRef getCPtr(switch_error_period_t obj) {
+    return (obj == null) ? new HandleRef(null, IntPtr.Zero) : obj.swigCPtr;
+  }
+
+  ~switch_error_period_t() {
+    Dispose();
+  }
+
+  public virtual void Dispose() {
+    lock(this) {
+      if (swigCPtr.Handle != IntPtr.Zero) {
+        if (swigCMemOwn) {
+          swigCMemOwn = false;
+          freeswitchPINVOKE.delete_switch_error_period_t(swigCPtr);
+        }
+        swigCPtr = new HandleRef(null, IntPtr.Zero);
+      }
+      GC.SuppressFinalize(this);
+    }
+  }
+
+  public long start {
+    set {
+      freeswitchPINVOKE.switch_error_period_t_start_set(swigCPtr, value);
+    } 
+    get {
+      long ret = freeswitchPINVOKE.switch_error_period_t_start_get(swigCPtr);
+      return ret;
+    } 
+  }
+
+  public long stop {
+    set {
+      freeswitchPINVOKE.switch_error_period_t_stop_set(swigCPtr, value);
+    } 
+    get {
+      long ret = freeswitchPINVOKE.switch_error_period_t_stop_get(swigCPtr);
+      return ret;
+    } 
+  }
+
+  public switch_error_period_t next {
+    set {
+      freeswitchPINVOKE.switch_error_period_t_next_set(swigCPtr, switch_error_period_t.getCPtr(value));
+    } 
+    get {
+      IntPtr cPtr = freeswitchPINVOKE.switch_error_period_t_next_get(swigCPtr);
+      switch_error_period_t ret = (cPtr == IntPtr.Zero) ? null : new switch_error_period_t(cPtr, false);
+      return ret;
+    } 
+  }
+
+  public switch_error_period_t() : this(freeswitchPINVOKE.new_switch_error_period_t(), true) {
+  }
+
+}
+
+}
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 2.0.12
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+namespace FreeSWITCH.Native {
+
+using System;
+using System.Runtime.InteropServices;
+
 public class switch_event : IDisposable {
   private HandleRef swigCPtr;
   protected bool swigCMemOwn;
@@ -33705,6 +33958,21 @@ public enum switch_event_types_t {
 
 namespace FreeSWITCH.Native {
 
+public enum switch_file_command_t {
+  SCFC_FLUSH_AUDIO
+}
+
+}
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 2.0.12
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+namespace FreeSWITCH.Native {
+
 [System.Flags] public enum switch_file_flag_enum_t {
   SWITCH_FILE_FLAG_READ = (1 << 0),
   SWITCH_FILE_FLAG_WRITE = (1 << 1),
@@ -33725,7 +33993,8 @@ namespace FreeSWITCH.Native {
   SWITCH_FILE_WRITE_OVER = (1 << 16),
   SWITCH_FILE_NOMUX = (1 << 17),
   SWITCH_FILE_BREAK_ON_CHANGE = (1 << 18),
-  SWITCH_FILE_FLAG_VIDEO = (1 << 19)
+  SWITCH_FILE_FLAG_VIDEO = (1 << 19),
+  SWITCH_FILE_FLAG_VIDEO_EOF = (1 << 20)
 }
 
 }
@@ -34265,6 +34534,17 @@ public class switch_file_handle : IDisposable {
     } 
   }
 
+  public SWIGTYPE_p_switch_mutex_t flag_mutex {
+    set {
+      freeswitchPINVOKE.switch_file_handle_flag_mutex_set(swigCPtr, SWIGTYPE_p_switch_mutex_t.getCPtr(value));
+    } 
+    get {
+      IntPtr cPtr = freeswitchPINVOKE.switch_file_handle_flag_mutex_get(swigCPtr);
+      SWIGTYPE_p_switch_mutex_t ret = (cPtr == IntPtr.Zero) ? null : new SWIGTYPE_p_switch_mutex_t(cPtr, false);
+      return ret;
+    } 
+  }
+
   public switch_file_handle() : this(freeswitchPINVOKE.new_switch_file_handle(), true) {
   }
 
@@ -34434,6 +34714,17 @@ public class switch_file_interface : IDisposable {
     } 
   }
 
+  public SWIGTYPE_p_f_p_switch_file_handle_enum_switch_file_command_t__switch_status_t file_command {
+    set {
+      freeswitchPINVOKE.switch_file_interface_file_command_set(swigCPtr, SWIGTYPE_p_f_p_switch_file_handle_enum_switch_file_command_t__switch_status_t.getCPtr(value));
+    } 
+    get {
+      IntPtr cPtr = freeswitchPINVOKE.switch_file_interface_file_command_get(swigCPtr);
+      SWIGTYPE_p_f_p_switch_file_handle_enum_switch_file_command_t__switch_status_t ret = (cPtr == IntPtr.Zero) ? null : new SWIGTYPE_p_f_p_switch_file_handle_enum_switch_file_command_t__switch_status_t(cPtr, false);
+      return ret;
+    } 
+  }
+
   public string extens {
   set { freeswitchPINVOKE.switch_file_interface_extens_set(swigCPtr, ref value); }
 
@@ -38031,6 +38322,16 @@ public class switch_mm_t : IDisposable {
     } 
   }
 
+  public float source_fps {
+    set {
+      freeswitchPINVOKE.switch_mm_t_source_fps_set(swigCPtr, value);
+    } 
+    get {
+      float ret = freeswitchPINVOKE.switch_mm_t_source_fps_get(swigCPtr);
+      return ret;
+    } 
+  }
+
   public int vbuf {
     set {
       freeswitchPINVOKE.switch_mm_t_vbuf_set(swigCPtr, value);
@@ -38041,6 +38342,16 @@ public class switch_mm_t : IDisposable {
     } 
   }
 
+  public switch_video_profile_t vprofile {
+    set {
+      freeswitchPINVOKE.switch_mm_t_vprofile_set(swigCPtr, (int)value);
+    } 
+    get {
+      switch_video_profile_t ret = (switch_video_profile_t)freeswitchPINVOKE.switch_mm_t_vprofile_get(swigCPtr);
+      return ret;
+    } 
+  }
+
   public switch_video_encode_speed_t vencspd {
     set {
       freeswitchPINVOKE.switch_mm_t_vencspd_set(swigCPtr, (int)value);
@@ -38981,6 +39292,7 @@ public enum switch_rtp_flag_t {
   SWITCH_RTP_FLAG_MUTE,
   SWITCH_RTP_FLAG_NACK,
   SWITCH_RTP_FLAG_TMMBR,
+  SWITCH_RTP_FLAG_GEN_TS_DELTA,
   SWITCH_RTP_FLAG_INVALID
 }
 
@@ -39580,6 +39892,17 @@ public class switch_rtp_numbers_t : IDisposable {
     } 
   }
 
+  public switch_error_period_t error_log {
+    set {
+      freeswitchPINVOKE.switch_rtp_numbers_t_error_log_set(swigCPtr, switch_error_period_t.getCPtr(value));
+    } 
+    get {
+      IntPtr cPtr = freeswitchPINVOKE.switch_rtp_numbers_t_error_log_get(swigCPtr);
+      switch_error_period_t ret = (cPtr == IntPtr.Zero) ? null : new switch_error_period_t(cPtr, false);
+      return ret;
+    } 
+  }
+
   public switch_rtp_numbers_t() : this(freeswitchPINVOKE.new_switch_rtp_numbers_t(), true) {
   }
 
@@ -41412,6 +41735,7 @@ public enum switch_status_t {
   SWITCH_STATUS_CONTINUE,
   SWITCH_STATUS_TERM,
   SWITCH_STATUS_NOT_INITALIZED,
+  SWITCH_STATUS_TOO_LATE,
   SWITCH_STATUS_XBREAK = 35,
   SWITCH_STATUS_WINBREAK = 730035
 }
@@ -42649,10 +42973,27 @@ public class switch_video_codec_settings : IDisposable {
 namespace FreeSWITCH.Native {
 
 public enum switch_video_encode_speed_t {
-  SWITCH_VIDEO_ENCODE_SPEED_DEFAULT,
-  SWITCH_VIDEO_ENCODE_SPEED_SLOW,
+  SWITCH_VIDEO_ENCODE_SPEED_DEFAULT = 0,
+  SWITCH_VIDEO_ENCODE_SPEED_FAST = 0,
   SWITCH_VIDEO_ENCODE_SPEED_MEDIUM,
-  SWITCH_VIDEO_ENCODE_SPEED_FAST
+  SWITCH_VIDEO_ENCODE_SPEED_SLOW
+}
+
+}
+/* ----------------------------------------------------------------------------
+ * This file was automatically generated by SWIG (http://www.swig.org).
+ * Version 2.0.12
+ *
+ * Do not make changes to this file unless you know what you are doing--modify
+ * the SWIG interface file instead.
+ * ----------------------------------------------------------------------------- */
+
+namespace FreeSWITCH.Native {
+
+public enum switch_video_profile_t {
+  SWITCH_VIDEO_PROFILE_BASELINE,
+  SWITCH_VIDEO_PROFILE_MAIN,
+  SWITCH_VIDEO_PROFILE_HIGH
 }
 
 }
@@ -42716,22 +43057,32 @@ public class switch_vid_params_t : IDisposable {
     }
   }
 
-  public int width {
+  public uint width {
     set {
       freeswitchPINVOKE.switch_vid_params_t_width_set(swigCPtr, value);
     } 
     get {
-      int ret = freeswitchPINVOKE.switch_vid_params_t_width_get(swigCPtr);
+      uint ret = freeswitchPINVOKE.switch_vid_params_t_width_get(swigCPtr);
       return ret;
     } 
   }
 
-  public int height {
+  public uint height {
     set {
       freeswitchPINVOKE.switch_vid_params_t_height_set(swigCPtr, value);
     } 
     get {
-      int ret = freeswitchPINVOKE.switch_vid_params_t_height_get(swigCPtr);
+      uint ret = freeswitchPINVOKE.switch_vid_params_t_height_get(swigCPtr);
+      return ret;
+    } 
+  }
+
+  public uint fps {
+    set {
+      freeswitchPINVOKE.switch_vid_params_t_fps_set(swigCPtr, value);
+    } 
+    get {
+      uint ret = freeswitchPINVOKE.switch_vid_params_t_fps_get(swigCPtr);
       return ret;
     } 
   }
diff --git a/src/mod/languages/mod_v8/src/fssession.cpp b/src/mod/languages/mod_v8/src/fssession.cpp
index 260e529696..bd1edd88b1 100644
--- a/src/mod/languages/mod_v8/src/fssession.cpp
+++ b/src/mod/languages/mod_v8/src/fssession.cpp
@@ -347,9 +347,9 @@ switch_status_t FSSession::StreamInputCallback(switch_core_session_t *session, v
 			return SWITCH_STATUS_FALSE;
 		} else if (!strcasecmp(ret, "pause")) {
 			if (switch_test_flag(fh, SWITCH_FILE_PAUSE)) {
-				switch_clear_flag(fh, SWITCH_FILE_PAUSE);
+				switch_clear_flag_locked(fh, SWITCH_FILE_PAUSE);
 			} else {
-				switch_set_flag(fh, SWITCH_FILE_PAUSE);
+				switch_set_flag_locked(fh, SWITCH_FILE_PAUSE);
 			}
 			return SWITCH_STATUS_SUCCESS;
 		} else if (!strcasecmp(ret, "truncate")) {
@@ -416,9 +416,9 @@ switch_status_t FSSession::RecordInputCallback(switch_core_session_t *session, v
 
 		if (!strcasecmp(ret, "pause")) {
 			if (switch_test_flag(fh, SWITCH_FILE_PAUSE)) {
-				switch_clear_flag(fh, SWITCH_FILE_PAUSE);
+				switch_clear_flag_locked(fh, SWITCH_FILE_PAUSE);
 			} else {
-				switch_set_flag(fh, SWITCH_FILE_PAUSE);
+				switch_set_flag_locked(fh, SWITCH_FILE_PAUSE);
 			}
 			return SWITCH_STATUS_SUCCESS;
 		} else if (!strcasecmp(ret, "restart")) {
diff --git a/src/mod/say/mod_say_de/mod_say_de.c b/src/mod/say/mod_say_de/mod_say_de.c
index 7b4aa75457..09ba7f77e3 100644
--- a/src/mod/say/mod_say_de/mod_say_de.c
+++ b/src/mod/say/mod_say_de/mod_say_de.c
@@ -87,7 +87,7 @@ static switch_status_t play_group(switch_say_method_t method, switch_say_gender_
 	if (a) {
 		/*german nominativ for "one" in numbers like 21, 171, 4591 is flexed("ein" instead of "eins"), 2-9 are not*/
 		if ( a == 1 ) {
-			say_file("digits/s-1.wav");
+			say_file("digits/1_n.wav");
 		} else {
 			say_file("digits/%d.wav", a);
 		}
@@ -99,7 +99,7 @@ static switch_status_t play_group(switch_say_method_t method, switch_say_gender_
 			/*german nominativ for "one" in numbers like 21, 171, 4591 is flexed, 2-9 are not*/
 			if (c > 0) {
 				if ( c == 1 ) {
-					say_file("digits/s-1.wav");
+					say_file("digits/1_n.wav");
 				} else {
 					say_file("digits/%d.wav", c);
 				} 
@@ -145,9 +145,9 @@ static switch_status_t play_group(switch_say_method_t method, switch_say_gender_
 			/*"one" used as an article is feminine or masculine in german, e.g. voicemail-message is feminine
 			only applies to the likes of 1, 101, 1001 etc.*/
 			if ( b == 0  && c == 1 && gender == SSG_FEMININE ) {        
-				say_file("digits/1_f.wav");                         
+				say_file("digits/1_f.wav");
 			} else if ( b == 0 && c == 1 && what ) {
-				say_file("digits/s-1.wav");
+				say_file("digits/1_n.wav");
 			} else {
 				say_file("digits/%d.wav", c);
 			}
@@ -163,7 +163,7 @@ static switch_status_t play_group(switch_say_method_t method, switch_say_gender_
 
 static switch_status_t de_say_general_count(switch_core_session_t *session, char *tosay, switch_say_args_t *say_args, switch_input_args_t *args)
 {
-	int in;
+	long in;
 	int x = 0;
 	int places[9] = { 0 };
 	char sbuf[128] = "";
@@ -187,7 +187,7 @@ static switch_status_t de_say_general_count(switch_core_session_t *session, char
 		return SWITCH_STATUS_GENERR;
 	}
 
-	in = atoi(tosay);
+	in = atol(tosay);
 
 	if (in != 0) {   /*fills the places-array with tosay(resp. in) from tail to front e.g. 84371 would be places[|1|7|3|4|8|0|0|0|], up to 1 billion minus 1*/
 		for (x = 8; x >= 0; x--) {
@@ -223,9 +223,9 @@ static switch_status_t de_say_general_count(switch_core_session_t *session, char
 static switch_status_t de_say_time(switch_core_session_t *session, char *tosay, switch_say_args_t *say_args, switch_input_args_t *args)
 {
 	int32_t t;
-	switch_time_t target = 0;
-	switch_time_exp_t tm;
-	uint8_t say_date = 0, say_time = 0;
+	switch_time_t target = 0, target_now = 0;
+	switch_time_exp_t tm, tm_now;
+	uint8_t say_date = 0, say_time = 0, say_year = 0, say_month = 0, say_dow = 0, say_day = 0, say_yesterday = 0, say_today = 0;
 	switch_channel_t *channel = switch_core_session_get_channel(session);
 	const char *tz = switch_channel_get_variable(channel, "timezone");
 
@@ -250,8 +250,9 @@ static switch_status_t de_say_time(switch_core_session_t *session, char *tosay,
 					minutes = atoi(tme);
 				}
 			}
+            free(tme);
 		} else {
-			if ((seconds = atoi(tosay)) <= 0) {
+			if ((seconds = atol(tosay)) <= 0) {
 				seconds = (int64_t) switch_epoch_time_now(NULL);
 			}
 
@@ -270,7 +271,7 @@ static switch_status_t de_say_time(switch_core_session_t *session, char *tosay,
 
 		if (hours) {
 			if (hours == 1) {
-				say_file("digits/1_f.wav");
+				say_file("digits/1_n.wav");
 				say_file("time/hour.wav");
 			} else {
 				say_num(hours, SSM_PRONOUNCED);
@@ -311,10 +312,12 @@ static switch_status_t de_say_time(switch_core_session_t *session, char *tosay,
 		return SWITCH_STATUS_SUCCESS;
 	}
 
-	if ((t = atoi(tosay)) > 0) {
+	if ((t = atol(tosay)) > 0) {
 		target = switch_time_make(t, 0);
+        target_now = switch_micro_time_now();
 	} else {
 		target = switch_micro_time_now();
+		target_now = switch_micro_time_now();
 	}
 
 	if (tz) {
@@ -322,11 +325,14 @@ static switch_status_t de_say_time(switch_core_session_t *session, char *tosay,
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Timezone is [%s]\n", tz);
 		if (check) {
 			switch_time_exp_tz(&tm, target, check);
+			switch_time_exp_tz(&tm_now, target_now, check);
 		} else {
 			switch_time_exp_tz_name(tz, &tm, target);
+			switch_time_exp_tz_name(tz, &tm_now, target_now);
 		}
 	} else {
 		switch_time_exp_lt(&tm, target);
+		switch_time_exp_lt(&tm_now, target_now);
 	}
 
 	switch (say_args->type) {
@@ -338,41 +344,95 @@ static switch_status_t de_say_time(switch_core_session_t *session, char *tosay,
 		break;
 	case SST_CURRENT_TIME:
 		say_time = 1;
+		break;
+    case SST_SHORT_DATE_TIME:
+		say_time = 1;
+		//Time is in the future
+		if ((tm.tm_year > tm_now.tm_year) ||
+		    (tm.tm_year == tm_now.tm_year && tm.tm_mon > tm_now.tm_mon) ||
+		    (tm.tm_year == tm_now.tm_year && tm.tm_mon == tm_now.tm_mon && tm.tm_mday > tm_now.tm_mday))
+		{
+			say_date = 1;
+			break;
+		}
+		//Time is today or earlier
+		if (tm.tm_year != tm_now.tm_year) {
+			say_date = 1;
+			break;
+		}
+		if (tm.tm_yday == tm_now.tm_yday) {
+			say_today = 1;
+			break;
+		}
+		if (tm.tm_yday == tm_now.tm_yday - 1) {
+			say_yesterday = 1;
+			break;
+		}
+		if (tm.tm_yday >= tm_now.tm_yday - 5) {
+			say_dow = 1;
+			break;
+		}
+		if (tm.tm_mon != tm_now.tm_mon) {
+			say_month = say_day = say_dow = 1;
+			break;
+		}
+
+		say_month = say_day = say_dow = 1;
+
 		break;
 	default:
 		break;
 	}
 
-	if (say_date) {
-		say_args->gender = SSG_MASCULINE;
-		say_file("time/day-%d.wav", tm.tm_wday);
-		say_num(tm.tm_mday, SSM_COUNTED);		
-		say_file("time/mon-%d.wav", tm.tm_mon);
-		say_num(tm.tm_year + 1900, SSM_PRONOUNCED);
+	if (say_date) {	/* full date requested: expand into the per-part flags BEFORE they are tested below */
+		say_dow = say_day = say_month = say_year = 1;
+		say_today = say_yesterday = 0;	/* a full date replaces the relative "today"/"yesterday" prompts */
+	}
+
+	if (say_today) {
+		say_file("time/today.wav");
 	}
+	if (say_yesterday) {
+		say_file("time/yesterday.wav");
+	}
+	if (say_dow) {
+		say_file("time/day-%d.wav", tm.tm_wday);
+	}
+    
+    if (say_month) {
+        say_file("time/mon-%d.wav", tm.tm_mon);
+    }
+
+    if (say_day) {
+        say_args->gender = SSG_MASCULINE;
+        say_num(tm.tm_mday, SSM_COUNTED);
+    }
+
+    if (say_year) {
+        say_args->gender = SSG_NEUTER;
+        say_num(tm.tm_year + 1900, SSM_PRONOUNCED_YEAR);
+    }
 
 	if (say_time) {
-		if (say_date) {
-	    	say_file("time/at.wav");
-		}
+            if (say_date || say_today || say_yesterday || say_dow) {
+                say_file("time/at.wav");
+            }
 
-		if (tm.tm_hour == 1) {
-			say_file("digits/s-1.wav");
-        } else {
-			say_num(tm.tm_hour, SSM_PRONOUNCED);
-        }
-        say_file("time/oclock.wav");
+	    if (tm.tm_hour == 1) {
+                say_args->gender = SSG_NEUTER;
+                say_num(tm.tm_hour, SSM_PRONOUNCED);
+            } else {
+                say_num(tm.tm_hour, SSM_PRONOUNCED);
+            }
+
+            say_file("time/oclock.wav");
  
-        if (tm.tm_min > 0) {
-			say_file("currency/and.wav");
-			if (tm.tm_min == 1) {
-				say_file("digits/1_f.wav")
-				say_file("time/minute.wav");
-			} else {
-				say_num(tm.tm_min, SSM_PRONOUNCED);
-				say_file("time/minutes.wav");
-			}
-		}
+            if (tm.tm_min < 10) {
+                say_file("digits/0.wav");
+                say_num(tm.tm_min, SSM_PRONOUNCED);
+            } else {
+                say_num(tm.tm_min, SSM_PRONOUNCED);
+            }
 	}
 
 	return SWITCH_STATUS_SUCCESS;
@@ -413,7 +473,7 @@ static switch_status_t de_say_money(switch_core_session_t *session, char *tosay,
  
 	/* Say dollar amount */
 	if (atoi(dollars) == 1) {
-		say_file("digits/1.wav");
+		say_file("digits/1_n.wav");
 		say_file("currency/dollar.wav");
 	} else {
 		de_say_general_count(session, dollars, say_args, args);
@@ -425,7 +485,7 @@ static switch_status_t de_say_money(switch_core_session_t *session, char *tosay,
 		/* Say "and" */
 		say_file("currency/and.wav");
 		if (atoi(cents) == 1) {
-			say_file("digits/1.wav");
+			say_file("digits/1_n.wav");
 			say_file("currency/cent.wav");
 		} else {
 			de_say_general_count(session, cents, say_args, args);
diff --git a/src/mod/say/mod_say_nl/mod_say_nl.c b/src/mod/say/mod_say_nl/mod_say_nl.c
index 543dd8d3b4..f52b741830 100644
--- a/src/mod/say/mod_say_nl/mod_say_nl.c
+++ b/src/mod/say/mod_say_nl/mod_say_nl.c
@@ -39,6 +39,7 @@
  * 
  * Anthony Minessale II <anthm@freeswitch.org>
  * Michael B. Murdock <mike@mmurdock.org>
+ * Leo Noordergraaf <leo.noordergraaf@deanconnect.nl>
  *
  * mod_say_nl.c -- Say for nl
  *
@@ -86,14 +87,13 @@ static switch_status_t play_group(switch_say_method_t method, int a, int b, int
 
 	if (a) {
 		say_file("digits/%d.wav", a);
-		say_file("digits/hundred.wav");
+		say_file("digits/honderd.wav");
 	}
 
 	if (b) {
 		if (b > 1) {
 			if (c) {
-				say_file("digits/%d.wav", c);
-				say_file("currency/and.wav");
+				say_file("digits/%d-en.wav", c);
 			}
 			say_file("digits/%d0.wav", b);
 		} else {
@@ -103,6 +103,7 @@ static switch_status_t play_group(switch_say_method_t method, int a, int b, int
 	}
 
 	if (c) {
+		say_file("digits/%d.wav", c);
 		if (method == SSM_COUNTED) {
 			say_file("digits/h-%d.wav", c);
 		} else {
@@ -156,10 +157,10 @@ static switch_status_t nl_say_general_count(switch_core_session_t *session, char
 		switch (say_args->method) {
 		case SSM_COUNTED:
 		case SSM_PRONOUNCED:
-			if ((status = play_group(SSM_PRONOUNCED, places[8], places[7], places[6], "digits/million.wav", session, args)) != SWITCH_STATUS_SUCCESS) {
+			if ((status = play_group(SSM_PRONOUNCED, places[8], places[7], places[6], "digits/miljoen.wav", session, args)) != SWITCH_STATUS_SUCCESS) {
 				return status;
 			}
-			if ((status = play_group(SSM_PRONOUNCED, places[5], places[4], places[3], "digits/thousand.wav", session, args)) != SWITCH_STATUS_SUCCESS) {
+			if ((status = play_group(SSM_PRONOUNCED, places[5], places[4], places[3], "digits/duizend.wav", session, args)) != SWITCH_STATUS_SUCCESS) {
 				return status;
 			}
 			if ((status = play_group(say_args->method, places[2], places[1], places[0], NULL, session, args)) != SWITCH_STATUS_SUCCESS) {
@@ -201,7 +202,9 @@ static switch_status_t nl_say_time(switch_core_session_t *session, char *tosay,
 				if ((p = strchr(tme, ':'))) {
 					*p++ = '\0';
 					minutes = atoi(p);
-					hours = atoi(tme);
+					if (tme) {
+						hours = atoi(tme);
+					}
 				} else {
 					minutes = atoi(tme);
 				}
@@ -226,28 +229,23 @@ static switch_status_t nl_say_time(switch_core_session_t *session, char *tosay,
 
 		if (hours) {
 			say_num(hours, SSM_PRONOUNCED);
-			if (hours == 1) {
-				say_file("time/hour.wav");
-			} else {
-				say_file("time/hours.wav");
-			}
 		} else {
 			say_file("digits/0.wav");
-			say_file("time/hours.wav");
 		}
+		say_file("time/uur.wav");
 
 		if (minutes) {
 			say_num(minutes, SSM_PRONOUNCED);
 			if (minutes == 1) {
-				say_file("time/minute.wav");
+				say_file("time/minuut.wav");
 			} else {
-				say_file("time/minutes.wav");
+				say_file("time/minuten.wav");
 			}
 		} else {
 			say_file("digits/0.wav");
-			say_file("time/minutes.wav");
+			say_file("time/minuten.wav");
 		}
-
+/* LN: Not in use
 		if (seconds) {
 			say_num(seconds, SSM_PRONOUNCED);
 			if (seconds == 1) {
@@ -259,7 +257,7 @@ static switch_status_t nl_say_time(switch_core_session_t *session, char *tosay,
 			say_file("digits/0.wav");
 			say_file("time/seconds.wav");
 		}
-
+*/
 		return SWITCH_STATUS_SUCCESS;
 	}
 
@@ -297,36 +295,20 @@ static switch_status_t nl_say_time(switch_core_session_t *session, char *tosay,
 
 	if (say_date) {
 		say_file("time/day-%d.wav", tm.tm_wday);
-		say_file("time/mon-%d.wav", tm.tm_mon);
 		say_num(tm.tm_mday, SSM_COUNTED);
-		say_num(tm.tm_year + 1900, SSM_PRONOUNCED);
+		say_file("time/mon-%d.wav", tm.tm_mon);
+		/* say_num(tm.tm_year + 1900, SSM_PRONOUNCED); */
+	}
+
+	if (say_date && say_time) {
+		say_file("time/om.wav");
 	}
 
 	if (say_time) {
-		int32_t hour = tm.tm_hour, pm = 0;
+		say_num(tm.tm_hour, SSM_PRONOUNCED);
+		say_file("time/uur.wav");
 
-		if (hour > 12) {
-			hour -= 12;
-			pm = 1;
-		} else if (hour == 12) {
-			pm = 1;
-		} else if (hour == 0) {
-			hour = 12;
-			pm = 0;
-		}
-
-		say_num(hour, SSM_PRONOUNCED);
-
-		if (tm.tm_min > 9) {
-			say_num(tm.tm_min, SSM_PRONOUNCED);
-		} else if (tm.tm_min) {
-			say_file("time/oh.wav");
-			say_num(tm.tm_min, SSM_PRONOUNCED);
-		} else {
-			say_file("time/oclock.wav");
-		}
-
-		say_file("time/%s.wav", pm ? "p-m" : "a-m");
+		say_num(tm.tm_min, SSM_PRONOUNCED);
 	}
 
 	return SWITCH_STATUS_SUCCESS;
@@ -361,7 +343,7 @@ static switch_status_t nl_say_money(switch_core_session_t *session, char *tosay,
 
 	/* If negative say "negative" */
 	if (sbuf[0] == '-') {
-		say_file("currency/negative.wav");
+		say_file("currency/min.wav");
 		dollars++;
 	}
 
@@ -369,29 +351,20 @@ static switch_status_t nl_say_money(switch_core_session_t *session, char *tosay,
 	if (( status = nl_say_general_count(session, dollars, say_args, args)) != SWITCH_STATUS_SUCCESS ) {
 		return status;
 	}
-
-	if (atoi(dollars) == 1) {
-		say_file("currency/dollar.wav");
-	} else {
-		say_file("currency/dollars.wav");
-	}
+	say_file("currency/euro.wav");
 
 	/* Say "and" */
-	say_file("currency/and.wav");
+	say_file("currency/en.wav");
 
 	/* Say cents */
 	if (cents) {
 		if (( status = nl_say_general_count(session, cents, say_args, args)) != SWITCH_STATUS_SUCCESS) {
 			return status;
 		}
-		if (atoi(cents) == 1) {
-			say_file("currency/cent.wav");
-		} else {
-			say_file("currency/cents.wav");
-		}
+		say_file("currency/cent.wav");
 	} else {
 		say_file("digits/0.wav");
-		say_file("currency/cents.wav");
+		say_file("currency/cent.wav");
 	}
 
 	return SWITCH_STATUS_SUCCESS;
diff --git a/src/mod/xml_int/mod_xml_cdr/mod_xml_cdr.c b/src/mod/xml_int/mod_xml_cdr/mod_xml_cdr.c
index f918a69b5c..163f7fed0e 100644
--- a/src/mod/xml_int/mod_xml_cdr/mod_xml_cdr.c
+++ b/src/mod/xml_int/mod_xml_cdr/mod_xml_cdr.c
@@ -26,6 +26,7 @@
  * Brian West <brian@freeswitch.org>
  * Bret McDanel <trixter AT 0xdecafbad.com>
  * Justin Cassidy <xachenant@hotmail.com>
+ * Emmanuel Schmidbauer <eschmidbauer@gmail.com>
  *
  * mod_xml_cdr.c -- XML CDR Module to files or curl
  *
@@ -200,6 +201,8 @@ static switch_status_t my_on_reporting(switch_core_session_t *session)
 	int is_b;
 	const char *a_prefix = "";
 	char url_joiner = '?';
+	int prefix_a;
+	const char *prefix_a_var = NULL;
 
 	if (globals.shutdown) {
 		return SWITCH_STATUS_SUCCESS;
@@ -212,7 +215,14 @@ static switch_status_t my_on_reporting(switch_core_session_t *session)
 			return SWITCH_STATUS_SUCCESS;
 		}
 	}
-	if (!is_b && globals.prefix_a)
+
+	// a "prefix-a-leg" channel variable, when set, overrides the global "prefix-a-leg" setting
+	if ((prefix_a_var = switch_channel_get_variable(channel, "prefix-a-leg"))) {
+		prefix_a = switch_true(prefix_a_var);
+	} else {
+		prefix_a = globals.prefix_a;
+	}
+	if (!is_b && prefix_a)
 		a_prefix = "a_";
 
 	if (switch_ivr_generate_xml_cdr(session, &cdr) != SWITCH_STATUS_SUCCESS) {
diff --git a/src/switch_apr.c b/src/switch_apr.c
index 8407d42713..fe305d7cd1 100644
--- a/src/switch_apr.c
+++ b/src/switch_apr.c
@@ -1014,7 +1014,9 @@ SWITCH_DECLARE(switch_status_t) switch_poll(switch_pollfd_t *aprset, int32_t num
 	if (aprset) {
 		st = apr_poll((apr_pollfd_t *) aprset, numsock, nsds, timeout);
 
-		if (st == APR_TIMEUP) {
+		if (numsock == 1 && ((aprset[0].rtnevents & APR_POLLERR) || (aprset[0].rtnevents & APR_POLLHUP) || (aprset[0].rtnevents & APR_POLLNVAL))) {
+			st = SWITCH_STATUS_GENERR;
+		} else if (st == APR_TIMEUP) {
 			st = SWITCH_STATUS_TIMEOUT;
 		}
 	}
diff --git a/src/switch_channel.c b/src/switch_channel.c
index c989ce2cb3..b23236fa47 100644
--- a/src/switch_channel.c
+++ b/src/switch_channel.c
@@ -204,18 +204,21 @@ SWITCH_DECLARE(const char *) switch_channel_cause2str(switch_call_cause_t cause)
 SWITCH_DECLARE(switch_call_cause_t) switch_channel_str2cause(const char *str)
 {
 	uint8_t x;
-	switch_call_cause_t cause = SWITCH_CAUSE_NONE;
+	switch_call_cause_t cause = SWITCH_CAUSE_NORMAL_CLEARING;	/* returned as-is when str fails the zstr() guard or matches no CAUSE_CHART name */
 
-	if (*str > 47 && *str < 58) {
-		cause = atoi(str);
-	} else {
-		for (x = 0; x < (sizeof(CAUSE_CHART) / sizeof(struct switch_cause_table)) - 1 && CAUSE_CHART[x].name; x++) {
-			if (!strcasecmp(CAUSE_CHART[x].name, str)) {
-				cause = CAUSE_CHART[x].cause;
-				break;
+	if (!zstr(str)) {	/* guard now runs before the *str dereference; the removed code dereferenced str unconditionally */
+		if (*str > 47 && *str < 58) {	/* 47 < c < 58 is ASCII '0'..'9': treat str as a numeric cause */
+			cause = atoi(str);
+		} else {
+			for (x = 0; x < (sizeof(CAUSE_CHART) / sizeof(struct switch_cause_table)) - 1 && CAUSE_CHART[x].name; x++) {
+				if (!strcasecmp(CAUSE_CHART[x].name, str)) {
+					cause = CAUSE_CHART[x].cause;
+					break;
+				}
 			}
 		}
 	}
+
 	return cause;
 }
 
@@ -2110,8 +2113,7 @@ SWITCH_DECLARE(int) switch_channel_state_change_pending(switch_channel_t *channe
 
 SWITCH_DECLARE(int) switch_channel_check_signal(switch_channel_t *channel, switch_bool_t in_thread_only)
 {
-	(void)in_thread_only;
-	switch_ivr_parse_next_signal_data(channel->session);
+	switch_ivr_parse_signal_data(channel->session, SWITCH_FALSE, in_thread_only);	/* in_thread_only is now forwarded; the removed code discarded it */
 	return 0;
 }
 
@@ -3244,6 +3246,7 @@ SWITCH_DECLARE(switch_channel_state_t) switch_channel_perform_hangup(switch_chan
 		switch_event_t *event;
 		const char *var;
 
+
 		switch_mutex_lock(channel->profile_mutex);
 		if (channel->hold_record && !channel->hold_record->off) {
 			channel->hold_record->off = switch_time_now();
diff --git a/src/switch_core.c b/src/switch_core.c
index 2d342fb33b..259807a287 100644
--- a/src/switch_core.c
+++ b/src/switch_core.c
@@ -982,24 +982,30 @@ SWITCH_DECLARE(int32_t) set_realtime_priority(void)
 #ifdef SOLARIS_PRIVILEGES
 	/* request the privileges to elevate the priority */
 	if (priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_PROC_PRIOCNTL, NULL) < 0) {
+#ifdef PRIV_PROC_PRIOUP
+		/* fallback to PRIV_PROC_PRIOUP on SmartOS */
 		fprintf(stderr, "WARN: Failed to acquire proc_priocntl privilege (%s)\n", strerror(errno));
-	} else {
-		if (sched_setscheduler(0, SCHED_FIFO, &sched) < 0) {
-			fprintf(stderr, "ERROR: Failed to set SCHED_FIFO scheduler (%s)\n", strerror(errno));
-		} else {
-			return 0;
-		}
-	}
-		
-	if (priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_PROC_PRIOUP, NULL) < 0) {
-		fprintf(stderr, "ERROR: Failed to acquire proc_prioup privilege (%s)\n", strerror(errno));
-		return -1;
-	} else {
-		if (setpriority(PRIO_PROCESS, 0, -10) < 0) {
-			fprintf(stderr, "ERROR: Could not set nice level\n");
+		if (priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_PROC_PRIOUP, NULL) < 0) {
+			fprintf(stderr, "ERROR: Failed to acquire proc_prioup privilege (%s)\n", strerror(errno));
 			return -1;
 		}
+#else
+		fprintf(stderr, "ERROR: Failed to acquire proc_priocntl privilege (%s)\n", strerror(errno));
+		return -1;
+#endif
 	}
+
+	if (sched_setscheduler(0, SCHED_FIFO, &sched) < 0) {
+		fprintf(stderr, "WARN: Failed to set SCHED_FIFO scheduler (%s)\n", strerror(errno));
+	} else {
+		return 0;
+	}
+
+	if (setpriority(PRIO_PROCESS, 0, -10) < 0) {
+		fprintf(stderr, "ERROR: Could not set nice level\n");
+		return -1;
+	}
+
 	return 0;
 #else
 
diff --git a/src/switch_core_codec.c b/src/switch_core_codec.c
index 59ca4e494f..a62ceee473 100644
--- a/src/switch_core_codec.c
+++ b/src/switch_core_codec.c
@@ -333,8 +333,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_get_real_read_impl(switch_co
 		*impp = session->real_read_impl;
 		return SWITCH_STATUS_SUCCESS;
 	} else {
-		memset(impp, 0, sizeof(*impp));
-		impp->number_of_channels = 1;
+		return switch_core_session_get_read_impl(session, impp);
 	}
 
 	return SWITCH_STATUS_FALSE;
diff --git a/src/switch_core_file.c b/src/switch_core_file.c
index c6d92ded06..8632e8482f 100644
--- a/src/switch_core_file.c
+++ b/src/switch_core_file.c
@@ -80,6 +80,8 @@ SWITCH_DECLARE(switch_status_t) switch_core_perform_file_open(const char *file,
 		switch_set_flag(fh, SWITCH_FILE_FLAG_FREE_POOL);
 	}
 
+	switch_mutex_init(&fh->flag_mutex, SWITCH_MUTEX_NESTED, fh->memory_pool);
+
 	fh->mm.samplerate = 44100;
 	fh->mm.channels = 1;
 	fh->mm.keyint = 60;
@@ -259,7 +261,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_perform_file_open(const char *file,
 	fh->line = line;
 
 	if (switch_test_flag(fh, SWITCH_FILE_FLAG_VIDEO) && !fh->file_interface->file_read_video) {
-		switch_clear_flag(fh, SWITCH_FILE_FLAG_VIDEO);
+		switch_clear_flag_locked(fh, SWITCH_FILE_FLAG_VIDEO);
 	}
 
 	if (spool_path) {
@@ -292,7 +294,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_perform_file_open(const char *file,
 			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Spool dir is set.  Make sure [%s] is also a valid path\n", fh->spool_path);
 		}
 		UNPROTECT_INTERFACE(fh->file_interface);
-		switch_goto_status(status, fail);
+		goto fail;
 	}
 
 	fh->real_channels = fh->channels;
@@ -305,7 +307,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_perform_file_open(const char *file,
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "File [%s] not created!\n", file_path);
 		fh->file_interface->file_close(fh);
 		UNPROTECT_INTERFACE(fh->file_interface);
-		switch_goto_status(status, fail);
+		goto fail;
 	}
 
 	if (to) {
@@ -341,12 +343,12 @@ SWITCH_DECLARE(switch_status_t) switch_core_perform_file_open(const char *file,
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "File has %d channels, muxing to %d channel%s will occur.\n", fh->real_channels, fh->channels, fh->channels == 1 ? "" : "s");
 	}
 
-	switch_set_flag(fh, SWITCH_FILE_OPEN);
+	switch_set_flag_locked(fh, SWITCH_FILE_OPEN);
 	return status;
 
   fail:
 
-	switch_clear_flag(fh, SWITCH_FILE_OPEN);
+	switch_clear_flag_locked(fh, SWITCH_FILE_OPEN);
 
 	if (fh->params) {
 		switch_event_destroy(&fh->params);
@@ -387,7 +389,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_read(switch_file_handle_t *fh,
 	}
 
 	if (switch_test_flag(fh, SWITCH_FILE_DONE)) {
-		switch_clear_flag(fh, SWITCH_FILE_DONE);
+		switch_clear_flag_locked(fh, SWITCH_FILE_DONE);
 		*len = 0;
 		return SWITCH_STATUS_FALSE;
 	}
@@ -410,7 +412,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_read(switch_file_handle_t *fh,
 				
 
 				if (status != SWITCH_STATUS_SUCCESS || !rlen) {
-					switch_set_flag(fh, SWITCH_FILE_BUFFER_DONE);
+					switch_set_flag_locked(fh, SWITCH_FILE_BUFFER_DONE);
 				} else {
 					fh->samples_in += rlen;
 					if (fh->real_channels != fh->channels && !switch_test_flag(fh, SWITCH_FILE_NOMUX)) {
@@ -425,7 +427,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_read(switch_file_handle_t *fh,
 		*len = asis ? rlen : rlen / 2 / fh->channels;
 
 		if (*len == 0) {
-			switch_set_flag(fh, SWITCH_FILE_DONE);
+			switch_set_flag_locked(fh, SWITCH_FILE_DONE);
 			goto top;
 		} else {
 			status = SWITCH_STATUS_SUCCESS;
@@ -438,7 +440,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_read(switch_file_handle_t *fh,
 		}
 
 		if (status != SWITCH_STATUS_SUCCESS || !*len) {
-			switch_set_flag(fh, SWITCH_FILE_DONE);
+			switch_set_flag_locked(fh, SWITCH_FILE_DONE);
 			goto top;
 		}
 
@@ -493,9 +495,9 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_read(switch_file_handle_t *fh,
 	return status;
 }
 
-SWITCH_DECLARE(switch_bool_t) switch_core_file_has_video(switch_file_handle_t *fh)
+SWITCH_DECLARE(switch_bool_t) switch_core_file_has_video(switch_file_handle_t *fh, switch_bool_t check_open)
 {
-	return (switch_test_flag(fh, SWITCH_FILE_OPEN) && switch_test_flag(fh, SWITCH_FILE_FLAG_VIDEO)) ? SWITCH_TRUE : SWITCH_FALSE;
+	return ((!check_open || switch_test_flag(fh, SWITCH_FILE_OPEN)) && switch_test_flag(fh, SWITCH_FILE_FLAG_VIDEO)) ? SWITCH_TRUE : SWITCH_FALSE;	/* SWITCH_FILE_OPEN is required only when check_open is true; SWITCH_FILE_FLAG_VIDEO is always required */
 }
 
 SWITCH_DECLARE(switch_status_t) switch_core_file_write(switch_file_handle_t *fh, void *data, switch_size_t *len)
@@ -659,7 +661,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_seek(switch_file_handle_t *fh,
 		}
 	}
 
-	switch_set_flag(fh, SWITCH_FILE_SEEK);
+	switch_set_flag_locked(fh, SWITCH_FILE_SEEK);
 	status = fh->file_interface->file_seek(fh, cur_pos, samples, whence);
 
 	fh->offset_pos = *cur_pos;
@@ -733,6 +735,37 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_truncate(switch_file_handle_t *
 
 }
 
+SWITCH_DECLARE(switch_status_t) switch_core_file_command(switch_file_handle_t *fh, switch_file_command_t command)
+{
+	switch_status_t status = SWITCH_STATUS_FALSE;	/* returned unchanged when the file interface has no file_command hook */
+	/* Handle command on the core side first, then pass it to the file interface's file_command hook if one is set. */
+	switch_assert(fh != NULL);
+	switch_assert(fh->file_interface != NULL);
+
+	if (!switch_test_flag(fh, SWITCH_FILE_OPEN)) {	/* commands are refused on a handle not flagged OPEN */
+		return SWITCH_STATUS_FALSE;
+	}
+
+	switch(command) {
+	case SCFC_FLUSH_AUDIO:	/* core-side part of the command: switch_buffer_zero() on the pre-buffer, if one exists */
+		if (fh->pre_buffer) {
+			switch_buffer_zero(fh->pre_buffer);	/* done before, and outside, the flag_mutex section below */
+		}
+		break;
+	default:
+		break;
+	}
+
+	if (fh->file_interface->file_command) {
+		switch_mutex_lock(fh->flag_mutex);	/* the hook runs with flag_mutex held */
+		status = fh->file_interface->file_command(fh, command);
+		switch_mutex_unlock(fh->flag_mutex);
+	}
+
+	return status;
+}
+
+
 SWITCH_DECLARE(switch_status_t) switch_core_file_close(switch_file_handle_t *fh)
 {
 	switch_status_t status;
@@ -744,17 +777,6 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_close(switch_file_handle_t *fh)
 		return SWITCH_STATUS_FALSE;
 	}
 
-	if (fh->params) {
-		switch_event_destroy(&fh->params);
-	}
-
-	fh->samples_in = 0;
-	fh->max_samples = 0;
-
-	if (fh->buffer) {
-		switch_buffer_destroy(&fh->buffer);
-	}
-
 	if (fh->pre_buffer) {
 		if (switch_test_flag(fh, SWITCH_FILE_FLAG_WRITE)) {
 			switch_size_t rlen, blen;
@@ -777,11 +799,29 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_close(switch_file_handle_t *fh)
 		switch_buffer_destroy(&fh->pre_buffer);
 	}
 
-	switch_clear_flag(fh, SWITCH_FILE_OPEN);
+	switch_clear_flag_locked(fh, SWITCH_FILE_OPEN);
 	status = fh->file_interface->file_close(fh);
 
+	if (fh->params) {
+		switch_event_destroy(&fh->params);
+	}
+
+	fh->samples_in = 0;
+	fh->max_samples = 0;
+
+	if (fh->buffer) {
+		switch_buffer_destroy(&fh->buffer);
+	}
+
 	switch_resample_destroy(&fh->resampler);
 
+	if (switch_test_flag(fh, SWITCH_FILE_FLAG_FREE_POOL)) {
+		switch_core_destroy_memory_pool(&fh->memory_pool);
+	}
+
+	fh->memory_pool = NULL;
+
+	switch_safe_free(fh->dbuf);
 
 	if (fh->spool_path) {
 		char *command;
@@ -799,15 +839,8 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_close(switch_file_handle_t *fh)
 		free(command);
 	}
 
-
 	UNPROTECT_INTERFACE(fh->file_interface);
-
-	if (switch_test_flag(fh, SWITCH_FILE_FLAG_FREE_POOL)) {
-		switch_core_destroy_memory_pool(&fh->memory_pool);
-	}
-
-	switch_safe_free(fh->dbuf);
-
+	fh->file_interface = NULL;
 
 	return status;
 }
diff --git a/src/switch_core_hash.c b/src/switch_core_hash.c
index cdec290942..d52bb3aab3 100644
--- a/src/switch_core_hash.c
+++ b/src/switch_core_hash.c
@@ -57,39 +57,45 @@ SWITCH_DECLARE(switch_status_t) switch_core_hash_destroy(switch_hash_t **hash)
 
 SWITCH_DECLARE(switch_status_t) switch_core_hash_insert_destructor(switch_hash_t *hash, const char *key, const void *data, hashtable_destructor_t destructor)
 {
-	switch_hashtable_insert_destructor(hash, strdup(key), (void *)data, HASHTABLE_FLAG_FREE_KEY | HASHTABLE_DUP_CHECK, destructor);
+	int r = 0;
+
+	r = switch_hashtable_insert_destructor(hash, strdup(key), (void *)data, HASHTABLE_FLAG_FREE_KEY | HASHTABLE_DUP_CHECK, destructor);	/* the table is handed a strdup() copy of key, inserted with HASHTABLE_FLAG_FREE_KEY */
 	
-	return SWITCH_STATUS_SUCCESS;
+	return r ? SWITCH_STATUS_SUCCESS : SWITCH_STATUS_FALSE;	/* non-zero r maps to SUCCESS, zero to FALSE; the removed code ignored the result */
 }
 
 SWITCH_DECLARE(switch_status_t) switch_core_hash_insert_locked(switch_hash_t *hash, const char *key, const void *data, switch_mutex_t *mutex)
 {
+	switch_status_t status = SWITCH_STATUS_FALSE;
+
 	if (mutex) {
 		switch_mutex_lock(mutex);
 	}
 
-	switch_core_hash_insert(hash, key, data);
+	status = switch_core_hash_insert(hash, key, data);	/* insert runs between the optional lock/unlock; its status is kept for the caller */
 
 	if (mutex) {
 		switch_mutex_unlock(mutex);
 	}
 
-	return SWITCH_STATUS_SUCCESS;
+	return status;	/* previously SWITCH_STATUS_SUCCESS was returned regardless of the insert result */
 }
 
 SWITCH_DECLARE(switch_status_t) switch_core_hash_insert_wrlock(switch_hash_t *hash, const char *key, const void *data, switch_thread_rwlock_t *rwlock)
 {
+	switch_status_t status = SWITCH_STATUS_FALSE;
+
 	if (rwlock) {
 		switch_thread_rwlock_wrlock(rwlock);
 	}
 
-	switch_core_hash_insert(hash, key, data);
+	status = switch_core_hash_insert(hash, key, data);	/* insert runs between the optional wrlock/unlock; its status is kept for the caller */
 
 	if (rwlock) {
 		switch_thread_rwlock_unlock(rwlock);
 	}
 
-	return SWITCH_STATUS_SUCCESS;
+	return status;	/* previously SWITCH_STATUS_SUCCESS was returned regardless of the insert result */
 }
 
 SWITCH_DECLARE(void *) switch_core_hash_delete(switch_hash_t *hash, const char *key)
@@ -97,8 +103,10 @@ SWITCH_DECLARE(void *) switch_core_hash_delete(switch_hash_t *hash, const char *
 	return switch_hashtable_remove(hash, (void *)key);
 }
 
-SWITCH_DECLARE(switch_status_t) switch_core_hash_delete_locked(switch_hash_t *hash, const char *key, switch_mutex_t *mutex)
+SWITCH_DECLARE(void *) switch_core_hash_delete_locked(switch_hash_t *hash, const char *key, switch_mutex_t *mutex)
 {
+	void *ret = NULL;	/* NOTE(review): no '+' line in this function assigns ret, and the unchanged lines predate its declaration, so 'return ret' yields NULL; switch_core_hash_delete_wrlock in this patch assigns it from switch_core_hash_delete() - confirm and fix */
+
 	if (mutex) {
 		switch_mutex_lock(mutex);
 	}
@@ -109,22 +117,24 @@ SWITCH_DECLARE(switch_status_t) switch_core_hash_delete_locked(switch_hash_t *ha
 		switch_mutex_unlock(mutex);
 	}
 
-	return SWITCH_STATUS_SUCCESS;
+	return ret;
 }
 
-SWITCH_DECLARE(switch_status_t) switch_core_hash_delete_wrlock(switch_hash_t *hash, const char *key, switch_thread_rwlock_t *rwlock)
+SWITCH_DECLARE(void *) switch_core_hash_delete_wrlock(switch_hash_t *hash, const char *key, switch_thread_rwlock_t *rwlock)
 {
+	void *ret = NULL;
+
 	if (rwlock) {
 		switch_thread_rwlock_wrlock(rwlock);
 	}
 
-	switch_core_hash_delete(hash, key);
+	ret = switch_core_hash_delete(hash, key);	/* result of the delete is captured between the optional wrlock/unlock */
 
 	if (rwlock) {
 		switch_thread_rwlock_unlock(rwlock);
 	}
 
-	return SWITCH_STATUS_SUCCESS;
+	return ret;	/* return type changed from switch_status_t to the pointer returned by switch_core_hash_delete() */
 }
 
 SWITCH_DECLARE(switch_status_t) switch_core_hash_delete_multi(switch_hash_t *hash, switch_hash_delete_callback_t callback, void *pData) {
@@ -252,12 +262,13 @@ SWITCH_DECLARE(switch_status_t) switch_core_inthash_destroy(switch_inthash_t **h
 SWITCH_DECLARE(switch_status_t) switch_core_inthash_insert(switch_inthash_t *hash, uint32_t key, const void *data)
 {
 	uint32_t *k = NULL;
+	int r = 0;
 
 	switch_zmalloc(k, sizeof(*k));
 	*k = key;
-	switch_hashtable_insert_destructor(hash, k, (void *)data, HASHTABLE_FLAG_FREE_KEY | HASHTABLE_DUP_CHECK, NULL);
+	r = switch_hashtable_insert_destructor(hash, k, (void *)data, HASHTABLE_FLAG_FREE_KEY | HASHTABLE_DUP_CHECK, NULL);	/* k comes from switch_zmalloc() above and is inserted with HASHTABLE_FLAG_FREE_KEY */
 
-	return SWITCH_STATUS_SUCCESS;
+	return r ? SWITCH_STATUS_SUCCESS : SWITCH_STATUS_FALSE;	/* non-zero r maps to SUCCESS, zero to FALSE; the removed code ignored the result */
 }
 
 SWITCH_DECLARE(void *) switch_core_inthash_delete(switch_inthash_t *hash, uint32_t key)
diff --git a/src/switch_core_io.c b/src/switch_core_io.c
index 8c3bf14b7e..f0a775e56b 100644
--- a/src/switch_core_io.c
+++ b/src/switch_core_io.c
@@ -69,7 +69,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 {
 	switch_io_event_hook_read_frame_t *ptr;
 	switch_status_t status = SWITCH_STATUS_FALSE;
-	int need_codec, perfect, bug_locked = 0, global_prune = 0, do_bugs = 0, do_resample = 0, is_cng = 0, tap_only = 0;
+	int need_codec, perfect, do_bugs = 0, do_resample = 0, is_cng = 0, tap_only = 0;
 	switch_codec_implementation_t codec_impl;
 	unsigned int flag = 0;
 	int i;
@@ -183,12 +183,12 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 		if (status == SWITCH_STATUS_INUSE) {
 			*frame = &runtime.dummy_cng_frame;
 			switch_yield(20000);
-			switch_goto_status(SWITCH_STATUS_SUCCESS, bail_out);
+			return SWITCH_STATUS_SUCCESS;
 		}
 
 		if (!SWITCH_READ_ACCEPTABLE(status) || !session->read_codec || !switch_core_codec_ready(session->read_codec)) {
 			*frame = NULL;
-			switch_goto_status(SWITCH_STATUS_FALSE, bail_out);
+			return SWITCH_STATUS_FALSE;
 		}
 
 		switch_mutex_lock(session->codec_read_mutex);
@@ -198,7 +198,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s has no read codec.\n", switch_channel_get_name(session->channel));
 			switch_channel_hangup(session->channel, SWITCH_CAUSE_INCOMPATIBLE_DESTINATION);
 			*frame = &runtime.dummy_cng_frame;
-			switch_goto_status(SWITCH_STATUS_FALSE, bail_out);
+			return SWITCH_STATUS_FALSE;
 		}
 
 		switch_mutex_lock(session->read_codec->mutex);
@@ -232,20 +232,13 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 		goto done;
 	}
 
-	if (!bug_locked) {
-		switch_thread_rwlock_rdlock(session->bug_rwlock);
-		if (session->bugs) {
-			bug_locked = 1;
-		} else {
-			switch_thread_rwlock_unlock(session->bug_rwlock);
-		}
-	}
-	
 	if (session->bugs && !((*frame)->flags & SFF_CNG) && !((*frame)->flags & SFF_NOT_AUDIO)) {
 		switch_media_bug_t *bp;
 		switch_bool_t ok = SWITCH_TRUE;
 		int prune = 0;
 
+		switch_thread_rwlock_rdlock(session->bug_rwlock);
+
 		for (bp = session->bugs; bp; bp = bp->next) {
 			ok = SWITCH_TRUE;
 
@@ -282,9 +275,10 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 				prune++;
 			}
 		}
+		switch_thread_rwlock_unlock(session->bug_rwlock);
 
 		if (prune) {
-			global_prune++;
+			switch_core_media_bug_prune(session);
 		}
 	}
 
@@ -305,6 +299,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 		int prune = 0;		
 
 		if (session->bugs && switch_test_flag((*frame), SFF_CNG)) {
+			switch_thread_rwlock_rdlock(session->bug_rwlock);
 			for (bp = session->bugs; bp; bp = bp->next) {
 				ok = SWITCH_TRUE;
 
@@ -347,9 +342,10 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 					prune++;
 				}
 			}
+			switch_thread_rwlock_unlock(session->bug_rwlock);
 
 			if (prune) {
-				global_prune++;
+				switch_core_media_bug_prune(session);
 			}
 			
 		
@@ -442,6 +438,12 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 			} else {
 				switch_codec_t *use_codec = read_frame->codec;
 				if (do_bugs) {
+					switch_thread_rwlock_wrlock(session->bug_rwlock);
+					if (!session->bugs) {
+						switch_thread_rwlock_unlock(session->bug_rwlock);
+						goto done;
+					}
+
 					if (!switch_core_codec_ready(&session->bug_codec) && switch_core_codec_ready(read_frame->codec)) {
 						switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Setting BUG Codec %s:%d\n",
 										  read_frame->codec->implementation->iananame, read_frame->codec->implementation->ianacode);
@@ -455,6 +457,14 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 					if (switch_core_codec_ready(&session->bug_codec)) {
 						use_codec = &session->bug_codec;
 					}
+					switch_thread_rwlock_unlock(session->bug_rwlock);
+
+					switch_thread_rwlock_wrlock(session->bug_rwlock);
+					if (!session->bugs) {
+						do_bugs = 0;
+					}
+					switch_thread_rwlock_unlock(session->bug_rwlock);
+					if (!do_bugs) goto done;
 				}
 
 				if (!switch_test_flag(read_frame->codec, SWITCH_CODEC_FLAG_HAS_PLC) &&
@@ -472,10 +482,17 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 				} else {
 					switch_codec_t *codec = use_codec;
 
+					switch_thread_rwlock_rdlock(session->bug_rwlock);
+
 					if (!switch_core_codec_ready(codec)) {
 						codec = read_frame->codec;
 					}
 					
+					if (!switch_core_codec_ready(codec)) {
+						switch_thread_rwlock_unlock(session->bug_rwlock);
+						goto done;
+					}
+					
 					codec->cur_frame = read_frame;
 					session->read_codec->cur_frame = read_frame;
 					status = switch_core_codec_decode(codec,
@@ -486,10 +503,17 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 													  session->raw_read_frame.data, &session->raw_read_frame.datalen, &session->raw_read_frame.rate, 
 													  &read_frame->flags);
 
+					if (status == SWITCH_STATUS_NOT_INITALIZED) {
+						switch_thread_rwlock_unlock(session->bug_rwlock);
+						goto done;
+					}
+
 					session->raw_read_frame.samples = session->raw_read_frame.datalen / 2;
 					session->raw_read_frame.channels = codec->implementation->number_of_channels;
 					codec->cur_frame = NULL;
 					session->read_codec->cur_frame = NULL;
+					switch_thread_rwlock_unlock(session->bug_rwlock);
+
 				}
 				
 				if (status == SWITCH_STATUS_SUCCESS && session->read_impl.number_of_channels == 1) {
@@ -623,6 +647,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 			switch_media_bug_t *bp;
 			switch_bool_t ok = SWITCH_TRUE;
 			int prune = 0;
+			switch_thread_rwlock_rdlock(session->bug_rwlock);
 
 			for (bp = session->bugs; bp; bp = bp->next) {
 				ok = SWITCH_TRUE;
@@ -663,9 +688,9 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 
 
 			}
-
+			switch_thread_rwlock_unlock(session->bug_rwlock);
 			if (prune) {
-				global_prune++;
+				switch_core_media_bug_prune(session);
 			}
 		}
 
@@ -673,6 +698,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 			switch_media_bug_t *bp;
 			switch_bool_t ok = SWITCH_TRUE;
 			int prune = 0;
+			switch_thread_rwlock_rdlock(session->bug_rwlock);
 
 			for (bp = session->bugs; bp; bp = bp->next) {
 				ok = SWITCH_TRUE;
@@ -723,9 +749,9 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 					prune++;
 				}
 			}
-
+			switch_thread_rwlock_unlock(session->bug_rwlock);
 			if (prune) {
-				global_prune++;
+				switch_core_media_bug_prune(session);
 			}
 		}
 
@@ -853,7 +879,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 			switch_media_bug_t *bp;
 			switch_bool_t ok = SWITCH_TRUE;
 			int prune = 0;
-
+			switch_thread_rwlock_rdlock(session->bug_rwlock);
 			for (bp = session->bugs; bp; bp = bp->next) {
 				ok = SWITCH_TRUE;
 
@@ -892,9 +918,9 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 					prune++;
 				}
 			}
-
+			switch_thread_rwlock_unlock(session->bug_rwlock);
 			if (prune) {
-				global_prune++;
+				switch_core_media_bug_prune(session);
 			}
 		}
 	}
@@ -915,15 +941,6 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_frame(switch_core_sessi
 		switch_channel_set_callstate(session->channel, CCS_ACTIVE);
 	}
 
- bail_out:
-
-	if (bug_locked) {
-		switch_thread_rwlock_unlock(session->bug_rwlock);
-	}
-
-	if (global_prune) {
-		switch_core_media_bug_prune(session);
-	}
 
 	return status;
 }
diff --git a/src/switch_core_media.c b/src/switch_core_media.c
index 589209be08..947a7528c4 100644
--- a/src/switch_core_media.c
+++ b/src/switch_core_media.c
@@ -66,13 +66,23 @@ typedef struct secure_settings_s {
 	char *remote_crypto_key;
 } switch_secure_settings_t;
 
+typedef struct core_video_globals_s {
+	int cpu_count;
+	int cur_cpu;
+	switch_memory_pool_t *pool;
+	switch_mutex_t *mutex;
+	uint32_t fps;	/* written by switch_core_media_get_video_fps() whenever it computes a non-zero rate */
+	uint32_t synced;
+} core_video_globals_t;
 
+static core_video_globals_t video_globals = { 0 };
 
 struct media_helper {
 	switch_core_session_t *session;
 	switch_thread_cond_t *cond;
 	switch_mutex_t *cond_mutex;
-	switch_mutex_t *file_mutex;
+	switch_mutex_t *file_read_mutex;	/* together with file_write_mutex, replaces the single file_mutex removed above */
+	switch_mutex_t *file_write_mutex;
 	int up;
 };
 
@@ -164,6 +174,7 @@ typedef struct switch_rtp_engine_s {
 	switch_thread_id_t thread_id;
 	uint8_t new_ice;
 	uint8_t new_dtls;
+	uint32_t sdp_bw;
 } switch_rtp_engine_t;
 
 struct switch_media_handle_s {
@@ -222,6 +233,8 @@ struct switch_media_handle_s {
 	time_t vid_started;
 	int ready_loops;
 
+	switch_thread_t *video_write_thread;
+	int video_write_thread_running;
 };
 
 static switch_srtp_crypto_suite_t SUITES[CRYPTO_INVALID] = {
@@ -344,11 +357,21 @@ SWITCH_DECLARE(uint32_t) switch_core_media_get_video_fps(switch_core_session_t *
 		return 0;
 	}
 	
-	fps = switch_round_to_step(smh->vid_frames / (now - smh->vid_started), 5);
-	if (fps < 15) fps = 15;
+	fps = switch_round_to_step(smh->vid_frames / (now - smh->vid_started), 5);	
 
-	smh->vid_started = switch_epoch_time_now(NULL);
-	smh->vid_frames = 1;
+	if (smh->vid_frames > 1000) {
+		smh->vid_started = switch_epoch_time_now(NULL);
+		smh->vid_frames = 1;
+	}
+
+	if (fps > 0) {
+		video_globals.fps = fps;
+	
+		if (smh->vid_params.fps != fps) {
+			switch_channel_set_variable_printf(session->channel, "video_fps", "%d", fps);
+			smh->vid_params.fps = fps;
+		}
+	}
 
 	return fps;
 }
@@ -1053,7 +1076,7 @@ static switch_status_t switch_core_media_build_crypto(switch_media_handle_t *smh
 		*p-- = '\0';
 	}
 
-	if (!index) index = ctype + 1;
+	if (index == SWITCH_NO_CRYPTO_TAG) index = ctype + 1;
 
 	engine->ssec[ctype].local_crypto_key = switch_core_session_sprintf(smh->session, "%d %s inline:%s", index, SUITES[ctype].name, b64_key);
 	switch_channel_set_variable_name_printf(smh->session->channel, engine->ssec[ctype].local_crypto_key, "rtp_last_%s_local_crypto_key", type2str(type));
@@ -1449,10 +1472,10 @@ SWITCH_DECLARE(void) switch_core_session_check_outgoing_crypto(switch_core_sessi
 
 	for (i = 0; smh->crypto_suite_order[i] != CRYPTO_INVALID; i++) {
 		switch_core_media_build_crypto(session->media_handle,
-									   SWITCH_MEDIA_TYPE_AUDIO, 0, smh->crypto_suite_order[i], SWITCH_RTP_CRYPTO_SEND, 0);
+									   SWITCH_MEDIA_TYPE_AUDIO, SWITCH_NO_CRYPTO_TAG, smh->crypto_suite_order[i], SWITCH_RTP_CRYPTO_SEND, 0);
 
 		switch_core_media_build_crypto(session->media_handle,
-									   SWITCH_MEDIA_TYPE_VIDEO, 0, smh->crypto_suite_order[i], SWITCH_RTP_CRYPTO_SEND, 0);
+									   SWITCH_MEDIA_TYPE_VIDEO, SWITCH_NO_CRYPTO_TAG, smh->crypto_suite_order[i], SWITCH_RTP_CRYPTO_SEND, 0);
 	}
 
 }
@@ -1957,7 +1980,7 @@ static void check_jb(switch_core_session_t *session, const char *input, int32_t
 static void check_jb_sync(switch_core_session_t *session)
 {
 	int32_t jb_sync_msec = 0;
-	uint32_t fps, frames = 0;
+	uint32_t fps = 0, frames = 0;
 	uint32_t min_frames = 0;
 	uint32_t max_frames = 0;
 	uint32_t cur_frames = 0;
@@ -1999,14 +2022,8 @@ static void check_jb_sync(switch_core_session_t *session)
 		}
 	}
 	
-	if (smh->vid_frames < 10) {
-		fps = 30; 
-	} else {
-		fps = switch_core_media_get_video_fps(session);
-	}
+	fps = switch_core_media_get_video_fps(session);
 	
-	if (fps < 15) return;
-
 	switch_rtp_get_video_buffer_size(v_engine->rtp_session, &min_frames, &max_frames, &cur_frames, NULL);
 
 	if (!frames) {
@@ -2035,11 +2052,13 @@ static void check_jb_sync(switch_core_session_t *session)
 					  switch_core_session_get_uuid(session),
 					  switch_channel_get_name(session->channel),
 					  switch_channel_get_variable_dup(session->channel, "caller_id_name", SWITCH_FALSE, -1),
-					  jb_sync_msec, frames, fps, sync_audio ? "yes" : "no", sync_video ? "yes" : "no");
+					  jb_sync_msec, frames, video_globals.fps, sync_audio ? "yes" : "no", sync_video ? "yes" : "no");
 	
 	if (sync_audio) {
 		check_jb(session, NULL, jb_sync_msec, 0, SWITCH_TRUE);
 	}
+
+	video_globals.synced++;
 }
 
 
@@ -2113,17 +2132,10 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_read_frame(switch_core_session
 		return SWITCH_STATUS_FALSE;
 	}
 
-	switch_assert(engine->rtp_session != NULL);
-
-
 	if (!switch_channel_up_nosig(session->channel) || !switch_rtp_ready(engine->rtp_session) || switch_channel_test_flag(session->channel, CF_NOT_READY)) {
 		return SWITCH_STATUS_FALSE;
 	}
 
-	if (switch_channel_test_flag(session->channel, CF_LEG_HOLDING)) {
-		return SWITCH_STATUS_INUSE;
-	}
-	
 	if (smh->read_mutex[type] && switch_mutex_trylock(smh->read_mutex[type]) != SWITCH_STATUS_SUCCESS) {
 		/* return CNG, another thread is already reading  */
 		switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG1, "%s is already being read for %s\n", 
@@ -2158,12 +2170,26 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_read_frame(switch_core_session
 			goto end;
 		}
 
+		if (switch_channel_test_flag(session->channel, CF_LEG_HOLDING)) {
+			status = SWITCH_STATUS_INUSE;
+			goto end;
+		}
+	
+		if (status == SWITCH_STATUS_BREAK) {
+			goto end;
+		}
+
 		if (type == SWITCH_MEDIA_TYPE_VIDEO && engine->read_frame.m) {
+			
 			if (!smh->vid_started) {
 				smh->vid_started = switch_epoch_time_now(NULL);
 			}
 			smh->vid_frames++;
 
+			if ((smh->vid_frames % 15) == 0) {
+				switch_core_media_get_video_fps(session);
+			}
+			
 			if (smh->vid_frames == 1 || ((smh->vid_frames % 300) == 0)) {
 				check_jb_sync(session);
 			}
@@ -2629,20 +2655,27 @@ static void switch_core_session_parse_codec_settings(switch_core_session_t *sess
 	switch(type) {
 	case SWITCH_MEDIA_TYPE_AUDIO:
 		break;
-	case SWITCH_MEDIA_TYPE_VIDEO:
-		{
-			const char *bwv = switch_channel_get_variable(session->channel, "rtp_video_max_bandwidth");
+	case SWITCH_MEDIA_TYPE_VIDEO: {
+		uint32_t system_bw = 0;
 
-			if (!bwv) {
-				bwv = switch_channel_get_variable(session->channel, "rtp_video_max_bandwidth_out");
-			}
-
-			if (!bwv) {
-				bwv = "1mb";
-			}
-			
-			engine->codec_settings.video.bandwidth = switch_parse_bandwidth_string(bwv);
+		const char *bwv = switch_channel_get_variable(session->channel, "rtp_video_max_bandwidth");
+		
+		if (!bwv) {
+			bwv = switch_channel_get_variable(session->channel, "rtp_video_max_bandwidth_out");
 		}
+		
+		if (!bwv) {
+			bwv = "1mb";
+		}
+		
+		system_bw = switch_parse_bandwidth_string(bwv);
+
+		if (engine->sdp_bw && engine->sdp_bw <= system_bw) {
+			engine->codec_settings.video.bandwidth = engine->sdp_bw;
+		} else {
+			engine->codec_settings.video.bandwidth = system_bw;
+		}
+	}
 		break;
 	default:
 		break;
@@ -3835,7 +3868,8 @@ SWITCH_DECLARE(uint8_t) switch_core_media_negotiate_sdp(switch_core_session_t *s
 							switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "AUDIO RTP REPORTS ERROR: [%s]\n", err);
 							switch_channel_hangup(channel, SWITCH_CAUSE_INCOMPATIBLE_DESTINATION);
 						}
-						
+
+						switch_core_media_check_autoadj(session);
 					}
 
 					
@@ -4021,7 +4055,7 @@ SWITCH_DECLARE(uint8_t) switch_core_media_negotiate_sdp(switch_core_session_t *s
 					maxptime = atoi(attr->a_value);
 				} else if (got_crypto < 1 && !strcasecmp(attr->a_name, "crypto") && !zstr(attr->a_value)) {
 					int crypto_tag;
-
+					
 					if (!(smh->mparams->ndlb & SM_NDLB_ALLOW_CRYPTO_IN_AVP) && 
 						!switch_true(switch_channel_get_variable(session->channel, "rtp_allow_crypto_in_avp"))) {
 						if (m->m_proto != sdp_proto_srtp && !got_webrtc) {
@@ -4529,6 +4563,18 @@ SWITCH_DECLARE(uint8_t) switch_core_media_negotiate_sdp(switch_core_session_t *s
 			v_engine->rmode = sdp_media_flow(m->m_mode);
 
 			if (sdp_type == SDP_TYPE_REQUEST) {
+				sdp_bandwidth_t *bw;
+				int tias = 0;
+				
+				for (bw = m->m_bandwidths; bw; bw = bw->b_next) {
+					if (bw->b_modifier == sdp_bw_as && !tias) {
+						v_engine->sdp_bw = bw->b_value / 1024;
+					} else if (bw->b_modifier == sdp_bw_tias) {
+						tias = 1;
+						v_engine->sdp_bw = bw->b_value / 1024;
+					}
+				}
+
 				switch(v_engine->rmode) {
 				case SWITCH_MEDIA_FLOW_RECVONLY:
 					switch_channel_set_variable(smh->session->channel, "video_media_flow", "sendonly");
@@ -4935,21 +4981,106 @@ SWITCH_DECLARE(switch_file_handle_t *) switch_core_media_get_video_file(switch_c
 
 	v_engine = &smh->engines[SWITCH_MEDIA_TYPE_VIDEO];	
 
-	switch_mutex_lock(v_engine->mh.file_mutex);
+
 
 	if (rw == SWITCH_RW_READ) {
+		switch_mutex_lock(v_engine->mh.file_read_mutex);
 		fh = smh->video_read_fh;
+		switch_mutex_unlock(v_engine->mh.file_read_mutex);
 	} else {
+		switch_mutex_lock(v_engine->mh.file_write_mutex);
 		fh = smh->video_write_fh;
+		switch_mutex_unlock(v_engine->mh.file_write_mutex);
 	}
 
-	switch_mutex_unlock(v_engine->mh.file_mutex);
+
 
 	return fh;
 }
 
 
-SWITCH_DECLARE(switch_status_t) switch_core_media_set_video_file(switch_core_session_t *session, switch_file_handle_t *fh, switch_rw_t rw)
+/* Thread body: reads frames from smh->video_write_fh at a timer-paced rate and writes them to the session until stopped. */
+static void *SWITCH_THREAD_FUNC video_write_thread(switch_thread_t *thread, void *obj)
+{
+	switch_core_session_t *session = (switch_core_session_t *) obj;
+	switch_media_handle_t *smh;
+	unsigned char *buf = NULL;
+	switch_frame_t fr = { 0 };
+	switch_rtp_engine_t *v_engine;
+	int buflen = SWITCH_RTP_MAX_BUF_LEN;
+	switch_timer_t timer = { 0 };
+	int fps;
+	switch_video_read_flag_t read_flags = SVR_FLUSH|SVR_BLOCK;
+
+	if (switch_core_session_read_lock(session) != SWITCH_STATUS_SUCCESS) {
+		return NULL;
+	}
+
+	if (!(smh = session->media_handle)) {
+		/* the read lock taken above must not leak on this early exit */
+		switch_core_session_rwunlock(session);
+		return NULL;
+	}
+
+	switch_channel_set_flag(session->channel, CF_VIDEO_WRITING);
+	v_engine = &smh->engines[SWITCH_MEDIA_TYPE_VIDEO];
+
+	/* the payload area (fr.data) starts 12 bytes into the packet buffer */
+	buf = switch_core_session_alloc(session, buflen);
+	fr.packet = buf;
+	fr.packetlen = buflen;
+	fr.data = buf + 12;
+	fr.buflen = buflen - 12;
+	switch_core_media_gen_key_frame(session);
+
+	/* pace by the file's source fps, else the last globally measured fps, else 15 */
+	if (smh->video_write_fh->mm.source_fps) {
+		fps = (int) smh->video_write_fh->mm.source_fps;
+	} else {
+		fps = video_globals.fps;
+	}
+	if (fps <= 0) {
+		fps = 15;
+	}
+
+	switch_core_timer_init(&timer, "soft", (int)(1000 / fps) , 1, switch_core_session_get_pool(session));
+
+	while (smh->video_write_thread_running > 0 &&
+		   switch_channel_up_nosig(session->channel) && smh->video_write_fh && switch_test_flag(smh->video_write_fh, SWITCH_FILE_OPEN)) {
+		switch_status_t wstatus = SWITCH_STATUS_FALSE;
+		int src_fps;
+
+		switch_core_timer_next(&timer);
+		switch_mutex_lock(v_engine->mh.file_write_mutex);
+		src_fps = smh->video_write_fh ? (int) smh->video_write_fh->mm.source_fps : 0;
+		if (src_fps > 0 && src_fps != fps) {
+			/* adopt the new rate first; otherwise the timer is rebuilt at the stale rate on every pass */
+			fps = src_fps;
+			switch_core_timer_destroy(&timer);
+			switch_core_timer_init(&timer, "soft", (int)(1000 / fps) , 1, switch_core_session_get_pool(session));
+		}
+		if (smh->video_write_fh && !switch_test_flag(smh->video_write_fh, SWITCH_FILE_FLAG_VIDEO_EOF)) {
+			wstatus = switch_core_file_read_video(smh->video_write_fh, &fr, read_flags);
+
+			if (wstatus == SWITCH_STATUS_SUCCESS) {
+				switch_core_session_write_video_frame(session, &fr, SWITCH_IO_FLAG_NONE, SVR_FLUSH);
+				switch_img_free(&fr.img);
+			} else if (wstatus != SWITCH_STATUS_BREAK && wstatus != SWITCH_STATUS_IGNORE) {
+				switch_set_flag_locked(smh->video_write_fh, SWITCH_FILE_FLAG_VIDEO_EOF);
+			}
+		}
+		switch_mutex_unlock(v_engine->mh.file_write_mutex);
+	}
+	switch_core_timer_destroy(&timer);
+	/* finish touching session->channel and smh before giving up the read lock taken on entry */
+	switch_channel_clear_flag(session->channel, CF_VIDEO_WRITING);
+	smh->video_write_thread_running = 0;
+	switch_core_session_rwunlock(session);
+
+	return NULL;
+}
+
+SWITCH_DECLARE(switch_status_t) switch_core_media_lock_video_file(switch_core_session_t *session, switch_rw_t rw)
 {
 	switch_media_handle_t *smh;
 	switch_rtp_engine_t *v_engine;
@@ -4964,6 +5095,63 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_set_video_file(switch_core_ses
 		return SWITCH_STATUS_FALSE;
 	}
 
+	v_engine = &smh->engines[SWITCH_MEDIA_TYPE_VIDEO];
+
+	if (rw == SWITCH_RW_READ) {
+		switch_mutex_lock(v_engine->mh.file_read_mutex);
+	} else {
+		switch_mutex_lock(v_engine->mh.file_write_mutex);
+	}
+
+	return SWITCH_STATUS_SUCCESS;
+
+}
+
+/*
+ * Release the video file mutex for the given direction: file_read_mutex when
+ * rw is SWITCH_RW_READ, file_write_mutex otherwise. Returns SWITCH_STATUS_FALSE
+ * (without unlocking) if CF_VIDEO is not set or there is no media handle.
+ */
+SWITCH_DECLARE(switch_status_t) switch_core_media_unlock_video_file(switch_core_session_t *session, switch_rw_t rw)
+{
+	switch_media_handle_t *handle;
+	struct media_helper *helper;
+	switch_mutex_t *file_mutex;
+
+	switch_assert(session);
+
+	/* evaluation order is unchanged: the channel flag is tested before the media handle */
+	if (!switch_channel_test_flag(session->channel, CF_VIDEO) || !(handle = session->media_handle)) {
+		return SWITCH_STATUS_FALSE;
+	}
+
+	helper = &handle->engines[SWITCH_MEDIA_TYPE_VIDEO].mh;
+	file_mutex = (rw == SWITCH_RW_READ) ? helper->file_read_mutex : helper->file_write_mutex;
+
+	switch_mutex_unlock(file_mutex);
+
+	return SWITCH_STATUS_SUCCESS;
+}
+
+SWITCH_DECLARE(switch_status_t) switch_core_media_set_video_file(switch_core_session_t *session, switch_file_handle_t *fh, switch_rw_t rw)
+{
+	switch_media_handle_t *smh;
+	switch_rtp_engine_t *v_engine;
+
+	switch_assert(session);
+
+	if (!(smh = session->media_handle)) {
+		return SWITCH_STATUS_FALSE;
+	}
+
+	if (!smh->video_read_fh && !smh->video_read_fh && !switch_channel_test_flag(session->channel, CF_VIDEO)) {
+		return SWITCH_STATUS_FALSE;
+	}
+
+	if (fh && !switch_core_file_has_video(fh, SWITCH_TRUE)) {
+		return SWITCH_STATUS_FALSE;
+	}
+
 	v_engine = &smh->engines[SWITCH_MEDIA_TYPE_VIDEO];	
 
 	switch_core_session_start_video_thread(session);
@@ -4972,9 +5160,17 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_set_video_file(switch_core_ses
 	//	return SWITCH_STATUS_FALSE;
 	//}
 
-	switch_mutex_lock(v_engine->mh.file_mutex);
+
 
 	if (rw == SWITCH_RW_READ) {
+		switch_mutex_lock(v_engine->mh.file_read_mutex);
+
+		if (fh && smh->video_read_fh) {
+			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "File is already open\n");
+			switch_mutex_unlock(v_engine->mh.file_read_mutex);
+			return SWITCH_STATUS_FALSE; 
+		}
+
 		
 		if (fh) {
 			switch_channel_set_flag_recursive(session->channel, CF_VIDEO_DECODED_READ);
@@ -4990,7 +5186,17 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_set_video_file(switch_core_ses
 
 		smh->video_read_fh = fh;
 
+		switch_mutex_unlock(v_engine->mh.file_read_mutex);
+
 	} else {
+		switch_mutex_lock(v_engine->mh.file_write_mutex);
+		if (fh && smh->video_write_fh) {
+			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "File is already open\n");
+			smh->video_write_fh = fh;
+			switch_mutex_unlock(v_engine->mh.file_write_mutex);
+			return SWITCH_STATUS_SUCCESS; 
+		}
+
 		if (fh) {
 			switch_channel_set_flag(session->channel, CF_VIDEO_WRITE_FILE_ATTACHED);
 		} else {
@@ -4998,17 +5204,64 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_set_video_file(switch_core_ses
 		}
 
 		switch_core_media_gen_key_frame(session);
+
+		if (fh) {
+			switch_threadattr_t *thd_attr = NULL;
+
+			switch_threadattr_create(&thd_attr, switch_core_session_get_pool(session));
+			switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE);
+			smh->video_write_thread_running = 1;
+			switch_thread_create(&smh->video_write_thread, thd_attr, video_write_thread, session, switch_core_session_get_pool(session));
+		}
+
+		if (!fh && smh->video_write_thread) {
+			switch_status_t st;
+
+			if (smh->video_write_thread_running > 0) {
+				smh->video_write_thread_running = -1;
+			}
+			switch_mutex_unlock(v_engine->mh.file_write_mutex);
+			switch_thread_join(&st, smh->video_write_thread);
+			switch_mutex_lock(v_engine->mh.file_write_mutex);
+			smh->video_write_thread = NULL;
+		}
+		
 		smh->video_write_fh = fh;
+
+		switch_mutex_unlock(v_engine->mh.file_write_mutex);
 	}
 
 	if (!fh) switch_channel_video_sync(session->channel);
 	
 	switch_core_session_wake_video_thread(session);
-	switch_mutex_unlock(v_engine->mh.file_mutex);
+
 	
 	return SWITCH_STATUS_SUCCESS;
 }
 
+/* Return video_globals.cur_cpu and advance it, wrapping to 0 when it reaches cpu_count; done under video_globals.mutex. */
+int next_cpu(void)
+{
+	int picked, following;
+
+	switch_mutex_lock(video_globals.mutex);
+	picked = video_globals.cur_cpu;
+	following = picked + 1;
+	video_globals.cur_cpu = (following == video_globals.cpu_count) ? 0 : following;
+	switch_mutex_unlock(video_globals.mutex);
+
+	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "Binding to CPU %d\n", picked);
+	return picked;
+}
+
+/* Pass the index from next_cpu() to switch_core_thread_set_cpu_affinity(); does nothing unless cpu_count > 1. */
+SWITCH_DECLARE(void) switch_core_autobind_cpu(void)
+{
+	if (video_globals.cpu_count <= 1) return;
+	switch_core_thread_set_cpu_affinity(next_cpu());
+}
+
+
 static void *SWITCH_THREAD_FUNC video_helper_thread(switch_thread_t *thread, void *obj)
 {
 	struct media_helper *mh = obj;
@@ -5018,18 +5271,20 @@ static void *SWITCH_THREAD_FUNC video_helper_thread(switch_thread_t *thread, voi
 	switch_frame_t *read_frame = NULL;
 	switch_media_handle_t *smh;
 	uint32_t loops = 0, xloops = 0, vloops = 0;
+	switch_image_t *blank_img = NULL;
 	switch_frame_t fr = { 0 };
 	unsigned char *buf = NULL;
-	switch_image_t *blank_img = NULL;
 	switch_rgb_color_t bgcolor;
 	switch_rtp_engine_t *v_engine = NULL;
-	
 	const char *var;
+	int buflen = SWITCH_RTP_MAX_BUF_LEN;
 
 	if (!(smh = session->media_handle)) {
 		return NULL;
 	}
 
+	switch_core_autobind_cpu();
+
 	if ((var = switch_channel_get_variable(session->channel, "core_video_blank_image"))) {
 		blank_img = switch_img_read_png(var, SWITCH_IMG_FMT_I420);
 	}
@@ -5057,6 +5312,14 @@ static void *SWITCH_THREAD_FUNC video_helper_thread(switch_thread_t *thread, voi
 					  switch_channel_get_name(session->channel), switch_channel_test_flag(channel, CF_VIDEO_ECHO) ? "on" : "off");
 	switch_core_session_request_video_refresh(session);
 
+	buf = switch_core_session_alloc(session, buflen);
+	fr.packet = buf;
+	fr.packetlen = buflen;
+	fr.data = buf + 12;
+	fr.buflen = buflen - 12;
+
+	switch_core_media_gen_key_frame(session);
+
 	while (switch_channel_up_nosig(channel)) {
 		int send_blank = 0;
 
@@ -5109,62 +5372,38 @@ static void *SWITCH_THREAD_FUNC video_helper_thread(switch_thread_t *thread, voi
 			}
 		}
 
-		//if (!smh->video_write_fh || !switch_channel_test_flag(channel, CF_VIDEO_READY)) {
-		status = switch_core_session_read_video_frame(session, &read_frame, smh->video_write_fh ? SWITCH_IO_FLAG_NOBLOCK : SWITCH_IO_FLAG_NONE, 0);
-		
+		status = switch_core_session_read_video_frame(session, &read_frame, SWITCH_IO_FLAG_NONE, 0);
+
 		if (!SWITCH_READ_ACCEPTABLE(status)) {
 			switch_cond_next();
 			continue;
 		}
-		
-		//if (switch_test_flag(read_frame, SFF_CNG)) {
-		//	continue;
-		//}
-		//}
-		
-		//if (vloops < 300 && (vloops % 100) == 0) {
-		//			switch_core_media_gen_key_frame(session);
-		//switch_core_session_request_video_refresh(session);
-		//}
-		
+
 		vloops++;
-
-		if (!buf) {
-			int buflen = SWITCH_RTP_MAX_BUF_LEN;
-			buf = switch_core_session_alloc(session, buflen);
-			fr.packet = buf;
-			fr.packetlen = buflen;
-			fr.data = buf + 12;
-			fr.buflen = buflen - 12;
-			switch_core_media_gen_key_frame(session);
-		}
-
-		if (switch_channel_test_flag(channel, CF_VIDEO_READY)) {
-			switch_mutex_lock(mh->file_mutex);
-			if (smh->video_write_fh && switch_channel_ready(session->channel) && switch_test_flag(smh->video_write_fh, SWITCH_FILE_OPEN)) {
-				switch_status_t wstatus = switch_core_file_read_video(smh->video_write_fh, &fr, 0);
-				if (wstatus == SWITCH_STATUS_SUCCESS) {
-					switch_core_session_write_video_frame(session, &fr, SWITCH_IO_FLAG_NONE, SVR_FLUSH);
-					switch_img_free(&fr.img);
-				} else if (wstatus != SWITCH_STATUS_BREAK && wstatus != SWITCH_STATUS_IGNORE) {
-					smh->video_write_fh = NULL;
-				}
-				send_blank = 0;
-			} else if (smh->video_read_fh && switch_test_flag(smh->video_read_fh, SWITCH_FILE_OPEN) && read_frame->img) {
+		
+		send_blank = 1;
+		
+		if (switch_channel_test_flag(channel, CF_VIDEO_READY) && !switch_test_flag(read_frame, SFF_CNG)) {
+			switch_mutex_lock(mh->file_read_mutex);
+			if (smh->video_read_fh && switch_test_flag(smh->video_read_fh, SWITCH_FILE_OPEN) && read_frame->img) {
+				smh->video_read_fh->mm.fps = smh->vid_params.fps;
 				switch_core_file_write_video(smh->video_read_fh, read_frame);
-				send_blank = 0;
 			} 
-			switch_mutex_unlock(mh->file_mutex);
-		} else if (switch_channel_test_flag(channel, CF_VIDEO_DECODED_READ) || v_engine->smode == SWITCH_MEDIA_FLOW_SENDONLY) {
-			send_blank = 1;
+			switch_mutex_unlock(mh->file_read_mutex);
+		}
+		
+		if (switch_channel_test_flag(channel, CF_VIDEO_WRITING) || session->video_read_callback) {
+			send_blank = 0;
 		}
 
-		if (blank_img && (send_blank || switch_channel_test_flag(channel, CF_VIDEO_BLANK)) && !session->video_read_callback) {
-			fr.img = blank_img;
-			switch_yield(10000);
-			switch_core_session_write_video_frame(session, &fr, SWITCH_IO_FLAG_FORCE, 0);
-		} else if (read_frame && (switch_channel_test_flag(channel, CF_VIDEO_ECHO))) {
-			switch_core_session_write_video_frame(session, read_frame, SWITCH_IO_FLAG_NONE, 0);
+		if (send_blank) {
+			if (read_frame && (switch_channel_test_flag(channel, CF_VIDEO_ECHO))) {
+				switch_core_session_write_video_frame(session, read_frame, SWITCH_IO_FLAG_NONE, 0);
+			} else if (blank_img) {
+				fr.img = blank_img;
+				switch_yield(10000);
+				switch_core_session_write_video_frame(session, &fr, SWITCH_IO_FLAG_FORCE, 0);
+			}
 		}
 	}
 
@@ -5215,7 +5454,8 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_start_video_thread(switch_co
 	
 	switch_thread_cond_create(&v_engine->mh.cond, pool);
 	switch_mutex_init(&v_engine->mh.cond_mutex, SWITCH_MUTEX_NESTED, pool);
-	switch_mutex_init(&v_engine->mh.file_mutex, SWITCH_MUTEX_NESTED, pool);
+	switch_mutex_init(&v_engine->mh.file_read_mutex, SWITCH_MUTEX_NESTED, pool);
+	switch_mutex_init(&v_engine->mh.file_write_mutex, SWITCH_MUTEX_NESTED, pool);
 	switch_mutex_init(&smh->read_mutex[SWITCH_MEDIA_TYPE_VIDEO], SWITCH_MUTEX_NESTED, pool);
 	switch_mutex_init(&smh->write_mutex[SWITCH_MEDIA_TYPE_VIDEO], SWITCH_MUTEX_NESTED, pool);
 	switch_thread_create(&v_engine->media_thread, thd_attr, video_helper_thread, &v_engine->mh, switch_core_session_get_pool(session));
@@ -6859,7 +7099,7 @@ static void generate_m(switch_core_session_t *session, char *buf, size_t buflen,
 		//switch_snprintf(buf + strlen(buf), buflen - strlen(buf), " %d", cng_type);
 	//}
 		
-	switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "\n");
+	switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "\r\n");
 
 
 	memset(already_did, 0, sizeof(already_did));
@@ -6918,15 +7158,15 @@ static void generate_m(switch_core_session_t *session, char *buf, size_t buflen,
 			int channels = get_channels(imp->iananame, imp->number_of_channels);
 
 			if (channels > 1) {
-				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtpmap:%d %s/%d/%d\n", smh->ianacodes[i], imp->iananame, rate, channels);
+				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtpmap:%d %s/%d/%d\r\n", smh->ianacodes[i], imp->iananame, rate, channels);
 								
 			} else {
-				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtpmap:%d %s/%d\n", smh->ianacodes[i], imp->iananame, rate);
+				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtpmap:%d %s/%d\r\n", smh->ianacodes[i], imp->iananame, rate);
 			}
 		}
 
 		if (fmtp) {
-			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=fmtp:%d %s\n", smh->ianacodes[i], fmtp);
+			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=fmtp:%d %s\r\n", smh->ianacodes[i], fmtp);
 		}
 	}
 
@@ -6936,30 +7176,30 @@ static void generate_m(switch_core_session_t *session, char *buf, size_t buflen,
 
 		for (i = 0; i < smh->num_rates; i++) {
 			if (switch_channel_test_flag(session->channel, CF_AVPF)) {
-				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtpmap:%d telephone-event/%d\n", 
+				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtpmap:%d telephone-event/%d\r\n", 
 								smh->dtmf_ianacodes[i], smh->rates[i]);
 			} else {
-				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtpmap:%d telephone-event/%d\na=fmtp:%d 0-16\n", 
+				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtpmap:%d telephone-event/%d\r\na=fmtp:%d 0-16\r\n", 
 								smh->dtmf_ianacodes[i], smh->rates[i], smh->dtmf_ianacodes[i]);
 			}
 		}
 	}
 
 	if (!zstr(a_engine->local_dtls_fingerprint.type) && secure) {
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=fingerprint:%s %s\na=setup:%s\n", a_engine->local_dtls_fingerprint.type, 
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=fingerprint:%s %s\na=setup:%s\r\n", a_engine->local_dtls_fingerprint.type, 
 						a_engine->local_dtls_fingerprint.str, get_setup(a_engine, session, sdp_type));
 	}
 	
 	if (smh->mparams->rtcp_audio_interval_msec) {
 		if (a_engine->rtcp_mux > 0) {
-			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp-mux\n");
-			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp:%d IN %s %s\n", port, family, ip);
+			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp-mux\r\n");
+			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp:%d IN %s %s\r\n", port, family, ip);
 		} else {
-			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp:%d IN %s %s\n", port + 1, family, ip);
+			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp:%d IN %s %s\r\n", port + 1, family, ip);
 		}
 	}
 
-	//switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u\n", a_engine->ssrc);
+	//switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u\r\n", a_engine->ssrc);
 
 	if (a_engine->ice_out.cands[0][0].ready) {
 		char tmp1[11] = "";
@@ -6981,17 +7221,17 @@ static void generate_m(switch_core_session_t *session, char *buf, size_t buflen,
 
 		ice_out = &a_engine->ice_out;
 
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ssrc:%u cname:%s\n", a_engine->ssrc, smh->cname);
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ssrc:%u msid:%s a0\n", a_engine->ssrc, smh->msid);
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ssrc:%u mslabel:%s\n", a_engine->ssrc, smh->msid);
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ssrc:%u label:%sa0\n", a_engine->ssrc, smh->msid);
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ssrc:%u cname:%s\r\n", a_engine->ssrc, smh->cname);
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ssrc:%u msid:%s a0\r\n", a_engine->ssrc, smh->msid);
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ssrc:%u mslabel:%s\r\n", a_engine->ssrc, smh->msid);
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ssrc:%u label:%sa0\r\n", a_engine->ssrc, smh->msid);
 
 
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ice-ufrag:%s\n", ice_out->ufrag);
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ice-pwd:%s\n", ice_out->pwd);
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ice-ufrag:%s\r\n", ice_out->ufrag);
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ice-pwd:%s\r\n", ice_out->pwd);
 
 
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=candidate:%s 1 %s %u %s %d typ host generation 0\n", 
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=candidate:%s 1 %s %u %s %d typ host generation 0\r\n", 
 						tmp1, ice_out->cands[0][0].transport, c1,
 						ice_out->cands[0][0].con_addr, ice_out->cands[0][0].con_port
 						);
@@ -7000,7 +7240,7 @@ static void generate_m(switch_core_session_t *session, char *buf, size_t buflen,
 			strcmp(a_engine->local_sdp_ip, ice_out->cands[0][0].con_addr)
 			&& a_engine->local_sdp_port != ice_out->cands[0][0].con_port) {
 
-			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=candidate:%s 1 %s %u %s %d typ srflx raddr %s rport %d generation 0\n", 
+			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=candidate:%s 1 %s %u %s %d typ srflx raddr %s rport %d generation 0\r\n", 
 							tmp2, ice_out->cands[0][0].transport, c2,
 							ice_out->cands[0][0].con_addr, ice_out->cands[0][0].con_port,
 							a_engine->local_sdp_ip, a_engine->local_sdp_port
@@ -7010,7 +7250,7 @@ static void generate_m(switch_core_session_t *session, char *buf, size_t buflen,
 		if (a_engine->rtcp_mux < 1 || switch_channel_direction(session->channel) == SWITCH_CALL_DIRECTION_OUTBOUND || switch_channel_test_flag(session->channel, CF_RECOVERING)) {
 			
 
-			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=candidate:%s 2 %s %u %s %d typ host generation 0\n", 
+			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=candidate:%s 2 %s %u %s %d typ host generation 0\r\n", 
 							tmp1, ice_out->cands[0][0].transport, c1,
 							ice_out->cands[0][0].con_addr, ice_out->cands[0][0].con_port + (a_engine->rtcp_mux > 0 ? 0 : 1)
 							);
@@ -7019,7 +7259,7 @@ static void generate_m(switch_core_session_t *session, char *buf, size_t buflen,
 				strcmp(a_engine->local_sdp_ip, ice_out->cands[0][1].con_addr)
 				&& a_engine->local_sdp_port != ice_out->cands[0][1].con_port) {
 				
-				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=candidate:%s 2 %s %u %s %d typ srflx raddr %s rport %d generation 0\n", 
+				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=candidate:%s 2 %s %u %s %d typ srflx raddr %s rport %d generation 0\r\n", 
 								tmp2, ice_out->cands[0][0].transport, c2,
 								ice_out->cands[0][0].con_addr, ice_out->cands[0][0].con_port + (a_engine->rtcp_mux > 0 ? 0 : 1),
 								a_engine->local_sdp_ip, a_engine->local_sdp_port + (a_engine->rtcp_mux > 0 ? 0 : 1)
@@ -7030,7 +7270,7 @@ static void generate_m(switch_core_session_t *session, char *buf, size_t buflen,
 			
 				
 #ifdef GOOGLE_ICE
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ice-options:google-ice\n");
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ice-options:google-ice\r\n");
 #endif
 	}
 
@@ -7042,28 +7282,28 @@ static void generate_m(switch_core_session_t *session, char *buf, size_t buflen,
 			switch_rtp_crypto_key_type_t j = SUITES[smh->crypto_suite_order[i]].type;
 
 			if ((a_engine->crypto_type == j || a_engine->crypto_type == CRYPTO_INVALID) && !zstr(a_engine->ssec[j].local_crypto_key)) {
-				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=crypto:%s\n", a_engine->ssec[j].local_crypto_key);
+				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=crypto:%s\r\n", a_engine->ssec[j].local_crypto_key);
 			}
 		}
-		//switch_snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "a=encryption:optional\n");
+		//switch_snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "a=encryption:optional\r\n");
 	}
 
 	if (cng_type) {
 		for (i = 0; i < smh->num_rates; i++) {
 			//if (smh->rates[i] == 8000) {
-			//	switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtpmap:%d CN/%d\n", cng_type, smh->rates[i]);
+			//	switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtpmap:%d CN/%d\r\n", cng_type, smh->rates[i]);
 			//} else {
-				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtpmap:%d CN/%d\n", smh->cng_ianacodes[i], smh->rates[i]);
+				switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtpmap:%d CN/%d\r\n", smh->cng_ianacodes[i], smh->rates[i]);
 				//}
 		}
 	} else {
 		if (switch_media_handle_test_media_flag(smh, SCMF_SUPPRESS_CNG)) { 
-			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=silenceSupp:off - - - -\n");
+			switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=silenceSupp:off - - - -\r\n");
 		}
 	}
 
 	if (append_audio) {
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "%s%s", append_audio, end_of(append_audio) == '\n' ? "" : "\n");
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "%s%s", append_audio, end_of(append_audio) == '\n' ? "" : "\r\n");
 	}
 
 	if (!cur_ptime) {
@@ -7071,18 +7311,18 @@ static void generate_m(switch_core_session_t *session, char *buf, size_t buflen,
 	}
 	
 	if (!noptime && cur_ptime) {
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ptime:%d\n", cur_ptime);
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=ptime:%d\r\n", cur_ptime);
 	}
 
 	local_sdp_audio_zrtp_hash = switch_core_media_get_zrtp_hash(session, SWITCH_MEDIA_TYPE_AUDIO, SWITCH_TRUE);
 
 	if (local_sdp_audio_zrtp_hash) {
 		switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Adding audio a=zrtp-hash:%s\n", local_sdp_audio_zrtp_hash);
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=zrtp-hash:%s\n", local_sdp_audio_zrtp_hash);
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=zrtp-hash:%s\r\n", local_sdp_audio_zrtp_hash);
 	}
 
 	if (!zstr(sr)) {
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=%s\n", sr);
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=%s\r\n", sr);
 	}
 }
 
@@ -7188,19 +7428,19 @@ SWITCH_DECLARE(void)switch_core_media_set_local_sdp(switch_core_session_t *sessi
 static void add_fb(char *buf, uint32_t buflen, int pt, int fir, int nack, int pli, int tmmbr)
 {
 	if (fir) {
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp-fb:%d ccm fir\n", pt);
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp-fb:%d ccm fir\r\n", pt); /* each enabled flag appends one CRLF-terminated a=rtcp-fb line for payload type pt */
 	}
 
 	if (tmmbr) {
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp-fb:%d ccm tmmbr\n", pt);
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp-fb:%d ccm tmmbr\r\n", pt);
 	}
 
 	if (nack) {
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp-fb:%d nack\n", pt);
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp-fb:%d nack\r\n", pt);
 	}
 
 	if (pli) {
-		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp-fb:%d nack pli\n", pt);
+		switch_snprintf(buf + strlen(buf), buflen - strlen(buf), "a=rtcp-fb:%d nack pli\r\n", pt);
 	}
 
 }
@@ -7510,25 +7750,25 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 	if ((smh->mparams->ndlb & SM_NDLB_SENDRECV_IN_SESSION) ||
 		((var_val = switch_channel_get_variable(session->channel, "ndlb_sendrecv_in_session")) && switch_true(var_val))) {
 		if (!zstr(sr)) {
-			switch_snprintf(srbuf, sizeof(srbuf), "a=%s\n", sr);
+			switch_snprintf(srbuf, sizeof(srbuf), "a=%s\r\n", sr);
 		}
 		sr = NULL;
 	}
 
 	family = strchr(ip, ':') ? "IP6" : "IP4";
 	switch_snprintf(buf, SDPBUFLEN,
-					"v=0\n"
-					"o=%s %010u %010u IN %s %s\n"
-					"s=%s\n"
-					"c=IN %s %s\n" 
-					"t=0 0\n"
+					"v=0\r\n"
+					"o=%s %010u %010u IN %s %s\r\n"
+					"s=%s\r\n"
+					"c=IN %s %s\r\n" 
+					"t=0 0\r\n"
 					"%s",
 					username, smh->owner_id, smh->session_id, family, ip, username, family, ip, srbuf);
 
 
 	if (switch_channel_test_flag(smh->session->channel, CF_ICE)) {
 		gen_ice(session, SWITCH_MEDIA_TYPE_AUDIO, ip, port);
-		switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=msid-semantic: WMS %s\n", smh->msid);
+		switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=msid-semantic: WMS %s\r\n", smh->msid);
 	}
 
 	if (a_engine->codec_negotiated) {
@@ -7559,7 +7799,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), " %d", smh->mparams->cng_pt);
 		}
 		
-		switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "\n");
+		switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "\r\n");
 
 
 		rate = a_engine->cur_payload_map->adv_rm_rate;
@@ -7569,22 +7809,22 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 		}
 		
 		if (a_engine->cur_payload_map->adv_channels > 1) {
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%d/%d\n", 
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%d/%d\r\n", 
 							a_engine->cur_payload_map->agreed_pt, a_engine->cur_payload_map->rm_encoding, rate, a_engine->cur_payload_map->adv_channels);
 		} else {
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%d\n", 
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%d\r\n", 
 							a_engine->cur_payload_map->agreed_pt, a_engine->cur_payload_map->rm_encoding, rate);
 		}
 
 		if (fmtp_out) {
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=fmtp:%d %s\n", a_engine->cur_payload_map->agreed_pt, fmtp_out);
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=fmtp:%d %s\r\n", a_engine->cur_payload_map->agreed_pt, fmtp_out);
 		}
 
 		if (switch_media_handle_test_media_flag(smh, SCMF_MULTI_ANSWER_AUDIO)) {
 			switch_mutex_lock(smh->sdp_mutex);
 			for (pmap = a_engine->cur_payload_map; pmap && pmap->allocated; pmap = pmap->next) {
 				if (pmap->pt != a_engine->cur_payload_map->pt) {
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%ld\n",
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%ld\r\n",
 									pmap->pt, pmap->iananame,
 									pmap->rate);
 				}
@@ -7603,18 +7843,18 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 			&& smh->mparams->te > 95) {
 
 			if (switch_channel_test_flag(session->channel, CF_AVPF)) {
-				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d telephone-event/%d\n", 
+				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d telephone-event/%d\r\n", 
 								smh->mparams->te, smh->mparams->te_rate);
 			} else {
-				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d telephone-event/%d\na=fmtp:%d 0-16\n", 
+				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d telephone-event/%d\r\na=fmtp:%d 0-16\r\n", /* CRLF ends both the rtpmap and the fmtp line */
 								smh->mparams->te, smh->mparams->te_rate, smh->mparams->te);
 			}
 		}
 
 		if (switch_media_handle_test_media_flag(smh, SCMF_SUPPRESS_CNG)) {
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=silenceSupp:off - - - -\n");
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=silenceSupp:off - - - -\r\n");
 		} else if (smh->mparams->cng_pt && use_cng) {
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d CN/%lu\n", smh->mparams->cng_pt, smh->mparams->cng_rate);
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d CN/%lu\r\n", smh->mparams->cng_pt, smh->mparams->cng_rate);
 
 			if (!a_engine->codec_negotiated) {
 				smh->mparams->cng_pt = 0;
@@ -7622,42 +7862,42 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 		}
 
 		if (append_audio) {
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "%s%s", append_audio, end_of(append_audio) == '\n' ? "" : "\n");
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "%s%s", append_audio, end_of(append_audio) == '\n' ? "" : "\r\n");
 		}
 
 		if (ptime) {
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ptime:%d\n", ptime);
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ptime:%d\r\n", ptime);
 		}
 
 
 		if (local_sdp_audio_zrtp_hash) {
-			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Adding audio a=zrtp-hash:%s\n",
+			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Adding audio a=zrtp-hash:%s\r\n",
 							  local_sdp_audio_zrtp_hash);
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=zrtp-hash:%s\n",
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=zrtp-hash:%s\r\n",
 							local_sdp_audio_zrtp_hash);
 		}
 
 		if (!zstr(sr)) {
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=%s\n", sr);
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=%s\r\n", sr);
 		}
 	
 
 		if (!zstr(a_engine->local_dtls_fingerprint.type)) {
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=fingerprint:%s %s\na=setup:%s\n",
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=fingerprint:%s %s\r\na=setup:%s\r\n", /* CRLF ends both the fingerprint and the setup line */
 							a_engine->local_dtls_fingerprint.type, 
 							a_engine->local_dtls_fingerprint.str, get_setup(a_engine, session, sdp_type));
 		}
 		
 		if (smh->mparams->rtcp_audio_interval_msec) {
 			if (a_engine->rtcp_mux > 0) {
-				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtcp-mux\n");
-				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtcp:%d IN %s %s\n", port, family, ip);
+				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtcp-mux\r\n");
+				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtcp:%d IN %s %s\r\n", port, family, ip);
 			} else {
-				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtcp:%d IN %s %s\n", port + 1, family, ip);
+				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtcp:%d IN %s %s\r\n", port + 1, family, ip);
 			}
 		}
 
-		//switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u\n", a_engine->ssrc);
+		//switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u\r\n", a_engine->ssrc);
 
 		if (a_engine->ice_out.cands[0][0].ready) {
 			char tmp1[11] = "";
@@ -7679,11 +7919,11 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 			ice_out = &a_engine->ice_out;
 			
 			
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ice-ufrag:%s\n", ice_out->ufrag);
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ice-pwd:%s\n", ice_out->pwd);
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ice-ufrag:%s\r\n", ice_out->ufrag);
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ice-pwd:%s\r\n", ice_out->pwd);
 
 
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 1 %s %u %s %d typ host generation 0\n", 
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 1 %s %u %s %d typ host generation 0\r\n", 
 							tmp1, ice_out->cands[0][0].transport, c1,
 							ice_out->cands[0][0].con_addr, ice_out->cands[0][0].con_port
 							);
@@ -7692,7 +7932,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 				strcmp(a_engine->local_sdp_ip, ice_out->cands[0][0].con_addr)
 				&& a_engine->local_sdp_port != ice_out->cands[0][0].con_port) {
 
-				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 1 %s %u %s %d typ srflx raddr %s rport %d generation 0\n", 
+				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 1 %s %u %s %d typ srflx raddr %s rport %d generation 0\r\n", 
 								tmp2, ice_out->cands[0][0].transport, c3,
 								ice_out->cands[0][0].con_addr, ice_out->cands[0][0].con_port,
 								a_engine->local_sdp_ip, a_engine->local_sdp_port
@@ -7702,7 +7942,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 
 			if (a_engine->rtcp_mux < 1 || is_outbound || switch_channel_test_flag(session->channel, CF_RECOVERING)) {
 
-				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 2 %s %u %s %d typ host generation 0\n", 
+				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 2 %s %u %s %d typ host generation 0\r\n", 
 								tmp1, ice_out->cands[0][0].transport, c2,
 								ice_out->cands[0][0].con_addr, ice_out->cands[0][0].con_port + (a_engine->rtcp_mux > 0 ? 0 : 1)
 								);
@@ -7713,7 +7953,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 					strcmp(a_engine->local_sdp_ip, ice_out->cands[0][0].con_addr)
 					&& a_engine->local_sdp_port != ice_out->cands[0][0].con_port) {			
 					
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 2 %s %u %s %d typ srflx raddr %s rport %d generation 0\n", 
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 2 %s %u %s %d typ srflx raddr %s rport %d generation 0\r\n", 
 									tmp2, ice_out->cands[0][0].transport, c4,
 									ice_out->cands[0][0].con_addr, ice_out->cands[0][0].con_port + (a_engine->rtcp_mux > 0 ? 0 : 1),
 									a_engine->local_sdp_ip, a_engine->local_sdp_port + (a_engine->rtcp_mux > 0 ? 0 : 1)
@@ -7722,22 +7962,22 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 			}
 
 
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u cname:%s\n", a_engine->ssrc, smh->cname);
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u msid:%s a0\n", a_engine->ssrc, smh->msid);
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u mslabel:%s\n", a_engine->ssrc, smh->msid);
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u label:%sa0\n", a_engine->ssrc, smh->msid);
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u cname:%s\r\n", a_engine->ssrc, smh->cname);
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u msid:%s a0\r\n", a_engine->ssrc, smh->msid);
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u mslabel:%s\r\n", a_engine->ssrc, smh->msid);
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u label:%sa0\r\n", a_engine->ssrc, smh->msid);
 
 				
 #ifdef GOOGLE_ICE
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ice-options:google-ice\n");
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ice-options:google-ice\r\n");
 #endif
 		}
 
 		if (a_engine->crypto_type != CRYPTO_INVALID && !switch_channel_test_flag(session->channel, CF_DTLS) &&
 			!zstr(a_engine->ssec[a_engine->crypto_type].local_crypto_key) && switch_channel_test_flag(session->channel, CF_SECURE)) {
 
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=crypto:%s\n", a_engine->ssec[a_engine->crypto_type].local_crypto_key);
-		//switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=encryption:optional\n");
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=crypto:%s\r\n", a_engine->ssec[a_engine->crypto_type].local_crypto_key);
+		//switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=encryption:optional\r\n");
 		}
 
 	} else if (smh->mparams->num_codecs) {
@@ -7822,7 +8062,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 	if (!switch_channel_test_flag(session->channel, CF_VIDEO_POSSIBLE)) {
 		if (switch_channel_test_flag(session->channel, CF_VIDEO_SDP_RECVD)) {
 			switch_channel_clear_flag(session->channel, CF_VIDEO_SDP_RECVD);
-			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "m=video 0 %s 19\n", 
+			switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "m=video 0 %s 19\r\n", 
 							get_media_profile_name(session, 
 												   (switch_channel_test_flag(session->channel, CF_SECURE) 
 													&& switch_channel_direction(session->channel) == SWITCH_CALL_DIRECTION_OUTBOUND) || 
@@ -7911,7 +8151,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 					}
 				}
 
-				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "\n");
+				switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "\r\n");
 
 			
 				if (v_engine->codec_negotiated) {
@@ -7927,7 +8167,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 					//}
 
 					rate = v_engine->cur_payload_map->rm_rate;
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%ld\n",
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%ld\r\n",
 									v_engine->cur_payload_map->pt, v_engine->cur_payload_map->rm_encoding,
 									v_engine->cur_payload_map->rm_rate);
 
@@ -7952,7 +8192,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 					}
 				
 					if (pass_fmtp) {
-						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=fmtp:%d %s\n", v_engine->cur_payload_map->pt, pass_fmtp);
+						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=fmtp:%d %s\r\n", v_engine->cur_payload_map->pt, pass_fmtp);
 					}
 
 
@@ -7960,7 +8200,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 						switch_mutex_lock(smh->sdp_mutex);
 						for (pmap = v_engine->cur_payload_map; pmap && pmap->allocated; pmap = pmap->next) {
 							if (pmap->pt != v_engine->cur_payload_map->pt && pmap->negotiated) {
-								switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%ld\n",
+								switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%ld\r\n",
 												pmap->pt, pmap->iananame, pmap->rate);
 							
 							}
@@ -7970,13 +8210,13 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 
 
 					if (append_video) {
-						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "%s%s", append_video, end_of(append_video) == '\n' ? "" : "\n");
+						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "%s%s", append_video, end_of(append_video) == '\n' ? "" : "\r\n");
 					}
 					
 					if (v_engine->smode == SWITCH_MEDIA_FLOW_SENDONLY) {
-						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "%s", "a=sendonly\n");
+						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "%s", "a=sendonly\r\n");
 					} else if (v_engine->smode == SWITCH_MEDIA_FLOW_RECVONLY) {
-						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "%s", "a=recvonly\n");
+						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "%s", "a=recvonly\r\n");
 					}
 
 				} else if (smh->mparams->num_codecs) {
@@ -8020,10 +8260,10 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 						//}
 
 						if (channels > 1) {
-							switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%d/%d\n", ianacode, imp->iananame,
+							switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%d/%d\r\n", ianacode, imp->iananame,
 											imp->samples_per_second, channels);
 						} else {
-							switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%d\n", ianacode, imp->iananame,
+							switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtpmap:%d %s/%d\r\n", ianacode, imp->iananame,
 											imp->samples_per_second);
 						}
 					
@@ -8045,7 +8285,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 						}
 					
 						if (!zstr(fmtp) && strcasecmp(fmtp, "_blank_")) {
-							switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=fmtp:%d %s\n", ianacode, fmtp);
+							switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=fmtp:%d %s\r\n", ianacode, fmtp);
 						}
 					}
 				
@@ -8058,17 +8298,17 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 
 
 				if (!zstr(v_engine->local_dtls_fingerprint.type)) {
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=fingerprint:%s %s\na=setup:%s\n", v_engine->local_dtls_fingerprint.type, 
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=fingerprint:%s %s\r\na=setup:%s\r\n", v_engine->local_dtls_fingerprint.type, /* CRLF ends both lines */
 									v_engine->local_dtls_fingerprint.str, get_setup(v_engine, session, sdp_type));
 				}
 
 
 				if (smh->mparams->rtcp_video_interval_msec) {
 					if (v_engine->rtcp_mux > 0) {
-						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtcp-mux\n");
-						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtcp:%d IN %s %s\n", v_port, family, ip);
+						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtcp-mux\r\n");
+						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtcp:%d IN %s %s\r\n", v_port, family, ip);
 					} else {
-						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtcp:%d IN %s %s\n", v_port + 1, family, ip);
+						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=rtcp:%d IN %s %s\r\n", v_port + 1, family, ip);
 					}
 				}
 
@@ -8084,8 +8324,8 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 				bw = switch_parse_bandwidth_string(vbw);
 				
 				if (bw > 0) {
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "b=AS:%d\n", bw);
-					//switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "b=TIAS:%d\n", bw);
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "b=AS:%d\r\n", bw);
+					//switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "b=TIAS:%d\r\n", bw);
 				}
 
 				if (sdp_type == SDP_TYPE_REQUEST) {
@@ -8142,7 +8382,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 					
 				}
 				
-				//switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u\n", v_engine->ssrc);
+				//switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u\r\n", v_engine->ssrc);
 
 				if (v_engine->ice_out.cands[0][0].ready) {
 					char tmp1[11] = "";
@@ -8164,18 +8404,18 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 					ice_out = &v_engine->ice_out;
 					
 					
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u cname:%s\n", v_engine->ssrc, smh->cname);
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u msid:%s v0\n", v_engine->ssrc, smh->msid);
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u mslabel:%s\n", v_engine->ssrc, smh->msid);
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u label:%sv0\n", v_engine->ssrc, smh->msid);
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u cname:%s\r\n", v_engine->ssrc, smh->cname);
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u msid:%s v0\r\n", v_engine->ssrc, smh->msid);
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u mslabel:%s\r\n", v_engine->ssrc, smh->msid);
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ssrc:%u label:%sv0\r\n", v_engine->ssrc, smh->msid);
 				
 
 				
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ice-ufrag:%s\n", ice_out->ufrag);
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ice-pwd:%s\n", ice_out->pwd);
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ice-ufrag:%s\r\n", ice_out->ufrag);
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ice-pwd:%s\r\n", ice_out->pwd);
 
 
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 1 %s %u %s %d typ host generation 0\n", 
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 1 %s %u %s %d typ host generation 0\r\n", 
 									tmp1, ice_out->cands[0][0].transport, c1,
 									ice_out->cands[0][0].con_addr, ice_out->cands[0][0].con_port
 									);
@@ -8184,7 +8424,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 						strcmp(v_engine->local_sdp_ip, ice_out->cands[0][0].con_addr)
 						&& v_engine->local_sdp_port != ice_out->cands[0][0].con_port) {
 
-						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 1 %s %u %s %d typ srflx raddr %s rport %d generation 0\n", 
+						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 1 %s %u %s %d typ srflx raddr %s rport %d generation 0\r\n", 
 										tmp2, ice_out->cands[0][0].transport, c3,
 										ice_out->cands[0][0].con_addr, ice_out->cands[0][0].con_port,
 										v_engine->local_sdp_ip, v_engine->local_sdp_port
@@ -8194,7 +8434,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 
 					if (v_engine->rtcp_mux < 1 || is_outbound || switch_channel_test_flag(session->channel, CF_RECOVERING)) {
 
-						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 2 %s %u %s %d typ host generation 0\n", 
+						switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 2 %s %u %s %d typ host generation 0\r\n", 
 										tmp1, ice_out->cands[0][0].transport, c2,
 										ice_out->cands[0][0].con_addr, ice_out->cands[0][0].con_port + (v_engine->rtcp_mux > 0 ? 0 : 1)
 										);
@@ -8204,7 +8444,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 							strcmp(v_engine->local_sdp_ip, ice_out->cands[0][1].con_addr)
 							&& v_engine->local_sdp_port != ice_out->cands[0][1].con_port) {
 						
-							switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 2 %s %u %s %d typ srflx generation 0\n", 
+							switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=candidate:%s 2 %s %u %s %d typ srflx raddr %s rport %d generation 0\r\n", /* raddr/rport consume the last two arguments, as in the audio section */
 											tmp2, ice_out->cands[0][0].transport, c4,
 											ice_out->cands[0][0].con_addr, ice_out->cands[0][0].con_port + (v_engine->rtcp_mux > 0 ? 0 : 1),
 											v_engine->local_sdp_ip, v_engine->local_sdp_port + (v_engine->rtcp_mux > 0 ? 0 : 1)
@@ -8215,7 +8455,7 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 			
 				
 #ifdef GOOGLE_ICE
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ice-options:google-ice\n");
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=ice-options:google-ice\r\n");
 #endif
 				}
 
@@ -8228,16 +8468,16 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess
 						switch_rtp_crypto_key_type_t j = SUITES[smh->crypto_suite_order[i]].type;
 					
 						if ((a_engine->crypto_type == j || a_engine->crypto_type == CRYPTO_INVALID) && !zstr(a_engine->ssec[j].local_crypto_key)) {
-							switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=crypto:%s\n", v_engine->ssec[j].local_crypto_key);
+							switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=crypto:%s\r\n", v_engine->ssec[j].local_crypto_key);
 						}
 					}
-					//switch_snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "a=encryption:optional\n");
+					//switch_snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "a=encryption:optional\r\n");
 				}
 
 
 				if (local_sdp_video_zrtp_hash) {
 					switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Adding video a=zrtp-hash:%s\n", local_sdp_video_zrtp_hash);
-					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=zrtp-hash:%s\n", local_sdp_video_zrtp_hash);
+					switch_snprintf(buf + strlen(buf), SDPBUFLEN - strlen(buf), "a=zrtp-hash:%s\r\n", local_sdp_video_zrtp_hash);
 				}
 
 
@@ -8598,7 +8838,7 @@ SWITCH_DECLARE(void) switch_core_media_patch_sdp(switch_core_session_t *session)
 				smh->session_id++;
 
 
-				snprintf(o_line, sizeof(o_line), "o=%s %010u %010u IN %s %s\n",
+				snprintf(o_line, sizeof(o_line), "o=%s %010u %010u IN %s %s\r\n",
 						 smh->mparams->sdp_username, smh->owner_id, smh->session_id, family, smh->mparams->sipip);
 
 				strncpy(q, o_line, strlen(o_line));
@@ -8621,7 +8861,7 @@ SWITCH_DECLARE(void) switch_core_media_patch_sdp(switch_core_session_t *session)
 				len = (se - p);
 				p += len;
 
-				snprintf(s_line, sizeof(s_line), "s=%s\n", smh->mparams->sdp_username);
+				snprintf(s_line, sizeof(s_line), "s=%s\r\n", smh->mparams->sdp_username);
 
 				strncpy(q, s_line, strlen(s_line));
 				q += strlen(s_line) - 1;
@@ -9582,6 +9822,24 @@ SWITCH_DECLARE(switch_rtp_stats_t *) switch_core_media_get_stats(switch_core_ses
 	return NULL;
 }
 
+//?
+SWITCH_DECLARE(switch_bool_t) switch_core_media_check_udptl_mode(switch_core_session_t *session, switch_media_type_t type)
+{
+	switch_media_handle_t *smh;
+	switch_rtp_engine_t *engine;
+
+	switch_assert(session);
+
+	if (!(smh = session->media_handle)) {
+		return SWITCH_FALSE;
+	}
+
+	/* SWITCH_TRUE only when this engine's RTP session is ready and has SWITCH_RTP_FLAG_UDPTL set */
+	engine = &smh->engines[type];
+
+	return (switch_rtp_ready(engine->rtp_session) && switch_rtp_test_flag(engine->rtp_session, SWITCH_RTP_FLAG_UDPTL)) ? SWITCH_TRUE : SWITCH_FALSE;
+}
+
 //?
 SWITCH_DECLARE(switch_status_t) switch_core_media_udptl_mode(switch_core_session_t *session, switch_media_type_t type)
 {
@@ -10266,11 +10524,18 @@ SWITCH_DECLARE (void) switch_core_media_recover_session(switch_core_session_t *s
 SWITCH_DECLARE(void) switch_core_media_init(void)
 {
 	switch_core_gen_certs(DTLS_SRTP_FNAME ".pem");	
+	/* Set up video_globals: store the CPU count and reset cur_cpu to 0. */
+	video_globals.cpu_count = switch_core_cpu_count();
+	video_globals.cur_cpu = 0;
+	/* Create the pool owned by video_globals, then initialize its nested mutex against that pool. */
+	switch_core_new_memory_pool(&video_globals.pool);
+	switch_mutex_init(&video_globals.mutex, SWITCH_MUTEX_NESTED, video_globals.pool);
+
 }
 
 SWITCH_DECLARE(void) switch_core_media_deinit(void)
 {
-	
+	switch_core_destroy_memory_pool(&video_globals.pool); /* destroys the pool created in switch_core_media_init() */
 }
 
 static int payload_number(const char *name)
@@ -10912,6 +11177,53 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_write_video_frame(switch_cor
 	return status;
 }
 
+/*! Reads frames until video width, height and fps are known (SUCCESS); TIMEOUT once timeout_ms is used up or the channel is no longer ready. */
+SWITCH_DECLARE(switch_status_t) switch_core_session_wait_for_video_input_params(switch_core_session_t *session, uint32_t timeout_ms)
+{
+	switch_media_handle_t *smh;
+	switch_codec_implementation_t read_impl = { 0 };
+	switch_rtp_engine_t *v_engine = NULL;
+	uint32_t step_ms;
+
+	switch_assert(session != NULL);
+	if (!(smh = session->media_handle)) {
+		return SWITCH_STATUS_FALSE;
+	}
+
+	if (!switch_channel_test_flag(session->channel, CF_VIDEO_DECODED_READ)) {
+		return SWITCH_STATUS_GENERR;
+	}
+
+	v_engine = &smh->engines[SWITCH_MEDIA_TYPE_VIDEO];
+	if (v_engine->smode == SWITCH_MEDIA_FLOW_SENDONLY) {
+		return SWITCH_STATUS_NOTIMPL;
+	}
+
+	/* Every frame read is charged one packet interval, but never less than 1 ms, so the budget always shrinks. */
+	switch_core_session_get_read_impl(session, &read_impl);
+	step_ms = (read_impl.microseconds_per_packet >= 1000) ? (uint32_t) (read_impl.microseconds_per_packet / 1000) : 1;
+
+	while(switch_channel_ready(session->channel) && timeout_ms > 0) {
+		switch_frame_t *read_frame;
+		switch_status_t status;
+		if (video_globals.synced &&
+			switch_channel_test_flag(session->channel, CF_VIDEO_READY) && smh->vid_params.width && smh->vid_params.height && smh->vid_params.fps) {
+			return SWITCH_STATUS_SUCCESS;
+		}
+
+		switch_core_session_request_video_refresh(session);
+		status = switch_core_session_read_frame(session, &read_frame, SWITCH_IO_FLAG_NONE, 0);
+		if (!SWITCH_READ_ACCEPTABLE(status)) {
+			return SWITCH_STATUS_FALSE;
+		}
+
+		/* timeout_ms is unsigned: clamp at zero instead of wrapping around, which would defeat the timeout. */
+		timeout_ms = (timeout_ms > step_ms) ? timeout_ms - step_ms : 0;
+	}
+
+	return SWITCH_STATUS_TIMEOUT;
+}
+
 SWITCH_DECLARE(switch_status_t) switch_core_session_read_video_frame(switch_core_session_t *session, switch_frame_t **frame, switch_io_flag_t flags,
 																	 int stream_id)
 {
@@ -10999,8 +11311,16 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_read_video_frame(switch_core
 		}
 
 		if ((*frame)->img && (*frame)->img->d_w && (*frame)->img->d_h) {
-			smh->vid_params.width = (*frame)->img->d_w;
-			smh->vid_params.height = (*frame)->img->d_h;
+
+			if ((*frame)->img->d_w != smh->vid_params.width) {
+				switch_channel_set_variable_printf(session->channel, "video_width", "%d", (*frame)->img->d_w);
+				smh->vid_params.width = (*frame)->img->d_w;
+			}
+
+			if ((*frame)->img->d_h != smh->vid_params.height) {
+				switch_channel_set_variable_printf(session->channel, "video_height", "%d", (*frame)->img->d_h);
+				smh->vid_params.height = (*frame)->img->d_h;
+			}
 		}
 
 		if (switch_test_flag((*frame), SFF_WAIT_KEY_FRAME)) {
diff --git a/src/switch_core_media_bug.c b/src/switch_core_media_bug.c
index 6a2363f051..fc80c13da8 100644
--- a/src/switch_core_media_bug.c
+++ b/src/switch_core_media_bug.c
@@ -1095,29 +1095,36 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_bug_enumerate(switch_core_sess
 
 SWITCH_DECLARE(switch_status_t) switch_core_media_bug_remove_all_function(switch_core_session_t *session, const char *function)
 {
-	switch_media_bug_t *bp;
+	switch_media_bug_t *bp, *last = NULL;
 	switch_status_t status = SWITCH_STATUS_FALSE;
 
 	if (session->bugs) {
 		switch_thread_rwlock_wrlock(session->bug_rwlock);
 		for (bp = session->bugs; bp; bp = bp->next) {
-			if ((bp->thread_id && bp->thread_id != switch_thread_self()) || switch_test_flag(bp, SMBF_LOCK)) {
+			if (!switch_test_flag(session, SSF_DESTROYABLE) && 
+				((bp->thread_id && bp->thread_id != switch_thread_self()) || switch_test_flag(bp, SMBF_LOCK))) {
 				switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "BUG is thread locked skipping.\n");
+				last = bp;
 				continue;
 			}
 			
 			if (!zstr(function) && strcmp(bp->function, function)) {
+				last = bp;
 				continue;
 			}
 
-
 			if (bp->callback) {
 				bp->callback(bp, bp->user_data, SWITCH_ABC_TYPE_CLOSE);
 			}
 			switch_core_media_bug_destroy(bp);
 			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Removing BUG from %s\n", switch_channel_get_name(session->channel));
+
+			if (last) {
+				last->next = bp->next;
+			} else {
+				session->bugs = bp->next;
+			}
 		}
-		session->bugs = NULL;
 		switch_thread_rwlock_unlock(session->bug_rwlock);
 		status = SWITCH_STATUS_SUCCESS;
 	}
diff --git a/src/switch_core_video.c b/src/switch_core_video.c
index 4ea9669d6c..288940bea1 100644
--- a/src/switch_core_video.c
+++ b/src/switch_core_video.c
@@ -31,8 +31,8 @@
 
 #ifdef SWITCH_HAVE_VPX
 #include "vpx/vpx_image.h"
-#if VPX_IMAGE_ABI_VERSION != (3)
-#error VPX_IMAGE_ABI_VERSION is not (3)
+#if VPX_IMAGE_ABI_VERSION != (4)
+#error VPX_IMAGE_ABI_VERSION is not (4)
 #endif
 #endif
 
@@ -358,8 +358,8 @@ SWITCH_DECLARE(void) switch_img_patch(switch_image_t *IMG, switch_image_t *img,
 	len /= 2;
 
 	for (i = y; i < max_h; i += 2) {
-		memcpy(IMG->planes[SWITCH_PLANE_U] + IMG->stride[SWITCH_PLANE_U] * i / 2 + x / 2, img->planes[SWITCH_PLANE_U] + img->stride[SWITCH_PLANE_U] * (i - y + yoff) / 2 + xoff / 2, len);
-		memcpy(IMG->planes[SWITCH_PLANE_V] + IMG->stride[SWITCH_PLANE_V] * i / 2 + x / 2, img->planes[SWITCH_PLANE_V] + img->stride[SWITCH_PLANE_V] * (i - y + yoff) / 2 + xoff / 2, len);
+		memcpy(IMG->planes[SWITCH_PLANE_U] + IMG->stride[SWITCH_PLANE_U] * (i / 2) + x / 2, img->planes[SWITCH_PLANE_U] + img->stride[SWITCH_PLANE_U] * ((i - y + yoff) / 2) + xoff / 2, len);
+		memcpy(IMG->planes[SWITCH_PLANE_V] + IMG->stride[SWITCH_PLANE_V] * (i / 2) + x / 2, img->planes[SWITCH_PLANE_V] + img->stride[SWITCH_PLANE_V] * ((i - y + yoff) / 2) + xoff / 2, len);
 	}
 }
 
@@ -529,8 +529,8 @@ SWITCH_DECLARE(void) switch_img_fill(switch_image_t *img, int x, int y, int w, i
 		len /= 2;
 
 		for (i = y; i < max_h; i += 2) {
-			memset(img->planes[SWITCH_PLANE_U] + img->stride[SWITCH_PLANE_U] * i / 2 + x / 2, yuv_color.u, len);
-			memset(img->planes[SWITCH_PLANE_V] + img->stride[SWITCH_PLANE_V] * i / 2 + x / 2, yuv_color.v, len);
+			memset(img->planes[SWITCH_PLANE_U] + img->stride[SWITCH_PLANE_U] * (i / 2) + x / 2, yuv_color.u, len);
+			memset(img->planes[SWITCH_PLANE_V] + img->stride[SWITCH_PLANE_V] * (i / 2) + x / 2, yuv_color.v, len);
 		}
 	} else if (img->fmt == SWITCH_IMG_FMT_ARGB) {
 		for (i = 0; i < img->d_w; i++) {
@@ -555,8 +555,8 @@ static inline void switch_img_get_yuv_pixel(switch_image_t *img, switch_yuv_colo
 	if (x < 0 || y < 0 || x >= img->d_w || y >= img->d_h) return;
 
 	yuv->y = *(img->planes[SWITCH_PLANE_Y] + img->stride[SWITCH_PLANE_Y] * y + x);
-	yuv->u = *(img->planes[SWITCH_PLANE_U] + img->stride[SWITCH_PLANE_U] * y / 2 + x / 2);
-	yuv->v = *(img->planes[SWITCH_PLANE_V] + img->stride[SWITCH_PLANE_V] * y / 2 + x / 2);
+	yuv->u = *(img->planes[SWITCH_PLANE_U] + img->stride[SWITCH_PLANE_U] * (y / 2) + x / 2);
+	yuv->v = *(img->planes[SWITCH_PLANE_V] + img->stride[SWITCH_PLANE_V] * (y / 2) + x / 2);
 }
 #endif
 
@@ -580,11 +580,13 @@ static inline void switch_img_get_rgb_pixel(switch_image_t *img, switch_rgb_colo
 #endif	
 }
 
-SWITCH_DECLARE(void) switch_img_overlay(switch_image_t *IMG, switch_image_t *img, int x, int y, uint8_t alpha)
+SWITCH_DECLARE(void) switch_img_overlay(switch_image_t *IMG, switch_image_t *img, int x, int y, uint8_t percent)
 {
 	int i, j, len, max_h;
 	switch_rgb_color_t RGB = {0}, rgb = {0}, c = {0};
 	int xoff = 0, yoff = 0;
+	uint8_t alpha = (int8_t)((255 * percent) / 100);
+
 
 	switch_assert(IMG->fmt == SWITCH_IMG_FMT_I420);
 
@@ -1059,7 +1061,7 @@ SWITCH_DECLARE(uint32_t) switch_img_txt_handle_render(switch_img_txt_handle_t *h
 		pen.y += slot->advance.y >> 6;
 	}
 
-	ret = width + slot->bitmap.width * 3;
+	ret = width + slot->bitmap.width * 5;
 
 	FT_Done_Face(face);
 
@@ -1126,6 +1128,7 @@ SWITCH_DECLARE(switch_image_t *) switch_img_write_text_img(int w, int h, switch_
 
     if (len < 5) len = 5;
 
+
 	switch_img_txt_handle_create(&txthandle, font_face, fg, bg, font_size, 0, NULL);
 	switch_color_set_rgb(&bgcolor, bg);
 
@@ -1165,6 +1168,8 @@ SWITCH_DECLARE(switch_image_t *) switch_img_write_text_img(int w, int h, switch_
                                  txt, NULL, fg, bg, 0, 0);
 	switch_img_txt_handle_destroy(&txthandle);
 
+	switch_safe_free(duptxt);
+
 	return txtimg;
 }
 
@@ -1201,14 +1206,14 @@ SWITCH_DECLARE(void) switch_img_patch_hole(switch_image_t *IMG, switch_image_t *
 			int size = rect->x > x ? rect->x - x : 0;
 
 			size /= 2;
-			memcpy(IMG->planes[SWITCH_PLANE_U] + IMG->stride[SWITCH_PLANE_U] * i / 2 + x / 2, img->planes[SWITCH_PLANE_U] + img->stride[SWITCH_PLANE_U] * (i - y) / 2, size);
-			memcpy(IMG->planes[SWITCH_PLANE_V] + IMG->stride[SWITCH_PLANE_V] * i / 2 + x / 2, img->planes[SWITCH_PLANE_V] + img->stride[SWITCH_PLANE_V] * (i - y) / 2, size);
+			memcpy(IMG->planes[SWITCH_PLANE_U] + IMG->stride[SWITCH_PLANE_U] * (i / 2) + x / 2, img->planes[SWITCH_PLANE_U] + img->stride[SWITCH_PLANE_U] * ((i - y) / 2), size);
+			memcpy(IMG->planes[SWITCH_PLANE_V] + IMG->stride[SWITCH_PLANE_V] * (i / 2) + x / 2, img->planes[SWITCH_PLANE_V] + img->stride[SWITCH_PLANE_V] * ((i - y) / 2), size);
 			size = MIN(img->d_w - rect->w - size, IMG->d_w - (rect->x + rect->w)) / 2;
-			memcpy(IMG->planes[SWITCH_PLANE_U] + IMG->stride[SWITCH_PLANE_U] * i / 2 + (rect->x + rect->w) / 2, img->planes[SWITCH_PLANE_U] + img->stride[SWITCH_PLANE_U] * (i - y) / 2 + (rect->w + (rect->x - x)) / 2, size);
-			memcpy(IMG->planes[SWITCH_PLANE_V] + IMG->stride[SWITCH_PLANE_V] * i / 2 + (rect->x + rect->w) / 2, img->planes[SWITCH_PLANE_V] + img->stride[SWITCH_PLANE_V] * (i - y) / 2 + (rect->w + (rect->x - x)) / 2, size);
+			memcpy(IMG->planes[SWITCH_PLANE_U] + IMG->stride[SWITCH_PLANE_U] * (i / 2) + (rect->x + rect->w) / 2, img->planes[SWITCH_PLANE_U] + img->stride[SWITCH_PLANE_U] * ((i - y) / 2) + (rect->w + (rect->x - x)) / 2, size);
+			memcpy(IMG->planes[SWITCH_PLANE_V] + IMG->stride[SWITCH_PLANE_V] * (i / 2) + (rect->x + rect->w) / 2, img->planes[SWITCH_PLANE_V] + img->stride[SWITCH_PLANE_V] * ((i - y) / 2) + (rect->w + (rect->x - x)) / 2, size);
 		} else {
-			memcpy(IMG->planes[SWITCH_PLANE_U] + IMG->stride[SWITCH_PLANE_U] * i / 2 + x / 2, img->planes[SWITCH_PLANE_U] + img->stride[SWITCH_PLANE_U] * (i - y) / 2, len);
-			memcpy(IMG->planes[SWITCH_PLANE_V] + IMG->stride[SWITCH_PLANE_V] * i / 2 + x / 2, img->planes[SWITCH_PLANE_V] + img->stride[SWITCH_PLANE_V] * (i - y) / 2, len);
+			memcpy(IMG->planes[SWITCH_PLANE_U] + IMG->stride[SWITCH_PLANE_U] * (i / 2) + x / 2, img->planes[SWITCH_PLANE_U] + img->stride[SWITCH_PLANE_U] * ((i - y) / 2), len);
+			memcpy(IMG->planes[SWITCH_PLANE_V] + IMG->stride[SWITCH_PLANE_V] * (i / 2) + x / 2, img->planes[SWITCH_PLANE_V] + img->stride[SWITCH_PLANE_V] * ((i - y) / 2), len);
 		}
 	}
 }
@@ -1936,29 +1941,119 @@ SWITCH_DECLARE(switch_status_t) switch_img_fit(switch_image_t **srcP, int width,
 	return SWITCH_STATUS_FALSE;
 }
 
-SWITCH_DECLARE(switch_status_t) switch_img_convert(switch_image_t *src, switch_convert_fmt_t fmt, void *dest, switch_size_t *size)
+#ifdef SWITCH_HAVE_YUV
+static inline uint32_t switch_img_fmt2fourcc(switch_img_fmt_t fmt)
+{
+	uint32_t fourcc;
+	/* Translate fmt into the FOURCC code passed to ConvertFromI420()/ConvertToI420(); formats with no mapping yield FOURCC_ANY. */
+	switch(fmt) {
+		case SWITCH_IMG_FMT_NONE:      fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_RGB24:     fourcc = FOURCC_24BG; break;
+		case SWITCH_IMG_FMT_RGB32:     fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_RGB565:    fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_RGB555:    fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_UYVY:      fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_YUY2:      fourcc = FOURCC_YUY2; break;
+		case SWITCH_IMG_FMT_YVYU:      fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_BGR24:     fourcc = FOURCC_RAW ; break;
+		case SWITCH_IMG_FMT_RGB32_LE:  fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_ARGB:      fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_ARGB_LE:   fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_RGB565_LE: fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_RGB555_LE: fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_YV12:      fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_I420:      fourcc = FOURCC_I420; break;
+		case SWITCH_IMG_FMT_VPXYV12:   fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_VPXI420:   fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_I422:      fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_I444:      fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_I440:      fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_444A:      fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_I42016:    fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_I42216:    fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_I44416:    fourcc = FOURCC_ANY ; break;
+		case SWITCH_IMG_FMT_I44016:    fourcc = FOURCC_ANY ; break;
+		default: fourcc = FOURCC_ANY; /* values outside the enum are handled like any other unmapped format */
+    }
+    /* switch_img_to_raw() and switch_img_from_raw() reject FOURCC_ANY as "unsupported format" */
+    return fourcc;
+}
+#endif
+
+SWITCH_DECLARE(switch_status_t) switch_img_to_raw(switch_image_t *src, void *dest, switch_size_t size, switch_img_fmt_t fmt)
 {
 #ifdef SWITCH_HAVE_YUV
-	switch_assert(src->fmt == SWITCH_IMG_FMT_I420);
+	uint32_t fourcc; /* FOURCC code selected for the requested output fmt */
+	int ret; /* ConvertFromI420() result; only 0 is reported as success */
 
-	switch (fmt) {
-	case SWITCH_CONVERT_FMT_YUYV:
-		{
-			switch_size_t size_in = *size;
-			ConvertFromI420(src->planes[0], src->stride[0],
-							src->planes[1], src->stride[1],
-							src->planes[2], src->stride[2],
-							dest, size_in,
-							src->d_w, src->d_h,
-							FOURCC_YUY2);  
-			*size = src->d_w * src->d_h * 2;
+	switch_assert(src->fmt == SWITCH_IMG_FMT_I420); // todo: support other formats
+	switch_assert(dest);
 
-			return SWITCH_STATUS_SUCCESS;
-		}
-	default:
-		abort();
-		break;
+	fourcc = switch_img_fmt2fourcc(fmt);
+
+	if (fourcc == FOURCC_ANY) { /* fmt has no FOURCC mapping in switch_img_fmt2fourcc() */
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "unsupported format: %d\n", fmt);
+		return SWITCH_STATUS_FALSE;
 	}
+	/* Convert the I420 image in src to the raw format chosen by fmt; dest and size are handed to the converter as the output buffer arguments. */
+	ret = ConvertFromI420(src->planes[0], src->stride[0],
+					src->planes[1], src->stride[1],
+					src->planes[2], src->stride[2],
+					dest, size,
+					src->d_w, src->d_h,
+					fourcc);
+
+	return ret == 0 ? SWITCH_STATUS_SUCCESS : SWITCH_STATUS_FALSE;
+#else /* SWITCH_HAVE_YUV not defined: no conversion is attempted */
+	return SWITCH_STATUS_FALSE;
+#endif
+}
+
+SWITCH_DECLARE(switch_status_t) switch_img_from_raw(switch_image_t *dest, void *src, switch_img_fmt_t fmt, int width, int height)
+{
+#ifdef SWITCH_HAVE_YUV
+	uint32_t fourcc;
+	int ret;
+
+	fourcc = switch_img_fmt2fourcc(fmt);
+
+	if (fourcc == FOURCC_ANY) {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "unsupported format: %d\n", fmt);
+		return SWITCH_STATUS_FALSE;
+	}
+
+	if (!dest && width > 0 && height > 0) dest = switch_img_alloc(NULL, SWITCH_IMG_FMT_I420, width, height, 1);
+	if (!dest) return SWITCH_STATUS_FALSE;
+
+	if (width == 0 || height == 0) {
+		width = dest->d_w;
+		height = dest->d_h;
+	}
+
+/*
+	int ConvertToI420(const uint8* src_frame, size_t src_size,
+                  uint8* dst_y, int dst_stride_y,
+                  uint8* dst_u, int dst_stride_u,
+                  uint8* dst_v, int dst_stride_v,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 format);
+
+	src_size is only used for FOURCC_MJPG input, which we don't support, so we always pass 0
+*/
+
+	ret = ConvertToI420(src, 0,
+					dest->planes[0], dest->stride[0],
+					dest->planes[1], dest->stride[1],
+					dest->planes[2], dest->stride[2],
+					0, 0,
+					width, height,
+					width, height,
+					0, fourcc);
+
+	return ret == 0 ? SWITCH_STATUS_SUCCESS : SWITCH_STATUS_FALSE;
 #else
 	return SWITCH_STATUS_FALSE;
 #endif
diff --git a/src/switch_cpp.cpp b/src/switch_cpp.cpp
index 0467109926..485cb02edf 100644
--- a/src/switch_cpp.cpp
+++ b/src/switch_cpp.cpp
@@ -740,6 +740,7 @@ SWITCH_DECLARE(void) CoreSession::setVariable(char *var, char *val)
 {
 	this_check_void();
 	sanity_check_noreturn;
+	switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "CoreSession::setVariable('%s', '%s')\n", var, val);
 	switch_channel_set_variable_var_check(channel, var, val, SWITCH_FALSE);
 }
 
diff --git a/src/switch_ivr.c b/src/switch_ivr.c
index 467c9e6309..96fc5a7971 100644
--- a/src/switch_ivr.c
+++ b/src/switch_ivr.c
@@ -1573,7 +1573,8 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_3p_media(const char *uuid, switch_med
 	if ((session = switch_core_session_locate(uuid))) {
 		channel = switch_core_session_get_channel(session);
 		
-		if (switch_channel_test_flag(channel, CF_MEDIA_TRANS)) {
+		if (switch_channel_test_flag(channel, CF_MEDIA_TRANS) || !switch_channel_test_flag(channel, CF_PROXY_MODE)) {
+			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,  "Operation is invalid\n");
 			switch_core_session_rwunlock(session);
 			return SWITCH_STATUS_INUSE;
 		}
@@ -1584,70 +1585,71 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_3p_media(const char *uuid, switch_med
 			swap = 1;
 		}
 
-		if (switch_channel_test_flag(channel, CF_PROXY_MODE)) {
-			status = SWITCH_STATUS_SUCCESS;
 
-			/* If we had early media in bypass mode before, it is no longer relevant */
-			if (switch_channel_test_flag(channel, CF_EARLY_MEDIA)) {
-				switch_core_session_message_t msg2 = { 0 };
+		status = SWITCH_STATUS_SUCCESS;
+
+		/* If we had early media in bypass mode before, it is no longer relevant */
+		if (switch_channel_test_flag(channel, CF_EARLY_MEDIA)) {
+			switch_core_session_message_t msg2 = { 0 };
 				
-				msg2.message_id = SWITCH_MESSAGE_INDICATE_CLEAR_PROGRESS;
-				msg2.from = __FILE__;
-				switch_core_session_receive_message(session, &msg2);
-			}
+			msg2.message_id = SWITCH_MESSAGE_INDICATE_CLEAR_PROGRESS;
+			msg2.from = __FILE__;
+			switch_core_session_receive_message(session, &msg2);
+		}
 			
-			if ((flags & SMF_REPLYONLY_A)) {
-				msg.numeric_arg = 1;
-			}
+		if ((flags & SMF_REPLYONLY_A)) {
+			msg.numeric_arg = 1;
+		}
 			
-			switch_channel_set_flag(channel, CF_3P_MEDIA_REQUESTED);
+		switch_channel_set_flag(channel, CF_3P_MEDIA_REQUESTED);
 
-			if (switch_core_session_receive_message(session, &msg) != SWITCH_STATUS_SUCCESS) {
-				switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Can't re-establsh media on %s\n", switch_channel_get_name(channel));
-				switch_channel_clear_flag(channel, CF_3P_MEDIA_REQUESTED);
-				switch_core_session_rwunlock(session);
-				return SWITCH_STATUS_GENERR;
-			}
-
-			if ((flags & SMF_REPLYONLY_B)) {
-				msg.numeric_arg = 1;
-			} else {
-				msg.numeric_arg = 0;
-			}
-
-			if ((flags & SMF_IMMEDIATE)) {
-				switch_channel_wait_for_flag(channel, CF_REQ_MEDIA, SWITCH_FALSE, 250, NULL);
-				switch_yield(250000);
-			} else {
-				switch_channel_wait_for_flag(channel, CF_REQ_MEDIA, SWITCH_FALSE, 10000, NULL);
-				switch_channel_wait_for_flag(channel, CF_MEDIA_ACK, SWITCH_TRUE, 10000, NULL);
-				switch_channel_wait_for_flag(channel, CF_MEDIA_SET, SWITCH_TRUE, 10000, NULL);
-				//switch_core_session_read_frame(session, &read_frame, SWITCH_IO_FLAG_NONE, 0);
-			}
-
-			if ((flags & SMF_REBRIDGE)
-				&& (other_uuid = switch_channel_get_variable(channel, SWITCH_SIGNAL_BRIDGE_VARIABLE))
-				&& (other_session = switch_core_session_locate(other_uuid))) {
-
-				other_channel = switch_core_session_get_channel(other_session);
-				switch_assert(other_channel != NULL);
-				
-				switch_channel_set_flag(other_channel, CF_3P_MEDIA_REQUESTED);
-				switch_channel_set_variable(other_channel, "rtp_secure_media", "optional");
-				
-				switch_core_session_receive_message(other_session, &msg);
-				switch_channel_wait_for_flag(other_channel, CF_REQ_MEDIA, SWITCH_FALSE, 10000, NULL);
-				switch_channel_wait_for_flag(other_channel, CF_MEDIA_ACK, SWITCH_TRUE, 10000, NULL);
-				switch_channel_wait_for_flag(other_channel, CF_MEDIA_SET, SWITCH_TRUE, 10000, NULL);
-				//switch_core_session_read_frame(other_session, &read_frame, SWITCH_IO_FLAG_NONE, 0);
-				switch_channel_clear_state_handler(other_channel, NULL);
-				switch_core_session_rwunlock(other_session);
-			}
-			if (other_channel) {
-				switch_channel_clear_state_handler(channel, NULL);
-			}
+		if (switch_core_session_receive_message(session, &msg) != SWITCH_STATUS_SUCCESS) {
+			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Can't re-establsh media on %s\n", switch_channel_get_name(channel));
+			switch_channel_clear_flag(channel, CF_3P_MEDIA_REQUESTED);
+			switch_core_session_rwunlock(session);
+			return SWITCH_STATUS_GENERR;
 		}
 
+		if ((flags & SMF_REPLYONLY_B)) {
+			msg.numeric_arg = 1;
+		} else {
+			msg.numeric_arg = 0;
+		}
+
+		if ((flags & SMF_IMMEDIATE)) {
+			switch_channel_wait_for_flag(channel, CF_REQ_MEDIA, SWITCH_FALSE, 250, NULL);
+			switch_yield(250000);
+		} else {
+			switch_channel_wait_for_flag(channel, CF_REQ_MEDIA, SWITCH_FALSE, 10000, NULL);
+			switch_channel_wait_for_flag(channel, CF_MEDIA_ACK, SWITCH_TRUE, 10000, NULL);
+			switch_channel_wait_for_flag(channel, CF_MEDIA_SET, SWITCH_TRUE, 10000, NULL);
+			switch_channel_wait_for_flag(channel, CF_3P_MEDIA_REQUESTED, SWITCH_FALSE, 10000, NULL);
+			//switch_core_session_read_frame(session, &read_frame, SWITCH_IO_FLAG_NONE, 0);
+		}
+
+		if ((flags & SMF_REBRIDGE)
+			&& (other_uuid = switch_channel_get_variable(channel, SWITCH_SIGNAL_BRIDGE_VARIABLE))
+			&& (other_session = switch_core_session_locate(other_uuid))) {
+
+			other_channel = switch_core_session_get_channel(other_session);
+			switch_assert(other_channel != NULL);
+				
+			switch_channel_set_flag(other_channel, CF_3P_MEDIA_REQUESTED);
+			switch_channel_set_variable(other_channel, "rtp_secure_media", "optional");
+				
+			switch_core_session_receive_message(other_session, &msg);
+			switch_channel_wait_for_flag(other_channel, CF_REQ_MEDIA, SWITCH_FALSE, 10000, NULL);
+			switch_channel_wait_for_flag(other_channel, CF_MEDIA_ACK, SWITCH_TRUE, 10000, NULL);
+			switch_channel_wait_for_flag(other_channel, CF_MEDIA_SET, SWITCH_TRUE, 10000, NULL);
+			switch_channel_wait_for_flag(other_channel, CF_3P_MEDIA_REQUESTED, SWITCH_FALSE, 10000, NULL);
+			//switch_core_session_read_frame(other_session, &read_frame, SWITCH_IO_FLAG_NONE, 0);
+			switch_channel_clear_state_handler(other_channel, NULL);
+			switch_core_session_rwunlock(other_session);
+		}
+		if (other_channel) {
+			switch_channel_clear_state_handler(channel, NULL);
+		}
+	
 		switch_channel_clear_flag(channel, CF_MEDIA_TRANS);
 		switch_core_session_rwunlock(session);
 
@@ -1781,7 +1783,8 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_3p_nomedia(const char *uuid, switch_m
 		status = SWITCH_STATUS_SUCCESS;
 		channel = switch_core_session_get_channel(session);
 		
-		if (switch_channel_test_flag(channel, CF_MEDIA_TRANS)) {
+		if (switch_channel_test_flag(channel, CF_MEDIA_TRANS) || (!(flags & SMF_FORCE) && switch_channel_test_flag(channel, CF_PROXY_MODE))) {
+			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,  "Operation is invalid\n");
 			switch_core_session_rwunlock(session);
 			return SWITCH_STATUS_INUSE;
 		}
@@ -1803,6 +1806,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_3p_nomedia(const char *uuid, switch_m
 				switch_channel_set_flag(other_channel, CF_RESET);
 				switch_channel_set_flag(other_channel, CF_REDIRECT);
 
+				switch_channel_set_variable(channel, SWITCH_R_SDP_VARIABLE, NULL);
 				switch_channel_set_flag(channel, CF_3P_NOMEDIA_REQUESTED);
 				switch_core_session_receive_message(session, &msg);
 
@@ -1816,23 +1820,27 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_3p_nomedia(const char *uuid, switch_m
 					switch_channel_wait_for_state(other_channel, channel, CS_PARK);
 				}
 				
+					
+				if (!switch_core_session_in_thread(session)) {
+					switch_channel_wait_for_state(channel, NULL, CS_PARK);
+				}
+
+				switch_channel_wait_for_flag(channel, CF_REQ_MEDIA, SWITCH_FALSE, 10000, NULL);
+				switch_channel_wait_for_flag(channel, CF_MEDIA_ACK, SWITCH_TRUE, 10000, NULL);
+				switch_channel_wait_for_flag(channel, CF_3P_NOMEDIA_REQUESTED, SWITCH_FALSE, 10000, NULL);
+
 				msg.string_arg = switch_channel_get_variable(channel, SWITCH_R_SDP_VARIABLE);
 				switch_channel_set_flag(other_channel, CF_3P_NOMEDIA_REQUESTED);
 				switch_channel_set_flag(other_channel, CF_3P_NOMEDIA_REQUESTED_BLEG);
-				
+
+
 				switch_core_session_receive_message(other_session, &msg);
 				switch_channel_wait_for_flag(other_channel, CF_REQ_MEDIA, SWITCH_FALSE, 10000, NULL);
-				switch_channel_wait_for_flag(other_channel, CF_MEDIA_SET, SWITCH_TRUE, 10000, NULL);
+				switch_channel_wait_for_flag(other_channel, CF_MEDIA_ACK, SWITCH_TRUE, 10000, NULL);
+				switch_channel_wait_for_flag(other_channel, CF_3P_NOMEDIA_REQUESTED, SWITCH_FALSE, 10000, NULL);
 			}
 
 			if (other_channel) {
-				if (!switch_core_session_in_thread(session)) {
-					switch_channel_wait_for_state(channel, NULL, CS_PARK);
-					switch_channel_wait_for_flag(channel, CF_REQ_MEDIA, SWITCH_FALSE, 10000, NULL);
-					switch_channel_wait_for_flag(channel, CF_MEDIA_ACK, SWITCH_TRUE, 10000, NULL);
-					switch_channel_wait_for_flag(channel, CF_MEDIA_SET, SWITCH_TRUE, 10000, NULL);
-				}
-
 				if (swap) {
 					switch_ivr_signal_bridge(other_session, session);
 				} else {
@@ -3866,13 +3874,13 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_process_fh(switch_core_session_t *ses
 			return SWITCH_STATUS_FALSE;
 		} else if (!strcasecmp(cmd, "pause")) {
 			if (switch_test_flag(fhp, SWITCH_FILE_PAUSE)) {
-				switch_clear_flag(fhp, SWITCH_FILE_PAUSE);
+				switch_clear_flag_locked(fhp, SWITCH_FILE_PAUSE);
 			} else {
-				switch_set_flag(fhp, SWITCH_FILE_PAUSE);
+				switch_set_flag_locked(fhp, SWITCH_FILE_PAUSE);
 			}
 			return SWITCH_STATUS_SUCCESS;
 		} else if (!strcasecmp(cmd, "stop")) {
-			switch_set_flag(fhp, SWITCH_FILE_DONE);
+			switch_set_flag_locked(fhp, SWITCH_FILE_DONE);
 			return SWITCH_STATUS_FALSE;
 		} else if (!strcasecmp(cmd, "truncate")) {
 			switch_core_file_truncate(fhp, 0);
diff --git a/src/switch_ivr_async.c b/src/switch_ivr_async.c
index bd6cb6f34f..00837038aa 100644
--- a/src/switch_ivr_async.c
+++ b/src/switch_ivr_async.c
@@ -1048,6 +1048,8 @@ struct record_helper {
 	switch_thread_t *thread;
 	switch_mutex_t *buffer_mutex;
 	int thread_ready;
+	uint32_t writes;
+	uint32_t vwrites;
 	const char *completion_cause;
 };
 
@@ -1129,16 +1131,21 @@ static void *SWITCH_THREAD_FUNC recording_thread(switch_thread_t *thread, void *
 		return NULL;
 	}
 
-	switch_core_session_get_read_impl(session, &read_impl);
-	bsize = read_impl.decoded_bytes_per_packet;
 	rh = switch_core_media_bug_get_user_data(bug);
 	switch_buffer_create_dynamic(&rh->thread_buffer, 1024 * 512, 1024 * 64, 0);
 	rh->thread_ready = 1;
 
 	channels = switch_core_media_bug_test_flag(bug, SMBF_STEREO) ? 2 : rh->read_impl.number_of_channels;
-	data = switch_core_session_alloc(session, bsize);
+	data = switch_core_session_alloc(session, SWITCH_RECOMMENDED_BUFFER_SIZE);
 
 	while(switch_test_flag(rh->fh, SWITCH_FILE_OPEN)) {
+		if (switch_core_file_has_video(rh->fh, SWITCH_TRUE)) {
+			switch_core_session_get_read_impl(session, &read_impl);
+			if (read_impl.decoded_bytes_per_packet > 0 && read_impl.decoded_bytes_per_packet <= SWITCH_RECOMMENDED_BUFFER_SIZE) {
+				bsize = read_impl.decoded_bytes_per_packet;
+			}
+		}
+
 		switch_mutex_lock(rh->buffer_mutex);
 		inuse = switch_buffer_inuse(rh->thread_buffer);
 
@@ -1253,7 +1260,7 @@ static switch_bool_t record_callback(switch_media_bug_t *bug, void *user_data, s
 
 			switch_core_file_write(&rh->in_fh, mask ? null_data : nframe->data, &len);
 			rh->last_read_time = now;
-			
+			rh->writes++;
 		}
 		break;
 	case SWITCH_ABC_TYPE_TAP_NATIVE_WRITE:
@@ -1292,7 +1299,7 @@ static switch_bool_t record_callback(switch_media_bug_t *bug, void *user_data, s
 			
 			switch_core_file_write(&rh->out_fh, mask ? null_data : nframe->data, &len);
 			rh->last_write_time = now;
-			
+			rh->writes++;
 		}
 		break;
 	case SWITCH_ABC_TYPE_CLOSE:
@@ -1345,35 +1352,39 @@ static switch_bool_t record_callback(switch_media_bug_t *bug, void *user_data, s
 				}
 
 				
-				//if (switch_core_file_has_video(rh->fh)) {
+				//if (switch_core_file_has_video(rh->fh, SWITCH_TRUE)) {
 					//switch_core_media_set_video_file(session, NULL, SWITCH_RW_READ);
 					//switch_channel_clear_flag_recursive(session->channel, CF_VIDEO_DECODED_READ);
 				//}
 
 				switch_core_file_close(rh->fh);
 
-				
-
-				if (rh->fh->samples_out < rh->fh->samplerate * rh->min_sec) {
+				if (!rh->writes && !rh->vwrites) {
+					switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Discarding empty file %s\n", rh->file);
+					switch_channel_set_variable(channel, "RECORD_DISCARDED", "true");
+					switch_file_remove(rh->file, switch_core_session_get_pool(session));
+					set_completion_cause(rh, "empty-file");
+				} else if (rh->fh->samples_out < rh->fh->samplerate * rh->min_sec) {
 					switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Discarding short file %s\n", rh->file);
 					switch_channel_set_variable(channel, "RECORD_DISCARDED", "true");
 					switch_file_remove(rh->file, switch_core_session_get_pool(session));
 					set_completion_cause(rh, "input-too-short");
-				}
-
-				if (switch_channel_down_nosig(channel)) {
-					/* We got hung up */
-					switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Channel is hung up\n");
-					if (rh->speech_detected) {
-						/* Treat it as equivalent with final-silence */
-						set_completion_cause(rh, "success-silence");
-					} else {
-						/* Treat it as equivalent with inital-silence timeout */
-						set_completion_cause(rh, "no-input-timeout");
-					}
 				} else {
-					/* Set the completion_cause to maxtime reached, unless it's already set */
-					set_completion_cause(rh, "success-maxtime");
+
+					if (switch_channel_down_nosig(channel)) {
+						/* We got hung up */
+						switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Channel is hung up\n");
+						if (rh->speech_detected) {
+							/* Treat it as equivalent with final-silence */
+							set_completion_cause(rh, "success-silence");
+						} else {
+							/* Treat it as equivalent with initial-silence timeout */
+							set_completion_cause(rh, "no-input-timeout");
+						}
+					} else {
+						/* Set the completion_cause to maxtime reached, unless it's already set */
+						set_completion_cause(rh, "success-maxtime");
+					}
 				}
 			}
 			
@@ -1441,7 +1452,9 @@ static switch_bool_t record_callback(switch_media_bug_t *bug, void *user_data, s
 						}
 						return SWITCH_FALSE;
 					}
-
+					
+					rh->writes++;
+					
 					/* check for silence timeout */
 					if (rh->silence_threshold) {
 						switch_codec_implementation_t read_impl = { 0 };
@@ -1500,6 +1513,7 @@ static switch_bool_t record_callback(switch_media_bug_t *bug, void *user_data, s
 				switch_core_session_reset(session, SWITCH_TRUE, SWITCH_TRUE);
 				return SWITCH_FALSE;
 			}
+			rh->vwrites++;
 		}
 		break;
 
@@ -2537,7 +2551,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_session(switch_core_session_t
 			return SWITCH_STATUS_GENERR;
 		}
 
-		if (switch_core_file_has_video(fh)) {
+		if (switch_core_file_has_video(fh, SWITCH_TRUE)) {
 			//switch_core_media_set_video_file(session, fh, SWITCH_RW_READ);
 			//switch_channel_set_flag_recursive(session->channel, CF_VIDEO_DECODED_READ);
 			
@@ -2562,8 +2576,8 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_session(switch_core_session_t
 		out_file = switch_core_session_sprintf(session, "%s-out.%s", file, ext);
 		rh->in_fh.pre_buffer_datalen = rh->out_fh.pre_buffer_datalen = fh->pre_buffer_datalen;
 		channels = 1;
-		switch_set_flag(&rh->in_fh, SWITCH_FILE_NATIVE);
-		switch_set_flag(&rh->out_fh, SWITCH_FILE_NATIVE);
+		switch_set_flag_locked(&rh->in_fh, SWITCH_FILE_NATIVE);
+		switch_set_flag_locked(&rh->out_fh, SWITCH_FILE_NATIVE);
 
 		if (switch_core_file_open(&rh->in_fh, in_file, channels, read_impl.actual_samples_per_second, file_flags, NULL) != SWITCH_STATUS_SUCCESS) {
 			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error opening %s\n", in_file);
diff --git a/src/switch_ivr_bridge.c b/src/switch_ivr_bridge.c
index cdebd44558..8ad4813066 100644
--- a/src/switch_ivr_bridge.c
+++ b/src/switch_ivr_bridge.c
@@ -333,6 +333,7 @@ static void *audio_bridge_thread(switch_thread_t *thread, void *obj)
 	time_t answer_limit = 0;
 	const char *exec_app = NULL;
 	const char *exec_data = NULL;
+	switch_codec_implementation_t read_impl = { 0 };
 
 #ifdef SWITCH_VIDEO_IN_THREADS
 	struct vid_helper vh = { 0 };
@@ -345,6 +346,9 @@ static void *audio_bridge_thread(switch_thread_t *thread, void *obj)
 		return NULL;
 	}
 
+	switch_core_session_get_read_impl(session_a, &read_impl);
+
+
 	input_callback = data->input_callback;
 	user_data = data->session_data;
 	stream_id = data->stream_id;
@@ -405,8 +409,6 @@ static void *audio_bridge_thread(switch_thread_t *thread, void *obj)
 	}
 
 	if ((silence_var = switch_channel_get_variable(chan_a, "bridge_generate_comfort_noise"))) {
-		switch_codec_implementation_t read_impl = { 0 };
-		switch_core_session_get_read_impl(session_a, &read_impl);
 
 		if (!switch_channel_media_up(chan_a)) {
 			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session_a), SWITCH_LOG_ERROR, "Channel has no media!\n");
@@ -683,7 +685,7 @@ static void *audio_bridge_thread(switch_thread_t *thread, void *obj)
 			if (switch_test_flag(read_frame, SFF_CNG)) {
 				if (silence_val) {
 					switch_generate_sln_silence((int16_t *) silence_frame.data, silence_frame.samples, 
-												read_frame->codec->implementation->number_of_channels, silence_val);
+												read_impl.number_of_channels, silence_val);
 					read_frame = &silence_frame;
 				} else if (!switch_channel_test_flag(chan_b, CF_ACCEPT_CNG)) {
 					continue;
diff --git a/src/switch_ivr_originate.c b/src/switch_ivr_originate.c
index 6b03c1739d..47b49725a4 100644
--- a/src/switch_ivr_originate.c
+++ b/src/switch_ivr_originate.c
@@ -117,7 +117,7 @@ typedef struct {
 	uint8_t ignore_ring_ready;
 	int monitor_early_media_ring_count;
 	int monitor_early_media_ring_total;
-	int cancel_timeout;
+	switch_bool_t cancel_timeout;
 	int continue_on_timeout;
 	int ringback_ok;
 	int sending_ringback;
@@ -262,9 +262,6 @@ static int check_per_channel_timeouts(originate_global_t *oglobals,
 
 	time_t elapsed = switch_epoch_time_now(NULL) - start;
 
-	if (oglobals->cancel_timeout > 0) {
-		return 0;
-	}
 	for (i = 0; i < max; i++) {
 		if (originate_status[i].peer_channel && switch_channel_get_state(originate_status[i].peer_channel) != CS_DESTROY &&
 			switch_channel_get_state(originate_status[i].peer_channel) != CS_REPORTING) {
@@ -738,8 +735,10 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat
 			if (!zstr(oglobals->key)) {
 				struct key_collect *collect;
 
-				if (oglobals->cancel_timeout < 0) {
-					oglobals->cancel_timeout = 1;
+				if (oglobals->cancel_timeout == SWITCH_TRUE) {
+					/* cancel timeout for this leg only */
+					originate_status[i].per_channel_progress_timelimit_sec = 0;
+					originate_status[i].per_channel_timelimit_sec = 0;
 				}
 
 				if ((collect = switch_core_session_alloc(originate_status[i].peer_session, sizeof(*collect)))) {
@@ -2277,7 +2276,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess
 #endif
 
 	if (switch_true(switch_event_get_header(var_event, "group_confirm_cancel_timeout"))) {
-		oglobals.cancel_timeout = -1;
+		oglobals.cancel_timeout = SWITCH_TRUE;
 	}
 
 	if ((var = switch_event_get_header(var_event, "group_confirm_key"))) {
diff --git a/src/switch_ivr_play_say.c b/src/switch_ivr_play_say.c
index 89325b2c83..d856131dd8 100644
--- a/src/switch_ivr_play_say.c
+++ b/src/switch_ivr_play_say.c
@@ -359,6 +359,8 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_file(switch_core_session_t *se
 	switch_channel_t *channel = switch_core_session_get_channel(session);
 	switch_dtmf_t dtmf = { 0 };
 	switch_file_handle_t lfh = { 0 };
+	switch_file_handle_t vfh = { 0 };
+	switch_file_handle_t ind_fh = { 0 };
 	switch_frame_t *read_frame;
 	switch_codec_t codec, write_codec = { 0 };
 	char *codec_name;
@@ -369,7 +371,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_file(switch_core_session_t *se
 	uint32_t org_silence_hits = 0;
 	int asis = 0;
 	int32_t sample_start = 0;
-	int waste_resources = 0, fill_cng = 0;
+	int waste_resources = 1400, fill_cng = 0;
 	switch_codec_implementation_t read_impl = { 0 };
 	switch_frame_t write_frame = { 0 };
 	unsigned char write_buf[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 };
@@ -377,8 +379,9 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_file(switch_core_session_t *se
 	int divisor = 0;
 	int file_flags = SWITCH_FILE_FLAG_WRITE | SWITCH_FILE_DATA_SHORT;
 	int restart_limit_on_dtmf = 0;
-	const char *prefix, *var;
-	
+	const char *prefix, *var, *video_file = NULL;
+	int vid_play_file_flags = SWITCH_FILE_FLAG_READ | SWITCH_FILE_DATA_SHORT | SWITCH_FILE_FLAG_VIDEO;
+	int echo_on = 0;
 
 	if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) {
 		return SWITCH_STATUS_FALSE;
@@ -414,63 +417,17 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_file(switch_core_session_t *se
 		fh->samples = 0;
 	}
 
-	if ((vval = switch_channel_get_variable(channel, "record_sample_rate"))) {
+
+	if ((p = switch_channel_get_variable(channel, "record_sample_rate"))) {
 		int tmp = 0;
 
-		tmp = atoi(vval);
+		tmp = atoi(p);
 
 		if (switch_is_valid_rate(tmp)) {
 			fh->samplerate = tmp;
 		}
 	}
 
-
-	if ((vval = switch_channel_get_variable(channel, "record_fill_cng"))) {
-
-		if (!strcasecmp(vval, "true")) {
-			fill_cng = 1400;
-		} else {
-			if ((fill_cng = atoi(vval)) < 0) {
-				fill_cng = 0;
-			}
-		}
-	}
-
-
-	if ((vval = switch_channel_get_variable(channel, "record_waste_resources"))) {
-
-		if (!strcasecmp(vval, "true")) {
-			waste_resources = 1400;
-		} else {
-			if ((waste_resources = atoi(vval)) < 0) {
-				waste_resources = 0;
-			}
-		}
-	}
-
-
-	if (fill_cng || waste_resources) {
-		if (switch_core_codec_init(&write_codec,
-								   "L16",
-								   NULL,
-								   NULL,
-								   read_impl.actual_samples_per_second,
-								   read_impl.microseconds_per_packet / 1000,
-								   read_impl.number_of_channels,
-								   SWITCH_CODEC_FLAG_ENCODE | SWITCH_CODEC_FLAG_DECODE, NULL,
-								   switch_core_session_get_pool(session)) == SWITCH_STATUS_SUCCESS) {
-			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Raw Codec Activated, ready to waste resources!\n");
-			write_frame.data = write_buf;
-			write_frame.buflen = sizeof(write_buf);
-			write_frame.datalen = read_impl.decoded_bytes_per_packet;
-			write_frame.samples = write_frame.datalen / 2;
-			write_frame.codec = &write_codec;
-		} else {
-			arg_recursion_check_stop(args);
-			return SWITCH_STATUS_FALSE;
-		}
-	}
-
 	if (!strstr(file, SWITCH_URL_SEPARATOR)) {
 		char *ext;
 
@@ -534,10 +491,17 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_file(switch_core_session_t *se
 
 		file_flags |= SWITCH_FILE_FLAG_VIDEO;
 		switch_channel_set_flag_recursive(channel, CF_VIDEO_DECODED_READ);
-		fh->mm.fps = switch_core_media_get_video_fps(session);
+		switch_core_session_request_video_refresh(session);
+		if (switch_core_session_wait_for_video_input_params(session, 10000) != SWITCH_STATUS_SUCCESS) {
+			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Unable to establish inbound video stream\n");
+			switch_core_session_reset(session, SWITCH_TRUE, SWITCH_TRUE);
+			arg_recursion_check_stop(args);
+			return SWITCH_STATUS_GENERR;
+		}
 		switch_core_media_get_vid_params(session, &vid_params);
 		fh->mm.vw = vid_params.width;
 		fh->mm.vh = vid_params.height;
+		fh->mm.fps = vid_params.fps;
 	}
 
 	if (switch_core_file_open(fh, file, fh->channels, read_impl.actual_samples_per_second, file_flags, NULL) != SWITCH_STATUS_SUCCESS) {
@@ -547,8 +511,96 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_file(switch_core_session_t *se
 		return SWITCH_STATUS_GENERR;
 	}
 	
-	if (switch_core_file_has_video(fh)) {
-		switch_channel_set_flag(channel, CF_VIDEO_ECHO);
+
+	if ((p = switch_channel_get_variable(channel, "record_fill_cng")) || (fh->params && (p = switch_event_get_header(fh->params, "record_fill_cng")))) {
+		if (!strcasecmp(p, "true")) {
+			fill_cng = 1400;
+		} else {
+			if ((fill_cng = atoi(p)) < 0) {
+				fill_cng = 0;
+			}
+		}
+	}
+
+	if ((p = switch_channel_get_variable(channel, "record_indication")) || (fh->params && (p = switch_event_get_header(fh->params, "record_indication")))) {
+		int flags = SWITCH_FILE_FLAG_READ | SWITCH_FILE_DATA_SHORT;
+		waste_resources = 1400;
+		
+		if (switch_core_file_open(&ind_fh,
+								  p,
+								  read_impl.number_of_channels,
+								  read_impl.actual_samples_per_second, flags, NULL) != SWITCH_STATUS_SUCCESS) {
+			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Indication file invalid\n");
+		}
+	}
+
+	if ((p = switch_channel_get_variable(channel, "record_waste_resources")) || 
+		(fh->params && (p = switch_event_get_header(fh->params, "record_waste_resources")))) {
+
+		if (!strcasecmp(p, "true")) {
+			waste_resources = 1400;
+		} else {
+			if ((waste_resources = atoi(p)) < 0) {
+				waste_resources = 0;
+			}
+		}
+	}
+	
+	if (fill_cng || waste_resources) {
+		if (switch_core_codec_init(&write_codec,
+								   "L16",
+								   NULL,
+								   NULL,
+								   read_impl.actual_samples_per_second,
+								   read_impl.microseconds_per_packet / 1000,
+								   read_impl.number_of_channels,
+								   SWITCH_CODEC_FLAG_ENCODE | SWITCH_CODEC_FLAG_DECODE, NULL,
+								   switch_core_session_get_pool(session)) == SWITCH_STATUS_SUCCESS) {
+			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "Raw Codec Activated, ready to waste resources!\n");
+			write_frame.data = write_buf;
+			write_frame.buflen = sizeof(write_buf);
+			write_frame.datalen = read_impl.decoded_bytes_per_packet;
+			write_frame.samples = write_frame.datalen / 2;
+			write_frame.codec = &write_codec;
+		} else {
+			arg_recursion_check_stop(args);
+			return SWITCH_STATUS_FALSE;
+		}
+	}
+
+
+
+	if (switch_core_file_has_video(fh, SWITCH_TRUE)) {
+		switch_core_session_request_video_refresh(session);	
+		
+		if ((p = switch_channel_get_variable(channel, "record_play_video")) || 
+
+			(fh->params && (p = switch_event_get_header(fh->params, "record_play_video")))) {
+
+			video_file = switch_core_session_strdup(session, p);
+			
+			if (switch_core_file_open(&vfh, video_file, fh->channels, 
+									  read_impl.actual_samples_per_second, vid_play_file_flags, NULL) != SWITCH_STATUS_SUCCESS) {
+				memset(&vfh, 0, sizeof(vfh));
+				switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_WARNING, "Failure opening video playback file.\n");
+			}
+
+			if (switch_core_file_has_video(&vfh, SWITCH_TRUE)) {
+				switch_core_media_set_video_file(session, &vfh, SWITCH_RW_WRITE);
+				switch_core_media_gen_key_frame(session);
+			} else {
+				switch_core_file_close(&vfh);
+				switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_WARNING, "Video playback file does not contain video\n");
+				memset(&vfh, 0, sizeof(vfh));
+			}
+		}
+
+		if (!switch_test_flag(&vfh, SWITCH_FILE_OPEN)) { 
+			echo_on = 1;
+			switch_channel_set_flag_recursive(channel, CF_VIDEO_DECODED_READ);
+			switch_channel_set_flag(channel, CF_VIDEO_ECHO);
+		}
+		
 		switch_core_media_set_video_file(session, fh, SWITCH_RW_READ);
 	} else if (switch_channel_test_flag(channel, CF_VIDEO)) {
 		switch_channel_set_flag(channel, CF_VIDEO_BLANK);
@@ -557,7 +609,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_file(switch_core_session_t *se
 	if (sample_start > 0) {
 		uint32_t pos = 0;
 		switch_core_file_seek(fh, &pos, sample_start, SEEK_SET);
-		switch_clear_flag(fh, SWITCH_FILE_SEEK);
+		switch_clear_flag_locked(fh, SWITCH_FILE_SEEK);
 		fh->samples = 0;
 	}
 
@@ -568,42 +620,43 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_file(switch_core_session_t *se
 
 	restart_limit_on_dtmf = switch_true(switch_channel_get_variable(channel, "record_restart_limit_on_dtmf"));
 
-	if ((p = switch_channel_get_variable(channel, "RECORD_TITLE"))) {
+	if ((p = switch_channel_get_variable(channel, "record_title")) || (fh->params && (p = switch_event_get_header(fh->params, "record_title")))) {
 		vval = switch_core_session_strdup(session, p);
 		switch_core_file_set_string(fh, SWITCH_AUDIO_COL_STR_TITLE, vval);
-		switch_channel_set_variable(channel, "RECORD_TITLE", NULL);
+		switch_channel_set_variable(channel, "record_title", NULL);
 	}
 
-	if ((p = switch_channel_get_variable(channel, "RECORD_COPYRIGHT"))) {
+	if ((p = switch_channel_get_variable(channel, "record_copyright")) || (fh->params && (p = switch_event_get_header(fh->params, "record_copyright")))) {
 		vval = switch_core_session_strdup(session, p);
 		switch_core_file_set_string(fh, SWITCH_AUDIO_COL_STR_COPYRIGHT, vval);
-		switch_channel_set_variable(channel, "RECORD_COPYRIGHT", NULL);
+		switch_channel_set_variable(channel, "record_copyright", NULL);
 	}
 
-	if ((p = switch_channel_get_variable(channel, "RECORD_SOFTWARE"))) {
+	if ((p = switch_channel_get_variable(channel, "record_software")) || (fh->params && (p = switch_event_get_header(fh->params, "record_software")))) {
 		vval = switch_core_session_strdup(session, p);
 		switch_core_file_set_string(fh, SWITCH_AUDIO_COL_STR_SOFTWARE, vval);
-		switch_channel_set_variable(channel, "RECORD_SOFTWARE", NULL);
+		switch_channel_set_variable(channel, "record_software", NULL);
 	}
 
-	if ((p = switch_channel_get_variable(channel, "RECORD_ARTIST"))) {
+	if ((p = switch_channel_get_variable(channel, "record_artist")) || (fh->params && (p = switch_event_get_header(fh->params, "record_artist")))) {
 		vval = switch_core_session_strdup(session, p);
 		switch_core_file_set_string(fh, SWITCH_AUDIO_COL_STR_ARTIST, vval);
-		switch_channel_set_variable(channel, "RECORD_ARTIST", NULL);
+		switch_channel_set_variable(channel, "record_artist", NULL);
 	}
 
-	if ((p = switch_channel_get_variable(channel, "RECORD_COMMENT"))) {
+	if ((p = switch_channel_get_variable(channel, "record_comment")) || (fh->params && (p = switch_event_get_header(fh->params, "record_comment")))) {
 		vval = switch_core_session_strdup(session, p);
 		switch_core_file_set_string(fh, SWITCH_AUDIO_COL_STR_COMMENT, vval);
-		switch_channel_set_variable(channel, "RECORD_COMMENT", NULL);
+		switch_channel_set_variable(channel, "record_comment", NULL);
 	}
 
-	if ((p = switch_channel_get_variable(channel, "RECORD_DATE"))) {
+	if ((p = switch_channel_get_variable(channel, "record_date")) || (fh->params && (p = switch_event_get_header(fh->params, "record_date")))) {
 		vval = switch_core_session_strdup(session, p);
 		switch_core_file_set_string(fh, SWITCH_AUDIO_COL_STR_DATE, vval);
-		switch_channel_set_variable(channel, "RECORD_DATE", NULL);
+		switch_channel_set_variable(channel, "record_date", NULL);
 	}
 
+
 	switch_channel_set_variable(channel, "silence_hits_exhausted", "false");
 
 	if (!asis) {
@@ -623,10 +676,14 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_file(switch_core_session_t *se
 			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
 							  "Raw Codec Activation Failed %s@%uhz %u channels %dms\n", codec_name, fh->samplerate,
 							  fh->channels, read_impl.microseconds_per_packet / 1000);
-			if (switch_core_file_has_video(fh)) {
-				switch_channel_clear_flag(channel, CF_VIDEO_ECHO);
-				switch_channel_clear_flag_recursive(channel, CF_VIDEO_DECODED_READ);
+			if (switch_core_file_has_video(fh, SWITCH_FALSE)) {
+				if (echo_on) {
+					switch_channel_clear_flag(channel, CF_VIDEO_ECHO);
+					switch_channel_clear_flag_recursive(channel, CF_VIDEO_DECODED_READ);
+					echo_on = 0;
+				}
 				switch_core_media_set_video_file(session, NULL, SWITCH_RW_READ);
+				switch_core_media_set_video_file(session, NULL, SWITCH_RW_WRITE);
 			}
 			switch_channel_clear_flag(channel, CF_VIDEO_BLANK);
 			switch_core_file_close(fh);
@@ -748,6 +805,39 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_file(switch_core_session_t *se
 			}
 		}
 
+		if (switch_test_flag(&vfh, SWITCH_FILE_OPEN)) {
+			switch_core_file_command(&vfh, SCFC_FLUSH_AUDIO);
+
+			if (switch_test_flag(&vfh, SWITCH_FILE_FLAG_VIDEO_EOF)) {
+
+				//switch_core_media_set_video_file(session, NULL, SWITCH_RW_WRITE);
+				
+				switch_core_media_lock_video_file(session, SWITCH_RW_WRITE);
+
+				switch_core_file_close(&vfh);
+				memset(&vfh, 0, sizeof(vfh));
+				
+				if (switch_core_file_open(&vfh, video_file, fh->channels, 
+										  read_impl.actual_samples_per_second, vid_play_file_flags, NULL) != SWITCH_STATUS_SUCCESS) {
+					memset(&vfh, 0, sizeof(vfh));
+					switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_WARNING, "Failure opening video playback file.\n");
+				}
+				
+				if (switch_core_file_has_video(&vfh, SWITCH_TRUE)) {
+					//switch_core_media_set_video_file(session, &vfh, SWITCH_RW_WRITE);
+					switch_core_media_gen_key_frame(session);
+				} else {
+					switch_core_media_set_video_file(session, NULL, SWITCH_RW_WRITE);
+					switch_core_file_close(&vfh);
+					switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_WARNING, "Video playback file does not contain video\n");
+					memset(&vfh, 0, sizeof(vfh));
+				}
+
+				switch_core_media_unlock_video_file(session, SWITCH_RW_WRITE);
+			}
+
+		}
+
 		if (!asis && fh->thresh) {
 			int16_t *fdata = (int16_t *) read_frame->data;
 			uint32_t samples = read_frame->datalen / sizeof(*fdata);
@@ -771,7 +861,20 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_file(switch_core_session_t *se
 			}
 		}
 
-		if (fill_cng) {
+		write_frame.datalen = read_impl.decoded_bytes_per_packet;
+		write_frame.samples = write_frame.datalen / 2;
+
+		if (switch_test_flag(&ind_fh, SWITCH_FILE_OPEN)) {
+			switch_size_t olen = write_frame.codec->implementation->samples_per_packet;
+			
+			if (switch_core_file_read(&ind_fh, write_frame.data, &olen) == SWITCH_STATUS_SUCCESS) {
+				write_frame.samples = olen;
+				write_frame.datalen = olen * 2 * ind_fh.channels; /* olen samples * 2 bytes * channel count */
+			} else {
+				switch_core_file_close(&ind_fh); /* read failed: close so the SWITCH_FILE_OPEN checks stop selecting the indication file */
+			}
+
+		} else if (fill_cng) {
 			switch_generate_sln_silence((int16_t *) write_frame.data, write_frame.samples, read_impl.number_of_channels, fill_cng);
 		} else if (waste_resources) {
 			switch_generate_sln_silence((int16_t *) write_frame.data, write_frame.samples, read_impl.number_of_channels, waste_resources);
@@ -790,8 +893,9 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_file(switch_core_session_t *se
 				break;
 			}
 		}
+		
 
-		if (waste_resources) {
+		if (waste_resources || switch_test_flag(&ind_fh, SWITCH_FILE_OPEN)) {
 			if (switch_core_session_write_frame(session, &write_frame, SWITCH_IO_FLAG_NONE, 0) != SWITCH_STATUS_SUCCESS) {
 				break;
 			}
@@ -801,11 +905,15 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_record_file(switch_core_session_t *se
 	if (fill_cng || waste_resources) {
 		switch_core_codec_destroy(&write_codec);
 	}
-
-	if (switch_core_file_has_video(fh)) {
-		switch_channel_clear_flag(channel, CF_VIDEO_ECHO);
-		switch_channel_clear_flag_recursive(channel, CF_VIDEO_DECODED_READ);
+	
+	if (switch_core_file_has_video(fh, SWITCH_FALSE)) {
+		if (echo_on) {
+			switch_channel_clear_flag(channel, CF_VIDEO_ECHO);
+			switch_channel_clear_flag_recursive(channel, CF_VIDEO_DECODED_READ);
+			echo_on = 0;
+		}
 		switch_core_media_set_video_file(session, NULL, SWITCH_RW_READ);
+		switch_core_media_set_video_file(session, NULL, SWITCH_RW_WRITE);
 	}
 	switch_channel_clear_flag(channel, CF_VIDEO_BLANK);
 	switch_core_file_close(fh);
@@ -1292,7 +1400,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_play_file(switch_core_session_t *sess
 		switch_channel_set_private(channel, "__fh", fh);
 		switch_core_session_io_rwunlock(session);
 
-		if (switch_core_file_has_video(fh)) {
+		if (switch_core_file_has_video(fh, SWITCH_TRUE)) {
 			switch_core_media_set_video_file(session, fh, SWITCH_RW_WRITE);
 		}
 
@@ -1306,7 +1414,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_play_file(switch_core_session_t *sess
 			uint32_t pos = 0;
 			switch_core_file_seek(fh, &pos, 0, SEEK_SET);
 			switch_core_file_seek(fh, &pos, sample_start, SEEK_CUR);
-			switch_clear_flag(fh, SWITCH_FILE_SEEK);
+			switch_clear_flag_locked(fh, SWITCH_FILE_SEEK);
 		}
 
 		if (switch_core_file_get_string(fh, SWITCH_AUDIO_COL_STR_TITLE, &p) == SWITCH_STATUS_SUCCESS) {
@@ -1369,10 +1477,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_play_file(switch_core_session_t *sess
 				switch_channel_set_private(channel, "__fh", NULL);
 				switch_core_session_io_rwunlock(session);
 
-				if (switch_core_file_has_video(fh)) {
-					//switch_channel_clear_flag_recursive(channel, CF_VIDEO_DECODED_READ);
-					switch_core_media_set_video_file(session, NULL, SWITCH_RW_WRITE);
-				}
+				switch_core_media_set_video_file(session, NULL, SWITCH_RW_WRITE);
 
 				switch_core_file_close(fh);
 
@@ -1396,10 +1501,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_play_file(switch_core_session_t *sess
 				switch_channel_set_private(channel, "__fh", NULL);
 				switch_core_session_io_rwunlock(session);
 
-				if (switch_core_file_has_video(fh)) {
-					//switch_channel_clear_flag_recursive(channel, CF_VIDEO_DECODED_READ);
-					switch_core_media_set_video_file(session, NULL, SWITCH_RW_WRITE);
-				}
+				switch_core_media_set_video_file(session, NULL, SWITCH_RW_WRITE);
 				switch_core_file_close(fh);
 
 				switch_core_session_reset(session, SWITCH_TRUE, SWITCH_FALSE);
@@ -1426,10 +1528,9 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_play_file(switch_core_session_t *sess
 				switch_core_session_io_write_lock(session);
 				switch_channel_set_private(channel, "__fh", NULL);
 				switch_core_session_io_rwunlock(session);
-				if (switch_core_file_has_video(fh)) {
-					//switch_channel_clear_flag_recursive(channel, CF_VIDEO_DECODED_READ);
-					switch_core_media_set_video_file(session, NULL, SWITCH_RW_WRITE);
-				}
+
+				switch_core_media_set_video_file(session, NULL, SWITCH_RW_WRITE);
+
 				switch_core_file_close(fh);
 				switch_core_session_reset(session, SWITCH_TRUE, SWITCH_FALSE);
 				status = SWITCH_STATUS_GENERR;
@@ -1589,7 +1690,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_play_file(switch_core_session_t *sess
 				if (!switch_test_flag(fh, SWITCH_FILE_NATIVE)) {
 					olen /= 2;
 				}
-				switch_set_flag(fh, SWITCH_FILE_BREAK_ON_CHANGE);
+				switch_set_flag_locked(fh, SWITCH_FILE_BREAK_ON_CHANGE);
 
 				if ((rstatus = switch_core_file_read(fh, abuf, &olen)) == SWITCH_STATUS_BREAK) {
 					continue;
@@ -1651,7 +1752,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_play_file(switch_core_session_t *sess
 			if (switch_test_flag(fh, SWITCH_FILE_SEEK)) {
 				/* file position has changed flush the buffer */
 				switch_buffer_zero(fh->audio_buffer);
-				switch_clear_flag(fh, SWITCH_FILE_SEEK);
+				switch_clear_flag_locked(fh, SWITCH_FILE_SEEK);
 			}
 
 
@@ -1733,11 +1834,11 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_play_file(switch_core_session_t *sess
 
 					if (args && (args->read_frame_callback)) {
 						int ok = 1;
-						switch_set_flag(fh, SWITCH_FILE_CALLBACK);
+						switch_set_flag_locked(fh, SWITCH_FILE_CALLBACK);
 						if ((status = args->read_frame_callback(session, read_frame, args->user_data)) != SWITCH_STATUS_SUCCESS) {
 							ok = 0;
 						}
-						switch_clear_flag(fh, SWITCH_FILE_CALLBACK);
+						switch_clear_flag_locked(fh, SWITCH_FILE_CALLBACK);
 						if (!ok) {
 							break;
 						}
@@ -1838,11 +1939,8 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_play_file(switch_core_session_t *sess
 		switch_core_session_io_write_lock(session);
 		switch_channel_set_private(channel, "__fh", NULL);
 		switch_core_session_io_rwunlock(session);
-
-		if (switch_core_file_has_video(fh)) {
-			//switch_channel_clear_flag_recursive(channel, CF_VIDEO_DECODED_READ);
-			switch_core_media_set_video_file(session, NULL, SWITCH_RW_WRITE);
-		}
+		
+		switch_core_media_set_video_file(session, NULL, SWITCH_RW_WRITE);
 		switch_core_file_close(fh);
 
 		if (fh->audio_buffer) {
@@ -2005,6 +2103,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_wait_for_silence(switch_core_session_
   end:
 
 	if (abuf) {
+
 		switch_core_file_close(&fh);
 		free(abuf);
 	}
diff --git a/src/switch_jitterbuffer.c b/src/switch_jitterbuffer.c
index 649af68b91..4bfca88821 100644
--- a/src/switch_jitterbuffer.c
+++ b/src/switch_jitterbuffer.c
@@ -37,7 +37,7 @@
 #define PERIOD_LEN 250
 #define MAX_FRAME_PADDING 2
 #define MAX_MISSING_SEQ 20
-#define jb_debug(_jb, _level, _format, ...) if (_jb->debug_level >= _level) switch_log_printf(SWITCH_CHANNEL_SESSION_LOG_CLEAN(_jb->session), SWITCH_LOG_ALERT, "JB:%p:%s lv:%d ln:%d sz:%u/%u/%u/%u c:%u %u/%u/%u/%u %.2f%% ->" _format, (void *) _jb, (jb->type == SJB_AUDIO ? "aud" : "vid"), _level, __LINE__,  _jb->min_frame_len, _jb->max_frame_len, _jb->frame_len, _jb->complete_frames, _jb->period_count, _jb->consec_good_count, _jb->period_good_count, _jb->consec_miss_count, _jb->period_miss_count, _jb->period_miss_pct, __VA_ARGS__)
+#define jb_debug(_jb, _level, _format, ...) if (_jb->debug_level >= _level) switch_log_printf(SWITCH_CHANNEL_SESSION_LOG_CLEAN(_jb->session), SWITCH_LOG_ALERT, "JB:%p:%s lv:%d ln:%.4d sz:%.3u/%.3u/%.3u/%.3u c:%.3u %.3u/%.3u/%.3u/%.3u %.2f%% ->" _format, (void *) _jb, (jb->type == SJB_AUDIO ? "aud" : "vid"), _level, __LINE__,  _jb->min_frame_len, _jb->max_frame_len, _jb->frame_len, _jb->complete_frames, _jb->period_count, _jb->consec_good_count, _jb->period_good_count, _jb->consec_miss_count, _jb->period_miss_count, _jb->period_miss_pct, __VA_ARGS__)
 
 //const char *TOKEN_1 = "ONE";
 //const char *TOKEN_2 = "TWO";
@@ -585,19 +585,35 @@ static inline void add_node(switch_jb_t *jb, switch_rtp_packet_t *packet, switch
 	jb_debug(jb, (packet->header.m ? 1 : 2), "PUT packet last_ts:%u ts:%u seq:%u%s\n", 
 			 ntohl(jb->highest_wrote_ts), ntohl(node->packet.header.ts), ntohs(node->packet.header.seq), packet->header.m ? " <MARK>" : "");
 
-	if (jb->write_init && jb->type == SJB_VIDEO && ((abs(((int)ntohs(packet->header.seq) - ntohs(jb->highest_wrote_seq))) >= jb->max_frame_len) || 
-						   (abs((int)((int64_t)ntohl(node->packet.header.ts) - (int64_t)ntohl(jb->highest_wrote_ts))) > (900000 * 5)))) {
-		jb_debug(jb, 2, "CHANGE DETECTED, PUNT %u\n", abs(((int)ntohs(packet->header.seq) - ntohs(jb->highest_wrote_seq))));
-		switch_jb_reset(jb);
+	if (jb->write_init && jb->type == SJB_VIDEO) {
+		int seq_diff = 0, ts_diff = 0;
+
+		if (ntohs(jb->highest_wrote_seq) > (USHRT_MAX - 100) && ntohs(packet->header.seq) < 100) {
+			seq_diff = (USHRT_MAX - ntohs(jb->highest_wrote_seq)) + ntohs(packet->header.seq);
+		} else {
+			seq_diff = abs(((int)ntohs(packet->header.seq) - ntohs(jb->highest_wrote_seq)));
+		}
+		
+		if (ntohl(jb->highest_wrote_ts) > (UINT_MAX - 1000) && ntohl(node->packet.header.ts) < 1000) {
+			ts_diff = (UINT_MAX - ntohl(jb->highest_wrote_ts)) + ntohl(node->packet.header.ts); /* ts wrapped: measure from the last written ts across the 32-bit rollover, mirroring seq_diff */
+		} else {
+			ts_diff = abs((int)((int64_t)ntohl(node->packet.header.ts) - (int64_t)ntohl(jb->highest_wrote_ts)));
+		}
+		
+		if (((seq_diff >= jb->max_frame_len) || (ts_diff > (900000 * 5)))) {
+			jb_debug(jb, 2, "CHANGE DETECTED, PUNT %u\n", abs(((int)ntohs(packet->header.seq) - ntohs(jb->highest_wrote_seq))));
+			switch_jb_reset(jb);
+		}
 	}
  
 	if (!jb->write_init || ntohs(packet->header.seq) > ntohs(jb->highest_wrote_seq) || 
-		(ntohs(jb->highest_wrote_seq) > USHRT_MAX - 10 && ntohs(packet->header.seq) <= 10) ) {
+		(ntohs(jb->highest_wrote_seq) > USHRT_MAX - 100 && ntohs(packet->header.seq) < 100) ) {
 		jb->highest_wrote_seq = packet->header.seq;
 	}
 
 	if (jb->type == SJB_VIDEO) {
-		if (jb->write_init && htons(packet->header.seq) >= htons(jb->highest_wrote_seq) && (ntohl(node->packet.header.ts) > ntohl(jb->highest_wrote_ts))) {
+		if (jb->write_init && ((htons(packet->header.seq) >= htons(jb->highest_wrote_seq) && (ntohl(node->packet.header.ts) > ntohl(jb->highest_wrote_ts))) ||
+							   (ntohl(jb->highest_wrote_ts) > (UINT_MAX - 1000) && ntohl(node->packet.header.ts) < 1000))) {
 			jb->complete_frames++;
 			jb_debug(jb, 2, "WRITE frame ts: %u complete=%u/%u n:%u\n", ntohl(node->packet.header.ts), jb->complete_frames , jb->frame_len, jb->visible_nodes);
 			jb->highest_wrote_ts = packet->header.ts;
@@ -669,7 +685,9 @@ static inline switch_status_t jb_next_packet_by_seq(switch_jb_t *jb, switch_jb_n
 	}
 
 	if (!jb->target_seq) {
-		if ((node = jb_find_lowest_seq(jb, 0))) {
+		if ((node = switch_core_inthash_find(jb->node_hash, jb->target_seq))) {
+			jb_debug(jb, 2, "FOUND rollover seq: %u\n", ntohs(jb->target_seq));
+		} else if ((node = jb_find_lowest_seq(jb, 0))) {
 			jb_debug(jb, 2, "No target seq using seq: %u as a starting point\n", ntohs(node->packet.header.seq));
 		} else {
 			jb_debug(jb, 1, "%s", "No nodes available....\n");
diff --git a/src/switch_loadable_module.c b/src/switch_loadable_module.c
index b49af923c4..bc4289ac37 100644
--- a/src/switch_loadable_module.c
+++ b/src/switch_loadable_module.c
@@ -1876,7 +1876,11 @@ SWITCH_DECLARE(switch_status_t) switch_loadable_module_init(switch_bool_t autolo
 	switch_loadable_module_load_module("", "CORE_SOFTTIMER_MODULE", SWITCH_FALSE, &err);
 	switch_loadable_module_load_module("", "CORE_PCM_MODULE", SWITCH_FALSE, &err);
 	switch_loadable_module_load_module("", "CORE_SPEEX_MODULE", SWITCH_FALSE, &err);
-
+#ifdef SWITCH_HAVE_YUV
+#ifdef SWITCH_HAVE_VPX
+	switch_loadable_module_load_module("", "CORE_VPX_MODULE", SWITCH_FALSE, &err);
+#endif
+#endif
 
 	if ((xml = switch_xml_open_cfg(cf, &cfg, NULL))) {
 		switch_xml_t mods, ld;
diff --git a/src/switch_rtp.c b/src/switch_rtp.c
index 5646b8e86e..28b53ff23b 100644
--- a/src/switch_rtp.c
+++ b/src/switch_rtp.c
@@ -1942,7 +1942,10 @@ static int check_rtcp_and_ice(switch_rtp_t *rtp_session)
 	int rate = 0, nack_ttl = 0;
 	uint32_t cur_nack[MAX_NACK] = { 0 };
 
-	if (rtp_session->flags[SWITCH_RTP_FLAG_AUTO_CNG] && rtp_session->send_msg.header.ts && rtp_session->cng_pt != INVALID_PT &&
+	if (!rtp_session->flags[SWITCH_RTP_FLAG_UDPTL] &&
+		rtp_session->flags[SWITCH_RTP_FLAG_AUTO_CNG] &&
+		rtp_session->send_msg.header.ts &&
+		rtp_session->cng_pt != INVALID_PT &&
 		(rtp_session->timer.samplecount - rtp_session->last_write_samplecount >= rtp_session->samples_per_interval * 60)) {
 		uint8_t data[10] = { 0 };
 		switch_frame_flag_t frame_flags = SFF_NONE;
@@ -2730,6 +2733,10 @@ SWITCH_DECLARE(switch_status_t) switch_rtp_set_local_address(switch_rtp_t *rtp_s
 
 SWITCH_DECLARE(void) switch_rtp_set_max_missed_packets(switch_rtp_t *rtp_session, uint32_t max)
 {
+	if (!switch_rtp_ready(rtp_session) || rtp_session->flags[SWITCH_RTP_FLAG_UDPTL]) {
+		return;
+	}
+
 	if (rtp_session->missed_count >= max) {
 		
 		switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(rtp_session->session), SWITCH_LOG_WARNING,
@@ -2835,6 +2842,9 @@ SWITCH_DECLARE(switch_status_t) switch_rtp_udptl_mode(switch_rtp_t *rtp_session)
 		switch_rtp_clear_flag(rtp_session, SWITCH_RTP_FLAG_USE_TIMER);
 	}
 
+	rtp_session->missed_count = 0;
+	rtp_session->max_missed_packets = 0;
+
 	rtp_session->flags[SWITCH_RTP_FLAG_ENABLE_RTCP] = 0;
 
 	if (rtp_session->rtcp_sock_input) {
@@ -5152,7 +5162,7 @@ static switch_status_t read_rtp_packet(switch_rtp_t *rtp_session, switch_size_t
 
 	if (*bytes) {
 		b = (unsigned char *) &rtp_session->recv_msg;
-
+		
 		/* version 2 probably rtp, zrtp cookie present means zrtp */
 		rtp_session->has_rtp = (rtp_session->recv_msg.header.version == 2 || ntohl(*(int *)(b+4)) == ZRTP_MAGIC_COOKIE);
 
@@ -5261,6 +5271,7 @@ static switch_status_t read_rtp_packet(switch_rtp_t *rtp_session, switch_size_t
 			if (rtp_session->has_rtcp) {
 				*flags |= SFF_RTCP;
 
+#ifdef ENABLE_SRTP
 				if (rtp_session->flags[SWITCH_RTP_FLAG_SECURE_RECV]) {
 					int sbytes = (int) *bytes;
 					err_status_t stat = 0;
@@ -5275,7 +5286,7 @@ static switch_status_t read_rtp_packet(switch_rtp_t *rtp_session, switch_size_t
 					
 					*bytes = sbytes;
 				}
-
+#endif
 				return SWITCH_STATUS_SUCCESS;
 			}
 		}
@@ -6268,11 +6279,6 @@ static int rtp_common_read(switch_rtp_t *rtp_session, switch_payload_t *payload_
 				pt = 20000;
 			}
 			
-
-			if ((io_flags & SWITCH_IO_FLAG_NOBLOCK)) {
-				pt = 0;
-			}
-
 			if (rtp_session->flags[SWITCH_RTP_FLAG_VIDEO] && !rtp_session->flags[SWITCH_RTP_FLAG_PROXY_MEDIA]) {
 				pt = 200000;
 			}
@@ -6282,6 +6288,10 @@ static int rtp_common_read(switch_rtp_t *rtp_session, switch_payload_t *payload_
 					pt = 0;
 				}
 			}
+
+			if ((io_flags & SWITCH_IO_FLAG_NOBLOCK)) {
+				pt = 0;
+			}
 			
 			poll_status = switch_poll(rtp_session->read_pollfd, 1, &fdr, pt);
 
@@ -6319,7 +6329,8 @@ static int rtp_common_read(switch_rtp_t *rtp_session, switch_payload_t *payload_
 					goto end;
 				}
 
-				if (rtp_session->max_missed_packets && read_loops == 1 && !rtp_session->flags[SWITCH_RTP_FLAG_VIDEO]) {
+				if (rtp_session->max_missed_packets && read_loops == 1 && !rtp_session->flags[SWITCH_RTP_FLAG_VIDEO] && 
+					!rtp_session->flags[SWITCH_RTP_FLAG_UDPTL]) {
 					if (bytes && status == SWITCH_STATUS_SUCCESS) {
 						rtp_session->missed_count = 0;
 					} else if (++rtp_session->missed_count >= rtp_session->max_missed_packets) {
@@ -6357,10 +6368,16 @@ static int rtp_common_read(switch_rtp_t *rtp_session, switch_payload_t *payload_
 			}
 			poll_loop = 0;
 		} else {
+
+			if (!switch_rtp_ready(rtp_session)) {
+				ret = -1;
+				goto end;
+			}
+			
 			if (!SWITCH_STATUS_IS_BREAK(poll_status) && poll_status != SWITCH_STATUS_TIMEOUT) {
 				char tmp[128] = "";
 				switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(rtp_session->session), SWITCH_LOG_ERROR, "Poll failed with error: %d [%s]\n",
-					poll_status, switch_strerror_r(poll_status, tmp, sizeof(tmp)));
+								  poll_status, switch_strerror_r(poll_status, tmp, sizeof(tmp)));
 				ret = -1;
 				goto end;
 			}
@@ -6382,8 +6399,8 @@ static int rtp_common_read(switch_rtp_t *rtp_session, switch_payload_t *payload_
 				ret = -1;
 				goto end;
 			}
-
-		
+			
+			
 			if ((!(io_flags & SWITCH_IO_FLAG_NOBLOCK)) && 
 				(rtp_session->dtmf_data.out_digit_dur == 0) && !rtp_session->flags[SWITCH_RTP_FLAG_ENABLE_RTCP]) {
 				return_cng_frame();
@@ -6491,7 +6508,6 @@ static int rtp_common_read(switch_rtp_t *rtp_session, switch_payload_t *payload_
 							switch_core_timer_sync(&rtp_session->timer);
 							reset_jitter_seq(rtp_session);
 						}
-						
 						goto recvfrom;
 					}
 				}
@@ -7067,11 +7083,13 @@ SWITCH_DECLARE(switch_status_t) switch_rtp_zerocopy_read_frame(switch_rtp_t *rtp
 	if (bytes < 0) {
 		frame->datalen = 0;
 		return bytes == -2 ? SWITCH_STATUS_TIMEOUT : SWITCH_STATUS_GENERR;
-	} else if (bytes < rtp_header_len) {
-		frame->datalen = 0;
-		return SWITCH_STATUS_BREAK;
-	} else {
-		bytes -= rtp_header_len;
+	} else if (!rtp_session->flags[SWITCH_RTP_FLAG_UDPTL]) {
+		if (bytes < rtp_header_len) {
+			frame->datalen = 0;
+			return SWITCH_STATUS_BREAK;
+		} else {
+			bytes -= rtp_header_len;
+		}
 	}
 
 	frame->datalen = bytes;
@@ -7188,8 +7206,9 @@ static int rtp_common_write(switch_rtp_t *rtp_session,
 		if ((rtp_session->rtp_bugs & RTP_BUG_NEVER_SEND_MARKER)) {
 			m = 0;
 		} else {
-			if ((!rtp_session->flags[SWITCH_RTP_FLAG_RESET] && (rtp_session->ts - rtp_session->last_write_ts > rtp_session->samples_per_interval * 10))
-				|| rtp_session->ts == rtp_session->samples_per_interval) {
+			if (!rtp_session->flags[SWITCH_RTP_FLAG_UDPTL] &&
+				((!rtp_session->flags[SWITCH_RTP_FLAG_RESET] && (rtp_session->ts - rtp_session->last_write_ts > rtp_session->samples_per_interval * 10))
+				|| rtp_session->ts == rtp_session->samples_per_interval)) {
 				m++;
 			}
 			
@@ -7237,7 +7256,12 @@ static int rtp_common_write(switch_rtp_t *rtp_session,
 		 */
 
 		if (!rtp_session->ts_norm.ts) {
-			rtp_session->ts_norm.ts = (uint32_t) rand() % 1000000 + 1;
+			if (switch_rtp_test_flag(rtp_session, SWITCH_RTP_FLAG_GEN_TS_DELTA)) {
+				rtp_session->ts_norm.ts = (uint32_t) rand() % 1000000 + 1;
+			} else {
+				switch_core_timer_sync(&rtp_session->timer);
+				rtp_session->ts_norm.ts = rtp_session->timer.samplecount;
+			}
 		}
 
 		if (!rtp_session->ts_norm.last_ssrc || send_msg->header.ssrc != rtp_session->ts_norm.last_ssrc) {
@@ -7273,7 +7297,6 @@ static int rtp_common_write(switch_rtp_t *rtp_session,
 		
 		rtp_session->ts_norm.last_frame = ntohl(send_msg->header.ts);
 		send_msg->header.ts = htonl(rtp_session->ts_norm.ts);
-
 	}
 
 	send_msg->header.ssrc = htonl(rtp_session->ssrc);
@@ -7396,8 +7419,8 @@ static int rtp_common_write(switch_rtp_t *rtp_session,
 			rtp_session->flags[SWITCH_RTP_FLAG_RESET] = 1;
 		}
 
-		if (!switch_rtp_ready(rtp_session) || rtp_session->sending_dtmf || !this_ts || 
-			(!rtp_session->flags[SWITCH_RTP_FLAG_RESET] && this_ts < rtp_session->last_write_ts)) {
+		if (!switch_rtp_ready(rtp_session) || rtp_session->sending_dtmf ||
+			(!rtp_session->flags[SWITCH_RTP_FLAG_RESET] && this_ts > rtp_session->one_second && this_ts < rtp_session->last_write_ts)) {
 			send = 0;
 		}
 	}
diff --git a/src/mod/codecs/mod_vpx/mod_vpx.c b/src/switch_vpx.c
similarity index 80%
rename from src/mod/codecs/mod_vpx/mod_vpx.c
rename to src/switch_vpx.c
index 4b70ad2809..dcb3e8f185 100644
--- a/src/mod/codecs/mod_vpx/mod_vpx.c
+++ b/src/switch_vpx.c
@@ -32,6 +32,8 @@
  */
 
 #include <switch.h>
+#ifdef SWITCH_HAVE_YUV
+#ifdef SWITCH_HAVE_VPX
 #include <vpx/vpx_encoder.h>
 #include <vpx/vpx_decoder.h>
 #include <vpx/vp8cx.h>
@@ -41,7 +43,6 @@
 #define SLICE_SIZE SWITCH_DEFAULT_VIDEO_SIZE
 #define KEY_FRAME_MIN_FREQ 250000
 
-
 /*	http://tools.ietf.org/html/draft-ietf-payload-vp8-10
 
 	The first octets after the RTP header are the VP8 payload descriptor, with the following structure.
@@ -96,26 +97,30 @@ typedef struct {
 	unsigned pid:3;
 } vp8_payload_descriptor_t;
 
-#ifdef WHAT_THEY_FUCKING_SAY
 typedef struct {
 	unsigned have_pid:1;
+	unsigned have_p_layer:1;
 	unsigned have_layer_ind:1;
-	unsigned have_ref_ind:1;
+	unsigned is_flexible:1;
 	unsigned start:1;
 	unsigned end:1;
 	unsigned have_ss:1;
-	unsigned have_su:1;
 	unsigned zero:1;
 } vp9_payload_descriptor_t;
 
-#else
 typedef struct {
-	unsigned dunno:6;
-	unsigned start:1;
-	unsigned key:1;
-} vp9_payload_descriptor_t;
-#endif
+	unsigned n_s:3;		/* N_S field of the SS octet; buffer_vp9_packets() loops i = 0..n_s */
+	unsigned y:1;		/* when set, buffer_vp9_packets() reads a width/height pair per layer */
+	unsigned g:1;		/* when set, buffer_vp9_packets() reads a count byte followed by vp9_n_g_t entries */
+	unsigned zero:3;	/* reserved padding: 3 bits so the fields total one octet (a named zero-width bit-field does not compile) */
+} vp9_ss_t;
 
+typedef struct {
+	unsigned t:3;
+	unsigned u:1;
+	unsigned r:2;
+	unsigned zero:2;
+} vp9_n_g_t;
 
 #else /* ELSE LITTLE */
 
@@ -128,24 +133,36 @@ typedef struct {
 	unsigned extended:1;
 } vp8_payload_descriptor_t;
 
-#ifdef WHAT_THEY_FUCKING_SAY
 typedef struct {
 	unsigned zero:1;
-	unsigned have_su:1;
 	unsigned have_ss:1;
 	unsigned end:1;
 	unsigned start:1;
-	unsigned have_ref_ind:1;
+	unsigned is_flexible:1;
 	unsigned have_layer_ind:1;
+	unsigned have_p_layer:1;
 	unsigned have_pid:1;
 } vp9_payload_descriptor_t;
-#else
+
 typedef struct {
-	unsigned key:1;
-	unsigned start:1;
-	unsigned dunno:6;
-} vp9_payload_descriptor_t;
-#endif
+	unsigned zero:3;	/* reserved padding: 3 bits so the fields total one octet (4 pushed n_s across the byte boundary) */
+	unsigned g:1;		/* when set, buffer_vp9_packets() reads a count byte followed by vp9_n_g_t entries */
+	unsigned y:1;		/* when set, buffer_vp9_packets() reads a width/height pair per layer */
+	unsigned n_s:3;		/* N_S field of the SS octet; buffer_vp9_packets() loops i = 0..n_s */
+} vp9_ss_t;
+
+typedef struct {
+	unsigned zero:2;
+	unsigned r:2;
+	unsigned u:1;
+	unsigned t:3;
+} vp9_n_g_t;
+
+typedef struct {
+	unsigned d:1;
+	unsigned s:3;
+	unsigned gof_idx:4;
+} vp9_layer_t;
 
 #endif
 
@@ -154,6 +171,37 @@ typedef union {
 	vp9_payload_descriptor_t vp9;
 } vpx_payload_descriptor_t;
 
+#define kMaxVp9NumberOfSpatialLayers 16
+
+typedef struct {	// VP9 state kept per codec context (member 'vp9' of struct vpx_context).
+	switch_bool_t has_received_sli;
+	uint8_t picture_id_sli;
+	switch_bool_t has_received_rpsi;
+	uint64_t picture_id_rpsi;
+	int16_t picture_id;  // Negative value means: do not emit a picture ID. NOTE(review): only referenced from commented-out code in consume_partition() in this patch.
+
+	switch_bool_t inter_pic_predicted;  // This layer frame depends on previously
+	                           // coded frame(s).
+	switch_bool_t flexible_mode;
+	switch_bool_t ss_data_available;
+
+	int tl0_pic_idx;  // Negative value means: do not emit tl0PicIdx.
+	uint8_t temporal_idx;
+	uint8_t spatial_idx;
+	switch_bool_t temporal_up_switch;
+	switch_bool_t inter_layer_predicted;  // Frame depends on the directly lower spatial
+	                             // layer frame.
+	uint8_t gof_idx;
+
+	// SS data (presumably the values carried by the vp9_ss_t header and its optional width/height list -- TODO confirm).
+	size_t num_spatial_layers;
+	switch_bool_t spatial_layer_resolution_present;
+	uint16_t width[kMaxVp9NumberOfSpatialLayers];	// one slot per spatial layer, capacity kMaxVp9NumberOfSpatialLayers
+	uint16_t height[kMaxVp9NumberOfSpatialLayers];	// one slot per spatial layer, capacity kMaxVp9NumberOfSpatialLayers
+	// GofInfoVP9 gof;
+} vp9_info_t;
+
+
 #ifdef _MSC_VER
 #pragma pack(pop, r1)
 #endif
@@ -186,20 +234,21 @@ static inline int IS_VP8_KEY_FRAME(uint8_t *data)
 	if (S && (PID == 0)) {
 		return __IS_VP8_KEY_FRAME(*data);
 	} else {
-		if (PID > 0) switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "PID: %d\n", PID);
+		// if (PID > 0) switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "PID: %d\n", PID);
 		return 0;
 	}
 }
 
-#define IS_VP9_KEY_FRAME(byte) ((byte) & 0x01)
+#define IS_VP9_KEY_FRAME(byte) ((((byte) & 0x10) == 0) && ((byte) & 0x02))
 #define IS_VP9_START_PKT(byte) ((byte) & 0x02)
 
 SWITCH_MODULE_LOAD_FUNCTION(mod_vpx_load);
-SWITCH_MODULE_DEFINITION(mod_vpx, mod_vpx_load, NULL, NULL);
+SWITCH_MODULE_DEFINITION(CORE_VPX_MODULE, mod_vpx_load, NULL, NULL);
 
 struct vpx_context {
 	switch_codec_t *codec;
 	int is_vp9;
+	vp9_info_t vp9;
 	int lossless;
 	vpx_codec_iface_t *encoder_interface;
 	vpx_codec_iface_t *decoder_interface;
@@ -251,20 +300,20 @@ static switch_status_t init_decoder(switch_codec_t *codec)
 
 	if (context->flags & SWITCH_CODEC_FLAG_DECODE && !context->decoder_init) {
 		vp8_postproc_cfg_t ppcfg;
-		
+
 		//if (context->decoder_init) {
 		//	vpx_codec_destroy(&context->decoder);
 		//	context->decoder_init = 0;
 		//}
 
-		cfg.threads = switch_core_cpu_count();
+		cfg.threads = 1;//(switch_core_cpu_count() > 1) ? 2 : 1;
 
 		if (!context->is_vp9) { // vp8 only
-			dec_flags = VPX_CODEC_USE_POSTPROC;
+			// dec_flags = VPX_CODEC_USE_POSTPROC;
 		}
 
 		if (vpx_codec_dec_init(&context->decoder, context->decoder_interface, &cfg, dec_flags) != VPX_CODEC_OK) {
-			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Codec init error: [%d:%s]\n", context->encoder.err, context->encoder.err_detail);
+			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Codec %s init error: [%d:%s]\n", vpx_codec_iface_name(context->decoder_interface), context->decoder.err, context->decoder.err_detail); /* report the decoder context that vpx_codec_dec_init() just failed on, not the encoder's */
 			return SWITCH_STATUS_FALSE;
 		}
 
@@ -330,7 +379,7 @@ static switch_status_t init_encoder(switch_codec_t *codec)
 
 	context->pkt = NULL;
 
-	switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(codec->session), SWITCH_LOG_DEBUG1, 
+	switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(codec->session), SWITCH_LOG_NOTICE,
 					  "VPX reset encoder picture from %dx%d to %dx%d %u BW\n", 
 					  config->g_w, config->g_h, context->codec_settings.video.width, context->codec_settings.video.height, context->bandwidth);
 
@@ -344,7 +393,7 @@ static switch_status_t init_encoder(switch_codec_t *codec)
 	config->rc_target_bitrate = context->bandwidth;
 	config->g_lag_in_frames = 0;
 	config->kf_max_dist = 2000;
-	config->g_threads = (cpus > 1) ? 2 : 1;
+	config->g_threads = 1;//(cpus > 1) ? 2 : 1;
 	
 	if (context->is_vp9) {
 		//config->rc_dropframe_thresh = 2;
@@ -358,6 +407,11 @@ static switch_status_t init_encoder(switch_codec_t *codec)
 			config->rc_max_quantizer = 63;
 		}
 
+		config->temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING;
+		config->ts_number_layers = 1;
+		config->ts_rate_decimator[0] = 1;
+		config->ts_periodicity = 1;
+		config->ts_layer_id[0] = 0;
 	} else {
 
 		// settings
@@ -425,8 +479,48 @@ static switch_status_t init_encoder(switch_codec_t *codec)
 		}
 	} else if (context->flags & SWITCH_CODEC_FLAG_ENCODE) {
 
+		// #define SHOW(field) fprintf(stderr, "    %-28s = %d\n", #field, config->field);
+
+#ifdef SHOW
+		fprintf(stderr, "Codec: %s\n", vpx_codec_iface_name(context->encoder_interface));
+
+		SHOW(g_usage);
+		SHOW(g_threads);
+		SHOW(g_profile);
+		SHOW(g_w);
+		SHOW(g_h);
+		SHOW(g_bit_depth);
+		SHOW(g_input_bit_depth);
+		SHOW(g_timebase.num);
+		SHOW(g_timebase.den);
+		SHOW(g_error_resilient);
+		SHOW(g_pass);
+		SHOW(g_lag_in_frames);
+		SHOW(rc_dropframe_thresh);
+		SHOW(rc_resize_allowed);
+		SHOW(rc_scaled_width);
+		SHOW(rc_scaled_height);
+		SHOW(rc_resize_up_thresh);
+		SHOW(rc_resize_down_thresh);
+		SHOW(rc_end_usage);
+		SHOW(rc_target_bitrate);
+		SHOW(rc_min_quantizer);
+		SHOW(rc_max_quantizer);
+		SHOW(rc_undershoot_pct);
+		SHOW(rc_overshoot_pct);
+		SHOW(rc_buf_sz);
+		SHOW(rc_buf_initial_sz);
+		SHOW(rc_buf_optimal_sz);
+		SHOW(rc_2pass_vbr_bias_pct);
+		SHOW(rc_2pass_vbr_minsection_pct);
+		SHOW(rc_2pass_vbr_maxsection_pct);
+		SHOW(kf_mode);
+		SHOW(kf_min_dist);
+		SHOW(kf_max_dist);
+#endif
+
 		if (vpx_codec_enc_init(&context->encoder, context->encoder_interface, config, 0 & VPX_CODEC_USE_OUTPUT_PARTITION) != VPX_CODEC_OK) {
-			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Codec init error: [%d:%s]\n", context->encoder.err, context->encoder.err_detail);
+			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Codec %s init error: [%d:%s]\n", vpx_codec_iface_name(context->encoder_interface), context->encoder.err, context->encoder.err_detail);
 			return SWITCH_STATUS_FALSE;
 		}
 		
@@ -566,7 +660,59 @@ static switch_status_t consume_partition(vpx_context_t *context, switch_frame_t
 
 	if (context->is_vp9) {
 		payload_descriptor->vp9.start = start;
-		payload_descriptor->vp9.key = key;
+
+		if (1) {
+			// payload_descriptor->vp9.have_p_layer = key; // key?
+			payload_descriptor->vp9.have_pid = 0;
+
+			if (start) {
+				// context->vp9.picture_id++;
+			}
+
+			// if (context->vp9.picture_id > 0x7f) { // todo rewind to 0
+			// 	*body++ = context->vp9.picture_id >> 8;
+			// 	*body++ = context->vp9.picture_id & 0xff;
+			// 	payload_size--;
+			// 	frame->datalen++;
+			// } else {
+			// 	*body++ = context->vp9.picture_id;
+			// }
+
+			// payload_size--;
+			// frame->datalen++;
+
+			if (key) {
+				vp9_ss_t *ss = (vp9_ss_t *)body;
+
+				payload_descriptor->vp9.have_ss = 1;
+				ss->n_s = 0;
+				ss->g = 0;
+				ss->y = 0;
+				ss->zero = 0;
+				body++;
+				payload_size--;
+				frame->datalen++;
+
+				if (0) { // y ?
+					uint16_t *w;
+					uint16_t *h;
+
+					ss->y = 1;
+
+					w = (uint16_t *)body;
+					body+=2;
+					h = (uint16_t *)body;
+					body+=2;
+
+					*w = (uint16_t)context->codec_settings.video.width;
+					*h = (uint16_t)context->codec_settings.video.height;
+
+					payload_size-= (ss->n_s + 1) * 4;
+					frame->datalen+= (ss->n_s + 1) * 4;
+				}
+			}
+		}
+
 	} else {
 		payload_descriptor->vp8.start = start;
 	}
@@ -583,9 +729,13 @@ static switch_status_t consume_partition(vpx_context_t *context, switch_frame_t
 		frame->m = 0;
 		return SWITCH_STATUS_MORE_DATA;
 	}
+
+	if (frame->m && context->is_vp9) {
+		payload_descriptor->vp9.end = 1;
+	}
 }
 
-static void reset_codec_encoder(switch_codec_t *codec)
+static switch_status_t reset_codec_encoder(switch_codec_t *codec)
 {
 	vpx_context_t *context = (vpx_context_t *)codec->private_info;
 
@@ -597,7 +747,7 @@ static void reset_codec_encoder(switch_codec_t *codec)
 	context->framecount = 0;
 	context->encoder_init = 0;
 	context->pkt = NULL;
-	init_encoder(codec);
+	return init_encoder(codec);
 }
 
 static switch_status_t switch_vpx_encode(switch_codec_t *codec, switch_frame_t *frame)
@@ -616,7 +766,9 @@ static switch_status_t switch_vpx_encode(switch_codec_t *codec, switch_frame_t *
 	}
 
 	if (context->need_encoder_reset != 0) {
-		reset_codec_encoder(codec);
+		if (reset_codec_encoder(codec) != SWITCH_STATUS_SUCCESS) {
+			return SWITCH_STATUS_FALSE;
+		}
 		context->need_encoder_reset = 0;
 	}
 
@@ -633,23 +785,27 @@ static switch_status_t switch_vpx_encode(switch_codec_t *codec, switch_frame_t *
 		context->codec_settings.video.height = height;
 		reset_codec_encoder(codec);
 		frame->flags |= SFF_PICTURE_RESET;
-		context->need_key_frame = 1;
+		context->need_key_frame = 3;
 	}
 
 	
 	if (!context->encoder_init) {
-		init_encoder(codec);
+		if (init_encoder(codec) != SWITCH_STATUS_SUCCESS) {
+			return SWITCH_STATUS_FALSE;
+		}
 	}
 
 	if (context->change_bandwidth) {
 		context->codec_settings.video.bandwidth = context->change_bandwidth;
 		context->change_bandwidth = 0;
-		init_encoder(codec);
+		if (init_encoder(codec) != SWITCH_STATUS_SUCCESS) {
+			return SWITCH_STATUS_FALSE;
+		}
 	}
 
 	now = switch_time_now();
 
-	if (context->need_key_frame != 0) {
+	if (context->need_key_frame > 0) {
 		// force generate a key frame
 
 		if (!context->last_key_frame || (now - context->last_key_frame) > KEY_FRAME_MIN_FREQ) {
@@ -669,7 +825,7 @@ static switch_status_t switch_vpx_encode(switch_codec_t *codec, switch_frame_t *
 	if ((err = vpx_codec_encode(&context->encoder,
 						 (vpx_image_t *) frame->img,
 						 pts,
-						 dur, 
+						 dur,
 						 vpx_flags,
 						 VPX_DL_REALTIME)) != VPX_CODEC_OK) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "VPX encode error %d:%s:%s\n",
@@ -774,27 +930,92 @@ static switch_status_t buffer_vp8_packets(vpx_context_t *context, switch_frame_t
 	return SWITCH_STATUS_SUCCESS;
 }
 
+// https://tools.ietf.org/id/draft-ietf-payload-vp9-01.txt
+
 static switch_status_t buffer_vp9_packets(vpx_context_t *context, switch_frame_t *frame)
 {
 	uint8_t *data = (uint8_t *)frame->data;
 	uint8_t *vp9  = (uint8_t *)frame->data;
+	vp9_payload_descriptor_t *desc = (vp9_payload_descriptor_t *)vp9;
 	int len = 0;
 
-	// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "%02x %02x %02x %02x %d %d\n", *data, *(data+1), *(data+2), *(data+3), frame->m, frame->datalen);
+	// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "%02x %02x %02x %02x m=%d start=%d end=%d len=%d\n", *data, *(data+1), *(data+2), *(data+3), frame->m, desc->start, desc->end, frame->datalen);
 
-	if (switch_buffer_inuse(context->vpx_packet_buffer)) { // middle packet
-		if (IS_VP9_START_PKT(*vp9)) {
-			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "got invalid vp9 packet, packet loss? resetting buffer\n");
-			switch_buffer_zero(context->vpx_packet_buffer);
+	vp9++;
+
+	if (desc->is_flexible) {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "VP9 Flexiable mode is not supported yet\n");
+		switch_buffer_zero(context->vpx_packet_buffer);
+		goto end;
+	}
+
+	if (desc->have_pid) {
+		uint16_t pid = 0;
+
+		pid = *vp9 & 0x7f;
+
+		if (*vp9 & 0x80) {
+			vp9++;
+			pid = (pid << 8) + *vp9;
 		}
-	} else { // start packet
-		if (!IS_VP9_START_PKT(*vp9)) {
-			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "got invalid vp9 packet, packet loss? waiting for a start packet\n");
-			goto end;
+
+		vp9++;
+		// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "have pid: %d start=%d end=%d\n", pid, desc->start, desc->end);
+	}
+
+	if (desc->have_layer_ind) {
+		vp9_layer_t *layer = (vp9_layer_t *)vp9;
+
+		vp9 += 2;
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "have layer idx: %d\n", layer->s);
+	}
+
+	if (desc->have_ss) {
+		vp9_ss_t *ss = (vp9_ss_t *)(vp9++);
+
+		context->got_key_frame = 1;
+		context->got_start_frame = 1;
+		// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "have ss: %02x n_s: %d y:%d g:%d\n", *(uint8_t *)ss, ss->n_s, ss->y, ss->g);
+
+		if (ss->y) {
+			int i;
+
+			for (i=0; i<=ss->n_s; i++) {
+				int width = ntohs(*(uint16_t *)vp9);
+				int height = ntohs(*(uint16_t *)(vp9 + 2));
+				vp9 += 4;
+				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "SS: %d %dx%d\n", i, width, height);
+			}
+		}
+
+		if (ss->g) {
+			int i;
+			uint8_t ng = *vp9++;
+
+			for (i = 0; ng > 0 && i < ng; i++) {
+				vp9_n_g_t *n_g = (vp9_n_g_t *)(vp9++);
+				vp9 += n_g->r;
+			}
 		}
 	}
 
-	vp9 = data + 1;
+	if (vp9 - data >= frame->datalen) {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Invalid VP9 Packet\n");
+		switch_buffer_zero(context->vpx_packet_buffer);
+		goto end;
+	}
+
+	if (switch_buffer_inuse(context->vpx_packet_buffer)) { // middle packet
+		if (desc->start) {
+			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "got invalid vp9 packet, packet loss? resetting buffer\n");
+			switch_buffer_zero(context->vpx_packet_buffer);
+		}
+	} else { // start packet
+		if (!desc->start) {
+			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "got invalid vp9 packet, packet loss? waiting for a start packet\n");
+			goto end;
+		}
+	}
 
 	len = frame->datalen - (vp9 - data);
 	switch_buffer_write(context->vpx_packet_buffer, vp9, len);
@@ -820,7 +1041,6 @@ static switch_status_t switch_vpx_decode(switch_codec_t *codec, switch_frame_t *
 		is_keyframe = IS_VP8_KEY_FRAME((uint8_t *)frame->data);
 	}
 	
-	
     if (context->got_key_frame <= 0) {
         context->no_key_frame++;
         //switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "no keyframe, %d\n", context->no_key_frame);
@@ -830,7 +1050,7 @@ static switch_status_t switch_vpx_decode(switch_codec_t *codec, switch_frame_t *
             }
         }
     }
-	
+
 	// if (is_keyframe) switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "got key %d\n", is_keyframe);
 
 	if (context->need_decoder_reset != 0) {
@@ -887,7 +1107,7 @@ static switch_status_t switch_vpx_decode(switch_codec_t *codec, switch_frame_t *
 		switch_goto_status(SWITCH_STATUS_SUCCESS, end);
 	}
 
-	//printf("READ buf:%ld got_key:%d st:%d m:%d\n", switch_buffer_inuse(context->vpx_packet_buffer), context->got_key_frame, status, frame->m);
+	// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "====READ buf:%ld got_key:%d st:%d m:%d\n", switch_buffer_inuse(context->vpx_packet_buffer), context->got_key_frame, status, frame->m);
 
 	len = switch_buffer_inuse(context->vpx_packet_buffer);
 
@@ -921,6 +1141,7 @@ static switch_status_t switch_vpx_decode(switch_codec_t *codec, switch_frame_t *
 			frame->img = NULL;
 		} else {
 			frame->img = (switch_image_t *) vpx_codec_get_frame(decoder, &context->dec_iter);
+			// switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "%dx%d\n", frame->img->d_w, frame->img->d_h);
 		}
 		
 		switch_buffer_zero(context->vpx_packet_buffer);
@@ -1050,6 +1271,8 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_vpx_load)
 	return SWITCH_STATUS_SUCCESS;
 }
 
+#endif
+#endif
 /* For Emacs:
  * Local Variables:
  * mode:c
diff --git a/src/switch_xml.c b/src/switch_xml.c
index 5faf4b235f..ffcdd8b15f 100644
--- a/src/switch_xml.c
+++ b/src/switch_xml.c
@@ -1298,15 +1298,20 @@ static FILE *preprocess_glob(const char *cwd, const char *pattern, FILE *write_f
 	char *dir_path = NULL, *e = NULL;
 	glob_t glob_data;
 	size_t n;
+	int glob_return;
 
 	if (!switch_is_file_path(pattern)) {
 		full_path = switch_mprintf("%s%s%s", cwd, SWITCH_PATH_SEPARATOR, pattern);
 		pattern = full_path;
 	}
 
-	if (glob(pattern, GLOB_NOCHECK, NULL, &glob_data) != 0) {
+	glob_return = glob(pattern, GLOB_ERR, NULL, &glob_data);
+	if (glob_return == GLOB_NOSPACE || glob_return == GLOB_ABORTED) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error including %s\n", pattern);
 		goto end;
+	} else if (glob_return == GLOB_NOMATCH) {
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "No files to include at %s\n", pattern);
+		goto end;
 	}
 
 	for (n = 0; n < glob_data.gl_pathc; ++n) {
@@ -1324,7 +1329,7 @@ static FILE *preprocess_glob(const char *cwd, const char *pattern, FILE *write_f
 	}
 	globfree(&glob_data);
 
-  end:
+ end:
 
 	switch_safe_free(full_path);
 
diff --git a/support-d/.bashrc b/support-d/.bashrc
index 4b917c3b5c..a02d28f971 100644
--- a/support-d/.bashrc
+++ b/support-d/.bashrc
@@ -84,7 +84,8 @@ if [ ! -f ~/.inputrc ]; then
 fi
 
 set -o emacs
-export PROMPT_COMMAND="history -a; history -c; history -r; ${PROMPT_COMMAND}"
+# BE GONE SATAN!
+#export PROMPT_COMMAND="history -a; history -c; history -r; ${PROMPT_COMMAND}"
 export HISTSIZE=5000
 export TERM=xterm-256color
 export LESSCHARSET="latin1"
diff --git a/support-d/c3p0.pub b/support-d/c3p0.pub
new file mode 100644
index 0000000000..8ee7a2b7c2
--- /dev/null
+++ b/support-d/c3p0.pub
@@ -0,0 +1 @@
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAAEAQDfHcRkpFVAv7yKNMwKIenooYqZWDPIrG7ckp2n1ylanX/KgVPopsnhCBcUgGCg0PJp4YVjzhIVtfne5ZSDS/sKFxIBV5Zqwk2e6OClTkYMFkeOoxS2DfG4LqB/3sEg1XAANwfmlgsnQGR5cj7FNVyKZ5uUr+QDMvfzW6r0DB7Oghc9bWJBdPwF7ivyEaIhBKXwEGsPg45937JQ8ViaWQmSKf+qlc7IMLgMgaBz5Cv2q5zn8FeLyzeFrw1IVmjRIpffGYPSIOgaTIgJxUtJ5Jxy/jpSx1AndlgTiCP6K1XpEAXDQTwvh9OPEVM+7Ne5SUlUutGaIuzgZkJ4+uaLbsz+6hCmBA6iqi0Sx5PHzW63k18jPg3VkAh2Vp/GXjkyaU7aIsdRCdNvOjrw9TSH/B3lVPk3ntzfmTW7tuIU7w6lvIFzXjHlhIgo5NkaF1m0dXhTdK0kTRcgvptG7P1Fw8Sly/9UGQWXpdVwHKVsAa1/M9/hQsD3oNZejWbGpJghfyHJvb87dk7wLQBr/xR8LysyTGbwMJH0OEbF3hmHXEjnimQYA2GBpnvKUEz3OsB5fEOKBDRR/qtt12QZHgHJqluaPgoLnHl6gbwbImCrS8kJGqcuj/FNYS1+QJb6gg/tazWNNnDm5ZpTWwdsl8CjrxCL3l0WK7BMIEpdeVU9SSEf82YTQZCLhY/z8vkTU7RLiiVvSnzDsSEX6OeLEF9mxnbfA8A+beYAXq2ZgSKjo6Phs20ZgpXoRbyzO7z4CitYNLtPPPHuRdzyWf1hUr6Gu/ZTtToPyzRHoKjyi7OpBeHm6fvZQ1lwsJtJkmZlzdWPNHrEhAmSTtAEf9hKFxqkogLHc3/BMtlJ4Sl1QBUlS0+3OGB7PANbD/rVx/CCEjj0/TR3P0aMi8xvHkjuWRMgAZ+CARuwB+rJYwxsKxnUveEg0nhhEQsrhJ8IxMLdsQJ8moOYelgyHgM2dh861/NFbXpJJ+lOUUN47CT1OnIsclhVK1EiW5T6rV6Bu9fm9oqdo/MAMiEWd1ALjMtqHGO02IBbxiZwiWmAZtV/SV4KWhTMKWFKny85JpweK5WTqbxaySjKSNwGr3aWI4EWF33xE+YPE4cgrZTDx/JhyTKk40MwAd0XKoKwGuTAsxABl0fa6ViLP3YNalEnjxhwsCIhka1wgdZOWwNvLYZ1MkPvZDZq326T7diVeQudJv8xTZFCAwm7KTJ9LKYWXKI125bdtNUZQ4c1STt0MM0imipS8kMEgcngn8QtUm9s8TzEDrojGe+K/KzdP/LWhES9JKGZIRCEgl2NQNY/r7/JPgSEb7gGNy+4W8e9CQsaeIHTBqkyOvFi04mR1gXTDSykFnxk3qf7 FreeSWITCH Solutions, LLC RSA Key